/tools/perf/scripts/python/

eRussell King
summaryrefslogtreecommitdiff
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r--net/6lowpan/6lowpan_i.h34
-rw-r--r--net/6lowpan/Kconfig105
-rw-r--r--net/6lowpan/Makefile22
-rw-r--r--net/6lowpan/core.c183
-rw-r--r--net/6lowpan/debugfs.c278
-rw-r--r--net/6lowpan/iphc.c1313
-rw-r--r--net/6lowpan/ndisc.c223
-rw-r--r--net/6lowpan/nhc.c169
-rw-r--r--net/6lowpan/nhc.h133
-rw-r--r--net/6lowpan/nhc_dest.c17
-rw-r--r--net/6lowpan/nhc_fragment.c16
-rw-r--r--net/6lowpan/nhc_ghc_ext_dest.c16
-rw-r--r--net/6lowpan/nhc_ghc_ext_frag.c17
-rw-r--r--net/6lowpan/nhc_ghc_ext_hop.c16
-rw-r--r--net/6lowpan/nhc_ghc_ext_route.c16
-rw-r--r--net/6lowpan/nhc_ghc_icmpv6.c16
-rw-r--r--net/6lowpan/nhc_ghc_udp.c16
-rw-r--r--net/6lowpan/nhc_hop.c16
-rw-r--r--net/6lowpan/nhc_ipv6.c16
-rw-r--r--net/6lowpan/nhc_mobility.c16
-rw-r--r--net/6lowpan/nhc_routing.c16
-rw-r--r--net/6lowpan/nhc_udp.c176
-rw-r--r--net/802/Kconfig11
-rw-r--r--net/802/Makefile12
-rw-r--r--net/802/fc.c59
-rw-r--r--net/802/fddi.c92
-rw-r--r--net/802/garp.c650
-rw-r--r--net/802/hippi.c110
-rw-r--r--net/802/mrp.c944
-rw-r--r--net/802/p8022.c66
-rw-r--r--net/802/p8023.c63
-rw-r--r--net/802/psnap.c56
-rw-r--r--net/802/stp.c102
-rw-r--r--net/802/sysctl_net_802.c33
-rw-r--r--net/802/tr.c646
-rw-r--r--net/8021q/Kconfig36
-rw-r--r--net/8021q/Makefile15
-rw-r--r--net/8021q/vlan.c1106
-rw-r--r--net/8021q/vlan.h241
-rw-r--r--net/8021q/vlan_core.c560
-rw-r--r--net/8021q/vlan_dev.c1561
-rw-r--r--net/8021q/vlan_gvrp.c67
-rw-r--r--net/8021q/vlan_mvrp.c73
-rw-r--r--net/8021q/vlan_netlink.c315
-rw-r--r--net/8021q/vlanproc.c297
-rw-r--r--net/8021q/vlanproc.h20
-rw-r--r--net/9p/Kconfig63
-rw-r--r--net/9p/Makefile29
-rw-r--r--net/9p/client.c2209
-rw-r--r--net/9p/error.c227
-rw-r--r--net/9p/mod.c212
-rw-r--r--net/9p/protocol.c801
-rw-r--r--net/9p/protocol.h19
-rw-r--r--net/9p/trans_common.c24
-rw-r--r--net/9p/trans_common.h7
-rw-r--r--net/9p/trans_fd.c1096
-rw-r--r--net/9p/trans_rdma.c669
-rw-r--r--net/9p/trans_usbg.c969
-rw-r--r--net/9p/trans_virtio.c838
-rw-r--r--net/9p/trans_xen.c576
-rw-r--r--net/Kconfig443
-rw-r--r--net/Kconfig.debug41
-rw-r--r--net/Makefile69
-rw-r--r--net/TUNABLE50
-rw-r--r--net/appletalk/Kconfig30
-rw-r--r--net/appletalk/Makefile3
-rw-r--r--net/appletalk/aarp.c389
-rw-r--r--net/appletalk/atalk_proc.c156
-rw-r--r--net/appletalk/ddp.c1169
-rw-r--r--net/appletalk/dev.c61
-rw-r--r--net/appletalk/sysctl_net_atalk.c46
-rw-r--r--net/atm/Kconfig18
-rw-r--r--net/atm/Makefile4
-rw-r--r--net/atm/addr.c15
-rw-r--r--net/atm/addr.h5
-rw-r--r--net/atm/atm_misc.c45
-rw-r--r--net/atm/atm_sysfs.c169
-rw-r--r--net/atm/br2684.c810
-rw-r--r--net/atm/clip.c530
-rw-r--r--net/atm/common.c646
-rw-r--r--net/atm/common.h18
-rw-r--r--net/atm/ioctl.c360
-rw-r--r--net/atm/ipcommon.c63
-rw-r--r--net/atm/ipcommon.h22
-rw-r--r--net/atm/lec.c1302
-rw-r--r--net/atm/lec.h27
-rw-r--r--net/atm/lec_arpc.h5
-rw-r--r--net/atm/mpc.c824
-rw-r--r--net/atm/mpc.h48
-rw-r--r--net/atm/mpoa_caches.c325
-rw-r--r--net/atm/mpoa_caches.h101
-rw-r--r--net/atm/mpoa_proc.c211
-rw-r--r--net/atm/pppoatm.c235
-rw-r--r--net/atm/proc.c329
-rw-r--r--net/atm/protocols.h1
-rw-r--r--net/atm/pvc.c68
-rw-r--r--net/atm/raw.c55
-rw-r--r--net/atm/resources.c438
-rw-r--r--net/atm/resources.h8
-rw-r--r--net/atm/signaling.c260
-rw-r--r--net/atm/signaling.h5
-rw-r--r--net/atm/svc.c413
-rw-r--r--net/ax25/Kconfig72
-rw-r--r--net/ax25/Makefile1
-rw-r--r--net/ax25/TODO24
-rw-r--r--net/ax25/af_ax25.c787
-rw-r--r--net/ax25/ax25_addr.c58
-rw-r--r--net/ax25/ax25_dev.c107
-rw-r--r--net/ax25/ax25_ds_in.c12
-rw-r--r--net/ax25/ax25_ds_subr.c21
-rw-r--r--net/ax25/ax25_ds_timer.c39
-rw-r--r--net/ax25/ax25_iface.c134
-rw-r--r--net/ax25/ax25_in.c98
-rw-r--r--net/ax25/ax25_ip.c159
-rw-r--r--net/ax25/ax25_out.c80
-rw-r--r--net/ax25/ax25_route.c174
-rw-r--r--net/ax25/ax25_std_in.c9
-rw-r--r--net/ax25/ax25_std_subr.c9
-rw-r--r--net/ax25/ax25_std_timer.c20
-rw-r--r--net/ax25/ax25_subr.c47
-rw-r--r--net/ax25/ax25_timer.c111
-rw-r--r--net/ax25/ax25_uid.c89
-rw-r--r--net/ax25/sysctl_net_ax25.c181
-rw-r--r--net/batman-adv/Kconfig84
-rw-r--r--net/batman-adv/Makefile34
-rw-r--r--net/batman-adv/bat_algo.c210
-rw-r--r--net/batman-adv/bat_algo.h23
-rw-r--r--net/batman-adv/bat_iv_ogm.c2545
-rw-r--r--net/batman-adv/bat_iv_ogm.h14
-rw-r--r--net/batman-adv/bat_v.c887
-rw-r--r--net/batman-adv/bat_v.h41
-rw-r--r--net/batman-adv/bat_v_elp.c596
-rw-r--r--net/batman-adv/bat_v_elp.h22
-rw-r--r--net/batman-adv/bat_v_ogm.c1080
-rw-r--r--net/batman-adv/bat_v_ogm.h27
-rw-r--r--net/batman-adv/bitarray.c89
-rw-r--r--net/batman-adv/bitarray.h56
-rw-r--r--net/batman-adv/bridge_loop_avoidance.c2482
-rw-r--r--net/batman-adv/bridge_loop_avoidance.h132
-rw-r--r--net/batman-adv/distributed-arp-table.c1820
-rw-r--r--net/batman-adv/distributed-arp-table.h186
-rw-r--r--net/batman-adv/fragmentation.c556
-rw-r--r--net/batman-adv/fragmentation.h44
-rw-r--r--net/batman-adv/gateway_client.c759
-rw-r--r--net/batman-adv/gateway_client.h55
-rw-r--r--net/batman-adv/gateway_common.c114
-rw-r--r--net/batman-adv/gateway_common.h31
-rw-r--r--net/batman-adv/hard-interface.c1016
-rw-r--r--net/batman-adv/hard-interface.h121
-rw-r--r--net/batman-adv/hash.c84
-rw-r--r--net/batman-adv/hash.h157
-rw-r--r--net/batman-adv/log.c36
-rw-r--r--net/batman-adv/log.h140
-rw-r--r--net/batman-adv/main.c688
-rw-r--r--net/batman-adv/main.h380
-rw-r--r--net/batman-adv/mesh-interface.c1133
-rw-r--r--net/batman-adv/mesh-interface.h41
-rw-r--r--net/batman-adv/multicast.c2195
-rw-r--r--net/batman-adv/multicast.h125
-rw-r--r--net/batman-adv/multicast_forw.c1178
-rw-r--r--net/batman-adv/netlink.c1559
-rw-r--r--net/batman-adv/netlink.h28
-rw-r--r--net/batman-adv/originator.c1373
-rw-r--r--net/batman-adv/originator.h170
-rw-r--r--net/batman-adv/routing.c1335
-rw-r--r--net/batman-adv/routing.h53
-rw-r--r--net/batman-adv/send.c1120
-rw-r--r--net/batman-adv/send.h116
-rw-r--r--net/batman-adv/tp_meter.c1490
-rw-r--r--net/batman-adv/tp_meter.h22
-rw-r--r--net/batman-adv/trace.c8
-rw-r--r--net/batman-adv/trace.h64
-rw-r--r--net/batman-adv/translation-table.c4238
-rw-r--r--net/batman-adv/translation-table.h74
-rw-r--r--net/batman-adv/tvlv.c663
-rw-r--r--net/batman-adv/tvlv.h52
-rw-r--r--net/batman-adv/types.h2212
-rw-r--r--net/bluetooth/6lowpan.c1336
-rw-r--r--net/bluetooth/Kconfig144
-rw-r--r--net/bluetooth/Makefile21
-rw-r--r--net/bluetooth/af_bluetooth.c821
-rw-r--r--net/bluetooth/aosp.c210
-rw-r--r--net/bluetooth/aosp.h29
-rw-r--r--net/bluetooth/bnep/Kconfig3
-rw-r--r--net/bluetooth/bnep/Makefile1
-rw-r--r--net/bluetooth/bnep/bnep.h195
-rw-r--r--net/bluetooth/bnep/core.c371
-rw-r--r--net/bluetooth/bnep/netdev.c141
-rw-r--r--net/bluetooth/bnep/sock.c137
-rw-r--r--net/bluetooth/cmtp/Kconfig5
-rw-r--r--net/bluetooth/cmtp/Makefile1
-rw-r--r--net/bluetooth/cmtp/capi.c145
-rw-r--r--net/bluetooth/cmtp/cmtp.h24
-rw-r--r--net/bluetooth/cmtp/core.c164
-rw-r--r--net/bluetooth/cmtp/sock.c91
-rw-r--r--net/bluetooth/coredump.c553
-rw-r--r--net/bluetooth/ecdh_helper.c203
-rw-r--r--net/bluetooth/ecdh_helper.h30
-rw-r--r--net/bluetooth/eir.c386
-rw-r--r--net/bluetooth/eir.h99
-rw-r--r--net/bluetooth/hci_codec.c253
-rw-r--r--net/bluetooth/hci_codec.h7
-rw-r--r--net/bluetooth/hci_conn.c3169
-rw-r--r--net/bluetooth/hci_core.c4195
-rw-r--r--net/bluetooth/hci_debugfs.c1395
-rw-r--r--net/bluetooth/hci_debugfs.h53
-rw-r--r--net/bluetooth/hci_drv.c105
-rw-r--r--net/bluetooth/hci_event.c8058
-rw-r--r--net/bluetooth/hci_sock.c2030
-rw-r--r--net/bluetooth/hci_sync.c7420
-rw-r--r--net/bluetooth/hci_sysfs.c383
-rw-r--r--net/bluetooth/hidp/Kconfig3
-rw-r--r--net/bluetooth/hidp/Makefile1
-rw-r--r--net/bluetooth/hidp/core.c1417
-rw-r--r--net/bluetooth/hidp/hidp.h93
-rw-r--r--net/bluetooth/hidp/sock.c191
-rw-r--r--net/bluetooth/iso.c2734
-rw-r--r--net/bluetooth/l2cap.c2264
-rw-r--r--net/bluetooth/l2cap_core.c7721
-rw-r--r--net/bluetooth/l2cap_sock.c2013
-rw-r--r--net/bluetooth/leds.c100
-rw-r--r--net/bluetooth/leds.h23
-rw-r--r--net/bluetooth/lib.c291
-rw-r--r--net/bluetooth/mgmt.c10613
-rw-r--r--net/bluetooth/mgmt_config.c346
-rw-r--r--net/bluetooth/mgmt_config.h17
-rw-r--r--net/bluetooth/mgmt_util.c441
-rw-r--r--net/bluetooth/mgmt_util.h78
-rw-r--r--net/bluetooth/msft.c1201
-rw-r--r--net/bluetooth/msft.h78
-rw-r--r--net/bluetooth/rfcomm/Kconfig4
-rw-r--r--net/bluetooth/rfcomm/Makefile1
-rw-r--r--net/bluetooth/rfcomm/core.c1003
-rw-r--r--net/bluetooth/rfcomm/sock.c679
-rw-r--r--net/bluetooth/rfcomm/tty.c790
-rw-r--r--net/bluetooth/sco.c1323
-rw-r--r--net/bluetooth/selftest.c309
-rw-r--r--net/bluetooth/selftest.h45
-rw-r--r--net/bluetooth/smp.c3847
-rw-r--r--net/bluetooth/smp.h215
-rw-r--r--net/bpf/Makefile5
-rw-r--r--net/bpf/bpf_dummy_struct_ops.c323
-rw-r--r--net/bpf/test_run.c1834
-rw-r--r--net/bridge/Kconfig58
-rw-r--r--net/bridge/Makefile24
-rw-r--r--net/bridge/br.c501
-rw-r--r--net/bridge/br_arp_nd_proxy.c513
-rw-r--r--net/bridge/br_cfm.c867
-rw-r--r--net/bridge/br_cfm_netlink.c726
-rw-r--r--net/bridge/br_device.c526
-rw-r--r--net/bridge/br_fdb.c1719
-rw-r--r--net/bridge/br_forward.c375
-rw-r--r--net/bridge/br_if.c724
-rw-r--r--net/bridge/br_input.c498
-rw-r--r--net/bridge/br_ioctl.c291
-rw-r--r--net/bridge/br_mdb.c1722
-rw-r--r--net/bridge/br_mrp.c1260
-rw-r--r--net/bridge/br_mrp_netlink.c571
-rw-r--r--net/bridge/br_mrp_switchdev.c241
-rw-r--r--net/bridge/br_mst.c366
-rw-r--r--net/bridge/br_multicast.c5247
-rw-r--r--net/bridge/br_multicast_eht.c822
-rw-r--r--net/bridge/br_netfilter.c1129
-rw-r--r--net/bridge/br_netfilter_hooks.c1340
-rw-r--r--net/bridge/br_netfilter_ipv6.c189
-rw-r--r--net/bridge/br_netlink.c1968
-rw-r--r--net/bridge/br_netlink_tunnel.c342
-rw-r--r--net/bridge/br_nf_core.c88
-rw-r--r--net/bridge/br_notify.c90
-rw-r--r--net/bridge/br_private.h2348
-rw-r--r--net/bridge/br_private_cfm.h147
-rw-r--r--net/bridge/br_private_mcast_eht.h94
-rw-r--r--net/bridge/br_private_mrp.h148
-rw-r--r--net/bridge/br_private_stp.h52
-rw-r--r--net/bridge/br_private_tunnel.h85
-rw-r--r--net/bridge/br_stp.c380
-rw-r--r--net/bridge/br_stp_bpdu.c108
-rw-r--r--net/bridge/br_stp_if.c216
-rw-r--r--net/bridge/br_stp_timer.c115
-rw-r--r--net/bridge/br_switchdev.c876
-rw-r--r--net/bridge/br_sysfs_br.c1046
-rw-r--r--net/bridge/br_sysfs_if.c322
-rw-r--r--net/bridge/br_vlan.c2349
-rw-r--r--net/bridge/br_vlan_options.c740
-rw-r--r--net/bridge/br_vlan_tunnel.c227
-rw-r--r--net/bridge/netfilter/Kconfig109
-rw-r--r--net/bridge/netfilter/Makefile12
-rw-r--r--net/bridge/netfilter/ebt_802_3.c64
-rw-r--r--net/bridge/netfilter/ebt_among.c171
-rw-r--r--net/bridge/netfilter/ebt_arp.c152
-rw-r--r--net/bridge/netfilter/ebt_arpreply.c60
-rw-r--r--net/bridge/netfilter/ebt_dnat.c98
-rw-r--r--net/bridge/netfilter/ebt_ip.c150
-rw-r--r--net/bridge/netfilter/ebt_ip6.c164
-rw-r--r--net/bridge/netfilter/ebt_limit.c66
-rw-r--r--net/bridge/netfilter/ebt_log.c220
-rw-r--r--net/bridge/netfilter/ebt_mark.c81
-rw-r--r--net/bridge/netfilter/ebt_mark_m.c75
-rw-r--r--net/bridge/netfilter/ebt_nflog.c75
-rw-r--r--net/bridge/netfilter/ebt_pkttype.c40
-rw-r--r--net/bridge/netfilter/ebt_redirect.c76
-rw-r--r--net/bridge/netfilter/ebt_snat.c84
-rw-r--r--net/bridge/netfilter/ebt_stp.c187
-rw-r--r--net/bridge/netfilter/ebt_ulog.c347
-rw-r--r--net/bridge/netfilter/ebt_vlan.c146
-rw-r--r--net/bridge/netfilter/ebtable_broute.c108
-rw-r--r--net/bridge/netfilter/ebtable_filter.c98
-rw-r--r--net/bridge/netfilter/ebtable_nat.c101
-rw-r--r--net/bridge/netfilter/ebtables.c2440
-rw-r--r--net/bridge/netfilter/nf_conntrack_bridge.c455
-rw-r--r--net/bridge/netfilter/nft_meta_bridge.c253
-rw-r--r--net/bridge/netfilter/nft_reject_bridge.c215
-rw-r--r--net/caif/Kconfig54
-rw-r--r--net/caif/Makefile16
-rw-r--r--net/caif/caif_dev.c586
-rw-r--r--net/caif/caif_socket.c1114
-rw-r--r--net/caif/caif_usb.c216
-rw-r--r--net/caif/cfcnfg.c612
-rw-r--r--net/caif/cfctrl.c631
-rw-r--r--net/caif/cfdbgl.c55
-rw-r--r--net/caif/cfdgml.c113
-rw-r--r--net/caif/cffrml.c197
-rw-r--r--net/caif/cfmuxl.c267
-rw-r--r--net/caif/cfpkt_skbuff.c373
-rw-r--r--net/caif/cfrfml.c299
-rw-r--r--net/caif/cfserl.c192
-rw-r--r--net/caif/cfsrvl.c214
-rw-r--r--net/caif/cfutill.c104
-rw-r--r--net/caif/cfveil.c101
-rw-r--r--net/caif/cfvidl.c65
-rw-r--r--net/caif/chnl_net.c531
-rw-r--r--net/can/Kconfig73
-rw-r--r--net/can/Makefile22
-rw-r--r--net/can/af_can.c922
-rw-r--r--net/can/af_can.h103
-rw-r--r--net/can/bcm.c1865
-rw-r--r--net/can/gw.c1362
-rw-r--r--net/can/isotp.c1739
-rw-r--r--net/can/j1939/Kconfig15
-rw-r--r--net/can/j1939/Makefile10
-rw-r--r--net/can/j1939/address-claim.c270
-rw-r--r--net/can/j1939/bus.c336
-rw-r--r--net/can/j1939/j1939-priv.h345
-rw-r--r--net/can/j1939/main.c430
-rw-r--r--net/can/j1939/socket.c1393
-rw-r--r--net/can/j1939/transport.c2220
-rw-r--r--net/can/proc.c506
-rw-r--r--net/can/raw.c1158
-rw-r--r--net/ceph/Kconfig46
-rw-r--r--net/ceph/Makefile18
-rw-r--r--net/ceph/armor.c106
-rw-r--r--net/ceph/auth.c659
-rw-r--r--net/ceph/auth_none.c146
-rw-r--r--net/ceph/auth_none.h27
-rw-r--r--net/ceph/auth_x.c1124
-rw-r--r--net/ceph/auth_x.h54
-rw-r--r--net/ceph/auth_x_protocol.h99
-rw-r--r--net/ceph/buffer.c59
-rw-r--r--net/ceph/ceph_common.c926
-rw-r--r--net/ceph/ceph_hash.c131
-rw-r--r--net/ceph/ceph_strings.c90
-rw-r--r--net/ceph/cls_lock_client.c431
-rw-r--r--net/ceph/crush/crush.c142
-rw-r--r--net/ceph/crush/crush_ln_table.h164
-rw-r--r--net/ceph/crush/hash.c152
-rw-r--r--net/ceph/crush/mapper.c1099
-rw-r--r--net/ceph/crypto.c349
-rw-r--r--net/ceph/crypto.h38
-rw-r--r--net/ceph/debugfs.c484
-rw-r--r--net/ceph/decode.c193
-rw-r--r--net/ceph/messenger.c2222
-rw-r--r--net/ceph/messenger_v1.c1620
-rw-r--r--net/ceph/messenger_v2.c3804
-rw-r--r--net/ceph/mon_client.c1596
-rw-r--r--net/ceph/msgpool.c94
-rw-r--r--net/ceph/osd_client.c5917
-rw-r--r--net/ceph/osdmap.c3110
-rw-r--r--net/ceph/pagelist.c133
-rw-r--r--net/ceph/pagevec.c114
-rw-r--r--net/ceph/snapshot.c63
-rw-r--r--net/ceph/string_table.c106
-rw-r--r--net/ceph/striper.c278
-rw-r--r--net/compat.c679
-rw-r--r--net/core/Makefile47
-rw-r--r--net/core/bpf_sk_storage.c914
-rw-r--r--net/core/datagram.c902
-rw-r--r--net/core/dev.c13920
-rw-r--r--net/core/dev.h406
-rw-r--r--net/core/dev_addr_lists.c1051
-rw-r--r--net/core/dev_addr_lists_test.c247
-rw-r--r--net/core/dev_api.c382
-rw-r--r--net/core/dev_ioctl.c875
-rw-r--r--net/core/dev_mcast.c297
-rw-r--r--net/core/devmem.c522
-rw-r--r--net/core/devmem.h246
-rw-r--r--net/core/drop_monitor.c1789
-rw-r--r--net/core/dst.c494
-rw-r--r--net/core/dst_cache.c210
-rw-r--r--net/core/ethtool.c984
-rw-r--r--net/core/failover.c315
-rw-r--r--net/core/fib_notifier.c197
-rw-r--r--net/core/fib_rules.c1355
-rw-r--r--net/core/filter.c12766
-rw-r--r--net/core/flow.c381
-rw-r--r--net/core/flow_dissector.c2101
-rw-r--r--net/core/flow_offload.c638
-rw-r--r--net/core/gen_estimator.c339
-rw-r--r--net/core/gen_stats.c362
-rw-r--r--net/core/gro.c835
-rw-r--r--net/core/gro_cells.c148
-rw-r--r--net/core/gso.c273
-rw-r--r--net/core/hotdata.c29
-rw-r--r--net/core/hwbm.c85
-rw-r--r--net/core/ieee8021q_helpers.c224
-rw-r--r--net/core/iovec.c239
-rw-r--r--net/core/link_watch.c340
-rw-r--r--net/core/lock_debug.c122
-rw-r--r--net/core/lwt_bpf.c662
-rw-r--r--net/core/lwtunnel.c474
-rw-r--r--net/core/mp_dmabuf_devmem.h44
-rw-r--r--net/core/neighbour.c3838
-rw-r--r--net/core/net-procfs.c399
-rw-r--r--net/core/net-sysfs.c2448
-rw-r--r--net/core/net-sysfs.h16
-rw-r--r--net/core/net-traces.c68
-rw-r--r--net/core/net_namespace.c1549
-rw-r--r--net/core/net_test.c387
-rw-r--r--net/core/netclassid_cgroup.c154
-rw-r--r--net/core/netdev-genl-gen.c238
-rw-r--r--net/core/netdev-genl-gen.h50
-rw-r--r--net/core/netdev-genl.c1203
-rw-r--r--net/core/netdev_queues.c27
-rw-r--r--net/core/netdev_rx_queue.c194
-rw-r--r--net/core/netevent.c20
-rw-r--r--net/core/netmem_priv.h62
-rw-r--r--net/core/netpoll.c1114
-rw-r--r--net/core/netprio_cgroup.c295
-rw-r--r--net/core/of_net.c172
-rw-r--r--net/core/page_pool.c1354
-rw-r--r--net/core/page_pool_priv.h60
-rw-r--r--net/core/page_pool_user.c441
-rw-r--r--net/core/pktgen.c3872
-rw-r--r--net/core/ptp_classifier.c228
-rw-r--r--net/core/request_sock.c157
-rw-r--r--net/core/rtnetlink.c7130
-rw-r--r--net/core/scm.c438
-rw-r--r--net/core/secure_seq.c158
-rw-r--r--net/core/selftests.c448
-rw-r--r--net/core/skb_fault_injection.c106
-rw-r--r--net/core/skbuff.c6934
-rw-r--r--net/core/skmsg.c1289
-rw-r--r--net/core/sock.c4747
-rw-r--r--net/core/sock_destructor.h12
-rw-r--r--net/core/sock_diag.c355
-rw-r--r--net/core/sock_map.c1959
-rw-r--r--net/core/sock_reuseport.c748
-rw-r--r--net/core/stream.c194
-rw-r--r--net/core/sysctl_net_core.c796
-rw-r--r--net/core/timestamping.c114
-rw-r--r--net/core/tso.c89
-rw-r--r--net/core/user_dma.c132
-rw-r--r--net/core/utils.c300
-rw-r--r--net/core/wireless.c2353
-rw-r--r--net/core/xdp.c1055
-rw-r--r--net/dcb/Kconfig23
-rw-r--r--net/dcb/Makefile2
-rw-r--r--net/dcb/dcbevent.c30
-rw-r--r--net/dcb/dcbnl.c2428
-rw-r--r--net/dccp/Kconfig64
-rw-r--r--net/dccp/Makefile21
-rw-r--r--net/dccp/ackvec.c515
-rw-r--r--net/dccp/ackvec.h158
-rw-r--r--net/dccp/ccid.c256
-rw-r--r--net/dccp/ccid.h200
-rw-r--r--net/dccp/ccids/Kconfig92
-rw-r--r--net/dccp/ccids/Makefile9
-rw-r--r--net/dccp/ccids/ccid2.c857
-rw-r--r--net/dccp/ccids/ccid2.h91
-rw-r--r--net/dccp/ccids/ccid3.c1266
-rw-r--r--net/dccp/ccids/ccid3.h176
-rw-r--r--net/dccp/ccids/lib/Makefile3
-rw-r--r--net/dccp/ccids/lib/loss_interval.c143
-rw-r--r--net/dccp/ccids/lib/loss_interval.h57
-rw-r--r--net/dccp/ccids/lib/packet_history.c295
-rw-r--r--net/dccp/ccids/lib/packet_history.h196
-rw-r--r--net/dccp/ccids/lib/tfrc.h22
-rw-r--r--net/dccp/ccids/lib/tfrc_equation.c643
-rw-r--r--net/dccp/dccp.h453
-rw-r--r--net/dccp/diag.c70
-rw-r--r--net/dccp/feat.c644
-rw-r--r--net/dccp/feat.h66
-rw-r--r--net/dccp/input.c577
-rw-r--r--net/dccp/ipv4.c1076
-rw-r--r--net/dccp/ipv6.c1269
-rw-r--r--net/dccp/ipv6.h36
-rw-r--r--net/dccp/minisocks.c303
-rw-r--r--net/dccp/options.c587
-rw-r--r--net/dccp/output.c589
-rw-r--r--net/dccp/probe.c202
-rw-r--r--net/dccp/proto.c1115
-rw-r--r--net/dccp/sysctl.c141
-rw-r--r--net/dccp/timer.c268
-rw-r--r--net/decnet/Kconfig43
-rw-r--r--net/decnet/Makefile10
-rw-r--r--net/decnet/README8
-rw-r--r--net/decnet/TODO41
-rw-r--r--net/decnet/af_decnet.c2432
-rw-r--r--net/decnet/dn_dev.c1512
-rw-r--r--net/decnet/dn_fib.c755
-rw-r--r--net/decnet/dn_neigh.c621
-rw-r--r--net/decnet/dn_nsp_in.c917
-rw-r--r--net/decnet/dn_nsp_out.c723
-rw-r--r--net/decnet/dn_route.c1824
-rw-r--r--net/decnet/dn_rules.c275
-rw-r--r--net/decnet/dn_table.c900
-rw-r--r--net/decnet/dn_timer.c109
-rw-r--r--net/decnet/netfilter/Kconfig15
-rw-r--r--net/decnet/netfilter/Makefile6
-rw-r--r--net/decnet/netfilter/dn_rtmsg.c169
-rw-r--r--net/decnet/sysctl_net_decnet.c512
-rw-r--r--net/devlink/Makefile4
-rw-r--r--net/devlink/core.c551
-rw-r--r--net/devlink/dev.c1442
-rw-r--r--net/devlink/devl_internal.h304
-rw-r--r--net/devlink/dpipe.c915
-rw-r--r--net/devlink/health.c1350
-rw-r--r--net/devlink/linecard.c628
-rw-r--r--net/devlink/netlink.c376
-rw-r--r--net/devlink/netlink_gen.c1287
-rw-r--r--net/devlink/netlink_gen.h150
-rw-r--r--net/devlink/param.c950
-rw-r--r--net/devlink/port.c1604
-rw-r--r--net/devlink/rate.c850
-rw-r--r--net/devlink/region.c1258
-rw-r--r--net/devlink/resource.c504
-rw-r--r--net/devlink/sb.c995
-rw-r--r--net/devlink/trap.c1854
-rw-r--r--net/devres.c95
-rw-r--r--net/dns_resolver/Kconfig28
-rw-r--r--net/dns_resolver/Makefile8
-rw-r--r--net/dns_resolver/dns_key.c391
-rw-r--r--net/dns_resolver/dns_query.c170
-rw-r--r--net/dns_resolver/internal.h51
-rw-r--r--net/dsa/Kconfig207
-rw-r--r--net/dsa/Makefile46
-rw-r--r--net/dsa/conduit.c549
-rw-r--r--net/dsa/conduit.h22
-rw-r--r--net/dsa/devlink.c402
-rw-r--r--net/dsa/devlink.h16
-rw-r--r--net/dsa/dsa.c1897
-rw-r--r--net/dsa/dsa.h40
-rw-r--r--net/dsa/netlink.c64
-rw-r--r--net/dsa/netlink.h8
-rw-r--r--net/dsa/port.c1955
-rw-r--r--net/dsa/port.h115
-rw-r--r--net/dsa/stubs.c10
-rw-r--r--net/dsa/switch.c1134
-rw-r--r--net/dsa/switch.h123
-rw-r--r--net/dsa/tag.c244
-rw-r--r--net/dsa/tag.h409
-rw-r--r--net/dsa/tag_8021q.c588
-rw-r--r--net/dsa/tag_8021q.h28
-rw-r--r--net/dsa/tag_ar9331.c95
-rw-r--r--net/dsa/tag_brcm.c418
-rw-r--r--net/dsa/tag_dsa.c410
-rw-r--r--net/dsa/tag_gswip.c112
-rw-r--r--net/dsa/tag_hellcreek.c73
-rw-r--r--net/dsa/tag_ksz.c465
-rw-r--r--net/dsa/tag_lan9303.c126
-rw-r--r--net/dsa/tag_mtk.c110
-rw-r--r--net/dsa/tag_mxl-gsw1xx.c117
-rw-r--r--net/dsa/tag_none.c31
-rw-r--r--net/dsa/tag_ocelot.c186
-rw-r--r--net/dsa/tag_ocelot_8021q.c140
-rw-r--r--net/dsa/tag_qca.c125
-rw-r--r--net/dsa/tag_rtl4_a.c126
-rw-r--r--net/dsa/tag_rtl8_4.c261
-rw-r--r--net/dsa/tag_rzn1_a5psw.c115
-rw-r--r--net/dsa/tag_sja1105.c762
-rw-r--r--net/dsa/tag_trailer.c65
-rw-r--r--net/dsa/tag_vsc73xx_8021q.c68
-rw-r--r--net/dsa/tag_xrs700x.c61
-rw-r--r--net/dsa/tag_yt921x.c139
-rw-r--r--net/dsa/trace.c39
-rw-r--r--net/dsa/trace.h447
-rw-r--r--net/dsa/user.c3877
-rw-r--r--net/dsa/user.h71
-rw-r--r--net/econet/Kconfig36
-rw-r--r--net/econet/Makefile7
-rw-r--r--net/econet/af_econet.c1174
-rw-r--r--net/ethernet/Makefile3
-rw-r--r--net/ethernet/eth.c546
-rw-r--r--net/ethernet/pe2.c39
-rw-r--r--net/ethtool/Makefile12
-rw-r--r--net/ethtool/bitset.c873
-rw-r--r--net/ethtool/bitset.h34
-rw-r--r--net/ethtool/cabletest.c453
-rw-r--r--net/ethtool/channels.c199
-rw-r--r--net/ethtool/cmis.h128
-rw-r--r--net/ethtool/cmis_cdb.c666
-rw-r--r--net/ethtool/cmis_fw_update.c485
-rw-r--r--net/ethtool/coalesce.c649
-rw-r--r--net/ethtool/common.c1169
-rw-r--r--net/ethtool/common.h92
-rw-r--r--net/ethtool/debug.c117
-rw-r--r--net/ethtool/eee.c172
-rw-r--r--net/ethtool/eeprom.c246
-rw-r--r--net/ethtool/features.c297
-rw-r--r--net/ethtool/fec.c364
-rw-r--r--net/ethtool/ioctl.c3853
-rw-r--r--net/ethtool/linkinfo.c145
-rw-r--r--net/ethtool/linkmodes.c362
-rw-r--r--net/ethtool/linkstate.c227
-rw-r--r--net/ethtool/mm.c561
-rw-r--r--net/ethtool/module.c557
-rw-r--r--net/ethtool/module_fw.h75
-rw-r--r--net/ethtool/mse.c329
-rw-r--r--net/ethtool/netlink.c1583
-rw-r--r--net/ethtool/netlink.h525
-rw-r--r--net/ethtool/pause.c219
-rw-r--r--net/ethtool/phc_vclocks.c94
-rw-r--r--net/ethtool/phy.c165
-rw-r--r--net/ethtool/plca.c271
-rw-r--r--net/ethtool/privflags.c195
-rw-r--r--net/ethtool/pse-pd.c382
-rw-r--r--net/ethtool/rings.c322
-rw-r--r--net/ethtool/rss.c1205
-rw-r--r--net/ethtool/stats.c623
-rw-r--r--net/ethtool/strset.c499
-rw-r--r--net/ethtool/ts.h20
-rw-r--r--net/ethtool/tsconfig.c455
-rw-r--r--net/ethtool/tsinfo.c564
-rw-r--r--net/ethtool/tunnels.c281
-rw-r--r--net/ethtool/wol.c158
-rw-r--r--net/handshake/.kunitconfig11
-rw-r--r--net/handshake/Makefile13
-rw-r--r--net/handshake/alert.c110
-rw-r--r--net/handshake/genl.c59
-rw-r--r--net/handshake/genl.h25
-rw-r--r--net/handshake/handshake-test.c540
-rw-r--r--net/handshake/handshake.h93
-rw-r--r--net/handshake/netlink.c289
-rw-r--r--net/handshake/request.c343
-rw-r--r--net/handshake/tlshd.c455
-rw-r--r--net/handshake/trace.c22
-rw-r--r--net/hsr/Kconfig58
-rw-r--r--net/hsr/Makefile12
-rw-r--r--net/hsr/hsr_debugfs.c123
-rw-r--r--net/hsr/hsr_device.c825
-rw-r--r--net/hsr/hsr_device.h23
-rw-r--r--net/hsr/hsr_forward.c760
-rw-r--r--net/hsr/hsr_forward.h31
-rw-r--r--net/hsr/hsr_framereg.c794
-rw-r--r--net/hsr/hsr_framereg.h99
-rw-r--r--net/hsr/hsr_main.c187
-rw-r--r--net/hsr/hsr_main.h300
-rw-r--r--net/hsr/hsr_netlink.c594
-rw-r--r--net/hsr/hsr_netlink.h27
-rw-r--r--net/hsr/hsr_slave.c250
-rw-r--r--net/hsr/hsr_slave.h37
-rw-r--r--net/hsr/prp_dup_discard_test.c212
-rw-r--r--net/ieee80211/Kconfig72
-rw-r--r--net/ieee80211/Makefile13
-rw-r--r--net/ieee80211/ieee80211_crypt.c206
-rw-r--r--net/ieee80211/ieee80211_crypt_ccmp.c486
-rw-r--r--net/ieee80211/ieee80211_crypt_tkip.c791
-rw-r--r--net/ieee80211/ieee80211_crypt_wep.c298
-rw-r--r--net/ieee80211/ieee80211_geo.c179
-rw-r--r--net/ieee80211/ieee80211_module.c337
-rw-r--r--net/ieee80211/ieee80211_rx.c1803
-rw-r--r--net/ieee80211/ieee80211_tx.c633
-rw-r--r--net/ieee80211/ieee80211_wx.c841
-rw-r--r--net/ieee80211/softmac/Kconfig12
-rw-r--r--net/ieee80211/softmac/Makefile9
-rw-r--r--net/ieee80211/softmac/ieee80211softmac_assoc.c472
-rw-r--r--net/ieee80211/softmac/ieee80211softmac_auth.c403
-rw-r--r--net/ieee80211/softmac/ieee80211softmac_event.c187
-rw-r--r--net/ieee80211/softmac/ieee80211softmac_io.c488
-rw-r--r--net/ieee80211/softmac/ieee80211softmac_module.c576
-rw-r--r--net/ieee80211/softmac/ieee80211softmac_priv.h241
-rw-r--r--net/ieee80211/softmac/ieee80211softmac_scan.c251
-rw-r--r--net/ieee80211/softmac/ieee80211softmac_wx.c514
-rw-r--r--net/ieee802154/6lowpan/6lowpan_i.h48
-rw-r--r--net/ieee802154/6lowpan/Kconfig6
-rw-r--r--net/ieee802154/6lowpan/Makefile4
-rw-r--r--net/ieee802154/6lowpan/core.c289
-rw-r--r--net/ieee802154/6lowpan/reassembly.c558
-rw-r--r--net/ieee802154/6lowpan/rx.c323
-rw-r--r--net/ieee802154/6lowpan/tx.c309
-rw-r--r--net/ieee802154/Kconfig31
-rw-r--r--net/ieee802154/Makefile10
-rw-r--r--net/ieee802154/core.c415
-rw-r--r--net/ieee802154/core.h50
-rw-r--r--net/ieee802154/header_ops.c378
-rw-r--r--net/ieee802154/ieee802154.h75
-rw-r--r--net/ieee802154/netlink.c148
-rw-r--r--net/ieee802154/nl-mac.c1335
-rw-r--r--net/ieee802154/nl-phy.c346
-rw-r--r--net/ieee802154/nl802154.c3108
-rw-r--r--net/ieee802154/nl802154.h14
-rw-r--r--net/ieee802154/nl_policy.c74
-rw-r--r--net/ieee802154/pan.c109
-rw-r--r--net/ieee802154/rdev-ops.h407
-rw-r--r--net/ieee802154/socket.c1143
-rw-r--r--net/ieee802154/sysfs.c111
-rw-r--r--net/ieee802154/sysfs.h10
-rw-r--r--net/ieee802154/trace.c7
-rw-r--r--net/ieee802154/trace.h418
-rw-r--r--net/ife/Kconfig16
-rw-r--r--net/ife/Makefile6
-rw-r--r--net/ife/ife.c177
-rw-r--r--net/ipv4/Kconfig677
-rw-r--r--net/ipv4/Makefile55
-rw-r--r--net/ipv4/af_inet.c1583
-rw-r--r--net/ipv4/ah4.c499
-rw-r--r--net/ipv4/arp.c1359
-rw-r--r--net/ipv4/bpf_tcp_ca.c349
-rw-r--r--net/ipv4/cipso_ipv4.c1888
-rw-r--r--net/ipv4/datagram.c117
-rw-r--r--net/ipv4/devinet.c2782
-rw-r--r--net/ipv4/esp4.c1301
-rw-r--r--net/ipv4/esp4_offload.c414
-rw-r--r--net/ipv4/fib_frontend.c1448
-rw-r--r--net/ipv4/fib_hash.c1079
-rw-r--r--net/ipv4/fib_lookup.h67
-rw-r--r--net/ipv4/fib_notifier.c72
-rw-r--r--net/ipv4/fib_rules.c483
-rw-r--r--net/ipv4/fib_semantics.c2474
-rw-r--r--net/ipv4/fib_trie.c3848
-rw-r--r--net/ipv4/fou_bpf.c117
-rw-r--r--net/ipv4/fou_core.c1281
-rw-r--r--net/ipv4/fou_nl.c49
-rw-r--r--net/ipv4/fou_nl.h26
-rw-r--r--net/ipv4/gre_demux.c221
-rw-r--r--net/ipv4/gre_offload.c287
-rw-r--r--net/ipv4/icmp.c1594
-rw-r--r--net/ipv4/igmp.c1917
-rw-r--r--net/ipv4/igmp_internal.h17
-rw-r--r--net/ipv4/inet_connection_sock.c1639
-rw-r--r--net/ipv4/inet_diag.c1443
-rw-r--r--net/ipv4/inet_fragment.c651
-rw-r--r--net/ipv4/inet_hashtables.c1474
-rw-r--r--net/ipv4/inet_timewait_sock.c537
-rw-r--r--net/ipv4/inetpeer.c589
-rw-r--r--net/ipv4/ip_forward.c122
-rw-r--r--net/ipv4/ip_fragment.c1085
-rw-r--r--net/ipv4/ip_gre.c2479
-rw-r--r--net/ipv4/ip_input.c552
-rw-r--r--net/ipv4/ip_options.c390
-rw-r--r--net/ipv4/ip_output.c1896
-rw-r--r--net/ipv4/ip_sockglue.c2207
-rw-r--r--net/ipv4/ip_tunnel.c1340
-rw-r--r--net/ipv4/ip_tunnel_core.c1176
-rw-r--r--net/ipv4/ip_vti.c744
-rw-r--r--net/ipv4/ipcomp.c451
-rw-r--r--net/ipv4/ipconfig.c968
-rw-r--r--net/ipv4/ipip.c1136
-rw-r--r--net/ipv4/ipmr.c3794
-rw-r--r--net/ipv4/ipmr_base.c446
-rw-r--r--net/ipv4/ipvs/ip_vs_conn.c944
-rw-r--r--net/ipv4/ipvs/ip_vs_core.c1198
-rw-r--r--net/ipv4/ipvs/ip_vs_ctl.c2396
-rw-r--r--net/ipv4/ipvs/ip_vs_est.c202
-rw-r--r--net/ipv4/ipvs/ip_vs_ftp.c394
-rw-r--r--net/ipv4/ipvs/ip_vs_lblc.c599
-rw-r--r--net/ipv4/ipvs/ip_vs_lblcr.c863
-rw-r--r--net/ipv4/ipvs/ip_vs_lc.c123
-rw-r--r--net/ipv4/ipvs/ip_vs_proto.c235
-rw-r--r--net/ipv4/ipvs/ip_vs_proto_ah.c179
-rw-r--r--net/ipv4/ipvs/ip_vs_proto_esp.c177
-rw-r--r--net/ipv4/ipvs/ip_vs_proto_tcp.c620
-rw-r--r--net/ipv4/ipvs/ip_vs_proto_udp.c432
-rw-r--r--net/ipv4/ipvs/ip_vs_rr.c118
-rw-r--r--net/ipv4/ipvs/ip_vs_sh.c257
-rw-r--r--net/ipv4/ipvs/ip_vs_sync.c902
-rw-r--r--net/ipv4/ipvs/ip_vs_wrr.c235
-rw-r--r--net/ipv4/ipvs/ip_vs_xmit.c561
-rw-r--r--net/ipv4/metrics.c91
-rw-r--r--net/ipv4/multipath.c55
-rw-r--r--net/ipv4/multipath_drr.c250
-rw-r--r--net/ipv4/multipath_random.c129
-rw-r--r--net/ipv4/multipath_rr.c96
-rw-r--r--net/ipv4/multipath_wrandom.c341
-rw-r--r--net/ipv4/netfilter.c255
-rw-r--r--net/ipv4/netfilter/Kconfig676
-rw-r--r--net/ipv4/netfilter/Makefile91
-rw-r--r--net/ipv4/netfilter/arp_tables.c1839
-rw-r--r--net/ipv4/netfilter/arpt_mangle.c60
-rw-r--r--net/ipv4/netfilter/arptable_filter.c209
-rw-r--r--net/ipv4/netfilter/ip_conntrack_amanda.c229
-rw-r--r--net/ipv4/netfilter/ip_conntrack_core.c1544
-rw-r--r--net/ipv4/netfilter/ip_conntrack_ftp.c520
-rw-r--r--net/ipv4/netfilter/ip_conntrack_helper_h323.c1841
-rw-r--r--net/ipv4/netfilter/ip_conntrack_helper_pptp.c684
-rw-r--r--net/ipv4/netfilter/ip_conntrack_irc.c314
-rw-r--r--net/ipv4/netfilter/ip_conntrack_netbios_ns.c143
-rw-r--r--net/ipv4/netfilter/ip_conntrack_netlink.c1575
-rw-r--r--net/ipv4/netfilter/ip_conntrack_proto_generic.c75
-rw-r--r--net/ipv4/netfilter/ip_conntrack_proto_gre.c330
-rw-r--r--net/ipv4/netfilter/ip_conntrack_proto_icmp.c316
-rw-r--r--net/ipv4/netfilter/ip_conntrack_proto_sctp.c660
-rw-r--r--net/ipv4/netfilter/ip_conntrack_proto_tcp.c1173
-rw-r--r--net/ipv4/netfilter/ip_conntrack_proto_udp.c149
-rw-r--r--net/ipv4/netfilter/ip_conntrack_sip.c514
-rw-r--r--net/ipv4/netfilter/ip_conntrack_standalone.c967
-rw-r--r--net/ipv4/netfilter/ip_conntrack_tftp.c161
-rw-r--r--net/ipv4/netfilter/ip_nat_amanda.c85
-rw-r--r--net/ipv4/netfilter/ip_nat_core.c626
-rw-r--r--net/ipv4/netfilter/ip_nat_ftp.c180
-rw-r--r--net/ipv4/netfilter/ip_nat_helper.c436
-rw-r--r--net/ipv4/netfilter/ip_nat_helper_h323.c611
-rw-r--r--net/ipv4/netfilter/ip_nat_helper_pptp.c350
-rw-r--r--net/ipv4/netfilter/ip_nat_irc.c122
-rw-r--r--net/ipv4/netfilter/ip_nat_proto_gre.c174
-rw-r--r--net/ipv4/netfilter/ip_nat_proto_icmp.c87
-rw-r--r--net/ipv4/netfilter/ip_nat_proto_tcp.c149
-rw-r--r--net/ipv4/netfilter/ip_nat_proto_udp.c139
-rw-r--r--net/ipv4/netfilter/ip_nat_proto_unknown.c55
-rw-r--r--net/ipv4/netfilter/ip_nat_rule.c308
-rw-r--r--net/ipv4/netfilter/ip_nat_sip.c282
-rw-r--r--net/ipv4/netfilter/ip_nat_snmp_basic.c1333
-rw-r--r--net/ipv4/netfilter/ip_nat_standalone.c391
-rw-r--r--net/ipv4/netfilter/ip_nat_tftp.c70
-rw-r--r--net/ipv4/netfilter/ip_queue.c742
-rw-r--r--net/ipv4/netfilter/ip_tables.c2385
-rw-r--r--net/ipv4/netfilter/ipt_CLUSTERIP.c759
-rw-r--r--net/ipv4/netfilter/ipt_ECN.c141
-rw-r--r--net/ipv4/netfilter/ipt_LOG.c498
-rw-r--r--net/ipv4/netfilter/ipt_MASQUERADE.c199
-rw-r--r--net/ipv4/netfilter/ipt_NETMAP.c109
-rw-r--r--net/ipv4/netfilter/ipt_REDIRECT.c124
-rw-r--r--net/ipv4/netfilter/ipt_REJECT.c264
-rw-r--r--net/ipv4/netfilter/ipt_SAME.c202
-rw-r--r--net/ipv4/netfilter/ipt_SYNPROXY.c121
-rw-r--r--net/ipv4/netfilter/ipt_TCPMSS.c207
-rw-r--r--net/ipv4/netfilter/ipt_TOS.c86
-rw-r--r--net/ipv4/netfilter/ipt_TTL.c103
-rw-r--r--net/ipv4/netfilter/ipt_ULOG.c442
-rw-r--r--net/ipv4/netfilter/ipt_addrtype.c65
-rw-r--r--net/ipv4/netfilter/ipt_ah.c95
-rw-r--r--net/ipv4/netfilter/ipt_ecn.c131
-rw-r--r--net/ipv4/netfilter/ipt_iprange.c85
-rw-r--r--net/ipv4/netfilter/ipt_owner.c91
-rw-r--r--net/ipv4/netfilter/ipt_recent.c505
-rw-r--r--net/ipv4/netfilter/ipt_rpfilter.c127
-rw-r--r--net/ipv4/netfilter/ipt_tos.c53
-rw-r--r--net/ipv4/netfilter/ipt_ttl.c70
-rw-r--r--net/ipv4/netfilter/iptable_filter.c196
-rw-r--r--net/ipv4/netfilter/iptable_mangle.c268
-rw-r--r--net/ipv4/netfilter/iptable_nat.c175
-rw-r--r--net/ipv4/netfilter/iptable_raw.c187
-rw-r--r--net/ipv4/netfilter/iptable_security.c98
-rw-r--r--net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c535
-rw-r--r--net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c412
-rw-r--r--net/ipv4/netfilter/nf_conntrack_proto_icmp.c384
-rw-r--r--net/ipv4/netfilter/nf_defrag_ipv4.c189
-rw-r--r--net/ipv4/netfilter/nf_dup_ipv4.c103
-rw-r--r--net/ipv4/netfilter/nf_nat_h323.c567
-rw-r--r--net/ipv4/netfilter/nf_nat_pptp.c320
-rw-r--r--net/ipv4/netfilter/nf_nat_snmp_basic.asn1185
-rw-r--r--net/ipv4/netfilter/nf_nat_snmp_basic_main.c231
-rw-r--r--net/ipv4/netfilter/nf_reject_ipv4.c370
-rw-r--r--net/ipv4/netfilter/nf_socket_ipv4.c152
-rw-r--r--net/ipv4/netfilter/nf_tproxy_ipv4.c153
-rw-r--r--net/ipv4/netfilter/nft_dup_ipv4.c112
-rw-r--r--net/ipv4/netfilter/nft_fib_ipv4.c226
-rw-r--r--net/ipv4/netfilter/nft_reject_ipv4.c76
-rw-r--r--net/ipv4/netlink.c33
-rw-r--r--net/ipv4/nexthop.c4157
-rw-r--r--net/ipv4/ping.c1181
-rw-r--r--net/ipv4/proc.c531
-rw-r--r--net/ipv4/protocol.c91
-rw-r--r--net/ipv4/raw.c942
-rw-r--r--net/ipv4/raw_diag.c259
-rw-r--r--net/ipv4/route.c5094
-rw-r--r--net/ipv4/syncookies.c546
-rw-r--r--net/ipv4/sysctl_net_ipv4.c1830
-rw-r--r--net/ipv4/tcp.c5257
-rw-r--r--net/ipv4/tcp_ao.c2442
-rw-r--r--net/ipv4/tcp_bbr.c1199
-rw-r--r--net/ipv4/tcp_bic.c91
-rw-r--r--net/ipv4/tcp_bpf.c739
-rw-r--r--net/ipv4/tcp_cdg.c428
-rw-r--r--net/ipv4/tcp_cong.c466
-rw-r--r--net/ipv4/tcp_cubic.c543
-rw-r--r--net/ipv4/tcp_dctcp.c313
-rw-r--r--net/ipv4/tcp_dctcp.h40
-rw-r--r--net/ipv4/tcp_diag.c672
-rw-r--r--net/ipv4/tcp_fastopen.c603
-rw-r--r--net/ipv4/tcp_highspeed.c206
-rw-r--r--net/ipv4/tcp_htcp.c123
-rw-r--r--net/ipv4/tcp_hybla.c67
-rw-r--r--net/ipv4/tcp_illinois.c360
-rw-r--r--net/ipv4/tcp_input.c8003
-rw-r--r--net/ipv4/tcp_ipv4.c4233
-rw-r--r--net/ipv4/tcp_lp.c69
-rw-r--r--net/ipv4/tcp_metrics.c1059
-rw-r--r--net/ipv4/tcp_minisocks.c974
-rw-r--r--net/ipv4/tcp_nv.c501
-rw-r--r--net/ipv4/tcp_offload.c478
-rw-r--r--net/ipv4/tcp_output.c4829
-rw-r--r--net/ipv4/tcp_plb.c109
-rw-r--r--net/ipv4/tcp_probe.c186
-rw-r--r--net/ipv4/tcp_rate.c209
-rw-r--r--net/ipv4/tcp_recovery.c237
-rw-r--r--net/ipv4/tcp_scalable.c36
-rw-r--r--net/ipv4/tcp_sigpool.c366
-rw-r--r--net/ipv4/tcp_timer.c844
-rw-r--r--net/ipv4/tcp_ulp.c168
-rw-r--r--net/ipv4/tcp_vegas.c216
-rw-r--r--net/ipv4/tcp_vegas.h26
-rw-r--r--net/ipv4/tcp_veno.c110
-rw-r--r--net/ipv4/tcp_westwood.c109
-rw-r--r--net/ipv4/tcp_yeah.c239
-rw-r--r--net/ipv4/tunnel4.c221
-rw-r--r--net/ipv4/udp.c4094
-rw-r--r--net/ipv4/udp_bpf.c157
-rw-r--r--net/ipv4/udp_diag.c299
-rw-r--r--net/ipv4/udp_impl.h38
-rw-r--r--net/ipv4/udp_offload.c996
-rw-r--r--net/ipv4/udp_tunnel_core.c280
-rw-r--r--net/ipv4/udp_tunnel_nic.c1010
-rw-r--r--net/ipv4/udp_tunnel_stub.c7
-rw-r--r--net/ipv4/udplite.c109
-rw-r--r--net/ipv4/xfrm4_input.c288
-rw-r--r--net/ipv4/xfrm4_mode_beet.c139
-rw-r--r--net/ipv4/xfrm4_mode_transport.c81
-rw-r--r--net/ipv4/xfrm4_mode_tunnel.c123
-rw-r--r--net/ipv4/xfrm4_output.c169
-rw-r--r--net/ipv4/xfrm4_policy.c455
-rw-r--r--net/ipv4/xfrm4_protocol.c306
-rw-r--r--net/ipv4/xfrm4_state.c53
-rw-r--r--net/ipv4/xfrm4_tunnel.c76
-rw-r--r--net/ipv6/Kconfig361
-rw-r--r--net/ipv6/Makefile45
-rw-r--r--net/ipv6/addrconf.c8351
-rw-r--r--net/ipv6/addrconf_core.c280
-rw-r--r--net/ipv6/addrlabel.c640
-rw-r--r--net/ipv6/af_inet6.c1158
-rw-r--r--net/ipv6/ah6.c665
-rw-r--r--net/ipv6/anycast.c577
-rw-r--r--net/ipv6/calipso.c1479
-rw-r--r--net/ipv6/datagram.c861
-rw-r--r--net/ipv6/esp6.c1368
-rw-r--r--net/ipv6/esp6_offload.c435
-rw-r--r--net/ipv6/exthdrs.c1337
-rw-r--r--net/ipv6/exthdrs_core.c210
-rw-r--r--net/ipv6/exthdrs_offload.c48
-rw-r--r--net/ipv6/fib6_notifier.c64
-rw-r--r--net/ipv6/fib6_rules.c610
-rw-r--r--net/ipv6/fou6.c227
-rw-r--r--net/ipv6/icmp.c1338
-rw-r--r--net/ipv6/ila/Makefile8
-rw-r--r--net/ipv6/ila/ila.h126
-rw-r--r--net/ipv6/ila/ila_common.c155
-rw-r--r--net/ipv6/ila/ila_lwt.c332
-rw-r--r--net/ipv6/ila/ila_main.c129
-rw-r--r--net/ipv6/ila/ila_xlat.c665
-rw-r--r--net/ipv6/inet6_connection_sock.c247
-rw-r--r--net/ipv6/inet6_hashtables.c531
-rw-r--r--net/ipv6/ioam6.c1040
-rw-r--r--net/ipv6/ioam6_iptunnel.c570
-rw-r--r--net/ipv6/ip6_checksum.c137
-rw-r--r--net/ipv6/ip6_fib.c2634
-rw-r--r--net/ipv6/ip6_flowlabel.c783
-rw-r--r--net/ipv6/ip6_gre.c2378
-rw-r--r--net/ipv6/ip6_icmp.c86
-rw-r--r--net/ipv6/ip6_input.c581
-rw-r--r--net/ipv6/ip6_offload.c489
-rw-r--r--net/ipv6/ip6_offload.h15
-rw-r--r--net/ipv6/ip6_output.c2208
-rw-r--r--net/ipv6/ip6_tunnel.c2391
-rw-r--r--net/ipv6/ip6_udp_tunnel.c186
-rw-r--r--net/ipv6/ip6_vti.c1314
-rw-r--r--net/ipv6/ip6mr.c2774
-rw-r--r--net/ipv6/ipcomp6.c439
-rw-r--r--net/ipv6/ipv6_sockglue.c1573
-rw-r--r--net/ipv6/ipv6_syms.c37
-rw-r--r--net/ipv6/mcast.c2932
-rw-r--r--net/ipv6/mcast_snoop.c190
-rw-r--r--net/ipv6/mip6.c351
-rw-r--r--net/ipv6/ndisc.c2150
-rw-r--r--net/ipv6/netfilter.c313
-rw-r--r--net/ipv6/netfilter/Kconfig301
-rw-r--r--net/ipv6/netfilter/Makefile52
-rw-r--r--net/ipv6/netfilter/ip6_queue.c729
-rw-r--r--net/ipv6/netfilter/ip6_tables.c2303
-rw-r--r--net/ipv6/netfilter/ip6t_HL.c102
-rw-r--r--net/ipv6/netfilter/ip6t_LOG.c509
-rw-r--r--net/ipv6/netfilter/ip6t_NPT.c191
-rw-r--r--net/ipv6/netfilter/ip6t_REJECT.c274
-rw-r--r--net/ipv6/netfilter/ip6t_SYNPROXY.c124
-rw-r--r--net/ipv6/netfilter/ip6t_ah.c147
-rw-r--r--net/ipv6/netfilter/ip6t_eui64.c67
-rw-r--r--net/ipv6/netfilter/ip6t_frag.c181
-rw-r--r--net/ipv6/netfilter/ip6t_hbh.c159
-rw-r--r--net/ipv6/netfilter/ip6t_hl.c70
-rw-r--r--net/ipv6/netfilter/ip6t_ipv6header.c86
-rw-r--r--net/ipv6/netfilter/ip6t_mh.c90
-rw-r--r--net/ipv6/netfilter/ip6t_owner.c92
-rw-r--r--net/ipv6/netfilter/ip6t_rpfilter.c150
-rw-r--r--net/ipv6/netfilter/ip6t_rt.c178
-rw-r--r--net/ipv6/netfilter/ip6t_srh.c320
-rw-r--r--net/ipv6/netfilter/ip6table_filter.c209
-rw-r--r--net/ipv6/netfilter/ip6table_mangle.c289
-rw-r--r--net/ipv6/netfilter/ip6table_nat.c177
-rw-r--r--net/ipv6/netfilter/ip6table_raw.c199
-rw-r--r--net/ipv6/netfilter/ip6table_security.c97
-rw-r--r--net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c492
-rw-r--r--net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c341
-rw-r--r--net/ipv6/netfilter/nf_conntrack_reasm.c998
-rw-r--r--net/ipv6/netfilter/nf_defrag_ipv6_hooks.c185
-rw-r--r--net/ipv6/netfilter/nf_dup_ipv6.c81
-rw-r--r--net/ipv6/netfilter/nf_reject_ipv6.c451
-rw-r--r--net/ipv6/netfilter/nf_socket_ipv6.c168
-rw-r--r--net/ipv6/netfilter/nf_tproxy_ipv6.c152
-rw-r--r--net/ipv6/netfilter/nft_dup_ipv6.c110
-rw-r--r--net/ipv6/netfilter/nft_fib_ipv6.c288
-rw-r--r--net/ipv6/netfilter/nft_reject_ipv6.c77
-rw-r--r--net/ipv6/output_core.c161
-rw-r--r--net/ipv6/ping.c305
-rw-r--r--net/ipv6/proc.c382
-rw-r--r--net/ipv6/protocol.c82
-rw-r--r--net/ipv6/raw.c1316
-rw-r--r--net/ipv6/reassembly.c1048
-rw-r--r--net/ipv6/route.c7417
-rw-r--r--net/ipv6/rpl.c118
-rw-r--r--net/ipv6/rpl_iptunnel.c394
-rw-r--r--net/ipv6/seg6.c544
-rw-r--r--net/ipv6/seg6_hmac.c319
-rw-r--r--net/ipv6/seg6_iptunnel.c779
-rw-r--r--net/ipv6/seg6_local.c2720
-rw-r--r--net/ipv6/sit.c2115
-rw-r--r--net/ipv6/syncookies.c282
-rw-r--r--net/ipv6/sysctl_net_ipv6.c360
-rw-r--r--net/ipv6/tcp_ao.c168
-rw-r--r--net/ipv6/tcp_ipv6.c2930
-rw-r--r--net/ipv6/tcpv6_offload.c205
-rw-r--r--net/ipv6/tunnel6.c245
-rw-r--r--net/ipv6/udp.c2036
-rw-r--r--net/ipv6/udp_impl.h40
-rw-r--r--net/ipv6/udp_offload.c208
-rw-r--r--net/ipv6/udplite.c115
-rw-r--r--net/ipv6/xfrm6_input.c404
-rw-r--r--net/ipv6/xfrm6_mode_beet.c107
-rw-r--r--net/ipv6/xfrm6_mode_ro.c93
-rw-r--r--net/ipv6/xfrm6_mode_transport.c87
-rw-r--r--net/ipv6/xfrm6_mode_tunnel.c120
-rw-r--r--net/ipv6/xfrm6_output.c212
-rw-r--r--net/ipv6/xfrm6_policy.c537
-rw-r--r--net/ipv6/xfrm6_protocol.c327
-rw-r--r--net/ipv6/xfrm6_state.c178
-rw-r--r--net/ipv6/xfrm6_tunnel.c385
-rw-r--r--net/ipx/ChangeLog101
-rw-r--r--net/ipx/Kconfig64
-rw-r--r--net/ipx/Makefile8
-rw-r--r--net/ipx/af_ipx.c2059
-rw-r--r--net/ipx/ipx_proc.c407
-rw-r--r--net/ipx/ipx_route.c292
-rw-r--r--net/ipx/sysctl_net_ipx.c61
-rw-r--r--net/irda/Kconfig96
-rw-r--r--net/irda/Makefile15
-rw-r--r--net/irda/af_irda.c2618
-rw-r--r--net/irda/discovery.c418
-rw-r--r--net/irda/ircomm/Kconfig12
-rw-r--r--net/irda/ircomm/Makefile8
-rw-r--r--net/irda/ircomm/ircomm_core.c584
-rw-r--r--net/irda/ircomm/ircomm_event.c251
-rw-r--r--net/irda/ircomm/ircomm_lmp.c372
-rw-r--r--net/irda/ircomm/ircomm_param.c511
-rw-r--r--net/irda/ircomm/ircomm_ttp.c369
-rw-r--r--net/irda/ircomm/ircomm_tty.c1400
-rw-r--r--net/irda/ircomm/ircomm_tty_attach.c1006
-rw-r--r--net/irda/ircomm/ircomm_tty_ioctl.c428
-rw-r--r--net/irda/irda_device.c494
-rw-r--r--net/irda/iriap.c1090
-rw-r--r--net/irda/iriap_event.c502
-rw-r--r--net/irda/irias_object.c559
-rw-r--r--net/irda/irlan/Kconfig14
-rw-r--r--net/irda/irlan/Makefile7
-rw-r--r--net/irda/irlan/irlan_client.c577
-rw-r--r--net/irda/irlan/irlan_client_event.c533
-rw-r--r--net/irda/irlan/irlan_common.c1227
-rw-r--r--net/irda/irlan/irlan_eth.c386
-rw-r--r--net/irda/irlan/irlan_event.c60
-rw-r--r--net/irda/irlan/irlan_filter.c247
-rw-r--r--net/irda/irlan/irlan_provider.c419
-rw-r--r--net/irda/irlan/irlan_provider_event.c241
-rw-r--r--net/irda/irlap.c1255
-rw-r--r--net/irda/irlap_event.c2333
-rw-r--r--net/irda/irlap_frame.c1433
-rw-r--r--net/irda/irlmp.c2037
-rw-r--r--net/irda/irlmp_event.c911
-rw-r--r--net/irda/irlmp_frame.c490
-rw-r--r--net/irda/irmod.c184
-rw-r--r--net/irda/irnet/Kconfig13
-rw-r--r--net/irda/irnet/Makefile7
-rw-r--r--net/irda/irnet/irnet.h525
-rw-r--r--net/irda/irnet/irnet_irda.c1866
-rw-r--r--net/irda/irnet/irnet_irda.h186
-rw-r--r--net/irda/irnet/irnet_ppp.c1141
-rw-r--r--net/irda/irnet/irnet_ppp.h119
-rw-r--r--net/irda/irproc.c100
-rw-r--r--net/irda/irqueue.c913
-rw-r--r--net/irda/irsysctl.c296
-rw-r--r--net/irda/irttp.c1905
-rw-r--r--net/irda/parameters.c589
-rw-r--r--net/irda/qos.c774
-rw-r--r--net/irda/timer.c232
-rw-r--r--net/irda/wrapper.c491
-rw-r--r--net/iucv/Kconfig18
-rw-r--r--net/iucv/Makefile7
-rw-r--r--net/iucv/af_iucv.c2339
-rw-r--r--net/iucv/iucv.c1955
-rw-r--r--net/kcm/Kconfig11
-rw-r--r--net/kcm/Makefile4
-rw-r--r--net/kcm/kcmproc.c387
-rw-r--r--net/kcm/kcmsock.c1935
-rw-r--r--net/key/Makefile1
-rw-r--r--net/key/af_key.c2517
-rw-r--r--net/l2tp/Kconfig110
-rw-r--r--net/l2tp/Makefile18
-rw-r--r--net/l2tp/l2tp_core.c1946
-rw-r--r--net/l2tp/l2tp_core.h340
-rw-r--r--net/l2tp/l2tp_debugfs.c347
-rw-r--r--net/l2tp/l2tp_eth.c351
-rw-r--r--net/l2tp/l2tp_ip.c736
-rw-r--r--net/l2tp/l2tp_ip6.c868
-rw-r--r--net/l2tp/l2tp_netlink.c1060
-rw-r--r--net/l2tp/l2tp_ppp.c1723
-rw-r--r--net/l2tp/trace.h211
-rw-r--r--net/l3mdev/Kconfig11
-rw-r--r--net/l3mdev/Makefile6
-rw-r--r--net/l3mdev/l3mdev.c303
-rw-r--r--net/lapb/Kconfig8
-rw-r--r--net/lapb/Makefile3
-rw-r--r--net/lapb/lapb_iface.c268
-rw-r--r--net/lapb/lapb_in.c984
-rw-r--r--net/lapb/lapb_out.c55
-rw-r--r--net/lapb/lapb_subr.c52
-rw-r--r--net/lapb/lapb_timer.c119
-rw-r--r--net/llc/Kconfig4
-rw-r--r--net/llc/Makefile2
-rw-r--r--net/llc/af_llc.c377
-rw-r--r--net/llc/llc_c_ac.c126
-rw-r--r--net/llc/llc_c_ev.c12
-rw-r--r--net/llc/llc_c_st.c1208
-rw-r--r--net/llc/llc_conn.c342
-rw-r--r--net/llc/llc_core.c87
-rw-r--r--net/llc/llc_if.c20
-rw-r--r--net/llc/llc_input.c67
-rw-r--r--net/llc/llc_output.c58
-rw-r--r--net/llc/llc_pdu.c8
-rw-r--r--net/llc/llc_proc.c157
-rw-r--r--net/llc/llc_s_ac.c59
-rw-r--r--net/llc/llc_s_st.c48
-rw-r--r--net/llc/llc_sap.c209
-rw-r--r--net/llc/llc_station.c649
-rw-r--r--net/llc/sysctl_net_llc.c115
-rw-r--r--net/mac80211/Kconfig316
-rw-r--r--net/mac80211/Makefile72
-rw-r--r--net/mac80211/aead_api.c113
-rw-r--r--net/mac80211/aead_api.h23
-rw-r--r--net/mac80211/aes_ccm.h45
-rw-r--r--net/mac80211/aes_cmac.c84
-rw-r--r--net/mac80211/aes_cmac.h19
-rw-r--r--net/mac80211/aes_gcm.h43
-rw-r--r--net/mac80211/aes_gmac.c94
-rw-r--r--net/mac80211/aes_gmac.h20
-rw-r--r--net/mac80211/agg-rx.c538
-rw-r--r--net/mac80211/agg-tx.c1060
-rw-r--r--net/mac80211/airtime.c837
-rw-r--r--net/mac80211/cfg.c5652
-rw-r--r--net/mac80211/chan.c2299
-rw-r--r--net/mac80211/debug.h249
-rw-r--r--net/mac80211/debugfs.c736
-rw-r--r--net/mac80211/debugfs.h17
-rw-r--r--net/mac80211/debugfs_key.c422
-rw-r--r--net/mac80211/debugfs_key.h29
-rw-r--r--net/mac80211/debugfs_netdev.c1104
-rw-r--r--net/mac80211/debugfs_netdev.h45
-rw-r--r--net/mac80211/debugfs_sta.c1362
-rw-r--r--net/mac80211/debugfs_sta.h27
-rw-r--r--net/mac80211/driver-ops.c635
-rw-r--r--net/mac80211/driver-ops.h1775
-rw-r--r--net/mac80211/drop.h97
-rw-r--r--net/mac80211/eht.c104
-rw-r--r--net/mac80211/ethtool.c254
-rw-r--r--net/mac80211/fils_aead.c333
-rw-r--r--net/mac80211/fils_aead.h16
-rw-r--r--net/mac80211/he.c367
-rw-r--r--net/mac80211/ht.c643
-rw-r--r--net/mac80211/ibss.c1837
-rw-r--r--net/mac80211/ieee80211_i.h2886
-rw-r--r--net/mac80211/iface.c2504
-rw-r--r--net/mac80211/key.c1514
-rw-r--r--net/mac80211/key.h174
-rw-r--r--net/mac80211/led.c377
-rw-r--r--net/mac80211/led.h86
-rw-r--r--net/mac80211/link.c641
-rw-r--r--net/mac80211/main.c1798
-rw-r--r--net/mac80211/mesh.c1809
-rw-r--r--net/mac80211/mesh.h429
-rw-r--r--net/mac80211/mesh_hwmp.c1364
-rw-r--r--net/mac80211/mesh_pathtbl.c1100
-rw-r--r--net/mac80211/mesh_plink.c1259
-rw-r--r--net/mac80211/mesh_ps.c612
-rw-r--r--net/mac80211/mesh_sync.c215
-rw-r--r--net/mac80211/michael.c83
-rw-r--r--net/mac80211/michael.h22
-rw-r--r--net/mac80211/mlme.c11063
-rw-r--r--net/mac80211/ocb.c242
-rw-r--r--net/mac80211/offchannel.c1093
-rw-r--r--net/mac80211/parse.c1142
-rw-r--r--net/mac80211/pm.c200
-rw-r--r--net/mac80211/rate.c1043
-rw-r--r--net/mac80211/rate.h112
-rw-r--r--net/mac80211/rc80211_minstrel_ht.c2044
-rw-r--r--net/mac80211/rc80211_minstrel_ht.h202
-rw-r--r--net/mac80211/rc80211_minstrel_ht_debugfs.c334
-rw-r--r--net/mac80211/rx.c5589
-rw-r--r--net/mac80211/s1g.c222
-rw-r--r--net/mac80211/scan.c1477
-rw-r--r--net/mac80211/spectmgmt.c460
-rw-r--r--net/mac80211/sta_info.c3406
-rw-r--r--net/mac80211/sta_info.h1084
-rw-r--r--net/mac80211/status.c1306
-rw-r--r--net/mac80211/tdls.c2081
-rw-r--r--net/mac80211/tests/Makefile3
-rw-r--r--net/mac80211/tests/chan-mode.c274
-rw-r--r--net/mac80211/tests/elems.c104
-rw-r--r--net/mac80211/tests/mfp.c286
-rw-r--r--net/mac80211/tests/module.c10
-rw-r--r--net/mac80211/tests/s1g_tim.c356
-rw-r--r--net/mac80211/tests/tpe.c284
-rw-r--r--net/mac80211/tests/util.c309
-rw-r--r--net/mac80211/tests/util.h36
-rw-r--r--net/mac80211/tkip.c323
-rw-r--r--net/mac80211/tkip.h30
-rw-r--r--net/mac80211/trace.c77
-rw-r--r--net/mac80211/trace.h3362
-rw-r--r--net/mac80211/trace_msg.h55
-rw-r--r--net/mac80211/tx.c6462
-rw-r--r--net/mac80211/util.c4548
-rw-r--r--net/mac80211/vht.c801
-rw-r--r--net/mac80211/wbrf.c94
-rw-r--r--net/mac80211/wep.c307
-rw-r--r--net/mac80211/wep.h30
-rw-r--r--net/mac80211/wme.c246
-rw-r--r--net/mac80211/wme.h21
-rw-r--r--net/mac80211/wpa.c1048
-rw-r--r--net/mac80211/wpa.h47
-rw-r--r--net/mac802154/Kconfig22
-rw-r--r--net/mac802154/Makefile6
-rw-r--r--net/mac802154/cfg.c720
-rw-r--r--net/mac802154/cfg.h10
-rw-r--r--net/mac802154/driver-ops.h354
-rw-r--r--net/mac802154/ieee802154_i.h334
-rw-r--r--net/mac802154/iface.c744
-rw-r--r--net/mac802154/llsec.c1061
-rw-r--r--net/mac802154/llsec.h99
-rw-r--r--net/mac802154/mac_cmd.c144
-rw-r--r--net/mac802154/main.c308
-rw-r--r--net/mac802154/mib.c219
-rw-r--r--net/mac802154/rx.c450
-rw-r--r--net/mac802154/scan.c917
-rw-r--r--net/mac802154/trace.c10
-rw-r--r--net/mac802154/trace.h298
-rw-r--r--net/mac802154/tx.c240
-rw-r--r--net/mac802154/util.c173
-rw-r--r--net/mctp/Kconfig24
-rw-r--r--net/mctp/Makefile6
-rw-r--r--net/mctp/af_mctp.c911
-rw-r--r--net/mctp/device.c561
-rw-r--r--net/mctp/neigh.c353
-rw-r--r--net/mctp/route.c1790
-rw-r--r--net/mctp/test/route-test.c1598
-rw-r--r--net/mctp/test/sock-test.c396
-rw-r--r--net/mctp/test/utils.c284
-rw-r--r--net/mctp/test/utils.h74
-rw-r--r--net/mpls/Kconfig39
-rw-r--r--net/mpls/Makefile9
-rw-r--r--net/mpls/af_mpls.c2876
-rw-r--r--net/mpls/internal.h215
-rw-r--r--net/mpls/mpls_gso.c113
-rw-r--r--net/mpls/mpls_iptunnel.c303
-rw-r--r--net/mptcp/Kconfig39
-rw-r--r--net/mptcp/Makefile15
-rw-r--r--net/mptcp/bpf.c36
-rw-r--r--net/mptcp/crypto.c52
-rw-r--r--net/mptcp/crypto_test.c73
-rw-r--r--net/mptcp/ctrl.c589
-rw-r--r--net/mptcp/diag.c120
-rw-r--r--net/mptcp/fastopen.c62
-rw-r--r--net/mptcp/mib.c130
-rw-r--r--net/mptcp/mib.h128
-rw-r--r--net/mptcp/mptcp_diag.c243
-rw-r--r--net/mptcp/mptcp_pm_gen.c180
-rw-r--r--net/mptcp/mptcp_pm_gen.h59
-rw-r--r--net/mptcp/options.c1719
-rw-r--r--net/mptcp/pm.c1139
-rw-r--r--net/mptcp/pm_kernel.c1624
-rw-r--r--net/mptcp/pm_netlink.c645
-rw-r--r--net/mptcp/pm_userspace.c698
-rw-r--r--net/mptcp/protocol.c4462
-rw-r--r--net/mptcp/protocol.h1366
-rw-r--r--net/mptcp/sched.c215
-rw-r--r--net/mptcp/sockopt.c1654
-rw-r--r--net/mptcp/subflow.c2219
-rw-r--r--net/mptcp/syncookies.c133
-rw-r--r--net/mptcp/token.c422
-rw-r--r--net/mptcp/token_test.c151
-rw-r--r--net/ncsi/Kconfig25
-rw-r--r--net/ncsi/Makefile5
-rw-r--r--net/ncsi/internal.h417
-rw-r--r--net/ncsi/ncsi-aen.c246
-rw-r--r--net/ncsi/ncsi-cmd.c408
-rw-r--r--net/ncsi/ncsi-manage.c1974
-rw-r--r--net/ncsi/ncsi-netlink.c778
-rw-r--r--net/ncsi/ncsi-netlink.h25
-rw-r--r--net/ncsi/ncsi-pkt.h465
-rw-r--r--net/ncsi/ncsi-rsp.c1264
-rw-r--r--net/netfilter/Kconfig1498
-rw-r--r--net/netfilter/Makefile212
-rw-r--r--net/netfilter/core.c910
-rw-r--r--net/netfilter/ipset/Kconfig178
-rw-r--r--net/netfilter/ipset/Makefile31
-rw-r--r--net/netfilter/ipset/ip_set_bitmap_gen.h316
-rw-r--r--net/netfilter/ipset/ip_set_bitmap_ip.c384
-rw-r--r--net/netfilter/ipset/ip_set_bitmap_ipmac.c423
-rw-r--r--net/netfilter/ipset/ip_set_bitmap_port.c332
-rw-r--r--net/netfilter/ipset/ip_set_core.c2458
-rw-r--r--net/netfilter/ipset/ip_set_getport.c150
-rw-r--r--net/netfilter/ipset/ip_set_hash_gen.h1645
-rw-r--r--net/netfilter/ipset/ip_set_hash_ip.c326
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipmac.c311
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipmark.c331
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipport.c417
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipportip.c410
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipportnet.c572
-rw-r--r--net/netfilter/ipset/ip_set_hash_mac.c169
-rw-r--r--net/netfilter/ipset/ip_set_hash_net.c407
-rw-r--r--net/netfilter/ipset/ip_set_hash_netiface.c525
-rw-r--r--net/netfilter/ipset/ip_set_hash_netnet.c530
-rw-r--r--net/netfilter/ipset/ip_set_hash_netport.c515
-rw-r--r--net/netfilter/ipset/ip_set_hash_netportnet.c628
-rw-r--r--net/netfilter/ipset/ip_set_list_set.c692
-rw-r--r--net/netfilter/ipset/pfxlen.c189
-rw-r--r--net/netfilter/ipvs/Kconfig (renamed from net/ipv4/ipvs/Kconfig)228
-rw-r--r--net/netfilter/ipvs/Makefile (renamed from net/ipv4/ipvs/Makefile)19
-rw-r--r--net/netfilter/ipvs/ip_vs_app.c (renamed from net/ipv4/ipvs/ip_vs_app.c)293
-rw-r--r--net/netfilter/ipvs/ip_vs_conn.c1550
-rw-r--r--net/netfilter/ipvs/ip_vs_core.c2453
-rw-r--r--net/netfilter/ipvs/ip_vs_ctl.c4555
-rw-r--r--net/netfilter/ipvs/ip_vs_dh.c (renamed from net/ipv4/ipvs/ip_vs_dh.c)134
-rw-r--r--net/netfilter/ipvs/ip_vs_est.c952
-rw-r--r--net/netfilter/ipvs/ip_vs_fo.c74
-rw-r--r--net/netfilter/ipvs/ip_vs_ftp.c639
-rw-r--r--net/netfilter/ipvs/ip_vs_lblc.c632
-rw-r--r--net/netfilter/ipvs/ip_vs_lblcr.c817
-rw-r--r--net/netfilter/ipvs/ip_vs_lc.c88
-rw-r--r--net/netfilter/ipvs/ip_vs_mh.c538
-rw-r--r--net/netfilter/ipvs/ip_vs_nfct.c279
-rw-r--r--net/netfilter/ipvs/ip_vs_nq.c (renamed from net/ipv4/ipvs/ip_vs_nq.c)71
-rw-r--r--net/netfilter/ipvs/ip_vs_ovf.c81
-rw-r--r--net/netfilter/ipvs/ip_vs_pe.c111
-rw-r--r--net/netfilter/ipvs/ip_vs_pe_sip.c187
-rw-r--r--net/netfilter/ipvs/ip_vs_proto.c381
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_ah_esp.c156
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_sctp.c597
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_tcp.c745
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_udp.c502
-rw-r--r--net/netfilter/ipvs/ip_vs_rr.c125
-rw-r--r--net/netfilter/ipvs/ip_vs_sched.c (renamed from net/ipv4/ipvs/ip_vs_sched.c)165
-rw-r--r--net/netfilter/ipvs/ip_vs_sed.c (renamed from net/ipv4/ipvs/ip_vs_sed.c)72
-rw-r--r--net/netfilter/ipvs/ip_vs_sh.c378
-rw-r--r--net/netfilter/ipvs/ip_vs_sync.c2039
-rw-r--r--net/netfilter/ipvs/ip_vs_twos.c139
-rw-r--r--net/netfilter/ipvs/ip_vs_wlc.c (renamed from net/ipv4/ipvs/ip_vs_wlc.c)86
-rw-r--r--net/netfilter/ipvs/ip_vs_wrr.c265
-rw-r--r--net/netfilter/ipvs/ip_vs_xmit.c1628
-rw-r--r--net/netfilter/nf_bpf_link.c332
-rw-r--r--net/netfilter/nf_conncount.c700
-rw-r--r--net/netfilter/nf_conntrack_acct.c28
-rw-r--r--net/netfilter/nf_conntrack_amanda.c240
-rw-r--r--net/netfilter/nf_conntrack_bpf.c550
-rw-r--r--net/netfilter/nf_conntrack_broadcast.c85
-rw-r--r--net/netfilter/nf_conntrack_core.c3150
-rw-r--r--net/netfilter/nf_conntrack_ecache.c402
-rw-r--r--net/netfilter/nf_conntrack_expect.c727
-rw-r--r--net/netfilter/nf_conntrack_extend.c159
-rw-r--r--net/netfilter/nf_conntrack_ftp.c434
-rw-r--r--net/netfilter/nf_conntrack_h323_asn1.c (renamed from net/ipv4/netfilter/ip_conntrack_helper_h323_asn1.c)383
-rw-r--r--net/netfilter/nf_conntrack_h323_main.c1803
-rw-r--r--net/netfilter/nf_conntrack_h323_types.c (renamed from net/ipv4/netfilter/ip_conntrack_helper_h323_types.c)381
-rw-r--r--net/netfilter/nf_conntrack_helper.c524
-rw-r--r--net/netfilter/nf_conntrack_irc.c314
-rw-r--r--net/netfilter/nf_conntrack_l3proto_generic.c95
-rw-r--r--net/netfilter/nf_conntrack_labels.c81
-rw-r--r--net/netfilter/nf_conntrack_netbios_ns.c71
-rw-r--r--net/netfilter/nf_conntrack_netlink.c4112
-rw-r--r--net/netfilter/nf_conntrack_ovs.c185
-rw-r--r--net/netfilter/nf_conntrack_pptp.c613
-rw-r--r--net/netfilter/nf_conntrack_proto.c888
-rw-r--r--net/netfilter/nf_conntrack_proto_generic.c147
-rw-r--r--net/netfilter/nf_conntrack_proto_gre.c326
-rw-r--r--net/netfilter/nf_conntrack_proto_icmp.c383
-rw-r--r--net/netfilter/nf_conntrack_proto_icmpv6.c361
-rw-r--r--net/netfilter/nf_conntrack_proto_sctp.c1113
-rw-r--r--net/netfilter/nf_conntrack_proto_tcp.c2069
-rw-r--r--net/netfilter/nf_conntrack_proto_udp.c440
-rw-r--r--net/netfilter/nf_conntrack_sane.c213
-rw-r--r--net/netfilter/nf_conntrack_seqadj.c234
-rw-r--r--net/netfilter/nf_conntrack_sip.c1707
-rw-r--r--net/netfilter/nf_conntrack_snmp.c75
-rw-r--r--net/netfilter/nf_conntrack_standalone.c1329
-rw-r--r--net/netfilter/nf_conntrack_tftp.c141
-rw-r--r--net/netfilter/nf_conntrack_timeout.c146
-rw-r--r--net/netfilter/nf_conntrack_timestamp.c25
-rw-r--r--net/netfilter/nf_dup_netdev.c108
-rw-r--r--net/netfilter/nf_flow_table_bpf.c121
-rw-r--r--net/netfilter/nf_flow_table_core.c847
-rw-r--r--net/netfilter/nf_flow_table_inet.c122
-rw-r--r--net/netfilter/nf_flow_table_ip.c983
-rw-r--r--net/netfilter/nf_flow_table_offload.c1241
-rw-r--r--net/netfilter/nf_flow_table_path.c330
-rw-r--r--net/netfilter/nf_flow_table_procfs.c80
-rw-r--r--net/netfilter/nf_flow_table_xdp.c147
-rw-r--r--net/netfilter/nf_hooks_lwtunnel.c123
-rw-r--r--net/netfilter/nf_internals.h55
-rw-r--r--net/netfilter/nf_log.c571
-rw-r--r--net/netfilter/nf_log_syslog.c1085
-rw-r--r--net/netfilter/nf_nat_amanda.c80
-rw-r--r--net/netfilter/nf_nat_bpf.c77
-rw-r--r--net/netfilter/nf_nat_core.c1385
-rw-r--r--net/netfilter/nf_nat_ftp.c138
-rw-r--r--net/netfilter/nf_nat_helper.c231
-rw-r--r--net/netfilter/nf_nat_irc.c110
-rw-r--r--net/netfilter/nf_nat_masquerade.c368
-rw-r--r--net/netfilter/nf_nat_ovs.c136
-rw-r--r--net/netfilter/nf_nat_proto.c1119
-rw-r--r--net/netfilter/nf_nat_redirect.c138
-rw-r--r--net/netfilter/nf_nat_sip.c676
-rw-r--r--net/netfilter/nf_nat_tftp.c56
-rw-r--r--net/netfilter/nf_queue.c458
-rw-r--r--net/netfilter/nf_sockopt.c169
-rw-r--r--net/netfilter/nf_synproxy_core.c1220
-rw-r--r--net/netfilter/nf_sysctl.c134
-rw-r--r--net/netfilter/nf_tables_api.c12283
-rw-r--r--net/netfilter/nf_tables_core.c418
-rw-r--r--net/netfilter/nf_tables_offload.c699
-rw-r--r--net/netfilter/nf_tables_trace.c378
-rw-r--r--net/netfilter/nfnetlink.c928
-rw-r--r--net/netfilter/nfnetlink_acct.c560
-rw-r--r--net/netfilter/nfnetlink_cthelper.c804
-rw-r--r--net/netfilter/nfnetlink_cttimeout.c675
-rw-r--r--net/netfilter/nfnetlink_hook.c486
-rw-r--r--net/netfilter/nfnetlink_log.c1163
-rw-r--r--net/netfilter/nfnetlink_osf.c451
-rw-r--r--net/netfilter/nfnetlink_queue.c1965
-rw-r--r--net/netfilter/nft_bitwise.c647
-rw-r--r--net/netfilter/nft_byteorder.c198
-rw-r--r--net/netfilter/nft_chain_filter.c488
-rw-r--r--net/netfilter/nft_chain_nat.c149
-rw-r--r--net/netfilter/nft_chain_route.c169
-rw-r--r--net/netfilter/nft_cmp.c436
-rw-r--r--net/netfilter/nft_compat.c986
-rw-r--r--net/netfilter/nft_connlimit.c304
-rw-r--r--net/netfilter/nft_counter.c316
-rw-r--r--net/netfilter/nft_ct.c1472
-rw-r--r--net/netfilter/nft_ct_fast.c62
-rw-r--r--net/netfilter/nft_dup_netdev.c113
-rw-r--r--net/netfilter/nft_dynset.c433
-rw-r--r--net/netfilter/nft_exthdr.c848
-rw-r--r--net/netfilter/nft_fib.c209
-rw-r--r--net/netfilter/nft_fib_inet.c80
-rw-r--r--net/netfilter/nft_fib_netdev.c89
-rw-r--r--net/netfilter/nft_flow_offload.c287
-rw-r--r--net/netfilter/nft_fwd_netdev.c273
-rw-r--r--net/netfilter/nft_hash.c287
-rw-r--r--net/netfilter/nft_immediate.c355
-rw-r--r--net/netfilter/nft_inner.c431
-rw-r--r--net/netfilter/nft_last.c138
-rw-r--r--net/netfilter/nft_limit.c484
-rw-r--r--net/netfilter/nft_log.c321
-rw-r--r--net/netfilter/nft_lookup.c299
-rw-r--r--net/netfilter/nft_masq.c288
-rw-r--r--net/netfilter/nft_meta.c1012
-rw-r--r--net/netfilter/nft_nat.c406
-rw-r--r--net/netfilter/nft_numgen.c257
-rw-r--r--net/netfilter/nft_objref.c282
-rw-r--r--net/netfilter/nft_osf.c189
-rw-r--r--net/netfilter/nft_payload.c1098
-rw-r--r--net/netfilter/nft_queue.c249
-rw-r--r--net/netfilter/nft_quota.c311
-rw-r--r--net/netfilter/nft_range.c150
-rw-r--r--net/netfilter/nft_redir.c270
-rw-r--r--net/netfilter/nft_reject.c133
-rw-r--r--net/netfilter/nft_reject_inet.c110
-rw-r--r--net/netfilter/nft_reject_netdev.c190
-rw-r--r--net/netfilter/nft_rt.c207
-rw-r--r--net/netfilter/nft_set_bitmap.c321
-rw-r--r--net/netfilter/nft_set_hash.c897
-rw-r--r--net/netfilter/nft_set_pipapo.c2400
-rw-r--r--net/netfilter/nft_set_pipapo.h302
-rw-r--r--net/netfilter/nft_set_pipapo_avx2.c1302
-rw-r--r--net/netfilter/nft_set_pipapo_avx2.h16
-rw-r--r--net/netfilter/nft_set_rbtree.c832
-rw-r--r--net/netfilter/nft_socket.c326
-rw-r--r--net/netfilter/nft_synproxy.c397
-rw-r--r--net/netfilter/nft_tproxy.c362
-rw-r--r--net/netfilter/nft_tunnel.c755
-rw-r--r--net/netfilter/nft_xfrm.c324
-rw-r--r--net/netfilter/utils.c232
-rw-r--r--net/netfilter/x_tables.c1782
-rw-r--r--net/netfilter/xt_AUDIT.c158
-rw-r--r--net/netfilter/xt_CHECKSUM.c100
-rw-r--r--net/netfilter/xt_CLASSIFY.c86
-rw-r--r--net/netfilter/xt_CONNMARK.c183
-rw-r--r--net/netfilter/xt_CONNSECMARK.c127
-rw-r--r--net/netfilter/xt_CT.c437
-rw-r--r--net/netfilter/xt_DSCP.c145
-rw-r--r--net/netfilter/xt_HL.c159
-rw-r--r--net/netfilter/xt_HMARK.c368
-rw-r--r--net/netfilter/xt_IDLETIMER.c565
-rw-r--r--net/netfilter/xt_LED.c219
-rw-r--r--net/netfilter/xt_LOG.c119
-rw-r--r--net/netfilter/xt_MARK.c189
-rw-r--r--net/netfilter/xt_MASQUERADE.c128
-rw-r--r--net/netfilter/xt_NETMAP.c169
-rw-r--r--net/netfilter/xt_NFLOG.c104
-rw-r--r--net/netfilter/xt_NFQUEUE.c141
-rw-r--r--net/netfilter/xt_NOTRACK.c65
-rw-r--r--net/netfilter/xt_RATEEST.c248
-rw-r--r--net/netfilter/xt_REDIRECT.c124
-rw-r--r--net/netfilter/xt_SECMARK.c201
-rw-r--r--net/netfilter/xt_TCPMSS.c345
-rw-r--r--net/netfilter/xt_TCPOPTSTRIP.c153
-rw-r--r--net/netfilter/xt_TEE.c231
-rw-r--r--net/netfilter/xt_TPROXY.c269
-rw-r--r--net/netfilter/xt_TRACE.c69
-rw-r--r--net/netfilter/xt_addrtype.c243
-rw-r--r--net/netfilter/xt_bpf.c153
-rw-r--r--net/netfilter/xt_cgroup.c245
-rw-r--r--net/netfilter/xt_cluster.c188
-rw-r--r--net/netfilter/xt_comment.c51
-rw-r--r--net/netfilter/xt_connbytes.c194
-rw-r--r--net/netfilter/xt_connlabel.c102
-rw-r--r--net/netfilter/xt_connlimit.c155
-rw-r--r--net/netfilter/xt_connmark.c306
-rw-r--r--net/netfilter/xt_conntrack.c493
-rw-r--r--net/netfilter/xt_cpu.c61
-rw-r--r--net/netfilter/xt_dccp.c144
-rw-r--r--net/netfilter/xt_devgroup.c79
-rw-r--r--net/netfilter/xt_dscp.c113
-rw-r--r--net/netfilter/xt_ecn.c176
-rw-r--r--net/netfilter/xt_esp.c96
-rw-r--r--net/netfilter/xt_hashlimit.c1322
-rw-r--r--net/netfilter/xt_helper.c198
-rw-r--r--net/netfilter/xt_hl.c93
-rw-r--r--net/netfilter/xt_ipcomp.c109
-rw-r--r--net/netfilter/xt_iprange.c137
-rw-r--r--net/netfilter/xt_ipvs.c191
-rw-r--r--net/netfilter/xt_l2tp.c355
-rw-r--r--net/netfilter/xt_length.c64
-rw-r--r--net/netfilter/xt_limit.c194
-rw-r--r--net/netfilter/xt_mac.c81
-rw-r--r--net/netfilter/xt_mark.c155
-rw-r--r--net/netfilter/xt_multiport.c259
-rw-r--r--net/netfilter/xt_nat.c247
-rw-r--r--net/netfilter/xt_nfacct.c93
-rw-r--r--net/netfilter/xt_osf.c73
-rw-r--r--net/netfilter/xt_owner.c158
-rw-r--r--net/netfilter/xt_physdev.c170
-rw-r--r--net/netfilter/xt_pkttype.c68
-rw-r--r--net/netfilter/xt_policy.c133
-rw-r--r--net/netfilter/xt_quota.c103
-rw-r--r--net/netfilter/xt_rateest.c153
-rw-r--r--net/netfilter/xt_realm.c48
-rw-r--r--net/netfilter/xt_recent.c763
-rw-r--r--net/netfilter/xt_repldata.h48
-rw-r--r--net/netfilter/xt_sctp.c210
-rw-r--r--net/netfilter/xt_set.c712
-rw-r--r--net/netfilter/xt_socket.c336
-rw-r--r--net/netfilter/xt_state.c97
-rw-r--r--net/netfilter/xt_statistic.c110
-rw-r--r--net/netfilter/xt_string.c107
-rw-r--r--net/netfilter/xt_tcpmss.c62
-rw-r--r--net/netfilter/xt_tcpudp.c305
-rw-r--r--net/netfilter/xt_time.c300
-rw-r--r--net/netfilter/xt_u32.c145
-rw-r--r--net/netlabel/Kconfig6
-rw-r--r--net/netlabel/Makefile8
-rw-r--r--net/netlabel/netlabel_addrlist.c369
-rw-r--r--net/netlabel/netlabel_addrlist.h194
-rw-r--r--net/netlabel/netlabel_calipso.c735
-rw-r--r--net/netlabel/netlabel_calipso.h137
-rw-r--r--net/netlabel/netlabel_cipso_v4.c424
-rw-r--r--net/netlabel/netlabel_cipso_v4.h30
-rw-r--r--net/netlabel/netlabel_domainhash.c874
-rw-r--r--net/netlabel/netlabel_domainhash.h85
-rw-r--r--net/netlabel/netlabel_kapi.c1447
-rw-r--r--net/netlabel/netlabel_mgmt.c684
-rw-r--r--net/netlabel/netlabel_mgmt.h108
-rw-r--r--net/netlabel/netlabel_unlabeled.c1411
-rw-r--r--net/netlabel/netlabel_unlabeled.h180
-rw-r--r--net/netlabel/netlabel_user.c49
-rw-r--r--net/netlabel/netlabel_user.h28
-rw-r--r--net/netlink/Kconfig11
-rw-r--r--net/netlink/Makefile6
-rw-r--r--net/netlink/af_netlink.c2774
-rw-r--r--net/netlink/af_netlink.h79
-rw-r--r--net/netlink/attr.c431
-rw-r--r--net/netlink/diag.c263
-rw-r--r--net/netlink/genetlink.c2036
-rw-r--r--net/netlink/genetlink.h11
-rw-r--r--net/netlink/policy.c493
-rw-r--r--net/netrom/Makefile1
-rw-r--r--net/netrom/af_netrom.c509
-rw-r--r--net/netrom/nr_dev.c115
-rw-r--r--net/netrom/nr_in.c25
-rw-r--r--net/netrom/nr_loopback.c23
-rw-r--r--net/netrom/nr_out.c20
-rw-r--r--net/netrom/nr_route.c337
-rw-r--r--net/netrom/nr_subr.c28
-rw-r--r--net/netrom/nr_timer.c109
-rw-r--r--net/netrom/sysctl_net_netrom.c109
-rw-r--r--net/nfc/Kconfig34
-rw-r--r--net/nfc/Makefile14
-rw-r--r--net/nfc/af_nfc.c88
-rw-r--r--net/nfc/core.c1247
-rw-r--r--net/nfc/digital.h171
-rw-r--r--net/nfc/digital_core.c862
-rw-r--r--net/nfc/digital_dep.c1633
-rw-r--r--net/nfc/digital_technology.c1300
-rw-r--r--net/nfc/hci/Kconfig18
-rw-r--r--net/nfc/hci/Makefile9
-rw-r--r--net/nfc/hci/command.c344
-rw-r--r--net/nfc/hci/core.c1105
-rw-r--r--net/nfc/hci/hci.h120
-rw-r--r--net/nfc/hci/hcp.c136
-rw-r--r--net/nfc/hci/llc.c141
-rw-r--r--net/nfc/hci/llc.h55
-rw-r--r--net/nfc/hci/llc_nop.c86
-rw-r--r--net/nfc/hci/llc_shdlc.c818
-rw-r--r--net/nfc/llcp.h252
-rw-r--r--net/nfc/llcp_commands.c815
-rw-r--r--net/nfc/llcp_core.c1697
-rw-r--r--net/nfc/llcp_sock.c1066
-rw-r--r--net/nfc/nci/Kconfig29
-rw-r--r--net/nfc/nci/Makefile14
-rw-r--r--net/nfc/nci/core.c1614
-rw-r--r--net/nfc/nci/data.c301
-rw-r--r--net/nfc/nci/hci.c795
-rw-r--r--net/nfc/nci/lib.c73
-rw-r--r--net/nfc/nci/ntf.c917
-rw-r--r--net/nfc/nci/rsp.c428
-rw-r--r--net/nfc/nci/spi.c323
-rw-r--r--net/nfc/nci/uart.c454
-rw-r--r--net/nfc/netlink.c1931
-rw-r--r--net/nfc/nfc.h151
-rw-r--r--net/nfc/rawsock.c421
-rw-r--r--net/nonet.c25
-rw-r--r--net/nsh/Kconfig10
-rw-r--r--net/nsh/Makefile2
-rw-r--r--net/nsh/nsh.c153
-rw-r--r--net/openvswitch/Kconfig77
-rw-r--r--net/openvswitch/Makefile29
-rw-r--r--net/openvswitch/actions.c1597
-rw-r--r--net/openvswitch/conntrack.c2031
-rw-r--r--net/openvswitch/conntrack.h106
-rw-r--r--net/openvswitch/datapath.c2862
-rw-r--r--net/openvswitch/datapath.h346
-rw-r--r--net/openvswitch/dp_notify.c86
-rw-r--r--net/openvswitch/drop.h41
-rw-r--r--net/openvswitch/flow.c1116
-rw-r--r--net/openvswitch/flow.h298
-rw-r--r--net/openvswitch/flow_netlink.c3794
-rw-r--r--net/openvswitch/flow_netlink.h71
-rw-r--r--net/openvswitch/flow_table.c1219
-rw-r--r--net/openvswitch/flow_table.h115
-rw-r--r--net/openvswitch/meter.c766
-rw-r--r--net/openvswitch/meter.h62
-rw-r--r--net/openvswitch/openvswitch_trace.c10
-rw-r--r--net/openvswitch/openvswitch_trace.h158
-rw-r--r--net/openvswitch/vport-geneve.c140
-rw-r--r--net/openvswitch/vport-gre.c103
-rw-r--r--net/openvswitch/vport-internal_dev.c247
-rw-r--r--net/openvswitch/vport-internal_dev.h17
-rw-r--r--net/openvswitch/vport-netdev.c216
-rw-r--r--net/openvswitch/vport-netdev.h23
-rw-r--r--net/openvswitch/vport-vxlan.c169
-rw-r--r--net/openvswitch/vport.c582
-rw-r--r--net/openvswitch/vport.h218
-rw-r--r--net/packet/Kconfig15
-rw-r--r--net/packet/Makefile3
-rw-r--r--net/packet/af_packet.c4548
-rw-r--r--net/packet/diag.c267
-rw-r--r--net/packet/internal.h159
-rw-r--r--net/phonet/Kconfig17
-rw-r--r--net/phonet/Makefile12
-rw-r--r--net/phonet/af_phonet.c540
-rw-r--r--net/phonet/datagram.c196
-rw-r--r--net/phonet/pep-gprs.c310
-rw-r--r--net/phonet/pep.c1389
-rw-r--r--net/phonet/pn_dev.c441
-rw-r--r--net/phonet/pn_netlink.c336
-rw-r--r--net/phonet/socket.c772
-rw-r--r--net/phonet/sysctl.c95
-rw-r--r--net/psample/Kconfig15
-rw-r--r--net/psample/Makefile6
-rw-r--r--net/psample/psample.c533
-rw-r--r--net/psp/Kconfig15
-rw-r--r--net/psp/Makefile5
-rw-r--r--net/psp/psp-nl-gen.c139
-rw-r--r--net/psp/psp-nl-gen.h42
-rw-r--r--net/psp/psp.h54
-rw-r--r--net/psp/psp_main.c323
-rw-r--r--net/psp/psp_nl.c598
-rw-r--r--net/psp/psp_sock.c294
-rw-r--r--net/qrtr/Kconfig38
-rw-r--r--net/qrtr/Makefile10
-rw-r--r--net/qrtr/af_qrtr.c1328
-rw-r--r--net/qrtr/mhi.c183
-rw-r--r--net/qrtr/ns.c778
-rw-r--r--net/qrtr/qrtr.h36
-rw-r--r--net/qrtr/smd.c111
-rw-r--r--net/qrtr/tun.c176
-rw-r--r--net/rds/Kconfig37
-rw-r--r--net/rds/Makefile22
-rw-r--r--net/rds/af_rds.c962
-rw-r--r--net/rds/bind.c283
-rw-r--r--net/rds/cong.c428
-rw-r--r--net/rds/connection.c945
-rw-r--r--net/rds/ib.c607
-rw-r--r--net/rds/ib.h454
-rw-r--r--net/rds/ib_cm.c1287
-rw-r--r--net/rds/ib_frmr.c450
-rw-r--r--net/rds/ib_mr.h142
-rw-r--r--net/rds/ib_rdma.c698
-rw-r--r--net/rds/ib_recv.c1095
-rw-r--r--net/rds/ib_ring.c168
-rw-r--r--net/rds/ib_send.c1017
-rw-r--r--net/rds/ib_stats.c107
-rw-r--r--net/rds/ib_sysctl.c120
-rw-r--r--net/rds/info.c242
-rw-r--r--net/rds/info.h31
-rw-r--r--net/rds/loop.c254
-rw-r--r--net/rds/loop.h12
-rw-r--r--net/rds/message.c519
-rw-r--r--net/rds/page.c170
-rw-r--r--net/rds/rdma.c961
-rw-r--r--net/rds/rdma_transport.c322
-rw-r--r--net/rds/rdma_transport.h30
-rw-r--r--net/rds/rds.h1016
-rw-r--r--net/rds/rds_single_path.h31
-rw-r--r--net/rds/recv.c841
-rw-r--r--net/rds/send.c1509
-rw-r--r--net/rds/stats.c154
-rw-r--r--net/rds/sysctl.c109
-rw-r--r--net/rds/tcp.c776
-rw-r--r--net/rds/tcp.h97
-rw-r--r--net/rds/tcp_connect.c229
-rw-r--r--net/rds/tcp_listen.c334
-rw-r--r--net/rds/tcp_recv.c349
-rw-r--r--net/rds/tcp_send.c226
-rw-r--r--net/rds/tcp_stats.c74
-rw-r--r--net/rds/threads.c311
-rw-r--r--net/rds/transport.c169
-rw-r--r--net/rfkill/Kconfig34
-rw-r--r--net/rfkill/Makefile9
-rw-r--r--net/rfkill/core.c1458
-rw-r--r--net/rfkill/input.c343
-rw-r--r--net/rfkill/rfkill-gpio.c221
-rw-r--r--net/rfkill/rfkill.h23
-rw-r--r--net/rose/Makefile1
-rw-r--r--net/rose/af_rose.c604
-rw-r--r--net/rose/rose_dev.c120
-rw-r--r--net/rose/rose_in.c27
-rw-r--r--net/rose/rose_link.c65
-rw-r--r--net/rose/rose_loopback.c82
-rw-r--r--net/rose/rose_out.c8
-rw-r--r--net/rose/rose_route.c391
-rw-r--r--net/rose/rose_subr.c130
-rw-r--r--net/rose/rose_timer.c96
-rw-r--r--net/rose/sysctl_net_rose.c92
-rw-r--r--net/rxrpc/Kconfig100
-rw-r--r--net/rxrpc/Makefile61
-rw-r--r--net/rxrpc/af_rxrpc.c1148
-rw-r--r--net/rxrpc/ar-internal.h1695
-rw-r--r--net/rxrpc/call.c2277
-rw-r--r--net/rxrpc/call_accept.c483
-rw-r--r--net/rxrpc/call_event.c497
-rw-r--r--net/rxrpc/call_object.c778
-rw-r--r--net/rxrpc/call_state.c69
-rw-r--r--net/rxrpc/conn_client.c833
-rw-r--r--net/rxrpc/conn_event.c572
-rw-r--r--net/rxrpc/conn_object.c495
-rw-r--r--net/rxrpc/conn_service.c195
-rw-r--r--net/rxrpc/connection.c777
-rw-r--r--net/rxrpc/input.c1299
-rw-r--r--net/rxrpc/input_rack.c418
-rw-r--r--net/rxrpc/insecure.c96
-rw-r--r--net/rxrpc/internal.h106
-rw-r--r--net/rxrpc/io_thread.c602
-rw-r--r--net/rxrpc/key.c886
-rw-r--r--net/rxrpc/krxiod.c261
-rw-r--r--net/rxrpc/krxsecd.c269
-rw-r--r--net/rxrpc/krxtimod.c203
-rw-r--r--net/rxrpc/local_event.c84
-rw-r--r--net/rxrpc/local_object.c486
-rw-r--r--net/rxrpc/main.c180
-rw-r--r--net/rxrpc/misc.c62
-rw-r--r--net/rxrpc/net_ns.c124
-rw-r--r--net/rxrpc/oob.c379
-rw-r--r--net/rxrpc/output.c983
-rw-r--r--net/rxrpc/peer.c398
-rw-r--r--net/rxrpc/peer_event.c443
-rw-r--r--net/rxrpc/peer_object.c560
-rw-r--r--net/rxrpc/proc.c1005
-rw-r--r--net/rxrpc/protocol.h204
-rw-r--r--net/rxrpc/recvmsg.c658
-rw-r--r--net/rxrpc/rtt.c208
-rw-r--r--net/rxrpc/rxgk.c1373
-rw-r--r--net/rxrpc/rxgk_app.c297
-rw-r--r--net/rxrpc/rxgk_common.h149
-rw-r--r--net/rxrpc/rxgk_kdf.c288
-rw-r--r--net/rxrpc/rxkad.c1358
-rw-r--r--net/rxrpc/rxperf.c703
-rw-r--r--net/rxrpc/rxrpc_syms.c34
-rw-r--r--net/rxrpc/security.c210
-rw-r--r--net/rxrpc/sendmsg.c876
-rw-r--r--net/rxrpc/server_key.c213
-rw-r--r--net/rxrpc/skbuff.c83
-rw-r--r--net/rxrpc/sysctl.c210
-rw-r--r--net/rxrpc/transport.c847
-rw-r--r--net/rxrpc/txbuf.c96
-rw-r--r--net/rxrpc/utils.c44
-rw-r--r--net/sched/Kconfig880
-rw-r--r--net/sched/Makefile63
-rw-r--r--net/sched/act_api.c2535
-rw-r--r--net/sched/act_bpf.c438
-rw-r--r--net/sched/act_connmark.c284
-rw-r--r--net/sched/act_csum.c747
-rw-r--r--net/sched/act_ct.c1698
-rw-r--r--net/sched/act_ctinfo.c403
-rw-r--r--net/sched/act_gact.c346
-rw-r--r--net/sched/act_gate.c676
-rw-r--r--net/sched/act_ife.c930
-rw-r--r--net/sched/act_ipt.c313
-rw-r--r--net/sched/act_meta_mark.c73
-rw-r--r--net/sched/act_meta_skbprio.c71
-rw-r--r--net/sched/act_meta_skbtcindex.c73
-rw-r--r--net/sched/act_mirred.c769
-rw-r--r--net/sched/act_mpls.c489
-rw-r--r--net/sched/act_nat.c361
-rw-r--r--net/sched/act_pedit.c714
-rw-r--r--net/sched/act_police.c932
-rw-r--r--net/sched/act_sample.c367
-rw-r--r--net/sched/act_simple.c287
-rw-r--r--net/sched/act_skbedit.c463
-rw-r--r--net/sched/act_skbmod.c324
-rw-r--r--net/sched/act_tunnel_key.c879
-rw-r--r--net/sched/act_vlan.c465
-rw-r--r--net/sched/bpf_qdisc.c472
-rw-r--r--net/sched/cls_api.c4265
-rw-r--r--net/sched/cls_basic.c353
-rw-r--r--net/sched/cls_bpf.c709
-rw-r--r--net/sched/cls_cgroup.c227
-rw-r--r--net/sched/cls_flow.c723
-rw-r--r--net/sched/cls_flower.c3874
-rw-r--r--net/sched/cls_fw.c453
-rw-r--r--net/sched/cls_matchall.c420
-rw-r--r--net/sched/cls_route.c649
-rw-r--r--net/sched/cls_rsvp.c43
-rw-r--r--net/sched/cls_rsvp.h663
-rw-r--r--net/sched/cls_rsvp6.c44
-rw-r--r--net/sched/cls_tcindex.c529
-rw-r--r--net/sched/cls_u32.c1474
-rw-r--r--net/sched/em_canid.c234
-rw-r--r--net/sched/em_cmp.c77
-rw-r--r--net/sched/em_ipset.c134
-rw-r--r--net/sched/em_ipt.c297
-rw-r--r--net/sched/em_meta.c477
-rw-r--r--net/sched/em_nbyte.c28
-rw-r--r--net/sched/em_text.c52
-rw-r--r--net/sched/em_u32.c19
-rw-r--r--net/sched/ematch.c225
-rw-r--r--net/sched/sch_api.c2524
-rw-r--r--net/sched/sch_atm.c735
-rw-r--r--net/sched/sch_blackhole.c32
-rw-r--r--net/sched/sch_cake.c3153
-rw-r--r--net/sched/sch_cbq.c2125
-rw-r--r--net/sched/sch_cbs.c578
-rw-r--r--net/sched/sch_choke.c518
-rw-r--r--net/sched/sch_codel.c287
-rw-r--r--net/sched/sch_drr.c503
-rw-r--r--net/sched/sch_dsmark.c516
-rw-r--r--net/sched/sch_dualpi2.c1177
-rw-r--r--net/sched/sch_etf.c517
-rw-r--r--net/sched/sch_ets.c839
-rw-r--r--net/sched/sch_fifo.c249
-rw-r--r--net/sched/sch_fq.c1366
-rw-r--r--net/sched/sch_fq_codel.c753
-rw-r--r--net/sched/sch_fq_pie.c595
-rw-r--r--net/sched/sch_frag.c160
-rw-r--r--net/sched/sch_generic.c1708
-rw-r--r--net/sched/sch_gred.c731
-rw-r--r--net/sched/sch_hfsc.c890
-rw-r--r--net/sched/sch_hhf.c731
-rw-r--r--net/sched/sch_htb.c2051
-rw-r--r--net/sched/sch_ingress.c611
-rw-r--r--net/sched/sch_mq.c275
-rw-r--r--net/sched/sch_mqprio.c791
-rw-r--r--net/sched/sch_mqprio_lib.c132
-rw-r--r--net/sched/sch_mqprio_lib.h20
-rw-r--r--net/sched/sch_multiq.c414
-rw-r--r--net/sched/sch_netem.c1369
-rw-r--r--net/sched/sch_pie.c585
-rw-r--r--net/sched/sch_plug.c230
-rw-r--r--net/sched/sch_prio.c398
-rw-r--r--net/sched/sch_qfq.c1553
-rw-r--r--net/sched/sch_red.c528
-rw-r--r--net/sched/sch_sfb.c732
-rw-r--r--net/sched/sch_sfq.c1029
-rw-r--r--net/sched/sch_skbprio.c310
-rw-r--r--net/sched/sch_taprio.c2574
-rw-r--r--net/sched/sch_tbf.c583
-rw-r--r--net/sched/sch_teql.c317
-rw-r--r--net/sctp/Kconfig83
-rw-r--r--net/sctp/Makefile12
-rw-r--r--net/sctp/associola.c1266
-rw-r--r--net/sctp/auth.c981
-rw-r--r--net/sctp/bind_addr.c366
-rw-r--r--net/sctp/chunk.c349
-rw-r--r--net/sctp/command.c81
-rw-r--r--net/sctp/crc32c.c220
-rw-r--r--net/sctp/debug.c105
-rw-r--r--net/sctp/diag.c543
-rw-r--r--net/sctp/endpointola.c324
-rw-r--r--net/sctp/input.c1236
-rw-r--r--net/sctp/inqueue.c205
-rw-r--r--net/sctp/ipv6.c1098
-rw-r--r--net/sctp/objcnt.c131
-rw-r--r--net/sctp/offload.c120
-rw-r--r--net/sctp/output.c1032
-rw-r--r--net/sctp/outqueue.c1652
-rw-r--r--net/sctp/primitive.c46
-rw-r--r--net/sctp/proc.c415
-rw-r--r--net/sctp/protocol.c1497
-rw-r--r--net/sctp/sm_make_chunk.c2704
-rw-r--r--net/sctp/sm_sideeffect.c1268
-rw-r--r--net/sctp/sm_statefuns.c3938
-rw-r--r--net/sctp/sm_statetable.c407
-rw-r--r--net/sctp/socket.c7979
-rw-r--r--net/sctp/ssnmap.c132
-rw-r--r--net/sctp/stream.c1087
-rw-r--r--net/sctp/stream_interleave.c1353
-rw-r--r--net/sctp/stream_sched.c280
-rw-r--r--net/sctp/stream_sched_fc.c225
-rw-r--r--net/sctp/stream_sched_prio.c319
-rw-r--r--net/sctp/stream_sched_rr.c190
-rw-r--r--net/sctp/sysctl.c679
-rw-r--r--net/sctp/transport.c724
-rw-r--r--net/sctp/tsnmap.c396
-rw-r--r--net/sctp/ulpevent.c612
-rw-r--r--net/sctp/ulpqueue.c685
-rw-r--r--net/shaper/Makefile8
-rw-r--r--net/shaper/shaper.c1438
-rw-r--r--net/shaper/shaper_nl_gen.c155
-rw-r--r--net/shaper/shaper_nl_gen.h45
-rw-r--r--net/smc/Kconfig31
-rw-r--r--net/smc/Makefile9
-rw-r--r--net/smc/af_smc.c3681
-rw-r--r--net/smc/smc.h429
-rw-r--r--net/smc/smc_cdc.c525
-rw-r--r--net/smc/smc_cdc.h305
-rw-r--r--net/smc/smc_clc.c1370
-rw-r--r--net/smc/smc_clc.h477
-rw-r--r--net/smc/smc_close.c506
-rw-r--r--net/smc/smc_close.h30
-rw-r--r--net/smc/smc_core.c2782
-rw-r--r--net/smc/smc_core.h629
-rw-r--r--net/smc/smc_diag.c278
-rw-r--r--net/smc/smc_hs_bpf.c140
-rw-r--r--net/smc/smc_hs_bpf.h31
-rw-r--r--net/smc/smc_ib.c1025
-rw-r--r--net/smc/smc_ib.h119
-rw-r--r--net/smc/smc_inet.c163
-rw-r--r--net/smc/smc_inet.h22
-rw-r--r--net/smc/smc_ism.c651
-rw-r--r--net/smc/smc_ism.h123
-rw-r--r--net/smc/smc_llc.c2376
-rw-r--r--net/smc/smc_llc.h120
-rw-r--r--net/smc/smc_netlink.c157
-rw-r--r--net/smc/smc_netlink.h34
-rw-r--r--net/smc/smc_netns.h21
-rw-r--r--net/smc/smc_pnet.c1223
-rw-r--r--net/smc/smc_pnet.h70
-rw-r--r--net/smc/smc_rx.c520
-rw-r--r--net/smc/smc_rx.h31
-rw-r--r--net/smc/smc_stats.c419
-rw-r--r--net/smc/smc_stats.h280
-rw-r--r--net/smc/smc_sysctl.c266
-rw-r--r--net/smc/smc_sysctl.h37
-rw-r--r--net/smc/smc_tracepoint.c9
-rw-r--r--net/smc/smc_tracepoint.h125
-rw-r--r--net/smc/smc_tx.c737
-rw-r--r--net/smc/smc_tx.h40
-rw-r--r--net/smc/smc_wr.c944
-rw-r--r--net/smc/smc_wr.h137
-rw-r--r--net/socket.c3622
-rw-r--r--net/strparser/Kconfig3
-rw-r--r--net/strparser/Makefile2
-rw-r--r--net/strparser/strparser.c539
-rw-r--r--net/sunrpc/.kunitconfig29
-rw-r--r--net/sunrpc/Kconfig131
-rw-r--r--net/sunrpc/Makefile12
-rw-r--r--net/sunrpc/addr.c354
-rw-r--r--net/sunrpc/auth.c901
-rw-r--r--net/sunrpc/auth_gss/Makefile15
-rw-r--r--net/sunrpc/auth_gss/auth_gss.c2242
-rw-r--r--net/sunrpc/auth_gss/auth_gss_internal.h47
-rw-r--r--net/sunrpc/auth_gss/gss_generic_token.c235
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_crypto.c1000
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_internal.h195
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_keys.c546
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_mech.c694
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_seal.c140
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_seqnum.c88
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_test.c1859
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_unseal.c155
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_wrap.c521
-rw-r--r--net/sunrpc/auth_gss/gss_mech_switch.c283
-rw-r--r--net/sunrpc/auth_gss/gss_rpc_upcall.c403
-rw-r--r--net/sunrpc/auth_gss/gss_rpc_upcall.h36
-rw-r--r--net/sunrpc/auth_gss/gss_rpc_xdr.c849
-rw-r--r--net/sunrpc/auth_gss/gss_rpc_xdr.h252
-rw-r--r--net/sunrpc/auth_gss/gss_spkm3_mech.c300
-rw-r--r--net/sunrpc/auth_gss/gss_spkm3_seal.c127
-rw-r--r--net/sunrpc/auth_gss/gss_spkm3_token.c265
-rw-r--r--net/sunrpc/auth_gss/gss_spkm3_unseal.c126
-rw-r--r--net/sunrpc/auth_gss/svcauth_gss.c1958
-rw-r--r--net/sunrpc/auth_gss/trace.c14
-rw-r--r--net/sunrpc/auth_null.c88
-rw-r--r--net/sunrpc/auth_tls.c175
-rw-r--r--net/sunrpc/auth_unix.c280
-rw-r--r--net/sunrpc/backchannel_rqst.c371
-rw-r--r--net/sunrpc/cache.c1643
-rw-r--r--net/sunrpc/clnt.c3571
-rw-r--r--net/sunrpc/debugfs.c309
-rw-r--r--net/sunrpc/fail.h25
-rw-r--r--net/sunrpc/netns.h44
-rw-r--r--net/sunrpc/pmap_clnt.c378
-rw-r--r--net/sunrpc/rpc_pipe.c1401
-rw-r--r--net/sunrpc/rpcb_clnt.c1122
-rw-r--r--net/sunrpc/sched.c1488
-rw-r--r--net/sunrpc/socklib.c313
-rw-r--r--net/sunrpc/socklib.h15
-rw-r--r--net/sunrpc/stats.c276
-rw-r--r--net/sunrpc/sunrpc.h46
-rw-r--r--net/sunrpc/sunrpc_syms.c235
-rw-r--r--net/sunrpc/svc.c1800
-rw-r--r--net/sunrpc/svc_xprt.c1500
-rw-r--r--net/sunrpc/svcauth.c233
-rw-r--r--net/sunrpc/svcauth_unix.c913
-rw-r--r--net/sunrpc/svcsock.c2527
-rw-r--r--net/sunrpc/sysctl.c197
-rw-r--r--net/sunrpc/sysfs.c829
-rw-r--r--net/sunrpc/sysfs.h35
-rw-r--r--net/sunrpc/timer.c52
-rw-r--r--net/sunrpc/xdr.c2043
-rw-r--r--net/sunrpc/xprt.c2130
-rw-r--r--net/sunrpc/xprtmultipath.c672
-rw-r--r--net/sunrpc/xprtrdma/Makefile8
-rw-r--r--net/sunrpc/xprtrdma/backchannel.c280
-rw-r--r--net/sunrpc/xprtrdma/frwr_ops.c697
-rw-r--r--net/sunrpc/xprtrdma/ib_client.c184
-rw-r--r--net/sunrpc/xprtrdma/module.c64
-rw-r--r--net/sunrpc/xprtrdma/rpc_rdma.c1509
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma.c307
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_backchannel.c284
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_pcl.c306
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_recvfrom.c1020
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_rw.c1142
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_sendto.c1110
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_transport.c656
-rw-r--r--net/sunrpc/xprtrdma/transport.c795
-rw-r--r--net/sunrpc/xprtrdma/verbs.c1410
-rw-r--r--net/sunrpc/xprtrdma/xprt_rdma.h605
-rw-r--r--net/sunrpc/xprtsock.c3918
-rw-r--r--net/switchdev/Kconfig14
-rw-r--r--net/switchdev/Makefile6
-rw-r--r--net/switchdev/switchdev.c1064
-rw-r--r--net/sysctl_net.c179
-rw-r--r--net/tipc/Kconfig135
-rw-r--r--net/tipc/Makefile21
-rw-r--r--net/tipc/addr.c128
-rw-r--r--net/tipc/addr.h127
-rw-r--r--net/tipc/bcast.c1315
-rw-r--r--net/tipc/bcast.h240
-rw-r--r--net/tipc/bearer.c1650
-rw-r--r--net/tipc/bearer.h276
-rw-r--r--net/tipc/cluster.c576
-rw-r--r--net/tipc/cluster.h92
-rw-r--r--net/tipc/config.c702
-rw-r--r--net/tipc/config.h79
-rw-r--r--net/tipc/core.c359
-rw-r--r--net/tipc/core.h398
-rw-r--r--net/tipc/crypto.c2484
-rw-r--r--net/tipc/crypto.h200
-rw-r--r--net/tipc/dbg.c436
-rw-r--r--net/tipc/dbg.h70
-rw-r--r--net/tipc/diag.c118
-rw-r--r--net/tipc/discover.c564
-rw-r--r--net/tipc/discover.h27
-rw-r--r--net/tipc/eth_media.c302
-rw-r--r--net/tipc/group.c959
-rw-r--r--net/tipc/group.h77
-rw-r--r--net/tipc/handler.c132
-rw-r--r--net/tipc/ib_media.c104
-rw-r--r--net/tipc/link.c5337
-rw-r--r--net/tipc/link.h357
-rw-r--r--net/tipc/monitor.c875
-rw-r--r--net/tipc/monitor.h83
-rw-r--r--net/tipc/msg.c1054
-rw-r--r--net/tipc/msg.h1158
-rw-r--r--net/tipc/name_distr.c511
-rw-r--r--net/tipc/name_distr.h43
-rw-r--r--net/tipc/name_table.c1810
-rw-r--r--net/tipc/name_table.h161
-rw-r--r--net/tipc/net.c440
-rw-r--r--net/tipc/net.h36
-rw-r--r--net/tipc/netlink.c319
-rw-r--r--net/tipc/netlink.h (renamed from net/tipc/zone.h)57
-rw-r--r--net/tipc/netlink_compat.c1376
-rw-r--r--net/tipc/node.c3423
-rw-r--r--net/tipc/node.h185
-rw-r--r--net/tipc/node_subscr.c80
-rw-r--r--net/tipc/port.c1711
-rw-r--r--net/tipc/port.h209
-rw-r--r--net/tipc/ref.c210
-rw-r--r--net/tipc/ref.h131
-rw-r--r--net/tipc/socket.c4701
-rw-r--r--net/tipc/socket.h80
-rw-r--r--net/tipc/subscr.c629
-rw-r--r--net/tipc/subscr.h106
-rw-r--r--net/tipc/sysctl.c107
-rw-r--r--net/tipc/topsrv.c733
-rw-r--r--net/tipc/topsrv.h (renamed from net/tipc/user_reg.h)28
-rw-r--r--net/tipc/trace.c206
-rw-r--r--net/tipc/trace.h434
-rw-r--r--net/tipc/udp_media.c863
-rw-r--r--net/tipc/udp_media.h (renamed from net/tipc/node_subscr.h)47
-rw-r--r--net/tipc/user_reg.c264
-rw-r--r--net/tipc/zone.c173
-rw-r--r--net/tls/Kconfig40
-rw-r--r--net/tls/Makefile13
-rw-r--r--net/tls/tls.h382
-rw-r--r--net/tls/tls_device.c1443
-rw-r--r--net/tls/tls_device_fallback.c474
-rw-r--r--net/tls/tls_main.c1274
-rw-r--r--net/tls/tls_proc.c63
-rw-r--r--net/tls/tls_strp.c642
-rw-r--r--net/tls/tls_sw.c2910
-rw-r--r--net/tls/tls_toe.c141
-rw-r--r--net/tls/trace.c10
-rw-r--r--net/tls/trace.h202
-rw-r--r--net/unix/Kconfig23
-rw-r--r--net/unix/Makefile5
-rw-r--r--net/unix/af_unix.c3920
-rw-r--r--net/unix/af_unix.h70
-rw-r--r--net/unix/diag.c325
-rw-r--r--net/unix/garbage.c699
-rw-r--r--net/unix/sysctl_net_unix.c80
-rw-r--r--net/unix/unix_bpf.c202
-rw-r--r--net/vmw_vsock/Kconfig83
-rw-r--r--net/vmw_vsock/Makefile22
-rw-r--r--net/vmw_vsock/af_vsock.c2795
-rw-r--r--net/vmw_vsock/af_vsock_tap.c110
-rw-r--r--net/vmw_vsock/diag.c180
-rw-r--r--net/vmw_vsock/hyperv_transport.c966
-rw-r--r--net/vmw_vsock/virtio_transport.c963
-rw-r--r--net/vmw_vsock/virtio_transport_common.c1787
-rw-r--r--net/vmw_vsock/vmci_transport.c2160
-rw-r--r--net/vmw_vsock/vmci_transport.h130
-rw-r--r--net/vmw_vsock/vmci_transport_notify.c672
-rw-r--r--net/vmw_vsock/vmci_transport_notify.h75
-rw-r--r--net/vmw_vsock/vmci_transport_notify_qstate.c430
-rw-r--r--net/vmw_vsock/vsock_addr.c69
-rw-r--r--net/vmw_vsock/vsock_bpf.c175
-rw-r--r--net/vmw_vsock/vsock_loopback.c179
-rw-r--r--net/wanrouter/Kconfig29
-rw-r--r--net/wanrouter/Makefile7
-rw-r--r--net/wanrouter/af_wanpipe.c2600
-rw-r--r--net/wanrouter/patchlevel1
-rw-r--r--net/wanrouter/wanmain.c881
-rw-r--r--net/wanrouter/wanproc.c380
-rw-r--r--net/wireless/.gitignore3
-rw-r--r--net/wireless/Kconfig204
-rw-r--r--net/wireless/Makefile56
-rw-r--r--net/wireless/ap.c74
-rw-r--r--net/wireless/certs/sforshee.hex86
-rw-r--r--net/wireless/certs/wens.hex87
-rw-r--r--net/wireless/chan.c1609
-rw-r--r--net/wireless/core.c1922
-rw-r--r--net/wireless/core.h637
-rw-r--r--net/wireless/debugfs.c304
-rw-r--r--net/wireless/debugfs.h12
-rw-r--r--net/wireless/ethtool.c29
-rw-r--r--net/wireless/ibss.c496
-rw-r--r--net/wireless/mesh.c289
-rw-r--r--net/wireless/mlme.c1419
-rw-r--r--net/wireless/nl80211.c21921
-rw-r--r--net/wireless/nl80211.h130
-rw-r--r--net/wireless/ocb.c65
-rw-r--r--net/wireless/of.c138
-rw-r--r--net/wireless/pmsr.c668
-rw-r--r--net/wireless/radiotap.c370
-rw-r--r--net/wireless/rdev-ops.h1587
-rw-r--r--net/wireless/reg.c4372
-rw-r--r--net/wireless/reg.h197
-rw-r--r--net/wireless/scan.c4013
-rw-r--r--net/wireless/sme.c1622
-rw-r--r--net/wireless/sysfs.c181
-rw-r--r--net/wireless/sysfs.h10
-rw-r--r--net/wireless/tests/Makefile3
-rw-r--r--net/wireless/tests/chan.c228
-rw-r--r--net/wireless/tests/fragmentation.c177
-rw-r--r--net/wireless/tests/module.c10
-rw-r--r--net/wireless/tests/scan.c880
-rw-r--r--net/wireless/tests/util.c56
-rw-r--r--net/wireless/tests/util.h66
-rw-r--r--net/wireless/trace.c7
-rw-r--r--net/wireless/trace.h4231
-rw-r--r--net/wireless/util.c2986
-rw-r--r--net/wireless/wext-compat.c1520
-rw-r--r--net/wireless/wext-compat.h58
-rw-r--r--net/wireless/wext-core.c1215
-rw-r--r--net/wireless/wext-priv.c249
-rw-r--r--net/wireless/wext-proc.c142
-rw-r--r--net/wireless/wext-sme.c369
-rw-r--r--net/x25/Kconfig14
-rw-r--r--net/x25/Makefile3
-rw-r--r--net/x25/af_x25.c1016
-rw-r--r--net/x25/sysctl_net_x25.c64
-rw-r--r--net/x25/x25_dev.c150
-rw-r--r--net/x25/x25_facilities.c119
-rw-r--r--net/x25/x25_forward.c157
-rw-r--r--net/x25/x25_in.c232
-rw-r--r--net/x25/x25_link.c156
-rw-r--r--net/x25/x25_out.c38
-rw-r--r--net/x25/x25_proc.c201
-rw-r--r--net/x25/x25_route.c48
-rw-r--r--net/x25/x25_subr.c141
-rw-r--r--net/x25/x25_timer.c39
-rw-r--r--net/xdp/Kconfig16
-rw-r--r--net/xdp/Makefile4
-rw-r--r--net/xdp/xdp_umem.c271
-rw-r--r--net/xdp/xdp_umem.h15
-rw-r--r--net/xdp/xsk.c1959
-rw-r--r--net/xdp/xsk.h48
-rw-r--r--net/xdp/xsk_buff_pool.c755
-rw-r--r--net/xdp/xsk_diag.c216
-rw-r--r--net/xdp/xsk_queue.c66
-rw-r--r--net/xdp/xsk_queue.h497
-rw-r--r--net/xdp/xskmap.c281
-rw-r--r--net/xfrm/Kconfig149
-rw-r--r--net/xfrm/Makefile22
-rw-r--r--net/xfrm/espintcp.c589
-rw-r--r--net/xfrm/trace_iptfs.h218
-rw-r--r--net/xfrm/xfrm_algo.c911
-rw-r--r--net/xfrm/xfrm_compat.c688
-rw-r--r--net/xfrm/xfrm_device.c572
-rw-r--r--net/xfrm/xfrm_hash.c13
-rw-r--r--net/xfrm/xfrm_hash.h134
-rw-r--r--net/xfrm/xfrm_inout.h70
-rw-r--r--net/xfrm/xfrm_input.c812
-rw-r--r--net/xfrm/xfrm_interface_bpf.c110
-rw-r--r--net/xfrm/xfrm_interface_core.c1241
-rw-r--r--net/xfrm/xfrm_ipcomp.c364
-rw-r--r--net/xfrm/xfrm_iptfs.c2762
-rw-r--r--net/xfrm/xfrm_nat_keepalive.c302
-rw-r--r--net/xfrm/xfrm_output.c966
-rw-r--r--net/xfrm/xfrm_policy.c4789
-rw-r--r--net/xfrm/xfrm_proc.c80
-rw-r--r--net/xfrm/xfrm_replay.c799
-rw-r--r--net/xfrm/xfrm_state.c3363
-rw-r--r--net/xfrm/xfrm_state_bpf.c134
-rw-r--r--net/xfrm/xfrm_sysctl.c88
-rw-r--r--net/xfrm/xfrm_user.c3671
2291 files changed, 1142997 insertions, 279694 deletions
diff --git a/net/6lowpan/6lowpan_i.h b/net/6lowpan/6lowpan_i.h
new file mode 100644
index 000000000000..01853cec0209
--- /dev/null
+++ b/net/6lowpan/6lowpan_i.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __6LOWPAN_I_H
+#define __6LOWPAN_I_H
+
+#include <linux/netdevice.h>
+
+#include <net/6lowpan.h>
+
+/* caller need to be sure it's dev->type is ARPHRD_6LOWPAN */
+static inline bool lowpan_is_ll(const struct net_device *dev,
+ enum lowpan_lltypes lltype)
+{
+ return lowpan_dev(dev)->lltype == lltype;
+}
+
+extern const struct ndisc_ops lowpan_ndisc_ops;
+
+int addrconf_ifid_802154_6lowpan(u8 *eui, struct net_device *dev);
+
+#ifdef CONFIG_6LOWPAN_DEBUGFS
+void lowpan_dev_debugfs_init(struct net_device *dev);
+void lowpan_dev_debugfs_exit(struct net_device *dev);
+
+void __init lowpan_debugfs_init(void);
+void lowpan_debugfs_exit(void);
+#else
+static inline void lowpan_dev_debugfs_init(struct net_device *dev) { }
+static inline void lowpan_dev_debugfs_exit(struct net_device *dev) { }
+
+static inline void __init lowpan_debugfs_init(void) { }
+static inline void lowpan_debugfs_exit(void) { }
+#endif /* CONFIG_6LOWPAN_DEBUGFS */
+
+#endif /* __6LOWPAN_I_H */
diff --git a/net/6lowpan/Kconfig b/net/6lowpan/Kconfig
new file mode 100644
index 000000000000..d8fc459492b0
--- /dev/null
+++ b/net/6lowpan/Kconfig
@@ -0,0 +1,105 @@
+# SPDX-License-Identifier: GPL-2.0-only
+menuconfig 6LOWPAN
+ tristate "6LoWPAN Support"
+ depends on IPV6
+ help
+ This enables IPv6 over Low power Wireless Personal Area Network -
+ "6LoWPAN" which is supported by IEEE 802.15.4 or Bluetooth stacks.
+
+config 6LOWPAN_DEBUGFS
+ bool "6LoWPAN debugfs support"
+ depends on 6LOWPAN
+ depends on DEBUG_FS
+ help
+ This enables 6LoWPAN debugfs support. For example to manipulate
+ IPHC context information at runtime.
+
+menuconfig 6LOWPAN_NHC
+ tristate "Next Header and Generic Header Compression Support"
+ depends on 6LOWPAN
+ default y
+ help
+ Support for next header and generic header compression defined in
+ RFC6282 and RFC7400.
+
+if 6LOWPAN_NHC
+
+config 6LOWPAN_NHC_DEST
+ tristate "Destination Options Header Support"
+ default y
+ help
+ 6LoWPAN IPv6 Destination Options Header compression according to
+ RFC6282.
+
+config 6LOWPAN_NHC_FRAGMENT
+ tristate "Fragment Header Support"
+ default y
+ help
+ 6LoWPAN IPv6 Fragment Header compression according to RFC6282.
+
+config 6LOWPAN_NHC_HOP
+ tristate "Hop-by-Hop Options Header Support"
+ default y
+ help
+ 6LoWPAN IPv6 Hop-by-Hop Options Header compression according to
+ RFC6282.
+
+config 6LOWPAN_NHC_IPV6
+ tristate "IPv6 Header Support"
+ default y
+ help
+ 6LoWPAN IPv6 Header compression according to RFC6282.
+
+config 6LOWPAN_NHC_MOBILITY
+ tristate "Mobility Header Support"
+ default y
+ help
+ 6LoWPAN IPv6 Mobility Header compression according to RFC6282.
+
+config 6LOWPAN_NHC_ROUTING
+ tristate "Routing Header Support"
+ default y
+ help
+ 6LoWPAN IPv6 Routing Header compression according to RFC6282.
+
+config 6LOWPAN_NHC_UDP
+ tristate "UDP Header Support"
+ default y
+ help
+ 6LoWPAN IPv6 UDP Header compression according to RFC6282.
+
+config 6LOWPAN_GHC_EXT_HDR_HOP
+ tristate "GHC Hop-by-Hop Options Header Support"
+ help
+ 6LoWPAN IPv6 Hop-by-Hop option generic header compression according
+ to RFC7400.
+
+config 6LOWPAN_GHC_UDP
+ tristate "GHC UDP Support"
+ help
+ 6LoWPAN IPv6 UDP generic header compression according to RFC7400.
+
+config 6LOWPAN_GHC_ICMPV6
+ tristate "GHC ICMPv6 Support"
+ help
+ 6LoWPAN IPv6 ICMPv6 generic header compression according to RFC7400.
+
+config 6LOWPAN_GHC_EXT_HDR_DEST
+ tristate "GHC Destination Options Header Support"
+ help
+ 6LoWPAN IPv6 destination option generic header compression according
+ to RFC7400.
+
+config 6LOWPAN_GHC_EXT_HDR_FRAG
+ tristate "GHC Fragmentation Options Header Support"
+ help
+ 6LoWPAN IPv6 fragmentation option generic header compression
+ according to RFC7400.
+
+config 6LOWPAN_GHC_EXT_HDR_ROUTE
+ tristate "GHC Routing Options Header Support"
+ help
+ 6LoWPAN IPv6 routing option generic header compression according
+ to RFC7400.
+
+endif
diff --git a/net/6lowpan/Makefile b/net/6lowpan/Makefile
new file mode 100644
index 000000000000..2247b96dbc75
--- /dev/null
+++ b/net/6lowpan/Makefile
@@ -0,0 +1,22 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_6LOWPAN) += 6lowpan.o
+
+6lowpan-y := core.o iphc.o nhc.o ndisc.o
+6lowpan-$(CONFIG_6LOWPAN_DEBUGFS) += debugfs.o
+
+#rfc6282 nhcs
+obj-$(CONFIG_6LOWPAN_NHC_DEST) += nhc_dest.o
+obj-$(CONFIG_6LOWPAN_NHC_FRAGMENT) += nhc_fragment.o
+obj-$(CONFIG_6LOWPAN_NHC_HOP) += nhc_hop.o
+obj-$(CONFIG_6LOWPAN_NHC_IPV6) += nhc_ipv6.o
+obj-$(CONFIG_6LOWPAN_NHC_MOBILITY) += nhc_mobility.o
+obj-$(CONFIG_6LOWPAN_NHC_ROUTING) += nhc_routing.o
+obj-$(CONFIG_6LOWPAN_NHC_UDP) += nhc_udp.o
+
+#rfc7400 ghcs
+obj-$(CONFIG_6LOWPAN_GHC_EXT_HDR_HOP) += nhc_ghc_ext_hop.o
+obj-$(CONFIG_6LOWPAN_GHC_UDP) += nhc_ghc_udp.o
+obj-$(CONFIG_6LOWPAN_GHC_ICMPV6) += nhc_ghc_icmpv6.o
+obj-$(CONFIG_6LOWPAN_GHC_EXT_HDR_DEST) += nhc_ghc_ext_dest.o
+obj-$(CONFIG_6LOWPAN_GHC_EXT_HDR_FRAG) += nhc_ghc_ext_frag.o
+obj-$(CONFIG_6LOWPAN_GHC_EXT_HDR_ROUTE) += nhc_ghc_ext_route.o
diff --git a/net/6lowpan/core.c b/net/6lowpan/core.c
new file mode 100644
index 000000000000..850d4a185f55
--- /dev/null
+++ b/net/6lowpan/core.c
@@ -0,0 +1,183 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *
+ * Authors:
+ * (C) 2015 Pengutronix, Alexander Aring <aar@pengutronix.de>
+ */
+
+#include <linux/if_arp.h>
+#include <linux/module.h>
+
+#include <net/6lowpan.h>
+#include <net/addrconf.h>
+
+#include "6lowpan_i.h"
+
+int lowpan_register_netdevice(struct net_device *dev,
+ enum lowpan_lltypes lltype)
+{
+ int i, ret;
+
+ switch (lltype) {
+ case LOWPAN_LLTYPE_IEEE802154:
+ dev->addr_len = EUI64_ADDR_LEN;
+ break;
+
+ case LOWPAN_LLTYPE_BTLE:
+ dev->addr_len = ETH_ALEN;
+ break;
+ }
+
+ dev->type = ARPHRD_6LOWPAN;
+ dev->mtu = IPV6_MIN_MTU;
+
+ lowpan_dev(dev)->lltype = lltype;
+
+ spin_lock_init(&lowpan_dev(dev)->ctx.lock);
+ for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++)
+ lowpan_dev(dev)->ctx.table[i].id = i;
+
+ dev->ndisc_ops = &lowpan_ndisc_ops;
+
+ ret = register_netdevice(dev);
+ if (ret < 0)
+ return ret;
+
+ lowpan_dev_debugfs_init(dev);
+
+ return ret;
+}
+EXPORT_SYMBOL(lowpan_register_netdevice);
+
+int lowpan_register_netdev(struct net_device *dev,
+ enum lowpan_lltypes lltype)
+{
+ int ret;
+
+ rtnl_lock();
+ ret = lowpan_register_netdevice(dev, lltype);
+ rtnl_unlock();
+ return ret;
+}
+EXPORT_SYMBOL(lowpan_register_netdev);
+
+void lowpan_unregister_netdevice(struct net_device *dev)
+{
+ unregister_netdevice(dev);
+ lowpan_dev_debugfs_exit(dev);
+}
+EXPORT_SYMBOL(lowpan_unregister_netdevice);
+
+void lowpan_unregister_netdev(struct net_device *dev)
+{
+ rtnl_lock();
+ lowpan_unregister_netdevice(dev);
+ rtnl_unlock();
+}
+EXPORT_SYMBOL(lowpan_unregister_netdev);
+
+int addrconf_ifid_802154_6lowpan(u8 *eui, struct net_device *dev)
+{
+ struct wpan_dev *wpan_dev = lowpan_802154_dev(dev)->wdev->ieee802154_ptr;
+
+ /* Set short_addr autoconfiguration if short_addr is present only */
+ if (!lowpan_802154_is_valid_src_short_addr(wpan_dev->short_addr))
+ return -1;
+
+ /* For either address format, all zero addresses MUST NOT be used */
+ if (wpan_dev->pan_id == cpu_to_le16(0x0000) &&
+ wpan_dev->short_addr == cpu_to_le16(0x0000))
+ return -1;
+
+ /* Alternatively, if no PAN ID is known, 16 zero bits may be used */
+ if (wpan_dev->pan_id == cpu_to_le16(IEEE802154_PAN_ID_BROADCAST))
+ memset(eui, 0, 2);
+ else
+ ieee802154_le16_to_be16(eui, &wpan_dev->pan_id);
+
+ /* The "Universal/Local" (U/L) bit shall be set to zero */
+ eui[0] &= ~2;
+ eui[2] = 0;
+ eui[3] = 0xFF;
+ eui[4] = 0xFE;
+ eui[5] = 0;
+ ieee802154_le16_to_be16(&eui[6], &wpan_dev->short_addr);
+ return 0;
+}
+
+static int lowpan_event(struct notifier_block *unused,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct inet6_dev *idev;
+ struct in6_addr addr;
+ int i;
+
+ if (dev->type != ARPHRD_6LOWPAN)
+ return NOTIFY_DONE;
+
+ idev = __in6_dev_get(dev);
+ if (!idev)
+ return NOTIFY_DONE;
+
+ switch (event) {
+ case NETDEV_UP:
+ case NETDEV_CHANGE:
+ /* (802.15.4 6LoWPAN short address slaac handling */
+ if (lowpan_is_ll(dev, LOWPAN_LLTYPE_IEEE802154) &&
+ addrconf_ifid_802154_6lowpan(addr.s6_addr + 8, dev) == 0) {
+ __ipv6_addr_set_half(&addr.s6_addr32[0],
+ htonl(0xFE800000), 0);
+ addrconf_add_linklocal(idev, &addr, 0);
+ }
+ break;
+ case NETDEV_DOWN:
+ for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++)
+ clear_bit(LOWPAN_IPHC_CTX_FLAG_ACTIVE,
+ &lowpan_dev(dev)->ctx.table[i].flags);
+ break;
+ default:
+ return NOTIFY_DONE;
+ }
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block lowpan_notifier = {
+ .notifier_call = lowpan_event,
+};
+
+static int __init lowpan_module_init(void)
+{
+ int ret;
+
+ lowpan_debugfs_init();
+
+ ret = register_netdevice_notifier(&lowpan_notifier);
+ if (ret < 0) {
+ lowpan_debugfs_exit();
+ return ret;
+ }
+
+ request_module_nowait("nhc_dest");
+ request_module_nowait("nhc_fragment");
+ request_module_nowait("nhc_hop");
+ request_module_nowait("nhc_ipv6");
+ request_module_nowait("nhc_mobility");
+ request_module_nowait("nhc_routing");
+ request_module_nowait("nhc_udp");
+
+ return 0;
+}
+
+static void __exit lowpan_module_exit(void)
+{
+ lowpan_debugfs_exit();
+ unregister_netdevice_notifier(&lowpan_notifier);
+}
+
+module_init(lowpan_module_init);
+module_exit(lowpan_module_exit);
+
+MODULE_DESCRIPTION("IPv6 over Low-Power Wireless Personal Area Network core module");
+MODULE_LICENSE("GPL");
diff --git a/net/6lowpan/debugfs.c b/net/6lowpan/debugfs.c
new file mode 100644
index 000000000000..600b9563bfc5
--- /dev/null
+++ b/net/6lowpan/debugfs.c
@@ -0,0 +1,278 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *
+ * Authors:
+ * (C) 2015 Pengutronix, Alexander Aring <aar@pengutronix.de>
+ * Copyright (c) 2015 Nordic Semiconductor. All Rights Reserved.
+ */
+
+#include <net/6lowpan.h>
+
+#include "6lowpan_i.h"
+
+#define LOWPAN_DEBUGFS_CTX_PFX_NUM_ARGS 8
+
+static struct dentry *lowpan_debugfs;
+
+static int lowpan_ctx_flag_active_set(void *data, u64 val)
+{
+ struct lowpan_iphc_ctx *ctx = data;
+
+ if (val != 0 && val != 1)
+ return -EINVAL;
+
+ if (val)
+ set_bit(LOWPAN_IPHC_CTX_FLAG_ACTIVE, &ctx->flags);
+ else
+ clear_bit(LOWPAN_IPHC_CTX_FLAG_ACTIVE, &ctx->flags);
+
+ return 0;
+}
+
+static int lowpan_ctx_flag_active_get(void *data, u64 *val)
+{
+ *val = lowpan_iphc_ctx_is_active(data);
+ return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(lowpan_ctx_flag_active_fops,
+ lowpan_ctx_flag_active_get,
+ lowpan_ctx_flag_active_set, "%llu\n");
+
+static int lowpan_ctx_flag_c_set(void *data, u64 val)
+{
+ struct lowpan_iphc_ctx *ctx = data;
+
+ if (val != 0 && val != 1)
+ return -EINVAL;
+
+ if (val)
+ set_bit(LOWPAN_IPHC_CTX_FLAG_COMPRESSION, &ctx->flags);
+ else
+ clear_bit(LOWPAN_IPHC_CTX_FLAG_COMPRESSION, &ctx->flags);
+
+ return 0;
+}
+
+static int lowpan_ctx_flag_c_get(void *data, u64 *val)
+{
+ *val = lowpan_iphc_ctx_is_compression(data);
+ return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(lowpan_ctx_flag_c_fops, lowpan_ctx_flag_c_get,
+ lowpan_ctx_flag_c_set, "%llu\n");
+
+static int lowpan_ctx_plen_set(void *data, u64 val)
+{
+ struct lowpan_iphc_ctx *ctx = data;
+ struct lowpan_iphc_ctx_table *t =
+ container_of(ctx, struct lowpan_iphc_ctx_table, table[ctx->id]);
+
+ if (val > 128)
+ return -EINVAL;
+
+ spin_lock_bh(&t->lock);
+ ctx->plen = val;
+ spin_unlock_bh(&t->lock);
+
+ return 0;
+}
+
+static int lowpan_ctx_plen_get(void *data, u64 *val)
+{
+ struct lowpan_iphc_ctx *ctx = data;
+ struct lowpan_iphc_ctx_table *t =
+ container_of(ctx, struct lowpan_iphc_ctx_table, table[ctx->id]);
+
+ spin_lock_bh(&t->lock);
+ *val = ctx->plen;
+ spin_unlock_bh(&t->lock);
+ return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(lowpan_ctx_plen_fops, lowpan_ctx_plen_get,
+ lowpan_ctx_plen_set, "%llu\n");
+
+static int lowpan_ctx_pfx_show(struct seq_file *file, void *offset)
+{
+ struct lowpan_iphc_ctx *ctx = file->private;
+ struct lowpan_iphc_ctx_table *t =
+ container_of(ctx, struct lowpan_iphc_ctx_table, table[ctx->id]);
+
+ spin_lock_bh(&t->lock);
+ seq_printf(file, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n",
+ be16_to_cpu(ctx->pfx.s6_addr16[0]),
+ be16_to_cpu(ctx->pfx.s6_addr16[1]),
+ be16_to_cpu(ctx->pfx.s6_addr16[2]),
+ be16_to_cpu(ctx->pfx.s6_addr16[3]),
+ be16_to_cpu(ctx->pfx.s6_addr16[4]),
+ be16_to_cpu(ctx->pfx.s6_addr16[5]),
+ be16_to_cpu(ctx->pfx.s6_addr16[6]),
+ be16_to_cpu(ctx->pfx.s6_addr16[7]));
+ spin_unlock_bh(&t->lock);
+
+ return 0;
+}
+
+static int lowpan_ctx_pfx_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, lowpan_ctx_pfx_show, inode->i_private);
+}
+
+static ssize_t lowpan_ctx_pfx_write(struct file *fp,
+ const char __user *user_buf, size_t count,
+ loff_t *ppos)
+{
+ char buf[128] = {};
+ struct seq_file *file = fp->private_data;
+ struct lowpan_iphc_ctx *ctx = file->private;
+ struct lowpan_iphc_ctx_table *t =
+ container_of(ctx, struct lowpan_iphc_ctx_table, table[ctx->id]);
+ int status = count, n, i;
+ unsigned int addr[8];
+
+ if (copy_from_user(&buf, user_buf, min_t(size_t, sizeof(buf) - 1,
+ count))) {
+ status = -EFAULT;
+ goto out;
+ }
+
+ n = sscanf(buf, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x",
+ &addr[0], &addr[1], &addr[2], &addr[3], &addr[4],
+ &addr[5], &addr[6], &addr[7]);
+ if (n != LOWPAN_DEBUGFS_CTX_PFX_NUM_ARGS) {
+ status = -EINVAL;
+ goto out;
+ }
+
+ spin_lock_bh(&t->lock);
+ for (i = 0; i < 8; i++)
+ ctx->pfx.s6_addr16[i] = cpu_to_be16(addr[i] & 0xffff);
+ spin_unlock_bh(&t->lock);
+
+out:
+ return status;
+}
+
+static const struct file_operations lowpan_ctx_pfx_fops = {
+ .open = lowpan_ctx_pfx_open,
+ .read = seq_read,
+ .write = lowpan_ctx_pfx_write,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static void lowpan_dev_debugfs_ctx_init(struct net_device *dev,
+ struct dentry *ctx, u8 id)
+{
+ struct lowpan_dev *ldev = lowpan_dev(dev);
+ struct dentry *root;
+ char buf[32];
+
+ if (WARN_ON_ONCE(id >= LOWPAN_IPHC_CTX_TABLE_SIZE))
+ return;
+
+ sprintf(buf, "%d", id);
+
+ root = debugfs_create_dir(buf, ctx);
+
+ debugfs_create_file("active", 0644, root, &ldev->ctx.table[id],
+ &lowpan_ctx_flag_active_fops);
+
+ debugfs_create_file("compression", 0644, root, &ldev->ctx.table[id],
+ &lowpan_ctx_flag_c_fops);
+
+ debugfs_create_file("prefix", 0644, root, &ldev->ctx.table[id],
+ &lowpan_ctx_pfx_fops);
+
+ debugfs_create_file("prefix_len", 0644, root, &ldev->ctx.table[id],
+ &lowpan_ctx_plen_fops);
+}
+
+static int lowpan_context_show(struct seq_file *file, void *offset)
+{
+ struct lowpan_iphc_ctx_table *t = file->private;
+ int i;
+
+ seq_printf(file, "%3s|%-43s|%c\n", "cid", "prefix", 'C');
+ seq_puts(file, "-------------------------------------------------\n");
+
+ spin_lock_bh(&t->lock);
+ for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++) {
+ if (!lowpan_iphc_ctx_is_active(&t->table[i]))
+ continue;
+
+ seq_printf(file, "%3d|%39pI6c/%-3d|%d\n", t->table[i].id,
+ &t->table[i].pfx, t->table[i].plen,
+ lowpan_iphc_ctx_is_compression(&t->table[i]));
+ }
+ spin_unlock_bh(&t->lock);
+
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(lowpan_context);
+
+static int lowpan_short_addr_get(void *data, u64 *val)
+{
+ struct wpan_dev *wdev = data;
+
+ rtnl_lock();
+ *val = le16_to_cpu(wdev->short_addr);
+ rtnl_unlock();
+
+ return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(lowpan_short_addr_fops, lowpan_short_addr_get, NULL,
+ "0x%04llx\n");
+
+static void lowpan_dev_debugfs_802154_init(const struct net_device *dev,
+ struct lowpan_dev *ldev)
+{
+ struct dentry *root;
+
+ if (!lowpan_is_ll(dev, LOWPAN_LLTYPE_IEEE802154))
+ return;
+
+ root = debugfs_create_dir("ieee802154", ldev->iface_debugfs);
+
+ debugfs_create_file("short_addr", 0444, root,
+ lowpan_802154_dev(dev)->wdev->ieee802154_ptr,
+ &lowpan_short_addr_fops);
+}
+
+void lowpan_dev_debugfs_init(struct net_device *dev)
+{
+ struct lowpan_dev *ldev = lowpan_dev(dev);
+ struct dentry *contexts;
+ int i;
+
+ /* creating the root */
+ ldev->iface_debugfs = debugfs_create_dir(dev->name, lowpan_debugfs);
+
+ contexts = debugfs_create_dir("contexts", ldev->iface_debugfs);
+
+ debugfs_create_file("show", 0644, contexts, &lowpan_dev(dev)->ctx,
+ &lowpan_context_fops);
+
+ for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++)
+ lowpan_dev_debugfs_ctx_init(dev, contexts, i);
+
+ lowpan_dev_debugfs_802154_init(dev, ldev);
+}
+
+void lowpan_dev_debugfs_exit(struct net_device *dev)
+{
+ debugfs_remove_recursive(lowpan_dev(dev)->iface_debugfs);
+}
+
+void __init lowpan_debugfs_init(void)
+{
+ lowpan_debugfs = debugfs_create_dir("6lowpan", NULL);
+}
+
+void lowpan_debugfs_exit(void)
+{
+ debugfs_remove_recursive(lowpan_debugfs);
+}
diff --git a/net/6lowpan/iphc.c b/net/6lowpan/iphc.c
new file mode 100644
index 000000000000..e116d308a8df
--- /dev/null
+++ b/net/6lowpan/iphc.c
@@ -0,0 +1,1313 @@
+/*
+ * Copyright 2011, Siemens AG
+ * written by Alexander Smirnov <alex.bluesman.smirnov@gmail.com>
+ */
+
+/* Based on patches from Jon Smirl <jonsmirl@gmail.com>
+ * Copyright (c) 2011 Jon Smirl <jonsmirl@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+/* Jon's code is based on 6lowpan implementation for Contiki which is:
+ * Copyright (c) 2008, Swedish Institute of Computer Science.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the Institute nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <linux/bitops.h>
+#include <linux/if_arp.h>
+#include <linux/netdevice.h>
+
+#include <net/6lowpan.h>
+#include <net/ipv6.h>
+
+#include "6lowpan_i.h"
+#include "nhc.h"
+
+/* Values of fields within the IPHC encoding first byte */
+#define LOWPAN_IPHC_TF_MASK 0x18
+#define LOWPAN_IPHC_TF_00 0x00
+#define LOWPAN_IPHC_TF_01 0x08
+#define LOWPAN_IPHC_TF_10 0x10
+#define LOWPAN_IPHC_TF_11 0x18
+
+#define LOWPAN_IPHC_NH 0x04
+
+#define LOWPAN_IPHC_HLIM_MASK 0x03
+#define LOWPAN_IPHC_HLIM_00 0x00
+#define LOWPAN_IPHC_HLIM_01 0x01
+#define LOWPAN_IPHC_HLIM_10 0x02
+#define LOWPAN_IPHC_HLIM_11 0x03
+
+/* Values of fields within the IPHC encoding second byte */
+#define LOWPAN_IPHC_CID 0x80
+
+#define LOWPAN_IPHC_SAC 0x40
+
+#define LOWPAN_IPHC_SAM_MASK 0x30
+#define LOWPAN_IPHC_SAM_00 0x00
+#define LOWPAN_IPHC_SAM_01 0x10
+#define LOWPAN_IPHC_SAM_10 0x20
+#define LOWPAN_IPHC_SAM_11 0x30
+
+#define LOWPAN_IPHC_M 0x08
+
+#define LOWPAN_IPHC_DAC 0x04
+
+#define LOWPAN_IPHC_DAM_MASK 0x03
+#define LOWPAN_IPHC_DAM_00 0x00
+#define LOWPAN_IPHC_DAM_01 0x01
+#define LOWPAN_IPHC_DAM_10 0x02
+#define LOWPAN_IPHC_DAM_11 0x03
+
+/* ipv6 address based on mac
+ * second bit-flip (Universe/Local) is done according RFC2464
+ */
+#define is_addr_mac_addr_based(a, m) \
+ ((((a)->s6_addr[8]) == (((m)[0]) ^ 0x02)) && \
+ (((a)->s6_addr[9]) == (m)[1]) && \
+ (((a)->s6_addr[10]) == (m)[2]) && \
+ (((a)->s6_addr[11]) == (m)[3]) && \
+ (((a)->s6_addr[12]) == (m)[4]) && \
+ (((a)->s6_addr[13]) == (m)[5]) && \
+ (((a)->s6_addr[14]) == (m)[6]) && \
+ (((a)->s6_addr[15]) == (m)[7]))
+
+/* check whether we can compress the IID to 16 bits,
+ * it's possible for unicast addresses with first 49 bits are zero only.
+ */
+#define lowpan_is_iid_16_bit_compressable(a) \
+ ((((a)->s6_addr16[4]) == 0) && \
+ (((a)->s6_addr[10]) == 0) && \
+ (((a)->s6_addr[11]) == 0xff) && \
+ (((a)->s6_addr[12]) == 0xfe) && \
+ (((a)->s6_addr[13]) == 0))
+
+/* check whether the 112-bit gid of the multicast address is mappable to: */
+
+/* 48 bits, FFXX::00XX:XXXX:XXXX */
+#define lowpan_is_mcast_addr_compressable48(a) \
+ ((((a)->s6_addr16[1]) == 0) && \
+ (((a)->s6_addr16[2]) == 0) && \
+ (((a)->s6_addr16[3]) == 0) && \
+ (((a)->s6_addr16[4]) == 0) && \
+ (((a)->s6_addr[10]) == 0))
+
+/* 32 bits, FFXX::00XX:XXXX */
+#define lowpan_is_mcast_addr_compressable32(a) \
+ ((((a)->s6_addr16[1]) == 0) && \
+ (((a)->s6_addr16[2]) == 0) && \
+ (((a)->s6_addr16[3]) == 0) && \
+ (((a)->s6_addr16[4]) == 0) && \
+ (((a)->s6_addr16[5]) == 0) && \
+ (((a)->s6_addr[12]) == 0))
+
+/* 8 bits, FF02::00XX */
+#define lowpan_is_mcast_addr_compressable8(a) \
+ ((((a)->s6_addr[1]) == 2) && \
+ (((a)->s6_addr16[1]) == 0) && \
+ (((a)->s6_addr16[2]) == 0) && \
+ (((a)->s6_addr16[3]) == 0) && \
+ (((a)->s6_addr16[4]) == 0) && \
+ (((a)->s6_addr16[5]) == 0) && \
+ (((a)->s6_addr16[6]) == 0) && \
+ (((a)->s6_addr[14]) == 0))
+
+#define lowpan_is_linklocal_zero_padded(a) \
+ (!(hdr->saddr.s6_addr[1] & 0x3f) && \
+ !hdr->saddr.s6_addr16[1] && \
+ !hdr->saddr.s6_addr32[1])
+
+#define LOWPAN_IPHC_CID_DCI(cid) (cid & 0x0f)
+#define LOWPAN_IPHC_CID_SCI(cid) ((cid & 0xf0) >> 4)
+
+static inline void
+lowpan_iphc_uncompress_802154_lladdr(struct in6_addr *ipaddr,
+ const void *lladdr)
+{
+ const struct ieee802154_addr *addr = lladdr;
+ u8 eui64[EUI64_ADDR_LEN];
+
+ switch (addr->mode) {
+ case IEEE802154_ADDR_LONG:
+ ieee802154_le64_to_be64(eui64, &addr->extended_addr);
+ lowpan_iphc_uncompress_eui64_lladdr(ipaddr, eui64);
+ break;
+ case IEEE802154_ADDR_SHORT:
+ /* fe:80::ff:fe00:XXXX
+ * \__/
+ * short_addr
+ *
+ * Universe/Local bit is zero.
+ */
+ ipaddr->s6_addr[0] = 0xFE;
+ ipaddr->s6_addr[1] = 0x80;
+ ipaddr->s6_addr[11] = 0xFF;
+ ipaddr->s6_addr[12] = 0xFE;
+ ieee802154_le16_to_be16(&ipaddr->s6_addr16[7],
+ &addr->short_addr);
+ break;
+ default:
+ /* should never handled and filtered by 802154 6lowpan */
+ WARN_ON_ONCE(1);
+ break;
+ }
+}
+
+static struct lowpan_iphc_ctx *
+lowpan_iphc_ctx_get_by_id(const struct net_device *dev, u8 id)
+{
+ struct lowpan_iphc_ctx *ret = &lowpan_dev(dev)->ctx.table[id];
+
+ if (!lowpan_iphc_ctx_is_active(ret))
+ return NULL;
+
+ return ret;
+}
+
+static struct lowpan_iphc_ctx *
+lowpan_iphc_ctx_get_by_addr(const struct net_device *dev,
+ const struct in6_addr *addr)
+{
+ struct lowpan_iphc_ctx *table = lowpan_dev(dev)->ctx.table;
+ struct lowpan_iphc_ctx *ret = NULL;
+ struct in6_addr addr_pfx;
+ u8 addr_plen;
+ int i;
+
+ for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++) {
+ /* Check if context is valid. A context that is not valid
+ * MUST NOT be used for compression.
+ */
+ if (!lowpan_iphc_ctx_is_active(&table[i]) ||
+ !lowpan_iphc_ctx_is_compression(&table[i]))
+ continue;
+
+ ipv6_addr_prefix(&addr_pfx, addr, table[i].plen);
+
+ /* if prefix len < 64, the remaining bits until 64th bit is
+ * zero. Otherwise we use table[i]->plen.
+ */
+ if (table[i].plen < 64)
+ addr_plen = 64;
+ else
+ addr_plen = table[i].plen;
+
+ if (ipv6_prefix_equal(&addr_pfx, &table[i].pfx, addr_plen)) {
+ /* remember first match */
+ if (!ret) {
+ ret = &table[i];
+ continue;
+ }
+
+ /* get the context with longest prefix len */
+ if (table[i].plen > ret->plen)
+ ret = &table[i];
+ }
+ }
+
+ return ret;
+}
+
+static struct lowpan_iphc_ctx *
+lowpan_iphc_ctx_get_by_mcast_addr(const struct net_device *dev,
+ const struct in6_addr *addr)
+{
+ struct lowpan_iphc_ctx *table = lowpan_dev(dev)->ctx.table;
+ struct lowpan_iphc_ctx *ret = NULL;
+ struct in6_addr addr_mcast, network_pfx = {};
+ int i;
+
+ /* init mcast address with */
+ memcpy(&addr_mcast, addr, sizeof(*addr));
+
+ for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++) {
+ /* Check if context is valid. A context that is not valid
+ * MUST NOT be used for compression.
+ */
+ if (!lowpan_iphc_ctx_is_active(&table[i]) ||
+ !lowpan_iphc_ctx_is_compression(&table[i]))
+ continue;
+
+ /* setting plen */
+ addr_mcast.s6_addr[3] = table[i].plen;
+ /* get network prefix to copy into multicast address */
+ ipv6_addr_prefix(&network_pfx, &table[i].pfx,
+ table[i].plen);
+ /* setting network prefix */
+ memcpy(&addr_mcast.s6_addr[4], &network_pfx, 8);
+
+ if (ipv6_addr_equal(addr, &addr_mcast)) {
+ ret = &table[i];
+ break;
+ }
+ }
+
+ return ret;
+}
+
+static void lowpan_iphc_uncompress_lladdr(const struct net_device *dev,
+ struct in6_addr *ipaddr,
+ const void *lladdr)
+{
+ switch (dev->addr_len) {
+ case ETH_ALEN:
+ lowpan_iphc_uncompress_eui48_lladdr(ipaddr, lladdr);
+ break;
+ case EUI64_ADDR_LEN:
+ lowpan_iphc_uncompress_eui64_lladdr(ipaddr, lladdr);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
+}
+
+/* Uncompress address function for source and
+ * destination address(non-multicast).
+ *
+ * address_mode is the masked value for sam or dam value
+ */
+static int lowpan_iphc_uncompress_addr(struct sk_buff *skb,
+ const struct net_device *dev,
+ struct in6_addr *ipaddr,
+ u8 address_mode, const void *lladdr)
+{
+ bool fail;
+
+ switch (address_mode) {
+ /* SAM and DAM are the same here */
+ case LOWPAN_IPHC_DAM_00:
+ /* for global link addresses */
+ fail = lowpan_fetch_skb(skb, ipaddr->s6_addr, 16);
+ break;
+ case LOWPAN_IPHC_SAM_01:
+ case LOWPAN_IPHC_DAM_01:
+ /* fe:80::XXXX:XXXX:XXXX:XXXX */
+ ipaddr->s6_addr[0] = 0xFE;
+ ipaddr->s6_addr[1] = 0x80;
+ fail = lowpan_fetch_skb(skb, &ipaddr->s6_addr[8], 8);
+ break;
+ case LOWPAN_IPHC_SAM_10:
+ case LOWPAN_IPHC_DAM_10:
+ /* fe:80::ff:fe00:XXXX */
+ ipaddr->s6_addr[0] = 0xFE;
+ ipaddr->s6_addr[1] = 0x80;
+ ipaddr->s6_addr[11] = 0xFF;
+ ipaddr->s6_addr[12] = 0xFE;
+ fail = lowpan_fetch_skb(skb, &ipaddr->s6_addr[14], 2);
+ break;
+ case LOWPAN_IPHC_SAM_11:
+ case LOWPAN_IPHC_DAM_11:
+ fail = false;
+ switch (lowpan_dev(dev)->lltype) {
+ case LOWPAN_LLTYPE_IEEE802154:
+ lowpan_iphc_uncompress_802154_lladdr(ipaddr, lladdr);
+ break;
+ default:
+ lowpan_iphc_uncompress_lladdr(dev, ipaddr, lladdr);
+ break;
+ }
+ break;
+ default:
+ pr_debug("Invalid address mode value: 0x%x\n", address_mode);
+ return -EINVAL;
+ }
+
+ if (fail) {
+ pr_debug("Failed to fetch skb data\n");
+ return -EIO;
+ }
+
+ raw_dump_inline(NULL, "Reconstructed ipv6 addr is",
+ ipaddr->s6_addr, 16);
+
+ return 0;
+}
+
+/* Uncompress address function for source context
+ * based address(non-multicast).
+ */
+static int lowpan_iphc_uncompress_ctx_addr(struct sk_buff *skb,
+ const struct net_device *dev,
+ const struct lowpan_iphc_ctx *ctx,
+ struct in6_addr *ipaddr,
+ u8 address_mode, const void *lladdr)
+{
+ bool fail;
+
+ switch (address_mode) {
+ /* SAM and DAM are the same here */
+ case LOWPAN_IPHC_DAM_00:
+ fail = false;
+ /* SAM_00 -> unspec address ::
+ * Do nothing, address is already ::
+ *
+ * DAM 00 -> reserved should never occur.
+ */
+ break;
+ case LOWPAN_IPHC_SAM_01:
+ case LOWPAN_IPHC_DAM_01:
+ fail = lowpan_fetch_skb(skb, &ipaddr->s6_addr[8], 8);
+ ipv6_addr_prefix_copy(ipaddr, &ctx->pfx, ctx->plen);
+ break;
+ case LOWPAN_IPHC_SAM_10:
+ case LOWPAN_IPHC_DAM_10:
+ ipaddr->s6_addr[11] = 0xFF;
+ ipaddr->s6_addr[12] = 0xFE;
+ fail = lowpan_fetch_skb(skb, &ipaddr->s6_addr[14], 2);
+ ipv6_addr_prefix_copy(ipaddr, &ctx->pfx, ctx->plen);
+ break;
+ case LOWPAN_IPHC_SAM_11:
+ case LOWPAN_IPHC_DAM_11:
+ fail = false;
+ switch (lowpan_dev(dev)->lltype) {
+ case LOWPAN_LLTYPE_IEEE802154:
+ lowpan_iphc_uncompress_802154_lladdr(ipaddr, lladdr);
+ break;
+ default:
+ lowpan_iphc_uncompress_lladdr(dev, ipaddr, lladdr);
+ break;
+ }
+ ipv6_addr_prefix_copy(ipaddr, &ctx->pfx, ctx->plen);
+ break;
+ default:
+ pr_debug("Invalid sam value: 0x%x\n", address_mode);
+ return -EINVAL;
+ }
+
+ if (fail) {
+ pr_debug("Failed to fetch skb data\n");
+ return -EIO;
+ }
+
+ raw_dump_inline(NULL,
+ "Reconstructed context based ipv6 src addr is",
+ ipaddr->s6_addr, 16);
+
+ return 0;
+}
+
+/* Uncompress function for multicast destination address,
+ * when M bit is set.
+ */
+static int lowpan_uncompress_multicast_daddr(struct sk_buff *skb,
+ struct in6_addr *ipaddr,
+ u8 address_mode)
+{
+ bool fail;
+
+ switch (address_mode) {
+ case LOWPAN_IPHC_DAM_00:
+ /* 00: 128 bits. The full address
+ * is carried in-line.
+ */
+ fail = lowpan_fetch_skb(skb, ipaddr->s6_addr, 16);
+ break;
+ case LOWPAN_IPHC_DAM_01:
+ /* 01: 48 bits. The address takes
+ * the form ffXX::00XX:XXXX:XXXX.
+ */
+ ipaddr->s6_addr[0] = 0xFF;
+ fail = lowpan_fetch_skb(skb, &ipaddr->s6_addr[1], 1);
+ fail |= lowpan_fetch_skb(skb, &ipaddr->s6_addr[11], 5);
+ break;
+ case LOWPAN_IPHC_DAM_10:
+ /* 10: 32 bits. The address takes
+ * the form ffXX::00XX:XXXX.
+ */
+ ipaddr->s6_addr[0] = 0xFF;
+ fail = lowpan_fetch_skb(skb, &ipaddr->s6_addr[1], 1);
+ fail |= lowpan_fetch_skb(skb, &ipaddr->s6_addr[13], 3);
+ break;
+ case LOWPAN_IPHC_DAM_11:
+ /* 11: 8 bits. The address takes
+ * the form ff02::00XX.
+ */
+ ipaddr->s6_addr[0] = 0xFF;
+ ipaddr->s6_addr[1] = 0x02;
+ fail = lowpan_fetch_skb(skb, &ipaddr->s6_addr[15], 1);
+ break;
+ default:
+ pr_debug("DAM value has a wrong value: 0x%x\n", address_mode);
+ return -EINVAL;
+ }
+
+ if (fail) {
+ pr_debug("Failed to fetch skb data\n");
+ return -EIO;
+ }
+
+ raw_dump_inline(NULL, "Reconstructed ipv6 multicast addr is",
+ ipaddr->s6_addr, 16);
+
+ return 0;
+}
+
+static int lowpan_uncompress_multicast_ctx_daddr(struct sk_buff *skb,
+ struct lowpan_iphc_ctx *ctx,
+ struct in6_addr *ipaddr,
+ u8 address_mode)
+{
+ struct in6_addr network_pfx = {};
+ bool fail;
+
+ ipaddr->s6_addr[0] = 0xFF;
+ fail = lowpan_fetch_skb(skb, &ipaddr->s6_addr[1], 2);
+ fail |= lowpan_fetch_skb(skb, &ipaddr->s6_addr[12], 4);
+ if (fail)
+ return -EIO;
+
+ /* take prefix_len and network prefix from the context */
+ ipaddr->s6_addr[3] = ctx->plen;
+ /* get network prefix to copy into multicast address */
+ ipv6_addr_prefix(&network_pfx, &ctx->pfx, ctx->plen);
+ /* setting network prefix */
+ memcpy(&ipaddr->s6_addr[4], &network_pfx, 8);
+
+ return 0;
+}
+
+/* get the ecn values from iphc tf format and set it to ipv6hdr */
+static inline void lowpan_iphc_tf_set_ecn(struct ipv6hdr *hdr, const u8 *tf)
+{
+ /* get the two higher bits which is ecn */
+ u8 ecn = tf[0] & 0xc0;
+
+ /* ECN takes 0x30 in hdr->flow_lbl[0] */
+ hdr->flow_lbl[0] |= (ecn >> 2);
+}
+
+/* get the dscp values from iphc tf format and set it to ipv6hdr */
+static inline void lowpan_iphc_tf_set_dscp(struct ipv6hdr *hdr, const u8 *tf)
+{
+ /* DSCP is at place after ECN */
+ u8 dscp = tf[0] & 0x3f;
+
+ /* The four highest bits need to be set at hdr->priority */
+ hdr->priority |= ((dscp & 0x3c) >> 2);
+ /* The two lower bits is part of hdr->flow_lbl[0] */
+ hdr->flow_lbl[0] |= ((dscp & 0x03) << 6);
+}
+
+/* get the flow label values from iphc tf format and set it to ipv6hdr */
+static inline void lowpan_iphc_tf_set_lbl(struct ipv6hdr *hdr, const u8 *lbl)
+{
+ /* flow label is always some array started with lower nibble of
+ * flow_lbl[0] and followed with two bytes afterwards. Inside inline
+ * data the flow_lbl position can be different, which will be handled
+ * by lbl pointer. E.g. case "01" vs "00" the traffic class is 8 bit
+ * shifted, the different lbl pointer will handle that.
+ *
+ * The flow label will started at lower nibble of flow_lbl[0], the
+ * higher nibbles are part of DSCP + ECN.
+ */
+ hdr->flow_lbl[0] |= lbl[0] & 0x0f;
+ memcpy(&hdr->flow_lbl[1], &lbl[1], 2);
+}
+
+/* lowpan_iphc_tf_decompress - decompress the traffic class.
+ * This function will return zero on success, a value lower than zero if
+ * failed.
+ */
+static int lowpan_iphc_tf_decompress(struct sk_buff *skb, struct ipv6hdr *hdr,
+ u8 val)
+{
+ u8 tf[4];
+
+ /* Traffic Class and Flow Label */
+ switch (val) {
+ case LOWPAN_IPHC_TF_00:
+ /* ECN + DSCP + 4-bit Pad + Flow Label (4 bytes) */
+ if (lowpan_fetch_skb(skb, tf, 4))
+ return -EINVAL;
+
+ /* 1 2 3
+ * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |ECN| DSCP | rsv | Flow Label |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ */
+ lowpan_iphc_tf_set_ecn(hdr, tf);
+ lowpan_iphc_tf_set_dscp(hdr, tf);
+ lowpan_iphc_tf_set_lbl(hdr, &tf[1]);
+ break;
+ case LOWPAN_IPHC_TF_01:
+ /* ECN + 2-bit Pad + Flow Label (3 bytes), DSCP is elided. */
+ if (lowpan_fetch_skb(skb, tf, 3))
+ return -EINVAL;
+
+ /* 1 2
+ * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |ECN|rsv| Flow Label |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ */
+ lowpan_iphc_tf_set_ecn(hdr, tf);
+ lowpan_iphc_tf_set_lbl(hdr, &tf[0]);
+ break;
+ case LOWPAN_IPHC_TF_10:
+ /* ECN + DSCP (1 byte), Flow Label is elided. */
+ if (lowpan_fetch_skb(skb, tf, 1))
+ return -EINVAL;
+
+ /* 0 1 2 3 4 5 6 7
+ * +-+-+-+-+-+-+-+-+
+ * |ECN| DSCP |
+ * +-+-+-+-+-+-+-+-+
+ */
+ lowpan_iphc_tf_set_ecn(hdr, tf);
+ lowpan_iphc_tf_set_dscp(hdr, tf);
+ break;
+ case LOWPAN_IPHC_TF_11:
+ /* Traffic Class and Flow Label are elided */
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/* TTL uncompression values */
+static const u8 lowpan_ttl_values[] = {
+ [LOWPAN_IPHC_HLIM_01] = 1,
+ [LOWPAN_IPHC_HLIM_10] = 64,
+ [LOWPAN_IPHC_HLIM_11] = 255,
+};
+
+int lowpan_header_decompress(struct sk_buff *skb, const struct net_device *dev,
+ const void *daddr, const void *saddr)
+{
+ struct ipv6hdr hdr = {};
+ struct lowpan_iphc_ctx *ci;
+ u8 iphc0, iphc1, cid = 0;
+ int err;
+
+ raw_dump_table(__func__, "raw skb data dump uncompressed",
+ skb->data, skb->len);
+
+ if (lowpan_fetch_skb(skb, &iphc0, sizeof(iphc0)) ||
+ lowpan_fetch_skb(skb, &iphc1, sizeof(iphc1)))
+ return -EINVAL;
+
+ hdr.version = 6;
+
+ /* default CID = 0, another if the CID flag is set */
+ if (iphc1 & LOWPAN_IPHC_CID) {
+ if (lowpan_fetch_skb(skb, &cid, sizeof(cid)))
+ return -EINVAL;
+ }
+
+ err = lowpan_iphc_tf_decompress(skb, &hdr,
+ iphc0 & LOWPAN_IPHC_TF_MASK);
+ if (err < 0)
+ return err;
+
+ /* Next Header */
+ if (!(iphc0 & LOWPAN_IPHC_NH)) {
+ /* Next header is carried inline */
+ if (lowpan_fetch_skb(skb, &hdr.nexthdr, sizeof(hdr.nexthdr)))
+ return -EINVAL;
+
+ pr_debug("NH flag is set, next header carried inline: %02x\n",
+ hdr.nexthdr);
+ }
+
+ /* Hop Limit */
+ if ((iphc0 & LOWPAN_IPHC_HLIM_MASK) != LOWPAN_IPHC_HLIM_00) {
+ hdr.hop_limit = lowpan_ttl_values[iphc0 & LOWPAN_IPHC_HLIM_MASK];
+ } else {
+ if (lowpan_fetch_skb(skb, &hdr.hop_limit,
+ sizeof(hdr.hop_limit)))
+ return -EINVAL;
+ }
+
+ if (iphc1 & LOWPAN_IPHC_SAC) {
+ spin_lock_bh(&lowpan_dev(dev)->ctx.lock);
+ ci = lowpan_iphc_ctx_get_by_id(dev, LOWPAN_IPHC_CID_SCI(cid));
+ if (!ci) {
+ spin_unlock_bh(&lowpan_dev(dev)->ctx.lock);
+ return -EINVAL;
+ }
+
+ pr_debug("SAC bit is set. Handle context based source address.\n");
+ err = lowpan_iphc_uncompress_ctx_addr(skb, dev, ci, &hdr.saddr,
+ iphc1 & LOWPAN_IPHC_SAM_MASK,
+ saddr);
+ spin_unlock_bh(&lowpan_dev(dev)->ctx.lock);
+ } else {
+ /* Source address uncompression */
+ pr_debug("source address stateless compression\n");
+ err = lowpan_iphc_uncompress_addr(skb, dev, &hdr.saddr,
+ iphc1 & LOWPAN_IPHC_SAM_MASK,
+ saddr);
+ }
+
+ /* Check on error of previous branch */
+ if (err)
+ return -EINVAL;
+
+ switch (iphc1 & (LOWPAN_IPHC_M | LOWPAN_IPHC_DAC)) {
+ case LOWPAN_IPHC_M | LOWPAN_IPHC_DAC:
+ skb->pkt_type = PACKET_BROADCAST;
+
+ spin_lock_bh(&lowpan_dev(dev)->ctx.lock);
+ ci = lowpan_iphc_ctx_get_by_id(dev, LOWPAN_IPHC_CID_DCI(cid));
+ if (!ci) {
+ spin_unlock_bh(&lowpan_dev(dev)->ctx.lock);
+ return -EINVAL;
+ }
+
+ /* multicast with context */
+ pr_debug("dest: context-based mcast compression\n");
+ err = lowpan_uncompress_multicast_ctx_daddr(skb, ci,
+ &hdr.daddr,
+ iphc1 & LOWPAN_IPHC_DAM_MASK);
+ spin_unlock_bh(&lowpan_dev(dev)->ctx.lock);
+ break;
+ case LOWPAN_IPHC_M:
+ skb->pkt_type = PACKET_BROADCAST;
+
+ /* multicast */
+ err = lowpan_uncompress_multicast_daddr(skb, &hdr.daddr,
+ iphc1 & LOWPAN_IPHC_DAM_MASK);
+ break;
+ case LOWPAN_IPHC_DAC:
+ skb->pkt_type = PACKET_HOST;
+
+ spin_lock_bh(&lowpan_dev(dev)->ctx.lock);
+ ci = lowpan_iphc_ctx_get_by_id(dev, LOWPAN_IPHC_CID_DCI(cid));
+ if (!ci) {
+ spin_unlock_bh(&lowpan_dev(dev)->ctx.lock);
+ return -EINVAL;
+ }
+
+ /* Destination address context based uncompression */
+ pr_debug("DAC bit is set. Handle context based destination address.\n");
+ err = lowpan_iphc_uncompress_ctx_addr(skb, dev, ci, &hdr.daddr,
+ iphc1 & LOWPAN_IPHC_DAM_MASK,
+ daddr);
+ spin_unlock_bh(&lowpan_dev(dev)->ctx.lock);
+ break;
+ default:
+ skb->pkt_type = PACKET_HOST;
+
+ err = lowpan_iphc_uncompress_addr(skb, dev, &hdr.daddr,
+ iphc1 & LOWPAN_IPHC_DAM_MASK,
+ daddr);
+ pr_debug("dest: stateless compression mode %d dest %pI6c\n",
+ iphc1 & LOWPAN_IPHC_DAM_MASK, &hdr.daddr);
+ break;
+ }
+
+ if (err)
+ return -EINVAL;
+
+ /* Next header data uncompression */
+ if (iphc0 & LOWPAN_IPHC_NH) {
+ err = lowpan_nhc_do_uncompression(skb, dev, &hdr);
+ if (err < 0)
+ return err;
+ } else {
+ err = skb_cow(skb, sizeof(hdr));
+ if (unlikely(err))
+ return err;
+ }
+
+ switch (lowpan_dev(dev)->lltype) {
+ case LOWPAN_LLTYPE_IEEE802154:
+ if (lowpan_802154_cb(skb)->d_size)
+ hdr.payload_len = htons(lowpan_802154_cb(skb)->d_size -
+ sizeof(struct ipv6hdr));
+ else
+ hdr.payload_len = htons(skb->len);
+ break;
+ default:
+ hdr.payload_len = htons(skb->len);
+ break;
+ }
+
+ pr_debug("skb headroom size = %d, data length = %d\n",
+ skb_headroom(skb), skb->len);
+
+ pr_debug("IPv6 header dump:\n\tversion = %d\n\tlength = %d\n\t"
+ "nexthdr = 0x%02x\n\thop_lim = %d\n\tdest = %pI6c\n",
+ hdr.version, ntohs(hdr.payload_len), hdr.nexthdr,
+ hdr.hop_limit, &hdr.daddr);
+
+ skb_push(skb, sizeof(hdr));
+ skb_reset_mac_header(skb);
+ skb_reset_network_header(skb);
+ skb_copy_to_linear_data(skb, &hdr, sizeof(hdr));
+
+ raw_dump_table(__func__, "raw header dump", (u8 *)&hdr, sizeof(hdr));
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(lowpan_header_decompress);
+
+static const u8 lowpan_iphc_dam_to_sam_value[] = {
+ [LOWPAN_IPHC_DAM_00] = LOWPAN_IPHC_SAM_00,
+ [LOWPAN_IPHC_DAM_01] = LOWPAN_IPHC_SAM_01,
+ [LOWPAN_IPHC_DAM_10] = LOWPAN_IPHC_SAM_10,
+ [LOWPAN_IPHC_DAM_11] = LOWPAN_IPHC_SAM_11,
+};
+
+static inline bool
+lowpan_iphc_compress_ctx_802154_lladdr(const struct in6_addr *ipaddr,
+ const struct lowpan_iphc_ctx *ctx,
+ const void *lladdr)
+{
+ const struct ieee802154_addr *addr = lladdr;
+ unsigned char extended_addr[EUI64_ADDR_LEN];
+ bool lladdr_compress = false;
+ struct in6_addr tmp = {};
+
+ switch (addr->mode) {
+ case IEEE802154_ADDR_LONG:
+ ieee802154_le64_to_be64(&extended_addr, &addr->extended_addr);
+ /* check for SAM/DAM = 11 */
+ memcpy(&tmp.s6_addr[8], &extended_addr, EUI64_ADDR_LEN);
+ /* second bit-flip (Universe/Local) is done according RFC2464 */
+ tmp.s6_addr[8] ^= 0x02;
+ /* context information are always used */
+ ipv6_addr_prefix_copy(&tmp, &ctx->pfx, ctx->plen);
+ if (ipv6_addr_equal(&tmp, ipaddr))
+ lladdr_compress = true;
+ break;
+ case IEEE802154_ADDR_SHORT:
+ tmp.s6_addr[11] = 0xFF;
+ tmp.s6_addr[12] = 0xFE;
+ ieee802154_le16_to_be16(&tmp.s6_addr16[7],
+ &addr->short_addr);
+ /* context information are always used */
+ ipv6_addr_prefix_copy(&tmp, &ctx->pfx, ctx->plen);
+ if (ipv6_addr_equal(&tmp, ipaddr))
+ lladdr_compress = true;
+ break;
+ default:
+ /* should never handled and filtered by 802154 6lowpan */
+ WARN_ON_ONCE(1);
+ break;
+ }
+
+ return lladdr_compress;
+}
+
+static bool lowpan_iphc_addr_equal(const struct net_device *dev,
+ const struct lowpan_iphc_ctx *ctx,
+ const struct in6_addr *ipaddr,
+ const void *lladdr)
+{
+ struct in6_addr tmp = {};
+
+ lowpan_iphc_uncompress_lladdr(dev, &tmp, lladdr);
+
+ if (ctx)
+ ipv6_addr_prefix_copy(&tmp, &ctx->pfx, ctx->plen);
+
+ return ipv6_addr_equal(&tmp, ipaddr);
+}
+
+static u8 lowpan_compress_ctx_addr(u8 **hc_ptr, const struct net_device *dev,
+ const struct in6_addr *ipaddr,
+ const struct lowpan_iphc_ctx *ctx,
+ const unsigned char *lladdr, bool sam)
+{
+ struct in6_addr tmp;
+ u8 dam;
+
+ switch (lowpan_dev(dev)->lltype) {
+ case LOWPAN_LLTYPE_IEEE802154:
+ if (lowpan_iphc_compress_ctx_802154_lladdr(ipaddr, ctx,
+ lladdr)) {
+ dam = LOWPAN_IPHC_DAM_11;
+ goto out;
+ }
+ break;
+ default:
+ if (lowpan_iphc_addr_equal(dev, ctx, ipaddr, lladdr)) {
+ dam = LOWPAN_IPHC_DAM_11;
+ goto out;
+ }
+ break;
+ }
+
+ memset(&tmp, 0, sizeof(tmp));
+ /* check for SAM/DAM = 10 */
+ tmp.s6_addr[11] = 0xFF;
+ tmp.s6_addr[12] = 0xFE;
+ memcpy(&tmp.s6_addr[14], &ipaddr->s6_addr[14], 2);
+ /* context information are always used */
+ ipv6_addr_prefix_copy(&tmp, &ctx->pfx, ctx->plen);
+ if (ipv6_addr_equal(&tmp, ipaddr)) {
+ lowpan_push_hc_data(hc_ptr, &ipaddr->s6_addr[14], 2);
+ dam = LOWPAN_IPHC_DAM_10;
+ goto out;
+ }
+
+ memset(&tmp, 0, sizeof(tmp));
+ /* check for SAM/DAM = 01, should always match */
+ memcpy(&tmp.s6_addr[8], &ipaddr->s6_addr[8], 8);
+ /* context information are always used */
+ ipv6_addr_prefix_copy(&tmp, &ctx->pfx, ctx->plen);
+ if (ipv6_addr_equal(&tmp, ipaddr)) {
+ lowpan_push_hc_data(hc_ptr, &ipaddr->s6_addr[8], 8);
+ dam = LOWPAN_IPHC_DAM_01;
+ goto out;
+ }
+
+ WARN_ONCE(1, "context found but no address mode matched\n");
+ return LOWPAN_IPHC_DAM_00;
+out:
+
+ if (sam)
+ return lowpan_iphc_dam_to_sam_value[dam];
+ else
+ return dam;
+}
+
+static inline bool
+lowpan_iphc_compress_802154_lladdr(const struct in6_addr *ipaddr,
+ const void *lladdr)
+{
+ const struct ieee802154_addr *addr = lladdr;
+ unsigned char extended_addr[EUI64_ADDR_LEN];
+ bool lladdr_compress = false;
+ struct in6_addr tmp = {};
+
+ switch (addr->mode) {
+ case IEEE802154_ADDR_LONG:
+ ieee802154_le64_to_be64(&extended_addr, &addr->extended_addr);
+ if (is_addr_mac_addr_based(ipaddr, extended_addr))
+ lladdr_compress = true;
+ break;
+ case IEEE802154_ADDR_SHORT:
+ /* fe:80::ff:fe00:XXXX
+ * \__/
+ * short_addr
+ *
+ * Universe/Local bit is zero.
+ */
+ tmp.s6_addr[0] = 0xFE;
+ tmp.s6_addr[1] = 0x80;
+ tmp.s6_addr[11] = 0xFF;
+ tmp.s6_addr[12] = 0xFE;
+ ieee802154_le16_to_be16(&tmp.s6_addr16[7],
+ &addr->short_addr);
+ if (ipv6_addr_equal(&tmp, ipaddr))
+ lladdr_compress = true;
+ break;
+ default:
+ /* should never handled and filtered by 802154 6lowpan */
+ WARN_ON_ONCE(1);
+ break;
+ }
+
+ return lladdr_compress;
+}
+
+static u8 lowpan_compress_addr_64(u8 **hc_ptr, const struct net_device *dev,
+ const struct in6_addr *ipaddr,
+ const unsigned char *lladdr, bool sam)
+{
+ u8 dam = LOWPAN_IPHC_DAM_01;
+
+ switch (lowpan_dev(dev)->lltype) {
+ case LOWPAN_LLTYPE_IEEE802154:
+ if (lowpan_iphc_compress_802154_lladdr(ipaddr, lladdr)) {
+ dam = LOWPAN_IPHC_DAM_11; /* 0-bits */
+ pr_debug("address compression 0 bits\n");
+ goto out;
+ }
+ break;
+ default:
+ if (lowpan_iphc_addr_equal(dev, NULL, ipaddr, lladdr)) {
+ dam = LOWPAN_IPHC_DAM_11;
+ pr_debug("address compression 0 bits\n");
+ goto out;
+ }
+
+ break;
+ }
+
+ if (lowpan_is_iid_16_bit_compressable(ipaddr)) {
+ /* compress IID to 16 bits xxxx::XXXX */
+ lowpan_push_hc_data(hc_ptr, &ipaddr->s6_addr16[7], 2);
+ dam = LOWPAN_IPHC_DAM_10; /* 16-bits */
+ raw_dump_inline(NULL, "Compressed ipv6 addr is (16 bits)",
+ *hc_ptr - 2, 2);
+ goto out;
+ }
+
+ /* do not compress IID => xxxx::IID */
+ lowpan_push_hc_data(hc_ptr, &ipaddr->s6_addr16[4], 8);
+ raw_dump_inline(NULL, "Compressed ipv6 addr is (64 bits)",
+ *hc_ptr - 8, 8);
+
+out:
+
+ if (sam)
+ return lowpan_iphc_dam_to_sam_value[dam];
+ else
+ return dam;
+}
+
+/* lowpan_iphc_get_tc - get the ECN + DCSP fields in hc format */
+static inline u8 lowpan_iphc_get_tc(const struct ipv6hdr *hdr)
+{
+ u8 dscp, ecn;
+
+ /* hdr->priority contains the higher bits of dscp, lower are part of
+ * flow_lbl[0]. Note ECN, DCSP is swapped in ipv6 hdr.
+ */
+ dscp = (hdr->priority << 2) | ((hdr->flow_lbl[0] & 0xc0) >> 6);
+ /* ECN is at the two lower bits from first nibble of flow_lbl[0] */
+ ecn = (hdr->flow_lbl[0] & 0x30);
+ /* for pretty debug output, also shift ecn to get the ecn value */
+ pr_debug("ecn 0x%02x dscp 0x%02x\n", ecn >> 4, dscp);
+ /* ECN is at 0x30 now, shift it to have ECN + DCSP */
+ return (ecn << 2) | dscp;
+}
+
+/* lowpan_iphc_is_flow_lbl_zero - check if flow label is zero */
+static inline bool lowpan_iphc_is_flow_lbl_zero(const struct ipv6hdr *hdr)
+{
+ return ((!(hdr->flow_lbl[0] & 0x0f)) &&
+ !hdr->flow_lbl[1] && !hdr->flow_lbl[2]);
+}
+
+/* lowpan_iphc_tf_compress - compress the traffic class which is set by
+ * ipv6hdr. Return the corresponding format identifier which is used.
+ */
+static u8 lowpan_iphc_tf_compress(u8 **hc_ptr, const struct ipv6hdr *hdr)
+{
+ /* get ecn dscp data in a byteformat as: ECN(hi) + DSCP(lo) */
+ u8 tc = lowpan_iphc_get_tc(hdr), tf[4], val;
+
+ /* printout the traffic class in hc format */
+ pr_debug("tc 0x%02x\n", tc);
+
+ if (lowpan_iphc_is_flow_lbl_zero(hdr)) {
+ if (!tc) {
+ /* 11: Traffic Class and Flow Label are elided. */
+ val = LOWPAN_IPHC_TF_11;
+ } else {
+ /* 10: ECN + DSCP (1 byte), Flow Label is elided.
+ *
+ * 0 1 2 3 4 5 6 7
+ * +-+-+-+-+-+-+-+-+
+ * |ECN| DSCP |
+ * +-+-+-+-+-+-+-+-+
+ */
+ lowpan_push_hc_data(hc_ptr, &tc, sizeof(tc));
+ val = LOWPAN_IPHC_TF_10;
+ }
+ } else {
+ /* check if dscp is zero, it's after the first two bit */
+ if (!(tc & 0x3f)) {
+ /* 01: ECN + 2-bit Pad + Flow Label (3 bytes), DSCP is elided
+ *
+ * 1 2
+ * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |ECN|rsv| Flow Label |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ */
+ memcpy(&tf[0], &hdr->flow_lbl[0], 3);
+ /* zero the highest 4-bits, contains DCSP + ECN */
+ tf[0] &= ~0xf0;
+ /* set ECN */
+ tf[0] |= (tc & 0xc0);
+
+ lowpan_push_hc_data(hc_ptr, tf, 3);
+ val = LOWPAN_IPHC_TF_01;
+ } else {
+ /* 00: ECN + DSCP + 4-bit Pad + Flow Label (4 bytes)
+ *
+ * 1 2 3
+ * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |ECN| DSCP | rsv | Flow Label |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ */
+ memcpy(&tf[0], &tc, sizeof(tc));
+ /* highest nibble of flow_lbl[0] is part of DSCP + ECN
+ * which will be the 4-bit pad and will be filled with
+ * zeros afterwards.
+ */
+ memcpy(&tf[1], &hdr->flow_lbl[0], 3);
+ /* zero the 4-bit pad, which is reserved */
+ tf[1] &= ~0xf0;
+
+ lowpan_push_hc_data(hc_ptr, tf, 4);
+ val = LOWPAN_IPHC_TF_00;
+ }
+ }
+
+ return val;
+}
+
+static u8 lowpan_iphc_mcast_ctx_addr_compress(u8 **hc_ptr,
+ const struct lowpan_iphc_ctx *ctx,
+ const struct in6_addr *ipaddr)
+{
+ u8 data[6];
+
+ /* flags/scope, reserved (RIID) */
+ memcpy(data, &ipaddr->s6_addr[1], 2);
+ /* group ID */
+ memcpy(&data[1], &ipaddr->s6_addr[11], 4);
+ lowpan_push_hc_data(hc_ptr, data, 6);
+
+ return LOWPAN_IPHC_DAM_00;
+}
+
+static u8 lowpan_iphc_mcast_addr_compress(u8 **hc_ptr,
+ const struct in6_addr *ipaddr)
+{
+ u8 val;
+
+ if (lowpan_is_mcast_addr_compressable8(ipaddr)) {
+ pr_debug("compressed to 1 octet\n");
+ /* use last byte */
+ lowpan_push_hc_data(hc_ptr, &ipaddr->s6_addr[15], 1);
+ val = LOWPAN_IPHC_DAM_11;
+ } else if (lowpan_is_mcast_addr_compressable32(ipaddr)) {
+ pr_debug("compressed to 4 octets\n");
+ /* second byte + the last three */
+ lowpan_push_hc_data(hc_ptr, &ipaddr->s6_addr[1], 1);
+ lowpan_push_hc_data(hc_ptr, &ipaddr->s6_addr[13], 3);
+ val = LOWPAN_IPHC_DAM_10;
+ } else if (lowpan_is_mcast_addr_compressable48(ipaddr)) {
+ pr_debug("compressed to 6 octets\n");
+ /* second byte + the last five */
+ lowpan_push_hc_data(hc_ptr, &ipaddr->s6_addr[1], 1);
+ lowpan_push_hc_data(hc_ptr, &ipaddr->s6_addr[11], 5);
+ val = LOWPAN_IPHC_DAM_01;
+ } else {
+ pr_debug("using full address\n");
+ lowpan_push_hc_data(hc_ptr, ipaddr->s6_addr, 16);
+ val = LOWPAN_IPHC_DAM_00;
+ }
+
+ return val;
+}
+
+int lowpan_header_compress(struct sk_buff *skb, const struct net_device *dev,
+ const void *daddr, const void *saddr)
+{
+ u8 iphc0, iphc1, *hc_ptr, cid = 0;
+ struct ipv6hdr *hdr;
+ u8 head[LOWPAN_IPHC_MAX_HC_BUF_LEN] = {};
+ struct lowpan_iphc_ctx *dci, *sci, dci_entry, sci_entry;
+ int ret, ipv6_daddr_type, ipv6_saddr_type;
+
+ if (skb->protocol != htons(ETH_P_IPV6))
+ return -EINVAL;
+
+ hdr = ipv6_hdr(skb);
+ hc_ptr = head + 2;
+
+ pr_debug("IPv6 header dump:\n\tversion = %d\n\tlength = %d\n"
+ "\tnexthdr = 0x%02x\n\thop_lim = %d\n\tdest = %pI6c\n",
+ hdr->version, ntohs(hdr->payload_len), hdr->nexthdr,
+ hdr->hop_limit, &hdr->daddr);
+
+ raw_dump_table(__func__, "raw skb network header dump",
+ skb_network_header(skb), sizeof(struct ipv6hdr));
+
+ /* As we copy some bit-length fields, in the IPHC encoding bytes,
+ * we sometimes use |=
+ * If the field is 0, and the current bit value in memory is 1,
+ * this does not work. We therefore reset the IPHC encoding here
+ */
+ iphc0 = LOWPAN_DISPATCH_IPHC;
+ iphc1 = 0;
+
+ raw_dump_table(__func__, "sending raw skb network uncompressed packet",
+ skb->data, skb->len);
+
+ ipv6_daddr_type = ipv6_addr_type(&hdr->daddr);
+ spin_lock_bh(&lowpan_dev(dev)->ctx.lock);
+ if (ipv6_daddr_type & IPV6_ADDR_MULTICAST)
+ dci = lowpan_iphc_ctx_get_by_mcast_addr(dev, &hdr->daddr);
+ else
+ dci = lowpan_iphc_ctx_get_by_addr(dev, &hdr->daddr);
+ if (dci) {
+ memcpy(&dci_entry, dci, sizeof(*dci));
+ cid |= dci->id;
+ }
+ spin_unlock_bh(&lowpan_dev(dev)->ctx.lock);
+
+ spin_lock_bh(&lowpan_dev(dev)->ctx.lock);
+ sci = lowpan_iphc_ctx_get_by_addr(dev, &hdr->saddr);
+ if (sci) {
+ memcpy(&sci_entry, sci, sizeof(*sci));
+ cid |= (sci->id << 4);
+ }
+ spin_unlock_bh(&lowpan_dev(dev)->ctx.lock);
+
+ /* if cid is zero it will be compressed */
+ if (cid) {
+ iphc1 |= LOWPAN_IPHC_CID;
+ lowpan_push_hc_data(&hc_ptr, &cid, sizeof(cid));
+ }
+
+ /* Traffic Class, Flow Label compression */
+ iphc0 |= lowpan_iphc_tf_compress(&hc_ptr, hdr);
+
+ /* NOTE: payload length is always compressed */
+
+ /* Check if we provide the nhc format for nexthdr and compression
+ * functionality. If not nexthdr is handled inline and not compressed.
+ */
+ ret = lowpan_nhc_check_compression(skb, hdr, &hc_ptr);
+ if (ret == -ENOENT)
+ lowpan_push_hc_data(&hc_ptr, &hdr->nexthdr,
+ sizeof(hdr->nexthdr));
+ else
+ iphc0 |= LOWPAN_IPHC_NH;
+
+ /* Hop limit
+ * if 1: compress, encoding is 01
+ * if 64: compress, encoding is 10
+ * if 255: compress, encoding is 11
+ * else do not compress
+ */
+ switch (hdr->hop_limit) {
+ case 1:
+ iphc0 |= LOWPAN_IPHC_HLIM_01;
+ break;
+ case 64:
+ iphc0 |= LOWPAN_IPHC_HLIM_10;
+ break;
+ case 255:
+ iphc0 |= LOWPAN_IPHC_HLIM_11;
+ break;
+ default:
+ lowpan_push_hc_data(&hc_ptr, &hdr->hop_limit,
+ sizeof(hdr->hop_limit));
+ }
+
+ ipv6_saddr_type = ipv6_addr_type(&hdr->saddr);
+ /* source address compression */
+ if (ipv6_saddr_type == IPV6_ADDR_ANY) {
+ pr_debug("source address is unspecified, setting SAC\n");
+ iphc1 |= LOWPAN_IPHC_SAC;
+ } else {
+ if (sci) {
+ iphc1 |= lowpan_compress_ctx_addr(&hc_ptr, dev,
+ &hdr->saddr,
+ &sci_entry, saddr,
+ true);
+ iphc1 |= LOWPAN_IPHC_SAC;
+ } else {
+ if (ipv6_saddr_type & IPV6_ADDR_LINKLOCAL &&
+ lowpan_is_linklocal_zero_padded(hdr->saddr)) {
+ iphc1 |= lowpan_compress_addr_64(&hc_ptr, dev,
+ &hdr->saddr,
+ saddr, true);
+ pr_debug("source address unicast link-local %pI6c iphc1 0x%02x\n",
+ &hdr->saddr, iphc1);
+ } else {
+ pr_debug("send the full source address\n");
+ lowpan_push_hc_data(&hc_ptr,
+ hdr->saddr.s6_addr, 16);
+ }
+ }
+ }
+
+ /* destination address compression */
+ if (ipv6_daddr_type & IPV6_ADDR_MULTICAST) {
+ pr_debug("destination address is multicast: ");
+ iphc1 |= LOWPAN_IPHC_M;
+ if (dci) {
+ iphc1 |= lowpan_iphc_mcast_ctx_addr_compress(&hc_ptr,
+ &dci_entry,
+ &hdr->daddr);
+ iphc1 |= LOWPAN_IPHC_DAC;
+ } else {
+ iphc1 |= lowpan_iphc_mcast_addr_compress(&hc_ptr,
+ &hdr->daddr);
+ }
+ } else {
+ if (dci) {
+ iphc1 |= lowpan_compress_ctx_addr(&hc_ptr, dev,
+ &hdr->daddr,
+ &dci_entry, daddr,
+ false);
+ iphc1 |= LOWPAN_IPHC_DAC;
+ } else {
+ if (ipv6_daddr_type & IPV6_ADDR_LINKLOCAL &&
+ lowpan_is_linklocal_zero_padded(hdr->daddr)) {
+ iphc1 |= lowpan_compress_addr_64(&hc_ptr, dev,
+ &hdr->daddr,
+ daddr, false);
+ pr_debug("dest address unicast link-local %pI6c iphc1 0x%02x\n",
+ &hdr->daddr, iphc1);
+ } else {
+ pr_debug("dest address unicast %pI6c\n",
+ &hdr->daddr);
+ lowpan_push_hc_data(&hc_ptr,
+ hdr->daddr.s6_addr, 16);
+ }
+ }
+ }
+
+ /* next header compression */
+ if (iphc0 & LOWPAN_IPHC_NH) {
+ ret = lowpan_nhc_do_compression(skb, hdr, &hc_ptr);
+ if (ret < 0)
+ return ret;
+ }
+
+ head[0] = iphc0;
+ head[1] = iphc1;
+
+ skb_pull(skb, sizeof(struct ipv6hdr));
+ skb_reset_transport_header(skb);
+ memcpy(skb_push(skb, hc_ptr - head), head, hc_ptr - head);
+ skb_reset_network_header(skb);
+
+ pr_debug("header len %d skb %u\n", (int)(hc_ptr - head), skb->len);
+
+ raw_dump_table(__func__, "raw skb data dump compressed",
+ skb->data, skb->len);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(lowpan_header_compress);
diff --git a/net/6lowpan/ndisc.c b/net/6lowpan/ndisc.c
new file mode 100644
index 000000000000..868d28583c0a
--- /dev/null
+++ b/net/6lowpan/ndisc.c
@@ -0,0 +1,223 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *
+ * Authors:
+ * (C) 2016 Pengutronix, Alexander Aring <aar@pengutronix.de>
+ */
+
+#include <net/6lowpan.h>
+#include <net/addrconf.h>
+#include <net/ndisc.h>
+
+#include "6lowpan_i.h"
+
+#if IS_ENABLED(CONFIG_IEEE802154_6LOWPAN)
+#define NDISC_802154_SHORT_ADDR_LENGTH 1
+static int lowpan_ndisc_parse_802154_options(const struct net_device *dev,
+ struct nd_opt_hdr *nd_opt,
+ struct ndisc_options *ndopts)
+{
+ switch (nd_opt->nd_opt_len) {
+ case NDISC_802154_SHORT_ADDR_LENGTH:
+ if (ndopts->nd_802154_opt_array[nd_opt->nd_opt_type])
+ net_dbg_ratelimited("%s: duplicated short addr ND6 option found: type=%d\n",
+ __func__, nd_opt->nd_opt_type);
+ else
+ ndopts->nd_802154_opt_array[nd_opt->nd_opt_type] = nd_opt;
+ return 1;
+ default:
+ /* all others will be handled by ndisc IPv6 option parsing */
+ return 0;
+ }
+}
+
+static int lowpan_ndisc_parse_options(const struct net_device *dev,
+ struct nd_opt_hdr *nd_opt,
+ struct ndisc_options *ndopts)
+{
+ if (!lowpan_is_ll(dev, LOWPAN_LLTYPE_IEEE802154))
+ return 0;
+
+ switch (nd_opt->nd_opt_type) {
+ case ND_OPT_SOURCE_LL_ADDR:
+ case ND_OPT_TARGET_LL_ADDR:
+ return lowpan_ndisc_parse_802154_options(dev, nd_opt, ndopts);
+ default:
+ return 0;
+ }
+}
+
+static void lowpan_ndisc_802154_update(struct neighbour *n, u32 flags,
+ u8 icmp6_type,
+ const struct ndisc_options *ndopts)
+{
+ struct lowpan_802154_neigh *neigh = lowpan_802154_neigh(neighbour_priv(n));
+ u8 *lladdr_short = NULL;
+
+ switch (icmp6_type) {
+ case NDISC_ROUTER_SOLICITATION:
+ case NDISC_ROUTER_ADVERTISEMENT:
+ case NDISC_NEIGHBOUR_SOLICITATION:
+ if (ndopts->nd_802154_opts_src_lladdr) {
+ lladdr_short = __ndisc_opt_addr_data(ndopts->nd_802154_opts_src_lladdr,
+ IEEE802154_SHORT_ADDR_LEN, 0);
+ if (!lladdr_short) {
+ net_dbg_ratelimited("NA: invalid short link-layer address length\n");
+ return;
+ }
+ }
+ break;
+ case NDISC_REDIRECT:
+ case NDISC_NEIGHBOUR_ADVERTISEMENT:
+ if (ndopts->nd_802154_opts_tgt_lladdr) {
+ lladdr_short = __ndisc_opt_addr_data(ndopts->nd_802154_opts_tgt_lladdr,
+ IEEE802154_SHORT_ADDR_LEN, 0);
+ if (!lladdr_short) {
+ net_dbg_ratelimited("NA: invalid short link-layer address length\n");
+ return;
+ }
+ }
+ break;
+ default:
+ break;
+ }
+
+ write_lock_bh(&n->lock);
+ if (lladdr_short) {
+ ieee802154_be16_to_le16(&neigh->short_addr, lladdr_short);
+ if (!lowpan_802154_is_valid_src_short_addr(neigh->short_addr))
+ neigh->short_addr = cpu_to_le16(IEEE802154_ADDR_SHORT_UNSPEC);
+ }
+ write_unlock_bh(&n->lock);
+}
+
+static void lowpan_ndisc_update(const struct net_device *dev,
+ struct neighbour *n, u32 flags, u8 icmp6_type,
+ const struct ndisc_options *ndopts)
+{
+ if (!lowpan_is_ll(dev, LOWPAN_LLTYPE_IEEE802154))
+ return;
+
+ /* react on overrides only. TODO check if this is really right. */
+ if (flags & NEIGH_UPDATE_F_OVERRIDE)
+ lowpan_ndisc_802154_update(n, flags, icmp6_type, ndopts);
+}
+
+static int lowpan_ndisc_opt_addr_space(const struct net_device *dev,
+ u8 icmp6_type, struct neighbour *neigh,
+ u8 *ha_buf, u8 **ha)
+{
+ struct lowpan_802154_neigh *n;
+ struct wpan_dev *wpan_dev;
+ int addr_space = 0;
+
+ if (!lowpan_is_ll(dev, LOWPAN_LLTYPE_IEEE802154))
+ return 0;
+
+ switch (icmp6_type) {
+ case NDISC_REDIRECT:
+ n = lowpan_802154_neigh(neighbour_priv(neigh));
+
+ read_lock_bh(&neigh->lock);
+ if (lowpan_802154_is_valid_src_short_addr(n->short_addr)) {
+ memcpy(ha_buf, &n->short_addr,
+ IEEE802154_SHORT_ADDR_LEN);
+ read_unlock_bh(&neigh->lock);
+ addr_space += __ndisc_opt_addr_space(IEEE802154_SHORT_ADDR_LEN, 0);
+ *ha = ha_buf;
+ } else {
+ read_unlock_bh(&neigh->lock);
+ }
+ break;
+ case NDISC_NEIGHBOUR_ADVERTISEMENT:
+ case NDISC_NEIGHBOUR_SOLICITATION:
+ case NDISC_ROUTER_SOLICITATION:
+ wpan_dev = lowpan_802154_dev(dev)->wdev->ieee802154_ptr;
+
+ if (lowpan_802154_is_valid_src_short_addr(wpan_dev->short_addr))
+ addr_space = __ndisc_opt_addr_space(IEEE802154_SHORT_ADDR_LEN, 0);
+ break;
+ default:
+ break;
+ }
+
+ return addr_space;
+}
+
+static void lowpan_ndisc_fill_addr_option(const struct net_device *dev,
+ struct sk_buff *skb, u8 icmp6_type,
+ const u8 *ha)
+{
+ struct wpan_dev *wpan_dev;
+ __be16 short_addr;
+ u8 opt_type;
+
+ if (!lowpan_is_ll(dev, LOWPAN_LLTYPE_IEEE802154))
+ return;
+
+ switch (icmp6_type) {
+ case NDISC_REDIRECT:
+ if (ha) {
+ ieee802154_le16_to_be16(&short_addr, ha);
+ __ndisc_fill_addr_option(skb, ND_OPT_TARGET_LL_ADDR,
+ &short_addr,
+ IEEE802154_SHORT_ADDR_LEN, 0);
+ }
+ return;
+ case NDISC_NEIGHBOUR_ADVERTISEMENT:
+ opt_type = ND_OPT_TARGET_LL_ADDR;
+ break;
+ case NDISC_ROUTER_SOLICITATION:
+ case NDISC_NEIGHBOUR_SOLICITATION:
+ opt_type = ND_OPT_SOURCE_LL_ADDR;
+ break;
+ default:
+ return;
+ }
+
+ wpan_dev = lowpan_802154_dev(dev)->wdev->ieee802154_ptr;
+
+ if (lowpan_802154_is_valid_src_short_addr(wpan_dev->short_addr)) {
+ ieee802154_le16_to_be16(&short_addr,
+ &wpan_dev->short_addr);
+ __ndisc_fill_addr_option(skb, opt_type, &short_addr,
+ IEEE802154_SHORT_ADDR_LEN, 0);
+ }
+}
+
+static void lowpan_ndisc_prefix_rcv_add_addr(struct net *net,
+ struct net_device *dev,
+ const struct prefix_info *pinfo,
+ struct inet6_dev *in6_dev,
+ struct in6_addr *addr,
+ int addr_type, u32 addr_flags,
+ bool sllao, bool tokenized,
+ __u32 valid_lft,
+ u32 prefered_lft,
+ bool dev_addr_generated)
+{
+ int err;
+
+ /* generates short based address for RA PIO's */
+ if (lowpan_is_ll(dev, LOWPAN_LLTYPE_IEEE802154) && dev_addr_generated &&
+ !addrconf_ifid_802154_6lowpan(addr->s6_addr + 8, dev)) {
+ err = addrconf_prefix_rcv_add_addr(net, dev, pinfo, in6_dev,
+ addr, addr_type, addr_flags,
+ sllao, tokenized, valid_lft,
+ prefered_lft);
+ if (err)
+ net_dbg_ratelimited("RA: could not add a short address based address for prefix: %pI6c\n",
+ &pinfo->prefix);
+ }
+}
+#endif
+
+const struct ndisc_ops lowpan_ndisc_ops = {
+#if IS_ENABLED(CONFIG_IEEE802154_6LOWPAN)
+ .parse_options = lowpan_ndisc_parse_options,
+ .update = lowpan_ndisc_update,
+ .opt_addr_space = lowpan_ndisc_opt_addr_space,
+ .fill_addr_option = lowpan_ndisc_fill_addr_option,
+ .prefix_rcv_add_addr = lowpan_ndisc_prefix_rcv_add_addr,
+#endif
+};
diff --git a/net/6lowpan/nhc.c b/net/6lowpan/nhc.c
new file mode 100644
index 000000000000..7b374595328d
--- /dev/null
+++ b/net/6lowpan/nhc.c
@@ -0,0 +1,169 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * 6LoWPAN next header compression
+ *
+ * Authors:
+ * Alexander Aring <aar@pengutronix.de>
+ */
+
+#include <linux/netdevice.h>
+
+#include <net/ipv6.h>
+
+#include "nhc.h"
+
+static const struct lowpan_nhc *lowpan_nexthdr_nhcs[NEXTHDR_MAX + 1];
+static DEFINE_SPINLOCK(lowpan_nhc_lock);
+
+static const struct lowpan_nhc *lowpan_nhc_by_nhcid(struct sk_buff *skb)
+{
+ const struct lowpan_nhc *nhc;
+ int i;
+ u8 id;
+
+ if (!pskb_may_pull(skb, 1))
+ return NULL;
+
+ id = *skb->data;
+
+ for (i = 0; i < NEXTHDR_MAX + 1; i++) {
+ nhc = lowpan_nexthdr_nhcs[i];
+ if (!nhc)
+ continue;
+
+ if ((id & nhc->idmask) == nhc->id)
+ return nhc;
+ }
+
+ return NULL;
+}
+
+int lowpan_nhc_check_compression(struct sk_buff *skb,
+ const struct ipv6hdr *hdr, u8 **hc_ptr)
+{
+ const struct lowpan_nhc *nhc;
+ int ret = 0;
+
+ spin_lock_bh(&lowpan_nhc_lock);
+
+ nhc = lowpan_nexthdr_nhcs[hdr->nexthdr];
+ if (!(nhc && nhc->compress))
+ ret = -ENOENT;
+
+ spin_unlock_bh(&lowpan_nhc_lock);
+
+ return ret;
+}
+
+int lowpan_nhc_do_compression(struct sk_buff *skb, const struct ipv6hdr *hdr,
+ u8 **hc_ptr)
+{
+ int ret;
+ const struct lowpan_nhc *nhc;
+
+ spin_lock_bh(&lowpan_nhc_lock);
+
+ nhc = lowpan_nexthdr_nhcs[hdr->nexthdr];
+ /* check if the nhc module was removed in unlocked part.
+ * TODO: this is a workaround we should prevent unloading
+ * of nhc modules while unlocked part, this will always drop
+ * the lowpan packet but it's very unlikely.
+ *
+ * Solution isn't easy because we need to decide at
+ * lowpan_nhc_check_compression if we do a compression or not.
+ * Because the inline data which is added to skb, we can't move this
+ * handling.
+ */
+ if (unlikely(!nhc || !nhc->compress)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* In the case of RAW sockets the transport header is not set by
+ * the ip6 stack so we must set it ourselves
+ */
+ if (skb->transport_header == skb->network_header)
+ skb_set_transport_header(skb, sizeof(struct ipv6hdr));
+
+ ret = nhc->compress(skb, hc_ptr);
+ if (ret < 0)
+ goto out;
+
+ /* skip the transport header */
+ skb_pull(skb, nhc->nexthdrlen);
+
+out:
+ spin_unlock_bh(&lowpan_nhc_lock);
+
+ return ret;
+}
+
+int lowpan_nhc_do_uncompression(struct sk_buff *skb,
+ const struct net_device *dev,
+ struct ipv6hdr *hdr)
+{
+ const struct lowpan_nhc *nhc;
+ int ret;
+
+ spin_lock_bh(&lowpan_nhc_lock);
+
+ nhc = lowpan_nhc_by_nhcid(skb);
+ if (nhc) {
+ if (nhc->uncompress) {
+ ret = nhc->uncompress(skb, sizeof(struct ipv6hdr) +
+ nhc->nexthdrlen);
+ if (ret < 0) {
+ spin_unlock_bh(&lowpan_nhc_lock);
+ return ret;
+ }
+ } else {
+ spin_unlock_bh(&lowpan_nhc_lock);
+ netdev_warn(dev, "received nhc id for %s which is not implemented.\n",
+ nhc->name);
+ return -ENOTSUPP;
+ }
+ } else {
+ spin_unlock_bh(&lowpan_nhc_lock);
+ netdev_warn(dev, "received unknown nhc id which was not found.\n");
+ return -ENOENT;
+ }
+
+ hdr->nexthdr = nhc->nexthdr;
+ skb_reset_transport_header(skb);
+ raw_dump_table(__func__, "raw transport header dump",
+ skb_transport_header(skb), nhc->nexthdrlen);
+
+ spin_unlock_bh(&lowpan_nhc_lock);
+
+ return 0;
+}
+
+int lowpan_nhc_add(const struct lowpan_nhc *nhc)
+{
+ int ret = 0;
+
+ spin_lock_bh(&lowpan_nhc_lock);
+
+ if (lowpan_nexthdr_nhcs[nhc->nexthdr]) {
+ ret = -EEXIST;
+ goto out;
+ }
+
+ lowpan_nexthdr_nhcs[nhc->nexthdr] = nhc;
+out:
+ spin_unlock_bh(&lowpan_nhc_lock);
+ return ret;
+}
+EXPORT_SYMBOL(lowpan_nhc_add);
+
+void lowpan_nhc_del(const struct lowpan_nhc *nhc)
+{
+ spin_lock_bh(&lowpan_nhc_lock);
+
+ lowpan_nexthdr_nhcs[nhc->nexthdr] = NULL;
+
+ spin_unlock_bh(&lowpan_nhc_lock);
+
+ synchronize_net();
+}
+EXPORT_SYMBOL(lowpan_nhc_del);
diff --git a/net/6lowpan/nhc.h b/net/6lowpan/nhc.h
new file mode 100644
index 000000000000..ab7b4977c32b
--- /dev/null
+++ b/net/6lowpan/nhc.h
@@ -0,0 +1,133 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __6LOWPAN_NHC_H
+#define __6LOWPAN_NHC_H
+
+#include <linux/skbuff.h>
+#include <linux/rbtree.h>
+#include <linux/module.h>
+
+#include <net/6lowpan.h>
+#include <net/ipv6.h>
+
+/**
+ * LOWPAN_NHC - helper macro to generate nh id fields and lowpan_nhc struct
+ *
+ * @__nhc: variable name of the lowpan_nhc struct.
+ * @_name: const char * of common header compression name.
+ * @_nexthdr: ipv6 nexthdr field for the header compression.
+ * @_nexthdrlen: ipv6 nexthdr len for the reserved space.
+ * @_id: one byte nhc id value.
+ * @_idmask: one byte nhc id mask value.
+ * @_uncompress: callback for uncompression call.
+ * @_compress: callback for compression call.
+ */
+#define LOWPAN_NHC(__nhc, _name, _nexthdr, \
+ _hdrlen, _id, _idmask, \
+ _uncompress, _compress) \
+static const struct lowpan_nhc __nhc = { \
+ .name = _name, \
+ .nexthdr = _nexthdr, \
+ .nexthdrlen = _hdrlen, \
+ .id = _id, \
+ .idmask = _idmask, \
+ .uncompress = _uncompress, \
+ .compress = _compress, \
+}
+
+#define module_lowpan_nhc(__nhc) \
+static int __init __nhc##_init(void) \
+{ \
+ return lowpan_nhc_add(&(__nhc)); \
+} \
+module_init(__nhc##_init); \
+static void __exit __nhc##_exit(void) \
+{ \
+ lowpan_nhc_del(&(__nhc)); \
+} \
+module_exit(__nhc##_exit);
+
+/**
+ * struct lowpan_nhc - hold 6lowpan next hdr compression ifnformation
+ *
+ * @name: name of the specific next header compression
+ * @nexthdr: next header value of the protocol which should be compressed.
+ * @nexthdrlen: ipv6 nexthdr len for the reserved space.
+ * @id: one byte nhc id value.
+ * @idmask: one byte nhc id mask value.
+ * @compress: callback to do the header compression.
+ * @uncompress: callback to do the header uncompression.
+ */
+struct lowpan_nhc {
+ const char *name;
+ u8 nexthdr;
+ size_t nexthdrlen;
+ u8 id;
+ u8 idmask;
+
+ int (*uncompress)(struct sk_buff *skb, size_t needed);
+ int (*compress)(struct sk_buff *skb, u8 **hc_ptr);
+};
+
+/**
+ * lowpan_nhc_by_nexthdr - return the 6lowpan nhc by ipv6 nexthdr.
+ *
+ * @nexthdr: ipv6 nexthdr value.
+ */
+struct lowpan_nhc *lowpan_nhc_by_nexthdr(u8 nexthdr);
+
+/**
+ * lowpan_nhc_check_compression - checks if we support compression format. If
+ * we support the nhc by nexthdr field, the function will return 0. If we
+ * don't support the nhc by nexthdr this function will return -ENOENT.
+ *
+ * @skb: skb of 6LoWPAN header to read nhc and replace header.
+ * @hdr: ipv6hdr to check the nexthdr value
+ * @hc_ptr: pointer for 6LoWPAN header which should increment at the end of
+ * replaced header.
+ */
+int lowpan_nhc_check_compression(struct sk_buff *skb,
+ const struct ipv6hdr *hdr, u8 **hc_ptr);
+
+/**
+ * lowpan_nhc_do_compression - calling compress callback for nhc
+ *
+ * @skb: skb of 6LoWPAN header to read nhc and replace header.
+ * @hdr: ipv6hdr to set the nexthdr value
+ * @hc_ptr: pointer for 6LoWPAN header which should increment at the end of
+ * replaced header.
+ */
+int lowpan_nhc_do_compression(struct sk_buff *skb, const struct ipv6hdr *hdr,
+ u8 **hc_ptr);
+
+/**
+ * lowpan_nhc_do_uncompression - calling uncompress callback for nhc
+ *
+ * @nhc: 6LoWPAN nhc context, get by lowpan_nhc_by_ functions.
+ * @skb: skb of 6LoWPAN header, skb->data should be pointed to nhc id value.
+ * @dev: netdevice for print logging information.
+ * @hdr: ipv6hdr for setting nexthdr value.
+ */
+int lowpan_nhc_do_uncompression(struct sk_buff *skb,
+ const struct net_device *dev,
+ struct ipv6hdr *hdr);
+
+/**
+ * lowpan_nhc_add - register a next header compression to framework
+ *
+ * @nhc: nhc which should be add.
+ */
+int lowpan_nhc_add(const struct lowpan_nhc *nhc);
+
+/**
+ * lowpan_nhc_del - delete a next header compression from framework
+ *
+ * @nhc: nhc which should be delete.
+ */
+void lowpan_nhc_del(const struct lowpan_nhc *nhc);
+
+/**
+ * lowpan_nhc_init - adding all default nhcs
+ */
+void lowpan_nhc_init(void);
+
+#endif /* __6LOWPAN_NHC_H */
diff --git a/net/6lowpan/nhc_dest.c b/net/6lowpan/nhc_dest.c
new file mode 100644
index 000000000000..0cbcc7806469
--- /dev/null
+++ b/net/6lowpan/nhc_dest.c
@@ -0,0 +1,17 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * 6LoWPAN IPv6 Destination Options Header compression according to
+ * RFC6282
+ */
+
+#include "nhc.h"
+
+#define LOWPAN_NHC_DEST_ID_0 0xe6
+#define LOWPAN_NHC_DEST_MASK_0 0xfe
+
+LOWPAN_NHC(nhc_dest, "RFC6282 Destination Options", NEXTHDR_DEST, 0,
+ LOWPAN_NHC_DEST_ID_0, LOWPAN_NHC_DEST_MASK_0, NULL, NULL);
+
+module_lowpan_nhc(nhc_dest);
+MODULE_DESCRIPTION("6LoWPAN next header RFC6282 Destination Options compression");
+MODULE_LICENSE("GPL");
diff --git a/net/6lowpan/nhc_fragment.c b/net/6lowpan/nhc_fragment.c
new file mode 100644
index 000000000000..9414552df0ac
--- /dev/null
+++ b/net/6lowpan/nhc_fragment.c
@@ -0,0 +1,16 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * 6LoWPAN IPv6 Fragment Header compression according to RFC6282
+ */
+
+#include "nhc.h"
+
+#define LOWPAN_NHC_FRAGMENT_ID_0 0xe4
+#define LOWPAN_NHC_FRAGMENT_MASK_0 0xfe
+
+LOWPAN_NHC(nhc_fragment, "RFC6282 Fragment", NEXTHDR_FRAGMENT, 0,
+ LOWPAN_NHC_FRAGMENT_ID_0, LOWPAN_NHC_FRAGMENT_MASK_0, NULL, NULL);
+
+module_lowpan_nhc(nhc_fragment);
+MODULE_DESCRIPTION("6LoWPAN next header RFC6282 Fragment compression");
+MODULE_LICENSE("GPL");
diff --git a/net/6lowpan/nhc_ghc_ext_dest.c b/net/6lowpan/nhc_ghc_ext_dest.c
new file mode 100644
index 000000000000..e4745ddd10a8
--- /dev/null
+++ b/net/6lowpan/nhc_ghc_ext_dest.c
@@ -0,0 +1,16 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * 6LoWPAN Extension Header compression according to RFC7400
+ */
+
+#include "nhc.h"
+
+#define LOWPAN_GHC_EXT_DEST_ID_0 0xb6
+#define LOWPAN_GHC_EXT_DEST_MASK_0 0xfe
+
+LOWPAN_NHC(ghc_ext_dest, "RFC7400 Destination Extension Header", NEXTHDR_DEST,
+ 0, LOWPAN_GHC_EXT_DEST_ID_0, LOWPAN_GHC_EXT_DEST_MASK_0, NULL, NULL);
+
+module_lowpan_nhc(ghc_ext_dest);
+MODULE_DESCRIPTION("6LoWPAN generic header destination extension compression");
+MODULE_LICENSE("GPL");
diff --git a/net/6lowpan/nhc_ghc_ext_frag.c b/net/6lowpan/nhc_ghc_ext_frag.c
new file mode 100644
index 000000000000..220e5abfa946
--- /dev/null
+++ b/net/6lowpan/nhc_ghc_ext_frag.c
@@ -0,0 +1,17 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * 6LoWPAN Extension Header compression according to RFC7400
+ */
+
+#include "nhc.h"
+
+#define LOWPAN_GHC_EXT_FRAG_ID_0 0xb4
+#define LOWPAN_GHC_EXT_FRAG_MASK_0 0xfe
+
+LOWPAN_NHC(ghc_ext_frag, "RFC7400 Fragmentation Extension Header",
+ NEXTHDR_FRAGMENT, 0, LOWPAN_GHC_EXT_FRAG_ID_0,
+ LOWPAN_GHC_EXT_FRAG_MASK_0, NULL, NULL);
+
+module_lowpan_nhc(ghc_ext_frag);
+MODULE_DESCRIPTION("6LoWPAN generic header fragmentation extension compression");
+MODULE_LICENSE("GPL");
diff --git a/net/6lowpan/nhc_ghc_ext_hop.c b/net/6lowpan/nhc_ghc_ext_hop.c
new file mode 100644
index 000000000000..9b0de4da7379
--- /dev/null
+++ b/net/6lowpan/nhc_ghc_ext_hop.c
@@ -0,0 +1,16 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * 6LoWPAN Extension Header compression according to RFC7400
+ */
+
+#include "nhc.h"
+
+#define LOWPAN_GHC_EXT_HOP_ID_0 0xb0
+#define LOWPAN_GHC_EXT_HOP_MASK_0 0xfe
+
+LOWPAN_NHC(ghc_ext_hop, "RFC7400 Hop-by-Hop Extension Header", NEXTHDR_HOP, 0,
+ LOWPAN_GHC_EXT_HOP_ID_0, LOWPAN_GHC_EXT_HOP_MASK_0, NULL, NULL);
+
+module_lowpan_nhc(ghc_ext_hop);
+MODULE_DESCRIPTION("6LoWPAN generic header hop-by-hop extension compression");
+MODULE_LICENSE("GPL");
diff --git a/net/6lowpan/nhc_ghc_ext_route.c b/net/6lowpan/nhc_ghc_ext_route.c
new file mode 100644
index 000000000000..3e86faec59c9
--- /dev/null
+++ b/net/6lowpan/nhc_ghc_ext_route.c
@@ -0,0 +1,16 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * 6LoWPAN Extension Header compression according to RFC7400
+ */
+
+#include "nhc.h"
+
+#define LOWPAN_GHC_EXT_ROUTE_ID_0 0xb2
+#define LOWPAN_GHC_EXT_ROUTE_MASK_0 0xfe
+
+LOWPAN_NHC(ghc_ext_route, "RFC7400 Routing Extension Header", NEXTHDR_ROUTING,
+ 0, LOWPAN_GHC_EXT_ROUTE_ID_0, LOWPAN_GHC_EXT_ROUTE_MASK_0, NULL, NULL);
+
+module_lowpan_nhc(ghc_ext_route);
+MODULE_DESCRIPTION("6LoWPAN generic header routing extension compression");
+MODULE_LICENSE("GPL");
diff --git a/net/6lowpan/nhc_ghc_icmpv6.c b/net/6lowpan/nhc_ghc_icmpv6.c
new file mode 100644
index 000000000000..1634f3eb0be8
--- /dev/null
+++ b/net/6lowpan/nhc_ghc_icmpv6.c
@@ -0,0 +1,16 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * 6LoWPAN ICMPv6 compression according to RFC7400
+ */
+
+#include "nhc.h"
+
+#define LOWPAN_GHC_ICMPV6_ID_0 0xdf
+#define LOWPAN_GHC_ICMPV6_MASK_0 0xff
+
+LOWPAN_NHC(ghc_icmpv6, "RFC7400 ICMPv6", NEXTHDR_ICMP, 0,
+ LOWPAN_GHC_ICMPV6_ID_0, LOWPAN_GHC_ICMPV6_MASK_0, NULL, NULL);
+
+module_lowpan_nhc(ghc_icmpv6);
+MODULE_DESCRIPTION("6LoWPAN generic header ICMPv6 compression");
+MODULE_LICENSE("GPL");
diff --git a/net/6lowpan/nhc_ghc_udp.c b/net/6lowpan/nhc_ghc_udp.c
new file mode 100644
index 000000000000..4ac4813b77ad
--- /dev/null
+++ b/net/6lowpan/nhc_ghc_udp.c
@@ -0,0 +1,16 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * 6LoWPAN UDP compression according to RFC7400
+ */
+
+#include "nhc.h"
+
+#define LOWPAN_GHC_UDP_ID_0 0xd0
+#define LOWPAN_GHC_UDP_MASK_0 0xf8
+
+LOWPAN_NHC(ghc_udp, "RFC7400 UDP", NEXTHDR_UDP, 0,
+ LOWPAN_GHC_UDP_ID_0, LOWPAN_GHC_UDP_MASK_0, NULL, NULL);
+
+module_lowpan_nhc(ghc_udp);
+MODULE_DESCRIPTION("6LoWPAN generic header UDP compression");
+MODULE_LICENSE("GPL");
diff --git a/net/6lowpan/nhc_hop.c b/net/6lowpan/nhc_hop.c
new file mode 100644
index 000000000000..182087dfd09d
--- /dev/null
+++ b/net/6lowpan/nhc_hop.c
@@ -0,0 +1,16 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * 6LoWPAN IPv6 Hop-by-Hop Options Header compression according to RFC6282
+ */
+
+#include "nhc.h"
+
+#define LOWPAN_NHC_HOP_ID_0 0xe0
+#define LOWPAN_NHC_HOP_MASK_0 0xfe
+
+LOWPAN_NHC(nhc_hop, "RFC6282 Hop-by-Hop Options", NEXTHDR_HOP, 0,
+ LOWPAN_NHC_HOP_ID_0, LOWPAN_NHC_HOP_MASK_0, NULL, NULL);
+
+module_lowpan_nhc(nhc_hop);
+MODULE_DESCRIPTION("6LoWPAN next header RFC6282 Hop-by-Hop Options compression");
+MODULE_LICENSE("GPL");
diff --git a/net/6lowpan/nhc_ipv6.c b/net/6lowpan/nhc_ipv6.c
new file mode 100644
index 000000000000..20242360b1d4
--- /dev/null
+++ b/net/6lowpan/nhc_ipv6.c
@@ -0,0 +1,16 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * 6LoWPAN IPv6 Header compression according to RFC6282
+ */
+
+#include "nhc.h"
+
+#define LOWPAN_NHC_IPV6_ID_0 0xee
+#define LOWPAN_NHC_IPV6_MASK_0 0xfe
+
+LOWPAN_NHC(nhc_ipv6, "RFC6282 IPv6", NEXTHDR_IPV6, 0, LOWPAN_NHC_IPV6_ID_0,
+ LOWPAN_NHC_IPV6_MASK_0, NULL, NULL);
+
+module_lowpan_nhc(nhc_ipv6);
+MODULE_DESCRIPTION("6LoWPAN next header RFC6282 IPv6 compression");
+MODULE_LICENSE("GPL");
diff --git a/net/6lowpan/nhc_mobility.c b/net/6lowpan/nhc_mobility.c
new file mode 100644
index 000000000000..1c31d872c804
--- /dev/null
+++ b/net/6lowpan/nhc_mobility.c
@@ -0,0 +1,16 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * 6LoWPAN IPv6 Mobility Header compression according to RFC6282
+ */
+
+#include "nhc.h"
+
+#define LOWPAN_NHC_MOBILITY_ID_0 0xe8
+#define LOWPAN_NHC_MOBILITY_MASK_0 0xfe
+
+LOWPAN_NHC(nhc_mobility, "RFC6282 Mobility", NEXTHDR_MOBILITY, 0,
+ LOWPAN_NHC_MOBILITY_ID_0, LOWPAN_NHC_MOBILITY_MASK_0, NULL, NULL);
+
+module_lowpan_nhc(nhc_mobility);
+MODULE_DESCRIPTION("6LoWPAN next header RFC6282 Mobility compression");
+MODULE_LICENSE("GPL");
diff --git a/net/6lowpan/nhc_routing.c b/net/6lowpan/nhc_routing.c
new file mode 100644
index 000000000000..dae03ebf7021
--- /dev/null
+++ b/net/6lowpan/nhc_routing.c
@@ -0,0 +1,16 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * 6LoWPAN IPv6 Routing Header compression according to RFC6282
+ */
+
+#include "nhc.h"
+
+#define LOWPAN_NHC_ROUTING_ID_0 0xe2
+#define LOWPAN_NHC_ROUTING_MASK_0 0xfe
+
+LOWPAN_NHC(nhc_routing, "RFC6282 Routing", NEXTHDR_ROUTING, 0,
+ LOWPAN_NHC_ROUTING_ID_0, LOWPAN_NHC_ROUTING_MASK_0, NULL, NULL);
+
+module_lowpan_nhc(nhc_routing);
+MODULE_DESCRIPTION("6LoWPAN next header RFC6282 Routing compression");
+MODULE_LICENSE("GPL");
diff --git a/net/6lowpan/nhc_udp.c b/net/6lowpan/nhc_udp.c
new file mode 100644
index 000000000000..0a506c77283d
--- /dev/null
+++ b/net/6lowpan/nhc_udp.c
@@ -0,0 +1,176 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * 6LoWPAN IPv6 UDP compression according to RFC6282
+ *
+ * Authors:
+ * Alexander Aring <aar@pengutronix.de>
+ *
+ * Original written by:
+ * Alexander Smirnov <alex.bluesman.smirnov@gmail.com>
+ * Jon Smirl <jonsmirl@gmail.com>
+ */
+
+#include "nhc.h"
+
+#define LOWPAN_NHC_UDP_MASK 0xF8
+#define LOWPAN_NHC_UDP_ID 0xF0
+
+#define LOWPAN_NHC_UDP_4BIT_PORT 0xF0B0
+#define LOWPAN_NHC_UDP_4BIT_MASK 0xFFF0
+#define LOWPAN_NHC_UDP_8BIT_PORT 0xF000
+#define LOWPAN_NHC_UDP_8BIT_MASK 0xFF00
+
+/* values for port compression, _with checksum_ ie bit 5 set to 0 */
+
+/* all inline */
+#define LOWPAN_NHC_UDP_CS_P_00 0xF0
+/* source 16bit inline, dest = 0xF0 + 8 bit inline */
+#define LOWPAN_NHC_UDP_CS_P_01 0xF1
+/* source = 0xF0 + 8bit inline, dest = 16 bit inline */
+#define LOWPAN_NHC_UDP_CS_P_10 0xF2
+/* source & dest = 0xF0B + 4bit inline */
+#define LOWPAN_NHC_UDP_CS_P_11 0xF3
+/* checksum elided */
+#define LOWPAN_NHC_UDP_CS_C 0x04
+
+static int udp_uncompress(struct sk_buff *skb, size_t needed)
+{
+ u8 tmp = 0, val = 0;
+ struct udphdr uh;
+ bool fail;
+ int err;
+
+ fail = lowpan_fetch_skb(skb, &tmp, sizeof(tmp));
+
+ pr_debug("UDP header uncompression\n");
+ switch (tmp & LOWPAN_NHC_UDP_CS_P_11) {
+ case LOWPAN_NHC_UDP_CS_P_00:
+ fail |= lowpan_fetch_skb(skb, &uh.source, sizeof(uh.source));
+ fail |= lowpan_fetch_skb(skb, &uh.dest, sizeof(uh.dest));
+ break;
+ case LOWPAN_NHC_UDP_CS_P_01:
+ fail |= lowpan_fetch_skb(skb, &uh.source, sizeof(uh.source));
+ fail |= lowpan_fetch_skb(skb, &val, sizeof(val));
+ uh.dest = htons(val + LOWPAN_NHC_UDP_8BIT_PORT);
+ break;
+ case LOWPAN_NHC_UDP_CS_P_10:
+ fail |= lowpan_fetch_skb(skb, &val, sizeof(val));
+ uh.source = htons(val + LOWPAN_NHC_UDP_8BIT_PORT);
+ fail |= lowpan_fetch_skb(skb, &uh.dest, sizeof(uh.dest));
+ break;
+ case LOWPAN_NHC_UDP_CS_P_11:
+ fail |= lowpan_fetch_skb(skb, &val, sizeof(val));
+ uh.source = htons(LOWPAN_NHC_UDP_4BIT_PORT + (val >> 4));
+ uh.dest = htons(LOWPAN_NHC_UDP_4BIT_PORT + (val & 0x0f));
+ break;
+ default:
+ BUG();
+ }
+
+ pr_debug("uncompressed UDP ports: src = %d, dst = %d\n",
+ ntohs(uh.source), ntohs(uh.dest));
+
+ /* checksum */
+ if (tmp & LOWPAN_NHC_UDP_CS_C) {
+ pr_debug_ratelimited("checksum elided currently not supported\n");
+ fail = true;
+ } else {
+ fail |= lowpan_fetch_skb(skb, &uh.check, sizeof(uh.check));
+ }
+
+ if (fail)
+ return -EINVAL;
+
+ /* UDP length needs to be inferred from the lower layers
+ * here, we obtain the hint from the remaining size of the
+ * frame
+ */
+ switch (lowpan_dev(skb->dev)->lltype) {
+ case LOWPAN_LLTYPE_IEEE802154:
+ if (lowpan_802154_cb(skb)->d_size)
+ uh.len = htons(lowpan_802154_cb(skb)->d_size -
+ sizeof(struct ipv6hdr));
+ else
+ uh.len = htons(skb->len + sizeof(struct udphdr));
+ break;
+ default:
+ uh.len = htons(skb->len + sizeof(struct udphdr));
+ break;
+ }
+ pr_debug("uncompressed UDP length: src = %d", ntohs(uh.len));
+
+ /* replace the compressed UDP head by the uncompressed UDP
+ * header
+ */
+ err = skb_cow(skb, needed);
+ if (unlikely(err))
+ return err;
+
+ skb_push(skb, sizeof(struct udphdr));
+ skb_copy_to_linear_data(skb, &uh, sizeof(struct udphdr));
+
+ return 0;
+}
+
+static int udp_compress(struct sk_buff *skb, u8 **hc_ptr)
+{
+ const struct udphdr *uh = udp_hdr(skb);
+ u8 tmp;
+
+ if (((ntohs(uh->source) & LOWPAN_NHC_UDP_4BIT_MASK) ==
+ LOWPAN_NHC_UDP_4BIT_PORT) &&
+ ((ntohs(uh->dest) & LOWPAN_NHC_UDP_4BIT_MASK) ==
+ LOWPAN_NHC_UDP_4BIT_PORT)) {
+ pr_debug("UDP header: both ports compression to 4 bits\n");
+ /* compression value */
+ tmp = LOWPAN_NHC_UDP_CS_P_11;
+ lowpan_push_hc_data(hc_ptr, &tmp, sizeof(tmp));
+ /* source and destination port */
+ tmp = ntohs(uh->dest) - LOWPAN_NHC_UDP_4BIT_PORT +
+ ((ntohs(uh->source) - LOWPAN_NHC_UDP_4BIT_PORT) << 4);
+ lowpan_push_hc_data(hc_ptr, &tmp, sizeof(tmp));
+ } else if ((ntohs(uh->dest) & LOWPAN_NHC_UDP_8BIT_MASK) ==
+ LOWPAN_NHC_UDP_8BIT_PORT) {
+ pr_debug("UDP header: remove 8 bits of dest\n");
+ /* compression value */
+ tmp = LOWPAN_NHC_UDP_CS_P_01;
+ lowpan_push_hc_data(hc_ptr, &tmp, sizeof(tmp));
+ /* source port */
+ lowpan_push_hc_data(hc_ptr, &uh->source, sizeof(uh->source));
+ /* destination port */
+ tmp = ntohs(uh->dest) - LOWPAN_NHC_UDP_8BIT_PORT;
+ lowpan_push_hc_data(hc_ptr, &tmp, sizeof(tmp));
+ } else if ((ntohs(uh->source) & LOWPAN_NHC_UDP_8BIT_MASK) ==
+ LOWPAN_NHC_UDP_8BIT_PORT) {
+ pr_debug("UDP header: remove 8 bits of source\n");
+ /* compression value */
+ tmp = LOWPAN_NHC_UDP_CS_P_10;
+ lowpan_push_hc_data(hc_ptr, &tmp, sizeof(tmp));
+ /* source port */
+ tmp = ntohs(uh->source) - LOWPAN_NHC_UDP_8BIT_PORT;
+ lowpan_push_hc_data(hc_ptr, &tmp, sizeof(tmp));
+ /* destination port */
+ lowpan_push_hc_data(hc_ptr, &uh->dest, sizeof(uh->dest));
+ } else {
+ pr_debug("UDP header: can't compress\n");
+ /* compression value */
+ tmp = LOWPAN_NHC_UDP_CS_P_00;
+ lowpan_push_hc_data(hc_ptr, &tmp, sizeof(tmp));
+ /* source port */
+ lowpan_push_hc_data(hc_ptr, &uh->source, sizeof(uh->source));
+ /* destination port */
+ lowpan_push_hc_data(hc_ptr, &uh->dest, sizeof(uh->dest));
+ }
+
+ /* checksum is always inline */
+ lowpan_push_hc_data(hc_ptr, &uh->check, sizeof(uh->check));
+
+ return 0;
+}
+
+LOWPAN_NHC(nhc_udp, "RFC6282 UDP", NEXTHDR_UDP, sizeof(struct udphdr),
+ LOWPAN_NHC_UDP_ID, LOWPAN_NHC_UDP_MASK, udp_uncompress, udp_compress);
+
+module_lowpan_nhc(nhc_udp);
+MODULE_DESCRIPTION("6LoWPAN next header RFC6282 UDP compression");
+MODULE_LICENSE("GPL");
diff --git a/net/802/Kconfig b/net/802/Kconfig
new file mode 100644
index 000000000000..aaa83e888240
--- /dev/null
+++ b/net/802/Kconfig
@@ -0,0 +1,11 @@
+# SPDX-License-Identifier: GPL-2.0-only
+config STP
+ tristate
+ select LLC
+
+config GARP
+ tristate
+ select STP
+
+config MRP
+ tristate
diff --git a/net/802/Makefile b/net/802/Makefile
index 977704a54f68..99abc29d537c 100644
--- a/net/802/Makefile
+++ b/net/802/Makefile
@@ -1,13 +1,13 @@
+# SPDX-License-Identifier: GPL-2.0
#
# Makefile for the Linux 802.x protocol layers.
#
-# Check the p8022 selections against net/core/Makefile.
-obj-$(CONFIG_SYSCTL) += sysctl_net_802.o
-obj-$(CONFIG_LLC) += p8022.o psnap.o
-obj-$(CONFIG_TR) += p8022.o psnap.o tr.o sysctl_net_802.o
+obj-$(CONFIG_LLC) += psnap.o
obj-$(CONFIG_NET_FC) += fc.o
obj-$(CONFIG_FDDI) += fddi.o
obj-$(CONFIG_HIPPI) += hippi.o
-obj-$(CONFIG_IPX) += p8022.o psnap.o p8023.o
-obj-$(CONFIG_ATALK) += p8022.o psnap.o
+obj-$(CONFIG_ATALK) += psnap.o
+obj-$(CONFIG_STP) += stp.o
+obj-$(CONFIG_GARP) += garp.o
+obj-$(CONFIG_MRP) += mrp.o
diff --git a/net/802/fc.c b/net/802/fc.c
index 2a27e37bc4cb..afd3d288a41d 100644
--- a/net/802/fc.c
+++ b/net/802/fc.c
@@ -1,20 +1,14 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* NET3: Fibre Channel device handling subroutines
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*
* Vineet Abraham <vma@iol.unh.edu>
* v 1.0 03/22/99
*/
-#include <asm/uaccess.h>
-#include <asm/system.h>
+#include <linux/uaccess.h>
#include <linux/types.h>
#include <linux/kernel.h>
-#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
@@ -28,21 +22,22 @@
#include <linux/net.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
+#include <linux/export.h>
#include <net/arp.h>
/*
- * Put the headers on a Fibre Channel packet.
+ * Put the headers on a Fibre Channel packet.
*/
-
+
static int fc_header(struct sk_buff *skb, struct net_device *dev,
unsigned short type,
- void *daddr, void *saddr, unsigned len)
+ const void *daddr, const void *saddr, unsigned int len)
{
struct fch_hdr *fch;
int hdr_len;
- /*
- * Add the 802.2 SNAP header if IP as the IPv4 code calls
+ /*
+ * Add the 802.2 SNAP header if IP as the IPv4 code calls
* dev->hard_header directly.
*/
if (type == ETH_P_IP || type == ETH_P_ARP)
@@ -50,7 +45,7 @@ static int fc_header(struct sk_buff *skb, struct net_device *dev,
struct fcllc *fcllc;
hdr_len = sizeof(struct fch_hdr) + sizeof(struct fcllc);
- fch = (struct fch_hdr *)skb_push(skb, hdr_len);
+ fch = skb_push(skb, hdr_len);
fcllc = (struct fcllc *)(fch+1);
fcllc->dsap = fcllc->ssap = EXTENDED_SAP;
fcllc->llc = UI_CMD;
@@ -60,7 +55,7 @@ static int fc_header(struct sk_buff *skb, struct net_device *dev,
else
{
hdr_len = sizeof(struct fch_hdr);
- fch = (struct fch_hdr *)skb_push(skb, hdr_len);
+ fch = skb_push(skb, hdr_len);
}
if(saddr)
@@ -68,39 +63,21 @@ static int fc_header(struct sk_buff *skb, struct net_device *dev,
else
memcpy(fch->saddr,dev->dev_addr,dev->addr_len);
- if(daddr)
+ if(daddr)
{
memcpy(fch->daddr,daddr,dev->addr_len);
- return(hdr_len);
+ return hdr_len;
}
return -hdr_len;
}
-
-/*
- * A neighbour discovery of some species (eg arp) has completed. We
- * can now send the packet.
- */
-
-static int fc_rebuild_header(struct sk_buff *skb)
-{
- struct fch_hdr *fch=(struct fch_hdr *)skb->data;
- struct fcllc *fcllc=(struct fcllc *)(skb->data+sizeof(struct fch_hdr));
- if(fcllc->ethertype != htons(ETH_P_IP)) {
- printk("fc_rebuild_header: Don't know how to resolve type %04X addresses ?\n", ntohs(fcllc->ethertype));
- return 0;
- }
-#ifdef CONFIG_INET
- return arp_find(fch->daddr, skb);
-#else
- return 0;
-#endif
-}
+
+static const struct header_ops fc_header_ops = {
+ .create = fc_header,
+};
static void fc_setup(struct net_device *dev)
{
- dev->hard_header = fc_header;
- dev->rebuild_header = fc_rebuild_header;
-
+ dev->header_ops = &fc_header_ops;
dev->type = ARPHRD_IEEE802;
dev->hard_header_len = FC_HLEN;
dev->mtu = 2024;
@@ -124,6 +101,6 @@ static void fc_setup(struct net_device *dev)
*/
struct net_device *alloc_fcdev(int sizeof_priv)
{
- return alloc_netdev(sizeof_priv, "fc%d", fc_setup);
+ return alloc_netdev(sizeof_priv, "fc%d", NET_NAME_UNKNOWN, fc_setup);
}
EXPORT_SYMBOL(alloc_fcdev);
diff --git a/net/802/fddi.c b/net/802/fddi.c
index 797c6d961deb..888379ae35ec 100644
--- a/net/802/fddi.c
+++ b/net/802/fddi.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -15,22 +16,15 @@
* Mark Evans, <evansmp@uhura.aston.ac.uk>
* Florian La Roche, <rzsfl@rz.uni-sb.de>
* Alan Cox, <gw4pts@gw4pts.ampr.org>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*
* Changes
* Alan Cox : New arp/rebuild header
* Maciej W. Rozycki : IPv6 support
*/
-
+
#include <linux/module.h>
-#include <asm/system.h>
#include <linux/types.h>
#include <linux/kernel.h>
-#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
@@ -53,14 +47,14 @@
static int fddi_header(struct sk_buff *skb, struct net_device *dev,
unsigned short type,
- void *daddr, void *saddr, unsigned len)
+ const void *daddr, const void *saddr, unsigned int len)
{
int hl = FDDI_K_SNAP_HLEN;
struct fddihdr *fddi;
-
+
if(type != ETH_P_IP && type != ETH_P_IPV6 && type != ETH_P_ARP)
hl=FDDI_K_8022_HLEN-3;
- fddi = (struct fddihdr *)skb_push(skb, hl);
+ fddi = skb_push(skb, hl);
fddi->fc = FDDI_FC_K_ASYNC_LLC_DEF;
if(type == ETH_P_IP || type == ETH_P_IPV6 || type == ETH_P_ARP)
{
@@ -74,7 +68,7 @@ static int fddi_header(struct sk_buff *skb, struct net_device *dev,
}
/* Set the source and destination hardware addresses */
-
+
if (saddr != NULL)
memcpy(fddi->saddr, saddr, dev->addr_len);
else
@@ -83,69 +77,45 @@ static int fddi_header(struct sk_buff *skb, struct net_device *dev,
if (daddr != NULL)
{
memcpy(fddi->daddr, daddr, dev->addr_len);
- return(hl);
+ return hl;
}
- return(-hl);
+ return -hl;
}
-
-/*
- * Rebuild the FDDI MAC header. This is called after an ARP
- * (or in future other address resolution) has completed on
- * this sk_buff. We now let ARP fill in the other fields.
- */
-
-static int fddi_rebuild_header(struct sk_buff *skb)
-{
- struct fddihdr *fddi = (struct fddihdr *)skb->data;
-
-#ifdef CONFIG_INET
- if (fddi->hdr.llc_snap.ethertype == __constant_htons(ETH_P_IP))
- /* Try to get ARP to resolve the header and fill destination address */
- return arp_find(fddi->daddr, skb);
- else
-#endif
- {
- printk("%s: Don't know how to resolve type %04X addresses.\n",
- skb->dev->name, ntohs(fddi->hdr.llc_snap.ethertype));
- return(0);
- }
-}
-
-
/*
* Determine the packet's protocol ID and fill in skb fields.
* This routine is called before an incoming packet is passed
* up. It's used to fill in specific skb fields and to set
* the proper pointer to the start of packet data (skb->data).
*/
-
+
__be16 fddi_type_trans(struct sk_buff *skb, struct net_device *dev)
{
struct fddihdr *fddi = (struct fddihdr *)skb->data;
__be16 type;
-
+
/*
* Set mac.raw field to point to FC byte, set data field to point
* to start of packet data. Assume 802.2 SNAP frames for now.
*/
- skb->mac.raw = skb->data; /* point to frame control (FC) */
-
+ skb->dev = dev;
+ skb_reset_mac_header(skb); /* point to frame control (FC) */
+
if(fddi->hdr.llc_8022_1.dsap==0xe0)
{
skb_pull(skb, FDDI_K_8022_HLEN-3);
- type = __constant_htons(ETH_P_802_2);
+ type = htons(ETH_P_802_2);
}
else
{
skb_pull(skb, FDDI_K_SNAP_HLEN); /* adjust for 21 byte header */
type=fddi->hdr.llc_snap.ethertype;
}
-
+
/* Set packet type based on destination address and flag settings */
-
+
if (*fddi->daddr & 0x01)
{
if (memcmp(fddi->daddr, dev->broadcast, FDDI_K_ALEN) == 0)
@@ -153,7 +123,7 @@ __be16 fddi_type_trans(struct sk_buff *skb, struct net_device *dev)
else
skb->pkt_type = PACKET_MULTICAST;
}
-
+
else if (dev->flags & IFF_PROMISC)
{
if (memcmp(fddi->daddr, dev->dev_addr, FDDI_K_ALEN))
@@ -162,32 +132,28 @@ __be16 fddi_type_trans(struct sk_buff *skb, struct net_device *dev)
/* Assume 802.2 SNAP frames, for now */
- return(type);
+ return type;
}
EXPORT_SYMBOL(fddi_type_trans);
-static int fddi_change_mtu(struct net_device *dev, int new_mtu)
-{
- if ((new_mtu < FDDI_K_SNAP_HLEN) || (new_mtu > FDDI_K_SNAP_DLEN))
- return(-EINVAL);
- dev->mtu = new_mtu;
- return(0);
-}
+static const struct header_ops fddi_header_ops = {
+ .create = fddi_header,
+};
+
static void fddi_setup(struct net_device *dev)
{
- dev->change_mtu = fddi_change_mtu;
- dev->hard_header = fddi_header;
- dev->rebuild_header = fddi_rebuild_header;
-
+ dev->header_ops = &fddi_header_ops;
dev->type = ARPHRD_FDDI;
dev->hard_header_len = FDDI_K_SNAP_HLEN+3; /* Assume 802.2 SNAP hdr len + 3 pad bytes */
dev->mtu = FDDI_K_SNAP_DLEN; /* Assume max payload of 802.2 SNAP frame */
+ dev->min_mtu = FDDI_K_SNAP_HLEN;
+ dev->max_mtu = FDDI_K_SNAP_DLEN;
dev->addr_len = FDDI_K_ALEN;
dev->tx_queue_len = 100; /* Long queues on FDDI */
dev->flags = IFF_BROADCAST | IFF_MULTICAST;
-
+
memset(dev->broadcast, 0xFF, FDDI_K_ALEN);
}
@@ -204,6 +170,10 @@ static void fddi_setup(struct net_device *dev)
*/
struct net_device *alloc_fddidev(int sizeof_priv)
{
- return alloc_netdev(sizeof_priv, "fddi%d", fddi_setup);
+ return alloc_netdev(sizeof_priv, "fddi%d", NET_NAME_UNKNOWN,
+ fddi_setup);
}
EXPORT_SYMBOL(alloc_fddidev);
+
+MODULE_DESCRIPTION("Core routines for FDDI network devices");
+MODULE_LICENSE("GPL");
diff --git a/net/802/garp.c b/net/802/garp.c
new file mode 100644
index 000000000000..2d1ffc4d9462
--- /dev/null
+++ b/net/802/garp.c
@@ -0,0 +1,650 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * IEEE 802.1D Generic Attribute Registration Protocol (GARP)
+ *
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ */
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/llc.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <net/llc.h>
+#include <net/llc_pdu.h>
+#include <net/garp.h>
+#include <linux/unaligned.h>
+
+static unsigned int garp_join_time __read_mostly = 200;
+module_param(garp_join_time, uint, 0644);
+MODULE_PARM_DESC(garp_join_time, "Join time in ms (default 200ms)");
+MODULE_DESCRIPTION("IEEE 802.1D Generic Attribute Registration Protocol (GARP)");
+MODULE_LICENSE("GPL");
+
+static const struct garp_state_trans {
+ u8 state;
+ u8 action;
+} garp_applicant_state_table[GARP_APPLICANT_MAX + 1][GARP_EVENT_MAX + 1] = {
+ [GARP_APPLICANT_VA] = {
+ [GARP_EVENT_TRANSMIT_PDU] = { .state = GARP_APPLICANT_AA,
+ .action = GARP_ACTION_S_JOIN_IN },
+ [GARP_EVENT_R_JOIN_IN] = { .state = GARP_APPLICANT_AA },
+ [GARP_EVENT_R_JOIN_EMPTY] = { .state = GARP_APPLICANT_VA },
+ [GARP_EVENT_R_EMPTY] = { .state = GARP_APPLICANT_VA },
+ [GARP_EVENT_R_LEAVE_IN] = { .state = GARP_APPLICANT_VA },
+ [GARP_EVENT_R_LEAVE_EMPTY] = { .state = GARP_APPLICANT_VP },
+ [GARP_EVENT_REQ_JOIN] = { .state = GARP_APPLICANT_INVALID },
+ [GARP_EVENT_REQ_LEAVE] = { .state = GARP_APPLICANT_LA },
+ },
+ [GARP_APPLICANT_AA] = {
+ [GARP_EVENT_TRANSMIT_PDU] = { .state = GARP_APPLICANT_QA,
+ .action = GARP_ACTION_S_JOIN_IN },
+ [GARP_EVENT_R_JOIN_IN] = { .state = GARP_APPLICANT_QA },
+ [GARP_EVENT_R_JOIN_EMPTY] = { .state = GARP_APPLICANT_VA },
+ [GARP_EVENT_R_EMPTY] = { .state = GARP_APPLICANT_VA },
+ [GARP_EVENT_R_LEAVE_IN] = { .state = GARP_APPLICANT_VA },
+ [GARP_EVENT_R_LEAVE_EMPTY] = { .state = GARP_APPLICANT_VP },
+ [GARP_EVENT_REQ_JOIN] = { .state = GARP_APPLICANT_INVALID },
+ [GARP_EVENT_REQ_LEAVE] = { .state = GARP_APPLICANT_LA },
+ },
+ [GARP_APPLICANT_QA] = {
+ [GARP_EVENT_TRANSMIT_PDU] = { .state = GARP_APPLICANT_INVALID },
+ [GARP_EVENT_R_JOIN_IN] = { .state = GARP_APPLICANT_QA },
+ [GARP_EVENT_R_JOIN_EMPTY] = { .state = GARP_APPLICANT_VA },
+ [GARP_EVENT_R_EMPTY] = { .state = GARP_APPLICANT_VA },
+ [GARP_EVENT_R_LEAVE_IN] = { .state = GARP_APPLICANT_VP },
+ [GARP_EVENT_R_LEAVE_EMPTY] = { .state = GARP_APPLICANT_VP },
+ [GARP_EVENT_REQ_JOIN] = { .state = GARP_APPLICANT_INVALID },
+ [GARP_EVENT_REQ_LEAVE] = { .state = GARP_APPLICANT_LA },
+ },
+ [GARP_APPLICANT_LA] = {
+ [GARP_EVENT_TRANSMIT_PDU] = { .state = GARP_APPLICANT_VO,
+ .action = GARP_ACTION_S_LEAVE_EMPTY },
+ [GARP_EVENT_R_JOIN_IN] = { .state = GARP_APPLICANT_LA },
+ [GARP_EVENT_R_JOIN_EMPTY] = { .state = GARP_APPLICANT_VO },
+ [GARP_EVENT_R_EMPTY] = { .state = GARP_APPLICANT_LA },
+ [GARP_EVENT_R_LEAVE_IN] = { .state = GARP_APPLICANT_LA },
+ [GARP_EVENT_R_LEAVE_EMPTY] = { .state = GARP_APPLICANT_VO },
+ [GARP_EVENT_REQ_JOIN] = { .state = GARP_APPLICANT_VA },
+ [GARP_EVENT_REQ_LEAVE] = { .state = GARP_APPLICANT_INVALID },
+ },
+ [GARP_APPLICANT_VP] = {
+ [GARP_EVENT_TRANSMIT_PDU] = { .state = GARP_APPLICANT_AA,
+ .action = GARP_ACTION_S_JOIN_IN },
+ [GARP_EVENT_R_JOIN_IN] = { .state = GARP_APPLICANT_AP },
+ [GARP_EVENT_R_JOIN_EMPTY] = { .state = GARP_APPLICANT_VP },
+ [GARP_EVENT_R_EMPTY] = { .state = GARP_APPLICANT_VP },
+ [GARP_EVENT_R_LEAVE_IN] = { .state = GARP_APPLICANT_VP },
+ [GARP_EVENT_R_LEAVE_EMPTY] = { .state = GARP_APPLICANT_VP },
+ [GARP_EVENT_REQ_JOIN] = { .state = GARP_APPLICANT_INVALID },
+ [GARP_EVENT_REQ_LEAVE] = { .state = GARP_APPLICANT_VO },
+ },
+ [GARP_APPLICANT_AP] = {
+ [GARP_EVENT_TRANSMIT_PDU] = { .state = GARP_APPLICANT_QA,
+ .action = GARP_ACTION_S_JOIN_IN },
+ [GARP_EVENT_R_JOIN_IN] = { .state = GARP_APPLICANT_QP },
+ [GARP_EVENT_R_JOIN_EMPTY] = { .state = GARP_APPLICANT_VP },
+ [GARP_EVENT_R_EMPTY] = { .state = GARP_APPLICANT_VP },
+ [GARP_EVENT_R_LEAVE_IN] = { .state = GARP_APPLICANT_VP },
+ [GARP_EVENT_R_LEAVE_EMPTY] = { .state = GARP_APPLICANT_VP },
+ [GARP_EVENT_REQ_JOIN] = { .state = GARP_APPLICANT_INVALID },
+ [GARP_EVENT_REQ_LEAVE] = { .state = GARP_APPLICANT_AO },
+ },
+ [GARP_APPLICANT_QP] = {
+ [GARP_EVENT_TRANSMIT_PDU] = { .state = GARP_APPLICANT_INVALID },
+ [GARP_EVENT_R_JOIN_IN] = { .state = GARP_APPLICANT_QP },
+ [GARP_EVENT_R_JOIN_EMPTY] = { .state = GARP_APPLICANT_VP },
+ [GARP_EVENT_R_EMPTY] = { .state = GARP_APPLICANT_VP },
+ [GARP_EVENT_R_LEAVE_IN] = { .state = GARP_APPLICANT_VP },
+ [GARP_EVENT_R_LEAVE_EMPTY] = { .state = GARP_APPLICANT_VP },
+ [GARP_EVENT_REQ_JOIN] = { .state = GARP_APPLICANT_INVALID },
+ [GARP_EVENT_REQ_LEAVE] = { .state = GARP_APPLICANT_QO },
+ },
+ [GARP_APPLICANT_VO] = {
+ [GARP_EVENT_TRANSMIT_PDU] = { .state = GARP_APPLICANT_INVALID },
+ [GARP_EVENT_R_JOIN_IN] = { .state = GARP_APPLICANT_AO },
+ [GARP_EVENT_R_JOIN_EMPTY] = { .state = GARP_APPLICANT_VO },
+ [GARP_EVENT_R_EMPTY] = { .state = GARP_APPLICANT_VO },
+ [GARP_EVENT_R_LEAVE_IN] = { .state = GARP_APPLICANT_VO },
+ [GARP_EVENT_R_LEAVE_EMPTY] = { .state = GARP_APPLICANT_VO },
+ [GARP_EVENT_REQ_JOIN] = { .state = GARP_APPLICANT_VP },
+ [GARP_EVENT_REQ_LEAVE] = { .state = GARP_APPLICANT_INVALID },
+ },
+ [GARP_APPLICANT_AO] = {
+ [GARP_EVENT_TRANSMIT_PDU] = { .state = GARP_APPLICANT_INVALID },
+ [GARP_EVENT_R_JOIN_IN] = { .state = GARP_APPLICANT_QO },
+ [GARP_EVENT_R_JOIN_EMPTY] = { .state = GARP_APPLICANT_VO },
+ [GARP_EVENT_R_EMPTY] = { .state = GARP_APPLICANT_VO },
+ [GARP_EVENT_R_LEAVE_IN] = { .state = GARP_APPLICANT_VO },
+ [GARP_EVENT_R_LEAVE_EMPTY] = { .state = GARP_APPLICANT_VO },
+ [GARP_EVENT_REQ_JOIN] = { .state = GARP_APPLICANT_AP },
+ [GARP_EVENT_REQ_LEAVE] = { .state = GARP_APPLICANT_INVALID },
+ },
+ [GARP_APPLICANT_QO] = {
+ [GARP_EVENT_TRANSMIT_PDU] = { .state = GARP_APPLICANT_INVALID },
+ [GARP_EVENT_R_JOIN_IN] = { .state = GARP_APPLICANT_QO },
+ [GARP_EVENT_R_JOIN_EMPTY] = { .state = GARP_APPLICANT_VO },
+ [GARP_EVENT_R_EMPTY] = { .state = GARP_APPLICANT_VO },
+ [GARP_EVENT_R_LEAVE_IN] = { .state = GARP_APPLICANT_VO },
+ [GARP_EVENT_R_LEAVE_EMPTY] = { .state = GARP_APPLICANT_VO },
+ [GARP_EVENT_REQ_JOIN] = { .state = GARP_APPLICANT_QP },
+ [GARP_EVENT_REQ_LEAVE] = { .state = GARP_APPLICANT_INVALID },
+ },
+};
+
+static int garp_attr_cmp(const struct garp_attr *attr,
+ const void *data, u8 len, u8 type)
+{
+ if (attr->type != type)
+ return attr->type - type;
+ if (attr->dlen != len)
+ return attr->dlen - len;
+ return memcmp(attr->data, data, len);
+}
+
+static struct garp_attr *garp_attr_lookup(const struct garp_applicant *app,
+ const void *data, u8 len, u8 type)
+{
+ struct rb_node *parent = app->gid.rb_node;
+ struct garp_attr *attr;
+ int d;
+
+ while (parent) {
+ attr = rb_entry(parent, struct garp_attr, node);
+ d = garp_attr_cmp(attr, data, len, type);
+ if (d > 0)
+ parent = parent->rb_left;
+ else if (d < 0)
+ parent = parent->rb_right;
+ else
+ return attr;
+ }
+ return NULL;
+}
+
+static struct garp_attr *garp_attr_create(struct garp_applicant *app,
+ const void *data, u8 len, u8 type)
+{
+ struct rb_node *parent = NULL, **p = &app->gid.rb_node;
+ struct garp_attr *attr;
+ int d;
+
+ while (*p) {
+ parent = *p;
+ attr = rb_entry(parent, struct garp_attr, node);
+ d = garp_attr_cmp(attr, data, len, type);
+ if (d > 0)
+ p = &parent->rb_left;
+ else if (d < 0)
+ p = &parent->rb_right;
+ else {
+ /* The attribute already exists; re-use it. */
+ return attr;
+ }
+ }
+ attr = kmalloc(sizeof(*attr) + len, GFP_ATOMIC);
+ if (!attr)
+ return attr;
+ attr->state = GARP_APPLICANT_VO;
+ attr->type = type;
+ attr->dlen = len;
+ memcpy(attr->data, data, len);
+
+ rb_link_node(&attr->node, parent, p);
+ rb_insert_color(&attr->node, &app->gid);
+ return attr;
+}
+
+static void garp_attr_destroy(struct garp_applicant *app, struct garp_attr *attr)
+{
+ rb_erase(&attr->node, &app->gid);
+ kfree(attr);
+}
+
+static void garp_attr_destroy_all(struct garp_applicant *app)
+{
+ struct rb_node *node, *next;
+ struct garp_attr *attr;
+
+ for (node = rb_first(&app->gid);
+ next = node ? rb_next(node) : NULL, node != NULL;
+ node = next) {
+ attr = rb_entry(node, struct garp_attr, node);
+ garp_attr_destroy(app, attr);
+ }
+}
+
+static int garp_pdu_init(struct garp_applicant *app)
+{
+ struct sk_buff *skb;
+ struct garp_pdu_hdr *gp;
+
+#define LLC_RESERVE sizeof(struct llc_pdu_un)
+ skb = alloc_skb(app->dev->mtu + LL_RESERVED_SPACE(app->dev),
+ GFP_ATOMIC);
+ if (!skb)
+ return -ENOMEM;
+
+ skb->dev = app->dev;
+ skb->protocol = htons(ETH_P_802_2);
+ skb_reserve(skb, LL_RESERVED_SPACE(app->dev) + LLC_RESERVE);
+
+ gp = __skb_put(skb, sizeof(*gp));
+ put_unaligned(htons(GARP_PROTOCOL_ID), &gp->protocol);
+
+ app->pdu = skb;
+ return 0;
+}
+
+static int garp_pdu_append_end_mark(struct garp_applicant *app)
+{
+ if (skb_tailroom(app->pdu) < sizeof(u8))
+ return -1;
+ __skb_put_u8(app->pdu, GARP_END_MARK);
+ return 0;
+}
+
+static void garp_pdu_queue(struct garp_applicant *app)
+{
+ if (!app->pdu)
+ return;
+
+ garp_pdu_append_end_mark(app);
+ garp_pdu_append_end_mark(app);
+
+ llc_pdu_header_init(app->pdu, LLC_PDU_TYPE_U, LLC_SAP_BSPAN,
+ LLC_SAP_BSPAN, LLC_PDU_CMD);
+ llc_pdu_init_as_ui_cmd(app->pdu);
+ llc_mac_hdr_init(app->pdu, app->dev->dev_addr,
+ app->app->proto.group_address);
+
+ skb_queue_tail(&app->queue, app->pdu);
+ app->pdu = NULL;
+}
+
+static void garp_queue_xmit(struct garp_applicant *app)
+{
+ struct sk_buff *skb;
+
+ while ((skb = skb_dequeue(&app->queue)))
+ dev_queue_xmit(skb);
+}
+
+static int garp_pdu_append_msg(struct garp_applicant *app, u8 attrtype)
+{
+ struct garp_msg_hdr *gm;
+
+ if (skb_tailroom(app->pdu) < sizeof(*gm))
+ return -1;
+ gm = __skb_put(app->pdu, sizeof(*gm));
+ gm->attrtype = attrtype;
+ garp_cb(app->pdu)->cur_type = attrtype;
+ return 0;
+}
+
+static int garp_pdu_append_attr(struct garp_applicant *app,
+ const struct garp_attr *attr,
+ enum garp_attr_event event)
+{
+ struct garp_attr_hdr *ga;
+ unsigned int len;
+ int err;
+again:
+ if (!app->pdu) {
+ err = garp_pdu_init(app);
+ if (err < 0)
+ return err;
+ }
+
+ if (garp_cb(app->pdu)->cur_type != attr->type) {
+ if (garp_cb(app->pdu)->cur_type &&
+ garp_pdu_append_end_mark(app) < 0)
+ goto queue;
+ if (garp_pdu_append_msg(app, attr->type) < 0)
+ goto queue;
+ }
+
+ len = sizeof(*ga) + attr->dlen;
+ if (skb_tailroom(app->pdu) < len)
+ goto queue;
+ ga = __skb_put(app->pdu, len);
+ ga->len = len;
+ ga->event = event;
+ memcpy(ga->data, attr->data, attr->dlen);
+ return 0;
+
+queue:
+ garp_pdu_queue(app);
+ goto again;
+}
+
+static void garp_attr_event(struct garp_applicant *app,
+ struct garp_attr *attr, enum garp_event event)
+{
+ enum garp_applicant_state state;
+
+ state = garp_applicant_state_table[attr->state][event].state;
+ if (state == GARP_APPLICANT_INVALID)
+ return;
+
+ switch (garp_applicant_state_table[attr->state][event].action) {
+ case GARP_ACTION_NONE:
+ break;
+ case GARP_ACTION_S_JOIN_IN:
+ /* When appending the attribute fails, don't update state in
+ * order to retry on next TRANSMIT_PDU event. */
+ if (garp_pdu_append_attr(app, attr, GARP_JOIN_IN) < 0)
+ return;
+ break;
+ case GARP_ACTION_S_LEAVE_EMPTY:
+ garp_pdu_append_attr(app, attr, GARP_LEAVE_EMPTY);
+ /* As a pure applicant, sending a leave message implies that
+ * the attribute was unregistered and can be destroyed. */
+ garp_attr_destroy(app, attr);
+ return;
+ default:
+ WARN_ON(1);
+ }
+
+ attr->state = state;
+}
+
+int garp_request_join(const struct net_device *dev,
+ const struct garp_application *appl,
+ const void *data, u8 len, u8 type)
+{
+ struct garp_port *port = rtnl_dereference(dev->garp_port);
+ struct garp_applicant *app = rtnl_dereference(port->applicants[appl->type]);
+ struct garp_attr *attr;
+
+ spin_lock_bh(&app->lock);
+ attr = garp_attr_create(app, data, len, type);
+ if (!attr) {
+ spin_unlock_bh(&app->lock);
+ return -ENOMEM;
+ }
+ garp_attr_event(app, attr, GARP_EVENT_REQ_JOIN);
+ spin_unlock_bh(&app->lock);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(garp_request_join);
+
+void garp_request_leave(const struct net_device *dev,
+ const struct garp_application *appl,
+ const void *data, u8 len, u8 type)
+{
+ struct garp_port *port = rtnl_dereference(dev->garp_port);
+ struct garp_applicant *app = rtnl_dereference(port->applicants[appl->type]);
+ struct garp_attr *attr;
+
+ spin_lock_bh(&app->lock);
+ attr = garp_attr_lookup(app, data, len, type);
+ if (!attr) {
+ spin_unlock_bh(&app->lock);
+ return;
+ }
+ garp_attr_event(app, attr, GARP_EVENT_REQ_LEAVE);
+ spin_unlock_bh(&app->lock);
+}
+EXPORT_SYMBOL_GPL(garp_request_leave);
+
+static void garp_gid_event(struct garp_applicant *app, enum garp_event event)
+{
+ struct rb_node *node, *next;
+ struct garp_attr *attr;
+
+ for (node = rb_first(&app->gid);
+ next = node ? rb_next(node) : NULL, node != NULL;
+ node = next) {
+ attr = rb_entry(node, struct garp_attr, node);
+ garp_attr_event(app, attr, event);
+ }
+}
+
+static void garp_join_timer_arm(struct garp_applicant *app)
+{
+ unsigned long delay;
+
+ delay = get_random_u32_below(msecs_to_jiffies(garp_join_time));
+ mod_timer(&app->join_timer, jiffies + delay);
+}
+
+static void garp_join_timer(struct timer_list *t)
+{
+ struct garp_applicant *app = timer_container_of(app, t, join_timer);
+
+ spin_lock(&app->lock);
+ garp_gid_event(app, GARP_EVENT_TRANSMIT_PDU);
+ garp_pdu_queue(app);
+ spin_unlock(&app->lock);
+
+ garp_queue_xmit(app);
+ garp_join_timer_arm(app);
+}
+
+static int garp_pdu_parse_end_mark(struct sk_buff *skb)
+{
+ if (!pskb_may_pull(skb, sizeof(u8)))
+ return -1;
+ if (*skb->data == GARP_END_MARK) {
+ skb_pull(skb, sizeof(u8));
+ return -1;
+ }
+ return 0;
+}
+
+static int garp_pdu_parse_attr(struct garp_applicant *app, struct sk_buff *skb,
+ u8 attrtype)
+{
+ const struct garp_attr_hdr *ga;
+ struct garp_attr *attr;
+ enum garp_event event;
+ unsigned int dlen;
+
+ if (!pskb_may_pull(skb, sizeof(*ga)))
+ return -1;
+ ga = (struct garp_attr_hdr *)skb->data;
+ if (ga->len < sizeof(*ga))
+ return -1;
+
+ if (!pskb_may_pull(skb, ga->len))
+ return -1;
+ skb_pull(skb, ga->len);
+ dlen = sizeof(*ga) - ga->len;
+
+ if (attrtype > app->app->maxattr)
+ return 0;
+
+ switch (ga->event) {
+ case GARP_LEAVE_ALL:
+ if (dlen != 0)
+ return -1;
+ garp_gid_event(app, GARP_EVENT_R_LEAVE_EMPTY);
+ return 0;
+ case GARP_JOIN_EMPTY:
+ event = GARP_EVENT_R_JOIN_EMPTY;
+ break;
+ case GARP_JOIN_IN:
+ event = GARP_EVENT_R_JOIN_IN;
+ break;
+ case GARP_LEAVE_EMPTY:
+ event = GARP_EVENT_R_LEAVE_EMPTY;
+ break;
+ case GARP_EMPTY:
+ event = GARP_EVENT_R_EMPTY;
+ break;
+ default:
+ return 0;
+ }
+
+ if (dlen == 0)
+ return -1;
+ attr = garp_attr_lookup(app, ga->data, dlen, attrtype);
+ if (attr == NULL)
+ return 0;
+ garp_attr_event(app, attr, event);
+ return 0;
+}
+
+static int garp_pdu_parse_msg(struct garp_applicant *app, struct sk_buff *skb)
+{
+ const struct garp_msg_hdr *gm;
+
+ if (!pskb_may_pull(skb, sizeof(*gm)))
+ return -1;
+ gm = (struct garp_msg_hdr *)skb->data;
+ if (gm->attrtype == 0)
+ return -1;
+ skb_pull(skb, sizeof(*gm));
+
+ while (skb->len > 0) {
+ if (garp_pdu_parse_attr(app, skb, gm->attrtype) < 0)
+ return -1;
+ if (garp_pdu_parse_end_mark(skb) < 0)
+ break;
+ }
+ return 0;
+}
+
+static void garp_pdu_rcv(const struct stp_proto *proto, struct sk_buff *skb,
+ struct net_device *dev)
+{
+ struct garp_application *appl = proto->data;
+ struct garp_port *port;
+ struct garp_applicant *app;
+ const struct garp_pdu_hdr *gp;
+
+ port = rcu_dereference(dev->garp_port);
+ if (!port)
+ goto err;
+ app = rcu_dereference(port->applicants[appl->type]);
+ if (!app)
+ goto err;
+
+ if (!pskb_may_pull(skb, sizeof(*gp)))
+ goto err;
+ gp = (struct garp_pdu_hdr *)skb->data;
+ if (get_unaligned(&gp->protocol) != htons(GARP_PROTOCOL_ID))
+ goto err;
+ skb_pull(skb, sizeof(*gp));
+
+ spin_lock(&app->lock);
+ while (skb->len > 0) {
+ if (garp_pdu_parse_msg(app, skb) < 0)
+ break;
+ if (garp_pdu_parse_end_mark(skb) < 0)
+ break;
+ }
+ spin_unlock(&app->lock);
+err:
+ kfree_skb(skb);
+}
+
+static int garp_init_port(struct net_device *dev)
+{
+ struct garp_port *port;
+
+ port = kzalloc(sizeof(*port), GFP_KERNEL);
+ if (!port)
+ return -ENOMEM;
+ rcu_assign_pointer(dev->garp_port, port);
+ return 0;
+}
+
+static void garp_release_port(struct net_device *dev)
+{
+ struct garp_port *port = rtnl_dereference(dev->garp_port);
+ unsigned int i;
+
+ for (i = 0; i <= GARP_APPLICATION_MAX; i++) {
+ if (rtnl_dereference(port->applicants[i]))
+ return;
+ }
+ RCU_INIT_POINTER(dev->garp_port, NULL);
+ kfree_rcu(port, rcu);
+}
+
+int garp_init_applicant(struct net_device *dev, struct garp_application *appl)
+{
+ struct garp_applicant *app;
+ int err;
+
+ ASSERT_RTNL();
+
+ if (!rtnl_dereference(dev->garp_port)) {
+ err = garp_init_port(dev);
+ if (err < 0)
+ goto err1;
+ }
+
+ err = -ENOMEM;
+ app = kzalloc(sizeof(*app), GFP_KERNEL);
+ if (!app)
+ goto err2;
+
+ err = dev_mc_add(dev, appl->proto.group_address);
+ if (err < 0)
+ goto err3;
+
+ app->dev = dev;
+ app->app = appl;
+ app->gid = RB_ROOT;
+ spin_lock_init(&app->lock);
+ skb_queue_head_init(&app->queue);
+ rcu_assign_pointer(dev->garp_port->applicants[appl->type], app);
+ timer_setup(&app->join_timer, garp_join_timer, 0);
+ garp_join_timer_arm(app);
+ return 0;
+
+err3:
+ kfree(app);
+err2:
+ garp_release_port(dev);
+err1:
+ return err;
+}
+EXPORT_SYMBOL_GPL(garp_init_applicant);
+
+void garp_uninit_applicant(struct net_device *dev, struct garp_application *appl)
+{
+ struct garp_port *port = rtnl_dereference(dev->garp_port);
+ struct garp_applicant *app = rtnl_dereference(port->applicants[appl->type]);
+
+ ASSERT_RTNL();
+
+ RCU_INIT_POINTER(port->applicants[appl->type], NULL);
+
+ /* Delete timer and generate a final TRANSMIT_PDU event to flush out
+ * all pending messages before the applicant is gone. */
+ timer_shutdown_sync(&app->join_timer);
+
+ spin_lock_bh(&app->lock);
+ garp_gid_event(app, GARP_EVENT_TRANSMIT_PDU);
+ garp_attr_destroy_all(app);
+ garp_pdu_queue(app);
+ spin_unlock_bh(&app->lock);
+
+ garp_queue_xmit(app);
+
+ dev_mc_del(dev, appl->proto.group_address);
+ kfree_rcu(app, rcu);
+ garp_release_port(dev);
+}
+EXPORT_SYMBOL_GPL(garp_uninit_applicant);
+
+int garp_register_application(struct garp_application *appl)
+{
+ appl->proto.rcv = garp_pdu_rcv;
+ appl->proto.data = appl;
+ return stp_proto_register(&appl->proto);
+}
+EXPORT_SYMBOL_GPL(garp_register_application);
+
+void garp_unregister_application(struct garp_application *appl)
+{
+ stp_proto_unregister(&appl->proto);
+}
+EXPORT_SYMBOL_GPL(garp_unregister_application);
diff --git a/net/802/hippi.c b/net/802/hippi.c
index 579e2ddf5ebe..1997b7dd265e 100644
--- a/net/802/hippi.c
+++ b/net/802/hippi.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -13,17 +14,11 @@
* Florian La Roche, <rzsfl@rz.uni-sb.de>
* Alan Cox, <gw4pts@gw4pts.ampr.org>
* Jes Sorensen, <Jes.Sorensen@cern.ch>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
-#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
@@ -35,21 +30,20 @@
#include <linux/errno.h>
#include <net/arp.h>
#include <net/sock.h>
-#include <asm/uaccess.h>
-#include <asm/system.h>
+#include <linux/uaccess.h>
/*
- * Create the HIPPI MAC header for an arbitrary protocol layer
+ * Create the HIPPI MAC header for an arbitrary protocol layer
*
* saddr=NULL means use device source address
* daddr=NULL means leave destination address (eg unresolved arp)
*/
static int hippi_header(struct sk_buff *skb, struct net_device *dev,
- unsigned short type, void *daddr, void *saddr,
- unsigned len)
+ unsigned short type,
+ const void *daddr, const void *saddr, unsigned int len)
{
- struct hippi_hdr *hip = (struct hippi_hdr *)skb_push(skb, HIPPI_HLEN);
+ struct hippi_hdr *hip = skb_push(skb, HIPPI_HLEN);
struct hippi_cb *hcb = (struct hippi_cb *) skb->cb;
if (!len){
@@ -61,7 +55,7 @@ static int hippi_header(struct sk_buff *skb, struct net_device *dev,
* Due to the stupidity of the little endian byte-order we
* have to set the fp field this way.
*/
- hip->fp.fixed = __constant_htonl(0x04800018);
+ hip->fp.fixed = htonl(0x04800018);
hip->fp.d2_size = htonl(len + 8);
hip->le.fc = 0;
hip->le.double_wide = 0; /* only HIPPI 800 for the time being */
@@ -71,7 +65,7 @@ static int hippi_header(struct sk_buff *skb, struct net_device *dev,
hip->le.src_addr_type = 2; /* 12 bit SC address */
memcpy(hip->le.src_switch_addr, dev->dev_addr + 3, 3);
- memset(&hip->le.reserved, 0, 16);
+ memset_startat(&hip->le, 0, reserved);
hip->snap.dsap = HIPPI_EXTENDED_SAP;
hip->snap.ssap = HIPPI_EXTENDED_SAP;
@@ -93,48 +87,21 @@ static int hippi_header(struct sk_buff *skb, struct net_device *dev,
/*
- * Rebuild the HIPPI MAC header. This is called after an ARP has
- * completed on this sk_buff. We now let ARP fill in the other fields.
- */
-
-static int hippi_rebuild_header(struct sk_buff *skb)
-{
- struct hippi_hdr *hip = (struct hippi_hdr *)skb->data;
-
- /*
- * Only IP is currently supported
- */
-
- if(hip->snap.ethertype != __constant_htons(ETH_P_IP))
- {
- printk(KERN_DEBUG "%s: unable to resolve type %X addresses.\n",skb->dev->name,ntohs(hip->snap.ethertype));
- return 0;
- }
-
- /*
- * We don't support dynamic ARP on HIPPI, but we use the ARP
- * static ARP tables to hold the I-FIELDs.
- */
- return arp_find(hip->le.daddr, skb);
-}
-
-
-/*
* Determine the packet's protocol ID.
*/
-
+
__be16 hippi_type_trans(struct sk_buff *skb, struct net_device *dev)
{
struct hippi_hdr *hip;
-
- hip = (struct hippi_hdr *) skb->data;
/*
* This is actually wrong ... question is if we really should
* set the raw address here.
*/
- skb->mac.raw = skb->data;
- skb_pull(skb, HIPPI_HLEN);
+ skb->dev = dev;
+ skb_reset_mac_header(skb);
+ hip = (struct hippi_hdr *)skb_mac_header(skb);
+ skb_pull(skb, HIPPI_HLEN);
/*
* No fancy promisc stuff here now.
@@ -145,55 +112,43 @@ __be16 hippi_type_trans(struct sk_buff *skb, struct net_device *dev)
EXPORT_SYMBOL(hippi_type_trans);
-static int hippi_change_mtu(struct net_device *dev, int new_mtu)
-{
- /*
- * HIPPI's got these nice large MTUs.
- */
- if ((new_mtu < 68) || (new_mtu > 65280))
- return -EINVAL;
- dev->mtu = new_mtu;
- return(0);
-}
-
/*
* For HIPPI we will actually use the lower 4 bytes of the hardware
* address as the I-FIELD rather than the actual hardware address.
*/
-static int hippi_mac_addr(struct net_device *dev, void *p)
+int hippi_mac_addr(struct net_device *dev, void *p)
{
struct sockaddr *addr = p;
if (netif_running(dev))
return -EBUSY;
- memcpy(dev->dev_addr, addr->sa_data, dev->addr_len);
+ dev_addr_set(dev, addr->sa_data);
return 0;
}
+EXPORT_SYMBOL(hippi_mac_addr);
-static int hippi_neigh_setup_dev(struct net_device *dev, struct neigh_parms *p)
+int hippi_neigh_setup_dev(struct net_device *dev, struct neigh_parms *p)
{
/* Never send broadcast/multicast ARP messages */
- p->mcast_probes = 0;
-
+ NEIGH_VAR_INIT(p, MCAST_PROBES, 0);
+
/* In IPv6 unicast probes are valid even on NBMA,
* because they are encapsulated in normal IPv6 protocol.
- * Should be a generic flag.
+ * Should be a generic flag.
*/
if (p->tbl->family != AF_INET6)
- p->ucast_probes = 0;
+ NEIGH_VAR_INIT(p, UCAST_PROBES, 0);
return 0;
}
+EXPORT_SYMBOL(hippi_neigh_setup_dev);
+
+static const struct header_ops hippi_header_ops = {
+ .create = hippi_header,
+};
+
static void hippi_setup(struct net_device *dev)
{
- dev->set_multicast_list = NULL;
- dev->change_mtu = hippi_change_mtu;
- dev->hard_header = hippi_header;
- dev->rebuild_header = hippi_rebuild_header;
- dev->set_mac_address = hippi_mac_addr;
- dev->hard_header_parse = NULL;
- dev->hard_header_cache = NULL;
- dev->header_cache_update = NULL;
- dev->neigh_setup = hippi_neigh_setup_dev;
+ dev->header_ops = &hippi_header_ops;
/*
* We don't support HIPPI `ARP' for the time being, and probably
@@ -203,6 +158,8 @@ static void hippi_setup(struct net_device *dev)
dev->type = ARPHRD_HIPPI;
dev->hard_header_len = HIPPI_HLEN;
dev->mtu = 65280;
+ dev->min_mtu = 68;
+ dev->max_mtu = 65280;
dev->addr_len = HIPPI_ALEN;
dev->tx_queue_len = 25 /* 5 */;
memset(dev->broadcast, 0xFF, HIPPI_ALEN);
@@ -210,9 +167,9 @@ static void hippi_setup(struct net_device *dev)
/*
* HIPPI doesn't support broadcast+multicast and we only use
- * static ARP tables. ARP is disabled by hippi_neigh_setup_dev.
+ * static ARP tables. ARP is disabled by hippi_neigh_setup_dev.
*/
- dev->flags = 0;
+ dev->flags = 0;
}
/**
@@ -229,7 +186,8 @@ static void hippi_setup(struct net_device *dev)
struct net_device *alloc_hippi_dev(int sizeof_priv)
{
- return alloc_netdev(sizeof_priv, "hip%d", hippi_setup);
+ return alloc_netdev(sizeof_priv, "hip%d", NET_NAME_UNKNOWN,
+ hippi_setup);
}
EXPORT_SYMBOL(alloc_hippi_dev);
diff --git a/net/802/mrp.c b/net/802/mrp.c
new file mode 100644
index 000000000000..23a88305f900
--- /dev/null
+++ b/net/802/mrp.c
@@ -0,0 +1,944 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * IEEE 802.1Q Multiple Registration Protocol (MRP)
+ *
+ * Copyright (c) 2012 Massachusetts Institute of Technology
+ *
+ * Adapted from code in net/802/garp.c
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ */
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <net/mrp.h>
+#include <linux/unaligned.h>
+
+static unsigned int mrp_join_time __read_mostly = 200;
+module_param(mrp_join_time, uint, 0644);
+MODULE_PARM_DESC(mrp_join_time, "Join time in ms (default 200ms)");
+
+static unsigned int mrp_periodic_time __read_mostly = 1000;
+module_param(mrp_periodic_time, uint, 0644);
+MODULE_PARM_DESC(mrp_periodic_time, "Periodic time in ms (default 1s)");
+
+MODULE_DESCRIPTION("IEEE 802.1Q Multiple Registration Protocol (MRP)");
+MODULE_LICENSE("GPL");
+
+static const u8
+mrp_applicant_state_table[MRP_APPLICANT_MAX + 1][MRP_EVENT_MAX + 1] = {
+ [MRP_APPLICANT_VO] = {
+ [MRP_EVENT_NEW] = MRP_APPLICANT_VN,
+ [MRP_EVENT_JOIN] = MRP_APPLICANT_VP,
+ [MRP_EVENT_LV] = MRP_APPLICANT_VO,
+ [MRP_EVENT_TX] = MRP_APPLICANT_VO,
+ [MRP_EVENT_R_NEW] = MRP_APPLICANT_VO,
+ [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_AO,
+ [MRP_EVENT_R_IN] = MRP_APPLICANT_VO,
+ [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_VO,
+ [MRP_EVENT_R_MT] = MRP_APPLICANT_VO,
+ [MRP_EVENT_R_LV] = MRP_APPLICANT_VO,
+ [MRP_EVENT_R_LA] = MRP_APPLICANT_VO,
+ [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VO,
+ [MRP_EVENT_PERIODIC] = MRP_APPLICANT_VO,
+ },
+ [MRP_APPLICANT_VP] = {
+ [MRP_EVENT_NEW] = MRP_APPLICANT_VN,
+ [MRP_EVENT_JOIN] = MRP_APPLICANT_VP,
+ [MRP_EVENT_LV] = MRP_APPLICANT_VO,
+ [MRP_EVENT_TX] = MRP_APPLICANT_AA,
+ [MRP_EVENT_R_NEW] = MRP_APPLICANT_VP,
+ [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_AP,
+ [MRP_EVENT_R_IN] = MRP_APPLICANT_VP,
+ [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_VP,
+ [MRP_EVENT_R_MT] = MRP_APPLICANT_VP,
+ [MRP_EVENT_R_LV] = MRP_APPLICANT_VP,
+ [MRP_EVENT_R_LA] = MRP_APPLICANT_VP,
+ [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VP,
+ [MRP_EVENT_PERIODIC] = MRP_APPLICANT_VP,
+ },
+ [MRP_APPLICANT_VN] = {
+ [MRP_EVENT_NEW] = MRP_APPLICANT_VN,
+ [MRP_EVENT_JOIN] = MRP_APPLICANT_VN,
+ [MRP_EVENT_LV] = MRP_APPLICANT_LA,
+ [MRP_EVENT_TX] = MRP_APPLICANT_AN,
+ [MRP_EVENT_R_NEW] = MRP_APPLICANT_VN,
+ [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_VN,
+ [MRP_EVENT_R_IN] = MRP_APPLICANT_VN,
+ [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_VN,
+ [MRP_EVENT_R_MT] = MRP_APPLICANT_VN,
+ [MRP_EVENT_R_LV] = MRP_APPLICANT_VN,
+ [MRP_EVENT_R_LA] = MRP_APPLICANT_VN,
+ [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VN,
+ [MRP_EVENT_PERIODIC] = MRP_APPLICANT_VN,
+ },
+ [MRP_APPLICANT_AN] = {
+ [MRP_EVENT_NEW] = MRP_APPLICANT_AN,
+ [MRP_EVENT_JOIN] = MRP_APPLICANT_AN,
+ [MRP_EVENT_LV] = MRP_APPLICANT_LA,
+ [MRP_EVENT_TX] = MRP_APPLICANT_QA,
+ [MRP_EVENT_R_NEW] = MRP_APPLICANT_AN,
+ [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_AN,
+ [MRP_EVENT_R_IN] = MRP_APPLICANT_AN,
+ [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_AN,
+ [MRP_EVENT_R_MT] = MRP_APPLICANT_AN,
+ [MRP_EVENT_R_LV] = MRP_APPLICANT_VN,
+ [MRP_EVENT_R_LA] = MRP_APPLICANT_VN,
+ [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VN,
+ [MRP_EVENT_PERIODIC] = MRP_APPLICANT_AN,
+ },
+ [MRP_APPLICANT_AA] = {
+ [MRP_EVENT_NEW] = MRP_APPLICANT_VN,
+ [MRP_EVENT_JOIN] = MRP_APPLICANT_AA,
+ [MRP_EVENT_LV] = MRP_APPLICANT_LA,
+ [MRP_EVENT_TX] = MRP_APPLICANT_QA,
+ [MRP_EVENT_R_NEW] = MRP_APPLICANT_AA,
+ [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_QA,
+ [MRP_EVENT_R_IN] = MRP_APPLICANT_AA,
+ [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_AA,
+ [MRP_EVENT_R_MT] = MRP_APPLICANT_AA,
+ [MRP_EVENT_R_LV] = MRP_APPLICANT_VP,
+ [MRP_EVENT_R_LA] = MRP_APPLICANT_VP,
+ [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VP,
+ [MRP_EVENT_PERIODIC] = MRP_APPLICANT_AA,
+ },
+ [MRP_APPLICANT_QA] = {
+ [MRP_EVENT_NEW] = MRP_APPLICANT_VN,
+ [MRP_EVENT_JOIN] = MRP_APPLICANT_QA,
+ [MRP_EVENT_LV] = MRP_APPLICANT_LA,
+ [MRP_EVENT_TX] = MRP_APPLICANT_QA,
+ [MRP_EVENT_R_NEW] = MRP_APPLICANT_QA,
+ [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_QA,
+ [MRP_EVENT_R_IN] = MRP_APPLICANT_QA,
+ [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_AA,
+ [MRP_EVENT_R_MT] = MRP_APPLICANT_AA,
+ [MRP_EVENT_R_LV] = MRP_APPLICANT_VP,
+ [MRP_EVENT_R_LA] = MRP_APPLICANT_VP,
+ [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VP,
+ [MRP_EVENT_PERIODIC] = MRP_APPLICANT_AA,
+ },
+ [MRP_APPLICANT_LA] = {
+ [MRP_EVENT_NEW] = MRP_APPLICANT_VN,
+ [MRP_EVENT_JOIN] = MRP_APPLICANT_AA,
+ [MRP_EVENT_LV] = MRP_APPLICANT_LA,
+ [MRP_EVENT_TX] = MRP_APPLICANT_VO,
+ [MRP_EVENT_R_NEW] = MRP_APPLICANT_LA,
+ [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_LA,
+ [MRP_EVENT_R_IN] = MRP_APPLICANT_LA,
+ [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_LA,
+ [MRP_EVENT_R_MT] = MRP_APPLICANT_LA,
+ [MRP_EVENT_R_LV] = MRP_APPLICANT_LA,
+ [MRP_EVENT_R_LA] = MRP_APPLICANT_LA,
+ [MRP_EVENT_REDECLARE] = MRP_APPLICANT_LA,
+ [MRP_EVENT_PERIODIC] = MRP_APPLICANT_LA,
+ },
+ [MRP_APPLICANT_AO] = {
+ [MRP_EVENT_NEW] = MRP_APPLICANT_VN,
+ [MRP_EVENT_JOIN] = MRP_APPLICANT_AP,
+ [MRP_EVENT_LV] = MRP_APPLICANT_AO,
+ [MRP_EVENT_TX] = MRP_APPLICANT_AO,
+ [MRP_EVENT_R_NEW] = MRP_APPLICANT_AO,
+ [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_QO,
+ [MRP_EVENT_R_IN] = MRP_APPLICANT_AO,
+ [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_AO,
+ [MRP_EVENT_R_MT] = MRP_APPLICANT_AO,
+ [MRP_EVENT_R_LV] = MRP_APPLICANT_VO,
+ [MRP_EVENT_R_LA] = MRP_APPLICANT_VO,
+ [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VO,
+ [MRP_EVENT_PERIODIC] = MRP_APPLICANT_AO,
+ },
+ [MRP_APPLICANT_QO] = {
+ [MRP_EVENT_NEW] = MRP_APPLICANT_VN,
+ [MRP_EVENT_JOIN] = MRP_APPLICANT_QP,
+ [MRP_EVENT_LV] = MRP_APPLICANT_QO,
+ [MRP_EVENT_TX] = MRP_APPLICANT_QO,
+ [MRP_EVENT_R_NEW] = MRP_APPLICANT_QO,
+ [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_QO,
+ [MRP_EVENT_R_IN] = MRP_APPLICANT_QO,
+ [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_AO,
+ [MRP_EVENT_R_MT] = MRP_APPLICANT_AO,
+ [MRP_EVENT_R_LV] = MRP_APPLICANT_VO,
+ [MRP_EVENT_R_LA] = MRP_APPLICANT_VO,
+ [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VO,
+ [MRP_EVENT_PERIODIC] = MRP_APPLICANT_QO,
+ },
+ [MRP_APPLICANT_AP] = {
+ [MRP_EVENT_NEW] = MRP_APPLICANT_VN,
+ [MRP_EVENT_JOIN] = MRP_APPLICANT_AP,
+ [MRP_EVENT_LV] = MRP_APPLICANT_AO,
+ [MRP_EVENT_TX] = MRP_APPLICANT_QA,
+ [MRP_EVENT_R_NEW] = MRP_APPLICANT_AP,
+ [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_QP,
+ [MRP_EVENT_R_IN] = MRP_APPLICANT_AP,
+ [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_AP,
+ [MRP_EVENT_R_MT] = MRP_APPLICANT_AP,
+ [MRP_EVENT_R_LV] = MRP_APPLICANT_VP,
+ [MRP_EVENT_R_LA] = MRP_APPLICANT_VP,
+ [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VP,
+ [MRP_EVENT_PERIODIC] = MRP_APPLICANT_AP,
+ },
+ [MRP_APPLICANT_QP] = {
+ [MRP_EVENT_NEW] = MRP_APPLICANT_VN,
+ [MRP_EVENT_JOIN] = MRP_APPLICANT_QP,
+ [MRP_EVENT_LV] = MRP_APPLICANT_QO,
+ [MRP_EVENT_TX] = MRP_APPLICANT_QP,
+ [MRP_EVENT_R_NEW] = MRP_APPLICANT_QP,
+ [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_QP,
+ [MRP_EVENT_R_IN] = MRP_APPLICANT_QP,
+ [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_AP,
+ [MRP_EVENT_R_MT] = MRP_APPLICANT_AP,
+ [MRP_EVENT_R_LV] = MRP_APPLICANT_VP,
+ [MRP_EVENT_R_LA] = MRP_APPLICANT_VP,
+ [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VP,
+ [MRP_EVENT_PERIODIC] = MRP_APPLICANT_AP,
+ },
+};
+
+static const u8
+mrp_tx_action_table[MRP_APPLICANT_MAX + 1] = {
+ [MRP_APPLICANT_VO] = MRP_TX_ACTION_S_IN_OPTIONAL,
+ [MRP_APPLICANT_VP] = MRP_TX_ACTION_S_JOIN_IN,
+ [MRP_APPLICANT_VN] = MRP_TX_ACTION_S_NEW,
+ [MRP_APPLICANT_AN] = MRP_TX_ACTION_S_NEW,
+ [MRP_APPLICANT_AA] = MRP_TX_ACTION_S_JOIN_IN,
+ [MRP_APPLICANT_QA] = MRP_TX_ACTION_S_JOIN_IN_OPTIONAL,
+ [MRP_APPLICANT_LA] = MRP_TX_ACTION_S_LV,
+ [MRP_APPLICANT_AO] = MRP_TX_ACTION_S_IN_OPTIONAL,
+ [MRP_APPLICANT_QO] = MRP_TX_ACTION_S_IN_OPTIONAL,
+ [MRP_APPLICANT_AP] = MRP_TX_ACTION_S_JOIN_IN,
+ [MRP_APPLICANT_QP] = MRP_TX_ACTION_S_IN_OPTIONAL,
+};
+
+static void mrp_attrvalue_inc(void *value, u8 len)
+{
+ u8 *v = (u8 *)value;
+
+ /* Add 1 to the last byte. If it becomes zero,
+ * go to the previous byte and repeat.
+ */
+ while (len > 0 && !++v[--len])
+ ;
+}
+
+static int mrp_attr_cmp(const struct mrp_attr *attr,
+ const void *value, u8 len, u8 type)
+{
+ if (attr->type != type)
+ return attr->type - type;
+ if (attr->len != len)
+ return attr->len - len;
+ return memcmp(attr->value, value, len);
+}
+
+static struct mrp_attr *mrp_attr_lookup(const struct mrp_applicant *app,
+ const void *value, u8 len, u8 type)
+{
+ struct rb_node *parent = app->mad.rb_node;
+ struct mrp_attr *attr;
+ int d;
+
+ while (parent) {
+ attr = rb_entry(parent, struct mrp_attr, node);
+ d = mrp_attr_cmp(attr, value, len, type);
+ if (d > 0)
+ parent = parent->rb_left;
+ else if (d < 0)
+ parent = parent->rb_right;
+ else
+ return attr;
+ }
+ return NULL;
+}
+
+static struct mrp_attr *mrp_attr_create(struct mrp_applicant *app,
+ const void *value, u8 len, u8 type)
+{
+ struct rb_node *parent = NULL, **p = &app->mad.rb_node;
+ struct mrp_attr *attr;
+ int d;
+
+ while (*p) {
+ parent = *p;
+ attr = rb_entry(parent, struct mrp_attr, node);
+ d = mrp_attr_cmp(attr, value, len, type);
+ if (d > 0)
+ p = &parent->rb_left;
+ else if (d < 0)
+ p = &parent->rb_right;
+ else {
+ /* The attribute already exists; re-use it. */
+ return attr;
+ }
+ }
+ attr = kmalloc(sizeof(*attr) + len, GFP_ATOMIC);
+ if (!attr)
+ return attr;
+ attr->state = MRP_APPLICANT_VO;
+ attr->type = type;
+ attr->len = len;
+ memcpy(attr->value, value, len);
+
+ rb_link_node(&attr->node, parent, p);
+ rb_insert_color(&attr->node, &app->mad);
+ return attr;
+}
+
+static void mrp_attr_destroy(struct mrp_applicant *app, struct mrp_attr *attr)
+{
+ rb_erase(&attr->node, &app->mad);
+ kfree(attr);
+}
+
+static void mrp_attr_destroy_all(struct mrp_applicant *app)
+{
+ struct rb_node *node, *next;
+ struct mrp_attr *attr;
+
+ for (node = rb_first(&app->mad);
+ next = node ? rb_next(node) : NULL, node != NULL;
+ node = next) {
+ attr = rb_entry(node, struct mrp_attr, node);
+ mrp_attr_destroy(app, attr);
+ }
+}
+
+static int mrp_pdu_init(struct mrp_applicant *app)
+{
+ struct sk_buff *skb;
+ struct mrp_pdu_hdr *ph;
+
+ skb = alloc_skb(app->dev->mtu + LL_RESERVED_SPACE(app->dev),
+ GFP_ATOMIC);
+ if (!skb)
+ return -ENOMEM;
+
+ skb->dev = app->dev;
+ skb->protocol = app->app->pkttype.type;
+ skb_reserve(skb, LL_RESERVED_SPACE(app->dev));
+ skb_reset_network_header(skb);
+ skb_reset_transport_header(skb);
+
+ ph = __skb_put(skb, sizeof(*ph));
+ ph->version = app->app->version;
+
+ app->pdu = skb;
+ return 0;
+}
+
+static int mrp_pdu_append_end_mark(struct mrp_applicant *app)
+{
+ __be16 *endmark;
+
+ if (skb_tailroom(app->pdu) < sizeof(*endmark))
+ return -1;
+ endmark = __skb_put(app->pdu, sizeof(*endmark));
+ put_unaligned(MRP_END_MARK, endmark);
+ return 0;
+}
+
+static void mrp_pdu_queue(struct mrp_applicant *app)
+{
+ if (!app->pdu)
+ return;
+
+ if (mrp_cb(app->pdu)->mh)
+ mrp_pdu_append_end_mark(app);
+ mrp_pdu_append_end_mark(app);
+
+ dev_hard_header(app->pdu, app->dev, ntohs(app->app->pkttype.type),
+ app->app->group_address, app->dev->dev_addr,
+ app->pdu->len);
+
+ skb_queue_tail(&app->queue, app->pdu);
+ app->pdu = NULL;
+}
+
+static void mrp_queue_xmit(struct mrp_applicant *app)
+{
+ struct sk_buff *skb;
+
+ while ((skb = skb_dequeue(&app->queue)))
+ dev_queue_xmit(skb);
+}
+
+static int mrp_pdu_append_msg_hdr(struct mrp_applicant *app,
+ u8 attrtype, u8 attrlen)
+{
+ struct mrp_msg_hdr *mh;
+
+ if (mrp_cb(app->pdu)->mh) {
+ if (mrp_pdu_append_end_mark(app) < 0)
+ return -1;
+ mrp_cb(app->pdu)->mh = NULL;
+ mrp_cb(app->pdu)->vah = NULL;
+ }
+
+ if (skb_tailroom(app->pdu) < sizeof(*mh))
+ return -1;
+ mh = __skb_put(app->pdu, sizeof(*mh));
+ mh->attrtype = attrtype;
+ mh->attrlen = attrlen;
+ mrp_cb(app->pdu)->mh = mh;
+ return 0;
+}
+
+static int mrp_pdu_append_vecattr_hdr(struct mrp_applicant *app,
+ const void *firstattrvalue, u8 attrlen)
+{
+ struct mrp_vecattr_hdr *vah;
+
+ if (skb_tailroom(app->pdu) < sizeof(*vah) + attrlen)
+ return -1;
+ vah = __skb_put(app->pdu, sizeof(*vah) + attrlen);
+ put_unaligned(0, &vah->lenflags);
+ memcpy(vah->firstattrvalue, firstattrvalue, attrlen);
+ mrp_cb(app->pdu)->vah = vah;
+ memcpy(mrp_cb(app->pdu)->attrvalue, firstattrvalue, attrlen);
+ return 0;
+}
+
+static int mrp_pdu_append_vecattr_event(struct mrp_applicant *app,
+ const struct mrp_attr *attr,
+ enum mrp_vecattr_event vaevent)
+{
+ u16 len, pos;
+ u8 *vaevents;
+ int err;
+again:
+ if (!app->pdu) {
+ err = mrp_pdu_init(app);
+ if (err < 0)
+ return err;
+ }
+
+ /* If there is no Message header in the PDU, or the Message header is
+ * for a different attribute type, add an EndMark (if necessary) and a
+ * new Message header to the PDU.
+ */
+ if (!mrp_cb(app->pdu)->mh ||
+ mrp_cb(app->pdu)->mh->attrtype != attr->type ||
+ mrp_cb(app->pdu)->mh->attrlen != attr->len) {
+ if (mrp_pdu_append_msg_hdr(app, attr->type, attr->len) < 0)
+ goto queue;
+ }
+
+ /* If there is no VectorAttribute header for this Message in the PDU,
+ * or this attribute's value does not sequentially follow the previous
+ * attribute's value, add a new VectorAttribute header to the PDU.
+ */
+ if (!mrp_cb(app->pdu)->vah ||
+ memcmp(mrp_cb(app->pdu)->attrvalue, attr->value, attr->len)) {
+ if (mrp_pdu_append_vecattr_hdr(app, attr->value, attr->len) < 0)
+ goto queue;
+ }
+
+ len = be16_to_cpu(get_unaligned(&mrp_cb(app->pdu)->vah->lenflags));
+ pos = len % 3;
+
+ /* Events are packed into Vectors in the PDU, three to a byte. Add a
+ * byte to the end of the Vector if necessary.
+ */
+ if (!pos) {
+ if (skb_tailroom(app->pdu) < sizeof(u8))
+ goto queue;
+ vaevents = __skb_put(app->pdu, sizeof(u8));
+ } else {
+ vaevents = (u8 *)(skb_tail_pointer(app->pdu) - sizeof(u8));
+ }
+
+ switch (pos) {
+ case 0:
+ *vaevents = vaevent * (__MRP_VECATTR_EVENT_MAX *
+ __MRP_VECATTR_EVENT_MAX);
+ break;
+ case 1:
+ *vaevents += vaevent * __MRP_VECATTR_EVENT_MAX;
+ break;
+ case 2:
+ *vaevents += vaevent;
+ break;
+ default:
+ WARN_ON(1);
+ }
+
+ /* Increment the length of the VectorAttribute in the PDU, as well as
+ * the value of the next attribute that would continue its Vector.
+ */
+ put_unaligned(cpu_to_be16(++len), &mrp_cb(app->pdu)->vah->lenflags);
+ mrp_attrvalue_inc(mrp_cb(app->pdu)->attrvalue, attr->len);
+
+ return 0;
+
+queue:
+ mrp_pdu_queue(app);
+ goto again;
+}
+
+static void mrp_attr_event(struct mrp_applicant *app,
+ struct mrp_attr *attr, enum mrp_event event)
+{
+ enum mrp_applicant_state state;
+
+ state = mrp_applicant_state_table[attr->state][event];
+ if (state == MRP_APPLICANT_INVALID) {
+ WARN_ON(1);
+ return;
+ }
+
+ if (event == MRP_EVENT_TX) {
+ /* When appending the attribute fails, don't update its state
+ * in order to retry at the next TX event.
+ */
+
+ switch (mrp_tx_action_table[attr->state]) {
+ case MRP_TX_ACTION_NONE:
+ case MRP_TX_ACTION_S_JOIN_IN_OPTIONAL:
+ case MRP_TX_ACTION_S_IN_OPTIONAL:
+ break;
+ case MRP_TX_ACTION_S_NEW:
+ if (mrp_pdu_append_vecattr_event(
+ app, attr, MRP_VECATTR_EVENT_NEW) < 0)
+ return;
+ break;
+ case MRP_TX_ACTION_S_JOIN_IN:
+ if (mrp_pdu_append_vecattr_event(
+ app, attr, MRP_VECATTR_EVENT_JOIN_IN) < 0)
+ return;
+ break;
+ case MRP_TX_ACTION_S_LV:
+ if (mrp_pdu_append_vecattr_event(
+ app, attr, MRP_VECATTR_EVENT_LV) < 0)
+ return;
+ /* As a pure applicant, sending a leave message
+ * implies that the attribute was unregistered and
+ * can be destroyed.
+ */
+ mrp_attr_destroy(app, attr);
+ return;
+ default:
+ WARN_ON(1);
+ }
+ }
+
+ attr->state = state;
+}
+
+int mrp_request_join(const struct net_device *dev,
+ const struct mrp_application *appl,
+ const void *value, u8 len, u8 type)
+{
+ struct mrp_port *port = rtnl_dereference(dev->mrp_port);
+ struct mrp_applicant *app = rtnl_dereference(
+ port->applicants[appl->type]);
+ struct mrp_attr *attr;
+
+ if (sizeof(struct mrp_skb_cb) + len >
+ sizeof_field(struct sk_buff, cb))
+ return -ENOMEM;
+
+ spin_lock_bh(&app->lock);
+ attr = mrp_attr_create(app, value, len, type);
+ if (!attr) {
+ spin_unlock_bh(&app->lock);
+ return -ENOMEM;
+ }
+ mrp_attr_event(app, attr, MRP_EVENT_JOIN);
+ spin_unlock_bh(&app->lock);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(mrp_request_join);
+
+void mrp_request_leave(const struct net_device *dev,
+ const struct mrp_application *appl,
+ const void *value, u8 len, u8 type)
+{
+ struct mrp_port *port = rtnl_dereference(dev->mrp_port);
+ struct mrp_applicant *app = rtnl_dereference(
+ port->applicants[appl->type]);
+ struct mrp_attr *attr;
+
+ if (sizeof(struct mrp_skb_cb) + len >
+ sizeof_field(struct sk_buff, cb))
+ return;
+
+ spin_lock_bh(&app->lock);
+ attr = mrp_attr_lookup(app, value, len, type);
+ if (!attr) {
+ spin_unlock_bh(&app->lock);
+ return;
+ }
+ mrp_attr_event(app, attr, MRP_EVENT_LV);
+ spin_unlock_bh(&app->lock);
+}
+EXPORT_SYMBOL_GPL(mrp_request_leave);
+
+static void mrp_mad_event(struct mrp_applicant *app, enum mrp_event event)
+{
+ struct rb_node *node, *next;
+ struct mrp_attr *attr;
+
+ for (node = rb_first(&app->mad);
+ next = node ? rb_next(node) : NULL, node != NULL;
+ node = next) {
+ attr = rb_entry(node, struct mrp_attr, node);
+ mrp_attr_event(app, attr, event);
+ }
+}
+
+static void mrp_join_timer_arm(struct mrp_applicant *app)
+{
+ unsigned long delay;
+
+ delay = get_random_u32_below(msecs_to_jiffies(mrp_join_time));
+ mod_timer(&app->join_timer, jiffies + delay);
+}
+
+static void mrp_join_timer(struct timer_list *t)
+{
+ struct mrp_applicant *app = timer_container_of(app, t, join_timer);
+
+ spin_lock(&app->lock);
+ mrp_mad_event(app, MRP_EVENT_TX);
+ mrp_pdu_queue(app);
+ spin_unlock(&app->lock);
+
+ mrp_queue_xmit(app);
+ spin_lock(&app->lock);
+ if (likely(app->active))
+ mrp_join_timer_arm(app);
+ spin_unlock(&app->lock);
+}
+
+static void mrp_periodic_timer_arm(struct mrp_applicant *app)
+{
+ mod_timer(&app->periodic_timer,
+ jiffies + msecs_to_jiffies(mrp_periodic_time));
+}
+
+static void mrp_periodic_timer(struct timer_list *t)
+{
+ struct mrp_applicant *app = timer_container_of(app, t, periodic_timer);
+
+ spin_lock(&app->lock);
+ if (likely(app->active)) {
+ mrp_mad_event(app, MRP_EVENT_PERIODIC);
+ mrp_pdu_queue(app);
+ mrp_periodic_timer_arm(app);
+ }
+ spin_unlock(&app->lock);
+}
+
+static int mrp_pdu_parse_end_mark(struct sk_buff *skb, int *offset)
+{
+ __be16 endmark;
+
+ if (skb_copy_bits(skb, *offset, &endmark, sizeof(endmark)) < 0)
+ return -1;
+ if (endmark == MRP_END_MARK) {
+ *offset += sizeof(endmark);
+ return -1;
+ }
+ return 0;
+}
+
+static void mrp_pdu_parse_vecattr_event(struct mrp_applicant *app,
+ struct sk_buff *skb,
+ enum mrp_vecattr_event vaevent)
+{
+ struct mrp_attr *attr;
+ enum mrp_event event;
+
+ attr = mrp_attr_lookup(app, mrp_cb(skb)->attrvalue,
+ mrp_cb(skb)->mh->attrlen,
+ mrp_cb(skb)->mh->attrtype);
+ if (attr == NULL)
+ return;
+
+ switch (vaevent) {
+ case MRP_VECATTR_EVENT_NEW:
+ event = MRP_EVENT_R_NEW;
+ break;
+ case MRP_VECATTR_EVENT_JOIN_IN:
+ event = MRP_EVENT_R_JOIN_IN;
+ break;
+ case MRP_VECATTR_EVENT_IN:
+ event = MRP_EVENT_R_IN;
+ break;
+ case MRP_VECATTR_EVENT_JOIN_MT:
+ event = MRP_EVENT_R_JOIN_MT;
+ break;
+ case MRP_VECATTR_EVENT_MT:
+ event = MRP_EVENT_R_MT;
+ break;
+ case MRP_VECATTR_EVENT_LV:
+ event = MRP_EVENT_R_LV;
+ break;
+ default:
+ return;
+ }
+
+ mrp_attr_event(app, attr, event);
+}
+
+static int mrp_pdu_parse_vecattr(struct mrp_applicant *app,
+ struct sk_buff *skb, int *offset)
+{
+ struct mrp_vecattr_hdr _vah;
+ u16 valen;
+ u8 vaevents, vaevent;
+
+ mrp_cb(skb)->vah = skb_header_pointer(skb, *offset, sizeof(_vah),
+ &_vah);
+ if (!mrp_cb(skb)->vah)
+ return -1;
+ *offset += sizeof(_vah);
+
+ if (get_unaligned(&mrp_cb(skb)->vah->lenflags) &
+ MRP_VECATTR_HDR_FLAG_LA)
+ mrp_mad_event(app, MRP_EVENT_R_LA);
+ valen = be16_to_cpu(get_unaligned(&mrp_cb(skb)->vah->lenflags) &
+ MRP_VECATTR_HDR_LEN_MASK);
+
+ /* The VectorAttribute structure in a PDU carries event information
+ * about one or more attributes having consecutive values. Only the
+ * value for the first attribute is contained in the structure. So
+ * we make a copy of that value, and then increment it each time we
+ * advance to the next event in its Vector.
+ */
+ if (sizeof(struct mrp_skb_cb) + mrp_cb(skb)->mh->attrlen >
+ sizeof_field(struct sk_buff, cb))
+ return -1;
+ if (skb_copy_bits(skb, *offset, mrp_cb(skb)->attrvalue,
+ mrp_cb(skb)->mh->attrlen) < 0)
+ return -1;
+ *offset += mrp_cb(skb)->mh->attrlen;
+
+ /* In a VectorAttribute, the Vector contains events which are packed
+ * three to a byte. We process one byte of the Vector at a time.
+ */
+ while (valen > 0) {
+ if (skb_copy_bits(skb, *offset, &vaevents,
+ sizeof(vaevents)) < 0)
+ return -1;
+ *offset += sizeof(vaevents);
+
+ /* Extract and process the first event. */
+ vaevent = vaevents / (__MRP_VECATTR_EVENT_MAX *
+ __MRP_VECATTR_EVENT_MAX);
+ if (vaevent >= __MRP_VECATTR_EVENT_MAX) {
+ /* The byte is malformed; stop processing. */
+ return -1;
+ }
+ mrp_pdu_parse_vecattr_event(app, skb, vaevent);
+
+ /* If present, extract and process the second event. */
+ if (!--valen)
+ break;
+ mrp_attrvalue_inc(mrp_cb(skb)->attrvalue,
+ mrp_cb(skb)->mh->attrlen);
+ vaevents %= (__MRP_VECATTR_EVENT_MAX *
+ __MRP_VECATTR_EVENT_MAX);
+ vaevent = vaevents / __MRP_VECATTR_EVENT_MAX;
+ mrp_pdu_parse_vecattr_event(app, skb, vaevent);
+
+ /* If present, extract and process the third event. */
+ if (!--valen)
+ break;
+ mrp_attrvalue_inc(mrp_cb(skb)->attrvalue,
+ mrp_cb(skb)->mh->attrlen);
+ vaevents %= __MRP_VECATTR_EVENT_MAX;
+ vaevent = vaevents;
+ mrp_pdu_parse_vecattr_event(app, skb, vaevent);
+ }
+ return 0;
+}
+
+static int mrp_pdu_parse_msg(struct mrp_applicant *app, struct sk_buff *skb,
+ int *offset)
+{
+ struct mrp_msg_hdr _mh;
+
+ mrp_cb(skb)->mh = skb_header_pointer(skb, *offset, sizeof(_mh), &_mh);
+ if (!mrp_cb(skb)->mh)
+ return -1;
+ *offset += sizeof(_mh);
+
+ if (mrp_cb(skb)->mh->attrtype == 0 ||
+ mrp_cb(skb)->mh->attrtype > app->app->maxattr ||
+ mrp_cb(skb)->mh->attrlen == 0)
+ return -1;
+
+ while (skb->len > *offset) {
+ if (mrp_pdu_parse_end_mark(skb, offset) < 0)
+ break;
+ if (mrp_pdu_parse_vecattr(app, skb, offset) < 0)
+ return -1;
+ }
+ return 0;
+}
+
+static int mrp_rcv(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *pt, struct net_device *orig_dev)
+{
+ struct mrp_application *appl = container_of(pt, struct mrp_application,
+ pkttype);
+ struct mrp_port *port;
+ struct mrp_applicant *app;
+ struct mrp_pdu_hdr _ph;
+ const struct mrp_pdu_hdr *ph;
+ int offset = skb_network_offset(skb);
+
+ /* If the interface is in promiscuous mode, drop the packet if
+ * it was unicast to another host.
+ */
+ if (unlikely(skb->pkt_type == PACKET_OTHERHOST))
+ goto out;
+ skb = skb_share_check(skb, GFP_ATOMIC);
+ if (unlikely(!skb))
+ goto out;
+ port = rcu_dereference(dev->mrp_port);
+ if (unlikely(!port))
+ goto out;
+ app = rcu_dereference(port->applicants[appl->type]);
+ if (unlikely(!app))
+ goto out;
+
+ ph = skb_header_pointer(skb, offset, sizeof(_ph), &_ph);
+ if (!ph)
+ goto out;
+ offset += sizeof(_ph);
+
+ if (ph->version != app->app->version)
+ goto out;
+
+ spin_lock(&app->lock);
+ while (skb->len > offset) {
+ if (mrp_pdu_parse_end_mark(skb, &offset) < 0)
+ break;
+ if (mrp_pdu_parse_msg(app, skb, &offset) < 0)
+ break;
+ }
+ spin_unlock(&app->lock);
+out:
+ kfree_skb(skb);
+ return 0;
+}
+
+static int mrp_init_port(struct net_device *dev)
+{
+ struct mrp_port *port;
+
+ port = kzalloc(sizeof(*port), GFP_KERNEL);
+ if (!port)
+ return -ENOMEM;
+ rcu_assign_pointer(dev->mrp_port, port);
+ return 0;
+}
+
+static void mrp_release_port(struct net_device *dev)
+{
+ struct mrp_port *port = rtnl_dereference(dev->mrp_port);
+ unsigned int i;
+
+ for (i = 0; i <= MRP_APPLICATION_MAX; i++) {
+ if (rtnl_dereference(port->applicants[i]))
+ return;
+ }
+ RCU_INIT_POINTER(dev->mrp_port, NULL);
+ kfree_rcu(port, rcu);
+}
+
+int mrp_init_applicant(struct net_device *dev, struct mrp_application *appl)
+{
+ struct mrp_applicant *app;
+ int err;
+
+ ASSERT_RTNL();
+
+ if (!rtnl_dereference(dev->mrp_port)) {
+ err = mrp_init_port(dev);
+ if (err < 0)
+ goto err1;
+ }
+
+ err = -ENOMEM;
+ app = kzalloc(sizeof(*app), GFP_KERNEL);
+ if (!app)
+ goto err2;
+
+ err = dev_mc_add(dev, appl->group_address);
+ if (err < 0)
+ goto err3;
+
+ app->dev = dev;
+ app->app = appl;
+ app->mad = RB_ROOT;
+ app->active = true;
+ spin_lock_init(&app->lock);
+ skb_queue_head_init(&app->queue);
+ rcu_assign_pointer(dev->mrp_port->applicants[appl->type], app);
+ timer_setup(&app->join_timer, mrp_join_timer, 0);
+ mrp_join_timer_arm(app);
+ timer_setup(&app->periodic_timer, mrp_periodic_timer, 0);
+ mrp_periodic_timer_arm(app);
+ return 0;
+
+err3:
+ kfree(app);
+err2:
+ mrp_release_port(dev);
+err1:
+ return err;
+}
+EXPORT_SYMBOL_GPL(mrp_init_applicant);
+
+void mrp_uninit_applicant(struct net_device *dev, struct mrp_application *appl)
+{
+ struct mrp_port *port = rtnl_dereference(dev->mrp_port);
+ struct mrp_applicant *app = rtnl_dereference(
+ port->applicants[appl->type]);
+
+ ASSERT_RTNL();
+
+ RCU_INIT_POINTER(port->applicants[appl->type], NULL);
+
+ spin_lock_bh(&app->lock);
+ app->active = false;
+ spin_unlock_bh(&app->lock);
+ /* Delete timer and generate a final TX event to flush out
+ * all pending messages before the applicant is gone.
+ */
+ timer_shutdown_sync(&app->join_timer);
+ timer_shutdown_sync(&app->periodic_timer);
+
+ spin_lock_bh(&app->lock);
+ mrp_mad_event(app, MRP_EVENT_TX);
+ mrp_attr_destroy_all(app);
+ mrp_pdu_queue(app);
+ spin_unlock_bh(&app->lock);
+
+ mrp_queue_xmit(app);
+
+ dev_mc_del(dev, appl->group_address);
+ kfree_rcu(app, rcu);
+ mrp_release_port(dev);
+}
+EXPORT_SYMBOL_GPL(mrp_uninit_applicant);
+
+int mrp_register_application(struct mrp_application *appl)
+{
+ appl->pkttype.func = mrp_rcv;
+ dev_add_pack(&appl->pkttype);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(mrp_register_application);
+
+void mrp_unregister_application(struct mrp_application *appl)
+{
+ dev_remove_pack(&appl->pkttype);
+}
+EXPORT_SYMBOL_GPL(mrp_unregister_application);
diff --git a/net/802/p8022.c b/net/802/p8022.c
deleted file mode 100644
index 2530f35241cd..000000000000
--- a/net/802/p8022.c
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * NET3: Support for 802.2 demultiplexing off Ethernet (Token ring
- * is kept separate see p8022tr.c)
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * Demultiplex 802.2 encoded protocols. We match the entry by the
- * SSAP/DSAP pair and then deliver to the registered datalink that
- * matches. The control byte is ignored and handling of such items
- * is up to the routine passed the frame.
- *
- * Unlike the 802.3 datalink we have a list of 802.2 entries as
- * there are multiple protocols to demux. The list is currently
- * short (3 or 4 entries at most). The current demux assumes this.
- */
-#include <linux/module.h>
-#include <linux/netdevice.h>
-#include <linux/skbuff.h>
-#include <net/datalink.h>
-#include <linux/mm.h>
-#include <linux/in.h>
-#include <linux/init.h>
-#include <net/llc.h>
-#include <net/p8022.h>
-
-static int p8022_request(struct datalink_proto *dl, struct sk_buff *skb,
- unsigned char *dest)
-{
- llc_build_and_send_ui_pkt(dl->sap, skb, dest, dl->sap->laddr.lsap);
- return 0;
-}
-
-struct datalink_proto *register_8022_client(unsigned char type,
- int (*func)(struct sk_buff *skb,
- struct net_device *dev,
- struct packet_type *pt,
- struct net_device *orig_dev))
-{
- struct datalink_proto *proto;
-
- proto = kmalloc(sizeof(*proto), GFP_ATOMIC);
- if (proto) {
- proto->type[0] = type;
- proto->header_length = 3;
- proto->request = p8022_request;
- proto->sap = llc_sap_open(type, func);
- if (!proto->sap) {
- kfree(proto);
- proto = NULL;
- }
- }
- return proto;
-}
-
-void unregister_8022_client(struct datalink_proto *proto)
-{
- llc_sap_put(proto->sap);
- kfree(proto);
-}
-
-EXPORT_SYMBOL(register_8022_client);
-EXPORT_SYMBOL(unregister_8022_client);
-
-MODULE_LICENSE("GPL");
diff --git a/net/802/p8023.c b/net/802/p8023.c
deleted file mode 100644
index 53cf05709283..000000000000
--- a/net/802/p8023.c
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * NET3: 802.3 data link hooks used for IPX 802.3
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * 802.3 isn't really a protocol data link layer. Some old IPX stuff
- * uses it however. Note that there is only one 802.3 protocol layer
- * in the system. We don't currently support different protocols
- * running raw 802.3 on different devices. Thankfully nobody else
- * has done anything like the old IPX.
- */
-
-#include <linux/in.h>
-#include <linux/mm.h>
-#include <linux/module.h>
-#include <linux/netdevice.h>
-#include <linux/skbuff.h>
-
-#include <net/datalink.h>
-#include <net/p8022.h>
-
-/*
- * Place an 802.3 header on a packet. The driver will do the mac
- * addresses, we just need to give it the buffer length.
- */
-static int p8023_request(struct datalink_proto *dl,
- struct sk_buff *skb, unsigned char *dest_node)
-{
- struct net_device *dev = skb->dev;
-
- dev->hard_header(skb, dev, ETH_P_802_3, dest_node, NULL, skb->len);
- return dev_queue_xmit(skb);
-}
-
-/*
- * Create an 802.3 client. Note there can be only one 802.3 client
- */
-struct datalink_proto *make_8023_client(void)
-{
- struct datalink_proto *proto = kmalloc(sizeof(*proto), GFP_ATOMIC);
-
- if (proto) {
- proto->header_length = 0;
- proto->request = p8023_request;
- }
- return proto;
-}
-
-/*
- * Destroy the 802.3 client.
- */
-void destroy_8023_client(struct datalink_proto *dl)
-{
- kfree(dl);
-}
-
-EXPORT_SYMBOL(destroy_8023_client);
-EXPORT_SYMBOL(make_8023_client);
-
-MODULE_LICENSE("GPL");
diff --git a/net/802/psnap.c b/net/802/psnap.c
index 270b9d2cae65..389df460c8c4 100644
--- a/net/802/psnap.c
+++ b/net/802/psnap.c
@@ -1,25 +1,23 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* SNAP data link layer. Derived from 802.2
*
- * Alan Cox <Alan.Cox@linux.org>,
+ * Alan Cox <alan@lxorguk.ukuu.org.uk>,
* from the 802.2 layer by Greg Page.
* Merged in additions from Greg Page's psnap.c.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
+#include <linux/slab.h>
#include <net/datalink.h>
#include <net/llc.h>
#include <net/psnap.h>
#include <linux/mm.h>
#include <linux/in.h>
#include <linux/init.h>
+#include <linux/rculist.h>
static LIST_HEAD(snap_list);
static DEFINE_SPINLOCK(snap_lock);
@@ -28,13 +26,11 @@ static struct llc_sap *snap_sap;
/*
* Find a snap client by matching the 5 bytes.
*/
-static struct datalink_proto *find_snap_client(unsigned char *desc)
+static struct datalink_proto *find_snap_client(const unsigned char *desc)
{
- struct list_head *entry;
struct datalink_proto *proto = NULL, *p;
- list_for_each_rcu(entry, &snap_list) {
- p = list_entry(entry, struct datalink_proto, node);
+ list_for_each_entry_rcu(p, &snap_list, node, lockdep_is_held(&snap_lock)) {
if (!memcmp(p->type, desc, 5)) {
proto = p;
break;
@@ -52,31 +48,38 @@ static int snap_rcv(struct sk_buff *skb, struct net_device *dev,
int rc = 1;
struct datalink_proto *proto;
static struct packet_type snap_packet_type = {
- .type = __constant_htons(ETH_P_SNAP),
+ .type = cpu_to_be16(ETH_P_SNAP),
};
+ if (unlikely(!pskb_may_pull(skb, 5)))
+ goto drop;
+
rcu_read_lock();
- proto = find_snap_client(skb->h.raw);
+ proto = find_snap_client(skb->data);
if (proto) {
/* Pass the frame on. */
- skb->h.raw += 5;
skb_pull_rcsum(skb, 5);
+ skb_reset_transport_header(skb);
rc = proto->rcvfunc(skb, dev, &snap_packet_type, orig_dev);
- } else {
- skb->sk = NULL;
- kfree_skb(skb);
- rc = 1;
}
-
rcu_read_unlock();
+
+ if (unlikely(!proto))
+ goto drop;
+
+out:
return rc;
+
+drop:
+ kfree_skb(skb);
+ goto out;
}
/*
* Put a SNAP header on a frame and pass to 802.2
*/
static int snap_request(struct datalink_proto *dl,
- struct sk_buff *skb, u8 *dest)
+ struct sk_buff *skb, const u8 *dest)
{
memcpy(skb_push(skb, 5), dl->type, 5);
llc_build_and_send_ui_pkt(snap_sap, skb, dest, snap_sap->laddr.lsap);
@@ -89,15 +92,16 @@ static int snap_request(struct datalink_proto *dl,
EXPORT_SYMBOL(register_snap_client);
EXPORT_SYMBOL(unregister_snap_client);
-static char snap_err_msg[] __initdata =
+static const char snap_err_msg[] __initconst =
KERN_CRIT "SNAP - unable to register with 802.2\n";
static int __init snap_init(void)
{
snap_sap = llc_sap_open(0xAA, snap_rcv);
-
- if (!snap_sap)
+ if (!snap_sap) {
printk(snap_err_msg);
+ return -EBUSY;
+ }
return 0;
}
@@ -115,9 +119,9 @@ module_exit(snap_exit);
/*
* Register SNAP clients. We don't yet use this for IP.
*/
-struct datalink_proto *register_snap_client(unsigned char *desc,
+struct datalink_proto *register_snap_client(const unsigned char *desc,
int (*rcvfunc)(struct sk_buff *,
- struct net_device *,
+ struct net_device *,
struct packet_type *,
struct net_device *))
{
@@ -130,7 +134,7 @@ struct datalink_proto *register_snap_client(unsigned char *desc,
proto = kmalloc(sizeof(*proto), GFP_ATOMIC);
if (proto) {
- memcpy(proto->type, desc,5);
+ memcpy(proto->type, desc, 5);
proto->rcvfunc = rcvfunc;
proto->header_length = 5 + 3; /* snap + 802.2 */
proto->request = snap_request;
@@ -139,7 +143,6 @@ struct datalink_proto *register_snap_client(unsigned char *desc,
out:
spin_unlock_bh(&snap_lock);
- synchronize_net();
return proto;
}
@@ -157,4 +160,5 @@ void unregister_snap_client(struct datalink_proto *proto)
kfree(proto);
}
+MODULE_DESCRIPTION("SNAP data link layer. Derived from 802.2");
MODULE_LICENSE("GPL");
diff --git a/net/802/stp.c b/net/802/stp.c
new file mode 100644
index 000000000000..03c9f75e92c9
--- /dev/null
+++ b/net/802/stp.c
@@ -0,0 +1,102 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * STP SAP demux
+ *
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ */
+#include <linux/mutex.h>
+#include <linux/skbuff.h>
+#include <linux/etherdevice.h>
+#include <linux/llc.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <net/llc.h>
+#include <net/llc_pdu.h>
+#include <net/stp.h>
+
+/* 01:80:c2:00:00:20 - 01:80:c2:00:00:2F */
+#define GARP_ADDR_MIN 0x20
+#define GARP_ADDR_MAX 0x2F
+#define GARP_ADDR_RANGE (GARP_ADDR_MAX - GARP_ADDR_MIN)
+
+static const struct stp_proto __rcu *garp_protos[GARP_ADDR_RANGE + 1] __read_mostly;
+static const struct stp_proto __rcu *stp_proto __read_mostly;
+
+static struct llc_sap *sap __read_mostly;
+static unsigned int sap_registered;
+static DEFINE_MUTEX(stp_proto_mutex);
+
+/* Called under rcu_read_lock from LLC */
+static int stp_pdu_rcv(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *pt, struct net_device *orig_dev)
+{
+ const struct ethhdr *eh = eth_hdr(skb);
+ const struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb);
+ const struct stp_proto *proto;
+
+ if (pdu->ssap != LLC_SAP_BSPAN ||
+ pdu->dsap != LLC_SAP_BSPAN ||
+ pdu->ctrl_1 != LLC_PDU_TYPE_U)
+ goto err;
+
+ if (eh->h_dest[5] >= GARP_ADDR_MIN && eh->h_dest[5] <= GARP_ADDR_MAX) {
+ proto = rcu_dereference(garp_protos[eh->h_dest[5] -
+ GARP_ADDR_MIN]);
+ if (proto &&
+ !ether_addr_equal(eh->h_dest, proto->group_address))
+ goto err;
+ } else
+ proto = rcu_dereference(stp_proto);
+
+ if (!proto)
+ goto err;
+
+ proto->rcv(proto, skb, dev);
+ return 0;
+
+err:
+ kfree_skb(skb);
+ return 0;
+}
+
+int stp_proto_register(const struct stp_proto *proto)
+{
+ int err = 0;
+
+ mutex_lock(&stp_proto_mutex);
+ if (sap_registered++ == 0) {
+ sap = llc_sap_open(LLC_SAP_BSPAN, stp_pdu_rcv);
+ if (!sap) {
+ err = -ENOMEM;
+ goto out;
+ }
+ }
+ if (is_zero_ether_addr(proto->group_address))
+ rcu_assign_pointer(stp_proto, proto);
+ else
+ rcu_assign_pointer(garp_protos[proto->group_address[5] -
+ GARP_ADDR_MIN], proto);
+out:
+ mutex_unlock(&stp_proto_mutex);
+ return err;
+}
+EXPORT_SYMBOL_GPL(stp_proto_register);
+
+void stp_proto_unregister(const struct stp_proto *proto)
+{
+ mutex_lock(&stp_proto_mutex);
+ if (is_zero_ether_addr(proto->group_address))
+ RCU_INIT_POINTER(stp_proto, NULL);
+ else
+ RCU_INIT_POINTER(garp_protos[proto->group_address[5] -
+ GARP_ADDR_MIN], NULL);
+ synchronize_rcu();
+
+ if (--sap_registered == 0)
+ llc_sap_put(sap);
+ mutex_unlock(&stp_proto_mutex);
+}
+EXPORT_SYMBOL_GPL(stp_proto_unregister);
+
+MODULE_DESCRIPTION("SAP demux for IEEE 802.1D Spanning Tree Protocol (STP)");
+MODULE_LICENSE("GPL");
diff --git a/net/802/sysctl_net_802.c b/net/802/sysctl_net_802.c
deleted file mode 100644
index ead56037398b..000000000000
--- a/net/802/sysctl_net_802.c
+++ /dev/null
@@ -1,33 +0,0 @@
-/* -*- linux-c -*-
- * sysctl_net_802.c: sysctl interface to net 802 subsystem.
- *
- * Begun April 1, 1996, Mike Shaver.
- * Added /proc/sys/net/802 directory entry (empty =) ). [MS]
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <linux/mm.h>
-#include <linux/if_tr.h>
-#include <linux/sysctl.h>
-
-#ifdef CONFIG_TR
-extern int sysctl_tr_rif_timeout;
-#endif
-
-struct ctl_table tr_table[] = {
-#ifdef CONFIG_TR
- {
- .ctl_name = NET_TR_RIF_TIMEOUT,
- .procname = "rif_timeout",
- .data = &sysctl_tr_rif_timeout,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec
- },
-#endif /* CONFIG_TR */
- { 0 },
-};
diff --git a/net/802/tr.c b/net/802/tr.c
deleted file mode 100644
index 829deb41ce81..000000000000
--- a/net/802/tr.c
+++ /dev/null
@@ -1,646 +0,0 @@
-/*
- * NET3: Token ring device handling subroutines
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * Fixes: 3 Feb 97 Paul Norton <pnorton@cts.com> Minor routing fixes.
- * Added rif table to /proc/net/tr_rif and rif timeout to
- * /proc/sys/net/token-ring/rif_timeout.
- * 22 Jun 98 Paul Norton <p.norton@computer.org> Rearranged
- * tr_header and tr_type_trans to handle passing IPX SNAP and
- * 802.2 through the correct layers. Eliminated tr_reformat.
- *
- */
-
-#include <asm/uaccess.h>
-#include <asm/system.h>
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/jiffies.h>
-#include <linux/string.h>
-#include <linux/mm.h>
-#include <linux/socket.h>
-#include <linux/in.h>
-#include <linux/inet.h>
-#include <linux/netdevice.h>
-#include <linux/trdevice.h>
-#include <linux/skbuff.h>
-#include <linux/errno.h>
-#include <linux/timer.h>
-#include <linux/net.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-#include <linux/init.h>
-#include <net/arp.h>
-
-static void tr_add_rif_info(struct trh_hdr *trh, struct net_device *dev);
-static void rif_check_expire(unsigned long dummy);
-
-#define TR_SR_DEBUG 0
-
-/*
- * Each RIF entry we learn is kept this way
- */
-
-struct rif_cache {
- unsigned char addr[TR_ALEN];
- int iface;
- __be16 rcf;
- __be16 rseg[8];
- struct rif_cache *next;
- unsigned long last_used;
- unsigned char local_ring;
-};
-
-#define RIF_TABLE_SIZE 32
-
-/*
- * We hash the RIF cache 32 ways. We do after all have to look it
- * up a lot.
- */
-
-static struct rif_cache *rif_table[RIF_TABLE_SIZE];
-
-static DEFINE_SPINLOCK(rif_lock);
-
-
-/*
- * Garbage disposal timer.
- */
-
-static struct timer_list rif_timer;
-
-int sysctl_tr_rif_timeout = 60*10*HZ;
-
-static inline unsigned long rif_hash(const unsigned char *addr)
-{
- unsigned long x;
-
- x = addr[0];
- x = (x << 2) ^ addr[1];
- x = (x << 2) ^ addr[2];
- x = (x << 2) ^ addr[3];
- x = (x << 2) ^ addr[4];
- x = (x << 2) ^ addr[5];
-
- x ^= x >> 8;
-
- return x & (RIF_TABLE_SIZE - 1);
-}
-
-/*
- * Put the headers on a token ring packet. Token ring source routing
- * makes this a little more exciting than on ethernet.
- */
-
-static int tr_header(struct sk_buff *skb, struct net_device *dev,
- unsigned short type,
- void *daddr, void *saddr, unsigned len)
-{
- struct trh_hdr *trh;
- int hdr_len;
-
- /*
- * Add the 802.2 SNAP header if IP as the IPv4/IPv6 code calls
- * dev->hard_header directly.
- */
- if (type == ETH_P_IP || type == ETH_P_IPV6 || type == ETH_P_ARP)
- {
- struct trllc *trllc;
-
- hdr_len = sizeof(struct trh_hdr) + sizeof(struct trllc);
- trh = (struct trh_hdr *)skb_push(skb, hdr_len);
- trllc = (struct trllc *)(trh+1);
- trllc->dsap = trllc->ssap = EXTENDED_SAP;
- trllc->llc = UI_CMD;
- trllc->protid[0] = trllc->protid[1] = trllc->protid[2] = 0x00;
- trllc->ethertype = htons(type);
- }
- else
- {
- hdr_len = sizeof(struct trh_hdr);
- trh = (struct trh_hdr *)skb_push(skb, hdr_len);
- }
-
- trh->ac=AC;
- trh->fc=LLC_FRAME;
-
- if(saddr)
- memcpy(trh->saddr,saddr,dev->addr_len);
- else
- memcpy(trh->saddr,dev->dev_addr,dev->addr_len);
-
- /*
- * Build the destination and then source route the frame
- */
-
- if(daddr)
- {
- memcpy(trh->daddr,daddr,dev->addr_len);
- tr_source_route(skb,trh,dev);
- return(hdr_len);
- }
-
- return -hdr_len;
-}
-
-/*
- * A neighbour discovery of some species (eg arp) has completed. We
- * can now send the packet.
- */
-
-static int tr_rebuild_header(struct sk_buff *skb)
-{
- struct trh_hdr *trh=(struct trh_hdr *)skb->data;
- struct trllc *trllc=(struct trllc *)(skb->data+sizeof(struct trh_hdr));
- struct net_device *dev = skb->dev;
-
- /*
- * FIXME: We don't yet support IPv6 over token rings
- */
-
- if(trllc->ethertype != htons(ETH_P_IP)) {
- printk("tr_rebuild_header: Don't know how to resolve type %04X addresses ?\n", ntohs(trllc->ethertype));
- return 0;
- }
-
-#ifdef CONFIG_INET
- if(arp_find(trh->daddr, skb)) {
- return 1;
- }
- else
-#endif
- {
- tr_source_route(skb,trh,dev);
- return 0;
- }
-}
-
-/*
- * Some of this is a bit hackish. We intercept RIF information
- * used for source routing. We also grab IP directly and don't feed
- * it via SNAP.
- */
-
-__be16 tr_type_trans(struct sk_buff *skb, struct net_device *dev)
-{
-
- struct trh_hdr *trh=(struct trh_hdr *)skb->data;
- struct trllc *trllc;
- unsigned riflen=0;
-
- skb->mac.raw = skb->data;
-
- if(trh->saddr[0] & TR_RII)
- riflen = (ntohs(trh->rcf) & TR_RCF_LEN_MASK) >> 8;
-
- trllc = (struct trllc *)(skb->data+sizeof(struct trh_hdr)-TR_MAXRIFLEN+riflen);
-
- skb_pull(skb,sizeof(struct trh_hdr)-TR_MAXRIFLEN+riflen);
-
- if(*trh->daddr & 0x80)
- {
- if(!memcmp(trh->daddr,dev->broadcast,TR_ALEN))
- skb->pkt_type=PACKET_BROADCAST;
- else
- skb->pkt_type=PACKET_MULTICAST;
- }
- else if ( (trh->daddr[0] & 0x01) && (trh->daddr[1] & 0x00) && (trh->daddr[2] & 0x5E))
- {
- skb->pkt_type=PACKET_MULTICAST;
- }
- else if(dev->flags & IFF_PROMISC)
- {
- if(memcmp(trh->daddr, dev->dev_addr, TR_ALEN))
- skb->pkt_type=PACKET_OTHERHOST;
- }
-
- if ((skb->pkt_type != PACKET_BROADCAST) &&
- (skb->pkt_type != PACKET_MULTICAST))
- tr_add_rif_info(trh,dev) ;
-
- /*
- * Strip the SNAP header from ARP packets since we don't
- * pass them through to the 802.2/SNAP layers.
- */
-
- if (trllc->dsap == EXTENDED_SAP &&
- (trllc->ethertype == htons(ETH_P_IP) ||
- trllc->ethertype == htons(ETH_P_IPV6) ||
- trllc->ethertype == htons(ETH_P_ARP)))
- {
- skb_pull(skb, sizeof(struct trllc));
- return trllc->ethertype;
- }
-
- return htons(ETH_P_TR_802_2);
-}
-
-/*
- * We try to do source routing...
- */
-
-void tr_source_route(struct sk_buff *skb,struct trh_hdr *trh,struct net_device *dev)
-{
- int slack;
- unsigned int hash;
- struct rif_cache *entry;
- unsigned char *olddata;
- unsigned long flags;
- static const unsigned char mcast_func_addr[]
- = {0xC0,0x00,0x00,0x04,0x00,0x00};
-
- spin_lock_irqsave(&rif_lock, flags);
-
- /*
- * Broadcasts are single route as stated in RFC 1042
- */
- if( (!memcmp(&(trh->daddr[0]),&(dev->broadcast[0]),TR_ALEN)) ||
- (!memcmp(&(trh->daddr[0]),&(mcast_func_addr[0]), TR_ALEN)) )
- {
- trh->rcf=htons((((sizeof(trh->rcf)) << 8) & TR_RCF_LEN_MASK)
- | TR_RCF_FRAME2K | TR_RCF_LIMITED_BROADCAST);
- trh->saddr[0]|=TR_RII;
- }
- else
- {
- hash = rif_hash(trh->daddr);
- /*
- * Walk the hash table and look for an entry
- */
- for(entry=rif_table[hash];entry && memcmp(&(entry->addr[0]),&(trh->daddr[0]),TR_ALEN);entry=entry->next);
-
- /*
- * If we found an entry we can route the frame.
- */
- if(entry)
- {
-#if TR_SR_DEBUG
-printk("source routing for %02X:%02X:%02X:%02X:%02X:%02X\n",trh->daddr[0],
- trh->daddr[1],trh->daddr[2],trh->daddr[3],trh->daddr[4],trh->daddr[5]);
-#endif
- if(!entry->local_ring && (ntohs(entry->rcf) & TR_RCF_LEN_MASK) >> 8)
- {
- trh->rcf=entry->rcf;
- memcpy(&trh->rseg[0],&entry->rseg[0],8*sizeof(unsigned short));
- trh->rcf^=htons(TR_RCF_DIR_BIT);
- trh->rcf&=htons(0x1fff); /* Issam Chehab <ichehab@madge1.demon.co.uk> */
-
- trh->saddr[0]|=TR_RII;
-#if TR_SR_DEBUG
- printk("entry found with rcf %04x\n", entry->rcf);
- }
- else
- {
- printk("entry found but without rcf length, local=%02x\n", entry->local_ring);
-#endif
- }
- entry->last_used=jiffies;
- }
- else
- {
- /*
- * Without the information we simply have to shout
- * on the wire. The replies should rapidly clean this
- * situation up.
- */
- trh->rcf=htons((((sizeof(trh->rcf)) << 8) & TR_RCF_LEN_MASK)
- | TR_RCF_FRAME2K | TR_RCF_LIMITED_BROADCAST);
- trh->saddr[0]|=TR_RII;
-#if TR_SR_DEBUG
- printk("no entry in rif table found - broadcasting frame\n");
-#endif
- }
- }
-
- /* Compress the RIF here so we don't have to do it in the driver(s) */
- if (!(trh->saddr[0] & 0x80))
- slack = 18;
- else
- slack = 18 - ((ntohs(trh->rcf) & TR_RCF_LEN_MASK)>>8);
- olddata = skb->data;
- spin_unlock_irqrestore(&rif_lock, flags);
-
- skb_pull(skb, slack);
- memmove(skb->data, olddata, sizeof(struct trh_hdr) - slack);
-}
-
-/*
- * We have learned some new RIF information for our source
- * routing.
- */
-
-static void tr_add_rif_info(struct trh_hdr *trh, struct net_device *dev)
-{
- unsigned int hash, rii_p = 0;
- unsigned long flags;
- struct rif_cache *entry;
- unsigned char saddr0;
-
- spin_lock_irqsave(&rif_lock, flags);
- saddr0 = trh->saddr[0];
-
- /*
- * Firstly see if the entry exists
- */
-
- if(trh->saddr[0] & TR_RII)
- {
- trh->saddr[0]&=0x7f;
- if (((ntohs(trh->rcf) & TR_RCF_LEN_MASK) >> 8) > 2)
- {
- rii_p = 1;
- }
- }
-
- hash = rif_hash(trh->saddr);
- for(entry=rif_table[hash];entry && memcmp(&(entry->addr[0]),&(trh->saddr[0]),TR_ALEN);entry=entry->next);
-
- if(entry==NULL)
- {
-#if TR_SR_DEBUG
-printk("adding rif_entry: addr:%02X:%02X:%02X:%02X:%02X:%02X rcf:%04X\n",
- trh->saddr[0],trh->saddr[1],trh->saddr[2],
- trh->saddr[3],trh->saddr[4],trh->saddr[5],
- ntohs(trh->rcf));
-#endif
- /*
- * Allocate our new entry. A failure to allocate loses
- * use the information. This is harmless.
- *
- * FIXME: We ought to keep some kind of cache size
- * limiting and adjust the timers to suit.
- */
- entry=kmalloc(sizeof(struct rif_cache),GFP_ATOMIC);
-
- if(!entry)
- {
- printk(KERN_DEBUG "tr.c: Couldn't malloc rif cache entry !\n");
- spin_unlock_irqrestore(&rif_lock, flags);
- return;
- }
-
- memcpy(&(entry->addr[0]),&(trh->saddr[0]),TR_ALEN);
- entry->iface = dev->ifindex;
- entry->next=rif_table[hash];
- entry->last_used=jiffies;
- rif_table[hash]=entry;
-
- if (rii_p)
- {
- entry->rcf = trh->rcf & htons((unsigned short)~TR_RCF_BROADCAST_MASK);
- memcpy(&(entry->rseg[0]),&(trh->rseg[0]),8*sizeof(unsigned short));
- entry->local_ring = 0;
- }
- else
- {
- entry->local_ring = 1;
- }
- }
- else /* Y. Tahara added */
- {
- /*
- * Update existing entries
- */
- if (!entry->local_ring)
- if (entry->rcf != (trh->rcf & htons((unsigned short)~TR_RCF_BROADCAST_MASK)) &&
- !(trh->rcf & htons(TR_RCF_BROADCAST_MASK)))
- {
-#if TR_SR_DEBUG
-printk("updating rif_entry: addr:%02X:%02X:%02X:%02X:%02X:%02X rcf:%04X\n",
- trh->saddr[0],trh->saddr[1],trh->saddr[2],
- trh->saddr[3],trh->saddr[4],trh->saddr[5],
- ntohs(trh->rcf));
-#endif
- entry->rcf = trh->rcf & htons((unsigned short)~TR_RCF_BROADCAST_MASK);
- memcpy(&(entry->rseg[0]),&(trh->rseg[0]),8*sizeof(unsigned short));
- }
- entry->last_used=jiffies;
- }
- trh->saddr[0]=saddr0; /* put the routing indicator back for tcpdump */
- spin_unlock_irqrestore(&rif_lock, flags);
-}
-
-/*
- * Scan the cache with a timer and see what we need to throw out.
- */
-
-static void rif_check_expire(unsigned long dummy)
-{
- int i;
- unsigned long flags, next_interval = jiffies + sysctl_tr_rif_timeout/2;
-
- spin_lock_irqsave(&rif_lock, flags);
-
- for(i =0; i < RIF_TABLE_SIZE; i++) {
- struct rif_cache *entry, **pentry;
-
- pentry = rif_table+i;
- while((entry=*pentry) != NULL) {
- unsigned long expires
- = entry->last_used + sysctl_tr_rif_timeout;
-
- if (time_before_eq(expires, jiffies)) {
- *pentry = entry->next;
- kfree(entry);
- } else {
- pentry = &entry->next;
-
- if (time_before(expires, next_interval))
- next_interval = expires;
- }
- }
- }
-
- spin_unlock_irqrestore(&rif_lock, flags);
-
- mod_timer(&rif_timer, next_interval);
-
-}
-
-/*
- * Generate the /proc/net information for the token ring RIF
- * routing.
- */
-
-#ifdef CONFIG_PROC_FS
-
-static struct rif_cache *rif_get_idx(loff_t pos)
-{
- int i;
- struct rif_cache *entry;
- loff_t off = 0;
-
- for(i = 0; i < RIF_TABLE_SIZE; i++)
- for(entry = rif_table[i]; entry; entry = entry->next) {
- if (off == pos)
- return entry;
- ++off;
- }
-
- return NULL;
-}
-
-static void *rif_seq_start(struct seq_file *seq, loff_t *pos)
-{
- spin_lock_irq(&rif_lock);
-
- return *pos ? rif_get_idx(*pos - 1) : SEQ_START_TOKEN;
-}
-
-static void *rif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
-{
- int i;
- struct rif_cache *ent = v;
-
- ++*pos;
-
- if (v == SEQ_START_TOKEN) {
- i = -1;
- goto scan;
- }
-
- if (ent->next)
- return ent->next;
-
- i = rif_hash(ent->addr);
- scan:
- while (++i < RIF_TABLE_SIZE) {
- if ((ent = rif_table[i]) != NULL)
- return ent;
- }
- return NULL;
-}
-
-static void rif_seq_stop(struct seq_file *seq, void *v)
-{
- spin_unlock_irq(&rif_lock);
-}
-
-static int rif_seq_show(struct seq_file *seq, void *v)
-{
- int j, rcf_len, segment, brdgnmb;
- struct rif_cache *entry = v;
-
- if (v == SEQ_START_TOKEN)
- seq_puts(seq,
- "if TR address TTL rcf routing segments\n");
- else {
- struct net_device *dev = dev_get_by_index(entry->iface);
- long ttl = (long) (entry->last_used + sysctl_tr_rif_timeout)
- - (long) jiffies;
-
- seq_printf(seq, "%s %02X:%02X:%02X:%02X:%02X:%02X %7li ",
- dev?dev->name:"?",
- entry->addr[0],entry->addr[1],entry->addr[2],
- entry->addr[3],entry->addr[4],entry->addr[5],
- ttl/HZ);
-
- if (entry->local_ring)
- seq_puts(seq, "local\n");
- else {
-
- seq_printf(seq, "%04X", ntohs(entry->rcf));
- rcf_len = ((ntohs(entry->rcf) & TR_RCF_LEN_MASK)>>8)-2;
- if (rcf_len)
- rcf_len >>= 1;
- for(j = 1; j < rcf_len; j++) {
- if(j==1) {
- segment=ntohs(entry->rseg[j-1])>>4;
- seq_printf(seq," %03X",segment);
- };
- segment=ntohs(entry->rseg[j])>>4;
- brdgnmb=ntohs(entry->rseg[j-1])&0x00f;
- seq_printf(seq,"-%01X-%03X",brdgnmb,segment);
- }
- seq_putc(seq, '\n');
- }
- }
- return 0;
-}
-
-
-static struct seq_operations rif_seq_ops = {
- .start = rif_seq_start,
- .next = rif_seq_next,
- .stop = rif_seq_stop,
- .show = rif_seq_show,
-};
-
-static int rif_seq_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &rif_seq_ops);
-}
-
-static struct file_operations rif_seq_fops = {
- .owner = THIS_MODULE,
- .open = rif_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
-#endif
-
-static void tr_setup(struct net_device *dev)
-{
- /*
- * Configure and register
- */
-
- dev->hard_header = tr_header;
- dev->rebuild_header = tr_rebuild_header;
-
- dev->type = ARPHRD_IEEE802_TR;
- dev->hard_header_len = TR_HLEN;
- dev->mtu = 2000;
- dev->addr_len = TR_ALEN;
- dev->tx_queue_len = 100; /* Long queues on tr */
-
- memset(dev->broadcast,0xFF, TR_ALEN);
-
- /* New-style flags. */
- dev->flags = IFF_BROADCAST | IFF_MULTICAST ;
-}
-
-/**
- * alloc_trdev - Register token ring device
- * @sizeof_priv: Size of additional driver-private structure to be allocated
- * for this token ring device
- *
- * Fill in the fields of the device structure with token ring-generic values.
- *
- * Constructs a new net device, complete with a private data area of
- * size @sizeof_priv. A 32-byte (not bit) alignment is enforced for
- * this private data area.
- */
-struct net_device *alloc_trdev(int sizeof_priv)
-{
- return alloc_netdev(sizeof_priv, "tr%d", tr_setup);
-}
-
-/*
- * Called during bootup. We don't actually have to initialise
- * too much for this.
- */
-
-static int __init rif_init(void)
-{
- init_timer(&rif_timer);
- rif_timer.expires = sysctl_tr_rif_timeout;
- rif_timer.data = 0L;
- rif_timer.function = rif_check_expire;
- add_timer(&rif_timer);
-
- proc_net_fops_create("tr_rif", S_IRUGO, &rif_seq_fops);
- return 0;
-}
-
-module_init(rif_init);
-
-EXPORT_SYMBOL(tr_type_trans);
-EXPORT_SYMBOL(alloc_trdev);
diff --git a/net/8021q/Kconfig b/net/8021q/Kconfig
index c4a382e450e2..8bf7a1765b78 100644
--- a/net/8021q/Kconfig
+++ b/net/8021q/Kconfig
@@ -1,19 +1,41 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# Configuration for 802.1Q VLAN support
#
config VLAN_8021Q
- tristate "802.1Q VLAN Support"
- ---help---
+ tristate "802.1Q/802.1ad VLAN Support"
+ help
Select this and you will be able to create 802.1Q VLAN interfaces
- on your ethernet interfaces. 802.1Q VLAN supports almost
- everything a regular ethernet interface does, including
- firewalling, bridging, and of course IP traffic. You will need
- the 'vconfig' tool from the VLAN project in order to effectively
- use VLANs. See the VLAN web page for more information:
+ on your Ethernet interfaces. 802.1Q VLAN supports almost
+ everything a regular Ethernet interface does, including
+ firewalling, bridging, and of course IP traffic. You will need
+ the 'ip' utility in order to effectively use VLANs.
+ See the VLAN web page for more information:
<http://www.candelatech.com/~greear/vlan.html>
To compile this code as a module, choose M here: the module
will be called 8021q.
If unsure, say N.
+
+config VLAN_8021Q_GVRP
+ bool "GVRP (GARP VLAN Registration Protocol) support"
+ depends on VLAN_8021Q
+ select GARP
+ help
+ Select this to enable GVRP end-system support. GVRP is used for
+ automatic propagation of registered VLANs to switches.
+
+ If unsure, say N.
+
+config VLAN_8021Q_MVRP
+ bool "MVRP (Multiple VLAN Registration Protocol) support"
+ depends on VLAN_8021Q
+ select MRP
+ help
+ Select this to enable MVRP end-system support. MVRP is used for
+ automatic propagation of registered VLANs to switches; it
+ supersedes GVRP and is not backwards-compatible.
+
+ If unsure, say N.
diff --git a/net/8021q/Makefile b/net/8021q/Makefile
index 97feb44dbdce..e05d4d7aab35 100644
--- a/net/8021q/Makefile
+++ b/net/8021q/Makefile
@@ -1,12 +1,11 @@
+# SPDX-License-Identifier: GPL-2.0
#
# Makefile for the Linux VLAN layer.
#
+obj-$(subst m,y,$(CONFIG_VLAN_8021Q)) += vlan_core.o
+obj-$(CONFIG_VLAN_8021Q) += 8021q.o
-obj-$(CONFIG_VLAN_8021Q) += 8021q.o
-
-8021q-objs := vlan.o vlan_dev.o
-
-ifeq ($(CONFIG_PROC_FS),y)
-8021q-objs += vlanproc.o
-endif
-
+8021q-y := vlan.o vlan_dev.o vlan_netlink.o
+8021q-$(CONFIG_VLAN_8021Q_GVRP) += vlan_gvrp.o
+8021q-$(CONFIG_VLAN_8021Q_MVRP) += vlan_mvrp.o
+8021q-$(CONFIG_PROC_FS) += vlanproc.o
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 18fcb9fa518d..2b74ed56eb16 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -1,36 +1,35 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* INET 802.1Q VLAN
* Ethernet-type device handling.
*
* Authors: Ben Greear <greearb@candelatech.com>
- * Please send support related email to: vlan@scry.wanfear.com
+ * Please send support related email to: netdev@vger.kernel.org
* VLAN Home Page: http://www.candelatech.com/~greear/vlan.html
- *
+ *
* Fixes:
* Fix for packet capture - Nick Eggleston <nick@dccinc.com>;
* Add HW acceleration hooks - David S. Miller <davem@redhat.com>;
* Correct all the locking - David S. Miller <davem@redhat.com>;
* Use hash table for VLAN groups - David S. Miller <davem@redhat.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
-#include <asm/uaccess.h> /* for copy_from_user */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/capability.h>
#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
-#include <net/datalink.h>
-#include <linux/mm.h>
-#include <linux/in.h>
+#include <linux/slab.h>
#include <linux/init.h>
-#include <net/p8022.h>
+#include <linux/rculist.h>
#include <net/arp.h>
#include <linux/rtnetlink.h>
#include <linux/notifier.h>
+#include <net/rtnetlink.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <linux/uaccess.h>
#include <linux/if_vlan.h>
#include "vlan.h"
@@ -40,563 +39,385 @@
/* Global VLAN variables */
-/* Our listing of VLAN group(s) */
-static struct hlist_head vlan_group_hash[VLAN_GRP_HASH_SIZE];
-#define vlan_grp_hashfn(IDX) ((((IDX) >> VLAN_GRP_HASH_SHIFT) ^ (IDX)) & VLAN_GRP_HASH_MASK)
-
-static char vlan_fullname[] = "802.1Q VLAN Support";
-static char vlan_version[] = DRV_VERSION;
-static char vlan_copyright[] = "Ben Greear <greearb@candelatech.com>";
-static char vlan_buggyright[] = "David S. Miller <davem@redhat.com>";
-
-static int vlan_device_event(struct notifier_block *, unsigned long, void *);
-static int vlan_ioctl_handler(void __user *);
-static int unregister_vlan_dev(struct net_device *, unsigned short );
+unsigned int vlan_net_id __read_mostly;
-static struct notifier_block vlan_notifier_block = {
- .notifier_call = vlan_device_event,
-};
-
-/* These may be changed at run-time through IOCTLs */
-
-/* Determines interface naming scheme. */
-unsigned short vlan_name_type = VLAN_NAME_TYPE_RAW_PLUS_VID_NO_PAD;
-
-static struct packet_type vlan_packet_type = {
- .type = __constant_htons(ETH_P_8021Q),
- .func = vlan_skb_recv, /* VLAN receive method */
-};
+const char vlan_fullname[] = "802.1Q VLAN Support";
+const char vlan_version[] = DRV_VERSION;
/* End of global variables definitions. */
-/*
- * Function vlan_proto_init (pro)
- *
- * Initialize VLAN protocol layer,
- *
- */
-static int __init vlan_proto_init(void)
+static int vlan_group_prealloc_vid(struct vlan_group *vg,
+ __be16 vlan_proto, u16 vlan_id)
{
- int err;
+ struct net_device **array;
+ unsigned int vidx;
+ unsigned int size;
+ int pidx;
- printk(VLAN_INF "%s v%s %s\n",
- vlan_fullname, vlan_version, vlan_copyright);
- printk(VLAN_INF "All bugs added by %s\n",
- vlan_buggyright);
-
- /* proc file system initialization */
- err = vlan_proc_init();
- if (err < 0) {
- printk(KERN_ERR
- "%s %s: can't create entry in proc filesystem!\n",
- __FUNCTION__, VLAN_NAME);
- return err;
- }
+ ASSERT_RTNL();
- dev_add_pack(&vlan_packet_type);
+ pidx = vlan_proto_idx(vlan_proto);
+ if (pidx < 0)
+ return -EINVAL;
- /* Register us to receive netdevice events */
- err = register_netdevice_notifier(&vlan_notifier_block);
- if (err < 0) {
- dev_remove_pack(&vlan_packet_type);
- vlan_proc_cleanup();
- return err;
- }
+ vidx = vlan_id / VLAN_GROUP_ARRAY_PART_LEN;
+ array = vg->vlan_devices_arrays[pidx][vidx];
+ if (array != NULL)
+ return 0;
- vlan_ioctl_set(vlan_ioctl_handler);
+ size = sizeof(struct net_device *) * VLAN_GROUP_ARRAY_PART_LEN;
+ array = kzalloc(size, GFP_KERNEL_ACCOUNT);
+ if (array == NULL)
+ return -ENOBUFS;
+
+ /* paired with smp_rmb() in __vlan_group_get_device() */
+ smp_wmb();
+ vg->vlan_devices_arrays[pidx][vidx] = array;
return 0;
}
-/* Cleanup all vlan devices
- * Note: devices that have been registered that but not
- * brought up will exist but have no module ref count.
- */
-static void __exit vlan_cleanup_devices(void)
+static void vlan_stacked_transfer_operstate(const struct net_device *rootdev,
+ struct net_device *dev,
+ struct vlan_dev_priv *vlan)
{
- struct net_device *dev, *nxt;
-
- rtnl_lock();
- for (dev = dev_base; dev; dev = nxt) {
- nxt = dev->next;
- if (dev->priv_flags & IFF_802_1Q_VLAN) {
- unregister_vlan_dev(VLAN_DEV_INFO(dev)->real_dev,
- VLAN_DEV_INFO(dev)->vlan_id);
-
- unregister_netdevice(dev);
- }
- }
- rtnl_unlock();
+ if (!(vlan->flags & VLAN_FLAG_BRIDGE_BINDING))
+ netif_stacked_transfer_operstate(rootdev, dev);
}
-/*
- * Module 'remove' entry point.
- * o delete /proc/net/router directory and static entries.
- */
-static void __exit vlan_cleanup_module(void)
+void unregister_vlan_dev(struct net_device *dev, struct list_head *head)
{
- int i;
+ struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
+ struct net_device *real_dev = vlan->real_dev;
+ struct vlan_info *vlan_info;
+ struct vlan_group *grp;
+ u16 vlan_id = vlan->vlan_id;
- vlan_ioctl_set(NULL);
+ ASSERT_RTNL();
- /* Un-register us from receiving netdevice events */
- unregister_netdevice_notifier(&vlan_notifier_block);
+ vlan_info = rtnl_dereference(real_dev->vlan_info);
+ BUG_ON(!vlan_info);
- dev_remove_pack(&vlan_packet_type);
- vlan_cleanup_devices();
+ grp = &vlan_info->grp;
- /* This table must be empty if there are no module
- * references left.
- */
- for (i = 0; i < VLAN_GRP_HASH_SIZE; i++) {
- BUG_ON(!hlist_empty(&vlan_group_hash[i]));
- }
- vlan_proc_cleanup();
+ grp->nr_vlan_devs--;
- synchronize_net();
-}
+ if (vlan->flags & VLAN_FLAG_MVRP)
+ vlan_mvrp_request_leave(dev);
+ if (vlan->flags & VLAN_FLAG_GVRP)
+ vlan_gvrp_request_leave(dev);
-module_init(vlan_proto_init);
-module_exit(vlan_cleanup_module);
+ vlan_group_set_device(grp, vlan->vlan_proto, vlan_id, NULL);
-/* Must be invoked with RCU read lock (no preempt) */
-static struct vlan_group *__vlan_find_group(int real_dev_ifindex)
-{
- struct vlan_group *grp;
- struct hlist_node *n;
- int hash = vlan_grp_hashfn(real_dev_ifindex);
+ netdev_upper_dev_unlink(real_dev, dev);
+ /* Because unregister_netdevice_queue() makes sure at least one rcu
+ * grace period is respected before device freeing,
+ * we dont need to call synchronize_net() here.
+ */
+ unregister_netdevice_queue(dev, head);
- hlist_for_each_entry_rcu(grp, n, &vlan_group_hash[hash], hlist) {
- if (grp->real_dev_ifindex == real_dev_ifindex)
- return grp;
+ if (grp->nr_vlan_devs == 0) {
+ vlan_mvrp_uninit_applicant(real_dev);
+ vlan_gvrp_uninit_applicant(real_dev);
}
- return NULL;
+ vlan_vid_del(real_dev, vlan->vlan_proto, vlan_id);
}
-/* Find the protocol handler. Assumes VID < VLAN_VID_MASK.
- *
- * Must be invoked with RCU read lock (no preempt)
- */
-struct net_device *__find_vlan_dev(struct net_device *real_dev,
- unsigned short VID)
+int vlan_check_real_dev(struct net_device *real_dev,
+ __be16 protocol, u16 vlan_id,
+ struct netlink_ext_ack *extack)
{
- struct vlan_group *grp = __vlan_find_group(real_dev->ifindex);
+ const char *name = real_dev->name;
- if (grp)
- return grp->vlan_devices[VID];
+ if (real_dev->features & NETIF_F_VLAN_CHALLENGED ||
+ real_dev->type != ARPHRD_ETHER) {
+ pr_info("VLANs not supported on %s\n", name);
+ NL_SET_ERR_MSG_MOD(extack, "VLANs not supported on device");
+ return -EOPNOTSUPP;
+ }
- return NULL;
-}
+ if (vlan_find_dev(real_dev, protocol, vlan_id) != NULL) {
+ NL_SET_ERR_MSG_MOD(extack, "VLAN device already exists");
+ return -EEXIST;
+ }
-static void vlan_rcu_free(struct rcu_head *rcu)
-{
- kfree(container_of(rcu, struct vlan_group, rcu));
+ return 0;
}
-
-/* This returns 0 if everything went fine.
- * It will return 1 if the group was killed as a result.
- * A negative return indicates failure.
- *
- * The RTNL lock must be held.
- */
-static int unregister_vlan_dev(struct net_device *real_dev,
- unsigned short vlan_id)
+int register_vlan_dev(struct net_device *dev, struct netlink_ext_ack *extack)
{
- struct net_device *dev = NULL;
- int real_dev_ifindex = real_dev->ifindex;
+ struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
+ struct net_device *real_dev = vlan->real_dev;
+ u16 vlan_id = vlan->vlan_id;
+ struct vlan_info *vlan_info;
struct vlan_group *grp;
- int i, ret;
-
-#ifdef VLAN_DEBUG
- printk(VLAN_DBG "%s: VID: %i\n", __FUNCTION__, vlan_id);
-#endif
+ int err;
- /* sanity check */
- if (vlan_id >= VLAN_VID_MASK)
- return -EINVAL;
+ err = vlan_vid_add(real_dev, vlan->vlan_proto, vlan_id);
+ if (err)
+ return err;
- ASSERT_RTNL();
- grp = __vlan_find_group(real_dev_ifindex);
-
- ret = 0;
-
- if (grp) {
- dev = grp->vlan_devices[vlan_id];
- if (dev) {
- /* Remove proc entry */
- vlan_proc_rem_dev(dev);
-
- /* Take it out of our own structures, but be sure to
- * interlock with HW accelerating devices or SW vlan
- * input packet processing.
- */
- if (real_dev->features &
- (NETIF_F_HW_VLAN_RX | NETIF_F_HW_VLAN_FILTER)) {
- real_dev->vlan_rx_kill_vid(real_dev, vlan_id);
- }
-
- grp->vlan_devices[vlan_id] = NULL;
- synchronize_net();
-
-
- /* Caller unregisters (and if necessary, puts)
- * VLAN device, but we get rid of the reference to
- * real_dev here.
- */
- dev_put(real_dev);
-
- /* If the group is now empty, kill off the
- * group.
- */
- for (i = 0; i < VLAN_VID_MASK; i++)
- if (grp->vlan_devices[i])
- break;
-
- if (i == VLAN_VID_MASK) {
- if (real_dev->features & NETIF_F_HW_VLAN_RX)
- real_dev->vlan_rx_register(real_dev, NULL);
-
- hlist_del_rcu(&grp->hlist);
-
- /* Free the group, after all cpu's are done. */
- call_rcu(&grp->rcu, vlan_rcu_free);
-
- grp = NULL;
- ret = 1;
- }
- }
+ vlan_info = rtnl_dereference(real_dev->vlan_info);
+ /* vlan_info should be there now. vlan_vid_add took care of it */
+ BUG_ON(!vlan_info);
+
+ grp = &vlan_info->grp;
+ if (grp->nr_vlan_devs == 0) {
+ err = vlan_gvrp_init_applicant(real_dev);
+ if (err < 0)
+ goto out_vid_del;
+ err = vlan_mvrp_init_applicant(real_dev);
+ if (err < 0)
+ goto out_uninit_gvrp;
}
- return ret;
-}
-
-static int unregister_vlan_device(const char *vlan_IF_name)
-{
- struct net_device *dev = NULL;
- int ret;
-
+ err = vlan_group_prealloc_vid(grp, vlan->vlan_proto, vlan_id);
+ if (err < 0)
+ goto out_uninit_mvrp;
- dev = dev_get_by_name(vlan_IF_name);
- ret = -EINVAL;
- if (dev) {
- if (dev->priv_flags & IFF_802_1Q_VLAN) {
- rtnl_lock();
+ err = register_netdevice(dev);
+ if (err < 0)
+ goto out_uninit_mvrp;
- ret = unregister_vlan_dev(VLAN_DEV_INFO(dev)->real_dev,
- VLAN_DEV_INFO(dev)->vlan_id);
+ err = netdev_upper_dev_link(real_dev, dev, extack);
+ if (err)
+ goto out_unregister_netdev;
- dev_put(dev);
- unregister_netdevice(dev);
+ vlan_stacked_transfer_operstate(real_dev, dev, vlan);
+ linkwatch_fire_event(dev); /* _MUST_ call rfc2863_policy() */
- rtnl_unlock();
-
- if (ret == 1)
- ret = 0;
- } else {
- printk(VLAN_ERR
- "%s: ERROR: Tried to remove a non-vlan device "
- "with VLAN code, name: %s priv_flags: %hX\n",
- __FUNCTION__, dev->name, dev->priv_flags);
- dev_put(dev);
- ret = -EPERM;
- }
- } else {
-#ifdef VLAN_DEBUG
- printk(VLAN_DBG "%s: WARNING: Could not find dev.\n", __FUNCTION__);
-#endif
- ret = -EINVAL;
- }
-
- return ret;
-}
-
-static void vlan_setup(struct net_device *new_dev)
-{
- SET_MODULE_OWNER(new_dev);
-
- /* new_dev->ifindex = 0; it will be set when added to
- * the global list.
- * iflink is set as well.
+ /* So, got the sucker initialized, now lets place
+ * it into our local structure.
*/
- new_dev->get_stats = vlan_dev_get_stats;
+ vlan_group_set_device(grp, vlan->vlan_proto, vlan_id, dev);
+ grp->nr_vlan_devs++;
- /* Make this thing known as a VLAN device */
- new_dev->priv_flags |= IFF_802_1Q_VLAN;
-
- /* Set us up to have no queue, as the underlying Hardware device
- * can do all the queueing we could want.
- */
- new_dev->tx_queue_len = 0;
-
- /* set up method calls */
- new_dev->change_mtu = vlan_dev_change_mtu;
- new_dev->open = vlan_dev_open;
- new_dev->stop = vlan_dev_stop;
- new_dev->set_mac_address = vlan_dev_set_mac_address;
- new_dev->set_multicast_list = vlan_dev_set_multicast_list;
- new_dev->destructor = free_netdev;
- new_dev->do_ioctl = vlan_dev_ioctl;
-}
+ netdev_update_features(dev);
-static void vlan_transfer_operstate(const struct net_device *dev, struct net_device *vlandev)
-{
- /* Have to respect userspace enforced dormant state
- * of real device, also must allow supplicant running
- * on VLAN device
- */
- if (dev->operstate == IF_OPER_DORMANT)
- netif_dormant_on(vlandev);
- else
- netif_dormant_off(vlandev);
-
- if (netif_carrier_ok(dev)) {
- if (!netif_carrier_ok(vlandev))
- netif_carrier_on(vlandev);
- } else {
- if (netif_carrier_ok(vlandev))
- netif_carrier_off(vlandev);
- }
-}
-
-/*
- * vlan network devices have devices nesting below it, and are a special
- * "super class" of normal network devices; split their locks off into a
- * separate class since they always nest.
- */
-static struct lock_class_key vlan_netdev_xmit_lock_key;
+ return 0;
+out_unregister_netdev:
+ unregister_netdevice(dev);
+out_uninit_mvrp:
+ if (grp->nr_vlan_devs == 0)
+ vlan_mvrp_uninit_applicant(real_dev);
+out_uninit_gvrp:
+ if (grp->nr_vlan_devs == 0)
+ vlan_gvrp_uninit_applicant(real_dev);
+out_vid_del:
+ vlan_vid_del(real_dev, vlan->vlan_proto, vlan_id);
+ return err;
+}
/* Attach a VLAN device to a mac address (ie Ethernet Card).
- * Returns the device that was created, or NULL if there was
- * an error of some kind.
+ * Returns 0 if the device was created or a negative error code otherwise.
*/
-static struct net_device *register_vlan_device(const char *eth_IF_name,
- unsigned short VLAN_ID)
+static int register_vlan_device(struct net_device *real_dev, u16 vlan_id)
{
- struct vlan_group *grp;
struct net_device *new_dev;
- struct net_device *real_dev; /* the ethernet device */
+ struct vlan_dev_priv *vlan;
+ struct net *net = dev_net(real_dev);
+ struct vlan_net *vn = net_generic(net, vlan_net_id);
char name[IFNAMSIZ];
+ int err;
-#ifdef VLAN_DEBUG
- printk(VLAN_DBG "%s: if_name -:%s:- vid: %i\n",
- __FUNCTION__, eth_IF_name, VLAN_ID);
-#endif
-
- if (VLAN_ID >= VLAN_VID_MASK)
- goto out_ret_null;
-
- /* find the device relating to eth_IF_name. */
- real_dev = dev_get_by_name(eth_IF_name);
- if (!real_dev)
- goto out_ret_null;
-
- if (real_dev->features & NETIF_F_VLAN_CHALLENGED) {
- printk(VLAN_DBG "%s: VLANs not supported on %s.\n",
- __FUNCTION__, real_dev->name);
- goto out_put_dev;
- }
-
- if ((real_dev->features & NETIF_F_HW_VLAN_RX) &&
- (real_dev->vlan_rx_register == NULL ||
- real_dev->vlan_rx_kill_vid == NULL)) {
- printk(VLAN_DBG "%s: Device %s has buggy VLAN hw accel.\n",
- __FUNCTION__, real_dev->name);
- goto out_put_dev;
- }
-
- if ((real_dev->features & NETIF_F_HW_VLAN_FILTER) &&
- (real_dev->vlan_rx_add_vid == NULL ||
- real_dev->vlan_rx_kill_vid == NULL)) {
- printk(VLAN_DBG "%s: Device %s has buggy VLAN hw accel.\n",
- __FUNCTION__, real_dev->name);
- goto out_put_dev;
- }
-
- /* From this point on, all the data structures must remain
- * consistent.
- */
- rtnl_lock();
-
- /* The real device must be up and operating in order to
- * assosciate a VLAN device with it.
- */
- if (!(real_dev->flags & IFF_UP))
- goto out_unlock;
+ if (vlan_id >= VLAN_VID_MASK)
+ return -ERANGE;
- if (__find_vlan_dev(real_dev, VLAN_ID) != NULL) {
- /* was already registered. */
- printk(VLAN_DBG "%s: ALREADY had VLAN registered\n", __FUNCTION__);
- goto out_unlock;
- }
+ err = vlan_check_real_dev(real_dev, htons(ETH_P_8021Q), vlan_id,
+ NULL);
+ if (err < 0)
+ return err;
/* Gotta set up the fields for the device. */
-#ifdef VLAN_DEBUG
- printk(VLAN_DBG "About to allocate name, vlan_name_type: %i\n",
- vlan_name_type);
-#endif
- switch (vlan_name_type) {
+ switch (vn->name_type) {
case VLAN_NAME_TYPE_RAW_PLUS_VID:
/* name will look like: eth1.0005 */
- snprintf(name, IFNAMSIZ, "%s.%.4i", real_dev->name, VLAN_ID);
+ snprintf(name, IFNAMSIZ, "%s.%.4i", real_dev->name, vlan_id);
break;
case VLAN_NAME_TYPE_PLUS_VID_NO_PAD:
/* Put our vlan.VID in the name.
* Name will look like: vlan5
*/
- snprintf(name, IFNAMSIZ, "vlan%i", VLAN_ID);
+ snprintf(name, IFNAMSIZ, "vlan%i", vlan_id);
break;
case VLAN_NAME_TYPE_RAW_PLUS_VID_NO_PAD:
/* Put our vlan.VID in the name.
* Name will look like: eth0.5
*/
- snprintf(name, IFNAMSIZ, "%s.%i", real_dev->name, VLAN_ID);
+ snprintf(name, IFNAMSIZ, "%s.%i", real_dev->name, vlan_id);
break;
case VLAN_NAME_TYPE_PLUS_VID:
/* Put our vlan.VID in the name.
* Name will look like: vlan0005
*/
default:
- snprintf(name, IFNAMSIZ, "vlan%.4i", VLAN_ID);
- };
-
- new_dev = alloc_netdev(sizeof(struct vlan_dev_info), name,
- vlan_setup);
-
- if (new_dev == NULL)
- goto out_unlock;
+ snprintf(name, IFNAMSIZ, "vlan%.4i", vlan_id);
+ }
-#ifdef VLAN_DEBUG
- printk(VLAN_DBG "Allocated new name -:%s:-\n", new_dev->name);
-#endif
- /* IFF_BROADCAST|IFF_MULTICAST; ??? */
- new_dev->flags = real_dev->flags;
- new_dev->flags &= ~IFF_UP;
+ new_dev = alloc_netdev(sizeof(struct vlan_dev_priv), name,
+ NET_NAME_UNKNOWN, vlan_setup);
- new_dev->state = (real_dev->state & ((1<<__LINK_STATE_NOCARRIER) |
- (1<<__LINK_STATE_DORMANT))) |
- (1<<__LINK_STATE_PRESENT);
+ if (new_dev == NULL)
+ return -ENOBUFS;
+ dev_net_set(new_dev, net);
/* need 4 bytes for extra VLAN header info,
* hope the underlying device can handle it.
*/
new_dev->mtu = real_dev->mtu;
- /* TODO: maybe just assign it to be ETHERNET? */
- new_dev->type = real_dev->type;
+ vlan = vlan_dev_priv(new_dev);
+ vlan->vlan_proto = htons(ETH_P_8021Q);
+ vlan->vlan_id = vlan_id;
+ vlan->real_dev = real_dev;
+ vlan->dent = NULL;
+ vlan->flags = VLAN_FLAG_REORDER_HDR;
- new_dev->hard_header_len = real_dev->hard_header_len;
- if (!(real_dev->features & NETIF_F_HW_VLAN_TX)) {
- /* Regular ethernet + 4 bytes (18 total). */
- new_dev->hard_header_len += VLAN_HLEN;
- }
+ new_dev->rtnl_link_ops = &vlan_link_ops;
+ err = register_vlan_dev(new_dev, NULL);
+ if (err < 0)
+ goto out_free_newdev;
- VLAN_MEM_DBG("new_dev->priv malloc, addr: %p size: %i\n",
- new_dev->priv,
- sizeof(struct vlan_dev_info));
-
- memcpy(new_dev->broadcast, real_dev->broadcast, real_dev->addr_len);
- memcpy(new_dev->dev_addr, real_dev->dev_addr, real_dev->addr_len);
- new_dev->addr_len = real_dev->addr_len;
-
- if (real_dev->features & NETIF_F_HW_VLAN_TX) {
- new_dev->hard_header = real_dev->hard_header;
- new_dev->hard_start_xmit = vlan_dev_hwaccel_hard_start_xmit;
- new_dev->rebuild_header = real_dev->rebuild_header;
- } else {
- new_dev->hard_header = vlan_dev_hard_header;
- new_dev->hard_start_xmit = vlan_dev_hard_start_xmit;
- new_dev->rebuild_header = vlan_dev_rebuild_header;
- }
- new_dev->hard_header_parse = real_dev->hard_header_parse;
+ return 0;
- VLAN_DEV_INFO(new_dev)->vlan_id = VLAN_ID; /* 1 through VLAN_VID_MASK */
- VLAN_DEV_INFO(new_dev)->real_dev = real_dev;
- VLAN_DEV_INFO(new_dev)->dent = NULL;
- VLAN_DEV_INFO(new_dev)->flags = 1;
+out_free_newdev:
+ free_netdev(new_dev);
+ return err;
+}
-#ifdef VLAN_DEBUG
- printk(VLAN_DBG "About to go find the group for idx: %i\n",
- real_dev->ifindex);
-#endif
-
- if (register_netdevice(new_dev))
- goto out_free_newdev;
+static void vlan_sync_address(struct net_device *dev,
+ struct net_device *vlandev)
+{
+ struct vlan_dev_priv *vlan = vlan_dev_priv(vlandev);
- lockdep_set_class(&new_dev->_xmit_lock, &vlan_netdev_xmit_lock_key);
+ /* May be called without an actual change */
+ if (ether_addr_equal(vlan->real_dev_addr, dev->dev_addr))
+ return;
- new_dev->iflink = real_dev->ifindex;
- vlan_transfer_operstate(real_dev, new_dev);
- linkwatch_fire_event(new_dev); /* _MUST_ call rfc2863_policy() */
+ /* vlan continues to inherit address of lower device */
+ if (vlan_dev_inherit_address(vlandev, dev))
+ goto out;
- /* So, got the sucker initialized, now lets place
- * it into our local structure.
- */
- grp = __vlan_find_group(real_dev->ifindex);
+ /* vlan address was different from the old address and is equal to
+ * the new address */
+ if (!ether_addr_equal(vlandev->dev_addr, vlan->real_dev_addr) &&
+ ether_addr_equal(vlandev->dev_addr, dev->dev_addr))
+ dev_uc_del(dev, vlandev->dev_addr);
- /* Note, we are running under the RTNL semaphore
- * so it cannot "appear" on us.
- */
- if (!grp) { /* need to add a new group */
- grp = kzalloc(sizeof(struct vlan_group), GFP_KERNEL);
- if (!grp)
- goto out_free_unregister;
-
- /* printk(KERN_ALERT "VLAN REGISTER: Allocated new group.\n"); */
- grp->real_dev_ifindex = real_dev->ifindex;
-
- hlist_add_head_rcu(&grp->hlist,
- &vlan_group_hash[vlan_grp_hashfn(real_dev->ifindex)]);
-
- if (real_dev->features & NETIF_F_HW_VLAN_RX)
- real_dev->vlan_rx_register(real_dev, grp);
- }
-
- grp->vlan_devices[VLAN_ID] = new_dev;
+ /* vlan address was equal to the old address and is different from
+ * the new address */
+ if (ether_addr_equal(vlandev->dev_addr, vlan->real_dev_addr) &&
+ !ether_addr_equal(vlandev->dev_addr, dev->dev_addr))
+ dev_uc_add(dev, vlandev->dev_addr);
- if (vlan_proc_add_dev(new_dev)<0)/* create it's proc entry */
- printk(KERN_WARNING "VLAN: failed to add proc entry for %s\n",
- new_dev->name);
+out:
+ ether_addr_copy(vlan->real_dev_addr, dev->dev_addr);
+}
- if (real_dev->features & NETIF_F_HW_VLAN_FILTER)
- real_dev->vlan_rx_add_vid(real_dev, VLAN_ID);
+static void vlan_transfer_features(struct net_device *dev,
+ struct net_device *vlandev)
+{
+ struct vlan_dev_priv *vlan = vlan_dev_priv(vlandev);
- rtnl_unlock();
+ netif_inherit_tso_max(vlandev, dev);
+ if (vlan_hw_offload_capable(dev->features, vlan->vlan_proto))
+ vlandev->hard_header_len = dev->hard_header_len;
+ else
+ vlandev->hard_header_len = dev->hard_header_len + VLAN_HLEN;
-#ifdef VLAN_DEBUG
- printk(VLAN_DBG "Allocated new device successfully, returning.\n");
+#if IS_ENABLED(CONFIG_FCOE)
+ vlandev->fcoe_ddp_xid = dev->fcoe_ddp_xid;
#endif
- return new_dev;
-out_free_unregister:
- unregister_netdev(new_dev);
- goto out_unlock;
+ vlandev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
+ vlandev->priv_flags |= (vlan->real_dev->priv_flags & IFF_XMIT_DST_RELEASE);
+ vlandev->hw_enc_features = vlan_tnl_features(vlan->real_dev);
-out_free_newdev:
- free_netdev(new_dev);
+ netdev_update_features(vlandev);
+}
-out_unlock:
- rtnl_unlock();
+static int __vlan_device_event(struct net_device *dev, unsigned long event)
+{
+ int err = 0;
-out_put_dev:
- dev_put(real_dev);
+ switch (event) {
+ case NETDEV_CHANGENAME:
+ vlan_proc_rem_dev(dev);
+ err = vlan_proc_add_dev(dev);
+ break;
+ case NETDEV_REGISTER:
+ err = vlan_proc_add_dev(dev);
+ break;
+ case NETDEV_UNREGISTER:
+ vlan_proc_rem_dev(dev);
+ break;
+ }
+
+ return err;
+}
+
+static void vlan_vid0_add(struct net_device *dev)
+{
+ struct vlan_info *vlan_info;
+ int err;
+
+ if (!(dev->features & NETIF_F_HW_VLAN_CTAG_FILTER))
+ return;
+
+ pr_info("adding VLAN 0 to HW filter on device %s\n", dev->name);
+
+ err = vlan_vid_add(dev, htons(ETH_P_8021Q), 0);
+ if (err)
+ return;
+
+ vlan_info = rtnl_dereference(dev->vlan_info);
+ vlan_info->auto_vid0 = true;
+}
+
+static void vlan_vid0_del(struct net_device *dev)
+{
+ struct vlan_info *vlan_info = rtnl_dereference(dev->vlan_info);
+
+ if (!vlan_info || !vlan_info->auto_vid0)
+ return;
-out_ret_null:
- return NULL;
+ vlan_info->auto_vid0 = false;
+ vlan_vid_del(dev, htons(ETH_P_8021Q), 0);
}
-static int vlan_device_event(struct notifier_block *unused, unsigned long event, void *ptr)
+static int vlan_device_event(struct notifier_block *unused, unsigned long event,
+ void *ptr)
{
- struct net_device *dev = ptr;
- struct vlan_group *grp = __vlan_find_group(dev->ifindex);
+ struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(ptr);
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct vlan_group *grp;
+ struct vlan_info *vlan_info;
int i, flgs;
struct net_device *vlandev;
+ struct vlan_dev_priv *vlan;
+ bool last = false;
+ LIST_HEAD(list);
+ int err;
+
+ if (is_vlan_dev(dev)) {
+ int err = __vlan_device_event(dev, event);
- if (!grp)
+ if (err)
+ return notifier_from_errno(err);
+ }
+
+ if (event == NETDEV_UP)
+ vlan_vid0_add(dev);
+ else if (event == NETDEV_DOWN)
+ vlan_vid0_del(dev);
+
+ vlan_info = rtnl_dereference(dev->vlan_info);
+ if (!vlan_info)
goto out;
+ grp = &vlan_info->grp;
/* It is OK that we do not hold the group lock right now,
* as we run under the RTNL lock.
@@ -605,123 +426,214 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
switch (event) {
case NETDEV_CHANGE:
/* Propagate real device state to vlan devices */
- for (i = 0; i < VLAN_GROUP_ARRAY_LEN; i++) {
- vlandev = grp->vlan_devices[i];
- if (!vlandev)
+ vlan_group_for_each_dev(grp, i, vlandev)
+ vlan_stacked_transfer_operstate(dev, vlandev,
+ vlan_dev_priv(vlandev));
+ break;
+
+ case NETDEV_CHANGEADDR:
+ /* Adjust unicast filters on underlying device */
+ vlan_group_for_each_dev(grp, i, vlandev) {
+ flgs = vlandev->flags;
+ if (!(flgs & IFF_UP))
continue;
- vlan_transfer_operstate(dev, vlandev);
+ vlan_sync_address(dev, vlandev);
}
break;
- case NETDEV_DOWN:
- /* Put all VLANs for this dev in the down state too. */
- for (i = 0; i < VLAN_GROUP_ARRAY_LEN; i++) {
- vlandev = grp->vlan_devices[i];
- if (!vlandev)
+ case NETDEV_CHANGEMTU:
+ vlan_group_for_each_dev(grp, i, vlandev) {
+ if (vlandev->mtu <= dev->mtu)
continue;
+ dev_set_mtu(vlandev, dev->mtu);
+ }
+ break;
+
+ case NETDEV_FEAT_CHANGE:
+ /* Propagate device features to underlying device */
+ vlan_group_for_each_dev(grp, i, vlandev)
+ vlan_transfer_features(dev, vlandev);
+ break;
+
+ case NETDEV_DOWN: {
+ struct net_device *tmp;
+ LIST_HEAD(close_list);
+
+ /* Put all VLANs for this dev in the down state too. */
+ vlan_group_for_each_dev(grp, i, vlandev) {
flgs = vlandev->flags;
if (!(flgs & IFF_UP))
continue;
- dev_change_flags(vlandev, flgs & ~IFF_UP);
+ vlan = vlan_dev_priv(vlandev);
+ if (!(vlan->flags & VLAN_FLAG_LOOSE_BINDING))
+ list_add(&vlandev->close_list, &close_list);
}
- break;
+ netif_close_many(&close_list, false);
+
+ list_for_each_entry_safe(vlandev, tmp, &close_list, close_list) {
+ vlan_stacked_transfer_operstate(dev, vlandev,
+ vlan_dev_priv(vlandev));
+ list_del_init(&vlandev->close_list);
+ }
+ list_del(&close_list);
+ break;
+ }
case NETDEV_UP:
/* Put all VLANs for this dev in the up state too. */
- for (i = 0; i < VLAN_GROUP_ARRAY_LEN; i++) {
- vlandev = grp->vlan_devices[i];
- if (!vlandev)
- continue;
-
- flgs = vlandev->flags;
+ vlan_group_for_each_dev(grp, i, vlandev) {
+ flgs = netif_get_flags(vlandev);
if (flgs & IFF_UP)
continue;
- dev_change_flags(vlandev, flgs | IFF_UP);
+ vlan = vlan_dev_priv(vlandev);
+ if (!(vlan->flags & VLAN_FLAG_LOOSE_BINDING))
+ dev_change_flags(vlandev, flgs | IFF_UP,
+ extack);
+ vlan_stacked_transfer_operstate(dev, vlandev, vlan);
}
break;
-
+
case NETDEV_UNREGISTER:
- /* Delete all VLANs for this dev. */
- for (i = 0; i < VLAN_GROUP_ARRAY_LEN; i++) {
- int ret;
+ /* twiddle thumbs on netns device moves */
+ if (dev->reg_state != NETREG_UNREGISTERING)
+ break;
+
+ vlan_group_for_each_dev(grp, i, vlandev) {
+ /* removal of last vid destroys vlan_info, abort
+ * afterwards */
+ if (vlan_info->nr_vids == 1)
+ last = true;
+
+ unregister_vlan_dev(vlandev, &list);
+ if (last)
+ break;
+ }
+ unregister_netdevice_many(&list);
+ break;
- vlandev = grp->vlan_devices[i];
- if (!vlandev)
- continue;
+ case NETDEV_PRE_TYPE_CHANGE:
+ /* Forbid underlaying device to change its type. */
+ if (vlan_uses_dev(dev))
+ return NOTIFY_BAD;
+ break;
- ret = unregister_vlan_dev(dev,
- VLAN_DEV_INFO(vlandev)->vlan_id);
+ case NETDEV_NOTIFY_PEERS:
+ case NETDEV_BONDING_FAILOVER:
+ case NETDEV_RESEND_IGMP:
+ /* Propagate to vlan devices */
+ vlan_group_for_each_dev(grp, i, vlandev)
+ call_netdevice_notifiers(event, vlandev);
+ break;
- unregister_netdevice(vlandev);
+ case NETDEV_CVLAN_FILTER_PUSH_INFO:
+ err = vlan_filter_push_vids(vlan_info, htons(ETH_P_8021Q));
+ if (err)
+ return notifier_from_errno(err);
+ break;
- /* Group was destroyed? */
- if (ret == 1)
- break;
- }
+ case NETDEV_CVLAN_FILTER_DROP_INFO:
+ vlan_filter_drop_vids(vlan_info, htons(ETH_P_8021Q));
break;
- };
+
+ case NETDEV_SVLAN_FILTER_PUSH_INFO:
+ err = vlan_filter_push_vids(vlan_info, htons(ETH_P_8021AD));
+ if (err)
+ return notifier_from_errno(err);
+ break;
+
+ case NETDEV_SVLAN_FILTER_DROP_INFO:
+ vlan_filter_drop_vids(vlan_info, htons(ETH_P_8021AD));
+ break;
+ }
out:
return NOTIFY_DONE;
}
+static struct notifier_block vlan_notifier_block __read_mostly = {
+ .notifier_call = vlan_device_event,
+};
+
/*
* VLAN IOCTL handler.
* o execute requested action or pass command to the device driver
* arg is really a struct vlan_ioctl_args __user *.
*/
-static int vlan_ioctl_handler(void __user *arg)
+static int vlan_ioctl_handler(struct net *net, void __user *arg)
{
- int err = 0;
- unsigned short vid = 0;
+ int err;
struct vlan_ioctl_args args;
+ struct net_device *dev = NULL;
if (copy_from_user(&args, arg, sizeof(struct vlan_ioctl_args)))
return -EFAULT;
/* Null terminate this sucker, just in case. */
- args.device1[23] = 0;
- args.u.device2[23] = 0;
+ args.device1[sizeof(args.device1) - 1] = 0;
+ args.u.device2[sizeof(args.u.device2) - 1] = 0;
-#ifdef VLAN_DEBUG
- printk(VLAN_DBG "%s: args.cmd: %x\n", __FUNCTION__, args.cmd);
-#endif
+ rtnl_lock();
+
+ switch (args.cmd) {
+ case SET_VLAN_INGRESS_PRIORITY_CMD:
+ case SET_VLAN_EGRESS_PRIORITY_CMD:
+ case SET_VLAN_FLAG_CMD:
+ case ADD_VLAN_CMD:
+ case DEL_VLAN_CMD:
+ case GET_VLAN_REALDEV_NAME_CMD:
+ case GET_VLAN_VID_CMD:
+ err = -ENODEV;
+ dev = __dev_get_by_name(net, args.device1);
+ if (!dev)
+ goto out;
+
+ err = -EINVAL;
+ if (args.cmd != ADD_VLAN_CMD && !is_vlan_dev(dev))
+ goto out;
+ }
switch (args.cmd) {
case SET_VLAN_INGRESS_PRIORITY_CMD:
- if (!capable(CAP_NET_ADMIN))
- return -EPERM;
- err = vlan_dev_set_ingress_priority(args.device1,
- args.u.skb_priority,
- args.vlan_qos);
+ err = -EPERM;
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ break;
+ vlan_dev_set_ingress_priority(dev,
+ args.u.skb_priority,
+ args.vlan_qos);
+ err = 0;
break;
case SET_VLAN_EGRESS_PRIORITY_CMD:
- if (!capable(CAP_NET_ADMIN))
- return -EPERM;
- err = vlan_dev_set_egress_priority(args.device1,
+ err = -EPERM;
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ break;
+ err = vlan_dev_set_egress_priority(dev,
args.u.skb_priority,
args.vlan_qos);
break;
case SET_VLAN_FLAG_CMD:
- if (!capable(CAP_NET_ADMIN))
- return -EPERM;
- err = vlan_dev_set_vlan_flag(args.device1,
- args.u.flag,
- args.vlan_qos);
+ err = -EPERM;
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ break;
+ err = vlan_dev_change_flags(dev,
+ args.vlan_qos ? args.u.flag : 0,
+ args.u.flag);
break;
case SET_VLAN_NAME_TYPE_CMD:
- if (!capable(CAP_NET_ADMIN))
- return -EPERM;
- if ((args.u.name_type >= 0) &&
- (args.u.name_type < VLAN_NAME_TYPE_HIGHEST)) {
- vlan_name_type = args.u.name_type;
+ err = -EPERM;
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ break;
+ if (args.u.name_type < VLAN_NAME_TYPE_HIGHEST) {
+ struct vlan_net *vn;
+
+ vn = net_generic(net, vlan_net_id);
+ vn->name_type = args.u.name_type;
err = 0;
} else {
err = -EINVAL;
@@ -729,78 +641,130 @@ static int vlan_ioctl_handler(void __user *arg)
break;
case ADD_VLAN_CMD:
- if (!capable(CAP_NET_ADMIN))
- return -EPERM;
- /* we have been given the name of the Ethernet Device we want to
- * talk to: args.dev1 We also have the
- * VLAN ID: args.u.VID
- */
- if (register_vlan_device(args.device1, args.u.VID)) {
- err = 0;
- } else {
- err = -EINVAL;
- }
+ err = -EPERM;
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ break;
+ err = register_vlan_device(dev, args.u.VID);
break;
case DEL_VLAN_CMD:
- if (!capable(CAP_NET_ADMIN))
- return -EPERM;
- /* Here, the args.dev1 is the actual VLAN we want
- * to get rid of.
- */
- err = unregister_vlan_device(args.device1);
+ err = -EPERM;
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ break;
+ unregister_vlan_dev(dev, NULL);
+ err = 0;
break;
- case GET_VLAN_INGRESS_PRIORITY_CMD:
- /* TODO: Implement
- err = vlan_dev_get_ingress_priority(args);
- if (copy_to_user((void*)arg, &args,
- sizeof(struct vlan_ioctl_args))) {
- err = -EFAULT;
- }
- */
- err = -EINVAL;
- break;
- case GET_VLAN_EGRESS_PRIORITY_CMD:
- /* TODO: Implement
- err = vlan_dev_get_egress_priority(args.device1, &(args.args);
- if (copy_to_user((void*)arg, &args,
- sizeof(struct vlan_ioctl_args))) {
- err = -EFAULT;
- }
- */
- err = -EINVAL;
- break;
case GET_VLAN_REALDEV_NAME_CMD:
- err = vlan_dev_get_realdev_name(args.device1, args.u.device2);
- if (err)
- goto out;
+ err = 0;
+ vlan_dev_get_realdev_name(dev, args.u.device2,
+ sizeof(args.u.device2));
if (copy_to_user(arg, &args,
- sizeof(struct vlan_ioctl_args))) {
+ sizeof(struct vlan_ioctl_args)))
err = -EFAULT;
- }
break;
case GET_VLAN_VID_CMD:
- err = vlan_dev_get_vid(args.device1, &vid);
- if (err)
- goto out;
- args.u.VID = vid;
+ err = 0;
+ args.u.VID = vlan_dev_vlan_id(dev);
if (copy_to_user(arg, &args,
- sizeof(struct vlan_ioctl_args))) {
- err = -EFAULT;
- }
+ sizeof(struct vlan_ioctl_args)))
+ err = -EFAULT;
break;
default:
- /* pass on to underlying device instead?? */
- printk(VLAN_DBG "%s: Unknown VLAN CMD: %x \n",
- __FUNCTION__, args.cmd);
- return -EINVAL;
- };
+ err = -EOPNOTSUPP;
+ break;
+ }
out:
+ rtnl_unlock();
+ return err;
+}
+
+static int __net_init vlan_init_net(struct net *net)
+{
+ struct vlan_net *vn = net_generic(net, vlan_net_id);
+ int err;
+
+ vn->name_type = VLAN_NAME_TYPE_RAW_PLUS_VID_NO_PAD;
+
+ err = vlan_proc_init(net);
+
return err;
}
+static void __net_exit vlan_exit_net(struct net *net)
+{
+ vlan_proc_cleanup(net);
+}
+
+static struct pernet_operations vlan_net_ops = {
+ .init = vlan_init_net,
+ .exit = vlan_exit_net,
+ .id = &vlan_net_id,
+ .size = sizeof(struct vlan_net),
+};
+
+static int __init vlan_proto_init(void)
+{
+ int err;
+
+ pr_info("%s v%s\n", vlan_fullname, vlan_version);
+
+ err = register_pernet_subsys(&vlan_net_ops);
+ if (err < 0)
+ goto err0;
+
+ err = register_netdevice_notifier(&vlan_notifier_block);
+ if (err < 0)
+ goto err2;
+
+ err = vlan_gvrp_init();
+ if (err < 0)
+ goto err3;
+
+ err = vlan_mvrp_init();
+ if (err < 0)
+ goto err4;
+
+ err = vlan_netlink_init();
+ if (err < 0)
+ goto err5;
+
+ vlan_ioctl_set(vlan_ioctl_handler);
+ return 0;
+
+err5:
+ vlan_mvrp_uninit();
+err4:
+ vlan_gvrp_uninit();
+err3:
+ unregister_netdevice_notifier(&vlan_notifier_block);
+err2:
+ unregister_pernet_subsys(&vlan_net_ops);
+err0:
+ return err;
+}
+
+static void __exit vlan_cleanup_module(void)
+{
+ vlan_ioctl_set(NULL);
+
+ vlan_netlink_fini();
+
+ unregister_netdevice_notifier(&vlan_notifier_block);
+
+ unregister_pernet_subsys(&vlan_net_ops);
+ rcu_barrier(); /* Wait for completion of call_rcu()'s */
+
+ vlan_mvrp_uninit();
+ vlan_gvrp_uninit();
+}
+
+module_init(vlan_proto_init);
+module_exit(vlan_cleanup_module);
+
+MODULE_DESCRIPTION("802.1Q/802.1ad VLAN Protocol");
MODULE_LICENSE("GPL");
MODULE_VERSION(DRV_VERSION);
+MODULE_IMPORT_NS("NETDEV_INTERNAL");
diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h
index 9ae3a14dd016..c7ffe591d593 100644
--- a/net/8021q/vlan.h
+++ b/net/8021q/vlan.h
@@ -1,72 +1,207 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __BEN_VLAN_802_1Q_INC__
#define __BEN_VLAN_802_1Q_INC__
#include <linux/if_vlan.h>
+#include <linux/u64_stats_sync.h>
+#include <linux/list.h>
-/* Uncomment this if you want debug traces to be shown. */
-/* #define VLAN_DEBUG */
+/* if this changes, algorithm will have to be reworked because this
+ * depends on completely exhausting the VLAN identifier space. Thus
+ * it gives constant time look-up, but in many cases it wastes memory.
+ */
+#define VLAN_GROUP_ARRAY_SPLIT_PARTS 8
+#define VLAN_GROUP_ARRAY_PART_LEN (VLAN_N_VID/VLAN_GROUP_ARRAY_SPLIT_PARTS)
-#define VLAN_ERR KERN_ERR
-#define VLAN_INF KERN_INFO
-#define VLAN_DBG KERN_ALERT /* change these... to debug, having a hard time
- * changing the log level at run-time..for some reason.
- */
+enum vlan_protos {
+ VLAN_PROTO_8021Q = 0,
+ VLAN_PROTO_8021AD,
+ VLAN_PROTO_NUM,
+};
-/*
+struct vlan_group {
+ unsigned int nr_vlan_devs;
+ struct hlist_node hlist; /* linked list */
+ struct net_device **vlan_devices_arrays[VLAN_PROTO_NUM]
+ [VLAN_GROUP_ARRAY_SPLIT_PARTS];
+};
-These I use for memory debugging. I feared a leak at one time, but
-I never found it..and the problem seems to have dissappeared. Still,
-I'll bet they might prove useful again... --Ben
+struct vlan_info {
+ struct net_device *real_dev; /* The ethernet(like) device
+ * the vlan is attached to.
+ */
+ struct vlan_group grp;
+ struct list_head vid_list;
+ unsigned int nr_vids;
+ bool auto_vid0;
+ struct rcu_head rcu;
+};
+static inline int vlan_proto_idx(__be16 proto)
+{
+ switch (proto) {
+ case htons(ETH_P_8021Q):
+ return VLAN_PROTO_8021Q;
+ case htons(ETH_P_8021AD):
+ return VLAN_PROTO_8021AD;
+ default:
+ WARN(1, "invalid VLAN protocol: 0x%04x\n", ntohs(proto));
+ return -EINVAL;
+ }
+}
-#define VLAN_MEM_DBG(x, y, z) printk(VLAN_DBG "%s: " x, __FUNCTION__, y, z);
-#define VLAN_FMEM_DBG(x, y) printk(VLAN_DBG "%s: " x, __FUNCTION__, y);
-*/
+static inline struct net_device *__vlan_group_get_device(struct vlan_group *vg,
+ unsigned int pidx,
+ u16 vlan_id)
+{
+ struct net_device **array;
-/* This way they don't do anything! */
-#define VLAN_MEM_DBG(x, y, z)
-#define VLAN_FMEM_DBG(x, y)
+ array = vg->vlan_devices_arrays[pidx]
+ [vlan_id / VLAN_GROUP_ARRAY_PART_LEN];
+ /* paired with smp_wmb() in vlan_group_prealloc_vid() */
+ smp_rmb();
-extern unsigned short vlan_name_type;
+ return array ? array[vlan_id % VLAN_GROUP_ARRAY_PART_LEN] : NULL;
+}
-#define VLAN_GRP_HASH_SHIFT 5
-#define VLAN_GRP_HASH_SIZE (1 << VLAN_GRP_HASH_SHIFT)
-#define VLAN_GRP_HASH_MASK (VLAN_GRP_HASH_SIZE - 1)
+static inline struct net_device *vlan_group_get_device(struct vlan_group *vg,
+ __be16 vlan_proto,
+ u16 vlan_id)
+{
+ int pidx = vlan_proto_idx(vlan_proto);
-/* Find a VLAN device by the MAC address of its Ethernet device, and
- * it's VLAN ID. The default configuration is to have VLAN's scope
- * to be box-wide, so the MAC will be ignored. The mac will only be
- * looked at if we are configured to have a separate set of VLANs per
- * each MAC addressable interface. Note that this latter option does
- * NOT follow the spec for VLANs, but may be useful for doing very
- * large quantities of VLAN MUX/DEMUX onto FrameRelay or ATM PVCs.
- *
- * Must be invoked with rcu_read_lock (ie preempt disabled)
- * or with RTNL.
- */
-struct net_device *__find_vlan_dev(struct net_device* real_dev,
- unsigned short VID); /* vlan.c */
+ if (pidx < 0)
+ return NULL;
+
+ return __vlan_group_get_device(vg, pidx, vlan_id);
+}
+
+static inline void vlan_group_set_device(struct vlan_group *vg,
+ __be16 vlan_proto, u16 vlan_id,
+ struct net_device *dev)
+{
+ int pidx = vlan_proto_idx(vlan_proto);
+ struct net_device **array;
+
+ if (!vg || pidx < 0)
+ return;
+ array = vg->vlan_devices_arrays[pidx]
+ [vlan_id / VLAN_GROUP_ARRAY_PART_LEN];
+ array[vlan_id % VLAN_GROUP_ARRAY_PART_LEN] = dev;
+}
+
+/* Must be invoked with rcu_read_lock or with RTNL. */
+static inline struct net_device *vlan_find_dev(struct net_device *real_dev,
+ __be16 vlan_proto, u16 vlan_id)
+{
+ struct vlan_info *vlan_info = rcu_dereference_rtnl(real_dev->vlan_info);
+
+ if (vlan_info)
+ return vlan_group_get_device(&vlan_info->grp,
+ vlan_proto, vlan_id);
+
+ return NULL;
+}
+
+static inline netdev_features_t vlan_tnl_features(struct net_device *real_dev)
+{
+ netdev_features_t ret;
+
+ ret = real_dev->hw_enc_features &
+ (NETIF_F_CSUM_MASK | NETIF_F_GSO_SOFTWARE |
+ NETIF_F_GSO_ENCAP_ALL);
+
+ if ((ret & NETIF_F_GSO_ENCAP_ALL) && (ret & NETIF_F_CSUM_MASK))
+ return (ret & ~NETIF_F_CSUM_MASK) | NETIF_F_HW_CSUM;
+ return 0;
+}
+
+#define vlan_group_for_each_dev(grp, i, dev) \
+ for ((i) = 0; i < VLAN_PROTO_NUM * VLAN_N_VID; i++) \
+ if (((dev) = __vlan_group_get_device((grp), (i) / VLAN_N_VID, \
+ (i) % VLAN_N_VID)))
+
+int vlan_filter_push_vids(struct vlan_info *vlan_info, __be16 proto);
+void vlan_filter_drop_vids(struct vlan_info *vlan_info, __be16 proto);
/* found in vlan_dev.c */
-int vlan_dev_rebuild_header(struct sk_buff *skb);
-int vlan_skb_recv(struct sk_buff *skb, struct net_device *dev,
- struct packet_type *ptype, struct net_device *orig_dev);
-int vlan_dev_hard_header(struct sk_buff *skb, struct net_device *dev,
- unsigned short type, void *daddr, void *saddr,
- unsigned len);
-int vlan_dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev);
-int vlan_dev_hwaccel_hard_start_xmit(struct sk_buff *skb, struct net_device *dev);
-int vlan_dev_change_mtu(struct net_device *dev, int new_mtu);
-int vlan_dev_set_mac_address(struct net_device *dev, void* addr);
-int vlan_dev_open(struct net_device* dev);
-int vlan_dev_stop(struct net_device* dev);
-int vlan_dev_ioctl(struct net_device* dev, struct ifreq *ifr, int cmd);
-int vlan_dev_set_ingress_priority(char* dev_name, __u32 skb_prio, short vlan_prio);
-int vlan_dev_set_egress_priority(char* dev_name, __u32 skb_prio, short vlan_prio);
-int vlan_dev_set_vlan_flag(char* dev_name, __u32 flag, short flag_val);
-int vlan_dev_get_realdev_name(const char* dev_name, char* result);
-int vlan_dev_get_vid(const char* dev_name, unsigned short* result);
-void vlan_dev_set_multicast_list(struct net_device *vlan_dev);
+void vlan_dev_set_ingress_priority(const struct net_device *dev,
+ u32 skb_prio, u16 vlan_prio);
+int vlan_dev_set_egress_priority(const struct net_device *dev,
+ u32 skb_prio, u16 vlan_prio);
+void vlan_dev_free_egress_priority(const struct net_device *dev);
+int vlan_dev_change_flags(const struct net_device *dev, u32 flag, u32 mask);
+void vlan_dev_get_realdev_name(const struct net_device *dev, char *result,
+ size_t size);
+
+int vlan_check_real_dev(struct net_device *real_dev,
+ __be16 protocol, u16 vlan_id,
+ struct netlink_ext_ack *extack);
+void vlan_setup(struct net_device *dev);
+int register_vlan_dev(struct net_device *dev, struct netlink_ext_ack *extack);
+void unregister_vlan_dev(struct net_device *dev, struct list_head *head);
+bool vlan_dev_inherit_address(struct net_device *dev,
+ struct net_device *real_dev);
+
+static inline u32 vlan_get_ingress_priority(struct net_device *dev,
+ u16 vlan_tci)
+{
+ struct vlan_dev_priv *vip = vlan_dev_priv(dev);
+
+ return vip->ingress_priority_map[(vlan_tci >> VLAN_PRIO_SHIFT) & 0x7];
+}
+
+#ifdef CONFIG_VLAN_8021Q_GVRP
+int vlan_gvrp_request_join(const struct net_device *dev);
+void vlan_gvrp_request_leave(const struct net_device *dev);
+int vlan_gvrp_init_applicant(struct net_device *dev);
+void vlan_gvrp_uninit_applicant(struct net_device *dev);
+int vlan_gvrp_init(void);
+void vlan_gvrp_uninit(void);
+#else
+static inline int vlan_gvrp_request_join(const struct net_device *dev) { return 0; }
+static inline void vlan_gvrp_request_leave(const struct net_device *dev) {}
+static inline int vlan_gvrp_init_applicant(struct net_device *dev) { return 0; }
+static inline void vlan_gvrp_uninit_applicant(struct net_device *dev) {}
+static inline int vlan_gvrp_init(void) { return 0; }
+static inline void vlan_gvrp_uninit(void) {}
+#endif
+
+#ifdef CONFIG_VLAN_8021Q_MVRP
+int vlan_mvrp_request_join(const struct net_device *dev);
+void vlan_mvrp_request_leave(const struct net_device *dev);
+int vlan_mvrp_init_applicant(struct net_device *dev);
+void vlan_mvrp_uninit_applicant(struct net_device *dev);
+int vlan_mvrp_init(void);
+void vlan_mvrp_uninit(void);
+#else
+static inline int vlan_mvrp_request_join(const struct net_device *dev) { return 0; }
+static inline void vlan_mvrp_request_leave(const struct net_device *dev) {}
+static inline int vlan_mvrp_init_applicant(struct net_device *dev) { return 0; }
+static inline void vlan_mvrp_uninit_applicant(struct net_device *dev) {}
+static inline int vlan_mvrp_init(void) { return 0; }
+static inline void vlan_mvrp_uninit(void) {}
+#endif
+
+extern const char vlan_fullname[];
+extern const char vlan_version[];
+int vlan_netlink_init(void);
+void vlan_netlink_fini(void);
+
+extern struct rtnl_link_ops vlan_link_ops;
+
+extern unsigned int vlan_net_id;
+
+struct proc_dir_entry;
+
+struct vlan_net {
+ /* /proc/net/vlan */
+ struct proc_dir_entry *proc_vlan_dir;
+ /* /proc/net/vlan/config */
+ struct proc_dir_entry *proc_vlan_conf;
+ /* Determines interface naming scheme. */
+ unsigned short name_type;
+};
#endif /* !(__BEN_VLAN_802_1Q_INC__) */
diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c
new file mode 100644
index 000000000000..9404dd551dfd
--- /dev/null
+++ b/net/8021q/vlan_core.c
@@ -0,0 +1,560 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/if_vlan.h>
+#include <linux/netpoll.h>
+#include <linux/export.h>
+#include <net/gro.h>
+#include "vlan.h"
+
+bool vlan_do_receive(struct sk_buff **skbp)
+{
+ struct sk_buff *skb = *skbp;
+ __be16 vlan_proto = skb->vlan_proto;
+ u16 vlan_id = skb_vlan_tag_get_id(skb);
+ struct net_device *vlan_dev;
+ struct vlan_pcpu_stats *rx_stats;
+
+ vlan_dev = vlan_find_dev(skb->dev, vlan_proto, vlan_id);
+ if (!vlan_dev)
+ return false;
+
+ skb = *skbp = skb_share_check(skb, GFP_ATOMIC);
+ if (unlikely(!skb))
+ return false;
+
+ if (unlikely(!(vlan_dev->flags & IFF_UP))) {
+ kfree_skb(skb);
+ *skbp = NULL;
+ return false;
+ }
+
+ skb->dev = vlan_dev;
+ if (unlikely(skb->pkt_type == PACKET_OTHERHOST)) {
+ /* Our lower layer thinks this is not local, let's make sure.
+ * This allows the VLAN to have a different MAC than the
+ * underlying device, and still route correctly. */
+ if (ether_addr_equal_64bits(eth_hdr(skb)->h_dest, vlan_dev->dev_addr))
+ skb->pkt_type = PACKET_HOST;
+ }
+
+ if (!(vlan_dev_priv(vlan_dev)->flags & VLAN_FLAG_REORDER_HDR) &&
+ !netif_is_macvlan_port(vlan_dev) &&
+ !netif_is_bridge_port(vlan_dev)) {
+ unsigned int offset = skb->data - skb_mac_header(skb);
+
+ /*
+ * vlan_insert_tag expect skb->data pointing to mac header.
+ * So change skb->data before calling it and change back to
+ * original position later
+ */
+ skb_push(skb, offset);
+ skb = *skbp = vlan_insert_inner_tag(skb, skb->vlan_proto,
+ skb->vlan_tci, skb->mac_len);
+ if (!skb)
+ return false;
+ skb_pull(skb, offset + VLAN_HLEN);
+ skb_reset_mac_len(skb);
+ }
+
+ skb->priority = vlan_get_ingress_priority(vlan_dev, skb->vlan_tci);
+ __vlan_hwaccel_clear_tag(skb);
+
+ rx_stats = this_cpu_ptr(vlan_dev_priv(vlan_dev)->vlan_pcpu_stats);
+
+ u64_stats_update_begin(&rx_stats->syncp);
+ u64_stats_inc(&rx_stats->rx_packets);
+ u64_stats_add(&rx_stats->rx_bytes, skb->len);
+ if (skb->pkt_type == PACKET_MULTICAST)
+ u64_stats_inc(&rx_stats->rx_multicast);
+ u64_stats_update_end(&rx_stats->syncp);
+
+ return true;
+}
+
+/* Must be invoked with rcu_read_lock. */
+struct net_device *__vlan_find_dev_deep_rcu(struct net_device *dev,
+ __be16 vlan_proto, u16 vlan_id)
+{
+ struct vlan_info *vlan_info = rcu_dereference(dev->vlan_info);
+
+ if (vlan_info) {
+ return vlan_group_get_device(&vlan_info->grp,
+ vlan_proto, vlan_id);
+ } else {
+ /*
+ * Lower devices of master uppers (bonding, team) do not have
+ * grp assigned to themselves. Grp is assigned to upper device
+ * instead.
+ */
+ struct net_device *upper_dev;
+
+ upper_dev = netdev_master_upper_dev_get_rcu(dev);
+ if (upper_dev)
+ return __vlan_find_dev_deep_rcu(upper_dev,
+ vlan_proto, vlan_id);
+ }
+
+ return NULL;
+}
+EXPORT_SYMBOL(__vlan_find_dev_deep_rcu);
+
+struct net_device *vlan_dev_real_dev(const struct net_device *dev)
+{
+ struct net_device *ret = vlan_dev_priv(dev)->real_dev;
+
+ while (is_vlan_dev(ret))
+ ret = vlan_dev_priv(ret)->real_dev;
+
+ return ret;
+}
+EXPORT_SYMBOL(vlan_dev_real_dev);
+
+u16 vlan_dev_vlan_id(const struct net_device *dev)
+{
+ return vlan_dev_priv(dev)->vlan_id;
+}
+EXPORT_SYMBOL(vlan_dev_vlan_id);
+
+__be16 vlan_dev_vlan_proto(const struct net_device *dev)
+{
+ return vlan_dev_priv(dev)->vlan_proto;
+}
+EXPORT_SYMBOL(vlan_dev_vlan_proto);
+
+/*
+ * vlan info and vid list
+ */
+
+static void vlan_group_free(struct vlan_group *grp)
+{
+ int i, j;
+
+ for (i = 0; i < VLAN_PROTO_NUM; i++)
+ for (j = 0; j < VLAN_GROUP_ARRAY_SPLIT_PARTS; j++)
+ kfree(grp->vlan_devices_arrays[i][j]);
+}
+
+static void vlan_info_free(struct vlan_info *vlan_info)
+{
+ vlan_group_free(&vlan_info->grp);
+ kfree(vlan_info);
+}
+
+static void vlan_info_rcu_free(struct rcu_head *rcu)
+{
+ vlan_info_free(container_of(rcu, struct vlan_info, rcu));
+}
+
+static struct vlan_info *vlan_info_alloc(struct net_device *dev)
+{
+ struct vlan_info *vlan_info;
+
+ vlan_info = kzalloc(sizeof(struct vlan_info), GFP_KERNEL);
+ if (!vlan_info)
+ return NULL;
+
+ vlan_info->real_dev = dev;
+ INIT_LIST_HEAD(&vlan_info->vid_list);
+ return vlan_info;
+}
+
+struct vlan_vid_info {
+ struct list_head list;
+ __be16 proto;
+ u16 vid;
+ int refcount;
+};
+
+static bool vlan_hw_filter_capable(const struct net_device *dev, __be16 proto)
+{
+ if (proto == htons(ETH_P_8021Q) &&
+ dev->features & NETIF_F_HW_VLAN_CTAG_FILTER)
+ return true;
+ if (proto == htons(ETH_P_8021AD) &&
+ dev->features & NETIF_F_HW_VLAN_STAG_FILTER)
+ return true;
+ return false;
+}
+
+static struct vlan_vid_info *vlan_vid_info_get(struct vlan_info *vlan_info,
+ __be16 proto, u16 vid)
+{
+ struct vlan_vid_info *vid_info;
+
+ list_for_each_entry(vid_info, &vlan_info->vid_list, list) {
+ if (vid_info->proto == proto && vid_info->vid == vid)
+ return vid_info;
+ }
+ return NULL;
+}
+
+static struct vlan_vid_info *vlan_vid_info_alloc(__be16 proto, u16 vid)
+{
+ struct vlan_vid_info *vid_info;
+
+ vid_info = kzalloc(sizeof(struct vlan_vid_info), GFP_KERNEL);
+ if (!vid_info)
+ return NULL;
+ vid_info->proto = proto;
+ vid_info->vid = vid;
+
+ return vid_info;
+}
+
+static int vlan_add_rx_filter_info(struct net_device *dev, __be16 proto, u16 vid)
+{
+ if (!vlan_hw_filter_capable(dev, proto))
+ return 0;
+
+ if (netif_device_present(dev))
+ return dev->netdev_ops->ndo_vlan_rx_add_vid(dev, proto, vid);
+ else
+ return -ENODEV;
+}
+
+static int vlan_kill_rx_filter_info(struct net_device *dev, __be16 proto, u16 vid)
+{
+ if (!vlan_hw_filter_capable(dev, proto))
+ return 0;
+
+ if (netif_device_present(dev))
+ return dev->netdev_ops->ndo_vlan_rx_kill_vid(dev, proto, vid);
+ else
+ return -ENODEV;
+}
+
+int vlan_for_each(struct net_device *dev,
+ int (*action)(struct net_device *dev, int vid, void *arg),
+ void *arg)
+{
+ struct vlan_vid_info *vid_info;
+ struct vlan_info *vlan_info;
+ struct net_device *vdev;
+ int ret;
+
+ ASSERT_RTNL();
+
+ vlan_info = rtnl_dereference(dev->vlan_info);
+ if (!vlan_info)
+ return 0;
+
+ list_for_each_entry(vid_info, &vlan_info->vid_list, list) {
+ vdev = vlan_group_get_device(&vlan_info->grp, vid_info->proto,
+ vid_info->vid);
+ ret = action(vdev, vid_info->vid, arg);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(vlan_for_each);
+
+int vlan_filter_push_vids(struct vlan_info *vlan_info, __be16 proto)
+{
+ struct net_device *real_dev = vlan_info->real_dev;
+ struct vlan_vid_info *vlan_vid_info;
+ int err;
+
+ list_for_each_entry(vlan_vid_info, &vlan_info->vid_list, list) {
+ if (vlan_vid_info->proto == proto) {
+ err = vlan_add_rx_filter_info(real_dev, proto,
+ vlan_vid_info->vid);
+ if (err)
+ goto unwind;
+ }
+ }
+
+ return 0;
+
+unwind:
+ list_for_each_entry_continue_reverse(vlan_vid_info,
+ &vlan_info->vid_list, list) {
+ if (vlan_vid_info->proto == proto)
+ vlan_kill_rx_filter_info(real_dev, proto,
+ vlan_vid_info->vid);
+ }
+
+ return err;
+}
+EXPORT_SYMBOL(vlan_filter_push_vids);
+
+void vlan_filter_drop_vids(struct vlan_info *vlan_info, __be16 proto)
+{
+ struct vlan_vid_info *vlan_vid_info;
+
+ list_for_each_entry(vlan_vid_info, &vlan_info->vid_list, list)
+ if (vlan_vid_info->proto == proto)
+ vlan_kill_rx_filter_info(vlan_info->real_dev,
+ vlan_vid_info->proto,
+ vlan_vid_info->vid);
+}
+EXPORT_SYMBOL(vlan_filter_drop_vids);
+
+static int __vlan_vid_add(struct vlan_info *vlan_info, __be16 proto, u16 vid,
+ struct vlan_vid_info **pvid_info)
+{
+ struct net_device *dev = vlan_info->real_dev;
+ struct vlan_vid_info *vid_info;
+ int err;
+
+ vid_info = vlan_vid_info_alloc(proto, vid);
+ if (!vid_info)
+ return -ENOMEM;
+
+ err = vlan_add_rx_filter_info(dev, proto, vid);
+ if (err) {
+ kfree(vid_info);
+ return err;
+ }
+
+ list_add(&vid_info->list, &vlan_info->vid_list);
+ vlan_info->nr_vids++;
+ *pvid_info = vid_info;
+ return 0;
+}
+
+int vlan_vid_add(struct net_device *dev, __be16 proto, u16 vid)
+{
+ struct vlan_info *vlan_info;
+ struct vlan_vid_info *vid_info;
+ bool vlan_info_created = false;
+ int err;
+
+ ASSERT_RTNL();
+
+ vlan_info = rtnl_dereference(dev->vlan_info);
+ if (!vlan_info) {
+ vlan_info = vlan_info_alloc(dev);
+ if (!vlan_info)
+ return -ENOMEM;
+ vlan_info_created = true;
+ }
+ vid_info = vlan_vid_info_get(vlan_info, proto, vid);
+ if (!vid_info) {
+ err = __vlan_vid_add(vlan_info, proto, vid, &vid_info);
+ if (err)
+ goto out_free_vlan_info;
+ }
+ vid_info->refcount++;
+
+ if (vlan_info_created)
+ rcu_assign_pointer(dev->vlan_info, vlan_info);
+
+ return 0;
+
+out_free_vlan_info:
+ if (vlan_info_created)
+ kfree(vlan_info);
+ return err;
+}
+EXPORT_SYMBOL(vlan_vid_add);
+
+static void __vlan_vid_del(struct vlan_info *vlan_info,
+ struct vlan_vid_info *vid_info)
+{
+ struct net_device *dev = vlan_info->real_dev;
+ __be16 proto = vid_info->proto;
+ u16 vid = vid_info->vid;
+ int err;
+
+ err = vlan_kill_rx_filter_info(dev, proto, vid);
+ if (err && dev->reg_state != NETREG_UNREGISTERING)
+ netdev_warn(dev, "failed to kill vid %04x/%d\n", proto, vid);
+
+ list_del(&vid_info->list);
+ kfree(vid_info);
+ vlan_info->nr_vids--;
+}
+
+void vlan_vid_del(struct net_device *dev, __be16 proto, u16 vid)
+{
+ struct vlan_info *vlan_info;
+ struct vlan_vid_info *vid_info;
+
+ ASSERT_RTNL();
+
+ vlan_info = rtnl_dereference(dev->vlan_info);
+ if (!vlan_info)
+ return;
+
+ vid_info = vlan_vid_info_get(vlan_info, proto, vid);
+ if (!vid_info)
+ return;
+ vid_info->refcount--;
+ if (vid_info->refcount == 0) {
+ __vlan_vid_del(vlan_info, vid_info);
+ if (vlan_info->nr_vids == 0) {
+ RCU_INIT_POINTER(dev->vlan_info, NULL);
+ call_rcu(&vlan_info->rcu, vlan_info_rcu_free);
+ }
+ }
+}
+EXPORT_SYMBOL(vlan_vid_del);
+
+int vlan_vids_add_by_dev(struct net_device *dev,
+ const struct net_device *by_dev)
+{
+ struct vlan_vid_info *vid_info;
+ struct vlan_info *vlan_info;
+ int err;
+
+ ASSERT_RTNL();
+
+ vlan_info = rtnl_dereference(by_dev->vlan_info);
+ if (!vlan_info)
+ return 0;
+
+ list_for_each_entry(vid_info, &vlan_info->vid_list, list) {
+ if (!vlan_hw_filter_capable(by_dev, vid_info->proto))
+ continue;
+ err = vlan_vid_add(dev, vid_info->proto, vid_info->vid);
+ if (err)
+ goto unwind;
+ }
+ return 0;
+
+unwind:
+ list_for_each_entry_continue_reverse(vid_info,
+ &vlan_info->vid_list,
+ list) {
+ if (!vlan_hw_filter_capable(by_dev, vid_info->proto))
+ continue;
+ vlan_vid_del(dev, vid_info->proto, vid_info->vid);
+ }
+
+ return err;
+}
+EXPORT_SYMBOL(vlan_vids_add_by_dev);
+
+void vlan_vids_del_by_dev(struct net_device *dev,
+ const struct net_device *by_dev)
+{
+ struct vlan_vid_info *vid_info;
+ struct vlan_info *vlan_info;
+
+ ASSERT_RTNL();
+
+ vlan_info = rtnl_dereference(by_dev->vlan_info);
+ if (!vlan_info)
+ return;
+
+ list_for_each_entry(vid_info, &vlan_info->vid_list, list) {
+ if (!vlan_hw_filter_capable(by_dev, vid_info->proto))
+ continue;
+ vlan_vid_del(dev, vid_info->proto, vid_info->vid);
+ }
+}
+EXPORT_SYMBOL(vlan_vids_del_by_dev);
+
+bool vlan_uses_dev(const struct net_device *dev)
+{
+ struct vlan_info *vlan_info;
+
+ ASSERT_RTNL();
+
+ vlan_info = rtnl_dereference(dev->vlan_info);
+ if (!vlan_info)
+ return false;
+ return vlan_info->grp.nr_vlan_devs ? true : false;
+}
+EXPORT_SYMBOL(vlan_uses_dev);
+
+static struct sk_buff *vlan_gro_receive(struct list_head *head,
+ struct sk_buff *skb)
+{
+ const struct packet_offload *ptype;
+ unsigned int hlen, off_vlan;
+ struct sk_buff *pp = NULL;
+ struct vlan_hdr *vhdr;
+ struct sk_buff *p;
+ __be16 type;
+ int flush = 1;
+
+ off_vlan = skb_gro_offset(skb);
+ hlen = off_vlan + sizeof(*vhdr);
+ vhdr = skb_gro_header(skb, hlen, off_vlan);
+ if (unlikely(!vhdr))
+ goto out;
+
+ NAPI_GRO_CB(skb)->network_offsets[NAPI_GRO_CB(skb)->encap_mark] = hlen;
+
+ type = vhdr->h_vlan_encapsulated_proto;
+
+ ptype = gro_find_receive_by_type(type);
+ if (!ptype)
+ goto out;
+
+ flush = 0;
+
+ list_for_each_entry(p, head, list) {
+ struct vlan_hdr *vhdr2;
+
+ if (!NAPI_GRO_CB(p)->same_flow)
+ continue;
+
+ vhdr2 = (struct vlan_hdr *)(p->data + off_vlan);
+ if (compare_vlan_header(vhdr, vhdr2))
+ NAPI_GRO_CB(p)->same_flow = 0;
+ }
+
+ skb_gro_pull(skb, sizeof(*vhdr));
+ skb_gro_postpull_rcsum(skb, vhdr, sizeof(*vhdr));
+
+ pp = indirect_call_gro_receive_inet(ptype->callbacks.gro_receive,
+ ipv6_gro_receive, inet_gro_receive,
+ head, skb);
+
+out:
+ skb_gro_flush_final(skb, pp, flush);
+
+ return pp;
+}
+
+static int vlan_gro_complete(struct sk_buff *skb, int nhoff)
+{
+ struct vlan_hdr *vhdr = (struct vlan_hdr *)(skb->data + nhoff);
+ __be16 type = vhdr->h_vlan_encapsulated_proto;
+ struct packet_offload *ptype;
+ int err = -ENOENT;
+
+ ptype = gro_find_complete_by_type(type);
+ if (ptype)
+ err = INDIRECT_CALL_INET(ptype->callbacks.gro_complete,
+ ipv6_gro_complete, inet_gro_complete,
+ skb, nhoff + sizeof(*vhdr));
+
+ return err;
+}
+
+static struct packet_offload vlan_packet_offloads[] __read_mostly = {
+ {
+ .type = cpu_to_be16(ETH_P_8021Q),
+ .priority = 10,
+ .callbacks = {
+ .gro_receive = vlan_gro_receive,
+ .gro_complete = vlan_gro_complete,
+ },
+ },
+ {
+ .type = cpu_to_be16(ETH_P_8021AD),
+ .priority = 10,
+ .callbacks = {
+ .gro_receive = vlan_gro_receive,
+ .gro_complete = vlan_gro_complete,
+ },
+ },
+};
+
+static int __init vlan_offload_init(void)
+{
+ unsigned int i;
+
+ for (i = 0; i < ARRAY_SIZE(vlan_packet_offloads); i++)
+ dev_add_offload(&vlan_packet_offloads[i]);
+
+ return 0;
+}
+
+fs_initcall(vlan_offload_init);
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 60a508eb1945..fbf296137b09 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -1,891 +1,1078 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* -*- linux-c -*-
* INET 802.1Q VLAN
* Ethernet-type device handling.
*
* Authors: Ben Greear <greearb@candelatech.com>
- * Please send support related email to: vlan@scry.wanfear.com
+ * Please send support related email to: netdev@vger.kernel.org
* VLAN Home Page: http://www.candelatech.com/~greear/vlan.html
- *
+ *
* Fixes: Mar 22 2001: Martin Bokaemper <mbokaemper@unispherenetworks.com>
* - reset skb->pkt_type on incoming packets when MAC was changed
* - see that changed MAC is saddr for outgoing packets
* Oct 20, 2001: Ard van Breeman:
* - Fix MC-list, finally.
* - Flush MC-list on VLAN destroy.
- *
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/module.h>
-#include <linux/mm.h>
-#include <linux/in.h>
-#include <linux/init.h>
-#include <asm/uaccess.h> /* for copy_from_user */
+#include <linux/slab.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
+#include <linux/net_tstamp.h>
#include <linux/etherdevice.h>
-#include <net/datalink.h>
-#include <net/p8022.h>
+#include <linux/ethtool.h>
+#include <linux/phy.h>
#include <net/arp.h>
+#include <net/macsec.h>
+#include <net/netdev_lock.h>
#include "vlan.h"
#include "vlanproc.h"
#include <linux/if_vlan.h>
-#include <net/ip.h>
+#include <linux/netpoll.h>
/*
- * Rebuild the Ethernet MAC header. This is called after an ARP
- * (or in future other address resolution) has completed on this
- * sk_buff. We now let ARP fill in the other fields.
+ * Create the VLAN header for an arbitrary protocol layer
*
- * This routine CANNOT use cached dst->neigh!
- * Really, it is used only when dst->neigh is wrong.
+ * saddr=NULL means use device source address
+ * daddr=NULL means leave destination address (eg unresolved arp)
*
- * TODO: This needs a checkup, I'm ignorant here. --BLG
+ * This is called when the SKB is moving down the stack towards the
+ * physical devices.
*/
-int vlan_dev_rebuild_header(struct sk_buff *skb)
+static int vlan_dev_hard_header(struct sk_buff *skb, struct net_device *dev,
+ unsigned short type,
+ const void *daddr, const void *saddr,
+ unsigned int len)
+{
+ struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
+ struct vlan_hdr *vhdr;
+ unsigned int vhdrlen = 0;
+ u16 vlan_tci = 0;
+ int rc;
+
+ if (!(vlan->flags & VLAN_FLAG_REORDER_HDR)) {
+ vhdr = skb_push(skb, VLAN_HLEN);
+
+ vlan_tci = vlan->vlan_id;
+ vlan_tci |= vlan_dev_get_egress_qos_mask(dev, skb->priority);
+ vhdr->h_vlan_TCI = htons(vlan_tci);
+
+ /*
+ * Set the protocol type. For a packet of type ETH_P_802_3/2 we
+ * put the length in here instead.
+ */
+ if (type != ETH_P_802_3 && type != ETH_P_802_2)
+ vhdr->h_vlan_encapsulated_proto = htons(type);
+ else
+ vhdr->h_vlan_encapsulated_proto = htons(len);
+
+ skb->protocol = vlan->vlan_proto;
+ type = ntohs(vlan->vlan_proto);
+ vhdrlen = VLAN_HLEN;
+ }
+
+ /* Before delegating work to the lower layer, enter our MAC-address */
+ if (saddr == NULL)
+ saddr = dev->dev_addr;
+
+ /* Now make the underlying real hard header */
+ dev = vlan->real_dev;
+ rc = dev_hard_header(skb, dev, type, daddr, saddr, len + vhdrlen);
+ if (rc > 0)
+ rc += vhdrlen;
+ return rc;
+}
+
+static inline netdev_tx_t vlan_netpoll_send_skb(struct vlan_dev_priv *vlan, struct sk_buff *skb)
+{
+#ifdef CONFIG_NET_POLL_CONTROLLER
+ return netpoll_send_skb(vlan->netpoll, skb);
+#else
+ BUG();
+ return NETDEV_TX_OK;
+#endif
+}
+
+static netdev_tx_t vlan_dev_hard_start_xmit(struct sk_buff *skb,
+ struct net_device *dev)
{
- struct net_device *dev = skb->dev;
+ struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
struct vlan_ethhdr *veth = (struct vlan_ethhdr *)(skb->data);
+ unsigned int len;
+ int ret;
- switch (veth->h_vlan_encapsulated_proto) {
-#ifdef CONFIG_INET
- case __constant_htons(ETH_P_IP):
-
- /* TODO: Confirm this will work with VLAN headers... */
- return arp_find(veth->h_dest, skb);
-#endif
- default:
- printk(VLAN_DBG
- "%s: unable to resolve type %X addresses.\n",
- dev->name, ntohs(veth->h_vlan_encapsulated_proto));
-
- memcpy(veth->h_source, dev->dev_addr, ETH_ALEN);
- break;
- };
+ /* Handle non-VLAN frames if they are sent to us, for example by DHCP.
+ *
+ * NOTE: THIS ASSUMES DIX ETHERNET, SPECIFICALLY NOT SUPPORTING
+ * OTHER THINGS LIKE FDDI/TokenRing/802.3 SNAPs...
+ */
+ if (vlan->flags & VLAN_FLAG_REORDER_HDR ||
+ veth->h_vlan_proto != vlan->vlan_proto) {
+ u16 vlan_tci;
+ vlan_tci = vlan->vlan_id;
+ vlan_tci |= vlan_dev_get_egress_qos_mask(dev, skb->priority);
+ __vlan_hwaccel_put_tag(skb, vlan->vlan_proto, vlan_tci);
+ }
+
+ skb->dev = vlan->real_dev;
+ len = skb->len;
+ if (unlikely(netpoll_tx_running(dev)))
+ return vlan_netpoll_send_skb(vlan, skb);
+
+ ret = dev_queue_xmit(skb);
+
+ if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) {
+ struct vlan_pcpu_stats *stats;
+
+ stats = this_cpu_ptr(vlan->vlan_pcpu_stats);
+ u64_stats_update_begin(&stats->syncp);
+ u64_stats_inc(&stats->tx_packets);
+ u64_stats_add(&stats->tx_bytes, len);
+ u64_stats_update_end(&stats->syncp);
+ } else {
+ this_cpu_inc(vlan->vlan_pcpu_stats->tx_dropped);
+ }
+
+ return ret;
+}
+
+static int vlan_dev_change_mtu(struct net_device *dev, int new_mtu)
+{
+ struct net_device *real_dev = vlan_dev_priv(dev)->real_dev;
+ unsigned int max_mtu = real_dev->mtu;
+
+ if (netif_reduces_vlan_mtu(real_dev))
+ max_mtu -= VLAN_HLEN;
+ if (max_mtu < new_mtu)
+ return -ERANGE;
+
+ WRITE_ONCE(dev->mtu, new_mtu);
return 0;
}
-static inline struct sk_buff *vlan_check_reorder_header(struct sk_buff *skb)
+void vlan_dev_set_ingress_priority(const struct net_device *dev,
+ u32 skb_prio, u16 vlan_prio)
{
- if (VLAN_DEV_INFO(skb->dev)->flags & 1) {
- if (skb_shared(skb) || skb_cloned(skb)) {
- struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
- kfree_skb(skb);
- skb = nskb;
- }
- if (skb) {
- /* Lifted from Gleb's VLAN code... */
- memmove(skb->data - ETH_HLEN,
- skb->data - VLAN_ETH_HLEN, 12);
- skb->mac.raw += VLAN_HLEN;
+ struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
+
+ if (vlan->ingress_priority_map[vlan_prio & 0x7] && !skb_prio)
+ vlan->nr_ingress_mappings--;
+ else if (!vlan->ingress_priority_map[vlan_prio & 0x7] && skb_prio)
+ vlan->nr_ingress_mappings++;
+
+ vlan->ingress_priority_map[vlan_prio & 0x7] = skb_prio;
+}
+
+int vlan_dev_set_egress_priority(const struct net_device *dev,
+ u32 skb_prio, u16 vlan_prio)
+{
+ struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
+ struct vlan_priority_tci_mapping *mp = NULL;
+ struct vlan_priority_tci_mapping *np;
+ u32 vlan_qos = (vlan_prio << VLAN_PRIO_SHIFT) & VLAN_PRIO_MASK;
+
+ /* See if a priority mapping exists.. */
+ mp = vlan->egress_priority_map[skb_prio & 0xF];
+ while (mp) {
+ if (mp->priority == skb_prio) {
+ if (mp->vlan_qos && !vlan_qos)
+ vlan->nr_egress_mappings--;
+ else if (!mp->vlan_qos && vlan_qos)
+ vlan->nr_egress_mappings++;
+ mp->vlan_qos = vlan_qos;
+ return 0;
}
+ mp = mp->next;
}
- return skb;
+ /* Create a new mapping then. */
+ mp = vlan->egress_priority_map[skb_prio & 0xF];
+ np = kmalloc(sizeof(struct vlan_priority_tci_mapping), GFP_KERNEL);
+ if (!np)
+ return -ENOBUFS;
+
+ np->next = mp;
+ np->priority = skb_prio;
+ np->vlan_qos = vlan_qos;
+ /* Before inserting this element in hash table, make sure all its fields
+ * are committed to memory.
+ * coupled with smp_rmb() in vlan_dev_get_egress_qos_mask()
+ */
+ smp_wmb();
+ vlan->egress_priority_map[skb_prio & 0xF] = np;
+ if (vlan_qos)
+ vlan->nr_egress_mappings++;
+ return 0;
}
-/*
- * Determine the packet's protocol ID. The rule here is that we
- * assume 802.3 if the type field is short enough to be a length.
- * This is normal practice and works for any 'now in use' protocol.
- *
- * Also, at this point we assume that we ARE dealing exclusively with
- * VLAN packets, or packets that should be made into VLAN packets based
- * on a default VLAN ID.
- *
- * NOTE: Should be similar to ethernet/eth.c.
- *
- * SANITY NOTE: This method is called when a packet is moving up the stack
- * towards userland. To get here, it would have already passed
- * through the ethernet/eth.c eth_type_trans() method.
- * SANITY NOTE 2: We are referencing to the VLAN_HDR frields, which MAY be
- * stored UNALIGNED in the memory. RISC systems don't like
- * such cases very much...
- * SANITY NOTE 2a: According to Dave Miller & Alexey, it will always be aligned,
- * so there doesn't need to be any of the unaligned stuff. It has
- * been commented out now... --Ben
- *
+/* Flags are defined in the vlan_flags enum in
+ * include/uapi/linux/if_vlan.h file.
*/
-int vlan_skb_recv(struct sk_buff *skb, struct net_device *dev,
- struct packet_type* ptype, struct net_device *orig_dev)
+int vlan_dev_change_flags(const struct net_device *dev, u32 flags, u32 mask)
{
- unsigned char *rawp = NULL;
- struct vlan_hdr *vhdr = (struct vlan_hdr *)(skb->data);
- unsigned short vid;
- struct net_device_stats *stats;
- unsigned short vlan_TCI;
- __be16 proto;
+ struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
+ u32 old_flags = vlan->flags;
- /* vlan_TCI = ntohs(get_unaligned(&vhdr->h_vlan_TCI)); */
- vlan_TCI = ntohs(vhdr->h_vlan_TCI);
+ if (mask & ~(VLAN_FLAG_REORDER_HDR | VLAN_FLAG_GVRP |
+ VLAN_FLAG_LOOSE_BINDING | VLAN_FLAG_MVRP |
+ VLAN_FLAG_BRIDGE_BINDING))
+ return -EINVAL;
- vid = (vlan_TCI & VLAN_VID_MASK);
+ vlan->flags = (old_flags & ~mask) | (flags & mask);
-#ifdef VLAN_DEBUG
- printk(VLAN_DBG "%s: skb: %p vlan_id: %hx\n",
- __FUNCTION__, skb, vid);
-#endif
+ if (netif_running(dev) && (vlan->flags ^ old_flags) & VLAN_FLAG_GVRP) {
+ if (vlan->flags & VLAN_FLAG_GVRP)
+ vlan_gvrp_request_join(dev);
+ else
+ vlan_gvrp_request_leave(dev);
+ }
- /* Ok, we will find the correct VLAN device, strip the header,
- * and then go on as usual.
- */
+ if (netif_running(dev) && (vlan->flags ^ old_flags) & VLAN_FLAG_MVRP) {
+ if (vlan->flags & VLAN_FLAG_MVRP)
+ vlan_mvrp_request_join(dev);
+ else
+ vlan_mvrp_request_leave(dev);
+ }
+ return 0;
+}
- /* We have 12 bits of vlan ID.
- *
- * We must not drop allow preempt until we hold a
- * reference to the device (netif_rx does that) or we
- * fail.
- */
+void vlan_dev_get_realdev_name(const struct net_device *dev, char *result, size_t size)
+{
+ strscpy_pad(result, vlan_dev_priv(dev)->real_dev->name, size);
+}
- rcu_read_lock();
- skb->dev = __find_vlan_dev(dev, vid);
- if (!skb->dev) {
- rcu_read_unlock();
+bool vlan_dev_inherit_address(struct net_device *dev,
+ struct net_device *real_dev)
+{
+ if (dev->addr_assign_type != NET_ADDR_STOLEN)
+ return false;
-#ifdef VLAN_DEBUG
- printk(VLAN_DBG "%s: ERROR: No net_device for VID: %i on dev: %s [%i]\n",
- __FUNCTION__, (unsigned int)(vid), dev->name, dev->ifindex);
-#endif
- kfree_skb(skb);
- return -1;
+ eth_hw_addr_set(dev, real_dev->dev_addr);
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+ return true;
+}
+
+static int vlan_dev_open(struct net_device *dev)
+{
+ struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
+ struct net_device *real_dev = vlan->real_dev;
+ int err;
+
+ if (!(real_dev->flags & IFF_UP) &&
+ !(vlan->flags & VLAN_FLAG_LOOSE_BINDING))
+ return -ENETDOWN;
+
+ if (!ether_addr_equal(dev->dev_addr, real_dev->dev_addr) &&
+ !vlan_dev_inherit_address(dev, real_dev)) {
+ err = dev_uc_add(real_dev, dev->dev_addr);
+ if (err < 0)
+ goto out;
}
- skb->dev->last_rx = jiffies;
+ ether_addr_copy(vlan->real_dev_addr, real_dev->dev_addr);
- /* Bump the rx counters for the VLAN device. */
- stats = vlan_dev_get_stats(skb->dev);
- stats->rx_packets++;
- stats->rx_bytes += skb->len;
+ if (vlan->flags & VLAN_FLAG_GVRP)
+ vlan_gvrp_request_join(dev);
- /* Take off the VLAN header (4 bytes currently) */
- skb_pull_rcsum(skb, VLAN_HLEN);
+ if (vlan->flags & VLAN_FLAG_MVRP)
+ vlan_mvrp_request_join(dev);
- /* Ok, lets check to make sure the device (dev) we
- * came in on is what this VLAN is attached to.
- */
+ if (netif_carrier_ok(real_dev) &&
+ !(vlan->flags & VLAN_FLAG_BRIDGE_BINDING))
+ netif_carrier_on(dev);
+ return 0;
- if (dev != VLAN_DEV_INFO(skb->dev)->real_dev) {
- rcu_read_unlock();
+out:
+ netif_carrier_off(dev);
+ return err;
+}
-#ifdef VLAN_DEBUG
- printk(VLAN_DBG "%s: dropping skb: %p because came in on wrong device, dev: %s real_dev: %s, skb_dev: %s\n",
- __FUNCTION__, skb, dev->name,
- VLAN_DEV_INFO(skb->dev)->real_dev->name,
- skb->dev->name);
-#endif
- kfree_skb(skb);
- stats->rx_errors++;
- return -1;
+static int vlan_dev_stop(struct net_device *dev)
+{
+ struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
+ struct net_device *real_dev = vlan->real_dev;
+
+ dev_mc_unsync(real_dev, dev);
+ dev_uc_unsync(real_dev, dev);
+
+ if (!ether_addr_equal(dev->dev_addr, real_dev->dev_addr))
+ dev_uc_del(real_dev, dev->dev_addr);
+
+ if (!(vlan->flags & VLAN_FLAG_BRIDGE_BINDING))
+ netif_carrier_off(dev);
+ return 0;
+}
+
+static int vlan_dev_set_mac_address(struct net_device *dev, void *p)
+{
+ struct net_device *real_dev = vlan_dev_priv(dev)->real_dev;
+ struct sockaddr *addr = p;
+ int err;
+
+ if (!is_valid_ether_addr(addr->sa_data))
+ return -EADDRNOTAVAIL;
+
+ if (!(dev->flags & IFF_UP))
+ goto out;
+
+ if (!ether_addr_equal(addr->sa_data, real_dev->dev_addr)) {
+ err = dev_uc_add(real_dev, addr->sa_data);
+ if (err < 0)
+ return err;
}
- /*
- * Deal with ingress priority mapping.
- */
- skb->priority = vlan_get_ingress_priority(skb->dev, ntohs(vhdr->h_vlan_TCI));
+ if (!ether_addr_equal(dev->dev_addr, real_dev->dev_addr))
+ dev_uc_del(real_dev, dev->dev_addr);
-#ifdef VLAN_DEBUG
- printk(VLAN_DBG "%s: priority: %lu for TCI: %hu (hbo)\n",
- __FUNCTION__, (unsigned long)(skb->priority),
- ntohs(vhdr->h_vlan_TCI));
-#endif
+out:
+ eth_hw_addr_set(dev, addr->sa_data);
+ return 0;
+}
- /* The ethernet driver already did the pkt_type calculations
- * for us...
- */
- switch (skb->pkt_type) {
- case PACKET_BROADCAST: /* Yeah, stats collect these together.. */
- // stats->broadcast ++; // no such counter :-(
- break;
+static int vlan_hwtstamp_get(struct net_device *dev,
+ struct kernel_hwtstamp_config *cfg)
+{
+ struct net_device *real_dev = vlan_dev_priv(dev)->real_dev;
- case PACKET_MULTICAST:
- stats->multicast++;
- break;
+ return generic_hwtstamp_get_lower(real_dev, cfg);
+}
- case PACKET_OTHERHOST:
- /* Our lower layer thinks this is not local, let's make sure.
- * This allows the VLAN to have a different MAC than the underlying
- * device, and still route correctly.
- */
- if (!compare_ether_addr(eth_hdr(skb)->h_dest, skb->dev->dev_addr)) {
- /* It is for our (changed) MAC-address! */
- skb->pkt_type = PACKET_HOST;
- }
- break;
- default:
- break;
- };
+static int vlan_hwtstamp_set(struct net_device *dev,
+ struct kernel_hwtstamp_config *cfg,
+ struct netlink_ext_ack *extack)
+{
+ struct net_device *real_dev = vlan_dev_priv(dev)->real_dev;
- /* Was a VLAN packet, grab the encapsulated protocol, which the layer
- * three protocols care about.
- */
- /* proto = get_unaligned(&vhdr->h_vlan_encapsulated_proto); */
- proto = vhdr->h_vlan_encapsulated_proto;
+ if (!net_eq(dev_net(dev), dev_net(real_dev)))
+ return -EOPNOTSUPP;
- skb->protocol = proto;
- if (ntohs(proto) >= 1536) {
- /* place it back on the queue to be handled by
- * true layer 3 protocols.
- */
+ return generic_hwtstamp_set_lower(real_dev, cfg, extack);
+}
- /* See if we are configured to re-write the VLAN header
- * to make it look like ethernet...
- */
- skb = vlan_check_reorder_header(skb);
-
- /* Can be null if skb-clone fails when re-ordering */
- if (skb) {
- netif_rx(skb);
- } else {
- /* TODO: Add a more specific counter here. */
- stats->rx_errors++;
- }
- rcu_read_unlock();
- return 0;
+static int vlan_dev_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+{
+ struct net_device *real_dev = vlan_dev_priv(dev)->real_dev;
+ struct ifreq ifrr;
+ int err = -EOPNOTSUPP;
+
+ strscpy_pad(ifrr.ifr_name, real_dev->name, IFNAMSIZ);
+ ifrr.ifr_ifru = ifr->ifr_ifru;
+
+ switch (cmd) {
+ case SIOCGMIIPHY:
+ case SIOCGMIIREG:
+ case SIOCSMIIREG:
+ err = dev_eth_ioctl(real_dev, &ifrr, cmd);
+ break;
}
- rawp = skb->data;
+ if (!err)
+ ifr->ifr_ifru = ifrr.ifr_ifru;
- /*
- * This is a magic hack to spot IPX packets. Older Novell breaks
- * the protocol design and runs IPX over 802.3 without an 802.2 LLC
- * layer. We look for FFFF which isn't a used 802.2 SSAP/DSAP. This
- * won't work for fault tolerant netware but does for the rest.
- */
- if (*(unsigned short *)rawp == 0xFFFF) {
- skb->protocol = __constant_htons(ETH_P_802_3);
- /* place it back on the queue to be handled by true layer 3 protocols.
- */
+ return err;
+}
- /* See if we are configured to re-write the VLAN header
- * to make it look like ethernet...
- */
- skb = vlan_check_reorder_header(skb);
-
- /* Can be null if skb-clone fails when re-ordering */
- if (skb) {
- netif_rx(skb);
- } else {
- /* TODO: Add a more specific counter here. */
- stats->rx_errors++;
- }
- rcu_read_unlock();
- return 0;
- }
+static int vlan_dev_neigh_setup(struct net_device *dev, struct neigh_parms *pa)
+{
+ struct net_device *real_dev = vlan_dev_priv(dev)->real_dev;
+ const struct net_device_ops *ops = real_dev->netdev_ops;
+ int err = 0;
- /*
- * Real 802.2 LLC
- */
- skb->protocol = __constant_htons(ETH_P_802_2);
- /* place it back on the queue to be handled by upper layer protocols.
- */
+ if (netif_device_present(real_dev) && ops->ndo_neigh_setup)
+ err = ops->ndo_neigh_setup(real_dev, pa);
- /* See if we are configured to re-write the VLAN header
- * to make it look like ethernet...
- */
- skb = vlan_check_reorder_header(skb);
+ return err;
+}
- /* Can be null if skb-clone fails when re-ordering */
- if (skb) {
- netif_rx(skb);
- } else {
- /* TODO: Add a more specific counter here. */
- stats->rx_errors++;
- }
- rcu_read_unlock();
- return 0;
+#if IS_ENABLED(CONFIG_FCOE)
+static int vlan_dev_fcoe_ddp_setup(struct net_device *dev, u16 xid,
+ struct scatterlist *sgl, unsigned int sgc)
+{
+ struct net_device *real_dev = vlan_dev_priv(dev)->real_dev;
+ const struct net_device_ops *ops = real_dev->netdev_ops;
+ int rc = 0;
+
+ if (ops->ndo_fcoe_ddp_setup)
+ rc = ops->ndo_fcoe_ddp_setup(real_dev, xid, sgl, sgc);
+
+ return rc;
}
-static inline unsigned short vlan_dev_get_egress_qos_mask(struct net_device* dev,
- struct sk_buff* skb)
+static int vlan_dev_fcoe_ddp_done(struct net_device *dev, u16 xid)
{
- struct vlan_priority_tci_mapping *mp =
- VLAN_DEV_INFO(dev)->egress_priority_map[(skb->priority & 0xF)];
+ struct net_device *real_dev = vlan_dev_priv(dev)->real_dev;
+ const struct net_device_ops *ops = real_dev->netdev_ops;
+ int len = 0;
- while (mp) {
- if (mp->priority == skb->priority) {
- return mp->vlan_qos; /* This should already be shifted to mask
- * correctly with the VLAN's TCI
- */
- }
- mp = mp->next;
- }
- return 0;
+ if (ops->ndo_fcoe_ddp_done)
+ len = ops->ndo_fcoe_ddp_done(real_dev, xid);
+
+ return len;
}
-/*
- * Create the VLAN header for an arbitrary protocol layer
- *
- * saddr=NULL means use device source address
- * daddr=NULL means leave destination address (eg unresolved arp)
- *
- * This is called when the SKB is moving down the stack towards the
- * physical devices.
- */
-int vlan_dev_hard_header(struct sk_buff *skb, struct net_device *dev,
- unsigned short type, void *daddr, void *saddr,
- unsigned len)
+static int vlan_dev_fcoe_enable(struct net_device *dev)
{
- struct vlan_hdr *vhdr;
- unsigned short veth_TCI = 0;
+ struct net_device *real_dev = vlan_dev_priv(dev)->real_dev;
+ const struct net_device_ops *ops = real_dev->netdev_ops;
+ int rc = -EINVAL;
+
+ if (ops->ndo_fcoe_enable)
+ rc = ops->ndo_fcoe_enable(real_dev);
+ return rc;
+}
+
+static int vlan_dev_fcoe_disable(struct net_device *dev)
+{
+ struct net_device *real_dev = vlan_dev_priv(dev)->real_dev;
+ const struct net_device_ops *ops = real_dev->netdev_ops;
+ int rc = -EINVAL;
+
+ if (ops->ndo_fcoe_disable)
+ rc = ops->ndo_fcoe_disable(real_dev);
+ return rc;
+}
+
+static int vlan_dev_fcoe_ddp_target(struct net_device *dev, u16 xid,
+ struct scatterlist *sgl, unsigned int sgc)
+{
+ struct net_device *real_dev = vlan_dev_priv(dev)->real_dev;
+ const struct net_device_ops *ops = real_dev->netdev_ops;
int rc = 0;
- int build_vlan_header = 0;
- struct net_device *vdev = dev; /* save this for the bottom of the method */
-#ifdef VLAN_DEBUG
- printk(VLAN_DBG "%s: skb: %p type: %hx len: %x vlan_id: %hx, daddr: %p\n",
- __FUNCTION__, skb, type, len, VLAN_DEV_INFO(dev)->vlan_id, daddr);
+ if (ops->ndo_fcoe_ddp_target)
+ rc = ops->ndo_fcoe_ddp_target(real_dev, xid, sgl, sgc);
+
+ return rc;
+}
#endif
- /* build vlan header only if re_order_header flag is NOT set. This
- * fixes some programs that get confused when they see a VLAN device
- * sending a frame that is VLAN encoded (the consensus is that the VLAN
- * device should look completely like an Ethernet device when the
- * REORDER_HEADER flag is set) The drawback to this is some extra
- * header shuffling in the hard_start_xmit. Users can turn off this
- * REORDER behaviour with the vconfig tool.
- */
- build_vlan_header = ((VLAN_DEV_INFO(dev)->flags & 1) == 0);
+#ifdef NETDEV_FCOE_WWNN
+static int vlan_dev_fcoe_get_wwn(struct net_device *dev, u64 *wwn, int type)
+{
+ struct net_device *real_dev = vlan_dev_priv(dev)->real_dev;
+ const struct net_device_ops *ops = real_dev->netdev_ops;
+ int rc = -EINVAL;
- if (build_vlan_header) {
- vhdr = (struct vlan_hdr *) skb_push(skb, VLAN_HLEN);
+ if (ops->ndo_fcoe_get_wwn)
+ rc = ops->ndo_fcoe_get_wwn(real_dev, wwn, type);
+ return rc;
+}
+#endif
- /* build the four bytes that make this a VLAN header. */
+static void vlan_dev_change_rx_flags(struct net_device *dev, int change)
+{
+ struct net_device *real_dev = vlan_dev_priv(dev)->real_dev;
- /* Now, construct the second two bytes. This field looks something
- * like:
- * usr_priority: 3 bits (high bits)
- * CFI 1 bit
- * VLAN ID 12 bits (low bits)
- *
- */
- veth_TCI = VLAN_DEV_INFO(dev)->vlan_id;
- veth_TCI |= vlan_dev_get_egress_qos_mask(dev, skb);
+ if (change & IFF_ALLMULTI)
+ dev_set_allmulti(real_dev, dev->flags & IFF_ALLMULTI ? 1 : -1);
+ if (change & IFF_PROMISC)
+ dev_set_promiscuity(real_dev, dev->flags & IFF_PROMISC ? 1 : -1);
+}
- vhdr->h_vlan_TCI = htons(veth_TCI);
+static void vlan_dev_set_rx_mode(struct net_device *vlan_dev)
+{
+ dev_mc_sync(vlan_dev_priv(vlan_dev)->real_dev, vlan_dev);
+ dev_uc_sync(vlan_dev_priv(vlan_dev)->real_dev, vlan_dev);
+}
- /*
- * Set the protocol type.
- * For a packet of type ETH_P_802_3 we put the length in here instead.
- * It is up to the 802.2 layer to carry protocol information.
- */
+static __be16 vlan_parse_protocol(const struct sk_buff *skb)
+{
+ struct vlan_ethhdr *veth = (struct vlan_ethhdr *)(skb->data);
- if (type != ETH_P_802_3) {
- vhdr->h_vlan_encapsulated_proto = htons(type);
- } else {
- vhdr->h_vlan_encapsulated_proto = htons(len);
- }
- }
+ return __vlan_get_protocol(skb, veth->h_vlan_proto, NULL);
+}
+
+static const struct header_ops vlan_header_ops = {
+ .create = vlan_dev_hard_header,
+ .parse = eth_header_parse,
+ .parse_protocol = vlan_parse_protocol,
+};
+
+static int vlan_passthru_hard_header(struct sk_buff *skb, struct net_device *dev,
+ unsigned short type,
+ const void *daddr, const void *saddr,
+ unsigned int len)
+{
+ struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
+ struct net_device *real_dev = vlan->real_dev;
- /* Before delegating work to the lower layer, enter our MAC-address */
if (saddr == NULL)
saddr = dev->dev_addr;
- dev = VLAN_DEV_INFO(dev)->real_dev;
+ return dev_hard_header(skb, real_dev, type, daddr, saddr, len);
+}
- /* MPLS can send us skbuffs w/out enough space. This check will grow the
- * skb if it doesn't have enough headroom. Not a beautiful solution, so
- * I'll tick a counter so that users can know it's happening... If they
- * care...
- */
+static const struct header_ops vlan_passthru_header_ops = {
+ .create = vlan_passthru_hard_header,
+ .parse = eth_header_parse,
+ .parse_protocol = vlan_parse_protocol,
+};
- /* NOTE: This may still break if the underlying device is not the final
- * device (and thus there are more headers to add...) It should work for
- * good-ole-ethernet though.
- */
- if (skb_headroom(skb) < dev->hard_header_len) {
- struct sk_buff *sk_tmp = skb;
- skb = skb_realloc_headroom(sk_tmp, dev->hard_header_len);
- kfree_skb(sk_tmp);
- if (skb == NULL) {
- struct net_device_stats *stats = vlan_dev_get_stats(vdev);
- stats->tx_dropped++;
- return -ENOMEM;
- }
- VLAN_DEV_INFO(vdev)->cnt_inc_headroom_on_tx++;
-#ifdef VLAN_DEBUG
- printk(VLAN_DBG "%s: %s: had to grow skb.\n", __FUNCTION__, vdev->name);
+static const struct device_type vlan_type = {
+ .name = "vlan",
+};
+
+static const struct net_device_ops vlan_netdev_ops;
+
+static int vlan_dev_init(struct net_device *dev)
+{
+ struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
+ struct net_device *real_dev = vlan->real_dev;
+
+ netif_carrier_off(dev);
+
+ /* IFF_BROADCAST|IFF_MULTICAST; ??? */
+ dev->flags = real_dev->flags & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI |
+ IFF_MASTER | IFF_SLAVE);
+ dev->state = (real_dev->state & ((1<<__LINK_STATE_NOCARRIER) |
+ (1<<__LINK_STATE_DORMANT))) |
+ (1<<__LINK_STATE_PRESENT);
+
+ if (vlan->flags & VLAN_FLAG_BRIDGE_BINDING)
+ dev->state |= (1 << __LINK_STATE_NOCARRIER);
+
+ dev->hw_features = NETIF_F_HW_CSUM | NETIF_F_SG |
+ NETIF_F_FRAGLIST | NETIF_F_GSO_SOFTWARE |
+ NETIF_F_GSO_ENCAP_ALL |
+ NETIF_F_HIGHDMA | NETIF_F_SCTP_CRC |
+ NETIF_F_FCOE_CRC | NETIF_F_FSO;
+
+ if (real_dev->vlan_features & NETIF_F_HW_MACSEC)
+ dev->hw_features |= NETIF_F_HW_MACSEC;
+
+ dev->features |= dev->hw_features;
+ dev->lltx = true;
+ dev->fcoe_mtu = true;
+ netif_inherit_tso_max(dev, real_dev);
+ if (dev->features & NETIF_F_VLAN_FEATURES)
+ netdev_warn(real_dev, "VLAN features are set incorrectly. Q-in-Q configurations may not work correctly.\n");
+
+ dev->vlan_features = real_dev->vlan_features &
+ ~(NETIF_F_FCOE_CRC | NETIF_F_FSO);
+ dev->hw_enc_features = vlan_tnl_features(real_dev);
+ dev->mpls_features = real_dev->mpls_features;
+
+ /* ipv6 shared card related stuff */
+ dev->dev_id = real_dev->dev_id;
+
+ if (is_zero_ether_addr(dev->dev_addr)) {
+ eth_hw_addr_set(dev, real_dev->dev_addr);
+ dev->addr_assign_type = NET_ADDR_STOLEN;
+ }
+ if (is_zero_ether_addr(dev->broadcast))
+ memcpy(dev->broadcast, real_dev->broadcast, dev->addr_len);
+
+#if IS_ENABLED(CONFIG_FCOE)
+ dev->fcoe_ddp_xid = real_dev->fcoe_ddp_xid;
#endif
+
+ dev->needed_headroom = real_dev->needed_headroom;
+ if (vlan_hw_offload_capable(real_dev->features, vlan->vlan_proto)) {
+ dev->header_ops = &vlan_passthru_header_ops;
+ dev->hard_header_len = real_dev->hard_header_len;
+ } else {
+ dev->header_ops = &vlan_header_ops;
+ dev->hard_header_len = real_dev->hard_header_len + VLAN_HLEN;
}
- if (build_vlan_header) {
- /* Now make the underlying real hard header */
- rc = dev->hard_header(skb, dev, ETH_P_8021Q, daddr, saddr, len + VLAN_HLEN);
+ dev->netdev_ops = &vlan_netdev_ops;
+
+ SET_NETDEV_DEVTYPE(dev, &vlan_type);
- if (rc > 0) {
- rc += VLAN_HLEN;
- } else if (rc < 0) {
- rc -= VLAN_HLEN;
+ netdev_lockdep_set_classes(dev);
+
+ vlan->vlan_pcpu_stats = netdev_alloc_pcpu_stats(struct vlan_pcpu_stats);
+ if (!vlan->vlan_pcpu_stats)
+ return -ENOMEM;
+
+ /* Get vlan's reference to real_dev */
+ netdev_hold(real_dev, &vlan->dev_tracker, GFP_KERNEL);
+
+ return 0;
+}
+
+/* Note: this function might be called multiple times for the same device. */
+void vlan_dev_free_egress_priority(const struct net_device *dev)
+{
+ struct vlan_priority_tci_mapping *pm;
+ struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(vlan->egress_priority_map); i++) {
+ while ((pm = vlan->egress_priority_map[i]) != NULL) {
+ vlan->egress_priority_map[i] = pm->next;
+ kfree(pm);
}
- } else {
- /* If here, then we'll just make a normal looking ethernet frame,
- * but, the hard_start_xmit method will insert the tag (it has to
- * be able to do this for bridged and other skbs that don't come
- * down the protocol stack in an orderly manner.
- */
- rc = dev->hard_header(skb, dev, type, daddr, saddr, len);
}
+}
- return rc;
+static void vlan_dev_uninit(struct net_device *dev)
+{
+ vlan_dev_free_egress_priority(dev);
}
-int vlan_dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
+static netdev_features_t vlan_dev_fix_features(struct net_device *dev,
+ netdev_features_t features)
{
- struct net_device_stats *stats = vlan_dev_get_stats(dev);
- struct vlan_ethhdr *veth = (struct vlan_ethhdr *)(skb->data);
+ struct net_device *real_dev = vlan_dev_priv(dev)->real_dev;
+ netdev_features_t old_features = features;
+ netdev_features_t lower_features;
- /* Handle non-VLAN frames if they are sent to us, for example by DHCP.
- *
- * NOTE: THIS ASSUMES DIX ETHERNET, SPECIFICALLY NOT SUPPORTING
- * OTHER THINGS LIKE FDDI/TokenRing/802.3 SNAPs...
+ lower_features = netdev_intersect_features((real_dev->vlan_features |
+ NETIF_F_RXCSUM),
+ real_dev->features);
+
+ /* Add HW_CSUM setting to preserve user ability to control
+ * checksum offload on the vlan device.
*/
+ if (lower_features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
+ lower_features |= NETIF_F_HW_CSUM;
+ features = netdev_intersect_features(features, lower_features);
+ features |= old_features & (NETIF_F_SOFT_FEATURES | NETIF_F_GSO_SOFTWARE);
- if (veth->h_vlan_proto != __constant_htons(ETH_P_8021Q)) {
- int orig_headroom = skb_headroom(skb);
- unsigned short veth_TCI;
+ return features;
+}
- /* This is not a VLAN frame...but we can fix that! */
- VLAN_DEV_INFO(dev)->cnt_encap_on_xmit++;
+static int vlan_ethtool_get_link_ksettings(struct net_device *dev,
+ struct ethtool_link_ksettings *cmd)
+{
+ const struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
-#ifdef VLAN_DEBUG
- printk(VLAN_DBG "%s: proto to encap: 0x%hx (hbo)\n",
- __FUNCTION__, htons(veth->h_vlan_proto));
-#endif
- /* Construct the second two bytes. This field looks something
- * like:
- * usr_priority: 3 bits (high bits)
- * CFI 1 bit
- * VLAN ID 12 bits (low bits)
- */
- veth_TCI = VLAN_DEV_INFO(dev)->vlan_id;
- veth_TCI |= vlan_dev_get_egress_qos_mask(dev, skb);
+ return __ethtool_get_link_ksettings(vlan->real_dev, cmd);
+}
- skb = __vlan_put_tag(skb, veth_TCI);
- if (!skb) {
- stats->tx_dropped++;
- return 0;
- }
+static void vlan_ethtool_get_drvinfo(struct net_device *dev,
+ struct ethtool_drvinfo *info)
+{
+ strscpy(info->driver, vlan_fullname, sizeof(info->driver));
+ strscpy(info->version, vlan_version, sizeof(info->version));
+ strscpy(info->fw_version, "N/A", sizeof(info->fw_version));
+}
- if (orig_headroom < VLAN_HLEN) {
- VLAN_DEV_INFO(dev)->cnt_inc_headroom_on_tx++;
- }
+static int vlan_ethtool_get_ts_info(struct net_device *dev,
+ struct kernel_ethtool_ts_info *info)
+{
+ const struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
+ return ethtool_get_ts_info_by_layer(vlan->real_dev, info);
+}
+
+static void vlan_dev_get_stats64(struct net_device *dev,
+ struct rtnl_link_stats64 *stats)
+{
+ struct vlan_pcpu_stats *p;
+ u32 rx_errors = 0, tx_dropped = 0;
+ int i;
+
+ for_each_possible_cpu(i) {
+ u64 rxpackets, rxbytes, rxmulticast, txpackets, txbytes;
+ unsigned int start;
+
+ p = per_cpu_ptr(vlan_dev_priv(dev)->vlan_pcpu_stats, i);
+ do {
+ start = u64_stats_fetch_begin(&p->syncp);
+ rxpackets = u64_stats_read(&p->rx_packets);
+ rxbytes = u64_stats_read(&p->rx_bytes);
+ rxmulticast = u64_stats_read(&p->rx_multicast);
+ txpackets = u64_stats_read(&p->tx_packets);
+ txbytes = u64_stats_read(&p->tx_bytes);
+ } while (u64_stats_fetch_retry(&p->syncp, start));
+
+ stats->rx_packets += rxpackets;
+ stats->rx_bytes += rxbytes;
+ stats->multicast += rxmulticast;
+ stats->tx_packets += txpackets;
+ stats->tx_bytes += txbytes;
+ /* rx_errors & tx_dropped are u32 */
+ rx_errors += READ_ONCE(p->rx_errors);
+ tx_dropped += READ_ONCE(p->tx_dropped);
}
+ stats->rx_errors = rx_errors;
+ stats->tx_dropped = tx_dropped;
+}
-#ifdef VLAN_DEBUG
- printk(VLAN_DBG "%s: about to send skb: %p to dev: %s\n",
- __FUNCTION__, skb, skb->dev->name);
- printk(VLAN_DBG " %2hx.%2hx.%2hx.%2xh.%2hx.%2hx %2hx.%2hx.%2hx.%2hx.%2hx.%2hx %4hx %4hx %4hx\n",
- veth->h_dest[0], veth->h_dest[1], veth->h_dest[2], veth->h_dest[3], veth->h_dest[4], veth->h_dest[5],
- veth->h_source[0], veth->h_source[1], veth->h_source[2], veth->h_source[3], veth->h_source[4], veth->h_source[5],
- veth->h_vlan_proto, veth->h_vlan_TCI, veth->h_vlan_encapsulated_proto);
-#endif
+#ifdef CONFIG_NET_POLL_CONTROLLER
+static void vlan_dev_poll_controller(struct net_device *dev)
+{
+ return;
+}
- stats->tx_packets++; /* for statics only */
- stats->tx_bytes += skb->len;
+static int vlan_dev_netpoll_setup(struct net_device *dev)
+{
+ struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
+ struct net_device *real_dev = vlan->real_dev;
+ struct netpoll *netpoll;
+ int err = 0;
+
+ netpoll = kzalloc(sizeof(*netpoll), GFP_KERNEL);
+ err = -ENOMEM;
+ if (!netpoll)
+ goto out;
+
+ err = __netpoll_setup(netpoll, real_dev);
+ if (err) {
+ kfree(netpoll);
+ goto out;
+ }
- skb->dev = VLAN_DEV_INFO(dev)->real_dev;
- dev_queue_xmit(skb);
+ vlan->netpoll = netpoll;
- return 0;
+out:
+ return err;
}
-int vlan_dev_hwaccel_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
+static void vlan_dev_netpoll_cleanup(struct net_device *dev)
{
- struct net_device_stats *stats = vlan_dev_get_stats(dev);
- unsigned short veth_TCI;
+ struct vlan_dev_priv *vlan= vlan_dev_priv(dev);
+ struct netpoll *netpoll = vlan->netpoll;
- /* Construct the second two bytes. This field looks something
- * like:
- * usr_priority: 3 bits (high bits)
- * CFI 1 bit
- * VLAN ID 12 bits (low bits)
- */
- veth_TCI = VLAN_DEV_INFO(dev)->vlan_id;
- veth_TCI |= vlan_dev_get_egress_qos_mask(dev, skb);
- skb = __vlan_hwaccel_put_tag(skb, veth_TCI);
+ if (!netpoll)
+ return;
- stats->tx_packets++;
- stats->tx_bytes += skb->len;
+ vlan->netpoll = NULL;
+ __netpoll_free(netpoll);
+}
+#endif /* CONFIG_NET_POLL_CONTROLLER */
- skb->dev = VLAN_DEV_INFO(dev)->real_dev;
- dev_queue_xmit(skb);
+static int vlan_dev_get_iflink(const struct net_device *dev)
+{
+ const struct net_device *real_dev = vlan_dev_priv(dev)->real_dev;
- return 0;
+ return READ_ONCE(real_dev->ifindex);
}
-int vlan_dev_change_mtu(struct net_device *dev, int new_mtu)
+static int vlan_dev_fill_forward_path(struct net_device_path_ctx *ctx,
+ struct net_device_path *path)
{
- /* TODO: gotta make sure the underlying layer can handle it,
- * maybe an IFF_VLAN_CAPABLE flag for devices?
- */
- if (VLAN_DEV_INFO(dev)->real_dev->mtu < new_mtu)
- return -ERANGE;
+ struct vlan_dev_priv *vlan = vlan_dev_priv(ctx->dev);
+
+ path->type = DEV_PATH_VLAN;
+ path->encap.id = vlan->vlan_id;
+ path->encap.proto = vlan->vlan_proto;
+ path->dev = ctx->dev;
+ ctx->dev = vlan->real_dev;
+ if (ctx->num_vlans >= ARRAY_SIZE(ctx->vlan))
+ return -ENOSPC;
- dev->mtu = new_mtu;
+ ctx->vlan[ctx->num_vlans].id = vlan->vlan_id;
+ ctx->vlan[ctx->num_vlans].proto = vlan->vlan_proto;
+ ctx->num_vlans++;
return 0;
}
-int vlan_dev_set_ingress_priority(char *dev_name, __u32 skb_prio, short vlan_prio)
+#if IS_ENABLED(CONFIG_MACSEC)
+
+static const struct macsec_ops *vlan_get_macsec_ops(const struct macsec_context *ctx)
{
- struct net_device *dev = dev_get_by_name(dev_name);
+ return vlan_dev_priv(ctx->netdev)->real_dev->macsec_ops;
+}
- if (dev) {
- if (dev->priv_flags & IFF_802_1Q_VLAN) {
- /* see if a priority mapping exists.. */
- VLAN_DEV_INFO(dev)->ingress_priority_map[vlan_prio & 0x7] = skb_prio;
- dev_put(dev);
- return 0;
- }
+static int vlan_macsec_offload(int (* const func)(struct macsec_context *),
+ struct macsec_context *ctx)
+{
+ if (unlikely(!func))
+ return 0;
- dev_put(dev);
- }
- return -EINVAL;
+ return (*func)(ctx);
}
-int vlan_dev_set_egress_priority(char *dev_name, __u32 skb_prio, short vlan_prio)
+static int vlan_macsec_dev_open(struct macsec_context *ctx)
{
- struct net_device *dev = dev_get_by_name(dev_name);
- struct vlan_priority_tci_mapping *mp = NULL;
- struct vlan_priority_tci_mapping *np;
-
- if (dev) {
- if (dev->priv_flags & IFF_802_1Q_VLAN) {
- /* See if a priority mapping exists.. */
- mp = VLAN_DEV_INFO(dev)->egress_priority_map[skb_prio & 0xF];
- while (mp) {
- if (mp->priority == skb_prio) {
- mp->vlan_qos = ((vlan_prio << 13) & 0xE000);
- dev_put(dev);
- return 0;
- }
- mp = mp->next;
- }
-
- /* Create a new mapping then. */
- mp = VLAN_DEV_INFO(dev)->egress_priority_map[skb_prio & 0xF];
- np = kmalloc(sizeof(struct vlan_priority_tci_mapping), GFP_KERNEL);
- if (np) {
- np->next = mp;
- np->priority = skb_prio;
- np->vlan_qos = ((vlan_prio << 13) & 0xE000);
- VLAN_DEV_INFO(dev)->egress_priority_map[skb_prio & 0xF] = np;
- dev_put(dev);
- return 0;
- } else {
- dev_put(dev);
- return -ENOBUFS;
- }
- }
- dev_put(dev);
- }
- return -EINVAL;
-}
-
-/* Flags are defined in the vlan_dev_info class in include/linux/if_vlan.h file. */
-int vlan_dev_set_vlan_flag(char *dev_name, __u32 flag, short flag_val)
-{
- struct net_device *dev = dev_get_by_name(dev_name);
-
- if (dev) {
- if (dev->priv_flags & IFF_802_1Q_VLAN) {
- /* verify flag is supported */
- if (flag == 1) {
- if (flag_val) {
- VLAN_DEV_INFO(dev)->flags |= 1;
- } else {
- VLAN_DEV_INFO(dev)->flags &= ~1;
- }
- dev_put(dev);
- return 0;
- } else {
- printk(KERN_ERR "%s: flag %i is not valid.\n",
- __FUNCTION__, (int)(flag));
- dev_put(dev);
- return -EINVAL;
- }
- } else {
- printk(KERN_ERR
- "%s: %s is not a vlan device, priv_flags: %hX.\n",
- __FUNCTION__, dev->name, dev->priv_flags);
- dev_put(dev);
- }
- } else {
- printk(KERN_ERR "%s: Could not find device: %s\n",
- __FUNCTION__, dev_name);
- }
+ const struct macsec_ops *ops = vlan_get_macsec_ops(ctx);
+
+ if (!ops)
+ return -EOPNOTSUPP;
- return -EINVAL;
+ return vlan_macsec_offload(ops->mdo_dev_open, ctx);
}
+static int vlan_macsec_dev_stop(struct macsec_context *ctx)
+{
+ const struct macsec_ops *ops = vlan_get_macsec_ops(ctx);
+
+ if (!ops)
+ return -EOPNOTSUPP;
-int vlan_dev_get_realdev_name(const char *dev_name, char* result)
+ return vlan_macsec_offload(ops->mdo_dev_stop, ctx);
+}
+
+static int vlan_macsec_add_secy(struct macsec_context *ctx)
{
- struct net_device *dev = dev_get_by_name(dev_name);
- int rv = 0;
- if (dev) {
- if (dev->priv_flags & IFF_802_1Q_VLAN) {
- strncpy(result, VLAN_DEV_INFO(dev)->real_dev->name, 23);
- rv = 0;
- } else {
- rv = -EINVAL;
- }
- dev_put(dev);
- } else {
- rv = -ENODEV;
- }
- return rv;
+ const struct macsec_ops *ops = vlan_get_macsec_ops(ctx);
+
+ if (!ops)
+ return -EOPNOTSUPP;
+
+ return vlan_macsec_offload(ops->mdo_add_secy, ctx);
}
-int vlan_dev_get_vid(const char *dev_name, unsigned short* result)
+static int vlan_macsec_upd_secy(struct macsec_context *ctx)
{
- struct net_device *dev = dev_get_by_name(dev_name);
- int rv = 0;
- if (dev) {
- if (dev->priv_flags & IFF_802_1Q_VLAN) {
- *result = VLAN_DEV_INFO(dev)->vlan_id;
- rv = 0;
- } else {
- rv = -EINVAL;
- }
- dev_put(dev);
- } else {
- rv = -ENODEV;
- }
- return rv;
+ const struct macsec_ops *ops = vlan_get_macsec_ops(ctx);
+
+ if (!ops)
+ return -EOPNOTSUPP;
+
+ return vlan_macsec_offload(ops->mdo_upd_secy, ctx);
}
+static int vlan_macsec_del_secy(struct macsec_context *ctx)
+{
+ const struct macsec_ops *ops = vlan_get_macsec_ops(ctx);
+
+ if (!ops)
+ return -EOPNOTSUPP;
+
+ return vlan_macsec_offload(ops->mdo_del_secy, ctx);
+}
-int vlan_dev_set_mac_address(struct net_device *dev, void *addr_struct_p)
+static int vlan_macsec_add_rxsc(struct macsec_context *ctx)
{
- struct sockaddr *addr = (struct sockaddr *)(addr_struct_p);
- int i;
+ const struct macsec_ops *ops = vlan_get_macsec_ops(ctx);
- if (netif_running(dev))
- return -EBUSY;
+ if (!ops)
+ return -EOPNOTSUPP;
- memcpy(dev->dev_addr, addr->sa_data, dev->addr_len);
+ return vlan_macsec_offload(ops->mdo_add_rxsc, ctx);
+}
- printk("%s: Setting MAC address to ", dev->name);
- for (i = 0; i < 6; i++)
- printk(" %2.2x", dev->dev_addr[i]);
- printk(".\n");
+static int vlan_macsec_upd_rxsc(struct macsec_context *ctx)
+{
+ const struct macsec_ops *ops = vlan_get_macsec_ops(ctx);
- if (memcmp(VLAN_DEV_INFO(dev)->real_dev->dev_addr,
- dev->dev_addr,
- dev->addr_len) != 0) {
- if (!(VLAN_DEV_INFO(dev)->real_dev->flags & IFF_PROMISC)) {
- int flgs = VLAN_DEV_INFO(dev)->real_dev->flags;
+ if (!ops)
+ return -EOPNOTSUPP;
- /* Increment our in-use promiscuity counter */
- dev_set_promiscuity(VLAN_DEV_INFO(dev)->real_dev, 1);
+ return vlan_macsec_offload(ops->mdo_upd_rxsc, ctx);
+}
- /* Make PROMISC visible to the user. */
- flgs |= IFF_PROMISC;
- printk("VLAN (%s): Setting underlying device (%s) to promiscious mode.\n",
- dev->name, VLAN_DEV_INFO(dev)->real_dev->name);
- dev_change_flags(VLAN_DEV_INFO(dev)->real_dev, flgs);
- }
- } else {
- printk("VLAN (%s): Underlying device (%s) has same MAC, not checking promiscious mode.\n",
- dev->name, VLAN_DEV_INFO(dev)->real_dev->name);
- }
+static int vlan_macsec_del_rxsc(struct macsec_context *ctx)
+{
+ const struct macsec_ops *ops = vlan_get_macsec_ops(ctx);
- return 0;
+ if (!ops)
+ return -EOPNOTSUPP;
+
+ return vlan_macsec_offload(ops->mdo_del_rxsc, ctx);
}
-static inline int vlan_dmi_equals(struct dev_mc_list *dmi1,
- struct dev_mc_list *dmi2)
+static int vlan_macsec_add_rxsa(struct macsec_context *ctx)
{
- return ((dmi1->dmi_addrlen == dmi2->dmi_addrlen) &&
- (memcmp(dmi1->dmi_addr, dmi2->dmi_addr, dmi1->dmi_addrlen) == 0));
+ const struct macsec_ops *ops = vlan_get_macsec_ops(ctx);
+
+ if (!ops)
+ return -EOPNOTSUPP;
+
+ return vlan_macsec_offload(ops->mdo_add_rxsa, ctx);
}
-/** dmi is a single entry into a dev_mc_list, a single node. mc_list is
- * an entire list, and we'll iterate through it.
- */
-static int vlan_should_add_mc(struct dev_mc_list *dmi, struct dev_mc_list *mc_list)
-{
- struct dev_mc_list *idmi;
-
- for (idmi = mc_list; idmi != NULL; ) {
- if (vlan_dmi_equals(dmi, idmi)) {
- if (dmi->dmi_users > idmi->dmi_users)
- return 1;
- else
- return 0;
- } else {
- idmi = idmi->next;
- }
- }
+static int vlan_macsec_upd_rxsa(struct macsec_context *ctx)
+{
+ const struct macsec_ops *ops = vlan_get_macsec_ops(ctx);
+
+ if (!ops)
+ return -EOPNOTSUPP;
- return 1;
+ return vlan_macsec_offload(ops->mdo_upd_rxsa, ctx);
}
-static inline void vlan_destroy_mc_list(struct dev_mc_list *mc_list)
+static int vlan_macsec_del_rxsa(struct macsec_context *ctx)
{
- struct dev_mc_list *dmi = mc_list;
- struct dev_mc_list *next;
+ const struct macsec_ops *ops = vlan_get_macsec_ops(ctx);
- while(dmi) {
- next = dmi->next;
- kfree(dmi);
- dmi = next;
- }
+ if (!ops)
+ return -EOPNOTSUPP;
+
+ return vlan_macsec_offload(ops->mdo_del_rxsa, ctx);
}
-static void vlan_copy_mc_list(struct dev_mc_list *mc_list, struct vlan_dev_info *vlan_info)
+static int vlan_macsec_add_txsa(struct macsec_context *ctx)
{
- struct dev_mc_list *dmi, *new_dmi;
+ const struct macsec_ops *ops = vlan_get_macsec_ops(ctx);
- vlan_destroy_mc_list(vlan_info->old_mc_list);
- vlan_info->old_mc_list = NULL;
+ if (!ops)
+ return -EOPNOTSUPP;
- for (dmi = mc_list; dmi != NULL; dmi = dmi->next) {
- new_dmi = kmalloc(sizeof(*new_dmi), GFP_ATOMIC);
- if (new_dmi == NULL) {
- printk(KERN_ERR "vlan: cannot allocate memory. "
- "Multicast may not work properly from now.\n");
- return;
- }
+ return vlan_macsec_offload(ops->mdo_add_txsa, ctx);
+}
- /* Copy whole structure, then make new 'next' pointer */
- *new_dmi = *dmi;
- new_dmi->next = vlan_info->old_mc_list;
- vlan_info->old_mc_list = new_dmi;
- }
+static int vlan_macsec_upd_txsa(struct macsec_context *ctx)
+{
+ const struct macsec_ops *ops = vlan_get_macsec_ops(ctx);
+
+ if (!ops)
+ return -EOPNOTSUPP;
+
+ return vlan_macsec_offload(ops->mdo_upd_txsa, ctx);
}
-static void vlan_flush_mc_list(struct net_device *dev)
+static int vlan_macsec_del_txsa(struct macsec_context *ctx)
{
- struct dev_mc_list *dmi = dev->mc_list;
+ const struct macsec_ops *ops = vlan_get_macsec_ops(ctx);
- while (dmi) {
- printk(KERN_DEBUG "%s: del %.2x:%.2x:%.2x:%.2x:%.2x:%.2x mcast address from vlan interface\n",
- dev->name,
- dmi->dmi_addr[0],
- dmi->dmi_addr[1],
- dmi->dmi_addr[2],
- dmi->dmi_addr[3],
- dmi->dmi_addr[4],
- dmi->dmi_addr[5]);
- dev_mc_delete(dev, dmi->dmi_addr, dmi->dmi_addrlen, 0);
- dmi = dev->mc_list;
- }
+ if (!ops)
+ return -EOPNOTSUPP;
- /* dev->mc_list is NULL by the time we get here. */
- vlan_destroy_mc_list(VLAN_DEV_INFO(dev)->old_mc_list);
- VLAN_DEV_INFO(dev)->old_mc_list = NULL;
+ return vlan_macsec_offload(ops->mdo_del_txsa, ctx);
}
-int vlan_dev_open(struct net_device *dev)
+static int vlan_macsec_get_dev_stats(struct macsec_context *ctx)
{
- if (!(VLAN_DEV_INFO(dev)->real_dev->flags & IFF_UP))
- return -ENETDOWN;
+ const struct macsec_ops *ops = vlan_get_macsec_ops(ctx);
- return 0;
+ if (!ops)
+ return -EOPNOTSUPP;
+
+ return vlan_macsec_offload(ops->mdo_get_dev_stats, ctx);
}
-int vlan_dev_stop(struct net_device *dev)
+static int vlan_macsec_get_tx_sc_stats(struct macsec_context *ctx)
{
- vlan_flush_mc_list(dev);
- return 0;
+ const struct macsec_ops *ops = vlan_get_macsec_ops(ctx);
+
+ if (!ops)
+ return -EOPNOTSUPP;
+
+ return vlan_macsec_offload(ops->mdo_get_tx_sc_stats, ctx);
}
-int vlan_dev_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+static int vlan_macsec_get_tx_sa_stats(struct macsec_context *ctx)
{
- struct net_device *real_dev = VLAN_DEV_INFO(dev)->real_dev;
- struct ifreq ifrr;
- int err = -EOPNOTSUPP;
+ const struct macsec_ops *ops = vlan_get_macsec_ops(ctx);
- strncpy(ifrr.ifr_name, real_dev->name, IFNAMSIZ);
- ifrr.ifr_ifru = ifr->ifr_ifru;
+ if (!ops)
+ return -EOPNOTSUPP;
- switch(cmd) {
- case SIOCGMIIPHY:
- case SIOCGMIIREG:
- case SIOCSMIIREG:
- if (real_dev->do_ioctl && netif_device_present(real_dev))
- err = real_dev->do_ioctl(real_dev, &ifrr, cmd);
- break;
+ return vlan_macsec_offload(ops->mdo_get_tx_sa_stats, ctx);
+}
- case SIOCETHTOOL:
- err = dev_ethtool(&ifrr);
- }
+static int vlan_macsec_get_rx_sc_stats(struct macsec_context *ctx)
+{
+ const struct macsec_ops *ops = vlan_get_macsec_ops(ctx);
- if (!err)
- ifr->ifr_ifru = ifrr.ifr_ifru;
+ if (!ops)
+ return -EOPNOTSUPP;
- return err;
+ return vlan_macsec_offload(ops->mdo_get_rx_sc_stats, ctx);
}
-/** Taken from Gleb + Lennert's VLAN code, and modified... */
-void vlan_dev_set_multicast_list(struct net_device *vlan_dev)
+static int vlan_macsec_get_rx_sa_stats(struct macsec_context *ctx)
{
- struct dev_mc_list *dmi;
- struct net_device *real_dev;
- int inc;
+ const struct macsec_ops *ops = vlan_get_macsec_ops(ctx);
- if (vlan_dev && (vlan_dev->priv_flags & IFF_802_1Q_VLAN)) {
- /* Then it's a real vlan device, as far as we can tell.. */
- real_dev = VLAN_DEV_INFO(vlan_dev)->real_dev;
+ if (!ops)
+ return -EOPNOTSUPP;
- /* compare the current promiscuity to the last promisc we had.. */
- inc = vlan_dev->promiscuity - VLAN_DEV_INFO(vlan_dev)->old_promiscuity;
- if (inc) {
- printk(KERN_INFO "%s: dev_set_promiscuity(master, %d)\n",
- vlan_dev->name, inc);
- dev_set_promiscuity(real_dev, inc); /* found in dev.c */
- VLAN_DEV_INFO(vlan_dev)->old_promiscuity = vlan_dev->promiscuity;
- }
+ return vlan_macsec_offload(ops->mdo_get_rx_sa_stats, ctx);
+}
- inc = vlan_dev->allmulti - VLAN_DEV_INFO(vlan_dev)->old_allmulti;
- if (inc) {
- printk(KERN_INFO "%s: dev_set_allmulti(master, %d)\n",
- vlan_dev->name, inc);
- dev_set_allmulti(real_dev, inc); /* dev.c */
- VLAN_DEV_INFO(vlan_dev)->old_allmulti = vlan_dev->allmulti;
- }
+static const struct macsec_ops macsec_offload_ops = {
+ /* Device wide */
+ .mdo_dev_open = vlan_macsec_dev_open,
+ .mdo_dev_stop = vlan_macsec_dev_stop,
+ /* SecY */
+ .mdo_add_secy = vlan_macsec_add_secy,
+ .mdo_upd_secy = vlan_macsec_upd_secy,
+ .mdo_del_secy = vlan_macsec_del_secy,
+ /* Security channels */
+ .mdo_add_rxsc = vlan_macsec_add_rxsc,
+ .mdo_upd_rxsc = vlan_macsec_upd_rxsc,
+ .mdo_del_rxsc = vlan_macsec_del_rxsc,
+ /* Security associations */
+ .mdo_add_rxsa = vlan_macsec_add_rxsa,
+ .mdo_upd_rxsa = vlan_macsec_upd_rxsa,
+ .mdo_del_rxsa = vlan_macsec_del_rxsa,
+ .mdo_add_txsa = vlan_macsec_add_txsa,
+ .mdo_upd_txsa = vlan_macsec_upd_txsa,
+ .mdo_del_txsa = vlan_macsec_del_txsa,
+ /* Statistics */
+ .mdo_get_dev_stats = vlan_macsec_get_dev_stats,
+ .mdo_get_tx_sc_stats = vlan_macsec_get_tx_sc_stats,
+ .mdo_get_tx_sa_stats = vlan_macsec_get_tx_sa_stats,
+ .mdo_get_rx_sc_stats = vlan_macsec_get_rx_sc_stats,
+ .mdo_get_rx_sa_stats = vlan_macsec_get_rx_sa_stats,
+};
- /* looking for addresses to add to master's list */
- for (dmi = vlan_dev->mc_list; dmi != NULL; dmi = dmi->next) {
- if (vlan_should_add_mc(dmi, VLAN_DEV_INFO(vlan_dev)->old_mc_list)) {
- dev_mc_add(real_dev, dmi->dmi_addr, dmi->dmi_addrlen, 0);
- printk(KERN_DEBUG "%s: add %.2x:%.2x:%.2x:%.2x:%.2x:%.2x mcast address to master interface\n",
- vlan_dev->name,
- dmi->dmi_addr[0],
- dmi->dmi_addr[1],
- dmi->dmi_addr[2],
- dmi->dmi_addr[3],
- dmi->dmi_addr[4],
- dmi->dmi_addr[5]);
- }
- }
+#endif
- /* looking for addresses to delete from master's list */
- for (dmi = VLAN_DEV_INFO(vlan_dev)->old_mc_list; dmi != NULL; dmi = dmi->next) {
- if (vlan_should_add_mc(dmi, vlan_dev->mc_list)) {
- /* if we think we should add it to the new list, then we should really
- * delete it from the real list on the underlying device.
- */
- dev_mc_delete(real_dev, dmi->dmi_addr, dmi->dmi_addrlen, 0);
- printk(KERN_DEBUG "%s: del %.2x:%.2x:%.2x:%.2x:%.2x:%.2x mcast address from master interface\n",
- vlan_dev->name,
- dmi->dmi_addr[0],
- dmi->dmi_addr[1],
- dmi->dmi_addr[2],
- dmi->dmi_addr[3],
- dmi->dmi_addr[4],
- dmi->dmi_addr[5]);
- }
- }
+static const struct ethtool_ops vlan_ethtool_ops = {
+ .get_link_ksettings = vlan_ethtool_get_link_ksettings,
+ .get_drvinfo = vlan_ethtool_get_drvinfo,
+ .get_link = ethtool_op_get_link,
+ .get_ts_info = vlan_ethtool_get_ts_info,
+};
+
+static const struct net_device_ops vlan_netdev_ops = {
+ .ndo_change_mtu = vlan_dev_change_mtu,
+ .ndo_init = vlan_dev_init,
+ .ndo_uninit = vlan_dev_uninit,
+ .ndo_open = vlan_dev_open,
+ .ndo_stop = vlan_dev_stop,
+ .ndo_start_xmit = vlan_dev_hard_start_xmit,
+ .ndo_validate_addr = eth_validate_addr,
+ .ndo_set_mac_address = vlan_dev_set_mac_address,
+ .ndo_set_rx_mode = vlan_dev_set_rx_mode,
+ .ndo_change_rx_flags = vlan_dev_change_rx_flags,
+ .ndo_eth_ioctl = vlan_dev_ioctl,
+ .ndo_neigh_setup = vlan_dev_neigh_setup,
+ .ndo_get_stats64 = vlan_dev_get_stats64,
+#if IS_ENABLED(CONFIG_FCOE)
+ .ndo_fcoe_ddp_setup = vlan_dev_fcoe_ddp_setup,
+ .ndo_fcoe_ddp_done = vlan_dev_fcoe_ddp_done,
+ .ndo_fcoe_enable = vlan_dev_fcoe_enable,
+ .ndo_fcoe_disable = vlan_dev_fcoe_disable,
+ .ndo_fcoe_ddp_target = vlan_dev_fcoe_ddp_target,
+#endif
+#ifdef NETDEV_FCOE_WWNN
+ .ndo_fcoe_get_wwn = vlan_dev_fcoe_get_wwn,
+#endif
+#ifdef CONFIG_NET_POLL_CONTROLLER
+ .ndo_poll_controller = vlan_dev_poll_controller,
+ .ndo_netpoll_setup = vlan_dev_netpoll_setup,
+ .ndo_netpoll_cleanup = vlan_dev_netpoll_cleanup,
+#endif
+ .ndo_fix_features = vlan_dev_fix_features,
+ .ndo_get_iflink = vlan_dev_get_iflink,
+ .ndo_fill_forward_path = vlan_dev_fill_forward_path,
+ .ndo_hwtstamp_get = vlan_hwtstamp_get,
+ .ndo_hwtstamp_set = vlan_hwtstamp_set,
+};
+
+static void vlan_dev_free(struct net_device *dev)
+{
+ struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
- /* save multicast list */
- vlan_copy_mc_list(vlan_dev->mc_list, VLAN_DEV_INFO(vlan_dev));
- }
+ free_percpu(vlan->vlan_pcpu_stats);
+ vlan->vlan_pcpu_stats = NULL;
+
+ /* Get rid of the vlan's reference to real_dev */
+ netdev_put(vlan->real_dev, &vlan->dev_tracker);
+}
+
+void vlan_setup(struct net_device *dev)
+{
+ ether_setup(dev);
+
+ dev->priv_flags |= IFF_802_1Q_VLAN | IFF_NO_QUEUE;
+ dev->priv_flags |= IFF_UNICAST_FLT;
+ dev->priv_flags &= ~IFF_TX_SKB_SHARING;
+ netif_keep_dst(dev);
+
+ dev->netdev_ops = &vlan_netdev_ops;
+ dev->needs_free_netdev = true;
+ dev->priv_destructor = vlan_dev_free;
+ dev->ethtool_ops = &vlan_ethtool_ops;
+
+#if IS_ENABLED(CONFIG_MACSEC)
+ dev->macsec_ops = &macsec_offload_ops;
+#endif
+ dev->min_mtu = 0;
+ dev->max_mtu = ETH_MAX_MTU;
+
+ eth_zero_addr(dev->broadcast);
}
diff --git a/net/8021q/vlan_gvrp.c b/net/8021q/vlan_gvrp.c
new file mode 100644
index 000000000000..6b34b72aa466
--- /dev/null
+++ b/net/8021q/vlan_gvrp.c
@@ -0,0 +1,67 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * IEEE 802.1Q GARP VLAN Registration Protocol (GVRP)
+ *
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ */
+#include <linux/types.h>
+#include <linux/if_vlan.h>
+#include <net/garp.h>
+#include "vlan.h"
+
+#define GARP_GVRP_ADDRESS { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x21 }
+
+enum gvrp_attributes {
+ GVRP_ATTR_INVALID,
+ GVRP_ATTR_VID,
+ __GVRP_ATTR_MAX
+};
+#define GVRP_ATTR_MAX (__GVRP_ATTR_MAX - 1)
+
+static struct garp_application vlan_gvrp_app __read_mostly = {
+ .proto.group_address = GARP_GVRP_ADDRESS,
+ .maxattr = GVRP_ATTR_MAX,
+ .type = GARP_APPLICATION_GVRP,
+};
+
+int vlan_gvrp_request_join(const struct net_device *dev)
+{
+ const struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
+ __be16 vlan_id = htons(vlan->vlan_id);
+
+ if (vlan->vlan_proto != htons(ETH_P_8021Q))
+ return 0;
+ return garp_request_join(vlan->real_dev, &vlan_gvrp_app,
+ &vlan_id, sizeof(vlan_id), GVRP_ATTR_VID);
+}
+
+void vlan_gvrp_request_leave(const struct net_device *dev)
+{
+ const struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
+ __be16 vlan_id = htons(vlan->vlan_id);
+
+ if (vlan->vlan_proto != htons(ETH_P_8021Q))
+ return;
+ garp_request_leave(vlan->real_dev, &vlan_gvrp_app,
+ &vlan_id, sizeof(vlan_id), GVRP_ATTR_VID);
+}
+
+int vlan_gvrp_init_applicant(struct net_device *dev)
+{
+ return garp_init_applicant(dev, &vlan_gvrp_app);
+}
+
+void vlan_gvrp_uninit_applicant(struct net_device *dev)
+{
+ garp_uninit_applicant(dev, &vlan_gvrp_app);
+}
+
+int __init vlan_gvrp_init(void)
+{
+ return garp_register_application(&vlan_gvrp_app);
+}
+
+void vlan_gvrp_uninit(void)
+{
+ garp_unregister_application(&vlan_gvrp_app);
+}
diff --git a/net/8021q/vlan_mvrp.c b/net/8021q/vlan_mvrp.c
new file mode 100644
index 000000000000..689eceeaa360
--- /dev/null
+++ b/net/8021q/vlan_mvrp.c
@@ -0,0 +1,73 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * IEEE 802.1Q Multiple VLAN Registration Protocol (MVRP)
+ *
+ * Copyright (c) 2012 Massachusetts Institute of Technology
+ *
+ * Adapted from code in net/8021q/vlan_gvrp.c
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ */
+#include <linux/types.h>
+#include <linux/if_ether.h>
+#include <linux/if_vlan.h>
+#include <net/mrp.h>
+#include "vlan.h"
+
+#define MRP_MVRP_ADDRESS { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x21 }
+
+enum mvrp_attributes {
+ MVRP_ATTR_INVALID,
+ MVRP_ATTR_VID,
+ __MVRP_ATTR_MAX
+};
+#define MVRP_ATTR_MAX (__MVRP_ATTR_MAX - 1)
+
+static struct mrp_application vlan_mrp_app __read_mostly = {
+ .type = MRP_APPLICATION_MVRP,
+ .maxattr = MVRP_ATTR_MAX,
+ .pkttype.type = htons(ETH_P_MVRP),
+ .group_address = MRP_MVRP_ADDRESS,
+ .version = 0,
+};
+
+int vlan_mvrp_request_join(const struct net_device *dev)
+{
+ const struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
+ __be16 vlan_id = htons(vlan->vlan_id);
+
+ if (vlan->vlan_proto != htons(ETH_P_8021Q))
+ return 0;
+ return mrp_request_join(vlan->real_dev, &vlan_mrp_app,
+ &vlan_id, sizeof(vlan_id), MVRP_ATTR_VID);
+}
+
+void vlan_mvrp_request_leave(const struct net_device *dev)
+{
+ const struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
+ __be16 vlan_id = htons(vlan->vlan_id);
+
+ if (vlan->vlan_proto != htons(ETH_P_8021Q))
+ return;
+ mrp_request_leave(vlan->real_dev, &vlan_mrp_app,
+ &vlan_id, sizeof(vlan_id), MVRP_ATTR_VID);
+}
+
+int vlan_mvrp_init_applicant(struct net_device *dev)
+{
+ return mrp_init_applicant(dev, &vlan_mrp_app);
+}
+
+void vlan_mvrp_uninit_applicant(struct net_device *dev)
+{
+ mrp_uninit_applicant(dev, &vlan_mrp_app);
+}
+
+int __init vlan_mvrp_init(void)
+{
+ return mrp_register_application(&vlan_mrp_app);
+}
+
+void vlan_mvrp_uninit(void)
+{
+ mrp_unregister_application(&vlan_mrp_app);
+}
diff --git a/net/8021q/vlan_netlink.c b/net/8021q/vlan_netlink.c
new file mode 100644
index 000000000000..a000b1ef0520
--- /dev/null
+++ b/net/8021q/vlan_netlink.c
@@ -0,0 +1,315 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * VLAN netlink control interface
+ *
+ * Copyright (c) 2007 Patrick McHardy <kaber@trash.net>
+ */
+
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/if_vlan.h>
+#include <linux/module.h>
+#include <net/net_namespace.h>
+#include <net/netlink.h>
+#include <net/rtnetlink.h>
+#include "vlan.h"
+
+
+static const struct nla_policy vlan_policy[IFLA_VLAN_MAX + 1] = {
+ [IFLA_VLAN_ID] = { .type = NLA_U16 },
+ [IFLA_VLAN_FLAGS] = { .len = sizeof(struct ifla_vlan_flags) },
+ [IFLA_VLAN_EGRESS_QOS] = { .type = NLA_NESTED },
+ [IFLA_VLAN_INGRESS_QOS] = { .type = NLA_NESTED },
+ [IFLA_VLAN_PROTOCOL] = { .type = NLA_U16 },
+};
+
+static const struct nla_policy vlan_map_policy[IFLA_VLAN_QOS_MAX + 1] = {
+ [IFLA_VLAN_QOS_MAPPING] = { .len = sizeof(struct ifla_vlan_qos_mapping) },
+};
+
+
+static inline int vlan_validate_qos_map(struct nlattr *attr)
+{
+ if (!attr)
+ return 0;
+ return nla_validate_nested_deprecated(attr, IFLA_VLAN_QOS_MAX,
+ vlan_map_policy, NULL);
+}
+
+static int vlan_validate(struct nlattr *tb[], struct nlattr *data[],
+ struct netlink_ext_ack *extack)
+{
+ struct ifla_vlan_flags *flags;
+ u16 id;
+ int err;
+
+ if (tb[IFLA_ADDRESS]) {
+ if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid link address");
+ return -EINVAL;
+ }
+ if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid link address");
+ return -EADDRNOTAVAIL;
+ }
+ }
+
+ if (!data) {
+ NL_SET_ERR_MSG_MOD(extack, "VLAN properties not specified");
+ return -EINVAL;
+ }
+
+ if (data[IFLA_VLAN_PROTOCOL]) {
+ switch (nla_get_be16(data[IFLA_VLAN_PROTOCOL])) {
+ case htons(ETH_P_8021Q):
+ case htons(ETH_P_8021AD):
+ break;
+ default:
+ NL_SET_ERR_MSG_MOD(extack, "Invalid VLAN protocol");
+ return -EPROTONOSUPPORT;
+ }
+ }
+
+ if (data[IFLA_VLAN_ID]) {
+ id = nla_get_u16(data[IFLA_VLAN_ID]);
+ if (id >= VLAN_VID_MASK) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid VLAN id");
+ return -ERANGE;
+ }
+ }
+ if (data[IFLA_VLAN_FLAGS]) {
+ flags = nla_data(data[IFLA_VLAN_FLAGS]);
+ if ((flags->flags & flags->mask) &
+ ~(VLAN_FLAG_REORDER_HDR | VLAN_FLAG_GVRP |
+ VLAN_FLAG_LOOSE_BINDING | VLAN_FLAG_MVRP |
+ VLAN_FLAG_BRIDGE_BINDING)) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid VLAN flags");
+ return -EINVAL;
+ }
+ }
+
+ err = vlan_validate_qos_map(data[IFLA_VLAN_INGRESS_QOS]);
+ if (err < 0) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid ingress QOS map");
+ return err;
+ }
+ err = vlan_validate_qos_map(data[IFLA_VLAN_EGRESS_QOS]);
+ if (err < 0) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid egress QOS map");
+ return err;
+ }
+ return 0;
+}
+
+static int vlan_changelink(struct net_device *dev, struct nlattr *tb[],
+ struct nlattr *data[],
+ struct netlink_ext_ack *extack)
+{
+ struct ifla_vlan_flags *flags;
+ struct ifla_vlan_qos_mapping *m;
+ struct nlattr *attr;
+ int rem, err;
+
+ if (data[IFLA_VLAN_FLAGS]) {
+ flags = nla_data(data[IFLA_VLAN_FLAGS]);
+ err = vlan_dev_change_flags(dev, flags->flags, flags->mask);
+ if (err)
+ return err;
+ }
+ if (data[IFLA_VLAN_INGRESS_QOS]) {
+ nla_for_each_nested_type(attr, IFLA_VLAN_QOS_MAPPING,
+ data[IFLA_VLAN_INGRESS_QOS], rem) {
+ m = nla_data(attr);
+ vlan_dev_set_ingress_priority(dev, m->to, m->from);
+ }
+ }
+ if (data[IFLA_VLAN_EGRESS_QOS]) {
+ nla_for_each_nested_type(attr, IFLA_VLAN_QOS_MAPPING,
+ data[IFLA_VLAN_EGRESS_QOS], rem) {
+ m = nla_data(attr);
+ err = vlan_dev_set_egress_priority(dev, m->from, m->to);
+ if (err)
+ return err;
+ }
+ }
+ return 0;
+}
+
+static int vlan_newlink(struct net_device *dev,
+ struct rtnl_newlink_params *params,
+ struct netlink_ext_ack *extack)
+{
+ struct net *link_net = rtnl_newlink_link_net(params);
+ struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
+ struct nlattr **data = params->data;
+ struct nlattr **tb = params->tb;
+ struct net_device *real_dev;
+ unsigned int max_mtu;
+ __be16 proto;
+ int err;
+
+ if (!data[IFLA_VLAN_ID]) {
+ NL_SET_ERR_MSG_MOD(extack, "VLAN id not specified");
+ return -EINVAL;
+ }
+
+ if (!tb[IFLA_LINK]) {
+ NL_SET_ERR_MSG_MOD(extack, "link not specified");
+ return -EINVAL;
+ }
+
+ real_dev = __dev_get_by_index(link_net, nla_get_u32(tb[IFLA_LINK]));
+ if (!real_dev) {
+ NL_SET_ERR_MSG_MOD(extack, "link does not exist");
+ return -ENODEV;
+ }
+
+ proto = nla_get_be16_default(data[IFLA_VLAN_PROTOCOL],
+ htons(ETH_P_8021Q));
+
+ vlan->vlan_proto = proto;
+ vlan->vlan_id = nla_get_u16(data[IFLA_VLAN_ID]);
+ vlan->real_dev = real_dev;
+ dev->priv_flags |= (real_dev->priv_flags & IFF_XMIT_DST_RELEASE);
+ vlan->flags = VLAN_FLAG_REORDER_HDR;
+
+ err = vlan_check_real_dev(real_dev, vlan->vlan_proto, vlan->vlan_id,
+ extack);
+ if (err < 0)
+ return err;
+
+ max_mtu = netif_reduces_vlan_mtu(real_dev) ? real_dev->mtu - VLAN_HLEN :
+ real_dev->mtu;
+ if (!tb[IFLA_MTU])
+ dev->mtu = max_mtu;
+ else if (dev->mtu > max_mtu)
+ return -EINVAL;
+
+ /* Note: If this initial vlan_changelink() fails, we need
+ * to call vlan_dev_free_egress_priority() to free memory.
+ */
+ err = vlan_changelink(dev, tb, data, extack);
+
+ if (!err)
+ err = register_vlan_dev(dev, extack);
+
+ if (err)
+ vlan_dev_free_egress_priority(dev);
+ return err;
+}
+
+static inline size_t vlan_qos_map_size(unsigned int n)
+{
+ if (n == 0)
+ return 0;
+ /* IFLA_VLAN_{EGRESS,INGRESS}_QOS + n * IFLA_VLAN_QOS_MAPPING */
+ return nla_total_size(sizeof(struct nlattr)) +
+ nla_total_size(sizeof(struct ifla_vlan_qos_mapping)) * n;
+}
+
+static size_t vlan_get_size(const struct net_device *dev)
+{
+ struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
+
+ return nla_total_size(2) + /* IFLA_VLAN_PROTOCOL */
+ nla_total_size(2) + /* IFLA_VLAN_ID */
+ nla_total_size(sizeof(struct ifla_vlan_flags)) + /* IFLA_VLAN_FLAGS */
+ vlan_qos_map_size(vlan->nr_ingress_mappings) +
+ vlan_qos_map_size(vlan->nr_egress_mappings);
+}
+
+static int vlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{
+ struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
+ struct vlan_priority_tci_mapping *pm;
+ struct ifla_vlan_flags f;
+ struct ifla_vlan_qos_mapping m;
+ struct nlattr *nest;
+ unsigned int i;
+
+ if (nla_put_be16(skb, IFLA_VLAN_PROTOCOL, vlan->vlan_proto) ||
+ nla_put_u16(skb, IFLA_VLAN_ID, vlan->vlan_id))
+ goto nla_put_failure;
+ if (vlan->flags) {
+ f.flags = vlan->flags;
+ f.mask = ~0;
+ if (nla_put(skb, IFLA_VLAN_FLAGS, sizeof(f), &f))
+ goto nla_put_failure;
+ }
+ if (vlan->nr_ingress_mappings) {
+ nest = nla_nest_start_noflag(skb, IFLA_VLAN_INGRESS_QOS);
+ if (nest == NULL)
+ goto nla_put_failure;
+
+ for (i = 0; i < ARRAY_SIZE(vlan->ingress_priority_map); i++) {
+ if (!vlan->ingress_priority_map[i])
+ continue;
+
+ m.from = i;
+ m.to = vlan->ingress_priority_map[i];
+ if (nla_put(skb, IFLA_VLAN_QOS_MAPPING,
+ sizeof(m), &m))
+ goto nla_put_failure;
+ }
+ nla_nest_end(skb, nest);
+ }
+
+ if (vlan->nr_egress_mappings) {
+ nest = nla_nest_start_noflag(skb, IFLA_VLAN_EGRESS_QOS);
+ if (nest == NULL)
+ goto nla_put_failure;
+
+ for (i = 0; i < ARRAY_SIZE(vlan->egress_priority_map); i++) {
+ for (pm = vlan->egress_priority_map[i]; pm;
+ pm = pm->next) {
+ if (!pm->vlan_qos)
+ continue;
+
+ m.from = pm->priority;
+ m.to = (pm->vlan_qos >> 13) & 0x7;
+ if (nla_put(skb, IFLA_VLAN_QOS_MAPPING,
+ sizeof(m), &m))
+ goto nla_put_failure;
+ }
+ }
+ nla_nest_end(skb, nest);
+ }
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+static struct net *vlan_get_link_net(const struct net_device *dev)
+{
+ struct net_device *real_dev = vlan_dev_priv(dev)->real_dev;
+
+ return dev_net(real_dev);
+}
+
+struct rtnl_link_ops vlan_link_ops __read_mostly = {
+ .kind = "vlan",
+ .maxtype = IFLA_VLAN_MAX,
+ .policy = vlan_policy,
+ .priv_size = sizeof(struct vlan_dev_priv),
+ .setup = vlan_setup,
+ .validate = vlan_validate,
+ .newlink = vlan_newlink,
+ .changelink = vlan_changelink,
+ .dellink = unregister_vlan_dev,
+ .get_size = vlan_get_size,
+ .fill_info = vlan_fill_info,
+ .get_link_net = vlan_get_link_net,
+};
+
+int __init vlan_netlink_init(void)
+{
+ return rtnl_link_register(&vlan_link_ops);
+}
+
+void __exit vlan_netlink_fini(void)
+{
+ rtnl_link_unregister(&vlan_link_ops);
+}
+
+MODULE_ALIAS_RTNL_LINK("vlan");
diff --git a/net/8021q/vlanproc.c b/net/8021q/vlanproc.c
index a8fc0de1f969..fa67374bda49 100644
--- a/net/8021q/vlanproc.c
+++ b/net/8021q/vlanproc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/******************************************************************************
* vlanproc.c VLAN Module. /proc filesystem interface.
*
@@ -9,30 +10,23 @@
*
* Copyright: (c) 1998 Ben Greear
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
* ============================================================================
* Jan 20, 1998 Ben Greear Initial Version
*****************************************************************************/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/module.h>
-#include <linux/stddef.h> /* offsetof(), etc. */
-#include <linux/errno.h> /* return codes */
+#include <linux/errno.h>
#include <linux/kernel.h>
-#include <linux/slab.h> /* kmalloc(), kfree() */
-#include <linux/mm.h>
-#include <linux/string.h> /* inline mem*, str* functions */
-#include <linux/init.h> /* __initfunc et al. */
-#include <asm/byteorder.h> /* htons(), etc. */
-#include <asm/uaccess.h> /* copy_to_user */
-#include <asm/io.h>
+#include <linux/string.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/fs.h>
#include <linux/netdevice.h>
#include <linux/if_vlan.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
#include "vlanproc.h"
#include "vlan.h"
@@ -51,7 +45,7 @@ static int vlandev_seq_show(struct seq_file *seq, void *v);
/*
- * Names of the proc directory entries
+ * Names of the proc directory entries
*/
static const char name_root[] = "vlan";
@@ -59,75 +53,33 @@ static const char name_conf[] = "config";
/*
* Structures for interfacing with the /proc filesystem.
- * VLAN creates its own directory /proc/net/vlan with the folowing
+ * VLAN creates its own directory /proc/net/vlan with the following
* entries:
* config device status/configuration
* <device> entry for each device
*/
/*
- * Generic /proc/net/vlan/<file> file and inode operations
+ * Generic /proc/net/vlan/<file> file and inode operations
*/
-static struct seq_operations vlan_seq_ops = {
+static const struct seq_operations vlan_seq_ops = {
.start = vlan_seq_start,
.next = vlan_seq_next,
.stop = vlan_seq_stop,
.show = vlan_seq_show,
};
-static int vlan_seq_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &vlan_seq_ops);
-}
-
-static struct file_operations vlan_fops = {
- .owner = THIS_MODULE,
- .open = vlan_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
-/*
- * /proc/net/vlan/<device> file and inode operations
- */
-
-static int vlandev_seq_open(struct inode *inode, struct file *file)
-{
- return single_open(file, vlandev_seq_show, PDE(inode)->data);
-}
-
-static struct file_operations vlandev_fops = {
- .owner = THIS_MODULE,
- .open = vlandev_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
/*
- * Proc filesystem derectory entries.
+ * Proc filesystem directory entries.
*/
-/*
- * /proc/net/vlan
- */
-
-static struct proc_dir_entry *proc_vlan_dir;
-
-/*
- * /proc/net/vlan/config
- */
-
-static struct proc_dir_entry *proc_vlan_conf;
-
/* Strings */
-static const char *vlan_name_type_str[VLAN_NAME_TYPE_HIGHEST] = {
- [VLAN_NAME_TYPE_RAW_PLUS_VID] = "VLAN_NAME_TYPE_RAW_PLUS_VID",
- [VLAN_NAME_TYPE_PLUS_VID_NO_PAD] = "VLAN_NAME_TYPE_PLUS_VID_NO_PAD",
- [VLAN_NAME_TYPE_RAW_PLUS_VID_NO_PAD]= "VLAN_NAME_TYPE_RAW_PLUS_VID_NO_PAD",
- [VLAN_NAME_TYPE_PLUS_VID] = "VLAN_NAME_TYPE_PLUS_VID",
+static const char *const vlan_name_type_str[VLAN_NAME_TYPE_HIGHEST] = {
+ [VLAN_NAME_TYPE_RAW_PLUS_VID] = "VLAN_NAME_TYPE_RAW_PLUS_VID",
+ [VLAN_NAME_TYPE_PLUS_VID_NO_PAD] = "VLAN_NAME_TYPE_PLUS_VID_NO_PAD",
+ [VLAN_NAME_TYPE_RAW_PLUS_VID_NO_PAD] = "VLAN_NAME_TYPE_RAW_PLUS_VID_NO_PAD",
+ [VLAN_NAME_TYPE_PLUS_VID] = "VLAN_NAME_TYPE_PLUS_VID",
};
/*
* Interface functions
@@ -137,13 +89,15 @@ static const char *vlan_name_type_str[VLAN_NAME_TYPE_HIGHEST] = {
* Clean up /proc/net/vlan entries
*/
-void vlan_proc_cleanup(void)
+void vlan_proc_cleanup(struct net *net)
{
- if (proc_vlan_conf)
- remove_proc_entry(name_conf, proc_vlan_dir);
+ struct vlan_net *vn = net_generic(net, vlan_net_id);
+
+ if (vn->proc_vlan_conf)
+ remove_proc_entry(name_conf, vn->proc_vlan_dir);
- if (proc_vlan_dir)
- proc_net_remove(name_root);
+ if (vn->proc_vlan_dir)
+ remove_proc_entry(name_root, net->proc_net);
/* Dynamically added entries should be cleaned up as their vlan_device
* is removed, so we should not have to take care of it here...
@@ -154,19 +108,24 @@ void vlan_proc_cleanup(void)
* Create /proc/net/vlan entries
*/
-int __init vlan_proc_init(void)
+int __net_init vlan_proc_init(struct net *net)
{
- proc_vlan_dir = proc_mkdir(name_root, proc_net);
- if (proc_vlan_dir) {
- proc_vlan_conf = create_proc_entry(name_conf,
- S_IFREG|S_IRUSR|S_IWUSR,
- proc_vlan_dir);
- if (proc_vlan_conf) {
- proc_vlan_conf->proc_fops = &vlan_fops;
- return 0;
- }
- }
- vlan_proc_cleanup();
+ struct vlan_net *vn = net_generic(net, vlan_net_id);
+
+ vn->proc_vlan_dir = proc_net_mkdir(net, name_root, net->proc_net);
+ if (!vn->proc_vlan_dir)
+ goto err;
+
+ vn->proc_vlan_conf = proc_create_net(name_conf, S_IFREG | 0600,
+ vn->proc_vlan_dir, &vlan_seq_ops,
+ sizeof(struct seq_net_private));
+ if (!vn->proc_vlan_conf)
+ goto err;
+ return 0;
+
+err:
+ pr_err("can't create entry in proc filesystem!\n");
+ vlan_proc_cleanup(net);
return -ENOBUFS;
}
@@ -174,61 +133,28 @@ int __init vlan_proc_init(void)
* Add directory entry for VLAN device.
*/
-int vlan_proc_add_dev (struct net_device *vlandev)
+int vlan_proc_add_dev(struct net_device *vlandev)
{
- struct vlan_dev_info *dev_info = VLAN_DEV_INFO(vlandev);
+ struct vlan_dev_priv *vlan = vlan_dev_priv(vlandev);
+ struct vlan_net *vn = net_generic(dev_net(vlandev), vlan_net_id);
- if (!(vlandev->priv_flags & IFF_802_1Q_VLAN)) {
- printk(KERN_ERR
- "ERROR: vlan_proc_add, device -:%s:- is NOT a VLAN\n",
- vlandev->name);
+ if (!strcmp(vlandev->name, name_conf))
return -EINVAL;
- }
-
- dev_info->dent = create_proc_entry(vlandev->name,
- S_IFREG|S_IRUSR|S_IWUSR,
- proc_vlan_dir);
- if (!dev_info->dent)
+ vlan->dent = proc_create_single_data(vlandev->name, S_IFREG | 0600,
+ vn->proc_vlan_dir, vlandev_seq_show, vlandev);
+ if (!vlan->dent)
return -ENOBUFS;
-
- dev_info->dent->proc_fops = &vlandev_fops;
- dev_info->dent->data = vlandev;
-
-#ifdef VLAN_DEBUG
- printk(KERN_ERR "vlan_proc_add, device -:%s:- being added.\n",
- vlandev->name);
-#endif
return 0;
}
/*
* Delete directory entry for VLAN device.
*/
-int vlan_proc_rem_dev(struct net_device *vlandev)
+void vlan_proc_rem_dev(struct net_device *vlandev)
{
- if (!vlandev) {
- printk(VLAN_ERR "%s: invalid argument: %p\n",
- __FUNCTION__, vlandev);
- return -EINVAL;
- }
-
- if (!(vlandev->priv_flags & IFF_802_1Q_VLAN)) {
- printk(VLAN_DBG "%s: invalid argument, device: %s is not a VLAN device, priv_flags: 0x%4hX.\n",
- __FUNCTION__, vlandev->name, vlandev->priv_flags);
- return -EINVAL;
- }
-
-#ifdef VLAN_DEBUG
- printk(VLAN_DBG "%s: dev: %p\n", __FUNCTION__, vlandev);
-#endif
-
/** NOTE: This will consume the memory pointed to by dent, it seems. */
- if (VLAN_DEV_INFO(vlandev)->dent) {
- remove_proc_entry(VLAN_DEV_INFO(vlandev)->dent->name, proc_vlan_dir);
- VLAN_DEV_INFO(vlandev)->dent = NULL;
- }
-
- return 0;
+ proc_remove(vlan_dev_priv(vlandev)->dent);
+ vlan_dev_priv(vlandev)->dent = NULL;
}
/****** Proc filesystem entry points ****************************************/
@@ -237,64 +163,63 @@ int vlan_proc_rem_dev(struct net_device *vlandev)
* The following few functions build the content of /proc/net/vlan/config
*/
-/* starting at dev, find a VLAN device */
-static struct net_device *vlan_skip(struct net_device *dev)
+static void *vlan_seq_from_index(struct seq_file *seq, loff_t *pos)
{
- while (dev && !(dev->priv_flags & IFF_802_1Q_VLAN))
- dev = dev->next;
+ unsigned long ifindex = *pos;
+ struct net_device *dev;
- return dev;
+ for_each_netdev_dump(seq_file_net(seq), dev, ifindex) {
+ if (!is_vlan_dev(dev))
+ continue;
+ *pos = dev->ifindex;
+ return dev;
+ }
+ return NULL;
}
-/* start read of /proc/net/vlan/config */
static void *vlan_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(rcu)
{
- struct net_device *dev;
- loff_t i = 1;
-
- read_lock(&dev_base_lock);
-
+ rcu_read_lock();
if (*pos == 0)
return SEQ_START_TOKEN;
-
- for (dev = vlan_skip(dev_base); dev && i < *pos;
- dev = vlan_skip(dev->next), ++i);
-
- return (i == *pos) ? dev : NULL;
-}
+
+ return vlan_seq_from_index(seq, pos);
+}
static void *vlan_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
++*pos;
-
- return vlan_skip((v == SEQ_START_TOKEN)
- ? dev_base
- : ((struct net_device *)v)->next);
+ return vlan_seq_from_index(seq, pos);
}
static void vlan_seq_stop(struct seq_file *seq, void *v)
+ __releases(rcu)
{
- read_unlock(&dev_base_lock);
+ rcu_read_unlock();
}
static int vlan_seq_show(struct seq_file *seq, void *v)
{
+ struct net *net = seq_file_net(seq);
+ struct vlan_net *vn = net_generic(net, vlan_net_id);
+
if (v == SEQ_START_TOKEN) {
const char *nmtype = NULL;
seq_puts(seq, "VLAN Dev name | VLAN ID\n");
- if (vlan_name_type < ARRAY_SIZE(vlan_name_type_str))
- nmtype = vlan_name_type_str[vlan_name_type];
+ if (vn->name_type < ARRAY_SIZE(vlan_name_type_str))
+ nmtype = vlan_name_type_str[vn->name_type];
- seq_printf(seq, "Name-Type: %s\n",
- nmtype ? nmtype : "UNKNOWN" );
+ seq_printf(seq, "Name-Type: %s\n",
+ nmtype ? nmtype : "UNKNOWN");
} else {
const struct net_device *vlandev = v;
- const struct vlan_dev_info *dev_info = VLAN_DEV_INFO(vlandev);
+ const struct vlan_dev_priv *vlan = vlan_dev_priv(vlandev);
- seq_printf(seq, "%-15s| %d | %s\n", vlandev->name,
- dev_info->vlan_id, dev_info->real_dev->name);
+ seq_printf(seq, "%-15s| %d | %s\n", vlandev->name,
+ vlan->vlan_id, vlan->real_dev->name);
}
return 0;
}
@@ -302,50 +227,46 @@ static int vlan_seq_show(struct seq_file *seq, void *v)
static int vlandev_seq_show(struct seq_file *seq, void *offset)
{
struct net_device *vlandev = (struct net_device *) seq->private;
- const struct vlan_dev_info *dev_info = VLAN_DEV_INFO(vlandev);
- struct net_device_stats *stats;
- static const char fmt[] = "%30s %12lu\n";
+ const struct vlan_dev_priv *vlan = vlan_dev_priv(vlandev);
+ struct rtnl_link_stats64 temp;
+ const struct rtnl_link_stats64 *stats;
+ static const char fmt64[] = "%30s %12llu\n";
int i;
- if ((vlandev == NULL) || (!(vlandev->priv_flags & IFF_802_1Q_VLAN)))
+ if (!is_vlan_dev(vlandev))
return 0;
- seq_printf(seq, "%s VID: %d REORDER_HDR: %i dev->priv_flags: %hx\n",
- vlandev->name, dev_info->vlan_id,
- (int)(dev_info->flags & 1), vlandev->priv_flags);
-
-
- stats = vlan_dev_get_stats(vlandev);
+ stats = dev_get_stats(vlandev, &temp);
+ seq_printf(seq,
+ "%s VID: %d REORDER_HDR: %i dev->priv_flags: %x\n",
+ vlandev->name, vlan->vlan_id,
+ (int)(vlan->flags & 1), (u32)vlandev->priv_flags);
- seq_printf(seq, fmt, "total frames received", stats->rx_packets);
- seq_printf(seq, fmt, "total bytes received", stats->rx_bytes);
- seq_printf(seq, fmt, "Broadcast/Multicast Rcvd", stats->multicast);
+ seq_printf(seq, fmt64, "total frames received", stats->rx_packets);
+ seq_printf(seq, fmt64, "total bytes received", stats->rx_bytes);
+ seq_printf(seq, fmt64, "Broadcast/Multicast Rcvd", stats->multicast);
seq_puts(seq, "\n");
- seq_printf(seq, fmt, "total frames transmitted", stats->tx_packets);
- seq_printf(seq, fmt, "total bytes transmitted", stats->tx_bytes);
- seq_printf(seq, fmt, "total headroom inc",
- dev_info->cnt_inc_headroom_on_tx);
- seq_printf(seq, fmt, "total encap on xmit",
- dev_info->cnt_encap_on_xmit);
- seq_printf(seq, "Device: %s", dev_info->real_dev->name);
+ seq_printf(seq, fmt64, "total frames transmitted", stats->tx_packets);
+ seq_printf(seq, fmt64, "total bytes transmitted", stats->tx_bytes);
+ seq_printf(seq, "Device: %s", vlan->real_dev->name);
/* now show all PRIORITY mappings relating to this VLAN */
- seq_printf(seq,
- "\nINGRESS priority mappings: 0:%lu 1:%lu 2:%lu 3:%lu 4:%lu 5:%lu 6:%lu 7:%lu\n",
- dev_info->ingress_priority_map[0],
- dev_info->ingress_priority_map[1],
- dev_info->ingress_priority_map[2],
- dev_info->ingress_priority_map[3],
- dev_info->ingress_priority_map[4],
- dev_info->ingress_priority_map[5],
- dev_info->ingress_priority_map[6],
- dev_info->ingress_priority_map[7]);
-
- seq_printf(seq, "EGRESSS priority Mappings: ");
+ seq_printf(seq, "\nINGRESS priority mappings: "
+ "0:%u 1:%u 2:%u 3:%u 4:%u 5:%u 6:%u 7:%u\n",
+ vlan->ingress_priority_map[0],
+ vlan->ingress_priority_map[1],
+ vlan->ingress_priority_map[2],
+ vlan->ingress_priority_map[3],
+ vlan->ingress_priority_map[4],
+ vlan->ingress_priority_map[5],
+ vlan->ingress_priority_map[6],
+ vlan->ingress_priority_map[7]);
+
+ seq_printf(seq, " EGRESS priority mappings: ");
for (i = 0; i < 16; i++) {
const struct vlan_priority_tci_mapping *mp
- = dev_info->egress_priority_map[i];
+ = vlan->egress_priority_map[i];
while (mp) {
- seq_printf(seq, "%lu:%hu ",
+ seq_printf(seq, "%u:%d ",
mp->priority, ((mp->vlan_qos >> 13) & 0x7));
mp = mp->next;
}
diff --git a/net/8021q/vlanproc.h b/net/8021q/vlanproc.h
index f908ee332fd8..48cd4b4784e8 100644
--- a/net/8021q/vlanproc.h
+++ b/net/8021q/vlanproc.h
@@ -1,19 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __BEN_VLAN_PROC_INC__
#define __BEN_VLAN_PROC_INC__
#ifdef CONFIG_PROC_FS
-int vlan_proc_init(void);
-int vlan_proc_rem_dev(struct net_device *vlandev);
-int vlan_proc_add_dev (struct net_device *vlandev);
-void vlan_proc_cleanup (void);
+struct net;
-#else /* No CONFIG_PROC_FS */
+int vlan_proc_init(struct net *net);
+void vlan_proc_rem_dev(struct net_device *vlandev);
+int vlan_proc_add_dev(struct net_device *vlandev);
+void vlan_proc_cleanup(struct net *net);
-#define vlan_proc_init() (0)
-#define vlan_proc_cleanup() do {} while(0)
-#define vlan_proc_add_dev(dev) ({(void)(dev), 0;})
-#define vlan_proc_rem_dev(dev) ({(void)(dev), 0;})
+#else /* No CONFIG_PROC_FS */
+#define vlan_proc_init(net) (0)
+#define vlan_proc_cleanup(net) do {} while (0)
+#define vlan_proc_add_dev(dev) ({(void)(dev), 0; })
+#define vlan_proc_rem_dev(dev) do {} while (0)
#endif
#endif /* !(__BEN_VLAN_PROC_INC__) */
diff --git a/net/9p/Kconfig b/net/9p/Kconfig
new file mode 100644
index 000000000000..22f8c167845d
--- /dev/null
+++ b/net/9p/Kconfig
@@ -0,0 +1,63 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# 9P protocol configuration
+#
+
+menuconfig NET_9P
+ tristate "Plan 9 Resource Sharing Support (9P2000)"
+ select NETFS_SUPPORT
+ help
+ If you say Y here, you will get experimental support for
+ Plan 9 resource sharing via the 9P2000 protocol.
+
+ See <http://v9fs.sf.net> for more information.
+
+ If unsure, say N.
+
+if NET_9P
+
+config NET_9P_FD
+ default NET_9P
+ imply INET
+ imply UNIX
+ tristate "9P FD Transport"
+ help
+ This builds support for transports over TCP, Unix sockets and
+ filedescriptors.
+
+config NET_9P_VIRTIO
+ depends on VIRTIO
+ tristate "9P Virtio Transport"
+ help
+ This builds support for a transports between
+ guest partitions and a host partition.
+
+config NET_9P_XEN
+ depends on XEN
+ select XEN_XENBUS_FRONTEND
+ tristate "9P Xen Transport"
+ help
+ This builds support for a transport for 9pfs between
+ two Xen domains.
+
+config NET_9P_USBG
+ tristate "9P USB Gadget Transport"
+ depends on USB_GADGET
+ select CONFIGFS_FS
+ select USB_LIBCOMPOSITE
+ help
+ This builds support for a transport for 9pfs over
+ usb gadget.
+
+config NET_9P_RDMA
+ depends on INET && INFINIBAND && INFINIBAND_ADDR_TRANS
+ tristate "9P RDMA Transport (Experimental)"
+ help
+ This builds support for an RDMA transport.
+
+config NET_9P_DEBUG
+ bool "Debug information"
+ help
+ Say Y if you want the 9P subsystem to log debug information.
+
+endif
diff --git a/net/9p/Makefile b/net/9p/Makefile
new file mode 100644
index 000000000000..22794a451c3f
--- /dev/null
+++ b/net/9p/Makefile
@@ -0,0 +1,29 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_NET_9P) := 9pnet.o
+obj-$(CONFIG_NET_9P_FD) += 9pnet_fd.o
+obj-$(CONFIG_NET_9P_XEN) += 9pnet_xen.o
+obj-$(CONFIG_NET_9P_VIRTIO) += 9pnet_virtio.o
+obj-$(CONFIG_NET_9P_RDMA) += 9pnet_rdma.o
+obj-$(CONFIG_NET_9P_USBG) += 9pnet_usbg.o
+
+9pnet-objs := \
+ mod.o \
+ client.o \
+ error.o \
+ protocol.o \
+ trans_common.o \
+
+9pnet_fd-objs := \
+ trans_fd.o \
+
+9pnet_virtio-objs := \
+ trans_virtio.o \
+
+9pnet_xen-objs := \
+ trans_xen.o \
+
+9pnet_rdma-objs := \
+ trans_rdma.o \
+
+9pnet_usbg-objs := \
+ trans_usbg.o \
diff --git a/net/9p/client.c b/net/9p/client.c
new file mode 100644
index 000000000000..f60d1d041adb
--- /dev/null
+++ b/net/9p/client.c
@@ -0,0 +1,2209 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * 9P Client
+ *
+ * Copyright (C) 2008 by Eric Van Hensbergen <ericvh@gmail.com>
+ * Copyright (C) 2007 by Latchesar Ionkov <lucho@ionkov.net>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/poll.h>
+#include <linux/idr.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/sched/signal.h>
+#include <linux/uaccess.h>
+#include <linux/uio.h>
+#include <linux/netfs.h>
+#include <net/9p/9p.h>
+#include <linux/seq_file.h>
+#include <linux/fs_context.h>
+#include <net/9p/client.h>
+#include <net/9p/transport.h>
+#include "protocol.h"
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/9p.h>
+
+/* Client Option Parsing (code inspired by NFS code)
+ * - a little lazy - parse all client options
+ */
+
+inline int p9_is_proto_dotl(struct p9_client *clnt)
+{
+ return clnt->proto_version == p9_proto_2000L;
+}
+EXPORT_SYMBOL(p9_is_proto_dotl);
+
+inline int p9_is_proto_dotu(struct p9_client *clnt)
+{
+ return clnt->proto_version == p9_proto_2000u;
+}
+EXPORT_SYMBOL(p9_is_proto_dotu);
+
+int p9_show_client_options(struct seq_file *m, struct p9_client *clnt)
+{
+ if (clnt->msize != DEFAULT_MSIZE)
+ seq_printf(m, ",msize=%u", clnt->msize);
+ seq_printf(m, ",trans=%s", clnt->trans_mod->name);
+
+ switch (clnt->proto_version) {
+ case p9_proto_legacy:
+ seq_puts(m, ",noextend");
+ break;
+ case p9_proto_2000u:
+ seq_puts(m, ",version=9p2000.u");
+ break;
+ case p9_proto_2000L:
+ /* Default */
+ break;
+ }
+
+ if (clnt->trans_mod->show_options)
+ return clnt->trans_mod->show_options(m, clnt);
+ return 0;
+}
+EXPORT_SYMBOL(p9_show_client_options);
+
+/* Some error codes are taken directly from the server replies,
+ * make sure they are valid.
+ */
+static int safe_errno(int err)
+{
+ if (err > 0 || err < -MAX_ERRNO) {
+ p9_debug(P9_DEBUG_ERROR, "Invalid error code %d\n", err);
+ return -EPROTO;
+ }
+ return err;
+}
+
+static int apply_client_options(struct p9_client *clnt, struct fs_context *fc)
+{
+ struct v9fs_context *ctx = fc->fs_private;
+
+ clnt->msize = ctx->client_opts.msize;
+ clnt->trans_mod = ctx->client_opts.trans_mod;
+ ctx->client_opts.trans_mod = NULL;
+ clnt->proto_version = ctx->client_opts.proto_version;
+
+ return 0;
+}
+
+static int p9_fcall_init(struct p9_client *c, struct p9_fcall *fc,
+ int alloc_msize)
+{
+ if (likely(c->fcall_cache) && alloc_msize == c->msize) {
+ fc->sdata = kmem_cache_alloc(c->fcall_cache, GFP_NOFS);
+ fc->cache = c->fcall_cache;
+ if (!fc->sdata && c->trans_mod->supports_vmalloc) {
+ fc->sdata = kvmalloc(alloc_msize, GFP_NOFS);
+ fc->cache = NULL;
+ }
+ } else {
+ if (c->trans_mod->supports_vmalloc)
+ fc->sdata = kvmalloc(alloc_msize, GFP_NOFS);
+ else
+ fc->sdata = kmalloc(alloc_msize, GFP_NOFS);
+ fc->cache = NULL;
+ }
+ if (!fc->sdata)
+ return -ENOMEM;
+ fc->capacity = alloc_msize;
+ fc->id = 0;
+ fc->tag = P9_NOTAG;
+ return 0;
+}
+
+void p9_fcall_fini(struct p9_fcall *fc)
+{
+ /* sdata can be NULL for interrupted requests in trans_rdma,
+ * and kmem_cache_free does not do NULL-check for us
+ */
+ if (unlikely(!fc->sdata))
+ return;
+
+ if (fc->cache)
+ kmem_cache_free(fc->cache, fc->sdata);
+ else
+ kvfree(fc->sdata);
+}
+EXPORT_SYMBOL(p9_fcall_fini);
+
+static struct kmem_cache *p9_req_cache;
+
+/**
+ * p9_tag_alloc - Allocate a new request.
+ * @c: Client session.
+ * @type: Transaction type.
+ * @t_size: Buffer size for holding this request
+ * (automatic calculation by format template if 0).
+ * @r_size: Buffer size for holding server's reply on this request
+ * (automatic calculation by format template if 0).
+ * @fmt: Format template for assembling 9p request message
+ * (see p9pdu_vwritef).
+ * @ap: Variable arguments to be fed to passed format template
+ * (see p9pdu_vwritef).
+ *
+ * Context: Process context.
+ * Return: Pointer to new request.
+ */
+static struct p9_req_t *
+p9_tag_alloc(struct p9_client *c, int8_t type, uint t_size, uint r_size,
+ const char *fmt, va_list ap)
+{
+ struct p9_req_t *req = kmem_cache_alloc(p9_req_cache, GFP_NOFS);
+ int alloc_tsize;
+ int alloc_rsize;
+ int tag;
+ va_list apc;
+
+ va_copy(apc, ap);
+ alloc_tsize = min_t(size_t, c->msize,
+ t_size ?: p9_msg_buf_size(c, type, fmt, apc));
+ va_end(apc);
+
+ alloc_rsize = min_t(size_t, c->msize,
+ r_size ?: p9_msg_buf_size(c, type + 1, fmt, ap));
+
+ if (!req)
+ return ERR_PTR(-ENOMEM);
+
+ if (p9_fcall_init(c, &req->tc, alloc_tsize))
+ goto free_req;
+ if (p9_fcall_init(c, &req->rc, alloc_rsize))
+ goto free;
+
+ p9pdu_reset(&req->tc);
+ p9pdu_reset(&req->rc);
+ req->t_err = 0;
+ req->status = REQ_STATUS_ALLOC;
+ /* refcount needs to be set to 0 before inserting into the idr
+ * so p9_tag_lookup does not accept a request that is not fully
+ * initialized. refcount_set to 2 below will mark request ready.
+ */
+ refcount_set(&req->refcount, 0);
+ init_waitqueue_head(&req->wq);
+ INIT_LIST_HEAD(&req->req_list);
+
+ idr_preload(GFP_NOFS);
+ spin_lock_irq(&c->lock);
+ if (type == P9_TVERSION)
+ tag = idr_alloc(&c->reqs, req, P9_NOTAG, P9_NOTAG + 1,
+ GFP_NOWAIT);
+ else
+ tag = idr_alloc(&c->reqs, req, 0, P9_NOTAG, GFP_NOWAIT);
+ req->tc.tag = tag;
+ spin_unlock_irq(&c->lock);
+ idr_preload_end();
+ if (tag < 0)
+ goto free;
+
+ /* Init ref to two because in the general case there is one ref
+ * that is put asynchronously by a writer thread, one ref
+ * temporarily given by p9_tag_lookup and put by p9_client_cb
+ * in the recv thread, and one ref put by p9_req_put in the
+ * main thread. The only exception is virtio that does not use
+ * p9_tag_lookup but does not have a writer thread either
+ * (the write happens synchronously in the request/zc_request
+ * callback), so p9_client_cb eats the second ref there
+ * as the pointer is duplicated directly by virtqueue_add_sgs()
+ */
+ refcount_set(&req->refcount, 2);
+
+ return req;
+
+free:
+ p9_fcall_fini(&req->tc);
+ p9_fcall_fini(&req->rc);
+free_req:
+ kmem_cache_free(p9_req_cache, req);
+ return ERR_PTR(-ENOMEM);
+}
+
+/**
+ * p9_tag_lookup - Look up a request by tag.
+ * @c: Client session.
+ * @tag: Transaction ID.
+ *
+ * Context: Any context.
+ * Return: A request, or %NULL if there is no request with that tag.
+ */
+struct p9_req_t *p9_tag_lookup(struct p9_client *c, u16 tag)
+{
+ struct p9_req_t *req;
+
+ rcu_read_lock();
+again:
+ req = idr_find(&c->reqs, tag);
+ if (req) {
+ /* We have to be careful with the req found under rcu_read_lock
+ * Thanks to SLAB_TYPESAFE_BY_RCU we can safely try to get the
+ * ref again without corrupting other data, then check again
+ * that the tag matches once we have the ref
+ */
+ if (!p9_req_try_get(req))
+ goto again;
+ if (req->tc.tag != tag) {
+ p9_req_put(c, req);
+ goto again;
+ }
+ }
+ rcu_read_unlock();
+
+ return req;
+}
+EXPORT_SYMBOL(p9_tag_lookup);
+
+/**
+ * p9_tag_remove - Remove a tag.
+ * @c: Client session.
+ * @r: Request of reference.
+ *
+ * Context: Any context.
+ */
+static void p9_tag_remove(struct p9_client *c, struct p9_req_t *r)
+{
+ unsigned long flags;
+ u16 tag = r->tc.tag;
+
+ p9_debug(P9_DEBUG_MUX, "freeing clnt %p req %p tag: %d\n", c, r, tag);
+ spin_lock_irqsave(&c->lock, flags);
+ idr_remove(&c->reqs, tag);
+ spin_unlock_irqrestore(&c->lock, flags);
+}
+
+int p9_req_put(struct p9_client *c, struct p9_req_t *r)
+{
+ if (refcount_dec_and_test(&r->refcount)) {
+ p9_tag_remove(c, r);
+
+ p9_fcall_fini(&r->tc);
+ p9_fcall_fini(&r->rc);
+ kmem_cache_free(p9_req_cache, r);
+ return 1;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(p9_req_put);
+
+/**
+ * p9_tag_cleanup - cleans up tags structure and reclaims resources
+ * @c: v9fs client struct
+ *
+ * This frees resources associated with the tags structure
+ *
+ */
+static void p9_tag_cleanup(struct p9_client *c)
+{
+ struct p9_req_t *req;
+ int id;
+
+ rcu_read_lock();
+ idr_for_each_entry(&c->reqs, req, id) {
+ pr_info("Tag %d still in use\n", id);
+ if (p9_req_put(c, req) == 0)
+ pr_warn("Packet with tag %d has still references",
+ req->tc.tag);
+ }
+ rcu_read_unlock();
+}
+
+/**
+ * p9_client_cb - call back from transport to client
+ * @c: client state
+ * @req: request received
+ * @status: request status, one of REQ_STATUS_*
+ *
+ */
+void p9_client_cb(struct p9_client *c, struct p9_req_t *req, int status)
+{
+ p9_debug(P9_DEBUG_MUX, " tag %d\n", req->tc.tag);
+
+ /* This barrier is needed to make sure any change made to req before
+ * the status change is visible to another thread
+ */
+ smp_wmb();
+ WRITE_ONCE(req->status, status);
+
+ wake_up(&req->wq);
+ p9_debug(P9_DEBUG_MUX, "wakeup: %d\n", req->tc.tag);
+ p9_req_put(c, req);
+}
+EXPORT_SYMBOL(p9_client_cb);
+
+/**
+ * p9_parse_header - parse header arguments out of a packet
+ * @pdu: packet to parse
+ * @size: size of packet
+ * @type: type of request
+ * @tag: tag of packet
+ * @rewind: set if we need to rewind offset afterwards
+ */
+
+int
+p9_parse_header(struct p9_fcall *pdu, int32_t *size, int8_t *type,
+ int16_t *tag, int rewind)
+{
+ s8 r_type;
+ s16 r_tag;
+ s32 r_size;
+ int offset = pdu->offset;
+ int err;
+
+ pdu->offset = 0;
+
+ err = p9pdu_readf(pdu, 0, "dbw", &r_size, &r_type, &r_tag);
+ if (err)
+ goto rewind_and_exit;
+
+ if (type)
+ *type = r_type;
+ if (tag)
+ *tag = r_tag;
+ if (size)
+ *size = r_size;
+
+ if (pdu->size != r_size || r_size < 7) {
+ err = -EINVAL;
+ goto rewind_and_exit;
+ }
+
+ pdu->id = r_type;
+ pdu->tag = r_tag;
+
+ p9_debug(P9_DEBUG_9P, "<<< size=%d type: %d tag: %d\n",
+ pdu->size, pdu->id, pdu->tag);
+
+rewind_and_exit:
+ if (rewind)
+ pdu->offset = offset;
+ return err;
+}
+EXPORT_SYMBOL(p9_parse_header);
+
+/**
+ * p9_check_errors - check 9p packet for error return and process it
+ * @c: current client instance
+ * @req: request to parse and check for error conditions
+ *
+ * returns error code if one is discovered, otherwise returns 0
+ *
+ * this will have to be more complicated if we have multiple
+ * error packet types
+ */
+
+static int p9_check_errors(struct p9_client *c, struct p9_req_t *req)
+{
+ s8 type;
+ int err;
+ int ecode;
+
+ err = p9_parse_header(&req->rc, NULL, &type, NULL, 0);
+ if (req->rc.size > req->rc.capacity && !req->rc.zc) {
+ pr_err("requested packet size too big: %d does not fit %zu (type=%d)\n",
+ req->rc.size, req->rc.capacity, req->rc.id);
+ return -EIO;
+ }
+ /* dump the response from server
+ * This should be after check errors which poplulate pdu_fcall.
+ */
+ trace_9p_protocol_dump(c, &req->rc);
+ if (err) {
+ p9_debug(P9_DEBUG_ERROR, "couldn't parse header %d\n", err);
+ return err;
+ }
+ if (type != P9_RERROR && type != P9_RLERROR)
+ return 0;
+
+ if (!p9_is_proto_dotl(c)) {
+ char *ename = NULL;
+
+ err = p9pdu_readf(&req->rc, c->proto_version, "s?d",
+ &ename, &ecode);
+ if (err) {
+ kfree(ename);
+ goto out_err;
+ }
+
+ if (p9_is_proto_dotu(c) && ecode < 512)
+ err = -ecode;
+
+ if (!err) {
+ err = p9_errstr2errno(ename, strlen(ename));
+
+ p9_debug(P9_DEBUG_9P, "<<< RERROR (%d) %s\n",
+ -ecode, ename);
+ }
+ kfree(ename);
+ } else {
+ err = p9pdu_readf(&req->rc, c->proto_version, "d", &ecode);
+ if (err)
+ goto out_err;
+ err = -ecode;
+
+ p9_debug(P9_DEBUG_9P, "<<< RLERROR (%d)\n", -ecode);
+ }
+
+ return err;
+
+out_err:
+ p9_debug(P9_DEBUG_ERROR, "couldn't parse error%d\n", err);
+
+ return err;
+}
+
+static struct p9_req_t *
+p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...);
+
+/**
+ * p9_client_flush - flush (cancel) a request
+ * @c: client state
+ * @oldreq: request to cancel
+ *
+ * This sents a flush for a particular request and links
+ * the flush request to the original request. The current
+ * code only supports a single flush request although the protocol
+ * allows for multiple flush requests to be sent for a single request.
+ *
+ */
+
+static int p9_client_flush(struct p9_client *c, struct p9_req_t *oldreq)
+{
+ struct p9_req_t *req;
+ s16 oldtag;
+ int err;
+
+ err = p9_parse_header(&oldreq->tc, NULL, NULL, &oldtag, 1);
+ if (err)
+ return err;
+
+ p9_debug(P9_DEBUG_9P, ">>> TFLUSH tag %d\n", oldtag);
+
+ req = p9_client_rpc(c, P9_TFLUSH, "w", oldtag);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ /* if we haven't received a response for oldreq,
+ * remove it from the list
+ */
+ if (READ_ONCE(oldreq->status) == REQ_STATUS_SENT) {
+ if (c->trans_mod->cancelled)
+ c->trans_mod->cancelled(c, oldreq);
+ }
+
+ p9_req_put(c, req);
+ return 0;
+}
+
+static struct p9_req_t *p9_client_prepare_req(struct p9_client *c,
+ int8_t type, uint t_size, uint r_size,
+ const char *fmt, va_list ap)
+{
+ int err;
+ struct p9_req_t *req;
+ va_list apc;
+
+ p9_debug(P9_DEBUG_MUX, "client %p op %d\n", c, type);
+
+ /* we allow for any status other than disconnected */
+ if (c->status == Disconnected)
+ return ERR_PTR(-EIO);
+
+ /* if status is begin_disconnected we allow only clunk request */
+ if (c->status == BeginDisconnect && type != P9_TCLUNK)
+ return ERR_PTR(-EIO);
+
+ va_copy(apc, ap);
+ req = p9_tag_alloc(c, type, t_size, r_size, fmt, apc);
+ va_end(apc);
+ if (IS_ERR(req))
+ return req;
+
+ /* marshall the data */
+ p9pdu_prepare(&req->tc, req->tc.tag, type);
+ err = p9pdu_vwritef(&req->tc, c->proto_version, fmt, ap);
+ if (err)
+ goto reterr;
+ p9pdu_finalize(c, &req->tc);
+ trace_9p_client_req(c, type, req->tc.tag);
+ return req;
+reterr:
+ p9_req_put(c, req);
+ /* We have to put also the 2nd reference as it won't be used */
+ p9_req_put(c, req);
+ return ERR_PTR(err);
+}
+
+/**
+ * p9_client_rpc - issue a request and wait for a response
+ * @c: client session
+ * @type: type of request
+ * @fmt: protocol format string (see protocol.c)
+ *
+ * Returns request structure (which client must free using p9_req_put)
+ */
+
+static struct p9_req_t *
+p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...)
+{
+ va_list ap;
+ int sigpending, err;
+ unsigned long flags;
+ struct p9_req_t *req;
+ /* Passing zero for tsize/rsize to p9_client_prepare_req() tells it to
+ * auto determine an appropriate (small) request/response size
+ * according to actual message data being sent. Currently RDMA
+ * transport is excluded from this response message size optimization,
+ * as it would not cope with it, due to its pooled response buffers
+ * (using an optimized request size for RDMA as well though).
+ */
+ const uint tsize = 0;
+ const uint rsize = c->trans_mod->pooled_rbuffers ? c->msize : 0;
+
+ va_start(ap, fmt);
+ req = p9_client_prepare_req(c, type, tsize, rsize, fmt, ap);
+ va_end(ap);
+ if (IS_ERR(req))
+ return req;
+
+ req->tc.zc = false;
+ req->rc.zc = false;
+
+ if (signal_pending(current)) {
+ sigpending = 1;
+ clear_thread_flag(TIF_SIGPENDING);
+ } else {
+ sigpending = 0;
+ }
+
+ err = c->trans_mod->request(c, req);
+ if (err < 0) {
+ /* write won't happen */
+ p9_req_put(c, req);
+ if (err != -ERESTARTSYS && err != -EFAULT)
+ c->status = Disconnected;
+ goto recalc_sigpending;
+ }
+again:
+ /* Wait for the response */
+ err = wait_event_killable(req->wq,
+ READ_ONCE(req->status) >= REQ_STATUS_RCVD);
+
+ /* Make sure our req is coherent with regard to updates in other
+ * threads - echoes to wmb() in the callback
+ */
+ smp_rmb();
+
+ if (err == -ERESTARTSYS && c->status == Connected &&
+ type == P9_TFLUSH) {
+ sigpending = 1;
+ clear_thread_flag(TIF_SIGPENDING);
+ goto again;
+ }
+
+ if (READ_ONCE(req->status) == REQ_STATUS_ERROR) {
+ p9_debug(P9_DEBUG_ERROR, "req_status error %d\n", req->t_err);
+ err = req->t_err;
+ }
+ if (err == -ERESTARTSYS && c->status == Connected) {
+ p9_debug(P9_DEBUG_MUX, "flushing\n");
+ sigpending = 1;
+ clear_thread_flag(TIF_SIGPENDING);
+
+ if (c->trans_mod->cancel(c, req))
+ p9_client_flush(c, req);
+
+ /* if we received the response anyway, don't signal error */
+ if (READ_ONCE(req->status) == REQ_STATUS_RCVD)
+ err = 0;
+ }
+recalc_sigpending:
+ if (sigpending) {
+ spin_lock_irqsave(&current->sighand->siglock, flags);
+ recalc_sigpending();
+ spin_unlock_irqrestore(&current->sighand->siglock, flags);
+ }
+ if (err < 0)
+ goto reterr;
+
+ err = p9_check_errors(c, req);
+ trace_9p_client_res(c, type, req->rc.tag, err);
+ if (!err)
+ return req;
+reterr:
+ p9_req_put(c, req);
+ return ERR_PTR(safe_errno(err));
+}
+
+/**
+ * p9_client_zc_rpc - issue a request and wait for a response
+ * @c: client session
+ * @type: type of request
+ * @uidata: destination for zero copy read
+ * @uodata: source for zero copy write
+ * @inlen: read buffer size
+ * @olen: write buffer size
+ * @in_hdrlen: reader header size, This is the size of response protocol data
+ * @fmt: protocol format string (see protocol.c)
+ *
+ * Returns request structure (which client must free using p9_req_put)
+ */
+static struct p9_req_t *p9_client_zc_rpc(struct p9_client *c, int8_t type,
+ struct iov_iter *uidata,
+ struct iov_iter *uodata,
+ int inlen, int olen, int in_hdrlen,
+ const char *fmt, ...)
+{
+ va_list ap;
+ int sigpending, err;
+ unsigned long flags;
+ struct p9_req_t *req;
+
+ va_start(ap, fmt);
+ /* We allocate a inline protocol data of only 4k bytes.
+ * The actual content is passed in zero-copy fashion.
+ */
+ req = p9_client_prepare_req(c, type, P9_ZC_HDR_SZ, P9_ZC_HDR_SZ, fmt, ap);
+ va_end(ap);
+ if (IS_ERR(req))
+ return req;
+
+ req->tc.zc = true;
+ req->rc.zc = true;
+
+ if (signal_pending(current)) {
+ sigpending = 1;
+ clear_thread_flag(TIF_SIGPENDING);
+ } else {
+ sigpending = 0;
+ }
+
+ err = c->trans_mod->zc_request(c, req, uidata, uodata,
+ inlen, olen, in_hdrlen);
+ if (err < 0) {
+ if (err == -EIO)
+ c->status = Disconnected;
+ if (err != -ERESTARTSYS)
+ goto recalc_sigpending;
+ }
+ if (READ_ONCE(req->status) == REQ_STATUS_ERROR) {
+ p9_debug(P9_DEBUG_ERROR, "req_status error %d\n", req->t_err);
+ err = req->t_err;
+ }
+ if (err == -ERESTARTSYS && c->status == Connected) {
+ p9_debug(P9_DEBUG_MUX, "flushing\n");
+ sigpending = 1;
+ clear_thread_flag(TIF_SIGPENDING);
+
+ if (c->trans_mod->cancel(c, req))
+ p9_client_flush(c, req);
+
+ /* if we received the response anyway, don't signal error */
+ if (READ_ONCE(req->status) == REQ_STATUS_RCVD)
+ err = 0;
+ }
+recalc_sigpending:
+ if (sigpending) {
+ spin_lock_irqsave(&current->sighand->siglock, flags);
+ recalc_sigpending();
+ spin_unlock_irqrestore(&current->sighand->siglock, flags);
+ }
+ if (err < 0)
+ goto reterr;
+
+ err = p9_check_errors(c, req);
+ trace_9p_client_res(c, type, req->rc.tag, err);
+ if (!err)
+ return req;
+reterr:
+ p9_req_put(c, req);
+ return ERR_PTR(safe_errno(err));
+}
+
+static struct p9_fid *p9_fid_create(struct p9_client *clnt)
+{
+ int ret;
+ struct p9_fid *fid;
+
+ p9_debug(P9_DEBUG_FID, "clnt %p\n", clnt);
+ fid = kzalloc(sizeof(*fid), GFP_KERNEL);
+ if (!fid)
+ return NULL;
+
+ fid->mode = -1;
+ fid->uid = current_fsuid();
+ fid->clnt = clnt;
+ refcount_set(&fid->count, 1);
+
+ idr_preload(GFP_KERNEL);
+ spin_lock_irq(&clnt->lock);
+ ret = idr_alloc_u32(&clnt->fids, fid, &fid->fid, P9_NOFID - 1,
+ GFP_NOWAIT);
+ spin_unlock_irq(&clnt->lock);
+ idr_preload_end();
+ if (!ret) {
+ trace_9p_fid_ref(fid, P9_FID_REF_CREATE);
+ return fid;
+ }
+
+ kfree(fid);
+ return NULL;
+}
+
+static void p9_fid_destroy(struct p9_fid *fid)
+{
+ struct p9_client *clnt;
+ unsigned long flags;
+
+ p9_debug(P9_DEBUG_FID, "fid %d\n", fid->fid);
+ trace_9p_fid_ref(fid, P9_FID_REF_DESTROY);
+ clnt = fid->clnt;
+ spin_lock_irqsave(&clnt->lock, flags);
+ idr_remove(&clnt->fids, fid->fid);
+ spin_unlock_irqrestore(&clnt->lock, flags);
+ kfree(fid->rdir);
+ kfree(fid);
+}
+
+/* We also need to export tracepoint symbols for tracepoint_enabled() */
+EXPORT_TRACEPOINT_SYMBOL(9p_fid_ref);
+
+void do_trace_9p_fid_get(struct p9_fid *fid)
+{
+ trace_9p_fid_ref(fid, P9_FID_REF_GET);
+}
+EXPORT_SYMBOL(do_trace_9p_fid_get);
+
+void do_trace_9p_fid_put(struct p9_fid *fid)
+{
+ trace_9p_fid_ref(fid, P9_FID_REF_PUT);
+}
+EXPORT_SYMBOL(do_trace_9p_fid_put);
+
+static int p9_client_version(struct p9_client *c)
+{
+ int err;
+ struct p9_req_t *req;
+ char *version = NULL;
+ int msize;
+
+ p9_debug(P9_DEBUG_9P, ">>> TVERSION msize %d protocol %d\n",
+ c->msize, c->proto_version);
+
+ switch (c->proto_version) {
+ case p9_proto_2000L:
+ req = p9_client_rpc(c, P9_TVERSION, "ds",
+ c->msize, "9P2000.L");
+ break;
+ case p9_proto_2000u:
+ req = p9_client_rpc(c, P9_TVERSION, "ds",
+ c->msize, "9P2000.u");
+ break;
+ case p9_proto_legacy:
+ req = p9_client_rpc(c, P9_TVERSION, "ds",
+ c->msize, "9P2000");
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ err = p9pdu_readf(&req->rc, c->proto_version, "ds", &msize, &version);
+ if (err) {
+ p9_debug(P9_DEBUG_9P, "version error %d\n", err);
+ trace_9p_protocol_dump(c, &req->rc);
+ goto error;
+ }
+
+ p9_debug(P9_DEBUG_9P, "<<< RVERSION msize %d %s\n", msize, version);
+ if (!strncmp(version, "9P2000.L", 8)) {
+ c->proto_version = p9_proto_2000L;
+ } else if (!strncmp(version, "9P2000.u", 8)) {
+ c->proto_version = p9_proto_2000u;
+ } else if (!strncmp(version, "9P2000", 6)) {
+ c->proto_version = p9_proto_legacy;
+ } else {
+ p9_debug(P9_DEBUG_ERROR,
+ "server returned an unknown version: %s\n", version);
+ err = -EREMOTEIO;
+ goto error;
+ }
+
+ if (msize < 4096) {
+ p9_debug(P9_DEBUG_ERROR,
+ "server returned a msize < 4096: %d\n", msize);
+ err = -EREMOTEIO;
+ goto error;
+ }
+ if (msize < c->msize)
+ c->msize = msize;
+
+error:
+ kfree(version);
+ p9_req_put(c, req);
+
+ return err;
+}
+
+struct p9_client *p9_client_create(struct fs_context *fc)
+{
+ int err;
+ static atomic_t seqno = ATOMIC_INIT(0);
+ struct p9_client *clnt;
+ char *client_id;
+ char *cache_name;
+
+ clnt = kmalloc(sizeof(*clnt), GFP_KERNEL);
+ if (!clnt)
+ return ERR_PTR(-ENOMEM);
+
+ clnt->trans_mod = NULL;
+ clnt->trans = NULL;
+ clnt->fcall_cache = NULL;
+
+ client_id = utsname()->nodename;
+ memcpy(clnt->name, client_id, strlen(client_id) + 1);
+
+ spin_lock_init(&clnt->lock);
+ idr_init(&clnt->fids);
+ idr_init(&clnt->reqs);
+
+ err = apply_client_options(clnt, fc);
+ if (err)
+ goto free_client;
+
+ if (!clnt->trans_mod)
+ clnt->trans_mod = v9fs_get_default_trans();
+
+ if (!clnt->trans_mod) {
+ err = -EPROTONOSUPPORT;
+ p9_debug(P9_DEBUG_ERROR,
+ "No transport defined or default transport\n");
+ goto free_client;
+ }
+
+ p9_debug(P9_DEBUG_MUX, "clnt %p trans %p msize %d protocol %d\n",
+ clnt, clnt->trans_mod, clnt->msize, clnt->proto_version);
+
+ err = clnt->trans_mod->create(clnt, fc);
+ if (err)
+ goto put_trans;
+
+ if (clnt->msize > clnt->trans_mod->maxsize) {
+ clnt->msize = clnt->trans_mod->maxsize;
+ pr_info("Limiting 'msize' to %d as this is the maximum "
+ "supported by transport %s\n",
+ clnt->msize, clnt->trans_mod->name
+ );
+ }
+
+ if (clnt->msize < 4096) {
+ p9_debug(P9_DEBUG_ERROR,
+ "Please specify a msize of at least 4k\n");
+ err = -EINVAL;
+ goto close_trans;
+ }
+
+ err = p9_client_version(clnt);
+ if (err)
+ goto close_trans;
+
+ cache_name = kasprintf(GFP_KERNEL,
+ "9p-fcall-cache-%u", atomic_inc_return(&seqno));
+ if (!cache_name) {
+ err = -ENOMEM;
+ goto close_trans;
+ }
+
+ /* P9_HDRSZ + 4 is the smallest packet header we can have that is
+ * followed by data accessed from userspace by read
+ */
+ clnt->fcall_cache =
+ kmem_cache_create_usercopy(cache_name, clnt->msize,
+ 0, 0, P9_HDRSZ + 4,
+ clnt->msize - (P9_HDRSZ + 4),
+ NULL);
+
+ kfree(cache_name);
+ return clnt;
+
+close_trans:
+ clnt->trans_mod->close(clnt);
+put_trans:
+ v9fs_put_trans(clnt->trans_mod);
+free_client:
+ kfree(clnt);
+ return ERR_PTR(err);
+}
+EXPORT_SYMBOL(p9_client_create);
+
+void p9_client_destroy(struct p9_client *clnt)
+{
+ struct p9_fid *fid;
+ int id;
+
+ p9_debug(P9_DEBUG_MUX, "clnt %p\n", clnt);
+
+ if (clnt->trans_mod)
+ clnt->trans_mod->close(clnt);
+
+ v9fs_put_trans(clnt->trans_mod);
+
+ idr_for_each_entry(&clnt->fids, fid, id) {
+ pr_info("Found fid %d not clunked\n", fid->fid);
+ p9_fid_destroy(fid);
+ }
+
+ p9_tag_cleanup(clnt);
+
+ kmem_cache_destroy(clnt->fcall_cache);
+ kfree(clnt);
+}
+EXPORT_SYMBOL(p9_client_destroy);
+
+void p9_client_disconnect(struct p9_client *clnt)
+{
+ p9_debug(P9_DEBUG_9P, "clnt %p\n", clnt);
+ clnt->status = Disconnected;
+}
+EXPORT_SYMBOL(p9_client_disconnect);
+
+void p9_client_begin_disconnect(struct p9_client *clnt)
+{
+ p9_debug(P9_DEBUG_9P, "clnt %p\n", clnt);
+ clnt->status = BeginDisconnect;
+}
+EXPORT_SYMBOL(p9_client_begin_disconnect);
+
+struct p9_fid *p9_client_attach(struct p9_client *clnt, struct p9_fid *afid,
+ const char *uname, kuid_t n_uname,
+ const char *aname)
+{
+ int err;
+ struct p9_req_t *req;
+ struct p9_fid *fid;
+ struct p9_qid qid;
+
+ p9_debug(P9_DEBUG_9P, ">>> TATTACH afid %d uname %s aname %s\n",
+ afid ? afid->fid : -1, uname, aname);
+ fid = p9_fid_create(clnt);
+ if (!fid) {
+ err = -ENOMEM;
+ goto error;
+ }
+ fid->uid = n_uname;
+
+ req = p9_client_rpc(clnt, P9_TATTACH, "ddss?u", fid->fid,
+ afid ? afid->fid : P9_NOFID, uname, aname, n_uname);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto error;
+ }
+
+ err = p9pdu_readf(&req->rc, clnt->proto_version, "Q", &qid);
+ if (err) {
+ trace_9p_protocol_dump(clnt, &req->rc);
+ p9_req_put(clnt, req);
+ goto error;
+ }
+
+ p9_debug(P9_DEBUG_9P, "<<< RATTACH qid %x.%llx.%x\n",
+ qid.type, qid.path, qid.version);
+
+ memmove(&fid->qid, &qid, sizeof(struct p9_qid));
+
+ p9_req_put(clnt, req);
+ return fid;
+
+error:
+ if (fid)
+ p9_fid_destroy(fid);
+ return ERR_PTR(err);
+}
+EXPORT_SYMBOL(p9_client_attach);
+
+struct p9_fid *p9_client_walk(struct p9_fid *oldfid, uint16_t nwname,
+ const unsigned char * const *wnames, int clone)
+{
+ int err;
+ struct p9_client *clnt;
+ struct p9_fid *fid;
+ struct p9_qid *wqids;
+ struct p9_req_t *req;
+ u16 nwqids, count;
+
+ wqids = NULL;
+ clnt = oldfid->clnt;
+ if (clone) {
+ fid = p9_fid_create(clnt);
+ if (!fid) {
+ err = -ENOMEM;
+ goto error;
+ }
+
+ fid->uid = oldfid->uid;
+ } else {
+ fid = oldfid;
+ }
+
+ p9_debug(P9_DEBUG_9P, ">>> TWALK fids %d,%d nwname %ud wname[0] %s\n",
+ oldfid->fid, fid->fid, nwname, wnames ? wnames[0] : NULL);
+ req = p9_client_rpc(clnt, P9_TWALK, "ddT", oldfid->fid, fid->fid,
+ nwname, wnames);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto error;
+ }
+
+ err = p9pdu_readf(&req->rc, clnt->proto_version, "R", &nwqids, &wqids);
+ if (err) {
+ trace_9p_protocol_dump(clnt, &req->rc);
+ p9_req_put(clnt, req);
+ goto clunk_fid;
+ }
+ p9_req_put(clnt, req);
+
+ p9_debug(P9_DEBUG_9P, "<<< RWALK nwqid %d:\n", nwqids);
+
+ if (nwqids != nwname) {
+ err = -ENOENT;
+ goto clunk_fid;
+ }
+
+ for (count = 0; count < nwqids; count++)
+ p9_debug(P9_DEBUG_9P, "<<< [%d] %x.%llx.%x\n",
+ count, wqids[count].type,
+ wqids[count].path,
+ wqids[count].version);
+
+ if (nwname)
+ memmove(&fid->qid, &wqids[nwqids - 1], sizeof(struct p9_qid));
+ else
+ memmove(&fid->qid, &oldfid->qid, sizeof(struct p9_qid));
+
+ kfree(wqids);
+ return fid;
+
+clunk_fid:
+ kfree(wqids);
+ p9_fid_put(fid);
+ fid = NULL;
+
+error:
+ if (fid && fid != oldfid)
+ p9_fid_destroy(fid);
+
+ return ERR_PTR(err);
+}
+EXPORT_SYMBOL(p9_client_walk);
+
+int p9_client_open(struct p9_fid *fid, int mode)
+{
+ int err;
+ struct p9_client *clnt;
+ struct p9_req_t *req;
+ struct p9_qid qid;
+ int iounit;
+
+ clnt = fid->clnt;
+ p9_debug(P9_DEBUG_9P, ">>> %s fid %d mode %d\n",
+ p9_is_proto_dotl(clnt) ? "TLOPEN" : "TOPEN", fid->fid, mode);
+
+ if (fid->mode != -1)
+ return -EINVAL;
+
+ if (p9_is_proto_dotl(clnt))
+ req = p9_client_rpc(clnt, P9_TLOPEN, "dd", fid->fid, mode & P9L_MODE_MASK);
+ else
+ req = p9_client_rpc(clnt, P9_TOPEN, "db", fid->fid, mode & P9L_MODE_MASK);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto error;
+ }
+
+ err = p9pdu_readf(&req->rc, clnt->proto_version, "Qd", &qid, &iounit);
+ if (err) {
+ trace_9p_protocol_dump(clnt, &req->rc);
+ goto free_and_error;
+ }
+
+ p9_debug(P9_DEBUG_9P, "<<< %s qid %x.%llx.%x iounit %x\n",
+ p9_is_proto_dotl(clnt) ? "RLOPEN" : "ROPEN", qid.type,
+ qid.path, qid.version, iounit);
+
+ memmove(&fid->qid, &qid, sizeof(struct p9_qid));
+ fid->mode = mode;
+ fid->iounit = iounit;
+
+free_and_error:
+ p9_req_put(clnt, req);
+error:
+ return err;
+}
+EXPORT_SYMBOL(p9_client_open);
+
+int p9_client_create_dotl(struct p9_fid *ofid, const char *name, u32 flags,
+ u32 mode, kgid_t gid, struct p9_qid *qid)
+{
+ int err;
+ struct p9_client *clnt;
+ struct p9_req_t *req;
+ int iounit;
+
+ p9_debug(P9_DEBUG_9P,
+ ">>> TLCREATE fid %d name %s flags %d mode %d gid %d\n",
+ ofid->fid, name, flags, mode,
+ from_kgid(&init_user_ns, gid));
+ clnt = ofid->clnt;
+
+ if (ofid->mode != -1)
+ return -EINVAL;
+
+ req = p9_client_rpc(clnt, P9_TLCREATE, "dsddg", ofid->fid, name, flags,
+ mode & P9L_MODE_MASK, gid);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto error;
+ }
+
+ err = p9pdu_readf(&req->rc, clnt->proto_version, "Qd", qid, &iounit);
+ if (err) {
+ trace_9p_protocol_dump(clnt, &req->rc);
+ goto free_and_error;
+ }
+
+ p9_debug(P9_DEBUG_9P, "<<< RLCREATE qid %x.%llx.%x iounit %x\n",
+ qid->type, qid->path, qid->version, iounit);
+
+ memmove(&ofid->qid, qid, sizeof(struct p9_qid));
+ ofid->mode = flags;
+ ofid->iounit = iounit;
+
+free_and_error:
+ p9_req_put(clnt, req);
+error:
+ return err;
+}
+EXPORT_SYMBOL(p9_client_create_dotl);
+
+int p9_client_fcreate(struct p9_fid *fid, const char *name, u32 perm, int mode,
+ char *extension)
+{
+ int err;
+ struct p9_client *clnt;
+ struct p9_req_t *req;
+ struct p9_qid qid;
+ int iounit;
+
+ p9_debug(P9_DEBUG_9P, ">>> TCREATE fid %d name %s perm %d mode %d\n",
+ fid->fid, name, perm, mode);
+ clnt = fid->clnt;
+
+ if (fid->mode != -1)
+ return -EINVAL;
+
+ req = p9_client_rpc(clnt, P9_TCREATE, "dsdb?s", fid->fid, name, perm,
+ mode & P9L_MODE_MASK, extension);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto error;
+ }
+
+ err = p9pdu_readf(&req->rc, clnt->proto_version, "Qd", &qid, &iounit);
+ if (err) {
+ trace_9p_protocol_dump(clnt, &req->rc);
+ goto free_and_error;
+ }
+
+ p9_debug(P9_DEBUG_9P, "<<< RCREATE qid %x.%llx.%x iounit %x\n",
+ qid.type, qid.path, qid.version, iounit);
+
+ memmove(&fid->qid, &qid, sizeof(struct p9_qid));
+ fid->mode = mode;
+ fid->iounit = iounit;
+
+free_and_error:
+ p9_req_put(clnt, req);
+error:
+ return err;
+}
+EXPORT_SYMBOL(p9_client_fcreate);
+
+int p9_client_symlink(struct p9_fid *dfid, const char *name,
+ const char *symtgt, kgid_t gid, struct p9_qid *qid)
+{
+ int err;
+ struct p9_client *clnt;
+ struct p9_req_t *req;
+
+ p9_debug(P9_DEBUG_9P, ">>> TSYMLINK dfid %d name %s symtgt %s\n",
+ dfid->fid, name, symtgt);
+ clnt = dfid->clnt;
+
+ req = p9_client_rpc(clnt, P9_TSYMLINK, "dssg", dfid->fid, name, symtgt,
+ gid);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto error;
+ }
+
+ err = p9pdu_readf(&req->rc, clnt->proto_version, "Q", qid);
+ if (err) {
+ trace_9p_protocol_dump(clnt, &req->rc);
+ goto free_and_error;
+ }
+
+ p9_debug(P9_DEBUG_9P, "<<< RSYMLINK qid %x.%llx.%x\n",
+ qid->type, qid->path, qid->version);
+
+free_and_error:
+ p9_req_put(clnt, req);
+error:
+ return err;
+}
+EXPORT_SYMBOL(p9_client_symlink);
+
+int p9_client_link(struct p9_fid *dfid, struct p9_fid *oldfid, const char *newname)
+{
+ struct p9_client *clnt;
+ struct p9_req_t *req;
+
+ p9_debug(P9_DEBUG_9P, ">>> TLINK dfid %d oldfid %d newname %s\n",
+ dfid->fid, oldfid->fid, newname);
+ clnt = dfid->clnt;
+ req = p9_client_rpc(clnt, P9_TLINK, "dds", dfid->fid, oldfid->fid,
+ newname);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ p9_debug(P9_DEBUG_9P, "<<< RLINK\n");
+ p9_req_put(clnt, req);
+ return 0;
+}
+EXPORT_SYMBOL(p9_client_link);
+
+int p9_client_fsync(struct p9_fid *fid, int datasync)
+{
+ int err = 0;
+ struct p9_client *clnt;
+ struct p9_req_t *req;
+
+ p9_debug(P9_DEBUG_9P, ">>> TFSYNC fid %d datasync:%d\n",
+ fid->fid, datasync);
+ clnt = fid->clnt;
+
+ req = p9_client_rpc(clnt, P9_TFSYNC, "dd", fid->fid, datasync);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto error;
+ }
+
+ p9_debug(P9_DEBUG_9P, "<<< RFSYNC fid %d\n", fid->fid);
+
+ p9_req_put(clnt, req);
+
+error:
+ return err;
+}
+EXPORT_SYMBOL(p9_client_fsync);
+
+int p9_client_clunk(struct p9_fid *fid)
+{
+ int err = 0;
+ struct p9_client *clnt;
+ struct p9_req_t *req;
+ int retries = 0;
+
+again:
+ p9_debug(P9_DEBUG_9P, ">>> TCLUNK fid %d (try %d)\n",
+ fid->fid, retries);
+ clnt = fid->clnt;
+
+ req = p9_client_rpc(clnt, P9_TCLUNK, "d", fid->fid);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto error;
+ }
+
+ p9_debug(P9_DEBUG_9P, "<<< RCLUNK fid %d\n", fid->fid);
+
+ p9_req_put(clnt, req);
+error:
+ /* Fid is not valid even after a failed clunk
+ * If interrupted, retry once then give up and
+ * leak fid until umount.
+ */
+ if (err == -ERESTARTSYS) {
+ if (retries++ == 0)
+ goto again;
+ } else {
+ p9_fid_destroy(fid);
+ }
+ return err;
+}
+EXPORT_SYMBOL(p9_client_clunk);
+
+int p9_client_remove(struct p9_fid *fid)
+{
+ int err = 0;
+ struct p9_client *clnt;
+ struct p9_req_t *req;
+
+ p9_debug(P9_DEBUG_9P, ">>> TREMOVE fid %d\n", fid->fid);
+ clnt = fid->clnt;
+
+ req = p9_client_rpc(clnt, P9_TREMOVE, "d", fid->fid);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto error;
+ }
+
+ p9_debug(P9_DEBUG_9P, "<<< RREMOVE fid %d\n", fid->fid);
+
+ p9_req_put(clnt, req);
+error:
+ if (err == -ERESTARTSYS)
+ p9_fid_put(fid);
+ else
+ p9_fid_destroy(fid);
+ return err;
+}
+EXPORT_SYMBOL(p9_client_remove);
+
+int p9_client_unlinkat(struct p9_fid *dfid, const char *name, int flags)
+{
+ int err = 0;
+ struct p9_req_t *req;
+ struct p9_client *clnt;
+
+ p9_debug(P9_DEBUG_9P, ">>> TUNLINKAT fid %d %s %d\n",
+ dfid->fid, name, flags);
+
+ clnt = dfid->clnt;
+ req = p9_client_rpc(clnt, P9_TUNLINKAT, "dsd", dfid->fid, name, flags);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto error;
+ }
+ p9_debug(P9_DEBUG_9P, "<<< RUNLINKAT fid %d %s\n", dfid->fid, name);
+
+ p9_req_put(clnt, req);
+error:
+ return err;
+}
+EXPORT_SYMBOL(p9_client_unlinkat);
+
+int
+p9_client_read(struct p9_fid *fid, u64 offset, struct iov_iter *to, int *err)
+{
+ int total = 0;
+ *err = 0;
+
+ while (iov_iter_count(to)) {
+ int count;
+
+ count = p9_client_read_once(fid, offset, to, err);
+ if (!count || *err)
+ break;
+ offset += count;
+ total += count;
+ }
+ return total;
+}
+EXPORT_SYMBOL(p9_client_read);
+
+int
+p9_client_read_once(struct p9_fid *fid, u64 offset, struct iov_iter *to,
+ int *err)
+{
+ struct p9_client *clnt = fid->clnt;
+ struct p9_req_t *req;
+ int count = iov_iter_count(to);
+ u32 rsize, received;
+ bool non_zc = false;
+ char *dataptr;
+
+ *err = 0;
+ p9_debug(P9_DEBUG_9P, ">>> TREAD fid %d offset %llu %zu\n",
+ fid->fid, offset, iov_iter_count(to));
+
+ rsize = fid->iounit;
+ if (!rsize || rsize > clnt->msize - P9_IOHDRSZ)
+ rsize = clnt->msize - P9_IOHDRSZ;
+
+ if (count < rsize)
+ rsize = count;
+
+ /* Don't bother zerocopy for small IO (< 1024) */
+ if (clnt->trans_mod->zc_request && rsize > 1024) {
+ /* response header len is 11
+ * PDU Header(7) + IO Size (4)
+ */
+ req = p9_client_zc_rpc(clnt, P9_TREAD, to, NULL, rsize,
+ 0, 11, "dqd", fid->fid,
+ offset, rsize);
+ } else {
+ non_zc = true;
+ req = p9_client_rpc(clnt, P9_TREAD, "dqd", fid->fid, offset,
+ rsize);
+ }
+ if (IS_ERR(req)) {
+ *err = PTR_ERR(req);
+ if (!non_zc)
+ iov_iter_revert(to, count - iov_iter_count(to));
+ return 0;
+ }
+
+ *err = p9pdu_readf(&req->rc, clnt->proto_version,
+ "D", &received, &dataptr);
+ if (*err) {
+ if (!non_zc)
+ iov_iter_revert(to, count - iov_iter_count(to));
+ trace_9p_protocol_dump(clnt, &req->rc);
+ p9_req_put(clnt, req);
+ return 0;
+ }
+ if (rsize < received) {
+ pr_err("bogus RREAD count (%u > %u)\n", received, rsize);
+ *err = -EIO;
+ p9_req_put(clnt, req);
+ return 0;
+ }
+
+ p9_debug(P9_DEBUG_9P, "<<< RREAD count %u\n", received);
+
+ if (non_zc) {
+ int n = copy_to_iter(dataptr, received, to);
+
+ if (n != received) {
+ *err = -EFAULT;
+ p9_req_put(clnt, req);
+ return n;
+ }
+ } else {
+ iov_iter_revert(to, count - received - iov_iter_count(to));
+ }
+ p9_req_put(clnt, req);
+ return received;
+}
+EXPORT_SYMBOL(p9_client_read_once);
+
+int
+p9_client_write(struct p9_fid *fid, u64 offset, struct iov_iter *from, int *err)
+{
+ struct p9_client *clnt = fid->clnt;
+ struct p9_req_t *req;
+ int total = 0;
+ *err = 0;
+
+ while (iov_iter_count(from)) {
+ size_t count = iov_iter_count(from);
+ u32 rsize = fid->iounit;
+ u32 written;
+
+ if (!rsize || rsize > clnt->msize - P9_IOHDRSZ)
+ rsize = clnt->msize - P9_IOHDRSZ;
+
+ if (count < rsize)
+ rsize = count;
+
+ p9_debug(P9_DEBUG_9P, ">>> TWRITE fid %d offset %llu count %u (/%zu)\n",
+ fid->fid, offset, rsize, count);
+
+ /* Don't bother zerocopy for small IO (< 1024) */
+ if (clnt->trans_mod->zc_request && rsize > 1024) {
+ req = p9_client_zc_rpc(clnt, P9_TWRITE, NULL, from, 0,
+ rsize, P9_ZC_HDR_SZ, "dqd",
+ fid->fid, offset, rsize);
+ } else {
+ req = p9_client_rpc(clnt, P9_TWRITE, "dqV", fid->fid,
+ offset, rsize, from);
+ }
+ if (IS_ERR(req)) {
+ iov_iter_revert(from, count - iov_iter_count(from));
+ *err = PTR_ERR(req);
+ break;
+ }
+
+ *err = p9pdu_readf(&req->rc, clnt->proto_version, "d", &written);
+ if (*err) {
+ iov_iter_revert(from, count - iov_iter_count(from));
+ trace_9p_protocol_dump(clnt, &req->rc);
+ p9_req_put(clnt, req);
+ break;
+ }
+ if (rsize < written) {
+ pr_err("bogus RWRITE count (%u > %u)\n", written, rsize);
+ *err = -EIO;
+ iov_iter_revert(from, count - iov_iter_count(from));
+ p9_req_put(clnt, req);
+ break;
+ }
+
+ p9_debug(P9_DEBUG_9P, "<<< RWRITE count %u\n", written);
+
+ p9_req_put(clnt, req);
+ iov_iter_revert(from, count - written - iov_iter_count(from));
+ total += written;
+ offset += written;
+ }
+ return total;
+}
+EXPORT_SYMBOL(p9_client_write);
+
+void
+p9_client_write_subreq(struct netfs_io_subrequest *subreq)
+{
+ struct netfs_io_request *wreq = subreq->rreq;
+ struct p9_fid *fid = wreq->netfs_priv;
+ struct p9_client *clnt = fid->clnt;
+ struct p9_req_t *req;
+ unsigned long long start = subreq->start + subreq->transferred;
+ int written, len = subreq->len - subreq->transferred;
+ int err;
+
+ p9_debug(P9_DEBUG_9P, ">>> TWRITE fid %d offset %llu len %d\n",
+ fid->fid, start, len);
+
+ /* Don't bother zerocopy for small IO (< 1024) */
+ if (clnt->trans_mod->zc_request && len > 1024) {
+ req = p9_client_zc_rpc(clnt, P9_TWRITE, NULL, &subreq->io_iter,
+ 0, wreq->len, P9_ZC_HDR_SZ, "dqd",
+ fid->fid, start, len);
+ } else {
+ req = p9_client_rpc(clnt, P9_TWRITE, "dqV", fid->fid,
+ start, len, &subreq->io_iter);
+ }
+ if (IS_ERR(req)) {
+ netfs_write_subrequest_terminated(subreq, PTR_ERR(req));
+ return;
+ }
+
+ err = p9pdu_readf(&req->rc, clnt->proto_version, "d", &written);
+ if (err) {
+ trace_9p_protocol_dump(clnt, &req->rc);
+ p9_req_put(clnt, req);
+ netfs_write_subrequest_terminated(subreq, err);
+ return;
+ }
+
+ if (written > len) {
+ pr_err("bogus RWRITE count (%d > %u)\n", written, len);
+ written = -EIO;
+ }
+
+ p9_debug(P9_DEBUG_9P, "<<< RWRITE count %d\n", len);
+
+ p9_req_put(clnt, req);
+ netfs_write_subrequest_terminated(subreq, written);
+}
+EXPORT_SYMBOL(p9_client_write_subreq);
+
+struct p9_wstat *p9_client_stat(struct p9_fid *fid)
+{
+ int err;
+ struct p9_client *clnt;
+ struct p9_wstat *ret;
+ struct p9_req_t *req;
+ u16 ignored;
+
+ p9_debug(P9_DEBUG_9P, ">>> TSTAT fid %d\n", fid->fid);
+
+ ret = kmalloc(sizeof(*ret), GFP_KERNEL);
+ if (!ret)
+ return ERR_PTR(-ENOMEM);
+
+ clnt = fid->clnt;
+
+ req = p9_client_rpc(clnt, P9_TSTAT, "d", fid->fid);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto error;
+ }
+
+ err = p9pdu_readf(&req->rc, clnt->proto_version, "wS", &ignored, ret);
+ if (err) {
+ trace_9p_protocol_dump(clnt, &req->rc);
+ p9_req_put(clnt, req);
+ goto error;
+ }
+
+ p9_debug(P9_DEBUG_9P,
+ "<<< RSTAT sz=%x type=%x dev=%x qid=%x.%llx.%x\n"
+ "<<< mode=%8.8x atime=%8.8x mtime=%8.8x length=%llx\n"
+ "<<< name=%s uid=%s gid=%s muid=%s extension=(%s)\n"
+ "<<< uid=%d gid=%d n_muid=%d\n",
+ ret->size, ret->type, ret->dev, ret->qid.type, ret->qid.path,
+ ret->qid.version, ret->mode,
+ ret->atime, ret->mtime, ret->length,
+ ret->name, ret->uid, ret->gid, ret->muid, ret->extension,
+ from_kuid(&init_user_ns, ret->n_uid),
+ from_kgid(&init_user_ns, ret->n_gid),
+ from_kuid(&init_user_ns, ret->n_muid));
+
+ p9_req_put(clnt, req);
+ return ret;
+
+error:
+ kfree(ret);
+ return ERR_PTR(err);
+}
+EXPORT_SYMBOL(p9_client_stat);
+
+struct p9_stat_dotl *p9_client_getattr_dotl(struct p9_fid *fid,
+ u64 request_mask)
+{
+ int err;
+ struct p9_client *clnt;
+ struct p9_stat_dotl *ret;
+ struct p9_req_t *req;
+
+ p9_debug(P9_DEBUG_9P, ">>> TGETATTR fid %d, request_mask %lld\n",
+ fid->fid, request_mask);
+
+ ret = kmalloc(sizeof(*ret), GFP_KERNEL);
+ if (!ret)
+ return ERR_PTR(-ENOMEM);
+
+ clnt = fid->clnt;
+
+ req = p9_client_rpc(clnt, P9_TGETATTR, "dq", fid->fid, request_mask);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto error;
+ }
+
+ err = p9pdu_readf(&req->rc, clnt->proto_version, "A", ret);
+ if (err) {
+ trace_9p_protocol_dump(clnt, &req->rc);
+ p9_req_put(clnt, req);
+ goto error;
+ }
+
+ p9_debug(P9_DEBUG_9P, "<<< RGETATTR st_result_mask=%lld\n"
+ "<<< qid=%x.%llx.%x\n"
+ "<<< st_mode=%8.8x st_nlink=%llu\n"
+ "<<< st_uid=%d st_gid=%d\n"
+ "<<< st_rdev=%llx st_size=%llx st_blksize=%llu st_blocks=%llu\n"
+ "<<< st_atime_sec=%lld st_atime_nsec=%lld\n"
+ "<<< st_mtime_sec=%lld st_mtime_nsec=%lld\n"
+ "<<< st_ctime_sec=%lld st_ctime_nsec=%lld\n"
+ "<<< st_btime_sec=%lld st_btime_nsec=%lld\n"
+ "<<< st_gen=%lld st_data_version=%lld\n",
+ ret->st_result_mask,
+ ret->qid.type, ret->qid.path, ret->qid.version,
+ ret->st_mode, ret->st_nlink,
+ from_kuid(&init_user_ns, ret->st_uid),
+ from_kgid(&init_user_ns, ret->st_gid),
+ ret->st_rdev, ret->st_size, ret->st_blksize, ret->st_blocks,
+ ret->st_atime_sec, ret->st_atime_nsec,
+ ret->st_mtime_sec, ret->st_mtime_nsec,
+ ret->st_ctime_sec, ret->st_ctime_nsec,
+ ret->st_btime_sec, ret->st_btime_nsec,
+ ret->st_gen, ret->st_data_version);
+
+ p9_req_put(clnt, req);
+ return ret;
+
+error:
+ kfree(ret);
+ return ERR_PTR(err);
+}
+EXPORT_SYMBOL(p9_client_getattr_dotl);
+
+static int p9_client_statsize(struct p9_wstat *wst, int proto_version)
+{
+ int ret;
+
+ /* NOTE: size shouldn't include its own length */
+ /* size[2] type[2] dev[4] qid[13] */
+ /* mode[4] atime[4] mtime[4] length[8]*/
+ /* name[s] uid[s] gid[s] muid[s] */
+ ret = 2 + 4 + 13 + 4 + 4 + 4 + 8 + 2 + 2 + 2 + 2;
+
+ if (wst->name)
+ ret += strlen(wst->name);
+ if (wst->uid)
+ ret += strlen(wst->uid);
+ if (wst->gid)
+ ret += strlen(wst->gid);
+ if (wst->muid)
+ ret += strlen(wst->muid);
+
+ if (proto_version == p9_proto_2000u ||
+ proto_version == p9_proto_2000L) {
+ /* extension[s] n_uid[4] n_gid[4] n_muid[4] */
+ ret += 2 + 4 + 4 + 4;
+ if (wst->extension)
+ ret += strlen(wst->extension);
+ }
+
+ return ret;
+}
+
+int p9_client_wstat(struct p9_fid *fid, struct p9_wstat *wst)
+{
+ int err = 0;
+ struct p9_req_t *req;
+ struct p9_client *clnt;
+
+ clnt = fid->clnt;
+ wst->size = p9_client_statsize(wst, clnt->proto_version);
+ p9_debug(P9_DEBUG_9P, ">>> TWSTAT fid %d\n",
+ fid->fid);
+ p9_debug(P9_DEBUG_9P,
+ " sz=%x type=%x dev=%x qid=%x.%llx.%x\n"
+ " mode=%8.8x atime=%8.8x mtime=%8.8x length=%llx\n"
+ " name=%s uid=%s gid=%s muid=%s extension=(%s)\n"
+ " uid=%d gid=%d n_muid=%d\n",
+ wst->size, wst->type, wst->dev, wst->qid.type,
+ wst->qid.path, wst->qid.version,
+ wst->mode, wst->atime, wst->mtime, wst->length,
+ wst->name, wst->uid, wst->gid, wst->muid, wst->extension,
+ from_kuid(&init_user_ns, wst->n_uid),
+ from_kgid(&init_user_ns, wst->n_gid),
+ from_kuid(&init_user_ns, wst->n_muid));
+
+ req = p9_client_rpc(clnt, P9_TWSTAT, "dwS",
+ fid->fid, wst->size + 2, wst);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto error;
+ }
+
+ p9_debug(P9_DEBUG_9P, "<<< RWSTAT fid %d\n", fid->fid);
+
+ p9_req_put(clnt, req);
+error:
+ return err;
+}
+EXPORT_SYMBOL(p9_client_wstat);
+
+int p9_client_setattr(struct p9_fid *fid, struct p9_iattr_dotl *p9attr)
+{
+ int err = 0;
+ struct p9_req_t *req;
+ struct p9_client *clnt;
+
+ clnt = fid->clnt;
+ p9_debug(P9_DEBUG_9P, ">>> TSETATTR fid %d\n", fid->fid);
+ p9_debug(P9_DEBUG_9P, " valid=%x mode=%x uid=%d gid=%d size=%lld\n",
+ p9attr->valid, p9attr->mode,
+ from_kuid(&init_user_ns, p9attr->uid),
+ from_kgid(&init_user_ns, p9attr->gid),
+ p9attr->size);
+ p9_debug(P9_DEBUG_9P, " atime_sec=%lld atime_nsec=%lld\n",
+ p9attr->atime_sec, p9attr->atime_nsec);
+ p9_debug(P9_DEBUG_9P, " mtime_sec=%lld mtime_nsec=%lld\n",
+ p9attr->mtime_sec, p9attr->mtime_nsec);
+
+ req = p9_client_rpc(clnt, P9_TSETATTR, "dI", fid->fid, p9attr);
+
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto error;
+ }
+ p9_debug(P9_DEBUG_9P, "<<< RSETATTR fid %d\n", fid->fid);
+ p9_req_put(clnt, req);
+error:
+ return err;
+}
+EXPORT_SYMBOL(p9_client_setattr);
+
+int p9_client_statfs(struct p9_fid *fid, struct p9_rstatfs *sb)
+{
+ int err;
+ struct p9_req_t *req;
+ struct p9_client *clnt;
+
+ clnt = fid->clnt;
+
+ p9_debug(P9_DEBUG_9P, ">>> TSTATFS fid %d\n", fid->fid);
+
+ req = p9_client_rpc(clnt, P9_TSTATFS, "d", fid->fid);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto error;
+ }
+
+ err = p9pdu_readf(&req->rc, clnt->proto_version, "ddqqqqqqd", &sb->type,
+ &sb->bsize, &sb->blocks, &sb->bfree, &sb->bavail,
+ &sb->files, &sb->ffree, &sb->fsid, &sb->namelen);
+ if (err) {
+ trace_9p_protocol_dump(clnt, &req->rc);
+ p9_req_put(clnt, req);
+ goto error;
+ }
+
+ p9_debug(P9_DEBUG_9P,
+ "<<< RSTATFS fid %d type 0x%x bsize %u blocks %llu bfree %llu bavail %llu files %llu ffree %llu fsid %llu namelen %u\n",
+ fid->fid, sb->type, sb->bsize, sb->blocks, sb->bfree,
+ sb->bavail, sb->files, sb->ffree, sb->fsid, sb->namelen);
+
+ p9_req_put(clnt, req);
+error:
+ return err;
+}
+EXPORT_SYMBOL(p9_client_statfs);
+
+int p9_client_rename(struct p9_fid *fid,
+ struct p9_fid *newdirfid, const char *name)
+{
+ int err = 0;
+ struct p9_req_t *req;
+ struct p9_client *clnt;
+
+ clnt = fid->clnt;
+
+ p9_debug(P9_DEBUG_9P, ">>> TRENAME fid %d newdirfid %d name %s\n",
+ fid->fid, newdirfid->fid, name);
+
+ req = p9_client_rpc(clnt, P9_TRENAME, "dds", fid->fid,
+ newdirfid->fid, name);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto error;
+ }
+
+ p9_debug(P9_DEBUG_9P, "<<< RRENAME fid %d\n", fid->fid);
+
+ p9_req_put(clnt, req);
+error:
+ return err;
+}
+EXPORT_SYMBOL(p9_client_rename);
+
+int p9_client_renameat(struct p9_fid *olddirfid, const char *old_name,
+ struct p9_fid *newdirfid, const char *new_name)
+{
+ int err = 0;
+ struct p9_req_t *req;
+ struct p9_client *clnt;
+
+ clnt = olddirfid->clnt;
+
+ p9_debug(P9_DEBUG_9P,
+ ">>> TRENAMEAT olddirfid %d old name %s newdirfid %d new name %s\n",
+ olddirfid->fid, old_name, newdirfid->fid, new_name);
+
+ req = p9_client_rpc(clnt, P9_TRENAMEAT, "dsds", olddirfid->fid,
+ old_name, newdirfid->fid, new_name);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto error;
+ }
+
+ p9_debug(P9_DEBUG_9P, "<<< RRENAMEAT newdirfid %d new name %s\n",
+ newdirfid->fid, new_name);
+
+ p9_req_put(clnt, req);
+error:
+ return err;
+}
+EXPORT_SYMBOL(p9_client_renameat);
+
+/* An xattrwalk without @attr_name gives the fid for the lisxattr namespace
+ */
+struct p9_fid *p9_client_xattrwalk(struct p9_fid *file_fid,
+ const char *attr_name, u64 *attr_size)
+{
+ int err;
+ struct p9_req_t *req;
+ struct p9_client *clnt;
+ struct p9_fid *attr_fid;
+
+ clnt = file_fid->clnt;
+ attr_fid = p9_fid_create(clnt);
+ if (!attr_fid) {
+ err = -ENOMEM;
+ goto error;
+ }
+ p9_debug(P9_DEBUG_9P,
+ ">>> TXATTRWALK file_fid %d, attr_fid %d name '%s'\n",
+ file_fid->fid, attr_fid->fid, attr_name);
+
+ req = p9_client_rpc(clnt, P9_TXATTRWALK, "dds",
+ file_fid->fid, attr_fid->fid, attr_name);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto error;
+ }
+ err = p9pdu_readf(&req->rc, clnt->proto_version, "q", attr_size);
+ if (err) {
+ trace_9p_protocol_dump(clnt, &req->rc);
+ p9_req_put(clnt, req);
+ goto clunk_fid;
+ }
+ p9_req_put(clnt, req);
+ p9_debug(P9_DEBUG_9P, "<<< RXATTRWALK fid %d size %llu\n",
+ attr_fid->fid, *attr_size);
+ return attr_fid;
+clunk_fid:
+ p9_fid_put(attr_fid);
+ attr_fid = NULL;
+error:
+ if (attr_fid && attr_fid != file_fid)
+ p9_fid_destroy(attr_fid);
+
+ return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(p9_client_xattrwalk);
+
+int p9_client_xattrcreate(struct p9_fid *fid, const char *name,
+ u64 attr_size, int flags)
+{
+ int err = 0;
+ struct p9_req_t *req;
+ struct p9_client *clnt;
+
+ p9_debug(P9_DEBUG_9P,
+ ">>> TXATTRCREATE fid %d name %s size %llu flag %d\n",
+ fid->fid, name, attr_size, flags);
+ clnt = fid->clnt;
+ req = p9_client_rpc(clnt, P9_TXATTRCREATE, "dsqd",
+ fid->fid, name, attr_size, flags);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto error;
+ }
+ p9_debug(P9_DEBUG_9P, "<<< RXATTRCREATE fid %d\n", fid->fid);
+ p9_req_put(clnt, req);
+error:
+ return err;
+}
+EXPORT_SYMBOL_GPL(p9_client_xattrcreate);
+
+int p9_client_readdir(struct p9_fid *fid, char *data, u32 count, u64 offset)
+{
+ int err, non_zc = 0;
+ u32 rsize;
+ struct p9_client *clnt;
+ struct p9_req_t *req;
+ char *dataptr;
+ struct kvec kv = {.iov_base = data, .iov_len = count};
+ struct iov_iter to;
+
+ iov_iter_kvec(&to, ITER_DEST, &kv, 1, count);
+
+ p9_debug(P9_DEBUG_9P, ">>> TREADDIR fid %d offset %llu count %u\n",
+ fid->fid, offset, count);
+
+ clnt = fid->clnt;
+
+ rsize = fid->iounit;
+ if (!rsize || rsize > clnt->msize - P9_READDIRHDRSZ)
+ rsize = clnt->msize - P9_READDIRHDRSZ;
+
+ if (count < rsize)
+ rsize = count;
+
+ /* Don't bother zerocopy for small IO (< 1024) */
+ if (clnt->trans_mod->zc_request && rsize > 1024) {
+ /* response header len is 11
+ * PDU Header(7) + IO Size (4)
+ */
+ req = p9_client_zc_rpc(clnt, P9_TREADDIR, &to, NULL, rsize, 0,
+ 11, "dqd", fid->fid, offset, rsize);
+ } else {
+ non_zc = 1;
+ req = p9_client_rpc(clnt, P9_TREADDIR, "dqd", fid->fid,
+ offset, rsize);
+ }
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto error;
+ }
+
+ err = p9pdu_readf(&req->rc, clnt->proto_version, "D", &count, &dataptr);
+ if (err) {
+ trace_9p_protocol_dump(clnt, &req->rc);
+ goto free_and_error;
+ }
+ if (rsize < count) {
+ pr_err("bogus RREADDIR count (%u > %u)\n", count, rsize);
+ err = -EIO;
+ goto free_and_error;
+ }
+
+ p9_debug(P9_DEBUG_9P, "<<< RREADDIR count %u\n", count);
+
+ if (non_zc)
+ memmove(data, dataptr, count);
+
+ p9_req_put(clnt, req);
+ return count;
+
+free_and_error:
+ p9_req_put(clnt, req);
+error:
+ return err;
+}
+EXPORT_SYMBOL(p9_client_readdir);
+
+int p9_client_mknod_dotl(struct p9_fid *fid, const char *name, int mode,
+ dev_t rdev, kgid_t gid, struct p9_qid *qid)
+{
+ int err;
+ struct p9_client *clnt;
+ struct p9_req_t *req;
+
+ clnt = fid->clnt;
+ p9_debug(P9_DEBUG_9P,
+ ">>> TMKNOD fid %d name %s mode %d major %d minor %d\n",
+ fid->fid, name, mode, MAJOR(rdev), MINOR(rdev));
+ req = p9_client_rpc(clnt, P9_TMKNOD, "dsdddg", fid->fid, name, mode,
+ MAJOR(rdev), MINOR(rdev), gid);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ err = p9pdu_readf(&req->rc, clnt->proto_version, "Q", qid);
+ if (err) {
+ trace_9p_protocol_dump(clnt, &req->rc);
+ goto error;
+ }
+ p9_debug(P9_DEBUG_9P, "<<< RMKNOD qid %x.%llx.%x\n",
+ qid->type, qid->path, qid->version);
+
+error:
+ p9_req_put(clnt, req);
+ return err;
+}
+EXPORT_SYMBOL(p9_client_mknod_dotl);
+
+int p9_client_mkdir_dotl(struct p9_fid *fid, const char *name, int mode,
+ kgid_t gid, struct p9_qid *qid)
+{
+ int err;
+ struct p9_client *clnt;
+ struct p9_req_t *req;
+
+ clnt = fid->clnt;
+ p9_debug(P9_DEBUG_9P, ">>> TMKDIR fid %d name %s mode %d gid %d\n",
+ fid->fid, name, mode, from_kgid(&init_user_ns, gid));
+ req = p9_client_rpc(clnt, P9_TMKDIR, "dsdg",
+ fid->fid, name, mode, gid);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ err = p9pdu_readf(&req->rc, clnt->proto_version, "Q", qid);
+ if (err) {
+ trace_9p_protocol_dump(clnt, &req->rc);
+ goto error;
+ }
+ p9_debug(P9_DEBUG_9P, "<<< RMKDIR qid %x.%llx.%x\n", qid->type,
+ qid->path, qid->version);
+
+error:
+ p9_req_put(clnt, req);
+ return err;
+}
+EXPORT_SYMBOL(p9_client_mkdir_dotl);
+
+int p9_client_lock_dotl(struct p9_fid *fid, struct p9_flock *flock, u8 *status)
+{
+ int err;
+ struct p9_client *clnt;
+ struct p9_req_t *req;
+
+ clnt = fid->clnt;
+ p9_debug(P9_DEBUG_9P,
+ ">>> TLOCK fid %d type %i flags %d start %lld length %lld proc_id %d client_id %s\n",
+ fid->fid, flock->type, flock->flags, flock->start,
+ flock->length, flock->proc_id, flock->client_id);
+
+ req = p9_client_rpc(clnt, P9_TLOCK, "dbdqqds", fid->fid, flock->type,
+ flock->flags, flock->start, flock->length,
+ flock->proc_id, flock->client_id);
+
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ err = p9pdu_readf(&req->rc, clnt->proto_version, "b", status);
+ if (err) {
+ trace_9p_protocol_dump(clnt, &req->rc);
+ goto error;
+ }
+ p9_debug(P9_DEBUG_9P, "<<< RLOCK status %i\n", *status);
+error:
+ p9_req_put(clnt, req);
+ return err;
+}
+EXPORT_SYMBOL(p9_client_lock_dotl);
+
+int p9_client_getlock_dotl(struct p9_fid *fid, struct p9_getlock *glock)
+{
+ int err;
+ struct p9_client *clnt;
+ struct p9_req_t *req;
+
+ clnt = fid->clnt;
+ p9_debug(P9_DEBUG_9P,
+ ">>> TGETLOCK fid %d, type %i start %lld length %lld proc_id %d client_id %s\n",
+ fid->fid, glock->type, glock->start, glock->length,
+ glock->proc_id, glock->client_id);
+
+ req = p9_client_rpc(clnt, P9_TGETLOCK, "dbqqds", fid->fid,
+ glock->type, glock->start, glock->length,
+ glock->proc_id, glock->client_id);
+
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ err = p9pdu_readf(&req->rc, clnt->proto_version, "bqqds", &glock->type,
+ &glock->start, &glock->length, &glock->proc_id,
+ &glock->client_id);
+ if (err) {
+ trace_9p_protocol_dump(clnt, &req->rc);
+ goto error;
+ }
+ p9_debug(P9_DEBUG_9P,
+ "<<< RGETLOCK type %i start %lld length %lld proc_id %d client_id %s\n",
+ glock->type, glock->start, glock->length,
+ glock->proc_id, glock->client_id);
+error:
+ p9_req_put(clnt, req);
+ return err;
+}
+EXPORT_SYMBOL(p9_client_getlock_dotl);
+
+int p9_client_readlink(struct p9_fid *fid, char **target)
+{
+ int err;
+ struct p9_client *clnt;
+ struct p9_req_t *req;
+
+ clnt = fid->clnt;
+ p9_debug(P9_DEBUG_9P, ">>> TREADLINK fid %d\n", fid->fid);
+
+ req = p9_client_rpc(clnt, P9_TREADLINK, "d", fid->fid);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ err = p9pdu_readf(&req->rc, clnt->proto_version, "s", target);
+ if (err) {
+ trace_9p_protocol_dump(clnt, &req->rc);
+ goto error;
+ }
+ p9_debug(P9_DEBUG_9P, "<<< RREADLINK target %s\n", *target);
+error:
+ p9_req_put(clnt, req);
+ return err;
+}
+EXPORT_SYMBOL(p9_client_readlink);
+
+int __init p9_client_init(void)
+{
+ p9_req_cache = KMEM_CACHE(p9_req_t, SLAB_TYPESAFE_BY_RCU);
+ return p9_req_cache ? 0 : -ENOMEM;
+}
+
+void __exit p9_client_exit(void)
+{
+ kmem_cache_destroy(p9_req_cache);
+}
diff --git a/net/9p/error.c b/net/9p/error.c
new file mode 100644
index 000000000000..8ba8afc91482
--- /dev/null
+++ b/net/9p/error.c
@@ -0,0 +1,227 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Error string handling
+ *
+ * Plan 9 uses error strings, Unix uses error numbers. These functions
+ * try to help manage that and provide for dynamically adding error
+ * mappings.
+ *
+ * Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ * Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/jhash.h>
+#include <linux/errno.h>
+#include <linux/hashtable.h>
+#include <net/9p/9p.h>
+
+/**
+ * struct errormap - map string errors from Plan 9 to Linux numeric ids
+ * @name: string sent over 9P
+ * @val: numeric id most closely representing @name
+ * @namelen: length of string
+ * @list: hash-table list for string lookup
+ */
+struct errormap {
+ char *name;
+ int val;
+
+ int namelen;
+ struct hlist_node list;
+};
+
+#define ERRHASH_BITS 5
+static DEFINE_HASHTABLE(hash_errmap, ERRHASH_BITS);
+
+/* FixMe - reduce to a reasonable size */
+static struct errormap errmap[] = {
+ {"Operation not permitted", EPERM},
+ {"wstat prohibited", EPERM},
+ {"No such file or directory", ENOENT},
+ {"directory entry not found", ENOENT},
+ {"file not found", ENOENT},
+ {"Interrupted system call", EINTR},
+ {"Input/output error", EIO},
+ {"No such device or address", ENXIO},
+ {"Argument list too long", E2BIG},
+ {"Bad file descriptor", EBADF},
+ {"Resource temporarily unavailable", EAGAIN},
+ {"Cannot allocate memory", ENOMEM},
+ {"Permission denied", EACCES},
+ {"Bad address", EFAULT},
+ {"Block device required", ENOTBLK},
+ {"Device or resource busy", EBUSY},
+ {"File exists", EEXIST},
+ {"Invalid cross-device link", EXDEV},
+ {"No such device", ENODEV},
+ {"Not a directory", ENOTDIR},
+ {"Is a directory", EISDIR},
+ {"Invalid argument", EINVAL},
+ {"Too many open files in system", ENFILE},
+ {"Too many open files", EMFILE},
+ {"Text file busy", ETXTBSY},
+ {"File too large", EFBIG},
+ {"No space left on device", ENOSPC},
+ {"Illegal seek", ESPIPE},
+ {"Read-only file system", EROFS},
+ {"Too many links", EMLINK},
+ {"Broken pipe", EPIPE},
+ {"Numerical argument out of domain", EDOM},
+ {"Numerical result out of range", ERANGE},
+ {"Resource deadlock avoided", EDEADLK},
+ {"File name too long", ENAMETOOLONG},
+ {"No locks available", ENOLCK},
+ {"Function not implemented", ENOSYS},
+ {"Directory not empty", ENOTEMPTY},
+ {"Too many levels of symbolic links", ELOOP},
+ {"No message of desired type", ENOMSG},
+ {"Identifier removed", EIDRM},
+ {"No data available", ENODATA},
+ {"Machine is not on the network", ENONET},
+ {"Package not installed", ENOPKG},
+ {"Object is remote", EREMOTE},
+ {"Link has been severed", ENOLINK},
+ {"Communication error on send", ECOMM},
+ {"Protocol error", EPROTO},
+ {"Bad message", EBADMSG},
+ {"File descriptor in bad state", EBADFD},
+ {"Streams pipe error", ESTRPIPE},
+ {"Too many users", EUSERS},
+ {"Socket operation on non-socket", ENOTSOCK},
+ {"Message too long", EMSGSIZE},
+ {"Protocol not available", ENOPROTOOPT},
+ {"Protocol not supported", EPROTONOSUPPORT},
+ {"Socket type not supported", ESOCKTNOSUPPORT},
+ {"Operation not supported", EOPNOTSUPP},
+ {"Protocol family not supported", EPFNOSUPPORT},
+ {"Network is down", ENETDOWN},
+ {"Network is unreachable", ENETUNREACH},
+ {"Network dropped connection on reset", ENETRESET},
+ {"Software caused connection abort", ECONNABORTED},
+ {"Connection reset by peer", ECONNRESET},
+ {"No buffer space available", ENOBUFS},
+ {"Transport endpoint is already connected", EISCONN},
+ {"Transport endpoint is not connected", ENOTCONN},
+ {"Cannot send after transport endpoint shutdown", ESHUTDOWN},
+ {"Connection timed out", ETIMEDOUT},
+ {"Connection refused", ECONNREFUSED},
+ {"Host is down", EHOSTDOWN},
+ {"No route to host", EHOSTUNREACH},
+ {"Operation already in progress", EALREADY},
+ {"Operation now in progress", EINPROGRESS},
+ {"Is a named type file", EISNAM},
+ {"Remote I/O error", EREMOTEIO},
+ {"Disk quota exceeded", EDQUOT},
+/* errors from fossil, vacfs, and u9fs */
+ {"fid unknown or out of range", EBADF},
+ {"permission denied", EACCES},
+ {"file does not exist", ENOENT},
+ {"authentication failed", ECONNREFUSED},
+ {"bad offset in directory read", ESPIPE},
+ {"bad use of fid", EBADF},
+ {"wstat can't convert between files and directories", EPERM},
+ {"directory is not empty", ENOTEMPTY},
+ {"file exists", EEXIST},
+ {"file already exists", EEXIST},
+ {"file or directory already exists", EEXIST},
+ {"fid already in use", EBADF},
+ {"file in use", ETXTBSY},
+ {"i/o error", EIO},
+ {"file already open for I/O", ETXTBSY},
+ {"illegal mode", EINVAL},
+ {"illegal name", ENAMETOOLONG},
+ {"not a directory", ENOTDIR},
+ {"not a member of proposed group", EPERM},
+ {"not owner", EACCES},
+ {"only owner can change group in wstat", EACCES},
+ {"read only file system", EROFS},
+ {"no access to special file", EPERM},
+ {"i/o count too large", EIO},
+ {"unknown group", EINVAL},
+ {"unknown user", EINVAL},
+ {"bogus wstat buffer", EPROTO},
+ {"exclusive use file already open", EAGAIN},
+ {"corrupted directory entry", EIO},
+ {"corrupted file entry", EIO},
+ {"corrupted block label", EIO},
+ {"corrupted meta data", EIO},
+ {"illegal offset", EINVAL},
+ {"illegal path element", ENOENT},
+ {"root of file system is corrupted", EIO},
+ {"corrupted super block", EIO},
+ {"protocol botch", EPROTO},
+ {"file system is full", ENOSPC},
+ {"file is in use", EAGAIN},
+ {"directory entry is not allocated", ENOENT},
+ {"file is read only", EROFS},
+ {"file has been removed", EIDRM},
+ {"only support truncation to zero length", EPERM},
+ {"cannot remove root", EPERM},
+ {"file too big", EFBIG},
+ {"venti i/o error", EIO},
+ /* these are not errors */
+ {"u9fs rhostsauth: no authentication required", 0},
+ {"u9fs authnone: no authentication required", 0},
+ {NULL, -1}
+};
+
+/**
+ * p9_error_init - preload mappings into hash list
+ *
+ */
+
+int p9_error_init(void)
+{
+ struct errormap *c;
+ u32 hash;
+
+ /* load initial error map into hash table */
+ for (c = errmap; c->name; c++) {
+ c->namelen = strlen(c->name);
+ hash = jhash(c->name, c->namelen, 0);
+ INIT_HLIST_NODE(&c->list);
+ hash_add(hash_errmap, &c->list, hash);
+ }
+
+ return 1;
+}
+EXPORT_SYMBOL(p9_error_init);
+
+/**
+ * p9_errstr2errno - convert error string to error number
+ * @errstr: error string
+ * @len: length of error string
+ *
+ */
+
+int p9_errstr2errno(char *errstr, int len)
+{
+ int errno;
+ struct errormap *c;
+ u32 hash;
+
+ errno = 0;
+ c = NULL;
+ hash = jhash(errstr, len, 0);
+ hash_for_each_possible(hash_errmap, c, list, hash) {
+ if (c->namelen == len && !memcmp(c->name, errstr, len)) {
+ errno = c->val;
+ break;
+ }
+ }
+
+ if (errno == 0) {
+ /* TODO: if error isn't found, add it dynamically */
+ errstr[len] = 0;
+ pr_err("%s: server reported unknown error %s\n",
+ __func__, errstr);
+ errno = ESERVERFAULT;
+ }
+
+ return -errno;
+}
+EXPORT_SYMBOL(p9_errstr2errno);
diff --git a/net/9p/mod.c b/net/9p/mod.c
new file mode 100644
index 000000000000..85160b52da55
--- /dev/null
+++ b/net/9p/mod.c
@@ -0,0 +1,212 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * 9P entry point
+ *
+ * Copyright (C) 2007 by Latchesar Ionkov <lucho@ionkov.net>
+ * Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ * Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/kmod.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/moduleparam.h>
+#include <net/9p/9p.h>
+#include <linux/fs.h>
+#include <net/9p/client.h>
+#include <net/9p/transport.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+
+#ifdef CONFIG_NET_9P_DEBUG
+unsigned int p9_debug_level; /* feature-rific global debug level */
+EXPORT_SYMBOL(p9_debug_level);
+module_param_named(debug, p9_debug_level, uint, 0);
+MODULE_PARM_DESC(debug, "9P debugging level");
+
+void _p9_debug(enum p9_debug_flags level, const char *func,
+ const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ if ((p9_debug_level & level) != level)
+ return;
+
+ va_start(args, fmt);
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ if (level == P9_DEBUG_9P)
+ pr_notice("(%8.8d) %pV", task_pid_nr(current), &vaf);
+ else
+ pr_notice("-- %s (%d): %pV", func, task_pid_nr(current), &vaf);
+
+ va_end(args);
+}
+EXPORT_SYMBOL(_p9_debug);
+#endif
+
+/* Dynamic Transport Registration Routines */
+
+static DEFINE_SPINLOCK(v9fs_trans_lock);
+static LIST_HEAD(v9fs_trans_list);
+
+/**
+ * v9fs_register_trans - register a new transport with 9p
+ * @m: structure describing the transport module and entry points
+ *
+ */
+void v9fs_register_trans(struct p9_trans_module *m)
+{
+ spin_lock(&v9fs_trans_lock);
+ list_add_tail(&m->list, &v9fs_trans_list);
+ spin_unlock(&v9fs_trans_lock);
+}
+EXPORT_SYMBOL(v9fs_register_trans);
+
+/**
+ * v9fs_unregister_trans - unregister a 9p transport
+ * @m: the transport to remove
+ *
+ */
+void v9fs_unregister_trans(struct p9_trans_module *m)
+{
+ spin_lock(&v9fs_trans_lock);
+ list_del_init(&m->list);
+ spin_unlock(&v9fs_trans_lock);
+}
+EXPORT_SYMBOL(v9fs_unregister_trans);
+
+static struct p9_trans_module *_p9_get_trans_by_name(const char *s)
+{
+ struct p9_trans_module *t, *found = NULL;
+
+ spin_lock(&v9fs_trans_lock);
+
+ list_for_each_entry(t, &v9fs_trans_list, list)
+ if (strcmp(t->name, s) == 0 &&
+ try_module_get(t->owner)) {
+ found = t;
+ break;
+ }
+
+ spin_unlock(&v9fs_trans_lock);
+
+ return found;
+}
+
+/**
+ * v9fs_get_trans_by_name - get transport with the matching name
+ * @s: string identifying transport
+ *
+ */
+struct p9_trans_module *v9fs_get_trans_by_name(const char *s)
+{
+ struct p9_trans_module *found = NULL;
+
+ found = _p9_get_trans_by_name(s);
+
+#ifdef CONFIG_MODULES
+ if (!found) {
+ request_module("9p-%s", s);
+ found = _p9_get_trans_by_name(s);
+ }
+#endif
+
+ return found;
+}
+EXPORT_SYMBOL(v9fs_get_trans_by_name);
+
+static const char * const v9fs_default_transports[] = {
+ "virtio", "tcp", "fd", "unix", "xen", "rdma",
+};
+
+/**
+ * v9fs_get_default_trans - get the default transport
+ *
+ */
+
+struct p9_trans_module *v9fs_get_default_trans(void)
+{
+ struct p9_trans_module *t, *found = NULL;
+ int i;
+
+ spin_lock(&v9fs_trans_lock);
+
+ list_for_each_entry(t, &v9fs_trans_list, list)
+ if (t->def && try_module_get(t->owner)) {
+ found = t;
+ break;
+ }
+
+ if (!found)
+ list_for_each_entry(t, &v9fs_trans_list, list)
+ if (try_module_get(t->owner)) {
+ found = t;
+ break;
+ }
+
+ spin_unlock(&v9fs_trans_lock);
+
+ for (i = 0; !found && i < ARRAY_SIZE(v9fs_default_transports); i++)
+ found = v9fs_get_trans_by_name(v9fs_default_transports[i]);
+
+ return found;
+}
+EXPORT_SYMBOL(v9fs_get_default_trans);
+
+/**
+ * v9fs_put_trans - put trans
+ * @m: transport to put
+ *
+ */
+void v9fs_put_trans(struct p9_trans_module *m)
+{
+ if (m)
+ module_put(m->owner);
+}
+EXPORT_SYMBOL(v9fs_put_trans);
+
+/**
+ * init_p9 - Initialize module
+ *
+ */
+static int __init init_p9(void)
+{
+ int ret;
+
+ ret = p9_client_init();
+ if (ret)
+ return ret;
+
+ p9_error_init();
+ pr_info("Installing 9P2000 support\n");
+
+ return ret;
+}
+
+/**
+ * exit_p9 - shutdown module
+ *
+ */
+
+static void __exit exit_p9(void)
+{
+ pr_info("Unloading 9P2000 support\n");
+
+ p9_client_exit();
+}
+
+module_init(init_p9)
+module_exit(exit_p9)
+
+MODULE_AUTHOR("Latchesar Ionkov <lucho@ionkov.net>");
+MODULE_AUTHOR("Eric Van Hensbergen <ericvh@gmail.com>");
+MODULE_AUTHOR("Ron Minnich <rminnich@lanl.gov>");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Plan 9 Resource Sharing Support (9P2000)");
diff --git a/net/9p/protocol.c b/net/9p/protocol.c
new file mode 100644
index 000000000000..0e6603b1ec90
--- /dev/null
+++ b/net/9p/protocol.c
@@ -0,0 +1,801 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * 9P Protocol Support Code
+ *
+ * Copyright (C) 2008 by Eric Van Hensbergen <ericvh@gmail.com>
+ *
+ * Base on code from Anthony Liguori <aliguori@us.ibm.com>
+ * Copyright (C) 2008 by IBM, Corp.
+ */
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/uaccess.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/stddef.h>
+#include <linux/types.h>
+#include <linux/uio.h>
+#include <net/9p/9p.h>
+#include <net/9p/client.h>
+#include "protocol.h"
+
+#include <trace/events/9p.h>
+
+/* len[2] text[len] */
+#define P9_STRLEN(s) \
+ (2 + min_t(size_t, s ? strlen(s) : 0, USHRT_MAX))
+
+/**
+ * p9_msg_buf_size - Returns a buffer size sufficiently large to hold the
+ * intended 9p message.
+ * @c: client
+ * @type: message type
+ * @fmt: format template for assembling request message
+ * (see p9pdu_vwritef)
+ * @ap: variable arguments to be fed to passed format template
+ * (see p9pdu_vwritef)
+ *
+ * Note: Even for response types (P9_R*) the format template and variable
+ * arguments must always be for the originating request type (P9_T*).
+ */
+size_t p9_msg_buf_size(struct p9_client *c, enum p9_msg_t type,
+ const char *fmt, va_list ap)
+{
+ /* size[4] type[1] tag[2] */
+ const int hdr = 4 + 1 + 2;
+ /* ename[s] errno[4] */
+ const int rerror_size = hdr + P9_ERRMAX + 4;
+ /* ecode[4] */
+ const int rlerror_size = hdr + 4;
+ const int err_size =
+ c->proto_version == p9_proto_2000L ? rlerror_size : rerror_size;
+
+ static_assert(NAME_MAX <= 4*1024, "p9_msg_buf_size() currently assumes "
+ "a max. allowed directory entry name length of 4k");
+
+ switch (type) {
+
+ /* message types not used at all */
+ case P9_TERROR:
+ case P9_TLERROR:
+ case P9_TAUTH:
+ case P9_RAUTH:
+ BUG();
+
+ /* variable length & potentially large message types */
+ case P9_TATTACH:
+ BUG_ON(strcmp("ddss?u", fmt));
+ va_arg(ap, int32_t);
+ va_arg(ap, int32_t);
+ {
+ const char *uname = va_arg(ap, const char *);
+ const char *aname = va_arg(ap, const char *);
+ /* fid[4] afid[4] uname[s] aname[s] n_uname[4] */
+ return hdr + 4 + 4 + P9_STRLEN(uname) + P9_STRLEN(aname) + 4;
+ }
+ case P9_TWALK:
+ BUG_ON(strcmp("ddT", fmt));
+ va_arg(ap, int32_t);
+ va_arg(ap, int32_t);
+ {
+ uint i, nwname = va_arg(ap, int);
+ size_t wname_all;
+ const char **wnames = va_arg(ap, const char **);
+ for (i = 0, wname_all = 0; i < nwname; ++i) {
+ wname_all += P9_STRLEN(wnames[i]);
+ }
+ /* fid[4] newfid[4] nwname[2] nwname*(wname[s]) */
+ return hdr + 4 + 4 + 2 + wname_all;
+ }
+ case P9_RWALK:
+ BUG_ON(strcmp("ddT", fmt));
+ va_arg(ap, int32_t);
+ va_arg(ap, int32_t);
+ {
+ uint nwname = va_arg(ap, int);
+ /* nwqid[2] nwqid*(wqid[13]) */
+ return max_t(size_t, hdr + 2 + nwname * 13, err_size);
+ }
+ case P9_TCREATE:
+ BUG_ON(strcmp("dsdb?s", fmt));
+ va_arg(ap, int32_t);
+ {
+ const char *name = va_arg(ap, const char *);
+ if (c->proto_version == p9_proto_legacy) {
+ /* fid[4] name[s] perm[4] mode[1] */
+ return hdr + 4 + P9_STRLEN(name) + 4 + 1;
+ } else {
+ va_arg(ap, int32_t);
+ va_arg(ap, int);
+ {
+ const char *ext = va_arg(ap, const char *);
+ /* fid[4] name[s] perm[4] mode[1] extension[s] */
+ return hdr + 4 + P9_STRLEN(name) + 4 + 1 + P9_STRLEN(ext);
+ }
+ }
+ }
+ case P9_TLCREATE:
+ BUG_ON(strcmp("dsddg", fmt));
+ va_arg(ap, int32_t);
+ {
+ const char *name = va_arg(ap, const char *);
+ /* fid[4] name[s] flags[4] mode[4] gid[4] */
+ return hdr + 4 + P9_STRLEN(name) + 4 + 4 + 4;
+ }
+ case P9_RREAD:
+ case P9_RREADDIR:
+ BUG_ON(strcmp("dqd", fmt));
+ va_arg(ap, int32_t);
+ va_arg(ap, int64_t);
+ {
+ const int32_t count = va_arg(ap, int32_t);
+ /* count[4] data[count] */
+ return max_t(size_t, hdr + 4 + count, err_size);
+ }
+ case P9_TWRITE:
+ BUG_ON(strcmp("dqV", fmt));
+ va_arg(ap, int32_t);
+ va_arg(ap, int64_t);
+ {
+ const int32_t count = va_arg(ap, int32_t);
+ /* fid[4] offset[8] count[4] data[count] */
+ return hdr + 4 + 8 + 4 + count;
+ }
+ case P9_TRENAMEAT:
+ BUG_ON(strcmp("dsds", fmt));
+ va_arg(ap, int32_t);
+ {
+ const char *oldname, *newname;
+ oldname = va_arg(ap, const char *);
+ va_arg(ap, int32_t);
+ newname = va_arg(ap, const char *);
+ /* olddirfid[4] oldname[s] newdirfid[4] newname[s] */
+ return hdr + 4 + P9_STRLEN(oldname) + 4 + P9_STRLEN(newname);
+ }
+ case P9_TSYMLINK:
+ BUG_ON(strcmp("dssg", fmt));
+ va_arg(ap, int32_t);
+ {
+ const char *name = va_arg(ap, const char *);
+ const char *symtgt = va_arg(ap, const char *);
+ /* fid[4] name[s] symtgt[s] gid[4] */
+ return hdr + 4 + P9_STRLEN(name) + P9_STRLEN(symtgt) + 4;
+ }
+
+ case P9_RERROR:
+ return rerror_size;
+ case P9_RLERROR:
+ return rlerror_size;
+
+ /* small message types */
+ case P9_TWSTAT:
+ case P9_RSTAT:
+ case P9_RREADLINK:
+ case P9_TXATTRWALK:
+ case P9_TXATTRCREATE:
+ case P9_TLINK:
+ case P9_TMKDIR:
+ case P9_TMKNOD:
+ case P9_TRENAME:
+ case P9_TUNLINKAT:
+ case P9_TLOCK:
+ return 8 * 1024;
+
+ /* tiny message types */
+ default:
+ return 4 * 1024;
+
+ }
+}
+
+static int
+p9pdu_writef(struct p9_fcall *pdu, int proto_version, const char *fmt, ...);
+
+void p9stat_free(struct p9_wstat *stbuf)
+{
+ kfree(stbuf->name);
+ stbuf->name = NULL;
+ kfree(stbuf->uid);
+ stbuf->uid = NULL;
+ kfree(stbuf->gid);
+ stbuf->gid = NULL;
+ kfree(stbuf->muid);
+ stbuf->muid = NULL;
+ kfree(stbuf->extension);
+ stbuf->extension = NULL;
+}
+EXPORT_SYMBOL(p9stat_free);
+
+size_t pdu_read(struct p9_fcall *pdu, void *data, size_t size)
+{
+ size_t len = min(pdu->size - pdu->offset, size);
+
+ memcpy(data, &pdu->sdata[pdu->offset], len);
+ pdu->offset += len;
+ return size - len;
+}
+
+static size_t pdu_write(struct p9_fcall *pdu, const void *data, size_t size)
+{
+ size_t len = min(pdu->capacity - pdu->size, size);
+
+ memcpy(&pdu->sdata[pdu->size], data, len);
+ pdu->size += len;
+ return size - len;
+}
+
+static size_t
+pdu_write_u(struct p9_fcall *pdu, struct iov_iter *from, size_t size)
+{
+ size_t len = min(pdu->capacity - pdu->size, size);
+
+ if (!copy_from_iter_full(&pdu->sdata[pdu->size], len, from))
+ len = 0;
+
+ pdu->size += len;
+ return size - len;
+}
+
+/* b - int8_t
+ * w - int16_t
+ * d - int32_t
+ * q - int64_t
+ * s - string
+ * u - numeric uid
+ * g - numeric gid
+ * S - stat
+ * Q - qid
+ * D - data blob (int32_t size followed by void *, results are not freed)
+ * T - array of strings (int16_t count, followed by strings)
+ * R - array of qids (int16_t count, followed by qids)
+ * A - stat for 9p2000.L (p9_stat_dotl)
+ * ? - if optional = 1, continue parsing
+ */
+
+static int
+p9pdu_vreadf(struct p9_fcall *pdu, int proto_version, const char *fmt,
+ va_list ap)
+{
+ const char *ptr;
+ int errcode = 0;
+
+ for (ptr = fmt; *ptr; ptr++) {
+ switch (*ptr) {
+ case 'b':{
+ int8_t *val = va_arg(ap, int8_t *);
+ if (pdu_read(pdu, val, sizeof(*val))) {
+ errcode = -EFAULT;
+ break;
+ }
+ }
+ break;
+ case 'w':{
+ int16_t *val = va_arg(ap, int16_t *);
+ __le16 le_val;
+ if (pdu_read(pdu, &le_val, sizeof(le_val))) {
+ errcode = -EFAULT;
+ break;
+ }
+ *val = le16_to_cpu(le_val);
+ }
+ break;
+ case 'd':{
+ int32_t *val = va_arg(ap, int32_t *);
+ __le32 le_val;
+ if (pdu_read(pdu, &le_val, sizeof(le_val))) {
+ errcode = -EFAULT;
+ break;
+ }
+ *val = le32_to_cpu(le_val);
+ }
+ break;
+ case 'q':{
+ int64_t *val = va_arg(ap, int64_t *);
+ __le64 le_val;
+ if (pdu_read(pdu, &le_val, sizeof(le_val))) {
+ errcode = -EFAULT;
+ break;
+ }
+ *val = le64_to_cpu(le_val);
+ }
+ break;
+ case 's':{
+ char **sptr = va_arg(ap, char **);
+ uint16_t len;
+
+ errcode = p9pdu_readf(pdu, proto_version,
+ "w", &len);
+ if (errcode)
+ break;
+
+ *sptr = kmalloc(len + 1, GFP_NOFS);
+ if (*sptr == NULL) {
+ errcode = -ENOMEM;
+ break;
+ }
+ if (pdu_read(pdu, *sptr, len)) {
+ errcode = -EFAULT;
+ kfree(*sptr);
+ *sptr = NULL;
+ } else
+ (*sptr)[len] = 0;
+ }
+ break;
+ case 'u': {
+ kuid_t *uid = va_arg(ap, kuid_t *);
+ __le32 le_val;
+ if (pdu_read(pdu, &le_val, sizeof(le_val))) {
+ errcode = -EFAULT;
+ break;
+ }
+ *uid = make_kuid(&init_user_ns,
+ le32_to_cpu(le_val));
+ } break;
+ case 'g': {
+ kgid_t *gid = va_arg(ap, kgid_t *);
+ __le32 le_val;
+ if (pdu_read(pdu, &le_val, sizeof(le_val))) {
+ errcode = -EFAULT;
+ break;
+ }
+ *gid = make_kgid(&init_user_ns,
+ le32_to_cpu(le_val));
+ } break;
+ case 'Q':{
+ struct p9_qid *qid =
+ va_arg(ap, struct p9_qid *);
+
+ errcode = p9pdu_readf(pdu, proto_version, "bdq",
+ &qid->type, &qid->version,
+ &qid->path);
+ }
+ break;
+ case 'S':{
+ struct p9_wstat *stbuf =
+ va_arg(ap, struct p9_wstat *);
+
+ memset(stbuf, 0, sizeof(struct p9_wstat));
+ stbuf->n_uid = stbuf->n_muid = INVALID_UID;
+ stbuf->n_gid = INVALID_GID;
+
+ errcode =
+ p9pdu_readf(pdu, proto_version,
+ "wwdQdddqssss?sugu",
+ &stbuf->size, &stbuf->type,
+ &stbuf->dev, &stbuf->qid,
+ &stbuf->mode, &stbuf->atime,
+ &stbuf->mtime, &stbuf->length,
+ &stbuf->name, &stbuf->uid,
+ &stbuf->gid, &stbuf->muid,
+ &stbuf->extension,
+ &stbuf->n_uid, &stbuf->n_gid,
+ &stbuf->n_muid);
+ if (errcode)
+ p9stat_free(stbuf);
+ }
+ break;
+ case 'D':{
+ uint32_t *count = va_arg(ap, uint32_t *);
+ void **data = va_arg(ap, void **);
+
+ errcode =
+ p9pdu_readf(pdu, proto_version, "d", count);
+ if (!errcode) {
+ *count =
+ min_t(uint32_t, *count,
+ pdu->size - pdu->offset);
+ *data = &pdu->sdata[pdu->offset];
+ }
+ }
+ break;
+ case 'T':{
+ uint16_t *nwname = va_arg(ap, uint16_t *);
+ char ***wnames = va_arg(ap, char ***);
+
+ *wnames = NULL;
+
+ errcode = p9pdu_readf(pdu, proto_version,
+ "w", nwname);
+ if (!errcode) {
+ *wnames =
+ kmalloc_array(*nwname,
+ sizeof(char *),
+ GFP_NOFS);
+ if (!*wnames)
+ errcode = -ENOMEM;
+ else
+ (*wnames)[0] = NULL;
+ }
+
+ if (!errcode) {
+ int i;
+
+ for (i = 0; i < *nwname; i++) {
+ errcode =
+ p9pdu_readf(pdu,
+ proto_version,
+ "s",
+ &(*wnames)[i]);
+ if (errcode) {
+ (*wnames)[i] = NULL;
+ break;
+ }
+ }
+ }
+
+ if (errcode) {
+ if (*wnames) {
+ int i;
+
+ for (i = 0; i < *nwname; i++) {
+ if (!(*wnames)[i])
+ break;
+ kfree((*wnames)[i]);
+ }
+ kfree(*wnames);
+ *wnames = NULL;
+ }
+ }
+ }
+ break;
+ case 'R':{
+ uint16_t *nwqid = va_arg(ap, uint16_t *);
+ struct p9_qid **wqids =
+ va_arg(ap, struct p9_qid **);
+
+ *wqids = NULL;
+
+ errcode =
+ p9pdu_readf(pdu, proto_version, "w", nwqid);
+ if (!errcode) {
+ *wqids =
+ kmalloc_array(*nwqid,
+ sizeof(struct p9_qid),
+ GFP_NOFS);
+ if (*wqids == NULL)
+ errcode = -ENOMEM;
+ }
+
+ if (!errcode) {
+ int i;
+
+ for (i = 0; i < *nwqid; i++) {
+ errcode =
+ p9pdu_readf(pdu,
+ proto_version,
+ "Q",
+ &(*wqids)[i]);
+ if (errcode)
+ break;
+ }
+ }
+
+ if (errcode) {
+ kfree(*wqids);
+ *wqids = NULL;
+ }
+ }
+ break;
+ case 'A': {
+ struct p9_stat_dotl *stbuf =
+ va_arg(ap, struct p9_stat_dotl *);
+
+ memset(stbuf, 0, sizeof(struct p9_stat_dotl));
+ errcode =
+ p9pdu_readf(pdu, proto_version,
+ "qQdugqqqqqqqqqqqqqqq",
+ &stbuf->st_result_mask,
+ &stbuf->qid,
+ &stbuf->st_mode,
+ &stbuf->st_uid, &stbuf->st_gid,
+ &stbuf->st_nlink,
+ &stbuf->st_rdev, &stbuf->st_size,
+ &stbuf->st_blksize, &stbuf->st_blocks,
+ &stbuf->st_atime_sec,
+ &stbuf->st_atime_nsec,
+ &stbuf->st_mtime_sec,
+ &stbuf->st_mtime_nsec,
+ &stbuf->st_ctime_sec,
+ &stbuf->st_ctime_nsec,
+ &stbuf->st_btime_sec,
+ &stbuf->st_btime_nsec,
+ &stbuf->st_gen,
+ &stbuf->st_data_version);
+ }
+ break;
+ case '?':
+ if ((proto_version != p9_proto_2000u) &&
+ (proto_version != p9_proto_2000L))
+ return 0;
+ break;
+ default:
+ BUG();
+ break;
+ }
+
+ if (errcode)
+ break;
+ }
+
+ return errcode;
+}
+
+int
+p9pdu_vwritef(struct p9_fcall *pdu, int proto_version, const char *fmt,
+ va_list ap)
+{
+ const char *ptr;
+ int errcode = 0;
+
+ for (ptr = fmt; *ptr; ptr++) {
+ switch (*ptr) {
+ case 'b':{
+ int8_t val = va_arg(ap, int);
+ if (pdu_write(pdu, &val, sizeof(val)))
+ errcode = -EFAULT;
+ }
+ break;
+ case 'w':{
+ __le16 val = cpu_to_le16(va_arg(ap, int));
+ if (pdu_write(pdu, &val, sizeof(val)))
+ errcode = -EFAULT;
+ }
+ break;
+ case 'd':{
+ __le32 val = cpu_to_le32(va_arg(ap, int32_t));
+ if (pdu_write(pdu, &val, sizeof(val)))
+ errcode = -EFAULT;
+ }
+ break;
+ case 'q':{
+ __le64 val = cpu_to_le64(va_arg(ap, int64_t));
+ if (pdu_write(pdu, &val, sizeof(val)))
+ errcode = -EFAULT;
+ }
+ break;
+ case 's':{
+ const char *sptr = va_arg(ap, const char *);
+ uint16_t len = 0;
+ if (sptr)
+ len = min_t(size_t, strlen(sptr),
+ USHRT_MAX);
+
+ errcode = p9pdu_writef(pdu, proto_version,
+ "w", len);
+ if (!errcode && pdu_write(pdu, sptr, len))
+ errcode = -EFAULT;
+ }
+ break;
+ case 'u': {
+ kuid_t uid = va_arg(ap, kuid_t);
+ __le32 val = cpu_to_le32(
+ from_kuid(&init_user_ns, uid));
+ if (pdu_write(pdu, &val, sizeof(val)))
+ errcode = -EFAULT;
+ } break;
+ case 'g': {
+ kgid_t gid = va_arg(ap, kgid_t);
+ __le32 val = cpu_to_le32(
+ from_kgid(&init_user_ns, gid));
+ if (pdu_write(pdu, &val, sizeof(val)))
+ errcode = -EFAULT;
+ } break;
+ case 'Q':{
+ const struct p9_qid *qid =
+ va_arg(ap, const struct p9_qid *);
+ errcode =
+ p9pdu_writef(pdu, proto_version, "bdq",
+ qid->type, qid->version,
+ qid->path);
+ } break;
+ case 'S':{
+ const struct p9_wstat *stbuf =
+ va_arg(ap, const struct p9_wstat *);
+ errcode =
+ p9pdu_writef(pdu, proto_version,
+ "wwdQdddqssss?sugu",
+ stbuf->size, stbuf->type,
+ stbuf->dev, &stbuf->qid,
+ stbuf->mode, stbuf->atime,
+ stbuf->mtime, stbuf->length,
+ stbuf->name, stbuf->uid,
+ stbuf->gid, stbuf->muid,
+ stbuf->extension, stbuf->n_uid,
+ stbuf->n_gid, stbuf->n_muid);
+ } break;
+ case 'V':{
+ uint32_t count = va_arg(ap, uint32_t);
+ struct iov_iter *from =
+ va_arg(ap, struct iov_iter *);
+ errcode = p9pdu_writef(pdu, proto_version, "d",
+ count);
+ if (!errcode && pdu_write_u(pdu, from, count))
+ errcode = -EFAULT;
+ }
+ break;
+ case 'T':{
+ uint16_t nwname = va_arg(ap, int);
+ const char **wnames = va_arg(ap, const char **);
+
+ errcode = p9pdu_writef(pdu, proto_version, "w",
+ nwname);
+ if (!errcode) {
+ int i;
+
+ for (i = 0; i < nwname; i++) {
+ errcode =
+ p9pdu_writef(pdu,
+ proto_version,
+ "s",
+ wnames[i]);
+ if (errcode)
+ break;
+ }
+ }
+ }
+ break;
+ case 'R':{
+ uint16_t nwqid = va_arg(ap, int);
+ struct p9_qid *wqids =
+ va_arg(ap, struct p9_qid *);
+
+ errcode = p9pdu_writef(pdu, proto_version, "w",
+ nwqid);
+ if (!errcode) {
+ int i;
+
+ for (i = 0; i < nwqid; i++) {
+ errcode =
+ p9pdu_writef(pdu,
+ proto_version,
+ "Q",
+ &wqids[i]);
+ if (errcode)
+ break;
+ }
+ }
+ }
+ break;
+ case 'I':{
+ struct p9_iattr_dotl *p9attr = va_arg(ap,
+ struct p9_iattr_dotl *);
+
+ errcode = p9pdu_writef(pdu, proto_version,
+ "ddugqqqqq",
+ p9attr->valid,
+ p9attr->mode,
+ p9attr->uid,
+ p9attr->gid,
+ p9attr->size,
+ p9attr->atime_sec,
+ p9attr->atime_nsec,
+ p9attr->mtime_sec,
+ p9attr->mtime_nsec);
+ }
+ break;
+ case '?':
+ if ((proto_version != p9_proto_2000u) &&
+ (proto_version != p9_proto_2000L))
+ return 0;
+ break;
+ default:
+ BUG();
+ break;
+ }
+
+ if (errcode)
+ break;
+ }
+
+ return errcode;
+}
+
+int p9pdu_readf(struct p9_fcall *pdu, int proto_version, const char *fmt, ...)
+{
+ va_list ap;
+ int ret;
+
+ va_start(ap, fmt);
+ ret = p9pdu_vreadf(pdu, proto_version, fmt, ap);
+ va_end(ap);
+
+ return ret;
+}
+
+static int
+p9pdu_writef(struct p9_fcall *pdu, int proto_version, const char *fmt, ...)
+{
+ va_list ap;
+ int ret;
+
+ va_start(ap, fmt);
+ ret = p9pdu_vwritef(pdu, proto_version, fmt, ap);
+ va_end(ap);
+
+ return ret;
+}
+
+int p9stat_read(struct p9_client *clnt, char *buf, int len, struct p9_wstat *st)
+{
+ struct p9_fcall fake_pdu;
+ int ret;
+
+ fake_pdu.size = len;
+ fake_pdu.capacity = len;
+ fake_pdu.sdata = buf;
+ fake_pdu.offset = 0;
+
+ ret = p9pdu_readf(&fake_pdu, clnt->proto_version, "S", st);
+ if (ret) {
+ p9_debug(P9_DEBUG_9P, "<<< p9stat_read failed: %d\n", ret);
+ trace_9p_protocol_dump(clnt, &fake_pdu);
+ return ret;
+ }
+
+ return fake_pdu.offset;
+}
+EXPORT_SYMBOL(p9stat_read);
+
+int p9pdu_prepare(struct p9_fcall *pdu, int16_t tag, int8_t type)
+{
+ pdu->id = type;
+ return p9pdu_writef(pdu, 0, "dbw", 0, type, tag);
+}
+
+int p9pdu_finalize(struct p9_client *clnt, struct p9_fcall *pdu)
+{
+ int size = pdu->size;
+ int err;
+
+ pdu->size = 0;
+ err = p9pdu_writef(pdu, 0, "d", size);
+ pdu->size = size;
+
+ trace_9p_protocol_dump(clnt, pdu);
+ p9_debug(P9_DEBUG_9P, ">>> size=%d type: %d tag: %d\n",
+ pdu->size, pdu->id, pdu->tag);
+
+ return err;
+}
+
+void p9pdu_reset(struct p9_fcall *pdu)
+{
+ pdu->offset = 0;
+ pdu->size = 0;
+}
+
+int p9dirent_read(struct p9_client *clnt, char *buf, int len,
+ struct p9_dirent *dirent)
+{
+ struct p9_fcall fake_pdu;
+ int ret;
+ char *nameptr;
+
+ fake_pdu.size = len;
+ fake_pdu.capacity = len;
+ fake_pdu.sdata = buf;
+ fake_pdu.offset = 0;
+
+ ret = p9pdu_readf(&fake_pdu, clnt->proto_version, "Qqbs", &dirent->qid,
+ &dirent->d_off, &dirent->d_type, &nameptr);
+ if (ret) {
+ p9_debug(P9_DEBUG_9P, "<<< p9dirent_read failed: %d\n", ret);
+ trace_9p_protocol_dump(clnt, &fake_pdu);
+ return ret;
+ }
+
+ ret = strscpy(dirent->d_name, nameptr, sizeof(dirent->d_name));
+ if (ret < 0) {
+ p9_debug(P9_DEBUG_ERROR,
+ "On the wire dirent name too long: %s\n",
+ nameptr);
+ kfree(nameptr);
+ return ret;
+ }
+ kfree(nameptr);
+
+ return fake_pdu.offset;
+}
+EXPORT_SYMBOL(p9dirent_read);
diff --git a/net/9p/protocol.h b/net/9p/protocol.h
new file mode 100644
index 000000000000..ad2283d1f96b
--- /dev/null
+++ b/net/9p/protocol.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * 9P Protocol Support Code
+ *
+ * Copyright (C) 2008 by Eric Van Hensbergen <ericvh@gmail.com>
+ *
+ * Base on code from Anthony Liguori <aliguori@us.ibm.com>
+ * Copyright (C) 2008 by IBM, Corp.
+ */
+
+size_t p9_msg_buf_size(struct p9_client *c, enum p9_msg_t type,
+ const char *fmt, va_list ap);
+int p9pdu_vwritef(struct p9_fcall *pdu, int proto_version, const char *fmt,
+ va_list ap);
+int p9pdu_readf(struct p9_fcall *pdu, int proto_version, const char *fmt, ...);
+int p9pdu_prepare(struct p9_fcall *pdu, int16_t tag, int8_t type);
+int p9pdu_finalize(struct p9_client *clnt, struct p9_fcall *pdu);
+void p9pdu_reset(struct p9_fcall *pdu);
+size_t pdu_read(struct p9_fcall *pdu, void *data, size_t size);
diff --git a/net/9p/trans_common.c b/net/9p/trans_common.c
new file mode 100644
index 000000000000..c827f694551c
--- /dev/null
+++ b/net/9p/trans_common.c
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: LGPL-2.1
+/*
+ * Copyright IBM Corporation, 2010
+ * Author Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include "trans_common.h"
+
+/**
+ * p9_release_pages - Release pages after the transaction.
+ * @pages: array of pages to be put
+ * @nr_pages: size of array
+ */
+void p9_release_pages(struct page **pages, int nr_pages)
+{
+ int i;
+
+ for (i = 0; i < nr_pages; i++)
+ if (pages[i])
+ put_page(pages[i]);
+}
+EXPORT_SYMBOL(p9_release_pages);
diff --git a/net/9p/trans_common.h b/net/9p/trans_common.h
new file mode 100644
index 000000000000..32134db6abf3
--- /dev/null
+++ b/net/9p/trans_common.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: LGPL-2.1 */
+/*
+ * Copyright IBM Corporation, 2010
+ * Author Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
+ */
+
+void p9_release_pages(struct page **pages, int nr_pages);
diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c
new file mode 100644
index 000000000000..0e331c1b2112
--- /dev/null
+++ b/net/9p/trans_fd.c
@@ -0,0 +1,1096 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Fd transport layer. Includes deprecated socket layer.
+ *
+ * Copyright (C) 2006 by Russ Cox <rsc@swtch.com>
+ * Copyright (C) 2004-2005 by Latchesar Ionkov <lucho@ionkov.net>
+ * Copyright (C) 2004-2008 by Eric Van Hensbergen <ericvh@gmail.com>
+ * Copyright (C) 1997-2002 by Ron Minnich <rminnich@sarnoff.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/ipv6.h>
+#include <linux/kthread.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/un.h>
+#include <linux/uaccess.h>
+#include <linux/inet.h>
+#include <linux/file.h>
+#include <linux/fs_context.h>
+#include <linux/slab.h>
+#include <linux/seq_file.h>
+#include <net/9p/9p.h>
+#include <net/9p/client.h>
+#include <net/9p/transport.h>
+
+#include <linux/syscalls.h> /* killme */
+
+#define MAX_SOCK_BUF (1024*1024)
+#define MAXPOLLWADDR 2
+
+static struct p9_trans_module p9_tcp_trans;
+static struct p9_trans_module p9_fd_trans;
+
+enum {
+ Rworksched = 1, /* read work scheduled or running */
+ Rpending = 2, /* can read */
+ Wworksched = 4, /* write work scheduled or running */
+ Wpending = 8, /* can write */
+};
+
+struct p9_poll_wait {
+ struct p9_conn *conn;
+ wait_queue_entry_t wait;
+ wait_queue_head_t *wait_addr;
+};
+
+/**
+ * struct p9_conn - fd mux connection state information
+ * @mux_list: list link for mux to manage multiple connections (?)
+ * @client: reference to client instance for this connection
+ * @err: error state
+ * @req_lock: lock protecting req_list and requests statuses
+ * @req_list: accounting for requests which have been sent
+ * @unsent_req_list: accounting for requests that haven't been sent
+ * @rreq: read request
+ * @wreq: write request
+ * @tmp_buf: temporary buffer to read in header
+ * @rc: temporary fcall for reading current frame
+ * @wpos: write position for current frame
+ * @wsize: amount of data to write for current frame
+ * @wbuf: current write buffer
+ * @poll_pending_link: pending links to be polled per conn
+ * @poll_wait: array of wait_q's for various worker threads
+ * @pt: poll state
+ * @rq: current read work
+ * @wq: current write work
+ * @wsched: ????
+ *
+ */
+
+struct p9_conn {
+ struct list_head mux_list;
+ struct p9_client *client;
+ int err;
+ spinlock_t req_lock;
+ struct list_head req_list;
+ struct list_head unsent_req_list;
+ struct p9_req_t *rreq;
+ struct p9_req_t *wreq;
+ char tmp_buf[P9_HDRSZ];
+ struct p9_fcall rc;
+ int wpos;
+ int wsize;
+ char *wbuf;
+ struct list_head poll_pending_link;
+ struct p9_poll_wait poll_wait[MAXPOLLWADDR];
+ poll_table pt;
+ struct work_struct rq;
+ struct work_struct wq;
+ unsigned long wsched;
+};
+
+/**
+ * struct p9_trans_fd - transport state
+ * @rd: reference to file to read from
+ * @wr: reference of file to write to
+ * @conn: connection state reference
+ *
+ */
+
+struct p9_trans_fd {
+ struct file *rd;
+ struct file *wr;
+ struct p9_conn conn;
+};
+
+static void p9_poll_workfn(struct work_struct *work);
+
+static DEFINE_SPINLOCK(p9_poll_lock);
+static LIST_HEAD(p9_poll_pending_list);
+static DECLARE_WORK(p9_poll_work, p9_poll_workfn);
+
+static unsigned int p9_ipport_resv_min = P9_DEF_MIN_RESVPORT;
+static unsigned int p9_ipport_resv_max = P9_DEF_MAX_RESVPORT;
+
+static void p9_mux_poll_stop(struct p9_conn *m)
+{
+ unsigned long flags;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(m->poll_wait); i++) {
+ struct p9_poll_wait *pwait = &m->poll_wait[i];
+
+ if (pwait->wait_addr) {
+ remove_wait_queue(pwait->wait_addr, &pwait->wait);
+ pwait->wait_addr = NULL;
+ }
+ }
+
+ spin_lock_irqsave(&p9_poll_lock, flags);
+ list_del_init(&m->poll_pending_link);
+ spin_unlock_irqrestore(&p9_poll_lock, flags);
+
+ flush_work(&p9_poll_work);
+}
+
+/**
+ * p9_conn_cancel - cancel all pending requests with error
+ * @m: mux data
+ * @err: error code
+ *
+ */
+
+static void p9_conn_cancel(struct p9_conn *m, int err)
+{
+ struct p9_req_t *req, *rtmp;
+ LIST_HEAD(cancel_list);
+
+ p9_debug(P9_DEBUG_ERROR, "mux %p err %d\n", m, err);
+
+ spin_lock(&m->req_lock);
+
+ if (READ_ONCE(m->err)) {
+ spin_unlock(&m->req_lock);
+ return;
+ }
+
+ WRITE_ONCE(m->err, err);
+ ASSERT_EXCLUSIVE_WRITER(m->err);
+
+ list_for_each_entry_safe(req, rtmp, &m->req_list, req_list) {
+ list_move(&req->req_list, &cancel_list);
+ WRITE_ONCE(req->status, REQ_STATUS_ERROR);
+ }
+ list_for_each_entry_safe(req, rtmp, &m->unsent_req_list, req_list) {
+ list_move(&req->req_list, &cancel_list);
+ WRITE_ONCE(req->status, REQ_STATUS_ERROR);
+ }
+
+ spin_unlock(&m->req_lock);
+
+ list_for_each_entry_safe(req, rtmp, &cancel_list, req_list) {
+ p9_debug(P9_DEBUG_ERROR, "call back req %p\n", req);
+ list_del(&req->req_list);
+ if (!req->t_err)
+ req->t_err = err;
+ p9_client_cb(m->client, req, REQ_STATUS_ERROR);
+ }
+}
+
+static __poll_t
+p9_fd_poll(struct p9_client *client, struct poll_table_struct *pt, int *err)
+{
+ __poll_t ret;
+ struct p9_trans_fd *ts = NULL;
+
+ if (client && client->status == Connected)
+ ts = client->trans;
+
+ if (!ts) {
+ if (err)
+ *err = -EREMOTEIO;
+ return EPOLLERR;
+ }
+
+ ret = vfs_poll(ts->rd, pt);
+ if (ts->rd != ts->wr)
+ ret = (ret & ~EPOLLOUT) | (vfs_poll(ts->wr, pt) & ~EPOLLIN);
+ return ret;
+}
+
+/**
+ * p9_fd_read- read from a fd
+ * @client: client instance
+ * @v: buffer to receive data into
+ * @len: size of receive buffer
+ *
+ */
+
+static int p9_fd_read(struct p9_client *client, void *v, int len)
+{
+ int ret;
+ struct p9_trans_fd *ts = NULL;
+ loff_t pos;
+
+ if (client && client->status != Disconnected)
+ ts = client->trans;
+
+ if (!ts)
+ return -EREMOTEIO;
+
+ if (!(ts->rd->f_flags & O_NONBLOCK))
+ p9_debug(P9_DEBUG_ERROR, "blocking read ...\n");
+
+ pos = ts->rd->f_pos;
+ ret = kernel_read(ts->rd, v, len, &pos);
+ if (ret <= 0 && ret != -ERESTARTSYS && ret != -EAGAIN)
+ client->status = Disconnected;
+ return ret;
+}
+
+/**
+ * p9_read_work - called when there is some data to be read from a transport
+ * @work: container of work to be done
+ *
+ */
+
+static void p9_read_work(struct work_struct *work)
+{
+ __poll_t n;
+ int err;
+ struct p9_conn *m;
+
+ m = container_of(work, struct p9_conn, rq);
+
+ if (READ_ONCE(m->err) < 0)
+ return;
+
+ p9_debug(P9_DEBUG_TRANS, "start mux %p pos %zd\n", m, m->rc.offset);
+
+ if (!m->rc.sdata) {
+ m->rc.sdata = m->tmp_buf;
+ m->rc.offset = 0;
+ m->rc.capacity = P9_HDRSZ; /* start by reading header */
+ }
+
+ clear_bit(Rpending, &m->wsched);
+ p9_debug(P9_DEBUG_TRANS, "read mux %p pos %zd size: %zd = %zd\n",
+ m, m->rc.offset, m->rc.capacity,
+ m->rc.capacity - m->rc.offset);
+ err = p9_fd_read(m->client, m->rc.sdata + m->rc.offset,
+ m->rc.capacity - m->rc.offset);
+ p9_debug(P9_DEBUG_TRANS, "mux %p got %d bytes\n", m, err);
+ if (err == -EAGAIN)
+ goto end_clear;
+
+ if (err <= 0)
+ goto error;
+
+ m->rc.offset += err;
+
+ /* header read in */
+ if ((!m->rreq) && (m->rc.offset == m->rc.capacity)) {
+ p9_debug(P9_DEBUG_TRANS, "got new header\n");
+
+ /* Header size */
+ m->rc.size = P9_HDRSZ;
+ err = p9_parse_header(&m->rc, &m->rc.size, NULL, NULL, 0);
+ if (err) {
+ p9_debug(P9_DEBUG_ERROR,
+ "error parsing header: %d\n", err);
+ goto error;
+ }
+
+ p9_debug(P9_DEBUG_TRANS,
+ "mux %p pkt: size: %d bytes tag: %d\n",
+ m, m->rc.size, m->rc.tag);
+
+ m->rreq = p9_tag_lookup(m->client, m->rc.tag);
+ if (!m->rreq || (m->rreq->status != REQ_STATUS_SENT)) {
+ p9_debug(P9_DEBUG_ERROR, "Unexpected packet tag %d\n",
+ m->rc.tag);
+ err = -EIO;
+ goto error;
+ }
+
+ if (m->rc.size > m->rreq->rc.capacity) {
+ p9_debug(P9_DEBUG_ERROR,
+ "requested packet size too big: %d for tag %d with capacity %zd\n",
+ m->rc.size, m->rc.tag, m->rreq->rc.capacity);
+ err = -EIO;
+ goto error;
+ }
+
+ if (!m->rreq->rc.sdata) {
+ p9_debug(P9_DEBUG_ERROR,
+ "No recv fcall for tag %d (req %p), disconnecting!\n",
+ m->rc.tag, m->rreq);
+ p9_req_put(m->client, m->rreq);
+ m->rreq = NULL;
+ err = -EIO;
+ goto error;
+ }
+ m->rc.sdata = m->rreq->rc.sdata;
+ memcpy(m->rc.sdata, m->tmp_buf, m->rc.capacity);
+ m->rc.capacity = m->rc.size;
+ }
+
+ /* packet is read in
+ * not an else because some packets (like clunk) have no payload
+ */
+ if ((m->rreq) && (m->rc.offset == m->rc.capacity)) {
+ p9_debug(P9_DEBUG_TRANS, "got new packet\n");
+ m->rreq->rc.size = m->rc.offset;
+ spin_lock(&m->req_lock);
+ if (m->rreq->status == REQ_STATUS_SENT) {
+ list_del(&m->rreq->req_list);
+ p9_client_cb(m->client, m->rreq, REQ_STATUS_RCVD);
+ } else if (m->rreq->status == REQ_STATUS_FLSHD) {
+ /* Ignore replies associated with a cancelled request. */
+ p9_debug(P9_DEBUG_TRANS,
+ "Ignore replies associated with a cancelled request\n");
+ } else {
+ spin_unlock(&m->req_lock);
+ p9_debug(P9_DEBUG_ERROR,
+ "Request tag %d errored out while we were reading the reply\n",
+ m->rc.tag);
+ err = -EIO;
+ goto error;
+ }
+ spin_unlock(&m->req_lock);
+ m->rc.sdata = NULL;
+ m->rc.offset = 0;
+ m->rc.capacity = 0;
+ p9_req_put(m->client, m->rreq);
+ m->rreq = NULL;
+ }
+
+end_clear:
+ clear_bit(Rworksched, &m->wsched);
+
+ if (!list_empty(&m->req_list)) {
+ if (test_and_clear_bit(Rpending, &m->wsched))
+ n = EPOLLIN;
+ else
+ n = p9_fd_poll(m->client, NULL, NULL);
+
+ if ((n & EPOLLIN) && !test_and_set_bit(Rworksched, &m->wsched)) {
+ p9_debug(P9_DEBUG_TRANS, "sched read work %p\n", m);
+ schedule_work(&m->rq);
+ }
+ }
+
+ return;
+error:
+ p9_conn_cancel(m, err);
+ clear_bit(Rworksched, &m->wsched);
+}
+
+/**
+ * p9_fd_write - write to a socket
+ * @client: client instance
+ * @v: buffer to send data from
+ * @len: size of send buffer
+ *
+ */
+
+static int p9_fd_write(struct p9_client *client, void *v, int len)
+{
+ ssize_t ret;
+ struct p9_trans_fd *ts = NULL;
+
+ if (client && client->status != Disconnected)
+ ts = client->trans;
+
+ if (!ts)
+ return -EREMOTEIO;
+
+ if (!(ts->wr->f_flags & O_NONBLOCK))
+ p9_debug(P9_DEBUG_ERROR, "blocking write ...\n");
+
+ ret = kernel_write(ts->wr, v, len, &ts->wr->f_pos);
+ if (ret <= 0 && ret != -ERESTARTSYS && ret != -EAGAIN)
+ client->status = Disconnected;
+ return ret;
+}
+
+/**
+ * p9_write_work - called when a transport can send some data
+ * @work: container for work to be done
+ *
+ */
+
+static void p9_write_work(struct work_struct *work)
+{
+ __poll_t n;
+ int err;
+ struct p9_conn *m;
+ struct p9_req_t *req;
+
+ m = container_of(work, struct p9_conn, wq);
+
+ if (READ_ONCE(m->err) < 0) {
+ clear_bit(Wworksched, &m->wsched);
+ return;
+ }
+
+ if (!m->wsize) {
+ spin_lock(&m->req_lock);
+ if (list_empty(&m->unsent_req_list)) {
+ clear_bit(Wworksched, &m->wsched);
+ spin_unlock(&m->req_lock);
+ return;
+ }
+
+ req = list_entry(m->unsent_req_list.next, struct p9_req_t,
+ req_list);
+ WRITE_ONCE(req->status, REQ_STATUS_SENT);
+ p9_debug(P9_DEBUG_TRANS, "move req %p\n", req);
+ list_move_tail(&req->req_list, &m->req_list);
+
+ m->wbuf = req->tc.sdata;
+ m->wsize = req->tc.size;
+ m->wpos = 0;
+ p9_req_get(req);
+ m->wreq = req;
+ spin_unlock(&m->req_lock);
+ }
+
+ p9_debug(P9_DEBUG_TRANS, "mux %p pos %d size %d\n",
+ m, m->wpos, m->wsize);
+ clear_bit(Wpending, &m->wsched);
+ err = p9_fd_write(m->client, m->wbuf + m->wpos, m->wsize - m->wpos);
+ p9_debug(P9_DEBUG_TRANS, "mux %p sent %d bytes\n", m, err);
+ if (err == -EAGAIN)
+ goto end_clear;
+
+
+ if (err < 0)
+ goto error;
+ else if (err == 0) {
+ err = -EREMOTEIO;
+ goto error;
+ }
+
+ m->wpos += err;
+ if (m->wpos == m->wsize) {
+ m->wpos = m->wsize = 0;
+ p9_req_put(m->client, m->wreq);
+ m->wreq = NULL;
+ }
+
+end_clear:
+ clear_bit(Wworksched, &m->wsched);
+
+ if (m->wsize || !list_empty(&m->unsent_req_list)) {
+ if (test_and_clear_bit(Wpending, &m->wsched))
+ n = EPOLLOUT;
+ else
+ n = p9_fd_poll(m->client, NULL, NULL);
+
+ if ((n & EPOLLOUT) &&
+ !test_and_set_bit(Wworksched, &m->wsched)) {
+ p9_debug(P9_DEBUG_TRANS, "sched write work %p\n", m);
+ schedule_work(&m->wq);
+ }
+ }
+
+ return;
+
+error:
+ p9_conn_cancel(m, err);
+ clear_bit(Wworksched, &m->wsched);
+}
+
+static int p9_pollwake(wait_queue_entry_t *wait, unsigned int mode, int sync, void *key)
+{
+ struct p9_poll_wait *pwait =
+ container_of(wait, struct p9_poll_wait, wait);
+ struct p9_conn *m = pwait->conn;
+ unsigned long flags;
+
+ spin_lock_irqsave(&p9_poll_lock, flags);
+ if (list_empty(&m->poll_pending_link))
+ list_add_tail(&m->poll_pending_link, &p9_poll_pending_list);
+ spin_unlock_irqrestore(&p9_poll_lock, flags);
+
+ schedule_work(&p9_poll_work);
+ return 1;
+}
+
+/**
+ * p9_pollwait - add poll task to the wait queue
+ * @filp: file pointer being polled
+ * @wait_address: wait_q to block on
+ * @p: poll state
+ *
+ * called by files poll operation to add v9fs-poll task to files wait queue
+ */
+
+static void
+p9_pollwait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p)
+{
+ struct p9_conn *m = container_of(p, struct p9_conn, pt);
+ struct p9_poll_wait *pwait = NULL;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(m->poll_wait); i++) {
+ if (m->poll_wait[i].wait_addr == NULL) {
+ pwait = &m->poll_wait[i];
+ break;
+ }
+ }
+
+ if (!pwait) {
+ p9_debug(P9_DEBUG_ERROR, "not enough wait_address slots\n");
+ return;
+ }
+
+ pwait->conn = m;
+ pwait->wait_addr = wait_address;
+ init_waitqueue_func_entry(&pwait->wait, p9_pollwake);
+ add_wait_queue(wait_address, &pwait->wait);
+}
+
+/**
+ * p9_conn_create - initialize the per-session mux data
+ * @client: client instance
+ *
+ * Note: Creates the polling task if this is the first session.
+ */
+
+static void p9_conn_create(struct p9_client *client)
+{
+ __poll_t n;
+ struct p9_trans_fd *ts = client->trans;
+ struct p9_conn *m = &ts->conn;
+
+ p9_debug(P9_DEBUG_TRANS, "client %p msize %d\n", client, client->msize);
+
+ INIT_LIST_HEAD(&m->mux_list);
+ m->client = client;
+
+ spin_lock_init(&m->req_lock);
+ INIT_LIST_HEAD(&m->req_list);
+ INIT_LIST_HEAD(&m->unsent_req_list);
+ INIT_WORK(&m->rq, p9_read_work);
+ INIT_WORK(&m->wq, p9_write_work);
+ INIT_LIST_HEAD(&m->poll_pending_link);
+ init_poll_funcptr(&m->pt, p9_pollwait);
+
+ n = p9_fd_poll(client, &m->pt, NULL);
+ if (n & EPOLLIN) {
+ p9_debug(P9_DEBUG_TRANS, "mux %p can read\n", m);
+ set_bit(Rpending, &m->wsched);
+ }
+
+ if (n & EPOLLOUT) {
+ p9_debug(P9_DEBUG_TRANS, "mux %p can write\n", m);
+ set_bit(Wpending, &m->wsched);
+ }
+}
+
+/**
+ * p9_poll_mux - polls a mux and schedules read or write works if necessary
+ * @m: connection to poll
+ *
+ */
+
+static void p9_poll_mux(struct p9_conn *m)
+{
+ __poll_t n;
+ int err = -ECONNRESET;
+
+ if (READ_ONCE(m->err) < 0)
+ return;
+
+ n = p9_fd_poll(m->client, NULL, &err);
+ if (n & (EPOLLERR | EPOLLHUP | EPOLLNVAL)) {
+ p9_debug(P9_DEBUG_TRANS, "error mux %p err %d\n", m, n);
+ p9_conn_cancel(m, err);
+ }
+
+ if (n & EPOLLIN) {
+ set_bit(Rpending, &m->wsched);
+ p9_debug(P9_DEBUG_TRANS, "mux %p can read\n", m);
+ if (!test_and_set_bit(Rworksched, &m->wsched)) {
+ p9_debug(P9_DEBUG_TRANS, "sched read work %p\n", m);
+ schedule_work(&m->rq);
+ }
+ }
+
+ if (n & EPOLLOUT) {
+ set_bit(Wpending, &m->wsched);
+ p9_debug(P9_DEBUG_TRANS, "mux %p can write\n", m);
+ if ((m->wsize || !list_empty(&m->unsent_req_list)) &&
+ !test_and_set_bit(Wworksched, &m->wsched)) {
+ p9_debug(P9_DEBUG_TRANS, "sched write work %p\n", m);
+ schedule_work(&m->wq);
+ }
+ }
+}
+
+/**
+ * p9_fd_request - send 9P request
+ * The function can sleep until the request is scheduled for sending.
+ * The function can be interrupted. Return from the function is not
+ * a guarantee that the request is sent successfully.
+ *
+ * @client: client instance
+ * @req: request to be sent
+ *
+ */
+
+static int p9_fd_request(struct p9_client *client, struct p9_req_t *req)
+{
+ int err;
+ struct p9_trans_fd *ts = client->trans;
+ struct p9_conn *m = &ts->conn;
+
+ p9_debug(P9_DEBUG_TRANS, "mux %p task %p tcall %p id %d\n",
+ m, current, &req->tc, req->tc.id);
+
+ spin_lock(&m->req_lock);
+
+ err = READ_ONCE(m->err);
+ if (err < 0) {
+ spin_unlock(&m->req_lock);
+ return err;
+ }
+
+ WRITE_ONCE(req->status, REQ_STATUS_UNSENT);
+ list_add_tail(&req->req_list, &m->unsent_req_list);
+ spin_unlock(&m->req_lock);
+
+ p9_poll_mux(m);
+
+ return 0;
+}
+
+static int p9_fd_cancel(struct p9_client *client, struct p9_req_t *req)
+{
+ struct p9_trans_fd *ts = client->trans;
+ struct p9_conn *m = &ts->conn;
+ int ret = 1;
+
+ p9_debug(P9_DEBUG_TRANS, "client %p req %p\n", client, req);
+
+ spin_lock(&m->req_lock);
+
+ if (req->status == REQ_STATUS_UNSENT) {
+ list_del(&req->req_list);
+ WRITE_ONCE(req->status, REQ_STATUS_FLSHD);
+ p9_req_put(client, req);
+ ret = 0;
+ }
+ spin_unlock(&m->req_lock);
+
+ return ret;
+}
+
+static int p9_fd_cancelled(struct p9_client *client, struct p9_req_t *req)
+{
+ struct p9_trans_fd *ts = client->trans;
+ struct p9_conn *m = &ts->conn;
+
+ p9_debug(P9_DEBUG_TRANS, "client %p req %p\n", client, req);
+
+ spin_lock(&m->req_lock);
+ /* Ignore cancelled request if status changed since the request was
+ * processed in p9_client_flush()
+ */
+ if (req->status != REQ_STATUS_SENT) {
+ spin_unlock(&m->req_lock);
+ return 0;
+ }
+
+ /* we haven't received a response for oldreq,
+ * remove it from the list.
+ */
+ list_del(&req->req_list);
+ WRITE_ONCE(req->status, REQ_STATUS_FLSHD);
+ spin_unlock(&m->req_lock);
+
+ p9_req_put(client, req);
+
+ return 0;
+}
+
+static int p9_fd_show_options(struct seq_file *m, struct p9_client *clnt)
+{
+ if (clnt->trans_mod == &p9_tcp_trans) {
+ if (clnt->trans_opts.tcp.port != P9_FD_PORT)
+ seq_printf(m, ",port=%u", clnt->trans_opts.tcp.port);
+ } else if (clnt->trans_mod == &p9_fd_trans) {
+ if (clnt->trans_opts.fd.rfd != ~0)
+ seq_printf(m, ",rfd=%u", clnt->trans_opts.fd.rfd);
+ if (clnt->trans_opts.fd.wfd != ~0)
+ seq_printf(m, ",wfd=%u", clnt->trans_opts.fd.wfd);
+ }
+ return 0;
+}
+
+static int p9_fd_open(struct p9_client *client, int rfd, int wfd)
+{
+ struct p9_trans_fd *ts = kzalloc(sizeof(struct p9_trans_fd),
+ GFP_KERNEL);
+ if (!ts)
+ return -ENOMEM;
+
+ ts->rd = fget(rfd);
+ if (!ts->rd)
+ goto out_free_ts;
+ if (!(ts->rd->f_mode & FMODE_READ))
+ goto out_put_rd;
+ /* Prevent workers from hanging on IO when fd is a pipe.
+ * It's technically possible for userspace or concurrent mounts to
+ * modify this flag concurrently, which will likely result in a
+ * broken filesystem. However, just having bad flags here should
+ * not crash the kernel or cause any other sort of bug, so mark this
+ * particular data race as intentional so that tooling (like KCSAN)
+ * can allow it and detect further problems.
+ */
+ data_race(ts->rd->f_flags |= O_NONBLOCK);
+ ts->wr = fget(wfd);
+ if (!ts->wr)
+ goto out_put_rd;
+ if (!(ts->wr->f_mode & FMODE_WRITE))
+ goto out_put_wr;
+ data_race(ts->wr->f_flags |= O_NONBLOCK);
+
+ client->trans = ts;
+ client->status = Connected;
+
+ return 0;
+
+out_put_wr:
+ fput(ts->wr);
+out_put_rd:
+ fput(ts->rd);
+out_free_ts:
+ kfree(ts);
+ return -EIO;
+}
+
+static int p9_socket_open(struct p9_client *client, struct socket *csocket)
+{
+ struct p9_trans_fd *p;
+ struct file *file;
+
+ p = kzalloc(sizeof(struct p9_trans_fd), GFP_KERNEL);
+ if (!p) {
+ sock_release(csocket);
+ return -ENOMEM;
+ }
+
+ csocket->sk->sk_allocation = GFP_NOIO;
+ csocket->sk->sk_use_task_frag = false;
+ file = sock_alloc_file(csocket, 0, NULL);
+ if (IS_ERR(file)) {
+ pr_err("%s (%d): failed to map fd\n",
+ __func__, task_pid_nr(current));
+ kfree(p);
+ return PTR_ERR(file);
+ }
+
+ get_file(file);
+ p->wr = p->rd = file;
+ client->trans = p;
+ client->status = Connected;
+
+ p->rd->f_flags |= O_NONBLOCK;
+
+ p9_conn_create(client);
+ return 0;
+}
+
+/**
+ * p9_conn_destroy - cancels all pending requests of mux
+ * @m: mux to destroy
+ *
+ */
+
+static void p9_conn_destroy(struct p9_conn *m)
+{
+ p9_debug(P9_DEBUG_TRANS, "mux %p prev %p next %p\n",
+ m, m->mux_list.prev, m->mux_list.next);
+
+ p9_mux_poll_stop(m);
+ cancel_work_sync(&m->rq);
+ if (m->rreq) {
+ p9_req_put(m->client, m->rreq);
+ m->rreq = NULL;
+ }
+ cancel_work_sync(&m->wq);
+ if (m->wreq) {
+ p9_req_put(m->client, m->wreq);
+ m->wreq = NULL;
+ }
+
+ p9_conn_cancel(m, -ECONNRESET);
+
+ m->client = NULL;
+}
+
+/**
+ * p9_fd_close - shutdown file descriptor transport
+ * @client: client instance
+ *
+ */
+
+static void p9_fd_close(struct p9_client *client)
+{
+ struct p9_trans_fd *ts;
+
+ if (!client)
+ return;
+
+ ts = client->trans;
+ if (!ts)
+ return;
+
+ client->status = Disconnected;
+
+ p9_conn_destroy(&ts->conn);
+
+ if (ts->rd)
+ fput(ts->rd);
+ if (ts->wr)
+ fput(ts->wr);
+
+ kfree(ts);
+}
+
+static int p9_bind_privport(struct socket *sock)
+{
+ struct sockaddr_storage stor = { 0 };
+ int port, err = -EINVAL;
+
+ stor.ss_family = sock->ops->family;
+ if (stor.ss_family == AF_INET)
+ ((struct sockaddr_in *)&stor)->sin_addr.s_addr = htonl(INADDR_ANY);
+ else
+ ((struct sockaddr_in6 *)&stor)->sin6_addr = in6addr_any;
+ for (port = p9_ipport_resv_max; port >= p9_ipport_resv_min; port--) {
+ if (stor.ss_family == AF_INET)
+ ((struct sockaddr_in *)&stor)->sin_port = htons((ushort)port);
+ else
+ ((struct sockaddr_in6 *)&stor)->sin6_port = htons((ushort)port);
+ err = kernel_bind(sock, (struct sockaddr_unsized *)&stor, sizeof(stor));
+ if (err != -EADDRINUSE)
+ break;
+ }
+ return err;
+}
+
+static int
+p9_fd_create_tcp(struct p9_client *client, struct fs_context *fc)
+{
+ const char *addr = fc->source;
+ struct v9fs_context *ctx = fc->fs_private;
+ int err;
+ char port_str[6];
+ struct socket *csocket;
+ struct sockaddr_storage stor = { 0 };
+ struct p9_fd_opts opts;
+
+ /* opts are already parsed in context */
+ opts = ctx->fd_opts;
+
+ if (!addr)
+ return -EINVAL;
+
+ sprintf(port_str, "%u", opts.port);
+ err = inet_pton_with_scope(current->nsproxy->net_ns, AF_UNSPEC, addr,
+ port_str, &stor);
+ if (err < 0)
+ return err;
+
+ csocket = NULL;
+
+ client->trans_opts.tcp.port = opts.port;
+ client->trans_opts.tcp.privport = opts.privport;
+ err = __sock_create(current->nsproxy->net_ns, stor.ss_family,
+ SOCK_STREAM, IPPROTO_TCP, &csocket, 1);
+ if (err) {
+ pr_err("%s (%d): problem creating socket\n",
+ __func__, task_pid_nr(current));
+ return err;
+ }
+
+ if (opts.privport) {
+ err = p9_bind_privport(csocket);
+ if (err < 0) {
+ pr_err("%s (%d): problem binding to privport\n",
+ __func__, task_pid_nr(current));
+ sock_release(csocket);
+ return err;
+ }
+ }
+
+ err = READ_ONCE(csocket->ops)->connect(csocket,
+ (struct sockaddr_unsized *)&stor,
+ sizeof(stor), 0);
+ if (err < 0) {
+ pr_err("%s (%d): problem connecting socket to %s\n",
+ __func__, task_pid_nr(current), addr);
+ sock_release(csocket);
+ return err;
+ }
+
+ return p9_socket_open(client, csocket);
+}
+
+static int
+p9_fd_create_unix(struct p9_client *client, struct fs_context *fc)
+{
+ const char *addr = fc->source;
+ int err;
+ struct socket *csocket;
+ struct sockaddr_un sun_server;
+
+ csocket = NULL;
+
+ if (!addr || !strlen(addr))
+ return -EINVAL;
+
+ if (strlen(addr) >= UNIX_PATH_MAX) {
+ pr_err("%s (%d): address too long: %s\n",
+ __func__, task_pid_nr(current), addr);
+ return -ENAMETOOLONG;
+ }
+
+ sun_server.sun_family = PF_UNIX;
+ strcpy(sun_server.sun_path, addr);
+ err = __sock_create(current->nsproxy->net_ns, PF_UNIX,
+ SOCK_STREAM, 0, &csocket, 1);
+ if (err < 0) {
+ pr_err("%s (%d): problem creating socket\n",
+ __func__, task_pid_nr(current));
+
+ return err;
+ }
+ err = READ_ONCE(csocket->ops)->connect(csocket, (struct sockaddr_unsized *)&sun_server,
+ sizeof(struct sockaddr_un) - 1, 0);
+ if (err < 0) {
+ pr_err("%s (%d): problem connecting socket: %s: %d\n",
+ __func__, task_pid_nr(current), addr, err);
+ sock_release(csocket);
+ return err;
+ }
+
+ return p9_socket_open(client, csocket);
+}
+
+static int
+p9_fd_create(struct p9_client *client, struct fs_context *fc)
+{
+ struct v9fs_context *ctx = fc->fs_private;
+ struct p9_fd_opts opts = ctx->fd_opts;
+ int err;
+
+ client->trans_opts.fd.rfd = opts.rfd;
+ client->trans_opts.fd.wfd = opts.wfd;
+
+ if (opts.rfd == ~0 || opts.wfd == ~0) {
+ pr_err("Insufficient options for proto=fd\n");
+ return -ENOPROTOOPT;
+ }
+
+ err = p9_fd_open(client, opts.rfd, opts.wfd);
+ if (err < 0)
+ return err;
+
+ p9_conn_create(client);
+
+ return 0;
+}
+
+static struct p9_trans_module p9_tcp_trans = {
+ .name = "tcp",
+ .maxsize = MAX_SOCK_BUF,
+ .pooled_rbuffers = false,
+ .def = false,
+ .supports_vmalloc = true,
+ .create = p9_fd_create_tcp,
+ .close = p9_fd_close,
+ .request = p9_fd_request,
+ .cancel = p9_fd_cancel,
+ .cancelled = p9_fd_cancelled,
+ .show_options = p9_fd_show_options,
+ .owner = THIS_MODULE,
+};
+MODULE_ALIAS_9P("tcp");
+
+static struct p9_trans_module p9_unix_trans = {
+ .name = "unix",
+ .maxsize = MAX_SOCK_BUF,
+ .def = false,
+ .supports_vmalloc = true,
+ .create = p9_fd_create_unix,
+ .close = p9_fd_close,
+ .request = p9_fd_request,
+ .cancel = p9_fd_cancel,
+ .cancelled = p9_fd_cancelled,
+ .show_options = p9_fd_show_options,
+ .owner = THIS_MODULE,
+};
+MODULE_ALIAS_9P("unix");
+
+static struct p9_trans_module p9_fd_trans = {
+ .name = "fd",
+ .maxsize = MAX_SOCK_BUF,
+ .def = false,
+ .supports_vmalloc = true,
+ .create = p9_fd_create,
+ .close = p9_fd_close,
+ .request = p9_fd_request,
+ .cancel = p9_fd_cancel,
+ .cancelled = p9_fd_cancelled,
+ .show_options = p9_fd_show_options,
+ .owner = THIS_MODULE,
+};
+MODULE_ALIAS_9P("fd");
+
+/**
+ * p9_poll_workfn - poll worker thread
+ * @work: work queue
+ *
+ * polls all v9fs transports for new events and queues the appropriate
+ * work to the work queue
+ *
+ */
+
+static void p9_poll_workfn(struct work_struct *work)
+{
+ unsigned long flags;
+
+ p9_debug(P9_DEBUG_TRANS, "start %p\n", current);
+
+ spin_lock_irqsave(&p9_poll_lock, flags);
+ while (!list_empty(&p9_poll_pending_list)) {
+ struct p9_conn *conn = list_first_entry(&p9_poll_pending_list,
+ struct p9_conn,
+ poll_pending_link);
+ list_del_init(&conn->poll_pending_link);
+ spin_unlock_irqrestore(&p9_poll_lock, flags);
+
+ p9_poll_mux(conn);
+
+ spin_lock_irqsave(&p9_poll_lock, flags);
+ }
+ spin_unlock_irqrestore(&p9_poll_lock, flags);
+
+ p9_debug(P9_DEBUG_TRANS, "finish\n");
+}
+
+static int __init p9_trans_fd_init(void)
+{
+ v9fs_register_trans(&p9_tcp_trans);
+ v9fs_register_trans(&p9_unix_trans);
+ v9fs_register_trans(&p9_fd_trans);
+
+ return 0;
+}
+
+static void __exit p9_trans_fd_exit(void)
+{
+ flush_work(&p9_poll_work);
+ v9fs_unregister_trans(&p9_tcp_trans);
+ v9fs_unregister_trans(&p9_unix_trans);
+ v9fs_unregister_trans(&p9_fd_trans);
+}
+
+module_init(p9_trans_fd_init);
+module_exit(p9_trans_fd_exit);
+
+MODULE_AUTHOR("Eric Van Hensbergen <ericvh@gmail.com>");
+MODULE_DESCRIPTION("Filedescriptor Transport for 9P");
+MODULE_LICENSE("GPL");
diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c
new file mode 100644
index 000000000000..4d406479f83b
--- /dev/null
+++ b/net/9p/trans_rdma.c
@@ -0,0 +1,669 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * RDMA transport layer based on the trans_fd.c implementation.
+ *
+ * Copyright (C) 2008 by Tom Tucker <tom@opengridcomputing.com>
+ * Copyright (C) 2006 by Russ Cox <rsc@swtch.com>
+ * Copyright (C) 2004-2005 by Latchesar Ionkov <lucho@ionkov.net>
+ * Copyright (C) 2004-2008 by Eric Van Hensbergen <ericvh@gmail.com>
+ * Copyright (C) 1997-2002 by Ron Minnich <rminnich@sarnoff.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/in.h>
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/ipv6.h>
+#include <linux/kthread.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/un.h>
+#include <linux/uaccess.h>
+#include <linux/inet.h>
+#include <linux/file.h>
+#include <linux/fs_context.h>
+#include <linux/semaphore.h>
+#include <linux/slab.h>
+#include <linux/seq_file.h>
+#include <net/9p/9p.h>
+#include <net/9p/client.h>
+#include <net/9p/transport.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/rdma_cm.h>
+
+#define P9_RDMA_SEND_SGE 4
+#define P9_RDMA_RECV_SGE 4
+#define P9_RDMA_IRD 0
+#define P9_RDMA_ORD 0
+#define P9_RDMA_MAXSIZE (1024*1024) /* 1MB */
+
+/**
+ * struct p9_trans_rdma - RDMA transport instance
+ *
+ * @state: tracks the transport state machine for connection setup and tear down
+ * @cm_id: The RDMA CM ID
+ * @pd: Protection Domain pointer
+ * @qp: Queue Pair pointer
+ * @cq: Completion Queue pointer
+ * @timeout: Number of uSecs to wait for connection management events
+ * @privport: Whether a privileged port may be used
+ * @port: The port to use
+ * @sq_depth: The depth of the Send Queue
+ * @sq_sem: Semaphore for the SQ
+ * @rq_depth: The depth of the Receive Queue.
+ * @rq_sem: Semaphore for the RQ
+ * @excess_rc : Amount of posted Receive Contexts without a pending request.
+ * See rdma_request()
+ * @addr: The remote peer's address
+ * @req_lock: Protects the active request list
+ * @cm_done: Completion event for connection management tracking
+ */
+struct p9_trans_rdma {
+ enum {
+ P9_RDMA_INIT,
+ P9_RDMA_ADDR_RESOLVED,
+ P9_RDMA_ROUTE_RESOLVED,
+ P9_RDMA_CONNECTED,
+ P9_RDMA_FLUSHING,
+ P9_RDMA_CLOSING,
+ P9_RDMA_CLOSED,
+ } state;
+ struct rdma_cm_id *cm_id;
+ struct ib_pd *pd;
+ struct ib_qp *qp;
+ struct ib_cq *cq;
+ long timeout;
+ bool privport;
+ u16 port;
+ int sq_depth;
+ struct semaphore sq_sem;
+ int rq_depth;
+ struct semaphore rq_sem;
+ atomic_t excess_rc;
+ struct sockaddr_in addr;
+ spinlock_t req_lock;
+
+ struct completion cm_done;
+};
+
+struct p9_rdma_req;
+
+/**
+ * struct p9_rdma_context - Keeps track of in-process WR
+ *
+ * @cqe: completion queue entry
+ * @busa: Bus address to unmap when the WR completes
+ * @req: Keeps track of requests (send)
+ * @rc: Keepts track of replies (receive)
+ */
+struct p9_rdma_context {
+ struct ib_cqe cqe;
+ dma_addr_t busa;
+ union {
+ struct p9_req_t *req;
+ struct p9_fcall rc;
+ };
+};
+
+static int p9_rdma_show_options(struct seq_file *m, struct p9_client *clnt)
+{
+ struct p9_trans_rdma *rdma = clnt->trans;
+
+ if (rdma->port != P9_RDMA_PORT)
+ seq_printf(m, ",port=%u", rdma->port);
+ if (rdma->sq_depth != P9_RDMA_SQ_DEPTH)
+ seq_printf(m, ",sq=%u", rdma->sq_depth);
+ if (rdma->rq_depth != P9_RDMA_RQ_DEPTH)
+ seq_printf(m, ",rq=%u", rdma->rq_depth);
+ if (rdma->timeout != P9_RDMA_TIMEOUT)
+ seq_printf(m, ",timeout=%lu", rdma->timeout);
+ if (rdma->privport)
+ seq_puts(m, ",privport");
+ return 0;
+}
+
+static int
+p9_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event)
+{
+ struct p9_client *c = id->context;
+ struct p9_trans_rdma *rdma = c->trans;
+ switch (event->event) {
+ case RDMA_CM_EVENT_ADDR_RESOLVED:
+ BUG_ON(rdma->state != P9_RDMA_INIT);
+ rdma->state = P9_RDMA_ADDR_RESOLVED;
+ break;
+
+ case RDMA_CM_EVENT_ROUTE_RESOLVED:
+ BUG_ON(rdma->state != P9_RDMA_ADDR_RESOLVED);
+ rdma->state = P9_RDMA_ROUTE_RESOLVED;
+ break;
+
+ case RDMA_CM_EVENT_ESTABLISHED:
+ BUG_ON(rdma->state != P9_RDMA_ROUTE_RESOLVED);
+ rdma->state = P9_RDMA_CONNECTED;
+ break;
+
+ case RDMA_CM_EVENT_DISCONNECTED:
+ if (rdma)
+ rdma->state = P9_RDMA_CLOSED;
+ c->status = Disconnected;
+ break;
+
+ case RDMA_CM_EVENT_TIMEWAIT_EXIT:
+ break;
+
+ case RDMA_CM_EVENT_ADDR_CHANGE:
+ case RDMA_CM_EVENT_ROUTE_ERROR:
+ case RDMA_CM_EVENT_DEVICE_REMOVAL:
+ case RDMA_CM_EVENT_MULTICAST_JOIN:
+ case RDMA_CM_EVENT_MULTICAST_ERROR:
+ case RDMA_CM_EVENT_REJECTED:
+ case RDMA_CM_EVENT_CONNECT_REQUEST:
+ case RDMA_CM_EVENT_CONNECT_RESPONSE:
+ case RDMA_CM_EVENT_CONNECT_ERROR:
+ case RDMA_CM_EVENT_ADDR_ERROR:
+ case RDMA_CM_EVENT_UNREACHABLE:
+ c->status = Disconnected;
+ rdma_disconnect(rdma->cm_id);
+ break;
+ default:
+ BUG();
+ }
+ complete(&rdma->cm_done);
+ return 0;
+}
+
+static void
+recv_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+ struct p9_client *client = cq->cq_context;
+ struct p9_trans_rdma *rdma = client->trans;
+ struct p9_rdma_context *c =
+ container_of(wc->wr_cqe, struct p9_rdma_context, cqe);
+ struct p9_req_t *req;
+ int err = 0;
+ int16_t tag;
+
+ req = NULL;
+ ib_dma_unmap_single(rdma->cm_id->device, c->busa, client->msize,
+ DMA_FROM_DEVICE);
+
+ if (wc->status != IB_WC_SUCCESS)
+ goto err_out;
+
+ c->rc.size = wc->byte_len;
+ err = p9_parse_header(&c->rc, NULL, NULL, &tag, 1);
+ if (err)
+ goto err_out;
+
+ req = p9_tag_lookup(client, tag);
+ if (!req)
+ goto err_out;
+
+ /* Check that we have not yet received a reply for this request.
+ */
+ if (unlikely(req->rc.sdata)) {
+ pr_err("Duplicate reply for request %d", tag);
+ goto err_out;
+ }
+
+ req->rc.size = c->rc.size;
+ req->rc.sdata = c->rc.sdata;
+ p9_client_cb(client, req, REQ_STATUS_RCVD);
+
+ out:
+ up(&rdma->rq_sem);
+ kfree(c);
+ return;
+
+ err_out:
+ p9_debug(P9_DEBUG_ERROR, "req %p err %d status %d\n",
+ req, err, wc->status);
+ rdma->state = P9_RDMA_FLUSHING;
+ client->status = Disconnected;
+ goto out;
+}
+
+static void
+send_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+ struct p9_client *client = cq->cq_context;
+ struct p9_trans_rdma *rdma = client->trans;
+ struct p9_rdma_context *c =
+ container_of(wc->wr_cqe, struct p9_rdma_context, cqe);
+
+ ib_dma_unmap_single(rdma->cm_id->device,
+ c->busa, c->req->tc.size,
+ DMA_TO_DEVICE);
+ up(&rdma->sq_sem);
+ p9_req_put(client, c->req);
+ kfree(c);
+}
+
+static void qp_event_handler(struct ib_event *event, void *context)
+{
+ p9_debug(P9_DEBUG_ERROR, "QP event %d context %p\n",
+ event->event, context);
+}
+
+static void rdma_destroy_trans(struct p9_trans_rdma *rdma)
+{
+ if (!rdma)
+ return;
+
+ if (rdma->qp && !IS_ERR(rdma->qp))
+ ib_destroy_qp(rdma->qp);
+
+ if (rdma->pd && !IS_ERR(rdma->pd))
+ ib_dealloc_pd(rdma->pd);
+
+ if (rdma->cq && !IS_ERR(rdma->cq))
+ ib_free_cq(rdma->cq);
+
+ if (rdma->cm_id && !IS_ERR(rdma->cm_id))
+ rdma_destroy_id(rdma->cm_id);
+
+ kfree(rdma);
+}
+
+static int
+post_recv(struct p9_client *client, struct p9_rdma_context *c)
+{
+ struct p9_trans_rdma *rdma = client->trans;
+ struct ib_recv_wr wr;
+ struct ib_sge sge;
+ int ret;
+
+ c->busa = ib_dma_map_single(rdma->cm_id->device,
+ c->rc.sdata, client->msize,
+ DMA_FROM_DEVICE);
+ if (ib_dma_mapping_error(rdma->cm_id->device, c->busa))
+ goto error;
+
+ c->cqe.done = recv_done;
+
+ sge.addr = c->busa;
+ sge.length = client->msize;
+ sge.lkey = rdma->pd->local_dma_lkey;
+
+ wr.next = NULL;
+ wr.wr_cqe = &c->cqe;
+ wr.sg_list = &sge;
+ wr.num_sge = 1;
+
+ ret = ib_post_recv(rdma->qp, &wr, NULL);
+ if (ret)
+ ib_dma_unmap_single(rdma->cm_id->device, c->busa,
+ client->msize, DMA_FROM_DEVICE);
+ return ret;
+
+ error:
+ p9_debug(P9_DEBUG_ERROR, "EIO\n");
+ return -EIO;
+}
+
+static int rdma_request(struct p9_client *client, struct p9_req_t *req)
+{
+ struct p9_trans_rdma *rdma = client->trans;
+ struct ib_send_wr wr;
+ struct ib_sge sge;
+ int err = 0;
+ unsigned long flags;
+ struct p9_rdma_context *c = NULL;
+ struct p9_rdma_context *rpl_context = NULL;
+
+ /* When an error occurs between posting the recv and the send,
+ * there will be a receive context posted without a pending request.
+ * Since there is no way to "un-post" it, we remember it and skip
+ * post_recv() for the next request.
+ * So here,
+ * see if we are this `next request' and need to absorb an excess rc.
+ * If yes, then drop and free our own, and do not recv_post().
+ **/
+ if (unlikely(atomic_read(&rdma->excess_rc) > 0)) {
+ if ((atomic_sub_return(1, &rdma->excess_rc) >= 0)) {
+ /* Got one! */
+ p9_fcall_fini(&req->rc);
+ req->rc.sdata = NULL;
+ goto dont_need_post_recv;
+ } else {
+ /* We raced and lost. */
+ atomic_inc(&rdma->excess_rc);
+ }
+ }
+
+ /* Allocate an fcall for the reply */
+ rpl_context = kmalloc(sizeof *rpl_context, GFP_NOFS);
+ if (!rpl_context) {
+ err = -ENOMEM;
+ goto recv_error;
+ }
+ rpl_context->rc.sdata = req->rc.sdata;
+
+ /*
+ * Post a receive buffer for this request. We need to ensure
+ * there is a reply buffer available for every outstanding
+ * request. A flushed request can result in no reply for an
+ * outstanding request, so we must keep a count to avoid
+ * overflowing the RQ.
+ */
+ if (down_interruptible(&rdma->rq_sem)) {
+ err = -EINTR;
+ goto recv_error;
+ }
+
+ err = post_recv(client, rpl_context);
+ if (err) {
+ p9_debug(P9_DEBUG_ERROR, "POST RECV failed: %d\n", err);
+ goto recv_error;
+ }
+ /* remove posted receive buffer from request structure */
+ req->rc.sdata = NULL;
+
+dont_need_post_recv:
+ /* Post the request */
+ c = kmalloc(sizeof *c, GFP_NOFS);
+ if (!c) {
+ err = -ENOMEM;
+ goto send_error;
+ }
+ c->req = req;
+
+ c->busa = ib_dma_map_single(rdma->cm_id->device,
+ c->req->tc.sdata, c->req->tc.size,
+ DMA_TO_DEVICE);
+ if (ib_dma_mapping_error(rdma->cm_id->device, c->busa)) {
+ err = -EIO;
+ goto send_error;
+ }
+
+ c->cqe.done = send_done;
+
+ sge.addr = c->busa;
+ sge.length = c->req->tc.size;
+ sge.lkey = rdma->pd->local_dma_lkey;
+
+ wr.next = NULL;
+ wr.wr_cqe = &c->cqe;
+ wr.opcode = IB_WR_SEND;
+ wr.send_flags = IB_SEND_SIGNALED;
+ wr.sg_list = &sge;
+ wr.num_sge = 1;
+
+ if (down_interruptible(&rdma->sq_sem)) {
+ err = -EINTR;
+ goto dma_unmap;
+ }
+
+ /* Mark request as `sent' *before* we actually send it,
+ * because doing if after could erase the REQ_STATUS_RCVD
+ * status in case of a very fast reply.
+ */
+ WRITE_ONCE(req->status, REQ_STATUS_SENT);
+ err = ib_post_send(rdma->qp, &wr, NULL);
+ if (err)
+ goto dma_unmap;
+
+ /* Success */
+ return 0;
+
+dma_unmap:
+ ib_dma_unmap_single(rdma->cm_id->device, c->busa,
+ c->req->tc.size, DMA_TO_DEVICE);
+ /* Handle errors that happened during or while preparing the send: */
+ send_error:
+ WRITE_ONCE(req->status, REQ_STATUS_ERROR);
+ kfree(c);
+ p9_debug(P9_DEBUG_ERROR, "Error %d in rdma_request()\n", err);
+
+ /* Ach.
+ * We did recv_post(), but not send. We have one recv_post in excess.
+ */
+ atomic_inc(&rdma->excess_rc);
+ return err;
+
+ /* Handle errors that happened during or while preparing post_recv(): */
+ recv_error:
+ kfree(rpl_context);
+ spin_lock_irqsave(&rdma->req_lock, flags);
+ if (err != -EINTR && rdma->state < P9_RDMA_CLOSING) {
+ rdma->state = P9_RDMA_CLOSING;
+ spin_unlock_irqrestore(&rdma->req_lock, flags);
+ rdma_disconnect(rdma->cm_id);
+ } else
+ spin_unlock_irqrestore(&rdma->req_lock, flags);
+ return err;
+}
+
+static void rdma_close(struct p9_client *client)
+{
+ struct p9_trans_rdma *rdma;
+
+ if (!client)
+ return;
+
+ rdma = client->trans;
+ if (!rdma)
+ return;
+
+ client->status = Disconnected;
+ rdma_disconnect(rdma->cm_id);
+ rdma_destroy_trans(rdma);
+}
+
+/**
+ * alloc_rdma - Allocate and initialize the rdma transport structure
+ * @opts: Mount options structure
+ */
+static struct p9_trans_rdma *alloc_rdma(struct p9_rdma_opts *opts)
+{
+ struct p9_trans_rdma *rdma;
+
+ rdma = kzalloc(sizeof(struct p9_trans_rdma), GFP_KERNEL);
+ if (!rdma)
+ return NULL;
+
+ rdma->port = opts->port;
+ rdma->privport = opts->privport;
+ rdma->sq_depth = opts->sq_depth;
+ rdma->rq_depth = opts->rq_depth;
+ rdma->timeout = opts->timeout;
+ spin_lock_init(&rdma->req_lock);
+ init_completion(&rdma->cm_done);
+ sema_init(&rdma->sq_sem, rdma->sq_depth);
+ sema_init(&rdma->rq_sem, rdma->rq_depth);
+ atomic_set(&rdma->excess_rc, 0);
+
+ return rdma;
+}
+
+static int rdma_cancel(struct p9_client *client, struct p9_req_t *req)
+{
+ /* Nothing to do here.
+ * We will take care of it (if we have to) in rdma_cancelled()
+ */
+ return 1;
+}
+
+/* A request has been fully flushed without a reply.
+ * That means we have posted one buffer in excess.
+ */
+static int rdma_cancelled(struct p9_client *client, struct p9_req_t *req)
+{
+ struct p9_trans_rdma *rdma = client->trans;
+ atomic_inc(&rdma->excess_rc);
+ return 0;
+}
+
+static int p9_rdma_bind_privport(struct p9_trans_rdma *rdma)
+{
+ struct sockaddr_in cl = {
+ .sin_family = AF_INET,
+ .sin_addr.s_addr = htonl(INADDR_ANY),
+ };
+ int port, err = -EINVAL;
+
+ for (port = P9_DEF_MAX_RESVPORT; port >= P9_DEF_MIN_RESVPORT; port--) {
+ cl.sin_port = htons((ushort)port);
+ err = rdma_bind_addr(rdma->cm_id, (struct sockaddr *)&cl);
+ if (err != -EADDRINUSE)
+ break;
+ }
+ return err;
+}
+
+/**
+ * rdma_create_trans - Transport method for creating a transport instance
+ * @client: client instance
+ * @fc: The filesystem context
+ */
+static int
+rdma_create_trans(struct p9_client *client, struct fs_context *fc)
+{
+ const char *addr = fc->source;
+ struct v9fs_context *ctx = fc->fs_private;
+ struct p9_rdma_opts opts = ctx->rdma_opts;
+ int err;
+ struct p9_trans_rdma *rdma;
+ struct rdma_conn_param conn_param;
+ struct ib_qp_init_attr qp_attr;
+
+ if (addr == NULL)
+ return -EINVAL;
+
+ /* options are already parsed, in the fs context */
+ opts = ctx->rdma_opts;
+
+ /* Create and initialize the RDMA transport structure */
+ rdma = alloc_rdma(&opts);
+ if (!rdma)
+ return -ENOMEM;
+
+ /* Create the RDMA CM ID */
+ rdma->cm_id = rdma_create_id(&init_net, p9_cm_event_handler, client,
+ RDMA_PS_TCP, IB_QPT_RC);
+ if (IS_ERR(rdma->cm_id))
+ goto error;
+
+ /* Associate the client with the transport */
+ client->trans = rdma;
+
+ /* Bind to a privileged port if we need to */
+ if (opts.privport) {
+ err = p9_rdma_bind_privport(rdma);
+ if (err < 0) {
+ pr_err("%s (%d): problem binding to privport: %d\n",
+ __func__, task_pid_nr(current), -err);
+ goto error;
+ }
+ }
+
+ /* Resolve the server's address */
+ rdma->addr.sin_family = AF_INET;
+ rdma->addr.sin_addr.s_addr = in_aton(addr);
+ rdma->addr.sin_port = htons(opts.port);
+ err = rdma_resolve_addr(rdma->cm_id, NULL,
+ (struct sockaddr *)&rdma->addr,
+ rdma->timeout);
+ if (err)
+ goto error;
+ err = wait_for_completion_interruptible(&rdma->cm_done);
+ if (err || (rdma->state != P9_RDMA_ADDR_RESOLVED))
+ goto error;
+
+ /* Resolve the route to the server */
+ err = rdma_resolve_route(rdma->cm_id, rdma->timeout);
+ if (err)
+ goto error;
+ err = wait_for_completion_interruptible(&rdma->cm_done);
+ if (err || (rdma->state != P9_RDMA_ROUTE_RESOLVED))
+ goto error;
+
+ /* Create the Completion Queue */
+ rdma->cq = ib_alloc_cq_any(rdma->cm_id->device, client,
+ opts.sq_depth + opts.rq_depth + 1,
+ IB_POLL_SOFTIRQ);
+ if (IS_ERR(rdma->cq))
+ goto error;
+
+ /* Create the Protection Domain */
+ rdma->pd = ib_alloc_pd(rdma->cm_id->device, 0);
+ if (IS_ERR(rdma->pd))
+ goto error;
+
+ /* Create the Queue Pair */
+ memset(&qp_attr, 0, sizeof qp_attr);
+ qp_attr.event_handler = qp_event_handler;
+ qp_attr.qp_context = client;
+ qp_attr.cap.max_send_wr = opts.sq_depth;
+ qp_attr.cap.max_recv_wr = opts.rq_depth;
+ qp_attr.cap.max_send_sge = P9_RDMA_SEND_SGE;
+ qp_attr.cap.max_recv_sge = P9_RDMA_RECV_SGE;
+ qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
+ qp_attr.qp_type = IB_QPT_RC;
+ qp_attr.send_cq = rdma->cq;
+ qp_attr.recv_cq = rdma->cq;
+ err = rdma_create_qp(rdma->cm_id, rdma->pd, &qp_attr);
+ if (err)
+ goto error;
+ rdma->qp = rdma->cm_id->qp;
+
+ /* Request a connection */
+ memset(&conn_param, 0, sizeof(conn_param));
+ conn_param.private_data = NULL;
+ conn_param.private_data_len = 0;
+ conn_param.responder_resources = P9_RDMA_IRD;
+ conn_param.initiator_depth = P9_RDMA_ORD;
+ err = rdma_connect(rdma->cm_id, &conn_param);
+ if (err)
+ goto error;
+ err = wait_for_completion_interruptible(&rdma->cm_done);
+ if (err || (rdma->state != P9_RDMA_CONNECTED))
+ goto error;
+
+ client->status = Connected;
+
+ return 0;
+
+error:
+ rdma_destroy_trans(rdma);
+ return -ENOTCONN;
+}
+
+static struct p9_trans_module p9_rdma_trans = {
+ .name = "rdma",
+ .maxsize = P9_RDMA_MAXSIZE,
+ .pooled_rbuffers = true,
+ .def = false,
+ .supports_vmalloc = false,
+ .owner = THIS_MODULE,
+ .create = rdma_create_trans,
+ .close = rdma_close,
+ .request = rdma_request,
+ .cancel = rdma_cancel,
+ .cancelled = rdma_cancelled,
+ .show_options = p9_rdma_show_options,
+};
+
+/**
+ * p9_trans_rdma_init - Register the 9P RDMA transport driver
+ */
+static int __init p9_trans_rdma_init(void)
+{
+ v9fs_register_trans(&p9_rdma_trans);
+ return 0;
+}
+
+static void __exit p9_trans_rdma_exit(void)
+{
+ v9fs_unregister_trans(&p9_rdma_trans);
+}
+
+module_init(p9_trans_rdma_init);
+module_exit(p9_trans_rdma_exit);
+MODULE_ALIAS_9P("rdma");
+
+MODULE_AUTHOR("Tom Tucker <tom@opengridcomputing.com>");
+MODULE_DESCRIPTION("RDMA Transport for 9P");
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/net/9p/trans_usbg.c b/net/9p/trans_usbg.c
new file mode 100644
index 000000000000..93547637deae
--- /dev/null
+++ b/net/9p/trans_usbg.c
@@ -0,0 +1,969 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * trans_usbg.c - USB peripheral usb9pfs configuration driver and transport.
+ *
+ * Copyright (C) 2024 Michael Grzeschik <m.grzeschik@pengutronix.de>
+ */
+
+/* Gadget usb9pfs only needs two bulk endpoints, and will use the usb9pfs
+ * transport to mount host exported filesystem via usb gadget.
+ */
+
+/* +--------------------------+ | +--------------------------+
+ * | 9PFS mounting client | | | 9PFS exporting server |
+ * SW | | | | |
+ * | (this:trans_usbg) | | |(e.g. diod or nfs-ganesha)|
+ * +-------------^------------+ | +-------------^------------+
+ * | | |
+ * ------------------|------------------------------------|-------------
+ * | | |
+ * +-------------v------------+ | +-------------v------------+
+ * | | | | |
+ * HW | USB Device Controller <---------> USB Host Controller |
+ * | | | | |
+ * +--------------------------+ | +--------------------------+
+ */
+
+#include <linux/cleanup.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/fs_context.h>
+#include <linux/usb/composite.h>
+#include <linux/usb/func_utils.h>
+
+#include <net/9p/9p.h>
+#include <net/9p/client.h>
+#include <net/9p/transport.h>
+
+#define DEFAULT_BUFLEN 16384
+
+struct f_usb9pfs {
+ struct p9_client *client;
+
+ /* 9p request lock for en/dequeue */
+ spinlock_t lock;
+
+ struct usb_request *in_req;
+ struct usb_request *out_req;
+
+ struct usb_ep *in_ep;
+ struct usb_ep *out_ep;
+
+ struct completion send;
+ struct completion received;
+
+ unsigned int buflen;
+
+ struct usb_function function;
+};
+
+static inline struct f_usb9pfs *func_to_usb9pfs(struct usb_function *f)
+{
+ return container_of(f, struct f_usb9pfs, function);
+}
+
+struct f_usb9pfs_opts {
+ struct usb_function_instance func_inst;
+ unsigned int buflen;
+
+ struct f_usb9pfs_dev *dev;
+
+ /* Read/write access to configfs attributes is handled by configfs.
+ *
+ * This is to protect the data from concurrent access by read/write
+ * and create symlink/remove symlink.
+ */
+ struct mutex lock;
+ int refcnt;
+};
+
+struct f_usb9pfs_dev {
+ struct f_usb9pfs *usb9pfs;
+ struct f_usb9pfs_opts *opts;
+ char tag[41];
+ bool inuse;
+
+ struct list_head usb9pfs_instance;
+};
+
+static DEFINE_MUTEX(usb9pfs_lock);
+static struct list_head usbg_instance_list;
+
+static int usb9pfs_queue_tx(struct f_usb9pfs *usb9pfs, struct p9_req_t *p9_tx_req,
+ gfp_t gfp_flags)
+{
+ struct usb_composite_dev *cdev = usb9pfs->function.config->cdev;
+ struct usb_request *req = usb9pfs->in_req;
+ int ret;
+
+ if (!(p9_tx_req->tc.size % usb9pfs->in_ep->maxpacket))
+ req->zero = 1;
+
+ req->buf = p9_tx_req->tc.sdata;
+ req->length = p9_tx_req->tc.size;
+ req->context = p9_tx_req;
+
+ dev_dbg(&cdev->gadget->dev, "%s usb9pfs send --> %d/%d, zero: %d\n",
+ usb9pfs->in_ep->name, req->actual, req->length, req->zero);
+
+ ret = usb_ep_queue(usb9pfs->in_ep, req, gfp_flags);
+ if (ret)
+ req->context = NULL;
+
+ dev_dbg(&cdev->gadget->dev, "tx submit --> %d\n", ret);
+
+ return ret;
+}
+
+static int usb9pfs_queue_rx(struct f_usb9pfs *usb9pfs, struct usb_request *req,
+ gfp_t gfp_flags)
+{
+ struct usb_composite_dev *cdev = usb9pfs->function.config->cdev;
+ int ret;
+
+ ret = usb_ep_queue(usb9pfs->out_ep, req, gfp_flags);
+
+ dev_dbg(&cdev->gadget->dev, "rx submit --> %d\n", ret);
+
+ return ret;
+}
+
+static int usb9pfs_transmit(struct f_usb9pfs *usb9pfs, struct p9_req_t *p9_req)
+{
+ int ret = 0;
+
+ guard(spinlock_irqsave)(&usb9pfs->lock);
+
+ ret = usb9pfs_queue_tx(usb9pfs, p9_req, GFP_ATOMIC);
+ if (ret)
+ return ret;
+
+ list_del(&p9_req->req_list);
+
+ p9_req_get(p9_req);
+
+ return ret;
+}
+
+static void usb9pfs_tx_complete(struct usb_ep *ep, struct usb_request *req)
+{
+ struct f_usb9pfs *usb9pfs = ep->driver_data;
+ struct usb_composite_dev *cdev = usb9pfs->function.config->cdev;
+ struct p9_req_t *p9_tx_req = req->context;
+ unsigned long flags;
+
+ /* reset zero packages */
+ req->zero = 0;
+
+ if (req->status) {
+ dev_err(&cdev->gadget->dev, "%s usb9pfs complete --> %d, %d/%d\n",
+ ep->name, req->status, req->actual, req->length);
+ return;
+ }
+
+ dev_dbg(&cdev->gadget->dev, "%s usb9pfs complete --> %d, %d/%d\n",
+ ep->name, req->status, req->actual, req->length);
+
+ spin_lock_irqsave(&usb9pfs->lock, flags);
+ WRITE_ONCE(p9_tx_req->status, REQ_STATUS_SENT);
+
+ p9_req_put(usb9pfs->client, p9_tx_req);
+
+ req->context = NULL;
+
+ spin_unlock_irqrestore(&usb9pfs->lock, flags);
+
+ complete(&usb9pfs->send);
+}
+
+static struct p9_req_t *usb9pfs_rx_header(struct f_usb9pfs *usb9pfs, void *buf)
+{
+ struct p9_req_t *p9_rx_req;
+ struct p9_fcall rc;
+ int ret;
+
+ /* start by reading header */
+ rc.sdata = buf;
+ rc.offset = 0;
+ rc.capacity = P9_HDRSZ;
+ rc.size = P9_HDRSZ;
+
+ p9_debug(P9_DEBUG_TRANS, "mux %p got %zu bytes\n", usb9pfs,
+ rc.capacity - rc.offset);
+
+ ret = p9_parse_header(&rc, &rc.size, NULL, NULL, 0);
+ if (ret) {
+ p9_debug(P9_DEBUG_ERROR,
+ "error parsing header: %d\n", ret);
+ return NULL;
+ }
+
+ p9_debug(P9_DEBUG_TRANS,
+ "mux %p pkt: size: %d bytes tag: %d\n",
+ usb9pfs, rc.size, rc.tag);
+
+ p9_rx_req = p9_tag_lookup(usb9pfs->client, rc.tag);
+ if (!p9_rx_req || p9_rx_req->status != REQ_STATUS_SENT) {
+ p9_debug(P9_DEBUG_ERROR, "Unexpected packet tag %d\n", rc.tag);
+ return NULL;
+ }
+
+ if (rc.size > p9_rx_req->rc.capacity) {
+ p9_debug(P9_DEBUG_ERROR,
+ "requested packet size too big: %d for tag %d with capacity %zd\n",
+ rc.size, rc.tag, p9_rx_req->rc.capacity);
+ p9_req_put(usb9pfs->client, p9_rx_req);
+ return NULL;
+ }
+
+ if (!p9_rx_req->rc.sdata) {
+ p9_debug(P9_DEBUG_ERROR,
+ "No recv fcall for tag %d (req %p), disconnecting!\n",
+ rc.tag, p9_rx_req);
+ p9_req_put(usb9pfs->client, p9_rx_req);
+ return NULL;
+ }
+
+ return p9_rx_req;
+}
+
+static void usb9pfs_rx_complete(struct usb_ep *ep, struct usb_request *req)
+{
+ struct f_usb9pfs *usb9pfs = ep->driver_data;
+ struct usb_composite_dev *cdev = usb9pfs->function.config->cdev;
+ struct p9_req_t *p9_rx_req;
+ unsigned int req_size = req->actual;
+ int status = REQ_STATUS_RCVD;
+
+ if (req->status) {
+ dev_err(&cdev->gadget->dev, "%s usb9pfs complete --> %d, %d/%d\n",
+ ep->name, req->status, req->actual, req->length);
+ return;
+ }
+
+ p9_rx_req = usb9pfs_rx_header(usb9pfs, req->buf);
+ if (!p9_rx_req)
+ return;
+
+ if (req_size > p9_rx_req->rc.capacity) {
+ dev_err(&cdev->gadget->dev,
+ "%s received data size %u exceeds buffer capacity %zu\n",
+ ep->name, req_size, p9_rx_req->rc.capacity);
+ req_size = 0;
+ status = REQ_STATUS_ERROR;
+ }
+
+ memcpy(p9_rx_req->rc.sdata, req->buf, req_size);
+
+ p9_rx_req->rc.size = req_size;
+
+ p9_client_cb(usb9pfs->client, p9_rx_req, status);
+ p9_req_put(usb9pfs->client, p9_rx_req);
+
+ complete(&usb9pfs->received);
+}
+
+static void disable_ep(struct usb_composite_dev *cdev, struct usb_ep *ep)
+{
+ int value;
+
+ value = usb_ep_disable(ep);
+ if (value < 0)
+ dev_info(&cdev->gadget->dev,
+ "disable %s --> %d\n", ep->name, value);
+}
+
+static void disable_usb9pfs(struct f_usb9pfs *usb9pfs)
+{
+ struct usb_composite_dev *cdev =
+ usb9pfs->function.config->cdev;
+
+ if (usb9pfs->in_req) {
+ usb_ep_free_request(usb9pfs->in_ep, usb9pfs->in_req);
+ usb9pfs->in_req = NULL;
+ }
+
+ if (usb9pfs->out_req) {
+ usb_ep_free_request(usb9pfs->out_ep, usb9pfs->out_req);
+ usb9pfs->out_req = NULL;
+ }
+
+ disable_ep(cdev, usb9pfs->in_ep);
+ disable_ep(cdev, usb9pfs->out_ep);
+ dev_dbg(&cdev->gadget->dev, "%s disabled\n",
+ usb9pfs->function.name);
+}
+
+static int alloc_requests(struct usb_composite_dev *cdev,
+ struct f_usb9pfs *usb9pfs)
+{
+ int ret;
+
+ usb9pfs->in_req = usb_ep_alloc_request(usb9pfs->in_ep, GFP_ATOMIC);
+ if (!usb9pfs->in_req) {
+ ret = -ENOENT;
+ goto fail;
+ }
+
+ usb9pfs->out_req = alloc_ep_req(usb9pfs->out_ep, usb9pfs->buflen);
+ if (!usb9pfs->out_req) {
+ ret = -ENOENT;
+ goto fail_in;
+ }
+
+ usb9pfs->in_req->complete = usb9pfs_tx_complete;
+ usb9pfs->out_req->complete = usb9pfs_rx_complete;
+
+ /* length will be set in complete routine */
+ usb9pfs->in_req->context = usb9pfs;
+ usb9pfs->out_req->context = usb9pfs;
+
+ return 0;
+
+fail_in:
+ usb_ep_free_request(usb9pfs->in_ep, usb9pfs->in_req);
+fail:
+ return ret;
+}
+
+static int enable_endpoint(struct usb_composite_dev *cdev,
+ struct f_usb9pfs *usb9pfs, struct usb_ep *ep)
+{
+ int ret;
+
+ ret = config_ep_by_speed(cdev->gadget, &usb9pfs->function, ep);
+ if (ret)
+ return ret;
+
+ ret = usb_ep_enable(ep);
+ if (ret < 0)
+ return ret;
+
+ ep->driver_data = usb9pfs;
+
+ return 0;
+}
+
+static int
+enable_usb9pfs(struct usb_composite_dev *cdev, struct f_usb9pfs *usb9pfs)
+{
+ struct p9_client *client;
+ int ret = 0;
+
+ ret = enable_endpoint(cdev, usb9pfs, usb9pfs->in_ep);
+ if (ret)
+ goto out;
+
+ ret = enable_endpoint(cdev, usb9pfs, usb9pfs->out_ep);
+ if (ret)
+ goto disable_in;
+
+ ret = alloc_requests(cdev, usb9pfs);
+ if (ret)
+ goto disable_out;
+
+ client = usb9pfs->client;
+ if (client)
+ client->status = Connected;
+
+ dev_dbg(&cdev->gadget->dev, "%s enabled\n", usb9pfs->function.name);
+ return 0;
+
+disable_out:
+ usb_ep_disable(usb9pfs->out_ep);
+disable_in:
+ usb_ep_disable(usb9pfs->in_ep);
+out:
+ return ret;
+}
+
+static int p9_usbg_create(struct p9_client *client, struct fs_context *fc)
+{
+ const char *devname = fc->source;
+ struct f_usb9pfs_dev *dev;
+ struct f_usb9pfs *usb9pfs;
+ int ret = -ENOENT;
+ int found = 0;
+
+ if (!devname)
+ return -EINVAL;
+
+ guard(mutex)(&usb9pfs_lock);
+
+ list_for_each_entry(dev, &usbg_instance_list, usb9pfs_instance) {
+ if (!strncmp(devname, dev->tag, strlen(devname))) {
+ if (!dev->inuse) {
+ dev->inuse = true;
+ found = 1;
+ break;
+ }
+ ret = -EBUSY;
+ break;
+ }
+ }
+
+ if (!found) {
+ pr_err("no channels available for device %s\n", devname);
+ return ret;
+ }
+
+ usb9pfs = dev->usb9pfs;
+ if (!usb9pfs)
+ return -EINVAL;
+
+ client->trans = (void *)usb9pfs;
+ if (!usb9pfs->in_req)
+ client->status = Disconnected;
+ else
+ client->status = Connected;
+ usb9pfs->client = client;
+
+ client->trans_mod->maxsize = usb9pfs->buflen;
+
+ complete(&usb9pfs->received);
+
+ return 0;
+}
+
+static void usb9pfs_clear_tx(struct f_usb9pfs *usb9pfs)
+{
+ struct p9_req_t *req;
+
+ guard(spinlock_irqsave)(&usb9pfs->lock);
+
+ req = usb9pfs->in_req->context;
+ if (!req)
+ return;
+
+ if (!req->t_err)
+ req->t_err = -ECONNRESET;
+
+ p9_client_cb(usb9pfs->client, req, REQ_STATUS_ERROR);
+}
+
+static void p9_usbg_close(struct p9_client *client)
+{
+ struct f_usb9pfs *usb9pfs;
+ struct f_usb9pfs_dev *dev;
+ struct f_usb9pfs_opts *opts;
+
+ if (!client)
+ return;
+
+ usb9pfs = client->trans;
+ if (!usb9pfs)
+ return;
+
+ client->status = Disconnected;
+
+ usb9pfs_clear_tx(usb9pfs);
+
+ opts = container_of(usb9pfs->function.fi,
+ struct f_usb9pfs_opts, func_inst);
+
+ dev = opts->dev;
+
+ mutex_lock(&usb9pfs_lock);
+ dev->inuse = false;
+ mutex_unlock(&usb9pfs_lock);
+}
+
+static int p9_usbg_request(struct p9_client *client, struct p9_req_t *p9_req)
+{
+ struct f_usb9pfs *usb9pfs = client->trans;
+ int ret;
+
+ if (client->status != Connected)
+ return -EBUSY;
+
+ ret = wait_for_completion_killable(&usb9pfs->received);
+ if (ret)
+ return ret;
+
+ ret = usb9pfs_transmit(usb9pfs, p9_req);
+ if (ret)
+ return ret;
+
+ ret = wait_for_completion_killable(&usb9pfs->send);
+ if (ret)
+ return ret;
+
+ return usb9pfs_queue_rx(usb9pfs, usb9pfs->out_req, GFP_ATOMIC);
+}
+
+static int p9_usbg_cancel(struct p9_client *client, struct p9_req_t *req)
+{
+ struct f_usb9pfs *usb9pfs = client->trans;
+ int ret = 1;
+
+ p9_debug(P9_DEBUG_TRANS, "client %p req %p\n", client, req);
+
+ guard(spinlock_irqsave)(&usb9pfs->lock);
+
+ if (req->status == REQ_STATUS_UNSENT) {
+ list_del(&req->req_list);
+ WRITE_ONCE(req->status, REQ_STATUS_FLSHD);
+ p9_req_put(client, req);
+ ret = 0;
+ }
+
+ return ret;
+}
+
+static struct p9_trans_module p9_usbg_trans = {
+ .name = "usbg",
+ .create = p9_usbg_create,
+ .close = p9_usbg_close,
+ .request = p9_usbg_request,
+ .cancel = p9_usbg_cancel,
+ .supports_vmalloc = false,
+ .owner = THIS_MODULE,
+};
+
+/*-------------------------------------------------------------------------*/
+
+#define USB_PROTOCOL_9PFS 0x09
+
+static struct usb_interface_descriptor usb9pfs_intf = {
+ .bLength = sizeof(usb9pfs_intf),
+ .bDescriptorType = USB_DT_INTERFACE,
+
+ .bNumEndpoints = 2,
+ .bInterfaceClass = USB_CLASS_VENDOR_SPEC,
+ .bInterfaceSubClass = USB_SUBCLASS_VENDOR_SPEC,
+ .bInterfaceProtocol = USB_PROTOCOL_9PFS,
+
+ /* .iInterface = DYNAMIC */
+};
+
+/* full speed support: */
+
+static struct usb_endpoint_descriptor fs_usb9pfs_source_desc = {
+ .bLength = USB_DT_ENDPOINT_SIZE,
+ .bDescriptorType = USB_DT_ENDPOINT,
+
+ .bEndpointAddress = USB_DIR_IN,
+ .bmAttributes = USB_ENDPOINT_XFER_BULK,
+};
+
+static struct usb_endpoint_descriptor fs_usb9pfs_sink_desc = {
+ .bLength = USB_DT_ENDPOINT_SIZE,
+ .bDescriptorType = USB_DT_ENDPOINT,
+
+ .bEndpointAddress = USB_DIR_OUT,
+ .bmAttributes = USB_ENDPOINT_XFER_BULK,
+};
+
+static struct usb_descriptor_header *fs_usb9pfs_descs[] = {
+ (struct usb_descriptor_header *)&usb9pfs_intf,
+ (struct usb_descriptor_header *)&fs_usb9pfs_sink_desc,
+ (struct usb_descriptor_header *)&fs_usb9pfs_source_desc,
+ NULL,
+};
+
+/* high speed support: */
+
+static struct usb_endpoint_descriptor hs_usb9pfs_source_desc = {
+ .bLength = USB_DT_ENDPOINT_SIZE,
+ .bDescriptorType = USB_DT_ENDPOINT,
+
+ .bmAttributes = USB_ENDPOINT_XFER_BULK,
+ .wMaxPacketSize = cpu_to_le16(512),
+};
+
+static struct usb_endpoint_descriptor hs_usb9pfs_sink_desc = {
+ .bLength = USB_DT_ENDPOINT_SIZE,
+ .bDescriptorType = USB_DT_ENDPOINT,
+
+ .bmAttributes = USB_ENDPOINT_XFER_BULK,
+ .wMaxPacketSize = cpu_to_le16(512),
+};
+
+static struct usb_descriptor_header *hs_usb9pfs_descs[] = {
+ (struct usb_descriptor_header *)&usb9pfs_intf,
+ (struct usb_descriptor_header *)&hs_usb9pfs_source_desc,
+ (struct usb_descriptor_header *)&hs_usb9pfs_sink_desc,
+ NULL,
+};
+
+/* super speed support: */
+
+static struct usb_endpoint_descriptor ss_usb9pfs_source_desc = {
+ .bLength = USB_DT_ENDPOINT_SIZE,
+ .bDescriptorType = USB_DT_ENDPOINT,
+
+ .bmAttributes = USB_ENDPOINT_XFER_BULK,
+ .wMaxPacketSize = cpu_to_le16(1024),
+};
+
+static struct usb_ss_ep_comp_descriptor ss_usb9pfs_source_comp_desc = {
+ .bLength = USB_DT_SS_EP_COMP_SIZE,
+ .bDescriptorType = USB_DT_SS_ENDPOINT_COMP,
+ .bMaxBurst = 0,
+ .bmAttributes = 0,
+ .wBytesPerInterval = 0,
+};
+
+static struct usb_endpoint_descriptor ss_usb9pfs_sink_desc = {
+ .bLength = USB_DT_ENDPOINT_SIZE,
+ .bDescriptorType = USB_DT_ENDPOINT,
+
+ .bmAttributes = USB_ENDPOINT_XFER_BULK,
+ .wMaxPacketSize = cpu_to_le16(1024),
+};
+
+static struct usb_ss_ep_comp_descriptor ss_usb9pfs_sink_comp_desc = {
+ .bLength = USB_DT_SS_EP_COMP_SIZE,
+ .bDescriptorType = USB_DT_SS_ENDPOINT_COMP,
+ .bMaxBurst = 0,
+ .bmAttributes = 0,
+ .wBytesPerInterval = 0,
+};
+
+static struct usb_descriptor_header *ss_usb9pfs_descs[] = {
+ (struct usb_descriptor_header *)&usb9pfs_intf,
+ (struct usb_descriptor_header *)&ss_usb9pfs_source_desc,
+ (struct usb_descriptor_header *)&ss_usb9pfs_source_comp_desc,
+ (struct usb_descriptor_header *)&ss_usb9pfs_sink_desc,
+ (struct usb_descriptor_header *)&ss_usb9pfs_sink_comp_desc,
+ NULL,
+};
+
+/* function-specific strings: */
+static struct usb_string strings_usb9pfs[] = {
+ [0].s = "usb9pfs input to output",
+ { } /* end of list */
+};
+
+static struct usb_gadget_strings stringtab_usb9pfs = {
+ .language = 0x0409, /* en-us */
+ .strings = strings_usb9pfs,
+};
+
+static struct usb_gadget_strings *usb9pfs_strings[] = {
+ &stringtab_usb9pfs,
+ NULL,
+};
+
+/*-------------------------------------------------------------------------*/
+
+static int usb9pfs_func_bind(struct usb_configuration *c,
+ struct usb_function *f)
+{
+ struct f_usb9pfs *usb9pfs = func_to_usb9pfs(f);
+ struct f_usb9pfs_opts *opts;
+ struct usb_composite_dev *cdev = c->cdev;
+ int ret;
+ int id;
+
+ /* allocate interface ID(s) */
+ id = usb_interface_id(c, f);
+ if (id < 0)
+ return id;
+ usb9pfs_intf.bInterfaceNumber = id;
+
+ id = usb_string_id(cdev);
+ if (id < 0)
+ return id;
+ strings_usb9pfs[0].id = id;
+ usb9pfs_intf.iInterface = id;
+
+ /* allocate endpoints */
+ usb9pfs->in_ep = usb_ep_autoconfig(cdev->gadget,
+ &fs_usb9pfs_source_desc);
+ if (!usb9pfs->in_ep)
+ goto autoconf_fail;
+
+ usb9pfs->out_ep = usb_ep_autoconfig(cdev->gadget,
+ &fs_usb9pfs_sink_desc);
+ if (!usb9pfs->out_ep)
+ goto autoconf_fail;
+
+ /* support high speed hardware */
+ hs_usb9pfs_source_desc.bEndpointAddress =
+ fs_usb9pfs_source_desc.bEndpointAddress;
+ hs_usb9pfs_sink_desc.bEndpointAddress =
+ fs_usb9pfs_sink_desc.bEndpointAddress;
+
+ /* support super speed hardware */
+ ss_usb9pfs_source_desc.bEndpointAddress =
+ fs_usb9pfs_source_desc.bEndpointAddress;
+ ss_usb9pfs_sink_desc.bEndpointAddress =
+ fs_usb9pfs_sink_desc.bEndpointAddress;
+
+ ret = usb_assign_descriptors(f, fs_usb9pfs_descs, hs_usb9pfs_descs,
+ ss_usb9pfs_descs, ss_usb9pfs_descs);
+ if (ret)
+ return ret;
+
+ opts = container_of(f->fi, struct f_usb9pfs_opts, func_inst);
+ opts->dev->usb9pfs = usb9pfs;
+
+ dev_dbg(&cdev->gadget->dev, "%s speed %s: IN/%s, OUT/%s\n",
+ (gadget_is_superspeed(c->cdev->gadget) ? "super" :
+ (gadget_is_dualspeed(c->cdev->gadget) ? "dual" : "full")),
+ f->name, usb9pfs->in_ep->name, usb9pfs->out_ep->name);
+
+ return 0;
+
+autoconf_fail:
+ ERROR(cdev, "%s: can't autoconfigure on %s\n",
+ f->name, cdev->gadget->name);
+ return -ENODEV;
+}
+
+static void usb9pfs_func_unbind(struct usb_configuration *c,
+ struct usb_function *f)
+{
+ struct f_usb9pfs *usb9pfs = func_to_usb9pfs(f);
+
+ disable_usb9pfs(usb9pfs);
+}
+
+static void usb9pfs_free_func(struct usb_function *f)
+{
+ struct f_usb9pfs *usb9pfs = func_to_usb9pfs(f);
+ struct f_usb9pfs_opts *opts;
+
+ kfree(usb9pfs);
+
+ opts = container_of(f->fi, struct f_usb9pfs_opts, func_inst);
+
+ mutex_lock(&opts->lock);
+ opts->refcnt--;
+ mutex_unlock(&opts->lock);
+
+ usb_free_all_descriptors(f);
+}
+
+static int usb9pfs_set_alt(struct usb_function *f,
+ unsigned int intf, unsigned int alt)
+{
+ struct f_usb9pfs *usb9pfs = func_to_usb9pfs(f);
+ struct usb_composite_dev *cdev = f->config->cdev;
+
+ return enable_usb9pfs(cdev, usb9pfs);
+}
+
+static void usb9pfs_disable(struct usb_function *f)
+{
+ struct f_usb9pfs *usb9pfs = func_to_usb9pfs(f);
+
+ usb9pfs_clear_tx(usb9pfs);
+}
+
+static struct usb_function *usb9pfs_alloc(struct usb_function_instance *fi)
+{
+ struct f_usb9pfs_opts *usb9pfs_opts;
+ struct f_usb9pfs *usb9pfs;
+
+ usb9pfs = kzalloc(sizeof(*usb9pfs), GFP_KERNEL);
+ if (!usb9pfs)
+ return ERR_PTR(-ENOMEM);
+
+ spin_lock_init(&usb9pfs->lock);
+
+ init_completion(&usb9pfs->send);
+ init_completion(&usb9pfs->received);
+
+ usb9pfs_opts = container_of(fi, struct f_usb9pfs_opts, func_inst);
+
+ mutex_lock(&usb9pfs_opts->lock);
+ usb9pfs_opts->refcnt++;
+ mutex_unlock(&usb9pfs_opts->lock);
+
+ usb9pfs->buflen = usb9pfs_opts->buflen;
+
+ usb9pfs->function.name = "usb9pfs";
+ usb9pfs->function.bind = usb9pfs_func_bind;
+ usb9pfs->function.unbind = usb9pfs_func_unbind;
+ usb9pfs->function.set_alt = usb9pfs_set_alt;
+ usb9pfs->function.disable = usb9pfs_disable;
+ usb9pfs->function.strings = usb9pfs_strings;
+
+ usb9pfs->function.free_func = usb9pfs_free_func;
+
+ return &usb9pfs->function;
+}
+
+static inline struct f_usb9pfs_opts *to_f_usb9pfs_opts(struct config_item *item)
+{
+ return container_of(to_config_group(item), struct f_usb9pfs_opts,
+ func_inst.group);
+}
+
+static inline struct f_usb9pfs_opts *fi_to_f_usb9pfs_opts(struct usb_function_instance *fi)
+{
+ return container_of(fi, struct f_usb9pfs_opts, func_inst);
+}
+
+static void usb9pfs_attr_release(struct config_item *item)
+{
+ struct f_usb9pfs_opts *usb9pfs_opts = to_f_usb9pfs_opts(item);
+
+ usb_put_function_instance(&usb9pfs_opts->func_inst);
+}
+
+static struct configfs_item_operations usb9pfs_item_ops = {
+ .release = usb9pfs_attr_release,
+};
+
+static ssize_t f_usb9pfs_opts_buflen_show(struct config_item *item, char *page)
+{
+ struct f_usb9pfs_opts *opts = to_f_usb9pfs_opts(item);
+ int ret;
+
+ mutex_lock(&opts->lock);
+ ret = sysfs_emit(page, "%d\n", opts->buflen);
+ mutex_unlock(&opts->lock);
+
+ return ret;
+}
+
+static ssize_t f_usb9pfs_opts_buflen_store(struct config_item *item,
+ const char *page, size_t len)
+{
+ struct f_usb9pfs_opts *opts = to_f_usb9pfs_opts(item);
+ int ret;
+ u32 num;
+
+ guard(mutex)(&opts->lock);
+
+ if (opts->refcnt)
+ return -EBUSY;
+
+ ret = kstrtou32(page, 0, &num);
+ if (ret)
+ return ret;
+
+ opts->buflen = num;
+
+ return len;
+}
+
+CONFIGFS_ATTR(f_usb9pfs_opts_, buflen);
+
+static struct configfs_attribute *usb9pfs_attrs[] = {
+ &f_usb9pfs_opts_attr_buflen,
+ NULL,
+};
+
+static const struct config_item_type usb9pfs_func_type = {
+ .ct_item_ops = &usb9pfs_item_ops,
+ .ct_attrs = usb9pfs_attrs,
+ .ct_owner = THIS_MODULE,
+};
+
+static struct f_usb9pfs_dev *_usb9pfs_do_find_dev(const char *tag)
+{
+ struct f_usb9pfs_dev *usb9pfs_dev;
+
+ if (!tag)
+ return NULL;
+
+ list_for_each_entry(usb9pfs_dev, &usbg_instance_list, usb9pfs_instance) {
+ if (strcmp(usb9pfs_dev->tag, tag) == 0)
+ return usb9pfs_dev;
+ }
+
+ return NULL;
+}
+
+static int usb9pfs_tag_instance(struct f_usb9pfs_dev *dev, const char *tag)
+{
+ struct f_usb9pfs_dev *existing;
+ int ret = 0;
+
+ guard(mutex)(&usb9pfs_lock);
+
+ existing = _usb9pfs_do_find_dev(tag);
+ if (!existing)
+ strscpy(dev->tag, tag, ARRAY_SIZE(dev->tag));
+ else if (existing != dev)
+ ret = -EBUSY;
+
+ return ret;
+}
+
+static int usb9pfs_set_inst_tag(struct usb_function_instance *fi, const char *tag)
+{
+ if (strlen(tag) >= sizeof_field(struct f_usb9pfs_dev, tag))
+ return -ENAMETOOLONG;
+ return usb9pfs_tag_instance(fi_to_f_usb9pfs_opts(fi)->dev, tag);
+}
+
+static void usb9pfs_free_instance(struct usb_function_instance *fi)
+{
+ struct f_usb9pfs_opts *usb9pfs_opts =
+ container_of(fi, struct f_usb9pfs_opts, func_inst);
+ struct f_usb9pfs_dev *dev = usb9pfs_opts->dev;
+
+ mutex_lock(&usb9pfs_lock);
+ list_del(&dev->usb9pfs_instance);
+ mutex_unlock(&usb9pfs_lock);
+
+ kfree(usb9pfs_opts);
+}
+
+static struct usb_function_instance *usb9pfs_alloc_instance(void)
+{
+ struct f_usb9pfs_opts *usb9pfs_opts;
+ struct f_usb9pfs_dev *dev;
+
+ usb9pfs_opts = kzalloc(sizeof(*usb9pfs_opts), GFP_KERNEL);
+ if (!usb9pfs_opts)
+ return ERR_PTR(-ENOMEM);
+
+ mutex_init(&usb9pfs_opts->lock);
+
+ usb9pfs_opts->func_inst.set_inst_name = usb9pfs_set_inst_tag;
+ usb9pfs_opts->func_inst.free_func_inst = usb9pfs_free_instance;
+
+ usb9pfs_opts->buflen = DEFAULT_BUFLEN;
+
+ dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+ if (!dev) {
+ kfree(usb9pfs_opts);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ usb9pfs_opts->dev = dev;
+ dev->opts = usb9pfs_opts;
+
+ config_group_init_type_name(&usb9pfs_opts->func_inst.group, "",
+ &usb9pfs_func_type);
+
+ mutex_lock(&usb9pfs_lock);
+ list_add_tail(&dev->usb9pfs_instance, &usbg_instance_list);
+ mutex_unlock(&usb9pfs_lock);
+
+ return &usb9pfs_opts->func_inst;
+}
+DECLARE_USB_FUNCTION(usb9pfs, usb9pfs_alloc_instance, usb9pfs_alloc);
+
+static int __init usb9pfs_modinit(void)
+{
+ int ret;
+
+ INIT_LIST_HEAD(&usbg_instance_list);
+
+ ret = usb_function_register(&usb9pfsusb_func);
+ if (!ret)
+ v9fs_register_trans(&p9_usbg_trans);
+
+ return ret;
+}
+
+static void __exit usb9pfs_modexit(void)
+{
+ usb_function_unregister(&usb9pfsusb_func);
+ v9fs_unregister_trans(&p9_usbg_trans);
+}
+
+module_init(usb9pfs_modinit);
+module_exit(usb9pfs_modexit);
+
+MODULE_ALIAS_9P("usbg");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("USB gadget 9pfs transport");
+MODULE_AUTHOR("Michael Grzeschik");
diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c
new file mode 100644
index 000000000000..10c2dd486438
--- /dev/null
+++ b/net/9p/trans_virtio.c
@@ -0,0 +1,838 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * The Virtio 9p transport driver
+ *
+ * This is a block based transport driver based on the lguest block driver
+ * code.
+ *
+ * Copyright (C) 2007, 2008 Eric Van Hensbergen, IBM Corporation
+ *
+ * Based on virtio console driver
+ * Copyright (C) 2006, 2007 Rusty Russell, IBM Corporation
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/in.h>
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/ipv6.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/un.h>
+#include <linux/uaccess.h>
+#include <linux/inet.h>
+#include <linux/file.h>
+#include <linux/highmem.h>
+#include <linux/slab.h>
+#include <net/9p/9p.h>
+#include <linux/fs_context.h>
+#include <net/9p/client.h>
+#include <net/9p/transport.h>
+#include <linux/scatterlist.h>
+#include <linux/swap.h>
+#include <linux/virtio.h>
+#include <linux/virtio_9p.h>
+#include "trans_common.h"
+
+#define VIRTQUEUE_NUM 128
+
+/* a single mutex to manage channel initialization and attachment */
+static DEFINE_MUTEX(virtio_9p_lock);
+static DECLARE_WAIT_QUEUE_HEAD(vp_wq);
+static atomic_t vp_pinned = ATOMIC_INIT(0);
+
+/**
+ * struct virtio_chan - per-instance transport information
+ * @inuse: whether the channel is in use
+ * @lock: protects multiple elements within this structure
+ * @client: client instance
+ * @vdev: virtio dev associated with this channel
+ * @vq: virtio queue associated with this channel
+ * @ring_bufs_avail: flag to indicate there is some available in the ring buf
+ * @vc_wq: wait queue for waiting for thing to be added to ring buf
+ * @p9_max_pages: maximum number of pinned pages
+ * @sg: scatter gather list which is used to pack a request (protected?)
+ * @chan_list: linked list of channels
+ *
+ * We keep all per-channel information in a structure.
+ * This structure is allocated within the devices dev->mem space.
+ * A pointer to the structure will get put in the transport private.
+ *
+ */
+
+struct virtio_chan {
+ bool inuse;
+
+ spinlock_t lock;
+
+ struct p9_client *client;
+ struct virtio_device *vdev;
+ struct virtqueue *vq;
+ int ring_bufs_avail;
+ wait_queue_head_t *vc_wq;
+ /* This is global limit. Since we don't have a global structure,
+ * will be placing it in each channel.
+ */
+ unsigned long p9_max_pages;
+ /* Scatterlist: can be too big for stack. */
+ struct scatterlist sg[VIRTQUEUE_NUM];
+ /**
+ * @tag: name to identify a mount null terminated
+ */
+ char *tag;
+
+ struct list_head chan_list;
+};
+
+static struct list_head virtio_chan_list;
+
+/* How many bytes left in this page. */
+static unsigned int rest_of_page(void *data)
+{
+ return PAGE_SIZE - offset_in_page(data);
+}
+
+/**
+ * p9_virtio_close - reclaim resources of a channel
+ * @client: client instance
+ *
+ * This reclaims a channel by freeing its resources and
+ * resetting its inuse flag.
+ *
+ */
+
+static void p9_virtio_close(struct p9_client *client)
+{
+ struct virtio_chan *chan = client->trans;
+
+ mutex_lock(&virtio_9p_lock);
+ if (chan)
+ chan->inuse = false;
+ mutex_unlock(&virtio_9p_lock);
+}
+
+/**
+ * req_done - callback which signals activity from the server
+ * @vq: virtio queue activity was received on
+ *
+ * This notifies us that the server has triggered some activity
+ * on the virtio channel - most likely a response to request we
+ * sent. Figure out which requests now have responses and wake up
+ * those threads.
+ *
+ * Bugs: could do with some additional sanity checking, but appears to work.
+ *
+ */
+
+static void req_done(struct virtqueue *vq)
+{
+ struct virtio_chan *chan = vq->vdev->priv;
+ unsigned int len;
+ struct p9_req_t *req;
+ bool need_wakeup = false;
+ unsigned long flags;
+
+ p9_debug(P9_DEBUG_TRANS, ": request done\n");
+
+ spin_lock_irqsave(&chan->lock, flags);
+ while ((req = virtqueue_get_buf(chan->vq, &len)) != NULL) {
+ if (!chan->ring_bufs_avail) {
+ chan->ring_bufs_avail = 1;
+ need_wakeup = true;
+ }
+
+ if (len) {
+ req->rc.size = len;
+ p9_client_cb(chan->client, req, REQ_STATUS_RCVD);
+ }
+ }
+ spin_unlock_irqrestore(&chan->lock, flags);
+ /* Wakeup if anyone waiting for VirtIO ring space. */
+ if (need_wakeup)
+ wake_up(chan->vc_wq);
+}
+
+/**
+ * pack_sg_list - pack a scatter gather list from a linear buffer
+ * @sg: scatter/gather list to pack into
+ * @start: which segment of the sg_list to start at
+ * @limit: maximum segment to pack data to
+ * @data: data to pack into scatter/gather list
+ * @count: amount of data to pack into the scatter/gather list
+ *
+ * sg_lists have multiple segments of various sizes. This will pack
+ * arbitrary data into an existing scatter gather list, segmenting the
+ * data as necessary within constraints.
+ *
+ */
+
+static int pack_sg_list(struct scatterlist *sg, int start,
+ int limit, char *data, int count)
+{
+ int s;
+ int index = start;
+
+ while (count) {
+ s = rest_of_page(data);
+ if (s > count)
+ s = count;
+ BUG_ON(index >= limit);
+ /* Make sure we don't terminate early. */
+ sg_unmark_end(&sg[index]);
+ sg_set_buf(&sg[index++], data, s);
+ count -= s;
+ data += s;
+ }
+ if (index-start)
+ sg_mark_end(&sg[index - 1]);
+ return index-start;
+}
+
+/* We don't currently allow canceling of virtio requests */
+static int p9_virtio_cancel(struct p9_client *client, struct p9_req_t *req)
+{
+ return 1;
+}
+
+/* Reply won't come, so drop req ref */
+static int p9_virtio_cancelled(struct p9_client *client, struct p9_req_t *req)
+{
+ p9_req_put(client, req);
+ return 0;
+}
+
+/**
+ * pack_sg_list_p - Just like pack_sg_list. Instead of taking a buffer,
+ * this takes a list of pages.
+ * @sg: scatter/gather list to pack into
+ * @start: which segment of the sg_list to start at
+ * @limit: maximum number of pages in sg list.
+ * @pdata: a list of pages to add into sg.
+ * @nr_pages: number of pages to pack into the scatter/gather list
+ * @offs: amount of data in the beginning of first page _not_ to pack
+ * @count: amount of data to pack into the scatter/gather list
+ */
+static int
+pack_sg_list_p(struct scatterlist *sg, int start, int limit,
+ struct page **pdata, int nr_pages, size_t offs, int count)
+{
+ int i = 0, s;
+ int data_off = offs;
+ int index = start;
+
+ BUG_ON(nr_pages > (limit - start));
+ /*
+ * if the first page doesn't start at
+ * page boundary find the offset
+ */
+ while (nr_pages) {
+ s = PAGE_SIZE - data_off;
+ if (s > count)
+ s = count;
+ BUG_ON(index >= limit);
+ /* Make sure we don't terminate early. */
+ sg_unmark_end(&sg[index]);
+ sg_set_page(&sg[index++], pdata[i++], s, data_off);
+ data_off = 0;
+ count -= s;
+ nr_pages--;
+ }
+
+ if (index-start)
+ sg_mark_end(&sg[index - 1]);
+ return index - start;
+}
+
+/**
+ * p9_virtio_request - issue a request
+ * @client: client instance issuing the request
+ * @req: request to be issued
+ *
+ */
+
+static int
+p9_virtio_request(struct p9_client *client, struct p9_req_t *req)
+{
+ int err;
+ int in, out, out_sgs, in_sgs;
+ unsigned long flags;
+ struct virtio_chan *chan = client->trans;
+ struct scatterlist *sgs[2];
+
+ p9_debug(P9_DEBUG_TRANS, "9p debug: virtio request\n");
+
+ WRITE_ONCE(req->status, REQ_STATUS_SENT);
+req_retry:
+ spin_lock_irqsave(&chan->lock, flags);
+
+ out_sgs = in_sgs = 0;
+ /* Handle out VirtIO ring buffers */
+ out = pack_sg_list(chan->sg, 0,
+ VIRTQUEUE_NUM, req->tc.sdata, req->tc.size);
+ if (out)
+ sgs[out_sgs++] = chan->sg;
+
+ in = pack_sg_list(chan->sg, out,
+ VIRTQUEUE_NUM, req->rc.sdata, req->rc.capacity);
+ if (in)
+ sgs[out_sgs + in_sgs++] = chan->sg + out;
+
+ err = virtqueue_add_sgs(chan->vq, sgs, out_sgs, in_sgs, req,
+ GFP_ATOMIC);
+ if (err < 0) {
+ if (err == -ENOSPC) {
+ chan->ring_bufs_avail = 0;
+ spin_unlock_irqrestore(&chan->lock, flags);
+ err = wait_event_killable(*chan->vc_wq,
+ chan->ring_bufs_avail);
+ if (err == -ERESTARTSYS)
+ return err;
+
+ p9_debug(P9_DEBUG_TRANS, "Retry virtio request\n");
+ goto req_retry;
+ } else {
+ spin_unlock_irqrestore(&chan->lock, flags);
+ p9_debug(P9_DEBUG_TRANS,
+ "virtio rpc add_sgs returned failure\n");
+ return -EIO;
+ }
+ }
+ virtqueue_kick(chan->vq);
+ spin_unlock_irqrestore(&chan->lock, flags);
+
+ p9_debug(P9_DEBUG_TRANS, "virtio request kicked\n");
+ return 0;
+}
+
+static int p9_get_mapped_pages(struct virtio_chan *chan,
+ struct page ***pages,
+ struct iov_iter *data,
+ int count,
+ size_t *offs,
+ int *need_drop)
+{
+ int nr_pages;
+ int err;
+
+ if (!iov_iter_count(data))
+ return 0;
+
+ if (!iov_iter_is_kvec(data)) {
+ int n;
+ /*
+ * We allow only p9_max_pages pinned. We wait for the
+ * Other zc request to finish here
+ */
+ if (atomic_read(&vp_pinned) >= chan->p9_max_pages) {
+ err = wait_event_killable(vp_wq,
+ (atomic_read(&vp_pinned) < chan->p9_max_pages));
+ if (err == -ERESTARTSYS)
+ return err;
+ }
+ n = iov_iter_get_pages_alloc2(data, pages, count, offs);
+ if (n < 0)
+ return n;
+ *need_drop = 1;
+ nr_pages = DIV_ROUND_UP(n + *offs, PAGE_SIZE);
+ atomic_add(nr_pages, &vp_pinned);
+ return n;
+ } else {
+ /* kernel buffer, no need to pin pages */
+ int index;
+ size_t len;
+ void *p;
+
+ /* we'd already checked that it's non-empty */
+ while (1) {
+ len = iov_iter_single_seg_count(data);
+ if (likely(len)) {
+ p = data->kvec->iov_base + data->iov_offset;
+ break;
+ }
+ iov_iter_advance(data, 0);
+ }
+ if (len > count)
+ len = count;
+
+ nr_pages = DIV_ROUND_UP((unsigned long)p + len, PAGE_SIZE) -
+ (unsigned long)p / PAGE_SIZE;
+
+ *pages = kmalloc_array(nr_pages, sizeof(struct page *),
+ GFP_NOFS);
+ if (!*pages)
+ return -ENOMEM;
+
+ *need_drop = 0;
+ p -= (*offs = offset_in_page(p));
+ for (index = 0; index < nr_pages; index++) {
+ if (is_vmalloc_addr(p))
+ (*pages)[index] = vmalloc_to_page(p);
+ else
+ (*pages)[index] = kmap_to_page(p);
+ p += PAGE_SIZE;
+ }
+ iov_iter_advance(data, len);
+ return len;
+ }
+}
+
+static void handle_rerror(struct p9_req_t *req, int in_hdr_len,
+ size_t offs, struct page **pages)
+{
+ unsigned size, n;
+ void *to = req->rc.sdata + in_hdr_len;
+
+ // Fits entirely into the static data? Nothing to do.
+ if (req->rc.size < in_hdr_len || !pages)
+ return;
+
+ // Really long error message? Tough, truncate the reply. Might get
+ // rejected (we can't be arsed to adjust the size encoded in header,
+ // or string size for that matter), but it wouldn't be anything valid
+ // anyway.
+ if (unlikely(req->rc.size > P9_ZC_HDR_SZ))
+ req->rc.size = P9_ZC_HDR_SZ;
+
+ // data won't span more than two pages
+ size = req->rc.size - in_hdr_len;
+ n = PAGE_SIZE - offs;
+ if (size > n) {
+ memcpy_from_page(to, *pages++, offs, n);
+ offs = 0;
+ to += n;
+ size -= n;
+ }
+ memcpy_from_page(to, *pages, offs, size);
+}
+
+/**
+ * p9_virtio_zc_request - issue a zero copy request
+ * @client: client instance issuing the request
+ * @req: request to be issued
+ * @uidata: user buffer that should be used for zero copy read
+ * @uodata: user buffer that should be used for zero copy write
+ * @inlen: read buffer size
+ * @outlen: write buffer size
+ * @in_hdr_len: reader header size, This is the size of response protocol data
+ *
+ */
+static int
+p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req,
+ struct iov_iter *uidata, struct iov_iter *uodata,
+ int inlen, int outlen, int in_hdr_len)
+{
+ int in, out, err, out_sgs, in_sgs;
+ unsigned long flags;
+ int in_nr_pages = 0, out_nr_pages = 0;
+ struct page **in_pages = NULL, **out_pages = NULL;
+ struct virtio_chan *chan = client->trans;
+ struct scatterlist *sgs[4];
+ size_t offs = 0;
+ int need_drop = 0;
+ int kicked = 0;
+
+ p9_debug(P9_DEBUG_TRANS, "virtio request\n");
+
+ if (uodata) {
+ __le32 sz;
+ int n = p9_get_mapped_pages(chan, &out_pages, uodata,
+ outlen, &offs, &need_drop);
+ if (n < 0) {
+ err = n;
+ goto err_out;
+ }
+ out_nr_pages = DIV_ROUND_UP(n + offs, PAGE_SIZE);
+ if (n != outlen) {
+ __le32 v = cpu_to_le32(n);
+ memcpy(&req->tc.sdata[req->tc.size - 4], &v, 4);
+ outlen = n;
+ }
+ /* The size field of the message must include the length of the
+ * header and the length of the data. We didn't actually know
+ * the length of the data until this point so add it in now.
+ */
+ sz = cpu_to_le32(req->tc.size + outlen);
+ memcpy(&req->tc.sdata[0], &sz, sizeof(sz));
+ } else if (uidata) {
+ int n = p9_get_mapped_pages(chan, &in_pages, uidata,
+ inlen, &offs, &need_drop);
+ if (n < 0) {
+ err = n;
+ goto err_out;
+ }
+ in_nr_pages = DIV_ROUND_UP(n + offs, PAGE_SIZE);
+ if (n != inlen) {
+ __le32 v = cpu_to_le32(n);
+ memcpy(&req->tc.sdata[req->tc.size - 4], &v, 4);
+ inlen = n;
+ }
+ }
+ WRITE_ONCE(req->status, REQ_STATUS_SENT);
+req_retry_pinned:
+ spin_lock_irqsave(&chan->lock, flags);
+
+ out_sgs = in_sgs = 0;
+
+ /* out data */
+ out = pack_sg_list(chan->sg, 0,
+ VIRTQUEUE_NUM, req->tc.sdata, req->tc.size);
+
+ if (out)
+ sgs[out_sgs++] = chan->sg;
+
+ if (out_pages) {
+ sgs[out_sgs++] = chan->sg + out;
+ out += pack_sg_list_p(chan->sg, out, VIRTQUEUE_NUM,
+ out_pages, out_nr_pages, offs, outlen);
+ }
+
+ /*
+ * Take care of in data
+ * For example TREAD have 11.
+ * 11 is the read/write header = PDU Header(7) + IO Size (4).
+ * Arrange in such a way that server places header in the
+ * allocated memory and payload onto the user buffer.
+ */
+ in = pack_sg_list(chan->sg, out,
+ VIRTQUEUE_NUM, req->rc.sdata, in_hdr_len);
+ if (in)
+ sgs[out_sgs + in_sgs++] = chan->sg + out;
+
+ if (in_pages) {
+ sgs[out_sgs + in_sgs++] = chan->sg + out + in;
+ pack_sg_list_p(chan->sg, out + in, VIRTQUEUE_NUM,
+ in_pages, in_nr_pages, offs, inlen);
+ }
+
+ BUG_ON(out_sgs + in_sgs > ARRAY_SIZE(sgs));
+ err = virtqueue_add_sgs(chan->vq, sgs, out_sgs, in_sgs, req,
+ GFP_ATOMIC);
+ if (err < 0) {
+ if (err == -ENOSPC) {
+ chan->ring_bufs_avail = 0;
+ spin_unlock_irqrestore(&chan->lock, flags);
+ err = wait_event_killable(*chan->vc_wq,
+ chan->ring_bufs_avail);
+ if (err == -ERESTARTSYS)
+ goto err_out;
+
+ p9_debug(P9_DEBUG_TRANS, "Retry virtio request\n");
+ goto req_retry_pinned;
+ } else {
+ spin_unlock_irqrestore(&chan->lock, flags);
+ p9_debug(P9_DEBUG_TRANS,
+ "virtio rpc add_sgs returned failure\n");
+ err = -EIO;
+ goto err_out;
+ }
+ }
+ virtqueue_kick(chan->vq);
+ spin_unlock_irqrestore(&chan->lock, flags);
+ kicked = 1;
+ p9_debug(P9_DEBUG_TRANS, "virtio request kicked\n");
+ err = wait_event_killable(req->wq,
+ READ_ONCE(req->status) >= REQ_STATUS_RCVD);
+ // RERROR needs reply (== error string) in static data
+ if (READ_ONCE(req->status) == REQ_STATUS_RCVD &&
+ unlikely(req->rc.sdata[4] == P9_RERROR))
+ handle_rerror(req, in_hdr_len, offs, in_pages);
+
+ /*
+ * Non kernel buffers are pinned, unpin them
+ */
+err_out:
+ if (need_drop) {
+ if (in_pages) {
+ p9_release_pages(in_pages, in_nr_pages);
+ atomic_sub(in_nr_pages, &vp_pinned);
+ }
+ if (out_pages) {
+ p9_release_pages(out_pages, out_nr_pages);
+ atomic_sub(out_nr_pages, &vp_pinned);
+ }
+ /* wakeup anybody waiting for slots to pin pages */
+ wake_up(&vp_wq);
+ }
+ kvfree(in_pages);
+ kvfree(out_pages);
+ if (!kicked) {
+ /* reply won't come */
+ p9_req_put(client, req);
+ }
+ return err;
+}
+
+static ssize_t p9_mount_tag_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct virtio_chan *chan;
+ struct virtio_device *vdev;
+ int tag_len;
+
+ vdev = dev_to_virtio(dev);
+ chan = vdev->priv;
+ tag_len = strlen(chan->tag);
+
+ memcpy(buf, chan->tag, tag_len + 1);
+
+ return tag_len + 1;
+}
+
+static DEVICE_ATTR(mount_tag, 0444, p9_mount_tag_show, NULL);
+
+/**
+ * p9_virtio_probe - probe for existence of 9P virtio channels
+ * @vdev: virtio device to probe
+ *
+ * This probes for existing virtio channels.
+ *
+ */
+
+static int p9_virtio_probe(struct virtio_device *vdev)
+{
+ __u16 tag_len;
+ char *tag;
+ int err;
+ struct virtio_chan *chan;
+
+ if (!vdev->config->get) {
+ dev_err(&vdev->dev, "%s failure: config access disabled\n",
+ __func__);
+ return -EINVAL;
+ }
+
+ chan = kmalloc(sizeof(struct virtio_chan), GFP_KERNEL);
+ if (!chan) {
+ pr_err("Failed to allocate virtio 9P channel\n");
+ err = -ENOMEM;
+ goto fail;
+ }
+
+ chan->vdev = vdev;
+
+ /* We expect one virtqueue, for requests. */
+ chan->vq = virtio_find_single_vq(vdev, req_done, "requests");
+ if (IS_ERR(chan->vq)) {
+ err = PTR_ERR(chan->vq);
+ goto out_free_chan;
+ }
+ chan->vq->vdev->priv = chan;
+ spin_lock_init(&chan->lock);
+
+ sg_init_table(chan->sg, VIRTQUEUE_NUM);
+
+ chan->inuse = false;
+ if (virtio_has_feature(vdev, VIRTIO_9P_MOUNT_TAG)) {
+ virtio_cread(vdev, struct virtio_9p_config, tag_len, &tag_len);
+ } else {
+ err = -EINVAL;
+ goto out_free_vq;
+ }
+ tag = kzalloc(tag_len + 1, GFP_KERNEL);
+ if (!tag) {
+ err = -ENOMEM;
+ goto out_free_vq;
+ }
+
+ virtio_cread_bytes(vdev, offsetof(struct virtio_9p_config, tag),
+ tag, tag_len);
+ chan->tag = tag;
+ err = sysfs_create_file(&(vdev->dev.kobj), &dev_attr_mount_tag.attr);
+ if (err) {
+ goto out_free_tag;
+ }
+ chan->vc_wq = kmalloc(sizeof(wait_queue_head_t), GFP_KERNEL);
+ if (!chan->vc_wq) {
+ err = -ENOMEM;
+ goto out_remove_file;
+ }
+ init_waitqueue_head(chan->vc_wq);
+ chan->ring_bufs_avail = 1;
+ /* Ceiling limit to avoid denial of service attacks */
+ chan->p9_max_pages = nr_free_buffer_pages()/4;
+
+ virtio_device_ready(vdev);
+
+ mutex_lock(&virtio_9p_lock);
+ list_add_tail(&chan->chan_list, &virtio_chan_list);
+ mutex_unlock(&virtio_9p_lock);
+
+ /* Let udev rules use the new mount_tag attribute. */
+ kobject_uevent(&(vdev->dev.kobj), KOBJ_CHANGE);
+
+ return 0;
+
+out_remove_file:
+ sysfs_remove_file(&vdev->dev.kobj, &dev_attr_mount_tag.attr);
+out_free_tag:
+ kfree(tag);
+out_free_vq:
+ vdev->config->del_vqs(vdev);
+out_free_chan:
+ kfree(chan);
+fail:
+ return err;
+}
+
+
+/**
+ * p9_virtio_create - allocate a new virtio channel
+ * @client: client instance invoking this transport
+ * @fc: the filesystem context
+ *
+ * This sets up a transport channel for 9p communication. Right now
+ * we only match the first available channel, but eventually we could look up
+ * alternate channels by matching devname versus a virtio_config entry.
+ * We use a simple reference count mechanism to ensure that only a single
+ * mount has a channel open at a time.
+ *
+ */
+
+static int
+p9_virtio_create(struct p9_client *client, struct fs_context *fc)
+{
+ const char *devname = fc->source;
+ struct virtio_chan *chan;
+ int ret = -ENOENT;
+ int found = 0;
+
+ if (devname == NULL)
+ return -EINVAL;
+
+ mutex_lock(&virtio_9p_lock);
+ list_for_each_entry(chan, &virtio_chan_list, chan_list) {
+ if (!strcmp(devname, chan->tag)) {
+ if (!chan->inuse) {
+ chan->inuse = true;
+ found = 1;
+ break;
+ }
+ ret = -EBUSY;
+ }
+ }
+ mutex_unlock(&virtio_9p_lock);
+
+ if (!found) {
+ pr_err("no channels available for device %s\n", devname);
+ return ret;
+ }
+
+ client->trans = (void *)chan;
+ client->status = Connected;
+ chan->client = client;
+
+ return 0;
+}
+
+/**
+ * p9_virtio_remove - clean up resources associated with a virtio device
+ * @vdev: virtio device to remove
+ *
+ */
+
+static void p9_virtio_remove(struct virtio_device *vdev)
+{
+ struct virtio_chan *chan = vdev->priv;
+ unsigned long warning_time;
+
+ mutex_lock(&virtio_9p_lock);
+
+ /* Remove self from list so we don't get new users. */
+ list_del(&chan->chan_list);
+ warning_time = jiffies;
+
+ /* Wait for existing users to close. */
+ while (chan->inuse) {
+ mutex_unlock(&virtio_9p_lock);
+ msleep(250);
+ if (time_after(jiffies, warning_time + 10 * HZ)) {
+ dev_emerg(&vdev->dev,
+ "p9_virtio_remove: waiting for device in use.\n");
+ warning_time = jiffies;
+ }
+ mutex_lock(&virtio_9p_lock);
+ }
+
+ mutex_unlock(&virtio_9p_lock);
+
+ virtio_reset_device(vdev);
+ vdev->config->del_vqs(vdev);
+
+ sysfs_remove_file(&(vdev->dev.kobj), &dev_attr_mount_tag.attr);
+ kobject_uevent(&(vdev->dev.kobj), KOBJ_CHANGE);
+ kfree(chan->tag);
+ kfree(chan->vc_wq);
+ kfree(chan);
+
+}
+
+static struct virtio_device_id id_table[] = {
+ { VIRTIO_ID_9P, VIRTIO_DEV_ANY_ID },
+ { 0 },
+};
+
+static unsigned int features[] = {
+ VIRTIO_9P_MOUNT_TAG,
+};
+
+/* The standard "struct lguest_driver": */
+static struct virtio_driver p9_virtio_drv = {
+ .feature_table = features,
+ .feature_table_size = ARRAY_SIZE(features),
+ .driver.name = KBUILD_MODNAME,
+ .id_table = id_table,
+ .probe = p9_virtio_probe,
+ .remove = p9_virtio_remove,
+};
+
+static struct p9_trans_module p9_virtio_trans = {
+ .name = "virtio",
+ .create = p9_virtio_create,
+ .close = p9_virtio_close,
+ .request = p9_virtio_request,
+ .zc_request = p9_virtio_zc_request,
+ .cancel = p9_virtio_cancel,
+ .cancelled = p9_virtio_cancelled,
+ /*
+ * We leave one entry for input and one entry for response
+ * headers. We also skip one more entry to accommodate, address
+ * that are not at page boundary, that can result in an extra
+ * page in zero copy.
+ */
+ .maxsize = PAGE_SIZE * (VIRTQUEUE_NUM - 3),
+ .pooled_rbuffers = false,
+ .def = true,
+ .supports_vmalloc = false,
+ .owner = THIS_MODULE,
+};
+
+/* The standard init function */
+static int __init p9_virtio_init(void)
+{
+ int rc;
+
+ INIT_LIST_HEAD(&virtio_chan_list);
+
+ v9fs_register_trans(&p9_virtio_trans);
+ rc = register_virtio_driver(&p9_virtio_drv);
+ if (rc)
+ v9fs_unregister_trans(&p9_virtio_trans);
+
+ return rc;
+}
+
+static void __exit p9_virtio_cleanup(void)
+{
+ unregister_virtio_driver(&p9_virtio_drv);
+ v9fs_unregister_trans(&p9_virtio_trans);
+}
+
+module_init(p9_virtio_init);
+module_exit(p9_virtio_cleanup);
+MODULE_ALIAS_9P("virtio");
+
+MODULE_DEVICE_TABLE(virtio, id_table);
+MODULE_AUTHOR("Eric Van Hensbergen <ericvh@gmail.com>");
+MODULE_DESCRIPTION("Virtio 9p Transport");
+MODULE_LICENSE("GPL");
diff --git a/net/9p/trans_xen.c b/net/9p/trans_xen.c
new file mode 100644
index 000000000000..12f752a92332
--- /dev/null
+++ b/net/9p/trans_xen.c
@@ -0,0 +1,576 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * linux/fs/9p/trans_xen
+ *
+ * Xen transport layer.
+ *
+ * Copyright (C) 2017 by Stefano Stabellini <stefano@aporeto.com>
+ */
+
+#include <xen/events.h>
+#include <xen/grant_table.h>
+#include <xen/xen.h>
+#include <xen/xenbus.h>
+#include <xen/interface/io/9pfs.h>
+
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/fs_context.h>
+#include <net/9p/9p.h>
+#include <net/9p/client.h>
+#include <net/9p/transport.h>
+
+#define XEN_9PFS_NUM_RINGS 2
+#define XEN_9PFS_RING_ORDER 9
+#define XEN_9PFS_RING_SIZE(ring) XEN_FLEX_RING_SIZE(ring->intf->ring_order)
+
+struct xen_9pfs_header {
+ uint32_t size;
+ uint8_t id;
+ uint16_t tag;
+
+ /* uint8_t sdata[]; */
+} __attribute__((packed));
+
+/* One per ring, more than one per 9pfs share */
+struct xen_9pfs_dataring {
+ struct xen_9pfs_front_priv *priv;
+
+ struct xen_9pfs_data_intf *intf;
+ grant_ref_t ref;
+ int evtchn;
+ int irq;
+ /* protect a ring from concurrent accesses */
+ spinlock_t lock;
+
+ struct xen_9pfs_data data;
+ wait_queue_head_t wq;
+ struct work_struct work;
+};
+
+/* One per 9pfs share */
+struct xen_9pfs_front_priv {
+ struct list_head list;
+ struct xenbus_device *dev;
+ char *tag;
+ struct p9_client *client;
+
+ struct xen_9pfs_dataring *rings;
+};
+
+static LIST_HEAD(xen_9pfs_devs);
+static DEFINE_RWLOCK(xen_9pfs_lock);
+
+/* We don't currently allow canceling of requests */
+static int p9_xen_cancel(struct p9_client *client, struct p9_req_t *req)
+{
+ return 1;
+}
+
+static int p9_xen_create(struct p9_client *client, struct fs_context *fc)
+{
+ const char *addr = fc->source;
+ struct xen_9pfs_front_priv *priv;
+
+ if (addr == NULL)
+ return -EINVAL;
+
+ read_lock(&xen_9pfs_lock);
+ list_for_each_entry(priv, &xen_9pfs_devs, list) {
+ if (!strcmp(priv->tag, addr)) {
+ priv->client = client;
+ read_unlock(&xen_9pfs_lock);
+ return 0;
+ }
+ }
+ read_unlock(&xen_9pfs_lock);
+ return -EINVAL;
+}
+
+static void p9_xen_close(struct p9_client *client)
+{
+ struct xen_9pfs_front_priv *priv;
+
+ read_lock(&xen_9pfs_lock);
+ list_for_each_entry(priv, &xen_9pfs_devs, list) {
+ if (priv->client == client) {
+ priv->client = NULL;
+ read_unlock(&xen_9pfs_lock);
+ return;
+ }
+ }
+ read_unlock(&xen_9pfs_lock);
+}
+
+static bool p9_xen_write_todo(struct xen_9pfs_dataring *ring, RING_IDX size)
+{
+ RING_IDX cons, prod;
+
+ cons = ring->intf->out_cons;
+ prod = ring->intf->out_prod;
+ virt_mb();
+
+ return XEN_9PFS_RING_SIZE(ring) -
+ xen_9pfs_queued(prod, cons, XEN_9PFS_RING_SIZE(ring)) >= size;
+}
+
+static int p9_xen_request(struct p9_client *client, struct p9_req_t *p9_req)
+{
+ struct xen_9pfs_front_priv *priv;
+ RING_IDX cons, prod, masked_cons, masked_prod;
+ unsigned long flags;
+ u32 size = p9_req->tc.size;
+ struct xen_9pfs_dataring *ring;
+ int num;
+
+ read_lock(&xen_9pfs_lock);
+ list_for_each_entry(priv, &xen_9pfs_devs, list) {
+ if (priv->client == client)
+ break;
+ }
+ read_unlock(&xen_9pfs_lock);
+ if (list_entry_is_head(priv, &xen_9pfs_devs, list))
+ return -EINVAL;
+
+ num = p9_req->tc.tag % XEN_9PFS_NUM_RINGS;
+ ring = &priv->rings[num];
+
+again:
+ while (wait_event_killable(ring->wq,
+ p9_xen_write_todo(ring, size)) != 0)
+ ;
+
+ spin_lock_irqsave(&ring->lock, flags);
+ cons = ring->intf->out_cons;
+ prod = ring->intf->out_prod;
+ virt_mb();
+
+ if (XEN_9PFS_RING_SIZE(ring) -
+ xen_9pfs_queued(prod, cons, XEN_9PFS_RING_SIZE(ring)) < size) {
+ spin_unlock_irqrestore(&ring->lock, flags);
+ goto again;
+ }
+
+ masked_prod = xen_9pfs_mask(prod, XEN_9PFS_RING_SIZE(ring));
+ masked_cons = xen_9pfs_mask(cons, XEN_9PFS_RING_SIZE(ring));
+
+ xen_9pfs_write_packet(ring->data.out, p9_req->tc.sdata, size,
+ &masked_prod, masked_cons,
+ XEN_9PFS_RING_SIZE(ring));
+
+ WRITE_ONCE(p9_req->status, REQ_STATUS_SENT);
+ virt_wmb(); /* write ring before updating pointer */
+ prod += size;
+ ring->intf->out_prod = prod;
+ spin_unlock_irqrestore(&ring->lock, flags);
+ notify_remote_via_irq(ring->irq);
+ p9_req_put(client, p9_req);
+
+ return 0;
+}
+
+static void p9_xen_response(struct work_struct *work)
+{
+ struct xen_9pfs_front_priv *priv;
+ struct xen_9pfs_dataring *ring;
+ RING_IDX cons, prod, masked_cons, masked_prod;
+ struct xen_9pfs_header h;
+ struct p9_req_t *req;
+ int status;
+
+ ring = container_of(work, struct xen_9pfs_dataring, work);
+ priv = ring->priv;
+
+ while (1) {
+ cons = ring->intf->in_cons;
+ prod = ring->intf->in_prod;
+ virt_rmb();
+
+ if (xen_9pfs_queued(prod, cons, XEN_9PFS_RING_SIZE(ring)) <
+ sizeof(h)) {
+ notify_remote_via_irq(ring->irq);
+ return;
+ }
+
+ masked_prod = xen_9pfs_mask(prod, XEN_9PFS_RING_SIZE(ring));
+ masked_cons = xen_9pfs_mask(cons, XEN_9PFS_RING_SIZE(ring));
+
+ /* First, read just the header */
+ xen_9pfs_read_packet(&h, ring->data.in, sizeof(h),
+ masked_prod, &masked_cons,
+ XEN_9PFS_RING_SIZE(ring));
+
+ req = p9_tag_lookup(priv->client, h.tag);
+ if (!req || req->status != REQ_STATUS_SENT) {
+ dev_warn(&priv->dev->dev, "Wrong req tag=%x\n", h.tag);
+ cons += h.size;
+ virt_mb();
+ ring->intf->in_cons = cons;
+ continue;
+ }
+
+ if (h.size > req->rc.capacity) {
+ dev_warn(&priv->dev->dev,
+ "requested packet size too big: %d for tag %d with capacity %zd\n",
+ h.size, h.tag, req->rc.capacity);
+ WRITE_ONCE(req->status, REQ_STATUS_ERROR);
+ goto recv_error;
+ }
+
+ req->rc.size = h.size;
+ req->rc.id = h.id;
+ req->rc.tag = h.tag;
+ req->rc.offset = 0;
+
+ masked_cons = xen_9pfs_mask(cons, XEN_9PFS_RING_SIZE(ring));
+ /* Then, read the whole packet (including the header) */
+ xen_9pfs_read_packet(req->rc.sdata, ring->data.in, h.size,
+ masked_prod, &masked_cons,
+ XEN_9PFS_RING_SIZE(ring));
+
+recv_error:
+ virt_mb();
+ cons += h.size;
+ ring->intf->in_cons = cons;
+
+ status = (req->status != REQ_STATUS_ERROR) ?
+ REQ_STATUS_RCVD : REQ_STATUS_ERROR;
+
+ p9_client_cb(priv->client, req, status);
+ }
+}
+
+static irqreturn_t xen_9pfs_front_event_handler(int irq, void *r)
+{
+ struct xen_9pfs_dataring *ring = r;
+
+ if (!ring || !ring->priv->client) {
+ /* ignore spurious interrupt */
+ return IRQ_HANDLED;
+ }
+
+ wake_up_interruptible(&ring->wq);
+ schedule_work(&ring->work);
+
+ return IRQ_HANDLED;
+}
+
+static struct p9_trans_module p9_xen_trans = {
+ .name = "xen",
+ .maxsize = 1 << (XEN_9PFS_RING_ORDER + XEN_PAGE_SHIFT - 2),
+ .pooled_rbuffers = false,
+ .def = true,
+ .supports_vmalloc = false,
+ .create = p9_xen_create,
+ .close = p9_xen_close,
+ .request = p9_xen_request,
+ .cancel = p9_xen_cancel,
+ .owner = THIS_MODULE,
+};
+
+static const struct xenbus_device_id xen_9pfs_front_ids[] = {
+ { "9pfs" },
+ { "" }
+};
+
+static void xen_9pfs_front_free(struct xen_9pfs_front_priv *priv)
+{
+ int i, j;
+
+ write_lock(&xen_9pfs_lock);
+ list_del(&priv->list);
+ write_unlock(&xen_9pfs_lock);
+
+ for (i = 0; i < XEN_9PFS_NUM_RINGS; i++) {
+ struct xen_9pfs_dataring *ring = &priv->rings[i];
+
+ cancel_work_sync(&ring->work);
+
+ if (!priv->rings[i].intf)
+ break;
+ if (priv->rings[i].irq > 0)
+ unbind_from_irqhandler(priv->rings[i].irq, ring);
+ if (priv->rings[i].data.in) {
+ for (j = 0;
+ j < (1 << priv->rings[i].intf->ring_order);
+ j++) {
+ grant_ref_t ref;
+
+ ref = priv->rings[i].intf->ref[j];
+ gnttab_end_foreign_access(ref, NULL);
+ }
+ free_pages_exact(priv->rings[i].data.in,
+ 1UL << (priv->rings[i].intf->ring_order +
+ XEN_PAGE_SHIFT));
+ }
+ gnttab_end_foreign_access(priv->rings[i].ref, NULL);
+ free_page((unsigned long)priv->rings[i].intf);
+ }
+ kfree(priv->rings);
+ kfree(priv->tag);
+ kfree(priv);
+}
+
+static void xen_9pfs_front_remove(struct xenbus_device *dev)
+{
+ struct xen_9pfs_front_priv *priv = dev_get_drvdata(&dev->dev);
+
+ dev_set_drvdata(&dev->dev, NULL);
+ xen_9pfs_front_free(priv);
+}
+
+static int xen_9pfs_front_alloc_dataring(struct xenbus_device *dev,
+ struct xen_9pfs_dataring *ring,
+ unsigned int order)
+{
+ int i = 0;
+ int ret = -ENOMEM;
+ void *bytes = NULL;
+
+ init_waitqueue_head(&ring->wq);
+ spin_lock_init(&ring->lock);
+ INIT_WORK(&ring->work, p9_xen_response);
+
+ ring->intf = (struct xen_9pfs_data_intf *)get_zeroed_page(GFP_KERNEL);
+ if (!ring->intf)
+ return ret;
+ ret = gnttab_grant_foreign_access(dev->otherend_id,
+ virt_to_gfn(ring->intf), 0);
+ if (ret < 0)
+ goto out;
+ ring->ref = ret;
+ bytes = alloc_pages_exact(1UL << (order + XEN_PAGE_SHIFT),
+ GFP_KERNEL | __GFP_ZERO);
+ if (!bytes) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ for (; i < (1 << order); i++) {
+ ret = gnttab_grant_foreign_access(
+ dev->otherend_id, virt_to_gfn(bytes) + i, 0);
+ if (ret < 0)
+ goto out;
+ ring->intf->ref[i] = ret;
+ }
+ ring->intf->ring_order = order;
+ ring->data.in = bytes;
+ ring->data.out = bytes + XEN_FLEX_RING_SIZE(order);
+
+ ret = xenbus_alloc_evtchn(dev, &ring->evtchn);
+ if (ret)
+ goto out;
+ ring->irq = bind_evtchn_to_irqhandler(ring->evtchn,
+ xen_9pfs_front_event_handler,
+ 0, "xen_9pfs-frontend", ring);
+ if (ring->irq >= 0)
+ return 0;
+
+ xenbus_free_evtchn(dev, ring->evtchn);
+ ret = ring->irq;
+out:
+ if (bytes) {
+ for (i--; i >= 0; i--)
+ gnttab_end_foreign_access(ring->intf->ref[i], NULL);
+ free_pages_exact(bytes, 1UL << (order + XEN_PAGE_SHIFT));
+ }
+ gnttab_end_foreign_access(ring->ref, NULL);
+ free_page((unsigned long)ring->intf);
+ return ret;
+}
+
+static int xen_9pfs_front_init(struct xenbus_device *dev)
+{
+ int ret, i;
+ struct xenbus_transaction xbt;
+ struct xen_9pfs_front_priv *priv = dev_get_drvdata(&dev->dev);
+ char *versions, *v;
+ unsigned int max_rings, max_ring_order, len = 0;
+
+ versions = xenbus_read(XBT_NIL, dev->otherend, "versions", &len);
+ if (IS_ERR(versions))
+ return PTR_ERR(versions);
+ for (v = versions; *v; v++) {
+ if (simple_strtoul(v, &v, 10) == 1) {
+ v = NULL;
+ break;
+ }
+ }
+ if (v) {
+ kfree(versions);
+ return -EINVAL;
+ }
+ kfree(versions);
+ max_rings = xenbus_read_unsigned(dev->otherend, "max-rings", 0);
+ if (max_rings < XEN_9PFS_NUM_RINGS)
+ return -EINVAL;
+ max_ring_order = xenbus_read_unsigned(dev->otherend,
+ "max-ring-page-order", 0);
+ if (max_ring_order > XEN_9PFS_RING_ORDER)
+ max_ring_order = XEN_9PFS_RING_ORDER;
+ if (p9_xen_trans.maxsize > XEN_FLEX_RING_SIZE(max_ring_order))
+ p9_xen_trans.maxsize = XEN_FLEX_RING_SIZE(max_ring_order) / 2;
+
+ priv->rings = kcalloc(XEN_9PFS_NUM_RINGS, sizeof(*priv->rings),
+ GFP_KERNEL);
+ if (!priv->rings) {
+ kfree(priv);
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < XEN_9PFS_NUM_RINGS; i++) {
+ priv->rings[i].priv = priv;
+ ret = xen_9pfs_front_alloc_dataring(dev, &priv->rings[i],
+ max_ring_order);
+ if (ret < 0)
+ goto error;
+ }
+
+ again:
+ ret = xenbus_transaction_start(&xbt);
+ if (ret) {
+ xenbus_dev_fatal(dev, ret, "starting transaction");
+ goto error;
+ }
+ ret = xenbus_printf(xbt, dev->nodename, "version", "%u", 1);
+ if (ret)
+ goto error_xenbus;
+ ret = xenbus_printf(xbt, dev->nodename, "num-rings", "%u",
+ XEN_9PFS_NUM_RINGS);
+ if (ret)
+ goto error_xenbus;
+
+ for (i = 0; i < XEN_9PFS_NUM_RINGS; i++) {
+ char str[16];
+
+ BUILD_BUG_ON(XEN_9PFS_NUM_RINGS > 9);
+ sprintf(str, "ring-ref%d", i);
+ ret = xenbus_printf(xbt, dev->nodename, str, "%d",
+ priv->rings[i].ref);
+ if (ret)
+ goto error_xenbus;
+
+ sprintf(str, "event-channel-%d", i);
+ ret = xenbus_printf(xbt, dev->nodename, str, "%u",
+ priv->rings[i].evtchn);
+ if (ret)
+ goto error_xenbus;
+ }
+ priv->tag = xenbus_read(xbt, dev->nodename, "tag", NULL);
+ if (IS_ERR(priv->tag)) {
+ ret = PTR_ERR(priv->tag);
+ goto error_xenbus;
+ }
+ ret = xenbus_transaction_end(xbt, 0);
+ if (ret) {
+ if (ret == -EAGAIN)
+ goto again;
+ xenbus_dev_fatal(dev, ret, "completing transaction");
+ goto error;
+ }
+
+ xenbus_switch_state(dev, XenbusStateInitialised);
+ return 0;
+
+ error_xenbus:
+ xenbus_transaction_end(xbt, 1);
+ xenbus_dev_fatal(dev, ret, "writing xenstore");
+ error:
+ xen_9pfs_front_free(priv);
+ return ret;
+}
+
+static int xen_9pfs_front_probe(struct xenbus_device *dev,
+ const struct xenbus_device_id *id)
+{
+ struct xen_9pfs_front_priv *priv = NULL;
+
+ priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+ if (!priv)
+ return -ENOMEM;
+
+ priv->dev = dev;
+ dev_set_drvdata(&dev->dev, priv);
+
+ write_lock(&xen_9pfs_lock);
+ list_add_tail(&priv->list, &xen_9pfs_devs);
+ write_unlock(&xen_9pfs_lock);
+
+ return 0;
+}
+
+static int xen_9pfs_front_resume(struct xenbus_device *dev)
+{
+ dev_warn(&dev->dev, "suspend/resume unsupported\n");
+ return 0;
+}
+
+static void xen_9pfs_front_changed(struct xenbus_device *dev,
+ enum xenbus_state backend_state)
+{
+ switch (backend_state) {
+ case XenbusStateReconfiguring:
+ case XenbusStateReconfigured:
+ case XenbusStateInitialising:
+ case XenbusStateInitialised:
+ case XenbusStateUnknown:
+ break;
+
+ case XenbusStateInitWait:
+ if (dev->state != XenbusStateInitialising)
+ break;
+
+ xen_9pfs_front_init(dev);
+ break;
+
+ case XenbusStateConnected:
+ xenbus_switch_state(dev, XenbusStateConnected);
+ break;
+
+ case XenbusStateClosed:
+ if (dev->state == XenbusStateClosed)
+ break;
+ fallthrough; /* Missed the backend's CLOSING state */
+ case XenbusStateClosing:
+ xenbus_frontend_closed(dev);
+ break;
+ }
+}
+
+static struct xenbus_driver xen_9pfs_front_driver = {
+ .ids = xen_9pfs_front_ids,
+ .probe = xen_9pfs_front_probe,
+ .remove = xen_9pfs_front_remove,
+ .resume = xen_9pfs_front_resume,
+ .otherend_changed = xen_9pfs_front_changed,
+};
+
+static int __init p9_trans_xen_init(void)
+{
+ int rc;
+
+ if (!xen_domain())
+ return -ENODEV;
+
+ pr_info("Initialising Xen transport for 9pfs\n");
+
+ v9fs_register_trans(&p9_xen_trans);
+ rc = xenbus_register_frontend(&xen_9pfs_front_driver);
+ if (rc)
+ v9fs_unregister_trans(&p9_xen_trans);
+
+ return rc;
+}
+module_init(p9_trans_xen_init);
+MODULE_ALIAS_9P("xen");
+
+static void __exit p9_trans_xen_exit(void)
+{
+ v9fs_unregister_trans(&p9_xen_trans);
+ return xenbus_unregister_driver(&xen_9pfs_front_driver);
+}
+module_exit(p9_trans_xen_exit);
+
+MODULE_ALIAS("xen:9pfs");
+MODULE_AUTHOR("Stefano Stabellini <stefano@aporeto.com>");
+MODULE_DESCRIPTION("Xen Transport for 9P");
+MODULE_LICENSE("GPL");
diff --git a/net/Kconfig b/net/Kconfig
index 867f95032513..62266eaf0e95 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -1,17 +1,19 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# Network configuration
#
-menu "Networking"
-
-config NET
+menuconfig NET
bool "Networking support"
- ---help---
+ select NLATTR
+ select GENERIC_NET_UTILS
+ select BPF
+ help
Unless you really know what you are doing, you should say Y here.
The reason is that some programs need kernel networking support even
when running on a stand-alone machine that isn't connected to any
other computer.
-
+
If you are upgrading from an older kernel, you
should consider updating your networking tools too because changes
in the kernel and the tools often go hand in hand. The tools are
@@ -22,28 +24,99 @@ config NET
recommended to read the NET-HOWTO, available from
<http://www.tldp.org/docs.html#howto>.
-# Make sure that all config symbols are dependent on NET
if NET
-menu "Networking options"
+config WANT_COMPAT_NETLINK_MESSAGES
+ bool
+ help
+ This option can be selected by other options that need compat
+ netlink messages.
-config NETDEBUG
- bool "Network packet debugging"
+config COMPAT_NETLINK_MESSAGES
+ def_bool y
+ depends on COMPAT
+ depends on WEXT_CORE || WANT_COMPAT_NETLINK_MESSAGES
help
- You can say Y here if you want to get additional messages useful in
- debugging bad packets, but can overwhelm logs under denial of service
- attacks.
+ This option makes it possible to send different netlink messages
+ to tasks depending on whether the task is a compat task or not. To
+ achieve this, you need to set skb_shinfo(skb)->frag_list to the
+ compat skb before sending the skb, the netlink code will sort out
+ which message to actually pass to the task.
+
+ Newly written code should NEVER need this option but do
+ compat-independent messages instead!
+
+config NET_INGRESS
+ bool
+
+config NET_EGRESS
+ bool
+
+config NET_XGRESS
+ select NET_INGRESS
+ select NET_EGRESS
+ bool
+
+config NET_REDIRECT
+ bool
+
+config SKB_DECRYPTED
+ bool
+
+config SKB_EXTENSIONS
+ bool
+
+config NET_DEVMEM
+ def_bool y
+ select GENERIC_ALLOCATOR
+ depends on DMA_SHARED_BUFFER
+ depends on PAGE_POOL
+
+config NET_SHAPER
+ bool
+
+config NET_CRC32C
+ bool
+ select CRC32
+
+menu "Networking options"
source "net/packet/Kconfig"
+source "net/psp/Kconfig"
source "net/unix/Kconfig"
+source "net/tls/Kconfig"
source "net/xfrm/Kconfig"
+source "net/iucv/Kconfig"
+source "net/smc/Kconfig"
+source "drivers/dibs/Kconfig"
+source "net/xdp/Kconfig"
+
+config NET_HANDSHAKE
+ bool
+ depends on SUNRPC || NVME_TARGET_TCP || NVME_TCP
+ default y
+
+config NET_HANDSHAKE_KUNIT_TEST
+ tristate "KUnit tests for the handshake upcall mechanism" if !KUNIT_ALL_TESTS
+ default KUNIT_ALL_TESTS
+ depends on KUNIT
+ help
+ This builds the KUnit tests for the handshake upcall mechanism.
+
+ KUnit tests run during boot and output the results to the debug
+ log in TAP format (https://testanything.org/). Only useful for
+ kernel devs running KUnit test harness and are not for inclusion
+ into a production build.
+
+ For more information on KUnit and unit tests in general, refer
+ to the KUnit documentation in Documentation/dev-tools/kunit/.
config INET
bool "TCP/IP networking"
- ---help---
+ help
These are the protocols used on the Internet and on most local
Ethernets. It is highly recommended to say Y here (this will enlarge
- your kernel by about 144 KB), since some programs (e.g. the X window
+ your kernel by about 400 KB), since some programs (e.g. the X window
system) use TCP/IP even if your machine is not connected to any
other computer. You will get the so-called loopback device which
allows you to ping yourself (great fun, that!).
@@ -56,7 +129,7 @@ config INET
"Sysctl support" below, you can change various aspects of the
behavior of the TCP/IP code by writing to the (virtual) files in
/proc/sys/net/ipv4/*; the options are explained in the file
- <file:Documentation/networking/ip-sysctl.txt>.
+ <file:Documentation/networking/ip-sysctl.rst>.
Short answer: say Y.
@@ -64,6 +137,7 @@ if INET
source "net/ipv4/Kconfig"
source "net/ipv6/Kconfig"
source "net/netlabel/Kconfig"
+source "net/mptcp/Kconfig"
endif # if INET
@@ -74,9 +148,23 @@ config NETWORK_SECMARK
to nfmark, but designated for security purposes.
If you are unsure how to answer this question, answer N.
+config NET_PTP_CLASSIFY
+ def_bool n
+
+config NETWORK_PHY_TIMESTAMPING
+ bool "Timestamping in PHY devices"
+ select NET_PTP_CLASSIFY
+ help
+ This allows timestamping of network packets by PHYs (or
+ other MII bus snooping devices) with hardware timestamping
+ capabilities. This option adds some overhead in the transmit
+ and receive paths.
+
+ If you are unsure how to answer this question, answer N.
+
menuconfig NETFILTER
- bool "Network packet filtering (replaces ipchains)"
- ---help---
+ bool "Network packet filtering framework (Netfilter)"
+ help
Netfilter is a framework for filtering and mangling network packets
that pass through your Linux box.
@@ -127,26 +215,27 @@ menuconfig NETFILTER
<file:Documentation/Changes> under "iptables" for the location of
these packages.
- Make sure to say N to "Fast switching" below if you intend to say Y
- here, as Fast switching currently bypasses netfilter.
-
- Chances are that you should say Y here if you compile a kernel which
- will run as a router and N for regular hosts. If unsure, say N.
-
if NETFILTER
-config NETFILTER_DEBUG
- bool "Network packet filtering debugging"
+config NETFILTER_ADVANCED
+ bool "Advanced netfilter configuration"
depends on NETFILTER
+ default y
help
- You can say Y here if you want to get additional messages useful in
- debugging the netfilter code.
+ If you say Y here you can select between all the netfilter modules.
+ If you say N the more unusual ones will not be shown and the
+ basic ones needed by most people will default to 'M'.
+
+ If unsure, say Y.
config BRIDGE_NETFILTER
- bool "Bridged IP/ARP packets filtering"
- depends on BRIDGE && NETFILTER && INET
- default y
- ---help---
+ tristate "Bridged IP/ARP packets filtering"
+ depends on BRIDGE
+ depends on NETFILTER && INET
+ depends on NETFILTER_ADVANCED
+ select NETFILTER_FAMILY_BRIDGE
+ select SKB_EXTENSIONS
+ help
Enabling this option will let arptables resp. iptables see bridged
ARP resp. IP traffic. If you want a bridging firewall, you probably
want this option enabled.
@@ -158,74 +247,300 @@ config BRIDGE_NETFILTER
source "net/netfilter/Kconfig"
source "net/ipv4/netfilter/Kconfig"
source "net/ipv6/netfilter/Kconfig"
-source "net/decnet/netfilter/Kconfig"
source "net/bridge/netfilter/Kconfig"
-endif
+endif # if NETFILTER
-source "net/dccp/Kconfig"
source "net/sctp/Kconfig"
+source "net/rds/Kconfig"
source "net/tipc/Kconfig"
source "net/atm/Kconfig"
+source "net/l2tp/Kconfig"
+source "net/802/Kconfig"
source "net/bridge/Kconfig"
+source "net/dsa/Kconfig"
source "net/8021q/Kconfig"
-source "net/decnet/Kconfig"
source "net/llc/Kconfig"
-source "net/ipx/Kconfig"
-source "drivers/net/appletalk/Kconfig"
+source "net/appletalk/Kconfig"
source "net/x25/Kconfig"
source "net/lapb/Kconfig"
-source "net/econet/Kconfig"
-source "net/wanrouter/Kconfig"
+source "net/phonet/Kconfig"
+source "net/6lowpan/Kconfig"
+source "net/ieee802154/Kconfig"
+source "net/mac802154/Kconfig"
source "net/sched/Kconfig"
+source "net/dcb/Kconfig"
+source "net/dns_resolver/Kconfig"
+source "net/batman-adv/Kconfig"
+source "net/openvswitch/Kconfig"
+source "net/vmw_vsock/Kconfig"
+source "net/netlink/Kconfig"
+source "net/mpls/Kconfig"
+source "net/nsh/Kconfig"
+source "net/hsr/Kconfig"
+source "net/switchdev/Kconfig"
+source "net/l3mdev/Kconfig"
+source "net/qrtr/Kconfig"
+source "net/ncsi/Kconfig"
+
+config PCPU_DEV_REFCNT
+ bool "Use percpu variables to maintain network device refcount"
+ depends on SMP
+ default y
+ help
+ network device refcount are using per cpu variables if this option is set.
+ This can be forced to N to detect underflows (with a performance drop).
+
+config MAX_SKB_FRAGS
+ int "Maximum number of fragments per skb_shared_info"
+ range 17 45
+ default 17
+ help
+ Having more fragments per skb_shared_info can help GRO efficiency.
+ This helps BIG TCP workloads, but might expose bugs in some
+ legacy drivers.
+ This also increases memory overhead of small packets,
+ and in drivers using build_skb().
+ If unsure, say 17.
+
+config RPS
+ bool "Receive packet steering"
+ depends on SMP && SYSFS
+ default y
+ help
+ Software receive side packet steering (RPS) distributes the
+ load of received packet processing across multiple CPUs.
+
+config RFS_ACCEL
+ bool "Hardware acceleration of RFS"
+ depends on RPS
+ select CPU_RMAP
+ default y
+ help
+ Allowing drivers for multiqueue hardware with flow filter tables to
+ accelerate RFS.
+
+config SOCK_RX_QUEUE_MAPPING
+ bool
+
+config XPS
+ bool
+ depends on SMP
+ select SOCK_RX_QUEUE_MAPPING
+ default y
+
+config HWBM
+ bool
+
+config CGROUP_NET_PRIO
+ bool "Network priority cgroup"
+ depends on CGROUPS
+ select SOCK_CGROUP_DATA
+ help
+ Cgroup subsystem for use in assigning processes to network priorities on
+ a per-interface basis.
+
+config CGROUP_NET_CLASSID
+ bool "Network classid cgroup"
+ depends on CGROUPS
+ select SOCK_CGROUP_DATA
+ help
+ Cgroup subsystem for use as general purpose socket classid marker that is
+ being used in cls_cgroup and for netfilter matching.
+
+config NET_RX_BUSY_POLL
+ bool
+ default y if !PREEMPT_RT || (PREEMPT_RT && !NETCONSOLE)
+
+config BQL
+ bool
+ prompt "Enable Byte Queue Limits"
+ depends on SYSFS
+ select DQL
+ default y
+
+config BPF_STREAM_PARSER
+ bool "enable BPF STREAM_PARSER"
+ depends on INET
+ depends on BPF_SYSCALL
+ depends on CGROUP_BPF
+ select STREAM_PARSER
+ select NET_SOCK_MSG
+ help
+ Enabling this allows a TCP stream parser to be used with
+ BPF_MAP_TYPE_SOCKMAP.
+
+config NET_FLOW_LIMIT
+ bool "Net flow limit"
+ depends on RPS
+ default y
+ help
+ The network stack has to drop packets when a receive processing CPU's
+ backlog reaches netdev_max_backlog. If a few out of many active flows
+ generate the vast majority of load, drop their traffic earlier to
+ maintain capacity for the other flows. This feature provides servers
+ with many clients some protection against DoS by a single (spoofed)
+ flow that greatly exceeds average workload.
menu "Network testing"
config NET_PKTGEN
tristate "Packet Generator (USE WITH CAUTION)"
- depends on PROC_FS
- ---help---
+ depends on INET && PROC_FS
+ help
This module will inject preconfigured packets, at a configurable
rate, out of a given interface. It is used for network interface
stress testing and performance analysis. If you don't understand
what was just said, you don't need it: say N.
Documentation on how to use the packet generator can be found
- at <file:Documentation/networking/pktgen.txt>.
+ at <file:Documentation/networking/pktgen.rst>.
To compile this code as a module, choose M here: the
module will be called pktgen.
-config NET_TCPPROBE
- tristate "TCP connection probing"
- depends on INET && EXPERIMENTAL && PROC_FS && KPROBES
- ---help---
- This module allows for capturing the changes to TCP connection
- state in response to incoming packets. It is used for debugging
- TCP congestion avoidance modules. If you don't understand
- what was just said, you don't need it: say N.
-
- Documentation on how to use TCP connection probing can be found
- at http://linux-net.osdl.org/index.php/TcpProbe
-
- To compile this code as a module, choose M here: the
- module will be called tcp_probe.
+config NET_DROP_MONITOR
+ tristate "Legacy network packet drop alerting service"
+ depends on INET && TRACEPOINTS
+ help
+ This feature provides an alerting service to userspace in the
+ event that packets are discarded in the network stack. Alerts
+ are broadcast via netlink socket to any listening user space
+ process. This feature is NOT related to "perf" based drop monitoring.
+ Say N here unless you need to support older userspace tools like
+ "dropwatch".
-endmenu
+endmenu # Network testing
-endmenu
+endmenu # Networking options
source "net/ax25/Kconfig"
-source "net/irda/Kconfig"
+source "net/can/Kconfig"
source "net/bluetooth/Kconfig"
-source "net/ieee80211/Kconfig"
+source "net/rxrpc/Kconfig"
+source "net/kcm/Kconfig"
+source "net/strparser/Kconfig"
+source "net/mctp/Kconfig"
-config WIRELESS_EXT
+config FIB_RULES
bool
-config FIB_RULES
+menuconfig WIRELESS
+ bool "Wireless"
+ depends on !S390
+ default y
+
+if WIRELESS
+
+source "net/wireless/Kconfig"
+source "net/mac80211/Kconfig"
+
+endif # WIRELESS
+
+source "net/rfkill/Kconfig"
+source "net/9p/Kconfig"
+source "net/caif/Kconfig"
+source "net/ceph/Kconfig"
+source "net/nfc/Kconfig"
+source "net/psample/Kconfig"
+source "net/ife/Kconfig"
+
+config LWTUNNEL
+ bool "Network light weight tunnels"
+ help
+ This feature provides an infrastructure to support light weight
+ tunnels like mpls. There is no netdevice associated with a light
+ weight tunnel endpoint. Tunnel encapsulation parameters are stored
+ with light weight tunnel state associated with fib routes.
+
+config LWTUNNEL_BPF
+ bool "Execute BPF program as route nexthop action"
+ depends on LWTUNNEL && INET
+ default y if LWTUNNEL=y
+ help
+ Allows to run BPF programs as a nexthop action following a route
+ lookup for incoming and outgoing packets.
+
+config DST_CACHE
bool
+ default n
-endif # if NET
-endmenu # Networking
+config GRO_CELLS
+ bool
+ default n
+
+config SOCK_VALIDATE_XMIT
+ bool
+
+config NET_IEEE8021Q_HELPERS
+ bool
+
+config NET_SELFTESTS
+ def_tristate PHYLIB
+ depends on PHYLIB && INET
+
+config NET_SOCK_MSG
+ bool
+ default n
+ help
+ The NET_SOCK_MSG provides a framework for plain sockets (e.g. TCP) or
+ ULPs (upper layer modules, e.g. TLS) to process L7 application data
+ with the help of BPF programs.
+
+config NET_DEVLINK
+ bool
+ default n
+config PAGE_POOL
+ bool
+
+config PAGE_POOL_STATS
+ default n
+ bool "Page pool stats"
+ depends on PAGE_POOL
+ help
+ Enable page pool statistics to track page allocation and recycling
+ in page pools. This option incurs additional CPU cost in allocation
+ and recycle paths and additional memory cost to store the statistics.
+ These statistics are only available if this option is enabled and if
+ the driver using the page pool supports exporting this data.
+
+ If unsure, say N.
+
+config FAILOVER
+ tristate "Generic failover module"
+ help
+ The failover module provides a generic interface for paravirtual
+ drivers to register a netdev and a set of ops with a failover
+ instance. The ops are used as event handlers that get called to
+ handle netdev register/unregister/link change/name change events
+ on slave pci ethernet devices with the same mac address as the
+ failover netdev. This enables paravirtual drivers to use a
+ VF as an accelerated low latency datapath. It also allows live
+ migration of VMs with direct attached VFs by failing over to the
+ paravirtual datapath when the VF is unplugged.
+
+config ETHTOOL_NETLINK
+ bool "Netlink interface for ethtool"
+ select DIMLIB
+ default y
+ help
+ An alternative userspace interface for ethtool based on generic
+ netlink. It provides better extensibility and some new features,
+ e.g. notification messages.
+
+config NETDEV_ADDR_LIST_TEST
+ tristate "Unit tests for device address list"
+ default KUNIT_ALL_TESTS
+ depends on KUNIT
+
+config NET_TEST
+ tristate "KUnit tests for networking" if !KUNIT_ALL_TESTS
+ depends on KUNIT
+ default KUNIT_ALL_TESTS
+ help
+ KUnit tests covering core networking infra, such as sk_buff.
+
+ If unsure, say N.
+
+endif # if NET
diff --git a/net/Kconfig.debug b/net/Kconfig.debug
new file mode 100644
index 000000000000..277fab8c4d77
--- /dev/null
+++ b/net/Kconfig.debug
@@ -0,0 +1,41 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+config NET_DEV_REFCNT_TRACKER
+ bool "Enable net device refcount tracking"
+ depends on DEBUG_KERNEL && STACKTRACE_SUPPORT && NET
+ select REF_TRACKER
+ default n
+ help
+ Enable debugging feature to track device references.
+ This adds memory and cpu costs.
+
+config NET_NS_REFCNT_TRACKER
+ bool "Enable networking namespace refcount tracking"
+ depends on DEBUG_KERNEL && STACKTRACE_SUPPORT && NET
+ select REF_TRACKER
+ default n
+ help
+ Enable debugging feature to track netns references.
+ This adds memory and cpu costs.
+
+config DEBUG_NET
+ bool "Add generic networking debug"
+ depends on DEBUG_KERNEL && NET
+ help
+ Enable extra sanity checks in networking.
+ This is mostly used by fuzzers, but is safe to select.
+
+config DEBUG_NET_SMALL_RTNL
+ bool "Add extra per-netns mutex inside RTNL"
+ depends on DEBUG_KERNEL && NET && LOCK_DEBUGGING_SUPPORT
+ select PROVE_LOCKING
+ default n
+ help
+ rtnl_lock() is being replaced with rtnl_net_lock() that
+ acquires the global RTNL and a small per-netns RTNL mutex.
+
+ During the conversion, rtnl_net_lock() just adds an extra
+ mutex in every RTNL scope and slows down the operations.
+
+ Once the conversion completes, rtnl_lock() will be removed
+ and rtnetlink will gain per-netns scalability.
diff --git a/net/Makefile b/net/Makefile
index ad4d14f4bb29..90e3d72bf58b 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
#
# Makefile for the linux networking.
#
@@ -5,49 +6,77 @@
# Rewritten to use lists instead of if-statements.
#
-obj-y := nonet.o
+obj-y := devres.o socket.o core/
-obj-$(CONFIG_NET) := socket.o core/
-
-tmp-$(CONFIG_COMPAT) := compat.o
-obj-$(CONFIG_NET) += $(tmp-y)
+obj-$(CONFIG_COMPAT) += compat.o
# LLC has to be linked before the files in net/802/
obj-$(CONFIG_LLC) += llc/
-obj-$(CONFIG_NET) += ethernet/ 802/ sched/ netlink/
+obj-y += ethernet/ 802/ sched/ netlink/ bpf/ ethtool/
obj-$(CONFIG_NETFILTER) += netfilter/
obj-$(CONFIG_INET) += ipv4/
+obj-$(CONFIG_TLS) += tls/
obj-$(CONFIG_XFRM) += xfrm/
obj-$(CONFIG_UNIX) += unix/
-ifneq ($(CONFIG_IPV6),)
+obj-$(CONFIG_INET_PSP) += psp/
obj-y += ipv6/
-endif
obj-$(CONFIG_PACKET) += packet/
obj-$(CONFIG_NET_KEY) += key/
-obj-$(CONFIG_NET_SCHED) += sched/
obj-$(CONFIG_BRIDGE) += bridge/
-obj-$(CONFIG_IPX) += ipx/
+obj-$(CONFIG_NET_DEVLINK) += devlink/
+obj-y += dsa/
obj-$(CONFIG_ATALK) += appletalk/
-obj-$(CONFIG_WAN_ROUTER) += wanrouter/
obj-$(CONFIG_X25) += x25/
obj-$(CONFIG_LAPB) += lapb/
obj-$(CONFIG_NETROM) += netrom/
obj-$(CONFIG_ROSE) += rose/
obj-$(CONFIG_AX25) += ax25/
-obj-$(CONFIG_IRDA) += irda/
+obj-$(CONFIG_CAN) += can/
obj-$(CONFIG_BT) += bluetooth/
obj-$(CONFIG_SUNRPC) += sunrpc/
-obj-$(CONFIG_RXRPC) += rxrpc/
+obj-$(CONFIG_AF_RXRPC) += rxrpc/
+obj-$(CONFIG_AF_KCM) += kcm/
+obj-$(CONFIG_STREAM_PARSER) += strparser/
obj-$(CONFIG_ATM) += atm/
-obj-$(CONFIG_DECNET) += decnet/
-obj-$(CONFIG_ECONET) += econet/
-obj-$(CONFIG_VLAN_8021Q) += 8021q/
-obj-$(CONFIG_IP_DCCP) += dccp/
+obj-$(CONFIG_L2TP) += l2tp/
+obj-$(CONFIG_PHONET) += phonet/
+ifneq ($(CONFIG_VLAN_8021Q),)
+obj-y += 8021q/
+endif
obj-$(CONFIG_IP_SCTP) += sctp/
-obj-$(CONFIG_IEEE80211) += ieee80211/
+obj-$(CONFIG_RDS) += rds/
+obj-$(CONFIG_WIRELESS) += wireless/
+obj-$(CONFIG_MAC80211) += mac80211/
obj-$(CONFIG_TIPC) += tipc/
obj-$(CONFIG_NETLABEL) += netlabel/
+obj-$(CONFIG_IUCV) += iucv/
+obj-$(CONFIG_SMC) += smc/
+obj-$(CONFIG_RFKILL) += rfkill/
+obj-$(CONFIG_NET_9P) += 9p/
+obj-$(CONFIG_CAIF) += caif/
+obj-$(CONFIG_DCB) += dcb/
+obj-$(CONFIG_6LOWPAN) += 6lowpan/
+obj-$(CONFIG_IEEE802154) += ieee802154/
+obj-$(CONFIG_MAC802154) += mac802154/
-ifeq ($(CONFIG_NET),y)
obj-$(CONFIG_SYSCTL) += sysctl_net.o
-endif
+obj-$(CONFIG_DNS_RESOLVER) += dns_resolver/
+obj-$(CONFIG_CEPH_LIB) += ceph/
+obj-$(CONFIG_BATMAN_ADV) += batman-adv/
+obj-$(CONFIG_NFC) += nfc/
+obj-$(CONFIG_PSAMPLE) += psample/
+obj-$(CONFIG_NET_IFE) += ife/
+obj-$(CONFIG_OPENVSWITCH) += openvswitch/
+obj-$(CONFIG_VSOCKETS) += vmw_vsock/
+obj-$(CONFIG_MPLS) += mpls/
+obj-$(CONFIG_NET_NSH) += nsh/
+obj-$(CONFIG_HSR) += hsr/
+obj-$(CONFIG_NET_SWITCHDEV) += switchdev/
+obj-$(CONFIG_NET_L3_MASTER_DEV) += l3mdev/
+obj-$(CONFIG_QRTR) += qrtr/
+obj-$(CONFIG_NET_NCSI) += ncsi/
+obj-$(CONFIG_XDP_SOCKETS) += xdp/
+obj-$(CONFIG_MPTCP) += mptcp/
+obj-$(CONFIG_MCTP) += mctp/
+obj-$(CONFIG_NET_HANDSHAKE) += handshake/
+obj-$(CONFIG_NET_SHAPER) += shaper/
diff --git a/net/TUNABLE b/net/TUNABLE
deleted file mode 100644
index 9913211f07a7..000000000000
--- a/net/TUNABLE
+++ /dev/null
@@ -1,50 +0,0 @@
-The following parameters should be tunable at compile time. Some of them
-exist as sysctls too.
-
-This is far from complete
-
-Item Description
-----------------------------------------------------------------------------
-MAX_LINKS Maximum number of netlink minor devices. (1-32)
-RIF_TABLE_SIZE Token ring RIF cache size (tunable)
-AARP_HASH_SIZE Size of Appletalk hash table (tunable)
-AX25_DEF_T1 AX.25 parameters. These are all tunable via
-AX25_DEF_T2 SIOCAX25SETPARMS
-AX25_DEF_T3 T1-T3,N2 have the meanings in the specification
-AX25_DEF_N2
-AX25_DEF_AXDEFMODE 8 = normal 128 is PE1CHL extended
-AX25_DEF_IPDEFMODE 'D' - datagram 'V' - virtual connection
-AX25_DEF_BACKOFF 'E'xponential 'L'inear
-AX25_DEF_NETROM Allow netrom 1=Y
-AX25_DF_TEXT Allow PID=Text 1=Y
-AX25_DEF_WINDOW Window for normal mode
-AX25_DEF_EWINDOW Window for PE1CHL mode
-AX25_DEF_DIGI 1 for inband 2 for cross band 3 for both
-AX25_DEF_CONMODE Allow connected modes 1=Yes
-AX25_ROUTE_MAX AX.25 route cache size - no currently tunable
-Unnamed (16) Number of protocol hash slots (tunable)
-DEV_NUMBUFFS Number of priority levels (not easily tunable)
-Unnamed (300) Maximum packet backlog queue (tunable)
-MAX_IOVEC Maximum number of iovecs in a message (tunable)
-MIN_WINDOW Offered minimum window (tunable)
-MAX_WINDOW Offered maximum window (tunable)
-MAX_HEADER Largest physical header (tunable)
-MAX_ADDR_LEN Largest physical address (tunable)
-SOCK_ARRAY_SIZE IP socket array hash size (tunable)
-IP_MAX_MEMBERSHIPS Largest number of groups per socket (BSD style) (tunable)
-16 Hard coded constant for amount of room allowed for
- cache align and faster forwarding (tunable)
-IP_FRAG_TIME Time we hold a fragment for. (tunable)
-PORT_MASQ_BEGIN First port reserved for masquerade (tunable)
-PORT_MASQ_END Last port used for masquerade (tunable)
-MASQUERADE_EXPIRE_TCP_FIN Time we keep a masquerade for after a FIN
-MASQUERADE_EXPIRE_UDP Time we keep a UDP masquerade for (tunable)
-MAXVIFS Maximum mrouted vifs (1-32)
-MFC_LINES Lines in the multicast router cache (tunable)
-
-NetROM parameters are tunable via an ioctl passing a struct
-
-4000 Size a Unix domain socket malloc falls back to
- (tunable) should be 8K - a bit for 8K machines like
- the ALPHA
-
diff --git a/net/appletalk/Kconfig b/net/appletalk/Kconfig
new file mode 100644
index 000000000000..041141abf925
--- /dev/null
+++ b/net/appletalk/Kconfig
@@ -0,0 +1,30 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# Appletalk configuration
+#
+config ATALK
+ tristate "Appletalk protocol support"
+ select LLC
+ help
+ AppleTalk is the protocol that Apple computers can use to communicate
+ on a network. If your Linux box is connected to such a network and you
+ wish to connect to it, say Y. You will need to use the netatalk package
+ so that your Linux box can act as a print and file server for Macs as
+ well as access AppleTalk printers. Check out
+ <http://www.zettabyte.net/netatalk/> on the WWW for details.
+ EtherTalk is the name used for AppleTalk over Ethernet and the
+ cheaper and slower LocalTalk is AppleTalk over a proprietary Apple
+ network using serial links. EtherTalk and LocalTalk are fully
+ supported by Linux.
+
+ General information about how to connect Linux, Windows machines and
+ Macs is on the WWW at <http://www.eats.com/linux_mac_win.html>. The
+ NET3-4-HOWTO, available from
+ <http://www.tldp.org/docs.html#howto>, contains valuable
+ information as well.
+
+ To compile this driver as a module, choose M here: the module will be
+ called appletalk. You almost certainly want to compile it as a
+ module so you can restart your AppleTalk stack without rebooting
+ your machine. I hear that the GNU boycott of Apple is over, so
+ even politically correct people are allowed to say Y here.
diff --git a/net/appletalk/Makefile b/net/appletalk/Makefile
index 5cda56edef57..152312a15180 100644
--- a/net/appletalk/Makefile
+++ b/net/appletalk/Makefile
@@ -1,9 +1,10 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# Makefile for the Linux AppleTalk layer.
#
obj-$(CONFIG_ATALK) += appletalk.o
-appletalk-y := aarp.o ddp.o dev.o
+appletalk-y := aarp.o ddp.o
appletalk-$(CONFIG_PROC_FS) += atalk_proc.o
appletalk-$(CONFIG_SYSCTL) += sysctl_net_atalk.o
diff --git a/net/appletalk/aarp.c b/net/appletalk/aarp.c
index f3777ec5bcb9..4744e3fd4544 100644
--- a/net/appletalk/aarp.c
+++ b/net/appletalk/aarp.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* AARP: An implementation of the AppleTalk AARP protocol for
* Ethernet 'ELAP'.
@@ -13,12 +14,6 @@
* Use neighbour discovery code.
* Token Ring Support.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- *
* References:
* Inside AppleTalk (2nd Ed).
* Fixes:
@@ -26,10 +21,10 @@
* Rob Newberry - Added proxy AARP and AARP proc fs,
* moved probing from DDP module.
* Arnaldo C. Melo - don't mangle rx packets
- *
*/
#include <linux/if_arp.h>
+#include <linux/slab.h>
#include <net/sock.h>
#include <net/datalink.h>
#include <net/psnap.h>
@@ -38,6 +33,9 @@
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
+#include <linux/export.h>
+#include <linux/etherdevice.h>
+#include <linux/refcount.h>
int sysctl_aarp_expiry_time = AARP_EXPIRY_TIME;
int sysctl_aarp_tick_time = AARP_TICK_TIME;
@@ -47,17 +45,19 @@ int sysctl_aarp_resolve_time = AARP_RESOLVE_TIME;
/* Lists of aarp entries */
/**
* struct aarp_entry - AARP entry
- * @last_sent - Last time we xmitted the aarp request
- * @packet_queue - Queue of frames wait for resolution
- * @status - Used for proxy AARP
- * expires_at - Entry expiry time
- * target_addr - DDP Address
- * dev - Device to use
- * hwaddr - Physical i/f address of target/router
- * xmit_count - When this hits 10 we give up
- * next - Next entry in chain
+ * @refcnt: Reference count
+ * @last_sent: Last time we xmitted the aarp request
+ * @packet_queue: Queue of frames wait for resolution
+ * @status: Used for proxy AARP
+ * @expires_at: Entry expiry time
+ * @target_addr: DDP Address
+ * @dev: Device to use
+ * @hwaddr: Physical i/f address of target/router
+ * @xmit_count: When this hits 10 we give up
+ * @next: Next entry in chain
*/
struct aarp_entry {
+ refcount_t refcnt;
/* These first two are only used for unresolved entries */
unsigned long last_sent;
struct sk_buff_head packet_queue;
@@ -65,7 +65,7 @@ struct aarp_entry {
unsigned long expires_at;
struct atalk_addr target_addr;
struct net_device *dev;
- char hwaddr[6];
+ char hwaddr[ETH_ALEN];
unsigned short xmit_count;
struct aarp_entry *next;
};
@@ -82,6 +82,17 @@ static DEFINE_RWLOCK(aarp_lock);
/* Used to walk the list and purge/kick entries. */
static struct timer_list aarp_timer;
+static inline void aarp_entry_get(struct aarp_entry *a)
+{
+ refcount_inc(&a->refcnt);
+}
+
+static inline void aarp_entry_put(struct aarp_entry *a)
+{
+ if (refcount_dec_and_test(&a->refcnt))
+ kfree(a);
+}
+
/*
* Delete an aarp queue
*
@@ -90,7 +101,7 @@ static struct timer_list aarp_timer;
static void __aarp_expire(struct aarp_entry *a)
{
skb_queue_purge(&a->packet_queue);
- kfree(a);
+ aarp_entry_put(a);
}
/*
@@ -118,7 +129,9 @@ static void __aarp_send_query(struct aarp_entry *a)
/* Set up the buffer */
skb_reserve(skb, dev->hard_header_len + aarp_dl->header_length);
- skb->nh.raw = skb->h.raw = skb_put(skb, sizeof(*eah));
+ skb_reset_network_header(skb);
+ skb_reset_transport_header(skb);
+ skb_put(skb, sizeof(*eah));
skb->protocol = htons(ETH_P_ATALK);
skb->dev = dev;
eah = aarp_hdr(skb);
@@ -130,13 +143,13 @@ static void __aarp_send_query(struct aarp_entry *a)
eah->pa_len = AARP_PA_ALEN;
eah->function = htons(AARP_REQUEST);
- memcpy(eah->hw_src, dev->dev_addr, ETH_ALEN);
+ ether_addr_copy(eah->hw_src, dev->dev_addr);
eah->pa_src_zero = 0;
eah->pa_src_net = sat->s_net;
eah->pa_src_node = sat->s_node;
- memset(eah->hw_dst, '\0', ETH_ALEN);
+ eth_zero_addr(eah->hw_dst);
eah->pa_dst_zero = 0;
eah->pa_dst_net = a->target_addr.s_net;
@@ -163,7 +176,9 @@ static void aarp_send_reply(struct net_device *dev, struct atalk_addr *us,
/* Set up the buffer */
skb_reserve(skb, dev->hard_header_len + aarp_dl->header_length);
- skb->nh.raw = skb->h.raw = skb_put(skb, sizeof(*eah));
+ skb_reset_network_header(skb);
+ skb_reset_transport_header(skb);
+ skb_put(skb, sizeof(*eah));
skb->protocol = htons(ETH_P_ATALK);
skb->dev = dev;
eah = aarp_hdr(skb);
@@ -175,16 +190,16 @@ static void aarp_send_reply(struct net_device *dev, struct atalk_addr *us,
eah->pa_len = AARP_PA_ALEN;
eah->function = htons(AARP_REPLY);
- memcpy(eah->hw_src, dev->dev_addr, ETH_ALEN);
+ ether_addr_copy(eah->hw_src, dev->dev_addr);
eah->pa_src_zero = 0;
eah->pa_src_net = us->s_net;
eah->pa_src_node = us->s_node;
if (!sha)
- memset(eah->hw_dst, '\0', ETH_ALEN);
+ eth_zero_addr(eah->hw_dst);
else
- memcpy(eah->hw_dst, sha, ETH_ALEN);
+ ether_addr_copy(eah->hw_dst, sha);
eah->pa_dst_zero = 0;
eah->pa_dst_net = them->s_net;
@@ -212,7 +227,9 @@ static void aarp_send_probe(struct net_device *dev, struct atalk_addr *us)
/* Set up the buffer */
skb_reserve(skb, dev->hard_header_len + aarp_dl->header_length);
- skb->nh.raw = skb->h.raw = skb_put(skb, sizeof(*eah));
+ skb_reset_network_header(skb);
+ skb_reset_transport_header(skb);
+ skb_put(skb, sizeof(*eah));
skb->protocol = htons(ETH_P_ATALK);
skb->dev = dev;
eah = aarp_hdr(skb);
@@ -224,13 +241,13 @@ static void aarp_send_probe(struct net_device *dev, struct atalk_addr *us)
eah->pa_len = AARP_PA_ALEN;
eah->function = htons(AARP_PROBE);
- memcpy(eah->hw_src, dev->dev_addr, ETH_ALEN);
+ ether_addr_copy(eah->hw_src, dev->dev_addr);
eah->pa_src_zero = 0;
eah->pa_src_net = us->s_net;
eah->pa_src_node = us->s_node;
- memset(eah->hw_dst, '\0', ETH_ALEN);
+ eth_zero_addr(eah->hw_dst);
eah->pa_dst_zero = 0;
eah->pa_dst_net = us->s_net;
@@ -301,7 +318,7 @@ static void __aarp_expire_device(struct aarp_entry **n, struct net_device *dev)
}
/* Handle the timer event */
-static void aarp_expire_timeout(unsigned long unused)
+static void aarp_expire_timeout(struct timer_list *unused)
{
int ct;
@@ -324,15 +341,19 @@ static void aarp_expire_timeout(unsigned long unused)
static int aarp_device_event(struct notifier_block *this, unsigned long event,
void *ptr)
{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
int ct;
+ if (!net_eq(dev_net(dev), &init_net))
+ return NOTIFY_DONE;
+
if (event == NETDEV_DOWN) {
write_lock_bh(&aarp_lock);
for (ct = 0; ct < AARP_HASH_SIZE; ct++) {
- __aarp_expire_device(&resolved[ct], ptr);
- __aarp_expire_device(&unresolved[ct], ptr);
- __aarp_expire_device(&proxies[ct], ptr);
+ __aarp_expire_device(&resolved[ct], dev);
+ __aarp_expire_device(&unresolved[ct], dev);
+ __aarp_expire_device(&proxies[ct], dev);
}
write_unlock_bh(&aarp_lock);
@@ -373,9 +394,11 @@ static void aarp_purge(void)
static struct aarp_entry *aarp_alloc(void)
{
struct aarp_entry *a = kmalloc(sizeof(*a), GFP_ATOMIC);
+ if (!a)
+ return NULL;
- if (a)
- skb_queue_head_init(&a->packet_queue);
+ refcount_set(&a->refcnt, 1);
+ skb_queue_head_init(&a->packet_queue);
return a;
}
@@ -425,48 +448,18 @@ static struct atalk_addr *__aarp_proxy_find(struct net_device *dev,
return a ? sa : NULL;
}
-/*
- * Probe a Phase 1 device or a device that requires its Net:Node to
- * be set via an ioctl.
- */
-static void aarp_send_probe_phase1(struct atalk_iface *iface)
-{
- struct ifreq atreq;
- struct sockaddr_at *sa = (struct sockaddr_at *)&atreq.ifr_addr;
-
- sa->sat_addr.s_node = iface->address.s_node;
- sa->sat_addr.s_net = ntohs(iface->address.s_net);
-
- /* We pass the Net:Node to the drivers/cards by a Device ioctl. */
- if (!(iface->dev->do_ioctl(iface->dev, &atreq, SIOCSIFADDR))) {
- (void)iface->dev->do_ioctl(iface->dev, &atreq, SIOCGIFADDR);
- if (iface->address.s_net != htons(sa->sat_addr.s_net) ||
- iface->address.s_node != sa->sat_addr.s_node)
- iface->status |= ATIF_PROBE_FAIL;
-
- iface->address.s_net = htons(sa->sat_addr.s_net);
- iface->address.s_node = sa->sat_addr.s_node;
- }
-}
-
-
void aarp_probe_network(struct atalk_iface *atif)
{
- if (atif->dev->type == ARPHRD_LOCALTLK ||
- atif->dev->type == ARPHRD_PPP)
- aarp_send_probe_phase1(atif);
- else {
- unsigned int count;
+ unsigned int count;
- for (count = 0; count < AARP_RETRANSMIT_LIMIT; count++) {
- aarp_send_probe(atif->dev, &atif->address);
+ for (count = 0; count < AARP_RETRANSMIT_LIMIT; count++) {
+ aarp_send_probe(atif->dev, &atif->address);
- /* Defer 1/10th */
- msleep(100);
+ /* Defer 1/10th */
+ msleep(100);
- if (atif->status & ATIF_PROBE_FAIL)
- break;
- }
+ if (atif->status & ATIF_PROBE_FAIL)
+ break;
}
}
@@ -500,6 +493,7 @@ int aarp_proxy_probe_network(struct atalk_iface *atif, struct atalk_addr *sa)
entry->dev = atif->dev;
write_lock_bh(&aarp_lock);
+ aarp_entry_get(entry);
hash = sa->s_node % (AARP_HASH_SIZE - 1);
entry->next = proxies[hash];
@@ -525,6 +519,7 @@ int aarp_proxy_probe_network(struct atalk_iface *atif, struct atalk_addr *sa)
retval = 1;
}
+ aarp_entry_put(entry);
write_unlock_bh(&aarp_lock);
out:
return retval;
@@ -539,7 +534,7 @@ int aarp_send_ddp(struct net_device *dev, struct sk_buff *skb,
int hash;
struct aarp_entry *a;
- skb->nh.raw = skb->data;
+ skb_reset_network_header(skb);
/* Check for LocalTalk first */
if (dev->type == ARPHRD_LOCALTLK) {
@@ -588,7 +583,7 @@ int aarp_send_ddp(struct net_device *dev, struct sk_buff *skb,
/* Non ELAP we cannot do. */
if (dev->type != ARPHRD_ETHER)
- return -1;
+ goto free_it;
skb->dev = dev;
skb->protocol = htons(ETH_P_ATALK);
@@ -623,7 +618,7 @@ int aarp_send_ddp(struct net_device *dev, struct sk_buff *skb,
if (!a) {
/* Whoops slipped... good job it's an unreliable protocol 8) */
write_unlock_bh(&aarp_lock);
- return -1;
+ goto free_it;
}
/* Set up the queue */
@@ -652,15 +647,21 @@ out_unlock:
write_unlock_bh(&aarp_lock);
/* Tell the ddp layer we have taken over for this frame. */
- return 0;
+ goto sent;
sendit:
if (skb->sk)
- skb->priority = skb->sk->sk_priority;
- dev_queue_xmit(skb);
+ skb->priority = READ_ONCE(skb->sk->sk_priority);
+ if (dev_queue_xmit(skb))
+ goto drop;
sent:
- return 1;
+ return NET_XMIT_SUCCESS;
+free_it:
+ kfree_skb(skb);
+drop:
+ return NET_XMIT_DROP;
}
+EXPORT_SYMBOL(aarp_send_ddp);
/*
* An entry in the aarp unresolved queue has become resolved. Send
@@ -706,6 +707,9 @@ static int aarp_rcv(struct sk_buff *skb, struct net_device *dev,
struct atalk_addr sa, *ma, da;
struct atalk_iface *ifa;
+ if (!net_eq(dev_net(dev), &init_net))
+ goto out0;
+
/* We only do Ethernet SNAP AARP. */
if (dev->type != ARPHRD_ETHER)
goto out0;
@@ -751,96 +755,94 @@ static int aarp_rcv(struct sk_buff *skb, struct net_device *dev,
if (a && a->status & ATIF_PROBE) {
a->status |= ATIF_PROBE_FAIL;
/*
- * we do not respond to probe or request packets for
+ * we do not respond to probe or request packets of
* this address while we are probing this address
*/
goto unlock;
}
switch (function) {
- case AARP_REPLY:
- if (!unresolved_count) /* Speed up */
- break;
-
- /* Find the entry. */
- a = __aarp_find_entry(unresolved[hash], dev, &sa);
- if (!a || dev != a->dev)
- break;
-
- /* We can fill one in - this is good. */
- memcpy(a->hwaddr, ea->hw_src, ETH_ALEN);
- __aarp_resolved(&unresolved[hash], a, hash);
- if (!unresolved_count)
- mod_timer(&aarp_timer,
- jiffies + sysctl_aarp_expiry_time);
+ case AARP_REPLY:
+ if (!unresolved_count) /* Speed up */
+ break;
+
+ /* Find the entry. */
+ a = __aarp_find_entry(unresolved[hash], dev, &sa);
+ if (!a || dev != a->dev)
break;
- case AARP_REQUEST:
- case AARP_PROBE:
+ /* We can fill one in - this is good. */
+ ether_addr_copy(a->hwaddr, ea->hw_src);
+ __aarp_resolved(&unresolved[hash], a, hash);
+ if (!unresolved_count)
+ mod_timer(&aarp_timer,
+ jiffies + sysctl_aarp_expiry_time);
+ break;
+ case AARP_REQUEST:
+ case AARP_PROBE:
+
+ /*
+ * If it is my address set ma to my address and reply.
+ * We can treat probe and request the same. Probe
+ * simply means we shouldn't cache the querying host,
+ * as in a probe they are proposing an address not
+ * using one.
+ *
+ * Support for proxy-AARP added. We check if the
+ * address is one of our proxies before we toss the
+ * packet out.
+ */
+
+ sa.s_node = ea->pa_dst_node;
+ sa.s_net = ea->pa_dst_net;
+
+ /* See if we have a matching proxy. */
+ ma = __aarp_proxy_find(dev, &sa);
+ if (!ma)
+ ma = &ifa->address;
+ else { /* We need to make a copy of the entry. */
+ da.s_node = sa.s_node;
+ da.s_net = sa.s_net;
+ ma = &da;
+ }
+
+ if (function == AARP_PROBE) {
/*
- * If it is my address set ma to my address and reply.
- * We can treat probe and request the same. Probe
- * simply means we shouldn't cache the querying host,
- * as in a probe they are proposing an address not
- * using one.
- *
- * Support for proxy-AARP added. We check if the
- * address is one of our proxies before we toss the
- * packet out.
+ * A probe implies someone trying to get an
+ * address. So as a precaution flush any
+ * entries we have for this address.
*/
+ a = __aarp_find_entry(resolved[sa.s_node %
+ (AARP_HASH_SIZE - 1)],
+ skb->dev, &sa);
- sa.s_node = ea->pa_dst_node;
- sa.s_net = ea->pa_dst_net;
-
- /* See if we have a matching proxy. */
- ma = __aarp_proxy_find(dev, &sa);
- if (!ma)
- ma = &ifa->address;
- else { /* We need to make a copy of the entry. */
- da.s_node = sa.s_node;
- da.s_net = da.s_net;
- ma = &da;
- }
-
- if (function == AARP_PROBE) {
- /*
- * A probe implies someone trying to get an
- * address. So as a precaution flush any
- * entries we have for this address.
- */
- struct aarp_entry *a;
-
- a = __aarp_find_entry(resolved[sa.s_node %
- (AARP_HASH_SIZE - 1)],
- skb->dev, &sa);
-
- /*
- * Make it expire next tick - that avoids us
- * getting into a probe/flush/learn/probe/
- * flush/learn cycle during probing of a slow
- * to respond host addr.
- */
- if (a) {
- a->expires_at = jiffies - 1;
- mod_timer(&aarp_timer, jiffies +
- sysctl_aarp_tick_time);
- }
+ /*
+ * Make it expire next tick - that avoids us
+ * getting into a probe/flush/learn/probe/
+ * flush/learn cycle during probing of a slow
+ * to respond host addr.
+ */
+ if (a) {
+ a->expires_at = jiffies - 1;
+ mod_timer(&aarp_timer, jiffies +
+ sysctl_aarp_tick_time);
}
+ }
- if (sa.s_node != ma->s_node)
- break;
+ if (sa.s_node != ma->s_node)
+ break;
- if (sa.s_net && ma->s_net && sa.s_net != ma->s_net)
- break;
+ if (sa.s_net && ma->s_net && sa.s_net != ma->s_net)
+ break;
- sa.s_node = ea->pa_src_node;
- sa.s_net = ea->pa_src_net;
+ sa.s_node = ea->pa_src_node;
+ sa.s_net = ea->pa_src_net;
- /* aarp_my_address has found the address to use for us.
- */
- aarp_send_reply(dev, ma, &sa, ea->hw_src);
- break;
+ /* aarp_my_address has found the address to use for us.
+ */
+ aarp_send_reply(dev, ma, &sa, ea->hw_src);
+ break;
}
unlock:
@@ -858,17 +860,24 @@ static struct notifier_block aarp_notifier = {
static unsigned char aarp_snap_id[] = { 0x00, 0x00, 0x00, 0x80, 0xF3 };
-void __init aarp_proto_init(void)
+int __init aarp_proto_init(void)
{
+ int rc;
+
aarp_dl = register_snap_client(aarp_snap_id, aarp_rcv);
- if (!aarp_dl)
+ if (!aarp_dl) {
printk(KERN_CRIT "Unable to register AARP with SNAP.\n");
- init_timer(&aarp_timer);
- aarp_timer.function = aarp_expire_timeout;
- aarp_timer.data = 0;
+ return -ENOMEM;
+ }
+ timer_setup(&aarp_timer, aarp_expire_timeout, 0);
aarp_timer.expires = jiffies + sysctl_aarp_expiry_time;
add_timer(&aarp_timer);
- register_netdevice_notifier(&aarp_notifier);
+ rc = register_netdevice_notifier(&aarp_notifier);
+ if (rc) {
+ timer_delete_sync(&aarp_timer);
+ unregister_snap_client(aarp_dl);
+ }
+ return rc;
}
/* Remove the AARP entries associated with a device. */
@@ -888,14 +897,9 @@ void aarp_device_down(struct net_device *dev)
}
#ifdef CONFIG_PROC_FS
-struct aarp_iter_state {
- int bucket;
- struct aarp_entry **table;
-};
-
/*
* Get the aarp entry that is in the chain described
- * by the iterator.
+ * by the iterator.
* If pos is set then skip till that index.
* pos = 1 is the first entry
*/
@@ -905,9 +909,9 @@ static struct aarp_entry *iter_next(struct aarp_iter_state *iter, loff_t *pos)
struct aarp_entry **table = iter->table;
loff_t off = 0;
struct aarp_entry *entry;
-
+
rescan:
- while(ct < AARP_HASH_SIZE) {
+ while (ct < AARP_HASH_SIZE) {
for (entry = table[ct]; entry; entry = entry->next) {
if (!pos || ++off == *pos) {
iter->table = table;
@@ -932,6 +936,7 @@ static struct aarp_entry *iter_next(struct aarp_iter_state *iter, loff_t *pos)
}
static void *aarp_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(aarp_lock)
{
struct aarp_iter_state *iter = seq->private;
@@ -950,9 +955,9 @@ static void *aarp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
++*pos;
/* first line after header */
- if (v == SEQ_START_TOKEN)
+ if (v == SEQ_START_TOKEN)
entry = iter_next(iter, NULL);
-
+
/* next entry in current bucket */
else if (entry->next)
entry = entry->next;
@@ -966,6 +971,7 @@ static void *aarp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
}
static void aarp_seq_stop(struct seq_file *seq, void *v)
+ __releases(aarp_lock)
{
read_unlock_bh(&aarp_lock);
}
@@ -974,7 +980,7 @@ static const char *dt2str(unsigned long ticks)
{
static char buf[32];
- sprintf(buf, "%ld.%02ld", ticks / HZ, ((ticks % HZ) * 100 ) / HZ);
+ sprintf(buf, "%ld.%02ld", ticks / HZ, ((ticks % HZ) * 100) / HZ);
return buf;
}
@@ -986,7 +992,7 @@ static int aarp_seq_show(struct seq_file *seq, void *v)
unsigned long now = jiffies;
if (v == SEQ_START_TOKEN)
- seq_puts(seq,
+ seq_puts(seq,
"Address Interface Hardware Address"
" Expires LastSend Retry Status\n");
else {
@@ -994,13 +1000,7 @@ static int aarp_seq_show(struct seq_file *seq, void *v)
ntohs(entry->target_addr.s_net),
(unsigned int) entry->target_addr.s_node,
entry->dev ? entry->dev->name : "????");
- seq_printf(seq, "%02X:%02X:%02X:%02X:%02X:%02X",
- entry->hwaddr[0] & 0xFF,
- entry->hwaddr[1] & 0xFF,
- entry->hwaddr[2] & 0xFF,
- entry->hwaddr[3] & 0xFF,
- entry->hwaddr[4] & 0xFF,
- entry->hwaddr[5] & 0xFF);
+ seq_printf(seq, "%pM", entry->hwaddr);
seq_printf(seq, " %8s",
dt2str((long)entry->expires_at - (long)now));
if (iter->table == unresolved)
@@ -1014,53 +1014,22 @@ static int aarp_seq_show(struct seq_file *seq, void *v)
: (iter->table == unresolved) ? "unresolved"
: (iter->table == proxies) ? "proxies"
: "unknown");
- }
+ }
return 0;
}
-static struct seq_operations aarp_seq_ops = {
+const struct seq_operations aarp_seq_ops = {
.start = aarp_seq_start,
.next = aarp_seq_next,
.stop = aarp_seq_stop,
.show = aarp_seq_show,
};
-
-static int aarp_seq_open(struct inode *inode, struct file *file)
-{
- struct seq_file *seq;
- int rc = -ENOMEM;
- struct aarp_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
-
- if (!s)
- goto out;
-
- rc = seq_open(file, &aarp_seq_ops);
- if (rc)
- goto out_kfree;
-
- seq = file->private_data;
- seq->private = s;
- memset(s, 0, sizeof(*s));
-out:
- return rc;
-out_kfree:
- kfree(s);
- goto out;
-}
-
-struct file_operations atalk_seq_arp_fops = {
- .owner = THIS_MODULE,
- .open = aarp_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release_private,
-};
#endif
/* General module cleanup. Called from cleanup_module() in ddp.c. */
void aarp_cleanup_module(void)
{
- del_timer_sync(&aarp_timer);
+ timer_delete_sync(&aarp_timer);
unregister_netdevice_notifier(&aarp_notifier);
unregister_snap_client(aarp_dl);
aarp_purge();
diff --git a/net/appletalk/atalk_proc.c b/net/appletalk/atalk_proc.c
index 7ae4916cd26d..01787fb6a7bc 100644
--- a/net/appletalk/atalk_proc.c
+++ b/net/appletalk/atalk_proc.c
@@ -1,18 +1,17 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* atalk_proc.c - proc support for Appletalk
*
* Copyright(c) Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the
- * Free Software Foundation, version 2.
*/
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
+#include <net/net_namespace.h>
#include <net/sock.h>
#include <linux/atalk.h>
+#include <linux/export.h>
static __inline__ struct atalk_iface *atalk_get_interface_idx(loff_t pos)
@@ -26,6 +25,7 @@ static __inline__ struct atalk_iface *atalk_get_interface_idx(loff_t pos)
}
static void *atalk_seq_interface_start(struct seq_file *seq, loff_t *pos)
+ __acquires(atalk_interfaces_lock)
{
loff_t l = *pos;
@@ -51,6 +51,7 @@ out:
}
static void atalk_seq_interface_stop(struct seq_file *seq, void *v)
+ __releases(atalk_interfaces_lock)
{
read_unlock_bh(&atalk_interfaces_lock);
}
@@ -85,6 +86,7 @@ static __inline__ struct atalk_route *atalk_get_route_idx(loff_t pos)
}
static void *atalk_seq_route_start(struct seq_file *seq, loff_t *pos)
+ __acquires(atalk_routes_lock)
{
loff_t l = *pos;
@@ -110,6 +112,7 @@ out:
}
static void atalk_seq_route_stop(struct seq_file *seq, void *v)
+ __releases(atalk_routes_lock)
{
read_unlock_bh(&atalk_routes_lock);
}
@@ -139,42 +142,20 @@ out:
return 0;
}
-static __inline__ struct sock *atalk_get_socket_idx(loff_t pos)
-{
- struct sock *s;
- struct hlist_node *node;
-
- sk_for_each(s, node, &atalk_sockets)
- if (!pos--)
- goto found;
- s = NULL;
-found:
- return s;
-}
-
static void *atalk_seq_socket_start(struct seq_file *seq, loff_t *pos)
+ __acquires(atalk_sockets_lock)
{
- loff_t l = *pos;
-
read_lock_bh(&atalk_sockets_lock);
- return l ? atalk_get_socket_idx(--l) : SEQ_START_TOKEN;
+ return seq_hlist_start_head(&atalk_sockets, *pos);
}
static void *atalk_seq_socket_next(struct seq_file *seq, void *v, loff_t *pos)
{
- struct sock *i;
-
- ++*pos;
- if (v == SEQ_START_TOKEN) {
- i = sk_head(&atalk_sockets);
- goto out;
- }
- i = sk_next(v);
-out:
- return i;
+ return seq_hlist_next(v, &atalk_sockets, pos);
}
static void atalk_seq_socket_stop(struct seq_file *seq, void *v)
+ __releases(atalk_sockets_lock)
{
read_unlock_bh(&atalk_sockets_lock);
}
@@ -190,131 +171,72 @@ static int atalk_seq_socket_show(struct seq_file *seq, void *v)
goto out;
}
- s = v;
+ s = sk_entry(v);
at = at_sk(s);
seq_printf(seq, "%02X %04X:%02X:%02X %04X:%02X:%02X %08X:%08X "
- "%02X %d\n",
+ "%02X %u\n",
s->sk_type, ntohs(at->src_net), at->src_node, at->src_port,
ntohs(at->dest_net), at->dest_node, at->dest_port,
- atomic_read(&s->sk_wmem_alloc),
- atomic_read(&s->sk_rmem_alloc),
- s->sk_state, SOCK_INODE(s->sk_socket)->i_uid);
+ sk_wmem_alloc_get(s),
+ sk_rmem_alloc_get(s),
+ s->sk_state,
+ from_kuid_munged(seq_user_ns(seq), sk_uid(s)));
out:
return 0;
}
-static struct seq_operations atalk_seq_interface_ops = {
+static const struct seq_operations atalk_seq_interface_ops = {
.start = atalk_seq_interface_start,
.next = atalk_seq_interface_next,
.stop = atalk_seq_interface_stop,
.show = atalk_seq_interface_show,
};
-static struct seq_operations atalk_seq_route_ops = {
+static const struct seq_operations atalk_seq_route_ops = {
.start = atalk_seq_route_start,
.next = atalk_seq_route_next,
.stop = atalk_seq_route_stop,
.show = atalk_seq_route_show,
};
-static struct seq_operations atalk_seq_socket_ops = {
+static const struct seq_operations atalk_seq_socket_ops = {
.start = atalk_seq_socket_start,
.next = atalk_seq_socket_next,
.stop = atalk_seq_socket_stop,
.show = atalk_seq_socket_show,
};
-static int atalk_seq_interface_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &atalk_seq_interface_ops);
-}
-
-static int atalk_seq_route_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &atalk_seq_route_ops);
-}
-
-static int atalk_seq_socket_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &atalk_seq_socket_ops);
-}
-
-static struct file_operations atalk_seq_interface_fops = {
- .owner = THIS_MODULE,
- .open = atalk_seq_interface_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
-static struct file_operations atalk_seq_route_fops = {
- .owner = THIS_MODULE,
- .open = atalk_seq_route_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
-static struct file_operations atalk_seq_socket_fops = {
- .owner = THIS_MODULE,
- .open = atalk_seq_socket_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
-static struct proc_dir_entry *atalk_proc_dir;
-
int __init atalk_proc_init(void)
{
- struct proc_dir_entry *p;
- int rc = -ENOMEM;
+ if (!proc_mkdir("atalk", init_net.proc_net))
+ return -ENOMEM;
- atalk_proc_dir = proc_mkdir("atalk", proc_net);
- if (!atalk_proc_dir)
+ if (!proc_create_seq("atalk/interface", 0444, init_net.proc_net,
+ &atalk_seq_interface_ops))
goto out;
- atalk_proc_dir->owner = THIS_MODULE;
- p = create_proc_entry("interface", S_IRUGO, atalk_proc_dir);
- if (!p)
- goto out_interface;
- p->proc_fops = &atalk_seq_interface_fops;
+ if (!proc_create_seq("atalk/route", 0444, init_net.proc_net,
+ &atalk_seq_route_ops))
+ goto out;
- p = create_proc_entry("route", S_IRUGO, atalk_proc_dir);
- if (!p)
- goto out_route;
- p->proc_fops = &atalk_seq_route_fops;
+ if (!proc_create_seq("atalk/socket", 0444, init_net.proc_net,
+ &atalk_seq_socket_ops))
+ goto out;
- p = create_proc_entry("socket", S_IRUGO, atalk_proc_dir);
- if (!p)
- goto out_socket;
- p->proc_fops = &atalk_seq_socket_fops;
+ if (!proc_create_seq_private("atalk/arp", 0444, init_net.proc_net,
+ &aarp_seq_ops,
+ sizeof(struct aarp_iter_state), NULL))
+ goto out;
- p = create_proc_entry("arp", S_IRUGO, atalk_proc_dir);
- if (!p)
- goto out_arp;
- p->proc_fops = &atalk_seq_arp_fops;
+ return 0;
- rc = 0;
out:
- return rc;
-out_arp:
- remove_proc_entry("socket", atalk_proc_dir);
-out_socket:
- remove_proc_entry("route", atalk_proc_dir);
-out_route:
- remove_proc_entry("interface", atalk_proc_dir);
-out_interface:
- remove_proc_entry("atalk", proc_net);
- goto out;
+ remove_proc_subtree("atalk", init_net.proc_net);
+ return -ENOMEM;
}
-void __exit atalk_proc_exit(void)
+void atalk_proc_exit(void)
{
- remove_proc_entry("interface", atalk_proc_dir);
- remove_proc_entry("route", atalk_proc_dir);
- remove_proc_entry("socket", atalk_proc_dir);
- remove_proc_entry("arp", atalk_proc_dir);
- remove_proc_entry("atalk", proc_net);
+ remove_proc_subtree("atalk", init_net.proc_net);
}
diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c
index 485e35c3b28b..2a01fff46c9d 100644
--- a/net/appletalk/ddp.c
+++ b/net/appletalk/ddp.c
@@ -1,8 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* DDP: An implementation of the AppleTalk DDP protocol for
* Ethernet 'ELAP'.
*
- * Alan Cox <Alan.Cox@linux.org>
+ * Alan Cox <alan@lxorguk.ukuu.org.uk>
*
* With more than a little assistance from
*
@@ -28,39 +29,37 @@
* Bradford Johnson : IP-over-DDP (experimental)
* Jay Schulist : Moved IP-over-DDP to its own
* driver file. (ipddp.c & ipddp.h)
- * Jay Schulist : Made work as module with
+ * Jay Schulist : Made work as module with
* AppleTalk drivers, cleaned it.
* Rob Newberry : Added proxy AARP and AARP
* procfs, moved probing to AARP
* module.
- * Adrian Sun/
- * Michael Zuelsdorff : fix for net.0 packets. don't
+ * Adrian Sun/
+ * Michael Zuelsdorff : fix for net.0 packets. don't
* allow illegal ether/tokentalk
- * port assignment. we lose a
- * valid localtalk port as a
+ * port assignment. we lose a
+ * valid localtalk port as a
* result.
* Arnaldo C. de Melo : Cleanup, in preparation for
* shared skb support 8)
* Arnaldo C. de Melo : Move proc stuff to atalk_proc.c,
* use seq_file
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
*/
#include <linux/capability.h>
#include <linux/module.h>
#include <linux/if_arp.h>
#include <linux/termios.h> /* For TIOCOUTQ/INQ */
+#include <linux/compat.h>
+#include <linux/slab.h>
#include <net/datalink.h>
#include <net/psnap.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <net/route.h>
+#include <net/compat.h>
#include <linux/atalk.h>
+#include <linux/highmem.h>
struct datalink_proto *ddp_dl, *aarp_dl;
static const struct proto_ops atalk_dgram_ops;
@@ -89,27 +88,39 @@ static inline void atalk_remove_socket(struct sock *sk)
static struct sock *atalk_search_socket(struct sockaddr_at *to,
struct atalk_iface *atif)
{
+ struct sock *def_socket = NULL;
struct sock *s;
- struct hlist_node *node;
read_lock_bh(&atalk_sockets_lock);
- sk_for_each(s, node, &atalk_sockets) {
+ sk_for_each(s, &atalk_sockets) {
struct atalk_sock *at = at_sk(s);
if (to->sat_port != at->src_port)
continue;
- if (to->sat_addr.s_net == ATADDR_ANYNET &&
- to->sat_addr.s_node == ATADDR_BCAST)
- goto found;
+ if (to->sat_addr.s_net == ATADDR_ANYNET &&
+ to->sat_addr.s_node == ATADDR_BCAST) {
+ if (atif->address.s_node == at->src_node &&
+ atif->address.s_net == at->src_net) {
+ /* This socket's address matches the address of the interface
+ * that received the packet -- use it
+ */
+ goto found;
+ }
- if (to->sat_addr.s_net == at->src_net &&
+ /* Continue searching for a socket matching the interface address,
+ * but use this socket by default if no other one is found
+ */
+ def_socket = s;
+ }
+
+ if (to->sat_addr.s_net == at->src_net &&
(to->sat_addr.s_node == at->src_node ||
to->sat_addr.s_node == ATADDR_BCAST ||
to->sat_addr.s_node == ATADDR_ANYNODE))
goto found;
- /* XXXX.0 -- we got a request for this router. make sure
+ /* XXXX.0 -- we got a request for this router. make sure
* that the node is appropriately set. */
if (to->sat_addr.s_node == ATADDR_ANYNODE &&
to->sat_addr.s_net != ATADDR_ANYNET &&
@@ -118,7 +129,7 @@ static struct sock *atalk_search_socket(struct sockaddr_at *to,
goto found;
}
}
- s = NULL;
+ s = def_socket;
found:
read_unlock_bh(&atalk_sockets_lock);
return s;
@@ -126,8 +137,8 @@ found:
/**
* atalk_find_or_insert_socket - Try to find a socket matching ADDR
- * @sk - socket to insert in the list if it is not there already
- * @sat - address to search for
+ * @sk: socket to insert in the list if it is not there already
+ * @sat: address to search for
*
* Try to find a socket matching ADDR in the socket list, if found then return
* it. If not, insert SK into the socket list.
@@ -138,11 +149,10 @@ static struct sock *atalk_find_or_insert_socket(struct sock *sk,
struct sockaddr_at *sat)
{
struct sock *s;
- struct hlist_node *node;
struct atalk_sock *at;
write_lock_bh(&atalk_sockets_lock);
- sk_for_each(s, node, &atalk_sockets) {
+ sk_for_each(s, &atalk_sockets) {
at = at_sk(s);
if (at->src_net == sat->sat_addr.s_net &&
@@ -157,12 +167,11 @@ found:
return s;
}
-static void atalk_destroy_timer(unsigned long data)
+static void atalk_destroy_timer(struct timer_list *t)
{
- struct sock *sk = (struct sock *)data;
+ struct sock *sk = timer_container_of(sk, t, sk_timer);
- if (atomic_read(&sk->sk_wmem_alloc) ||
- atomic_read(&sk->sk_rmem_alloc)) {
+ if (sk_has_allocations(sk)) {
sk->sk_timer.expires = jiffies + SOCK_DESTROY_TIME;
add_timer(&sk->sk_timer);
} else
@@ -174,12 +183,9 @@ static inline void atalk_destroy_socket(struct sock *sk)
atalk_remove_socket(sk);
skb_queue_purge(&sk->sk_receive_queue);
- if (atomic_read(&sk->sk_wmem_alloc) ||
- atomic_read(&sk->sk_rmem_alloc)) {
- init_timer(&sk->sk_timer);
+ if (sk_has_allocations(sk)) {
+ timer_setup(&sk->sk_timer, atalk_destroy_timer, 0);
sk->sk_timer.expires = jiffies + SOCK_DESTROY_TIME;
- sk->sk_timer.function = atalk_destroy_timer;
- sk->sk_timer.data = (unsigned long)sk;
add_timer(&sk->sk_timer);
} else
sock_put(sk);
@@ -295,7 +301,7 @@ static int atif_probe_device(struct atalk_iface *atif)
/* Perform AARP probing for a proxy address */
static int atif_proxy_probe_device(struct atalk_iface *atif,
- struct atalk_addr* proxy_addr)
+ struct atalk_addr *proxy_addr)
{
int netrange = ntohs(atif->nets.nr_lastnet) -
ntohs(atif->nets.nr_firstnet) + 1;
@@ -313,7 +319,7 @@ static int atif_proxy_probe_device(struct atalk_iface *atif,
if (probe_node == ATADDR_ANYNODE)
probe_node = jiffies & 0xFF;
-
+
/* Scan the networks */
for (netct = 0; netct <= netrange; netct++) {
/* Sweep the available nodes from a given start */
@@ -416,7 +422,7 @@ static struct atalk_iface *atalk_find_interface(__be16 net, int node)
if (node == ATADDR_ANYNODE && net != ATADDR_ANYNET &&
ntohs(iface->nets.nr_firstnet) <= ntohs(net) &&
ntohs(net) <= ntohs(iface->nets.nr_lastnet))
- break;
+ break;
}
read_unlock_bh(&atalk_interfaces_lock);
return iface;
@@ -431,13 +437,13 @@ static struct atalk_iface *atalk_find_interface(__be16 net, int node)
static struct atalk_route *atrtr_find(struct atalk_addr *target)
{
/*
- * we must search through all routes unless we find a
+ * we must search through all routes unless we find a
* host route, because some host routes might overlap
* network routes
*/
struct atalk_route *net_route = NULL;
struct atalk_route *r;
-
+
read_lock_bh(&atalk_routes_lock);
for (r = atalk_routes; r; r = r->next) {
if (!(r->flags & RTF_UP))
@@ -459,8 +465,8 @@ static struct atalk_route *atrtr_find(struct atalk_addr *target)
net_route = r;
}
}
-
- /*
+
+ /*
* if we found a network route but not a direct host
* route, then return it
*/
@@ -539,15 +545,15 @@ static int atrtr_create(struct rtentry *r, struct net_device *devhint)
for (iface = atalk_interfaces; iface; iface = iface->next) {
if (!riface &&
ntohs(ga->sat_addr.s_net) >=
- ntohs(iface->nets.nr_firstnet) &&
+ ntohs(iface->nets.nr_firstnet) &&
ntohs(ga->sat_addr.s_net) <=
- ntohs(iface->nets.nr_lastnet))
+ ntohs(iface->nets.nr_lastnet))
riface = iface;
if (ga->sat_addr.s_net == iface->address.s_net &&
ga->sat_addr.s_node == iface->address.s_node)
riface = iface;
- }
+ }
read_unlock_bh(&atalk_interfaces_lock);
retval = -ENETUNREACH;
@@ -570,6 +576,7 @@ static int atrtr_create(struct rtentry *r, struct net_device *devhint)
/* Fill in the routing entry */
rt->target = ta->sat_addr;
+ dev_put(rt->dev); /* Release old device */
dev_hold(devhint);
rt->dev = devhint;
rt->flags = r->rt_flags;
@@ -583,7 +590,7 @@ out:
}
/* Delete a route. Find it and discard it */
-static int atrtr_delete(struct atalk_addr * addr)
+static int atrtr_delete(struct atalk_addr *addr)
{
struct atalk_route **r = &atalk_routes;
int retval = 0;
@@ -646,9 +653,14 @@ static inline void atalk_dev_down(struct net_device *dev)
static int ddp_device_event(struct notifier_block *this, unsigned long event,
void *ptr)
{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+
+ if (!net_eq(dev_net(dev), &init_net))
+ return NOTIFY_DONE;
+
if (event == NETDEV_DOWN)
/* Discard any use of this */
- atalk_dev_down(ptr);
+ atalk_dev_down(dev);
return NOTIFY_DONE;
}
@@ -668,10 +680,10 @@ static int atif_ioctl(int cmd, void __user *arg)
struct rtentry rtdef;
int add_route;
- if (copy_from_user(&atreq, arg, sizeof(atreq)))
+ if (get_user_ifreq(&atreq, NULL, arg))
return -EFAULT;
- dev = __dev_get_by_name(atreq.ifr_name);
+ dev = __dev_get_by_name(&init_net, atreq.ifr_name);
if (!dev)
return -ENODEV;
@@ -679,198 +691,213 @@ static int atif_ioctl(int cmd, void __user *arg)
atif = atalk_find_dev(dev);
switch (cmd) {
- case SIOCSIFADDR:
- if (!capable(CAP_NET_ADMIN))
- return -EPERM;
- if (sa->sat_family != AF_APPLETALK)
- return -EINVAL;
- if (dev->type != ARPHRD_ETHER &&
- dev->type != ARPHRD_LOOPBACK &&
- dev->type != ARPHRD_LOCALTLK &&
- dev->type != ARPHRD_PPP)
- return -EPROTONOSUPPORT;
+ case SIOCSIFADDR:
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+ if (sa->sat_family != AF_APPLETALK)
+ return -EINVAL;
+ if (dev->type != ARPHRD_ETHER &&
+ dev->type != ARPHRD_LOOPBACK &&
+ dev->type != ARPHRD_LOCALTLK &&
+ dev->type != ARPHRD_PPP)
+ return -EPROTONOSUPPORT;
- nr = (struct atalk_netrange *)&sa->sat_zero[0];
- add_route = 1;
+ nr = (struct atalk_netrange *)&sa->sat_zero[0];
+ add_route = 1;
- /*
- * if this is a point-to-point iface, and we already
- * have an iface for this AppleTalk address, then we
- * should not add a route
- */
- if ((dev->flags & IFF_POINTOPOINT) &&
- atalk_find_interface(sa->sat_addr.s_net,
- sa->sat_addr.s_node)) {
- printk(KERN_DEBUG "AppleTalk: point-to-point "
- "interface added with "
- "existing address\n");
- add_route = 0;
- }
-
- /*
- * Phase 1 is fine on LocalTalk but we don't do
- * EtherTalk phase 1. Anyone wanting to add it go ahead.
- */
- if (dev->type == ARPHRD_ETHER && nr->nr_phase != 2)
- return -EPROTONOSUPPORT;
- if (sa->sat_addr.s_node == ATADDR_BCAST ||
- sa->sat_addr.s_node == 254)
+ /*
+ * if this is a point-to-point iface, and we already
+ * have an iface for this AppleTalk address, then we
+ * should not add a route
+ */
+ if ((dev->flags & IFF_POINTOPOINT) &&
+ atalk_find_interface(sa->sat_addr.s_net,
+ sa->sat_addr.s_node)) {
+ printk(KERN_DEBUG "AppleTalk: point-to-point "
+ "interface added with "
+ "existing address\n");
+ add_route = 0;
+ }
+
+ /*
+ * Phase 1 is fine on LocalTalk but we don't do
+ * EtherTalk phase 1. Anyone wanting to add it, go ahead.
+ */
+ if (dev->type == ARPHRD_ETHER && nr->nr_phase != 2)
+ return -EPROTONOSUPPORT;
+ if (sa->sat_addr.s_node == ATADDR_BCAST ||
+ sa->sat_addr.s_node == 254)
+ return -EINVAL;
+ if (atif) {
+ /* Already setting address */
+ if (atif->status & ATIF_PROBE)
+ return -EBUSY;
+
+ atif->address.s_net = sa->sat_addr.s_net;
+ atif->address.s_node = sa->sat_addr.s_node;
+ atrtr_device_down(dev); /* Flush old routes */
+ } else {
+ atif = atif_add_device(dev, &sa->sat_addr);
+ if (!atif)
+ return -ENOMEM;
+ }
+ atif->nets = *nr;
+
+ /*
+ * Check if the chosen address is used. If so we
+ * error and atalkd will try another.
+ */
+
+ if (!(dev->flags & IFF_LOOPBACK) &&
+ !(dev->flags & IFF_POINTOPOINT) &&
+ atif_probe_device(atif) < 0) {
+ atif_drop_device(dev);
+ return -EADDRINUSE;
+ }
+
+ /* Hey it worked - add the direct routes */
+ sa = (struct sockaddr_at *)&rtdef.rt_gateway;
+ sa->sat_family = AF_APPLETALK;
+ sa->sat_addr.s_net = atif->address.s_net;
+ sa->sat_addr.s_node = atif->address.s_node;
+ sa = (struct sockaddr_at *)&rtdef.rt_dst;
+ rtdef.rt_flags = RTF_UP;
+ sa->sat_family = AF_APPLETALK;
+ sa->sat_addr.s_node = ATADDR_ANYNODE;
+ if (dev->flags & IFF_LOOPBACK ||
+ dev->flags & IFF_POINTOPOINT)
+ rtdef.rt_flags |= RTF_HOST;
+
+ /* Routerless initial state */
+ if (nr->nr_firstnet == htons(0) &&
+ nr->nr_lastnet == htons(0xFFFE)) {
+ sa->sat_addr.s_net = atif->address.s_net;
+ atrtr_create(&rtdef, dev);
+ atrtr_set_default(dev);
+ } else {
+ limit = ntohs(nr->nr_lastnet);
+ if (limit - ntohs(nr->nr_firstnet) > 4096) {
+ printk(KERN_WARNING "Too many routes/"
+ "iface.\n");
return -EINVAL;
- if (atif) {
- /* Already setting address */
- if (atif->status & ATIF_PROBE)
- return -EBUSY;
-
- atif->address.s_net = sa->sat_addr.s_net;
- atif->address.s_node = sa->sat_addr.s_node;
- atrtr_device_down(dev); /* Flush old routes */
- } else {
- atif = atif_add_device(dev, &sa->sat_addr);
- if (!atif)
- return -ENOMEM;
}
- atif->nets = *nr;
+ if (add_route)
+ for (ct = ntohs(nr->nr_firstnet);
+ ct <= limit; ct++) {
+ sa->sat_addr.s_net = htons(ct);
+ atrtr_create(&rtdef, dev);
+ }
+ }
+ dev_mc_add_global(dev, aarp_mcast);
+ return 0;
- /*
- * Check if the chosen address is used. If so we
- * error and atalkd will try another.
- */
+ case SIOCGIFADDR:
+ if (!atif)
+ return -EADDRNOTAVAIL;
- if (!(dev->flags & IFF_LOOPBACK) &&
- !(dev->flags & IFF_POINTOPOINT) &&
- atif_probe_device(atif) < 0) {
- atif_drop_device(dev);
- return -EADDRINUSE;
- }
+ sa->sat_family = AF_APPLETALK;
+ sa->sat_addr = atif->address;
+ break;
- /* Hey it worked - add the direct routes */
- sa = (struct sockaddr_at *)&rtdef.rt_gateway;
- sa->sat_family = AF_APPLETALK;
- sa->sat_addr.s_net = atif->address.s_net;
- sa->sat_addr.s_node = atif->address.s_node;
- sa = (struct sockaddr_at *)&rtdef.rt_dst;
- rtdef.rt_flags = RTF_UP;
- sa->sat_family = AF_APPLETALK;
- sa->sat_addr.s_node = ATADDR_ANYNODE;
- if (dev->flags & IFF_LOOPBACK ||
- dev->flags & IFF_POINTOPOINT)
- rtdef.rt_flags |= RTF_HOST;
-
- /* Routerless initial state */
- if (nr->nr_firstnet == htons(0) &&
- nr->nr_lastnet == htons(0xFFFE)) {
- sa->sat_addr.s_net = atif->address.s_net;
- atrtr_create(&rtdef, dev);
- atrtr_set_default(dev);
- } else {
- limit = ntohs(nr->nr_lastnet);
- if (limit - ntohs(nr->nr_firstnet) > 4096) {
- printk(KERN_WARNING "Too many routes/"
- "iface.\n");
- return -EINVAL;
- }
- if (add_route)
- for (ct = ntohs(nr->nr_firstnet);
- ct <= limit; ct++) {
- sa->sat_addr.s_net = htons(ct);
- atrtr_create(&rtdef, dev);
- }
- }
- dev_mc_add(dev, aarp_mcast, 6, 1);
- return 0;
+ case SIOCGIFBRDADDR:
+ if (!atif)
+ return -EADDRNOTAVAIL;
- case SIOCGIFADDR:
- if (!atif)
- return -EADDRNOTAVAIL;
+ sa->sat_family = AF_APPLETALK;
+ sa->sat_addr.s_net = atif->address.s_net;
+ sa->sat_addr.s_node = ATADDR_BCAST;
+ break;
- sa->sat_family = AF_APPLETALK;
- sa->sat_addr = atif->address;
- break;
+ case SIOCATALKDIFADDR:
+ case SIOCDIFADDR:
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+ if (sa->sat_family != AF_APPLETALK)
+ return -EINVAL;
+ atalk_dev_down(dev);
+ break;
- case SIOCGIFBRDADDR:
- if (!atif)
- return -EADDRNOTAVAIL;
+ case SIOCSARP:
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+ if (sa->sat_family != AF_APPLETALK)
+ return -EINVAL;
+ /*
+ * for now, we only support proxy AARP on ELAP;
+ * we should be able to do it for LocalTalk, too.
+ */
+ if (dev->type != ARPHRD_ETHER)
+ return -EPROTONOSUPPORT;
- sa->sat_family = AF_APPLETALK;
- sa->sat_addr.s_net = atif->address.s_net;
- sa->sat_addr.s_node = ATADDR_BCAST;
- break;
+ /*
+ * atif points to the current interface on this network;
+ * we aren't concerned about its current status (at
+ * least for now), but it has all the settings about
+ * the network we're going to probe. Consequently, it
+ * must exist.
+ */
+ if (!atif)
+ return -EADDRNOTAVAIL;
- case SIOCATALKDIFADDR:
- case SIOCDIFADDR:
- if (!capable(CAP_NET_ADMIN))
- return -EPERM;
- if (sa->sat_family != AF_APPLETALK)
- return -EINVAL;
- atalk_dev_down(dev);
- break;
-
- case SIOCSARP:
- if (!capable(CAP_NET_ADMIN))
- return -EPERM;
- if (sa->sat_family != AF_APPLETALK)
- return -EINVAL;
- if (!atif)
- return -EADDRNOTAVAIL;
-
- /*
- * for now, we only support proxy AARP on ELAP;
- * we should be able to do it for LocalTalk, too.
- */
- if (dev->type != ARPHRD_ETHER)
- return -EPROTONOSUPPORT;
-
- /*
- * atif points to the current interface on this network;
- * we aren't concerned about its current status (at
- * least for now), but it has all the settings about
- * the network we're going to probe. Consequently, it
- * must exist.
- */
- if (!atif)
- return -EADDRNOTAVAIL;
-
- nr = (struct atalk_netrange *)&(atif->nets);
- /*
- * Phase 1 is fine on Localtalk but we don't do
- * Ethertalk phase 1. Anyone wanting to add it go ahead.
- */
- if (dev->type == ARPHRD_ETHER && nr->nr_phase != 2)
- return -EPROTONOSUPPORT;
-
- if (sa->sat_addr.s_node == ATADDR_BCAST ||
- sa->sat_addr.s_node == 254)
- return -EINVAL;
-
- /*
- * Check if the chosen address is used. If so we
- * error and ATCP will try another.
- */
- if (atif_proxy_probe_device(atif, &(sa->sat_addr)) < 0)
- return -EADDRINUSE;
-
- /*
- * We now have an address on the local network, and
- * the AARP code will defend it for us until we take it
- * down. We don't set up any routes right now, because
- * ATCP will install them manually via SIOCADDRT.
- */
- break;
-
- case SIOCDARP:
- if (!capable(CAP_NET_ADMIN))
- return -EPERM;
- if (sa->sat_family != AF_APPLETALK)
- return -EINVAL;
- if (!atif)
- return -EADDRNOTAVAIL;
-
- /* give to aarp module to remove proxy entry */
- aarp_proxy_remove(atif->dev, &(sa->sat_addr));
- return 0;
+ nr = (struct atalk_netrange *)&(atif->nets);
+ /*
+ * Phase 1 is fine on Localtalk but we don't do
+ * Ethertalk phase 1. Anyone wanting to add it, go ahead.
+ */
+ if (dev->type == ARPHRD_ETHER && nr->nr_phase != 2)
+ return -EPROTONOSUPPORT;
+
+ if (sa->sat_addr.s_node == ATADDR_BCAST ||
+ sa->sat_addr.s_node == 254)
+ return -EINVAL;
+
+ /*
+ * Check if the chosen address is used. If so we
+ * error and ATCP will try another.
+ */
+ if (atif_proxy_probe_device(atif, &(sa->sat_addr)) < 0)
+ return -EADDRINUSE;
+
+ /*
+ * We now have an address on the local network, and
+ * the AARP code will defend it for us until we take it
+ * down. We don't set up any routes right now, because
+ * ATCP will install them manually via SIOCADDRT.
+ */
+ break;
+
+ case SIOCDARP:
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+ if (sa->sat_family != AF_APPLETALK)
+ return -EINVAL;
+ if (!atif)
+ return -EADDRNOTAVAIL;
+
+ /* give to aarp module to remove proxy entry */
+ aarp_proxy_remove(atif->dev, &(sa->sat_addr));
+ return 0;
}
- return copy_to_user(arg, &atreq, sizeof(atreq)) ? -EFAULT : 0;
+ return put_user_ifreq(&atreq, arg);
+}
+
+static int atrtr_ioctl_addrt(struct rtentry *rt)
+{
+ struct net_device *dev = NULL;
+
+ if (rt->rt_dev) {
+ char name[IFNAMSIZ];
+
+ if (copy_from_user(name, rt->rt_dev, IFNAMSIZ-1))
+ return -EFAULT;
+ name[IFNAMSIZ-1] = '\0';
+
+ dev = __dev_get_by_name(&init_net, name);
+ if (!dev)
+ return -ENODEV;
+ }
+ return atrtr_create(rt, dev);
}
/* Routing ioctl() calls */
@@ -882,25 +909,14 @@ static int atrtr_ioctl(unsigned int cmd, void __user *arg)
return -EFAULT;
switch (cmd) {
- case SIOCDELRT:
- if (rt.rt_dst.sa_family != AF_APPLETALK)
- return -EINVAL;
- return atrtr_delete(&((struct sockaddr_at *)
- &rt.rt_dst)->sat_addr);
-
- case SIOCADDRT: {
- struct net_device *dev = NULL;
- if (rt.rt_dev) {
- char name[IFNAMSIZ];
- if (copy_from_user(name, rt.rt_dev, IFNAMSIZ-1))
- return -EFAULT;
- name[IFNAMSIZ-1] = '\0';
- dev = __dev_get_by_name(name);
- if (!dev)
- return -ENODEV;
- }
- return atrtr_create(&rt, dev);
- }
+ case SIOCDELRT:
+ if (rt.rt_dst.sa_family != AF_APPLETALK)
+ return -EINVAL;
+ return atrtr_delete(&((struct sockaddr_at *)
+ &rt.rt_dst)->sat_addr);
+
+ case SIOCADDRT:
+ return atrtr_ioctl_addrt(&rt);
}
return -EINVAL;
}
@@ -916,18 +932,13 @@ static int atrtr_ioctl(unsigned int cmd, void __user *arg)
* Checksum: This is 'optional'. It's quite likely also a good
* candidate for assembler hackery 8)
*/
-static unsigned long atalk_sum_partial(const unsigned char *data,
+static unsigned long atalk_sum_partial(const unsigned char *data,
int len, unsigned long sum)
{
/* This ought to be unwrapped neatly. I'll trust gcc for now */
while (len--) {
- sum += *data;
- sum <<= 1;
- if (sum & 0x10000) {
- sum++;
- sum &= 0xffff;
- }
- data++;
+ sum += *data++;
+ sum = rol16(sum, 1);
}
return sum;
}
@@ -937,14 +948,15 @@ static unsigned long atalk_sum_skb(const struct sk_buff *skb, int offset,
int len, unsigned long sum)
{
int start = skb_headlen(skb);
+ struct sk_buff *frag_iter;
int i, copy;
/* checksum stuff in header space */
- if ( (copy = start - offset) > 0) {
+ if ((copy = start - offset) > 0) {
if (copy > len)
copy = len;
sum = atalk_sum_partial(skb->data + offset, copy, sum);
- if ( (len -= copy) == 0)
+ if ((len -= copy) == 0)
return sum;
offset += copy;
@@ -953,20 +965,19 @@ static unsigned long atalk_sum_skb(const struct sk_buff *skb, int offset,
/* checksum stuff in frags */
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
int end;
+ const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+ WARN_ON(start > offset + len);
- BUG_TRAP(start <= offset + len);
-
- end = start + skb_shinfo(skb)->frags[i].size;
+ end = start + skb_frag_size(frag);
if ((copy = end - offset) > 0) {
u8 *vaddr;
- skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
if (copy > len)
copy = len;
- vaddr = kmap_skb_frag(frag);
- sum = atalk_sum_partial(vaddr + frag->page_offset +
- offset - start, copy, sum);
- kunmap_skb_frag(vaddr);
+ vaddr = kmap_atomic(skb_frag_page(frag));
+ sum = atalk_sum_partial(vaddr + skb_frag_off(frag) +
+ offset - start, copy, sum);
+ kunmap_atomic(vaddr);
if (!(len -= copy))
return sum;
@@ -975,26 +986,22 @@ static unsigned long atalk_sum_skb(const struct sk_buff *skb, int offset,
start = end;
}
- if (skb_shinfo(skb)->frag_list) {
- struct sk_buff *list = skb_shinfo(skb)->frag_list;
-
- for (; list; list = list->next) {
- int end;
+ skb_walk_frags(skb, frag_iter) {
+ int end;
- BUG_TRAP(start <= offset + len);
+ WARN_ON(start > offset + len);
- end = start + list->len;
- if ((copy = end - offset) > 0) {
- if (copy > len)
- copy = len;
- sum = atalk_sum_skb(list, offset - start,
- copy, sum);
- if ((len -= copy) == 0)
- return sum;
- offset += copy;
- }
- start = end;
+ end = start + frag_iter->len;
+ if ((copy = end - offset) > 0) {
+ if (copy > len)
+ copy = len;
+ sum = atalk_sum_skb(frag_iter, offset - start,
+ copy, sum);
+ if ((len -= copy) == 0)
+ return sum;
+ offset += copy;
}
+ start = end;
}
BUG_ON(len > 0);
@@ -1023,19 +1030,28 @@ static struct proto ddp_proto = {
* Create a socket. Initialise the socket, blank the addresses
* set the state.
*/
-static int atalk_create(struct socket *sock, int protocol)
+static int atalk_create(struct net *net, struct socket *sock, int protocol,
+ int kern)
{
struct sock *sk;
int rc = -ESOCKTNOSUPPORT;
+ if (!net_eq(net, &init_net))
+ return -EAFNOSUPPORT;
+
/*
* We permit SOCK_DGRAM and RAW is an extension. It is trivial to do
- * and gives you the full ELAP frame. Should be handy for CAP 8)
+ * and gives you the full ELAP frame. Should be handy for CAP 8)
*/
if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM)
goto out;
+
+ rc = -EPERM;
+ if (sock->type == SOCK_RAW && !kern && !capable(CAP_NET_RAW))
+ goto out;
+
rc = -ENOMEM;
- sk = sk_alloc(PF_APPLETALK, GFP_KERNEL, &ddp_proto, 1);
+ sk = sk_alloc(net, PF_APPLETALK, GFP_KERNEL, &ddp_proto, kern);
if (!sk)
goto out;
rc = 0;
@@ -1054,17 +1070,23 @@ static int atalk_release(struct socket *sock)
struct sock *sk = sock->sk;
if (sk) {
+ sock_hold(sk);
+ lock_sock(sk);
+
sock_orphan(sk);
sock->sk = NULL;
atalk_destroy_socket(sk);
+
+ release_sock(sk);
+ sock_put(sk);
}
return 0;
}
/**
* atalk_pick_and_bind_port - Pick a source port when one is not given
- * @sk - socket to insert into the tables
- * @sat - address to search for
+ * @sk: socket to insert into the tables
+ * @sat: address to search for
*
* Pick a source port when one is not given. If we can find a suitable free
* one, we insert the socket into the tables using it.
@@ -1081,9 +1103,8 @@ static int atalk_pick_and_bind_port(struct sock *sk, struct sockaddr_at *sat)
sat->sat_port < ATPORT_LAST;
sat->sat_port++) {
struct sock *s;
- struct hlist_node *node;
- sk_for_each(s, node, &atalk_sockets) {
+ sk_for_each(s, &atalk_sockets) {
struct atalk_sock *at = at_sk(s);
if (at->src_net == sat->sat_addr.s_net &&
@@ -1128,11 +1149,12 @@ out:
}
/* Set the address 'our end' of the connection */
-static int atalk_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+static int atalk_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len)
{
struct sockaddr_at *addr = (struct sockaddr_at *)uaddr;
struct sock *sk = sock->sk;
struct atalk_sock *at = at_sk(sk);
+ int err;
if (!sock_flag(sk, SOCK_ZAPPED) ||
addr_len != sizeof(struct sockaddr_at))
@@ -1141,46 +1163,54 @@ static int atalk_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
if (addr->sat_family != AF_APPLETALK)
return -EAFNOSUPPORT;
+ lock_sock(sk);
if (addr->sat_addr.s_net == htons(ATADDR_ANYNET)) {
struct atalk_addr *ap = atalk_find_primary();
+ err = -EADDRNOTAVAIL;
if (!ap)
- return -EADDRNOTAVAIL;
+ goto out;
at->src_net = addr->sat_addr.s_net = ap->s_net;
- at->src_node = addr->sat_addr.s_node= ap->s_node;
+ at->src_node = addr->sat_addr.s_node = ap->s_node;
} else {
+ err = -EADDRNOTAVAIL;
if (!atalk_find_interface(addr->sat_addr.s_net,
addr->sat_addr.s_node))
- return -EADDRNOTAVAIL;
+ goto out;
at->src_net = addr->sat_addr.s_net;
at->src_node = addr->sat_addr.s_node;
}
if (addr->sat_port == ATADDR_ANYPORT) {
- int n = atalk_pick_and_bind_port(sk, addr);
+ err = atalk_pick_and_bind_port(sk, addr);
- if (n < 0)
- return n;
+ if (err < 0)
+ goto out;
} else {
at->src_port = addr->sat_port;
+ err = -EADDRINUSE;
if (atalk_find_or_insert_socket(sk, addr))
- return -EADDRINUSE;
+ goto out;
}
sock_reset_flag(sk, SOCK_ZAPPED);
- return 0;
+ err = 0;
+out:
+ release_sock(sk);
+ return err;
}
/* Set the address we talk to */
-static int atalk_connect(struct socket *sock, struct sockaddr *uaddr,
+static int atalk_connect(struct socket *sock, struct sockaddr_unsized *uaddr,
int addr_len, int flags)
{
struct sock *sk = sock->sk;
struct atalk_sock *at = at_sk(sk);
struct sockaddr_at *addr;
+ int err;
sk->sk_state = TCP_CLOSE;
sock->state = SS_UNCONNECTED;
@@ -1195,22 +1225,23 @@ static int atalk_connect(struct socket *sock, struct sockaddr *uaddr,
if (addr->sat_addr.s_node == ATADDR_BCAST &&
!sock_flag(sk, SOCK_BROADCAST)) {
-#if 1
- printk(KERN_WARNING "%s is broken and did not set "
- "SO_BROADCAST. It will break when 2.2 is "
- "released.\n",
+#if 1
+ pr_warn("atalk_connect: %s is broken and did not set SO_BROADCAST.\n",
current->comm);
#else
return -EACCES;
-#endif
+#endif
}
+ lock_sock(sk);
+ err = -EBUSY;
if (sock_flag(sk, SOCK_ZAPPED))
if (atalk_autobind(sk) < 0)
- return -EBUSY;
+ goto out;
+ err = -ENETUNREACH;
if (!atrtr_get_dev(&addr->sat_addr))
- return -ENETUNREACH;
+ goto out;
at->dest_port = addr->sat_port;
at->dest_net = addr->sat_addr.s_net;
@@ -1218,7 +1249,10 @@ static int atalk_connect(struct socket *sock, struct sockaddr *uaddr,
sock->state = SS_CONNECTED;
sk->sk_state = TCP_ESTABLISHED;
- return 0;
+ err = 0;
+out:
+ release_sock(sk);
+ return err;
}
/*
@@ -1226,21 +1260,25 @@ static int atalk_connect(struct socket *sock, struct sockaddr *uaddr,
* fields into the sockaddr.
*/
static int atalk_getname(struct socket *sock, struct sockaddr *uaddr,
- int *uaddr_len, int peer)
+ int peer)
{
struct sockaddr_at sat;
struct sock *sk = sock->sk;
struct atalk_sock *at = at_sk(sk);
+ int err;
+ lock_sock(sk);
+ err = -ENOBUFS;
if (sock_flag(sk, SOCK_ZAPPED))
if (atalk_autobind(sk) < 0)
- return -ENOBUFS;
+ goto out;
- *uaddr_len = sizeof(struct sockaddr_at);
+ memset(&sat, 0, sizeof(sat));
if (peer) {
+ err = -ENOTCONN;
if (sk->sk_state != TCP_ESTABLISHED)
- return -ENOTCONN;
+ goto out;
sat.sat_addr.s_net = at->dest_net;
sat.sat_addr.s_node = at->dest_node;
@@ -1253,51 +1291,22 @@ static int atalk_getname(struct socket *sock, struct sockaddr *uaddr,
sat.sat_family = AF_APPLETALK;
memcpy(uaddr, &sat, sizeof(sat));
- return 0;
-}
-
-#if defined(CONFIG_IPDDP) || defined(CONFIG_IPDDP_MODULE)
-static __inline__ int is_ip_over_ddp(struct sk_buff *skb)
-{
- return skb->data[12] == 22;
-}
+ err = sizeof(struct sockaddr_at);
-static int handle_ip_over_ddp(struct sk_buff *skb)
-{
- struct net_device *dev = __dev_get_by_name("ipddp0");
- struct net_device_stats *stats;
-
- /* This needs to be able to handle ipddp"N" devices */
- if (!dev)
- return -ENODEV;
-
- skb->protocol = htons(ETH_P_IP);
- skb_pull(skb, 13);
- skb->dev = dev;
- skb->h.raw = skb->data;
-
- stats = dev->priv;
- stats->rx_packets++;
- stats->rx_bytes += skb->len + 13;
- netif_rx(skb); /* Send the SKB up to a higher place. */
- return 0;
+out:
+ release_sock(sk);
+ return err;
}
-#else
-/* make it easy for gcc to optimize this test out, i.e. kill the code */
-#define is_ip_over_ddp(skb) 0
-#define handle_ip_over_ddp(skb) 0
-#endif
-static void atalk_route_packet(struct sk_buff *skb, struct net_device *dev,
- struct ddpehdr *ddp, __u16 len_hops,
- int origlen)
+static int atalk_route_packet(struct sk_buff *skb, struct net_device *dev,
+ struct ddpehdr *ddp, __u16 len_hops, int origlen)
{
struct atalk_route *rt;
struct atalk_addr ta;
/*
* Don't route multicast, etc., packets, or packets sent to "this
- * network"
+ * network"
*/
if (skb->pkt_type != PACKET_HOST || !ddp->deh_dnet) {
/*
@@ -1334,8 +1343,8 @@ static void atalk_route_packet(struct sk_buff *skb, struct net_device *dev,
ta.s_node = rt->gateway.s_node;
}
- /* Fix up skb->len field */
- skb_trim(skb, min_t(unsigned int, origlen,
+ /* Fix up skb->len field */
+ skb_trim(skb, min_t(unsigned int, origlen,
(rt->dev->hard_header_len +
ddp_dl->header_length + (len_hops & 1023))));
@@ -1357,35 +1366,38 @@ static void atalk_route_packet(struct sk_buff *skb, struct net_device *dev,
/* 22 bytes - 12 ether, 2 len, 3 802.2 5 snap */
struct sk_buff *nskb = skb_realloc_headroom(skb, 32);
kfree_skb(skb);
- if (!nskb)
- goto out;
skb = nskb;
} else
skb = skb_unshare(skb, GFP_ATOMIC);
-
+
/*
* If the buffer didn't vanish into the lack of space bitbucket we can
* send it.
*/
- if (skb && aarp_send_ddp(rt->dev, skb, &ta, NULL) == -1)
- goto free_it;
-out:
- return;
+ if (skb == NULL)
+ goto drop;
+
+ if (aarp_send_ddp(rt->dev, skb, &ta, NULL) == NET_XMIT_DROP)
+ return NET_RX_DROP;
+ return NET_RX_SUCCESS;
free_it:
kfree_skb(skb);
+drop:
+ return NET_RX_DROP;
}
/**
* atalk_rcv - Receive a packet (in skb) from device dev
- * @skb - packet received
- * @dev - network device where the packet comes from
- * @pt - packet type
+ * @skb: packet received
+ * @dev: network device where the packet comes from
+ * @pt: packet type
+ * @orig_dev: the original receive net device
*
* Receive a packet (in skb) from device dev. This has come from the SNAP
- * decoder, and on entry skb->h.raw is the DDP header, skb->len is the DDP
- * header, skb->len is the DDP length. The physical headers have been
- * extracted. PPP should probably pass frames marked as for this layer.
- * [ie ARPHRD_ETHERTALK]
+ * decoder, and on entry skb->transport_header is the DDP header, skb->len
+ * is the DDP header, skb->len is the DDP length. The physical headers
+ * have been extracted. PPP should probably pass frames marked as for this
+ * layer. [ie ARPHRD_ETHERTALK]
*/
static int atalk_rcv(struct sk_buff *skb, struct net_device *dev,
struct packet_type *pt, struct net_device *orig_dev)
@@ -1394,16 +1406,19 @@ static int atalk_rcv(struct sk_buff *skb, struct net_device *dev,
struct sock *sock;
struct atalk_iface *atif;
struct sockaddr_at tosat;
- int origlen;
+ int origlen;
__u16 len_hops;
+ if (!net_eq(dev_net(dev), &init_net))
+ goto drop;
+
/* Don't mangle buffer if shared */
- if (!(skb = skb_share_check(skb, GFP_ATOMIC)))
+ if (!(skb = skb_share_check(skb, GFP_ATOMIC)))
goto out;
-
+
/* Size check and make sure header is contiguous */
if (!pskb_may_pull(skb, sizeof(*ddp)))
- goto freeit;
+ goto drop;
ddp = ddp_hdr(skb);
@@ -1416,10 +1431,13 @@ static int atalk_rcv(struct sk_buff *skb, struct net_device *dev,
/*
* Size check to see if ddp->deh_len was crap
* (Otherwise we'll detonate most spectacularly
- * in the middle of recvmsg()).
+ * in the middle of atalk_checksum() or recvmsg()).
*/
- if (skb->len < sizeof(*ddp))
- goto freeit;
+ if (skb->len < sizeof(*ddp) || skb->len < (len_hops & 1023)) {
+ pr_debug("AppleTalk: dropping corrupted frame (deh_len=%u, "
+ "skb->len=%u)\n", len_hops & 1023, skb->len);
+ goto drop;
+ }
/*
* Any checksums. Note we don't do htons() on this == is assumed to be
@@ -1428,7 +1446,7 @@ static int atalk_rcv(struct sk_buff *skb, struct net_device *dev,
if (ddp->deh_sum &&
atalk_checksum(skb, len_hops & 1023) != ddp->deh_sum)
/* Not a valid AppleTalk frame - dustbin time */
- goto freeit;
+ goto drop;
/* Check the packet is aimed at us */
if (!ddp->deh_dnet) /* Net 0 is 'this network' */
@@ -1440,13 +1458,9 @@ static int atalk_rcv(struct sk_buff *skb, struct net_device *dev,
/* Not ours, so we route the packet via the correct
* AppleTalk iface
*/
- atalk_route_packet(skb, dev, ddp, len_hops, origlen);
- goto out;
+ return atalk_route_packet(skb, dev, ddp, len_hops, origlen);
}
- /* if IP over DDP is not selected this code will be optimized out */
- if (is_ip_over_ddp(skb))
- return handle_ip_over_ddp(skb);
/*
* Which socket - atalk_search_socket() looks for a *full match*
* of the <net, node, port> tuple.
@@ -1457,18 +1471,19 @@ static int atalk_rcv(struct sk_buff *skb, struct net_device *dev,
sock = atalk_search_socket(&tosat, atif);
if (!sock) /* But not one of our sockets */
- goto freeit;
+ goto drop;
/* Queue packet (standard) */
- skb->sk = sock;
-
if (sock_queue_rcv_skb(sock, skb) < 0)
- goto freeit;
-out:
- return 0;
-freeit:
+ goto drop;
+
+ return NET_RX_SUCCESS;
+
+drop:
kfree_skb(skb);
- goto out;
+out:
+ return NET_RX_DROP;
+
}
/*
@@ -1479,8 +1494,11 @@ freeit:
static int ltalk_rcv(struct sk_buff *skb, struct net_device *dev,
struct packet_type *pt, struct net_device *orig_dev)
{
+ if (!net_eq(dev_net(dev), &init_net))
+ goto freeit;
+
/* Expand any short form frames */
- if (skb->mac.raw[2] == 1) {
+ if (skb_mac_header(skb)[2] == 1) {
struct ddpehdr *ddp;
/* Find our address */
struct atalk_addr *ap = atalk_find_dev_addr(dev);
@@ -1489,25 +1507,25 @@ static int ltalk_rcv(struct sk_buff *skb, struct net_device *dev,
goto freeit;
/* Don't mangle buffer if shared */
- if (!(skb = skb_share_check(skb, GFP_ATOMIC)))
+ if (!(skb = skb_share_check(skb, GFP_ATOMIC)))
return 0;
/*
* The push leaves us with a ddephdr not an shdr, and
* handily the port bytes in the right place preset.
*/
- ddp = (struct ddpehdr *) skb_push(skb, sizeof(*ddp) - 4);
+ ddp = skb_push(skb, sizeof(*ddp) - 4);
/* Now fill in the long header */
- /*
- * These two first. The mac overlays the new source/dest
- * network information so we MUST copy these before
- * we write the network numbers !
- */
+ /*
+ * These two first. The mac overlays the new source/dest
+ * network information so we MUST copy these before
+ * we write the network numbers !
+ */
- ddp->deh_dnode = skb->mac.raw[0]; /* From physical header */
- ddp->deh_snode = skb->mac.raw[1]; /* From physical header */
+ ddp->deh_dnode = skb_mac_header(skb)[0]; /* From physical header */
+ ddp->deh_snode = skb_mac_header(skb)[1]; /* From physical header */
ddp->deh_dnet = ap->s_net; /* Network number */
ddp->deh_snet = ap->s_net;
@@ -1518,7 +1536,7 @@ static int ltalk_rcv(struct sk_buff *skb, struct net_device *dev,
/* Non routable, so force a drop if we slip up later */
ddp->deh_len_hops = htons(skb->len + (DDP_MAXHOPS << 10));
}
- skb->h.raw = skb->data;
+ skb_reset_transport_header(skb);
return atalk_rcv(skb, dev, pt, orig_dev);
freeit:
@@ -1526,20 +1544,19 @@ freeit:
return 0;
}
-static int atalk_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
- size_t len)
+static int atalk_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
{
struct sock *sk = sock->sk;
struct atalk_sock *at = at_sk(sk);
- struct sockaddr_at *usat = (struct sockaddr_at *)msg->msg_name;
+ DECLARE_SOCKADDR(struct sockaddr_at *, usat, msg->msg_name);
int flags = msg->msg_flags;
int loopback = 0;
struct sockaddr_at local_satalk, gsat;
struct sk_buff *skb;
struct net_device *dev;
struct ddpehdr *ddp;
- int size;
- struct atalk_route *rt;
+ int size, hard_header_len;
+ struct atalk_route *rt, *rt_lo = NULL;
int err;
if (flags & ~(MSG_DONTWAIT|MSG_CMSG_COMPAT))
@@ -1548,27 +1565,28 @@ static int atalk_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr
if (len > DDP_MAXSZ)
return -EMSGSIZE;
+ lock_sock(sk);
if (usat) {
+ err = -EBUSY;
if (sock_flag(sk, SOCK_ZAPPED))
if (atalk_autobind(sk) < 0)
- return -EBUSY;
+ goto out;
+ err = -EINVAL;
if (msg->msg_namelen < sizeof(*usat) ||
usat->sat_family != AF_APPLETALK)
- return -EINVAL;
+ goto out;
- /* netatalk doesn't implement this check */
+ err = -EPERM;
+ /* netatalk didn't implement this check */
if (usat->sat_addr.s_node == ATADDR_BCAST &&
!sock_flag(sk, SOCK_BROADCAST)) {
- printk(KERN_INFO "SO_BROADCAST: Fix your netatalk as "
- "it will break before 2.2\n");
-#if 0
- return -EPERM;
-#endif
+ goto out;
}
} else {
+ err = -ENOTCONN;
if (sk->sk_state != TCP_ESTABLISHED)
- return -ENOTCONN;
+ goto out;
usat = &local_satalk;
usat->sat_family = AF_APPLETALK;
usat->sat_port = at->dest_port;
@@ -1577,7 +1595,7 @@ static int atalk_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr
}
/* Build a packet */
- SOCK_DEBUG(sk, "SK %p: Got address.\n", sk);
+ net_dbg_ratelimited("SK %p: Got address.\n", sk);
/* For headers */
size = sizeof(struct ddpehdr) + len + ddp_dl->header_length;
@@ -1592,27 +1610,44 @@ static int atalk_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr
rt = atrtr_find(&at_hint);
}
+ err = -ENETUNREACH;
if (!rt)
- return -ENETUNREACH;
+ goto out;
dev = rt->dev;
- SOCK_DEBUG(sk, "SK %p: Size needed %d, device %s\n",
+ net_dbg_ratelimited("SK %p: Size needed %d, device %s\n",
sk, size, dev->name);
- size += dev->hard_header_len;
+ hard_header_len = dev->hard_header_len;
+ /* Leave room for loopback hardware header if necessary */
+ if (usat->sat_addr.s_node == ATADDR_BCAST &&
+ (dev->flags & IFF_LOOPBACK || !(rt->flags & RTF_GATEWAY))) {
+ struct atalk_addr at_lo;
+
+ at_lo.s_node = 0;
+ at_lo.s_net = 0;
+
+ rt_lo = atrtr_find(&at_lo);
+
+ if (rt_lo && rt_lo->dev->hard_header_len > hard_header_len)
+ hard_header_len = rt_lo->dev->hard_header_len;
+ }
+
+ size += hard_header_len;
+ release_sock(sk);
skb = sock_alloc_send_skb(sk, size, (flags & MSG_DONTWAIT), &err);
+ lock_sock(sk);
if (!skb)
- return err;
-
- skb->sk = sk;
+ goto out;
+
skb_reserve(skb, ddp_dl->header_length);
- skb_reserve(skb, dev->hard_header_len);
+ skb_reserve(skb, hard_header_len);
skb->dev = dev;
- SOCK_DEBUG(sk, "SK %p: Begin build.\n", sk);
+ net_dbg_ratelimited("SK %p: Begin build.\n", sk);
- ddp = (struct ddpehdr *)skb_put(skb, sizeof(struct ddpehdr));
+ ddp = skb_put(skb, sizeof(struct ddpehdr));
ddp->deh_len_hops = htons(len + sizeof(*ddp));
ddp->deh_dnet = usat->sat_addr.s_net;
ddp->deh_snet = at->src_net;
@@ -1621,15 +1656,16 @@ static int atalk_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr
ddp->deh_dport = usat->sat_port;
ddp->deh_sport = at->src_port;
- SOCK_DEBUG(sk, "SK %p: Copy user data (%Zd bytes).\n", sk, len);
+ net_dbg_ratelimited("SK %p: Copy user data (%zd bytes).\n", sk, len);
- err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
+ err = memcpy_from_msg(skb_put(skb, len), msg, len);
if (err) {
kfree_skb(skb);
- return -EFAULT;
+ err = -EFAULT;
+ goto out;
}
- if (sk->sk_no_check == 1)
+ if (sk->sk_no_check_tx)
ddp->deh_sum = 0;
else
ddp->deh_sum = atalk_checksum(skb, len + sizeof(*ddp));
@@ -1644,62 +1680,62 @@ static int atalk_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr
if (skb2) {
loopback = 1;
- SOCK_DEBUG(sk, "SK %p: send out(copy).\n", sk);
- if (aarp_send_ddp(dev, skb2,
- &usat->sat_addr, NULL) == -1)
- kfree_skb(skb2);
- /* else queued/sent above in the aarp queue */
+ net_dbg_ratelimited("SK %p: send out(copy).\n", sk);
+ /*
+ * If it fails it is queued/sent above in the aarp queue
+ */
+ aarp_send_ddp(dev, skb2, &usat->sat_addr, NULL);
}
}
if (dev->flags & IFF_LOOPBACK || loopback) {
- SOCK_DEBUG(sk, "SK %p: Loop back.\n", sk);
+ net_dbg_ratelimited("SK %p: Loop back.\n", sk);
/* loop back */
skb_orphan(skb);
if (ddp->deh_dnode == ATADDR_BCAST) {
- struct atalk_addr at_lo;
-
- at_lo.s_node = 0;
- at_lo.s_net = 0;
-
- rt = atrtr_find(&at_lo);
- if (!rt) {
+ if (!rt_lo) {
kfree_skb(skb);
- return -ENETUNREACH;
+ err = -ENETUNREACH;
+ goto out;
}
- dev = rt->dev;
+ dev = rt_lo->dev;
skb->dev = dev;
}
ddp_dl->request(ddp_dl, skb, dev->dev_addr);
} else {
- SOCK_DEBUG(sk, "SK %p: send out.\n", sk);
+ net_dbg_ratelimited("SK %p: send out.\n", sk);
if (rt->flags & RTF_GATEWAY) {
gsat.sat_addr = rt->gateway;
usat = &gsat;
}
- if (aarp_send_ddp(dev, skb, &usat->sat_addr, NULL) == -1)
- kfree_skb(skb);
- /* else queued/sent above in the aarp queue */
+ /*
+ * If it fails it is queued/sent above in the aarp queue
+ */
+ aarp_send_ddp(dev, skb, &usat->sat_addr, NULL);
}
- SOCK_DEBUG(sk, "SK %p: Done write (%Zd).\n", sk, len);
+ net_dbg_ratelimited("SK %p: Done write (%zd).\n", sk, len);
- return len;
+out:
+ release_sock(sk);
+ return err ? : len;
}
-static int atalk_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
- size_t size, int flags)
+static int atalk_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
+ int flags)
{
struct sock *sk = sock->sk;
- struct sockaddr_at *sat = (struct sockaddr_at *)msg->msg_name;
struct ddpehdr *ddp;
int copied = 0;
int offset = 0;
int err = 0;
- struct sk_buff *skb = skb_recv_datagram(sk, flags & ~MSG_DONTWAIT,
- flags & MSG_DONTWAIT, &err);
+ struct sk_buff *skb;
+
+ skb = skb_recv_datagram(sk, flags, &err);
+ lock_sock(sk);
+
if (!skb)
- return err;
+ goto out;
/* FIXME: use skb->cb to be able to use shared skbs */
ddp = ddp_hdr(skb);
@@ -1714,19 +1750,21 @@ static int atalk_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr
copied = size;
msg->msg_flags |= MSG_TRUNC;
}
- err = skb_copy_datagram_iovec(skb, offset, msg->msg_iov, copied);
-
- if (!err) {
- if (sat) {
- sat->sat_family = AF_APPLETALK;
- sat->sat_port = ddp->deh_sport;
- sat->sat_addr.s_node = ddp->deh_snode;
- sat->sat_addr.s_net = ddp->deh_snet;
- }
- msg->msg_namelen = sizeof(*sat);
+ err = skb_copy_datagram_msg(skb, offset, msg, copied);
+
+ if (!err && msg->msg_name) {
+ DECLARE_SOCKADDR(struct sockaddr_at *, sat, msg->msg_name);
+ sat->sat_family = AF_APPLETALK;
+ sat->sat_port = ddp->deh_sport;
+ sat->sat_addr.s_node = ddp->deh_snode;
+ sat->sat_addr.s_net = ddp->deh_snet;
+ msg->msg_namelen = sizeof(*sat);
}
skb_free_datagram(sk, skb); /* Free the datagram. */
+
+out:
+ release_sock(sk);
return err ? : copied;
}
@@ -1741,51 +1779,46 @@ static int atalk_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
void __user *argp = (void __user *)arg;
switch (cmd) {
- /* Protocol layer */
- case TIOCOUTQ: {
- long amount = sk->sk_sndbuf -
- atomic_read(&sk->sk_wmem_alloc);
-
- if (amount < 0)
- amount = 0;
- rc = put_user(amount, (int __user *)argp);
- break;
- }
- case TIOCINQ: {
- /*
- * These two are safe on a single CPU system as only
- * user tasks fiddle here
- */
- struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
- long amount = 0;
-
- if (skb)
- amount = skb->len - sizeof(struct ddpehdr);
- rc = put_user(amount, (int __user *)argp);
- break;
- }
- case SIOCGSTAMP:
- rc = sock_get_timestamp(sk, argp);
- break;
- /* Routing */
- case SIOCADDRT:
- case SIOCDELRT:
- rc = -EPERM;
- if (capable(CAP_NET_ADMIN))
- rc = atrtr_ioctl(cmd, argp);
- break;
- /* Interface */
- case SIOCGIFADDR:
- case SIOCSIFADDR:
- case SIOCGIFBRDADDR:
- case SIOCATALKDIFADDR:
- case SIOCDIFADDR:
- case SIOCSARP: /* proxy AARP */
- case SIOCDARP: /* proxy AARP */
- rtnl_lock();
- rc = atif_ioctl(cmd, argp);
- rtnl_unlock();
- break;
+ /* Protocol layer */
+ case TIOCOUTQ: {
+ long amount = sk->sk_sndbuf - sk_wmem_alloc_get(sk);
+
+ if (amount < 0)
+ amount = 0;
+ rc = put_user(amount, (int __user *)argp);
+ break;
+ }
+ case TIOCINQ: {
+ struct sk_buff *skb;
+ long amount = 0;
+
+ spin_lock_irq(&sk->sk_receive_queue.lock);
+ skb = skb_peek(&sk->sk_receive_queue);
+ if (skb)
+ amount = skb->len - sizeof(struct ddpehdr);
+ spin_unlock_irq(&sk->sk_receive_queue.lock);
+ rc = put_user(amount, (int __user *)argp);
+ break;
+ }
+ /* Routing */
+ case SIOCADDRT:
+ case SIOCDELRT:
+ rc = -EPERM;
+ if (capable(CAP_NET_ADMIN))
+ rc = atrtr_ioctl(cmd, argp);
+ break;
+ /* Interface */
+ case SIOCGIFADDR:
+ case SIOCSIFADDR:
+ case SIOCGIFBRDADDR:
+ case SIOCATALKDIFADDR:
+ case SIOCDIFADDR:
+ case SIOCSARP: /* proxy AARP */
+ case SIOCDARP: /* proxy AARP */
+ rtnl_lock();
+ rc = atif_ioctl(cmd, argp);
+ rtnl_unlock();
+ break;
}
return rc;
@@ -1793,27 +1826,67 @@ static int atalk_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
#ifdef CONFIG_COMPAT
+static int atalk_compat_routing_ioctl(struct sock *sk, unsigned int cmd,
+ struct compat_rtentry __user *ur)
+{
+ compat_uptr_t rtdev;
+ struct rtentry rt;
+
+ if (copy_from_user(&rt.rt_dst, &ur->rt_dst,
+ 3 * sizeof(struct sockaddr)) ||
+ get_user(rt.rt_flags, &ur->rt_flags) ||
+ get_user(rt.rt_metric, &ur->rt_metric) ||
+ get_user(rt.rt_mtu, &ur->rt_mtu) ||
+ get_user(rt.rt_window, &ur->rt_window) ||
+ get_user(rt.rt_irtt, &ur->rt_irtt) ||
+ get_user(rtdev, &ur->rt_dev))
+ return -EFAULT;
+
+ switch (cmd) {
+ case SIOCDELRT:
+ if (rt.rt_dst.sa_family != AF_APPLETALK)
+ return -EINVAL;
+ return atrtr_delete(&((struct sockaddr_at *)
+ &rt.rt_dst)->sat_addr);
+
+ case SIOCADDRT:
+ rt.rt_dev = compat_ptr(rtdev);
+ return atrtr_ioctl_addrt(&rt);
+ default:
+ return -EINVAL;
+ }
+}
static int atalk_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
+ void __user *argp = compat_ptr(arg);
+ struct sock *sk = sock->sk;
+
+ switch (cmd) {
+ case SIOCADDRT:
+ case SIOCDELRT:
+ return atalk_compat_routing_ioctl(sk, cmd, argp);
/*
- * All Appletalk ioctls except SIOCATALKDIFADDR are standard. And
- * SIOCATALKDIFADDR is handled by upper layer as well, so there is
- * nothing to do. Eventually SIOCATALKDIFADDR should be moved
- * here so there is no generic SIOCPROTOPRIVATE translation in the
- * system.
+ * SIOCATALKDIFADDR is a SIOCPROTOPRIVATE ioctl number, so we
+ * cannot handle it in common code. The data we access if ifreq
+ * here is compatible, so we can simply call the native
+ * handler.
*/
- return -ENOIOCTLCMD;
+ case SIOCATALKDIFADDR:
+ return atalk_ioctl(sock, cmd, (unsigned long)argp);
+ default:
+ return -ENOIOCTLCMD;
+ }
}
-#endif
+#endif /* CONFIG_COMPAT */
-static struct net_proto_family atalk_family_ops = {
+static const struct net_proto_family atalk_family_ops = {
.family = PF_APPLETALK,
.create = atalk_create,
.owner = THIS_MODULE,
};
-static const struct proto_ops SOCKOPS_WRAPPED(atalk_dgram_ops) = {
+static const struct proto_ops atalk_dgram_ops = {
.family = PF_APPLETALK,
.owner = THIS_MODULE,
.release = atalk_release,
@@ -1824,68 +1897,92 @@ static const struct proto_ops SOCKOPS_WRAPPED(atalk_dgram_ops) = {
.getname = atalk_getname,
.poll = datagram_poll,
.ioctl = atalk_ioctl,
+ .gettstamp = sock_gettstamp,
#ifdef CONFIG_COMPAT
.compat_ioctl = atalk_compat_ioctl,
#endif
.listen = sock_no_listen,
.shutdown = sock_no_shutdown,
- .setsockopt = sock_no_setsockopt,
- .getsockopt = sock_no_getsockopt,
.sendmsg = atalk_sendmsg,
.recvmsg = atalk_recvmsg,
.mmap = sock_no_mmap,
- .sendpage = sock_no_sendpage,
};
-#include <linux/smp_lock.h>
-SOCKOPS_WRAP(atalk_dgram, PF_APPLETALK);
-
static struct notifier_block ddp_notifier = {
.notifier_call = ddp_device_event,
};
-static struct packet_type ltalk_packet_type = {
- .type = __constant_htons(ETH_P_LOCALTALK),
+static struct packet_type ltalk_packet_type __read_mostly = {
+ .type = cpu_to_be16(ETH_P_LOCALTALK),
.func = ltalk_rcv,
};
-static struct packet_type ppptalk_packet_type = {
- .type = __constant_htons(ETH_P_PPPTALK),
+static struct packet_type ppptalk_packet_type __read_mostly = {
+ .type = cpu_to_be16(ETH_P_PPPTALK),
.func = atalk_rcv,
};
static unsigned char ddp_snap_id[] = { 0x08, 0x00, 0x07, 0x80, 0x9B };
/* Export symbols for use by drivers when AppleTalk is a module */
-EXPORT_SYMBOL(aarp_send_ddp);
EXPORT_SYMBOL(atrtr_get_dev);
EXPORT_SYMBOL(atalk_find_dev_addr);
-static char atalk_err_snap[] __initdata =
- KERN_CRIT "Unable to register DDP with SNAP.\n";
-
/* Called by proto.c on kernel start up */
static int __init atalk_init(void)
{
- int rc = proto_register(&ddp_proto, 0);
+ int rc;
- if (rc != 0)
+ rc = proto_register(&ddp_proto, 0);
+ if (rc)
goto out;
- (void)sock_register(&atalk_family_ops);
+ rc = sock_register(&atalk_family_ops);
+ if (rc)
+ goto out_proto;
+
ddp_dl = register_snap_client(ddp_snap_id, atalk_rcv);
- if (!ddp_dl)
- printk(atalk_err_snap);
+ if (!ddp_dl) {
+ pr_crit("Unable to register DDP with SNAP.\n");
+ rc = -ENOMEM;
+ goto out_sock;
+ }
dev_add_pack(&ltalk_packet_type);
dev_add_pack(&ppptalk_packet_type);
- register_netdevice_notifier(&ddp_notifier);
- aarp_proto_init();
- atalk_proc_init();
- atalk_register_sysctl();
+ rc = register_netdevice_notifier(&ddp_notifier);
+ if (rc)
+ goto out_snap;
+
+ rc = aarp_proto_init();
+ if (rc)
+ goto out_dev;
+
+ rc = atalk_proc_init();
+ if (rc)
+ goto out_aarp;
+
+ rc = atalk_register_sysctl();
+ if (rc)
+ goto out_proc;
out:
return rc;
+out_proc:
+ atalk_proc_exit();
+out_aarp:
+ aarp_cleanup_module();
+out_dev:
+ unregister_netdevice_notifier(&ddp_notifier);
+out_snap:
+ dev_remove_pack(&ppptalk_packet_type);
+ dev_remove_pack(&ltalk_packet_type);
+ unregister_snap_client(ddp_dl);
+out_sock:
+ sock_unregister(PF_APPLETALK);
+out_proto:
+ proto_unregister(&ddp_proto);
+ goto out;
}
module_init(atalk_init);
@@ -1896,7 +1993,7 @@ module_init(atalk_init);
* by the network device layer.
*
* Ergo, before the AppleTalk module can be removed, all AppleTalk
- * sockets be closed from user space.
+ * sockets should be closed from user space.
*/
static void __exit atalk_exit(void)
{
@@ -1915,6 +2012,6 @@ static void __exit atalk_exit(void)
module_exit(atalk_exit);
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Alan Cox <Alan.Cox@linux.org>");
+MODULE_AUTHOR("Alan Cox <alan@lxorguk.ukuu.org.uk>");
MODULE_DESCRIPTION("AppleTalk 0.20\n");
MODULE_ALIAS_NETPROTO(PF_APPLETALK);
diff --git a/net/appletalk/dev.c b/net/appletalk/dev.c
deleted file mode 100644
index 1237e208e246..000000000000
--- a/net/appletalk/dev.c
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Moved here from drivers/net/net_init.c, which is:
- * Written 1993,1994,1995 by Donald Becker.
- */
-
-#include <linux/errno.h>
-#include <linux/module.h>
-#include <linux/netdevice.h>
-#include <linux/if_arp.h>
-#include <linux/if_ltalk.h>
-
-static int ltalk_change_mtu(struct net_device *dev, int mtu)
-{
- return -EINVAL;
-}
-
-static int ltalk_mac_addr(struct net_device *dev, void *addr)
-{
- return -EINVAL;
-}
-
-static void ltalk_setup(struct net_device *dev)
-{
- /* Fill in the fields of the device structure with localtalk-generic values. */
-
- dev->change_mtu = ltalk_change_mtu;
- dev->hard_header = NULL;
- dev->rebuild_header = NULL;
- dev->set_mac_address = ltalk_mac_addr;
- dev->hard_header_cache = NULL;
- dev->header_cache_update= NULL;
-
- dev->type = ARPHRD_LOCALTLK;
- dev->hard_header_len = LTALK_HLEN;
- dev->mtu = LTALK_MTU;
- dev->addr_len = LTALK_ALEN;
- dev->tx_queue_len = 10;
-
- dev->broadcast[0] = 0xFF;
-
- dev->flags = IFF_BROADCAST|IFF_MULTICAST|IFF_NOARP;
-}
-
-/**
- * alloc_ltalkdev - Allocates and sets up an localtalk device
- * @sizeof_priv: Size of additional driver-private structure to be allocated
- * for this localtalk device
- *
- * Fill in the fields of the device structure with localtalk-generic
- * values. Basically does everything except registering the device.
- *
- * Constructs a new net device, complete with a private data area of
- * size @sizeof_priv. A 32-byte (not bit) alignment is enforced for
- * this private data area.
- */
-
-struct net_device *alloc_ltalkdev(int sizeof_priv)
-{
- return alloc_netdev(sizeof_priv, "lt%d", ltalk_setup);
-}
-EXPORT_SYMBOL(alloc_ltalkdev);
diff --git a/net/appletalk/sysctl_net_atalk.c b/net/appletalk/sysctl_net_atalk.c
index 40b0af7437a2..7aebfe903242 100644
--- a/net/appletalk/sysctl_net_atalk.c
+++ b/net/appletalk/sysctl_net_atalk.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* sysctl_net_atalk.c: sysctl interface to net AppleTalk subsystem.
*
@@ -12,71 +13,46 @@
static struct ctl_table atalk_table[] = {
{
- .ctl_name = NET_ATALK_AARP_EXPIRY_TIME,
.procname = "aarp-expiry-time",
.data = &sysctl_aarp_expiry_time,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- .strategy = &sysctl_jiffies,
+ .proc_handler = proc_dointvec_jiffies,
},
{
- .ctl_name = NET_ATALK_AARP_TICK_TIME,
.procname = "aarp-tick-time",
.data = &sysctl_aarp_tick_time,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- .strategy = &sysctl_jiffies,
+ .proc_handler = proc_dointvec_jiffies,
},
{
- .ctl_name = NET_ATALK_AARP_RETRANSMIT_LIMIT,
.procname = "aarp-retransmit-limit",
.data = &sysctl_aarp_retransmit_limit,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec,
+ .proc_handler = proc_dointvec,
},
{
- .ctl_name = NET_ATALK_AARP_RESOLVE_TIME,
.procname = "aarp-resolve-time",
.data = &sysctl_aarp_resolve_time,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- .strategy = &sysctl_jiffies,
+ .proc_handler = proc_dointvec_jiffies,
},
- { 0 },
-};
-
-static struct ctl_table atalk_dir_table[] = {
- {
- .ctl_name = NET_ATALK,
- .procname = "appletalk",
- .mode = 0555,
- .child = atalk_table,
- },
- { 0 },
-};
-
-static struct ctl_table atalk_root_table[] = {
- {
- .ctl_name = CTL_NET,
- .procname = "net",
- .mode = 0555,
- .child = atalk_dir_table,
- },
- { 0 },
};
static struct ctl_table_header *atalk_table_header;
-void atalk_register_sysctl(void)
+int __init atalk_register_sysctl(void)
{
- atalk_table_header = register_sysctl_table(atalk_root_table, 1);
+ atalk_table_header = register_net_sysctl(&init_net, "net/appletalk", atalk_table);
+ if (!atalk_table_header)
+ return -ENOMEM;
+ return 0;
}
void atalk_unregister_sysctl(void)
{
- unregister_sysctl_table(atalk_table_header);
+ unregister_net_sysctl_table(atalk_table_header);
}
diff --git a/net/atm/Kconfig b/net/atm/Kconfig
index 21ff276b2d80..77343d57ff2a 100644
--- a/net/atm/Kconfig
+++ b/net/atm/Kconfig
@@ -1,11 +1,11 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
-# Asynchronous Transfer Mode (ATM) (EXPERIMENTAL)
+# Asynchronous Transfer Mode (ATM)
#
config ATM
- tristate "Asynchronous Transfer Mode (ATM) (EXPERIMENTAL)"
- depends on EXPERIMENTAL
- ---help---
+ tristate "Asynchronous Transfer Mode (ATM)"
+ help
ATM is a high-speed networking technology for Local Area Networks
and Wide Area Networks. It uses a fixed packet size and is
connection oriented, allowing for the negotiation of minimum
@@ -16,11 +16,11 @@ config ATM
of your ATM card below.
Note that you need a set of user-space programs to actually make use
- of ATM. See the file <file:Documentation/networking/atm.txt> for
+ of ATM. See the file <file:Documentation/networking/atm.rst> for
further details.
config ATM_CLIP
- tristate "Classical IP over ATM (EXPERIMENTAL)"
+ tristate "Classical IP over ATM"
depends on ATM && INET
help
Classical IP over ATM for PVCs and SVCs, supporting InARP and
@@ -29,7 +29,7 @@ config ATM_CLIP
(LANE)" below.
config ATM_CLIP_NO_ICMP
- bool "Do NOT send ICMP if no neighbour (EXPERIMENTAL)"
+ bool "Do NOT send ICMP if no neighbour"
depends on ATM_CLIP
help
Normally, an "ICMP host unreachable" message is sent if a neighbour
@@ -39,7 +39,7 @@ config ATM_CLIP_NO_ICMP
such neighbours are silently discarded instead.
config ATM_LANE
- tristate "LAN Emulation (LANE) support (EXPERIMENTAL)"
+ tristate "LAN Emulation (LANE) support"
depends on ATM
help
LAN Emulation emulates services of existing LANs across an ATM
@@ -48,7 +48,7 @@ config ATM_LANE
ELAN and Ethernet segments. You need LANE if you want to try MPOA.
config ATM_MPOA
- tristate "Multi-Protocol Over ATM (MPOA) support (EXPERIMENTAL)"
+ tristate "Multi-Protocol Over ATM (MPOA) support"
depends on ATM && INET && ATM_LANE!=n
help
Multi-Protocol Over ATM allows ATM edge devices such as routers,
diff --git a/net/atm/Makefile b/net/atm/Makefile
index 89656d6c0b90..bfec0f2d83b5 100644
--- a/net/atm/Makefile
+++ b/net/atm/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
#
# Makefile for the ATM Protocol Families.
#
@@ -7,10 +8,7 @@ mpoa-objs := mpc.o mpoa_caches.o mpoa_proc.o
obj-$(CONFIG_ATM) += atm.o
obj-$(CONFIG_ATM_CLIP) += clip.o
-atm-$(subst m,y,$(CONFIG_ATM_CLIP)) += ipcommon.o
obj-$(CONFIG_ATM_BR2684) += br2684.o
-atm-$(subst m,y,$(CONFIG_ATM_BR2684)) += ipcommon.o
-atm-$(subst m,y,$(CONFIG_NET_SCH_ATM)) += ipcommon.o
atm-$(CONFIG_PROC_FS) += proc.o
obj-$(CONFIG_ATM_LANE) += lec.o
diff --git a/net/atm/addr.c b/net/atm/addr.c
index 3060fd0ba4b9..0530b63f509a 100644
--- a/net/atm/addr.c
+++ b/net/atm/addr.c
@@ -1,16 +1,17 @@
+// SPDX-License-Identifier: GPL-2.0
/* net/atm/addr.c - Local ATM address registry */
/* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */
#include <linux/atm.h>
#include <linux/atmdev.h>
-#include <linux/sched.h>
-#include <asm/uaccess.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
#include "signaling.h"
#include "addr.h"
-static int check_addr(struct sockaddr_atmsvc *addr)
+static int check_addr(const struct sockaddr_atmsvc *addr)
{
int i;
@@ -24,7 +25,7 @@ static int check_addr(struct sockaddr_atmsvc *addr)
return -EINVAL;
}
-static int identical(struct sockaddr_atmsvc *a, struct sockaddr_atmsvc *b)
+static int identical(const struct sockaddr_atmsvc *a, const struct sockaddr_atmsvc *b)
{
if (*a->sas_addr.prv)
if (memcmp(a->sas_addr.prv, b->sas_addr.prv, ATM_ESA_LEN))
@@ -36,7 +37,7 @@ static int identical(struct sockaddr_atmsvc *a, struct sockaddr_atmsvc *b)
return !strcmp(a->sas_addr.pub, b->sas_addr.pub);
}
-static void notify_sigd(struct atm_dev *dev)
+static void notify_sigd(const struct atm_dev *dev)
{
struct sockaddr_atmpvc pvc;
@@ -64,7 +65,7 @@ void atm_reset_addr(struct atm_dev *dev, enum atm_addr_type_t atype)
notify_sigd(dev);
}
-int atm_add_addr(struct atm_dev *dev, struct sockaddr_atmsvc *addr,
+int atm_add_addr(struct atm_dev *dev, const struct sockaddr_atmsvc *addr,
enum atm_addr_type_t atype)
{
unsigned long flags;
@@ -99,7 +100,7 @@ int atm_add_addr(struct atm_dev *dev, struct sockaddr_atmsvc *addr,
return 0;
}
-int atm_del_addr(struct atm_dev *dev, struct sockaddr_atmsvc *addr,
+int atm_del_addr(struct atm_dev *dev, const struct sockaddr_atmsvc *addr,
enum atm_addr_type_t atype)
{
unsigned long flags;
diff --git a/net/atm/addr.h b/net/atm/addr.h
index f39433ad45da..da3f848411a0 100644
--- a/net/atm/addr.h
+++ b/net/atm/addr.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/* net/atm/addr.h - Local ATM address registry */
/* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */
@@ -10,9 +11,9 @@
#include <linux/atmdev.h>
void atm_reset_addr(struct atm_dev *dev, enum atm_addr_type_t type);
-int atm_add_addr(struct atm_dev *dev, struct sockaddr_atmsvc *addr,
+int atm_add_addr(struct atm_dev *dev, const struct sockaddr_atmsvc *addr,
enum atm_addr_type_t type);
-int atm_del_addr(struct atm_dev *dev, struct sockaddr_atmsvc *addr,
+int atm_del_addr(struct atm_dev *dev, const struct sockaddr_atmsvc *addr,
enum atm_addr_type_t type);
int atm_get_addr(struct atm_dev *dev, struct sockaddr_atmsvc __user *buf,
size_t size, enum atm_addr_type_t type);
diff --git a/net/atm/atm_misc.c b/net/atm/atm_misc.c
index 02cc7e71efea..a30b83c1cb3f 100644
--- a/net/atm/atm_misc.c
+++ b/net/atm/atm_misc.c
@@ -1,38 +1,37 @@
+// SPDX-License-Identifier: GPL-2.0
/* net/atm/atm_misc.c - Various functions for use by ATM drivers */
/* Written 1995-2000 by Werner Almesberger, EPFL ICA */
-
#include <linux/module.h>
#include <linux/atm.h>
#include <linux/atmdev.h>
#include <linux/skbuff.h>
#include <linux/sonet.h>
#include <linux/bitops.h>
-#include <asm/atomic.h>
-#include <asm/errno.h>
-
+#include <linux/errno.h>
+#include <linux/atomic.h>
-int atm_charge(struct atm_vcc *vcc,int truesize)
+int atm_charge(struct atm_vcc *vcc, int truesize)
{
- atm_force_charge(vcc,truesize);
+ atm_force_charge(vcc, truesize);
if (atomic_read(&sk_atm(vcc)->sk_rmem_alloc) <= sk_atm(vcc)->sk_rcvbuf)
return 1;
- atm_return(vcc,truesize);
+ atm_return(vcc, truesize);
atomic_inc(&vcc->stats->rx_drop);
return 0;
}
+EXPORT_SYMBOL(atm_charge);
-
-struct sk_buff *atm_alloc_charge(struct atm_vcc *vcc,int pdu_size,
- gfp_t gfp_flags)
+struct sk_buff *atm_alloc_charge(struct atm_vcc *vcc, int pdu_size,
+ gfp_t gfp_flags)
{
struct sock *sk = sk_atm(vcc);
- int guess = atm_guess_pdu2truesize(pdu_size);
+ int guess = SKB_TRUESIZE(pdu_size);
- atm_force_charge(vcc,guess);
+ atm_force_charge(vcc, guess);
if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) {
- struct sk_buff *skb = alloc_skb(pdu_size,gfp_flags);
+ struct sk_buff *skb = alloc_skb(pdu_size, gfp_flags);
if (skb) {
atomic_add(skb->truesize-guess,
@@ -40,10 +39,11 @@ struct sk_buff *atm_alloc_charge(struct atm_vcc *vcc,int pdu_size,
return skb;
}
}
- atm_return(vcc,guess);
+ atm_return(vcc, guess);
atomic_inc(&vcc->stats->rx_drop);
return NULL;
}
+EXPORT_SYMBOL(atm_alloc_charge);
/*
@@ -73,7 +73,6 @@ struct sk_buff *atm_alloc_charge(struct atm_vcc *vcc,int pdu_size,
* else *
*/
-
int atm_pcr_goal(const struct atm_trafprm *tp)
{
if (tp->pcr && tp->pcr != ATM_MAX_PCR)
@@ -84,26 +83,20 @@ int atm_pcr_goal(const struct atm_trafprm *tp)
return -tp->max_pcr;
return 0;
}
+EXPORT_SYMBOL(atm_pcr_goal);
-
-void sonet_copy_stats(struct k_sonet_stats *from,struct sonet_stats *to)
+void sonet_copy_stats(struct k_sonet_stats *from, struct sonet_stats *to)
{
#define __HANDLE_ITEM(i) to->i = atomic_read(&from->i)
__SONET_ITEMS
#undef __HANDLE_ITEM
}
+EXPORT_SYMBOL(sonet_copy_stats);
-
-void sonet_subtract_stats(struct k_sonet_stats *from,struct sonet_stats *to)
+void sonet_subtract_stats(struct k_sonet_stats *from, struct sonet_stats *to)
{
-#define __HANDLE_ITEM(i) atomic_sub(to->i,&from->i)
+#define __HANDLE_ITEM(i) atomic_sub(to->i, &from->i)
__SONET_ITEMS
#undef __HANDLE_ITEM
}
-
-
-EXPORT_SYMBOL(atm_charge);
-EXPORT_SYMBOL(atm_alloc_charge);
-EXPORT_SYMBOL(atm_pcr_goal);
-EXPORT_SYMBOL(sonet_copy_stats);
EXPORT_SYMBOL(sonet_subtract_stats);
diff --git a/net/atm/atm_sysfs.c b/net/atm/atm_sysfs.c
index 62f6ed1f2f98..54e7fb1a4ee5 100644
--- a/net/atm/atm_sysfs.c
+++ b/net/atm/atm_sysfs.c
@@ -1,6 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0
/* ATM driver model support. */
#include <linux/kernel.h>
+#include <linux/slab.h>
#include <linux/init.h>
#include <linux/kobject.h>
#include <linux/atmdev.h>
@@ -9,123 +11,119 @@
#define to_atm_dev(cldev) container_of(cldev, struct atm_dev, class_dev)
-static ssize_t show_type(struct class_device *cdev, char *buf)
+static ssize_t type_show(struct device *cdev,
+ struct device_attribute *attr, char *buf)
{
struct atm_dev *adev = to_atm_dev(cdev);
- return sprintf(buf, "%s\n", adev->type);
+
+ return scnprintf(buf, PAGE_SIZE, "%s\n", adev->type);
}
-static ssize_t show_address(struct class_device *cdev, char *buf)
+static ssize_t address_show(struct device *cdev,
+ struct device_attribute *attr, char *buf)
{
- char *pos = buf;
struct atm_dev *adev = to_atm_dev(cdev);
- int i;
-
- for (i = 0; i < (ESI_LEN - 1); i++)
- pos += sprintf(pos, "%02x:", adev->esi[i]);
- pos += sprintf(pos, "%02x\n", adev->esi[i]);
- return pos - buf;
+ return scnprintf(buf, PAGE_SIZE, "%pM\n", adev->esi);
}
-static ssize_t show_atmaddress(struct class_device *cdev, char *buf)
+static ssize_t atmaddress_show(struct device *cdev,
+ struct device_attribute *attr, char *buf)
{
- unsigned long flags;
- char *pos = buf;
+ unsigned long flags;
struct atm_dev *adev = to_atm_dev(cdev);
- struct atm_dev_addr *aaddr;
- int bin[] = { 1, 2, 10, 6, 1 }, *fmt = bin;
- int i, j;
-
- spin_lock_irqsave(&adev->lock, flags);
- list_for_each_entry(aaddr, &adev->local, entry) {
- for(i = 0, j = 0; i < ATM_ESA_LEN; ++i, ++j) {
- if (j == *fmt) {
- pos += sprintf(pos, ".");
- ++fmt;
- j = 0;
- }
- pos += sprintf(pos, "%02x", aaddr->addr.sas_addr.prv[i]);
- }
- pos += sprintf(pos, "\n");
+ struct atm_dev_addr *aaddr;
+ int count = 0;
+
+ spin_lock_irqsave(&adev->lock, flags);
+ list_for_each_entry(aaddr, &adev->local, entry) {
+ count += scnprintf(buf + count, PAGE_SIZE - count,
+ "%1phN.%2phN.%10phN.%6phN.%1phN\n",
+ &aaddr->addr.sas_addr.prv[0],
+ &aaddr->addr.sas_addr.prv[1],
+ &aaddr->addr.sas_addr.prv[3],
+ &aaddr->addr.sas_addr.prv[13],
+ &aaddr->addr.sas_addr.prv[19]);
}
- spin_unlock_irqrestore(&adev->lock, flags);
+ spin_unlock_irqrestore(&adev->lock, flags);
- return pos - buf;
+ return count;
}
-static ssize_t show_carrier(struct class_device *cdev, char *buf)
+static ssize_t atmindex_show(struct device *cdev,
+ struct device_attribute *attr, char *buf)
{
- char *pos = buf;
struct atm_dev *adev = to_atm_dev(cdev);
- pos += sprintf(pos, "%d\n",
- adev->signal == ATM_PHY_SIG_LOST ? 0 : 1);
-
- return pos - buf;
+ return scnprintf(buf, PAGE_SIZE, "%d\n", adev->number);
}
-static ssize_t show_link_rate(struct class_device *cdev, char *buf)
+static ssize_t carrier_show(struct device *cdev,
+ struct device_attribute *attr, char *buf)
+{
+ struct atm_dev *adev = to_atm_dev(cdev);
+
+ return scnprintf(buf, PAGE_SIZE, "%d\n",
+ adev->signal == ATM_PHY_SIG_LOST ? 0 : 1);
+}
+
+static ssize_t link_rate_show(struct device *cdev,
+ struct device_attribute *attr, char *buf)
{
- char *pos = buf;
struct atm_dev *adev = to_atm_dev(cdev);
int link_rate;
/* show the link rate, not the data rate */
switch (adev->link_rate) {
- case ATM_OC3_PCR:
- link_rate = 155520000;
- break;
- case ATM_OC12_PCR:
- link_rate = 622080000;
- break;
- case ATM_25_PCR:
- link_rate = 25600000;
- break;
- default:
- link_rate = adev->link_rate * 8 * 53;
+ case ATM_OC3_PCR:
+ link_rate = 155520000;
+ break;
+ case ATM_OC12_PCR:
+ link_rate = 622080000;
+ break;
+ case ATM_25_PCR:
+ link_rate = 25600000;
+ break;
+ default:
+ link_rate = adev->link_rate * 8 * 53;
}
- pos += sprintf(pos, "%d\n", link_rate);
-
- return pos - buf;
+ return scnprintf(buf, PAGE_SIZE, "%d\n", link_rate);
}
-static CLASS_DEVICE_ATTR(address, S_IRUGO, show_address, NULL);
-static CLASS_DEVICE_ATTR(atmaddress, S_IRUGO, show_atmaddress, NULL);
-static CLASS_DEVICE_ATTR(carrier, S_IRUGO, show_carrier, NULL);
-static CLASS_DEVICE_ATTR(type, S_IRUGO, show_type, NULL);
-static CLASS_DEVICE_ATTR(link_rate, S_IRUGO, show_link_rate, NULL);
-
-static struct class_device_attribute *atm_attrs[] = {
- &class_device_attr_atmaddress,
- &class_device_attr_address,
- &class_device_attr_carrier,
- &class_device_attr_type,
- &class_device_attr_link_rate,
+static DEVICE_ATTR_RO(address);
+static DEVICE_ATTR_RO(atmaddress);
+static DEVICE_ATTR_RO(atmindex);
+static DEVICE_ATTR_RO(carrier);
+static DEVICE_ATTR_RO(type);
+static DEVICE_ATTR_RO(link_rate);
+
+static struct device_attribute *atm_attrs[] = {
+ &dev_attr_atmaddress,
+ &dev_attr_address,
+ &dev_attr_atmindex,
+ &dev_attr_carrier,
+ &dev_attr_type,
+ &dev_attr_link_rate,
NULL
};
-static int atm_uevent(struct class_device *cdev, char **envp, int num_envp, char *buf, int size)
+
+static int atm_uevent(const struct device *cdev, struct kobj_uevent_env *env)
{
- struct atm_dev *adev;
- int i = 0, len = 0;
+ const struct atm_dev *adev;
if (!cdev)
return -ENODEV;
adev = to_atm_dev(cdev);
- if (!adev)
- return -ENODEV;
- if (add_uevent_var(envp, num_envp, &i, buf, size, &len,
- "NAME=%s%d", adev->type, adev->number))
+ if (add_uevent_var(env, "NAME=%s%d", adev->type, adev->number))
return -ENOMEM;
- envp[i] = NULL;
return 0;
}
-static void atm_release(struct class_device *cdev)
+static void atm_release(struct device *cdev)
{
struct atm_dev *adev = to_atm_dev(cdev);
@@ -134,25 +132,26 @@ static void atm_release(struct class_device *cdev)
static struct class atm_class = {
.name = "atm",
- .release = atm_release,
- .uevent = atm_uevent,
+ .dev_release = atm_release,
+ .dev_uevent = atm_uevent,
};
-int atm_register_sysfs(struct atm_dev *adev)
+int atm_register_sysfs(struct atm_dev *adev, struct device *parent)
{
- struct class_device *cdev = &adev->class_dev;
+ struct device *cdev = &adev->class_dev;
int i, j, err;
cdev->class = &atm_class;
- class_set_devdata(cdev, adev);
+ cdev->parent = parent;
+ dev_set_drvdata(cdev, adev);
- snprintf(cdev->class_id, BUS_ID_SIZE, "%s%d", adev->type, adev->number);
- err = class_device_register(cdev);
+ dev_set_name(cdev, "%s%d", adev->type, adev->number);
+ err = device_register(cdev);
if (err < 0)
return err;
for (i = 0; atm_attrs[i]; i++) {
- err = class_device_create_file(cdev, atm_attrs[i]);
+ err = device_create_file(cdev, atm_attrs[i]);
if (err)
goto err_out;
}
@@ -161,16 +160,16 @@ int atm_register_sysfs(struct atm_dev *adev)
err_out:
for (j = 0; j < i; j++)
- class_device_remove_file(cdev, atm_attrs[j]);
- class_device_del(cdev);
+ device_remove_file(cdev, atm_attrs[j]);
+ device_del(cdev);
return err;
}
void atm_unregister_sysfs(struct atm_dev *adev)
{
- struct class_device *cdev = &adev->class_dev;
+ struct device *cdev = &adev->class_dev;
- class_device_del(cdev);
+ device_del(cdev);
}
int __init atm_sysfs_init(void)
diff --git a/net/atm/br2684.c b/net/atm/br2684.c
index b04162f10d85..f666f2f98ba5 100644
--- a/net/atm/br2684.c
+++ b/net/atm/br2684.c
@@ -1,8 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
-Experimental ethernet netdevice using ATM AAL5 as underlying carrier
-(RFC1483 obsoleted by RFC2684) for Linux 2.4
-Author: Marcell GAL, 2000, XDSL Ltd, Hungary
-*/
+ * Ethernet netdevice using ATM AAL5 as underlying carrier
+ * (RFC1483 obsoleted by RFC2684) for Linux
+ *
+ * Authors: Marcell GAL, 2000, XDSL Ltd, Hungary
+ * Eric Kinzie, 2006-2007, US Naval Research Laboratory
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__
#include <linux/module.h>
#include <linux/init.h>
@@ -13,7 +18,8 @@ Author: Marcell GAL, 2000, XDSL Ltd, Hungary
#include <linux/etherdevice.h>
#include <linux/rtnetlink.h>
#include <linux/ip.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
+#include <linux/slab.h>
#include <net/arp.h>
#include <linux/atm.h>
#include <linux/atmdev.h>
@@ -23,79 +29,72 @@ Author: Marcell GAL, 2000, XDSL Ltd, Hungary
#include <linux/atmbr2684.h>
#include "common.h"
-#include "ipcommon.h"
-
-/*
- * Define this to use a version of the code which interacts with the higher
- * layers in a more intellegent way, by always reserving enough space for
- * our header at the begining of the packet. However, there may still be
- * some problems with programs like tcpdump. In 2.5 we'll sort out what
- * we need to do to get this perfect. For now we just will copy the packet
- * if we need space for the header
- */
-/* #define FASTER_VERSION */
-#ifdef DEBUG
-#define DPRINTK(format, args...) printk(KERN_DEBUG "br2684: " format, ##args)
-#else
-#define DPRINTK(format, args...)
-#endif
-
-#ifdef SKB_DEBUG
static void skb_debug(const struct sk_buff *skb)
{
+#ifdef SKB_DEBUG
#define NUM2PRINT 50
- char buf[NUM2PRINT * 3 + 1]; /* 3 chars per byte */
- int i = 0;
- for (i = 0; i < skb->len && i < NUM2PRINT; i++) {
- sprintf(buf + i * 3, "%2.2x ", 0xff & skb->data[i]);
- }
- printk(KERN_DEBUG "br2684: skb: %s\n", buf);
-}
-#else
-#define skb_debug(skb) do {} while (0)
+ print_hex_dump(KERN_DEBUG, "br2684: skb: ", DUMP_OFFSET,
+ 16, 1, skb->data, min(NUM2PRINT, skb->len), true);
#endif
+}
-static unsigned char llc_oui_pid_pad[] =
- { 0xAA, 0xAA, 0x03, 0x00, 0x80, 0xC2, 0x00, 0x07, 0x00, 0x00 };
-#define PADLEN (2)
+#define BR2684_ETHERTYPE_LEN 2
+#define BR2684_PAD_LEN 2
+
+#define LLC 0xaa, 0xaa, 0x03
+#define SNAP_BRIDGED 0x00, 0x80, 0xc2
+#define SNAP_ROUTED 0x00, 0x00, 0x00
+#define PID_ETHERNET 0x00, 0x07
+#define ETHERTYPE_IPV4 0x08, 0x00
+#define ETHERTYPE_IPV6 0x86, 0xdd
+#define PAD_BRIDGED 0x00, 0x00
+
+static const unsigned char ethertype_ipv4[] = { ETHERTYPE_IPV4 };
+static const unsigned char ethertype_ipv6[] = { ETHERTYPE_IPV6 };
+static const unsigned char llc_oui_pid_pad[] =
+ { LLC, SNAP_BRIDGED, PID_ETHERNET, PAD_BRIDGED };
+static const unsigned char pad[] = { PAD_BRIDGED };
+static const unsigned char llc_oui_ipv4[] = { LLC, SNAP_ROUTED, ETHERTYPE_IPV4 };
+static const unsigned char llc_oui_ipv6[] = { LLC, SNAP_ROUTED, ETHERTYPE_IPV6 };
enum br2684_encaps {
- e_vc = BR2684_ENCAPS_VC,
+ e_vc = BR2684_ENCAPS_VC,
e_llc = BR2684_ENCAPS_LLC,
};
struct br2684_vcc {
- struct atm_vcc *atmvcc;
+ struct atm_vcc *atmvcc;
struct net_device *device;
- /* keep old push,pop functions for chaining */
- void (*old_push)(struct atm_vcc *vcc,struct sk_buff *skb);
- /* void (*old_pop)(struct atm_vcc *vcc,struct sk_buff *skb); */
+ /* keep old push, pop functions for chaining */
+ void (*old_push)(struct atm_vcc *vcc, struct sk_buff *skb);
+ void (*old_pop)(struct atm_vcc *vcc, struct sk_buff *skb);
+ void (*old_release_cb)(struct atm_vcc *vcc);
+ struct module *old_owner;
enum br2684_encaps encaps;
struct list_head brvccs;
#ifdef CONFIG_ATM_BR2684_IPFILTER
struct br2684_filter filter;
#endif /* CONFIG_ATM_BR2684_IPFILTER */
-#ifndef FASTER_VERSION
- unsigned copies_needed, copies_failed;
-#endif /* FASTER_VERSION */
+ unsigned int copies_needed, copies_failed;
+ atomic_t qspace;
};
struct br2684_dev {
struct net_device *net_dev;
struct list_head br2684_devs;
int number;
- struct list_head brvccs; /* one device <=> one vcc (before xmas) */
- struct net_device_stats stats;
+ struct list_head brvccs; /* one device <=> one vcc (before xmas) */
int mac_was_set;
+ enum br2684_payload payload;
};
/*
* This lock should be held for writing any time the list of devices or
* their attached vcc's could be altered. It should be held for reading
* any time these are being queried. Note that we sometimes need to
- * do read-locking under interrupt context, so write locking must block
- * the current CPU's interrupts
+ * do read-locking under interrupting context, so write locking must block
+ * the current CPU's interrupts.
*/
static DEFINE_RWLOCK(devs_lock);
@@ -103,7 +102,7 @@ static LIST_HEAD(br2684_devs);
static inline struct br2684_dev *BRPRIV(const struct net_device *net_dev)
{
- return (struct br2684_dev *) net_dev->priv;
+ return netdev_priv(net_dev);
}
static inline struct net_device *list_entry_brdev(const struct list_head *le)
@@ -113,7 +112,7 @@ static inline struct net_device *list_entry_brdev(const struct list_head *le)
static inline struct br2684_vcc *BR2684_VCC(const struct atm_vcc *atmvcc)
{
- return (struct br2684_vcc *) (atmvcc->user_back);
+ return (struct br2684_vcc *)(atmvcc->user_back);
}
static inline struct br2684_vcc *list_entry_brvcc(const struct list_head *le)
@@ -145,23 +144,71 @@ static struct net_device *br2684_find_dev(const struct br2684_if_spec *s)
return NULL;
}
+static int atm_dev_event(struct notifier_block *this, unsigned long event,
+ void *arg)
+{
+ struct atm_dev *atm_dev = arg;
+ struct list_head *lh;
+ struct net_device *net_dev;
+ struct br2684_vcc *brvcc;
+ struct atm_vcc *atm_vcc;
+ unsigned long flags;
+
+ pr_debug("event=%ld dev=%p\n", event, atm_dev);
+
+ read_lock_irqsave(&devs_lock, flags);
+ list_for_each(lh, &br2684_devs) {
+ net_dev = list_entry_brdev(lh);
+
+ list_for_each_entry(brvcc, &BRPRIV(net_dev)->brvccs, brvccs) {
+ atm_vcc = brvcc->atmvcc;
+ if (atm_vcc && brvcc->atmvcc->dev == atm_dev) {
+
+ if (atm_vcc->dev->signal == ATM_PHY_SIG_LOST)
+ netif_carrier_off(net_dev);
+ else
+ netif_carrier_on(net_dev);
+
+ }
+ }
+ }
+ read_unlock_irqrestore(&devs_lock, flags);
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block atm_dev_notifier = {
+ .notifier_call = atm_dev_event,
+};
+
+/* chained vcc->pop function. Check if we should wake the netif_queue */
+static void br2684_pop(struct atm_vcc *vcc, struct sk_buff *skb)
+{
+ struct br2684_vcc *brvcc = BR2684_VCC(vcc);
+
+ pr_debug("(vcc %p ; net_dev %p )\n", vcc, brvcc->device);
+ brvcc->old_pop(vcc, skb);
+
+ /* If the queue space just went up from zero, wake */
+ if (atomic_inc_return(&brvcc->qspace) == 1)
+ netif_wake_queue(brvcc->device);
+}
+
/*
* Send a packet out a particular vcc. Not to useful right now, but paves
* the way for multiple vcc's per itf. Returns true if we can send,
* otherwise false
*/
-static int br2684_xmit_vcc(struct sk_buff *skb, struct br2684_dev *brdev,
- struct br2684_vcc *brvcc)
+static int br2684_xmit_vcc(struct sk_buff *skb, struct net_device *dev,
+ struct br2684_vcc *brvcc)
{
+ struct br2684_dev *brdev = BRPRIV(dev);
struct atm_vcc *atmvcc;
-#ifdef FASTER_VERSION
- if (brvcc->encaps == e_llc)
- memcpy(skb_push(skb, 8), llc_oui_pid_pad, 8);
- /* last 2 bytes of llc_oui_pid_pad are managed by header routines;
- yes, you got it: 8 + 2 = sizeof(llc_oui_pid_pad)
- */
-#else
- int minheadroom = (brvcc->encaps == e_llc) ? 10 : 2;
+ int minheadroom = (brvcc->encaps == e_llc) ?
+ ((brdev->payload == p_bridged) ?
+ sizeof(llc_oui_pid_pad) : sizeof(llc_oui_ipv4)) :
+ ((brdev->payload == p_bridged) ? BR2684_PAD_LEN : 0);
+
if (skb_headroom(skb) < minheadroom) {
struct sk_buff *skb2 = skb_realloc_headroom(skb, minheadroom);
brvcc->copies_needed++;
@@ -172,168 +219,137 @@ static int br2684_xmit_vcc(struct sk_buff *skb, struct br2684_dev *brdev,
}
skb = skb2;
}
- skb_push(skb, minheadroom);
- if (brvcc->encaps == e_llc)
- memcpy(skb->data, llc_oui_pid_pad, 10);
- else
- memset(skb->data, 0, 2);
-#endif /* FASTER_VERSION */
+
+ if (brvcc->encaps == e_llc) {
+ if (brdev->payload == p_bridged) {
+ skb_push(skb, sizeof(llc_oui_pid_pad));
+ skb_copy_to_linear_data(skb, llc_oui_pid_pad,
+ sizeof(llc_oui_pid_pad));
+ } else if (brdev->payload == p_routed) {
+ unsigned short prot = ntohs(skb->protocol);
+
+ skb_push(skb, sizeof(llc_oui_ipv4));
+ switch (prot) {
+ case ETH_P_IP:
+ skb_copy_to_linear_data(skb, llc_oui_ipv4,
+ sizeof(llc_oui_ipv4));
+ break;
+ case ETH_P_IPV6:
+ skb_copy_to_linear_data(skb, llc_oui_ipv6,
+ sizeof(llc_oui_ipv6));
+ break;
+ default:
+ dev_kfree_skb(skb);
+ return 0;
+ }
+ }
+ } else { /* e_vc */
+ if (brdev->payload == p_bridged) {
+ skb_push(skb, 2);
+ memset(skb->data, 0, 2);
+ }
+ }
skb_debug(skb);
ATM_SKB(skb)->vcc = atmvcc = brvcc->atmvcc;
- DPRINTK("atm_skb(%p)->vcc(%p)->dev(%p)\n", skb, atmvcc, atmvcc->dev);
- if (!atm_may_send(atmvcc, skb->truesize)) {
- /* we free this here for now, because we cannot know in a higher
- layer whether the skb point it supplied wasn't freed yet.
- now, it always is.
- */
- dev_kfree_skb(skb);
- return 0;
- }
- atomic_add(skb->truesize, &sk_atm(atmvcc)->sk_wmem_alloc);
- ATM_SKB(skb)->atm_options = atmvcc->atm_options;
- brdev->stats.tx_packets++;
- brdev->stats.tx_bytes += skb->len;
- atmvcc->send(atmvcc, skb);
- return 1;
+ pr_debug("atm_skb(%p)->vcc(%p)->dev(%p)\n", skb, atmvcc, atmvcc->dev);
+ atm_account_tx(atmvcc, skb);
+ dev->stats.tx_packets++;
+ dev->stats.tx_bytes += skb->len;
+
+ if (atomic_dec_return(&brvcc->qspace) < 1) {
+ /* No more please! */
+ netif_stop_queue(brvcc->device);
+ /* We might have raced with br2684_pop() */
+ if (unlikely(atomic_read(&brvcc->qspace) > 0))
+ netif_wake_queue(brvcc->device);
+ }
+
+ /* If this fails immediately, the skb will be freed and br2684_pop()
+ will wake the queue if appropriate. Just return an error so that
+ the stats are updated correctly */
+ return !atmvcc->send(atmvcc, skb);
+}
+
+static void br2684_release_cb(struct atm_vcc *atmvcc)
+{
+ struct br2684_vcc *brvcc = BR2684_VCC(atmvcc);
+
+ if (atomic_read(&brvcc->qspace) > 0)
+ netif_wake_queue(brvcc->device);
+
+ if (brvcc->old_release_cb)
+ brvcc->old_release_cb(atmvcc);
}
-static inline struct br2684_vcc *pick_outgoing_vcc(struct sk_buff *skb,
- struct br2684_dev *brdev)
+static inline struct br2684_vcc *pick_outgoing_vcc(const struct sk_buff *skb,
+ const struct br2684_dev *brdev)
{
- return list_empty(&brdev->brvccs) ? NULL :
- list_entry_brvcc(brdev->brvccs.next); /* 1 vcc/dev right now */
+ return list_empty(&brdev->brvccs) ? NULL : list_entry_brvcc(brdev->brvccs.next); /* 1 vcc/dev right now */
}
-static int br2684_start_xmit(struct sk_buff *skb, struct net_device *dev)
+static netdev_tx_t br2684_start_xmit(struct sk_buff *skb,
+ struct net_device *dev)
{
struct br2684_dev *brdev = BRPRIV(dev);
struct br2684_vcc *brvcc;
+ struct atm_vcc *atmvcc;
+ netdev_tx_t ret = NETDEV_TX_OK;
- DPRINTK("br2684_start_xmit, skb->dst=%p\n", skb->dst);
+ pr_debug("skb_dst(skb)=%p\n", skb_dst(skb));
read_lock(&devs_lock);
brvcc = pick_outgoing_vcc(skb, brdev);
if (brvcc == NULL) {
- DPRINTK("no vcc attached to dev %s\n", dev->name);
- brdev->stats.tx_errors++;
- brdev->stats.tx_carrier_errors++;
+ pr_debug("no vcc attached to dev %s\n", dev->name);
+ dev->stats.tx_errors++;
+ dev->stats.tx_carrier_errors++;
/* netif_stop_queue(dev); */
dev_kfree_skb(skb);
- read_unlock(&devs_lock);
- return 0;
- }
- if (!br2684_xmit_vcc(skb, brdev, brvcc)) {
- /*
- * We should probably use netif_*_queue() here, but that
- * involves added complication. We need to walk before
- * we can run
- */
- /* don't free here! this pointer might be no longer valid!
- dev_kfree_skb(skb);
- */
- brdev->stats.tx_errors++;
- brdev->stats.tx_fifo_errors++;
+ goto out_devs;
}
- read_unlock(&devs_lock);
- return 0;
-}
-
-static struct net_device_stats *br2684_get_stats(struct net_device *dev)
-{
- DPRINTK("br2684_get_stats\n");
- return &BRPRIV(dev)->stats;
-}
-
-#ifdef FASTER_VERSION
-/*
- * These mirror eth_header and eth_header_cache. They are not usually
- * exported for use in modules, so we grab them from net_device
- * after ether_setup() is done with it. Bit of a hack.
- */
-static int (*my_eth_header)(struct sk_buff *, struct net_device *,
- unsigned short, void *, void *, unsigned);
-static int (*my_eth_header_cache)(struct neighbour *, struct hh_cache *);
+ atmvcc = brvcc->atmvcc;
-static int
-br2684_header(struct sk_buff *skb, struct net_device *dev,
- unsigned short type, void *daddr, void *saddr, unsigned len)
-{
- u16 *pad_before_eth;
- int t = my_eth_header(skb, dev, type, daddr, saddr, len);
- if (t > 0) {
- pad_before_eth = (u16 *) skb_push(skb, 2);
- *pad_before_eth = 0;
- return dev->hard_header_len; /* or return 16; ? */
- } else
- return t;
-}
+ bh_lock_sock(sk_atm(atmvcc));
-static int
-br2684_header_cache(struct neighbour *neigh, struct hh_cache *hh)
-{
-/* hh_data is 16 bytes long. if encaps is ether-llc we need 24, so
-xmit will add the additional header part in that case */
- u16 *pad_before_eth = (u16 *)(hh->hh_data);
- int t = my_eth_header_cache(neigh, hh);
- DPRINTK("br2684_header_cache, neigh=%p, hh_cache=%p\n", neigh, hh);
- if (t < 0)
- return t;
- else {
- *pad_before_eth = 0;
- hh->hh_len = PADLEN + ETH_HLEN;
+ if (test_bit(ATM_VF_RELEASED, &atmvcc->flags) ||
+ test_bit(ATM_VF_CLOSE, &atmvcc->flags) ||
+ !test_bit(ATM_VF_READY, &atmvcc->flags)) {
+ dev->stats.tx_dropped++;
+ dev_kfree_skb(skb);
+ goto out;
}
- return 0;
-}
-
-/*
- * This is similar to eth_type_trans, which cannot be used because of
- * our dev->hard_header_len
- */
-static inline __be16 br_type_trans(struct sk_buff *skb, struct net_device *dev)
-{
- struct ethhdr *eth;
- unsigned char *rawp;
- eth = eth_hdr(skb);
- if (is_multicast_ether_addr(eth->h_dest)) {
- if (!compare_ether_addr(eth->h_dest, dev->broadcast))
- skb->pkt_type = PACKET_BROADCAST;
- else
- skb->pkt_type = PACKET_MULTICAST;
+ if (sock_owned_by_user(sk_atm(atmvcc))) {
+ netif_stop_queue(brvcc->device);
+ ret = NETDEV_TX_BUSY;
+ goto out;
}
- else if (compare_ether_addr(eth->h_dest, dev->dev_addr))
- skb->pkt_type = PACKET_OTHERHOST;
-
- if (ntohs(eth->h_proto) >= 1536)
- return eth->h_proto;
-
- rawp = skb->data;
-
- /*
- * This is a magic hack to spot IPX packets. Older Novell breaks
- * the protocol design and runs IPX over 802.3 without an 802.2 LLC
- * layer. We look for FFFF which isn't a used 802.2 SSAP/DSAP. This
- * won't work for fault tolerant netware but does for the rest.
- */
- if (*(unsigned short *) rawp == 0xFFFF)
- return htons(ETH_P_802_3);
-
- /*
- * Real 802.2 LLC
- */
- return htons(ETH_P_802_2);
+ if (!br2684_xmit_vcc(skb, dev, brvcc)) {
+ /*
+ * We should probably use netif_*_queue() here, but that
+ * involves added complication. We need to walk before
+ * we can run.
+ *
+ * Don't free here! this pointer might be no longer valid!
+ */
+ dev->stats.tx_errors++;
+ dev->stats.tx_fifo_errors++;
+ }
+ out:
+ bh_unlock_sock(sk_atm(atmvcc));
+ out_devs:
+ read_unlock(&devs_lock);
+ return ret;
}
-#endif /* FASTER_VERSION */
/*
* We remember when the MAC gets set, so we don't override it later with
* the ESI of the ATM card of the first VC
*/
-static int (*my_eth_mac_addr)(struct net_device *, void *);
static int br2684_mac_addr(struct net_device *dev, void *p)
{
- int err = my_eth_mac_addr(dev, p);
+ int err = eth_mac_addr(dev, p);
if (!err)
BRPRIV(dev)->mac_was_set = 1;
return err;
@@ -341,7 +357,7 @@ static int br2684_mac_addr(struct net_device *dev, void *p)
#ifdef CONFIG_ATM_BR2684_IPFILTER
/* this IOCTL is experimental. */
-static int br2684_setfilt(struct atm_vcc *atmvcc, void __user *arg)
+static int br2684_setfilt(struct atm_vcc *atmvcc, void __user * arg)
{
struct br2684_vcc *brvcc;
struct br2684_filter_set fs;
@@ -351,13 +367,13 @@ static int br2684_setfilt(struct atm_vcc *atmvcc, void __user *arg)
if (fs.ifspec.method != BR2684_FIND_BYNOTHING) {
/*
* This is really a per-vcc thing, but we can also search
- * by device
+ * by device.
*/
struct br2684_dev *brdev;
read_lock(&devs_lock);
brdev = BRPRIV(br2684_find_dev(&fs.ifspec));
if (brdev == NULL || list_empty(&brdev->brvccs) ||
- brdev->brvccs.next != brdev->brvccs.prev) /* >1 VCC */
+ brdev->brvccs.next != brdev->brvccs.prev) /* >1 VCC */
brvcc = NULL;
else
brvcc = list_entry_brvcc(brdev->brvccs.next);
@@ -375,15 +391,16 @@ static inline int
packet_fails_filter(__be16 type, struct br2684_vcc *brvcc, struct sk_buff *skb)
{
if (brvcc->filter.netmask == 0)
- return 0; /* no filter in place */
- if (type == __constant_htons(ETH_P_IP) &&
- (((struct iphdr *) (skb->data))->daddr & brvcc->filter.
+ return 0; /* no filter in place */
+ if (type == htons(ETH_P_IP) &&
+ (((struct iphdr *)(skb->data))->daddr & brvcc->filter.
netmask) == brvcc->filter.prefix)
return 0;
- if (type == __constant_htons(ETH_P_ARP))
+ if (type == htons(ETH_P_ARP))
return 0;
- /* TODO: we should probably filter ARPs too.. don't want to have
- * them returning values that don't make sense, or is that ok?
+ /*
+ * TODO: we should probably filter ARPs too.. don't want to have
+ * them returning values that don't make sense, or is that ok?
*/
return 1; /* drop */
}
@@ -391,14 +408,15 @@ packet_fails_filter(__be16 type, struct br2684_vcc *brvcc, struct sk_buff *skb)
static void br2684_close_vcc(struct br2684_vcc *brvcc)
{
- DPRINTK("removing VCC %p from dev %p\n", brvcc, brvcc->device);
+ pr_debug("removing VCC %p from dev %p\n", brvcc, brvcc->device);
write_lock_irq(&devs_lock);
list_del(&brvcc->brvccs);
write_unlock_irq(&devs_lock);
brvcc->atmvcc->user_back = NULL; /* what about vcc->recvq ??? */
+ brvcc->atmvcc->release_cb = brvcc->old_release_cb;
brvcc->old_push(brvcc->atmvcc, NULL); /* pass on the bad news */
+ module_put(brvcc->old_owner);
kfree(brvcc);
- module_put(THIS_MODULE);
}
/* when AAL5 PDU comes in: */
@@ -407,17 +425,16 @@ static void br2684_push(struct atm_vcc *atmvcc, struct sk_buff *skb)
struct br2684_vcc *brvcc = BR2684_VCC(atmvcc);
struct net_device *net_dev = brvcc->device;
struct br2684_dev *brdev = BRPRIV(net_dev);
- int plen = sizeof(llc_oui_pid_pad) + ETH_HLEN;
- DPRINTK("br2684_push\n");
+ pr_debug("\n");
if (unlikely(skb == NULL)) {
/* skb==NULL means VCC is being destroyed */
br2684_close_vcc(brvcc);
if (list_empty(&brdev->brvccs)) {
- read_lock(&devs_lock);
+ write_lock_irq(&devs_lock);
list_del(&brdev->br2684_devs);
- read_unlock(&devs_lock);
+ write_unlock_irq(&devs_lock);
unregister_netdev(net_dev);
free_netdev(net_dev);
}
@@ -426,96 +443,115 @@ static void br2684_push(struct atm_vcc *atmvcc, struct sk_buff *skb)
skb_debug(skb);
atm_return(atmvcc, skb->truesize);
- DPRINTK("skb from brdev %p\n", brdev);
+ pr_debug("skb from brdev %p\n", brdev);
if (brvcc->encaps == e_llc) {
- /* let us waste some time for checking the encapsulation.
- Note, that only 7 char is checked so frames with a valid FCS
- are also accepted (but FCS is not checked of course) */
- if (memcmp(skb->data, llc_oui_pid_pad, 7)) {
- brdev->stats.rx_errors++;
- dev_kfree_skb(skb);
- return;
- }
- /* Strip FCS if present */
if (skb->len > 7 && skb->data[7] == 0x01)
__skb_trim(skb, skb->len - 4);
- } else {
- plen = PADLEN + ETH_HLEN; /* pad, dstmac,srcmac, ethtype */
- /* first 2 chars should be 0 */
- if (*((u16 *) (skb->data)) != 0) {
- brdev->stats.rx_errors++;
- dev_kfree_skb(skb);
- return;
+
+ /* accept packets that have "ipv[46]" in the snap header */
+ if ((skb->len >= (sizeof(llc_oui_ipv4))) &&
+ (memcmp(skb->data, llc_oui_ipv4,
+ sizeof(llc_oui_ipv4) - BR2684_ETHERTYPE_LEN) == 0)) {
+ if (memcmp(skb->data + 6, ethertype_ipv6,
+ sizeof(ethertype_ipv6)) == 0)
+ skb->protocol = htons(ETH_P_IPV6);
+ else if (memcmp(skb->data + 6, ethertype_ipv4,
+ sizeof(ethertype_ipv4)) == 0)
+ skb->protocol = htons(ETH_P_IP);
+ else
+ goto error;
+ skb_pull(skb, sizeof(llc_oui_ipv4));
+ skb_reset_network_header(skb);
+ skb->pkt_type = PACKET_HOST;
+ /*
+ * Let us waste some time for checking the encapsulation.
+ * Note, that only 7 char is checked so frames with a valid FCS
+ * are also accepted (but FCS is not checked of course).
+ */
+ } else if ((skb->len >= sizeof(llc_oui_pid_pad)) &&
+ (memcmp(skb->data, llc_oui_pid_pad, 7) == 0)) {
+ skb_pull(skb, sizeof(llc_oui_pid_pad));
+ skb->protocol = eth_type_trans(skb, net_dev);
+ } else
+ goto error;
+
+ } else { /* e_vc */
+ if (brdev->payload == p_routed) {
+ struct iphdr *iph;
+
+ skb_reset_network_header(skb);
+ iph = ip_hdr(skb);
+ if (iph->version == 4)
+ skb->protocol = htons(ETH_P_IP);
+ else if (iph->version == 6)
+ skb->protocol = htons(ETH_P_IPV6);
+ else
+ goto error;
+ skb->pkt_type = PACKET_HOST;
+ } else { /* p_bridged */
+ /* first 2 chars should be 0 */
+ if (memcmp(skb->data, pad, BR2684_PAD_LEN) != 0)
+ goto error;
+ skb_pull(skb, BR2684_PAD_LEN);
+ skb->protocol = eth_type_trans(skb, net_dev);
}
}
- if (skb->len < plen) {
- brdev->stats.rx_errors++;
- dev_kfree_skb(skb); /* dev_ not needed? */
- return;
- }
-#ifdef FASTER_VERSION
- /* FIXME: tcpdump shows that pointer to mac header is 2 bytes earlier,
- than should be. What else should I set? */
- skb_pull(skb, plen);
- skb->mac.raw = ((char *) (skb->data)) - ETH_HLEN;
- skb->pkt_type = PACKET_HOST;
-#ifdef CONFIG_BR2684_FAST_TRANS
- skb->protocol = ((u16 *) skb->data)[-1];
-#else /* some protocols might require this: */
- skb->protocol = br_type_trans(skb, net_dev);
-#endif /* CONFIG_BR2684_FAST_TRANS */
-#else
- skb_pull(skb, plen - ETH_HLEN);
- skb->protocol = eth_type_trans(skb, net_dev);
-#endif /* FASTER_VERSION */
#ifdef CONFIG_ATM_BR2684_IPFILTER
- if (unlikely(packet_fails_filter(skb->protocol, brvcc, skb))) {
- brdev->stats.rx_dropped++;
- dev_kfree_skb(skb);
- return;
- }
+ if (unlikely(packet_fails_filter(skb->protocol, brvcc, skb)))
+ goto dropped;
#endif /* CONFIG_ATM_BR2684_IPFILTER */
skb->dev = net_dev;
ATM_SKB(skb)->vcc = atmvcc; /* needed ? */
- DPRINTK("received packet's protocol: %x\n", ntohs(skb->protocol));
+ pr_debug("received packet's protocol: %x\n", ntohs(skb->protocol));
skb_debug(skb);
- if (unlikely(!(net_dev->flags & IFF_UP))) {
- /* sigh, interface is down */
- brdev->stats.rx_dropped++;
- dev_kfree_skb(skb);
- return;
- }
- brdev->stats.rx_packets++;
- brdev->stats.rx_bytes += skb->len;
+ /* sigh, interface is down? */
+ if (unlikely(!(net_dev->flags & IFF_UP)))
+ goto dropped;
+ net_dev->stats.rx_packets++;
+ net_dev->stats.rx_bytes += skb->len;
memset(ATM_SKB(skb), 0, sizeof(struct atm_skb_data));
netif_rx(skb);
+ return;
+
+dropped:
+ net_dev->stats.rx_dropped++;
+ goto free_skb;
+error:
+ net_dev->stats.rx_errors++;
+free_skb:
+ dev_kfree_skb(skb);
}
-static int br2684_regvcc(struct atm_vcc *atmvcc, void __user *arg)
+/*
+ * Assign a vcc to a dev
+ * Note: we do not have explicit unassign, but look at _push()
+ */
+static int br2684_regvcc(struct atm_vcc *atmvcc, void __user * arg)
{
-/* assign a vcc to a dev
-Note: we do not have explicit unassign, but look at _push()
-*/
- int err;
struct br2684_vcc *brvcc;
- struct sk_buff_head copy;
- struct sk_buff *skb;
struct br2684_dev *brdev;
struct net_device *net_dev;
struct atm_backend_br2684 be;
+ int err;
if (copy_from_user(&be, arg, sizeof be))
return -EFAULT;
brvcc = kzalloc(sizeof(struct br2684_vcc), GFP_KERNEL);
if (!brvcc)
return -ENOMEM;
+ /*
+ * Allow two packets in the ATM queue. One actually being sent, and one
+ * for the ATM 'TX done' handler to send. It shouldn't take long to get
+ * the next one from the netdev queue, when we need it. More than that
+ * would be bufferbloat.
+ */
+ atomic_set(&brvcc->qspace, 2);
write_lock_irq(&devs_lock);
net_dev = br2684_find_dev(&be.ifspec);
if (net_dev == NULL) {
- printk(KERN_ERR
- "br2684: tried to attach to non-existant device\n");
+ pr_err("tried to attach to non-existent device\n");
err = -ENXIO;
goto error;
}
@@ -529,65 +565,98 @@ Note: we do not have explicit unassign, but look at _push()
err = -EEXIST;
goto error;
}
- if (be.fcs_in != BR2684_FCSIN_NO || be.fcs_out != BR2684_FCSOUT_NO ||
- be.fcs_auto || be.has_vpiid || be.send_padding || (be.encaps !=
- BR2684_ENCAPS_VC && be.encaps != BR2684_ENCAPS_LLC) ||
+ if (be.fcs_in != BR2684_FCSIN_NO ||
+ be.fcs_out != BR2684_FCSOUT_NO ||
+ be.fcs_auto || be.has_vpiid || be.send_padding ||
+ (be.encaps != BR2684_ENCAPS_VC &&
+ be.encaps != BR2684_ENCAPS_LLC) ||
be.min_size != 0) {
err = -EINVAL;
goto error;
}
- DPRINTK("br2684_regvcc vcc=%p, encaps=%d, brvcc=%p\n", atmvcc, be.encaps,
- brvcc);
+ pr_debug("vcc=%p, encaps=%d, brvcc=%p\n", atmvcc, be.encaps, brvcc);
if (list_empty(&brdev->brvccs) && !brdev->mac_was_set) {
unsigned char *esi = atmvcc->dev->esi;
+ const u8 one = 1;
+
if (esi[0] | esi[1] | esi[2] | esi[3] | esi[4] | esi[5])
- memcpy(net_dev->dev_addr, esi, net_dev->addr_len);
+ dev_addr_set(net_dev, esi);
else
- net_dev->dev_addr[2] = 1;
+ dev_addr_mod(net_dev, 2, &one, 1);
}
list_add(&brvcc->brvccs, &brdev->brvccs);
write_unlock_irq(&devs_lock);
brvcc->device = net_dev;
brvcc->atmvcc = atmvcc;
atmvcc->user_back = brvcc;
- brvcc->encaps = (enum br2684_encaps) be.encaps;
+ brvcc->encaps = (enum br2684_encaps)be.encaps;
brvcc->old_push = atmvcc->push;
+ brvcc->old_pop = atmvcc->pop;
+ brvcc->old_release_cb = atmvcc->release_cb;
+ brvcc->old_owner = atmvcc->owner;
barrier();
atmvcc->push = br2684_push;
- skb_queue_head_init(&copy);
- skb_migrate(&sk_atm(atmvcc)->sk_receive_queue, &copy);
- while ((skb = skb_dequeue(&copy)) != NULL) {
- BRPRIV(skb->dev)->stats.rx_bytes -= skb->len;
- BRPRIV(skb->dev)->stats.rx_packets--;
- br2684_push(atmvcc, skb);
- }
+ atmvcc->pop = br2684_pop;
+ atmvcc->release_cb = br2684_release_cb;
+ atmvcc->owner = THIS_MODULE;
+
+ /* initialize netdev carrier state */
+ if (atmvcc->dev->signal == ATM_PHY_SIG_LOST)
+ netif_carrier_off(net_dev);
+ else
+ netif_carrier_on(net_dev);
+
__module_get(THIS_MODULE);
+
+ /* re-process everything received between connection setup and
+ backend setup */
+ vcc_process_recv_queue(atmvcc);
return 0;
- error:
+
+error:
write_unlock_irq(&devs_lock);
kfree(brvcc);
return err;
}
+static const struct net_device_ops br2684_netdev_ops = {
+ .ndo_start_xmit = br2684_start_xmit,
+ .ndo_set_mac_address = br2684_mac_addr,
+ .ndo_validate_addr = eth_validate_addr,
+};
+
+static const struct net_device_ops br2684_netdev_ops_routed = {
+ .ndo_start_xmit = br2684_start_xmit,
+ .ndo_set_mac_address = br2684_mac_addr,
+};
+
static void br2684_setup(struct net_device *netdev)
{
struct br2684_dev *brdev = BRPRIV(netdev);
ether_setup(netdev);
+ netdev->hard_header_len += sizeof(llc_oui_pid_pad); /* worst case */
brdev->net_dev = netdev;
-#ifdef FASTER_VERSION
- my_eth_header = netdev->hard_header;
- netdev->hard_header = br2684_header;
- my_eth_header_cache = netdev->hard_header_cache;
- netdev->hard_header_cache = br2684_header_cache;
- netdev->hard_header_len = sizeof(llc_oui_pid_pad) + ETH_HLEN; /* 10 + 14 */
-#endif
- my_eth_mac_addr = netdev->set_mac_address;
- netdev->set_mac_address = br2684_mac_addr;
- netdev->hard_start_xmit = br2684_start_xmit;
- netdev->get_stats = br2684_get_stats;
+ netdev->netdev_ops = &br2684_netdev_ops;
+
+ INIT_LIST_HEAD(&brdev->brvccs);
+}
+static void br2684_setup_routed(struct net_device *netdev)
+{
+ struct br2684_dev *brdev = BRPRIV(netdev);
+
+ brdev->net_dev = netdev;
+ netdev->hard_header_len = sizeof(llc_oui_ipv4); /* worst case */
+ netdev->netdev_ops = &br2684_netdev_ops_routed;
+ netdev->addr_len = 0;
+ netdev->mtu = ETH_DATA_LEN;
+ netdev->min_mtu = 0;
+ netdev->max_mtu = ETH_MAX_MTU;
+ netdev->type = ARPHRD_PPP;
+ netdev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST;
+ netdev->tx_queue_len = 100;
INIT_LIST_HEAD(&brdev->brvccs);
}
@@ -597,36 +666,50 @@ static int br2684_create(void __user *arg)
struct net_device *netdev;
struct br2684_dev *brdev;
struct atm_newif_br2684 ni;
+ enum br2684_payload payload;
- DPRINTK("br2684_create\n");
+ pr_debug("\n");
- if (copy_from_user(&ni, arg, sizeof ni)) {
+ if (copy_from_user(&ni, arg, sizeof ni))
return -EFAULT;
- }
- if (ni.media != BR2684_MEDIA_ETHERNET || ni.mtu != 1500) {
+
+ if (ni.media & BR2684_FLAG_ROUTED)
+ payload = p_routed;
+ else
+ payload = p_bridged;
+ ni.media &= 0xffff; /* strip flags */
+
+ if (ni.media != BR2684_MEDIA_ETHERNET || ni.mtu != 1500)
return -EINVAL;
- }
netdev = alloc_netdev(sizeof(struct br2684_dev),
ni.ifname[0] ? ni.ifname : "nas%d",
- br2684_setup);
+ NET_NAME_UNKNOWN,
+ (payload == p_routed) ? br2684_setup_routed : br2684_setup);
if (!netdev)
return -ENOMEM;
brdev = BRPRIV(netdev);
- DPRINTK("registered netdev %s\n", netdev->name);
+ pr_debug("registered netdev %s\n", netdev->name);
/* open, stop, do_ioctl ? */
err = register_netdev(netdev);
if (err < 0) {
- printk(KERN_ERR "br2684_create: register_netdev failed\n");
+ pr_err("register_netdev failed\n");
free_netdev(netdev);
return err;
}
write_lock_irq(&devs_lock);
- brdev->number = list_empty(&br2684_devs) ? 1 :
- BRPRIV(list_entry_brdev(br2684_devs.prev))->number + 1;
+
+ brdev->payload = payload;
+
+ if (list_empty(&br2684_devs)) {
+ /* 1st br2684 device */
+ brdev->number = 1;
+ } else
+ brdev->number = BRPRIV(list_entry_brdev(br2684_devs.prev))->number + 1;
+
list_add_tail(&brdev->br2684_devs, &br2684_devs);
write_unlock_irq(&devs_lock);
return 0;
@@ -637,16 +720,16 @@ static int br2684_create(void __user *arg)
* -ENOIOCTLCMD for any unrecognized ioctl
*/
static int br2684_ioctl(struct socket *sock, unsigned int cmd,
- unsigned long arg)
+ unsigned long arg)
{
struct atm_vcc *atmvcc = ATM_SD(sock);
void __user *argp = (void __user *)arg;
+ atm_backend_t b;
int err;
- switch(cmd) {
+ switch (cmd) {
case ATM_SETBACKEND:
- case ATM_NEWBACKENDIF: {
- atm_backend_t b;
+ case ATM_NEWBACKENDIF:
err = get_user(b, (atm_backend_t __user *) argp);
if (err)
return -EFAULT;
@@ -654,9 +737,11 @@ static int br2684_ioctl(struct socket *sock, unsigned int cmd,
return -ENOIOCTLCMD;
if (!capable(CAP_NET_ADMIN))
return -EPERM;
- if (cmd == ATM_SETBACKEND)
+ if (cmd == ATM_SETBACKEND) {
+ if (sock->state != SS_CONNECTED)
+ return -EINVAL;
return br2684_regvcc(atmvcc, argp);
- else
+ } else {
return br2684_create(argp);
}
#ifdef CONFIG_ATM_BR2684_IPFILTER
@@ -666,6 +751,7 @@ static int br2684_ioctl(struct socket *sock, unsigned int cmd,
if (!capable(CAP_NET_ADMIN))
return -EPERM;
err = br2684_setfilt(atmvcc, argp);
+
return err;
#endif /* CONFIG_ATM_BR2684_IPFILTER */
}
@@ -673,119 +759,80 @@ static int br2684_ioctl(struct socket *sock, unsigned int cmd,
}
static struct atm_ioctl br2684_ioctl_ops = {
- .owner = THIS_MODULE,
- .ioctl = br2684_ioctl,
+ .owner = THIS_MODULE,
+ .ioctl = br2684_ioctl,
};
-
#ifdef CONFIG_PROC_FS
-static void *br2684_seq_start(struct seq_file *seq, loff_t *pos)
+static void *br2684_seq_start(struct seq_file *seq, loff_t * pos)
+ __acquires(devs_lock)
{
- loff_t offs = 0;
- struct br2684_dev *brd;
-
read_lock(&devs_lock);
-
- list_for_each_entry(brd, &br2684_devs, br2684_devs) {
- if (offs == *pos)
- return brd;
- ++offs;
- }
- return NULL;
+ return seq_list_start(&br2684_devs, *pos);
}
-static void *br2684_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+static void *br2684_seq_next(struct seq_file *seq, void *v, loff_t * pos)
{
- struct br2684_dev *brd = v;
-
- ++*pos;
-
- brd = list_entry(brd->br2684_devs.next,
- struct br2684_dev, br2684_devs);
- return (&brd->br2684_devs != &br2684_devs) ? brd : NULL;
+ return seq_list_next(v, &br2684_devs, pos);
}
static void br2684_seq_stop(struct seq_file *seq, void *v)
+ __releases(devs_lock)
{
read_unlock(&devs_lock);
}
static int br2684_seq_show(struct seq_file *seq, void *v)
{
- const struct br2684_dev *brdev = v;
+ const struct br2684_dev *brdev = list_entry(v, struct br2684_dev,
+ br2684_devs);
const struct net_device *net_dev = brdev->net_dev;
const struct br2684_vcc *brvcc;
- seq_printf(seq, "dev %.16s: num=%d, mac=%02X:%02X:"
- "%02X:%02X:%02X:%02X (%s)\n", net_dev->name,
- brdev->number,
- net_dev->dev_addr[0],
- net_dev->dev_addr[1],
- net_dev->dev_addr[2],
- net_dev->dev_addr[3],
- net_dev->dev_addr[4],
- net_dev->dev_addr[5],
- brdev->mac_was_set ? "set" : "auto");
+ seq_printf(seq, "dev %.16s: num=%d, mac=%pM (%s)\n",
+ net_dev->name,
+ brdev->number,
+ net_dev->dev_addr,
+ brdev->mac_was_set ? "set" : "auto");
list_for_each_entry(brvcc, &brdev->brvccs, brvccs) {
- seq_printf(seq, " vcc %d.%d.%d: encaps=%s"
-#ifndef FASTER_VERSION
- ", failed copies %u/%u"
-#endif /* FASTER_VERSION */
- "\n", brvcc->atmvcc->dev->number,
- brvcc->atmvcc->vpi, brvcc->atmvcc->vci,
- (brvcc->encaps == e_llc) ? "LLC" : "VC"
-#ifndef FASTER_VERSION
- , brvcc->copies_failed
- , brvcc->copies_needed
-#endif /* FASTER_VERSION */
- );
+ seq_printf(seq, " vcc %d.%d.%d: encaps=%s payload=%s"
+ ", failed copies %u/%u"
+ "\n", brvcc->atmvcc->dev->number,
+ brvcc->atmvcc->vpi, brvcc->atmvcc->vci,
+ (brvcc->encaps == e_llc) ? "LLC" : "VC",
+ (brdev->payload == p_bridged) ? "bridged" : "routed",
+ brvcc->copies_failed, brvcc->copies_needed);
#ifdef CONFIG_ATM_BR2684_IPFILTER
-#define b1(var, byte) ((u8 *) &brvcc->filter.var)[byte]
-#define bs(var) b1(var, 0), b1(var, 1), b1(var, 2), b1(var, 3)
- if (brvcc->filter.netmask != 0)
- seq_printf(seq, " filter=%d.%d.%d.%d/"
- "%d.%d.%d.%d\n",
- bs(prefix), bs(netmask));
-#undef bs
-#undef b1
+ if (brvcc->filter.netmask != 0)
+ seq_printf(seq, " filter=%pI4/%pI4\n",
+ &brvcc->filter.prefix,
+ &brvcc->filter.netmask);
#endif /* CONFIG_ATM_BR2684_IPFILTER */
}
return 0;
}
-static struct seq_operations br2684_seq_ops = {
+static const struct seq_operations br2684_seq_ops = {
.start = br2684_seq_start,
- .next = br2684_seq_next,
- .stop = br2684_seq_stop,
- .show = br2684_seq_show,
-};
-
-static int br2684_proc_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &br2684_seq_ops);
-}
-
-static struct file_operations br2684_proc_ops = {
- .owner = THIS_MODULE,
- .open = br2684_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
+ .next = br2684_seq_next,
+ .stop = br2684_seq_stop,
+ .show = br2684_seq_show,
};
extern struct proc_dir_entry *atm_proc_root; /* from proc.c */
-#endif
+#endif /* CONFIG_PROC_FS */
static int __init br2684_init(void)
{
#ifdef CONFIG_PROC_FS
struct proc_dir_entry *p;
- if ((p = create_proc_entry("br2684", 0, atm_proc_root)) == NULL)
+ p = proc_create_seq("br2684", 0, atm_proc_root, &br2684_seq_ops);
+ if (p == NULL)
return -ENOMEM;
- p->proc_fops = &br2684_proc_ops;
#endif
register_atm_ioctl(&br2684_ioctl_ops);
+ register_atmdevice_notifier(&atm_dev_notifier);
return 0;
}
@@ -800,6 +847,9 @@ static void __exit br2684_exit(void)
remove_proc_entry("br2684", atm_proc_root);
#endif
+
+ unregister_atmdevice_notifier(&atm_dev_notifier);
+
while (!list_empty(&br2684_devs)) {
net_dev = list_entry_brdev(br2684_devs.next);
brdev = BRPRIV(net_dev);
diff --git a/net/atm/clip.c b/net/atm/clip.c
index 1c416934b7c1..8f152e5fa659 100644
--- a/net/atm/clip.c
+++ b/net/atm/clip.c
@@ -1,7 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* net/atm/clip.c - RFC1577 Classical IP over ATM */
/* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__
+
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/kernel.h> /* for UINT_MAX */
@@ -28,60 +31,63 @@
#include <linux/seq_file.h>
#include <linux/rcupdate.h>
#include <linux/jhash.h>
+#include <linux/slab.h>
#include <net/route.h> /* for struct rtable and routing */
#include <net/icmp.h> /* icmp_send */
-#include <asm/param.h> /* for HZ */
+#include <net/arp.h>
+#include <linux/param.h> /* for HZ */
+#include <linux/uaccess.h>
#include <asm/byteorder.h> /* for htons etc. */
-#include <asm/system.h> /* save/restore_flags */
-#include <asm/uaccess.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
#include "common.h"
#include "resources.h"
-#include "ipcommon.h"
#include <net/atmclip.h>
-
-#if 0
-#define DPRINTK(format,args...) printk(format,##args)
-#else
-#define DPRINTK(format,args...)
-#endif
-
-
static struct net_device *clip_devs;
-static struct atm_vcc *atmarpd;
-static struct neigh_table clip_tbl;
+static struct atm_vcc __rcu *atmarpd;
+static DEFINE_MUTEX(atmarpd_lock);
static struct timer_list idle_timer;
+static const struct neigh_ops clip_neigh_ops;
static int to_atmarpd(enum atmarp_ctrl_type type, int itf, __be32 ip)
{
struct sock *sk;
struct atmarp_ctrl *ctrl;
+ struct atm_vcc *vcc;
struct sk_buff *skb;
+ int err = 0;
- DPRINTK("to_atmarpd(%d)\n", type);
- if (!atmarpd)
- return -EUNATCH;
- skb = alloc_skb(sizeof(struct atmarp_ctrl),GFP_ATOMIC);
- if (!skb)
- return -ENOMEM;
- ctrl = (struct atmarp_ctrl *) skb_put(skb,sizeof(struct atmarp_ctrl));
+ pr_debug("(%d)\n", type);
+
+ rcu_read_lock();
+ vcc = rcu_dereference(atmarpd);
+ if (!vcc) {
+ err = -EUNATCH;
+ goto unlock;
+ }
+ skb = alloc_skb(sizeof(struct atmarp_ctrl), GFP_ATOMIC);
+ if (!skb) {
+ err = -ENOMEM;
+ goto unlock;
+ }
+ ctrl = skb_put(skb, sizeof(struct atmarp_ctrl));
ctrl->type = type;
ctrl->itf_num = itf;
ctrl->ip = ip;
- atm_force_charge(atmarpd, skb->truesize);
+ atm_force_charge(vcc, skb->truesize);
- sk = sk_atm(atmarpd);
+ sk = sk_atm(vcc);
skb_queue_tail(&sk->sk_receive_queue, skb);
- sk->sk_data_ready(sk, skb->len);
- return 0;
+ sk->sk_data_ready(sk);
+unlock:
+ rcu_read_unlock();
+ return err;
}
static void link_vcc(struct clip_vcc *clip_vcc, struct atmarp_entry *entry)
{
- DPRINTK("link_vcc %p to entry %p (neigh %p)\n", clip_vcc, entry,
- entry->neigh);
+ pr_debug("%p to entry %p (neigh %p)\n", clip_vcc, entry, entry->neigh);
clip_vcc->entry = entry;
clip_vcc->xoff = 0; /* @@@ may overrun buffer by one packet */
clip_vcc->next = entry->vccs;
@@ -95,7 +101,7 @@ static void unlink_clip_vcc(struct clip_vcc *clip_vcc)
struct clip_vcc **walk;
if (!entry) {
- printk(KERN_CRIT "!clip_vcc->entry (clip_vcc %p)\n", clip_vcc);
+ pr_err("!clip_vcc->entry (clip_vcc %p)\n", clip_vcc);
return;
}
netif_tx_lock_bh(entry->neigh->dev); /* block clip_start_xmit() */
@@ -113,30 +119,30 @@ static void unlink_clip_vcc(struct clip_vcc *clip_vcc)
entry->expires = jiffies - 1;
/* force resolution or expiration */
error = neigh_update(entry->neigh, NULL, NUD_NONE,
- NEIGH_UPDATE_F_ADMIN);
+ NEIGH_UPDATE_F_ADMIN, 0);
if (error)
- printk(KERN_CRIT "unlink_clip_vcc: "
- "neigh_update failed with %d\n", error);
+ pr_err("neigh_update failed with %d\n", error);
goto out;
}
- printk(KERN_CRIT "ATMARP: unlink_clip_vcc failed (entry %p, vcc "
- "0x%p)\n", entry, clip_vcc);
- out:
+ pr_err("ATMARP: failed (entry %p, vcc 0x%p)\n", entry, clip_vcc);
+out:
netif_tx_unlock_bh(entry->neigh->dev);
}
/* The neighbour entry n->lock is held. */
static int neigh_check_cb(struct neighbour *n)
{
- struct atmarp_entry *entry = NEIGH2ENTRY(n);
+ struct atmarp_entry *entry = neighbour_priv(n);
struct clip_vcc *cv;
+ if (n->ops != &clip_neigh_ops)
+ return 0;
for (cv = entry->vccs; cv; cv = cv->next) {
unsigned long exp = cv->last_use + cv->idle_timeout;
if (cv->idle_timeout && time_after(jiffies, exp)) {
- DPRINTK("releasing vcc %p->%p of entry %p\n",
- cv, cv->vcc, entry);
+ pr_debug("releasing vcc %p->%p of entry %p\n",
+ cv, cv->vcc, entry);
vcc_release_async(cv->vcc, -ETIMEDOUT);
}
}
@@ -144,11 +150,11 @@ static int neigh_check_cb(struct neighbour *n)
if (entry->vccs || time_before(jiffies, entry->expires))
return 0;
- if (atomic_read(&n->refcnt) > 1) {
+ if (refcount_read(&n->refcnt) > 1) {
struct sk_buff *skb;
- DPRINTK("destruction postponed with ref %d\n",
- atomic_read(&n->refcnt));
+ pr_debug("destruction postponed with ref %d\n",
+ refcount_read(&n->refcnt));
while ((skb = skb_dequeue(&n->arp_queue)) != NULL)
dev_kfree_skb(skb);
@@ -156,30 +162,30 @@ static int neigh_check_cb(struct neighbour *n)
return 0;
}
- DPRINTK("expired neigh %p\n", n);
+ pr_debug("expired neigh %p\n", n);
return 1;
}
-static void idle_timer_check(unsigned long dummy)
+static void idle_timer_check(struct timer_list *unused)
{
- write_lock(&clip_tbl.lock);
- __neigh_for_each_release(&clip_tbl, neigh_check_cb);
+ spin_lock(&arp_tbl.lock);
+ __neigh_for_each_release(&arp_tbl, neigh_check_cb);
mod_timer(&idle_timer, jiffies + CLIP_CHECK_INTERVAL * HZ);
- write_unlock(&clip_tbl.lock);
+ spin_unlock(&arp_tbl.lock);
}
static int clip_arp_rcv(struct sk_buff *skb)
{
struct atm_vcc *vcc;
- DPRINTK("clip_arp_rcv\n");
+ pr_debug("\n");
vcc = ATM_SKB(skb)->vcc;
if (!vcc || !atm_charge(vcc, skb->truesize)) {
dev_kfree_skb_any(skb);
return 0;
}
- DPRINTK("pushing to %p\n", vcc);
- DPRINTK("using %p\n", CLIP_VCC(vcc)->old_push);
+ pr_debug("pushing to %p\n", vcc);
+ pr_debug("using %p\n", CLIP_VCC(vcc)->old_push);
CLIP_VCC(vcc)->old_push(vcc, skb);
return 0;
}
@@ -197,9 +203,10 @@ static void clip_push(struct atm_vcc *vcc, struct sk_buff *skb)
{
struct clip_vcc *clip_vcc = CLIP_VCC(vcc);
- DPRINTK("clip push\n");
+ pr_debug("\n");
+
if (!skb) {
- DPRINTK("removing VCC %p\n", clip_vcc);
+ pr_debug("removing VCC %p\n", clip_vcc);
if (clip_vcc->entry)
unlink_clip_vcc(clip_vcc);
clip_vcc->old_push(vcc, NULL); /* pass on the bad news */
@@ -207,6 +214,11 @@ static void clip_push(struct atm_vcc *vcc, struct sk_buff *skb)
return;
}
atm_return(vcc, skb->truesize);
+ if (!clip_devs) {
+ kfree_skb(skb);
+ return;
+ }
+
skb->dev = clip_vcc->entry ? clip_vcc->entry->neigh->dev : clip_devs;
/* clip_vcc->entry == NULL if we don't have an IP address yet */
if (!skb->dev) {
@@ -214,24 +226,24 @@ static void clip_push(struct atm_vcc *vcc, struct sk_buff *skb)
return;
}
ATM_SKB(skb)->vcc = vcc;
- skb->mac.raw = skb->data;
- if (!clip_vcc->encap
- || skb->len < RFC1483LLC_LEN
- || memcmp(skb->data, llc_oui, sizeof (llc_oui)))
+ skb_reset_mac_header(skb);
+ if (!clip_vcc->encap ||
+ skb->len < RFC1483LLC_LEN ||
+ memcmp(skb->data, llc_oui, sizeof(llc_oui)))
skb->protocol = htons(ETH_P_IP);
else {
- skb->protocol = ((__be16 *) skb->data)[3];
+ skb->protocol = ((__be16 *)skb->data)[3];
skb_pull(skb, RFC1483LLC_LEN);
if (skb->protocol == htons(ETH_P_ARP)) {
- PRIV(skb->dev)->stats.rx_packets++;
- PRIV(skb->dev)->stats.rx_bytes += skb->len;
+ skb->dev->stats.rx_packets++;
+ skb->dev->stats.rx_bytes += skb->len;
clip_arp_rcv(skb);
return;
}
}
clip_vcc->last_use = jiffies;
- PRIV(skb->dev)->stats.rx_packets++;
- PRIV(skb->dev)->stats.rx_bytes += skb->len;
+ skb->dev->stats.rx_packets++;
+ skb->dev->stats.rx_bytes += skb->len;
memset(ATM_SKB(skb), 0, sizeof(struct atm_skb_data));
netif_rx(skb);
}
@@ -248,7 +260,7 @@ static void clip_pop(struct atm_vcc *vcc, struct sk_buff *skb)
int old;
unsigned long flags;
- DPRINTK("clip_pop(vcc %p)\n", vcc);
+ pr_debug("(vcc %p)\n", vcc);
clip_vcc->old_pop(vcc, skb);
/* skb->dev == NULL in outbound ARP packets */
if (!dev)
@@ -262,18 +274,12 @@ static void clip_pop(struct atm_vcc *vcc, struct sk_buff *skb)
spin_unlock_irqrestore(&PRIV(dev)->xoff_lock, flags);
}
-static void clip_neigh_destroy(struct neighbour *neigh)
-{
- DPRINTK("clip_neigh_destroy (neigh %p)\n", neigh);
- if (NEIGH2ENTRY(neigh)->vccs)
- printk(KERN_CRIT "clip_neigh_destroy: vccs != NULL !!!\n");
- NEIGH2ENTRY(neigh)->vccs = (void *) NEIGHBOR_DEAD;
-}
-
static void clip_neigh_solicit(struct neighbour *neigh, struct sk_buff *skb)
{
- DPRINTK("clip_neigh_solicit (neigh %p, skb %p)\n", neigh, skb);
- to_atmarpd(act_need, PRIV(neigh->dev)->number, NEIGH2ENTRY(neigh)->ip);
+ __be32 *ip = (__be32 *) neigh->primary_key;
+
+ pr_debug("(neigh %p, skb %p)\n", neigh, skb);
+ to_atmarpd(act_need, PRIV(neigh->dev)->number, *ip);
}
static void clip_neigh_error(struct neighbour *neigh, struct sk_buff *skb)
@@ -284,85 +290,34 @@ static void clip_neigh_error(struct neighbour *neigh, struct sk_buff *skb)
kfree_skb(skb);
}
-static struct neigh_ops clip_neigh_ops = {
+static const struct neigh_ops clip_neigh_ops = {
.family = AF_INET,
.solicit = clip_neigh_solicit,
.error_report = clip_neigh_error,
- .output = dev_queue_xmit,
- .connected_output = dev_queue_xmit,
- .hh_output = dev_queue_xmit,
- .queue_xmit = dev_queue_xmit,
+ .output = neigh_direct_output,
+ .connected_output = neigh_direct_output,
};
-static int clip_constructor(struct neighbour *neigh)
+static int clip_constructor(struct net_device *dev, struct neighbour *neigh)
{
- struct atmarp_entry *entry = NEIGH2ENTRY(neigh);
- struct net_device *dev = neigh->dev;
- struct in_device *in_dev;
- struct neigh_parms *parms;
+ struct atmarp_entry *entry = neighbour_priv(neigh);
- DPRINTK("clip_constructor (neigh %p, entry %p)\n", neigh, entry);
- neigh->type = inet_addr_type(entry->ip);
- if (neigh->type != RTN_UNICAST)
+ if (neigh->tbl->family != AF_INET)
return -EINVAL;
- rcu_read_lock();
- in_dev = __in_dev_get_rcu(dev);
- if (!in_dev) {
- rcu_read_unlock();
+ if (neigh->type != RTN_UNICAST)
return -EINVAL;
- }
-
- parms = in_dev->arp_parms;
- __neigh_parms_put(neigh->parms);
- neigh->parms = neigh_parms_clone(parms);
- rcu_read_unlock();
+ neigh->nud_state = NUD_NONE;
neigh->ops = &clip_neigh_ops;
- neigh->output = neigh->nud_state & NUD_VALID ?
- neigh->ops->connected_output : neigh->ops->output;
+ neigh->output = neigh->ops->output;
entry->neigh = neigh;
entry->vccs = NULL;
entry->expires = jiffies - 1;
- return 0;
-}
-static u32 clip_hash(const void *pkey, const struct net_device *dev)
-{
- return jhash_2words(*(u32 *) pkey, dev->ifindex, clip_tbl.hash_rnd);
+ return 0;
}
-static struct neigh_table clip_tbl = {
- .family = AF_INET,
- .entry_size = sizeof(struct neighbour)+sizeof(struct atmarp_entry),
- .key_len = 4,
- .hash = clip_hash,
- .constructor = clip_constructor,
- .id = "clip_arp_cache",
-
- /* parameters are copied from ARP ... */
- .parms = {
- .tbl = &clip_tbl,
- .neigh_destructor = clip_neigh_destroy,
- .base_reachable_time = 30 * HZ,
- .retrans_time = 1 * HZ,
- .gc_staletime = 60 * HZ,
- .reachable_time = 30 * HZ,
- .delay_probe_time = 5 * HZ,
- .queue_len = 3,
- .ucast_probes = 3,
- .mcast_probes = 3,
- .anycast_delay = 1 * HZ,
- .proxy_delay = (8 * HZ) / 10,
- .proxy_qlen = 64,
- .locktime = 1 * HZ,
- },
- .gc_interval = 30 * HZ,
- .gc_thresh1 = 128,
- .gc_thresh2 = 512,
- .gc_thresh3 = 1024,
-};
-
/* @@@ copy bh locking from arp.c -- need to bh-enable atm code before */
/*
@@ -374,57 +329,63 @@ static struct neigh_table clip_tbl = {
static int clip_encap(struct atm_vcc *vcc, int mode)
{
+ if (!CLIP_VCC(vcc))
+ return -EBADFD;
+
CLIP_VCC(vcc)->encap = mode;
return 0;
}
-static int clip_start_xmit(struct sk_buff *skb, struct net_device *dev)
+static netdev_tx_t clip_start_xmit(struct sk_buff *skb,
+ struct net_device *dev)
{
struct clip_priv *clip_priv = PRIV(dev);
+ struct dst_entry *dst = skb_dst(skb);
struct atmarp_entry *entry;
+ struct neighbour *n;
struct atm_vcc *vcc;
+ struct rtable *rt;
+ __be32 *daddr;
int old;
unsigned long flags;
- DPRINTK("clip_start_xmit (skb %p)\n", skb);
- if (!skb->dst) {
- printk(KERN_ERR "clip_start_xmit: skb->dst == NULL\n");
+ pr_debug("(skb %p)\n", skb);
+ if (!dst) {
+ pr_err("skb_dst(skb) == NULL\n");
dev_kfree_skb(skb);
- clip_priv->stats.tx_dropped++;
- return 0;
+ dev->stats.tx_dropped++;
+ return NETDEV_TX_OK;
}
- if (!skb->dst->neighbour) {
-#if 0
- skb->dst->neighbour = clip_find_neighbour(skb->dst, 1);
- if (!skb->dst->neighbour) {
- dev_kfree_skb(skb); /* lost that one */
- clip_priv->stats.tx_dropped++;
- return 0;
- }
-#endif
- printk(KERN_ERR "clip_start_xmit: NO NEIGHBOUR !\n");
+ rt = dst_rtable(dst);
+ if (rt->rt_gw_family == AF_INET)
+ daddr = &rt->rt_gw4;
+ else
+ daddr = &ip_hdr(skb)->daddr;
+ n = dst_neigh_lookup(dst, daddr);
+ if (!n) {
+ pr_err("NO NEIGHBOUR !\n");
dev_kfree_skb(skb);
- clip_priv->stats.tx_dropped++;
- return 0;
+ dev->stats.tx_dropped++;
+ return NETDEV_TX_OK;
}
- entry = NEIGH2ENTRY(skb->dst->neighbour);
+ entry = neighbour_priv(n);
if (!entry->vccs) {
if (time_after(jiffies, entry->expires)) {
/* should be resolved */
entry->expires = jiffies + ATMARP_RETRY_DELAY * HZ;
- to_atmarpd(act_need, PRIV(dev)->number, entry->ip);
+ to_atmarpd(act_need, PRIV(dev)->number, *((__be32 *)n->primary_key));
}
if (entry->neigh->arp_queue.qlen < ATMARP_MAX_UNRES_PACKETS)
skb_queue_tail(&entry->neigh->arp_queue, skb);
else {
dev_kfree_skb(skb);
- clip_priv->stats.tx_dropped++;
+ dev->stats.tx_dropped++;
}
- return 0;
+ goto out_release_neigh;
}
- DPRINTK("neigh %p, vccs %p\n", entry, entry->vccs);
+ pr_debug("neigh %p, vccs %p\n", entry, entry->vccs);
ATM_SKB(skb)->vcc = vcc = entry->vccs->vcc;
- DPRINTK("using neighbour %p, vcc %p\n", skb->dst->neighbour, vcc);
+ pr_debug("using neighbour %p, vcc %p\n", n, vcc);
if (entry->vccs->encap) {
void *here;
@@ -432,21 +393,20 @@ static int clip_start_xmit(struct sk_buff *skb, struct net_device *dev)
memcpy(here, llc_oui, sizeof(llc_oui));
((__be16 *) here)[3] = skb->protocol;
}
- atomic_add(skb->truesize, &sk_atm(vcc)->sk_wmem_alloc);
- ATM_SKB(skb)->atm_options = vcc->atm_options;
+ atm_account_tx(vcc, skb);
entry->vccs->last_use = jiffies;
- DPRINTK("atm_skb(%p)->vcc(%p)->dev(%p)\n", skb, vcc, vcc->dev);
+ pr_debug("atm_skb(%p)->vcc(%p)->dev(%p)\n", skb, vcc, vcc->dev);
old = xchg(&entry->vccs->xoff, 1); /* assume XOFF ... */
if (old) {
- printk(KERN_WARNING "clip_start_xmit: XOFF->XOFF transition\n");
- return 0;
+ pr_warn("XOFF->XOFF transition\n");
+ goto out_release_neigh;
}
- clip_priv->stats.tx_packets++;
- clip_priv->stats.tx_bytes += skb->len;
+ dev->stats.tx_packets++;
+ dev->stats.tx_bytes += skb->len;
vcc->send(vcc, skb);
if (atm_may_send(vcc, 0)) {
entry->vccs->xoff = 0;
- return 0;
+ goto out_release_neigh;
}
spin_lock_irqsave(&clip_priv->xoff_lock, flags);
netif_stop_queue(dev); /* XOFF -> throttle immediately */
@@ -458,26 +418,23 @@ static int clip_start_xmit(struct sk_buff *skb, struct net_device *dev)
of the brief netif_stop_queue. If this isn't true or if it
changes, use netif_wake_queue instead. */
spin_unlock_irqrestore(&clip_priv->xoff_lock, flags);
- return 0;
-}
-
-static struct net_device_stats *clip_get_stats(struct net_device *dev)
-{
- return &PRIV(dev)->stats;
+out_release_neigh:
+ neigh_release(n);
+ return NETDEV_TX_OK;
}
static int clip_mkip(struct atm_vcc *vcc, int timeout)
{
struct clip_vcc *clip_vcc;
- struct sk_buff_head copy;
- struct sk_buff *skb;
if (!vcc->push)
return -EBADFD;
+ if (vcc->user_back)
+ return -EINVAL;
clip_vcc = kmalloc(sizeof(struct clip_vcc), GFP_KERNEL);
if (!clip_vcc)
return -ENOMEM;
- DPRINTK("mkip clip_vcc %p vcc %p\n", clip_vcc, vcc);
+ pr_debug("%p vcc %p\n", clip_vcc, vcc);
clip_vcc->vcc = vcc;
vcc->user_back = clip_vcc;
set_bit(ATM_VF_IS_CLIP, &vcc->flags);
@@ -490,22 +447,10 @@ static int clip_mkip(struct atm_vcc *vcc, int timeout)
clip_vcc->old_pop = vcc->pop;
vcc->push = clip_push;
vcc->pop = clip_pop;
- skb_queue_head_init(&copy);
- skb_migrate(&sk_atm(vcc)->sk_receive_queue, &copy);
+
/* re-process everything received between connection setup and MKIP */
- while ((skb = skb_dequeue(&copy)) != NULL)
- if (!clip_devs) {
- atm_return(vcc, skb->truesize);
- kfree_skb(skb);
- } else {
- unsigned int len = skb->len;
-
- skb_get(skb);
- clip_push(vcc, skb);
- PRIV(skb->dev)->stats.rx_packets--;
- PRIV(skb->dev)->stats.rx_bytes -= len;
- kfree_skb(skb);
- }
+ vcc_process_recv_queue(vcc);
+
return 0;
}
@@ -515,52 +460,55 @@ static int clip_setentry(struct atm_vcc *vcc, __be32 ip)
struct atmarp_entry *entry;
int error;
struct clip_vcc *clip_vcc;
- struct flowi fl = { .nl_u = { .ip4_u = { .daddr = ip, .tos = 1}} };
struct rtable *rt;
if (vcc->push != clip_push) {
- printk(KERN_WARNING "clip_setentry: non-CLIP VCC\n");
+ pr_warn("non-CLIP VCC\n");
return -EBADF;
}
clip_vcc = CLIP_VCC(vcc);
if (!ip) {
if (!clip_vcc->entry) {
- printk(KERN_ERR "hiding hidden ATMARP entry\n");
+ pr_err("hiding hidden ATMARP entry\n");
return 0;
}
- DPRINTK("setentry: remove\n");
+ pr_debug("remove\n");
unlink_clip_vcc(clip_vcc);
return 0;
}
- error = ip_route_output_key(&rt, &fl);
- if (error)
- return error;
- neigh = __neigh_lookup(&clip_tbl, &ip, rt->u.dst.dev, 1);
+ rt = ip_route_output(&init_net, ip, 0, 0, 0, RT_SCOPE_LINK);
+ if (IS_ERR(rt))
+ return PTR_ERR(rt);
+ neigh = __neigh_lookup(&arp_tbl, &ip, rt->dst.dev, 1);
ip_rt_put(rt);
if (!neigh)
return -ENOMEM;
- entry = NEIGH2ENTRY(neigh);
+ entry = neighbour_priv(neigh);
if (entry != clip_vcc->entry) {
if (!clip_vcc->entry)
- DPRINTK("setentry: add\n");
+ pr_debug("add\n");
else {
- DPRINTK("setentry: update\n");
+ pr_debug("update\n");
unlink_clip_vcc(clip_vcc);
}
link_vcc(clip_vcc, entry);
}
error = neigh_update(neigh, llc_oui, NUD_PERMANENT,
- NEIGH_UPDATE_F_OVERRIDE | NEIGH_UPDATE_F_ADMIN);
+ NEIGH_UPDATE_F_OVERRIDE | NEIGH_UPDATE_F_ADMIN, 0);
neigh_release(neigh);
return error;
}
+static const struct net_device_ops clip_netdev_ops = {
+ .ndo_start_xmit = clip_start_xmit,
+ .ndo_neigh_construct = clip_constructor,
+};
+
static void clip_setup(struct net_device *dev)
{
- dev->hard_start_xmit = clip_start_xmit;
- /* sg_xmit ... */
- dev->get_stats = clip_get_stats;
+ dev->netdev_ops = &clip_netdev_ops;
dev->type = ARPHRD_ATM;
+ dev->neigh_priv_len = sizeof(struct atmarp_entry);
dev->hard_header_len = RFC1483LLC_LEN;
dev->mtu = RFC1626_MTU;
dev->tx_queue_len = 100; /* "normal" queue (packets) */
@@ -569,6 +517,7 @@ static void clip_setup(struct net_device *dev)
/* without any more elaborate queuing. 100 is a reasonable */
/* compromise between decent burst-tolerance and protection */
/* against memory hogs. */
+ netif_keep_dst(dev);
}
static int clip_create(int number)
@@ -587,7 +536,8 @@ static int clip_create(int number)
if (PRIV(dev)->number >= number)
number = PRIV(dev)->number + 1;
}
- dev = alloc_netdev(sizeof(struct clip_priv), "", clip_setup);
+ dev = alloc_netdev(sizeof(struct clip_priv), "", NET_NAME_UNKNOWN,
+ clip_setup);
if (!dev)
return -ENOMEM;
clip_priv = PRIV(dev);
@@ -601,36 +551,37 @@ static int clip_create(int number)
}
clip_priv->next = clip_devs;
clip_devs = dev;
- DPRINTK("registered (net:%s)\n", dev->name);
+ pr_debug("registered (net:%s)\n", dev->name);
return number;
}
static int clip_device_event(struct notifier_block *this, unsigned long event,
- void *arg)
+ void *ptr)
{
- struct net_device *dev = arg;
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
- if (event == NETDEV_UNREGISTER) {
- neigh_ifdown(&clip_tbl, dev);
+ if (!net_eq(dev_net(dev), &init_net))
+ return NOTIFY_DONE;
+
+ if (event == NETDEV_UNREGISTER)
return NOTIFY_DONE;
- }
/* ignore non-CLIP devices */
- if (dev->type != ARPHRD_ATM || dev->hard_start_xmit != clip_start_xmit)
+ if (dev->type != ARPHRD_ATM || dev->netdev_ops != &clip_netdev_ops)
return NOTIFY_DONE;
switch (event) {
case NETDEV_UP:
- DPRINTK("clip_device_event NETDEV_UP\n");
+ pr_debug("NETDEV_UP\n");
to_atmarpd(act_up, PRIV(dev)->number, 0);
break;
case NETDEV_GOING_DOWN:
- DPRINTK("clip_device_event NETDEV_DOWN\n");
+ pr_debug("NETDEV_DOWN\n");
to_atmarpd(act_down, PRIV(dev)->number, 0);
break;
case NETDEV_CHANGE:
case NETDEV_CHANGEMTU:
- DPRINTK("clip_device_event NETDEV_CHANGE*\n");
+ pr_debug("NETDEV_CHANGE*\n");
to_atmarpd(act_change, PRIV(dev)->number, 0);
break;
}
@@ -641,22 +592,19 @@ static int clip_inet_event(struct notifier_block *this, unsigned long event,
void *ifa)
{
struct in_device *in_dev;
+ struct netdev_notifier_info info;
in_dev = ((struct in_ifaddr *)ifa)->ifa_dev;
- if (!in_dev || !in_dev->dev) {
- printk(KERN_WARNING "clip_inet_event: no device\n");
- return NOTIFY_DONE;
- }
/*
* Transitions are of the down-change-up type, so it's sufficient to
* handle the change on up.
*/
if (event != NETDEV_UP)
return NOTIFY_DONE;
- return clip_device_event(this, NETDEV_CHANGE, in_dev->dev);
+ netdev_notifier_info_init(&info, in_dev->dev);
+ return clip_device_event(this, NETDEV_CHANGE, &info);
}
-
static struct notifier_block clip_dev_notifier = {
.notifier_call = clip_device_event,
};
@@ -671,20 +619,29 @@ static struct notifier_block clip_inet_notifier = {
static void atmarpd_close(struct atm_vcc *vcc)
{
- DPRINTK("atmarpd_close\n");
+ pr_debug("\n");
- rtnl_lock();
- atmarpd = NULL;
+ mutex_lock(&atmarpd_lock);
+ RCU_INIT_POINTER(atmarpd, NULL);
+ mutex_unlock(&atmarpd_lock);
+
+ synchronize_rcu();
skb_queue_purge(&sk_atm(vcc)->sk_receive_queue);
- rtnl_unlock();
- DPRINTK("(done)\n");
+ pr_debug("(done)\n");
module_put(THIS_MODULE);
}
+static int atmarpd_send(struct atm_vcc *vcc, struct sk_buff *skb)
+{
+ atm_return_tx(vcc, skb);
+ dev_kfree_skb_any(skb);
+ return 0;
+}
-static struct atmdev_ops atmarpd_dev_ops = {
- .close = atmarpd_close
+static const struct atmdev_ops atmarpd_dev_ops = {
+ .close = atmarpd_close,
+ .send = atmarpd_send
};
@@ -692,36 +649,40 @@ static struct atm_dev atmarpd_dev = {
.ops = &atmarpd_dev_ops,
.type = "arpd",
.number = 999,
- .lock = SPIN_LOCK_UNLOCKED
+ .lock = __SPIN_LOCK_UNLOCKED(atmarpd_dev.lock)
};
static int atm_init_atmarp(struct atm_vcc *vcc)
{
- rtnl_lock();
+ if (vcc->push == clip_push)
+ return -EINVAL;
+
+ mutex_lock(&atmarpd_lock);
if (atmarpd) {
- rtnl_unlock();
+ mutex_unlock(&atmarpd_lock);
return -EADDRINUSE;
}
- mod_timer(&idle_timer, jiffies+CLIP_CHECK_INTERVAL*HZ);
+ mod_timer(&idle_timer, jiffies + CLIP_CHECK_INTERVAL * HZ);
- atmarpd = vcc;
- set_bit(ATM_VF_META,&vcc->flags);
- set_bit(ATM_VF_READY,&vcc->flags);
+ rcu_assign_pointer(atmarpd, vcc);
+ set_bit(ATM_VF_META, &vcc->flags);
+ set_bit(ATM_VF_READY, &vcc->flags);
/* allow replies and avoid getting closed if signaling dies */
vcc->dev = &atmarpd_dev;
vcc_insert_socket(sk_atm(vcc));
vcc->push = NULL;
vcc->pop = NULL; /* crash */
vcc->push_oam = NULL; /* crash */
- rtnl_unlock();
+ mutex_unlock(&atmarpd_lock);
return 0;
}
static int clip_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
struct atm_vcc *vcc = ATM_SD(sock);
+ struct sock *sk = sock->sk;
int err = 0;
switch (cmd) {
@@ -742,14 +703,18 @@ static int clip_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
err = clip_create(arg);
break;
case ATMARPD_CTRL:
+ lock_sock(sk);
err = atm_init_atmarp(vcc);
if (!err) {
sock->state = SS_CONNECTED;
__module_get(THIS_MODULE);
}
+ release_sock(sk);
break;
case ATMARP_MKIP:
+ lock_sock(sk);
err = clip_mkip(vcc, arg);
+ release_sock(sk);
break;
case ATMARP_SETENTRY:
err = clip_setentry(vcc, (__force __be32)arg);
@@ -799,9 +764,10 @@ static void svc_addr(struct seq_file *seq, struct sockaddr_atmsvc *addr)
/* This means the neighbour entry has no attached VCC objects. */
#define SEQ_NO_VCC_TOKEN ((void *) 2)
-static void atmarp_info(struct seq_file *seq, struct net_device *dev,
+static void atmarp_info(struct seq_file *seq, struct neighbour *n,
struct atmarp_entry *entry, struct clip_vcc *clip_vcc)
{
+ struct net_device *dev = n->dev;
unsigned long exp;
char buf[17];
int svc, llc, off;
@@ -821,8 +787,7 @@ static void atmarp_info(struct seq_file *seq, struct net_device *dev,
seq_printf(seq, "%-6s%-4s%-4s%5ld ",
dev->name, svc ? "SVC" : "PVC", llc ? "LLC" : "NULL", exp);
- off = scnprintf(buf, sizeof(buf) - 1, "%d.%d.%d.%d",
- NIPQUAD(entry->ip));
+ off = scnprintf(buf, sizeof(buf) - 1, "%pI4", n->primary_key);
while (off < 16)
buf[off++] = ' ';
buf[off] = '\0';
@@ -833,7 +798,7 @@ static void atmarp_info(struct seq_file *seq, struct net_device *dev,
seq_printf(seq, "(resolving)\n");
else
seq_printf(seq, "(expired, ref %d)\n",
- atomic_read(&entry->neigh->refcnt));
+ refcount_read(&entry->neigh->refcnt));
} else if (!svc) {
seq_printf(seq, "%d.%d.%d\n",
clip_vcc->vcc->dev->number,
@@ -893,12 +858,17 @@ static void *clip_seq_sub_iter(struct neigh_seq_state *_state,
{
struct clip_seq_state *state = (struct clip_seq_state *)_state;
- return clip_seq_vcc_walk(state, NEIGH2ENTRY(n), pos);
+ if (n->dev->type != ARPHRD_ATM)
+ return NULL;
+
+ return clip_seq_vcc_walk(state, neighbour_priv(n), pos);
}
static void *clip_seq_start(struct seq_file *seq, loff_t * pos)
{
- return neigh_seq_start(seq, pos, &clip_tbl, NEIGH_SEQ_NEIGH_ONLY);
+ struct clip_seq_state *state = seq->private;
+ state->ns.neigh_sub_iter = clip_seq_sub_iter;
+ return neigh_seq_start(seq, pos, &arp_tbl, NEIGH_SEQ_NEIGH_ONLY);
}
static int clip_seq_show(struct seq_file *seq, void *v)
@@ -910,87 +880,53 @@ static int clip_seq_show(struct seq_file *seq, void *v)
seq_puts(seq, atm_arp_banner);
} else {
struct clip_seq_state *state = seq->private;
- struct neighbour *n = v;
struct clip_vcc *vcc = state->vcc;
+ struct neighbour *n = v;
- atmarp_info(seq, n->dev, NEIGH2ENTRY(n), vcc);
+ atmarp_info(seq, n, neighbour_priv(n), vcc);
}
return 0;
}
-static struct seq_operations arp_seq_ops = {
+static const struct seq_operations arp_seq_ops = {
.start = clip_seq_start,
.next = neigh_seq_next,
.stop = neigh_seq_stop,
.show = clip_seq_show,
};
-
-static int arp_seq_open(struct inode *inode, struct file *file)
-{
- struct clip_seq_state *state;
- struct seq_file *seq;
- int rc = -EAGAIN;
-
- state = kzalloc(sizeof(*state), GFP_KERNEL);
- if (!state) {
- rc = -ENOMEM;
- goto out_kfree;
- }
- state->ns.neigh_sub_iter = clip_seq_sub_iter;
-
- rc = seq_open(file, &arp_seq_ops);
- if (rc)
- goto out_kfree;
-
- seq = file->private_data;
- seq->private = state;
-out:
- return rc;
-
-out_kfree:
- kfree(state);
- goto out;
-}
-
-static struct file_operations arp_seq_fops = {
- .open = arp_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release_private,
- .owner = THIS_MODULE
-};
#endif
+static void atm_clip_exit_noproc(void);
+
static int __init atm_clip_init(void)
{
- neigh_table_init_no_netlink(&clip_tbl);
-
- clip_tbl_hook = &clip_tbl;
register_atm_ioctl(&clip_ioctl_ops);
register_netdevice_notifier(&clip_dev_notifier);
register_inetaddr_notifier(&clip_inet_notifier);
- setup_timer(&idle_timer, idle_timer_check, 0);
+ timer_setup(&idle_timer, idle_timer_check, 0);
#ifdef CONFIG_PROC_FS
{
struct proc_dir_entry *p;
- p = create_proc_entry("arp", S_IRUGO, atm_proc_root);
- if (p)
- p->proc_fops = &arp_seq_fops;
+ p = proc_create_net("arp", 0444, atm_proc_root, &arp_seq_ops,
+ sizeof(struct clip_seq_state));
+ if (!p) {
+ pr_err("Unable to initialize /proc/net/atm/arp\n");
+ atm_clip_exit_noproc();
+ return -ENOMEM;
+ }
}
#endif
return 0;
}
-static void __exit atm_clip_exit(void)
+static void atm_clip_exit_noproc(void)
{
struct net_device *dev, *next;
- remove_proc_entry("arp", atm_proc_root);
-
unregister_inetaddr_notifier(&clip_inet_notifier);
unregister_netdevice_notifier(&clip_dev_notifier);
@@ -999,13 +935,7 @@ static void __exit atm_clip_exit(void)
/* First, stop the idle timer, so it stops banging
* on the table.
*/
- del_timer_sync(&idle_timer);
-
- /* Next, purge the table, so that the device
- * unregister loop below does not hang due to
- * device references remaining in the table.
- */
- neigh_ifdown(&clip_tbl, NULL);
+ timer_delete_sync(&idle_timer);
dev = clip_devs;
while (dev) {
@@ -1014,11 +944,13 @@ static void __exit atm_clip_exit(void)
free_netdev(dev);
dev = next;
}
+}
- /* Now it is safe to fully shutdown whole table. */
- neigh_table_clear(&clip_tbl);
+static void __exit atm_clip_exit(void)
+{
+ remove_proc_entry("arp", atm_proc_root);
- clip_tbl_hook = NULL;
+ atm_clip_exit_noproc();
}
module_init(atm_clip_init);
diff --git a/net/atm/common.c b/net/atm/common.c
index fbabff494468..fe77f51f6ce1 100644
--- a/net/atm/common.c
+++ b/net/atm/common.c
@@ -1,7 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* net/atm/common.c - ATM sockets (common part for PVC and SVC) */
/* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__
#include <linux/module.h>
#include <linux/kmod.h>
@@ -12,17 +14,17 @@
#include <linux/errno.h> /* error codes */
#include <linux/capability.h>
#include <linux/mm.h>
-#include <linux/sched.h>
-#include <linux/time.h> /* struct timeval */
+#include <linux/sched/signal.h>
+#include <linux/time64.h> /* 64-bit time for seconds */
#include <linux/skbuff.h>
#include <linux/bitops.h>
#include <linux/init.h>
+#include <linux/slab.h>
#include <net/sock.h> /* struct sock */
+#include <linux/uaccess.h>
+#include <linux/poll.h>
-#include <asm/uaccess.h>
-#include <asm/atomic.h>
-#include <asm/poll.h>
-
+#include <linux/atomic.h>
#include "resources.h" /* atm_find_dev */
#include "common.h" /* prototypes */
@@ -30,21 +32,18 @@
#include "addr.h" /* address registry */
#include "signaling.h" /* for WAITING and sigd_attach */
-
-#if 0
-#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args)
-#else
-#define DPRINTK(format,args...)
-#endif
-
struct hlist_head vcc_hash[VCC_HTABLE_SIZE];
+EXPORT_SYMBOL(vcc_hash);
+
DEFINE_RWLOCK(vcc_sklist_lock);
+EXPORT_SYMBOL(vcc_sklist_lock);
+
+static ATOMIC_NOTIFIER_HEAD(atm_dev_notify_chain);
static void __vcc_insert_socket(struct sock *sk)
{
struct atm_vcc *vcc = atm_sk(sk);
- struct hlist_head *head = &vcc_hash[vcc->vci &
- (VCC_HTABLE_SIZE - 1)];
+ struct hlist_head *head = &vcc_hash[vcc->vci & (VCC_HTABLE_SIZE - 1)];
sk->sk_hash = vcc->vci & (VCC_HTABLE_SIZE - 1);
sk_add_node(sk, head);
}
@@ -55,6 +54,7 @@ void vcc_insert_socket(struct sock *sk)
__vcc_insert_socket(sk);
write_unlock_irq(&vcc_sklist_lock);
}
+EXPORT_SYMBOL(vcc_insert_socket);
static void vcc_remove_socket(struct sock *sk)
{
@@ -63,45 +63,38 @@ static void vcc_remove_socket(struct sock *sk)
write_unlock_irq(&vcc_sklist_lock);
}
-
-static struct sk_buff *alloc_tx(struct atm_vcc *vcc,unsigned int size)
+static bool vcc_tx_ready(struct atm_vcc *vcc, unsigned int size)
{
- struct sk_buff *skb;
struct sock *sk = sk_atm(vcc);
- if (atomic_read(&sk->sk_wmem_alloc) && !atm_may_send(vcc, size)) {
- DPRINTK("Sorry: wmem_alloc = %d, size = %d, sndbuf = %d\n",
- atomic_read(&sk->sk_wmem_alloc), size,
- sk->sk_sndbuf);
- return NULL;
+ if (sk_wmem_alloc_get(sk) && !atm_may_send(vcc, size)) {
+ pr_debug("Sorry: wmem_alloc = %d, size = %d, sndbuf = %d\n",
+ sk_wmem_alloc_get(sk), size, sk->sk_sndbuf);
+ return false;
}
- while (!(skb = alloc_skb(size,GFP_KERNEL))) schedule();
- DPRINTK("AlTx %d += %d\n", atomic_read(&sk->sk_wmem_alloc),
- skb->truesize);
- atomic_add(skb->truesize, &sk->sk_wmem_alloc);
- return skb;
+ return true;
}
-
-EXPORT_SYMBOL(vcc_hash);
-EXPORT_SYMBOL(vcc_sklist_lock);
-EXPORT_SYMBOL(vcc_insert_socket);
-
static void vcc_sock_destruct(struct sock *sk)
{
if (atomic_read(&sk->sk_rmem_alloc))
- printk(KERN_DEBUG "vcc_sock_destruct: rmem leakage (%d bytes) detected.\n", atomic_read(&sk->sk_rmem_alloc));
+ printk(KERN_DEBUG "%s: rmem leakage (%d bytes) detected.\n",
+ __func__, atomic_read(&sk->sk_rmem_alloc));
- if (atomic_read(&sk->sk_wmem_alloc))
- printk(KERN_DEBUG "vcc_sock_destruct: wmem leakage (%d bytes) detected.\n", atomic_read(&sk->sk_wmem_alloc));
+ if (refcount_read(&sk->sk_wmem_alloc))
+ printk(KERN_DEBUG "%s: wmem leakage (%d bytes) detected.\n",
+ __func__, refcount_read(&sk->sk_wmem_alloc));
}
static void vcc_def_wakeup(struct sock *sk)
{
- read_lock(&sk->sk_callback_lock);
- if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
- wake_up(sk->sk_sleep);
- read_unlock(&sk->sk_callback_lock);
+ struct socket_wq *wq;
+
+ rcu_read_lock();
+ wq = rcu_dereference(sk->sk_wq);
+ if (skwq_has_sleeper(wq))
+ wake_up(&wq->wait);
+ rcu_read_unlock();
}
static inline int vcc_writable(struct sock *sk)
@@ -109,30 +102,42 @@ static inline int vcc_writable(struct sock *sk)
struct atm_vcc *vcc = atm_sk(sk);
return (vcc->qos.txtp.max_sdu +
- atomic_read(&sk->sk_wmem_alloc)) <= sk->sk_sndbuf;
+ refcount_read(&sk->sk_wmem_alloc)) <= sk->sk_sndbuf;
}
static void vcc_write_space(struct sock *sk)
-{
- read_lock(&sk->sk_callback_lock);
+{
+ struct socket_wq *wq;
+
+ rcu_read_lock();
if (vcc_writable(sk)) {
- if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
- wake_up_interruptible(sk->sk_sleep);
+ wq = rcu_dereference(sk->sk_wq);
+ if (skwq_has_sleeper(wq))
+ wake_up_interruptible(&wq->wait);
- sk_wake_async(sk, 2, POLL_OUT);
+ sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
}
- read_unlock(&sk->sk_callback_lock);
+ rcu_read_unlock();
+}
+
+static void vcc_release_cb(struct sock *sk)
+{
+ struct atm_vcc *vcc = atm_sk(sk);
+
+ if (vcc->release_cb)
+ vcc->release_cb(vcc);
}
static struct proto vcc_proto = {
.name = "VCC",
.owner = THIS_MODULE,
.obj_size = sizeof(struct atm_vcc),
+ .release_cb = vcc_release_cb,
};
-
-int vcc_create(struct socket *sock, int protocol, int family)
+
+int vcc_create(struct net *net, struct socket *sock, int protocol, int family, int kern)
{
struct sock *sk;
struct atm_vcc *vcc;
@@ -140,7 +145,7 @@ int vcc_create(struct socket *sock, int protocol, int family)
sock->sk = NULL;
if (sock->type == SOCK_STREAM)
return -EINVAL;
- sk = sk_alloc(family, GFP_KERNEL, &vcc_proto, 1);
+ sk = sk_alloc(net, family, GFP_KERNEL, &vcc_proto, kern);
if (!sk)
return -ENOMEM;
sock_init_data(sock, sk);
@@ -149,21 +154,22 @@ int vcc_create(struct socket *sock, int protocol, int family)
vcc = atm_sk(sk);
vcc->dev = NULL;
- memset(&vcc->local,0,sizeof(struct sockaddr_atmsvc));
- memset(&vcc->remote,0,sizeof(struct sockaddr_atmsvc));
+ memset(&vcc->local, 0, sizeof(struct sockaddr_atmsvc));
+ memset(&vcc->remote, 0, sizeof(struct sockaddr_atmsvc));
vcc->qos.txtp.max_sdu = 1 << 16; /* for meta VCs */
- atomic_set(&sk->sk_wmem_alloc, 0);
+ refcount_set(&sk->sk_wmem_alloc, SK_WMEM_ALLOC_BIAS);
atomic_set(&sk->sk_rmem_alloc, 0);
vcc->push = NULL;
vcc->pop = NULL;
+ vcc->owner = NULL;
vcc->push_oam = NULL;
+ vcc->release_cb = NULL;
vcc->vpi = vcc->vci = 0; /* no VCI/VPI yet */
vcc->atm_options = vcc->aal_options = 0;
sk->sk_destruct = vcc_sock_destruct;
return 0;
}
-
static void vcc_destroy_socket(struct sock *sk)
{
struct atm_vcc *vcc = atm_sk(sk);
@@ -171,17 +177,18 @@ static void vcc_destroy_socket(struct sock *sk)
set_bit(ATM_VF_CLOSE, &vcc->flags);
clear_bit(ATM_VF_READY, &vcc->flags);
- if (vcc->dev) {
- if (vcc->dev->ops->close)
- vcc->dev->ops->close(vcc);
- if (vcc->push)
- vcc->push(vcc, NULL); /* atmarpd has no push */
-
- while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
- atm_return(vcc,skb->truesize);
- kfree_skb(skb);
- }
+ if (vcc->dev && vcc->dev->ops->close)
+ vcc->dev->ops->close(vcc);
+ if (vcc->push)
+ vcc->push(vcc, NULL); /* atmarpd has no push */
+ module_put(vcc->owner);
+
+ while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
+ atm_return(vcc, skb->truesize);
+ kfree_skb(skb);
+ }
+ if (vcc->dev && vcc->dev->ops->owner) {
module_put(vcc->dev->ops->owner);
atm_dev_put(vcc->dev);
}
@@ -189,7 +196,6 @@ static void vcc_destroy_socket(struct sock *sk)
vcc_remove_socket(sk);
}
-
int vcc_release(struct socket *sock)
{
struct sock *sk = sock->sk;
@@ -204,7 +210,6 @@ int vcc_release(struct socket *sock)
return 0;
}
-
void vcc_release_async(struct atm_vcc *vcc, int reply)
{
struct sock *sk = sk_atm(vcc);
@@ -215,10 +220,44 @@ void vcc_release_async(struct atm_vcc *vcc, int reply)
clear_bit(ATM_VF_WAITING, &vcc->flags);
sk->sk_state_change(sk);
}
+EXPORT_SYMBOL(vcc_release_async);
+
+void vcc_process_recv_queue(struct atm_vcc *vcc)
+{
+ struct sk_buff_head queue, *rq;
+ struct sk_buff *skb, *tmp;
+ unsigned long flags;
+ __skb_queue_head_init(&queue);
+ rq = &sk_atm(vcc)->sk_receive_queue;
-EXPORT_SYMBOL(vcc_release_async);
+ spin_lock_irqsave(&rq->lock, flags);
+ skb_queue_splice_init(rq, &queue);
+ spin_unlock_irqrestore(&rq->lock, flags);
+ skb_queue_walk_safe(&queue, skb, tmp) {
+ __skb_unlink(skb, &queue);
+ vcc->push(vcc, skb);
+ }
+}
+EXPORT_SYMBOL(vcc_process_recv_queue);
+
+void atm_dev_signal_change(struct atm_dev *dev, char signal)
+{
+ pr_debug("%s signal=%d dev=%p number=%d dev->signal=%d\n",
+ __func__, signal, dev, dev->number, dev->signal);
+
+ /* atm driver sending invalid signal */
+ WARN_ON(signal < ATM_PHY_SIG_LOST || signal > ATM_PHY_SIG_FOUND);
+
+ if (dev->signal == signal)
+ return; /* no change */
+
+ dev->signal = signal;
+
+ atomic_notifier_call_chain(&atm_dev_notify_chain, signal, dev);
+}
+EXPORT_SYMBOL(atm_dev_signal_change);
void atm_dev_release_vccs(struct atm_dev *dev)
{
@@ -227,11 +266,11 @@ void atm_dev_release_vccs(struct atm_dev *dev)
write_lock_irq(&vcc_sklist_lock);
for (i = 0; i < VCC_HTABLE_SIZE; i++) {
struct hlist_head *head = &vcc_hash[i];
- struct hlist_node *node, *tmp;
+ struct hlist_node *tmp;
struct sock *s;
struct atm_vcc *vcc;
- sk_for_each_safe(s, node, tmp, head) {
+ sk_for_each_safe(s, tmp, head) {
vcc = atm_sk(s);
if (vcc->dev == dev) {
vcc_release_async(vcc, -EPIPE);
@@ -241,43 +280,43 @@ void atm_dev_release_vccs(struct atm_dev *dev)
}
write_unlock_irq(&vcc_sklist_lock);
}
+EXPORT_SYMBOL(atm_dev_release_vccs);
-
-static int adjust_tp(struct atm_trafprm *tp,unsigned char aal)
+static int adjust_tp(struct atm_trafprm *tp, unsigned char aal)
{
int max_sdu;
- if (!tp->traffic_class) return 0;
+ if (!tp->traffic_class)
+ return 0;
switch (aal) {
- case ATM_AAL0:
- max_sdu = ATM_CELL_SIZE-1;
- break;
- case ATM_AAL34:
- max_sdu = ATM_MAX_AAL34_PDU;
- break;
- default:
- printk(KERN_WARNING "ATM: AAL problems ... "
- "(%d)\n",aal);
- /* fall through */
- case ATM_AAL5:
- max_sdu = ATM_MAX_AAL5_PDU;
- }
- if (!tp->max_sdu) tp->max_sdu = max_sdu;
- else if (tp->max_sdu > max_sdu) return -EINVAL;
- if (!tp->max_cdv) tp->max_cdv = ATM_MAX_CDV;
+ case ATM_AAL0:
+ max_sdu = ATM_CELL_SIZE-1;
+ break;
+ case ATM_AAL34:
+ max_sdu = ATM_MAX_AAL34_PDU;
+ break;
+ default:
+ pr_warn("AAL problems ... (%d)\n", aal);
+ fallthrough;
+ case ATM_AAL5:
+ max_sdu = ATM_MAX_AAL5_PDU;
+ }
+ if (!tp->max_sdu)
+ tp->max_sdu = max_sdu;
+ else if (tp->max_sdu > max_sdu)
+ return -EINVAL;
+ if (!tp->max_cdv)
+ tp->max_cdv = ATM_MAX_CDV;
return 0;
}
-
-static int check_ci(struct atm_vcc *vcc, short vpi, int vci)
+static int check_ci(const struct atm_vcc *vcc, short vpi, int vci)
{
- struct hlist_head *head = &vcc_hash[vci &
- (VCC_HTABLE_SIZE - 1)];
- struct hlist_node *node;
+ struct hlist_head *head = &vcc_hash[vci & (VCC_HTABLE_SIZE - 1)];
struct sock *s;
struct atm_vcc *walk;
- sk_for_each(s, node, head) {
+ sk_for_each(s, head) {
walk = atm_sk(s);
if (walk->dev != vcc->dev)
continue;
@@ -296,8 +335,7 @@ static int check_ci(struct atm_vcc *vcc, short vpi, int vci)
return 0;
}
-
-static int find_ci(struct atm_vcc *vcc, short *vpi, int *vci)
+static int find_ci(const struct atm_vcc *vcc, short *vpi, int *vci)
{
static short p; /* poor man's per-device cache */
static int c;
@@ -334,14 +372,13 @@ static int find_ci(struct atm_vcc *vcc, short *vpi, int *vci)
if ((c == ATM_NOT_RSV_VCI || *vci != ATM_VCI_ANY) &&
*vpi == ATM_VPI_ANY) {
p++;
- if (p >= 1 << vcc->dev->ci_range.vpi_bits) p = 0;
+ if (p >= 1 << vcc->dev->ci_range.vpi_bits)
+ p = 0;
}
- }
- while (old_p != p || old_c != c);
+ } while (old_p != p || old_c != c);
return -EADDRINUSE;
}
-
static int __vcc_connect(struct atm_vcc *vcc, struct atm_dev *dev, short vpi,
int vci)
{
@@ -359,7 +396,7 @@ static int __vcc_connect(struct atm_vcc *vcc, struct atm_dev *dev, short vpi,
return error;
vcc->dev = dev;
write_lock_irq(&vcc_sklist_lock);
- if (test_bit(ATM_DF_REMOVED, &dev->flags) ||
+ if (test_bit(ATM_DF_REMOVED, &dev->flags) ||
(error = find_ci(vcc, &vpi, &vci))) {
write_unlock_irq(&vcc_sklist_lock);
goto fail_module_put;
@@ -369,37 +406,46 @@ static int __vcc_connect(struct atm_vcc *vcc, struct atm_dev *dev, short vpi,
__vcc_insert_socket(sk);
write_unlock_irq(&vcc_sklist_lock);
switch (vcc->qos.aal) {
- case ATM_AAL0:
- error = atm_init_aal0(vcc);
- vcc->stats = &dev->stats.aal0;
- break;
- case ATM_AAL34:
- error = atm_init_aal34(vcc);
- vcc->stats = &dev->stats.aal34;
- break;
- case ATM_NO_AAL:
- /* ATM_AAL5 is also used in the "0 for default" case */
- vcc->qos.aal = ATM_AAL5;
- /* fall through */
- case ATM_AAL5:
- error = atm_init_aal5(vcc);
- vcc->stats = &dev->stats.aal5;
- break;
- default:
- error = -EPROTOTYPE;
+ case ATM_AAL0:
+ error = atm_init_aal0(vcc);
+ vcc->stats = &dev->stats.aal0;
+ break;
+ case ATM_AAL34:
+ error = atm_init_aal34(vcc);
+ vcc->stats = &dev->stats.aal34;
+ break;
+ case ATM_NO_AAL:
+ /* ATM_AAL5 is also used in the "0 for default" case */
+ vcc->qos.aal = ATM_AAL5;
+ fallthrough;
+ case ATM_AAL5:
+ error = atm_init_aal5(vcc);
+ vcc->stats = &dev->stats.aal5;
+ break;
+ default:
+ error = -EPROTOTYPE;
}
- if (!error) error = adjust_tp(&vcc->qos.txtp,vcc->qos.aal);
- if (!error) error = adjust_tp(&vcc->qos.rxtp,vcc->qos.aal);
+ if (!error)
+ error = adjust_tp(&vcc->qos.txtp, vcc->qos.aal);
+ if (!error)
+ error = adjust_tp(&vcc->qos.rxtp, vcc->qos.aal);
if (error)
goto fail;
- DPRINTK("VCC %d.%d, AAL %d\n",vpi,vci,vcc->qos.aal);
- DPRINTK(" TX: %d, PCR %d..%d, SDU %d\n",vcc->qos.txtp.traffic_class,
- vcc->qos.txtp.min_pcr,vcc->qos.txtp.max_pcr,vcc->qos.txtp.max_sdu);
- DPRINTK(" RX: %d, PCR %d..%d, SDU %d\n",vcc->qos.rxtp.traffic_class,
- vcc->qos.rxtp.min_pcr,vcc->qos.rxtp.max_pcr,vcc->qos.rxtp.max_sdu);
+ pr_debug("VCC %d.%d, AAL %d\n", vpi, vci, vcc->qos.aal);
+ pr_debug(" TX: %d, PCR %d..%d, SDU %d\n",
+ vcc->qos.txtp.traffic_class,
+ vcc->qos.txtp.min_pcr,
+ vcc->qos.txtp.max_pcr,
+ vcc->qos.txtp.max_sdu);
+ pr_debug(" RX: %d, PCR %d..%d, SDU %d\n",
+ vcc->qos.rxtp.traffic_class,
+ vcc->qos.rxtp.min_pcr,
+ vcc->qos.rxtp.max_pcr,
+ vcc->qos.rxtp.max_sdu);
if (dev->ops->open) {
- if ((error = dev->ops->open(vcc)))
+ error = dev->ops->open(vcc);
+ if (error)
goto fail;
}
return 0;
@@ -413,14 +459,13 @@ fail_module_put:
return error;
}
-
int vcc_connect(struct socket *sock, int itf, short vpi, int vci)
{
struct atm_dev *dev;
struct atm_vcc *vcc = ATM_SD(sock);
int error;
- DPRINTK("vcc_connect (vpi %d, vci %d)\n",vpi,vci);
+ pr_debug("(vpi %d, vci %d)\n", vpi, vci);
if (sock->state == SS_CONNECTED)
return -EISCONN;
if (sock->state != SS_UNCONNECTED)
@@ -429,30 +474,33 @@ int vcc_connect(struct socket *sock, int itf, short vpi, int vci)
return -EINVAL;
if (vpi != ATM_VPI_UNSPEC && vci != ATM_VCI_UNSPEC)
- clear_bit(ATM_VF_PARTIAL,&vcc->flags);
+ clear_bit(ATM_VF_PARTIAL, &vcc->flags);
else
- if (test_bit(ATM_VF_PARTIAL,&vcc->flags))
+ if (test_bit(ATM_VF_PARTIAL, &vcc->flags))
return -EINVAL;
- DPRINTK("vcc_connect (TX: cl %d,bw %d-%d,sdu %d; "
- "RX: cl %d,bw %d-%d,sdu %d,AAL %s%d)\n",
- vcc->qos.txtp.traffic_class,vcc->qos.txtp.min_pcr,
- vcc->qos.txtp.max_pcr,vcc->qos.txtp.max_sdu,
- vcc->qos.rxtp.traffic_class,vcc->qos.rxtp.min_pcr,
- vcc->qos.rxtp.max_pcr,vcc->qos.rxtp.max_sdu,
- vcc->qos.aal == ATM_AAL5 ? "" : vcc->qos.aal == ATM_AAL0 ? "" :
- " ??? code ",vcc->qos.aal == ATM_AAL0 ? 0 : vcc->qos.aal);
+ pr_debug("(TX: cl %d,bw %d-%d,sdu %d; "
+ "RX: cl %d,bw %d-%d,sdu %d,AAL %s%d)\n",
+ vcc->qos.txtp.traffic_class, vcc->qos.txtp.min_pcr,
+ vcc->qos.txtp.max_pcr, vcc->qos.txtp.max_sdu,
+ vcc->qos.rxtp.traffic_class, vcc->qos.rxtp.min_pcr,
+ vcc->qos.rxtp.max_pcr, vcc->qos.rxtp.max_sdu,
+ vcc->qos.aal == ATM_AAL5 ? "" :
+ vcc->qos.aal == ATM_AAL0 ? "" : " ??? code ",
+ vcc->qos.aal == ATM_AAL0 ? 0 : vcc->qos.aal);
if (!test_bit(ATM_VF_HASQOS, &vcc->flags))
return -EBADFD;
if (vcc->qos.txtp.traffic_class == ATM_ANYCLASS ||
vcc->qos.rxtp.traffic_class == ATM_ANYCLASS)
return -EINVAL;
if (likely(itf != ATM_ITF_ANY)) {
- dev = try_then_request_module(atm_dev_lookup(itf), "atm-device-%d", itf);
+ dev = try_then_request_module(atm_dev_lookup(itf),
+ "atm-device-%d", itf);
} else {
dev = NULL;
mutex_lock(&atm_dev_mutex);
if (!list_empty(&atm_devs)) {
- dev = list_entry(atm_devs.next, struct atm_dev, dev_list);
+ dev = list_entry(atm_devs.next,
+ struct atm_dev, dev_list);
atm_dev_hold(dev);
}
mutex_unlock(&atm_dev_mutex);
@@ -465,15 +513,14 @@ int vcc_connect(struct socket *sock, int itf, short vpi, int vci)
return error;
}
if (vpi == ATM_VPI_UNSPEC || vci == ATM_VCI_UNSPEC)
- set_bit(ATM_VF_PARTIAL,&vcc->flags);
- if (test_bit(ATM_VF_READY,&ATM_SD(sock)->flags))
+ set_bit(ATM_VF_PARTIAL, &vcc->flags);
+ if (test_bit(ATM_VF_READY, &ATM_SD(sock)->flags))
sock->state = SS_CONNECTED;
return 0;
}
-
-int vcc_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
- size_t size, int flags)
+int vcc_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
+ int flags)
{
struct sock *sk = sock->sk;
struct atm_vcc *vcc;
@@ -482,45 +529,49 @@ int vcc_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
if (sock->state != SS_CONNECTED)
return -ENOTCONN;
- if (flags & ~MSG_DONTWAIT) /* only handle MSG_DONTWAIT */
+
+ /* only handle MSG_DONTWAIT and MSG_PEEK */
+ if (flags & ~(MSG_DONTWAIT | MSG_PEEK))
return -EOPNOTSUPP;
+
vcc = ATM_SD(sock);
- if (test_bit(ATM_VF_RELEASED,&vcc->flags) ||
- test_bit(ATM_VF_CLOSE,&vcc->flags) ||
+ if (test_bit(ATM_VF_RELEASED, &vcc->flags) ||
+ test_bit(ATM_VF_CLOSE, &vcc->flags) ||
!test_bit(ATM_VF_READY, &vcc->flags))
return 0;
- skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &error);
+ skb = skb_recv_datagram(sk, flags, &error);
if (!skb)
return error;
- copied = skb->len;
+ copied = skb->len;
if (copied > size) {
- copied = size;
+ copied = size;
msg->msg_flags |= MSG_TRUNC;
}
- error = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
- if (error)
- return error;
- sock_recv_timestamp(msg, sk, skb);
- DPRINTK("RcvM %d -= %d\n", atomic_read(&sk->rmem_alloc), skb->truesize);
- atm_return(vcc, skb->truesize);
- skb_free_datagram(sk, skb);
- return copied;
-}
+ error = skb_copy_datagram_msg(skb, 0, msg, copied);
+ if (error)
+ return error;
+ sock_recv_cmsgs(msg, sk, skb);
+ if (!(flags & MSG_PEEK)) {
+ pr_debug("%d -= %d\n", atomic_read(&sk->sk_rmem_alloc),
+ skb->truesize);
+ atm_return(vcc, skb->truesize);
+ }
-int vcc_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
- size_t total_len)
+ skb_free_datagram(sk, skb);
+ return copied;
+}
+
+int vcc_sendmsg(struct socket *sock, struct msghdr *m, size_t size)
{
struct sock *sk = sock->sk;
DEFINE_WAIT(wait);
struct atm_vcc *vcc;
struct sk_buff *skb;
- int eff,error;
- const void __user *buff;
- int size;
+ int eff, error;
lock_sock(sk);
if (sock->state != SS_CONNECTED) {
@@ -531,12 +582,6 @@ int vcc_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
error = -EISCONN;
goto out;
}
- if (m->msg_iovlen != 1) {
- error = -ENOSYS; /* fix this later @@@ */
- goto out;
- }
- buff = m->msg_iov->iov_base;
- size = m->msg_iov->iov_len;
vcc = ATM_SD(sock);
if (test_bit(ATM_VF_RELEASED, &vcc->flags) ||
test_bit(ATM_VF_CLOSE, &vcc->flags) ||
@@ -549,15 +594,15 @@ int vcc_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
error = 0;
goto out;
}
- if (size < 0 || size > vcc->qos.txtp.max_sdu) {
+ if (size > vcc->qos.txtp.max_sdu) {
error = -EMSGSIZE;
goto out;
}
eff = (size+3) & ~3; /* align to word boundary */
- prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
error = 0;
- while (!(skb = alloc_tx(vcc,eff))) {
+ while (!vcc_tx_ready(vcc, eff)) {
if (m->msg_flags & MSG_DONTWAIT) {
error = -EAGAIN;
break;
@@ -567,56 +612,74 @@ int vcc_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
error = -ERESTARTSYS;
break;
}
- if (test_bit(ATM_VF_RELEASED,&vcc->flags) ||
- test_bit(ATM_VF_CLOSE,&vcc->flags) ||
- !test_bit(ATM_VF_READY,&vcc->flags)) {
+ if (test_bit(ATM_VF_RELEASED, &vcc->flags) ||
+ test_bit(ATM_VF_CLOSE, &vcc->flags) ||
+ !test_bit(ATM_VF_READY, &vcc->flags)) {
error = -EPIPE;
send_sig(SIGPIPE, current, 0);
break;
}
- prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
}
- finish_wait(sk->sk_sleep, &wait);
+ finish_wait(sk_sleep(sk), &wait);
if (error)
goto out;
+
+ skb = alloc_skb(eff, GFP_KERNEL);
+ if (!skb) {
+ error = -ENOMEM;
+ goto out;
+ }
+ pr_debug("%d += %d\n", sk_wmem_alloc_get(sk), skb->truesize);
+ atm_account_tx(vcc, skb);
+
skb->dev = NULL; /* for paths shared with net_device interfaces */
- ATM_SKB(skb)->atm_options = vcc->atm_options;
- if (copy_from_user(skb_put(skb,size),buff,size)) {
- kfree_skb(skb);
+ if (!copy_from_iter_full(skb_put(skb, size), size, &m->msg_iter)) {
error = -EFAULT;
- goto out;
+ goto free_skb;
+ }
+ if (eff != size)
+ memset(skb->data + size, 0, eff-size);
+
+ if (vcc->dev->ops->pre_send) {
+ error = vcc->dev->ops->pre_send(vcc, skb);
+ if (error)
+ goto free_skb;
}
- if (eff != size) memset(skb->data+size,0,eff-size);
- error = vcc->dev->ops->send(vcc,skb);
+
+ error = vcc->dev->ops->send(vcc, skb);
error = error ? error : size;
out:
release_sock(sk);
return error;
+free_skb:
+ atm_return_tx(vcc, skb);
+ kfree_skb(skb);
+ goto out;
}
-
-unsigned int vcc_poll(struct file *file, struct socket *sock, poll_table *wait)
+__poll_t vcc_poll(struct file *file, struct socket *sock, poll_table *wait)
{
struct sock *sk = sock->sk;
struct atm_vcc *vcc;
- unsigned int mask;
+ __poll_t mask;
- poll_wait(file, sk->sk_sleep, wait);
+ sock_poll_wait(file, sock, wait);
mask = 0;
vcc = ATM_SD(sock);
/* exceptional events */
if (sk->sk_err)
- mask = POLLERR;
+ mask = EPOLLERR;
if (test_bit(ATM_VF_RELEASED, &vcc->flags) ||
test_bit(ATM_VF_CLOSE, &vcc->flags))
- mask |= POLLHUP;
+ mask |= EPOLLHUP;
/* readable? */
- if (!skb_queue_empty(&sk->sk_receive_queue))
- mask |= POLLIN | POLLRDNORM;
+ if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
+ mask |= EPOLLIN | EPOLLRDNORM;
/* writable? */
if (sock->state == SS_CONNECTING &&
@@ -625,13 +688,12 @@ unsigned int vcc_poll(struct file *file, struct socket *sock, poll_table *wait)
if (vcc->qos.txtp.traffic_class != ATM_NONE &&
vcc_writable(sk))
- mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
+ mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
return mask;
}
-
-static int atm_change_qos(struct atm_vcc *vcc,struct atm_qos *qos)
+static int atm_change_qos(struct atm_vcc *vcc, struct atm_qos *qos)
{
int error;
@@ -643,25 +705,31 @@ static int atm_change_qos(struct atm_vcc *vcc,struct atm_qos *qos)
qos->rxtp.traffic_class != vcc->qos.rxtp.traffic_class ||
qos->txtp.traffic_class != vcc->qos.txtp.traffic_class)
return -EINVAL;
- error = adjust_tp(&qos->txtp,qos->aal);
- if (!error) error = adjust_tp(&qos->rxtp,qos->aal);
- if (error) return error;
- if (!vcc->dev->ops->change_qos) return -EOPNOTSUPP;
+ error = adjust_tp(&qos->txtp, qos->aal);
+ if (!error)
+ error = adjust_tp(&qos->rxtp, qos->aal);
+ if (error)
+ return error;
+ if (!vcc->dev->ops->change_qos)
+ return -EOPNOTSUPP;
if (sk_atm(vcc)->sk_family == AF_ATMPVC)
- return vcc->dev->ops->change_qos(vcc,qos,ATM_MF_SET);
- return svc_change_qos(vcc,qos);
+ return vcc->dev->ops->change_qos(vcc, qos, ATM_MF_SET);
+ return svc_change_qos(vcc, qos);
}
-
-static int check_tp(struct atm_trafprm *tp)
+static int check_tp(const struct atm_trafprm *tp)
{
/* @@@ Should be merged with adjust_tp */
- if (!tp->traffic_class || tp->traffic_class == ATM_ANYCLASS) return 0;
+ if (!tp->traffic_class || tp->traffic_class == ATM_ANYCLASS)
+ return 0;
if (tp->traffic_class != ATM_UBR && !tp->min_pcr && !tp->pcr &&
- !tp->max_pcr) return -EINVAL;
- if (tp->min_pcr == ATM_MAX_PCR) return -EINVAL;
+ !tp->max_pcr)
+ return -EINVAL;
+ if (tp->min_pcr == ATM_MAX_PCR)
+ return -EINVAL;
if (tp->min_pcr && tp->max_pcr && tp->max_pcr != ATM_MAX_PCR &&
- tp->min_pcr > tp->max_pcr) return -EINVAL;
+ tp->min_pcr > tp->max_pcr)
+ return -EINVAL;
/*
* We allow pcr to be outside [min_pcr,max_pcr], because later
* adjustment may still push it in the valid range.
@@ -669,24 +737,25 @@ static int check_tp(struct atm_trafprm *tp)
return 0;
}
-
-static int check_qos(struct atm_qos *qos)
+static int check_qos(const struct atm_qos *qos)
{
int error;
if (!qos->txtp.traffic_class && !qos->rxtp.traffic_class)
- return -EINVAL;
+ return -EINVAL;
if (qos->txtp.traffic_class != qos->rxtp.traffic_class &&
qos->txtp.traffic_class && qos->rxtp.traffic_class &&
qos->txtp.traffic_class != ATM_ANYCLASS &&
- qos->rxtp.traffic_class != ATM_ANYCLASS) return -EINVAL;
+ qos->rxtp.traffic_class != ATM_ANYCLASS)
+ return -EINVAL;
error = check_tp(&qos->txtp);
- if (error) return error;
+ if (error)
+ return error;
return check_tp(&qos->rxtp);
}
int vcc_setsockopt(struct socket *sock, int level, int optname,
- char __user *optval, int optlen)
+ sockptr_t optval, unsigned int optlen)
{
struct atm_vcc *vcc;
unsigned long value;
@@ -697,37 +766,36 @@ int vcc_setsockopt(struct socket *sock, int level, int optname,
vcc = ATM_SD(sock);
switch (optname) {
- case SO_ATMQOS:
- {
- struct atm_qos qos;
-
- if (copy_from_user(&qos,optval,sizeof(qos)))
- return -EFAULT;
- error = check_qos(&qos);
- if (error) return error;
- if (sock->state == SS_CONNECTED)
- return atm_change_qos(vcc,&qos);
- if (sock->state != SS_UNCONNECTED)
- return -EBADFD;
- vcc->qos = qos;
- set_bit(ATM_VF_HASQOS,&vcc->flags);
- return 0;
- }
- case SO_SETCLP:
- if (get_user(value,(unsigned long __user *)optval))
- return -EFAULT;
- if (value) vcc->atm_options |= ATM_ATMOPT_CLP;
- else vcc->atm_options &= ~ATM_ATMOPT_CLP;
- return 0;
- default:
- if (level == SOL_SOCKET) return -EINVAL;
- break;
+ case SO_ATMQOS:
+ {
+ struct atm_qos qos;
+
+ if (copy_from_sockptr(&qos, optval, sizeof(qos)))
+ return -EFAULT;
+ error = check_qos(&qos);
+ if (error)
+ return error;
+ if (sock->state == SS_CONNECTED)
+ return atm_change_qos(vcc, &qos);
+ if (sock->state != SS_UNCONNECTED)
+ return -EBADFD;
+ vcc->qos = qos;
+ set_bit(ATM_VF_HASQOS, &vcc->flags);
+ return 0;
+ }
+ case SO_SETCLP:
+ if (copy_from_sockptr(&value, optval, sizeof(value)))
+ return -EFAULT;
+ if (value)
+ vcc->atm_options |= ATM_ATMOPT_CLP;
+ else
+ vcc->atm_options &= ~ATM_ATMOPT_CLP;
+ return 0;
+ default:
+ return -EINVAL;
}
- if (!vcc->dev || !vcc->dev->ops->setsockopt) return -EINVAL;
- return vcc->dev->ops->setsockopt(vcc,level,optname,optval,optlen);
}
-
int vcc_getsockopt(struct socket *sock, int level, int optname,
char __user *optval, int __user *optlen)
{
@@ -741,57 +809,69 @@ int vcc_getsockopt(struct socket *sock, int level, int optname,
vcc = ATM_SD(sock);
switch (optname) {
- case SO_ATMQOS:
- if (!test_bit(ATM_VF_HASQOS,&vcc->flags))
- return -EINVAL;
- return copy_to_user(optval,&vcc->qos,sizeof(vcc->qos)) ?
- -EFAULT : 0;
- case SO_SETCLP:
- return put_user(vcc->atm_options & ATM_ATMOPT_CLP ? 1 :
- 0,(unsigned long __user *)optval) ? -EFAULT : 0;
- case SO_ATMPVC:
- {
- struct sockaddr_atmpvc pvc;
-
- if (!vcc->dev ||
- !test_bit(ATM_VF_ADDR,&vcc->flags))
- return -ENOTCONN;
- pvc.sap_family = AF_ATMPVC;
- pvc.sap_addr.itf = vcc->dev->number;
- pvc.sap_addr.vpi = vcc->vpi;
- pvc.sap_addr.vci = vcc->vci;
- return copy_to_user(optval,&pvc,sizeof(pvc)) ?
- -EFAULT : 0;
- }
- default:
- if (level == SOL_SOCKET) return -EINVAL;
- break;
+ case SO_ATMQOS:
+ if (!test_bit(ATM_VF_HASQOS, &vcc->flags))
+ return -EINVAL;
+ return copy_to_user(optval, &vcc->qos, sizeof(vcc->qos))
+ ? -EFAULT : 0;
+ case SO_SETCLP:
+ return put_user(vcc->atm_options & ATM_ATMOPT_CLP ? 1 : 0,
+ (unsigned long __user *)optval) ? -EFAULT : 0;
+ case SO_ATMPVC:
+ {
+ struct sockaddr_atmpvc pvc;
+
+ if (!vcc->dev || !test_bit(ATM_VF_ADDR, &vcc->flags))
+ return -ENOTCONN;
+ memset(&pvc, 0, sizeof(pvc));
+ pvc.sap_family = AF_ATMPVC;
+ pvc.sap_addr.itf = vcc->dev->number;
+ pvc.sap_addr.vpi = vcc->vpi;
+ pvc.sap_addr.vci = vcc->vci;
+ return copy_to_user(optval, &pvc, sizeof(pvc)) ? -EFAULT : 0;
+ }
+ default:
+ return -EINVAL;
}
- if (!vcc->dev || !vcc->dev->ops->getsockopt) return -EINVAL;
- return vcc->dev->ops->getsockopt(vcc, level, optname, optval, len);
}
+int register_atmdevice_notifier(struct notifier_block *nb)
+{
+ return atomic_notifier_chain_register(&atm_dev_notify_chain, nb);
+}
+EXPORT_SYMBOL_GPL(register_atmdevice_notifier);
+
+void unregister_atmdevice_notifier(struct notifier_block *nb)
+{
+ atomic_notifier_chain_unregister(&atm_dev_notify_chain, nb);
+}
+EXPORT_SYMBOL_GPL(unregister_atmdevice_notifier);
+
static int __init atm_init(void)
{
int error;
- if ((error = proto_register(&vcc_proto, 0)) < 0)
+ error = proto_register(&vcc_proto, 0);
+ if (error < 0)
goto out;
-
- if ((error = atmpvc_init()) < 0) {
- printk(KERN_ERR "atmpvc_init() failed with %d\n", error);
+ error = atmpvc_init();
+ if (error < 0) {
+ pr_err("atmpvc_init() failed with %d\n", error);
goto out_unregister_vcc_proto;
}
- if ((error = atmsvc_init()) < 0) {
- printk(KERN_ERR "atmsvc_init() failed with %d\n", error);
+ error = atmsvc_init();
+ if (error < 0) {
+ pr_err("atmsvc_init() failed with %d\n", error);
goto out_atmpvc_exit;
}
- if ((error = atm_proc_init()) < 0) {
- printk(KERN_ERR "atm_proc_init() failed with %d\n",error);
+ error = atm_proc_init();
+ if (error < 0) {
+ pr_err("atm_proc_init() failed with %d\n", error);
goto out_atmsvc_exit;
}
- if ((error = atm_sysfs_init()) < 0) {
- printk(KERN_ERR "atm_sysfs_init() failed with %d\n",error);
+ error = atm_sysfs_init();
+ if (error < 0) {
+ pr_err("atm_sysfs_init() failed with %d\n", error);
goto out_atmproc_exit;
}
out:
@@ -801,7 +881,7 @@ out_atmproc_exit:
out_atmsvc_exit:
atmsvc_exit();
out_atmpvc_exit:
- atmsvc_exit();
+ atmpvc_exit();
out_unregister_vcc_proto:
proto_unregister(&vcc_proto);
goto out;
@@ -816,9 +896,11 @@ static void __exit atm_exit(void)
proto_unregister(&vcc_proto);
}
-module_init(atm_init);
+subsys_initcall(atm_init);
+
module_exit(atm_exit);
+MODULE_DESCRIPTION("Asynchronous Transfer Mode (ATM) networking core");
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_ATMPVC);
MODULE_ALIAS_NETPROTO(PF_ATMSVC);
diff --git a/net/atm/common.h b/net/atm/common.h
index a422da7788fb..a1e56e8de698 100644
--- a/net/atm/common.h
+++ b/net/atm/common.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/* net/atm/common.h - ATM sockets (common part for PVC and SVC) */
-
+
/* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */
@@ -10,19 +11,20 @@
#include <linux/poll.h> /* for poll_table */
-int vcc_create(struct socket *sock, int protocol, int family);
+int vcc_create(struct net *net, struct socket *sock, int protocol, int family, int kern);
int vcc_release(struct socket *sock);
int vcc_connect(struct socket *sock, int itf, short vpi, int vci);
-int vcc_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
- size_t size, int flags);
-int vcc_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
- size_t total_len);
-unsigned int vcc_poll(struct file *file, struct socket *sock, poll_table *wait);
+int vcc_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
+ int flags);
+int vcc_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len);
+__poll_t vcc_poll(struct file *file, struct socket *sock, poll_table *wait);
int vcc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
+int vcc_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
int vcc_setsockopt(struct socket *sock, int level, int optname,
- char __user *optval, int optlen);
+ sockptr_t optval, unsigned int optlen);
int vcc_getsockopt(struct socket *sock, int level, int optname,
char __user *optval, int __user *optlen);
+void vcc_process_recv_queue(struct atm_vcc *vcc);
int atmpvc_init(void);
void atmpvc_exit(void);
diff --git a/net/atm/ioctl.c b/net/atm/ioctl.c
index 8c2022c3e81d..0f7a39aeccc8 100644
--- a/net/atm/ioctl.c
+++ b/net/atm/ioctl.c
@@ -1,8 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0
/* ATM ioctl handling */
/* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */
/* 2003 John Levon <levon@movementarian.org> */
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__
#include <linux/module.h>
#include <linux/kmod.h>
@@ -19,6 +21,7 @@
#include <linux/atmlec.h>
#include <linux/mutex.h>
#include <asm/ioctls.h>
+#include <net/compat.h>
#include "resources.h"
#include "signaling.h" /* for WAITING and sigd_attach */
@@ -35,6 +38,7 @@ void register_atm_ioctl(struct atm_ioctl *ioctl)
list_add_tail(&ioctl->list, &ioctl_list);
mutex_unlock(&ioctl_mutex);
}
+EXPORT_SYMBOL(register_atm_ioctl);
void deregister_atm_ioctl(struct atm_ioctl *ioctl)
{
@@ -42,104 +46,115 @@ void deregister_atm_ioctl(struct atm_ioctl *ioctl)
list_del(&ioctl->list);
mutex_unlock(&ioctl_mutex);
}
-
-EXPORT_SYMBOL(register_atm_ioctl);
EXPORT_SYMBOL(deregister_atm_ioctl);
-int vcc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+static int do_vcc_ioctl(struct socket *sock, unsigned int cmd,
+ unsigned long arg, int compat)
{
struct sock *sk = sock->sk;
struct atm_vcc *vcc;
int error;
- struct list_head * pos;
+ struct list_head *pos;
void __user *argp = (void __user *)arg;
+ void __user *buf;
+ int __user *len;
vcc = ATM_SD(sock);
switch (cmd) {
- case SIOCOUTQ:
- if (sock->state != SS_CONNECTED ||
- !test_bit(ATM_VF_READY, &vcc->flags)) {
- error = -EINVAL;
- goto done;
- }
- error = put_user(sk->sk_sndbuf -
- atomic_read(&sk->sk_wmem_alloc),
- (int __user *) argp) ? -EFAULT : 0;
+ case SIOCOUTQ:
+ if (sock->state != SS_CONNECTED ||
+ !test_bit(ATM_VF_READY, &vcc->flags)) {
+ error = -EINVAL;
+ goto done;
+ }
+ error = put_user(sk->sk_sndbuf - sk_wmem_alloc_get(sk),
+ (int __user *)argp);
+ goto done;
+ case SIOCINQ:
+ {
+ struct sk_buff *skb;
+ int amount;
+
+ if (sock->state != SS_CONNECTED) {
+ error = -EINVAL;
goto done;
- case SIOCINQ:
- {
- struct sk_buff *skb;
-
- if (sock->state != SS_CONNECTED) {
- error = -EINVAL;
- goto done;
- }
- skb = skb_peek(&sk->sk_receive_queue);
- error = put_user(skb ? skb->len : 0,
- (int __user *)argp) ? -EFAULT : 0;
- goto done;
- }
- case SIOCGSTAMP: /* borrowed from IP */
- error = sock_get_timestamp(sk, argp);
+ }
+ spin_lock_irq(&sk->sk_receive_queue.lock);
+ skb = skb_peek(&sk->sk_receive_queue);
+ amount = skb ? skb->len : 0;
+ spin_unlock_irq(&sk->sk_receive_queue.lock);
+ error = put_user(amount, (int __user *)argp);
+ goto done;
+ }
+ case ATM_SETSC:
+ net_warn_ratelimited("ATM_SETSC is obsolete; used by %s:%d\n",
+ current->comm, task_pid_nr(current));
+ error = 0;
+ goto done;
+ case ATMSIGD_CTRL:
+ if (!capable(CAP_NET_ADMIN)) {
+ error = -EPERM;
goto done;
- case ATM_SETSC:
- printk(KERN_WARNING "ATM_SETSC is obsolete\n");
- error = 0;
+ }
+ /*
+ * The user/kernel protocol for exchanging signalling
+ * info uses kernel pointers as opaque references,
+ * so the holder of the file descriptor can scribble
+ * on the kernel... so we should make sure that we
+ * have the same privileges that /proc/kcore needs
+ */
+ if (!capable(CAP_SYS_RAWIO)) {
+ error = -EPERM;
goto done;
- case ATMSIGD_CTRL:
- if (!capable(CAP_NET_ADMIN)) {
- error = -EPERM;
- goto done;
- }
- /*
- * The user/kernel protocol for exchanging signalling
- * info uses kernel pointers as opaque references,
- * so the holder of the file descriptor can scribble
- * on the kernel... so we should make sure that we
- * have the same privledges that /proc/kcore needs
- */
- if (!capable(CAP_SYS_RAWIO)) {
- error = -EPERM;
- goto done;
- }
- error = sigd_attach(vcc);
- if (!error)
- sock->state = SS_CONNECTED;
+ }
+#ifdef CONFIG_COMPAT
+ /* WTF? I don't even want to _think_ about making this
+ work for 32-bit userspace. TBH I don't really want
+ to think about it at all. dwmw2. */
+ if (compat) {
+ net_warn_ratelimited("32-bit task cannot be atmsigd\n");
+ error = -EINVAL;
goto done;
- case ATM_SETBACKEND:
- case ATM_NEWBACKENDIF:
- {
- atm_backend_t backend;
- error = get_user(backend, (atm_backend_t __user *) argp);
- if (error)
- goto done;
- switch (backend) {
- case ATM_BACKEND_PPP:
- request_module("pppoatm");
- break;
- case ATM_BACKEND_BR2684:
- request_module("br2684");
- break;
- }
- }
- break;
- case ATMMPC_CTRL:
- case ATMMPC_DATA:
- request_module("mpoa");
- break;
- case ATMARPD_CTRL:
- request_module("clip");
+ }
+#endif
+ error = sigd_attach(vcc);
+ if (!error)
+ sock->state = SS_CONNECTED;
+ goto done;
+ case ATM_SETBACKEND:
+ case ATM_NEWBACKENDIF:
+ {
+ atm_backend_t backend;
+ error = get_user(backend, (atm_backend_t __user *)argp);
+ if (error)
+ goto done;
+ switch (backend) {
+ case ATM_BACKEND_PPP:
+ request_module("pppoatm");
break;
- case ATMLEC_CTRL:
- request_module("lec");
+ case ATM_BACKEND_BR2684:
+ request_module("br2684");
break;
+ }
+ break;
+ }
+ case ATMMPC_CTRL:
+ case ATMMPC_DATA:
+ request_module("mpoa");
+ break;
+ case ATMARPD_CTRL:
+ request_module("clip");
+ break;
+ case ATMLEC_CTRL:
+ request_module("lec");
+ break;
}
error = -ENOIOCTLCMD;
mutex_lock(&ioctl_mutex);
list_for_each(pos, &ioctl_list) {
- struct atm_ioctl * ic = list_entry(pos, struct atm_ioctl, list);
+ struct atm_ioctl *ic = list_entry(pos, struct atm_ioctl, list);
if (try_module_get(ic->owner)) {
error = ic->ioctl(sock, cmd, arg);
module_put(ic->owner);
@@ -152,8 +167,199 @@ int vcc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
if (error != -ENOIOCTLCMD)
goto done;
- error = atm_dev_ioctl(cmd, argp);
+ if (cmd == ATM_GETNAMES) {
+ if (IS_ENABLED(CONFIG_COMPAT) && compat) {
+#ifdef CONFIG_COMPAT
+ struct compat_atm_iobuf __user *ciobuf = argp;
+ compat_uptr_t cbuf;
+ len = &ciobuf->length;
+ if (get_user(cbuf, &ciobuf->buffer))
+ return -EFAULT;
+ buf = compat_ptr(cbuf);
+#endif
+ } else {
+ struct atm_iobuf __user *iobuf = argp;
+ len = &iobuf->length;
+ if (get_user(buf, &iobuf->buffer))
+ return -EFAULT;
+ }
+ error = atm_getnames(buf, len);
+ } else {
+ int number;
+
+ if (IS_ENABLED(CONFIG_COMPAT) && compat) {
+#ifdef CONFIG_COMPAT
+ struct compat_atmif_sioc __user *csioc = argp;
+ compat_uptr_t carg;
+
+ len = &csioc->length;
+ if (get_user(carg, &csioc->arg))
+ return -EFAULT;
+ buf = compat_ptr(carg);
+ if (get_user(number, &csioc->number))
+ return -EFAULT;
+#endif
+ } else {
+ struct atmif_sioc __user *sioc = argp;
+
+ len = &sioc->length;
+ if (get_user(buf, &sioc->arg))
+ return -EFAULT;
+ if (get_user(number, &sioc->number))
+ return -EFAULT;
+ }
+ error = atm_dev_ioctl(cmd, buf, len, number, compat);
+ }
done:
return error;
}
+
+int vcc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+ return do_vcc_ioctl(sock, cmd, arg, 0);
+}
+
+#ifdef CONFIG_COMPAT
+/*
+ * FIXME:
+ * The compat_ioctl handling is duplicated, using both these conversion
+ * routines and the compat argument to the actual handlers. Both
+ * versions are somewhat incomplete and should be merged, e.g. by
+ * moving the ioctl number translation into the actual handlers and
+ * killing the conversion code.
+ *
+ * -arnd, November 2009
+ */
+#define ATM_GETLINKRATE32 _IOW('a', ATMIOC_ITF+1, struct compat_atmif_sioc)
+#define ATM_GETNAMES32 _IOW('a', ATMIOC_ITF+3, struct compat_atm_iobuf)
+#define ATM_GETTYPE32 _IOW('a', ATMIOC_ITF+4, struct compat_atmif_sioc)
+#define ATM_GETESI32 _IOW('a', ATMIOC_ITF+5, struct compat_atmif_sioc)
+#define ATM_GETADDR32 _IOW('a', ATMIOC_ITF+6, struct compat_atmif_sioc)
+#define ATM_RSTADDR32 _IOW('a', ATMIOC_ITF+7, struct compat_atmif_sioc)
+#define ATM_ADDADDR32 _IOW('a', ATMIOC_ITF+8, struct compat_atmif_sioc)
+#define ATM_DELADDR32 _IOW('a', ATMIOC_ITF+9, struct compat_atmif_sioc)
+#define ATM_GETCIRANGE32 _IOW('a', ATMIOC_ITF+10, struct compat_atmif_sioc)
+#define ATM_SETCIRANGE32 _IOW('a', ATMIOC_ITF+11, struct compat_atmif_sioc)
+#define ATM_SETESI32 _IOW('a', ATMIOC_ITF+12, struct compat_atmif_sioc)
+#define ATM_SETESIF32 _IOW('a', ATMIOC_ITF+13, struct compat_atmif_sioc)
+#define ATM_GETSTAT32 _IOW('a', ATMIOC_SARCOM+0, struct compat_atmif_sioc)
+#define ATM_GETSTATZ32 _IOW('a', ATMIOC_SARCOM+1, struct compat_atmif_sioc)
+#define ATM_GETLOOP32 _IOW('a', ATMIOC_SARCOM+2, struct compat_atmif_sioc)
+#define ATM_SETLOOP32 _IOW('a', ATMIOC_SARCOM+3, struct compat_atmif_sioc)
+#define ATM_QUERYLOOP32 _IOW('a', ATMIOC_SARCOM+4, struct compat_atmif_sioc)
+
+static struct {
+ unsigned int cmd32;
+ unsigned int cmd;
+} atm_ioctl_map[] = {
+ { ATM_GETLINKRATE32, ATM_GETLINKRATE },
+ { ATM_GETNAMES32, ATM_GETNAMES },
+ { ATM_GETTYPE32, ATM_GETTYPE },
+ { ATM_GETESI32, ATM_GETESI },
+ { ATM_GETADDR32, ATM_GETADDR },
+ { ATM_RSTADDR32, ATM_RSTADDR },
+ { ATM_ADDADDR32, ATM_ADDADDR },
+ { ATM_DELADDR32, ATM_DELADDR },
+ { ATM_GETCIRANGE32, ATM_GETCIRANGE },
+ { ATM_SETCIRANGE32, ATM_SETCIRANGE },
+ { ATM_SETESI32, ATM_SETESI },
+ { ATM_SETESIF32, ATM_SETESIF },
+ { ATM_GETSTAT32, ATM_GETSTAT },
+ { ATM_GETSTATZ32, ATM_GETSTATZ },
+ { ATM_GETLOOP32, ATM_GETLOOP },
+ { ATM_SETLOOP32, ATM_SETLOOP },
+ { ATM_QUERYLOOP32, ATM_QUERYLOOP },
+};
+
+#define NR_ATM_IOCTL ARRAY_SIZE(atm_ioctl_map)
+
+static int do_atm_iobuf(struct socket *sock, unsigned int cmd,
+ unsigned long arg)
+{
+ struct compat_atm_iobuf __user *iobuf32 = compat_ptr(arg);
+ u32 data;
+
+ if (get_user(data, &iobuf32->buffer))
+ return -EFAULT;
+
+ return atm_getnames(&iobuf32->length, compat_ptr(data));
+}
+
+static int do_atmif_sioc(struct socket *sock, unsigned int cmd,
+ unsigned long arg)
+{
+ struct compat_atmif_sioc __user *sioc32 = compat_ptr(arg);
+ int number;
+ u32 data;
+
+ if (get_user(data, &sioc32->arg) || get_user(number, &sioc32->number))
+ return -EFAULT;
+ return atm_dev_ioctl(cmd, compat_ptr(data), &sioc32->length, number, 0);
+}
+
+static int do_atm_ioctl(struct socket *sock, unsigned int cmd32,
+ unsigned long arg)
+{
+ int i;
+ unsigned int cmd = 0;
+
+ switch (cmd32) {
+ case SONET_GETSTAT:
+ case SONET_GETSTATZ:
+ case SONET_GETDIAG:
+ case SONET_SETDIAG:
+ case SONET_CLRDIAG:
+ case SONET_SETFRAMING:
+ case SONET_GETFRAMING:
+ case SONET_GETFRSENSE:
+ return do_atmif_sioc(sock, cmd32, arg);
+ }
+
+ for (i = 0; i < NR_ATM_IOCTL; i++) {
+ if (cmd32 == atm_ioctl_map[i].cmd32) {
+ cmd = atm_ioctl_map[i].cmd;
+ break;
+ }
+ }
+ if (i == NR_ATM_IOCTL)
+ return -EINVAL;
+
+ switch (cmd) {
+ case ATM_GETNAMES:
+ return do_atm_iobuf(sock, cmd, arg);
+
+ case ATM_GETLINKRATE:
+ case ATM_GETTYPE:
+ case ATM_GETESI:
+ case ATM_GETADDR:
+ case ATM_RSTADDR:
+ case ATM_ADDADDR:
+ case ATM_DELADDR:
+ case ATM_GETCIRANGE:
+ case ATM_SETCIRANGE:
+ case ATM_SETESI:
+ case ATM_SETESIF:
+ case ATM_GETSTAT:
+ case ATM_GETSTATZ:
+ case ATM_GETLOOP:
+ case ATM_SETLOOP:
+ case ATM_QUERYLOOP:
+ return do_atmif_sioc(sock, cmd, arg);
+ }
+
+ return -EINVAL;
+}
+
+int vcc_compat_ioctl(struct socket *sock, unsigned int cmd,
+ unsigned long arg)
+{
+ int ret;
+
+ ret = do_vcc_ioctl(sock, cmd, arg, 1);
+ if (ret != -ENOIOCTLCMD)
+ return ret;
+
+ return do_atm_ioctl(sock, cmd, arg);
+}
+#endif
diff --git a/net/atm/ipcommon.c b/net/atm/ipcommon.c
deleted file mode 100644
index 1d3de42fada0..000000000000
--- a/net/atm/ipcommon.c
+++ /dev/null
@@ -1,63 +0,0 @@
-/* net/atm/ipcommon.c - Common items for all ways of doing IP over ATM */
-
-/* Written 1996-2000 by Werner Almesberger, EPFL LRC/ICA */
-
-
-#include <linux/module.h>
-#include <linux/string.h>
-#include <linux/skbuff.h>
-#include <linux/netdevice.h>
-#include <linux/in.h>
-#include <linux/atmdev.h>
-#include <linux/atmclip.h>
-
-#include "common.h"
-#include "ipcommon.h"
-
-
-#if 0
-#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args)
-#else
-#define DPRINTK(format,args...)
-#endif
-
-
-/*
- * skb_migrate appends the list at "from" to "to", emptying "from" in the
- * process. skb_migrate is atomic with respect to all other skb operations on
- * "from" and "to". Note that it locks both lists at the same time, so to deal
- * with the lock ordering, the locks are taken in address order.
- *
- * This function should live in skbuff.c or skbuff.h.
- */
-
-
-void skb_migrate(struct sk_buff_head *from, struct sk_buff_head *to)
-{
- unsigned long flags;
- struct sk_buff *skb_from = (struct sk_buff *) from;
- struct sk_buff *skb_to = (struct sk_buff *) to;
- struct sk_buff *prev;
-
- if ((unsigned long) from < (unsigned long) to) {
- spin_lock_irqsave(&from->lock, flags);
- spin_lock_nested(&to->lock, SINGLE_DEPTH_NESTING);
- } else {
- spin_lock_irqsave(&to->lock, flags);
- spin_lock_nested(&from->lock, SINGLE_DEPTH_NESTING);
- }
- prev = from->prev;
- from->next->prev = to->prev;
- prev->next = skb_to;
- to->prev->next = from->next;
- to->prev = from->prev;
- to->qlen += from->qlen;
- spin_unlock(&to->lock);
- from->prev = skb_from;
- from->next = skb_from;
- from->qlen = 0;
- spin_unlock_irqrestore(&from->lock, flags);
-}
-
-
-EXPORT_SYMBOL(skb_migrate);
diff --git a/net/atm/ipcommon.h b/net/atm/ipcommon.h
deleted file mode 100644
index d72165f60939..000000000000
--- a/net/atm/ipcommon.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/* net/atm/ipcommon.h - Common items for all ways of doing IP over ATM */
-
-/* Written 1996-2000 by Werner Almesberger, EPFL LRC/ICA */
-
-
-#ifndef NET_ATM_IPCOMMON_H
-#define NET_ATM_IPCOMMON_H
-
-
-#include <linux/string.h>
-#include <linux/skbuff.h>
-#include <linux/netdevice.h>
-#include <linux/atmdev.h>
-
-/*
- * Appends all skbs from "from" to "to". The operation is atomic with respect
- * to all other skb operations on "from" or "to".
- */
-
-void skb_migrate(struct sk_buff_head *from,struct sk_buff_head *to);
-
-#endif
diff --git a/net/atm/lec.c b/net/atm/lec.c
index 5946ec63724f..afb8d3eb2185 100644
--- a/net/atm/lec.c
+++ b/net/atm/lec.c
@@ -1,9 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
- * lec.c: Lan Emulation driver
+ * lec.c: Lan Emulation driver
*
* Marko Kiiskila <mkiiskila@yahoo.com>
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__
+
+#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/bitops.h>
#include <linux/capability.h>
@@ -16,26 +20,19 @@
#include <linux/skbuff.h>
#include <linux/ip.h>
#include <asm/byteorder.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <net/arp.h>
#include <net/dst.h>
#include <linux/proc_fs.h>
#include <linux/spinlock.h>
-#include <linux/proc_fs.h>
#include <linux/seq_file.h>
-/* TokenRing if needed */
-#ifdef CONFIG_TR
-#include <linux/trdevice.h>
-#endif
-
/* And atm device */
#include <linux/atmdev.h>
#include <linux/atmlec.h>
/* Proxy LEC knows about bridging */
-#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
-#include <linux/if_bridge.h>
+#if IS_ENABLED(CONFIG_BRIDGE)
#include "../bridge/br_private.h"
static unsigned char bridge_ula_lec[] = { 0x01, 0x80, 0xc2, 0x00, 0x00 };
@@ -45,16 +42,13 @@ static unsigned char bridge_ula_lec[] = { 0x01, 0x80, 0xc2, 0x00, 0x00 };
#include <linux/module.h>
#include <linux/init.h>
+/* Hardening for Spectre-v1 */
+#include <linux/nospec.h>
+
#include "lec.h"
#include "lec_arpc.h"
#include "resources.h"
-#if 0
-#define DPRINTK printk
-#else
-#define DPRINTK(format,args...)
-#endif
-
#define DUMP_PACKETS 0 /*
* 0 = None,
* 1 = 30 first bytes
@@ -67,74 +61,74 @@ static unsigned char bridge_ula_lec[] = { 0x01, 0x80, 0xc2, 0x00, 0x00 };
*/
static int lec_open(struct net_device *dev);
-static int lec_start_xmit(struct sk_buff *skb, struct net_device *dev);
+static netdev_tx_t lec_start_xmit(struct sk_buff *skb,
+ struct net_device *dev);
static int lec_close(struct net_device *dev);
-static struct net_device_stats *lec_get_stats(struct net_device *dev);
-static void lec_init(struct net_device *dev);
static struct lec_arp_table *lec_arp_find(struct lec_priv *priv,
- unsigned char *mac_addr);
+ const unsigned char *mac_addr);
static int lec_arp_remove(struct lec_priv *priv,
struct lec_arp_table *to_remove);
/* LANE2 functions */
-static void lane2_associate_ind(struct net_device *dev, u8 *mac_address,
- u8 *tlvs, u32 sizeoftlvs);
-static int lane2_resolve(struct net_device *dev, u8 *dst_mac, int force,
+static void lane2_associate_ind(struct net_device *dev, const u8 *mac_address,
+ const u8 *tlvs, u32 sizeoftlvs);
+static int lane2_resolve(struct net_device *dev, const u8 *dst_mac, int force,
u8 **tlvs, u32 *sizeoftlvs);
-static int lane2_associate_req(struct net_device *dev, u8 *lan_dst,
- u8 *tlvs, u32 sizeoftlvs);
+static int lane2_associate_req(struct net_device *dev, const u8 *lan_dst,
+ const u8 *tlvs, u32 sizeoftlvs);
-static int lec_addr_delete(struct lec_priv *priv, unsigned char *atm_addr,
+static int lec_addr_delete(struct lec_priv *priv, const unsigned char *atm_addr,
unsigned long permanent);
static void lec_arp_check_empties(struct lec_priv *priv,
struct atm_vcc *vcc, struct sk_buff *skb);
static void lec_arp_destroy(struct lec_priv *priv);
static void lec_arp_init(struct lec_priv *priv);
static struct atm_vcc *lec_arp_resolve(struct lec_priv *priv,
- unsigned char *mac_to_find,
+ const unsigned char *mac_to_find,
int is_rdesc,
struct lec_arp_table **ret_entry);
-static void lec_arp_update(struct lec_priv *priv, unsigned char *mac_addr,
- unsigned char *atm_addr, unsigned long remoteflag,
+static void lec_arp_update(struct lec_priv *priv, const unsigned char *mac_addr,
+ const unsigned char *atm_addr,
+ unsigned long remoteflag,
unsigned int targetless_le_arp);
static void lec_flush_complete(struct lec_priv *priv, unsigned long tran_id);
static int lec_mcast_make(struct lec_priv *priv, struct atm_vcc *vcc);
static void lec_set_flush_tran_id(struct lec_priv *priv,
- unsigned char *atm_addr,
+ const unsigned char *atm_addr,
unsigned long tran_id);
-static void lec_vcc_added(struct lec_priv *priv, struct atmlec_ioc *ioc_data,
+static void lec_vcc_added(struct lec_priv *priv,
+ const struct atmlec_ioc *ioc_data,
struct atm_vcc *vcc,
- void (*old_push) (struct atm_vcc *vcc,
- struct sk_buff *skb));
+ void (*old_push)(struct atm_vcc *vcc,
+ struct sk_buff *skb));
static void lec_vcc_close(struct lec_priv *priv, struct atm_vcc *vcc);
/* must be done under lec_arp_lock */
static inline void lec_arp_hold(struct lec_arp_table *entry)
{
- atomic_inc(&entry->usage);
+ refcount_inc(&entry->usage);
}
static inline void lec_arp_put(struct lec_arp_table *entry)
{
- if (atomic_dec_and_test(&entry->usage))
+ if (refcount_dec_and_test(&entry->usage))
kfree(entry);
}
-
static struct lane2_ops lane2_ops = {
- lane2_resolve, /* resolve, spec 3.1.3 */
- lane2_associate_req, /* associate_req, spec 3.1.4 */
- NULL /* associate indicator, spec 3.1.5 */
+ .resolve = lane2_resolve, /* spec 3.1.3 */
+ .associate_req = lane2_associate_req, /* spec 3.1.4 */
+ .associate_indicator = NULL /* spec 3.1.5 */
};
static unsigned char bus_mac[ETH_ALEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
/* Device structures */
static struct net_device *dev_lec[MAX_LEC_ITF];
+static DEFINE_MUTEX(lec_mutex);
-#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
+#if IS_ENABLED(CONFIG_BRIDGE)
static void lec_handle_bridge(struct sk_buff *skb, struct net_device *dev)
{
- struct ethhdr *eth;
char *buff;
struct lec_priv *priv;
@@ -143,7 +137,6 @@ static void lec_handle_bridge(struct sk_buff *skb, struct net_device *dev)
* LE_TOPOLOGY_REQUEST with the same value of Topology Change bit
* as the Config BPDU has
*/
- eth = (struct ethhdr *)skb->data;
buff = skb->data + skb->dev->hard_header_len;
if (*buff++ == 0x42 && *buff++ == 0x42 && *buff++ == 0x03) {
struct sock *sk;
@@ -157,62 +150,17 @@ static void lec_handle_bridge(struct sk_buff *skb, struct net_device *dev)
mesg = (struct atmlec_msg *)skb2->data;
mesg->type = l_topology_change;
buff += 4;
- mesg->content.normal.flag = *buff & 0x01; /* 0x01 is topology change */
+ mesg->content.normal.flag = *buff & 0x01;
+ /* 0x01 is topology change */
- priv = (struct lec_priv *)dev->priv;
+ priv = netdev_priv(dev);
atm_force_charge(priv->lecd, skb2->truesize);
sk = sk_atm(priv->lecd);
skb_queue_tail(&sk->sk_receive_queue, skb2);
- sk->sk_data_ready(sk, skb2->len);
- }
-
- return;
-}
-#endif /* defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) */
-
-/*
- * Modelled after tr_type_trans
- * All multicast and ARE or STE frames go to BUS.
- * Non source routed frames go by destination address.
- * Last hop source routed frames go by destination address.
- * Not last hop source routed frames go by _next_ route descriptor.
- * Returns pointer to destination MAC address or fills in rdesc
- * and returns NULL.
- */
-#ifdef CONFIG_TR
-static unsigned char *get_tr_dst(unsigned char *packet, unsigned char *rdesc)
-{
- struct trh_hdr *trh;
- int riflen, num_rdsc;
-
- trh = (struct trh_hdr *)packet;
- if (trh->daddr[0] & (uint8_t) 0x80)
- return bus_mac; /* multicast */
-
- if (trh->saddr[0] & TR_RII) {
- riflen = (ntohs(trh->rcf) & TR_RCF_LEN_MASK) >> 8;
- if ((ntohs(trh->rcf) >> 13) != 0)
- return bus_mac; /* ARE or STE */
- } else
- return trh->daddr; /* not source routed */
-
- if (riflen < 6)
- return trh->daddr; /* last hop, source routed */
-
- /* riflen is 6 or more, packet has more than one route descriptor */
- num_rdsc = (riflen / 2) - 1;
- memset(rdesc, 0, ETH_ALEN);
- /* offset 4 comes from LAN destination field in LE control frames */
- if (trh->rcf & htons((uint16_t) TR_RCF_DIR_BIT))
- memcpy(&rdesc[4], &trh->rseg[num_rdsc - 2], sizeof(__be16));
- else {
- memcpy(&rdesc[4], &trh->rseg[1], sizeof(__be16));
- rdesc[5] = ((ntohs(trh->rseg[0]) & 0x000f) | (rdesc[5] & 0xf0));
+ sk->sk_data_ready(sk);
}
-
- return NULL;
}
-#endif /* CONFIG_TR */
+#endif /* IS_ENABLED(CONFIG_BRIDGE) */
/*
* Open/initialize the netdevice. This is called (in the current kernel)
@@ -225,126 +173,96 @@ static unsigned char *get_tr_dst(unsigned char *packet, unsigned char *rdesc)
static int lec_open(struct net_device *dev)
{
- struct lec_priv *priv = (struct lec_priv *)dev->priv;
-
netif_start_queue(dev);
- memset(&priv->stats, 0, sizeof(struct net_device_stats));
return 0;
}
-static __inline__ void
-lec_send(struct atm_vcc *vcc, struct sk_buff *skb, struct lec_priv *priv)
+static void
+lec_send(struct atm_vcc *vcc, struct sk_buff *skb)
{
+ struct net_device *dev = skb->dev;
+ unsigned int len = skb->len;
+
ATM_SKB(skb)->vcc = vcc;
- ATM_SKB(skb)->atm_options = vcc->atm_options;
+ atm_account_tx(vcc, skb);
- atomic_add(skb->truesize, &sk_atm(vcc)->sk_wmem_alloc);
if (vcc->send(vcc, skb) < 0) {
- priv->stats.tx_dropped++;
+ dev->stats.tx_dropped++;
return;
}
- priv->stats.tx_packets++;
- priv->stats.tx_bytes += skb->len;
+ dev->stats.tx_packets++;
+ dev->stats.tx_bytes += len;
}
-static void lec_tx_timeout(struct net_device *dev)
+static void lec_tx_timeout(struct net_device *dev, unsigned int txqueue)
{
- printk(KERN_INFO "%s: tx timeout\n", dev->name);
- dev->trans_start = jiffies;
+ pr_info("%s\n", dev->name);
+ netif_trans_update(dev);
netif_wake_queue(dev);
}
-static int lec_start_xmit(struct sk_buff *skb, struct net_device *dev)
+static netdev_tx_t lec_start_xmit(struct sk_buff *skb,
+ struct net_device *dev)
{
struct sk_buff *skb2;
- struct lec_priv *priv = (struct lec_priv *)dev->priv;
+ struct lec_priv *priv = netdev_priv(dev);
struct lecdatahdr_8023 *lec_h;
struct atm_vcc *vcc;
struct lec_arp_table *entry;
unsigned char *dst;
int min_frame_size;
-#ifdef CONFIG_TR
- unsigned char rdesc[ETH_ALEN]; /* Token Ring route descriptor */
-#endif
int is_rdesc;
-#if DUMP_PACKETS > 0
- char buf[300];
- int i = 0;
-#endif /* DUMP_PACKETS >0 */
- DPRINTK("lec_start_xmit called\n");
+ pr_debug("called\n");
if (!priv->lecd) {
- printk("%s:No lecd attached\n", dev->name);
- priv->stats.tx_errors++;
+ pr_info("%s:No lecd attached\n", dev->name);
+ dev->stats.tx_errors++;
netif_stop_queue(dev);
- return -EUNATCH;
+ kfree_skb(skb);
+ return NETDEV_TX_OK;
}
- DPRINTK("skbuff head:%lx data:%lx tail:%lx end:%lx\n",
- (long)skb->head, (long)skb->data, (long)skb->tail,
- (long)skb->end);
-#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
+ pr_debug("skbuff head:%lx data:%lx tail:%lx end:%lx\n",
+ (long)skb->head, (long)skb->data, (long)skb_tail_pointer(skb),
+ (long)skb_end_pointer(skb));
+#if IS_ENABLED(CONFIG_BRIDGE)
if (memcmp(skb->data, bridge_ula_lec, sizeof(bridge_ula_lec)) == 0)
lec_handle_bridge(skb, dev);
#endif
/* Make sure we have room for lec_id */
if (skb_headroom(skb) < 2) {
-
- DPRINTK("lec_start_xmit: reallocating skb\n");
+ pr_debug("reallocating skb\n");
skb2 = skb_realloc_headroom(skb, LEC_HEADER_LEN);
- kfree_skb(skb);
- if (skb2 == NULL)
- return 0;
+ if (unlikely(!skb2)) {
+ kfree_skb(skb);
+ return NETDEV_TX_OK;
+ }
+ consume_skb(skb);
skb = skb2;
}
skb_push(skb, 2);
- /* Put le header to place, works for TokenRing too */
+ /* Put le header to place */
lec_h = (struct lecdatahdr_8023 *)skb->data;
lec_h->le_header = htons(priv->lecid);
-#ifdef CONFIG_TR
- /*
- * Ugly. Use this to realign Token Ring packets for
- * e.g. PCA-200E driver.
- */
- if (priv->is_trdev) {
- skb2 = skb_realloc_headroom(skb, LEC_HEADER_LEN);
- kfree_skb(skb);
- if (skb2 == NULL)
- return 0;
- skb = skb2;
- }
-#endif
-
-#if DUMP_PACKETS > 0
- printk("%s: send datalen:%ld lecid:%4.4x\n", dev->name,
- skb->len, priv->lecid);
#if DUMP_PACKETS >= 2
- for (i = 0; i < skb->len && i < 99; i++) {
- sprintf(buf + i * 3, "%2.2x ", 0xff & skb->data[i]);
- }
+#define MAX_DUMP_SKB 99
#elif DUMP_PACKETS >= 1
- for (i = 0; i < skb->len && i < 30; i++) {
- sprintf(buf + i * 3, "%2.2x ", 0xff & skb->data[i]);
- }
+#define MAX_DUMP_SKB 30
+#endif
+#if DUMP_PACKETS >= 1
+ printk(KERN_DEBUG "%s: send datalen:%ld lecid:%4.4x\n",
+ dev->name, skb->len, priv->lecid);
+ print_hex_dump(KERN_DEBUG, "", DUMP_OFFSET, 16, 1,
+ skb->data, min(skb->len, MAX_DUMP_SKB), true);
#endif /* DUMP_PACKETS >= 1 */
- if (i == skb->len)
- printk("%s\n", buf);
- else
- printk("%s...\n", buf);
-#endif /* DUMP_PACKETS > 0 */
/* Minimum ethernet-frame size */
-#ifdef CONFIG_TR
- if (priv->is_trdev)
- min_frame_size = LEC_MINIMUM_8025_SIZE;
- else
-#endif
- min_frame_size = LEC_MINIMUM_8023_SIZE;
+ min_frame_size = LEC_MINIMUM_8023_SIZE;
if (skb->len < min_frame_size) {
if ((skb->len + skb_tailroom(skb)) < min_frame_size) {
skb2 = skb_copy_expand(skb, 0,
@@ -352,8 +270,8 @@ static int lec_start_xmit(struct sk_buff *skb, struct net_device *dev)
GFP_ATOMIC);
dev_kfree_skb(skb);
if (skb2 == NULL) {
- priv->stats.tx_dropped++;
- return 0;
+ dev->stats.tx_dropped++;
+ return NETDEV_TX_OK;
}
skb = skb2;
}
@@ -363,54 +281,34 @@ static int lec_start_xmit(struct sk_buff *skb, struct net_device *dev)
/* Send to right vcc */
is_rdesc = 0;
dst = lec_h->h_dest;
-#ifdef CONFIG_TR
- if (priv->is_trdev) {
- dst = get_tr_dst(skb->data + 2, rdesc);
- if (dst == NULL) {
- dst = rdesc;
- is_rdesc = 1;
- }
- }
-#endif
entry = NULL;
vcc = lec_arp_resolve(priv, dst, is_rdesc, &entry);
- DPRINTK("%s:vcc:%p vcc_flags:%x, entry:%p\n", dev->name,
- vcc, vcc ? vcc->flags : 0, entry);
+ pr_debug("%s:vcc:%p vcc_flags:%lx, entry:%p\n",
+ dev->name, vcc, vcc ? vcc->flags : 0, entry);
if (!vcc || !test_bit(ATM_VF_READY, &vcc->flags)) {
if (entry && (entry->tx_wait.qlen < LEC_UNRES_QUE_LEN)) {
- DPRINTK("%s:lec_start_xmit: queuing packet, ",
- dev->name);
- DPRINTK("MAC address 0x%02x:%02x:%02x:%02x:%02x:%02x\n",
- lec_h->h_dest[0], lec_h->h_dest[1],
- lec_h->h_dest[2], lec_h->h_dest[3],
- lec_h->h_dest[4], lec_h->h_dest[5]);
+ pr_debug("%s:queuing packet, MAC address %pM\n",
+ dev->name, lec_h->h_dest);
skb_queue_tail(&entry->tx_wait, skb);
} else {
- DPRINTK
- ("%s:lec_start_xmit: tx queue full or no arp entry, dropping, ",
- dev->name);
- DPRINTK("MAC address 0x%02x:%02x:%02x:%02x:%02x:%02x\n",
- lec_h->h_dest[0], lec_h->h_dest[1],
- lec_h->h_dest[2], lec_h->h_dest[3],
- lec_h->h_dest[4], lec_h->h_dest[5]);
- priv->stats.tx_dropped++;
+ pr_debug("%s:tx queue full or no arp entry, dropping, MAC address: %pM\n",
+ dev->name, lec_h->h_dest);
+ dev->stats.tx_dropped++;
dev_kfree_skb(skb);
}
goto out;
}
#if DUMP_PACKETS > 0
- printk("%s:sending to vpi:%d vci:%d\n", dev->name, vcc->vpi, vcc->vci);
+ printk(KERN_DEBUG "%s:sending to vpi:%d vci:%d\n",
+ dev->name, vcc->vpi, vcc->vci);
#endif /* DUMP_PACKETS > 0 */
while (entry && (skb2 = skb_dequeue(&entry->tx_wait))) {
- DPRINTK("lec.c: emptying tx queue, ");
- DPRINTK("MAC address 0x%02x:%02x:%02x:%02x:%02x:%02x\n",
- lec_h->h_dest[0], lec_h->h_dest[1], lec_h->h_dest[2],
- lec_h->h_dest[3], lec_h->h_dest[4], lec_h->h_dest[5]);
- lec_send(vcc, skb2, priv);
+ pr_debug("emptying tx queue, MAC address %pM\n", lec_h->h_dest);
+ lec_send(vcc, skb2);
}
- lec_send(vcc, skb, priv);
+ lec_send(vcc, skb);
if (!atm_may_send(vcc, 0)) {
struct lec_vcc_priv *vpriv = LEC_VCC_PRIV(vcc);
@@ -431,8 +329,8 @@ static int lec_start_xmit(struct sk_buff *skb, struct net_device *dev)
out:
if (entry)
lec_arp_put(entry);
- dev->trans_start = jiffies;
- return 0;
+ netif_trans_update(dev);
+ return NETDEV_TX_OK;
}
/* The inverse routine to net_open(). */
@@ -442,40 +340,27 @@ static int lec_close(struct net_device *dev)
return 0;
}
-/*
- * Get the current statistics.
- * This may be called with the card open or closed.
- */
-static struct net_device_stats *lec_get_stats(struct net_device *dev)
-{
- return &((struct lec_priv *)dev->priv)->stats;
-}
-
static int lec_atm_send(struct atm_vcc *vcc, struct sk_buff *skb)
{
+ static const u8 zero_addr[ETH_ALEN] = {};
unsigned long flags;
struct net_device *dev = (struct net_device *)vcc->proto_data;
- struct lec_priv *priv = (struct lec_priv *)dev->priv;
+ struct lec_priv *priv = netdev_priv(dev);
struct atmlec_msg *mesg;
struct lec_arp_table *entry;
- int i;
char *tmp; /* FIXME */
- atomic_sub(skb->truesize, &sk_atm(vcc)->sk_wmem_alloc);
+ WARN_ON(refcount_sub_and_test(skb->truesize, &sk_atm(vcc)->sk_wmem_alloc));
mesg = (struct atmlec_msg *)skb->data;
tmp = skb->data;
tmp += sizeof(struct atmlec_msg);
- DPRINTK("%s: msg from zeppelin:%d\n", dev->name, mesg->type);
+ pr_debug("%s: msg from zeppelin:%d\n", dev->name, mesg->type);
switch (mesg->type) {
case l_set_mac_addr:
- for (i = 0; i < 6; i++) {
- dev->dev_addr[i] = mesg->content.normal.mac_addr[i];
- }
+ eth_hw_addr_set(dev, mesg->content.normal.mac_addr);
break;
case l_del_mac_addr:
- for (i = 0; i < 6; i++) {
- dev->dev_addr[i] = 0;
- }
+ eth_hw_addr_set(dev, zero_addr);
break;
case l_addr_delete:
lec_addr_delete(priv, mesg->content.normal.atm_addr,
@@ -495,16 +380,16 @@ static int lec_atm_send(struct atm_vcc *vcc, struct sk_buff *skb)
if (mesg->content.normal.no_source_le_narp)
break;
- /* FALL THROUGH */
+ fallthrough;
case l_arp_update:
lec_arp_update(priv, mesg->content.normal.mac_addr,
mesg->content.normal.atm_addr,
mesg->content.normal.flag,
mesg->content.normal.targetless_le_arp);
- DPRINTK("lec: in l_arp_update\n");
+ pr_debug("in l_arp_update\n");
if (mesg->sizeoftlvs != 0) { /* LANE2 3.1.5 */
- DPRINTK("lec: LANE2 3.1.5, got tlvs, size %d\n",
- mesg->sizeoftlvs);
+ pr_debug("LANE2 3.1.5, got tlvs, size %d\n",
+ mesg->sizeoftlvs);
lane2_associate_ind(dev, mesg->content.normal.mac_addr,
tmp, mesg->sizeoftlvs);
}
@@ -523,13 +408,16 @@ static int lec_atm_send(struct atm_vcc *vcc, struct sk_buff *skb)
priv->flush_timeout = (mesg->content.config.flush_timeout * HZ);
priv->path_switching_delay =
(mesg->content.config.path_switching_delay * HZ);
- priv->lane_version = mesg->content.config.lane_version; /* LANE2 */
+ priv->lane_version = mesg->content.config.lane_version;
+ /* LANE2 */
priv->lane2_ops = NULL;
if (priv->lane_version > 1)
priv->lane2_ops = &lane2_ops;
- if (dev->change_mtu(dev, mesg->content.config.mtu))
- printk("%s: change_mtu to %d failed\n", dev->name,
- mesg->content.config.mtu);
+ rtnl_lock();
+ if (dev_set_mtu(dev, mesg->content.config.mtu))
+ pr_info("%s: change_mtu to %d failed\n",
+ dev->name, mesg->content.config.mtu);
+ rtnl_unlock();
priv->is_proxy = mesg->content.config.is_proxy;
break;
case l_flush_tran_id:
@@ -541,55 +429,36 @@ static int lec_atm_send(struct atm_vcc *vcc, struct sk_buff *skb)
(unsigned short)(0xffff & mesg->content.normal.flag);
break;
case l_should_bridge:
-#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
- {
- struct net_bridge_fdb_entry *f;
-
- DPRINTK
- ("%s: bridge zeppelin asks about 0x%02x:%02x:%02x:%02x:%02x:%02x\n",
- dev->name, mesg->content.proxy.mac_addr[0],
- mesg->content.proxy.mac_addr[1],
- mesg->content.proxy.mac_addr[2],
- mesg->content.proxy.mac_addr[3],
- mesg->content.proxy.mac_addr[4],
- mesg->content.proxy.mac_addr[5]);
-
- if (br_fdb_get_hook == NULL || dev->br_port == NULL)
- break;
+#if IS_ENABLED(CONFIG_BRIDGE)
+ {
+ pr_debug("%s: bridge zeppelin asks about %pM\n",
+ dev->name, mesg->content.proxy.mac_addr);
- f = br_fdb_get_hook(dev->br_port->br,
- mesg->content.proxy.mac_addr);
- if (f != NULL && f->dst->dev != dev
- && f->dst->state == BR_STATE_FORWARDING) {
- /* hit from bridge table, send LE_ARP_RESPONSE */
- struct sk_buff *skb2;
- struct sock *sk;
-
- DPRINTK
- ("%s: entry found, responding to zeppelin\n",
- dev->name);
- skb2 =
- alloc_skb(sizeof(struct atmlec_msg),
- GFP_ATOMIC);
- if (skb2 == NULL) {
- br_fdb_put_hook(f);
- break;
- }
- skb2->len = sizeof(struct atmlec_msg);
- memcpy(skb2->data, mesg,
- sizeof(struct atmlec_msg));
- atm_force_charge(priv->lecd, skb2->truesize);
- sk = sk_atm(priv->lecd);
- skb_queue_tail(&sk->sk_receive_queue, skb2);
- sk->sk_data_ready(sk, skb2->len);
- }
- if (f != NULL)
- br_fdb_put_hook(f);
+ if (br_fdb_test_addr_hook == NULL)
+ break;
+
+ if (br_fdb_test_addr_hook(dev, mesg->content.proxy.mac_addr)) {
+ /* hit from bridge table, send LE_ARP_RESPONSE */
+ struct sk_buff *skb2;
+ struct sock *sk;
+
+ pr_debug("%s: entry found, responding to zeppelin\n",
+ dev->name);
+ skb2 = alloc_skb(sizeof(struct atmlec_msg), GFP_ATOMIC);
+ if (skb2 == NULL)
+ break;
+ skb2->len = sizeof(struct atmlec_msg);
+ skb_copy_to_linear_data(skb2, mesg, sizeof(*mesg));
+ atm_force_charge(priv->lecd, skb2->truesize);
+ sk = sk_atm(priv->lecd);
+ skb_queue_tail(&sk->sk_receive_queue, skb2);
+ sk->sk_data_ready(sk);
}
-#endif /* defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) */
+ }
+#endif /* IS_ENABLED(CONFIG_BRIDGE) */
break;
default:
- printk("%s: Unknown message type %d\n", dev->name, mesg->type);
+ pr_info("%s: Unknown message type %d\n", dev->name, mesg->type);
dev_kfree_skb(skb);
return -EINVAL;
}
@@ -601,7 +470,7 @@ static void lec_atm_close(struct atm_vcc *vcc)
{
struct sk_buff *skb;
struct net_device *dev = (struct net_device *)vcc->proto_data;
- struct lec_priv *priv = (struct lec_priv *)dev->priv;
+ struct lec_priv *priv = netdev_priv(dev);
priv->lecd = NULL;
/* Do something needful? */
@@ -610,18 +479,17 @@ static void lec_atm_close(struct atm_vcc *vcc)
lec_arp_destroy(priv);
if (skb_peek(&sk_atm(vcc)->sk_receive_queue))
- printk("%s lec_atm_close: closing with messages pending\n",
- dev->name);
- while ((skb = skb_dequeue(&sk_atm(vcc)->sk_receive_queue)) != NULL) {
+ pr_info("%s closing with messages pending\n", dev->name);
+ while ((skb = skb_dequeue(&sk_atm(vcc)->sk_receive_queue))) {
atm_return(vcc, skb->truesize);
dev_kfree_skb(skb);
}
- printk("%s: Shut down!\n", dev->name);
+ pr_info("%s: Shut down!\n", dev->name);
module_put(THIS_MODULE);
}
-static struct atmdev_ops lecdev_ops = {
+static const struct atmdev_ops lecdev_ops = {
.close = lec_atm_close,
.send = lec_atm_send
};
@@ -630,7 +498,7 @@ static struct atm_dev lecatm_dev = {
.ops = &lecdev_ops,
.type = "lec",
.number = 999, /* dummy device number */
- .lock = SPIN_LOCK_UNLOCKED
+ .lock = __SPIN_LOCK_UNLOCKED(lecatm_dev.lock)
};
/*
@@ -639,16 +507,15 @@ static struct atm_dev lecatm_dev = {
*/
static int
send_to_lecd(struct lec_priv *priv, atmlec_msg_type type,
- unsigned char *mac_addr, unsigned char *atm_addr,
+ const unsigned char *mac_addr, const unsigned char *atm_addr,
struct sk_buff *data)
{
struct sock *sk;
struct sk_buff *skb;
struct atmlec_msg *mesg;
- if (!priv || !priv->lecd) {
+ if (!priv || !priv->lecd)
return -1;
- }
skb = alloc_skb(sizeof(struct atmlec_msg), GFP_ATOMIC);
if (!skb)
return -1;
@@ -659,7 +526,7 @@ send_to_lecd(struct lec_priv *priv, atmlec_msg_type type,
if (data != NULL)
mesg->sizeoftlvs = data->len;
if (mac_addr)
- memcpy(&mesg->content.normal.mac_addr, mac_addr, ETH_ALEN);
+ ether_addr_copy(mesg->content.normal.mac_addr, mac_addr);
else
mesg->content.normal.targetless_le_arp = 1;
if (atm_addr)
@@ -668,52 +535,35 @@ send_to_lecd(struct lec_priv *priv, atmlec_msg_type type,
atm_force_charge(priv->lecd, skb->truesize);
sk = sk_atm(priv->lecd);
skb_queue_tail(&sk->sk_receive_queue, skb);
- sk->sk_data_ready(sk, skb->len);
+ sk->sk_data_ready(sk);
if (data != NULL) {
- DPRINTK("lec: about to send %d bytes of data\n", data->len);
+ pr_debug("about to send %d bytes of data\n", data->len);
atm_force_charge(priv->lecd, data->truesize);
skb_queue_tail(&sk->sk_receive_queue, data);
- sk->sk_data_ready(sk, skb->len);
+ sk->sk_data_ready(sk);
}
return 0;
}
-/* shamelessly stolen from drivers/net/net_init.c */
-static int lec_change_mtu(struct net_device *dev, int new_mtu)
-{
- if ((new_mtu < 68) || (new_mtu > 18190))
- return -EINVAL;
- dev->mtu = new_mtu;
- return 0;
-}
-
static void lec_set_multicast_list(struct net_device *dev)
{
/*
* by default, all multicast frames arrive over the bus.
* eventually support selective multicast service
*/
- return;
}
-static void lec_init(struct net_device *dev)
-{
- dev->change_mtu = lec_change_mtu;
- dev->open = lec_open;
- dev->stop = lec_close;
- dev->hard_start_xmit = lec_start_xmit;
- dev->tx_timeout = lec_tx_timeout;
-
- dev->get_stats = lec_get_stats;
- dev->set_multicast_list = lec_set_multicast_list;
- dev->do_ioctl = NULL;
- printk("%s: Initialized!\n", dev->name);
- return;
-}
+static const struct net_device_ops lec_netdev_ops = {
+ .ndo_open = lec_open,
+ .ndo_stop = lec_close,
+ .ndo_start_xmit = lec_start_xmit,
+ .ndo_tx_timeout = lec_tx_timeout,
+ .ndo_set_rx_mode = lec_set_multicast_list,
+};
-static unsigned char lec_ctrl_magic[] = {
+static const unsigned char lec_ctrl_magic[] = {
0xff,
0x00,
0x01,
@@ -733,43 +583,35 @@ static void lec_push(struct atm_vcc *vcc, struct sk_buff *skb)
{
unsigned long flags;
struct net_device *dev = (struct net_device *)vcc->proto_data;
- struct lec_priv *priv = (struct lec_priv *)dev->priv;
+ struct lec_priv *priv = netdev_priv(dev);
-#if DUMP_PACKETS >0
- int i = 0;
- char buf[300];
-
- printk("%s: lec_push vcc vpi:%d vci:%d\n", dev->name,
- vcc->vpi, vcc->vci);
+#if DUMP_PACKETS > 0
+ printk(KERN_DEBUG "%s: vcc vpi:%d vci:%d\n",
+ dev->name, vcc->vpi, vcc->vci);
#endif
if (!skb) {
- DPRINTK("%s: null skb\n", dev->name);
+ pr_debug("%s: null skb\n", dev->name);
lec_vcc_close(priv, vcc);
return;
}
-#if DUMP_PACKETS > 0
- printk("%s: rcv datalen:%ld lecid:%4.4x\n", dev->name,
- skb->len, priv->lecid);
#if DUMP_PACKETS >= 2
- for (i = 0; i < skb->len && i < 99; i++) {
- sprintf(buf + i * 3, "%2.2x ", 0xff & skb->data[i]);
- }
+#define MAX_SKB_DUMP 99
#elif DUMP_PACKETS >= 1
- for (i = 0; i < skb->len && i < 30; i++) {
- sprintf(buf + i * 3, "%2.2x ", 0xff & skb->data[i]);
- }
-#endif /* DUMP_PACKETS >= 1 */
- if (i == skb->len)
- printk("%s\n", buf);
- else
- printk("%s...\n", buf);
+#define MAX_SKB_DUMP 30
+#endif
+#if DUMP_PACKETS > 0
+ printk(KERN_DEBUG "%s: rcv datalen:%ld lecid:%4.4x\n",
+ dev->name, skb->len, priv->lecid);
+ print_hex_dump(KERN_DEBUG, "", DUMP_OFFSET, 16, 1,
+ skb->data, min(MAX_SKB_DUMP, skb->len), true);
#endif /* DUMP_PACKETS > 0 */
- if (memcmp(skb->data, lec_ctrl_magic, 4) == 0) { /* Control frame, to daemon */
+ if (memcmp(skb->data, lec_ctrl_magic, 4) == 0) {
+ /* Control frame, to daemon */
struct sock *sk = sk_atm(vcc);
- DPRINTK("%s: To daemon\n", dev->name);
+ pr_debug("%s: To daemon\n", dev->name);
skb_queue_tail(&sk->sk_receive_queue, skb);
- sk->sk_data_ready(sk, skb->len);
+ sk->sk_data_ready(sk);
} else { /* Data frame, queue to protocol handlers */
struct lec_arp_table *entry;
unsigned char *src, *dst;
@@ -781,16 +623,11 @@ static void lec_push(struct atm_vcc *vcc, struct sk_buff *skb)
* Probably looping back, or if lecd is missing,
* lecd has gone down
*/
- DPRINTK("Ignoring frame...\n");
+ pr_debug("Ignoring frame...\n");
dev_kfree_skb(skb);
return;
}
-#ifdef CONFIG_TR
- if (priv->is_trdev)
- dst = ((struct lecdatahdr_8025 *)skb->data)->h_dest;
- else
-#endif
- dst = ((struct lecdatahdr_8023 *)skb->data)->h_dest;
+ dst = ((struct lecdatahdr_8023 *)skb->data)->h_dest;
/*
* If this is a Data Direct VCC, and the VCC does not match
@@ -798,16 +635,7 @@ static void lec_push(struct atm_vcc *vcc, struct sk_buff *skb)
*/
spin_lock_irqsave(&priv->lec_arp_lock, flags);
if (lec_is_data_direct(vcc)) {
-#ifdef CONFIG_TR
- if (priv->is_trdev)
- src =
- ((struct lecdatahdr_8025 *)skb->data)->
- h_source;
- else
-#endif
- src =
- ((struct lecdatahdr_8023 *)skb->data)->
- h_source;
+ src = ((struct lecdatahdr_8023 *)skb->data)->h_source;
entry = lec_arp_find(priv, src);
if (entry && entry->vcc != vcc) {
lec_arp_remove(priv, entry);
@@ -822,19 +650,12 @@ static void lec_push(struct atm_vcc *vcc, struct sk_buff *skb)
dev_kfree_skb(skb);
return;
}
- if (!hlist_empty(&priv->lec_arp_empty_ones)) {
+ if (!hlist_empty(&priv->lec_arp_empty_ones))
lec_arp_check_empties(priv, vcc, skb);
- }
- skb->dev = dev;
skb_pull(skb, 2); /* skip lec_id */
-#ifdef CONFIG_TR
- if (priv->is_trdev)
- skb->protocol = tr_type_trans(skb, dev);
- else
-#endif
- skb->protocol = eth_type_trans(skb, dev);
- priv->stats.rx_packets++;
- priv->stats.rx_bytes += skb->len;
+ skb->protocol = eth_type_trans(skb, dev);
+ dev->stats.rx_packets++;
+ dev->stats.rx_bytes += skb->len;
memset(ATM_SKB(skb), 0, sizeof(struct atm_skb_data));
netif_rx(skb);
}
@@ -846,7 +667,7 @@ static void lec_pop(struct atm_vcc *vcc, struct sk_buff *skb)
struct net_device *dev = skb->dev;
if (vpriv == NULL) {
- printk("lec_pop(): vpriv = NULL!?!?!?\n");
+ pr_info("vpriv = NULL!?!?!?\n");
return;
}
@@ -865,23 +686,24 @@ static int lec_vcc_attach(struct atm_vcc *vcc, void __user *arg)
int bytes_left;
struct atmlec_ioc ioc_data;
+ lockdep_assert_held(&lec_mutex);
/* Lecd must be up in this case */
bytes_left = copy_from_user(&ioc_data, arg, sizeof(struct atmlec_ioc));
- if (bytes_left != 0) {
- printk
- ("lec: lec_vcc_attach, copy from user failed for %d bytes\n",
- bytes_left);
- }
- if (ioc_data.dev_num < 0 || ioc_data.dev_num >= MAX_LEC_ITF ||
- !dev_lec[ioc_data.dev_num])
+ if (bytes_left != 0)
+ pr_info("copy from user failed for %d bytes\n", bytes_left);
+ if (ioc_data.dev_num < 0 || ioc_data.dev_num >= MAX_LEC_ITF)
+ return -EINVAL;
+ ioc_data.dev_num = array_index_nospec(ioc_data.dev_num, MAX_LEC_ITF);
+ if (!dev_lec[ioc_data.dev_num])
return -EINVAL;
- if (!(vpriv = kmalloc(sizeof(struct lec_vcc_priv), GFP_KERNEL)))
+ vpriv = kmalloc(sizeof(struct lec_vcc_priv), GFP_KERNEL);
+ if (!vpriv)
return -ENOMEM;
vpriv->xoff = 0;
vpriv->old_pop = vcc->pop;
vcc->user_back = vpriv;
vcc->pop = lec_pop;
- lec_vcc_added(dev_lec[ioc_data.dev_num]->priv,
+ lec_vcc_added(netdev_priv(dev_lec[ioc_data.dev_num]),
&ioc_data, vcc, vcc->push);
vcc->proto_data = dev_lec[ioc_data.dev_num];
vcc->push = lec_push;
@@ -890,10 +712,14 @@ static int lec_vcc_attach(struct atm_vcc *vcc, void __user *arg)
static int lec_mcast_attach(struct atm_vcc *vcc, int arg)
{
- if (arg < 0 || arg >= MAX_LEC_ITF || !dev_lec[arg])
+ lockdep_assert_held(&lec_mutex);
+ if (arg < 0 || arg >= MAX_LEC_ITF)
+ return -EINVAL;
+ arg = array_index_nospec(arg, MAX_LEC_ITF);
+ if (!dev_lec[arg])
return -EINVAL;
vcc->proto_data = dev_lec[arg];
- return (lec_mcast_make((struct lec_priv *)dev_lec[arg]->priv, vcc));
+ return lec_mcast_make(netdev_priv(dev_lec[arg]), vcc);
}
/* Initialize device. */
@@ -902,44 +728,31 @@ static int lecd_attach(struct atm_vcc *vcc, int arg)
int i;
struct lec_priv *priv;
+ lockdep_assert_held(&lec_mutex);
if (arg < 0)
- i = 0;
- else
- i = arg;
-#ifdef CONFIG_TR
+ arg = 0;
if (arg >= MAX_LEC_ITF)
return -EINVAL;
-#else /* Reserve the top NUM_TR_DEVS for TR */
- if (arg >= (MAX_LEC_ITF - NUM_TR_DEVS))
- return -EINVAL;
-#endif
+ i = array_index_nospec(arg, MAX_LEC_ITF);
if (!dev_lec[i]) {
- int is_trdev, size;
-
- is_trdev = 0;
- if (i >= (MAX_LEC_ITF - NUM_TR_DEVS))
- is_trdev = 1;
+ int size;
size = sizeof(struct lec_priv);
-#ifdef CONFIG_TR
- if (is_trdev)
- dev_lec[i] = alloc_trdev(size);
- else
-#endif
- dev_lec[i] = alloc_etherdev(size);
+ dev_lec[i] = alloc_etherdev(size);
if (!dev_lec[i])
return -ENOMEM;
+ dev_lec[i]->netdev_ops = &lec_netdev_ops;
+ dev_lec[i]->max_mtu = 18190;
snprintf(dev_lec[i]->name, IFNAMSIZ, "lec%d", i);
if (register_netdev(dev_lec[i])) {
free_netdev(dev_lec[i]);
+ dev_lec[i] = NULL;
return -EINVAL;
}
- priv = dev_lec[i]->priv;
- priv->is_trdev = is_trdev;
- lec_init(dev_lec[i]);
+ priv = netdev_priv(dev_lec[i]);
} else {
- priv = dev_lec[i]->priv;
+ priv = netdev_priv(dev_lec[i]);
if (priv->lecd)
return -EADDRINUSE;
}
@@ -965,17 +778,16 @@ static int lecd_attach(struct atm_vcc *vcc, int arg)
priv->flush_timeout = (4 * HZ);
priv->path_switching_delay = (6 * HZ);
- if (dev_lec[i]->flags & IFF_UP) {
+ if (dev_lec[i]->flags & IFF_UP)
netif_start_queue(dev_lec[i]);
- }
__module_get(THIS_MODULE);
return i;
}
#ifdef CONFIG_PROC_FS
-static char *lec_arp_get_status_string(unsigned char status)
+static const char *lec_arp_get_status_string(unsigned char status)
{
- static char *lec_arp_status_string[] = {
+ static const char *const lec_arp_status_string[] = {
"ESI_UNKNOWN ",
"ESI_ARP_PENDING ",
"ESI_VC_PENDING ",
@@ -991,14 +803,9 @@ static char *lec_arp_get_status_string(unsigned char status)
static void lec_info(struct seq_file *seq, struct lec_arp_table *entry)
{
- int i;
-
- for (i = 0; i < ETH_ALEN; i++)
- seq_printf(seq, "%2.2x", entry->mac_addr[i] & 0xff);
- seq_printf(seq, " ");
- for (i = 0; i < ATM_ESA_LEN; i++)
- seq_printf(seq, "%2.2x", entry->atm_addr[i] & 0xff);
- seq_printf(seq, " %s %4.4x", lec_arp_get_status_string(entry->status),
+ seq_printf(seq, "%pM ", entry->mac_addr);
+ seq_printf(seq, "%*phN ", ATM_ESA_LEN, entry->atm_addr);
+ seq_printf(seq, "%s %4.4x", lec_arp_get_status_string(entry->status),
entry->flags & 0xffff);
if (entry->vcc)
seq_printf(seq, "%3d %3d ", entry->vcc->vpi, entry->vcc->vci);
@@ -1025,16 +832,15 @@ static void *lec_tbl_walk(struct lec_state *state, struct hlist_head *tbl,
loff_t *l)
{
struct hlist_node *e = state->node;
- struct lec_arp_table *tmp;
if (!e)
e = tbl->first;
- if (e == (void *)1) {
+ if (e == SEQ_START_TOKEN) {
e = tbl->first;
--*l;
}
- hlist_for_each_entry_from(tmp, e, next) {
+ for (; e; e = e->next) {
if (--*l < 0)
break;
}
@@ -1100,9 +906,9 @@ static void *lec_itf_walk(struct lec_state *state, loff_t *l)
void *v;
dev = state->dev ? state->dev : dev_lec[state->itf];
- v = (dev && dev->priv) ? lec_priv_walk(state, l, dev->priv) : NULL;
+ v = (dev && netdev_priv(dev)) ?
+ lec_priv_walk(state, l, netdev_priv(dev)) : NULL;
if (!v && dev) {
- dev_put(dev);
/* Partial state reset for the next time we get called */
dev = NULL;
}
@@ -1126,14 +932,15 @@ static void *lec_seq_start(struct seq_file *seq, loff_t *pos)
{
struct lec_state *state = seq->private;
+ mutex_lock(&lec_mutex);
state->itf = 0;
state->dev = NULL;
state->locked = NULL;
state->arp_table = 0;
state->misc_table = 0;
- state->node = (void *)1;
+ state->node = SEQ_START_TOKEN;
- return *pos ? lec_get_idx(state, *pos) : (void *)1;
+ return *pos ? lec_get_idx(state, *pos) : SEQ_START_TOKEN;
}
static void lec_seq_stop(struct seq_file *seq, void *v)
@@ -1143,31 +950,34 @@ static void lec_seq_stop(struct seq_file *seq, void *v)
if (state->dev) {
spin_unlock_irqrestore(&state->locked->lec_arp_lock,
state->flags);
- dev_put(state->dev);
+ state->dev = NULL;
}
+ mutex_unlock(&lec_mutex);
}
static void *lec_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct lec_state *state = seq->private;
- v = lec_get_idx(state, 1);
- *pos += !!PTR_ERR(v);
- return v;
+ ++*pos;
+ return lec_get_idx(state, 1);
}
static int lec_seq_show(struct seq_file *seq, void *v)
{
- static char lec_banner[] = "Itf MAC ATM destination"
+ static const char lec_banner[] =
+ "Itf MAC ATM destination"
" Status Flags "
"VPI/VCI Recv VPI/VCI\n";
- if (v == (void *)1)
+ if (v == SEQ_START_TOKEN)
seq_puts(seq, lec_banner);
else {
struct lec_state *state = seq->private;
struct net_device *dev = state->dev;
- struct lec_arp_table *entry = hlist_entry(state->node, struct lec_arp_table, next);
+ struct lec_arp_table *entry = hlist_entry(state->node,
+ struct lec_arp_table,
+ next);
seq_printf(seq, "%s ", dev->name);
lec_info(seq, entry);
@@ -1175,50 +985,12 @@ static int lec_seq_show(struct seq_file *seq, void *v)
return 0;
}
-static struct seq_operations lec_seq_ops = {
+static const struct seq_operations lec_seq_ops = {
.start = lec_seq_start,
.next = lec_seq_next,
.stop = lec_seq_stop,
.show = lec_seq_show,
};
-
-static int lec_seq_open(struct inode *inode, struct file *file)
-{
- struct lec_state *state;
- struct seq_file *seq;
- int rc = -EAGAIN;
-
- state = kmalloc(sizeof(*state), GFP_KERNEL);
- if (!state) {
- rc = -ENOMEM;
- goto out;
- }
-
- rc = seq_open(file, &lec_seq_ops);
- if (rc)
- goto out_kfree;
- seq = file->private_data;
- seq->private = state;
-out:
- return rc;
-
-out_kfree:
- kfree(state);
- goto out;
-}
-
-static int lec_seq_release(struct inode *inode, struct file *file)
-{
- return seq_release_private(inode, file);
-}
-
-static struct file_operations lec_seq_fops = {
- .owner = THIS_MODULE,
- .open = lec_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = lec_seq_release,
-};
#endif
static int lane_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
@@ -1237,6 +1009,7 @@ static int lane_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
return -ENOIOCTLCMD;
}
+ mutex_lock(&lec_mutex);
switch (cmd) {
case ATMLEC_CTRL:
err = lecd_attach(vcc, (int)arg);
@@ -1251,6 +1024,7 @@ static int lane_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
break;
}
+ mutex_unlock(&lec_mutex);
return err;
}
@@ -1264,35 +1038,36 @@ static int __init lane_module_init(void)
#ifdef CONFIG_PROC_FS
struct proc_dir_entry *p;
- p = create_proc_entry("lec", S_IRUGO, atm_proc_root);
- if (p)
- p->proc_fops = &lec_seq_fops;
+ p = proc_create_seq_private("lec", 0444, atm_proc_root, &lec_seq_ops,
+ sizeof(struct lec_state), NULL);
+ if (!p) {
+ pr_err("Unable to initialize /proc/net/atm/lec\n");
+ return -ENOMEM;
+ }
#endif
register_atm_ioctl(&lane_ioctl_ops);
- printk("lec.c: " __DATE__ " " __TIME__ " initialized\n");
+ pr_info("lec.c: initialized\n");
return 0;
}
static void __exit lane_module_cleanup(void)
{
int i;
- struct lec_priv *priv;
+#ifdef CONFIG_PROC_FS
remove_proc_entry("lec", atm_proc_root);
+#endif
deregister_atm_ioctl(&lane_ioctl_ops);
for (i = 0; i < MAX_LEC_ITF; i++) {
if (dev_lec[i] != NULL) {
- priv = (struct lec_priv *)dev_lec[i]->priv;
unregister_netdev(dev_lec[i]);
free_netdev(dev_lec[i]);
dev_lec[i] = NULL;
}
}
-
- return;
}
module_init(lane_module_init);
@@ -1301,15 +1076,15 @@ module_exit(lane_module_cleanup);
/*
* LANE2: 3.1.3, LE_RESOLVE.request
* Non force allocates memory and fills in *tlvs, fills in *sizeoftlvs.
- * If sizeoftlvs == NULL the default TLVs associated with with this
+ * If sizeoftlvs == NULL the default TLVs associated with this
* lec will be used.
* If dst_mac == NULL, targetless LE_ARP will be sent
*/
-static int lane2_resolve(struct net_device *dev, u8 *dst_mac, int force,
+static int lane2_resolve(struct net_device *dev, const u8 *dst_mac, int force,
u8 **tlvs, u32 *sizeoftlvs)
{
unsigned long flags;
- struct lec_priv *priv = (struct lec_priv *)dev->priv;
+ struct lec_priv *priv = netdev_priv(dev);
struct lec_arp_table *table;
struct sk_buff *skb;
int retval;
@@ -1338,7 +1113,7 @@ static int lane2_resolve(struct net_device *dev, u8 *dst_mac, int force,
if (skb == NULL)
return -1;
skb->len = *sizeoftlvs;
- memcpy(skb->data, *tlvs, *sizeoftlvs);
+ skb_copy_to_linear_data(skb, *tlvs, *sizeoftlvs);
retval = send_to_lecd(priv, l_arp_xmt, dst_mac, NULL, skb);
}
return retval;
@@ -1351,49 +1126,49 @@ static int lane2_resolve(struct net_device *dev, u8 *dst_mac, int force,
* Returns 1 for success, 0 for failure (out of memory)
*
*/
-static int lane2_associate_req(struct net_device *dev, u8 *lan_dst,
- u8 *tlvs, u32 sizeoftlvs)
+static int lane2_associate_req(struct net_device *dev, const u8 *lan_dst,
+ const u8 *tlvs, u32 sizeoftlvs)
{
int retval;
struct sk_buff *skb;
- struct lec_priv *priv = (struct lec_priv *)dev->priv;
+ struct lec_priv *priv = netdev_priv(dev);
- if (compare_ether_addr(lan_dst, dev->dev_addr))
- return (0); /* not our mac address */
+ if (!ether_addr_equal(lan_dst, dev->dev_addr))
+ return 0; /* not our mac address */
kfree(priv->tlvs); /* NULL if there was no previous association */
priv->tlvs = kmemdup(tlvs, sizeoftlvs, GFP_KERNEL);
if (priv->tlvs == NULL)
- return (0);
+ return 0;
priv->sizeoftlvs = sizeoftlvs;
skb = alloc_skb(sizeoftlvs, GFP_ATOMIC);
if (skb == NULL)
return 0;
skb->len = sizeoftlvs;
- memcpy(skb->data, tlvs, sizeoftlvs);
+ skb_copy_to_linear_data(skb, tlvs, sizeoftlvs);
retval = send_to_lecd(priv, l_associate_req, NULL, NULL, skb);
if (retval != 0)
- printk("lec.c: lane2_associate_req() failed\n");
+ pr_info("lec.c: lane2_associate_req() failed\n");
/*
* If the previous association has changed we must
* somehow notify other LANE entities about the change
*/
- return (1);
+ return 1;
}
/*
* LANE2: 3.1.5, LE_ASSOCIATE.indication
*
*/
-static void lane2_associate_ind(struct net_device *dev, u8 *mac_addr,
- u8 *tlvs, u32 sizeoftlvs)
+static void lane2_associate_ind(struct net_device *dev, const u8 *mac_addr,
+ const u8 *tlvs, u32 sizeoftlvs)
{
#if 0
int i = 0;
#endif
- struct lec_priv *priv = (struct lec_priv *)dev->priv;
+ struct lec_priv *priv = netdev_priv(dev);
#if 0 /*
* Why have the TLVs in LE_ARP entries
* since we do not use them? When you
@@ -1413,12 +1188,12 @@ static void lane2_associate_ind(struct net_device *dev, u8 *mac_addr,
entry->sizeoftlvs = sizeoftlvs;
#endif
#if 0
- printk("lec.c: lane2_associate_ind()\n");
- printk("dump of tlvs, sizeoftlvs=%d\n", sizeoftlvs);
+ pr_info("\n");
+ pr_info("dump of tlvs, sizeoftlvs=%d\n", sizeoftlvs);
while (i < sizeoftlvs)
- printk("%02x ", tlvs[i++]);
+ pr_cont("%02x ", tlvs[i++]);
- printk("\n");
+ pr_cont("\n");
#endif
/* tell MPOA about the TLVs we saw */
@@ -1426,7 +1201,6 @@ static void lane2_associate_ind(struct net_device *dev, u8 *mac_addr,
priv->lane2_ops->associate_indicator(dev, mac_addr,
tlvs, sizeoftlvs);
}
- return;
}
/*
@@ -1437,31 +1211,30 @@ static void lane2_associate_ind(struct net_device *dev, u8 *mac_addr,
*/
#include <linux/types.h>
-#include <linux/sched.h>
#include <linux/timer.h>
-#include <asm/param.h>
-#include <asm/atomic.h>
+#include <linux/param.h>
+#include <linux/atomic.h>
#include <linux/inetdevice.h>
#include <net/route.h>
#if 0
-#define DPRINTK(format,args...)
+#define pr_debug(format, args...)
/*
-#define DPRINTK printk
+ #define pr_debug printk
*/
#endif
#define DEBUG_ARP_TABLE 0
#define LEC_ARP_REFRESH_INTERVAL (3*HZ)
-static void lec_arp_check_expire(void *data);
-static void lec_arp_expire_arp(unsigned long data);
+static void lec_arp_check_expire(struct work_struct *work);
+static void lec_arp_expire_arp(struct timer_list *t);
-/*
+/*
* Arp table funcs
*/
-#define HASH(ch) (ch & (LEC_ARP_TABLE_SIZE -1))
+#define HASH(ch) (ch & (LEC_ARP_TABLE_SIZE - 1))
/*
* Initialization of arp-cache
@@ -1470,14 +1243,13 @@ static void lec_arp_init(struct lec_priv *priv)
{
unsigned short i;
- for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) {
+ for (i = 0; i < LEC_ARP_TABLE_SIZE; i++)
INIT_HLIST_HEAD(&priv->lec_arp_tables[i]);
- }
- INIT_HLIST_HEAD(&priv->lec_arp_empty_ones);
- INIT_HLIST_HEAD(&priv->lec_no_forward);
- INIT_HLIST_HEAD(&priv->mcast_fwds);
+ INIT_HLIST_HEAD(&priv->lec_arp_empty_ones);
+ INIT_HLIST_HEAD(&priv->lec_no_forward);
+ INIT_HLIST_HEAD(&priv->mcast_fwds);
spin_lock_init(&priv->lec_arp_lock);
- INIT_WORK(&priv->lec_arp_work, lec_arp_check_expire, priv);
+ INIT_DELAYED_WORK(&priv->lec_arp_work, lec_arp_check_expire);
schedule_delayed_work(&priv->lec_arp_work, LEC_ARP_REFRESH_INTERVAL);
}
@@ -1498,6 +1270,12 @@ static void lec_arp_clear_vccs(struct lec_arp_table *entry)
entry->vcc = NULL;
}
if (entry->recv_vcc) {
+ struct atm_vcc *vcc = entry->recv_vcc;
+ struct lec_vcc_priv *vpriv = LEC_VCC_PRIV(vcc);
+
+ kfree(vpriv);
+ vcc->user_back = NULL;
+
entry->recv_vcc->push = entry->old_recv_push;
vcc_release_async(entry->recv_vcc, -EPIPE);
entry->recv_vcc = NULL;
@@ -1516,10 +1294,7 @@ lec_arp_add(struct lec_priv *priv, struct lec_arp_table *entry)
tmp = &priv->lec_arp_tables[HASH(entry->mac_addr[ETH_ALEN - 1])];
hlist_add_head(&entry->next, tmp);
- DPRINTK("LEC_ARP: Added entry:%2.2x %2.2x %2.2x %2.2x %2.2x %2.2x\n",
- 0xff & entry->mac_addr[0], 0xff & entry->mac_addr[1],
- 0xff & entry->mac_addr[2], 0xff & entry->mac_addr[3],
- 0xff & entry->mac_addr[4], 0xff & entry->mac_addr[5]);
+ pr_debug("Added entry:%pM\n", entry->mac_addr);
}
/*
@@ -1528,24 +1303,26 @@ lec_arp_add(struct lec_priv *priv, struct lec_arp_table *entry)
static int
lec_arp_remove(struct lec_priv *priv, struct lec_arp_table *to_remove)
{
- struct hlist_node *node;
struct lec_arp_table *entry;
int i, remove_vcc = 1;
- if (!to_remove) {
+ if (!to_remove)
return -1;
- }
hlist_del(&to_remove->next);
- del_timer(&to_remove->timer);
+ timer_delete(&to_remove->timer);
- /* If this is the only MAC connected to this VCC, also tear down the VCC */
+ /*
+ * If this is the only MAC connected to this VCC,
+ * also tear down the VCC
+ */
if (to_remove->status >= ESI_FLUSH_PENDING) {
/*
* ESI_FLUSH_PENDING, ESI_FORWARD_DIRECT
*/
for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) {
- hlist_for_each_entry(entry, node, &priv->lec_arp_tables[i], next) {
+ hlist_for_each_entry(entry,
+ &priv->lec_arp_tables[i], next) {
if (memcmp(to_remove->atm_addr,
entry->atm_addr, ATM_ESA_LEN) == 0) {
remove_vcc = 0;
@@ -1558,15 +1335,12 @@ lec_arp_remove(struct lec_priv *priv, struct lec_arp_table *to_remove)
}
skb_queue_purge(&to_remove->tx_wait); /* FIXME: good place for this? */
- DPRINTK("LEC_ARP: Removed entry:%2.2x %2.2x %2.2x %2.2x %2.2x %2.2x\n",
- 0xff & to_remove->mac_addr[0], 0xff & to_remove->mac_addr[1],
- 0xff & to_remove->mac_addr[2], 0xff & to_remove->mac_addr[3],
- 0xff & to_remove->mac_addr[4], 0xff & to_remove->mac_addr[5]);
+ pr_debug("Removed entry:%pM\n", to_remove->mac_addr);
return 0;
}
#if DEBUG_ARP_TABLE
-static char *get_status_string(unsigned char st)
+static const char *get_status_string(unsigned char st)
{
switch (st) {
case ESI_UNKNOWN:
@@ -1579,35 +1353,26 @@ static char *get_status_string(unsigned char st)
return "ESI_FLUSH_PENDING";
case ESI_FORWARD_DIRECT:
return "ESI_FORWARD_DIRECT";
- default:
- return "<UNKNOWN>";
}
+ return "<UNKNOWN>";
}
static void dump_arp_table(struct lec_priv *priv)
{
- struct hlist_node *node;
struct lec_arp_table *rulla;
char buf[256];
- int i, j, offset;
+ int i, offset;
- printk("Dump %p:\n", priv);
+ pr_info("Dump %p:\n", priv);
for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) {
- hlist_for_each_entry(rulla, node, &priv->lec_arp_tables[i], next) {
+ hlist_for_each_entry(rulla,
+ &priv->lec_arp_tables[i], next) {
offset = 0;
offset += sprintf(buf, "%d: %p\n", i, rulla);
- offset += sprintf(buf + offset, "Mac:");
- for (j = 0; j < ETH_ALEN; j++) {
- offset += sprintf(buf + offset,
- "%2.2x ",
- rulla->mac_addr[j] & 0xff);
- }
- offset += sprintf(buf + offset, "Atm:");
- for (j = 0; j < ATM_ESA_LEN; j++) {
- offset += sprintf(buf + offset,
- "%2.2x ",
- rulla->atm_addr[j] & 0xff);
- }
+ offset += sprintf(buf + offset, "Mac: %pM ",
+ rulla->mac_addr);
+ offset += sprintf(buf + offset, "Atm: %*ph ", ATM_ESA_LEN,
+ rulla->atm_addr);
offset += sprintf(buf + offset,
"Vcc vpi:%d vci:%d, Recv_vcc vpi:%d vci:%d Last_used:%lx, Timestamp:%lx, No_tries:%d ",
rulla->vcc ? rulla->vcc->vpi : 0,
@@ -1622,24 +1387,17 @@ static void dump_arp_table(struct lec_priv *priv)
"Flags:%x, Packets_flooded:%x, Status: %s ",
rulla->flags, rulla->packets_flooded,
get_status_string(rulla->status));
- printk("%s\n", buf);
+ pr_info("%s\n", buf);
}
}
if (!hlist_empty(&priv->lec_no_forward))
- printk("No forward\n");
- hlist_for_each_entry(rulla, node, &priv->lec_no_forward, next) {
+ pr_info("No forward\n");
+ hlist_for_each_entry(rulla, &priv->lec_no_forward, next) {
offset = 0;
- offset += sprintf(buf + offset, "Mac:");
- for (j = 0; j < ETH_ALEN; j++) {
- offset += sprintf(buf + offset, "%2.2x ",
- rulla->mac_addr[j] & 0xff);
- }
- offset += sprintf(buf + offset, "Atm:");
- for (j = 0; j < ATM_ESA_LEN; j++) {
- offset += sprintf(buf + offset, "%2.2x ",
- rulla->atm_addr[j] & 0xff);
- }
+ offset += sprintf(buf + offset, "Mac: %pM ", rulla->mac_addr);
+ offset += sprintf(buf + offset, "Atm: %*ph ", ATM_ESA_LEN,
+ rulla->atm_addr);
offset += sprintf(buf + offset,
"Vcc vpi:%d vci:%d, Recv_vcc vpi:%d vci:%d Last_used:%lx, Timestamp:%lx, No_tries:%d ",
rulla->vcc ? rulla->vcc->vpi : 0,
@@ -1652,23 +1410,16 @@ static void dump_arp_table(struct lec_priv *priv)
"Flags:%x, Packets_flooded:%x, Status: %s ",
rulla->flags, rulla->packets_flooded,
get_status_string(rulla->status));
- printk("%s\n", buf);
+ pr_info("%s\n", buf);
}
if (!hlist_empty(&priv->lec_arp_empty_ones))
- printk("Empty ones\n");
- hlist_for_each_entry(rulla, node, &priv->lec_arp_empty_ones, next) {
+ pr_info("Empty ones\n");
+ hlist_for_each_entry(rulla, &priv->lec_arp_empty_ones, next) {
offset = 0;
- offset += sprintf(buf + offset, "Mac:");
- for (j = 0; j < ETH_ALEN; j++) {
- offset += sprintf(buf + offset, "%2.2x ",
- rulla->mac_addr[j] & 0xff);
- }
- offset += sprintf(buf + offset, "Atm:");
- for (j = 0; j < ATM_ESA_LEN; j++) {
- offset += sprintf(buf + offset, "%2.2x ",
- rulla->atm_addr[j] & 0xff);
- }
+ offset += sprintf(buf + offset, "Mac: %pM ", rulla->mac_addr);
+ offset += sprintf(buf + offset, "Atm: %*ph ", ATM_ESA_LEN,
+ rulla->atm_addr);
offset += sprintf(buf + offset,
"Vcc vpi:%d vci:%d, Recv_vcc vpi:%d vci:%d Last_used:%lx, Timestamp:%lx, No_tries:%d ",
rulla->vcc ? rulla->vcc->vpi : 0,
@@ -1681,23 +1432,16 @@ static void dump_arp_table(struct lec_priv *priv)
"Flags:%x, Packets_flooded:%x, Status: %s ",
rulla->flags, rulla->packets_flooded,
get_status_string(rulla->status));
- printk("%s", buf);
+ pr_info("%s", buf);
}
if (!hlist_empty(&priv->mcast_fwds))
- printk("Multicast Forward VCCs\n");
- hlist_for_each_entry(rulla, node, &priv->mcast_fwds, next) {
+ pr_info("Multicast Forward VCCs\n");
+ hlist_for_each_entry(rulla, &priv->mcast_fwds, next) {
offset = 0;
- offset += sprintf(buf + offset, "Mac:");
- for (j = 0; j < ETH_ALEN; j++) {
- offset += sprintf(buf + offset, "%2.2x ",
- rulla->mac_addr[j] & 0xff);
- }
- offset += sprintf(buf + offset, "Atm:");
- for (j = 0; j < ATM_ESA_LEN; j++) {
- offset += sprintf(buf + offset, "%2.2x ",
- rulla->atm_addr[j] & 0xff);
- }
+ offset += sprintf(buf + offset, "Mac: %pM ", rulla->mac_addr);
+ offset += sprintf(buf + offset, "Atm: %*ph ", ATM_ESA_LEN,
+ rulla->atm_addr);
offset += sprintf(buf + offset,
"Vcc vpi:%d vci:%d, Recv_vcc vpi:%d vci:%d Last_used:%lx, Timestamp:%lx, No_tries:%d ",
rulla->vcc ? rulla->vcc->vpi : 0,
@@ -1710,7 +1454,7 @@ static void dump_arp_table(struct lec_priv *priv)
"Flags:%x, Packets_flooded:%x, Status: %s ",
rulla->flags, rulla->packets_flooded,
get_status_string(rulla->status));
- printk("%s\n", buf);
+ pr_info("%s\n", buf);
}
}
@@ -1724,11 +1468,11 @@ static void dump_arp_table(struct lec_priv *priv)
static void lec_arp_destroy(struct lec_priv *priv)
{
unsigned long flags;
- struct hlist_node *node, *next;
+ struct hlist_node *next;
struct lec_arp_table *entry;
int i;
- cancel_rearming_delayed_work(&priv->lec_arp_work);
+ cancel_delayed_work_sync(&priv->lec_arp_work);
/*
* Remove all entries
@@ -1736,30 +1480,33 @@ static void lec_arp_destroy(struct lec_priv *priv)
spin_lock_irqsave(&priv->lec_arp_lock, flags);
for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) {
- hlist_for_each_entry_safe(entry, node, next, &priv->lec_arp_tables[i], next) {
+ hlist_for_each_entry_safe(entry, next,
+ &priv->lec_arp_tables[i], next) {
lec_arp_remove(priv, entry);
lec_arp_put(entry);
}
INIT_HLIST_HEAD(&priv->lec_arp_tables[i]);
}
- hlist_for_each_entry_safe(entry, node, next, &priv->lec_arp_empty_ones, next) {
- del_timer_sync(&entry->timer);
+ hlist_for_each_entry_safe(entry, next,
+ &priv->lec_arp_empty_ones, next) {
+ timer_delete_sync(&entry->timer);
lec_arp_clear_vccs(entry);
hlist_del(&entry->next);
lec_arp_put(entry);
}
INIT_HLIST_HEAD(&priv->lec_arp_empty_ones);
- hlist_for_each_entry_safe(entry, node, next, &priv->lec_no_forward, next) {
- del_timer_sync(&entry->timer);
+ hlist_for_each_entry_safe(entry, next,
+ &priv->lec_no_forward, next) {
+ timer_delete_sync(&entry->timer);
lec_arp_clear_vccs(entry);
hlist_del(&entry->next);
lec_arp_put(entry);
}
INIT_HLIST_HEAD(&priv->lec_no_forward);
- hlist_for_each_entry_safe(entry, node, next, &priv->mcast_fwds, next) {
+ hlist_for_each_entry_safe(entry, next, &priv->mcast_fwds, next) {
/* No timer, LANEv2 7.1.20 and 2.3.5.3 */
lec_arp_clear_vccs(entry);
hlist_del(&entry->next);
@@ -1770,59 +1517,51 @@ static void lec_arp_destroy(struct lec_priv *priv)
spin_unlock_irqrestore(&priv->lec_arp_lock, flags);
}
-/*
+/*
* Find entry by mac_address
*/
static struct lec_arp_table *lec_arp_find(struct lec_priv *priv,
- unsigned char *mac_addr)
+ const unsigned char *mac_addr)
{
- struct hlist_node *node;
struct hlist_head *head;
struct lec_arp_table *entry;
- DPRINTK("LEC_ARP: lec_arp_find :%2.2x %2.2x %2.2x %2.2x %2.2x %2.2x\n",
- mac_addr[0] & 0xff, mac_addr[1] & 0xff, mac_addr[2] & 0xff,
- mac_addr[3] & 0xff, mac_addr[4] & 0xff, mac_addr[5] & 0xff);
+ pr_debug("%pM\n", mac_addr);
head = &priv->lec_arp_tables[HASH(mac_addr[ETH_ALEN - 1])];
- hlist_for_each_entry(entry, node, head, next) {
- if (!compare_ether_addr(mac_addr, entry->mac_addr)) {
+ hlist_for_each_entry(entry, head, next) {
+ if (ether_addr_equal(mac_addr, entry->mac_addr))
return entry;
- }
}
return NULL;
}
static struct lec_arp_table *make_entry(struct lec_priv *priv,
- unsigned char *mac_addr)
+ const unsigned char *mac_addr)
{
struct lec_arp_table *to_return;
to_return = kzalloc(sizeof(struct lec_arp_table), GFP_ATOMIC);
- if (!to_return) {
- printk("LEC: Arp entry kmalloc failed\n");
+ if (!to_return)
return NULL;
- }
- memcpy(to_return->mac_addr, mac_addr, ETH_ALEN);
+ ether_addr_copy(to_return->mac_addr, mac_addr);
INIT_HLIST_NODE(&to_return->next);
- init_timer(&to_return->timer);
- to_return->timer.function = lec_arp_expire_arp;
- to_return->timer.data = (unsigned long)to_return;
+ timer_setup(&to_return->timer, lec_arp_expire_arp, 0);
to_return->last_used = jiffies;
to_return->priv = priv;
skb_queue_head_init(&to_return->tx_wait);
- atomic_set(&to_return->usage, 1);
+ refcount_set(&to_return->usage, 1);
return to_return;
}
/* Arp sent timer expired */
-static void lec_arp_expire_arp(unsigned long data)
+static void lec_arp_expire_arp(struct timer_list *t)
{
struct lec_arp_table *entry;
- entry = (struct lec_arp_table *)data;
+ entry = timer_container_of(entry, t, timer);
- DPRINTK("lec_arp_expire_arp\n");
+ pr_debug("\n");
if (entry->status == ESI_ARP_PENDING) {
if (entry->no_tries <= entry->priv->max_retry_count) {
if (entry->is_rdesc)
@@ -1838,18 +1577,19 @@ static void lec_arp_expire_arp(unsigned long data)
}
/* Unknown/unused vcc expire, remove associated entry */
-static void lec_arp_expire_vcc(unsigned long data)
+static void lec_arp_expire_vcc(struct timer_list *t)
{
unsigned long flags;
- struct lec_arp_table *to_remove = (struct lec_arp_table *)data;
- struct lec_priv *priv = (struct lec_priv *)to_remove->priv;
+ struct lec_arp_table *to_remove = timer_container_of(to_remove, t,
+ timer);
+ struct lec_priv *priv = to_remove->priv;
- del_timer(&to_remove->timer);
+ timer_delete(&to_remove->timer);
- DPRINTK("LEC_ARP %p %p: lec_arp_expire_vcc vpi:%d vci:%d\n",
- to_remove, priv,
- to_remove->vcc ? to_remove->recv_vcc->vpi : 0,
- to_remove->vcc ? to_remove->recv_vcc->vci : 0);
+ pr_debug("%p %p: vpi:%d vci:%d\n",
+ to_remove, priv,
+ to_remove->vcc ? to_remove->recv_vcc->vpi : 0,
+ to_remove->vcc ? to_remove->recv_vcc->vci : 0);
spin_lock_irqsave(&priv->lec_arp_lock, flags);
hlist_del(&to_remove->next);
@@ -1859,6 +1599,50 @@ static void lec_arp_expire_vcc(unsigned long data)
lec_arp_put(to_remove);
}
+static bool __lec_arp_check_expire(struct lec_arp_table *entry,
+ unsigned long now,
+ struct lec_priv *priv)
+{
+ unsigned long time_to_check;
+
+ if ((entry->flags) & LEC_REMOTE_FLAG && priv->topology_change)
+ time_to_check = priv->forward_delay_time;
+ else
+ time_to_check = priv->aging_time;
+
+ pr_debug("About to expire: %lx - %lx > %lx\n",
+ now, entry->last_used, time_to_check);
+ if (time_after(now, entry->last_used + time_to_check) &&
+ !(entry->flags & LEC_PERMANENT_FLAG) &&
+ !(entry->mac_addr[0] & 0x01)) { /* LANE2: 7.1.20 */
+ /* Remove entry */
+ pr_debug("Entry timed out\n");
+ lec_arp_remove(priv, entry);
+ lec_arp_put(entry);
+ } else {
+ /* Something else */
+ if ((entry->status == ESI_VC_PENDING ||
+ entry->status == ESI_ARP_PENDING) &&
+ time_after_eq(now, entry->timestamp +
+ priv->max_unknown_frame_time)) {
+ entry->timestamp = jiffies;
+ entry->packets_flooded = 0;
+ if (entry->status == ESI_VC_PENDING)
+ send_to_lecd(priv, l_svc_setup,
+ entry->mac_addr,
+ entry->atm_addr,
+ NULL);
+ }
+ if (entry->status == ESI_FLUSH_PENDING &&
+ time_after_eq(now, entry->timestamp +
+ priv->path_switching_delay)) {
+ lec_arp_hold(entry);
+ return true;
+ }
+ }
+
+ return false;
+}
/*
* Expire entries.
* 1. Re-set timer
@@ -1875,69 +1659,36 @@ static void lec_arp_expire_vcc(unsigned long data)
* to ESI_FORWARD_DIRECT. This causes the flush period to end
* regardless of the progress of the flush protocol.
*/
-static void lec_arp_check_expire(void *data)
+static void lec_arp_check_expire(struct work_struct *work)
{
unsigned long flags;
- struct lec_priv *priv = data;
- struct hlist_node *node, *next;
+ struct lec_priv *priv =
+ container_of(work, struct lec_priv, lec_arp_work.work);
+ struct hlist_node *next;
struct lec_arp_table *entry;
unsigned long now;
- unsigned long time_to_check;
int i;
- DPRINTK("lec_arp_check_expire %p\n", priv);
+ pr_debug("%p\n", priv);
now = jiffies;
restart:
spin_lock_irqsave(&priv->lec_arp_lock, flags);
for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) {
- hlist_for_each_entry_safe(entry, node, next, &priv->lec_arp_tables[i], next) {
- if ((entry->flags) & LEC_REMOTE_FLAG &&
- priv->topology_change)
- time_to_check = priv->forward_delay_time;
- else
- time_to_check = priv->aging_time;
-
- DPRINTK("About to expire: %lx - %lx > %lx\n",
- now, entry->last_used, time_to_check);
- if (time_after(now, entry->last_used + time_to_check)
- && !(entry->flags & LEC_PERMANENT_FLAG)
- && !(entry->mac_addr[0] & 0x01)) { /* LANE2: 7.1.20 */
- /* Remove entry */
- DPRINTK("LEC:Entry timed out\n");
- lec_arp_remove(priv, entry);
+ hlist_for_each_entry_safe(entry, next,
+ &priv->lec_arp_tables[i], next) {
+ if (__lec_arp_check_expire(entry, now, priv)) {
+ struct sk_buff *skb;
+ struct atm_vcc *vcc = entry->vcc;
+
+ spin_unlock_irqrestore(&priv->lec_arp_lock,
+ flags);
+ while ((skb = skb_dequeue(&entry->tx_wait)))
+ lec_send(vcc, skb);
+ entry->last_used = jiffies;
+ entry->status = ESI_FORWARD_DIRECT;
lec_arp_put(entry);
- } else {
- /* Something else */
- if ((entry->status == ESI_VC_PENDING ||
- entry->status == ESI_ARP_PENDING)
- && time_after_eq(now,
- entry->timestamp +
- priv->
- max_unknown_frame_time)) {
- entry->timestamp = jiffies;
- entry->packets_flooded = 0;
- if (entry->status == ESI_VC_PENDING)
- send_to_lecd(priv, l_svc_setup,
- entry->mac_addr,
- entry->atm_addr,
- NULL);
- }
- if (entry->status == ESI_FLUSH_PENDING
- &&
- time_after_eq(now, entry->timestamp +
- priv->path_switching_delay)) {
- struct sk_buff *skb;
- struct atm_vcc *vcc = entry->vcc;
-
- lec_arp_hold(entry);
- spin_unlock_irqrestore(&priv->lec_arp_lock, flags);
- while ((skb = skb_dequeue(&entry->tx_wait)) != NULL)
- lec_send(vcc, skb, entry->priv);
- entry->last_used = jiffies;
- entry->status = ESI_FORWARD_DIRECT;
- lec_arp_put(entry);
- goto restart;
- }
+
+ goto restart;
}
}
}
@@ -1948,10 +1699,11 @@ restart:
/*
* Try to find vcc where mac_address is attached.
- *
+ *
*/
static struct atm_vcc *lec_arp_resolve(struct lec_priv *priv,
- unsigned char *mac_to_find, int is_rdesc,
+ const unsigned char *mac_to_find,
+ int is_rdesc,
struct lec_arp_table **ret_entry)
{
unsigned long flags;
@@ -1962,9 +1714,8 @@ static struct atm_vcc *lec_arp_resolve(struct lec_priv *priv,
switch (priv->lane_version) {
case 1:
return priv->mcast_vcc;
- break;
case 2: /* LANE2 wants arp for multicast addresses */
- if (!compare_ether_addr(mac_to_find, bus_mac))
+ if (ether_addr_equal(mac_to_find, bus_mac))
return priv->mcast_vcc;
break;
default:
@@ -1988,9 +1739,8 @@ static struct atm_vcc *lec_arp_resolve(struct lec_priv *priv,
* If the LE_ARP cache entry is still pending, reset count to 0
* so another LE_ARP request can be made for this frame.
*/
- if (entry->status == ESI_ARP_PENDING) {
+ if (entry->status == ESI_ARP_PENDING)
entry->no_tries = 0;
- }
/*
* Data direct VC not yet set up, check to see if the unknown
* frame count is greater than the limit. If the limit has
@@ -2001,7 +1751,7 @@ static struct atm_vcc *lec_arp_resolve(struct lec_priv *priv,
entry->packets_flooded <
priv->maximum_unknown_frame_count) {
entry->packets_flooded++;
- DPRINTK("LEC_ARP: Flooding..\n");
+ pr_debug("Flooding..\n");
found = priv->mcast_vcc;
goto out;
}
@@ -2012,13 +1762,13 @@ static struct atm_vcc *lec_arp_resolve(struct lec_priv *priv,
*/
lec_arp_hold(entry);
*ret_entry = entry;
- DPRINTK("lec: entry->status %d entry->vcc %p\n", entry->status,
- entry->vcc);
+ pr_debug("entry->status %d entry->vcc %p\n", entry->status,
+ entry->vcc);
found = NULL;
} else {
/* No matching entry was found */
entry = make_entry(priv, mac_to_find);
- DPRINTK("LEC_ARP: Making entry\n");
+ pr_debug("Making entry\n");
if (!entry) {
found = priv->mcast_vcc;
goto out;
@@ -2047,21 +1797,22 @@ out:
}
static int
-lec_addr_delete(struct lec_priv *priv, unsigned char *atm_addr,
+lec_addr_delete(struct lec_priv *priv, const unsigned char *atm_addr,
unsigned long permanent)
{
unsigned long flags;
- struct hlist_node *node, *next;
+ struct hlist_node *next;
struct lec_arp_table *entry;
int i;
- DPRINTK("lec_addr_delete\n");
+ pr_debug("\n");
spin_lock_irqsave(&priv->lec_arp_lock, flags);
for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) {
- hlist_for_each_entry_safe(entry, node, next, &priv->lec_arp_tables[i], next) {
- if (!memcmp(atm_addr, entry->atm_addr, ATM_ESA_LEN)
- && (permanent ||
- !(entry->flags & LEC_PERMANENT_FLAG))) {
+ hlist_for_each_entry_safe(entry, next,
+ &priv->lec_arp_tables[i], next) {
+ if (!memcmp(atm_addr, entry->atm_addr, ATM_ESA_LEN) &&
+ (permanent ||
+ !(entry->flags & LEC_PERMANENT_FLAG))) {
lec_arp_remove(priv, entry);
lec_arp_put(entry);
}
@@ -2074,22 +1825,20 @@ lec_addr_delete(struct lec_priv *priv, unsigned char *atm_addr,
}
/*
- * Notifies: Response to arp_request (atm_addr != NULL)
+ * Notifies: Response to arp_request (atm_addr != NULL)
*/
static void
-lec_arp_update(struct lec_priv *priv, unsigned char *mac_addr,
- unsigned char *atm_addr, unsigned long remoteflag,
+lec_arp_update(struct lec_priv *priv, const unsigned char *mac_addr,
+ const unsigned char *atm_addr, unsigned long remoteflag,
unsigned int targetless_le_arp)
{
unsigned long flags;
- struct hlist_node *node, *next;
+ struct hlist_node *next;
struct lec_arp_table *entry, *tmp;
int i;
- DPRINTK("lec:%s", (targetless_le_arp) ? "targetless " : " ");
- DPRINTK("lec_arp_update mac:%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x\n",
- mac_addr[0], mac_addr[1], mac_addr[2], mac_addr[3],
- mac_addr[4], mac_addr[5]);
+ pr_debug("%smac:%pM\n",
+ (targetless_le_arp) ? "targetless " : "", mac_addr);
spin_lock_irqsave(&priv->lec_arp_lock, flags);
entry = lec_arp_find(priv, mac_addr);
@@ -2099,24 +1848,26 @@ lec_arp_update(struct lec_priv *priv, unsigned char *mac_addr,
* we have no entry in the cache. 7.1.30
*/
if (!hlist_empty(&priv->lec_arp_empty_ones)) {
- hlist_for_each_entry_safe(entry, node, next, &priv->lec_arp_empty_ones, next) {
+ hlist_for_each_entry_safe(entry, next,
+ &priv->lec_arp_empty_ones, next) {
if (memcmp(entry->atm_addr, atm_addr, ATM_ESA_LEN) == 0) {
hlist_del(&entry->next);
- del_timer(&entry->timer);
+ timer_delete(&entry->timer);
tmp = lec_arp_find(priv, mac_addr);
if (tmp) {
- del_timer(&tmp->timer);
+ timer_delete(&tmp->timer);
tmp->status = ESI_FORWARD_DIRECT;
memcpy(tmp->atm_addr, atm_addr, ATM_ESA_LEN);
tmp->vcc = entry->vcc;
tmp->old_push = entry->old_push;
tmp->last_used = jiffies;
- del_timer(&entry->timer);
+ timer_delete(&entry->timer);
lec_arp_put(entry);
entry = tmp;
} else {
entry->status = ESI_FORWARD_DIRECT;
- memcpy(entry->mac_addr, mac_addr, ETH_ALEN);
+ ether_addr_copy(entry->mac_addr,
+ mac_addr);
entry->last_used = jiffies;
lec_arp_add(priv, entry);
}
@@ -2124,7 +1875,7 @@ lec_arp_update(struct lec_priv *priv, unsigned char *mac_addr,
entry->flags |= LEC_REMOTE_FLAG;
else
entry->flags &= ~LEC_REMOTE_FLAG;
- DPRINTK("After update\n");
+ pr_debug("After update\n");
dump_arp_table(priv);
goto out;
}
@@ -2141,9 +1892,10 @@ lec_arp_update(struct lec_priv *priv, unsigned char *mac_addr,
/* Temporary, changes before end of function */
}
memcpy(entry->atm_addr, atm_addr, ATM_ESA_LEN);
- del_timer(&entry->timer);
+ timer_delete(&entry->timer);
for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) {
- hlist_for_each_entry(tmp, node, &priv->lec_arp_tables[i], next) {
+ hlist_for_each_entry(tmp,
+ &priv->lec_arp_tables[i], next) {
if (entry != tmp &&
!memcmp(tmp->atm_addr, atm_addr, ATM_ESA_LEN)) {
/* Vcc to this host exists */
@@ -2168,34 +1920,32 @@ lec_arp_update(struct lec_priv *priv, unsigned char *mac_addr,
entry->status = ESI_VC_PENDING;
send_to_lecd(priv, l_svc_setup, entry->mac_addr, atm_addr, NULL);
}
- DPRINTK("After update2\n");
+ pr_debug("After update2\n");
dump_arp_table(priv);
out:
spin_unlock_irqrestore(&priv->lec_arp_lock, flags);
}
/*
- * Notifies: Vcc setup ready
+ * Notifies: Vcc setup ready
*/
static void
-lec_vcc_added(struct lec_priv *priv, struct atmlec_ioc *ioc_data,
+lec_vcc_added(struct lec_priv *priv, const struct atmlec_ioc *ioc_data,
struct atm_vcc *vcc,
void (*old_push) (struct atm_vcc *vcc, struct sk_buff *skb))
{
unsigned long flags;
- struct hlist_node *node;
struct lec_arp_table *entry;
int i, found_entry = 0;
spin_lock_irqsave(&priv->lec_arp_lock, flags);
+ /* Vcc for Multicast Forward. No timer, LANEv2 7.1.20 and 2.3.5.3 */
if (ioc_data->receive == 2) {
- /* Vcc for Multicast Forward. No timer, LANEv2 7.1.20 and 2.3.5.3 */
-
- DPRINTK("LEC_ARP: Attaching mcast forward\n");
+ pr_debug("LEC_ARP: Attaching mcast forward\n");
#if 0
entry = lec_arp_find(priv, bus_mac);
if (!entry) {
- printk("LEC_ARP: Multicast entry not found!\n");
+ pr_info("LEC_ARP: Multicast entry not found!\n");
goto out;
}
memcpy(entry->atm_addr, ioc_data->atm_addr, ATM_ESA_LEN);
@@ -2205,7 +1955,7 @@ lec_vcc_added(struct lec_priv *priv, struct atmlec_ioc *ioc_data,
entry = make_entry(priv, bus_mac);
if (entry == NULL)
goto out;
- del_timer(&entry->timer);
+ timer_delete(&entry->timer);
memcpy(entry->atm_addr, ioc_data->atm_addr, ATM_ESA_LEN);
entry->recv_vcc = vcc;
entry->old_recv_push = old_push;
@@ -2216,24 +1966,13 @@ lec_vcc_added(struct lec_priv *priv, struct atmlec_ioc *ioc_data,
* Vcc which we don't want to make default vcc,
* attach it anyway.
*/
- DPRINTK
- ("LEC_ARP:Attaching data direct, not default: "
- "%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x\n",
- ioc_data->atm_addr[0], ioc_data->atm_addr[1],
- ioc_data->atm_addr[2], ioc_data->atm_addr[3],
- ioc_data->atm_addr[4], ioc_data->atm_addr[5],
- ioc_data->atm_addr[6], ioc_data->atm_addr[7],
- ioc_data->atm_addr[8], ioc_data->atm_addr[9],
- ioc_data->atm_addr[10], ioc_data->atm_addr[11],
- ioc_data->atm_addr[12], ioc_data->atm_addr[13],
- ioc_data->atm_addr[14], ioc_data->atm_addr[15],
- ioc_data->atm_addr[16], ioc_data->atm_addr[17],
- ioc_data->atm_addr[18], ioc_data->atm_addr[19]);
+ pr_debug("LEC_ARP:Attaching data direct, not default: %*phN\n",
+ ATM_ESA_LEN, ioc_data->atm_addr);
entry = make_entry(priv, bus_mac);
if (entry == NULL)
goto out;
memcpy(entry->atm_addr, ioc_data->atm_addr, ATM_ESA_LEN);
- memset(entry->mac_addr, 0, ETH_ALEN);
+ eth_zero_addr(entry->mac_addr);
entry->recv_vcc = vcc;
entry->old_recv_push = old_push;
entry->status = ESI_UNKNOWN;
@@ -2244,31 +1983,21 @@ lec_vcc_added(struct lec_priv *priv, struct atmlec_ioc *ioc_data,
dump_arp_table(priv);
goto out;
}
- DPRINTK
- ("LEC_ARP:Attaching data direct, default: "
- "%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x\n",
- ioc_data->atm_addr[0], ioc_data->atm_addr[1],
- ioc_data->atm_addr[2], ioc_data->atm_addr[3],
- ioc_data->atm_addr[4], ioc_data->atm_addr[5],
- ioc_data->atm_addr[6], ioc_data->atm_addr[7],
- ioc_data->atm_addr[8], ioc_data->atm_addr[9],
- ioc_data->atm_addr[10], ioc_data->atm_addr[11],
- ioc_data->atm_addr[12], ioc_data->atm_addr[13],
- ioc_data->atm_addr[14], ioc_data->atm_addr[15],
- ioc_data->atm_addr[16], ioc_data->atm_addr[17],
- ioc_data->atm_addr[18], ioc_data->atm_addr[19]);
+ pr_debug("LEC_ARP:Attaching data direct, default: %*phN\n",
+ ATM_ESA_LEN, ioc_data->atm_addr);
for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) {
- hlist_for_each_entry(entry, node, &priv->lec_arp_tables[i], next) {
+ hlist_for_each_entry(entry,
+ &priv->lec_arp_tables[i], next) {
if (memcmp
(ioc_data->atm_addr, entry->atm_addr,
ATM_ESA_LEN) == 0) {
- DPRINTK("LEC_ARP: Attaching data direct\n");
- DPRINTK("Currently -> Vcc: %d, Rvcc:%d\n",
- entry->vcc ? entry->vcc->vci : 0,
- entry->recv_vcc ? entry->recv_vcc->
- vci : 0);
+ pr_debug("LEC_ARP: Attaching data direct\n");
+ pr_debug("Currently -> Vcc: %d, Rvcc:%d\n",
+ entry->vcc ? entry->vcc->vci : 0,
+ entry->recv_vcc ? entry->recv_vcc->
+ vci : 0);
found_entry = 1;
- del_timer(&entry->timer);
+ timer_delete(&entry->timer);
entry->vcc = vcc;
entry->old_push = old_push;
if (entry->status == ESI_VC_PENDING) {
@@ -2305,7 +2034,7 @@ lec_vcc_added(struct lec_priv *priv, struct atmlec_ioc *ioc_data,
}
}
if (found_entry) {
- DPRINTK("After vcc was added\n");
+ pr_debug("After vcc was added\n");
dump_arp_table(priv);
goto out;
}
@@ -2319,13 +2048,13 @@ lec_vcc_added(struct lec_priv *priv, struct atmlec_ioc *ioc_data,
entry->vcc = vcc;
entry->old_push = old_push;
memcpy(entry->atm_addr, ioc_data->atm_addr, ATM_ESA_LEN);
- memset(entry->mac_addr, 0, ETH_ALEN);
+ eth_zero_addr(entry->mac_addr);
entry->status = ESI_UNKNOWN;
hlist_add_head(&entry->next, &priv->lec_arp_empty_ones);
entry->timer.expires = jiffies + priv->vcc_timeout_period;
entry->timer.function = lec_arp_expire_vcc;
add_timer(&entry->timer);
- DPRINTK("After vcc was added\n");
+ pr_debug("After vcc was added\n");
dump_arp_table(priv);
out:
spin_unlock_irqrestore(&priv->lec_arp_lock, flags);
@@ -2334,28 +2063,29 @@ out:
static void lec_flush_complete(struct lec_priv *priv, unsigned long tran_id)
{
unsigned long flags;
- struct hlist_node *node;
struct lec_arp_table *entry;
int i;
- DPRINTK("LEC:lec_flush_complete %lx\n", tran_id);
+ pr_debug("%lx\n", tran_id);
restart:
spin_lock_irqsave(&priv->lec_arp_lock, flags);
for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) {
- hlist_for_each_entry(entry, node, &priv->lec_arp_tables[i], next) {
- if (entry->flush_tran_id == tran_id
- && entry->status == ESI_FLUSH_PENDING) {
+ hlist_for_each_entry(entry,
+ &priv->lec_arp_tables[i], next) {
+ if (entry->flush_tran_id == tran_id &&
+ entry->status == ESI_FLUSH_PENDING) {
struct sk_buff *skb;
struct atm_vcc *vcc = entry->vcc;
lec_arp_hold(entry);
- spin_unlock_irqrestore(&priv->lec_arp_lock, flags);
- while ((skb = skb_dequeue(&entry->tx_wait)) != NULL)
- lec_send(vcc, skb, entry->priv);
+ spin_unlock_irqrestore(&priv->lec_arp_lock,
+ flags);
+ while ((skb = skb_dequeue(&entry->tx_wait)))
+ lec_send(vcc, skb);
entry->last_used = jiffies;
entry->status = ESI_FORWARD_DIRECT;
lec_arp_put(entry);
- DPRINTK("LEC_ARP: Flushed\n");
+ pr_debug("LEC_ARP: Flushed\n");
goto restart;
}
}
@@ -2366,20 +2096,20 @@ restart:
static void
lec_set_flush_tran_id(struct lec_priv *priv,
- unsigned char *atm_addr, unsigned long tran_id)
+ const unsigned char *atm_addr, unsigned long tran_id)
{
unsigned long flags;
- struct hlist_node *node;
struct lec_arp_table *entry;
int i;
spin_lock_irqsave(&priv->lec_arp_lock, flags);
for (i = 0; i < LEC_ARP_TABLE_SIZE; i++)
- hlist_for_each_entry(entry, node, &priv->lec_arp_tables[i], next) {
+ hlist_for_each_entry(entry,
+ &priv->lec_arp_tables[i], next) {
if (!memcmp(atm_addr, entry->atm_addr, ATM_ESA_LEN)) {
entry->flush_tran_id = tran_id;
- DPRINTK("Set flush transaction id to %lx for %p\n",
- tran_id, entry);
+ pr_debug("Set flush transaction id to %lx for %p\n",
+ tran_id, entry);
}
}
spin_unlock_irqrestore(&priv->lec_arp_lock, flags);
@@ -2395,7 +2125,8 @@ static int lec_mcast_make(struct lec_priv *priv, struct atm_vcc *vcc)
struct lec_vcc_priv *vpriv;
int err = 0;
- if (!(vpriv = kmalloc(sizeof(struct lec_vcc_priv), GFP_KERNEL)))
+ vpriv = kmalloc(sizeof(struct lec_vcc_priv), GFP_KERNEL);
+ if (!vpriv)
return -ENOMEM;
vpriv->xoff = 0;
vpriv->old_pop = vcc->pop;
@@ -2425,46 +2156,48 @@ out:
static void lec_vcc_close(struct lec_priv *priv, struct atm_vcc *vcc)
{
unsigned long flags;
- struct hlist_node *node, *next;
+ struct hlist_node *next;
struct lec_arp_table *entry;
int i;
- DPRINTK("LEC_ARP: lec_vcc_close vpi:%d vci:%d\n", vcc->vpi, vcc->vci);
+ pr_debug("LEC_ARP: lec_vcc_close vpi:%d vci:%d\n", vcc->vpi, vcc->vci);
dump_arp_table(priv);
spin_lock_irqsave(&priv->lec_arp_lock, flags);
for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) {
- hlist_for_each_entry_safe(entry, node, next, &priv->lec_arp_tables[i], next) {
+ hlist_for_each_entry_safe(entry, next,
+ &priv->lec_arp_tables[i], next) {
if (vcc == entry->vcc) {
lec_arp_remove(priv, entry);
lec_arp_put(entry);
- if (priv->mcast_vcc == vcc) {
+ if (priv->mcast_vcc == vcc)
priv->mcast_vcc = NULL;
- }
}
}
}
- hlist_for_each_entry_safe(entry, node, next, &priv->lec_arp_empty_ones, next) {
+ hlist_for_each_entry_safe(entry, next,
+ &priv->lec_arp_empty_ones, next) {
if (entry->vcc == vcc) {
lec_arp_clear_vccs(entry);
- del_timer(&entry->timer);
+ timer_delete(&entry->timer);
hlist_del(&entry->next);
lec_arp_put(entry);
}
}
- hlist_for_each_entry_safe(entry, node, next, &priv->lec_no_forward, next) {
+ hlist_for_each_entry_safe(entry, next,
+ &priv->lec_no_forward, next) {
if (entry->recv_vcc == vcc) {
lec_arp_clear_vccs(entry);
- del_timer(&entry->timer);
+ timer_delete(&entry->timer);
hlist_del(&entry->next);
lec_arp_put(entry);
}
}
- hlist_for_each_entry_safe(entry, node, next, &priv->mcast_fwds, next) {
+ hlist_for_each_entry_safe(entry, next, &priv->mcast_fwds, next) {
if (entry->recv_vcc == vcc) {
lec_arp_clear_vccs(entry);
/* No timer, LANEv2 7.1.20 and 2.3.5.3 */
@@ -2482,28 +2215,22 @@ lec_arp_check_empties(struct lec_priv *priv,
struct atm_vcc *vcc, struct sk_buff *skb)
{
unsigned long flags;
- struct hlist_node *node, *next;
+ struct hlist_node *next;
struct lec_arp_table *entry, *tmp;
struct lecdatahdr_8023 *hdr = (struct lecdatahdr_8023 *)skb->data;
- unsigned char *src;
-#ifdef CONFIG_TR
- struct lecdatahdr_8025 *tr_hdr = (struct lecdatahdr_8025 *)skb->data;
-
- if (priv->is_trdev)
- src = tr_hdr->h_source;
- else
-#endif
- src = hdr->h_source;
+ unsigned char *src = hdr->h_source;
spin_lock_irqsave(&priv->lec_arp_lock, flags);
- hlist_for_each_entry_safe(entry, node, next, &priv->lec_arp_empty_ones, next) {
+ hlist_for_each_entry_safe(entry, next,
+ &priv->lec_arp_empty_ones, next) {
if (vcc == entry->vcc) {
- del_timer(&entry->timer);
- memcpy(entry->mac_addr, src, ETH_ALEN);
+ timer_delete(&entry->timer);
+ ether_addr_copy(entry->mac_addr, src);
entry->status = ESI_FORWARD_DIRECT;
entry->last_used = jiffies;
/* We might have got an entry */
- if ((tmp = lec_arp_find(priv, src))) {
+ tmp = lec_arp_find(priv, src);
+ if (tmp) {
lec_arp_remove(priv, tmp);
lec_arp_put(tmp);
}
@@ -2512,9 +2239,10 @@ lec_arp_check_empties(struct lec_priv *priv,
goto out;
}
}
- DPRINTK("LEC_ARP: Arp_check_empties: entry not found!\n");
+ pr_debug("LEC_ARP: Arp_check_empties: entry not found!\n");
out:
spin_unlock_irqrestore(&priv->lec_arp_lock, flags);
}
+MODULE_DESCRIPTION("ATM LAN Emulation (LANE) support");
MODULE_LICENSE("GPL");
diff --git a/net/atm/lec.h b/net/atm/lec.h
index 24cc95f86741..be0e2667bd8c 100644
--- a/net/atm/lec.h
+++ b/net/atm/lec.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Lan Emulation client header file
*
@@ -35,31 +36,31 @@ struct lecdatahdr_8025 {
* Operations that LANE2 capable device can do. Two first functions
* are used to make the device do things. See spec 3.1.3 and 3.1.4.
*
- * The third function is intented for the MPOA component sitting on
+ * The third function is intended for the MPOA component sitting on
* top of the LANE device. The MPOA component assigns it's own function
* to (*associate_indicator)() and the LANE device will use that
* function to tell about TLVs it sees floating through.
*
*/
struct lane2_ops {
- int (*resolve) (struct net_device *dev, u8 *dst_mac, int force,
+ int (*resolve) (struct net_device *dev, const u8 *dst_mac, int force,
u8 **tlvs, u32 *sizeoftlvs);
- int (*associate_req) (struct net_device *dev, u8 *lan_dst,
- u8 *tlvs, u32 sizeoftlvs);
- void (*associate_indicator) (struct net_device *dev, u8 *mac_addr,
- u8 *tlvs, u32 sizeoftlvs);
+ int (*associate_req) (struct net_device *dev, const u8 *lan_dst,
+ const u8 *tlvs, u32 sizeoftlvs);
+ void (*associate_indicator) (struct net_device *dev, const u8 *mac_addr,
+ const u8 *tlvs, u32 sizeoftlvs);
};
/*
* ATM LAN Emulation supports both LLC & Dix Ethernet EtherType
- * frames.
+ * frames.
*
* 1. Dix Ethernet EtherType frames encoded by placing EtherType
- * field in h_type field. Data follows immediatelly after header.
+ * field in h_type field. Data follows immediately after header.
* 2. LLC Data frames whose total length, including LLC field and data,
- * but not padding required to meet the minimum data frame length,
- * is less than 1536(0x0600) MUST be encoded by placing that length
- * in the h_type field. The LLC field follows header immediatelly.
+ * but not padding required to meet the minimum data frame length,
+ * is less than ETH_P_802_3_MIN MUST be encoded by placing that length
+ * in the h_type field. The LLC field follows header immediately.
* 3. LLC data frames longer than this maximum MUST be encoded by placing
* the value 0 in the h_type field.
*
@@ -69,7 +70,6 @@ struct lane2_ops {
#define LEC_ARP_TABLE_SIZE 16
struct lec_priv {
- struct net_device_stats stats;
unsigned short lecid; /* Lecid of this client */
struct hlist_head lec_arp_empty_ones;
/* Used for storing VCC's that don't have a MAC address attached yet */
@@ -92,7 +92,7 @@ struct lec_priv {
spinlock_t lec_arp_lock;
struct atm_vcc *mcast_vcc; /* Default Multicast Send VCC */
struct atm_vcc *lecd;
- struct work_struct lec_arp_work; /* C10 */
+ struct delayed_work lec_arp_work; /* C10 */
unsigned int maximum_unknown_frame_count;
/*
* Within the period of time defined by this variable, the client will send
@@ -143,7 +143,6 @@ struct lec_priv {
int itfnum; /* e.g. 2 for lec2, 5 for lec5 */
struct lane2_ops *lane2_ops; /* can be NULL for LANE v1 */
int is_proxy; /* bridge between ATM and Ethernet */
- int is_trdev; /* Device type, 0 = Ethernet, 1 = TokenRing */
};
struct lec_vcc_priv {
diff --git a/net/atm/lec_arpc.h b/net/atm/lec_arpc.h
index ec67435a40a6..39115fe074c4 100644
--- a/net/atm/lec_arpc.h
+++ b/net/atm/lec_arpc.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Lec arp cache
*
@@ -43,11 +44,11 @@ struct lec_arp_table {
u8 *tlvs;
u32 sizeoftlvs; /*
* LANE2: Each MAC address can have TLVs
- * associated with it. sizeoftlvs tells the
+ * associated with it. sizeoftlvs tells
* the length of the tlvs array
*/
struct sk_buff_head tx_wait; /* wait queue for outgoing packets */
- atomic_t usage; /* usage count */
+ refcount_t usage; /* usage count */
};
/*
diff --git a/net/atm/mpc.c b/net/atm/mpc.c
index c18f73715ef9..f6b447bba329 100644
--- a/net/atm/mpc.c
+++ b/net/atm/mpc.c
@@ -1,5 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__
+
#include <linux/kernel.h>
#include <linux/string.h>
+#include <linux/slab.h>
#include <linux/timer.h>
#include <linux/init.h>
#include <linux/bitops.h>
@@ -13,8 +17,8 @@
#include <net/sock.h>
#include <linux/skbuff.h>
#include <linux/ip.h>
+#include <linux/uaccess.h>
#include <asm/byteorder.h>
-#include <asm/uaccess.h>
#include <net/checksum.h> /* for ip_fast_csum() */
#include <net/arp.h>
#include <net/dst.h>
@@ -32,65 +36,83 @@
#include "resources.h"
/*
- * mpc.c: Implementation of MPOA client kernel part
+ * mpc.c: Implementation of MPOA client kernel part
*/
#if 0
-#define dprintk printk /* debug */
+#define dprintk(format, args...) \
+ printk(KERN_DEBUG "mpoa:%s: " format, __func__, ##args)
+#define dprintk_cont(format, args...) printk(KERN_CONT format, ##args)
#else
-#define dprintk(format,args...)
+#define dprintk(format, args...) \
+ do { if (0) \
+ printk(KERN_DEBUG "mpoa:%s: " format, __func__, ##args);\
+ } while (0)
+#define dprintk_cont(format, args...) \
+ do { if (0) printk(KERN_CONT format, ##args); } while (0)
#endif
#if 0
-#define ddprintk printk /* more debug */
+#define ddprintk(format, args...) \
+ printk(KERN_DEBUG "mpoa:%s: " format, __func__, ##args)
+#define ddprintk_cont(format, args...) printk(KERN_CONT format, ##args)
#else
-#define ddprintk(format,args...)
+#define ddprintk(format, args...) \
+ do { if (0) \
+ printk(KERN_DEBUG "mpoa:%s: " format, __func__, ##args);\
+ } while (0)
+#define ddprintk_cont(format, args...) \
+ do { if (0) printk(KERN_CONT format, ##args); } while (0)
#endif
-
-
-#define MPOA_TAG_LEN 4
-
/* mpc_daemon -> kernel */
-static void MPOA_trigger_rcvd (struct k_message *msg, struct mpoa_client *mpc);
+static void MPOA_trigger_rcvd(struct k_message *msg, struct mpoa_client *mpc);
static void MPOA_res_reply_rcvd(struct k_message *msg, struct mpoa_client *mpc);
static void ingress_purge_rcvd(struct k_message *msg, struct mpoa_client *mpc);
static void egress_purge_rcvd(struct k_message *msg, struct mpoa_client *mpc);
static void mps_death(struct k_message *msg, struct mpoa_client *mpc);
-static void clean_up(struct k_message *msg, struct mpoa_client *mpc, int action);
-static void MPOA_cache_impos_rcvd(struct k_message *msg, struct mpoa_client *mpc);
-static void set_mpc_ctrl_addr_rcvd(struct k_message *mesg, struct mpoa_client *mpc);
-static void set_mps_mac_addr_rcvd(struct k_message *mesg, struct mpoa_client *mpc);
-
-static uint8_t *copy_macs(struct mpoa_client *mpc, uint8_t *router_mac,
- uint8_t *tlvs, uint8_t mps_macs, uint8_t device_type);
+static void clean_up(struct k_message *msg, struct mpoa_client *mpc,
+ int action);
+static void MPOA_cache_impos_rcvd(struct k_message *msg,
+ struct mpoa_client *mpc);
+static void set_mpc_ctrl_addr_rcvd(struct k_message *mesg,
+ struct mpoa_client *mpc);
+static void set_mps_mac_addr_rcvd(struct k_message *mesg,
+ struct mpoa_client *mpc);
+
+static const uint8_t *copy_macs(struct mpoa_client *mpc,
+ const uint8_t *router_mac,
+ const uint8_t *tlvs, uint8_t mps_macs,
+ uint8_t device_type);
static void purge_egress_shortcut(struct atm_vcc *vcc, eg_cache_entry *entry);
-static void send_set_mps_ctrl_addr(char *addr, struct mpoa_client *mpc);
+static void send_set_mps_ctrl_addr(const char *addr, struct mpoa_client *mpc);
static void mpoad_close(struct atm_vcc *vcc);
static int msg_from_mpoad(struct atm_vcc *vcc, struct sk_buff *skb);
static void mpc_push(struct atm_vcc *vcc, struct sk_buff *skb);
-static int mpc_send_packet(struct sk_buff *skb, struct net_device *dev);
-static int mpoa_event_listener(struct notifier_block *mpoa_notifier, unsigned long event, void *dev);
+static netdev_tx_t mpc_send_packet(struct sk_buff *skb,
+ struct net_device *dev);
+static int mpoa_event_listener(struct notifier_block *mpoa_notifier,
+ unsigned long event, void *dev);
static void mpc_timer_refresh(void);
-static void mpc_cache_check( unsigned long checking_time );
+static void mpc_cache_check(struct timer_list *unused);
static struct llc_snap_hdr llc_snap_mpoa_ctrl = {
0xaa, 0xaa, 0x03,
{0x00, 0x00, 0x5e},
{0x00, 0x03} /* For MPOA control PDUs */
-};
+};
static struct llc_snap_hdr llc_snap_mpoa_data = {
0xaa, 0xaa, 0x03,
{0x00, 0x00, 0x00},
{0x08, 0x00} /* This is for IP PDUs only */
-};
+};
static struct llc_snap_hdr llc_snap_mpoa_data_tagged = {
0xaa, 0xaa, 0x03,
{0x00, 0x00, 0x00},
{0x88, 0x4c} /* This is for tagged data PDUs */
-};
+};
static struct notifier_block mpoa_notifier = {
mpoa_event_listener,
@@ -100,18 +122,18 @@ static struct notifier_block mpoa_notifier = {
struct mpoa_client *mpcs = NULL; /* FIXME */
static struct atm_mpoa_qos *qos_head = NULL;
-static DEFINE_TIMER(mpc_timer, NULL, 0, 0);
+static DEFINE_TIMER(mpc_timer, mpc_cache_check);
static struct mpoa_client *find_mpc_by_itfnum(int itf)
{
struct mpoa_client *mpc;
-
+
mpc = mpcs; /* our global linked list */
while (mpc != NULL) {
if (mpc->dev_num == itf)
return mpc;
- mpc = mpc->next;
+ mpc = mpc->next;
}
return NULL; /* not found */
@@ -120,7 +142,7 @@ static struct mpoa_client *find_mpc_by_itfnum(int itf)
static struct mpoa_client *find_mpc_by_vcc(struct atm_vcc *vcc)
{
struct mpoa_client *mpc;
-
+
mpc = mpcs; /* our global linked list */
while (mpc != NULL) {
if (mpc->mpoad_vcc == vcc)
@@ -134,7 +156,7 @@ static struct mpoa_client *find_mpc_by_vcc(struct atm_vcc *vcc)
static struct mpoa_client *find_mpc_by_lec(struct net_device *dev)
{
struct mpoa_client *mpc;
-
+
mpc = mpcs; /* our global linked list */
while (mpc != NULL) {
if (mpc->dev == dev)
@@ -164,7 +186,7 @@ struct atm_mpoa_qos *atm_mpoa_add_qos(__be32 dst_ip, struct atm_qos *qos)
entry = kmalloc(sizeof(struct atm_mpoa_qos), GFP_KERNEL);
if (entry == NULL) {
- printk("mpoa: atm_mpoa_add_qos: out of memory\n");
+ pr_info("mpoa: out of memory\n");
return entry;
}
@@ -182,25 +204,24 @@ struct atm_mpoa_qos *atm_mpoa_search_qos(__be32 dst_ip)
struct atm_mpoa_qos *qos;
qos = qos_head;
- while( qos != NULL ){
- if(qos->ipaddr == dst_ip) {
+ while (qos) {
+ if (qos->ipaddr == dst_ip)
break;
- }
qos = qos->next;
}
return qos;
-}
+}
/*
* Returns 0 for failure
*/
int atm_mpoa_delete_qos(struct atm_mpoa_qos *entry)
{
-
struct atm_mpoa_qos *curr;
- if (entry == NULL) return 0;
+ if (entry == NULL)
+ return 0;
if (entry == qos_head) {
qos_head = qos_head->next;
kfree(entry);
@@ -230,10 +251,18 @@ void atm_mpoa_disp_qos(struct seq_file *m)
seq_printf(m, "IP address\n TX:max_pcr pcr min_pcr max_cdv max_sdu\n RX:max_pcr pcr min_pcr max_cdv max_sdu\n");
while (qos != NULL) {
- seq_printf(m, "%u.%u.%u.%u\n %-7d %-7d %-7d %-7d %-7d\n %-7d %-7d %-7d %-7d %-7d\n",
- NIPQUAD(qos->ipaddr),
- qos->qos.txtp.max_pcr, qos->qos.txtp.pcr, qos->qos.txtp.min_pcr, qos->qos.txtp.max_cdv, qos->qos.txtp.max_sdu,
- qos->qos.rxtp.max_pcr, qos->qos.rxtp.pcr, qos->qos.rxtp.min_pcr, qos->qos.rxtp.max_cdv, qos->qos.rxtp.max_sdu);
+ seq_printf(m, "%pI4\n %-7d %-7d %-7d %-7d %-7d\n %-7d %-7d %-7d %-7d %-7d\n",
+ &qos->ipaddr,
+ qos->qos.txtp.max_pcr,
+ qos->qos.txtp.pcr,
+ qos->qos.txtp.min_pcr,
+ qos->qos.txtp.max_cdv,
+ qos->qos.txtp.max_sdu,
+ qos->qos.rxtp.max_pcr,
+ qos->qos.rxtp.pcr,
+ qos->qos.rxtp.min_pcr,
+ qos->qos.rxtp.max_cdv,
+ qos->qos.rxtp.max_sdu);
qos = qos->next;
}
}
@@ -244,8 +273,8 @@ static struct net_device *find_lec_by_itfnum(int itf)
char name[IFNAMSIZ];
sprintf(name, "lec%d", itf);
- dev = dev_get_by_name(name);
-
+ dev = dev_get_by_name(&init_net, name);
+
return dev;
}
@@ -253,7 +282,7 @@ static struct mpoa_client *alloc_mpc(void)
{
struct mpoa_client *mpc;
- mpc = kzalloc(sizeof (struct mpoa_client), GFP_KERNEL);
+ mpc = kzalloc(sizeof(struct mpoa_client), GFP_KERNEL);
if (mpc == NULL)
return NULL;
rwlock_init(&mpc->ingress_lock);
@@ -263,84 +292,76 @@ static struct mpoa_client *alloc_mpc(void)
mpc->parameters.mpc_p1 = MPC_P1;
mpc->parameters.mpc_p2 = MPC_P2;
- memset(mpc->parameters.mpc_p3,0,sizeof(mpc->parameters.mpc_p3));
+ memset(mpc->parameters.mpc_p3, 0, sizeof(mpc->parameters.mpc_p3));
mpc->parameters.mpc_p4 = MPC_P4;
- mpc->parameters.mpc_p5 = MPC_P5;
+ mpc->parameters.mpc_p5 = MPC_P5;
mpc->parameters.mpc_p6 = MPC_P6;
-
+
mpcs = mpc;
-
+
return mpc;
}
/*
*
* start_mpc() puts the MPC on line. All the packets destined
- * to the lec underneath us are now being monitored and
+ * to the lec underneath us are now being monitored and
* shortcuts will be established.
*
*/
static void start_mpc(struct mpoa_client *mpc, struct net_device *dev)
{
-
- dprintk("mpoa: (%s) start_mpc:\n", mpc->dev->name);
- if (dev->hard_start_xmit == NULL) {
- printk("mpoa: (%s) start_mpc: dev->hard_start_xmit == NULL, not starting\n",
- dev->name);
- return;
- }
- mpc->old_hard_start_xmit = dev->hard_start_xmit;
- dev->hard_start_xmit = mpc_send_packet;
- return;
+ dprintk("(%s)\n", mpc->dev->name);
+ if (!dev->netdev_ops)
+ pr_info("(%s) not starting\n", dev->name);
+ else {
+ mpc->old_ops = dev->netdev_ops;
+ mpc->new_ops = *mpc->old_ops;
+ mpc->new_ops.ndo_start_xmit = mpc_send_packet;
+ dev->netdev_ops = &mpc->new_ops;
+ }
}
static void stop_mpc(struct mpoa_client *mpc)
{
-
- dprintk("mpoa: (%s) stop_mpc:", mpc->dev->name);
+ struct net_device *dev = mpc->dev;
+ dprintk("(%s)", mpc->dev->name);
/* Lets not nullify lec device's dev->hard_start_xmit */
- if (mpc->dev->hard_start_xmit != mpc_send_packet) {
- dprintk(" mpc already stopped, not fatal\n");
+ if (dev->netdev_ops != &mpc->new_ops) {
+ dprintk_cont(" mpc already stopped, not fatal\n");
return;
}
- dprintk("\n");
- mpc->dev->hard_start_xmit = mpc->old_hard_start_xmit;
- mpc->old_hard_start_xmit = NULL;
+ dprintk_cont("\n");
+
+ dev->netdev_ops = mpc->old_ops;
+ mpc->old_ops = NULL;
+
/* close_shortcuts(mpc); ??? FIXME */
-
- return;
}
static const char *mpoa_device_type_string(char type) __attribute__ ((unused));
static const char *mpoa_device_type_string(char type)
{
- switch(type) {
+ switch (type) {
case NON_MPOA:
return "non-MPOA device";
- break;
case MPS:
return "MPS";
- break;
case MPC:
return "MPC";
- break;
case MPS_AND_MPC:
return "both MPS and MPC";
- break;
- default:
- return "unspecified (non-MPOA) device";
- break;
}
- return ""; /* not reached */
+ return "unspecified (non-MPOA) device";
}
/*
- * lec device calls this via its dev->priv->lane2_ops->associate_indicator()
- * when it sees a TLV in LE_ARP packet.
+ * lec device calls this via its netdev_priv(dev)->lane2_ops
+ * ->associate_indicator() when it sees a TLV in LE_ARP packet.
* We fill in the pointer above when we see a LANE2 lec initializing
* See LANE2 spec 3.1.5
*
@@ -351,35 +372,37 @@ static const char *mpoa_device_type_string(char type)
* lec sees a TLV it uses the pointer to call this function.
*
*/
-static void lane2_assoc_ind(struct net_device *dev, uint8_t *mac_addr,
- uint8_t *tlvs, uint32_t sizeoftlvs)
+static void lane2_assoc_ind(struct net_device *dev, const u8 *mac_addr,
+ const u8 *tlvs, u32 sizeoftlvs)
{
uint32_t type;
uint8_t length, mpoa_device_type, number_of_mps_macs;
- uint8_t *end_of_tlvs;
+ const uint8_t *end_of_tlvs;
struct mpoa_client *mpc;
-
+
mpoa_device_type = number_of_mps_macs = 0; /* silence gcc */
- dprintk("mpoa: (%s) lane2_assoc_ind: received TLV(s), ", dev->name);
+ dprintk("(%s) received TLV(s), ", dev->name);
dprintk("total length of all TLVs %d\n", sizeoftlvs);
mpc = find_mpc_by_lec(dev); /* Sampo-Fix: moved here from below */
if (mpc == NULL) {
- printk("mpoa: (%s) lane2_assoc_ind: no mpc\n", dev->name);
+ pr_info("(%s) no mpc\n", dev->name);
return;
}
end_of_tlvs = tlvs + sizeoftlvs;
while (end_of_tlvs - tlvs >= 5) {
- type = (tlvs[0] << 24) | (tlvs[1] << 16) | (tlvs[2] << 8) | tlvs[3];
+ type = ((tlvs[0] << 24) | (tlvs[1] << 16) |
+ (tlvs[2] << 8) | tlvs[3]);
length = tlvs[4];
tlvs += 5;
dprintk(" type 0x%x length %02x\n", type, length);
if (tlvs + length > end_of_tlvs) {
- printk("TLV value extends past its buffer, aborting parse\n");
+ pr_info("TLV value extends past its buffer, aborting parse\n");
return;
}
-
+
if (type == 0) {
- printk("mpoa: (%s) lane2_assoc_ind: TLV type was 0, returning\n", dev->name);
+ pr_info("mpoa: (%s) TLV type was 0, returning\n",
+ dev->name);
return;
}
@@ -389,40 +412,48 @@ static void lane2_assoc_ind(struct net_device *dev, uint8_t *mac_addr,
}
mpoa_device_type = *tlvs++;
number_of_mps_macs = *tlvs++;
- dprintk("mpoa: (%s) MPOA device type '%s', ", dev->name, mpoa_device_type_string(mpoa_device_type));
+ dprintk("(%s) MPOA device type '%s', ",
+ dev->name, mpoa_device_type_string(mpoa_device_type));
if (mpoa_device_type == MPS_AND_MPC &&
length < (42 + number_of_mps_macs*ETH_ALEN)) { /* :) */
- printk("\nmpoa: (%s) lane2_assoc_ind: short MPOA Device Type TLV\n",
- dev->name);
+ pr_info("(%s) short MPOA Device Type TLV\n",
+ dev->name);
continue;
}
- if ((mpoa_device_type == MPS || mpoa_device_type == MPC)
- && length < 22 + number_of_mps_macs*ETH_ALEN) {
- printk("\nmpoa: (%s) lane2_assoc_ind: short MPOA Device Type TLV\n",
- dev->name);
+ if ((mpoa_device_type == MPS || mpoa_device_type == MPC) &&
+ length < 22 + number_of_mps_macs*ETH_ALEN) {
+ pr_info("(%s) short MPOA Device Type TLV\n", dev->name);
continue;
}
- if (mpoa_device_type != MPS && mpoa_device_type != MPS_AND_MPC) {
- dprintk("ignoring non-MPS device\n");
- if (mpoa_device_type == MPC) tlvs += 20;
+ if (mpoa_device_type != MPS &&
+ mpoa_device_type != MPS_AND_MPC) {
+ dprintk("ignoring non-MPS device ");
+ if (mpoa_device_type == MPC)
+ tlvs += 20;
continue; /* we are only interested in MPSs */
}
- if (number_of_mps_macs == 0 && mpoa_device_type == MPS_AND_MPC) {
- printk("\nmpoa: (%s) lane2_assoc_ind: MPS_AND_MPC has zero MACs\n", dev->name);
+ if (number_of_mps_macs == 0 &&
+ mpoa_device_type == MPS_AND_MPC) {
+ pr_info("(%s) MPS_AND_MPC has zero MACs\n", dev->name);
continue; /* someone should read the spec */
}
- dprintk("this MPS has %d MAC addresses\n", number_of_mps_macs);
-
- /* ok, now we can go and tell our daemon the control address of MPS */
+ dprintk_cont("this MPS has %d MAC addresses\n",
+ number_of_mps_macs);
+
+ /*
+ * ok, now we can go and tell our daemon
+ * the control address of MPS
+ */
send_set_mps_ctrl_addr(tlvs, mpc);
-
- tlvs = copy_macs(mpc, mac_addr, tlvs, number_of_mps_macs, mpoa_device_type);
- if (tlvs == NULL) return;
+
+ tlvs = copy_macs(mpc, mac_addr, tlvs,
+ number_of_mps_macs, mpoa_device_type);
+ if (tlvs == NULL)
+ return;
}
if (end_of_tlvs - tlvs != 0)
- printk("mpoa: (%s) lane2_assoc_ind: ignoring %Zd bytes of trailing TLV carbage\n",
- dev->name, end_of_tlvs - tlvs);
- return;
+ pr_info("(%s) ignoring %zd bytes of trailing TLV garbage\n",
+ dev->name, end_of_tlvs - tlvs);
}
/*
@@ -430,22 +461,25 @@ static void lane2_assoc_ind(struct net_device *dev, uint8_t *mac_addr,
* plus the possible MAC address(es) to mpc->mps_macs.
* For a freshly allocated MPOA client mpc->mps_macs == 0.
*/
-static uint8_t *copy_macs(struct mpoa_client *mpc, uint8_t *router_mac,
- uint8_t *tlvs, uint8_t mps_macs, uint8_t device_type)
+static const uint8_t *copy_macs(struct mpoa_client *mpc,
+ const uint8_t *router_mac,
+ const uint8_t *tlvs, uint8_t mps_macs,
+ uint8_t device_type)
{
int num_macs;
num_macs = (mps_macs > 1) ? mps_macs : 1;
if (mpc->number_of_mps_macs != num_macs) { /* need to reallocate? */
- if (mpc->number_of_mps_macs != 0) kfree(mpc->mps_macs);
+ if (mpc->number_of_mps_macs != 0)
+ kfree(mpc->mps_macs);
mpc->number_of_mps_macs = 0;
- mpc->mps_macs = kmalloc(num_macs*ETH_ALEN, GFP_KERNEL);
+ mpc->mps_macs = kmalloc_array(ETH_ALEN, num_macs, GFP_KERNEL);
if (mpc->mps_macs == NULL) {
- printk("mpoa: (%s) copy_macs: out of mem\n", mpc->dev->name);
+ pr_info("(%s) out of mem\n", mpc->dev->name);
return NULL;
}
}
- memcpy(mpc->mps_macs, router_mac, ETH_ALEN);
+ ether_addr_copy(mpc->mps_macs, router_mac);
tlvs += 20; if (device_type == MPS_AND_MPC) tlvs += 20;
if (mps_macs > 0)
memcpy(mpc->mps_macs, tlvs, mps_macs*ETH_ALEN);
@@ -474,24 +508,30 @@ static int send_via_shortcut(struct sk_buff *skb, struct mpoa_client *mpc)
iph = (struct iphdr *)buff;
ipaddr = iph->daddr;
- ddprintk("mpoa: (%s) send_via_shortcut: ipaddr 0x%x\n", mpc->dev->name, ipaddr);
+ ddprintk("(%s) ipaddr 0x%x\n",
+ mpc->dev->name, ipaddr);
entry = mpc->in_ops->get(ipaddr, mpc);
if (entry == NULL) {
entry = mpc->in_ops->add_entry(ipaddr, mpc);
- if (entry != NULL) mpc->in_ops->put(entry);
+ if (entry != NULL)
+ mpc->in_ops->put(entry);
return 1;
}
- if (mpc->in_ops->cache_hit(entry, mpc) != OPEN){ /* threshold not exceeded or VCC not ready */
- ddprintk("mpoa: (%s) send_via_shortcut: cache_hit: returns != OPEN\n", mpc->dev->name);
+ /* threshold not exceeded or VCC not ready */
+ if (mpc->in_ops->cache_hit(entry, mpc) != OPEN) {
+ ddprintk("(%s) cache_hit: returns != OPEN\n",
+ mpc->dev->name);
mpc->in_ops->put(entry);
return 1;
}
- ddprintk("mpoa: (%s) send_via_shortcut: using shortcut\n", mpc->dev->name);
+ ddprintk("(%s) using shortcut\n",
+ mpc->dev->name);
/* MPOA spec A.1.4, MPOA client must decrement IP ttl at least by one */
if (iph->ttl <= 1) {
- ddprintk("mpoa: (%s) send_via_shortcut: IP ttl = %u, using LANE\n", mpc->dev->name, iph->ttl);
+ ddprintk("(%s) IP ttl = %u, using LANE\n",
+ mpc->dev->name, iph->ttl);
mpc->in_ops->put(entry);
return 1;
}
@@ -500,19 +540,23 @@ static int send_via_shortcut(struct sk_buff *skb, struct mpoa_client *mpc)
iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
if (entry->ctrl_info.tag != 0) {
- ddprintk("mpoa: (%s) send_via_shortcut: adding tag 0x%x\n", mpc->dev->name, entry->ctrl_info.tag);
+ ddprintk("(%s) adding tag 0x%x\n",
+ mpc->dev->name, entry->ctrl_info.tag);
tagged_llc_snap_hdr.tag = entry->ctrl_info.tag;
- skb_pull(skb, ETH_HLEN); /* get rid of Eth header */
- skb_push(skb, sizeof(tagged_llc_snap_hdr)); /* add LLC/SNAP header */
- memcpy(skb->data, &tagged_llc_snap_hdr, sizeof(tagged_llc_snap_hdr));
+ skb_pull(skb, ETH_HLEN); /* get rid of Eth header */
+ skb_push(skb, sizeof(tagged_llc_snap_hdr));
+ /* add LLC/SNAP header */
+ skb_copy_to_linear_data(skb, &tagged_llc_snap_hdr,
+ sizeof(tagged_llc_snap_hdr));
} else {
- skb_pull(skb, ETH_HLEN); /* get rid of Eth header */
- skb_push(skb, sizeof(struct llc_snap_hdr)); /* add LLC/SNAP header + tag */
- memcpy(skb->data, &llc_snap_mpoa_data, sizeof(struct llc_snap_hdr));
+ skb_pull(skb, ETH_HLEN); /* get rid of Eth header */
+ skb_push(skb, sizeof(struct llc_snap_hdr));
+ /* add LLC/SNAP header + tag */
+ skb_copy_to_linear_data(skb, &llc_snap_mpoa_data,
+ sizeof(struct llc_snap_hdr));
}
- atomic_add(skb->truesize, &sk_atm(entry->shortcut)->sk_wmem_alloc);
- ATM_SKB(skb)->atm_options = entry->shortcut->atm_options;
+ atm_account_tx(entry->shortcut, skb);
entry->shortcut->send(entry->shortcut, skb);
entry->packets_fwded++;
mpc->in_ops->put(entry);
@@ -523,16 +567,16 @@ static int send_via_shortcut(struct sk_buff *skb, struct mpoa_client *mpc)
/*
* Probably needs some error checks and locking, not sure...
*/
-static int mpc_send_packet(struct sk_buff *skb, struct net_device *dev)
+static netdev_tx_t mpc_send_packet(struct sk_buff *skb,
+ struct net_device *dev)
{
- int retval;
struct mpoa_client *mpc;
struct ethhdr *eth;
int i = 0;
-
+
mpc = find_mpc_by_lec(dev); /* this should NEVER fail */
- if(mpc == NULL) {
- printk("mpoa: (%s) mpc_send_packet: no MPC found\n", dev->name);
+ if (mpc == NULL) {
+ pr_info("(%s) no MPC found\n", dev->name);
goto non_ip;
}
@@ -540,17 +584,22 @@ static int mpc_send_packet(struct sk_buff *skb, struct net_device *dev)
if (eth->h_proto != htons(ETH_P_IP))
goto non_ip; /* Multi-Protocol Over ATM :-) */
+ /* Weed out funny packets (e.g., AF_PACKET or raw). */
+ if (skb->len < ETH_HLEN + sizeof(struct iphdr))
+ goto non_ip;
+ skb_set_network_header(skb, ETH_HLEN);
+ if (skb->len < ETH_HLEN + ip_hdr(skb)->ihl * 4 || ip_hdr(skb)->ihl < 5)
+ goto non_ip;
+
while (i < mpc->number_of_mps_macs) {
- if (!compare_ether_addr(eth->h_dest, (mpc->mps_macs + i*ETH_ALEN)))
- if ( send_via_shortcut(skb, mpc) == 0 ) /* try shortcut */
- return 0; /* success! */
+ if (ether_addr_equal(eth->h_dest, mpc->mps_macs + i * ETH_ALEN))
+ if (send_via_shortcut(skb, mpc) == 0) /* try shortcut */
+ return NETDEV_TX_OK;
i++;
}
- non_ip:
- retval = mpc->old_hard_start_xmit(skb,dev);
-
- return retval;
+non_ip:
+ return __netdev_start_xmit(mpc->old_ops, skb, dev, false);
}
static int atm_mpoa_vcc_attach(struct atm_vcc *vcc, void __user *arg)
@@ -563,31 +612,34 @@ static int atm_mpoa_vcc_attach(struct atm_vcc *vcc, void __user *arg)
bytes_left = copy_from_user(&ioc_data, arg, sizeof(struct atmmpc_ioc));
if (bytes_left != 0) {
- printk("mpoa: mpc_vcc_attach: Short read (missed %d bytes) from userland\n", bytes_left);
+ pr_info("mpoa:Short read (missed %d bytes) from userland\n",
+ bytes_left);
return -EFAULT;
}
ipaddr = ioc_data.ipaddr;
if (ioc_data.dev_num < 0 || ioc_data.dev_num >= MAX_LEC_ITF)
return -EINVAL;
-
+
mpc = find_mpc_by_itfnum(ioc_data.dev_num);
if (mpc == NULL)
return -EINVAL;
-
+
if (ioc_data.type == MPC_SOCKET_INGRESS) {
in_entry = mpc->in_ops->get(ipaddr, mpc);
- if (in_entry == NULL || in_entry->entry_state < INGRESS_RESOLVED) {
- printk("mpoa: (%s) mpc_vcc_attach: did not find RESOLVED entry from ingress cache\n",
+ if (in_entry == NULL ||
+ in_entry->entry_state < INGRESS_RESOLVED) {
+ pr_info("(%s) did not find RESOLVED entry from ingress cache\n",
mpc->dev->name);
- if (in_entry != NULL) mpc->in_ops->put(in_entry);
+ if (in_entry != NULL)
+ mpc->in_ops->put(in_entry);
return -EINVAL;
}
- printk("mpoa: (%s) mpc_vcc_attach: attaching ingress SVC, entry = %u.%u.%u.%u\n",
- mpc->dev->name, NIPQUAD(in_entry->ctrl_info.in_dst_ip));
+ pr_info("(%s) attaching ingress SVC, entry = %pI4\n",
+ mpc->dev->name, &in_entry->ctrl_info.in_dst_ip);
in_entry->shortcut = vcc;
mpc->in_ops->put(in_entry);
} else {
- printk("mpoa: (%s) mpc_vcc_attach: attaching egress SVC\n", mpc->dev->name);
+ pr_info("(%s) attaching egress SVC\n", mpc->dev->name);
}
vcc->proto_data = mpc->dev;
@@ -604,32 +656,30 @@ static void mpc_vcc_close(struct atm_vcc *vcc, struct net_device *dev)
struct mpoa_client *mpc;
in_cache_entry *in_entry;
eg_cache_entry *eg_entry;
-
+
mpc = find_mpc_by_lec(dev);
if (mpc == NULL) {
- printk("mpoa: (%s) mpc_vcc_close: close for unknown MPC\n", dev->name);
+ pr_info("(%s) close for unknown MPC\n", dev->name);
return;
}
- dprintk("mpoa: (%s) mpc_vcc_close:\n", dev->name);
+ dprintk("(%s)\n", dev->name);
in_entry = mpc->in_ops->get_by_vcc(vcc, mpc);
if (in_entry) {
- dprintk("mpoa: (%s) mpc_vcc_close: ingress SVC closed ip = %u.%u.%u.%u\n",
- mpc->dev->name, NIPQUAD(in_entry->ctrl_info.in_dst_ip));
+ dprintk("(%s) ingress SVC closed ip = %pI4\n",
+ mpc->dev->name, &in_entry->ctrl_info.in_dst_ip);
in_entry->shortcut = NULL;
mpc->in_ops->put(in_entry);
}
eg_entry = mpc->eg_ops->get_by_vcc(vcc, mpc);
if (eg_entry) {
- dprintk("mpoa: (%s) mpc_vcc_close: egress SVC closed\n", mpc->dev->name);
+ dprintk("(%s) egress SVC closed\n", mpc->dev->name);
eg_entry->shortcut = NULL;
mpc->eg_ops->put(eg_entry);
}
if (in_entry == NULL && eg_entry == NULL)
- dprintk("mpoa: (%s) mpc_vcc_close: unused vcc closed\n", dev->name);
-
- return;
+ dprintk("(%s) unused vcc closed\n", dev->name);
}
static void mpc_push(struct atm_vcc *vcc, struct sk_buff *skb)
@@ -640,22 +690,23 @@ static void mpc_push(struct atm_vcc *vcc, struct sk_buff *skb)
struct mpoa_client *mpc;
__be32 tag;
char *tmp;
-
- ddprintk("mpoa: (%s) mpc_push:\n", dev->name);
+
+ ddprintk("(%s)\n", dev->name);
if (skb == NULL) {
- dprintk("mpoa: (%s) mpc_push: null skb, closing VCC\n", dev->name);
+ dprintk("(%s) null skb, closing VCC\n", dev->name);
mpc_vcc_close(vcc, dev);
return;
}
-
+
skb->dev = dev;
- if (memcmp(skb->data, &llc_snap_mpoa_ctrl, sizeof(struct llc_snap_hdr)) == 0) {
+ if (memcmp(skb->data, &llc_snap_mpoa_ctrl,
+ sizeof(struct llc_snap_hdr)) == 0) {
struct sock *sk = sk_atm(vcc);
- dprintk("mpoa: (%s) mpc_push: control packet arrived\n", dev->name);
+ dprintk("(%s) control packet arrived\n", dev->name);
/* Pass control packets to daemon */
skb_queue_tail(&sk->sk_receive_queue, skb);
- sk->sk_data_ready(sk, skb->len);
+ sk->sk_data_ready(sk);
return;
}
@@ -664,20 +715,22 @@ static void mpc_push(struct atm_vcc *vcc, struct sk_buff *skb)
mpc = find_mpc_by_lec(dev);
if (mpc == NULL) {
- printk("mpoa: (%s) mpc_push: unknown MPC\n", dev->name);
+ pr_info("(%s) unknown MPC\n", dev->name);
return;
}
- if (memcmp(skb->data, &llc_snap_mpoa_data_tagged, sizeof(struct llc_snap_hdr)) == 0) { /* MPOA tagged data */
- ddprintk("mpoa: (%s) mpc_push: tagged data packet arrived\n", dev->name);
+ if (memcmp(skb->data, &llc_snap_mpoa_data_tagged,
+ sizeof(struct llc_snap_hdr)) == 0) { /* MPOA tagged data */
+ ddprintk("(%s) tagged data packet arrived\n", dev->name);
- } else if (memcmp(skb->data, &llc_snap_mpoa_data, sizeof(struct llc_snap_hdr)) == 0) { /* MPOA data */
- printk("mpoa: (%s) mpc_push: non-tagged data packet arrived\n", dev->name);
- printk(" mpc_push: non-tagged data unsupported, purging\n");
+ } else if (memcmp(skb->data, &llc_snap_mpoa_data,
+ sizeof(struct llc_snap_hdr)) == 0) { /* MPOA data */
+ pr_info("(%s) Unsupported non-tagged data packet arrived. Purging\n",
+ dev->name);
dev_kfree_skb_any(skb);
return;
} else {
- printk("mpoa: (%s) mpc_push: garbage arrived, purging\n", dev->name);
+ pr_info("(%s) garbage arrived, purging\n", dev->name);
dev_kfree_skb_any(skb);
return;
}
@@ -687,45 +740,46 @@ static void mpc_push(struct atm_vcc *vcc, struct sk_buff *skb)
eg = mpc->eg_ops->get_by_tag(tag, mpc);
if (eg == NULL) {
- printk("mpoa: (%s) mpc_push: Didn't find egress cache entry, tag = %u\n",
- dev->name,tag);
+ pr_info("mpoa: (%s) Didn't find egress cache entry, tag = %u\n",
+ dev->name, tag);
purge_egress_shortcut(vcc, NULL);
dev_kfree_skb_any(skb);
return;
}
-
+
/*
* See if ingress MPC is using shortcut we opened as a return channel.
* This means we have a bi-directional vcc opened by us.
- */
+ */
if (eg->shortcut == NULL) {
eg->shortcut = vcc;
- printk("mpoa: (%s) mpc_push: egress SVC in use\n", dev->name);
+ pr_info("(%s) egress SVC in use\n", dev->name);
}
- skb_pull(skb, sizeof(struct llc_snap_hdr) + sizeof(tag)); /* get rid of LLC/SNAP header */
- new_skb = skb_realloc_headroom(skb, eg->ctrl_info.DH_length); /* LLC/SNAP is shorter than MAC header :( */
+ skb_pull(skb, sizeof(struct llc_snap_hdr) + sizeof(tag));
+ /* get rid of LLC/SNAP header */
+ new_skb = skb_realloc_headroom(skb, eg->ctrl_info.DH_length);
+ /* LLC/SNAP is shorter than MAC header :( */
dev_kfree_skb_any(skb);
- if (new_skb == NULL){
+ if (new_skb == NULL) {
mpc->eg_ops->put(eg);
return;
}
skb_push(new_skb, eg->ctrl_info.DH_length); /* add MAC header */
- memcpy(new_skb->data, eg->ctrl_info.DLL_header, eg->ctrl_info.DH_length);
+ skb_copy_to_linear_data(new_skb, eg->ctrl_info.DLL_header,
+ eg->ctrl_info.DH_length);
new_skb->protocol = eth_type_trans(new_skb, dev);
- new_skb->nh.raw = new_skb->data;
+ skb_reset_network_header(new_skb);
- eg->latest_ip_addr = new_skb->nh.iph->saddr;
+ eg->latest_ip_addr = ip_hdr(new_skb)->saddr;
eg->packets_rcvd++;
mpc->eg_ops->put(eg);
- memset(ATM_SKB(skb), 0, sizeof(struct atm_skb_data));
+ memset(ATM_SKB(new_skb), 0, sizeof(struct atm_skb_data));
netif_rx(new_skb);
-
- return;
}
-static struct atmdev_ops mpc_ops = { /* only send is required */
+static const struct atmdev_ops mpc_ops = { /* only send is required */
.close = mpoad_close,
.send = msg_from_mpoad
};
@@ -734,66 +788,66 @@ static struct atm_dev mpc_dev = {
.ops = &mpc_ops,
.type = "mpc",
.number = 42,
- .lock = SPIN_LOCK_UNLOCKED
+ .lock = __SPIN_LOCK_UNLOCKED(mpc_dev.lock)
/* members not explicitly initialised will be 0 */
};
-static int atm_mpoa_mpoad_attach (struct atm_vcc *vcc, int arg)
+static int atm_mpoa_mpoad_attach(struct atm_vcc *vcc, int arg)
{
struct mpoa_client *mpc;
struct lec_priv *priv;
int err;
-
+
if (mpcs == NULL) {
- init_timer(&mpc_timer);
mpc_timer_refresh();
/* This lets us now how our LECs are doing */
err = register_netdevice_notifier(&mpoa_notifier);
if (err < 0) {
- del_timer(&mpc_timer);
+ timer_delete(&mpc_timer);
return err;
}
}
-
+
mpc = find_mpc_by_itfnum(arg);
if (mpc == NULL) {
- dprintk("mpoa: mpoad_attach: allocating new mpc for itf %d\n", arg);
+ dprintk("allocating new mpc for itf %d\n", arg);
mpc = alloc_mpc();
if (mpc == NULL)
return -ENOMEM;
mpc->dev_num = arg;
- mpc->dev = find_lec_by_itfnum(arg); /* NULL if there was no lec */
+ mpc->dev = find_lec_by_itfnum(arg);
+ /* NULL if there was no lec */
}
if (mpc->mpoad_vcc) {
- printk("mpoa: mpoad_attach: mpoad is already present for itf %d\n", arg);
+ pr_info("mpoad is already present for itf %d\n", arg);
return -EADDRINUSE;
}
if (mpc->dev) { /* check if the lec is LANE2 capable */
- priv = (struct lec_priv *)mpc->dev->priv;
+ priv = netdev_priv(mpc->dev);
if (priv->lane_version < 2) {
dev_put(mpc->dev);
mpc->dev = NULL;
} else
- priv->lane2_ops->associate_indicator = lane2_assoc_ind;
+ priv->lane2_ops->associate_indicator = lane2_assoc_ind;
}
mpc->mpoad_vcc = vcc;
vcc->dev = &mpc_dev;
vcc_insert_socket(sk_atm(vcc));
- set_bit(ATM_VF_META,&vcc->flags);
- set_bit(ATM_VF_READY,&vcc->flags);
+ set_bit(ATM_VF_META, &vcc->flags);
+ set_bit(ATM_VF_READY, &vcc->flags);
if (mpc->dev) {
char empty[ATM_ESA_LEN];
memset(empty, 0, ATM_ESA_LEN);
-
+
start_mpc(mpc, mpc->dev);
/* set address if mpcd e.g. gets killed and restarted.
* If we do not do it now we have to wait for the next LE_ARP
*/
- if ( memcmp(mpc->mps_ctrl_addr, empty, ATM_ESA_LEN) != 0 )
+ if (memcmp(mpc->mps_ctrl_addr, empty, ATM_ESA_LEN) != 0)
send_set_mps_ctrl_addr(mpc->mps_ctrl_addr, mpc);
}
@@ -801,17 +855,15 @@ static int atm_mpoa_mpoad_attach (struct atm_vcc *vcc, int arg)
return arg;
}
-static void send_set_mps_ctrl_addr(char *addr, struct mpoa_client *mpc)
+static void send_set_mps_ctrl_addr(const char *addr, struct mpoa_client *mpc)
{
struct k_message mesg;
- memcpy (mpc->mps_ctrl_addr, addr, ATM_ESA_LEN);
-
+ memcpy(mpc->mps_ctrl_addr, addr, ATM_ESA_LEN);
+
mesg.type = SET_MPS_CTRL_ADDR;
memcpy(mesg.MPS_ctrl, addr, ATM_ESA_LEN);
msg_to_mpoad(&mesg, mpc);
-
- return;
}
static void mpoad_close(struct atm_vcc *vcc)
@@ -821,17 +873,17 @@ static void mpoad_close(struct atm_vcc *vcc)
mpc = find_mpc_by_vcc(vcc);
if (mpc == NULL) {
- printk("mpoa: mpoad_close: did not find MPC\n");
+ pr_info("did not find MPC\n");
return;
}
if (!mpc->mpoad_vcc) {
- printk("mpoa: mpoad_close: close for non-present mpoad\n");
+ pr_info("close for non-present mpoad\n");
return;
}
-
+
mpc->mpoad_vcc = NULL;
if (mpc->dev) {
- struct lec_priv *priv = (struct lec_priv *)mpc->dev->priv;
+ struct lec_priv *priv = netdev_priv(mpc->dev);
priv->lane2_ops->associate_indicator = NULL;
stop_mpc(mpc);
dev_put(mpc->dev);
@@ -844,12 +896,10 @@ static void mpoad_close(struct atm_vcc *vcc)
atm_return(vcc, skb->truesize);
kfree_skb(skb);
}
-
- printk("mpoa: (%s) going down\n",
+
+ pr_info("(%s) going down\n",
(mpc->dev) ? mpc->dev->name : "<unknown>");
module_put(THIS_MODULE);
-
- return;
}
/*
@@ -857,63 +907,63 @@ static void mpoad_close(struct atm_vcc *vcc)
*/
static int msg_from_mpoad(struct atm_vcc *vcc, struct sk_buff *skb)
{
-
+
struct mpoa_client *mpc = find_mpc_by_vcc(vcc);
- struct k_message *mesg = (struct k_message*)skb->data;
- atomic_sub(skb->truesize, &sk_atm(vcc)->sk_wmem_alloc);
-
+ struct k_message *mesg = (struct k_message *)skb->data;
+ WARN_ON(refcount_sub_and_test(skb->truesize, &sk_atm(vcc)->sk_wmem_alloc));
+
if (mpc == NULL) {
- printk("mpoa: msg_from_mpoad: no mpc found\n");
+ pr_info("no mpc found\n");
return 0;
}
- dprintk("mpoa: (%s) msg_from_mpoad:", (mpc->dev) ? mpc->dev->name : "<unknown>");
- switch(mesg->type) {
+ dprintk("(%s)", mpc->dev ? mpc->dev->name : "<unknown>");
+ switch (mesg->type) {
case MPOA_RES_REPLY_RCVD:
- dprintk(" mpoa_res_reply_rcvd\n");
+ dprintk_cont("mpoa_res_reply_rcvd\n");
MPOA_res_reply_rcvd(mesg, mpc);
break;
case MPOA_TRIGGER_RCVD:
- dprintk(" mpoa_trigger_rcvd\n");
+ dprintk_cont("mpoa_trigger_rcvd\n");
MPOA_trigger_rcvd(mesg, mpc);
break;
case INGRESS_PURGE_RCVD:
- dprintk(" nhrp_purge_rcvd\n");
+ dprintk_cont("nhrp_purge_rcvd\n");
ingress_purge_rcvd(mesg, mpc);
break;
case EGRESS_PURGE_RCVD:
- dprintk(" egress_purge_reply_rcvd\n");
+ dprintk_cont("egress_purge_reply_rcvd\n");
egress_purge_rcvd(mesg, mpc);
break;
case MPS_DEATH:
- dprintk(" mps_death\n");
+ dprintk_cont("mps_death\n");
mps_death(mesg, mpc);
break;
case CACHE_IMPOS_RCVD:
- dprintk(" cache_impos_rcvd\n");
+ dprintk_cont("cache_impos_rcvd\n");
MPOA_cache_impos_rcvd(mesg, mpc);
break;
case SET_MPC_CTRL_ADDR:
- dprintk(" set_mpc_ctrl_addr\n");
+ dprintk_cont("set_mpc_ctrl_addr\n");
set_mpc_ctrl_addr_rcvd(mesg, mpc);
break;
case SET_MPS_MAC_ADDR:
- dprintk(" set_mps_mac_addr\n");
+ dprintk_cont("set_mps_mac_addr\n");
set_mps_mac_addr_rcvd(mesg, mpc);
break;
case CLEAN_UP_AND_EXIT:
- dprintk(" clean_up_and_exit\n");
+ dprintk_cont("clean_up_and_exit\n");
clean_up(mesg, mpc, DIE);
break;
case RELOAD:
- dprintk(" reload\n");
+ dprintk_cont("reload\n");
clean_up(mesg, mpc, RELOAD);
break;
case SET_MPC_PARAMS:
- dprintk(" set_mpc_params\n");
+ dprintk_cont("set_mpc_params\n");
mpc->parameters = mesg->content.params;
break;
default:
- dprintk(" unknown message %d\n", mesg->type);
+ dprintk_cont("unknown message %d\n", mesg->type);
break;
}
kfree_skb(skb);
@@ -928,7 +978,7 @@ int msg_to_mpoad(struct k_message *mesg, struct mpoa_client *mpc)
struct sock *sk;
if (mpc == NULL || !mpc->mpoad_vcc) {
- printk("mpoa: msg_to_mpoad: mesg %d to a non-existent mpoad\n", mesg->type);
+ pr_info("mesg %d to a non-existent mpoad\n", mesg->type);
return -ENXIO;
}
@@ -936,53 +986,55 @@ int msg_to_mpoad(struct k_message *mesg, struct mpoa_client *mpc)
if (skb == NULL)
return -ENOMEM;
skb_put(skb, sizeof(struct k_message));
- memcpy(skb->data, mesg, sizeof(struct k_message));
+ skb_copy_to_linear_data(skb, mesg, sizeof(*mesg));
atm_force_charge(mpc->mpoad_vcc, skb->truesize);
-
+
sk = sk_atm(mpc->mpoad_vcc);
skb_queue_tail(&sk->sk_receive_queue, skb);
- sk->sk_data_ready(sk, skb->len);
+ sk->sk_data_ready(sk);
return 0;
}
-static int mpoa_event_listener(struct notifier_block *mpoa_notifier, unsigned long event, void *dev_ptr)
+static int mpoa_event_listener(struct notifier_block *mpoa_notifier,
+ unsigned long event, void *ptr)
{
- struct net_device *dev;
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
struct mpoa_client *mpc;
struct lec_priv *priv;
- dev = (struct net_device *)dev_ptr;
- if (dev->name == NULL || strncmp(dev->name, "lec", 3))
+ if (!net_eq(dev_net(dev), &init_net))
+ return NOTIFY_DONE;
+
+ if (strncmp(dev->name, "lec", 3))
return NOTIFY_DONE; /* we are only interested in lec:s */
-
+
switch (event) {
case NETDEV_REGISTER: /* a new lec device was allocated */
- priv = (struct lec_priv *)dev->priv;
+ priv = netdev_priv(dev);
if (priv->lane_version < 2)
break;
priv->lane2_ops->associate_indicator = lane2_assoc_ind;
mpc = find_mpc_by_itfnum(priv->itfnum);
if (mpc == NULL) {
- dprintk("mpoa: mpoa_event_listener: allocating new mpc for %s\n",
- dev->name);
+ dprintk("allocating new mpc for %s\n", dev->name);
mpc = alloc_mpc();
if (mpc == NULL) {
- printk("mpoa: mpoa_event_listener: no new mpc");
+ pr_info("no new mpc");
break;
}
}
mpc->dev_num = priv->itfnum;
mpc->dev = dev;
dev_hold(dev);
- dprintk("mpoa: (%s) was initialized\n", dev->name);
+ dprintk("(%s) was initialized\n", dev->name);
break;
case NETDEV_UNREGISTER:
/* the lec device was deallocated */
mpc = find_mpc_by_lec(dev);
if (mpc == NULL)
break;
- dprintk("mpoa: device (%s) was deallocated\n", dev->name);
+ dprintk("device (%s) was deallocated\n", dev->name);
stop_mpc(mpc);
dev_put(mpc->dev);
mpc->dev = NULL;
@@ -992,9 +1044,8 @@ static int mpoa_event_listener(struct notifier_block *mpoa_notifier, unsigned lo
mpc = find_mpc_by_lec(dev);
if (mpc == NULL)
break;
- if (mpc->mpoad_vcc != NULL) {
+ if (mpc->mpoad_vcc != NULL)
start_mpc(mpc, dev);
- }
break;
case NETDEV_DOWN:
/* the dev was ifconfig'ed down */
@@ -1004,9 +1055,8 @@ static int mpoa_event_listener(struct notifier_block *mpoa_notifier, unsigned lo
mpc = find_mpc_by_lec(dev);
if (mpc == NULL)
break;
- if (mpc->mpoad_vcc != NULL) {
+ if (mpc->mpoad_vcc != NULL)
stop_mpc(mpc);
- }
break;
case NETDEV_REBOOT:
case NETDEV_CHANGE:
@@ -1033,54 +1083,56 @@ static void MPOA_trigger_rcvd(struct k_message *msg, struct mpoa_client *mpc)
in_cache_entry *entry;
entry = mpc->in_ops->get(dst_ip, mpc);
- if(entry == NULL){
+ if (entry == NULL) {
entry = mpc->in_ops->add_entry(dst_ip, mpc);
entry->entry_state = INGRESS_RESOLVING;
msg->type = SND_MPOA_RES_RQST;
msg->content.in_info = entry->ctrl_info;
msg_to_mpoad(msg, mpc);
- do_gettimeofday(&(entry->reply_wait));
+ entry->reply_wait = ktime_get_seconds();
mpc->in_ops->put(entry);
return;
}
-
- if(entry->entry_state == INGRESS_INVALID){
+
+ if (entry->entry_state == INGRESS_INVALID) {
entry->entry_state = INGRESS_RESOLVING;
msg->type = SND_MPOA_RES_RQST;
msg->content.in_info = entry->ctrl_info;
msg_to_mpoad(msg, mpc);
- do_gettimeofday(&(entry->reply_wait));
+ entry->reply_wait = ktime_get_seconds();
mpc->in_ops->put(entry);
return;
}
-
- printk("mpoa: (%s) MPOA_trigger_rcvd: entry already in resolving state\n",
+
+ pr_info("(%s) entry already in resolving state\n",
(mpc->dev) ? mpc->dev->name : "<unknown>");
mpc->in_ops->put(entry);
- return;
}
/*
* Things get complicated because we have to check if there's an egress
- * shortcut with suitable traffic parameters we could use.
+ * shortcut with suitable traffic parameters we could use.
*/
-static void check_qos_and_open_shortcut(struct k_message *msg, struct mpoa_client *client, in_cache_entry *entry)
+static void check_qos_and_open_shortcut(struct k_message *msg,
+ struct mpoa_client *client,
+ in_cache_entry *entry)
{
__be32 dst_ip = msg->content.in_info.in_dst_ip;
struct atm_mpoa_qos *qos = atm_mpoa_search_qos(dst_ip);
eg_cache_entry *eg_entry = client->eg_ops->get_by_src_ip(dst_ip, client);
- if(eg_entry && eg_entry->shortcut){
- if(eg_entry->shortcut->qos.txtp.traffic_class &
- msg->qos.txtp.traffic_class &
- (qos ? qos->qos.txtp.traffic_class : ATM_UBR | ATM_CBR)){
- if(eg_entry->shortcut->qos.txtp.traffic_class == ATM_UBR)
- entry->shortcut = eg_entry->shortcut;
- else if(eg_entry->shortcut->qos.txtp.max_pcr > 0)
- entry->shortcut = eg_entry->shortcut;
+ if (eg_entry && eg_entry->shortcut) {
+ if (eg_entry->shortcut->qos.txtp.traffic_class &
+ msg->qos.txtp.traffic_class &
+ (qos ? qos->qos.txtp.traffic_class : ATM_UBR | ATM_CBR)) {
+ if (eg_entry->shortcut->qos.txtp.traffic_class == ATM_UBR)
+ entry->shortcut = eg_entry->shortcut;
+ else if (eg_entry->shortcut->qos.txtp.max_pcr > 0)
+ entry->shortcut = eg_entry->shortcut;
}
- if(entry->shortcut){
- dprintk("mpoa: (%s) using egress SVC to reach %u.%u.%u.%u\n",client->dev->name, NIPQUAD(dst_ip));
+ if (entry->shortcut) {
+ dprintk("(%s) using egress SVC to reach %pI4\n",
+ client->dev->name, &dst_ip);
client->eg_ops->put(eg_entry);
return;
}
@@ -1090,14 +1142,14 @@ static void check_qos_and_open_shortcut(struct k_message *msg, struct mpoa_clien
/* No luck in the egress cache we must open an ingress SVC */
msg->type = OPEN_INGRESS_SVC;
- if (qos && (qos->qos.txtp.traffic_class == msg->qos.txtp.traffic_class))
- {
+ if (qos &&
+ (qos->qos.txtp.traffic_class == msg->qos.txtp.traffic_class)) {
msg->qos = qos->qos;
- printk("mpoa: (%s) trying to get a CBR shortcut\n",client->dev->name);
- }
- else memset(&msg->qos,0,sizeof(struct atm_qos));
+ pr_info("(%s) trying to get a CBR shortcut\n",
+ client->dev->name);
+ } else
+ memset(&msg->qos, 0, sizeof(struct atm_qos));
msg_to_mpoad(msg, client);
- return;
}
static void MPOA_res_reply_rcvd(struct k_message *msg, struct mpoa_client *mpc)
@@ -1105,39 +1157,44 @@ static void MPOA_res_reply_rcvd(struct k_message *msg, struct mpoa_client *mpc)
__be32 dst_ip = msg->content.in_info.in_dst_ip;
in_cache_entry *entry = mpc->in_ops->get(dst_ip, mpc);
- dprintk("mpoa: (%s) MPOA_res_reply_rcvd: ip %u.%u.%u.%u\n", mpc->dev->name, NIPQUAD(dst_ip));
- ddprintk("mpoa: (%s) MPOA_res_reply_rcvd() entry = %p", mpc->dev->name, entry);
- if(entry == NULL){
- printk("\nmpoa: (%s) ARGH, received res. reply for an entry that doesn't exist.\n", mpc->dev->name);
+ dprintk("(%s) ip %pI4\n",
+ mpc->dev->name, &dst_ip);
+ ddprintk("(%s) entry = %p",
+ mpc->dev->name, entry);
+ if (entry == NULL) {
+ pr_info("(%s) ARGH, received res. reply for an entry that doesn't exist.\n",
+ mpc->dev->name);
return;
}
- ddprintk(" entry_state = %d ", entry->entry_state);
+ ddprintk_cont(" entry_state = %d ", entry->entry_state);
if (entry->entry_state == INGRESS_RESOLVED) {
- printk("\nmpoa: (%s) MPOA_res_reply_rcvd for RESOLVED entry!\n", mpc->dev->name);
+ pr_info("(%s) RESOLVED entry!\n", mpc->dev->name);
mpc->in_ops->put(entry);
return;
}
entry->ctrl_info = msg->content.in_info;
- do_gettimeofday(&(entry->tv));
- do_gettimeofday(&(entry->reply_wait)); /* Used in refreshing func from now on */
+ entry->time = ktime_get_seconds();
+ /* Used in refreshing func from now on */
+ entry->reply_wait = ktime_get_seconds();
entry->refresh_time = 0;
- ddprintk("entry->shortcut = %p\n", entry->shortcut);
+ ddprintk_cont("entry->shortcut = %p\n", entry->shortcut);
- if(entry->entry_state == INGRESS_RESOLVING && entry->shortcut != NULL){
- entry->entry_state = INGRESS_RESOLVED;
+ if (entry->entry_state == INGRESS_RESOLVING &&
+ entry->shortcut != NULL) {
+ entry->entry_state = INGRESS_RESOLVED;
mpc->in_ops->put(entry);
return; /* Shortcut already open... */
}
if (entry->shortcut != NULL) {
- printk("mpoa: (%s) MPOA_res_reply_rcvd: entry->shortcut != NULL, impossible!\n",
- mpc->dev->name);
+ pr_info("(%s) entry->shortcut != NULL, impossible!\n",
+ mpc->dev->name);
mpc->in_ops->put(entry);
return;
}
-
+
check_qos_and_open_shortcut(msg, mpc, entry);
entry->entry_state = INGRESS_RESOLVED;
mpc->in_ops->put(entry);
@@ -1152,32 +1209,31 @@ static void ingress_purge_rcvd(struct k_message *msg, struct mpoa_client *mpc)
__be32 mask = msg->ip_mask;
in_cache_entry *entry = mpc->in_ops->get_with_mask(dst_ip, mpc, mask);
- if(entry == NULL){
- printk("mpoa: (%s) ingress_purge_rcvd: purge for a non-existing entry, ", mpc->dev->name);
- printk("ip = %u.%u.%u.%u\n", NIPQUAD(dst_ip));
+ if (entry == NULL) {
+ pr_info("(%s) purge for a non-existing entry, ip = %pI4\n",
+ mpc->dev->name, &dst_ip);
return;
}
do {
- dprintk("mpoa: (%s) ingress_purge_rcvd: removing an ingress entry, ip = %u.%u.%u.%u\n" ,
- mpc->dev->name, NIPQUAD(dst_ip));
+ dprintk("(%s) removing an ingress entry, ip = %pI4\n",
+ mpc->dev->name, &dst_ip);
write_lock_bh(&mpc->ingress_lock);
mpc->in_ops->remove_entry(entry, mpc);
write_unlock_bh(&mpc->ingress_lock);
mpc->in_ops->put(entry);
entry = mpc->in_ops->get_with_mask(dst_ip, mpc, mask);
} while (entry != NULL);
-
- return;
-}
+}
static void egress_purge_rcvd(struct k_message *msg, struct mpoa_client *mpc)
{
__be32 cache_id = msg->content.eg_info.cache_id;
eg_cache_entry *entry = mpc->eg_ops->get_by_cache_id(cache_id, mpc);
-
+
if (entry == NULL) {
- dprintk("mpoa: (%s) egress_purge_rcvd: purge for a non-existing entry\n", mpc->dev->name);
+ dprintk("(%s) purge for a non-existing entry\n",
+ mpc->dev->name);
return;
}
@@ -1186,9 +1242,7 @@ static void egress_purge_rcvd(struct k_message *msg, struct mpoa_client *mpc)
write_unlock_irq(&mpc->egress_lock);
mpc->eg_ops->put(entry);
-
- return;
-}
+}
static void purge_egress_shortcut(struct atm_vcc *vcc, eg_cache_entry *entry)
{
@@ -1196,15 +1250,15 @@ static void purge_egress_shortcut(struct atm_vcc *vcc, eg_cache_entry *entry)
struct k_message *purge_msg;
struct sk_buff *skb;
- dprintk("mpoa: purge_egress_shortcut: entering\n");
+ dprintk("entering\n");
if (vcc == NULL) {
- printk("mpoa: purge_egress_shortcut: vcc == NULL\n");
+ pr_info("vcc == NULL\n");
return;
}
skb = alloc_skb(sizeof(struct k_message), GFP_ATOMIC);
if (skb == NULL) {
- printk("mpoa: purge_egress_shortcut: out of memory\n");
+ pr_info("out of memory\n");
return;
}
@@ -1219,24 +1273,22 @@ static void purge_egress_shortcut(struct atm_vcc *vcc, eg_cache_entry *entry)
sk = sk_atm(vcc);
skb_queue_tail(&sk->sk_receive_queue, skb);
- sk->sk_data_ready(sk, skb->len);
- dprintk("mpoa: purge_egress_shortcut: exiting:\n");
-
- return;
+ sk->sk_data_ready(sk);
+ dprintk("exiting\n");
}
/*
* Our MPS died. Tell our daemon to send NHRP data plane purge to each
* of the egress shortcuts we have.
*/
-static void mps_death( struct k_message * msg, struct mpoa_client * mpc )
+static void mps_death(struct k_message *msg, struct mpoa_client *mpc)
{
eg_cache_entry *entry;
- dprintk("mpoa: (%s) mps_death:\n", mpc->dev->name);
+ dprintk("(%s)\n", mpc->dev->name);
- if(memcmp(msg->MPS_ctrl, mpc->mps_ctrl_addr, ATM_ESA_LEN)){
- printk("mpoa: (%s) mps_death: wrong MPS\n", mpc->dev->name);
+ if (memcmp(msg->MPS_ctrl, mpc->mps_ctrl_addr, ATM_ESA_LEN)) {
+ pr_info("(%s) wrong MPS\n", mpc->dev->name);
return;
}
@@ -1251,38 +1303,38 @@ static void mps_death( struct k_message * msg, struct mpoa_client * mpc )
mpc->in_ops->destroy_cache(mpc);
mpc->eg_ops->destroy_cache(mpc);
-
- return;
}
-static void MPOA_cache_impos_rcvd( struct k_message * msg, struct mpoa_client * mpc)
+static void MPOA_cache_impos_rcvd(struct k_message *msg,
+ struct mpoa_client *mpc)
{
uint16_t holding_time;
eg_cache_entry *entry = mpc->eg_ops->get_by_cache_id(msg->content.eg_info.cache_id, mpc);
-
+
holding_time = msg->content.eg_info.holding_time;
- dprintk("mpoa: (%s) MPOA_cache_impos_rcvd: entry = %p, holding_time = %u\n",
- mpc->dev->name, entry, holding_time);
- if(entry == NULL && holding_time) {
+ dprintk("(%s) entry = %p, holding_time = %u\n",
+ mpc->dev->name, entry, holding_time);
+ if (entry == NULL && !holding_time)
+ return;
+ if (entry == NULL && holding_time) {
entry = mpc->eg_ops->add_entry(msg, mpc);
mpc->eg_ops->put(entry);
return;
}
- if(holding_time){
+ if (holding_time) {
mpc->eg_ops->update(entry, holding_time);
return;
}
-
+
write_lock_irq(&mpc->egress_lock);
mpc->eg_ops->remove_entry(entry, mpc);
write_unlock_irq(&mpc->egress_lock);
mpc->eg_ops->put(entry);
-
- return;
}
-static void set_mpc_ctrl_addr_rcvd(struct k_message *mesg, struct mpoa_client *mpc)
+static void set_mpc_ctrl_addr_rcvd(struct k_message *mesg,
+ struct mpoa_client *mpc)
{
struct lec_priv *priv;
int i, retval ;
@@ -1297,39 +1349,40 @@ static void set_mpc_ctrl_addr_rcvd(struct k_message *mesg, struct mpoa_client *m
memcpy(&tlv[7], mesg->MPS_ctrl, ATM_ESA_LEN); /* MPC ctrl ATM addr */
memcpy(mpc->our_ctrl_addr, mesg->MPS_ctrl, ATM_ESA_LEN);
- dprintk("mpoa: (%s) setting MPC ctrl ATM address to ",
- (mpc->dev) ? mpc->dev->name : "<unknown>");
+ dprintk("(%s) setting MPC ctrl ATM address to",
+ mpc->dev ? mpc->dev->name : "<unknown>");
for (i = 7; i < sizeof(tlv); i++)
- dprintk("%02x ", tlv[i]);
- dprintk("\n");
+ dprintk_cont(" %02x", tlv[i]);
+ dprintk_cont("\n");
if (mpc->dev) {
- priv = (struct lec_priv *)mpc->dev->priv;
- retval = priv->lane2_ops->associate_req(mpc->dev, mpc->dev->dev_addr, tlv, sizeof(tlv));
+ priv = netdev_priv(mpc->dev);
+ retval = priv->lane2_ops->associate_req(mpc->dev,
+ mpc->dev->dev_addr,
+ tlv, sizeof(tlv));
if (retval == 0)
- printk("mpoa: (%s) MPOA device type TLV association failed\n", mpc->dev->name);
+ pr_info("(%s) MPOA device type TLV association failed\n",
+ mpc->dev->name);
retval = priv->lane2_ops->resolve(mpc->dev, NULL, 1, NULL, NULL);
if (retval < 0)
- printk("mpoa: (%s) targetless LE_ARP request failed\n", mpc->dev->name);
+ pr_info("(%s) targetless LE_ARP request failed\n",
+ mpc->dev->name);
}
-
- return;
}
-static void set_mps_mac_addr_rcvd(struct k_message *msg, struct mpoa_client *client)
+static void set_mps_mac_addr_rcvd(struct k_message *msg,
+ struct mpoa_client *client)
{
- if(client->number_of_mps_macs)
+ if (client->number_of_mps_macs)
kfree(client->mps_macs);
client->number_of_mps_macs = 0;
client->mps_macs = kmemdup(msg->MPS_ctrl, ETH_ALEN, GFP_KERNEL);
if (client->mps_macs == NULL) {
- printk("mpoa: set_mps_mac_addr_rcvd: out of memory\n");
+ pr_info("out of memory\n");
return;
}
client->number_of_mps_macs = 1;
-
- return;
}
/*
@@ -1345,54 +1398,53 @@ static void clean_up(struct k_message *msg, struct mpoa_client *mpc, int action)
/* FIXME: This knows too much of the cache structure */
read_lock_irq(&mpc->egress_lock);
entry = mpc->eg_cache;
- while (entry != NULL){
- msg->content.eg_info = entry->ctrl_info;
- dprintk("mpoa: cache_id %u\n", entry->ctrl_info.cache_id);
- msg_to_mpoad(msg, mpc);
- entry = entry->next;
+ while (entry != NULL) {
+ msg->content.eg_info = entry->ctrl_info;
+ dprintk("cache_id %u\n", entry->ctrl_info.cache_id);
+ msg_to_mpoad(msg, mpc);
+ entry = entry->next;
}
read_unlock_irq(&mpc->egress_lock);
msg->type = action;
msg_to_mpoad(msg, mpc);
- return;
}
+static unsigned long checking_time;
+
static void mpc_timer_refresh(void)
{
mpc_timer.expires = jiffies + (MPC_P2 * HZ);
- mpc_timer.data = mpc_timer.expires;
- mpc_timer.function = mpc_cache_check;
+ checking_time = mpc_timer.expires;
add_timer(&mpc_timer);
-
- return;
}
-static void mpc_cache_check( unsigned long checking_time )
+static void mpc_cache_check(struct timer_list *unused)
{
struct mpoa_client *mpc = mpcs;
static unsigned long previous_resolving_check_time;
static unsigned long previous_refresh_time;
-
- while( mpc != NULL ){
+
+ while (mpc != NULL) {
mpc->in_ops->clear_count(mpc);
mpc->eg_ops->clear_expired(mpc);
- if(checking_time - previous_resolving_check_time > mpc->parameters.mpc_p4 * HZ ){
+ if (checking_time - previous_resolving_check_time >
+ mpc->parameters.mpc_p4 * HZ) {
mpc->in_ops->check_resolving(mpc);
previous_resolving_check_time = checking_time;
}
- if(checking_time - previous_refresh_time > mpc->parameters.mpc_p5 * HZ ){
+ if (checking_time - previous_refresh_time >
+ mpc->parameters.mpc_p5 * HZ) {
mpc->in_ops->refresh(mpc);
previous_refresh_time = checking_time;
}
mpc = mpc->next;
}
mpc_timer_refresh();
-
- return;
}
-static int atm_mpoa_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+static int atm_mpoa_ioctl(struct socket *sock, unsigned int cmd,
+ unsigned long arg)
{
int err = 0;
struct atm_vcc *vcc = ATM_SD(sock);
@@ -1404,21 +1456,20 @@ static int atm_mpoa_ioctl(struct socket *sock, unsigned int cmd, unsigned long a
return -EPERM;
switch (cmd) {
- case ATMMPC_CTRL:
- err = atm_mpoa_mpoad_attach(vcc, (int)arg);
- if (err >= 0)
- sock->state = SS_CONNECTED;
- break;
- case ATMMPC_DATA:
- err = atm_mpoa_vcc_attach(vcc, (void __user *)arg);
- break;
- default:
- break;
+ case ATMMPC_CTRL:
+ err = atm_mpoa_mpoad_attach(vcc, (int)arg);
+ if (err >= 0)
+ sock->state = SS_CONNECTED;
+ break;
+ case ATMMPC_DATA:
+ err = atm_mpoa_vcc_attach(vcc, (void __user *)arg);
+ break;
+ default:
+ break;
}
return err;
}
-
static struct atm_ioctl atm_ioctl_ops = {
.owner = THIS_MODULE,
.ioctl = atm_mpoa_ioctl,
@@ -1429,9 +1480,9 @@ static __init int atm_mpoa_init(void)
register_atm_ioctl(&atm_ioctl_ops);
if (mpc_proc_init() != 0)
- printk(KERN_INFO "mpoa: failed to initialize /proc/mpoa\n");
+ pr_info("failed to initialize /proc/mpoa\n");
- printk("mpc.c: " __DATE__ " " __TIME__ " initialized\n");
+ pr_info("mpc.c: initialized\n");
return 0;
}
@@ -1444,7 +1495,7 @@ static void __exit atm_mpoa_cleanup(void)
mpc_proc_clean();
- del_timer(&mpc_timer);
+ timer_delete_sync(&mpc_timer);
unregister_netdevice_notifier(&mpoa_notifier);
deregister_atm_ioctl(&atm_ioctl_ops);
@@ -1454,19 +1505,19 @@ static void __exit atm_mpoa_cleanup(void)
tmp = mpc->next;
if (mpc->dev != NULL) {
stop_mpc(mpc);
- priv = (struct lec_priv *)mpc->dev->priv;
+ priv = netdev_priv(mpc->dev);
if (priv->lane2_ops != NULL)
priv->lane2_ops->associate_indicator = NULL;
}
- ddprintk("mpoa: cleanup_module: about to clear caches\n");
+ ddprintk("about to clear caches\n");
mpc->in_ops->destroy_cache(mpc);
mpc->eg_ops->destroy_cache(mpc);
- ddprintk("mpoa: cleanup_module: caches cleared\n");
+ ddprintk("caches cleared\n");
kfree(mpc->mps_macs);
memset(mpc, 0, sizeof(struct mpoa_client));
- ddprintk("mpoa: cleanup_module: about to kfree %p\n", mpc);
+ ddprintk("about to kfree %p\n", mpc);
kfree(mpc);
- ddprintk("mpoa: cleanup_module: next mpc is at %p\n", tmp);
+ ddprintk("next mpc is at %p\n", tmp);
mpc = tmp;
}
@@ -1474,15 +1525,14 @@ static void __exit atm_mpoa_cleanup(void)
qos_head = NULL;
while (qos != NULL) {
nextqos = qos->next;
- dprintk("mpoa: cleanup_module: freeing qos entry %p\n", qos);
+ dprintk("freeing qos entry %p\n", qos);
kfree(qos);
qos = nextqos;
}
-
- return;
}
module_init(atm_mpoa_init);
module_exit(atm_mpoa_cleanup);
+MODULE_DESCRIPTION("Multi-Protocol Over ATM (MPOA) driver");
MODULE_LICENSE("GPL");
diff --git a/net/atm/mpc.h b/net/atm/mpc.h
index 51f460d005c3..454abd07651a 100644
--- a/net/atm/mpc.h
+++ b/net/atm/mpc.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _MPC_H_
#define _MPC_H_
@@ -12,32 +13,35 @@
int msg_to_mpoad(struct k_message *msg, struct mpoa_client *mpc);
struct mpoa_client {
- struct mpoa_client *next;
- struct net_device *dev; /* lec in question */
- int dev_num; /* e.g. 2 for lec2 */
- int (*old_hard_start_xmit)(struct sk_buff *skb, struct net_device *dev);
- struct atm_vcc *mpoad_vcc; /* control channel to mpoad */
- uint8_t mps_ctrl_addr[ATM_ESA_LEN]; /* MPS control ATM address */
- uint8_t our_ctrl_addr[ATM_ESA_LEN]; /* MPC's control ATM address */
-
- rwlock_t ingress_lock;
- struct in_cache_ops *in_ops; /* ingress cache operations */
- in_cache_entry *in_cache; /* the ingress cache of this MPC */
-
- rwlock_t egress_lock;
- struct eg_cache_ops *eg_ops; /* egress cache operations */
- eg_cache_entry *eg_cache; /* the egress cache of this MPC */
-
- uint8_t *mps_macs; /* array of MPS MAC addresses, >=1 */
- int number_of_mps_macs; /* number of the above MAC addresses */
- struct mpc_parameters parameters; /* parameters for this client */
+ struct mpoa_client *next;
+ struct net_device *dev; /* lec in question */
+ int dev_num; /* e.g. 2 for lec2 */
+
+ struct atm_vcc *mpoad_vcc; /* control channel to mpoad */
+ uint8_t mps_ctrl_addr[ATM_ESA_LEN]; /* MPS control ATM address */
+ uint8_t our_ctrl_addr[ATM_ESA_LEN]; /* MPC's control ATM address */
+
+ rwlock_t ingress_lock;
+ const struct in_cache_ops *in_ops; /* ingress cache operations */
+ in_cache_entry *in_cache; /* the ingress cache of this MPC */
+
+ rwlock_t egress_lock;
+ const struct eg_cache_ops *eg_ops; /* egress cache operations */
+ eg_cache_entry *eg_cache; /* the egress cache of this MPC */
+
+ uint8_t *mps_macs; /* array of MPS MAC addresses, >=1 */
+ int number_of_mps_macs; /* number of the above MAC addresses */
+ struct mpc_parameters parameters; /* parameters for this client */
+
+ const struct net_device_ops *old_ops;
+ struct net_device_ops new_ops;
};
struct atm_mpoa_qos {
- struct atm_mpoa_qos *next;
- __be32 ipaddr;
- struct atm_qos qos;
+ struct atm_mpoa_qos *next;
+ __be32 ipaddr;
+ struct atm_qos qos;
};
diff --git a/net/atm/mpoa_caches.c b/net/atm/mpoa_caches.c
index 697a081533b5..f7a2f0e41105 100644
--- a/net/atm/mpoa_caches.c
+++ b/net/atm/mpoa_caches.c
@@ -1,5 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/types.h>
#include <linux/atmmpc.h>
+#include <linux/slab.h>
#include <linux/time.h>
#include "mpoa_caches.h"
@@ -11,15 +13,23 @@
*/
#if 0
-#define dprintk printk /* debug */
+#define dprintk(format, args...) \
+ printk(KERN_DEBUG "mpoa:%s: " format, __FILE__, ##args) /* debug */
#else
-#define dprintk(format,args...)
+#define dprintk(format, args...) \
+ do { if (0) \
+ printk(KERN_DEBUG "mpoa:%s: " format, __FILE__, ##args);\
+ } while (0)
#endif
#if 0
-#define ddprintk printk /* more debug */
+#define ddprintk(format, args...) \
+ printk(KERN_DEBUG "mpoa:%s: " format, __FILE__, ##args) /* debug */
#else
-#define ddprintk(format,args...)
+#define ddprintk(format, args...) \
+ do { if (0) \
+ printk(KERN_DEBUG "mpoa:%s: " format, __FILE__, ##args);\
+ } while (0)
#endif
static in_cache_entry *in_cache_get(__be32 dst_ip,
@@ -29,9 +39,9 @@ static in_cache_entry *in_cache_get(__be32 dst_ip,
read_lock_bh(&client->ingress_lock);
entry = client->in_cache;
- while(entry != NULL){
- if( entry->ctrl_info.in_dst_ip == dst_ip ){
- atomic_inc(&entry->use);
+ while (entry != NULL) {
+ if (entry->ctrl_info.in_dst_ip == dst_ip) {
+ refcount_inc(&entry->use);
read_unlock_bh(&client->ingress_lock);
return entry;
}
@@ -50,9 +60,9 @@ static in_cache_entry *in_cache_get_with_mask(__be32 dst_ip,
read_lock_bh(&client->ingress_lock);
entry = client->in_cache;
- while(entry != NULL){
- if((entry->ctrl_info.in_dst_ip & mask) == (dst_ip & mask )){
- atomic_inc(&entry->use);
+ while (entry != NULL) {
+ if ((entry->ctrl_info.in_dst_ip & mask) == (dst_ip & mask)) {
+ refcount_inc(&entry->use);
read_unlock_bh(&client->ingress_lock);
return entry;
}
@@ -65,15 +75,15 @@ static in_cache_entry *in_cache_get_with_mask(__be32 dst_ip,
}
static in_cache_entry *in_cache_get_by_vcc(struct atm_vcc *vcc,
- struct mpoa_client *client )
+ struct mpoa_client *client)
{
in_cache_entry *entry;
read_lock_bh(&client->ingress_lock);
entry = client->in_cache;
- while(entry != NULL){
- if(entry->shortcut == vcc) {
- atomic_inc(&entry->use);
+ while (entry != NULL) {
+ if (entry->shortcut == vcc) {
+ refcount_inc(&entry->use);
read_unlock_bh(&client->ingress_lock);
return entry;
}
@@ -90,14 +100,14 @@ static in_cache_entry *in_cache_add_entry(__be32 dst_ip,
in_cache_entry *entry = kzalloc(sizeof(in_cache_entry), GFP_KERNEL);
if (entry == NULL) {
- printk("mpoa: mpoa_caches.c: new_in_cache_entry: out of memory\n");
+ pr_info("mpoa: mpoa_caches.c: new_in_cache_entry: out of memory\n");
return NULL;
}
- dprintk("mpoa: mpoa_caches.c: adding an ingress entry, ip = %u.%u.%u.%u\n", NIPQUAD(dst_ip));
+ dprintk("adding an ingress entry, ip = %pI4\n", &dst_ip);
- atomic_set(&entry->use, 1);
- dprintk("mpoa: mpoa_caches.c: new_in_cache_entry: about to lock\n");
+ refcount_set(&entry->use, 1);
+ dprintk("new_in_cache_entry: about to lock\n");
write_lock_bh(&client->ingress_lock);
entry->next = client->in_cache;
entry->prev = NULL;
@@ -107,15 +117,15 @@ static in_cache_entry *in_cache_add_entry(__be32 dst_ip,
memcpy(entry->MPS_ctrl_ATM_addr, client->mps_ctrl_addr, ATM_ESA_LEN);
entry->ctrl_info.in_dst_ip = dst_ip;
- do_gettimeofday(&(entry->tv));
+ entry->time = ktime_get_seconds();
entry->retry_time = client->parameters.mpc_p4;
entry->count = 1;
entry->entry_state = INGRESS_INVALID;
entry->ctrl_info.holding_time = HOLDING_TIME_DEFAULT;
- atomic_inc(&entry->use);
+ refcount_inc(&entry->use);
write_unlock_bh(&client->ingress_lock);
- dprintk("mpoa: mpoa_caches.c: new_in_cache_entry: unlocked\n");
+ dprintk("new_in_cache_entry: unlocked\n");
return entry;
}
@@ -126,39 +136,42 @@ static int cache_hit(in_cache_entry *entry, struct mpoa_client *mpc)
struct k_message msg;
entry->count++;
- if(entry->entry_state == INGRESS_RESOLVED && entry->shortcut != NULL)
+ if (entry->entry_state == INGRESS_RESOLVED && entry->shortcut != NULL)
return OPEN;
- if(entry->entry_state == INGRESS_REFRESHING){
- if(entry->count > mpc->parameters.mpc_p1){
+ if (entry->entry_state == INGRESS_REFRESHING) {
+ if (entry->count > mpc->parameters.mpc_p1) {
msg.type = SND_MPOA_RES_RQST;
msg.content.in_info = entry->ctrl_info;
memcpy(msg.MPS_ctrl, mpc->mps_ctrl_addr, ATM_ESA_LEN);
qos = atm_mpoa_search_qos(entry->ctrl_info.in_dst_ip);
- if (qos != NULL) msg.qos = qos->qos;
+ if (qos != NULL)
+ msg.qos = qos->qos;
msg_to_mpoad(&msg, mpc);
- do_gettimeofday(&(entry->reply_wait));
+ entry->reply_wait = ktime_get_seconds();
entry->entry_state = INGRESS_RESOLVING;
}
- if(entry->shortcut != NULL)
+ if (entry->shortcut != NULL)
return OPEN;
return CLOSED;
}
- if(entry->entry_state == INGRESS_RESOLVING && entry->shortcut != NULL)
+ if (entry->entry_state == INGRESS_RESOLVING && entry->shortcut != NULL)
return OPEN;
- if( entry->count > mpc->parameters.mpc_p1 &&
- entry->entry_state == INGRESS_INVALID){
- dprintk("mpoa: (%s) mpoa_caches.c: threshold exceeded for ip %u.%u.%u.%u, sending MPOA res req\n", mpc->dev->name, NIPQUAD(entry->ctrl_info.in_dst_ip));
+ if (entry->count > mpc->parameters.mpc_p1 &&
+ entry->entry_state == INGRESS_INVALID) {
+ dprintk("(%s) threshold exceeded for ip %pI4, sending MPOA res req\n",
+ mpc->dev->name, &entry->ctrl_info.in_dst_ip);
entry->entry_state = INGRESS_RESOLVING;
- msg.type = SND_MPOA_RES_RQST;
- memcpy(msg.MPS_ctrl, mpc->mps_ctrl_addr, ATM_ESA_LEN );
+ msg.type = SND_MPOA_RES_RQST;
+ memcpy(msg.MPS_ctrl, mpc->mps_ctrl_addr, ATM_ESA_LEN);
msg.content.in_info = entry->ctrl_info;
qos = atm_mpoa_search_qos(entry->ctrl_info.in_dst_ip);
- if (qos != NULL) msg.qos = qos->qos;
- msg_to_mpoad( &msg, mpc);
- do_gettimeofday(&(entry->reply_wait));
+ if (qos != NULL)
+ msg.qos = qos->qos;
+ msg_to_mpoad(&msg, mpc);
+ entry->reply_wait = ktime_get_seconds();
}
return CLOSED;
@@ -166,12 +179,9 @@ static int cache_hit(in_cache_entry *entry, struct mpoa_client *mpc)
static void in_cache_put(in_cache_entry *entry)
{
- if (atomic_dec_and_test(&entry->use)) {
- memset(entry, 0, sizeof(in_cache_entry));
- kfree(entry);
+ if (refcount_dec_and_test(&entry->use)) {
+ kfree_sensitive(entry);
}
-
- return;
}
/*
@@ -184,7 +194,8 @@ static void in_cache_remove_entry(in_cache_entry *entry,
struct k_message msg;
vcc = entry->shortcut;
- dprintk("mpoa: mpoa_caches.c: removing an ingress entry, ip = %u.%u.%u.%u\n",NIPQUAD(entry->ctrl_info.in_dst_ip));
+ dprintk("removing an ingress entry, ip = %pI4\n",
+ &entry->ctrl_info.in_dst_ip);
if (entry->prev != NULL)
entry->prev->next = entry->next;
@@ -193,49 +204,45 @@ static void in_cache_remove_entry(in_cache_entry *entry,
if (entry->next != NULL)
entry->next->prev = entry->prev;
client->in_ops->put(entry);
- if(client->in_cache == NULL && client->eg_cache == NULL){
+ if (client->in_cache == NULL && client->eg_cache == NULL) {
msg.type = STOP_KEEP_ALIVE_SM;
- msg_to_mpoad(&msg,client);
+ msg_to_mpoad(&msg, client);
}
/* Check if the egress side still uses this VCC */
if (vcc != NULL) {
- eg_cache_entry *eg_entry = client->eg_ops->get_by_vcc(vcc, client);
+ eg_cache_entry *eg_entry = client->eg_ops->get_by_vcc(vcc,
+ client);
if (eg_entry != NULL) {
client->eg_ops->put(eg_entry);
return;
}
vcc_release_async(vcc, -EPIPE);
}
-
- return;
}
-
/* Call this every MPC-p2 seconds... Not exactly correct solution,
but an easy one... */
static void clear_count_and_expired(struct mpoa_client *client)
{
in_cache_entry *entry, *next_entry;
- struct timeval now;
+ time64_t now;
- do_gettimeofday(&now);
+ now = ktime_get_seconds();
write_lock_bh(&client->ingress_lock);
entry = client->in_cache;
- while(entry != NULL){
- entry->count=0;
+ while (entry != NULL) {
+ entry->count = 0;
next_entry = entry->next;
- if((now.tv_sec - entry->tv.tv_sec)
- > entry->ctrl_info.holding_time){
- dprintk("mpoa: mpoa_caches.c: holding time expired, ip = %u.%u.%u.%u\n", NIPQUAD(entry->ctrl_info.in_dst_ip));
+ if ((now - entry->time) > entry->ctrl_info.holding_time) {
+ dprintk("holding time expired, ip = %pI4\n",
+ &entry->ctrl_info.in_dst_ip);
client->in_ops->remove_entry(entry, client);
}
entry = next_entry;
}
write_unlock_bh(&client->ingress_lock);
-
- return;
}
/* Call this every MPC-p4 seconds. */
@@ -244,38 +251,43 @@ static void check_resolving_entries(struct mpoa_client *client)
struct atm_mpoa_qos *qos;
in_cache_entry *entry;
- struct timeval now;
+ time64_t now;
struct k_message msg;
- do_gettimeofday( &now );
+ now = ktime_get_seconds();
read_lock_bh(&client->ingress_lock);
entry = client->in_cache;
- while( entry != NULL ){
- if(entry->entry_state == INGRESS_RESOLVING){
- if(now.tv_sec - entry->hold_down.tv_sec < client->parameters.mpc_p6){
- entry = entry->next; /* Entry in hold down */
+ while (entry != NULL) {
+ if (entry->entry_state == INGRESS_RESOLVING) {
+
+ if ((now - entry->hold_down)
+ < client->parameters.mpc_p6) {
+ entry = entry->next; /* Entry in hold down */
continue;
}
- if( (now.tv_sec - entry->reply_wait.tv_sec) >
- entry->retry_time ){
- entry->retry_time = MPC_C1*( entry->retry_time );
- if(entry->retry_time > client->parameters.mpc_p5){
- /* Retry time maximum exceeded, put entry in hold down. */
- do_gettimeofday(&(entry->hold_down));
+ if ((now - entry->reply_wait) > entry->retry_time) {
+ entry->retry_time = MPC_C1 * (entry->retry_time);
+ /*
+ * Retry time maximum exceeded,
+ * put entry in hold down.
+ */
+ if (entry->retry_time > client->parameters.mpc_p5) {
+ entry->hold_down = ktime_get_seconds();
entry->retry_time = client->parameters.mpc_p4;
entry = entry->next;
continue;
}
/* Ask daemon to send a resolution request. */
- memset(&(entry->hold_down),0,sizeof(struct timeval));
+ memset(&entry->hold_down, 0, sizeof(time64_t));
msg.type = SND_MPOA_RES_RTRY;
memcpy(msg.MPS_ctrl, client->mps_ctrl_addr, ATM_ESA_LEN);
msg.content.in_info = entry->ctrl_info;
qos = atm_mpoa_search_qos(entry->ctrl_info.in_dst_ip);
- if (qos != NULL) msg.qos = qos->qos;
+ if (qos != NULL)
+ msg.qos = qos->qos;
msg_to_mpoad(&msg, client);
- do_gettimeofday(&(entry->reply_wait));
+ entry->reply_wait = ktime_get_seconds();
}
}
entry = entry->next;
@@ -286,19 +298,20 @@ static void check_resolving_entries(struct mpoa_client *client)
/* Call this every MPC-p5 seconds. */
static void refresh_entries(struct mpoa_client *client)
{
- struct timeval now;
+ time64_t now;
struct in_cache_entry *entry = client->in_cache;
- ddprintk("mpoa: mpoa_caches.c: refresh_entries\n");
- do_gettimeofday(&now);
+ ddprintk("refresh_entries\n");
+ now = ktime_get_seconds();
read_lock_bh(&client->ingress_lock);
- while( entry != NULL ){
- if( entry->entry_state == INGRESS_RESOLVED ){
- if(!(entry->refresh_time))
- entry->refresh_time = (2*(entry->ctrl_info.holding_time))/3;
- if( (now.tv_sec - entry->reply_wait.tv_sec) > entry->refresh_time ){
- dprintk("mpoa: mpoa_caches.c: refreshing an entry.\n");
+ while (entry != NULL) {
+ if (entry->entry_state == INGRESS_RESOLVED) {
+ if (!(entry->refresh_time))
+ entry->refresh_time = (2 * (entry->ctrl_info.holding_time))/3;
+ if ((now - entry->reply_wait) >
+ entry->refresh_time) {
+ dprintk("refreshing an entry.\n");
entry->entry_state = INGRESS_REFRESHING;
}
@@ -311,22 +324,21 @@ static void refresh_entries(struct mpoa_client *client)
static void in_destroy_cache(struct mpoa_client *mpc)
{
write_lock_irq(&mpc->ingress_lock);
- while(mpc->in_cache != NULL)
+ while (mpc->in_cache != NULL)
mpc->in_ops->remove_entry(mpc->in_cache, mpc);
write_unlock_irq(&mpc->ingress_lock);
-
- return;
}
-static eg_cache_entry *eg_cache_get_by_cache_id(__be32 cache_id, struct mpoa_client *mpc)
+static eg_cache_entry *eg_cache_get_by_cache_id(__be32 cache_id,
+ struct mpoa_client *mpc)
{
eg_cache_entry *entry;
read_lock_irq(&mpc->egress_lock);
entry = mpc->eg_cache;
- while(entry != NULL){
- if(entry->ctrl_info.cache_id == cache_id){
- atomic_inc(&entry->use);
+ while (entry != NULL) {
+ if (entry->ctrl_info.cache_id == cache_id) {
+ refcount_inc(&entry->use);
read_unlock_irq(&mpc->egress_lock);
return entry;
}
@@ -345,9 +357,9 @@ static eg_cache_entry *eg_cache_get_by_tag(__be32 tag, struct mpoa_client *mpc)
read_lock_irqsave(&mpc->egress_lock, flags);
entry = mpc->eg_cache;
- while (entry != NULL){
+ while (entry != NULL) {
if (entry->ctrl_info.tag == tag) {
- atomic_inc(&entry->use);
+ refcount_inc(&entry->use);
read_unlock_irqrestore(&mpc->egress_lock, flags);
return entry;
}
@@ -359,17 +371,18 @@ static eg_cache_entry *eg_cache_get_by_tag(__be32 tag, struct mpoa_client *mpc)
}
/* This can be called from any context since it saves CPU flags */
-static eg_cache_entry *eg_cache_get_by_vcc(struct atm_vcc *vcc, struct mpoa_client *mpc)
+static eg_cache_entry *eg_cache_get_by_vcc(struct atm_vcc *vcc,
+ struct mpoa_client *mpc)
{
unsigned long flags;
eg_cache_entry *entry;
read_lock_irqsave(&mpc->egress_lock, flags);
entry = mpc->eg_cache;
- while (entry != NULL){
+ while (entry != NULL) {
if (entry->shortcut == vcc) {
- atomic_inc(&entry->use);
- read_unlock_irqrestore(&mpc->egress_lock, flags);
+ refcount_inc(&entry->use);
+ read_unlock_irqrestore(&mpc->egress_lock, flags);
return entry;
}
entry = entry->next;
@@ -379,16 +392,17 @@ static eg_cache_entry *eg_cache_get_by_vcc(struct atm_vcc *vcc, struct mpoa_clie
return NULL;
}
-static eg_cache_entry *eg_cache_get_by_src_ip(__be32 ipaddr, struct mpoa_client *mpc)
+static eg_cache_entry *eg_cache_get_by_src_ip(__be32 ipaddr,
+ struct mpoa_client *mpc)
{
eg_cache_entry *entry;
read_lock_irq(&mpc->egress_lock);
entry = mpc->eg_cache;
- while(entry != NULL){
- if(entry->latest_ip_addr == ipaddr) {
- atomic_inc(&entry->use);
- read_unlock_irq(&mpc->egress_lock);
+ while (entry != NULL) {
+ if (entry->latest_ip_addr == ipaddr) {
+ refcount_inc(&entry->use);
+ read_unlock_irq(&mpc->egress_lock);
return entry;
}
entry = entry->next;
@@ -400,12 +414,9 @@ static eg_cache_entry *eg_cache_get_by_src_ip(__be32 ipaddr, struct mpoa_client
static void eg_cache_put(eg_cache_entry *entry)
{
- if (atomic_dec_and_test(&entry->use)) {
- memset(entry, 0, sizeof(eg_cache_entry));
- kfree(entry);
+ if (refcount_dec_and_test(&entry->use)) {
+ kfree_sensitive(entry);
}
-
- return;
}
/*
@@ -418,7 +429,7 @@ static void eg_cache_remove_entry(eg_cache_entry *entry,
struct k_message msg;
vcc = entry->shortcut;
- dprintk("mpoa: mpoa_caches.c: removing an egress entry.\n");
+ dprintk("removing an egress entry.\n");
if (entry->prev != NULL)
entry->prev->next = entry->next;
else
@@ -426,9 +437,9 @@ static void eg_cache_remove_entry(eg_cache_entry *entry,
if (entry->next != NULL)
entry->next->prev = entry->prev;
client->eg_ops->put(entry);
- if(client->in_cache == NULL && client->eg_cache == NULL){
+ if (client->in_cache == NULL && client->eg_cache == NULL) {
msg.type = STOP_KEEP_ALIVE_SM;
- msg_to_mpoad(&msg,client);
+ msg_to_mpoad(&msg, client);
}
/* Check if the ingress side still uses this VCC */
@@ -440,23 +451,23 @@ static void eg_cache_remove_entry(eg_cache_entry *entry,
}
vcc_release_async(vcc, -EPIPE);
}
-
- return;
}
-static eg_cache_entry *eg_cache_add_entry(struct k_message *msg, struct mpoa_client *client)
+static eg_cache_entry *eg_cache_add_entry(struct k_message *msg,
+ struct mpoa_client *client)
{
eg_cache_entry *entry = kzalloc(sizeof(eg_cache_entry), GFP_KERNEL);
if (entry == NULL) {
- printk("mpoa: mpoa_caches.c: new_eg_cache_entry: out of memory\n");
+ pr_info("out of memory\n");
return NULL;
}
- dprintk("mpoa: mpoa_caches.c: adding an egress entry, ip = %u.%u.%u.%u, this should be our IP\n", NIPQUAD(msg->content.eg_info.eg_dst_ip));
+ dprintk("adding an egress entry, ip = %pI4, this should be our IP\n",
+ &msg->content.eg_info.eg_dst_ip);
- atomic_set(&entry->use, 1);
- dprintk("mpoa: mpoa_caches.c: new_eg_cache_entry: about to lock\n");
+ refcount_set(&entry->use, 1);
+ dprintk("new_eg_cache_entry: about to lock\n");
write_lock_irq(&client->egress_lock);
entry->next = client->eg_cache;
entry->prev = NULL;
@@ -466,99 +477,89 @@ static eg_cache_entry *eg_cache_add_entry(struct k_message *msg, struct mpoa_cli
memcpy(entry->MPS_ctrl_ATM_addr, client->mps_ctrl_addr, ATM_ESA_LEN);
entry->ctrl_info = msg->content.eg_info;
- do_gettimeofday(&(entry->tv));
+ entry->time = ktime_get_seconds();
entry->entry_state = EGRESS_RESOLVED;
- dprintk("mpoa: mpoa_caches.c: new_eg_cache_entry cache_id %lu\n", ntohl(entry->ctrl_info.cache_id));
- dprintk("mpoa: mpoa_caches.c: mps_ip = %u.%u.%u.%u\n",
- NIPQUAD(entry->ctrl_info.mps_ip));
- atomic_inc(&entry->use);
+ dprintk("new_eg_cache_entry cache_id %u\n",
+ ntohl(entry->ctrl_info.cache_id));
+ dprintk("mps_ip = %pI4\n", &entry->ctrl_info.mps_ip);
+ refcount_inc(&entry->use);
write_unlock_irq(&client->egress_lock);
- dprintk("mpoa: mpoa_caches.c: new_eg_cache_entry: unlocked\n");
+ dprintk("new_eg_cache_entry: unlocked\n");
return entry;
}
-static void update_eg_cache_entry(eg_cache_entry * entry, uint16_t holding_time)
+static void update_eg_cache_entry(eg_cache_entry *entry, uint16_t holding_time)
{
- do_gettimeofday(&(entry->tv));
+ entry->time = ktime_get_seconds();
entry->entry_state = EGRESS_RESOLVED;
entry->ctrl_info.holding_time = holding_time;
-
- return;
}
static void clear_expired(struct mpoa_client *client)
{
eg_cache_entry *entry, *next_entry;
- struct timeval now;
+ time64_t now;
struct k_message msg;
- do_gettimeofday(&now);
+ now = ktime_get_seconds();
write_lock_irq(&client->egress_lock);
entry = client->eg_cache;
- while(entry != NULL){
+ while (entry != NULL) {
next_entry = entry->next;
- if((now.tv_sec - entry->tv.tv_sec)
- > entry->ctrl_info.holding_time){
+ if ((now - entry->time) > entry->ctrl_info.holding_time) {
msg.type = SND_EGRESS_PURGE;
msg.content.eg_info = entry->ctrl_info;
- dprintk("mpoa: mpoa_caches.c: egress_cache: holding time expired, cache_id = %lu.\n",ntohl(entry->ctrl_info.cache_id));
+ dprintk("egress_cache: holding time expired, cache_id = %u.\n",
+ ntohl(entry->ctrl_info.cache_id));
msg_to_mpoad(&msg, client);
client->eg_ops->remove_entry(entry, client);
}
entry = next_entry;
}
write_unlock_irq(&client->egress_lock);
-
- return;
}
static void eg_destroy_cache(struct mpoa_client *mpc)
{
write_lock_irq(&mpc->egress_lock);
- while(mpc->eg_cache != NULL)
+ while (mpc->eg_cache != NULL)
mpc->eg_ops->remove_entry(mpc->eg_cache, mpc);
write_unlock_irq(&mpc->egress_lock);
-
- return;
}
-
-static struct in_cache_ops ingress_ops = {
- in_cache_add_entry, /* add_entry */
- in_cache_get, /* get */
- in_cache_get_with_mask, /* get_with_mask */
- in_cache_get_by_vcc, /* get_by_vcc */
- in_cache_put, /* put */
- in_cache_remove_entry, /* remove_entry */
- cache_hit, /* cache_hit */
- clear_count_and_expired, /* clear_count */
- check_resolving_entries, /* check_resolving */
- refresh_entries, /* refresh */
- in_destroy_cache /* destroy_cache */
+static const struct in_cache_ops ingress_ops = {
+ .add_entry = in_cache_add_entry,
+ .get = in_cache_get,
+ .get_with_mask = in_cache_get_with_mask,
+ .get_by_vcc = in_cache_get_by_vcc,
+ .put = in_cache_put,
+ .remove_entry = in_cache_remove_entry,
+ .cache_hit = cache_hit,
+ .clear_count = clear_count_and_expired,
+ .check_resolving = check_resolving_entries,
+ .refresh = refresh_entries,
+ .destroy_cache = in_destroy_cache
};
-static struct eg_cache_ops egress_ops = {
- eg_cache_add_entry, /* add_entry */
- eg_cache_get_by_cache_id, /* get_by_cache_id */
- eg_cache_get_by_tag, /* get_by_tag */
- eg_cache_get_by_vcc, /* get_by_vcc */
- eg_cache_get_by_src_ip, /* get_by_src_ip */
- eg_cache_put, /* put */
- eg_cache_remove_entry, /* remove_entry */
- update_eg_cache_entry, /* update */
- clear_expired, /* clear_expired */
- eg_destroy_cache /* destroy_cache */
+static const struct eg_cache_ops egress_ops = {
+ .add_entry = eg_cache_add_entry,
+ .get_by_cache_id = eg_cache_get_by_cache_id,
+ .get_by_tag = eg_cache_get_by_tag,
+ .get_by_vcc = eg_cache_get_by_vcc,
+ .get_by_src_ip = eg_cache_get_by_src_ip,
+ .put = eg_cache_put,
+ .remove_entry = eg_cache_remove_entry,
+ .update = update_eg_cache_entry,
+ .clear_expired = clear_expired,
+ .destroy_cache = eg_destroy_cache
};
-
void atm_mpoa_init_cache(struct mpoa_client *mpc)
{
mpc->in_ops = &ingress_ops;
mpc->eg_ops = &egress_ops;
-
- return;
}
diff --git a/net/atm/mpoa_caches.h b/net/atm/mpoa_caches.h
index 84de977def2e..464c4c7f8d1f 100644
--- a/net/atm/mpoa_caches.h
+++ b/net/atm/mpoa_caches.h
@@ -1,77 +1,80 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef MPOA_CACHES_H
#define MPOA_CACHES_H
+#include <linux/time64.h>
#include <linux/netdevice.h>
#include <linux/types.h>
#include <linux/atm.h>
#include <linux/atmdev.h>
#include <linux/atmmpc.h>
+#include <linux/refcount.h>
struct mpoa_client;
void atm_mpoa_init_cache(struct mpoa_client *mpc);
typedef struct in_cache_entry {
- struct in_cache_entry *next;
- struct in_cache_entry *prev;
- struct timeval tv;
- struct timeval reply_wait;
- struct timeval hold_down;
- uint32_t packets_fwded;
- uint16_t entry_state;
- uint32_t retry_time;
- uint32_t refresh_time;
- uint32_t count;
- struct atm_vcc *shortcut;
- uint8_t MPS_ctrl_ATM_addr[ATM_ESA_LEN];
- struct in_ctrl_info ctrl_info;
- atomic_t use;
+ struct in_cache_entry *next;
+ struct in_cache_entry *prev;
+ time64_t time;
+ time64_t reply_wait;
+ time64_t hold_down;
+ uint32_t packets_fwded;
+ uint16_t entry_state;
+ uint32_t retry_time;
+ uint32_t refresh_time;
+ uint32_t count;
+ struct atm_vcc *shortcut;
+ uint8_t MPS_ctrl_ATM_addr[ATM_ESA_LEN];
+ struct in_ctrl_info ctrl_info;
+ refcount_t use;
} in_cache_entry;
struct in_cache_ops{
- in_cache_entry *(*add_entry)(__be32 dst_ip,
- struct mpoa_client *client);
- in_cache_entry *(*get)(__be32 dst_ip, struct mpoa_client *client);
- in_cache_entry *(*get_with_mask)(__be32 dst_ip,
+ in_cache_entry *(*add_entry)(__be32 dst_ip,
+ struct mpoa_client *client);
+ in_cache_entry *(*get)(__be32 dst_ip, struct mpoa_client *client);
+ in_cache_entry *(*get_with_mask)(__be32 dst_ip,
struct mpoa_client *client,
__be32 mask);
- in_cache_entry *(*get_by_vcc)(struct atm_vcc *vcc,
- struct mpoa_client *client);
- void (*put)(in_cache_entry *entry);
- void (*remove_entry)(in_cache_entry *delEntry,
+ in_cache_entry *(*get_by_vcc)(struct atm_vcc *vcc,
+ struct mpoa_client *client);
+ void (*put)(in_cache_entry *entry);
+ void (*remove_entry)(in_cache_entry *delEntry,
struct mpoa_client *client );
- int (*cache_hit)(in_cache_entry *entry,
- struct mpoa_client *client);
- void (*clear_count)(struct mpoa_client *client);
- void (*check_resolving)(struct mpoa_client *client);
- void (*refresh)(struct mpoa_client *client);
- void (*destroy_cache)(struct mpoa_client *mpc);
+ int (*cache_hit)(in_cache_entry *entry,
+ struct mpoa_client *client);
+ void (*clear_count)(struct mpoa_client *client);
+ void (*check_resolving)(struct mpoa_client *client);
+ void (*refresh)(struct mpoa_client *client);
+ void (*destroy_cache)(struct mpoa_client *mpc);
};
typedef struct eg_cache_entry{
- struct eg_cache_entry *next;
- struct eg_cache_entry *prev;
- struct timeval tv;
- uint8_t MPS_ctrl_ATM_addr[ATM_ESA_LEN];
- struct atm_vcc *shortcut;
- uint32_t packets_rcvd;
- uint16_t entry_state;
- __be32 latest_ip_addr; /* The src IP address of the last packet */
- struct eg_ctrl_info ctrl_info;
- atomic_t use;
+ struct eg_cache_entry *next;
+ struct eg_cache_entry *prev;
+ time64_t time;
+ uint8_t MPS_ctrl_ATM_addr[ATM_ESA_LEN];
+ struct atm_vcc *shortcut;
+ uint32_t packets_rcvd;
+ uint16_t entry_state;
+ __be32 latest_ip_addr; /* The src IP address of the last packet */
+ struct eg_ctrl_info ctrl_info;
+ refcount_t use;
} eg_cache_entry;
struct eg_cache_ops{
- eg_cache_entry *(*add_entry)(struct k_message *msg, struct mpoa_client *client);
- eg_cache_entry *(*get_by_cache_id)(__be32 cache_id, struct mpoa_client *client);
- eg_cache_entry *(*get_by_tag)(__be32 cache_id, struct mpoa_client *client);
- eg_cache_entry *(*get_by_vcc)(struct atm_vcc *vcc, struct mpoa_client *client);
- eg_cache_entry *(*get_by_src_ip)(__be32 ipaddr, struct mpoa_client *client);
- void (*put)(eg_cache_entry *entry);
- void (*remove_entry)(eg_cache_entry *entry, struct mpoa_client *client);
- void (*update)(eg_cache_entry *entry, uint16_t holding_time);
- void (*clear_expired)(struct mpoa_client *client);
- void (*destroy_cache)(struct mpoa_client *mpc);
+ eg_cache_entry *(*add_entry)(struct k_message *msg, struct mpoa_client *client);
+ eg_cache_entry *(*get_by_cache_id)(__be32 cache_id, struct mpoa_client *client);
+ eg_cache_entry *(*get_by_tag)(__be32 cache_id, struct mpoa_client *client);
+ eg_cache_entry *(*get_by_vcc)(struct atm_vcc *vcc, struct mpoa_client *client);
+ eg_cache_entry *(*get_by_src_ip)(__be32 ipaddr, struct mpoa_client *client);
+ void (*put)(eg_cache_entry *entry);
+ void (*remove_entry)(eg_cache_entry *entry, struct mpoa_client *client);
+ void (*update)(eg_cache_entry *entry, uint16_t holding_time);
+ void (*clear_expired)(struct mpoa_client *client);
+ void (*destroy_cache)(struct mpoa_client *mpc);
};
@@ -85,7 +88,7 @@ struct eg_cache_ops{
/* VCC states */
#define OPEN 1
-#define CLOSED 0
+#define CLOSED 0
/* Egress cache entry states */
diff --git a/net/atm/mpoa_proc.c b/net/atm/mpoa_proc.c
index 3844c85d602f..aaf64b953915 100644
--- a/net/atm/mpoa_proc.c
+++ b/net/atm/mpoa_proc.c
@@ -1,28 +1,45 @@
+// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__
#ifdef CONFIG_PROC_FS
#include <linux/errno.h>
#include <linux/kernel.h>
-#include <linux/string.h>
+#include <linux/string.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
-#include <linux/time.h>
+#include <linux/ktime.h>
#include <linux/seq_file.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/atmmpc.h>
#include <linux/atm.h>
+#include <linux/gfp.h>
#include "mpc.h"
#include "mpoa_caches.h"
/*
* mpoa_proc.c: Implementation MPOA client's proc
- * file system statistics
+ * file system statistics
*/
#if 1
-#define dprintk printk /* debug */
+#define dprintk(format, args...) \
+ printk(KERN_DEBUG "mpoa:%s: " format, __FILE__, ##args) /* debug */
#else
-#define dprintk(format,args...)
+#define dprintk(format, args...) \
+ do { if (0) \
+ printk(KERN_DEBUG "mpoa:%s: " format, __FILE__, ##args);\
+ } while (0)
+#endif
+
+#if 0
+#define ddprintk(format, args...) \
+ printk(KERN_DEBUG "mpoa:%s: " format, __FILE__, ##args) /* debug */
+#else
+#define ddprintk(format, args...) \
+ do { if (0) \
+ printk(KERN_DEBUG "mpoa:%s: " format, __FILE__, ##args);\
+ } while (0)
#endif
#define STAT_FILE_NAME "mpc" /* Our statistic file's name */
@@ -32,61 +49,52 @@ extern struct proc_dir_entry *atm_proc_root; /* from proc.c. */
static int proc_mpc_open(struct inode *inode, struct file *file);
static ssize_t proc_mpc_write(struct file *file, const char __user *buff,
- size_t nbytes, loff_t *ppos);
+ size_t nbytes, loff_t *ppos);
static int parse_qos(const char *buff);
-/*
- * Define allowed FILE OPERATIONS
- */
-static struct file_operations mpc_file_operations = {
- .owner = THIS_MODULE,
- .open = proc_mpc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .write = proc_mpc_write,
- .release = seq_release,
+static const struct proc_ops mpc_proc_ops = {
+ .proc_open = proc_mpc_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_write = proc_mpc_write,
+ .proc_release = seq_release,
};
/*
* Returns the state of an ingress cache entry as a string
*/
-static const char *ingress_state_string(int state){
- switch(state) {
+static const char *ingress_state_string(int state)
+{
+ switch (state) {
case INGRESS_RESOLVING:
- return "resolving ";
- break;
+ return "resolving ";
case INGRESS_RESOLVED:
- return "resolved ";
- break;
+ return "resolved ";
case INGRESS_INVALID:
- return "invalid ";
- break;
+ return "invalid ";
case INGRESS_REFRESHING:
- return "refreshing ";
- break;
- default:
- return "";
+ return "refreshing ";
}
+
+ return "";
}
/*
* Returns the state of an egress cache entry as a string
*/
-static const char *egress_state_string(int state){
- switch(state) {
+static const char *egress_state_string(int state)
+{
+ switch (state) {
case EGRESS_RESOLVED:
- return "resolved ";
- break;
+ return "resolved ";
case EGRESS_PURGE:
- return "purge ";
- break;
+ return "purge ";
case EGRESS_INVALID:
- return "invalid ";
- break;
- default:
- return "";
+ return "invalid ";
}
+
+ return "";
}
/*
@@ -123,11 +131,10 @@ static void mpc_stop(struct seq_file *m, void *v)
static int mpc_show(struct seq_file *m, void *v)
{
struct mpoa_client *mpc = v;
- unsigned char *temp;
int i;
in_cache_entry *in_entry;
eg_cache_entry *eg_entry;
- struct timeval now;
+ time64_t now;
unsigned char ip_string[16];
if (v == SEQ_START_TOKEN) {
@@ -135,20 +142,24 @@ static int mpc_show(struct seq_file *m, void *v)
return 0;
}
- seq_printf(m, "\nInterface %d:\n\n", mpc->dev_num);
+ seq_printf(m, "\nInterface %d:\n\n", mpc->dev_num);
seq_printf(m, "Ingress Entries:\nIP address State Holding time Packets fwded VPI VCI\n");
- do_gettimeofday(&now);
+ now = ktime_get_seconds();
for (in_entry = mpc->in_cache; in_entry; in_entry = in_entry->next) {
- temp = (unsigned char *)&in_entry->ctrl_info.in_dst_ip;
- sprintf(ip_string,"%d.%d.%d.%d", temp[0], temp[1], temp[2], temp[3]);
+ unsigned long seconds_delta = now - in_entry->time;
+
+ sprintf(ip_string, "%pI4", &in_entry->ctrl_info.in_dst_ip);
seq_printf(m, "%-16s%s%-14lu%-12u",
- ip_string,
- ingress_state_string(in_entry->entry_state),
- in_entry->ctrl_info.holding_time-(now.tv_sec-in_entry->tv.tv_sec),
- in_entry->packets_fwded);
+ ip_string,
+ ingress_state_string(in_entry->entry_state),
+ in_entry->ctrl_info.holding_time -
+ seconds_delta,
+ in_entry->packets_fwded);
if (in_entry->shortcut)
- seq_printf(m, " %-3d %-3d",in_entry->shortcut->vpi,in_entry->shortcut->vci);
+ seq_printf(m, " %-3d %-3d",
+ in_entry->shortcut->vpi,
+ in_entry->shortcut->vci);
seq_printf(m, "\n");
}
@@ -156,28 +167,31 @@ static int mpc_show(struct seq_file *m, void *v)
seq_printf(m, "Egress Entries:\nIngress MPC ATM addr\nCache-id State Holding time Packets recvd Latest IP addr VPI VCI\n");
for (eg_entry = mpc->eg_cache; eg_entry; eg_entry = eg_entry->next) {
unsigned char *p = eg_entry->ctrl_info.in_MPC_data_ATM_addr;
- for(i = 0; i < ATM_ESA_LEN; i++)
+ unsigned long seconds_delta = now - eg_entry->time;
+
+ for (i = 0; i < ATM_ESA_LEN; i++)
seq_printf(m, "%02x", p[i]);
seq_printf(m, "\n%-16lu%s%-14lu%-15u",
(unsigned long)ntohl(eg_entry->ctrl_info.cache_id),
egress_state_string(eg_entry->entry_state),
- (eg_entry->ctrl_info.holding_time-(now.tv_sec-eg_entry->tv.tv_sec)),
+ (eg_entry->ctrl_info.holding_time - seconds_delta),
eg_entry->packets_rcvd);
-
+
/* latest IP address */
- temp = (unsigned char *)&eg_entry->latest_ip_addr;
- sprintf(ip_string, "%d.%d.%d.%d", temp[0], temp[1], temp[2], temp[3]);
+ sprintf(ip_string, "%pI4", &eg_entry->latest_ip_addr);
seq_printf(m, "%-16s", ip_string);
if (eg_entry->shortcut)
- seq_printf(m, " %-3d %-3d",eg_entry->shortcut->vpi,eg_entry->shortcut->vci);
+ seq_printf(m, " %-3d %-3d",
+ eg_entry->shortcut->vpi,
+ eg_entry->shortcut->vci);
seq_printf(m, "\n");
}
seq_printf(m, "\n");
return 0;
}
-static struct seq_operations mpc_op = {
+static const struct seq_operations mpc_op = {
.start = mpc_start,
.next = mpc_next,
.stop = mpc_stop,
@@ -190,51 +204,52 @@ static int proc_mpc_open(struct inode *inode, struct file *file)
}
static ssize_t proc_mpc_write(struct file *file, const char __user *buff,
- size_t nbytes, loff_t *ppos)
+ size_t nbytes, loff_t *ppos)
{
- char *page, *p;
- unsigned len;
+ char *page, *p;
+ unsigned int len;
- if (nbytes == 0)
+ if (nbytes == 0)
return 0;
- if (nbytes >= PAGE_SIZE)
+ if (nbytes >= PAGE_SIZE)
nbytes = PAGE_SIZE-1;
- page = (char *)__get_free_page(GFP_KERNEL);
- if (!page)
+ page = (char *)__get_free_page(GFP_KERNEL);
+ if (!page)
return -ENOMEM;
- for (p = page, len = 0; len < nbytes; p++, len++) {
- if (get_user(*p, buff++)) {
+ for (p = page, len = 0; len < nbytes; p++) {
+ if (get_user(*p, buff++)) {
free_page((unsigned long)page);
return -EFAULT;
}
- if (*p == '\0' || *p == '\n')
- break;
- }
+ len += 1;
+ if (*p == '\0' || *p == '\n')
+ break;
+ }
- *p = '\0';
+ *p = '\0';
if (!parse_qos(page))
- printk("mpoa: proc_mpc_write: could not parse '%s'\n", page);
+ printk("mpoa: proc_mpc_write: could not parse '%s'\n", page);
+
+ free_page((unsigned long)page);
- free_page((unsigned long)page);
-
- return len;
+ return len;
}
static int parse_qos(const char *buff)
{
- /* possible lines look like this
- * add 130.230.54.142 tx=max_pcr,max_sdu rx=max_pcr,max_sdu
- */
- unsigned char ip[4];
+ /* possible lines look like this
+ * add 130.230.54.142 tx=max_pcr,max_sdu rx=max_pcr,max_sdu
+ */
+ unsigned char ip[4];
int tx_pcr, tx_sdu, rx_pcr, rx_sdu;
- __be32 ipaddr;
- struct atm_qos qos;
-
- memset(&qos, 0, sizeof(struct atm_qos));
+ __be32 ipaddr;
+ struct atm_qos qos;
+
+ memset(&qos, 0, sizeof(struct atm_qos));
if (sscanf(buff, "del %hhu.%hhu.%hhu.%hhu",
ip, ip+1, ip+2, ip+3) == 4) {
@@ -250,20 +265,17 @@ static int parse_qos(const char *buff)
ip, ip+1, ip+2, ip+3, &tx_pcr, &tx_sdu, &rx_pcr, &rx_sdu) != 8)
return 0;
- ipaddr = *(__be32 *)ip;
+ ipaddr = *(__be32 *)ip;
qos.txtp.traffic_class = ATM_CBR;
qos.txtp.max_pcr = tx_pcr;
qos.txtp.max_sdu = tx_sdu;
qos.rxtp.traffic_class = ATM_CBR;
qos.rxtp.max_pcr = rx_pcr;
qos.rxtp.max_sdu = rx_sdu;
- qos.aal = ATM_AAL5;
- dprintk("mpoa: mpoa_proc.c: parse_qos(): setting qos paramameters to tx=%d,%d rx=%d,%d\n",
- qos.txtp.max_pcr,
- qos.txtp.max_sdu,
- qos.rxtp.max_pcr,
- qos.rxtp.max_sdu
- );
+ qos.aal = ATM_AAL5;
+ dprintk("parse_qos(): setting qos parameters to tx=%d,%d rx=%d,%d\n",
+ qos.txtp.max_pcr, qos.txtp.max_sdu,
+ qos.rxtp.max_pcr, qos.rxtp.max_sdu);
atm_mpoa_add_qos(ipaddr, &qos);
return 1;
@@ -276,13 +288,11 @@ int mpc_proc_init(void)
{
struct proc_dir_entry *p;
- p = create_proc_entry(STAT_FILE_NAME, 0, atm_proc_root);
+ p = proc_create(STAT_FILE_NAME, 0, atm_proc_root, &mpc_proc_ops);
if (!p) {
- printk(KERN_ERR "Unable to initialize /proc/atm/%s\n", STAT_FILE_NAME);
- return -ENOMEM;
- }
- p->proc_fops = &mpc_file_operations;
- p->owner = THIS_MODULE;
+ pr_err("Unable to initialize /proc/atm/%s\n", STAT_FILE_NAME);
+ return -ENOMEM;
+ }
return 0;
}
@@ -291,14 +301,7 @@ int mpc_proc_init(void)
*/
void mpc_proc_clean(void)
{
- remove_proc_entry(STAT_FILE_NAME,atm_proc_root);
+ remove_proc_entry(STAT_FILE_NAME, atm_proc_root);
}
-
#endif /* CONFIG_PROC_FS */
-
-
-
-
-
-
diff --git a/net/atm/pppoatm.c b/net/atm/pppoatm.c
index 19d5dfc0702f..3e4f17d335fe 100644
--- a/net/atm/pppoatm.c
+++ b/net/atm/pppoatm.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* net/atm/pppoatm.c - RFC2364 PPP over ATM/AAL5 */
/* Copyright 1999-2000 by Mitchell Blank Jr */
@@ -6,10 +7,6 @@
/* And help from Jens Axboe */
/*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*
* This driver provides the encapsulation and framing for sending
* and receiving PPP frames in ATM AAL5 PDUs.
@@ -33,26 +30,23 @@
* These hooks are not yet available in ppp_generic
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__
+
#include <linux/module.h>
#include <linux/init.h>
+#include <linux/interrupt.h>
#include <linux/skbuff.h>
+#include <linux/slab.h>
#include <linux/atm.h>
#include <linux/atmdev.h>
#include <linux/capability.h>
#include <linux/ppp_defs.h>
-#include <linux/if_ppp.h>
+#include <linux/ppp-ioctl.h>
#include <linux/ppp_channel.h>
#include <linux/atmppp.h>
#include "common.h"
-#if 0
-#define DPRINTK(format, args...) \
- printk(KERN_DEBUG "pppoatm: " format, ##args)
-#else
-#define DPRINTK(format, args...)
-#endif
-
enum pppoatm_encaps {
e_autodetect = PPPOATM_ENCAPS_AUTODETECT,
e_vc = PPPOATM_ENCAPS_VC,
@@ -63,14 +57,29 @@ struct pppoatm_vcc {
struct atm_vcc *atmvcc; /* VCC descriptor */
void (*old_push)(struct atm_vcc *, struct sk_buff *);
void (*old_pop)(struct atm_vcc *, struct sk_buff *);
+ void (*old_release_cb)(struct atm_vcc *);
+ struct module *old_owner;
/* keep old push/pop for detaching */
enum pppoatm_encaps encaps;
+ atomic_t inflight;
+ unsigned long blocked;
int flags; /* SC_COMP_PROT - compress protocol */
struct ppp_channel chan; /* interface to generic ppp layer */
struct tasklet_struct wakeup_tasklet;
};
/*
+ * We want to allow two packets in the queue. The one that's currently in
+ * flight, and *one* queued up ready for the ATM device to send immediately
+ * from its TX done IRQ. We want to be able to use atomic_inc_not_zero(), so
+ * inflight == -2 represents an empty queue, -1 one packet, and zero means
+ * there are two packets in the queue.
+ */
+#define NONE_INFLIGHT -2
+
+#define BLOCKED 0
+
+/*
* Header used for LLC Encapsulated PPP (4 bytes) followed by the LCP protocol
* ID (0xC021) used in autodetection
*/
@@ -92,11 +101,31 @@ static inline struct pppoatm_vcc *chan_to_pvcc(const struct ppp_channel *chan)
* doesn't want to be called in interrupt context, so we do it from
* a tasklet
*/
-static void pppoatm_wakeup_sender(unsigned long arg)
+static void pppoatm_wakeup_sender(struct tasklet_struct *t)
{
- ppp_output_wakeup((struct ppp_channel *) arg);
+ struct pppoatm_vcc *pvcc = from_tasklet(pvcc, t, wakeup_tasklet);
+
+ ppp_output_wakeup(&pvcc->chan);
}
+static void pppoatm_release_cb(struct atm_vcc *atmvcc)
+{
+ struct pppoatm_vcc *pvcc = atmvcc_to_pvcc(atmvcc);
+
+ /*
+ * As in pppoatm_pop(), it's safe to clear the BLOCKED bit here because
+ * the wakeup *can't* race with pppoatm_send(). They both hold the PPP
+ * channel's ->downl lock. And the potential race with *setting* it,
+ * which leads to the double-check dance in pppoatm_may_send(), doesn't
+ * exist here. In the sock_owned_by_user() case in pppoatm_send(), we
+ * set the BLOCKED bit while the socket is still locked. We know that
+ * ->release_cb() can't be called until that's done.
+ */
+ if (test_and_clear_bit(BLOCKED, &pvcc->blocked))
+ tasklet_schedule(&pvcc->wakeup_tasklet);
+ if (pvcc->old_release_cb)
+ pvcc->old_release_cb(atmvcc);
+}
/*
* This gets called every time the ATM card has finished sending our
* skb. The ->old_pop will take care up normal atm flow control,
@@ -105,16 +134,30 @@ static void pppoatm_wakeup_sender(unsigned long arg)
static void pppoatm_pop(struct atm_vcc *atmvcc, struct sk_buff *skb)
{
struct pppoatm_vcc *pvcc = atmvcc_to_pvcc(atmvcc);
+
pvcc->old_pop(atmvcc, skb);
+ atomic_dec(&pvcc->inflight);
+
/*
- * We don't really always want to do this since it's
- * really inefficient - it would be much better if we could
- * test if we had actually throttled the generic layer.
- * Unfortunately then there would be a nasty SMP race where
- * we could clear that flag just as we refuse another packet.
- * For now we do the safe thing.
+ * We always used to run the wakeup tasklet unconditionally here, for
+ * fear of race conditions where we clear the BLOCKED flag just as we
+ * refuse another packet in pppoatm_send(). This was quite inefficient.
+ *
+ * In fact it's OK. The PPP core will only ever call pppoatm_send()
+ * while holding the channel->downl lock. And ppp_output_wakeup() as
+ * called by the tasklet will *also* grab that lock. So even if another
+ * CPU is in pppoatm_send() right now, the tasklet isn't going to race
+ * with it. The wakeup *will* happen after the other CPU is safely out
+ * of pppoatm_send() again.
+ *
+ * So if the CPU in pppoatm_send() has already set the BLOCKED bit and
+ * it about to return, that's fine. We trigger a wakeup which will
+ * happen later. And if the CPU in pppoatm_send() *hasn't* set the
+ * BLOCKED bit yet, that's fine too because of the double check in
+ * pppoatm_may_send() which is commented there.
*/
- tasklet_schedule(&pvcc->wakeup_tasklet);
+ if (test_and_clear_bit(BLOCKED, &pvcc->blocked))
+ tasklet_schedule(&pvcc->wakeup_tasklet);
}
/*
@@ -127,23 +170,26 @@ static void pppoatm_unassign_vcc(struct atm_vcc *atmvcc)
pvcc = atmvcc_to_pvcc(atmvcc);
atmvcc->push = pvcc->old_push;
atmvcc->pop = pvcc->old_pop;
+ atmvcc->release_cb = pvcc->old_release_cb;
tasklet_kill(&pvcc->wakeup_tasklet);
ppp_unregister_channel(&pvcc->chan);
atmvcc->user_back = NULL;
kfree(pvcc);
- /* Gee, I hope we have the big kernel lock here... */
- module_put(THIS_MODULE);
}
/* Called when an AAL5 PDU comes in */
static void pppoatm_push(struct atm_vcc *atmvcc, struct sk_buff *skb)
{
struct pppoatm_vcc *pvcc = atmvcc_to_pvcc(atmvcc);
- DPRINTK("pppoatm push\n");
+ pr_debug("\n");
if (skb == NULL) { /* VCC was closed */
- DPRINTK("removing ATMPPP VCC %p\n", pvcc);
+ struct module *module;
+
+ pr_debug("removing ATMPPP VCC %p\n", pvcc);
+ module = pvcc->old_owner;
pppoatm_unassign_vcc(atmvcc);
atmvcc->push(atmvcc, NULL); /* Pass along bad news */
+ module_put(module);
return;
}
atm_return(atmvcc, skb->truesize);
@@ -172,22 +218,64 @@ static void pppoatm_push(struct atm_vcc *atmvcc, struct sk_buff *skb)
pvcc->chan.mtu += LLC_LEN;
break;
}
- DPRINTK("(unit %d): Couldn't autodetect yet "
- "(skb: %02X %02X %02X %02X %02X %02X)\n",
- pvcc->chan.unit,
- skb->data[0], skb->data[1], skb->data[2],
- skb->data[3], skb->data[4], skb->data[5]);
+ pr_debug("Couldn't autodetect yet (skb: %6ph)\n", skb->data);
goto error;
case e_vc:
break;
}
ppp_input(&pvcc->chan, skb);
return;
- error:
+
+error:
kfree_skb(skb);
ppp_input_error(&pvcc->chan, 0);
}
+static int pppoatm_may_send(struct pppoatm_vcc *pvcc, int size)
+{
+ /*
+ * It's not clear that we need to bother with using atm_may_send()
+ * to check we don't exceed sk->sk_sndbuf. If userspace sets a
+ * value of sk_sndbuf which is lower than the MTU, we're going to
+ * block for ever. But the code always did that before we introduced
+ * the packet count limit, so...
+ */
+ if (atm_may_send(pvcc->atmvcc, size) &&
+ atomic_inc_not_zero(&pvcc->inflight))
+ return 1;
+
+ /*
+ * We use test_and_set_bit() rather than set_bit() here because
+ * we need to ensure there's a memory barrier after it. The bit
+ * *must* be set before we do the atomic_inc() on pvcc->inflight.
+ * There's no smp_mb__after_set_bit(), so it's this or abuse
+ * smp_mb__after_atomic().
+ */
+ test_and_set_bit(BLOCKED, &pvcc->blocked);
+
+ /*
+ * We may have raced with pppoatm_pop(). If it ran for the
+ * last packet in the queue, *just* before we set the BLOCKED
+ * bit, then it might never run again and the channel could
+ * remain permanently blocked. Cope with that race by checking
+ * *again*. If it did run in that window, we'll have space on
+ * the queue now and can return success. It's harmless to leave
+ * the BLOCKED flag set, since it's only used as a trigger to
+ * run the wakeup tasklet. Another wakeup will never hurt.
+ * If pppoatm_pop() is running but hasn't got as far as making
+ * space on the queue yet, then it hasn't checked the BLOCKED
+ * flag yet either, so we're safe in that case too. It'll issue
+ * an "immediate" wakeup... where "immediate" actually involves
+ * taking the PPP channel's ->downl lock, which is held by the
+ * code path that calls pppoatm_send(), and is thus going to
+ * wait for us to finish.
+ */
+ if (atm_may_send(pvcc->atmvcc, size) &&
+ atomic_inc_not_zero(&pvcc->inflight))
+ return 1;
+
+ return 0;
+}
/*
* Called by the ppp_generic.c to send a packet - returns true if packet
* was accepted. If we return false, then it's our job to call
@@ -201,47 +289,73 @@ static void pppoatm_push(struct atm_vcc *atmvcc, struct sk_buff *skb)
static int pppoatm_send(struct ppp_channel *chan, struct sk_buff *skb)
{
struct pppoatm_vcc *pvcc = chan_to_pvcc(chan);
+ struct atm_vcc *vcc;
+ int ret;
+
ATM_SKB(skb)->vcc = pvcc->atmvcc;
- DPRINTK("(unit %d): pppoatm_send (skb=0x%p, vcc=0x%p)\n",
- pvcc->chan.unit, skb, pvcc->atmvcc);
+ pr_debug("(skb=0x%p, vcc=0x%p)\n", skb, pvcc->atmvcc);
if (skb->data[0] == '\0' && (pvcc->flags & SC_COMP_PROT))
(void) skb_pull(skb, 1);
+
+ vcc = ATM_SKB(skb)->vcc;
+ bh_lock_sock(sk_atm(vcc));
+ if (sock_owned_by_user(sk_atm(vcc))) {
+ /*
+ * Needs to happen (and be flushed, hence test_and_) before we unlock
+ * the socket. It needs to be seen by the time our ->release_cb gets
+ * called.
+ */
+ test_and_set_bit(BLOCKED, &pvcc->blocked);
+ goto nospace;
+ }
+ if (test_bit(ATM_VF_RELEASED, &vcc->flags) ||
+ test_bit(ATM_VF_CLOSE, &vcc->flags) ||
+ !test_bit(ATM_VF_READY, &vcc->flags)) {
+ bh_unlock_sock(sk_atm(vcc));
+ kfree_skb(skb);
+ return DROP_PACKET;
+ }
+
switch (pvcc->encaps) { /* LLC encapsulation needed */
case e_llc:
if (skb_headroom(skb) < LLC_LEN) {
struct sk_buff *n;
n = skb_realloc_headroom(skb, LLC_LEN);
if (n != NULL &&
- !atm_may_send(pvcc->atmvcc, n->truesize)) {
+ !pppoatm_may_send(pvcc, n->truesize)) {
kfree_skb(n);
goto nospace;
}
- kfree_skb(skb);
- if ((skb = n) == NULL)
+ consume_skb(skb);
+ skb = n;
+ if (skb == NULL) {
+ bh_unlock_sock(sk_atm(vcc));
return DROP_PACKET;
- } else if (!atm_may_send(pvcc->atmvcc, skb->truesize))
+ }
+ } else if (!pppoatm_may_send(pvcc, skb->truesize))
goto nospace;
memcpy(skb_push(skb, LLC_LEN), pppllc, LLC_LEN);
break;
case e_vc:
- if (!atm_may_send(pvcc->atmvcc, skb->truesize))
+ if (!pppoatm_may_send(pvcc, skb->truesize))
goto nospace;
break;
case e_autodetect:
- DPRINTK("(unit %d): Trying to send without setting encaps!\n",
- pvcc->chan.unit);
+ bh_unlock_sock(sk_atm(vcc));
+ pr_debug("Trying to send without setting encaps!\n");
kfree_skb(skb);
return 1;
}
- atomic_add(skb->truesize, &sk_atm(ATM_SKB(skb)->vcc)->sk_wmem_alloc);
- ATM_SKB(skb)->atm_options = ATM_SKB(skb)->vcc->atm_options;
- DPRINTK("(unit %d): atm_skb(%p)->vcc(%p)->dev(%p)\n",
- pvcc->chan.unit, skb, ATM_SKB(skb)->vcc,
- ATM_SKB(skb)->vcc->dev);
- return ATM_SKB(skb)->vcc->send(ATM_SKB(skb)->vcc, skb)
+ atm_account_tx(vcc, skb);
+ pr_debug("atm_skb(%p)->vcc(%p)->dev(%p)\n",
+ skb, ATM_SKB(skb)->vcc, ATM_SKB(skb)->vcc->dev);
+ ret = ATM_SKB(skb)->vcc->send(ATM_SKB(skb)->vcc, skb)
? DROP_PACKET : 1;
- nospace:
+ bh_unlock_sock(sk_atm(vcc));
+ return ret;
+nospace:
+ bh_unlock_sock(sk_atm(vcc));
/*
* We don't have space to send this SKB now, but we might have
* already applied SC_COMP_PROT compression, so may need to undo
@@ -267,7 +381,7 @@ static int pppoatm_devppp_ioctl(struct ppp_channel *chan, unsigned int cmd,
return -ENOTTY;
}
-static /*const*/ struct ppp_channel_ops pppoatm_ops = {
+static const struct ppp_channel_ops pppoatm_ops = {
.start_xmit = pppoatm_send,
.ioctl = pppoatm_devppp_ioctl,
};
@@ -277,11 +391,7 @@ static int pppoatm_assign_vcc(struct atm_vcc *atmvcc, void __user *arg)
struct atm_backend_ppp be;
struct pppoatm_vcc *pvcc;
int err;
- /*
- * Each PPPoATM instance has its own tasklet - this is just a
- * prototypical one used to initialize them
- */
- static const DECLARE_TASKLET(tasklet_proto, pppoatm_wakeup_sender, 0);
+
if (copy_from_user(&be, arg, sizeof be))
return -EFAULT;
if (be.encaps != PPPOATM_ENCAPS_AUTODETECT &&
@@ -291,23 +401,34 @@ static int pppoatm_assign_vcc(struct atm_vcc *atmvcc, void __user *arg)
if (pvcc == NULL)
return -ENOMEM;
pvcc->atmvcc = atmvcc;
+
+ /* Maximum is zero, so that we can use atomic_inc_not_zero() */
+ atomic_set(&pvcc->inflight, NONE_INFLIGHT);
pvcc->old_push = atmvcc->push;
pvcc->old_pop = atmvcc->pop;
+ pvcc->old_owner = atmvcc->owner;
+ pvcc->old_release_cb = atmvcc->release_cb;
pvcc->encaps = (enum pppoatm_encaps) be.encaps;
pvcc->chan.private = pvcc;
pvcc->chan.ops = &pppoatm_ops;
pvcc->chan.mtu = atmvcc->qos.txtp.max_sdu - PPP_HDRLEN -
(be.encaps == e_vc ? 0 : LLC_LEN);
- pvcc->wakeup_tasklet = tasklet_proto;
- pvcc->wakeup_tasklet.data = (unsigned long) &pvcc->chan;
- if ((err = ppp_register_channel(&pvcc->chan)) != 0) {
+ tasklet_setup(&pvcc->wakeup_tasklet, pppoatm_wakeup_sender);
+ err = ppp_register_channel(&pvcc->chan);
+ if (err != 0) {
kfree(pvcc);
return err;
}
atmvcc->user_back = pvcc;
atmvcc->push = pppoatm_push;
atmvcc->pop = pppoatm_pop;
+ atmvcc->release_cb = pppoatm_release_cb;
__module_get(THIS_MODULE);
+ atmvcc->owner = THIS_MODULE;
+
+ /* re-process everything received between connection setup and
+ backend setup */
+ vcc_process_recv_queue(atmvcc);
return 0;
}
@@ -332,6 +453,8 @@ static int pppoatm_ioctl(struct socket *sock, unsigned int cmd,
return -ENOIOCTLCMD;
if (!capable(CAP_NET_ADMIN))
return -EPERM;
+ if (sock->state != SS_CONNECTED)
+ return -EINVAL;
return pppoatm_assign_vcc(atmvcc, argp);
}
case PPPIOCGCHAN:
diff --git a/net/atm/proc.c b/net/atm/proc.c
index 91fe5f53ff11..9bf736290e48 100644
--- a/net/atm/proc.c
+++ b/net/atm/proc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/* net/atm/proc.c - ATM /proc interface
*
* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA
@@ -22,29 +23,31 @@
#include <linux/netdevice.h>
#include <linux/atmclip.h>
#include <linux/init.h> /* for __init */
+#include <linux/slab.h>
+#include <net/net_namespace.h>
#include <net/atmclip.h>
-#include <asm/uaccess.h>
-#include <asm/atomic.h>
-#include <asm/param.h> /* for HZ */
+#include <linux/uaccess.h>
+#include <linux/param.h> /* for HZ */
+#include <linux/atomic.h>
#include "resources.h"
#include "common.h" /* atm_proc_init prototype */
#include "signaling.h" /* to get sigd - ugly too */
-static ssize_t proc_dev_atm_read(struct file *file,char __user *buf,size_t count,
- loff_t *pos);
+static ssize_t proc_dev_atm_read(struct file *file, char __user *buf,
+ size_t count, loff_t *pos);
-static struct file_operations proc_atm_dev_ops = {
- .owner = THIS_MODULE,
- .read = proc_dev_atm_read,
+static const struct proc_ops atm_dev_proc_ops = {
+ .proc_read = proc_dev_atm_read,
+ .proc_lseek = noop_llseek,
};
static void add_stats(struct seq_file *seq, const char *aal,
const struct k_atm_aal_stats *stats)
{
seq_printf(seq, "%s ( %d %d %d %d %d )", aal,
- atomic_read(&stats->tx),atomic_read(&stats->tx_err),
- atomic_read(&stats->rx),atomic_read(&stats->rx_err),
- atomic_read(&stats->rx_drop));
+ atomic_read(&stats->tx), atomic_read(&stats->tx_err),
+ atomic_read(&stats->rx), atomic_read(&stats->rx_err),
+ atomic_read(&stats->rx_drop));
}
static void atm_dev_info(struct seq_file *seq, const struct atm_dev *dev)
@@ -58,14 +61,13 @@ static void atm_dev_info(struct seq_file *seq, const struct atm_dev *dev)
add_stats(seq, "0", &dev->stats.aal0);
seq_puts(seq, " ");
add_stats(seq, "5", &dev->stats.aal5);
- seq_printf(seq, "\t[%d]", atomic_read(&dev->refcnt));
+ seq_printf(seq, "\t[%d]", refcount_read(&dev->refcnt));
seq_putc(seq, '\n');
}
struct vcc_state {
int bucket;
struct sock *sk;
- int family;
};
static inline int compare_family(struct sock *sk, int family)
@@ -77,7 +79,7 @@ static int __vcc_walk(struct sock **sock, int family, int *bucket, loff_t l)
{
struct sock *sk = *sock;
- if (sk == (void *)1) {
+ if (sk == SEQ_START_TOKEN) {
for (*bucket = 0; *bucket < VCC_HTABLE_SIZE; ++*bucket) {
struct hlist_head *head = &vcc_hash[*bucket];
@@ -86,7 +88,7 @@ static int __vcc_walk(struct sock **sock, int family, int *bucket, loff_t l)
break;
}
l--;
- }
+ }
try_again:
for (; sk; sk = sk_next(sk)) {
l -= compare_family(sk, family);
@@ -97,88 +99,62 @@ try_again:
sk = sk_head(&vcc_hash[*bucket]);
goto try_again;
}
- sk = (void *)1;
+ sk = SEQ_START_TOKEN;
out:
*sock = sk;
return (l < 0);
}
-static inline void *vcc_walk(struct vcc_state *state, loff_t l)
+static inline void *vcc_walk(struct seq_file *seq, loff_t l)
{
- return __vcc_walk(&state->sk, state->family, &state->bucket, l) ?
- state : NULL;
-}
-
-static int __vcc_seq_open(struct inode *inode, struct file *file,
- int family, struct seq_operations *ops)
-{
- struct vcc_state *state;
- struct seq_file *seq;
- int rc = -ENOMEM;
-
- state = kmalloc(sizeof(*state), GFP_KERNEL);
- if (!state)
- goto out;
-
- rc = seq_open(file, ops);
- if (rc)
- goto out_kfree;
-
- state->family = family;
-
- seq = file->private_data;
- seq->private = state;
-out:
- return rc;
-out_kfree:
- kfree(state);
- goto out;
-}
+ struct vcc_state *state = seq->private;
+ int family = (uintptr_t)(pde_data(file_inode(seq->file)));
-static int vcc_seq_release(struct inode *inode, struct file *file)
-{
- return seq_release_private(inode, file);
+ return __vcc_walk(&state->sk, family, &state->bucket, l) ?
+ state : NULL;
}
static void *vcc_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(vcc_sklist_lock)
{
struct vcc_state *state = seq->private;
loff_t left = *pos;
read_lock(&vcc_sklist_lock);
- state->sk = (void *)1;
- return left ? vcc_walk(state, left) : (void *)1;
+ state->sk = SEQ_START_TOKEN;
+ return left ? vcc_walk(seq, left) : SEQ_START_TOKEN;
}
static void vcc_seq_stop(struct seq_file *seq, void *v)
+ __releases(vcc_sklist_lock)
{
read_unlock(&vcc_sklist_lock);
}
static void *vcc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
- struct vcc_state *state = seq->private;
-
- v = vcc_walk(state, 1);
- *pos += !!PTR_ERR(v);
+ v = vcc_walk(seq, 1);
+ (*pos)++;
return v;
}
static void pvc_info(struct seq_file *seq, struct atm_vcc *vcc)
{
- static const char *class_name[] = { "off","UBR","CBR","VBR","ABR" };
- static const char *aal_name[] = {
+ static const char *const class_name[] = {
+ "off", "UBR", "CBR", "VBR", "ABR"};
+ static const char *const aal_name[] = {
"---", "1", "2", "3/4", /* 0- 3 */
"???", "5", "???", "???", /* 4- 7 */
"???", "???", "???", "???", /* 8-11 */
"???", "0", "???", "???"}; /* 12-15 */
seq_printf(seq, "%3d %3d %5d %-3s %7d %-5s %7d %-6s",
- vcc->dev->number,vcc->vpi,vcc->vci,
- vcc->qos.aal >= sizeof(aal_name)/sizeof(aal_name[0]) ? "err" :
- aal_name[vcc->qos.aal],vcc->qos.rxtp.min_pcr,
- class_name[vcc->qos.rxtp.traffic_class],vcc->qos.txtp.min_pcr,
- class_name[vcc->qos.txtp.traffic_class]);
+ vcc->dev->number, vcc->vpi, vcc->vci,
+ vcc->qos.aal >= ARRAY_SIZE(aal_name) ? "err" :
+ aal_name[vcc->qos.aal], vcc->qos.rxtp.min_pcr,
+ class_name[vcc->qos.rxtp.traffic_class],
+ vcc->qos.txtp.min_pcr,
+ class_name[vcc->qos.txtp.traffic_class]);
if (test_bit(ATM_VF_IS_CLIP, &vcc->flags)) {
struct clip_vcc *clip_vcc = CLIP_VCC(vcc);
struct net_device *dev;
@@ -193,7 +169,7 @@ static void pvc_info(struct seq_file *seq, struct atm_vcc *vcc)
static const char *vcc_state(struct atm_vcc *vcc)
{
- static const char *map[] = { ATM_VS2TXT_MAP };
+ static const char *const map[] = { ATM_VS2TXT_MAP };
return map[ATM_VF2VS(vcc->flags)];
}
@@ -202,33 +178,34 @@ static void vcc_info(struct seq_file *seq, struct atm_vcc *vcc)
{
struct sock *sk = sk_atm(vcc);
- seq_printf(seq, "%p ", vcc);
+ seq_printf(seq, "%pK ", vcc);
if (!vcc->dev)
seq_printf(seq, "Unassigned ");
- else
+ else
seq_printf(seq, "%3d %3d %5d ", vcc->dev->number, vcc->vpi,
vcc->vci);
switch (sk->sk_family) {
- case AF_ATMPVC:
- seq_printf(seq, "PVC");
- break;
- case AF_ATMSVC:
- seq_printf(seq, "SVC");
- break;
- default:
- seq_printf(seq, "%3d", sk->sk_family);
+ case AF_ATMPVC:
+ seq_printf(seq, "PVC");
+ break;
+ case AF_ATMSVC:
+ seq_printf(seq, "SVC");
+ break;
+ default:
+ seq_printf(seq, "%3d", sk->sk_family);
}
- seq_printf(seq, " %04lx %5d %7d/%7d %7d/%7d [%d]\n", vcc->flags, sk->sk_err,
- atomic_read(&sk->sk_wmem_alloc), sk->sk_sndbuf,
- atomic_read(&sk->sk_rmem_alloc), sk->sk_rcvbuf,
- atomic_read(&sk->sk_refcnt));
+ seq_printf(seq, " %04lx %5d %7d/%7d %7d/%7d [%d]\n",
+ vcc->flags, sk->sk_err,
+ sk_wmem_alloc_get(sk), sk->sk_sndbuf,
+ sk_rmem_alloc_get(sk), sk->sk_rcvbuf,
+ refcount_read(&sk->sk_refcnt));
}
static void svc_info(struct seq_file *seq, struct atm_vcc *vcc)
{
if (!vcc->dev)
seq_printf(seq, sizeof(void *) == 4 ?
- "N/A@%p%10s" : "N/A@%p%2s", vcc, "");
+ "N/A@%pK%10s" : "N/A@%pK%2s", vcc, "");
else
seq_printf(seq, "%3d %3d %5d ",
vcc->dev->number, vcc->vpi, vcc->vci);
@@ -249,42 +226,30 @@ static int atm_dev_seq_show(struct seq_file *seq, void *v)
static char atm_dev_banner[] =
"Itf Type ESI/\"MAC\"addr "
"AAL(TX,err,RX,err,drop) ... [refcnt]\n";
-
- if (v == (void *)1)
+
+ if (v == &atm_devs)
seq_puts(seq, atm_dev_banner);
else {
struct atm_dev *dev = list_entry(v, struct atm_dev, dev_list);
atm_dev_info(seq, dev);
}
- return 0;
+ return 0;
}
-
-static struct seq_operations atm_dev_seq_ops = {
+
+static const struct seq_operations atm_dev_seq_ops = {
.start = atm_dev_seq_start,
.next = atm_dev_seq_next,
.stop = atm_dev_seq_stop,
.show = atm_dev_seq_show,
};
-
-static int atm_dev_seq_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &atm_dev_seq_ops);
-}
-
-static struct file_operations devices_seq_fops = {
- .open = atm_dev_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
static int pvc_seq_show(struct seq_file *seq, void *v)
{
- static char atm_pvc_banner[] =
+ static char atm_pvc_banner[] =
"Itf VPI VCI AAL RX(PCR,Class) TX(PCR,Class)\n";
- if (v == (void *)1)
+ if (v == SEQ_START_TOKEN)
seq_puts(seq, atm_pvc_banner);
else {
struct vcc_state *state = seq->private;
@@ -295,65 +260,41 @@ static int pvc_seq_show(struct seq_file *seq, void *v)
return 0;
}
-static struct seq_operations pvc_seq_ops = {
+static const struct seq_operations pvc_seq_ops = {
.start = vcc_seq_start,
.next = vcc_seq_next,
.stop = vcc_seq_stop,
.show = pvc_seq_show,
};
-static int pvc_seq_open(struct inode *inode, struct file *file)
-{
- return __vcc_seq_open(inode, file, PF_ATMPVC, &pvc_seq_ops);
-}
-
-static struct file_operations pvc_seq_fops = {
- .open = pvc_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = vcc_seq_release,
-};
-
static int vcc_seq_show(struct seq_file *seq, void *v)
{
- if (v == (void *)1) {
- seq_printf(seq, sizeof(void *) == 4 ? "%-8s%s" : "%-16s%s",
- "Address ", "Itf VPI VCI Fam Flags Reply "
- "Send buffer Recv buffer [refcnt]\n");
- } else {
- struct vcc_state *state = seq->private;
- struct atm_vcc *vcc = atm_sk(state->sk);
-
- vcc_info(seq, vcc);
- }
- return 0;
-}
-
-static struct seq_operations vcc_seq_ops = {
- .start = vcc_seq_start,
- .next = vcc_seq_next,
- .stop = vcc_seq_stop,
- .show = vcc_seq_show,
-};
-
-static int vcc_seq_open(struct inode *inode, struct file *file)
-{
- return __vcc_seq_open(inode, file, 0, &vcc_seq_ops);
+ if (v == SEQ_START_TOKEN) {
+ seq_printf(seq, sizeof(void *) == 4 ? "%-8s%s" : "%-16s%s",
+ "Address ", "Itf VPI VCI Fam Flags Reply "
+ "Send buffer Recv buffer [refcnt]\n");
+ } else {
+ struct vcc_state *state = seq->private;
+ struct atm_vcc *vcc = atm_sk(state->sk);
+
+ vcc_info(seq, vcc);
+ }
+ return 0;
}
-
-static struct file_operations vcc_seq_fops = {
- .open = vcc_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = vcc_seq_release,
+
+static const struct seq_operations vcc_seq_ops = {
+ .start = vcc_seq_start,
+ .next = vcc_seq_next,
+ .stop = vcc_seq_stop,
+ .show = vcc_seq_show,
};
static int svc_seq_show(struct seq_file *seq, void *v)
{
- static char atm_svc_banner[] =
+ static const char atm_svc_banner[] =
"Itf VPI VCI State Remote\n";
- if (v == (void *)1)
+ if (v == SEQ_START_TOKEN)
seq_puts(seq, atm_svc_banner);
else {
struct vcc_state *state = seq->private;
@@ -364,25 +305,13 @@ static int svc_seq_show(struct seq_file *seq, void *v)
return 0;
}
-static struct seq_operations svc_seq_ops = {
+static const struct seq_operations svc_seq_ops = {
.start = vcc_seq_start,
.next = vcc_seq_next,
.stop = vcc_seq_stop,
.show = svc_seq_show,
};
-static int svc_seq_open(struct inode *inode, struct file *file)
-{
- return __vcc_seq_open(inode, file, PF_ATMSVC, &svc_seq_ops);
-}
-
-static struct file_operations svc_seq_fops = {
- .open = svc_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = vcc_seq_release,
-};
-
static ssize_t proc_dev_atm_read(struct file *file, char __user *buf,
size_t count, loff_t *pos)
{
@@ -390,32 +319,34 @@ static ssize_t proc_dev_atm_read(struct file *file, char __user *buf,
unsigned long page;
int length;
- if (count == 0) return 0;
+ if (count == 0)
+ return 0;
page = get_zeroed_page(GFP_KERNEL);
- if (!page) return -ENOMEM;
- dev = PDE(file->f_dentry->d_inode)->data;
+ if (!page)
+ return -ENOMEM;
+ dev = pde_data(file_inode(file));
if (!dev->ops->proc_read)
length = -EINVAL;
else {
- length = dev->ops->proc_read(dev,pos,(char *) page);
- if (length > count) length = -EINVAL;
+ length = dev->ops->proc_read(dev, pos, (char *)page);
+ if (length > count)
+ length = -EINVAL;
}
if (length >= 0) {
- if (copy_to_user(buf,(char *) page,length)) length = -EFAULT;
+ if (copy_to_user(buf, (char *)page, length))
+ length = -EFAULT;
(*pos)++;
}
free_page(page);
return length;
}
-
struct proc_dir_entry *atm_proc_root;
EXPORT_SYMBOL(atm_proc_root);
int atm_proc_dev_register(struct atm_dev *dev)
{
- int digits,num;
int error;
/* No proc info */
@@ -423,29 +354,22 @@ int atm_proc_dev_register(struct atm_dev *dev)
return 0;
error = -ENOMEM;
- digits = 0;
- for (num = dev->number; num; num /= 10) digits++;
- if (!digits) digits++;
-
- dev->proc_name = kmalloc(strlen(dev->type) + digits + 2, GFP_KERNEL);
+ dev->proc_name = kasprintf(GFP_KERNEL, "%s:%d", dev->type, dev->number);
if (!dev->proc_name)
goto err_out;
- sprintf(dev->proc_name,"%s:%d",dev->type, dev->number);
- dev->proc_entry = create_proc_entry(dev->proc_name, 0, atm_proc_root);
+ dev->proc_entry = proc_create_data(dev->proc_name, 0, atm_proc_root,
+ &atm_dev_proc_ops, dev);
if (!dev->proc_entry)
goto err_free_name;
- dev->proc_entry->data = dev;
- dev->proc_entry->proc_fops = &proc_atm_dev_ops;
- dev->proc_entry->owner = THIS_MODULE;
return 0;
+
err_free_name:
kfree(dev->proc_name);
err_out:
return error;
}
-
void atm_proc_dev_deregister(struct atm_dev *dev)
{
if (!dev->ops->proc_read)
@@ -455,59 +379,22 @@ void atm_proc_dev_deregister(struct atm_dev *dev)
kfree(dev->proc_name);
}
-static struct atm_proc_entry {
- char *name;
- struct file_operations *proc_fops;
- struct proc_dir_entry *dirent;
-} atm_proc_ents[] = {
- { .name = "devices", .proc_fops = &devices_seq_fops },
- { .name = "pvc", .proc_fops = &pvc_seq_fops },
- { .name = "svc", .proc_fops = &svc_seq_fops },
- { .name = "vc", .proc_fops = &vcc_seq_fops },
- { .name = NULL, .proc_fops = NULL }
-};
-
-static void atm_proc_dirs_remove(void)
-{
- static struct atm_proc_entry *e;
-
- for (e = atm_proc_ents; e->name; e++) {
- if (e->dirent)
- remove_proc_entry(e->name, atm_proc_root);
- }
- remove_proc_entry("net/atm", NULL);
-}
-
int __init atm_proc_init(void)
{
- static struct atm_proc_entry *e;
- int ret;
-
- atm_proc_root = proc_mkdir("net/atm",NULL);
+ atm_proc_root = proc_net_mkdir(&init_net, "atm", init_net.proc_net);
if (!atm_proc_root)
- goto err_out;
- for (e = atm_proc_ents; e->name; e++) {
- struct proc_dir_entry *dirent;
-
- dirent = create_proc_entry(e->name, S_IRUGO, atm_proc_root);
- if (!dirent)
- goto err_out_remove;
- dirent->proc_fops = e->proc_fops;
- dirent->owner = THIS_MODULE;
- e->dirent = dirent;
- }
- ret = 0;
-out:
- return ret;
-
-err_out_remove:
- atm_proc_dirs_remove();
-err_out:
- ret = -ENOMEM;
- goto out;
+ return -ENOMEM;
+ proc_create_seq("devices", 0444, atm_proc_root, &atm_dev_seq_ops);
+ proc_create_seq_private("pvc", 0444, atm_proc_root, &pvc_seq_ops,
+ sizeof(struct vcc_state), (void *)(uintptr_t)PF_ATMPVC);
+ proc_create_seq_private("svc", 0444, atm_proc_root, &svc_seq_ops,
+ sizeof(struct vcc_state), (void *)(uintptr_t)PF_ATMSVC);
+ proc_create_seq_private("vc", 0444, atm_proc_root, &vcc_seq_ops,
+ sizeof(struct vcc_state), NULL);
+ return 0;
}
void atm_proc_exit(void)
{
- atm_proc_dirs_remove();
+ remove_proc_subtree("atm", init_net.proc_net);
}
diff --git a/net/atm/protocols.h b/net/atm/protocols.h
index acdfc856222d..18d4d008bac3 100644
--- a/net/atm/protocols.h
+++ b/net/atm/protocols.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/* net/atm/protocols.h - ATM protocol handler entry points */
/* Written 1995-1997 by Werner Almesberger, EPFL LRC */
diff --git a/net/atm/pvc.c b/net/atm/pvc.c
index b2148b43a426..8f5e76f5dd9e 100644
--- a/net/atm/pvc.c
+++ b/net/atm/pvc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/* net/atm/pvc.c - ATM PVC sockets */
/* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */
@@ -11,38 +12,42 @@
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/bitops.h>
+#include <linux/export.h>
#include <net/sock.h> /* for sock_no_* */
#include "resources.h" /* devs and vccs */
#include "common.h" /* common for PVCs and SVCs */
-static int pvc_shutdown(struct socket *sock,int how)
+static int pvc_shutdown(struct socket *sock, int how)
{
return 0;
}
-
-static int pvc_bind(struct socket *sock,struct sockaddr *sockaddr,
- int sockaddr_len)
+static int pvc_bind(struct socket *sock, struct sockaddr_unsized *sockaddr,
+ int sockaddr_len)
{
struct sock *sk = sock->sk;
struct sockaddr_atmpvc *addr;
struct atm_vcc *vcc;
int error;
- if (sockaddr_len != sizeof(struct sockaddr_atmpvc)) return -EINVAL;
- addr = (struct sockaddr_atmpvc *) sockaddr;
- if (addr->sap_family != AF_ATMPVC) return -EAFNOSUPPORT;
+ if (sockaddr_len != sizeof(struct sockaddr_atmpvc))
+ return -EINVAL;
+ addr = (struct sockaddr_atmpvc *)sockaddr;
+ if (addr->sap_family != AF_ATMPVC)
+ return -EAFNOSUPPORT;
lock_sock(sk);
vcc = ATM_SD(sock);
if (!test_bit(ATM_VF_HASQOS, &vcc->flags)) {
error = -EBADFD;
goto out;
}
- if (test_bit(ATM_VF_PARTIAL,&vcc->flags)) {
- if (vcc->vpi != ATM_VPI_UNSPEC) addr->sap_addr.vpi = vcc->vpi;
- if (vcc->vci != ATM_VCI_UNSPEC) addr->sap_addr.vci = vcc->vci;
+ if (test_bit(ATM_VF_PARTIAL, &vcc->flags)) {
+ if (vcc->vpi != ATM_VPI_UNSPEC)
+ addr->sap_addr.vpi = vcc->vpi;
+ if (vcc->vci != ATM_VCI_UNSPEC)
+ addr->sap_addr.vci = vcc->vci;
}
error = vcc_connect(sock, addr->sap_addr.itf, addr->sap_addr.vpi,
addr->sap_addr.vci);
@@ -51,15 +56,14 @@ out:
return error;
}
-
-static int pvc_connect(struct socket *sock,struct sockaddr *sockaddr,
- int sockaddr_len,int flags)
+static int pvc_connect(struct socket *sock, struct sockaddr_unsized *sockaddr,
+ int sockaddr_len, int flags)
{
- return pvc_bind(sock,sockaddr,sockaddr_len);
+ return pvc_bind(sock, sockaddr, sockaddr_len);
}
static int pvc_setsockopt(struct socket *sock, int level, int optname,
- char __user *optval, int optlen)
+ sockptr_t optval, unsigned int optlen)
{
struct sock *sk = sock->sk;
int error;
@@ -70,9 +74,8 @@ static int pvc_setsockopt(struct socket *sock, int level, int optname,
return error;
}
-
static int pvc_getsockopt(struct socket *sock, int level, int optname,
- char __user *optval, int __user *optlen)
+ char __user *optval, int __user *optlen)
{
struct sock *sk = sock->sk;
int error;
@@ -83,24 +86,23 @@ static int pvc_getsockopt(struct socket *sock, int level, int optname,
return error;
}
-
-static int pvc_getname(struct socket *sock,struct sockaddr *sockaddr,
- int *sockaddr_len,int peer)
+static int pvc_getname(struct socket *sock, struct sockaddr *sockaddr,
+ int peer)
{
struct sockaddr_atmpvc *addr;
struct atm_vcc *vcc = ATM_SD(sock);
- if (!vcc->dev || !test_bit(ATM_VF_ADDR,&vcc->flags)) return -ENOTCONN;
- *sockaddr_len = sizeof(struct sockaddr_atmpvc);
- addr = (struct sockaddr_atmpvc *) sockaddr;
+ if (!vcc->dev || !test_bit(ATM_VF_ADDR, &vcc->flags))
+ return -ENOTCONN;
+ addr = (struct sockaddr_atmpvc *)sockaddr;
+ memset(addr, 0, sizeof(*addr));
addr->sap_family = AF_ATMPVC;
addr->sap_addr.itf = vcc->dev->number;
addr->sap_addr.vpi = vcc->vpi;
addr->sap_addr.vci = vcc->vci;
- return 0;
+ return sizeof(struct sockaddr_atmpvc);
}
-
static const struct proto_ops pvc_proto_ops = {
.family = PF_ATMPVC,
.owner = THIS_MODULE,
@@ -113,6 +115,10 @@ static const struct proto_ops pvc_proto_ops = {
.getname = pvc_getname,
.poll = vcc_poll,
.ioctl = vcc_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = vcc_compat_ioctl,
+#endif
+ .gettstamp = sock_gettstamp,
.listen = sock_no_listen,
.shutdown = pvc_shutdown,
.setsockopt = pvc_setsockopt,
@@ -120,18 +126,20 @@ static const struct proto_ops pvc_proto_ops = {
.sendmsg = vcc_sendmsg,
.recvmsg = vcc_recvmsg,
.mmap = sock_no_mmap,
- .sendpage = sock_no_sendpage,
};
-static int pvc_create(struct socket *sock,int protocol)
+static int pvc_create(struct net *net, struct socket *sock, int protocol,
+ int kern)
{
+ if (net != &init_net)
+ return -EAFNOSUPPORT;
+
sock->ops = &pvc_proto_ops;
- return vcc_create(sock, protocol, PF_ATMPVC);
+ return vcc_create(net, sock, protocol, PF_ATMPVC, kern);
}
-
-static struct net_proto_family pvc_family_ops = {
+static const struct net_proto_family pvc_family_ops = {
.family = PF_ATMPVC,
.create = pvc_create,
.owner = THIS_MODULE,
diff --git a/net/atm/raw.c b/net/atm/raw.c
index 3e57b17ca523..1e6511ec842c 100644
--- a/net/atm/raw.c
+++ b/net/atm/raw.c
@@ -1,71 +1,64 @@
+// SPDX-License-Identifier: GPL-2.0
/* net/atm/raw.c - Raw AAL0 and AAL5 transports */
/* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__
#include <linux/module.h>
-#include <linux/sched.h>
#include <linux/atmdev.h>
#include <linux/capability.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/mm.h>
+#include <linux/slab.h>
#include "common.h"
#include "protocols.h"
-
-#if 0
-#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args)
-#else
-#define DPRINTK(format,args...)
-#endif
-
-
/*
* SKB == NULL indicates that the link is being closed
*/
-static void atm_push_raw(struct atm_vcc *vcc,struct sk_buff *skb)
+static void atm_push_raw(struct atm_vcc *vcc, struct sk_buff *skb)
{
if (skb) {
struct sock *sk = sk_atm(vcc);
skb_queue_tail(&sk->sk_receive_queue, skb);
- sk->sk_data_ready(sk, skb->len);
+ sk->sk_data_ready(sk);
}
}
-
-static void atm_pop_raw(struct atm_vcc *vcc,struct sk_buff *skb)
+static void atm_pop_raw(struct atm_vcc *vcc, struct sk_buff *skb)
{
struct sock *sk = sk_atm(vcc);
- DPRINTK("APopR (%d) %d -= %d\n", vcc->vci, sk->sk_wmem_alloc,
- skb->truesize);
- atomic_sub(skb->truesize, &sk->sk_wmem_alloc);
+ pr_debug("(%d) %d -= %d\n",
+ vcc->vci, sk_wmem_alloc_get(sk), ATM_SKB(skb)->acct_truesize);
+ atm_return_tx(vcc, skb);
dev_kfree_skb_any(skb);
sk->sk_write_space(sk);
}
-
-static int atm_send_aal0(struct atm_vcc *vcc,struct sk_buff *skb)
+static int atm_send_aal0(struct atm_vcc *vcc, struct sk_buff *skb)
{
/*
* Note that if vpi/vci are _ANY or _UNSPEC the below will
* still work
*/
if (!capable(CAP_NET_ADMIN) &&
- (((u32 *) skb->data)[0] & (ATM_HDR_VPI_MASK | ATM_HDR_VCI_MASK)) !=
- ((vcc->vpi << ATM_HDR_VPI_SHIFT) | (vcc->vci << ATM_HDR_VCI_SHIFT)))
- {
+ (((u32 *)skb->data)[0] & (ATM_HDR_VPI_MASK | ATM_HDR_VCI_MASK)) !=
+ ((vcc->vpi << ATM_HDR_VPI_SHIFT) |
+ (vcc->vci << ATM_HDR_VCI_SHIFT))) {
kfree_skb(skb);
return -EADDRNOTAVAIL;
- }
- return vcc->dev->ops->send(vcc,skb);
+ }
+ if (vcc->dev->ops->send_bh)
+ return vcc->dev->ops->send_bh(vcc, skb);
+ return vcc->dev->ops->send(vcc, skb);
}
-
int atm_init_aal0(struct atm_vcc *vcc)
{
vcc->push = atm_push_raw;
@@ -75,25 +68,27 @@ int atm_init_aal0(struct atm_vcc *vcc)
return 0;
}
-
int atm_init_aal34(struct atm_vcc *vcc)
{
vcc->push = atm_push_raw;
vcc->pop = atm_pop_raw;
vcc->push_oam = NULL;
- vcc->send = vcc->dev->ops->send;
+ if (vcc->dev->ops->send_bh)
+ vcc->send = vcc->dev->ops->send_bh;
+ else
+ vcc->send = vcc->dev->ops->send;
return 0;
}
-
int atm_init_aal5(struct atm_vcc *vcc)
{
vcc->push = atm_push_raw;
vcc->pop = atm_pop_raw;
vcc->push_oam = NULL;
- vcc->send = vcc->dev->ops->send;
+ if (vcc->dev->ops->send_bh)
+ vcc->send = vcc->dev->ops->send_bh;
+ else
+ vcc->send = vcc->dev->ops->send;
return 0;
}
-
-
EXPORT_SYMBOL(atm_init_aal5);
diff --git a/net/atm/resources.c b/net/atm/resources.c
index 529f7e64aa2c..7c6fdedbcf4e 100644
--- a/net/atm/resources.c
+++ b/net/atm/resources.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/* net/atm/resources.c - Statically allocated resources */
/* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */
@@ -7,6 +8,7 @@
* 2002/01 - don't free the whole struct sock on sk->destruct time,
* use the default destruct function initialized by sock_init_data */
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__
#include <linux/ctype.h>
#include <linux/string.h>
@@ -18,6 +20,7 @@
#include <linux/capability.h>
#include <linux/delay.h>
#include <linux/mutex.h>
+#include <linux/slab.h>
#include <net/sock.h> /* for struct sock */
@@ -49,10 +52,8 @@ static struct atm_dev *__alloc_atm_dev(const char *type)
static struct atm_dev *__atm_dev_lookup(int number)
{
struct atm_dev *dev;
- struct list_head *p;
- list_for_each(p, &atm_devs) {
- dev = list_entry(p, struct atm_dev, dev_list);
+ list_for_each_entry(dev, &atm_devs, dev_list) {
if (dev->number == number) {
atm_dev_hold(dev);
return dev;
@@ -70,22 +71,23 @@ struct atm_dev *atm_dev_lookup(int number)
mutex_unlock(&atm_dev_mutex);
return dev;
}
+EXPORT_SYMBOL(atm_dev_lookup);
-
-struct atm_dev *atm_dev_register(const char *type, const struct atmdev_ops *ops,
- int number, unsigned long *flags)
+struct atm_dev *atm_dev_register(const char *type, struct device *parent,
+ const struct atmdev_ops *ops, int number,
+ unsigned long *flags)
{
struct atm_dev *dev, *inuse;
dev = __alloc_atm_dev(type);
if (!dev) {
- printk(KERN_ERR "atm_dev_register: no space for dev %s\n",
- type);
+ pr_err("no space for dev %s\n", type);
return NULL;
}
mutex_lock(&atm_dev_mutex);
if (number != -1) {
- if ((inuse = __atm_dev_lookup(number))) {
+ inuse = __atm_dev_lookup(number);
+ if (inuse) {
atm_dev_put(inuse);
mutex_unlock(&atm_dev_mutex);
kfree(dev);
@@ -106,19 +108,17 @@ struct atm_dev *atm_dev_register(const char *type, const struct atmdev_ops *ops,
else
memset(&dev->flags, 0, sizeof(dev->flags));
memset(&dev->stats, 0, sizeof(dev->stats));
- atomic_set(&dev->refcnt, 1);
+ refcount_set(&dev->refcnt, 1);
if (atm_proc_dev_register(dev) < 0) {
- printk(KERN_ERR "atm_dev_register: "
- "atm_proc_dev_register failed for dev %s\n",
- type);
- goto out_fail;
+ pr_err("atm_proc_dev_register failed for dev %s\n", type);
+ mutex_unlock(&atm_dev_mutex);
+ kfree(dev);
+ return NULL;
}
- if (atm_register_sysfs(dev) < 0) {
- printk(KERN_ERR "atm_dev_register: "
- "atm_register_sysfs failed for dev %s\n",
- type);
+ if (atm_register_sysfs(dev, parent) < 0) {
+ pr_err("atm_register_sysfs failed for dev %s\n", type);
atm_proc_dev_deregister(dev);
goto out_fail;
}
@@ -130,11 +130,11 @@ out:
return dev;
out_fail:
- kfree(dev);
+ put_device(&dev->class_dev);
dev = NULL;
goto out;
}
-
+EXPORT_SYMBOL(atm_dev_register);
void atm_dev_deregister(struct atm_dev *dev)
{
@@ -142,21 +142,20 @@ void atm_dev_deregister(struct atm_dev *dev)
set_bit(ATM_DF_REMOVED, &dev->flags);
/*
- * if we remove current device from atm_devs list, new device
- * with same number can appear, such we need deregister proc,
+ * if we remove current device from atm_devs list, new device
+ * with same number can appear, such we need deregister proc,
* release async all vccs and remove them from vccs list too
*/
mutex_lock(&atm_dev_mutex);
list_del(&dev->dev_list);
- mutex_unlock(&atm_dev_mutex);
-
atm_dev_release_vccs(dev);
atm_unregister_sysfs(dev);
atm_proc_dev_deregister(dev);
+ mutex_unlock(&atm_dev_mutex);
atm_dev_put(dev);
}
-
+EXPORT_SYMBOL(atm_dev_deregister);
static void copy_aal_stats(struct k_atm_aal_stats *from,
struct atm_aal_stats *to)
@@ -166,7 +165,6 @@ static void copy_aal_stats(struct k_atm_aal_stats *from,
#undef __HANDLE_ITEM
}
-
static void subtract_aal_stats(struct k_atm_aal_stats *from,
struct atm_aal_stats *to)
{
@@ -175,8 +173,8 @@ static void subtract_aal_stats(struct k_atm_aal_stats *from,
#undef __HANDLE_ITEM
}
-
-static int fetch_stats(struct atm_dev *dev, struct atm_dev_stats __user *arg, int zero)
+static int fetch_stats(struct atm_dev *dev, struct atm_dev_stats __user *arg,
+ int zero)
{
struct atm_dev_stats tmp;
int error = 0;
@@ -194,206 +192,208 @@ static int fetch_stats(struct atm_dev *dev, struct atm_dev_stats __user *arg, in
return error ? -EFAULT : 0;
}
-
-int atm_dev_ioctl(unsigned int cmd, void __user *arg)
+int atm_getnames(void __user *buf, int __user *iobuf_len)
{
- void __user *buf;
- int error, len, number, size = 0;
+ int error, len, size = 0;
struct atm_dev *dev;
struct list_head *p;
int *tmp_buf, *tmp_p;
- struct atm_iobuf __user *iobuf = arg;
- struct atmif_sioc __user *sioc = arg;
- switch (cmd) {
- case ATM_GETNAMES:
- if (get_user(buf, &iobuf->buffer))
- return -EFAULT;
- if (get_user(len, &iobuf->length))
- return -EFAULT;
- mutex_lock(&atm_dev_mutex);
- list_for_each(p, &atm_devs)
- size += sizeof(int);
- if (size > len) {
- mutex_unlock(&atm_dev_mutex);
- return -E2BIG;
- }
- tmp_buf = kmalloc(size, GFP_ATOMIC);
- if (!tmp_buf) {
- mutex_unlock(&atm_dev_mutex);
- return -ENOMEM;
- }
- tmp_p = tmp_buf;
- list_for_each(p, &atm_devs) {
- dev = list_entry(p, struct atm_dev, dev_list);
- *tmp_p++ = dev->number;
- }
- mutex_unlock(&atm_dev_mutex);
- error = ((copy_to_user(buf, tmp_buf, size)) ||
- put_user(size, &iobuf->length))
- ? -EFAULT : 0;
- kfree(tmp_buf);
- return error;
- default:
- break;
- }
- if (get_user(buf, &sioc->arg))
+ if (get_user(len, iobuf_len))
return -EFAULT;
- if (get_user(len, &sioc->length))
- return -EFAULT;
- if (get_user(number, &sioc->number))
+ mutex_lock(&atm_dev_mutex);
+ list_for_each(p, &atm_devs)
+ size += sizeof(int);
+ if (size > len) {
+ mutex_unlock(&atm_dev_mutex);
+ return -E2BIG;
+ }
+ tmp_buf = kmalloc(size, GFP_ATOMIC);
+ if (!tmp_buf) {
+ mutex_unlock(&atm_dev_mutex);
+ return -ENOMEM;
+ }
+ tmp_p = tmp_buf;
+ list_for_each_entry(dev, &atm_devs, dev_list) {
+ *tmp_p++ = dev->number;
+ }
+ mutex_unlock(&atm_dev_mutex);
+ error = ((copy_to_user(buf, tmp_buf, size)) ||
+ put_user(size, iobuf_len))
+ ? -EFAULT : 0;
+ kfree(tmp_buf);
+ return error;
+}
+
+int atm_dev_ioctl(unsigned int cmd, void __user *buf, int __user *sioc_len,
+ int number, int compat)
+{
+ int error, len, size = 0;
+ struct atm_dev *dev;
+
+ if (get_user(len, sioc_len))
return -EFAULT;
- if (!(dev = try_then_request_module(atm_dev_lookup(number),
- "atm-device-%d", number)))
+ dev = try_then_request_module(atm_dev_lookup(number), "atm-device-%d",
+ number);
+ if (!dev)
return -ENODEV;
-
+
switch (cmd) {
- case ATM_GETTYPE:
- size = strlen(dev->type) + 1;
- if (copy_to_user(buf, dev->type, size)) {
- error = -EFAULT;
- goto done;
- }
- break;
- case ATM_GETESI:
- size = ESI_LEN;
- if (copy_to_user(buf, dev->esi, size)) {
- error = -EFAULT;
- goto done;
- }
- break;
- case ATM_SETESI:
- {
- int i;
-
- for (i = 0; i < ESI_LEN; i++)
- if (dev->esi[i]) {
- error = -EEXIST;
- goto done;
- }
- }
- /* fall through */
- case ATM_SETESIF:
- {
- unsigned char esi[ESI_LEN];
-
- if (!capable(CAP_NET_ADMIN)) {
- error = -EPERM;
- goto done;
- }
- if (copy_from_user(esi, buf, ESI_LEN)) {
- error = -EFAULT;
- goto done;
- }
- memcpy(dev->esi, esi, ESI_LEN);
- error = ESI_LEN;
- goto done;
- }
- case ATM_GETSTATZ:
- if (!capable(CAP_NET_ADMIN)) {
- error = -EPERM;
- goto done;
- }
- /* fall through */
- case ATM_GETSTAT:
- size = sizeof(struct atm_dev_stats);
- error = fetch_stats(dev, buf, cmd == ATM_GETSTATZ);
- if (error)
- goto done;
- break;
- case ATM_GETCIRANGE:
- size = sizeof(struct atm_cirange);
- if (copy_to_user(buf, &dev->ci_range, size)) {
- error = -EFAULT;
- goto done;
- }
- break;
- case ATM_GETLINKRATE:
- size = sizeof(int);
- if (copy_to_user(buf, &dev->link_rate, size)) {
- error = -EFAULT;
- goto done;
- }
- break;
- case ATM_RSTADDR:
- if (!capable(CAP_NET_ADMIN)) {
- error = -EPERM;
- goto done;
- }
- atm_reset_addr(dev, ATM_ADDR_LOCAL);
- break;
- case ATM_ADDADDR:
- case ATM_DELADDR:
- case ATM_ADDLECSADDR:
- case ATM_DELLECSADDR:
- if (!capable(CAP_NET_ADMIN)) {
- error = -EPERM;
- goto done;
- }
- {
- struct sockaddr_atmsvc addr;
-
- if (copy_from_user(&addr, buf, sizeof(addr))) {
- error = -EFAULT;
- goto done;
- }
- if (cmd == ATM_ADDADDR || cmd == ATM_ADDLECSADDR)
- error = atm_add_addr(dev, &addr,
- (cmd == ATM_ADDADDR ?
- ATM_ADDR_LOCAL : ATM_ADDR_LECS));
- else
- error = atm_del_addr(dev, &addr,
- (cmd == ATM_DELADDR ?
- ATM_ADDR_LOCAL : ATM_ADDR_LECS));
+ case ATM_GETTYPE:
+ size = strlen(dev->type) + 1;
+ if (copy_to_user(buf, dev->type, size)) {
+ error = -EFAULT;
+ goto done;
+ }
+ break;
+ case ATM_GETESI:
+ size = ESI_LEN;
+ if (copy_to_user(buf, dev->esi, size)) {
+ error = -EFAULT;
+ goto done;
+ }
+ break;
+ case ATM_SETESI:
+ {
+ int i;
+
+ for (i = 0; i < ESI_LEN; i++)
+ if (dev->esi[i]) {
+ error = -EEXIST;
goto done;
}
- case ATM_GETADDR:
- case ATM_GETLECSADDR:
- error = atm_get_addr(dev, buf, len,
- (cmd == ATM_GETADDR ?
+ }
+ fallthrough;
+ case ATM_SETESIF:
+ {
+ unsigned char esi[ESI_LEN];
+
+ if (!capable(CAP_NET_ADMIN)) {
+ error = -EPERM;
+ goto done;
+ }
+ if (copy_from_user(esi, buf, ESI_LEN)) {
+ error = -EFAULT;
+ goto done;
+ }
+ memcpy(dev->esi, esi, ESI_LEN);
+ error = ESI_LEN;
+ goto done;
+ }
+ case ATM_GETSTATZ:
+ if (!capable(CAP_NET_ADMIN)) {
+ error = -EPERM;
+ goto done;
+ }
+ fallthrough;
+ case ATM_GETSTAT:
+ size = sizeof(struct atm_dev_stats);
+ error = fetch_stats(dev, buf, cmd == ATM_GETSTATZ);
+ if (error)
+ goto done;
+ break;
+ case ATM_GETCIRANGE:
+ size = sizeof(struct atm_cirange);
+ if (copy_to_user(buf, &dev->ci_range, size)) {
+ error = -EFAULT;
+ goto done;
+ }
+ break;
+ case ATM_GETLINKRATE:
+ size = sizeof(int);
+ if (copy_to_user(buf, &dev->link_rate, size)) {
+ error = -EFAULT;
+ goto done;
+ }
+ break;
+ case ATM_RSTADDR:
+ if (!capable(CAP_NET_ADMIN)) {
+ error = -EPERM;
+ goto done;
+ }
+ atm_reset_addr(dev, ATM_ADDR_LOCAL);
+ break;
+ case ATM_ADDADDR:
+ case ATM_DELADDR:
+ case ATM_ADDLECSADDR:
+ case ATM_DELLECSADDR:
+ {
+ struct sockaddr_atmsvc addr;
+
+ if (!capable(CAP_NET_ADMIN)) {
+ error = -EPERM;
+ goto done;
+ }
+
+ if (copy_from_user(&addr, buf, sizeof(addr))) {
+ error = -EFAULT;
+ goto done;
+ }
+ if (cmd == ATM_ADDADDR || cmd == ATM_ADDLECSADDR)
+ error = atm_add_addr(dev, &addr,
+ (cmd == ATM_ADDADDR ?
ATM_ADDR_LOCAL : ATM_ADDR_LECS));
- if (error < 0)
- goto done;
- size = error;
- /* may return 0, but later on size == 0 means "don't
- write the length" */
- error = put_user(size, &sioc->length)
- ? -EFAULT : 0;
+ else
+ error = atm_del_addr(dev, &addr,
+ (cmd == ATM_DELADDR ?
+ ATM_ADDR_LOCAL : ATM_ADDR_LECS));
+ goto done;
+ }
+ case ATM_GETADDR:
+ case ATM_GETLECSADDR:
+ error = atm_get_addr(dev, buf, len,
+ (cmd == ATM_GETADDR ?
+ ATM_ADDR_LOCAL : ATM_ADDR_LECS));
+ if (error < 0)
goto done;
- case ATM_SETLOOP:
- if (__ATM_LM_XTRMT((int) (unsigned long) buf) &&
- __ATM_LM_XTLOC((int) (unsigned long) buf) >
- __ATM_LM_XTRMT((int) (unsigned long) buf)) {
+ size = error;
+ /* may return 0, but later on size == 0 means "don't
+ write the length" */
+ error = put_user(size, sioc_len) ? -EFAULT : 0;
+ goto done;
+ case ATM_SETLOOP:
+ if (__ATM_LM_XTRMT((int) (unsigned long) buf) &&
+ __ATM_LM_XTLOC((int) (unsigned long) buf) >
+ __ATM_LM_XTRMT((int) (unsigned long) buf)) {
+ error = -EINVAL;
+ goto done;
+ }
+ fallthrough;
+ case ATM_SETCIRANGE:
+ case SONET_GETSTATZ:
+ case SONET_SETDIAG:
+ case SONET_CLRDIAG:
+ case SONET_SETFRAMING:
+ if (!capable(CAP_NET_ADMIN)) {
+ error = -EPERM;
+ goto done;
+ }
+ fallthrough;
+ default:
+ if (IS_ENABLED(CONFIG_COMPAT) && compat) {
+#ifdef CONFIG_COMPAT
+ if (!dev->ops->compat_ioctl) {
error = -EINVAL;
goto done;
}
- /* fall through */
- case ATM_SETCIRANGE:
- case SONET_GETSTATZ:
- case SONET_SETDIAG:
- case SONET_CLRDIAG:
- case SONET_SETFRAMING:
- if (!capable(CAP_NET_ADMIN)) {
- error = -EPERM;
- goto done;
- }
- /* fall through */
- default:
+ size = dev->ops->compat_ioctl(dev, cmd, buf);
+#endif
+ } else {
if (!dev->ops->ioctl) {
error = -EINVAL;
goto done;
}
size = dev->ops->ioctl(dev, cmd, buf);
- if (size < 0) {
- error = (size == -ENOIOCTLCMD ? -EINVAL : size);
- goto done;
- }
+ }
+ if (size < 0) {
+ error = (size == -ENOIOCTLCMD ? -ENOTTY : size);
+ goto done;
+ }
}
-
+
if (size)
- error = put_user(size, &sioc->length)
- ? -EFAULT : 0;
+ error = put_user(size, sioc_len) ? -EFAULT : 0;
else
error = 0;
done:
@@ -401,36 +401,20 @@ done:
return error;
}
-static __inline__ void *dev_get_idx(loff_t left)
-{
- struct list_head *p;
-
- list_for_each(p, &atm_devs) {
- if (!--left)
- break;
- }
- return (p != &atm_devs) ? p : NULL;
-}
-
+#ifdef CONFIG_PROC_FS
void *atm_dev_seq_start(struct seq_file *seq, loff_t *pos)
{
- mutex_lock(&atm_dev_mutex);
- return *pos ? dev_get_idx(*pos) : (void *) 1;
+ mutex_lock(&atm_dev_mutex);
+ return seq_list_start_head(&atm_devs, *pos);
}
void atm_dev_seq_stop(struct seq_file *seq, void *v)
{
- mutex_unlock(&atm_dev_mutex);
+ mutex_unlock(&atm_dev_mutex);
}
-
+
void *atm_dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
- ++*pos;
- v = (v == (void *)1) ? atm_devs.next : ((struct list_head *)v)->next;
- return (v == &atm_devs) ? NULL : v;
+ return seq_list_next(v, &atm_devs, pos);
}
-
-
-EXPORT_SYMBOL(atm_dev_register);
-EXPORT_SYMBOL(atm_dev_deregister);
-EXPORT_SYMBOL(atm_dev_lookup);
+#endif
diff --git a/net/atm/resources.h b/net/atm/resources.h
index 1d004aaaeec1..4a0839e92ff3 100644
--- a/net/atm/resources.h
+++ b/net/atm/resources.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/* net/atm/resources.h - ATM-related resources */
/* Written 1995-1998 by Werner Almesberger, EPFL LRC/ICA */
@@ -13,8 +14,9 @@
extern struct list_head atm_devs;
extern struct mutex atm_dev_mutex;
-int atm_dev_ioctl(unsigned int cmd, void __user *arg);
-
+int atm_getnames(void __user *buf, int __user *iobuf_len);
+int atm_dev_ioctl(unsigned int cmd, void __user *buf, int __user *sioc_len,
+ int number, int compat);
#ifdef CONFIG_PROC_FS
@@ -42,6 +44,6 @@ static inline void atm_proc_dev_deregister(struct atm_dev *dev)
#endif /* CONFIG_PROC_FS */
-int atm_register_sysfs(struct atm_dev *adev);
+int atm_register_sysfs(struct atm_dev *adev, struct device *parent);
void atm_unregister_sysfs(struct atm_dev *adev);
#endif
diff --git a/net/atm/signaling.c b/net/atm/signaling.c
index 31d98b57e1de..e70ae2c113f9 100644
--- a/net/atm/signaling.c
+++ b/net/atm/signaling.c
@@ -1,7 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
/* net/atm/signaling.c - ATM signaling */
/* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__
#include <linux/errno.h> /* error codes */
#include <linux/kernel.h> /* printk */
@@ -13,153 +15,123 @@
#include <linux/atmsvc.h>
#include <linux/atmdev.h>
#include <linux/bitops.h>
+#include <linux/slab.h>
#include "resources.h"
#include "signaling.h"
-
-#undef WAIT_FOR_DEMON /* #define this if system calls on SVC sockets
- should block until the demon runs.
- Danger: may cause nasty hangs if the demon
- crashes. */
-
-#if 0
-#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args)
-#else
-#define DPRINTK(format,args...)
-#endif
-
-
struct atm_vcc *sigd = NULL;
-#ifdef WAIT_FOR_DEMON
-static DECLARE_WAIT_QUEUE_HEAD(sigd_sleep);
-#endif
-
static void sigd_put_skb(struct sk_buff *skb)
{
-#ifdef WAIT_FOR_DEMON
- DECLARE_WAITQUEUE(wait,current);
-
- add_wait_queue(&sigd_sleep,&wait);
- while (!sigd) {
- set_current_state(TASK_UNINTERRUPTIBLE);
- DPRINTK("atmsvc: waiting for signaling demon...\n");
- schedule();
- }
- current->state = TASK_RUNNING;
- remove_wait_queue(&sigd_sleep,&wait);
-#else
if (!sigd) {
- DPRINTK("atmsvc: no signaling demon\n");
+ pr_debug("atmsvc: no signaling daemon\n");
kfree_skb(skb);
return;
}
-#endif
- atm_force_charge(sigd,skb->truesize);
- skb_queue_tail(&sk_atm(sigd)->sk_receive_queue,skb);
- sk_atm(sigd)->sk_data_ready(sk_atm(sigd), skb->len);
+ atm_force_charge(sigd, skb->truesize);
+ skb_queue_tail(&sk_atm(sigd)->sk_receive_queue, skb);
+ sk_atm(sigd)->sk_data_ready(sk_atm(sigd));
}
-
-static void modify_qos(struct atm_vcc *vcc,struct atmsvc_msg *msg)
+static void modify_qos(struct atm_vcc *vcc, struct atmsvc_msg *msg)
{
struct sk_buff *skb;
- if (test_bit(ATM_VF_RELEASED,&vcc->flags) ||
- !test_bit(ATM_VF_READY,&vcc->flags))
+ if (test_bit(ATM_VF_RELEASED, &vcc->flags) ||
+ !test_bit(ATM_VF_READY, &vcc->flags))
return;
msg->type = as_error;
- if (!vcc->dev->ops->change_qos) msg->reply = -EOPNOTSUPP;
+ if (!vcc->dev->ops->change_qos)
+ msg->reply = -EOPNOTSUPP;
else {
/* should lock VCC */
- msg->reply = vcc->dev->ops->change_qos(vcc,&msg->qos,
- msg->reply);
- if (!msg->reply) msg->type = as_okay;
+ msg->reply = vcc->dev->ops->change_qos(vcc, &msg->qos,
+ msg->reply);
+ if (!msg->reply)
+ msg->type = as_okay;
}
/*
- * Should probably just turn around the old skb. But the, the buffer
+ * Should probably just turn around the old skb. But then, the buffer
* space accounting needs to follow the change too. Maybe later.
*/
- while (!(skb = alloc_skb(sizeof(struct atmsvc_msg),GFP_KERNEL)))
+ while (!(skb = alloc_skb(sizeof(struct atmsvc_msg), GFP_KERNEL)))
schedule();
- *(struct atmsvc_msg *) skb_put(skb,sizeof(struct atmsvc_msg)) = *msg;
+ *(struct atmsvc_msg *)skb_put(skb, sizeof(struct atmsvc_msg)) = *msg;
sigd_put_skb(skb);
}
-
-static int sigd_send(struct atm_vcc *vcc,struct sk_buff *skb)
+static int sigd_send(struct atm_vcc *vcc, struct sk_buff *skb)
{
struct atmsvc_msg *msg;
struct atm_vcc *session_vcc;
struct sock *sk;
msg = (struct atmsvc_msg *) skb->data;
- atomic_sub(skb->truesize, &sk_atm(vcc)->sk_wmem_alloc);
- DPRINTK("sigd_send %d (0x%lx)\n",(int) msg->type,
- (unsigned long) msg->vcc);
+ WARN_ON(refcount_sub_and_test(skb->truesize, &sk_atm(vcc)->sk_wmem_alloc));
vcc = *(struct atm_vcc **) &msg->vcc;
+ pr_debug("%d (0x%lx)\n", (int)msg->type, (unsigned long)vcc);
sk = sk_atm(vcc);
switch (msg->type) {
- case as_okay:
- sk->sk_err = -msg->reply;
- clear_bit(ATM_VF_WAITING, &vcc->flags);
- if (!*vcc->local.sas_addr.prv &&
- !*vcc->local.sas_addr.pub) {
- vcc->local.sas_family = AF_ATMSVC;
- memcpy(vcc->local.sas_addr.prv,
- msg->local.sas_addr.prv,ATM_ESA_LEN);
- memcpy(vcc->local.sas_addr.pub,
- msg->local.sas_addr.pub,ATM_E164_LEN+1);
- }
- session_vcc = vcc->session ? vcc->session : vcc;
- if (session_vcc->vpi || session_vcc->vci) break;
- session_vcc->itf = msg->pvc.sap_addr.itf;
- session_vcc->vpi = msg->pvc.sap_addr.vpi;
- session_vcc->vci = msg->pvc.sap_addr.vci;
- if (session_vcc->vpi || session_vcc->vci)
- session_vcc->qos = msg->qos;
- break;
- case as_error:
- clear_bit(ATM_VF_REGIS,&vcc->flags);
- clear_bit(ATM_VF_READY,&vcc->flags);
- sk->sk_err = -msg->reply;
- clear_bit(ATM_VF_WAITING, &vcc->flags);
+ case as_okay:
+ sk->sk_err = -msg->reply;
+ clear_bit(ATM_VF_WAITING, &vcc->flags);
+ if (!*vcc->local.sas_addr.prv && !*vcc->local.sas_addr.pub) {
+ vcc->local.sas_family = AF_ATMSVC;
+ memcpy(vcc->local.sas_addr.prv,
+ msg->local.sas_addr.prv, ATM_ESA_LEN);
+ memcpy(vcc->local.sas_addr.pub,
+ msg->local.sas_addr.pub, ATM_E164_LEN + 1);
+ }
+ session_vcc = vcc->session ? vcc->session : vcc;
+ if (session_vcc->vpi || session_vcc->vci)
break;
- case as_indicate:
- vcc = *(struct atm_vcc **) &msg->listen_vcc;
- sk = sk_atm(vcc);
- DPRINTK("as_indicate!!!\n");
- lock_sock(sk);
- if (sk_acceptq_is_full(sk)) {
- sigd_enq(NULL,as_reject,vcc,NULL,NULL);
- dev_kfree_skb(skb);
- goto as_indicate_complete;
- }
- sk->sk_ack_backlog++;
- skb_queue_tail(&sk->sk_receive_queue, skb);
- DPRINTK("waking sk->sk_sleep 0x%p\n", sk->sk_sleep);
- sk->sk_state_change(sk);
+ session_vcc->itf = msg->pvc.sap_addr.itf;
+ session_vcc->vpi = msg->pvc.sap_addr.vpi;
+ session_vcc->vci = msg->pvc.sap_addr.vci;
+ if (session_vcc->vpi || session_vcc->vci)
+ session_vcc->qos = msg->qos;
+ break;
+ case as_error:
+ clear_bit(ATM_VF_REGIS, &vcc->flags);
+ clear_bit(ATM_VF_READY, &vcc->flags);
+ sk->sk_err = -msg->reply;
+ clear_bit(ATM_VF_WAITING, &vcc->flags);
+ break;
+ case as_indicate:
+ vcc = *(struct atm_vcc **)&msg->listen_vcc;
+ sk = sk_atm(vcc);
+ pr_debug("as_indicate!!!\n");
+ lock_sock(sk);
+ if (sk_acceptq_is_full(sk)) {
+ sigd_enq(NULL, as_reject, vcc, NULL, NULL);
+ dev_kfree_skb(skb);
+ goto as_indicate_complete;
+ }
+ sk_acceptq_added(sk);
+ skb_queue_tail(&sk->sk_receive_queue, skb);
+ pr_debug("waking sk_sleep(sk) 0x%p\n", sk_sleep(sk));
+ sk->sk_state_change(sk);
as_indicate_complete:
- release_sock(sk);
- return 0;
- case as_close:
- set_bit(ATM_VF_RELEASED,&vcc->flags);
- vcc_release_async(vcc, msg->reply);
- goto out;
- case as_modify:
- modify_qos(vcc,msg);
- break;
- case as_addparty:
- case as_dropparty:
- sk->sk_err_soft = msg->reply; /* < 0 failure, otherwise ep_ref */
- clear_bit(ATM_VF_WAITING, &vcc->flags);
- break;
- default:
- printk(KERN_ALERT "sigd_send: bad message type %d\n",
- (int) msg->type);
- return -EINVAL;
+ release_sock(sk);
+ return 0;
+ case as_close:
+ set_bit(ATM_VF_RELEASED, &vcc->flags);
+ vcc_release_async(vcc, msg->reply);
+ goto out;
+ case as_modify:
+ modify_qos(vcc, msg);
+ break;
+ case as_addparty:
+ case as_dropparty:
+ WRITE_ONCE(sk->sk_err_soft, -msg->reply);
+ /* < 0 failure, otherwise ep_ref */
+ clear_bit(ATM_VF_WAITING, &vcc->flags);
+ break;
+ default:
+ pr_alert("bad message type %d\n", (int)msg->type);
+ return -EINVAL;
}
sk->sk_state_change(sk);
out:
@@ -167,48 +139,51 @@ out:
return 0;
}
-
-void sigd_enq2(struct atm_vcc *vcc,enum atmsvc_msg_type type,
- struct atm_vcc *listen_vcc,const struct sockaddr_atmpvc *pvc,
- const struct sockaddr_atmsvc *svc,const struct atm_qos *qos,int reply)
+void sigd_enq2(struct atm_vcc *vcc, enum atmsvc_msg_type type,
+ struct atm_vcc *listen_vcc, const struct sockaddr_atmpvc *pvc,
+ const struct sockaddr_atmsvc *svc, const struct atm_qos *qos,
+ int reply)
{
struct sk_buff *skb;
struct atmsvc_msg *msg;
- static unsigned session = 0;
+ static unsigned int session = 0;
- DPRINTK("sigd_enq %d (0x%p)\n",(int) type,vcc);
- while (!(skb = alloc_skb(sizeof(struct atmsvc_msg),GFP_KERNEL)))
+ pr_debug("%d (0x%p)\n", (int)type, vcc);
+ while (!(skb = alloc_skb(sizeof(struct atmsvc_msg), GFP_KERNEL)))
schedule();
- msg = (struct atmsvc_msg *) skb_put(skb,sizeof(struct atmsvc_msg));
- memset(msg,0,sizeof(*msg));
+ msg = skb_put_zero(skb, sizeof(struct atmsvc_msg));
msg->type = type;
*(struct atm_vcc **) &msg->vcc = vcc;
*(struct atm_vcc **) &msg->listen_vcc = listen_vcc;
msg->reply = reply;
- if (qos) msg->qos = *qos;
- if (vcc) msg->sap = vcc->sap;
- if (svc) msg->svc = *svc;
- if (vcc) msg->local = vcc->local;
- if (pvc) msg->pvc = *pvc;
+ if (qos)
+ msg->qos = *qos;
+ if (vcc)
+ msg->sap = vcc->sap;
+ if (svc)
+ msg->svc = *svc;
+ if (vcc)
+ msg->local = vcc->local;
+ if (pvc)
+ msg->pvc = *pvc;
if (vcc) {
if (type == as_connect && test_bit(ATM_VF_SESSION, &vcc->flags))
msg->session = ++session;
/* every new pmp connect gets the next session number */
}
sigd_put_skb(skb);
- if (vcc) set_bit(ATM_VF_REGIS,&vcc->flags);
+ if (vcc)
+ set_bit(ATM_VF_REGIS, &vcc->flags);
}
-
-void sigd_enq(struct atm_vcc *vcc,enum atmsvc_msg_type type,
- struct atm_vcc *listen_vcc,const struct sockaddr_atmpvc *pvc,
- const struct sockaddr_atmsvc *svc)
+void sigd_enq(struct atm_vcc *vcc, enum atmsvc_msg_type type,
+ struct atm_vcc *listen_vcc, const struct sockaddr_atmpvc *pvc,
+ const struct sockaddr_atmsvc *svc)
{
- sigd_enq2(vcc,type,listen_vcc,pvc,svc,vcc ? &vcc->qos : NULL,0);
+ sigd_enq2(vcc, type, listen_vcc, pvc, svc, vcc ? &vcc->qos : NULL, 0);
/* other ISP applications may use "reply" */
}
-
static void purge_vcc(struct atm_vcc *vcc)
{
if (sk_atm(vcc)->sk_family == PF_ATMSVC &&
@@ -219,25 +194,23 @@ static void purge_vcc(struct atm_vcc *vcc)
}
}
-
static void sigd_close(struct atm_vcc *vcc)
{
- struct hlist_node *node;
struct sock *s;
int i;
- DPRINTK("sigd_close\n");
+ pr_debug("\n");
sigd = NULL;
if (skb_peek(&sk_atm(vcc)->sk_receive_queue))
- printk(KERN_ERR "sigd_close: closing with requests pending\n");
+ pr_err("closing with requests pending\n");
skb_queue_purge(&sk_atm(vcc)->sk_receive_queue);
read_lock(&vcc_sklist_lock);
- for(i = 0; i < VCC_HTABLE_SIZE; ++i) {
+ for (i = 0; i < VCC_HTABLE_SIZE; ++i) {
struct hlist_head *head = &vcc_hash[i];
- sk_for_each(s, node, head) {
- struct atm_vcc *vcc = atm_sk(s);
+ sk_for_each(s, head) {
+ vcc = atm_sk(s);
purge_vcc(vcc);
}
@@ -245,32 +218,27 @@ static void sigd_close(struct atm_vcc *vcc)
read_unlock(&vcc_sklist_lock);
}
-
-static struct atmdev_ops sigd_dev_ops = {
+static const struct atmdev_ops sigd_dev_ops = {
.close = sigd_close,
.send = sigd_send
};
-
static struct atm_dev sigd_dev = {
.ops = &sigd_dev_ops,
.type = "sig",
.number = 999,
- .lock = SPIN_LOCK_UNLOCKED
+ .lock = __SPIN_LOCK_UNLOCKED(sigd_dev.lock)
};
-
int sigd_attach(struct atm_vcc *vcc)
{
- if (sigd) return -EADDRINUSE;
- DPRINTK("sigd_attach\n");
+ if (sigd)
+ return -EADDRINUSE;
+ pr_debug("\n");
sigd = vcc;
vcc->dev = &sigd_dev;
vcc_insert_socket(sk_atm(vcc));
- set_bit(ATM_VF_META,&vcc->flags);
- set_bit(ATM_VF_READY,&vcc->flags);
-#ifdef WAIT_FOR_DEMON
- wake_up(&sigd_sleep);
-#endif
+ set_bit(ATM_VF_META, &vcc->flags);
+ set_bit(ATM_VF_READY, &vcc->flags);
return 0;
}
diff --git a/net/atm/signaling.h b/net/atm/signaling.h
index 434ead455714..2df8220f7ab5 100644
--- a/net/atm/signaling.h
+++ b/net/atm/signaling.h
@@ -1,7 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/* net/atm/signaling.h - ATM signaling */
-
+
/* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */
-
+
#ifndef NET_ATM_SIGNALING_H
#define NET_ATM_SIGNALING_H
diff --git a/net/atm/svc.c b/net/atm/svc.c
index 3a180cfd7b48..005964250ecd 100644
--- a/net/atm/svc.c
+++ b/net/atm/svc.c
@@ -1,7 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
/* net/atm/svc.c - ATM SVC sockets */
/* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__
#include <linux/string.h>
#include <linux/net.h> /* struct socket, struct proto_ops */
@@ -9,7 +11,7 @@
#include <linux/kernel.h> /* printk */
#include <linux/skbuff.h>
#include <linux/wait.h>
-#include <linux/sched.h> /* jiffies and HZ */
+#include <linux/sched/signal.h>
#include <linux/fcntl.h> /* O_NONBLOCK */
#include <linux/init.h>
#include <linux/atm.h> /* ATM stuff */
@@ -18,23 +20,21 @@
#include <linux/atmdev.h>
#include <linux/bitops.h>
#include <net/sock.h> /* for sock_no_* */
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
+#include <linux/export.h>
#include "resources.h"
#include "common.h" /* common for PVCs and SVCs */
#include "signaling.h"
#include "addr.h"
-
-#if 0
-#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args)
-#else
-#define DPRINTK(format,args...)
+#ifdef CONFIG_COMPAT
+/* It actually takes struct sockaddr_atmsvc, not struct atm_iobuf */
+#define COMPAT_ATM_ADDPARTY _IOW('a', ATMIOC_SPECIAL + 4, struct compat_atm_iobuf)
#endif
-
-static int svc_create(struct socket *sock,int protocol);
-
+static int svc_create(struct net *net, struct socket *sock, int protocol,
+ int kern);
/*
* Note: since all this is still nicely synchronized with the signaling demon,
@@ -43,62 +43,62 @@ static int svc_create(struct socket *sock,int protocol);
*/
-static int svc_shutdown(struct socket *sock,int how)
+static int svc_shutdown(struct socket *sock, int how)
{
return 0;
}
-
static void svc_disconnect(struct atm_vcc *vcc)
{
DEFINE_WAIT(wait);
struct sk_buff *skb;
struct sock *sk = sk_atm(vcc);
- DPRINTK("svc_disconnect %p\n",vcc);
- if (test_bit(ATM_VF_REGIS,&vcc->flags)) {
- prepare_to_wait(sk->sk_sleep, &wait, TASK_UNINTERRUPTIBLE);
- sigd_enq(vcc,as_close,NULL,NULL,NULL);
- while (!test_bit(ATM_VF_RELEASED,&vcc->flags) && sigd) {
+ pr_debug("%p\n", vcc);
+ if (test_bit(ATM_VF_REGIS, &vcc->flags)) {
+ sigd_enq(vcc, as_close, NULL, NULL, NULL);
+ for (;;) {
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE);
+ if (test_bit(ATM_VF_RELEASED, &vcc->flags) || !sigd)
+ break;
schedule();
- prepare_to_wait(sk->sk_sleep, &wait, TASK_UNINTERRUPTIBLE);
}
- finish_wait(sk->sk_sleep, &wait);
+ finish_wait(sk_sleep(sk), &wait);
}
/* beware - socket is still in use by atmsigd until the last
as_indicate has been answered */
while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
atm_return(vcc, skb->truesize);
- DPRINTK("LISTEN REL\n");
- sigd_enq2(NULL,as_reject,vcc,NULL,NULL,&vcc->qos,0);
+ pr_debug("LISTEN REL\n");
+ sigd_enq2(NULL, as_reject, vcc, NULL, NULL, &vcc->qos, 0);
dev_kfree_skb(skb);
}
clear_bit(ATM_VF_REGIS, &vcc->flags);
/* ... may retry later */
}
-
static int svc_release(struct socket *sock)
{
struct sock *sk = sock->sk;
struct atm_vcc *vcc;
- if (sk) {
+ if (sk) {
vcc = ATM_SD(sock);
- DPRINTK("svc_release %p\n", vcc);
+ pr_debug("%p\n", vcc);
clear_bit(ATM_VF_READY, &vcc->flags);
- /* VCC pointer is used as a reference, so we must not free it
- (thereby subjecting it to re-use) before all pending connections
- are closed */
+ /*
+ * VCC pointer is used as a reference,
+ * so we must not free it (thereby subjecting it to re-use)
+ * before all pending connections are closed
+ */
svc_disconnect(vcc);
vcc_release(sock);
}
return 0;
}
-
-static int svc_bind(struct socket *sock,struct sockaddr *sockaddr,
- int sockaddr_len)
+static int svc_bind(struct socket *sock, struct sockaddr_unsized *sockaddr,
+ int sockaddr_len)
{
DEFINE_WAIT(wait);
struct sock *sk = sock->sk;
@@ -123,38 +123,38 @@ static int svc_bind(struct socket *sock,struct sockaddr *sockaddr,
error = -EAFNOSUPPORT;
goto out;
}
- clear_bit(ATM_VF_BOUND,&vcc->flags);
+ clear_bit(ATM_VF_BOUND, &vcc->flags);
/* failing rebind will kill old binding */
/* @@@ check memory (de)allocation on rebind */
- if (!test_bit(ATM_VF_HASQOS,&vcc->flags)) {
+ if (!test_bit(ATM_VF_HASQOS, &vcc->flags)) {
error = -EBADFD;
goto out;
}
vcc->local = *addr;
set_bit(ATM_VF_WAITING, &vcc->flags);
- prepare_to_wait(sk->sk_sleep, &wait, TASK_UNINTERRUPTIBLE);
- sigd_enq(vcc,as_bind,NULL,NULL,&vcc->local);
- while (test_bit(ATM_VF_WAITING, &vcc->flags) && sigd) {
+ sigd_enq(vcc, as_bind, NULL, NULL, &vcc->local);
+ for (;;) {
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE);
+ if (!test_bit(ATM_VF_WAITING, &vcc->flags) || !sigd)
+ break;
schedule();
- prepare_to_wait(sk->sk_sleep, &wait, TASK_UNINTERRUPTIBLE);
}
- finish_wait(sk->sk_sleep, &wait);
- clear_bit(ATM_VF_REGIS,&vcc->flags); /* doesn't count */
+ finish_wait(sk_sleep(sk), &wait);
+ clear_bit(ATM_VF_REGIS, &vcc->flags); /* doesn't count */
if (!sigd) {
error = -EUNATCH;
goto out;
}
- if (!sk->sk_err)
- set_bit(ATM_VF_BOUND,&vcc->flags);
+ if (!sk->sk_err)
+ set_bit(ATM_VF_BOUND, &vcc->flags);
error = -sk->sk_err;
out:
release_sock(sk);
return error;
}
-
-static int svc_connect(struct socket *sock,struct sockaddr *sockaddr,
- int sockaddr_len,int flags)
+static int svc_connect(struct socket *sock, struct sockaddr_unsized *sockaddr,
+ int sockaddr_len, int flags)
{
DEFINE_WAIT(wait);
struct sock *sk = sock->sk;
@@ -162,7 +162,7 @@ static int svc_connect(struct socket *sock,struct sockaddr *sockaddr,
struct atm_vcc *vcc = ATM_SD(sock);
int error;
- DPRINTK("svc_connect %p\n",vcc);
+ pr_debug("%p\n", vcc);
lock_sock(sk);
if (sockaddr_len != sizeof(struct sockaddr_atmsvc)) {
error = -EINVAL;
@@ -209,27 +209,27 @@ static int svc_connect(struct socket *sock,struct sockaddr *sockaddr,
}
vcc->remote = *addr;
set_bit(ATM_VF_WAITING, &vcc->flags);
- prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
- sigd_enq(vcc,as_connect,NULL,NULL,&vcc->remote);
+ sigd_enq(vcc, as_connect, NULL, NULL, &vcc->remote);
if (flags & O_NONBLOCK) {
- finish_wait(sk->sk_sleep, &wait);
sock->state = SS_CONNECTING;
error = -EINPROGRESS;
goto out;
}
error = 0;
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
while (test_bit(ATM_VF_WAITING, &vcc->flags) && sigd) {
schedule();
if (!signal_pending(current)) {
- prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+ prepare_to_wait(sk_sleep(sk), &wait,
+ TASK_INTERRUPTIBLE);
continue;
}
- DPRINTK("*ABORT*\n");
+ pr_debug("*ABORT*\n");
/*
* This is tricky:
* Kernel ---close--> Demon
* Kernel <--close--- Demon
- * or
+ * or
* Kernel ---close--> Demon
* Kernel <--error--- Demon
* or
@@ -237,25 +237,27 @@ static int svc_connect(struct socket *sock,struct sockaddr *sockaddr,
* Kernel <--okay---- Demon
* Kernel <--close--- Demon
*/
- sigd_enq(vcc,as_close,NULL,NULL,NULL);
+ sigd_enq(vcc, as_close, NULL, NULL, NULL);
while (test_bit(ATM_VF_WAITING, &vcc->flags) && sigd) {
- prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+ prepare_to_wait(sk_sleep(sk), &wait,
+ TASK_INTERRUPTIBLE);
schedule();
}
if (!sk->sk_err)
- while (!test_bit(ATM_VF_RELEASED,&vcc->flags)
- && sigd) {
- prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+ while (!test_bit(ATM_VF_RELEASED, &vcc->flags) &&
+ sigd) {
+ prepare_to_wait(sk_sleep(sk), &wait,
+ TASK_INTERRUPTIBLE);
schedule();
}
- clear_bit(ATM_VF_REGIS,&vcc->flags);
- clear_bit(ATM_VF_RELEASED,&vcc->flags);
- clear_bit(ATM_VF_CLOSE,&vcc->flags);
+ clear_bit(ATM_VF_REGIS, &vcc->flags);
+ clear_bit(ATM_VF_RELEASED, &vcc->flags);
+ clear_bit(ATM_VF_CLOSE, &vcc->flags);
/* we're gone now but may connect later */
error = -EINTR;
break;
}
- finish_wait(sk->sk_sleep, &wait);
+ finish_wait(sk_sleep(sk), &wait);
if (error)
goto out;
if (!sigd) {
@@ -267,55 +269,54 @@ static int svc_connect(struct socket *sock,struct sockaddr *sockaddr,
goto out;
}
}
-/*
- * Not supported yet
- *
- * #ifndef CONFIG_SINGLE_SIGITF
- */
+
vcc->qos.txtp.max_pcr = SELECT_TOP_PCR(vcc->qos.txtp);
vcc->qos.txtp.pcr = 0;
vcc->qos.txtp.min_pcr = 0;
-/*
- * #endif
- */
- if (!(error = vcc_connect(sock, vcc->itf, vcc->vpi, vcc->vci)))
+
+ error = vcc_connect(sock, vcc->itf, vcc->vpi, vcc->vci);
+ if (!error)
sock->state = SS_CONNECTED;
else
- (void) svc_disconnect(vcc);
+ (void)svc_disconnect(vcc);
out:
release_sock(sk);
return error;
}
-
-static int svc_listen(struct socket *sock,int backlog)
+static int svc_listen(struct socket *sock, int backlog)
{
DEFINE_WAIT(wait);
struct sock *sk = sock->sk;
struct atm_vcc *vcc = ATM_SD(sock);
int error;
- DPRINTK("svc_listen %p\n",vcc);
+ pr_debug("%p\n", vcc);
lock_sock(sk);
/* let server handle listen on unbound sockets */
- if (test_bit(ATM_VF_SESSION,&vcc->flags)) {
+ if (test_bit(ATM_VF_SESSION, &vcc->flags)) {
error = -EINVAL;
goto out;
}
- vcc_insert_socket(sk);
+ if (test_bit(ATM_VF_LISTEN, &vcc->flags)) {
+ error = -EADDRINUSE;
+ goto out;
+ }
set_bit(ATM_VF_WAITING, &vcc->flags);
- prepare_to_wait(sk->sk_sleep, &wait, TASK_UNINTERRUPTIBLE);
- sigd_enq(vcc,as_listen,NULL,NULL,&vcc->local);
- while (test_bit(ATM_VF_WAITING, &vcc->flags) && sigd) {
+ sigd_enq(vcc, as_listen, NULL, NULL, &vcc->local);
+ for (;;) {
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE);
+ if (!test_bit(ATM_VF_WAITING, &vcc->flags) || !sigd)
+ break;
schedule();
- prepare_to_wait(sk->sk_sleep, &wait, TASK_UNINTERRUPTIBLE);
}
- finish_wait(sk->sk_sleep, &wait);
+ finish_wait(sk_sleep(sk), &wait);
if (!sigd) {
error = -EUNATCH;
goto out;
}
- set_bit(ATM_VF_LISTEN,&vcc->flags);
+ set_bit(ATM_VF_LISTEN, &vcc->flags);
+ vcc_insert_socket(sk);
sk->sk_max_ack_backlog = backlog > 0 ? backlog : ATM_BACKLOG_DEFAULT;
error = -sk->sk_err;
out:
@@ -323,8 +324,8 @@ out:
return error;
}
-
-static int svc_accept(struct socket *sock,struct socket *newsock,int flags)
+static int svc_accept(struct socket *sock, struct socket *newsock,
+ struct proto_accept_arg *arg)
{
struct sock *sk = sock->sk;
struct sk_buff *skb;
@@ -335,25 +336,26 @@ static int svc_accept(struct socket *sock,struct socket *newsock,int flags)
lock_sock(sk);
- error = svc_create(newsock,0);
+ error = svc_create(sock_net(sk), newsock, 0, arg->kern);
if (error)
goto out;
new_vcc = ATM_SD(newsock);
- DPRINTK("svc_accept %p -> %p\n",old_vcc,new_vcc);
+ pr_debug("%p -> %p\n", old_vcc, new_vcc);
while (1) {
DEFINE_WAIT(wait);
- prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
while (!(skb = skb_dequeue(&sk->sk_receive_queue)) &&
sigd) {
- if (test_bit(ATM_VF_RELEASED,&old_vcc->flags)) break;
- if (test_bit(ATM_VF_CLOSE,&old_vcc->flags)) {
+ if (test_bit(ATM_VF_RELEASED, &old_vcc->flags))
+ break;
+ if (test_bit(ATM_VF_CLOSE, &old_vcc->flags)) {
error = -sk->sk_err;
break;
}
- if (flags & O_NONBLOCK) {
+ if (arg->flags & O_NONBLOCK) {
error = -EAGAIN;
break;
}
@@ -364,42 +366,46 @@ static int svc_accept(struct socket *sock,struct socket *newsock,int flags)
error = -ERESTARTSYS;
break;
}
- prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+ prepare_to_wait(sk_sleep(sk), &wait,
+ TASK_INTERRUPTIBLE);
}
- finish_wait(sk->sk_sleep, &wait);
+ finish_wait(sk_sleep(sk), &wait);
if (error)
goto out;
if (!skb) {
error = -EUNATCH;
goto out;
}
- msg = (struct atmsvc_msg *) skb->data;
+ msg = (struct atmsvc_msg *)skb->data;
new_vcc->qos = msg->qos;
- set_bit(ATM_VF_HASQOS,&new_vcc->flags);
+ set_bit(ATM_VF_HASQOS, &new_vcc->flags);
new_vcc->remote = msg->svc;
new_vcc->local = msg->local;
new_vcc->sap = msg->sap;
error = vcc_connect(newsock, msg->pvc.sap_addr.itf,
- msg->pvc.sap_addr.vpi, msg->pvc.sap_addr.vci);
+ msg->pvc.sap_addr.vpi,
+ msg->pvc.sap_addr.vci);
dev_kfree_skb(skb);
- sk->sk_ack_backlog--;
+ sk_acceptq_removed(sk);
if (error) {
- sigd_enq2(NULL,as_reject,old_vcc,NULL,NULL,
- &old_vcc->qos,error);
+ sigd_enq2(NULL, as_reject, old_vcc, NULL, NULL,
+ &old_vcc->qos, error);
error = error == -EAGAIN ? -EBUSY : error;
goto out;
}
/* wait should be short, so we ignore the non-blocking flag */
set_bit(ATM_VF_WAITING, &new_vcc->flags);
- prepare_to_wait(sk_atm(new_vcc)->sk_sleep, &wait, TASK_UNINTERRUPTIBLE);
- sigd_enq(new_vcc,as_accept,old_vcc,NULL,NULL);
- while (test_bit(ATM_VF_WAITING, &new_vcc->flags) && sigd) {
+ sigd_enq(new_vcc, as_accept, old_vcc, NULL, NULL);
+ for (;;) {
+ prepare_to_wait(sk_sleep(sk_atm(new_vcc)), &wait,
+ TASK_UNINTERRUPTIBLE);
+ if (!test_bit(ATM_VF_WAITING, &new_vcc->flags) || !sigd)
+ break;
release_sock(sk);
schedule();
lock_sock(sk);
- prepare_to_wait(sk_atm(new_vcc)->sk_sleep, &wait, TASK_UNINTERRUPTIBLE);
}
- finish_wait(sk_atm(new_vcc)->sk_sleep, &wait);
+ finish_wait(sk_sleep(sk_atm(new_vcc)), &wait);
if (!sigd) {
error = -EUNATCH;
goto out;
@@ -417,41 +423,40 @@ out:
return error;
}
-
-static int svc_getname(struct socket *sock,struct sockaddr *sockaddr,
- int *sockaddr_len,int peer)
+static int svc_getname(struct socket *sock, struct sockaddr *sockaddr,
+ int peer)
{
struct sockaddr_atmsvc *addr;
- *sockaddr_len = sizeof(struct sockaddr_atmsvc);
addr = (struct sockaddr_atmsvc *) sockaddr;
- memcpy(addr,peer ? &ATM_SD(sock)->remote : &ATM_SD(sock)->local,
- sizeof(struct sockaddr_atmsvc));
- return 0;
+ memcpy(addr, peer ? &ATM_SD(sock)->remote : &ATM_SD(sock)->local,
+ sizeof(struct sockaddr_atmsvc));
+ return sizeof(struct sockaddr_atmsvc);
}
-
-int svc_change_qos(struct atm_vcc *vcc,struct atm_qos *qos)
+int svc_change_qos(struct atm_vcc *vcc, struct atm_qos *qos)
{
struct sock *sk = sk_atm(vcc);
DEFINE_WAIT(wait);
set_bit(ATM_VF_WAITING, &vcc->flags);
- prepare_to_wait(sk->sk_sleep, &wait, TASK_UNINTERRUPTIBLE);
- sigd_enq2(vcc,as_modify,NULL,NULL,&vcc->local,qos,0);
- while (test_bit(ATM_VF_WAITING, &vcc->flags) &&
- !test_bit(ATM_VF_RELEASED, &vcc->flags) && sigd) {
+ sigd_enq2(vcc, as_modify, NULL, NULL, &vcc->local, qos, 0);
+ for (;;) {
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE);
+ if (!test_bit(ATM_VF_WAITING, &vcc->flags) ||
+ test_bit(ATM_VF_RELEASED, &vcc->flags) || !sigd) {
+ break;
+ }
schedule();
- prepare_to_wait(sk->sk_sleep, &wait, TASK_UNINTERRUPTIBLE);
}
- finish_wait(sk->sk_sleep, &wait);
- if (!sigd) return -EUNATCH;
+ finish_wait(sk_sleep(sk), &wait);
+ if (!sigd)
+ return -EUNATCH;
return -sk->sk_err;
}
-
static int svc_setsockopt(struct socket *sock, int level, int optname,
- char __user *optval, int optlen)
+ sockptr_t optval, unsigned int optlen)
{
struct sock *sk = sock->sk;
struct atm_vcc *vcc = ATM_SD(sock);
@@ -459,37 +464,35 @@ static int svc_setsockopt(struct socket *sock, int level, int optname,
lock_sock(sk);
switch (optname) {
- case SO_ATMSAP:
- if (level != SOL_ATM || optlen != sizeof(struct atm_sap)) {
- error = -EINVAL;
- goto out;
- }
- if (copy_from_user(&vcc->sap, optval, optlen)) {
- error = -EFAULT;
- goto out;
- }
- set_bit(ATM_VF_HASSAP, &vcc->flags);
- break;
- case SO_MULTIPOINT:
- if (level != SOL_ATM || optlen != sizeof(int)) {
- error = -EINVAL;
- goto out;
- }
- if (get_user(value, (int __user *) optval)) {
- error = -EFAULT;
- goto out;
- }
- if (value == 1) {
- set_bit(ATM_VF_SESSION, &vcc->flags);
- } else if (value == 0) {
- clear_bit(ATM_VF_SESSION, &vcc->flags);
- } else {
- error = -EINVAL;
- }
- break;
- default:
- error = vcc_setsockopt(sock, level, optname,
- optval, optlen);
+ case SO_ATMSAP:
+ if (level != SOL_ATM || optlen != sizeof(struct atm_sap)) {
+ error = -EINVAL;
+ goto out;
+ }
+ if (copy_from_sockptr(&vcc->sap, optval, optlen)) {
+ error = -EFAULT;
+ goto out;
+ }
+ set_bit(ATM_VF_HASSAP, &vcc->flags);
+ break;
+ case SO_MULTIPOINT:
+ if (level != SOL_ATM || optlen != sizeof(int)) {
+ error = -EINVAL;
+ goto out;
+ }
+ if (copy_from_sockptr(&value, optval, sizeof(int))) {
+ error = -EFAULT;
+ goto out;
+ }
+ if (value == 1)
+ set_bit(ATM_VF_SESSION, &vcc->flags);
+ else if (value == 0)
+ clear_bit(ATM_VF_SESSION, &vcc->flags);
+ else
+ error = -EINVAL;
+ break;
+ default:
+ error = vcc_setsockopt(sock, level, optname, optval, optlen);
}
out:
@@ -497,9 +500,8 @@ out:
return error;
}
-
-static int svc_getsockopt(struct socket *sock,int level,int optname,
- char __user *optval,int __user *optlen)
+static int svc_getsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int __user *optlen)
{
struct sock *sk = sock->sk;
int error = 0, len;
@@ -526,7 +528,6 @@ out:
return error;
}
-
static int svc_addparty(struct socket *sock, struct sockaddr *sockaddr,
int sockaddr_len, int flags)
{
@@ -537,27 +538,26 @@ static int svc_addparty(struct socket *sock, struct sockaddr *sockaddr,
lock_sock(sk);
set_bit(ATM_VF_WAITING, &vcc->flags);
- prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
sigd_enq(vcc, as_addparty, NULL, NULL,
- (struct sockaddr_atmsvc *) sockaddr);
+ (struct sockaddr_atmsvc *) sockaddr);
if (flags & O_NONBLOCK) {
- finish_wait(sk->sk_sleep, &wait);
error = -EINPROGRESS;
goto out;
}
- DPRINTK("svc_addparty added wait queue\n");
- while (test_bit(ATM_VF_WAITING, &vcc->flags) && sigd) {
+ pr_debug("added wait queue\n");
+ for (;;) {
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ if (!test_bit(ATM_VF_WAITING, &vcc->flags) || !sigd)
+ break;
schedule();
- prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
}
- finish_wait(sk->sk_sleep, &wait);
- error = xchg(&sk->sk_err_soft, 0);
+ finish_wait(sk_sleep(sk), &wait);
+ error = -xchg(&sk->sk_err_soft, 0);
out:
release_sock(sk);
return error;
}
-
static int svc_dropparty(struct socket *sock, int ep_ref)
{
DEFINE_WAIT(wait);
@@ -567,52 +567,70 @@ static int svc_dropparty(struct socket *sock, int ep_ref)
lock_sock(sk);
set_bit(ATM_VF_WAITING, &vcc->flags);
- prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
sigd_enq2(vcc, as_dropparty, NULL, NULL, NULL, NULL, ep_ref);
- while (test_bit(ATM_VF_WAITING, &vcc->flags) && sigd) {
+ for (;;) {
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ if (!test_bit(ATM_VF_WAITING, &vcc->flags) || !sigd)
+ break;
schedule();
- prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
}
- finish_wait(sk->sk_sleep, &wait);
+ finish_wait(sk_sleep(sk), &wait);
if (!sigd) {
error = -EUNATCH;
goto out;
}
- error = xchg(&sk->sk_err_soft, 0);
+ error = -xchg(&sk->sk_err_soft, 0);
out:
release_sock(sk);
return error;
}
-
static int svc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
- int error, ep_ref;
- struct sockaddr_atmsvc sa;
+ int error, ep_ref;
+ struct sockaddr_atmsvc sa;
struct atm_vcc *vcc = ATM_SD(sock);
-
+
switch (cmd) {
- case ATM_ADDPARTY:
- if (!test_bit(ATM_VF_SESSION, &vcc->flags))
- return -EINVAL;
- if (copy_from_user(&sa, (void __user *) arg, sizeof(sa)))
- return -EFAULT;
- error = svc_addparty(sock, (struct sockaddr *) &sa, sizeof(sa), 0);
- break;
- case ATM_DROPPARTY:
- if (!test_bit(ATM_VF_SESSION, &vcc->flags))
- return -EINVAL;
- if (copy_from_user(&ep_ref, (void __user *) arg, sizeof(int)))
- return -EFAULT;
- error = svc_dropparty(sock, ep_ref);
- break;
- default:
- error = vcc_ioctl(sock, cmd, arg);
+ case ATM_ADDPARTY:
+ if (!test_bit(ATM_VF_SESSION, &vcc->flags))
+ return -EINVAL;
+ if (copy_from_user(&sa, (void __user *) arg, sizeof(sa)))
+ return -EFAULT;
+ error = svc_addparty(sock, (struct sockaddr *)&sa, sizeof(sa),
+ 0);
+ break;
+ case ATM_DROPPARTY:
+ if (!test_bit(ATM_VF_SESSION, &vcc->flags))
+ return -EINVAL;
+ if (copy_from_user(&ep_ref, (void __user *) arg, sizeof(int)))
+ return -EFAULT;
+ error = svc_dropparty(sock, ep_ref);
+ break;
+ default:
+ error = vcc_ioctl(sock, cmd, arg);
}
return error;
}
+#ifdef CONFIG_COMPAT
+static int svc_compat_ioctl(struct socket *sock, unsigned int cmd,
+ unsigned long arg)
+{
+ /* The definition of ATM_ADDPARTY uses the size of struct atm_iobuf.
+ But actually it takes a struct sockaddr_atmsvc, which doesn't need
+ compat handling. So all we have to do is fix up cmd... */
+ if (cmd == COMPAT_ATM_ADDPARTY)
+ cmd = ATM_ADDPARTY;
+
+ if (cmd == ATM_ADDPARTY || cmd == ATM_DROPPARTY)
+ return svc_ioctl(sock, cmd, arg);
+ else
+ return vcc_compat_ioctl(sock, cmd, arg);
+}
+#endif /* CONFIG_COMPAT */
+
static const struct proto_ops svc_proto_ops = {
.family = PF_ATMSVC,
.owner = THIS_MODULE,
@@ -625,6 +643,10 @@ static const struct proto_ops svc_proto_ops = {
.getname = svc_getname,
.poll = vcc_poll,
.ioctl = svc_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = svc_compat_ioctl,
+#endif
+ .gettstamp = sock_gettstamp,
.listen = svc_listen,
.shutdown = svc_shutdown,
.setsockopt = svc_setsockopt,
@@ -632,24 +654,27 @@ static const struct proto_ops svc_proto_ops = {
.sendmsg = vcc_sendmsg,
.recvmsg = vcc_recvmsg,
.mmap = sock_no_mmap,
- .sendpage = sock_no_sendpage,
};
-static int svc_create(struct socket *sock,int protocol)
+static int svc_create(struct net *net, struct socket *sock, int protocol,
+ int kern)
{
int error;
+ if (!net_eq(net, &init_net))
+ return -EAFNOSUPPORT;
+
sock->ops = &svc_proto_ops;
- error = vcc_create(sock, protocol, AF_ATMSVC);
- if (error) return error;
+ error = vcc_create(net, sock, protocol, AF_ATMSVC, kern);
+ if (error)
+ return error;
ATM_SD(sock)->local.sas_family = AF_ATMSVC;
ATM_SD(sock)->remote.sas_family = AF_ATMSVC;
return 0;
}
-
-static struct net_proto_family svc_family_ops = {
+static const struct net_proto_family svc_family_ops = {
.family = PF_ATMSVC,
.create = svc_create,
.owner = THIS_MODULE,
diff --git a/net/ax25/Kconfig b/net/ax25/Kconfig
index a8993a041724..e23a3dc14b93 100644
--- a/net/ax25/Kconfig
+++ b/net/ax25/Kconfig
@@ -1,30 +1,28 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# Amateur Radio protocols and AX.25 device configuration
#
-# 19971130 Now in an own category to make correct compilation of the
-# AX.25 stuff easier...
-# Joerg Reuter DL1BKE <jreuter@yaina.de>
-# 19980129 Moved to net/ax25/Config.in, sourcing device drivers.
menuconfig HAMRADIO
depends on NET
bool "Amateur Radio support"
help
If you want to connect your Linux box to an amateur radio, answer Y
- here. You want to read <http://www.tapr.org/tapr/html/pkthome.html> and
- the AX25-HOWTO, available from <http://www.tldp.org/docs.html#howto>.
+ here. You want to read <https://www.tapr.org/>
+ and more specifically about AX.25 on Linux
+ <https://linux-ax25.in-berlin.de>.
Note that the answer to this question won't directly affect the
kernel: saying N will just cause the configurator to skip all
the questions about amateur radio.
comment "Packet Radio protocols"
- depends on HAMRADIO && NET
+ depends on HAMRADIO
config AX25
tristate "Amateur Radio AX.25 Level 2 protocol"
- depends on HAMRADIO && NET
- ---help---
+ depends on HAMRADIO
+ help
This is the protocol used for computer communication over amateur
radio. It is either used by itself for point-to-point links, or to
carry other protocols such as tcp/ip. To use it, you need a device
@@ -41,17 +39,18 @@ config AX25
Information about where to get supporting software for Linux amateur
radio as well as information about how to configure an AX.25 port is
contained in the AX25-HOWTO, available from
- <http://www.tldp.org/docs.html#howto>. You might also want to
- check out the file <file:Documentation/networking/ax25.txt> in the
+ <https://www.tldp.org/docs.html#howto>. You might also want to
+ check out the file <file:Documentation/networking/ax25.rst> in the
kernel source. More information about digital amateur radio in
general is on the WWW at
- <http://www.tapr.org/tapr/html/pkthome.html>.
+ <https://www.tapr.org/>.
To compile this driver as a module, choose M here: the
module will be called ax25.
config AX25_DAMA_SLAVE
bool "AX.25 DAMA Slave support"
+ default y
depends on AX25
help
DAMA is a mechanism to prevent collisions when doing AX.25
@@ -59,24 +58,39 @@ config AX25_DAMA_SLAVE
from clients (called "slaves") and redistributes it to other slaves.
If you say Y here, your Linux box will act as a DAMA slave; this is
transparent in that you don't have to do any special DAMA
- configuration. (Linux cannot yet act as a DAMA server.) If unsure,
- say N.
+ configuration. Linux cannot yet act as a DAMA server. This option
+ only compiles DAMA slave support into the kernel. It still needs to
+ be enabled at runtime. For more about DAMA see
+ <https://linux-ax25.in-berlin.de>. If unsure, say Y.
+
+# placeholder until implemented
+config AX25_DAMA_MASTER
+ bool 'AX.25 DAMA Master support'
+ depends on AX25_DAMA_SLAVE && BROKEN
+ help
+ DAMA is a mechanism to prevent collisions when doing AX.25
+ networking. A DAMA server (called "master") accepts incoming traffic
+ from clients (called "slaves") and redistributes it to other slaves.
+ If you say Y here, your Linux box will act as a DAMA master; this is
+ transparent in that you don't have to do any special DAMA
+ configuration. Linux cannot yet act as a DAMA server. This option
+ only compiles DAMA slave support into the kernel. It still needs to
+ be explicitly enabled, so if unsure, say Y.
-# bool ' AX.25 DAMA Master support' CONFIG_AX25_DAMA_MASTER
config NETROM
tristate "Amateur Radio NET/ROM protocol"
depends on AX25
- ---help---
+ help
NET/ROM is a network layer protocol on top of AX.25 useful for
routing.
A comprehensive listing of all the software for Linux amateur radio
users as well as information about how to configure an AX.25 port is
- contained in the AX25-HOWTO, available from
- <http://www.tldp.org/docs.html#howto>. You also might want to
- check out the file <file:Documentation/networking/ax25.txt>. More
- information about digital amateur radio in general is on the WWW at
- <http://www.tapr.org/tapr/html/pkthome.html>.
+ contained in the Linux Ham Wiki, available from
+ <https://linux-ax25.in-berlin.de>. You also might want to check out
+ the file <file:Documentation/networking/ax25.rst>. More information
+ about digital amateur radio in general is on the WWW at
+ <https://www.tapr.org/>.
To compile this driver as a module, choose M here: the
module will be called netrom.
@@ -84,27 +98,25 @@ config NETROM
config ROSE
tristate "Amateur Radio X.25 PLP (Rose)"
depends on AX25
- ---help---
+ help
The Packet Layer Protocol (PLP) is a way to route packets over X.25
connections in general and amateur radio AX.25 connections in
particular, essentially an alternative to NET/ROM.
A comprehensive listing of all the software for Linux amateur radio
users as well as information about how to configure an AX.25 port is
- contained in the AX25-HOWTO, available from
- <http://www.tldp.org/docs.html#howto>. You also might want to
- check out the file <file:Documentation/networking/ax25.txt>. More
- information about digital amateur radio in general is on the WWW at
- <http://www.tapr.org/tapr/html/pkthome.html>.
+ contained in the Linux Ham Wiki, available from
+ <https://linux-ax25.in-berlin.de>. You also might want to check out
+ the file <file:Documentation/networking/ax25.rst>. More information
+ about digital amateur radio in general is on the WWW at
+ <https://www.tapr.org/>.
To compile this driver as a module, choose M here: the
module will be called rose.
-
menu "AX.25 network device drivers"
- depends on HAMRADIO && NET && AX25!=n
+ depends on HAMRADIO && AX25
source "drivers/net/hamradio/Kconfig"
endmenu
-
diff --git a/net/ax25/Makefile b/net/ax25/Makefile
index 43c46d2cafb6..2e53affc8568 100644
--- a/net/ax25/Makefile
+++ b/net/ax25/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
#
# Makefile for the Linux AX.25 layer.
#
diff --git a/net/ax25/TODO b/net/ax25/TODO
deleted file mode 100644
index 4089c49e45cc..000000000000
--- a/net/ax25/TODO
+++ /dev/null
@@ -1,24 +0,0 @@
-Do the ax25_list_lock, ax25_dev_lock, linkfail_lockreally, ax25_frag_lock and
-listen_lock have to be bh-safe?
-
-Do the netrom and rose locks have to be bh-safe?
-
-A device might be deleted after lookup in the SIOCADDRT ioctl but before it's
-being used.
-
-Routes to a device being taken down might be deleted by ax25_rt_device_down
-but added by somebody else before the device has been deleted fully.
-
-Massive amounts of lock_kernel / unlock_kernel are just a temporary solution to
-get around the removal of SOCKOPS_WRAP. A serious locking strategy has to be
-implemented.
-
-The ax25_rt_find_route synopsys is pervert but I somehow had to deal with
-the race caused by the static variable in it's previous implementation.
-
-Implement proper socket locking in netrom and rose.
-
-Check socket locking when ax25_rcv is sending to raw sockets. In particular
-ax25_send_to_raw() seems fishy. Heck - ax25_rcv is fishy.
-
-Handle XID and TEST frames properly.
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index 6cabf6d8a751..7ebbff2f0020 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -1,8 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*
* Copyright (C) Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk)
* Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
@@ -20,20 +17,19 @@
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
#include <linux/timer.h>
#include <linux/string.h>
-#include <linux/smp_lock.h>
#include <linux/sockios.h>
#include <linux/net.h>
+#include <linux/slab.h>
#include <net/ax25.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/skbuff.h>
#include <net/sock.h>
-#include <asm/uaccess.h>
-#include <asm/system.h>
+#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/termios.h> /* For TIOCINQ/OUTQ */
#include <linux/mm.h>
@@ -41,10 +37,10 @@
#include <linux/notifier.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
-#include <linux/netfilter.h>
#include <linux/sysctl.h>
#include <linux/init.h>
#include <linux/spinlock.h>
+#include <net/net_namespace.h>
#include <net/tcp_states.h>
#include <net/ip.h>
#include <net/arp.h>
@@ -58,7 +54,7 @@ static const struct proto_ops ax25_proto_ops;
static void ax25_free_sock(struct sock *sk)
{
- ax25_cb_put(ax25_sk(sk));
+ ax25_cb_put(sk_to_ax25(sk));
}
/*
@@ -66,12 +62,12 @@ static void ax25_free_sock(struct sock *sk)
*/
static void ax25_cb_del(ax25_cb *ax25)
{
+ spin_lock_bh(&ax25_list_lock);
if (!hlist_unhashed(&ax25->ax25_node)) {
- spin_lock_bh(&ax25_list_lock);
hlist_del_init(&ax25->ax25_node);
- spin_unlock_bh(&ax25_list_lock);
ax25_cb_put(ax25);
}
+ spin_unlock_bh(&ax25_list_lock);
}
/*
@@ -81,16 +77,47 @@ static void ax25_kill_by_device(struct net_device *dev)
{
ax25_dev *ax25_dev;
ax25_cb *s;
- struct hlist_node *node;
+ struct sock *sk;
if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL)
return;
+ ax25_dev->device_up = false;
spin_lock_bh(&ax25_list_lock);
- ax25_for_each(s, node, &ax25_list) {
+again:
+ ax25_for_each(s, &ax25_list) {
if (s->ax25_dev == ax25_dev) {
- s->ax25_dev = NULL;
+ sk = s->sk;
+ if (!sk) {
+ spin_unlock_bh(&ax25_list_lock);
+ ax25_disconnect(s, ENETUNREACH);
+ s->ax25_dev = NULL;
+ ax25_cb_del(s);
+ spin_lock_bh(&ax25_list_lock);
+ goto again;
+ }
+ sock_hold(sk);
+ spin_unlock_bh(&ax25_list_lock);
+ lock_sock(sk);
ax25_disconnect(s, ENETUNREACH);
+ s->ax25_dev = NULL;
+ if (sk->sk_socket) {
+ netdev_put(ax25_dev->dev,
+ &s->dev_tracker);
+ ax25_dev_put(ax25_dev);
+ }
+ ax25_cb_del(s);
+ release_sock(sk);
+ spin_lock_bh(&ax25_list_lock);
+ sock_put(sk);
+ /* The entry could have been deleted from the
+ * list meanwhile and thus the next pointer is
+ * no longer valid. Play it safe and restart
+ * the scan. Forward progress is ensured
+ * because we set s->ax25_dev to NULL and we
+ * are never passed a NULL 'dev' argument.
+ */
+ goto again;
}
}
spin_unlock_bh(&ax25_list_lock);
@@ -100,9 +127,12 @@ static void ax25_kill_by_device(struct net_device *dev)
* Handle device status changes.
*/
static int ax25_device_event(struct notifier_block *this, unsigned long event,
- void *ptr)
+ void *ptr)
{
- struct net_device *dev = (struct net_device *)ptr;
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+
+ if (!net_eq(dev_net(dev), &init_net))
+ return NOTIFY_DONE;
/* Reject non AX.25 devices */
if (dev->type != ARPHRD_AX25)
@@ -143,10 +173,9 @@ struct sock *ax25_find_listener(ax25_address *addr, int digi,
struct net_device *dev, int type)
{
ax25_cb *s;
- struct hlist_node *node;
spin_lock(&ax25_list_lock);
- ax25_for_each(s, node, &ax25_list) {
+ ax25_for_each(s, &ax25_list) {
if ((s->iamdigi && !digi) || (!s->iamdigi && digi))
continue;
if (s->sk && !ax25cmp(&s->source_addr, addr) &&
@@ -172,10 +201,9 @@ struct sock *ax25_get_socket(ax25_address *my_addr, ax25_address *dest_addr,
{
struct sock *sk = NULL;
ax25_cb *s;
- struct hlist_node *node;
spin_lock(&ax25_list_lock);
- ax25_for_each(s, node, &ax25_list) {
+ ax25_for_each(s, &ax25_list) {
if (s->sk && !ax25cmp(&s->source_addr, my_addr) &&
!ax25cmp(&s->dest_addr, dest_addr) &&
s->sk->sk_type == type) {
@@ -194,14 +222,13 @@ struct sock *ax25_get_socket(ax25_address *my_addr, ax25_address *dest_addr,
* Find an AX.25 control block given both ends. It will only pick up
* floating AX.25 control blocks or non Raw socket bound control blocks.
*/
-ax25_cb *ax25_find_cb(ax25_address *src_addr, ax25_address *dest_addr,
+ax25_cb *ax25_find_cb(const ax25_address *src_addr, ax25_address *dest_addr,
ax25_digi *digi, struct net_device *dev)
{
ax25_cb *s;
- struct hlist_node *node;
spin_lock_bh(&ax25_list_lock);
- ax25_for_each(s, node, &ax25_list) {
+ ax25_for_each(s, &ax25_list) {
if (s->sk && s->sk->sk_type != SOCK_SEQPACKET)
continue;
if (s->ax25_dev == NULL)
@@ -233,10 +260,9 @@ void ax25_send_to_raw(ax25_address *addr, struct sk_buff *skb, int proto)
{
ax25_cb *s;
struct sk_buff *copy;
- struct hlist_node *node;
spin_lock(&ax25_list_lock);
- ax25_for_each(s, node, &ax25_list) {
+ ax25_for_each(s, &ax25_list) {
if (s->sk != NULL && ax25cmp(&s->source_addr, addr) == 0 &&
s->sk->sk_type == SOCK_RAW &&
s->sk->sk_protocol == proto &&
@@ -259,13 +285,13 @@ void ax25_destroy_socket(ax25_cb *);
/*
* Handler for deferred kills.
*/
-static void ax25_destroy_timer(unsigned long data)
+static void ax25_destroy_timer(struct timer_list *t)
{
- ax25_cb *ax25=(ax25_cb *)data;
+ ax25_cb *ax25 = timer_container_of(ax25, t, dtimer);
struct sock *sk;
-
+
sk=ax25->sk;
-
+
bh_lock_sock(sk);
sock_hold(sk);
ax25_destroy_socket(ax25);
@@ -297,11 +323,14 @@ void ax25_destroy_socket(ax25_cb *ax25)
while ((skb = skb_dequeue(&ax25->sk->sk_receive_queue)) != NULL) {
if (skb->sk != ax25->sk) {
/* A pending connection */
- ax25_cb *sax25 = ax25_sk(skb->sk);
+ ax25_cb *sax25 = sk_to_ax25(skb->sk);
/* Queue the unaccepted socket for death */
sock_orphan(skb->sk);
+ /* 9A4GL: hack to release unaccepted sockets */
+ skb->sk->sk_state = TCP_LISTEN;
+
ax25_start_heartbeat(sax25);
sax25->state = AX25_STATE_0;
}
@@ -312,13 +341,10 @@ void ax25_destroy_socket(ax25_cb *ax25)
}
if (ax25->sk != NULL) {
- if (atomic_read(&ax25->sk->sk_wmem_alloc) ||
- atomic_read(&ax25->sk->sk_rmem_alloc)) {
+ if (sk_has_allocations(ax25->sk)) {
/* Defer: outstanding buffers */
- init_timer(&ax25->dtimer);
+ timer_setup(&ax25->dtimer, ax25_destroy_timer, 0);
ax25->dtimer.expires = jiffies + 2 * HZ;
- ax25->dtimer.function = ax25_destroy_timer;
- ax25->dtimer.data = (unsigned long)ax25;
add_timer(&ax25->dtimer);
} else {
struct sock *sk=ax25->sk;
@@ -342,22 +368,30 @@ static int ax25_ctl_ioctl(const unsigned int cmd, void __user *arg)
ax25_dev *ax25_dev;
ax25_cb *ax25;
unsigned int k;
+ int ret = 0;
if (copy_from_user(&ax25_ctl, arg, sizeof(ax25_ctl)))
return -EFAULT;
- if ((ax25_dev = ax25_addr_ax25dev(&ax25_ctl.port_addr)) == NULL)
- return -ENODEV;
-
if (ax25_ctl.digi_count > AX25_MAX_DIGIS)
return -EINVAL;
+ if (ax25_ctl.arg > ULONG_MAX / HZ && ax25_ctl.cmd != AX25_KILL)
+ return -EINVAL;
+
+ ax25_dev = ax25_addr_ax25dev(&ax25_ctl.port_addr);
+ if (!ax25_dev)
+ return -ENODEV;
+
digi.ndigi = ax25_ctl.digi_count;
for (k = 0; k < digi.ndigi; k++)
digi.calls[k] = ax25_ctl.digi_addr[k];
- if ((ax25 = ax25_find_cb(&ax25_ctl.source_addr, &ax25_ctl.dest_addr, &digi, ax25_dev->dev)) == NULL)
+ ax25 = ax25_find_cb(&ax25_ctl.source_addr, &ax25_ctl.dest_addr, &digi, ax25_dev->dev);
+ if (!ax25) {
+ ax25_dev_put(ax25_dev);
return -ENOTCONN;
+ }
switch (ax25_ctl.cmd) {
case AX25_KILL:
@@ -369,63 +403,71 @@ static int ax25_ctl_ioctl(const unsigned int cmd, void __user *arg)
ax25_disconnect(ax25, ENETRESET);
break;
- case AX25_WINDOW:
- if (ax25->modulus == AX25_MODULUS) {
- if (ax25_ctl.arg < 1 || ax25_ctl.arg > 7)
- return -EINVAL;
- } else {
- if (ax25_ctl.arg < 1 || ax25_ctl.arg > 63)
- return -EINVAL;
- }
- ax25->window = ax25_ctl.arg;
- break;
-
- case AX25_T1:
- if (ax25_ctl.arg < 1)
- return -EINVAL;
- ax25->rtt = (ax25_ctl.arg * HZ) / 2;
- ax25->t1 = ax25_ctl.arg * HZ;
- break;
-
- case AX25_T2:
- if (ax25_ctl.arg < 1)
- return -EINVAL;
- ax25->t2 = ax25_ctl.arg * HZ;
- break;
-
- case AX25_N2:
- if (ax25_ctl.arg < 1 || ax25_ctl.arg > 31)
- return -EINVAL;
- ax25->n2count = 0;
- ax25->n2 = ax25_ctl.arg;
- break;
-
- case AX25_T3:
- if (ax25_ctl.arg < 0)
- return -EINVAL;
- ax25->t3 = ax25_ctl.arg * HZ;
- break;
-
- case AX25_IDLE:
- if (ax25_ctl.arg < 0)
- return -EINVAL;
- ax25->idle = ax25_ctl.arg * 60 * HZ;
- break;
-
- case AX25_PACLEN:
- if (ax25_ctl.arg < 16 || ax25_ctl.arg > 65535)
- return -EINVAL;
- ax25->paclen = ax25_ctl.arg;
- break;
-
- default:
- return -EINVAL;
+ case AX25_WINDOW:
+ if (ax25->modulus == AX25_MODULUS) {
+ if (ax25_ctl.arg < 1 || ax25_ctl.arg > 7)
+ goto einval_put;
+ } else {
+ if (ax25_ctl.arg < 1 || ax25_ctl.arg > 63)
+ goto einval_put;
+ }
+ ax25->window = ax25_ctl.arg;
+ break;
+
+ case AX25_T1:
+ if (ax25_ctl.arg < 1 || ax25_ctl.arg > ULONG_MAX / HZ)
+ goto einval_put;
+ ax25->rtt = (ax25_ctl.arg * HZ) / 2;
+ ax25->t1 = ax25_ctl.arg * HZ;
+ break;
+
+ case AX25_T2:
+ if (ax25_ctl.arg < 1 || ax25_ctl.arg > ULONG_MAX / HZ)
+ goto einval_put;
+ ax25->t2 = ax25_ctl.arg * HZ;
+ break;
+
+ case AX25_N2:
+ if (ax25_ctl.arg < 1 || ax25_ctl.arg > 31)
+ goto einval_put;
+ ax25->n2count = 0;
+ ax25->n2 = ax25_ctl.arg;
+ break;
+
+ case AX25_T3:
+ if (ax25_ctl.arg > ULONG_MAX / HZ)
+ goto einval_put;
+ ax25->t3 = ax25_ctl.arg * HZ;
+ break;
+
+ case AX25_IDLE:
+ if (ax25_ctl.arg > ULONG_MAX / (60 * HZ))
+ goto einval_put;
+
+ ax25->idle = ax25_ctl.arg * 60 * HZ;
+ break;
+
+ case AX25_PACLEN:
+ if (ax25_ctl.arg < 16 || ax25_ctl.arg > 65535)
+ goto einval_put;
+ ax25->paclen = ax25_ctl.arg;
+ break;
+
+ default:
+ goto einval_put;
}
- return 0;
+out_put:
+ ax25_dev_put(ax25_dev);
+ ax25_cb_put(ax25);
+ return ret;
+
+einval_put:
+ ret = -EINVAL;
+ goto out_put;
}
-static void ax25_fillin_cb_from_dev(ax25_cb *ax25, ax25_dev *ax25_dev)
+static void ax25_fillin_cb_from_dev(ax25_cb *ax25, const ax25_dev *ax25_dev)
{
ax25->rtt = msecs_to_jiffies(ax25_dev->values[AX25_VALUES_T1]) / 2;
ax25->t1 = msecs_to_jiffies(ax25_dev->values[AX25_VALUES_T1]);
@@ -489,18 +531,14 @@ ax25_cb *ax25_create_cb(void)
if ((ax25 = kzalloc(sizeof(*ax25), GFP_ATOMIC)) == NULL)
return NULL;
- atomic_set(&ax25->refcount, 1);
+ refcount_set(&ax25->refcount, 1);
skb_queue_head_init(&ax25->write_queue);
skb_queue_head_init(&ax25->frag_queue);
skb_queue_head_init(&ax25->ack_queue);
skb_queue_head_init(&ax25->reseq_queue);
- init_timer(&ax25->timer);
- init_timer(&ax25->t1timer);
- init_timer(&ax25->t2timer);
- init_timer(&ax25->t3timer);
- init_timer(&ax25->idletimer);
+ ax25_setup_timers(ax25);
ax25_fillin_cb(ax25, NULL);
@@ -515,25 +553,26 @@ ax25_cb *ax25_create_cb(void)
*/
static int ax25_setsockopt(struct socket *sock, int level, int optname,
- char __user *optval, int optlen)
+ sockptr_t optval, unsigned int optlen)
{
struct sock *sk = sock->sk;
ax25_cb *ax25;
struct net_device *dev;
char devname[IFNAMSIZ];
- int opt, res = 0;
+ unsigned int opt;
+ int res = 0;
if (level != SOL_AX25)
return -ENOPROTOOPT;
- if (optlen < sizeof(int))
+ if (optlen < sizeof(unsigned int))
return -EINVAL;
- if (get_user(opt, (int __user *)optval))
+ if (copy_from_sockptr(&opt, optval, sizeof(unsigned int)))
return -EFAULT;
lock_sock(sk);
- ax25 = ax25_sk(sk);
+ ax25 = sk_to_ax25(sk);
switch (optname) {
case AX25_WINDOW:
@@ -552,16 +591,16 @@ static int ax25_setsockopt(struct socket *sock, int level, int optname,
break;
case AX25_T1:
- if (opt < 1) {
+ if (opt < 1 || opt > UINT_MAX / HZ) {
res = -EINVAL;
break;
}
- ax25->rtt = (opt * HZ) / 2;
+ ax25->rtt = (opt * HZ) >> 1;
ax25->t1 = opt * HZ;
break;
case AX25_T2:
- if (opt < 1) {
+ if (opt < 1 || opt > UINT_MAX / HZ) {
res = -EINVAL;
break;
}
@@ -577,7 +616,7 @@ static int ax25_setsockopt(struct socket *sock, int level, int optname,
break;
case AX25_T3:
- if (opt < 1) {
+ if (opt < 1 || opt > UINT_MAX / HZ) {
res = -EINVAL;
break;
}
@@ -585,7 +624,7 @@ static int ax25_setsockopt(struct socket *sock, int level, int optname,
break;
case AX25_IDLE:
- if (opt < 0) {
+ if (opt > UINT_MAX / (60 * HZ)) {
res = -EINVAL;
break;
}
@@ -593,7 +632,7 @@ static int ax25_setsockopt(struct socket *sock, int level, int optname,
break;
case AX25_BACKOFF:
- if (opt < 0 || opt > 2) {
+ if (opt > 2) {
res = -EINVAL;
break;
}
@@ -621,16 +660,13 @@ static int ax25_setsockopt(struct socket *sock, int level, int optname,
break;
case SO_BINDTODEVICE:
- if (optlen > IFNAMSIZ)
- optlen=IFNAMSIZ;
- if (copy_from_user(devname, optval, optlen)) {
- res = -EFAULT;
- break;
- }
+ if (optlen > IFNAMSIZ - 1)
+ optlen = IFNAMSIZ - 1;
- dev = dev_get_by_name(devname);
- if (dev == NULL) {
- res = -ENODEV;
+ memset(devname, 0, sizeof(devname));
+
+ if (copy_from_sockptr(devname, optval, optlen)) {
+ res = -EFAULT;
break;
}
@@ -638,12 +674,36 @@ static int ax25_setsockopt(struct socket *sock, int level, int optname,
(sock->state != SS_UNCONNECTED ||
sk->sk_state == TCP_LISTEN)) {
res = -EADDRNOTAVAIL;
- dev_put(dev);
break;
}
+ rcu_read_lock();
+ dev = dev_get_by_name_rcu(&init_net, devname);
+ if (!dev) {
+ rcu_read_unlock();
+ res = -ENODEV;
+ break;
+ }
+
+ if (ax25->ax25_dev) {
+ if (dev == ax25->ax25_dev->dev) {
+ rcu_read_unlock();
+ break;
+ }
+ netdev_put(ax25->ax25_dev->dev, &ax25->dev_tracker);
+ ax25_dev_put(ax25->ax25_dev);
+ }
+
ax25->ax25_dev = ax25_dev_ax25dev(dev);
+ if (!ax25->ax25_dev) {
+ rcu_read_unlock();
+ res = -ENODEV;
+ break;
+ }
ax25_fillin_cb(ax25, ax25->ax25_dev);
+ netdev_hold(dev, &ax25->dev_tracker, GFP_ATOMIC);
+ ax25_dev_hold(ax25->ax25_dev);
+ rcu_read_unlock();
break;
default:
@@ -674,11 +734,11 @@ static int ax25_getsockopt(struct socket *sock, int level, int optname,
if (maxlen < 1)
return -EFAULT;
- valptr = (void *) &val;
+ valptr = &val;
length = min_t(unsigned int, maxlen, sizeof(int));
lock_sock(sk);
- ax25 = ax25_sk(sk);
+ ax25 = sk_to_ax25(sk);
switch (optname) {
case AX25_WINDOW:
@@ -729,14 +789,14 @@ static int ax25_getsockopt(struct socket *sock, int level, int optname,
ax25_dev = ax25->ax25_dev;
if (ax25_dev != NULL && ax25_dev->dev != NULL) {
- strlcpy(devname, ax25_dev->dev->name, sizeof(devname));
+ strscpy(devname, ax25_dev->dev->name, sizeof(devname));
length = strlen(devname) + 1;
} else {
*devname = '\0';
length = 1;
}
- valptr = (void *) devname;
+ valptr = devname;
break;
default:
@@ -777,14 +837,21 @@ out:
static struct proto ax25_proto = {
.name = "AX25",
.owner = THIS_MODULE,
- .obj_size = sizeof(struct sock),
+ .obj_size = sizeof(struct ax25_sock),
};
-static int ax25_create(struct socket *sock, int protocol)
+static int ax25_create(struct net *net, struct socket *sock, int protocol,
+ int kern)
{
struct sock *sk;
ax25_cb *ax25;
+ if (protocol < 0 || protocol > U8_MAX)
+ return -EINVAL;
+
+ if (!net_eq(net, &init_net))
+ return -EAFNOSUPPORT;
+
switch (sock->type) {
case SOCK_DGRAM:
if (protocol == 0 || protocol == PF_AX25)
@@ -813,11 +880,13 @@ static int ax25_create(struct socket *sock, int protocol)
case AX25_P_NETROM:
if (ax25_protocol_is_registered(AX25_P_NETROM))
return -ESOCKTNOSUPPORT;
+ break;
#endif
#ifdef CONFIG_ROSE_MODULE
case AX25_P_ROSE:
if (ax25_protocol_is_registered(AX25_P_ROSE))
return -ESOCKTNOSUPPORT;
+ break;
#endif
default:
break;
@@ -825,15 +894,18 @@ static int ax25_create(struct socket *sock, int protocol)
break;
case SOCK_RAW:
+ if (!capable(CAP_NET_RAW))
+ return -EPERM;
break;
default:
return -ESOCKTNOSUPPORT;
}
- if ((sk = sk_alloc(PF_AX25, GFP_ATOMIC, &ax25_proto, 1)) == NULL)
+ sk = sk_alloc(net, PF_AX25, GFP_ATOMIC, &ax25_proto, kern);
+ if (sk == NULL)
return -ENOMEM;
- ax25 = sk->sk_protinfo = ax25_create_cb();
+ ax25 = ax25_sk(sk)->cb = ax25_create_cb();
if (!ax25) {
sk_free(sk);
return -ENOMEM;
@@ -855,7 +927,8 @@ struct sock *ax25_make_new(struct sock *osk, struct ax25_dev *ax25_dev)
struct sock *sk;
ax25_cb *ax25, *oax25;
- if ((sk = sk_alloc(PF_AX25, GFP_ATOMIC, osk->sk_prot, 1)) == NULL)
+ sk = sk_alloc(sock_net(osk), PF_AX25, GFP_ATOMIC, osk->sk_prot, 0);
+ if (sk == NULL)
return NULL;
if ((ax25 = ax25_create_cb()) == NULL) {
@@ -876,18 +949,15 @@ struct sock *ax25_make_new(struct sock *osk, struct ax25_dev *ax25_dev)
sock_init_data(NULL, sk);
- sk->sk_destruct = ax25_free_sock;
sk->sk_type = osk->sk_type;
- sk->sk_socket = osk->sk_socket;
- sk->sk_priority = osk->sk_priority;
+ sk->sk_priority = READ_ONCE(osk->sk_priority);
sk->sk_protocol = osk->sk_protocol;
sk->sk_rcvbuf = osk->sk_rcvbuf;
sk->sk_sndbuf = osk->sk_sndbuf;
sk->sk_state = TCP_ESTABLISHED;
- sk->sk_sleep = osk->sk_sleep;
sock_copy_flags(sk, osk);
- oax25 = ax25_sk(osk);
+ oax25 = sk_to_ax25(osk);
ax25->modulus = oax25->modulus;
ax25->backoff = oax25->backoff;
@@ -915,7 +985,8 @@ struct sock *ax25_make_new(struct sock *osk, struct ax25_dev *ax25_dev)
}
}
- sk->sk_protinfo = ax25;
+ ax25_sk(sk)->cb = ax25;
+ sk->sk_destruct = ax25_free_sock;
ax25->sk = sk;
return sk;
@@ -925,21 +996,25 @@ static int ax25_release(struct socket *sock)
{
struct sock *sk = sock->sk;
ax25_cb *ax25;
+ ax25_dev *ax25_dev;
if (sk == NULL)
return 0;
sock_hold(sk);
- sock_orphan(sk);
lock_sock(sk);
- ax25 = ax25_sk(sk);
+ sock_orphan(sk);
+ ax25 = sk_to_ax25(sk);
+ ax25_dev = ax25->ax25_dev;
if (sk->sk_type == SOCK_SEQPACKET) {
switch (ax25->state) {
case AX25_STATE_0:
- release_sock(sk);
- ax25_disconnect(ax25, 0);
- lock_sock(sk);
+ if (!sock_flag(ax25->sk, SOCK_DEAD)) {
+ release_sock(sk);
+ ax25_disconnect(ax25, 0);
+ lock_sock(sk);
+ }
ax25_destroy_socket(ax25);
break;
@@ -949,7 +1024,8 @@ static int ax25_release(struct socket *sock)
release_sock(sk);
ax25_disconnect(ax25, 0);
lock_sock(sk);
- ax25_destroy_socket(ax25);
+ if (!sock_flag(ax25->sk, SOCK_DESTROY))
+ ax25_destroy_socket(ax25);
break;
case AX25_STATE_3:
@@ -993,6 +1069,17 @@ static int ax25_release(struct socket *sock)
sk->sk_state_change(sk);
ax25_destroy_socket(ax25);
}
+ if (ax25_dev) {
+ if (!ax25_dev->device_up) {
+ timer_delete_sync(&ax25->timer);
+ timer_delete_sync(&ax25->t1timer);
+ timer_delete_sync(&ax25->t2timer);
+ timer_delete_sync(&ax25->t3timer);
+ timer_delete_sync(&ax25->idletimer);
+ }
+ netdev_put(ax25_dev->dev, &ax25->dev_tracker);
+ ax25_dev_put(ax25_dev);
+ }
sock->sk = NULL;
release_sock(sk);
@@ -1007,7 +1094,7 @@ static int ax25_release(struct socket *sock)
* that we've implemented support for SO_BINDTODEVICE. It is however small
* and trivially backward compatible.
*/
-static int ax25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+static int ax25_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len)
{
struct sock *sk = sock->sk;
struct full_sockaddr_ax25 *addr = (struct full_sockaddr_ax25 *)uaddr;
@@ -1018,21 +1105,18 @@ static int ax25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
int err = 0;
if (addr_len != sizeof(struct sockaddr_ax25) &&
- addr_len != sizeof(struct full_sockaddr_ax25)) {
- /* support for old structure may go away some time */
+ addr_len != sizeof(struct full_sockaddr_ax25))
+ /* support for old structure may go away some time
+ * ax25_bind(): uses old (6 digipeater) socket structure.
+ */
if ((addr_len < sizeof(struct sockaddr_ax25) + sizeof(ax25_address) * 6) ||
- (addr_len > sizeof(struct full_sockaddr_ax25))) {
+ (addr_len > sizeof(struct full_sockaddr_ax25)))
return -EINVAL;
- }
-
- printk(KERN_WARNING "ax25_bind(): %s uses old (6 digipeater) socket structure.\n",
- current->comm);
- }
if (addr->fsa_ax25.sax25_family != AF_AX25)
return -EINVAL;
- user = ax25_findbyuid(current->euid);
+ user = ax25_findbyuid(current_euid());
if (user) {
call = user->call;
ax25_uid_put(user);
@@ -1045,7 +1129,7 @@ static int ax25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
lock_sock(sk);
- ax25 = ax25_sk(sk);
+ ax25 = sk_to_ax25(sk);
if (!sock_flag(sk, SOCK_ZAPPED)) {
err = -EINVAL;
goto out;
@@ -1072,8 +1156,10 @@ static int ax25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
}
}
- if (ax25_dev != NULL)
+ if (ax25_dev) {
ax25_fillin_cb(ax25, ax25_dev);
+ netdev_hold(ax25_dev->dev, &ax25->dev_tracker, GFP_ATOMIC);
+ }
done:
ax25_cb_add(ax25);
@@ -1082,17 +1168,17 @@ done:
out:
release_sock(sk);
- return 0;
+ return err;
}
/*
* FIXME: nonblock behaviour looks like it may have a bug.
*/
-static int ax25_connect(struct socket *sock, struct sockaddr *uaddr,
- int addr_len, int flags)
+static int __must_check ax25_connect(struct socket *sock,
+ struct sockaddr_unsized *uaddr, int addr_len, int flags)
{
struct sock *sk = sock->sk;
- ax25_cb *ax25 = ax25_sk(sk), *ax25t;
+ ax25_cb *ax25 = sk_to_ax25(sk), *ax25t;
struct full_sockaddr_ax25 *fsa = (struct full_sockaddr_ax25 *)uaddr;
ax25_digi *digi = NULL;
int ct = 0, err = 0;
@@ -1101,21 +1187,19 @@ static int ax25_connect(struct socket *sock, struct sockaddr *uaddr,
* some sanity checks. code further down depends on this
*/
- if (addr_len == sizeof(struct sockaddr_ax25)) {
- /* support for this will go away in early 2.5.x */
- printk(KERN_WARNING "ax25_connect(): %s uses obsolete socket structure\n",
- current->comm);
- }
- else if (addr_len != sizeof(struct full_sockaddr_ax25)) {
- /* support for old structure may go away some time */
+ if (addr_len == sizeof(struct sockaddr_ax25))
+ /* support for this will go away in early 2.5.x
+ * ax25_connect(): uses obsolete socket structure
+ */
+ ;
+ else if (addr_len != sizeof(struct full_sockaddr_ax25))
+ /* support for old structure may go away some time
+ * ax25_connect(): uses old (6 digipeater) socket structure.
+ */
if ((addr_len < sizeof(struct sockaddr_ax25) + sizeof(ax25_address) * 6) ||
- (addr_len > sizeof(struct full_sockaddr_ax25))) {
+ (addr_len > sizeof(struct full_sockaddr_ax25)))
return -EINVAL;
- }
- printk(KERN_WARNING "ax25_connect(): %s uses old (6 digipeater) socket structure.\n",
- current->comm);
- }
if (fsa->fsa_ax25.sax25_family != AF_AX25)
return -EINVAL;
@@ -1127,22 +1211,22 @@ static int ax25_connect(struct socket *sock, struct sockaddr *uaddr,
switch (sk->sk_state) {
case TCP_SYN_SENT: /* still trying */
err = -EINPROGRESS;
- goto out;
+ goto out_release;
case TCP_ESTABLISHED: /* connection established */
sock->state = SS_CONNECTED;
- goto out;
+ goto out_release;
case TCP_CLOSE: /* connection refused */
sock->state = SS_UNCONNECTED;
err = -ECONNREFUSED;
- goto out;
+ goto out_release;
}
}
if (sk->sk_state == TCP_ESTABLISHED && sk->sk_type == SOCK_SEQPACKET) {
err = -EISCONN; /* No reconnect on a seqpacket socket */
- goto out;
+ goto out_release;
}
sk->sk_state = TCP_CLOSE;
@@ -1157,14 +1241,17 @@ static int ax25_connect(struct socket *sock, struct sockaddr *uaddr,
if (addr_len > sizeof(struct sockaddr_ax25) &&
fsa->fsa_ax25.sax25_ndigis != 0) {
/* Valid number of digipeaters ? */
- if (fsa->fsa_ax25.sax25_ndigis < 1 || fsa->fsa_ax25.sax25_ndigis > AX25_MAX_DIGIS) {
+ if (fsa->fsa_ax25.sax25_ndigis < 1 ||
+ fsa->fsa_ax25.sax25_ndigis > AX25_MAX_DIGIS ||
+ addr_len < sizeof(struct sockaddr_ax25) +
+ sizeof(ax25_address) * fsa->fsa_ax25.sax25_ndigis) {
err = -EINVAL;
- goto out;
+ goto out_release;
}
if ((digi = kmalloc(sizeof(ax25_digi), GFP_KERNEL)) == NULL) {
err = -ENOBUFS;
- goto out;
+ goto out_release;
}
digi->ndigi = fsa->fsa_ax25.sax25_ndigis;
@@ -1183,37 +1270,27 @@ static int ax25_connect(struct socket *sock, struct sockaddr *uaddr,
}
}
- /*
- * Must bind first - autobinding in this may or may not work. If
- * the socket is already bound, check to see if the device has
- * been filled in, error if it hasn't.
- */
+ /* Must bind first - autobinding does not work. */
if (sock_flag(sk, SOCK_ZAPPED)) {
- /* check if we can remove this feature. It is broken. */
- printk(KERN_WARNING "ax25_connect(): %s uses autobind, please contact jreuter@yaina.de\n",
- current->comm);
- if ((err = ax25_rt_autobind(ax25, &fsa->fsa_ax25.sax25_call)) < 0) {
- kfree(digi);
- goto out;
- }
+ kfree(digi);
+ err = -EINVAL;
+ goto out_release;
+ }
- ax25_fillin_cb(ax25, ax25->ax25_dev);
- ax25_cb_add(ax25);
- } else {
- if (ax25->ax25_dev == NULL) {
- kfree(digi);
- err = -EHOSTUNREACH;
- goto out;
- }
+ /* Check to see if the device has been filled in, error if it hasn't. */
+ if (ax25->ax25_dev == NULL) {
+ kfree(digi);
+ err = -EHOSTUNREACH;
+ goto out_release;
}
if (sk->sk_type == SOCK_SEQPACKET &&
(ax25t=ax25_find_cb(&ax25->source_addr, &fsa->fsa_ax25.sax25_call, digi,
- ax25->ax25_dev->dev))) {
+ ax25->ax25_dev->dev))) {
kfree(digi);
err = -EADDRINUSE; /* Already such a connection */
ax25_cb_put(ax25t);
- goto out;
+ goto out_release;
}
ax25->dest_addr = fsa->fsa_ax25.sax25_call;
@@ -1223,7 +1300,7 @@ static int ax25_connect(struct socket *sock, struct sockaddr *uaddr,
if (sk->sk_type != SOCK_SEQPACKET) {
sock->state = SS_CONNECTED;
sk->sk_state = TCP_ESTABLISHED;
- goto out;
+ goto out_release;
}
/* Move to connecting socket, ax.25 lapb WAIT_UA.. */
@@ -1255,56 +1332,57 @@ static int ax25_connect(struct socket *sock, struct sockaddr *uaddr,
/* Now the loop */
if (sk->sk_state != TCP_ESTABLISHED && (flags & O_NONBLOCK)) {
err = -EINPROGRESS;
- goto out;
+ goto out_release;
}
if (sk->sk_state == TCP_SYN_SENT) {
- struct task_struct *tsk = current;
- DECLARE_WAITQUEUE(wait, tsk);
+ DEFINE_WAIT(wait);
- add_wait_queue(sk->sk_sleep, &wait);
for (;;) {
+ prepare_to_wait(sk_sleep(sk), &wait,
+ TASK_INTERRUPTIBLE);
if (sk->sk_state != TCP_SYN_SENT)
break;
- set_current_state(TASK_INTERRUPTIBLE);
- release_sock(sk);
- if (!signal_pending(tsk)) {
+ if (!signal_pending(current)) {
+ release_sock(sk);
schedule();
lock_sock(sk);
continue;
}
- current->state = TASK_RUNNING;
- remove_wait_queue(sk->sk_sleep, &wait);
- return -ERESTARTSYS;
+ err = -ERESTARTSYS;
+ break;
}
- current->state = TASK_RUNNING;
- remove_wait_queue(sk->sk_sleep, &wait);
+ finish_wait(sk_sleep(sk), &wait);
+
+ if (err)
+ goto out_release;
}
if (sk->sk_state != TCP_ESTABLISHED) {
/* Not in ABM, not in WAIT_UA -> failed */
sock->state = SS_UNCONNECTED;
err = sock_error(sk); /* Always set at this point */
- goto out;
+ goto out_release;
}
sock->state = SS_CONNECTED;
- err=0;
-out:
+ err = 0;
+out_release:
release_sock(sk);
return err;
}
-
-static int ax25_accept(struct socket *sock, struct socket *newsock, int flags)
+static int ax25_accept(struct socket *sock, struct socket *newsock,
+ struct proto_accept_arg *arg)
{
- struct task_struct *tsk = current;
- DECLARE_WAITQUEUE(wait, tsk);
struct sk_buff *skb;
struct sock *newsk;
+ ax25_dev *ax25_dev;
+ DEFINE_WAIT(wait);
struct sock *sk;
+ ax25_cb *ax25;
int err = 0;
if (sock->state != SS_UNCONNECTED)
@@ -1328,40 +1406,41 @@ static int ax25_accept(struct socket *sock, struct socket *newsock, int flags)
* The read queue this time is holding sockets ready to use
* hooked into the SABM we saved
*/
- add_wait_queue(sk->sk_sleep, &wait);
for (;;) {
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
skb = skb_dequeue(&sk->sk_receive_queue);
if (skb)
break;
- release_sock(sk);
- current->state = TASK_INTERRUPTIBLE;
- if (flags & O_NONBLOCK) {
- current->state = TASK_RUNNING;
- remove_wait_queue(sk->sk_sleep, &wait);
- return -EWOULDBLOCK;
+ if (arg->flags & O_NONBLOCK) {
+ err = -EWOULDBLOCK;
+ break;
}
- if (!signal_pending(tsk)) {
+ if (!signal_pending(current)) {
+ release_sock(sk);
schedule();
lock_sock(sk);
continue;
}
- current->state = TASK_RUNNING;
- remove_wait_queue(sk->sk_sleep, &wait);
- return -ERESTARTSYS;
+ err = -ERESTARTSYS;
+ break;
}
- current->state = TASK_RUNNING;
- remove_wait_queue(sk->sk_sleep, &wait);
+ finish_wait(sk_sleep(sk), &wait);
+
+ if (err)
+ goto out;
newsk = skb->sk;
- newsk->sk_socket = newsock;
- newsk->sk_sleep = &newsock->wait;
+ sock_graft(newsk, newsock);
/* Now attach up the new socket */
kfree_skb(skb);
- sk->sk_ack_backlog--;
- newsock->sk = newsk;
+ sk_acceptq_removed(sk);
newsock->state = SS_CONNECTED;
+ ax25 = sk_to_ax25(newsk);
+ ax25_dev = ax25->ax25_dev;
+ netdev_hold(ax25_dev->dev, &ax25->dev_tracker, GFP_ATOMIC);
+ ax25_dev_hold(ax25_dev);
out:
release_sock(sk);
@@ -1370,7 +1449,7 @@ out:
}
static int ax25_getname(struct socket *sock, struct sockaddr *uaddr,
- int *uaddr_len, int peer)
+ int peer)
{
struct full_sockaddr_ax25 *fsa = (struct full_sockaddr_ax25 *)uaddr;
struct sock *sk = sock->sk;
@@ -1378,8 +1457,9 @@ static int ax25_getname(struct socket *sock, struct sockaddr *uaddr,
ax25_cb *ax25;
int err = 0;
+ memset(fsa, 0, sizeof(*fsa));
lock_sock(sk);
- ax25 = ax25_sk(sk);
+ ax25 = sk_to_ax25(sk);
if (peer != 0) {
if (sk->sk_state != TCP_ESTABLISHED) {
@@ -1389,7 +1469,6 @@ static int ax25_getname(struct socket *sock, struct sockaddr *uaddr,
fsa->fsa_ax25.sax25_family = AF_AX25;
fsa->fsa_ax25.sax25_call = ax25->dest_addr;
- fsa->fsa_ax25.sax25_ndigis = 0;
if (ax25->digipeat != NULL) {
ndigi = ax25->digipeat->ndigi;
@@ -1409,7 +1488,7 @@ static int ax25_getname(struct socket *sock, struct sockaddr *uaddr,
fsa->fsa_digipeater[0] = null_ax25_address;
}
}
- *uaddr_len = sizeof (struct full_sockaddr_ax25);
+ err = sizeof (struct full_sockaddr_ax25);
out:
release_sock(sk);
@@ -1417,15 +1496,13 @@ out:
return err;
}
-static int ax25_sendmsg(struct kiocb *iocb, struct socket *sock,
- struct msghdr *msg, size_t len)
+static int ax25_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
{
- struct sockaddr_ax25 *usax = (struct sockaddr_ax25 *)msg->msg_name;
+ DECLARE_SOCKADDR(struct sockaddr_ax25 *, usax, msg->msg_name);
struct sock *sk = sock->sk;
struct sockaddr_ax25 sax;
struct sk_buff *skb;
ax25_digi dtmp, *dp;
- unsigned char *asmptr;
ax25_cb *ax25;
size_t size;
int lv, err, addr_len = msg->msg_namelen;
@@ -1434,7 +1511,7 @@ static int ax25_sendmsg(struct kiocb *iocb, struct socket *sock,
return -EINVAL;
lock_sock(sk);
- ax25 = ax25_sk(sk);
+ ax25 = sk_to_ax25(sk);
if (sock_flag(sk, SOCK_ZAPPED)) {
err = -EADDRNOTAVAIL;
@@ -1456,35 +1533,37 @@ static int ax25_sendmsg(struct kiocb *iocb, struct socket *sock,
err = -EMSGSIZE;
goto out;
}
-
+
if (usax != NULL) {
if (usax->sax25_family != AF_AX25) {
err = -EINVAL;
goto out;
}
- if (addr_len == sizeof(struct sockaddr_ax25)) {
- printk(KERN_WARNING "ax25_sendmsg(): %s uses obsolete socket structure\n",
- current->comm);
- }
- else if (addr_len != sizeof(struct full_sockaddr_ax25)) {
- /* support for old structure may go away some time */
+ if (addr_len == sizeof(struct sockaddr_ax25))
+ /* ax25_sendmsg(): uses obsolete socket structure */
+ ;
+ else if (addr_len != sizeof(struct full_sockaddr_ax25))
+ /* support for old structure may go away some time
+ * ax25_sendmsg(): uses old (6 digipeater)
+ * socket structure.
+ */
if ((addr_len < sizeof(struct sockaddr_ax25) + sizeof(ax25_address) * 6) ||
- (addr_len > sizeof(struct full_sockaddr_ax25))) {
- err = -EINVAL;
+ (addr_len > sizeof(struct full_sockaddr_ax25))) {
+ err = -EINVAL;
goto out;
}
- printk(KERN_WARNING "ax25_sendmsg(): %s uses old (6 digipeater) socket structure.\n",
- current->comm);
- }
if (addr_len > sizeof(struct sockaddr_ax25) && usax->sax25_ndigis != 0) {
int ct = 0;
struct full_sockaddr_ax25 *fsa = (struct full_sockaddr_ax25 *)usax;
/* Valid number of digipeaters ? */
- if (usax->sax25_ndigis < 1 || usax->sax25_ndigis > AX25_MAX_DIGIS) {
+ if (usax->sax25_ndigis < 1 ||
+ usax->sax25_ndigis > AX25_MAX_DIGIS ||
+ addr_len < sizeof(struct sockaddr_ax25) +
+ sizeof(ax25_address) * usax->sax25_ndigis) {
err = -EINVAL;
goto out;
}
@@ -1525,11 +1604,7 @@ static int ax25_sendmsg(struct kiocb *iocb, struct socket *sock,
dp = ax25->digipeat;
}
- SOCK_DEBUG(sk, "AX.25: sendto: Addresses built.\n");
-
/* Build a packet */
- SOCK_DEBUG(sk, "AX.25: sendto: building packet.\n");
-
/* Assume the worst case */
size = len + ax25->ax25_dev->dev->hard_header_len;
@@ -1539,24 +1614,18 @@ static int ax25_sendmsg(struct kiocb *iocb, struct socket *sock,
skb_reserve(skb, size - len);
- SOCK_DEBUG(sk, "AX.25: Appending user data\n");
-
/* User data follows immediately after the AX.25 data */
- if (memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len)) {
+ if (memcpy_from_msg(skb_put(skb, len), msg, len)) {
err = -EFAULT;
kfree_skb(skb);
goto out;
}
- skb->nh.raw = skb->data;
+ skb_reset_network_header(skb);
/* Add the PID if one is not supplied by the user in the skb */
- if (!ax25->pidincl) {
- asmptr = skb_push(skb, 1);
- *asmptr = sk->sk_protocol;
- }
-
- SOCK_DEBUG(sk, "AX.25: Transmitting buffer\n");
+ if (!ax25->pidincl)
+ *(u8 *)skb_push(skb, 1) = sk->sk_protocol;
if (sk->sk_type == SOCK_SEQPACKET) {
/* Connected mode sockets go via the LAPB machine */
@@ -1573,25 +1642,17 @@ static int ax25_sendmsg(struct kiocb *iocb, struct socket *sock,
goto out;
}
- asmptr = skb_push(skb, 1 + ax25_addr_size(dp));
-
- SOCK_DEBUG(sk, "Building AX.25 Header (dp=%p).\n", dp);
+ skb_push(skb, 1 + ax25_addr_size(dp));
- if (dp != NULL)
- SOCK_DEBUG(sk, "Num digipeaters=%d\n", dp->ndigi);
+ /* Building AX.25 Header */
/* Build an AX.25 header */
- asmptr += (lv = ax25_addr_build(asmptr, &ax25->source_addr,
- &sax.sax25_call, dp,
- AX25_COMMAND, AX25_MODULUS));
+ lv = ax25_addr_build(skb->data, &ax25->source_addr, &sax.sax25_call,
+ dp, AX25_COMMAND, AX25_MODULUS);
- SOCK_DEBUG(sk, "Built header (%d bytes)\n",lv);
+ skb_set_transport_header(skb, lv);
- skb->h.raw = asmptr;
-
- SOCK_DEBUG(sk, "base=%p pos=%p\n", skb->data, asmptr);
-
- *asmptr = AX25_UI;
+ *skb_transport_header(skb) = AX25_UI;
/* Datagram frames go straight out of the door as UI */
ax25_queue_xmit(skb, ax25->ax25_dev->dev);
@@ -1604,13 +1665,16 @@ out:
return err;
}
-static int ax25_recvmsg(struct kiocb *iocb, struct socket *sock,
- struct msghdr *msg, size_t size, int flags)
+static int ax25_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
+ int flags)
{
struct sock *sk = sock->sk;
- struct sk_buff *skb;
+ struct sk_buff *skb, *last;
+ struct sk_buff_head *sk_queue;
int copied;
int err = 0;
+ int off = 0;
+ long timeo;
lock_sock(sk);
/*
@@ -1622,32 +1686,52 @@ static int ax25_recvmsg(struct kiocb *iocb, struct socket *sock,
goto out;
}
- /* Now we can treat all alike */
- skb = skb_recv_datagram(sk, flags & ~MSG_DONTWAIT,
- flags & MSG_DONTWAIT, &err);
- if (skb == NULL)
- goto out;
+ /* We need support for non-blocking reads. */
+ sk_queue = &sk->sk_receive_queue;
+ skb = __skb_try_recv_datagram(sk, sk_queue, flags, &off, &err, &last);
+ /* If no packet is available, release_sock(sk) and try again. */
+ if (!skb) {
+ if (err != -EAGAIN)
+ goto out;
+ release_sock(sk);
+ timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+ while (timeo && !__skb_wait_for_more_packets(sk, sk_queue, &err,
+ &timeo, last)) {
+ skb = __skb_try_recv_datagram(sk, sk_queue, flags, &off,
+ &err, &last);
+ if (skb)
+ break;
+
+ if (err != -EAGAIN)
+ goto done;
+ }
+ if (!skb)
+ goto done;
+ lock_sock(sk);
+ }
- if (!ax25_sk(sk)->pidincl)
+ if (!sk_to_ax25(sk)->pidincl)
skb_pull(skb, 1); /* Remove PID */
- skb->h.raw = skb->data;
- copied = skb->len;
+ skb_reset_transport_header(skb);
+ copied = skb->len;
if (copied > size) {
copied = size;
msg->msg_flags |= MSG_TRUNC;
}
- skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+ skb_copy_datagram_msg(skb, 0, msg, copied);
- if (msg->msg_namelen != 0) {
- struct sockaddr_ax25 *sax = (struct sockaddr_ax25 *)msg->msg_name;
+ if (msg->msg_name) {
ax25_digi digi;
ax25_address src;
+ const unsigned char *mac = skb_mac_header(skb);
+ DECLARE_SOCKADDR(struct sockaddr_ax25 *, sax, msg->msg_name);
- ax25_addr_parse(skb->mac.raw+1, skb->data-skb->mac.raw-1, &src, NULL, &digi, NULL, NULL);
-
+ memset(sax, 0, sizeof(struct full_sockaddr_ax25));
+ ax25_addr_parse(mac + 1, skb->data - mac - 1, &src, NULL,
+ &digi, NULL, NULL);
sax->sax25_family = AF_AX25;
/* We set this correctly, even though we may not let the
application know the digi calls further down (because it
@@ -1671,6 +1755,7 @@ static int ax25_recvmsg(struct kiocb *iocb, struct socket *sock,
out:
release_sock(sk);
+done:
return err;
}
@@ -1690,7 +1775,8 @@ static int ax25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
switch (cmd) {
case TIOCOUTQ: {
long amount;
- amount = sk->sk_sndbuf - atomic_read(&sk->sk_wmem_alloc);
+
+ amount = sk->sk_sndbuf - sk_wmem_alloc_get(sk);
if (amount < 0)
amount = 0;
res = put_user(amount, (int __user *)argp);
@@ -1707,10 +1793,6 @@ static int ax25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
break;
}
- case SIOCGSTAMP:
- res = sock_get_timestamp(sk, argp);
- break;
-
case SIOCAX25ADDUID: /* Add a uid to the uid/call map table */
case SIOCAX25DELUID: /* Delete a uid from the uid/call map table */
case SIOCAX25GETUID: {
@@ -1733,7 +1815,7 @@ static int ax25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
res = -EFAULT;
break;
}
- if (amount > AX25_NOUID_BLOCK) {
+ if (amount < 0 || amount > AX25_NOUID_BLOCK) {
res = -EINVAL;
break;
}
@@ -1762,7 +1844,7 @@ static int ax25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
case SIOCAX25GETINFO:
case SIOCAX25GETINFOOLD: {
- ax25_cb *ax25 = ax25_sk(sk);
+ ax25_cb *ax25 = sk_to_ax25(sk);
struct ax25_info_struct ax25_info;
ax25_info.t1 = ax25->t1 / HZ;
@@ -1776,8 +1858,8 @@ static int ax25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
ax25_info.idletimer = ax25_display_timer(&ax25->idletimer) / (60 * HZ);
ax25_info.n2count = ax25->n2count;
ax25_info.state = ax25->state;
- ax25_info.rcv_q = atomic_read(&sk->sk_rmem_alloc);
- ax25_info.snd_q = atomic_read(&sk->sk_wmem_alloc);
+ ax25_info.rcv_q = sk_rmem_alloc_get(sk);
+ ax25_info.snd_q = sk_wmem_alloc_get(sk);
ax25_info.vs = ax25->vs;
ax25_info.vr = ax25->vr;
ax25_info.va = ax25->va;
@@ -1848,36 +1930,26 @@ static int ax25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
#ifdef CONFIG_PROC_FS
static void *ax25_info_start(struct seq_file *seq, loff_t *pos)
+ __acquires(ax25_list_lock)
{
- struct ax25_cb *ax25;
- struct hlist_node *node;
- int i = 0;
-
spin_lock_bh(&ax25_list_lock);
- ax25_for_each(ax25, node, &ax25_list) {
- if (i == *pos)
- return ax25;
- ++i;
- }
- return NULL;
+ return seq_hlist_start(&ax25_list, *pos);
}
static void *ax25_info_next(struct seq_file *seq, void *v, loff_t *pos)
{
- ++*pos;
-
- return hlist_entry( ((struct ax25_cb *)v)->ax25_node.next,
- struct ax25_cb, ax25_node);
+ return seq_hlist_next(v, &ax25_list, pos);
}
-
+
static void ax25_info_stop(struct seq_file *seq, void *v)
+ __releases(ax25_list_lock)
{
spin_unlock_bh(&ax25_list_lock);
}
static int ax25_info_show(struct seq_file *seq, void *v)
{
- ax25_cb *ax25 = v;
+ ax25_cb *ax25 = hlist_entry(v, struct ax25_cb, ax25_node);
char buf[11];
int k;
@@ -1887,8 +1959,8 @@ static int ax25_info_show(struct seq_file *seq, void *v)
* magic dev src_addr dest_addr,digi1,digi2,.. st vs vr va t1 t1 t2 t2 t3 t3 idle idle n2 n2 rtt window paclen Snd-Q Rcv-Q inode
*/
- seq_printf(seq, "%8.8lx %s %s%s ",
- (long) ax25,
+ seq_printf(seq, "%p %s %s%s ",
+ ax25,
ax25->ax25_dev == NULL? "???" : ax25->ax25_dev->dev->name,
ax2asc(buf, &ax25->source_addr),
ax25->iamdigi? "*":"");
@@ -1914,41 +1986,25 @@ static int ax25_info_show(struct seq_file *seq, void *v)
ax25->paclen);
if (ax25->sk != NULL) {
- bh_lock_sock(ax25->sk);
- seq_printf(seq," %d %d %ld\n",
- atomic_read(&ax25->sk->sk_wmem_alloc),
- atomic_read(&ax25->sk->sk_rmem_alloc),
- ax25->sk->sk_socket != NULL ? SOCK_INODE(ax25->sk->sk_socket)->i_ino : 0L);
- bh_unlock_sock(ax25->sk);
+ seq_printf(seq, " %d %d %lu\n",
+ sk_wmem_alloc_get(ax25->sk),
+ sk_rmem_alloc_get(ax25->sk),
+ sock_i_ino(ax25->sk));
} else {
seq_puts(seq, " * * *\n");
}
return 0;
}
-static struct seq_operations ax25_info_seqops = {
+static const struct seq_operations ax25_info_seqops = {
.start = ax25_info_start,
.next = ax25_info_next,
.stop = ax25_info_stop,
.show = ax25_info_show,
};
-
-static int ax25_info_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &ax25_info_seqops);
-}
-
-static struct file_operations ax25_info_fops = {
- .owner = THIS_MODULE,
- .open = ax25_info_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
#endif
-static struct net_proto_family ax25_family_ops = {
+static const struct net_proto_family ax25_family_ops = {
.family = PF_AX25,
.create = ax25_create,
.owner = THIS_MODULE,
@@ -1965,6 +2021,7 @@ static const struct proto_ops ax25_proto_ops = {
.getname = ax25_getname,
.poll = datagram_poll,
.ioctl = ax25_ioctl,
+ .gettstamp = sock_gettstamp,
.listen = ax25_listen,
.shutdown = ax25_shutdown,
.setsockopt = ax25_setsockopt,
@@ -1972,20 +2029,18 @@ static const struct proto_ops ax25_proto_ops = {
.sendmsg = ax25_sendmsg,
.recvmsg = ax25_recvmsg,
.mmap = sock_no_mmap,
- .sendpage = sock_no_sendpage,
};
/*
* Called by socket.c on kernel start up
*/
-static struct packet_type ax25_packet_type = {
- .type = __constant_htons(ETH_P_AX25),
- .dev = NULL, /* All devices */
+static struct packet_type ax25_packet_type __read_mostly = {
+ .type = cpu_to_be16(ETH_P_AX25),
.func = ax25_kiss_rcv,
};
static struct notifier_block ax25_dev_notifier = {
- .notifier_call =ax25_device_event,
+ .notifier_call = ax25_device_event,
};
static int __init ax25_init(void)
@@ -1998,11 +2053,11 @@ static int __init ax25_init(void)
sock_register(&ax25_family_ops);
dev_add_pack(&ax25_packet_type);
register_netdevice_notifier(&ax25_dev_notifier);
- ax25_register_sysctl();
- proc_net_fops_create("ax25_route", S_IRUGO, &ax25_route_fops);
- proc_net_fops_create("ax25", S_IRUGO, &ax25_info_fops);
- proc_net_fops_create("ax25_calls", S_IRUGO, &ax25_uid_fops);
+ proc_create_seq("ax25_route", 0444, init_net.proc_net, &ax25_rt_seqops);
+ proc_create_seq("ax25", 0444, init_net.proc_net, &ax25_info_seqops);
+ proc_create_seq("ax25_calls", 0444, init_net.proc_net,
+ &ax25_uid_seqops);
out:
return rc;
}
@@ -2016,19 +2071,19 @@ MODULE_ALIAS_NETPROTO(PF_AX25);
static void __exit ax25_exit(void)
{
- proc_net_remove("ax25_route");
- proc_net_remove("ax25");
- proc_net_remove("ax25_calls");
- ax25_rt_free();
- ax25_uid_free();
- ax25_dev_free();
+ remove_proc_entry("ax25_route", init_net.proc_net);
+ remove_proc_entry("ax25", init_net.proc_net);
+ remove_proc_entry("ax25_calls", init_net.proc_net);
- ax25_unregister_sysctl();
unregister_netdevice_notifier(&ax25_dev_notifier);
dev_remove_pack(&ax25_packet_type);
sock_unregister(PF_AX25);
proto_unregister(&ax25_proto);
+
+ ax25_rt_free();
+ ax25_uid_free();
+ ax25_dev_free();
}
module_exit(ax25_exit);
diff --git a/net/ax25/ax25_addr.c b/net/ax25/ax25_addr.c
index 5f0896ad0042..f68865a4d0ab 100644
--- a/net/ax25/ax25_addr.c
+++ b/net/ax25/ax25_addr.c
@@ -1,8 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*
* Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
*/
@@ -12,7 +9,6 @@
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
-#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
@@ -22,24 +18,32 @@
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <net/sock.h>
-#include <asm/uaccess.h>
-#include <asm/system.h>
+#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
/*
- * The null address is defined as a callsign of all spaces with an
- * SSID of zero.
+ * The default broadcast address of an interface is QST-0; the default address
+ * is LINUX-1. The null address is defined as a callsign of all spaces with
+ * an SSID of zero.
*/
-ax25_address null_ax25_address = {{0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x00}};
+const ax25_address ax25_bcast =
+ {{'Q' << 1, 'S' << 1, 'T' << 1, ' ' << 1, ' ' << 1, ' ' << 1, 0 << 1}};
+const ax25_address ax25_defaddr =
+ {{'L' << 1, 'I' << 1, 'N' << 1, 'U' << 1, 'X' << 1, ' ' << 1, 1 << 1}};
+const ax25_address null_ax25_address =
+ {{' ' << 1, ' ' << 1, ' ' << 1, ' ' << 1, ' ' << 1, ' ' << 1, 0 << 1}};
+
+EXPORT_SYMBOL_GPL(ax25_bcast);
+EXPORT_SYMBOL_GPL(ax25_defaddr);
EXPORT_SYMBOL(null_ax25_address);
/*
* ax25 -> ascii conversion
*/
-char *ax2asc(char *buf, ax25_address *a)
+char *ax2asc(char *buf, const ax25_address *a)
{
char c, *s;
int n;
@@ -72,9 +76,9 @@ EXPORT_SYMBOL(ax2asc);
/*
* ascii -> ax25 conversion
*/
-void asc2ax(ax25_address *addr, char *callsign)
+void asc2ax(ax25_address *addr, const char *callsign)
{
- char *s;
+ const char *s;
int n;
for (s = callsign, n = 0; n < 6; n++) {
@@ -107,7 +111,7 @@ EXPORT_SYMBOL(asc2ax);
/*
* Compare two ax.25 addresses
*/
-int ax25cmp(ax25_address *a, ax25_address *b)
+int ax25cmp(const ax25_address *a, const ax25_address *b)
{
int ct = 0;
@@ -117,10 +121,10 @@ int ax25cmp(ax25_address *a, ax25_address *b)
ct++;
}
- if ((a->ax25_call[ct] & 0x1E) == (b->ax25_call[ct] & 0x1E)) /* SSID without control bit */
- return 0;
+ if ((a->ax25_call[ct] & 0x1E) == (b->ax25_call[ct] & 0x1E)) /* SSID without control bit */
+ return 0;
- return 2; /* Partial match */
+ return 2; /* Partial match */
}
EXPORT_SYMBOL(ax25cmp);
@@ -128,7 +132,7 @@ EXPORT_SYMBOL(ax25cmp);
/*
* Compare two AX.25 digipeater paths.
*/
-int ax25digicmp(ax25_digi *digi1, ax25_digi *digi2)
+int ax25digicmp(const ax25_digi *digi1, const ax25_digi *digi2)
{
int i;
@@ -149,7 +153,9 @@ int ax25digicmp(ax25_digi *digi1, ax25_digi *digi2)
* Given an AX.25 address pull of to, from, digi list, command/response and the start of data
*
*/
-unsigned char *ax25_addr_parse(unsigned char *buf, int len, ax25_address *src, ax25_address *dest, ax25_digi *digi, int *flags, int *dama)
+const unsigned char *ax25_addr_parse(const unsigned char *buf, int len,
+ ax25_address *src, ax25_address *dest, ax25_digi *digi, int *flags,
+ int *dama)
{
int d = 0;
@@ -180,8 +186,10 @@ unsigned char *ax25_addr_parse(unsigned char *buf, int len, ax25_address *src, a
digi->ndigi = 0;
while (!(buf[-1] & AX25_EBIT)) {
- if (d >= AX25_MAX_DIGIS) return NULL; /* Max of 6 digis */
- if (len < 7) return NULL; /* Short packet */
+ if (d >= AX25_MAX_DIGIS)
+ return NULL;
+ if (len < AX25_ADDR_LEN)
+ return NULL;
memcpy(&digi->calls[d], buf, AX25_ADDR_LEN);
digi->ndigi = d + 1;
@@ -204,7 +212,8 @@ unsigned char *ax25_addr_parse(unsigned char *buf, int len, ax25_address *src, a
/*
* Assemble an AX.25 header from the bits
*/
-int ax25_addr_build(unsigned char *buf, ax25_address *src, ax25_address *dest, ax25_digi *d, int flag, int modulus)
+int ax25_addr_build(unsigned char *buf, const ax25_address *src,
+ const ax25_address *dest, const ax25_digi *d, int flag, int modulus)
{
int len = 0;
int ct = 0;
@@ -261,7 +270,7 @@ int ax25_addr_build(unsigned char *buf, ax25_address *src, ax25_address *dest, a
return len;
}
-int ax25_addr_size(ax25_digi *dp)
+int ax25_addr_size(const ax25_digi *dp)
{
if (dp == NULL)
return 2 * AX25_ADDR_LEN;
@@ -272,7 +281,7 @@ int ax25_addr_size(ax25_digi *dp)
/*
* Reverse Digipeat List. May not pass both parameters as same struct
*/
-void ax25_digi_invert(ax25_digi *in, ax25_digi *out)
+void ax25_digi_invert(const ax25_digi *in, ax25_digi *out)
{
int ct;
@@ -292,4 +301,3 @@ void ax25_digi_invert(ax25_digi *in, ax25_digi *out)
}
}
}
-
diff --git a/net/ax25/ax25_dev.c b/net/ax25/ax25_dev.c
index b787678220ff..3733c0254a50 100644
--- a/net/ax25/ax25_dev.c
+++ b/net/ax25/ax25_dev.c
@@ -1,17 +1,14 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*
* Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
*/
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
+#include <linux/slab.h>
#include <linux/in.h>
#include <linux/kernel.h>
-#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
@@ -23,14 +20,14 @@
#include <linux/if_arp.h>
#include <linux/skbuff.h>
#include <net/sock.h>
-#include <asm/uaccess.h>
-#include <asm/system.h>
+#include <linux/uaccess.h>
#include <linux/fcntl.h>
+#include <linux/list.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/init.h>
-ax25_dev *ax25_dev_list;
+static LIST_HEAD(ax25_dev_list);
DEFINE_SPINLOCK(ax25_dev_lock);
ax25_dev *ax25_addr_ax25dev(ax25_address *addr)
@@ -38,9 +35,11 @@ ax25_dev *ax25_addr_ax25dev(ax25_address *addr)
ax25_dev *ax25_dev, *res = NULL;
spin_lock_bh(&ax25_dev_lock);
- for (ax25_dev = ax25_dev_list; ax25_dev != NULL; ax25_dev = ax25_dev->next)
- if (ax25cmp(addr, (ax25_address *)ax25_dev->dev->dev_addr) == 0) {
+ list_for_each_entry(ax25_dev, &ax25_dev_list, list)
+ if (ax25cmp(addr, (const ax25_address *)ax25_dev->dev->dev_addr) == 0) {
res = ax25_dev;
+ ax25_dev_hold(ax25_dev);
+ break;
}
spin_unlock_bh(&ax25_dev_lock);
@@ -55,17 +54,17 @@ void ax25_dev_device_up(struct net_device *dev)
{
ax25_dev *ax25_dev;
- if ((ax25_dev = kzalloc(sizeof(*ax25_dev), GFP_ATOMIC)) == NULL) {
+ ax25_dev = kzalloc(sizeof(*ax25_dev), GFP_KERNEL);
+ if (!ax25_dev) {
printk(KERN_ERR "AX.25: ax25_dev_device_up - out of memory\n");
return;
}
- ax25_unregister_sysctl();
-
- dev->ax25_ptr = ax25_dev;
+ refcount_set(&ax25_dev->refcount, 1);
ax25_dev->dev = dev;
- dev_hold(dev);
+ netdev_hold(dev, &ax25_dev->dev_tracker, GFP_KERNEL);
ax25_dev->forward = NULL;
+ ax25_dev->device_up = true;
ax25_dev->values[AX25_VALUES_IPDEFMODE] = AX25_DEF_IPDEFMODE;
ax25_dev->values[AX25_VALUES_AXDEFMODE] = AX25_DEF_AXDEFMODE;
@@ -80,18 +79,21 @@ void ax25_dev_device_up(struct net_device *dev)
ax25_dev->values[AX25_VALUES_N2] = AX25_DEF_N2;
ax25_dev->values[AX25_VALUES_PACLEN] = AX25_DEF_PACLEN;
ax25_dev->values[AX25_VALUES_PROTOCOL] = AX25_DEF_PROTOCOL;
+
+#ifdef CONFIG_AX25_DAMA_SLAVE
ax25_dev->values[AX25_VALUES_DS_TIMEOUT]= AX25_DEF_DS_TIMEOUT;
+#endif
#if defined(CONFIG_AX25_DAMA_SLAVE) || defined(CONFIG_AX25_DAMA_MASTER)
- init_timer(&ax25_dev->dama.slave_timer);
+ ax25_ds_setup_timer(ax25_dev);
#endif
spin_lock_bh(&ax25_dev_lock);
- ax25_dev->next = ax25_dev_list;
- ax25_dev_list = ax25_dev;
+ list_add(&ax25_dev->list, &ax25_dev_list);
+ rcu_assign_pointer(dev->ax25_ptr, ax25_dev);
spin_unlock_bh(&ax25_dev_lock);
- ax25_register_sysctl();
+ ax25_register_dev_sysctl(ax25_dev);
}
void ax25_dev_device_down(struct net_device *dev)
@@ -101,46 +103,32 @@ void ax25_dev_device_down(struct net_device *dev)
if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL)
return;
- ax25_unregister_sysctl();
+ ax25_unregister_dev_sysctl(ax25_dev);
spin_lock_bh(&ax25_dev_lock);
#ifdef CONFIG_AX25_DAMA_SLAVE
- ax25_ds_del_timer(ax25_dev);
+ timer_shutdown_sync(&ax25_dev->dama.slave_timer);
#endif
/*
* Remove any packet forwarding that points to this device.
*/
- for (s = ax25_dev_list; s != NULL; s = s->next)
+ list_for_each_entry(s, &ax25_dev_list, list)
if (s->forward == dev)
s->forward = NULL;
- if ((s = ax25_dev_list) == ax25_dev) {
- ax25_dev_list = s->next;
- spin_unlock_bh(&ax25_dev_lock);
- dev_put(dev);
- kfree(ax25_dev);
- ax25_register_sysctl();
- return;
- }
-
- while (s != NULL && s->next != NULL) {
- if (s->next == ax25_dev) {
- s->next = ax25_dev->next;
- spin_unlock_bh(&ax25_dev_lock);
- dev_put(dev);
- kfree(ax25_dev);
- ax25_register_sysctl();
- return;
+ list_for_each_entry(s, &ax25_dev_list, list) {
+ if (s == ax25_dev) {
+ list_del(&s->list);
+ break;
}
-
- s = s->next;
}
- spin_unlock_bh(&ax25_dev_lock);
- dev->ax25_ptr = NULL;
- ax25_register_sysctl();
+ RCU_INIT_POINTER(dev->ax25_ptr, NULL);
+ spin_unlock_bh(&ax25_dev_lock);
+ netdev_put(dev, &ax25_dev->dev_tracker);
+ ax25_dev_put(ax25_dev);
}
int ax25_fwd_ioctl(unsigned int cmd, struct ax25_fwd_struct *fwd)
@@ -152,20 +140,32 @@ int ax25_fwd_ioctl(unsigned int cmd, struct ax25_fwd_struct *fwd)
switch (cmd) {
case SIOCAX25ADDFWD:
- if ((fwd_dev = ax25_addr_ax25dev(&fwd->port_to)) == NULL)
+ fwd_dev = ax25_addr_ax25dev(&fwd->port_to);
+ if (!fwd_dev) {
+ ax25_dev_put(ax25_dev);
return -EINVAL;
- if (ax25_dev->forward != NULL)
+ }
+ if (ax25_dev->forward) {
+ ax25_dev_put(fwd_dev);
+ ax25_dev_put(ax25_dev);
return -EINVAL;
+ }
ax25_dev->forward = fwd_dev->dev;
+ ax25_dev_put(fwd_dev);
+ ax25_dev_put(ax25_dev);
break;
case SIOCAX25DELFWD:
- if (ax25_dev->forward == NULL)
+ if (!ax25_dev->forward) {
+ ax25_dev_put(ax25_dev);
return -EINVAL;
+ }
ax25_dev->forward = NULL;
+ ax25_dev_put(ax25_dev);
break;
default:
+ ax25_dev_put(ax25_dev);
return -EINVAL;
}
@@ -190,16 +190,13 @@ struct net_device *ax25_fwd_dev(struct net_device *dev)
*/
void __exit ax25_dev_free(void)
{
- ax25_dev *s, *ax25_dev;
+ ax25_dev *s, *n;
spin_lock_bh(&ax25_dev_lock);
- ax25_dev = ax25_dev_list;
- while (ax25_dev != NULL) {
- s = ax25_dev;
- dev_put(ax25_dev->dev);
- ax25_dev = ax25_dev->next;
- kfree(s);
+ list_for_each_entry_safe(s, n, &ax25_dev_list, list) {
+ netdev_put(s->dev, &s->dev_tracker);
+ list_del(&s->list);
+ ax25_dev_put(s);
}
- ax25_dev_list = NULL;
spin_unlock_bh(&ax25_dev_lock);
}
diff --git a/net/ax25/ax25_ds_in.c b/net/ax25/ax25_ds_in.c
index edcaa897027c..c62f8fb06189 100644
--- a/net/ax25/ax25_ds_in.c
+++ b/net/ax25/ax25_ds_in.c
@@ -1,8 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*
* Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
* Copyright (C) Joerg Reuter DL1BKE (jreuter@yaina.de)
@@ -12,7 +9,6 @@
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
-#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
@@ -23,8 +19,7 @@
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/tcp_states.h>
-#include <asm/uaccess.h>
-#include <asm/system.h>
+#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
@@ -76,7 +71,7 @@ static int ax25_ds_state1_machine(ax25_cb *ax25, struct sk_buff *skb, int framet
}
ax25_dama_on(ax25);
- /* according to DK4EG´s spec we are required to
+ /* according to DK4EG's spec we are required to
* send a RR RESPONSE FINAL NR=0.
*/
@@ -301,4 +296,3 @@ int ax25_ds_frame_in(ax25_cb *ax25, struct sk_buff *skb, int type)
return queued;
}
-
diff --git a/net/ax25/ax25_ds_subr.c b/net/ax25/ax25_ds_subr.c
index 4d22d4430ec8..f00e27df3c76 100644
--- a/net/ax25/ax25_ds_subr.c
+++ b/net/ax25/ax25_ds_subr.c
@@ -1,8 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*
* Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
* Copyright (C) Joerg Reuter DL1BKE (jreuter@yaina.de)
@@ -12,19 +9,18 @@
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
-#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/spinlock.h>
#include <linux/net.h>
+#include <linux/gfp.h>
#include <net/ax25.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <net/sock.h>
-#include <asm/uaccess.h>
-#include <asm/system.h>
+#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
@@ -40,9 +36,8 @@ void ax25_ds_nr_error_recovery(ax25_cb *ax25)
void ax25_ds_enquiry_response(ax25_cb *ax25)
{
ax25_cb *ax25o;
- struct hlist_node *node;
- /* Please note that neither DK4EG´s nor DG2FEF´s
+ /* Please note that neither DK4EG's nor DG2FEF's
* DAMA spec mention the following behaviour as seen
* with TheFirmware:
*
@@ -81,7 +76,7 @@ void ax25_ds_enquiry_response(ax25_cb *ax25)
ax25_ds_set_timer(ax25->ax25_dev);
spin_lock(&ax25_list_lock);
- ax25_for_each(ax25o, node, &ax25_list) {
+ ax25_for_each(ax25o, &ax25_list) {
if (ax25o == ax25)
continue;
@@ -137,7 +132,7 @@ static void ax25_kiss_cmd(ax25_dev *ax25_dev, unsigned char cmd, unsigned char p
if ((skb = alloc_skb(2, GFP_ATOMIC)) == NULL)
return;
- skb->nh.raw = skb->data;
+ skb_reset_network_header(skb);
p = skb_put(skb, 2);
*p++ = cmd;
@@ -160,10 +155,9 @@ static int ax25_check_dama_slave(ax25_dev *ax25_dev)
{
ax25_cb *ax25;
int res = 0;
- struct hlist_node *node;
spin_lock(&ax25_list_lock);
- ax25_for_each(ax25, node, &ax25_list)
+ ax25_for_each(ax25, &ax25_list)
if (ax25->ax25_dev == ax25_dev && (ax25->condition & AX25_COND_DAMA_MODE) && ax25->state > AX25_STATE_1) {
res = 1;
break;
@@ -208,4 +202,3 @@ void ax25_dama_off(ax25_cb *ax25)
ax25->condition &= ~AX25_COND_DAMA_MODE;
ax25_dev_dama_off(ax25->ax25_dev);
}
-
diff --git a/net/ax25/ax25_ds_timer.c b/net/ax25/ax25_ds_timer.c
index 4f44185955c7..0c9e7775aa54 100644
--- a/net/ax25/ax25_ds_timer.c
+++ b/net/ax25/ax25_ds_timer.c
@@ -1,8 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*
* Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
* Copyright (C) Joerg Reuter DL1BKE (jreuter@yaina.de)
@@ -24,13 +21,12 @@
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <net/sock.h>
-#include <asm/uaccess.h>
-#include <asm/system.h>
+#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
-static void ax25_ds_timeout(unsigned long);
+static void ax25_ds_timeout(struct timer_list *);
/*
* Add DAMA slave timeout timer to timer list.
@@ -40,19 +36,15 @@ static void ax25_ds_timeout(unsigned long);
* 1/10th of a second.
*/
-static void ax25_ds_add_timer(ax25_dev *ax25_dev)
+void ax25_ds_setup_timer(ax25_dev *ax25_dev)
{
- struct timer_list *t = &ax25_dev->dama.slave_timer;
- t->data = (unsigned long) ax25_dev;
- t->function = &ax25_ds_timeout;
- t->expires = jiffies + HZ;
- add_timer(t);
+ timer_setup(&ax25_dev->dama.slave_timer, ax25_ds_timeout, 0);
}
void ax25_ds_del_timer(ax25_dev *ax25_dev)
{
if (ax25_dev)
- del_timer(&ax25_dev->dama.slave_timer);
+ timer_delete(&ax25_dev->dama.slave_timer);
}
void ax25_ds_set_timer(ax25_dev *ax25_dev)
@@ -60,10 +52,9 @@ void ax25_ds_set_timer(ax25_dev *ax25_dev)
if (ax25_dev == NULL) /* paranoia */
return;
- del_timer(&ax25_dev->dama.slave_timer);
ax25_dev->dama.slave_timeout =
msecs_to_jiffies(ax25_dev->values[AX25_VALUES_DS_TIMEOUT]) / 10;
- ax25_ds_add_timer(ax25_dev);
+ mod_timer(&ax25_dev->dama.slave_timer, jiffies + HZ);
}
/*
@@ -71,11 +62,10 @@ void ax25_ds_set_timer(ax25_dev *ax25_dev)
* Silently discard all (slave) connections in case our master forgot us...
*/
-static void ax25_ds_timeout(unsigned long arg)
+static void ax25_ds_timeout(struct timer_list *t)
{
- ax25_dev *ax25_dev = (struct ax25_dev *) arg;
+ ax25_dev *ax25_dev = timer_container_of(ax25_dev, t, dama.slave_timer);
ax25_cb *ax25;
- struct hlist_node *node;
if (ax25_dev == NULL || !ax25_dev->dama.slave)
return; /* Yikes! */
@@ -86,7 +76,7 @@ static void ax25_ds_timeout(unsigned long arg)
}
spin_lock(&ax25_list_lock);
- ax25_for_each(ax25, node, &ax25_list) {
+ ax25_for_each(ax25, &ax25_list) {
if (ax25->ax25_dev != ax25_dev || !(ax25->condition & AX25_COND_DAMA_MODE))
continue;
@@ -108,6 +98,7 @@ void ax25_ds_heartbeat_expiry(ax25_cb *ax25)
switch (ax25->state) {
case AX25_STATE_0:
+ case AX25_STATE_2:
/* Magic here: If we listen() and a new link dies before it
is accepted() it isn't 'dead' so doesn't get removed. */
if (!sk || sock_flag(sk, SOCK_DESTROY) ||
@@ -116,8 +107,9 @@ void ax25_ds_heartbeat_expiry(ax25_cb *ax25)
if (sk) {
sock_hold(sk);
ax25_destroy_socket(ax25);
- sock_put(sk);
bh_unlock_sock(sk);
+ /* Ungrab socket and destroy it */
+ sock_put(sk);
} else
ax25_destroy_socket(ax25);
return;
@@ -130,7 +122,7 @@ void ax25_ds_heartbeat_expiry(ax25_cb *ax25)
*/
if (sk != NULL) {
if (atomic_read(&sk->sk_rmem_alloc) <
- (sk->sk_rcvbuf / 2) &&
+ (sk->sk_rcvbuf >> 1) &&
(ax25->condition & AX25_COND_OWN_RX_BUSY)) {
ax25->condition &= ~AX25_COND_OWN_RX_BUSY;
ax25->condition &= ~AX25_COND_ACK_PENDING;
@@ -219,7 +211,8 @@ void ax25_ds_t1_timeout(ax25_cb *ax25)
case AX25_STATE_2:
if (ax25->n2count == ax25->n2) {
ax25_send_control(ax25, AX25_DISC, AX25_POLLON, AX25_COMMAND);
- ax25_disconnect(ax25, ETIMEDOUT);
+ if (!sock_flag(ax25->sk, SOCK_DESTROY))
+ ax25_disconnect(ax25, ETIMEDOUT);
return;
} else {
ax25->n2count++;
diff --git a/net/ax25/ax25_iface.c b/net/ax25/ax25_iface.c
index 07ac0207eb69..979bc4b828a0 100644
--- a/net/ax25/ax25_iface.c
+++ b/net/ax25/ax25_iface.c
@@ -1,8 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*
* Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
*/
@@ -12,34 +9,26 @@
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
-#include <linux/sched.h>
#include <linux/spinlock.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
+#include <linux/slab.h>
#include <net/ax25.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <net/sock.h>
-#include <asm/uaccess.h>
-#include <asm/system.h>
+#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
-static struct protocol_struct {
- struct protocol_struct *next;
- unsigned int pid;
- int (*func)(struct sk_buff *, ax25_cb *);
-} *protocol_list = NULL;
+static struct ax25_protocol *protocol_list;
static DEFINE_RWLOCK(protocol_list_lock);
-static struct linkfail_struct {
- struct linkfail_struct *next;
- void (*func)(ax25_cb *, int);
-} *linkfail_list = NULL;
+static HLIST_HEAD(ax25_linkfail_list);
static DEFINE_SPINLOCK(linkfail_lock);
static struct listen_struct {
@@ -49,121 +38,67 @@ static struct listen_struct {
} *listen_list = NULL;
static DEFINE_SPINLOCK(listen_lock);
-int ax25_protocol_register(unsigned int pid,
- int (*func)(struct sk_buff *, ax25_cb *))
+/*
+ * Do not register the internal protocols AX25_P_TEXT, AX25_P_SEGMENT,
+ * AX25_P_IP or AX25_P_ARP ...
+ */
+void ax25_register_pid(struct ax25_protocol *ap)
{
- struct protocol_struct *protocol;
-
- if (pid == AX25_P_TEXT || pid == AX25_P_SEGMENT)
- return 0;
-#ifdef CONFIG_INET
- if (pid == AX25_P_IP || pid == AX25_P_ARP)
- return 0;
-#endif
- if ((protocol = kmalloc(sizeof(*protocol), GFP_ATOMIC)) == NULL)
- return 0;
-
- protocol->pid = pid;
- protocol->func = func;
-
write_lock_bh(&protocol_list_lock);
- protocol->next = protocol_list;
- protocol_list = protocol;
+ ap->next = protocol_list;
+ protocol_list = ap;
write_unlock_bh(&protocol_list_lock);
-
- return 1;
}
-EXPORT_SYMBOL(ax25_protocol_register);
+EXPORT_SYMBOL_GPL(ax25_register_pid);
void ax25_protocol_release(unsigned int pid)
{
- struct protocol_struct *s, *protocol;
+ struct ax25_protocol *protocol;
write_lock_bh(&protocol_list_lock);
protocol = protocol_list;
- if (protocol == NULL) {
- write_unlock_bh(&protocol_list_lock);
- return;
- }
+ if (protocol == NULL)
+ goto out;
if (protocol->pid == pid) {
protocol_list = protocol->next;
- write_unlock_bh(&protocol_list_lock);
- kfree(protocol);
- return;
+ goto out;
}
while (protocol != NULL && protocol->next != NULL) {
if (protocol->next->pid == pid) {
- s = protocol->next;
protocol->next = protocol->next->next;
- write_unlock_bh(&protocol_list_lock);
- kfree(s);
- return;
+ goto out;
}
protocol = protocol->next;
}
+out:
write_unlock_bh(&protocol_list_lock);
}
EXPORT_SYMBOL(ax25_protocol_release);
-int ax25_linkfail_register(void (*func)(ax25_cb *, int))
+void ax25_linkfail_register(struct ax25_linkfail *lf)
{
- struct linkfail_struct *linkfail;
-
- if ((linkfail = kmalloc(sizeof(*linkfail), GFP_ATOMIC)) == NULL)
- return 0;
-
- linkfail->func = func;
-
spin_lock_bh(&linkfail_lock);
- linkfail->next = linkfail_list;
- linkfail_list = linkfail;
+ hlist_add_head(&lf->lf_node, &ax25_linkfail_list);
spin_unlock_bh(&linkfail_lock);
-
- return 1;
}
EXPORT_SYMBOL(ax25_linkfail_register);
-void ax25_linkfail_release(void (*func)(ax25_cb *, int))
+void ax25_linkfail_release(struct ax25_linkfail *lf)
{
- struct linkfail_struct *s, *linkfail;
-
spin_lock_bh(&linkfail_lock);
- linkfail = linkfail_list;
- if (linkfail == NULL) {
- spin_unlock_bh(&linkfail_lock);
- return;
- }
-
- if (linkfail->func == func) {
- linkfail_list = linkfail->next;
- spin_unlock_bh(&linkfail_lock);
- kfree(linkfail);
- return;
- }
-
- while (linkfail != NULL && linkfail->next != NULL) {
- if (linkfail->next->func == func) {
- s = linkfail->next;
- linkfail->next = linkfail->next->next;
- spin_unlock_bh(&linkfail_lock);
- kfree(s);
- return;
- }
-
- linkfail = linkfail->next;
- }
+ hlist_del_init(&lf->lf_node);
spin_unlock_bh(&linkfail_lock);
}
EXPORT_SYMBOL(ax25_linkfail_release);
-int ax25_listen_register(ax25_address *callsign, struct net_device *dev)
+int ax25_listen_register(const ax25_address *callsign, struct net_device *dev)
{
struct listen_struct *listen;
@@ -171,7 +106,7 @@ int ax25_listen_register(ax25_address *callsign, struct net_device *dev)
return 0;
if ((listen = kmalloc(sizeof(*listen), GFP_ATOMIC)) == NULL)
- return 0;
+ return -ENOMEM;
listen->callsign = *callsign;
listen->dev = dev;
@@ -181,12 +116,12 @@ int ax25_listen_register(ax25_address *callsign, struct net_device *dev)
listen_list = listen;
spin_unlock_bh(&listen_lock);
- return 1;
+ return 0;
}
EXPORT_SYMBOL(ax25_listen_register);
-void ax25_listen_release(ax25_address *callsign, struct net_device *dev)
+void ax25_listen_release(const ax25_address *callsign, struct net_device *dev)
{
struct listen_struct *s, *listen;
@@ -223,7 +158,7 @@ EXPORT_SYMBOL(ax25_listen_release);
int (*ax25_protocol_function(unsigned int pid))(struct sk_buff *, ax25_cb *)
{
int (*res)(struct sk_buff *, ax25_cb *) = NULL;
- struct protocol_struct *protocol;
+ struct ax25_protocol *protocol;
read_lock(&protocol_list_lock);
for (protocol = protocol_list; protocol != NULL; protocol = protocol->next)
@@ -236,13 +171,14 @@ int (*ax25_protocol_function(unsigned int pid))(struct sk_buff *, ax25_cb *)
return res;
}
-int ax25_listen_mine(ax25_address *callsign, struct net_device *dev)
+int ax25_listen_mine(const ax25_address *callsign, struct net_device *dev)
{
struct listen_struct *listen;
spin_lock_bh(&listen_lock);
for (listen = listen_list; listen != NULL; listen = listen->next)
- if (ax25cmp(&listen->callsign, callsign) == 0 && (listen->dev == dev || listen->dev == NULL)) {
+ if (ax25cmp(&listen->callsign, callsign) == 0 &&
+ (listen->dev == dev || listen->dev == NULL)) {
spin_unlock_bh(&listen_lock);
return 1;
}
@@ -253,17 +189,17 @@ int ax25_listen_mine(ax25_address *callsign, struct net_device *dev)
void ax25_link_failed(ax25_cb *ax25, int reason)
{
- struct linkfail_struct *linkfail;
+ struct ax25_linkfail *lf;
spin_lock_bh(&linkfail_lock);
- for (linkfail = linkfail_list; linkfail != NULL; linkfail = linkfail->next)
- (linkfail->func)(ax25, reason);
+ hlist_for_each_entry(lf, &ax25_linkfail_list, lf_node)
+ lf->func(ax25, reason);
spin_unlock_bh(&linkfail_lock);
}
int ax25_protocol_is_registered(unsigned int pid)
{
- struct protocol_struct *protocol;
+ struct ax25_protocol *protocol;
int res = 0;
read_lock_bh(&protocol_list_lock);
diff --git a/net/ax25/ax25_in.c b/net/ax25/ax25_in.c
index e9d94291581e..f2d66af86359 100644
--- a/net/ax25/ax25_in.c
+++ b/net/ax25/ax25_in.c
@@ -1,8 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*
* Copyright (C) Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk)
* Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
@@ -14,20 +11,18 @@
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
-#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
+#include <linux/slab.h>
#include <net/ax25.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
-#include <linux/netfilter.h>
#include <net/sock.h>
#include <net/tcp_states.h>
-#include <asm/uaccess.h>
-#include <asm/system.h>
+#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
@@ -62,12 +57,14 @@ static int ax25_rx_fragment(ax25_cb *ax25, struct sk_buff *skb)
skb_reserve(skbn, AX25_MAX_HEADER_LEN);
skbn->dev = ax25->ax25_dev->dev;
- skbn->h.raw = skbn->data;
- skbn->nh.raw = skbn->data;
+ skb_reset_network_header(skbn);
+ skb_reset_transport_header(skbn);
/* Copy data from the fragments */
while ((skbo = skb_dequeue(&ax25->frag_queue)) != NULL) {
- memcpy(skb_put(skbn, skbo->len), skbo->data, skbo->len);
+ skb_copy_from_linear_data(skbo,
+ skb_put(skbn, skbo->len),
+ skbo->len);
kfree_skb(skbo);
}
@@ -123,8 +120,8 @@ int ax25_rx_iframe(ax25_cb *ax25, struct sk_buff *skb)
}
skb_pull(skb, 1); /* Remove PID */
- skb->mac.raw = skb->nh.raw;
- skb->nh.raw = skb->data;
+ skb->mac_header = skb->network_header;
+ skb_reset_network_header(skb);
skb->dev = ax25->ax25_dev->dev;
skb->pkt_type = PACKET_HOST;
skb->protocol = htons(ETH_P_IP);
@@ -184,7 +181,7 @@ static int ax25_process_rx_frame(ax25_cb *ax25, struct sk_buff *skb, int type, i
}
static int ax25_rcv(struct sk_buff *skb, struct net_device *dev,
- ax25_address *dev_addr, struct packet_type *ptype)
+ const ax25_address *dev_addr, struct packet_type *ptype)
{
ax25_address src, dest, *next_digi = NULL;
int type = 0, mine = 0, dama;
@@ -197,21 +194,17 @@ static int ax25_rcv(struct sk_buff *skb, struct net_device *dev,
* Process the AX.25/LAPB frame.
*/
- skb->h.raw = skb->data;
+ skb_reset_transport_header(skb);
- if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL) {
- kfree_skb(skb);
- return 0;
- }
+ if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL)
+ goto free;
/*
* Parse the address header.
*/
- if (ax25_addr_parse(skb->data, skb->len, &src, &dest, &dp, &type, &dama) == NULL) {
- kfree_skb(skb);
- return 0;
- }
+ if (ax25_addr_parse(skb->data, skb->len, &src, &dest, &dp, &type, &dama) == NULL)
+ goto free;
/*
* Ours perhaps ?
@@ -234,21 +227,19 @@ static int ax25_rcv(struct sk_buff *skb, struct net_device *dev,
/* UI frame - bypass LAPB processing */
if ((*skb->data & ~0x10) == AX25_UI && dp.lastrepeat + 1 == dp.ndigi) {
- skb->h.raw = skb->data + 2; /* skip control and pid */
+ skb_set_transport_header(skb, 2); /* skip control and pid */
ax25_send_to_raw(&dest, skb, skb->data[1]);
- if (!mine && ax25cmp(&dest, (ax25_address *)dev->broadcast) != 0) {
- kfree_skb(skb);
- return 0;
- }
+ if (!mine && ax25cmp(&dest, (ax25_address *)dev->broadcast) != 0)
+ goto free;
/* Now we are pointing at the pid byte */
switch (skb->data[1]) {
case AX25_P_IP:
skb_pull(skb,2); /* drop PID/CTRL */
- skb->h.raw = skb->data;
- skb->nh.raw = skb->data;
+ skb_reset_transport_header(skb);
+ skb_reset_network_header(skb);
skb->dev = dev;
skb->pkt_type = PACKET_HOST;
skb->protocol = htons(ETH_P_IP);
@@ -257,8 +248,8 @@ static int ax25_rcv(struct sk_buff *skb, struct net_device *dev,
case AX25_P_ARP:
skb_pull(skb,2);
- skb->h.raw = skb->data;
- skb->nh.raw = skb->data;
+ skb_reset_transport_header(skb);
+ skb_reset_network_header(skb);
skb->dev = dev;
skb->pkt_type = PACKET_HOST;
skb->protocol = htons(ETH_P_ARP);
@@ -300,10 +291,8 @@ static int ax25_rcv(struct sk_buff *skb, struct net_device *dev,
* If not, should we DM the incoming frame (except DMs) or
* silently ignore them. For now we stay quiet.
*/
- if (ax25_dev->values[AX25_VALUES_CONMODE] == 0) {
- kfree_skb(skb);
- return 0;
- }
+ if (ax25_dev->values[AX25_VALUES_CONMODE] == 0)
+ goto free;
/* LAPB */
@@ -338,8 +327,7 @@ static int ax25_rcv(struct sk_buff *skb, struct net_device *dev,
if ((*skb->data & ~AX25_PF) != AX25_DM && mine)
ax25_return_dm(dev, &src, &dest, &dp);
- kfree_skb(skb);
- return 0;
+ goto free;
}
/* b) received SABM(E) */
@@ -362,24 +350,21 @@ static int ax25_rcv(struct sk_buff *skb, struct net_device *dev,
return 0;
}
- ax25 = ax25_sk(make);
+ ax25 = sk_to_ax25(make);
skb_set_owner_r(skb, make);
skb_queue_head(&sk->sk_receive_queue, skb);
make->sk_state = TCP_ESTABLISHED;
- sk->sk_ack_backlog++;
+ sk_acceptq_added(sk);
bh_unlock_sock(sk);
} else {
- if (!mine) {
- kfree_skb(skb);
- return 0;
- }
+ if (!mine)
+ goto free;
if ((ax25 = ax25_create_cb()) == NULL) {
ax25_return_dm(dev, &src, &dest, &dp);
- kfree_skb(skb);
- return 0;
+ goto free;
}
ax25_fillin_cb(ax25, ax25_dev);
@@ -433,11 +418,12 @@ static int ax25_rcv(struct sk_buff *skb, struct net_device *dev,
if (sk) {
if (!sock_flag(sk, SOCK_DEAD))
- sk->sk_data_ready(sk, skb->len);
+ sk->sk_data_ready(sk);
sock_put(sk);
- } else
+ } else {
+free:
kfree_skb(skb);
-
+ }
return 0;
}
@@ -447,8 +433,16 @@ static int ax25_rcv(struct sk_buff *skb, struct net_device *dev,
int ax25_kiss_rcv(struct sk_buff *skb, struct net_device *dev,
struct packet_type *ptype, struct net_device *orig_dev)
{
- skb->sk = NULL; /* Initially we don't know who it's for */
- skb->destructor = NULL; /* Who initializes this, dammit?! */
+ skb = skb_share_check(skb, GFP_ATOMIC);
+ if (!skb)
+ return NET_RX_DROP;
+
+ skb_orphan(skb);
+
+ if (!net_eq(dev_net(dev), &init_net)) {
+ kfree_skb(skb);
+ return 0;
+ }
if ((*skb->data & 0x0F) != 0) {
kfree_skb(skb); /* Not a KISS data frame */
@@ -457,5 +451,5 @@ int ax25_kiss_rcv(struct sk_buff *skb, struct net_device *dev,
skb_pull(skb, AX25_KISS_HEADER_LEN); /* Remove the KISS byte */
- return ax25_rcv(skb, dev, (ax25_address *)dev->dev_addr, ptype);
+ return ax25_rcv(skb, dev, (const ax25_address *)dev->dev_addr, ptype);
}
diff --git a/net/ax25/ax25_ip.c b/net/ax25/ax25_ip.c
index 136c3aefa9de..215d4ccf12b9 100644
--- a/net/ax25/ax25_ip.c
+++ b/net/ax25/ax25_ip.c
@@ -1,8 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*
* Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
*/
@@ -12,19 +9,18 @@
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
-#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
+#include <linux/slab.h>
#include <net/ax25.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/skbuff.h>
#include <net/sock.h>
-#include <asm/uaccess.h>
-#include <asm/system.h>
+#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/termios.h> /* For TIOCINQ/OUTQ */
#include <linux/mm.h>
@@ -32,7 +28,6 @@
#include <linux/notifier.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
-#include <linux/netfilter.h>
#include <linux/sysctl.h>
#include <net/ip.h>
#include <net/arp.h>
@@ -47,7 +42,9 @@
#ifdef CONFIG_INET
-int ax25_hard_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, void *daddr, void *saddr, unsigned len)
+static int ax25_hard_header(struct sk_buff *skb, struct net_device *dev,
+ unsigned short type, const void *daddr,
+ const void *saddr, unsigned int len)
{
unsigned char *buff;
@@ -55,51 +52,51 @@ int ax25_hard_header(struct sk_buff *skb, struct net_device *dev, unsigned short
if (type == ETH_P_AX25)
return 0;
- /* header is an AX.25 UI frame from us to them */
- buff = skb_push(skb, AX25_HEADER_LEN);
- *buff++ = 0x00; /* KISS DATA */
+ /* header is an AX.25 UI frame from us to them */
+ buff = skb_push(skb, AX25_HEADER_LEN);
+ *buff++ = 0x00; /* KISS DATA */
if (daddr != NULL)
memcpy(buff, daddr, dev->addr_len); /* Address specified */
- buff[6] &= ~AX25_CBIT;
- buff[6] &= ~AX25_EBIT;
- buff[6] |= AX25_SSSID_SPARE;
- buff += AX25_ADDR_LEN;
-
- if (saddr != NULL)
- memcpy(buff, saddr, dev->addr_len);
- else
- memcpy(buff, dev->dev_addr, dev->addr_len);
-
- buff[6] &= ~AX25_CBIT;
- buff[6] |= AX25_EBIT;
- buff[6] |= AX25_SSSID_SPARE;
- buff += AX25_ADDR_LEN;
-
- *buff++ = AX25_UI; /* UI */
-
- /* Append a suitable AX.25 PID */
- switch (type) {
- case ETH_P_IP:
- *buff++ = AX25_P_IP;
- break;
- case ETH_P_ARP:
- *buff++ = AX25_P_ARP;
- break;
- default:
- printk(KERN_ERR "AX.25: ax25_hard_header - wrong protocol type 0x%2.2x\n", type);
- *buff++ = 0;
- break;
- }
+ buff[6] &= ~AX25_CBIT;
+ buff[6] &= ~AX25_EBIT;
+ buff[6] |= AX25_SSSID_SPARE;
+ buff += AX25_ADDR_LEN;
+
+ if (saddr != NULL)
+ memcpy(buff, saddr, dev->addr_len);
+ else
+ memcpy(buff, dev->dev_addr, dev->addr_len);
+
+ buff[6] &= ~AX25_CBIT;
+ buff[6] |= AX25_EBIT;
+ buff[6] |= AX25_SSSID_SPARE;
+ buff += AX25_ADDR_LEN;
+
+ *buff++ = AX25_UI; /* UI */
+
+ /* Append a suitable AX.25 PID */
+ switch (type) {
+ case ETH_P_IP:
+ *buff++ = AX25_P_IP;
+ break;
+ case ETH_P_ARP:
+ *buff++ = AX25_P_ARP;
+ break;
+ default:
+ printk(KERN_ERR "AX.25: ax25_hard_header - wrong protocol type 0x%2.2x\n", type);
+ *buff++ = 0;
+ break;
+ }
if (daddr != NULL)
- return AX25_HEADER_LEN;
+ return AX25_HEADER_LEN;
return -AX25_HEADER_LEN; /* Unfinished header */
}
-int ax25_rebuild_header(struct sk_buff *skb)
+netdev_tx_t ax25_ip_xmit(struct sk_buff *skb)
{
struct sk_buff *ourskb;
unsigned char *bp = skb->data;
@@ -114,21 +111,21 @@ int ax25_rebuild_header(struct sk_buff *skb)
dst = (ax25_address *)(bp + 1);
src = (ax25_address *)(bp + 8);
- if (arp_find(bp + 1, skb))
- return 1;
-
+ ax25_route_lock_use();
route = ax25_get_route(dst, NULL);
if (route) {
digipeat = route->digipeat;
dev = route->dev;
ip_mode = route->ip_mode;
- };
+ }
if (dev == NULL)
dev = skb->dev;
- if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL) {
- goto put;
+ rcu_read_lock();
+ if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL) {
+ kfree_skb(skb);
+ goto put;
}
if (bp[16] == AX25_P_IP) {
@@ -172,11 +169,11 @@ int ax25_rebuild_header(struct sk_buff *skb)
src_c = *(ax25_address *)(bp + 8);
skb_pull(ourskb, AX25_HEADER_LEN - 1); /* Keep PID */
- ourskb->nh.raw = ourskb->data;
+ skb_reset_network_header(ourskb);
ax25=ax25_send_frame(
- ourskb,
- ax25_dev->values[AX25_VALUES_PACLEN],
+ ourskb,
+ ax25_dev->values[AX25_VALUES_PACLEN],
&src_c,
&dst_c, digipeat, dev);
if (ax25) {
@@ -186,21 +183,19 @@ int ax25_rebuild_header(struct sk_buff *skb)
}
}
- bp[7] &= ~AX25_CBIT;
- bp[7] &= ~AX25_EBIT;
- bp[7] |= AX25_SSSID_SPARE;
+ bp[7] &= ~AX25_CBIT;
+ bp[7] &= ~AX25_EBIT;
+ bp[7] |= AX25_SSSID_SPARE;
- bp[14] &= ~AX25_CBIT;
- bp[14] |= AX25_EBIT;
- bp[14] |= AX25_SSSID_SPARE;
+ bp[14] &= ~AX25_CBIT;
+ bp[14] |= AX25_EBIT;
+ bp[14] |= AX25_SSSID_SPARE;
skb_pull(skb, AX25_KISS_HEADER_LEN);
if (digipeat != NULL) {
- if ((ourskb = ax25_rt_build_path(skb, src, dst, route->digipeat)) == NULL) {
- kfree_skb(skb);
+ if ((ourskb = ax25_rt_build_path(skb, src, dst, route->digipeat)) == NULL)
goto put;
- }
skb = ourskb;
}
@@ -208,25 +203,45 @@ int ax25_rebuild_header(struct sk_buff *skb)
ax25_queue_xmit(skb, dev);
put:
- if (route)
- ax25_put_route(route);
-
- return 1;
+ rcu_read_unlock();
+ ax25_route_lock_unuse();
+ return NETDEV_TX_OK;
}
#else /* INET */
-int ax25_hard_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, void *daddr, void *saddr, unsigned len)
+static int ax25_hard_header(struct sk_buff *skb, struct net_device *dev,
+ unsigned short type, const void *daddr,
+ const void *saddr, unsigned int len)
{
return -AX25_HEADER_LEN;
}
-int ax25_rebuild_header(struct sk_buff *skb)
+netdev_tx_t ax25_ip_xmit(struct sk_buff *skb)
{
- return 1;
+ kfree_skb(skb);
+ return NETDEV_TX_OK;
}
-
#endif
-EXPORT_SYMBOL(ax25_hard_header);
-EXPORT_SYMBOL(ax25_rebuild_header);
+static bool ax25_validate_header(const char *header, unsigned int len)
+{
+ ax25_digi digi;
+
+ if (!len)
+ return false;
+
+ if (header[0])
+ return true;
+
+ return ax25_addr_parse(header + 1, len - 1, NULL, NULL, &digi, NULL,
+ NULL);
+}
+
+const struct header_ops ax25_header_ops = {
+ .create = ax25_hard_header,
+ .validate = ax25_validate_header,
+};
+
+EXPORT_SYMBOL(ax25_header_ops);
+EXPORT_SYMBOL(ax25_ip_xmit);
diff --git a/net/ax25/ax25_out.c b/net/ax25/ax25_out.c
index f84047d1e8ce..8bca2ace98e5 100644
--- a/net/ax25/ax25_out.c
+++ b/net/ax25/ax25_out.c
@@ -1,8 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*
* Copyright (C) Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk)
* Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
@@ -14,27 +11,25 @@
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
-#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/spinlock.h>
#include <linux/net.h>
+#include <linux/slab.h>
#include <net/ax25.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
-#include <linux/netfilter.h>
#include <net/sock.h>
-#include <asm/uaccess.h>
-#include <asm/system.h>
+#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
static DEFINE_SPINLOCK(ax25_frag_lock);
-ax25_cb *ax25_send_frame(struct sk_buff *skb, int paclen, ax25_address *src, ax25_address *dest, ax25_digi *digi, struct net_device *dev)
+ax25_cb *ax25_send_frame(struct sk_buff *skb, int paclen, const ax25_address *src, ax25_address *dest, ax25_digi *digi, struct net_device *dev)
{
ax25_dev *ax25_dev;
ax25_cb *ax25;
@@ -44,10 +39,14 @@ ax25_cb *ax25_send_frame(struct sk_buff *skb, int paclen, ax25_address *src, ax2
* specified.
*/
if (paclen == 0) {
- if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL)
+ rcu_read_lock();
+ ax25_dev = ax25_dev_ax25dev(dev);
+ if (!ax25_dev) {
+ rcu_read_unlock();
return NULL;
-
+ }
paclen = ax25_dev->values[AX25_VALUES_PACLEN];
+ rcu_read_unlock();
}
/*
@@ -58,13 +57,19 @@ ax25_cb *ax25_send_frame(struct sk_buff *skb, int paclen, ax25_address *src, ax2
return ax25; /* It already existed */
}
- if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL)
+ rcu_read_lock();
+ ax25_dev = ax25_dev_ax25dev(dev);
+ if (!ax25_dev) {
+ rcu_read_unlock();
return NULL;
+ }
- if ((ax25 = ax25_create_cb()) == NULL)
+ if ((ax25 = ax25_create_cb()) == NULL) {
+ rcu_read_unlock();
return NULL;
-
+ }
ax25_fillin_cb(ax25, ax25_dev);
+ rcu_read_unlock();
ax25->source_addr = *src;
ax25->dest_addr = *dest;
@@ -93,6 +98,12 @@ ax25_cb *ax25_send_frame(struct sk_buff *skb, int paclen, ax25_address *src, ax2
#endif
}
+ /*
+ * There is one ref for the state machine; a caller needs
+ * one more to put it back, just like with the existing one.
+ */
+ ax25_cb_hold(ax25);
+
ax25_cb_add(ax25);
ax25->state = AX25_STATE_1;
@@ -118,6 +129,12 @@ void ax25_output(ax25_cb *ax25, int paclen, struct sk_buff *skb)
unsigned char *p;
int frontlen, len, fragno, ka9qfrag, first = 1;
+ if (paclen < 16) {
+ WARN_ON_ONCE(1);
+ kfree_skb(skb);
+ return;
+ }
+
if ((skb->len - 1) > paclen) {
if (*skb->data == AX25_P_TEXT) {
skb_pull(skb, 1); /* skip PID */
@@ -149,8 +166,9 @@ void ax25_output(ax25_cb *ax25, int paclen, struct sk_buff *skb)
if (ka9qfrag == 1) {
skb_reserve(skbn, frontlen + 2);
- skbn->nh.raw = skbn->data + (skb->nh.raw - skb->data);
- memcpy(skb_put(skbn, len), skb->data, len);
+ skb_set_network_header(skbn,
+ skb_network_offset(skb));
+ skb_copy_from_linear_data(skb, skb_put(skbn, len), len);
p = skb_push(skbn, 2);
*p++ = AX25_P_SEGMENT;
@@ -162,8 +180,9 @@ void ax25_output(ax25_cb *ax25, int paclen, struct sk_buff *skb)
}
} else {
skb_reserve(skbn, frontlen + 1);
- skbn->nh.raw = skbn->data + (skb->nh.raw - skb->data);
- memcpy(skb_put(skbn, len), skb->data, len);
+ skb_set_network_header(skbn,
+ skb_network_offset(skb));
+ skb_copy_from_linear_data(skb, skb_put(skbn, len), len);
p = skb_push(skbn, 1);
*p = AX25_P_TEXT;
}
@@ -206,7 +225,7 @@ static void ax25_send_iframe(ax25_cb *ax25, struct sk_buff *skb, int poll_bit)
if (skb == NULL)
return;
- skb->nh.raw = skb->data;
+ skb_reset_network_header(skb);
if (ax25->modulus == AX25_MODULUS) {
frame = skb_push(skb, 1);
@@ -250,8 +269,6 @@ void ax25_kick(ax25_cb *ax25)
if (start == end)
return;
- ax25->vs = start;
-
/*
* Transmit data until either we're out of data to send or
* the window is full. Send a poll on the final I frame if
@@ -260,8 +277,13 @@ void ax25_kick(ax25_cb *ax25)
/*
* Dequeue the frame and copy it.
+ * Check for race with ax25_clear_queues().
*/
skb = skb_dequeue(&ax25->write_queue);
+ if (!skb)
+ return;
+
+ ax25->vs = start;
do {
if ((skbn = skb_clone(skb, GFP_ATOMIC)) == NULL) {
@@ -313,7 +335,6 @@ void ax25_kick(ax25_cb *ax25)
void ax25_transmit_buffer(ax25_cb *ax25, struct sk_buff *skb, int type)
{
- struct sk_buff *skbn;
unsigned char *ptr;
int headroom;
@@ -324,18 +345,12 @@ void ax25_transmit_buffer(ax25_cb *ax25, struct sk_buff *skb, int type)
headroom = ax25_addr_size(ax25->digipeat);
- if (skb_headroom(skb) < headroom) {
- if ((skbn = skb_realloc_headroom(skb, headroom)) == NULL) {
+ if (unlikely(skb_headroom(skb) < headroom)) {
+ skb = skb_expand_head(skb, headroom);
+ if (!skb) {
printk(KERN_CRIT "AX.25: ax25_transmit_buffer - out of memory\n");
- kfree_skb(skb);
return;
}
-
- if (skb->sk != NULL)
- skb_set_owner_w(skbn, skb->sk);
-
- kfree_skb(skb);
- skb = skbn;
}
ptr = skb_push(skb, headroom);
@@ -353,7 +368,9 @@ void ax25_queue_xmit(struct sk_buff *skb, struct net_device *dev)
{
unsigned char *ptr;
+ rcu_read_lock();
skb->protocol = ax25_type_trans(skb, ax25_fwd_dev(dev));
+ rcu_read_unlock();
ptr = skb_push(skb, 1);
*ptr = 0x00; /* KISS */
@@ -379,4 +396,3 @@ int ax25_check_iframes_acked(ax25_cb *ax25, unsigned short nr)
}
return 0;
}
-
diff --git a/net/ax25/ax25_route.c b/net/ax25/ax25_route.c
index 8580356ace5c..10577434f40b 100644
--- a/net/ax25/ax25_route.c
+++ b/net/ax25/ax25_route.c
@@ -1,8 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*
* Copyright (C) Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk)
* Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
@@ -23,6 +20,7 @@
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
+#include <linux/slab.h>
#include <net/ax25.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
@@ -30,22 +28,22 @@
#include <linux/skbuff.h>
#include <linux/spinlock.h>
#include <net/sock.h>
-#include <asm/uaccess.h>
-#include <asm/system.h>
+#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/seq_file.h>
+#include <linux/export.h>
static ax25_route *ax25_route_list;
-static DEFINE_RWLOCK(ax25_route_lock);
+DEFINE_RWLOCK(ax25_route_lock);
void ax25_rt_device_down(struct net_device *dev)
{
ax25_route *s, *t, *ax25_rt;
- write_lock(&ax25_route_lock);
+ write_lock_bh(&ax25_route_lock);
ax25_rt = ax25_route_list;
while (ax25_rt != NULL) {
s = ax25_rt;
@@ -68,31 +66,34 @@ void ax25_rt_device_down(struct net_device *dev)
}
}
}
- write_unlock(&ax25_route_lock);
+ write_unlock_bh(&ax25_route_lock);
}
-static int ax25_rt_add(struct ax25_routes_struct *route)
+static int __must_check ax25_rt_add(struct ax25_routes_struct *route)
{
ax25_route *ax25_rt;
ax25_dev *ax25_dev;
int i;
- if ((ax25_dev = ax25_addr_ax25dev(&route->port_addr)) == NULL)
- return -EINVAL;
if (route->digi_count > AX25_MAX_DIGIS)
return -EINVAL;
- write_lock(&ax25_route_lock);
+ ax25_dev = ax25_addr_ax25dev(&route->port_addr);
+ if (!ax25_dev)
+ return -EINVAL;
+
+ write_lock_bh(&ax25_route_lock);
ax25_rt = ax25_route_list;
while (ax25_rt != NULL) {
if (ax25cmp(&ax25_rt->callsign, &route->dest_addr) == 0 &&
- ax25_rt->dev == ax25_dev->dev) {
+ ax25_rt->dev == ax25_dev->dev) {
kfree(ax25_rt->digipeat);
ax25_rt->digipeat = NULL;
if (route->digi_count != 0) {
if ((ax25_rt->digipeat = kmalloc(sizeof(ax25_digi), GFP_ATOMIC)) == NULL) {
- write_unlock(&ax25_route_lock);
+ write_unlock_bh(&ax25_route_lock);
+ ax25_dev_put(ax25_dev);
return -ENOMEM;
}
ax25_rt->digipeat->lastrepeat = -1;
@@ -102,26 +103,28 @@ static int ax25_rt_add(struct ax25_routes_struct *route)
ax25_rt->digipeat->calls[i] = route->digi_addr[i];
}
}
- write_unlock(&ax25_route_lock);
+ write_unlock_bh(&ax25_route_lock);
+ ax25_dev_put(ax25_dev);
return 0;
}
ax25_rt = ax25_rt->next;
}
if ((ax25_rt = kmalloc(sizeof(ax25_route), GFP_ATOMIC)) == NULL) {
- write_unlock(&ax25_route_lock);
+ write_unlock_bh(&ax25_route_lock);
+ ax25_dev_put(ax25_dev);
return -ENOMEM;
}
- atomic_set(&ax25_rt->refcount, 1);
ax25_rt->callsign = route->dest_addr;
ax25_rt->dev = ax25_dev->dev;
ax25_rt->digipeat = NULL;
ax25_rt->ip_mode = ' ';
if (route->digi_count != 0) {
if ((ax25_rt->digipeat = kmalloc(sizeof(ax25_digi), GFP_ATOMIC)) == NULL) {
- write_unlock(&ax25_route_lock);
+ write_unlock_bh(&ax25_route_lock);
kfree(ax25_rt);
+ ax25_dev_put(ax25_dev);
return -ENOMEM;
}
ax25_rt->digipeat->lastrepeat = -1;
@@ -133,7 +136,8 @@ static int ax25_rt_add(struct ax25_routes_struct *route)
}
ax25_rt->next = ax25_route_list;
ax25_route_list = ax25_rt;
- write_unlock(&ax25_route_lock);
+ write_unlock_bh(&ax25_route_lock);
+ ax25_dev_put(ax25_dev);
return 0;
}
@@ -152,7 +156,7 @@ static int ax25_rt_del(struct ax25_routes_struct *route)
if ((ax25_dev = ax25_addr_ax25dev(&route->port_addr)) == NULL)
return -EINVAL;
- write_lock(&ax25_route_lock);
+ write_lock_bh(&ax25_route_lock);
ax25_rt = ax25_route_list;
while (ax25_rt != NULL) {
@@ -162,19 +166,20 @@ static int ax25_rt_del(struct ax25_routes_struct *route)
ax25cmp(&route->dest_addr, &s->callsign) == 0) {
if (ax25_route_list == s) {
ax25_route_list = s->next;
- ax25_put_route(s);
+ __ax25_put_route(s);
} else {
for (t = ax25_route_list; t != NULL; t = t->next) {
if (t->next == s) {
t->next = s->next;
- ax25_put_route(s);
+ __ax25_put_route(s);
break;
}
}
}
}
}
- write_unlock(&ax25_route_lock);
+ write_unlock_bh(&ax25_route_lock);
+ ax25_dev_put(ax25_dev);
return 0;
}
@@ -188,7 +193,7 @@ static int ax25_rt_opt(struct ax25_route_opt_struct *rt_option)
if ((ax25_dev = ax25_addr_ax25dev(&rt_option->port_addr)) == NULL)
return -EINVAL;
- write_lock(&ax25_route_lock);
+ write_lock_bh(&ax25_route_lock);
ax25_rt = ax25_route_list;
while (ax25_rt != NULL) {
@@ -216,7 +221,8 @@ static int ax25_rt_opt(struct ax25_route_opt_struct *rt_option)
}
out:
- write_unlock(&ax25_route_lock);
+ write_unlock_bh(&ax25_route_lock);
+ ax25_dev_put(ax25_dev);
return err;
}
@@ -249,11 +255,12 @@ int ax25_rt_ioctl(unsigned int cmd, void __user *arg)
#ifdef CONFIG_PROC_FS
static void *ax25_rt_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(ax25_route_lock)
{
struct ax25_route *ax25_rt;
int i = 1;
-
- read_lock(&ax25_route_lock);
+
+ read_lock(&ax25_route_lock);
if (*pos == 0)
return SEQ_START_TOKEN;
@@ -269,11 +276,12 @@ static void *ax25_rt_seq_start(struct seq_file *seq, loff_t *pos)
static void *ax25_rt_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
++*pos;
- return (v == SEQ_START_TOKEN) ? ax25_route_list :
+ return (v == SEQ_START_TOKEN) ? ax25_route_list :
((struct ax25_route *) v)->next;
}
static void ax25_rt_seq_stop(struct seq_file *seq, void *v)
+ __releases(ax25_route_lock)
{
read_unlock(&ax25_route_lock);
}
@@ -320,32 +328,19 @@ static int ax25_rt_seq_show(struct seq_file *seq, void *v)
return 0;
}
-static struct seq_operations ax25_rt_seqops = {
+const struct seq_operations ax25_rt_seqops = {
.start = ax25_rt_seq_start,
.next = ax25_rt_seq_next,
.stop = ax25_rt_seq_stop,
.show = ax25_rt_seq_show,
};
-
-static int ax25_rt_info_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &ax25_rt_seqops);
-}
-
-struct file_operations ax25_route_fops = {
- .owner = THIS_MODULE,
- .open = ax25_rt_info_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
#endif
/*
* Find AX.25 route
*
* Only routes with a reference count of zero can be destroyed.
+ * Must be called with ax25_route_lock read locked.
*/
ax25_route *ax25_get_route(ax25_address *addr, struct net_device *dev)
{
@@ -353,7 +348,6 @@ ax25_route *ax25_get_route(ax25_address *addr, struct net_device *dev)
ax25_route *ax25_def_rt = NULL;
ax25_route *ax25_rt;
- read_lock(&ax25_route_lock);
/*
* Bind to the physical interface we heard them on, or the default
* route if none is found;
@@ -376,104 +370,24 @@ ax25_route *ax25_get_route(ax25_address *addr, struct net_device *dev)
if (ax25_spe_rt != NULL)
ax25_rt = ax25_spe_rt;
- if (ax25_rt != NULL)
- ax25_hold_route(ax25_rt);
-
- read_unlock(&ax25_route_lock);
-
return ax25_rt;
}
-/*
- * Adjust path: If you specify a default route and want to connect
- * a target on the digipeater path but w/o having a special route
- * set before, the path has to be truncated from your target on.
- */
-static inline void ax25_adjust_path(ax25_address *addr, ax25_digi *digipeat)
-{
- int k;
-
- for (k = 0; k < digipeat->ndigi; k++) {
- if (ax25cmp(addr, &digipeat->calls[k]) == 0)
- break;
- }
-
- digipeat->ndigi = k;
-}
-
-
-/*
- * Find which interface to use.
- */
-int ax25_rt_autobind(ax25_cb *ax25, ax25_address *addr)
-{
- ax25_uid_assoc *user;
- ax25_route *ax25_rt;
- int err;
-
- if ((ax25_rt = ax25_get_route(addr, NULL)) == NULL)
- return -EHOSTUNREACH;
-
- if ((ax25->ax25_dev = ax25_dev_ax25dev(ax25_rt->dev)) == NULL) {
- err = -EHOSTUNREACH;
- goto put;
- }
-
- user = ax25_findbyuid(current->euid);
- if (user) {
- ax25->source_addr = user->call;
- ax25_uid_put(user);
- } else {
- if (ax25_uid_policy && !capable(CAP_NET_BIND_SERVICE)) {
- err = -EPERM;
- goto put;
- }
- ax25->source_addr = *(ax25_address *)ax25->ax25_dev->dev->dev_addr;
- }
-
- if (ax25_rt->digipeat != NULL) {
- ax25->digipeat = kmemdup(ax25_rt->digipeat, sizeof(ax25_digi),
- GFP_ATOMIC);
- if (ax25->digipeat == NULL) {
- err = -ENOMEM;
- goto put;
- }
- ax25_adjust_path(addr, ax25->digipeat);
- }
-
- if (ax25->sk != NULL) {
- bh_lock_sock(ax25->sk);
- sock_reset_flag(ax25->sk, SOCK_ZAPPED);
- bh_unlock_sock(ax25->sk);
- }
-
-put:
- ax25_put_route(ax25_rt);
-
- return 0;
-}
struct sk_buff *ax25_rt_build_path(struct sk_buff *skb, ax25_address *src,
ax25_address *dest, ax25_digi *digi)
{
- struct sk_buff *skbn;
unsigned char *bp;
int len;
len = digi->ndigi * AX25_ADDR_LEN;
- if (skb_headroom(skb) < len) {
- if ((skbn = skb_realloc_headroom(skb, len)) == NULL) {
+ if (unlikely(skb_headroom(skb) < len)) {
+ skb = skb_expand_head(skb, len);
+ if (!skb) {
printk(KERN_CRIT "AX.25: ax25_dg_build_path - out of memory\n");
return NULL;
}
-
- if (skb->sk != NULL)
- skb_set_owner_w(skbn, skb->sk);
-
- kfree_skb(skb);
-
- skb = skbn;
}
bp = skb_push(skb, len);
@@ -490,7 +404,7 @@ void __exit ax25_rt_free(void)
{
ax25_route *s, *ax25_rt = ax25_route_list;
- write_lock(&ax25_route_lock);
+ write_lock_bh(&ax25_route_lock);
while (ax25_rt != NULL) {
s = ax25_rt;
ax25_rt = ax25_rt->next;
@@ -498,5 +412,5 @@ void __exit ax25_rt_free(void)
kfree(s->digipeat);
kfree(s);
}
- write_unlock(&ax25_route_lock);
+ write_unlock_bh(&ax25_route_lock);
}
diff --git a/net/ax25/ax25_std_in.c b/net/ax25/ax25_std_in.c
index f6ed283e9de8..ba176196ae06 100644
--- a/net/ax25/ax25_std_in.c
+++ b/net/ax25/ax25_std_in.c
@@ -1,8 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*
* Copyright (C) Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk)
* Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
@@ -19,7 +16,6 @@
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
-#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
@@ -30,8 +26,7 @@
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/tcp_states.h>
-#include <asm/uaccess.h>
-#include <asm/system.h>
+#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
diff --git a/net/ax25/ax25_std_subr.c b/net/ax25/ax25_std_subr.c
index 2b3c801ae486..4c36f1342558 100644
--- a/net/ax25/ax25_std_subr.c
+++ b/net/ax25/ax25_std_subr.c
@@ -1,8 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*
* Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
*/
@@ -11,7 +8,6 @@
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
-#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
@@ -21,8 +17,7 @@
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <net/sock.h>
-#include <asm/uaccess.h>
-#include <asm/system.h>
+#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
diff --git a/net/ax25/ax25_std_timer.c b/net/ax25/ax25_std_timer.c
index a29c480a4dc1..b17da41210cb 100644
--- a/net/ax25/ax25_std_timer.c
+++ b/net/ax25/ax25_std_timer.c
@@ -1,8 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*
* Copyright (C) Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk)
* Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
@@ -14,7 +11,6 @@
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
-#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
@@ -25,21 +21,21 @@
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/tcp_states.h>
-#include <asm/uaccess.h>
-#include <asm/system.h>
+#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
void ax25_std_heartbeat_expiry(ax25_cb *ax25)
{
- struct sock *sk=ax25->sk;
-
+ struct sock *sk = ax25->sk;
+
if (sk)
bh_lock_sock(sk);
switch (ax25->state) {
case AX25_STATE_0:
+ case AX25_STATE_2:
/* Magic here: If we listen() and a new link dies before it
is accepted() it isn't 'dead' so doesn't get removed. */
if (!sk || sock_flag(sk, SOCK_DESTROY) ||
@@ -49,6 +45,7 @@ void ax25_std_heartbeat_expiry(ax25_cb *ax25)
sock_hold(sk);
ax25_destroy_socket(ax25);
bh_unlock_sock(sk);
+ /* Ungrab socket and destroy it */
sock_put(sk);
} else
ax25_destroy_socket(ax25);
@@ -63,7 +60,7 @@ void ax25_std_heartbeat_expiry(ax25_cb *ax25)
*/
if (sk != NULL) {
if (atomic_read(&sk->sk_rmem_alloc) <
- (sk->sk_rcvbuf / 2) &&
+ (sk->sk_rcvbuf >> 1) &&
(ax25->condition & AX25_COND_OWN_RX_BUSY)) {
ax25->condition &= ~AX25_COND_OWN_RX_BUSY;
ax25->condition &= ~AX25_COND_ACK_PENDING;
@@ -146,7 +143,8 @@ void ax25_std_t1timer_expiry(ax25_cb *ax25)
case AX25_STATE_2:
if (ax25->n2count == ax25->n2) {
ax25_send_control(ax25, AX25_DISC, AX25_POLLON, AX25_COMMAND);
- ax25_disconnect(ax25, ETIMEDOUT);
+ if (!sock_flag(ax25->sk, SOCK_DESTROY))
+ ax25_disconnect(ax25, ETIMEDOUT);
return;
} else {
ax25->n2count++;
diff --git a/net/ax25/ax25_subr.c b/net/ax25/ax25_subr.c
index c41dbe5fadee..bff4b203a893 100644
--- a/net/ax25/ax25_subr.c
+++ b/net/ax25/ax25_subr.c
@@ -1,8 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*
* Copyright (C) Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk)
* Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
@@ -14,19 +11,18 @@
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
-#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
+#include <linux/slab.h>
#include <net/ax25.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/tcp_states.h>
-#include <asm/uaccess.h>
-#include <asm/system.h>
+#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
@@ -56,7 +52,7 @@ void ax25_frames_acked(ax25_cb *ax25, unsigned short nr)
*/
if (ax25->va != nr) {
while (skb_peek(&ax25->ack_queue) != NULL && ax25->va != nr) {
- skb = skb_dequeue(&ax25->ack_queue);
+ skb = skb_dequeue(&ax25->ack_queue);
kfree_skb(skb);
ax25->va = (ax25->va + 1) % ax25->modulus;
}
@@ -65,20 +61,15 @@ void ax25_frames_acked(ax25_cb *ax25, unsigned short nr)
void ax25_requeue_frames(ax25_cb *ax25)
{
- struct sk_buff *skb, *skb_prev = NULL;
+ struct sk_buff *skb;
/*
* Requeue all the un-ack-ed frames on the output queue to be picked
* up by ax25_kick called from the timer. This arrangement handles the
* possibility of an empty output queue.
*/
- while ((skb = skb_dequeue(&ax25->ack_queue)) != NULL) {
- if (skb_prev == NULL)
- skb_queue_head(&ax25->write_queue, skb);
- else
- skb_append(skb_prev, skb, &ax25->write_queue);
- skb_prev = skb;
- }
+ while ((skb = skb_dequeue_tail(&ax25->ack_queue)) != NULL)
+ skb_queue_head(&ax25->write_queue, skb);
}
/*
@@ -163,7 +154,7 @@ void ax25_send_control(ax25_cb *ax25, int frametype, int poll_bit, int type)
skb_reserve(skb, ax25->ax25_dev->dev->hard_header_len);
- skb->nh.raw = skb->data;
+ skb_reset_network_header(skb);
/* Assume a response - address structure for DTE */
if (ax25->modulus == AX25_MODULUS) {
@@ -206,7 +197,7 @@ void ax25_return_dm(struct net_device *dev, ax25_address *src, ax25_address *des
return; /* Next SABM will get DM'd */
skb_reserve(skb, dev->hard_header_len);
- skb->nh.raw = skb->data;
+ skb_reset_network_header(skb);
ax25_digi_invert(digi, &retdigi);
@@ -270,16 +261,27 @@ void ax25_disconnect(ax25_cb *ax25, int reason)
{
ax25_clear_queues(ax25);
- ax25_stop_t1timer(ax25);
- ax25_stop_t2timer(ax25);
- ax25_stop_t3timer(ax25);
- ax25_stop_idletimer(ax25);
+ if (reason == ENETUNREACH) {
+ timer_delete_sync(&ax25->timer);
+ timer_delete_sync(&ax25->t1timer);
+ timer_delete_sync(&ax25->t2timer);
+ timer_delete_sync(&ax25->t3timer);
+ timer_delete_sync(&ax25->idletimer);
+ } else {
+ if (ax25->sk && !sock_flag(ax25->sk, SOCK_DESTROY))
+ ax25_stop_heartbeat(ax25);
+ ax25_stop_t1timer(ax25);
+ ax25_stop_t2timer(ax25);
+ ax25_stop_t3timer(ax25);
+ ax25_stop_idletimer(ax25);
+ }
ax25->state = AX25_STATE_0;
ax25_link_failed(ax25, reason);
if (ax25->sk != NULL) {
+ local_bh_disable();
bh_lock_sock(ax25->sk);
ax25->sk->sk_state = TCP_CLOSE;
ax25->sk->sk_err = reason;
@@ -289,5 +291,6 @@ void ax25_disconnect(ax25_cb *ax25, int reason)
sock_set_flag(ax25->sk, SOCK_DEAD);
}
bh_unlock_sock(ax25->sk);
+ local_bh_enable();
}
}
diff --git a/net/ax25/ax25_timer.c b/net/ax25/ax25_timer.c
index 72594867fab6..a69bfbc8b679 100644
--- a/net/ax25/ax25_timer.c
+++ b/net/ax25/ax25_timer.c
@@ -1,8 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*
* Copyright (C) Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk)
* Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
@@ -28,100 +25,80 @@
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <net/sock.h>
-#include <asm/uaccess.h>
-#include <asm/system.h>
+#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
-static void ax25_heartbeat_expiry(unsigned long);
-static void ax25_t1timer_expiry(unsigned long);
-static void ax25_t2timer_expiry(unsigned long);
-static void ax25_t3timer_expiry(unsigned long);
-static void ax25_idletimer_expiry(unsigned long);
+static void ax25_heartbeat_expiry(struct timer_list *);
+static void ax25_t1timer_expiry(struct timer_list *);
+static void ax25_t2timer_expiry(struct timer_list *);
+static void ax25_t3timer_expiry(struct timer_list *);
+static void ax25_idletimer_expiry(struct timer_list *);
-void ax25_start_heartbeat(ax25_cb *ax25)
+void ax25_setup_timers(ax25_cb *ax25)
{
- del_timer(&ax25->timer);
-
- ax25->timer.data = (unsigned long)ax25;
- ax25->timer.function = &ax25_heartbeat_expiry;
- ax25->timer.expires = jiffies + 5 * HZ;
+ timer_setup(&ax25->timer, ax25_heartbeat_expiry, 0);
+ timer_setup(&ax25->t1timer, ax25_t1timer_expiry, 0);
+ timer_setup(&ax25->t2timer, ax25_t2timer_expiry, 0);
+ timer_setup(&ax25->t3timer, ax25_t3timer_expiry, 0);
+ timer_setup(&ax25->idletimer, ax25_idletimer_expiry, 0);
+}
- add_timer(&ax25->timer);
+void ax25_start_heartbeat(ax25_cb *ax25)
+{
+ mod_timer(&ax25->timer, jiffies + 5 * HZ);
}
void ax25_start_t1timer(ax25_cb *ax25)
{
- del_timer(&ax25->t1timer);
-
- ax25->t1timer.data = (unsigned long)ax25;
- ax25->t1timer.function = &ax25_t1timer_expiry;
- ax25->t1timer.expires = jiffies + ax25->t1;
-
- add_timer(&ax25->t1timer);
+ mod_timer(&ax25->t1timer, jiffies + ax25->t1);
}
void ax25_start_t2timer(ax25_cb *ax25)
{
- del_timer(&ax25->t2timer);
-
- ax25->t2timer.data = (unsigned long)ax25;
- ax25->t2timer.function = &ax25_t2timer_expiry;
- ax25->t2timer.expires = jiffies + ax25->t2;
-
- add_timer(&ax25->t2timer);
+ mod_timer(&ax25->t2timer, jiffies + ax25->t2);
}
void ax25_start_t3timer(ax25_cb *ax25)
{
- del_timer(&ax25->t3timer);
-
- if (ax25->t3 > 0) {
- ax25->t3timer.data = (unsigned long)ax25;
- ax25->t3timer.function = &ax25_t3timer_expiry;
- ax25->t3timer.expires = jiffies + ax25->t3;
-
- add_timer(&ax25->t3timer);
- }
+ if (ax25->t3 > 0)
+ mod_timer(&ax25->t3timer, jiffies + ax25->t3);
+ else
+ timer_delete(&ax25->t3timer);
}
void ax25_start_idletimer(ax25_cb *ax25)
{
- del_timer(&ax25->idletimer);
-
- if (ax25->idle > 0) {
- ax25->idletimer.data = (unsigned long)ax25;
- ax25->idletimer.function = &ax25_idletimer_expiry;
- ax25->idletimer.expires = jiffies + ax25->idle;
-
- add_timer(&ax25->idletimer);
- }
+ if (ax25->idle > 0)
+ mod_timer(&ax25->idletimer, jiffies + ax25->idle);
+ else
+ timer_delete(&ax25->idletimer);
}
void ax25_stop_heartbeat(ax25_cb *ax25)
{
- del_timer(&ax25->timer);
+ timer_delete(&ax25->timer);
}
void ax25_stop_t1timer(ax25_cb *ax25)
{
- del_timer(&ax25->t1timer);
+ timer_delete(&ax25->t1timer);
}
void ax25_stop_t2timer(ax25_cb *ax25)
{
- del_timer(&ax25->t2timer);
+ timer_delete(&ax25->t2timer);
}
void ax25_stop_t3timer(ax25_cb *ax25)
{
- del_timer(&ax25->t3timer);
+ timer_delete(&ax25->t3timer);
}
void ax25_stop_idletimer(ax25_cb *ax25)
{
- del_timer(&ax25->idletimer);
+ timer_delete(&ax25->idletimer);
}
int ax25_t1timer_running(ax25_cb *ax25)
@@ -131,18 +108,20 @@ int ax25_t1timer_running(ax25_cb *ax25)
unsigned long ax25_display_timer(struct timer_list *timer)
{
+ long delta = timer->expires - jiffies;
+
if (!timer_pending(timer))
return 0;
- return timer->expires - jiffies;
+ return max(0L, delta);
}
EXPORT_SYMBOL(ax25_display_timer);
-static void ax25_heartbeat_expiry(unsigned long param)
+static void ax25_heartbeat_expiry(struct timer_list *t)
{
int proto = AX25_PROTO_STD_SIMPLEX;
- ax25_cb *ax25 = (ax25_cb *)param;
+ ax25_cb *ax25 = timer_container_of(ax25, t, timer);
if (ax25->ax25_dev)
proto = ax25->ax25_dev->values[AX25_VALUES_PROTOCOL];
@@ -164,9 +143,9 @@ static void ax25_heartbeat_expiry(unsigned long param)
}
}
-static void ax25_t1timer_expiry(unsigned long param)
+static void ax25_t1timer_expiry(struct timer_list *t)
{
- ax25_cb *ax25 = (ax25_cb *)param;
+ ax25_cb *ax25 = timer_container_of(ax25, t, t1timer);
switch (ax25->ax25_dev->values[AX25_VALUES_PROTOCOL]) {
case AX25_PROTO_STD_SIMPLEX:
@@ -183,9 +162,9 @@ static void ax25_t1timer_expiry(unsigned long param)
}
}
-static void ax25_t2timer_expiry(unsigned long param)
+static void ax25_t2timer_expiry(struct timer_list *t)
{
- ax25_cb *ax25 = (ax25_cb *)param;
+ ax25_cb *ax25 = timer_container_of(ax25, t, t2timer);
switch (ax25->ax25_dev->values[AX25_VALUES_PROTOCOL]) {
case AX25_PROTO_STD_SIMPLEX:
@@ -202,9 +181,9 @@ static void ax25_t2timer_expiry(unsigned long param)
}
}
-static void ax25_t3timer_expiry(unsigned long param)
+static void ax25_t3timer_expiry(struct timer_list *t)
{
- ax25_cb *ax25 = (ax25_cb *)param;
+ ax25_cb *ax25 = timer_container_of(ax25, t, t3timer);
switch (ax25->ax25_dev->values[AX25_VALUES_PROTOCOL]) {
case AX25_PROTO_STD_SIMPLEX:
@@ -223,9 +202,9 @@ static void ax25_t3timer_expiry(unsigned long param)
}
}
-static void ax25_idletimer_expiry(unsigned long param)
+static void ax25_idletimer_expiry(struct timer_list *t)
{
- ax25_cb *ax25 = (ax25_cb *)param;
+ ax25_cb *ax25 = timer_container_of(ax25, t, idletimer);
switch (ax25->ax25_dev->values[AX25_VALUES_PROTOCOL]) {
case AX25_PROTO_STD_SIMPLEX:
diff --git a/net/ax25/ax25_uid.c b/net/ax25/ax25_uid.c
index 5e9a81e8b214..241e4680ecb1 100644
--- a/net/ax25/ax25_uid.c
+++ b/net/ax25/ax25_uid.c
@@ -1,8 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*
* Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
*/
@@ -13,20 +10,19 @@
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
-#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/spinlock.h>
+#include <linux/slab.h>
#include <net/ax25.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/skbuff.h>
#include <net/sock.h>
-#include <asm/uaccess.h>
-#include <asm/system.h>
+#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
@@ -35,8 +31,8 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/stat.h>
-#include <linux/netfilter.h>
#include <linux/sysctl.h>
+#include <linux/export.h>
#include <net/ip.h>
#include <net/arp.h>
@@ -44,21 +40,20 @@
* Callsign/UID mapper. This is in kernel space for security on multi-amateur machines.
*/
-HLIST_HEAD(ax25_uid_list);
+static HLIST_HEAD(ax25_uid_list);
static DEFINE_RWLOCK(ax25_uid_lock);
-int ax25_uid_policy = 0;
+int ax25_uid_policy;
EXPORT_SYMBOL(ax25_uid_policy);
-ax25_uid_assoc *ax25_findbyuid(uid_t uid)
+ax25_uid_assoc *ax25_findbyuid(kuid_t uid)
{
ax25_uid_assoc *ax25_uid, *res = NULL;
- struct hlist_node *node;
read_lock(&ax25_uid_lock);
- ax25_uid_for_each(ax25_uid, node, &ax25_uid_list) {
- if (ax25_uid->uid == uid) {
+ ax25_uid_for_each(ax25_uid, &ax25_uid_list) {
+ if (uid_eq(ax25_uid->uid, uid)) {
ax25_uid_hold(ax25_uid);
res = ax25_uid;
break;
@@ -74,7 +69,6 @@ EXPORT_SYMBOL(ax25_findbyuid);
int ax25_uid_ioctl(int cmd, struct sockaddr_ax25 *sax)
{
ax25_uid_assoc *ax25_uid;
- struct hlist_node *node;
ax25_uid_assoc *user;
unsigned long res;
@@ -82,9 +76,9 @@ int ax25_uid_ioctl(int cmd, struct sockaddr_ax25 *sax)
case SIOCAX25GETUID:
res = -ENOENT;
read_lock(&ax25_uid_lock);
- ax25_uid_for_each(ax25_uid, node, &ax25_uid_list) {
+ ax25_uid_for_each(ax25_uid, &ax25_uid_list) {
if (ax25cmp(&sax->sax25_call, &ax25_uid->call) == 0) {
- res = ax25_uid->uid;
+ res = from_kuid_munged(current_user_ns(), ax25_uid->uid);
break;
}
}
@@ -93,9 +87,14 @@ int ax25_uid_ioctl(int cmd, struct sockaddr_ax25 *sax)
return res;
case SIOCAX25ADDUID:
+ {
+ kuid_t sax25_kuid;
if (!capable(CAP_NET_ADMIN))
return -EPERM;
- user = ax25_findbyuid(sax->sax25_uid);
+ sax25_kuid = make_kuid(current_user_ns(), sax->sax25_uid);
+ if (!uid_valid(sax25_kuid))
+ return -EINVAL;
+ user = ax25_findbyuid(sax25_kuid);
if (user) {
ax25_uid_put(user);
return -EEXIST;
@@ -105,8 +104,8 @@ int ax25_uid_ioctl(int cmd, struct sockaddr_ax25 *sax)
if ((ax25_uid = kmalloc(sizeof(*ax25_uid), GFP_KERNEL)) == NULL)
return -ENOMEM;
- atomic_set(&ax25_uid->refcount, 1);
- ax25_uid->uid = sax->sax25_uid;
+ refcount_set(&ax25_uid->refcount, 1);
+ ax25_uid->uid = sax25_kuid;
ax25_uid->call = sax->sax25_call;
write_lock(&ax25_uid_lock);
@@ -114,14 +113,14 @@ int ax25_uid_ioctl(int cmd, struct sockaddr_ax25 *sax)
write_unlock(&ax25_uid_lock);
return 0;
-
+ }
case SIOCAX25DELUID:
if (!capable(CAP_NET_ADMIN))
return -EPERM;
ax25_uid = NULL;
write_lock(&ax25_uid_lock);
- ax25_uid_for_each(ax25_uid, node, &ax25_uid_list) {
+ ax25_uid_for_each(ax25_uid, &ax25_uid_list) {
if (ax25cmp(&sax->sax25_call, &ax25_uid->call) == 0)
break;
}
@@ -145,29 +144,19 @@ int ax25_uid_ioctl(int cmd, struct sockaddr_ax25 *sax)
#ifdef CONFIG_PROC_FS
static void *ax25_uid_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(ax25_uid_lock)
{
- struct ax25_uid_assoc *pt;
- struct hlist_node *node;
- int i = 0;
-
read_lock(&ax25_uid_lock);
- ax25_uid_for_each(pt, node, &ax25_uid_list) {
- if (i == *pos)
- return pt;
- ++i;
- }
- return NULL;
+ return seq_hlist_start_head(&ax25_uid_list, *pos);
}
static void *ax25_uid_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
- ++*pos;
-
- return hlist_entry(((ax25_uid_assoc *)v)->uid_node.next,
- ax25_uid_assoc, uid_node);
+ return seq_hlist_next(v, &ax25_uid_list, pos);
}
static void ax25_uid_seq_stop(struct seq_file *seq, void *v)
+ __releases(ax25_uid_lock)
{
read_unlock(&ax25_uid_lock);
}
@@ -179,33 +168,22 @@ static int ax25_uid_seq_show(struct seq_file *seq, void *v)
if (v == SEQ_START_TOKEN)
seq_printf(seq, "Policy: %d\n", ax25_uid_policy);
else {
- struct ax25_uid_assoc *pt = v;
+ struct ax25_uid_assoc *pt;
- seq_printf(seq, "%6d %s\n", pt->uid, ax2asc(buf, &pt->call));
+ pt = hlist_entry(v, struct ax25_uid_assoc, uid_node);
+ seq_printf(seq, "%6d %s\n",
+ from_kuid_munged(seq_user_ns(seq), pt->uid),
+ ax2asc(buf, &pt->call));
}
return 0;
}
-static struct seq_operations ax25_uid_seqops = {
+const struct seq_operations ax25_uid_seqops = {
.start = ax25_uid_seq_start,
.next = ax25_uid_seq_next,
.stop = ax25_uid_seq_stop,
.show = ax25_uid_seq_show,
};
-
-static int ax25_uid_info_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &ax25_uid_seqops);
-}
-
-struct file_operations ax25_uid_fops = {
- .owner = THIS_MODULE,
- .open = ax25_uid_info_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
#endif
/*
@@ -214,12 +192,13 @@ struct file_operations ax25_uid_fops = {
void __exit ax25_uid_free(void)
{
ax25_uid_assoc *ax25_uid;
- struct hlist_node *node;
write_lock(&ax25_uid_lock);
- ax25_uid_for_each(ax25_uid, node, &ax25_uid_list) {
+again:
+ ax25_uid_for_each(ax25_uid, &ax25_uid_list) {
hlist_del_init(&ax25_uid->uid_node);
ax25_uid_put(ax25_uid);
+ goto again;
}
write_unlock(&ax25_uid_lock);
}
diff --git a/net/ax25/sysctl_net_ax25.c b/net/ax25/sysctl_net_ax25.c
index d23a27f25d2f..68753aa30334 100644
--- a/net/ax25/sysctl_net_ax25.c
+++ b/net/ax25/sysctl_net_ax25.c
@@ -1,12 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*
* Copyright (C) 1996 Mike Shaver (shaver@zeroknowledge.com)
*/
#include <linux/mm.h>
+#include <linux/slab.h>
#include <linux/sysctl.h>
#include <linux/spinlock.h>
#include <net/ax25.h>
@@ -24,237 +22,160 @@ static int min_idle[1], max_idle[] = {65535000};
static int min_n2[] = {1}, max_n2[] = {31};
static int min_paclen[] = {1}, max_paclen[] = {512};
static int min_proto[1], max_proto[] = { AX25_PROTO_MAX };
+#ifdef CONFIG_AX25_DAMA_SLAVE
static int min_ds_timeout[1], max_ds_timeout[] = {65535000};
+#endif
-static struct ctl_table_header *ax25_table_header;
-
-static ctl_table *ax25_table;
-static int ax25_table_size;
-
-static ctl_table ax25_dir_table[] = {
- {
- .ctl_name = NET_AX25,
- .procname = "ax25",
- .mode = 0555,
- },
- { .ctl_name = 0 }
-};
-
-static ctl_table ax25_root_table[] = {
- {
- .ctl_name = CTL_NET,
- .procname = "net",
- .mode = 0555,
- .child = ax25_dir_table
- },
- { .ctl_name = 0 }
-};
-
-static const ctl_table ax25_param_table[] = {
+static const struct ctl_table ax25_param_table[] = {
{
- .ctl_name = NET_AX25_IP_DEFAULT_MODE,
.procname = "ip_default_mode",
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec_minmax,
- .strategy = &sysctl_intvec,
+ .proc_handler = proc_dointvec_minmax,
.extra1 = &min_ipdefmode,
.extra2 = &max_ipdefmode
},
{
- .ctl_name = NET_AX25_DEFAULT_MODE,
.procname = "ax25_default_mode",
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec_minmax,
- .strategy = &sysctl_intvec,
+ .proc_handler = proc_dointvec_minmax,
.extra1 = &min_axdefmode,
.extra2 = &max_axdefmode
},
{
- .ctl_name = NET_AX25_BACKOFF_TYPE,
.procname = "backoff_type",
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec_minmax,
- .strategy = &sysctl_intvec,
+ .proc_handler = proc_dointvec_minmax,
.extra1 = &min_backoff,
.extra2 = &max_backoff
},
{
- .ctl_name = NET_AX25_CONNECT_MODE,
.procname = "connect_mode",
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec_minmax,
- .strategy = &sysctl_intvec,
+ .proc_handler = proc_dointvec_minmax,
.extra1 = &min_conmode,
.extra2 = &max_conmode
},
{
- .ctl_name = NET_AX25_STANDARD_WINDOW,
.procname = "standard_window_size",
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec_minmax,
- .strategy = &sysctl_intvec,
+ .proc_handler = proc_dointvec_minmax,
.extra1 = &min_window,
.extra2 = &max_window
},
{
- .ctl_name = NET_AX25_EXTENDED_WINDOW,
.procname = "extended_window_size",
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec_minmax,
- .strategy = &sysctl_intvec,
+ .proc_handler = proc_dointvec_minmax,
.extra1 = &min_ewindow,
.extra2 = &max_ewindow
},
{
- .ctl_name = NET_AX25_T1_TIMEOUT,
.procname = "t1_timeout",
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec_minmax,
- .strategy = &sysctl_intvec,
+ .proc_handler = proc_dointvec_minmax,
.extra1 = &min_t1,
.extra2 = &max_t1
},
{
- .ctl_name = NET_AX25_T2_TIMEOUT,
.procname = "t2_timeout",
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec_minmax,
- .strategy = &sysctl_intvec,
+ .proc_handler = proc_dointvec_minmax,
.extra1 = &min_t2,
.extra2 = &max_t2
},
{
- .ctl_name = NET_AX25_T3_TIMEOUT,
.procname = "t3_timeout",
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec_minmax,
- .strategy = &sysctl_intvec,
+ .proc_handler = proc_dointvec_minmax,
.extra1 = &min_t3,
.extra2 = &max_t3
},
{
- .ctl_name = NET_AX25_IDLE_TIMEOUT,
.procname = "idle_timeout",
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec_minmax,
- .strategy = &sysctl_intvec,
+ .proc_handler = proc_dointvec_minmax,
.extra1 = &min_idle,
.extra2 = &max_idle
},
{
- .ctl_name = NET_AX25_N2,
.procname = "maximum_retry_count",
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec_minmax,
- .strategy = &sysctl_intvec,
+ .proc_handler = proc_dointvec_minmax,
.extra1 = &min_n2,
.extra2 = &max_n2
},
{
- .ctl_name = NET_AX25_PACLEN,
.procname = "maximum_packet_length",
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec_minmax,
- .strategy = &sysctl_intvec,
+ .proc_handler = proc_dointvec_minmax,
.extra1 = &min_paclen,
.extra2 = &max_paclen
},
{
- .ctl_name = NET_AX25_PROTOCOL,
.procname = "protocol",
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec_minmax,
- .strategy = &sysctl_intvec,
+ .proc_handler = proc_dointvec_minmax,
.extra1 = &min_proto,
.extra2 = &max_proto
},
+#ifdef CONFIG_AX25_DAMA_SLAVE
{
- .ctl_name = NET_AX25_DAMA_SLAVE_TIMEOUT,
.procname = "dama_slave_timeout",
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec_minmax,
- .strategy = &sysctl_intvec,
+ .proc_handler = proc_dointvec_minmax,
.extra1 = &min_ds_timeout,
.extra2 = &max_ds_timeout
},
- { .ctl_name = 0 } /* that's all, folks! */
+#endif
};
-void ax25_register_sysctl(void)
+int ax25_register_dev_sysctl(ax25_dev *ax25_dev)
{
- ax25_dev *ax25_dev;
- int n, k;
-
- spin_lock_bh(&ax25_dev_lock);
- for (ax25_table_size = sizeof(ctl_table), ax25_dev = ax25_dev_list; ax25_dev != NULL; ax25_dev = ax25_dev->next)
- ax25_table_size += sizeof(ctl_table);
-
- if ((ax25_table = kzalloc(ax25_table_size, GFP_ATOMIC)) == NULL) {
- spin_unlock_bh(&ax25_dev_lock);
- return;
+ char path[sizeof("net/ax25/") + IFNAMSIZ];
+ int k;
+ struct ctl_table *table;
+
+ table = kmemdup(ax25_param_table, sizeof(ax25_param_table), GFP_KERNEL);
+ if (!table)
+ return -ENOMEM;
+
+ BUILD_BUG_ON(ARRAY_SIZE(ax25_param_table) != AX25_MAX_VALUES);
+ for (k = 0; k < AX25_MAX_VALUES; k++)
+ table[k].data = &ax25_dev->values[k];
+
+ snprintf(path, sizeof(path), "net/ax25/%s", ax25_dev->dev->name);
+ ax25_dev->sysheader = register_net_sysctl_sz(&init_net, path, table,
+ ARRAY_SIZE(ax25_param_table));
+ if (!ax25_dev->sysheader) {
+ kfree(table);
+ return -ENOMEM;
}
-
- for (n = 0, ax25_dev = ax25_dev_list; ax25_dev != NULL; ax25_dev = ax25_dev->next) {
- struct ctl_table *child = kmemdup(ax25_param_table,
- sizeof(ax25_param_table),
- GFP_ATOMIC);
- if (!child) {
- while (n--)
- kfree(ax25_table[n].child);
- kfree(ax25_table);
- spin_unlock_bh(&ax25_dev_lock);
- return;
- }
- ax25_table[n].child = ax25_dev->systable = child;
- ax25_table[n].ctl_name = n + 1;
- ax25_table[n].procname = ax25_dev->dev->name;
- ax25_table[n].mode = 0555;
-
-#ifndef CONFIG_AX25_DAMA_SLAVE
- /*
- * We do not wish to have a representation of this parameter
- * in /proc/sys/ when configured *not* to include the
- * AX.25 DAMA slave code, do we?
- */
-
- child[AX25_VALUES_DS_TIMEOUT].procname = NULL;
-#endif
-
- child[AX25_MAX_VALUES].ctl_name = 0; /* just in case... */
-
- for (k = 0; k < AX25_MAX_VALUES; k++)
- child[k].data = &ax25_dev->values[k];
-
- n++;
- }
- spin_unlock_bh(&ax25_dev_lock);
-
- ax25_dir_table[0].child = ax25_table;
-
- ax25_table_header = register_sysctl_table(ax25_root_table, 1);
+ return 0;
}
-void ax25_unregister_sysctl(void)
+void ax25_unregister_dev_sysctl(ax25_dev *ax25_dev)
{
- ctl_table *p;
- unregister_sysctl_table(ax25_table_header);
-
- ax25_dir_table[0].child = NULL;
- for (p = ax25_table; p->ctl_name; p++)
- kfree(p->child);
- kfree(ax25_table);
+ struct ctl_table_header *header = ax25_dev->sysheader;
+ const struct ctl_table *table;
+
+ if (header) {
+ ax25_dev->sysheader = NULL;
+ table = header->ctl_table_arg;
+ unregister_net_sysctl_table(header);
+ kfree(table);
+ }
}
diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig
new file mode 100644
index 000000000000..58c408b7a7d9
--- /dev/null
+++ b/net/batman-adv/Kconfig
@@ -0,0 +1,84 @@
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (C) B.A.T.M.A.N. contributors:
+#
+# Marek Lindner, Simon Wunderlich
+
+#
+# B.A.T.M.A.N meshing protocol
+#
+
+config BATMAN_ADV
+ tristate "B.A.T.M.A.N. Advanced Meshing Protocol"
+ select CRC32
+ help
+ B.A.T.M.A.N. (better approach to mobile ad-hoc networking) is
+ a routing protocol for multi-hop ad-hoc mesh networks. The
+ networks may be wired or wireless. See
+ https://www.open-mesh.org/ for more information and user space
+ tools.
+
+config BATMAN_ADV_BATMAN_V
+ bool "B.A.T.M.A.N. V protocol"
+ depends on BATMAN_ADV && !(CFG80211=m && BATMAN_ADV=y)
+ default y
+ help
+ This option enables the B.A.T.M.A.N. V protocol, the successor
+ of the currently used B.A.T.M.A.N. IV protocol. The main
+ changes include splitting of the OGM protocol into a neighbor
+ discovery protocol (Echo Location Protocol, ELP) and a new OGM
+ Protocol OGMv2 for flooding protocol information through the
+ network, as well as a throughput based metric.
+ B.A.T.M.A.N. V is currently considered experimental and not
+ compatible to B.A.T.M.A.N. IV networks.
+
+config BATMAN_ADV_BLA
+ bool "Bridge Loop Avoidance"
+ depends on BATMAN_ADV && INET
+ select CRC16
+ select NET_CRC32C
+ default y
+ help
+ This option enables BLA (Bridge Loop Avoidance), a mechanism
+ to avoid Ethernet frames looping when mesh nodes are connected
+ to both the same LAN and the same mesh. If you will never use
+ more than one mesh node in the same LAN, you can safely remove
+ this feature and save some space.
+
+config BATMAN_ADV_DAT
+ bool "Distributed ARP Table"
+ depends on BATMAN_ADV && INET
+ default y
+ help
+ This option enables DAT (Distributed ARP Table), a DHT based
+ mechanism that increases ARP reliability on sparse wireless
+ mesh networks. If you think that your network does not need
+ this option you can safely remove it and save some space.
+
+config BATMAN_ADV_MCAST
+ bool "Multicast optimisation"
+ depends on BATMAN_ADV && INET && !(BRIDGE=m && BATMAN_ADV=y)
+ default y
+ help
+ This option enables the multicast optimisation which aims to
+ reduce the air overhead while improving the reliability of
+ multicast messages.
+
+config BATMAN_ADV_DEBUG
+ bool "B.A.T.M.A.N. debugging"
+ depends on BATMAN_ADV
+ help
+ This is an option for use by developers; most people should
+ say N here. This enables compilation of support for
+ outputting debugging information to the tracing buffer. The output is
+ controlled via the batadv netdev specific log_level setting.
+
+config BATMAN_ADV_TRACING
+ bool "B.A.T.M.A.N. tracing support"
+ depends on BATMAN_ADV
+ depends on EVENT_TRACING
+ help
+ This is an option for use by developers; most people should
+ say N here. Select this option to gather traces like the debug
+ messages using the generic tracing infrastructure of the kernel.
+ BATMAN_ADV_DEBUG must also be selected to get trace events for
+ batadv_dbg.
diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile
new file mode 100644
index 000000000000..d3c4d4143c14
--- /dev/null
+++ b/net/batman-adv/Makefile
@@ -0,0 +1,34 @@
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (C) B.A.T.M.A.N. contributors:
+#
+# Marek Lindner, Simon Wunderlich
+
+obj-$(CONFIG_BATMAN_ADV) += batman-adv.o
+batman-adv-y += bat_algo.o
+batman-adv-y += bat_iv_ogm.o
+batman-adv-$(CONFIG_BATMAN_ADV_BATMAN_V) += bat_v.o
+batman-adv-$(CONFIG_BATMAN_ADV_BATMAN_V) += bat_v_elp.o
+batman-adv-$(CONFIG_BATMAN_ADV_BATMAN_V) += bat_v_ogm.o
+batman-adv-y += bitarray.o
+batman-adv-$(CONFIG_BATMAN_ADV_BLA) += bridge_loop_avoidance.o
+batman-adv-$(CONFIG_BATMAN_ADV_DAT) += distributed-arp-table.o
+batman-adv-y += fragmentation.o
+batman-adv-y += gateway_client.o
+batman-adv-y += gateway_common.o
+batman-adv-y += hard-interface.o
+batman-adv-y += hash.o
+batman-adv-$(CONFIG_BATMAN_ADV_DEBUG) += log.o
+batman-adv-y += main.o
+batman-adv-y += mesh-interface.o
+batman-adv-$(CONFIG_BATMAN_ADV_MCAST) += multicast.o
+batman-adv-$(CONFIG_BATMAN_ADV_MCAST) += multicast_forw.o
+batman-adv-y += netlink.o
+batman-adv-y += originator.o
+batman-adv-y += routing.o
+batman-adv-y += send.o
+batman-adv-$(CONFIG_BATMAN_ADV_TRACING) += trace.o
+batman-adv-y += tp_meter.o
+batman-adv-y += translation-table.o
+batman-adv-y += tvlv.o
+
+CFLAGS_trace.o := -I$(src)
diff --git a/net/batman-adv/bat_algo.c b/net/batman-adv/bat_algo.c
new file mode 100644
index 000000000000..49e5861b58ec
--- /dev/null
+++ b/net/batman-adv/bat_algo.c
@@ -0,0 +1,210 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner, Simon Wunderlich
+ */
+
+#include "main.h"
+
+#include <linux/errno.h>
+#include <linux/list.h>
+#include <linux/moduleparam.h>
+#include <linux/netlink.h>
+#include <linux/printk.h>
+#include <linux/skbuff.h>
+#include <linux/stddef.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <net/genetlink.h>
+#include <net/netlink.h>
+#include <uapi/linux/batman_adv.h>
+
+#include "bat_algo.h"
+#include "netlink.h"
+
+char batadv_routing_algo[20] = "BATMAN_IV";
+static struct hlist_head batadv_algo_list;
+
+/**
+ * batadv_algo_init() - Initialize batman-adv algorithm management data
+ * structures
+ */
+void batadv_algo_init(void)
+{
+ INIT_HLIST_HEAD(&batadv_algo_list);
+}
+
+/**
+ * batadv_algo_get() - Search for algorithm with specific name
+ * @name: algorithm name to find
+ *
+ * Return: Pointer to batadv_algo_ops on success, NULL otherwise
+ */
+struct batadv_algo_ops *batadv_algo_get(const char *name)
+{
+ struct batadv_algo_ops *bat_algo_ops = NULL, *bat_algo_ops_tmp;
+
+ hlist_for_each_entry(bat_algo_ops_tmp, &batadv_algo_list, list) {
+ if (strcmp(bat_algo_ops_tmp->name, name) != 0)
+ continue;
+
+ bat_algo_ops = bat_algo_ops_tmp;
+ break;
+ }
+
+ return bat_algo_ops;
+}
+
+/**
+ * batadv_algo_register() - Register callbacks for a mesh algorithm
+ * @bat_algo_ops: mesh algorithm callbacks to add
+ *
+ * Return: 0 on success or negative error number in case of failure
+ */
+int batadv_algo_register(struct batadv_algo_ops *bat_algo_ops)
+{
+ struct batadv_algo_ops *bat_algo_ops_tmp;
+
+ bat_algo_ops_tmp = batadv_algo_get(bat_algo_ops->name);
+ if (bat_algo_ops_tmp) {
+ pr_info("Trying to register already registered routing algorithm: %s\n",
+ bat_algo_ops->name);
+ return -EEXIST;
+ }
+
+ /* all algorithms must implement all ops (for now) */
+ if (!bat_algo_ops->iface.enable ||
+ !bat_algo_ops->iface.disable ||
+ !bat_algo_ops->iface.update_mac ||
+ !bat_algo_ops->iface.primary_set ||
+ !bat_algo_ops->neigh.cmp ||
+ !bat_algo_ops->neigh.is_similar_or_better) {
+ pr_info("Routing algo '%s' does not implement required ops\n",
+ bat_algo_ops->name);
+ return -EINVAL;
+ }
+
+ INIT_HLIST_NODE(&bat_algo_ops->list);
+ hlist_add_head(&bat_algo_ops->list, &batadv_algo_list);
+
+ return 0;
+}
+
+/**
+ * batadv_algo_select() - Select algorithm of mesh interface
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @name: name of the algorithm to select
+ *
+ * The algorithm callbacks for the mesh interface will be set when the algorithm
+ * with the correct name was found. Any previous selected algorithm will not be
+ * deinitialized and the new selected algorithm will also not be initialized.
+ * It is therefore not allowed to call batadv_algo_select outside the creation
+ * function of the mesh interface.
+ *
+ * Return: 0 on success or negative error number in case of failure
+ */
+int batadv_algo_select(struct batadv_priv *bat_priv, const char *name)
+{
+ struct batadv_algo_ops *bat_algo_ops;
+
+ bat_algo_ops = batadv_algo_get(name);
+ if (!bat_algo_ops)
+ return -EINVAL;
+
+ bat_priv->algo_ops = bat_algo_ops;
+
+ return 0;
+}
+
+static int batadv_param_set_ra(const char *val, const struct kernel_param *kp)
+{
+ struct batadv_algo_ops *bat_algo_ops;
+ char *algo_name = (char *)val;
+ size_t name_len = strlen(algo_name);
+
+ if (name_len > 0 && algo_name[name_len - 1] == '\n')
+ algo_name[name_len - 1] = '\0';
+
+ bat_algo_ops = batadv_algo_get(algo_name);
+ if (!bat_algo_ops) {
+ pr_err("Routing algorithm '%s' is not supported\n", algo_name);
+ return -EINVAL;
+ }
+
+ return param_set_copystring(algo_name, kp);
+}
+
+static const struct kernel_param_ops batadv_param_ops_ra = {
+ .set = batadv_param_set_ra,
+ .get = param_get_string,
+};
+
+static struct kparam_string batadv_param_string_ra = {
+ .maxlen = sizeof(batadv_routing_algo),
+ .string = batadv_routing_algo,
+};
+
+module_param_cb(routing_algo, &batadv_param_ops_ra, &batadv_param_string_ra,
+ 0644);
+
+/**
+ * batadv_algo_dump_entry() - fill in information about one supported routing
+ * algorithm
+ * @msg: netlink message to be sent back
+ * @portid: Port to reply to
+ * @seq: Sequence number of message
+ * @bat_algo_ops: Algorithm to be dumped
+ *
+ * Return: Error number, or 0 on success
+ */
+static int batadv_algo_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
+ struct batadv_algo_ops *bat_algo_ops)
+{
+ void *hdr;
+
+ hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family,
+ NLM_F_MULTI, BATADV_CMD_GET_ROUTING_ALGOS);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (nla_put_string(msg, BATADV_ATTR_ALGO_NAME, bat_algo_ops->name))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+ nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+/**
+ * batadv_algo_dump() - fill in information about supported routing
+ * algorithms
+ * @msg: netlink message to be sent back
+ * @cb: Parameters to the netlink request
+ *
+ * Return: Length of reply message.
+ */
+int batadv_algo_dump(struct sk_buff *msg, struct netlink_callback *cb)
+{
+ int portid = NETLINK_CB(cb->skb).portid;
+ struct batadv_algo_ops *bat_algo_ops;
+ int skip = cb->args[0];
+ int i = 0;
+
+ hlist_for_each_entry(bat_algo_ops, &batadv_algo_list, list) {
+ if (i++ < skip)
+ continue;
+
+ if (batadv_algo_dump_entry(msg, portid, cb->nlh->nlmsg_seq,
+ bat_algo_ops)) {
+ i--;
+ break;
+ }
+ }
+
+ cb->args[0] = i;
+
+ return msg->len;
+}
diff --git a/net/batman-adv/bat_algo.h b/net/batman-adv/bat_algo.h
new file mode 100644
index 000000000000..7ce9abbdb4b4
--- /dev/null
+++ b/net/batman-adv/bat_algo.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner, Linus Lüssing
+ */
+
+#ifndef _NET_BATMAN_ADV_BAT_ALGO_H_
+#define _NET_BATMAN_ADV_BAT_ALGO_H_
+
+#include "main.h"
+
+#include <linux/netlink.h>
+#include <linux/skbuff.h>
+
+extern char batadv_routing_algo[];
+
+void batadv_algo_init(void);
+struct batadv_algo_ops *batadv_algo_get(const char *name);
+int batadv_algo_register(struct batadv_algo_ops *bat_algo_ops);
+int batadv_algo_select(struct batadv_priv *bat_priv, const char *name);
+int batadv_algo_dump(struct sk_buff *msg, struct netlink_callback *cb);
+
+#endif /* _NET_BATMAN_ADV_BAT_ALGO_H_ */
diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
new file mode 100644
index 000000000000..b75c2228e69a
--- /dev/null
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -0,0 +1,2545 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner, Simon Wunderlich
+ */
+
+#include "bat_iv_ogm.h"
+#include "main.h"
+
+#include <linux/atomic.h>
+#include <linux/bitmap.h>
+#include <linux/bitops.h>
+#include <linux/bug.h>
+#include <linux/byteorder/generic.h>
+#include <linux/cache.h>
+#include <linux/container_of.h>
+#include <linux/errno.h>
+#include <linux/etherdevice.h>
+#include <linux/gfp.h>
+#include <linux/if_ether.h>
+#include <linux/init.h>
+#include <linux/jiffies.h>
+#include <linux/kref.h>
+#include <linux/list.h>
+#include <linux/lockdep.h>
+#include <linux/minmax.h>
+#include <linux/mutex.h>
+#include <linux/netdevice.h>
+#include <linux/netlink.h>
+#include <linux/pkt_sched.h>
+#include <linux/printk.h>
+#include <linux/random.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/stddef.h>
+#include <linux/string.h>
+#include <linux/string_choices.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+#include <net/genetlink.h>
+#include <net/netlink.h>
+#include <uapi/linux/batadv_packet.h>
+#include <uapi/linux/batman_adv.h>
+
+#include "bat_algo.h"
+#include "bitarray.h"
+#include "gateway_client.h"
+#include "hard-interface.h"
+#include "hash.h"
+#include "log.h"
+#include "netlink.h"
+#include "originator.h"
+#include "routing.h"
+#include "send.h"
+#include "translation-table.h"
+#include "tvlv.h"
+
+static void batadv_iv_send_outstanding_bat_ogm_packet(struct work_struct *work);
+
+/**
+ * enum batadv_dup_status - duplicate status
+ */
+enum batadv_dup_status {
+ /** @BATADV_NO_DUP: the packet is no duplicate */
+ BATADV_NO_DUP = 0,
+
+ /**
+ * @BATADV_ORIG_DUP: OGM is a duplicate in the originator (but not for
+ * the neighbor)
+ */
+ BATADV_ORIG_DUP,
+
+ /** @BATADV_NEIGH_DUP: OGM is a duplicate for the neighbor */
+ BATADV_NEIGH_DUP,
+
+ /**
+ * @BATADV_PROTECTED: originator is currently protected (after reboot)
+ */
+ BATADV_PROTECTED,
+};
+
+/**
+ * batadv_ring_buffer_set() - update the ring buffer with the given value
+ * @lq_recv: pointer to the ring buffer
+ * @lq_index: index to store the value at
+ * @value: value to store in the ring buffer
+ */
+static void batadv_ring_buffer_set(u8 lq_recv[], u8 *lq_index, u8 value)
+{
+ lq_recv[*lq_index] = value;
+ *lq_index = (*lq_index + 1) % BATADV_TQ_GLOBAL_WINDOW_SIZE;
+}
+
+/**
+ * batadv_ring_buffer_avg() - compute the average of all non-zero values stored
+ * in the given ring buffer
+ * @lq_recv: pointer to the ring buffer
+ *
+ * Return: computed average value.
+ */
+static u8 batadv_ring_buffer_avg(const u8 lq_recv[])
+{
+ const u8 *ptr;
+ u16 count = 0;
+ u16 i = 0;
+ u16 sum = 0;
+
+ ptr = lq_recv;
+
+ while (i < BATADV_TQ_GLOBAL_WINDOW_SIZE) {
+ if (*ptr != 0) {
+ count++;
+ sum += *ptr;
+ }
+
+ i++;
+ ptr++;
+ }
+
+ if (count == 0)
+ return 0;
+
+ return (u8)(sum / count);
+}
+
+/**
+ * batadv_iv_ogm_orig_get() - retrieve or create (if does not exist) an
+ * originator
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @addr: mac address of the originator
+ *
+ * Return: the originator object corresponding to the passed mac address or NULL
+ * on failure.
+ * If the object does not exist, it is created and initialised.
+ */
+static struct batadv_orig_node *
+batadv_iv_ogm_orig_get(struct batadv_priv *bat_priv, const u8 *addr)
+{
+ struct batadv_orig_node *orig_node;
+ int hash_added;
+
+ orig_node = batadv_orig_hash_find(bat_priv, addr);
+ if (orig_node)
+ return orig_node;
+
+ orig_node = batadv_orig_node_new(bat_priv, addr);
+ if (!orig_node)
+ return NULL;
+
+ spin_lock_init(&orig_node->bat_iv.ogm_cnt_lock);
+
+ kref_get(&orig_node->refcount);
+ hash_added = batadv_hash_add(bat_priv->orig_hash, batadv_compare_orig,
+ batadv_choose_orig, orig_node,
+ &orig_node->hash_entry);
+ if (hash_added != 0)
+ goto free_orig_node_hash;
+
+ return orig_node;
+
+free_orig_node_hash:
+ /* reference for batadv_hash_add */
+ batadv_orig_node_put(orig_node);
+ /* reference from batadv_orig_node_new */
+ batadv_orig_node_put(orig_node);
+
+ return NULL;
+}
+
+static struct batadv_neigh_node *
+batadv_iv_ogm_neigh_new(struct batadv_hard_iface *hard_iface,
+ const u8 *neigh_addr,
+ struct batadv_orig_node *orig_node,
+ struct batadv_orig_node *orig_neigh)
+{
+ struct batadv_neigh_node *neigh_node;
+
+ neigh_node = batadv_neigh_node_get_or_create(orig_node,
+ hard_iface, neigh_addr);
+ if (!neigh_node)
+ goto out;
+
+ neigh_node->orig_node = orig_neigh;
+
+out:
+ return neigh_node;
+}
+
+static int batadv_iv_ogm_iface_enable(struct batadv_hard_iface *hard_iface)
+{
+ struct batadv_ogm_packet *batadv_ogm_packet;
+ unsigned char *ogm_buff;
+ u32 random_seqno;
+
+ mutex_lock(&hard_iface->bat_iv.ogm_buff_mutex);
+
+ /* randomize initial seqno to avoid collision */
+ get_random_bytes(&random_seqno, sizeof(random_seqno));
+ atomic_set(&hard_iface->bat_iv.ogm_seqno, random_seqno);
+
+ hard_iface->bat_iv.ogm_buff_len = BATADV_OGM_HLEN;
+ ogm_buff = kmalloc(hard_iface->bat_iv.ogm_buff_len, GFP_ATOMIC);
+ if (!ogm_buff) {
+ mutex_unlock(&hard_iface->bat_iv.ogm_buff_mutex);
+ return -ENOMEM;
+ }
+
+ hard_iface->bat_iv.ogm_buff = ogm_buff;
+
+ batadv_ogm_packet = (struct batadv_ogm_packet *)ogm_buff;
+ batadv_ogm_packet->packet_type = BATADV_IV_OGM;
+ batadv_ogm_packet->version = BATADV_COMPAT_VERSION;
+ batadv_ogm_packet->ttl = 2;
+ batadv_ogm_packet->flags = BATADV_NO_FLAGS;
+ batadv_ogm_packet->reserved = 0;
+ batadv_ogm_packet->tq = BATADV_TQ_MAX_VALUE;
+
+ mutex_unlock(&hard_iface->bat_iv.ogm_buff_mutex);
+
+ return 0;
+}
+
+static void batadv_iv_ogm_iface_disable(struct batadv_hard_iface *hard_iface)
+{
+ mutex_lock(&hard_iface->bat_iv.ogm_buff_mutex);
+
+ kfree(hard_iface->bat_iv.ogm_buff);
+ hard_iface->bat_iv.ogm_buff = NULL;
+
+ mutex_unlock(&hard_iface->bat_iv.ogm_buff_mutex);
+}
+
+static void batadv_iv_ogm_iface_update_mac(struct batadv_hard_iface *hard_iface)
+{
+ struct batadv_ogm_packet *batadv_ogm_packet;
+ void *ogm_buff;
+
+ mutex_lock(&hard_iface->bat_iv.ogm_buff_mutex);
+
+ ogm_buff = hard_iface->bat_iv.ogm_buff;
+ if (!ogm_buff)
+ goto unlock;
+
+ batadv_ogm_packet = ogm_buff;
+ ether_addr_copy(batadv_ogm_packet->orig,
+ hard_iface->net_dev->dev_addr);
+ ether_addr_copy(batadv_ogm_packet->prev_sender,
+ hard_iface->net_dev->dev_addr);
+
+unlock:
+ mutex_unlock(&hard_iface->bat_iv.ogm_buff_mutex);
+}
+
+static void
+batadv_iv_ogm_primary_iface_set(struct batadv_hard_iface *hard_iface)
+{
+ struct batadv_ogm_packet *batadv_ogm_packet;
+ void *ogm_buff;
+
+ mutex_lock(&hard_iface->bat_iv.ogm_buff_mutex);
+
+ ogm_buff = hard_iface->bat_iv.ogm_buff;
+ if (!ogm_buff)
+ goto unlock;
+
+ batadv_ogm_packet = ogm_buff;
+ batadv_ogm_packet->ttl = BATADV_TTL;
+
+unlock:
+ mutex_unlock(&hard_iface->bat_iv.ogm_buff_mutex);
+}
+
+/* when do we schedule our own ogm to be sent */
+static unsigned long
+batadv_iv_ogm_emit_send_time(const struct batadv_priv *bat_priv)
+{
+ unsigned int msecs;
+
+ msecs = atomic_read(&bat_priv->orig_interval) - BATADV_JITTER;
+ msecs += get_random_u32_below(2 * BATADV_JITTER);
+
+ return jiffies + msecs_to_jiffies(msecs);
+}
+
+/* when do we schedule a ogm packet to be sent */
+static unsigned long batadv_iv_ogm_fwd_send_time(void)
+{
+ return jiffies + msecs_to_jiffies(get_random_u32_below(BATADV_JITTER / 2));
+}
+
+/* apply hop penalty for a normal link */
+static u8 batadv_hop_penalty(u8 tq, const struct batadv_priv *bat_priv)
+{
+ int hop_penalty = atomic_read(&bat_priv->hop_penalty);
+ int new_tq;
+
+ new_tq = tq * (BATADV_TQ_MAX_VALUE - hop_penalty);
+ new_tq /= BATADV_TQ_MAX_VALUE;
+
+ return new_tq;
+}
+
+/**
+ * batadv_iv_ogm_aggr_packet() - checks if there is another OGM attached
+ * @buff_pos: current position in the skb
+ * @packet_len: total length of the skb
+ * @ogm_packet: potential OGM in buffer
+ *
+ * Return: true if there is enough space for another OGM, false otherwise.
+ */
+static bool
+batadv_iv_ogm_aggr_packet(int buff_pos, int packet_len,
+ const struct batadv_ogm_packet *ogm_packet)
+{
+ int next_buff_pos = 0;
+
+ /* check if there is enough space for the header */
+ next_buff_pos += buff_pos + sizeof(*ogm_packet);
+ if (next_buff_pos > packet_len)
+ return false;
+
+ /* check if there is enough space for the optional TVLV */
+ next_buff_pos += ntohs(ogm_packet->tvlv_len);
+
+ return next_buff_pos <= packet_len;
+}
+
+/* send a batman ogm to a given interface */
+static void batadv_iv_ogm_send_to_if(struct batadv_forw_packet *forw_packet,
+ struct batadv_hard_iface *hard_iface)
+{
+ struct batadv_priv *bat_priv = netdev_priv(hard_iface->mesh_iface);
+ const char *fwd_str;
+ u8 packet_num;
+ s16 buff_pos;
+ struct batadv_ogm_packet *batadv_ogm_packet;
+ struct sk_buff *skb;
+ u8 *packet_pos;
+
+ if (hard_iface->if_status != BATADV_IF_ACTIVE)
+ return;
+
+ packet_num = 0;
+ buff_pos = 0;
+ packet_pos = forw_packet->skb->data;
+ batadv_ogm_packet = (struct batadv_ogm_packet *)packet_pos;
+
+ /* adjust all flags and log packets */
+ while (batadv_iv_ogm_aggr_packet(buff_pos, forw_packet->packet_len,
+ batadv_ogm_packet)) {
+ /* we might have aggregated direct link packets with an
+ * ordinary base packet
+ */
+ if (test_bit(packet_num, forw_packet->direct_link_flags) &&
+ forw_packet->if_incoming == hard_iface)
+ batadv_ogm_packet->flags |= BATADV_DIRECTLINK;
+ else
+ batadv_ogm_packet->flags &= ~BATADV_DIRECTLINK;
+
+ if (packet_num > 0 || !forw_packet->own)
+ fwd_str = "Forwarding";
+ else
+ fwd_str = "Sending own";
+
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "%s %spacket (originator %pM, seqno %u, TQ %d, TTL %d, IDF %s) on interface %s [%pM]\n",
+ fwd_str, (packet_num > 0 ? "aggregated " : ""),
+ batadv_ogm_packet->orig,
+ ntohl(batadv_ogm_packet->seqno),
+ batadv_ogm_packet->tq, batadv_ogm_packet->ttl,
+ str_on_off(batadv_ogm_packet->flags & BATADV_DIRECTLINK),
+ hard_iface->net_dev->name,
+ hard_iface->net_dev->dev_addr);
+
+ buff_pos += BATADV_OGM_HLEN;
+ buff_pos += ntohs(batadv_ogm_packet->tvlv_len);
+ packet_num++;
+ packet_pos = forw_packet->skb->data + buff_pos;
+ batadv_ogm_packet = (struct batadv_ogm_packet *)packet_pos;
+ }
+
+ /* create clone because function is called more than once */
+ skb = skb_clone(forw_packet->skb, GFP_ATOMIC);
+ if (skb) {
+ batadv_inc_counter(bat_priv, BATADV_CNT_MGMT_TX);
+ batadv_add_counter(bat_priv, BATADV_CNT_MGMT_TX_BYTES,
+ skb->len + ETH_HLEN);
+ batadv_send_broadcast_skb(skb, hard_iface);
+ }
+}
+
+/* send a batman ogm packet */
+static void batadv_iv_ogm_emit(struct batadv_forw_packet *forw_packet)
+{
+ struct net_device *mesh_iface;
+
+ if (!forw_packet->if_incoming) {
+ pr_err("Error - can't forward packet: incoming iface not specified\n");
+ return;
+ }
+
+ mesh_iface = forw_packet->if_incoming->mesh_iface;
+
+ if (WARN_ON(!forw_packet->if_outgoing))
+ return;
+
+ if (forw_packet->if_outgoing->mesh_iface != mesh_iface) {
+ pr_warn("%s: mesh interface switch for queued OGM\n", __func__);
+ return;
+ }
+
+ if (forw_packet->if_incoming->if_status != BATADV_IF_ACTIVE)
+ return;
+
+ /* only for one specific outgoing interface */
+ batadv_iv_ogm_send_to_if(forw_packet, forw_packet->if_outgoing);
+}
+
+/**
+ * batadv_iv_ogm_can_aggregate() - find out if an OGM can be aggregated on an
+ * existing forward packet
+ * @new_bat_ogm_packet: OGM packet to be aggregated
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @packet_len: (total) length of the OGM
+ * @send_time: timestamp (jiffies) when the packet is to be sent
+ * @directlink: true if this is a direct link packet
+ * @if_incoming: interface where the packet was received
+ * @if_outgoing: interface for which the retransmission should be considered
+ * @forw_packet: the forwarded packet which should be checked
+ *
+ * Return: true if new_packet can be aggregated with forw_packet
+ */
+static bool
+batadv_iv_ogm_can_aggregate(const struct batadv_ogm_packet *new_bat_ogm_packet,
+ struct batadv_priv *bat_priv,
+ int packet_len, unsigned long send_time,
+ bool directlink,
+ const struct batadv_hard_iface *if_incoming,
+ const struct batadv_hard_iface *if_outgoing,
+ const struct batadv_forw_packet *forw_packet)
+{
+ struct batadv_ogm_packet *batadv_ogm_packet;
+ unsigned int aggregated_bytes = forw_packet->packet_len + packet_len;
+ struct batadv_hard_iface *primary_if = NULL;
+ u8 packet_num = forw_packet->num_packets;
+ bool res = false;
+ unsigned long aggregation_end_time;
+ unsigned int max_bytes;
+
+ batadv_ogm_packet = (struct batadv_ogm_packet *)forw_packet->skb->data;
+ aggregation_end_time = send_time;
+ aggregation_end_time += msecs_to_jiffies(BATADV_MAX_AGGREGATION_MS);
+
+ max_bytes = min_t(unsigned int, if_outgoing->net_dev->mtu,
+ BATADV_MAX_AGGREGATION_BYTES);
+
+ /* we can aggregate the current packet to this aggregated packet
+ * if:
+ *
+ * - the send time is within our MAX_AGGREGATION_MS time
+ * - the resulting packet won't be bigger than
+ * MAX_AGGREGATION_BYTES and MTU of the outgoing interface
+ * - the number of packets is lower than MAX_AGGREGATION_PACKETS
+ * otherwise aggregation is not possible
+ */
+ if (!time_before(send_time, forw_packet->send_time) ||
+ !time_after_eq(aggregation_end_time, forw_packet->send_time))
+ return false;
+
+ if (aggregated_bytes > max_bytes)
+ return false;
+
+ if (packet_num >= BATADV_MAX_AGGREGATION_PACKETS)
+ return false;
+
+ /* packet is not leaving on the same interface. */
+ if (forw_packet->if_outgoing != if_outgoing)
+ return false;
+
+ /* check aggregation compatibility
+ * -> direct link packets are broadcasted on
+ * their interface only
+ * -> aggregate packet if the current packet is
+ * a "global" packet as well as the base
+ * packet
+ */
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (!primary_if)
+ return false;
+
+ /* packets without direct link flag and high TTL
+ * are flooded through the net
+ */
+ if (!directlink &&
+ !(batadv_ogm_packet->flags & BATADV_DIRECTLINK) &&
+ batadv_ogm_packet->ttl != 1 &&
+
+ /* own packets originating non-primary
+ * interfaces leave only that interface
+ */
+ (!forw_packet->own ||
+ forw_packet->if_incoming == primary_if)) {
+ res = true;
+ goto out;
+ }
+
+ /* if the incoming packet is sent via this one
+ * interface only - we still can aggregate
+ */
+ if (directlink &&
+ new_bat_ogm_packet->ttl == 1 &&
+ forw_packet->if_incoming == if_incoming &&
+
+ /* packets from direct neighbors or
+ * own secondary interface packets
+ * (= secondary interface packets in general)
+ */
+ (batadv_ogm_packet->flags & BATADV_DIRECTLINK ||
+ (forw_packet->own &&
+ forw_packet->if_incoming != primary_if))) {
+ res = true;
+ goto out;
+ }
+
+out:
+ batadv_hardif_put(primary_if);
+ return res;
+}
+
+/**
+ * batadv_iv_ogm_aggregate_new() - create a new aggregated packet and add this
+ * packet to it.
+ * @packet_buff: pointer to the OGM
+ * @packet_len: (total) length of the OGM
+ * @send_time: timestamp (jiffies) when the packet is to be sent
+ * @direct_link: whether this OGM has direct link status
+ * @if_incoming: interface where the packet was received
+ * @if_outgoing: interface for which the retransmission should be considered
+ * @own_packet: true if it is a self-generated ogm
+ */
+static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff,
+ int packet_len, unsigned long send_time,
+ bool direct_link,
+ struct batadv_hard_iface *if_incoming,
+ struct batadv_hard_iface *if_outgoing,
+ int own_packet)
+{
+ struct batadv_priv *bat_priv = netdev_priv(if_incoming->mesh_iface);
+ struct batadv_forw_packet *forw_packet_aggr;
+ struct sk_buff *skb;
+ unsigned char *skb_buff;
+ unsigned int skb_size;
+ atomic_t *queue_left = own_packet ? NULL : &bat_priv->batman_queue_left;
+
+ if (atomic_read(&bat_priv->aggregated_ogms))
+ skb_size = max_t(unsigned int, BATADV_MAX_AGGREGATION_BYTES,
+ packet_len);
+ else
+ skb_size = packet_len;
+
+ skb_size += ETH_HLEN;
+
+ skb = netdev_alloc_skb_ip_align(NULL, skb_size);
+ if (!skb)
+ return;
+
+ forw_packet_aggr = batadv_forw_packet_alloc(if_incoming, if_outgoing,
+ queue_left, bat_priv, skb);
+ if (!forw_packet_aggr) {
+ kfree_skb(skb);
+ return;
+ }
+
+ forw_packet_aggr->skb->priority = TC_PRIO_CONTROL;
+ skb_reserve(forw_packet_aggr->skb, ETH_HLEN);
+
+ skb_buff = skb_put(forw_packet_aggr->skb, packet_len);
+ forw_packet_aggr->packet_len = packet_len;
+ memcpy(skb_buff, packet_buff, packet_len);
+
+ forw_packet_aggr->own = own_packet;
+ bitmap_zero(forw_packet_aggr->direct_link_flags,
+ BATADV_MAX_AGGREGATION_PACKETS);
+ forw_packet_aggr->send_time = send_time;
+
+ /* save packet direct link flag status */
+ if (direct_link)
+ set_bit(0, forw_packet_aggr->direct_link_flags);
+
+ INIT_DELAYED_WORK(&forw_packet_aggr->delayed_work,
+ batadv_iv_send_outstanding_bat_ogm_packet);
+
+ batadv_forw_packet_ogmv1_queue(bat_priv, forw_packet_aggr, send_time);
+}
+
+/* aggregate a new packet into the existing ogm packet */
+static void batadv_iv_ogm_aggregate(struct batadv_forw_packet *forw_packet_aggr,
+ const unsigned char *packet_buff,
+ int packet_len, bool direct_link)
+{
+ skb_put_data(forw_packet_aggr->skb, packet_buff, packet_len);
+ forw_packet_aggr->packet_len += packet_len;
+
+ /* save packet direct link flag status */
+ if (direct_link)
+ set_bit(forw_packet_aggr->num_packets,
+ forw_packet_aggr->direct_link_flags);
+
+ forw_packet_aggr->num_packets++;
+}
+
+/**
+ * batadv_iv_ogm_queue_add() - queue up an OGM for transmission
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @packet_buff: pointer to the OGM
+ * @packet_len: (total) length of the OGM
+ * @if_incoming: interface where the packet was received
+ * @if_outgoing: interface for which the retransmission should be considered
+ * @own_packet: true if it is a self-generated ogm
+ * @send_time: timestamp (jiffies) when the packet is to be sent
+ */
+static void batadv_iv_ogm_queue_add(struct batadv_priv *bat_priv,
+ unsigned char *packet_buff,
+ int packet_len,
+ struct batadv_hard_iface *if_incoming,
+ struct batadv_hard_iface *if_outgoing,
+ int own_packet, unsigned long send_time)
+{
+ /* _aggr -> pointer to the packet we want to aggregate with
+ * _pos -> pointer to the position in the queue
+ */
+ struct batadv_forw_packet *forw_packet_aggr = NULL;
+ struct batadv_forw_packet *forw_packet_pos = NULL;
+ struct batadv_ogm_packet *batadv_ogm_packet;
+ bool direct_link;
+ unsigned long max_aggregation_jiffies;
+
+ batadv_ogm_packet = (struct batadv_ogm_packet *)packet_buff;
+ direct_link = !!(batadv_ogm_packet->flags & BATADV_DIRECTLINK);
+ max_aggregation_jiffies = msecs_to_jiffies(BATADV_MAX_AGGREGATION_MS);
+
+ /* find position for the packet in the forward queue */
+ spin_lock_bh(&bat_priv->forw_bat_list_lock);
+ /* own packets are not to be aggregated */
+ if (atomic_read(&bat_priv->aggregated_ogms) && !own_packet) {
+ hlist_for_each_entry(forw_packet_pos,
+ &bat_priv->forw_bat_list, list) {
+ if (batadv_iv_ogm_can_aggregate(batadv_ogm_packet,
+ bat_priv, packet_len,
+ send_time, direct_link,
+ if_incoming,
+ if_outgoing,
+ forw_packet_pos)) {
+ forw_packet_aggr = forw_packet_pos;
+ break;
+ }
+ }
+ }
+
+ /* nothing to aggregate with - either aggregation disabled or no
+ * suitable aggregation packet found
+ */
+ if (!forw_packet_aggr) {
+ /* the following section can run without the lock */
+ spin_unlock_bh(&bat_priv->forw_bat_list_lock);
+
+ /* if we could not aggregate this packet with one of the others
+ * we hold it back for a while, so that it might be aggregated
+ * later on
+ */
+ if (!own_packet && atomic_read(&bat_priv->aggregated_ogms))
+ send_time += max_aggregation_jiffies;
+
+ batadv_iv_ogm_aggregate_new(packet_buff, packet_len,
+ send_time, direct_link,
+ if_incoming, if_outgoing,
+ own_packet);
+ } else {
+ batadv_iv_ogm_aggregate(forw_packet_aggr, packet_buff,
+ packet_len, direct_link);
+ spin_unlock_bh(&bat_priv->forw_bat_list_lock);
+ }
+}
+
+static void batadv_iv_ogm_forward(struct batadv_orig_node *orig_node,
+ const struct ethhdr *ethhdr,
+ struct batadv_ogm_packet *batadv_ogm_packet,
+ bool is_single_hop_neigh,
+ bool is_from_best_next_hop,
+ struct batadv_hard_iface *if_incoming,
+ struct batadv_hard_iface *if_outgoing)
+{
+ struct batadv_priv *bat_priv = netdev_priv(if_incoming->mesh_iface);
+ u16 tvlv_len;
+
+ if (batadv_ogm_packet->ttl <= 1) {
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "ttl exceeded\n");
+ return;
+ }
+
+ if (!is_from_best_next_hop) {
+ /* Mark the forwarded packet when it is not coming from our
+ * best next hop. We still need to forward the packet for our
+ * neighbor link quality detection to work in case the packet
+ * originated from a single hop neighbor. Otherwise we can
+ * simply drop the ogm.
+ */
+ if (is_single_hop_neigh)
+ batadv_ogm_packet->flags |= BATADV_NOT_BEST_NEXT_HOP;
+ else
+ return;
+ }
+
+ tvlv_len = ntohs(batadv_ogm_packet->tvlv_len);
+
+ batadv_ogm_packet->ttl--;
+ ether_addr_copy(batadv_ogm_packet->prev_sender, ethhdr->h_source);
+
+ /* apply hop penalty */
+ batadv_ogm_packet->tq = batadv_hop_penalty(batadv_ogm_packet->tq,
+ bat_priv);
+
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Forwarding packet: tq: %i, ttl: %i\n",
+ batadv_ogm_packet->tq, batadv_ogm_packet->ttl);
+
+ if (is_single_hop_neigh)
+ batadv_ogm_packet->flags |= BATADV_DIRECTLINK;
+ else
+ batadv_ogm_packet->flags &= ~BATADV_DIRECTLINK;
+
+ batadv_iv_ogm_queue_add(bat_priv, (unsigned char *)batadv_ogm_packet,
+ BATADV_OGM_HLEN + tvlv_len,
+ if_incoming, if_outgoing, 0,
+ batadv_iv_ogm_fwd_send_time());
+}
+
+/**
+ * batadv_iv_ogm_slide_own_bcast_window() - bitshift own OGM broadcast windows
+ * for the given interface
+ * @hard_iface: the interface for which the windows have to be shifted
+ */
+static void
+batadv_iv_ogm_slide_own_bcast_window(struct batadv_hard_iface *hard_iface)
+{
+ struct batadv_priv *bat_priv = netdev_priv(hard_iface->mesh_iface);
+ struct batadv_hashtable *hash = bat_priv->orig_hash;
+ struct hlist_head *head;
+ struct batadv_orig_node *orig_node;
+ struct batadv_orig_ifinfo *orig_ifinfo;
+ unsigned long *word;
+ u32 i;
+ u8 *w;
+
+ for (i = 0; i < hash->size; i++) {
+ head = &hash->table[i];
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(orig_node, head, hash_entry) {
+ hlist_for_each_entry_rcu(orig_ifinfo,
+ &orig_node->ifinfo_list,
+ list) {
+ if (orig_ifinfo->if_outgoing != hard_iface)
+ continue;
+
+ spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock);
+ word = orig_ifinfo->bat_iv.bcast_own;
+ batadv_bit_get_packet(bat_priv, word, 1, 0);
+ w = &orig_ifinfo->bat_iv.bcast_own_sum;
+ *w = bitmap_weight(word,
+ BATADV_TQ_LOCAL_WINDOW_SIZE);
+ spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock);
+ }
+ }
+ rcu_read_unlock();
+ }
+}
+
+/**
+ * batadv_iv_ogm_schedule_buff() - schedule submission of hardif ogm buffer
+ * @hard_iface: interface whose ogm buffer should be transmitted
+ */
+static void batadv_iv_ogm_schedule_buff(struct batadv_hard_iface *hard_iface)
+{
+ struct batadv_priv *bat_priv = netdev_priv(hard_iface->mesh_iface);
+ unsigned char **ogm_buff = &hard_iface->bat_iv.ogm_buff;
+ struct batadv_ogm_packet *batadv_ogm_packet;
+ struct batadv_hard_iface *primary_if, *tmp_hard_iface;
+ int *ogm_buff_len = &hard_iface->bat_iv.ogm_buff_len;
+ struct list_head *iter;
+ u32 seqno;
+ u16 tvlv_len = 0;
+ unsigned long send_time;
+
+ lockdep_assert_held(&hard_iface->bat_iv.ogm_buff_mutex);
+
+ /* interface already disabled by batadv_iv_ogm_iface_disable */
+ if (!*ogm_buff)
+ return;
+
+ /* the interface gets activated here to avoid race conditions between
+ * the moment of activating the interface in
+ * hardif_activate_interface() where the originator mac is set and
+ * outdated packets (especially uninitialized mac addresses) in the
+ * packet queue
+ */
+ if (hard_iface->if_status == BATADV_IF_TO_BE_ACTIVATED)
+ hard_iface->if_status = BATADV_IF_ACTIVE;
+
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+
+ if (hard_iface == primary_if) {
+ /* tt changes have to be committed before the tvlv data is
+ * appended as it may alter the tt tvlv container
+ */
+ batadv_tt_local_commit_changes(bat_priv);
+ tvlv_len = batadv_tvlv_container_ogm_append(bat_priv, ogm_buff,
+ ogm_buff_len,
+ BATADV_OGM_HLEN);
+ }
+
+ batadv_ogm_packet = (struct batadv_ogm_packet *)(*ogm_buff);
+ batadv_ogm_packet->tvlv_len = htons(tvlv_len);
+
+ /* change sequence number to network order */
+ seqno = (u32)atomic_read(&hard_iface->bat_iv.ogm_seqno);
+ batadv_ogm_packet->seqno = htonl(seqno);
+ atomic_inc(&hard_iface->bat_iv.ogm_seqno);
+
+ batadv_iv_ogm_slide_own_bcast_window(hard_iface);
+
+ send_time = batadv_iv_ogm_emit_send_time(bat_priv);
+
+ if (hard_iface != primary_if) {
+ /* OGMs from secondary interfaces are only scheduled on their
+ * respective interfaces.
+ */
+ batadv_iv_ogm_queue_add(bat_priv, *ogm_buff, *ogm_buff_len,
+ hard_iface, hard_iface, 1, send_time);
+ goto out;
+ }
+
+ /* OGMs from primary interfaces are scheduled on all
+ * interfaces.
+ */
+ rcu_read_lock();
+ netdev_for_each_lower_private_rcu(hard_iface->mesh_iface, tmp_hard_iface, iter) {
+ if (!kref_get_unless_zero(&tmp_hard_iface->refcount))
+ continue;
+
+ batadv_iv_ogm_queue_add(bat_priv, *ogm_buff,
+ *ogm_buff_len, hard_iface,
+ tmp_hard_iface, 1, send_time);
+
+ batadv_hardif_put(tmp_hard_iface);
+ }
+ rcu_read_unlock();
+
+out:
+ batadv_hardif_put(primary_if);
+}
+
+static void batadv_iv_ogm_schedule(struct batadv_hard_iface *hard_iface)
+{
+ if (hard_iface->if_status == BATADV_IF_NOT_IN_USE ||
+ hard_iface->if_status == BATADV_IF_TO_BE_REMOVED)
+ return;
+
+ mutex_lock(&hard_iface->bat_iv.ogm_buff_mutex);
+ batadv_iv_ogm_schedule_buff(hard_iface);
+ mutex_unlock(&hard_iface->bat_iv.ogm_buff_mutex);
+}
+
+/**
+ * batadv_iv_orig_ifinfo_sum() - Get bcast_own sum for originator over interface
+ * @orig_node: originator which reproadcasted the OGMs directly
+ * @if_outgoing: interface which transmitted the original OGM and received the
+ * direct rebroadcast
+ *
+ * Return: Number of replied (rebroadcasted) OGMs which were transmitted by
+ * an originator and directly (without intermediate hop) received by a specific
+ * interface
+ */
+static u8 batadv_iv_orig_ifinfo_sum(struct batadv_orig_node *orig_node,
+ struct batadv_hard_iface *if_outgoing)
+{
+ struct batadv_orig_ifinfo *orig_ifinfo;
+ u8 sum;
+
+ orig_ifinfo = batadv_orig_ifinfo_get(orig_node, if_outgoing);
+ if (!orig_ifinfo)
+ return 0;
+
+ spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock);
+ sum = orig_ifinfo->bat_iv.bcast_own_sum;
+ spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock);
+
+ batadv_orig_ifinfo_put(orig_ifinfo);
+
+ return sum;
+}
+
+/**
+ * batadv_iv_ogm_orig_update() - use OGM to update corresponding data in an
+ * originator
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @orig_node: the orig node who originally emitted the ogm packet
+ * @orig_ifinfo: ifinfo for the outgoing interface of the orig_node
+ * @ethhdr: Ethernet header of the OGM
+ * @batadv_ogm_packet: the ogm packet
+ * @if_incoming: interface where the packet was received
+ * @if_outgoing: interface for which the retransmission should be considered
+ * @dup_status: the duplicate status of this ogm packet.
+ */
+static void
+batadv_iv_ogm_orig_update(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node,
+ struct batadv_orig_ifinfo *orig_ifinfo,
+ const struct ethhdr *ethhdr,
+ const struct batadv_ogm_packet *batadv_ogm_packet,
+ struct batadv_hard_iface *if_incoming,
+ struct batadv_hard_iface *if_outgoing,
+ enum batadv_dup_status dup_status)
+{
+ struct batadv_neigh_ifinfo *neigh_ifinfo = NULL;
+ struct batadv_neigh_ifinfo *router_ifinfo = NULL;
+ struct batadv_neigh_node *neigh_node = NULL;
+ struct batadv_neigh_node *tmp_neigh_node = NULL;
+ struct batadv_neigh_node *router = NULL;
+ u8 sum_orig, sum_neigh;
+ u8 *neigh_addr;
+ u8 tq_avg;
+
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "%s(): Searching and updating originator entry of received packet\n",
+ __func__);
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(tmp_neigh_node,
+ &orig_node->neigh_list, list) {
+ neigh_addr = tmp_neigh_node->addr;
+ if (batadv_compare_eth(neigh_addr, ethhdr->h_source) &&
+ tmp_neigh_node->if_incoming == if_incoming &&
+ kref_get_unless_zero(&tmp_neigh_node->refcount)) {
+ if (WARN(neigh_node, "too many matching neigh_nodes"))
+ batadv_neigh_node_put(neigh_node);
+ neigh_node = tmp_neigh_node;
+ continue;
+ }
+
+ if (dup_status != BATADV_NO_DUP)
+ continue;
+
+ /* only update the entry for this outgoing interface */
+ neigh_ifinfo = batadv_neigh_ifinfo_get(tmp_neigh_node,
+ if_outgoing);
+ if (!neigh_ifinfo)
+ continue;
+
+ spin_lock_bh(&tmp_neigh_node->ifinfo_lock);
+ batadv_ring_buffer_set(neigh_ifinfo->bat_iv.tq_recv,
+ &neigh_ifinfo->bat_iv.tq_index, 0);
+ tq_avg = batadv_ring_buffer_avg(neigh_ifinfo->bat_iv.tq_recv);
+ neigh_ifinfo->bat_iv.tq_avg = tq_avg;
+ spin_unlock_bh(&tmp_neigh_node->ifinfo_lock);
+
+ batadv_neigh_ifinfo_put(neigh_ifinfo);
+ neigh_ifinfo = NULL;
+ }
+
+ if (!neigh_node) {
+ struct batadv_orig_node *orig_tmp;
+
+ orig_tmp = batadv_iv_ogm_orig_get(bat_priv, ethhdr->h_source);
+ if (!orig_tmp)
+ goto unlock;
+
+ neigh_node = batadv_iv_ogm_neigh_new(if_incoming,
+ ethhdr->h_source,
+ orig_node, orig_tmp);
+
+ batadv_orig_node_put(orig_tmp);
+ if (!neigh_node)
+ goto unlock;
+ } else {
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Updating existing last-hop neighbor of originator\n");
+ }
+
+ rcu_read_unlock();
+ neigh_ifinfo = batadv_neigh_ifinfo_new(neigh_node, if_outgoing);
+ if (!neigh_ifinfo)
+ goto out;
+
+ neigh_node->last_seen = jiffies;
+
+ spin_lock_bh(&neigh_node->ifinfo_lock);
+ batadv_ring_buffer_set(neigh_ifinfo->bat_iv.tq_recv,
+ &neigh_ifinfo->bat_iv.tq_index,
+ batadv_ogm_packet->tq);
+ tq_avg = batadv_ring_buffer_avg(neigh_ifinfo->bat_iv.tq_recv);
+ neigh_ifinfo->bat_iv.tq_avg = tq_avg;
+ spin_unlock_bh(&neigh_node->ifinfo_lock);
+
+ if (dup_status == BATADV_NO_DUP) {
+ orig_ifinfo->last_ttl = batadv_ogm_packet->ttl;
+ neigh_ifinfo->last_ttl = batadv_ogm_packet->ttl;
+ }
+
+ /* if this neighbor already is our next hop there is nothing
+ * to change
+ */
+ router = batadv_orig_router_get(orig_node, if_outgoing);
+ if (router == neigh_node)
+ goto out;
+
+ if (router) {
+ router_ifinfo = batadv_neigh_ifinfo_get(router, if_outgoing);
+ if (!router_ifinfo)
+ goto out;
+
+ /* if this neighbor does not offer a better TQ we won't
+ * consider it
+ */
+ if (router_ifinfo->bat_iv.tq_avg > neigh_ifinfo->bat_iv.tq_avg)
+ goto out;
+ }
+
+ /* if the TQ is the same and the link not more symmetric we
+ * won't consider it either
+ */
+ if (router_ifinfo &&
+ neigh_ifinfo->bat_iv.tq_avg == router_ifinfo->bat_iv.tq_avg) {
+ sum_orig = batadv_iv_orig_ifinfo_sum(router->orig_node,
+ router->if_incoming);
+ sum_neigh = batadv_iv_orig_ifinfo_sum(neigh_node->orig_node,
+ neigh_node->if_incoming);
+ if (sum_orig >= sum_neigh)
+ goto out;
+ }
+
+ batadv_update_route(bat_priv, orig_node, if_outgoing, neigh_node);
+ goto out;
+
+unlock:
+ rcu_read_unlock();
+out:
+ batadv_neigh_node_put(neigh_node);
+ batadv_neigh_node_put(router);
+ batadv_neigh_ifinfo_put(neigh_ifinfo);
+ batadv_neigh_ifinfo_put(router_ifinfo);
+}
+
+/**
+ * batadv_iv_ogm_calc_tq() - calculate tq for current received ogm packet
+ * @orig_node: the orig node who originally emitted the ogm packet
+ * @orig_neigh_node: the orig node struct of the neighbor who sent the packet
+ * @batadv_ogm_packet: the ogm packet
+ * @if_incoming: interface where the packet was received
+ * @if_outgoing: interface for which the retransmission should be considered
+ *
+ * Return: true if the link can be considered bidirectional, false otherwise
+ */
+static bool batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node,
+ struct batadv_orig_node *orig_neigh_node,
+ struct batadv_ogm_packet *batadv_ogm_packet,
+ struct batadv_hard_iface *if_incoming,
+ struct batadv_hard_iface *if_outgoing)
+{
+ struct batadv_priv *bat_priv = netdev_priv(if_incoming->mesh_iface);
+ struct batadv_neigh_node *neigh_node = NULL, *tmp_neigh_node;
+ struct batadv_neigh_ifinfo *neigh_ifinfo;
+ u8 total_count;
+ u8 orig_eq_count, neigh_rq_count, neigh_rq_inv, tq_own;
+ unsigned int tq_iface_hop_penalty = BATADV_TQ_MAX_VALUE;
+ unsigned int neigh_rq_inv_cube, neigh_rq_max_cube;
+ unsigned int tq_asym_penalty, inv_asym_penalty;
+ unsigned int combined_tq;
+ bool ret = false;
+
+ /* find corresponding one hop neighbor */
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(tmp_neigh_node,
+ &orig_neigh_node->neigh_list, list) {
+ if (!batadv_compare_eth(tmp_neigh_node->addr,
+ orig_neigh_node->orig))
+ continue;
+
+ if (tmp_neigh_node->if_incoming != if_incoming)
+ continue;
+
+ if (!kref_get_unless_zero(&tmp_neigh_node->refcount))
+ continue;
+
+ neigh_node = tmp_neigh_node;
+ break;
+ }
+ rcu_read_unlock();
+
+ if (!neigh_node)
+ neigh_node = batadv_iv_ogm_neigh_new(if_incoming,
+ orig_neigh_node->orig,
+ orig_neigh_node,
+ orig_neigh_node);
+
+ if (!neigh_node)
+ goto out;
+
+ /* if orig_node is direct neighbor update neigh_node last_seen */
+ if (orig_node == orig_neigh_node)
+ neigh_node->last_seen = jiffies;
+
+ orig_node->last_seen = jiffies;
+
+ /* find packet count of corresponding one hop neighbor */
+ orig_eq_count = batadv_iv_orig_ifinfo_sum(orig_neigh_node, if_incoming);
+ neigh_ifinfo = batadv_neigh_ifinfo_new(neigh_node, if_outgoing);
+ if (neigh_ifinfo) {
+ neigh_rq_count = neigh_ifinfo->bat_iv.real_packet_count;
+ batadv_neigh_ifinfo_put(neigh_ifinfo);
+ } else {
+ neigh_rq_count = 0;
+ }
+
+ /* pay attention to not get a value bigger than 100 % */
+ if (orig_eq_count > neigh_rq_count)
+ total_count = neigh_rq_count;
+ else
+ total_count = orig_eq_count;
+
+ /* if we have too few packets (too less data) we set tq_own to zero
+ * if we receive too few packets it is not considered bidirectional
+ */
+ if (total_count < BATADV_TQ_LOCAL_BIDRECT_SEND_MINIMUM ||
+ neigh_rq_count < BATADV_TQ_LOCAL_BIDRECT_RECV_MINIMUM)
+ tq_own = 0;
+ else
+ /* neigh_node->real_packet_count is never zero as we
+ * only purge old information when getting new
+ * information
+ */
+ tq_own = (BATADV_TQ_MAX_VALUE * total_count) / neigh_rq_count;
+
+ /* 1 - ((1-x) ** 3), normalized to TQ_MAX_VALUE this does
+ * affect the nearly-symmetric links only a little, but
+ * punishes asymmetric links more. This will give a value
+ * between 0 and TQ_MAX_VALUE
+ */
+ neigh_rq_inv = BATADV_TQ_LOCAL_WINDOW_SIZE - neigh_rq_count;
+ neigh_rq_inv_cube = neigh_rq_inv * neigh_rq_inv * neigh_rq_inv;
+ neigh_rq_max_cube = BATADV_TQ_LOCAL_WINDOW_SIZE *
+ BATADV_TQ_LOCAL_WINDOW_SIZE *
+ BATADV_TQ_LOCAL_WINDOW_SIZE;
+ inv_asym_penalty = BATADV_TQ_MAX_VALUE * neigh_rq_inv_cube;
+ inv_asym_penalty /= neigh_rq_max_cube;
+ tq_asym_penalty = BATADV_TQ_MAX_VALUE - inv_asym_penalty;
+ tq_iface_hop_penalty -= atomic_read(&if_incoming->hop_penalty);
+
+ /* penalize if the OGM is forwarded on the same interface. WiFi
+ * interfaces and other half duplex devices suffer from throughput
+ * drops as they can't send and receive at the same time.
+ */
+ if (if_outgoing && if_incoming == if_outgoing &&
+ batadv_is_wifi_hardif(if_outgoing))
+ tq_iface_hop_penalty = batadv_hop_penalty(tq_iface_hop_penalty,
+ bat_priv);
+
+ combined_tq = batadv_ogm_packet->tq *
+ tq_own *
+ tq_asym_penalty *
+ tq_iface_hop_penalty;
+ combined_tq /= BATADV_TQ_MAX_VALUE *
+ BATADV_TQ_MAX_VALUE *
+ BATADV_TQ_MAX_VALUE;
+ batadv_ogm_packet->tq = combined_tq;
+
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "bidirectional: orig = %pM neigh = %pM => own_bcast = %2i, real recv = %2i, local tq: %3i, asym_penalty: %3i, iface_hop_penalty: %3i, total tq: %3i, if_incoming = %s, if_outgoing = %s\n",
+ orig_node->orig, orig_neigh_node->orig, total_count,
+ neigh_rq_count, tq_own, tq_asym_penalty,
+ tq_iface_hop_penalty, batadv_ogm_packet->tq,
+ if_incoming->net_dev->name,
+ if_outgoing ? if_outgoing->net_dev->name : "DEFAULT");
+
+ /* if link has the minimum required transmission quality
+ * consider it bidirectional
+ */
+ if (batadv_ogm_packet->tq >= BATADV_TQ_TOTAL_BIDRECT_LIMIT)
+ ret = true;
+
+out:
+ batadv_neigh_node_put(neigh_node);
+ return ret;
+}
+
+/**
+ * batadv_iv_ogm_update_seqnos() - process a batman packet for all interfaces,
+ * adjust the sequence number and find out whether it is a duplicate
+ * @ethhdr: ethernet header of the packet
+ * @batadv_ogm_packet: OGM packet to be considered
+ * @if_incoming: interface on which the OGM packet was received
+ * @if_outgoing: interface for which the retransmission should be considered
+ *
+ * Return: duplicate status as enum batadv_dup_status
+ */
+static enum batadv_dup_status
+batadv_iv_ogm_update_seqnos(const struct ethhdr *ethhdr,
+ const struct batadv_ogm_packet *batadv_ogm_packet,
+ const struct batadv_hard_iface *if_incoming,
+ struct batadv_hard_iface *if_outgoing)
+{
+ struct batadv_priv *bat_priv = netdev_priv(if_incoming->mesh_iface);
+ struct batadv_orig_node *orig_node;
+ struct batadv_orig_ifinfo *orig_ifinfo = NULL;
+ struct batadv_neigh_node *neigh_node;
+ struct batadv_neigh_ifinfo *neigh_ifinfo;
+ bool is_dup;
+ s32 seq_diff;
+ bool need_update = false;
+ int set_mark;
+ enum batadv_dup_status ret = BATADV_NO_DUP;
+ u32 seqno = ntohl(batadv_ogm_packet->seqno);
+ u8 *neigh_addr;
+ u8 packet_count;
+ unsigned long *bitmap;
+
+ orig_node = batadv_iv_ogm_orig_get(bat_priv, batadv_ogm_packet->orig);
+ if (!orig_node)
+ return BATADV_NO_DUP;
+
+ orig_ifinfo = batadv_orig_ifinfo_new(orig_node, if_outgoing);
+ if (WARN_ON(!orig_ifinfo)) {
+ batadv_orig_node_put(orig_node);
+ return 0;
+ }
+
+ spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock);
+ seq_diff = seqno - orig_ifinfo->last_real_seqno;
+
+ /* signalize caller that the packet is to be dropped. */
+ if (!hlist_empty(&orig_node->neigh_list) &&
+ batadv_window_protected(bat_priv, seq_diff,
+ BATADV_TQ_LOCAL_WINDOW_SIZE,
+ &orig_ifinfo->batman_seqno_reset, NULL)) {
+ ret = BATADV_PROTECTED;
+ goto out;
+ }
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(neigh_node, &orig_node->neigh_list, list) {
+ neigh_ifinfo = batadv_neigh_ifinfo_new(neigh_node,
+ if_outgoing);
+ if (!neigh_ifinfo)
+ continue;
+
+ neigh_addr = neigh_node->addr;
+ is_dup = batadv_test_bit(neigh_ifinfo->bat_iv.real_bits,
+ orig_ifinfo->last_real_seqno,
+ seqno);
+
+ if (batadv_compare_eth(neigh_addr, ethhdr->h_source) &&
+ neigh_node->if_incoming == if_incoming) {
+ set_mark = 1;
+ if (is_dup)
+ ret = BATADV_NEIGH_DUP;
+ } else {
+ set_mark = 0;
+ if (is_dup && ret != BATADV_NEIGH_DUP)
+ ret = BATADV_ORIG_DUP;
+ }
+
+ /* if the window moved, set the update flag. */
+ bitmap = neigh_ifinfo->bat_iv.real_bits;
+ need_update |= batadv_bit_get_packet(bat_priv, bitmap,
+ seq_diff, set_mark);
+
+ packet_count = bitmap_weight(bitmap,
+ BATADV_TQ_LOCAL_WINDOW_SIZE);
+ neigh_ifinfo->bat_iv.real_packet_count = packet_count;
+ batadv_neigh_ifinfo_put(neigh_ifinfo);
+ }
+ rcu_read_unlock();
+
+ if (need_update) {
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "%s updating last_seqno: old %u, new %u\n",
+ if_outgoing ? if_outgoing->net_dev->name : "DEFAULT",
+ orig_ifinfo->last_real_seqno, seqno);
+ orig_ifinfo->last_real_seqno = seqno;
+ }
+
+out:
+ spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock);
+ batadv_orig_node_put(orig_node);
+ batadv_orig_ifinfo_put(orig_ifinfo);
+ return ret;
+}
+
+/**
+ * batadv_iv_ogm_process_per_outif() - process a batman iv OGM for an outgoing
+ * interface
+ * @skb: the skb containing the OGM
+ * @ogm_offset: offset from skb->data to start of ogm header
+ * @orig_node: the (cached) orig node for the originator of this OGM
+ * @if_incoming: the interface where this packet was received
+ * @if_outgoing: the interface for which the packet should be considered
+ */
+static void
+batadv_iv_ogm_process_per_outif(const struct sk_buff *skb, int ogm_offset,
+ struct batadv_orig_node *orig_node,
+ struct batadv_hard_iface *if_incoming,
+ struct batadv_hard_iface *if_outgoing)
+{
+ struct batadv_priv *bat_priv = netdev_priv(if_incoming->mesh_iface);
+ struct batadv_hardif_neigh_node *hardif_neigh = NULL;
+ struct batadv_neigh_node *router = NULL;
+ struct batadv_neigh_node *router_router = NULL;
+ struct batadv_orig_node *orig_neigh_node;
+ struct batadv_orig_ifinfo *orig_ifinfo;
+ struct batadv_neigh_node *orig_neigh_router = NULL;
+ struct batadv_neigh_ifinfo *router_ifinfo = NULL;
+ struct batadv_ogm_packet *ogm_packet;
+ enum batadv_dup_status dup_status;
+ bool is_from_best_next_hop = false;
+ bool is_single_hop_neigh = false;
+ bool sameseq, similar_ttl;
+ struct sk_buff *skb_priv;
+ struct ethhdr *ethhdr;
+ u8 *prev_sender;
+ bool is_bidirect;
+
+ /* create a private copy of the skb, as some functions change tq value
+ * and/or flags.
+ */
+ skb_priv = skb_copy(skb, GFP_ATOMIC);
+ if (!skb_priv)
+ return;
+
+ ethhdr = eth_hdr(skb_priv);
+ ogm_packet = (struct batadv_ogm_packet *)(skb_priv->data + ogm_offset);
+
+ dup_status = batadv_iv_ogm_update_seqnos(ethhdr, ogm_packet,
+ if_incoming, if_outgoing);
+ if (batadv_compare_eth(ethhdr->h_source, ogm_packet->orig))
+ is_single_hop_neigh = true;
+
+ if (dup_status == BATADV_PROTECTED) {
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Drop packet: packet within seqno protection time (sender: %pM)\n",
+ ethhdr->h_source);
+ goto out;
+ }
+
+ if (ogm_packet->tq == 0) {
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Drop packet: originator packet with tq equal 0\n");
+ goto out;
+ }
+
+ if (is_single_hop_neigh) {
+ hardif_neigh = batadv_hardif_neigh_get(if_incoming,
+ ethhdr->h_source);
+ if (hardif_neigh)
+ hardif_neigh->last_seen = jiffies;
+ }
+
+ router = batadv_orig_router_get(orig_node, if_outgoing);
+ if (router) {
+ router_router = batadv_orig_router_get(router->orig_node,
+ if_outgoing);
+ router_ifinfo = batadv_neigh_ifinfo_get(router, if_outgoing);
+ }
+
+ if ((router_ifinfo && router_ifinfo->bat_iv.tq_avg != 0) &&
+ (batadv_compare_eth(router->addr, ethhdr->h_source)))
+ is_from_best_next_hop = true;
+
+ prev_sender = ogm_packet->prev_sender;
+ /* avoid temporary routing loops */
+ if (router && router_router &&
+ (batadv_compare_eth(router->addr, prev_sender)) &&
+ !(batadv_compare_eth(ogm_packet->orig, prev_sender)) &&
+ (batadv_compare_eth(router->addr, router_router->addr))) {
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Drop packet: ignoring all rebroadcast packets that may make me loop (sender: %pM)\n",
+ ethhdr->h_source);
+ goto out;
+ }
+
+ if (if_outgoing == BATADV_IF_DEFAULT)
+ batadv_tvlv_ogm_receive(bat_priv, ogm_packet, orig_node);
+
+ /* if sender is a direct neighbor the sender mac equals
+ * originator mac
+ */
+ if (is_single_hop_neigh)
+ orig_neigh_node = orig_node;
+ else
+ orig_neigh_node = batadv_iv_ogm_orig_get(bat_priv,
+ ethhdr->h_source);
+
+ if (!orig_neigh_node)
+ goto out;
+
+ orig_neigh_router = batadv_orig_router_get(orig_neigh_node,
+ if_outgoing);
+
+ /* drop packet if sender is not a direct neighbor and if we
+ * don't route towards it
+ */
+ if (!is_single_hop_neigh && !orig_neigh_router) {
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Drop packet: OGM via unknown neighbor!\n");
+ goto out_neigh;
+ }
+
+ is_bidirect = batadv_iv_ogm_calc_tq(orig_node, orig_neigh_node,
+ ogm_packet, if_incoming,
+ if_outgoing);
+
+ /* update ranking if it is not a duplicate or has the same
+ * seqno and similar ttl as the non-duplicate
+ */
+ orig_ifinfo = batadv_orig_ifinfo_new(orig_node, if_outgoing);
+ if (!orig_ifinfo)
+ goto out_neigh;
+
+ sameseq = orig_ifinfo->last_real_seqno == ntohl(ogm_packet->seqno);
+ similar_ttl = (orig_ifinfo->last_ttl - 3) <= ogm_packet->ttl;
+
+ if (is_bidirect && (dup_status == BATADV_NO_DUP ||
+ (sameseq && similar_ttl))) {
+ batadv_iv_ogm_orig_update(bat_priv, orig_node,
+ orig_ifinfo, ethhdr,
+ ogm_packet, if_incoming,
+ if_outgoing, dup_status);
+ }
+ batadv_orig_ifinfo_put(orig_ifinfo);
+
+ /* only forward for specific interface, not for the default one. */
+ if (if_outgoing == BATADV_IF_DEFAULT)
+ goto out_neigh;
+
+ /* is single hop (direct) neighbor */
+ if (is_single_hop_neigh) {
+ /* OGMs from secondary interfaces should only scheduled once
+ * per interface where it has been received, not multiple times
+ */
+ if (ogm_packet->ttl <= 2 &&
+ if_incoming != if_outgoing) {
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Drop packet: OGM from secondary interface and wrong outgoing interface\n");
+ goto out_neigh;
+ }
+ /* mark direct link on incoming interface */
+ batadv_iv_ogm_forward(orig_node, ethhdr, ogm_packet,
+ is_single_hop_neigh,
+ is_from_best_next_hop, if_incoming,
+ if_outgoing);
+
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Forwarding packet: rebroadcast neighbor packet with direct link flag\n");
+ goto out_neigh;
+ }
+
+ /* multihop originator */
+ if (!is_bidirect) {
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Drop packet: not received via bidirectional link\n");
+ goto out_neigh;
+ }
+
+ if (dup_status == BATADV_NEIGH_DUP) {
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Drop packet: duplicate packet received\n");
+ goto out_neigh;
+ }
+
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Forwarding packet: rebroadcast originator packet\n");
+ batadv_iv_ogm_forward(orig_node, ethhdr, ogm_packet,
+ is_single_hop_neigh, is_from_best_next_hop,
+ if_incoming, if_outgoing);
+
+out_neigh:
+ if (orig_neigh_node && !is_single_hop_neigh)
+ batadv_orig_node_put(orig_neigh_node);
+out:
+ batadv_neigh_ifinfo_put(router_ifinfo);
+ batadv_neigh_node_put(router);
+ batadv_neigh_node_put(router_router);
+ batadv_neigh_node_put(orig_neigh_router);
+ batadv_hardif_neigh_put(hardif_neigh);
+
+ consume_skb(skb_priv);
+}
+
+/**
+ * batadv_iv_ogm_process_reply() - Check OGM for direct reply and process it
+ * @ogm_packet: rebroadcast OGM packet to process
+ * @if_incoming: the interface where this packet was received
+ * @orig_node: originator which reproadcasted the OGMs
+ * @if_incoming_seqno: OGM sequence number when rebroadcast was received
+ */
+static void batadv_iv_ogm_process_reply(struct batadv_ogm_packet *ogm_packet,
+ struct batadv_hard_iface *if_incoming,
+ struct batadv_orig_node *orig_node,
+ u32 if_incoming_seqno)
+{
+ struct batadv_orig_ifinfo *orig_ifinfo;
+ s32 bit_pos;
+ u8 *weight;
+
+ /* neighbor has to indicate direct link and it has to
+ * come via the corresponding interface
+ */
+ if (!(ogm_packet->flags & BATADV_DIRECTLINK))
+ return;
+
+ if (!batadv_compare_eth(if_incoming->net_dev->dev_addr,
+ ogm_packet->orig))
+ return;
+
+ orig_ifinfo = batadv_orig_ifinfo_get(orig_node, if_incoming);
+ if (!orig_ifinfo)
+ return;
+
+ /* save packet seqno for bidirectional check */
+ spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock);
+ bit_pos = if_incoming_seqno - 2;
+ bit_pos -= ntohl(ogm_packet->seqno);
+ batadv_set_bit(orig_ifinfo->bat_iv.bcast_own, bit_pos);
+ weight = &orig_ifinfo->bat_iv.bcast_own_sum;
+ *weight = bitmap_weight(orig_ifinfo->bat_iv.bcast_own,
+ BATADV_TQ_LOCAL_WINDOW_SIZE);
+ spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock);
+
+ batadv_orig_ifinfo_put(orig_ifinfo);
+}
+
+/**
+ * batadv_iv_ogm_process() - process an incoming batman iv OGM
+ * @skb: the skb containing the OGM
+ * @ogm_offset: offset to the OGM which should be processed (for aggregates)
+ * @if_incoming: the interface where this packet was received
+ */
+static void batadv_iv_ogm_process(const struct sk_buff *skb, int ogm_offset,
+ struct batadv_hard_iface *if_incoming)
+{
+ struct batadv_priv *bat_priv = netdev_priv(if_incoming->mesh_iface);
+ struct batadv_orig_node *orig_neigh_node, *orig_node;
+ struct batadv_hard_iface *hard_iface;
+ struct batadv_ogm_packet *ogm_packet;
+ u32 if_incoming_seqno;
+ bool has_directlink_flag;
+ struct ethhdr *ethhdr;
+ bool is_my_oldorig = false;
+ bool is_my_addr = false;
+ bool is_my_orig = false;
+ struct list_head *iter;
+
+ ogm_packet = (struct batadv_ogm_packet *)(skb->data + ogm_offset);
+ ethhdr = eth_hdr(skb);
+
+ /* Silently drop when the batman packet is actually not a
+ * correct packet.
+ *
+ * This might happen if a packet is padded (e.g. Ethernet has a
+ * minimum frame length of 64 byte) and the aggregation interprets
+ * it as an additional length.
+ *
+ * TODO: A more sane solution would be to have a bit in the
+ * batadv_ogm_packet to detect whether the packet is the last
+ * packet in an aggregation. Here we expect that the padding
+ * is always zero (or not 0x01)
+ */
+ if (ogm_packet->packet_type != BATADV_IV_OGM)
+ return;
+
+ /* could be changed by schedule_own_packet() */
+ if_incoming_seqno = atomic_read(&if_incoming->bat_iv.ogm_seqno);
+
+ if (ogm_packet->flags & BATADV_DIRECTLINK)
+ has_directlink_flag = true;
+ else
+ has_directlink_flag = false;
+
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Received BATMAN packet via NB: %pM, IF: %s [%pM] (from OG: %pM, via prev OG: %pM, seqno %u, tq %d, TTL %d, V %d, IDF %d)\n",
+ ethhdr->h_source, if_incoming->net_dev->name,
+ if_incoming->net_dev->dev_addr, ogm_packet->orig,
+ ogm_packet->prev_sender, ntohl(ogm_packet->seqno),
+ ogm_packet->tq, ogm_packet->ttl,
+ ogm_packet->version, has_directlink_flag);
+
+ rcu_read_lock();
+
+ netdev_for_each_lower_private_rcu(if_incoming->mesh_iface, hard_iface, iter) {
+ if (hard_iface->if_status != BATADV_IF_ACTIVE)
+ continue;
+
+ if (batadv_compare_eth(ethhdr->h_source,
+ hard_iface->net_dev->dev_addr))
+ is_my_addr = true;
+
+ if (batadv_compare_eth(ogm_packet->orig,
+ hard_iface->net_dev->dev_addr))
+ is_my_orig = true;
+
+ if (batadv_compare_eth(ogm_packet->prev_sender,
+ hard_iface->net_dev->dev_addr))
+ is_my_oldorig = true;
+ }
+ rcu_read_unlock();
+
+ if (is_my_addr) {
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Drop packet: received my own broadcast (sender: %pM)\n",
+ ethhdr->h_source);
+ return;
+ }
+
+ if (is_my_orig) {
+ orig_neigh_node = batadv_iv_ogm_orig_get(bat_priv,
+ ethhdr->h_source);
+ if (!orig_neigh_node)
+ return;
+
+ batadv_iv_ogm_process_reply(ogm_packet, if_incoming,
+ orig_neigh_node, if_incoming_seqno);
+
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Drop packet: originator packet from myself (via neighbor)\n");
+ batadv_orig_node_put(orig_neigh_node);
+ return;
+ }
+
+ if (is_my_oldorig) {
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Drop packet: ignoring all rebroadcast echos (sender: %pM)\n",
+ ethhdr->h_source);
+ return;
+ }
+
+ if (ogm_packet->flags & BATADV_NOT_BEST_NEXT_HOP) {
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Drop packet: ignoring all packets not forwarded from the best next hop (sender: %pM)\n",
+ ethhdr->h_source);
+ return;
+ }
+
+ orig_node = batadv_iv_ogm_orig_get(bat_priv, ogm_packet->orig);
+ if (!orig_node)
+ return;
+
+ batadv_iv_ogm_process_per_outif(skb, ogm_offset, orig_node,
+ if_incoming, BATADV_IF_DEFAULT);
+
+ rcu_read_lock();
+ netdev_for_each_lower_private_rcu(bat_priv->mesh_iface, hard_iface, iter) {
+ if (hard_iface->if_status != BATADV_IF_ACTIVE)
+ continue;
+
+ if (!kref_get_unless_zero(&hard_iface->refcount))
+ continue;
+
+ batadv_iv_ogm_process_per_outif(skb, ogm_offset, orig_node,
+ if_incoming, hard_iface);
+
+ batadv_hardif_put(hard_iface);
+ }
+ rcu_read_unlock();
+
+ batadv_orig_node_put(orig_node);
+}
+
+static void batadv_iv_send_outstanding_bat_ogm_packet(struct work_struct *work)
+{
+ struct delayed_work *delayed_work;
+ struct batadv_forw_packet *forw_packet;
+ struct batadv_priv *bat_priv;
+ bool dropped = false;
+
+ delayed_work = to_delayed_work(work);
+ forw_packet = container_of(delayed_work, struct batadv_forw_packet,
+ delayed_work);
+ bat_priv = netdev_priv(forw_packet->if_incoming->mesh_iface);
+
+ if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_DEACTIVATING) {
+ dropped = true;
+ goto out;
+ }
+
+ batadv_iv_ogm_emit(forw_packet);
+
+ /* we have to have at least one packet in the queue to determine the
+ * queues wake up time unless we are shutting down.
+ *
+ * only re-schedule if this is the "original" copy, e.g. the OGM of the
+ * primary interface should only be rescheduled once per period, but
+ * this function will be called for the forw_packet instances of the
+ * other secondary interfaces as well.
+ */
+ if (forw_packet->own &&
+ forw_packet->if_incoming == forw_packet->if_outgoing)
+ batadv_iv_ogm_schedule(forw_packet->if_incoming);
+
+out:
+ /* do we get something for free()? */
+ if (batadv_forw_packet_steal(forw_packet,
+ &bat_priv->forw_bat_list_lock))
+ batadv_forw_packet_free(forw_packet, dropped);
+}
+
+static int batadv_iv_ogm_receive(struct sk_buff *skb,
+ struct batadv_hard_iface *if_incoming)
+{
+ struct batadv_priv *bat_priv = netdev_priv(if_incoming->mesh_iface);
+ struct batadv_ogm_packet *ogm_packet;
+ u8 *packet_pos;
+ int ogm_offset;
+ bool res;
+ int ret = NET_RX_DROP;
+
+ res = batadv_check_management_packet(skb, if_incoming, BATADV_OGM_HLEN);
+ if (!res)
+ goto free_skb;
+
+ /* did we receive a B.A.T.M.A.N. IV OGM packet on an interface
+ * that does not have B.A.T.M.A.N. IV enabled ?
+ */
+ if (bat_priv->algo_ops->iface.enable != batadv_iv_ogm_iface_enable)
+ goto free_skb;
+
+ batadv_inc_counter(bat_priv, BATADV_CNT_MGMT_RX);
+ batadv_add_counter(bat_priv, BATADV_CNT_MGMT_RX_BYTES,
+ skb->len + ETH_HLEN);
+
+ ogm_offset = 0;
+ ogm_packet = (struct batadv_ogm_packet *)skb->data;
+
+ /* unpack the aggregated packets and process them one by one */
+ while (batadv_iv_ogm_aggr_packet(ogm_offset, skb_headlen(skb),
+ ogm_packet)) {
+ batadv_iv_ogm_process(skb, ogm_offset, if_incoming);
+
+ ogm_offset += BATADV_OGM_HLEN;
+ ogm_offset += ntohs(ogm_packet->tvlv_len);
+
+ packet_pos = skb->data + ogm_offset;
+ ogm_packet = (struct batadv_ogm_packet *)packet_pos;
+ }
+
+ ret = NET_RX_SUCCESS;
+
+free_skb:
+ if (ret == NET_RX_SUCCESS)
+ consume_skb(skb);
+ else
+ kfree_skb(skb);
+
+ return ret;
+}
+
+/**
+ * batadv_iv_ogm_neigh_get_tq_avg() - Get the TQ average for a neighbour on a
+ * given outgoing interface.
+ * @neigh_node: Neighbour of interest
+ * @if_outgoing: Outgoing interface of interest
+ * @tq_avg: Pointer of where to store the TQ average
+ *
+ * Return: False if no average TQ available, otherwise true.
+ */
+static bool
+batadv_iv_ogm_neigh_get_tq_avg(struct batadv_neigh_node *neigh_node,
+ struct batadv_hard_iface *if_outgoing,
+ u8 *tq_avg)
+{
+ struct batadv_neigh_ifinfo *n_ifinfo;
+
+ n_ifinfo = batadv_neigh_ifinfo_get(neigh_node, if_outgoing);
+ if (!n_ifinfo)
+ return false;
+
+ *tq_avg = n_ifinfo->bat_iv.tq_avg;
+ batadv_neigh_ifinfo_put(n_ifinfo);
+
+ return true;
+}
+
+/**
+ * batadv_iv_ogm_orig_dump_subentry() - Dump an originator subentry into a
+ * message
+ * @msg: Netlink message to dump into
+ * @portid: Port making netlink request
+ * @seq: Sequence number of netlink message
+ * @bat_priv: The bat priv with all the mesh interface information
+ * @if_outgoing: Limit dump to entries with this outgoing interface
+ * @orig_node: Originator to dump
+ * @neigh_node: Single hops neighbour
+ * @best: Is the best originator
+ *
+ * Return: Error code, or 0 on success
+ */
+static int
+batadv_iv_ogm_orig_dump_subentry(struct sk_buff *msg, u32 portid, u32 seq,
+ struct batadv_priv *bat_priv,
+ struct batadv_hard_iface *if_outgoing,
+ struct batadv_orig_node *orig_node,
+ struct batadv_neigh_node *neigh_node,
+ bool best)
+{
+ void *hdr;
+ u8 tq_avg;
+ unsigned int last_seen_msecs;
+
+ last_seen_msecs = jiffies_to_msecs(jiffies - orig_node->last_seen);
+
+ if (!batadv_iv_ogm_neigh_get_tq_avg(neigh_node, if_outgoing, &tq_avg))
+ return 0;
+
+ if (if_outgoing != BATADV_IF_DEFAULT &&
+ if_outgoing != neigh_node->if_incoming)
+ return 0;
+
+ hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family,
+ NLM_F_MULTI, BATADV_CMD_GET_ORIGINATORS);
+ if (!hdr)
+ return -ENOBUFS;
+
+ if (nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN,
+ orig_node->orig) ||
+ nla_put(msg, BATADV_ATTR_NEIGH_ADDRESS, ETH_ALEN,
+ neigh_node->addr) ||
+ nla_put_string(msg, BATADV_ATTR_HARD_IFNAME,
+ neigh_node->if_incoming->net_dev->name) ||
+ nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX,
+ neigh_node->if_incoming->net_dev->ifindex) ||
+ nla_put_u8(msg, BATADV_ATTR_TQ, tq_avg) ||
+ nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS,
+ last_seen_msecs))
+ goto nla_put_failure;
+
+ if (best && nla_put_flag(msg, BATADV_ATTR_FLAG_BEST))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+ nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+/**
+ * batadv_iv_ogm_orig_dump_entry() - Dump an originator entry into a message
+ * @msg: Netlink message to dump into
+ * @portid: Port making netlink request
+ * @seq: Sequence number of netlink message
+ * @bat_priv: The bat priv with all the mesh interface information
+ * @if_outgoing: Limit dump to entries with this outgoing interface
+ * @orig_node: Originator to dump
+ * @sub_s: Number of sub entries to skip
+ *
+ * This function assumes the caller holds rcu_read_lock().
+ *
+ * Return: Error code, or 0 on success
+ */
+static int
+batadv_iv_ogm_orig_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
+ struct batadv_priv *bat_priv,
+ struct batadv_hard_iface *if_outgoing,
+ struct batadv_orig_node *orig_node, int *sub_s)
+{
+ struct batadv_neigh_node *neigh_node_best;
+ struct batadv_neigh_node *neigh_node;
+ int sub = 0;
+ bool best;
+ u8 tq_avg_best;
+
+ neigh_node_best = batadv_orig_router_get(orig_node, if_outgoing);
+ if (!neigh_node_best)
+ goto out;
+
+ if (!batadv_iv_ogm_neigh_get_tq_avg(neigh_node_best, if_outgoing,
+ &tq_avg_best))
+ goto out;
+
+ if (tq_avg_best == 0)
+ goto out;
+
+ hlist_for_each_entry_rcu(neigh_node, &orig_node->neigh_list, list) {
+ if (sub++ < *sub_s)
+ continue;
+
+ best = (neigh_node == neigh_node_best);
+
+ if (batadv_iv_ogm_orig_dump_subentry(msg, portid, seq,
+ bat_priv, if_outgoing,
+ orig_node, neigh_node,
+ best)) {
+ batadv_neigh_node_put(neigh_node_best);
+
+ *sub_s = sub - 1;
+ return -EMSGSIZE;
+ }
+ }
+
+ out:
+ batadv_neigh_node_put(neigh_node_best);
+
+ *sub_s = 0;
+ return 0;
+}
+
+/**
+ * batadv_iv_ogm_orig_dump_bucket() - Dump an originator bucket into a
+ * message
+ * @msg: Netlink message to dump into
+ * @portid: Port making netlink request
+ * @seq: Sequence number of netlink message
+ * @bat_priv: The bat priv with all the mesh interface information
+ * @if_outgoing: Limit dump to entries with this outgoing interface
+ * @head: Bucket to be dumped
+ * @idx_s: Number of entries to be skipped
+ * @sub: Number of sub entries to be skipped
+ *
+ * Return: Error code, or 0 on success
+ */
+static int
+batadv_iv_ogm_orig_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq,
+ struct batadv_priv *bat_priv,
+ struct batadv_hard_iface *if_outgoing,
+ struct hlist_head *head, int *idx_s, int *sub)
+{
+ struct batadv_orig_node *orig_node;
+ int idx = 0;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(orig_node, head, hash_entry) {
+ if (idx++ < *idx_s)
+ continue;
+
+ if (batadv_iv_ogm_orig_dump_entry(msg, portid, seq, bat_priv,
+ if_outgoing, orig_node,
+ sub)) {
+ rcu_read_unlock();
+ *idx_s = idx - 1;
+ return -EMSGSIZE;
+ }
+ }
+ rcu_read_unlock();
+
+ *idx_s = 0;
+ *sub = 0;
+ return 0;
+}
+
+/**
+ * batadv_iv_ogm_orig_dump() - Dump the originators into a message
+ * @msg: Netlink message to dump into
+ * @cb: Control block containing additional options
+ * @bat_priv: The bat priv with all the mesh interface information
+ * @if_outgoing: Limit dump to entries with this outgoing interface
+ */
+static void
+batadv_iv_ogm_orig_dump(struct sk_buff *msg, struct netlink_callback *cb,
+ struct batadv_priv *bat_priv,
+ struct batadv_hard_iface *if_outgoing)
+{
+ struct batadv_hashtable *hash = bat_priv->orig_hash;
+ struct hlist_head *head;
+ int bucket = cb->args[0];
+ int idx = cb->args[1];
+ int sub = cb->args[2];
+ int portid = NETLINK_CB(cb->skb).portid;
+
+ while (bucket < hash->size) {
+ head = &hash->table[bucket];
+
+ if (batadv_iv_ogm_orig_dump_bucket(msg, portid,
+ cb->nlh->nlmsg_seq,
+ bat_priv, if_outgoing, head,
+ &idx, &sub))
+ break;
+
+ bucket++;
+ }
+
+ cb->args[0] = bucket;
+ cb->args[1] = idx;
+ cb->args[2] = sub;
+}
+
+/**
+ * batadv_iv_ogm_neigh_diff() - calculate tq difference of two neighbors
+ * @neigh1: the first neighbor object of the comparison
+ * @if_outgoing1: outgoing interface for the first neighbor
+ * @neigh2: the second neighbor object of the comparison
+ * @if_outgoing2: outgoing interface for the second neighbor
+ * @diff: pointer to integer receiving the calculated difference
+ *
+ * The content of *@diff is only valid when this function returns true.
+ * It is less, equal to or greater than 0 if the metric via neigh1 is lower,
+ * the same as or higher than the metric via neigh2
+ *
+ * Return: true when the difference could be calculated, false otherwise
+ */
+static bool batadv_iv_ogm_neigh_diff(struct batadv_neigh_node *neigh1,
+ struct batadv_hard_iface *if_outgoing1,
+ struct batadv_neigh_node *neigh2,
+ struct batadv_hard_iface *if_outgoing2,
+ int *diff)
+{
+ struct batadv_neigh_ifinfo *neigh1_ifinfo, *neigh2_ifinfo;
+ u8 tq1, tq2;
+ bool ret = true;
+
+ neigh1_ifinfo = batadv_neigh_ifinfo_get(neigh1, if_outgoing1);
+ neigh2_ifinfo = batadv_neigh_ifinfo_get(neigh2, if_outgoing2);
+
+ if (!neigh1_ifinfo || !neigh2_ifinfo) {
+ ret = false;
+ goto out;
+ }
+
+ tq1 = neigh1_ifinfo->bat_iv.tq_avg;
+ tq2 = neigh2_ifinfo->bat_iv.tq_avg;
+ *diff = (int)tq1 - (int)tq2;
+
+out:
+ batadv_neigh_ifinfo_put(neigh1_ifinfo);
+ batadv_neigh_ifinfo_put(neigh2_ifinfo);
+
+ return ret;
+}
+
+/**
+ * batadv_iv_ogm_neigh_dump_neigh() - Dump a neighbour into a netlink message
+ * @msg: Netlink message to dump into
+ * @portid: Port making netlink request
+ * @seq: Sequence number of netlink message
+ * @hardif_neigh: Neighbour to be dumped
+ *
+ * Return: Error code, or 0 on success
+ */
+static int
+batadv_iv_ogm_neigh_dump_neigh(struct sk_buff *msg, u32 portid, u32 seq,
+ struct batadv_hardif_neigh_node *hardif_neigh)
+{
+ void *hdr;
+ unsigned int last_seen_msecs;
+
+ last_seen_msecs = jiffies_to_msecs(jiffies - hardif_neigh->last_seen);
+
+ hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family,
+ NLM_F_MULTI, BATADV_CMD_GET_NEIGHBORS);
+ if (!hdr)
+ return -ENOBUFS;
+
+ if (nla_put(msg, BATADV_ATTR_NEIGH_ADDRESS, ETH_ALEN,
+ hardif_neigh->addr) ||
+ nla_put_string(msg, BATADV_ATTR_HARD_IFNAME,
+ hardif_neigh->if_incoming->net_dev->name) ||
+ nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX,
+ hardif_neigh->if_incoming->net_dev->ifindex) ||
+ nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS,
+ last_seen_msecs))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+ nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+/**
+ * batadv_iv_ogm_neigh_dump_hardif() - Dump the neighbours of a hard interface
+ * into a message
+ * @msg: Netlink message to dump into
+ * @portid: Port making netlink request
+ * @seq: Sequence number of netlink message
+ * @bat_priv: The bat priv with all the mesh interface information
+ * @hard_iface: Hard interface to dump the neighbours for
+ * @idx_s: Number of entries to skip
+ *
+ * This function assumes the caller holds rcu_read_lock().
+ *
+ * Return: Error code, or 0 on success
+ */
+static int
+batadv_iv_ogm_neigh_dump_hardif(struct sk_buff *msg, u32 portid, u32 seq,
+ struct batadv_priv *bat_priv,
+ struct batadv_hard_iface *hard_iface,
+ int *idx_s)
+{
+ struct batadv_hardif_neigh_node *hardif_neigh;
+ int idx = 0;
+
+ hlist_for_each_entry_rcu(hardif_neigh,
+ &hard_iface->neigh_list, list) {
+ if (idx++ < *idx_s)
+ continue;
+
+ if (batadv_iv_ogm_neigh_dump_neigh(msg, portid, seq,
+ hardif_neigh)) {
+ *idx_s = idx - 1;
+ return -EMSGSIZE;
+ }
+ }
+
+ *idx_s = 0;
+ return 0;
+}
+
+/**
+ * batadv_iv_ogm_neigh_dump() - Dump the neighbours into a message
+ * @msg: Netlink message to dump into
+ * @cb: Control block containing additional options
+ * @bat_priv: The bat priv with all the mesh interface information
+ * @single_hardif: Limit dump to this hard interface
+ */
+static void
+batadv_iv_ogm_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb,
+ struct batadv_priv *bat_priv,
+ struct batadv_hard_iface *single_hardif)
+{
+ struct batadv_hard_iface *hard_iface;
+ struct list_head *iter;
+ int i_hardif = 0;
+ int i_hardif_s = cb->args[0];
+ int idx = cb->args[1];
+ int portid = NETLINK_CB(cb->skb).portid;
+
+ rcu_read_lock();
+ if (single_hardif) {
+ if (i_hardif_s == 0) {
+ if (batadv_iv_ogm_neigh_dump_hardif(msg, portid,
+ cb->nlh->nlmsg_seq,
+ bat_priv,
+ single_hardif,
+ &idx) == 0)
+ i_hardif++;
+ }
+ } else {
+ netdev_for_each_lower_private_rcu(bat_priv->mesh_iface, hard_iface, iter) {
+ if (i_hardif++ < i_hardif_s)
+ continue;
+
+ if (batadv_iv_ogm_neigh_dump_hardif(msg, portid,
+ cb->nlh->nlmsg_seq,
+ bat_priv,
+ hard_iface, &idx)) {
+ i_hardif--;
+ break;
+ }
+ }
+ }
+ rcu_read_unlock();
+
+ cb->args[0] = i_hardif;
+ cb->args[1] = idx;
+}
+
+/**
+ * batadv_iv_ogm_neigh_cmp() - compare the metrics of two neighbors
+ * @neigh1: the first neighbor object of the comparison
+ * @if_outgoing1: outgoing interface for the first neighbor
+ * @neigh2: the second neighbor object of the comparison
+ * @if_outgoing2: outgoing interface for the second neighbor
+ *
+ * Return: a value less, equal to or greater than 0 if the metric via neigh1 is
+ * lower, the same as or higher than the metric via neigh2
+ */
+static int batadv_iv_ogm_neigh_cmp(struct batadv_neigh_node *neigh1,
+ struct batadv_hard_iface *if_outgoing1,
+ struct batadv_neigh_node *neigh2,
+ struct batadv_hard_iface *if_outgoing2)
+{
+ bool ret;
+ int diff;
+
+ ret = batadv_iv_ogm_neigh_diff(neigh1, if_outgoing1, neigh2,
+ if_outgoing2, &diff);
+ if (!ret)
+ return 0;
+
+ return diff;
+}
+
+/**
+ * batadv_iv_ogm_neigh_is_sob() - check if neigh1 is similarly good or better
+ * than neigh2 from the metric prospective
+ * @neigh1: the first neighbor object of the comparison
+ * @if_outgoing1: outgoing interface for the first neighbor
+ * @neigh2: the second neighbor object of the comparison
+ * @if_outgoing2: outgoing interface for the second neighbor
+ *
+ * Return: true if the metric via neigh1 is equally good or better than
+ * the metric via neigh2, false otherwise.
+ */
+static bool
+batadv_iv_ogm_neigh_is_sob(struct batadv_neigh_node *neigh1,
+ struct batadv_hard_iface *if_outgoing1,
+ struct batadv_neigh_node *neigh2,
+ struct batadv_hard_iface *if_outgoing2)
+{
+ bool ret;
+ int diff;
+
+ ret = batadv_iv_ogm_neigh_diff(neigh1, if_outgoing1, neigh2,
+ if_outgoing2, &diff);
+ if (!ret)
+ return false;
+
+ ret = diff > -BATADV_TQ_SIMILARITY_THRESHOLD;
+ return ret;
+}
+
+static void batadv_iv_iface_enabled(struct batadv_hard_iface *hard_iface)
+{
+ /* begin scheduling originator messages on that interface */
+ batadv_iv_ogm_schedule(hard_iface);
+}
+
+/**
+ * batadv_iv_init_sel_class() - initialize GW selection class
+ * @bat_priv: the bat priv with all the mesh interface information
+ */
+static void batadv_iv_init_sel_class(struct batadv_priv *bat_priv)
+{
+ /* set default TQ difference threshold to 20 */
+ atomic_set(&bat_priv->gw.sel_class, 20);
+}
+
+static struct batadv_gw_node *
+batadv_iv_gw_get_best_gw_node(struct batadv_priv *bat_priv)
+{
+ struct batadv_neigh_node *router;
+ struct batadv_neigh_ifinfo *router_ifinfo;
+ struct batadv_gw_node *gw_node, *curr_gw = NULL;
+ u64 max_gw_factor = 0;
+ u64 tmp_gw_factor = 0;
+ u8 max_tq = 0;
+ u8 tq_avg;
+ struct batadv_orig_node *orig_node;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.gateway_list, list) {
+ orig_node = gw_node->orig_node;
+ router = batadv_orig_router_get(orig_node, BATADV_IF_DEFAULT);
+ if (!router)
+ continue;
+
+ router_ifinfo = batadv_neigh_ifinfo_get(router,
+ BATADV_IF_DEFAULT);
+ if (!router_ifinfo)
+ goto next;
+
+ if (!kref_get_unless_zero(&gw_node->refcount))
+ goto next;
+
+ tq_avg = router_ifinfo->bat_iv.tq_avg;
+
+ switch (atomic_read(&bat_priv->gw.sel_class)) {
+ case 1: /* fast connection */
+ tmp_gw_factor = tq_avg * tq_avg;
+ tmp_gw_factor *= gw_node->bandwidth_down;
+ tmp_gw_factor *= 100 * 100;
+ tmp_gw_factor >>= 18;
+
+ if (tmp_gw_factor > max_gw_factor ||
+ (tmp_gw_factor == max_gw_factor &&
+ tq_avg > max_tq)) {
+ batadv_gw_node_put(curr_gw);
+ curr_gw = gw_node;
+ kref_get(&curr_gw->refcount);
+ }
+ break;
+
+ default: /* 2: stable connection (use best statistic)
+ * 3: fast-switch (use best statistic but change as
+ * soon as a better gateway appears)
+ * XX: late-switch (use best statistic but change as
+ * soon as a better gateway appears which has
+ * $routing_class more tq points)
+ */
+ if (tq_avg > max_tq) {
+ batadv_gw_node_put(curr_gw);
+ curr_gw = gw_node;
+ kref_get(&curr_gw->refcount);
+ }
+ break;
+ }
+
+ if (tq_avg > max_tq)
+ max_tq = tq_avg;
+
+ if (tmp_gw_factor > max_gw_factor)
+ max_gw_factor = tmp_gw_factor;
+
+ batadv_gw_node_put(gw_node);
+
+next:
+ batadv_neigh_node_put(router);
+ batadv_neigh_ifinfo_put(router_ifinfo);
+ }
+ rcu_read_unlock();
+
+ return curr_gw;
+}
+
+static bool batadv_iv_gw_is_eligible(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *curr_gw_orig,
+ struct batadv_orig_node *orig_node)
+{
+ struct batadv_neigh_ifinfo *router_orig_ifinfo = NULL;
+ struct batadv_neigh_ifinfo *router_gw_ifinfo = NULL;
+ struct batadv_neigh_node *router_gw = NULL;
+ struct batadv_neigh_node *router_orig = NULL;
+ u8 gw_tq_avg, orig_tq_avg;
+ bool ret = false;
+
+ /* dynamic re-election is performed only on fast or late switch */
+ if (atomic_read(&bat_priv->gw.sel_class) <= 2)
+ return false;
+
+ router_gw = batadv_orig_router_get(curr_gw_orig, BATADV_IF_DEFAULT);
+ if (!router_gw) {
+ ret = true;
+ goto out;
+ }
+
+ router_gw_ifinfo = batadv_neigh_ifinfo_get(router_gw,
+ BATADV_IF_DEFAULT);
+ if (!router_gw_ifinfo) {
+ ret = true;
+ goto out;
+ }
+
+ router_orig = batadv_orig_router_get(orig_node, BATADV_IF_DEFAULT);
+ if (!router_orig)
+ goto out;
+
+ router_orig_ifinfo = batadv_neigh_ifinfo_get(router_orig,
+ BATADV_IF_DEFAULT);
+ if (!router_orig_ifinfo)
+ goto out;
+
+ gw_tq_avg = router_gw_ifinfo->bat_iv.tq_avg;
+ orig_tq_avg = router_orig_ifinfo->bat_iv.tq_avg;
+
+ /* the TQ value has to be better */
+ if (orig_tq_avg < gw_tq_avg)
+ goto out;
+
+ /* if the routing class is greater than 3 the value tells us how much
+ * greater the TQ value of the new gateway must be
+ */
+ if ((atomic_read(&bat_priv->gw.sel_class) > 3) &&
+ (orig_tq_avg - gw_tq_avg < atomic_read(&bat_priv->gw.sel_class)))
+ goto out;
+
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Restarting gateway selection: better gateway found (tq curr: %i, tq new: %i)\n",
+ gw_tq_avg, orig_tq_avg);
+
+ ret = true;
+out:
+ batadv_neigh_ifinfo_put(router_gw_ifinfo);
+ batadv_neigh_ifinfo_put(router_orig_ifinfo);
+ batadv_neigh_node_put(router_gw);
+ batadv_neigh_node_put(router_orig);
+
+ return ret;
+}
+
+/**
+ * batadv_iv_gw_dump_entry() - Dump a gateway into a message
+ * @msg: Netlink message to dump into
+ * @portid: Port making netlink request
+ * @cb: Control block containing additional options
+ * @bat_priv: The bat priv with all the mesh interface information
+ * @gw_node: Gateway to be dumped
+ *
+ * Return: Error code, or 0 on success
+ */
+static int batadv_iv_gw_dump_entry(struct sk_buff *msg, u32 portid,
+ struct netlink_callback *cb,
+ struct batadv_priv *bat_priv,
+ struct batadv_gw_node *gw_node)
+{
+ struct batadv_neigh_ifinfo *router_ifinfo = NULL;
+ struct batadv_neigh_node *router;
+ struct batadv_gw_node *curr_gw = NULL;
+ int ret = 0;
+ void *hdr;
+
+ router = batadv_orig_router_get(gw_node->orig_node, BATADV_IF_DEFAULT);
+ if (!router)
+ goto out;
+
+ router_ifinfo = batadv_neigh_ifinfo_get(router, BATADV_IF_DEFAULT);
+ if (!router_ifinfo)
+ goto out;
+
+ curr_gw = batadv_gw_get_selected_gw_node(bat_priv);
+
+ hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq,
+ &batadv_netlink_family, NLM_F_MULTI,
+ BATADV_CMD_GET_GATEWAYS);
+ if (!hdr) {
+ ret = -ENOBUFS;
+ goto out;
+ }
+
+ genl_dump_check_consistent(cb, hdr);
+
+ ret = -EMSGSIZE;
+
+ if (curr_gw == gw_node)
+ if (nla_put_flag(msg, BATADV_ATTR_FLAG_BEST)) {
+ genlmsg_cancel(msg, hdr);
+ goto out;
+ }
+
+ if (nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN,
+ gw_node->orig_node->orig) ||
+ nla_put_u8(msg, BATADV_ATTR_TQ, router_ifinfo->bat_iv.tq_avg) ||
+ nla_put(msg, BATADV_ATTR_ROUTER, ETH_ALEN,
+ router->addr) ||
+ nla_put_string(msg, BATADV_ATTR_HARD_IFNAME,
+ router->if_incoming->net_dev->name) ||
+ nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX,
+ router->if_incoming->net_dev->ifindex) ||
+ nla_put_u32(msg, BATADV_ATTR_BANDWIDTH_DOWN,
+ gw_node->bandwidth_down) ||
+ nla_put_u32(msg, BATADV_ATTR_BANDWIDTH_UP,
+ gw_node->bandwidth_up)) {
+ genlmsg_cancel(msg, hdr);
+ goto out;
+ }
+
+ genlmsg_end(msg, hdr);
+ ret = 0;
+
+out:
+ batadv_gw_node_put(curr_gw);
+ batadv_neigh_ifinfo_put(router_ifinfo);
+ batadv_neigh_node_put(router);
+ return ret;
+}
+
+/**
+ * batadv_iv_gw_dump() - Dump gateways into a message
+ * @msg: Netlink message to dump into
+ * @cb: Control block containing additional options
+ * @bat_priv: The bat priv with all the mesh interface information
+ */
+static void batadv_iv_gw_dump(struct sk_buff *msg, struct netlink_callback *cb,
+ struct batadv_priv *bat_priv)
+{
+ int portid = NETLINK_CB(cb->skb).portid;
+ struct batadv_gw_node *gw_node;
+ int idx_skip = cb->args[0];
+ int idx = 0;
+
+ spin_lock_bh(&bat_priv->gw.list_lock);
+ cb->seq = bat_priv->gw.generation << 1 | 1;
+
+ hlist_for_each_entry(gw_node, &bat_priv->gw.gateway_list, list) {
+ if (idx++ < idx_skip)
+ continue;
+
+ if (batadv_iv_gw_dump_entry(msg, portid, cb, bat_priv,
+ gw_node)) {
+ idx_skip = idx - 1;
+ goto unlock;
+ }
+ }
+
+ idx_skip = idx;
+unlock:
+ spin_unlock_bh(&bat_priv->gw.list_lock);
+
+ cb->args[0] = idx_skip;
+}
+
+static struct batadv_algo_ops batadv_batman_iv __read_mostly = {
+ .name = "BATMAN_IV",
+ .iface = {
+ .enable = batadv_iv_ogm_iface_enable,
+ .enabled = batadv_iv_iface_enabled,
+ .disable = batadv_iv_ogm_iface_disable,
+ .update_mac = batadv_iv_ogm_iface_update_mac,
+ .primary_set = batadv_iv_ogm_primary_iface_set,
+ },
+ .neigh = {
+ .cmp = batadv_iv_ogm_neigh_cmp,
+ .is_similar_or_better = batadv_iv_ogm_neigh_is_sob,
+ .dump = batadv_iv_ogm_neigh_dump,
+ },
+ .orig = {
+ .dump = batadv_iv_ogm_orig_dump,
+ },
+ .gw = {
+ .init_sel_class = batadv_iv_init_sel_class,
+ .sel_class_max = BATADV_TQ_MAX_VALUE,
+ .get_best_gw_node = batadv_iv_gw_get_best_gw_node,
+ .is_eligible = batadv_iv_gw_is_eligible,
+ .dump = batadv_iv_gw_dump,
+ },
+};
+
+/**
+ * batadv_iv_init() - B.A.T.M.A.N. IV initialization function
+ *
+ * Return: 0 on success or negative error number in case of failure
+ */
+int __init batadv_iv_init(void)
+{
+ int ret;
+
+ /* batman originator packet */
+ ret = batadv_recv_handler_register(BATADV_IV_OGM,
+ batadv_iv_ogm_receive);
+ if (ret < 0)
+ goto out;
+
+ ret = batadv_algo_register(&batadv_batman_iv);
+ if (ret < 0)
+ goto handler_unregister;
+
+ goto out;
+
+handler_unregister:
+ batadv_recv_handler_unregister(BATADV_IV_OGM);
+out:
+ return ret;
+}
diff --git a/net/batman-adv/bat_iv_ogm.h b/net/batman-adv/bat_iv_ogm.h
new file mode 100644
index 000000000000..04b01bd684e8
--- /dev/null
+++ b/net/batman-adv/bat_iv_ogm.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner, Simon Wunderlich
+ */
+
+#ifndef _NET_BATMAN_ADV_BAT_IV_OGM_H_
+#define _NET_BATMAN_ADV_BAT_IV_OGM_H_
+
+#include "main.h"
+
+int batadv_iv_init(void);
+
+#endif /* _NET_BATMAN_ADV_BAT_IV_OGM_H_ */
diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c
new file mode 100644
index 000000000000..de9444714264
--- /dev/null
+++ b/net/batman-adv/bat_v.c
@@ -0,0 +1,887 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) B.A.T.M.A.N. contributors:
+ *
+ * Linus Lüssing, Marek Lindner
+ */
+
+#include "bat_v.h"
+#include "main.h"
+
+#include <linux/atomic.h>
+#include <linux/cache.h>
+#include <linux/errno.h>
+#include <linux/if_ether.h>
+#include <linux/init.h>
+#include <linux/jiffies.h>
+#include <linux/kref.h>
+#include <linux/limits.h>
+#include <linux/list.h>
+#include <linux/minmax.h>
+#include <linux/netdevice.h>
+#include <linux/netlink.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
+#include <linux/skbuff.h>
+#include <linux/spinlock.h>
+#include <linux/stddef.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+#include <net/genetlink.h>
+#include <net/netlink.h>
+#include <uapi/linux/batadv_packet.h>
+#include <uapi/linux/batman_adv.h>
+
+#include "bat_algo.h"
+#include "bat_v_elp.h"
+#include "bat_v_ogm.h"
+#include "gateway_client.h"
+#include "hard-interface.h"
+#include "hash.h"
+#include "log.h"
+#include "netlink.h"
+#include "originator.h"
+
+static void batadv_v_iface_activate(struct batadv_hard_iface *hard_iface)
+{
+ struct batadv_priv *bat_priv = netdev_priv(hard_iface->mesh_iface);
+ struct batadv_hard_iface *primary_if;
+
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+
+ if (primary_if) {
+ batadv_v_elp_iface_activate(primary_if, hard_iface);
+ batadv_hardif_put(primary_if);
+ }
+
+ /* B.A.T.M.A.N. V does not use any queuing mechanism, therefore it can
+ * set the interface as ACTIVE right away, without any risk of race
+ * condition
+ */
+ if (hard_iface->if_status == BATADV_IF_TO_BE_ACTIVATED)
+ hard_iface->if_status = BATADV_IF_ACTIVE;
+}
+
+static int batadv_v_iface_enable(struct batadv_hard_iface *hard_iface)
+{
+ int ret;
+
+ ret = batadv_v_elp_iface_enable(hard_iface);
+ if (ret < 0)
+ return ret;
+
+ ret = batadv_v_ogm_iface_enable(hard_iface);
+ if (ret < 0)
+ batadv_v_elp_iface_disable(hard_iface);
+
+ return ret;
+}
+
+static void batadv_v_iface_disable(struct batadv_hard_iface *hard_iface)
+{
+ batadv_v_ogm_iface_disable(hard_iface);
+ batadv_v_elp_iface_disable(hard_iface);
+}
+
+static void batadv_v_primary_iface_set(struct batadv_hard_iface *hard_iface)
+{
+ batadv_v_elp_primary_iface_set(hard_iface);
+ batadv_v_ogm_primary_iface_set(hard_iface);
+}
+
+/**
+ * batadv_v_iface_update_mac() - react to hard-interface MAC address change
+ * @hard_iface: the modified interface
+ *
+ * If the modified interface is the primary one, update the originator
+ * address in the ELP and OGM messages to reflect the new MAC address.
+ */
+static void batadv_v_iface_update_mac(struct batadv_hard_iface *hard_iface)
+{
+ struct batadv_priv *bat_priv = netdev_priv(hard_iface->mesh_iface);
+ struct batadv_hard_iface *primary_if;
+
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (primary_if != hard_iface)
+ goto out;
+
+ batadv_v_primary_iface_set(hard_iface);
+out:
+ batadv_hardif_put(primary_if);
+}
+
+static void
+batadv_v_hardif_neigh_init(struct batadv_hardif_neigh_node *hardif_neigh)
+{
+ ewma_throughput_init(&hardif_neigh->bat_v.throughput);
+}
+
+/**
+ * batadv_v_neigh_dump_neigh() - Dump a neighbour into a message
+ * @msg: Netlink message to dump into
+ * @portid: Port making netlink request
+ * @seq: Sequence number of netlink message
+ * @hardif_neigh: Neighbour to dump
+ *
+ * Return: Error code, or 0 on success
+ */
+static int
+batadv_v_neigh_dump_neigh(struct sk_buff *msg, u32 portid, u32 seq,
+ struct batadv_hardif_neigh_node *hardif_neigh)
+{
+ void *hdr;
+ unsigned int last_seen_msecs;
+ u32 throughput;
+
+ last_seen_msecs = jiffies_to_msecs(jiffies - hardif_neigh->last_seen);
+ throughput = ewma_throughput_read(&hardif_neigh->bat_v.throughput);
+ throughput = throughput * 100;
+
+ hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, NLM_F_MULTI,
+ BATADV_CMD_GET_NEIGHBORS);
+ if (!hdr)
+ return -ENOBUFS;
+
+ if (nla_put(msg, BATADV_ATTR_NEIGH_ADDRESS, ETH_ALEN,
+ hardif_neigh->addr) ||
+ nla_put_string(msg, BATADV_ATTR_HARD_IFNAME,
+ hardif_neigh->if_incoming->net_dev->name) ||
+ nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX,
+ hardif_neigh->if_incoming->net_dev->ifindex) ||
+ nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS,
+ last_seen_msecs) ||
+ nla_put_u32(msg, BATADV_ATTR_THROUGHPUT, throughput))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+ nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+/**
+ * batadv_v_neigh_dump_hardif() - Dump the neighbours of a hard interface into
+ * a message
+ * @msg: Netlink message to dump into
+ * @portid: Port making netlink request
+ * @seq: Sequence number of netlink message
+ * @bat_priv: The bat priv with all the mesh interface information
+ * @hard_iface: The hard interface to be dumped
+ * @idx_s: Entries to be skipped
+ *
+ * This function assumes the caller holds rcu_read_lock().
+ *
+ * Return: Error code, or 0 on success
+ */
+static int
+batadv_v_neigh_dump_hardif(struct sk_buff *msg, u32 portid, u32 seq,
+ struct batadv_priv *bat_priv,
+ struct batadv_hard_iface *hard_iface,
+ int *idx_s)
+{
+ struct batadv_hardif_neigh_node *hardif_neigh;
+ int idx = 0;
+
+ hlist_for_each_entry_rcu(hardif_neigh,
+ &hard_iface->neigh_list, list) {
+ if (idx++ < *idx_s)
+ continue;
+
+ if (batadv_v_neigh_dump_neigh(msg, portid, seq, hardif_neigh)) {
+ *idx_s = idx - 1;
+ return -EMSGSIZE;
+ }
+ }
+
+ *idx_s = 0;
+ return 0;
+}
+
+/**
+ * batadv_v_neigh_dump() - Dump the neighbours of a hard interface into a
+ * message
+ * @msg: Netlink message to dump into
+ * @cb: Control block containing additional options
+ * @bat_priv: The bat priv with all the mesh interface information
+ * @single_hardif: Limit dumping to this hard interface
+ */
+static void
+batadv_v_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb,
+ struct batadv_priv *bat_priv,
+ struct batadv_hard_iface *single_hardif)
+{
+ struct batadv_hard_iface *hard_iface;
+ struct list_head *iter;
+ int i_hardif = 0;
+ int i_hardif_s = cb->args[0];
+ int idx = cb->args[1];
+ int portid = NETLINK_CB(cb->skb).portid;
+
+ rcu_read_lock();
+ if (single_hardif) {
+ if (i_hardif_s == 0) {
+ if (batadv_v_neigh_dump_hardif(msg, portid,
+ cb->nlh->nlmsg_seq,
+ bat_priv, single_hardif,
+ &idx) == 0)
+ i_hardif++;
+ }
+ } else {
+ netdev_for_each_lower_private_rcu(bat_priv->mesh_iface, hard_iface, iter) {
+ if (i_hardif++ < i_hardif_s)
+ continue;
+
+ if (batadv_v_neigh_dump_hardif(msg, portid,
+ cb->nlh->nlmsg_seq,
+ bat_priv, hard_iface,
+ &idx)) {
+ i_hardif--;
+ break;
+ }
+ }
+ }
+ rcu_read_unlock();
+
+ cb->args[0] = i_hardif;
+ cb->args[1] = idx;
+}
+
+/**
+ * batadv_v_orig_dump_subentry() - Dump an originator subentry into a message
+ * @msg: Netlink message to dump into
+ * @portid: Port making netlink request
+ * @seq: Sequence number of netlink message
+ * @bat_priv: The bat priv with all the mesh interface information
+ * @if_outgoing: Limit dump to entries with this outgoing interface
+ * @orig_node: Originator to dump
+ * @neigh_node: Single hops neighbour
+ * @best: Is the best originator
+ *
+ * Return: Error code, or 0 on success
+ */
+static int
+batadv_v_orig_dump_subentry(struct sk_buff *msg, u32 portid, u32 seq,
+ struct batadv_priv *bat_priv,
+ struct batadv_hard_iface *if_outgoing,
+ struct batadv_orig_node *orig_node,
+ struct batadv_neigh_node *neigh_node,
+ bool best)
+{
+ struct batadv_neigh_ifinfo *n_ifinfo;
+ unsigned int last_seen_msecs;
+ u32 throughput;
+ void *hdr;
+
+ n_ifinfo = batadv_neigh_ifinfo_get(neigh_node, if_outgoing);
+ if (!n_ifinfo)
+ return 0;
+
+ throughput = n_ifinfo->bat_v.throughput * 100;
+
+ batadv_neigh_ifinfo_put(n_ifinfo);
+
+ last_seen_msecs = jiffies_to_msecs(jiffies - orig_node->last_seen);
+
+ if (if_outgoing != BATADV_IF_DEFAULT &&
+ if_outgoing != neigh_node->if_incoming)
+ return 0;
+
+ hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, NLM_F_MULTI,
+ BATADV_CMD_GET_ORIGINATORS);
+ if (!hdr)
+ return -ENOBUFS;
+
+ if (nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN, orig_node->orig) ||
+ nla_put(msg, BATADV_ATTR_NEIGH_ADDRESS, ETH_ALEN,
+ neigh_node->addr) ||
+ nla_put_string(msg, BATADV_ATTR_HARD_IFNAME,
+ neigh_node->if_incoming->net_dev->name) ||
+ nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX,
+ neigh_node->if_incoming->net_dev->ifindex) ||
+ nla_put_u32(msg, BATADV_ATTR_THROUGHPUT, throughput) ||
+ nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS,
+ last_seen_msecs))
+ goto nla_put_failure;
+
+ if (best && nla_put_flag(msg, BATADV_ATTR_FLAG_BEST))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+ nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+/**
+ * batadv_v_orig_dump_entry() - Dump an originator entry into a message
+ * @msg: Netlink message to dump into
+ * @portid: Port making netlink request
+ * @seq: Sequence number of netlink message
+ * @bat_priv: The bat priv with all the mesh interface information
+ * @if_outgoing: Limit dump to entries with this outgoing interface
+ * @orig_node: Originator to dump
+ * @sub_s: Number of sub entries to skip
+ *
+ * This function assumes the caller holds rcu_read_lock().
+ *
+ * Return: Error code, or 0 on success
+ */
+static int
+batadv_v_orig_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
+ struct batadv_priv *bat_priv,
+ struct batadv_hard_iface *if_outgoing,
+ struct batadv_orig_node *orig_node, int *sub_s)
+{
+ struct batadv_neigh_node *neigh_node_best;
+ struct batadv_neigh_node *neigh_node;
+ int sub = 0;
+ bool best;
+
+ neigh_node_best = batadv_orig_router_get(orig_node, if_outgoing);
+ if (!neigh_node_best)
+ goto out;
+
+ hlist_for_each_entry_rcu(neigh_node, &orig_node->neigh_list, list) {
+ if (sub++ < *sub_s)
+ continue;
+
+ best = (neigh_node == neigh_node_best);
+
+ if (batadv_v_orig_dump_subentry(msg, portid, seq, bat_priv,
+ if_outgoing, orig_node,
+ neigh_node, best)) {
+ batadv_neigh_node_put(neigh_node_best);
+
+ *sub_s = sub - 1;
+ return -EMSGSIZE;
+ }
+ }
+
+ out:
+ batadv_neigh_node_put(neigh_node_best);
+
+ *sub_s = 0;
+ return 0;
+}
+
+/**
+ * batadv_v_orig_dump_bucket() - Dump an originator bucket into a message
+ * @msg: Netlink message to dump into
+ * @portid: Port making netlink request
+ * @seq: Sequence number of netlink message
+ * @bat_priv: The bat priv with all the mesh interface information
+ * @if_outgoing: Limit dump to entries with this outgoing interface
+ * @head: Bucket to be dumped
+ * @idx_s: Number of entries to be skipped
+ * @sub: Number of sub entries to be skipped
+ *
+ * Return: Error code, or 0 on success
+ */
+static int
+batadv_v_orig_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq,
+ struct batadv_priv *bat_priv,
+ struct batadv_hard_iface *if_outgoing,
+ struct hlist_head *head, int *idx_s, int *sub)
+{
+ struct batadv_orig_node *orig_node;
+ int idx = 0;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(orig_node, head, hash_entry) {
+ if (idx++ < *idx_s)
+ continue;
+
+ if (batadv_v_orig_dump_entry(msg, portid, seq, bat_priv,
+ if_outgoing, orig_node, sub)) {
+ rcu_read_unlock();
+ *idx_s = idx - 1;
+ return -EMSGSIZE;
+ }
+ }
+ rcu_read_unlock();
+
+ *idx_s = 0;
+ *sub = 0;
+ return 0;
+}
+
+/**
+ * batadv_v_orig_dump() - Dump the originators into a message
+ * @msg: Netlink message to dump into
+ * @cb: Control block containing additional options
+ * @bat_priv: The bat priv with all the mesh interface information
+ * @if_outgoing: Limit dump to entries with this outgoing interface
+ */
+static void
+batadv_v_orig_dump(struct sk_buff *msg, struct netlink_callback *cb,
+ struct batadv_priv *bat_priv,
+ struct batadv_hard_iface *if_outgoing)
+{
+ struct batadv_hashtable *hash = bat_priv->orig_hash;
+ struct hlist_head *head;
+ int bucket = cb->args[0];
+ int idx = cb->args[1];
+ int sub = cb->args[2];
+ int portid = NETLINK_CB(cb->skb).portid;
+
+ while (bucket < hash->size) {
+ head = &hash->table[bucket];
+
+ if (batadv_v_orig_dump_bucket(msg, portid,
+ cb->nlh->nlmsg_seq,
+ bat_priv, if_outgoing, head, &idx,
+ &sub))
+ break;
+
+ bucket++;
+ }
+
+ cb->args[0] = bucket;
+ cb->args[1] = idx;
+ cb->args[2] = sub;
+}
+
+static int batadv_v_neigh_cmp(struct batadv_neigh_node *neigh1,
+ struct batadv_hard_iface *if_outgoing1,
+ struct batadv_neigh_node *neigh2,
+ struct batadv_hard_iface *if_outgoing2)
+{
+ struct batadv_neigh_ifinfo *ifinfo1, *ifinfo2;
+ int ret = 0;
+
+ ifinfo1 = batadv_neigh_ifinfo_get(neigh1, if_outgoing1);
+ if (!ifinfo1)
+ goto err_ifinfo1;
+
+ ifinfo2 = batadv_neigh_ifinfo_get(neigh2, if_outgoing2);
+ if (!ifinfo2)
+ goto err_ifinfo2;
+
+ ret = ifinfo1->bat_v.throughput - ifinfo2->bat_v.throughput;
+
+ batadv_neigh_ifinfo_put(ifinfo2);
+err_ifinfo2:
+ batadv_neigh_ifinfo_put(ifinfo1);
+err_ifinfo1:
+ return ret;
+}
+
+static bool batadv_v_neigh_is_sob(struct batadv_neigh_node *neigh1,
+ struct batadv_hard_iface *if_outgoing1,
+ struct batadv_neigh_node *neigh2,
+ struct batadv_hard_iface *if_outgoing2)
+{
+ struct batadv_neigh_ifinfo *ifinfo1, *ifinfo2;
+ u32 threshold;
+ bool ret = false;
+
+ ifinfo1 = batadv_neigh_ifinfo_get(neigh1, if_outgoing1);
+ if (!ifinfo1)
+ goto err_ifinfo1;
+
+ ifinfo2 = batadv_neigh_ifinfo_get(neigh2, if_outgoing2);
+ if (!ifinfo2)
+ goto err_ifinfo2;
+
+ threshold = ifinfo1->bat_v.throughput / 4;
+ threshold = ifinfo1->bat_v.throughput - threshold;
+
+ ret = ifinfo2->bat_v.throughput > threshold;
+
+ batadv_neigh_ifinfo_put(ifinfo2);
+err_ifinfo2:
+ batadv_neigh_ifinfo_put(ifinfo1);
+err_ifinfo1:
+ return ret;
+}
+
+/**
+ * batadv_v_init_sel_class() - initialize GW selection class
+ * @bat_priv: the bat priv with all the mesh interface information
+ */
+static void batadv_v_init_sel_class(struct batadv_priv *bat_priv)
+{
+ /* set default throughput difference threshold to 5Mbps */
+ atomic_set(&bat_priv->gw.sel_class, 50);
+}
+
+/**
+ * batadv_v_gw_throughput_get() - retrieve the GW-bandwidth for a given GW
+ * @gw_node: the GW to retrieve the metric for
+ * @bw: the pointer where the metric will be stored. The metric is computed as
+ * the minimum between the GW advertised throughput and the path throughput to
+ * it in the mesh
+ *
+ * Return: 0 on success, -1 on failure
+ */
+static int batadv_v_gw_throughput_get(struct batadv_gw_node *gw_node, u32 *bw)
+{
+ struct batadv_neigh_ifinfo *router_ifinfo = NULL;
+ struct batadv_orig_node *orig_node;
+ struct batadv_neigh_node *router;
+ int ret = -1;
+
+ orig_node = gw_node->orig_node;
+ router = batadv_orig_router_get(orig_node, BATADV_IF_DEFAULT);
+ if (!router)
+ goto out;
+
+ router_ifinfo = batadv_neigh_ifinfo_get(router, BATADV_IF_DEFAULT);
+ if (!router_ifinfo)
+ goto out;
+
+ /* the GW metric is computed as the minimum between the path throughput
+ * to reach the GW itself and the advertised bandwidth.
+ * This gives us an approximation of the effective throughput that the
+ * client can expect via this particular GW node
+ */
+ *bw = router_ifinfo->bat_v.throughput;
+ *bw = min_t(u32, *bw, gw_node->bandwidth_down);
+
+ ret = 0;
+out:
+ batadv_neigh_node_put(router);
+ batadv_neigh_ifinfo_put(router_ifinfo);
+
+ return ret;
+}
+
+/**
+ * batadv_v_gw_get_best_gw_node() - retrieve the best GW node
+ * @bat_priv: the bat priv with all the mesh interface information
+ *
+ * Return: the GW node having the best GW-metric, NULL if no GW is known
+ */
+static struct batadv_gw_node *
+batadv_v_gw_get_best_gw_node(struct batadv_priv *bat_priv)
+{
+ struct batadv_gw_node *gw_node, *curr_gw = NULL;
+ u32 max_bw = 0, bw;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.gateway_list, list) {
+ if (!kref_get_unless_zero(&gw_node->refcount))
+ continue;
+
+ if (batadv_v_gw_throughput_get(gw_node, &bw) < 0)
+ goto next;
+
+ if (curr_gw && bw <= max_bw)
+ goto next;
+
+ batadv_gw_node_put(curr_gw);
+
+ curr_gw = gw_node;
+ kref_get(&curr_gw->refcount);
+ max_bw = bw;
+
+next:
+ batadv_gw_node_put(gw_node);
+ }
+ rcu_read_unlock();
+
+ return curr_gw;
+}
+
+/**
+ * batadv_v_gw_is_eligible() - check if a originator would be selected as GW
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @curr_gw_orig: originator representing the currently selected GW
+ * @orig_node: the originator representing the new candidate
+ *
+ * Return: true if orig_node can be selected as current GW, false otherwise
+ */
+static bool batadv_v_gw_is_eligible(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *curr_gw_orig,
+ struct batadv_orig_node *orig_node)
+{
+ struct batadv_gw_node *curr_gw, *orig_gw = NULL;
+ u32 gw_throughput, orig_throughput, threshold;
+ bool ret = false;
+
+ threshold = atomic_read(&bat_priv->gw.sel_class);
+
+ curr_gw = batadv_gw_node_get(bat_priv, curr_gw_orig);
+ if (!curr_gw) {
+ ret = true;
+ goto out;
+ }
+
+ if (batadv_v_gw_throughput_get(curr_gw, &gw_throughput) < 0) {
+ ret = true;
+ goto out;
+ }
+
+ orig_gw = batadv_gw_node_get(bat_priv, orig_node);
+ if (!orig_gw)
+ goto out;
+
+ if (batadv_v_gw_throughput_get(orig_gw, &orig_throughput) < 0)
+ goto out;
+
+ if (orig_throughput < gw_throughput)
+ goto out;
+
+ if ((orig_throughput - gw_throughput) < threshold)
+ goto out;
+
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Restarting gateway selection: better gateway found (throughput curr: %u, throughput new: %u)\n",
+ gw_throughput, orig_throughput);
+
+ ret = true;
+out:
+ batadv_gw_node_put(curr_gw);
+ batadv_gw_node_put(orig_gw);
+
+ return ret;
+}
+
+/**
+ * batadv_v_gw_dump_entry() - Dump a gateway into a message
+ * @msg: Netlink message to dump into
+ * @portid: Port making netlink request
+ * @cb: Control block containing additional options
+ * @bat_priv: The bat priv with all the mesh interface information
+ * @gw_node: Gateway to be dumped
+ *
+ * Return: Error code, or 0 on success
+ */
+static int batadv_v_gw_dump_entry(struct sk_buff *msg, u32 portid,
+ struct netlink_callback *cb,
+ struct batadv_priv *bat_priv,
+ struct batadv_gw_node *gw_node)
+{
+ struct batadv_neigh_ifinfo *router_ifinfo = NULL;
+ struct batadv_neigh_node *router;
+ struct batadv_gw_node *curr_gw = NULL;
+ int ret = 0;
+ void *hdr;
+
+ router = batadv_orig_router_get(gw_node->orig_node, BATADV_IF_DEFAULT);
+ if (!router)
+ goto out;
+
+ router_ifinfo = batadv_neigh_ifinfo_get(router, BATADV_IF_DEFAULT);
+ if (!router_ifinfo)
+ goto out;
+
+ curr_gw = batadv_gw_get_selected_gw_node(bat_priv);
+
+ hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq,
+ &batadv_netlink_family, NLM_F_MULTI,
+ BATADV_CMD_GET_GATEWAYS);
+ if (!hdr) {
+ ret = -ENOBUFS;
+ goto out;
+ }
+
+ genl_dump_check_consistent(cb, hdr);
+
+ ret = -EMSGSIZE;
+
+ if (curr_gw == gw_node) {
+ if (nla_put_flag(msg, BATADV_ATTR_FLAG_BEST)) {
+ genlmsg_cancel(msg, hdr);
+ goto out;
+ }
+ }
+
+ if (nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN,
+ gw_node->orig_node->orig)) {
+ genlmsg_cancel(msg, hdr);
+ goto out;
+ }
+
+ if (nla_put_u32(msg, BATADV_ATTR_THROUGHPUT,
+ router_ifinfo->bat_v.throughput)) {
+ genlmsg_cancel(msg, hdr);
+ goto out;
+ }
+
+ if (nla_put(msg, BATADV_ATTR_ROUTER, ETH_ALEN, router->addr)) {
+ genlmsg_cancel(msg, hdr);
+ goto out;
+ }
+
+ if (nla_put_string(msg, BATADV_ATTR_HARD_IFNAME,
+ router->if_incoming->net_dev->name)) {
+ genlmsg_cancel(msg, hdr);
+ goto out;
+ }
+
+ if (nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX,
+ router->if_incoming->net_dev->ifindex)) {
+ genlmsg_cancel(msg, hdr);
+ goto out;
+ }
+
+ if (nla_put_u32(msg, BATADV_ATTR_BANDWIDTH_DOWN,
+ gw_node->bandwidth_down)) {
+ genlmsg_cancel(msg, hdr);
+ goto out;
+ }
+
+ if (nla_put_u32(msg, BATADV_ATTR_BANDWIDTH_UP, gw_node->bandwidth_up)) {
+ genlmsg_cancel(msg, hdr);
+ goto out;
+ }
+
+ genlmsg_end(msg, hdr);
+ ret = 0;
+
+out:
+ batadv_gw_node_put(curr_gw);
+ batadv_neigh_ifinfo_put(router_ifinfo);
+ batadv_neigh_node_put(router);
+ return ret;
+}
+
+/**
+ * batadv_v_gw_dump() - Dump gateways into a message
+ * @msg: Netlink message to dump into
+ * @cb: Control block containing additional options
+ * @bat_priv: The bat priv with all the mesh interface information
+ */
+static void batadv_v_gw_dump(struct sk_buff *msg, struct netlink_callback *cb,
+ struct batadv_priv *bat_priv)
+{
+ int portid = NETLINK_CB(cb->skb).portid;
+ struct batadv_gw_node *gw_node;
+ int idx_skip = cb->args[0];
+ int idx = 0;
+
+ spin_lock_bh(&bat_priv->gw.list_lock);
+ cb->seq = bat_priv->gw.generation << 1 | 1;
+
+ hlist_for_each_entry(gw_node, &bat_priv->gw.gateway_list, list) {
+ if (idx++ < idx_skip)
+ continue;
+
+ if (batadv_v_gw_dump_entry(msg, portid, cb, bat_priv,
+ gw_node)) {
+ idx_skip = idx - 1;
+ goto unlock;
+ }
+ }
+
+ idx_skip = idx;
+unlock:
+ spin_unlock_bh(&bat_priv->gw.list_lock);
+
+ cb->args[0] = idx_skip;
+}
+
+static struct batadv_algo_ops batadv_batman_v __read_mostly = {
+ .name = "BATMAN_V",
+ .iface = {
+ .activate = batadv_v_iface_activate,
+ .enable = batadv_v_iface_enable,
+ .disable = batadv_v_iface_disable,
+ .update_mac = batadv_v_iface_update_mac,
+ .primary_set = batadv_v_primary_iface_set,
+ },
+ .neigh = {
+ .hardif_init = batadv_v_hardif_neigh_init,
+ .cmp = batadv_v_neigh_cmp,
+ .is_similar_or_better = batadv_v_neigh_is_sob,
+ .dump = batadv_v_neigh_dump,
+ },
+ .orig = {
+ .dump = batadv_v_orig_dump,
+ },
+ .gw = {
+ .init_sel_class = batadv_v_init_sel_class,
+ .sel_class_max = U32_MAX,
+ .get_best_gw_node = batadv_v_gw_get_best_gw_node,
+ .is_eligible = batadv_v_gw_is_eligible,
+ .dump = batadv_v_gw_dump,
+ },
+};
+
+/**
+ * batadv_v_hardif_init() - initialize the algorithm specific fields in the
+ * hard-interface object
+ * @hard_iface: the hard-interface to initialize
+ */
+void batadv_v_hardif_init(struct batadv_hard_iface *hard_iface)
+{
+ /* enable link throughput auto-detection by setting the throughput
+ * override to zero
+ */
+ atomic_set(&hard_iface->bat_v.throughput_override, 0);
+ atomic_set(&hard_iface->bat_v.elp_interval, 500);
+
+ hard_iface->bat_v.aggr_len = 0;
+ skb_queue_head_init(&hard_iface->bat_v.aggr_list);
+ INIT_DELAYED_WORK(&hard_iface->bat_v.aggr_wq,
+ batadv_v_ogm_aggr_work);
+}
+
+/**
+ * batadv_v_mesh_init() - initialize the B.A.T.M.A.N. V private resources for a
+ * mesh
+ * @bat_priv: the object representing the mesh interface to initialise
+ *
+ * Return: 0 on success or a negative error code otherwise
+ */
+int batadv_v_mesh_init(struct batadv_priv *bat_priv)
+{
+ int ret = 0;
+
+ ret = batadv_v_ogm_init(bat_priv);
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+/**
+ * batadv_v_mesh_free() - free the B.A.T.M.A.N. V private resources for a mesh
+ * @bat_priv: the object representing the mesh interface to free
+ */
+void batadv_v_mesh_free(struct batadv_priv *bat_priv)
+{
+ batadv_v_ogm_free(bat_priv);
+}
+
+/**
+ * batadv_v_init() - B.A.T.M.A.N. V initialization function
+ *
+ * Description: Takes care of initializing all the subcomponents.
+ * It is invoked upon module load only.
+ *
+ * Return: 0 on success or a negative error code otherwise
+ */
+int __init batadv_v_init(void)
+{
+ int ret;
+
+ /* B.A.T.M.A.N. V echo location protocol packet */
+ ret = batadv_recv_handler_register(BATADV_ELP,
+ batadv_v_elp_packet_recv);
+ if (ret < 0)
+ return ret;
+
+ ret = batadv_recv_handler_register(BATADV_OGM2,
+ batadv_v_ogm_packet_recv);
+ if (ret < 0)
+ goto elp_unregister;
+
+ ret = batadv_algo_register(&batadv_batman_v);
+ if (ret < 0)
+ goto ogm_unregister;
+
+ return ret;
+
+ogm_unregister:
+ batadv_recv_handler_unregister(BATADV_OGM2);
+
+elp_unregister:
+ batadv_recv_handler_unregister(BATADV_ELP);
+
+ return ret;
+}
diff --git a/net/batman-adv/bat_v.h b/net/batman-adv/bat_v.h
new file mode 100644
index 000000000000..964431f4dc8d
--- /dev/null
+++ b/net/batman-adv/bat_v.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner, Linus Lüssing
+ */
+
+#ifndef _NET_BATMAN_ADV_BAT_V_H_
+#define _NET_BATMAN_ADV_BAT_V_H_
+
+#include "main.h"
+
+#ifdef CONFIG_BATMAN_ADV_BATMAN_V
+
+int batadv_v_init(void);
+void batadv_v_hardif_init(struct batadv_hard_iface *hardif);
+int batadv_v_mesh_init(struct batadv_priv *bat_priv);
+void batadv_v_mesh_free(struct batadv_priv *bat_priv);
+
+#else
+
+static inline int batadv_v_init(void)
+{
+ return 0;
+}
+
+static inline void batadv_v_hardif_init(struct batadv_hard_iface *hardif)
+{
+}
+
+static inline int batadv_v_mesh_init(struct batadv_priv *bat_priv)
+{
+ return 0;
+}
+
+static inline void batadv_v_mesh_free(struct batadv_priv *bat_priv)
+{
+}
+
+#endif /* CONFIG_BATMAN_ADV_BATMAN_V */
+
+#endif /* _NET_BATMAN_ADV_BAT_V_H_ */
diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c
new file mode 100644
index 000000000000..cb16c1ed2a58
--- /dev/null
+++ b/net/batman-adv/bat_v_elp.c
@@ -0,0 +1,596 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) B.A.T.M.A.N. contributors:
+ *
+ * Linus Lüssing, Marek Lindner
+ */
+
+#include "bat_v_elp.h"
+#include "main.h"
+
+#include <linux/atomic.h>
+#include <linux/bitops.h>
+#include <linux/byteorder/generic.h>
+#include <linux/container_of.h>
+#include <linux/errno.h>
+#include <linux/etherdevice.h>
+#include <linux/ethtool.h>
+#include <linux/gfp.h>
+#include <linux/if_ether.h>
+#include <linux/jiffies.h>
+#include <linux/kref.h>
+#include <linux/list.h>
+#include <linux/minmax.h>
+#include <linux/netdevice.h>
+#include <linux/nl80211.h>
+#include <linux/random.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
+#include <linux/rtnetlink.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/stddef.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+#include <net/cfg80211.h>
+#include <uapi/linux/batadv_packet.h>
+
+#include "bat_v_ogm.h"
+#include "hard-interface.h"
+#include "log.h"
+#include "originator.h"
+#include "routing.h"
+#include "send.h"
+
+/**
+ * struct batadv_v_metric_queue_entry - list of hardif neighbors which require
+ * and metric update
+ */
+struct batadv_v_metric_queue_entry {
+ /** @hardif_neigh: hardif neighbor scheduled for metric update */
+ struct batadv_hardif_neigh_node *hardif_neigh;
+
+ /** @list: list node for metric_queue */
+ struct list_head list;
+};
+
+/**
+ * batadv_v_elp_start_timer() - restart timer for ELP periodic work
+ * @hard_iface: the interface for which the timer has to be reset
+ */
+static void batadv_v_elp_start_timer(struct batadv_hard_iface *hard_iface)
+{
+ unsigned int msecs;
+
+ msecs = atomic_read(&hard_iface->bat_v.elp_interval) - BATADV_JITTER;
+ msecs += get_random_u32_below(2 * BATADV_JITTER);
+
+ queue_delayed_work(batadv_event_workqueue, &hard_iface->bat_v.elp_wq,
+ msecs_to_jiffies(msecs));
+}
+
+/**
+ * batadv_v_elp_get_throughput() - get the throughput towards a neighbour
+ * @neigh: the neighbour for which the throughput has to be obtained
+ * @pthroughput: calculated throughput towards the given neighbour in multiples
+ * of 100kpbs (a value of '1' equals 0.1Mbps, '10' equals 1Mbps, etc).
+ *
+ * Return: true when value behind @pthroughput was set
+ */
+static bool batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh,
+ u32 *pthroughput)
+{
+ struct batadv_hard_iface *hard_iface = neigh->if_incoming;
+ struct net_device *mesh_iface = hard_iface->mesh_iface;
+ struct ethtool_link_ksettings link_settings;
+ struct net_device *real_netdev;
+ struct station_info sinfo;
+ u32 throughput;
+ int ret;
+
+ /* don't query throughput when no longer associated with any
+ * batman-adv interface
+ */
+ if (!mesh_iface)
+ return false;
+
+ /* if the user specified a customised value for this interface, then
+ * return it directly
+ */
+ throughput = atomic_read(&hard_iface->bat_v.throughput_override);
+ if (throughput != 0) {
+ *pthroughput = throughput;
+ return true;
+ }
+
+ /* if this is a wireless device, then ask its throughput through
+ * cfg80211 API
+ */
+ if (batadv_is_wifi_hardif(hard_iface)) {
+ if (!batadv_is_cfg80211_hardif(hard_iface))
+ /* unsupported WiFi driver version */
+ goto default_throughput;
+
+ real_netdev = batadv_get_real_netdev(hard_iface->net_dev);
+ if (!real_netdev)
+ goto default_throughput;
+
+ ret = cfg80211_get_station(real_netdev, neigh->addr, &sinfo);
+
+ if (!ret) {
+ /* free the TID stats immediately */
+ cfg80211_sinfo_release_content(&sinfo);
+ }
+
+ dev_put(real_netdev);
+ if (ret == -ENOENT) {
+ /* Node is not associated anymore! It would be
+ * possible to delete this neighbor. For now set
+ * the throughput metric to 0.
+ */
+ *pthroughput = 0;
+ return true;
+ }
+ if (ret)
+ goto default_throughput;
+
+ if (sinfo.filled & BIT(NL80211_STA_INFO_EXPECTED_THROUGHPUT)) {
+ *pthroughput = sinfo.expected_throughput / 100;
+ return true;
+ }
+
+ /* try to estimate the expected throughput based on reported tx
+ * rates
+ */
+ if (sinfo.filled & BIT(NL80211_STA_INFO_TX_BITRATE)) {
+ *pthroughput = cfg80211_calculate_bitrate(&sinfo.txrate) / 3;
+ return true;
+ }
+
+ goto default_throughput;
+ }
+
+ /* only use rtnl_trylock because the elp worker will be cancelled while
+ * the rntl_lock is held. the cancel_delayed_work_sync() would otherwise
+ * wait forever when the elp work_item was started and it is then also
+ * trying to rtnl_lock
+ */
+ if (!rtnl_trylock())
+ return false;
+
+ /* if not a wifi interface, check if this device provides data via
+ * ethtool (e.g. an Ethernet adapter)
+ */
+ ret = __ethtool_get_link_ksettings(hard_iface->net_dev, &link_settings);
+ rtnl_unlock();
+ if (ret == 0) {
+ /* link characteristics might change over time */
+ if (link_settings.base.duplex == DUPLEX_FULL)
+ hard_iface->bat_v.flags |= BATADV_FULL_DUPLEX;
+ else
+ hard_iface->bat_v.flags &= ~BATADV_FULL_DUPLEX;
+
+ throughput = link_settings.base.speed;
+ if (throughput && throughput != SPEED_UNKNOWN) {
+ *pthroughput = throughput * 10;
+ return true;
+ }
+ }
+
+default_throughput:
+ if (!(hard_iface->bat_v.flags & BATADV_WARNING_DEFAULT)) {
+ batadv_info(mesh_iface,
+ "WiFi driver or ethtool info does not provide information about link speeds on interface %s, therefore defaulting to hardcoded throughput values of %u.%1u Mbps. Consider overriding the throughput manually or checking your driver.\n",
+ hard_iface->net_dev->name,
+ BATADV_THROUGHPUT_DEFAULT_VALUE / 10,
+ BATADV_THROUGHPUT_DEFAULT_VALUE % 10);
+ hard_iface->bat_v.flags |= BATADV_WARNING_DEFAULT;
+ }
+
+ /* if none of the above cases apply, return the base_throughput */
+ *pthroughput = BATADV_THROUGHPUT_DEFAULT_VALUE;
+ return true;
+}
+
+/**
+ * batadv_v_elp_throughput_metric_update() - worker updating the throughput
+ * metric of a single hop neighbour
+ * @neigh: the neighbour to probe
+ */
+static void
+batadv_v_elp_throughput_metric_update(struct batadv_hardif_neigh_node *neigh)
+{
+ u32 throughput;
+ bool valid;
+
+ valid = batadv_v_elp_get_throughput(neigh, &throughput);
+ if (!valid)
+ return;
+
+ ewma_throughput_add(&neigh->bat_v.throughput, throughput);
+}
+
+/**
+ * batadv_v_elp_wifi_neigh_probe() - send link probing packets to a neighbour
+ * @neigh: the neighbour to probe
+ *
+ * Sends a predefined number of unicast wifi packets to a given neighbour in
+ * order to trigger the throughput estimation on this link by the RC algorithm.
+ * Packets are sent only if there is not enough payload unicast traffic towards
+ * this neighbour..
+ *
+ * Return: True on success and false in case of error during skb preparation.
+ */
+static bool
+batadv_v_elp_wifi_neigh_probe(struct batadv_hardif_neigh_node *neigh)
+{
+ struct batadv_hard_iface *hard_iface = neigh->if_incoming;
+ struct batadv_priv *bat_priv = netdev_priv(hard_iface->mesh_iface);
+ unsigned long last_tx_diff;
+ struct sk_buff *skb;
+ int probe_len, i;
+ int elp_skb_len;
+
+ /* this probing routine is for Wifi neighbours only */
+ if (!batadv_is_wifi_hardif(hard_iface))
+ return true;
+
+ /* probe the neighbor only if no unicast packets have been sent
+ * to it in the last 100 milliseconds: this is the rate control
+ * algorithm sampling interval (minstrel). In this way, if not
+ * enough traffic has been sent to the neighbor, batman-adv can
+ * generate 2 probe packets and push the RC algorithm to perform
+ * the sampling
+ */
+ last_tx_diff = jiffies_to_msecs(jiffies - neigh->bat_v.last_unicast_tx);
+ if (last_tx_diff <= BATADV_ELP_PROBE_MAX_TX_DIFF)
+ return true;
+
+ probe_len = max_t(int, sizeof(struct batadv_elp_packet),
+ BATADV_ELP_MIN_PROBE_SIZE);
+
+ for (i = 0; i < BATADV_ELP_PROBES_PER_NODE; i++) {
+ elp_skb_len = hard_iface->bat_v.elp_skb->len;
+ skb = skb_copy_expand(hard_iface->bat_v.elp_skb, 0,
+ probe_len - elp_skb_len,
+ GFP_ATOMIC);
+ if (!skb)
+ return false;
+
+ /* Tell the skb to get as big as the allocated space (we want
+ * the packet to be exactly of that size to make the link
+ * throughput estimation effective.
+ */
+ skb_put_zero(skb, probe_len - hard_iface->bat_v.elp_skb->len);
+
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Sending unicast (probe) ELP packet on interface %s to %pM\n",
+ hard_iface->net_dev->name, neigh->addr);
+
+ batadv_send_skb_packet(skb, hard_iface, neigh->addr);
+ }
+
+ return true;
+}
+
+/**
+ * batadv_v_elp_periodic_work() - ELP periodic task per interface
+ * @work: work queue item
+ *
+ * Emits broadcast ELP messages in regular intervals.
+ */
+static void batadv_v_elp_periodic_work(struct work_struct *work)
+{
+ struct batadv_v_metric_queue_entry *metric_entry;
+ struct batadv_v_metric_queue_entry *metric_safe;
+ struct batadv_hardif_neigh_node *hardif_neigh;
+ struct batadv_hard_iface *hard_iface;
+ struct batadv_hard_iface_bat_v *bat_v;
+ struct batadv_elp_packet *elp_packet;
+ struct list_head metric_queue;
+ struct batadv_priv *bat_priv;
+ struct sk_buff *skb;
+ u32 elp_interval;
+
+ bat_v = container_of(work, struct batadv_hard_iface_bat_v, elp_wq.work);
+ hard_iface = container_of(bat_v, struct batadv_hard_iface, bat_v);
+ bat_priv = netdev_priv(hard_iface->mesh_iface);
+
+ if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_DEACTIVATING)
+ goto out;
+
+ /* we are in the process of shutting this interface down */
+ if (hard_iface->if_status == BATADV_IF_NOT_IN_USE ||
+ hard_iface->if_status == BATADV_IF_TO_BE_REMOVED)
+ goto out;
+
+ /* the interface was enabled but may not be ready yet */
+ if (hard_iface->if_status != BATADV_IF_ACTIVE)
+ goto restart_timer;
+
+ skb = skb_copy(hard_iface->bat_v.elp_skb, GFP_ATOMIC);
+ if (!skb)
+ goto restart_timer;
+
+ elp_packet = (struct batadv_elp_packet *)skb->data;
+ elp_packet->seqno = htonl(atomic_read(&hard_iface->bat_v.elp_seqno));
+ elp_interval = atomic_read(&hard_iface->bat_v.elp_interval);
+ elp_packet->elp_interval = htonl(elp_interval);
+
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Sending broadcast ELP packet on interface %s, seqno %u\n",
+ hard_iface->net_dev->name,
+ atomic_read(&hard_iface->bat_v.elp_seqno));
+
+ batadv_send_broadcast_skb(skb, hard_iface);
+
+ atomic_inc(&hard_iface->bat_v.elp_seqno);
+
+ INIT_LIST_HEAD(&metric_queue);
+
+ /* The throughput metric is updated on each sent packet. This way, if a
+ * node is dead and no longer sends packets, batman-adv is still able to
+ * react timely to its death.
+ *
+ * The throughput metric is updated by following these steps:
+ * 1) if the hard_iface is wifi => send a number of unicast ELPs for
+ * probing/sampling to each neighbor
+ * 2) update the throughput metric value of each neighbor (note that the
+ * value retrieved in this step might be 100ms old because the
+ * probing packets at point 1) could still be in the HW queue)
+ */
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(hardif_neigh, &hard_iface->neigh_list, list) {
+ if (!batadv_v_elp_wifi_neigh_probe(hardif_neigh))
+ /* if something goes wrong while probing, better to stop
+ * sending packets immediately and reschedule the task
+ */
+ break;
+
+ if (!kref_get_unless_zero(&hardif_neigh->refcount))
+ continue;
+
+ /* Reading the estimated throughput from cfg80211 is a task that
+ * may sleep and that is not allowed in an rcu protected
+ * context. Therefore add it to metric_queue and process it
+ * outside rcu protected context.
+ */
+ metric_entry = kzalloc(sizeof(*metric_entry), GFP_ATOMIC);
+ if (!metric_entry) {
+ batadv_hardif_neigh_put(hardif_neigh);
+ continue;
+ }
+
+ metric_entry->hardif_neigh = hardif_neigh;
+ list_add(&metric_entry->list, &metric_queue);
+ }
+ rcu_read_unlock();
+
+ list_for_each_entry_safe(metric_entry, metric_safe, &metric_queue, list) {
+ batadv_v_elp_throughput_metric_update(metric_entry->hardif_neigh);
+
+ batadv_hardif_neigh_put(metric_entry->hardif_neigh);
+ list_del(&metric_entry->list);
+ kfree(metric_entry);
+ }
+
+restart_timer:
+ batadv_v_elp_start_timer(hard_iface);
+out:
+ return;
+}
+
+/**
+ * batadv_v_elp_iface_enable() - setup the ELP interface private resources
+ * @hard_iface: interface for which the data has to be prepared
+ *
+ * Return: 0 on success or a -ENOMEM in case of failure.
+ */
+int batadv_v_elp_iface_enable(struct batadv_hard_iface *hard_iface)
+{
+ static const size_t tvlv_padding = sizeof(__be32);
+ struct batadv_elp_packet *elp_packet;
+ unsigned char *elp_buff;
+ u32 random_seqno;
+ size_t size;
+ int res = -ENOMEM;
+
+ size = ETH_HLEN + NET_IP_ALIGN + BATADV_ELP_HLEN + tvlv_padding;
+ hard_iface->bat_v.elp_skb = dev_alloc_skb(size);
+ if (!hard_iface->bat_v.elp_skb)
+ goto out;
+
+ skb_reserve(hard_iface->bat_v.elp_skb, ETH_HLEN + NET_IP_ALIGN);
+ elp_buff = skb_put_zero(hard_iface->bat_v.elp_skb,
+ BATADV_ELP_HLEN + tvlv_padding);
+ elp_packet = (struct batadv_elp_packet *)elp_buff;
+
+ elp_packet->packet_type = BATADV_ELP;
+ elp_packet->version = BATADV_COMPAT_VERSION;
+
+ /* randomize initial seqno to avoid collision */
+ get_random_bytes(&random_seqno, sizeof(random_seqno));
+ atomic_set(&hard_iface->bat_v.elp_seqno, random_seqno);
+
+ /* assume full-duplex by default */
+ hard_iface->bat_v.flags |= BATADV_FULL_DUPLEX;
+
+ /* warn the user (again) if there is no throughput data is available */
+ hard_iface->bat_v.flags &= ~BATADV_WARNING_DEFAULT;
+
+ if (batadv_is_wifi_hardif(hard_iface))
+ hard_iface->bat_v.flags &= ~BATADV_FULL_DUPLEX;
+
+ INIT_DELAYED_WORK(&hard_iface->bat_v.elp_wq,
+ batadv_v_elp_periodic_work);
+ batadv_v_elp_start_timer(hard_iface);
+ res = 0;
+
+out:
+ return res;
+}
+
+/**
+ * batadv_v_elp_iface_disable() - release ELP interface private resources
+ * @hard_iface: interface for which the resources have to be released
+ */
+void batadv_v_elp_iface_disable(struct batadv_hard_iface *hard_iface)
+{
+ cancel_delayed_work_sync(&hard_iface->bat_v.elp_wq);
+
+ dev_kfree_skb(hard_iface->bat_v.elp_skb);
+ hard_iface->bat_v.elp_skb = NULL;
+}
+
+/**
+ * batadv_v_elp_iface_activate() - update the ELP buffer belonging to the given
+ * hard-interface
+ * @primary_iface: the new primary interface
+ * @hard_iface: interface holding the to-be-updated buffer
+ */
+void batadv_v_elp_iface_activate(struct batadv_hard_iface *primary_iface,
+ struct batadv_hard_iface *hard_iface)
+{
+ struct batadv_elp_packet *elp_packet;
+ struct sk_buff *skb;
+
+ if (!hard_iface->bat_v.elp_skb)
+ return;
+
+ skb = hard_iface->bat_v.elp_skb;
+ elp_packet = (struct batadv_elp_packet *)skb->data;
+ ether_addr_copy(elp_packet->orig,
+ primary_iface->net_dev->dev_addr);
+}
+
+/**
+ * batadv_v_elp_primary_iface_set() - change internal data to reflect the new
+ * primary interface
+ * @primary_iface: the new primary interface
+ */
+void batadv_v_elp_primary_iface_set(struct batadv_hard_iface *primary_iface)
+{
+ struct batadv_hard_iface *hard_iface;
+ struct list_head *iter;
+
+ /* update orig field of every elp iface belonging to this mesh */
+ rcu_read_lock();
+ netdev_for_each_lower_private_rcu(primary_iface->mesh_iface, hard_iface, iter)
+ batadv_v_elp_iface_activate(primary_iface, hard_iface);
+ rcu_read_unlock();
+}
+
+/**
+ * batadv_v_elp_neigh_update() - update an ELP neighbour node
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @neigh_addr: the neighbour interface address
+ * @if_incoming: the interface the packet was received through
+ * @elp_packet: the received ELP packet
+ *
+ * Updates the ELP neighbour node state with the data received within the new
+ * ELP packet.
+ */
+static void batadv_v_elp_neigh_update(struct batadv_priv *bat_priv,
+ u8 *neigh_addr,
+ struct batadv_hard_iface *if_incoming,
+ struct batadv_elp_packet *elp_packet)
+
+{
+ struct batadv_neigh_node *neigh;
+ struct batadv_orig_node *orig_neigh;
+ struct batadv_hardif_neigh_node *hardif_neigh;
+ s32 seqno_diff;
+ s32 elp_latest_seqno;
+
+ orig_neigh = batadv_v_ogm_orig_get(bat_priv, elp_packet->orig);
+ if (!orig_neigh)
+ return;
+
+ neigh = batadv_neigh_node_get_or_create(orig_neigh,
+ if_incoming, neigh_addr);
+ if (!neigh)
+ goto orig_free;
+
+ hardif_neigh = batadv_hardif_neigh_get(if_incoming, neigh_addr);
+ if (!hardif_neigh)
+ goto neigh_free;
+
+ elp_latest_seqno = hardif_neigh->bat_v.elp_latest_seqno;
+ seqno_diff = ntohl(elp_packet->seqno) - elp_latest_seqno;
+
+ /* known or older sequence numbers are ignored. However always adopt
+ * if the router seems to have been restarted.
+ */
+ if (seqno_diff < 1 && seqno_diff > -BATADV_ELP_MAX_AGE)
+ goto hardif_free;
+
+ neigh->last_seen = jiffies;
+ hardif_neigh->last_seen = jiffies;
+ hardif_neigh->bat_v.elp_latest_seqno = ntohl(elp_packet->seqno);
+ hardif_neigh->bat_v.elp_interval = ntohl(elp_packet->elp_interval);
+
+hardif_free:
+ batadv_hardif_neigh_put(hardif_neigh);
+neigh_free:
+ batadv_neigh_node_put(neigh);
+orig_free:
+ batadv_orig_node_put(orig_neigh);
+}
+
+/**
+ * batadv_v_elp_packet_recv() - main ELP packet handler
+ * @skb: the received packet
+ * @if_incoming: the interface this packet was received through
+ *
+ * Return: NET_RX_SUCCESS and consumes the skb if the packet was properly
+ * processed or NET_RX_DROP in case of failure.
+ */
+int batadv_v_elp_packet_recv(struct sk_buff *skb,
+ struct batadv_hard_iface *if_incoming)
+{
+ struct batadv_priv *bat_priv = netdev_priv(if_incoming->mesh_iface);
+ struct batadv_elp_packet *elp_packet;
+ struct batadv_hard_iface *primary_if;
+ struct ethhdr *ethhdr;
+ bool res;
+ int ret = NET_RX_DROP;
+
+ res = batadv_check_management_packet(skb, if_incoming, BATADV_ELP_HLEN);
+ if (!res)
+ goto free_skb;
+
+ ethhdr = eth_hdr(skb);
+ if (batadv_is_my_mac(bat_priv, ethhdr->h_source))
+ goto free_skb;
+
+ /* did we receive a B.A.T.M.A.N. V ELP packet on an interface
+ * that does not have B.A.T.M.A.N. V ELP enabled ?
+ */
+ if (strcmp(bat_priv->algo_ops->name, "BATMAN_V") != 0)
+ goto free_skb;
+
+ elp_packet = (struct batadv_elp_packet *)skb->data;
+
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Received ELP packet from %pM seqno %u ORIG: %pM\n",
+ ethhdr->h_source, ntohl(elp_packet->seqno),
+ elp_packet->orig);
+
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (!primary_if)
+ goto free_skb;
+
+ batadv_v_elp_neigh_update(bat_priv, ethhdr->h_source, if_incoming,
+ elp_packet);
+
+ ret = NET_RX_SUCCESS;
+ batadv_hardif_put(primary_if);
+
+free_skb:
+ if (ret == NET_RX_SUCCESS)
+ consume_skb(skb);
+ else
+ kfree_skb(skb);
+
+ return ret;
+}
diff --git a/net/batman-adv/bat_v_elp.h b/net/batman-adv/bat_v_elp.h
new file mode 100644
index 000000000000..c9cb0a307100
--- /dev/null
+++ b/net/batman-adv/bat_v_elp.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) B.A.T.M.A.N. contributors:
+ *
+ * Linus Lüssing, Marek Lindner
+ */
+
+#ifndef _NET_BATMAN_ADV_BAT_V_ELP_H_
+#define _NET_BATMAN_ADV_BAT_V_ELP_H_
+
+#include "main.h"
+
+#include <linux/skbuff.h>
+
+int batadv_v_elp_iface_enable(struct batadv_hard_iface *hard_iface);
+void batadv_v_elp_iface_disable(struct batadv_hard_iface *hard_iface);
+void batadv_v_elp_iface_activate(struct batadv_hard_iface *primary_iface,
+ struct batadv_hard_iface *hard_iface);
+void batadv_v_elp_primary_iface_set(struct batadv_hard_iface *primary_iface);
+int batadv_v_elp_packet_recv(struct sk_buff *skb,
+ struct batadv_hard_iface *if_incoming);
+
+#endif /* _NET_BATMAN_ADV_BAT_V_ELP_H_ */
diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c
new file mode 100644
index 000000000000..e3870492dab7
--- /dev/null
+++ b/net/batman-adv/bat_v_ogm.c
@@ -0,0 +1,1080 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) B.A.T.M.A.N. contributors:
+ *
+ * Antonio Quartulli
+ */
+
+#include "bat_v_ogm.h"
+#include "main.h"
+
+#include <linux/atomic.h>
+#include <linux/byteorder/generic.h>
+#include <linux/container_of.h>
+#include <linux/errno.h>
+#include <linux/etherdevice.h>
+#include <linux/gfp.h>
+#include <linux/if_ether.h>
+#include <linux/jiffies.h>
+#include <linux/kref.h>
+#include <linux/list.h>
+#include <linux/lockdep.h>
+#include <linux/minmax.h>
+#include <linux/mutex.h>
+#include <linux/netdevice.h>
+#include <linux/random.h>
+#include <linux/rcupdate.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/stddef.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+#include <uapi/linux/batadv_packet.h>
+
+#include "hard-interface.h"
+#include "hash.h"
+#include "log.h"
+#include "originator.h"
+#include "routing.h"
+#include "send.h"
+#include "translation-table.h"
+#include "tvlv.h"
+
+/**
+ * batadv_v_ogm_orig_get() - retrieve and possibly create an originator node
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @addr: the address of the originator
+ *
+ * Return: the orig_node corresponding to the specified address. If such an
+ * object does not exist, it is allocated here. In case of allocation failure
+ * returns NULL.
+ */
+struct batadv_orig_node *batadv_v_ogm_orig_get(struct batadv_priv *bat_priv,
+ const u8 *addr)
+{
+ struct batadv_orig_node *orig_node;
+ int hash_added;
+
+ orig_node = batadv_orig_hash_find(bat_priv, addr);
+ if (orig_node)
+ return orig_node;
+
+ orig_node = batadv_orig_node_new(bat_priv, addr);
+ if (!orig_node)
+ return NULL;
+
+ kref_get(&orig_node->refcount);
+ hash_added = batadv_hash_add(bat_priv->orig_hash, batadv_compare_orig,
+ batadv_choose_orig, orig_node,
+ &orig_node->hash_entry);
+ if (hash_added != 0) {
+ /* remove refcnt for newly created orig_node and hash entry */
+ batadv_orig_node_put(orig_node);
+ batadv_orig_node_put(orig_node);
+ orig_node = NULL;
+ }
+
+ return orig_node;
+}
+
+/**
+ * batadv_v_ogm_start_queue_timer() - restart the OGM aggregation timer
+ * @hard_iface: the interface to use to send the OGM
+ */
+static void batadv_v_ogm_start_queue_timer(struct batadv_hard_iface *hard_iface)
+{
+ unsigned int msecs = BATADV_MAX_AGGREGATION_MS * 1000;
+
+ /* msecs * [0.9, 1.1] */
+ msecs += get_random_u32_below(msecs / 5) - (msecs / 10);
+ queue_delayed_work(batadv_event_workqueue, &hard_iface->bat_v.aggr_wq,
+ msecs_to_jiffies(msecs / 1000));
+}
+
+/**
+ * batadv_v_ogm_start_timer() - restart the OGM sending timer
+ * @bat_priv: the bat priv with all the mesh interface information
+ */
+static void batadv_v_ogm_start_timer(struct batadv_priv *bat_priv)
+{
+ unsigned long msecs;
+ /* this function may be invoked in different contexts (ogm rescheduling
+ * or hard_iface activation), but the work timer should not be reset
+ */
+ if (delayed_work_pending(&bat_priv->bat_v.ogm_wq))
+ return;
+
+ msecs = atomic_read(&bat_priv->orig_interval) - BATADV_JITTER;
+ msecs += get_random_u32_below(2 * BATADV_JITTER);
+ queue_delayed_work(batadv_event_workqueue, &bat_priv->bat_v.ogm_wq,
+ msecs_to_jiffies(msecs));
+}
+
+/**
+ * batadv_v_ogm_send_to_if() - send a batman ogm using a given interface
+ * @skb: the OGM to send
+ * @hard_iface: the interface to use to send the OGM
+ */
+static void batadv_v_ogm_send_to_if(struct sk_buff *skb,
+ struct batadv_hard_iface *hard_iface)
+{
+ struct batadv_priv *bat_priv = netdev_priv(hard_iface->mesh_iface);
+
+ if (hard_iface->if_status != BATADV_IF_ACTIVE) {
+ kfree_skb(skb);
+ return;
+ }
+
+ batadv_inc_counter(bat_priv, BATADV_CNT_MGMT_TX);
+ batadv_add_counter(bat_priv, BATADV_CNT_MGMT_TX_BYTES,
+ skb->len + ETH_HLEN);
+
+ batadv_send_broadcast_skb(skb, hard_iface);
+}
+
+/**
+ * batadv_v_ogm_len() - OGMv2 packet length
+ * @skb: the OGM to check
+ *
+ * Return: Length of the given OGMv2 packet, including tvlv length, excluding
+ * ethernet header length.
+ */
+static unsigned int batadv_v_ogm_len(struct sk_buff *skb)
+{
+ struct batadv_ogm2_packet *ogm_packet;
+
+ ogm_packet = (struct batadv_ogm2_packet *)skb->data;
+ return BATADV_OGM2_HLEN + ntohs(ogm_packet->tvlv_len);
+}
+
+/**
+ * batadv_v_ogm_queue_left() - check if given OGM still fits aggregation queue
+ * @skb: the OGM to check
+ * @hard_iface: the interface to use to send the OGM
+ *
+ * Caller needs to hold the hard_iface->bat_v.aggr_list.lock.
+ *
+ * Return: True, if the given OGMv2 packet still fits, false otherwise.
+ */
+static bool batadv_v_ogm_queue_left(struct sk_buff *skb,
+ struct batadv_hard_iface *hard_iface)
+{
+ unsigned int max = min_t(unsigned int, hard_iface->net_dev->mtu,
+ BATADV_MAX_AGGREGATION_BYTES);
+ unsigned int ogm_len = batadv_v_ogm_len(skb);
+
+ lockdep_assert_held(&hard_iface->bat_v.aggr_list.lock);
+
+ return hard_iface->bat_v.aggr_len + ogm_len <= max;
+}
+
+/**
+ * batadv_v_ogm_aggr_list_free - free all elements in an aggregation queue
+ * @hard_iface: the interface holding the aggregation queue
+ *
+ * Empties the OGMv2 aggregation queue and frees all the skbs it contains.
+ *
+ * Caller needs to hold the hard_iface->bat_v.aggr_list.lock.
+ */
+static void batadv_v_ogm_aggr_list_free(struct batadv_hard_iface *hard_iface)
+{
+ lockdep_assert_held(&hard_iface->bat_v.aggr_list.lock);
+
+ __skb_queue_purge(&hard_iface->bat_v.aggr_list);
+ hard_iface->bat_v.aggr_len = 0;
+}
+
+/**
+ * batadv_v_ogm_aggr_send() - flush & send aggregation queue
+ * @hard_iface: the interface with the aggregation queue to flush
+ *
+ * Aggregates all OGMv2 packets currently in the aggregation queue into a
+ * single OGMv2 packet and transmits this aggregate.
+ *
+ * The aggregation queue is empty after this call.
+ *
+ * Caller needs to hold the hard_iface->bat_v.aggr_list.lock.
+ */
+static void batadv_v_ogm_aggr_send(struct batadv_hard_iface *hard_iface)
+{
+ unsigned int aggr_len = hard_iface->bat_v.aggr_len;
+ struct sk_buff *skb_aggr;
+ unsigned int ogm_len;
+ struct sk_buff *skb;
+
+ lockdep_assert_held(&hard_iface->bat_v.aggr_list.lock);
+
+ if (!aggr_len)
+ return;
+
+ skb_aggr = dev_alloc_skb(aggr_len + ETH_HLEN + NET_IP_ALIGN);
+ if (!skb_aggr) {
+ batadv_v_ogm_aggr_list_free(hard_iface);
+ return;
+ }
+
+ skb_reserve(skb_aggr, ETH_HLEN + NET_IP_ALIGN);
+ skb_reset_network_header(skb_aggr);
+
+ while ((skb = __skb_dequeue(&hard_iface->bat_v.aggr_list))) {
+ hard_iface->bat_v.aggr_len -= batadv_v_ogm_len(skb);
+
+ ogm_len = batadv_v_ogm_len(skb);
+ skb_put_data(skb_aggr, skb->data, ogm_len);
+
+ consume_skb(skb);
+ }
+
+ batadv_v_ogm_send_to_if(skb_aggr, hard_iface);
+}
+
+/**
+ * batadv_v_ogm_queue_on_if() - queue a batman ogm on a given interface
+ * @skb: the OGM to queue
+ * @hard_iface: the interface to queue the OGM on
+ */
+static void batadv_v_ogm_queue_on_if(struct sk_buff *skb,
+ struct batadv_hard_iface *hard_iface)
+{
+ struct batadv_priv *bat_priv = netdev_priv(hard_iface->mesh_iface);
+
+ if (!atomic_read(&bat_priv->aggregated_ogms)) {
+ batadv_v_ogm_send_to_if(skb, hard_iface);
+ return;
+ }
+
+ spin_lock_bh(&hard_iface->bat_v.aggr_list.lock);
+ if (!batadv_v_ogm_queue_left(skb, hard_iface))
+ batadv_v_ogm_aggr_send(hard_iface);
+
+ hard_iface->bat_v.aggr_len += batadv_v_ogm_len(skb);
+ __skb_queue_tail(&hard_iface->bat_v.aggr_list, skb);
+ spin_unlock_bh(&hard_iface->bat_v.aggr_list.lock);
+}
+
+/**
+ * batadv_v_ogm_send_meshif() - periodic worker broadcasting the own OGM
+ * @bat_priv: the bat priv with all the mesh interface information
+ */
+static void batadv_v_ogm_send_meshif(struct batadv_priv *bat_priv)
+{
+ struct batadv_hard_iface *hard_iface;
+ struct batadv_ogm2_packet *ogm_packet;
+ struct sk_buff *skb, *skb_tmp;
+ unsigned char *ogm_buff;
+ struct list_head *iter;
+ int ogm_buff_len;
+ u16 tvlv_len = 0;
+ int ret;
+
+ lockdep_assert_held(&bat_priv->bat_v.ogm_buff_mutex);
+
+ if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_DEACTIVATING)
+ goto out;
+
+ ogm_buff = bat_priv->bat_v.ogm_buff;
+ ogm_buff_len = bat_priv->bat_v.ogm_buff_len;
+ /* tt changes have to be committed before the tvlv data is
+ * appended as it may alter the tt tvlv container
+ */
+ batadv_tt_local_commit_changes(bat_priv);
+ tvlv_len = batadv_tvlv_container_ogm_append(bat_priv, &ogm_buff,
+ &ogm_buff_len,
+ BATADV_OGM2_HLEN);
+
+ bat_priv->bat_v.ogm_buff = ogm_buff;
+ bat_priv->bat_v.ogm_buff_len = ogm_buff_len;
+
+ skb = netdev_alloc_skb_ip_align(NULL, ETH_HLEN + ogm_buff_len);
+ if (!skb)
+ goto reschedule;
+
+ skb_reserve(skb, ETH_HLEN);
+ skb_put_data(skb, ogm_buff, ogm_buff_len);
+
+ ogm_packet = (struct batadv_ogm2_packet *)skb->data;
+ ogm_packet->seqno = htonl(atomic_read(&bat_priv->bat_v.ogm_seqno));
+ atomic_inc(&bat_priv->bat_v.ogm_seqno);
+ ogm_packet->tvlv_len = htons(tvlv_len);
+
+ /* broadcast on every interface */
+ rcu_read_lock();
+ netdev_for_each_lower_private_rcu(bat_priv->mesh_iface, hard_iface, iter) {
+ if (!kref_get_unless_zero(&hard_iface->refcount))
+ continue;
+
+ ret = batadv_hardif_no_broadcast(hard_iface, NULL, NULL);
+ if (ret) {
+ char *type;
+
+ switch (ret) {
+ case BATADV_HARDIF_BCAST_NORECIPIENT:
+ type = "no neighbor";
+ break;
+ case BATADV_HARDIF_BCAST_DUPFWD:
+ type = "single neighbor is source";
+ break;
+ case BATADV_HARDIF_BCAST_DUPORIG:
+ type = "single neighbor is originator";
+ break;
+ default:
+ type = "unknown";
+ }
+
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "OGM2 from ourselves on %s suppressed: %s\n",
+ hard_iface->net_dev->name, type);
+
+ batadv_hardif_put(hard_iface);
+ continue;
+ }
+
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Sending own OGM2 packet (originator %pM, seqno %u, throughput %u, TTL %d) on interface %s [%pM]\n",
+ ogm_packet->orig, ntohl(ogm_packet->seqno),
+ ntohl(ogm_packet->throughput), ogm_packet->ttl,
+ hard_iface->net_dev->name,
+ hard_iface->net_dev->dev_addr);
+
+ /* this skb gets consumed by batadv_v_ogm_send_to_if() */
+ skb_tmp = skb_clone(skb, GFP_ATOMIC);
+ if (!skb_tmp) {
+ batadv_hardif_put(hard_iface);
+ break;
+ }
+
+ batadv_v_ogm_queue_on_if(skb_tmp, hard_iface);
+ batadv_hardif_put(hard_iface);
+ }
+ rcu_read_unlock();
+
+ consume_skb(skb);
+
+reschedule:
+ batadv_v_ogm_start_timer(bat_priv);
+out:
+ return;
+}
+
+/**
+ * batadv_v_ogm_send() - periodic worker broadcasting the own OGM
+ * @work: work queue item
+ */
+static void batadv_v_ogm_send(struct work_struct *work)
+{
+ struct batadv_priv_bat_v *bat_v;
+ struct batadv_priv *bat_priv;
+
+ bat_v = container_of(work, struct batadv_priv_bat_v, ogm_wq.work);
+ bat_priv = container_of(bat_v, struct batadv_priv, bat_v);
+
+ mutex_lock(&bat_priv->bat_v.ogm_buff_mutex);
+ batadv_v_ogm_send_meshif(bat_priv);
+ mutex_unlock(&bat_priv->bat_v.ogm_buff_mutex);
+}
+
+/**
+ * batadv_v_ogm_aggr_work() - OGM queue periodic task per interface
+ * @work: work queue item
+ *
+ * Emits aggregated OGM messages in regular intervals.
+ */
+void batadv_v_ogm_aggr_work(struct work_struct *work)
+{
+ struct batadv_hard_iface_bat_v *batv;
+ struct batadv_hard_iface *hard_iface;
+
+ batv = container_of(work, struct batadv_hard_iface_bat_v, aggr_wq.work);
+ hard_iface = container_of(batv, struct batadv_hard_iface, bat_v);
+
+ spin_lock_bh(&hard_iface->bat_v.aggr_list.lock);
+ batadv_v_ogm_aggr_send(hard_iface);
+ spin_unlock_bh(&hard_iface->bat_v.aggr_list.lock);
+
+ batadv_v_ogm_start_queue_timer(hard_iface);
+}
+
+/**
+ * batadv_v_ogm_iface_enable() - prepare an interface for B.A.T.M.A.N. V
+ * @hard_iface: the interface to prepare
+ *
+ * Takes care of scheduling its own OGM sending routine for this interface.
+ *
+ * Return: 0 on success or a negative error code otherwise
+ */
+int batadv_v_ogm_iface_enable(struct batadv_hard_iface *hard_iface)
+{
+ struct batadv_priv *bat_priv = netdev_priv(hard_iface->mesh_iface);
+
+ batadv_v_ogm_start_queue_timer(hard_iface);
+ batadv_v_ogm_start_timer(bat_priv);
+
+ return 0;
+}
+
+/**
+ * batadv_v_ogm_iface_disable() - release OGM interface private resources
+ * @hard_iface: interface for which the resources have to be released
+ */
+void batadv_v_ogm_iface_disable(struct batadv_hard_iface *hard_iface)
+{
+ cancel_delayed_work_sync(&hard_iface->bat_v.aggr_wq);
+
+ spin_lock_bh(&hard_iface->bat_v.aggr_list.lock);
+ batadv_v_ogm_aggr_list_free(hard_iface);
+ spin_unlock_bh(&hard_iface->bat_v.aggr_list.lock);
+}
+
+/**
+ * batadv_v_ogm_primary_iface_set() - set a new primary interface
+ * @primary_iface: the new primary interface
+ */
+void batadv_v_ogm_primary_iface_set(struct batadv_hard_iface *primary_iface)
+{
+ struct batadv_priv *bat_priv = netdev_priv(primary_iface->mesh_iface);
+ struct batadv_ogm2_packet *ogm_packet;
+
+ mutex_lock(&bat_priv->bat_v.ogm_buff_mutex);
+ if (!bat_priv->bat_v.ogm_buff)
+ goto unlock;
+
+ ogm_packet = (struct batadv_ogm2_packet *)bat_priv->bat_v.ogm_buff;
+ ether_addr_copy(ogm_packet->orig, primary_iface->net_dev->dev_addr);
+
+unlock:
+ mutex_unlock(&bat_priv->bat_v.ogm_buff_mutex);
+}
+
+/**
+ * batadv_v_forward_penalty() - apply a penalty to the throughput metric
+ * forwarded with B.A.T.M.A.N. V OGMs
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @if_incoming: the interface where the OGM has been received
+ * @if_outgoing: the interface where the OGM has to be forwarded to
+ * @throughput: the current throughput
+ *
+ * Apply a penalty on the current throughput metric value based on the
+ * characteristic of the interface where the OGM has been received.
+ *
+ * Initially the per hardif hop penalty is applied to the throughput. After
+ * that the return value is then computed as follows:
+ * - throughput * 50% if the incoming and outgoing interface are the
+ * same WiFi interface and the throughput is above
+ * 1MBit/s
+ * - throughput if the outgoing interface is the default
+ * interface (i.e. this OGM is processed for the
+ * internal table and not forwarded)
+ * - throughput * node hop penalty otherwise
+ *
+ * Return: the penalised throughput metric.
+ */
+static u32 batadv_v_forward_penalty(struct batadv_priv *bat_priv,
+ struct batadv_hard_iface *if_incoming,
+ struct batadv_hard_iface *if_outgoing,
+ u32 throughput)
+{
+ int if_hop_penalty = atomic_read(&if_incoming->hop_penalty);
+ int hop_penalty = atomic_read(&bat_priv->hop_penalty);
+ int hop_penalty_max = BATADV_TQ_MAX_VALUE;
+
+ /* Apply per hardif hop penalty */
+ throughput = throughput * (hop_penalty_max - if_hop_penalty) /
+ hop_penalty_max;
+
+ /* Don't apply hop penalty in default originator table. */
+ if (if_outgoing == BATADV_IF_DEFAULT)
+ return throughput;
+
+ /* Forwarding on the same WiFi interface cuts the throughput in half
+ * due to the store & forward characteristics of WIFI.
+ * Very low throughput values are the exception.
+ */
+ if (throughput > 10 &&
+ if_incoming == if_outgoing &&
+ !(if_incoming->bat_v.flags & BATADV_FULL_DUPLEX))
+ return throughput / 2;
+
+ /* hop penalty of 255 equals 100% */
+ return throughput * (hop_penalty_max - hop_penalty) / hop_penalty_max;
+}
+
+/**
+ * batadv_v_ogm_forward() - check conditions and forward an OGM to the given
+ * outgoing interface
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @ogm_received: previously received OGM to be forwarded
+ * @orig_node: the originator which has been updated
+ * @neigh_node: the neigh_node through with the OGM has been received
+ * @if_incoming: the interface on which this OGM was received on
+ * @if_outgoing: the interface to which the OGM has to be forwarded to
+ *
+ * Forward an OGM to an interface after having altered the throughput metric and
+ * the TTL value contained in it. The original OGM isn't modified.
+ */
+static void batadv_v_ogm_forward(struct batadv_priv *bat_priv,
+ const struct batadv_ogm2_packet *ogm_received,
+ struct batadv_orig_node *orig_node,
+ struct batadv_neigh_node *neigh_node,
+ struct batadv_hard_iface *if_incoming,
+ struct batadv_hard_iface *if_outgoing)
+{
+ struct batadv_neigh_ifinfo *neigh_ifinfo = NULL;
+ struct batadv_orig_ifinfo *orig_ifinfo = NULL;
+ struct batadv_neigh_node *router = NULL;
+ struct batadv_ogm2_packet *ogm_forward;
+ unsigned char *skb_buff;
+ struct sk_buff *skb;
+ size_t packet_len;
+ u16 tvlv_len;
+
+ /* only forward for specific interfaces, not for the default one. */
+ if (if_outgoing == BATADV_IF_DEFAULT)
+ goto out;
+
+ orig_ifinfo = batadv_orig_ifinfo_new(orig_node, if_outgoing);
+ if (!orig_ifinfo)
+ goto out;
+
+ /* acquire possibly updated router */
+ router = batadv_orig_router_get(orig_node, if_outgoing);
+
+ /* strict rule: forward packets coming from the best next hop only */
+ if (neigh_node != router)
+ goto out;
+
+ /* don't forward the same seqno twice on one interface */
+ if (orig_ifinfo->last_seqno_forwarded == ntohl(ogm_received->seqno))
+ goto out;
+
+ orig_ifinfo->last_seqno_forwarded = ntohl(ogm_received->seqno);
+
+ if (ogm_received->ttl <= 1) {
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "ttl exceeded\n");
+ goto out;
+ }
+
+ neigh_ifinfo = batadv_neigh_ifinfo_get(neigh_node, if_outgoing);
+ if (!neigh_ifinfo)
+ goto out;
+
+ tvlv_len = ntohs(ogm_received->tvlv_len);
+
+ packet_len = BATADV_OGM2_HLEN + tvlv_len;
+ skb = netdev_alloc_skb_ip_align(if_outgoing->net_dev,
+ ETH_HLEN + packet_len);
+ if (!skb)
+ goto out;
+
+ skb_reserve(skb, ETH_HLEN);
+ skb_buff = skb_put_data(skb, ogm_received, packet_len);
+
+ /* apply forward penalty */
+ ogm_forward = (struct batadv_ogm2_packet *)skb_buff;
+ ogm_forward->throughput = htonl(neigh_ifinfo->bat_v.throughput);
+ ogm_forward->ttl--;
+
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Forwarding OGM2 packet on %s: throughput %u, ttl %u, received via %s\n",
+ if_outgoing->net_dev->name, ntohl(ogm_forward->throughput),
+ ogm_forward->ttl, if_incoming->net_dev->name);
+
+ batadv_v_ogm_queue_on_if(skb, if_outgoing);
+
+out:
+ batadv_orig_ifinfo_put(orig_ifinfo);
+ batadv_neigh_node_put(router);
+ batadv_neigh_ifinfo_put(neigh_ifinfo);
+}
+
+/**
+ * batadv_v_ogm_metric_update() - update route metric based on OGM
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @ogm2: OGM2 structure
+ * @orig_node: Originator structure for which the OGM has been received
+ * @neigh_node: the neigh_node through with the OGM has been received
+ * @if_incoming: the interface where this packet was received
+ * @if_outgoing: the interface for which the packet should be considered
+ *
+ * Return:
+ * 1 if the OGM is new,
+ * 0 if it is not new but valid,
+ * <0 on error (e.g. old OGM)
+ */
+static int batadv_v_ogm_metric_update(struct batadv_priv *bat_priv,
+ const struct batadv_ogm2_packet *ogm2,
+ struct batadv_orig_node *orig_node,
+ struct batadv_neigh_node *neigh_node,
+ struct batadv_hard_iface *if_incoming,
+ struct batadv_hard_iface *if_outgoing)
+{
+ struct batadv_orig_ifinfo *orig_ifinfo;
+ struct batadv_neigh_ifinfo *neigh_ifinfo = NULL;
+ bool protection_started = false;
+ int ret = -EINVAL;
+ u32 path_throughput;
+ s32 seq_diff;
+
+ orig_ifinfo = batadv_orig_ifinfo_new(orig_node, if_outgoing);
+ if (!orig_ifinfo)
+ goto out;
+
+ seq_diff = ntohl(ogm2->seqno) - orig_ifinfo->last_real_seqno;
+
+ if (!hlist_empty(&orig_node->neigh_list) &&
+ batadv_window_protected(bat_priv, seq_diff,
+ BATADV_OGM_MAX_AGE,
+ &orig_ifinfo->batman_seqno_reset,
+ &protection_started)) {
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Drop packet: packet within window protection time from %pM\n",
+ ogm2->orig);
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Last reset: %ld, %ld\n",
+ orig_ifinfo->batman_seqno_reset, jiffies);
+ goto out;
+ }
+
+ /* drop packets with old seqnos, however accept the first packet after
+ * a host has been rebooted.
+ */
+ if (seq_diff < 0 && !protection_started)
+ goto out;
+
+ neigh_node->last_seen = jiffies;
+
+ orig_node->last_seen = jiffies;
+
+ orig_ifinfo->last_real_seqno = ntohl(ogm2->seqno);
+ orig_ifinfo->last_ttl = ogm2->ttl;
+
+ neigh_ifinfo = batadv_neigh_ifinfo_new(neigh_node, if_outgoing);
+ if (!neigh_ifinfo)
+ goto out;
+
+ path_throughput = batadv_v_forward_penalty(bat_priv, if_incoming,
+ if_outgoing,
+ ntohl(ogm2->throughput));
+ neigh_ifinfo->bat_v.throughput = path_throughput;
+ neigh_ifinfo->bat_v.last_seqno = ntohl(ogm2->seqno);
+ neigh_ifinfo->last_ttl = ogm2->ttl;
+
+ if (seq_diff > 0 || protection_started)
+ ret = 1;
+ else
+ ret = 0;
+out:
+ batadv_orig_ifinfo_put(orig_ifinfo);
+ batadv_neigh_ifinfo_put(neigh_ifinfo);
+
+ return ret;
+}
+
+/**
+ * batadv_v_ogm_route_update() - update routes based on OGM
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @ethhdr: the Ethernet header of the OGM2
+ * @ogm2: OGM2 structure
+ * @orig_node: Originator structure for which the OGM has been received
+ * @neigh_node: the neigh_node through with the OGM has been received
+ * @if_incoming: the interface where this packet was received
+ * @if_outgoing: the interface for which the packet should be considered
+ *
+ * Return: true if the packet should be forwarded, false otherwise
+ */
+static bool batadv_v_ogm_route_update(struct batadv_priv *bat_priv,
+ const struct ethhdr *ethhdr,
+ const struct batadv_ogm2_packet *ogm2,
+ struct batadv_orig_node *orig_node,
+ struct batadv_neigh_node *neigh_node,
+ struct batadv_hard_iface *if_incoming,
+ struct batadv_hard_iface *if_outgoing)
+{
+ struct batadv_neigh_node *router = NULL;
+ struct batadv_orig_node *orig_neigh_node;
+ struct batadv_neigh_node *orig_neigh_router = NULL;
+ struct batadv_neigh_ifinfo *router_ifinfo = NULL, *neigh_ifinfo = NULL;
+ u32 router_throughput, neigh_throughput;
+ u32 router_last_seqno;
+ u32 neigh_last_seqno;
+ s32 neigh_seq_diff;
+ bool forward = false;
+
+ orig_neigh_node = batadv_v_ogm_orig_get(bat_priv, ethhdr->h_source);
+ if (!orig_neigh_node)
+ goto out;
+
+ orig_neigh_router = batadv_orig_router_get(orig_neigh_node,
+ if_outgoing);
+
+ /* drop packet if sender is not a direct neighbor and if we
+ * don't route towards it
+ */
+ router = batadv_orig_router_get(orig_node, if_outgoing);
+ if (router && router->orig_node != orig_node && !orig_neigh_router) {
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Drop packet: OGM via unknown neighbor!\n");
+ goto out;
+ }
+
+ /* Mark the OGM to be considered for forwarding, and update routes
+ * if needed.
+ */
+ forward = true;
+
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Searching and updating originator entry of received packet\n");
+
+ /* if this neighbor already is our next hop there is nothing
+ * to change
+ */
+ if (router == neigh_node)
+ goto out;
+
+ /* don't consider neighbours with worse throughput.
+ * also switch route if this seqno is BATADV_V_MAX_ORIGDIFF newer than
+ * the last received seqno from our best next hop.
+ */
+ if (router) {
+ router_ifinfo = batadv_neigh_ifinfo_get(router, if_outgoing);
+ neigh_ifinfo = batadv_neigh_ifinfo_get(neigh_node, if_outgoing);
+
+ /* if these are not allocated, something is wrong. */
+ if (!router_ifinfo || !neigh_ifinfo)
+ goto out;
+
+ neigh_last_seqno = neigh_ifinfo->bat_v.last_seqno;
+ router_last_seqno = router_ifinfo->bat_v.last_seqno;
+ neigh_seq_diff = neigh_last_seqno - router_last_seqno;
+ router_throughput = router_ifinfo->bat_v.throughput;
+ neigh_throughput = neigh_ifinfo->bat_v.throughput;
+
+ if (neigh_seq_diff < BATADV_OGM_MAX_ORIGDIFF &&
+ router_throughput >= neigh_throughput)
+ goto out;
+ }
+
+ batadv_update_route(bat_priv, orig_node, if_outgoing, neigh_node);
+out:
+ batadv_neigh_node_put(router);
+ batadv_neigh_node_put(orig_neigh_router);
+ batadv_orig_node_put(orig_neigh_node);
+ batadv_neigh_ifinfo_put(router_ifinfo);
+ batadv_neigh_ifinfo_put(neigh_ifinfo);
+
+ return forward;
+}
+
+/**
+ * batadv_v_ogm_process_per_outif() - process a batman v OGM for an outgoing if
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @ethhdr: the Ethernet header of the OGM2
+ * @ogm2: OGM2 structure
+ * @orig_node: Originator structure for which the OGM has been received
+ * @neigh_node: the neigh_node through with the OGM has been received
+ * @if_incoming: the interface where this packet was received
+ * @if_outgoing: the interface for which the packet should be considered
+ */
+static void
+batadv_v_ogm_process_per_outif(struct batadv_priv *bat_priv,
+ const struct ethhdr *ethhdr,
+ const struct batadv_ogm2_packet *ogm2,
+ struct batadv_orig_node *orig_node,
+ struct batadv_neigh_node *neigh_node,
+ struct batadv_hard_iface *if_incoming,
+ struct batadv_hard_iface *if_outgoing)
+{
+ int seqno_age;
+ bool forward;
+
+ /* first, update the metric with according sanity checks */
+ seqno_age = batadv_v_ogm_metric_update(bat_priv, ogm2, orig_node,
+ neigh_node, if_incoming,
+ if_outgoing);
+
+ /* outdated sequence numbers are to be discarded */
+ if (seqno_age < 0)
+ return;
+
+ /* only unknown & newer OGMs contain TVLVs we are interested in */
+ if (seqno_age > 0 && if_outgoing == BATADV_IF_DEFAULT)
+ batadv_tvlv_containers_process(bat_priv, BATADV_OGM2, orig_node,
+ NULL,
+ (unsigned char *)(ogm2 + 1),
+ ntohs(ogm2->tvlv_len));
+
+ /* if the metric update went through, update routes if needed */
+ forward = batadv_v_ogm_route_update(bat_priv, ethhdr, ogm2, orig_node,
+ neigh_node, if_incoming,
+ if_outgoing);
+
+ /* if the routes have been processed correctly, check and forward */
+ if (forward)
+ batadv_v_ogm_forward(bat_priv, ogm2, orig_node, neigh_node,
+ if_incoming, if_outgoing);
+}
+
+/**
+ * batadv_v_ogm_aggr_packet() - checks if there is another OGM aggregated
+ * @buff_pos: current position in the skb
+ * @packet_len: total length of the skb
+ * @ogm2_packet: potential OGM2 in buffer
+ *
+ * Return: true if there is enough space for another OGM, false otherwise.
+ */
+static bool
+batadv_v_ogm_aggr_packet(int buff_pos, int packet_len,
+ const struct batadv_ogm2_packet *ogm2_packet)
+{
+ int next_buff_pos = 0;
+
+ /* check if there is enough space for the header */
+ next_buff_pos += buff_pos + sizeof(*ogm2_packet);
+ if (next_buff_pos > packet_len)
+ return false;
+
+ /* check if there is enough space for the optional TVLV */
+ next_buff_pos += ntohs(ogm2_packet->tvlv_len);
+
+ return next_buff_pos <= packet_len;
+}
+
+/**
+ * batadv_v_ogm_process() - process an incoming batman v OGM
+ * @skb: the skb containing the OGM
+ * @ogm_offset: offset to the OGM which should be processed (for aggregates)
+ * @if_incoming: the interface where this packet was received
+ */
+static void batadv_v_ogm_process(const struct sk_buff *skb, int ogm_offset,
+ struct batadv_hard_iface *if_incoming)
+{
+ struct batadv_priv *bat_priv = netdev_priv(if_incoming->mesh_iface);
+ struct ethhdr *ethhdr;
+ struct batadv_orig_node *orig_node = NULL;
+ struct batadv_hardif_neigh_node *hardif_neigh = NULL;
+ struct batadv_neigh_node *neigh_node = NULL;
+ struct batadv_hard_iface *hard_iface;
+ struct batadv_ogm2_packet *ogm_packet;
+ u32 ogm_throughput, link_throughput, path_throughput;
+ struct list_head *iter;
+ int ret;
+
+ ethhdr = eth_hdr(skb);
+ ogm_packet = (struct batadv_ogm2_packet *)(skb->data + ogm_offset);
+
+ ogm_throughput = ntohl(ogm_packet->throughput);
+
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Received OGM2 packet via NB: %pM, IF: %s [%pM] (from OG: %pM, seqno %u, throughput %u, TTL %u, V %u, tvlv_len %u)\n",
+ ethhdr->h_source, if_incoming->net_dev->name,
+ if_incoming->net_dev->dev_addr, ogm_packet->orig,
+ ntohl(ogm_packet->seqno), ogm_throughput, ogm_packet->ttl,
+ ogm_packet->version, ntohs(ogm_packet->tvlv_len));
+
+ if (batadv_is_my_mac(bat_priv, ogm_packet->orig)) {
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Drop packet: originator packet from ourself\n");
+ return;
+ }
+
+ /* If the throughput metric is 0, immediately drop the packet. No need
+ * to create orig_node / neigh_node for an unusable route.
+ */
+ if (ogm_throughput == 0) {
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Drop packet: originator packet with throughput metric of 0\n");
+ return;
+ }
+
+ /* require ELP packets be to received from this neighbor first */
+ hardif_neigh = batadv_hardif_neigh_get(if_incoming, ethhdr->h_source);
+ if (!hardif_neigh) {
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Drop packet: OGM via unknown neighbor!\n");
+ goto out;
+ }
+
+ orig_node = batadv_v_ogm_orig_get(bat_priv, ogm_packet->orig);
+ if (!orig_node)
+ goto out;
+
+ neigh_node = batadv_neigh_node_get_or_create(orig_node, if_incoming,
+ ethhdr->h_source);
+ if (!neigh_node)
+ goto out;
+
+ /* Update the received throughput metric to match the link
+ * characteristic:
+ * - If this OGM traveled one hop so far (emitted by single hop
+ * neighbor) the path throughput metric equals the link throughput.
+ * - For OGMs traversing more than hop the path throughput metric is
+ * the smaller of the path throughput and the link throughput.
+ */
+ link_throughput = ewma_throughput_read(&hardif_neigh->bat_v.throughput);
+ path_throughput = min_t(u32, link_throughput, ogm_throughput);
+ ogm_packet->throughput = htonl(path_throughput);
+
+ batadv_v_ogm_process_per_outif(bat_priv, ethhdr, ogm_packet, orig_node,
+ neigh_node, if_incoming,
+ BATADV_IF_DEFAULT);
+
+ rcu_read_lock();
+ netdev_for_each_lower_private_rcu(bat_priv->mesh_iface, hard_iface, iter) {
+ if (hard_iface->if_status != BATADV_IF_ACTIVE)
+ continue;
+
+ if (!kref_get_unless_zero(&hard_iface->refcount))
+ continue;
+
+ ret = batadv_hardif_no_broadcast(hard_iface,
+ ogm_packet->orig,
+ hardif_neigh->orig);
+
+ if (ret) {
+ char *type;
+
+ switch (ret) {
+ case BATADV_HARDIF_BCAST_NORECIPIENT:
+ type = "no neighbor";
+ break;
+ case BATADV_HARDIF_BCAST_DUPFWD:
+ type = "single neighbor is source";
+ break;
+ case BATADV_HARDIF_BCAST_DUPORIG:
+ type = "single neighbor is originator";
+ break;
+ default:
+ type = "unknown";
+ }
+
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "OGM2 packet from %pM on %s suppressed: %s\n",
+ ogm_packet->orig, hard_iface->net_dev->name,
+ type);
+
+ batadv_hardif_put(hard_iface);
+ continue;
+ }
+
+ batadv_v_ogm_process_per_outif(bat_priv, ethhdr, ogm_packet,
+ orig_node, neigh_node,
+ if_incoming, hard_iface);
+
+ batadv_hardif_put(hard_iface);
+ }
+ rcu_read_unlock();
+out:
+ batadv_orig_node_put(orig_node);
+ batadv_neigh_node_put(neigh_node);
+ batadv_hardif_neigh_put(hardif_neigh);
+}
+
+/**
+ * batadv_v_ogm_packet_recv() - OGM2 receiving handler
+ * @skb: the received OGM
+ * @if_incoming: the interface where this OGM has been received
+ *
+ * Return: NET_RX_SUCCESS and consume the skb on success or returns NET_RX_DROP
+ * (without freeing the skb) on failure
+ */
+int batadv_v_ogm_packet_recv(struct sk_buff *skb,
+ struct batadv_hard_iface *if_incoming)
+{
+ struct batadv_priv *bat_priv = netdev_priv(if_incoming->mesh_iface);
+ struct batadv_ogm2_packet *ogm_packet;
+ struct ethhdr *ethhdr;
+ int ogm_offset;
+ u8 *packet_pos;
+ int ret = NET_RX_DROP;
+
+ /* did we receive a OGM2 packet on an interface that does not have
+ * B.A.T.M.A.N. V enabled ?
+ */
+ if (strcmp(bat_priv->algo_ops->name, "BATMAN_V") != 0)
+ goto free_skb;
+
+ if (!batadv_check_management_packet(skb, if_incoming, BATADV_OGM2_HLEN))
+ goto free_skb;
+
+ ethhdr = eth_hdr(skb);
+ if (batadv_is_my_mac(bat_priv, ethhdr->h_source))
+ goto free_skb;
+
+ batadv_inc_counter(bat_priv, BATADV_CNT_MGMT_RX);
+ batadv_add_counter(bat_priv, BATADV_CNT_MGMT_RX_BYTES,
+ skb->len + ETH_HLEN);
+
+ ogm_offset = 0;
+ ogm_packet = (struct batadv_ogm2_packet *)skb->data;
+
+ while (batadv_v_ogm_aggr_packet(ogm_offset, skb_headlen(skb),
+ ogm_packet)) {
+ batadv_v_ogm_process(skb, ogm_offset, if_incoming);
+
+ ogm_offset += BATADV_OGM2_HLEN;
+ ogm_offset += ntohs(ogm_packet->tvlv_len);
+
+ packet_pos = skb->data + ogm_offset;
+ ogm_packet = (struct batadv_ogm2_packet *)packet_pos;
+ }
+
+ ret = NET_RX_SUCCESS;
+
+free_skb:
+ if (ret == NET_RX_SUCCESS)
+ consume_skb(skb);
+ else
+ kfree_skb(skb);
+
+ return ret;
+}
+
+/**
+ * batadv_v_ogm_init() - initialise the OGM2 engine
+ * @bat_priv: the bat priv with all the mesh interface information
+ *
+ * Return: 0 on success or a negative error code in case of failure
+ */
+int batadv_v_ogm_init(struct batadv_priv *bat_priv)
+{
+ struct batadv_ogm2_packet *ogm_packet;
+ unsigned char *ogm_buff;
+ u32 random_seqno;
+
+ bat_priv->bat_v.ogm_buff_len = BATADV_OGM2_HLEN;
+ ogm_buff = kzalloc(bat_priv->bat_v.ogm_buff_len, GFP_ATOMIC);
+ if (!ogm_buff)
+ return -ENOMEM;
+
+ bat_priv->bat_v.ogm_buff = ogm_buff;
+ ogm_packet = (struct batadv_ogm2_packet *)ogm_buff;
+ ogm_packet->packet_type = BATADV_OGM2;
+ ogm_packet->version = BATADV_COMPAT_VERSION;
+ ogm_packet->ttl = BATADV_TTL;
+ ogm_packet->flags = BATADV_NO_FLAGS;
+ ogm_packet->throughput = htonl(BATADV_THROUGHPUT_MAX_VALUE);
+
+ /* randomize initial seqno to avoid collision */
+ get_random_bytes(&random_seqno, sizeof(random_seqno));
+ atomic_set(&bat_priv->bat_v.ogm_seqno, random_seqno);
+ INIT_DELAYED_WORK(&bat_priv->bat_v.ogm_wq, batadv_v_ogm_send);
+
+ mutex_init(&bat_priv->bat_v.ogm_buff_mutex);
+
+ return 0;
+}
+
+/**
+ * batadv_v_ogm_free() - free OGM private resources
+ * @bat_priv: the bat priv with all the mesh interface information
+ */
+void batadv_v_ogm_free(struct batadv_priv *bat_priv)
+{
+ cancel_delayed_work_sync(&bat_priv->bat_v.ogm_wq);
+
+ mutex_lock(&bat_priv->bat_v.ogm_buff_mutex);
+
+ kfree(bat_priv->bat_v.ogm_buff);
+ bat_priv->bat_v.ogm_buff = NULL;
+ bat_priv->bat_v.ogm_buff_len = 0;
+
+ mutex_unlock(&bat_priv->bat_v.ogm_buff_mutex);
+}
diff --git a/net/batman-adv/bat_v_ogm.h b/net/batman-adv/bat_v_ogm.h
new file mode 100644
index 000000000000..edeffedecade
--- /dev/null
+++ b/net/batman-adv/bat_v_ogm.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) B.A.T.M.A.N. contributors:
+ *
+ * Antonio Quartulli
+ */
+
+#ifndef _NET_BATMAN_ADV_BAT_V_OGM_H_
+#define _NET_BATMAN_ADV_BAT_V_OGM_H_
+
+#include "main.h"
+
+#include <linux/skbuff.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+
+int batadv_v_ogm_init(struct batadv_priv *bat_priv);
+void batadv_v_ogm_free(struct batadv_priv *bat_priv);
+void batadv_v_ogm_aggr_work(struct work_struct *work);
+int batadv_v_ogm_iface_enable(struct batadv_hard_iface *hard_iface);
+void batadv_v_ogm_iface_disable(struct batadv_hard_iface *hard_iface);
+struct batadv_orig_node *batadv_v_ogm_orig_get(struct batadv_priv *bat_priv,
+ const u8 *addr);
+void batadv_v_ogm_primary_iface_set(struct batadv_hard_iface *primary_iface);
+int batadv_v_ogm_packet_recv(struct sk_buff *skb,
+ struct batadv_hard_iface *if_incoming);
+
+#endif /* _NET_BATMAN_ADV_BAT_V_OGM_H_ */
diff --git a/net/batman-adv/bitarray.c b/net/batman-adv/bitarray.c
new file mode 100644
index 000000000000..2c49b2711650
--- /dev/null
+++ b/net/batman-adv/bitarray.c
@@ -0,0 +1,89 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) B.A.T.M.A.N. contributors:
+ *
+ * Simon Wunderlich, Marek Lindner
+ */
+
+#include "bitarray.h"
+#include "main.h"
+
+#include <linux/bitmap.h>
+
+#include "log.h"
+
+/* shift the packet array by n places. */
+static void batadv_bitmap_shift_left(unsigned long *seq_bits, s32 n)
+{
+ if (n <= 0 || n >= BATADV_TQ_LOCAL_WINDOW_SIZE)
+ return;
+
+ bitmap_shift_left(seq_bits, seq_bits, n, BATADV_TQ_LOCAL_WINDOW_SIZE);
+}
+
+/**
+ * batadv_bit_get_packet() - receive and process one packet within the sequence
+ * number window
+ * @priv: the bat priv with all the mesh interface information
+ * @seq_bits: pointer to the sequence number receive packet
+ * @seq_num_diff: difference between the current/received sequence number and
+ * the last sequence number
+ * @set_mark: whether this packet should be marked in seq_bits
+ *
+ * Return: true if the window was moved (either new or very old),
+ * false if the window was not moved/shifted.
+ */
+bool batadv_bit_get_packet(void *priv, unsigned long *seq_bits,
+ s32 seq_num_diff, int set_mark)
+{
+ struct batadv_priv *bat_priv = priv;
+
+ /* sequence number is slightly older. We already got a sequence number
+ * higher than this one, so we just mark it.
+ */
+ if (seq_num_diff <= 0 && seq_num_diff > -BATADV_TQ_LOCAL_WINDOW_SIZE) {
+ if (set_mark)
+ batadv_set_bit(seq_bits, -seq_num_diff);
+ return false;
+ }
+
+ /* sequence number is slightly newer, so we shift the window and
+ * set the mark if required
+ */
+ if (seq_num_diff > 0 && seq_num_diff < BATADV_TQ_LOCAL_WINDOW_SIZE) {
+ batadv_bitmap_shift_left(seq_bits, seq_num_diff);
+
+ if (set_mark)
+ batadv_set_bit(seq_bits, 0);
+ return true;
+ }
+
+ /* sequence number is much newer, probably missed a lot of packets */
+ if (seq_num_diff >= BATADV_TQ_LOCAL_WINDOW_SIZE &&
+ seq_num_diff < BATADV_EXPECTED_SEQNO_RANGE) {
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "We missed a lot of packets (%i) !\n",
+ seq_num_diff - 1);
+ bitmap_zero(seq_bits, BATADV_TQ_LOCAL_WINDOW_SIZE);
+ if (set_mark)
+ batadv_set_bit(seq_bits, 0);
+ return true;
+ }
+
+ /* received a much older packet. The other host either restarted
+ * or the old packet got delayed somewhere in the network. The
+ * packet should be dropped without calling this function if the
+ * seqno window is protected.
+ *
+ * seq_num_diff <= -BATADV_TQ_LOCAL_WINDOW_SIZE
+ * or
+ * seq_num_diff >= BATADV_EXPECTED_SEQNO_RANGE
+ */
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Other host probably restarted!\n");
+
+ bitmap_zero(seq_bits, BATADV_TQ_LOCAL_WINDOW_SIZE);
+ if (set_mark)
+ batadv_set_bit(seq_bits, 0);
+
+ return true;
+}
diff --git a/net/batman-adv/bitarray.h b/net/batman-adv/bitarray.h
new file mode 100644
index 000000000000..37f7ae413bc6
--- /dev/null
+++ b/net/batman-adv/bitarray.h
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) B.A.T.M.A.N. contributors:
+ *
+ * Simon Wunderlich, Marek Lindner
+ */
+
+#ifndef _NET_BATMAN_ADV_BITARRAY_H_
+#define _NET_BATMAN_ADV_BITARRAY_H_
+
+#include "main.h"
+
+#include <linux/bitops.h>
+#include <linux/compiler.h>
+#include <linux/stddef.h>
+#include <linux/types.h>
+
+/**
+ * batadv_test_bit() - check if bit is set in the current window
+ *
+ * @seq_bits: pointer to the sequence number receive packet
+ * @last_seqno: latest sequence number in seq_bits
+ * @curr_seqno: sequence number to test for
+ *
+ * Return: true if the corresponding bit in the given seq_bits indicates true
+ * and curr_seqno is within range of last_seqno. Otherwise returns false.
+ */
+static inline bool batadv_test_bit(const unsigned long *seq_bits,
+ u32 last_seqno, u32 curr_seqno)
+{
+ s32 diff;
+
+ diff = last_seqno - curr_seqno;
+ if (diff < 0 || diff >= BATADV_TQ_LOCAL_WINDOW_SIZE)
+ return false;
+ return test_bit(diff, seq_bits) != 0;
+}
+
+/**
+ * batadv_set_bit() - Turn corresponding bit on, so we can remember that we got
+ * the packet
+ * @seq_bits: bitmap of the packet receive window
+ * @n: relative sequence number of newly received packet
+ */
+static inline void batadv_set_bit(unsigned long *seq_bits, s32 n)
+{
+ /* if too old, just drop it */
+ if (n < 0 || n >= BATADV_TQ_LOCAL_WINDOW_SIZE)
+ return;
+
+ set_bit(n, seq_bits); /* turn the position on */
+}
+
+bool batadv_bit_get_packet(void *priv, unsigned long *seq_bits,
+ s32 seq_num_diff, int set_mark);
+
+#endif /* _NET_BATMAN_ADV_BITARRAY_H_ */
diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c
new file mode 100644
index 000000000000..3dc791c15bf7
--- /dev/null
+++ b/net/batman-adv/bridge_loop_avoidance.c
@@ -0,0 +1,2482 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) B.A.T.M.A.N. contributors:
+ *
+ * Simon Wunderlich
+ */
+
+#include "bridge_loop_avoidance.h"
+#include "main.h"
+
+#include <linux/atomic.h>
+#include <linux/byteorder/generic.h>
+#include <linux/compiler.h>
+#include <linux/container_of.h>
+#include <linux/crc16.h>
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/etherdevice.h>
+#include <linux/gfp.h>
+#include <linux/if_arp.h>
+#include <linux/if_ether.h>
+#include <linux/if_vlan.h>
+#include <linux/jhash.h>
+#include <linux/jiffies.h>
+#include <linux/kref.h>
+#include <linux/list.h>
+#include <linux/lockdep.h>
+#include <linux/netdevice.h>
+#include <linux/netlink.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/sprintf.h>
+#include <linux/stddef.h>
+#include <linux/string.h>
+#include <linux/string_choices.h>
+#include <linux/workqueue.h>
+#include <net/arp.h>
+#include <net/genetlink.h>
+#include <net/netlink.h>
+#include <uapi/linux/batadv_packet.h>
+#include <uapi/linux/batman_adv.h>
+
+#include "hard-interface.h"
+#include "hash.h"
+#include "log.h"
+#include "netlink.h"
+#include "originator.h"
+#include "translation-table.h"
+
+static const u8 batadv_announce_mac[4] = {0x43, 0x05, 0x43, 0x05};
+
+static void batadv_bla_periodic_work(struct work_struct *work);
+static void
+batadv_bla_send_announce(struct batadv_priv *bat_priv,
+ struct batadv_bla_backbone_gw *backbone_gw);
+
+/**
+ * batadv_choose_claim() - choose the right bucket for a claim.
+ * @data: data to hash
+ * @size: size of the hash table
+ *
+ * Return: the hash index of the claim
+ */
+static inline u32 batadv_choose_claim(const void *data, u32 size)
+{
+ const struct batadv_bla_claim *claim = data;
+ u32 hash = 0;
+
+ hash = jhash(&claim->addr, sizeof(claim->addr), hash);
+ hash = jhash(&claim->vid, sizeof(claim->vid), hash);
+
+ return hash % size;
+}
+
+/**
+ * batadv_choose_backbone_gw() - choose the right bucket for a backbone gateway.
+ * @data: data to hash
+ * @size: size of the hash table
+ *
+ * Return: the hash index of the backbone gateway
+ */
+static inline u32 batadv_choose_backbone_gw(const void *data, u32 size)
+{
+ const struct batadv_bla_backbone_gw *gw;
+ u32 hash = 0;
+
+ gw = data;
+ hash = jhash(&gw->orig, sizeof(gw->orig), hash);
+ hash = jhash(&gw->vid, sizeof(gw->vid), hash);
+
+ return hash % size;
+}
+
+/**
+ * batadv_compare_backbone_gw() - compare address and vid of two backbone gws
+ * @node: list node of the first entry to compare
+ * @data2: pointer to the second backbone gateway
+ *
+ * Return: true if the backbones have the same data, false otherwise
+ */
+static bool batadv_compare_backbone_gw(const struct hlist_node *node,
+ const void *data2)
+{
+ const void *data1 = container_of(node, struct batadv_bla_backbone_gw,
+ hash_entry);
+ const struct batadv_bla_backbone_gw *gw1 = data1;
+ const struct batadv_bla_backbone_gw *gw2 = data2;
+
+ if (!batadv_compare_eth(gw1->orig, gw2->orig))
+ return false;
+
+ if (gw1->vid != gw2->vid)
+ return false;
+
+ return true;
+}
+
+/**
+ * batadv_compare_claim() - compare address and vid of two claims
+ * @node: list node of the first entry to compare
+ * @data2: pointer to the second claims
+ *
+ * Return: true if the claim have the same data, 0 otherwise
+ */
+static bool batadv_compare_claim(const struct hlist_node *node,
+ const void *data2)
+{
+ const void *data1 = container_of(node, struct batadv_bla_claim,
+ hash_entry);
+ const struct batadv_bla_claim *cl1 = data1;
+ const struct batadv_bla_claim *cl2 = data2;
+
+ if (!batadv_compare_eth(cl1->addr, cl2->addr))
+ return false;
+
+ if (cl1->vid != cl2->vid)
+ return false;
+
+ return true;
+}
+
+/**
+ * batadv_backbone_gw_release() - release backbone gw from lists and queue for
+ * free after rcu grace period
+ * @ref: kref pointer of the backbone gw
+ */
+static void batadv_backbone_gw_release(struct kref *ref)
+{
+ struct batadv_bla_backbone_gw *backbone_gw;
+
+ backbone_gw = container_of(ref, struct batadv_bla_backbone_gw,
+ refcount);
+
+ kfree_rcu(backbone_gw, rcu);
+}
+
+/**
+ * batadv_backbone_gw_put() - decrement the backbone gw refcounter and possibly
+ * release it
+ * @backbone_gw: backbone gateway to be free'd
+ */
+static void batadv_backbone_gw_put(struct batadv_bla_backbone_gw *backbone_gw)
+{
+ if (!backbone_gw)
+ return;
+
+ kref_put(&backbone_gw->refcount, batadv_backbone_gw_release);
+}
+
+/**
+ * batadv_claim_release() - release claim from lists and queue for free after
+ * rcu grace period
+ * @ref: kref pointer of the claim
+ */
+static void batadv_claim_release(struct kref *ref)
+{
+ struct batadv_bla_claim *claim;
+ struct batadv_bla_backbone_gw *old_backbone_gw;
+
+ claim = container_of(ref, struct batadv_bla_claim, refcount);
+
+ spin_lock_bh(&claim->backbone_lock);
+ old_backbone_gw = claim->backbone_gw;
+ claim->backbone_gw = NULL;
+ spin_unlock_bh(&claim->backbone_lock);
+
+ spin_lock_bh(&old_backbone_gw->crc_lock);
+ old_backbone_gw->crc ^= crc16(0, claim->addr, ETH_ALEN);
+ spin_unlock_bh(&old_backbone_gw->crc_lock);
+
+ batadv_backbone_gw_put(old_backbone_gw);
+
+ kfree_rcu(claim, rcu);
+}
+
+/**
+ * batadv_claim_put() - decrement the claim refcounter and possibly release it
+ * @claim: claim to be free'd
+ */
+static void batadv_claim_put(struct batadv_bla_claim *claim)
+{
+ if (!claim)
+ return;
+
+ kref_put(&claim->refcount, batadv_claim_release);
+}
+
+/**
+ * batadv_claim_hash_find() - looks for a claim in the claim hash
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @data: search data (may be local/static data)
+ *
+ * Return: claim if found or NULL otherwise.
+ */
+static struct batadv_bla_claim *
+batadv_claim_hash_find(struct batadv_priv *bat_priv,
+ struct batadv_bla_claim *data)
+{
+ struct batadv_hashtable *hash = bat_priv->bla.claim_hash;
+ struct hlist_head *head;
+ struct batadv_bla_claim *claim;
+ struct batadv_bla_claim *claim_tmp = NULL;
+ int index;
+
+ if (!hash)
+ return NULL;
+
+ index = batadv_choose_claim(data, hash->size);
+ head = &hash->table[index];
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(claim, head, hash_entry) {
+ if (!batadv_compare_claim(&claim->hash_entry, data))
+ continue;
+
+ if (!kref_get_unless_zero(&claim->refcount))
+ continue;
+
+ claim_tmp = claim;
+ break;
+ }
+ rcu_read_unlock();
+
+ return claim_tmp;
+}
+
+/**
+ * batadv_backbone_hash_find() - looks for a backbone gateway in the hash
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @addr: the address of the originator
+ * @vid: the VLAN ID
+ *
+ * Return: backbone gateway if found or NULL otherwise
+ */
+static struct batadv_bla_backbone_gw *
+batadv_backbone_hash_find(struct batadv_priv *bat_priv, const u8 *addr,
+ unsigned short vid)
+{
+ struct batadv_hashtable *hash = bat_priv->bla.backbone_hash;
+ struct hlist_head *head;
+ struct batadv_bla_backbone_gw search_entry, *backbone_gw;
+ struct batadv_bla_backbone_gw *backbone_gw_tmp = NULL;
+ int index;
+
+ if (!hash)
+ return NULL;
+
+ ether_addr_copy(search_entry.orig, addr);
+ search_entry.vid = vid;
+
+ index = batadv_choose_backbone_gw(&search_entry, hash->size);
+ head = &hash->table[index];
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(backbone_gw, head, hash_entry) {
+ if (!batadv_compare_backbone_gw(&backbone_gw->hash_entry,
+ &search_entry))
+ continue;
+
+ if (!kref_get_unless_zero(&backbone_gw->refcount))
+ continue;
+
+ backbone_gw_tmp = backbone_gw;
+ break;
+ }
+ rcu_read_unlock();
+
+ return backbone_gw_tmp;
+}
+
+/**
+ * batadv_bla_del_backbone_claims() - delete all claims for a backbone
+ * @backbone_gw: backbone gateway where the claims should be removed
+ */
+static void
+batadv_bla_del_backbone_claims(struct batadv_bla_backbone_gw *backbone_gw)
+{
+ struct batadv_hashtable *hash;
+ struct hlist_node *node_tmp;
+ struct hlist_head *head;
+ struct batadv_bla_claim *claim;
+ int i;
+ spinlock_t *list_lock; /* protects write access to the hash lists */
+
+ hash = backbone_gw->bat_priv->bla.claim_hash;
+ if (!hash)
+ return;
+
+ for (i = 0; i < hash->size; i++) {
+ head = &hash->table[i];
+ list_lock = &hash->list_locks[i];
+
+ spin_lock_bh(list_lock);
+ hlist_for_each_entry_safe(claim, node_tmp,
+ head, hash_entry) {
+ if (claim->backbone_gw != backbone_gw)
+ continue;
+
+ batadv_claim_put(claim);
+ hlist_del_rcu(&claim->hash_entry);
+ }
+ spin_unlock_bh(list_lock);
+ }
+
+ /* all claims gone, initialize CRC */
+ spin_lock_bh(&backbone_gw->crc_lock);
+ backbone_gw->crc = BATADV_BLA_CRC_INIT;
+ spin_unlock_bh(&backbone_gw->crc_lock);
+}
+
+/**
+ * batadv_bla_send_claim() - sends a claim frame according to the provided info
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @mac: the mac address to be announced within the claim
+ * @vid: the VLAN ID
+ * @claimtype: the type of the claim (CLAIM, UNCLAIM, ANNOUNCE, ...)
+ */
+static void batadv_bla_send_claim(struct batadv_priv *bat_priv, const u8 *mac,
+ unsigned short vid, int claimtype)
+{
+ struct sk_buff *skb;
+ struct ethhdr *ethhdr;
+ struct batadv_hard_iface *primary_if;
+ struct net_device *mesh_iface;
+ u8 *hw_src;
+ struct batadv_bla_claim_dst local_claim_dest;
+ __be32 zeroip = 0;
+
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (!primary_if)
+ return;
+
+ memcpy(&local_claim_dest, &bat_priv->bla.claim_dest,
+ sizeof(local_claim_dest));
+ local_claim_dest.type = claimtype;
+
+ mesh_iface = primary_if->mesh_iface;
+
+ skb = arp_create(ARPOP_REPLY, ETH_P_ARP,
+ /* IP DST: 0.0.0.0 */
+ zeroip,
+ primary_if->mesh_iface,
+ /* IP SRC: 0.0.0.0 */
+ zeroip,
+ /* Ethernet DST: Broadcast */
+ NULL,
+ /* Ethernet SRC/HW SRC: originator mac */
+ primary_if->net_dev->dev_addr,
+ /* HW DST: FF:43:05:XX:YY:YY
+ * with XX = claim type
+ * and YY:YY = group id
+ */
+ (u8 *)&local_claim_dest);
+
+ if (!skb)
+ goto out;
+
+ ethhdr = (struct ethhdr *)skb->data;
+ hw_src = (u8 *)ethhdr + ETH_HLEN + sizeof(struct arphdr);
+
+ /* now we pretend that the client would have sent this ... */
+ switch (claimtype) {
+ case BATADV_CLAIM_TYPE_CLAIM:
+ /* normal claim frame
+ * set Ethernet SRC to the clients mac
+ */
+ ether_addr_copy(ethhdr->h_source, mac);
+ batadv_dbg(BATADV_DBG_BLA, bat_priv,
+ "%s(): CLAIM %pM on vid %d\n", __func__, mac,
+ batadv_print_vid(vid));
+ break;
+ case BATADV_CLAIM_TYPE_UNCLAIM:
+ /* unclaim frame
+ * set HW SRC to the clients mac
+ */
+ ether_addr_copy(hw_src, mac);
+ batadv_dbg(BATADV_DBG_BLA, bat_priv,
+ "%s(): UNCLAIM %pM on vid %d\n", __func__, mac,
+ batadv_print_vid(vid));
+ break;
+ case BATADV_CLAIM_TYPE_ANNOUNCE:
+ /* announcement frame
+ * set HW SRC to the special mac containing the crc
+ */
+ ether_addr_copy(hw_src, mac);
+ batadv_dbg(BATADV_DBG_BLA, bat_priv,
+ "%s(): ANNOUNCE of %pM on vid %d\n", __func__,
+ ethhdr->h_source, batadv_print_vid(vid));
+ break;
+ case BATADV_CLAIM_TYPE_REQUEST:
+ /* request frame
+ * set HW SRC and header destination to the receiving backbone
+ * gws mac
+ */
+ ether_addr_copy(hw_src, mac);
+ ether_addr_copy(ethhdr->h_dest, mac);
+ batadv_dbg(BATADV_DBG_BLA, bat_priv,
+ "%s(): REQUEST of %pM to %pM on vid %d\n", __func__,
+ ethhdr->h_source, ethhdr->h_dest,
+ batadv_print_vid(vid));
+ break;
+ case BATADV_CLAIM_TYPE_LOOPDETECT:
+ ether_addr_copy(ethhdr->h_source, mac);
+ batadv_dbg(BATADV_DBG_BLA, bat_priv,
+ "%s(): LOOPDETECT of %pM to %pM on vid %d\n",
+ __func__, ethhdr->h_source, ethhdr->h_dest,
+ batadv_print_vid(vid));
+
+ break;
+ }
+
+ if (vid & BATADV_VLAN_HAS_TAG) {
+ skb = vlan_insert_tag(skb, htons(ETH_P_8021Q),
+ vid & VLAN_VID_MASK);
+ if (!skb)
+ goto out;
+ }
+
+ skb_reset_mac_header(skb);
+ skb->protocol = eth_type_trans(skb, mesh_iface);
+ batadv_inc_counter(bat_priv, BATADV_CNT_RX);
+ batadv_add_counter(bat_priv, BATADV_CNT_RX_BYTES,
+ skb->len + ETH_HLEN);
+
+ netif_rx(skb);
+out:
+ batadv_hardif_put(primary_if);
+}
+
+/**
+ * batadv_bla_loopdetect_report() - worker for reporting the loop
+ * @work: work queue item
+ *
+ * Throws an uevent, as the loopdetect check function can't do that itself
+ * since the kernel may sleep while throwing uevents.
+ */
+static void batadv_bla_loopdetect_report(struct work_struct *work)
+{
+ struct batadv_bla_backbone_gw *backbone_gw;
+ struct batadv_priv *bat_priv;
+ char vid_str[6] = { '\0' };
+
+ backbone_gw = container_of(work, struct batadv_bla_backbone_gw,
+ report_work);
+ bat_priv = backbone_gw->bat_priv;
+
+ batadv_info(bat_priv->mesh_iface,
+ "Possible loop on VLAN %d detected which can't be handled by BLA - please check your network setup!\n",
+ batadv_print_vid(backbone_gw->vid));
+ snprintf(vid_str, sizeof(vid_str), "%d",
+ batadv_print_vid(backbone_gw->vid));
+ vid_str[sizeof(vid_str) - 1] = 0;
+
+ batadv_throw_uevent(bat_priv, BATADV_UEV_BLA, BATADV_UEV_LOOPDETECT,
+ vid_str);
+
+ batadv_backbone_gw_put(backbone_gw);
+}
+
+/**
+ * batadv_bla_get_backbone_gw() - finds or creates a backbone gateway
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @orig: the mac address of the originator
+ * @vid: the VLAN ID
+ * @own_backbone: set if the requested backbone is local
+ *
+ * Return: the (possibly created) backbone gateway or NULL on error
+ */
+static struct batadv_bla_backbone_gw *
+batadv_bla_get_backbone_gw(struct batadv_priv *bat_priv, const u8 *orig,
+ unsigned short vid, bool own_backbone)
+{
+ struct batadv_bla_backbone_gw *entry;
+ struct batadv_orig_node *orig_node;
+ int hash_added;
+
+ entry = batadv_backbone_hash_find(bat_priv, orig, vid);
+
+ if (entry)
+ return entry;
+
+ batadv_dbg(BATADV_DBG_BLA, bat_priv,
+ "%s(): not found (%pM, %d), creating new entry\n", __func__,
+ orig, batadv_print_vid(vid));
+
+ entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+ if (!entry)
+ return NULL;
+
+ entry->vid = vid;
+ entry->lasttime = jiffies;
+ entry->crc = BATADV_BLA_CRC_INIT;
+ entry->bat_priv = bat_priv;
+ spin_lock_init(&entry->crc_lock);
+ atomic_set(&entry->request_sent, 0);
+ atomic_set(&entry->wait_periods, 0);
+ ether_addr_copy(entry->orig, orig);
+ INIT_WORK(&entry->report_work, batadv_bla_loopdetect_report);
+ kref_init(&entry->refcount);
+
+ kref_get(&entry->refcount);
+ hash_added = batadv_hash_add(bat_priv->bla.backbone_hash,
+ batadv_compare_backbone_gw,
+ batadv_choose_backbone_gw, entry,
+ &entry->hash_entry);
+
+ if (unlikely(hash_added != 0)) {
+ /* hash failed, free the structure */
+ kfree(entry);
+ return NULL;
+ }
+
+ /* this is a gateway now, remove any TT entry on this VLAN */
+ orig_node = batadv_orig_hash_find(bat_priv, orig);
+ if (orig_node) {
+ batadv_tt_global_del_orig(bat_priv, orig_node, vid,
+ "became a backbone gateway");
+ batadv_orig_node_put(orig_node);
+ }
+
+ if (own_backbone) {
+ batadv_bla_send_announce(bat_priv, entry);
+
+ /* this will be decreased in the worker thread */
+ atomic_inc(&entry->request_sent);
+ atomic_set(&entry->wait_periods, BATADV_BLA_WAIT_PERIODS);
+ atomic_inc(&bat_priv->bla.num_requests);
+ }
+
+ return entry;
+}
+
+/**
+ * batadv_bla_update_own_backbone_gw() - updates the own backbone gw for a VLAN
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @primary_if: the selected primary interface
+ * @vid: VLAN identifier
+ *
+ * update or add the own backbone gw to make sure we announce
+ * where we receive other backbone gws
+ */
+static void
+batadv_bla_update_own_backbone_gw(struct batadv_priv *bat_priv,
+ struct batadv_hard_iface *primary_if,
+ unsigned short vid)
+{
+ struct batadv_bla_backbone_gw *backbone_gw;
+
+ backbone_gw = batadv_bla_get_backbone_gw(bat_priv,
+ primary_if->net_dev->dev_addr,
+ vid, true);
+ if (unlikely(!backbone_gw))
+ return;
+
+ backbone_gw->lasttime = jiffies;
+ batadv_backbone_gw_put(backbone_gw);
+}
+
+/**
+ * batadv_bla_answer_request() - answer a bla request by sending own claims
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @primary_if: interface where the request came on
+ * @vid: the vid where the request came on
+ *
+ * Repeat all of our own claims, and finally send an ANNOUNCE frame
+ * to allow the requester another check if the CRC is correct now.
+ */
+static void batadv_bla_answer_request(struct batadv_priv *bat_priv,
+ struct batadv_hard_iface *primary_if,
+ unsigned short vid)
+{
+ struct hlist_head *head;
+ struct batadv_hashtable *hash;
+ struct batadv_bla_claim *claim;
+ struct batadv_bla_backbone_gw *backbone_gw;
+ int i;
+
+ batadv_dbg(BATADV_DBG_BLA, bat_priv,
+ "%s(): received a claim request, send all of our own claims again\n",
+ __func__);
+
+ backbone_gw = batadv_backbone_hash_find(bat_priv,
+ primary_if->net_dev->dev_addr,
+ vid);
+ if (!backbone_gw)
+ return;
+
+ hash = bat_priv->bla.claim_hash;
+ for (i = 0; i < hash->size; i++) {
+ head = &hash->table[i];
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(claim, head, hash_entry) {
+ /* only own claims are interesting */
+ if (claim->backbone_gw != backbone_gw)
+ continue;
+
+ batadv_bla_send_claim(bat_priv, claim->addr, claim->vid,
+ BATADV_CLAIM_TYPE_CLAIM);
+ }
+ rcu_read_unlock();
+ }
+
+ /* finally, send an announcement frame */
+ batadv_bla_send_announce(bat_priv, backbone_gw);
+ batadv_backbone_gw_put(backbone_gw);
+}
+
+/**
+ * batadv_bla_send_request() - send a request to repeat claims
+ * @backbone_gw: the backbone gateway from whom we are out of sync
+ *
+ * When the crc is wrong, ask the backbone gateway for a full table update.
+ * After the request, it will repeat all of his own claims and finally
+ * send an announcement claim with which we can check again.
+ */
+static void batadv_bla_send_request(struct batadv_bla_backbone_gw *backbone_gw)
+{
+ /* first, remove all old entries */
+ batadv_bla_del_backbone_claims(backbone_gw);
+
+ batadv_dbg(BATADV_DBG_BLA, backbone_gw->bat_priv,
+ "Sending REQUEST to %pM\n", backbone_gw->orig);
+
+ /* send request */
+ batadv_bla_send_claim(backbone_gw->bat_priv, backbone_gw->orig,
+ backbone_gw->vid, BATADV_CLAIM_TYPE_REQUEST);
+
+ /* no local broadcasts should be sent or received, for now. */
+ if (!atomic_read(&backbone_gw->request_sent)) {
+ atomic_inc(&backbone_gw->bat_priv->bla.num_requests);
+ atomic_set(&backbone_gw->request_sent, 1);
+ }
+}
+
+/**
+ * batadv_bla_send_announce() - Send an announcement frame
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @backbone_gw: our backbone gateway which should be announced
+ */
+static void batadv_bla_send_announce(struct batadv_priv *bat_priv,
+ struct batadv_bla_backbone_gw *backbone_gw)
+{
+ u8 mac[ETH_ALEN];
+ __be16 crc;
+
+ memcpy(mac, batadv_announce_mac, 4);
+ spin_lock_bh(&backbone_gw->crc_lock);
+ crc = htons(backbone_gw->crc);
+ spin_unlock_bh(&backbone_gw->crc_lock);
+ memcpy(&mac[4], &crc, 2);
+
+ batadv_bla_send_claim(bat_priv, mac, backbone_gw->vid,
+ BATADV_CLAIM_TYPE_ANNOUNCE);
+}
+
+/**
+ * batadv_bla_add_claim() - Adds a claim in the claim hash
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @mac: the mac address of the claim
+ * @vid: the VLAN ID of the frame
+ * @backbone_gw: the backbone gateway which claims it
+ */
+static void batadv_bla_add_claim(struct batadv_priv *bat_priv,
+ const u8 *mac, const unsigned short vid,
+ struct batadv_bla_backbone_gw *backbone_gw)
+{
+ struct batadv_bla_backbone_gw *old_backbone_gw;
+ struct batadv_bla_claim *claim;
+ struct batadv_bla_claim search_claim;
+ bool remove_crc = false;
+ int hash_added;
+
+ ether_addr_copy(search_claim.addr, mac);
+ search_claim.vid = vid;
+ claim = batadv_claim_hash_find(bat_priv, &search_claim);
+
+ /* create a new claim entry if it does not exist yet. */
+ if (!claim) {
+ claim = kzalloc(sizeof(*claim), GFP_ATOMIC);
+ if (!claim)
+ return;
+
+ ether_addr_copy(claim->addr, mac);
+ spin_lock_init(&claim->backbone_lock);
+ claim->vid = vid;
+ claim->lasttime = jiffies;
+ kref_get(&backbone_gw->refcount);
+ claim->backbone_gw = backbone_gw;
+ kref_init(&claim->refcount);
+
+ batadv_dbg(BATADV_DBG_BLA, bat_priv,
+ "%s(): adding new entry %pM, vid %d to hash ...\n",
+ __func__, mac, batadv_print_vid(vid));
+
+ kref_get(&claim->refcount);
+ hash_added = batadv_hash_add(bat_priv->bla.claim_hash,
+ batadv_compare_claim,
+ batadv_choose_claim, claim,
+ &claim->hash_entry);
+
+ if (unlikely(hash_added != 0)) {
+ /* only local changes happened. */
+ kfree(claim);
+ return;
+ }
+ } else {
+ claim->lasttime = jiffies;
+ if (claim->backbone_gw == backbone_gw)
+ /* no need to register a new backbone */
+ goto claim_free_ref;
+
+ batadv_dbg(BATADV_DBG_BLA, bat_priv,
+ "%s(): changing ownership for %pM, vid %d to gw %pM\n",
+ __func__, mac, batadv_print_vid(vid),
+ backbone_gw->orig);
+
+ remove_crc = true;
+ }
+
+ /* replace backbone_gw atomically and adjust reference counters */
+ spin_lock_bh(&claim->backbone_lock);
+ old_backbone_gw = claim->backbone_gw;
+ kref_get(&backbone_gw->refcount);
+ claim->backbone_gw = backbone_gw;
+ spin_unlock_bh(&claim->backbone_lock);
+
+ if (remove_crc) {
+ /* remove claim address from old backbone_gw */
+ spin_lock_bh(&old_backbone_gw->crc_lock);
+ old_backbone_gw->crc ^= crc16(0, claim->addr, ETH_ALEN);
+ spin_unlock_bh(&old_backbone_gw->crc_lock);
+ }
+
+ batadv_backbone_gw_put(old_backbone_gw);
+
+ /* add claim address to new backbone_gw */
+ spin_lock_bh(&backbone_gw->crc_lock);
+ backbone_gw->crc ^= crc16(0, claim->addr, ETH_ALEN);
+ spin_unlock_bh(&backbone_gw->crc_lock);
+ backbone_gw->lasttime = jiffies;
+
+claim_free_ref:
+ batadv_claim_put(claim);
+}
+
+/**
+ * batadv_bla_claim_get_backbone_gw() - Get valid reference for backbone_gw of
+ * claim
+ * @claim: claim whose backbone_gw should be returned
+ *
+ * Return: valid reference to claim::backbone_gw
+ */
+static struct batadv_bla_backbone_gw *
+batadv_bla_claim_get_backbone_gw(struct batadv_bla_claim *claim)
+{
+ struct batadv_bla_backbone_gw *backbone_gw;
+
+ spin_lock_bh(&claim->backbone_lock);
+ backbone_gw = claim->backbone_gw;
+ kref_get(&backbone_gw->refcount);
+ spin_unlock_bh(&claim->backbone_lock);
+
+ return backbone_gw;
+}
+
+/**
+ * batadv_bla_del_claim() - delete a claim from the claim hash
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @mac: mac address of the claim to be removed
+ * @vid: VLAN id for the claim to be removed
+ */
+static void batadv_bla_del_claim(struct batadv_priv *bat_priv,
+ const u8 *mac, const unsigned short vid)
+{
+ struct batadv_bla_claim search_claim, *claim;
+ struct batadv_bla_claim *claim_removed_entry;
+ struct hlist_node *claim_removed_node;
+
+ ether_addr_copy(search_claim.addr, mac);
+ search_claim.vid = vid;
+ claim = batadv_claim_hash_find(bat_priv, &search_claim);
+ if (!claim)
+ return;
+
+ batadv_dbg(BATADV_DBG_BLA, bat_priv, "%s(): %pM, vid %d\n", __func__,
+ mac, batadv_print_vid(vid));
+
+ claim_removed_node = batadv_hash_remove(bat_priv->bla.claim_hash,
+ batadv_compare_claim,
+ batadv_choose_claim, claim);
+ if (!claim_removed_node)
+ goto free_claim;
+
+ /* reference from the hash is gone */
+ claim_removed_entry = hlist_entry(claim_removed_node,
+ struct batadv_bla_claim, hash_entry);
+ batadv_claim_put(claim_removed_entry);
+
+free_claim:
+ /* don't need the reference from hash_find() anymore */
+ batadv_claim_put(claim);
+}
+
+/**
+ * batadv_handle_announce() - check for ANNOUNCE frame
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @an_addr: announcement mac address (ARP Sender HW address)
+ * @backbone_addr: originator address of the sender (Ethernet source MAC)
+ * @vid: the VLAN ID of the frame
+ *
+ * Return: true if handled
+ */
+static bool batadv_handle_announce(struct batadv_priv *bat_priv, u8 *an_addr,
+ u8 *backbone_addr, unsigned short vid)
+{
+ struct batadv_bla_backbone_gw *backbone_gw;
+ u16 backbone_crc, crc;
+
+ if (memcmp(an_addr, batadv_announce_mac, 4) != 0)
+ return false;
+
+ backbone_gw = batadv_bla_get_backbone_gw(bat_priv, backbone_addr, vid,
+ false);
+
+ if (unlikely(!backbone_gw))
+ return true;
+
+ /* handle as ANNOUNCE frame */
+ backbone_gw->lasttime = jiffies;
+ crc = ntohs(*((__force __be16 *)(&an_addr[4])));
+
+ batadv_dbg(BATADV_DBG_BLA, bat_priv,
+ "%s(): ANNOUNCE vid %d (sent by %pM)... CRC = %#.4x\n",
+ __func__, batadv_print_vid(vid), backbone_gw->orig, crc);
+
+ spin_lock_bh(&backbone_gw->crc_lock);
+ backbone_crc = backbone_gw->crc;
+ spin_unlock_bh(&backbone_gw->crc_lock);
+
+ if (backbone_crc != crc) {
+ batadv_dbg(BATADV_DBG_BLA, backbone_gw->bat_priv,
+ "%s(): CRC FAILED for %pM/%d (my = %#.4x, sent = %#.4x)\n",
+ __func__, backbone_gw->orig,
+ batadv_print_vid(backbone_gw->vid),
+ backbone_crc, crc);
+
+ batadv_bla_send_request(backbone_gw);
+ } else {
+ /* if we have sent a request and the crc was OK,
+ * we can allow traffic again.
+ */
+ if (atomic_read(&backbone_gw->request_sent)) {
+ atomic_dec(&backbone_gw->bat_priv->bla.num_requests);
+ atomic_set(&backbone_gw->request_sent, 0);
+ }
+ }
+
+ batadv_backbone_gw_put(backbone_gw);
+ return true;
+}
+
+/**
+ * batadv_handle_request() - check for REQUEST frame
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @primary_if: the primary hard interface of this batman mesh interface
+ * @backbone_addr: backbone address to be requested (ARP sender HW MAC)
+ * @ethhdr: ethernet header of a packet
+ * @vid: the VLAN ID of the frame
+ *
+ * Return: true if handled
+ */
+static bool batadv_handle_request(struct batadv_priv *bat_priv,
+ struct batadv_hard_iface *primary_if,
+ u8 *backbone_addr, struct ethhdr *ethhdr,
+ unsigned short vid)
+{
+ /* check for REQUEST frame */
+ if (!batadv_compare_eth(backbone_addr, ethhdr->h_dest))
+ return false;
+
+ /* sanity check, this should not happen on a normal switch,
+ * we ignore it in this case.
+ */
+ if (!batadv_compare_eth(ethhdr->h_dest, primary_if->net_dev->dev_addr))
+ return true;
+
+ batadv_dbg(BATADV_DBG_BLA, bat_priv,
+ "%s(): REQUEST vid %d (sent by %pM)...\n",
+ __func__, batadv_print_vid(vid), ethhdr->h_source);
+
+ batadv_bla_answer_request(bat_priv, primary_if, vid);
+ return true;
+}
+
+/**
+ * batadv_handle_unclaim() - check for UNCLAIM frame
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @primary_if: the primary hard interface of this batman mesh interface
+ * @backbone_addr: originator address of the backbone (Ethernet source)
+ * @claim_addr: Client to be unclaimed (ARP sender HW MAC)
+ * @vid: the VLAN ID of the frame
+ *
+ * Return: true if handled
+ */
+static bool batadv_handle_unclaim(struct batadv_priv *bat_priv,
+ struct batadv_hard_iface *primary_if,
+ const u8 *backbone_addr, const u8 *claim_addr,
+ unsigned short vid)
+{
+ struct batadv_bla_backbone_gw *backbone_gw;
+
+ /* unclaim in any case if it is our own */
+ if (primary_if && batadv_compare_eth(backbone_addr,
+ primary_if->net_dev->dev_addr))
+ batadv_bla_send_claim(bat_priv, claim_addr, vid,
+ BATADV_CLAIM_TYPE_UNCLAIM);
+
+ backbone_gw = batadv_backbone_hash_find(bat_priv, backbone_addr, vid);
+
+ if (!backbone_gw)
+ return true;
+
+ /* this must be an UNCLAIM frame */
+ batadv_dbg(BATADV_DBG_BLA, bat_priv,
+ "%s(): UNCLAIM %pM on vid %d (sent by %pM)...\n", __func__,
+ claim_addr, batadv_print_vid(vid), backbone_gw->orig);
+
+ batadv_bla_del_claim(bat_priv, claim_addr, vid);
+ batadv_backbone_gw_put(backbone_gw);
+ return true;
+}
+
+/**
+ * batadv_handle_claim() - check for CLAIM frame
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @primary_if: the primary hard interface of this batman mesh interface
+ * @backbone_addr: originator address of the backbone (Ethernet Source)
+ * @claim_addr: client mac address to be claimed (ARP sender HW MAC)
+ * @vid: the VLAN ID of the frame
+ *
+ * Return: true if handled
+ */
+static bool batadv_handle_claim(struct batadv_priv *bat_priv,
+ struct batadv_hard_iface *primary_if,
+ const u8 *backbone_addr, const u8 *claim_addr,
+ unsigned short vid)
+{
+ struct batadv_bla_backbone_gw *backbone_gw;
+
+ /* register the gateway if not yet available, and add the claim. */
+
+ backbone_gw = batadv_bla_get_backbone_gw(bat_priv, backbone_addr, vid,
+ false);
+
+ if (unlikely(!backbone_gw))
+ return true;
+
+ /* this must be a CLAIM frame */
+ batadv_bla_add_claim(bat_priv, claim_addr, vid, backbone_gw);
+ if (batadv_compare_eth(backbone_addr, primary_if->net_dev->dev_addr))
+ batadv_bla_send_claim(bat_priv, claim_addr, vid,
+ BATADV_CLAIM_TYPE_CLAIM);
+
+ /* TODO: we could call something like tt_local_del() here. */
+
+ batadv_backbone_gw_put(backbone_gw);
+ return true;
+}
+
+/**
+ * batadv_check_claim_group() - check for claim group membership
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @primary_if: the primary interface of this batman interface
+ * @hw_src: the Hardware source in the ARP Header
+ * @hw_dst: the Hardware destination in the ARP Header
+ * @ethhdr: pointer to the Ethernet header of the claim frame
+ *
+ * checks if it is a claim packet and if it's on the same group.
+ * This function also applies the group ID of the sender
+ * if it is in the same mesh.
+ *
+ * Return:
+ * 2 - if it is a claim packet and on the same group
+ * 1 - if is a claim packet from another group
+ * 0 - if it is not a claim packet
+ */
+static int batadv_check_claim_group(struct batadv_priv *bat_priv,
+ struct batadv_hard_iface *primary_if,
+ u8 *hw_src, u8 *hw_dst,
+ struct ethhdr *ethhdr)
+{
+ u8 *backbone_addr;
+ struct batadv_orig_node *orig_node;
+ struct batadv_bla_claim_dst *bla_dst, *bla_dst_own;
+
+ bla_dst = (struct batadv_bla_claim_dst *)hw_dst;
+ bla_dst_own = &bat_priv->bla.claim_dest;
+
+ /* if announcement packet, use the source,
+ * otherwise assume it is in the hw_src
+ */
+ switch (bla_dst->type) {
+ case BATADV_CLAIM_TYPE_CLAIM:
+ backbone_addr = hw_src;
+ break;
+ case BATADV_CLAIM_TYPE_REQUEST:
+ case BATADV_CLAIM_TYPE_ANNOUNCE:
+ case BATADV_CLAIM_TYPE_UNCLAIM:
+ backbone_addr = ethhdr->h_source;
+ break;
+ default:
+ return 0;
+ }
+
+ /* don't accept claim frames from ourselves */
+ if (batadv_compare_eth(backbone_addr, primary_if->net_dev->dev_addr))
+ return 0;
+
+ /* if its already the same group, it is fine. */
+ if (bla_dst->group == bla_dst_own->group)
+ return 2;
+
+ /* lets see if this originator is in our mesh */
+ orig_node = batadv_orig_hash_find(bat_priv, backbone_addr);
+
+ /* don't accept claims from gateways which are not in
+ * the same mesh or group.
+ */
+ if (!orig_node)
+ return 1;
+
+ /* if our mesh friends mac is bigger, use it for ourselves. */
+ if (ntohs(bla_dst->group) > ntohs(bla_dst_own->group)) {
+ batadv_dbg(BATADV_DBG_BLA, bat_priv,
+ "taking other backbones claim group: %#.4x\n",
+ ntohs(bla_dst->group));
+ bla_dst_own->group = bla_dst->group;
+ }
+
+ batadv_orig_node_put(orig_node);
+
+ return 2;
+}
+
+/**
+ * batadv_bla_process_claim() - Check if this is a claim frame, and process it
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @primary_if: the primary hard interface of this batman mesh interface
+ * @skb: the frame to be checked
+ *
+ * Return: true if it was a claim frame, otherwise return false to
+ * tell the callee that it can use the frame on its own.
+ */
+static bool batadv_bla_process_claim(struct batadv_priv *bat_priv,
+ struct batadv_hard_iface *primary_if,
+ struct sk_buff *skb)
+{
+ struct batadv_bla_claim_dst *bla_dst, *bla_dst_own;
+ u8 *hw_src, *hw_dst;
+ struct vlan_hdr *vhdr, vhdr_buf;
+ struct ethhdr *ethhdr;
+ struct arphdr *arphdr;
+ unsigned short vid;
+ int vlan_depth = 0;
+ __be16 proto;
+ int headlen;
+ int ret;
+
+ vid = batadv_get_vid(skb, 0);
+ ethhdr = eth_hdr(skb);
+
+ proto = ethhdr->h_proto;
+ headlen = ETH_HLEN;
+ if (vid & BATADV_VLAN_HAS_TAG) {
+ /* Traverse the VLAN/Ethertypes.
+ *
+ * At this point it is known that the first protocol is a VLAN
+ * header, so start checking at the encapsulated protocol.
+ *
+ * The depth of the VLAN headers is recorded to drop BLA claim
+ * frames encapsulated into multiple VLAN headers (QinQ).
+ */
+ do {
+ vhdr = skb_header_pointer(skb, headlen, VLAN_HLEN,
+ &vhdr_buf);
+ if (!vhdr)
+ return false;
+
+ proto = vhdr->h_vlan_encapsulated_proto;
+ headlen += VLAN_HLEN;
+ vlan_depth++;
+ } while (proto == htons(ETH_P_8021Q));
+ }
+
+ if (proto != htons(ETH_P_ARP))
+ return false; /* not a claim frame */
+
+ /* this must be a ARP frame. check if it is a claim. */
+
+ if (unlikely(!pskb_may_pull(skb, headlen + arp_hdr_len(skb->dev))))
+ return false;
+
+ /* pskb_may_pull() may have modified the pointers, get ethhdr again */
+ ethhdr = eth_hdr(skb);
+ arphdr = (struct arphdr *)((u8 *)ethhdr + headlen);
+
+ /* Check whether the ARP frame carries a valid
+ * IP information
+ */
+ if (arphdr->ar_hrd != htons(ARPHRD_ETHER))
+ return false;
+ if (arphdr->ar_pro != htons(ETH_P_IP))
+ return false;
+ if (arphdr->ar_hln != ETH_ALEN)
+ return false;
+ if (arphdr->ar_pln != 4)
+ return false;
+
+ hw_src = (u8 *)arphdr + sizeof(struct arphdr);
+ hw_dst = hw_src + ETH_ALEN + 4;
+ bla_dst = (struct batadv_bla_claim_dst *)hw_dst;
+ bla_dst_own = &bat_priv->bla.claim_dest;
+
+ /* check if it is a claim frame in general */
+ if (memcmp(bla_dst->magic, bla_dst_own->magic,
+ sizeof(bla_dst->magic)) != 0)
+ return false;
+
+ /* check if there is a claim frame encapsulated deeper in (QinQ) and
+ * drop that, as this is not supported by BLA but should also not be
+ * sent via the mesh.
+ */
+ if (vlan_depth > 1)
+ return true;
+
+ /* Let the loopdetect frames on the mesh in any case. */
+ if (bla_dst->type == BATADV_CLAIM_TYPE_LOOPDETECT)
+ return false;
+
+ /* check if it is a claim frame. */
+ ret = batadv_check_claim_group(bat_priv, primary_if, hw_src, hw_dst,
+ ethhdr);
+ if (ret == 1)
+ batadv_dbg(BATADV_DBG_BLA, bat_priv,
+ "%s(): received a claim frame from another group. From: %pM on vid %d ...(hw_src %pM, hw_dst %pM)\n",
+ __func__, ethhdr->h_source, batadv_print_vid(vid),
+ hw_src, hw_dst);
+
+ if (ret < 2)
+ return !!ret;
+
+ /* become a backbone gw ourselves on this vlan if not happened yet */
+ batadv_bla_update_own_backbone_gw(bat_priv, primary_if, vid);
+
+ /* check for the different types of claim frames ... */
+ switch (bla_dst->type) {
+ case BATADV_CLAIM_TYPE_CLAIM:
+ if (batadv_handle_claim(bat_priv, primary_if, hw_src,
+ ethhdr->h_source, vid))
+ return true;
+ break;
+ case BATADV_CLAIM_TYPE_UNCLAIM:
+ if (batadv_handle_unclaim(bat_priv, primary_if,
+ ethhdr->h_source, hw_src, vid))
+ return true;
+ break;
+
+ case BATADV_CLAIM_TYPE_ANNOUNCE:
+ if (batadv_handle_announce(bat_priv, hw_src, ethhdr->h_source,
+ vid))
+ return true;
+ break;
+ case BATADV_CLAIM_TYPE_REQUEST:
+ if (batadv_handle_request(bat_priv, primary_if, hw_src, ethhdr,
+ vid))
+ return true;
+ break;
+ }
+
+ batadv_dbg(BATADV_DBG_BLA, bat_priv,
+ "%s(): ERROR - this looks like a claim frame, but is useless. eth src %pM on vid %d ...(hw_src %pM, hw_dst %pM)\n",
+ __func__, ethhdr->h_source, batadv_print_vid(vid), hw_src,
+ hw_dst);
+ return true;
+}
+
+/**
+ * batadv_bla_purge_backbone_gw() - Remove backbone gateways after a timeout or
+ * immediately
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @now: whether the whole hash shall be wiped now
+ *
+ * Check when we last heard from other nodes, and remove them in case of
+ * a time out, or clean all backbone gws if now is set.
+ */
+static void batadv_bla_purge_backbone_gw(struct batadv_priv *bat_priv, int now)
+{
+ struct batadv_bla_backbone_gw *backbone_gw;
+ struct hlist_node *node_tmp;
+ struct hlist_head *head;
+ struct batadv_hashtable *hash;
+ spinlock_t *list_lock; /* protects write access to the hash lists */
+ int i;
+
+ hash = bat_priv->bla.backbone_hash;
+ if (!hash)
+ return;
+
+ for (i = 0; i < hash->size; i++) {
+ head = &hash->table[i];
+ list_lock = &hash->list_locks[i];
+
+ spin_lock_bh(list_lock);
+ hlist_for_each_entry_safe(backbone_gw, node_tmp,
+ head, hash_entry) {
+ if (now)
+ goto purge_now;
+ if (!batadv_has_timed_out(backbone_gw->lasttime,
+ BATADV_BLA_BACKBONE_TIMEOUT))
+ continue;
+
+ batadv_dbg(BATADV_DBG_BLA, backbone_gw->bat_priv,
+ "%s(): backbone gw %pM timed out\n",
+ __func__, backbone_gw->orig);
+
+purge_now:
+ /* don't wait for the pending request anymore */
+ if (atomic_read(&backbone_gw->request_sent))
+ atomic_dec(&bat_priv->bla.num_requests);
+
+ batadv_bla_del_backbone_claims(backbone_gw);
+
+ hlist_del_rcu(&backbone_gw->hash_entry);
+ batadv_backbone_gw_put(backbone_gw);
+ }
+ spin_unlock_bh(list_lock);
+ }
+}
+
+/**
+ * batadv_bla_purge_claims() - Remove claims after a timeout or immediately
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @primary_if: the selected primary interface, may be NULL if now is set
+ * @now: whether the whole hash shall be wiped now
+ *
+ * Check when we heard last time from our own claims, and remove them in case of
+ * a time out, or clean all claims if now is set
+ */
+static void batadv_bla_purge_claims(struct batadv_priv *bat_priv,
+ struct batadv_hard_iface *primary_if,
+ int now)
+{
+ struct batadv_bla_backbone_gw *backbone_gw;
+ struct batadv_bla_claim *claim;
+ struct hlist_head *head;
+ struct batadv_hashtable *hash;
+ int i;
+
+ hash = bat_priv->bla.claim_hash;
+ if (!hash)
+ return;
+
+ for (i = 0; i < hash->size; i++) {
+ head = &hash->table[i];
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(claim, head, hash_entry) {
+ backbone_gw = batadv_bla_claim_get_backbone_gw(claim);
+ if (now)
+ goto purge_now;
+
+ if (!batadv_compare_eth(backbone_gw->orig,
+ primary_if->net_dev->dev_addr))
+ goto skip;
+
+ if (!batadv_has_timed_out(claim->lasttime,
+ BATADV_BLA_CLAIM_TIMEOUT))
+ goto skip;
+
+ batadv_dbg(BATADV_DBG_BLA, bat_priv,
+ "%s(): timed out.\n", __func__);
+
+purge_now:
+ batadv_dbg(BATADV_DBG_BLA, bat_priv,
+ "%s(): %pM, vid %d\n", __func__,
+ claim->addr, claim->vid);
+
+ batadv_handle_unclaim(bat_priv, primary_if,
+ backbone_gw->orig,
+ claim->addr, claim->vid);
+skip:
+ batadv_backbone_gw_put(backbone_gw);
+ }
+ rcu_read_unlock();
+ }
+}
+
+/**
+ * batadv_bla_update_orig_address() - Update the backbone gateways when the own
+ * originator address changes
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @primary_if: the new selected primary_if
+ * @oldif: the old primary interface, may be NULL
+ */
+void batadv_bla_update_orig_address(struct batadv_priv *bat_priv,
+ struct batadv_hard_iface *primary_if,
+ struct batadv_hard_iface *oldif)
+{
+ struct batadv_bla_backbone_gw *backbone_gw;
+ struct hlist_head *head;
+ struct batadv_hashtable *hash;
+ __be16 group;
+ int i;
+
+ /* reset bridge loop avoidance group id */
+ group = htons(crc16(0, primary_if->net_dev->dev_addr, ETH_ALEN));
+ bat_priv->bla.claim_dest.group = group;
+
+ /* purge everything when bridge loop avoidance is turned off */
+ if (!atomic_read(&bat_priv->bridge_loop_avoidance))
+ oldif = NULL;
+
+ if (!oldif) {
+ batadv_bla_purge_claims(bat_priv, NULL, 1);
+ batadv_bla_purge_backbone_gw(bat_priv, 1);
+ return;
+ }
+
+ hash = bat_priv->bla.backbone_hash;
+ if (!hash)
+ return;
+
+ for (i = 0; i < hash->size; i++) {
+ head = &hash->table[i];
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(backbone_gw, head, hash_entry) {
+ /* own orig still holds the old value. */
+ if (!batadv_compare_eth(backbone_gw->orig,
+ oldif->net_dev->dev_addr))
+ continue;
+
+ ether_addr_copy(backbone_gw->orig,
+ primary_if->net_dev->dev_addr);
+ /* send an announce frame so others will ask for our
+ * claims and update their tables.
+ */
+ batadv_bla_send_announce(bat_priv, backbone_gw);
+ }
+ rcu_read_unlock();
+ }
+}
+
+/**
+ * batadv_bla_send_loopdetect() - send a loopdetect frame
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @backbone_gw: the backbone gateway for which a loop should be detected
+ *
+ * To detect loops that the bridge loop avoidance can't handle, send a loop
+ * detection packet on the backbone. Unlike other BLA frames, this frame will
+ * be allowed on the mesh by other nodes. If it is received on the mesh, this
+ * indicates that there is a loop.
+ */
+static void
+batadv_bla_send_loopdetect(struct batadv_priv *bat_priv,
+ struct batadv_bla_backbone_gw *backbone_gw)
+{
+ batadv_dbg(BATADV_DBG_BLA, bat_priv, "Send loopdetect frame for vid %d\n",
+ backbone_gw->vid);
+ batadv_bla_send_claim(bat_priv, bat_priv->bla.loopdetect_addr,
+ backbone_gw->vid, BATADV_CLAIM_TYPE_LOOPDETECT);
+}
+
+/**
+ * batadv_bla_status_update() - purge bla interfaces if necessary
+ * @net_dev: the mesh interface net device
+ */
+void batadv_bla_status_update(struct net_device *net_dev)
+{
+ struct batadv_priv *bat_priv = netdev_priv(net_dev);
+ struct batadv_hard_iface *primary_if;
+
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (!primary_if)
+ return;
+
+ /* this function already purges everything when bla is disabled,
+ * so just call that one.
+ */
+ batadv_bla_update_orig_address(bat_priv, primary_if, primary_if);
+ batadv_hardif_put(primary_if);
+}
+
+/**
+ * batadv_bla_periodic_work() - performs periodic bla work
+ * @work: kernel work struct
+ *
+ * periodic work to do:
+ * * purge structures when they are too old
+ * * send announcements
+ */
+static void batadv_bla_periodic_work(struct work_struct *work)
+{
+ struct delayed_work *delayed_work;
+ struct batadv_priv *bat_priv;
+ struct batadv_priv_bla *priv_bla;
+ struct hlist_head *head;
+ struct batadv_bla_backbone_gw *backbone_gw;
+ struct batadv_hashtable *hash;
+ struct batadv_hard_iface *primary_if;
+ bool send_loopdetect = false;
+ int i;
+
+ delayed_work = to_delayed_work(work);
+ priv_bla = container_of(delayed_work, struct batadv_priv_bla, work);
+ bat_priv = container_of(priv_bla, struct batadv_priv, bla);
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (!primary_if)
+ goto out;
+
+ batadv_bla_purge_claims(bat_priv, primary_if, 0);
+ batadv_bla_purge_backbone_gw(bat_priv, 0);
+
+ if (!atomic_read(&bat_priv->bridge_loop_avoidance))
+ goto out;
+
+ if (atomic_dec_and_test(&bat_priv->bla.loopdetect_next)) {
+ /* set a new random mac address for the next bridge loop
+ * detection frames. Set the locally administered bit to avoid
+ * collisions with users mac addresses.
+ */
+ eth_random_addr(bat_priv->bla.loopdetect_addr);
+ bat_priv->bla.loopdetect_addr[0] = 0xba;
+ bat_priv->bla.loopdetect_addr[1] = 0xbe;
+ bat_priv->bla.loopdetect_lasttime = jiffies;
+ atomic_set(&bat_priv->bla.loopdetect_next,
+ BATADV_BLA_LOOPDETECT_PERIODS);
+
+ /* mark for sending loop detect on all VLANs */
+ send_loopdetect = true;
+ }
+
+ hash = bat_priv->bla.backbone_hash;
+ if (!hash)
+ goto out;
+
+ for (i = 0; i < hash->size; i++) {
+ head = &hash->table[i];
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(backbone_gw, head, hash_entry) {
+ if (!batadv_compare_eth(backbone_gw->orig,
+ primary_if->net_dev->dev_addr))
+ continue;
+
+ backbone_gw->lasttime = jiffies;
+
+ batadv_bla_send_announce(bat_priv, backbone_gw);
+ if (send_loopdetect)
+ batadv_bla_send_loopdetect(bat_priv,
+ backbone_gw);
+
+ /* request_sent is only set after creation to avoid
+ * problems when we are not yet known as backbone gw
+ * in the backbone.
+ *
+ * We can reset this now after we waited some periods
+ * to give bridge forward delays and bla group forming
+ * some grace time.
+ */
+
+ if (atomic_read(&backbone_gw->request_sent) == 0)
+ continue;
+
+ if (!atomic_dec_and_test(&backbone_gw->wait_periods))
+ continue;
+
+ atomic_dec(&backbone_gw->bat_priv->bla.num_requests);
+ atomic_set(&backbone_gw->request_sent, 0);
+ }
+ rcu_read_unlock();
+ }
+out:
+ batadv_hardif_put(primary_if);
+
+ queue_delayed_work(batadv_event_workqueue, &bat_priv->bla.work,
+ msecs_to_jiffies(BATADV_BLA_PERIOD_LENGTH));
+}
+
+/* The hash for claim and backbone hash receive the same key because they
+ * are getting initialized by hash_new with the same key. Reinitializing
+ * them with to different keys to allow nested locking without generating
+ * lockdep warnings
+ */
+static struct lock_class_key batadv_claim_hash_lock_class_key;
+static struct lock_class_key batadv_backbone_hash_lock_class_key;
+
+/**
+ * batadv_bla_init() - initialize all bla structures
+ * @bat_priv: the bat priv with all the mesh interface information
+ *
+ * Return: 0 on success, < 0 on error.
+ */
+int batadv_bla_init(struct batadv_priv *bat_priv)
+{
+ int i;
+ u8 claim_dest[ETH_ALEN] = {0xff, 0x43, 0x05, 0x00, 0x00, 0x00};
+ struct batadv_hard_iface *primary_if;
+ u16 crc;
+ unsigned long entrytime;
+
+ spin_lock_init(&bat_priv->bla.bcast_duplist_lock);
+
+ batadv_dbg(BATADV_DBG_BLA, bat_priv, "bla hash registering\n");
+
+ /* setting claim destination address */
+ memcpy(&bat_priv->bla.claim_dest.magic, claim_dest, 3);
+ bat_priv->bla.claim_dest.type = 0;
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (primary_if) {
+ crc = crc16(0, primary_if->net_dev->dev_addr, ETH_ALEN);
+ bat_priv->bla.claim_dest.group = htons(crc);
+ batadv_hardif_put(primary_if);
+ } else {
+ bat_priv->bla.claim_dest.group = 0; /* will be set later */
+ }
+
+ /* initialize the duplicate list */
+ entrytime = jiffies - msecs_to_jiffies(BATADV_DUPLIST_TIMEOUT);
+ for (i = 0; i < BATADV_DUPLIST_SIZE; i++)
+ bat_priv->bla.bcast_duplist[i].entrytime = entrytime;
+ bat_priv->bla.bcast_duplist_curr = 0;
+
+ atomic_set(&bat_priv->bla.loopdetect_next,
+ BATADV_BLA_LOOPDETECT_PERIODS);
+
+ if (bat_priv->bla.claim_hash)
+ return 0;
+
+ bat_priv->bla.claim_hash = batadv_hash_new(128);
+ if (!bat_priv->bla.claim_hash)
+ return -ENOMEM;
+
+ bat_priv->bla.backbone_hash = batadv_hash_new(32);
+ if (!bat_priv->bla.backbone_hash) {
+ batadv_hash_destroy(bat_priv->bla.claim_hash);
+ return -ENOMEM;
+ }
+
+ batadv_hash_set_lock_class(bat_priv->bla.claim_hash,
+ &batadv_claim_hash_lock_class_key);
+ batadv_hash_set_lock_class(bat_priv->bla.backbone_hash,
+ &batadv_backbone_hash_lock_class_key);
+
+ batadv_dbg(BATADV_DBG_BLA, bat_priv, "bla hashes initialized\n");
+
+ INIT_DELAYED_WORK(&bat_priv->bla.work, batadv_bla_periodic_work);
+
+ queue_delayed_work(batadv_event_workqueue, &bat_priv->bla.work,
+ msecs_to_jiffies(BATADV_BLA_PERIOD_LENGTH));
+ return 0;
+}
+
+/**
+ * batadv_bla_check_duplist() - Check if a frame is in the broadcast dup.
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: contains the multicast packet to be checked
+ * @payload_offset: offset in the skb, marking the start of the data to be CRC'ed
+ * @orig: originator mac address, NULL if unknown
+ *
+ * Check if it is on our broadcast list. Another gateway might have sent the
+ * same packet because it is connected to the same backbone, so we have to
+ * remove this duplicate.
+ *
+ * This is performed by checking the CRC, which will tell us
+ * with a good chance that it is the same packet. If it is furthermore
+ * sent by another host, drop it. We allow equal packets from
+ * the same host however as this might be intended.
+ *
+ * Return: true if a packet is in the duplicate list, false otherwise.
+ */
+static bool batadv_bla_check_duplist(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, int payload_offset,
+ const u8 *orig)
+{
+ struct batadv_bcast_duplist_entry *entry;
+ bool ret = false;
+ int payload_len;
+ int i, curr;
+ u32 crc;
+
+ /* calculate the crc ... */
+ payload_len = skb->len - payload_offset;
+ crc = skb_crc32c(skb, payload_offset, payload_len, 0);
+
+ spin_lock_bh(&bat_priv->bla.bcast_duplist_lock);
+
+ for (i = 0; i < BATADV_DUPLIST_SIZE; i++) {
+ curr = (bat_priv->bla.bcast_duplist_curr + i);
+ curr %= BATADV_DUPLIST_SIZE;
+ entry = &bat_priv->bla.bcast_duplist[curr];
+
+ /* we can stop searching if the entry is too old ;
+ * later entries will be even older
+ */
+ if (batadv_has_timed_out(entry->entrytime,
+ BATADV_DUPLIST_TIMEOUT))
+ break;
+
+ if (entry->crc != crc)
+ continue;
+
+ /* are the originators both known and not anonymous? */
+ if (orig && !is_zero_ether_addr(orig) &&
+ !is_zero_ether_addr(entry->orig)) {
+ /* If known, check if the new frame came from
+ * the same originator:
+ * We are safe to take identical frames from the
+ * same orig, if known, as multiplications in
+ * the mesh are detected via the (orig, seqno) pair.
+ * So we can be a bit more liberal here and allow
+ * identical frames from the same orig which the source
+ * host might have sent multiple times on purpose.
+ */
+ if (batadv_compare_eth(entry->orig, orig))
+ continue;
+ }
+
+ /* this entry seems to match: same crc, not too old,
+ * and from another gw. therefore return true to forbid it.
+ */
+ ret = true;
+ goto out;
+ }
+ /* not found, add a new entry (overwrite the oldest entry)
+ * and allow it, its the first occurrence.
+ */
+ curr = (bat_priv->bla.bcast_duplist_curr + BATADV_DUPLIST_SIZE - 1);
+ curr %= BATADV_DUPLIST_SIZE;
+ entry = &bat_priv->bla.bcast_duplist[curr];
+ entry->crc = crc;
+ entry->entrytime = jiffies;
+
+ /* known originator */
+ if (orig)
+ ether_addr_copy(entry->orig, orig);
+ /* anonymous originator */
+ else
+ eth_zero_addr(entry->orig);
+
+ bat_priv->bla.bcast_duplist_curr = curr;
+
+out:
+ spin_unlock_bh(&bat_priv->bla.bcast_duplist_lock);
+
+ return ret;
+}
+
+/**
+ * batadv_bla_check_ucast_duplist() - Check if a frame is in the broadcast dup.
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: contains the multicast packet to be checked, decapsulated from a
+ * unicast_packet
+ *
+ * Check if it is on our broadcast list. Another gateway might have sent the
+ * same packet because it is connected to the same backbone, so we have to
+ * remove this duplicate.
+ *
+ * Return: true if a packet is in the duplicate list, false otherwise.
+ */
+static bool batadv_bla_check_ucast_duplist(struct batadv_priv *bat_priv,
+ struct sk_buff *skb)
+{
+ return batadv_bla_check_duplist(bat_priv, skb, 0, NULL);
+}
+
+/**
+ * batadv_bla_check_bcast_duplist() - Check if a frame is in the broadcast dup.
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: contains the bcast_packet to be checked
+ *
+ * Check if it is on our broadcast list. Another gateway might have sent the
+ * same packet because it is connected to the same backbone, so we have to
+ * remove this duplicate.
+ *
+ * Return: true if a packet is in the duplicate list, false otherwise.
+ */
+bool batadv_bla_check_bcast_duplist(struct batadv_priv *bat_priv,
+ struct sk_buff *skb)
+{
+ struct batadv_bcast_packet *bcast_packet;
+
+ bcast_packet = (struct batadv_bcast_packet *)skb->data;
+
+ return batadv_bla_check_duplist(bat_priv, skb, sizeof(*bcast_packet),
+ bcast_packet->orig);
+}
+
+/**
+ * batadv_bla_is_backbone_gw_orig() - Check if the originator is a gateway for
+ * the VLAN identified by vid.
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @orig: originator mac address
+ * @vid: VLAN identifier
+ *
+ * Return: true if orig is a backbone for this vid, false otherwise.
+ */
+bool batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv, u8 *orig,
+ unsigned short vid)
+{
+ struct batadv_hashtable *hash = bat_priv->bla.backbone_hash;
+ struct hlist_head *head;
+ struct batadv_bla_backbone_gw *backbone_gw;
+ int i;
+
+ if (!atomic_read(&bat_priv->bridge_loop_avoidance))
+ return false;
+
+ if (!hash)
+ return false;
+
+ for (i = 0; i < hash->size; i++) {
+ head = &hash->table[i];
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(backbone_gw, head, hash_entry) {
+ if (batadv_compare_eth(backbone_gw->orig, orig) &&
+ backbone_gw->vid == vid) {
+ rcu_read_unlock();
+ return true;
+ }
+ }
+ rcu_read_unlock();
+ }
+
+ return false;
+}
+
+/**
+ * batadv_bla_is_backbone_gw() - check if originator is a backbone gw for a VLAN
+ * @skb: the frame to be checked
+ * @orig_node: the orig_node of the frame
+ * @hdr_size: maximum length of the frame
+ *
+ * Return: true if the orig_node is also a gateway on the mesh interface,
+ * otherwise it returns false.
+ */
+bool batadv_bla_is_backbone_gw(struct sk_buff *skb,
+ struct batadv_orig_node *orig_node, int hdr_size)
+{
+ struct batadv_bla_backbone_gw *backbone_gw;
+ unsigned short vid;
+
+ if (!atomic_read(&orig_node->bat_priv->bridge_loop_avoidance))
+ return false;
+
+ /* first, find out the vid. */
+ if (!pskb_may_pull(skb, hdr_size + ETH_HLEN))
+ return false;
+
+ vid = batadv_get_vid(skb, hdr_size);
+
+ /* see if this originator is a backbone gw for this VLAN */
+ backbone_gw = batadv_backbone_hash_find(orig_node->bat_priv,
+ orig_node->orig, vid);
+ if (!backbone_gw)
+ return false;
+
+ batadv_backbone_gw_put(backbone_gw);
+ return true;
+}
+
+/**
+ * batadv_bla_free() - free all bla structures
+ * @bat_priv: the bat priv with all the mesh interface information
+ *
+ * for meshinterface free or module unload
+ */
+void batadv_bla_free(struct batadv_priv *bat_priv)
+{
+ struct batadv_hard_iface *primary_if;
+
+ cancel_delayed_work_sync(&bat_priv->bla.work);
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+
+ if (bat_priv->bla.claim_hash) {
+ batadv_bla_purge_claims(bat_priv, primary_if, 1);
+ batadv_hash_destroy(bat_priv->bla.claim_hash);
+ bat_priv->bla.claim_hash = NULL;
+ }
+ if (bat_priv->bla.backbone_hash) {
+ batadv_bla_purge_backbone_gw(bat_priv, 1);
+ batadv_hash_destroy(bat_priv->bla.backbone_hash);
+ bat_priv->bla.backbone_hash = NULL;
+ }
+ batadv_hardif_put(primary_if);
+}
+
+/**
+ * batadv_bla_loopdetect_check() - check and handle a detected loop
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: the packet to check
+ * @primary_if: interface where the request came on
+ * @vid: the VLAN ID of the frame
+ *
+ * Checks if this packet is a loop detect frame which has been sent by us,
+ * throws an uevent and logs the event if that is the case.
+ *
+ * Return: true if it is a loop detect frame which is to be dropped, false
+ * otherwise.
+ */
+static bool
+batadv_bla_loopdetect_check(struct batadv_priv *bat_priv, struct sk_buff *skb,
+ struct batadv_hard_iface *primary_if,
+ unsigned short vid)
+{
+ struct batadv_bla_backbone_gw *backbone_gw;
+ struct ethhdr *ethhdr;
+ bool ret;
+
+ ethhdr = eth_hdr(skb);
+
+ /* Only check for the MAC address and skip more checks here for
+ * performance reasons - this function is on the hotpath, after all.
+ */
+ if (!batadv_compare_eth(ethhdr->h_source,
+ bat_priv->bla.loopdetect_addr))
+ return false;
+
+ /* If the packet came too late, don't forward it on the mesh
+ * but don't consider that as loop. It might be a coincidence.
+ */
+ if (batadv_has_timed_out(bat_priv->bla.loopdetect_lasttime,
+ BATADV_BLA_LOOPDETECT_TIMEOUT))
+ return true;
+
+ backbone_gw = batadv_bla_get_backbone_gw(bat_priv,
+ primary_if->net_dev->dev_addr,
+ vid, true);
+ if (unlikely(!backbone_gw))
+ return true;
+
+ ret = queue_work(batadv_event_workqueue, &backbone_gw->report_work);
+
+ /* backbone_gw is unreferenced in the report work function
+ * if queue_work() call was successful
+ */
+ if (!ret)
+ batadv_backbone_gw_put(backbone_gw);
+
+ return true;
+}
+
+/**
+ * batadv_bla_rx() - check packets coming from the mesh.
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: the frame to be checked
+ * @vid: the VLAN ID of the frame
+ * @packet_type: the batman packet type this frame came in
+ *
+ * batadv_bla_rx avoidance checks if:
+ * * we have to race for a claim
+ * * if the frame is allowed on the LAN
+ *
+ * In these cases, the skb is further handled by this function
+ *
+ * Return: true if handled, otherwise it returns false and the caller shall
+ * further process the skb.
+ */
+bool batadv_bla_rx(struct batadv_priv *bat_priv, struct sk_buff *skb,
+ unsigned short vid, int packet_type)
+{
+ struct batadv_bla_backbone_gw *backbone_gw;
+ struct ethhdr *ethhdr;
+ struct batadv_bla_claim search_claim, *claim = NULL;
+ struct batadv_hard_iface *primary_if;
+ bool own_claim;
+ bool ret;
+
+ ethhdr = eth_hdr(skb);
+
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (!primary_if)
+ goto handled;
+
+ if (!atomic_read(&bat_priv->bridge_loop_avoidance))
+ goto allow;
+
+ if (batadv_bla_loopdetect_check(bat_priv, skb, primary_if, vid))
+ goto handled;
+
+ if (unlikely(atomic_read(&bat_priv->bla.num_requests)))
+ /* don't allow multicast packets while requests are in flight */
+ if (is_multicast_ether_addr(ethhdr->h_dest))
+ /* Both broadcast flooding or multicast-via-unicasts
+ * delivery might send to multiple backbone gateways
+ * sharing the same LAN and therefore need to coordinate
+ * which backbone gateway forwards into the LAN,
+ * by claiming the payload source address.
+ *
+ * Broadcast flooding and multicast-via-unicasts
+ * delivery use the following two batman packet types.
+ * Note: explicitly exclude BATADV_UNICAST_4ADDR,
+ * as the DHCP gateway feature will send explicitly
+ * to only one BLA gateway, so the claiming process
+ * should be avoided there.
+ */
+ if (packet_type == BATADV_BCAST ||
+ packet_type == BATADV_UNICAST)
+ goto handled;
+
+ /* potential duplicates from foreign BLA backbone gateways via
+ * multicast-in-unicast packets
+ */
+ if (is_multicast_ether_addr(ethhdr->h_dest) &&
+ packet_type == BATADV_UNICAST &&
+ batadv_bla_check_ucast_duplist(bat_priv, skb))
+ goto handled;
+
+ ether_addr_copy(search_claim.addr, ethhdr->h_source);
+ search_claim.vid = vid;
+ claim = batadv_claim_hash_find(bat_priv, &search_claim);
+
+ if (!claim) {
+ bool local = batadv_is_my_client(bat_priv, ethhdr->h_source, vid);
+
+ /* possible optimization: race for a claim */
+ /* No claim exists yet, claim it for us!
+ */
+
+ batadv_dbg(BATADV_DBG_BLA, bat_priv,
+ "%s(): Unclaimed MAC %pM found. Claim it. Local: %s\n",
+ __func__, ethhdr->h_source, str_yes_no(local));
+ batadv_handle_claim(bat_priv, primary_if,
+ primary_if->net_dev->dev_addr,
+ ethhdr->h_source, vid);
+ goto allow;
+ }
+
+ /* if it is our own claim ... */
+ backbone_gw = batadv_bla_claim_get_backbone_gw(claim);
+ own_claim = batadv_compare_eth(backbone_gw->orig,
+ primary_if->net_dev->dev_addr);
+ batadv_backbone_gw_put(backbone_gw);
+
+ if (own_claim) {
+ /* ... allow it in any case */
+ claim->lasttime = jiffies;
+ goto allow;
+ }
+
+ /* if it is a multicast ... */
+ if (is_multicast_ether_addr(ethhdr->h_dest) &&
+ (packet_type == BATADV_BCAST || packet_type == BATADV_UNICAST)) {
+ /* ... drop it. the responsible gateway is in charge.
+ *
+ * We need to check packet type because with the gateway
+ * feature, broadcasts (like DHCP requests) may be sent
+ * using a unicast 4 address packet type. See comment above.
+ */
+ goto handled;
+ } else {
+ /* seems the client considers us as its best gateway.
+ * send a claim and update the claim table
+ * immediately.
+ */
+ batadv_handle_claim(bat_priv, primary_if,
+ primary_if->net_dev->dev_addr,
+ ethhdr->h_source, vid);
+ goto allow;
+ }
+allow:
+ batadv_bla_update_own_backbone_gw(bat_priv, primary_if, vid);
+ ret = false;
+ goto out;
+
+handled:
+ kfree_skb(skb);
+ ret = true;
+
+out:
+ batadv_hardif_put(primary_if);
+ batadv_claim_put(claim);
+ return ret;
+}
+
+/**
+ * batadv_bla_tx() - check packets going into the mesh
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: the frame to be checked
+ * @vid: the VLAN ID of the frame
+ *
+ * batadv_bla_tx checks if:
+ * * a claim was received which has to be processed
+ * * the frame is allowed on the mesh
+ *
+ * in these cases, the skb is further handled by this function.
+ *
+ * This call might reallocate skb data.
+ *
+ * Return: true if handled, otherwise it returns false and the caller shall
+ * further process the skb.
+ */
+bool batadv_bla_tx(struct batadv_priv *bat_priv, struct sk_buff *skb,
+ unsigned short vid)
+{
+ struct ethhdr *ethhdr;
+ struct batadv_bla_claim search_claim, *claim = NULL;
+ struct batadv_bla_backbone_gw *backbone_gw;
+ struct batadv_hard_iface *primary_if;
+ bool client_roamed;
+ bool ret = false;
+
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (!primary_if)
+ goto out;
+
+ if (!atomic_read(&bat_priv->bridge_loop_avoidance))
+ goto allow;
+
+ if (batadv_bla_process_claim(bat_priv, primary_if, skb))
+ goto handled;
+
+ ethhdr = eth_hdr(skb);
+
+ if (unlikely(atomic_read(&bat_priv->bla.num_requests)))
+ /* don't allow broadcasts while requests are in flight */
+ if (is_multicast_ether_addr(ethhdr->h_dest))
+ goto handled;
+
+ ether_addr_copy(search_claim.addr, ethhdr->h_source);
+ search_claim.vid = vid;
+
+ claim = batadv_claim_hash_find(bat_priv, &search_claim);
+
+ /* if no claim exists, allow it. */
+ if (!claim)
+ goto allow;
+
+ /* check if we are responsible. */
+ backbone_gw = batadv_bla_claim_get_backbone_gw(claim);
+ client_roamed = batadv_compare_eth(backbone_gw->orig,
+ primary_if->net_dev->dev_addr);
+ batadv_backbone_gw_put(backbone_gw);
+
+ if (client_roamed) {
+ /* if yes, the client has roamed and we have
+ * to unclaim it.
+ */
+ if (batadv_has_timed_out(claim->lasttime, 100)) {
+ /* only unclaim if the last claim entry is
+ * older than 100 ms to make sure we really
+ * have a roaming client here.
+ */
+ batadv_dbg(BATADV_DBG_BLA, bat_priv, "%s(): Roaming client %pM detected. Unclaim it.\n",
+ __func__, ethhdr->h_source);
+ batadv_handle_unclaim(bat_priv, primary_if,
+ primary_if->net_dev->dev_addr,
+ ethhdr->h_source, vid);
+ goto allow;
+ } else {
+ batadv_dbg(BATADV_DBG_BLA, bat_priv, "%s(): Race for claim %pM detected. Drop packet.\n",
+ __func__, ethhdr->h_source);
+ goto handled;
+ }
+ }
+
+ /* check if it is a multicast/broadcast frame */
+ if (is_multicast_ether_addr(ethhdr->h_dest)) {
+ /* drop it. the responsible gateway has forwarded it into
+ * the backbone network.
+ */
+ goto handled;
+ } else {
+ /* we must allow it. at least if we are
+ * responsible for the DESTINATION.
+ */
+ goto allow;
+ }
+allow:
+ batadv_bla_update_own_backbone_gw(bat_priv, primary_if, vid);
+ ret = false;
+ goto out;
+handled:
+ ret = true;
+out:
+ batadv_hardif_put(primary_if);
+ batadv_claim_put(claim);
+ return ret;
+}
+
+/**
+ * batadv_bla_claim_dump_entry() - dump one entry of the claim table
+ * to a netlink socket
+ * @msg: buffer for the message
+ * @portid: netlink port
+ * @cb: Control block containing additional options
+ * @primary_if: primary interface
+ * @claim: entry to dump
+ *
+ * Return: 0 or error code.
+ */
+static int
+batadv_bla_claim_dump_entry(struct sk_buff *msg, u32 portid,
+ struct netlink_callback *cb,
+ struct batadv_hard_iface *primary_if,
+ struct batadv_bla_claim *claim)
+{
+ const u8 *primary_addr = primary_if->net_dev->dev_addr;
+ u16 backbone_crc;
+ bool is_own;
+ void *hdr;
+ int ret = -EINVAL;
+
+ hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq,
+ &batadv_netlink_family, NLM_F_MULTI,
+ BATADV_CMD_GET_BLA_CLAIM);
+ if (!hdr) {
+ ret = -ENOBUFS;
+ goto out;
+ }
+
+ genl_dump_check_consistent(cb, hdr);
+
+ is_own = batadv_compare_eth(claim->backbone_gw->orig,
+ primary_addr);
+
+ spin_lock_bh(&claim->backbone_gw->crc_lock);
+ backbone_crc = claim->backbone_gw->crc;
+ spin_unlock_bh(&claim->backbone_gw->crc_lock);
+
+ if (is_own)
+ if (nla_put_flag(msg, BATADV_ATTR_BLA_OWN)) {
+ genlmsg_cancel(msg, hdr);
+ goto out;
+ }
+
+ if (nla_put(msg, BATADV_ATTR_BLA_ADDRESS, ETH_ALEN, claim->addr) ||
+ nla_put_u16(msg, BATADV_ATTR_BLA_VID, claim->vid) ||
+ nla_put(msg, BATADV_ATTR_BLA_BACKBONE, ETH_ALEN,
+ claim->backbone_gw->orig) ||
+ nla_put_u16(msg, BATADV_ATTR_BLA_CRC,
+ backbone_crc)) {
+ genlmsg_cancel(msg, hdr);
+ goto out;
+ }
+
+ genlmsg_end(msg, hdr);
+ ret = 0;
+
+out:
+ return ret;
+}
+
+/**
+ * batadv_bla_claim_dump_bucket() - dump one bucket of the claim table
+ * to a netlink socket
+ * @msg: buffer for the message
+ * @portid: netlink port
+ * @cb: Control block containing additional options
+ * @primary_if: primary interface
+ * @hash: hash to dump
+ * @bucket: bucket index to dump
+ * @idx_skip: How many entries to skip
+ *
+ * Return: always 0.
+ */
+static int
+batadv_bla_claim_dump_bucket(struct sk_buff *msg, u32 portid,
+ struct netlink_callback *cb,
+ struct batadv_hard_iface *primary_if,
+ struct batadv_hashtable *hash, unsigned int bucket,
+ int *idx_skip)
+{
+ struct batadv_bla_claim *claim;
+ int idx = 0;
+ int ret = 0;
+
+ spin_lock_bh(&hash->list_locks[bucket]);
+ cb->seq = atomic_read(&hash->generation) << 1 | 1;
+
+ hlist_for_each_entry(claim, &hash->table[bucket], hash_entry) {
+ if (idx++ < *idx_skip)
+ continue;
+
+ ret = batadv_bla_claim_dump_entry(msg, portid, cb,
+ primary_if, claim);
+ if (ret) {
+ *idx_skip = idx - 1;
+ goto unlock;
+ }
+ }
+
+ *idx_skip = 0;
+unlock:
+ spin_unlock_bh(&hash->list_locks[bucket]);
+ return ret;
+}
+
+/**
+ * batadv_bla_claim_dump() - dump claim table to a netlink socket
+ * @msg: buffer for the message
+ * @cb: callback structure containing arguments
+ *
+ * Return: message length.
+ */
+int batadv_bla_claim_dump(struct sk_buff *msg, struct netlink_callback *cb)
+{
+ struct batadv_hard_iface *primary_if = NULL;
+ int portid = NETLINK_CB(cb->skb).portid;
+ struct net_device *mesh_iface;
+ struct batadv_hashtable *hash;
+ struct batadv_priv *bat_priv;
+ int bucket = cb->args[0];
+ int idx = cb->args[1];
+ int ret = 0;
+
+ mesh_iface = batadv_netlink_get_meshif(cb);
+ if (IS_ERR(mesh_iface))
+ return PTR_ERR(mesh_iface);
+
+ bat_priv = netdev_priv(mesh_iface);
+ hash = bat_priv->bla.claim_hash;
+
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ while (bucket < hash->size) {
+ if (batadv_bla_claim_dump_bucket(msg, portid, cb, primary_if,
+ hash, bucket, &idx))
+ break;
+ bucket++;
+ }
+
+ cb->args[0] = bucket;
+ cb->args[1] = idx;
+
+ ret = msg->len;
+
+out:
+ batadv_hardif_put(primary_if);
+
+ dev_put(mesh_iface);
+
+ return ret;
+}
+
+/**
+ * batadv_bla_backbone_dump_entry() - dump one entry of the backbone table to a
+ * netlink socket
+ * @msg: buffer for the message
+ * @portid: netlink port
+ * @cb: Control block containing additional options
+ * @primary_if: primary interface
+ * @backbone_gw: entry to dump
+ *
+ * Return: 0 or error code.
+ */
+static int
+batadv_bla_backbone_dump_entry(struct sk_buff *msg, u32 portid,
+ struct netlink_callback *cb,
+ struct batadv_hard_iface *primary_if,
+ struct batadv_bla_backbone_gw *backbone_gw)
+{
+ const u8 *primary_addr = primary_if->net_dev->dev_addr;
+ u16 backbone_crc;
+ bool is_own;
+ int msecs;
+ void *hdr;
+ int ret = -EINVAL;
+
+ hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq,
+ &batadv_netlink_family, NLM_F_MULTI,
+ BATADV_CMD_GET_BLA_BACKBONE);
+ if (!hdr) {
+ ret = -ENOBUFS;
+ goto out;
+ }
+
+ genl_dump_check_consistent(cb, hdr);
+
+ is_own = batadv_compare_eth(backbone_gw->orig, primary_addr);
+
+ spin_lock_bh(&backbone_gw->crc_lock);
+ backbone_crc = backbone_gw->crc;
+ spin_unlock_bh(&backbone_gw->crc_lock);
+
+ msecs = jiffies_to_msecs(jiffies - backbone_gw->lasttime);
+
+ if (is_own)
+ if (nla_put_flag(msg, BATADV_ATTR_BLA_OWN)) {
+ genlmsg_cancel(msg, hdr);
+ goto out;
+ }
+
+ if (nla_put(msg, BATADV_ATTR_BLA_BACKBONE, ETH_ALEN,
+ backbone_gw->orig) ||
+ nla_put_u16(msg, BATADV_ATTR_BLA_VID, backbone_gw->vid) ||
+ nla_put_u16(msg, BATADV_ATTR_BLA_CRC,
+ backbone_crc) ||
+ nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS, msecs)) {
+ genlmsg_cancel(msg, hdr);
+ goto out;
+ }
+
+ genlmsg_end(msg, hdr);
+ ret = 0;
+
+out:
+ return ret;
+}
+
+/**
+ * batadv_bla_backbone_dump_bucket() - dump one bucket of the backbone table to
+ * a netlink socket
+ * @msg: buffer for the message
+ * @portid: netlink port
+ * @cb: Control block containing additional options
+ * @primary_if: primary interface
+ * @hash: hash to dump
+ * @bucket: bucket index to dump
+ * @idx_skip: How many entries to skip
+ *
+ * Return: always 0.
+ */
+static int
+batadv_bla_backbone_dump_bucket(struct sk_buff *msg, u32 portid,
+ struct netlink_callback *cb,
+ struct batadv_hard_iface *primary_if,
+ struct batadv_hashtable *hash,
+ unsigned int bucket, int *idx_skip)
+{
+ struct batadv_bla_backbone_gw *backbone_gw;
+ int idx = 0;
+ int ret = 0;
+
+ spin_lock_bh(&hash->list_locks[bucket]);
+ cb->seq = atomic_read(&hash->generation) << 1 | 1;
+
+ hlist_for_each_entry(backbone_gw, &hash->table[bucket], hash_entry) {
+ if (idx++ < *idx_skip)
+ continue;
+
+ ret = batadv_bla_backbone_dump_entry(msg, portid, cb,
+ primary_if, backbone_gw);
+ if (ret) {
+ *idx_skip = idx - 1;
+ goto unlock;
+ }
+ }
+
+ *idx_skip = 0;
+unlock:
+ spin_unlock_bh(&hash->list_locks[bucket]);
+ return ret;
+}
+
+/**
+ * batadv_bla_backbone_dump() - dump backbone table to a netlink socket
+ * @msg: buffer for the message
+ * @cb: callback structure containing arguments
+ *
+ * Return: message length.
+ */
+int batadv_bla_backbone_dump(struct sk_buff *msg, struct netlink_callback *cb)
+{
+ struct batadv_hard_iface *primary_if = NULL;
+ int portid = NETLINK_CB(cb->skb).portid;
+ struct net_device *mesh_iface;
+ struct batadv_hashtable *hash;
+ struct batadv_priv *bat_priv;
+ int bucket = cb->args[0];
+ int idx = cb->args[1];
+ int ret = 0;
+
+ mesh_iface = batadv_netlink_get_meshif(cb);
+ if (IS_ERR(mesh_iface))
+ return PTR_ERR(mesh_iface);
+
+ bat_priv = netdev_priv(mesh_iface);
+ hash = bat_priv->bla.backbone_hash;
+
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ while (bucket < hash->size) {
+ if (batadv_bla_backbone_dump_bucket(msg, portid, cb, primary_if,
+ hash, bucket, &idx))
+ break;
+ bucket++;
+ }
+
+ cb->args[0] = bucket;
+ cb->args[1] = idx;
+
+ ret = msg->len;
+
+out:
+ batadv_hardif_put(primary_if);
+
+ dev_put(mesh_iface);
+
+ return ret;
+}
+
+#ifdef CONFIG_BATMAN_ADV_DAT
+/**
+ * batadv_bla_check_claim() - check if address is claimed
+ *
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @addr: mac address of which the claim status is checked
+ * @vid: the VLAN ID
+ *
+ * addr is checked if this address is claimed by the local device itself.
+ *
+ * Return: true if bla is disabled or the mac is claimed by the device,
+ * false if the device addr is already claimed by another gateway
+ */
+bool batadv_bla_check_claim(struct batadv_priv *bat_priv,
+ u8 *addr, unsigned short vid)
+{
+ struct batadv_bla_claim search_claim;
+ struct batadv_bla_claim *claim = NULL;
+ struct batadv_hard_iface *primary_if = NULL;
+ bool ret = true;
+
+ if (!atomic_read(&bat_priv->bridge_loop_avoidance))
+ return ret;
+
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (!primary_if)
+ return ret;
+
+ /* First look if the mac address is claimed */
+ ether_addr_copy(search_claim.addr, addr);
+ search_claim.vid = vid;
+
+ claim = batadv_claim_hash_find(bat_priv, &search_claim);
+
+ /* If there is a claim and we are not owner of the claim,
+ * return false.
+ */
+ if (claim) {
+ if (!batadv_compare_eth(claim->backbone_gw->orig,
+ primary_if->net_dev->dev_addr))
+ ret = false;
+ batadv_claim_put(claim);
+ }
+
+ batadv_hardif_put(primary_if);
+ return ret;
+}
+#endif
diff --git a/net/batman-adv/bridge_loop_avoidance.h b/net/batman-adv/bridge_loop_avoidance.h
new file mode 100644
index 000000000000..8673a265995f
--- /dev/null
+++ b/net/batman-adv/bridge_loop_avoidance.h
@@ -0,0 +1,132 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) B.A.T.M.A.N. contributors:
+ *
+ * Simon Wunderlich
+ */
+
+#ifndef _NET_BATMAN_ADV_BLA_H_
+#define _NET_BATMAN_ADV_BLA_H_
+
+#include "main.h"
+
+#include <linux/compiler.h>
+#include <linux/netdevice.h>
+#include <linux/netlink.h>
+#include <linux/skbuff.h>
+#include <linux/stddef.h>
+#include <linux/types.h>
+
+/**
+ * batadv_bla_is_loopdetect_mac() - check if the mac address is from a loop
+ * detect frame sent by bridge loop avoidance
+ * @mac: mac address to check
+ *
+ * Return: true if the it looks like a loop detect frame
+ * (mac starts with BA:BE), false otherwise
+ */
+static inline bool batadv_bla_is_loopdetect_mac(const uint8_t *mac)
+{
+ if (mac[0] == 0xba && mac[1] == 0xbe)
+ return true;
+
+ return false;
+}
+
+#ifdef CONFIG_BATMAN_ADV_BLA
+bool batadv_bla_rx(struct batadv_priv *bat_priv, struct sk_buff *skb,
+ unsigned short vid, int packet_type);
+bool batadv_bla_tx(struct batadv_priv *bat_priv, struct sk_buff *skb,
+ unsigned short vid);
+bool batadv_bla_is_backbone_gw(struct sk_buff *skb,
+ struct batadv_orig_node *orig_node,
+ int hdr_size);
+int batadv_bla_claim_dump(struct sk_buff *msg, struct netlink_callback *cb);
+int batadv_bla_backbone_dump(struct sk_buff *msg, struct netlink_callback *cb);
+bool batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv, u8 *orig,
+ unsigned short vid);
+bool batadv_bla_check_bcast_duplist(struct batadv_priv *bat_priv,
+ struct sk_buff *skb);
+void batadv_bla_update_orig_address(struct batadv_priv *bat_priv,
+ struct batadv_hard_iface *primary_if,
+ struct batadv_hard_iface *oldif);
+void batadv_bla_status_update(struct net_device *net_dev);
+int batadv_bla_init(struct batadv_priv *bat_priv);
+void batadv_bla_free(struct batadv_priv *bat_priv);
+#ifdef CONFIG_BATMAN_ADV_DAT
+bool batadv_bla_check_claim(struct batadv_priv *bat_priv, u8 *addr,
+ unsigned short vid);
+#endif
+#define BATADV_BLA_CRC_INIT 0
+#else /* ifdef CONFIG_BATMAN_ADV_BLA */
+
+static inline bool batadv_bla_rx(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, unsigned short vid,
+ int packet_type)
+{
+ return false;
+}
+
+static inline bool batadv_bla_tx(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, unsigned short vid)
+{
+ return false;
+}
+
+static inline bool batadv_bla_is_backbone_gw(struct sk_buff *skb,
+ struct batadv_orig_node *orig_node,
+ int hdr_size)
+{
+ return false;
+}
+
+static inline bool batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv,
+ u8 *orig, unsigned short vid)
+{
+ return false;
+}
+
+static inline bool
+batadv_bla_check_bcast_duplist(struct batadv_priv *bat_priv,
+ struct sk_buff *skb)
+{
+ return false;
+}
+
+static inline void
+batadv_bla_update_orig_address(struct batadv_priv *bat_priv,
+ struct batadv_hard_iface *primary_if,
+ struct batadv_hard_iface *oldif)
+{
+}
+
+static inline int batadv_bla_init(struct batadv_priv *bat_priv)
+{
+ return 1;
+}
+
+static inline void batadv_bla_free(struct batadv_priv *bat_priv)
+{
+}
+
+static inline int batadv_bla_claim_dump(struct sk_buff *msg,
+ struct netlink_callback *cb)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int batadv_bla_backbone_dump(struct sk_buff *msg,
+ struct netlink_callback *cb)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline
+bool batadv_bla_check_claim(struct batadv_priv *bat_priv, u8 *addr,
+ unsigned short vid)
+{
+ return true;
+}
+
+#endif /* ifdef CONFIG_BATMAN_ADV_BLA */
+
+#endif /* ifndef _NET_BATMAN_ADV_BLA_H_ */
diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c
new file mode 100644
index 000000000000..8b8132eb0a79
--- /dev/null
+++ b/net/batman-adv/distributed-arp-table.c
@@ -0,0 +1,1820 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) B.A.T.M.A.N. contributors:
+ *
+ * Antonio Quartulli
+ */
+
+#include "distributed-arp-table.h"
+#include "main.h"
+
+#include <linux/atomic.h>
+#include <linux/bitops.h>
+#include <linux/byteorder/generic.h>
+#include <linux/container_of.h>
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/etherdevice.h>
+#include <linux/gfp.h>
+#include <linux/if_arp.h>
+#include <linux/if_ether.h>
+#include <linux/if_vlan.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/jiffies.h>
+#include <linux/kref.h>
+#include <linux/list.h>
+#include <linux/netlink.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/stddef.h>
+#include <linux/string.h>
+#include <linux/udp.h>
+#include <linux/unaligned.h>
+#include <linux/workqueue.h>
+#include <net/arp.h>
+#include <net/genetlink.h>
+#include <net/netlink.h>
+#include <uapi/linux/batman_adv.h>
+
+#include "bridge_loop_avoidance.h"
+#include "hard-interface.h"
+#include "hash.h"
+#include "log.h"
+#include "netlink.h"
+#include "originator.h"
+#include "send.h"
+#include "translation-table.h"
+#include "tvlv.h"
+
+enum batadv_bootpop {
+ BATADV_BOOTREPLY = 2,
+};
+
+enum batadv_boothtype {
+ BATADV_HTYPE_ETHERNET = 1,
+};
+
+enum batadv_dhcpoptioncode {
+ BATADV_DHCP_OPT_PAD = 0,
+ BATADV_DHCP_OPT_MSG_TYPE = 53,
+ BATADV_DHCP_OPT_END = 255,
+};
+
+enum batadv_dhcptype {
+ BATADV_DHCPACK = 5,
+};
+
+/* { 99, 130, 83, 99 } */
+#define BATADV_DHCP_MAGIC 1669485411
+
+struct batadv_dhcp_packet {
+ __u8 op;
+ __u8 htype;
+ __u8 hlen;
+ __u8 hops;
+ __be32 xid;
+ __be16 secs;
+ __be16 flags;
+ __be32 ciaddr;
+ __be32 yiaddr;
+ __be32 siaddr;
+ __be32 giaddr;
+ __u8 chaddr[16];
+ __u8 sname[64];
+ __u8 file[128];
+ __be32 magic;
+ /* __u8 options[]; */
+};
+
+#define BATADV_DHCP_YIADDR_LEN sizeof(((struct batadv_dhcp_packet *)0)->yiaddr)
+#define BATADV_DHCP_CHADDR_LEN sizeof(((struct batadv_dhcp_packet *)0)->chaddr)
+
+static void batadv_dat_purge(struct work_struct *work);
+
+/**
+ * batadv_dat_start_timer() - initialise the DAT periodic worker
+ * @bat_priv: the bat priv with all the mesh interface information
+ */
+static void batadv_dat_start_timer(struct batadv_priv *bat_priv)
+{
+ queue_delayed_work(batadv_event_workqueue, &bat_priv->dat.work,
+ msecs_to_jiffies(10000));
+}
+
+/**
+ * batadv_dat_entry_release() - release dat_entry from lists and queue for free
+ * after rcu grace period
+ * @ref: kref pointer of the dat_entry
+ */
+static void batadv_dat_entry_release(struct kref *ref)
+{
+ struct batadv_dat_entry *dat_entry;
+
+ dat_entry = container_of(ref, struct batadv_dat_entry, refcount);
+
+ kfree_rcu(dat_entry, rcu);
+}
+
+/**
+ * batadv_dat_entry_put() - decrement the dat_entry refcounter and possibly
+ * release it
+ * @dat_entry: dat_entry to be free'd
+ */
+static void batadv_dat_entry_put(struct batadv_dat_entry *dat_entry)
+{
+ if (!dat_entry)
+ return;
+
+ kref_put(&dat_entry->refcount, batadv_dat_entry_release);
+}
+
+/**
+ * batadv_dat_to_purge() - check whether a dat_entry has to be purged or not
+ * @dat_entry: the entry to check
+ *
+ * Return: true if the entry has to be purged now, false otherwise.
+ */
+static bool batadv_dat_to_purge(struct batadv_dat_entry *dat_entry)
+{
+ return batadv_has_timed_out(dat_entry->last_update,
+ BATADV_DAT_ENTRY_TIMEOUT);
+}
+
+/**
+ * __batadv_dat_purge() - delete entries from the DAT local storage
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @to_purge: function in charge to decide whether an entry has to be purged or
+ * not. This function takes the dat_entry as argument and has to
+ * returns a boolean value: true is the entry has to be deleted,
+ * false otherwise
+ *
+ * Loops over each entry in the DAT local storage and deletes it if and only if
+ * the to_purge function passed as argument returns true.
+ */
+static void __batadv_dat_purge(struct batadv_priv *bat_priv,
+ bool (*to_purge)(struct batadv_dat_entry *))
+{
+ spinlock_t *list_lock; /* protects write access to the hash lists */
+ struct batadv_dat_entry *dat_entry;
+ struct hlist_node *node_tmp;
+ struct hlist_head *head;
+ u32 i;
+
+ if (!bat_priv->dat.hash)
+ return;
+
+ for (i = 0; i < bat_priv->dat.hash->size; i++) {
+ head = &bat_priv->dat.hash->table[i];
+ list_lock = &bat_priv->dat.hash->list_locks[i];
+
+ spin_lock_bh(list_lock);
+ hlist_for_each_entry_safe(dat_entry, node_tmp, head,
+ hash_entry) {
+ /* if a helper function has been passed as parameter,
+ * ask it if the entry has to be purged or not
+ */
+ if (to_purge && !to_purge(dat_entry))
+ continue;
+
+ hlist_del_rcu(&dat_entry->hash_entry);
+ batadv_dat_entry_put(dat_entry);
+ }
+ spin_unlock_bh(list_lock);
+ }
+}
+
+/**
+ * batadv_dat_purge() - periodic task that deletes old entries from the local
+ * DAT hash table
+ * @work: kernel work struct
+ */
+static void batadv_dat_purge(struct work_struct *work)
+{
+ struct delayed_work *delayed_work;
+ struct batadv_priv_dat *priv_dat;
+ struct batadv_priv *bat_priv;
+
+ delayed_work = to_delayed_work(work);
+ priv_dat = container_of(delayed_work, struct batadv_priv_dat, work);
+ bat_priv = container_of(priv_dat, struct batadv_priv, dat);
+
+ __batadv_dat_purge(bat_priv, batadv_dat_to_purge);
+ batadv_dat_start_timer(bat_priv);
+}
+
+/**
+ * batadv_compare_dat() - comparing function used in the local DAT hash table
+ * @node: node in the local table
+ * @data2: second object to compare the node to
+ *
+ * Return: true if the two entries are the same, false otherwise.
+ */
+static bool batadv_compare_dat(const struct hlist_node *node, const void *data2)
+{
+ const void *data1 = container_of(node, struct batadv_dat_entry,
+ hash_entry);
+
+ return memcmp(data1, data2, sizeof(__be32)) == 0;
+}
+
+/**
+ * batadv_arp_hw_src() - extract the hw_src field from an ARP packet
+ * @skb: ARP packet
+ * @hdr_size: size of the possible header before the ARP packet
+ *
+ * Return: the value of the hw_src field in the ARP packet.
+ */
+static u8 *batadv_arp_hw_src(struct sk_buff *skb, int hdr_size)
+{
+ u8 *addr;
+
+ addr = (u8 *)(skb->data + hdr_size);
+ addr += ETH_HLEN + sizeof(struct arphdr);
+
+ return addr;
+}
+
+/**
+ * batadv_arp_ip_src() - extract the ip_src field from an ARP packet
+ * @skb: ARP packet
+ * @hdr_size: size of the possible header before the ARP packet
+ *
+ * Return: the value of the ip_src field in the ARP packet.
+ */
+static __be32 batadv_arp_ip_src(struct sk_buff *skb, int hdr_size)
+{
+ return *(__force __be32 *)(batadv_arp_hw_src(skb, hdr_size) + ETH_ALEN);
+}
+
+/**
+ * batadv_arp_hw_dst() - extract the hw_dst field from an ARP packet
+ * @skb: ARP packet
+ * @hdr_size: size of the possible header before the ARP packet
+ *
+ * Return: the value of the hw_dst field in the ARP packet.
+ */
+static u8 *batadv_arp_hw_dst(struct sk_buff *skb, int hdr_size)
+{
+ return batadv_arp_hw_src(skb, hdr_size) + ETH_ALEN + 4;
+}
+
+/**
+ * batadv_arp_ip_dst() - extract the ip_dst field from an ARP packet
+ * @skb: ARP packet
+ * @hdr_size: size of the possible header before the ARP packet
+ *
+ * Return: the value of the ip_dst field in the ARP packet.
+ */
+static __be32 batadv_arp_ip_dst(struct sk_buff *skb, int hdr_size)
+{
+ u8 *dst = batadv_arp_hw_src(skb, hdr_size) + ETH_ALEN * 2 + 4;
+
+ return *(__force __be32 *)dst;
+}
+
+/**
+ * batadv_hash_dat() - compute the hash value for an IP address
+ * @data: data to hash
+ * @size: size of the hash table
+ *
+ * Return: the selected index in the hash table for the given data.
+ */
+static u32 batadv_hash_dat(const void *data, u32 size)
+{
+ u32 hash = 0;
+ const struct batadv_dat_entry *dat = data;
+ const unsigned char *key;
+ __be16 vid;
+ u32 i;
+
+ key = (__force const unsigned char *)&dat->ip;
+ for (i = 0; i < sizeof(dat->ip); i++) {
+ hash += key[i];
+ hash += (hash << 10);
+ hash ^= (hash >> 6);
+ }
+
+ vid = htons(dat->vid);
+ key = (__force const unsigned char *)&vid;
+ for (i = 0; i < sizeof(dat->vid); i++) {
+ hash += key[i];
+ hash += (hash << 10);
+ hash ^= (hash >> 6);
+ }
+
+ hash += (hash << 3);
+ hash ^= (hash >> 11);
+ hash += (hash << 15);
+
+ return hash % size;
+}
+
+/**
+ * batadv_dat_entry_hash_find() - look for a given dat_entry in the local hash
+ * table
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @ip: search key
+ * @vid: VLAN identifier
+ *
+ * Return: the dat_entry if found, NULL otherwise.
+ */
+static struct batadv_dat_entry *
+batadv_dat_entry_hash_find(struct batadv_priv *bat_priv, __be32 ip,
+ unsigned short vid)
+{
+ struct hlist_head *head;
+ struct batadv_dat_entry to_find, *dat_entry, *dat_entry_tmp = NULL;
+ struct batadv_hashtable *hash = bat_priv->dat.hash;
+ u32 index;
+
+ if (!hash)
+ return NULL;
+
+ to_find.ip = ip;
+ to_find.vid = vid;
+
+ index = batadv_hash_dat(&to_find, hash->size);
+ head = &hash->table[index];
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(dat_entry, head, hash_entry) {
+ if (dat_entry->ip != ip)
+ continue;
+
+ if (!kref_get_unless_zero(&dat_entry->refcount))
+ continue;
+
+ dat_entry_tmp = dat_entry;
+ break;
+ }
+ rcu_read_unlock();
+
+ return dat_entry_tmp;
+}
+
+/**
+ * batadv_dat_entry_add() - add a new dat entry or update it if already exists
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @ip: ipv4 to add/edit
+ * @mac_addr: mac address to assign to the given ipv4
+ * @vid: VLAN identifier
+ */
+static void batadv_dat_entry_add(struct batadv_priv *bat_priv, __be32 ip,
+ u8 *mac_addr, unsigned short vid)
+{
+ struct batadv_dat_entry *dat_entry;
+ int hash_added;
+
+ dat_entry = batadv_dat_entry_hash_find(bat_priv, ip, vid);
+ /* if this entry is already known, just update it */
+ if (dat_entry) {
+ if (!batadv_compare_eth(dat_entry->mac_addr, mac_addr))
+ ether_addr_copy(dat_entry->mac_addr, mac_addr);
+ dat_entry->last_update = jiffies;
+ batadv_dbg(BATADV_DBG_DAT, bat_priv,
+ "Entry updated: %pI4 %pM (vid: %d)\n",
+ &dat_entry->ip, dat_entry->mac_addr,
+ batadv_print_vid(vid));
+ goto out;
+ }
+
+ dat_entry = kmalloc(sizeof(*dat_entry), GFP_ATOMIC);
+ if (!dat_entry)
+ goto out;
+
+ dat_entry->ip = ip;
+ dat_entry->vid = vid;
+ ether_addr_copy(dat_entry->mac_addr, mac_addr);
+ dat_entry->last_update = jiffies;
+ kref_init(&dat_entry->refcount);
+
+ kref_get(&dat_entry->refcount);
+ hash_added = batadv_hash_add(bat_priv->dat.hash, batadv_compare_dat,
+ batadv_hash_dat, dat_entry,
+ &dat_entry->hash_entry);
+
+ if (unlikely(hash_added != 0)) {
+ /* remove the reference for the hash */
+ batadv_dat_entry_put(dat_entry);
+ goto out;
+ }
+
+ batadv_dbg(BATADV_DBG_DAT, bat_priv, "New entry added: %pI4 %pM (vid: %d)\n",
+ &dat_entry->ip, dat_entry->mac_addr, batadv_print_vid(vid));
+
+out:
+ batadv_dat_entry_put(dat_entry);
+}
+
+#ifdef CONFIG_BATMAN_ADV_DEBUG
+
+/**
+ * batadv_dbg_arp() - print a debug message containing all the ARP packet
+ * details
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: ARP packet
+ * @hdr_size: size of the possible header before the ARP packet
+ * @msg: message to print together with the debugging information
+ */
+static void batadv_dbg_arp(struct batadv_priv *bat_priv, struct sk_buff *skb,
+ int hdr_size, char *msg)
+{
+ struct batadv_unicast_4addr_packet *unicast_4addr_packet;
+ struct batadv_bcast_packet *bcast_pkt;
+ u8 *orig_addr;
+ __be32 ip_src, ip_dst;
+
+ if (msg)
+ batadv_dbg(BATADV_DBG_DAT, bat_priv, "%s\n", msg);
+
+ ip_src = batadv_arp_ip_src(skb, hdr_size);
+ ip_dst = batadv_arp_ip_dst(skb, hdr_size);
+ batadv_dbg(BATADV_DBG_DAT, bat_priv,
+ "ARP MSG = [src: %pM-%pI4 dst: %pM-%pI4]\n",
+ batadv_arp_hw_src(skb, hdr_size), &ip_src,
+ batadv_arp_hw_dst(skb, hdr_size), &ip_dst);
+
+ if (hdr_size < sizeof(struct batadv_unicast_packet))
+ return;
+
+ unicast_4addr_packet = (struct batadv_unicast_4addr_packet *)skb->data;
+
+ switch (unicast_4addr_packet->u.packet_type) {
+ case BATADV_UNICAST:
+ batadv_dbg(BATADV_DBG_DAT, bat_priv,
+ "* encapsulated within a UNICAST packet\n");
+ break;
+ case BATADV_UNICAST_4ADDR:
+ batadv_dbg(BATADV_DBG_DAT, bat_priv,
+ "* encapsulated within a UNICAST_4ADDR packet (src: %pM)\n",
+ unicast_4addr_packet->src);
+ switch (unicast_4addr_packet->subtype) {
+ case BATADV_P_DAT_DHT_PUT:
+ batadv_dbg(BATADV_DBG_DAT, bat_priv, "* type: DAT_DHT_PUT\n");
+ break;
+ case BATADV_P_DAT_DHT_GET:
+ batadv_dbg(BATADV_DBG_DAT, bat_priv, "* type: DAT_DHT_GET\n");
+ break;
+ case BATADV_P_DAT_CACHE_REPLY:
+ batadv_dbg(BATADV_DBG_DAT, bat_priv,
+ "* type: DAT_CACHE_REPLY\n");
+ break;
+ case BATADV_P_DATA:
+ batadv_dbg(BATADV_DBG_DAT, bat_priv, "* type: DATA\n");
+ break;
+ default:
+ batadv_dbg(BATADV_DBG_DAT, bat_priv, "* type: Unknown (%u)!\n",
+ unicast_4addr_packet->u.packet_type);
+ }
+ break;
+ case BATADV_BCAST:
+ bcast_pkt = (struct batadv_bcast_packet *)unicast_4addr_packet;
+ orig_addr = bcast_pkt->orig;
+ batadv_dbg(BATADV_DBG_DAT, bat_priv,
+ "* encapsulated within a BCAST packet (src: %pM)\n",
+ orig_addr);
+ break;
+ default:
+ batadv_dbg(BATADV_DBG_DAT, bat_priv,
+ "* encapsulated within an unknown packet type (0x%x)\n",
+ unicast_4addr_packet->u.packet_type);
+ }
+}
+
+#else
+
+static void batadv_dbg_arp(struct batadv_priv *bat_priv, struct sk_buff *skb,
+ int hdr_size, char *msg)
+{
+}
+
+#endif /* CONFIG_BATMAN_ADV_DEBUG */
+
+/**
+ * batadv_is_orig_node_eligible() - check whether a node can be a DHT candidate
+ * @res: the array with the already selected candidates
+ * @select: number of already selected candidates
+ * @tmp_max: address of the currently evaluated node
+ * @max: current round max address
+ * @last_max: address of the last selected candidate
+ * @candidate: orig_node under evaluation
+ * @max_orig_node: last selected candidate
+ *
+ * Return: true if the node has been elected as next candidate or false
+ * otherwise.
+ */
+static bool batadv_is_orig_node_eligible(struct batadv_dat_candidate *res,
+ int select, batadv_dat_addr_t tmp_max,
+ batadv_dat_addr_t max,
+ batadv_dat_addr_t last_max,
+ struct batadv_orig_node *candidate,
+ struct batadv_orig_node *max_orig_node)
+{
+ bool ret = false;
+ int j;
+
+ /* check if orig node candidate is running DAT */
+ if (!test_bit(BATADV_ORIG_CAPA_HAS_DAT, &candidate->capabilities))
+ goto out;
+
+ /* Check if this node has already been selected... */
+ for (j = 0; j < select; j++)
+ if (res[j].orig_node == candidate)
+ break;
+ /* ..and possibly skip it */
+ if (j < select)
+ goto out;
+ /* sanity check: has it already been selected? This should not happen */
+ if (tmp_max > last_max)
+ goto out;
+ /* check if during this iteration an originator with a closer dht
+ * address has already been found
+ */
+ if (tmp_max < max)
+ goto out;
+ /* this is an hash collision with the temporary selected node. Choose
+ * the one with the lowest address
+ */
+ if (tmp_max == max && max_orig_node &&
+ batadv_compare_eth(candidate->orig, max_orig_node->orig))
+ goto out;
+
+ ret = true;
+out:
+ return ret;
+}
+
+/**
+ * batadv_choose_next_candidate() - select the next DHT candidate
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @cands: candidates array
+ * @select: number of candidates already present in the array
+ * @ip_key: key to look up in the DHT
+ * @last_max: pointer where the address of the selected candidate will be saved
+ */
+static void batadv_choose_next_candidate(struct batadv_priv *bat_priv,
+ struct batadv_dat_candidate *cands,
+ int select, batadv_dat_addr_t ip_key,
+ batadv_dat_addr_t *last_max)
+{
+ batadv_dat_addr_t max = 0;
+ batadv_dat_addr_t tmp_max = 0;
+ struct batadv_orig_node *orig_node, *max_orig_node = NULL;
+ struct batadv_hashtable *hash = bat_priv->orig_hash;
+ struct hlist_head *head;
+ int i;
+
+ /* if no node is eligible as candidate, leave the candidate type as
+ * NOT_FOUND
+ */
+ cands[select].type = BATADV_DAT_CANDIDATE_NOT_FOUND;
+
+ /* iterate over the originator list and find the node with the closest
+ * dat_address which has not been selected yet
+ */
+ for (i = 0; i < hash->size; i++) {
+ head = &hash->table[i];
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(orig_node, head, hash_entry) {
+ /* the dht space is a ring using unsigned addresses */
+ tmp_max = BATADV_DAT_ADDR_MAX - orig_node->dat_addr +
+ ip_key;
+
+ if (!batadv_is_orig_node_eligible(cands, select,
+ tmp_max, max,
+ *last_max, orig_node,
+ max_orig_node))
+ continue;
+
+ if (!kref_get_unless_zero(&orig_node->refcount))
+ continue;
+
+ max = tmp_max;
+ batadv_orig_node_put(max_orig_node);
+ max_orig_node = orig_node;
+ }
+ rcu_read_unlock();
+ }
+ if (max_orig_node) {
+ cands[select].type = BATADV_DAT_CANDIDATE_ORIG;
+ cands[select].orig_node = max_orig_node;
+ batadv_dbg(BATADV_DBG_DAT, bat_priv,
+ "dat_select_candidates() %d: selected %pM addr=%u dist=%u\n",
+ select, max_orig_node->orig, max_orig_node->dat_addr,
+ max);
+ }
+ *last_max = max;
+}
+
+/**
+ * batadv_dat_select_candidates() - select the nodes which the DHT message has
+ * to be sent to
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @ip_dst: ipv4 to look up in the DHT
+ * @vid: VLAN identifier
+ *
+ * An originator O is selected if and only if its DHT_ID value is one of three
+ * closest values (from the LEFT, with wrap around if needed) then the hash
+ * value of the key. ip_dst is the key.
+ *
+ * Return: the candidate array of size BATADV_DAT_CANDIDATE_NUM.
+ */
+static struct batadv_dat_candidate *
+batadv_dat_select_candidates(struct batadv_priv *bat_priv, __be32 ip_dst,
+ unsigned short vid)
+{
+ int select;
+ batadv_dat_addr_t last_max = BATADV_DAT_ADDR_MAX, ip_key;
+ struct batadv_dat_candidate *res;
+ struct batadv_dat_entry dat;
+
+ if (!bat_priv->orig_hash)
+ return NULL;
+
+ res = kmalloc_array(BATADV_DAT_CANDIDATES_NUM, sizeof(*res),
+ GFP_ATOMIC);
+ if (!res)
+ return NULL;
+
+ dat.ip = ip_dst;
+ dat.vid = vid;
+ ip_key = (batadv_dat_addr_t)batadv_hash_dat(&dat,
+ BATADV_DAT_ADDR_MAX);
+
+ batadv_dbg(BATADV_DBG_DAT, bat_priv,
+ "%s(): IP=%pI4 hash(IP)=%u\n", __func__, &ip_dst,
+ ip_key);
+
+ for (select = 0; select < BATADV_DAT_CANDIDATES_NUM; select++)
+ batadv_choose_next_candidate(bat_priv, res, select, ip_key,
+ &last_max);
+
+ return res;
+}
+
+/**
+ * batadv_dat_forward_data() - copy and send payload to the selected candidates
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: payload to send
+ * @ip: the DHT key
+ * @vid: VLAN identifier
+ * @packet_subtype: unicast4addr packet subtype to use
+ *
+ * This function copies the skb with pskb_copy() and is sent as a unicast packet
+ * to each of the selected candidates.
+ *
+ * Return: true if the packet is sent to at least one candidate, false
+ * otherwise.
+ */
+static bool batadv_dat_forward_data(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, __be32 ip,
+ unsigned short vid, int packet_subtype)
+{
+ int i;
+ bool ret = false;
+ int send_status;
+ struct batadv_neigh_node *neigh_node = NULL;
+ struct sk_buff *tmp_skb;
+ struct batadv_dat_candidate *cand;
+
+ cand = batadv_dat_select_candidates(bat_priv, ip, vid);
+ if (!cand)
+ return ret;
+
+ batadv_dbg(BATADV_DBG_DAT, bat_priv, "DHT_SEND for %pI4\n", &ip);
+
+ for (i = 0; i < BATADV_DAT_CANDIDATES_NUM; i++) {
+ if (cand[i].type == BATADV_DAT_CANDIDATE_NOT_FOUND)
+ continue;
+
+ neigh_node = batadv_orig_router_get(cand[i].orig_node,
+ BATADV_IF_DEFAULT);
+ if (!neigh_node)
+ goto free_orig;
+
+ tmp_skb = pskb_copy_for_clone(skb, GFP_ATOMIC);
+ if (!batadv_send_skb_prepare_unicast_4addr(bat_priv, tmp_skb,
+ cand[i].orig_node,
+ packet_subtype)) {
+ kfree_skb(tmp_skb);
+ goto free_neigh;
+ }
+
+ send_status = batadv_send_unicast_skb(tmp_skb, neigh_node);
+ if (send_status == NET_XMIT_SUCCESS) {
+ /* count the sent packet */
+ switch (packet_subtype) {
+ case BATADV_P_DAT_DHT_GET:
+ batadv_inc_counter(bat_priv,
+ BATADV_CNT_DAT_GET_TX);
+ break;
+ case BATADV_P_DAT_DHT_PUT:
+ batadv_inc_counter(bat_priv,
+ BATADV_CNT_DAT_PUT_TX);
+ break;
+ }
+
+ /* packet sent to a candidate: return true */
+ ret = true;
+ }
+free_neigh:
+ batadv_neigh_node_put(neigh_node);
+free_orig:
+ batadv_orig_node_put(cand[i].orig_node);
+ }
+
+ kfree(cand);
+ return ret;
+}
+
+/**
+ * batadv_dat_tvlv_container_update() - update the dat tvlv container after dat
+ * setting change
+ * @bat_priv: the bat priv with all the mesh interface information
+ */
+static void batadv_dat_tvlv_container_update(struct batadv_priv *bat_priv)
+{
+ char dat_mode;
+
+ dat_mode = atomic_read(&bat_priv->distributed_arp_table);
+
+ switch (dat_mode) {
+ case 0:
+ batadv_tvlv_container_unregister(bat_priv, BATADV_TVLV_DAT, 1);
+ break;
+ case 1:
+ batadv_tvlv_container_register(bat_priv, BATADV_TVLV_DAT, 1,
+ NULL, 0);
+ break;
+ }
+}
+
+/**
+ * batadv_dat_status_update() - update the dat tvlv container after dat
+ * setting change
+ * @net_dev: the mesh interface net device
+ */
+void batadv_dat_status_update(struct net_device *net_dev)
+{
+ struct batadv_priv *bat_priv = netdev_priv(net_dev);
+
+ batadv_dat_tvlv_container_update(bat_priv);
+}
+
+/**
+ * batadv_dat_tvlv_ogm_handler_v1() - process incoming dat tvlv container
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @orig: the orig_node of the ogm
+ * @flags: flags indicating the tvlv state (see batadv_tvlv_handler_flags)
+ * @tvlv_value: tvlv buffer containing the gateway data
+ * @tvlv_value_len: tvlv buffer length
+ */
+static void batadv_dat_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig,
+ u8 flags,
+ void *tvlv_value, u16 tvlv_value_len)
+{
+ if (flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND)
+ clear_bit(BATADV_ORIG_CAPA_HAS_DAT, &orig->capabilities);
+ else
+ set_bit(BATADV_ORIG_CAPA_HAS_DAT, &orig->capabilities);
+}
+
+/**
+ * batadv_dat_hash_free() - free the local DAT hash table
+ * @bat_priv: the bat priv with all the mesh interface information
+ */
+static void batadv_dat_hash_free(struct batadv_priv *bat_priv)
+{
+ if (!bat_priv->dat.hash)
+ return;
+
+ __batadv_dat_purge(bat_priv, NULL);
+
+ batadv_hash_destroy(bat_priv->dat.hash);
+
+ bat_priv->dat.hash = NULL;
+}
+
+/**
+ * batadv_dat_init() - initialise the DAT internals
+ * @bat_priv: the bat priv with all the mesh interface information
+ *
+ * Return: 0 in case of success, a negative error code otherwise
+ */
+int batadv_dat_init(struct batadv_priv *bat_priv)
+{
+ if (bat_priv->dat.hash)
+ return 0;
+
+ bat_priv->dat.hash = batadv_hash_new(1024);
+
+ if (!bat_priv->dat.hash)
+ return -ENOMEM;
+
+ INIT_DELAYED_WORK(&bat_priv->dat.work, batadv_dat_purge);
+ batadv_dat_start_timer(bat_priv);
+
+ batadv_tvlv_handler_register(bat_priv, batadv_dat_tvlv_ogm_handler_v1,
+ NULL, NULL, BATADV_TVLV_DAT, 1,
+ BATADV_TVLV_HANDLER_OGM_CIFNOTFND);
+ batadv_dat_tvlv_container_update(bat_priv);
+ return 0;
+}
+
+/**
+ * batadv_dat_free() - free the DAT internals
+ * @bat_priv: the bat priv with all the mesh interface information
+ */
+void batadv_dat_free(struct batadv_priv *bat_priv)
+{
+ batadv_tvlv_container_unregister(bat_priv, BATADV_TVLV_DAT, 1);
+ batadv_tvlv_handler_unregister(bat_priv, BATADV_TVLV_DAT, 1);
+
+ cancel_delayed_work_sync(&bat_priv->dat.work);
+
+ batadv_dat_hash_free(bat_priv);
+}
+
+/**
+ * batadv_dat_cache_dump_entry() - dump one entry of the DAT cache table to a
+ * netlink socket
+ * @msg: buffer for the message
+ * @portid: netlink port
+ * @cb: Control block containing additional options
+ * @dat_entry: entry to dump
+ *
+ * Return: 0 or error code.
+ */
+static int
+batadv_dat_cache_dump_entry(struct sk_buff *msg, u32 portid,
+ struct netlink_callback *cb,
+ struct batadv_dat_entry *dat_entry)
+{
+ int msecs;
+ void *hdr;
+
+ hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq,
+ &batadv_netlink_family, NLM_F_MULTI,
+ BATADV_CMD_GET_DAT_CACHE);
+ if (!hdr)
+ return -ENOBUFS;
+
+ genl_dump_check_consistent(cb, hdr);
+
+ msecs = jiffies_to_msecs(jiffies - dat_entry->last_update);
+
+ if (nla_put_in_addr(msg, BATADV_ATTR_DAT_CACHE_IP4ADDRESS,
+ dat_entry->ip) ||
+ nla_put(msg, BATADV_ATTR_DAT_CACHE_HWADDRESS, ETH_ALEN,
+ dat_entry->mac_addr) ||
+ nla_put_u16(msg, BATADV_ATTR_DAT_CACHE_VID, dat_entry->vid) ||
+ nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS, msecs)) {
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+ }
+
+ genlmsg_end(msg, hdr);
+ return 0;
+}
+
+/**
+ * batadv_dat_cache_dump_bucket() - dump one bucket of the DAT cache table to
+ * a netlink socket
+ * @msg: buffer for the message
+ * @portid: netlink port
+ * @cb: Control block containing additional options
+ * @hash: hash to dump
+ * @bucket: bucket index to dump
+ * @idx_skip: How many entries to skip
+ *
+ * Return: 0 or error code.
+ */
+static int
+batadv_dat_cache_dump_bucket(struct sk_buff *msg, u32 portid,
+ struct netlink_callback *cb,
+ struct batadv_hashtable *hash, unsigned int bucket,
+ int *idx_skip)
+{
+ struct batadv_dat_entry *dat_entry;
+ int idx = 0;
+
+ spin_lock_bh(&hash->list_locks[bucket]);
+ cb->seq = atomic_read(&hash->generation) << 1 | 1;
+
+ hlist_for_each_entry(dat_entry, &hash->table[bucket], hash_entry) {
+ if (idx < *idx_skip)
+ goto skip;
+
+ if (batadv_dat_cache_dump_entry(msg, portid, cb, dat_entry)) {
+ spin_unlock_bh(&hash->list_locks[bucket]);
+ *idx_skip = idx;
+
+ return -EMSGSIZE;
+ }
+
+skip:
+ idx++;
+ }
+ spin_unlock_bh(&hash->list_locks[bucket]);
+
+ return 0;
+}
+
+/**
+ * batadv_dat_cache_dump() - dump DAT cache table to a netlink socket
+ * @msg: buffer for the message
+ * @cb: callback structure containing arguments
+ *
+ * Return: message length.
+ */
+int batadv_dat_cache_dump(struct sk_buff *msg, struct netlink_callback *cb)
+{
+ struct batadv_hard_iface *primary_if = NULL;
+ int portid = NETLINK_CB(cb->skb).portid;
+ struct net_device *mesh_iface;
+ struct batadv_hashtable *hash;
+ struct batadv_priv *bat_priv;
+ int bucket = cb->args[0];
+ int idx = cb->args[1];
+ int ret = 0;
+
+ mesh_iface = batadv_netlink_get_meshif(cb);
+ if (IS_ERR(mesh_iface))
+ return PTR_ERR(mesh_iface);
+
+ bat_priv = netdev_priv(mesh_iface);
+ hash = bat_priv->dat.hash;
+
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ while (bucket < hash->size) {
+ if (batadv_dat_cache_dump_bucket(msg, portid, cb, hash, bucket,
+ &idx))
+ break;
+
+ bucket++;
+ idx = 0;
+ }
+
+ cb->args[0] = bucket;
+ cb->args[1] = idx;
+
+ ret = msg->len;
+
+out:
+ batadv_hardif_put(primary_if);
+
+ dev_put(mesh_iface);
+
+ return ret;
+}
+
+/**
+ * batadv_arp_get_type() - parse an ARP packet and gets the type
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: packet to analyse
+ * @hdr_size: size of the possible header before the ARP packet in the skb
+ *
+ * Return: the ARP type if the skb contains a valid ARP packet, 0 otherwise.
+ */
+static u16 batadv_arp_get_type(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, int hdr_size)
+{
+ struct arphdr *arphdr;
+ struct ethhdr *ethhdr;
+ __be32 ip_src, ip_dst;
+ u8 *hw_src, *hw_dst;
+ u16 type = 0;
+
+ /* pull the ethernet header */
+ if (unlikely(!pskb_may_pull(skb, hdr_size + ETH_HLEN)))
+ goto out;
+
+ ethhdr = (struct ethhdr *)(skb->data + hdr_size);
+
+ if (ethhdr->h_proto != htons(ETH_P_ARP))
+ goto out;
+
+ /* pull the ARP payload */
+ if (unlikely(!pskb_may_pull(skb, hdr_size + ETH_HLEN +
+ arp_hdr_len(skb->dev))))
+ goto out;
+
+ arphdr = (struct arphdr *)(skb->data + hdr_size + ETH_HLEN);
+
+ /* check whether the ARP packet carries a valid IP information */
+ if (arphdr->ar_hrd != htons(ARPHRD_ETHER))
+ goto out;
+
+ if (arphdr->ar_pro != htons(ETH_P_IP))
+ goto out;
+
+ if (arphdr->ar_hln != ETH_ALEN)
+ goto out;
+
+ if (arphdr->ar_pln != 4)
+ goto out;
+
+ /* Check for bad reply/request. If the ARP message is not sane, DAT
+ * will simply ignore it
+ */
+ ip_src = batadv_arp_ip_src(skb, hdr_size);
+ ip_dst = batadv_arp_ip_dst(skb, hdr_size);
+ if (ipv4_is_loopback(ip_src) || ipv4_is_multicast(ip_src) ||
+ ipv4_is_loopback(ip_dst) || ipv4_is_multicast(ip_dst) ||
+ ipv4_is_zeronet(ip_src) || ipv4_is_lbcast(ip_src) ||
+ ipv4_is_zeronet(ip_dst) || ipv4_is_lbcast(ip_dst))
+ goto out;
+
+ hw_src = batadv_arp_hw_src(skb, hdr_size);
+ if (is_zero_ether_addr(hw_src) || is_multicast_ether_addr(hw_src))
+ goto out;
+
+ /* don't care about the destination MAC address in ARP requests */
+ if (arphdr->ar_op != htons(ARPOP_REQUEST)) {
+ hw_dst = batadv_arp_hw_dst(skb, hdr_size);
+ if (is_zero_ether_addr(hw_dst) ||
+ is_multicast_ether_addr(hw_dst))
+ goto out;
+ }
+
+ type = ntohs(arphdr->ar_op);
+out:
+ return type;
+}
+
+/**
+ * batadv_dat_get_vid() - extract the VLAN identifier from skb if any
+ * @skb: the buffer containing the packet to extract the VID from
+ * @hdr_size: the size of the batman-adv header encapsulating the packet
+ *
+ * Return: If the packet embedded in the skb is vlan tagged this function
+ * returns the VID with the BATADV_VLAN_HAS_TAG flag. Otherwise BATADV_NO_FLAGS
+ * is returned.
+ */
+static unsigned short batadv_dat_get_vid(struct sk_buff *skb, int *hdr_size)
+{
+ unsigned short vid;
+
+ vid = batadv_get_vid(skb, *hdr_size);
+
+ /* ARP parsing functions jump forward of hdr_size + ETH_HLEN.
+ * If the header contained in the packet is a VLAN one (which is longer)
+ * hdr_size is updated so that the functions will still skip the
+ * correct amount of bytes.
+ */
+ if (vid & BATADV_VLAN_HAS_TAG)
+ *hdr_size += VLAN_HLEN;
+
+ return vid;
+}
+
+/**
+ * batadv_dat_arp_create_reply() - create an ARP Reply
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @ip_src: ARP sender IP
+ * @ip_dst: ARP target IP
+ * @hw_src: Ethernet source and ARP sender MAC
+ * @hw_dst: Ethernet destination and ARP target MAC
+ * @vid: VLAN identifier (optional, set to zero otherwise)
+ *
+ * Creates an ARP Reply from the given values, optionally encapsulated in a
+ * VLAN header.
+ *
+ * Return: An skb containing an ARP Reply.
+ */
+static struct sk_buff *
+batadv_dat_arp_create_reply(struct batadv_priv *bat_priv, __be32 ip_src,
+ __be32 ip_dst, u8 *hw_src, u8 *hw_dst,
+ unsigned short vid)
+{
+ struct sk_buff *skb;
+
+ skb = arp_create(ARPOP_REPLY, ETH_P_ARP, ip_dst, bat_priv->mesh_iface,
+ ip_src, hw_dst, hw_src, hw_dst);
+ if (!skb)
+ return NULL;
+
+ skb_reset_mac_header(skb);
+
+ if (vid & BATADV_VLAN_HAS_TAG)
+ skb = vlan_insert_tag(skb, htons(ETH_P_8021Q),
+ vid & VLAN_VID_MASK);
+
+ return skb;
+}
+
+/**
+ * batadv_dat_snoop_outgoing_arp_request() - snoop the ARP request and try to
+ * answer using DAT
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: packet to check
+ *
+ * Return: true if the message has been sent to the dht candidates, false
+ * otherwise. In case of a positive return value the message has to be enqueued
+ * to permit the fallback.
+ */
+bool batadv_dat_snoop_outgoing_arp_request(struct batadv_priv *bat_priv,
+ struct sk_buff *skb)
+{
+ u16 type = 0;
+ __be32 ip_dst, ip_src;
+ u8 *hw_src;
+ bool ret = false;
+ struct batadv_dat_entry *dat_entry = NULL;
+ struct sk_buff *skb_new;
+ struct net_device *mesh_iface = bat_priv->mesh_iface;
+ int hdr_size = 0;
+ unsigned short vid;
+
+ if (!atomic_read(&bat_priv->distributed_arp_table))
+ goto out;
+
+ vid = batadv_dat_get_vid(skb, &hdr_size);
+
+ type = batadv_arp_get_type(bat_priv, skb, hdr_size);
+ /* If the node gets an ARP_REQUEST it has to send a DHT_GET unicast
+ * message to the selected DHT candidates
+ */
+ if (type != ARPOP_REQUEST)
+ goto out;
+
+ batadv_dbg_arp(bat_priv, skb, hdr_size, "Parsing outgoing ARP REQUEST");
+
+ ip_src = batadv_arp_ip_src(skb, hdr_size);
+ hw_src = batadv_arp_hw_src(skb, hdr_size);
+ ip_dst = batadv_arp_ip_dst(skb, hdr_size);
+
+ batadv_dat_entry_add(bat_priv, ip_src, hw_src, vid);
+
+ dat_entry = batadv_dat_entry_hash_find(bat_priv, ip_dst, vid);
+ if (dat_entry) {
+ /* If the ARP request is destined for a local client the local
+ * client will answer itself. DAT would only generate a
+ * duplicate packet.
+ *
+ * Moreover, if the mesh-interface is enslaved into a bridge, an
+ * additional DAT answer may trigger kernel warnings about
+ * a packet coming from the wrong port.
+ */
+ if (batadv_is_my_client(bat_priv, dat_entry->mac_addr, vid)) {
+ ret = true;
+ goto out;
+ }
+
+ /* If BLA is enabled, only send ARP replies if we have claimed
+ * the destination for the ARP request or if no one else of
+ * the backbone gws belonging to our backbone has claimed the
+ * destination.
+ */
+ if (!batadv_bla_check_claim(bat_priv,
+ dat_entry->mac_addr, vid)) {
+ batadv_dbg(BATADV_DBG_DAT, bat_priv,
+ "Device %pM claimed by another backbone gw. Don't send ARP reply!",
+ dat_entry->mac_addr);
+ ret = true;
+ goto out;
+ }
+
+ skb_new = batadv_dat_arp_create_reply(bat_priv, ip_dst, ip_src,
+ dat_entry->mac_addr,
+ hw_src, vid);
+ if (!skb_new)
+ goto out;
+
+ skb_new->protocol = eth_type_trans(skb_new, mesh_iface);
+
+ batadv_inc_counter(bat_priv, BATADV_CNT_RX);
+ batadv_add_counter(bat_priv, BATADV_CNT_RX_BYTES,
+ skb->len + ETH_HLEN + hdr_size);
+
+ netif_rx(skb_new);
+ batadv_dbg(BATADV_DBG_DAT, bat_priv, "ARP request replied locally\n");
+ ret = true;
+ } else {
+ /* Send the request to the DHT */
+ ret = batadv_dat_forward_data(bat_priv, skb, ip_dst, vid,
+ BATADV_P_DAT_DHT_GET);
+ }
+out:
+ batadv_dat_entry_put(dat_entry);
+ return ret;
+}
+
+/**
+ * batadv_dat_snoop_incoming_arp_request() - snoop the ARP request and try to
+ * answer using the local DAT storage
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: packet to check
+ * @hdr_size: size of the encapsulation header
+ *
+ * Return: true if the request has been answered, false otherwise.
+ */
+bool batadv_dat_snoop_incoming_arp_request(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, int hdr_size)
+{
+ u16 type;
+ __be32 ip_src, ip_dst;
+ u8 *hw_src;
+ struct sk_buff *skb_new;
+ struct batadv_dat_entry *dat_entry = NULL;
+ bool ret = false;
+ unsigned short vid;
+ int err;
+
+ if (!atomic_read(&bat_priv->distributed_arp_table))
+ goto out;
+
+ vid = batadv_dat_get_vid(skb, &hdr_size);
+
+ type = batadv_arp_get_type(bat_priv, skb, hdr_size);
+ if (type != ARPOP_REQUEST)
+ goto out;
+
+ hw_src = batadv_arp_hw_src(skb, hdr_size);
+ ip_src = batadv_arp_ip_src(skb, hdr_size);
+ ip_dst = batadv_arp_ip_dst(skb, hdr_size);
+
+ batadv_dbg_arp(bat_priv, skb, hdr_size, "Parsing incoming ARP REQUEST");
+
+ batadv_dat_entry_add(bat_priv, ip_src, hw_src, vid);
+
+ dat_entry = batadv_dat_entry_hash_find(bat_priv, ip_dst, vid);
+ if (!dat_entry)
+ goto out;
+
+ skb_new = batadv_dat_arp_create_reply(bat_priv, ip_dst, ip_src,
+ dat_entry->mac_addr, hw_src, vid);
+ if (!skb_new)
+ goto out;
+
+ /* To preserve backwards compatibility, the node has choose the outgoing
+ * format based on the incoming request packet type. The assumption is
+ * that a node not using the 4addr packet format doesn't support it.
+ */
+ if (hdr_size == sizeof(struct batadv_unicast_4addr_packet))
+ err = batadv_send_skb_via_tt_4addr(bat_priv, skb_new,
+ BATADV_P_DAT_CACHE_REPLY,
+ NULL, vid);
+ else
+ err = batadv_send_skb_via_tt(bat_priv, skb_new, NULL, vid);
+
+ if (err != NET_XMIT_DROP) {
+ batadv_inc_counter(bat_priv, BATADV_CNT_DAT_CACHED_REPLY_TX);
+ ret = true;
+ }
+out:
+ batadv_dat_entry_put(dat_entry);
+ if (ret)
+ kfree_skb(skb);
+ return ret;
+}
+
+/**
+ * batadv_dat_snoop_outgoing_arp_reply() - snoop the ARP reply and fill the DHT
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: packet to check
+ */
+void batadv_dat_snoop_outgoing_arp_reply(struct batadv_priv *bat_priv,
+ struct sk_buff *skb)
+{
+ u16 type;
+ __be32 ip_src, ip_dst;
+ u8 *hw_src, *hw_dst;
+ int hdr_size = 0;
+ unsigned short vid;
+
+ if (!atomic_read(&bat_priv->distributed_arp_table))
+ return;
+
+ vid = batadv_dat_get_vid(skb, &hdr_size);
+
+ type = batadv_arp_get_type(bat_priv, skb, hdr_size);
+ if (type != ARPOP_REPLY)
+ return;
+
+ batadv_dbg_arp(bat_priv, skb, hdr_size, "Parsing outgoing ARP REPLY");
+
+ hw_src = batadv_arp_hw_src(skb, hdr_size);
+ ip_src = batadv_arp_ip_src(skb, hdr_size);
+ hw_dst = batadv_arp_hw_dst(skb, hdr_size);
+ ip_dst = batadv_arp_ip_dst(skb, hdr_size);
+
+ batadv_dat_entry_add(bat_priv, ip_src, hw_src, vid);
+ batadv_dat_entry_add(bat_priv, ip_dst, hw_dst, vid);
+
+ /* Send the ARP reply to the candidates for both the IP addresses that
+ * the node obtained from the ARP reply
+ */
+ batadv_dat_forward_data(bat_priv, skb, ip_src, vid,
+ BATADV_P_DAT_DHT_PUT);
+ batadv_dat_forward_data(bat_priv, skb, ip_dst, vid,
+ BATADV_P_DAT_DHT_PUT);
+}
+
+/**
+ * batadv_dat_snoop_incoming_arp_reply() - snoop the ARP reply and fill the
+ * local DAT storage only
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: packet to check
+ * @hdr_size: size of the encapsulation header
+ *
+ * Return: true if the packet was snooped and consumed by DAT. False if the
+ * packet has to be delivered to the interface
+ */
+bool batadv_dat_snoop_incoming_arp_reply(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, int hdr_size)
+{
+ struct batadv_dat_entry *dat_entry = NULL;
+ u16 type;
+ __be32 ip_src, ip_dst;
+ u8 *hw_src, *hw_dst;
+ bool dropped = false;
+ unsigned short vid;
+
+ if (!atomic_read(&bat_priv->distributed_arp_table))
+ goto out;
+
+ vid = batadv_dat_get_vid(skb, &hdr_size);
+
+ type = batadv_arp_get_type(bat_priv, skb, hdr_size);
+ if (type != ARPOP_REPLY)
+ goto out;
+
+ batadv_dbg_arp(bat_priv, skb, hdr_size, "Parsing incoming ARP REPLY");
+
+ hw_src = batadv_arp_hw_src(skb, hdr_size);
+ ip_src = batadv_arp_ip_src(skb, hdr_size);
+ hw_dst = batadv_arp_hw_dst(skb, hdr_size);
+ ip_dst = batadv_arp_ip_dst(skb, hdr_size);
+
+ /* If ip_dst is already in cache and has the right mac address,
+ * drop this frame if this ARP reply is destined for us because it's
+ * most probably an ARP reply generated by another node of the DHT.
+ * We have most probably received already a reply earlier. Delivering
+ * this frame would lead to doubled receive of an ARP reply.
+ */
+ dat_entry = batadv_dat_entry_hash_find(bat_priv, ip_src, vid);
+ if (dat_entry && batadv_compare_eth(hw_src, dat_entry->mac_addr)) {
+ batadv_dbg(BATADV_DBG_DAT, bat_priv, "Doubled ARP reply removed: ARP MSG = [src: %pM-%pI4 dst: %pM-%pI4]; dat_entry: %pM-%pI4\n",
+ hw_src, &ip_src, hw_dst, &ip_dst,
+ dat_entry->mac_addr, &dat_entry->ip);
+ dropped = true;
+ }
+
+ /* Update our internal cache with both the IP addresses the node got
+ * within the ARP reply
+ */
+ batadv_dat_entry_add(bat_priv, ip_src, hw_src, vid);
+ batadv_dat_entry_add(bat_priv, ip_dst, hw_dst, vid);
+
+ if (dropped)
+ goto out;
+
+ /* If BLA is enabled, only forward ARP replies if we have claimed the
+ * source of the ARP reply or if no one else of the same backbone has
+ * already claimed that client. This prevents that different gateways
+ * to the same backbone all forward the ARP reply leading to multiple
+ * replies in the backbone.
+ */
+ if (!batadv_bla_check_claim(bat_priv, hw_src, vid)) {
+ batadv_dbg(BATADV_DBG_DAT, bat_priv,
+ "Device %pM claimed by another backbone gw. Drop ARP reply.\n",
+ hw_src);
+ dropped = true;
+ goto out;
+ }
+
+ /* if this REPLY is directed to a client of mine, let's deliver the
+ * packet to the interface
+ */
+ dropped = !batadv_is_my_client(bat_priv, hw_dst, vid);
+
+ /* if this REPLY is sent on behalf of a client of mine, let's drop the
+ * packet because the client will reply by itself
+ */
+ dropped |= batadv_is_my_client(bat_priv, hw_src, vid);
+out:
+ if (dropped)
+ kfree_skb(skb);
+ batadv_dat_entry_put(dat_entry);
+ /* if dropped == false -> deliver to the interface */
+ return dropped;
+}
+
+/**
+ * batadv_dat_check_dhcp_ipudp() - check skb for IP+UDP headers valid for DHCP
+ * @skb: the packet to check
+ * @ip_src: a buffer to store the IPv4 source address in
+ *
+ * Checks whether the given skb has an IP and UDP header valid for a DHCP
+ * message from a DHCP server. And if so, stores the IPv4 source address in
+ * the provided buffer.
+ *
+ * Return: True if valid, false otherwise.
+ */
+static bool
+batadv_dat_check_dhcp_ipudp(struct sk_buff *skb, __be32 *ip_src)
+{
+ unsigned int offset = skb_network_offset(skb);
+ struct udphdr *udphdr, _udphdr;
+ struct iphdr *iphdr, _iphdr;
+
+ iphdr = skb_header_pointer(skb, offset, sizeof(_iphdr), &_iphdr);
+ if (!iphdr || iphdr->version != 4 || iphdr->ihl * 4 < sizeof(_iphdr))
+ return false;
+
+ if (iphdr->protocol != IPPROTO_UDP)
+ return false;
+
+ offset += iphdr->ihl * 4;
+ skb_set_transport_header(skb, offset);
+
+ udphdr = skb_header_pointer(skb, offset, sizeof(_udphdr), &_udphdr);
+ if (!udphdr || udphdr->source != htons(67))
+ return false;
+
+ *ip_src = get_unaligned(&iphdr->saddr);
+
+ return true;
+}
+
+/**
+ * batadv_dat_check_dhcp() - examine packet for valid DHCP message
+ * @skb: the packet to check
+ * @proto: ethernet protocol hint (behind a potential vlan)
+ * @ip_src: a buffer to store the IPv4 source address in
+ *
+ * Checks whether the given skb is a valid DHCP packet. And if so, stores the
+ * IPv4 source address in the provided buffer.
+ *
+ * Caller needs to ensure that the skb network header is set correctly.
+ *
+ * Return: If skb is a valid DHCP packet, then returns its op code
+ * (e.g. BOOTREPLY vs. BOOTREQUEST). Otherwise returns -EINVAL.
+ */
+static int
+batadv_dat_check_dhcp(struct sk_buff *skb, __be16 proto, __be32 *ip_src)
+{
+ __be32 *magic, _magic;
+ unsigned int offset;
+ struct {
+ __u8 op;
+ __u8 htype;
+ __u8 hlen;
+ __u8 hops;
+ } *dhcp_h, _dhcp_h;
+
+ if (proto != htons(ETH_P_IP))
+ return -EINVAL;
+
+ if (!batadv_dat_check_dhcp_ipudp(skb, ip_src))
+ return -EINVAL;
+
+ offset = skb_transport_offset(skb) + sizeof(struct udphdr);
+ if (skb->len < offset + sizeof(struct batadv_dhcp_packet))
+ return -EINVAL;
+
+ dhcp_h = skb_header_pointer(skb, offset, sizeof(_dhcp_h), &_dhcp_h);
+ if (!dhcp_h || dhcp_h->htype != BATADV_HTYPE_ETHERNET ||
+ dhcp_h->hlen != ETH_ALEN)
+ return -EINVAL;
+
+ offset += offsetof(struct batadv_dhcp_packet, magic);
+
+ magic = skb_header_pointer(skb, offset, sizeof(_magic), &_magic);
+ if (!magic || get_unaligned(magic) != htonl(BATADV_DHCP_MAGIC))
+ return -EINVAL;
+
+ return dhcp_h->op;
+}
+
+/**
+ * batadv_dat_get_dhcp_message_type() - get message type of a DHCP packet
+ * @skb: the DHCP packet to parse
+ *
+ * Iterates over the DHCP options of the given DHCP packet to find a
+ * DHCP Message Type option and parse it.
+ *
+ * Caller needs to ensure that the given skb is a valid DHCP packet and
+ * that the skb transport header is set correctly.
+ *
+ * Return: The found DHCP message type value, if found. -EINVAL otherwise.
+ */
+static int batadv_dat_get_dhcp_message_type(struct sk_buff *skb)
+{
+ unsigned int offset = skb_transport_offset(skb) + sizeof(struct udphdr);
+ u8 *type, _type;
+ struct {
+ u8 type;
+ u8 len;
+ } *tl, _tl;
+
+ offset += sizeof(struct batadv_dhcp_packet);
+
+ while ((tl = skb_header_pointer(skb, offset, sizeof(_tl), &_tl))) {
+ if (tl->type == BATADV_DHCP_OPT_MSG_TYPE)
+ break;
+
+ if (tl->type == BATADV_DHCP_OPT_END)
+ break;
+
+ if (tl->type == BATADV_DHCP_OPT_PAD)
+ offset++;
+ else
+ offset += tl->len + sizeof(_tl);
+ }
+
+ /* Option Overload Code not supported */
+ if (!tl || tl->type != BATADV_DHCP_OPT_MSG_TYPE ||
+ tl->len != sizeof(_type))
+ return -EINVAL;
+
+ offset += sizeof(_tl);
+
+ type = skb_header_pointer(skb, offset, sizeof(_type), &_type);
+ if (!type)
+ return -EINVAL;
+
+ return *type;
+}
+
+/**
+ * batadv_dat_dhcp_get_yiaddr() - get yiaddr from a DHCP packet
+ * @skb: the DHCP packet to parse
+ * @buf: a buffer to store the yiaddr in
+ *
+ * Caller needs to ensure that the given skb is a valid DHCP packet and
+ * that the skb transport header is set correctly.
+ *
+ * Return: True on success, false otherwise.
+ */
+static bool batadv_dat_dhcp_get_yiaddr(struct sk_buff *skb, __be32 *buf)
+{
+ unsigned int offset = skb_transport_offset(skb) + sizeof(struct udphdr);
+ __be32 *yiaddr;
+
+ offset += offsetof(struct batadv_dhcp_packet, yiaddr);
+ yiaddr = skb_header_pointer(skb, offset, BATADV_DHCP_YIADDR_LEN, buf);
+
+ if (!yiaddr)
+ return false;
+
+ if (yiaddr != buf)
+ *buf = get_unaligned(yiaddr);
+
+ return true;
+}
+
+/**
+ * batadv_dat_get_dhcp_chaddr() - get chaddr from a DHCP packet
+ * @skb: the DHCP packet to parse
+ * @buf: a buffer to store the chaddr in
+ *
+ * Caller needs to ensure that the given skb is a valid DHCP packet and
+ * that the skb transport header is set correctly.
+ *
+ * Return: True on success, false otherwise
+ */
+static bool batadv_dat_get_dhcp_chaddr(struct sk_buff *skb, u8 *buf)
+{
+ unsigned int offset = skb_transport_offset(skb) + sizeof(struct udphdr);
+ u8 *chaddr;
+
+ offset += offsetof(struct batadv_dhcp_packet, chaddr);
+ chaddr = skb_header_pointer(skb, offset, BATADV_DHCP_CHADDR_LEN, buf);
+
+ if (!chaddr)
+ return false;
+
+ if (chaddr != buf)
+ memcpy(buf, chaddr, BATADV_DHCP_CHADDR_LEN);
+
+ return true;
+}
+
+/**
+ * batadv_dat_put_dhcp() - puts addresses from a DHCP packet into the DHT and
+ * DAT cache
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @chaddr: the DHCP client MAC address
+ * @yiaddr: the DHCP client IP address
+ * @hw_dst: the DHCP server MAC address
+ * @ip_dst: the DHCP server IP address
+ * @vid: VLAN identifier
+ *
+ * Adds given MAC/IP pairs to the local DAT cache and propagates them further
+ * into the DHT.
+ *
+ * For the DHT propagation, client MAC + IP will appear as the ARP Reply
+ * transmitter (and hw_dst/ip_dst as the target).
+ */
+static void batadv_dat_put_dhcp(struct batadv_priv *bat_priv, u8 *chaddr,
+ __be32 yiaddr, u8 *hw_dst, __be32 ip_dst,
+ unsigned short vid)
+{
+ struct sk_buff *skb;
+
+ skb = batadv_dat_arp_create_reply(bat_priv, yiaddr, ip_dst, chaddr,
+ hw_dst, vid);
+ if (!skb)
+ return;
+
+ skb_set_network_header(skb, ETH_HLEN);
+
+ batadv_dat_entry_add(bat_priv, yiaddr, chaddr, vid);
+ batadv_dat_entry_add(bat_priv, ip_dst, hw_dst, vid);
+
+ batadv_dat_forward_data(bat_priv, skb, yiaddr, vid,
+ BATADV_P_DAT_DHT_PUT);
+ batadv_dat_forward_data(bat_priv, skb, ip_dst, vid,
+ BATADV_P_DAT_DHT_PUT);
+
+ consume_skb(skb);
+
+ batadv_dbg(BATADV_DBG_DAT, bat_priv,
+ "Snooped from outgoing DHCPACK (server address): %pI4, %pM (vid: %i)\n",
+ &ip_dst, hw_dst, batadv_print_vid(vid));
+ batadv_dbg(BATADV_DBG_DAT, bat_priv,
+ "Snooped from outgoing DHCPACK (client address): %pI4, %pM (vid: %i)\n",
+ &yiaddr, chaddr, batadv_print_vid(vid));
+}
+
+/**
+ * batadv_dat_check_dhcp_ack() - examine packet for valid DHCP message
+ * @skb: the packet to check
+ * @proto: ethernet protocol hint (behind a potential vlan)
+ * @ip_src: a buffer to store the IPv4 source address in
+ * @chaddr: a buffer to store the DHCP Client Hardware Address in
+ * @yiaddr: a buffer to store the DHCP Your IP Address in
+ *
+ * Checks whether the given skb is a valid DHCPACK. And if so, stores the
+ * IPv4 server source address (ip_src), client MAC address (chaddr) and client
+ * IPv4 address (yiaddr) in the provided buffers.
+ *
+ * Caller needs to ensure that the skb network header is set correctly.
+ *
+ * Return: True if the skb is a valid DHCPACK. False otherwise.
+ */
+static bool
+batadv_dat_check_dhcp_ack(struct sk_buff *skb, __be16 proto, __be32 *ip_src,
+ u8 *chaddr, __be32 *yiaddr)
+{
+ int type;
+
+ type = batadv_dat_check_dhcp(skb, proto, ip_src);
+ if (type != BATADV_BOOTREPLY)
+ return false;
+
+ type = batadv_dat_get_dhcp_message_type(skb);
+ if (type != BATADV_DHCPACK)
+ return false;
+
+ if (!batadv_dat_dhcp_get_yiaddr(skb, yiaddr))
+ return false;
+
+ if (!batadv_dat_get_dhcp_chaddr(skb, chaddr))
+ return false;
+
+ return true;
+}
+
+/**
+ * batadv_dat_snoop_outgoing_dhcp_ack() - snoop DHCPACK and fill DAT with it
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: the packet to snoop
+ * @proto: ethernet protocol hint (behind a potential vlan)
+ * @vid: VLAN identifier
+ *
+ * This function first checks whether the given skb is a valid DHCPACK. If
+ * so then its source MAC and IP as well as its DHCP Client Hardware Address
+ * field and DHCP Your IP Address field are added to the local DAT cache and
+ * propagated into the DHT.
+ *
+ * Caller needs to ensure that the skb mac and network headers are set
+ * correctly.
+ */
+void batadv_dat_snoop_outgoing_dhcp_ack(struct batadv_priv *bat_priv,
+ struct sk_buff *skb,
+ __be16 proto,
+ unsigned short vid)
+{
+ u8 chaddr[BATADV_DHCP_CHADDR_LEN];
+ __be32 ip_src, yiaddr;
+
+ if (!atomic_read(&bat_priv->distributed_arp_table))
+ return;
+
+ if (!batadv_dat_check_dhcp_ack(skb, proto, &ip_src, chaddr, &yiaddr))
+ return;
+
+ batadv_dat_put_dhcp(bat_priv, chaddr, yiaddr, eth_hdr(skb)->h_source,
+ ip_src, vid);
+}
+
+/**
+ * batadv_dat_snoop_incoming_dhcp_ack() - snoop DHCPACK and fill DAT cache
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: the packet to snoop
+ * @hdr_size: header size, up to the tail of the batman-adv header
+ *
+ * This function first checks whether the given skb is a valid DHCPACK. If
+ * so then its source MAC and IP as well as its DHCP Client Hardware Address
+ * field and DHCP Your IP Address field are added to the local DAT cache.
+ */
+void batadv_dat_snoop_incoming_dhcp_ack(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, int hdr_size)
+{
+ u8 chaddr[BATADV_DHCP_CHADDR_LEN];
+ struct ethhdr *ethhdr;
+ __be32 ip_src, yiaddr;
+ unsigned short vid;
+ __be16 proto;
+ u8 *hw_src;
+
+ if (!atomic_read(&bat_priv->distributed_arp_table))
+ return;
+
+ if (unlikely(!pskb_may_pull(skb, hdr_size + ETH_HLEN)))
+ return;
+
+ ethhdr = (struct ethhdr *)(skb->data + hdr_size);
+ skb_set_network_header(skb, hdr_size + ETH_HLEN);
+ proto = ethhdr->h_proto;
+
+ if (!batadv_dat_check_dhcp_ack(skb, proto, &ip_src, chaddr, &yiaddr))
+ return;
+
+ hw_src = ethhdr->h_source;
+ vid = batadv_dat_get_vid(skb, &hdr_size);
+
+ batadv_dat_entry_add(bat_priv, yiaddr, chaddr, vid);
+ batadv_dat_entry_add(bat_priv, ip_src, hw_src, vid);
+
+ batadv_dbg(BATADV_DBG_DAT, bat_priv,
+ "Snooped from incoming DHCPACK (server address): %pI4, %pM (vid: %i)\n",
+ &ip_src, hw_src, batadv_print_vid(vid));
+ batadv_dbg(BATADV_DBG_DAT, bat_priv,
+ "Snooped from incoming DHCPACK (client address): %pI4, %pM (vid: %i)\n",
+ &yiaddr, chaddr, batadv_print_vid(vid));
+}
+
+/**
+ * batadv_dat_drop_broadcast_packet() - check if an ARP request has to be
+ * dropped (because the node has already obtained the reply via DAT) or not
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @forw_packet: the broadcast packet
+ *
+ * Return: true if the node can drop the packet, false otherwise.
+ */
+bool batadv_dat_drop_broadcast_packet(struct batadv_priv *bat_priv,
+ struct batadv_forw_packet *forw_packet)
+{
+ u16 type;
+ __be32 ip_dst;
+ struct batadv_dat_entry *dat_entry = NULL;
+ bool ret = false;
+ int hdr_size = sizeof(struct batadv_bcast_packet);
+ unsigned short vid;
+
+ if (!atomic_read(&bat_priv->distributed_arp_table))
+ goto out;
+
+ /* If this packet is an ARP_REQUEST and the node already has the
+ * information that it is going to ask, then the packet can be dropped
+ */
+ if (batadv_forw_packet_is_rebroadcast(forw_packet))
+ goto out;
+
+ vid = batadv_dat_get_vid(forw_packet->skb, &hdr_size);
+
+ type = batadv_arp_get_type(bat_priv, forw_packet->skb, hdr_size);
+ if (type != ARPOP_REQUEST)
+ goto out;
+
+ ip_dst = batadv_arp_ip_dst(forw_packet->skb, hdr_size);
+ dat_entry = batadv_dat_entry_hash_find(bat_priv, ip_dst, vid);
+ /* check if the node already got this entry */
+ if (!dat_entry) {
+ batadv_dbg(BATADV_DBG_DAT, bat_priv,
+ "ARP Request for %pI4: fallback\n", &ip_dst);
+ goto out;
+ }
+
+ batadv_dbg(BATADV_DBG_DAT, bat_priv,
+ "ARP Request for %pI4: fallback prevented\n", &ip_dst);
+ ret = true;
+
+out:
+ batadv_dat_entry_put(dat_entry);
+ return ret;
+}
diff --git a/net/batman-adv/distributed-arp-table.h b/net/batman-adv/distributed-arp-table.h
new file mode 100644
index 000000000000..e7b75e82eb1d
--- /dev/null
+++ b/net/batman-adv/distributed-arp-table.h
@@ -0,0 +1,186 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) B.A.T.M.A.N. contributors:
+ *
+ * Antonio Quartulli
+ */
+
+#ifndef _NET_BATMAN_ADV_DISTRIBUTED_ARP_TABLE_H_
+#define _NET_BATMAN_ADV_DISTRIBUTED_ARP_TABLE_H_
+
+#include "main.h"
+
+#include <linux/compiler.h>
+#include <linux/netdevice.h>
+#include <linux/netlink.h>
+#include <linux/skbuff.h>
+#include <linux/types.h>
+#include <uapi/linux/batadv_packet.h>
+
+#include "originator.h"
+
+#ifdef CONFIG_BATMAN_ADV_DAT
+
+/* BATADV_DAT_ADDR_MAX - maximum address value in the DHT space */
+#define BATADV_DAT_ADDR_MAX ((batadv_dat_addr_t)~(batadv_dat_addr_t)0)
+
+void batadv_dat_status_update(struct net_device *net_dev);
+bool batadv_dat_snoop_outgoing_arp_request(struct batadv_priv *bat_priv,
+ struct sk_buff *skb);
+bool batadv_dat_snoop_incoming_arp_request(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, int hdr_size);
+void batadv_dat_snoop_outgoing_arp_reply(struct batadv_priv *bat_priv,
+ struct sk_buff *skb);
+bool batadv_dat_snoop_incoming_arp_reply(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, int hdr_size);
+void batadv_dat_snoop_outgoing_dhcp_ack(struct batadv_priv *bat_priv,
+ struct sk_buff *skb,
+ __be16 proto,
+ unsigned short vid);
+void batadv_dat_snoop_incoming_dhcp_ack(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, int hdr_size);
+bool batadv_dat_drop_broadcast_packet(struct batadv_priv *bat_priv,
+ struct batadv_forw_packet *forw_packet);
+
+/**
+ * batadv_dat_init_orig_node_addr() - assign a DAT address to the orig_node
+ * @orig_node: the node to assign the DAT address to
+ */
+static inline void
+batadv_dat_init_orig_node_addr(struct batadv_orig_node *orig_node)
+{
+ u32 addr;
+
+ addr = batadv_choose_orig(orig_node->orig, BATADV_DAT_ADDR_MAX);
+ orig_node->dat_addr = (batadv_dat_addr_t)addr;
+}
+
+/**
+ * batadv_dat_init_own_addr() - assign a DAT address to the node itself
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @primary_if: a pointer to the primary interface
+ */
+static inline void
+batadv_dat_init_own_addr(struct batadv_priv *bat_priv,
+ struct batadv_hard_iface *primary_if)
+{
+ u32 addr;
+
+ addr = batadv_choose_orig(primary_if->net_dev->dev_addr,
+ BATADV_DAT_ADDR_MAX);
+
+ bat_priv->dat.addr = (batadv_dat_addr_t)addr;
+}
+
+int batadv_dat_init(struct batadv_priv *bat_priv);
+void batadv_dat_free(struct batadv_priv *bat_priv);
+int batadv_dat_cache_dump(struct sk_buff *msg, struct netlink_callback *cb);
+
+/**
+ * batadv_dat_inc_counter() - increment the correct DAT packet counter
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @subtype: the 4addr subtype of the packet to be counted
+ *
+ * Updates the ethtool statistics for the received packet if it is a DAT subtype
+ */
+static inline void batadv_dat_inc_counter(struct batadv_priv *bat_priv,
+ u8 subtype)
+{
+ switch (subtype) {
+ case BATADV_P_DAT_DHT_GET:
+ batadv_inc_counter(bat_priv,
+ BATADV_CNT_DAT_GET_RX);
+ break;
+ case BATADV_P_DAT_DHT_PUT:
+ batadv_inc_counter(bat_priv,
+ BATADV_CNT_DAT_PUT_RX);
+ break;
+ }
+}
+
+#else
+
+static inline void batadv_dat_status_update(struct net_device *net_dev)
+{
+}
+
+static inline bool
+batadv_dat_snoop_outgoing_arp_request(struct batadv_priv *bat_priv,
+ struct sk_buff *skb)
+{
+ return false;
+}
+
+static inline bool
+batadv_dat_snoop_incoming_arp_request(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, int hdr_size)
+{
+ return false;
+}
+
+static inline bool
+batadv_dat_snoop_outgoing_arp_reply(struct batadv_priv *bat_priv,
+ struct sk_buff *skb)
+{
+ return false;
+}
+
+static inline bool
+batadv_dat_snoop_incoming_arp_reply(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, int hdr_size)
+{
+ return false;
+}
+
+static inline void
+batadv_dat_snoop_outgoing_dhcp_ack(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, __be16 proto,
+ unsigned short vid)
+{
+}
+
+static inline void
+batadv_dat_snoop_incoming_dhcp_ack(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, int hdr_size)
+{
+}
+
+static inline bool
+batadv_dat_drop_broadcast_packet(struct batadv_priv *bat_priv,
+ struct batadv_forw_packet *forw_packet)
+{
+ return false;
+}
+
+static inline void
+batadv_dat_init_orig_node_addr(struct batadv_orig_node *orig_node)
+{
+}
+
+static inline void batadv_dat_init_own_addr(struct batadv_priv *bat_priv,
+ struct batadv_hard_iface *iface)
+{
+}
+
+static inline int batadv_dat_init(struct batadv_priv *bat_priv)
+{
+ return 0;
+}
+
+static inline void batadv_dat_free(struct batadv_priv *bat_priv)
+{
+}
+
+static inline int
+batadv_dat_cache_dump(struct sk_buff *msg, struct netlink_callback *cb)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void batadv_dat_inc_counter(struct batadv_priv *bat_priv,
+ u8 subtype)
+{
+}
+
+#endif /* CONFIG_BATMAN_ADV_DAT */
+
+#endif /* _NET_BATMAN_ADV_DISTRIBUTED_ARP_TABLE_H_ */
diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c
new file mode 100644
index 000000000000..cc14bc41381e
--- /dev/null
+++ b/net/batman-adv/fragmentation.c
@@ -0,0 +1,556 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) B.A.T.M.A.N. contributors:
+ *
+ * Martin Hundebøll <martin@hundeboll.net>
+ */
+
+#include "fragmentation.h"
+#include "main.h"
+
+#include <linux/atomic.h>
+#include <linux/byteorder/generic.h>
+#include <linux/errno.h>
+#include <linux/etherdevice.h>
+#include <linux/gfp.h>
+#include <linux/if_ether.h>
+#include <linux/jiffies.h>
+#include <linux/lockdep.h>
+#include <linux/minmax.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <uapi/linux/batadv_packet.h>
+
+#include "hard-interface.h"
+#include "originator.h"
+#include "send.h"
+
+/**
+ * batadv_frag_clear_chain() - delete entries in the fragment buffer chain
+ * @head: head of chain with entries.
+ * @dropped: whether the chain is cleared because all fragments are dropped
+ *
+ * Free fragments in the passed hlist. Should be called with appropriate lock.
+ */
+static void batadv_frag_clear_chain(struct hlist_head *head, bool dropped)
+{
+ struct batadv_frag_list_entry *entry;
+ struct hlist_node *node;
+
+ hlist_for_each_entry_safe(entry, node, head, list) {
+ hlist_del(&entry->list);
+
+ if (dropped)
+ kfree_skb(entry->skb);
+ else
+ consume_skb(entry->skb);
+
+ kfree(entry);
+ }
+}
+
+/**
+ * batadv_frag_purge_orig() - free fragments associated to an orig
+ * @orig_node: originator to free fragments from
+ * @check_cb: optional function to tell if an entry should be purged
+ */
+void batadv_frag_purge_orig(struct batadv_orig_node *orig_node,
+ bool (*check_cb)(struct batadv_frag_table_entry *))
+{
+ struct batadv_frag_table_entry *chain;
+ u8 i;
+
+ for (i = 0; i < BATADV_FRAG_BUFFER_COUNT; i++) {
+ chain = &orig_node->fragments[i];
+ spin_lock_bh(&chain->lock);
+
+ if (!check_cb || check_cb(chain)) {
+ batadv_frag_clear_chain(&chain->fragment_list, true);
+ chain->size = 0;
+ }
+
+ spin_unlock_bh(&chain->lock);
+ }
+}
+
+/**
+ * batadv_frag_size_limit() - maximum possible size of packet to be fragmented
+ *
+ * Return: the maximum size of payload that can be fragmented.
+ */
+static int batadv_frag_size_limit(void)
+{
+ int limit = BATADV_FRAG_MAX_FRAG_SIZE;
+
+ limit -= sizeof(struct batadv_frag_packet);
+ limit *= BATADV_FRAG_MAX_FRAGMENTS;
+
+ return limit;
+}
+
+/**
+ * batadv_frag_init_chain() - check and prepare fragment chain for new fragment
+ * @chain: chain in fragments table to init
+ * @seqno: sequence number of the received fragment
+ *
+ * Make chain ready for a fragment with sequence number "seqno". Delete existing
+ * entries if they have an "old" sequence number.
+ *
+ * Caller must hold chain->lock.
+ *
+ * Return: true if chain is empty and the caller can just insert the new
+ * fragment without searching for the right position.
+ */
+static bool batadv_frag_init_chain(struct batadv_frag_table_entry *chain,
+ u16 seqno)
+{
+ lockdep_assert_held(&chain->lock);
+
+ if (chain->seqno == seqno)
+ return false;
+
+ if (!hlist_empty(&chain->fragment_list))
+ batadv_frag_clear_chain(&chain->fragment_list, true);
+
+ chain->size = 0;
+ chain->seqno = seqno;
+
+ return true;
+}
+
+/**
+ * batadv_frag_insert_packet() - insert a fragment into a fragment chain
+ * @orig_node: originator that the fragment was received from
+ * @skb: skb to insert
+ * @chain_out: list head to attach complete chains of fragments to
+ *
+ * Insert a new fragment into the reverse ordered chain in the right table
+ * entry. The hash table entry is cleared if "old" fragments exist in it.
+ *
+ * Return: true if skb is buffered, false on error. If the chain has all the
+ * fragments needed to merge the packet, the chain is moved to the passed head
+ * to avoid locking the chain in the table.
+ */
+static bool batadv_frag_insert_packet(struct batadv_orig_node *orig_node,
+ struct sk_buff *skb,
+ struct hlist_head *chain_out)
+{
+ struct batadv_frag_table_entry *chain;
+ struct batadv_frag_list_entry *frag_entry_new = NULL, *frag_entry_curr;
+ struct batadv_frag_list_entry *frag_entry_last = NULL;
+ struct batadv_frag_packet *frag_packet;
+ u8 bucket;
+ u16 seqno, hdr_size = sizeof(struct batadv_frag_packet);
+ bool ret = false;
+
+ /* Linearize packet to avoid linearizing 16 packets in a row when doing
+ * the later merge. Non-linear merge should be added to remove this
+ * linearization.
+ */
+ if (skb_linearize(skb) < 0)
+ goto err;
+
+ frag_packet = (struct batadv_frag_packet *)skb->data;
+ seqno = ntohs(frag_packet->seqno);
+ bucket = seqno % BATADV_FRAG_BUFFER_COUNT;
+
+ frag_entry_new = kmalloc(sizeof(*frag_entry_new), GFP_ATOMIC);
+ if (!frag_entry_new)
+ goto err;
+
+ frag_entry_new->skb = skb;
+ frag_entry_new->no = frag_packet->no;
+
+ /* Select entry in the "chain table" and delete any prior fragments
+ * with another sequence number. batadv_frag_init_chain() returns true,
+ * if the list is empty at return.
+ */
+ chain = &orig_node->fragments[bucket];
+ spin_lock_bh(&chain->lock);
+ if (batadv_frag_init_chain(chain, seqno)) {
+ hlist_add_head(&frag_entry_new->list, &chain->fragment_list);
+ chain->size = skb->len - hdr_size;
+ chain->timestamp = jiffies;
+ chain->total_size = ntohs(frag_packet->total_size);
+ ret = true;
+ goto out;
+ }
+
+ /* Find the position for the new fragment. */
+ hlist_for_each_entry(frag_entry_curr, &chain->fragment_list, list) {
+ /* Drop packet if fragment already exists. */
+ if (frag_entry_curr->no == frag_entry_new->no)
+ goto err_unlock;
+
+ /* Order fragments from highest to lowest. */
+ if (frag_entry_curr->no < frag_entry_new->no) {
+ hlist_add_before(&frag_entry_new->list,
+ &frag_entry_curr->list);
+ chain->size += skb->len - hdr_size;
+ chain->timestamp = jiffies;
+ ret = true;
+ goto out;
+ }
+
+ /* store current entry because it could be the last in list */
+ frag_entry_last = frag_entry_curr;
+ }
+
+ /* Reached the end of the list, so insert after 'frag_entry_last'. */
+ if (likely(frag_entry_last)) {
+ hlist_add_behind(&frag_entry_new->list, &frag_entry_last->list);
+ chain->size += skb->len - hdr_size;
+ chain->timestamp = jiffies;
+ ret = true;
+ }
+
+out:
+ if (chain->size > batadv_frag_size_limit() ||
+ chain->total_size != ntohs(frag_packet->total_size) ||
+ chain->total_size > batadv_frag_size_limit()) {
+ /* Clear chain if total size of either the list or the packet
+ * exceeds the maximum size of one merged packet. Don't allow
+ * packets to have different total_size.
+ */
+ batadv_frag_clear_chain(&chain->fragment_list, true);
+ chain->size = 0;
+ } else if (ntohs(frag_packet->total_size) == chain->size) {
+ /* All fragments received. Hand over chain to caller. */
+ hlist_move_list(&chain->fragment_list, chain_out);
+ chain->size = 0;
+ }
+
+err_unlock:
+ spin_unlock_bh(&chain->lock);
+
+err:
+ if (!ret) {
+ kfree(frag_entry_new);
+ kfree_skb(skb);
+ }
+
+ return ret;
+}
+
+/**
+ * batadv_frag_merge_packets() - merge a chain of fragments
+ * @chain: head of chain with fragments
+ *
+ * Expand the first skb in the chain and copy the content of the remaining
+ * skb's into the expanded one. After doing so, clear the chain.
+ *
+ * Return: the merged skb or NULL on error.
+ */
+static struct sk_buff *
+batadv_frag_merge_packets(struct hlist_head *chain)
+{
+ struct batadv_frag_packet *packet;
+ struct batadv_frag_list_entry *entry;
+ struct sk_buff *skb_out;
+ int size, hdr_size = sizeof(struct batadv_frag_packet);
+ bool dropped = false;
+
+ /* Remove first entry, as this is the destination for the rest of the
+ * fragments.
+ */
+ entry = hlist_entry(chain->first, struct batadv_frag_list_entry, list);
+ hlist_del(&entry->list);
+ skb_out = entry->skb;
+ kfree(entry);
+
+ packet = (struct batadv_frag_packet *)skb_out->data;
+ size = ntohs(packet->total_size) + hdr_size;
+
+ /* Make room for the rest of the fragments. */
+ if (pskb_expand_head(skb_out, 0, size - skb_out->len, GFP_ATOMIC) < 0) {
+ kfree_skb(skb_out);
+ skb_out = NULL;
+ dropped = true;
+ goto free;
+ }
+
+ /* Move the existing MAC header to just before the payload. (Override
+ * the fragment header.)
+ */
+ skb_pull(skb_out, hdr_size);
+ skb_out->ip_summed = CHECKSUM_NONE;
+ memmove(skb_out->data - ETH_HLEN, skb_mac_header(skb_out), ETH_HLEN);
+ skb_set_mac_header(skb_out, -ETH_HLEN);
+ skb_reset_network_header(skb_out);
+ skb_reset_transport_header(skb_out);
+
+ /* Copy the payload of the each fragment into the last skb */
+ hlist_for_each_entry(entry, chain, list) {
+ size = entry->skb->len - hdr_size;
+ skb_put_data(skb_out, entry->skb->data + hdr_size, size);
+ }
+
+free:
+ /* Locking is not needed, because 'chain' is not part of any orig. */
+ batadv_frag_clear_chain(chain, dropped);
+ return skb_out;
+}
+
+/**
+ * batadv_frag_skb_buffer() - buffer fragment for later merge
+ * @skb: skb to buffer
+ * @orig_node_src: originator that the skb is received from
+ *
+ * Add fragment to buffer and merge fragments if possible.
+ *
+ * There are three possible outcomes: 1) Packet is merged: Return true and
+ * set *skb to merged packet; 2) Packet is buffered: Return true and set *skb
+ * to NULL; 3) Error: Return false and free skb.
+ *
+ * Return: true when the packet is merged or buffered, false when skb is not
+ * used.
+ */
+bool batadv_frag_skb_buffer(struct sk_buff **skb,
+ struct batadv_orig_node *orig_node_src)
+{
+ struct sk_buff *skb_out = NULL;
+ struct hlist_head head = HLIST_HEAD_INIT;
+ bool ret = false;
+
+ /* Add packet to buffer and table entry if merge is possible. */
+ if (!batadv_frag_insert_packet(orig_node_src, *skb, &head))
+ goto out_err;
+
+ /* Leave if more fragments are needed to merge. */
+ if (hlist_empty(&head))
+ goto out;
+
+ skb_out = batadv_frag_merge_packets(&head);
+ if (!skb_out)
+ goto out_err;
+
+out:
+ ret = true;
+out_err:
+ *skb = skb_out;
+ return ret;
+}
+
+/**
+ * batadv_frag_skb_fwd() - forward fragments that would exceed MTU when merged
+ * @skb: skb to forward
+ * @recv_if: interface that the skb is received on
+ * @orig_node_src: originator that the skb is received from
+ *
+ * Look up the next-hop of the fragments payload and check if the merged packet
+ * will exceed the MTU towards the next-hop. If so, the fragment is forwarded
+ * without merging it.
+ *
+ * Return: true if the fragment is consumed/forwarded, false otherwise.
+ */
+bool batadv_frag_skb_fwd(struct sk_buff *skb,
+ struct batadv_hard_iface *recv_if,
+ struct batadv_orig_node *orig_node_src)
+{
+ struct batadv_priv *bat_priv = netdev_priv(recv_if->mesh_iface);
+ struct batadv_neigh_node *neigh_node = NULL;
+ struct batadv_frag_packet *packet;
+ u16 total_size;
+ bool ret = false;
+
+ packet = (struct batadv_frag_packet *)skb->data;
+
+ neigh_node = batadv_orig_to_router(bat_priv, packet->dest, recv_if);
+ if (!neigh_node)
+ goto out;
+
+ /* Forward the fragment, if the merged packet would be too big to
+ * be assembled.
+ */
+ total_size = ntohs(packet->total_size);
+ if (total_size > neigh_node->if_incoming->net_dev->mtu) {
+ batadv_inc_counter(bat_priv, BATADV_CNT_FRAG_FWD);
+ batadv_add_counter(bat_priv, BATADV_CNT_FRAG_FWD_BYTES,
+ skb->len + ETH_HLEN);
+
+ packet->ttl--;
+ batadv_send_unicast_skb(skb, neigh_node);
+ ret = true;
+ }
+
+out:
+ batadv_neigh_node_put(neigh_node);
+ return ret;
+}
+
+/**
+ * batadv_frag_create() - create a fragment from skb
+ * @net_dev: outgoing device for fragment
+ * @skb: skb to create fragment from
+ * @frag_head: header to use in new fragment
+ * @fragment_size: size of new fragment
+ *
+ * Split the passed skb into two fragments: A new one with size matching the
+ * passed mtu and the old one with the rest. The new skb contains data from the
+ * tail of the old skb.
+ *
+ * Return: the new fragment, NULL on error.
+ */
+static struct sk_buff *batadv_frag_create(struct net_device *net_dev,
+ struct sk_buff *skb,
+ struct batadv_frag_packet *frag_head,
+ unsigned int fragment_size)
+{
+ unsigned int ll_reserved = LL_RESERVED_SPACE(net_dev);
+ unsigned int tailroom = net_dev->needed_tailroom;
+ struct sk_buff *skb_fragment;
+ unsigned int header_size = sizeof(*frag_head);
+ unsigned int mtu = fragment_size + header_size;
+
+ skb_fragment = dev_alloc_skb(ll_reserved + mtu + tailroom);
+ if (!skb_fragment)
+ goto err;
+
+ skb_fragment->priority = skb->priority;
+
+ /* Eat the last mtu-bytes of the skb */
+ skb_reserve(skb_fragment, ll_reserved + header_size);
+ skb_split(skb, skb_fragment, skb->len - fragment_size);
+
+ /* Add the header */
+ skb_push(skb_fragment, header_size);
+ memcpy(skb_fragment->data, frag_head, header_size);
+
+err:
+ return skb_fragment;
+}
+
+/**
+ * batadv_frag_send_packet() - create up to 16 fragments from the passed skb
+ * @skb: skb to create fragments from
+ * @orig_node: final destination of the created fragments
+ * @neigh_node: next-hop of the created fragments
+ *
+ * Return: the netdev tx status or a negative errno code on a failure
+ */
+int batadv_frag_send_packet(struct sk_buff *skb,
+ struct batadv_orig_node *orig_node,
+ struct batadv_neigh_node *neigh_node)
+{
+ struct net_device *net_dev = neigh_node->if_incoming->net_dev;
+ struct batadv_priv *bat_priv;
+ struct batadv_hard_iface *primary_if = NULL;
+ struct batadv_frag_packet frag_header;
+ struct sk_buff *skb_fragment;
+ unsigned int mtu = net_dev->mtu;
+ unsigned int header_size = sizeof(frag_header);
+ unsigned int max_fragment_size, num_fragments;
+ int ret;
+
+ /* To avoid merge and refragmentation at next-hops we never send
+ * fragments larger than BATADV_FRAG_MAX_FRAG_SIZE
+ */
+ mtu = min_t(unsigned int, mtu, BATADV_FRAG_MAX_FRAG_SIZE);
+ max_fragment_size = mtu - header_size;
+
+ if (skb->len == 0 || max_fragment_size == 0)
+ return -EINVAL;
+
+ num_fragments = (skb->len - 1) / max_fragment_size + 1;
+ max_fragment_size = (skb->len - 1) / num_fragments + 1;
+
+ /* Don't even try to fragment, if we need more than 16 fragments */
+ if (num_fragments > BATADV_FRAG_MAX_FRAGMENTS) {
+ ret = -EAGAIN;
+ goto free_skb;
+ }
+
+ bat_priv = orig_node->bat_priv;
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (!primary_if) {
+ ret = -EINVAL;
+ goto free_skb;
+ }
+
+ /* GRO might have added fragments to the fragment list instead of
+ * frags[]. But this is not handled by skb_split and must be
+ * linearized to avoid incorrect length information after all
+ * batman-adv fragments were created and submitted to the
+ * hard-interface
+ */
+ if (skb_has_frag_list(skb) && __skb_linearize(skb)) {
+ ret = -ENOMEM;
+ goto free_skb;
+ }
+
+ /* Create one header to be copied to all fragments */
+ frag_header.packet_type = BATADV_UNICAST_FRAG;
+ frag_header.version = BATADV_COMPAT_VERSION;
+ frag_header.ttl = BATADV_TTL;
+ frag_header.seqno = htons(atomic_inc_return(&bat_priv->frag_seqno));
+ frag_header.reserved = 0;
+ frag_header.no = 0;
+ frag_header.total_size = htons(skb->len);
+
+ /* skb->priority values from 256->263 are magic values to
+ * directly indicate a specific 802.1d priority. This is used
+ * to allow 802.1d priority to be passed directly in from VLAN
+ * tags, etc.
+ */
+ if (skb->priority >= 256 && skb->priority <= 263)
+ frag_header.priority = skb->priority - 256;
+ else
+ frag_header.priority = 0;
+
+ ether_addr_copy(frag_header.orig, primary_if->net_dev->dev_addr);
+ ether_addr_copy(frag_header.dest, orig_node->orig);
+
+ /* Eat and send fragments from the tail of skb */
+ while (skb->len > max_fragment_size) {
+ /* The initial check in this function should cover this case */
+ if (unlikely(frag_header.no == BATADV_FRAG_MAX_FRAGMENTS - 1)) {
+ ret = -EINVAL;
+ goto put_primary_if;
+ }
+
+ skb_fragment = batadv_frag_create(net_dev, skb, &frag_header,
+ max_fragment_size);
+ if (!skb_fragment) {
+ ret = -ENOMEM;
+ goto put_primary_if;
+ }
+
+ batadv_inc_counter(bat_priv, BATADV_CNT_FRAG_TX);
+ batadv_add_counter(bat_priv, BATADV_CNT_FRAG_TX_BYTES,
+ skb_fragment->len + ETH_HLEN);
+ ret = batadv_send_unicast_skb(skb_fragment, neigh_node);
+ if (ret != NET_XMIT_SUCCESS) {
+ ret = NET_XMIT_DROP;
+ goto put_primary_if;
+ }
+
+ frag_header.no++;
+ }
+
+ /* make sure that there is at least enough head for the fragmentation
+ * and ethernet headers
+ */
+ ret = skb_cow_head(skb, ETH_HLEN + header_size);
+ if (ret < 0)
+ goto put_primary_if;
+
+ skb_push(skb, header_size);
+ memcpy(skb->data, &frag_header, header_size);
+
+ /* Send the last fragment */
+ batadv_inc_counter(bat_priv, BATADV_CNT_FRAG_TX);
+ batadv_add_counter(bat_priv, BATADV_CNT_FRAG_TX_BYTES,
+ skb->len + ETH_HLEN);
+ ret = batadv_send_unicast_skb(skb, neigh_node);
+ /* skb was consumed */
+ skb = NULL;
+
+put_primary_if:
+ batadv_hardif_put(primary_if);
+free_skb:
+ kfree_skb(skb);
+
+ return ret;
+}
diff --git a/net/batman-adv/fragmentation.h b/net/batman-adv/fragmentation.h
new file mode 100644
index 000000000000..dbf0871f8703
--- /dev/null
+++ b/net/batman-adv/fragmentation.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) B.A.T.M.A.N. contributors:
+ *
+ * Martin Hundebøll <martin@hundeboll.net>
+ */
+
+#ifndef _NET_BATMAN_ADV_FRAGMENTATION_H_
+#define _NET_BATMAN_ADV_FRAGMENTATION_H_
+
+#include "main.h"
+
+#include <linux/compiler.h>
+#include <linux/list.h>
+#include <linux/skbuff.h>
+#include <linux/stddef.h>
+#include <linux/types.h>
+
+void batadv_frag_purge_orig(struct batadv_orig_node *orig,
+ bool (*check_cb)(struct batadv_frag_table_entry *));
+bool batadv_frag_skb_fwd(struct sk_buff *skb,
+ struct batadv_hard_iface *recv_if,
+ struct batadv_orig_node *orig_node_src);
+bool batadv_frag_skb_buffer(struct sk_buff **skb,
+ struct batadv_orig_node *orig_node);
+int batadv_frag_send_packet(struct sk_buff *skb,
+ struct batadv_orig_node *orig_node,
+ struct batadv_neigh_node *neigh_node);
+
+/**
+ * batadv_frag_check_entry() - check if a list of fragments has timed out
+ * @frags_entry: table entry to check
+ *
+ * Return: true if the frags entry has timed out, false otherwise.
+ */
+static inline bool
+batadv_frag_check_entry(struct batadv_frag_table_entry *frags_entry)
+{
+ if (!hlist_empty(&frags_entry->fragment_list) &&
+ batadv_has_timed_out(frags_entry->timestamp, BATADV_FRAG_TIMEOUT))
+ return true;
+ return false;
+}
+
+#endif /* _NET_BATMAN_ADV_FRAGMENTATION_H_ */
diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c
new file mode 100644
index 000000000000..7a11b245e9f4
--- /dev/null
+++ b/net/batman-adv/gateway_client.c
@@ -0,0 +1,759 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner
+ */
+
+#include "gateway_client.h"
+#include "main.h"
+
+#include <linux/atomic.h>
+#include <linux/byteorder/generic.h>
+#include <linux/container_of.h>
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/etherdevice.h>
+#include <linux/gfp.h>
+#include <linux/if_ether.h>
+#include <linux/if_vlan.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/kref.h>
+#include <linux/list.h>
+#include <linux/lockdep.h>
+#include <linux/netdevice.h>
+#include <linux/netlink.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/sprintf.h>
+#include <linux/stddef.h>
+#include <linux/udp.h>
+#include <uapi/linux/batadv_packet.h>
+#include <uapi/linux/batman_adv.h>
+
+#include "hard-interface.h"
+#include "log.h"
+#include "netlink.h"
+#include "originator.h"
+#include "routing.h"
+#include "translation-table.h"
+
+/* These are the offsets of the "hw type" and "hw address length" in the dhcp
+ * packet starting at the beginning of the dhcp header
+ */
+#define BATADV_DHCP_HTYPE_OFFSET 1
+#define BATADV_DHCP_HLEN_OFFSET 2
+/* Value of htype representing Ethernet */
+#define BATADV_DHCP_HTYPE_ETHERNET 0x01
+/* This is the offset of the "chaddr" field in the dhcp packet starting at the
+ * beginning of the dhcp header
+ */
+#define BATADV_DHCP_CHADDR_OFFSET 28
+
+/**
+ * batadv_gw_node_release() - release gw_node from lists and queue for free
+ * after rcu grace period
+ * @ref: kref pointer of the gw_node
+ */
+void batadv_gw_node_release(struct kref *ref)
+{
+ struct batadv_gw_node *gw_node;
+
+ gw_node = container_of(ref, struct batadv_gw_node, refcount);
+
+ batadv_orig_node_put(gw_node->orig_node);
+ kfree_rcu(gw_node, rcu);
+}
+
+/**
+ * batadv_gw_get_selected_gw_node() - Get currently selected gateway
+ * @bat_priv: the bat priv with all the mesh interface information
+ *
+ * Return: selected gateway (with increased refcnt), NULL on errors
+ */
+struct batadv_gw_node *
+batadv_gw_get_selected_gw_node(struct batadv_priv *bat_priv)
+{
+ struct batadv_gw_node *gw_node;
+
+ rcu_read_lock();
+ gw_node = rcu_dereference(bat_priv->gw.curr_gw);
+ if (!gw_node)
+ goto out;
+
+ if (!kref_get_unless_zero(&gw_node->refcount))
+ gw_node = NULL;
+
+out:
+ rcu_read_unlock();
+ return gw_node;
+}
+
+/**
+ * batadv_gw_get_selected_orig() - Get originator of currently selected gateway
+ * @bat_priv: the bat priv with all the mesh interface information
+ *
+ * Return: orig_node of selected gateway (with increased refcnt), NULL on errors
+ */
+struct batadv_orig_node *
+batadv_gw_get_selected_orig(struct batadv_priv *bat_priv)
+{
+ struct batadv_gw_node *gw_node;
+ struct batadv_orig_node *orig_node = NULL;
+
+ gw_node = batadv_gw_get_selected_gw_node(bat_priv);
+ if (!gw_node)
+ goto out;
+
+ rcu_read_lock();
+ orig_node = gw_node->orig_node;
+ if (!orig_node)
+ goto unlock;
+
+ if (!kref_get_unless_zero(&orig_node->refcount))
+ orig_node = NULL;
+
+unlock:
+ rcu_read_unlock();
+out:
+ batadv_gw_node_put(gw_node);
+ return orig_node;
+}
+
+static void batadv_gw_select(struct batadv_priv *bat_priv,
+ struct batadv_gw_node *new_gw_node)
+{
+ struct batadv_gw_node *curr_gw_node;
+
+ spin_lock_bh(&bat_priv->gw.list_lock);
+
+ if (new_gw_node)
+ kref_get(&new_gw_node->refcount);
+
+ curr_gw_node = rcu_replace_pointer(bat_priv->gw.curr_gw, new_gw_node,
+ true);
+
+ batadv_gw_node_put(curr_gw_node);
+
+ spin_unlock_bh(&bat_priv->gw.list_lock);
+}
+
+/**
+ * batadv_gw_reselect() - force a gateway reselection
+ * @bat_priv: the bat priv with all the mesh interface information
+ *
+ * Set a flag to remind the GW component to perform a new gateway reselection.
+ * However this function does not ensure that the current gateway is going to be
+ * deselected. The reselection mechanism may elect the same gateway once again.
+ *
+ * This means that invoking batadv_gw_reselect() does not guarantee a gateway
+ * change and therefore a uevent is not necessarily expected.
+ */
+void batadv_gw_reselect(struct batadv_priv *bat_priv)
+{
+ atomic_set(&bat_priv->gw.reselect, 1);
+}
+
+/**
+ * batadv_gw_check_client_stop() - check if client mode has been switched off
+ * @bat_priv: the bat priv with all the mesh interface information
+ *
+ * This function assumes the caller has checked that the gw state *is actually
+ * changing*. This function is not supposed to be called when there is no state
+ * change.
+ */
+void batadv_gw_check_client_stop(struct batadv_priv *bat_priv)
+{
+ struct batadv_gw_node *curr_gw;
+
+ if (atomic_read(&bat_priv->gw.mode) != BATADV_GW_MODE_CLIENT)
+ return;
+
+ curr_gw = batadv_gw_get_selected_gw_node(bat_priv);
+ if (!curr_gw)
+ return;
+
+ /* deselect the current gateway so that next time that client mode is
+ * enabled a proper GW_ADD event can be sent
+ */
+ batadv_gw_select(bat_priv, NULL);
+
+ /* if batman-adv is switching the gw client mode off and a gateway was
+ * already selected, send a DEL uevent
+ */
+ batadv_throw_uevent(bat_priv, BATADV_UEV_GW, BATADV_UEV_DEL, NULL);
+
+ batadv_gw_node_put(curr_gw);
+}
+
+/**
+ * batadv_gw_election() - Elect the best gateway
+ * @bat_priv: the bat priv with all the mesh interface information
+ */
+void batadv_gw_election(struct batadv_priv *bat_priv)
+{
+ struct batadv_gw_node *curr_gw = NULL;
+ struct batadv_gw_node *next_gw = NULL;
+ struct batadv_neigh_node *router = NULL;
+ struct batadv_neigh_ifinfo *router_ifinfo = NULL;
+ char gw_addr[18] = { '\0' };
+
+ if (atomic_read(&bat_priv->gw.mode) != BATADV_GW_MODE_CLIENT)
+ goto out;
+
+ if (!bat_priv->algo_ops->gw.get_best_gw_node)
+ goto out;
+
+ curr_gw = batadv_gw_get_selected_gw_node(bat_priv);
+
+ if (!batadv_atomic_dec_not_zero(&bat_priv->gw.reselect) && curr_gw)
+ goto out;
+
+ /* if gw.reselect is set to 1 it means that a previous call to
+ * gw.is_eligible() said that we have a new best GW, therefore it can
+ * now be picked from the list and selected
+ */
+ next_gw = bat_priv->algo_ops->gw.get_best_gw_node(bat_priv);
+
+ if (curr_gw == next_gw)
+ goto out;
+
+ if (next_gw) {
+ sprintf(gw_addr, "%pM", next_gw->orig_node->orig);
+
+ router = batadv_orig_router_get(next_gw->orig_node,
+ BATADV_IF_DEFAULT);
+ if (!router) {
+ batadv_gw_reselect(bat_priv);
+ goto out;
+ }
+
+ router_ifinfo = batadv_neigh_ifinfo_get(router,
+ BATADV_IF_DEFAULT);
+ if (!router_ifinfo) {
+ batadv_gw_reselect(bat_priv);
+ goto out;
+ }
+ }
+
+ if (curr_gw && !next_gw) {
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Removing selected gateway - no gateway in range\n");
+ batadv_throw_uevent(bat_priv, BATADV_UEV_GW, BATADV_UEV_DEL,
+ NULL);
+ } else if (!curr_gw && next_gw) {
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Adding route to gateway %pM (bandwidth: %u.%u/%u.%u MBit, tq: %i)\n",
+ next_gw->orig_node->orig,
+ next_gw->bandwidth_down / 10,
+ next_gw->bandwidth_down % 10,
+ next_gw->bandwidth_up / 10,
+ next_gw->bandwidth_up % 10,
+ router_ifinfo->bat_iv.tq_avg);
+ batadv_throw_uevent(bat_priv, BATADV_UEV_GW, BATADV_UEV_ADD,
+ gw_addr);
+ } else {
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Changing route to gateway %pM (bandwidth: %u.%u/%u.%u MBit, tq: %i)\n",
+ next_gw->orig_node->orig,
+ next_gw->bandwidth_down / 10,
+ next_gw->bandwidth_down % 10,
+ next_gw->bandwidth_up / 10,
+ next_gw->bandwidth_up % 10,
+ router_ifinfo->bat_iv.tq_avg);
+ batadv_throw_uevent(bat_priv, BATADV_UEV_GW, BATADV_UEV_CHANGE,
+ gw_addr);
+ }
+
+ batadv_gw_select(bat_priv, next_gw);
+
+out:
+ batadv_gw_node_put(curr_gw);
+ batadv_gw_node_put(next_gw);
+ batadv_neigh_node_put(router);
+ batadv_neigh_ifinfo_put(router_ifinfo);
+}
+
+/**
+ * batadv_gw_check_election() - Elect orig node as best gateway when eligible
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @orig_node: orig node which is to be checked
+ */
+void batadv_gw_check_election(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node)
+{
+ struct batadv_orig_node *curr_gw_orig;
+
+ /* abort immediately if the routing algorithm does not support gateway
+ * election
+ */
+ if (!bat_priv->algo_ops->gw.is_eligible)
+ return;
+
+ curr_gw_orig = batadv_gw_get_selected_orig(bat_priv);
+ if (!curr_gw_orig)
+ goto reselect;
+
+ /* this node already is the gateway */
+ if (curr_gw_orig == orig_node)
+ goto out;
+
+ if (!bat_priv->algo_ops->gw.is_eligible(bat_priv, curr_gw_orig,
+ orig_node))
+ goto out;
+
+reselect:
+ batadv_gw_reselect(bat_priv);
+out:
+ batadv_orig_node_put(curr_gw_orig);
+}
+
+/**
+ * batadv_gw_node_add() - add gateway node to list of available gateways
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @orig_node: originator announcing gateway capabilities
+ * @gateway: announced bandwidth information
+ *
+ * Has to be called with the appropriate locks being acquired
+ * (gw.list_lock).
+ */
+static void batadv_gw_node_add(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node,
+ struct batadv_tvlv_gateway_data *gateway)
+{
+ struct batadv_gw_node *gw_node;
+
+ lockdep_assert_held(&bat_priv->gw.list_lock);
+
+ if (gateway->bandwidth_down == 0)
+ return;
+
+ gw_node = kzalloc(sizeof(*gw_node), GFP_ATOMIC);
+ if (!gw_node)
+ return;
+
+ kref_init(&gw_node->refcount);
+ INIT_HLIST_NODE(&gw_node->list);
+ kref_get(&orig_node->refcount);
+ gw_node->orig_node = orig_node;
+ gw_node->bandwidth_down = ntohl(gateway->bandwidth_down);
+ gw_node->bandwidth_up = ntohl(gateway->bandwidth_up);
+
+ kref_get(&gw_node->refcount);
+ hlist_add_head_rcu(&gw_node->list, &bat_priv->gw.gateway_list);
+ bat_priv->gw.generation++;
+
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Found new gateway %pM -> gw bandwidth: %u.%u/%u.%u MBit\n",
+ orig_node->orig,
+ ntohl(gateway->bandwidth_down) / 10,
+ ntohl(gateway->bandwidth_down) % 10,
+ ntohl(gateway->bandwidth_up) / 10,
+ ntohl(gateway->bandwidth_up) % 10);
+
+ /* don't return reference to new gw_node */
+ batadv_gw_node_put(gw_node);
+}
+
+/**
+ * batadv_gw_node_get() - retrieve gateway node from list of available gateways
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @orig_node: originator announcing gateway capabilities
+ *
+ * Return: gateway node if found or NULL otherwise.
+ */
+struct batadv_gw_node *batadv_gw_node_get(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node)
+{
+ struct batadv_gw_node *gw_node_tmp, *gw_node = NULL;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(gw_node_tmp, &bat_priv->gw.gateway_list,
+ list) {
+ if (gw_node_tmp->orig_node != orig_node)
+ continue;
+
+ if (!kref_get_unless_zero(&gw_node_tmp->refcount))
+ continue;
+
+ gw_node = gw_node_tmp;
+ break;
+ }
+ rcu_read_unlock();
+
+ return gw_node;
+}
+
+/**
+ * batadv_gw_node_update() - update list of available gateways with changed
+ * bandwidth information
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @orig_node: originator announcing gateway capabilities
+ * @gateway: announced bandwidth information
+ */
+void batadv_gw_node_update(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node,
+ struct batadv_tvlv_gateway_data *gateway)
+{
+ struct batadv_gw_node *gw_node, *curr_gw = NULL;
+
+ spin_lock_bh(&bat_priv->gw.list_lock);
+ gw_node = batadv_gw_node_get(bat_priv, orig_node);
+ if (!gw_node) {
+ batadv_gw_node_add(bat_priv, orig_node, gateway);
+ spin_unlock_bh(&bat_priv->gw.list_lock);
+ goto out;
+ }
+ spin_unlock_bh(&bat_priv->gw.list_lock);
+
+ if (gw_node->bandwidth_down == ntohl(gateway->bandwidth_down) &&
+ gw_node->bandwidth_up == ntohl(gateway->bandwidth_up))
+ goto out;
+
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Gateway bandwidth of originator %pM changed from %u.%u/%u.%u MBit to %u.%u/%u.%u MBit\n",
+ orig_node->orig,
+ gw_node->bandwidth_down / 10,
+ gw_node->bandwidth_down % 10,
+ gw_node->bandwidth_up / 10,
+ gw_node->bandwidth_up % 10,
+ ntohl(gateway->bandwidth_down) / 10,
+ ntohl(gateway->bandwidth_down) % 10,
+ ntohl(gateway->bandwidth_up) / 10,
+ ntohl(gateway->bandwidth_up) % 10);
+
+ gw_node->bandwidth_down = ntohl(gateway->bandwidth_down);
+ gw_node->bandwidth_up = ntohl(gateway->bandwidth_up);
+
+ if (ntohl(gateway->bandwidth_down) == 0) {
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Gateway %pM removed from gateway list\n",
+ orig_node->orig);
+
+ /* Note: We don't need a NULL check here, since curr_gw never
+ * gets dereferenced.
+ */
+ spin_lock_bh(&bat_priv->gw.list_lock);
+ if (!hlist_unhashed(&gw_node->list)) {
+ hlist_del_init_rcu(&gw_node->list);
+ batadv_gw_node_put(gw_node);
+ bat_priv->gw.generation++;
+ }
+ spin_unlock_bh(&bat_priv->gw.list_lock);
+
+ curr_gw = batadv_gw_get_selected_gw_node(bat_priv);
+ if (gw_node == curr_gw)
+ batadv_gw_reselect(bat_priv);
+
+ batadv_gw_node_put(curr_gw);
+ }
+
+out:
+ batadv_gw_node_put(gw_node);
+}
+
+/**
+ * batadv_gw_node_delete() - Remove orig_node from gateway list
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @orig_node: orig node which is currently in process of being removed
+ */
+void batadv_gw_node_delete(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node)
+{
+ struct batadv_tvlv_gateway_data gateway;
+
+ gateway.bandwidth_down = 0;
+ gateway.bandwidth_up = 0;
+
+ batadv_gw_node_update(bat_priv, orig_node, &gateway);
+}
+
+/**
+ * batadv_gw_node_free() - Free gateway information from mesh interface
+ * @bat_priv: the bat priv with all the mesh interface information
+ */
+void batadv_gw_node_free(struct batadv_priv *bat_priv)
+{
+ struct batadv_gw_node *gw_node;
+ struct hlist_node *node_tmp;
+
+ spin_lock_bh(&bat_priv->gw.list_lock);
+ hlist_for_each_entry_safe(gw_node, node_tmp,
+ &bat_priv->gw.gateway_list, list) {
+ hlist_del_init_rcu(&gw_node->list);
+ batadv_gw_node_put(gw_node);
+ bat_priv->gw.generation++;
+ }
+ spin_unlock_bh(&bat_priv->gw.list_lock);
+}
+
+/**
+ * batadv_gw_dump() - Dump gateways into a message
+ * @msg: Netlink message to dump into
+ * @cb: Control block containing additional options
+ *
+ * Return: Error code, or length of message
+ */
+int batadv_gw_dump(struct sk_buff *msg, struct netlink_callback *cb)
+{
+ struct batadv_hard_iface *primary_if = NULL;
+ struct net_device *mesh_iface;
+ struct batadv_priv *bat_priv;
+ int ret;
+
+ mesh_iface = batadv_netlink_get_meshif(cb);
+ if (IS_ERR(mesh_iface))
+ return PTR_ERR(mesh_iface);
+
+ bat_priv = netdev_priv(mesh_iface);
+
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ if (!bat_priv->algo_ops->gw.dump) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ bat_priv->algo_ops->gw.dump(msg, cb, bat_priv);
+
+ ret = msg->len;
+
+out:
+ batadv_hardif_put(primary_if);
+ dev_put(mesh_iface);
+
+ return ret;
+}
+
+/**
+ * batadv_gw_dhcp_recipient_get() - check if a packet is a DHCP message
+ * @skb: the packet to check
+ * @header_len: a pointer to the batman-adv header size
+ * @chaddr: buffer where the client address will be stored. Valid
+ * only if the function returns BATADV_DHCP_TO_CLIENT
+ *
+ * This function may re-allocate the data buffer of the skb passed as argument.
+ *
+ * Return:
+ * - BATADV_DHCP_NO if the packet is not a dhcp message or if there was an error
+ * while parsing it
+ * - BATADV_DHCP_TO_SERVER if this is a message going to the DHCP server
+ * - BATADV_DHCP_TO_CLIENT if this is a message going to a DHCP client
+ */
+enum batadv_dhcp_recipient
+batadv_gw_dhcp_recipient_get(struct sk_buff *skb, unsigned int *header_len,
+ u8 *chaddr)
+{
+ enum batadv_dhcp_recipient ret = BATADV_DHCP_NO;
+ struct ethhdr *ethhdr;
+ struct iphdr *iphdr;
+ struct ipv6hdr *ipv6hdr;
+ struct udphdr *udphdr;
+ struct vlan_ethhdr *vhdr;
+ int chaddr_offset;
+ __be16 proto;
+ u8 *p;
+
+ /* check for ethernet header */
+ if (!pskb_may_pull(skb, *header_len + ETH_HLEN))
+ return BATADV_DHCP_NO;
+
+ ethhdr = eth_hdr(skb);
+ proto = ethhdr->h_proto;
+ *header_len += ETH_HLEN;
+
+ /* check for initial vlan header */
+ if (proto == htons(ETH_P_8021Q)) {
+ if (!pskb_may_pull(skb, *header_len + VLAN_HLEN))
+ return BATADV_DHCP_NO;
+
+ vhdr = vlan_eth_hdr(skb);
+ proto = vhdr->h_vlan_encapsulated_proto;
+ *header_len += VLAN_HLEN;
+ }
+
+ /* check for ip header */
+ switch (proto) {
+ case htons(ETH_P_IP):
+ if (!pskb_may_pull(skb, *header_len + sizeof(*iphdr)))
+ return BATADV_DHCP_NO;
+
+ iphdr = (struct iphdr *)(skb->data + *header_len);
+ *header_len += iphdr->ihl * 4;
+
+ /* check for udp header */
+ if (iphdr->protocol != IPPROTO_UDP)
+ return BATADV_DHCP_NO;
+
+ break;
+ case htons(ETH_P_IPV6):
+ if (!pskb_may_pull(skb, *header_len + sizeof(*ipv6hdr)))
+ return BATADV_DHCP_NO;
+
+ ipv6hdr = (struct ipv6hdr *)(skb->data + *header_len);
+ *header_len += sizeof(*ipv6hdr);
+
+ /* check for udp header */
+ if (ipv6hdr->nexthdr != IPPROTO_UDP)
+ return BATADV_DHCP_NO;
+
+ break;
+ default:
+ return BATADV_DHCP_NO;
+ }
+
+ if (!pskb_may_pull(skb, *header_len + sizeof(*udphdr)))
+ return BATADV_DHCP_NO;
+
+ udphdr = (struct udphdr *)(skb->data + *header_len);
+ *header_len += sizeof(*udphdr);
+
+ /* check for bootp port */
+ switch (proto) {
+ case htons(ETH_P_IP):
+ if (udphdr->dest == htons(67))
+ ret = BATADV_DHCP_TO_SERVER;
+ else if (udphdr->source == htons(67))
+ ret = BATADV_DHCP_TO_CLIENT;
+ break;
+ case htons(ETH_P_IPV6):
+ if (udphdr->dest == htons(547))
+ ret = BATADV_DHCP_TO_SERVER;
+ else if (udphdr->source == htons(547))
+ ret = BATADV_DHCP_TO_CLIENT;
+ break;
+ }
+
+ chaddr_offset = *header_len + BATADV_DHCP_CHADDR_OFFSET;
+ /* store the client address if the message is going to a client */
+ if (ret == BATADV_DHCP_TO_CLIENT) {
+ if (!pskb_may_pull(skb, chaddr_offset + ETH_ALEN))
+ return BATADV_DHCP_NO;
+
+ /* check if the DHCP packet carries an Ethernet DHCP */
+ p = skb->data + *header_len + BATADV_DHCP_HTYPE_OFFSET;
+ if (*p != BATADV_DHCP_HTYPE_ETHERNET)
+ return BATADV_DHCP_NO;
+
+ /* check if the DHCP packet carries a valid Ethernet address */
+ p = skb->data + *header_len + BATADV_DHCP_HLEN_OFFSET;
+ if (*p != ETH_ALEN)
+ return BATADV_DHCP_NO;
+
+ ether_addr_copy(chaddr, skb->data + chaddr_offset);
+ }
+
+ return ret;
+}
+
+/**
+ * batadv_gw_out_of_range() - check if the dhcp request destination is the best
+ * gateway
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: the outgoing packet
+ *
+ * Check if the skb is a DHCP request and if it is sent to the current best GW
+ * server. Due to topology changes it may be the case that the GW server
+ * previously selected is not the best one anymore.
+ *
+ * This call might reallocate skb data.
+ * Must be invoked only when the DHCP packet is going TO a DHCP SERVER.
+ *
+ * Return: true if the packet destination is unicast and it is not the best gw,
+ * false otherwise.
+ */
+bool batadv_gw_out_of_range(struct batadv_priv *bat_priv,
+ struct sk_buff *skb)
+{
+ struct batadv_neigh_node *neigh_curr = NULL;
+ struct batadv_neigh_node *neigh_old = NULL;
+ struct batadv_orig_node *orig_dst_node = NULL;
+ struct batadv_gw_node *gw_node = NULL;
+ struct batadv_gw_node *curr_gw = NULL;
+ struct batadv_neigh_ifinfo *curr_ifinfo, *old_ifinfo;
+ struct ethhdr *ethhdr = (struct ethhdr *)skb->data;
+ bool out_of_range = false;
+ u8 curr_tq_avg;
+ unsigned short vid;
+
+ vid = batadv_get_vid(skb, 0);
+
+ if (is_multicast_ether_addr(ethhdr->h_dest))
+ goto out;
+
+ orig_dst_node = batadv_transtable_search(bat_priv, ethhdr->h_source,
+ ethhdr->h_dest, vid);
+ if (!orig_dst_node)
+ goto out;
+
+ gw_node = batadv_gw_node_get(bat_priv, orig_dst_node);
+ if (!gw_node)
+ goto out;
+
+ switch (atomic_read(&bat_priv->gw.mode)) {
+ case BATADV_GW_MODE_SERVER:
+ /* If we are a GW then we are our best GW. We can artificially
+ * set the tq towards ourself as the maximum value
+ */
+ curr_tq_avg = BATADV_TQ_MAX_VALUE;
+ break;
+ case BATADV_GW_MODE_CLIENT:
+ curr_gw = batadv_gw_get_selected_gw_node(bat_priv);
+ if (!curr_gw)
+ goto out;
+
+ /* packet is going to our gateway */
+ if (curr_gw->orig_node == orig_dst_node)
+ goto out;
+
+ /* If the dhcp packet has been sent to a different gw,
+ * we have to evaluate whether the old gw is still
+ * reliable enough
+ */
+ neigh_curr = batadv_find_router(bat_priv, curr_gw->orig_node,
+ NULL);
+ if (!neigh_curr)
+ goto out;
+
+ curr_ifinfo = batadv_neigh_ifinfo_get(neigh_curr,
+ BATADV_IF_DEFAULT);
+ if (!curr_ifinfo)
+ goto out;
+
+ curr_tq_avg = curr_ifinfo->bat_iv.tq_avg;
+ batadv_neigh_ifinfo_put(curr_ifinfo);
+
+ break;
+ case BATADV_GW_MODE_OFF:
+ default:
+ goto out;
+ }
+
+ neigh_old = batadv_find_router(bat_priv, orig_dst_node, NULL);
+ if (!neigh_old)
+ goto out;
+
+ old_ifinfo = batadv_neigh_ifinfo_get(neigh_old, BATADV_IF_DEFAULT);
+ if (!old_ifinfo)
+ goto out;
+
+ if ((curr_tq_avg - old_ifinfo->bat_iv.tq_avg) > BATADV_GW_THRESHOLD)
+ out_of_range = true;
+ batadv_neigh_ifinfo_put(old_ifinfo);
+
+out:
+ batadv_orig_node_put(orig_dst_node);
+ batadv_gw_node_put(curr_gw);
+ batadv_gw_node_put(gw_node);
+ batadv_neigh_node_put(neigh_old);
+ batadv_neigh_node_put(neigh_curr);
+ return out_of_range;
+}
diff --git a/net/batman-adv/gateway_client.h b/net/batman-adv/gateway_client.h
new file mode 100644
index 000000000000..95c2ccdaa554
--- /dev/null
+++ b/net/batman-adv/gateway_client.h
@@ -0,0 +1,55 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner
+ */
+
+#ifndef _NET_BATMAN_ADV_GATEWAY_CLIENT_H_
+#define _NET_BATMAN_ADV_GATEWAY_CLIENT_H_
+
+#include "main.h"
+
+#include <linux/kref.h>
+#include <linux/netlink.h>
+#include <linux/skbuff.h>
+#include <linux/types.h>
+#include <uapi/linux/batadv_packet.h>
+
+void batadv_gw_check_client_stop(struct batadv_priv *bat_priv);
+void batadv_gw_reselect(struct batadv_priv *bat_priv);
+void batadv_gw_election(struct batadv_priv *bat_priv);
+struct batadv_orig_node *
+batadv_gw_get_selected_orig(struct batadv_priv *bat_priv);
+void batadv_gw_check_election(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node);
+void batadv_gw_node_update(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node,
+ struct batadv_tvlv_gateway_data *gateway);
+void batadv_gw_node_delete(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node);
+void batadv_gw_node_free(struct batadv_priv *bat_priv);
+void batadv_gw_node_release(struct kref *ref);
+struct batadv_gw_node *
+batadv_gw_get_selected_gw_node(struct batadv_priv *bat_priv);
+int batadv_gw_dump(struct sk_buff *msg, struct netlink_callback *cb);
+bool batadv_gw_out_of_range(struct batadv_priv *bat_priv, struct sk_buff *skb);
+enum batadv_dhcp_recipient
+batadv_gw_dhcp_recipient_get(struct sk_buff *skb, unsigned int *header_len,
+ u8 *chaddr);
+struct batadv_gw_node *batadv_gw_node_get(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node);
+
+/**
+ * batadv_gw_node_put() - decrement the gw_node refcounter and possibly release
+ * it
+ * @gw_node: gateway node to free
+ */
+static inline void batadv_gw_node_put(struct batadv_gw_node *gw_node)
+{
+ if (!gw_node)
+ return;
+
+ kref_put(&gw_node->refcount, batadv_gw_node_release);
+}
+
+#endif /* _NET_BATMAN_ADV_GATEWAY_CLIENT_H_ */
diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c
new file mode 100644
index 000000000000..315fa90f0c94
--- /dev/null
+++ b/net/batman-adv/gateway_common.c
@@ -0,0 +1,114 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner
+ */
+
+#include "gateway_common.h"
+#include "main.h"
+
+#include <linux/atomic.h>
+#include <linux/byteorder/generic.h>
+#include <linux/stddef.h>
+#include <linux/types.h>
+#include <uapi/linux/batadv_packet.h>
+#include <uapi/linux/batman_adv.h>
+
+#include "gateway_client.h"
+#include "tvlv.h"
+
+/**
+ * batadv_gw_tvlv_container_update() - update the gw tvlv container after
+ * gateway setting change
+ * @bat_priv: the bat priv with all the mesh interface information
+ */
+void batadv_gw_tvlv_container_update(struct batadv_priv *bat_priv)
+{
+ struct batadv_tvlv_gateway_data gw;
+ u32 down, up;
+ char gw_mode;
+
+ gw_mode = atomic_read(&bat_priv->gw.mode);
+
+ switch (gw_mode) {
+ case BATADV_GW_MODE_OFF:
+ case BATADV_GW_MODE_CLIENT:
+ batadv_tvlv_container_unregister(bat_priv, BATADV_TVLV_GW, 1);
+ break;
+ case BATADV_GW_MODE_SERVER:
+ down = atomic_read(&bat_priv->gw.bandwidth_down);
+ up = atomic_read(&bat_priv->gw.bandwidth_up);
+ gw.bandwidth_down = htonl(down);
+ gw.bandwidth_up = htonl(up);
+ batadv_tvlv_container_register(bat_priv, BATADV_TVLV_GW, 1,
+ &gw, sizeof(gw));
+ break;
+ }
+}
+
+/**
+ * batadv_gw_tvlv_ogm_handler_v1() - process incoming gateway tvlv container
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @orig: the orig_node of the ogm
+ * @flags: flags indicating the tvlv state (see batadv_tvlv_handler_flags)
+ * @tvlv_value: tvlv buffer containing the gateway data
+ * @tvlv_value_len: tvlv buffer length
+ */
+static void batadv_gw_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig,
+ u8 flags,
+ void *tvlv_value, u16 tvlv_value_len)
+{
+ struct batadv_tvlv_gateway_data gateway, *gateway_ptr;
+
+ /* only fetch the tvlv value if the handler wasn't called via the
+ * CIFNOTFND flag and if there is data to fetch
+ */
+ if (flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND ||
+ tvlv_value_len < sizeof(gateway)) {
+ gateway.bandwidth_down = 0;
+ gateway.bandwidth_up = 0;
+ } else {
+ gateway_ptr = tvlv_value;
+ gateway.bandwidth_down = gateway_ptr->bandwidth_down;
+ gateway.bandwidth_up = gateway_ptr->bandwidth_up;
+ if (gateway.bandwidth_down == 0 ||
+ gateway.bandwidth_up == 0) {
+ gateway.bandwidth_down = 0;
+ gateway.bandwidth_up = 0;
+ }
+ }
+
+ batadv_gw_node_update(bat_priv, orig, &gateway);
+
+ /* restart gateway selection */
+ if (gateway.bandwidth_down != 0 &&
+ atomic_read(&bat_priv->gw.mode) == BATADV_GW_MODE_CLIENT)
+ batadv_gw_check_election(bat_priv, orig);
+}
+
+/**
+ * batadv_gw_init() - initialise the gateway handling internals
+ * @bat_priv: the bat priv with all the mesh interface information
+ */
+void batadv_gw_init(struct batadv_priv *bat_priv)
+{
+ if (bat_priv->algo_ops->gw.init_sel_class)
+ bat_priv->algo_ops->gw.init_sel_class(bat_priv);
+ else
+ atomic_set(&bat_priv->gw.sel_class, 1);
+
+ batadv_tvlv_handler_register(bat_priv, batadv_gw_tvlv_ogm_handler_v1,
+ NULL, NULL, BATADV_TVLV_GW, 1,
+ BATADV_TVLV_HANDLER_OGM_CIFNOTFND);
+}
+
+/**
+ * batadv_gw_free() - free the gateway handling internals
+ * @bat_priv: the bat priv with all the mesh interface information
+ */
+void batadv_gw_free(struct batadv_priv *bat_priv)
+{
+ batadv_tvlv_container_unregister(bat_priv, BATADV_TVLV_GW, 1);
+ batadv_tvlv_handler_unregister(bat_priv, BATADV_TVLV_GW, 1);
+}
diff --git a/net/batman-adv/gateway_common.h b/net/batman-adv/gateway_common.h
new file mode 100644
index 000000000000..5d097d6a1dd9
--- /dev/null
+++ b/net/batman-adv/gateway_common.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner
+ */
+
+#ifndef _NET_BATMAN_ADV_GATEWAY_COMMON_H_
+#define _NET_BATMAN_ADV_GATEWAY_COMMON_H_
+
+#include "main.h"
+
+/**
+ * enum batadv_bandwidth_units - bandwidth unit types
+ */
+enum batadv_bandwidth_units {
+ /** @BATADV_BW_UNIT_KBIT: unit type kbit */
+ BATADV_BW_UNIT_KBIT,
+
+ /** @BATADV_BW_UNIT_MBIT: unit type mbit */
+ BATADV_BW_UNIT_MBIT,
+};
+
+#define BATADV_GW_MODE_OFF_NAME "off"
+#define BATADV_GW_MODE_CLIENT_NAME "client"
+#define BATADV_GW_MODE_SERVER_NAME "server"
+
+void batadv_gw_tvlv_container_update(struct batadv_priv *bat_priv);
+void batadv_gw_init(struct batadv_priv *bat_priv);
+void batadv_gw_free(struct batadv_priv *bat_priv);
+
+#endif /* _NET_BATMAN_ADV_GATEWAY_COMMON_H_ */
diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c
new file mode 100644
index 000000000000..5113f879736b
--- /dev/null
+++ b/net/batman-adv/hard-interface.c
@@ -0,0 +1,1016 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner, Simon Wunderlich
+ */
+
+#include "hard-interface.h"
+#include "main.h"
+
+#include <linux/atomic.h>
+#include <linux/byteorder/generic.h>
+#include <linux/compiler.h>
+#include <linux/container_of.h>
+#include <linux/errno.h>
+#include <linux/gfp.h>
+#include <linux/if.h>
+#include <linux/if_arp.h>
+#include <linux/if_ether.h>
+#include <linux/kref.h>
+#include <linux/limits.h>
+#include <linux/list.h>
+#include <linux/minmax.h>
+#include <linux/mutex.h>
+#include <linux/netdevice.h>
+#include <linux/notifier.h>
+#include <linux/printk.h>
+#include <linux/rculist.h>
+#include <linux/rtnetlink.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <net/net_namespace.h>
+#include <net/rtnetlink.h>
+#include <uapi/linux/batadv_packet.h>
+
+#include "bat_v.h"
+#include "bridge_loop_avoidance.h"
+#include "distributed-arp-table.h"
+#include "gateway_client.h"
+#include "log.h"
+#include "mesh-interface.h"
+#include "originator.h"
+#include "send.h"
+#include "translation-table.h"
+
+/**
+ * batadv_hardif_release() - release hard interface from lists and queue for
+ * free after rcu grace period
+ * @ref: kref pointer of the hard interface
+ */
+void batadv_hardif_release(struct kref *ref)
+{
+ struct batadv_hard_iface *hard_iface;
+
+ hard_iface = container_of(ref, struct batadv_hard_iface, refcount);
+ netdev_put(hard_iface->net_dev, &hard_iface->dev_tracker);
+
+ kfree_rcu(hard_iface, rcu);
+}
+
+/**
+ * batadv_hardif_get_by_netdev() - Get hard interface object of a net_device
+ * @net_dev: net_device to search for
+ *
+ * Return: batadv_hard_iface of net_dev (with increased refcnt), NULL on errors
+ */
+struct batadv_hard_iface *
+batadv_hardif_get_by_netdev(const struct net_device *net_dev)
+{
+ struct batadv_hard_iface *hard_iface;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) {
+ if (hard_iface->net_dev == net_dev &&
+ kref_get_unless_zero(&hard_iface->refcount))
+ goto out;
+ }
+
+ hard_iface = NULL;
+
+out:
+ rcu_read_unlock();
+ return hard_iface;
+}
+
+/**
+ * batadv_getlink_net() - return link net namespace (of use fallback)
+ * @netdev: net_device to check
+ * @fallback_net: return in case get_link_net is not available for @netdev
+ *
+ * Return: result of rtnl_link_ops->get_link_net or @fallback_net
+ */
+static struct net *batadv_getlink_net(const struct net_device *netdev,
+ struct net *fallback_net)
+{
+ if (!netdev->rtnl_link_ops)
+ return fallback_net;
+
+ if (!netdev->rtnl_link_ops->get_link_net)
+ return fallback_net;
+
+ return netdev->rtnl_link_ops->get_link_net(netdev);
+}
+
+/**
+ * batadv_mutual_parents() - check if two devices are each others parent
+ * @dev1: 1st net dev
+ * @net1: 1st devices netns
+ * @dev2: 2nd net dev
+ * @net2: 2nd devices netns
+ *
+ * veth devices come in pairs and each is the parent of the other!
+ *
+ * Return: true if the devices are each others parent, otherwise false
+ */
+static bool batadv_mutual_parents(const struct net_device *dev1,
+ struct net *net1,
+ const struct net_device *dev2,
+ struct net *net2)
+{
+ int dev1_parent_iflink = dev_get_iflink(dev1);
+ int dev2_parent_iflink = dev_get_iflink(dev2);
+ const struct net *dev1_parent_net;
+ const struct net *dev2_parent_net;
+
+ dev1_parent_net = batadv_getlink_net(dev1, net1);
+ dev2_parent_net = batadv_getlink_net(dev2, net2);
+
+ if (!dev1_parent_iflink || !dev2_parent_iflink)
+ return false;
+
+ return (dev1_parent_iflink == dev2->ifindex) &&
+ (dev2_parent_iflink == dev1->ifindex) &&
+ net_eq(dev1_parent_net, net2) &&
+ net_eq(dev2_parent_net, net1);
+}
+
+/**
+ * batadv_is_on_batman_iface() - check if a device is a batman iface descendant
+ * @net_dev: the device to check
+ *
+ * If the user creates any virtual device on top of a batman-adv interface, it
+ * is important to prevent this new interface from being used to create a new
+ * mesh network (this behaviour would lead to a batman-over-batman
+ * configuration). This function recursively checks all the fathers of the
+ * device passed as argument looking for a batman-adv mesh interface.
+ *
+ * Return: true if the device is descendant of a batman-adv mesh interface (or
+ * if it is a batman-adv interface itself), false otherwise
+ */
+static bool batadv_is_on_batman_iface(const struct net_device *net_dev)
+{
+ struct net *net = dev_net(net_dev);
+ struct net_device *parent_dev;
+ struct net *parent_net;
+ int iflink;
+ bool ret;
+
+ /* check if this is a batman-adv mesh interface */
+ if (batadv_meshif_is_valid(net_dev))
+ return true;
+
+ iflink = dev_get_iflink(net_dev);
+ if (iflink == 0)
+ return false;
+
+ parent_net = batadv_getlink_net(net_dev, net);
+
+ /* iflink to itself, most likely physical device */
+ if (net == parent_net && iflink == net_dev->ifindex)
+ return false;
+
+ /* recurse over the parent device */
+ parent_dev = __dev_get_by_index((struct net *)parent_net, iflink);
+ if (!parent_dev) {
+ pr_warn("Cannot find parent device. Skipping batadv-on-batadv check for %s\n",
+ net_dev->name);
+ return false;
+ }
+
+ if (batadv_mutual_parents(net_dev, net, parent_dev, parent_net))
+ return false;
+
+ ret = batadv_is_on_batman_iface(parent_dev);
+
+ return ret;
+}
+
+static bool batadv_is_valid_iface(const struct net_device *net_dev)
+{
+ if (net_dev->flags & IFF_LOOPBACK)
+ return false;
+
+ if (net_dev->type != ARPHRD_ETHER)
+ return false;
+
+ if (net_dev->addr_len != ETH_ALEN)
+ return false;
+
+ /* no batman over batman */
+ if (batadv_is_on_batman_iface(net_dev))
+ return false;
+
+ return true;
+}
+
+/**
+ * batadv_get_real_netdevice() - check if the given netdev struct is a virtual
+ * interface on top of another 'real' interface
+ * @netdev: the device to check
+ *
+ * Callers must hold the rtnl semaphore. You may want batadv_get_real_netdev()
+ * instead of this.
+ *
+ * Return: the 'real' net device or the original net device and NULL in case
+ * of an error.
+ */
+static struct net_device *batadv_get_real_netdevice(struct net_device *netdev)
+{
+ struct batadv_hard_iface *hard_iface = NULL;
+ struct net_device *real_netdev = NULL;
+ struct net *real_net;
+ struct net *net;
+ int iflink;
+
+ ASSERT_RTNL();
+
+ if (!netdev)
+ return NULL;
+
+ iflink = dev_get_iflink(netdev);
+ if (iflink == 0) {
+ dev_hold(netdev);
+ return netdev;
+ }
+
+ hard_iface = batadv_hardif_get_by_netdev(netdev);
+ if (!hard_iface || !hard_iface->mesh_iface)
+ goto out;
+
+ net = dev_net(hard_iface->mesh_iface);
+ real_net = batadv_getlink_net(netdev, net);
+
+ /* iflink to itself, most likely physical device */
+ if (net == real_net && netdev->ifindex == iflink) {
+ real_netdev = netdev;
+ dev_hold(real_netdev);
+ goto out;
+ }
+
+ real_netdev = dev_get_by_index(real_net, iflink);
+
+out:
+ batadv_hardif_put(hard_iface);
+ return real_netdev;
+}
+
+/**
+ * batadv_get_real_netdev() - check if the given net_device struct is a virtual
+ * interface on top of another 'real' interface
+ * @net_device: the device to check
+ *
+ * Return: the 'real' net device or the original net device and NULL in case
+ * of an error.
+ */
+struct net_device *batadv_get_real_netdev(struct net_device *net_device)
+{
+ struct net_device *real_netdev;
+
+ rtnl_lock();
+ real_netdev = batadv_get_real_netdevice(net_device);
+ rtnl_unlock();
+
+ return real_netdev;
+}
+
+/**
+ * batadv_is_wext_netdev() - check if the given net_device struct is a
+ * wext wifi interface
+ * @net_device: the device to check
+ *
+ * Return: true if the net device is a wext wireless device, false
+ * otherwise.
+ */
+static bool batadv_is_wext_netdev(struct net_device *net_device)
+{
+ if (!net_device)
+ return false;
+
+#ifdef CONFIG_WIRELESS_EXT
+ /* pre-cfg80211 drivers have to implement WEXT, so it is possible to
+ * check for wireless_handlers != NULL
+ */
+ if (net_device->wireless_handlers)
+ return true;
+#endif
+
+ return false;
+}
+
+/**
+ * batadv_is_cfg80211_netdev() - check if the given net_device struct is a
+ * cfg80211 wifi interface
+ * @net_device: the device to check
+ *
+ * Return: true if the net device is a cfg80211 wireless device, false
+ * otherwise.
+ */
+static bool batadv_is_cfg80211_netdev(struct net_device *net_device)
+{
+ if (!net_device)
+ return false;
+
+#if IS_ENABLED(CONFIG_CFG80211)
+ /* cfg80211 drivers have to set ieee80211_ptr */
+ if (net_device->ieee80211_ptr)
+ return true;
+#endif
+
+ return false;
+}
+
+/**
+ * batadv_wifi_flags_evaluate() - calculate wifi flags for net_device
+ * @net_device: the device to check
+ *
+ * Return: batadv_hard_iface_wifi_flags flags of the device
+ */
+static u32 batadv_wifi_flags_evaluate(struct net_device *net_device)
+{
+ u32 wifi_flags = 0;
+ struct net_device *real_netdev;
+
+ if (batadv_is_wext_netdev(net_device))
+ wifi_flags |= BATADV_HARDIF_WIFI_WEXT_DIRECT;
+
+ if (batadv_is_cfg80211_netdev(net_device))
+ wifi_flags |= BATADV_HARDIF_WIFI_CFG80211_DIRECT;
+
+ real_netdev = batadv_get_real_netdevice(net_device);
+ if (!real_netdev)
+ return wifi_flags;
+
+ if (real_netdev == net_device)
+ goto out;
+
+ if (batadv_is_wext_netdev(real_netdev))
+ wifi_flags |= BATADV_HARDIF_WIFI_WEXT_INDIRECT;
+
+ if (batadv_is_cfg80211_netdev(real_netdev))
+ wifi_flags |= BATADV_HARDIF_WIFI_CFG80211_INDIRECT;
+
+out:
+ dev_put(real_netdev);
+ return wifi_flags;
+}
+
+/**
+ * batadv_is_cfg80211_hardif() - check if the given hardif is a cfg80211 wifi
+ * interface
+ * @hard_iface: the device to check
+ *
+ * Return: true if the net device is a cfg80211 wireless device, false
+ * otherwise.
+ */
+bool batadv_is_cfg80211_hardif(struct batadv_hard_iface *hard_iface)
+{
+ u32 allowed_flags = 0;
+
+ allowed_flags |= BATADV_HARDIF_WIFI_CFG80211_DIRECT;
+ allowed_flags |= BATADV_HARDIF_WIFI_CFG80211_INDIRECT;
+
+ return !!(hard_iface->wifi_flags & allowed_flags);
+}
+
+/**
+ * batadv_is_wifi_hardif() - check if the given hardif is a wifi interface
+ * @hard_iface: the device to check
+ *
+ * Return: true if the net device is a 802.11 wireless device, false otherwise.
+ */
+bool batadv_is_wifi_hardif(struct batadv_hard_iface *hard_iface)
+{
+ if (!hard_iface)
+ return false;
+
+ return hard_iface->wifi_flags != 0;
+}
+
+/**
+ * batadv_hardif_no_broadcast() - check whether (re)broadcast is necessary
+ * @if_outgoing: the outgoing interface checked and considered for (re)broadcast
+ * @orig_addr: the originator of this packet
+ * @orig_neigh: originator address of the forwarder we just got the packet from
+ * (NULL if we originated)
+ *
+ * Checks whether a packet needs to be (re)broadcasted on the given interface.
+ *
+ * Return:
+ * BATADV_HARDIF_BCAST_NORECIPIENT: No neighbor on interface
+ * BATADV_HARDIF_BCAST_DUPFWD: Just one neighbor, but it is the forwarder
+ * BATADV_HARDIF_BCAST_DUPORIG: Just one neighbor, but it is the originator
+ * BATADV_HARDIF_BCAST_OK: Several neighbors, must broadcast
+ */
+int batadv_hardif_no_broadcast(struct batadv_hard_iface *if_outgoing,
+ u8 *orig_addr, u8 *orig_neigh)
+{
+ struct batadv_hardif_neigh_node *hardif_neigh;
+ struct hlist_node *first;
+ int ret = BATADV_HARDIF_BCAST_OK;
+
+ rcu_read_lock();
+
+ /* 0 neighbors -> no (re)broadcast */
+ first = rcu_dereference(hlist_first_rcu(&if_outgoing->neigh_list));
+ if (!first) {
+ ret = BATADV_HARDIF_BCAST_NORECIPIENT;
+ goto out;
+ }
+
+ /* >1 neighbors -> (re)broadcast */
+ if (rcu_dereference(hlist_next_rcu(first)))
+ goto out;
+
+ hardif_neigh = hlist_entry(first, struct batadv_hardif_neigh_node,
+ list);
+
+ /* 1 neighbor, is the originator -> no rebroadcast */
+ if (orig_addr && batadv_compare_eth(hardif_neigh->orig, orig_addr)) {
+ ret = BATADV_HARDIF_BCAST_DUPORIG;
+ /* 1 neighbor, is the one we received from -> no rebroadcast */
+ } else if (orig_neigh &&
+ batadv_compare_eth(hardif_neigh->orig, orig_neigh)) {
+ ret = BATADV_HARDIF_BCAST_DUPFWD;
+ }
+
+out:
+ rcu_read_unlock();
+ return ret;
+}
+
+static struct batadv_hard_iface *
+batadv_hardif_get_active(struct net_device *mesh_iface)
+{
+ struct batadv_hard_iface *hard_iface;
+ struct list_head *iter;
+
+ rcu_read_lock();
+ netdev_for_each_lower_private_rcu(mesh_iface, hard_iface, iter) {
+ if (hard_iface->if_status == BATADV_IF_ACTIVE &&
+ kref_get_unless_zero(&hard_iface->refcount))
+ goto out;
+ }
+
+ hard_iface = NULL;
+
+out:
+ rcu_read_unlock();
+ return hard_iface;
+}
+
+static void batadv_primary_if_update_addr(struct batadv_priv *bat_priv,
+ struct batadv_hard_iface *oldif)
+{
+ struct batadv_hard_iface *primary_if;
+
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (!primary_if)
+ goto out;
+
+ batadv_dat_init_own_addr(bat_priv, primary_if);
+ batadv_bla_update_orig_address(bat_priv, primary_if, oldif);
+out:
+ batadv_hardif_put(primary_if);
+}
+
+static void batadv_primary_if_select(struct batadv_priv *bat_priv,
+ struct batadv_hard_iface *new_hard_iface)
+{
+ struct batadv_hard_iface *curr_hard_iface;
+
+ ASSERT_RTNL();
+
+ if (new_hard_iface)
+ kref_get(&new_hard_iface->refcount);
+
+ curr_hard_iface = rcu_replace_pointer(bat_priv->primary_if,
+ new_hard_iface, 1);
+
+ if (!new_hard_iface)
+ goto out;
+
+ bat_priv->algo_ops->iface.primary_set(new_hard_iface);
+ batadv_primary_if_update_addr(bat_priv, curr_hard_iface);
+
+out:
+ batadv_hardif_put(curr_hard_iface);
+}
+
+static bool
+batadv_hardif_is_iface_up(const struct batadv_hard_iface *hard_iface)
+{
+ if (hard_iface->net_dev->flags & IFF_UP)
+ return true;
+
+ return false;
+}
+
+static void batadv_check_known_mac_addr(const struct batadv_hard_iface *hard_iface)
+{
+ struct net_device *mesh_iface = hard_iface->mesh_iface;
+ const struct batadv_hard_iface *tmp_hard_iface;
+ struct list_head *iter;
+
+ if (!mesh_iface)
+ return;
+
+ netdev_for_each_lower_private(mesh_iface, tmp_hard_iface, iter) {
+ if (tmp_hard_iface == hard_iface)
+ continue;
+
+ if (tmp_hard_iface->if_status == BATADV_IF_NOT_IN_USE)
+ continue;
+
+ if (!batadv_compare_eth(tmp_hard_iface->net_dev->dev_addr,
+ hard_iface->net_dev->dev_addr))
+ continue;
+
+ pr_warn("The newly added mac address (%pM) already exists on: %s\n",
+ hard_iface->net_dev->dev_addr, tmp_hard_iface->net_dev->name);
+ pr_warn("It is strongly recommended to keep mac addresses unique to avoid problems!\n");
+ }
+}
+
+/**
+ * batadv_hardif_recalc_extra_skbroom() - Recalculate skbuff extra head/tailroom
+ * @mesh_iface: netdev struct of the mesh interface
+ */
+static void batadv_hardif_recalc_extra_skbroom(struct net_device *mesh_iface)
+{
+ const struct batadv_hard_iface *hard_iface;
+ unsigned short lower_header_len = ETH_HLEN;
+ unsigned short lower_headroom = 0;
+ unsigned short lower_tailroom = 0;
+ unsigned short needed_headroom;
+ struct list_head *iter;
+
+ rcu_read_lock();
+ netdev_for_each_lower_private_rcu(mesh_iface, hard_iface, iter) {
+ if (hard_iface->if_status == BATADV_IF_NOT_IN_USE)
+ continue;
+
+ lower_header_len = max_t(unsigned short, lower_header_len,
+ hard_iface->net_dev->hard_header_len);
+
+ lower_headroom = max_t(unsigned short, lower_headroom,
+ hard_iface->net_dev->needed_headroom);
+
+ lower_tailroom = max_t(unsigned short, lower_tailroom,
+ hard_iface->net_dev->needed_tailroom);
+ }
+ rcu_read_unlock();
+
+ needed_headroom = lower_headroom + (lower_header_len - ETH_HLEN);
+ needed_headroom += batadv_max_header_len();
+
+ /* fragmentation headers don't strip the unicast/... header */
+ needed_headroom += sizeof(struct batadv_frag_packet);
+
+ mesh_iface->needed_headroom = needed_headroom;
+ mesh_iface->needed_tailroom = lower_tailroom;
+}
+
+/**
+ * batadv_hardif_min_mtu() - Calculate maximum MTU for mesh interface
+ * @mesh_iface: netdev struct of the mesh interface
+ *
+ * Return: MTU for the mesh-interface (limited by the minimal MTU of all active
+ * slave interfaces)
+ */
+int batadv_hardif_min_mtu(struct net_device *mesh_iface)
+{
+ struct batadv_priv *bat_priv = netdev_priv(mesh_iface);
+ const struct batadv_hard_iface *hard_iface;
+ struct list_head *iter;
+ int min_mtu = INT_MAX;
+
+ rcu_read_lock();
+ netdev_for_each_lower_private_rcu(mesh_iface, hard_iface, iter) {
+ if (hard_iface->if_status != BATADV_IF_ACTIVE &&
+ hard_iface->if_status != BATADV_IF_TO_BE_ACTIVATED)
+ continue;
+
+ min_mtu = min_t(int, hard_iface->net_dev->mtu, min_mtu);
+ }
+ rcu_read_unlock();
+
+ if (atomic_read(&bat_priv->fragmentation) == 0)
+ goto out;
+
+ /* with fragmentation enabled the maximum size of internally generated
+ * packets such as translation table exchanges or tvlv containers, etc
+ * has to be calculated
+ */
+ min_mtu = min_t(int, min_mtu, BATADV_FRAG_MAX_FRAG_SIZE);
+ min_mtu -= sizeof(struct batadv_frag_packet);
+ min_mtu *= BATADV_FRAG_MAX_FRAGMENTS;
+
+out:
+ /* report to the other components the maximum amount of bytes that
+ * batman-adv can send over the wire (without considering the payload
+ * overhead). For example, this value is used by TT to compute the
+ * maximum local table size
+ */
+ atomic_set(&bat_priv->packet_size_max, min_mtu);
+
+ /* the real mesh-interface MTU is computed by removing the payload
+ * overhead from the maximum amount of bytes that was just computed.
+ */
+ return min_t(int, min_mtu - batadv_max_header_len(), BATADV_MAX_MTU);
+}
+
+/**
+ * batadv_update_min_mtu() - Adjusts the MTU if a new interface with a smaller
+ * MTU appeared
+ * @mesh_iface: netdev struct of the mesh interface
+ */
+void batadv_update_min_mtu(struct net_device *mesh_iface)
+{
+ struct batadv_priv *bat_priv = netdev_priv(mesh_iface);
+ int limit_mtu;
+ int mtu;
+
+ mtu = batadv_hardif_min_mtu(mesh_iface);
+
+ if (bat_priv->mtu_set_by_user)
+ limit_mtu = bat_priv->mtu_set_by_user;
+ else
+ limit_mtu = ETH_DATA_LEN;
+
+ mtu = min(mtu, limit_mtu);
+ dev_set_mtu(mesh_iface, mtu);
+
+ /* Check if the local translate table should be cleaned up to match a
+ * new (and smaller) MTU.
+ */
+ batadv_tt_local_resize_to_mtu(mesh_iface);
+}
+
+static void
+batadv_hardif_activate_interface(struct batadv_hard_iface *hard_iface)
+{
+ struct batadv_priv *bat_priv;
+ struct batadv_hard_iface *primary_if = NULL;
+
+ if (hard_iface->if_status != BATADV_IF_INACTIVE)
+ goto out;
+
+ bat_priv = netdev_priv(hard_iface->mesh_iface);
+
+ bat_priv->algo_ops->iface.update_mac(hard_iface);
+ hard_iface->if_status = BATADV_IF_TO_BE_ACTIVATED;
+
+ /* the first active interface becomes our primary interface or
+ * the next active interface after the old primary interface was removed
+ */
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (!primary_if)
+ batadv_primary_if_select(bat_priv, hard_iface);
+
+ batadv_info(hard_iface->mesh_iface, "Interface activated: %s\n",
+ hard_iface->net_dev->name);
+
+ batadv_update_min_mtu(hard_iface->mesh_iface);
+
+ if (bat_priv->algo_ops->iface.activate)
+ bat_priv->algo_ops->iface.activate(hard_iface);
+
+out:
+ batadv_hardif_put(primary_if);
+}
+
+static void
+batadv_hardif_deactivate_interface(struct batadv_hard_iface *hard_iface)
+{
+ if (hard_iface->if_status != BATADV_IF_ACTIVE &&
+ hard_iface->if_status != BATADV_IF_TO_BE_ACTIVATED)
+ return;
+
+ hard_iface->if_status = BATADV_IF_INACTIVE;
+
+ batadv_info(hard_iface->mesh_iface, "Interface deactivated: %s\n",
+ hard_iface->net_dev->name);
+
+ batadv_update_min_mtu(hard_iface->mesh_iface);
+}
+
+/**
+ * batadv_hardif_enable_interface() - Enslave hard interface to mesh interface
+ * @hard_iface: hard interface to add to mesh interface
+ * @mesh_iface: netdev struct of the mesh interface
+ *
+ * Return: 0 on success or negative error number in case of failure
+ */
+int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface,
+ struct net_device *mesh_iface)
+{
+ struct batadv_priv *bat_priv;
+ __be16 ethertype = htons(ETH_P_BATMAN);
+ int max_header_len = batadv_max_header_len();
+ unsigned int required_mtu;
+ unsigned int hardif_mtu;
+ int ret;
+
+ hardif_mtu = READ_ONCE(hard_iface->net_dev->mtu);
+ required_mtu = READ_ONCE(mesh_iface->mtu) + max_header_len;
+
+ if (hardif_mtu < ETH_MIN_MTU + max_header_len)
+ return -EINVAL;
+
+ if (hard_iface->if_status != BATADV_IF_NOT_IN_USE)
+ goto out;
+
+ kref_get(&hard_iface->refcount);
+
+ netdev_hold(mesh_iface, &hard_iface->meshif_dev_tracker, GFP_ATOMIC);
+ hard_iface->mesh_iface = mesh_iface;
+ bat_priv = netdev_priv(hard_iface->mesh_iface);
+
+ ret = netdev_master_upper_dev_link(hard_iface->net_dev,
+ mesh_iface, hard_iface, NULL, NULL);
+ if (ret)
+ goto err_dev;
+
+ ret = bat_priv->algo_ops->iface.enable(hard_iface);
+ if (ret < 0)
+ goto err_upper;
+
+ hard_iface->if_status = BATADV_IF_INACTIVE;
+
+ kref_get(&hard_iface->refcount);
+ hard_iface->batman_adv_ptype.type = ethertype;
+ hard_iface->batman_adv_ptype.func = batadv_batman_skb_recv;
+ hard_iface->batman_adv_ptype.dev = hard_iface->net_dev;
+ dev_add_pack(&hard_iface->batman_adv_ptype);
+
+ batadv_info(hard_iface->mesh_iface, "Adding interface: %s\n",
+ hard_iface->net_dev->name);
+
+ if (atomic_read(&bat_priv->fragmentation) &&
+ hardif_mtu < required_mtu)
+ batadv_info(hard_iface->mesh_iface,
+ "The MTU of interface %s is too small (%i) to handle the transport of batman-adv packets. Packets going over this interface will be fragmented on layer2 which could impact the performance. Setting the MTU to %i would solve the problem.\n",
+ hard_iface->net_dev->name, hardif_mtu,
+ required_mtu);
+
+ if (!atomic_read(&bat_priv->fragmentation) &&
+ hardif_mtu < required_mtu)
+ batadv_info(hard_iface->mesh_iface,
+ "The MTU of interface %s is too small (%i) to handle the transport of batman-adv packets. If you experience problems getting traffic through try increasing the MTU to %i.\n",
+ hard_iface->net_dev->name, hardif_mtu,
+ required_mtu);
+
+ batadv_check_known_mac_addr(hard_iface);
+
+ if (batadv_hardif_is_iface_up(hard_iface))
+ batadv_hardif_activate_interface(hard_iface);
+ else
+ batadv_err(hard_iface->mesh_iface,
+ "Not using interface %s (retrying later): interface not active\n",
+ hard_iface->net_dev->name);
+
+ batadv_hardif_recalc_extra_skbroom(mesh_iface);
+
+ if (bat_priv->algo_ops->iface.enabled)
+ bat_priv->algo_ops->iface.enabled(hard_iface);
+
+out:
+ return 0;
+
+err_upper:
+ netdev_upper_dev_unlink(hard_iface->net_dev, mesh_iface);
+err_dev:
+ hard_iface->mesh_iface = NULL;
+ netdev_put(mesh_iface, &hard_iface->meshif_dev_tracker);
+ batadv_hardif_put(hard_iface);
+ return ret;
+}
+
+/**
+ * batadv_hardif_cnt() - get number of interfaces enslaved to mesh interface
+ * @mesh_iface: mesh interface to check
+ *
+ * This function is only using RCU for locking - the result can therefore be
+ * off when another function is modifying the list at the same time. The
+ * caller can use the rtnl_lock to make sure that the count is accurate.
+ *
+ * Return: number of connected/enslaved hard interfaces
+ */
+static size_t batadv_hardif_cnt(struct net_device *mesh_iface)
+{
+ struct batadv_hard_iface *hard_iface;
+ struct list_head *iter;
+ size_t count = 0;
+
+ rcu_read_lock();
+ netdev_for_each_lower_private_rcu(mesh_iface, hard_iface, iter)
+ count++;
+ rcu_read_unlock();
+
+ return count;
+}
+
+/**
+ * batadv_hardif_disable_interface() - Remove hard interface from mesh interface
+ * @hard_iface: hard interface to be removed
+ */
+void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface)
+{
+ struct batadv_priv *bat_priv = netdev_priv(hard_iface->mesh_iface);
+ struct batadv_hard_iface *primary_if = NULL;
+
+ batadv_hardif_deactivate_interface(hard_iface);
+
+ if (hard_iface->if_status != BATADV_IF_INACTIVE)
+ goto out;
+
+ batadv_info(hard_iface->mesh_iface, "Removing interface: %s\n",
+ hard_iface->net_dev->name);
+ dev_remove_pack(&hard_iface->batman_adv_ptype);
+ batadv_hardif_put(hard_iface);
+
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (hard_iface == primary_if) {
+ struct batadv_hard_iface *new_if;
+
+ new_if = batadv_hardif_get_active(hard_iface->mesh_iface);
+ batadv_primary_if_select(bat_priv, new_if);
+
+ batadv_hardif_put(new_if);
+ }
+
+ bat_priv->algo_ops->iface.disable(hard_iface);
+ hard_iface->if_status = BATADV_IF_NOT_IN_USE;
+
+ /* delete all references to this hard_iface */
+ batadv_purge_orig_ref(bat_priv);
+ batadv_purge_outstanding_packets(bat_priv, hard_iface);
+ netdev_put(hard_iface->mesh_iface, &hard_iface->meshif_dev_tracker);
+
+ netdev_upper_dev_unlink(hard_iface->net_dev, hard_iface->mesh_iface);
+ batadv_hardif_recalc_extra_skbroom(hard_iface->mesh_iface);
+
+ /* nobody uses this interface anymore */
+ if (batadv_hardif_cnt(hard_iface->mesh_iface) <= 1)
+ batadv_gw_check_client_stop(bat_priv);
+
+ hard_iface->mesh_iface = NULL;
+ batadv_hardif_put(hard_iface);
+
+out:
+ batadv_hardif_put(primary_if);
+}
+
+static struct batadv_hard_iface *
+batadv_hardif_add_interface(struct net_device *net_dev)
+{
+ struct batadv_hard_iface *hard_iface;
+
+ ASSERT_RTNL();
+
+ if (!batadv_is_valid_iface(net_dev))
+ return NULL;
+
+ hard_iface = kzalloc(sizeof(*hard_iface), GFP_ATOMIC);
+ if (!hard_iface)
+ return NULL;
+
+ netdev_hold(net_dev, &hard_iface->dev_tracker, GFP_ATOMIC);
+ hard_iface->net_dev = net_dev;
+
+ hard_iface->mesh_iface = NULL;
+ hard_iface->if_status = BATADV_IF_NOT_IN_USE;
+
+ INIT_LIST_HEAD(&hard_iface->list);
+ INIT_HLIST_HEAD(&hard_iface->neigh_list);
+
+ mutex_init(&hard_iface->bat_iv.ogm_buff_mutex);
+ spin_lock_init(&hard_iface->neigh_list_lock);
+ kref_init(&hard_iface->refcount);
+
+ hard_iface->num_bcasts = BATADV_NUM_BCASTS_DEFAULT;
+ hard_iface->wifi_flags = batadv_wifi_flags_evaluate(net_dev);
+ if (batadv_is_wifi_hardif(hard_iface))
+ hard_iface->num_bcasts = BATADV_NUM_BCASTS_WIRELESS;
+
+ atomic_set(&hard_iface->hop_penalty, 0);
+
+ batadv_v_hardif_init(hard_iface);
+
+ kref_get(&hard_iface->refcount);
+ list_add_tail_rcu(&hard_iface->list, &batadv_hardif_list);
+ batadv_hardif_generation++;
+
+ return hard_iface;
+}
+
+static void batadv_hardif_remove_interface(struct batadv_hard_iface *hard_iface)
+{
+ ASSERT_RTNL();
+
+ /* first deactivate interface */
+ if (hard_iface->if_status != BATADV_IF_NOT_IN_USE)
+ batadv_hardif_disable_interface(hard_iface);
+
+ if (hard_iface->if_status != BATADV_IF_NOT_IN_USE)
+ return;
+
+ hard_iface->if_status = BATADV_IF_TO_BE_REMOVED;
+ batadv_hardif_put(hard_iface);
+}
+
+/**
+ * batadv_hard_if_event_meshif() - Handle events for mesh interfaces
+ * @event: NETDEV_* event to handle
+ * @net_dev: net_device which generated an event
+ *
+ * Return: NOTIFY_* result
+ */
+static int batadv_hard_if_event_meshif(unsigned long event,
+ struct net_device *net_dev)
+{
+ struct batadv_priv *bat_priv;
+
+ switch (event) {
+ case NETDEV_REGISTER:
+ bat_priv = netdev_priv(net_dev);
+ batadv_meshif_create_vlan(bat_priv, BATADV_NO_FLAGS);
+ break;
+ }
+
+ return NOTIFY_DONE;
+}
+
+static int batadv_hard_if_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ struct net_device *net_dev = netdev_notifier_info_to_dev(ptr);
+ struct batadv_hard_iface *hard_iface;
+ struct batadv_hard_iface *primary_if = NULL;
+ struct batadv_priv *bat_priv;
+
+ if (batadv_meshif_is_valid(net_dev))
+ return batadv_hard_if_event_meshif(event, net_dev);
+
+ hard_iface = batadv_hardif_get_by_netdev(net_dev);
+ if (!hard_iface && (event == NETDEV_REGISTER ||
+ event == NETDEV_POST_TYPE_CHANGE))
+ hard_iface = batadv_hardif_add_interface(net_dev);
+
+ if (!hard_iface)
+ goto out;
+
+ switch (event) {
+ case NETDEV_UP:
+ batadv_hardif_activate_interface(hard_iface);
+ break;
+ case NETDEV_GOING_DOWN:
+ case NETDEV_DOWN:
+ batadv_hardif_deactivate_interface(hard_iface);
+ break;
+ case NETDEV_UNREGISTER:
+ case NETDEV_PRE_TYPE_CHANGE:
+ list_del_rcu(&hard_iface->list);
+ batadv_hardif_generation++;
+
+ batadv_hardif_remove_interface(hard_iface);
+ break;
+ case NETDEV_CHANGEMTU:
+ if (hard_iface->mesh_iface)
+ batadv_update_min_mtu(hard_iface->mesh_iface);
+ break;
+ case NETDEV_CHANGEADDR:
+ if (hard_iface->if_status == BATADV_IF_NOT_IN_USE)
+ goto hardif_put;
+
+ batadv_check_known_mac_addr(hard_iface);
+
+ bat_priv = netdev_priv(hard_iface->mesh_iface);
+ bat_priv->algo_ops->iface.update_mac(hard_iface);
+
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (!primary_if)
+ goto hardif_put;
+
+ if (hard_iface == primary_if)
+ batadv_primary_if_update_addr(bat_priv, NULL);
+ break;
+ case NETDEV_CHANGEUPPER:
+ hard_iface->wifi_flags = batadv_wifi_flags_evaluate(net_dev);
+ if (batadv_is_wifi_hardif(hard_iface))
+ hard_iface->num_bcasts = BATADV_NUM_BCASTS_WIRELESS;
+ break;
+ default:
+ break;
+ }
+
+hardif_put:
+ batadv_hardif_put(hard_iface);
+out:
+ batadv_hardif_put(primary_if);
+ return NOTIFY_DONE;
+}
+
+struct notifier_block batadv_hard_if_notifier = {
+ .notifier_call = batadv_hard_if_event,
+};
diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h
new file mode 100644
index 000000000000..9db8a310961e
--- /dev/null
+++ b/net/batman-adv/hard-interface.h
@@ -0,0 +1,121 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner, Simon Wunderlich
+ */
+
+#ifndef _NET_BATMAN_ADV_HARD_INTERFACE_H_
+#define _NET_BATMAN_ADV_HARD_INTERFACE_H_
+
+#include "main.h"
+
+#include <linux/compiler.h>
+#include <linux/kref.h>
+#include <linux/netdevice.h>
+#include <linux/rcupdate.h>
+#include <linux/stddef.h>
+#include <linux/types.h>
+
+/**
+ * enum batadv_hard_if_state - State of a hard interface
+ */
+enum batadv_hard_if_state {
+ /**
+ * @BATADV_IF_NOT_IN_USE: interface is not used as slave interface of a
+ * batman-adv mesh interface
+ */
+ BATADV_IF_NOT_IN_USE,
+
+ /**
+ * @BATADV_IF_TO_BE_REMOVED: interface will be removed from mesh
+ * interface
+ */
+ BATADV_IF_TO_BE_REMOVED,
+
+ /** @BATADV_IF_INACTIVE: interface is deactivated */
+ BATADV_IF_INACTIVE,
+
+ /** @BATADV_IF_ACTIVE: interface is used */
+ BATADV_IF_ACTIVE,
+
+ /** @BATADV_IF_TO_BE_ACTIVATED: interface is getting activated */
+ BATADV_IF_TO_BE_ACTIVATED,
+};
+
+/**
+ * enum batadv_hard_if_bcast - broadcast avoidance options
+ */
+enum batadv_hard_if_bcast {
+ /** @BATADV_HARDIF_BCAST_OK: Do broadcast on according hard interface */
+ BATADV_HARDIF_BCAST_OK = 0,
+
+ /**
+ * @BATADV_HARDIF_BCAST_NORECIPIENT: Broadcast not needed, there is no
+ * recipient
+ */
+ BATADV_HARDIF_BCAST_NORECIPIENT,
+
+ /**
+ * @BATADV_HARDIF_BCAST_DUPFWD: There is just the neighbor we got it
+ * from
+ */
+ BATADV_HARDIF_BCAST_DUPFWD,
+
+ /** @BATADV_HARDIF_BCAST_DUPORIG: There is just the originator */
+ BATADV_HARDIF_BCAST_DUPORIG,
+};
+
+extern struct notifier_block batadv_hard_if_notifier;
+
+struct net_device *batadv_get_real_netdev(struct net_device *net_device);
+bool batadv_is_cfg80211_hardif(struct batadv_hard_iface *hard_iface);
+bool batadv_is_wifi_hardif(struct batadv_hard_iface *hard_iface);
+struct batadv_hard_iface*
+batadv_hardif_get_by_netdev(const struct net_device *net_dev);
+int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface,
+ struct net_device *mesh_iface);
+void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface);
+int batadv_hardif_min_mtu(struct net_device *mesh_iface);
+void batadv_update_min_mtu(struct net_device *mesh_iface);
+void batadv_hardif_release(struct kref *ref);
+int batadv_hardif_no_broadcast(struct batadv_hard_iface *if_outgoing,
+ u8 *orig_addr, u8 *orig_neigh);
+
+/**
+ * batadv_hardif_put() - decrement the hard interface refcounter and possibly
+ * release it
+ * @hard_iface: the hard interface to free
+ */
+static inline void batadv_hardif_put(struct batadv_hard_iface *hard_iface)
+{
+ if (!hard_iface)
+ return;
+
+ kref_put(&hard_iface->refcount, batadv_hardif_release);
+}
+
+/**
+ * batadv_primary_if_get_selected() - Get reference to primary interface
+ * @bat_priv: the bat priv with all the mesh interface information
+ *
+ * Return: primary interface (with increased refcnt), otherwise NULL
+ */
+static inline struct batadv_hard_iface *
+batadv_primary_if_get_selected(struct batadv_priv *bat_priv)
+{
+ struct batadv_hard_iface *hard_iface;
+
+ rcu_read_lock();
+ hard_iface = rcu_dereference(bat_priv->primary_if);
+ if (!hard_iface)
+ goto out;
+
+ if (!kref_get_unless_zero(&hard_iface->refcount))
+ hard_iface = NULL;
+
+out:
+ rcu_read_unlock();
+ return hard_iface;
+}
+
+#endif /* _NET_BATMAN_ADV_HARD_INTERFACE_H_ */
diff --git a/net/batman-adv/hash.c b/net/batman-adv/hash.c
new file mode 100644
index 000000000000..8016e619787f
--- /dev/null
+++ b/net/batman-adv/hash.c
@@ -0,0 +1,84 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) B.A.T.M.A.N. contributors:
+ *
+ * Simon Wunderlich, Marek Lindner
+ */
+
+#include "hash.h"
+#include "main.h"
+
+#include <linux/gfp.h>
+#include <linux/lockdep.h>
+#include <linux/slab.h>
+
+/* clears the hash */
+static void batadv_hash_init(struct batadv_hashtable *hash)
+{
+ u32 i;
+
+ for (i = 0; i < hash->size; i++) {
+ INIT_HLIST_HEAD(&hash->table[i]);
+ spin_lock_init(&hash->list_locks[i]);
+ }
+
+ atomic_set(&hash->generation, 0);
+}
+
+/**
+ * batadv_hash_destroy() - Free only the hashtable and the hash itself
+ * @hash: hash object to destroy
+ */
+void batadv_hash_destroy(struct batadv_hashtable *hash)
+{
+ kfree(hash->list_locks);
+ kfree(hash->table);
+ kfree(hash);
+}
+
+/**
+ * batadv_hash_new() - Allocates and clears the hashtable
+ * @size: number of hash buckets to allocate
+ *
+ * Return: newly allocated hashtable, NULL on errors
+ */
+struct batadv_hashtable *batadv_hash_new(u32 size)
+{
+ struct batadv_hashtable *hash;
+
+ hash = kmalloc(sizeof(*hash), GFP_ATOMIC);
+ if (!hash)
+ return NULL;
+
+ hash->table = kmalloc_array(size, sizeof(*hash->table), GFP_ATOMIC);
+ if (!hash->table)
+ goto free_hash;
+
+ hash->list_locks = kmalloc_array(size, sizeof(*hash->list_locks),
+ GFP_ATOMIC);
+ if (!hash->list_locks)
+ goto free_table;
+
+ hash->size = size;
+ batadv_hash_init(hash);
+ return hash;
+
+free_table:
+ kfree(hash->table);
+free_hash:
+ kfree(hash);
+ return NULL;
+}
+
+/**
+ * batadv_hash_set_lock_class() - Set specific lockdep class for hash spinlocks
+ * @hash: hash object to modify
+ * @key: lockdep class key address
+ */
+void batadv_hash_set_lock_class(struct batadv_hashtable *hash,
+ struct lock_class_key *key)
+{
+ u32 i;
+
+ for (i = 0; i < hash->size; i++)
+ lockdep_set_class(&hash->list_locks[i], key);
+}
diff --git a/net/batman-adv/hash.h b/net/batman-adv/hash.h
new file mode 100644
index 000000000000..fb251c385a1b
--- /dev/null
+++ b/net/batman-adv/hash.h
@@ -0,0 +1,157 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) B.A.T.M.A.N. contributors:
+ *
+ * Simon Wunderlich, Marek Lindner
+ */
+
+#ifndef _NET_BATMAN_ADV_HASH_H_
+#define _NET_BATMAN_ADV_HASH_H_
+
+#include "main.h"
+
+#include <linux/atomic.h>
+#include <linux/compiler.h>
+#include <linux/list.h>
+#include <linux/lockdep.h>
+#include <linux/rculist.h>
+#include <linux/spinlock.h>
+#include <linux/stddef.h>
+#include <linux/types.h>
+
+/* callback to a compare function. should compare 2 element data for their
+ * keys
+ *
+ * Return: true if same and false if not same
+ */
+typedef bool (*batadv_hashdata_compare_cb)(const struct hlist_node *,
+ const void *);
+
+/* the hashfunction
+ *
+ * Return: an index based on the key in the data of the first argument and the
+ * size the second
+ */
+typedef u32 (*batadv_hashdata_choose_cb)(const void *, u32);
+typedef void (*batadv_hashdata_free_cb)(struct hlist_node *, void *);
+
+/**
+ * struct batadv_hashtable - Wrapper of simple hlist based hashtable
+ */
+struct batadv_hashtable {
+ /** @table: the hashtable itself with the buckets */
+ struct hlist_head *table;
+
+ /** @list_locks: spinlock for each hash list entry */
+ spinlock_t *list_locks;
+
+ /** @size: size of hashtable */
+ u32 size;
+
+ /** @generation: current (generation) sequence number */
+ atomic_t generation;
+};
+
+/* allocates and clears the hash */
+struct batadv_hashtable *batadv_hash_new(u32 size);
+
+/* set class key for all locks */
+void batadv_hash_set_lock_class(struct batadv_hashtable *hash,
+ struct lock_class_key *key);
+
+/* free only the hashtable and the hash itself. */
+void batadv_hash_destroy(struct batadv_hashtable *hash);
+
+/**
+ * batadv_hash_add() - adds data to the hashtable
+ * @hash: storage hash table
+ * @compare: callback to determine if 2 hash elements are identical
+ * @choose: callback calculating the hash index
+ * @data: data passed to the aforementioned callbacks as argument
+ * @data_node: to be added element
+ *
+ * Return: 0 on success, 1 if the element already is in the hash
+ * and -1 on error.
+ */
+static inline int batadv_hash_add(struct batadv_hashtable *hash,
+ batadv_hashdata_compare_cb compare,
+ batadv_hashdata_choose_cb choose,
+ const void *data,
+ struct hlist_node *data_node)
+{
+ u32 index;
+ int ret = -1;
+ struct hlist_head *head;
+ struct hlist_node *node;
+ spinlock_t *list_lock; /* spinlock to protect write access */
+
+ if (!hash)
+ goto out;
+
+ index = choose(data, hash->size);
+ head = &hash->table[index];
+ list_lock = &hash->list_locks[index];
+
+ spin_lock_bh(list_lock);
+
+ hlist_for_each(node, head) {
+ if (!compare(node, data))
+ continue;
+
+ ret = 1;
+ goto unlock;
+ }
+
+ /* no duplicate found in list, add new element */
+ hlist_add_head_rcu(data_node, head);
+ atomic_inc(&hash->generation);
+
+ ret = 0;
+
+unlock:
+ spin_unlock_bh(list_lock);
+out:
+ return ret;
+}
+
+/**
+ * batadv_hash_remove() - Removes data from hash, if found
+ * @hash: hash table
+ * @compare: callback to determine if 2 hash elements are identical
+ * @choose: callback calculating the hash index
+ * @data: data passed to the aforementioned callbacks as argument
+ *
+ * ata could be the structure you use with just the key filled, we just need
+ * the key for comparing.
+ *
+ * Return: returns pointer do data on success, so you can remove the used
+ * structure yourself, or NULL on error
+ */
+static inline void *batadv_hash_remove(struct batadv_hashtable *hash,
+ batadv_hashdata_compare_cb compare,
+ batadv_hashdata_choose_cb choose,
+ void *data)
+{
+ u32 index;
+ struct hlist_node *node;
+ struct hlist_head *head;
+ void *data_save = NULL;
+
+ index = choose(data, hash->size);
+ head = &hash->table[index];
+
+ spin_lock_bh(&hash->list_locks[index]);
+ hlist_for_each(node, head) {
+ if (!compare(node, data))
+ continue;
+
+ data_save = node;
+ hlist_del_rcu(node);
+ atomic_inc(&hash->generation);
+ break;
+ }
+ spin_unlock_bh(&hash->list_locks[index]);
+
+ return data_save;
+}
+
+#endif /* _NET_BATMAN_ADV_HASH_H_ */
diff --git a/net/batman-adv/log.c b/net/batman-adv/log.c
new file mode 100644
index 000000000000..c19d07eeb070
--- /dev/null
+++ b/net/batman-adv/log.c
@@ -0,0 +1,36 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner
+ */
+
+#include "log.h"
+#include "main.h"
+
+#include <linux/stdarg.h>
+
+#include "trace.h"
+
+/**
+ * batadv_debug_log() - Add debug log entry
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @fmt: format string
+ *
+ * Return: 0 on success or negative error number in case of failure
+ */
+int batadv_debug_log(struct batadv_priv *bat_priv, const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, fmt);
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ trace_batadv_dbg(bat_priv, &vaf);
+
+ va_end(args);
+
+ return 0;
+}
diff --git a/net/batman-adv/log.h b/net/batman-adv/log.h
new file mode 100644
index 000000000000..225b747a2048
--- /dev/null
+++ b/net/batman-adv/log.h
@@ -0,0 +1,140 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner, Simon Wunderlich
+ */
+
+#ifndef _NET_BATMAN_ADV_LOG_H_
+#define _NET_BATMAN_ADV_LOG_H_
+
+#include "main.h"
+
+#include <linux/atomic.h>
+#include <linux/bitops.h>
+#include <linux/compiler.h>
+#include <linux/printk.h>
+
+#ifdef CONFIG_BATMAN_ADV_DEBUG
+
+int batadv_debug_log_setup(struct batadv_priv *bat_priv);
+void batadv_debug_log_cleanup(struct batadv_priv *bat_priv);
+
+#else
+
+static inline int batadv_debug_log_setup(struct batadv_priv *bat_priv)
+{
+ return 0;
+}
+
+static inline void batadv_debug_log_cleanup(struct batadv_priv *bat_priv)
+{
+}
+
+#endif
+
+/**
+ * enum batadv_dbg_level - available log levels
+ */
+enum batadv_dbg_level {
+ /** @BATADV_DBG_BATMAN: OGM and TQ computations related messages */
+ BATADV_DBG_BATMAN = BIT(0),
+
+ /** @BATADV_DBG_ROUTES: route added / changed / deleted */
+ BATADV_DBG_ROUTES = BIT(1),
+
+ /** @BATADV_DBG_TT: translation table messages */
+ BATADV_DBG_TT = BIT(2),
+
+ /** @BATADV_DBG_BLA: bridge loop avoidance messages */
+ BATADV_DBG_BLA = BIT(3),
+
+ /** @BATADV_DBG_DAT: ARP snooping and DAT related messages */
+ BATADV_DBG_DAT = BIT(4),
+
+ /** @BATADV_DBG_MCAST: multicast related messages */
+ BATADV_DBG_MCAST = BIT(6),
+
+ /** @BATADV_DBG_TP_METER: throughput meter messages */
+ BATADV_DBG_TP_METER = BIT(7),
+
+ /** @BATADV_DBG_ALL: the union of all the above log levels */
+ BATADV_DBG_ALL = 255,
+};
+
+#ifdef CONFIG_BATMAN_ADV_DEBUG
+int batadv_debug_log(struct batadv_priv *bat_priv, const char *fmt, ...)
+__printf(2, 3);
+
+/**
+ * _batadv_dbg() - Store debug output with(out) rate limiting
+ * @type: type of debug message
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @ratelimited: whether output should be rate limited
+ * @fmt: format string
+ * @arg: variable arguments
+ */
+#define _batadv_dbg(type, bat_priv, ratelimited, fmt, arg...) \
+ do { \
+ struct batadv_priv *__batpriv = (bat_priv); \
+ if (atomic_read(&__batpriv->log_level) & (type) && \
+ (!(ratelimited) || net_ratelimit())) \
+ batadv_debug_log(__batpriv, fmt, ## arg); \
+ } \
+ while (0)
+#else /* !CONFIG_BATMAN_ADV_DEBUG */
+__printf(4, 5)
+static inline void _batadv_dbg(int type __always_unused,
+ struct batadv_priv *bat_priv __always_unused,
+ int ratelimited __always_unused,
+ const char *fmt __always_unused, ...)
+{
+}
+#endif
+
+/**
+ * batadv_dbg() - Store debug output without rate limiting
+ * @type: type of debug message
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @arg: format string and variable arguments
+ */
+#define batadv_dbg(type, bat_priv, arg...) \
+ _batadv_dbg(type, bat_priv, 0, ## arg)
+
+/**
+ * batadv_dbg_ratelimited() - Store debug output with rate limiting
+ * @type: type of debug message
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @arg: format string and variable arguments
+ */
+#define batadv_dbg_ratelimited(type, bat_priv, arg...) \
+ _batadv_dbg(type, bat_priv, 1, ## arg)
+
+/**
+ * batadv_info() - Store message in debug buffer and print it to kmsg buffer
+ * @net_dev: the mesh interface net device
+ * @fmt: format string
+ * @arg: variable arguments
+ */
+#define batadv_info(net_dev, fmt, arg...) \
+ do { \
+ struct net_device *_netdev = (net_dev); \
+ struct batadv_priv *_batpriv = netdev_priv(_netdev); \
+ batadv_dbg(BATADV_DBG_ALL, _batpriv, fmt, ## arg); \
+ pr_info("%s: " fmt, _netdev->name, ## arg); \
+ } while (0)
+
+/**
+ * batadv_err() - Store error in debug buffer and print it to kmsg buffer
+ * @net_dev: the mesh interface net device
+ * @fmt: format string
+ * @arg: variable arguments
+ */
+#define batadv_err(net_dev, fmt, arg...) \
+ do { \
+ struct net_device *_netdev = (net_dev); \
+ struct batadv_priv *_batpriv = netdev_priv(_netdev); \
+ batadv_dbg(BATADV_DBG_ALL, _batpriv, fmt, ## arg); \
+ pr_err("%s: " fmt, _netdev->name, ## arg); \
+ } while (0)
+
+#endif /* _NET_BATMAN_ADV_LOG_H_ */
diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c
new file mode 100644
index 000000000000..3a35aadd8b41
--- /dev/null
+++ b/net/batman-adv/main.c
@@ -0,0 +1,688 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner, Simon Wunderlich
+ */
+
+#include "main.h"
+
+#include <linux/array_size.h>
+#include <linux/atomic.h>
+#include <linux/build_bug.h>
+#include <linux/byteorder/generic.h>
+#include <linux/container_of.h>
+#include <linux/device.h>
+#include <linux/errno.h>
+#include <linux/gfp.h>
+#include <linux/if_ether.h>
+#include <linux/if_vlan.h>
+#include <linux/init.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/kobject.h>
+#include <linux/kref.h>
+#include <linux/list.h>
+#include <linux/minmax.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/printk.h>
+#include <linux/rcupdate.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/sprintf.h>
+#include <linux/stddef.h>
+#include <linux/string.h>
+#include <linux/workqueue.h>
+#include <net/dsfield.h>
+#include <net/genetlink.h>
+#include <net/rtnetlink.h>
+#include <uapi/linux/batadv_packet.h>
+#include <uapi/linux/batman_adv.h>
+
+#include "bat_algo.h"
+#include "bat_iv_ogm.h"
+#include "bat_v.h"
+#include "bridge_loop_avoidance.h"
+#include "distributed-arp-table.h"
+#include "gateway_client.h"
+#include "gateway_common.h"
+#include "hard-interface.h"
+#include "log.h"
+#include "mesh-interface.h"
+#include "multicast.h"
+#include "netlink.h"
+#include "originator.h"
+#include "routing.h"
+#include "send.h"
+#include "tp_meter.h"
+#include "translation-table.h"
+
+/* List manipulations on hardif_list have to be rtnl_lock()'ed,
+ * list traversals just rcu-locked
+ */
+struct list_head batadv_hardif_list;
+unsigned int batadv_hardif_generation;
+static int (*batadv_rx_handler[256])(struct sk_buff *skb,
+ struct batadv_hard_iface *recv_if);
+
+struct workqueue_struct *batadv_event_workqueue;
+
+static void batadv_recv_handler_init(void);
+
+#define BATADV_UEV_TYPE_VAR "BATTYPE="
+#define BATADV_UEV_ACTION_VAR "BATACTION="
+#define BATADV_UEV_DATA_VAR "BATDATA="
+
+static char *batadv_uev_action_str[] = {
+ "add",
+ "del",
+ "change",
+ "loopdetect",
+};
+
+static char *batadv_uev_type_str[] = {
+ "gw",
+ "bla",
+};
+
+static int __init batadv_init(void)
+{
+ int ret;
+
+ ret = batadv_tt_cache_init();
+ if (ret < 0)
+ return ret;
+
+ INIT_LIST_HEAD(&batadv_hardif_list);
+ batadv_algo_init();
+
+ batadv_recv_handler_init();
+
+ batadv_v_init();
+ batadv_iv_init();
+ batadv_tp_meter_init();
+
+ batadv_event_workqueue = create_singlethread_workqueue("bat_events");
+ if (!batadv_event_workqueue)
+ goto err_create_wq;
+
+ register_netdevice_notifier(&batadv_hard_if_notifier);
+ rtnl_link_register(&batadv_link_ops);
+ batadv_netlink_register();
+
+ pr_info("B.A.T.M.A.N. advanced %s (compatibility version %i) loaded\n",
+ BATADV_SOURCE_VERSION, BATADV_COMPAT_VERSION);
+
+ return 0;
+
+err_create_wq:
+ batadv_tt_cache_destroy();
+
+ return -ENOMEM;
+}
+
+static void __exit batadv_exit(void)
+{
+ batadv_netlink_unregister();
+ rtnl_link_unregister(&batadv_link_ops);
+ unregister_netdevice_notifier(&batadv_hard_if_notifier);
+
+ destroy_workqueue(batadv_event_workqueue);
+ batadv_event_workqueue = NULL;
+
+ rcu_barrier();
+
+ batadv_tt_cache_destroy();
+}
+
+/**
+ * batadv_mesh_init() - Initialize mesh interface
+ * @mesh_iface: netdev struct of the mesh interface
+ *
+ * Return: 0 on success or negative error number in case of failure
+ */
+int batadv_mesh_init(struct net_device *mesh_iface)
+{
+ struct batadv_priv *bat_priv = netdev_priv(mesh_iface);
+ int ret;
+
+ spin_lock_init(&bat_priv->forw_bat_list_lock);
+ spin_lock_init(&bat_priv->forw_bcast_list_lock);
+ spin_lock_init(&bat_priv->tt.changes_list_lock);
+ spin_lock_init(&bat_priv->tt.req_list_lock);
+ spin_lock_init(&bat_priv->tt.roam_list_lock);
+ spin_lock_init(&bat_priv->tt.last_changeset_lock);
+ spin_lock_init(&bat_priv->tt.commit_lock);
+ spin_lock_init(&bat_priv->gw.list_lock);
+#ifdef CONFIG_BATMAN_ADV_MCAST
+ spin_lock_init(&bat_priv->mcast.mla_lock);
+ spin_lock_init(&bat_priv->mcast.want_lists_lock);
+#endif
+ spin_lock_init(&bat_priv->tvlv.container_list_lock);
+ spin_lock_init(&bat_priv->tvlv.handler_list_lock);
+ spin_lock_init(&bat_priv->meshif_vlan_list_lock);
+ spin_lock_init(&bat_priv->tp_list_lock);
+
+ INIT_HLIST_HEAD(&bat_priv->forw_bat_list);
+ INIT_HLIST_HEAD(&bat_priv->forw_bcast_list);
+ INIT_HLIST_HEAD(&bat_priv->gw.gateway_list);
+#ifdef CONFIG_BATMAN_ADV_MCAST
+ INIT_HLIST_HEAD(&bat_priv->mcast.want_all_unsnoopables_list);
+ INIT_HLIST_HEAD(&bat_priv->mcast.want_all_ipv4_list);
+ INIT_HLIST_HEAD(&bat_priv->mcast.want_all_ipv6_list);
+#endif
+ INIT_LIST_HEAD(&bat_priv->tt.changes_list);
+ INIT_HLIST_HEAD(&bat_priv->tt.req_list);
+ INIT_LIST_HEAD(&bat_priv->tt.roam_list);
+#ifdef CONFIG_BATMAN_ADV_MCAST
+ INIT_HLIST_HEAD(&bat_priv->mcast.mla_list);
+#endif
+ INIT_HLIST_HEAD(&bat_priv->tvlv.container_list);
+ INIT_HLIST_HEAD(&bat_priv->tvlv.handler_list);
+ INIT_HLIST_HEAD(&bat_priv->meshif_vlan_list);
+ INIT_HLIST_HEAD(&bat_priv->tp_list);
+
+ bat_priv->gw.generation = 0;
+
+ ret = batadv_originator_init(bat_priv);
+ if (ret < 0) {
+ atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING);
+ goto err_orig;
+ }
+
+ ret = batadv_tt_init(bat_priv);
+ if (ret < 0) {
+ atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING);
+ goto err_tt;
+ }
+
+ ret = batadv_v_mesh_init(bat_priv);
+ if (ret < 0) {
+ atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING);
+ goto err_v;
+ }
+
+ ret = batadv_bla_init(bat_priv);
+ if (ret < 0) {
+ atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING);
+ goto err_bla;
+ }
+
+ ret = batadv_dat_init(bat_priv);
+ if (ret < 0) {
+ atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING);
+ goto err_dat;
+ }
+
+ batadv_gw_init(bat_priv);
+ batadv_mcast_init(bat_priv);
+
+ atomic_set(&bat_priv->gw.reselect, 0);
+ atomic_set(&bat_priv->mesh_state, BATADV_MESH_ACTIVE);
+
+ return 0;
+
+err_dat:
+ batadv_bla_free(bat_priv);
+err_bla:
+ batadv_v_mesh_free(bat_priv);
+err_v:
+ batadv_tt_free(bat_priv);
+err_tt:
+ batadv_originator_free(bat_priv);
+err_orig:
+ batadv_purge_outstanding_packets(bat_priv, NULL);
+ atomic_set(&bat_priv->mesh_state, BATADV_MESH_INACTIVE);
+
+ return ret;
+}
+
+/**
+ * batadv_mesh_free() - Deinitialize mesh interface
+ * @mesh_iface: netdev struct of the mesh interface
+ */
+void batadv_mesh_free(struct net_device *mesh_iface)
+{
+ struct batadv_priv *bat_priv = netdev_priv(mesh_iface);
+
+ atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING);
+
+ batadv_purge_outstanding_packets(bat_priv, NULL);
+
+ batadv_gw_node_free(bat_priv);
+
+ batadv_v_mesh_free(bat_priv);
+ batadv_dat_free(bat_priv);
+ batadv_bla_free(bat_priv);
+
+ batadv_mcast_free(bat_priv);
+
+ /* Free the TT and the originator tables only after having terminated
+ * all the other depending components which may use these structures for
+ * their purposes.
+ */
+ batadv_tt_free(bat_priv);
+
+ /* Since the originator table clean up routine is accessing the TT
+ * tables as well, it has to be invoked after the TT tables have been
+ * freed and marked as empty. This ensures that no cleanup RCU callbacks
+ * accessing the TT data are scheduled for later execution.
+ */
+ batadv_originator_free(bat_priv);
+
+ batadv_gw_free(bat_priv);
+
+ free_percpu(bat_priv->bat_counters);
+ bat_priv->bat_counters = NULL;
+
+ atomic_set(&bat_priv->mesh_state, BATADV_MESH_INACTIVE);
+}
+
+/**
+ * batadv_is_my_mac() - check if the given mac address belongs to any of the
+ * real interfaces in the current mesh
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @addr: the address to check
+ *
+ * Return: 'true' if the mac address was found, false otherwise.
+ */
+bool batadv_is_my_mac(struct batadv_priv *bat_priv, const u8 *addr)
+{
+ const struct batadv_hard_iface *hard_iface;
+ struct list_head *iter;
+ bool is_my_mac = false;
+
+ rcu_read_lock();
+ netdev_for_each_lower_private_rcu(bat_priv->mesh_iface, hard_iface, iter) {
+ if (hard_iface->if_status != BATADV_IF_ACTIVE)
+ continue;
+
+ if (batadv_compare_eth(hard_iface->net_dev->dev_addr, addr)) {
+ is_my_mac = true;
+ break;
+ }
+ }
+ rcu_read_unlock();
+ return is_my_mac;
+}
+
+/**
+ * batadv_max_header_len() - calculate maximum encapsulation overhead for a
+ * payload packet
+ *
+ * Return: the maximum encapsulation overhead in bytes.
+ */
+int batadv_max_header_len(void)
+{
+ int header_len = 0;
+
+ header_len = max_t(int, header_len,
+ sizeof(struct batadv_unicast_packet));
+ header_len = max_t(int, header_len,
+ sizeof(struct batadv_unicast_4addr_packet));
+ header_len = max_t(int, header_len,
+ sizeof(struct batadv_bcast_packet));
+
+ return header_len + ETH_HLEN;
+}
+
+/**
+ * batadv_skb_set_priority() - sets skb priority according to packet content
+ * @skb: the packet to be sent
+ * @offset: offset to the packet content
+ *
+ * This function sets a value between 256 and 263 (802.1d priority), which
+ * can be interpreted by the cfg80211 or other drivers.
+ */
+void batadv_skb_set_priority(struct sk_buff *skb, int offset)
+{
+ struct iphdr ip_hdr_tmp, *ip_hdr;
+ struct ipv6hdr ip6_hdr_tmp, *ip6_hdr;
+ struct ethhdr ethhdr_tmp, *ethhdr;
+ struct vlan_ethhdr *vhdr, vhdr_tmp;
+ u32 prio;
+
+ /* already set, do nothing */
+ if (skb->priority >= 256 && skb->priority <= 263)
+ return;
+
+ ethhdr = skb_header_pointer(skb, offset, sizeof(*ethhdr), &ethhdr_tmp);
+ if (!ethhdr)
+ return;
+
+ switch (ethhdr->h_proto) {
+ case htons(ETH_P_8021Q):
+ vhdr = skb_header_pointer(skb, offset + sizeof(*vhdr),
+ sizeof(*vhdr), &vhdr_tmp);
+ if (!vhdr)
+ return;
+ prio = ntohs(vhdr->h_vlan_TCI) & VLAN_PRIO_MASK;
+ prio = prio >> VLAN_PRIO_SHIFT;
+ break;
+ case htons(ETH_P_IP):
+ ip_hdr = skb_header_pointer(skb, offset + sizeof(*ethhdr),
+ sizeof(*ip_hdr), &ip_hdr_tmp);
+ if (!ip_hdr)
+ return;
+ prio = (ipv4_get_dsfield(ip_hdr) & 0xfc) >> 5;
+ break;
+ case htons(ETH_P_IPV6):
+ ip6_hdr = skb_header_pointer(skb, offset + sizeof(*ethhdr),
+ sizeof(*ip6_hdr), &ip6_hdr_tmp);
+ if (!ip6_hdr)
+ return;
+ prio = (ipv6_get_dsfield(ip6_hdr) & 0xfc) >> 5;
+ break;
+ default:
+ return;
+ }
+
+ skb->priority = prio + 256;
+}
+
+static int batadv_recv_unhandled_packet(struct sk_buff *skb,
+ struct batadv_hard_iface *recv_if)
+{
+ kfree_skb(skb);
+
+ return NET_RX_DROP;
+}
+
+/* incoming packets with the batman ethertype received on any active hard
+ * interface
+ */
+
+/**
+ * batadv_batman_skb_recv() - Handle incoming message from an hard interface
+ * @skb: the received packet
+ * @dev: the net device that the packet was received on
+ * @ptype: packet type of incoming packet (ETH_P_BATMAN)
+ * @orig_dev: the original receive net device (e.g. bonded device)
+ *
+ * Return: NET_RX_SUCCESS on success or NET_RX_DROP in case of failure
+ */
+int batadv_batman_skb_recv(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *ptype,
+ struct net_device *orig_dev)
+{
+ struct batadv_priv *bat_priv;
+ struct batadv_ogm_packet *batadv_ogm_packet;
+ struct batadv_hard_iface *hard_iface;
+ u8 idx;
+
+ hard_iface = container_of(ptype, struct batadv_hard_iface,
+ batman_adv_ptype);
+
+ /* Prevent processing a packet received on an interface which is getting
+ * shut down otherwise the packet may trigger de-reference errors
+ * further down in the receive path.
+ */
+ if (!kref_get_unless_zero(&hard_iface->refcount))
+ goto err_out;
+
+ skb = skb_share_check(skb, GFP_ATOMIC);
+
+ /* skb was released by skb_share_check() */
+ if (!skb)
+ goto err_put;
+
+ /* packet should hold at least type and version */
+ if (unlikely(!pskb_may_pull(skb, 2)))
+ goto err_free;
+
+ /* expect a valid ethernet header here. */
+ if (unlikely(skb->mac_len != ETH_HLEN || !skb_mac_header(skb)))
+ goto err_free;
+
+ if (!hard_iface->mesh_iface)
+ goto err_free;
+
+ bat_priv = netdev_priv(hard_iface->mesh_iface);
+
+ if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE)
+ goto err_free;
+
+ /* discard frames on not active interfaces */
+ if (hard_iface->if_status != BATADV_IF_ACTIVE)
+ goto err_free;
+
+ batadv_ogm_packet = (struct batadv_ogm_packet *)skb->data;
+
+ if (batadv_ogm_packet->version != BATADV_COMPAT_VERSION) {
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Drop packet: incompatible batman version (%i)\n",
+ batadv_ogm_packet->version);
+ goto err_free;
+ }
+
+ /* reset control block to avoid left overs from previous users */
+ memset(skb->cb, 0, sizeof(struct batadv_skb_cb));
+
+ idx = batadv_ogm_packet->packet_type;
+ (*batadv_rx_handler[idx])(skb, hard_iface);
+
+ batadv_hardif_put(hard_iface);
+
+ /* return NET_RX_SUCCESS in any case as we
+ * most probably dropped the packet for
+ * routing-logical reasons.
+ */
+ return NET_RX_SUCCESS;
+
+err_free:
+ kfree_skb(skb);
+err_put:
+ batadv_hardif_put(hard_iface);
+err_out:
+ return NET_RX_DROP;
+}
+
+static void batadv_recv_handler_init(void)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(batadv_rx_handler); i++)
+ batadv_rx_handler[i] = batadv_recv_unhandled_packet;
+
+ for (i = BATADV_UNICAST_MIN; i <= BATADV_UNICAST_MAX; i++)
+ batadv_rx_handler[i] = batadv_recv_unhandled_unicast_packet;
+
+ /* compile time checks for sizes */
+ BUILD_BUG_ON(sizeof(struct batadv_bla_claim_dst) != 6);
+ BUILD_BUG_ON(sizeof(struct batadv_ogm_packet) != 24);
+ BUILD_BUG_ON(sizeof(struct batadv_icmp_header) != 20);
+ BUILD_BUG_ON(sizeof(struct batadv_icmp_packet) != 20);
+ BUILD_BUG_ON(sizeof(struct batadv_icmp_packet_rr) != 116);
+ BUILD_BUG_ON(sizeof(struct batadv_unicast_packet) != 10);
+ BUILD_BUG_ON(sizeof(struct batadv_unicast_4addr_packet) != 18);
+ BUILD_BUG_ON(sizeof(struct batadv_frag_packet) != 20);
+ BUILD_BUG_ON(sizeof(struct batadv_bcast_packet) != 14);
+ BUILD_BUG_ON(sizeof(struct batadv_coded_packet) != 46);
+ BUILD_BUG_ON(sizeof(struct batadv_unicast_tvlv_packet) != 20);
+ BUILD_BUG_ON(sizeof(struct batadv_tvlv_hdr) != 4);
+ BUILD_BUG_ON(sizeof(struct batadv_tvlv_gateway_data) != 8);
+ BUILD_BUG_ON(sizeof(struct batadv_tvlv_tt_vlan_data) != 8);
+ BUILD_BUG_ON(sizeof(struct batadv_tvlv_tt_change) != 12);
+ BUILD_BUG_ON(sizeof(struct batadv_tvlv_roam_adv) != 8);
+
+ i = sizeof_field(struct sk_buff, cb);
+ BUILD_BUG_ON(sizeof(struct batadv_skb_cb) > i);
+
+ /* broadcast packet */
+ batadv_rx_handler[BATADV_BCAST] = batadv_recv_bcast_packet;
+ /* multicast packet */
+ batadv_rx_handler[BATADV_MCAST] = batadv_recv_mcast_packet;
+
+ /* unicast packets ... */
+ /* unicast with 4 addresses packet */
+ batadv_rx_handler[BATADV_UNICAST_4ADDR] = batadv_recv_unicast_packet;
+ /* unicast packet */
+ batadv_rx_handler[BATADV_UNICAST] = batadv_recv_unicast_packet;
+ /* unicast tvlv packet */
+ batadv_rx_handler[BATADV_UNICAST_TVLV] = batadv_recv_unicast_tvlv;
+ /* batman icmp packet */
+ batadv_rx_handler[BATADV_ICMP] = batadv_recv_icmp_packet;
+ /* Fragmented packets */
+ batadv_rx_handler[BATADV_UNICAST_FRAG] = batadv_recv_frag_packet;
+}
+
+/**
+ * batadv_recv_handler_register() - Register handler for batman-adv packet type
+ * @packet_type: batadv_packettype which should be handled
+ * @recv_handler: receive handler for the packet type
+ *
+ * Return: 0 on success or negative error number in case of failure
+ */
+int
+batadv_recv_handler_register(u8 packet_type,
+ int (*recv_handler)(struct sk_buff *,
+ struct batadv_hard_iface *))
+{
+ int (*curr)(struct sk_buff *skb,
+ struct batadv_hard_iface *recv_if);
+ curr = batadv_rx_handler[packet_type];
+
+ if (curr != batadv_recv_unhandled_packet &&
+ curr != batadv_recv_unhandled_unicast_packet)
+ return -EBUSY;
+
+ batadv_rx_handler[packet_type] = recv_handler;
+ return 0;
+}
+
+/**
+ * batadv_recv_handler_unregister() - Unregister handler for packet type
+ * @packet_type: batadv_packettype which should no longer be handled
+ */
+void batadv_recv_handler_unregister(u8 packet_type)
+{
+ batadv_rx_handler[packet_type] = batadv_recv_unhandled_packet;
+}
+
+/**
+ * batadv_get_vid() - extract the VLAN identifier from skb if any
+ * @skb: the buffer containing the packet
+ * @header_len: length of the batman header preceding the ethernet header
+ *
+ * Return: VID with the BATADV_VLAN_HAS_TAG flag when the packet embedded in the
+ * skb is vlan tagged. Otherwise BATADV_NO_FLAGS.
+ */
+unsigned short batadv_get_vid(struct sk_buff *skb, size_t header_len)
+{
+ struct ethhdr *ethhdr = (struct ethhdr *)(skb->data + header_len);
+ struct vlan_ethhdr *vhdr;
+ unsigned short vid;
+
+ if (ethhdr->h_proto != htons(ETH_P_8021Q))
+ return BATADV_NO_FLAGS;
+
+ if (!pskb_may_pull(skb, header_len + VLAN_ETH_HLEN))
+ return BATADV_NO_FLAGS;
+
+ vhdr = (struct vlan_ethhdr *)(skb->data + header_len);
+ vid = ntohs(vhdr->h_vlan_TCI) & VLAN_VID_MASK;
+
+ /* VID 0 is only used to indicate "priority tag" frames which only
+ * contain priority information and no VID.
+ */
+ if (vid == 0)
+ return BATADV_NO_FLAGS;
+
+ vid |= BATADV_VLAN_HAS_TAG;
+
+ return vid;
+}
+
+/**
+ * batadv_vlan_ap_isola_get() - return AP isolation status for the given vlan
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @vid: the VLAN identifier for which the AP isolation attributed as to be
+ * looked up
+ *
+ * Return: true if AP isolation is on for the VLAN identified by vid, false
+ * otherwise
+ */
+bool batadv_vlan_ap_isola_get(struct batadv_priv *bat_priv, unsigned short vid)
+{
+ bool ap_isolation_enabled = false;
+ struct batadv_meshif_vlan *vlan;
+
+ /* if the AP isolation is requested on a VLAN, then check for its
+ * setting in the proper VLAN private data structure
+ */
+ vlan = batadv_meshif_vlan_get(bat_priv, vid);
+ if (vlan) {
+ ap_isolation_enabled = atomic_read(&vlan->ap_isolation);
+ batadv_meshif_vlan_put(vlan);
+ }
+
+ return ap_isolation_enabled;
+}
+
+/**
+ * batadv_throw_uevent() - Send an uevent with batman-adv specific env data
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @type: subsystem type of event. Stored in uevent's BATTYPE
+ * @action: action type of event. Stored in uevent's BATACTION
+ * @data: string with additional information to the event (ignored for
+ * BATADV_UEV_DEL). Stored in uevent's BATDATA
+ *
+ * Return: 0 on success or negative error number in case of failure
+ */
+int batadv_throw_uevent(struct batadv_priv *bat_priv, enum batadv_uev_type type,
+ enum batadv_uev_action action, const char *data)
+{
+ int ret = -ENOMEM;
+ struct kobject *bat_kobj;
+ char *uevent_env[4] = { NULL, NULL, NULL, NULL };
+
+ bat_kobj = &bat_priv->mesh_iface->dev.kobj;
+
+ uevent_env[0] = kasprintf(GFP_ATOMIC,
+ "%s%s", BATADV_UEV_TYPE_VAR,
+ batadv_uev_type_str[type]);
+ if (!uevent_env[0])
+ goto report_error;
+
+ uevent_env[1] = kasprintf(GFP_ATOMIC,
+ "%s%s", BATADV_UEV_ACTION_VAR,
+ batadv_uev_action_str[action]);
+ if (!uevent_env[1])
+ goto free_first_env;
+
+ /* If the event is DEL, ignore the data field */
+ if (action != BATADV_UEV_DEL) {
+ uevent_env[2] = kasprintf(GFP_ATOMIC,
+ "%s%s", BATADV_UEV_DATA_VAR, data);
+ if (!uevent_env[2])
+ goto free_second_env;
+ }
+
+ ret = kobject_uevent_env(bat_kobj, KOBJ_CHANGE, uevent_env);
+ kfree(uevent_env[2]);
+free_second_env:
+ kfree(uevent_env[1]);
+free_first_env:
+ kfree(uevent_env[0]);
+
+ if (ret)
+report_error:
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Impossible to send uevent for (%s,%s,%s) event (err: %d)\n",
+ batadv_uev_type_str[type],
+ batadv_uev_action_str[action],
+ (action == BATADV_UEV_DEL ? "NULL" : data), ret);
+ return ret;
+}
+
+module_init(batadv_init);
+module_exit(batadv_exit);
+
+MODULE_LICENSE("GPL");
+
+MODULE_AUTHOR(BATADV_DRIVER_AUTHOR);
+MODULE_DESCRIPTION(BATADV_DRIVER_DESC);
+MODULE_VERSION(BATADV_SOURCE_VERSION);
+MODULE_ALIAS_RTNL_LINK("batadv");
+MODULE_ALIAS_GENL_FAMILY(BATADV_NL_NAME);
diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h
new file mode 100644
index 000000000000..af230b017bc1
--- /dev/null
+++ b/net/batman-adv/main.h
@@ -0,0 +1,380 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner, Simon Wunderlich
+ */
+
+#ifndef _NET_BATMAN_ADV_MAIN_H_
+#define _NET_BATMAN_ADV_MAIN_H_
+
+#define BATADV_DRIVER_AUTHOR "Marek Lindner <marek.lindner@mailbox.org>, " \
+ "Simon Wunderlich <sw@simonwunderlich.de>"
+#define BATADV_DRIVER_DESC "B.A.T.M.A.N. advanced"
+#define BATADV_DRIVER_DEVICE "batman-adv"
+
+#ifndef BATADV_SOURCE_VERSION
+#define BATADV_SOURCE_VERSION "2025.5"
+#endif
+
+/* B.A.T.M.A.N. parameters */
+
+#define BATADV_TQ_MAX_VALUE 255
+#define BATADV_THROUGHPUT_MAX_VALUE 0xFFFFFFFF
+#define BATADV_JITTER 20
+
+#define BATADV_MAX_MTU (ETH_MAX_MTU - batadv_max_header_len())
+
+/* Time To Live of broadcast messages */
+#define BATADV_TTL 50
+
+/* maximum sequence number age of broadcast messages */
+#define BATADV_BCAST_MAX_AGE 64
+
+/* purge originators after time in seconds if no valid packet comes in
+ * -> TODO: check influence on BATADV_TQ_LOCAL_WINDOW_SIZE
+ */
+#define BATADV_PURGE_TIMEOUT 200000 /* 200 seconds */
+#define BATADV_TT_LOCAL_TIMEOUT 600000 /* in milliseconds */
+#define BATADV_TT_CLIENT_ROAM_TIMEOUT 600000 /* in milliseconds */
+#define BATADV_TT_CLIENT_TEMP_TIMEOUT 600000 /* in milliseconds */
+#define BATADV_TT_WORK_PERIOD 5000 /* 5 seconds */
+#define BATADV_ORIG_WORK_PERIOD 1000 /* 1 second */
+#define BATADV_MCAST_WORK_PERIOD 500 /* 0.5 seconds */
+#define BATADV_DAT_ENTRY_TIMEOUT (5 * 60000) /* 5 mins in milliseconds */
+/* sliding packet range of received originator messages in sequence numbers
+ * (should be a multiple of our word size)
+ */
+#define BATADV_TQ_LOCAL_WINDOW_SIZE 64
+/* milliseconds we have to keep pending tt_req */
+#define BATADV_TT_REQUEST_TIMEOUT 3000
+
+#define BATADV_TQ_GLOBAL_WINDOW_SIZE 5
+#define BATADV_TQ_LOCAL_BIDRECT_SEND_MINIMUM 1
+#define BATADV_TQ_LOCAL_BIDRECT_RECV_MINIMUM 1
+#define BATADV_TQ_TOTAL_BIDRECT_LIMIT 1
+
+/* B.A.T.M.A.N. V */
+#define BATADV_THROUGHPUT_DEFAULT_VALUE 10 /* 1 Mbps */
+#define BATADV_ELP_PROBES_PER_NODE 2
+#define BATADV_ELP_MIN_PROBE_SIZE 200 /* bytes */
+#define BATADV_ELP_PROBE_MAX_TX_DIFF 100 /* milliseconds */
+#define BATADV_ELP_MAX_AGE 64
+#define BATADV_OGM_MAX_ORIGDIFF 5
+#define BATADV_OGM_MAX_AGE 64
+
+/* number of OGMs sent with the last tt diff */
+#define BATADV_TT_OGM_APPEND_MAX 3
+
+/* Time in which a client can roam at most ROAMING_MAX_COUNT times in
+ * milliseconds
+ */
+#define BATADV_ROAMING_MAX_TIME 20000
+#define BATADV_ROAMING_MAX_COUNT 5
+
+#define BATADV_NO_FLAGS 0
+
+#define BATADV_NULL_IFINDEX 0 /* dummy ifindex used to avoid iface checks */
+
+#define BATADV_NO_MARK 0
+
+/* default interface for multi interface operation. The default interface is
+ * used for communication which originated locally (i.e. is not forwarded)
+ * or where special forwarding is not desired/necessary.
+ */
+#define BATADV_IF_DEFAULT ((struct batadv_hard_iface *)NULL)
+
+#define BATADV_NUM_WORDS BITS_TO_LONGS(BATADV_TQ_LOCAL_WINDOW_SIZE)
+
+#define BATADV_LOG_BUF_LEN 8192 /* has to be a power of 2 */
+
+/* number of packets to send for broadcasts on different interface types */
+#define BATADV_NUM_BCASTS_DEFAULT 1
+#define BATADV_NUM_BCASTS_WIRELESS 3
+
+/* length of the single packet used by the TP meter */
+#define BATADV_TP_PACKET_LEN ETH_DATA_LEN
+
+/* msecs after which an ARP_REQUEST is sent in broadcast as fallback */
+#define ARP_REQ_DELAY 250
+/* numbers of originator to contact for any PUT/GET DHT operation */
+#define BATADV_DAT_CANDIDATES_NUM 3
+
+/* BATADV_TQ_SIMILARITY_THRESHOLD - TQ points that a secondary metric can differ
+ * at most from the primary one in order to be still considered acceptable
+ */
+#define BATADV_TQ_SIMILARITY_THRESHOLD 50
+
+#define BATADV_MAX_AGGREGATION_PACKETS 32
+#define BATADV_MAX_AGGREGATION_BYTES 512
+#define BATADV_MAX_AGGREGATION_MS 100
+
+#define BATADV_BLA_PERIOD_LENGTH 10000 /* 10 seconds */
+#define BATADV_BLA_BACKBONE_TIMEOUT (BATADV_BLA_PERIOD_LENGTH * 6)
+#define BATADV_BLA_CLAIM_TIMEOUT (BATADV_BLA_PERIOD_LENGTH * 10)
+#define BATADV_BLA_WAIT_PERIODS 3
+#define BATADV_BLA_LOOPDETECT_PERIODS 6
+#define BATADV_BLA_LOOPDETECT_TIMEOUT 3000 /* 3 seconds */
+
+#define BATADV_DUPLIST_SIZE 16
+#define BATADV_DUPLIST_TIMEOUT 500 /* 500 ms */
+/* don't reset again within 30 seconds */
+#define BATADV_RESET_PROTECTION_MS 30000
+#define BATADV_EXPECTED_SEQNO_RANGE 65536
+
+/**
+ * BATADV_TP_MAX_NUM - maximum number of simultaneously active tp sessions
+ */
+#define BATADV_TP_MAX_NUM 5
+
+/**
+ * enum batadv_mesh_state - State of a mesh interface
+ */
+enum batadv_mesh_state {
+ /** @BATADV_MESH_INACTIVE: mesh interface is not yet running */
+ BATADV_MESH_INACTIVE,
+
+ /** @BATADV_MESH_ACTIVE: interface is up and running */
+ BATADV_MESH_ACTIVE,
+
+ /** @BATADV_MESH_DEACTIVATING: interface is getting shut down */
+ BATADV_MESH_DEACTIVATING,
+};
+
+#define BATADV_BCAST_QUEUE_LEN 256
+#define BATADV_BATMAN_QUEUE_LEN 256
+
+/**
+ * enum batadv_uev_action - action type of uevent
+ */
+enum batadv_uev_action {
+ /** @BATADV_UEV_ADD: gateway was selected (after none was selected) */
+ BATADV_UEV_ADD = 0,
+
+ /**
+ * @BATADV_UEV_DEL: selected gateway was removed and none is selected
+ * anymore
+ */
+ BATADV_UEV_DEL,
+
+ /**
+ * @BATADV_UEV_CHANGE: a different gateway was selected as based gateway
+ */
+ BATADV_UEV_CHANGE,
+
+ /**
+ * @BATADV_UEV_LOOPDETECT: loop was detected which cannot be handled by
+ * bridge loop avoidance
+ */
+ BATADV_UEV_LOOPDETECT,
+};
+
+/**
+ * enum batadv_uev_type - Type of uevent
+ */
+enum batadv_uev_type {
+ /** @BATADV_UEV_GW: selected gateway was modified */
+ BATADV_UEV_GW = 0,
+
+ /** @BATADV_UEV_BLA: bridge loop avoidance event */
+ BATADV_UEV_BLA,
+};
+
+#define BATADV_GW_THRESHOLD 50
+
+/* Number of fragment chains for each orig_node */
+#define BATADV_FRAG_BUFFER_COUNT 8
+/* Maximum number of fragments for one packet */
+#define BATADV_FRAG_MAX_FRAGMENTS 16
+/* Maxumim size of each fragment */
+#define BATADV_FRAG_MAX_FRAG_SIZE 1280
+/* Time to keep fragments while waiting for rest of the fragments */
+#define BATADV_FRAG_TIMEOUT 10000
+
+#define BATADV_DAT_CANDIDATE_NOT_FOUND 0
+#define BATADV_DAT_CANDIDATE_ORIG 1
+
+/* Debug Messages */
+#ifdef pr_fmt
+#undef pr_fmt
+#endif
+/* Append 'batman-adv: ' before kernel messages */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+/* Kernel headers */
+
+#include <linux/atomic.h>
+#include <linux/compiler.h>
+#include <linux/etherdevice.h>
+#include <linux/if_vlan.h>
+#include <linux/jiffies.h>
+#include <linux/netdevice.h>
+#include <linux/percpu.h>
+#include <linux/skbuff.h>
+#include <linux/types.h>
+#include <uapi/linux/batadv_packet.h>
+
+#include "types.h"
+#include "main.h"
+
+/**
+ * batadv_print_vid() - return printable version of vid information
+ * @vid: the VLAN identifier
+ *
+ * Return: -1 when no VLAN is used, VLAN id otherwise
+ */
+static inline int batadv_print_vid(unsigned short vid)
+{
+ if (vid & BATADV_VLAN_HAS_TAG)
+ return (int)(vid & VLAN_VID_MASK);
+ else
+ return -1;
+}
+
+extern struct list_head batadv_hardif_list;
+extern unsigned int batadv_hardif_generation;
+
+extern struct workqueue_struct *batadv_event_workqueue;
+
+int batadv_mesh_init(struct net_device *mesh_iface);
+void batadv_mesh_free(struct net_device *mesh_iface);
+bool batadv_is_my_mac(struct batadv_priv *bat_priv, const u8 *addr);
+int batadv_max_header_len(void);
+void batadv_skb_set_priority(struct sk_buff *skb, int offset);
+int batadv_batman_skb_recv(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *ptype,
+ struct net_device *orig_dev);
+int
+batadv_recv_handler_register(u8 packet_type,
+ int (*recv_handler)(struct sk_buff *,
+ struct batadv_hard_iface *));
+void batadv_recv_handler_unregister(u8 packet_type);
+
+/**
+ * batadv_compare_eth() - Compare two not u16 aligned Ethernet addresses
+ * @data1: Pointer to a six-byte array containing the Ethernet address
+ * @data2: Pointer other six-byte array containing the Ethernet address
+ *
+ * note: can't use ether_addr_equal() as it requires aligned memory
+ *
+ * Return: true if they are the same ethernet addr
+ */
+static inline bool batadv_compare_eth(const void *data1, const void *data2)
+{
+ return ether_addr_equal_unaligned(data1, data2);
+}
+
+/**
+ * batadv_has_timed_out() - compares current time (jiffies) and timestamp +
+ * timeout
+ * @timestamp: base value to compare with (in jiffies)
+ * @timeout: added to base value before comparing (in milliseconds)
+ *
+ * Return: true if current time is after timestamp + timeout
+ */
+static inline bool batadv_has_timed_out(unsigned long timestamp,
+ unsigned int timeout)
+{
+ return time_is_before_jiffies(timestamp + msecs_to_jiffies(timeout));
+}
+
+/**
+ * batadv_atomic_dec_not_zero() - Decrease unless the number is 0
+ * @v: pointer of type atomic_t
+ *
+ * Return: non-zero if v was not 0, and zero otherwise.
+ */
+#define batadv_atomic_dec_not_zero(v) atomic_add_unless((v), -1, 0)
+
+/**
+ * batadv_smallest_signed_int() - Returns the smallest signed integer in two's
+ * complement with the sizeof x
+ * @x: type of integer
+ *
+ * Return: smallest signed integer of type
+ */
+#define batadv_smallest_signed_int(x) (1u << (7u + 8u * (sizeof(x) - 1u)))
+
+/**
+ * batadv_seq_before() - Checks if a sequence number x is a predecessor of y
+ * @x: potential predecessor of @y
+ * @y: value to compare @x against
+ *
+ * It handles overflows/underflows and can correctly check for a predecessor
+ * unless the variable sequence number has grown by more than
+ * 2**(bitwidth(x)-1)-1.
+ *
+ * This means that for a u8 with the maximum value 255, it would think:
+ *
+ * * when adding nothing - it is neither a predecessor nor a successor
+ * * before adding more than 127 to the starting value - it is a predecessor,
+ * * when adding 128 - it is neither a predecessor nor a successor,
+ * * after adding more than 127 to the starting value - it is a successor
+ *
+ * Return: true when x is a predecessor of y, false otherwise
+ */
+#define batadv_seq_before(x, y) ({ \
+ typeof(x)_d1 = (x); \
+ typeof(y)_d2 = (y); \
+ typeof(x)_dummy = (_d1 - _d2); \
+ (void)(&_d1 == &_d2); \
+ _dummy > batadv_smallest_signed_int(_dummy); \
+})
+
+/**
+ * batadv_seq_after() - Checks if a sequence number x is a successor of y
+ * @x: potential successor of @y
+ * @y: value to compare @x against
+ *
+ * It handles overflows/underflows and can correctly check for a successor
+ * unless the variable sequence number has grown by more than
+ * 2**(bitwidth(x)-1)-1.
+ *
+ * This means that for a u8 with the maximum value 255, it would think:
+ *
+ * * when adding nothing - it is neither a predecessor nor a successor
+ * * before adding more than 127 to the starting value - it is a predecessor,
+ * * when adding 128 - it is neither a predecessor nor a successor,
+ * * after adding more than 127 to the starting value - it is a successor
+ *
+ * Return: true when x is a successor of y, false otherwise
+ */
+#define batadv_seq_after(x, y) batadv_seq_before(y, x)
+
+/**
+ * batadv_add_counter() - Add to per cpu statistics counter of mesh interface
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @idx: counter index which should be modified
+ * @count: value to increase counter by
+ *
+ * Stop preemption on local cpu while incrementing the counter
+ */
+static inline void batadv_add_counter(struct batadv_priv *bat_priv, size_t idx,
+ size_t count)
+{
+ this_cpu_add(bat_priv->bat_counters[idx], count);
+}
+
+/**
+ * batadv_inc_counter() - Increase per cpu statistics counter of mesh interface
+ * @b: the bat priv with all the mesh interface information
+ * @i: counter index which should be modified
+ */
+#define batadv_inc_counter(b, i) batadv_add_counter(b, i, 1)
+
+/**
+ * BATADV_SKB_CB() - Get batadv_skb_cb from skb control buffer
+ * @__skb: skb holding the control buffer
+ *
+ * The members of the control buffer are defined in struct batadv_skb_cb in
+ * types.h. The macro is inspired by the similar macro TCP_SKB_CB() in tcp.h.
+ *
+ * Return: pointer to the batadv_skb_cb of the skb
+ */
+#define BATADV_SKB_CB(__skb) ((struct batadv_skb_cb *)&((__skb)->cb[0]))
+
+unsigned short batadv_get_vid(struct sk_buff *skb, size_t header_len);
+bool batadv_vlan_ap_isola_get(struct batadv_priv *bat_priv, unsigned short vid);
+int batadv_throw_uevent(struct batadv_priv *bat_priv, enum batadv_uev_type type,
+ enum batadv_uev_action action, const char *data);
+
+#endif /* _NET_BATMAN_ADV_MAIN_H_ */
diff --git a/net/batman-adv/mesh-interface.c b/net/batman-adv/mesh-interface.c
new file mode 100644
index 000000000000..df7e95811ef5
--- /dev/null
+++ b/net/batman-adv/mesh-interface.c
@@ -0,0 +1,1133 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner, Simon Wunderlich
+ */
+
+#include "mesh-interface.h"
+#include "main.h"
+
+#include <linux/atomic.h>
+#include <linux/byteorder/generic.h>
+#include <linux/cache.h>
+#include <linux/compiler.h>
+#include <linux/container_of.h>
+#include <linux/cpumask.h>
+#include <linux/errno.h>
+#include <linux/etherdevice.h>
+#include <linux/ethtool.h>
+#include <linux/gfp.h>
+#include <linux/if_ether.h>
+#include <linux/if_vlan.h>
+#include <linux/jiffies.h>
+#include <linux/kref.h>
+#include <linux/list.h>
+#include <linux/lockdep.h>
+#include <linux/netdevice.h>
+#include <linux/netlink.h>
+#include <linux/percpu.h>
+#include <linux/random.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/socket.h>
+#include <linux/spinlock.h>
+#include <linux/stddef.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <net/netlink.h>
+#include <net/rtnetlink.h>
+#include <uapi/linux/batadv_packet.h>
+#include <uapi/linux/batman_adv.h>
+
+#include "bat_algo.h"
+#include "bridge_loop_avoidance.h"
+#include "distributed-arp-table.h"
+#include "gateway_client.h"
+#include "hard-interface.h"
+#include "multicast.h"
+#include "send.h"
+#include "translation-table.h"
+
+/**
+ * batadv_skb_head_push() - Increase header size and move (push) head pointer
+ * @skb: packet buffer which should be modified
+ * @len: number of bytes to add
+ *
+ * Return: 0 on success or negative error number in case of failure
+ */
+int batadv_skb_head_push(struct sk_buff *skb, unsigned int len)
+{
+ int result;
+
+ /* TODO: We must check if we can release all references to non-payload
+ * data using __skb_header_release in our skbs to allow skb_cow_header
+ * to work optimally. This means that those skbs are not allowed to read
+ * or write any data which is before the current position of skb->data
+ * after that call and thus allow other skbs with the same data buffer
+ * to write freely in that area.
+ */
+ result = skb_cow_head(skb, len);
+ if (result < 0)
+ return result;
+
+ skb_push(skb, len);
+ return 0;
+}
+
+/**
+ * batadv_sum_counter() - Sum the cpu-local counters for index 'idx'
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @idx: index of counter to sum up
+ *
+ * Return: sum of all cpu-local counters
+ */
+static u64 batadv_sum_counter(struct batadv_priv *bat_priv, size_t idx)
+{
+ u64 *counters, sum = 0;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ counters = per_cpu_ptr(bat_priv->bat_counters, cpu);
+ sum += counters[idx];
+ }
+
+ return sum;
+}
+
+static struct net_device_stats *batadv_interface_stats(struct net_device *dev)
+{
+ struct batadv_priv *bat_priv = netdev_priv(dev);
+ struct net_device_stats *stats = &dev->stats;
+
+ stats->tx_packets = batadv_sum_counter(bat_priv, BATADV_CNT_TX);
+ stats->tx_bytes = batadv_sum_counter(bat_priv, BATADV_CNT_TX_BYTES);
+ stats->tx_dropped = batadv_sum_counter(bat_priv, BATADV_CNT_TX_DROPPED);
+ stats->rx_packets = batadv_sum_counter(bat_priv, BATADV_CNT_RX);
+ stats->rx_bytes = batadv_sum_counter(bat_priv, BATADV_CNT_RX_BYTES);
+ return stats;
+}
+
+static int batadv_interface_set_mac_addr(struct net_device *dev, void *p)
+{
+ struct batadv_priv *bat_priv = netdev_priv(dev);
+ struct batadv_meshif_vlan *vlan;
+ struct sockaddr *addr = p;
+ u8 old_addr[ETH_ALEN];
+
+ if (!is_valid_ether_addr(addr->sa_data))
+ return -EADDRNOTAVAIL;
+
+ ether_addr_copy(old_addr, dev->dev_addr);
+ eth_hw_addr_set(dev, addr->sa_data);
+
+ /* only modify transtable if it has been initialized before */
+ if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE)
+ return 0;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(vlan, &bat_priv->meshif_vlan_list, list) {
+ batadv_tt_local_remove(bat_priv, old_addr, vlan->vid,
+ "mac address changed", false);
+ batadv_tt_local_add(dev, addr->sa_data, vlan->vid,
+ BATADV_NULL_IFINDEX, BATADV_NO_MARK);
+ }
+ rcu_read_unlock();
+
+ return 0;
+}
+
+static int batadv_interface_change_mtu(struct net_device *dev, int new_mtu)
+{
+ struct batadv_priv *bat_priv = netdev_priv(dev);
+
+ /* check ranges */
+ if (new_mtu < ETH_MIN_MTU || new_mtu > batadv_hardif_min_mtu(dev))
+ return -EINVAL;
+
+ WRITE_ONCE(dev->mtu, new_mtu);
+ bat_priv->mtu_set_by_user = new_mtu;
+
+ return 0;
+}
+
+/**
+ * batadv_interface_set_rx_mode() - set the rx mode of a device
+ * @dev: registered network device to modify
+ *
+ * We do not actually need to set any rx filters for the virtual batman
+ * mesh interface. However a dummy handler enables a user to set static
+ * multicast listeners for instance.
+ */
+static void batadv_interface_set_rx_mode(struct net_device *dev)
+{
+}
+
+static netdev_tx_t batadv_interface_tx(struct sk_buff *skb,
+ struct net_device *mesh_iface)
+{
+ struct ethhdr *ethhdr;
+ struct batadv_priv *bat_priv = netdev_priv(mesh_iface);
+ struct batadv_hard_iface *primary_if = NULL;
+ struct batadv_bcast_packet *bcast_packet;
+ static const u8 stp_addr[ETH_ALEN] = {0x01, 0x80, 0xC2, 0x00,
+ 0x00, 0x00};
+ static const u8 ectp_addr[ETH_ALEN] = {0xCF, 0x00, 0x00, 0x00,
+ 0x00, 0x00};
+ enum batadv_dhcp_recipient dhcp_rcp = BATADV_DHCP_NO;
+ u8 *dst_hint = NULL, chaddr[ETH_ALEN];
+ struct vlan_ethhdr *vhdr;
+ unsigned int header_len = 0;
+ int data_len = skb->len, ret;
+ unsigned long brd_delay = 0;
+ bool do_bcast = false, client_added;
+ unsigned short vid;
+ u32 seqno;
+ int gw_mode;
+ enum batadv_forw_mode forw_mode = BATADV_FORW_BCAST;
+ int mcast_is_routable = 0;
+ int network_offset = ETH_HLEN;
+ __be16 proto;
+
+ if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE)
+ goto dropped;
+
+ /* reset control block to avoid left overs from previous users */
+ memset(skb->cb, 0, sizeof(struct batadv_skb_cb));
+
+ netif_trans_update(mesh_iface);
+ vid = batadv_get_vid(skb, 0);
+
+ skb_reset_mac_header(skb);
+ ethhdr = eth_hdr(skb);
+
+ proto = ethhdr->h_proto;
+
+ switch (ntohs(proto)) {
+ case ETH_P_8021Q:
+ if (!pskb_may_pull(skb, sizeof(*vhdr)))
+ goto dropped;
+ vhdr = vlan_eth_hdr(skb);
+ proto = vhdr->h_vlan_encapsulated_proto;
+
+ /* drop batman-in-batman packets to prevent loops */
+ if (proto != htons(ETH_P_BATMAN)) {
+ network_offset += VLAN_HLEN;
+ break;
+ }
+
+ fallthrough;
+ case ETH_P_BATMAN:
+ goto dropped;
+ }
+
+ skb_set_network_header(skb, network_offset);
+
+ if (batadv_bla_tx(bat_priv, skb, vid))
+ goto dropped;
+
+ /* skb->data might have been reallocated by batadv_bla_tx() */
+ ethhdr = eth_hdr(skb);
+
+ /* Register the client MAC in the transtable */
+ if (!is_multicast_ether_addr(ethhdr->h_source) &&
+ !batadv_bla_is_loopdetect_mac(ethhdr->h_source)) {
+ client_added = batadv_tt_local_add(mesh_iface, ethhdr->h_source,
+ vid, skb->skb_iif,
+ skb->mark);
+ if (!client_added)
+ goto dropped;
+ }
+
+ /* Snoop address candidates from DHCPACKs for early DAT filling */
+ batadv_dat_snoop_outgoing_dhcp_ack(bat_priv, skb, proto, vid);
+
+ /* don't accept stp packets. STP does not help in meshes.
+ * better use the bridge loop avoidance ...
+ *
+ * The same goes for ECTP sent at least by some Cisco Switches,
+ * it might confuse the mesh when used with bridge loop avoidance.
+ */
+ if (batadv_compare_eth(ethhdr->h_dest, stp_addr))
+ goto dropped;
+
+ if (batadv_compare_eth(ethhdr->h_dest, ectp_addr))
+ goto dropped;
+
+ gw_mode = atomic_read(&bat_priv->gw.mode);
+ if (is_multicast_ether_addr(ethhdr->h_dest)) {
+ /* if gw mode is off, broadcast every packet */
+ if (gw_mode == BATADV_GW_MODE_OFF) {
+ do_bcast = true;
+ goto send;
+ }
+
+ dhcp_rcp = batadv_gw_dhcp_recipient_get(skb, &header_len,
+ chaddr);
+ /* skb->data may have been modified by
+ * batadv_gw_dhcp_recipient_get()
+ */
+ ethhdr = eth_hdr(skb);
+ /* if gw_mode is on, broadcast any non-DHCP message.
+ * All the DHCP packets are going to be sent as unicast
+ */
+ if (dhcp_rcp == BATADV_DHCP_NO) {
+ do_bcast = true;
+ goto send;
+ }
+
+ if (dhcp_rcp == BATADV_DHCP_TO_CLIENT)
+ dst_hint = chaddr;
+ else if ((gw_mode == BATADV_GW_MODE_SERVER) &&
+ (dhcp_rcp == BATADV_DHCP_TO_SERVER))
+ /* gateways should not forward any DHCP message if
+ * directed to a DHCP server
+ */
+ goto dropped;
+
+send:
+ if (do_bcast && !is_broadcast_ether_addr(ethhdr->h_dest)) {
+ forw_mode = batadv_mcast_forw_mode(bat_priv, skb, vid,
+ &mcast_is_routable);
+ switch (forw_mode) {
+ case BATADV_FORW_BCAST:
+ break;
+ case BATADV_FORW_UCASTS:
+ case BATADV_FORW_MCAST:
+ do_bcast = false;
+ break;
+ case BATADV_FORW_NONE:
+ fallthrough;
+ default:
+ goto dropped;
+ }
+ }
+ }
+
+ batadv_skb_set_priority(skb, 0);
+
+ /* ethernet packet should be broadcasted */
+ if (do_bcast) {
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (!primary_if)
+ goto dropped;
+
+ /* in case of ARP request, we do not immediately broadcasti the
+ * packet, instead we first wait for DAT to try to retrieve the
+ * correct ARP entry
+ */
+ if (batadv_dat_snoop_outgoing_arp_request(bat_priv, skb))
+ brd_delay = msecs_to_jiffies(ARP_REQ_DELAY);
+
+ if (batadv_skb_head_push(skb, sizeof(*bcast_packet)) < 0)
+ goto dropped;
+
+ bcast_packet = (struct batadv_bcast_packet *)skb->data;
+ bcast_packet->version = BATADV_COMPAT_VERSION;
+ bcast_packet->ttl = BATADV_TTL - 1;
+
+ /* batman packet type: broadcast */
+ bcast_packet->packet_type = BATADV_BCAST;
+ bcast_packet->reserved = 0;
+
+ /* hw address of first interface is the orig mac because only
+ * this mac is known throughout the mesh
+ */
+ ether_addr_copy(bcast_packet->orig,
+ primary_if->net_dev->dev_addr);
+
+ /* set broadcast sequence number */
+ seqno = atomic_inc_return(&bat_priv->bcast_seqno);
+ bcast_packet->seqno = htonl(seqno);
+
+ batadv_send_bcast_packet(bat_priv, skb, brd_delay, true);
+ /* unicast packet */
+ } else {
+ /* DHCP packets going to a server will use the GW feature */
+ if (dhcp_rcp == BATADV_DHCP_TO_SERVER) {
+ ret = batadv_gw_out_of_range(bat_priv, skb);
+ if (ret)
+ goto dropped;
+ ret = batadv_send_skb_via_gw(bat_priv, skb, vid);
+ } else if (forw_mode == BATADV_FORW_UCASTS) {
+ ret = batadv_mcast_forw_send(bat_priv, skb, vid,
+ mcast_is_routable);
+ } else if (forw_mode == BATADV_FORW_MCAST) {
+ ret = batadv_mcast_forw_mcsend(bat_priv, skb);
+ } else {
+ if (batadv_dat_snoop_outgoing_arp_request(bat_priv,
+ skb))
+ goto dropped;
+
+ batadv_dat_snoop_outgoing_arp_reply(bat_priv, skb);
+
+ ret = batadv_send_skb_via_tt(bat_priv, skb, dst_hint,
+ vid);
+ }
+ if (ret != NET_XMIT_SUCCESS)
+ goto dropped_freed;
+ }
+
+ batadv_inc_counter(bat_priv, BATADV_CNT_TX);
+ batadv_add_counter(bat_priv, BATADV_CNT_TX_BYTES, data_len);
+ goto end;
+
+dropped:
+ kfree_skb(skb);
+dropped_freed:
+ batadv_inc_counter(bat_priv, BATADV_CNT_TX_DROPPED);
+end:
+ batadv_hardif_put(primary_if);
+ return NETDEV_TX_OK;
+}
+
+/**
+ * batadv_interface_rx() - receive ethernet frame on local batman-adv interface
+ * @mesh_iface: local interface which will receive the ethernet frame
+ * @skb: ethernet frame for @mesh_iface
+ * @hdr_size: size of already parsed batman-adv header
+ * @orig_node: originator from which the batman-adv packet was sent
+ *
+ * Sends an ethernet frame to the receive path of the local @mesh_iface.
+ * skb->data has still point to the batman-adv header with the size @hdr_size.
+ * The caller has to have parsed this header already and made sure that at least
+ * @hdr_size bytes are still available for pull in @skb.
+ *
+ * The packet may still get dropped. This can happen when the encapsulated
+ * ethernet frame is invalid or contains again an batman-adv packet. Also
+ * unicast packets will be dropped directly when it was sent between two
+ * isolated clients.
+ */
+void batadv_interface_rx(struct net_device *mesh_iface,
+ struct sk_buff *skb, int hdr_size,
+ struct batadv_orig_node *orig_node)
+{
+ struct batadv_bcast_packet *batadv_bcast_packet;
+ struct batadv_priv *bat_priv = netdev_priv(mesh_iface);
+ struct vlan_ethhdr *vhdr;
+ struct ethhdr *ethhdr;
+ unsigned short vid;
+ int packet_type;
+
+ batadv_bcast_packet = (struct batadv_bcast_packet *)skb->data;
+ packet_type = batadv_bcast_packet->packet_type;
+
+ skb_pull_rcsum(skb, hdr_size);
+ skb_reset_mac_header(skb);
+
+ /* clean the netfilter state now that the batman-adv header has been
+ * removed
+ */
+ nf_reset_ct(skb);
+
+ if (unlikely(!pskb_may_pull(skb, ETH_HLEN)))
+ goto dropped;
+
+ vid = batadv_get_vid(skb, 0);
+ ethhdr = eth_hdr(skb);
+
+ switch (ntohs(ethhdr->h_proto)) {
+ case ETH_P_8021Q:
+ if (!pskb_may_pull(skb, VLAN_ETH_HLEN))
+ goto dropped;
+
+ vhdr = skb_vlan_eth_hdr(skb);
+
+ /* drop batman-in-batman packets to prevent loops */
+ if (vhdr->h_vlan_encapsulated_proto != htons(ETH_P_BATMAN))
+ break;
+
+ fallthrough;
+ case ETH_P_BATMAN:
+ goto dropped;
+ }
+
+ /* skb->dev & skb->pkt_type are set here */
+ skb->protocol = eth_type_trans(skb, mesh_iface);
+ skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
+
+ batadv_inc_counter(bat_priv, BATADV_CNT_RX);
+ batadv_add_counter(bat_priv, BATADV_CNT_RX_BYTES,
+ skb->len + ETH_HLEN);
+
+ /* Let the bridge loop avoidance check the packet. If will
+ * not handle it, we can safely push it up.
+ */
+ if (batadv_bla_rx(bat_priv, skb, vid, packet_type))
+ goto out;
+
+ if (orig_node)
+ batadv_tt_add_temporary_global_entry(bat_priv, orig_node,
+ ethhdr->h_source, vid);
+
+ if (is_multicast_ether_addr(ethhdr->h_dest)) {
+ /* set the mark on broadcast packets if AP isolation is ON and
+ * the packet is coming from an "isolated" client
+ */
+ if (batadv_vlan_ap_isola_get(bat_priv, vid) &&
+ batadv_tt_global_is_isolated(bat_priv, ethhdr->h_source,
+ vid)) {
+ /* save bits in skb->mark not covered by the mask and
+ * apply the mark on the rest
+ */
+ skb->mark &= ~bat_priv->isolation_mark_mask;
+ skb->mark |= bat_priv->isolation_mark;
+ }
+ } else if (batadv_is_ap_isolated(bat_priv, ethhdr->h_source,
+ ethhdr->h_dest, vid)) {
+ goto dropped;
+ }
+
+ netif_rx(skb);
+ goto out;
+
+dropped:
+ kfree_skb(skb);
+out:
+ return;
+}
+
+/**
+ * batadv_meshif_vlan_release() - release vlan from lists and queue for free
+ * after rcu grace period
+ * @ref: kref pointer of the vlan object
+ */
+void batadv_meshif_vlan_release(struct kref *ref)
+{
+ struct batadv_meshif_vlan *vlan;
+
+ vlan = container_of(ref, struct batadv_meshif_vlan, refcount);
+
+ spin_lock_bh(&vlan->bat_priv->meshif_vlan_list_lock);
+ hlist_del_rcu(&vlan->list);
+ spin_unlock_bh(&vlan->bat_priv->meshif_vlan_list_lock);
+
+ kfree_rcu(vlan, rcu);
+}
+
+/**
+ * batadv_meshif_vlan_get() - get the vlan object for a specific vid
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @vid: the identifier of the vlan object to retrieve
+ *
+ * Return: the private data of the vlan matching the vid passed as argument or
+ * NULL otherwise. The refcounter of the returned object is incremented by 1.
+ */
+struct batadv_meshif_vlan *batadv_meshif_vlan_get(struct batadv_priv *bat_priv,
+ unsigned short vid)
+{
+ struct batadv_meshif_vlan *vlan_tmp, *vlan = NULL;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(vlan_tmp, &bat_priv->meshif_vlan_list, list) {
+ if (vlan_tmp->vid != vid)
+ continue;
+
+ if (!kref_get_unless_zero(&vlan_tmp->refcount))
+ continue;
+
+ vlan = vlan_tmp;
+ break;
+ }
+ rcu_read_unlock();
+
+ return vlan;
+}
+
+/**
+ * batadv_meshif_create_vlan() - allocate the needed resources for a new vlan
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @vid: the VLAN identifier
+ *
+ * Return: 0 on success, a negative error otherwise.
+ */
+int batadv_meshif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid)
+{
+ struct batadv_meshif_vlan *vlan;
+
+ spin_lock_bh(&bat_priv->meshif_vlan_list_lock);
+
+ vlan = batadv_meshif_vlan_get(bat_priv, vid);
+ if (vlan) {
+ batadv_meshif_vlan_put(vlan);
+ spin_unlock_bh(&bat_priv->meshif_vlan_list_lock);
+ return -EEXIST;
+ }
+
+ vlan = kzalloc(sizeof(*vlan), GFP_ATOMIC);
+ if (!vlan) {
+ spin_unlock_bh(&bat_priv->meshif_vlan_list_lock);
+ return -ENOMEM;
+ }
+
+ vlan->bat_priv = bat_priv;
+ vlan->vid = vid;
+ kref_init(&vlan->refcount);
+
+ atomic_set(&vlan->ap_isolation, 0);
+
+ kref_get(&vlan->refcount);
+ hlist_add_head_rcu(&vlan->list, &bat_priv->meshif_vlan_list);
+ spin_unlock_bh(&bat_priv->meshif_vlan_list_lock);
+
+ /* add a new TT local entry. This one will be marked with the NOPURGE
+ * flag
+ */
+ batadv_tt_local_add(bat_priv->mesh_iface,
+ bat_priv->mesh_iface->dev_addr, vid,
+ BATADV_NULL_IFINDEX, BATADV_NO_MARK);
+
+ /* don't return reference to new meshif_vlan */
+ batadv_meshif_vlan_put(vlan);
+
+ return 0;
+}
+
+/**
+ * batadv_meshif_destroy_vlan() - remove and destroy a meshif_vlan object
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @vlan: the object to remove
+ */
+static void batadv_meshif_destroy_vlan(struct batadv_priv *bat_priv,
+ struct batadv_meshif_vlan *vlan)
+{
+ /* explicitly remove the associated TT local entry because it is marked
+ * with the NOPURGE flag
+ */
+ batadv_tt_local_remove(bat_priv, bat_priv->mesh_iface->dev_addr,
+ vlan->vid, "vlan interface destroyed", false);
+
+ batadv_meshif_vlan_put(vlan);
+}
+
+/**
+ * batadv_interface_add_vid() - ndo_add_vid API implementation
+ * @dev: the netdev of the mesh interface
+ * @proto: protocol of the vlan id
+ * @vid: identifier of the new vlan
+ *
+ * Set up all the internal structures for handling the new vlan on top of the
+ * mesh interface
+ *
+ * Return: 0 on success or a negative error code in case of failure.
+ */
+static int batadv_interface_add_vid(struct net_device *dev, __be16 proto,
+ unsigned short vid)
+{
+ struct batadv_priv *bat_priv = netdev_priv(dev);
+ struct batadv_meshif_vlan *vlan;
+
+ /* only 802.1Q vlans are supported.
+ * batman-adv does not know how to handle other types
+ */
+ if (proto != htons(ETH_P_8021Q))
+ return -EINVAL;
+
+ /* VID 0 is only used to indicate "priority tag" frames which only
+ * contain priority information and no VID. No management structures
+ * should be created for this VID and it should be handled like an
+ * untagged frame.
+ */
+ if (vid == 0)
+ return 0;
+
+ vid |= BATADV_VLAN_HAS_TAG;
+
+ /* if a new vlan is getting created and it already exists, it means that
+ * it was not deleted yet. batadv_meshif_vlan_get() increases the
+ * refcount in order to revive the object.
+ *
+ * if it does not exist then create it.
+ */
+ vlan = batadv_meshif_vlan_get(bat_priv, vid);
+ if (!vlan)
+ return batadv_meshif_create_vlan(bat_priv, vid);
+
+ /* add a new TT local entry. This one will be marked with the NOPURGE
+ * flag. This must be added again, even if the vlan object already
+ * exists, because the entry was deleted by kill_vid()
+ */
+ batadv_tt_local_add(bat_priv->mesh_iface,
+ bat_priv->mesh_iface->dev_addr, vid,
+ BATADV_NULL_IFINDEX, BATADV_NO_MARK);
+
+ return 0;
+}
+
+/**
+ * batadv_interface_kill_vid() - ndo_kill_vid API implementation
+ * @dev: the netdev of the mesh interface
+ * @proto: protocol of the vlan id
+ * @vid: identifier of the deleted vlan
+ *
+ * Destroy all the internal structures used to handle the vlan identified by vid
+ * on top of the mesh interface
+ *
+ * Return: 0 on success, -EINVAL if the specified prototype is not ETH_P_8021Q
+ * or -ENOENT if the specified vlan id wasn't registered.
+ */
+static int batadv_interface_kill_vid(struct net_device *dev, __be16 proto,
+ unsigned short vid)
+{
+ struct batadv_priv *bat_priv = netdev_priv(dev);
+ struct batadv_meshif_vlan *vlan;
+
+ /* only 802.1Q vlans are supported. batman-adv does not know how to
+ * handle other types
+ */
+ if (proto != htons(ETH_P_8021Q))
+ return -EINVAL;
+
+ /* "priority tag" frames are handled like "untagged" frames
+ * and no meshif_vlan needs to be destroyed
+ */
+ if (vid == 0)
+ return 0;
+
+ vlan = batadv_meshif_vlan_get(bat_priv, vid | BATADV_VLAN_HAS_TAG);
+ if (!vlan)
+ return -ENOENT;
+
+ batadv_meshif_destroy_vlan(bat_priv, vlan);
+
+ /* finally free the vlan object */
+ batadv_meshif_vlan_put(vlan);
+
+ return 0;
+}
+
+/* batman-adv network devices have devices nesting below it and are a special
+ * "super class" of normal network devices; split their locks off into a
+ * separate class since they always nest.
+ */
+static struct lock_class_key batadv_netdev_xmit_lock_key;
+static struct lock_class_key batadv_netdev_addr_lock_key;
+
+/**
+ * batadv_set_lockdep_class_one() - Set lockdep class for a single tx queue
+ * @dev: device which owns the tx queue
+ * @txq: tx queue to modify
+ * @_unused: always NULL
+ */
+static void batadv_set_lockdep_class_one(struct net_device *dev,
+ struct netdev_queue *txq,
+ void *_unused)
+{
+ lockdep_set_class(&txq->_xmit_lock, &batadv_netdev_xmit_lock_key);
+}
+
+/**
+ * batadv_set_lockdep_class() - Set txq and addr_list lockdep class
+ * @dev: network device to modify
+ */
+static void batadv_set_lockdep_class(struct net_device *dev)
+{
+ lockdep_set_class(&dev->addr_list_lock, &batadv_netdev_addr_lock_key);
+ netdev_for_each_tx_queue(dev, batadv_set_lockdep_class_one, NULL);
+}
+
+/**
+ * batadv_meshif_init_late() - late stage initialization of mesh interface
+ * @dev: registered network device to modify
+ *
+ * Return: error code on failures
+ */
+static int batadv_meshif_init_late(struct net_device *dev)
+{
+ struct batadv_priv *bat_priv;
+ u32 random_seqno;
+ int ret;
+ size_t cnt_len = sizeof(u64) * BATADV_CNT_NUM;
+
+ batadv_set_lockdep_class(dev);
+
+ bat_priv = netdev_priv(dev);
+ bat_priv->mesh_iface = dev;
+
+ /* batadv_interface_stats() needs to be available as soon as
+ * register_netdevice() has been called
+ */
+ bat_priv->bat_counters = __alloc_percpu(cnt_len, __alignof__(u64));
+ if (!bat_priv->bat_counters)
+ return -ENOMEM;
+
+ atomic_set(&bat_priv->aggregated_ogms, 1);
+ atomic_set(&bat_priv->bonding, 0);
+#ifdef CONFIG_BATMAN_ADV_BLA
+ atomic_set(&bat_priv->bridge_loop_avoidance, 1);
+#endif
+#ifdef CONFIG_BATMAN_ADV_DAT
+ atomic_set(&bat_priv->distributed_arp_table, 1);
+#endif
+#ifdef CONFIG_BATMAN_ADV_MCAST
+ atomic_set(&bat_priv->multicast_mode, 1);
+ atomic_set(&bat_priv->multicast_fanout, 16);
+ atomic_set(&bat_priv->mcast.num_want_all_unsnoopables, 0);
+ atomic_set(&bat_priv->mcast.num_want_all_ipv4, 0);
+ atomic_set(&bat_priv->mcast.num_want_all_ipv6, 0);
+ atomic_set(&bat_priv->mcast.num_no_mc_ptype_capa, 0);
+#endif
+ atomic_set(&bat_priv->gw.mode, BATADV_GW_MODE_OFF);
+ atomic_set(&bat_priv->gw.bandwidth_down, 100);
+ atomic_set(&bat_priv->gw.bandwidth_up, 20);
+ atomic_set(&bat_priv->orig_interval, 1000);
+ atomic_set(&bat_priv->hop_penalty, 30);
+#ifdef CONFIG_BATMAN_ADV_DEBUG
+ atomic_set(&bat_priv->log_level, 0);
+#endif
+ atomic_set(&bat_priv->fragmentation, 1);
+ atomic_set(&bat_priv->packet_size_max, BATADV_MAX_MTU);
+ atomic_set(&bat_priv->bcast_queue_left, BATADV_BCAST_QUEUE_LEN);
+ atomic_set(&bat_priv->batman_queue_left, BATADV_BATMAN_QUEUE_LEN);
+
+ atomic_set(&bat_priv->mesh_state, BATADV_MESH_INACTIVE);
+ atomic_set(&bat_priv->bcast_seqno, 1);
+ atomic_set(&bat_priv->tt.vn, 0);
+ atomic_set(&bat_priv->tt.ogm_append_cnt, 0);
+#ifdef CONFIG_BATMAN_ADV_BLA
+ atomic_set(&bat_priv->bla.num_requests, 0);
+#endif
+ atomic_set(&bat_priv->tp_num, 0);
+
+ WRITE_ONCE(bat_priv->tt.local_changes, 0);
+ bat_priv->tt.last_changeset = NULL;
+ bat_priv->tt.last_changeset_len = 0;
+ bat_priv->isolation_mark = 0;
+ bat_priv->isolation_mark_mask = 0;
+
+ /* randomize initial seqno to avoid collision */
+ get_random_bytes(&random_seqno, sizeof(random_seqno));
+ atomic_set(&bat_priv->frag_seqno, random_seqno);
+
+ bat_priv->primary_if = NULL;
+
+ if (!bat_priv->algo_ops) {
+ ret = batadv_algo_select(bat_priv, batadv_routing_algo);
+ if (ret < 0)
+ goto free_bat_counters;
+ }
+
+ ret = batadv_mesh_init(dev);
+ if (ret < 0)
+ goto free_bat_counters;
+
+ return 0;
+
+free_bat_counters:
+ free_percpu(bat_priv->bat_counters);
+ bat_priv->bat_counters = NULL;
+
+ return ret;
+}
+
+/**
+ * batadv_meshif_slave_add() - Add a slave interface to a batadv_mesh_interface
+ * @dev: batadv_mesh_interface used as master interface
+ * @slave_dev: net_device which should become the slave interface
+ * @extack: extended ACK report struct
+ *
+ * Return: 0 if successful or error otherwise.
+ */
+static int batadv_meshif_slave_add(struct net_device *dev,
+ struct net_device *slave_dev,
+ struct netlink_ext_ack *extack)
+{
+ struct batadv_hard_iface *hard_iface;
+ int ret = -EINVAL;
+
+ hard_iface = batadv_hardif_get_by_netdev(slave_dev);
+ if (!hard_iface || hard_iface->mesh_iface)
+ goto out;
+
+ ret = batadv_hardif_enable_interface(hard_iface, dev);
+
+out:
+ batadv_hardif_put(hard_iface);
+ return ret;
+}
+
+/**
+ * batadv_meshif_slave_del() - Delete a slave iface from a batadv_mesh_interface
+ * @dev: batadv_mesh_interface used as master interface
+ * @slave_dev: net_device which should be removed from the master interface
+ *
+ * Return: 0 if successful or error otherwise.
+ */
+static int batadv_meshif_slave_del(struct net_device *dev,
+ struct net_device *slave_dev)
+{
+ struct batadv_hard_iface *hard_iface;
+ int ret = -EINVAL;
+
+ hard_iface = batadv_hardif_get_by_netdev(slave_dev);
+
+ if (!hard_iface || hard_iface->mesh_iface != dev)
+ goto out;
+
+ batadv_hardif_disable_interface(hard_iface);
+ ret = 0;
+
+out:
+ batadv_hardif_put(hard_iface);
+ return ret;
+}
+
+static const struct net_device_ops batadv_netdev_ops = {
+ .ndo_init = batadv_meshif_init_late,
+ .ndo_get_stats = batadv_interface_stats,
+ .ndo_vlan_rx_add_vid = batadv_interface_add_vid,
+ .ndo_vlan_rx_kill_vid = batadv_interface_kill_vid,
+ .ndo_set_mac_address = batadv_interface_set_mac_addr,
+ .ndo_change_mtu = batadv_interface_change_mtu,
+ .ndo_set_rx_mode = batadv_interface_set_rx_mode,
+ .ndo_start_xmit = batadv_interface_tx,
+ .ndo_validate_addr = eth_validate_addr,
+ .ndo_add_slave = batadv_meshif_slave_add,
+ .ndo_del_slave = batadv_meshif_slave_del,
+};
+
+static void batadv_get_drvinfo(struct net_device *dev,
+ struct ethtool_drvinfo *info)
+{
+ strscpy(info->driver, "B.A.T.M.A.N. advanced", sizeof(info->driver));
+ strscpy(info->version, BATADV_SOURCE_VERSION, sizeof(info->version));
+ strscpy(info->fw_version, "N/A", sizeof(info->fw_version));
+ strscpy(info->bus_info, "batman", sizeof(info->bus_info));
+}
+
+/* Inspired by drivers/net/ethernet/dlink/sundance.c:1702
+ * Declare each description string in struct.name[] to get fixed sized buffer
+ * and compile time checking for strings longer than ETH_GSTRING_LEN.
+ */
+static const struct {
+ const char name[ETH_GSTRING_LEN];
+} batadv_counters_strings[] = {
+ { "tx" },
+ { "tx_bytes" },
+ { "tx_dropped" },
+ { "rx" },
+ { "rx_bytes" },
+ { "forward" },
+ { "forward_bytes" },
+ { "mgmt_tx" },
+ { "mgmt_tx_bytes" },
+ { "mgmt_rx" },
+ { "mgmt_rx_bytes" },
+ { "frag_tx" },
+ { "frag_tx_bytes" },
+ { "frag_rx" },
+ { "frag_rx_bytes" },
+ { "frag_fwd" },
+ { "frag_fwd_bytes" },
+ { "tt_request_tx" },
+ { "tt_request_rx" },
+ { "tt_response_tx" },
+ { "tt_response_rx" },
+ { "tt_roam_adv_tx" },
+ { "tt_roam_adv_rx" },
+#ifdef CONFIG_BATMAN_ADV_MCAST
+ { "mcast_tx" },
+ { "mcast_tx_bytes" },
+ { "mcast_tx_local" },
+ { "mcast_tx_local_bytes" },
+ { "mcast_rx" },
+ { "mcast_rx_bytes" },
+ { "mcast_rx_local" },
+ { "mcast_rx_local_bytes" },
+ { "mcast_fwd" },
+ { "mcast_fwd_bytes" },
+#endif
+#ifdef CONFIG_BATMAN_ADV_DAT
+ { "dat_get_tx" },
+ { "dat_get_rx" },
+ { "dat_put_tx" },
+ { "dat_put_rx" },
+ { "dat_cached_reply_tx" },
+#endif
+};
+
+static void batadv_get_strings(struct net_device *dev, u32 stringset, u8 *data)
+{
+ if (stringset == ETH_SS_STATS)
+ memcpy(data, batadv_counters_strings,
+ sizeof(batadv_counters_strings));
+}
+
+static void batadv_get_ethtool_stats(struct net_device *dev,
+ struct ethtool_stats *stats, u64 *data)
+{
+ struct batadv_priv *bat_priv = netdev_priv(dev);
+ int i;
+
+ for (i = 0; i < BATADV_CNT_NUM; i++)
+ data[i] = batadv_sum_counter(bat_priv, i);
+}
+
+static int batadv_get_sset_count(struct net_device *dev, int stringset)
+{
+ if (stringset == ETH_SS_STATS)
+ return BATADV_CNT_NUM;
+
+ return -EOPNOTSUPP;
+}
+
+static const struct ethtool_ops batadv_ethtool_ops = {
+ .get_drvinfo = batadv_get_drvinfo,
+ .get_link = ethtool_op_get_link,
+ .get_strings = batadv_get_strings,
+ .get_ethtool_stats = batadv_get_ethtool_stats,
+ .get_sset_count = batadv_get_sset_count,
+};
+
+/**
+ * batadv_meshif_free() - Deconstructor of batadv_mesh_interface
+ * @dev: Device to cleanup and remove
+ */
+static void batadv_meshif_free(struct net_device *dev)
+{
+ batadv_mesh_free(dev);
+
+ /* some scheduled RCU callbacks need the bat_priv struct to accomplish
+ * their tasks. Wait for them all to be finished before freeing the
+ * netdev and its private data (bat_priv)
+ */
+ rcu_barrier();
+}
+
+/**
+ * batadv_meshif_init_early() - early stage initialization of mesh interface
+ * @dev: registered network device to modify
+ */
+static void batadv_meshif_init_early(struct net_device *dev)
+{
+ ether_setup(dev);
+
+ dev->netdev_ops = &batadv_netdev_ops;
+ dev->needs_free_netdev = true;
+ dev->priv_destructor = batadv_meshif_free;
+ dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER;
+ dev->priv_flags |= IFF_NO_QUEUE;
+ dev->lltx = true;
+ dev->netns_immutable = true;
+
+ /* can't call min_mtu, because the needed variables
+ * have not been initialized yet
+ */
+ dev->mtu = ETH_DATA_LEN;
+ dev->max_mtu = BATADV_MAX_MTU;
+
+ /* generate random address */
+ eth_hw_addr_random(dev);
+
+ dev->ethtool_ops = &batadv_ethtool_ops;
+}
+
+/**
+ * batadv_meshif_validate() - validate configuration of new batadv link
+ * @tb: IFLA_INFO_DATA netlink attributes
+ * @data: enum batadv_ifla_attrs attributes
+ * @extack: extended ACK report struct
+ *
+ * Return: 0 if successful or error otherwise.
+ */
+static int batadv_meshif_validate(struct nlattr *tb[], struct nlattr *data[],
+ struct netlink_ext_ack *extack)
+{
+ struct batadv_algo_ops *algo_ops;
+
+ if (!data)
+ return 0;
+
+ if (data[IFLA_BATADV_ALGO_NAME]) {
+ algo_ops = batadv_algo_get(nla_data(data[IFLA_BATADV_ALGO_NAME]));
+ if (!algo_ops)
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/**
+ * batadv_meshif_newlink() - pre-initialize and register new batadv link
+ * @dev: network device to register
+ * @params: rtnl newlink parameters
+ * @extack: extended ACK report struct
+ *
+ * Return: 0 if successful or error otherwise.
+ */
+static int batadv_meshif_newlink(struct net_device *dev,
+ struct rtnl_newlink_params *params,
+ struct netlink_ext_ack *extack)
+{
+ struct batadv_priv *bat_priv = netdev_priv(dev);
+ struct nlattr **data = params->data;
+ const char *algo_name;
+ int err;
+
+ if (data && data[IFLA_BATADV_ALGO_NAME]) {
+ algo_name = nla_data(data[IFLA_BATADV_ALGO_NAME]);
+ err = batadv_algo_select(bat_priv, algo_name);
+ if (err)
+ return -EINVAL;
+ }
+
+ return register_netdevice(dev);
+}
+
+/**
+ * batadv_meshif_destroy_netlink() - deletion of batadv_mesh_interface via
+ * netlink
+ * @mesh_iface: the to-be-removed batman-adv interface
+ * @head: list pointer
+ */
+static void batadv_meshif_destroy_netlink(struct net_device *mesh_iface,
+ struct list_head *head)
+{
+ struct batadv_priv *bat_priv = netdev_priv(mesh_iface);
+ struct batadv_hard_iface *hard_iface;
+ struct batadv_meshif_vlan *vlan;
+
+ while (!list_empty(&mesh_iface->adj_list.lower)) {
+ hard_iface = netdev_adjacent_get_private(mesh_iface->adj_list.lower.next);
+ batadv_hardif_disable_interface(hard_iface);
+ }
+
+ /* destroy the "untagged" VLAN */
+ vlan = batadv_meshif_vlan_get(bat_priv, BATADV_NO_FLAGS);
+ if (vlan) {
+ batadv_meshif_destroy_vlan(bat_priv, vlan);
+ batadv_meshif_vlan_put(vlan);
+ }
+
+ unregister_netdevice_queue(mesh_iface, head);
+}
+
+/**
+ * batadv_meshif_is_valid() - Check whether device is a batadv mesh interface
+ * @net_dev: device which should be checked
+ *
+ * Return: true when net_dev is a batman-adv interface, false otherwise
+ */
+bool batadv_meshif_is_valid(const struct net_device *net_dev)
+{
+ if (net_dev->netdev_ops->ndo_start_xmit == batadv_interface_tx)
+ return true;
+
+ return false;
+}
+
+static const struct nla_policy batadv_ifla_policy[IFLA_BATADV_MAX + 1] = {
+ [IFLA_BATADV_ALGO_NAME] = { .type = NLA_NUL_STRING },
+};
+
+struct rtnl_link_ops batadv_link_ops __read_mostly = {
+ .kind = "batadv",
+ .priv_size = sizeof(struct batadv_priv),
+ .setup = batadv_meshif_init_early,
+ .maxtype = IFLA_BATADV_MAX,
+ .policy = batadv_ifla_policy,
+ .validate = batadv_meshif_validate,
+ .newlink = batadv_meshif_newlink,
+ .dellink = batadv_meshif_destroy_netlink,
+};
diff --git a/net/batman-adv/mesh-interface.h b/net/batman-adv/mesh-interface.h
new file mode 100644
index 000000000000..53756c5a45e0
--- /dev/null
+++ b/net/batman-adv/mesh-interface.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner
+ */
+
+#ifndef _NET_BATMAN_ADV_MESH_INTERFACE_H_
+#define _NET_BATMAN_ADV_MESH_INTERFACE_H_
+
+#include "main.h"
+
+#include <linux/kref.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/types.h>
+
+int batadv_skb_head_push(struct sk_buff *skb, unsigned int len);
+void batadv_interface_rx(struct net_device *mesh_iface,
+ struct sk_buff *skb, int hdr_size,
+ struct batadv_orig_node *orig_node);
+bool batadv_meshif_is_valid(const struct net_device *net_dev);
+extern struct rtnl_link_ops batadv_link_ops;
+int batadv_meshif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid);
+void batadv_meshif_vlan_release(struct kref *ref);
+struct batadv_meshif_vlan *batadv_meshif_vlan_get(struct batadv_priv *bat_priv,
+ unsigned short vid);
+
+/**
+ * batadv_meshif_vlan_put() - decrease the vlan object refcounter and
+ * possibly release it
+ * @vlan: the vlan object to release
+ */
+static inline void batadv_meshif_vlan_put(struct batadv_meshif_vlan *vlan)
+{
+ if (!vlan)
+ return;
+
+ kref_put(&vlan->refcount, batadv_meshif_vlan_release);
+}
+
+#endif /* _NET_BATMAN_ADV_MESH_INTERFACE_H_ */
diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c
new file mode 100644
index 000000000000..e8c6b0bf670f
--- /dev/null
+++ b/net/batman-adv/multicast.c
@@ -0,0 +1,2195 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) B.A.T.M.A.N. contributors:
+ *
+ * Linus Lüssing
+ */
+
+#include "multicast.h"
+#include "main.h"
+
+#include <linux/atomic.h>
+#include <linux/bitops.h>
+#include <linux/bug.h>
+#include <linux/byteorder/generic.h>
+#include <linux/container_of.h>
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/etherdevice.h>
+#include <linux/gfp.h>
+#include <linux/icmpv6.h>
+#include <linux/if_bridge.h>
+#include <linux/if_ether.h>
+#include <linux/igmp.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/inetdevice.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/jiffies.h>
+#include <linux/list.h>
+#include <linux/lockdep.h>
+#include <linux/netdevice.h>
+#include <linux/netlink.h>
+#include <linux/printk.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/sprintf.h>
+#include <linux/stddef.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+#include <net/addrconf.h>
+#include <net/genetlink.h>
+#include <net/if_inet6.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/netlink.h>
+#include <uapi/linux/batadv_packet.h>
+#include <uapi/linux/batman_adv.h>
+
+#include "bridge_loop_avoidance.h"
+#include "hard-interface.h"
+#include "hash.h"
+#include "log.h"
+#include "netlink.h"
+#include "send.h"
+#include "translation-table.h"
+#include "tvlv.h"
+
+static void batadv_mcast_mla_update(struct work_struct *work);
+
+/**
+ * batadv_mcast_start_timer() - schedule the multicast periodic worker
+ * @bat_priv: the bat priv with all the mesh interface information
+ */
+static void batadv_mcast_start_timer(struct batadv_priv *bat_priv)
+{
+ queue_delayed_work(batadv_event_workqueue, &bat_priv->mcast.work,
+ msecs_to_jiffies(BATADV_MCAST_WORK_PERIOD));
+}
+
+/**
+ * batadv_mcast_get_bridge() - get the bridge on top of the meshif if it exists
+ * @mesh_iface: netdev struct of the mesh interface
+ *
+ * If the given mesh interface has a bridge on top then the refcount
+ * of the according net device is increased.
+ *
+ * Return: NULL if no such bridge exists. Otherwise the net device of the
+ * bridge.
+ */
+static struct net_device *batadv_mcast_get_bridge(struct net_device *mesh_iface)
+{
+ struct net_device *upper = mesh_iface;
+
+ rcu_read_lock();
+ do {
+ upper = netdev_master_upper_dev_get_rcu(upper);
+ } while (upper && !netif_is_bridge_master(upper));
+
+ dev_hold(upper);
+ rcu_read_unlock();
+
+ return upper;
+}
+
+/**
+ * batadv_mcast_mla_rtr_flags_meshif_get_ipv4() - get mcast router flags from
+ * node for IPv4
+ * @dev: the interface to check
+ *
+ * Checks the presence of an IPv4 multicast router on this node.
+ *
+ * Caller needs to hold rcu read lock.
+ *
+ * Return: BATADV_NO_FLAGS if present, BATADV_MCAST_WANT_NO_RTR4 otherwise.
+ */
+static u8 batadv_mcast_mla_rtr_flags_meshif_get_ipv4(struct net_device *dev)
+{
+ struct in_device *in_dev = __in_dev_get_rcu(dev);
+
+ if (in_dev && IN_DEV_MFORWARD(in_dev))
+ return BATADV_NO_FLAGS;
+ else
+ return BATADV_MCAST_WANT_NO_RTR4;
+}
+
+/**
+ * batadv_mcast_mla_rtr_flags_meshif_get_ipv6() - get mcast router flags from
+ * node for IPv6
+ * @dev: the interface to check
+ *
+ * Checks the presence of an IPv6 multicast router on this node.
+ *
+ * Caller needs to hold rcu read lock.
+ *
+ * Return: BATADV_NO_FLAGS if present, BATADV_MCAST_WANT_NO_RTR6 otherwise.
+ */
+#if IS_ENABLED(CONFIG_IPV6_MROUTE)
+static u8 batadv_mcast_mla_rtr_flags_meshif_get_ipv6(struct net_device *dev)
+{
+ struct inet6_dev *in6_dev = __in6_dev_get(dev);
+
+ if (in6_dev && atomic_read(&in6_dev->cnf.mc_forwarding))
+ return BATADV_NO_FLAGS;
+ else
+ return BATADV_MCAST_WANT_NO_RTR6;
+}
+#else
+static inline u8
+batadv_mcast_mla_rtr_flags_meshif_get_ipv6(struct net_device *dev)
+{
+ return BATADV_MCAST_WANT_NO_RTR6;
+}
+#endif
+
+/**
+ * batadv_mcast_mla_rtr_flags_meshif_get() - get mcast router flags from node
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @bridge: bridge interface on top of the mesh_iface if present,
+ * otherwise pass NULL
+ *
+ * Checks the presence of IPv4 and IPv6 multicast routers on this
+ * node.
+ *
+ * Return:
+ * BATADV_NO_FLAGS: Both an IPv4 and IPv6 multicast router is present
+ * BATADV_MCAST_WANT_NO_RTR4: No IPv4 multicast router is present
+ * BATADV_MCAST_WANT_NO_RTR6: No IPv6 multicast router is present
+ * The former two OR'd: no multicast router is present
+ */
+static u8 batadv_mcast_mla_rtr_flags_meshif_get(struct batadv_priv *bat_priv,
+ struct net_device *bridge)
+{
+ struct net_device *dev = bridge ? bridge : bat_priv->mesh_iface;
+ u8 flags = BATADV_NO_FLAGS;
+
+ rcu_read_lock();
+
+ flags |= batadv_mcast_mla_rtr_flags_meshif_get_ipv4(dev);
+ flags |= batadv_mcast_mla_rtr_flags_meshif_get_ipv6(dev);
+
+ rcu_read_unlock();
+
+ return flags;
+}
+
+/**
+ * batadv_mcast_mla_rtr_flags_bridge_get() - get mcast router flags from bridge
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @bridge: bridge interface on top of the mesh_iface if present,
+ * otherwise pass NULL
+ *
+ * Checks the presence of IPv4 and IPv6 multicast routers behind a bridge.
+ *
+ * Return:
+ * BATADV_NO_FLAGS: Both an IPv4 and IPv6 multicast router is present
+ * BATADV_MCAST_WANT_NO_RTR4: No IPv4 multicast router is present
+ * BATADV_MCAST_WANT_NO_RTR6: No IPv6 multicast router is present
+ * The former two OR'd: no multicast router is present
+ */
+static u8 batadv_mcast_mla_rtr_flags_bridge_get(struct batadv_priv *bat_priv,
+ struct net_device *bridge)
+{
+ struct net_device *dev = bat_priv->mesh_iface;
+ u8 flags = BATADV_NO_FLAGS;
+
+ if (!bridge)
+ return BATADV_MCAST_WANT_NO_RTR4 | BATADV_MCAST_WANT_NO_RTR6;
+
+ if (!br_multicast_has_router_adjacent(dev, ETH_P_IP))
+ flags |= BATADV_MCAST_WANT_NO_RTR4;
+ if (!br_multicast_has_router_adjacent(dev, ETH_P_IPV6))
+ flags |= BATADV_MCAST_WANT_NO_RTR6;
+
+ return flags;
+}
+
+/**
+ * batadv_mcast_mla_rtr_flags_get() - get multicast router flags
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @bridge: bridge interface on top of the mesh_iface if present,
+ * otherwise pass NULL
+ *
+ * Checks the presence of IPv4 and IPv6 multicast routers on this
+ * node or behind its bridge.
+ *
+ * Return:
+ * BATADV_NO_FLAGS: Both an IPv4 and IPv6 multicast router is present
+ * BATADV_MCAST_WANT_NO_RTR4: No IPv4 multicast router is present
+ * BATADV_MCAST_WANT_NO_RTR6: No IPv6 multicast router is present
+ * The former two OR'd: no multicast router is present
+ */
+static u8 batadv_mcast_mla_rtr_flags_get(struct batadv_priv *bat_priv,
+ struct net_device *bridge)
+{
+ u8 flags = BATADV_MCAST_WANT_NO_RTR4 | BATADV_MCAST_WANT_NO_RTR6;
+
+ flags &= batadv_mcast_mla_rtr_flags_meshif_get(bat_priv, bridge);
+ flags &= batadv_mcast_mla_rtr_flags_bridge_get(bat_priv, bridge);
+
+ return flags;
+}
+
+/**
+ * batadv_mcast_mla_forw_flags_get() - get multicast forwarding flags
+ * @bat_priv: the bat priv with all the mesh interface information
+ *
+ * Checks if all active hard interfaces have an MTU larger or equal to 1280
+ * bytes (IPv6 minimum MTU).
+ *
+ * Return: BATADV_MCAST_HAVE_MC_PTYPE_CAPA if yes, BATADV_NO_FLAGS otherwise.
+ */
+static u8 batadv_mcast_mla_forw_flags_get(struct batadv_priv *bat_priv)
+{
+ const struct batadv_hard_iface *hard_iface;
+ struct list_head *iter;
+
+ rcu_read_lock();
+ netdev_for_each_lower_private_rcu(bat_priv->mesh_iface, hard_iface, iter) {
+ if (hard_iface->if_status != BATADV_IF_ACTIVE)
+ continue;
+
+ if (hard_iface->net_dev->mtu < IPV6_MIN_MTU) {
+ rcu_read_unlock();
+ return BATADV_NO_FLAGS;
+ }
+ }
+ rcu_read_unlock();
+
+ return BATADV_MCAST_HAVE_MC_PTYPE_CAPA;
+}
+
+/**
+ * batadv_mcast_mla_flags_get() - get the new multicast flags
+ * @bat_priv: the bat priv with all the mesh interface information
+ *
+ * Return: A set of flags for the current/next TVLV, querier and
+ * bridge state.
+ */
+static struct batadv_mcast_mla_flags
+batadv_mcast_mla_flags_get(struct batadv_priv *bat_priv)
+{
+ struct net_device *dev = bat_priv->mesh_iface;
+ struct batadv_mcast_querier_state *qr4, *qr6;
+ struct batadv_mcast_mla_flags mla_flags;
+ struct net_device *bridge;
+
+ bridge = batadv_mcast_get_bridge(dev);
+
+ memset(&mla_flags, 0, sizeof(mla_flags));
+ mla_flags.enabled = 1;
+ mla_flags.tvlv_flags |= batadv_mcast_mla_rtr_flags_get(bat_priv,
+ bridge);
+ mla_flags.tvlv_flags |= batadv_mcast_mla_forw_flags_get(bat_priv);
+
+ if (!bridge)
+ return mla_flags;
+
+ dev_put(bridge);
+
+ mla_flags.bridged = 1;
+ qr4 = &mla_flags.querier_ipv4;
+ qr6 = &mla_flags.querier_ipv6;
+
+ if (!IS_ENABLED(CONFIG_BRIDGE_IGMP_SNOOPING))
+ pr_warn_once("No bridge IGMP snooping compiled - multicast optimizations disabled\n");
+
+ qr4->exists = br_multicast_has_querier_anywhere(dev, ETH_P_IP);
+ qr4->shadowing = br_multicast_has_querier_adjacent(dev, ETH_P_IP);
+
+ qr6->exists = br_multicast_has_querier_anywhere(dev, ETH_P_IPV6);
+ qr6->shadowing = br_multicast_has_querier_adjacent(dev, ETH_P_IPV6);
+
+ mla_flags.tvlv_flags |= BATADV_MCAST_WANT_ALL_UNSNOOPABLES;
+
+ /* 1) If no querier exists at all, then multicast listeners on
+ * our local TT clients behind the bridge will keep silent.
+ * 2) If the selected querier is on one of our local TT clients,
+ * behind the bridge, then this querier might shadow multicast
+ * listeners on our local TT clients, behind this bridge.
+ *
+ * In both cases, we will signalize other batman nodes that
+ * we need all multicast traffic of the according protocol.
+ */
+ if (!qr4->exists || qr4->shadowing) {
+ mla_flags.tvlv_flags |= BATADV_MCAST_WANT_ALL_IPV4;
+ mla_flags.tvlv_flags &= ~BATADV_MCAST_WANT_NO_RTR4;
+ }
+
+ if (!qr6->exists || qr6->shadowing) {
+ mla_flags.tvlv_flags |= BATADV_MCAST_WANT_ALL_IPV6;
+ mla_flags.tvlv_flags &= ~BATADV_MCAST_WANT_NO_RTR6;
+ }
+
+ return mla_flags;
+}
+
+/**
+ * batadv_mcast_mla_is_duplicate() - check whether an address is in a list
+ * @mcast_addr: the multicast address to check
+ * @mcast_list: the list with multicast addresses to search in
+ *
+ * Return: true if the given address is already in the given list.
+ * Otherwise returns false.
+ */
+static bool batadv_mcast_mla_is_duplicate(u8 *mcast_addr,
+ struct hlist_head *mcast_list)
+{
+ struct batadv_hw_addr *mcast_entry;
+
+ hlist_for_each_entry(mcast_entry, mcast_list, list)
+ if (batadv_compare_eth(mcast_entry->addr, mcast_addr))
+ return true;
+
+ return false;
+}
+
+/**
+ * batadv_mcast_mla_meshif_get_ipv4() - get meshif IPv4 multicast listeners
+ * @dev: the device to collect multicast addresses from
+ * @mcast_list: a list to put found addresses into
+ * @flags: flags indicating the new multicast state
+ *
+ * Collects multicast addresses of IPv4 multicast listeners residing
+ * on this kernel on the given mesh interface, dev, in
+ * the given mcast_list. In general, multicast listeners provided by
+ * your multicast receiving applications run directly on this node.
+ *
+ * Return: -ENOMEM on memory allocation error or the number of
+ * items added to the mcast_list otherwise.
+ */
+static int
+batadv_mcast_mla_meshif_get_ipv4(struct net_device *dev,
+ struct hlist_head *mcast_list,
+ struct batadv_mcast_mla_flags *flags)
+{
+ struct batadv_hw_addr *new;
+ struct in_device *in_dev;
+ u8 mcast_addr[ETH_ALEN];
+ struct ip_mc_list *pmc;
+ int ret = 0;
+
+ if (flags->tvlv_flags & BATADV_MCAST_WANT_ALL_IPV4)
+ return 0;
+
+ rcu_read_lock();
+
+ in_dev = __in_dev_get_rcu(dev);
+ if (!in_dev) {
+ rcu_read_unlock();
+ return 0;
+ }
+
+ for (pmc = rcu_dereference(in_dev->mc_list); pmc;
+ pmc = rcu_dereference(pmc->next_rcu)) {
+ if (flags->tvlv_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES &&
+ ipv4_is_local_multicast(pmc->multiaddr))
+ continue;
+
+ if (!(flags->tvlv_flags & BATADV_MCAST_WANT_NO_RTR4) &&
+ !ipv4_is_local_multicast(pmc->multiaddr))
+ continue;
+
+ ip_eth_mc_map(pmc->multiaddr, mcast_addr);
+
+ if (batadv_mcast_mla_is_duplicate(mcast_addr, mcast_list))
+ continue;
+
+ new = kmalloc(sizeof(*new), GFP_ATOMIC);
+ if (!new) {
+ ret = -ENOMEM;
+ break;
+ }
+
+ ether_addr_copy(new->addr, mcast_addr);
+ hlist_add_head(&new->list, mcast_list);
+ ret++;
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+
+/**
+ * batadv_mcast_mla_meshif_get_ipv6() - get meshif IPv6 multicast listeners
+ * @dev: the device to collect multicast addresses from
+ * @mcast_list: a list to put found addresses into
+ * @flags: flags indicating the new multicast state
+ *
+ * Collects multicast addresses of IPv6 multicast listeners residing
+ * on this kernel on the given mesh interface, dev, in
+ * the given mcast_list. In general, multicast listeners provided by
+ * your multicast receiving applications run directly on this node.
+ *
+ * Return: -ENOMEM on memory allocation error or the number of
+ * items added to the mcast_list otherwise.
+ */
+#if IS_ENABLED(CONFIG_IPV6)
+static int
+batadv_mcast_mla_meshif_get_ipv6(struct net_device *dev,
+ struct hlist_head *mcast_list,
+ struct batadv_mcast_mla_flags *flags)
+{
+ struct batadv_hw_addr *new;
+ struct inet6_dev *in6_dev;
+ u8 mcast_addr[ETH_ALEN];
+ struct ifmcaddr6 *pmc6;
+ int ret = 0;
+
+ if (flags->tvlv_flags & BATADV_MCAST_WANT_ALL_IPV6)
+ return 0;
+
+ rcu_read_lock();
+
+ in6_dev = __in6_dev_get(dev);
+ if (!in6_dev) {
+ rcu_read_unlock();
+ return 0;
+ }
+
+ for (pmc6 = rcu_dereference(in6_dev->mc_list);
+ pmc6;
+ pmc6 = rcu_dereference(pmc6->next)) {
+ if (IPV6_ADDR_MC_SCOPE(&pmc6->mca_addr) <
+ IPV6_ADDR_SCOPE_LINKLOCAL)
+ continue;
+
+ if (flags->tvlv_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES &&
+ ipv6_addr_is_ll_all_nodes(&pmc6->mca_addr))
+ continue;
+
+ if (!(flags->tvlv_flags & BATADV_MCAST_WANT_NO_RTR6) &&
+ IPV6_ADDR_MC_SCOPE(&pmc6->mca_addr) >
+ IPV6_ADDR_SCOPE_LINKLOCAL)
+ continue;
+
+ ipv6_eth_mc_map(&pmc6->mca_addr, mcast_addr);
+
+ if (batadv_mcast_mla_is_duplicate(mcast_addr, mcast_list))
+ continue;
+
+ new = kmalloc(sizeof(*new), GFP_ATOMIC);
+ if (!new) {
+ ret = -ENOMEM;
+ break;
+ }
+
+ ether_addr_copy(new->addr, mcast_addr);
+ hlist_add_head(&new->list, mcast_list);
+ ret++;
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+#else
+static inline int
+batadv_mcast_mla_meshif_get_ipv6(struct net_device *dev,
+ struct hlist_head *mcast_list,
+ struct batadv_mcast_mla_flags *flags)
+{
+ return 0;
+}
+#endif
+
+/**
+ * batadv_mcast_mla_meshif_get() - get meshif multicast listeners
+ * @dev: the device to collect multicast addresses from
+ * @mcast_list: a list to put found addresses into
+ * @flags: flags indicating the new multicast state
+ *
+ * Collects multicast addresses of multicast listeners residing
+ * on this kernel on the given mesh interface, dev, in
+ * the given mcast_list. In general, multicast listeners provided by
+ * your multicast receiving applications run directly on this node.
+ *
+ * If there is a bridge interface on top of dev, collect from that one
+ * instead. Just like with IP addresses and routes, multicast listeners
+ * will(/should) register to the bridge interface instead of an
+ * enslaved bat0.
+ *
+ * Return: -ENOMEM on memory allocation error or the number of
+ * items added to the mcast_list otherwise.
+ */
+static int
+batadv_mcast_mla_meshif_get(struct net_device *dev,
+ struct hlist_head *mcast_list,
+ struct batadv_mcast_mla_flags *flags)
+{
+ struct net_device *bridge = batadv_mcast_get_bridge(dev);
+ int ret4, ret6 = 0;
+
+ if (bridge)
+ dev = bridge;
+
+ ret4 = batadv_mcast_mla_meshif_get_ipv4(dev, mcast_list, flags);
+ if (ret4 < 0)
+ goto out;
+
+ ret6 = batadv_mcast_mla_meshif_get_ipv6(dev, mcast_list, flags);
+ if (ret6 < 0) {
+ ret4 = 0;
+ goto out;
+ }
+
+out:
+ dev_put(bridge);
+
+ return ret4 + ret6;
+}
+
+/**
+ * batadv_mcast_mla_br_addr_cpy() - copy a bridge multicast address
+ * @dst: destination to write to - a multicast MAC address
+ * @src: source to read from - a multicast IP address
+ *
+ * Converts a given multicast IPv4/IPv6 address from a bridge
+ * to its matching multicast MAC address and copies it into the given
+ * destination buffer.
+ *
+ * Caller needs to make sure the destination buffer can hold
+ * at least ETH_ALEN bytes.
+ */
+static void batadv_mcast_mla_br_addr_cpy(char *dst, const struct br_ip *src)
+{
+ if (src->proto == htons(ETH_P_IP))
+ ip_eth_mc_map(src->dst.ip4, dst);
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (src->proto == htons(ETH_P_IPV6))
+ ipv6_eth_mc_map(&src->dst.ip6, dst);
+#endif
+ else
+ eth_zero_addr(dst);
+}
+
+/**
+ * batadv_mcast_mla_bridge_get() - get bridged-in multicast listeners
+ * @dev: a bridge slave whose bridge to collect multicast addresses from
+ * @mcast_list: a list to put found addresses into
+ * @flags: flags indicating the new multicast state
+ *
+ * Collects multicast addresses of multicast listeners residing
+ * on foreign, non-mesh devices which we gave access to our mesh via
+ * a bridge on top of the given mesh interface, dev, in the given
+ * mcast_list.
+ *
+ * Return: -ENOMEM on memory allocation error or the number of
+ * items added to the mcast_list otherwise.
+ */
+static int batadv_mcast_mla_bridge_get(struct net_device *dev,
+ struct hlist_head *mcast_list,
+ struct batadv_mcast_mla_flags *flags)
+{
+ struct list_head bridge_mcast_list = LIST_HEAD_INIT(bridge_mcast_list);
+ struct br_ip_list *br_ip_entry, *tmp;
+ u8 tvlv_flags = flags->tvlv_flags;
+ struct batadv_hw_addr *new;
+ u8 mcast_addr[ETH_ALEN];
+ int ret;
+
+ /* we don't need to detect these devices/listeners, the IGMP/MLD
+ * snooping code of the Linux bridge already does that for us
+ */
+ ret = br_multicast_list_adjacent(dev, &bridge_mcast_list);
+ if (ret < 0)
+ goto out;
+
+ list_for_each_entry(br_ip_entry, &bridge_mcast_list, list) {
+ if (br_ip_entry->addr.proto == htons(ETH_P_IP)) {
+ if (tvlv_flags & BATADV_MCAST_WANT_ALL_IPV4)
+ continue;
+
+ if (tvlv_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES &&
+ ipv4_is_local_multicast(br_ip_entry->addr.dst.ip4))
+ continue;
+
+ if (!(tvlv_flags & BATADV_MCAST_WANT_NO_RTR4) &&
+ !ipv4_is_local_multicast(br_ip_entry->addr.dst.ip4))
+ continue;
+ }
+
+#if IS_ENABLED(CONFIG_IPV6)
+ if (br_ip_entry->addr.proto == htons(ETH_P_IPV6)) {
+ if (tvlv_flags & BATADV_MCAST_WANT_ALL_IPV6)
+ continue;
+
+ if (tvlv_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES &&
+ ipv6_addr_is_ll_all_nodes(&br_ip_entry->addr.dst.ip6))
+ continue;
+
+ if (!(tvlv_flags & BATADV_MCAST_WANT_NO_RTR6) &&
+ IPV6_ADDR_MC_SCOPE(&br_ip_entry->addr.dst.ip6) >
+ IPV6_ADDR_SCOPE_LINKLOCAL)
+ continue;
+ }
+#endif
+
+ batadv_mcast_mla_br_addr_cpy(mcast_addr, &br_ip_entry->addr);
+ if (batadv_mcast_mla_is_duplicate(mcast_addr, mcast_list))
+ continue;
+
+ new = kmalloc(sizeof(*new), GFP_ATOMIC);
+ if (!new) {
+ ret = -ENOMEM;
+ break;
+ }
+
+ ether_addr_copy(new->addr, mcast_addr);
+ hlist_add_head(&new->list, mcast_list);
+ }
+
+out:
+ list_for_each_entry_safe(br_ip_entry, tmp, &bridge_mcast_list, list) {
+ list_del(&br_ip_entry->list);
+ kfree(br_ip_entry);
+ }
+
+ return ret;
+}
+
+/**
+ * batadv_mcast_mla_list_free() - free a list of multicast addresses
+ * @mcast_list: the list to free
+ *
+ * Removes and frees all items in the given mcast_list.
+ */
+static void batadv_mcast_mla_list_free(struct hlist_head *mcast_list)
+{
+ struct batadv_hw_addr *mcast_entry;
+ struct hlist_node *tmp;
+
+ hlist_for_each_entry_safe(mcast_entry, tmp, mcast_list, list) {
+ hlist_del(&mcast_entry->list);
+ kfree(mcast_entry);
+ }
+}
+
+/**
+ * batadv_mcast_mla_tt_retract() - clean up multicast listener announcements
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @mcast_list: a list of addresses which should _not_ be removed
+ *
+ * Retracts the announcement of any multicast listener from the
+ * translation table except the ones listed in the given mcast_list.
+ *
+ * If mcast_list is NULL then all are retracted.
+ */
+static void batadv_mcast_mla_tt_retract(struct batadv_priv *bat_priv,
+ struct hlist_head *mcast_list)
+{
+ struct batadv_hw_addr *mcast_entry;
+ struct hlist_node *tmp;
+
+ hlist_for_each_entry_safe(mcast_entry, tmp, &bat_priv->mcast.mla_list,
+ list) {
+ if (mcast_list &&
+ batadv_mcast_mla_is_duplicate(mcast_entry->addr,
+ mcast_list))
+ continue;
+
+ batadv_tt_local_remove(bat_priv, mcast_entry->addr,
+ BATADV_NO_FLAGS,
+ "mcast TT outdated", false);
+
+ hlist_del(&mcast_entry->list);
+ kfree(mcast_entry);
+ }
+}
+
+/**
+ * batadv_mcast_mla_tt_add() - add multicast listener announcements
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @mcast_list: a list of addresses which are going to get added
+ *
+ * Adds multicast listener announcements from the given mcast_list to the
+ * translation table if they have not been added yet.
+ */
+static void batadv_mcast_mla_tt_add(struct batadv_priv *bat_priv,
+ struct hlist_head *mcast_list)
+{
+ struct batadv_hw_addr *mcast_entry;
+ struct hlist_node *tmp;
+
+ if (!mcast_list)
+ return;
+
+ hlist_for_each_entry_safe(mcast_entry, tmp, mcast_list, list) {
+ if (batadv_mcast_mla_is_duplicate(mcast_entry->addr,
+ &bat_priv->mcast.mla_list))
+ continue;
+
+ if (!batadv_tt_local_add(bat_priv->mesh_iface,
+ mcast_entry->addr, BATADV_NO_FLAGS,
+ BATADV_NULL_IFINDEX, BATADV_NO_MARK))
+ continue;
+
+ hlist_del(&mcast_entry->list);
+ hlist_add_head(&mcast_entry->list, &bat_priv->mcast.mla_list);
+ }
+}
+
+/**
+ * batadv_mcast_querier_log() - debug output regarding the querier status on
+ * link
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @str_proto: a string for the querier protocol (e.g. "IGMP" or "MLD")
+ * @old_state: the previous querier state on our link
+ * @new_state: the new querier state on our link
+ *
+ * Outputs debug messages to the logging facility with log level 'mcast'
+ * regarding changes to the querier status on the link which are relevant
+ * to our multicast optimizations.
+ *
+ * Usually this is about whether a querier appeared or vanished in
+ * our mesh or whether the querier is in the suboptimal position of being
+ * behind our local bridge segment: Snooping switches will directly
+ * forward listener reports to the querier, therefore batman-adv and
+ * the bridge will potentially not see these listeners - the querier is
+ * potentially shadowing listeners from us then.
+ *
+ * This is only interesting for nodes with a bridge on top of their
+ * mesh interface.
+ */
+static void
+batadv_mcast_querier_log(struct batadv_priv *bat_priv, char *str_proto,
+ struct batadv_mcast_querier_state *old_state,
+ struct batadv_mcast_querier_state *new_state)
+{
+ if (!old_state->exists && new_state->exists)
+ batadv_info(bat_priv->mesh_iface, "%s Querier appeared\n",
+ str_proto);
+ else if (old_state->exists && !new_state->exists)
+ batadv_info(bat_priv->mesh_iface,
+ "%s Querier disappeared - multicast optimizations disabled\n",
+ str_proto);
+ else if (!bat_priv->mcast.mla_flags.bridged && !new_state->exists)
+ batadv_info(bat_priv->mesh_iface,
+ "No %s Querier present - multicast optimizations disabled\n",
+ str_proto);
+
+ if (new_state->exists) {
+ if ((!old_state->shadowing && new_state->shadowing) ||
+ (!old_state->exists && new_state->shadowing))
+ batadv_dbg(BATADV_DBG_MCAST, bat_priv,
+ "%s Querier is behind our bridged segment: Might shadow listeners\n",
+ str_proto);
+ else if (old_state->shadowing && !new_state->shadowing)
+ batadv_dbg(BATADV_DBG_MCAST, bat_priv,
+ "%s Querier is not behind our bridged segment\n",
+ str_proto);
+ }
+}
+
+/**
+ * batadv_mcast_bridge_log() - debug output for topology changes in bridged
+ * setups
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @new_flags: flags indicating the new multicast state
+ *
+ * If no bridges are ever used on this node, then this function does nothing.
+ *
+ * Otherwise this function outputs debug information to the 'mcast' log level
+ * which might be relevant to our multicast optimizations.
+ *
+ * More precisely, it outputs information when a bridge interface is added or
+ * removed from a mesh interface. And when a bridge is present, it further
+ * outputs information about the querier state which is relevant for the
+ * multicast flags this node is going to set.
+ */
+static void
+batadv_mcast_bridge_log(struct batadv_priv *bat_priv,
+ struct batadv_mcast_mla_flags *new_flags)
+{
+ struct batadv_mcast_mla_flags *old_flags = &bat_priv->mcast.mla_flags;
+
+ if (!old_flags->bridged && new_flags->bridged)
+ batadv_dbg(BATADV_DBG_MCAST, bat_priv,
+ "Bridge added: Setting Unsnoopables(U)-flag\n");
+ else if (old_flags->bridged && !new_flags->bridged)
+ batadv_dbg(BATADV_DBG_MCAST, bat_priv,
+ "Bridge removed: Unsetting Unsnoopables(U)-flag\n");
+
+ if (new_flags->bridged) {
+ batadv_mcast_querier_log(bat_priv, "IGMP",
+ &old_flags->querier_ipv4,
+ &new_flags->querier_ipv4);
+ batadv_mcast_querier_log(bat_priv, "MLD",
+ &old_flags->querier_ipv6,
+ &new_flags->querier_ipv6);
+ }
+}
+
+/**
+ * batadv_mcast_flags_log() - output debug information about mcast flag changes
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @flags: TVLV flags indicating the new multicast state
+ *
+ * Whenever the multicast TVLV flags this node announces change, this function
+ * should be used to notify userspace about the change.
+ */
+static void batadv_mcast_flags_log(struct batadv_priv *bat_priv, u8 flags)
+{
+ bool old_enabled = bat_priv->mcast.mla_flags.enabled;
+ u8 old_flags = bat_priv->mcast.mla_flags.tvlv_flags;
+ char str_old_flags[] = "[.... . .]";
+
+ sprintf(str_old_flags, "[%c%c%c%s%s%c]",
+ (old_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) ? 'U' : '.',
+ (old_flags & BATADV_MCAST_WANT_ALL_IPV4) ? '4' : '.',
+ (old_flags & BATADV_MCAST_WANT_ALL_IPV6) ? '6' : '.',
+ !(old_flags & BATADV_MCAST_WANT_NO_RTR4) ? "R4" : ". ",
+ !(old_flags & BATADV_MCAST_WANT_NO_RTR6) ? "R6" : ". ",
+ !(old_flags & BATADV_MCAST_HAVE_MC_PTYPE_CAPA) ? 'P' : '.');
+
+ batadv_dbg(BATADV_DBG_MCAST, bat_priv,
+ "Changing multicast flags from '%s' to '[%c%c%c%s%s%c]'\n",
+ old_enabled ? str_old_flags : "<undefined>",
+ (flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) ? 'U' : '.',
+ (flags & BATADV_MCAST_WANT_ALL_IPV4) ? '4' : '.',
+ (flags & BATADV_MCAST_WANT_ALL_IPV6) ? '6' : '.',
+ !(flags & BATADV_MCAST_WANT_NO_RTR4) ? "R4" : ". ",
+ !(flags & BATADV_MCAST_WANT_NO_RTR6) ? "R6" : ". ",
+ !(flags & BATADV_MCAST_HAVE_MC_PTYPE_CAPA) ? 'P' : '.');
+}
+
+/**
+ * batadv_mcast_mla_flags_update() - update multicast flags
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @flags: flags indicating the new multicast state
+ *
+ * Updates the own multicast tvlv with our current multicast related settings,
+ * capabilities and inabilities.
+ */
+static void
+batadv_mcast_mla_flags_update(struct batadv_priv *bat_priv,
+ struct batadv_mcast_mla_flags *flags)
+{
+ struct batadv_tvlv_mcast_data mcast_data;
+
+ if (!memcmp(flags, &bat_priv->mcast.mla_flags, sizeof(*flags)))
+ return;
+
+ batadv_mcast_bridge_log(bat_priv, flags);
+ batadv_mcast_flags_log(bat_priv, flags->tvlv_flags);
+
+ mcast_data.flags = flags->tvlv_flags;
+ memset(mcast_data.reserved, 0, sizeof(mcast_data.reserved));
+
+ batadv_tvlv_container_register(bat_priv, BATADV_TVLV_MCAST, 2,
+ &mcast_data, sizeof(mcast_data));
+
+ bat_priv->mcast.mla_flags = *flags;
+}
+
+/**
+ * __batadv_mcast_mla_update() - update the own MLAs
+ * @bat_priv: the bat priv with all the mesh interface information
+ *
+ * Updates the own multicast listener announcements in the translation
+ * table as well as the own, announced multicast tvlv container.
+ *
+ * Note that non-conflicting reads and writes to bat_priv->mcast.mla_list
+ * in batadv_mcast_mla_tt_retract() and batadv_mcast_mla_tt_add() are
+ * ensured by the non-parallel execution of the worker this function
+ * belongs to.
+ */
+static void __batadv_mcast_mla_update(struct batadv_priv *bat_priv)
+{
+ struct net_device *mesh_iface = bat_priv->mesh_iface;
+ struct hlist_head mcast_list = HLIST_HEAD_INIT;
+ struct batadv_mcast_mla_flags flags;
+ int ret;
+
+ flags = batadv_mcast_mla_flags_get(bat_priv);
+
+ ret = batadv_mcast_mla_meshif_get(mesh_iface, &mcast_list, &flags);
+ if (ret < 0)
+ goto out;
+
+ ret = batadv_mcast_mla_bridge_get(mesh_iface, &mcast_list, &flags);
+ if (ret < 0)
+ goto out;
+
+ spin_lock(&bat_priv->mcast.mla_lock);
+ batadv_mcast_mla_tt_retract(bat_priv, &mcast_list);
+ batadv_mcast_mla_tt_add(bat_priv, &mcast_list);
+ batadv_mcast_mla_flags_update(bat_priv, &flags);
+ spin_unlock(&bat_priv->mcast.mla_lock);
+
+out:
+ batadv_mcast_mla_list_free(&mcast_list);
+}
+
+/**
+ * batadv_mcast_mla_update() - update the own MLAs
+ * @work: kernel work struct
+ *
+ * Updates the own multicast listener announcements in the translation
+ * table as well as the own, announced multicast tvlv container.
+ *
+ * In the end, reschedules the work timer.
+ */
+static void batadv_mcast_mla_update(struct work_struct *work)
+{
+ struct delayed_work *delayed_work;
+ struct batadv_priv_mcast *priv_mcast;
+ struct batadv_priv *bat_priv;
+
+ delayed_work = to_delayed_work(work);
+ priv_mcast = container_of(delayed_work, struct batadv_priv_mcast, work);
+ bat_priv = container_of(priv_mcast, struct batadv_priv, mcast);
+
+ __batadv_mcast_mla_update(bat_priv);
+ batadv_mcast_start_timer(bat_priv);
+}
+
+/**
+ * batadv_mcast_is_report_ipv4() - check for IGMP reports
+ * @skb: the ethernet frame destined for the mesh
+ *
+ * This call might reallocate skb data.
+ *
+ * Checks whether the given frame is a valid IGMP report.
+ *
+ * Return: If so then true, otherwise false.
+ */
+static bool batadv_mcast_is_report_ipv4(struct sk_buff *skb)
+{
+ if (ip_mc_check_igmp(skb) < 0)
+ return false;
+
+ switch (igmp_hdr(skb)->type) {
+ case IGMP_HOST_MEMBERSHIP_REPORT:
+ case IGMPV2_HOST_MEMBERSHIP_REPORT:
+ case IGMPV3_HOST_MEMBERSHIP_REPORT:
+ return true;
+ }
+
+ return false;
+}
+
+/**
+ * batadv_mcast_forw_mode_check_ipv4() - check for optimized forwarding
+ * potential
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: the IPv4 packet to check
+ * @is_unsnoopable: stores whether the destination is snoopable
+ * @is_routable: stores whether the destination is routable
+ *
+ * Checks whether the given IPv4 packet has the potential to be forwarded with a
+ * mode more optimal than classic flooding.
+ *
+ * Return: If so then 0. Otherwise -EINVAL or -ENOMEM in case of memory
+ * allocation failure.
+ */
+static int batadv_mcast_forw_mode_check_ipv4(struct batadv_priv *bat_priv,
+ struct sk_buff *skb,
+ bool *is_unsnoopable,
+ int *is_routable)
+{
+ struct iphdr *iphdr;
+
+ /* We might fail due to out-of-memory -> drop it */
+ if (!pskb_may_pull(skb, sizeof(struct ethhdr) + sizeof(*iphdr)))
+ return -ENOMEM;
+
+ if (batadv_mcast_is_report_ipv4(skb))
+ return -EINVAL;
+
+ iphdr = ip_hdr(skb);
+
+ /* link-local multicast listeners behind a bridge are
+ * not snoopable (see RFC4541, section 2.1.2.2)
+ */
+ if (ipv4_is_local_multicast(iphdr->daddr))
+ *is_unsnoopable = true;
+ else
+ *is_routable = ETH_P_IP;
+
+ return 0;
+}
+
+/**
+ * batadv_mcast_is_report_ipv6() - check for MLD reports
+ * @skb: the ethernet frame destined for the mesh
+ *
+ * This call might reallocate skb data.
+ *
+ * Checks whether the given frame is a valid MLD report.
+ *
+ * Return: If so then true, otherwise false.
+ */
+static bool batadv_mcast_is_report_ipv6(struct sk_buff *skb)
+{
+ if (ipv6_mc_check_mld(skb) < 0)
+ return false;
+
+ switch (icmp6_hdr(skb)->icmp6_type) {
+ case ICMPV6_MGM_REPORT:
+ case ICMPV6_MLD2_REPORT:
+ return true;
+ }
+
+ return false;
+}
+
+/**
+ * batadv_mcast_forw_mode_check_ipv6() - check for optimized forwarding
+ * potential
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: the IPv6 packet to check
+ * @is_unsnoopable: stores whether the destination is snoopable
+ * @is_routable: stores whether the destination is routable
+ *
+ * Checks whether the given IPv6 packet has the potential to be forwarded with a
+ * mode more optimal than classic flooding.
+ *
+ * Return: If so then 0. Otherwise -EINVAL is or -ENOMEM if we are out of memory
+ */
+static int batadv_mcast_forw_mode_check_ipv6(struct batadv_priv *bat_priv,
+ struct sk_buff *skb,
+ bool *is_unsnoopable,
+ int *is_routable)
+{
+ struct ipv6hdr *ip6hdr;
+
+ /* We might fail due to out-of-memory -> drop it */
+ if (!pskb_may_pull(skb, sizeof(struct ethhdr) + sizeof(*ip6hdr)))
+ return -ENOMEM;
+
+ if (batadv_mcast_is_report_ipv6(skb))
+ return -EINVAL;
+
+ ip6hdr = ipv6_hdr(skb);
+
+ if (IPV6_ADDR_MC_SCOPE(&ip6hdr->daddr) < IPV6_ADDR_SCOPE_LINKLOCAL)
+ return -EINVAL;
+
+ /* link-local-all-nodes multicast listeners behind a bridge are
+ * not snoopable (see RFC4541, section 3, paragraph 3)
+ */
+ if (ipv6_addr_is_ll_all_nodes(&ip6hdr->daddr))
+ *is_unsnoopable = true;
+ else if (IPV6_ADDR_MC_SCOPE(&ip6hdr->daddr) > IPV6_ADDR_SCOPE_LINKLOCAL)
+ *is_routable = ETH_P_IPV6;
+
+ return 0;
+}
+
+/**
+ * batadv_mcast_forw_mode_check() - check for optimized forwarding potential
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: the multicast frame to check
+ * @is_unsnoopable: stores whether the destination is snoopable
+ * @is_routable: stores whether the destination is routable
+ *
+ * Checks whether the given multicast ethernet frame has the potential to be
+ * forwarded with a mode more optimal than classic flooding.
+ *
+ * Return: If so then 0. Otherwise -EINVAL is or -ENOMEM if we are out of memory
+ */
+static int batadv_mcast_forw_mode_check(struct batadv_priv *bat_priv,
+ struct sk_buff *skb,
+ bool *is_unsnoopable,
+ int *is_routable)
+{
+ struct ethhdr *ethhdr = eth_hdr(skb);
+
+ if (!atomic_read(&bat_priv->multicast_mode))
+ return -EINVAL;
+
+ switch (ntohs(ethhdr->h_proto)) {
+ case ETH_P_IP:
+ return batadv_mcast_forw_mode_check_ipv4(bat_priv, skb,
+ is_unsnoopable,
+ is_routable);
+ case ETH_P_IPV6:
+ if (!IS_ENABLED(CONFIG_IPV6))
+ return -EINVAL;
+
+ return batadv_mcast_forw_mode_check_ipv6(bat_priv, skb,
+ is_unsnoopable,
+ is_routable);
+ default:
+ return -EINVAL;
+ }
+}
+
+/**
+ * batadv_mcast_forw_want_all_ip_count() - count nodes with unspecific mcast
+ * interest
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @ethhdr: ethernet header of a packet
+ *
+ * Return: the number of nodes which want all IPv4 multicast traffic if the
+ * given ethhdr is from an IPv4 packet or the number of nodes which want all
+ * IPv6 traffic if it matches an IPv6 packet.
+ */
+static int batadv_mcast_forw_want_all_ip_count(struct batadv_priv *bat_priv,
+ struct ethhdr *ethhdr)
+{
+ switch (ntohs(ethhdr->h_proto)) {
+ case ETH_P_IP:
+ return atomic_read(&bat_priv->mcast.num_want_all_ipv4);
+ case ETH_P_IPV6:
+ return atomic_read(&bat_priv->mcast.num_want_all_ipv6);
+ default:
+ /* we shouldn't be here... */
+ return 0;
+ }
+}
+
+/**
+ * batadv_mcast_forw_rtr_count() - count nodes with a multicast router
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @protocol: the ethernet protocol type to count multicast routers for
+ *
+ * Return: the number of nodes which want all routable IPv4 multicast traffic
+ * if the protocol is ETH_P_IP or the number of nodes which want all routable
+ * IPv6 traffic if the protocol is ETH_P_IPV6. Otherwise returns 0.
+ */
+
+static int batadv_mcast_forw_rtr_count(struct batadv_priv *bat_priv,
+ int protocol)
+{
+ switch (protocol) {
+ case ETH_P_IP:
+ return atomic_read(&bat_priv->mcast.num_want_all_rtr4);
+ case ETH_P_IPV6:
+ return atomic_read(&bat_priv->mcast.num_want_all_rtr6);
+ default:
+ return 0;
+ }
+}
+
+/**
+ * batadv_mcast_forw_mode_by_count() - get forwarding mode by count
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: the multicast packet to check
+ * @vid: the vlan identifier
+ * @is_routable: stores whether the destination is routable
+ * @count: the number of originators the multicast packet need to be sent to
+ *
+ * For a multicast packet with multiple destination originators, checks which
+ * mode to use. For BATADV_FORW_MCAST it also encapsulates the packet with a
+ * complete batman-adv multicast header.
+ *
+ * Return:
+ * BATADV_FORW_MCAST: If all nodes have multicast packet routing
+ * capabilities and an MTU >= 1280 on all hard interfaces (including us)
+ * and the encapsulated multicast packet with all destination addresses
+ * would still fit into an 1280 bytes batman-adv multicast packet
+ * (excluding the outer ethernet frame) and we could successfully push
+ * the full batman-adv multicast packet header.
+ * BATADV_FORW_UCASTS: If the packet cannot be sent in a batman-adv
+ * multicast packet and the amount of batman-adv unicast packets needed
+ * is smaller or equal to the configured multicast fanout.
+ * BATADV_FORW_BCAST: Otherwise.
+ */
+static enum batadv_forw_mode
+batadv_mcast_forw_mode_by_count(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, unsigned short vid,
+ int is_routable, int count)
+{
+ unsigned int mcast_hdrlen = batadv_mcast_forw_packet_hdrlen(count);
+ u8 own_tvlv_flags = bat_priv->mcast.mla_flags.tvlv_flags;
+
+ if (!atomic_read(&bat_priv->mcast.num_no_mc_ptype_capa) &&
+ own_tvlv_flags & BATADV_MCAST_HAVE_MC_PTYPE_CAPA &&
+ skb->len + mcast_hdrlen <= IPV6_MIN_MTU &&
+ batadv_mcast_forw_push(bat_priv, skb, vid, is_routable, count))
+ return BATADV_FORW_MCAST;
+
+ if (count <= atomic_read(&bat_priv->multicast_fanout))
+ return BATADV_FORW_UCASTS;
+
+ return BATADV_FORW_BCAST;
+}
+
+/**
+ * batadv_mcast_forw_mode() - check on how to forward a multicast packet
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: the multicast packet to check
+ * @vid: the vlan identifier
+ * @is_routable: stores whether the destination is routable
+ *
+ * Return: The forwarding mode as enum batadv_forw_mode.
+ */
+enum batadv_forw_mode
+batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb,
+ unsigned short vid, int *is_routable)
+{
+ int ret, tt_count, ip_count, unsnoop_count, total_count;
+ bool is_unsnoopable = false;
+ struct ethhdr *ethhdr;
+ int rtr_count = 0;
+
+ ret = batadv_mcast_forw_mode_check(bat_priv, skb, &is_unsnoopable,
+ is_routable);
+ if (ret == -ENOMEM)
+ return BATADV_FORW_NONE;
+ else if (ret < 0)
+ return BATADV_FORW_BCAST;
+
+ ethhdr = eth_hdr(skb);
+
+ tt_count = batadv_tt_global_hash_count(bat_priv, ethhdr->h_dest,
+ BATADV_NO_FLAGS);
+ ip_count = batadv_mcast_forw_want_all_ip_count(bat_priv, ethhdr);
+ unsnoop_count = !is_unsnoopable ? 0 :
+ atomic_read(&bat_priv->mcast.num_want_all_unsnoopables);
+ rtr_count = batadv_mcast_forw_rtr_count(bat_priv, *is_routable);
+
+ total_count = tt_count + ip_count + unsnoop_count + rtr_count;
+
+ if (!total_count)
+ return BATADV_FORW_NONE;
+ else if (unsnoop_count)
+ return BATADV_FORW_BCAST;
+
+ return batadv_mcast_forw_mode_by_count(bat_priv, skb, vid, *is_routable,
+ total_count);
+}
+
+/**
+ * batadv_mcast_forw_send_orig() - send a multicast packet to an originator
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: the multicast packet to send
+ * @vid: the vlan identifier
+ * @orig_node: the originator to send the packet to
+ *
+ * Return: NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise.
+ */
+static int batadv_mcast_forw_send_orig(struct batadv_priv *bat_priv,
+ struct sk_buff *skb,
+ unsigned short vid,
+ struct batadv_orig_node *orig_node)
+{
+ /* Avoid sending multicast-in-unicast packets to other BLA
+ * gateways - they already got the frame from the LAN side
+ * we share with them.
+ * TODO: Refactor to take BLA into account earlier, to avoid
+ * reducing the mcast_fanout count.
+ */
+ if (batadv_bla_is_backbone_gw_orig(bat_priv, orig_node->orig, vid)) {
+ dev_kfree_skb(skb);
+ return NET_XMIT_SUCCESS;
+ }
+
+ return batadv_send_skb_unicast(bat_priv, skb, BATADV_UNICAST, 0,
+ orig_node, vid);
+}
+
+/**
+ * batadv_mcast_forw_tt() - forwards a packet to multicast listeners
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: the multicast packet to transmit
+ * @vid: the vlan identifier
+ *
+ * Sends copies of a frame with multicast destination to any multicast
+ * listener registered in the translation table. A transmission is performed
+ * via a batman-adv unicast packet for each such destination node.
+ *
+ * Return: NET_XMIT_DROP on memory allocation failure, NET_XMIT_SUCCESS
+ * otherwise.
+ */
+static int
+batadv_mcast_forw_tt(struct batadv_priv *bat_priv, struct sk_buff *skb,
+ unsigned short vid)
+{
+ int ret = NET_XMIT_SUCCESS;
+ struct sk_buff *newskb;
+
+ struct batadv_tt_orig_list_entry *orig_entry;
+
+ struct batadv_tt_global_entry *tt_global;
+ const u8 *addr = eth_hdr(skb)->h_dest;
+
+ tt_global = batadv_tt_global_hash_find(bat_priv, addr, vid);
+ if (!tt_global)
+ goto out;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(orig_entry, &tt_global->orig_list, list) {
+ newskb = skb_copy(skb, GFP_ATOMIC);
+ if (!newskb) {
+ ret = NET_XMIT_DROP;
+ break;
+ }
+
+ batadv_mcast_forw_send_orig(bat_priv, newskb, vid,
+ orig_entry->orig_node);
+ }
+ rcu_read_unlock();
+
+ batadv_tt_global_entry_put(tt_global);
+
+out:
+ return ret;
+}
+
+/**
+ * batadv_mcast_forw_want_all_ipv4() - forward to nodes with want-all-ipv4
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: the multicast packet to transmit
+ * @vid: the vlan identifier
+ *
+ * Sends copies of a frame with multicast destination to any node with a
+ * BATADV_MCAST_WANT_ALL_IPV4 flag set. A transmission is performed via a
+ * batman-adv unicast packet for each such destination node.
+ *
+ * Return: NET_XMIT_DROP on memory allocation failure, NET_XMIT_SUCCESS
+ * otherwise.
+ */
+static int
+batadv_mcast_forw_want_all_ipv4(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, unsigned short vid)
+{
+ struct batadv_orig_node *orig_node;
+ int ret = NET_XMIT_SUCCESS;
+ struct sk_buff *newskb;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(orig_node,
+ &bat_priv->mcast.want_all_ipv4_list,
+ mcast_want_all_ipv4_node) {
+ newskb = skb_copy(skb, GFP_ATOMIC);
+ if (!newskb) {
+ ret = NET_XMIT_DROP;
+ break;
+ }
+
+ batadv_mcast_forw_send_orig(bat_priv, newskb, vid, orig_node);
+ }
+ rcu_read_unlock();
+ return ret;
+}
+
+/**
+ * batadv_mcast_forw_want_all_ipv6() - forward to nodes with want-all-ipv6
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: The multicast packet to transmit
+ * @vid: the vlan identifier
+ *
+ * Sends copies of a frame with multicast destination to any node with a
+ * BATADV_MCAST_WANT_ALL_IPV6 flag set. A transmission is performed via a
+ * batman-adv unicast packet for each such destination node.
+ *
+ * Return: NET_XMIT_DROP on memory allocation failure, NET_XMIT_SUCCESS
+ * otherwise.
+ */
+static int
+batadv_mcast_forw_want_all_ipv6(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, unsigned short vid)
+{
+ struct batadv_orig_node *orig_node;
+ int ret = NET_XMIT_SUCCESS;
+ struct sk_buff *newskb;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(orig_node,
+ &bat_priv->mcast.want_all_ipv6_list,
+ mcast_want_all_ipv6_node) {
+ newskb = skb_copy(skb, GFP_ATOMIC);
+ if (!newskb) {
+ ret = NET_XMIT_DROP;
+ break;
+ }
+
+ batadv_mcast_forw_send_orig(bat_priv, newskb, vid, orig_node);
+ }
+ rcu_read_unlock();
+ return ret;
+}
+
+/**
+ * batadv_mcast_forw_want_all() - forward packet to nodes in a want-all list
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: the multicast packet to transmit
+ * @vid: the vlan identifier
+ *
+ * Sends copies of a frame with multicast destination to any node with a
+ * BATADV_MCAST_WANT_ALL_IPV4 or BATADV_MCAST_WANT_ALL_IPV6 flag set. A
+ * transmission is performed via a batman-adv unicast packet for each such
+ * destination node.
+ *
+ * Return: NET_XMIT_DROP on memory allocation failure or if the protocol family
+ * is neither IPv4 nor IPv6. NET_XMIT_SUCCESS otherwise.
+ */
+static int
+batadv_mcast_forw_want_all(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, unsigned short vid)
+{
+ switch (ntohs(eth_hdr(skb)->h_proto)) {
+ case ETH_P_IP:
+ return batadv_mcast_forw_want_all_ipv4(bat_priv, skb, vid);
+ case ETH_P_IPV6:
+ return batadv_mcast_forw_want_all_ipv6(bat_priv, skb, vid);
+ default:
+ /* we shouldn't be here... */
+ return NET_XMIT_DROP;
+ }
+}
+
+/**
+ * batadv_mcast_forw_want_all_rtr4() - forward to nodes with want-all-rtr4
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: the multicast packet to transmit
+ * @vid: the vlan identifier
+ *
+ * Sends copies of a frame with multicast destination to any node with a
+ * BATADV_MCAST_WANT_NO_RTR4 flag unset. A transmission is performed via a
+ * batman-adv unicast packet for each such destination node.
+ *
+ * Return: NET_XMIT_DROP on memory allocation failure, NET_XMIT_SUCCESS
+ * otherwise.
+ */
+static int
+batadv_mcast_forw_want_all_rtr4(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, unsigned short vid)
+{
+ struct batadv_orig_node *orig_node;
+ int ret = NET_XMIT_SUCCESS;
+ struct sk_buff *newskb;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(orig_node,
+ &bat_priv->mcast.want_all_rtr4_list,
+ mcast_want_all_rtr4_node) {
+ newskb = skb_copy(skb, GFP_ATOMIC);
+ if (!newskb) {
+ ret = NET_XMIT_DROP;
+ break;
+ }
+
+ batadv_mcast_forw_send_orig(bat_priv, newskb, vid, orig_node);
+ }
+ rcu_read_unlock();
+ return ret;
+}
+
+/**
+ * batadv_mcast_forw_want_all_rtr6() - forward to nodes with want-all-rtr6
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: The multicast packet to transmit
+ * @vid: the vlan identifier
+ *
+ * Sends copies of a frame with multicast destination to any node with a
+ * BATADV_MCAST_WANT_NO_RTR6 flag unset. A transmission is performed via a
+ * batman-adv unicast packet for each such destination node.
+ *
+ * Return: NET_XMIT_DROP on memory allocation failure, NET_XMIT_SUCCESS
+ * otherwise.
+ */
+static int
+batadv_mcast_forw_want_all_rtr6(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, unsigned short vid)
+{
+ struct batadv_orig_node *orig_node;
+ int ret = NET_XMIT_SUCCESS;
+ struct sk_buff *newskb;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(orig_node,
+ &bat_priv->mcast.want_all_rtr6_list,
+ mcast_want_all_rtr6_node) {
+ newskb = skb_copy(skb, GFP_ATOMIC);
+ if (!newskb) {
+ ret = NET_XMIT_DROP;
+ break;
+ }
+
+ batadv_mcast_forw_send_orig(bat_priv, newskb, vid, orig_node);
+ }
+ rcu_read_unlock();
+ return ret;
+}
+
+/**
+ * batadv_mcast_forw_want_rtr() - forward packet to nodes in a want-all-rtr list
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: the multicast packet to transmit
+ * @vid: the vlan identifier
+ *
+ * Sends copies of a frame with multicast destination to any node with a
+ * BATADV_MCAST_WANT_NO_RTR4 or BATADV_MCAST_WANT_NO_RTR6 flag unset. A
+ * transmission is performed via a batman-adv unicast packet for each such
+ * destination node.
+ *
+ * Return: NET_XMIT_DROP on memory allocation failure or if the protocol family
+ * is neither IPv4 nor IPv6. NET_XMIT_SUCCESS otherwise.
+ */
+static int
+batadv_mcast_forw_want_rtr(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, unsigned short vid)
+{
+ switch (ntohs(eth_hdr(skb)->h_proto)) {
+ case ETH_P_IP:
+ return batadv_mcast_forw_want_all_rtr4(bat_priv, skb, vid);
+ case ETH_P_IPV6:
+ return batadv_mcast_forw_want_all_rtr6(bat_priv, skb, vid);
+ default:
+ /* we shouldn't be here... */
+ return NET_XMIT_DROP;
+ }
+}
+
+/**
+ * batadv_mcast_forw_send() - send packet to any detected multicast recipient
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: the multicast packet to transmit
+ * @vid: the vlan identifier
+ * @is_routable: stores whether the destination is routable
+ *
+ * Sends copies of a frame with multicast destination to any node that signaled
+ * interest in it, that is either via the translation table or the according
+ * want-all flags. A transmission is performed via a batman-adv unicast packet
+ * for each such destination node.
+ *
+ * The given skb is consumed/freed.
+ *
+ * Return: NET_XMIT_DROP on memory allocation failure or if the protocol family
+ * is neither IPv4 nor IPv6. NET_XMIT_SUCCESS otherwise.
+ */
+int batadv_mcast_forw_send(struct batadv_priv *bat_priv, struct sk_buff *skb,
+ unsigned short vid, int is_routable)
+{
+ int ret;
+
+ ret = batadv_mcast_forw_tt(bat_priv, skb, vid);
+ if (ret != NET_XMIT_SUCCESS) {
+ kfree_skb(skb);
+ return ret;
+ }
+
+ ret = batadv_mcast_forw_want_all(bat_priv, skb, vid);
+ if (ret != NET_XMIT_SUCCESS) {
+ kfree_skb(skb);
+ return ret;
+ }
+
+ if (!is_routable)
+ goto skip_mc_router;
+
+ ret = batadv_mcast_forw_want_rtr(bat_priv, skb, vid);
+ if (ret != NET_XMIT_SUCCESS) {
+ kfree_skb(skb);
+ return ret;
+ }
+
+skip_mc_router:
+ consume_skb(skb);
+ return ret;
+}
+
+/**
+ * batadv_mcast_want_unsnoop_update() - update unsnoop counter and list
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @orig: the orig_node which multicast state might have changed of
+ * @mcast_flags: flags indicating the new multicast state
+ *
+ * If the BATADV_MCAST_WANT_ALL_UNSNOOPABLES flag of this originator,
+ * orig, has toggled then this method updates the counter and the list
+ * accordingly.
+ *
+ * Caller needs to hold orig->mcast_handler_lock.
+ */
+static void batadv_mcast_want_unsnoop_update(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig,
+ u8 mcast_flags)
+{
+ struct hlist_node *node = &orig->mcast_want_all_unsnoopables_node;
+ struct hlist_head *head = &bat_priv->mcast.want_all_unsnoopables_list;
+
+ lockdep_assert_held(&orig->mcast_handler_lock);
+
+ /* switched from flag unset to set */
+ if (mcast_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES &&
+ !(orig->mcast_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES)) {
+ atomic_inc(&bat_priv->mcast.num_want_all_unsnoopables);
+
+ spin_lock_bh(&bat_priv->mcast.want_lists_lock);
+ /* flag checks above + mcast_handler_lock prevents this */
+ WARN_ON(!hlist_unhashed(node));
+
+ hlist_add_head_rcu(node, head);
+ spin_unlock_bh(&bat_priv->mcast.want_lists_lock);
+ /* switched from flag set to unset */
+ } else if (!(mcast_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) &&
+ orig->mcast_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) {
+ atomic_dec(&bat_priv->mcast.num_want_all_unsnoopables);
+
+ spin_lock_bh(&bat_priv->mcast.want_lists_lock);
+ /* flag checks above + mcast_handler_lock prevents this */
+ WARN_ON(hlist_unhashed(node));
+
+ hlist_del_init_rcu(node);
+ spin_unlock_bh(&bat_priv->mcast.want_lists_lock);
+ }
+}
+
+/**
+ * batadv_mcast_want_ipv4_update() - update want-all-ipv4 counter and list
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @orig: the orig_node which multicast state might have changed of
+ * @mcast_flags: flags indicating the new multicast state
+ *
+ * If the BATADV_MCAST_WANT_ALL_IPV4 flag of this originator, orig, has
+ * toggled then this method updates the counter and the list accordingly.
+ *
+ * Caller needs to hold orig->mcast_handler_lock.
+ */
+static void batadv_mcast_want_ipv4_update(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig,
+ u8 mcast_flags)
+{
+ struct hlist_node *node = &orig->mcast_want_all_ipv4_node;
+ struct hlist_head *head = &bat_priv->mcast.want_all_ipv4_list;
+
+ lockdep_assert_held(&orig->mcast_handler_lock);
+
+ /* switched from flag unset to set */
+ if (mcast_flags & BATADV_MCAST_WANT_ALL_IPV4 &&
+ !(orig->mcast_flags & BATADV_MCAST_WANT_ALL_IPV4)) {
+ atomic_inc(&bat_priv->mcast.num_want_all_ipv4);
+
+ spin_lock_bh(&bat_priv->mcast.want_lists_lock);
+ /* flag checks above + mcast_handler_lock prevents this */
+ WARN_ON(!hlist_unhashed(node));
+
+ hlist_add_head_rcu(node, head);
+ spin_unlock_bh(&bat_priv->mcast.want_lists_lock);
+ /* switched from flag set to unset */
+ } else if (!(mcast_flags & BATADV_MCAST_WANT_ALL_IPV4) &&
+ orig->mcast_flags & BATADV_MCAST_WANT_ALL_IPV4) {
+ atomic_dec(&bat_priv->mcast.num_want_all_ipv4);
+
+ spin_lock_bh(&bat_priv->mcast.want_lists_lock);
+ /* flag checks above + mcast_handler_lock prevents this */
+ WARN_ON(hlist_unhashed(node));
+
+ hlist_del_init_rcu(node);
+ spin_unlock_bh(&bat_priv->mcast.want_lists_lock);
+ }
+}
+
+/**
+ * batadv_mcast_want_ipv6_update() - update want-all-ipv6 counter and list
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @orig: the orig_node which multicast state might have changed of
+ * @mcast_flags: flags indicating the new multicast state
+ *
+ * If the BATADV_MCAST_WANT_ALL_IPV6 flag of this originator, orig, has
+ * toggled then this method updates the counter and the list accordingly.
+ *
+ * Caller needs to hold orig->mcast_handler_lock.
+ */
+static void batadv_mcast_want_ipv6_update(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig,
+ u8 mcast_flags)
+{
+ struct hlist_node *node = &orig->mcast_want_all_ipv6_node;
+ struct hlist_head *head = &bat_priv->mcast.want_all_ipv6_list;
+
+ lockdep_assert_held(&orig->mcast_handler_lock);
+
+ /* switched from flag unset to set */
+ if (mcast_flags & BATADV_MCAST_WANT_ALL_IPV6 &&
+ !(orig->mcast_flags & BATADV_MCAST_WANT_ALL_IPV6)) {
+ atomic_inc(&bat_priv->mcast.num_want_all_ipv6);
+
+ spin_lock_bh(&bat_priv->mcast.want_lists_lock);
+ /* flag checks above + mcast_handler_lock prevents this */
+ WARN_ON(!hlist_unhashed(node));
+
+ hlist_add_head_rcu(node, head);
+ spin_unlock_bh(&bat_priv->mcast.want_lists_lock);
+ /* switched from flag set to unset */
+ } else if (!(mcast_flags & BATADV_MCAST_WANT_ALL_IPV6) &&
+ orig->mcast_flags & BATADV_MCAST_WANT_ALL_IPV6) {
+ atomic_dec(&bat_priv->mcast.num_want_all_ipv6);
+
+ spin_lock_bh(&bat_priv->mcast.want_lists_lock);
+ /* flag checks above + mcast_handler_lock prevents this */
+ WARN_ON(hlist_unhashed(node));
+
+ hlist_del_init_rcu(node);
+ spin_unlock_bh(&bat_priv->mcast.want_lists_lock);
+ }
+}
+
+/**
+ * batadv_mcast_want_rtr4_update() - update want-all-rtr4 counter and list
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @orig: the orig_node which multicast state might have changed of
+ * @mcast_flags: flags indicating the new multicast state
+ *
+ * If the BATADV_MCAST_WANT_NO_RTR4 flag of this originator, orig, has
+ * toggled then this method updates the counter and the list accordingly.
+ *
+ * Caller needs to hold orig->mcast_handler_lock.
+ */
+static void batadv_mcast_want_rtr4_update(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig,
+ u8 mcast_flags)
+{
+ struct hlist_node *node = &orig->mcast_want_all_rtr4_node;
+ struct hlist_head *head = &bat_priv->mcast.want_all_rtr4_list;
+
+ lockdep_assert_held(&orig->mcast_handler_lock);
+
+ /* switched from flag set to unset */
+ if (!(mcast_flags & BATADV_MCAST_WANT_NO_RTR4) &&
+ orig->mcast_flags & BATADV_MCAST_WANT_NO_RTR4) {
+ atomic_inc(&bat_priv->mcast.num_want_all_rtr4);
+
+ spin_lock_bh(&bat_priv->mcast.want_lists_lock);
+ /* flag checks above + mcast_handler_lock prevents this */
+ WARN_ON(!hlist_unhashed(node));
+
+ hlist_add_head_rcu(node, head);
+ spin_unlock_bh(&bat_priv->mcast.want_lists_lock);
+ /* switched from flag unset to set */
+ } else if (mcast_flags & BATADV_MCAST_WANT_NO_RTR4 &&
+ !(orig->mcast_flags & BATADV_MCAST_WANT_NO_RTR4)) {
+ atomic_dec(&bat_priv->mcast.num_want_all_rtr4);
+
+ spin_lock_bh(&bat_priv->mcast.want_lists_lock);
+ /* flag checks above + mcast_handler_lock prevents this */
+ WARN_ON(hlist_unhashed(node));
+
+ hlist_del_init_rcu(node);
+ spin_unlock_bh(&bat_priv->mcast.want_lists_lock);
+ }
+}
+
+/**
+ * batadv_mcast_want_rtr6_update() - update want-all-rtr6 counter and list
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @orig: the orig_node which multicast state might have changed of
+ * @mcast_flags: flags indicating the new multicast state
+ *
+ * If the BATADV_MCAST_WANT_NO_RTR6 flag of this originator, orig, has
+ * toggled then this method updates the counter and the list accordingly.
+ *
+ * Caller needs to hold orig->mcast_handler_lock.
+ */
+static void batadv_mcast_want_rtr6_update(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig,
+ u8 mcast_flags)
+{
+ struct hlist_node *node = &orig->mcast_want_all_rtr6_node;
+ struct hlist_head *head = &bat_priv->mcast.want_all_rtr6_list;
+
+ lockdep_assert_held(&orig->mcast_handler_lock);
+
+ /* switched from flag set to unset */
+ if (!(mcast_flags & BATADV_MCAST_WANT_NO_RTR6) &&
+ orig->mcast_flags & BATADV_MCAST_WANT_NO_RTR6) {
+ atomic_inc(&bat_priv->mcast.num_want_all_rtr6);
+
+ spin_lock_bh(&bat_priv->mcast.want_lists_lock);
+ /* flag checks above + mcast_handler_lock prevents this */
+ WARN_ON(!hlist_unhashed(node));
+
+ hlist_add_head_rcu(node, head);
+ spin_unlock_bh(&bat_priv->mcast.want_lists_lock);
+ /* switched from flag unset to set */
+ } else if (mcast_flags & BATADV_MCAST_WANT_NO_RTR6 &&
+ !(orig->mcast_flags & BATADV_MCAST_WANT_NO_RTR6)) {
+ atomic_dec(&bat_priv->mcast.num_want_all_rtr6);
+
+ spin_lock_bh(&bat_priv->mcast.want_lists_lock);
+ /* flag checks above + mcast_handler_lock prevents this */
+ WARN_ON(hlist_unhashed(node));
+
+ hlist_del_init_rcu(node);
+ spin_unlock_bh(&bat_priv->mcast.want_lists_lock);
+ }
+}
+
+/**
+ * batadv_mcast_have_mc_ptype_update() - update multicast packet type counter
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @orig: the orig_node which multicast state might have changed of
+ * @mcast_flags: flags indicating the new multicast state
+ *
+ * If the BATADV_MCAST_HAVE_MC_PTYPE_CAPA flag of this originator, orig, has
+ * toggled then this method updates the counter accordingly.
+ */
+static void batadv_mcast_have_mc_ptype_update(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig,
+ u8 mcast_flags)
+{
+ lockdep_assert_held(&orig->mcast_handler_lock);
+
+ /* switched from flag set to unset */
+ if (!(mcast_flags & BATADV_MCAST_HAVE_MC_PTYPE_CAPA) &&
+ orig->mcast_flags & BATADV_MCAST_HAVE_MC_PTYPE_CAPA)
+ atomic_inc(&bat_priv->mcast.num_no_mc_ptype_capa);
+ /* switched from flag unset to set */
+ else if (mcast_flags & BATADV_MCAST_HAVE_MC_PTYPE_CAPA &&
+ !(orig->mcast_flags & BATADV_MCAST_HAVE_MC_PTYPE_CAPA))
+ atomic_dec(&bat_priv->mcast.num_no_mc_ptype_capa);
+}
+
+/**
+ * batadv_mcast_tvlv_flags_get() - get multicast flags from an OGM TVLV
+ * @enabled: whether the originator has multicast TVLV support enabled
+ * @tvlv_value: tvlv buffer containing the multicast flags
+ * @tvlv_value_len: tvlv buffer length
+ *
+ * Return: multicast flags for the given tvlv buffer
+ */
+static u8
+batadv_mcast_tvlv_flags_get(bool enabled, void *tvlv_value, u16 tvlv_value_len)
+{
+ u8 mcast_flags = BATADV_NO_FLAGS;
+
+ if (enabled && tvlv_value && tvlv_value_len >= sizeof(mcast_flags))
+ mcast_flags = *(u8 *)tvlv_value;
+
+ if (!enabled) {
+ mcast_flags |= BATADV_MCAST_WANT_ALL_IPV4;
+ mcast_flags |= BATADV_MCAST_WANT_ALL_IPV6;
+ }
+
+ /* remove redundant flags to avoid sending duplicate packets later */
+ if (mcast_flags & BATADV_MCAST_WANT_ALL_IPV4)
+ mcast_flags |= BATADV_MCAST_WANT_NO_RTR4;
+
+ if (mcast_flags & BATADV_MCAST_WANT_ALL_IPV6)
+ mcast_flags |= BATADV_MCAST_WANT_NO_RTR6;
+
+ return mcast_flags;
+}
+
+/**
+ * batadv_mcast_tvlv_ogm_handler() - process incoming multicast tvlv container
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @orig: the orig_node of the ogm
+ * @flags: flags indicating the tvlv state (see batadv_tvlv_handler_flags)
+ * @tvlv_value: tvlv buffer containing the multicast data
+ * @tvlv_value_len: tvlv buffer length
+ */
+static void batadv_mcast_tvlv_ogm_handler(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig,
+ u8 flags,
+ void *tvlv_value,
+ u16 tvlv_value_len)
+{
+ bool orig_mcast_enabled = !(flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND);
+ u8 mcast_flags;
+
+ mcast_flags = batadv_mcast_tvlv_flags_get(orig_mcast_enabled,
+ tvlv_value, tvlv_value_len);
+
+ spin_lock_bh(&orig->mcast_handler_lock);
+
+ if (orig_mcast_enabled &&
+ !test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities)) {
+ set_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities);
+ } else if (!orig_mcast_enabled &&
+ test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities)) {
+ clear_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities);
+ }
+
+ set_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capa_initialized);
+
+ batadv_mcast_want_unsnoop_update(bat_priv, orig, mcast_flags);
+ batadv_mcast_want_ipv4_update(bat_priv, orig, mcast_flags);
+ batadv_mcast_want_ipv6_update(bat_priv, orig, mcast_flags);
+ batadv_mcast_want_rtr4_update(bat_priv, orig, mcast_flags);
+ batadv_mcast_want_rtr6_update(bat_priv, orig, mcast_flags);
+ batadv_mcast_have_mc_ptype_update(bat_priv, orig, mcast_flags);
+
+ orig->mcast_flags = mcast_flags;
+ spin_unlock_bh(&orig->mcast_handler_lock);
+}
+
+/**
+ * batadv_mcast_init() - initialize the multicast optimizations structures
+ * @bat_priv: the bat priv with all the mesh interface information
+ */
+void batadv_mcast_init(struct batadv_priv *bat_priv)
+{
+ batadv_tvlv_handler_register(bat_priv, batadv_mcast_tvlv_ogm_handler,
+ NULL, NULL, BATADV_TVLV_MCAST, 2,
+ BATADV_TVLV_HANDLER_OGM_CIFNOTFND);
+ batadv_tvlv_handler_register(bat_priv, NULL, NULL,
+ batadv_mcast_forw_tracker_tvlv_handler,
+ BATADV_TVLV_MCAST_TRACKER, 1,
+ BATADV_TVLV_HANDLER_OGM_CIFNOTFND);
+
+ INIT_DELAYED_WORK(&bat_priv->mcast.work, batadv_mcast_mla_update);
+ batadv_mcast_start_timer(bat_priv);
+}
+
+/**
+ * batadv_mcast_mesh_info_put() - put multicast info into a netlink message
+ * @msg: buffer for the message
+ * @bat_priv: the bat priv with all the mesh interface information
+ *
+ * Return: 0 or error code.
+ */
+int batadv_mcast_mesh_info_put(struct sk_buff *msg,
+ struct batadv_priv *bat_priv)
+{
+ u32 flags = bat_priv->mcast.mla_flags.tvlv_flags;
+ u32 flags_priv = BATADV_NO_FLAGS;
+
+ if (bat_priv->mcast.mla_flags.bridged) {
+ flags_priv |= BATADV_MCAST_FLAGS_BRIDGED;
+
+ if (bat_priv->mcast.mla_flags.querier_ipv4.exists)
+ flags_priv |= BATADV_MCAST_FLAGS_QUERIER_IPV4_EXISTS;
+ if (bat_priv->mcast.mla_flags.querier_ipv6.exists)
+ flags_priv |= BATADV_MCAST_FLAGS_QUERIER_IPV6_EXISTS;
+ if (bat_priv->mcast.mla_flags.querier_ipv4.shadowing)
+ flags_priv |= BATADV_MCAST_FLAGS_QUERIER_IPV4_SHADOWING;
+ if (bat_priv->mcast.mla_flags.querier_ipv6.shadowing)
+ flags_priv |= BATADV_MCAST_FLAGS_QUERIER_IPV6_SHADOWING;
+ }
+
+ if (nla_put_u32(msg, BATADV_ATTR_MCAST_FLAGS, flags) ||
+ nla_put_u32(msg, BATADV_ATTR_MCAST_FLAGS_PRIV, flags_priv))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+/**
+ * batadv_mcast_flags_dump_entry() - dump one entry of the multicast flags table
+ * to a netlink socket
+ * @msg: buffer for the message
+ * @portid: netlink port
+ * @cb: Control block containing additional options
+ * @orig_node: originator to dump the multicast flags of
+ *
+ * Return: 0 or error code.
+ */
+static int
+batadv_mcast_flags_dump_entry(struct sk_buff *msg, u32 portid,
+ struct netlink_callback *cb,
+ struct batadv_orig_node *orig_node)
+{
+ void *hdr;
+
+ hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq,
+ &batadv_netlink_family, NLM_F_MULTI,
+ BATADV_CMD_GET_MCAST_FLAGS);
+ if (!hdr)
+ return -ENOBUFS;
+
+ genl_dump_check_consistent(cb, hdr);
+
+ if (nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN,
+ orig_node->orig)) {
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+ }
+
+ if (test_bit(BATADV_ORIG_CAPA_HAS_MCAST,
+ &orig_node->capabilities)) {
+ if (nla_put_u32(msg, BATADV_ATTR_MCAST_FLAGS,
+ orig_node->mcast_flags)) {
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+ }
+ }
+
+ genlmsg_end(msg, hdr);
+ return 0;
+}
+
+/**
+ * batadv_mcast_flags_dump_bucket() - dump one bucket of the multicast flags
+ * table to a netlink socket
+ * @msg: buffer for the message
+ * @portid: netlink port
+ * @cb: Control block containing additional options
+ * @hash: hash to dump
+ * @bucket: bucket index to dump
+ * @idx_skip: How many entries to skip
+ *
+ * Return: 0 or error code.
+ */
+static int
+batadv_mcast_flags_dump_bucket(struct sk_buff *msg, u32 portid,
+ struct netlink_callback *cb,
+ struct batadv_hashtable *hash,
+ unsigned int bucket, long *idx_skip)
+{
+ struct batadv_orig_node *orig_node;
+ long idx = 0;
+
+ spin_lock_bh(&hash->list_locks[bucket]);
+ cb->seq = atomic_read(&hash->generation) << 1 | 1;
+
+ hlist_for_each_entry(orig_node, &hash->table[bucket], hash_entry) {
+ if (!test_bit(BATADV_ORIG_CAPA_HAS_MCAST,
+ &orig_node->capa_initialized))
+ continue;
+
+ if (idx < *idx_skip)
+ goto skip;
+
+ if (batadv_mcast_flags_dump_entry(msg, portid, cb, orig_node)) {
+ spin_unlock_bh(&hash->list_locks[bucket]);
+ *idx_skip = idx;
+
+ return -EMSGSIZE;
+ }
+
+skip:
+ idx++;
+ }
+ spin_unlock_bh(&hash->list_locks[bucket]);
+
+ return 0;
+}
+
+/**
+ * __batadv_mcast_flags_dump() - dump multicast flags table to a netlink socket
+ * @msg: buffer for the message
+ * @portid: netlink port
+ * @cb: Control block containing additional options
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @bucket: current bucket to dump
+ * @idx: index in current bucket to the next entry to dump
+ *
+ * Return: 0 or error code.
+ */
+static int
+__batadv_mcast_flags_dump(struct sk_buff *msg, u32 portid,
+ struct netlink_callback *cb,
+ struct batadv_priv *bat_priv, long *bucket, long *idx)
+{
+ struct batadv_hashtable *hash = bat_priv->orig_hash;
+ long bucket_tmp = *bucket;
+ long idx_tmp = *idx;
+
+ while (bucket_tmp < hash->size) {
+ if (batadv_mcast_flags_dump_bucket(msg, portid, cb, hash,
+ bucket_tmp, &idx_tmp))
+ break;
+
+ bucket_tmp++;
+ idx_tmp = 0;
+ }
+
+ *bucket = bucket_tmp;
+ *idx = idx_tmp;
+
+ return msg->len;
+}
+
+/**
+ * batadv_mcast_netlink_get_primary() - get primary interface from netlink
+ * callback
+ * @cb: netlink callback structure
+ * @primary_if: the primary interface pointer to return the result in
+ *
+ * Return: 0 or error code.
+ */
+static int
+batadv_mcast_netlink_get_primary(struct netlink_callback *cb,
+ struct batadv_hard_iface **primary_if)
+{
+ struct batadv_hard_iface *hard_iface = NULL;
+ struct net_device *mesh_iface;
+ struct batadv_priv *bat_priv;
+ int ret = 0;
+
+ mesh_iface = batadv_netlink_get_meshif(cb);
+ if (IS_ERR(mesh_iface))
+ return PTR_ERR(mesh_iface);
+
+ bat_priv = netdev_priv(mesh_iface);
+
+ hard_iface = batadv_primary_if_get_selected(bat_priv);
+ if (!hard_iface || hard_iface->if_status != BATADV_IF_ACTIVE) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+out:
+ dev_put(mesh_iface);
+
+ if (!ret && primary_if)
+ *primary_if = hard_iface;
+ else
+ batadv_hardif_put(hard_iface);
+
+ return ret;
+}
+
+/**
+ * batadv_mcast_flags_dump() - dump multicast flags table to a netlink socket
+ * @msg: buffer for the message
+ * @cb: callback structure containing arguments
+ *
+ * Return: message length.
+ */
+int batadv_mcast_flags_dump(struct sk_buff *msg, struct netlink_callback *cb)
+{
+ struct batadv_hard_iface *primary_if = NULL;
+ int portid = NETLINK_CB(cb->skb).portid;
+ struct batadv_priv *bat_priv;
+ long *bucket = &cb->args[0];
+ long *idx = &cb->args[1];
+ int ret;
+
+ ret = batadv_mcast_netlink_get_primary(cb, &primary_if);
+ if (ret)
+ return ret;
+
+ bat_priv = netdev_priv(primary_if->mesh_iface);
+ ret = __batadv_mcast_flags_dump(msg, portid, cb, bat_priv, bucket, idx);
+
+ batadv_hardif_put(primary_if);
+ return ret;
+}
+
+/**
+ * batadv_mcast_free() - free the multicast optimizations structures
+ * @bat_priv: the bat priv with all the mesh interface information
+ */
+void batadv_mcast_free(struct batadv_priv *bat_priv)
+{
+ cancel_delayed_work_sync(&bat_priv->mcast.work);
+
+ batadv_tvlv_container_unregister(bat_priv, BATADV_TVLV_MCAST, 2);
+ batadv_tvlv_handler_unregister(bat_priv, BATADV_TVLV_MCAST_TRACKER, 1);
+ batadv_tvlv_handler_unregister(bat_priv, BATADV_TVLV_MCAST, 2);
+
+ /* safely calling outside of worker, as worker was canceled above */
+ batadv_mcast_mla_tt_retract(bat_priv, NULL);
+}
+
+/**
+ * batadv_mcast_purge_orig() - reset originator global mcast state modifications
+ * @orig: the originator which is going to get purged
+ */
+void batadv_mcast_purge_orig(struct batadv_orig_node *orig)
+{
+ struct batadv_priv *bat_priv = orig->bat_priv;
+
+ spin_lock_bh(&orig->mcast_handler_lock);
+
+ batadv_mcast_want_unsnoop_update(bat_priv, orig, BATADV_NO_FLAGS);
+ batadv_mcast_want_ipv4_update(bat_priv, orig, BATADV_NO_FLAGS);
+ batadv_mcast_want_ipv6_update(bat_priv, orig, BATADV_NO_FLAGS);
+ batadv_mcast_want_rtr4_update(bat_priv, orig,
+ BATADV_MCAST_WANT_NO_RTR4);
+ batadv_mcast_want_rtr6_update(bat_priv, orig,
+ BATADV_MCAST_WANT_NO_RTR6);
+ batadv_mcast_have_mc_ptype_update(bat_priv, orig,
+ BATADV_MCAST_HAVE_MC_PTYPE_CAPA);
+
+ spin_unlock_bh(&orig->mcast_handler_lock);
+}
diff --git a/net/batman-adv/multicast.h b/net/batman-adv/multicast.h
new file mode 100644
index 000000000000..d97ee51d26f2
--- /dev/null
+++ b/net/batman-adv/multicast.h
@@ -0,0 +1,125 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) B.A.T.M.A.N. contributors:
+ *
+ * Linus Lüssing
+ */
+
+#ifndef _NET_BATMAN_ADV_MULTICAST_H_
+#define _NET_BATMAN_ADV_MULTICAST_H_
+
+#include "main.h"
+
+#include <linux/netlink.h>
+#include <linux/skbuff.h>
+#include <linux/types.h>
+
+/**
+ * enum batadv_forw_mode - the way a packet should be forwarded as
+ */
+enum batadv_forw_mode {
+ /**
+ * @BATADV_FORW_BCAST: forward the packet to all nodes via a batman-adv
+ * broadcast packet
+ */
+ BATADV_FORW_BCAST,
+
+ /**
+ * @BATADV_FORW_UCASTS: forward the packet to some nodes via one
+ * or more batman-adv unicast packets
+ */
+ BATADV_FORW_UCASTS,
+
+ /**
+ * @BATADV_FORW_MCAST: forward the packet to some nodes via a
+ * batman-adv multicast packet
+ */
+ BATADV_FORW_MCAST,
+
+ /** @BATADV_FORW_NONE: don't forward, drop it */
+ BATADV_FORW_NONE,
+};
+
+#ifdef CONFIG_BATMAN_ADV_MCAST
+
+enum batadv_forw_mode
+batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb,
+ unsigned short vid, int *is_routable);
+
+int batadv_mcast_forw_send(struct batadv_priv *bat_priv, struct sk_buff *skb,
+ unsigned short vid, int is_routable);
+
+void batadv_mcast_init(struct batadv_priv *bat_priv);
+
+int batadv_mcast_mesh_info_put(struct sk_buff *msg,
+ struct batadv_priv *bat_priv);
+
+int batadv_mcast_flags_dump(struct sk_buff *msg, struct netlink_callback *cb);
+
+void batadv_mcast_free(struct batadv_priv *bat_priv);
+
+void batadv_mcast_purge_orig(struct batadv_orig_node *orig_node);
+
+/* multicast_forw.c */
+
+int batadv_mcast_forw_tracker_tvlv_handler(struct batadv_priv *bat_priv,
+ struct sk_buff *skb);
+
+unsigned int batadv_mcast_forw_packet_hdrlen(unsigned int num_dests);
+
+bool batadv_mcast_forw_push(struct batadv_priv *bat_priv, struct sk_buff *skb,
+ unsigned short vid, int is_routable, int count);
+
+int batadv_mcast_forw_mcsend(struct batadv_priv *bat_priv, struct sk_buff *skb);
+
+#else
+
+static inline enum batadv_forw_mode
+batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb,
+ unsigned short vid, int *is_routable)
+{
+ return BATADV_FORW_BCAST;
+}
+
+static inline int
+batadv_mcast_forw_send(struct batadv_priv *bat_priv, struct sk_buff *skb,
+ unsigned short vid, int is_routable)
+{
+ kfree_skb(skb);
+ return NET_XMIT_DROP;
+}
+
+static inline int batadv_mcast_init(struct batadv_priv *bat_priv)
+{
+ return 0;
+}
+
+static inline int
+batadv_mcast_mesh_info_put(struct sk_buff *msg, struct batadv_priv *bat_priv)
+{
+ return 0;
+}
+
+static inline int batadv_mcast_flags_dump(struct sk_buff *msg,
+ struct netlink_callback *cb)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void batadv_mcast_free(struct batadv_priv *bat_priv)
+{
+}
+
+static inline void batadv_mcast_purge_orig(struct batadv_orig_node *orig_node)
+{
+}
+
+static inline int batadv_mcast_forw_mcsend(struct batadv_priv *bat_priv,
+ struct sk_buff *skb)
+{
+ kfree_skb(skb);
+ return NET_XMIT_DROP;
+}
+
+#endif /* CONFIG_BATMAN_ADV_MCAST */
+
+#endif /* _NET_BATMAN_ADV_MULTICAST_H_ */
diff --git a/net/batman-adv/multicast_forw.c b/net/batman-adv/multicast_forw.c
new file mode 100644
index 000000000000..b8668a80b94a
--- /dev/null
+++ b/net/batman-adv/multicast_forw.c
@@ -0,0 +1,1178 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) B.A.T.M.A.N. contributors:
+ *
+ * Linus Lüssing
+ */
+
+#include "multicast.h"
+#include "main.h"
+
+#include <linux/bug.h>
+#include <linux/build_bug.h>
+#include <linux/byteorder/generic.h>
+#include <linux/compiler.h>
+#include <linux/errno.h>
+#include <linux/etherdevice.h>
+#include <linux/gfp.h>
+#include <linux/if_ether.h>
+#include <linux/if_vlan.h>
+#include <linux/ipv6.h>
+#include <linux/limits.h>
+#include <linux/netdevice.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
+#include <linux/skbuff.h>
+#include <linux/stddef.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <uapi/linux/batadv_packet.h>
+
+#include "bridge_loop_avoidance.h"
+#include "originator.h"
+#include "send.h"
+#include "translation-table.h"
+
+#define batadv_mcast_forw_tracker_for_each_dest(dest, num_dests) \
+ for (; num_dests; num_dests--, (dest) += ETH_ALEN)
+
+#define batadv_mcast_forw_tracker_for_each_dest2(dest1, dest2, num_dests) \
+ for (; num_dests; num_dests--, (dest1) += ETH_ALEN, (dest2) += ETH_ALEN)
+
+/**
+ * batadv_mcast_forw_skb_push() - skb_push and memorize amount of pushed bytes
+ * @skb: the skb to push onto
+ * @size: the amount of bytes to push
+ * @len: stores the total amount of bytes pushed
+ *
+ * Performs an skb_push() onto the given skb and adds the amount of pushed bytes
+ * to the given len pointer.
+ *
+ * Return: the return value of the skb_push() call.
+ */
+static void *batadv_mcast_forw_skb_push(struct sk_buff *skb, size_t size,
+ unsigned short *len)
+{
+ *len += size;
+ return skb_push(skb, size);
+}
+
+/**
+ * batadv_mcast_forw_push_padding() - push 2 padding bytes to skb's front
+ * @skb: the skb to push onto
+ * @tvlv_len: stores the amount of currently pushed TVLV bytes
+ *
+ * Pushes two padding bytes to the front of the given skb.
+ *
+ * Return: On success a pointer to the first byte of the two pushed padding
+ * bytes within the skb. NULL otherwise.
+ */
+static char *
+batadv_mcast_forw_push_padding(struct sk_buff *skb, unsigned short *tvlv_len)
+{
+ const int pad_len = 2;
+ char *padding;
+
+ if (skb_headroom(skb) < pad_len)
+ return NULL;
+
+ padding = batadv_mcast_forw_skb_push(skb, pad_len, tvlv_len);
+ memset(padding, 0, pad_len);
+
+ return padding;
+}
+
+/**
+ * batadv_mcast_forw_push_est_padding() - push padding bytes if necessary
+ * @skb: the skb to potentially push the padding onto
+ * @count: the (estimated) number of originators the multicast packet needs to
+ * be sent to
+ * @tvlv_len: stores the amount of currently pushed TVLV bytes
+ *
+ * If the number of destination entries is even then this adds two
+ * padding bytes to the end of the tracker TVLV.
+ *
+ * Return: true on success or if no padding is needed, false otherwise.
+ */
+static bool
+batadv_mcast_forw_push_est_padding(struct sk_buff *skb, int count,
+ unsigned short *tvlv_len)
+{
+ if (!(count % 2) && !batadv_mcast_forw_push_padding(skb, tvlv_len))
+ return false;
+
+ return true;
+}
+
+/**
+ * batadv_mcast_forw_orig_entry() - get orig_node from an hlist node
+ * @node: the hlist node to get the orig_node from
+ * @entry_offset: the offset of the hlist node within the orig_node struct
+ *
+ * Return: The orig_node containing the hlist node on success, NULL on error.
+ */
+static struct batadv_orig_node *
+batadv_mcast_forw_orig_entry(struct hlist_node *node,
+ size_t entry_offset)
+{
+ /* sanity check */
+ switch (entry_offset) {
+ case offsetof(struct batadv_orig_node, mcast_want_all_ipv4_node):
+ case offsetof(struct batadv_orig_node, mcast_want_all_ipv6_node):
+ case offsetof(struct batadv_orig_node, mcast_want_all_rtr4_node):
+ case offsetof(struct batadv_orig_node, mcast_want_all_rtr6_node):
+ break;
+ default:
+ WARN_ON(1);
+ return NULL;
+ }
+
+ return (struct batadv_orig_node *)((void *)node - entry_offset);
+}
+
+/**
+ * batadv_mcast_forw_push_dest() - push an originator MAC address onto an skb
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: the skb to push the destination address onto
+ * @vid: the vlan identifier
+ * @orig_node: the originator node to get the MAC address from
+ * @num_dests: a pointer to store the number of pushed addresses in
+ * @tvlv_len: stores the amount of currently pushed TVLV bytes
+ *
+ * If the orig_node is a BLA backbone gateway, if there is not enough skb
+ * headroom available or if num_dests is already at its maximum (65535) then
+ * neither the skb nor num_dests is changed. Otherwise the originator's MAC
+ * address is pushed onto the given skb and num_dests incremented by one.
+ *
+ * Return: true if the orig_node is a backbone gateway or if an orig address
+ * was pushed successfully, false otherwise.
+ */
+static bool batadv_mcast_forw_push_dest(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, unsigned short vid,
+ struct batadv_orig_node *orig_node,
+ unsigned short *num_dests,
+ unsigned short *tvlv_len)
+{
+ BUILD_BUG_ON(sizeof_field(struct batadv_tvlv_mcast_tracker, num_dests)
+ != sizeof(__be16));
+
+ /* Avoid sending to other BLA gateways - they already got the frame from
+ * the LAN side we share with them.
+ * TODO: Refactor to take BLA into account earlier in mode check.
+ */
+ if (batadv_bla_is_backbone_gw_orig(bat_priv, orig_node->orig, vid))
+ return true;
+
+ if (skb_headroom(skb) < ETH_ALEN || *num_dests == U16_MAX)
+ return false;
+
+ batadv_mcast_forw_skb_push(skb, ETH_ALEN, tvlv_len);
+ ether_addr_copy(skb->data, orig_node->orig);
+ (*num_dests)++;
+
+ return true;
+}
+
+/**
+ * batadv_mcast_forw_push_dests_list() - push originators from list onto an skb
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: the skb to push the destination addresses onto
+ * @vid: the vlan identifier
+ * @head: the list to gather originators from
+ * @entry_offset: offset of an hlist node in an orig_node structure
+ * @num_dests: a pointer to store the number of pushed addresses in
+ * @tvlv_len: stores the amount of currently pushed TVLV bytes
+ *
+ * Push the MAC addresses of all originators in the given list onto the given
+ * skb.
+ *
+ * Return: true on success, false otherwise.
+ */
+static int batadv_mcast_forw_push_dests_list(struct batadv_priv *bat_priv,
+ struct sk_buff *skb,
+ unsigned short vid,
+ struct hlist_head *head,
+ size_t entry_offset,
+ unsigned short *num_dests,
+ unsigned short *tvlv_len)
+{
+ struct hlist_node *node;
+ struct batadv_orig_node *orig_node;
+
+ rcu_read_lock();
+ __hlist_for_each_rcu(node, head) {
+ orig_node = batadv_mcast_forw_orig_entry(node, entry_offset);
+ if (!orig_node ||
+ !batadv_mcast_forw_push_dest(bat_priv, skb, vid, orig_node,
+ num_dests, tvlv_len)) {
+ rcu_read_unlock();
+ return false;
+ }
+ }
+ rcu_read_unlock();
+
+ return true;
+}
+
+/**
+ * batadv_mcast_forw_push_tt() - push originators with interest through TT
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: the skb to push the destination addresses onto
+ * @vid: the vlan identifier
+ * @num_dests: a pointer to store the number of pushed addresses in
+ * @tvlv_len: stores the amount of currently pushed TVLV bytes
+ *
+ * Push the MAC addresses of all originators which have indicated interest in
+ * this multicast packet through the translation table onto the given skb.
+ *
+ * Return: true on success, false otherwise.
+ */
+static bool
+batadv_mcast_forw_push_tt(struct batadv_priv *bat_priv, struct sk_buff *skb,
+ unsigned short vid, unsigned short *num_dests,
+ unsigned short *tvlv_len)
+{
+ struct batadv_tt_orig_list_entry *orig_entry;
+
+ struct batadv_tt_global_entry *tt_global;
+ const u8 *addr = eth_hdr(skb)->h_dest;
+
+ /* ok */
+ int ret = true;
+
+ tt_global = batadv_tt_global_hash_find(bat_priv, addr, vid);
+ if (!tt_global)
+ goto out;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(orig_entry, &tt_global->orig_list, list) {
+ if (!batadv_mcast_forw_push_dest(bat_priv, skb, vid,
+ orig_entry->orig_node,
+ num_dests, tvlv_len)) {
+ ret = false;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ batadv_tt_global_entry_put(tt_global);
+
+out:
+ return ret;
+}
+
+/**
+ * batadv_mcast_forw_push_want_all() - push originators with want-all flag
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: the skb to push the destination addresses onto
+ * @vid: the vlan identifier
+ * @num_dests: a pointer to store the number of pushed addresses in
+ * @tvlv_len: stores the amount of currently pushed TVLV bytes
+ *
+ * Push the MAC addresses of all originators which have indicated interest in
+ * this multicast packet through the want-all flag onto the given skb.
+ *
+ * Return: true on success, false otherwise.
+ */
+static bool batadv_mcast_forw_push_want_all(struct batadv_priv *bat_priv,
+ struct sk_buff *skb,
+ unsigned short vid,
+ unsigned short *num_dests,
+ unsigned short *tvlv_len)
+{
+ struct hlist_head *head = NULL;
+ size_t offset;
+ int ret;
+
+ switch (eth_hdr(skb)->h_proto) {
+ case htons(ETH_P_IP):
+ head = &bat_priv->mcast.want_all_ipv4_list;
+ offset = offsetof(struct batadv_orig_node,
+ mcast_want_all_ipv4_node);
+ break;
+ case htons(ETH_P_IPV6):
+ head = &bat_priv->mcast.want_all_ipv6_list;
+ offset = offsetof(struct batadv_orig_node,
+ mcast_want_all_ipv6_node);
+ break;
+ default:
+ return false;
+ }
+
+ ret = batadv_mcast_forw_push_dests_list(bat_priv, skb, vid, head,
+ offset, num_dests, tvlv_len);
+ if (!ret)
+ return false;
+
+ return true;
+}
+
+/**
+ * batadv_mcast_forw_push_want_rtr() - push originators with want-router flag
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: the skb to push the destination addresses onto
+ * @vid: the vlan identifier
+ * @num_dests: a pointer to store the number of pushed addresses in
+ * @tvlv_len: stores the amount of currently pushed TVLV bytes
+ *
+ * Push the MAC addresses of all originators which have indicated interest in
+ * this multicast packet through the want-all-rtr flag onto the given skb.
+ *
+ * Return: true on success, false otherwise.
+ */
+static bool batadv_mcast_forw_push_want_rtr(struct batadv_priv *bat_priv,
+ struct sk_buff *skb,
+ unsigned short vid,
+ unsigned short *num_dests,
+ unsigned short *tvlv_len)
+{
+ struct hlist_head *head = NULL;
+ size_t offset;
+ int ret;
+
+ switch (eth_hdr(skb)->h_proto) {
+ case htons(ETH_P_IP):
+ head = &bat_priv->mcast.want_all_rtr4_list;
+ offset = offsetof(struct batadv_orig_node,
+ mcast_want_all_rtr4_node);
+ break;
+ case htons(ETH_P_IPV6):
+ head = &bat_priv->mcast.want_all_rtr6_list;
+ offset = offsetof(struct batadv_orig_node,
+ mcast_want_all_rtr6_node);
+ break;
+ default:
+ return false;
+ }
+
+ ret = batadv_mcast_forw_push_dests_list(bat_priv, skb, vid, head,
+ offset, num_dests, tvlv_len);
+ if (!ret)
+ return false;
+
+ return true;
+}
+
+/**
+ * batadv_mcast_forw_scrape() - remove bytes within skb data
+ * @skb: the skb to remove bytes from
+ * @offset: the offset from the skb data from which to scrape
+ * @len: the amount of bytes to scrape starting from the offset
+ *
+ * Scrapes/removes len bytes from the given skb at the given offset from the
+ * skb data.
+ *
+ * Caller needs to ensure that the region from the skb data's start up
+ * to/including the to be removed bytes are linearized.
+ */
+static void batadv_mcast_forw_scrape(struct sk_buff *skb,
+ unsigned short offset,
+ unsigned short len)
+{
+ char *to, *from;
+
+ SKB_LINEAR_ASSERT(skb);
+
+ to = skb_pull(skb, len);
+ from = to - len;
+
+ memmove(to, from, offset);
+}
+
+/**
+ * batadv_mcast_forw_push_scrape_padding() - remove TVLV padding
+ * @skb: the skb to potentially adjust the TVLV's padding on
+ * @tvlv_len: stores the amount of currently pushed TVLV bytes
+ *
+ * Remove two padding bytes from the end of the multicast tracker TVLV,
+ * from before the payload data.
+ *
+ * Caller needs to ensure that the TVLV bytes are linearized.
+ */
+static void batadv_mcast_forw_push_scrape_padding(struct sk_buff *skb,
+ unsigned short *tvlv_len)
+{
+ const int pad_len = 2;
+
+ batadv_mcast_forw_scrape(skb, *tvlv_len - pad_len, pad_len);
+ *tvlv_len -= pad_len;
+}
+
+/**
+ * batadv_mcast_forw_push_insert_padding() - insert TVLV padding
+ * @skb: the skb to potentially adjust the TVLV's padding on
+ * @tvlv_len: stores the amount of currently pushed TVLV bytes
+ *
+ * Inserts two padding bytes at the end of the multicast tracker TVLV,
+ * before the payload data in the given skb.
+ *
+ * Return: true on success, false otherwise.
+ */
+static bool batadv_mcast_forw_push_insert_padding(struct sk_buff *skb,
+ unsigned short *tvlv_len)
+{
+ unsigned short offset = *tvlv_len;
+ char *to, *from = skb->data;
+
+ to = batadv_mcast_forw_push_padding(skb, tvlv_len);
+ if (!to)
+ return false;
+
+ memmove(to, from, offset);
+ memset(to + offset, 0, *tvlv_len - offset);
+ return true;
+}
+
+/**
+ * batadv_mcast_forw_push_adjust_padding() - adjust padding if necessary
+ * @skb: the skb to potentially adjust the TVLV's padding on
+ * @count: the estimated number of originators the multicast packet needs to
+ * be sent to
+ * @num_dests_pushed: the number of originators that were actually added to the
+ * multicast packet's tracker TVLV
+ * @tvlv_len: stores the amount of currently pushed TVLV bytes
+ *
+ * Adjusts the padding in the multicast packet's tracker TVLV depending on the
+ * initially estimated amount of destinations versus the amount of destinations
+ * that were actually added to the tracker TVLV.
+ *
+ * If the initial estimate was correct or at least the oddness was the same then
+ * no padding adjustment is performed.
+ * If the initially estimated number was even, so padding was initially added,
+ * but it turned out to be odd then padding is removed.
+ * If the initially estimated number was odd, so no padding was initially added,
+ * but it turned out to be even then padding is added.
+ *
+ * Return: true if no padding adjustment is needed or the adjustment was
+ * successful, false otherwise.
+ */
+static bool
+batadv_mcast_forw_push_adjust_padding(struct sk_buff *skb, int *count,
+ unsigned short num_dests_pushed,
+ unsigned short *tvlv_len)
+{
+ int ret = true;
+
+ if (likely((num_dests_pushed % 2) == (*count % 2)))
+ goto out;
+
+ /**
+ * estimated even number of destinations, but turned out to be odd
+ * -> remove padding
+ */
+ if (!(*count % 2) && (num_dests_pushed % 2))
+ batadv_mcast_forw_push_scrape_padding(skb, tvlv_len);
+ /**
+ * estimated odd number of destinations, but turned out to be even
+ * -> add padding
+ */
+ else if ((*count % 2) && (!(num_dests_pushed % 2)))
+ ret = batadv_mcast_forw_push_insert_padding(skb, tvlv_len);
+
+out:
+ *count = num_dests_pushed;
+ return ret;
+}
+
+/**
+ * batadv_mcast_forw_push_dests() - push originator addresses onto an skb
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: the skb to push the destination addresses onto
+ * @vid: the vlan identifier
+ * @is_routable: indicates whether the destination is routable
+ * @count: the number of originators the multicast packet needs to be sent to
+ * @tvlv_len: stores the amount of currently pushed TVLV bytes
+ *
+ * Push the MAC addresses of all originators which have indicated interest in
+ * this multicast packet onto the given skb.
+ *
+ * Return: -ENOMEM if there is not enough skb headroom available. Otherwise, on
+ * success 0.
+ */
+static int
+batadv_mcast_forw_push_dests(struct batadv_priv *bat_priv, struct sk_buff *skb,
+ unsigned short vid, int is_routable, int *count,
+ unsigned short *tvlv_len)
+{
+ unsigned short num_dests = 0;
+
+ if (!batadv_mcast_forw_push_est_padding(skb, *count, tvlv_len))
+ goto err;
+
+ if (!batadv_mcast_forw_push_tt(bat_priv, skb, vid, &num_dests,
+ tvlv_len))
+ goto err;
+
+ if (!batadv_mcast_forw_push_want_all(bat_priv, skb, vid, &num_dests,
+ tvlv_len))
+ goto err;
+
+ if (is_routable &&
+ !batadv_mcast_forw_push_want_rtr(bat_priv, skb, vid, &num_dests,
+ tvlv_len))
+ goto err;
+
+ if (!batadv_mcast_forw_push_adjust_padding(skb, count, num_dests,
+ tvlv_len))
+ goto err;
+
+ return 0;
+err:
+ return -ENOMEM;
+}
+
+/**
+ * batadv_mcast_forw_push_tracker() - push a multicast tracker TVLV header
+ * @skb: the skb to push the tracker TVLV onto
+ * @num_dests: the number of destination addresses to set in the header
+ * @tvlv_len: stores the amount of currently pushed TVLV bytes
+ *
+ * Pushes a multicast tracker TVLV header onto the given skb, including the
+ * generic TVLV header but excluding the destination MAC addresses.
+ *
+ * The provided num_dests value is taken into consideration to set the
+ * num_dests field in the tracker header and to set the appropriate TVLV length
+ * value fields.
+ *
+ * Return: -ENOMEM if there is not enough skb headroom available. Otherwise, on
+ * success 0.
+ */
+static int batadv_mcast_forw_push_tracker(struct sk_buff *skb, int num_dests,
+ unsigned short *tvlv_len)
+{
+ struct batadv_tvlv_mcast_tracker *mcast_tracker;
+ struct batadv_tvlv_hdr *tvlv_hdr;
+ unsigned int tvlv_value_len;
+
+ if (skb_headroom(skb) < sizeof(*mcast_tracker) + sizeof(*tvlv_hdr))
+ return -ENOMEM;
+
+ tvlv_value_len = sizeof(*mcast_tracker) + *tvlv_len;
+ if (tvlv_value_len + sizeof(*tvlv_hdr) > U16_MAX)
+ return -ENOMEM;
+
+ batadv_mcast_forw_skb_push(skb, sizeof(*mcast_tracker), tvlv_len);
+ mcast_tracker = (struct batadv_tvlv_mcast_tracker *)skb->data;
+ mcast_tracker->num_dests = htons(num_dests);
+
+ skb_reset_network_header(skb);
+
+ batadv_mcast_forw_skb_push(skb, sizeof(*tvlv_hdr), tvlv_len);
+ tvlv_hdr = (struct batadv_tvlv_hdr *)skb->data;
+ tvlv_hdr->type = BATADV_TVLV_MCAST_TRACKER;
+ tvlv_hdr->version = 1;
+ tvlv_hdr->len = htons(tvlv_value_len);
+
+ return 0;
+}
+
+/**
+ * batadv_mcast_forw_push_tvlvs() - push a multicast tracker TVLV onto an skb
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: the skb to push the tracker TVLV onto
+ * @vid: the vlan identifier
+ * @is_routable: indicates whether the destination is routable
+ * @count: the number of originators the multicast packet needs to be sent to
+ * @tvlv_len: stores the amount of currently pushed TVLV bytes
+ *
+ * Pushes a multicast tracker TVLV onto the given skb, including the collected
+ * destination MAC addresses and the generic TVLV header.
+ *
+ * Return: -ENOMEM if there is not enough skb headroom available. Otherwise, on
+ * success 0.
+ */
+static int
+batadv_mcast_forw_push_tvlvs(struct batadv_priv *bat_priv, struct sk_buff *skb,
+ unsigned short vid, int is_routable, int count,
+ unsigned short *tvlv_len)
+{
+ int ret;
+
+ ret = batadv_mcast_forw_push_dests(bat_priv, skb, vid, is_routable,
+ &count, tvlv_len);
+ if (ret < 0)
+ return ret;
+
+ ret = batadv_mcast_forw_push_tracker(skb, count, tvlv_len);
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+/**
+ * batadv_mcast_forw_push_hdr() - push a multicast packet header onto an skb
+ * @skb: the skb to push the header onto
+ * @tvlv_len: the total TVLV length value to set in the header
+ *
+ * Pushes a batman-adv multicast packet header onto the given skb and sets
+ * the provided total TVLV length value in it.
+ *
+ * Caller needs to ensure enough skb headroom is available.
+ *
+ * Return: -ENOMEM if there is not enough skb headroom available. Otherwise, on
+ * success 0.
+ */
+static int
+batadv_mcast_forw_push_hdr(struct sk_buff *skb, unsigned short tvlv_len)
+{
+ struct batadv_mcast_packet *mcast_packet;
+
+ if (skb_headroom(skb) < sizeof(*mcast_packet))
+ return -ENOMEM;
+
+ skb_push(skb, sizeof(*mcast_packet));
+
+ mcast_packet = (struct batadv_mcast_packet *)skb->data;
+ mcast_packet->version = BATADV_COMPAT_VERSION;
+ mcast_packet->ttl = BATADV_TTL;
+ mcast_packet->packet_type = BATADV_MCAST;
+ mcast_packet->reserved = 0;
+ mcast_packet->tvlv_len = htons(tvlv_len);
+
+ return 0;
+}
+
+/**
+ * batadv_mcast_forw_scrub_dests() - scrub destinations in a tracker TVLV
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @comp_neigh: next hop neighbor to scrub+collect destinations for
+ * @dest: start MAC entry in original skb's tracker TVLV
+ * @next_dest: start MAC entry in to be sent skb's tracker TVLV
+ * @num_dests: number of remaining destination MAC entries to iterate over
+ *
+ * This sorts destination entries into either the original batman-adv
+ * multicast packet or the skb (copy) that is going to be sent to comp_neigh
+ * next.
+ *
+ * In preparation for the next, to be (unicast) transmitted batman-adv multicast
+ * packet skb to be sent to the given neighbor node, tries to collect all
+ * originator MAC addresses that have the given neighbor node as their next hop
+ * in the to be transmitted skb (copy), which next_dest points into. That is we
+ * zero all destination entries in next_dest which do not have comp_neigh as
+ * their next hop. And zero all destination entries in the original skb that
+ * would have comp_neigh as their next hop (to avoid redundant transmissions and
+ * duplicated payload later).
+ */
+static void
+batadv_mcast_forw_scrub_dests(struct batadv_priv *bat_priv,
+ struct batadv_neigh_node *comp_neigh, u8 *dest,
+ u8 *next_dest, u16 num_dests)
+{
+ struct batadv_neigh_node *next_neigh;
+
+ /* skip first entry, this is what we are comparing with */
+ eth_zero_addr(dest);
+ dest += ETH_ALEN;
+ next_dest += ETH_ALEN;
+ num_dests--;
+
+ batadv_mcast_forw_tracker_for_each_dest2(dest, next_dest, num_dests) {
+ if (is_zero_ether_addr(next_dest))
+ continue;
+
+ /* sanity check, we expect unicast destinations */
+ if (is_multicast_ether_addr(next_dest)) {
+ eth_zero_addr(dest);
+ eth_zero_addr(next_dest);
+ continue;
+ }
+
+ next_neigh = batadv_orig_to_router(bat_priv, next_dest, NULL);
+ if (!next_neigh) {
+ eth_zero_addr(next_dest);
+ continue;
+ }
+
+ if (!batadv_compare_eth(next_neigh->addr, comp_neigh->addr)) {
+ eth_zero_addr(next_dest);
+ batadv_neigh_node_put(next_neigh);
+ continue;
+ }
+
+ /* found an entry for our next packet to transmit, so remove it
+ * from the original packet
+ */
+ eth_zero_addr(dest);
+ batadv_neigh_node_put(next_neigh);
+ }
+}
+
+/**
+ * batadv_mcast_forw_shrink_fill() - swap slot with next non-zero destination
+ * @slot: the to be filled zero-MAC destination entry in a tracker TVLV
+ * @num_dests_slot: remaining entries in tracker TVLV from/including slot
+ *
+ * Searches for the next non-zero-MAC destination entry in a tracker TVLV after
+ * the given slot pointer. And if found, swaps it with the zero-MAC destination
+ * entry which the slot points to.
+ *
+ * Return: true if slot was swapped/filled successfully, false otherwise.
+ */
+static bool batadv_mcast_forw_shrink_fill(u8 *slot, u16 num_dests_slot)
+{
+ u16 num_dests_filler;
+ u8 *filler;
+
+ /* sanity check, should not happen */
+ if (!num_dests_slot)
+ return false;
+
+ num_dests_filler = num_dests_slot - 1;
+ filler = slot + ETH_ALEN;
+
+ /* find a candidate to fill the empty slot */
+ batadv_mcast_forw_tracker_for_each_dest(filler, num_dests_filler) {
+ if (is_zero_ether_addr(filler))
+ continue;
+
+ ether_addr_copy(slot, filler);
+ eth_zero_addr(filler);
+ return true;
+ }
+
+ return false;
+}
+
+/**
+ * batadv_mcast_forw_shrink_pack_dests() - pack destinations of a tracker TVLV
+ * @skb: the batman-adv multicast packet to compact destinations in
+ *
+ * Compacts the originator destination MAC addresses in the multicast tracker
+ * TVLV of the given multicast packet. This is done by moving all non-zero
+ * MAC addresses in direction of the skb head and all zero MAC addresses in skb
+ * tail direction, within the multicast tracker TVLV.
+ *
+ * Return: The number of consecutive zero MAC address destinations which are
+ * now at the end of the multicast tracker TVLV.
+ */
+static int batadv_mcast_forw_shrink_pack_dests(struct sk_buff *skb)
+{
+ struct batadv_tvlv_mcast_tracker *mcast_tracker;
+ unsigned char *skb_net_hdr;
+ u16 num_dests_slot;
+ u8 *slot;
+
+ skb_net_hdr = skb_network_header(skb);
+ mcast_tracker = (struct batadv_tvlv_mcast_tracker *)skb_net_hdr;
+ num_dests_slot = ntohs(mcast_tracker->num_dests);
+
+ slot = (u8 *)mcast_tracker + sizeof(*mcast_tracker);
+
+ batadv_mcast_forw_tracker_for_each_dest(slot, num_dests_slot) {
+ /* find an empty slot */
+ if (!is_zero_ether_addr(slot))
+ continue;
+
+ if (!batadv_mcast_forw_shrink_fill(slot, num_dests_slot))
+ /* could not find a filler, so we successfully packed
+ * and can stop - and must not reduce num_dests_slot!
+ */
+ break;
+ }
+
+ /* num_dests_slot is now the amount of reduced, zeroed
+ * destinations at the end of the tracker TVLV
+ */
+ return num_dests_slot;
+}
+
+/**
+ * batadv_mcast_forw_shrink_align_offset() - get new alignment offset
+ * @num_dests_old: the old, to be updated amount of destination nodes
+ * @num_dests_reduce: the number of destinations that were removed
+ *
+ * Calculates the amount of potential extra alignment offset that is needed to
+ * adjust the TVLV padding after the change in destination nodes.
+ *
+ * Return:
+ * 0: If no change to padding is needed.
+ * 2: If padding needs to be removed.
+ * -2: If padding needs to be added.
+ */
+static short
+batadv_mcast_forw_shrink_align_offset(unsigned int num_dests_old,
+ unsigned int num_dests_reduce)
+{
+ /* even amount of removed destinations -> no alignment change */
+ if (!(num_dests_reduce % 2))
+ return 0;
+
+ /* even to odd amount of destinations -> remove padding */
+ if (!(num_dests_old % 2))
+ return 2;
+
+ /* odd to even amount of destinations -> add padding */
+ return -2;
+}
+
+/**
+ * batadv_mcast_forw_shrink_update_headers() - update shrunk mc packet headers
+ * @skb: the batman-adv multicast packet to update headers of
+ * @num_dests_reduce: the number of destinations that were removed
+ *
+ * This updates any fields of a batman-adv multicast packet that are affected
+ * by the reduced number of destinations in the multicast tracket TVLV. In
+ * particular this updates:
+ *
+ * The num_dest field of the multicast tracker TVLV.
+ * The TVLV length field of the according generic TVLV header.
+ * The batman-adv multicast packet's total TVLV length field.
+ *
+ * Return: The offset in skb's tail direction at which the new batman-adv
+ * multicast packet header needs to start.
+ */
+static unsigned int
+batadv_mcast_forw_shrink_update_headers(struct sk_buff *skb,
+ unsigned int num_dests_reduce)
+{
+ struct batadv_tvlv_mcast_tracker *mcast_tracker;
+ struct batadv_mcast_packet *mcast_packet;
+ struct batadv_tvlv_hdr *tvlv_hdr;
+ unsigned char *skb_net_hdr;
+ unsigned int offset;
+ short align_offset;
+ u16 num_dests;
+
+ skb_net_hdr = skb_network_header(skb);
+ mcast_tracker = (struct batadv_tvlv_mcast_tracker *)skb_net_hdr;
+ num_dests = ntohs(mcast_tracker->num_dests);
+
+ align_offset = batadv_mcast_forw_shrink_align_offset(num_dests,
+ num_dests_reduce);
+ offset = ETH_ALEN * num_dests_reduce + align_offset;
+ num_dests -= num_dests_reduce;
+
+ /* update tracker header */
+ mcast_tracker->num_dests = htons(num_dests);
+
+ /* update tracker's tvlv header's length field */
+ tvlv_hdr = (struct batadv_tvlv_hdr *)(skb_network_header(skb) -
+ sizeof(*tvlv_hdr));
+ tvlv_hdr->len = htons(ntohs(tvlv_hdr->len) - offset);
+
+ /* update multicast packet header's tvlv length field */
+ mcast_packet = (struct batadv_mcast_packet *)skb->data;
+ mcast_packet->tvlv_len = htons(ntohs(mcast_packet->tvlv_len) - offset);
+
+ return offset;
+}
+
+/**
+ * batadv_mcast_forw_shrink_move_headers() - move multicast headers by offset
+ * @skb: the batman-adv multicast packet to move headers for
+ * @offset: a non-negative offset to move headers by, towards the skb tail
+ *
+ * Moves the batman-adv multicast packet header, its multicast tracker TVLV and
+ * any TVLVs in between by the given offset in direction towards the tail.
+ */
+static void
+batadv_mcast_forw_shrink_move_headers(struct sk_buff *skb, unsigned int offset)
+{
+ struct batadv_tvlv_mcast_tracker *mcast_tracker;
+ unsigned char *skb_net_hdr;
+ unsigned int len;
+ u16 num_dests;
+
+ skb_net_hdr = skb_network_header(skb);
+ mcast_tracker = (struct batadv_tvlv_mcast_tracker *)skb_net_hdr;
+ num_dests = ntohs(mcast_tracker->num_dests);
+ len = skb_network_offset(skb) + sizeof(*mcast_tracker);
+ len += num_dests * ETH_ALEN;
+
+ batadv_mcast_forw_scrape(skb, len, offset);
+}
+
+/**
+ * batadv_mcast_forw_shrink_tracker() - remove zero addresses in a tracker tvlv
+ * @skb: the batman-adv multicast packet to (potentially) shrink
+ *
+ * Removes all destinations with a zero MAC addresses (00:00:00:00:00:00) from
+ * the given batman-adv multicast packet's tracker TVLV and updates headers
+ * accordingly to maintain a valid batman-adv multicast packet.
+ */
+static void batadv_mcast_forw_shrink_tracker(struct sk_buff *skb)
+{
+ unsigned int offset;
+ u16 dests_reduced;
+
+ dests_reduced = batadv_mcast_forw_shrink_pack_dests(skb);
+ if (!dests_reduced)
+ return;
+
+ offset = batadv_mcast_forw_shrink_update_headers(skb, dests_reduced);
+ batadv_mcast_forw_shrink_move_headers(skb, offset);
+}
+
+/**
+ * batadv_mcast_forw_packet() - forward a batman-adv multicast packet
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: the received or locally generated batman-adv multicast packet
+ * @local_xmit: indicates that the packet was locally generated and not received
+ *
+ * Parses the tracker TVLV of a batman-adv multicast packet and forwards the
+ * packet as indicated in this TVLV.
+ *
+ * Caller needs to set the skb network header to the start of the multicast
+ * tracker TVLV (excluding the generic TVLV header) and the skb transport header
+ * to the next byte after this multicast tracker TVLV.
+ *
+ * Caller needs to free the skb.
+ *
+ * Return: NET_RX_SUCCESS or NET_RX_DROP on success or a negative error
+ * code on failure. NET_RX_SUCCESS if the received packet is supposed to be
+ * decapsulated and forwarded to the own mesh interface, NET_RX_DROP otherwise.
+ */
+static int batadv_mcast_forw_packet(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, bool local_xmit)
+{
+ struct batadv_tvlv_mcast_tracker *mcast_tracker;
+ struct batadv_neigh_node *neigh_node;
+ unsigned long offset, num_dests_off;
+ struct sk_buff *nexthop_skb;
+ unsigned char *skb_net_hdr;
+ bool local_recv = false;
+ unsigned int tvlv_len;
+ bool xmitted = false;
+ u8 *dest, *next_dest;
+ u16 num_dests;
+ int ret;
+
+ /* (at least) TVLV part needs to be linearized */
+ SKB_LINEAR_ASSERT(skb);
+
+ /* check if num_dests is within skb length */
+ num_dests_off = offsetof(struct batadv_tvlv_mcast_tracker, num_dests);
+ if (num_dests_off > skb_network_header_len(skb))
+ return -EINVAL;
+
+ skb_net_hdr = skb_network_header(skb);
+ mcast_tracker = (struct batadv_tvlv_mcast_tracker *)skb_net_hdr;
+ num_dests = ntohs(mcast_tracker->num_dests);
+
+ dest = (u8 *)mcast_tracker + sizeof(*mcast_tracker);
+
+ /* check if full tracker tvlv is within skb length */
+ tvlv_len = sizeof(*mcast_tracker) + ETH_ALEN * num_dests;
+ if (tvlv_len > skb_network_header_len(skb))
+ return -EINVAL;
+
+ /* invalidate checksum: */
+ skb->ip_summed = CHECKSUM_NONE;
+
+ batadv_mcast_forw_tracker_for_each_dest(dest, num_dests) {
+ if (is_zero_ether_addr(dest))
+ continue;
+
+ /* only unicast originator addresses supported */
+ if (is_multicast_ether_addr(dest)) {
+ eth_zero_addr(dest);
+ continue;
+ }
+
+ if (batadv_is_my_mac(bat_priv, dest)) {
+ eth_zero_addr(dest);
+ local_recv = true;
+ continue;
+ }
+
+ neigh_node = batadv_orig_to_router(bat_priv, dest, NULL);
+ if (!neigh_node) {
+ eth_zero_addr(dest);
+ continue;
+ }
+
+ nexthop_skb = skb_copy(skb, GFP_ATOMIC);
+ if (!nexthop_skb) {
+ batadv_neigh_node_put(neigh_node);
+ return -ENOMEM;
+ }
+
+ offset = dest - skb->data;
+ next_dest = nexthop_skb->data + offset;
+
+ batadv_mcast_forw_scrub_dests(bat_priv, neigh_node, dest,
+ next_dest, num_dests);
+ batadv_mcast_forw_shrink_tracker(nexthop_skb);
+
+ batadv_inc_counter(bat_priv, BATADV_CNT_MCAST_TX);
+ batadv_add_counter(bat_priv, BATADV_CNT_MCAST_TX_BYTES,
+ nexthop_skb->len + ETH_HLEN);
+ xmitted = true;
+ ret = batadv_send_unicast_skb(nexthop_skb, neigh_node);
+
+ batadv_neigh_node_put(neigh_node);
+
+ if (ret < 0)
+ return ret;
+ }
+
+ if (xmitted) {
+ if (local_xmit) {
+ batadv_inc_counter(bat_priv, BATADV_CNT_MCAST_TX_LOCAL);
+ batadv_add_counter(bat_priv,
+ BATADV_CNT_MCAST_TX_LOCAL_BYTES,
+ skb->len -
+ skb_transport_offset(skb));
+ } else {
+ batadv_inc_counter(bat_priv, BATADV_CNT_MCAST_FWD);
+ batadv_add_counter(bat_priv, BATADV_CNT_MCAST_FWD_BYTES,
+ skb->len + ETH_HLEN);
+ }
+ }
+
+ if (local_recv)
+ return NET_RX_SUCCESS;
+ else
+ return NET_RX_DROP;
+}
+
+/**
+ * batadv_mcast_forw_tracker_tvlv_handler() - handle an mcast tracker tvlv
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: the received batman-adv multicast packet
+ *
+ * Parses the tracker TVLV of an incoming batman-adv multicast packet and
+ * forwards the packet as indicated in this TVLV.
+ *
+ * Caller needs to set the skb network header to the start of the multicast
+ * tracker TVLV (excluding the generic TVLV header) and the skb transport header
+ * to the next byte after this multicast tracker TVLV.
+ *
+ * Caller needs to free the skb.
+ *
+ * Return: NET_RX_SUCCESS or NET_RX_DROP on success or a negative error
+ * code on failure. NET_RX_SUCCESS if the received packet is supposed to be
+ * decapsulated and forwarded to the own mesh interface, NET_RX_DROP otherwise.
+ */
+int batadv_mcast_forw_tracker_tvlv_handler(struct batadv_priv *bat_priv,
+ struct sk_buff *skb)
+{
+ return batadv_mcast_forw_packet(bat_priv, skb, false);
+}
+
+/**
+ * batadv_mcast_forw_packet_hdrlen() - multicast packet header length
+ * @num_dests: number of destination nodes
+ *
+ * Calculates the total batman-adv multicast packet header length for a given
+ * number of destination nodes (excluding the outer ethernet frame).
+ *
+ * Return: The calculated total batman-adv multicast packet header length.
+ */
+unsigned int batadv_mcast_forw_packet_hdrlen(unsigned int num_dests)
+{
+ /**
+ * If the number of destination entries is even then we need to add
+ * two byte padding to the tracker TVLV.
+ */
+ int padding = (!(num_dests % 2)) ? 2 : 0;
+
+ return padding + num_dests * ETH_ALEN +
+ sizeof(struct batadv_tvlv_mcast_tracker) +
+ sizeof(struct batadv_tvlv_hdr) +
+ sizeof(struct batadv_mcast_packet);
+}
+
+/**
+ * batadv_mcast_forw_expand_head() - expand headroom for an mcast packet
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: the multicast packet to send
+ *
+ * Tries to expand an skb's headroom so that its head to tail is 1298
+ * bytes (minimum IPv6 MTU + vlan ethernet header size) large.
+ *
+ * Return: -EINVAL if the given skb's length is too large or -ENOMEM on memory
+ * allocation failure. Otherwise, on success, zero is returned.
+ */
+static int batadv_mcast_forw_expand_head(struct batadv_priv *bat_priv,
+ struct sk_buff *skb)
+{
+ int hdr_size = VLAN_ETH_HLEN + IPV6_MIN_MTU - skb->len;
+
+ /* TODO: Could be tightened to actual number of destination nodes?
+ * But it's tricky, number of destinations might have increased since
+ * we last checked.
+ */
+ if (hdr_size < 0) {
+ /* batadv_mcast_forw_mode_check_count() should ensure we do not
+ * end up here
+ */
+ WARN_ON(1);
+ return -EINVAL;
+ }
+
+ if (skb_headroom(skb) < hdr_size &&
+ pskb_expand_head(skb, hdr_size, 0, GFP_ATOMIC) < 0)
+ return -ENOMEM;
+
+ return 0;
+}
+
+/**
+ * batadv_mcast_forw_push() - encapsulate skb in a batman-adv multicast packet
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: the multicast packet to encapsulate and send
+ * @vid: the vlan identifier
+ * @is_routable: indicates whether the destination is routable
+ * @count: the number of originators the multicast packet needs to be sent to
+ *
+ * Encapsulates the given multicast packet in a batman-adv multicast packet.
+ * A multicast tracker TVLV with destination originator addresses for any node
+ * that signaled interest in it, that is either via the translation table or the
+ * according want-all flags, is attached accordingly.
+ *
+ * Return: true on success, false otherwise.
+ */
+bool batadv_mcast_forw_push(struct batadv_priv *bat_priv, struct sk_buff *skb,
+ unsigned short vid, int is_routable, int count)
+{
+ unsigned short tvlv_len = 0;
+ int ret;
+
+ if (batadv_mcast_forw_expand_head(bat_priv, skb) < 0)
+ goto err;
+
+ skb_reset_transport_header(skb);
+
+ ret = batadv_mcast_forw_push_tvlvs(bat_priv, skb, vid, is_routable,
+ count, &tvlv_len);
+ if (ret < 0)
+ goto err;
+
+ ret = batadv_mcast_forw_push_hdr(skb, tvlv_len);
+ if (ret < 0)
+ goto err;
+
+ return true;
+
+err:
+ if (tvlv_len)
+ skb_pull(skb, tvlv_len);
+
+ return false;
+}
+
+/**
+ * batadv_mcast_forw_mcsend() - send a self prepared batman-adv multicast packet
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: the multicast packet to encapsulate and send
+ *
+ * Transmits a batman-adv multicast packet that was locally prepared and
+ * consumes/frees it.
+ *
+ * Return: NET_XMIT_DROP on memory allocation failure. NET_XMIT_SUCCESS
+ * otherwise.
+ */
+int batadv_mcast_forw_mcsend(struct batadv_priv *bat_priv,
+ struct sk_buff *skb)
+{
+ int ret = batadv_mcast_forw_packet(bat_priv, skb, true);
+
+ if (ret < 0) {
+ kfree_skb(skb);
+ return NET_XMIT_DROP;
+ }
+
+ consume_skb(skb);
+ return NET_XMIT_SUCCESS;
+}
diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c
new file mode 100644
index 000000000000..78c651f634cd
--- /dev/null
+++ b/net/batman-adv/netlink.c
@@ -0,0 +1,1559 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) B.A.T.M.A.N. contributors:
+ *
+ * Matthias Schiffer
+ */
+
+#include "netlink.h"
+#include "main.h"
+
+#include <linux/array_size.h>
+#include <linux/atomic.h>
+#include <linux/bitops.h>
+#include <linux/bug.h>
+#include <linux/byteorder/generic.h>
+#include <linux/cache.h>
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/gfp.h>
+#include <linux/if_ether.h>
+#include <linux/if_vlan.h>
+#include <linux/init.h>
+#include <linux/limits.h>
+#include <linux/minmax.h>
+#include <linux/netdevice.h>
+#include <linux/netlink.h>
+#include <linux/printk.h>
+#include <linux/rtnetlink.h>
+#include <linux/skbuff.h>
+#include <linux/stddef.h>
+#include <linux/types.h>
+#include <net/genetlink.h>
+#include <net/net_namespace.h>
+#include <net/netlink.h>
+#include <net/sock.h>
+#include <uapi/linux/batadv_packet.h>
+#include <uapi/linux/batman_adv.h>
+
+#include "bat_algo.h"
+#include "bridge_loop_avoidance.h"
+#include "distributed-arp-table.h"
+#include "gateway_client.h"
+#include "gateway_common.h"
+#include "hard-interface.h"
+#include "log.h"
+#include "mesh-interface.h"
+#include "multicast.h"
+#include "originator.h"
+#include "tp_meter.h"
+#include "translation-table.h"
+
+struct genl_family batadv_netlink_family;
+
+/* multicast groups */
+enum batadv_netlink_multicast_groups {
+ BATADV_NL_MCGRP_CONFIG,
+ BATADV_NL_MCGRP_TPMETER,
+};
+
+/**
+ * enum batadv_genl_ops_flags - flags for genl_ops's internal_flags
+ */
+enum batadv_genl_ops_flags {
+ /**
+ * @BATADV_FLAG_NEED_MESH: request requires valid mesh interface in
+ * attribute BATADV_ATTR_MESH_IFINDEX and expects a pointer to it to be
+ * saved in info->user_ptr[0]
+ */
+ BATADV_FLAG_NEED_MESH = BIT(0),
+
+ /**
+ * @BATADV_FLAG_NEED_HARDIF: request requires valid hard interface in
+ * attribute BATADV_ATTR_HARD_IFINDEX and expects a pointer to it to be
+ * saved in info->user_ptr[1]
+ */
+ BATADV_FLAG_NEED_HARDIF = BIT(1),
+
+ /**
+ * @BATADV_FLAG_NEED_VLAN: request requires valid vlan in
+ * attribute BATADV_ATTR_VLANID and expects a pointer to it to be
+ * saved in info->user_ptr[1]
+ */
+ BATADV_FLAG_NEED_VLAN = BIT(2),
+};
+
+static const struct genl_multicast_group batadv_netlink_mcgrps[] = {
+ [BATADV_NL_MCGRP_CONFIG] = { .name = BATADV_NL_MCAST_GROUP_CONFIG },
+ [BATADV_NL_MCGRP_TPMETER] = { .name = BATADV_NL_MCAST_GROUP_TPMETER },
+};
+
+static const struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = {
+ [BATADV_ATTR_VERSION] = { .type = NLA_STRING },
+ [BATADV_ATTR_ALGO_NAME] = { .type = NLA_STRING },
+ [BATADV_ATTR_MESH_IFINDEX] = { .type = NLA_U32 },
+ [BATADV_ATTR_MESH_IFNAME] = { .type = NLA_STRING },
+ [BATADV_ATTR_MESH_ADDRESS] = { .len = ETH_ALEN },
+ [BATADV_ATTR_HARD_IFINDEX] = { .type = NLA_U32 },
+ [BATADV_ATTR_HARD_IFNAME] = { .type = NLA_STRING },
+ [BATADV_ATTR_HARD_ADDRESS] = { .len = ETH_ALEN },
+ [BATADV_ATTR_ORIG_ADDRESS] = { .len = ETH_ALEN },
+ [BATADV_ATTR_TPMETER_RESULT] = { .type = NLA_U8 },
+ [BATADV_ATTR_TPMETER_TEST_TIME] = { .type = NLA_U32 },
+ [BATADV_ATTR_TPMETER_BYTES] = { .type = NLA_U64 },
+ [BATADV_ATTR_TPMETER_COOKIE] = { .type = NLA_U32 },
+ [BATADV_ATTR_ACTIVE] = { .type = NLA_FLAG },
+ [BATADV_ATTR_TT_ADDRESS] = { .len = ETH_ALEN },
+ [BATADV_ATTR_TT_TTVN] = { .type = NLA_U8 },
+ [BATADV_ATTR_TT_LAST_TTVN] = { .type = NLA_U8 },
+ [BATADV_ATTR_TT_CRC32] = { .type = NLA_U32 },
+ [BATADV_ATTR_TT_VID] = { .type = NLA_U16 },
+ [BATADV_ATTR_TT_FLAGS] = { .type = NLA_U32 },
+ [BATADV_ATTR_FLAG_BEST] = { .type = NLA_FLAG },
+ [BATADV_ATTR_LAST_SEEN_MSECS] = { .type = NLA_U32 },
+ [BATADV_ATTR_NEIGH_ADDRESS] = { .len = ETH_ALEN },
+ [BATADV_ATTR_TQ] = { .type = NLA_U8 },
+ [BATADV_ATTR_THROUGHPUT] = { .type = NLA_U32 },
+ [BATADV_ATTR_BANDWIDTH_UP] = { .type = NLA_U32 },
+ [BATADV_ATTR_BANDWIDTH_DOWN] = { .type = NLA_U32 },
+ [BATADV_ATTR_ROUTER] = { .len = ETH_ALEN },
+ [BATADV_ATTR_BLA_OWN] = { .type = NLA_FLAG },
+ [BATADV_ATTR_BLA_ADDRESS] = { .len = ETH_ALEN },
+ [BATADV_ATTR_BLA_VID] = { .type = NLA_U16 },
+ [BATADV_ATTR_BLA_BACKBONE] = { .len = ETH_ALEN },
+ [BATADV_ATTR_BLA_CRC] = { .type = NLA_U16 },
+ [BATADV_ATTR_DAT_CACHE_IP4ADDRESS] = { .type = NLA_U32 },
+ [BATADV_ATTR_DAT_CACHE_HWADDRESS] = { .len = ETH_ALEN },
+ [BATADV_ATTR_DAT_CACHE_VID] = { .type = NLA_U16 },
+ [BATADV_ATTR_MCAST_FLAGS] = { .type = NLA_U32 },
+ [BATADV_ATTR_MCAST_FLAGS_PRIV] = { .type = NLA_U32 },
+ [BATADV_ATTR_VLANID] = { .type = NLA_U16 },
+ [BATADV_ATTR_AGGREGATED_OGMS_ENABLED] = { .type = NLA_U8 },
+ [BATADV_ATTR_AP_ISOLATION_ENABLED] = { .type = NLA_U8 },
+ [BATADV_ATTR_ISOLATION_MARK] = { .type = NLA_U32 },
+ [BATADV_ATTR_ISOLATION_MASK] = { .type = NLA_U32 },
+ [BATADV_ATTR_BONDING_ENABLED] = { .type = NLA_U8 },
+ [BATADV_ATTR_BRIDGE_LOOP_AVOIDANCE_ENABLED] = { .type = NLA_U8 },
+ [BATADV_ATTR_DISTRIBUTED_ARP_TABLE_ENABLED] = { .type = NLA_U8 },
+ [BATADV_ATTR_FRAGMENTATION_ENABLED] = { .type = NLA_U8 },
+ [BATADV_ATTR_GW_BANDWIDTH_DOWN] = { .type = NLA_U32 },
+ [BATADV_ATTR_GW_BANDWIDTH_UP] = { .type = NLA_U32 },
+ [BATADV_ATTR_GW_MODE] = { .type = NLA_U8 },
+ [BATADV_ATTR_GW_SEL_CLASS] = { .type = NLA_U32 },
+ [BATADV_ATTR_HOP_PENALTY] = { .type = NLA_U8 },
+ [BATADV_ATTR_LOG_LEVEL] = { .type = NLA_U32 },
+ [BATADV_ATTR_MULTICAST_FORCEFLOOD_ENABLED] = { .type = NLA_U8 },
+ [BATADV_ATTR_MULTICAST_FANOUT] = { .type = NLA_U32 },
+ [BATADV_ATTR_ORIG_INTERVAL] = { .type = NLA_U32 },
+ [BATADV_ATTR_ELP_INTERVAL] = { .type = NLA_U32 },
+ [BATADV_ATTR_THROUGHPUT_OVERRIDE] = { .type = NLA_U32 },
+};
+
+/**
+ * batadv_netlink_get_ifindex() - Extract an interface index from a message
+ * @nlh: Message header
+ * @attrtype: Attribute which holds an interface index
+ *
+ * Return: interface index, or 0.
+ */
+static int batadv_netlink_get_ifindex(const struct nlmsghdr *nlh, int attrtype)
+{
+ struct nlattr *attr = nlmsg_find_attr(nlh, GENL_HDRLEN, attrtype);
+
+ return (attr && nla_len(attr) == sizeof(u32)) ? nla_get_u32(attr) : 0;
+}
+
+/**
+ * batadv_netlink_mesh_fill_ap_isolation() - Add ap_isolation meshif attribute
+ * @msg: Netlink message to dump into
+ * @bat_priv: the bat priv with all the mesh interface information
+ *
+ * Return: 0 on success or negative error number in case of failure
+ */
+static int batadv_netlink_mesh_fill_ap_isolation(struct sk_buff *msg,
+ struct batadv_priv *bat_priv)
+{
+ struct batadv_meshif_vlan *vlan;
+ u8 ap_isolation;
+
+ vlan = batadv_meshif_vlan_get(bat_priv, BATADV_NO_FLAGS);
+ if (!vlan)
+ return 0;
+
+ ap_isolation = atomic_read(&vlan->ap_isolation);
+ batadv_meshif_vlan_put(vlan);
+
+ return nla_put_u8(msg, BATADV_ATTR_AP_ISOLATION_ENABLED,
+ !!ap_isolation);
+}
+
+/**
+ * batadv_netlink_set_mesh_ap_isolation() - Set ap_isolation from genl msg
+ * @attr: parsed BATADV_ATTR_AP_ISOLATION_ENABLED attribute
+ * @bat_priv: the bat priv with all the mesh interface information
+ *
+ * Return: 0 on success or negative error number in case of failure
+ */
+static int batadv_netlink_set_mesh_ap_isolation(struct nlattr *attr,
+ struct batadv_priv *bat_priv)
+{
+ struct batadv_meshif_vlan *vlan;
+
+ vlan = batadv_meshif_vlan_get(bat_priv, BATADV_NO_FLAGS);
+ if (!vlan)
+ return -ENOENT;
+
+ atomic_set(&vlan->ap_isolation, !!nla_get_u8(attr));
+ batadv_meshif_vlan_put(vlan);
+
+ return 0;
+}
+
+/**
+ * batadv_netlink_mesh_fill() - Fill message with mesh attributes
+ * @msg: Netlink message to dump into
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @cmd: type of message to generate
+ * @portid: Port making netlink request
+ * @seq: sequence number for message
+ * @flags: Additional flags for message
+ *
+ * Return: 0 on success or negative error number in case of failure
+ */
+static int batadv_netlink_mesh_fill(struct sk_buff *msg,
+ struct batadv_priv *bat_priv,
+ enum batadv_nl_commands cmd,
+ u32 portid, u32 seq, int flags)
+{
+ struct net_device *mesh_iface = bat_priv->mesh_iface;
+ struct batadv_hard_iface *primary_if = NULL;
+ struct net_device *hard_iface;
+ void *hdr;
+
+ hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, flags, cmd);
+ if (!hdr)
+ return -ENOBUFS;
+
+ if (nla_put_string(msg, BATADV_ATTR_VERSION, BATADV_SOURCE_VERSION) ||
+ nla_put_string(msg, BATADV_ATTR_ALGO_NAME,
+ bat_priv->algo_ops->name) ||
+ nla_put_u32(msg, BATADV_ATTR_MESH_IFINDEX, mesh_iface->ifindex) ||
+ nla_put_string(msg, BATADV_ATTR_MESH_IFNAME, mesh_iface->name) ||
+ nla_put(msg, BATADV_ATTR_MESH_ADDRESS, ETH_ALEN,
+ mesh_iface->dev_addr) ||
+ nla_put_u8(msg, BATADV_ATTR_TT_TTVN,
+ (u8)atomic_read(&bat_priv->tt.vn)))
+ goto nla_put_failure;
+
+#ifdef CONFIG_BATMAN_ADV_BLA
+ if (nla_put_u16(msg, BATADV_ATTR_BLA_CRC,
+ ntohs(bat_priv->bla.claim_dest.group)))
+ goto nla_put_failure;
+#endif
+
+ if (batadv_mcast_mesh_info_put(msg, bat_priv))
+ goto nla_put_failure;
+
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (primary_if && primary_if->if_status == BATADV_IF_ACTIVE) {
+ hard_iface = primary_if->net_dev;
+
+ if (nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX,
+ hard_iface->ifindex) ||
+ nla_put_string(msg, BATADV_ATTR_HARD_IFNAME,
+ hard_iface->name) ||
+ nla_put(msg, BATADV_ATTR_HARD_ADDRESS, ETH_ALEN,
+ hard_iface->dev_addr))
+ goto nla_put_failure;
+ }
+
+ if (nla_put_u8(msg, BATADV_ATTR_AGGREGATED_OGMS_ENABLED,
+ !!atomic_read(&bat_priv->aggregated_ogms)))
+ goto nla_put_failure;
+
+ if (batadv_netlink_mesh_fill_ap_isolation(msg, bat_priv))
+ goto nla_put_failure;
+
+ if (nla_put_u32(msg, BATADV_ATTR_ISOLATION_MARK,
+ bat_priv->isolation_mark))
+ goto nla_put_failure;
+
+ if (nla_put_u32(msg, BATADV_ATTR_ISOLATION_MASK,
+ bat_priv->isolation_mark_mask))
+ goto nla_put_failure;
+
+ if (nla_put_u8(msg, BATADV_ATTR_BONDING_ENABLED,
+ !!atomic_read(&bat_priv->bonding)))
+ goto nla_put_failure;
+
+#ifdef CONFIG_BATMAN_ADV_BLA
+ if (nla_put_u8(msg, BATADV_ATTR_BRIDGE_LOOP_AVOIDANCE_ENABLED,
+ !!atomic_read(&bat_priv->bridge_loop_avoidance)))
+ goto nla_put_failure;
+#endif /* CONFIG_BATMAN_ADV_BLA */
+
+#ifdef CONFIG_BATMAN_ADV_DAT
+ if (nla_put_u8(msg, BATADV_ATTR_DISTRIBUTED_ARP_TABLE_ENABLED,
+ !!atomic_read(&bat_priv->distributed_arp_table)))
+ goto nla_put_failure;
+#endif /* CONFIG_BATMAN_ADV_DAT */
+
+ if (nla_put_u8(msg, BATADV_ATTR_FRAGMENTATION_ENABLED,
+ !!atomic_read(&bat_priv->fragmentation)))
+ goto nla_put_failure;
+
+ if (nla_put_u32(msg, BATADV_ATTR_GW_BANDWIDTH_DOWN,
+ atomic_read(&bat_priv->gw.bandwidth_down)))
+ goto nla_put_failure;
+
+ if (nla_put_u32(msg, BATADV_ATTR_GW_BANDWIDTH_UP,
+ atomic_read(&bat_priv->gw.bandwidth_up)))
+ goto nla_put_failure;
+
+ if (nla_put_u8(msg, BATADV_ATTR_GW_MODE,
+ atomic_read(&bat_priv->gw.mode)))
+ goto nla_put_failure;
+
+ if (bat_priv->algo_ops->gw.get_best_gw_node &&
+ bat_priv->algo_ops->gw.is_eligible) {
+ /* GW selection class is not available if the routing algorithm
+ * in use does not implement the GW API
+ */
+ if (nla_put_u32(msg, BATADV_ATTR_GW_SEL_CLASS,
+ atomic_read(&bat_priv->gw.sel_class)))
+ goto nla_put_failure;
+ }
+
+ if (nla_put_u8(msg, BATADV_ATTR_HOP_PENALTY,
+ atomic_read(&bat_priv->hop_penalty)))
+ goto nla_put_failure;
+
+#ifdef CONFIG_BATMAN_ADV_DEBUG
+ if (nla_put_u32(msg, BATADV_ATTR_LOG_LEVEL,
+ atomic_read(&bat_priv->log_level)))
+ goto nla_put_failure;
+#endif /* CONFIG_BATMAN_ADV_DEBUG */
+
+#ifdef CONFIG_BATMAN_ADV_MCAST
+ if (nla_put_u8(msg, BATADV_ATTR_MULTICAST_FORCEFLOOD_ENABLED,
+ !atomic_read(&bat_priv->multicast_mode)))
+ goto nla_put_failure;
+
+ if (nla_put_u32(msg, BATADV_ATTR_MULTICAST_FANOUT,
+ atomic_read(&bat_priv->multicast_fanout)))
+ goto nla_put_failure;
+#endif /* CONFIG_BATMAN_ADV_MCAST */
+
+ if (nla_put_u32(msg, BATADV_ATTR_ORIG_INTERVAL,
+ atomic_read(&bat_priv->orig_interval)))
+ goto nla_put_failure;
+
+ batadv_hardif_put(primary_if);
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ batadv_hardif_put(primary_if);
+
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+/**
+ * batadv_netlink_notify_mesh() - send meshif attributes to listener
+ * @bat_priv: the bat priv with all the mesh interface information
+ *
+ * Return: 0 on success, < 0 on error
+ */
+static int batadv_netlink_notify_mesh(struct batadv_priv *bat_priv)
+{
+ struct sk_buff *msg;
+ int ret;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ ret = batadv_netlink_mesh_fill(msg, bat_priv, BATADV_CMD_SET_MESH,
+ 0, 0, 0);
+ if (ret < 0) {
+ nlmsg_free(msg);
+ return ret;
+ }
+
+ genlmsg_multicast_netns(&batadv_netlink_family,
+ dev_net(bat_priv->mesh_iface), msg, 0,
+ BATADV_NL_MCGRP_CONFIG, GFP_KERNEL);
+
+ return 0;
+}
+
+/**
+ * batadv_netlink_get_mesh() - Get meshif attributes
+ * @skb: Netlink message with request data
+ * @info: receiver information
+ *
+ * Return: 0 on success or negative error number in case of failure
+ */
+static int batadv_netlink_get_mesh(struct sk_buff *skb, struct genl_info *info)
+{
+ struct batadv_priv *bat_priv = info->user_ptr[0];
+ struct sk_buff *msg;
+ int ret;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ ret = batadv_netlink_mesh_fill(msg, bat_priv, BATADV_CMD_GET_MESH,
+ info->snd_portid, info->snd_seq, 0);
+ if (ret < 0) {
+ nlmsg_free(msg);
+ return ret;
+ }
+
+ ret = genlmsg_reply(msg, info);
+
+ return ret;
+}
+
+/**
+ * batadv_netlink_set_mesh() - Set meshif attributes
+ * @skb: Netlink message with request data
+ * @info: receiver information
+ *
+ * Return: 0 on success or negative error number in case of failure
+ */
+static int batadv_netlink_set_mesh(struct sk_buff *skb, struct genl_info *info)
+{
+ struct batadv_priv *bat_priv = info->user_ptr[0];
+ struct nlattr *attr;
+
+ if (info->attrs[BATADV_ATTR_AGGREGATED_OGMS_ENABLED]) {
+ attr = info->attrs[BATADV_ATTR_AGGREGATED_OGMS_ENABLED];
+
+ atomic_set(&bat_priv->aggregated_ogms, !!nla_get_u8(attr));
+ }
+
+ if (info->attrs[BATADV_ATTR_AP_ISOLATION_ENABLED]) {
+ attr = info->attrs[BATADV_ATTR_AP_ISOLATION_ENABLED];
+
+ batadv_netlink_set_mesh_ap_isolation(attr, bat_priv);
+ }
+
+ if (info->attrs[BATADV_ATTR_ISOLATION_MARK]) {
+ attr = info->attrs[BATADV_ATTR_ISOLATION_MARK];
+
+ bat_priv->isolation_mark = nla_get_u32(attr);
+ }
+
+ if (info->attrs[BATADV_ATTR_ISOLATION_MASK]) {
+ attr = info->attrs[BATADV_ATTR_ISOLATION_MASK];
+
+ bat_priv->isolation_mark_mask = nla_get_u32(attr);
+ }
+
+ if (info->attrs[BATADV_ATTR_BONDING_ENABLED]) {
+ attr = info->attrs[BATADV_ATTR_BONDING_ENABLED];
+
+ atomic_set(&bat_priv->bonding, !!nla_get_u8(attr));
+ }
+
+#ifdef CONFIG_BATMAN_ADV_BLA
+ if (info->attrs[BATADV_ATTR_BRIDGE_LOOP_AVOIDANCE_ENABLED]) {
+ attr = info->attrs[BATADV_ATTR_BRIDGE_LOOP_AVOIDANCE_ENABLED];
+
+ atomic_set(&bat_priv->bridge_loop_avoidance,
+ !!nla_get_u8(attr));
+ batadv_bla_status_update(bat_priv->mesh_iface);
+ }
+#endif /* CONFIG_BATMAN_ADV_BLA */
+
+#ifdef CONFIG_BATMAN_ADV_DAT
+ if (info->attrs[BATADV_ATTR_DISTRIBUTED_ARP_TABLE_ENABLED]) {
+ attr = info->attrs[BATADV_ATTR_DISTRIBUTED_ARP_TABLE_ENABLED];
+
+ atomic_set(&bat_priv->distributed_arp_table,
+ !!nla_get_u8(attr));
+ batadv_dat_status_update(bat_priv->mesh_iface);
+ }
+#endif /* CONFIG_BATMAN_ADV_DAT */
+
+ if (info->attrs[BATADV_ATTR_FRAGMENTATION_ENABLED]) {
+ attr = info->attrs[BATADV_ATTR_FRAGMENTATION_ENABLED];
+
+ atomic_set(&bat_priv->fragmentation, !!nla_get_u8(attr));
+
+ rtnl_lock();
+ batadv_update_min_mtu(bat_priv->mesh_iface);
+ rtnl_unlock();
+ }
+
+ if (info->attrs[BATADV_ATTR_GW_BANDWIDTH_DOWN]) {
+ attr = info->attrs[BATADV_ATTR_GW_BANDWIDTH_DOWN];
+
+ atomic_set(&bat_priv->gw.bandwidth_down, nla_get_u32(attr));
+ batadv_gw_tvlv_container_update(bat_priv);
+ }
+
+ if (info->attrs[BATADV_ATTR_GW_BANDWIDTH_UP]) {
+ attr = info->attrs[BATADV_ATTR_GW_BANDWIDTH_UP];
+
+ atomic_set(&bat_priv->gw.bandwidth_up, nla_get_u32(attr));
+ batadv_gw_tvlv_container_update(bat_priv);
+ }
+
+ if (info->attrs[BATADV_ATTR_GW_MODE]) {
+ u8 gw_mode;
+
+ attr = info->attrs[BATADV_ATTR_GW_MODE];
+ gw_mode = nla_get_u8(attr);
+
+ if (gw_mode <= BATADV_GW_MODE_SERVER) {
+ /* Invoking batadv_gw_reselect() is not enough to really
+ * de-select the current GW. It will only instruct the
+ * gateway client code to perform a re-election the next
+ * time that this is needed.
+ *
+ * When gw client mode is being switched off the current
+ * GW must be de-selected explicitly otherwise no GW_ADD
+ * uevent is thrown on client mode re-activation. This
+ * is operation is performed in
+ * batadv_gw_check_client_stop().
+ */
+ batadv_gw_reselect(bat_priv);
+
+ /* always call batadv_gw_check_client_stop() before
+ * changing the gateway state
+ */
+ batadv_gw_check_client_stop(bat_priv);
+ atomic_set(&bat_priv->gw.mode, gw_mode);
+ batadv_gw_tvlv_container_update(bat_priv);
+ }
+ }
+
+ if (info->attrs[BATADV_ATTR_GW_SEL_CLASS] &&
+ bat_priv->algo_ops->gw.get_best_gw_node &&
+ bat_priv->algo_ops->gw.is_eligible) {
+ /* setting the GW selection class is allowed only if the routing
+ * algorithm in use implements the GW API
+ */
+
+ u32 sel_class_max = bat_priv->algo_ops->gw.sel_class_max;
+ u32 sel_class;
+
+ attr = info->attrs[BATADV_ATTR_GW_SEL_CLASS];
+ sel_class = nla_get_u32(attr);
+
+ if (sel_class >= 1 && sel_class <= sel_class_max) {
+ atomic_set(&bat_priv->gw.sel_class, sel_class);
+ batadv_gw_reselect(bat_priv);
+ }
+ }
+
+ if (info->attrs[BATADV_ATTR_HOP_PENALTY]) {
+ attr = info->attrs[BATADV_ATTR_HOP_PENALTY];
+
+ atomic_set(&bat_priv->hop_penalty, nla_get_u8(attr));
+ }
+
+#ifdef CONFIG_BATMAN_ADV_DEBUG
+ if (info->attrs[BATADV_ATTR_LOG_LEVEL]) {
+ attr = info->attrs[BATADV_ATTR_LOG_LEVEL];
+
+ atomic_set(&bat_priv->log_level,
+ nla_get_u32(attr) & BATADV_DBG_ALL);
+ }
+#endif /* CONFIG_BATMAN_ADV_DEBUG */
+
+#ifdef CONFIG_BATMAN_ADV_MCAST
+ if (info->attrs[BATADV_ATTR_MULTICAST_FORCEFLOOD_ENABLED]) {
+ attr = info->attrs[BATADV_ATTR_MULTICAST_FORCEFLOOD_ENABLED];
+
+ atomic_set(&bat_priv->multicast_mode, !nla_get_u8(attr));
+ }
+
+ if (info->attrs[BATADV_ATTR_MULTICAST_FANOUT]) {
+ attr = info->attrs[BATADV_ATTR_MULTICAST_FANOUT];
+
+ atomic_set(&bat_priv->multicast_fanout, nla_get_u32(attr));
+ }
+#endif /* CONFIG_BATMAN_ADV_MCAST */
+
+ if (info->attrs[BATADV_ATTR_ORIG_INTERVAL]) {
+ u32 orig_interval;
+
+ attr = info->attrs[BATADV_ATTR_ORIG_INTERVAL];
+ orig_interval = nla_get_u32(attr);
+
+ orig_interval = min_t(u32, orig_interval, INT_MAX);
+ orig_interval = max_t(u32, orig_interval, 2 * BATADV_JITTER);
+
+ atomic_set(&bat_priv->orig_interval, orig_interval);
+ }
+
+ batadv_netlink_notify_mesh(bat_priv);
+
+ return 0;
+}
+
+/**
+ * batadv_netlink_tp_meter_put() - Fill information of started tp_meter session
+ * @msg: netlink message to be sent back
+ * @cookie: tp meter session cookie
+ *
+ * Return: 0 on success, < 0 on error
+ */
+static int
+batadv_netlink_tp_meter_put(struct sk_buff *msg, u32 cookie)
+{
+ if (nla_put_u32(msg, BATADV_ATTR_TPMETER_COOKIE, cookie))
+ return -ENOBUFS;
+
+ return 0;
+}
+
+/**
+ * batadv_netlink_tpmeter_notify() - send tp_meter result via netlink to client
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @dst: destination of tp_meter session
+ * @result: reason for tp meter session stop
+ * @test_time: total time of the tp_meter session
+ * @total_bytes: bytes acked to the receiver
+ * @cookie: cookie of tp_meter session
+ *
+ * Return: 0 on success, < 0 on error
+ */
+int batadv_netlink_tpmeter_notify(struct batadv_priv *bat_priv, const u8 *dst,
+ u8 result, u32 test_time, u64 total_bytes,
+ u32 cookie)
+{
+ struct sk_buff *msg;
+ void *hdr;
+ int ret;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ hdr = genlmsg_put(msg, 0, 0, &batadv_netlink_family, 0,
+ BATADV_CMD_TP_METER);
+ if (!hdr) {
+ ret = -ENOBUFS;
+ goto err_genlmsg;
+ }
+
+ if (nla_put_u32(msg, BATADV_ATTR_TPMETER_COOKIE, cookie))
+ goto nla_put_failure;
+
+ if (nla_put_u32(msg, BATADV_ATTR_TPMETER_TEST_TIME, test_time))
+ goto nla_put_failure;
+
+ if (nla_put_u64_64bit(msg, BATADV_ATTR_TPMETER_BYTES, total_bytes,
+ BATADV_ATTR_PAD))
+ goto nla_put_failure;
+
+ if (nla_put_u8(msg, BATADV_ATTR_TPMETER_RESULT, result))
+ goto nla_put_failure;
+
+ if (nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN, dst))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+
+ genlmsg_multicast_netns(&batadv_netlink_family,
+ dev_net(bat_priv->mesh_iface), msg, 0,
+ BATADV_NL_MCGRP_TPMETER, GFP_KERNEL);
+
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ ret = -EMSGSIZE;
+
+err_genlmsg:
+ nlmsg_free(msg);
+ return ret;
+}
+
+/**
+ * batadv_netlink_tp_meter_start() - Start a new tp_meter session
+ * @skb: received netlink message
+ * @info: receiver information
+ *
+ * Return: 0 on success, < 0 on error
+ */
+static int
+batadv_netlink_tp_meter_start(struct sk_buff *skb, struct genl_info *info)
+{
+ struct batadv_priv *bat_priv = info->user_ptr[0];
+ struct sk_buff *msg = NULL;
+ u32 test_length;
+ void *msg_head;
+ u32 cookie;
+ u8 *dst;
+ int ret;
+
+ if (!info->attrs[BATADV_ATTR_ORIG_ADDRESS])
+ return -EINVAL;
+
+ if (!info->attrs[BATADV_ATTR_TPMETER_TEST_TIME])
+ return -EINVAL;
+
+ dst = nla_data(info->attrs[BATADV_ATTR_ORIG_ADDRESS]);
+
+ test_length = nla_get_u32(info->attrs[BATADV_ATTR_TPMETER_TEST_TIME]);
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ msg_head = genlmsg_put(msg, info->snd_portid, info->snd_seq,
+ &batadv_netlink_family, 0,
+ BATADV_CMD_TP_METER);
+ if (!msg_head) {
+ ret = -ENOBUFS;
+ goto out;
+ }
+
+ batadv_tp_start(bat_priv, dst, test_length, &cookie);
+
+ ret = batadv_netlink_tp_meter_put(msg, cookie);
+
+ out:
+ if (ret) {
+ if (msg)
+ nlmsg_free(msg);
+ return ret;
+ }
+
+ genlmsg_end(msg, msg_head);
+ return genlmsg_reply(msg, info);
+}
+
+/**
+ * batadv_netlink_tp_meter_cancel() - Cancel a running tp_meter session
+ * @skb: received netlink message
+ * @info: receiver information
+ *
+ * Return: 0 on success, < 0 on error
+ */
+static int
+batadv_netlink_tp_meter_cancel(struct sk_buff *skb, struct genl_info *info)
+{
+ struct batadv_priv *bat_priv = info->user_ptr[0];
+ u8 *dst;
+ int ret = 0;
+
+ if (!info->attrs[BATADV_ATTR_ORIG_ADDRESS])
+ return -EINVAL;
+
+ dst = nla_data(info->attrs[BATADV_ATTR_ORIG_ADDRESS]);
+
+ batadv_tp_stop(bat_priv, dst, BATADV_TP_REASON_CANCEL);
+
+ return ret;
+}
+
+/**
+ * batadv_netlink_hardif_fill() - Fill message with hardif attributes
+ * @msg: Netlink message to dump into
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @hard_iface: hard interface which was modified
+ * @cmd: type of message to generate
+ * @portid: Port making netlink request
+ * @seq: sequence number for message
+ * @flags: Additional flags for message
+ * @cb: Control block containing additional options
+ *
+ * Return: 0 on success or negative error number in case of failure
+ */
+static int batadv_netlink_hardif_fill(struct sk_buff *msg,
+ struct batadv_priv *bat_priv,
+ struct batadv_hard_iface *hard_iface,
+ enum batadv_nl_commands cmd,
+ u32 portid, u32 seq, int flags,
+ struct netlink_callback *cb)
+{
+ struct net_device *net_dev = hard_iface->net_dev;
+ void *hdr;
+
+ hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, flags, cmd);
+ if (!hdr)
+ return -ENOBUFS;
+
+ if (cb)
+ genl_dump_check_consistent(cb, hdr);
+
+ if (nla_put_u32(msg, BATADV_ATTR_MESH_IFINDEX,
+ bat_priv->mesh_iface->ifindex))
+ goto nla_put_failure;
+
+ if (nla_put_string(msg, BATADV_ATTR_MESH_IFNAME,
+ bat_priv->mesh_iface->name))
+ goto nla_put_failure;
+
+ if (nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX,
+ net_dev->ifindex) ||
+ nla_put_string(msg, BATADV_ATTR_HARD_IFNAME,
+ net_dev->name) ||
+ nla_put(msg, BATADV_ATTR_HARD_ADDRESS, ETH_ALEN,
+ net_dev->dev_addr))
+ goto nla_put_failure;
+
+ if (hard_iface->if_status == BATADV_IF_ACTIVE) {
+ if (nla_put_flag(msg, BATADV_ATTR_ACTIVE))
+ goto nla_put_failure;
+ }
+
+ if (nla_put_u8(msg, BATADV_ATTR_HOP_PENALTY,
+ atomic_read(&hard_iface->hop_penalty)))
+ goto nla_put_failure;
+
+#ifdef CONFIG_BATMAN_ADV_BATMAN_V
+ if (nla_put_u32(msg, BATADV_ATTR_ELP_INTERVAL,
+ atomic_read(&hard_iface->bat_v.elp_interval)))
+ goto nla_put_failure;
+
+ if (nla_put_u32(msg, BATADV_ATTR_THROUGHPUT_OVERRIDE,
+ atomic_read(&hard_iface->bat_v.throughput_override)))
+ goto nla_put_failure;
+#endif /* CONFIG_BATMAN_ADV_BATMAN_V */
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+/**
+ * batadv_netlink_notify_hardif() - send hardif attributes to listener
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @hard_iface: hard interface which was modified
+ *
+ * Return: 0 on success, < 0 on error
+ */
+static int batadv_netlink_notify_hardif(struct batadv_priv *bat_priv,
+ struct batadv_hard_iface *hard_iface)
+{
+ struct sk_buff *msg;
+ int ret;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ ret = batadv_netlink_hardif_fill(msg, bat_priv, hard_iface,
+ BATADV_CMD_SET_HARDIF, 0, 0, 0, NULL);
+ if (ret < 0) {
+ nlmsg_free(msg);
+ return ret;
+ }
+
+ genlmsg_multicast_netns(&batadv_netlink_family,
+ dev_net(bat_priv->mesh_iface), msg, 0,
+ BATADV_NL_MCGRP_CONFIG, GFP_KERNEL);
+
+ return 0;
+}
+
+/**
+ * batadv_netlink_cmd_get_hardif() - Get hardif attributes
+ * @skb: Netlink message with request data
+ * @info: receiver information
+ *
+ * Return: 0 on success or negative error number in case of failure
+ */
+static int batadv_netlink_cmd_get_hardif(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct batadv_hard_iface *hard_iface = info->user_ptr[1];
+ struct batadv_priv *bat_priv = info->user_ptr[0];
+ struct sk_buff *msg;
+ int ret;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ ret = batadv_netlink_hardif_fill(msg, bat_priv, hard_iface,
+ BATADV_CMD_GET_HARDIF,
+ info->snd_portid, info->snd_seq, 0,
+ NULL);
+ if (ret < 0) {
+ nlmsg_free(msg);
+ return ret;
+ }
+
+ ret = genlmsg_reply(msg, info);
+
+ return ret;
+}
+
+/**
+ * batadv_netlink_set_hardif() - Set hardif attributes
+ * @skb: Netlink message with request data
+ * @info: receiver information
+ *
+ * Return: 0 on success or negative error number in case of failure
+ */
+static int batadv_netlink_set_hardif(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct batadv_hard_iface *hard_iface = info->user_ptr[1];
+ struct batadv_priv *bat_priv = info->user_ptr[0];
+ struct nlattr *attr;
+
+ if (info->attrs[BATADV_ATTR_HOP_PENALTY]) {
+ attr = info->attrs[BATADV_ATTR_HOP_PENALTY];
+
+ atomic_set(&hard_iface->hop_penalty, nla_get_u8(attr));
+ }
+
+#ifdef CONFIG_BATMAN_ADV_BATMAN_V
+
+ if (info->attrs[BATADV_ATTR_ELP_INTERVAL]) {
+ attr = info->attrs[BATADV_ATTR_ELP_INTERVAL];
+
+ atomic_set(&hard_iface->bat_v.elp_interval, nla_get_u32(attr));
+ }
+
+ if (info->attrs[BATADV_ATTR_THROUGHPUT_OVERRIDE]) {
+ attr = info->attrs[BATADV_ATTR_THROUGHPUT_OVERRIDE];
+
+ atomic_set(&hard_iface->bat_v.throughput_override,
+ nla_get_u32(attr));
+ }
+#endif /* CONFIG_BATMAN_ADV_BATMAN_V */
+
+ batadv_netlink_notify_hardif(bat_priv, hard_iface);
+
+ return 0;
+}
+
+/**
+ * batadv_netlink_dump_hardif() - Dump all hard interface into a messages
+ * @msg: Netlink message to dump into
+ * @cb: Parameters from query
+ *
+ * Return: error code, or length of reply message on success
+ */
+static int
+batadv_netlink_dump_hardif(struct sk_buff *msg, struct netlink_callback *cb)
+{
+ struct net_device *mesh_iface;
+ struct batadv_hard_iface *hard_iface;
+ struct batadv_priv *bat_priv;
+ int portid = NETLINK_CB(cb->skb).portid;
+ int skip = cb->args[0];
+ struct list_head *iter;
+ int i = 0;
+
+ mesh_iface = batadv_netlink_get_meshif(cb);
+ if (IS_ERR(mesh_iface))
+ return PTR_ERR(mesh_iface);
+
+ bat_priv = netdev_priv(mesh_iface);
+
+ rtnl_lock();
+ cb->seq = batadv_hardif_generation << 1 | 1;
+
+ netdev_for_each_lower_private(mesh_iface, hard_iface, iter) {
+ if (i++ < skip)
+ continue;
+
+ if (batadv_netlink_hardif_fill(msg, bat_priv, hard_iface,
+ BATADV_CMD_GET_HARDIF,
+ portid, cb->nlh->nlmsg_seq,
+ NLM_F_MULTI, cb)) {
+ i--;
+ break;
+ }
+ }
+
+ rtnl_unlock();
+
+ dev_put(mesh_iface);
+
+ cb->args[0] = i;
+
+ return msg->len;
+}
+
+/**
+ * batadv_netlink_vlan_fill() - Fill message with vlan attributes
+ * @msg: Netlink message to dump into
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @vlan: vlan which was modified
+ * @cmd: type of message to generate
+ * @portid: Port making netlink request
+ * @seq: sequence number for message
+ * @flags: Additional flags for message
+ *
+ * Return: 0 on success or negative error number in case of failure
+ */
+static int batadv_netlink_vlan_fill(struct sk_buff *msg,
+ struct batadv_priv *bat_priv,
+ struct batadv_meshif_vlan *vlan,
+ enum batadv_nl_commands cmd,
+ u32 portid, u32 seq, int flags)
+{
+ void *hdr;
+
+ hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, flags, cmd);
+ if (!hdr)
+ return -ENOBUFS;
+
+ if (nla_put_u32(msg, BATADV_ATTR_MESH_IFINDEX,
+ bat_priv->mesh_iface->ifindex))
+ goto nla_put_failure;
+
+ if (nla_put_string(msg, BATADV_ATTR_MESH_IFNAME,
+ bat_priv->mesh_iface->name))
+ goto nla_put_failure;
+
+ if (nla_put_u32(msg, BATADV_ATTR_VLANID, vlan->vid & VLAN_VID_MASK))
+ goto nla_put_failure;
+
+ if (nla_put_u8(msg, BATADV_ATTR_AP_ISOLATION_ENABLED,
+ !!atomic_read(&vlan->ap_isolation)))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+/**
+ * batadv_netlink_notify_vlan() - send vlan attributes to listener
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @vlan: vlan which was modified
+ *
+ * Return: 0 on success, < 0 on error
+ */
+static int batadv_netlink_notify_vlan(struct batadv_priv *bat_priv,
+ struct batadv_meshif_vlan *vlan)
+{
+ struct sk_buff *msg;
+ int ret;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ ret = batadv_netlink_vlan_fill(msg, bat_priv, vlan,
+ BATADV_CMD_SET_VLAN, 0, 0, 0);
+ if (ret < 0) {
+ nlmsg_free(msg);
+ return ret;
+ }
+
+ genlmsg_multicast_netns(&batadv_netlink_family,
+ dev_net(bat_priv->mesh_iface), msg, 0,
+ BATADV_NL_MCGRP_CONFIG, GFP_KERNEL);
+
+ return 0;
+}
+
+/**
+ * batadv_netlink_get_vlan() - Get vlan attributes
+ * @skb: Netlink message with request data
+ * @info: receiver information
+ *
+ * Return: 0 on success or negative error number in case of failure
+ */
+static int batadv_netlink_get_vlan(struct sk_buff *skb, struct genl_info *info)
+{
+ struct batadv_meshif_vlan *vlan = info->user_ptr[1];
+ struct batadv_priv *bat_priv = info->user_ptr[0];
+ struct sk_buff *msg;
+ int ret;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ ret = batadv_netlink_vlan_fill(msg, bat_priv, vlan, BATADV_CMD_GET_VLAN,
+ info->snd_portid, info->snd_seq, 0);
+ if (ret < 0) {
+ nlmsg_free(msg);
+ return ret;
+ }
+
+ ret = genlmsg_reply(msg, info);
+
+ return ret;
+}
+
+/**
+ * batadv_netlink_set_vlan() - Get vlan attributes
+ * @skb: Netlink message with request data
+ * @info: receiver information
+ *
+ * Return: 0 on success or negative error number in case of failure
+ */
+static int batadv_netlink_set_vlan(struct sk_buff *skb, struct genl_info *info)
+{
+ struct batadv_meshif_vlan *vlan = info->user_ptr[1];
+ struct batadv_priv *bat_priv = info->user_ptr[0];
+ struct nlattr *attr;
+
+ if (info->attrs[BATADV_ATTR_AP_ISOLATION_ENABLED]) {
+ attr = info->attrs[BATADV_ATTR_AP_ISOLATION_ENABLED];
+
+ atomic_set(&vlan->ap_isolation, !!nla_get_u8(attr));
+ }
+
+ batadv_netlink_notify_vlan(bat_priv, vlan);
+
+ return 0;
+}
+
+/**
+ * batadv_netlink_get_meshif_from_ifindex() - Get mesh-iface from ifindex
+ * @net: the applicable net namespace
+ * @ifindex: index of the mesh interface
+ *
+ * Return: Pointer to mesh interface (with increased refcnt) on success, error
+ * pointer on error
+ */
+static struct net_device *
+batadv_netlink_get_meshif_from_ifindex(struct net *net, int ifindex)
+{
+ struct net_device *mesh_iface;
+
+ mesh_iface = dev_get_by_index(net, ifindex);
+ if (!mesh_iface)
+ return ERR_PTR(-ENODEV);
+
+ if (!batadv_meshif_is_valid(mesh_iface))
+ goto err_put_meshif;
+
+ return mesh_iface;
+
+err_put_meshif:
+ dev_put(mesh_iface);
+
+ return ERR_PTR(-EINVAL);
+}
+
+/**
+ * batadv_netlink_get_meshif_from_info() - Get mesh-iface from genl attributes
+ * @net: the applicable net namespace
+ * @info: receiver information
+ *
+ * Return: Pointer to mesh interface (with increased refcnt) on success, error
+ * pointer on error
+ */
+static struct net_device *
+batadv_netlink_get_meshif_from_info(struct net *net, struct genl_info *info)
+{
+ int ifindex;
+
+ if (!info->attrs[BATADV_ATTR_MESH_IFINDEX])
+ return ERR_PTR(-EINVAL);
+
+ ifindex = nla_get_u32(info->attrs[BATADV_ATTR_MESH_IFINDEX]);
+
+ return batadv_netlink_get_meshif_from_ifindex(net, ifindex);
+}
+
+/**
+ * batadv_netlink_get_meshif() - Retrieve mesh interface from netlink callback
+ * @cb: callback structure containing arguments
+ *
+ * Return: Pointer to mesh interface (with increased refcnt) on success, error
+ * pointer on error
+ */
+struct net_device *batadv_netlink_get_meshif(struct netlink_callback *cb)
+{
+ int ifindex = batadv_netlink_get_ifindex(cb->nlh,
+ BATADV_ATTR_MESH_IFINDEX);
+ if (!ifindex)
+ return ERR_PTR(-ENONET);
+
+ return batadv_netlink_get_meshif_from_ifindex(sock_net(cb->skb->sk),
+ ifindex);
+}
+
+/**
+ * batadv_netlink_get_hardif_from_ifindex() - Get hard-iface from ifindex
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @net: the applicable net namespace
+ * @ifindex: index of the hard interface
+ *
+ * Return: Pointer to hard interface (with increased refcnt) on success, error
+ * pointer on error
+ */
+static struct batadv_hard_iface *
+batadv_netlink_get_hardif_from_ifindex(struct batadv_priv *bat_priv,
+ struct net *net, int ifindex)
+{
+ struct batadv_hard_iface *hard_iface;
+ struct net_device *hard_dev;
+
+ hard_dev = dev_get_by_index(net, ifindex);
+ if (!hard_dev)
+ return ERR_PTR(-ENODEV);
+
+ hard_iface = batadv_hardif_get_by_netdev(hard_dev);
+ if (!hard_iface)
+ goto err_put_harddev;
+
+ if (hard_iface->mesh_iface != bat_priv->mesh_iface)
+ goto err_put_hardif;
+
+ /* hard_dev is referenced by hard_iface and not needed here */
+ dev_put(hard_dev);
+
+ return hard_iface;
+
+err_put_hardif:
+ batadv_hardif_put(hard_iface);
+err_put_harddev:
+ dev_put(hard_dev);
+
+ return ERR_PTR(-EINVAL);
+}
+
+/**
+ * batadv_netlink_get_hardif_from_info() - Get hard-iface from genl attributes
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @net: the applicable net namespace
+ * @info: receiver information
+ *
+ * Return: Pointer to hard interface (with increased refcnt) on success, error
+ * pointer on error
+ */
+static struct batadv_hard_iface *
+batadv_netlink_get_hardif_from_info(struct batadv_priv *bat_priv,
+ struct net *net, struct genl_info *info)
+{
+ int ifindex;
+
+ if (!info->attrs[BATADV_ATTR_HARD_IFINDEX])
+ return ERR_PTR(-EINVAL);
+
+ ifindex = nla_get_u32(info->attrs[BATADV_ATTR_HARD_IFINDEX]);
+
+ return batadv_netlink_get_hardif_from_ifindex(bat_priv, net, ifindex);
+}
+
+/**
+ * batadv_netlink_get_hardif() - Retrieve hard interface from netlink callback
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @cb: callback structure containing arguments
+ *
+ * Return: Pointer to hard interface (with increased refcnt) on success, error
+ * pointer on error
+ */
+struct batadv_hard_iface *
+batadv_netlink_get_hardif(struct batadv_priv *bat_priv,
+ struct netlink_callback *cb)
+{
+ int ifindex = batadv_netlink_get_ifindex(cb->nlh,
+ BATADV_ATTR_HARD_IFINDEX);
+ if (!ifindex)
+ return ERR_PTR(-ENONET);
+
+ return batadv_netlink_get_hardif_from_ifindex(bat_priv,
+ sock_net(cb->skb->sk),
+ ifindex);
+}
+
+/**
+ * batadv_get_vlan_from_info() - Retrieve vlan from genl attributes
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @net: the applicable net namespace
+ * @info: receiver information
+ *
+ * Return: Pointer to vlan on success (with increased refcnt), error pointer
+ * on error
+ */
+static struct batadv_meshif_vlan *
+batadv_get_vlan_from_info(struct batadv_priv *bat_priv, struct net *net,
+ struct genl_info *info)
+{
+ struct batadv_meshif_vlan *vlan;
+ u16 vid;
+
+ if (!info->attrs[BATADV_ATTR_VLANID])
+ return ERR_PTR(-EINVAL);
+
+ vid = nla_get_u16(info->attrs[BATADV_ATTR_VLANID]);
+
+ vlan = batadv_meshif_vlan_get(bat_priv, vid | BATADV_VLAN_HAS_TAG);
+ if (!vlan)
+ return ERR_PTR(-ENOENT);
+
+ return vlan;
+}
+
+/**
+ * batadv_pre_doit() - Prepare batman-adv genl doit request
+ * @ops: requested netlink operation
+ * @skb: Netlink message with request data
+ * @info: receiver information
+ *
+ * Return: 0 on success or negative error number in case of failure
+ */
+static int batadv_pre_doit(const struct genl_split_ops *ops,
+ struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct net *net = genl_info_net(info);
+ struct batadv_hard_iface *hard_iface;
+ struct batadv_priv *bat_priv = NULL;
+ struct batadv_meshif_vlan *vlan;
+ struct net_device *mesh_iface;
+ u8 user_ptr1_flags;
+ u8 mesh_dep_flags;
+ int ret;
+
+ user_ptr1_flags = BATADV_FLAG_NEED_HARDIF | BATADV_FLAG_NEED_VLAN;
+ if (WARN_ON(hweight8(ops->internal_flags & user_ptr1_flags) > 1))
+ return -EINVAL;
+
+ mesh_dep_flags = BATADV_FLAG_NEED_HARDIF | BATADV_FLAG_NEED_VLAN;
+ if (WARN_ON((ops->internal_flags & mesh_dep_flags) &&
+ (~ops->internal_flags & BATADV_FLAG_NEED_MESH)))
+ return -EINVAL;
+
+ if (ops->internal_flags & BATADV_FLAG_NEED_MESH) {
+ mesh_iface = batadv_netlink_get_meshif_from_info(net, info);
+ if (IS_ERR(mesh_iface))
+ return PTR_ERR(mesh_iface);
+
+ bat_priv = netdev_priv(mesh_iface);
+ info->user_ptr[0] = bat_priv;
+ }
+
+ if (ops->internal_flags & BATADV_FLAG_NEED_HARDIF) {
+ hard_iface = batadv_netlink_get_hardif_from_info(bat_priv, net,
+ info);
+ if (IS_ERR(hard_iface)) {
+ ret = PTR_ERR(hard_iface);
+ goto err_put_meshif;
+ }
+
+ info->user_ptr[1] = hard_iface;
+ }
+
+ if (ops->internal_flags & BATADV_FLAG_NEED_VLAN) {
+ vlan = batadv_get_vlan_from_info(bat_priv, net, info);
+ if (IS_ERR(vlan)) {
+ ret = PTR_ERR(vlan);
+ goto err_put_meshif;
+ }
+
+ info->user_ptr[1] = vlan;
+ }
+
+ return 0;
+
+err_put_meshif:
+ if (bat_priv)
+ dev_put(bat_priv->mesh_iface);
+
+ return ret;
+}
+
+/**
+ * batadv_post_doit() - End batman-adv genl doit request
+ * @ops: requested netlink operation
+ * @skb: Netlink message with request data
+ * @info: receiver information
+ */
+static void batadv_post_doit(const struct genl_split_ops *ops,
+ struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct batadv_hard_iface *hard_iface;
+ struct batadv_meshif_vlan *vlan;
+ struct batadv_priv *bat_priv;
+
+ if (ops->internal_flags & BATADV_FLAG_NEED_HARDIF &&
+ info->user_ptr[1]) {
+ hard_iface = info->user_ptr[1];
+
+ batadv_hardif_put(hard_iface);
+ }
+
+ if (ops->internal_flags & BATADV_FLAG_NEED_VLAN && info->user_ptr[1]) {
+ vlan = info->user_ptr[1];
+ batadv_meshif_vlan_put(vlan);
+ }
+
+ if (ops->internal_flags & BATADV_FLAG_NEED_MESH && info->user_ptr[0]) {
+ bat_priv = info->user_ptr[0];
+ dev_put(bat_priv->mesh_iface);
+ }
+}
+
+static const struct genl_small_ops batadv_netlink_ops[] = {
+ {
+ .cmd = BATADV_CMD_GET_MESH,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ /* can be retrieved by unprivileged users */
+ .doit = batadv_netlink_get_mesh,
+ .internal_flags = BATADV_FLAG_NEED_MESH,
+ },
+ {
+ .cmd = BATADV_CMD_TP_METER,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = batadv_netlink_tp_meter_start,
+ .internal_flags = BATADV_FLAG_NEED_MESH,
+ },
+ {
+ .cmd = BATADV_CMD_TP_METER_CANCEL,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = batadv_netlink_tp_meter_cancel,
+ .internal_flags = BATADV_FLAG_NEED_MESH,
+ },
+ {
+ .cmd = BATADV_CMD_GET_ROUTING_ALGOS,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .dumpit = batadv_algo_dump,
+ },
+ {
+ .cmd = BATADV_CMD_GET_HARDIF,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ /* can be retrieved by unprivileged users */
+ .dumpit = batadv_netlink_dump_hardif,
+ .doit = batadv_netlink_cmd_get_hardif,
+ .internal_flags = BATADV_FLAG_NEED_MESH |
+ BATADV_FLAG_NEED_HARDIF,
+ },
+ {
+ .cmd = BATADV_CMD_GET_TRANSTABLE_LOCAL,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .dumpit = batadv_tt_local_dump,
+ },
+ {
+ .cmd = BATADV_CMD_GET_TRANSTABLE_GLOBAL,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .dumpit = batadv_tt_global_dump,
+ },
+ {
+ .cmd = BATADV_CMD_GET_ORIGINATORS,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .dumpit = batadv_orig_dump,
+ },
+ {
+ .cmd = BATADV_CMD_GET_NEIGHBORS,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .dumpit = batadv_hardif_neigh_dump,
+ },
+ {
+ .cmd = BATADV_CMD_GET_GATEWAYS,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .dumpit = batadv_gw_dump,
+ },
+ {
+ .cmd = BATADV_CMD_GET_BLA_CLAIM,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .dumpit = batadv_bla_claim_dump,
+ },
+ {
+ .cmd = BATADV_CMD_GET_BLA_BACKBONE,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .dumpit = batadv_bla_backbone_dump,
+ },
+ {
+ .cmd = BATADV_CMD_GET_DAT_CACHE,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .dumpit = batadv_dat_cache_dump,
+ },
+ {
+ .cmd = BATADV_CMD_GET_MCAST_FLAGS,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .dumpit = batadv_mcast_flags_dump,
+ },
+ {
+ .cmd = BATADV_CMD_SET_MESH,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = batadv_netlink_set_mesh,
+ .internal_flags = BATADV_FLAG_NEED_MESH,
+ },
+ {
+ .cmd = BATADV_CMD_SET_HARDIF,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = batadv_netlink_set_hardif,
+ .internal_flags = BATADV_FLAG_NEED_MESH |
+ BATADV_FLAG_NEED_HARDIF,
+ },
+ {
+ .cmd = BATADV_CMD_GET_VLAN,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ /* can be retrieved by unprivileged users */
+ .doit = batadv_netlink_get_vlan,
+ .internal_flags = BATADV_FLAG_NEED_MESH |
+ BATADV_FLAG_NEED_VLAN,
+ },
+ {
+ .cmd = BATADV_CMD_SET_VLAN,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = batadv_netlink_set_vlan,
+ .internal_flags = BATADV_FLAG_NEED_MESH |
+ BATADV_FLAG_NEED_VLAN,
+ },
+};
+
+struct genl_family batadv_netlink_family __ro_after_init = {
+ .hdrsize = 0,
+ .name = BATADV_NL_NAME,
+ .version = 1,
+ .maxattr = BATADV_ATTR_MAX,
+ .policy = batadv_netlink_policy,
+ .netnsok = true,
+ .pre_doit = batadv_pre_doit,
+ .post_doit = batadv_post_doit,
+ .module = THIS_MODULE,
+ .small_ops = batadv_netlink_ops,
+ .n_small_ops = ARRAY_SIZE(batadv_netlink_ops),
+ .resv_start_op = BATADV_CMD_SET_VLAN + 1,
+ .mcgrps = batadv_netlink_mcgrps,
+ .n_mcgrps = ARRAY_SIZE(batadv_netlink_mcgrps),
+};
+
+/**
+ * batadv_netlink_register() - register batadv genl netlink family
+ */
+void __init batadv_netlink_register(void)
+{
+ int ret;
+
+ ret = genl_register_family(&batadv_netlink_family);
+ if (ret)
+ pr_warn("unable to register netlink family\n");
+}
+
+/**
+ * batadv_netlink_unregister() - unregister batadv genl netlink family
+ */
+void batadv_netlink_unregister(void)
+{
+ genl_unregister_family(&batadv_netlink_family);
+}
diff --git a/net/batman-adv/netlink.h b/net/batman-adv/netlink.h
new file mode 100644
index 000000000000..4eae9e5ff135
--- /dev/null
+++ b/net/batman-adv/netlink.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) B.A.T.M.A.N. contributors:
+ *
+ * Matthias Schiffer
+ */
+
+#ifndef _NET_BATMAN_ADV_NETLINK_H_
+#define _NET_BATMAN_ADV_NETLINK_H_
+
+#include "main.h"
+
+#include <linux/netlink.h>
+#include <linux/types.h>
+
+void batadv_netlink_register(void);
+void batadv_netlink_unregister(void);
+struct net_device *batadv_netlink_get_meshif(struct netlink_callback *cb);
+struct batadv_hard_iface *
+batadv_netlink_get_hardif(struct batadv_priv *bat_priv,
+ struct netlink_callback *cb);
+
+int batadv_netlink_tpmeter_notify(struct batadv_priv *bat_priv, const u8 *dst,
+ u8 result, u32 test_time, u64 total_bytes,
+ u32 cookie);
+
+extern struct genl_family batadv_netlink_family;
+
+#endif /* _NET_BATMAN_ADV_NETLINK_H_ */
diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
new file mode 100644
index 000000000000..a662408ad867
--- /dev/null
+++ b/net/batman-adv/originator.c
@@ -0,0 +1,1373 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner, Simon Wunderlich
+ */
+
+#include "originator.h"
+#include "main.h"
+
+#include <linux/atomic.h>
+#include <linux/container_of.h>
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/etherdevice.h>
+#include <linux/gfp.h>
+#include <linux/if_vlan.h>
+#include <linux/jiffies.h>
+#include <linux/kref.h>
+#include <linux/list.h>
+#include <linux/lockdep.h>
+#include <linux/netdevice.h>
+#include <linux/netlink.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/stddef.h>
+#include <linux/workqueue.h>
+#include <uapi/linux/batadv_packet.h>
+
+#include "distributed-arp-table.h"
+#include "fragmentation.h"
+#include "gateway_client.h"
+#include "hard-interface.h"
+#include "hash.h"
+#include "log.h"
+#include "multicast.h"
+#include "netlink.h"
+#include "routing.h"
+#include "translation-table.h"
+
+/* hash class keys */
+static struct lock_class_key batadv_orig_hash_lock_class_key;
+
+/**
+ * batadv_orig_hash_find() - Find and return originator from orig_hash
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @data: mac address of the originator
+ *
+ * Return: orig_node (with increased refcnt), NULL on errors
+ */
+struct batadv_orig_node *
+batadv_orig_hash_find(struct batadv_priv *bat_priv, const void *data)
+{
+ struct batadv_hashtable *hash = bat_priv->orig_hash;
+ struct hlist_head *head;
+ struct batadv_orig_node *orig_node, *orig_node_tmp = NULL;
+ int index;
+
+ if (!hash)
+ return NULL;
+
+ index = batadv_choose_orig(data, hash->size);
+ head = &hash->table[index];
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(orig_node, head, hash_entry) {
+ if (!batadv_compare_eth(orig_node, data))
+ continue;
+
+ if (!kref_get_unless_zero(&orig_node->refcount))
+ continue;
+
+ orig_node_tmp = orig_node;
+ break;
+ }
+ rcu_read_unlock();
+
+ return orig_node_tmp;
+}
+
+static void batadv_purge_orig(struct work_struct *work);
+
+/**
+ * batadv_compare_orig() - comparing function used in the originator hash table
+ * @node: node in the local table
+ * @data2: second object to compare the node to
+ *
+ * Return: true if they are the same originator
+ */
+bool batadv_compare_orig(const struct hlist_node *node, const void *data2)
+{
+ const void *data1 = container_of(node, struct batadv_orig_node,
+ hash_entry);
+
+ return batadv_compare_eth(data1, data2);
+}
+
+/**
+ * batadv_orig_node_vlan_get() - get an orig_node_vlan object
+ * @orig_node: the originator serving the VLAN
+ * @vid: the VLAN identifier
+ *
+ * Return: the vlan object identified by vid and belonging to orig_node or NULL
+ * if it does not exist.
+ */
+struct batadv_orig_node_vlan *
+batadv_orig_node_vlan_get(struct batadv_orig_node *orig_node,
+ unsigned short vid)
+{
+ struct batadv_orig_node_vlan *vlan = NULL, *tmp;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(tmp, &orig_node->vlan_list, list) {
+ if (tmp->vid != vid)
+ continue;
+
+ if (!kref_get_unless_zero(&tmp->refcount))
+ continue;
+
+ vlan = tmp;
+
+ break;
+ }
+ rcu_read_unlock();
+
+ return vlan;
+}
+
+/**
+ * batadv_vlan_id_valid() - check if vlan id is in valid batman-adv encoding
+ * @vid: the VLAN identifier
+ *
+ * Return: true when either no vlan is set or if VLAN is in correct range,
+ * false otherwise
+ */
+static bool batadv_vlan_id_valid(unsigned short vid)
+{
+ unsigned short non_vlan = vid & ~(BATADV_VLAN_HAS_TAG | VLAN_VID_MASK);
+
+ if (vid == 0)
+ return true;
+
+ if (!(vid & BATADV_VLAN_HAS_TAG))
+ return false;
+
+ if (non_vlan)
+ return false;
+
+ return true;
+}
+
+/**
+ * batadv_orig_node_vlan_new() - search and possibly create an orig_node_vlan
+ * object
+ * @orig_node: the originator serving the VLAN
+ * @vid: the VLAN identifier
+ *
+ * Return: NULL in case of failure or the vlan object identified by vid and
+ * belonging to orig_node otherwise. The object is created and added to the list
+ * if it does not exist.
+ *
+ * The object is returned with refcounter increased by 1.
+ */
+struct batadv_orig_node_vlan *
+batadv_orig_node_vlan_new(struct batadv_orig_node *orig_node,
+ unsigned short vid)
+{
+ struct batadv_orig_node_vlan *vlan;
+
+ if (!batadv_vlan_id_valid(vid))
+ return NULL;
+
+ spin_lock_bh(&orig_node->vlan_list_lock);
+
+ /* first look if an object for this vid already exists */
+ vlan = batadv_orig_node_vlan_get(orig_node, vid);
+ if (vlan)
+ goto out;
+
+ vlan = kzalloc(sizeof(*vlan), GFP_ATOMIC);
+ if (!vlan)
+ goto out;
+
+ kref_init(&vlan->refcount);
+ vlan->vid = vid;
+
+ kref_get(&vlan->refcount);
+ hlist_add_head_rcu(&vlan->list, &orig_node->vlan_list);
+
+out:
+ spin_unlock_bh(&orig_node->vlan_list_lock);
+
+ return vlan;
+}
+
+/**
+ * batadv_orig_node_vlan_release() - release originator-vlan object from lists
+ * and queue for free after rcu grace period
+ * @ref: kref pointer of the originator-vlan object
+ */
+void batadv_orig_node_vlan_release(struct kref *ref)
+{
+ struct batadv_orig_node_vlan *orig_vlan;
+
+ orig_vlan = container_of(ref, struct batadv_orig_node_vlan, refcount);
+
+ kfree_rcu(orig_vlan, rcu);
+}
+
+/**
+ * batadv_originator_init() - Initialize all originator structures
+ * @bat_priv: the bat priv with all the mesh interface information
+ *
+ * Return: 0 on success or negative error number in case of failure
+ */
+int batadv_originator_init(struct batadv_priv *bat_priv)
+{
+ if (bat_priv->orig_hash)
+ return 0;
+
+ bat_priv->orig_hash = batadv_hash_new(1024);
+
+ if (!bat_priv->orig_hash)
+ goto err;
+
+ batadv_hash_set_lock_class(bat_priv->orig_hash,
+ &batadv_orig_hash_lock_class_key);
+
+ INIT_DELAYED_WORK(&bat_priv->orig_work, batadv_purge_orig);
+ queue_delayed_work(batadv_event_workqueue,
+ &bat_priv->orig_work,
+ msecs_to_jiffies(BATADV_ORIG_WORK_PERIOD));
+
+ return 0;
+
+err:
+ return -ENOMEM;
+}
+
+/**
+ * batadv_neigh_ifinfo_release() - release neigh_ifinfo from lists and queue for
+ * free after rcu grace period
+ * @ref: kref pointer of the neigh_ifinfo
+ */
+void batadv_neigh_ifinfo_release(struct kref *ref)
+{
+ struct batadv_neigh_ifinfo *neigh_ifinfo;
+
+ neigh_ifinfo = container_of(ref, struct batadv_neigh_ifinfo, refcount);
+
+ if (neigh_ifinfo->if_outgoing != BATADV_IF_DEFAULT)
+ batadv_hardif_put(neigh_ifinfo->if_outgoing);
+
+ kfree_rcu(neigh_ifinfo, rcu);
+}
+
+/**
+ * batadv_hardif_neigh_release() - release hardif neigh node from lists and
+ * queue for free after rcu grace period
+ * @ref: kref pointer of the neigh_node
+ */
+void batadv_hardif_neigh_release(struct kref *ref)
+{
+ struct batadv_hardif_neigh_node *hardif_neigh;
+
+ hardif_neigh = container_of(ref, struct batadv_hardif_neigh_node,
+ refcount);
+
+ spin_lock_bh(&hardif_neigh->if_incoming->neigh_list_lock);
+ hlist_del_init_rcu(&hardif_neigh->list);
+ spin_unlock_bh(&hardif_neigh->if_incoming->neigh_list_lock);
+
+ batadv_hardif_put(hardif_neigh->if_incoming);
+ kfree_rcu(hardif_neigh, rcu);
+}
+
+/**
+ * batadv_neigh_node_release() - release neigh_node from lists and queue for
+ * free after rcu grace period
+ * @ref: kref pointer of the neigh_node
+ */
+void batadv_neigh_node_release(struct kref *ref)
+{
+ struct hlist_node *node_tmp;
+ struct batadv_neigh_node *neigh_node;
+ struct batadv_neigh_ifinfo *neigh_ifinfo;
+
+ neigh_node = container_of(ref, struct batadv_neigh_node, refcount);
+
+ hlist_for_each_entry_safe(neigh_ifinfo, node_tmp,
+ &neigh_node->ifinfo_list, list) {
+ batadv_neigh_ifinfo_put(neigh_ifinfo);
+ }
+
+ batadv_hardif_neigh_put(neigh_node->hardif_neigh);
+
+ batadv_hardif_put(neigh_node->if_incoming);
+
+ kfree_rcu(neigh_node, rcu);
+}
+
+/**
+ * batadv_orig_router_get() - router to the originator depending on iface
+ * @orig_node: the orig node for the router
+ * @if_outgoing: the interface where the payload packet has been received or
+ * the OGM should be sent to
+ *
+ * Return: the neighbor which should be the router for this orig_node/iface.
+ *
+ * The object is returned with refcounter increased by 1.
+ */
+struct batadv_neigh_node *
+batadv_orig_router_get(struct batadv_orig_node *orig_node,
+ const struct batadv_hard_iface *if_outgoing)
+{
+ struct batadv_orig_ifinfo *orig_ifinfo;
+ struct batadv_neigh_node *router = NULL;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(orig_ifinfo, &orig_node->ifinfo_list, list) {
+ if (orig_ifinfo->if_outgoing != if_outgoing)
+ continue;
+
+ router = rcu_dereference(orig_ifinfo->router);
+ break;
+ }
+
+ if (router && !kref_get_unless_zero(&router->refcount))
+ router = NULL;
+
+ rcu_read_unlock();
+ return router;
+}
+
+/**
+ * batadv_orig_to_router() - get next hop neighbor to an orig address
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @orig_addr: the originator MAC address to search the best next hop router for
+ * @if_outgoing: the interface where the payload packet has been received or
+ * the OGM should be sent to
+ *
+ * Return: A neighbor node which is the best router towards the given originator
+ * address.
+ */
+struct batadv_neigh_node *
+batadv_orig_to_router(struct batadv_priv *bat_priv, u8 *orig_addr,
+ struct batadv_hard_iface *if_outgoing)
+{
+ struct batadv_neigh_node *neigh_node;
+ struct batadv_orig_node *orig_node;
+
+ orig_node = batadv_orig_hash_find(bat_priv, orig_addr);
+ if (!orig_node)
+ return NULL;
+
+ neigh_node = batadv_find_router(bat_priv, orig_node, if_outgoing);
+ batadv_orig_node_put(orig_node);
+
+ return neigh_node;
+}
+
+/**
+ * batadv_orig_ifinfo_get() - find the ifinfo from an orig_node
+ * @orig_node: the orig node to be queried
+ * @if_outgoing: the interface for which the ifinfo should be acquired
+ *
+ * Return: the requested orig_ifinfo or NULL if not found.
+ *
+ * The object is returned with refcounter increased by 1.
+ */
+struct batadv_orig_ifinfo *
+batadv_orig_ifinfo_get(struct batadv_orig_node *orig_node,
+ struct batadv_hard_iface *if_outgoing)
+{
+ struct batadv_orig_ifinfo *tmp, *orig_ifinfo = NULL;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(tmp, &orig_node->ifinfo_list,
+ list) {
+ if (tmp->if_outgoing != if_outgoing)
+ continue;
+
+ if (!kref_get_unless_zero(&tmp->refcount))
+ continue;
+
+ orig_ifinfo = tmp;
+ break;
+ }
+ rcu_read_unlock();
+
+ return orig_ifinfo;
+}
+
+/**
+ * batadv_orig_ifinfo_new() - search and possibly create an orig_ifinfo object
+ * @orig_node: the orig node to be queried
+ * @if_outgoing: the interface for which the ifinfo should be acquired
+ *
+ * Return: NULL in case of failure or the orig_ifinfo object for the if_outgoing
+ * interface otherwise. The object is created and added to the list
+ * if it does not exist.
+ *
+ * The object is returned with refcounter increased by 1.
+ */
+struct batadv_orig_ifinfo *
+batadv_orig_ifinfo_new(struct batadv_orig_node *orig_node,
+ struct batadv_hard_iface *if_outgoing)
+{
+ struct batadv_orig_ifinfo *orig_ifinfo;
+ unsigned long reset_time;
+
+ spin_lock_bh(&orig_node->neigh_list_lock);
+
+ orig_ifinfo = batadv_orig_ifinfo_get(orig_node, if_outgoing);
+ if (orig_ifinfo)
+ goto out;
+
+ orig_ifinfo = kzalloc(sizeof(*orig_ifinfo), GFP_ATOMIC);
+ if (!orig_ifinfo)
+ goto out;
+
+ if (if_outgoing != BATADV_IF_DEFAULT)
+ kref_get(&if_outgoing->refcount);
+
+ reset_time = jiffies - 1;
+ reset_time -= msecs_to_jiffies(BATADV_RESET_PROTECTION_MS);
+ orig_ifinfo->batman_seqno_reset = reset_time;
+ orig_ifinfo->if_outgoing = if_outgoing;
+ INIT_HLIST_NODE(&orig_ifinfo->list);
+ kref_init(&orig_ifinfo->refcount);
+
+ kref_get(&orig_ifinfo->refcount);
+ hlist_add_head_rcu(&orig_ifinfo->list,
+ &orig_node->ifinfo_list);
+out:
+ spin_unlock_bh(&orig_node->neigh_list_lock);
+ return orig_ifinfo;
+}
+
+/**
+ * batadv_neigh_ifinfo_get() - find the ifinfo from an neigh_node
+ * @neigh: the neigh node to be queried
+ * @if_outgoing: the interface for which the ifinfo should be acquired
+ *
+ * The object is returned with refcounter increased by 1.
+ *
+ * Return: the requested neigh_ifinfo or NULL if not found
+ */
+struct batadv_neigh_ifinfo *
+batadv_neigh_ifinfo_get(struct batadv_neigh_node *neigh,
+ struct batadv_hard_iface *if_outgoing)
+{
+ struct batadv_neigh_ifinfo *neigh_ifinfo = NULL,
+ *tmp_neigh_ifinfo;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(tmp_neigh_ifinfo, &neigh->ifinfo_list,
+ list) {
+ if (tmp_neigh_ifinfo->if_outgoing != if_outgoing)
+ continue;
+
+ if (!kref_get_unless_zero(&tmp_neigh_ifinfo->refcount))
+ continue;
+
+ neigh_ifinfo = tmp_neigh_ifinfo;
+ break;
+ }
+ rcu_read_unlock();
+
+ return neigh_ifinfo;
+}
+
+/**
+ * batadv_neigh_ifinfo_new() - search and possibly create an neigh_ifinfo object
+ * @neigh: the neigh node to be queried
+ * @if_outgoing: the interface for which the ifinfo should be acquired
+ *
+ * Return: NULL in case of failure or the neigh_ifinfo object for the
+ * if_outgoing interface otherwise. The object is created and added to the list
+ * if it does not exist.
+ *
+ * The object is returned with refcounter increased by 1.
+ */
+struct batadv_neigh_ifinfo *
+batadv_neigh_ifinfo_new(struct batadv_neigh_node *neigh,
+ struct batadv_hard_iface *if_outgoing)
+{
+ struct batadv_neigh_ifinfo *neigh_ifinfo;
+
+ spin_lock_bh(&neigh->ifinfo_lock);
+
+ neigh_ifinfo = batadv_neigh_ifinfo_get(neigh, if_outgoing);
+ if (neigh_ifinfo)
+ goto out;
+
+ neigh_ifinfo = kzalloc(sizeof(*neigh_ifinfo), GFP_ATOMIC);
+ if (!neigh_ifinfo)
+ goto out;
+
+ if (if_outgoing)
+ kref_get(&if_outgoing->refcount);
+
+ INIT_HLIST_NODE(&neigh_ifinfo->list);
+ kref_init(&neigh_ifinfo->refcount);
+ neigh_ifinfo->if_outgoing = if_outgoing;
+
+ kref_get(&neigh_ifinfo->refcount);
+ hlist_add_head_rcu(&neigh_ifinfo->list, &neigh->ifinfo_list);
+
+out:
+ spin_unlock_bh(&neigh->ifinfo_lock);
+
+ return neigh_ifinfo;
+}
+
+/**
+ * batadv_neigh_node_get() - retrieve a neighbour from the list
+ * @orig_node: originator which the neighbour belongs to
+ * @hard_iface: the interface where this neighbour is connected to
+ * @addr: the address of the neighbour
+ *
+ * Looks for and possibly returns a neighbour belonging to this originator list
+ * which is connected through the provided hard interface.
+ *
+ * Return: neighbor when found. Otherwise NULL
+ */
+static struct batadv_neigh_node *
+batadv_neigh_node_get(const struct batadv_orig_node *orig_node,
+ const struct batadv_hard_iface *hard_iface,
+ const u8 *addr)
+{
+ struct batadv_neigh_node *tmp_neigh_node, *res = NULL;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(tmp_neigh_node, &orig_node->neigh_list, list) {
+ if (!batadv_compare_eth(tmp_neigh_node->addr, addr))
+ continue;
+
+ if (tmp_neigh_node->if_incoming != hard_iface)
+ continue;
+
+ if (!kref_get_unless_zero(&tmp_neigh_node->refcount))
+ continue;
+
+ res = tmp_neigh_node;
+ break;
+ }
+ rcu_read_unlock();
+
+ return res;
+}
+
+/**
+ * batadv_hardif_neigh_create() - create a hardif neighbour node
+ * @hard_iface: the interface this neighbour is connected to
+ * @neigh_addr: the interface address of the neighbour to retrieve
+ * @orig_node: originator object representing the neighbour
+ *
+ * Return: the hardif neighbour node if found or created or NULL otherwise.
+ */
+static struct batadv_hardif_neigh_node *
+batadv_hardif_neigh_create(struct batadv_hard_iface *hard_iface,
+ const u8 *neigh_addr,
+ struct batadv_orig_node *orig_node)
+{
+ struct batadv_priv *bat_priv = netdev_priv(hard_iface->mesh_iface);
+ struct batadv_hardif_neigh_node *hardif_neigh;
+
+ spin_lock_bh(&hard_iface->neigh_list_lock);
+
+ /* check if neighbor hasn't been added in the meantime */
+ hardif_neigh = batadv_hardif_neigh_get(hard_iface, neigh_addr);
+ if (hardif_neigh)
+ goto out;
+
+ hardif_neigh = kzalloc(sizeof(*hardif_neigh), GFP_ATOMIC);
+ if (!hardif_neigh)
+ goto out;
+
+ kref_get(&hard_iface->refcount);
+ INIT_HLIST_NODE(&hardif_neigh->list);
+ ether_addr_copy(hardif_neigh->addr, neigh_addr);
+ ether_addr_copy(hardif_neigh->orig, orig_node->orig);
+ hardif_neigh->if_incoming = hard_iface;
+ hardif_neigh->last_seen = jiffies;
+
+ kref_init(&hardif_neigh->refcount);
+
+ if (bat_priv->algo_ops->neigh.hardif_init)
+ bat_priv->algo_ops->neigh.hardif_init(hardif_neigh);
+
+ hlist_add_head_rcu(&hardif_neigh->list, &hard_iface->neigh_list);
+
+out:
+ spin_unlock_bh(&hard_iface->neigh_list_lock);
+ return hardif_neigh;
+}
+
+/**
+ * batadv_hardif_neigh_get_or_create() - retrieve or create a hardif neighbour
+ * node
+ * @hard_iface: the interface this neighbour is connected to
+ * @neigh_addr: the interface address of the neighbour to retrieve
+ * @orig_node: originator object representing the neighbour
+ *
+ * Return: the hardif neighbour node if found or created or NULL otherwise.
+ */
+static struct batadv_hardif_neigh_node *
+batadv_hardif_neigh_get_or_create(struct batadv_hard_iface *hard_iface,
+ const u8 *neigh_addr,
+ struct batadv_orig_node *orig_node)
+{
+ struct batadv_hardif_neigh_node *hardif_neigh;
+
+ /* first check without locking to avoid the overhead */
+ hardif_neigh = batadv_hardif_neigh_get(hard_iface, neigh_addr);
+ if (hardif_neigh)
+ return hardif_neigh;
+
+ return batadv_hardif_neigh_create(hard_iface, neigh_addr, orig_node);
+}
+
+/**
+ * batadv_hardif_neigh_get() - retrieve a hardif neighbour from the list
+ * @hard_iface: the interface where this neighbour is connected to
+ * @neigh_addr: the address of the neighbour
+ *
+ * Looks for and possibly returns a neighbour belonging to this hard interface.
+ *
+ * Return: neighbor when found. Otherwise NULL
+ */
+struct batadv_hardif_neigh_node *
+batadv_hardif_neigh_get(const struct batadv_hard_iface *hard_iface,
+ const u8 *neigh_addr)
+{
+ struct batadv_hardif_neigh_node *tmp_hardif_neigh, *hardif_neigh = NULL;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(tmp_hardif_neigh,
+ &hard_iface->neigh_list, list) {
+ if (!batadv_compare_eth(tmp_hardif_neigh->addr, neigh_addr))
+ continue;
+
+ if (!kref_get_unless_zero(&tmp_hardif_neigh->refcount))
+ continue;
+
+ hardif_neigh = tmp_hardif_neigh;
+ break;
+ }
+ rcu_read_unlock();
+
+ return hardif_neigh;
+}
+
+/**
+ * batadv_neigh_node_create() - create a neigh node object
+ * @orig_node: originator object representing the neighbour
+ * @hard_iface: the interface where the neighbour is connected to
+ * @neigh_addr: the mac address of the neighbour interface
+ *
+ * Allocates a new neigh_node object and initialises all the generic fields.
+ *
+ * Return: the neighbour node if found or created or NULL otherwise.
+ */
+static struct batadv_neigh_node *
+batadv_neigh_node_create(struct batadv_orig_node *orig_node,
+ struct batadv_hard_iface *hard_iface,
+ const u8 *neigh_addr)
+{
+ struct batadv_neigh_node *neigh_node;
+ struct batadv_hardif_neigh_node *hardif_neigh = NULL;
+
+ spin_lock_bh(&orig_node->neigh_list_lock);
+
+ neigh_node = batadv_neigh_node_get(orig_node, hard_iface, neigh_addr);
+ if (neigh_node)
+ goto out;
+
+ hardif_neigh = batadv_hardif_neigh_get_or_create(hard_iface,
+ neigh_addr, orig_node);
+ if (!hardif_neigh)
+ goto out;
+
+ neigh_node = kzalloc(sizeof(*neigh_node), GFP_ATOMIC);
+ if (!neigh_node)
+ goto out;
+
+ INIT_HLIST_NODE(&neigh_node->list);
+ INIT_HLIST_HEAD(&neigh_node->ifinfo_list);
+ spin_lock_init(&neigh_node->ifinfo_lock);
+
+ kref_get(&hard_iface->refcount);
+ ether_addr_copy(neigh_node->addr, neigh_addr);
+ neigh_node->if_incoming = hard_iface;
+ neigh_node->orig_node = orig_node;
+ neigh_node->last_seen = jiffies;
+
+ /* increment unique neighbor refcount */
+ kref_get(&hardif_neigh->refcount);
+ neigh_node->hardif_neigh = hardif_neigh;
+
+ /* extra reference for return */
+ kref_init(&neigh_node->refcount);
+
+ kref_get(&neigh_node->refcount);
+ hlist_add_head_rcu(&neigh_node->list, &orig_node->neigh_list);
+
+ batadv_dbg(BATADV_DBG_BATMAN, orig_node->bat_priv,
+ "Creating new neighbor %pM for orig_node %pM on interface %s\n",
+ neigh_addr, orig_node->orig, hard_iface->net_dev->name);
+
+out:
+ spin_unlock_bh(&orig_node->neigh_list_lock);
+
+ batadv_hardif_neigh_put(hardif_neigh);
+ return neigh_node;
+}
+
+/**
+ * batadv_neigh_node_get_or_create() - retrieve or create a neigh node object
+ * @orig_node: originator object representing the neighbour
+ * @hard_iface: the interface where the neighbour is connected to
+ * @neigh_addr: the mac address of the neighbour interface
+ *
+ * Return: the neighbour node if found or created or NULL otherwise.
+ */
+struct batadv_neigh_node *
+batadv_neigh_node_get_or_create(struct batadv_orig_node *orig_node,
+ struct batadv_hard_iface *hard_iface,
+ const u8 *neigh_addr)
+{
+ struct batadv_neigh_node *neigh_node;
+
+ /* first check without locking to avoid the overhead */
+ neigh_node = batadv_neigh_node_get(orig_node, hard_iface, neigh_addr);
+ if (neigh_node)
+ return neigh_node;
+
+ return batadv_neigh_node_create(orig_node, hard_iface, neigh_addr);
+}
+
+/**
+ * batadv_hardif_neigh_dump() - Dump to netlink the neighbor infos for a
+ * specific outgoing interface
+ * @msg: message to dump into
+ * @cb: parameters for the dump
+ *
+ * Return: 0 or error value
+ */
+int batadv_hardif_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb)
+{
+ struct batadv_hard_iface *primary_if, *hard_iface;
+ struct net_device *mesh_iface;
+ struct batadv_priv *bat_priv;
+ int ret;
+
+ mesh_iface = batadv_netlink_get_meshif(cb);
+ if (IS_ERR(mesh_iface))
+ return PTR_ERR(mesh_iface);
+
+ bat_priv = netdev_priv(mesh_iface);
+
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (!primary_if) {
+ ret = -ENOENT;
+ goto out_put_mesh_iface;
+ }
+
+ if (primary_if->if_status != BATADV_IF_ACTIVE) {
+ ret = -ENOENT;
+ goto out_put_primary_if;
+ }
+
+ hard_iface = batadv_netlink_get_hardif(bat_priv, cb);
+ if (IS_ERR(hard_iface) && PTR_ERR(hard_iface) != -ENONET) {
+ ret = PTR_ERR(hard_iface);
+ goto out_put_primary_if;
+ } else if (IS_ERR(hard_iface)) {
+ /* => PTR_ERR(hard_iface) == -ENONET
+ * => no hard-iface given, ok
+ */
+ hard_iface = BATADV_IF_DEFAULT;
+ }
+
+ if (!bat_priv->algo_ops->neigh.dump) {
+ ret = -EOPNOTSUPP;
+ goto out_put_hard_iface;
+ }
+
+ bat_priv->algo_ops->neigh.dump(msg, cb, bat_priv, hard_iface);
+
+ ret = msg->len;
+
+out_put_hard_iface:
+ batadv_hardif_put(hard_iface);
+out_put_primary_if:
+ batadv_hardif_put(primary_if);
+out_put_mesh_iface:
+ dev_put(mesh_iface);
+
+ return ret;
+}
+
+/**
+ * batadv_orig_ifinfo_release() - release orig_ifinfo from lists and queue for
+ * free after rcu grace period
+ * @ref: kref pointer of the orig_ifinfo
+ */
+void batadv_orig_ifinfo_release(struct kref *ref)
+{
+ struct batadv_orig_ifinfo *orig_ifinfo;
+ struct batadv_neigh_node *router;
+
+ orig_ifinfo = container_of(ref, struct batadv_orig_ifinfo, refcount);
+
+ if (orig_ifinfo->if_outgoing != BATADV_IF_DEFAULT)
+ batadv_hardif_put(orig_ifinfo->if_outgoing);
+
+ /* this is the last reference to this object */
+ router = rcu_dereference_protected(orig_ifinfo->router, true);
+ batadv_neigh_node_put(router);
+
+ kfree_rcu(orig_ifinfo, rcu);
+}
+
+/**
+ * batadv_orig_node_free_rcu() - free the orig_node
+ * @rcu: rcu pointer of the orig_node
+ */
+static void batadv_orig_node_free_rcu(struct rcu_head *rcu)
+{
+ struct batadv_orig_node *orig_node;
+
+ orig_node = container_of(rcu, struct batadv_orig_node, rcu);
+
+ batadv_mcast_purge_orig(orig_node);
+
+ batadv_frag_purge_orig(orig_node, NULL);
+
+ kfree(orig_node->tt_buff);
+ kfree(orig_node);
+}
+
+/**
+ * batadv_orig_node_release() - release orig_node from lists and queue for
+ * free after rcu grace period
+ * @ref: kref pointer of the orig_node
+ */
+void batadv_orig_node_release(struct kref *ref)
+{
+ struct hlist_node *node_tmp;
+ struct batadv_neigh_node *neigh_node;
+ struct batadv_orig_node *orig_node;
+ struct batadv_orig_ifinfo *orig_ifinfo;
+ struct batadv_orig_node_vlan *vlan;
+ struct batadv_orig_ifinfo *last_candidate;
+
+ orig_node = container_of(ref, struct batadv_orig_node, refcount);
+
+ spin_lock_bh(&orig_node->neigh_list_lock);
+
+ /* for all neighbors towards this originator ... */
+ hlist_for_each_entry_safe(neigh_node, node_tmp,
+ &orig_node->neigh_list, list) {
+ hlist_del_rcu(&neigh_node->list);
+ batadv_neigh_node_put(neigh_node);
+ }
+
+ hlist_for_each_entry_safe(orig_ifinfo, node_tmp,
+ &orig_node->ifinfo_list, list) {
+ hlist_del_rcu(&orig_ifinfo->list);
+ batadv_orig_ifinfo_put(orig_ifinfo);
+ }
+
+ last_candidate = orig_node->last_bonding_candidate;
+ orig_node->last_bonding_candidate = NULL;
+ spin_unlock_bh(&orig_node->neigh_list_lock);
+
+ batadv_orig_ifinfo_put(last_candidate);
+
+ spin_lock_bh(&orig_node->vlan_list_lock);
+ hlist_for_each_entry_safe(vlan, node_tmp, &orig_node->vlan_list, list) {
+ hlist_del_rcu(&vlan->list);
+ batadv_orig_node_vlan_put(vlan);
+ }
+ spin_unlock_bh(&orig_node->vlan_list_lock);
+
+ call_rcu(&orig_node->rcu, batadv_orig_node_free_rcu);
+}
+
+/**
+ * batadv_originator_free() - Free all originator structures
+ * @bat_priv: the bat priv with all the mesh interface information
+ */
+void batadv_originator_free(struct batadv_priv *bat_priv)
+{
+ struct batadv_hashtable *hash = bat_priv->orig_hash;
+ struct hlist_node *node_tmp;
+ struct hlist_head *head;
+ spinlock_t *list_lock; /* spinlock to protect write access */
+ struct batadv_orig_node *orig_node;
+ u32 i;
+
+ if (!hash)
+ return;
+
+ cancel_delayed_work_sync(&bat_priv->orig_work);
+
+ bat_priv->orig_hash = NULL;
+
+ for (i = 0; i < hash->size; i++) {
+ head = &hash->table[i];
+ list_lock = &hash->list_locks[i];
+
+ spin_lock_bh(list_lock);
+ hlist_for_each_entry_safe(orig_node, node_tmp,
+ head, hash_entry) {
+ hlist_del_rcu(&orig_node->hash_entry);
+ batadv_orig_node_put(orig_node);
+ }
+ spin_unlock_bh(list_lock);
+ }
+
+ batadv_hash_destroy(hash);
+}
+
+/**
+ * batadv_orig_node_new() - creates a new orig_node
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @addr: the mac address of the originator
+ *
+ * Creates a new originator object and initialises all the generic fields.
+ * The new object is not added to the originator list.
+ *
+ * Return: the newly created object or NULL on failure.
+ */
+struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv,
+ const u8 *addr)
+{
+ struct batadv_orig_node *orig_node;
+ struct batadv_orig_node_vlan *vlan;
+ unsigned long reset_time;
+ int i;
+
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Creating new originator: %pM\n", addr);
+
+ orig_node = kzalloc(sizeof(*orig_node), GFP_ATOMIC);
+ if (!orig_node)
+ return NULL;
+
+ INIT_HLIST_HEAD(&orig_node->neigh_list);
+ INIT_HLIST_HEAD(&orig_node->vlan_list);
+ INIT_HLIST_HEAD(&orig_node->ifinfo_list);
+ spin_lock_init(&orig_node->bcast_seqno_lock);
+ spin_lock_init(&orig_node->neigh_list_lock);
+ spin_lock_init(&orig_node->tt_buff_lock);
+ spin_lock_init(&orig_node->tt_lock);
+ spin_lock_init(&orig_node->vlan_list_lock);
+
+ /* extra reference for return */
+ kref_init(&orig_node->refcount);
+
+ orig_node->bat_priv = bat_priv;
+ ether_addr_copy(orig_node->orig, addr);
+ batadv_dat_init_orig_node_addr(orig_node);
+ atomic_set(&orig_node->last_ttvn, 0);
+ orig_node->tt_buff = NULL;
+ orig_node->tt_buff_len = 0;
+ orig_node->last_seen = jiffies;
+ reset_time = jiffies - 1 - msecs_to_jiffies(BATADV_RESET_PROTECTION_MS);
+ orig_node->bcast_seqno_reset = reset_time;
+
+#ifdef CONFIG_BATMAN_ADV_MCAST
+ orig_node->mcast_flags = BATADV_MCAST_WANT_NO_RTR4;
+ orig_node->mcast_flags |= BATADV_MCAST_WANT_NO_RTR6;
+ orig_node->mcast_flags |= BATADV_MCAST_HAVE_MC_PTYPE_CAPA;
+ INIT_HLIST_NODE(&orig_node->mcast_want_all_unsnoopables_node);
+ INIT_HLIST_NODE(&orig_node->mcast_want_all_ipv4_node);
+ INIT_HLIST_NODE(&orig_node->mcast_want_all_ipv6_node);
+ spin_lock_init(&orig_node->mcast_handler_lock);
+#endif
+
+ /* create a vlan object for the "untagged" LAN */
+ vlan = batadv_orig_node_vlan_new(orig_node, BATADV_NO_FLAGS);
+ if (!vlan)
+ goto free_orig_node;
+ /* batadv_orig_node_vlan_new() increases the refcounter.
+ * Immediately release vlan since it is not needed anymore in this
+ * context
+ */
+ batadv_orig_node_vlan_put(vlan);
+
+ for (i = 0; i < BATADV_FRAG_BUFFER_COUNT; i++) {
+ INIT_HLIST_HEAD(&orig_node->fragments[i].fragment_list);
+ spin_lock_init(&orig_node->fragments[i].lock);
+ orig_node->fragments[i].size = 0;
+ }
+
+ return orig_node;
+free_orig_node:
+ kfree(orig_node);
+ return NULL;
+}
+
+/**
+ * batadv_purge_neigh_ifinfo() - purge obsolete ifinfo entries from neighbor
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @neigh: orig node which is to be checked
+ */
+static void
+batadv_purge_neigh_ifinfo(struct batadv_priv *bat_priv,
+ struct batadv_neigh_node *neigh)
+{
+ struct batadv_neigh_ifinfo *neigh_ifinfo;
+ struct batadv_hard_iface *if_outgoing;
+ struct hlist_node *node_tmp;
+
+ spin_lock_bh(&neigh->ifinfo_lock);
+
+ /* for all ifinfo objects for this neighinator */
+ hlist_for_each_entry_safe(neigh_ifinfo, node_tmp,
+ &neigh->ifinfo_list, list) {
+ if_outgoing = neigh_ifinfo->if_outgoing;
+
+ /* always keep the default interface */
+ if (if_outgoing == BATADV_IF_DEFAULT)
+ continue;
+
+ /* don't purge if the interface is not (going) down */
+ if (if_outgoing->if_status != BATADV_IF_INACTIVE &&
+ if_outgoing->if_status != BATADV_IF_NOT_IN_USE &&
+ if_outgoing->if_status != BATADV_IF_TO_BE_REMOVED)
+ continue;
+
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "neighbor/ifinfo purge: neighbor %pM, iface: %s\n",
+ neigh->addr, if_outgoing->net_dev->name);
+
+ hlist_del_rcu(&neigh_ifinfo->list);
+ batadv_neigh_ifinfo_put(neigh_ifinfo);
+ }
+
+ spin_unlock_bh(&neigh->ifinfo_lock);
+}
+
+/**
+ * batadv_purge_orig_ifinfo() - purge obsolete ifinfo entries from originator
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @orig_node: orig node which is to be checked
+ *
+ * Return: true if any ifinfo entry was purged, false otherwise.
+ */
+static bool
+batadv_purge_orig_ifinfo(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node)
+{
+ struct batadv_orig_ifinfo *orig_ifinfo;
+ struct batadv_hard_iface *if_outgoing;
+ struct hlist_node *node_tmp;
+ bool ifinfo_purged = false;
+
+ spin_lock_bh(&orig_node->neigh_list_lock);
+
+ /* for all ifinfo objects for this originator */
+ hlist_for_each_entry_safe(orig_ifinfo, node_tmp,
+ &orig_node->ifinfo_list, list) {
+ if_outgoing = orig_ifinfo->if_outgoing;
+
+ /* always keep the default interface */
+ if (if_outgoing == BATADV_IF_DEFAULT)
+ continue;
+
+ /* don't purge if the interface is not (going) down */
+ if (if_outgoing->if_status != BATADV_IF_INACTIVE &&
+ if_outgoing->if_status != BATADV_IF_NOT_IN_USE &&
+ if_outgoing->if_status != BATADV_IF_TO_BE_REMOVED)
+ continue;
+
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "router/ifinfo purge: originator %pM, iface: %s\n",
+ orig_node->orig, if_outgoing->net_dev->name);
+
+ ifinfo_purged = true;
+
+ hlist_del_rcu(&orig_ifinfo->list);
+ batadv_orig_ifinfo_put(orig_ifinfo);
+ if (orig_node->last_bonding_candidate == orig_ifinfo) {
+ orig_node->last_bonding_candidate = NULL;
+ batadv_orig_ifinfo_put(orig_ifinfo);
+ }
+ }
+
+ spin_unlock_bh(&orig_node->neigh_list_lock);
+
+ return ifinfo_purged;
+}
+
+/**
+ * batadv_purge_orig_neighbors() - purges neighbors from originator
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @orig_node: orig node which is to be checked
+ *
+ * Return: true if any neighbor was purged, false otherwise
+ */
+static bool
+batadv_purge_orig_neighbors(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node)
+{
+ struct hlist_node *node_tmp;
+ struct batadv_neigh_node *neigh_node;
+ bool neigh_purged = false;
+ unsigned long last_seen;
+ struct batadv_hard_iface *if_incoming;
+
+ spin_lock_bh(&orig_node->neigh_list_lock);
+
+ /* for all neighbors towards this originator ... */
+ hlist_for_each_entry_safe(neigh_node, node_tmp,
+ &orig_node->neigh_list, list) {
+ last_seen = neigh_node->last_seen;
+ if_incoming = neigh_node->if_incoming;
+
+ if (batadv_has_timed_out(last_seen, BATADV_PURGE_TIMEOUT) ||
+ if_incoming->if_status == BATADV_IF_INACTIVE ||
+ if_incoming->if_status == BATADV_IF_NOT_IN_USE ||
+ if_incoming->if_status == BATADV_IF_TO_BE_REMOVED) {
+ if (if_incoming->if_status == BATADV_IF_INACTIVE ||
+ if_incoming->if_status == BATADV_IF_NOT_IN_USE ||
+ if_incoming->if_status == BATADV_IF_TO_BE_REMOVED)
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "neighbor purge: originator %pM, neighbor: %pM, iface: %s\n",
+ orig_node->orig, neigh_node->addr,
+ if_incoming->net_dev->name);
+ else
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "neighbor timeout: originator %pM, neighbor: %pM, last_seen: %u\n",
+ orig_node->orig, neigh_node->addr,
+ jiffies_to_msecs(last_seen));
+
+ neigh_purged = true;
+
+ hlist_del_rcu(&neigh_node->list);
+ batadv_neigh_node_put(neigh_node);
+ } else {
+ /* only necessary if not the whole neighbor is to be
+ * deleted, but some interface has been removed.
+ */
+ batadv_purge_neigh_ifinfo(bat_priv, neigh_node);
+ }
+ }
+
+ spin_unlock_bh(&orig_node->neigh_list_lock);
+ return neigh_purged;
+}
+
+/**
+ * batadv_find_best_neighbor() - finds the best neighbor after purging
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @orig_node: orig node which is to be checked
+ * @if_outgoing: the interface for which the metric should be compared
+ *
+ * Return: the current best neighbor, with refcount increased.
+ */
+static struct batadv_neigh_node *
+batadv_find_best_neighbor(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node,
+ struct batadv_hard_iface *if_outgoing)
+{
+ struct batadv_neigh_node *best = NULL, *neigh;
+ struct batadv_algo_ops *bao = bat_priv->algo_ops;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(neigh, &orig_node->neigh_list, list) {
+ if (best && (bao->neigh.cmp(neigh, if_outgoing, best,
+ if_outgoing) <= 0))
+ continue;
+
+ if (!kref_get_unless_zero(&neigh->refcount))
+ continue;
+
+ batadv_neigh_node_put(best);
+
+ best = neigh;
+ }
+ rcu_read_unlock();
+
+ return best;
+}
+
+/**
+ * batadv_purge_orig_node() - purges obsolete information from an orig_node
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @orig_node: orig node which is to be checked
+ *
+ * This function checks if the orig_node or substructures of it have become
+ * obsolete, and purges this information if that's the case.
+ *
+ * Return: true if the orig_node is to be removed, false otherwise.
+ */
+static bool batadv_purge_orig_node(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node)
+{
+ struct batadv_neigh_node *best_neigh_node;
+ struct batadv_hard_iface *hard_iface;
+ bool changed_ifinfo, changed_neigh;
+ struct list_head *iter;
+
+ if (batadv_has_timed_out(orig_node->last_seen,
+ 2 * BATADV_PURGE_TIMEOUT)) {
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Originator timeout: originator %pM, last_seen %u\n",
+ orig_node->orig,
+ jiffies_to_msecs(orig_node->last_seen));
+ return true;
+ }
+ changed_ifinfo = batadv_purge_orig_ifinfo(bat_priv, orig_node);
+ changed_neigh = batadv_purge_orig_neighbors(bat_priv, orig_node);
+
+ if (!changed_ifinfo && !changed_neigh)
+ return false;
+
+ /* first for NULL ... */
+ best_neigh_node = batadv_find_best_neighbor(bat_priv, orig_node,
+ BATADV_IF_DEFAULT);
+ batadv_update_route(bat_priv, orig_node, BATADV_IF_DEFAULT,
+ best_neigh_node);
+ batadv_neigh_node_put(best_neigh_node);
+
+ /* ... then for all other interfaces. */
+ rcu_read_lock();
+ netdev_for_each_lower_private_rcu(bat_priv->mesh_iface, hard_iface, iter) {
+ if (hard_iface->if_status != BATADV_IF_ACTIVE)
+ continue;
+
+ if (!kref_get_unless_zero(&hard_iface->refcount))
+ continue;
+
+ best_neigh_node = batadv_find_best_neighbor(bat_priv,
+ orig_node,
+ hard_iface);
+ batadv_update_route(bat_priv, orig_node, hard_iface,
+ best_neigh_node);
+ batadv_neigh_node_put(best_neigh_node);
+
+ batadv_hardif_put(hard_iface);
+ }
+ rcu_read_unlock();
+
+ return false;
+}
+
+/**
+ * batadv_purge_orig_ref() - Purge all outdated originators
+ * @bat_priv: the bat priv with all the mesh interface information
+ */
+void batadv_purge_orig_ref(struct batadv_priv *bat_priv)
+{
+ struct batadv_hashtable *hash = bat_priv->orig_hash;
+ struct hlist_node *node_tmp;
+ struct hlist_head *head;
+ spinlock_t *list_lock; /* spinlock to protect write access */
+ struct batadv_orig_node *orig_node;
+ u32 i;
+
+ if (!hash)
+ return;
+
+ /* for all origins... */
+ for (i = 0; i < hash->size; i++) {
+ head = &hash->table[i];
+ if (hlist_empty(head))
+ continue;
+ list_lock = &hash->list_locks[i];
+
+ spin_lock_bh(list_lock);
+ hlist_for_each_entry_safe(orig_node, node_tmp,
+ head, hash_entry) {
+ if (batadv_purge_orig_node(bat_priv, orig_node)) {
+ batadv_gw_node_delete(bat_priv, orig_node);
+ hlist_del_rcu(&orig_node->hash_entry);
+ batadv_tt_global_del_orig(orig_node->bat_priv,
+ orig_node, -1,
+ "originator timed out");
+ batadv_orig_node_put(orig_node);
+ continue;
+ }
+
+ batadv_frag_purge_orig(orig_node,
+ batadv_frag_check_entry);
+ }
+ spin_unlock_bh(list_lock);
+ }
+
+ batadv_gw_election(bat_priv);
+}
+
+static void batadv_purge_orig(struct work_struct *work)
+{
+ struct delayed_work *delayed_work;
+ struct batadv_priv *bat_priv;
+
+ delayed_work = to_delayed_work(work);
+ bat_priv = container_of(delayed_work, struct batadv_priv, orig_work);
+ batadv_purge_orig_ref(bat_priv);
+ queue_delayed_work(batadv_event_workqueue,
+ &bat_priv->orig_work,
+ msecs_to_jiffies(BATADV_ORIG_WORK_PERIOD));
+}
+
+/**
+ * batadv_orig_dump() - Dump to netlink the originator infos for a specific
+ * outgoing interface
+ * @msg: message to dump into
+ * @cb: parameters for the dump
+ *
+ * Return: 0 or error value
+ */
+int batadv_orig_dump(struct sk_buff *msg, struct netlink_callback *cb)
+{
+ struct batadv_hard_iface *primary_if, *hard_iface;
+ struct net_device *mesh_iface;
+ struct batadv_priv *bat_priv;
+ int ret;
+
+ mesh_iface = batadv_netlink_get_meshif(cb);
+ if (IS_ERR(mesh_iface))
+ return PTR_ERR(mesh_iface);
+
+ bat_priv = netdev_priv(mesh_iface);
+
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (!primary_if) {
+ ret = -ENOENT;
+ goto out_put_mesh_iface;
+ }
+
+ if (primary_if->if_status != BATADV_IF_ACTIVE) {
+ ret = -ENOENT;
+ goto out_put_primary_if;
+ }
+
+ hard_iface = batadv_netlink_get_hardif(bat_priv, cb);
+ if (IS_ERR(hard_iface) && PTR_ERR(hard_iface) != -ENONET) {
+ ret = PTR_ERR(hard_iface);
+ goto out_put_primary_if;
+ } else if (IS_ERR(hard_iface)) {
+ /* => PTR_ERR(hard_iface) == -ENONET
+ * => no hard-iface given, ok
+ */
+ hard_iface = BATADV_IF_DEFAULT;
+ }
+
+ if (!bat_priv->algo_ops->orig.dump) {
+ ret = -EOPNOTSUPP;
+ goto out_put_hard_iface;
+ }
+
+ bat_priv->algo_ops->orig.dump(msg, cb, bat_priv, hard_iface);
+
+ ret = msg->len;
+
+out_put_hard_iface:
+ batadv_hardif_put(hard_iface);
+out_put_primary_if:
+ batadv_hardif_put(primary_if);
+out_put_mesh_iface:
+ dev_put(mesh_iface);
+
+ return ret;
+}
diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h
new file mode 100644
index 000000000000..db0c55128170
--- /dev/null
+++ b/net/batman-adv/originator.h
@@ -0,0 +1,170 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner, Simon Wunderlich
+ */
+
+#ifndef _NET_BATMAN_ADV_ORIGINATOR_H_
+#define _NET_BATMAN_ADV_ORIGINATOR_H_
+
+#include "main.h"
+
+#include <linux/compiler.h>
+#include <linux/if_ether.h>
+#include <linux/jhash.h>
+#include <linux/kref.h>
+#include <linux/netlink.h>
+#include <linux/skbuff.h>
+#include <linux/types.h>
+
+bool batadv_compare_orig(const struct hlist_node *node, const void *data2);
+int batadv_originator_init(struct batadv_priv *bat_priv);
+void batadv_originator_free(struct batadv_priv *bat_priv);
+void batadv_purge_orig_ref(struct batadv_priv *bat_priv);
+void batadv_orig_node_release(struct kref *ref);
+struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv,
+ const u8 *addr);
+struct batadv_hardif_neigh_node *
+batadv_hardif_neigh_get(const struct batadv_hard_iface *hard_iface,
+ const u8 *neigh_addr);
+void batadv_hardif_neigh_release(struct kref *ref);
+struct batadv_neigh_node *
+batadv_neigh_node_get_or_create(struct batadv_orig_node *orig_node,
+ struct batadv_hard_iface *hard_iface,
+ const u8 *neigh_addr);
+void batadv_neigh_node_release(struct kref *ref);
+struct batadv_neigh_node *
+batadv_orig_router_get(struct batadv_orig_node *orig_node,
+ const struct batadv_hard_iface *if_outgoing);
+struct batadv_neigh_node *
+batadv_orig_to_router(struct batadv_priv *bat_priv, u8 *orig_addr,
+ struct batadv_hard_iface *if_outgoing);
+struct batadv_neigh_ifinfo *
+batadv_neigh_ifinfo_new(struct batadv_neigh_node *neigh,
+ struct batadv_hard_iface *if_outgoing);
+struct batadv_neigh_ifinfo *
+batadv_neigh_ifinfo_get(struct batadv_neigh_node *neigh,
+ struct batadv_hard_iface *if_outgoing);
+void batadv_neigh_ifinfo_release(struct kref *ref);
+
+int batadv_hardif_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb);
+
+struct batadv_orig_ifinfo *
+batadv_orig_ifinfo_get(struct batadv_orig_node *orig_node,
+ struct batadv_hard_iface *if_outgoing);
+struct batadv_orig_ifinfo *
+batadv_orig_ifinfo_new(struct batadv_orig_node *orig_node,
+ struct batadv_hard_iface *if_outgoing);
+void batadv_orig_ifinfo_release(struct kref *ref);
+
+int batadv_orig_dump(struct sk_buff *msg, struct netlink_callback *cb);
+struct batadv_orig_node_vlan *
+batadv_orig_node_vlan_new(struct batadv_orig_node *orig_node,
+ unsigned short vid);
+struct batadv_orig_node_vlan *
+batadv_orig_node_vlan_get(struct batadv_orig_node *orig_node,
+ unsigned short vid);
+void batadv_orig_node_vlan_release(struct kref *ref);
+
+/**
+ * batadv_choose_orig() - Return the index of the orig entry in the hash table
+ * @data: mac address of the originator node
+ * @size: the size of the hash table
+ *
+ * Return: the hash index where the object represented by @data should be
+ * stored at.
+ */
+static inline u32 batadv_choose_orig(const void *data, u32 size)
+{
+ u32 hash = 0;
+
+ hash = jhash(data, ETH_ALEN, hash);
+ return hash % size;
+}
+
+struct batadv_orig_node *
+batadv_orig_hash_find(struct batadv_priv *bat_priv, const void *data);
+
+/**
+ * batadv_orig_node_vlan_put() - decrement the refcounter and possibly release
+ * the originator-vlan object
+ * @orig_vlan: the originator-vlan object to release
+ */
+static inline void
+batadv_orig_node_vlan_put(struct batadv_orig_node_vlan *orig_vlan)
+{
+ if (!orig_vlan)
+ return;
+
+ kref_put(&orig_vlan->refcount, batadv_orig_node_vlan_release);
+}
+
+/**
+ * batadv_neigh_ifinfo_put() - decrement the refcounter and possibly release
+ * the neigh_ifinfo
+ * @neigh_ifinfo: the neigh_ifinfo object to release
+ */
+static inline void
+batadv_neigh_ifinfo_put(struct batadv_neigh_ifinfo *neigh_ifinfo)
+{
+ if (!neigh_ifinfo)
+ return;
+
+ kref_put(&neigh_ifinfo->refcount, batadv_neigh_ifinfo_release);
+}
+
+/**
+ * batadv_hardif_neigh_put() - decrement the hardif neighbors refcounter
+ * and possibly release it
+ * @hardif_neigh: hardif neigh neighbor to free
+ */
+static inline void
+batadv_hardif_neigh_put(struct batadv_hardif_neigh_node *hardif_neigh)
+{
+ if (!hardif_neigh)
+ return;
+
+ kref_put(&hardif_neigh->refcount, batadv_hardif_neigh_release);
+}
+
+/**
+ * batadv_neigh_node_put() - decrement the neighbors refcounter and possibly
+ * release it
+ * @neigh_node: neigh neighbor to free
+ */
+static inline void batadv_neigh_node_put(struct batadv_neigh_node *neigh_node)
+{
+ if (!neigh_node)
+ return;
+
+ kref_put(&neigh_node->refcount, batadv_neigh_node_release);
+}
+
+/**
+ * batadv_orig_ifinfo_put() - decrement the refcounter and possibly release
+ * the orig_ifinfo
+ * @orig_ifinfo: the orig_ifinfo object to release
+ */
+static inline void
+batadv_orig_ifinfo_put(struct batadv_orig_ifinfo *orig_ifinfo)
+{
+ if (!orig_ifinfo)
+ return;
+
+ kref_put(&orig_ifinfo->refcount, batadv_orig_ifinfo_release);
+}
+
+/**
+ * batadv_orig_node_put() - decrement the orig node refcounter and possibly
+ * release it
+ * @orig_node: the orig node to free
+ */
+static inline void batadv_orig_node_put(struct batadv_orig_node *orig_node)
+{
+ if (!orig_node)
+ return;
+
+ kref_put(&orig_node->refcount, batadv_orig_node_release);
+}
+
+#endif /* _NET_BATMAN_ADV_ORIGINATOR_H_ */
diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c
new file mode 100644
index 000000000000..12c16f81cc51
--- /dev/null
+++ b/net/batman-adv/routing.c
@@ -0,0 +1,1335 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner, Simon Wunderlich
+ */
+
+#include "routing.h"
+#include "main.h"
+
+#include <linux/atomic.h>
+#include <linux/byteorder/generic.h>
+#include <linux/compiler.h>
+#include <linux/errno.h>
+#include <linux/etherdevice.h>
+#include <linux/if_ether.h>
+#include <linux/jiffies.h>
+#include <linux/kref.h>
+#include <linux/netdevice.h>
+#include <linux/printk.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
+#include <linux/skbuff.h>
+#include <linux/spinlock.h>
+#include <linux/stddef.h>
+#include <uapi/linux/batadv_packet.h>
+
+#include "bitarray.h"
+#include "bridge_loop_avoidance.h"
+#include "distributed-arp-table.h"
+#include "fragmentation.h"
+#include "hard-interface.h"
+#include "log.h"
+#include "mesh-interface.h"
+#include "originator.h"
+#include "send.h"
+#include "tp_meter.h"
+#include "translation-table.h"
+#include "tvlv.h"
+
+static int batadv_route_unicast_packet(struct sk_buff *skb,
+ struct batadv_hard_iface *recv_if);
+
+/**
+ * _batadv_update_route() - set the router for this originator
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @orig_node: orig node which is to be configured
+ * @recv_if: the receive interface for which this route is set
+ * @neigh_node: neighbor which should be the next router
+ *
+ * This function does not perform any error checks
+ */
+static void _batadv_update_route(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node,
+ struct batadv_hard_iface *recv_if,
+ struct batadv_neigh_node *neigh_node)
+{
+ struct batadv_orig_ifinfo *orig_ifinfo;
+ struct batadv_neigh_node *curr_router;
+
+ orig_ifinfo = batadv_orig_ifinfo_get(orig_node, recv_if);
+ if (!orig_ifinfo)
+ return;
+
+ spin_lock_bh(&orig_node->neigh_list_lock);
+ /* curr_router used earlier may not be the current orig_ifinfo->router
+ * anymore because it was dereferenced outside of the neigh_list_lock
+ * protected region. After the new best neighbor has replace the current
+ * best neighbor the reference counter needs to decrease. Consequently,
+ * the code needs to ensure the curr_router variable contains a pointer
+ * to the replaced best neighbor.
+ */
+
+ /* increase refcount of new best neighbor */
+ if (neigh_node)
+ kref_get(&neigh_node->refcount);
+
+ curr_router = rcu_replace_pointer(orig_ifinfo->router, neigh_node,
+ true);
+ spin_unlock_bh(&orig_node->neigh_list_lock);
+ batadv_orig_ifinfo_put(orig_ifinfo);
+
+ /* route deleted */
+ if (curr_router && !neigh_node) {
+ batadv_dbg(BATADV_DBG_ROUTES, bat_priv,
+ "Deleting route towards: %pM\n", orig_node->orig);
+ batadv_tt_global_del_orig(bat_priv, orig_node, -1,
+ "Deleted route towards originator");
+
+ /* route added */
+ } else if (!curr_router && neigh_node) {
+ batadv_dbg(BATADV_DBG_ROUTES, bat_priv,
+ "Adding route towards: %pM (via %pM)\n",
+ orig_node->orig, neigh_node->addr);
+ /* route changed */
+ } else if (neigh_node && curr_router) {
+ batadv_dbg(BATADV_DBG_ROUTES, bat_priv,
+ "Changing route towards: %pM (now via %pM - was via %pM)\n",
+ orig_node->orig, neigh_node->addr,
+ curr_router->addr);
+ }
+
+ /* decrease refcount of previous best neighbor */
+ batadv_neigh_node_put(curr_router);
+}
+
+/**
+ * batadv_update_route() - set the router for this originator
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @orig_node: orig node which is to be configured
+ * @recv_if: the receive interface for which this route is set
+ * @neigh_node: neighbor which should be the next router
+ */
+void batadv_update_route(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node,
+ struct batadv_hard_iface *recv_if,
+ struct batadv_neigh_node *neigh_node)
+{
+ struct batadv_neigh_node *router = NULL;
+
+ if (!orig_node)
+ goto out;
+
+ router = batadv_orig_router_get(orig_node, recv_if);
+
+ if (router != neigh_node)
+ _batadv_update_route(bat_priv, orig_node, recv_if, neigh_node);
+
+out:
+ batadv_neigh_node_put(router);
+}
+
+/**
+ * batadv_window_protected() - checks whether the host restarted and is in the
+ * protection time.
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @seq_num_diff: difference between the current/received sequence number and
+ * the last sequence number
+ * @seq_old_max_diff: maximum age of sequence number not considered as restart
+ * @last_reset: jiffies timestamp of the last reset, will be updated when reset
+ * is detected
+ * @protection_started: is set to true if the protection window was started,
+ * doesn't change otherwise.
+ *
+ * Return:
+ * false if the packet is to be accepted.
+ * true if the packet is to be ignored.
+ */
+bool batadv_window_protected(struct batadv_priv *bat_priv, s32 seq_num_diff,
+ s32 seq_old_max_diff, unsigned long *last_reset,
+ bool *protection_started)
+{
+ if (seq_num_diff <= -seq_old_max_diff ||
+ seq_num_diff >= BATADV_EXPECTED_SEQNO_RANGE) {
+ if (!batadv_has_timed_out(*last_reset,
+ BATADV_RESET_PROTECTION_MS))
+ return true;
+
+ *last_reset = jiffies;
+ if (protection_started)
+ *protection_started = true;
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "old packet received, start protection\n");
+ }
+
+ return false;
+}
+
+/**
+ * batadv_check_management_packet() - Check preconditions for management packets
+ * @skb: incoming packet buffer
+ * @hard_iface: incoming hard interface
+ * @header_len: minimal header length of packet type
+ *
+ * Return: true when management preconditions are met, false otherwise
+ */
+bool batadv_check_management_packet(struct sk_buff *skb,
+ struct batadv_hard_iface *hard_iface,
+ int header_len)
+{
+ struct ethhdr *ethhdr;
+
+ /* drop packet if it has not necessary minimum size */
+ if (unlikely(!pskb_may_pull(skb, header_len)))
+ return false;
+
+ ethhdr = eth_hdr(skb);
+
+ /* packet with broadcast indication but unicast recipient */
+ if (!is_broadcast_ether_addr(ethhdr->h_dest))
+ return false;
+
+ /* packet with invalid sender address */
+ if (!is_valid_ether_addr(ethhdr->h_source))
+ return false;
+
+ /* create a copy of the skb, if needed, to modify it. */
+ if (skb_cow(skb, 0) < 0)
+ return false;
+
+ /* keep skb linear */
+ if (skb_linearize(skb) < 0)
+ return false;
+
+ return true;
+}
+
+/**
+ * batadv_recv_my_icmp_packet() - receive an icmp packet locally
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: icmp packet to process
+ *
+ * Return: NET_RX_SUCCESS if the packet has been consumed or NET_RX_DROP
+ * otherwise.
+ */
+static int batadv_recv_my_icmp_packet(struct batadv_priv *bat_priv,
+ struct sk_buff *skb)
+{
+ struct batadv_hard_iface *primary_if = NULL;
+ struct batadv_orig_node *orig_node = NULL;
+ struct batadv_icmp_header *icmph;
+ int res, ret = NET_RX_DROP;
+
+ icmph = (struct batadv_icmp_header *)skb->data;
+
+ switch (icmph->msg_type) {
+ case BATADV_ECHO_REQUEST:
+ /* answer echo request (ping) */
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (!primary_if)
+ goto out;
+
+ /* get routing information */
+ orig_node = batadv_orig_hash_find(bat_priv, icmph->orig);
+ if (!orig_node)
+ goto out;
+
+ /* create a copy of the skb, if needed, to modify it. */
+ if (skb_cow(skb, ETH_HLEN) < 0)
+ goto out;
+
+ icmph = (struct batadv_icmp_header *)skb->data;
+
+ ether_addr_copy(icmph->dst, icmph->orig);
+ ether_addr_copy(icmph->orig, primary_if->net_dev->dev_addr);
+ icmph->msg_type = BATADV_ECHO_REPLY;
+ icmph->ttl = BATADV_TTL;
+
+ res = batadv_send_skb_to_orig(skb, orig_node, NULL);
+ if (res == NET_XMIT_SUCCESS)
+ ret = NET_RX_SUCCESS;
+
+ /* skb was consumed */
+ skb = NULL;
+ break;
+ case BATADV_TP:
+ if (!pskb_may_pull(skb, sizeof(struct batadv_icmp_tp_packet)))
+ goto out;
+
+ batadv_tp_meter_recv(bat_priv, skb);
+ ret = NET_RX_SUCCESS;
+ /* skb was consumed */
+ skb = NULL;
+ goto out;
+ default:
+ /* drop unknown type */
+ goto out;
+ }
+out:
+ batadv_hardif_put(primary_if);
+ batadv_orig_node_put(orig_node);
+
+ kfree_skb(skb);
+
+ return ret;
+}
+
+static int batadv_recv_icmp_ttl_exceeded(struct batadv_priv *bat_priv,
+ struct sk_buff *skb)
+{
+ struct batadv_hard_iface *primary_if = NULL;
+ struct batadv_orig_node *orig_node = NULL;
+ struct batadv_icmp_packet *icmp_packet;
+ int res, ret = NET_RX_DROP;
+
+ icmp_packet = (struct batadv_icmp_packet *)skb->data;
+
+ /* send TTL exceeded if packet is an echo request (traceroute) */
+ if (icmp_packet->msg_type != BATADV_ECHO_REQUEST) {
+ pr_debug("Warning - can't forward icmp packet from %pM to %pM: ttl exceeded\n",
+ icmp_packet->orig, icmp_packet->dst);
+ goto out;
+ }
+
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (!primary_if)
+ goto out;
+
+ /* get routing information */
+ orig_node = batadv_orig_hash_find(bat_priv, icmp_packet->orig);
+ if (!orig_node)
+ goto out;
+
+ /* create a copy of the skb, if needed, to modify it. */
+ if (skb_cow(skb, ETH_HLEN) < 0)
+ goto out;
+
+ icmp_packet = (struct batadv_icmp_packet *)skb->data;
+
+ ether_addr_copy(icmp_packet->dst, icmp_packet->orig);
+ ether_addr_copy(icmp_packet->orig, primary_if->net_dev->dev_addr);
+ icmp_packet->msg_type = BATADV_TTL_EXCEEDED;
+ icmp_packet->ttl = BATADV_TTL;
+
+ res = batadv_send_skb_to_orig(skb, orig_node, NULL);
+ if (res == NET_RX_SUCCESS)
+ ret = NET_XMIT_SUCCESS;
+
+ /* skb was consumed */
+ skb = NULL;
+
+out:
+ batadv_hardif_put(primary_if);
+ batadv_orig_node_put(orig_node);
+
+ kfree_skb(skb);
+
+ return ret;
+}
+
+/**
+ * batadv_recv_icmp_packet() - Process incoming icmp packet
+ * @skb: incoming packet buffer
+ * @recv_if: incoming hard interface
+ *
+ * Return: NET_RX_SUCCESS on success or NET_RX_DROP in case of failure
+ */
+int batadv_recv_icmp_packet(struct sk_buff *skb,
+ struct batadv_hard_iface *recv_if)
+{
+ struct batadv_priv *bat_priv = netdev_priv(recv_if->mesh_iface);
+ struct batadv_icmp_header *icmph;
+ struct batadv_icmp_packet_rr *icmp_packet_rr;
+ struct ethhdr *ethhdr;
+ struct batadv_orig_node *orig_node = NULL;
+ int hdr_size = sizeof(struct batadv_icmp_header);
+ int res, ret = NET_RX_DROP;
+
+ /* drop packet if it has not necessary minimum size */
+ if (unlikely(!pskb_may_pull(skb, hdr_size)))
+ goto free_skb;
+
+ ethhdr = eth_hdr(skb);
+
+ /* packet with unicast indication but non-unicast recipient */
+ if (!is_valid_ether_addr(ethhdr->h_dest))
+ goto free_skb;
+
+ /* packet with broadcast/multicast sender address */
+ if (is_multicast_ether_addr(ethhdr->h_source))
+ goto free_skb;
+
+ /* not for me */
+ if (!batadv_is_my_mac(bat_priv, ethhdr->h_dest))
+ goto free_skb;
+
+ icmph = (struct batadv_icmp_header *)skb->data;
+
+ /* add record route information if not full */
+ if ((icmph->msg_type == BATADV_ECHO_REPLY ||
+ icmph->msg_type == BATADV_ECHO_REQUEST) &&
+ skb->len >= sizeof(struct batadv_icmp_packet_rr)) {
+ if (skb_linearize(skb) < 0)
+ goto free_skb;
+
+ /* create a copy of the skb, if needed, to modify it. */
+ if (skb_cow(skb, ETH_HLEN) < 0)
+ goto free_skb;
+
+ ethhdr = eth_hdr(skb);
+ icmph = (struct batadv_icmp_header *)skb->data;
+ icmp_packet_rr = (struct batadv_icmp_packet_rr *)icmph;
+ if (icmp_packet_rr->rr_cur >= BATADV_RR_LEN)
+ goto free_skb;
+
+ ether_addr_copy(icmp_packet_rr->rr[icmp_packet_rr->rr_cur],
+ ethhdr->h_dest);
+ icmp_packet_rr->rr_cur++;
+ }
+
+ /* packet for me */
+ if (batadv_is_my_mac(bat_priv, icmph->dst))
+ return batadv_recv_my_icmp_packet(bat_priv, skb);
+
+ /* TTL exceeded */
+ if (icmph->ttl < 2)
+ return batadv_recv_icmp_ttl_exceeded(bat_priv, skb);
+
+ /* get routing information */
+ orig_node = batadv_orig_hash_find(bat_priv, icmph->dst);
+ if (!orig_node)
+ goto free_skb;
+
+ /* create a copy of the skb, if needed, to modify it. */
+ if (skb_cow(skb, ETH_HLEN) < 0)
+ goto put_orig_node;
+
+ icmph = (struct batadv_icmp_header *)skb->data;
+
+ /* decrement ttl */
+ icmph->ttl--;
+
+ /* route it */
+ res = batadv_send_skb_to_orig(skb, orig_node, recv_if);
+ if (res == NET_XMIT_SUCCESS)
+ ret = NET_RX_SUCCESS;
+
+ /* skb was consumed */
+ skb = NULL;
+
+put_orig_node:
+ batadv_orig_node_put(orig_node);
+free_skb:
+ kfree_skb(skb);
+
+ return ret;
+}
+
+/**
+ * batadv_check_unicast_packet() - Check for malformed unicast packets
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: packet to check
+ * @hdr_size: size of header to pull
+ *
+ * Checks for short header and bad addresses in the given packet.
+ *
+ * Return: negative value when check fails and 0 otherwise. The negative value
+ * depends on the reason: -ENODATA for bad header, -EBADR for broadcast
+ * destination or source, and -EREMOTE for non-local (other host) destination.
+ */
+static int batadv_check_unicast_packet(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, int hdr_size)
+{
+ struct ethhdr *ethhdr;
+
+ /* drop packet if it has not necessary minimum size */
+ if (unlikely(!pskb_may_pull(skb, hdr_size)))
+ return -ENODATA;
+
+ ethhdr = eth_hdr(skb);
+
+ /* packet with unicast indication but non-unicast recipient */
+ if (!is_valid_ether_addr(ethhdr->h_dest))
+ return -EBADR;
+
+ /* packet with broadcast/multicast sender address */
+ if (is_multicast_ether_addr(ethhdr->h_source))
+ return -EBADR;
+
+ /* not for me */
+ if (!batadv_is_my_mac(bat_priv, ethhdr->h_dest))
+ return -EREMOTE;
+
+ return 0;
+}
+
+/**
+ * batadv_last_bonding_get() - Get last_bonding_candidate of orig_node
+ * @orig_node: originator node whose last bonding candidate should be retrieved
+ *
+ * Return: last bonding candidate of router or NULL if not found
+ *
+ * The object is returned with refcounter increased by 1.
+ */
+static struct batadv_orig_ifinfo *
+batadv_last_bonding_get(struct batadv_orig_node *orig_node)
+{
+ struct batadv_orig_ifinfo *last_bonding_candidate;
+
+ spin_lock_bh(&orig_node->neigh_list_lock);
+ last_bonding_candidate = orig_node->last_bonding_candidate;
+
+ if (last_bonding_candidate)
+ kref_get(&last_bonding_candidate->refcount);
+ spin_unlock_bh(&orig_node->neigh_list_lock);
+
+ return last_bonding_candidate;
+}
+
+/**
+ * batadv_last_bonding_replace() - Replace last_bonding_candidate of orig_node
+ * @orig_node: originator node whose bonding candidates should be replaced
+ * @new_candidate: new bonding candidate or NULL
+ */
+static void
+batadv_last_bonding_replace(struct batadv_orig_node *orig_node,
+ struct batadv_orig_ifinfo *new_candidate)
+{
+ struct batadv_orig_ifinfo *old_candidate;
+
+ spin_lock_bh(&orig_node->neigh_list_lock);
+ old_candidate = orig_node->last_bonding_candidate;
+
+ if (new_candidate)
+ kref_get(&new_candidate->refcount);
+ orig_node->last_bonding_candidate = new_candidate;
+ spin_unlock_bh(&orig_node->neigh_list_lock);
+
+ batadv_orig_ifinfo_put(old_candidate);
+}
+
+/**
+ * batadv_find_router() - find a suitable router for this originator
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @orig_node: the destination node
+ * @recv_if: pointer to interface this packet was received on
+ *
+ * Return: the router which should be used for this orig_node on
+ * this interface, or NULL if not available.
+ */
+struct batadv_neigh_node *
+batadv_find_router(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node,
+ struct batadv_hard_iface *recv_if)
+{
+ struct batadv_algo_ops *bao = bat_priv->algo_ops;
+ struct batadv_neigh_node *first_candidate_router = NULL;
+ struct batadv_neigh_node *next_candidate_router = NULL;
+ struct batadv_neigh_node *router, *cand_router = NULL;
+ struct batadv_neigh_node *last_cand_router = NULL;
+ struct batadv_orig_ifinfo *cand, *first_candidate = NULL;
+ struct batadv_orig_ifinfo *next_candidate = NULL;
+ struct batadv_orig_ifinfo *last_candidate;
+ bool last_candidate_found = false;
+
+ if (!orig_node)
+ return NULL;
+
+ router = batadv_orig_router_get(orig_node, recv_if);
+
+ if (!router)
+ return router;
+
+ /* only consider bonding for recv_if == BATADV_IF_DEFAULT (first hop)
+ * and if activated.
+ */
+ if (!(recv_if == BATADV_IF_DEFAULT && atomic_read(&bat_priv->bonding)))
+ return router;
+
+ /* bonding: loop through the list of possible routers found
+ * for the various outgoing interfaces and find a candidate after
+ * the last chosen bonding candidate (next_candidate). If no such
+ * router is found, use the first candidate found (the previously
+ * chosen bonding candidate might have been the last one in the list).
+ * If this can't be found either, return the previously chosen
+ * router - obviously there are no other candidates.
+ */
+ rcu_read_lock();
+ last_candidate = batadv_last_bonding_get(orig_node);
+ if (last_candidate)
+ last_cand_router = rcu_dereference(last_candidate->router);
+
+ hlist_for_each_entry_rcu(cand, &orig_node->ifinfo_list, list) {
+ /* acquire some structures and references ... */
+ if (!kref_get_unless_zero(&cand->refcount))
+ continue;
+
+ cand_router = rcu_dereference(cand->router);
+ if (!cand_router)
+ goto next;
+
+ if (!kref_get_unless_zero(&cand_router->refcount)) {
+ cand_router = NULL;
+ goto next;
+ }
+
+ /* alternative candidate should be good enough to be
+ * considered
+ */
+ if (!bao->neigh.is_similar_or_better(cand_router,
+ cand->if_outgoing, router,
+ recv_if))
+ goto next;
+
+ /* don't use the same router twice */
+ if (last_cand_router == cand_router)
+ goto next;
+
+ /* mark the first possible candidate */
+ if (!first_candidate) {
+ kref_get(&cand_router->refcount);
+ kref_get(&cand->refcount);
+ first_candidate = cand;
+ first_candidate_router = cand_router;
+ }
+
+ /* check if the loop has already passed the previously selected
+ * candidate ... this function should select the next candidate
+ * AFTER the previously used bonding candidate.
+ */
+ if (!last_candidate || last_candidate_found) {
+ next_candidate = cand;
+ next_candidate_router = cand_router;
+ break;
+ }
+
+ if (last_candidate == cand)
+ last_candidate_found = true;
+next:
+ /* free references */
+ if (cand_router) {
+ batadv_neigh_node_put(cand_router);
+ cand_router = NULL;
+ }
+ batadv_orig_ifinfo_put(cand);
+ }
+ rcu_read_unlock();
+
+ /* After finding candidates, handle the three cases:
+ * 1) there is a next candidate, use that
+ * 2) there is no next candidate, use the first of the list
+ * 3) there is no candidate at all, return the default router
+ */
+ if (next_candidate) {
+ batadv_neigh_node_put(router);
+
+ kref_get(&next_candidate_router->refcount);
+ router = next_candidate_router;
+ batadv_last_bonding_replace(orig_node, next_candidate);
+ } else if (first_candidate) {
+ batadv_neigh_node_put(router);
+
+ kref_get(&first_candidate_router->refcount);
+ router = first_candidate_router;
+ batadv_last_bonding_replace(orig_node, first_candidate);
+ } else {
+ batadv_last_bonding_replace(orig_node, NULL);
+ }
+
+ /* cleanup of candidates */
+ if (first_candidate) {
+ batadv_neigh_node_put(first_candidate_router);
+ batadv_orig_ifinfo_put(first_candidate);
+ }
+
+ if (next_candidate) {
+ batadv_neigh_node_put(next_candidate_router);
+ batadv_orig_ifinfo_put(next_candidate);
+ }
+
+ batadv_orig_ifinfo_put(last_candidate);
+
+ return router;
+}
+
+static int batadv_route_unicast_packet(struct sk_buff *skb,
+ struct batadv_hard_iface *recv_if)
+{
+ struct batadv_priv *bat_priv = netdev_priv(recv_if->mesh_iface);
+ struct batadv_orig_node *orig_node = NULL;
+ struct batadv_unicast_packet *unicast_packet;
+ struct ethhdr *ethhdr = eth_hdr(skb);
+ int res, hdr_len, ret = NET_RX_DROP;
+ unsigned int len;
+
+ unicast_packet = (struct batadv_unicast_packet *)skb->data;
+
+ /* TTL exceeded */
+ if (unicast_packet->ttl < 2) {
+ pr_debug("Warning - can't forward unicast packet from %pM to %pM: ttl exceeded\n",
+ ethhdr->h_source, unicast_packet->dest);
+ goto free_skb;
+ }
+
+ /* get routing information */
+ orig_node = batadv_orig_hash_find(bat_priv, unicast_packet->dest);
+
+ if (!orig_node)
+ goto free_skb;
+
+ /* create a copy of the skb, if needed, to modify it. */
+ if (skb_cow(skb, ETH_HLEN) < 0)
+ goto put_orig_node;
+
+ /* decrement ttl */
+ unicast_packet = (struct batadv_unicast_packet *)skb->data;
+ unicast_packet->ttl--;
+
+ switch (unicast_packet->packet_type) {
+ case BATADV_UNICAST_4ADDR:
+ hdr_len = sizeof(struct batadv_unicast_4addr_packet);
+ break;
+ case BATADV_UNICAST:
+ hdr_len = sizeof(struct batadv_unicast_packet);
+ break;
+ default:
+ /* other packet types not supported - yet */
+ hdr_len = -1;
+ break;
+ }
+
+ if (hdr_len > 0)
+ batadv_skb_set_priority(skb, hdr_len);
+
+ len = skb->len;
+ res = batadv_send_skb_to_orig(skb, orig_node, recv_if);
+
+ /* translate transmit result into receive result */
+ if (res == NET_XMIT_SUCCESS) {
+ ret = NET_RX_SUCCESS;
+ /* skb was transmitted and consumed */
+ batadv_inc_counter(bat_priv, BATADV_CNT_FORWARD);
+ batadv_add_counter(bat_priv, BATADV_CNT_FORWARD_BYTES,
+ len + ETH_HLEN);
+ }
+
+ /* skb was consumed */
+ skb = NULL;
+
+put_orig_node:
+ batadv_orig_node_put(orig_node);
+free_skb:
+ kfree_skb(skb);
+
+ return ret;
+}
+
+/**
+ * batadv_reroute_unicast_packet() - update the unicast header for re-routing
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: unicast packet to process
+ * @unicast_packet: the unicast header to be updated
+ * @dst_addr: the payload destination
+ * @vid: VLAN identifier
+ *
+ * Search the translation table for dst_addr and update the unicast header with
+ * the new corresponding information (originator address where the destination
+ * client currently is and its known TTVN)
+ *
+ * Return: true if the packet header has been updated, false otherwise
+ */
+static bool
+batadv_reroute_unicast_packet(struct batadv_priv *bat_priv, struct sk_buff *skb,
+ struct batadv_unicast_packet *unicast_packet,
+ u8 *dst_addr, unsigned short vid)
+{
+ struct batadv_orig_node *orig_node = NULL;
+ struct batadv_hard_iface *primary_if = NULL;
+ bool ret = false;
+ const u8 *orig_addr;
+ u8 orig_ttvn;
+
+ if (batadv_is_my_client(bat_priv, dst_addr, vid)) {
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (!primary_if)
+ goto out;
+ orig_addr = primary_if->net_dev->dev_addr;
+ orig_ttvn = (u8)atomic_read(&bat_priv->tt.vn);
+ } else {
+ orig_node = batadv_transtable_search(bat_priv, NULL, dst_addr,
+ vid);
+ if (!orig_node)
+ goto out;
+
+ if (batadv_compare_eth(orig_node->orig, unicast_packet->dest))
+ goto out;
+
+ orig_addr = orig_node->orig;
+ orig_ttvn = (u8)atomic_read(&orig_node->last_ttvn);
+ }
+
+ /* update the packet header */
+ skb_postpull_rcsum(skb, unicast_packet, sizeof(*unicast_packet));
+ ether_addr_copy(unicast_packet->dest, orig_addr);
+ unicast_packet->ttvn = orig_ttvn;
+ skb_postpush_rcsum(skb, unicast_packet, sizeof(*unicast_packet));
+
+ ret = true;
+out:
+ batadv_hardif_put(primary_if);
+ batadv_orig_node_put(orig_node);
+
+ return ret;
+}
+
+static bool batadv_check_unicast_ttvn(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, int hdr_len)
+{
+ struct batadv_unicast_packet *unicast_packet;
+ struct batadv_hard_iface *primary_if;
+ struct batadv_orig_node *orig_node;
+ u8 curr_ttvn, old_ttvn;
+ struct ethhdr *ethhdr;
+ unsigned short vid;
+ int is_old_ttvn;
+
+ /* check if there is enough data before accessing it */
+ if (!pskb_may_pull(skb, hdr_len + ETH_HLEN))
+ return false;
+
+ /* create a copy of the skb (in case of for re-routing) to modify it. */
+ if (skb_cow(skb, sizeof(*unicast_packet)) < 0)
+ return false;
+
+ unicast_packet = (struct batadv_unicast_packet *)skb->data;
+ vid = batadv_get_vid(skb, hdr_len);
+ ethhdr = (struct ethhdr *)(skb->data + hdr_len);
+
+ /* do not reroute multicast frames in a unicast header */
+ if (is_multicast_ether_addr(ethhdr->h_dest))
+ return true;
+
+ /* check if the destination client was served by this node and it is now
+ * roaming. In this case, it means that the node has got a ROAM_ADV
+ * message and that it knows the new destination in the mesh to re-route
+ * the packet to
+ */
+ if (batadv_tt_local_client_is_roaming(bat_priv, ethhdr->h_dest, vid)) {
+ if (batadv_reroute_unicast_packet(bat_priv, skb, unicast_packet,
+ ethhdr->h_dest, vid))
+ batadv_dbg_ratelimited(BATADV_DBG_TT,
+ bat_priv,
+ "Rerouting unicast packet to %pM (dst=%pM): Local Roaming\n",
+ unicast_packet->dest,
+ ethhdr->h_dest);
+ /* at this point the mesh destination should have been
+ * substituted with the originator address found in the global
+ * table. If not, let the packet go untouched anyway because
+ * there is nothing the node can do
+ */
+ return true;
+ }
+
+ /* retrieve the TTVN known by this node for the packet destination. This
+ * value is used later to check if the node which sent (or re-routed
+ * last time) the packet had an updated information or not
+ */
+ curr_ttvn = (u8)atomic_read(&bat_priv->tt.vn);
+ if (!batadv_is_my_mac(bat_priv, unicast_packet->dest)) {
+ orig_node = batadv_orig_hash_find(bat_priv,
+ unicast_packet->dest);
+ /* if it is not possible to find the orig_node representing the
+ * destination, the packet can immediately be dropped as it will
+ * not be possible to deliver it
+ */
+ if (!orig_node)
+ return false;
+
+ curr_ttvn = (u8)atomic_read(&orig_node->last_ttvn);
+ batadv_orig_node_put(orig_node);
+ }
+
+ /* check if the TTVN contained in the packet is fresher than what the
+ * node knows
+ */
+ is_old_ttvn = batadv_seq_before(unicast_packet->ttvn, curr_ttvn);
+ if (!is_old_ttvn)
+ return true;
+
+ old_ttvn = unicast_packet->ttvn;
+ /* the packet was forged based on outdated network information. Its
+ * destination can possibly be updated and forwarded towards the new
+ * target host
+ */
+ if (batadv_reroute_unicast_packet(bat_priv, skb, unicast_packet,
+ ethhdr->h_dest, vid)) {
+ batadv_dbg_ratelimited(BATADV_DBG_TT, bat_priv,
+ "Rerouting unicast packet to %pM (dst=%pM): TTVN mismatch old_ttvn=%u new_ttvn=%u\n",
+ unicast_packet->dest, ethhdr->h_dest,
+ old_ttvn, curr_ttvn);
+ return true;
+ }
+
+ /* the packet has not been re-routed: either the destination is
+ * currently served by this node or there is no destination at all and
+ * it is possible to drop the packet
+ */
+ if (!batadv_is_my_client(bat_priv, ethhdr->h_dest, vid))
+ return false;
+
+ /* update the header in order to let the packet be delivered to this
+ * node's mesh interface
+ */
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (!primary_if)
+ return false;
+
+ /* update the packet header */
+ skb_postpull_rcsum(skb, unicast_packet, sizeof(*unicast_packet));
+ ether_addr_copy(unicast_packet->dest, primary_if->net_dev->dev_addr);
+ unicast_packet->ttvn = curr_ttvn;
+ skb_postpush_rcsum(skb, unicast_packet, sizeof(*unicast_packet));
+
+ batadv_hardif_put(primary_if);
+
+ return true;
+}
+
+/**
+ * batadv_recv_unhandled_unicast_packet() - receive and process packets which
+ * are in the unicast number space but not yet known to the implementation
+ * @skb: unicast tvlv packet to process
+ * @recv_if: pointer to interface this packet was received on
+ *
+ * Return: NET_RX_SUCCESS if the packet has been consumed or NET_RX_DROP
+ * otherwise.
+ */
+int batadv_recv_unhandled_unicast_packet(struct sk_buff *skb,
+ struct batadv_hard_iface *recv_if)
+{
+ struct batadv_unicast_packet *unicast_packet;
+ struct batadv_priv *bat_priv = netdev_priv(recv_if->mesh_iface);
+ int check, hdr_size = sizeof(*unicast_packet);
+
+ check = batadv_check_unicast_packet(bat_priv, skb, hdr_size);
+ if (check < 0)
+ goto free_skb;
+
+ /* we don't know about this type, drop it. */
+ unicast_packet = (struct batadv_unicast_packet *)skb->data;
+ if (batadv_is_my_mac(bat_priv, unicast_packet->dest))
+ goto free_skb;
+
+ return batadv_route_unicast_packet(skb, recv_if);
+
+free_skb:
+ kfree_skb(skb);
+ return NET_RX_DROP;
+}
+
+/**
+ * batadv_recv_unicast_packet() - Process incoming unicast packet
+ * @skb: incoming packet buffer
+ * @recv_if: incoming hard interface
+ *
+ * Return: NET_RX_SUCCESS on success or NET_RX_DROP in case of failure
+ */
+int batadv_recv_unicast_packet(struct sk_buff *skb,
+ struct batadv_hard_iface *recv_if)
+{
+ struct batadv_priv *bat_priv = netdev_priv(recv_if->mesh_iface);
+ struct batadv_unicast_packet *unicast_packet;
+ struct batadv_unicast_4addr_packet *unicast_4addr_packet;
+ u8 *orig_addr, *orig_addr_gw;
+ struct batadv_orig_node *orig_node = NULL, *orig_node_gw = NULL;
+ int check, hdr_size = sizeof(*unicast_packet);
+ enum batadv_subtype subtype;
+ int ret = NET_RX_DROP;
+ bool is4addr, is_gw;
+
+ unicast_packet = (struct batadv_unicast_packet *)skb->data;
+ is4addr = unicast_packet->packet_type == BATADV_UNICAST_4ADDR;
+ /* the caller function should have already pulled 2 bytes */
+ if (is4addr)
+ hdr_size = sizeof(*unicast_4addr_packet);
+
+ /* function returns -EREMOTE for promiscuous packets */
+ check = batadv_check_unicast_packet(bat_priv, skb, hdr_size);
+ if (check < 0)
+ goto free_skb;
+
+ if (!batadv_check_unicast_ttvn(bat_priv, skb, hdr_size))
+ goto free_skb;
+
+ unicast_packet = (struct batadv_unicast_packet *)skb->data;
+
+ /* packet for me */
+ if (batadv_is_my_mac(bat_priv, unicast_packet->dest)) {
+ /* If this is a unicast packet from another backgone gw,
+ * drop it.
+ */
+ orig_addr_gw = eth_hdr(skb)->h_source;
+ orig_node_gw = batadv_orig_hash_find(bat_priv, orig_addr_gw);
+ if (orig_node_gw) {
+ is_gw = batadv_bla_is_backbone_gw(skb, orig_node_gw,
+ hdr_size);
+ batadv_orig_node_put(orig_node_gw);
+ if (is_gw) {
+ batadv_dbg(BATADV_DBG_BLA, bat_priv,
+ "%s(): Dropped unicast pkt received from another backbone gw %pM.\n",
+ __func__, orig_addr_gw);
+ goto free_skb;
+ }
+ }
+
+ if (is4addr) {
+ unicast_4addr_packet =
+ (struct batadv_unicast_4addr_packet *)skb->data;
+ subtype = unicast_4addr_packet->subtype;
+ batadv_dat_inc_counter(bat_priv, subtype);
+
+ /* Only payload data should be considered for speedy
+ * join. For example, DAT also uses unicast 4addr
+ * types, but those packets should not be considered
+ * for speedy join, since the clients do not actually
+ * reside at the sending originator.
+ */
+ if (subtype == BATADV_P_DATA) {
+ orig_addr = unicast_4addr_packet->src;
+ orig_node = batadv_orig_hash_find(bat_priv,
+ orig_addr);
+ }
+ }
+
+ if (batadv_dat_snoop_incoming_arp_request(bat_priv, skb,
+ hdr_size))
+ goto rx_success;
+ if (batadv_dat_snoop_incoming_arp_reply(bat_priv, skb,
+ hdr_size))
+ goto rx_success;
+
+ batadv_dat_snoop_incoming_dhcp_ack(bat_priv, skb, hdr_size);
+
+ batadv_interface_rx(recv_if->mesh_iface, skb, hdr_size,
+ orig_node);
+
+rx_success:
+ batadv_orig_node_put(orig_node);
+
+ return NET_RX_SUCCESS;
+ }
+
+ ret = batadv_route_unicast_packet(skb, recv_if);
+ /* skb was consumed */
+ skb = NULL;
+
+free_skb:
+ kfree_skb(skb);
+
+ return ret;
+}
+
+/**
+ * batadv_recv_unicast_tvlv() - receive and process unicast tvlv packets
+ * @skb: unicast tvlv packet to process
+ * @recv_if: pointer to interface this packet was received on
+ *
+ * Return: NET_RX_SUCCESS if the packet has been consumed or NET_RX_DROP
+ * otherwise.
+ */
+int batadv_recv_unicast_tvlv(struct sk_buff *skb,
+ struct batadv_hard_iface *recv_if)
+{
+ struct batadv_priv *bat_priv = netdev_priv(recv_if->mesh_iface);
+ struct batadv_unicast_tvlv_packet *unicast_tvlv_packet;
+ unsigned char *tvlv_buff;
+ u16 tvlv_buff_len;
+ int hdr_size = sizeof(*unicast_tvlv_packet);
+ int ret = NET_RX_DROP;
+
+ if (batadv_check_unicast_packet(bat_priv, skb, hdr_size) < 0)
+ goto free_skb;
+
+ /* the header is likely to be modified while forwarding */
+ if (skb_cow(skb, hdr_size) < 0)
+ goto free_skb;
+
+ /* packet needs to be linearized to access the tvlv content */
+ if (skb_linearize(skb) < 0)
+ goto free_skb;
+
+ unicast_tvlv_packet = (struct batadv_unicast_tvlv_packet *)skb->data;
+
+ tvlv_buff = (unsigned char *)(skb->data + hdr_size);
+ tvlv_buff_len = ntohs(unicast_tvlv_packet->tvlv_len);
+
+ if (tvlv_buff_len > skb->len - hdr_size)
+ goto free_skb;
+
+ ret = batadv_tvlv_containers_process(bat_priv, BATADV_UNICAST_TVLV,
+ NULL, skb, tvlv_buff,
+ tvlv_buff_len);
+
+ if (ret != NET_RX_SUCCESS) {
+ ret = batadv_route_unicast_packet(skb, recv_if);
+ /* skb was consumed */
+ skb = NULL;
+ }
+
+free_skb:
+ kfree_skb(skb);
+
+ return ret;
+}
+
+/**
+ * batadv_recv_frag_packet() - process received fragment
+ * @skb: the received fragment
+ * @recv_if: interface that the skb is received on
+ *
+ * This function does one of the three following things: 1) Forward fragment, if
+ * the assembled packet will exceed our MTU; 2) Buffer fragment, if we still
+ * lack further fragments; 3) Merge fragments, if we have all needed parts.
+ *
+ * Return: NET_RX_DROP if the skb is not consumed, NET_RX_SUCCESS otherwise.
+ */
+int batadv_recv_frag_packet(struct sk_buff *skb,
+ struct batadv_hard_iface *recv_if)
+{
+ struct batadv_priv *bat_priv = netdev_priv(recv_if->mesh_iface);
+ struct batadv_orig_node *orig_node_src = NULL;
+ struct batadv_frag_packet *frag_packet;
+ int ret = NET_RX_DROP;
+
+ if (batadv_check_unicast_packet(bat_priv, skb,
+ sizeof(*frag_packet)) < 0)
+ goto free_skb;
+
+ frag_packet = (struct batadv_frag_packet *)skb->data;
+ orig_node_src = batadv_orig_hash_find(bat_priv, frag_packet->orig);
+ if (!orig_node_src)
+ goto free_skb;
+
+ skb->priority = frag_packet->priority + 256;
+
+ /* Route the fragment if it is not for us and too big to be merged. */
+ if (!batadv_is_my_mac(bat_priv, frag_packet->dest) &&
+ batadv_frag_skb_fwd(skb, recv_if, orig_node_src)) {
+ /* skb was consumed */
+ skb = NULL;
+ ret = NET_RX_SUCCESS;
+ goto put_orig_node;
+ }
+
+ batadv_inc_counter(bat_priv, BATADV_CNT_FRAG_RX);
+ batadv_add_counter(bat_priv, BATADV_CNT_FRAG_RX_BYTES, skb->len);
+
+ /* Add fragment to buffer and merge if possible. */
+ if (!batadv_frag_skb_buffer(&skb, orig_node_src))
+ goto put_orig_node;
+
+ /* Deliver merged packet to the appropriate handler, if it was
+ * merged
+ */
+ if (skb) {
+ batadv_batman_skb_recv(skb, recv_if->net_dev,
+ &recv_if->batman_adv_ptype, NULL);
+ /* skb was consumed */
+ skb = NULL;
+ }
+
+ ret = NET_RX_SUCCESS;
+
+put_orig_node:
+ batadv_orig_node_put(orig_node_src);
+free_skb:
+ kfree_skb(skb);
+
+ return ret;
+}
+
+/**
+ * batadv_recv_bcast_packet() - Process incoming broadcast packet
+ * @skb: incoming packet buffer
+ * @recv_if: incoming hard interface
+ *
+ * Return: NET_RX_SUCCESS on success or NET_RX_DROP in case of failure
+ */
+int batadv_recv_bcast_packet(struct sk_buff *skb,
+ struct batadv_hard_iface *recv_if)
+{
+ struct batadv_priv *bat_priv = netdev_priv(recv_if->mesh_iface);
+ struct batadv_orig_node *orig_node = NULL;
+ struct batadv_bcast_packet *bcast_packet;
+ struct ethhdr *ethhdr;
+ int hdr_size = sizeof(*bcast_packet);
+ s32 seq_diff;
+ u32 seqno;
+ int ret;
+
+ /* drop packet if it has not necessary minimum size */
+ if (unlikely(!pskb_may_pull(skb, hdr_size)))
+ goto free_skb;
+
+ ethhdr = eth_hdr(skb);
+
+ /* packet with broadcast indication but unicast recipient */
+ if (!is_broadcast_ether_addr(ethhdr->h_dest))
+ goto free_skb;
+
+ /* packet with broadcast/multicast sender address */
+ if (is_multicast_ether_addr(ethhdr->h_source))
+ goto free_skb;
+
+ /* ignore broadcasts sent by myself */
+ if (batadv_is_my_mac(bat_priv, ethhdr->h_source))
+ goto free_skb;
+
+ bcast_packet = (struct batadv_bcast_packet *)skb->data;
+
+ /* ignore broadcasts originated by myself */
+ if (batadv_is_my_mac(bat_priv, bcast_packet->orig))
+ goto free_skb;
+
+ if (bcast_packet->ttl-- < 2)
+ goto free_skb;
+
+ orig_node = batadv_orig_hash_find(bat_priv, bcast_packet->orig);
+
+ if (!orig_node)
+ goto free_skb;
+
+ spin_lock_bh(&orig_node->bcast_seqno_lock);
+
+ seqno = ntohl(bcast_packet->seqno);
+ /* check whether the packet is a duplicate */
+ if (batadv_test_bit(orig_node->bcast_bits, orig_node->last_bcast_seqno,
+ seqno))
+ goto spin_unlock;
+
+ seq_diff = seqno - orig_node->last_bcast_seqno;
+
+ /* check whether the packet is old and the host just restarted. */
+ if (batadv_window_protected(bat_priv, seq_diff,
+ BATADV_BCAST_MAX_AGE,
+ &orig_node->bcast_seqno_reset, NULL))
+ goto spin_unlock;
+
+ /* mark broadcast in flood history, update window position
+ * if required.
+ */
+ if (batadv_bit_get_packet(bat_priv, orig_node->bcast_bits, seq_diff, 1))
+ orig_node->last_bcast_seqno = seqno;
+
+ spin_unlock_bh(&orig_node->bcast_seqno_lock);
+
+ /* check whether this has been sent by another originator before */
+ if (batadv_bla_check_bcast_duplist(bat_priv, skb))
+ goto free_skb;
+
+ batadv_skb_set_priority(skb, sizeof(struct batadv_bcast_packet));
+
+ /* rebroadcast packet */
+ ret = batadv_forw_bcast_packet(bat_priv, skb, 0, false);
+ if (ret == NETDEV_TX_BUSY)
+ goto free_skb;
+
+ /* don't hand the broadcast up if it is from an originator
+ * from the same backbone.
+ */
+ if (batadv_bla_is_backbone_gw(skb, orig_node, hdr_size))
+ goto free_skb;
+
+ if (batadv_dat_snoop_incoming_arp_request(bat_priv, skb, hdr_size))
+ goto rx_success;
+ if (batadv_dat_snoop_incoming_arp_reply(bat_priv, skb, hdr_size))
+ goto rx_success;
+
+ batadv_dat_snoop_incoming_dhcp_ack(bat_priv, skb, hdr_size);
+
+ /* broadcast for me */
+ batadv_interface_rx(recv_if->mesh_iface, skb, hdr_size, orig_node);
+
+rx_success:
+ ret = NET_RX_SUCCESS;
+ goto out;
+
+spin_unlock:
+ spin_unlock_bh(&orig_node->bcast_seqno_lock);
+free_skb:
+ kfree_skb(skb);
+ ret = NET_RX_DROP;
+out:
+ batadv_orig_node_put(orig_node);
+ return ret;
+}
+
+#ifdef CONFIG_BATMAN_ADV_MCAST
+/**
+ * batadv_recv_mcast_packet() - process received batman-adv multicast packet
+ * @skb: the received batman-adv multicast packet
+ * @recv_if: interface that the skb is received on
+ *
+ * Parses the given, received batman-adv multicast packet. Depending on the
+ * contents of its TVLV forwards it and/or decapsulates it to hand it to the
+ * mesh interface.
+ *
+ * Return: NET_RX_DROP if the skb is not consumed, NET_RX_SUCCESS otherwise.
+ */
+int batadv_recv_mcast_packet(struct sk_buff *skb,
+ struct batadv_hard_iface *recv_if)
+{
+ struct batadv_priv *bat_priv = netdev_priv(recv_if->mesh_iface);
+ struct batadv_mcast_packet *mcast_packet;
+ int hdr_size = sizeof(*mcast_packet);
+ unsigned char *tvlv_buff;
+ int ret = NET_RX_DROP;
+ u16 tvlv_buff_len;
+
+ if (batadv_check_unicast_packet(bat_priv, skb, hdr_size) < 0)
+ goto free_skb;
+
+ /* create a copy of the skb, if needed, to modify it. */
+ if (skb_cow(skb, ETH_HLEN) < 0)
+ goto free_skb;
+
+ /* packet needs to be linearized to access the tvlv content */
+ if (skb_linearize(skb) < 0)
+ goto free_skb;
+
+ mcast_packet = (struct batadv_mcast_packet *)skb->data;
+ if (mcast_packet->ttl-- < 2)
+ goto free_skb;
+
+ tvlv_buff = (unsigned char *)(skb->data + hdr_size);
+ tvlv_buff_len = ntohs(mcast_packet->tvlv_len);
+
+ if (tvlv_buff_len > skb->len - hdr_size)
+ goto free_skb;
+
+ ret = batadv_tvlv_containers_process(bat_priv, BATADV_MCAST, NULL, skb,
+ tvlv_buff, tvlv_buff_len);
+ if (ret >= 0) {
+ batadv_inc_counter(bat_priv, BATADV_CNT_MCAST_RX);
+ batadv_add_counter(bat_priv, BATADV_CNT_MCAST_RX_BYTES,
+ skb->len + ETH_HLEN);
+ }
+
+ hdr_size += tvlv_buff_len;
+
+ if (ret == NET_RX_SUCCESS && (skb->len - hdr_size >= ETH_HLEN)) {
+ batadv_inc_counter(bat_priv, BATADV_CNT_MCAST_RX_LOCAL);
+ batadv_add_counter(bat_priv, BATADV_CNT_MCAST_RX_LOCAL_BYTES,
+ skb->len - hdr_size);
+
+ batadv_interface_rx(bat_priv->mesh_iface, skb, hdr_size, NULL);
+ /* skb was consumed */
+ skb = NULL;
+ }
+
+free_skb:
+ kfree_skb(skb);
+
+ return ret;
+}
+#endif /* CONFIG_BATMAN_ADV_MCAST */
diff --git a/net/batman-adv/routing.h b/net/batman-adv/routing.h
new file mode 100644
index 000000000000..e9849f032a24
--- /dev/null
+++ b/net/batman-adv/routing.h
@@ -0,0 +1,53 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner, Simon Wunderlich
+ */
+
+#ifndef _NET_BATMAN_ADV_ROUTING_H_
+#define _NET_BATMAN_ADV_ROUTING_H_
+
+#include "main.h"
+
+#include <linux/skbuff.h>
+#include <linux/types.h>
+
+bool batadv_check_management_packet(struct sk_buff *skb,
+ struct batadv_hard_iface *hard_iface,
+ int header_len);
+void batadv_update_route(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node,
+ struct batadv_hard_iface *recv_if,
+ struct batadv_neigh_node *neigh_node);
+int batadv_recv_icmp_packet(struct sk_buff *skb,
+ struct batadv_hard_iface *recv_if);
+int batadv_recv_unicast_packet(struct sk_buff *skb,
+ struct batadv_hard_iface *recv_if);
+int batadv_recv_frag_packet(struct sk_buff *skb,
+ struct batadv_hard_iface *iface);
+int batadv_recv_bcast_packet(struct sk_buff *skb,
+ struct batadv_hard_iface *recv_if);
+#ifdef CONFIG_BATMAN_ADV_MCAST
+int batadv_recv_mcast_packet(struct sk_buff *skb,
+ struct batadv_hard_iface *recv_if);
+#else
+static inline int batadv_recv_mcast_packet(struct sk_buff *skb,
+ struct batadv_hard_iface *recv_if)
+{
+ kfree_skb(skb);
+ return NET_RX_DROP;
+}
+#endif
+int batadv_recv_unicast_tvlv(struct sk_buff *skb,
+ struct batadv_hard_iface *recv_if);
+int batadv_recv_unhandled_unicast_packet(struct sk_buff *skb,
+ struct batadv_hard_iface *recv_if);
+struct batadv_neigh_node *
+batadv_find_router(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node,
+ struct batadv_hard_iface *recv_if);
+bool batadv_window_protected(struct batadv_priv *bat_priv, s32 seq_num_diff,
+ s32 seq_old_max_diff, unsigned long *last_reset,
+ bool *protection_started);
+
+#endif /* _NET_BATMAN_ADV_ROUTING_H_ */
diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c
new file mode 100644
index 000000000000..20d85c681064
--- /dev/null
+++ b/net/batman-adv/send.c
@@ -0,0 +1,1120 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner, Simon Wunderlich
+ */
+
+#include "send.h"
+#include "main.h"
+
+#include <linux/atomic.h>
+#include <linux/bug.h>
+#include <linux/byteorder/generic.h>
+#include <linux/container_of.h>
+#include <linux/errno.h>
+#include <linux/etherdevice.h>
+#include <linux/gfp.h>
+#include <linux/if.h>
+#include <linux/if_ether.h>
+#include <linux/jiffies.h>
+#include <linux/kref.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/printk.h>
+#include <linux/rcupdate.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/stddef.h>
+#include <linux/workqueue.h>
+
+#include "distributed-arp-table.h"
+#include "fragmentation.h"
+#include "gateway_client.h"
+#include "hard-interface.h"
+#include "log.h"
+#include "mesh-interface.h"
+#include "originator.h"
+#include "routing.h"
+#include "translation-table.h"
+
+static void batadv_send_outstanding_bcast_packet(struct work_struct *work);
+
+/**
+ * batadv_send_skb_packet() - send an already prepared packet
+ * @skb: the packet to send
+ * @hard_iface: the interface to use to send the broadcast packet
+ * @dst_addr: the payload destination
+ *
+ * Send out an already prepared packet to the given neighbor or broadcast it
+ * using the specified interface. Either hard_iface or neigh_node must be not
+ * NULL.
+ * If neigh_node is NULL, then the packet is broadcasted using hard_iface,
+ * otherwise it is sent as unicast to the given neighbor.
+ *
+ * Regardless of the return value, the skb is consumed.
+ *
+ * Return: A negative errno code is returned on a failure. A success does not
+ * guarantee the frame will be transmitted as it may be dropped due
+ * to congestion or traffic shaping.
+ */
+int batadv_send_skb_packet(struct sk_buff *skb,
+ struct batadv_hard_iface *hard_iface,
+ const u8 *dst_addr)
+{
+ struct ethhdr *ethhdr;
+ int ret;
+
+ if (hard_iface->if_status != BATADV_IF_ACTIVE)
+ goto send_skb_err;
+
+ if (unlikely(!hard_iface->net_dev))
+ goto send_skb_err;
+
+ if (!(hard_iface->net_dev->flags & IFF_UP)) {
+ pr_warn("Interface %s is not up - can't send packet via that interface!\n",
+ hard_iface->net_dev->name);
+ goto send_skb_err;
+ }
+
+ /* push to the ethernet header. */
+ if (batadv_skb_head_push(skb, ETH_HLEN) < 0)
+ goto send_skb_err;
+
+ skb_reset_mac_header(skb);
+
+ ethhdr = eth_hdr(skb);
+ ether_addr_copy(ethhdr->h_source, hard_iface->net_dev->dev_addr);
+ ether_addr_copy(ethhdr->h_dest, dst_addr);
+ ethhdr->h_proto = htons(ETH_P_BATMAN);
+
+ skb_set_network_header(skb, ETH_HLEN);
+ skb->protocol = htons(ETH_P_BATMAN);
+
+ skb->dev = hard_iface->net_dev;
+
+ /* dev_queue_xmit() returns a negative result on error. However on
+ * congestion and traffic shaping, it drops and returns NET_XMIT_DROP
+ * (which is > 0). This will not be treated as an error.
+ */
+ ret = dev_queue_xmit(skb);
+ return net_xmit_eval(ret);
+send_skb_err:
+ kfree_skb(skb);
+ return NET_XMIT_DROP;
+}
+
+/**
+ * batadv_send_broadcast_skb() - Send broadcast packet via hard interface
+ * @skb: packet to be transmitted (with batadv header and no outer eth header)
+ * @hard_iface: outgoing interface
+ *
+ * Return: A negative errno code is returned on a failure. A success does not
+ * guarantee the frame will be transmitted as it may be dropped due
+ * to congestion or traffic shaping.
+ */
+int batadv_send_broadcast_skb(struct sk_buff *skb,
+ struct batadv_hard_iface *hard_iface)
+{
+ static const u8 broadcast_addr[] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
+
+ return batadv_send_skb_packet(skb, hard_iface, broadcast_addr);
+}
+
+/**
+ * batadv_send_unicast_skb() - Send unicast packet to neighbor
+ * @skb: packet to be transmitted (with batadv header and no outer eth header)
+ * @neigh: neighbor which is used as next hop to destination
+ *
+ * Return: A negative errno code is returned on a failure. A success does not
+ * guarantee the frame will be transmitted as it may be dropped due
+ * to congestion or traffic shaping.
+ */
+int batadv_send_unicast_skb(struct sk_buff *skb,
+ struct batadv_neigh_node *neigh)
+{
+#ifdef CONFIG_BATMAN_ADV_BATMAN_V
+ struct batadv_hardif_neigh_node *hardif_neigh;
+#endif
+ int ret;
+
+ ret = batadv_send_skb_packet(skb, neigh->if_incoming, neigh->addr);
+
+#ifdef CONFIG_BATMAN_ADV_BATMAN_V
+ hardif_neigh = batadv_hardif_neigh_get(neigh->if_incoming, neigh->addr);
+
+ if (hardif_neigh && ret != NET_XMIT_DROP)
+ hardif_neigh->bat_v.last_unicast_tx = jiffies;
+
+ batadv_hardif_neigh_put(hardif_neigh);
+#endif
+
+ return ret;
+}
+
+/**
+ * batadv_send_skb_to_orig() - Lookup next-hop and transmit skb.
+ * @skb: Packet to be transmitted.
+ * @orig_node: Final destination of the packet.
+ * @recv_if: Interface used when receiving the packet (can be NULL).
+ *
+ * Looks up the best next-hop towards the passed originator and passes the
+ * skb on for preparation of MAC header. If the packet originated from this
+ * host, NULL can be passed as recv_if and no interface alternating is
+ * attempted.
+ *
+ * Return: negative errno code on a failure, -EINPROGRESS if the skb is
+ * buffered for later transmit or the NET_XMIT status returned by the
+ * lower routine if the packet has been passed down.
+ */
+int batadv_send_skb_to_orig(struct sk_buff *skb,
+ struct batadv_orig_node *orig_node,
+ struct batadv_hard_iface *recv_if)
+{
+ struct batadv_priv *bat_priv = orig_node->bat_priv;
+ struct batadv_neigh_node *neigh_node;
+ int ret;
+
+ /* batadv_find_router() increases neigh_nodes refcount if found. */
+ neigh_node = batadv_find_router(bat_priv, orig_node, recv_if);
+ if (!neigh_node) {
+ ret = -EINVAL;
+ goto free_skb;
+ }
+
+ /* Check if the skb is too large to send in one piece and fragment
+ * it if needed.
+ */
+ if (atomic_read(&bat_priv->fragmentation) &&
+ skb->len > neigh_node->if_incoming->net_dev->mtu) {
+ /* Fragment and send packet. */
+ ret = batadv_frag_send_packet(skb, orig_node, neigh_node);
+ /* skb was consumed */
+ skb = NULL;
+
+ goto put_neigh_node;
+ }
+
+ ret = batadv_send_unicast_skb(skb, neigh_node);
+
+ /* skb was consumed */
+ skb = NULL;
+
+put_neigh_node:
+ batadv_neigh_node_put(neigh_node);
+free_skb:
+ kfree_skb(skb);
+
+ return ret;
+}
+
+/**
+ * batadv_send_skb_push_fill_unicast() - extend the buffer and initialize the
+ * common fields for unicast packets
+ * @skb: the skb carrying the unicast header to initialize
+ * @hdr_size: amount of bytes to push at the beginning of the skb
+ * @orig_node: the destination node
+ *
+ * Return: false if the buffer extension was not possible or true otherwise.
+ */
+static bool
+batadv_send_skb_push_fill_unicast(struct sk_buff *skb, int hdr_size,
+ struct batadv_orig_node *orig_node)
+{
+ struct batadv_unicast_packet *unicast_packet;
+ u8 ttvn = (u8)atomic_read(&orig_node->last_ttvn);
+
+ if (batadv_skb_head_push(skb, hdr_size) < 0)
+ return false;
+
+ unicast_packet = (struct batadv_unicast_packet *)skb->data;
+ unicast_packet->version = BATADV_COMPAT_VERSION;
+ /* batman packet type: unicast */
+ unicast_packet->packet_type = BATADV_UNICAST;
+ /* set unicast ttl */
+ unicast_packet->ttl = BATADV_TTL;
+ /* copy the destination for faster routing */
+ ether_addr_copy(unicast_packet->dest, orig_node->orig);
+ /* set the destination tt version number */
+ unicast_packet->ttvn = ttvn;
+
+ return true;
+}
+
+/**
+ * batadv_send_skb_prepare_unicast() - encapsulate an skb with a unicast header
+ * @skb: the skb containing the payload to encapsulate
+ * @orig_node: the destination node
+ *
+ * Return: false if the payload could not be encapsulated or true otherwise.
+ */
+static bool batadv_send_skb_prepare_unicast(struct sk_buff *skb,
+ struct batadv_orig_node *orig_node)
+{
+ size_t uni_size = sizeof(struct batadv_unicast_packet);
+
+ return batadv_send_skb_push_fill_unicast(skb, uni_size, orig_node);
+}
+
+/**
+ * batadv_send_skb_prepare_unicast_4addr() - encapsulate an skb with a
+ * unicast 4addr header
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: the skb containing the payload to encapsulate
+ * @orig: the destination node
+ * @packet_subtype: the unicast 4addr packet subtype to use
+ *
+ * Return: false if the payload could not be encapsulated or true otherwise.
+ */
+bool batadv_send_skb_prepare_unicast_4addr(struct batadv_priv *bat_priv,
+ struct sk_buff *skb,
+ struct batadv_orig_node *orig,
+ int packet_subtype)
+{
+ struct batadv_hard_iface *primary_if;
+ struct batadv_unicast_4addr_packet *uc_4addr_packet;
+ bool ret = false;
+
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (!primary_if)
+ goto out;
+
+ /* Pull the header space and fill the unicast_packet substructure.
+ * We can do that because the first member of the uc_4addr_packet
+ * is of type struct unicast_packet
+ */
+ if (!batadv_send_skb_push_fill_unicast(skb, sizeof(*uc_4addr_packet),
+ orig))
+ goto out;
+
+ uc_4addr_packet = (struct batadv_unicast_4addr_packet *)skb->data;
+ uc_4addr_packet->u.packet_type = BATADV_UNICAST_4ADDR;
+ ether_addr_copy(uc_4addr_packet->src, primary_if->net_dev->dev_addr);
+ uc_4addr_packet->subtype = packet_subtype;
+ uc_4addr_packet->reserved = 0;
+
+ ret = true;
+out:
+ batadv_hardif_put(primary_if);
+ return ret;
+}
+
+/**
+ * batadv_send_skb_unicast() - encapsulate and send an skb via unicast
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: payload to send
+ * @packet_type: the batman unicast packet type to use
+ * @packet_subtype: the unicast 4addr packet subtype (only relevant for unicast
+ * 4addr packets)
+ * @orig_node: the originator to send the packet to
+ * @vid: the vid to be used to search the translation table
+ *
+ * Wrap the given skb into a batman-adv unicast or unicast-4addr header
+ * depending on whether BATADV_UNICAST or BATADV_UNICAST_4ADDR was supplied
+ * as packet_type. Then send this frame to the given orig_node.
+ *
+ * Return: NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise.
+ */
+int batadv_send_skb_unicast(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, int packet_type,
+ int packet_subtype,
+ struct batadv_orig_node *orig_node,
+ unsigned short vid)
+{
+ struct batadv_unicast_packet *unicast_packet;
+ struct ethhdr *ethhdr;
+ int ret = NET_XMIT_DROP;
+
+ if (!orig_node)
+ goto out;
+
+ switch (packet_type) {
+ case BATADV_UNICAST:
+ if (!batadv_send_skb_prepare_unicast(skb, orig_node))
+ goto out;
+ break;
+ case BATADV_UNICAST_4ADDR:
+ if (!batadv_send_skb_prepare_unicast_4addr(bat_priv, skb,
+ orig_node,
+ packet_subtype))
+ goto out;
+ break;
+ default:
+ /* this function supports UNICAST and UNICAST_4ADDR only. It
+ * should never be invoked with any other packet type
+ */
+ goto out;
+ }
+
+ /* skb->data might have been reallocated by
+ * batadv_send_skb_prepare_unicast{,_4addr}()
+ */
+ ethhdr = eth_hdr(skb);
+ unicast_packet = (struct batadv_unicast_packet *)skb->data;
+
+ /* inform the destination node that we are still missing a correct route
+ * for this client. The destination will receive this packet and will
+ * try to reroute it because the ttvn contained in the header is less
+ * than the current one
+ */
+ if (batadv_tt_global_client_is_roaming(bat_priv, ethhdr->h_dest, vid))
+ unicast_packet->ttvn = unicast_packet->ttvn - 1;
+
+ ret = batadv_send_skb_to_orig(skb, orig_node, NULL);
+ /* skb was consumed */
+ skb = NULL;
+
+out:
+ kfree_skb(skb);
+ return ret;
+}
+
+/**
+ * batadv_send_skb_via_tt_generic() - send an skb via TT lookup
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: payload to send
+ * @packet_type: the batman unicast packet type to use
+ * @packet_subtype: the unicast 4addr packet subtype (only relevant for unicast
+ * 4addr packets)
+ * @dst_hint: can be used to override the destination contained in the skb
+ * @vid: the vid to be used to search the translation table
+ *
+ * Look up the recipient node for the destination address in the ethernet
+ * header via the translation table. Wrap the given skb into a batman-adv
+ * unicast or unicast-4addr header depending on whether BATADV_UNICAST or
+ * BATADV_UNICAST_4ADDR was supplied as packet_type. Then send this frame
+ * to the according destination node.
+ *
+ * Return: NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise.
+ */
+int batadv_send_skb_via_tt_generic(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, int packet_type,
+ int packet_subtype, u8 *dst_hint,
+ unsigned short vid)
+{
+ struct ethhdr *ethhdr = (struct ethhdr *)skb->data;
+ struct batadv_orig_node *orig_node;
+ u8 *src, *dst;
+ int ret;
+
+ src = ethhdr->h_source;
+ dst = ethhdr->h_dest;
+
+ /* if we got an hint! let's send the packet to this client (if any) */
+ if (dst_hint) {
+ src = NULL;
+ dst = dst_hint;
+ }
+ orig_node = batadv_transtable_search(bat_priv, src, dst, vid);
+
+ ret = batadv_send_skb_unicast(bat_priv, skb, packet_type,
+ packet_subtype, orig_node, vid);
+
+ batadv_orig_node_put(orig_node);
+
+ return ret;
+}
+
+/**
+ * batadv_send_skb_via_gw() - send an skb via gateway lookup
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: payload to send
+ * @vid: the vid to be used to search the translation table
+ *
+ * Look up the currently selected gateway. Wrap the given skb into a batman-adv
+ * unicast header and send this frame to this gateway node.
+ *
+ * Return: NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise.
+ */
+int batadv_send_skb_via_gw(struct batadv_priv *bat_priv, struct sk_buff *skb,
+ unsigned short vid)
+{
+ struct batadv_orig_node *orig_node;
+ int ret;
+
+ orig_node = batadv_gw_get_selected_orig(bat_priv);
+ ret = batadv_send_skb_unicast(bat_priv, skb, BATADV_UNICAST_4ADDR,
+ BATADV_P_DATA, orig_node, vid);
+
+ batadv_orig_node_put(orig_node);
+
+ return ret;
+}
+
+/**
+ * batadv_forw_packet_free() - free a forwarding packet
+ * @forw_packet: The packet to free
+ * @dropped: whether the packet is freed because is dropped
+ *
+ * This frees a forwarding packet and releases any resources it might
+ * have claimed.
+ */
+void batadv_forw_packet_free(struct batadv_forw_packet *forw_packet,
+ bool dropped)
+{
+ if (dropped)
+ kfree_skb(forw_packet->skb);
+ else
+ consume_skb(forw_packet->skb);
+
+ batadv_hardif_put(forw_packet->if_incoming);
+ batadv_hardif_put(forw_packet->if_outgoing);
+ if (forw_packet->queue_left)
+ atomic_inc(forw_packet->queue_left);
+ kfree(forw_packet);
+}
+
+/**
+ * batadv_forw_packet_alloc() - allocate a forwarding packet
+ * @if_incoming: The (optional) if_incoming to be grabbed
+ * @if_outgoing: The (optional) if_outgoing to be grabbed
+ * @queue_left: The (optional) queue counter to decrease
+ * @bat_priv: The bat_priv for the mesh of this forw_packet
+ * @skb: The raw packet this forwarding packet shall contain
+ *
+ * Allocates a forwarding packet and tries to get a reference to the
+ * (optional) if_incoming, if_outgoing and queue_left. If queue_left
+ * is NULL then bat_priv is optional, too.
+ *
+ * Return: An allocated forwarding packet on success, NULL otherwise.
+ */
+struct batadv_forw_packet *
+batadv_forw_packet_alloc(struct batadv_hard_iface *if_incoming,
+ struct batadv_hard_iface *if_outgoing,
+ atomic_t *queue_left,
+ struct batadv_priv *bat_priv,
+ struct sk_buff *skb)
+{
+ struct batadv_forw_packet *forw_packet;
+ const char *qname;
+
+ if (queue_left && !batadv_atomic_dec_not_zero(queue_left)) {
+ qname = "unknown";
+
+ if (queue_left == &bat_priv->bcast_queue_left)
+ qname = "bcast";
+
+ if (queue_left == &bat_priv->batman_queue_left)
+ qname = "batman";
+
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "%s queue is full\n", qname);
+
+ return NULL;
+ }
+
+ forw_packet = kmalloc(sizeof(*forw_packet), GFP_ATOMIC);
+ if (!forw_packet)
+ goto err;
+
+ if (if_incoming)
+ kref_get(&if_incoming->refcount);
+
+ if (if_outgoing)
+ kref_get(&if_outgoing->refcount);
+
+ INIT_HLIST_NODE(&forw_packet->list);
+ INIT_HLIST_NODE(&forw_packet->cleanup_list);
+ forw_packet->skb = skb;
+ forw_packet->queue_left = queue_left;
+ forw_packet->if_incoming = if_incoming;
+ forw_packet->if_outgoing = if_outgoing;
+ forw_packet->num_packets = 1;
+
+ return forw_packet;
+
+err:
+ if (queue_left)
+ atomic_inc(queue_left);
+
+ return NULL;
+}
+
+/**
+ * batadv_forw_packet_was_stolen() - check whether someone stole this packet
+ * @forw_packet: the forwarding packet to check
+ *
+ * This function checks whether the given forwarding packet was claimed by
+ * someone else for free().
+ *
+ * Return: True if someone stole it, false otherwise.
+ */
+static bool
+batadv_forw_packet_was_stolen(struct batadv_forw_packet *forw_packet)
+{
+ return !hlist_unhashed(&forw_packet->cleanup_list);
+}
+
+/**
+ * batadv_forw_packet_steal() - claim a forw_packet for free()
+ * @forw_packet: the forwarding packet to steal
+ * @lock: a key to the store to steal from (e.g. forw_{bat,bcast}_list_lock)
+ *
+ * This function tries to steal a specific forw_packet from global
+ * visibility for the purpose of getting it for free(). That means
+ * the caller is *not* allowed to requeue it afterwards.
+ *
+ * Return: True if stealing was successful. False if someone else stole it
+ * before us.
+ */
+bool batadv_forw_packet_steal(struct batadv_forw_packet *forw_packet,
+ spinlock_t *lock)
+{
+ /* did purging routine steal it earlier? */
+ spin_lock_bh(lock);
+ if (batadv_forw_packet_was_stolen(forw_packet)) {
+ spin_unlock_bh(lock);
+ return false;
+ }
+
+ hlist_del_init(&forw_packet->list);
+
+ /* Just to spot misuse of this function */
+ hlist_add_fake(&forw_packet->cleanup_list);
+
+ spin_unlock_bh(lock);
+ return true;
+}
+
+/**
+ * batadv_forw_packet_list_steal() - claim a list of forward packets for free()
+ * @forw_list: the to be stolen forward packets
+ * @cleanup_list: a backup pointer, to be able to dispose the packet later
+ * @hard_iface: the interface to steal forward packets from
+ *
+ * This function claims responsibility to free any forw_packet queued on the
+ * given hard_iface. If hard_iface is NULL forwarding packets on all hard
+ * interfaces will be claimed.
+ *
+ * The packets are being moved from the forw_list to the cleanup_list. This
+ * makes it possible for already running threads to notice the claim.
+ */
+static void
+batadv_forw_packet_list_steal(struct hlist_head *forw_list,
+ struct hlist_head *cleanup_list,
+ const struct batadv_hard_iface *hard_iface)
+{
+ struct batadv_forw_packet *forw_packet;
+ struct hlist_node *safe_tmp_node;
+
+ hlist_for_each_entry_safe(forw_packet, safe_tmp_node,
+ forw_list, list) {
+ /* if purge_outstanding_packets() was called with an argument
+ * we delete only packets belonging to the given interface
+ */
+ if (hard_iface &&
+ forw_packet->if_incoming != hard_iface &&
+ forw_packet->if_outgoing != hard_iface)
+ continue;
+
+ hlist_del(&forw_packet->list);
+ hlist_add_head(&forw_packet->cleanup_list, cleanup_list);
+ }
+}
+
+/**
+ * batadv_forw_packet_list_free() - free a list of forward packets
+ * @head: a list of to be freed forw_packets
+ *
+ * This function cancels the scheduling of any packet in the provided list,
+ * waits for any possibly running packet forwarding thread to finish and
+ * finally, safely frees this forward packet.
+ *
+ * This function might sleep.
+ */
+static void batadv_forw_packet_list_free(struct hlist_head *head)
+{
+ struct batadv_forw_packet *forw_packet;
+ struct hlist_node *safe_tmp_node;
+
+ hlist_for_each_entry_safe(forw_packet, safe_tmp_node, head,
+ cleanup_list) {
+ cancel_delayed_work_sync(&forw_packet->delayed_work);
+
+ hlist_del(&forw_packet->cleanup_list);
+ batadv_forw_packet_free(forw_packet, true);
+ }
+}
+
+/**
+ * batadv_forw_packet_queue() - try to queue a forwarding packet
+ * @forw_packet: the forwarding packet to queue
+ * @lock: a key to the store (e.g. forw_{bat,bcast}_list_lock)
+ * @head: the shelve to queue it on (e.g. forw_{bat,bcast}_list)
+ * @send_time: timestamp (jiffies) when the packet is to be sent
+ *
+ * This function tries to (re)queue a forwarding packet. Requeuing
+ * is prevented if the according interface is shutting down
+ * (e.g. if batadv_forw_packet_list_steal() was called for this
+ * packet earlier).
+ *
+ * Calling batadv_forw_packet_queue() after a call to
+ * batadv_forw_packet_steal() is forbidden!
+ *
+ * Caller needs to ensure that forw_packet->delayed_work was initialized.
+ */
+static void batadv_forw_packet_queue(struct batadv_forw_packet *forw_packet,
+ spinlock_t *lock, struct hlist_head *head,
+ unsigned long send_time)
+{
+ spin_lock_bh(lock);
+
+ /* did purging routine steal it from us? */
+ if (batadv_forw_packet_was_stolen(forw_packet)) {
+ /* If you got it for free() without trouble, then
+ * don't get back into the queue after stealing...
+ */
+ WARN_ONCE(hlist_fake(&forw_packet->cleanup_list),
+ "Requeuing after batadv_forw_packet_steal() not allowed!\n");
+
+ spin_unlock_bh(lock);
+ return;
+ }
+
+ hlist_del_init(&forw_packet->list);
+ hlist_add_head(&forw_packet->list, head);
+
+ queue_delayed_work(batadv_event_workqueue,
+ &forw_packet->delayed_work,
+ send_time - jiffies);
+ spin_unlock_bh(lock);
+}
+
+/**
+ * batadv_forw_packet_bcast_queue() - try to queue a broadcast packet
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @forw_packet: the forwarding packet to queue
+ * @send_time: timestamp (jiffies) when the packet is to be sent
+ *
+ * This function tries to (re)queue a broadcast packet.
+ *
+ * Caller needs to ensure that forw_packet->delayed_work was initialized.
+ */
+static void
+batadv_forw_packet_bcast_queue(struct batadv_priv *bat_priv,
+ struct batadv_forw_packet *forw_packet,
+ unsigned long send_time)
+{
+ batadv_forw_packet_queue(forw_packet, &bat_priv->forw_bcast_list_lock,
+ &bat_priv->forw_bcast_list, send_time);
+}
+
+/**
+ * batadv_forw_packet_ogmv1_queue() - try to queue an OGMv1 packet
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @forw_packet: the forwarding packet to queue
+ * @send_time: timestamp (jiffies) when the packet is to be sent
+ *
+ * This function tries to (re)queue an OGMv1 packet.
+ *
+ * Caller needs to ensure that forw_packet->delayed_work was initialized.
+ */
+void batadv_forw_packet_ogmv1_queue(struct batadv_priv *bat_priv,
+ struct batadv_forw_packet *forw_packet,
+ unsigned long send_time)
+{
+ batadv_forw_packet_queue(forw_packet, &bat_priv->forw_bat_list_lock,
+ &bat_priv->forw_bat_list, send_time);
+}
+
+/**
+ * batadv_forw_bcast_packet_to_list() - queue broadcast packet for transmissions
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: broadcast packet to add
+ * @delay: number of jiffies to wait before sending
+ * @own_packet: true if it is a self-generated broadcast packet
+ * @if_in: the interface where the packet was received on
+ * @if_out: the outgoing interface to queue on
+ *
+ * Adds a broadcast packet to the queue and sets up timers. Broadcast packets
+ * are sent multiple times to increase probability for being received.
+ *
+ * This call clones the given skb, hence the caller needs to take into
+ * account that the data segment of the original skb might not be
+ * modifiable anymore.
+ *
+ * Return: NETDEV_TX_OK on success and NETDEV_TX_BUSY on errors.
+ */
+static int batadv_forw_bcast_packet_to_list(struct batadv_priv *bat_priv,
+ struct sk_buff *skb,
+ unsigned long delay,
+ bool own_packet,
+ struct batadv_hard_iface *if_in,
+ struct batadv_hard_iface *if_out)
+{
+ struct batadv_forw_packet *forw_packet;
+ unsigned long send_time = jiffies;
+ struct sk_buff *newskb;
+
+ newskb = skb_clone(skb, GFP_ATOMIC);
+ if (!newskb)
+ goto err;
+
+ forw_packet = batadv_forw_packet_alloc(if_in, if_out,
+ &bat_priv->bcast_queue_left,
+ bat_priv, newskb);
+ if (!forw_packet)
+ goto err_packet_free;
+
+ forw_packet->own = own_packet;
+
+ INIT_DELAYED_WORK(&forw_packet->delayed_work,
+ batadv_send_outstanding_bcast_packet);
+
+ send_time += delay ? delay : msecs_to_jiffies(5);
+
+ batadv_forw_packet_bcast_queue(bat_priv, forw_packet, send_time);
+ return NETDEV_TX_OK;
+
+err_packet_free:
+ kfree_skb(newskb);
+err:
+ return NETDEV_TX_BUSY;
+}
+
+/**
+ * batadv_forw_bcast_packet_if() - forward and queue a broadcast packet
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: broadcast packet to add
+ * @delay: number of jiffies to wait before sending
+ * @own_packet: true if it is a self-generated broadcast packet
+ * @if_in: the interface where the packet was received on
+ * @if_out: the outgoing interface to forward to
+ *
+ * Transmits a broadcast packet on the specified interface either immediately
+ * or if a delay is given after that. Furthermore, queues additional
+ * retransmissions if this interface is a wireless one.
+ *
+ * This call clones the given skb, hence the caller needs to take into
+ * account that the data segment of the original skb might not be
+ * modifiable anymore.
+ *
+ * Return: NETDEV_TX_OK on success and NETDEV_TX_BUSY on errors.
+ */
+static int batadv_forw_bcast_packet_if(struct batadv_priv *bat_priv,
+ struct sk_buff *skb,
+ unsigned long delay,
+ bool own_packet,
+ struct batadv_hard_iface *if_in,
+ struct batadv_hard_iface *if_out)
+{
+ unsigned int num_bcasts = if_out->num_bcasts;
+ struct sk_buff *newskb;
+ int ret = NETDEV_TX_OK;
+
+ if (!delay) {
+ newskb = skb_clone(skb, GFP_ATOMIC);
+ if (!newskb)
+ return NETDEV_TX_BUSY;
+
+ batadv_send_broadcast_skb(newskb, if_out);
+ num_bcasts--;
+ }
+
+ /* delayed broadcast or rebroadcasts? */
+ if (num_bcasts >= 1) {
+ BATADV_SKB_CB(skb)->num_bcasts = num_bcasts;
+
+ ret = batadv_forw_bcast_packet_to_list(bat_priv, skb, delay,
+ own_packet, if_in,
+ if_out);
+ }
+
+ return ret;
+}
+
+/**
+ * batadv_send_no_broadcast() - check whether (re)broadcast is necessary
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: broadcast packet to check
+ * @own_packet: true if it is a self-generated broadcast packet
+ * @if_out: the outgoing interface checked and considered for (re)broadcast
+ *
+ * Return: False if a packet needs to be (re)broadcasted on the given interface,
+ * true otherwise.
+ */
+static bool batadv_send_no_broadcast(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, bool own_packet,
+ struct batadv_hard_iface *if_out)
+{
+ struct batadv_hardif_neigh_node *neigh_node = NULL;
+ struct batadv_bcast_packet *bcast_packet;
+ u8 *orig_neigh;
+ u8 *neigh_addr;
+ char *type;
+ int ret;
+
+ if (!own_packet) {
+ neigh_addr = eth_hdr(skb)->h_source;
+ neigh_node = batadv_hardif_neigh_get(if_out,
+ neigh_addr);
+ }
+
+ bcast_packet = (struct batadv_bcast_packet *)skb->data;
+ orig_neigh = neigh_node ? neigh_node->orig : NULL;
+
+ ret = batadv_hardif_no_broadcast(if_out, bcast_packet->orig,
+ orig_neigh);
+
+ batadv_hardif_neigh_put(neigh_node);
+
+ /* ok, may broadcast */
+ if (!ret)
+ return false;
+
+ /* no broadcast */
+ switch (ret) {
+ case BATADV_HARDIF_BCAST_NORECIPIENT:
+ type = "no neighbor";
+ break;
+ case BATADV_HARDIF_BCAST_DUPFWD:
+ type = "single neighbor is source";
+ break;
+ case BATADV_HARDIF_BCAST_DUPORIG:
+ type = "single neighbor is originator";
+ break;
+ default:
+ type = "unknown";
+ }
+
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "BCAST packet from orig %pM on %s suppressed: %s\n",
+ bcast_packet->orig,
+ if_out->net_dev->name, type);
+
+ return true;
+}
+
+/**
+ * __batadv_forw_bcast_packet() - forward and queue a broadcast packet
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: broadcast packet to add
+ * @delay: number of jiffies to wait before sending
+ * @own_packet: true if it is a self-generated broadcast packet
+ *
+ * Transmits a broadcast packet either immediately or if a delay is given
+ * after that. Furthermore, queues additional retransmissions on wireless
+ * interfaces.
+ *
+ * This call clones the given skb, hence the caller needs to take into
+ * account that the data segment of the given skb might not be
+ * modifiable anymore.
+ *
+ * Return: NETDEV_TX_OK on success and NETDEV_TX_BUSY on errors.
+ */
+static int __batadv_forw_bcast_packet(struct batadv_priv *bat_priv,
+ struct sk_buff *skb,
+ unsigned long delay,
+ bool own_packet)
+{
+ struct batadv_hard_iface *hard_iface;
+ struct batadv_hard_iface *primary_if;
+ struct list_head *iter;
+ int ret = NETDEV_TX_OK;
+
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (!primary_if)
+ return NETDEV_TX_BUSY;
+
+ rcu_read_lock();
+ netdev_for_each_lower_private_rcu(bat_priv->mesh_iface, hard_iface, iter) {
+ if (!kref_get_unless_zero(&hard_iface->refcount))
+ continue;
+
+ if (batadv_send_no_broadcast(bat_priv, skb, own_packet,
+ hard_iface)) {
+ batadv_hardif_put(hard_iface);
+ continue;
+ }
+
+ ret = batadv_forw_bcast_packet_if(bat_priv, skb, delay,
+ own_packet, primary_if,
+ hard_iface);
+ batadv_hardif_put(hard_iface);
+
+ if (ret == NETDEV_TX_BUSY)
+ break;
+ }
+ rcu_read_unlock();
+
+ batadv_hardif_put(primary_if);
+ return ret;
+}
+
+/**
+ * batadv_forw_bcast_packet() - forward and queue a broadcast packet
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: broadcast packet to add
+ * @delay: number of jiffies to wait before sending
+ * @own_packet: true if it is a self-generated broadcast packet
+ *
+ * Transmits a broadcast packet either immediately or if a delay is given
+ * after that. Furthermore, queues additional retransmissions on wireless
+ * interfaces.
+ *
+ * Return: NETDEV_TX_OK on success and NETDEV_TX_BUSY on errors.
+ */
+int batadv_forw_bcast_packet(struct batadv_priv *bat_priv,
+ struct sk_buff *skb,
+ unsigned long delay,
+ bool own_packet)
+{
+ return __batadv_forw_bcast_packet(bat_priv, skb, delay, own_packet);
+}
+
+/**
+ * batadv_send_bcast_packet() - send and queue a broadcast packet
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: broadcast packet to add
+ * @delay: number of jiffies to wait before sending
+ * @own_packet: true if it is a self-generated broadcast packet
+ *
+ * Transmits a broadcast packet either immediately or if a delay is given
+ * after that. Furthermore, queues additional retransmissions on wireless
+ * interfaces.
+ *
+ * Consumes the provided skb.
+ */
+void batadv_send_bcast_packet(struct batadv_priv *bat_priv,
+ struct sk_buff *skb,
+ unsigned long delay,
+ bool own_packet)
+{
+ __batadv_forw_bcast_packet(bat_priv, skb, delay, own_packet);
+ consume_skb(skb);
+}
+
+/**
+ * batadv_forw_packet_bcasts_left() - check if a retransmission is necessary
+ * @forw_packet: the forwarding packet to check
+ *
+ * Checks whether a given packet has any (re)transmissions left on the provided
+ * interface.
+ *
+ * hard_iface may be NULL: In that case the number of transmissions this skb had
+ * so far is compared with the maximum amount of retransmissions independent of
+ * any interface instead.
+ *
+ * Return: True if (re)transmissions are left, false otherwise.
+ */
+static bool
+batadv_forw_packet_bcasts_left(struct batadv_forw_packet *forw_packet)
+{
+ return BATADV_SKB_CB(forw_packet->skb)->num_bcasts;
+}
+
+/**
+ * batadv_forw_packet_bcasts_dec() - decrement retransmission counter of a
+ * packet
+ * @forw_packet: the packet to decrease the counter for
+ */
+static void
+batadv_forw_packet_bcasts_dec(struct batadv_forw_packet *forw_packet)
+{
+ BATADV_SKB_CB(forw_packet->skb)->num_bcasts--;
+}
+
+/**
+ * batadv_forw_packet_is_rebroadcast() - check packet for previous transmissions
+ * @forw_packet: the packet to check
+ *
+ * Return: True if this packet was transmitted before, false otherwise.
+ */
+bool batadv_forw_packet_is_rebroadcast(struct batadv_forw_packet *forw_packet)
+{
+ unsigned char num_bcasts = BATADV_SKB_CB(forw_packet->skb)->num_bcasts;
+
+ return num_bcasts != forw_packet->if_outgoing->num_bcasts;
+}
+
+/**
+ * batadv_send_outstanding_bcast_packet() - transmit a queued broadcast packet
+ * @work: work queue item
+ *
+ * Transmits a queued broadcast packet and if necessary reschedules it.
+ */
+static void batadv_send_outstanding_bcast_packet(struct work_struct *work)
+{
+ unsigned long send_time = jiffies + msecs_to_jiffies(5);
+ struct batadv_forw_packet *forw_packet;
+ struct delayed_work *delayed_work;
+ struct batadv_priv *bat_priv;
+ struct sk_buff *skb1;
+ bool dropped = false;
+
+ delayed_work = to_delayed_work(work);
+ forw_packet = container_of(delayed_work, struct batadv_forw_packet,
+ delayed_work);
+ bat_priv = netdev_priv(forw_packet->if_incoming->mesh_iface);
+
+ if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_DEACTIVATING) {
+ dropped = true;
+ goto out;
+ }
+
+ if (batadv_dat_drop_broadcast_packet(bat_priv, forw_packet)) {
+ dropped = true;
+ goto out;
+ }
+
+ /* send a copy of the saved skb */
+ skb1 = skb_clone(forw_packet->skb, GFP_ATOMIC);
+ if (!skb1)
+ goto out;
+
+ batadv_send_broadcast_skb(skb1, forw_packet->if_outgoing);
+ batadv_forw_packet_bcasts_dec(forw_packet);
+
+ if (batadv_forw_packet_bcasts_left(forw_packet)) {
+ batadv_forw_packet_bcast_queue(bat_priv, forw_packet,
+ send_time);
+ return;
+ }
+
+out:
+ /* do we get something for free()? */
+ if (batadv_forw_packet_steal(forw_packet,
+ &bat_priv->forw_bcast_list_lock))
+ batadv_forw_packet_free(forw_packet, dropped);
+}
+
+/**
+ * batadv_purge_outstanding_packets() - stop/purge scheduled bcast/OGMv1 packets
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @hard_iface: the hard interface to cancel and purge bcast/ogm packets on
+ *
+ * This method cancels and purges any broadcast and OGMv1 packet on the given
+ * hard_iface. If hard_iface is NULL, broadcast and OGMv1 packets on all hard
+ * interfaces will be canceled and purged.
+ *
+ * This function might sleep.
+ */
+void
+batadv_purge_outstanding_packets(struct batadv_priv *bat_priv,
+ const struct batadv_hard_iface *hard_iface)
+{
+ struct hlist_head head = HLIST_HEAD_INIT;
+
+ if (hard_iface)
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "%s(): %s\n",
+ __func__, hard_iface->net_dev->name);
+ else
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "%s()\n", __func__);
+
+ /* claim bcast list for free() */
+ spin_lock_bh(&bat_priv->forw_bcast_list_lock);
+ batadv_forw_packet_list_steal(&bat_priv->forw_bcast_list, &head,
+ hard_iface);
+ spin_unlock_bh(&bat_priv->forw_bcast_list_lock);
+
+ /* claim batman packet list for free() */
+ spin_lock_bh(&bat_priv->forw_bat_list_lock);
+ batadv_forw_packet_list_steal(&bat_priv->forw_bat_list, &head,
+ hard_iface);
+ spin_unlock_bh(&bat_priv->forw_bat_list_lock);
+
+ /* then cancel or wait for packet workers to finish and free */
+ batadv_forw_packet_list_free(&head);
+}
diff --git a/net/batman-adv/send.h b/net/batman-adv/send.h
new file mode 100644
index 000000000000..3415afec4a0c
--- /dev/null
+++ b/net/batman-adv/send.h
@@ -0,0 +1,116 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner, Simon Wunderlich
+ */
+
+#ifndef _NET_BATMAN_ADV_SEND_H_
+#define _NET_BATMAN_ADV_SEND_H_
+
+#include "main.h"
+
+#include <linux/compiler.h>
+#include <linux/skbuff.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+#include <uapi/linux/batadv_packet.h>
+
+void batadv_forw_packet_free(struct batadv_forw_packet *forw_packet,
+ bool dropped);
+struct batadv_forw_packet *
+batadv_forw_packet_alloc(struct batadv_hard_iface *if_incoming,
+ struct batadv_hard_iface *if_outgoing,
+ atomic_t *queue_left,
+ struct batadv_priv *bat_priv,
+ struct sk_buff *skb);
+bool batadv_forw_packet_steal(struct batadv_forw_packet *packet, spinlock_t *l);
+void batadv_forw_packet_ogmv1_queue(struct batadv_priv *bat_priv,
+ struct batadv_forw_packet *forw_packet,
+ unsigned long send_time);
+bool batadv_forw_packet_is_rebroadcast(struct batadv_forw_packet *forw_packet);
+
+int batadv_send_skb_to_orig(struct sk_buff *skb,
+ struct batadv_orig_node *orig_node,
+ struct batadv_hard_iface *recv_if);
+int batadv_send_skb_packet(struct sk_buff *skb,
+ struct batadv_hard_iface *hard_iface,
+ const u8 *dst_addr);
+int batadv_send_broadcast_skb(struct sk_buff *skb,
+ struct batadv_hard_iface *hard_iface);
+int batadv_send_unicast_skb(struct sk_buff *skb,
+ struct batadv_neigh_node *neigh_node);
+int batadv_forw_bcast_packet(struct batadv_priv *bat_priv,
+ struct sk_buff *skb,
+ unsigned long delay,
+ bool own_packet);
+void batadv_send_bcast_packet(struct batadv_priv *bat_priv,
+ struct sk_buff *skb,
+ unsigned long delay,
+ bool own_packet);
+void
+batadv_purge_outstanding_packets(struct batadv_priv *bat_priv,
+ const struct batadv_hard_iface *hard_iface);
+bool batadv_send_skb_prepare_unicast_4addr(struct batadv_priv *bat_priv,
+ struct sk_buff *skb,
+ struct batadv_orig_node *orig_node,
+ int packet_subtype);
+int batadv_send_skb_unicast(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, int packet_type,
+ int packet_subtype,
+ struct batadv_orig_node *orig_node,
+ unsigned short vid);
+int batadv_send_skb_via_tt_generic(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, int packet_type,
+ int packet_subtype, u8 *dst_hint,
+ unsigned short vid);
+int batadv_send_skb_via_gw(struct batadv_priv *bat_priv, struct sk_buff *skb,
+ unsigned short vid);
+
+/**
+ * batadv_send_skb_via_tt() - send an skb via TT lookup
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: the payload to send
+ * @dst_hint: can be used to override the destination contained in the skb
+ * @vid: the vid to be used to search the translation table
+ *
+ * Look up the recipient node for the destination address in the ethernet
+ * header via the translation table. Wrap the given skb into a batman-adv
+ * unicast header. Then send this frame to the according destination node.
+ *
+ * Return: NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise.
+ */
+static inline int batadv_send_skb_via_tt(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, u8 *dst_hint,
+ unsigned short vid)
+{
+ return batadv_send_skb_via_tt_generic(bat_priv, skb, BATADV_UNICAST, 0,
+ dst_hint, vid);
+}
+
+/**
+ * batadv_send_skb_via_tt_4addr() - send an skb via TT lookup
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: the payload to send
+ * @packet_subtype: the unicast 4addr packet subtype to use
+ * @dst_hint: can be used to override the destination contained in the skb
+ * @vid: the vid to be used to search the translation table
+ *
+ * Look up the recipient node for the destination address in the ethernet
+ * header via the translation table. Wrap the given skb into a batman-adv
+ * unicast-4addr header. Then send this frame to the according destination
+ * node.
+ *
+ * Return: NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise.
+ */
+static inline int batadv_send_skb_via_tt_4addr(struct batadv_priv *bat_priv,
+ struct sk_buff *skb,
+ int packet_subtype,
+ u8 *dst_hint,
+ unsigned short vid)
+{
+ return batadv_send_skb_via_tt_generic(bat_priv, skb,
+ BATADV_UNICAST_4ADDR,
+ packet_subtype, dst_hint, vid);
+}
+
+#endif /* _NET_BATMAN_ADV_SEND_H_ */
diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c
new file mode 100644
index 000000000000..350b149e48be
--- /dev/null
+++ b/net/batman-adv/tp_meter.c
@@ -0,0 +1,1490 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) B.A.T.M.A.N. contributors:
+ *
+ * Edo Monticelli, Antonio Quartulli
+ */
+
+#include "tp_meter.h"
+#include "main.h"
+
+#include <linux/atomic.h>
+#include <linux/build_bug.h>
+#include <linux/byteorder/generic.h>
+#include <linux/cache.h>
+#include <linux/compiler.h>
+#include <linux/container_of.h>
+#include <linux/err.h>
+#include <linux/etherdevice.h>
+#include <linux/gfp.h>
+#include <linux/if_ether.h>
+#include <linux/init.h>
+#include <linux/jiffies.h>
+#include <linux/kref.h>
+#include <linux/kthread.h>
+#include <linux/limits.h>
+#include <linux/list.h>
+#include <linux/minmax.h>
+#include <linux/netdevice.h>
+#include <linux/param.h>
+#include <linux/printk.h>
+#include <linux/random.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
+#include <linux/sched.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/stddef.h>
+#include <linux/string.h>
+#include <linux/timer.h>
+#include <linux/wait.h>
+#include <linux/workqueue.h>
+#include <uapi/linux/batadv_packet.h>
+#include <uapi/linux/batman_adv.h>
+
+#include "hard-interface.h"
+#include "log.h"
+#include "netlink.h"
+#include "originator.h"
+#include "send.h"
+
+/**
+ * BATADV_TP_DEF_TEST_LENGTH - Default test length if not specified by the user
+ * in milliseconds
+ */
+#define BATADV_TP_DEF_TEST_LENGTH 10000
+
+/**
+ * BATADV_TP_AWND - Advertised window by the receiver (in bytes)
+ */
+#define BATADV_TP_AWND 0x20000000
+
+/**
+ * BATADV_TP_RECV_TIMEOUT - Receiver activity timeout. If the receiver does not
+ * get anything for such amount of milliseconds, the connection is killed
+ */
+#define BATADV_TP_RECV_TIMEOUT 1000
+
+/**
+ * BATADV_TP_MAX_RTO - Maximum sender timeout. If the sender RTO gets beyond
+ * such amount of milliseconds, the receiver is considered unreachable and the
+ * connection is killed
+ */
+#define BATADV_TP_MAX_RTO 30000
+
+/**
+ * BATADV_TP_FIRST_SEQ - First seqno of each session. The number is rather high
+ * in order to immediately trigger a wrap around (test purposes)
+ */
+#define BATADV_TP_FIRST_SEQ ((u32)-1 - 2000)
+
+/**
+ * BATADV_TP_PLEN - length of the payload (data after the batadv_unicast header)
+ * to simulate
+ */
+#define BATADV_TP_PLEN (BATADV_TP_PACKET_LEN - ETH_HLEN - \
+ sizeof(struct batadv_unicast_packet))
+
+static u8 batadv_tp_prerandom[4096] __read_mostly;
+
+/**
+ * batadv_tp_session_cookie() - generate session cookie based on session ids
+ * @session: TP session identifier
+ * @icmp_uid: icmp pseudo uid of the tp session
+ *
+ * Return: 32 bit tp_meter session cookie
+ */
+static u32 batadv_tp_session_cookie(const u8 session[2], u8 icmp_uid)
+{
+ u32 cookie;
+
+ cookie = icmp_uid << 16;
+ cookie |= session[0] << 8;
+ cookie |= session[1];
+
+ return cookie;
+}
+
+/**
+ * batadv_tp_cwnd() - compute the new cwnd size
+ * @base: base cwnd size value
+ * @increment: the value to add to base to get the new size
+ * @min: minimum cwnd value (usually MSS)
+ *
+ * Return the new cwnd size and ensure it does not exceed the Advertised
+ * Receiver Window size. It is wrapped around safely.
+ * For details refer to Section 3.1 of RFC5681
+ *
+ * Return: new congestion window size in bytes
+ */
+static u32 batadv_tp_cwnd(u32 base, u32 increment, u32 min)
+{
+ u32 new_size = base + increment;
+
+ /* check for wrap-around */
+ if (new_size < base)
+ new_size = (u32)ULONG_MAX;
+
+ new_size = min_t(u32, new_size, BATADV_TP_AWND);
+
+ return max_t(u32, new_size, min);
+}
+
+/**
+ * batadv_tp_update_cwnd() - update the Congestion Windows
+ * @tp_vars: the private data of the current TP meter session
+ * @mss: maximum segment size of transmission
+ *
+ * 1) if the session is in Slow Start, the CWND has to be increased by 1
+ * MSS every unique received ACK
+ * 2) if the session is in Congestion Avoidance, the CWND has to be
+ * increased by MSS * MSS / CWND for every unique received ACK
+ */
+static void batadv_tp_update_cwnd(struct batadv_tp_vars *tp_vars, u32 mss)
+{
+ spin_lock_bh(&tp_vars->cwnd_lock);
+
+ /* slow start... */
+ if (tp_vars->cwnd <= tp_vars->ss_threshold) {
+ tp_vars->dec_cwnd = 0;
+ tp_vars->cwnd = batadv_tp_cwnd(tp_vars->cwnd, mss, mss);
+ spin_unlock_bh(&tp_vars->cwnd_lock);
+ return;
+ }
+
+ /* increment CWND at least of 1 (section 3.1 of RFC5681) */
+ tp_vars->dec_cwnd += max_t(u32, 1U << 3,
+ ((mss * mss) << 6) / (tp_vars->cwnd << 3));
+ if (tp_vars->dec_cwnd < (mss << 3)) {
+ spin_unlock_bh(&tp_vars->cwnd_lock);
+ return;
+ }
+
+ tp_vars->cwnd = batadv_tp_cwnd(tp_vars->cwnd, mss, mss);
+ tp_vars->dec_cwnd = 0;
+
+ spin_unlock_bh(&tp_vars->cwnd_lock);
+}
+
+/**
+ * batadv_tp_update_rto() - calculate new retransmission timeout
+ * @tp_vars: the private data of the current TP meter session
+ * @new_rtt: new roundtrip time in msec
+ */
+static void batadv_tp_update_rto(struct batadv_tp_vars *tp_vars,
+ u32 new_rtt)
+{
+ long m = new_rtt;
+
+ /* RTT update
+ * Details in Section 2.2 and 2.3 of RFC6298
+ *
+ * It's tricky to understand. Don't lose hair please.
+ * Inspired by tcp_rtt_estimator() tcp_input.c
+ */
+ if (tp_vars->srtt != 0) {
+ m -= (tp_vars->srtt >> 3); /* m is now error in rtt est */
+ tp_vars->srtt += m; /* rtt = 7/8 srtt + 1/8 new */
+ if (m < 0)
+ m = -m;
+
+ m -= (tp_vars->rttvar >> 2);
+ tp_vars->rttvar += m; /* mdev ~= 3/4 rttvar + 1/4 new */
+ } else {
+ /* first measure getting in */
+ tp_vars->srtt = m << 3; /* take the measured time to be srtt */
+ tp_vars->rttvar = m << 1; /* new_rtt / 2 */
+ }
+
+ /* rto = srtt + 4 * rttvar.
+ * rttvar is scaled by 4, therefore doesn't need to be multiplied
+ */
+ tp_vars->rto = (tp_vars->srtt >> 3) + tp_vars->rttvar;
+}
+
+/**
+ * batadv_tp_batctl_notify() - send client status result to client
+ * @reason: reason for tp meter session stop
+ * @dst: destination of tp_meter session
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @start_time: start of transmission in jiffies
+ * @total_sent: bytes acked to the receiver
+ * @cookie: cookie of tp_meter session
+ */
+static void batadv_tp_batctl_notify(enum batadv_tp_meter_reason reason,
+ const u8 *dst, struct batadv_priv *bat_priv,
+ unsigned long start_time, u64 total_sent,
+ u32 cookie)
+{
+ u32 test_time;
+ u8 result;
+ u32 total_bytes;
+
+ if (!batadv_tp_is_error(reason)) {
+ result = BATADV_TP_REASON_COMPLETE;
+ test_time = jiffies_to_msecs(jiffies - start_time);
+ total_bytes = total_sent;
+ } else {
+ result = reason;
+ test_time = 0;
+ total_bytes = 0;
+ }
+
+ batadv_netlink_tpmeter_notify(bat_priv, dst, result, test_time,
+ total_bytes, cookie);
+}
+
+/**
+ * batadv_tp_batctl_error_notify() - send client error result to client
+ * @reason: reason for tp meter session stop
+ * @dst: destination of tp_meter session
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @cookie: cookie of tp_meter session
+ */
+static void batadv_tp_batctl_error_notify(enum batadv_tp_meter_reason reason,
+ const u8 *dst,
+ struct batadv_priv *bat_priv,
+ u32 cookie)
+{
+ batadv_tp_batctl_notify(reason, dst, bat_priv, 0, 0, cookie);
+}
+
+/**
+ * batadv_tp_list_find() - find a tp_vars object in the global list
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @dst: the other endpoint MAC address to look for
+ *
+ * Look for a tp_vars object matching dst as end_point and return it after
+ * having increment the refcounter. Return NULL is not found
+ *
+ * Return: matching tp_vars or NULL when no tp_vars with @dst was found
+ */
+static struct batadv_tp_vars *batadv_tp_list_find(struct batadv_priv *bat_priv,
+ const u8 *dst)
+{
+ struct batadv_tp_vars *pos, *tp_vars = NULL;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(pos, &bat_priv->tp_list, list) {
+ if (!batadv_compare_eth(pos->other_end, dst))
+ continue;
+
+ /* most of the time this function is invoked during the normal
+ * process..it makes sens to pay more when the session is
+ * finished and to speed the process up during the measurement
+ */
+ if (unlikely(!kref_get_unless_zero(&pos->refcount)))
+ continue;
+
+ tp_vars = pos;
+ break;
+ }
+ rcu_read_unlock();
+
+ return tp_vars;
+}
+
+/**
+ * batadv_tp_list_find_session() - find tp_vars session object in the global
+ * list
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @dst: the other endpoint MAC address to look for
+ * @session: session identifier
+ *
+ * Look for a tp_vars object matching dst as end_point, session as tp meter
+ * session and return it after having increment the refcounter. Return NULL
+ * is not found
+ *
+ * Return: matching tp_vars or NULL when no tp_vars was found
+ */
+static struct batadv_tp_vars *
+batadv_tp_list_find_session(struct batadv_priv *bat_priv, const u8 *dst,
+ const u8 *session)
+{
+ struct batadv_tp_vars *pos, *tp_vars = NULL;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(pos, &bat_priv->tp_list, list) {
+ if (!batadv_compare_eth(pos->other_end, dst))
+ continue;
+
+ if (memcmp(pos->session, session, sizeof(pos->session)) != 0)
+ continue;
+
+ /* most of the time this function is invoked during the normal
+ * process..it makes sense to pay more when the session is
+ * finished and to speed the process up during the measurement
+ */
+ if (unlikely(!kref_get_unless_zero(&pos->refcount)))
+ continue;
+
+ tp_vars = pos;
+ break;
+ }
+ rcu_read_unlock();
+
+ return tp_vars;
+}
+
+/**
+ * batadv_tp_vars_release() - release batadv_tp_vars from lists and queue for
+ * free after rcu grace period
+ * @ref: kref pointer of the batadv_tp_vars
+ */
+static void batadv_tp_vars_release(struct kref *ref)
+{
+ struct batadv_tp_vars *tp_vars;
+ struct batadv_tp_unacked *un, *safe;
+
+ tp_vars = container_of(ref, struct batadv_tp_vars, refcount);
+
+ /* lock should not be needed because this object is now out of any
+ * context!
+ */
+ spin_lock_bh(&tp_vars->unacked_lock);
+ list_for_each_entry_safe(un, safe, &tp_vars->unacked_list, list) {
+ list_del(&un->list);
+ kfree(un);
+ }
+ spin_unlock_bh(&tp_vars->unacked_lock);
+
+ kfree_rcu(tp_vars, rcu);
+}
+
+/**
+ * batadv_tp_vars_put() - decrement the batadv_tp_vars refcounter and possibly
+ * release it
+ * @tp_vars: the private data of the current TP meter session to be free'd
+ */
+static void batadv_tp_vars_put(struct batadv_tp_vars *tp_vars)
+{
+ if (!tp_vars)
+ return;
+
+ kref_put(&tp_vars->refcount, batadv_tp_vars_release);
+}
+
+/**
+ * batadv_tp_sender_cleanup() - cleanup sender data and drop and timer
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @tp_vars: the private data of the current TP meter session to cleanup
+ */
+static void batadv_tp_sender_cleanup(struct batadv_priv *bat_priv,
+ struct batadv_tp_vars *tp_vars)
+{
+ cancel_delayed_work(&tp_vars->finish_work);
+
+ spin_lock_bh(&tp_vars->bat_priv->tp_list_lock);
+ hlist_del_rcu(&tp_vars->list);
+ spin_unlock_bh(&tp_vars->bat_priv->tp_list_lock);
+
+ /* drop list reference */
+ batadv_tp_vars_put(tp_vars);
+
+ atomic_dec(&tp_vars->bat_priv->tp_num);
+
+ /* kill the timer and remove its reference */
+ timer_delete_sync(&tp_vars->timer);
+ /* the worker might have rearmed itself therefore we kill it again. Note
+ * that if the worker should run again before invoking the following
+ * timer_delete(), it would not re-arm itself once again because the status
+ * is OFF now
+ */
+ timer_delete(&tp_vars->timer);
+ batadv_tp_vars_put(tp_vars);
+}
+
+/**
+ * batadv_tp_sender_end() - print info about ended session and inform client
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @tp_vars: the private data of the current TP meter session
+ */
+static void batadv_tp_sender_end(struct batadv_priv *bat_priv,
+ struct batadv_tp_vars *tp_vars)
+{
+ u32 session_cookie;
+
+ batadv_dbg(BATADV_DBG_TP_METER, bat_priv,
+ "Test towards %pM finished..shutting down (reason=%d)\n",
+ tp_vars->other_end, tp_vars->reason);
+
+ batadv_dbg(BATADV_DBG_TP_METER, bat_priv,
+ "Last timing stats: SRTT=%ums RTTVAR=%ums RTO=%ums\n",
+ tp_vars->srtt >> 3, tp_vars->rttvar >> 2, tp_vars->rto);
+
+ batadv_dbg(BATADV_DBG_TP_METER, bat_priv,
+ "Final values: cwnd=%u ss_threshold=%u\n",
+ tp_vars->cwnd, tp_vars->ss_threshold);
+
+ session_cookie = batadv_tp_session_cookie(tp_vars->session,
+ tp_vars->icmp_uid);
+
+ batadv_tp_batctl_notify(tp_vars->reason,
+ tp_vars->other_end,
+ bat_priv,
+ tp_vars->start_time,
+ atomic64_read(&tp_vars->tot_sent),
+ session_cookie);
+}
+
+/**
+ * batadv_tp_sender_shutdown() - let sender thread/timer stop gracefully
+ * @tp_vars: the private data of the current TP meter session
+ * @reason: reason for tp meter session stop
+ */
+static void batadv_tp_sender_shutdown(struct batadv_tp_vars *tp_vars,
+ enum batadv_tp_meter_reason reason)
+{
+ if (!atomic_dec_and_test(&tp_vars->sending))
+ return;
+
+ tp_vars->reason = reason;
+}
+
+/**
+ * batadv_tp_sender_finish() - stop sender session after test_length was reached
+ * @work: delayed work reference of the related tp_vars
+ */
+static void batadv_tp_sender_finish(struct work_struct *work)
+{
+ struct delayed_work *delayed_work;
+ struct batadv_tp_vars *tp_vars;
+
+ delayed_work = to_delayed_work(work);
+ tp_vars = container_of(delayed_work, struct batadv_tp_vars,
+ finish_work);
+
+ batadv_tp_sender_shutdown(tp_vars, BATADV_TP_REASON_COMPLETE);
+}
+
+/**
+ * batadv_tp_reset_sender_timer() - reschedule the sender timer
+ * @tp_vars: the private TP meter data for this session
+ *
+ * Reschedule the timer using tp_vars->rto as delay
+ */
+static void batadv_tp_reset_sender_timer(struct batadv_tp_vars *tp_vars)
+{
+ /* most of the time this function is invoked while normal packet
+ * reception...
+ */
+ if (unlikely(atomic_read(&tp_vars->sending) == 0))
+ /* timer ref will be dropped in batadv_tp_sender_cleanup */
+ return;
+
+ mod_timer(&tp_vars->timer, jiffies + msecs_to_jiffies(tp_vars->rto));
+}
+
+/**
+ * batadv_tp_sender_timeout() - timer that fires in case of packet loss
+ * @t: address to timer_list inside tp_vars
+ *
+ * If fired it means that there was packet loss.
+ * Switch to Slow Start, set the ss_threshold to half of the current cwnd and
+ * reset the cwnd to 3*MSS
+ */
+static void batadv_tp_sender_timeout(struct timer_list *t)
+{
+ struct batadv_tp_vars *tp_vars = timer_container_of(tp_vars, t, timer);
+ struct batadv_priv *bat_priv = tp_vars->bat_priv;
+
+ if (atomic_read(&tp_vars->sending) == 0)
+ return;
+
+ /* if the user waited long enough...shutdown the test */
+ if (unlikely(tp_vars->rto >= BATADV_TP_MAX_RTO)) {
+ batadv_tp_sender_shutdown(tp_vars,
+ BATADV_TP_REASON_DST_UNREACHABLE);
+ return;
+ }
+
+ /* RTO exponential backoff
+ * Details in Section 5.5 of RFC6298
+ */
+ tp_vars->rto <<= 1;
+
+ spin_lock_bh(&tp_vars->cwnd_lock);
+
+ tp_vars->ss_threshold = tp_vars->cwnd >> 1;
+ if (tp_vars->ss_threshold < BATADV_TP_PLEN * 2)
+ tp_vars->ss_threshold = BATADV_TP_PLEN * 2;
+
+ batadv_dbg(BATADV_DBG_TP_METER, bat_priv,
+ "Meter: RTO fired during test towards %pM! cwnd=%u new ss_thr=%u, resetting last_sent to %u\n",
+ tp_vars->other_end, tp_vars->cwnd, tp_vars->ss_threshold,
+ atomic_read(&tp_vars->last_acked));
+
+ tp_vars->cwnd = BATADV_TP_PLEN * 3;
+
+ spin_unlock_bh(&tp_vars->cwnd_lock);
+
+ /* resend the non-ACKed packets.. */
+ tp_vars->last_sent = atomic_read(&tp_vars->last_acked);
+ wake_up(&tp_vars->more_bytes);
+
+ batadv_tp_reset_sender_timer(tp_vars);
+}
+
+/**
+ * batadv_tp_fill_prerandom() - Fill buffer with prefetched random bytes
+ * @tp_vars: the private TP meter data for this session
+ * @buf: Buffer to fill with bytes
+ * @nbytes: amount of pseudorandom bytes
+ */
+static void batadv_tp_fill_prerandom(struct batadv_tp_vars *tp_vars,
+ u8 *buf, size_t nbytes)
+{
+ u32 local_offset;
+ size_t bytes_inbuf;
+ size_t to_copy;
+ size_t pos = 0;
+
+ spin_lock_bh(&tp_vars->prerandom_lock);
+ local_offset = tp_vars->prerandom_offset;
+ tp_vars->prerandom_offset += nbytes;
+ tp_vars->prerandom_offset %= sizeof(batadv_tp_prerandom);
+ spin_unlock_bh(&tp_vars->prerandom_lock);
+
+ while (nbytes) {
+ local_offset %= sizeof(batadv_tp_prerandom);
+ bytes_inbuf = sizeof(batadv_tp_prerandom) - local_offset;
+ to_copy = min(nbytes, bytes_inbuf);
+
+ memcpy(&buf[pos], &batadv_tp_prerandom[local_offset], to_copy);
+ pos += to_copy;
+ nbytes -= to_copy;
+ local_offset = 0;
+ }
+}
+
+/**
+ * batadv_tp_send_msg() - send a single message
+ * @tp_vars: the private TP meter data for this session
+ * @src: source mac address
+ * @orig_node: the originator of the destination
+ * @seqno: sequence number of this packet
+ * @len: length of the entire packet
+ * @session: session identifier
+ * @uid: local ICMP "socket" index
+ * @timestamp: timestamp in jiffies which is replied in ack
+ *
+ * Create and send a single TP Meter message.
+ *
+ * Return: 0 on success, BATADV_TP_REASON_DST_UNREACHABLE if the destination is
+ * not reachable, BATADV_TP_REASON_MEMORY_ERROR if the packet couldn't be
+ * allocated
+ */
+static int batadv_tp_send_msg(struct batadv_tp_vars *tp_vars, const u8 *src,
+ struct batadv_orig_node *orig_node,
+ u32 seqno, size_t len, const u8 *session,
+ int uid, u32 timestamp)
+{
+ struct batadv_icmp_tp_packet *icmp;
+ struct sk_buff *skb;
+ int r;
+ u8 *data;
+ size_t data_len;
+
+ skb = netdev_alloc_skb_ip_align(NULL, len + ETH_HLEN);
+ if (unlikely(!skb))
+ return BATADV_TP_REASON_MEMORY_ERROR;
+
+ skb_reserve(skb, ETH_HLEN);
+ icmp = skb_put(skb, sizeof(*icmp));
+
+ /* fill the icmp header */
+ ether_addr_copy(icmp->dst, orig_node->orig);
+ ether_addr_copy(icmp->orig, src);
+ icmp->version = BATADV_COMPAT_VERSION;
+ icmp->packet_type = BATADV_ICMP;
+ icmp->ttl = BATADV_TTL;
+ icmp->msg_type = BATADV_TP;
+ icmp->uid = uid;
+
+ icmp->subtype = BATADV_TP_MSG;
+ memcpy(icmp->session, session, sizeof(icmp->session));
+ icmp->seqno = htonl(seqno);
+ icmp->timestamp = htonl(timestamp);
+
+ data_len = len - sizeof(*icmp);
+ data = skb_put(skb, data_len);
+ batadv_tp_fill_prerandom(tp_vars, data, data_len);
+
+ r = batadv_send_skb_to_orig(skb, orig_node, NULL);
+ if (r == NET_XMIT_SUCCESS)
+ return 0;
+
+ return BATADV_TP_REASON_CANT_SEND;
+}
+
+/**
+ * batadv_tp_recv_ack() - ACK receiving function
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: the buffer containing the received packet
+ *
+ * Process a received TP ACK packet
+ */
+static void batadv_tp_recv_ack(struct batadv_priv *bat_priv,
+ const struct sk_buff *skb)
+{
+ struct batadv_hard_iface *primary_if = NULL;
+ struct batadv_orig_node *orig_node = NULL;
+ const struct batadv_icmp_tp_packet *icmp;
+ struct batadv_tp_vars *tp_vars;
+ const unsigned char *dev_addr;
+ size_t packet_len, mss;
+ u32 rtt, recv_ack, cwnd;
+
+ packet_len = BATADV_TP_PLEN;
+ mss = BATADV_TP_PLEN;
+ packet_len += sizeof(struct batadv_unicast_packet);
+
+ icmp = (struct batadv_icmp_tp_packet *)skb->data;
+
+ /* find the tp_vars */
+ tp_vars = batadv_tp_list_find_session(bat_priv, icmp->orig,
+ icmp->session);
+ if (unlikely(!tp_vars))
+ return;
+
+ if (unlikely(atomic_read(&tp_vars->sending) == 0))
+ goto out;
+
+ /* old ACK? silently drop it.. */
+ if (batadv_seq_before(ntohl(icmp->seqno),
+ (u32)atomic_read(&tp_vars->last_acked)))
+ goto out;
+
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (unlikely(!primary_if))
+ goto out;
+
+ orig_node = batadv_orig_hash_find(bat_priv, icmp->orig);
+ if (unlikely(!orig_node))
+ goto out;
+
+ /* update RTO with the new sampled RTT, if any */
+ rtt = jiffies_to_msecs(jiffies) - ntohl(icmp->timestamp);
+ if (icmp->timestamp && rtt)
+ batadv_tp_update_rto(tp_vars, rtt);
+
+ /* ACK for new data... reset the timer */
+ batadv_tp_reset_sender_timer(tp_vars);
+
+ recv_ack = ntohl(icmp->seqno);
+
+ /* check if this ACK is a duplicate */
+ if (atomic_read(&tp_vars->last_acked) == recv_ack) {
+ atomic_inc(&tp_vars->dup_acks);
+ if (atomic_read(&tp_vars->dup_acks) != 3)
+ goto out;
+
+ if (recv_ack >= tp_vars->recover)
+ goto out;
+
+ /* if this is the third duplicate ACK do Fast Retransmit */
+ batadv_tp_send_msg(tp_vars, primary_if->net_dev->dev_addr,
+ orig_node, recv_ack, packet_len,
+ icmp->session, icmp->uid,
+ jiffies_to_msecs(jiffies));
+
+ spin_lock_bh(&tp_vars->cwnd_lock);
+
+ /* Fast Recovery */
+ tp_vars->fast_recovery = true;
+ /* Set recover to the last outstanding seqno when Fast Recovery
+ * is entered. RFC6582, Section 3.2, step 1
+ */
+ tp_vars->recover = tp_vars->last_sent;
+ tp_vars->ss_threshold = tp_vars->cwnd >> 1;
+ batadv_dbg(BATADV_DBG_TP_METER, bat_priv,
+ "Meter: Fast Recovery, (cur cwnd=%u) ss_thr=%u last_sent=%u recv_ack=%u\n",
+ tp_vars->cwnd, tp_vars->ss_threshold,
+ tp_vars->last_sent, recv_ack);
+ tp_vars->cwnd = batadv_tp_cwnd(tp_vars->ss_threshold, 3 * mss,
+ mss);
+ tp_vars->dec_cwnd = 0;
+ tp_vars->last_sent = recv_ack;
+
+ spin_unlock_bh(&tp_vars->cwnd_lock);
+ } else {
+ /* count the acked data */
+ atomic64_add(recv_ack - atomic_read(&tp_vars->last_acked),
+ &tp_vars->tot_sent);
+ /* reset the duplicate ACKs counter */
+ atomic_set(&tp_vars->dup_acks, 0);
+
+ if (tp_vars->fast_recovery) {
+ /* partial ACK */
+ if (batadv_seq_before(recv_ack, tp_vars->recover)) {
+ /* this is another hole in the window. React
+ * immediately as specified by NewReno (see
+ * Section 3.2 of RFC6582 for details)
+ */
+ dev_addr = primary_if->net_dev->dev_addr;
+ batadv_tp_send_msg(tp_vars, dev_addr,
+ orig_node, recv_ack,
+ packet_len, icmp->session,
+ icmp->uid,
+ jiffies_to_msecs(jiffies));
+ tp_vars->cwnd = batadv_tp_cwnd(tp_vars->cwnd,
+ mss, mss);
+ } else {
+ tp_vars->fast_recovery = false;
+ /* set cwnd to the value of ss_threshold at the
+ * moment that Fast Recovery was entered.
+ * RFC6582, Section 3.2, step 3
+ */
+ cwnd = batadv_tp_cwnd(tp_vars->ss_threshold, 0,
+ mss);
+ tp_vars->cwnd = cwnd;
+ }
+ goto move_twnd;
+ }
+
+ if (recv_ack - atomic_read(&tp_vars->last_acked) >= mss)
+ batadv_tp_update_cwnd(tp_vars, mss);
+move_twnd:
+ /* move the Transmit Window */
+ atomic_set(&tp_vars->last_acked, recv_ack);
+ }
+
+ wake_up(&tp_vars->more_bytes);
+out:
+ batadv_hardif_put(primary_if);
+ batadv_orig_node_put(orig_node);
+ batadv_tp_vars_put(tp_vars);
+}
+
+/**
+ * batadv_tp_avail() - check if congestion window is not full
+ * @tp_vars: the private data of the current TP meter session
+ * @payload_len: size of the payload of a single message
+ *
+ * Return: true when congestion window is not full, false otherwise
+ */
+static bool batadv_tp_avail(struct batadv_tp_vars *tp_vars,
+ size_t payload_len)
+{
+ u32 win_left, win_limit;
+
+ win_limit = atomic_read(&tp_vars->last_acked) + tp_vars->cwnd;
+ win_left = win_limit - tp_vars->last_sent;
+
+ return win_left >= payload_len;
+}
+
+/**
+ * batadv_tp_wait_available() - wait until congestion window becomes free or
+ * timeout is reached
+ * @tp_vars: the private data of the current TP meter session
+ * @plen: size of the payload of a single message
+ *
+ * Return: 0 if the condition evaluated to false after the timeout elapsed,
+ * 1 if the condition evaluated to true after the timeout elapsed, the
+ * remaining jiffies (at least 1) if the condition evaluated to true before
+ * the timeout elapsed, or -ERESTARTSYS if it was interrupted by a signal.
+ */
+static int batadv_tp_wait_available(struct batadv_tp_vars *tp_vars, size_t plen)
+{
+ int ret;
+
+ ret = wait_event_interruptible_timeout(tp_vars->more_bytes,
+ batadv_tp_avail(tp_vars, plen),
+ HZ / 10);
+
+ return ret;
+}
+
+/**
+ * batadv_tp_send() - main sending thread of a tp meter session
+ * @arg: address of the related tp_vars
+ *
+ * Return: nothing, this function never returns
+ */
+static int batadv_tp_send(void *arg)
+{
+ struct batadv_tp_vars *tp_vars = arg;
+ struct batadv_priv *bat_priv = tp_vars->bat_priv;
+ struct batadv_hard_iface *primary_if = NULL;
+ struct batadv_orig_node *orig_node = NULL;
+ size_t payload_len, packet_len;
+ int err = 0;
+
+ if (unlikely(tp_vars->role != BATADV_TP_SENDER)) {
+ err = BATADV_TP_REASON_DST_UNREACHABLE;
+ tp_vars->reason = err;
+ goto out;
+ }
+
+ orig_node = batadv_orig_hash_find(bat_priv, tp_vars->other_end);
+ if (unlikely(!orig_node)) {
+ err = BATADV_TP_REASON_DST_UNREACHABLE;
+ tp_vars->reason = err;
+ goto out;
+ }
+
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (unlikely(!primary_if)) {
+ err = BATADV_TP_REASON_DST_UNREACHABLE;
+ tp_vars->reason = err;
+ goto out;
+ }
+
+ /* assume that all the hard_interfaces have a correctly
+ * configured MTU, so use the mesh_iface MTU as MSS.
+ * This might not be true and in that case the fragmentation
+ * should be used.
+ * Now, try to send the packet as it is
+ */
+ payload_len = BATADV_TP_PLEN;
+ BUILD_BUG_ON(sizeof(struct batadv_icmp_tp_packet) > BATADV_TP_PLEN);
+
+ batadv_tp_reset_sender_timer(tp_vars);
+
+ /* queue the worker in charge of terminating the test */
+ queue_delayed_work(batadv_event_workqueue, &tp_vars->finish_work,
+ msecs_to_jiffies(tp_vars->test_length));
+
+ while (atomic_read(&tp_vars->sending) != 0) {
+ if (unlikely(!batadv_tp_avail(tp_vars, payload_len))) {
+ batadv_tp_wait_available(tp_vars, payload_len);
+ continue;
+ }
+
+ /* to emulate normal unicast traffic, add to the payload len
+ * the size of the unicast header
+ */
+ packet_len = payload_len + sizeof(struct batadv_unicast_packet);
+
+ err = batadv_tp_send_msg(tp_vars, primary_if->net_dev->dev_addr,
+ orig_node, tp_vars->last_sent,
+ packet_len,
+ tp_vars->session, tp_vars->icmp_uid,
+ jiffies_to_msecs(jiffies));
+
+ /* something went wrong during the preparation/transmission */
+ if (unlikely(err && err != BATADV_TP_REASON_CANT_SEND)) {
+ batadv_dbg(BATADV_DBG_TP_METER, bat_priv,
+ "Meter: %s() cannot send packets (%d)\n",
+ __func__, err);
+ /* ensure nobody else tries to stop the thread now */
+ if (atomic_dec_and_test(&tp_vars->sending))
+ tp_vars->reason = err;
+ break;
+ }
+
+ /* right-shift the TWND */
+ if (!err)
+ tp_vars->last_sent += payload_len;
+
+ cond_resched();
+ }
+
+out:
+ batadv_hardif_put(primary_if);
+ batadv_orig_node_put(orig_node);
+
+ batadv_tp_sender_end(bat_priv, tp_vars);
+ batadv_tp_sender_cleanup(bat_priv, tp_vars);
+
+ batadv_tp_vars_put(tp_vars);
+
+ return 0;
+}
+
+/**
+ * batadv_tp_start_kthread() - start new thread which manages the tp meter
+ * sender
+ * @tp_vars: the private data of the current TP meter session
+ */
+static void batadv_tp_start_kthread(struct batadv_tp_vars *tp_vars)
+{
+ struct task_struct *kthread;
+ struct batadv_priv *bat_priv = tp_vars->bat_priv;
+ u32 session_cookie;
+
+ kref_get(&tp_vars->refcount);
+ kthread = kthread_create(batadv_tp_send, tp_vars, "kbatadv_tp_meter");
+ if (IS_ERR(kthread)) {
+ session_cookie = batadv_tp_session_cookie(tp_vars->session,
+ tp_vars->icmp_uid);
+ pr_err("batadv: cannot create tp meter kthread\n");
+ batadv_tp_batctl_error_notify(BATADV_TP_REASON_MEMORY_ERROR,
+ tp_vars->other_end,
+ bat_priv, session_cookie);
+
+ /* drop reserved reference for kthread */
+ batadv_tp_vars_put(tp_vars);
+
+ /* cleanup of failed tp meter variables */
+ batadv_tp_sender_cleanup(bat_priv, tp_vars);
+ return;
+ }
+
+ wake_up_process(kthread);
+}
+
+/**
+ * batadv_tp_start() - start a new tp meter session
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @dst: the receiver MAC address
+ * @test_length: test length in milliseconds
+ * @cookie: session cookie
+ */
+void batadv_tp_start(struct batadv_priv *bat_priv, const u8 *dst,
+ u32 test_length, u32 *cookie)
+{
+ struct batadv_tp_vars *tp_vars;
+ u8 session_id[2];
+ u8 icmp_uid;
+ u32 session_cookie;
+
+ get_random_bytes(session_id, sizeof(session_id));
+ get_random_bytes(&icmp_uid, 1);
+ session_cookie = batadv_tp_session_cookie(session_id, icmp_uid);
+ *cookie = session_cookie;
+
+ /* look for an already existing test towards this node */
+ spin_lock_bh(&bat_priv->tp_list_lock);
+ tp_vars = batadv_tp_list_find(bat_priv, dst);
+ if (tp_vars) {
+ spin_unlock_bh(&bat_priv->tp_list_lock);
+ batadv_tp_vars_put(tp_vars);
+ batadv_dbg(BATADV_DBG_TP_METER, bat_priv,
+ "Meter: test to or from the same node already ongoing, aborting\n");
+ batadv_tp_batctl_error_notify(BATADV_TP_REASON_ALREADY_ONGOING,
+ dst, bat_priv, session_cookie);
+ return;
+ }
+
+ if (!atomic_add_unless(&bat_priv->tp_num, 1, BATADV_TP_MAX_NUM)) {
+ spin_unlock_bh(&bat_priv->tp_list_lock);
+ batadv_dbg(BATADV_DBG_TP_METER, bat_priv,
+ "Meter: too many ongoing sessions, aborting (SEND)\n");
+ batadv_tp_batctl_error_notify(BATADV_TP_REASON_TOO_MANY, dst,
+ bat_priv, session_cookie);
+ return;
+ }
+
+ tp_vars = kmalloc(sizeof(*tp_vars), GFP_ATOMIC);
+ if (!tp_vars) {
+ spin_unlock_bh(&bat_priv->tp_list_lock);
+ batadv_dbg(BATADV_DBG_TP_METER, bat_priv,
+ "Meter: %s cannot allocate list elements\n",
+ __func__);
+ batadv_tp_batctl_error_notify(BATADV_TP_REASON_MEMORY_ERROR,
+ dst, bat_priv, session_cookie);
+ return;
+ }
+
+ /* initialize tp_vars */
+ ether_addr_copy(tp_vars->other_end, dst);
+ kref_init(&tp_vars->refcount);
+ tp_vars->role = BATADV_TP_SENDER;
+ atomic_set(&tp_vars->sending, 1);
+ memcpy(tp_vars->session, session_id, sizeof(session_id));
+ tp_vars->icmp_uid = icmp_uid;
+
+ tp_vars->last_sent = BATADV_TP_FIRST_SEQ;
+ atomic_set(&tp_vars->last_acked, BATADV_TP_FIRST_SEQ);
+ tp_vars->fast_recovery = false;
+ tp_vars->recover = BATADV_TP_FIRST_SEQ;
+
+ /* initialise the CWND to 3*MSS (Section 3.1 in RFC5681).
+ * For batman-adv the MSS is the size of the payload received by the
+ * mesh_interface, hence its MTU
+ */
+ tp_vars->cwnd = BATADV_TP_PLEN * 3;
+ /* at the beginning initialise the SS threshold to the biggest possible
+ * window size, hence the AWND size
+ */
+ tp_vars->ss_threshold = BATADV_TP_AWND;
+
+ /* RTO initial value is 3 seconds.
+ * Details in Section 2.1 of RFC6298
+ */
+ tp_vars->rto = 1000;
+ tp_vars->srtt = 0;
+ tp_vars->rttvar = 0;
+
+ atomic64_set(&tp_vars->tot_sent, 0);
+
+ kref_get(&tp_vars->refcount);
+ timer_setup(&tp_vars->timer, batadv_tp_sender_timeout, 0);
+
+ tp_vars->bat_priv = bat_priv;
+ tp_vars->start_time = jiffies;
+
+ init_waitqueue_head(&tp_vars->more_bytes);
+
+ spin_lock_init(&tp_vars->unacked_lock);
+ INIT_LIST_HEAD(&tp_vars->unacked_list);
+
+ spin_lock_init(&tp_vars->cwnd_lock);
+
+ tp_vars->prerandom_offset = 0;
+ spin_lock_init(&tp_vars->prerandom_lock);
+
+ kref_get(&tp_vars->refcount);
+ hlist_add_head_rcu(&tp_vars->list, &bat_priv->tp_list);
+ spin_unlock_bh(&bat_priv->tp_list_lock);
+
+ tp_vars->test_length = test_length;
+ if (!tp_vars->test_length)
+ tp_vars->test_length = BATADV_TP_DEF_TEST_LENGTH;
+
+ batadv_dbg(BATADV_DBG_TP_METER, bat_priv,
+ "Meter: starting throughput meter towards %pM (length=%ums)\n",
+ dst, test_length);
+
+ /* init work item for finished tp tests */
+ INIT_DELAYED_WORK(&tp_vars->finish_work, batadv_tp_sender_finish);
+
+ /* start tp kthread. This way the write() call issued from userspace can
+ * happily return and avoid to block
+ */
+ batadv_tp_start_kthread(tp_vars);
+
+ /* don't return reference to new tp_vars */
+ batadv_tp_vars_put(tp_vars);
+}
+
+/**
+ * batadv_tp_stop() - stop currently running tp meter session
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @dst: the receiver MAC address
+ * @return_value: reason for tp meter session stop
+ */
+void batadv_tp_stop(struct batadv_priv *bat_priv, const u8 *dst,
+ u8 return_value)
+{
+ struct batadv_orig_node *orig_node;
+ struct batadv_tp_vars *tp_vars;
+
+ batadv_dbg(BATADV_DBG_TP_METER, bat_priv,
+ "Meter: stopping test towards %pM\n", dst);
+
+ orig_node = batadv_orig_hash_find(bat_priv, dst);
+ if (!orig_node)
+ return;
+
+ tp_vars = batadv_tp_list_find(bat_priv, orig_node->orig);
+ if (!tp_vars) {
+ batadv_dbg(BATADV_DBG_TP_METER, bat_priv,
+ "Meter: trying to interrupt an already over connection\n");
+ goto out;
+ }
+
+ batadv_tp_sender_shutdown(tp_vars, return_value);
+ batadv_tp_vars_put(tp_vars);
+out:
+ batadv_orig_node_put(orig_node);
+}
+
+/**
+ * batadv_tp_reset_receiver_timer() - reset the receiver shutdown timer
+ * @tp_vars: the private data of the current TP meter session
+ *
+ * start the receiver shutdown timer or reset it if already started
+ */
+static void batadv_tp_reset_receiver_timer(struct batadv_tp_vars *tp_vars)
+{
+ mod_timer(&tp_vars->timer,
+ jiffies + msecs_to_jiffies(BATADV_TP_RECV_TIMEOUT));
+}
+
+/**
+ * batadv_tp_receiver_shutdown() - stop a tp meter receiver when timeout is
+ * reached without received ack
+ * @t: address to timer_list inside tp_vars
+ */
+static void batadv_tp_receiver_shutdown(struct timer_list *t)
+{
+ struct batadv_tp_vars *tp_vars = timer_container_of(tp_vars, t, timer);
+ struct batadv_tp_unacked *un, *safe;
+ struct batadv_priv *bat_priv;
+
+ bat_priv = tp_vars->bat_priv;
+
+ /* if there is recent activity rearm the timer */
+ if (!batadv_has_timed_out(tp_vars->last_recv_time,
+ BATADV_TP_RECV_TIMEOUT)) {
+ /* reset the receiver shutdown timer */
+ batadv_tp_reset_receiver_timer(tp_vars);
+ return;
+ }
+
+ batadv_dbg(BATADV_DBG_TP_METER, bat_priv,
+ "Shutting down for inactivity (more than %dms) from %pM\n",
+ BATADV_TP_RECV_TIMEOUT, tp_vars->other_end);
+
+ spin_lock_bh(&tp_vars->bat_priv->tp_list_lock);
+ hlist_del_rcu(&tp_vars->list);
+ spin_unlock_bh(&tp_vars->bat_priv->tp_list_lock);
+
+ /* drop list reference */
+ batadv_tp_vars_put(tp_vars);
+
+ atomic_dec(&bat_priv->tp_num);
+
+ spin_lock_bh(&tp_vars->unacked_lock);
+ list_for_each_entry_safe(un, safe, &tp_vars->unacked_list, list) {
+ list_del(&un->list);
+ kfree(un);
+ }
+ spin_unlock_bh(&tp_vars->unacked_lock);
+
+ /* drop reference of timer */
+ batadv_tp_vars_put(tp_vars);
+}
+
+/**
+ * batadv_tp_send_ack() - send an ACK packet
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @dst: the mac address of the destination originator
+ * @seq: the sequence number to ACK
+ * @timestamp: the timestamp to echo back in the ACK
+ * @session: session identifier
+ * @socket_index: local ICMP socket identifier
+ *
+ * Return: 0 on success, a positive integer representing the reason of the
+ * failure otherwise
+ */
+static int batadv_tp_send_ack(struct batadv_priv *bat_priv, const u8 *dst,
+ u32 seq, __be32 timestamp, const u8 *session,
+ int socket_index)
+{
+ struct batadv_hard_iface *primary_if = NULL;
+ struct batadv_orig_node *orig_node;
+ struct batadv_icmp_tp_packet *icmp;
+ struct sk_buff *skb;
+ int r, ret;
+
+ orig_node = batadv_orig_hash_find(bat_priv, dst);
+ if (unlikely(!orig_node)) {
+ ret = BATADV_TP_REASON_DST_UNREACHABLE;
+ goto out;
+ }
+
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (unlikely(!primary_if)) {
+ ret = BATADV_TP_REASON_DST_UNREACHABLE;
+ goto out;
+ }
+
+ skb = netdev_alloc_skb_ip_align(NULL, sizeof(*icmp) + ETH_HLEN);
+ if (unlikely(!skb)) {
+ ret = BATADV_TP_REASON_MEMORY_ERROR;
+ goto out;
+ }
+
+ skb_reserve(skb, ETH_HLEN);
+ icmp = skb_put(skb, sizeof(*icmp));
+ icmp->packet_type = BATADV_ICMP;
+ icmp->version = BATADV_COMPAT_VERSION;
+ icmp->ttl = BATADV_TTL;
+ icmp->msg_type = BATADV_TP;
+ ether_addr_copy(icmp->dst, orig_node->orig);
+ ether_addr_copy(icmp->orig, primary_if->net_dev->dev_addr);
+ icmp->uid = socket_index;
+
+ icmp->subtype = BATADV_TP_ACK;
+ memcpy(icmp->session, session, sizeof(icmp->session));
+ icmp->seqno = htonl(seq);
+ icmp->timestamp = timestamp;
+
+ /* send the ack */
+ r = batadv_send_skb_to_orig(skb, orig_node, NULL);
+ if (unlikely(r < 0) || r == NET_XMIT_DROP) {
+ ret = BATADV_TP_REASON_DST_UNREACHABLE;
+ goto out;
+ }
+ ret = 0;
+
+out:
+ batadv_orig_node_put(orig_node);
+ batadv_hardif_put(primary_if);
+
+ return ret;
+}
+
+/**
+ * batadv_tp_handle_out_of_order() - store an out of order packet
+ * @tp_vars: the private data of the current TP meter session
+ * @skb: the buffer containing the received packet
+ *
+ * Store the out of order packet in the unacked list for late processing. This
+ * packets are kept in this list so that they can be ACKed at once as soon as
+ * all the previous packets have been received
+ *
+ * Return: true if the packed has been successfully processed, false otherwise
+ */
+static bool batadv_tp_handle_out_of_order(struct batadv_tp_vars *tp_vars,
+ const struct sk_buff *skb)
+{
+ const struct batadv_icmp_tp_packet *icmp;
+ struct batadv_tp_unacked *un, *new;
+ u32 payload_len;
+ bool added = false;
+
+ new = kmalloc(sizeof(*new), GFP_ATOMIC);
+ if (unlikely(!new))
+ return false;
+
+ icmp = (struct batadv_icmp_tp_packet *)skb->data;
+
+ new->seqno = ntohl(icmp->seqno);
+ payload_len = skb->len - sizeof(struct batadv_unicast_packet);
+ new->len = payload_len;
+
+ spin_lock_bh(&tp_vars->unacked_lock);
+ /* if the list is empty immediately attach this new object */
+ if (list_empty(&tp_vars->unacked_list)) {
+ list_add(&new->list, &tp_vars->unacked_list);
+ goto out;
+ }
+
+ /* otherwise loop over the list and either drop the packet because this
+ * is a duplicate or store it at the right position.
+ *
+ * The iteration is done in the reverse way because it is likely that
+ * the last received packet (the one being processed now) has a bigger
+ * seqno than all the others already stored.
+ */
+ list_for_each_entry_reverse(un, &tp_vars->unacked_list, list) {
+ /* check for duplicates */
+ if (new->seqno == un->seqno) {
+ if (new->len > un->len)
+ un->len = new->len;
+ kfree(new);
+ added = true;
+ break;
+ }
+
+ /* look for the right position */
+ if (batadv_seq_before(new->seqno, un->seqno))
+ continue;
+
+ /* as soon as an entry having a bigger seqno is found, the new
+ * one is attached _after_ it. In this way the list is kept in
+ * ascending order
+ */
+ list_add_tail(&new->list, &un->list);
+ added = true;
+ break;
+ }
+
+ /* received packet with smallest seqno out of order; add it to front */
+ if (!added)
+ list_add(&new->list, &tp_vars->unacked_list);
+
+out:
+ spin_unlock_bh(&tp_vars->unacked_lock);
+
+ return true;
+}
+
+/**
+ * batadv_tp_ack_unordered() - update number received bytes in current stream
+ * without gaps
+ * @tp_vars: the private data of the current TP meter session
+ */
+static void batadv_tp_ack_unordered(struct batadv_tp_vars *tp_vars)
+{
+ struct batadv_tp_unacked *un, *safe;
+ u32 to_ack;
+
+ /* go through the unacked packet list and possibly ACK them as
+ * well
+ */
+ spin_lock_bh(&tp_vars->unacked_lock);
+ list_for_each_entry_safe(un, safe, &tp_vars->unacked_list, list) {
+ /* the list is ordered, therefore it is possible to stop as soon
+ * there is a gap between the last acked seqno and the seqno of
+ * the packet under inspection
+ */
+ if (batadv_seq_before(tp_vars->last_recv, un->seqno))
+ break;
+
+ to_ack = un->seqno + un->len - tp_vars->last_recv;
+
+ if (batadv_seq_before(tp_vars->last_recv, un->seqno + un->len))
+ tp_vars->last_recv += to_ack;
+
+ list_del(&un->list);
+ kfree(un);
+ }
+ spin_unlock_bh(&tp_vars->unacked_lock);
+}
+
+/**
+ * batadv_tp_init_recv() - return matching or create new receiver tp_vars
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @icmp: received icmp tp msg
+ *
+ * Return: corresponding tp_vars or NULL on errors
+ */
+static struct batadv_tp_vars *
+batadv_tp_init_recv(struct batadv_priv *bat_priv,
+ const struct batadv_icmp_tp_packet *icmp)
+{
+ struct batadv_tp_vars *tp_vars;
+
+ spin_lock_bh(&bat_priv->tp_list_lock);
+ tp_vars = batadv_tp_list_find_session(bat_priv, icmp->orig,
+ icmp->session);
+ if (tp_vars)
+ goto out_unlock;
+
+ if (!atomic_add_unless(&bat_priv->tp_num, 1, BATADV_TP_MAX_NUM)) {
+ batadv_dbg(BATADV_DBG_TP_METER, bat_priv,
+ "Meter: too many ongoing sessions, aborting (RECV)\n");
+ goto out_unlock;
+ }
+
+ tp_vars = kmalloc(sizeof(*tp_vars), GFP_ATOMIC);
+ if (!tp_vars)
+ goto out_unlock;
+
+ ether_addr_copy(tp_vars->other_end, icmp->orig);
+ tp_vars->role = BATADV_TP_RECEIVER;
+ memcpy(tp_vars->session, icmp->session, sizeof(tp_vars->session));
+ tp_vars->last_recv = BATADV_TP_FIRST_SEQ;
+ tp_vars->bat_priv = bat_priv;
+ kref_init(&tp_vars->refcount);
+
+ spin_lock_init(&tp_vars->unacked_lock);
+ INIT_LIST_HEAD(&tp_vars->unacked_list);
+
+ kref_get(&tp_vars->refcount);
+ hlist_add_head_rcu(&tp_vars->list, &bat_priv->tp_list);
+
+ kref_get(&tp_vars->refcount);
+ timer_setup(&tp_vars->timer, batadv_tp_receiver_shutdown, 0);
+
+ batadv_tp_reset_receiver_timer(tp_vars);
+
+out_unlock:
+ spin_unlock_bh(&bat_priv->tp_list_lock);
+
+ return tp_vars;
+}
+
+/**
+ * batadv_tp_recv_msg() - process a single data message
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: the buffer containing the received packet
+ *
+ * Process a received TP MSG packet
+ */
+static void batadv_tp_recv_msg(struct batadv_priv *bat_priv,
+ const struct sk_buff *skb)
+{
+ const struct batadv_icmp_tp_packet *icmp;
+ struct batadv_tp_vars *tp_vars;
+ size_t packet_size;
+ u32 seqno;
+
+ icmp = (struct batadv_icmp_tp_packet *)skb->data;
+
+ seqno = ntohl(icmp->seqno);
+ /* check if this is the first seqno. This means that if the
+ * first packet is lost, the tp meter does not work anymore!
+ */
+ if (seqno == BATADV_TP_FIRST_SEQ) {
+ tp_vars = batadv_tp_init_recv(bat_priv, icmp);
+ if (!tp_vars) {
+ batadv_dbg(BATADV_DBG_TP_METER, bat_priv,
+ "Meter: seqno != BATADV_TP_FIRST_SEQ cannot initiate connection\n");
+ goto out;
+ }
+ } else {
+ tp_vars = batadv_tp_list_find_session(bat_priv, icmp->orig,
+ icmp->session);
+ if (!tp_vars) {
+ batadv_dbg(BATADV_DBG_TP_METER, bat_priv,
+ "Unexpected packet from %pM!\n",
+ icmp->orig);
+ goto out;
+ }
+ }
+
+ if (unlikely(tp_vars->role != BATADV_TP_RECEIVER)) {
+ batadv_dbg(BATADV_DBG_TP_METER, bat_priv,
+ "Meter: dropping packet: not expected (role=%u)\n",
+ tp_vars->role);
+ goto out;
+ }
+
+ tp_vars->last_recv_time = jiffies;
+
+ /* if the packet is a duplicate, it may be the case that an ACK has been
+ * lost. Resend the ACK
+ */
+ if (batadv_seq_before(seqno, tp_vars->last_recv))
+ goto send_ack;
+
+ /* if the packet is out of order enqueue it */
+ if (ntohl(icmp->seqno) != tp_vars->last_recv) {
+ /* exit immediately (and do not send any ACK) if the packet has
+ * not been enqueued correctly
+ */
+ if (!batadv_tp_handle_out_of_order(tp_vars, skb))
+ goto out;
+
+ /* send a duplicate ACK */
+ goto send_ack;
+ }
+
+ /* if everything was fine count the ACKed bytes */
+ packet_size = skb->len - sizeof(struct batadv_unicast_packet);
+ tp_vars->last_recv += packet_size;
+
+ /* check if this ordered message filled a gap.... */
+ batadv_tp_ack_unordered(tp_vars);
+
+send_ack:
+ /* send the ACK. If the received packet was out of order, the ACK that
+ * is going to be sent is a duplicate (the sender will count them and
+ * possibly enter Fast Retransmit as soon as it has reached 3)
+ */
+ batadv_tp_send_ack(bat_priv, icmp->orig, tp_vars->last_recv,
+ icmp->timestamp, icmp->session, icmp->uid);
+out:
+ batadv_tp_vars_put(tp_vars);
+}
+
+/**
+ * batadv_tp_meter_recv() - main TP Meter receiving function
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: the buffer containing the received packet
+ */
+void batadv_tp_meter_recv(struct batadv_priv *bat_priv, struct sk_buff *skb)
+{
+ struct batadv_icmp_tp_packet *icmp;
+
+ icmp = (struct batadv_icmp_tp_packet *)skb->data;
+
+ switch (icmp->subtype) {
+ case BATADV_TP_MSG:
+ batadv_tp_recv_msg(bat_priv, skb);
+ break;
+ case BATADV_TP_ACK:
+ batadv_tp_recv_ack(bat_priv, skb);
+ break;
+ default:
+ batadv_dbg(BATADV_DBG_TP_METER, bat_priv,
+ "Received unknown TP Metric packet type %u\n",
+ icmp->subtype);
+ }
+ consume_skb(skb);
+}
+
+/**
+ * batadv_tp_meter_init() - initialize global tp_meter structures
+ */
+void __init batadv_tp_meter_init(void)
+{
+ get_random_bytes(batadv_tp_prerandom, sizeof(batadv_tp_prerandom));
+}
diff --git a/net/batman-adv/tp_meter.h b/net/batman-adv/tp_meter.h
new file mode 100644
index 000000000000..f0046d366eac
--- /dev/null
+++ b/net/batman-adv/tp_meter.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) B.A.T.M.A.N. contributors:
+ *
+ * Edo Monticelli, Antonio Quartulli
+ */
+
+#ifndef _NET_BATMAN_ADV_TP_METER_H_
+#define _NET_BATMAN_ADV_TP_METER_H_
+
+#include "main.h"
+
+#include <linux/skbuff.h>
+#include <linux/types.h>
+
+void batadv_tp_meter_init(void);
+void batadv_tp_start(struct batadv_priv *bat_priv, const u8 *dst,
+ u32 test_length, u32 *cookie);
+void batadv_tp_stop(struct batadv_priv *bat_priv, const u8 *dst,
+ u8 return_value);
+void batadv_tp_meter_recv(struct batadv_priv *bat_priv, struct sk_buff *skb);
+
+#endif /* _NET_BATMAN_ADV_TP_METER_H_ */
diff --git a/net/batman-adv/trace.c b/net/batman-adv/trace.c
new file mode 100644
index 000000000000..ec8b9519076b
--- /dev/null
+++ b/net/batman-adv/trace.c
@@ -0,0 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) B.A.T.M.A.N. contributors:
+ *
+ * Sven Eckelmann
+ */
+
+#define CREATE_TRACE_POINTS
+#include "trace.h"
diff --git a/net/batman-adv/trace.h b/net/batman-adv/trace.h
new file mode 100644
index 000000000000..7da692ec38e9
--- /dev/null
+++ b/net/batman-adv/trace.h
@@ -0,0 +1,64 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) B.A.T.M.A.N. contributors:
+ *
+ * Sven Eckelmann
+ */
+
+#if !defined(_NET_BATMAN_ADV_TRACE_H_) || defined(TRACE_HEADER_MULTI_READ)
+#define _NET_BATMAN_ADV_TRACE_H_
+
+#include "main.h"
+
+#include <linux/netdevice.h>
+#include <linux/percpu.h>
+#include <linux/printk.h>
+#include <linux/tracepoint.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM batadv
+
+/* provide dummy function when tracing is disabled */
+#if !defined(CONFIG_BATMAN_ADV_TRACING)
+
+#undef TRACE_EVENT
+#define TRACE_EVENT(name, proto, ...) \
+ static inline void trace_ ## name(proto) {}
+
+#endif /* CONFIG_BATMAN_ADV_TRACING */
+
+TRACE_EVENT(batadv_dbg,
+
+ TP_PROTO(struct batadv_priv *bat_priv,
+ struct va_format *vaf),
+
+ TP_ARGS(bat_priv, vaf),
+
+ TP_STRUCT__entry(
+ __string(device, bat_priv->mesh_iface->name)
+ __string(driver, KBUILD_MODNAME)
+ __vstring(msg, vaf->fmt, vaf->va)
+ ),
+
+ TP_fast_assign(
+ __assign_str(device);
+ __assign_str(driver);
+ __assign_vstr(msg, vaf->fmt, vaf->va);
+ ),
+
+ TP_printk(
+ "%s %s %s",
+ __get_str(driver),
+ __get_str(device),
+ __get_str(msg)
+ )
+);
+
+#endif /* _NET_BATMAN_ADV_TRACE_H_ || TRACE_HEADER_MULTI_READ */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
new file mode 100644
index 000000000000..6e95e883c2bf
--- /dev/null
+++ b/net/batman-adv/translation-table.c
@@ -0,0 +1,4238 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner, Simon Wunderlich, Antonio Quartulli
+ */
+
+#include "translation-table.h"
+#include "main.h"
+
+#include <linux/atomic.h>
+#include <linux/bitops.h>
+#include <linux/build_bug.h>
+#include <linux/byteorder/generic.h>
+#include <linux/cache.h>
+#include <linux/compiler.h>
+#include <linux/container_of.h>
+#include <linux/crc32.h>
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/etherdevice.h>
+#include <linux/gfp.h>
+#include <linux/if_ether.h>
+#include <linux/init.h>
+#include <linux/jhash.h>
+#include <linux/jiffies.h>
+#include <linux/kref.h>
+#include <linux/list.h>
+#include <linux/lockdep.h>
+#include <linux/net.h>
+#include <linux/netdevice.h>
+#include <linux/netlink.h>
+#include <linux/overflow.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/stddef.h>
+#include <linux/string.h>
+#include <linux/workqueue.h>
+#include <net/genetlink.h>
+#include <net/netlink.h>
+#include <uapi/linux/batadv_packet.h>
+#include <uapi/linux/batman_adv.h>
+
+#include "bridge_loop_avoidance.h"
+#include "hard-interface.h"
+#include "hash.h"
+#include "log.h"
+#include "mesh-interface.h"
+#include "netlink.h"
+#include "originator.h"
+#include "tvlv.h"
+
+static struct kmem_cache *batadv_tl_cache __read_mostly;
+static struct kmem_cache *batadv_tg_cache __read_mostly;
+static struct kmem_cache *batadv_tt_orig_cache __read_mostly;
+static struct kmem_cache *batadv_tt_change_cache __read_mostly;
+static struct kmem_cache *batadv_tt_req_cache __read_mostly;
+static struct kmem_cache *batadv_tt_roam_cache __read_mostly;
+
+/* hash class keys */
+static struct lock_class_key batadv_tt_local_hash_lock_class_key;
+static struct lock_class_key batadv_tt_global_hash_lock_class_key;
+
+static void batadv_send_roam_adv(struct batadv_priv *bat_priv, u8 *client,
+ unsigned short vid,
+ struct batadv_orig_node *orig_node);
+static void batadv_tt_purge(struct work_struct *work);
+static void
+batadv_tt_global_del_orig_list(struct batadv_tt_global_entry *tt_global_entry);
+static void batadv_tt_global_del(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node,
+ const unsigned char *addr,
+ unsigned short vid, const char *message,
+ bool roaming);
+
+/**
+ * batadv_compare_tt() - check if two TT entries are the same
+ * @node: the list element pointer of the first TT entry
+ * @data2: pointer to the tt_common_entry of the second TT entry
+ *
+ * Compare the MAC address and the VLAN ID of the two TT entries and check if
+ * they are the same TT client.
+ * Return: true if the two TT clients are the same, false otherwise
+ */
+static bool batadv_compare_tt(const struct hlist_node *node, const void *data2)
+{
+ const void *data1 = container_of(node, struct batadv_tt_common_entry,
+ hash_entry);
+ const struct batadv_tt_common_entry *tt1 = data1;
+ const struct batadv_tt_common_entry *tt2 = data2;
+
+ return (tt1->vid == tt2->vid) && batadv_compare_eth(data1, data2);
+}
+
+/**
+ * batadv_choose_tt() - return the index of the tt entry in the hash table
+ * @data: pointer to the tt_common_entry object to map
+ * @size: the size of the hash table
+ *
+ * Return: the hash index where the object represented by 'data' should be
+ * stored at.
+ */
+static inline u32 batadv_choose_tt(const void *data, u32 size)
+{
+ const struct batadv_tt_common_entry *tt;
+ u32 hash = 0;
+
+ tt = data;
+ hash = jhash(&tt->addr, ETH_ALEN, hash);
+ hash = jhash(&tt->vid, sizeof(tt->vid), hash);
+
+ return hash % size;
+}
+
+/**
+ * batadv_tt_hash_find() - look for a client in the given hash table
+ * @hash: the hash table to search
+ * @addr: the mac address of the client to look for
+ * @vid: VLAN identifier
+ *
+ * Return: a pointer to the tt_common struct belonging to the searched client if
+ * found, NULL otherwise.
+ */
+static struct batadv_tt_common_entry *
+batadv_tt_hash_find(struct batadv_hashtable *hash, const u8 *addr,
+ unsigned short vid)
+{
+ struct hlist_head *head;
+ struct batadv_tt_common_entry to_search, *tt, *tt_tmp = NULL;
+ u32 index;
+
+ if (!hash)
+ return NULL;
+
+ ether_addr_copy(to_search.addr, addr);
+ to_search.vid = vid;
+
+ index = batadv_choose_tt(&to_search, hash->size);
+ head = &hash->table[index];
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(tt, head, hash_entry) {
+ if (!batadv_compare_eth(tt, addr))
+ continue;
+
+ if (tt->vid != vid)
+ continue;
+
+ if (!kref_get_unless_zero(&tt->refcount))
+ continue;
+
+ tt_tmp = tt;
+ break;
+ }
+ rcu_read_unlock();
+
+ return tt_tmp;
+}
+
+/**
+ * batadv_tt_local_hash_find() - search the local table for a given client
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @addr: the mac address of the client to look for
+ * @vid: VLAN identifier
+ *
+ * Return: a pointer to the corresponding tt_local_entry struct if the client is
+ * found, NULL otherwise.
+ */
+static struct batadv_tt_local_entry *
+batadv_tt_local_hash_find(struct batadv_priv *bat_priv, const u8 *addr,
+ unsigned short vid)
+{
+ struct batadv_tt_common_entry *tt_common_entry;
+ struct batadv_tt_local_entry *tt_local_entry = NULL;
+
+ tt_common_entry = batadv_tt_hash_find(bat_priv->tt.local_hash, addr,
+ vid);
+ if (tt_common_entry)
+ tt_local_entry = container_of(tt_common_entry,
+ struct batadv_tt_local_entry,
+ common);
+ return tt_local_entry;
+}
+
+/**
+ * batadv_tt_global_hash_find() - search the global table for a given client
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @addr: the mac address of the client to look for
+ * @vid: VLAN identifier
+ *
+ * Return: a pointer to the corresponding tt_global_entry struct if the client
+ * is found, NULL otherwise.
+ */
+struct batadv_tt_global_entry *
+batadv_tt_global_hash_find(struct batadv_priv *bat_priv, const u8 *addr,
+ unsigned short vid)
+{
+ struct batadv_tt_common_entry *tt_common_entry;
+ struct batadv_tt_global_entry *tt_global_entry = NULL;
+
+ tt_common_entry = batadv_tt_hash_find(bat_priv->tt.global_hash, addr,
+ vid);
+ if (tt_common_entry)
+ tt_global_entry = container_of(tt_common_entry,
+ struct batadv_tt_global_entry,
+ common);
+ return tt_global_entry;
+}
+
+/**
+ * batadv_tt_local_entry_release() - release tt_local_entry from lists and queue
+ * for free after rcu grace period
+ * @ref: kref pointer of the batadv_tt_local_entry
+ */
+static void batadv_tt_local_entry_release(struct kref *ref)
+{
+ struct batadv_tt_local_entry *tt_local_entry;
+
+ tt_local_entry = container_of(ref, struct batadv_tt_local_entry,
+ common.refcount);
+
+ batadv_meshif_vlan_put(tt_local_entry->vlan);
+
+ kfree_rcu(tt_local_entry, common.rcu);
+}
+
+/**
+ * batadv_tt_local_entry_put() - decrement the tt_local_entry refcounter and
+ * possibly release it
+ * @tt_local_entry: tt_local_entry to be free'd
+ */
+static void
+batadv_tt_local_entry_put(struct batadv_tt_local_entry *tt_local_entry)
+{
+ if (!tt_local_entry)
+ return;
+
+ kref_put(&tt_local_entry->common.refcount,
+ batadv_tt_local_entry_release);
+}
+
+/**
+ * batadv_tt_global_entry_release() - release tt_global_entry from lists and
+ * queue for free after rcu grace period
+ * @ref: kref pointer of the batadv_tt_global_entry
+ */
+void batadv_tt_global_entry_release(struct kref *ref)
+{
+ struct batadv_tt_global_entry *tt_global_entry;
+
+ tt_global_entry = container_of(ref, struct batadv_tt_global_entry,
+ common.refcount);
+
+ batadv_tt_global_del_orig_list(tt_global_entry);
+
+ kfree_rcu(tt_global_entry, common.rcu);
+}
+
+/**
+ * batadv_tt_global_hash_count() - count the number of orig entries
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @addr: the mac address of the client to count entries for
+ * @vid: VLAN identifier
+ *
+ * Return: the number of originators advertising the given address/data
+ * (excluding our self).
+ */
+int batadv_tt_global_hash_count(struct batadv_priv *bat_priv,
+ const u8 *addr, unsigned short vid)
+{
+ struct batadv_tt_global_entry *tt_global_entry;
+ int count;
+
+ tt_global_entry = batadv_tt_global_hash_find(bat_priv, addr, vid);
+ if (!tt_global_entry)
+ return 0;
+
+ count = atomic_read(&tt_global_entry->orig_list_count);
+ batadv_tt_global_entry_put(tt_global_entry);
+
+ return count;
+}
+
+/**
+ * batadv_tt_local_size_mod() - change the size by v of the local table
+ * identified by vid
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @vid: the VLAN identifier of the sub-table to change
+ * @v: the amount to sum to the local table size
+ */
+static void batadv_tt_local_size_mod(struct batadv_priv *bat_priv,
+ unsigned short vid, int v)
+{
+ struct batadv_meshif_vlan *vlan;
+
+ vlan = batadv_meshif_vlan_get(bat_priv, vid);
+ if (!vlan)
+ return;
+
+ atomic_add(v, &vlan->tt.num_entries);
+
+ batadv_meshif_vlan_put(vlan);
+}
+
+/**
+ * batadv_tt_local_size_inc() - increase by one the local table size for the
+ * given vid
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @vid: the VLAN identifier
+ */
+static void batadv_tt_local_size_inc(struct batadv_priv *bat_priv,
+ unsigned short vid)
+{
+ batadv_tt_local_size_mod(bat_priv, vid, 1);
+}
+
+/**
+ * batadv_tt_local_size_dec() - decrease by one the local table size for the
+ * given vid
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @vid: the VLAN identifier
+ */
+static void batadv_tt_local_size_dec(struct batadv_priv *bat_priv,
+ unsigned short vid)
+{
+ batadv_tt_local_size_mod(bat_priv, vid, -1);
+}
+
+/**
+ * batadv_tt_global_size_mod() - change the size by v of the global table
+ * for orig_node identified by vid
+ * @orig_node: the originator for which the table has to be modified
+ * @vid: the VLAN identifier
+ * @v: the amount to sum to the global table size
+ */
+static void batadv_tt_global_size_mod(struct batadv_orig_node *orig_node,
+ unsigned short vid, int v)
+{
+ struct batadv_orig_node_vlan *vlan;
+
+ vlan = batadv_orig_node_vlan_new(orig_node, vid);
+ if (!vlan)
+ return;
+
+ if (atomic_add_return(v, &vlan->tt.num_entries) == 0) {
+ spin_lock_bh(&orig_node->vlan_list_lock);
+ if (!hlist_unhashed(&vlan->list)) {
+ hlist_del_init_rcu(&vlan->list);
+ batadv_orig_node_vlan_put(vlan);
+ }
+ spin_unlock_bh(&orig_node->vlan_list_lock);
+ }
+
+ batadv_orig_node_vlan_put(vlan);
+}
+
+/**
+ * batadv_tt_global_size_inc() - increase by one the global table size for the
+ * given vid
+ * @orig_node: the originator which global table size has to be decreased
+ * @vid: the vlan identifier
+ */
+static void batadv_tt_global_size_inc(struct batadv_orig_node *orig_node,
+ unsigned short vid)
+{
+ batadv_tt_global_size_mod(orig_node, vid, 1);
+}
+
+/**
+ * batadv_tt_global_size_dec() - decrease by one the global table size for the
+ * given vid
+ * @orig_node: the originator which global table size has to be decreased
+ * @vid: the vlan identifier
+ */
+static void batadv_tt_global_size_dec(struct batadv_orig_node *orig_node,
+ unsigned short vid)
+{
+ batadv_tt_global_size_mod(orig_node, vid, -1);
+}
+
+/**
+ * batadv_tt_orig_list_entry_release() - release tt orig entry from lists and
+ * queue for free after rcu grace period
+ * @ref: kref pointer of the tt orig entry
+ */
+static void batadv_tt_orig_list_entry_release(struct kref *ref)
+{
+ struct batadv_tt_orig_list_entry *orig_entry;
+
+ orig_entry = container_of(ref, struct batadv_tt_orig_list_entry,
+ refcount);
+
+ batadv_orig_node_put(orig_entry->orig_node);
+ kfree_rcu(orig_entry, rcu);
+}
+
+/**
+ * batadv_tt_orig_list_entry_put() - decrement the tt orig entry refcounter and
+ * possibly release it
+ * @orig_entry: tt orig entry to be free'd
+ */
+static void
+batadv_tt_orig_list_entry_put(struct batadv_tt_orig_list_entry *orig_entry)
+{
+ if (!orig_entry)
+ return;
+
+ kref_put(&orig_entry->refcount, batadv_tt_orig_list_entry_release);
+}
+
+/**
+ * batadv_tt_local_event() - store a local TT event (ADD/DEL)
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @tt_local_entry: the TT entry involved in the event
+ * @event_flags: flags to store in the event structure
+ */
+static void batadv_tt_local_event(struct batadv_priv *bat_priv,
+ struct batadv_tt_local_entry *tt_local_entry,
+ u8 event_flags)
+{
+ struct batadv_tt_change_node *tt_change_node, *entry, *safe;
+ struct batadv_tt_common_entry *common = &tt_local_entry->common;
+ u8 flags = common->flags | event_flags;
+ bool del_op_requested, del_op_entry;
+ size_t changes;
+
+ tt_change_node = kmem_cache_alloc(batadv_tt_change_cache, GFP_ATOMIC);
+ if (!tt_change_node)
+ return;
+
+ tt_change_node->change.flags = flags;
+ memset(tt_change_node->change.reserved, 0,
+ sizeof(tt_change_node->change.reserved));
+ ether_addr_copy(tt_change_node->change.addr, common->addr);
+ tt_change_node->change.vid = htons(common->vid);
+
+ del_op_requested = flags & BATADV_TT_CLIENT_DEL;
+
+ /* check for ADD+DEL, DEL+ADD, ADD+ADD or DEL+DEL events */
+ spin_lock_bh(&bat_priv->tt.changes_list_lock);
+ changes = READ_ONCE(bat_priv->tt.local_changes);
+ list_for_each_entry_safe(entry, safe, &bat_priv->tt.changes_list,
+ list) {
+ if (!batadv_compare_eth(entry->change.addr, common->addr))
+ continue;
+
+ del_op_entry = entry->change.flags & BATADV_TT_CLIENT_DEL;
+ if (del_op_requested != del_op_entry) {
+ /* DEL+ADD in the same orig interval have no effect and
+ * can be removed to avoid silly behaviour on the
+ * receiver side. The other way around (ADD+DEL) can
+ * happen in case of roaming of a client still in the
+ * NEW state. Roaming of NEW clients is now possible due
+ * to automatically recognition of "temporary" clients
+ */
+ list_del(&entry->list);
+ kmem_cache_free(batadv_tt_change_cache, entry);
+ changes--;
+ } else {
+ /* this is a second add or del in the same originator
+ * interval. It could mean that flags have been changed
+ * (e.g. double add): update them
+ */
+ entry->change.flags = flags;
+ }
+
+ kmem_cache_free(batadv_tt_change_cache, tt_change_node);
+ goto update_changes;
+ }
+
+ /* track the change in the OGMinterval list */
+ list_add_tail(&tt_change_node->list, &bat_priv->tt.changes_list);
+ changes++;
+
+update_changes:
+ WRITE_ONCE(bat_priv->tt.local_changes, changes);
+ spin_unlock_bh(&bat_priv->tt.changes_list_lock);
+}
+
+/**
+ * batadv_tt_len() - compute length in bytes of given number of tt changes
+ * @changes_num: number of tt changes
+ *
+ * Return: computed length in bytes.
+ */
+static int batadv_tt_len(int changes_num)
+{
+ return changes_num * sizeof(struct batadv_tvlv_tt_change);
+}
+
+/**
+ * batadv_tt_entries() - compute the number of entries fitting in tt_len bytes
+ * @tt_len: available space
+ *
+ * Return: the number of entries.
+ */
+static u16 batadv_tt_entries(u16 tt_len)
+{
+ return tt_len / batadv_tt_len(1);
+}
+
+/**
+ * batadv_tt_local_table_transmit_size() - calculates the local translation
+ * table size when transmitted over the air
+ * @bat_priv: the bat priv with all the mesh interface information
+ *
+ * Return: local translation table size in bytes.
+ */
+static int batadv_tt_local_table_transmit_size(struct batadv_priv *bat_priv)
+{
+ u16 num_vlan = 0;
+ u16 tt_local_entries = 0;
+ struct batadv_meshif_vlan *vlan;
+ int hdr_size;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(vlan, &bat_priv->meshif_vlan_list, list) {
+ num_vlan++;
+ tt_local_entries += atomic_read(&vlan->tt.num_entries);
+ }
+ rcu_read_unlock();
+
+ /* header size of tvlv encapsulated tt response payload */
+ hdr_size = sizeof(struct batadv_unicast_tvlv_packet);
+ hdr_size += sizeof(struct batadv_tvlv_hdr);
+ hdr_size += sizeof(struct batadv_tvlv_tt_data);
+ hdr_size += num_vlan * sizeof(struct batadv_tvlv_tt_vlan_data);
+
+ return hdr_size + batadv_tt_len(tt_local_entries);
+}
+
+static int batadv_tt_local_init(struct batadv_priv *bat_priv)
+{
+ if (bat_priv->tt.local_hash)
+ return 0;
+
+ bat_priv->tt.local_hash = batadv_hash_new(1024);
+
+ if (!bat_priv->tt.local_hash)
+ return -ENOMEM;
+
+ batadv_hash_set_lock_class(bat_priv->tt.local_hash,
+ &batadv_tt_local_hash_lock_class_key);
+
+ return 0;
+}
+
+static void batadv_tt_global_free(struct batadv_priv *bat_priv,
+ struct batadv_tt_global_entry *tt_global,
+ const char *message)
+{
+ struct batadv_tt_global_entry *tt_removed_entry;
+ struct hlist_node *tt_removed_node;
+
+ batadv_dbg(BATADV_DBG_TT, bat_priv,
+ "Deleting global tt entry %pM (vid: %d): %s\n",
+ tt_global->common.addr,
+ batadv_print_vid(tt_global->common.vid), message);
+
+ tt_removed_node = batadv_hash_remove(bat_priv->tt.global_hash,
+ batadv_compare_tt,
+ batadv_choose_tt,
+ &tt_global->common);
+ if (!tt_removed_node)
+ return;
+
+ /* drop reference of remove hash entry */
+ tt_removed_entry = hlist_entry(tt_removed_node,
+ struct batadv_tt_global_entry,
+ common.hash_entry);
+ batadv_tt_global_entry_put(tt_removed_entry);
+}
+
+/**
+ * batadv_tt_local_add() - add a new client to the local table or update an
+ * existing client
+ * @mesh_iface: netdev struct of the mesh interface
+ * @addr: the mac address of the client to add
+ * @vid: VLAN identifier
+ * @ifindex: index of the interface where the client is connected to (useful to
+ * identify wireless clients)
+ * @mark: the value contained in the skb->mark field of the received packet (if
+ * any)
+ *
+ * Return: true if the client was successfully added, false otherwise.
+ */
+bool batadv_tt_local_add(struct net_device *mesh_iface, const u8 *addr,
+ unsigned short vid, int ifindex, u32 mark)
+{
+ struct batadv_priv *bat_priv = netdev_priv(mesh_iface);
+ struct batadv_tt_local_entry *tt_local;
+ struct batadv_tt_global_entry *tt_global = NULL;
+ struct net *net = dev_net(mesh_iface);
+ struct batadv_meshif_vlan *vlan;
+ struct net_device *in_dev = NULL;
+ struct batadv_hard_iface *in_hardif = NULL;
+ struct hlist_head *head;
+ struct batadv_tt_orig_list_entry *orig_entry;
+ int hash_added, table_size, packet_size_max;
+ bool ret = false;
+ bool roamed_back = false;
+ u8 remote_flags;
+ u32 match_mark;
+
+ if (ifindex != BATADV_NULL_IFINDEX)
+ in_dev = dev_get_by_index(net, ifindex);
+
+ if (in_dev)
+ in_hardif = batadv_hardif_get_by_netdev(in_dev);
+
+ tt_local = batadv_tt_local_hash_find(bat_priv, addr, vid);
+
+ if (!is_multicast_ether_addr(addr))
+ tt_global = batadv_tt_global_hash_find(bat_priv, addr, vid);
+
+ if (tt_local) {
+ tt_local->last_seen = jiffies;
+ if (tt_local->common.flags & BATADV_TT_CLIENT_PENDING) {
+ batadv_dbg(BATADV_DBG_TT, bat_priv,
+ "Re-adding pending client %pM (vid: %d)\n",
+ addr, batadv_print_vid(vid));
+ /* whatever the reason why the PENDING flag was set,
+ * this is a client which was enqueued to be removed in
+ * this orig_interval. Since it popped up again, the
+ * flag can be reset like it was never enqueued
+ */
+ tt_local->common.flags &= ~BATADV_TT_CLIENT_PENDING;
+ goto add_event;
+ }
+
+ if (tt_local->common.flags & BATADV_TT_CLIENT_ROAM) {
+ batadv_dbg(BATADV_DBG_TT, bat_priv,
+ "Roaming client %pM (vid: %d) came back to its original location\n",
+ addr, batadv_print_vid(vid));
+ /* the ROAM flag is set because this client roamed away
+ * and the node got a roaming_advertisement message. Now
+ * that the client popped up again at its original
+ * location such flag can be unset
+ */
+ tt_local->common.flags &= ~BATADV_TT_CLIENT_ROAM;
+ roamed_back = true;
+ }
+ goto check_roaming;
+ }
+
+ /* Ignore the client if we cannot send it in a full table response. */
+ table_size = batadv_tt_local_table_transmit_size(bat_priv);
+ table_size += batadv_tt_len(1);
+ packet_size_max = atomic_read(&bat_priv->packet_size_max);
+ if (table_size > packet_size_max) {
+ net_ratelimited_function(batadv_info, mesh_iface,
+ "Local translation table size (%i) exceeds maximum packet size (%i); Ignoring new local tt entry: %pM\n",
+ table_size, packet_size_max, addr);
+ goto out;
+ }
+
+ tt_local = kmem_cache_alloc(batadv_tl_cache, GFP_ATOMIC);
+ if (!tt_local)
+ goto out;
+
+ /* increase the refcounter of the related vlan */
+ vlan = batadv_meshif_vlan_get(bat_priv, vid);
+ if (!vlan) {
+ net_ratelimited_function(batadv_info, mesh_iface,
+ "adding TT local entry %pM to non-existent VLAN %d\n",
+ addr, batadv_print_vid(vid));
+ kmem_cache_free(batadv_tl_cache, tt_local);
+ tt_local = NULL;
+ goto out;
+ }
+
+ batadv_dbg(BATADV_DBG_TT, bat_priv,
+ "Creating new local tt entry: %pM (vid: %d, ttvn: %d)\n",
+ addr, batadv_print_vid(vid),
+ (u8)atomic_read(&bat_priv->tt.vn));
+
+ ether_addr_copy(tt_local->common.addr, addr);
+ /* The local entry has to be marked as NEW to avoid to send it in
+ * a full table response going out before the next ttvn increment
+ * (consistency check)
+ */
+ tt_local->common.flags = BATADV_TT_CLIENT_NEW;
+ tt_local->common.vid = vid;
+ if (batadv_is_wifi_hardif(in_hardif))
+ tt_local->common.flags |= BATADV_TT_CLIENT_WIFI;
+ kref_init(&tt_local->common.refcount);
+ tt_local->last_seen = jiffies;
+ tt_local->common.added_at = tt_local->last_seen;
+ tt_local->vlan = vlan;
+
+ /* the batman interface mac and multicast addresses should never be
+ * purged
+ */
+ if (batadv_compare_eth(addr, mesh_iface->dev_addr) ||
+ is_multicast_ether_addr(addr))
+ tt_local->common.flags |= BATADV_TT_CLIENT_NOPURGE;
+
+ kref_get(&tt_local->common.refcount);
+ hash_added = batadv_hash_add(bat_priv->tt.local_hash, batadv_compare_tt,
+ batadv_choose_tt, &tt_local->common,
+ &tt_local->common.hash_entry);
+
+ if (unlikely(hash_added != 0)) {
+ /* remove the reference for the hash */
+ batadv_tt_local_entry_put(tt_local);
+ goto out;
+ }
+
+add_event:
+ batadv_tt_local_event(bat_priv, tt_local, BATADV_NO_FLAGS);
+
+check_roaming:
+ /* Check whether it is a roaming, but don't do anything if the roaming
+ * process has already been handled
+ */
+ if (tt_global && !(tt_global->common.flags & BATADV_TT_CLIENT_ROAM)) {
+ /* These node are probably going to update their tt table */
+ head = &tt_global->orig_list;
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(orig_entry, head, list) {
+ batadv_send_roam_adv(bat_priv, tt_global->common.addr,
+ tt_global->common.vid,
+ orig_entry->orig_node);
+ }
+ rcu_read_unlock();
+ if (roamed_back) {
+ batadv_tt_global_free(bat_priv, tt_global,
+ "Roaming canceled");
+ } else {
+ /* The global entry has to be marked as ROAMING and
+ * has to be kept for consistency purpose
+ */
+ tt_global->common.flags |= BATADV_TT_CLIENT_ROAM;
+ tt_global->roam_at = jiffies;
+ }
+ }
+
+ /* store the current remote flags before altering them. This helps
+ * understanding is flags are changing or not
+ */
+ remote_flags = tt_local->common.flags & BATADV_TT_REMOTE_MASK;
+
+ if (batadv_is_wifi_hardif(in_hardif))
+ tt_local->common.flags |= BATADV_TT_CLIENT_WIFI;
+ else
+ tt_local->common.flags &= ~BATADV_TT_CLIENT_WIFI;
+
+ /* check the mark in the skb: if it's equal to the configured
+ * isolation_mark, it means the packet is coming from an isolated
+ * non-mesh client
+ */
+ match_mark = (mark & bat_priv->isolation_mark_mask);
+ if (bat_priv->isolation_mark_mask &&
+ match_mark == bat_priv->isolation_mark)
+ tt_local->common.flags |= BATADV_TT_CLIENT_ISOLA;
+ else
+ tt_local->common.flags &= ~BATADV_TT_CLIENT_ISOLA;
+
+ /* if any "dynamic" flag has been modified, resend an ADD event for this
+ * entry so that all the nodes can get the new flags
+ */
+ if (remote_flags ^ (tt_local->common.flags & BATADV_TT_REMOTE_MASK))
+ batadv_tt_local_event(bat_priv, tt_local, BATADV_NO_FLAGS);
+
+ ret = true;
+out:
+ batadv_hardif_put(in_hardif);
+ dev_put(in_dev);
+ batadv_tt_local_entry_put(tt_local);
+ batadv_tt_global_entry_put(tt_global);
+ return ret;
+}
+
+/**
+ * batadv_tt_prepare_tvlv_global_data() - prepare the TVLV TT header to send
+ * within a TT Response directed to another node
+ * @orig_node: originator for which the TT data has to be prepared
+ * @tt_data: uninitialised pointer to the address of the TVLV buffer
+ * @tt_change: uninitialised pointer to the address of the area where the TT
+ * changed can be stored
+ * @tt_len: pointer to the length to reserve to the tt_change. if -1 this
+ * function reserves the amount of space needed to send the entire global TT
+ * table. In case of success the value is updated with the real amount of
+ * reserved bytes
+ * Allocate the needed amount of memory for the entire TT TVLV and write its
+ * header made up of one tvlv_tt_data object and a series of tvlv_tt_vlan_data
+ * objects, one per active VLAN served by the originator node.
+ *
+ * Return: the size of the allocated buffer or 0 in case of failure.
+ */
+static u16
+batadv_tt_prepare_tvlv_global_data(struct batadv_orig_node *orig_node,
+ struct batadv_tvlv_tt_data **tt_data,
+ struct batadv_tvlv_tt_change **tt_change,
+ s32 *tt_len)
+{
+ u16 num_vlan = 0;
+ u16 num_entries = 0;
+ u16 change_offset;
+ u16 tvlv_len;
+ struct batadv_tvlv_tt_vlan_data *tt_vlan;
+ struct batadv_orig_node_vlan *vlan;
+ u8 *tt_change_ptr;
+
+ spin_lock_bh(&orig_node->vlan_list_lock);
+ hlist_for_each_entry(vlan, &orig_node->vlan_list, list) {
+ num_vlan++;
+ num_entries += atomic_read(&vlan->tt.num_entries);
+ }
+
+ change_offset = struct_size(*tt_data, vlan_data, num_vlan);
+
+ /* if tt_len is negative, allocate the space needed by the full table */
+ if (*tt_len < 0)
+ *tt_len = batadv_tt_len(num_entries);
+
+ tvlv_len = *tt_len;
+ tvlv_len += change_offset;
+
+ *tt_data = kmalloc(tvlv_len, GFP_ATOMIC);
+ if (!*tt_data) {
+ *tt_len = 0;
+ goto out;
+ }
+
+ (*tt_data)->flags = BATADV_NO_FLAGS;
+ (*tt_data)->ttvn = atomic_read(&orig_node->last_ttvn);
+ (*tt_data)->num_vlan = htons(num_vlan);
+
+ tt_vlan = (*tt_data)->vlan_data;
+ hlist_for_each_entry(vlan, &orig_node->vlan_list, list) {
+ tt_vlan->vid = htons(vlan->vid);
+ tt_vlan->crc = htonl(vlan->tt.crc);
+ tt_vlan->reserved = 0;
+
+ tt_vlan++;
+ }
+
+ tt_change_ptr = (u8 *)*tt_data + change_offset;
+ *tt_change = (struct batadv_tvlv_tt_change *)tt_change_ptr;
+
+out:
+ spin_unlock_bh(&orig_node->vlan_list_lock);
+ return tvlv_len;
+}
+
+/**
+ * batadv_tt_prepare_tvlv_local_data() - allocate and prepare the TT TVLV for
+ * this node
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @tt_data: uninitialised pointer to the address of the TVLV buffer
+ * @tt_change: uninitialised pointer to the address of the area where the TT
+ * changes can be stored
+ * @tt_len: pointer to the length to reserve to the tt_change. if -1 this
+ * function reserves the amount of space needed to send the entire local TT
+ * table. In case of success the value is updated with the real amount of
+ * reserved bytes
+ *
+ * Allocate the needed amount of memory for the entire TT TVLV and write its
+ * header made up by one tvlv_tt_data object and a series of tvlv_tt_vlan_data
+ * objects, one per active VLAN.
+ *
+ * Return: the size of the allocated buffer or 0 in case of failure.
+ */
+static u16
+batadv_tt_prepare_tvlv_local_data(struct batadv_priv *bat_priv,
+ struct batadv_tvlv_tt_data **tt_data,
+ struct batadv_tvlv_tt_change **tt_change,
+ s32 *tt_len)
+{
+ struct batadv_tvlv_tt_vlan_data *tt_vlan;
+ struct batadv_meshif_vlan *vlan;
+ u16 num_vlan = 0;
+ u16 vlan_entries = 0;
+ u16 total_entries = 0;
+ u16 tvlv_len;
+ u8 *tt_change_ptr;
+ int change_offset;
+
+ spin_lock_bh(&bat_priv->meshif_vlan_list_lock);
+ hlist_for_each_entry(vlan, &bat_priv->meshif_vlan_list, list) {
+ vlan_entries = atomic_read(&vlan->tt.num_entries);
+ if (vlan_entries < 1)
+ continue;
+
+ num_vlan++;
+ total_entries += vlan_entries;
+ }
+
+ change_offset = struct_size(*tt_data, vlan_data, num_vlan);
+
+ /* if tt_len is negative, allocate the space needed by the full table */
+ if (*tt_len < 0)
+ *tt_len = batadv_tt_len(total_entries);
+
+ tvlv_len = *tt_len;
+ tvlv_len += change_offset;
+
+ *tt_data = kmalloc(tvlv_len, GFP_ATOMIC);
+ if (!*tt_data) {
+ tvlv_len = 0;
+ goto out;
+ }
+
+ (*tt_data)->flags = BATADV_NO_FLAGS;
+ (*tt_data)->ttvn = atomic_read(&bat_priv->tt.vn);
+ (*tt_data)->num_vlan = htons(num_vlan);
+
+ tt_vlan = (*tt_data)->vlan_data;
+ hlist_for_each_entry(vlan, &bat_priv->meshif_vlan_list, list) {
+ vlan_entries = atomic_read(&vlan->tt.num_entries);
+ if (vlan_entries < 1)
+ continue;
+
+ tt_vlan->vid = htons(vlan->vid);
+ tt_vlan->crc = htonl(vlan->tt.crc);
+ tt_vlan->reserved = 0;
+
+ tt_vlan++;
+ }
+
+ tt_change_ptr = (u8 *)*tt_data + change_offset;
+ *tt_change = (struct batadv_tvlv_tt_change *)tt_change_ptr;
+
+out:
+ spin_unlock_bh(&bat_priv->meshif_vlan_list_lock);
+ return tvlv_len;
+}
+
+/**
+ * batadv_tt_tvlv_container_update() - update the translation table tvlv
+ * container after local tt changes have been committed
+ * @bat_priv: the bat priv with all the mesh interface information
+ */
+static void batadv_tt_tvlv_container_update(struct batadv_priv *bat_priv)
+{
+ struct batadv_tt_change_node *entry, *safe;
+ struct batadv_tvlv_tt_data *tt_data;
+ struct batadv_tvlv_tt_change *tt_change;
+ int tt_diff_len, tt_change_len = 0;
+ int tt_diff_entries_num = 0;
+ int tt_diff_entries_count = 0;
+ bool drop_changes = false;
+ size_t tt_extra_len = 0;
+ u16 tvlv_len;
+
+ tt_diff_entries_num = READ_ONCE(bat_priv->tt.local_changes);
+ tt_diff_len = batadv_tt_len(tt_diff_entries_num);
+
+ /* if we have too many changes for one packet don't send any
+ * and wait for the tt table request so we can reply with the full
+ * (fragmented) table.
+ *
+ * The local change history should still be cleaned up so the next
+ * TT round can start again with a clean state.
+ */
+ if (tt_diff_len > bat_priv->mesh_iface->mtu) {
+ tt_diff_len = 0;
+ tt_diff_entries_num = 0;
+ drop_changes = true;
+ }
+
+ tvlv_len = batadv_tt_prepare_tvlv_local_data(bat_priv, &tt_data,
+ &tt_change, &tt_diff_len);
+ if (!tvlv_len)
+ return;
+
+ tt_data->flags = BATADV_TT_OGM_DIFF;
+
+ if (!drop_changes && tt_diff_len == 0)
+ goto container_register;
+
+ spin_lock_bh(&bat_priv->tt.changes_list_lock);
+ WRITE_ONCE(bat_priv->tt.local_changes, 0);
+
+ list_for_each_entry_safe(entry, safe, &bat_priv->tt.changes_list,
+ list) {
+ if (tt_diff_entries_count < tt_diff_entries_num) {
+ memcpy(tt_change + tt_diff_entries_count,
+ &entry->change,
+ sizeof(struct batadv_tvlv_tt_change));
+ tt_diff_entries_count++;
+ }
+ list_del(&entry->list);
+ kmem_cache_free(batadv_tt_change_cache, entry);
+ }
+ spin_unlock_bh(&bat_priv->tt.changes_list_lock);
+
+ tt_extra_len = batadv_tt_len(tt_diff_entries_num -
+ tt_diff_entries_count);
+
+ /* Keep the buffer for possible tt_request */
+ spin_lock_bh(&bat_priv->tt.last_changeset_lock);
+ kfree(bat_priv->tt.last_changeset);
+ bat_priv->tt.last_changeset_len = 0;
+ bat_priv->tt.last_changeset = NULL;
+ tt_change_len = batadv_tt_len(tt_diff_entries_count);
+ /* check whether this new OGM has no changes due to size problems */
+ if (tt_diff_entries_count > 0) {
+ tt_diff_len -= tt_extra_len;
+ /* if kmalloc() fails we will reply with the full table
+ * instead of providing the diff
+ */
+ bat_priv->tt.last_changeset = kzalloc(tt_diff_len, GFP_ATOMIC);
+ if (bat_priv->tt.last_changeset) {
+ memcpy(bat_priv->tt.last_changeset,
+ tt_change, tt_change_len);
+ bat_priv->tt.last_changeset_len = tt_diff_len;
+ }
+ }
+ spin_unlock_bh(&bat_priv->tt.last_changeset_lock);
+
+ /* Remove extra packet space for OGM */
+ tvlv_len -= tt_extra_len;
+container_register:
+ batadv_tvlv_container_register(bat_priv, BATADV_TVLV_TT, 1, tt_data,
+ tvlv_len);
+ kfree(tt_data);
+}
+
+/**
+ * batadv_tt_local_dump_entry() - Dump one TT local entry into a message
+ * @msg :Netlink message to dump into
+ * @portid: Port making netlink request
+ * @cb: Control block containing additional options
+ * @bat_priv: The bat priv with all the mesh interface information
+ * @common: tt local & tt global common data
+ *
+ * Return: Error code, or 0 on success
+ */
+static int
+batadv_tt_local_dump_entry(struct sk_buff *msg, u32 portid,
+ struct netlink_callback *cb,
+ struct batadv_priv *bat_priv,
+ struct batadv_tt_common_entry *common)
+{
+ void *hdr;
+ struct batadv_meshif_vlan *vlan;
+ struct batadv_tt_local_entry *local;
+ unsigned int last_seen_msecs;
+ u32 crc;
+
+ local = container_of(common, struct batadv_tt_local_entry, common);
+ last_seen_msecs = jiffies_to_msecs(jiffies - local->last_seen);
+
+ vlan = batadv_meshif_vlan_get(bat_priv, common->vid);
+ if (!vlan)
+ return 0;
+
+ crc = vlan->tt.crc;
+
+ batadv_meshif_vlan_put(vlan);
+
+ hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq,
+ &batadv_netlink_family, NLM_F_MULTI,
+ BATADV_CMD_GET_TRANSTABLE_LOCAL);
+ if (!hdr)
+ return -ENOBUFS;
+
+ genl_dump_check_consistent(cb, hdr);
+
+ if (nla_put(msg, BATADV_ATTR_TT_ADDRESS, ETH_ALEN, common->addr) ||
+ nla_put_u32(msg, BATADV_ATTR_TT_CRC32, crc) ||
+ nla_put_u16(msg, BATADV_ATTR_TT_VID, common->vid) ||
+ nla_put_u32(msg, BATADV_ATTR_TT_FLAGS, common->flags))
+ goto nla_put_failure;
+
+ if (!(common->flags & BATADV_TT_CLIENT_NOPURGE) &&
+ nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS, last_seen_msecs))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+ nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+/**
+ * batadv_tt_local_dump_bucket() - Dump one TT local bucket into a message
+ * @msg: Netlink message to dump into
+ * @portid: Port making netlink request
+ * @cb: Control block containing additional options
+ * @bat_priv: The bat priv with all the mesh interface information
+ * @hash: hash to dump
+ * @bucket: bucket index to dump
+ * @idx_s: Number of entries to skip
+ *
+ * Return: Error code, or 0 on success
+ */
+static int
+batadv_tt_local_dump_bucket(struct sk_buff *msg, u32 portid,
+ struct netlink_callback *cb,
+ struct batadv_priv *bat_priv,
+ struct batadv_hashtable *hash, unsigned int bucket,
+ int *idx_s)
+{
+ struct batadv_tt_common_entry *common;
+ int idx = 0;
+
+ spin_lock_bh(&hash->list_locks[bucket]);
+ cb->seq = atomic_read(&hash->generation) << 1 | 1;
+
+ hlist_for_each_entry(common, &hash->table[bucket], hash_entry) {
+ if (idx++ < *idx_s)
+ continue;
+
+ if (batadv_tt_local_dump_entry(msg, portid, cb, bat_priv,
+ common)) {
+ spin_unlock_bh(&hash->list_locks[bucket]);
+ *idx_s = idx - 1;
+ return -EMSGSIZE;
+ }
+ }
+ spin_unlock_bh(&hash->list_locks[bucket]);
+
+ *idx_s = 0;
+ return 0;
+}
+
+/**
+ * batadv_tt_local_dump() - Dump TT local entries into a message
+ * @msg: Netlink message to dump into
+ * @cb: Parameters from query
+ *
+ * Return: Error code, or 0 on success
+ */
+int batadv_tt_local_dump(struct sk_buff *msg, struct netlink_callback *cb)
+{
+ struct net_device *mesh_iface;
+ struct batadv_priv *bat_priv;
+ struct batadv_hard_iface *primary_if = NULL;
+ struct batadv_hashtable *hash;
+ int ret;
+ int bucket = cb->args[0];
+ int idx = cb->args[1];
+ int portid = NETLINK_CB(cb->skb).portid;
+
+ mesh_iface = batadv_netlink_get_meshif(cb);
+ if (IS_ERR(mesh_iface))
+ return PTR_ERR(mesh_iface);
+
+ bat_priv = netdev_priv(mesh_iface);
+
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ hash = bat_priv->tt.local_hash;
+
+ while (bucket < hash->size) {
+ if (batadv_tt_local_dump_bucket(msg, portid, cb, bat_priv,
+ hash, bucket, &idx))
+ break;
+
+ bucket++;
+ }
+
+ ret = msg->len;
+
+ out:
+ batadv_hardif_put(primary_if);
+ dev_put(mesh_iface);
+
+ cb->args[0] = bucket;
+ cb->args[1] = idx;
+
+ return ret;
+}
+
+static void
+batadv_tt_local_set_pending(struct batadv_priv *bat_priv,
+ struct batadv_tt_local_entry *tt_local_entry,
+ u16 flags, const char *message)
+{
+ batadv_tt_local_event(bat_priv, tt_local_entry, flags);
+
+ /* The local client has to be marked as "pending to be removed" but has
+ * to be kept in the table in order to send it in a full table
+ * response issued before the net ttvn increment (consistency check)
+ */
+ tt_local_entry->common.flags |= BATADV_TT_CLIENT_PENDING;
+
+ batadv_dbg(BATADV_DBG_TT, bat_priv,
+ "Local tt entry (%pM, vid: %d) pending to be removed: %s\n",
+ tt_local_entry->common.addr,
+ batadv_print_vid(tt_local_entry->common.vid), message);
+}
+
+/**
+ * batadv_tt_local_remove() - logically remove an entry from the local table
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @addr: the MAC address of the client to remove
+ * @vid: VLAN identifier
+ * @message: message to append to the log on deletion
+ * @roaming: true if the deletion is due to a roaming event
+ *
+ * Return: the flags assigned to the local entry before being deleted
+ */
+u16 batadv_tt_local_remove(struct batadv_priv *bat_priv, const u8 *addr,
+ unsigned short vid, const char *message,
+ bool roaming)
+{
+ struct batadv_tt_local_entry *tt_removed_entry;
+ struct batadv_tt_local_entry *tt_local_entry;
+ u16 flags, curr_flags = BATADV_NO_FLAGS;
+ struct hlist_node *tt_removed_node;
+
+ tt_local_entry = batadv_tt_local_hash_find(bat_priv, addr, vid);
+ if (!tt_local_entry)
+ goto out;
+
+ curr_flags = tt_local_entry->common.flags;
+
+ flags = BATADV_TT_CLIENT_DEL;
+ /* if this global entry addition is due to a roaming, the node has to
+ * mark the local entry as "roamed" in order to correctly reroute
+ * packets later
+ */
+ if (roaming) {
+ flags |= BATADV_TT_CLIENT_ROAM;
+ /* mark the local client as ROAMed */
+ tt_local_entry->common.flags |= BATADV_TT_CLIENT_ROAM;
+ }
+
+ if (!(tt_local_entry->common.flags & BATADV_TT_CLIENT_NEW)) {
+ batadv_tt_local_set_pending(bat_priv, tt_local_entry, flags,
+ message);
+ goto out;
+ }
+ /* if this client has been added right now, it is possible to
+ * immediately purge it
+ */
+ batadv_tt_local_event(bat_priv, tt_local_entry, BATADV_TT_CLIENT_DEL);
+
+ tt_removed_node = batadv_hash_remove(bat_priv->tt.local_hash,
+ batadv_compare_tt,
+ batadv_choose_tt,
+ &tt_local_entry->common);
+ if (!tt_removed_node)
+ goto out;
+
+ /* drop reference of remove hash entry */
+ tt_removed_entry = hlist_entry(tt_removed_node,
+ struct batadv_tt_local_entry,
+ common.hash_entry);
+ batadv_tt_local_entry_put(tt_removed_entry);
+
+out:
+ batadv_tt_local_entry_put(tt_local_entry);
+
+ return curr_flags;
+}
+
+/**
+ * batadv_tt_local_purge_list() - purge inactive tt local entries
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @head: pointer to the list containing the local tt entries
+ * @timeout: parameter deciding whether a given tt local entry is considered
+ * inactive or not
+ */
+static void batadv_tt_local_purge_list(struct batadv_priv *bat_priv,
+ struct hlist_head *head,
+ int timeout)
+{
+ struct batadv_tt_local_entry *tt_local_entry;
+ struct batadv_tt_common_entry *tt_common_entry;
+ struct hlist_node *node_tmp;
+
+ hlist_for_each_entry_safe(tt_common_entry, node_tmp, head,
+ hash_entry) {
+ tt_local_entry = container_of(tt_common_entry,
+ struct batadv_tt_local_entry,
+ common);
+ if (tt_local_entry->common.flags & BATADV_TT_CLIENT_NOPURGE)
+ continue;
+
+ /* entry already marked for deletion */
+ if (tt_local_entry->common.flags & BATADV_TT_CLIENT_PENDING)
+ continue;
+
+ if (!batadv_has_timed_out(tt_local_entry->last_seen, timeout))
+ continue;
+
+ batadv_tt_local_set_pending(bat_priv, tt_local_entry,
+ BATADV_TT_CLIENT_DEL, "timed out");
+ }
+}
+
+/**
+ * batadv_tt_local_purge() - purge inactive tt local entries
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @timeout: parameter deciding whether a given tt local entry is considered
+ * inactive or not
+ */
+static void batadv_tt_local_purge(struct batadv_priv *bat_priv,
+ int timeout)
+{
+ struct batadv_hashtable *hash = bat_priv->tt.local_hash;
+ struct hlist_head *head;
+ spinlock_t *list_lock; /* protects write access to the hash lists */
+ u32 i;
+
+ for (i = 0; i < hash->size; i++) {
+ head = &hash->table[i];
+ list_lock = &hash->list_locks[i];
+
+ spin_lock_bh(list_lock);
+ batadv_tt_local_purge_list(bat_priv, head, timeout);
+ spin_unlock_bh(list_lock);
+ }
+}
+
+static void batadv_tt_local_table_free(struct batadv_priv *bat_priv)
+{
+ struct batadv_hashtable *hash;
+ spinlock_t *list_lock; /* protects write access to the hash lists */
+ struct batadv_tt_common_entry *tt_common_entry;
+ struct batadv_tt_local_entry *tt_local;
+ struct hlist_node *node_tmp;
+ struct hlist_head *head;
+ u32 i;
+
+ if (!bat_priv->tt.local_hash)
+ return;
+
+ hash = bat_priv->tt.local_hash;
+
+ for (i = 0; i < hash->size; i++) {
+ head = &hash->table[i];
+ list_lock = &hash->list_locks[i];
+
+ spin_lock_bh(list_lock);
+ hlist_for_each_entry_safe(tt_common_entry, node_tmp,
+ head, hash_entry) {
+ hlist_del_rcu(&tt_common_entry->hash_entry);
+ tt_local = container_of(tt_common_entry,
+ struct batadv_tt_local_entry,
+ common);
+
+ batadv_tt_local_entry_put(tt_local);
+ }
+ spin_unlock_bh(list_lock);
+ }
+
+ batadv_hash_destroy(hash);
+
+ bat_priv->tt.local_hash = NULL;
+}
+
+static int batadv_tt_global_init(struct batadv_priv *bat_priv)
+{
+ if (bat_priv->tt.global_hash)
+ return 0;
+
+ bat_priv->tt.global_hash = batadv_hash_new(1024);
+
+ if (!bat_priv->tt.global_hash)
+ return -ENOMEM;
+
+ batadv_hash_set_lock_class(bat_priv->tt.global_hash,
+ &batadv_tt_global_hash_lock_class_key);
+
+ return 0;
+}
+
+static void batadv_tt_changes_list_free(struct batadv_priv *bat_priv)
+{
+ struct batadv_tt_change_node *entry, *safe;
+
+ spin_lock_bh(&bat_priv->tt.changes_list_lock);
+
+ list_for_each_entry_safe(entry, safe, &bat_priv->tt.changes_list,
+ list) {
+ list_del(&entry->list);
+ kmem_cache_free(batadv_tt_change_cache, entry);
+ }
+
+ WRITE_ONCE(bat_priv->tt.local_changes, 0);
+ spin_unlock_bh(&bat_priv->tt.changes_list_lock);
+}
+
+/**
+ * batadv_tt_global_orig_entry_find() - find a TT orig_list_entry
+ * @entry: the TT global entry where the orig_list_entry has to be
+ * extracted from
+ * @orig_node: the originator for which the orig_list_entry has to be found
+ *
+ * retrieve the orig_tt_list_entry belonging to orig_node from the
+ * batadv_tt_global_entry list
+ *
+ * Return: it with an increased refcounter, NULL if not found
+ */
+static struct batadv_tt_orig_list_entry *
+batadv_tt_global_orig_entry_find(const struct batadv_tt_global_entry *entry,
+ const struct batadv_orig_node *orig_node)
+{
+ struct batadv_tt_orig_list_entry *tmp_orig_entry, *orig_entry = NULL;
+ const struct hlist_head *head;
+
+ rcu_read_lock();
+ head = &entry->orig_list;
+ hlist_for_each_entry_rcu(tmp_orig_entry, head, list) {
+ if (tmp_orig_entry->orig_node != orig_node)
+ continue;
+ if (!kref_get_unless_zero(&tmp_orig_entry->refcount))
+ continue;
+
+ orig_entry = tmp_orig_entry;
+ break;
+ }
+ rcu_read_unlock();
+
+ return orig_entry;
+}
+
+/**
+ * batadv_tt_global_entry_has_orig() - check if a TT global entry is also
+ * handled by a given originator
+ * @entry: the TT global entry to check
+ * @orig_node: the originator to search in the list
+ * @flags: a pointer to store TT flags for the given @entry received
+ * from @orig_node
+ *
+ * find out if an orig_node is already in the list of a tt_global_entry.
+ *
+ * Return: true if found, false otherwise
+ */
+static bool
+batadv_tt_global_entry_has_orig(const struct batadv_tt_global_entry *entry,
+ const struct batadv_orig_node *orig_node,
+ u8 *flags)
+{
+ struct batadv_tt_orig_list_entry *orig_entry;
+ bool found = false;
+
+ orig_entry = batadv_tt_global_orig_entry_find(entry, orig_node);
+ if (orig_entry) {
+ found = true;
+
+ if (flags)
+ *flags = orig_entry->flags;
+
+ batadv_tt_orig_list_entry_put(orig_entry);
+ }
+
+ return found;
+}
+
+/**
+ * batadv_tt_global_sync_flags() - update TT sync flags
+ * @tt_global: the TT global entry to update sync flags in
+ *
+ * Updates the sync flag bits in the tt_global flag attribute with a logical
+ * OR of all sync flags from any of its TT orig entries.
+ */
+static void
+batadv_tt_global_sync_flags(struct batadv_tt_global_entry *tt_global)
+{
+ struct batadv_tt_orig_list_entry *orig_entry;
+ const struct hlist_head *head;
+ u16 flags = BATADV_NO_FLAGS;
+
+ rcu_read_lock();
+ head = &tt_global->orig_list;
+ hlist_for_each_entry_rcu(orig_entry, head, list)
+ flags |= orig_entry->flags;
+ rcu_read_unlock();
+
+ flags |= tt_global->common.flags & (~BATADV_TT_SYNC_MASK);
+ tt_global->common.flags = flags;
+}
+
+/**
+ * batadv_tt_global_orig_entry_add() - add or update a TT orig entry
+ * @tt_global: the TT global entry to add an orig entry in
+ * @orig_node: the originator to add an orig entry for
+ * @ttvn: translation table version number of this changeset
+ * @flags: TT sync flags
+ */
+static void
+batadv_tt_global_orig_entry_add(struct batadv_tt_global_entry *tt_global,
+ struct batadv_orig_node *orig_node, int ttvn,
+ u8 flags)
+{
+ struct batadv_tt_orig_list_entry *orig_entry;
+
+ spin_lock_bh(&tt_global->list_lock);
+
+ orig_entry = batadv_tt_global_orig_entry_find(tt_global, orig_node);
+ if (orig_entry) {
+ /* refresh the ttvn: the current value could be a bogus one that
+ * was added during a "temporary client detection"
+ */
+ orig_entry->ttvn = ttvn;
+ orig_entry->flags = flags;
+ goto sync_flags;
+ }
+
+ orig_entry = kmem_cache_zalloc(batadv_tt_orig_cache, GFP_ATOMIC);
+ if (!orig_entry)
+ goto out;
+
+ INIT_HLIST_NODE(&orig_entry->list);
+ kref_get(&orig_node->refcount);
+ batadv_tt_global_size_inc(orig_node, tt_global->common.vid);
+ orig_entry->orig_node = orig_node;
+ orig_entry->ttvn = ttvn;
+ orig_entry->flags = flags;
+ kref_init(&orig_entry->refcount);
+
+ kref_get(&orig_entry->refcount);
+ hlist_add_head_rcu(&orig_entry->list,
+ &tt_global->orig_list);
+ atomic_inc(&tt_global->orig_list_count);
+
+sync_flags:
+ batadv_tt_global_sync_flags(tt_global);
+out:
+ batadv_tt_orig_list_entry_put(orig_entry);
+
+ spin_unlock_bh(&tt_global->list_lock);
+}
+
+/**
+ * batadv_tt_global_add() - add a new TT global entry or update an existing one
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @orig_node: the originator announcing the client
+ * @tt_addr: the mac address of the non-mesh client
+ * @vid: VLAN identifier
+ * @flags: TT flags that have to be set for this non-mesh client
+ * @ttvn: the tt version number ever announcing this non-mesh client
+ *
+ * Add a new TT global entry for the given originator. If the entry already
+ * exists add a new reference to the given originator (a global entry can have
+ * references to multiple originators) and adjust the flags attribute to reflect
+ * the function argument.
+ * If a TT local entry exists for this non-mesh client remove it.
+ *
+ * The caller must hold the orig_node refcount.
+ *
+ * Return: true if the new entry has been added, false otherwise
+ */
+static bool batadv_tt_global_add(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node,
+ const unsigned char *tt_addr,
+ unsigned short vid, u16 flags, u8 ttvn)
+{
+ struct batadv_tt_global_entry *tt_global_entry;
+ struct batadv_tt_local_entry *tt_local_entry;
+ bool ret = false;
+ int hash_added;
+ struct batadv_tt_common_entry *common;
+ u16 local_flags;
+
+ /* ignore global entries from backbone nodes */
+ if (batadv_bla_is_backbone_gw_orig(bat_priv, orig_node->orig, vid))
+ return true;
+
+ tt_global_entry = batadv_tt_global_hash_find(bat_priv, tt_addr, vid);
+ tt_local_entry = batadv_tt_local_hash_find(bat_priv, tt_addr, vid);
+
+ /* if the node already has a local client for this entry, it has to wait
+ * for a roaming advertisement instead of manually messing up the global
+ * table
+ */
+ if ((flags & BATADV_TT_CLIENT_TEMP) && tt_local_entry &&
+ !(tt_local_entry->common.flags & BATADV_TT_CLIENT_NEW))
+ goto out;
+
+ if (!tt_global_entry) {
+ tt_global_entry = kmem_cache_zalloc(batadv_tg_cache,
+ GFP_ATOMIC);
+ if (!tt_global_entry)
+ goto out;
+
+ common = &tt_global_entry->common;
+ ether_addr_copy(common->addr, tt_addr);
+ common->vid = vid;
+
+ if (!is_multicast_ether_addr(common->addr))
+ common->flags = flags & (~BATADV_TT_SYNC_MASK);
+
+ tt_global_entry->roam_at = 0;
+ /* node must store current time in case of roaming. This is
+ * needed to purge this entry out on timeout (if nobody claims
+ * it)
+ */
+ if (flags & BATADV_TT_CLIENT_ROAM)
+ tt_global_entry->roam_at = jiffies;
+ kref_init(&common->refcount);
+ common->added_at = jiffies;
+
+ INIT_HLIST_HEAD(&tt_global_entry->orig_list);
+ atomic_set(&tt_global_entry->orig_list_count, 0);
+ spin_lock_init(&tt_global_entry->list_lock);
+
+ kref_get(&common->refcount);
+ hash_added = batadv_hash_add(bat_priv->tt.global_hash,
+ batadv_compare_tt,
+ batadv_choose_tt, common,
+ &common->hash_entry);
+
+ if (unlikely(hash_added != 0)) {
+ /* remove the reference for the hash */
+ batadv_tt_global_entry_put(tt_global_entry);
+ goto out_remove;
+ }
+ } else {
+ common = &tt_global_entry->common;
+ /* If there is already a global entry, we can use this one for
+ * our processing.
+ * But if we are trying to add a temporary client then here are
+ * two options at this point:
+ * 1) the global client is not a temporary client: the global
+ * client has to be left as it is, temporary information
+ * should never override any already known client state
+ * 2) the global client is a temporary client: purge the
+ * originator list and add the new one orig_entry
+ */
+ if (flags & BATADV_TT_CLIENT_TEMP) {
+ if (!(common->flags & BATADV_TT_CLIENT_TEMP))
+ goto out;
+ if (batadv_tt_global_entry_has_orig(tt_global_entry,
+ orig_node, NULL))
+ goto out_remove;
+ batadv_tt_global_del_orig_list(tt_global_entry);
+ goto add_orig_entry;
+ }
+
+ /* if the client was temporary added before receiving the first
+ * OGM announcing it, we have to clear the TEMP flag. Also,
+ * remove the previous temporary orig node and re-add it
+ * if required. If the orig entry changed, the new one which
+ * is a non-temporary entry is preferred.
+ */
+ if (common->flags & BATADV_TT_CLIENT_TEMP) {
+ batadv_tt_global_del_orig_list(tt_global_entry);
+ common->flags &= ~BATADV_TT_CLIENT_TEMP;
+ }
+
+ /* the change can carry possible "attribute" flags like the
+ * TT_CLIENT_TEMP, therefore they have to be copied in the
+ * client entry
+ */
+ if (!is_multicast_ether_addr(common->addr))
+ common->flags |= flags & (~BATADV_TT_SYNC_MASK);
+
+ /* If there is the BATADV_TT_CLIENT_ROAM flag set, there is only
+ * one originator left in the list and we previously received a
+ * delete + roaming change for this originator.
+ *
+ * We should first delete the old originator before adding the
+ * new one.
+ */
+ if (common->flags & BATADV_TT_CLIENT_ROAM) {
+ batadv_tt_global_del_orig_list(tt_global_entry);
+ common->flags &= ~BATADV_TT_CLIENT_ROAM;
+ tt_global_entry->roam_at = 0;
+ }
+ }
+add_orig_entry:
+ /* add the new orig_entry (if needed) or update it */
+ batadv_tt_global_orig_entry_add(tt_global_entry, orig_node, ttvn,
+ flags & BATADV_TT_SYNC_MASK);
+
+ batadv_dbg(BATADV_DBG_TT, bat_priv,
+ "Creating new global tt entry: %pM (vid: %d, via %pM)\n",
+ common->addr, batadv_print_vid(common->vid),
+ orig_node->orig);
+ ret = true;
+
+out_remove:
+ /* Do not remove multicast addresses from the local hash on
+ * global additions
+ */
+ if (is_multicast_ether_addr(tt_addr))
+ goto out;
+
+ /* remove address from local hash if present */
+ local_flags = batadv_tt_local_remove(bat_priv, tt_addr, vid,
+ "global tt received",
+ flags & BATADV_TT_CLIENT_ROAM);
+ tt_global_entry->common.flags |= local_flags & BATADV_TT_CLIENT_WIFI;
+
+ if (!(flags & BATADV_TT_CLIENT_ROAM))
+ /* this is a normal global add. Therefore the client is not in a
+ * roaming state anymore.
+ */
+ tt_global_entry->common.flags &= ~BATADV_TT_CLIENT_ROAM;
+
+out:
+ batadv_tt_global_entry_put(tt_global_entry);
+ batadv_tt_local_entry_put(tt_local_entry);
+ return ret;
+}
+
+/**
+ * batadv_transtable_best_orig() - Get best originator list entry from tt entry
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @tt_global_entry: global translation table entry to be analyzed
+ *
+ * This function assumes the caller holds rcu_read_lock().
+ * Return: best originator list entry or NULL on errors.
+ */
+static struct batadv_tt_orig_list_entry *
+batadv_transtable_best_orig(struct batadv_priv *bat_priv,
+ struct batadv_tt_global_entry *tt_global_entry)
+{
+ struct batadv_neigh_node *router, *best_router = NULL;
+ struct batadv_algo_ops *bao = bat_priv->algo_ops;
+ struct hlist_head *head;
+ struct batadv_tt_orig_list_entry *orig_entry, *best_entry = NULL;
+
+ head = &tt_global_entry->orig_list;
+ hlist_for_each_entry_rcu(orig_entry, head, list) {
+ router = batadv_orig_router_get(orig_entry->orig_node,
+ BATADV_IF_DEFAULT);
+ if (!router)
+ continue;
+
+ if (best_router &&
+ bao->neigh.cmp(router, BATADV_IF_DEFAULT, best_router,
+ BATADV_IF_DEFAULT) <= 0) {
+ batadv_neigh_node_put(router);
+ continue;
+ }
+
+ /* release the refcount for the "old" best */
+ batadv_neigh_node_put(best_router);
+
+ best_entry = orig_entry;
+ best_router = router;
+ }
+
+ batadv_neigh_node_put(best_router);
+
+ return best_entry;
+}
+
+/**
+ * batadv_tt_global_dump_subentry() - Dump all TT local entries into a message
+ * @msg: Netlink message to dump into
+ * @portid: Port making netlink request
+ * @seq: Sequence number of netlink message
+ * @common: tt local & tt global common data
+ * @orig: Originator node announcing a non-mesh client
+ * @best: Is the best originator for the TT entry
+ *
+ * Return: Error code, or 0 on success
+ */
+static int
+batadv_tt_global_dump_subentry(struct sk_buff *msg, u32 portid, u32 seq,
+ struct batadv_tt_common_entry *common,
+ struct batadv_tt_orig_list_entry *orig,
+ bool best)
+{
+ u16 flags = (common->flags & (~BATADV_TT_SYNC_MASK)) | orig->flags;
+ void *hdr;
+ struct batadv_orig_node_vlan *vlan;
+ u8 last_ttvn;
+ u32 crc;
+
+ vlan = batadv_orig_node_vlan_get(orig->orig_node,
+ common->vid);
+ if (!vlan)
+ return 0;
+
+ crc = vlan->tt.crc;
+
+ batadv_orig_node_vlan_put(vlan);
+
+ hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family,
+ NLM_F_MULTI,
+ BATADV_CMD_GET_TRANSTABLE_GLOBAL);
+ if (!hdr)
+ return -ENOBUFS;
+
+ last_ttvn = atomic_read(&orig->orig_node->last_ttvn);
+
+ if (nla_put(msg, BATADV_ATTR_TT_ADDRESS, ETH_ALEN, common->addr) ||
+ nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN,
+ orig->orig_node->orig) ||
+ nla_put_u8(msg, BATADV_ATTR_TT_TTVN, orig->ttvn) ||
+ nla_put_u8(msg, BATADV_ATTR_TT_LAST_TTVN, last_ttvn) ||
+ nla_put_u32(msg, BATADV_ATTR_TT_CRC32, crc) ||
+ nla_put_u16(msg, BATADV_ATTR_TT_VID, common->vid) ||
+ nla_put_u32(msg, BATADV_ATTR_TT_FLAGS, flags))
+ goto nla_put_failure;
+
+ if (best && nla_put_flag(msg, BATADV_ATTR_FLAG_BEST))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+ nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+/**
+ * batadv_tt_global_dump_entry() - Dump one TT global entry into a message
+ * @msg: Netlink message to dump into
+ * @portid: Port making netlink request
+ * @seq: Sequence number of netlink message
+ * @bat_priv: The bat priv with all the mesh interface information
+ * @common: tt local & tt global common data
+ * @sub_s: Number of entries to skip
+ *
+ * This function assumes the caller holds rcu_read_lock().
+ *
+ * Return: Error code, or 0 on success
+ */
+static int
+batadv_tt_global_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
+ struct batadv_priv *bat_priv,
+ struct batadv_tt_common_entry *common, int *sub_s)
+{
+ struct batadv_tt_orig_list_entry *orig_entry, *best_entry;
+ struct batadv_tt_global_entry *global;
+ struct hlist_head *head;
+ int sub = 0;
+ bool best;
+
+ global = container_of(common, struct batadv_tt_global_entry, common);
+ best_entry = batadv_transtable_best_orig(bat_priv, global);
+ head = &global->orig_list;
+
+ hlist_for_each_entry_rcu(orig_entry, head, list) {
+ if (sub++ < *sub_s)
+ continue;
+
+ best = (orig_entry == best_entry);
+
+ if (batadv_tt_global_dump_subentry(msg, portid, seq, common,
+ orig_entry, best)) {
+ *sub_s = sub - 1;
+ return -EMSGSIZE;
+ }
+ }
+
+ *sub_s = 0;
+ return 0;
+}
+
+/**
+ * batadv_tt_global_dump_bucket() - Dump one TT local bucket into a message
+ * @msg: Netlink message to dump into
+ * @portid: Port making netlink request
+ * @seq: Sequence number of netlink message
+ * @bat_priv: The bat priv with all the mesh interface information
+ * @head: Pointer to the list containing the global tt entries
+ * @idx_s: Number of entries to skip
+ * @sub: Number of entries to skip
+ *
+ * Return: Error code, or 0 on success
+ */
+static int
+batadv_tt_global_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq,
+ struct batadv_priv *bat_priv,
+ struct hlist_head *head, int *idx_s, int *sub)
+{
+ struct batadv_tt_common_entry *common;
+ int idx = 0;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(common, head, hash_entry) {
+ if (idx++ < *idx_s)
+ continue;
+
+ if (batadv_tt_global_dump_entry(msg, portid, seq, bat_priv,
+ common, sub)) {
+ rcu_read_unlock();
+ *idx_s = idx - 1;
+ return -EMSGSIZE;
+ }
+ }
+ rcu_read_unlock();
+
+ *idx_s = 0;
+ *sub = 0;
+ return 0;
+}
+
+/**
+ * batadv_tt_global_dump() - Dump TT global entries into a message
+ * @msg: Netlink message to dump into
+ * @cb: Parameters from query
+ *
+ * Return: Error code, or length of message on success
+ */
+int batadv_tt_global_dump(struct sk_buff *msg, struct netlink_callback *cb)
+{
+ struct net_device *mesh_iface;
+ struct batadv_priv *bat_priv;
+ struct batadv_hard_iface *primary_if = NULL;
+ struct batadv_hashtable *hash;
+ struct hlist_head *head;
+ int ret;
+ int bucket = cb->args[0];
+ int idx = cb->args[1];
+ int sub = cb->args[2];
+ int portid = NETLINK_CB(cb->skb).portid;
+
+ mesh_iface = batadv_netlink_get_meshif(cb);
+ if (IS_ERR(mesh_iface))
+ return PTR_ERR(mesh_iface);
+
+ bat_priv = netdev_priv(mesh_iface);
+
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ hash = bat_priv->tt.global_hash;
+
+ while (bucket < hash->size) {
+ head = &hash->table[bucket];
+
+ if (batadv_tt_global_dump_bucket(msg, portid,
+ cb->nlh->nlmsg_seq, bat_priv,
+ head, &idx, &sub))
+ break;
+
+ bucket++;
+ }
+
+ ret = msg->len;
+
+ out:
+ batadv_hardif_put(primary_if);
+ dev_put(mesh_iface);
+
+ cb->args[0] = bucket;
+ cb->args[1] = idx;
+ cb->args[2] = sub;
+
+ return ret;
+}
+
+/**
+ * _batadv_tt_global_del_orig_entry() - remove and free an orig_entry
+ * @tt_global_entry: the global entry to remove the orig_entry from
+ * @orig_entry: the orig entry to remove and free
+ *
+ * Remove an orig_entry from its list in the given tt_global_entry and
+ * free this orig_entry afterwards.
+ *
+ * Caller must hold tt_global_entry->list_lock and ensure orig_entry->list is
+ * part of a list.
+ */
+static void
+_batadv_tt_global_del_orig_entry(struct batadv_tt_global_entry *tt_global_entry,
+ struct batadv_tt_orig_list_entry *orig_entry)
+{
+ lockdep_assert_held(&tt_global_entry->list_lock);
+
+ batadv_tt_global_size_dec(orig_entry->orig_node,
+ tt_global_entry->common.vid);
+ atomic_dec(&tt_global_entry->orig_list_count);
+ /* requires holding tt_global_entry->list_lock and orig_entry->list
+ * being part of a list
+ */
+ hlist_del_rcu(&orig_entry->list);
+ batadv_tt_orig_list_entry_put(orig_entry);
+}
+
+/* deletes the orig list of a tt_global_entry */
+static void
+batadv_tt_global_del_orig_list(struct batadv_tt_global_entry *tt_global_entry)
+{
+ struct hlist_head *head;
+ struct hlist_node *safe;
+ struct batadv_tt_orig_list_entry *orig_entry;
+
+ spin_lock_bh(&tt_global_entry->list_lock);
+ head = &tt_global_entry->orig_list;
+ hlist_for_each_entry_safe(orig_entry, safe, head, list)
+ _batadv_tt_global_del_orig_entry(tt_global_entry, orig_entry);
+ spin_unlock_bh(&tt_global_entry->list_lock);
+}
+
+/**
+ * batadv_tt_global_del_orig_node() - remove orig_node from a global tt entry
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @tt_global_entry: the global entry to remove the orig_node from
+ * @orig_node: the originator announcing the client
+ * @message: message to append to the log on deletion
+ *
+ * Remove the given orig_node and its according orig_entry from the given
+ * global tt entry.
+ */
+static void
+batadv_tt_global_del_orig_node(struct batadv_priv *bat_priv,
+ struct batadv_tt_global_entry *tt_global_entry,
+ struct batadv_orig_node *orig_node,
+ const char *message)
+{
+ struct hlist_head *head;
+ struct hlist_node *safe;
+ struct batadv_tt_orig_list_entry *orig_entry;
+ unsigned short vid;
+
+ spin_lock_bh(&tt_global_entry->list_lock);
+ head = &tt_global_entry->orig_list;
+ hlist_for_each_entry_safe(orig_entry, safe, head, list) {
+ if (orig_entry->orig_node == orig_node) {
+ vid = tt_global_entry->common.vid;
+ batadv_dbg(BATADV_DBG_TT, bat_priv,
+ "Deleting %pM from global tt entry %pM (vid: %d): %s\n",
+ orig_node->orig,
+ tt_global_entry->common.addr,
+ batadv_print_vid(vid), message);
+ _batadv_tt_global_del_orig_entry(tt_global_entry,
+ orig_entry);
+ }
+ }
+ spin_unlock_bh(&tt_global_entry->list_lock);
+}
+
+/* If the client is to be deleted, we check if it is the last origantor entry
+ * within tt_global entry. If yes, we set the BATADV_TT_CLIENT_ROAM flag and the
+ * timer, otherwise we simply remove the originator scheduled for deletion.
+ */
+static void
+batadv_tt_global_del_roaming(struct batadv_priv *bat_priv,
+ struct batadv_tt_global_entry *tt_global_entry,
+ struct batadv_orig_node *orig_node,
+ const char *message)
+{
+ bool last_entry = true;
+ struct hlist_head *head;
+ struct batadv_tt_orig_list_entry *orig_entry;
+
+ /* no local entry exists, case 1:
+ * Check if this is the last one or if other entries exist.
+ */
+
+ rcu_read_lock();
+ head = &tt_global_entry->orig_list;
+ hlist_for_each_entry_rcu(orig_entry, head, list) {
+ if (orig_entry->orig_node != orig_node) {
+ last_entry = false;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ if (last_entry) {
+ /* its the last one, mark for roaming. */
+ tt_global_entry->common.flags |= BATADV_TT_CLIENT_ROAM;
+ tt_global_entry->roam_at = jiffies;
+ } else {
+ /* there is another entry, we can simply delete this
+ * one and can still use the other one.
+ */
+ batadv_tt_global_del_orig_node(bat_priv, tt_global_entry,
+ orig_node, message);
+ }
+}
+
+/**
+ * batadv_tt_global_del() - remove a client from the global table
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @orig_node: an originator serving this client
+ * @addr: the mac address of the client
+ * @vid: VLAN identifier
+ * @message: a message explaining the reason for deleting the client to print
+ * for debugging purpose
+ * @roaming: true if the deletion has been triggered by a roaming event
+ */
+static void batadv_tt_global_del(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node,
+ const unsigned char *addr, unsigned short vid,
+ const char *message, bool roaming)
+{
+ struct batadv_tt_global_entry *tt_global_entry;
+ struct batadv_tt_local_entry *local_entry = NULL;
+
+ tt_global_entry = batadv_tt_global_hash_find(bat_priv, addr, vid);
+ if (!tt_global_entry)
+ goto out;
+
+ if (!roaming) {
+ batadv_tt_global_del_orig_node(bat_priv, tt_global_entry,
+ orig_node, message);
+
+ if (hlist_empty(&tt_global_entry->orig_list))
+ batadv_tt_global_free(bat_priv, tt_global_entry,
+ message);
+
+ goto out;
+ }
+
+ /* if we are deleting a global entry due to a roam
+ * event, there are two possibilities:
+ * 1) the client roamed from node A to node B => if there
+ * is only one originator left for this client, we mark
+ * it with BATADV_TT_CLIENT_ROAM, we start a timer and we
+ * wait for node B to claim it. In case of timeout
+ * the entry is purged.
+ *
+ * If there are other originators left, we directly delete
+ * the originator.
+ * 2) the client roamed to us => we can directly delete
+ * the global entry, since it is useless now.
+ */
+ local_entry = batadv_tt_local_hash_find(bat_priv,
+ tt_global_entry->common.addr,
+ vid);
+ if (local_entry) {
+ /* local entry exists, case 2: client roamed to us. */
+ batadv_tt_global_del_orig_list(tt_global_entry);
+ batadv_tt_global_free(bat_priv, tt_global_entry, message);
+ } else {
+ /* no local entry exists, case 1: check for roaming */
+ batadv_tt_global_del_roaming(bat_priv, tt_global_entry,
+ orig_node, message);
+ }
+
+out:
+ batadv_tt_global_entry_put(tt_global_entry);
+ batadv_tt_local_entry_put(local_entry);
+}
+
+/**
+ * batadv_tt_global_del_orig() - remove all the TT global entries belonging to
+ * the given originator matching the provided vid
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @orig_node: the originator owning the entries to remove
+ * @match_vid: the VLAN identifier to match. If negative all the entries will be
+ * removed
+ * @message: debug message to print as "reason"
+ */
+void batadv_tt_global_del_orig(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node,
+ s32 match_vid,
+ const char *message)
+{
+ struct batadv_tt_global_entry *tt_global;
+ struct batadv_tt_common_entry *tt_common_entry;
+ u32 i;
+ struct batadv_hashtable *hash = bat_priv->tt.global_hash;
+ struct hlist_node *safe;
+ struct hlist_head *head;
+ spinlock_t *list_lock; /* protects write access to the hash lists */
+ unsigned short vid;
+
+ if (!hash)
+ return;
+
+ for (i = 0; i < hash->size; i++) {
+ head = &hash->table[i];
+ list_lock = &hash->list_locks[i];
+
+ spin_lock_bh(list_lock);
+ hlist_for_each_entry_safe(tt_common_entry, safe,
+ head, hash_entry) {
+ /* remove only matching entries */
+ if (match_vid >= 0 && tt_common_entry->vid != match_vid)
+ continue;
+
+ tt_global = container_of(tt_common_entry,
+ struct batadv_tt_global_entry,
+ common);
+
+ batadv_tt_global_del_orig_node(bat_priv, tt_global,
+ orig_node, message);
+
+ if (hlist_empty(&tt_global->orig_list)) {
+ vid = tt_global->common.vid;
+ batadv_dbg(BATADV_DBG_TT, bat_priv,
+ "Deleting global tt entry %pM (vid: %d): %s\n",
+ tt_global->common.addr,
+ batadv_print_vid(vid), message);
+ hlist_del_rcu(&tt_common_entry->hash_entry);
+ batadv_tt_global_entry_put(tt_global);
+ }
+ }
+ spin_unlock_bh(list_lock);
+ }
+ clear_bit(BATADV_ORIG_CAPA_HAS_TT, &orig_node->capa_initialized);
+}
+
+static bool batadv_tt_global_to_purge(struct batadv_tt_global_entry *tt_global,
+ char **msg)
+{
+ bool purge = false;
+ unsigned long roam_timeout = BATADV_TT_CLIENT_ROAM_TIMEOUT;
+ unsigned long temp_timeout = BATADV_TT_CLIENT_TEMP_TIMEOUT;
+
+ if ((tt_global->common.flags & BATADV_TT_CLIENT_ROAM) &&
+ batadv_has_timed_out(tt_global->roam_at, roam_timeout)) {
+ purge = true;
+ *msg = "Roaming timeout\n";
+ }
+
+ if ((tt_global->common.flags & BATADV_TT_CLIENT_TEMP) &&
+ batadv_has_timed_out(tt_global->common.added_at, temp_timeout)) {
+ purge = true;
+ *msg = "Temporary client timeout\n";
+ }
+
+ return purge;
+}
+
+static void batadv_tt_global_purge(struct batadv_priv *bat_priv)
+{
+ struct batadv_hashtable *hash = bat_priv->tt.global_hash;
+ struct hlist_head *head;
+ struct hlist_node *node_tmp;
+ spinlock_t *list_lock; /* protects write access to the hash lists */
+ u32 i;
+ char *msg = NULL;
+ struct batadv_tt_common_entry *tt_common;
+ struct batadv_tt_global_entry *tt_global;
+
+ for (i = 0; i < hash->size; i++) {
+ head = &hash->table[i];
+ list_lock = &hash->list_locks[i];
+
+ spin_lock_bh(list_lock);
+ hlist_for_each_entry_safe(tt_common, node_tmp, head,
+ hash_entry) {
+ tt_global = container_of(tt_common,
+ struct batadv_tt_global_entry,
+ common);
+
+ if (!batadv_tt_global_to_purge(tt_global, &msg))
+ continue;
+
+ batadv_dbg(BATADV_DBG_TT, bat_priv,
+ "Deleting global tt entry %pM (vid: %d): %s\n",
+ tt_global->common.addr,
+ batadv_print_vid(tt_global->common.vid),
+ msg);
+
+ hlist_del_rcu(&tt_common->hash_entry);
+
+ batadv_tt_global_entry_put(tt_global);
+ }
+ spin_unlock_bh(list_lock);
+ }
+}
+
+static void batadv_tt_global_table_free(struct batadv_priv *bat_priv)
+{
+ struct batadv_hashtable *hash;
+ spinlock_t *list_lock; /* protects write access to the hash lists */
+ struct batadv_tt_common_entry *tt_common_entry;
+ struct batadv_tt_global_entry *tt_global;
+ struct hlist_node *node_tmp;
+ struct hlist_head *head;
+ u32 i;
+
+ if (!bat_priv->tt.global_hash)
+ return;
+
+ hash = bat_priv->tt.global_hash;
+
+ for (i = 0; i < hash->size; i++) {
+ head = &hash->table[i];
+ list_lock = &hash->list_locks[i];
+
+ spin_lock_bh(list_lock);
+ hlist_for_each_entry_safe(tt_common_entry, node_tmp,
+ head, hash_entry) {
+ hlist_del_rcu(&tt_common_entry->hash_entry);
+ tt_global = container_of(tt_common_entry,
+ struct batadv_tt_global_entry,
+ common);
+ batadv_tt_global_entry_put(tt_global);
+ }
+ spin_unlock_bh(list_lock);
+ }
+
+ batadv_hash_destroy(hash);
+
+ bat_priv->tt.global_hash = NULL;
+}
+
+static bool
+_batadv_is_ap_isolated(struct batadv_tt_local_entry *tt_local_entry,
+ struct batadv_tt_global_entry *tt_global_entry)
+{
+ if (tt_local_entry->common.flags & BATADV_TT_CLIENT_WIFI &&
+ tt_global_entry->common.flags & BATADV_TT_CLIENT_WIFI)
+ return true;
+
+ /* check if the two clients are marked as isolated */
+ if (tt_local_entry->common.flags & BATADV_TT_CLIENT_ISOLA &&
+ tt_global_entry->common.flags & BATADV_TT_CLIENT_ISOLA)
+ return true;
+
+ return false;
+}
+
+/**
+ * batadv_transtable_search() - get the mesh destination for a given client
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @src: mac address of the source client
+ * @addr: mac address of the destination client
+ * @vid: VLAN identifier
+ *
+ * Return: a pointer to the originator that was selected as destination in the
+ * mesh for contacting the client 'addr', NULL otherwise.
+ * In case of multiple originators serving the same client, the function returns
+ * the best one (best in terms of metric towards the destination node).
+ *
+ * If the two clients are AP isolated the function returns NULL.
+ */
+struct batadv_orig_node *batadv_transtable_search(struct batadv_priv *bat_priv,
+ const u8 *src,
+ const u8 *addr,
+ unsigned short vid)
+{
+ struct batadv_tt_local_entry *tt_local_entry = NULL;
+ struct batadv_tt_global_entry *tt_global_entry = NULL;
+ struct batadv_orig_node *orig_node = NULL;
+ struct batadv_tt_orig_list_entry *best_entry;
+
+ if (src && batadv_vlan_ap_isola_get(bat_priv, vid)) {
+ tt_local_entry = batadv_tt_local_hash_find(bat_priv, src, vid);
+ if (!tt_local_entry ||
+ (tt_local_entry->common.flags & BATADV_TT_CLIENT_PENDING))
+ goto out;
+ }
+
+ tt_global_entry = batadv_tt_global_hash_find(bat_priv, addr, vid);
+ if (!tt_global_entry)
+ goto out;
+
+ /* check whether the clients should not communicate due to AP
+ * isolation
+ */
+ if (tt_local_entry &&
+ _batadv_is_ap_isolated(tt_local_entry, tt_global_entry))
+ goto out;
+
+ rcu_read_lock();
+ best_entry = batadv_transtable_best_orig(bat_priv, tt_global_entry);
+ /* found anything? */
+ if (best_entry)
+ orig_node = best_entry->orig_node;
+ if (orig_node && !kref_get_unless_zero(&orig_node->refcount))
+ orig_node = NULL;
+ rcu_read_unlock();
+
+out:
+ batadv_tt_global_entry_put(tt_global_entry);
+ batadv_tt_local_entry_put(tt_local_entry);
+
+ return orig_node;
+}
+
+/**
+ * batadv_tt_global_crc() - calculates the checksum of the local table belonging
+ * to the given orig_node
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @orig_node: originator for which the CRC should be computed
+ * @vid: VLAN identifier for which the CRC32 has to be computed
+ *
+ * This function computes the checksum for the global table corresponding to a
+ * specific originator. In particular, the checksum is computed as follows: For
+ * each client connected to the originator the CRC32C of the MAC address and the
+ * VID is computed and then all the CRC32Cs of the various clients are xor'ed
+ * together.
+ *
+ * The idea behind is that CRC32C should be used as much as possible in order to
+ * produce a unique hash of the table, but since the order which is used to feed
+ * the CRC32C function affects the result and since every node in the network
+ * probably sorts the clients differently, the hash function cannot be directly
+ * computed over the entire table. Hence the CRC32C is used only on
+ * the single client entry, while all the results are then xor'ed together
+ * because the XOR operation can combine them all while trying to reduce the
+ * noise as much as possible.
+ *
+ * Return: the checksum of the global table of a given originator.
+ */
+static u32 batadv_tt_global_crc(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node,
+ unsigned short vid)
+{
+ struct batadv_hashtable *hash = bat_priv->tt.global_hash;
+ struct batadv_tt_orig_list_entry *tt_orig;
+ struct batadv_tt_common_entry *tt_common;
+ struct batadv_tt_global_entry *tt_global;
+ struct hlist_head *head;
+ u32 i, crc_tmp, crc = 0;
+ u8 flags;
+ __be16 tmp_vid;
+
+ for (i = 0; i < hash->size; i++) {
+ head = &hash->table[i];
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(tt_common, head, hash_entry) {
+ tt_global = container_of(tt_common,
+ struct batadv_tt_global_entry,
+ common);
+ /* compute the CRC only for entries belonging to the
+ * VLAN identified by the vid passed as parameter
+ */
+ if (tt_common->vid != vid)
+ continue;
+
+ /* Roaming clients are in the global table for
+ * consistency only. They don't have to be
+ * taken into account while computing the
+ * global crc
+ */
+ if (tt_common->flags & BATADV_TT_CLIENT_ROAM)
+ continue;
+ /* Temporary clients have not been announced yet, so
+ * they have to be skipped while computing the global
+ * crc
+ */
+ if (tt_common->flags & BATADV_TT_CLIENT_TEMP)
+ continue;
+
+ /* find out if this global entry is announced by this
+ * originator
+ */
+ tt_orig = batadv_tt_global_orig_entry_find(tt_global,
+ orig_node);
+ if (!tt_orig)
+ continue;
+
+ /* use network order to read the VID: this ensures that
+ * every node reads the bytes in the same order.
+ */
+ tmp_vid = htons(tt_common->vid);
+ crc_tmp = crc32c(0, &tmp_vid, sizeof(tmp_vid));
+
+ /* compute the CRC on flags that have to be kept in sync
+ * among nodes
+ */
+ flags = tt_orig->flags;
+ crc_tmp = crc32c(crc_tmp, &flags, sizeof(flags));
+
+ crc ^= crc32c(crc_tmp, tt_common->addr, ETH_ALEN);
+
+ batadv_tt_orig_list_entry_put(tt_orig);
+ }
+ rcu_read_unlock();
+ }
+
+ return crc;
+}
+
+/**
+ * batadv_tt_local_crc() - calculates the checksum of the local table
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @vid: VLAN identifier for which the CRC32 has to be computed
+ *
+ * For details about the computation, please refer to the documentation for
+ * batadv_tt_global_crc().
+ *
+ * Return: the checksum of the local table
+ */
+static u32 batadv_tt_local_crc(struct batadv_priv *bat_priv,
+ unsigned short vid)
+{
+ struct batadv_hashtable *hash = bat_priv->tt.local_hash;
+ struct batadv_tt_common_entry *tt_common;
+ struct hlist_head *head;
+ u32 i, crc_tmp, crc = 0;
+ u8 flags;
+ __be16 tmp_vid;
+
+ for (i = 0; i < hash->size; i++) {
+ head = &hash->table[i];
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(tt_common, head, hash_entry) {
+ /* compute the CRC only for entries belonging to the
+ * VLAN identified by vid
+ */
+ if (tt_common->vid != vid)
+ continue;
+
+ /* not yet committed clients have not to be taken into
+ * account while computing the CRC
+ */
+ if (tt_common->flags & BATADV_TT_CLIENT_NEW)
+ continue;
+
+ /* use network order to read the VID: this ensures that
+ * every node reads the bytes in the same order.
+ */
+ tmp_vid = htons(tt_common->vid);
+ crc_tmp = crc32c(0, &tmp_vid, sizeof(tmp_vid));
+
+ /* compute the CRC on flags that have to be kept in sync
+ * among nodes
+ */
+ flags = tt_common->flags & BATADV_TT_SYNC_MASK;
+ crc_tmp = crc32c(crc_tmp, &flags, sizeof(flags));
+
+ crc ^= crc32c(crc_tmp, tt_common->addr, ETH_ALEN);
+ }
+ rcu_read_unlock();
+ }
+
+ return crc;
+}
+
+/**
+ * batadv_tt_req_node_release() - free tt_req node entry
+ * @ref: kref pointer of the tt req_node entry
+ */
+static void batadv_tt_req_node_release(struct kref *ref)
+{
+ struct batadv_tt_req_node *tt_req_node;
+
+ tt_req_node = container_of(ref, struct batadv_tt_req_node, refcount);
+
+ kmem_cache_free(batadv_tt_req_cache, tt_req_node);
+}
+
+/**
+ * batadv_tt_req_node_put() - decrement the tt_req_node refcounter and
+ * possibly release it
+ * @tt_req_node: tt_req_node to be free'd
+ */
+static void batadv_tt_req_node_put(struct batadv_tt_req_node *tt_req_node)
+{
+ if (!tt_req_node)
+ return;
+
+ kref_put(&tt_req_node->refcount, batadv_tt_req_node_release);
+}
+
+static void batadv_tt_req_list_free(struct batadv_priv *bat_priv)
+{
+ struct batadv_tt_req_node *node;
+ struct hlist_node *safe;
+
+ spin_lock_bh(&bat_priv->tt.req_list_lock);
+
+ hlist_for_each_entry_safe(node, safe, &bat_priv->tt.req_list, list) {
+ hlist_del_init(&node->list);
+ batadv_tt_req_node_put(node);
+ }
+
+ spin_unlock_bh(&bat_priv->tt.req_list_lock);
+}
+
+static void batadv_tt_save_orig_buffer(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node,
+ const void *tt_buff,
+ u16 tt_buff_len)
+{
+ /* Replace the old buffer only if I received something in the
+ * last OGM (the OGM could carry no changes)
+ */
+ spin_lock_bh(&orig_node->tt_buff_lock);
+ if (tt_buff_len > 0) {
+ kfree(orig_node->tt_buff);
+ orig_node->tt_buff_len = 0;
+ orig_node->tt_buff = kmalloc(tt_buff_len, GFP_ATOMIC);
+ if (orig_node->tt_buff) {
+ memcpy(orig_node->tt_buff, tt_buff, tt_buff_len);
+ orig_node->tt_buff_len = tt_buff_len;
+ }
+ }
+ spin_unlock_bh(&orig_node->tt_buff_lock);
+}
+
+static void batadv_tt_req_purge(struct batadv_priv *bat_priv)
+{
+ struct batadv_tt_req_node *node;
+ struct hlist_node *safe;
+
+ spin_lock_bh(&bat_priv->tt.req_list_lock);
+ hlist_for_each_entry_safe(node, safe, &bat_priv->tt.req_list, list) {
+ if (batadv_has_timed_out(node->issued_at,
+ BATADV_TT_REQUEST_TIMEOUT)) {
+ hlist_del_init(&node->list);
+ batadv_tt_req_node_put(node);
+ }
+ }
+ spin_unlock_bh(&bat_priv->tt.req_list_lock);
+}
+
+/**
+ * batadv_tt_req_node_new() - search and possibly create a tt_req_node object
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @orig_node: orig node this request is being issued for
+ *
+ * Return: the pointer to the new tt_req_node struct if no request
+ * has already been issued for this orig_node, NULL otherwise.
+ */
+static struct batadv_tt_req_node *
+batadv_tt_req_node_new(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node)
+{
+ struct batadv_tt_req_node *tt_req_node_tmp, *tt_req_node = NULL;
+
+ spin_lock_bh(&bat_priv->tt.req_list_lock);
+ hlist_for_each_entry(tt_req_node_tmp, &bat_priv->tt.req_list, list) {
+ if (batadv_compare_eth(tt_req_node_tmp, orig_node) &&
+ !batadv_has_timed_out(tt_req_node_tmp->issued_at,
+ BATADV_TT_REQUEST_TIMEOUT))
+ goto unlock;
+ }
+
+ tt_req_node = kmem_cache_alloc(batadv_tt_req_cache, GFP_ATOMIC);
+ if (!tt_req_node)
+ goto unlock;
+
+ kref_init(&tt_req_node->refcount);
+ ether_addr_copy(tt_req_node->addr, orig_node->orig);
+ tt_req_node->issued_at = jiffies;
+
+ kref_get(&tt_req_node->refcount);
+ hlist_add_head(&tt_req_node->list, &bat_priv->tt.req_list);
+unlock:
+ spin_unlock_bh(&bat_priv->tt.req_list_lock);
+ return tt_req_node;
+}
+
+/**
+ * batadv_tt_local_valid() - verify local tt entry and get flags
+ * @entry_ptr: to be checked local tt entry
+ * @data_ptr: not used but definition required to satisfy the callback prototype
+ * @flags: a pointer to store TT flags for this client to
+ *
+ * Checks the validity of the given local TT entry. If it is, then the provided
+ * flags pointer is updated.
+ *
+ * Return: true if the entry is a valid, false otherwise.
+ */
+static bool batadv_tt_local_valid(const void *entry_ptr,
+ const void *data_ptr,
+ u8 *flags)
+{
+ const struct batadv_tt_common_entry *tt_common_entry = entry_ptr;
+
+ if (tt_common_entry->flags & BATADV_TT_CLIENT_NEW)
+ return false;
+
+ if (flags)
+ *flags = tt_common_entry->flags;
+
+ return true;
+}
+
+/**
+ * batadv_tt_global_valid() - verify global tt entry and get flags
+ * @entry_ptr: to be checked global tt entry
+ * @data_ptr: an orig_node object (may be NULL)
+ * @flags: a pointer to store TT flags for this client to
+ *
+ * Checks the validity of the given global TT entry. If it is, then the provided
+ * flags pointer is updated either with the common (summed) TT flags if data_ptr
+ * is NULL or the specific, per originator TT flags otherwise.
+ *
+ * Return: true if the entry is a valid, false otherwise.
+ */
+static bool batadv_tt_global_valid(const void *entry_ptr,
+ const void *data_ptr,
+ u8 *flags)
+{
+ const struct batadv_tt_common_entry *tt_common_entry = entry_ptr;
+ const struct batadv_tt_global_entry *tt_global_entry;
+ const struct batadv_orig_node *orig_node = data_ptr;
+
+ if (tt_common_entry->flags & BATADV_TT_CLIENT_ROAM ||
+ tt_common_entry->flags & BATADV_TT_CLIENT_TEMP)
+ return false;
+
+ tt_global_entry = container_of(tt_common_entry,
+ struct batadv_tt_global_entry,
+ common);
+
+ return batadv_tt_global_entry_has_orig(tt_global_entry, orig_node,
+ flags);
+}
+
+/**
+ * batadv_tt_tvlv_generate() - fill the tvlv buff with the tt entries from the
+ * specified tt hash
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @hash: hash table containing the tt entries
+ * @tt_len: expected tvlv tt data buffer length in number of bytes
+ * @tvlv_buff: pointer to the buffer to fill with the TT data
+ * @valid_cb: function to filter tt change entries and to return TT flags
+ * @cb_data: data passed to the filter function as argument
+ *
+ * Fills the tvlv buff with the tt entries from the specified hash. If valid_cb
+ * is not provided then this becomes a no-op.
+ *
+ * Return: Remaining unused length in tvlv_buff.
+ */
+static u16 batadv_tt_tvlv_generate(struct batadv_priv *bat_priv,
+ struct batadv_hashtable *hash,
+ void *tvlv_buff, u16 tt_len,
+ bool (*valid_cb)(const void *,
+ const void *,
+ u8 *flags),
+ void *cb_data)
+{
+ struct batadv_tt_common_entry *tt_common_entry;
+ struct batadv_tvlv_tt_change *tt_change;
+ struct hlist_head *head;
+ u16 tt_tot, tt_num_entries = 0;
+ u8 flags;
+ bool ret;
+ u32 i;
+
+ tt_tot = batadv_tt_entries(tt_len);
+ tt_change = tvlv_buff;
+
+ if (!valid_cb)
+ return tt_len;
+
+ rcu_read_lock();
+ for (i = 0; i < hash->size; i++) {
+ head = &hash->table[i];
+
+ hlist_for_each_entry_rcu(tt_common_entry,
+ head, hash_entry) {
+ if (tt_tot == tt_num_entries)
+ break;
+
+ ret = valid_cb(tt_common_entry, cb_data, &flags);
+ if (!ret)
+ continue;
+
+ ether_addr_copy(tt_change->addr, tt_common_entry->addr);
+ tt_change->flags = flags;
+ tt_change->vid = htons(tt_common_entry->vid);
+ memset(tt_change->reserved, 0,
+ sizeof(tt_change->reserved));
+
+ tt_num_entries++;
+ tt_change++;
+ }
+ }
+ rcu_read_unlock();
+
+ return batadv_tt_len(tt_tot - tt_num_entries);
+}
+
+/**
+ * batadv_tt_global_check_crc() - check if all the CRCs are correct
+ * @orig_node: originator for which the CRCs have to be checked
+ * @tt_vlan: pointer to the first tvlv VLAN entry
+ * @num_vlan: number of tvlv VLAN entries
+ *
+ * Return: true if all the received CRCs match the locally stored ones, false
+ * otherwise
+ */
+static bool batadv_tt_global_check_crc(struct batadv_orig_node *orig_node,
+ struct batadv_tvlv_tt_vlan_data *tt_vlan,
+ u16 num_vlan)
+{
+ struct batadv_tvlv_tt_vlan_data *tt_vlan_tmp;
+ struct batadv_orig_node_vlan *vlan;
+ int i, orig_num_vlan;
+ u32 crc;
+
+ /* check if each received CRC matches the locally stored one */
+ for (i = 0; i < num_vlan; i++) {
+ tt_vlan_tmp = tt_vlan + i;
+
+ /* if orig_node is a backbone node for this VLAN, don't check
+ * the CRC as we ignore all the global entries over it
+ */
+ if (batadv_bla_is_backbone_gw_orig(orig_node->bat_priv,
+ orig_node->orig,
+ ntohs(tt_vlan_tmp->vid)))
+ continue;
+
+ vlan = batadv_orig_node_vlan_get(orig_node,
+ ntohs(tt_vlan_tmp->vid));
+ if (!vlan)
+ return false;
+
+ crc = vlan->tt.crc;
+ batadv_orig_node_vlan_put(vlan);
+
+ if (crc != ntohl(tt_vlan_tmp->crc))
+ return false;
+ }
+
+ /* check if any excess VLANs exist locally for the originator
+ * which are not mentioned in the TVLV from the originator.
+ */
+ rcu_read_lock();
+ orig_num_vlan = 0;
+ hlist_for_each_entry_rcu(vlan, &orig_node->vlan_list, list)
+ orig_num_vlan++;
+ rcu_read_unlock();
+
+ if (orig_num_vlan > num_vlan)
+ return false;
+
+ return true;
+}
+
+/**
+ * batadv_tt_local_update_crc() - update all the local CRCs
+ * @bat_priv: the bat priv with all the mesh interface information
+ */
+static void batadv_tt_local_update_crc(struct batadv_priv *bat_priv)
+{
+ struct batadv_meshif_vlan *vlan;
+
+ /* recompute the global CRC for each VLAN */
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(vlan, &bat_priv->meshif_vlan_list, list) {
+ vlan->tt.crc = batadv_tt_local_crc(bat_priv, vlan->vid);
+ }
+ rcu_read_unlock();
+}
+
+/**
+ * batadv_tt_global_update_crc() - update all the global CRCs for this orig_node
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @orig_node: the orig_node for which the CRCs have to be updated
+ */
+static void batadv_tt_global_update_crc(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node)
+{
+ struct batadv_orig_node_vlan *vlan;
+ u32 crc;
+
+ /* recompute the global CRC for each VLAN */
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(vlan, &orig_node->vlan_list, list) {
+ /* if orig_node is a backbone node for this VLAN, don't compute
+ * the CRC as we ignore all the global entries over it
+ */
+ if (batadv_bla_is_backbone_gw_orig(bat_priv, orig_node->orig,
+ vlan->vid))
+ continue;
+
+ crc = batadv_tt_global_crc(bat_priv, orig_node, vlan->vid);
+ vlan->tt.crc = crc;
+ }
+ rcu_read_unlock();
+}
+
+/**
+ * batadv_send_tt_request() - send a TT Request message to a given node
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @dst_orig_node: the destination of the message
+ * @ttvn: the version number that the source of the message is looking for
+ * @tt_vlan: pointer to the first tvlv VLAN object to request
+ * @num_vlan: number of tvlv VLAN entries
+ * @full_table: ask for the entire translation table if true, while only for the
+ * last TT diff otherwise
+ *
+ * Return: true if the TT Request was sent, false otherwise
+ */
+static bool batadv_send_tt_request(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *dst_orig_node,
+ u8 ttvn,
+ struct batadv_tvlv_tt_vlan_data *tt_vlan,
+ u16 num_vlan, bool full_table)
+{
+ struct batadv_tvlv_tt_data *tvlv_tt_data = NULL;
+ struct batadv_tt_req_node *tt_req_node = NULL;
+ struct batadv_hard_iface *primary_if;
+ bool ret = false;
+ int i, size;
+
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (!primary_if)
+ goto out;
+
+ /* The new tt_req will be issued only if I'm not waiting for a
+ * reply from the same orig_node yet
+ */
+ tt_req_node = batadv_tt_req_node_new(bat_priv, dst_orig_node);
+ if (!tt_req_node)
+ goto out;
+
+ size = struct_size(tvlv_tt_data, vlan_data, num_vlan);
+ tvlv_tt_data = kzalloc(size, GFP_ATOMIC);
+ if (!tvlv_tt_data)
+ goto out;
+
+ tvlv_tt_data->flags = BATADV_TT_REQUEST;
+ tvlv_tt_data->ttvn = ttvn;
+ tvlv_tt_data->num_vlan = htons(num_vlan);
+
+ /* send all the CRCs within the request. This is needed by intermediate
+ * nodes to ensure they have the correct table before replying
+ */
+ for (i = 0; i < num_vlan; i++) {
+ tvlv_tt_data->vlan_data[i].vid = tt_vlan->vid;
+ tvlv_tt_data->vlan_data[i].crc = tt_vlan->crc;
+
+ tt_vlan++;
+ }
+
+ if (full_table)
+ tvlv_tt_data->flags |= BATADV_TT_FULL_TABLE;
+
+ batadv_dbg(BATADV_DBG_TT, bat_priv, "Sending TT_REQUEST to %pM [%c]\n",
+ dst_orig_node->orig, full_table ? 'F' : '.');
+
+ batadv_inc_counter(bat_priv, BATADV_CNT_TT_REQUEST_TX);
+ batadv_tvlv_unicast_send(bat_priv, primary_if->net_dev->dev_addr,
+ dst_orig_node->orig, BATADV_TVLV_TT, 1,
+ tvlv_tt_data, size);
+ ret = true;
+
+out:
+ batadv_hardif_put(primary_if);
+
+ if (ret && tt_req_node) {
+ spin_lock_bh(&bat_priv->tt.req_list_lock);
+ if (!hlist_unhashed(&tt_req_node->list)) {
+ hlist_del_init(&tt_req_node->list);
+ batadv_tt_req_node_put(tt_req_node);
+ }
+ spin_unlock_bh(&bat_priv->tt.req_list_lock);
+ }
+
+ batadv_tt_req_node_put(tt_req_node);
+
+ kfree(tvlv_tt_data);
+ return ret;
+}
+
+/**
+ * batadv_send_other_tt_response() - send reply to tt request concerning another
+ * node's translation table
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @tt_data: tt data containing the tt request information
+ * @req_src: mac address of tt request sender
+ * @req_dst: mac address of tt request recipient
+ *
+ * Return: true if tt request reply was sent, false otherwise.
+ */
+static bool batadv_send_other_tt_response(struct batadv_priv *bat_priv,
+ struct batadv_tvlv_tt_data *tt_data,
+ u8 *req_src, u8 *req_dst)
+{
+ struct batadv_orig_node *req_dst_orig_node;
+ struct batadv_orig_node *res_dst_orig_node = NULL;
+ struct batadv_tvlv_tt_change *tt_change;
+ struct batadv_tvlv_tt_data *tvlv_tt_data = NULL;
+ bool ret = false, full_table;
+ u8 orig_ttvn, req_ttvn;
+ u16 tvlv_len;
+ s32 tt_len;
+
+ batadv_dbg(BATADV_DBG_TT, bat_priv,
+ "Received TT_REQUEST from %pM for ttvn: %u (%pM) [%c]\n",
+ req_src, tt_data->ttvn, req_dst,
+ ((tt_data->flags & BATADV_TT_FULL_TABLE) ? 'F' : '.'));
+
+ /* Let's get the orig node of the REAL destination */
+ req_dst_orig_node = batadv_orig_hash_find(bat_priv, req_dst);
+ if (!req_dst_orig_node)
+ goto out;
+
+ res_dst_orig_node = batadv_orig_hash_find(bat_priv, req_src);
+ if (!res_dst_orig_node)
+ goto out;
+
+ orig_ttvn = (u8)atomic_read(&req_dst_orig_node->last_ttvn);
+ req_ttvn = tt_data->ttvn;
+
+ /* this node doesn't have the requested data */
+ if (orig_ttvn != req_ttvn ||
+ !batadv_tt_global_check_crc(req_dst_orig_node, tt_data->vlan_data,
+ ntohs(tt_data->num_vlan)))
+ goto out;
+
+ /* If the full table has been explicitly requested */
+ if (tt_data->flags & BATADV_TT_FULL_TABLE ||
+ !req_dst_orig_node->tt_buff)
+ full_table = true;
+ else
+ full_table = false;
+
+ /* TT fragmentation hasn't been implemented yet, so send as many
+ * TT entries fit a single packet as possible only
+ */
+ if (!full_table) {
+ spin_lock_bh(&req_dst_orig_node->tt_buff_lock);
+ tt_len = req_dst_orig_node->tt_buff_len;
+
+ tvlv_len = batadv_tt_prepare_tvlv_global_data(req_dst_orig_node,
+ &tvlv_tt_data,
+ &tt_change,
+ &tt_len);
+ if (!tt_len)
+ goto unlock;
+
+ /* Copy the last orig_node's OGM buffer */
+ memcpy(tt_change, req_dst_orig_node->tt_buff,
+ req_dst_orig_node->tt_buff_len);
+ spin_unlock_bh(&req_dst_orig_node->tt_buff_lock);
+ } else {
+ /* allocate the tvlv, put the tt_data and all the tt_vlan_data
+ * in the initial part
+ */
+ tt_len = -1;
+ tvlv_len = batadv_tt_prepare_tvlv_global_data(req_dst_orig_node,
+ &tvlv_tt_data,
+ &tt_change,
+ &tt_len);
+ if (!tt_len)
+ goto out;
+
+ /* fill the rest of the tvlv with the real TT entries */
+ tvlv_len -= batadv_tt_tvlv_generate(bat_priv,
+ bat_priv->tt.global_hash,
+ tt_change, tt_len,
+ batadv_tt_global_valid,
+ req_dst_orig_node);
+ }
+
+ /* Don't send the response, if larger than fragmented packet. */
+ tt_len = sizeof(struct batadv_unicast_tvlv_packet) + tvlv_len;
+ if (tt_len > atomic_read(&bat_priv->packet_size_max)) {
+ net_ratelimited_function(batadv_info, bat_priv->mesh_iface,
+ "Ignoring TT_REQUEST from %pM; Response size exceeds max packet size.\n",
+ res_dst_orig_node->orig);
+ goto out;
+ }
+
+ tvlv_tt_data->flags = BATADV_TT_RESPONSE;
+ tvlv_tt_data->ttvn = req_ttvn;
+
+ if (full_table)
+ tvlv_tt_data->flags |= BATADV_TT_FULL_TABLE;
+
+ batadv_dbg(BATADV_DBG_TT, bat_priv,
+ "Sending TT_RESPONSE %pM for %pM [%c] (ttvn: %u)\n",
+ res_dst_orig_node->orig, req_dst_orig_node->orig,
+ full_table ? 'F' : '.', req_ttvn);
+
+ batadv_inc_counter(bat_priv, BATADV_CNT_TT_RESPONSE_TX);
+
+ batadv_tvlv_unicast_send(bat_priv, req_dst_orig_node->orig,
+ req_src, BATADV_TVLV_TT, 1, tvlv_tt_data,
+ tvlv_len);
+
+ ret = true;
+ goto out;
+
+unlock:
+ spin_unlock_bh(&req_dst_orig_node->tt_buff_lock);
+
+out:
+ batadv_orig_node_put(res_dst_orig_node);
+ batadv_orig_node_put(req_dst_orig_node);
+ kfree(tvlv_tt_data);
+ return ret;
+}
+
+/**
+ * batadv_send_my_tt_response() - send reply to tt request concerning this
+ * node's translation table
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @tt_data: tt data containing the tt request information
+ * @req_src: mac address of tt request sender
+ *
+ * Return: true if tt request reply was sent, false otherwise.
+ */
+static bool batadv_send_my_tt_response(struct batadv_priv *bat_priv,
+ struct batadv_tvlv_tt_data *tt_data,
+ u8 *req_src)
+{
+ struct batadv_tvlv_tt_data *tvlv_tt_data = NULL;
+ struct batadv_hard_iface *primary_if = NULL;
+ struct batadv_tvlv_tt_change *tt_change;
+ struct batadv_orig_node *orig_node;
+ u8 my_ttvn, req_ttvn;
+ u16 tvlv_len;
+ bool full_table;
+ s32 tt_len;
+
+ batadv_dbg(BATADV_DBG_TT, bat_priv,
+ "Received TT_REQUEST from %pM for ttvn: %u (me) [%c]\n",
+ req_src, tt_data->ttvn,
+ ((tt_data->flags & BATADV_TT_FULL_TABLE) ? 'F' : '.'));
+
+ spin_lock_bh(&bat_priv->tt.commit_lock);
+
+ my_ttvn = (u8)atomic_read(&bat_priv->tt.vn);
+ req_ttvn = tt_data->ttvn;
+
+ orig_node = batadv_orig_hash_find(bat_priv, req_src);
+ if (!orig_node)
+ goto out;
+
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (!primary_if)
+ goto out;
+
+ /* If the full table has been explicitly requested or the gap
+ * is too big send the whole local translation table
+ */
+ if (tt_data->flags & BATADV_TT_FULL_TABLE || my_ttvn != req_ttvn ||
+ !bat_priv->tt.last_changeset)
+ full_table = true;
+ else
+ full_table = false;
+
+ /* TT fragmentation hasn't been implemented yet, so send as many
+ * TT entries fit a single packet as possible only
+ */
+ if (!full_table) {
+ spin_lock_bh(&bat_priv->tt.last_changeset_lock);
+
+ tt_len = bat_priv->tt.last_changeset_len;
+ tvlv_len = batadv_tt_prepare_tvlv_local_data(bat_priv,
+ &tvlv_tt_data,
+ &tt_change,
+ &tt_len);
+ if (!tt_len || !tvlv_len)
+ goto unlock;
+
+ /* Copy the last orig_node's OGM buffer */
+ memcpy(tt_change, bat_priv->tt.last_changeset,
+ bat_priv->tt.last_changeset_len);
+ spin_unlock_bh(&bat_priv->tt.last_changeset_lock);
+ } else {
+ req_ttvn = (u8)atomic_read(&bat_priv->tt.vn);
+
+ /* allocate the tvlv, put the tt_data and all the tt_vlan_data
+ * in the initial part
+ */
+ tt_len = -1;
+ tvlv_len = batadv_tt_prepare_tvlv_local_data(bat_priv,
+ &tvlv_tt_data,
+ &tt_change,
+ &tt_len);
+ if (!tt_len || !tvlv_len)
+ goto out;
+
+ /* fill the rest of the tvlv with the real TT entries */
+ tvlv_len -= batadv_tt_tvlv_generate(bat_priv,
+ bat_priv->tt.local_hash,
+ tt_change, tt_len,
+ batadv_tt_local_valid,
+ NULL);
+ }
+
+ tvlv_tt_data->flags = BATADV_TT_RESPONSE;
+ tvlv_tt_data->ttvn = req_ttvn;
+
+ if (full_table)
+ tvlv_tt_data->flags |= BATADV_TT_FULL_TABLE;
+
+ batadv_dbg(BATADV_DBG_TT, bat_priv,
+ "Sending TT_RESPONSE to %pM [%c] (ttvn: %u)\n",
+ orig_node->orig, full_table ? 'F' : '.', req_ttvn);
+
+ batadv_inc_counter(bat_priv, BATADV_CNT_TT_RESPONSE_TX);
+
+ batadv_tvlv_unicast_send(bat_priv, primary_if->net_dev->dev_addr,
+ req_src, BATADV_TVLV_TT, 1, tvlv_tt_data,
+ tvlv_len);
+
+ goto out;
+
+unlock:
+ spin_unlock_bh(&bat_priv->tt.last_changeset_lock);
+out:
+ spin_unlock_bh(&bat_priv->tt.commit_lock);
+ batadv_orig_node_put(orig_node);
+ batadv_hardif_put(primary_if);
+ kfree(tvlv_tt_data);
+ /* The packet was for this host, so it doesn't need to be re-routed */
+ return true;
+}
+
+/**
+ * batadv_send_tt_response() - send reply to tt request
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @tt_data: tt data containing the tt request information
+ * @req_src: mac address of tt request sender
+ * @req_dst: mac address of tt request recipient
+ *
+ * Return: true if tt request reply was sent, false otherwise.
+ */
+static bool batadv_send_tt_response(struct batadv_priv *bat_priv,
+ struct batadv_tvlv_tt_data *tt_data,
+ u8 *req_src, u8 *req_dst)
+{
+ if (batadv_is_my_mac(bat_priv, req_dst))
+ return batadv_send_my_tt_response(bat_priv, tt_data, req_src);
+ return batadv_send_other_tt_response(bat_priv, tt_data, req_src,
+ req_dst);
+}
+
+static void _batadv_tt_update_changes(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node,
+ struct batadv_tvlv_tt_change *tt_change,
+ u16 tt_num_changes, u8 ttvn)
+{
+ int i;
+ int roams;
+
+ for (i = 0; i < tt_num_changes; i++) {
+ if ((tt_change + i)->flags & BATADV_TT_CLIENT_DEL) {
+ roams = (tt_change + i)->flags & BATADV_TT_CLIENT_ROAM;
+ batadv_tt_global_del(bat_priv, orig_node,
+ (tt_change + i)->addr,
+ ntohs((tt_change + i)->vid),
+ "tt removed by changes",
+ roams);
+ } else {
+ if (!batadv_tt_global_add(bat_priv, orig_node,
+ (tt_change + i)->addr,
+ ntohs((tt_change + i)->vid),
+ (tt_change + i)->flags, ttvn))
+ /* In case of problem while storing a
+ * global_entry, we stop the updating
+ * procedure without committing the
+ * ttvn change. This will avoid to send
+ * corrupted data on tt_request
+ */
+ return;
+ }
+ }
+ set_bit(BATADV_ORIG_CAPA_HAS_TT, &orig_node->capa_initialized);
+}
+
+static void batadv_tt_fill_gtable(struct batadv_priv *bat_priv,
+ struct batadv_tvlv_tt_change *tt_change,
+ u8 ttvn, u8 *resp_src,
+ u16 num_entries)
+{
+ struct batadv_orig_node *orig_node;
+
+ orig_node = batadv_orig_hash_find(bat_priv, resp_src);
+ if (!orig_node)
+ goto out;
+
+ /* Purge the old table first.. */
+ batadv_tt_global_del_orig(bat_priv, orig_node, -1,
+ "Received full table");
+
+ _batadv_tt_update_changes(bat_priv, orig_node, tt_change, num_entries,
+ ttvn);
+
+ spin_lock_bh(&orig_node->tt_buff_lock);
+ kfree(orig_node->tt_buff);
+ orig_node->tt_buff_len = 0;
+ orig_node->tt_buff = NULL;
+ spin_unlock_bh(&orig_node->tt_buff_lock);
+
+ atomic_set(&orig_node->last_ttvn, ttvn);
+
+out:
+ batadv_orig_node_put(orig_node);
+}
+
+static void batadv_tt_update_changes(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node,
+ u16 tt_num_changes, u8 ttvn,
+ struct batadv_tvlv_tt_change *tt_change)
+{
+ _batadv_tt_update_changes(bat_priv, orig_node, tt_change,
+ tt_num_changes, ttvn);
+
+ batadv_tt_save_orig_buffer(bat_priv, orig_node, tt_change,
+ batadv_tt_len(tt_num_changes));
+ atomic_set(&orig_node->last_ttvn, ttvn);
+}
+
+/**
+ * batadv_is_my_client() - check if a client is served by the local node
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @addr: the mac address of the client to check
+ * @vid: VLAN identifier
+ *
+ * Return: true if the client is served by this node, false otherwise.
+ */
+bool batadv_is_my_client(struct batadv_priv *bat_priv, const u8 *addr,
+ unsigned short vid)
+{
+ struct batadv_tt_local_entry *tt_local_entry;
+ bool ret = false;
+
+ tt_local_entry = batadv_tt_local_hash_find(bat_priv, addr, vid);
+ if (!tt_local_entry)
+ goto out;
+ /* Check if the client has been logically deleted (but is kept for
+ * consistency purpose)
+ */
+ if ((tt_local_entry->common.flags & BATADV_TT_CLIENT_PENDING) ||
+ (tt_local_entry->common.flags & BATADV_TT_CLIENT_ROAM))
+ goto out;
+ ret = true;
+out:
+ batadv_tt_local_entry_put(tt_local_entry);
+ return ret;
+}
+
+/**
+ * batadv_handle_tt_response() - process incoming tt reply
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @tt_data: tt data containing the tt request information
+ * @resp_src: mac address of tt reply sender
+ * @num_entries: number of tt change entries appended to the tt data
+ */
+static void batadv_handle_tt_response(struct batadv_priv *bat_priv,
+ struct batadv_tvlv_tt_data *tt_data,
+ u8 *resp_src, u16 num_entries)
+{
+ struct batadv_tt_req_node *node;
+ struct hlist_node *safe;
+ struct batadv_orig_node *orig_node = NULL;
+ struct batadv_tvlv_tt_change *tt_change;
+ u8 *tvlv_ptr = (u8 *)tt_data;
+
+ batadv_dbg(BATADV_DBG_TT, bat_priv,
+ "Received TT_RESPONSE from %pM for ttvn %d t_size: %d [%c]\n",
+ resp_src, tt_data->ttvn, num_entries,
+ ((tt_data->flags & BATADV_TT_FULL_TABLE) ? 'F' : '.'));
+
+ orig_node = batadv_orig_hash_find(bat_priv, resp_src);
+ if (!orig_node)
+ goto out;
+
+ spin_lock_bh(&orig_node->tt_lock);
+
+ tvlv_ptr += struct_size(tt_data, vlan_data, ntohs(tt_data->num_vlan));
+
+ tt_change = (struct batadv_tvlv_tt_change *)tvlv_ptr;
+ if (tt_data->flags & BATADV_TT_FULL_TABLE) {
+ batadv_tt_fill_gtable(bat_priv, tt_change, tt_data->ttvn,
+ resp_src, num_entries);
+ } else {
+ batadv_tt_update_changes(bat_priv, orig_node, num_entries,
+ tt_data->ttvn, tt_change);
+ }
+
+ /* Recalculate the CRC for this orig_node and store it */
+ batadv_tt_global_update_crc(bat_priv, orig_node);
+
+ spin_unlock_bh(&orig_node->tt_lock);
+
+ /* Delete the tt_req_node from pending tt_requests list */
+ spin_lock_bh(&bat_priv->tt.req_list_lock);
+ hlist_for_each_entry_safe(node, safe, &bat_priv->tt.req_list, list) {
+ if (!batadv_compare_eth(node->addr, resp_src))
+ continue;
+ hlist_del_init(&node->list);
+ batadv_tt_req_node_put(node);
+ }
+
+ spin_unlock_bh(&bat_priv->tt.req_list_lock);
+out:
+ batadv_orig_node_put(orig_node);
+}
+
+static void batadv_tt_roam_list_free(struct batadv_priv *bat_priv)
+{
+ struct batadv_tt_roam_node *node, *safe;
+
+ spin_lock_bh(&bat_priv->tt.roam_list_lock);
+
+ list_for_each_entry_safe(node, safe, &bat_priv->tt.roam_list, list) {
+ list_del(&node->list);
+ kmem_cache_free(batadv_tt_roam_cache, node);
+ }
+
+ spin_unlock_bh(&bat_priv->tt.roam_list_lock);
+}
+
+static void batadv_tt_roam_purge(struct batadv_priv *bat_priv)
+{
+ struct batadv_tt_roam_node *node, *safe;
+
+ spin_lock_bh(&bat_priv->tt.roam_list_lock);
+ list_for_each_entry_safe(node, safe, &bat_priv->tt.roam_list, list) {
+ if (!batadv_has_timed_out(node->first_time,
+ BATADV_ROAMING_MAX_TIME))
+ continue;
+
+ list_del(&node->list);
+ kmem_cache_free(batadv_tt_roam_cache, node);
+ }
+ spin_unlock_bh(&bat_priv->tt.roam_list_lock);
+}
+
+/**
+ * batadv_tt_check_roam_count() - check if a client has roamed too frequently
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @client: mac address of the roaming client
+ *
+ * This function checks whether the client already reached the
+ * maximum number of possible roaming phases. In this case the ROAMING_ADV
+ * will not be sent.
+ *
+ * Return: true if the ROAMING_ADV can be sent, false otherwise
+ */
+static bool batadv_tt_check_roam_count(struct batadv_priv *bat_priv, u8 *client)
+{
+ struct batadv_tt_roam_node *tt_roam_node;
+ bool ret = false;
+
+ spin_lock_bh(&bat_priv->tt.roam_list_lock);
+ /* The new tt_req will be issued only if I'm not waiting for a
+ * reply from the same orig_node yet
+ */
+ list_for_each_entry(tt_roam_node, &bat_priv->tt.roam_list, list) {
+ if (!batadv_compare_eth(tt_roam_node->addr, client))
+ continue;
+
+ if (batadv_has_timed_out(tt_roam_node->first_time,
+ BATADV_ROAMING_MAX_TIME))
+ continue;
+
+ if (!batadv_atomic_dec_not_zero(&tt_roam_node->counter))
+ /* Sorry, you roamed too many times! */
+ goto unlock;
+ ret = true;
+ break;
+ }
+
+ if (!ret) {
+ tt_roam_node = kmem_cache_alloc(batadv_tt_roam_cache,
+ GFP_ATOMIC);
+ if (!tt_roam_node)
+ goto unlock;
+
+ tt_roam_node->first_time = jiffies;
+ atomic_set(&tt_roam_node->counter,
+ BATADV_ROAMING_MAX_COUNT - 1);
+ ether_addr_copy(tt_roam_node->addr, client);
+
+ list_add(&tt_roam_node->list, &bat_priv->tt.roam_list);
+ ret = true;
+ }
+
+unlock:
+ spin_unlock_bh(&bat_priv->tt.roam_list_lock);
+ return ret;
+}
+
+/**
+ * batadv_send_roam_adv() - send a roaming advertisement message
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @client: mac address of the roaming client
+ * @vid: VLAN identifier
+ * @orig_node: message destination
+ *
+ * Send a ROAMING_ADV message to the node which was previously serving this
+ * client. This is done to inform the node that from now on all traffic destined
+ * for this particular roamed client has to be forwarded to the sender of the
+ * roaming message.
+ */
+static void batadv_send_roam_adv(struct batadv_priv *bat_priv, u8 *client,
+ unsigned short vid,
+ struct batadv_orig_node *orig_node)
+{
+ struct batadv_hard_iface *primary_if;
+ struct batadv_tvlv_roam_adv tvlv_roam;
+
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (!primary_if)
+ goto out;
+
+ /* before going on we have to check whether the client has
+ * already roamed to us too many times
+ */
+ if (!batadv_tt_check_roam_count(bat_priv, client))
+ goto out;
+
+ batadv_dbg(BATADV_DBG_TT, bat_priv,
+ "Sending ROAMING_ADV to %pM (client %pM, vid: %d)\n",
+ orig_node->orig, client, batadv_print_vid(vid));
+
+ batadv_inc_counter(bat_priv, BATADV_CNT_TT_ROAM_ADV_TX);
+
+ memcpy(tvlv_roam.client, client, sizeof(tvlv_roam.client));
+ tvlv_roam.vid = htons(vid);
+
+ batadv_tvlv_unicast_send(bat_priv, primary_if->net_dev->dev_addr,
+ orig_node->orig, BATADV_TVLV_ROAM, 1,
+ &tvlv_roam, sizeof(tvlv_roam));
+
+out:
+ batadv_hardif_put(primary_if);
+}
+
+static void batadv_tt_purge(struct work_struct *work)
+{
+ struct delayed_work *delayed_work;
+ struct batadv_priv_tt *priv_tt;
+ struct batadv_priv *bat_priv;
+
+ delayed_work = to_delayed_work(work);
+ priv_tt = container_of(delayed_work, struct batadv_priv_tt, work);
+ bat_priv = container_of(priv_tt, struct batadv_priv, tt);
+
+ batadv_tt_local_purge(bat_priv, BATADV_TT_LOCAL_TIMEOUT);
+ batadv_tt_global_purge(bat_priv);
+ batadv_tt_req_purge(bat_priv);
+ batadv_tt_roam_purge(bat_priv);
+
+ queue_delayed_work(batadv_event_workqueue, &bat_priv->tt.work,
+ msecs_to_jiffies(BATADV_TT_WORK_PERIOD));
+}
+
+/**
+ * batadv_tt_free() - Free translation table of mesh interface
+ * @bat_priv: the bat priv with all the mesh interface information
+ */
+void batadv_tt_free(struct batadv_priv *bat_priv)
+{
+ batadv_tvlv_handler_unregister(bat_priv, BATADV_TVLV_ROAM, 1);
+
+ batadv_tvlv_container_unregister(bat_priv, BATADV_TVLV_TT, 1);
+ batadv_tvlv_handler_unregister(bat_priv, BATADV_TVLV_TT, 1);
+
+ cancel_delayed_work_sync(&bat_priv->tt.work);
+
+ batadv_tt_local_table_free(bat_priv);
+ batadv_tt_global_table_free(bat_priv);
+ batadv_tt_req_list_free(bat_priv);
+ batadv_tt_changes_list_free(bat_priv);
+ batadv_tt_roam_list_free(bat_priv);
+
+ kfree(bat_priv->tt.last_changeset);
+}
+
+/**
+ * batadv_tt_local_set_flags() - set or unset the specified flags on the local
+ * table and possibly count them in the TT size
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @flags: the flag to switch
+ * @enable: whether to set or unset the flag
+ * @count: whether to increase the TT size by the number of changed entries
+ */
+static void batadv_tt_local_set_flags(struct batadv_priv *bat_priv, u16 flags,
+ bool enable, bool count)
+{
+ struct batadv_hashtable *hash = bat_priv->tt.local_hash;
+ struct batadv_tt_common_entry *tt_common_entry;
+ struct hlist_head *head;
+ u32 i;
+
+ if (!hash)
+ return;
+
+ for (i = 0; i < hash->size; i++) {
+ head = &hash->table[i];
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(tt_common_entry,
+ head, hash_entry) {
+ if (enable) {
+ if ((tt_common_entry->flags & flags) == flags)
+ continue;
+ tt_common_entry->flags |= flags;
+ } else {
+ if (!(tt_common_entry->flags & flags))
+ continue;
+ tt_common_entry->flags &= ~flags;
+ }
+
+ if (!count)
+ continue;
+
+ batadv_tt_local_size_inc(bat_priv,
+ tt_common_entry->vid);
+ }
+ rcu_read_unlock();
+ }
+}
+
+/* Purge out all the tt local entries marked with BATADV_TT_CLIENT_PENDING */
+static void batadv_tt_local_purge_pending_clients(struct batadv_priv *bat_priv)
+{
+ struct batadv_hashtable *hash = bat_priv->tt.local_hash;
+ struct batadv_tt_common_entry *tt_common;
+ struct batadv_tt_local_entry *tt_local;
+ struct hlist_node *node_tmp;
+ struct hlist_head *head;
+ spinlock_t *list_lock; /* protects write access to the hash lists */
+ u32 i;
+
+ if (!hash)
+ return;
+
+ for (i = 0; i < hash->size; i++) {
+ head = &hash->table[i];
+ list_lock = &hash->list_locks[i];
+
+ spin_lock_bh(list_lock);
+ hlist_for_each_entry_safe(tt_common, node_tmp, head,
+ hash_entry) {
+ if (!(tt_common->flags & BATADV_TT_CLIENT_PENDING))
+ continue;
+
+ batadv_dbg(BATADV_DBG_TT, bat_priv,
+ "Deleting local tt entry (%pM, vid: %d): pending\n",
+ tt_common->addr,
+ batadv_print_vid(tt_common->vid));
+
+ batadv_tt_local_size_dec(bat_priv, tt_common->vid);
+ hlist_del_rcu(&tt_common->hash_entry);
+ tt_local = container_of(tt_common,
+ struct batadv_tt_local_entry,
+ common);
+
+ batadv_tt_local_entry_put(tt_local);
+ }
+ spin_unlock_bh(list_lock);
+ }
+}
+
+/**
+ * batadv_tt_local_commit_changes_nolock() - commit all pending local tt changes
+ * which have been queued in the time since the last commit
+ * @bat_priv: the bat priv with all the mesh interface information
+ *
+ * Caller must hold tt->commit_lock.
+ */
+static void batadv_tt_local_commit_changes_nolock(struct batadv_priv *bat_priv)
+{
+ lockdep_assert_held(&bat_priv->tt.commit_lock);
+
+ if (READ_ONCE(bat_priv->tt.local_changes) == 0) {
+ if (!batadv_atomic_dec_not_zero(&bat_priv->tt.ogm_append_cnt))
+ batadv_tt_tvlv_container_update(bat_priv);
+ return;
+ }
+
+ batadv_tt_local_set_flags(bat_priv, BATADV_TT_CLIENT_NEW, false, true);
+
+ batadv_tt_local_purge_pending_clients(bat_priv);
+ batadv_tt_local_update_crc(bat_priv);
+
+ /* Increment the TTVN only once per OGM interval */
+ atomic_inc(&bat_priv->tt.vn);
+ batadv_dbg(BATADV_DBG_TT, bat_priv,
+ "Local changes committed, updating to ttvn %u\n",
+ (u8)atomic_read(&bat_priv->tt.vn));
+
+ /* reset the sending counter */
+ atomic_set(&bat_priv->tt.ogm_append_cnt, BATADV_TT_OGM_APPEND_MAX);
+ batadv_tt_tvlv_container_update(bat_priv);
+}
+
+/**
+ * batadv_tt_local_commit_changes() - commit all pending local tt changes which
+ * have been queued in the time since the last commit
+ * @bat_priv: the bat priv with all the mesh interface information
+ */
+void batadv_tt_local_commit_changes(struct batadv_priv *bat_priv)
+{
+ spin_lock_bh(&bat_priv->tt.commit_lock);
+ batadv_tt_local_commit_changes_nolock(bat_priv);
+ spin_unlock_bh(&bat_priv->tt.commit_lock);
+}
+
+/**
+ * batadv_is_ap_isolated() - Check if packet from upper layer should be dropped
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @src: source mac address of packet
+ * @dst: destination mac address of packet
+ * @vid: vlan id of packet
+ *
+ * Return: true when src+dst(+vid) pair should be isolated, false otherwise
+ */
+bool batadv_is_ap_isolated(struct batadv_priv *bat_priv, u8 *src, u8 *dst,
+ unsigned short vid)
+{
+ struct batadv_tt_local_entry *tt_local_entry;
+ struct batadv_tt_global_entry *tt_global_entry;
+ struct batadv_meshif_vlan *vlan;
+ bool ret = false;
+
+ vlan = batadv_meshif_vlan_get(bat_priv, vid);
+ if (!vlan)
+ return false;
+
+ if (!atomic_read(&vlan->ap_isolation))
+ goto vlan_put;
+
+ tt_local_entry = batadv_tt_local_hash_find(bat_priv, dst, vid);
+ if (!tt_local_entry)
+ goto vlan_put;
+
+ tt_global_entry = batadv_tt_global_hash_find(bat_priv, src, vid);
+ if (!tt_global_entry)
+ goto local_entry_put;
+
+ if (_batadv_is_ap_isolated(tt_local_entry, tt_global_entry))
+ ret = true;
+
+ batadv_tt_global_entry_put(tt_global_entry);
+local_entry_put:
+ batadv_tt_local_entry_put(tt_local_entry);
+vlan_put:
+ batadv_meshif_vlan_put(vlan);
+ return ret;
+}
+
+/**
+ * batadv_tt_update_orig() - update global translation table with new tt
+ * information received via ogms
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @orig_node: the orig_node of the ogm
+ * @tt_buff: pointer to the first tvlv VLAN entry
+ * @tt_num_vlan: number of tvlv VLAN entries
+ * @tt_change: pointer to the first entry in the TT buffer
+ * @tt_num_changes: number of tt changes inside the tt buffer
+ * @ttvn: translation table version number of this changeset
+ */
+static void batadv_tt_update_orig(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node,
+ const void *tt_buff, u16 tt_num_vlan,
+ struct batadv_tvlv_tt_change *tt_change,
+ u16 tt_num_changes, u8 ttvn)
+{
+ u8 orig_ttvn = (u8)atomic_read(&orig_node->last_ttvn);
+ struct batadv_tvlv_tt_vlan_data *tt_vlan;
+ bool full_table = true;
+ bool has_tt_init;
+
+ tt_vlan = (struct batadv_tvlv_tt_vlan_data *)tt_buff;
+ has_tt_init = test_bit(BATADV_ORIG_CAPA_HAS_TT,
+ &orig_node->capa_initialized);
+
+ /* orig table not initialised AND first diff is in the OGM OR the ttvn
+ * increased by one -> we can apply the attached changes
+ */
+ if ((!has_tt_init && ttvn == 1) || ttvn - orig_ttvn == 1) {
+ /* the OGM could not contain the changes due to their size or
+ * because they have already been sent BATADV_TT_OGM_APPEND_MAX
+ * times.
+ * In this case send a tt request
+ */
+ if (!tt_num_changes) {
+ full_table = false;
+ goto request_table;
+ }
+
+ spin_lock_bh(&orig_node->tt_lock);
+
+ batadv_tt_update_changes(bat_priv, orig_node, tt_num_changes,
+ ttvn, tt_change);
+
+ /* Even if we received the precomputed crc with the OGM, we
+ * prefer to recompute it to spot any possible inconsistency
+ * in the global table
+ */
+ batadv_tt_global_update_crc(bat_priv, orig_node);
+
+ spin_unlock_bh(&orig_node->tt_lock);
+
+ /* The ttvn alone is not enough to guarantee consistency
+ * because a single value could represent different states
+ * (due to the wrap around). Thus a node has to check whether
+ * the resulting table (after applying the changes) is still
+ * consistent or not. E.g. a node could disconnect while its
+ * ttvn is X and reconnect on ttvn = X + TTVN_MAX: in this case
+ * checking the CRC value is mandatory to detect the
+ * inconsistency
+ */
+ if (!batadv_tt_global_check_crc(orig_node, tt_vlan,
+ tt_num_vlan))
+ goto request_table;
+ } else {
+ /* if we missed more than one change or our tables are not
+ * in sync anymore -> request fresh tt data
+ */
+ if (!has_tt_init || ttvn != orig_ttvn ||
+ !batadv_tt_global_check_crc(orig_node, tt_vlan,
+ tt_num_vlan)) {
+request_table:
+ batadv_dbg(BATADV_DBG_TT, bat_priv,
+ "TT inconsistency for %pM. Need to retrieve the correct information (ttvn: %u last_ttvn: %u num_changes: %u)\n",
+ orig_node->orig, ttvn, orig_ttvn,
+ tt_num_changes);
+ batadv_send_tt_request(bat_priv, orig_node, ttvn,
+ tt_vlan, tt_num_vlan,
+ full_table);
+ return;
+ }
+ }
+}
+
+/**
+ * batadv_tt_global_client_is_roaming() - check if a client is marked as roaming
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @addr: the mac address of the client to check
+ * @vid: VLAN identifier
+ *
+ * Return: true if we know that the client has moved from its old originator
+ * to another one. This entry is still kept for consistency purposes and will be
+ * deleted later by a DEL or because of timeout
+ */
+bool batadv_tt_global_client_is_roaming(struct batadv_priv *bat_priv,
+ u8 *addr, unsigned short vid)
+{
+ struct batadv_tt_global_entry *tt_global_entry;
+ bool ret = false;
+
+ tt_global_entry = batadv_tt_global_hash_find(bat_priv, addr, vid);
+ if (!tt_global_entry)
+ goto out;
+
+ ret = tt_global_entry->common.flags & BATADV_TT_CLIENT_ROAM;
+ batadv_tt_global_entry_put(tt_global_entry);
+out:
+ return ret;
+}
+
+/**
+ * batadv_tt_local_client_is_roaming() - tells whether the client is roaming
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @addr: the mac address of the local client to query
+ * @vid: VLAN identifier
+ *
+ * Return: true if the local client is known to be roaming (it is not served by
+ * this node anymore) or not. If yes, the client is still present in the table
+ * to keep the latter consistent with the node TTVN
+ */
+bool batadv_tt_local_client_is_roaming(struct batadv_priv *bat_priv,
+ u8 *addr, unsigned short vid)
+{
+ struct batadv_tt_local_entry *tt_local_entry;
+ bool ret = false;
+
+ tt_local_entry = batadv_tt_local_hash_find(bat_priv, addr, vid);
+ if (!tt_local_entry)
+ goto out;
+
+ ret = tt_local_entry->common.flags & BATADV_TT_CLIENT_ROAM;
+ batadv_tt_local_entry_put(tt_local_entry);
+out:
+ return ret;
+}
+
+/**
+ * batadv_tt_add_temporary_global_entry() - Add temporary entry to global TT
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @orig_node: orig node which the temporary entry should be associated with
+ * @addr: mac address of the client
+ * @vid: VLAN id of the new temporary global translation table
+ *
+ * Return: true when temporary tt entry could be added, false otherwise
+ */
+bool batadv_tt_add_temporary_global_entry(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node,
+ const unsigned char *addr,
+ unsigned short vid)
+{
+ /* ignore loop detect macs, they are not supposed to be in the tt local
+ * data as well.
+ */
+ if (batadv_bla_is_loopdetect_mac(addr))
+ return false;
+
+ if (!batadv_tt_global_add(bat_priv, orig_node, addr, vid,
+ BATADV_TT_CLIENT_TEMP,
+ atomic_read(&orig_node->last_ttvn)))
+ return false;
+
+ batadv_dbg(BATADV_DBG_TT, bat_priv,
+ "Added temporary global client (addr: %pM, vid: %d, orig: %pM)\n",
+ addr, batadv_print_vid(vid), orig_node->orig);
+
+ return true;
+}
+
+/**
+ * batadv_tt_local_resize_to_mtu() - resize the local translation table fit the
+ * maximum packet size that can be transported through the mesh
+ * @mesh_iface: netdev struct of the mesh interface
+ *
+ * Remove entries older than 'timeout' and half timeout if more entries need
+ * to be removed.
+ */
+void batadv_tt_local_resize_to_mtu(struct net_device *mesh_iface)
+{
+ struct batadv_priv *bat_priv = netdev_priv(mesh_iface);
+ int packet_size_max = atomic_read(&bat_priv->packet_size_max);
+ int table_size, timeout = BATADV_TT_LOCAL_TIMEOUT / 2;
+ bool reduced = false;
+
+ spin_lock_bh(&bat_priv->tt.commit_lock);
+
+ while (timeout) {
+ table_size = batadv_tt_local_table_transmit_size(bat_priv);
+ if (packet_size_max >= table_size)
+ break;
+
+ batadv_tt_local_purge(bat_priv, timeout);
+ batadv_tt_local_purge_pending_clients(bat_priv);
+
+ timeout /= 2;
+ reduced = true;
+ net_ratelimited_function(batadv_info, mesh_iface,
+ "Forced to purge local tt entries to fit new maximum fragment MTU (%i)\n",
+ packet_size_max);
+ }
+
+ /* commit these changes immediately, to avoid synchronization problem
+ * with the TTVN
+ */
+ if (reduced)
+ batadv_tt_local_commit_changes_nolock(bat_priv);
+
+ spin_unlock_bh(&bat_priv->tt.commit_lock);
+}
+
+/**
+ * batadv_tt_tvlv_ogm_handler_v1() - process incoming tt tvlv container
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @orig: the orig_node of the ogm
+ * @flags: flags indicating the tvlv state (see batadv_tvlv_handler_flags)
+ * @tvlv_value: tvlv buffer containing the gateway data
+ * @tvlv_value_len: tvlv buffer length
+ */
+static void batadv_tt_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig,
+ u8 flags, void *tvlv_value,
+ u16 tvlv_value_len)
+{
+ struct batadv_tvlv_tt_change *tt_change;
+ struct batadv_tvlv_tt_data *tt_data;
+ u16 num_entries, num_vlan;
+ size_t tt_data_sz;
+
+ if (tvlv_value_len < sizeof(*tt_data))
+ return;
+
+ tt_data = tvlv_value;
+ num_vlan = ntohs(tt_data->num_vlan);
+
+ tt_data_sz = struct_size(tt_data, vlan_data, num_vlan);
+ if (tvlv_value_len < tt_data_sz)
+ return;
+
+ tt_change = (struct batadv_tvlv_tt_change *)((void *)tt_data
+ + tt_data_sz);
+ tvlv_value_len -= tt_data_sz;
+
+ num_entries = batadv_tt_entries(tvlv_value_len);
+
+ batadv_tt_update_orig(bat_priv, orig, tt_data->vlan_data, num_vlan,
+ tt_change, num_entries, tt_data->ttvn);
+}
+
+/**
+ * batadv_tt_tvlv_unicast_handler_v1() - process incoming (unicast) tt tvlv
+ * container
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @src: mac address of tt tvlv sender
+ * @dst: mac address of tt tvlv recipient
+ * @tvlv_value: tvlv buffer containing the tt data
+ * @tvlv_value_len: tvlv buffer length
+ *
+ * Return: NET_RX_DROP if the tt tvlv is to be re-routed, NET_RX_SUCCESS
+ * otherwise.
+ */
+static int batadv_tt_tvlv_unicast_handler_v1(struct batadv_priv *bat_priv,
+ u8 *src, u8 *dst,
+ void *tvlv_value,
+ u16 tvlv_value_len)
+{
+ struct batadv_tvlv_tt_data *tt_data;
+ u16 tt_vlan_len, tt_num_entries;
+ char tt_flag;
+ bool ret;
+
+ if (tvlv_value_len < sizeof(*tt_data))
+ return NET_RX_SUCCESS;
+
+ tt_data = tvlv_value;
+ tvlv_value_len -= sizeof(*tt_data);
+
+ tt_vlan_len = flex_array_size(tt_data, vlan_data,
+ ntohs(tt_data->num_vlan));
+
+ if (tvlv_value_len < tt_vlan_len)
+ return NET_RX_SUCCESS;
+
+ tvlv_value_len -= tt_vlan_len;
+ tt_num_entries = batadv_tt_entries(tvlv_value_len);
+
+ switch (tt_data->flags & BATADV_TT_DATA_TYPE_MASK) {
+ case BATADV_TT_REQUEST:
+ batadv_inc_counter(bat_priv, BATADV_CNT_TT_REQUEST_RX);
+
+ /* If this node cannot provide a TT response the tt_request is
+ * forwarded
+ */
+ ret = batadv_send_tt_response(bat_priv, tt_data, src, dst);
+ if (!ret) {
+ if (tt_data->flags & BATADV_TT_FULL_TABLE)
+ tt_flag = 'F';
+ else
+ tt_flag = '.';
+
+ batadv_dbg(BATADV_DBG_TT, bat_priv,
+ "Routing TT_REQUEST to %pM [%c]\n",
+ dst, tt_flag);
+ /* tvlv API will re-route the packet */
+ return NET_RX_DROP;
+ }
+ break;
+ case BATADV_TT_RESPONSE:
+ batadv_inc_counter(bat_priv, BATADV_CNT_TT_RESPONSE_RX);
+
+ if (batadv_is_my_mac(bat_priv, dst)) {
+ batadv_handle_tt_response(bat_priv, tt_data,
+ src, tt_num_entries);
+ return NET_RX_SUCCESS;
+ }
+
+ if (tt_data->flags & BATADV_TT_FULL_TABLE)
+ tt_flag = 'F';
+ else
+ tt_flag = '.';
+
+ batadv_dbg(BATADV_DBG_TT, bat_priv,
+ "Routing TT_RESPONSE to %pM [%c]\n", dst, tt_flag);
+
+ /* tvlv API will re-route the packet */
+ return NET_RX_DROP;
+ }
+
+ return NET_RX_SUCCESS;
+}
+
+/**
+ * batadv_roam_tvlv_unicast_handler_v1() - process incoming tt roam tvlv
+ * container
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @src: mac address of tt tvlv sender
+ * @dst: mac address of tt tvlv recipient
+ * @tvlv_value: tvlv buffer containing the tt data
+ * @tvlv_value_len: tvlv buffer length
+ *
+ * Return: NET_RX_DROP if the tt roam tvlv is to be re-routed, NET_RX_SUCCESS
+ * otherwise.
+ */
+static int batadv_roam_tvlv_unicast_handler_v1(struct batadv_priv *bat_priv,
+ u8 *src, u8 *dst,
+ void *tvlv_value,
+ u16 tvlv_value_len)
+{
+ struct batadv_tvlv_roam_adv *roaming_adv;
+ struct batadv_orig_node *orig_node = NULL;
+
+ /* If this node is not the intended recipient of the
+ * roaming advertisement the packet is forwarded
+ * (the tvlv API will re-route the packet).
+ */
+ if (!batadv_is_my_mac(bat_priv, dst))
+ return NET_RX_DROP;
+
+ if (tvlv_value_len < sizeof(*roaming_adv))
+ goto out;
+
+ orig_node = batadv_orig_hash_find(bat_priv, src);
+ if (!orig_node)
+ goto out;
+
+ batadv_inc_counter(bat_priv, BATADV_CNT_TT_ROAM_ADV_RX);
+ roaming_adv = tvlv_value;
+
+ batadv_dbg(BATADV_DBG_TT, bat_priv,
+ "Received ROAMING_ADV from %pM (client %pM)\n",
+ src, roaming_adv->client);
+
+ batadv_tt_global_add(bat_priv, orig_node, roaming_adv->client,
+ ntohs(roaming_adv->vid), BATADV_TT_CLIENT_ROAM,
+ atomic_read(&orig_node->last_ttvn) + 1);
+
+out:
+ batadv_orig_node_put(orig_node);
+ return NET_RX_SUCCESS;
+}
+
+/**
+ * batadv_tt_init() - initialise the translation table internals
+ * @bat_priv: the bat priv with all the mesh interface information
+ *
+ * Return: 0 on success or negative error number in case of failure.
+ */
+int batadv_tt_init(struct batadv_priv *bat_priv)
+{
+ int ret;
+
+ /* synchronized flags must be remote */
+ BUILD_BUG_ON(!(BATADV_TT_SYNC_MASK & BATADV_TT_REMOTE_MASK));
+
+ ret = batadv_tt_local_init(bat_priv);
+ if (ret < 0)
+ return ret;
+
+ ret = batadv_tt_global_init(bat_priv);
+ if (ret < 0) {
+ batadv_tt_local_table_free(bat_priv);
+ return ret;
+ }
+
+ batadv_tvlv_handler_register(bat_priv, batadv_tt_tvlv_ogm_handler_v1,
+ batadv_tt_tvlv_unicast_handler_v1, NULL,
+ BATADV_TVLV_TT, 1, BATADV_NO_FLAGS);
+
+ batadv_tvlv_handler_register(bat_priv, NULL,
+ batadv_roam_tvlv_unicast_handler_v1, NULL,
+ BATADV_TVLV_ROAM, 1, BATADV_NO_FLAGS);
+
+ INIT_DELAYED_WORK(&bat_priv->tt.work, batadv_tt_purge);
+ queue_delayed_work(batadv_event_workqueue, &bat_priv->tt.work,
+ msecs_to_jiffies(BATADV_TT_WORK_PERIOD));
+
+ return 1;
+}
+
+/**
+ * batadv_tt_global_is_isolated() - check if a client is marked as isolated
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @addr: the mac address of the client
+ * @vid: the identifier of the VLAN where this client is connected
+ *
+ * Return: true if the client is marked with the TT_CLIENT_ISOLA flag, false
+ * otherwise
+ */
+bool batadv_tt_global_is_isolated(struct batadv_priv *bat_priv,
+ const u8 *addr, unsigned short vid)
+{
+ struct batadv_tt_global_entry *tt;
+ bool ret;
+
+ tt = batadv_tt_global_hash_find(bat_priv, addr, vid);
+ if (!tt)
+ return false;
+
+ ret = tt->common.flags & BATADV_TT_CLIENT_ISOLA;
+
+ batadv_tt_global_entry_put(tt);
+
+ return ret;
+}
+
+/**
+ * batadv_tt_cache_init() - Initialize tt memory object cache
+ *
+ * Return: 0 on success or negative error number in case of failure.
+ */
+int __init batadv_tt_cache_init(void)
+{
+ size_t tl_size = sizeof(struct batadv_tt_local_entry);
+ size_t tg_size = sizeof(struct batadv_tt_global_entry);
+ size_t tt_orig_size = sizeof(struct batadv_tt_orig_list_entry);
+ size_t tt_change_size = sizeof(struct batadv_tt_change_node);
+ size_t tt_req_size = sizeof(struct batadv_tt_req_node);
+ size_t tt_roam_size = sizeof(struct batadv_tt_roam_node);
+
+ batadv_tl_cache = kmem_cache_create("batadv_tl_cache", tl_size, 0,
+ SLAB_HWCACHE_ALIGN, NULL);
+ if (!batadv_tl_cache)
+ return -ENOMEM;
+
+ batadv_tg_cache = kmem_cache_create("batadv_tg_cache", tg_size, 0,
+ SLAB_HWCACHE_ALIGN, NULL);
+ if (!batadv_tg_cache)
+ goto err_tt_tl_destroy;
+
+ batadv_tt_orig_cache = kmem_cache_create("batadv_tt_orig_cache",
+ tt_orig_size, 0,
+ SLAB_HWCACHE_ALIGN, NULL);
+ if (!batadv_tt_orig_cache)
+ goto err_tt_tg_destroy;
+
+ batadv_tt_change_cache = kmem_cache_create("batadv_tt_change_cache",
+ tt_change_size, 0,
+ SLAB_HWCACHE_ALIGN, NULL);
+ if (!batadv_tt_change_cache)
+ goto err_tt_orig_destroy;
+
+ batadv_tt_req_cache = kmem_cache_create("batadv_tt_req_cache",
+ tt_req_size, 0,
+ SLAB_HWCACHE_ALIGN, NULL);
+ if (!batadv_tt_req_cache)
+ goto err_tt_change_destroy;
+
+ batadv_tt_roam_cache = kmem_cache_create("batadv_tt_roam_cache",
+ tt_roam_size, 0,
+ SLAB_HWCACHE_ALIGN, NULL);
+ if (!batadv_tt_roam_cache)
+ goto err_tt_req_destroy;
+
+ return 0;
+
+err_tt_req_destroy:
+ kmem_cache_destroy(batadv_tt_req_cache);
+ batadv_tt_req_cache = NULL;
+err_tt_change_destroy:
+ kmem_cache_destroy(batadv_tt_change_cache);
+ batadv_tt_change_cache = NULL;
+err_tt_orig_destroy:
+ kmem_cache_destroy(batadv_tt_orig_cache);
+ batadv_tt_orig_cache = NULL;
+err_tt_tg_destroy:
+ kmem_cache_destroy(batadv_tg_cache);
+ batadv_tg_cache = NULL;
+err_tt_tl_destroy:
+ kmem_cache_destroy(batadv_tl_cache);
+ batadv_tl_cache = NULL;
+
+ return -ENOMEM;
+}
+
+/**
+ * batadv_tt_cache_destroy() - Destroy tt memory object cache
+ */
+void batadv_tt_cache_destroy(void)
+{
+ kmem_cache_destroy(batadv_tl_cache);
+ kmem_cache_destroy(batadv_tg_cache);
+ kmem_cache_destroy(batadv_tt_orig_cache);
+ kmem_cache_destroy(batadv_tt_change_cache);
+ kmem_cache_destroy(batadv_tt_req_cache);
+ kmem_cache_destroy(batadv_tt_roam_cache);
+}
diff --git a/net/batman-adv/translation-table.h b/net/batman-adv/translation-table.h
new file mode 100644
index 000000000000..618d9dbca5ea
--- /dev/null
+++ b/net/batman-adv/translation-table.h
@@ -0,0 +1,74 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner, Simon Wunderlich, Antonio Quartulli
+ */
+
+#ifndef _NET_BATMAN_ADV_TRANSLATION_TABLE_H_
+#define _NET_BATMAN_ADV_TRANSLATION_TABLE_H_
+
+#include "main.h"
+
+#include <linux/kref.h>
+#include <linux/netdevice.h>
+#include <linux/netlink.h>
+#include <linux/skbuff.h>
+#include <linux/types.h>
+
+int batadv_tt_init(struct batadv_priv *bat_priv);
+bool batadv_tt_local_add(struct net_device *mesh_iface, const u8 *addr,
+ unsigned short vid, int ifindex, u32 mark);
+u16 batadv_tt_local_remove(struct batadv_priv *bat_priv,
+ const u8 *addr, unsigned short vid,
+ const char *message, bool roaming);
+int batadv_tt_local_dump(struct sk_buff *msg, struct netlink_callback *cb);
+int batadv_tt_global_dump(struct sk_buff *msg, struct netlink_callback *cb);
+void batadv_tt_global_del_orig(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node,
+ s32 match_vid, const char *message);
+struct batadv_tt_global_entry *
+batadv_tt_global_hash_find(struct batadv_priv *bat_priv, const u8 *addr,
+ unsigned short vid);
+void batadv_tt_global_entry_release(struct kref *ref);
+int batadv_tt_global_hash_count(struct batadv_priv *bat_priv,
+ const u8 *addr, unsigned short vid);
+struct batadv_orig_node *batadv_transtable_search(struct batadv_priv *bat_priv,
+ const u8 *src, const u8 *addr,
+ unsigned short vid);
+void batadv_tt_free(struct batadv_priv *bat_priv);
+bool batadv_is_my_client(struct batadv_priv *bat_priv, const u8 *addr,
+ unsigned short vid);
+bool batadv_is_ap_isolated(struct batadv_priv *bat_priv, u8 *src, u8 *dst,
+ unsigned short vid);
+void batadv_tt_local_commit_changes(struct batadv_priv *bat_priv);
+bool batadv_tt_global_client_is_roaming(struct batadv_priv *bat_priv,
+ u8 *addr, unsigned short vid);
+bool batadv_tt_local_client_is_roaming(struct batadv_priv *bat_priv,
+ u8 *addr, unsigned short vid);
+void batadv_tt_local_resize_to_mtu(struct net_device *mesh_iface);
+bool batadv_tt_add_temporary_global_entry(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node,
+ const unsigned char *addr,
+ unsigned short vid);
+bool batadv_tt_global_is_isolated(struct batadv_priv *bat_priv,
+ const u8 *addr, unsigned short vid);
+
+int batadv_tt_cache_init(void);
+void batadv_tt_cache_destroy(void);
+
+/**
+ * batadv_tt_global_entry_put() - decrement the tt_global_entry refcounter and
+ * possibly release it
+ * @tt_global_entry: tt_global_entry to be free'd
+ */
+static inline void
+batadv_tt_global_entry_put(struct batadv_tt_global_entry *tt_global_entry)
+{
+ if (!tt_global_entry)
+ return;
+
+ kref_put(&tt_global_entry->common.refcount,
+ batadv_tt_global_entry_release);
+}
+
+#endif /* _NET_BATMAN_ADV_TRANSLATION_TABLE_H_ */
diff --git a/net/batman-adv/tvlv.c b/net/batman-adv/tvlv.c
new file mode 100644
index 000000000000..76dff1f9c559
--- /dev/null
+++ b/net/batman-adv/tvlv.c
@@ -0,0 +1,663 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner, Simon Wunderlich
+ */
+
+#include "main.h"
+
+#include <linux/byteorder/generic.h>
+#include <linux/container_of.h>
+#include <linux/etherdevice.h>
+#include <linux/gfp.h>
+#include <linux/if_ether.h>
+#include <linux/kref.h>
+#include <linux/list.h>
+#include <linux/lockdep.h>
+#include <linux/netdevice.h>
+#include <linux/pkt_sched.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/stddef.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <uapi/linux/batadv_packet.h>
+
+#include "originator.h"
+#include "send.h"
+#include "tvlv.h"
+
+/**
+ * batadv_tvlv_handler_release() - release tvlv handler from lists and queue for
+ * free after rcu grace period
+ * @ref: kref pointer of the tvlv
+ */
+static void batadv_tvlv_handler_release(struct kref *ref)
+{
+ struct batadv_tvlv_handler *tvlv_handler;
+
+ tvlv_handler = container_of(ref, struct batadv_tvlv_handler, refcount);
+ kfree_rcu(tvlv_handler, rcu);
+}
+
+/**
+ * batadv_tvlv_handler_put() - decrement the tvlv container refcounter and
+ * possibly release it
+ * @tvlv_handler: the tvlv handler to free
+ */
+static void batadv_tvlv_handler_put(struct batadv_tvlv_handler *tvlv_handler)
+{
+ if (!tvlv_handler)
+ return;
+
+ kref_put(&tvlv_handler->refcount, batadv_tvlv_handler_release);
+}
+
+/**
+ * batadv_tvlv_handler_get() - retrieve tvlv handler from the tvlv handler list
+ * based on the provided type and version (both need to match)
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @type: tvlv handler type to look for
+ * @version: tvlv handler version to look for
+ *
+ * Return: tvlv handler if found or NULL otherwise.
+ */
+static struct batadv_tvlv_handler *
+batadv_tvlv_handler_get(struct batadv_priv *bat_priv, u8 type, u8 version)
+{
+ struct batadv_tvlv_handler *tvlv_handler_tmp, *tvlv_handler = NULL;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(tvlv_handler_tmp,
+ &bat_priv->tvlv.handler_list, list) {
+ if (tvlv_handler_tmp->type != type)
+ continue;
+
+ if (tvlv_handler_tmp->version != version)
+ continue;
+
+ if (!kref_get_unless_zero(&tvlv_handler_tmp->refcount))
+ continue;
+
+ tvlv_handler = tvlv_handler_tmp;
+ break;
+ }
+ rcu_read_unlock();
+
+ return tvlv_handler;
+}
+
+/**
+ * batadv_tvlv_container_release() - release tvlv from lists and free
+ * @ref: kref pointer of the tvlv
+ */
+static void batadv_tvlv_container_release(struct kref *ref)
+{
+ struct batadv_tvlv_container *tvlv;
+
+ tvlv = container_of(ref, struct batadv_tvlv_container, refcount);
+ kfree(tvlv);
+}
+
+/**
+ * batadv_tvlv_container_put() - decrement the tvlv container refcounter and
+ * possibly release it
+ * @tvlv: the tvlv container to free
+ */
+static void batadv_tvlv_container_put(struct batadv_tvlv_container *tvlv)
+{
+ if (!tvlv)
+ return;
+
+ kref_put(&tvlv->refcount, batadv_tvlv_container_release);
+}
+
+/**
+ * batadv_tvlv_container_get() - retrieve tvlv container from the tvlv container
+ * list based on the provided type and version (both need to match)
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @type: tvlv container type to look for
+ * @version: tvlv container version to look for
+ *
+ * Has to be called with the appropriate locks being acquired
+ * (tvlv.container_list_lock).
+ *
+ * Return: tvlv container if found or NULL otherwise.
+ */
+static struct batadv_tvlv_container *
+batadv_tvlv_container_get(struct batadv_priv *bat_priv, u8 type, u8 version)
+{
+ struct batadv_tvlv_container *tvlv_tmp, *tvlv = NULL;
+
+ lockdep_assert_held(&bat_priv->tvlv.container_list_lock);
+
+ hlist_for_each_entry(tvlv_tmp, &bat_priv->tvlv.container_list, list) {
+ if (tvlv_tmp->tvlv_hdr.type != type)
+ continue;
+
+ if (tvlv_tmp->tvlv_hdr.version != version)
+ continue;
+
+ kref_get(&tvlv_tmp->refcount);
+ tvlv = tvlv_tmp;
+ break;
+ }
+
+ return tvlv;
+}
+
+/**
+ * batadv_tvlv_container_list_size() - calculate the size of the tvlv container
+ * list entries
+ * @bat_priv: the bat priv with all the mesh interface information
+ *
+ * Has to be called with the appropriate locks being acquired
+ * (tvlv.container_list_lock).
+ *
+ * Return: size of all currently registered tvlv containers in bytes.
+ */
+static u16 batadv_tvlv_container_list_size(struct batadv_priv *bat_priv)
+{
+ struct batadv_tvlv_container *tvlv;
+ u16 tvlv_len = 0;
+
+ lockdep_assert_held(&bat_priv->tvlv.container_list_lock);
+
+ hlist_for_each_entry(tvlv, &bat_priv->tvlv.container_list, list) {
+ tvlv_len += sizeof(struct batadv_tvlv_hdr);
+ tvlv_len += ntohs(tvlv->tvlv_hdr.len);
+ }
+
+ return tvlv_len;
+}
+
+/**
+ * batadv_tvlv_container_remove() - remove tvlv container from the tvlv
+ * container list
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @tvlv: the to be removed tvlv container
+ *
+ * Has to be called with the appropriate locks being acquired
+ * (tvlv.container_list_lock).
+ */
+static void batadv_tvlv_container_remove(struct batadv_priv *bat_priv,
+ struct batadv_tvlv_container *tvlv)
+{
+ lockdep_assert_held(&bat_priv->tvlv.container_list_lock);
+
+ if (!tvlv)
+ return;
+
+ hlist_del(&tvlv->list);
+
+ /* first call to decrement the counter, second call to free */
+ batadv_tvlv_container_put(tvlv);
+ batadv_tvlv_container_put(tvlv);
+}
+
+/**
+ * batadv_tvlv_container_unregister() - unregister tvlv container based on the
+ * provided type and version (both need to match)
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @type: tvlv container type to unregister
+ * @version: tvlv container type to unregister
+ */
+void batadv_tvlv_container_unregister(struct batadv_priv *bat_priv,
+ u8 type, u8 version)
+{
+ struct batadv_tvlv_container *tvlv;
+
+ spin_lock_bh(&bat_priv->tvlv.container_list_lock);
+ tvlv = batadv_tvlv_container_get(bat_priv, type, version);
+ batadv_tvlv_container_remove(bat_priv, tvlv);
+ spin_unlock_bh(&bat_priv->tvlv.container_list_lock);
+}
+
+/**
+ * batadv_tvlv_container_register() - register tvlv type, version and content
+ * to be propagated with each (primary interface) OGM
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @type: tvlv container type
+ * @version: tvlv container version
+ * @tvlv_value: tvlv container content
+ * @tvlv_value_len: tvlv container content length
+ *
+ * If a container of the same type and version was already registered the new
+ * content is going to replace the old one.
+ */
+void batadv_tvlv_container_register(struct batadv_priv *bat_priv,
+ u8 type, u8 version,
+ void *tvlv_value, u16 tvlv_value_len)
+{
+ struct batadv_tvlv_container *tvlv_old, *tvlv_new;
+
+ if (!tvlv_value)
+ tvlv_value_len = 0;
+
+ tvlv_new = kzalloc(sizeof(*tvlv_new) + tvlv_value_len, GFP_ATOMIC);
+ if (!tvlv_new)
+ return;
+
+ tvlv_new->tvlv_hdr.version = version;
+ tvlv_new->tvlv_hdr.type = type;
+ tvlv_new->tvlv_hdr.len = htons(tvlv_value_len);
+
+ memcpy(tvlv_new + 1, tvlv_value, ntohs(tvlv_new->tvlv_hdr.len));
+ INIT_HLIST_NODE(&tvlv_new->list);
+ kref_init(&tvlv_new->refcount);
+
+ spin_lock_bh(&bat_priv->tvlv.container_list_lock);
+ tvlv_old = batadv_tvlv_container_get(bat_priv, type, version);
+ batadv_tvlv_container_remove(bat_priv, tvlv_old);
+
+ kref_get(&tvlv_new->refcount);
+ hlist_add_head(&tvlv_new->list, &bat_priv->tvlv.container_list);
+ spin_unlock_bh(&bat_priv->tvlv.container_list_lock);
+
+ /* don't return reference to new tvlv_container */
+ batadv_tvlv_container_put(tvlv_new);
+}
+
+/**
+ * batadv_tvlv_realloc_packet_buff() - reallocate packet buffer to accommodate
+ * requested packet size
+ * @packet_buff: packet buffer
+ * @packet_buff_len: packet buffer size
+ * @min_packet_len: requested packet minimum size
+ * @additional_packet_len: requested additional packet size on top of minimum
+ * size
+ *
+ * Return: true of the packet buffer could be changed to the requested size,
+ * false otherwise.
+ */
+static bool batadv_tvlv_realloc_packet_buff(unsigned char **packet_buff,
+ int *packet_buff_len,
+ int min_packet_len,
+ int additional_packet_len)
+{
+ unsigned char *new_buff;
+
+ new_buff = kmalloc(min_packet_len + additional_packet_len, GFP_ATOMIC);
+
+ /* keep old buffer if kmalloc should fail */
+ if (!new_buff)
+ return false;
+
+ memcpy(new_buff, *packet_buff, min_packet_len);
+ kfree(*packet_buff);
+ *packet_buff = new_buff;
+ *packet_buff_len = min_packet_len + additional_packet_len;
+
+ return true;
+}
+
+/**
+ * batadv_tvlv_container_ogm_append() - append tvlv container content to given
+ * OGM packet buffer
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @packet_buff: ogm packet buffer
+ * @packet_buff_len: ogm packet buffer size including ogm header and tvlv
+ * content
+ * @packet_min_len: ogm header size to be preserved for the OGM itself
+ *
+ * The ogm packet might be enlarged or shrunk depending on the current size
+ * and the size of the to-be-appended tvlv containers.
+ *
+ * Return: size of all appended tvlv containers in bytes.
+ */
+u16 batadv_tvlv_container_ogm_append(struct batadv_priv *bat_priv,
+ unsigned char **packet_buff,
+ int *packet_buff_len, int packet_min_len)
+{
+ struct batadv_tvlv_container *tvlv;
+ struct batadv_tvlv_hdr *tvlv_hdr;
+ u16 tvlv_value_len;
+ void *tvlv_value;
+ bool ret;
+
+ spin_lock_bh(&bat_priv->tvlv.container_list_lock);
+ tvlv_value_len = batadv_tvlv_container_list_size(bat_priv);
+
+ ret = batadv_tvlv_realloc_packet_buff(packet_buff, packet_buff_len,
+ packet_min_len, tvlv_value_len);
+
+ if (!ret)
+ goto end;
+
+ if (!tvlv_value_len)
+ goto end;
+
+ tvlv_value = (*packet_buff) + packet_min_len;
+
+ hlist_for_each_entry(tvlv, &bat_priv->tvlv.container_list, list) {
+ tvlv_hdr = tvlv_value;
+ tvlv_hdr->type = tvlv->tvlv_hdr.type;
+ tvlv_hdr->version = tvlv->tvlv_hdr.version;
+ tvlv_hdr->len = tvlv->tvlv_hdr.len;
+ tvlv_value = tvlv_hdr + 1;
+ memcpy(tvlv_value, tvlv + 1, ntohs(tvlv->tvlv_hdr.len));
+ tvlv_value = (u8 *)tvlv_value + ntohs(tvlv->tvlv_hdr.len);
+ }
+
+end:
+ spin_unlock_bh(&bat_priv->tvlv.container_list_lock);
+ return tvlv_value_len;
+}
+
+/**
+ * batadv_tvlv_call_handler() - parse the given tvlv buffer to call the
+ * appropriate handlers
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @tvlv_handler: tvlv callback function handling the tvlv content
+ * @packet_type: indicates for which packet type the TVLV handler is called
+ * @orig_node: orig node emitting the ogm packet
+ * @skb: the skb the TVLV handler is called for
+ * @tvlv_value: tvlv content
+ * @tvlv_value_len: tvlv content length
+ *
+ * Return: success if the handler was not found or the return value of the
+ * handler callback.
+ */
+static int batadv_tvlv_call_handler(struct batadv_priv *bat_priv,
+ struct batadv_tvlv_handler *tvlv_handler,
+ u8 packet_type,
+ struct batadv_orig_node *orig_node,
+ struct sk_buff *skb, void *tvlv_value,
+ u16 tvlv_value_len)
+{
+ unsigned int tvlv_offset;
+ u8 *src, *dst;
+
+ if (!tvlv_handler)
+ return NET_RX_SUCCESS;
+
+ switch (packet_type) {
+ case BATADV_IV_OGM:
+ case BATADV_OGM2:
+ if (!tvlv_handler->ogm_handler)
+ return NET_RX_SUCCESS;
+
+ if (!orig_node)
+ return NET_RX_SUCCESS;
+
+ tvlv_handler->ogm_handler(bat_priv, orig_node,
+ BATADV_NO_FLAGS,
+ tvlv_value, tvlv_value_len);
+ tvlv_handler->flags |= BATADV_TVLV_HANDLER_OGM_CALLED;
+ break;
+ case BATADV_UNICAST_TVLV:
+ if (!skb)
+ return NET_RX_SUCCESS;
+
+ if (!tvlv_handler->unicast_handler)
+ return NET_RX_SUCCESS;
+
+ src = ((struct batadv_unicast_tvlv_packet *)skb->data)->src;
+ dst = ((struct batadv_unicast_tvlv_packet *)skb->data)->dst;
+
+ return tvlv_handler->unicast_handler(bat_priv, src,
+ dst, tvlv_value,
+ tvlv_value_len);
+ case BATADV_MCAST:
+ if (!skb)
+ return NET_RX_SUCCESS;
+
+ if (!tvlv_handler->mcast_handler)
+ return NET_RX_SUCCESS;
+
+ tvlv_offset = (unsigned char *)tvlv_value - skb->data;
+ skb_set_network_header(skb, tvlv_offset);
+ skb_set_transport_header(skb, tvlv_offset + tvlv_value_len);
+
+ return tvlv_handler->mcast_handler(bat_priv, skb);
+ }
+
+ return NET_RX_SUCCESS;
+}
+
+/**
+ * batadv_tvlv_containers_process() - parse the given tvlv buffer to call the
+ * appropriate handlers
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @packet_type: indicates for which packet type the TVLV handler is called
+ * @orig_node: orig node emitting the ogm packet
+ * @skb: the skb the TVLV handler is called for
+ * @tvlv_value: tvlv content
+ * @tvlv_value_len: tvlv content length
+ *
+ * Return: success when processing an OGM or the return value of all called
+ * handler callbacks.
+ */
+int batadv_tvlv_containers_process(struct batadv_priv *bat_priv,
+ u8 packet_type,
+ struct batadv_orig_node *orig_node,
+ struct sk_buff *skb, void *tvlv_value,
+ u16 tvlv_value_len)
+{
+ struct batadv_tvlv_handler *tvlv_handler;
+ struct batadv_tvlv_hdr *tvlv_hdr;
+ u16 tvlv_value_cont_len;
+ u8 cifnotfound = BATADV_TVLV_HANDLER_OGM_CIFNOTFND;
+ int ret = NET_RX_SUCCESS;
+
+ while (tvlv_value_len >= sizeof(*tvlv_hdr)) {
+ tvlv_hdr = tvlv_value;
+ tvlv_value_cont_len = ntohs(tvlv_hdr->len);
+ tvlv_value = tvlv_hdr + 1;
+ tvlv_value_len -= sizeof(*tvlv_hdr);
+
+ if (tvlv_value_cont_len > tvlv_value_len)
+ break;
+
+ tvlv_handler = batadv_tvlv_handler_get(bat_priv,
+ tvlv_hdr->type,
+ tvlv_hdr->version);
+
+ ret |= batadv_tvlv_call_handler(bat_priv, tvlv_handler,
+ packet_type, orig_node, skb,
+ tvlv_value,
+ tvlv_value_cont_len);
+ batadv_tvlv_handler_put(tvlv_handler);
+ tvlv_value = (u8 *)tvlv_value + tvlv_value_cont_len;
+ tvlv_value_len -= tvlv_value_cont_len;
+ }
+
+ if (packet_type != BATADV_IV_OGM &&
+ packet_type != BATADV_OGM2)
+ return ret;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(tvlv_handler,
+ &bat_priv->tvlv.handler_list, list) {
+ if (!tvlv_handler->ogm_handler)
+ continue;
+
+ if ((tvlv_handler->flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND) &&
+ !(tvlv_handler->flags & BATADV_TVLV_HANDLER_OGM_CALLED))
+ tvlv_handler->ogm_handler(bat_priv, orig_node,
+ cifnotfound, NULL, 0);
+
+ tvlv_handler->flags &= ~BATADV_TVLV_HANDLER_OGM_CALLED;
+ }
+ rcu_read_unlock();
+
+ return NET_RX_SUCCESS;
+}
+
+/**
+ * batadv_tvlv_ogm_receive() - process an incoming ogm and call the appropriate
+ * handlers
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @batadv_ogm_packet: ogm packet containing the tvlv containers
+ * @orig_node: orig node emitting the ogm packet
+ */
+void batadv_tvlv_ogm_receive(struct batadv_priv *bat_priv,
+ struct batadv_ogm_packet *batadv_ogm_packet,
+ struct batadv_orig_node *orig_node)
+{
+ void *tvlv_value;
+ u16 tvlv_value_len;
+
+ if (!batadv_ogm_packet)
+ return;
+
+ tvlv_value_len = ntohs(batadv_ogm_packet->tvlv_len);
+ if (!tvlv_value_len)
+ return;
+
+ tvlv_value = batadv_ogm_packet + 1;
+
+ batadv_tvlv_containers_process(bat_priv, BATADV_IV_OGM, orig_node, NULL,
+ tvlv_value, tvlv_value_len);
+}
+
+/**
+ * batadv_tvlv_handler_register() - register tvlv handler based on the provided
+ * type and version (both need to match) for ogm tvlv payload and/or unicast
+ * payload
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @optr: ogm tvlv handler callback function. This function receives the orig
+ * node, flags and the tvlv content as argument to process.
+ * @uptr: unicast tvlv handler callback function. This function receives the
+ * source & destination of the unicast packet as well as the tvlv content
+ * to process.
+ * @mptr: multicast packet tvlv handler callback function. This function
+ * receives the full skb to process, with the skb network header pointing
+ * to the current tvlv and the skb transport header pointing to the first
+ * byte after the current tvlv.
+ * @type: tvlv handler type to be registered
+ * @version: tvlv handler version to be registered
+ * @flags: flags to enable or disable TVLV API behavior
+ */
+void batadv_tvlv_handler_register(struct batadv_priv *bat_priv,
+ void (*optr)(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig,
+ u8 flags,
+ void *tvlv_value,
+ u16 tvlv_value_len),
+ int (*uptr)(struct batadv_priv *bat_priv,
+ u8 *src, u8 *dst,
+ void *tvlv_value,
+ u16 tvlv_value_len),
+ int (*mptr)(struct batadv_priv *bat_priv,
+ struct sk_buff *skb),
+ u8 type, u8 version, u8 flags)
+{
+ struct batadv_tvlv_handler *tvlv_handler;
+
+ spin_lock_bh(&bat_priv->tvlv.handler_list_lock);
+
+ tvlv_handler = batadv_tvlv_handler_get(bat_priv, type, version);
+ if (tvlv_handler) {
+ spin_unlock_bh(&bat_priv->tvlv.handler_list_lock);
+ batadv_tvlv_handler_put(tvlv_handler);
+ return;
+ }
+
+ tvlv_handler = kzalloc(sizeof(*tvlv_handler), GFP_ATOMIC);
+ if (!tvlv_handler) {
+ spin_unlock_bh(&bat_priv->tvlv.handler_list_lock);
+ return;
+ }
+
+ tvlv_handler->ogm_handler = optr;
+ tvlv_handler->unicast_handler = uptr;
+ tvlv_handler->mcast_handler = mptr;
+ tvlv_handler->type = type;
+ tvlv_handler->version = version;
+ tvlv_handler->flags = flags;
+ kref_init(&tvlv_handler->refcount);
+ INIT_HLIST_NODE(&tvlv_handler->list);
+
+ kref_get(&tvlv_handler->refcount);
+ hlist_add_head_rcu(&tvlv_handler->list, &bat_priv->tvlv.handler_list);
+ spin_unlock_bh(&bat_priv->tvlv.handler_list_lock);
+
+ /* don't return reference to new tvlv_handler */
+ batadv_tvlv_handler_put(tvlv_handler);
+}
+
+/**
+ * batadv_tvlv_handler_unregister() - unregister tvlv handler based on the
+ * provided type and version (both need to match)
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @type: tvlv handler type to be unregistered
+ * @version: tvlv handler version to be unregistered
+ */
+void batadv_tvlv_handler_unregister(struct batadv_priv *bat_priv,
+ u8 type, u8 version)
+{
+ struct batadv_tvlv_handler *tvlv_handler;
+
+ tvlv_handler = batadv_tvlv_handler_get(bat_priv, type, version);
+ if (!tvlv_handler)
+ return;
+
+ batadv_tvlv_handler_put(tvlv_handler);
+ spin_lock_bh(&bat_priv->tvlv.handler_list_lock);
+ hlist_del_rcu(&tvlv_handler->list);
+ spin_unlock_bh(&bat_priv->tvlv.handler_list_lock);
+ batadv_tvlv_handler_put(tvlv_handler);
+}
+
+/**
+ * batadv_tvlv_unicast_send() - send a unicast packet with tvlv payload to the
+ * specified host
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @src: source mac address of the unicast packet
+ * @dst: destination mac address of the unicast packet
+ * @type: tvlv type
+ * @version: tvlv version
+ * @tvlv_value: tvlv content
+ * @tvlv_value_len: tvlv content length
+ */
+void batadv_tvlv_unicast_send(struct batadv_priv *bat_priv, const u8 *src,
+ const u8 *dst, u8 type, u8 version,
+ void *tvlv_value, u16 tvlv_value_len)
+{
+ struct batadv_unicast_tvlv_packet *unicast_tvlv_packet;
+ struct batadv_tvlv_hdr *tvlv_hdr;
+ struct batadv_orig_node *orig_node;
+ struct sk_buff *skb;
+ unsigned char *tvlv_buff;
+ unsigned int tvlv_len;
+ ssize_t hdr_len = sizeof(*unicast_tvlv_packet);
+
+ orig_node = batadv_orig_hash_find(bat_priv, dst);
+ if (!orig_node)
+ return;
+
+ tvlv_len = sizeof(*tvlv_hdr) + tvlv_value_len;
+
+ skb = netdev_alloc_skb_ip_align(NULL, ETH_HLEN + hdr_len + tvlv_len);
+ if (!skb)
+ goto out;
+
+ skb->priority = TC_PRIO_CONTROL;
+ skb_reserve(skb, ETH_HLEN);
+ tvlv_buff = skb_put(skb, sizeof(*unicast_tvlv_packet) + tvlv_len);
+ unicast_tvlv_packet = (struct batadv_unicast_tvlv_packet *)tvlv_buff;
+ unicast_tvlv_packet->packet_type = BATADV_UNICAST_TVLV;
+ unicast_tvlv_packet->version = BATADV_COMPAT_VERSION;
+ unicast_tvlv_packet->ttl = BATADV_TTL;
+ unicast_tvlv_packet->reserved = 0;
+ unicast_tvlv_packet->tvlv_len = htons(tvlv_len);
+ unicast_tvlv_packet->align = 0;
+ ether_addr_copy(unicast_tvlv_packet->src, src);
+ ether_addr_copy(unicast_tvlv_packet->dst, dst);
+
+ tvlv_buff = (unsigned char *)(unicast_tvlv_packet + 1);
+ tvlv_hdr = (struct batadv_tvlv_hdr *)tvlv_buff;
+ tvlv_hdr->version = version;
+ tvlv_hdr->type = type;
+ tvlv_hdr->len = htons(tvlv_value_len);
+ tvlv_buff += sizeof(*tvlv_hdr);
+ memcpy(tvlv_buff, tvlv_value, tvlv_value_len);
+
+ batadv_send_skb_to_orig(skb, orig_node, NULL);
+out:
+ batadv_orig_node_put(orig_node);
+}
diff --git a/net/batman-adv/tvlv.h b/net/batman-adv/tvlv.h
new file mode 100644
index 000000000000..e5697230d991
--- /dev/null
+++ b/net/batman-adv/tvlv.h
@@ -0,0 +1,52 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner, Simon Wunderlich
+ */
+
+#ifndef _NET_BATMAN_ADV_TVLV_H_
+#define _NET_BATMAN_ADV_TVLV_H_
+
+#include "main.h"
+
+#include <linux/skbuff.h>
+#include <linux/types.h>
+#include <uapi/linux/batadv_packet.h>
+
+void batadv_tvlv_container_register(struct batadv_priv *bat_priv,
+ u8 type, u8 version,
+ void *tvlv_value, u16 tvlv_value_len);
+u16 batadv_tvlv_container_ogm_append(struct batadv_priv *bat_priv,
+ unsigned char **packet_buff,
+ int *packet_buff_len, int packet_min_len);
+void batadv_tvlv_ogm_receive(struct batadv_priv *bat_priv,
+ struct batadv_ogm_packet *batadv_ogm_packet,
+ struct batadv_orig_node *orig_node);
+void batadv_tvlv_container_unregister(struct batadv_priv *bat_priv,
+ u8 type, u8 version);
+
+void batadv_tvlv_handler_register(struct batadv_priv *bat_priv,
+ void (*optr)(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig,
+ u8 flags,
+ void *tvlv_value,
+ u16 tvlv_value_len),
+ int (*uptr)(struct batadv_priv *bat_priv,
+ u8 *src, u8 *dst,
+ void *tvlv_value,
+ u16 tvlv_value_len),
+ int (*mptr)(struct batadv_priv *bat_priv,
+ struct sk_buff *skb),
+ u8 type, u8 version, u8 flags);
+void batadv_tvlv_handler_unregister(struct batadv_priv *bat_priv,
+ u8 type, u8 version);
+int batadv_tvlv_containers_process(struct batadv_priv *bat_priv,
+ u8 packet_type,
+ struct batadv_orig_node *orig_node,
+ struct sk_buff *skb, void *tvlv_buff,
+ u16 tvlv_buff_len);
+void batadv_tvlv_unicast_send(struct batadv_priv *bat_priv, const u8 *src,
+ const u8 *dst, u8 type, u8 version,
+ void *tvlv_value, u16 tvlv_value_len);
+
+#endif /* _NET_BATMAN_ADV_TVLV_H_ */
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
new file mode 100644
index 000000000000..8fc5fe0e9b05
--- /dev/null
+++ b/net/batman-adv/types.h
@@ -0,0 +1,2212 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner, Simon Wunderlich
+ */
+
+#ifndef _NET_BATMAN_ADV_TYPES_H_
+#define _NET_BATMAN_ADV_TYPES_H_
+
+#ifndef _NET_BATMAN_ADV_MAIN_H_
+#error only "main.h" can be included directly
+#endif
+
+#include <linux/average.h>
+#include <linux/bitops.h>
+#include <linux/compiler.h>
+#include <linux/if.h>
+#include <linux/if_ether.h>
+#include <linux/kref.h>
+#include <linux/mutex.h>
+#include <linux/netdevice.h>
+#include <linux/netlink.h>
+#include <linux/sched.h> /* for linux/wait.h */
+#include <linux/skbuff.h>
+#include <linux/spinlock.h>
+#include <linux/timer.h>
+#include <linux/types.h>
+#include <linux/wait.h>
+#include <linux/workqueue.h>
+#include <uapi/linux/batadv_packet.h>
+#include <uapi/linux/batman_adv.h>
+
+#ifdef CONFIG_BATMAN_ADV_DAT
+
+/**
+ * typedef batadv_dat_addr_t - type used for all DHT addresses
+ *
+ * If it is changed, BATADV_DAT_ADDR_MAX is changed as well.
+ *
+ * *Please be careful: batadv_dat_addr_t must be UNSIGNED*
+ */
+typedef u16 batadv_dat_addr_t;
+
+#endif /* CONFIG_BATMAN_ADV_DAT */
+
+/**
+ * enum batadv_dhcp_recipient - dhcp destination
+ */
+enum batadv_dhcp_recipient {
+ /** @BATADV_DHCP_NO: packet is not a dhcp message */
+ BATADV_DHCP_NO = 0,
+
+ /** @BATADV_DHCP_TO_SERVER: dhcp message is directed to a server */
+ BATADV_DHCP_TO_SERVER,
+
+ /** @BATADV_DHCP_TO_CLIENT: dhcp message is directed to a client */
+ BATADV_DHCP_TO_CLIENT,
+};
+
+/**
+ * BATADV_TT_REMOTE_MASK - bitmask selecting the flags that are sent over the
+ * wire only
+ */
+#define BATADV_TT_REMOTE_MASK 0x00FF
+
+/**
+ * BATADV_TT_SYNC_MASK - bitmask of the flags that need to be kept in sync
+ * among the nodes. These flags are used to compute the global/local CRC
+ */
+#define BATADV_TT_SYNC_MASK 0x00F0
+
+/**
+ * struct batadv_hard_iface_bat_iv - per hard-interface B.A.T.M.A.N. IV data
+ */
+struct batadv_hard_iface_bat_iv {
+ /** @ogm_buff: buffer holding the OGM packet */
+ unsigned char *ogm_buff;
+
+ /** @ogm_buff_len: length of the OGM packet buffer */
+ int ogm_buff_len;
+
+ /** @ogm_seqno: OGM sequence number - used to identify each OGM */
+ atomic_t ogm_seqno;
+
+ /** @ogm_buff_mutex: lock protecting ogm_buff and ogm_buff_len */
+ struct mutex ogm_buff_mutex;
+};
+
+/**
+ * enum batadv_v_hard_iface_flags - interface flags useful to B.A.T.M.A.N. V
+ */
+enum batadv_v_hard_iface_flags {
+ /**
+ * @BATADV_FULL_DUPLEX: tells if the connection over this link is
+ * full-duplex
+ */
+ BATADV_FULL_DUPLEX = BIT(0),
+
+ /**
+ * @BATADV_WARNING_DEFAULT: tells whether we have warned the user that
+ * no throughput data is available for this interface and that default
+ * values are assumed.
+ */
+ BATADV_WARNING_DEFAULT = BIT(1),
+};
+
+/**
+ * struct batadv_hard_iface_bat_v - per hard-interface B.A.T.M.A.N. V data
+ */
+struct batadv_hard_iface_bat_v {
+ /** @elp_interval: time interval between two ELP transmissions */
+ atomic_t elp_interval;
+
+ /** @elp_seqno: current ELP sequence number */
+ atomic_t elp_seqno;
+
+ /** @elp_skb: base skb containing the ELP message to send */
+ struct sk_buff *elp_skb;
+
+ /** @elp_wq: workqueue used to schedule ELP transmissions */
+ struct delayed_work elp_wq;
+
+ /** @aggr_wq: workqueue used to transmit queued OGM packets */
+ struct delayed_work aggr_wq;
+
+ /** @aggr_list: queue for to be aggregated OGM packets */
+ struct sk_buff_head aggr_list;
+
+ /** @aggr_len: size of the OGM aggregate (excluding ethernet header) */
+ unsigned int aggr_len;
+
+ /**
+ * @throughput_override: throughput override to disable link
+ * auto-detection
+ */
+ atomic_t throughput_override;
+
+ /** @flags: interface specific flags */
+ u8 flags;
+};
+
+/**
+ * enum batadv_hard_iface_wifi_flags - Flags describing the wifi configuration
+ * of a batadv_hard_iface
+ */
+enum batadv_hard_iface_wifi_flags {
+ /** @BATADV_HARDIF_WIFI_WEXT_DIRECT: it is a wext wifi device */
+ BATADV_HARDIF_WIFI_WEXT_DIRECT = BIT(0),
+
+ /** @BATADV_HARDIF_WIFI_CFG80211_DIRECT: it is a cfg80211 wifi device */
+ BATADV_HARDIF_WIFI_CFG80211_DIRECT = BIT(1),
+
+ /**
+ * @BATADV_HARDIF_WIFI_WEXT_INDIRECT: link device is a wext wifi device
+ */
+ BATADV_HARDIF_WIFI_WEXT_INDIRECT = BIT(2),
+
+ /**
+ * @BATADV_HARDIF_WIFI_CFG80211_INDIRECT: link device is a cfg80211 wifi
+ * device
+ */
+ BATADV_HARDIF_WIFI_CFG80211_INDIRECT = BIT(3),
+};
+
+/**
+ * struct batadv_hard_iface - network device known to batman-adv
+ */
+struct batadv_hard_iface {
+ /** @list: list node for batadv_hardif_list */
+ struct list_head list;
+
+ /** @if_status: status of the interface for batman-adv */
+ char if_status;
+
+ /**
+ * @num_bcasts: number of payload re-broadcasts on this interface (ARQ)
+ */
+ u8 num_bcasts;
+
+ /**
+ * @wifi_flags: flags whether this is (directly or indirectly) a wifi
+ * interface
+ */
+ u32 wifi_flags;
+
+ /** @net_dev: pointer to the net_device */
+ struct net_device *net_dev;
+
+ /** @dev_tracker: device tracker for @net_dev */
+ netdevice_tracker dev_tracker;
+
+ /** @refcount: number of contexts the object is used */
+ struct kref refcount;
+
+ /**
+ * @batman_adv_ptype: packet type describing packets that should be
+ * processed by batman-adv for this interface
+ */
+ struct packet_type batman_adv_ptype;
+
+ /**
+ * @mesh_iface: the batman-adv interface which uses this network
+ * interface
+ */
+ struct net_device *mesh_iface;
+
+ /** @meshif_dev_tracker: device tracker for @mesh_iface */
+ netdevice_tracker meshif_dev_tracker;
+
+ /** @rcu: struct used for freeing in an RCU-safe manner */
+ struct rcu_head rcu;
+
+ /**
+ * @hop_penalty: penalty which will be applied to the tq-field
+ * of an OGM received via this interface
+ */
+ atomic_t hop_penalty;
+
+ /** @bat_iv: per hard-interface B.A.T.M.A.N. IV data */
+ struct batadv_hard_iface_bat_iv bat_iv;
+
+#ifdef CONFIG_BATMAN_ADV_BATMAN_V
+ /** @bat_v: per hard-interface B.A.T.M.A.N. V data */
+ struct batadv_hard_iface_bat_v bat_v;
+#endif
+
+ /**
+ * @neigh_list: list of unique single hop neighbors via this interface
+ */
+ struct hlist_head neigh_list;
+
+ /** @neigh_list_lock: lock protecting neigh_list */
+ spinlock_t neigh_list_lock;
+};
+
+/**
+ * struct batadv_orig_ifinfo_bat_iv - B.A.T.M.A.N. IV private orig_ifinfo
+ * members
+ */
+struct batadv_orig_ifinfo_bat_iv {
+ /**
+ * @bcast_own: bitfield which counts the number of our OGMs this
+ * orig_node rebroadcasted "back" to us (relative to last_real_seqno)
+ */
+ DECLARE_BITMAP(bcast_own, BATADV_TQ_LOCAL_WINDOW_SIZE);
+
+ /** @bcast_own_sum: sum of bcast_own */
+ u8 bcast_own_sum;
+};
+
+/**
+ * struct batadv_orig_ifinfo - originator info per outgoing interface
+ */
+struct batadv_orig_ifinfo {
+ /** @list: list node for &batadv_orig_node.ifinfo_list */
+ struct hlist_node list;
+
+ /** @if_outgoing: pointer to outgoing hard-interface */
+ struct batadv_hard_iface *if_outgoing;
+
+ /** @router: router that should be used to reach this originator */
+ struct batadv_neigh_node __rcu *router;
+
+ /** @last_real_seqno: last and best known sequence number */
+ u32 last_real_seqno;
+
+ /** @last_ttl: ttl of last received packet */
+ u8 last_ttl;
+
+ /** @last_seqno_forwarded: seqno of the OGM which was forwarded last */
+ u32 last_seqno_forwarded;
+
+ /** @batman_seqno_reset: time when the batman seqno window was reset */
+ unsigned long batman_seqno_reset;
+
+ /** @bat_iv: B.A.T.M.A.N. IV private structure */
+ struct batadv_orig_ifinfo_bat_iv bat_iv;
+
+ /** @refcount: number of contexts the object is used */
+ struct kref refcount;
+
+ /** @rcu: struct used for freeing in an RCU-safe manner */
+ struct rcu_head rcu;
+};
+
+/**
+ * struct batadv_frag_table_entry - head in the fragment buffer table
+ */
+struct batadv_frag_table_entry {
+ /** @fragment_list: head of list with fragments */
+ struct hlist_head fragment_list;
+
+ /** @lock: lock to protect the list of fragments */
+ spinlock_t lock;
+
+ /** @timestamp: time (jiffy) of last received fragment */
+ unsigned long timestamp;
+
+ /** @seqno: sequence number of the fragments in the list */
+ u16 seqno;
+
+ /** @size: accumulated size of packets in list */
+ u16 size;
+
+ /** @total_size: expected size of the assembled packet */
+ u16 total_size;
+};
+
+/**
+ * struct batadv_frag_list_entry - entry in a list of fragments
+ */
+struct batadv_frag_list_entry {
+ /** @list: list node information */
+ struct hlist_node list;
+
+ /** @skb: fragment */
+ struct sk_buff *skb;
+
+ /** @no: fragment number in the set */
+ u8 no;
+};
+
+/**
+ * struct batadv_vlan_tt - VLAN specific TT attributes
+ */
+struct batadv_vlan_tt {
+ /** @crc: CRC32 checksum of the entries belonging to this vlan */
+ u32 crc;
+
+ /** @num_entries: number of TT entries for this VLAN */
+ atomic_t num_entries;
+};
+
+/**
+ * struct batadv_orig_node_vlan - VLAN specific data per orig_node
+ */
+struct batadv_orig_node_vlan {
+ /** @vid: the VLAN identifier */
+ unsigned short vid;
+
+ /** @tt: VLAN specific TT attributes */
+ struct batadv_vlan_tt tt;
+
+ /** @list: list node for &batadv_orig_node.vlan_list */
+ struct hlist_node list;
+
+ /**
+ * @refcount: number of context where this object is currently in use
+ */
+ struct kref refcount;
+
+ /** @rcu: struct used for freeing in a RCU-safe manner */
+ struct rcu_head rcu;
+};
+
+/**
+ * struct batadv_orig_bat_iv - B.A.T.M.A.N. IV private orig_node members
+ */
+struct batadv_orig_bat_iv {
+ /**
+ * @ogm_cnt_lock: lock protecting &batadv_orig_ifinfo_bat_iv.bcast_own,
+ * &batadv_orig_ifinfo_bat_iv.bcast_own_sum,
+ * &batadv_neigh_ifinfo_bat_iv.bat_iv.real_bits and
+ * &batadv_neigh_ifinfo_bat_iv.real_packet_count
+ */
+ spinlock_t ogm_cnt_lock;
+};
+
+/**
+ * struct batadv_orig_node - structure for orig_list maintaining nodes of mesh
+ */
+struct batadv_orig_node {
+ /** @orig: originator ethernet address */
+ u8 orig[ETH_ALEN];
+
+ /** @ifinfo_list: list for routers per outgoing interface */
+ struct hlist_head ifinfo_list;
+
+ /**
+ * @last_bonding_candidate: pointer to last ifinfo of last used router
+ */
+ struct batadv_orig_ifinfo *last_bonding_candidate;
+
+#ifdef CONFIG_BATMAN_ADV_DAT
+ /** @dat_addr: address of the orig node in the distributed hash */
+ batadv_dat_addr_t dat_addr;
+#endif
+
+ /** @last_seen: time when last packet from this node was received */
+ unsigned long last_seen;
+
+ /**
+ * @bcast_seqno_reset: time when the broadcast seqno window was reset
+ */
+ unsigned long bcast_seqno_reset;
+
+#ifdef CONFIG_BATMAN_ADV_MCAST
+ /**
+ * @mcast_handler_lock: synchronizes mcast-capability and -flag changes
+ */
+ spinlock_t mcast_handler_lock;
+
+ /** @mcast_flags: multicast flags announced by the orig node */
+ u8 mcast_flags;
+
+ /**
+ * @mcast_want_all_unsnoopables_node: a list node for the
+ * mcast.want_all_unsnoopables list
+ */
+ struct hlist_node mcast_want_all_unsnoopables_node;
+
+ /**
+ * @mcast_want_all_ipv4_node: a list node for the mcast.want_all_ipv4
+ * list
+ */
+ struct hlist_node mcast_want_all_ipv4_node;
+ /**
+ * @mcast_want_all_ipv6_node: a list node for the mcast.want_all_ipv6
+ * list
+ */
+ struct hlist_node mcast_want_all_ipv6_node;
+
+ /**
+ * @mcast_want_all_rtr4_node: a list node for the mcast.want_all_rtr4
+ * list
+ */
+ struct hlist_node mcast_want_all_rtr4_node;
+ /**
+ * @mcast_want_all_rtr6_node: a list node for the mcast.want_all_rtr6
+ * list
+ */
+ struct hlist_node mcast_want_all_rtr6_node;
+#endif
+
+ /** @capabilities: announced capabilities of this originator */
+ unsigned long capabilities;
+
+ /**
+ * @capa_initialized: bitfield to remember whether a capability was
+ * initialized
+ */
+ unsigned long capa_initialized;
+
+ /** @last_ttvn: last seen translation table version number */
+ atomic_t last_ttvn;
+
+ /** @tt_buff: last tt changeset this node received from the orig node */
+ unsigned char *tt_buff;
+
+ /**
+ * @tt_buff_len: length of the last tt changeset this node received
+ * from the orig node
+ */
+ s16 tt_buff_len;
+
+ /** @tt_buff_lock: lock that protects tt_buff and tt_buff_len */
+ spinlock_t tt_buff_lock;
+
+ /**
+ * @tt_lock: avoids concurrent read from and write to the table. Table
+ * update is made up of two operations (data structure update and
+ * metadata -CRC/TTVN-recalculation) and they have to be executed
+ * atomically in order to avoid another thread to read the
+ * table/metadata between those.
+ */
+ spinlock_t tt_lock;
+
+ /**
+ * @bcast_bits: bitfield containing the info which payload broadcast
+ * originated from this orig node this host already has seen (relative
+ * to last_bcast_seqno)
+ */
+ DECLARE_BITMAP(bcast_bits, BATADV_TQ_LOCAL_WINDOW_SIZE);
+
+ /**
+ * @last_bcast_seqno: last broadcast sequence number received by this
+ * host
+ */
+ u32 last_bcast_seqno;
+
+ /**
+ * @neigh_list: list of potential next hop neighbor towards this orig
+ * node
+ */
+ struct hlist_head neigh_list;
+
+ /**
+ * @neigh_list_lock: lock protecting neigh_list, ifinfo_list,
+ * last_bonding_candidate and router
+ */
+ spinlock_t neigh_list_lock;
+
+ /** @hash_entry: hlist node for &batadv_priv.orig_hash */
+ struct hlist_node hash_entry;
+
+ /** @bat_priv: pointer to mesh_iface this orig node belongs to */
+ struct batadv_priv *bat_priv;
+
+ /** @bcast_seqno_lock: lock protecting bcast_bits & last_bcast_seqno */
+ spinlock_t bcast_seqno_lock;
+
+ /** @refcount: number of contexts the object is used */
+ struct kref refcount;
+
+ /** @rcu: struct used for freeing in an RCU-safe manner */
+ struct rcu_head rcu;
+
+ /** @fragments: array with heads for fragment chains */
+ struct batadv_frag_table_entry fragments[BATADV_FRAG_BUFFER_COUNT];
+
+ /**
+ * @vlan_list: a list of orig_node_vlan structs, one per VLAN served by
+ * the originator represented by this object
+ */
+ struct hlist_head vlan_list;
+
+ /** @vlan_list_lock: lock protecting vlan_list */
+ spinlock_t vlan_list_lock;
+
+ /** @bat_iv: B.A.T.M.A.N. IV private structure */
+ struct batadv_orig_bat_iv bat_iv;
+};
+
+/**
+ * enum batadv_orig_capabilities - orig node capabilities
+ */
+enum batadv_orig_capabilities {
+ /**
+ * @BATADV_ORIG_CAPA_HAS_DAT: orig node has distributed arp table
+ * enabled
+ */
+ BATADV_ORIG_CAPA_HAS_DAT,
+
+ /** @BATADV_ORIG_CAPA_HAS_TT: orig node has tt capability */
+ BATADV_ORIG_CAPA_HAS_TT,
+
+ /**
+ * @BATADV_ORIG_CAPA_HAS_MCAST: orig node has some multicast capability
+ * (= orig node announces a tvlv of type BATADV_TVLV_MCAST)
+ */
+ BATADV_ORIG_CAPA_HAS_MCAST,
+};
+
+/**
+ * struct batadv_gw_node - structure for orig nodes announcing gw capabilities
+ */
+struct batadv_gw_node {
+ /** @list: list node for &batadv_priv_gw.list */
+ struct hlist_node list;
+
+ /** @orig_node: pointer to corresponding orig node */
+ struct batadv_orig_node *orig_node;
+
+ /** @bandwidth_down: advertised uplink download bandwidth */
+ u32 bandwidth_down;
+
+ /** @bandwidth_up: advertised uplink upload bandwidth */
+ u32 bandwidth_up;
+
+ /** @refcount: number of contexts the object is used */
+ struct kref refcount;
+
+ /** @rcu: struct used for freeing in an RCU-safe manner */
+ struct rcu_head rcu;
+};
+
+DECLARE_EWMA(throughput, 10, 8)
+
+/**
+ * struct batadv_hardif_neigh_node_bat_v - B.A.T.M.A.N. V private neighbor
+ * information
+ */
+struct batadv_hardif_neigh_node_bat_v {
+ /** @throughput: ewma link throughput towards this neighbor */
+ struct ewma_throughput throughput;
+
+ /** @elp_interval: time interval between two ELP transmissions */
+ u32 elp_interval;
+
+ /** @elp_latest_seqno: latest and best known ELP sequence number */
+ u32 elp_latest_seqno;
+
+ /**
+ * @last_unicast_tx: when the last unicast packet has been sent to this
+ * neighbor
+ */
+ unsigned long last_unicast_tx;
+};
+
+/**
+ * struct batadv_hardif_neigh_node - unique neighbor per hard-interface
+ */
+struct batadv_hardif_neigh_node {
+ /** @list: list node for &batadv_hard_iface.neigh_list */
+ struct hlist_node list;
+
+ /** @addr: the MAC address of the neighboring interface */
+ u8 addr[ETH_ALEN];
+
+ /**
+ * @orig: the address of the originator this neighbor node belongs to
+ */
+ u8 orig[ETH_ALEN];
+
+ /** @if_incoming: pointer to incoming hard-interface */
+ struct batadv_hard_iface *if_incoming;
+
+ /** @last_seen: when last packet via this neighbor was received */
+ unsigned long last_seen;
+
+#ifdef CONFIG_BATMAN_ADV_BATMAN_V
+ /** @bat_v: B.A.T.M.A.N. V private data */
+ struct batadv_hardif_neigh_node_bat_v bat_v;
+#endif
+
+ /** @refcount: number of contexts the object is used */
+ struct kref refcount;
+
+ /** @rcu: struct used for freeing in a RCU-safe manner */
+ struct rcu_head rcu;
+};
+
+/**
+ * struct batadv_neigh_node - structure for single hops neighbors
+ */
+struct batadv_neigh_node {
+ /** @list: list node for &batadv_orig_node.neigh_list */
+ struct hlist_node list;
+
+ /** @orig_node: pointer to corresponding orig_node */
+ struct batadv_orig_node *orig_node;
+
+ /** @addr: the MAC address of the neighboring interface */
+ u8 addr[ETH_ALEN];
+
+ /** @ifinfo_list: list for routing metrics per outgoing interface */
+ struct hlist_head ifinfo_list;
+
+ /** @ifinfo_lock: lock protecting ifinfo_list and its members */
+ spinlock_t ifinfo_lock;
+
+ /** @if_incoming: pointer to incoming hard-interface */
+ struct batadv_hard_iface *if_incoming;
+
+ /** @last_seen: when last packet via this neighbor was received */
+ unsigned long last_seen;
+
+ /** @hardif_neigh: hardif_neigh of this neighbor */
+ struct batadv_hardif_neigh_node *hardif_neigh;
+
+ /** @refcount: number of contexts the object is used */
+ struct kref refcount;
+
+ /** @rcu: struct used for freeing in an RCU-safe manner */
+ struct rcu_head rcu;
+};
+
+/**
+ * struct batadv_neigh_ifinfo_bat_iv - neighbor information per outgoing
+ * interface for B.A.T.M.A.N. IV
+ */
+struct batadv_neigh_ifinfo_bat_iv {
+ /** @tq_recv: ring buffer of received TQ values from this neigh node */
+ u8 tq_recv[BATADV_TQ_GLOBAL_WINDOW_SIZE];
+
+ /** @tq_index: ring buffer index */
+ u8 tq_index;
+
+ /**
+ * @tq_avg: averaged tq of all tq values in the ring buffer (tq_recv)
+ */
+ u8 tq_avg;
+
+ /**
+ * @real_bits: bitfield containing the number of OGMs received from this
+ * neigh node (relative to orig_node->last_real_seqno)
+ */
+ DECLARE_BITMAP(real_bits, BATADV_TQ_LOCAL_WINDOW_SIZE);
+
+ /** @real_packet_count: counted result of real_bits */
+ u8 real_packet_count;
+};
+
+/**
+ * struct batadv_neigh_ifinfo_bat_v - neighbor information per outgoing
+ * interface for B.A.T.M.A.N. V
+ */
+struct batadv_neigh_ifinfo_bat_v {
+ /**
+ * @throughput: last throughput metric received from originator via this
+ * neigh
+ */
+ u32 throughput;
+
+ /** @last_seqno: last sequence number known for this neighbor */
+ u32 last_seqno;
+};
+
+/**
+ * struct batadv_neigh_ifinfo - neighbor information per outgoing interface
+ */
+struct batadv_neigh_ifinfo {
+ /** @list: list node for &batadv_neigh_node.ifinfo_list */
+ struct hlist_node list;
+
+ /** @if_outgoing: pointer to outgoing hard-interface */
+ struct batadv_hard_iface *if_outgoing;
+
+ /** @bat_iv: B.A.T.M.A.N. IV private structure */
+ struct batadv_neigh_ifinfo_bat_iv bat_iv;
+
+#ifdef CONFIG_BATMAN_ADV_BATMAN_V
+ /** @bat_v: B.A.T.M.A.N. V private data */
+ struct batadv_neigh_ifinfo_bat_v bat_v;
+#endif
+
+ /** @last_ttl: last received ttl from this neigh node */
+ u8 last_ttl;
+
+ /** @refcount: number of contexts the object is used */
+ struct kref refcount;
+
+ /** @rcu: struct used for freeing in a RCU-safe manner */
+ struct rcu_head rcu;
+};
+
+#ifdef CONFIG_BATMAN_ADV_BLA
+
+/**
+ * struct batadv_bcast_duplist_entry - structure for LAN broadcast suppression
+ */
+struct batadv_bcast_duplist_entry {
+ /** @orig: mac address of orig node originating the broadcast */
+ u8 orig[ETH_ALEN];
+
+ /** @crc: crc32 checksum of broadcast payload */
+ u32 crc;
+
+ /** @entrytime: time when the broadcast packet was received */
+ unsigned long entrytime;
+};
+#endif
+
+/**
+ * enum batadv_counters - indices for traffic counters
+ */
+enum batadv_counters {
+ /** @BATADV_CNT_TX: transmitted payload traffic packet counter */
+ BATADV_CNT_TX,
+
+ /** @BATADV_CNT_TX_BYTES: transmitted payload traffic bytes counter */
+ BATADV_CNT_TX_BYTES,
+
+ /**
+ * @BATADV_CNT_TX_DROPPED: dropped transmission payload traffic packet
+ * counter
+ */
+ BATADV_CNT_TX_DROPPED,
+
+ /** @BATADV_CNT_RX: received payload traffic packet counter */
+ BATADV_CNT_RX,
+
+ /** @BATADV_CNT_RX_BYTES: received payload traffic bytes counter */
+ BATADV_CNT_RX_BYTES,
+
+ /** @BATADV_CNT_FORWARD: forwarded payload traffic packet counter */
+ BATADV_CNT_FORWARD,
+
+ /**
+ * @BATADV_CNT_FORWARD_BYTES: forwarded payload traffic bytes counter
+ */
+ BATADV_CNT_FORWARD_BYTES,
+
+ /**
+ * @BATADV_CNT_MGMT_TX: transmitted routing protocol traffic packet
+ * counter
+ */
+ BATADV_CNT_MGMT_TX,
+
+ /**
+ * @BATADV_CNT_MGMT_TX_BYTES: transmitted routing protocol traffic bytes
+ * counter
+ */
+ BATADV_CNT_MGMT_TX_BYTES,
+
+ /**
+ * @BATADV_CNT_MGMT_RX: received routing protocol traffic packet counter
+ */
+ BATADV_CNT_MGMT_RX,
+
+ /**
+ * @BATADV_CNT_MGMT_RX_BYTES: received routing protocol traffic bytes
+ * counter
+ */
+ BATADV_CNT_MGMT_RX_BYTES,
+
+ /** @BATADV_CNT_FRAG_TX: transmitted fragment traffic packet counter */
+ BATADV_CNT_FRAG_TX,
+
+ /**
+ * @BATADV_CNT_FRAG_TX_BYTES: transmitted fragment traffic bytes counter
+ */
+ BATADV_CNT_FRAG_TX_BYTES,
+
+ /** @BATADV_CNT_FRAG_RX: received fragment traffic packet counter */
+ BATADV_CNT_FRAG_RX,
+
+ /**
+ * @BATADV_CNT_FRAG_RX_BYTES: received fragment traffic bytes counter
+ */
+ BATADV_CNT_FRAG_RX_BYTES,
+
+ /** @BATADV_CNT_FRAG_FWD: forwarded fragment traffic packet counter */
+ BATADV_CNT_FRAG_FWD,
+
+ /**
+ * @BATADV_CNT_FRAG_FWD_BYTES: forwarded fragment traffic bytes counter
+ */
+ BATADV_CNT_FRAG_FWD_BYTES,
+
+ /**
+ * @BATADV_CNT_TT_REQUEST_TX: transmitted tt req traffic packet counter
+ */
+ BATADV_CNT_TT_REQUEST_TX,
+
+ /** @BATADV_CNT_TT_REQUEST_RX: received tt req traffic packet counter */
+ BATADV_CNT_TT_REQUEST_RX,
+
+ /**
+ * @BATADV_CNT_TT_RESPONSE_TX: transmitted tt resp traffic packet
+ * counter
+ */
+ BATADV_CNT_TT_RESPONSE_TX,
+
+ /**
+ * @BATADV_CNT_TT_RESPONSE_RX: received tt resp traffic packet counter
+ */
+ BATADV_CNT_TT_RESPONSE_RX,
+
+ /**
+ * @BATADV_CNT_TT_ROAM_ADV_TX: transmitted tt roam traffic packet
+ * counter
+ */
+ BATADV_CNT_TT_ROAM_ADV_TX,
+
+ /**
+ * @BATADV_CNT_TT_ROAM_ADV_RX: received tt roam traffic packet counter
+ */
+ BATADV_CNT_TT_ROAM_ADV_RX,
+
+#ifdef CONFIG_BATMAN_ADV_MCAST
+ /**
+ * @BATADV_CNT_MCAST_TX: transmitted batman-adv multicast packets
+ * counter
+ */
+ BATADV_CNT_MCAST_TX,
+
+ /**
+ * @BATADV_CNT_MCAST_TX_BYTES: transmitted batman-adv multicast packets
+ * bytes counter
+ */
+ BATADV_CNT_MCAST_TX_BYTES,
+
+ /**
+ * @BATADV_CNT_MCAST_TX_LOCAL: counter for multicast packets which
+ * were locally encapsulated and transmitted as batman-adv multicast
+ * packets
+ */
+ BATADV_CNT_MCAST_TX_LOCAL,
+
+ /**
+ * @BATADV_CNT_MCAST_TX_LOCAL_BYTES: bytes counter for multicast packets
+ * which were locally encapsulated and transmitted as batman-adv
+ * multicast packets
+ */
+ BATADV_CNT_MCAST_TX_LOCAL_BYTES,
+
+ /**
+ * @BATADV_CNT_MCAST_RX: received batman-adv multicast packet counter
+ */
+ BATADV_CNT_MCAST_RX,
+
+ /**
+ * @BATADV_CNT_MCAST_RX_BYTES: received batman-adv multicast packet
+ * bytes counter
+ */
+ BATADV_CNT_MCAST_RX_BYTES,
+
+ /**
+ * @BATADV_CNT_MCAST_RX_LOCAL: counter for received batman-adv multicast
+ * packets which were forwarded to the local mesh interface
+ */
+ BATADV_CNT_MCAST_RX_LOCAL,
+
+ /**
+ * @BATADV_CNT_MCAST_RX_LOCAL_BYTES: bytes counter for received
+ * batman-adv multicast packets which were forwarded to the local mesh
+ * interface
+ */
+ BATADV_CNT_MCAST_RX_LOCAL_BYTES,
+
+ /**
+ * @BATADV_CNT_MCAST_FWD: counter for received batman-adv multicast
+ * packets which were forwarded to other, neighboring nodes
+ */
+ BATADV_CNT_MCAST_FWD,
+
+ /**
+ * @BATADV_CNT_MCAST_FWD_BYTES: bytes counter for received batman-adv
+ * multicast packets which were forwarded to other, neighboring nodes
+ */
+ BATADV_CNT_MCAST_FWD_BYTES,
+#endif
+
+#ifdef CONFIG_BATMAN_ADV_DAT
+ /**
+ * @BATADV_CNT_DAT_GET_TX: transmitted dht GET traffic packet counter
+ */
+ BATADV_CNT_DAT_GET_TX,
+
+ /** @BATADV_CNT_DAT_GET_RX: received dht GET traffic packet counter */
+ BATADV_CNT_DAT_GET_RX,
+
+ /**
+ * @BATADV_CNT_DAT_PUT_TX: transmitted dht PUT traffic packet counter
+ */
+ BATADV_CNT_DAT_PUT_TX,
+
+ /** @BATADV_CNT_DAT_PUT_RX: received dht PUT traffic packet counter */
+ BATADV_CNT_DAT_PUT_RX,
+
+ /**
+ * @BATADV_CNT_DAT_CACHED_REPLY_TX: transmitted dat cache reply traffic
+ * packet counter
+ */
+ BATADV_CNT_DAT_CACHED_REPLY_TX,
+#endif
+
+ /** @BATADV_CNT_NUM: number of traffic counters */
+ BATADV_CNT_NUM,
+};
+
+/**
+ * struct batadv_priv_tt - per mesh interface translation table data
+ */
+struct batadv_priv_tt {
+ /** @vn: translation table version number */
+ atomic_t vn;
+
+ /**
+ * @ogm_append_cnt: counter of number of OGMs containing the local tt
+ * diff
+ */
+ atomic_t ogm_append_cnt;
+
+ /** @local_changes: changes registered in an originator interval */
+ size_t local_changes;
+
+ /**
+ * @changes_list: tracks tt local changes within an originator interval
+ */
+ struct list_head changes_list;
+
+ /** @local_hash: local translation table hash table */
+ struct batadv_hashtable *local_hash;
+
+ /** @global_hash: global translation table hash table */
+ struct batadv_hashtable *global_hash;
+
+ /** @req_list: list of pending & unanswered tt_requests */
+ struct hlist_head req_list;
+
+ /**
+ * @roam_list: list of the last roaming events of each client limiting
+ * the number of roaming events to avoid route flapping
+ */
+ struct list_head roam_list;
+
+ /** @changes_list_lock: lock protecting changes_list & local_changes */
+ spinlock_t changes_list_lock;
+
+ /** @req_list_lock: lock protecting req_list */
+ spinlock_t req_list_lock;
+
+ /** @roam_list_lock: lock protecting roam_list */
+ spinlock_t roam_list_lock;
+
+ /** @last_changeset: last tt changeset this host has generated */
+ unsigned char *last_changeset;
+
+ /**
+ * @last_changeset_len: length of last tt changeset this host has
+ * generated
+ */
+ s16 last_changeset_len;
+
+ /**
+ * @last_changeset_lock: lock protecting last_changeset &
+ * last_changeset_len
+ */
+ spinlock_t last_changeset_lock;
+
+ /**
+ * @commit_lock: prevents from executing a local TT commit while reading
+ * the local table. The local TT commit is made up of two operations
+ * (data structure update and metadata -CRC/TTVN- recalculation) and
+ * they have to be executed atomically in order to avoid another thread
+ * to read the table/metadata between those.
+ */
+ spinlock_t commit_lock;
+
+ /** @work: work queue callback item for translation table purging */
+ struct delayed_work work;
+};
+
+#ifdef CONFIG_BATMAN_ADV_BLA
+
+/**
+ * struct batadv_priv_bla - per mesh interface bridge loop avoidance data
+ */
+struct batadv_priv_bla {
+ /** @num_requests: number of bla requests in flight */
+ atomic_t num_requests;
+
+ /**
+ * @claim_hash: hash table containing mesh nodes this host has claimed
+ */
+ struct batadv_hashtable *claim_hash;
+
+ /**
+ * @backbone_hash: hash table containing all detected backbone gateways
+ */
+ struct batadv_hashtable *backbone_hash;
+
+ /** @loopdetect_addr: MAC address used for own loopdetection frames */
+ u8 loopdetect_addr[ETH_ALEN];
+
+ /**
+ * @loopdetect_lasttime: time when the loopdetection frames were sent
+ */
+ unsigned long loopdetect_lasttime;
+
+ /**
+ * @loopdetect_next: how many periods to wait for the next loopdetect
+ * process
+ */
+ atomic_t loopdetect_next;
+
+ /**
+ * @bcast_duplist: recently received broadcast packets array (for
+ * broadcast duplicate suppression)
+ */
+ struct batadv_bcast_duplist_entry bcast_duplist[BATADV_DUPLIST_SIZE];
+
+ /**
+ * @bcast_duplist_curr: index of last broadcast packet added to
+ * bcast_duplist
+ */
+ int bcast_duplist_curr;
+
+ /**
+ * @bcast_duplist_lock: lock protecting bcast_duplist &
+ * bcast_duplist_curr
+ */
+ spinlock_t bcast_duplist_lock;
+
+ /** @claim_dest: local claim data (e.g. claim group) */
+ struct batadv_bla_claim_dst claim_dest;
+
+ /** @work: work queue callback item for cleanups & bla announcements */
+ struct delayed_work work;
+};
+#endif
+
+/**
+ * struct batadv_priv_gw - per mesh interface gateway data
+ */
+struct batadv_priv_gw {
+ /** @gateway_list: list of available gateway nodes */
+ struct hlist_head gateway_list;
+
+ /** @list_lock: lock protecting gateway_list, curr_gw, generation */
+ spinlock_t list_lock;
+
+ /** @curr_gw: pointer to currently selected gateway node */
+ struct batadv_gw_node __rcu *curr_gw;
+
+ /** @generation: current (generation) sequence number */
+ unsigned int generation;
+
+ /**
+ * @mode: gateway operation: off, client or server (see batadv_gw_modes)
+ */
+ atomic_t mode;
+
+ /** @sel_class: gateway selection class (applies if gw_mode client) */
+ atomic_t sel_class;
+
+ /**
+ * @bandwidth_down: advertised uplink download bandwidth (if gw_mode
+ * server)
+ */
+ atomic_t bandwidth_down;
+
+ /**
+ * @bandwidth_up: advertised uplink upload bandwidth (if gw_mode server)
+ */
+ atomic_t bandwidth_up;
+
+ /** @reselect: bool indicating a gateway re-selection is in progress */
+ atomic_t reselect;
+};
+
+/**
+ * struct batadv_priv_tvlv - per mesh interface tvlv data
+ */
+struct batadv_priv_tvlv {
+ /**
+ * @container_list: list of registered tvlv containers to be sent with
+ * each OGM
+ */
+ struct hlist_head container_list;
+
+ /** @handler_list: list of the various tvlv content handlers */
+ struct hlist_head handler_list;
+
+ /** @container_list_lock: protects tvlv container list access */
+ spinlock_t container_list_lock;
+
+ /** @handler_list_lock: protects handler list access */
+ spinlock_t handler_list_lock;
+};
+
+#ifdef CONFIG_BATMAN_ADV_DAT
+
+/**
+ * struct batadv_priv_dat - per mesh interface DAT private data
+ */
+struct batadv_priv_dat {
+ /** @addr: node DAT address */
+ batadv_dat_addr_t addr;
+
+ /** @hash: hashtable representing the local ARP cache */
+ struct batadv_hashtable *hash;
+
+ /** @work: work queue callback item for cache purging */
+ struct delayed_work work;
+};
+#endif
+
+#ifdef CONFIG_BATMAN_ADV_MCAST
+/**
+ * struct batadv_mcast_querier_state - IGMP/MLD querier state when bridged
+ */
+struct batadv_mcast_querier_state {
+ /** @exists: whether a querier exists in the mesh */
+ unsigned char exists:1;
+
+ /**
+ * @shadowing: if a querier exists, whether it is potentially shadowing
+ * multicast listeners (i.e. querier is behind our own bridge segment)
+ */
+ unsigned char shadowing:1;
+};
+
+/**
+ * struct batadv_mcast_mla_flags - flags for the querier, bridge and tvlv state
+ */
+struct batadv_mcast_mla_flags {
+ /** @querier_ipv4: the current state of an IGMP querier in the mesh */
+ struct batadv_mcast_querier_state querier_ipv4;
+
+ /** @querier_ipv6: the current state of an MLD querier in the mesh */
+ struct batadv_mcast_querier_state querier_ipv6;
+
+ /** @enabled: whether the multicast tvlv is currently enabled */
+ unsigned char enabled:1;
+
+ /** @bridged: whether the mesh interface has a bridge on top */
+ unsigned char bridged:1;
+
+ /** @tvlv_flags: the flags we have last sent in our mcast tvlv */
+ u8 tvlv_flags;
+};
+
+/**
+ * struct batadv_priv_mcast - per mesh interface mcast data
+ */
+struct batadv_priv_mcast {
+ /**
+ * @mla_list: list of multicast addresses we are currently announcing
+ * via TT
+ */
+ struct hlist_head mla_list; /* see __batadv_mcast_mla_update() */
+
+ /**
+ * @want_all_unsnoopables_list: a list of orig_nodes wanting all
+ * unsnoopable multicast traffic
+ */
+ struct hlist_head want_all_unsnoopables_list;
+
+ /**
+ * @want_all_ipv4_list: a list of orig_nodes wanting all IPv4 multicast
+ * traffic
+ */
+ struct hlist_head want_all_ipv4_list;
+
+ /**
+ * @want_all_ipv6_list: a list of orig_nodes wanting all IPv6 multicast
+ * traffic
+ */
+ struct hlist_head want_all_ipv6_list;
+
+ /**
+ * @want_all_rtr4_list: a list of orig_nodes wanting all routable IPv4
+ * multicast traffic
+ */
+ struct hlist_head want_all_rtr4_list;
+
+ /**
+ * @want_all_rtr6_list: a list of orig_nodes wanting all routable IPv6
+ * multicast traffic
+ */
+ struct hlist_head want_all_rtr6_list;
+
+ /**
+ * @mla_flags: flags for the querier, bridge and tvlv state
+ */
+ struct batadv_mcast_mla_flags mla_flags;
+
+ /**
+ * @mla_lock: a lock protecting mla_list and mla_flags
+ */
+ spinlock_t mla_lock;
+
+ /**
+ * @num_want_all_unsnoopables: number of nodes wanting unsnoopable IP
+ * traffic
+ */
+ atomic_t num_want_all_unsnoopables;
+
+ /** @num_want_all_ipv4: counter for items in want_all_ipv4_list */
+ atomic_t num_want_all_ipv4;
+
+ /** @num_want_all_ipv6: counter for items in want_all_ipv6_list */
+ atomic_t num_want_all_ipv6;
+
+ /** @num_want_all_rtr4: counter for items in want_all_rtr4_list */
+ atomic_t num_want_all_rtr4;
+
+ /** @num_want_all_rtr6: counter for items in want_all_rtr6_list */
+ atomic_t num_want_all_rtr6;
+
+ /**
+ * @num_no_mc_ptype_capa: counter for number of nodes without the
+ * BATADV_MCAST_HAVE_MC_PTYPE_CAPA flag
+ */
+ atomic_t num_no_mc_ptype_capa;
+
+ /**
+ * @want_lists_lock: lock for protecting modifications to mcasts
+ * want_all_{unsnoopables,ipv4,ipv6}_list (traversals are rcu-locked)
+ */
+ spinlock_t want_lists_lock;
+
+ /** @work: work queue callback item for multicast TT and TVLV updates */
+ struct delayed_work work;
+};
+#endif
+
+/**
+ * struct batadv_tp_unacked - unacked packet meta-information
+ *
+ * This struct is supposed to represent a buffer unacked packet. However, since
+ * the purpose of the TP meter is to count the traffic only, there is no need to
+ * store the entire sk_buff, the starting offset and the length are enough
+ */
+struct batadv_tp_unacked {
+ /** @seqno: seqno of the unacked packet */
+ u32 seqno;
+
+ /** @len: length of the packet */
+ u16 len;
+
+ /** @list: list node for &batadv_tp_vars.unacked_list */
+ struct list_head list;
+};
+
+/**
+ * enum batadv_tp_meter_role - Modus in tp meter session
+ */
+enum batadv_tp_meter_role {
+ /** @BATADV_TP_RECEIVER: Initialized as receiver */
+ BATADV_TP_RECEIVER,
+
+ /** @BATADV_TP_SENDER: Initialized as sender */
+ BATADV_TP_SENDER
+};
+
+/**
+ * struct batadv_tp_vars - tp meter private variables per session
+ */
+struct batadv_tp_vars {
+ /** @list: list node for &bat_priv.tp_list */
+ struct hlist_node list;
+
+ /** @timer: timer for ack (receiver) and retry (sender) */
+ struct timer_list timer;
+
+ /** @bat_priv: pointer to the mesh object */
+ struct batadv_priv *bat_priv;
+
+ /** @start_time: start time in jiffies */
+ unsigned long start_time;
+
+ /** @other_end: mac address of remote */
+ u8 other_end[ETH_ALEN];
+
+ /** @role: receiver/sender modi */
+ enum batadv_tp_meter_role role;
+
+ /** @sending: sending binary semaphore: 1 if sending, 0 is not */
+ atomic_t sending;
+
+ /** @reason: reason for a stopped session */
+ enum batadv_tp_meter_reason reason;
+
+ /** @finish_work: work item for the finishing procedure */
+ struct delayed_work finish_work;
+
+ /** @test_length: test length in milliseconds */
+ u32 test_length;
+
+ /** @session: TP session identifier */
+ u8 session[2];
+
+ /** @icmp_uid: local ICMP "socket" index */
+ u8 icmp_uid;
+
+ /* sender variables */
+
+ /** @dec_cwnd: decimal part of the cwnd used during linear growth */
+ u16 dec_cwnd;
+
+ /** @cwnd: current size of the congestion window */
+ u32 cwnd;
+
+ /** @cwnd_lock: lock do protect @cwnd & @dec_cwnd */
+ spinlock_t cwnd_lock;
+
+ /**
+ * @ss_threshold: Slow Start threshold. Once cwnd exceeds this value the
+ * connection switches to the Congestion Avoidance state
+ */
+ u32 ss_threshold;
+
+ /** @last_acked: last acked byte */
+ atomic_t last_acked;
+
+ /** @last_sent: last sent byte, not yet acked */
+ u32 last_sent;
+
+ /** @tot_sent: amount of data sent/ACKed so far */
+ atomic64_t tot_sent;
+
+ /** @dup_acks: duplicate ACKs counter */
+ atomic_t dup_acks;
+
+ /** @fast_recovery: true if in Fast Recovery mode */
+ unsigned char fast_recovery:1;
+
+ /** @recover: last sent seqno when entering Fast Recovery */
+ u32 recover;
+
+ /** @rto: sender timeout */
+ u32 rto;
+
+ /** @srtt: smoothed RTT scaled by 2^3 */
+ u32 srtt;
+
+ /** @rttvar: RTT variation scaled by 2^2 */
+ u32 rttvar;
+
+ /**
+ * @more_bytes: waiting queue anchor when waiting for more ack/retry
+ * timeout
+ */
+ wait_queue_head_t more_bytes;
+
+ /** @prerandom_offset: offset inside the prerandom buffer */
+ u32 prerandom_offset;
+
+ /** @prerandom_lock: spinlock protecting access to prerandom_offset */
+ spinlock_t prerandom_lock;
+
+ /* receiver variables */
+
+ /** @last_recv: last in-order received packet */
+ u32 last_recv;
+
+ /** @unacked_list: list of unacked packets (meta-info only) */
+ struct list_head unacked_list;
+
+ /** @unacked_lock: protect unacked_list */
+ spinlock_t unacked_lock;
+
+ /** @last_recv_time: time (jiffies) a msg was received */
+ unsigned long last_recv_time;
+
+ /** @refcount: number of context where the object is used */
+ struct kref refcount;
+
+ /** @rcu: struct used for freeing in an RCU-safe manner */
+ struct rcu_head rcu;
+};
+
+/**
+ * struct batadv_meshif_vlan - per VLAN attributes set
+ */
+struct batadv_meshif_vlan {
+ /** @bat_priv: pointer to the mesh object */
+ struct batadv_priv *bat_priv;
+
+ /** @vid: VLAN identifier */
+ unsigned short vid;
+
+ /** @ap_isolation: AP isolation state */
+ atomic_t ap_isolation; /* boolean */
+
+ /** @tt: TT private attributes (VLAN specific) */
+ struct batadv_vlan_tt tt;
+
+ /** @list: list node for &bat_priv.meshif_vlan_list */
+ struct hlist_node list;
+
+ /**
+ * @refcount: number of context where this object is currently in use
+ */
+ struct kref refcount;
+
+ /** @rcu: struct used for freeing in a RCU-safe manner */
+ struct rcu_head rcu;
+};
+
+/**
+ * struct batadv_priv_bat_v - B.A.T.M.A.N. V per mesh-interface private data
+ */
+struct batadv_priv_bat_v {
+ /** @ogm_buff: buffer holding the OGM packet */
+ unsigned char *ogm_buff;
+
+ /** @ogm_buff_len: length of the OGM packet buffer */
+ int ogm_buff_len;
+
+ /** @ogm_seqno: OGM sequence number - used to identify each OGM */
+ atomic_t ogm_seqno;
+
+ /** @ogm_buff_mutex: lock protecting ogm_buff and ogm_buff_len */
+ struct mutex ogm_buff_mutex;
+
+ /** @ogm_wq: workqueue used to schedule OGM transmissions */
+ struct delayed_work ogm_wq;
+};
+
+/**
+ * struct batadv_priv - per mesh interface data
+ */
+struct batadv_priv {
+ /**
+ * @mesh_state: current status of the mesh
+ * (inactive/active/deactivating)
+ */
+ atomic_t mesh_state;
+
+ /** @mesh_iface: net device which holds this struct as private data */
+ struct net_device *mesh_iface;
+
+ /**
+ * @mtu_set_by_user: MTU was set once by user
+ * protected by rtnl_lock
+ */
+ int mtu_set_by_user;
+
+ /**
+ * @bat_counters: mesh internal traffic statistic counters (see
+ * batadv_counters)
+ */
+ u64 __percpu *bat_counters; /* Per cpu counters */
+
+ /**
+ * @aggregated_ogms: bool indicating whether OGM aggregation is enabled
+ */
+ atomic_t aggregated_ogms;
+
+ /** @bonding: bool indicating whether traffic bonding is enabled */
+ atomic_t bonding;
+
+ /**
+ * @fragmentation: bool indicating whether traffic fragmentation is
+ * enabled
+ */
+ atomic_t fragmentation;
+
+ /**
+ * @packet_size_max: max packet size that can be transmitted via
+ * multiple fragmented skbs or a single frame if fragmentation is
+ * disabled
+ */
+ atomic_t packet_size_max;
+
+ /**
+ * @frag_seqno: incremental counter to identify chains of egress
+ * fragments
+ */
+ atomic_t frag_seqno;
+
+#ifdef CONFIG_BATMAN_ADV_BLA
+ /**
+ * @bridge_loop_avoidance: bool indicating whether bridge loop
+ * avoidance is enabled
+ */
+ atomic_t bridge_loop_avoidance;
+#endif
+
+#ifdef CONFIG_BATMAN_ADV_DAT
+ /**
+ * @distributed_arp_table: bool indicating whether distributed ARP table
+ * is enabled
+ */
+ atomic_t distributed_arp_table;
+#endif
+
+#ifdef CONFIG_BATMAN_ADV_MCAST
+ /**
+ * @multicast_mode: Enable or disable multicast optimizations on this
+ * node's sender/originating side
+ */
+ atomic_t multicast_mode;
+
+ /**
+ * @multicast_fanout: Maximum number of packet copies to generate for a
+ * multicast-to-unicast conversion
+ */
+ atomic_t multicast_fanout;
+#endif
+
+ /** @orig_interval: OGM broadcast interval in milliseconds */
+ atomic_t orig_interval;
+
+ /**
+ * @hop_penalty: penalty which will be applied to an OGM's tq-field on
+ * every hop
+ */
+ atomic_t hop_penalty;
+
+#ifdef CONFIG_BATMAN_ADV_DEBUG
+ /** @log_level: configured log level (see batadv_dbg_level) */
+ atomic_t log_level;
+#endif
+
+ /**
+ * @isolation_mark: the skb->mark value used to match packets for AP
+ * isolation
+ */
+ u32 isolation_mark;
+
+ /**
+ * @isolation_mark_mask: bitmask identifying the bits in skb->mark to be
+ * used for the isolation mark
+ */
+ u32 isolation_mark_mask;
+
+ /** @bcast_seqno: last sent broadcast packet sequence number */
+ atomic_t bcast_seqno;
+
+ /**
+ * @bcast_queue_left: number of remaining buffered broadcast packet
+ * slots
+ */
+ atomic_t bcast_queue_left;
+
+ /** @batman_queue_left: number of remaining OGM packet slots */
+ atomic_t batman_queue_left;
+
+ /** @forw_bat_list: list of aggregated OGMs that will be forwarded */
+ struct hlist_head forw_bat_list;
+
+ /**
+ * @forw_bcast_list: list of broadcast packets that will be
+ * rebroadcasted
+ */
+ struct hlist_head forw_bcast_list;
+
+ /** @tp_list: list of tp sessions */
+ struct hlist_head tp_list;
+
+ /** @orig_hash: hash table containing mesh participants (orig nodes) */
+ struct batadv_hashtable *orig_hash;
+
+ /** @forw_bat_list_lock: lock protecting forw_bat_list */
+ spinlock_t forw_bat_list_lock;
+
+ /** @forw_bcast_list_lock: lock protecting forw_bcast_list */
+ spinlock_t forw_bcast_list_lock;
+
+ /** @tp_list_lock: spinlock protecting @tp_list */
+ spinlock_t tp_list_lock;
+
+ /** @tp_num: number of currently active tp sessions */
+ atomic_t tp_num;
+
+ /** @orig_work: work queue callback item for orig node purging */
+ struct delayed_work orig_work;
+
+ /**
+ * @primary_if: one of the hard-interfaces assigned to this mesh
+ * interface becomes the primary interface
+ */
+ struct batadv_hard_iface __rcu *primary_if; /* rcu protected pointer */
+
+ /** @algo_ops: routing algorithm used by this mesh interface */
+ struct batadv_algo_ops *algo_ops;
+
+ /**
+ * @meshif_vlan_list: a list of meshif_vlan structs, one per VLAN
+ * created on top of the mesh interface represented by this object
+ */
+ struct hlist_head meshif_vlan_list;
+
+ /** @meshif_vlan_list_lock: lock protecting meshif_vlan_list */
+ spinlock_t meshif_vlan_list_lock;
+
+#ifdef CONFIG_BATMAN_ADV_BLA
+ /** @bla: bridge loop avoidance data */
+ struct batadv_priv_bla bla;
+#endif
+
+ /** @gw: gateway data */
+ struct batadv_priv_gw gw;
+
+ /** @tt: translation table data */
+ struct batadv_priv_tt tt;
+
+ /** @tvlv: type-version-length-value data */
+ struct batadv_priv_tvlv tvlv;
+
+#ifdef CONFIG_BATMAN_ADV_DAT
+ /** @dat: distributed arp table data */
+ struct batadv_priv_dat dat;
+#endif
+
+#ifdef CONFIG_BATMAN_ADV_MCAST
+ /** @mcast: multicast data */
+ struct batadv_priv_mcast mcast;
+#endif
+
+#ifdef CONFIG_BATMAN_ADV_BATMAN_V
+ /** @bat_v: B.A.T.M.A.N. V per mesh-interface private data */
+ struct batadv_priv_bat_v bat_v;
+#endif
+};
+
+#ifdef CONFIG_BATMAN_ADV_BLA
+
+/**
+ * struct batadv_bla_backbone_gw - batman-adv gateway bridged into the LAN
+ */
+struct batadv_bla_backbone_gw {
+ /**
+ * @orig: originator address of backbone node (mac address of primary
+ * iface)
+ */
+ u8 orig[ETH_ALEN];
+
+ /** @vid: vlan id this gateway was detected on */
+ unsigned short vid;
+
+ /** @hash_entry: hlist node for &batadv_priv_bla.backbone_hash */
+ struct hlist_node hash_entry;
+
+ /** @bat_priv: pointer to mesh_iface this backbone gateway belongs to */
+ struct batadv_priv *bat_priv;
+
+ /** @lasttime: last time we heard of this backbone gw */
+ unsigned long lasttime;
+
+ /**
+ * @wait_periods: grace time for bridge forward delays and bla group
+ * forming at bootup phase - no bcast traffic is formwared until it has
+ * elapsed
+ */
+ atomic_t wait_periods;
+
+ /**
+ * @request_sent: if this bool is set to true we are out of sync with
+ * this backbone gateway - no bcast traffic is formwared until the
+ * situation was resolved
+ */
+ atomic_t request_sent;
+
+ /** @crc: crc16 checksum over all claims */
+ u16 crc;
+
+ /** @crc_lock: lock protecting crc */
+ spinlock_t crc_lock;
+
+ /** @report_work: work struct for reporting detected loops */
+ struct work_struct report_work;
+
+ /** @refcount: number of contexts the object is used */
+ struct kref refcount;
+
+ /** @rcu: struct used for freeing in an RCU-safe manner */
+ struct rcu_head rcu;
+};
+
+/**
+ * struct batadv_bla_claim - claimed non-mesh client structure
+ */
+struct batadv_bla_claim {
+ /** @addr: mac address of claimed non-mesh client */
+ u8 addr[ETH_ALEN];
+
+ /** @vid: vlan id this client was detected on */
+ unsigned short vid;
+
+ /** @backbone_gw: pointer to backbone gw claiming this client */
+ struct batadv_bla_backbone_gw *backbone_gw;
+
+ /** @backbone_lock: lock protecting backbone_gw pointer */
+ spinlock_t backbone_lock;
+
+ /** @lasttime: last time we heard of claim (locals only) */
+ unsigned long lasttime;
+
+ /** @hash_entry: hlist node for &batadv_priv_bla.claim_hash */
+ struct hlist_node hash_entry;
+
+ /** @refcount: number of contexts the object is used */
+ struct rcu_head rcu;
+
+ /** @rcu: struct used for freeing in an RCU-safe manner */
+ struct kref refcount;
+};
+#endif
+
+/**
+ * struct batadv_tt_common_entry - tt local & tt global common data
+ */
+struct batadv_tt_common_entry {
+ /** @addr: mac address of non-mesh client */
+ u8 addr[ETH_ALEN];
+
+ /** @vid: VLAN identifier */
+ unsigned short vid;
+
+ /**
+ * @hash_entry: hlist node for &batadv_priv_tt.local_hash or for
+ * &batadv_priv_tt.global_hash
+ */
+ struct hlist_node hash_entry;
+
+ /** @flags: various state handling flags (see batadv_tt_client_flags) */
+ u16 flags;
+
+ /** @added_at: timestamp used for purging stale tt common entries */
+ unsigned long added_at;
+
+ /** @refcount: number of contexts the object is used */
+ struct kref refcount;
+
+ /** @rcu: struct used for freeing in an RCU-safe manner */
+ struct rcu_head rcu;
+};
+
+/**
+ * struct batadv_tt_local_entry - translation table local entry data
+ */
+struct batadv_tt_local_entry {
+ /** @common: general translation table data */
+ struct batadv_tt_common_entry common;
+
+ /** @last_seen: timestamp used for purging stale tt local entries */
+ unsigned long last_seen;
+
+ /** @vlan: mesh-interface vlan of the entry */
+ struct batadv_meshif_vlan *vlan;
+};
+
+/**
+ * struct batadv_tt_global_entry - translation table global entry data
+ */
+struct batadv_tt_global_entry {
+ /** @common: general translation table data */
+ struct batadv_tt_common_entry common;
+
+ /** @orig_list: list of orig nodes announcing this non-mesh client */
+ struct hlist_head orig_list;
+
+ /** @orig_list_count: number of items in the orig_list */
+ atomic_t orig_list_count;
+
+ /** @list_lock: lock protecting orig_list */
+ spinlock_t list_lock;
+
+ /** @roam_at: time at which TT_GLOBAL_ROAM was set */
+ unsigned long roam_at;
+};
+
+/**
+ * struct batadv_tt_orig_list_entry - orig node announcing a non-mesh client
+ */
+struct batadv_tt_orig_list_entry {
+ /** @orig_node: pointer to orig node announcing this non-mesh client */
+ struct batadv_orig_node *orig_node;
+
+ /**
+ * @ttvn: translation table version number which added the non-mesh
+ * client
+ */
+ u8 ttvn;
+
+ /** @flags: per orig entry TT sync flags */
+ u8 flags;
+
+ /** @list: list node for &batadv_tt_global_entry.orig_list */
+ struct hlist_node list;
+
+ /** @refcount: number of contexts the object is used */
+ struct kref refcount;
+
+ /** @rcu: struct used for freeing in an RCU-safe manner */
+ struct rcu_head rcu;
+};
+
+/**
+ * struct batadv_tt_change_node - structure for tt changes occurred
+ */
+struct batadv_tt_change_node {
+ /** @list: list node for &batadv_priv_tt.changes_list */
+ struct list_head list;
+
+ /** @change: holds the actual translation table diff data */
+ struct batadv_tvlv_tt_change change;
+};
+
+/**
+ * struct batadv_tt_req_node - data to keep track of the tt requests in flight
+ */
+struct batadv_tt_req_node {
+ /**
+ * @addr: mac address of the originator this request was sent to
+ */
+ u8 addr[ETH_ALEN];
+
+ /** @issued_at: timestamp used for purging stale tt requests */
+ unsigned long issued_at;
+
+ /** @refcount: number of contexts the object is used by */
+ struct kref refcount;
+
+ /** @list: list node for &batadv_priv_tt.req_list */
+ struct hlist_node list;
+};
+
+/**
+ * struct batadv_tt_roam_node - roaming client data
+ */
+struct batadv_tt_roam_node {
+ /** @addr: mac address of the client in the roaming phase */
+ u8 addr[ETH_ALEN];
+
+ /**
+ * @counter: number of allowed roaming events per client within a single
+ * OGM interval (changes are committed with each OGM)
+ */
+ atomic_t counter;
+
+ /**
+ * @first_time: timestamp used for purging stale roaming node entries
+ */
+ unsigned long first_time;
+
+ /** @list: list node for &batadv_priv_tt.roam_list */
+ struct list_head list;
+};
+
+/**
+ * struct batadv_skb_cb - control buffer structure used to store private data
+ * relevant to batman-adv in the skb->cb buffer in skbs.
+ */
+struct batadv_skb_cb {
+ /** @num_bcasts: Counter for broadcast packet retransmissions */
+ unsigned char num_bcasts;
+};
+
+/**
+ * struct batadv_forw_packet - structure for bcast packets to be sent/forwarded
+ */
+struct batadv_forw_packet {
+ /**
+ * @list: list node for &batadv_priv.forw.bcast_list and
+ * &batadv_priv.forw.bat_list
+ */
+ struct hlist_node list;
+
+ /** @cleanup_list: list node for purging functions */
+ struct hlist_node cleanup_list;
+
+ /** @send_time: execution time for delayed_work (packet sending) */
+ unsigned long send_time;
+
+ /**
+ * @own: bool for locally generated packets (local OGMs are re-scheduled
+ * after sending)
+ */
+ u8 own;
+
+ /** @skb: bcast packet's skb buffer */
+ struct sk_buff *skb;
+
+ /** @packet_len: size of aggregated OGM packet inside the skb buffer */
+ u16 packet_len;
+
+ /** @direct_link_flags: direct link flags for aggregated OGM packets */
+ DECLARE_BITMAP(direct_link_flags, BATADV_MAX_AGGREGATION_PACKETS);
+
+ /** @num_packets: counter for aggregated OGMv1 packets */
+ u8 num_packets;
+
+ /** @delayed_work: work queue callback item for packet sending */
+ struct delayed_work delayed_work;
+
+ /**
+ * @if_incoming: pointer to incoming hard-iface or primary iface if
+ * locally generated packet
+ */
+ struct batadv_hard_iface *if_incoming;
+
+ /**
+ * @if_outgoing: packet where the packet should be sent to, or NULL if
+ * unspecified
+ */
+ struct batadv_hard_iface *if_outgoing;
+
+ /** @queue_left: The queue (counter) this packet was applied to */
+ atomic_t *queue_left;
+};
+
+/**
+ * struct batadv_algo_iface_ops - mesh algorithm callbacks (interface specific)
+ */
+struct batadv_algo_iface_ops {
+ /**
+ * @activate: start routing mechanisms when hard-interface is brought up
+ * (optional)
+ */
+ void (*activate)(struct batadv_hard_iface *hard_iface);
+
+ /** @enable: init routing info when hard-interface is enabled */
+ int (*enable)(struct batadv_hard_iface *hard_iface);
+
+ /** @enabled: notification when hard-interface was enabled (optional) */
+ void (*enabled)(struct batadv_hard_iface *hard_iface);
+
+ /** @disable: de-init routing info when hard-interface is disabled */
+ void (*disable)(struct batadv_hard_iface *hard_iface);
+
+ /**
+ * @update_mac: (re-)init mac addresses of the protocol information
+ * belonging to this hard-interface
+ */
+ void (*update_mac)(struct batadv_hard_iface *hard_iface);
+
+ /** @primary_set: called when primary interface is selected / changed */
+ void (*primary_set)(struct batadv_hard_iface *hard_iface);
+};
+
+/**
+ * struct batadv_algo_neigh_ops - mesh algorithm callbacks (neighbour specific)
+ */
+struct batadv_algo_neigh_ops {
+ /** @hardif_init: called on creation of single hop entry (optional) */
+ void (*hardif_init)(struct batadv_hardif_neigh_node *neigh);
+
+ /**
+ * @cmp: compare the metrics of two neighbors for their respective
+ * outgoing interfaces
+ */
+ int (*cmp)(struct batadv_neigh_node *neigh1,
+ struct batadv_hard_iface *if_outgoing1,
+ struct batadv_neigh_node *neigh2,
+ struct batadv_hard_iface *if_outgoing2);
+
+ /**
+ * @is_similar_or_better: check if neigh1 is equally similar or better
+ * than neigh2 for their respective outgoing interface from the metric
+ * prospective
+ */
+ bool (*is_similar_or_better)(struct batadv_neigh_node *neigh1,
+ struct batadv_hard_iface *if_outgoing1,
+ struct batadv_neigh_node *neigh2,
+ struct batadv_hard_iface *if_outgoing2);
+
+ /** @dump: dump neighbors to a netlink socket (optional) */
+ void (*dump)(struct sk_buff *msg, struct netlink_callback *cb,
+ struct batadv_priv *priv,
+ struct batadv_hard_iface *hard_iface);
+};
+
+/**
+ * struct batadv_algo_orig_ops - mesh algorithm callbacks (originator specific)
+ */
+struct batadv_algo_orig_ops {
+ /** @dump: dump originators to a netlink socket (optional) */
+ void (*dump)(struct sk_buff *msg, struct netlink_callback *cb,
+ struct batadv_priv *priv,
+ struct batadv_hard_iface *hard_iface);
+};
+
+/**
+ * struct batadv_algo_gw_ops - mesh algorithm callbacks (GW specific)
+ */
+struct batadv_algo_gw_ops {
+ /** @init_sel_class: initialize GW selection class (optional) */
+ void (*init_sel_class)(struct batadv_priv *bat_priv);
+
+ /**
+ * @sel_class_max: maximum allowed GW selection class
+ */
+ u32 sel_class_max;
+
+ /**
+ * @get_best_gw_node: select the best GW from the list of available
+ * nodes (optional)
+ */
+ struct batadv_gw_node *(*get_best_gw_node)
+ (struct batadv_priv *bat_priv);
+
+ /**
+ * @is_eligible: check if a newly discovered GW is a potential candidate
+ * for the election as best GW (optional)
+ */
+ bool (*is_eligible)(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *curr_gw_orig,
+ struct batadv_orig_node *orig_node);
+
+ /** @dump: dump gateways to a netlink socket (optional) */
+ void (*dump)(struct sk_buff *msg, struct netlink_callback *cb,
+ struct batadv_priv *priv);
+};
+
+/**
+ * struct batadv_algo_ops - mesh algorithm callbacks
+ */
+struct batadv_algo_ops {
+ /** @list: list node for the batadv_algo_list */
+ struct hlist_node list;
+
+ /** @name: name of the algorithm */
+ char *name;
+
+ /** @iface: callbacks related to interface handling */
+ struct batadv_algo_iface_ops iface;
+
+ /** @neigh: callbacks related to neighbors handling */
+ struct batadv_algo_neigh_ops neigh;
+
+ /** @orig: callbacks related to originators handling */
+ struct batadv_algo_orig_ops orig;
+
+ /** @gw: callbacks related to GW mode */
+ struct batadv_algo_gw_ops gw;
+};
+
+/**
+ * struct batadv_dat_entry - it is a single entry of batman-adv ARP backend. It
+ * is used to stored ARP entries needed for the global DAT cache
+ */
+struct batadv_dat_entry {
+ /** @ip: the IPv4 corresponding to this DAT/ARP entry */
+ __be32 ip;
+
+ /** @mac_addr: the MAC address associated to the stored IPv4 */
+ u8 mac_addr[ETH_ALEN];
+
+ /** @vid: the vlan ID associated to this entry */
+ unsigned short vid;
+
+ /**
+ * @last_update: time in jiffies when this entry was refreshed last time
+ */
+ unsigned long last_update;
+
+ /** @hash_entry: hlist node for &batadv_priv_dat.hash */
+ struct hlist_node hash_entry;
+
+ /** @refcount: number of contexts the object is used */
+ struct kref refcount;
+
+ /** @rcu: struct used for freeing in an RCU-safe manner */
+ struct rcu_head rcu;
+};
+
+/**
+ * struct batadv_hw_addr - a list entry for a MAC address
+ */
+struct batadv_hw_addr {
+ /** @list: list node for the linking of entries */
+ struct hlist_node list;
+
+ /** @addr: the MAC address of this list entry */
+ unsigned char addr[ETH_ALEN];
+};
+
+/**
+ * struct batadv_dat_candidate - candidate destination for DAT operations
+ */
+struct batadv_dat_candidate {
+ /**
+ * @type: the type of the selected candidate. It can one of the
+ * following:
+ * - BATADV_DAT_CANDIDATE_NOT_FOUND
+ * - BATADV_DAT_CANDIDATE_ORIG
+ */
+ int type;
+
+ /**
+ * @orig_node: if type is BATADV_DAT_CANDIDATE_ORIG this field points to
+ * the corresponding originator node structure
+ */
+ struct batadv_orig_node *orig_node;
+};
+
+/**
+ * struct batadv_tvlv_container - container for tvlv appended to OGMs
+ */
+struct batadv_tvlv_container {
+ /** @list: hlist node for &batadv_priv_tvlv.container_list */
+ struct hlist_node list;
+
+ /** @tvlv_hdr: tvlv header information needed to construct the tvlv */
+ struct batadv_tvlv_hdr tvlv_hdr;
+
+ /** @refcount: number of contexts the object is used */
+ struct kref refcount;
+};
+
+/**
+ * struct batadv_tvlv_handler - handler for specific tvlv type and version
+ */
+struct batadv_tvlv_handler {
+ /** @list: hlist node for &batadv_priv_tvlv.handler_list */
+ struct hlist_node list;
+
+ /**
+ * @ogm_handler: handler callback which is given the tvlv payload to
+ * process on incoming OGM packets
+ */
+ void (*ogm_handler)(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig,
+ u8 flags, void *tvlv_value, u16 tvlv_value_len);
+
+ /**
+ * @unicast_handler: handler callback which is given the tvlv payload to
+ * process on incoming unicast tvlv packets
+ */
+ int (*unicast_handler)(struct batadv_priv *bat_priv,
+ u8 *src, u8 *dst,
+ void *tvlv_value, u16 tvlv_value_len);
+
+ /**
+ * @mcast_handler: handler callback which is given the tvlv payload to
+ * process on incoming mcast packet
+ */
+ int (*mcast_handler)(struct batadv_priv *bat_priv, struct sk_buff *skb);
+
+ /** @type: tvlv type this handler feels responsible for */
+ u8 type;
+
+ /** @version: tvlv version this handler feels responsible for */
+ u8 version;
+
+ /** @flags: tvlv handler flags */
+ u8 flags;
+
+ /** @refcount: number of contexts the object is used */
+ struct kref refcount;
+
+ /** @rcu: struct used for freeing in an RCU-safe manner */
+ struct rcu_head rcu;
+};
+
+/**
+ * enum batadv_tvlv_handler_flags - tvlv handler flags definitions
+ */
+enum batadv_tvlv_handler_flags {
+ /**
+ * @BATADV_TVLV_HANDLER_OGM_CIFNOTFND: tvlv ogm processing function
+ * will call this handler even if its type was not found (with no data)
+ */
+ BATADV_TVLV_HANDLER_OGM_CIFNOTFND = BIT(1),
+
+ /**
+ * @BATADV_TVLV_HANDLER_OGM_CALLED: interval tvlv handling flag - the
+ * API marks a handler as being called, so it won't be called if the
+ * BATADV_TVLV_HANDLER_OGM_CIFNOTFND flag was set
+ */
+ BATADV_TVLV_HANDLER_OGM_CALLED = BIT(2),
+};
+
+#endif /* _NET_BATMAN_ADV_TYPES_H_ */
diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c
new file mode 100644
index 000000000000..2c21ae8abadc
--- /dev/null
+++ b/net/bluetooth/6lowpan.c
@@ -0,0 +1,1336 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ Copyright (c) 2013-2014 Intel Corp.
+
+*/
+
+#include <linux/if_arp.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/module.h>
+#include <linux/debugfs.h>
+
+#include <net/ipv6.h>
+#include <net/ip6_route.h>
+#include <net/addrconf.h>
+#include <net/netdev_lock.h>
+#include <net/pkt_sched.h>
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+#include <net/bluetooth/l2cap.h>
+
+#include <net/6lowpan.h> /* for the compression support */
+
+#define VERSION "0.1"
+
+static struct dentry *lowpan_enable_debugfs;
+static struct dentry *lowpan_control_debugfs;
+
+#define IFACE_NAME_TEMPLATE "bt%d"
+
+struct skb_cb {
+ struct in6_addr addr;
+ struct in6_addr gw;
+ struct l2cap_chan *chan;
+};
+#define lowpan_cb(skb) ((struct skb_cb *)((skb)->cb))
+
+/* The devices list contains those devices that we are acting
+ * as a proxy. The BT 6LoWPAN device is a virtual device that
+ * connects to the Bluetooth LE device. The real connection to
+ * BT device is done via l2cap layer. There exists one
+ * virtual device / one BT 6LoWPAN network (=hciX device).
+ * The list contains struct lowpan_dev elements.
+ */
+static LIST_HEAD(bt_6lowpan_devices);
+static DEFINE_SPINLOCK(devices_lock);
+
+static bool enable_6lowpan;
+
+/* We are listening incoming connections via this channel
+ */
+static struct l2cap_chan *listen_chan;
+static DEFINE_MUTEX(set_lock);
+
+enum {
+ LOWPAN_PEER_CLOSING,
+ LOWPAN_PEER_MAXBITS
+};
+
+struct lowpan_peer {
+ struct list_head list;
+ struct rcu_head rcu;
+ struct l2cap_chan *chan;
+
+ /* peer addresses in various formats */
+ unsigned char lladdr[ETH_ALEN];
+ struct in6_addr peer_addr;
+
+ DECLARE_BITMAP(flags, LOWPAN_PEER_MAXBITS);
+};
+
+struct lowpan_btle_dev {
+ struct list_head list;
+
+ struct hci_dev *hdev;
+ struct net_device *netdev;
+ struct list_head peers;
+ atomic_t peer_count; /* number of items in peers list */
+
+ struct work_struct delete_netdev;
+ struct delayed_work notify_peers;
+};
+
+static inline struct lowpan_btle_dev *
+lowpan_btle_dev(const struct net_device *netdev)
+{
+ return (struct lowpan_btle_dev *)lowpan_dev(netdev)->priv;
+}
+
+static inline void peer_add(struct lowpan_btle_dev *dev,
+ struct lowpan_peer *peer)
+{
+ list_add_rcu(&peer->list, &dev->peers);
+ atomic_inc(&dev->peer_count);
+}
+
+static inline bool peer_del(struct lowpan_btle_dev *dev,
+ struct lowpan_peer *peer)
+{
+ list_del_rcu(&peer->list);
+ kfree_rcu(peer, rcu);
+
+ module_put(THIS_MODULE);
+
+ if (atomic_dec_and_test(&dev->peer_count)) {
+ BT_DBG("last peer");
+ return true;
+ }
+
+ return false;
+}
+
+static inline struct lowpan_peer *
+__peer_lookup_chan(struct lowpan_btle_dev *dev, struct l2cap_chan *chan)
+{
+ struct lowpan_peer *peer;
+
+ list_for_each_entry_rcu(peer, &dev->peers, list) {
+ if (peer->chan == chan)
+ return peer;
+ }
+
+ return NULL;
+}
+
+static inline struct lowpan_peer *
+__peer_lookup_conn(struct lowpan_btle_dev *dev, struct l2cap_conn *conn)
+{
+ struct lowpan_peer *peer;
+
+ list_for_each_entry_rcu(peer, &dev->peers, list) {
+ if (peer->chan->conn == conn)
+ return peer;
+ }
+
+ return NULL;
+}
+
+static inline struct lowpan_peer *peer_lookup_dst(struct lowpan_btle_dev *dev,
+ struct in6_addr *daddr,
+ struct sk_buff *skb)
+{
+ struct rt6_info *rt = dst_rt6_info(skb_dst(skb));
+ int count = atomic_read(&dev->peer_count);
+ const struct in6_addr *nexthop;
+ struct lowpan_peer *peer;
+ struct neighbour *neigh;
+
+ BT_DBG("peers %d addr %pI6c rt %p", count, daddr, rt);
+
+ if (!rt) {
+ if (ipv6_addr_any(&lowpan_cb(skb)->gw)) {
+ /* There is neither route nor gateway,
+ * probably the destination is a direct peer.
+ */
+ nexthop = daddr;
+ } else {
+ /* There is a known gateway
+ */
+ nexthop = &lowpan_cb(skb)->gw;
+ }
+ } else {
+ nexthop = rt6_nexthop(rt, daddr);
+
+ /* We need to remember the address because it is needed
+ * by bt_xmit() when sending the packet. In bt_xmit(), the
+ * destination routing info is not set.
+ */
+ memcpy(&lowpan_cb(skb)->gw, nexthop, sizeof(struct in6_addr));
+ }
+
+ BT_DBG("gw %pI6c", nexthop);
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(peer, &dev->peers, list) {
+ BT_DBG("dst addr %pMR dst type %u ip %pI6c",
+ &peer->chan->dst, peer->chan->dst_type,
+ &peer->peer_addr);
+
+ if (!ipv6_addr_cmp(&peer->peer_addr, nexthop)) {
+ rcu_read_unlock();
+ return peer;
+ }
+ }
+
+ /* use the neighbour cache for matching addresses assigned by SLAAC */
+ neigh = __ipv6_neigh_lookup(dev->netdev, nexthop);
+ if (neigh) {
+ list_for_each_entry_rcu(peer, &dev->peers, list) {
+ if (!memcmp(neigh->ha, peer->lladdr, ETH_ALEN)) {
+ neigh_release(neigh);
+ rcu_read_unlock();
+ return peer;
+ }
+ }
+ neigh_release(neigh);
+ }
+
+ rcu_read_unlock();
+
+ return NULL;
+}
+
+static struct lowpan_peer *lookup_peer(struct l2cap_conn *conn)
+{
+ struct lowpan_btle_dev *entry;
+ struct lowpan_peer *peer = NULL;
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(entry, &bt_6lowpan_devices, list) {
+ peer = __peer_lookup_conn(entry, conn);
+ if (peer)
+ break;
+ }
+
+ rcu_read_unlock();
+
+ return peer;
+}
+
+static struct lowpan_btle_dev *lookup_dev(struct l2cap_conn *conn)
+{
+ struct lowpan_btle_dev *entry;
+ struct lowpan_btle_dev *dev = NULL;
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(entry, &bt_6lowpan_devices, list) {
+ if (conn->hcon->hdev == entry->hdev) {
+ dev = entry;
+ break;
+ }
+ }
+
+ rcu_read_unlock();
+
+ return dev;
+}
+
+static int give_skb_to_upper(struct sk_buff *skb, struct net_device *dev)
+{
+ struct sk_buff *skb_cp;
+
+ skb_cp = skb_copy(skb, GFP_ATOMIC);
+ if (!skb_cp)
+ return NET_RX_DROP;
+
+ return netif_rx(skb_cp);
+}
+
+static int iphc_decompress(struct sk_buff *skb, struct net_device *netdev,
+ struct lowpan_peer *peer)
+{
+ const u8 *saddr;
+
+ saddr = peer->lladdr;
+
+ return lowpan_header_decompress(skb, netdev, netdev->dev_addr, saddr);
+}
+
+static int recv_pkt(struct sk_buff *skb, struct net_device *dev,
+ struct lowpan_peer *peer)
+{
+ struct sk_buff *local_skb;
+ int ret;
+
+ if (!netif_running(dev))
+ goto drop;
+
+ if (dev->type != ARPHRD_6LOWPAN || !skb->len)
+ goto drop;
+
+ skb_reset_network_header(skb);
+
+ skb = skb_share_check(skb, GFP_ATOMIC);
+ if (!skb)
+ goto drop;
+
+ /* check that it's our buffer */
+ if (lowpan_is_ipv6(*skb_network_header(skb))) {
+ /* Pull off the 1-byte of 6lowpan header. */
+ skb_pull(skb, 1);
+
+ /* Copy the packet so that the IPv6 header is
+ * properly aligned.
+ */
+ local_skb = skb_copy_expand(skb, NET_SKB_PAD - 1,
+ skb_tailroom(skb), GFP_ATOMIC);
+ if (!local_skb)
+ goto drop;
+
+ local_skb->protocol = htons(ETH_P_IPV6);
+ local_skb->pkt_type = PACKET_HOST;
+ local_skb->dev = dev;
+
+ skb_reset_mac_header(local_skb);
+ skb_set_transport_header(local_skb, sizeof(struct ipv6hdr));
+
+ if (give_skb_to_upper(local_skb, dev) != NET_RX_SUCCESS) {
+ kfree_skb(local_skb);
+ goto drop;
+ }
+
+ dev->stats.rx_bytes += skb->len;
+ dev->stats.rx_packets++;
+
+ consume_skb(local_skb);
+ consume_skb(skb);
+ } else if (lowpan_is_iphc(*skb_network_header(skb))) {
+ local_skb = skb_clone(skb, GFP_ATOMIC);
+ if (!local_skb)
+ goto drop;
+
+ local_skb->dev = dev;
+
+ ret = iphc_decompress(local_skb, dev, peer);
+ if (ret < 0) {
+ BT_DBG("iphc_decompress failed: %d", ret);
+ kfree_skb(local_skb);
+ goto drop;
+ }
+
+ local_skb->protocol = htons(ETH_P_IPV6);
+ local_skb->pkt_type = PACKET_HOST;
+
+ if (give_skb_to_upper(local_skb, dev)
+ != NET_RX_SUCCESS) {
+ kfree_skb(local_skb);
+ goto drop;
+ }
+
+ dev->stats.rx_bytes += skb->len;
+ dev->stats.rx_packets++;
+
+ consume_skb(local_skb);
+ consume_skb(skb);
+ } else {
+ BT_DBG("unknown packet type");
+ goto drop;
+ }
+
+ return NET_RX_SUCCESS;
+
+drop:
+ dev->stats.rx_dropped++;
+ return NET_RX_DROP;
+}
+
+/* Packet from BT LE device */
+static int chan_recv_cb(struct l2cap_chan *chan, struct sk_buff *skb)
+{
+ struct lowpan_btle_dev *dev;
+ struct lowpan_peer *peer;
+ int err;
+
+ peer = lookup_peer(chan->conn);
+ if (!peer)
+ return -ENOENT;
+
+ dev = lookup_dev(chan->conn);
+ if (!dev || !dev->netdev)
+ return -ENOENT;
+
+ err = recv_pkt(skb, dev->netdev, peer);
+ if (err) {
+ BT_DBG("recv pkt %d", err);
+ err = -EAGAIN;
+ }
+
+ return err;
+}
+
+static int setup_header(struct sk_buff *skb, struct net_device *netdev,
+ bdaddr_t *peer_addr, u8 *peer_addr_type)
+{
+ struct in6_addr ipv6_daddr;
+ struct ipv6hdr *hdr;
+ struct lowpan_btle_dev *dev;
+ struct lowpan_peer *peer;
+ u8 *daddr;
+ int err, status = 0;
+
+ hdr = ipv6_hdr(skb);
+
+ dev = lowpan_btle_dev(netdev);
+
+ memcpy(&ipv6_daddr, &hdr->daddr, sizeof(ipv6_daddr));
+
+ if (ipv6_addr_is_multicast(&ipv6_daddr)) {
+ lowpan_cb(skb)->chan = NULL;
+ daddr = NULL;
+ } else {
+ BT_DBG("dest IP %pI6c", &ipv6_daddr);
+
+ /* The packet might be sent to 6lowpan interface
+ * because of routing (either via default route
+ * or user set route) so get peer according to
+ * the destination address.
+ */
+ peer = peer_lookup_dst(dev, &ipv6_daddr, skb);
+ if (!peer) {
+ BT_DBG("no such peer");
+ return -ENOENT;
+ }
+
+ daddr = peer->lladdr;
+ *peer_addr = peer->chan->dst;
+ *peer_addr_type = peer->chan->dst_type;
+ lowpan_cb(skb)->chan = peer->chan;
+
+ status = 1;
+ }
+
+ lowpan_header_compress(skb, netdev, daddr, dev->netdev->dev_addr);
+
+ err = dev_hard_header(skb, netdev, ETH_P_IPV6, NULL, NULL, 0);
+ if (err < 0)
+ return err;
+
+ return status;
+}
+
+static int header_create(struct sk_buff *skb, struct net_device *netdev,
+ unsigned short type, const void *_daddr,
+ const void *_saddr, unsigned int len)
+{
+ if (type != ETH_P_IPV6)
+ return -EINVAL;
+
+ return 0;
+}
+
+/* Packet to BT LE device */
+static int send_pkt(struct l2cap_chan *chan, struct sk_buff *skb,
+ struct net_device *netdev)
+{
+ struct msghdr msg;
+ struct kvec iv;
+ int err;
+
+ /* Remember the skb so that we can send EAGAIN to the caller if
+ * we run out of credits.
+ */
+ chan->data = skb;
+
+ iv.iov_base = skb->data;
+ iv.iov_len = skb->len;
+
+ memset(&msg, 0, sizeof(msg));
+ iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &iv, 1, skb->len);
+
+ err = l2cap_chan_send(chan, &msg, skb->len, NULL);
+ if (err > 0) {
+ netdev->stats.tx_bytes += err;
+ netdev->stats.tx_packets++;
+ return 0;
+ }
+
+ if (err < 0)
+ netdev->stats.tx_errors++;
+
+ return err;
+}
+
+static int send_mcast_pkt(struct sk_buff *skb, struct net_device *netdev)
+{
+ struct sk_buff *local_skb;
+ struct lowpan_btle_dev *entry;
+ int err = 0;
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(entry, &bt_6lowpan_devices, list) {
+ struct lowpan_peer *pentry;
+ struct lowpan_btle_dev *dev;
+
+ if (entry->netdev != netdev)
+ continue;
+
+ dev = lowpan_btle_dev(entry->netdev);
+
+ list_for_each_entry_rcu(pentry, &dev->peers, list) {
+ int ret;
+
+ local_skb = skb_clone(skb, GFP_ATOMIC);
+
+ BT_DBG("xmit %s to %pMR type %u IP %pI6c chan %p",
+ netdev->name,
+ &pentry->chan->dst, pentry->chan->dst_type,
+ &pentry->peer_addr, pentry->chan);
+ ret = send_pkt(pentry->chan, local_skb, netdev);
+ if (ret < 0)
+ err = ret;
+
+ kfree_skb(local_skb);
+ }
+ }
+
+ rcu_read_unlock();
+
+ return err;
+}
+
+static netdev_tx_t bt_xmit(struct sk_buff *skb, struct net_device *netdev)
+{
+ int err = 0;
+ bdaddr_t addr;
+ u8 addr_type;
+
+ /* We must take a copy of the skb before we modify/replace the ipv6
+ * header as the header could be used elsewhere
+ */
+ skb = skb_unshare(skb, GFP_ATOMIC);
+ if (!skb)
+ return NET_XMIT_DROP;
+
+ /* Return values from setup_header()
+ * <0 - error, packet is dropped
+ * 0 - this is a multicast packet
+ * 1 - this is unicast packet
+ */
+ err = setup_header(skb, netdev, &addr, &addr_type);
+ if (err < 0) {
+ kfree_skb(skb);
+ return NET_XMIT_DROP;
+ }
+
+ if (err) {
+ if (lowpan_cb(skb)->chan) {
+ BT_DBG("xmit %s to %pMR type %u IP %pI6c chan %p",
+ netdev->name, &addr, addr_type,
+ &lowpan_cb(skb)->addr, lowpan_cb(skb)->chan);
+ err = send_pkt(lowpan_cb(skb)->chan, skb, netdev);
+ } else {
+ err = -ENOENT;
+ }
+ } else {
+ /* We need to send the packet to every device behind this
+ * interface.
+ */
+ err = send_mcast_pkt(skb, netdev);
+ }
+
+ dev_kfree_skb(skb);
+
+ if (err)
+ BT_DBG("ERROR: xmit failed (%d)", err);
+
+ return err < 0 ? NET_XMIT_DROP : err;
+}
+
+static int bt_dev_init(struct net_device *dev)
+{
+ netdev_lockdep_set_classes(dev);
+
+ return 0;
+}
+
+static const struct net_device_ops netdev_ops = {
+ .ndo_init = bt_dev_init,
+ .ndo_start_xmit = bt_xmit,
+};
+
+static const struct header_ops header_ops = {
+ .create = header_create,
+};
+
+static void netdev_setup(struct net_device *dev)
+{
+ dev->hard_header_len = 0;
+ dev->needed_tailroom = 0;
+ dev->flags = IFF_RUNNING | IFF_MULTICAST;
+ dev->watchdog_timeo = 0;
+ dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
+
+ dev->netdev_ops = &netdev_ops;
+ dev->header_ops = &header_ops;
+ dev->needs_free_netdev = true;
+}
+
+static const struct device_type bt_type = {
+ .name = "bluetooth",
+};
+
+static void ifup(struct net_device *netdev)
+{
+ int err;
+
+ rtnl_lock();
+ err = dev_open(netdev, NULL);
+ if (err < 0)
+ BT_INFO("iface %s cannot be opened (%d)", netdev->name, err);
+ rtnl_unlock();
+}
+
+static void ifdown(struct net_device *netdev)
+{
+ rtnl_lock();
+ dev_close(netdev);
+ rtnl_unlock();
+}
+
+static void do_notify_peers(struct work_struct *work)
+{
+ struct lowpan_btle_dev *dev = container_of(work, struct lowpan_btle_dev,
+ notify_peers.work);
+
+ netdev_notify_peers(dev->netdev); /* send neighbour adv at startup */
+}
+
+static bool is_bt_6lowpan(struct hci_conn *hcon)
+{
+ if (hcon->type != LE_LINK)
+ return false;
+
+ if (!enable_6lowpan)
+ return false;
+
+ return true;
+}
+
+static struct l2cap_chan *chan_create(void)
+{
+ struct l2cap_chan *chan;
+
+ chan = l2cap_chan_create();
+ if (!chan)
+ return NULL;
+
+ l2cap_chan_set_defaults(chan);
+
+ chan->chan_type = L2CAP_CHAN_CONN_ORIENTED;
+ chan->mode = L2CAP_MODE_LE_FLOWCTL;
+ chan->imtu = 1280;
+
+ return chan;
+}
+
+static struct l2cap_chan *add_peer_chan(struct l2cap_chan *chan,
+ struct lowpan_btle_dev *dev,
+ bool new_netdev)
+{
+ struct lowpan_peer *peer;
+
+ peer = kzalloc(sizeof(*peer), GFP_ATOMIC);
+ if (!peer)
+ return NULL;
+
+ peer->chan = chan;
+
+ baswap((void *)peer->lladdr, &chan->dst);
+
+ lowpan_iphc_uncompress_eui48_lladdr(&peer->peer_addr, peer->lladdr);
+
+ spin_lock(&devices_lock);
+ INIT_LIST_HEAD(&peer->list);
+ peer_add(dev, peer);
+ spin_unlock(&devices_lock);
+
+ /* Notifying peers about us needs to be done without locks held */
+ if (new_netdev)
+ INIT_DELAYED_WORK(&dev->notify_peers, do_notify_peers);
+ schedule_delayed_work(&dev->notify_peers, msecs_to_jiffies(100));
+
+ return peer->chan;
+}
+
+static int setup_netdev(struct l2cap_chan *chan, struct lowpan_btle_dev **dev)
+{
+ struct net_device *netdev;
+ bdaddr_t addr;
+ int err;
+
+ netdev = alloc_netdev(LOWPAN_PRIV_SIZE(sizeof(struct lowpan_btle_dev)),
+ IFACE_NAME_TEMPLATE, NET_NAME_UNKNOWN,
+ netdev_setup);
+ if (!netdev)
+ return -ENOMEM;
+
+ netdev->addr_assign_type = NET_ADDR_PERM;
+ baswap(&addr, &chan->src);
+ __dev_addr_set(netdev, &addr, sizeof(addr));
+
+ netdev->netdev_ops = &netdev_ops;
+ SET_NETDEV_DEV(netdev, &chan->conn->hcon->hdev->dev);
+ SET_NETDEV_DEVTYPE(netdev, &bt_type);
+
+ *dev = lowpan_btle_dev(netdev);
+ (*dev)->netdev = netdev;
+ (*dev)->hdev = chan->conn->hcon->hdev;
+ INIT_LIST_HEAD(&(*dev)->peers);
+
+ spin_lock(&devices_lock);
+ INIT_LIST_HEAD(&(*dev)->list);
+ list_add_rcu(&(*dev)->list, &bt_6lowpan_devices);
+ spin_unlock(&devices_lock);
+
+ err = lowpan_register_netdev(netdev, LOWPAN_LLTYPE_BTLE);
+ if (err < 0) {
+ BT_INFO("register_netdev failed %d", err);
+ spin_lock(&devices_lock);
+ list_del_rcu(&(*dev)->list);
+ spin_unlock(&devices_lock);
+ free_netdev(netdev);
+ goto out;
+ }
+
+ BT_DBG("ifindex %d peer bdaddr %pMR type %d my addr %pMR type %d",
+ netdev->ifindex, &chan->dst, chan->dst_type,
+ &chan->src, chan->src_type);
+ set_bit(__LINK_STATE_PRESENT, &netdev->state);
+
+ return 0;
+
+out:
+ return err;
+}
+
+static inline void chan_ready_cb(struct l2cap_chan *chan)
+{
+ struct lowpan_btle_dev *dev;
+ bool new_netdev = false;
+
+ dev = lookup_dev(chan->conn);
+
+ BT_DBG("chan %p conn %p dev %p", chan, chan->conn, dev);
+
+ if (!dev) {
+ if (setup_netdev(chan, &dev) < 0) {
+ l2cap_chan_del(chan, -ENOENT);
+ return;
+ }
+ new_netdev = true;
+ }
+
+ if (!try_module_get(THIS_MODULE))
+ return;
+
+ add_peer_chan(chan, dev, new_netdev);
+ ifup(dev->netdev);
+}
+
+static inline struct l2cap_chan *chan_new_conn_cb(struct l2cap_chan *pchan)
+{
+ struct l2cap_chan *chan;
+
+ chan = chan_create();
+ if (!chan)
+ return NULL;
+
+ chan->ops = pchan->ops;
+
+ BT_DBG("chan %p pchan %p", chan, pchan);
+
+ return chan;
+}
+
+static void delete_netdev(struct work_struct *work)
+{
+ struct lowpan_btle_dev *entry = container_of(work,
+ struct lowpan_btle_dev,
+ delete_netdev);
+
+ lowpan_unregister_netdev(entry->netdev);
+
+ /* The entry pointer is deleted by the netdev destructor. */
+}
+
+static void chan_close_cb(struct l2cap_chan *chan)
+{
+ struct lowpan_btle_dev *entry;
+ struct lowpan_btle_dev *dev = NULL;
+ struct lowpan_peer *peer;
+ int err = -ENOENT;
+ bool last = false, remove = true;
+
+ BT_DBG("chan %p conn %p", chan, chan->conn);
+
+ if (chan->conn && chan->conn->hcon) {
+ if (!is_bt_6lowpan(chan->conn->hcon))
+ return;
+
+ /* If conn is set, then the netdev is also there and we should
+ * not remove it.
+ */
+ remove = false;
+ }
+
+ spin_lock(&devices_lock);
+
+ list_for_each_entry_rcu(entry, &bt_6lowpan_devices, list) {
+ dev = lowpan_btle_dev(entry->netdev);
+ peer = __peer_lookup_chan(dev, chan);
+ if (peer) {
+ last = peer_del(dev, peer);
+ err = 0;
+
+ BT_DBG("dev %p removing %speer %p", dev,
+ last ? "last " : "1 ", peer);
+ BT_DBG("chan %p orig refcnt %u", chan,
+ kref_read(&chan->kref));
+
+ l2cap_chan_put(chan);
+ break;
+ }
+ }
+
+ if (!err && last && dev && !atomic_read(&dev->peer_count)) {
+ spin_unlock(&devices_lock);
+
+ cancel_delayed_work_sync(&dev->notify_peers);
+
+ ifdown(dev->netdev);
+
+ if (remove) {
+ INIT_WORK(&entry->delete_netdev, delete_netdev);
+ schedule_work(&entry->delete_netdev);
+ }
+ } else {
+ spin_unlock(&devices_lock);
+ }
+}
+
+static void chan_state_change_cb(struct l2cap_chan *chan, int state, int err)
+{
+ BT_DBG("chan %p conn %p state %s err %d", chan, chan->conn,
+ state_to_string(state), err);
+}
+
+static struct sk_buff *chan_alloc_skb_cb(struct l2cap_chan *chan,
+ unsigned long hdr_len,
+ unsigned long len, int nb)
+{
+ struct sk_buff *skb;
+
+ /* Note that we must allocate using GFP_ATOMIC here as
+ * this function is called originally from netdev hard xmit
+ * function in atomic context.
+ */
+ skb = bt_skb_alloc(hdr_len + len, GFP_ATOMIC);
+ if (!skb)
+ return ERR_PTR(-ENOMEM);
+ return skb;
+}
+
+static void chan_suspend_cb(struct l2cap_chan *chan)
+{
+ struct lowpan_btle_dev *dev;
+
+ BT_DBG("chan %p suspend", chan);
+
+ dev = lookup_dev(chan->conn);
+ if (!dev || !dev->netdev)
+ return;
+
+ netif_stop_queue(dev->netdev);
+}
+
+static void chan_resume_cb(struct l2cap_chan *chan)
+{
+ struct lowpan_btle_dev *dev;
+
+ BT_DBG("chan %p resume", chan);
+
+ dev = lookup_dev(chan->conn);
+ if (!dev || !dev->netdev)
+ return;
+
+ netif_wake_queue(dev->netdev);
+}
+
+static long chan_get_sndtimeo_cb(struct l2cap_chan *chan)
+{
+ return L2CAP_CONN_TIMEOUT;
+}
+
+static const struct l2cap_ops bt_6lowpan_chan_ops = {
+ .name = "L2CAP 6LoWPAN channel",
+ .new_connection = chan_new_conn_cb,
+ .recv = chan_recv_cb,
+ .close = chan_close_cb,
+ .state_change = chan_state_change_cb,
+ .ready = chan_ready_cb,
+ .resume = chan_resume_cb,
+ .suspend = chan_suspend_cb,
+ .get_sndtimeo = chan_get_sndtimeo_cb,
+ .alloc_skb = chan_alloc_skb_cb,
+
+ .teardown = l2cap_chan_no_teardown,
+ .defer = l2cap_chan_no_defer,
+ .set_shutdown = l2cap_chan_no_set_shutdown,
+};
+
+static int bt_6lowpan_connect(bdaddr_t *addr, u8 dst_type)
+{
+ struct l2cap_chan *chan;
+ int err;
+
+ chan = chan_create();
+ if (!chan)
+ return -EINVAL;
+
+ chan->ops = &bt_6lowpan_chan_ops;
+
+ err = l2cap_chan_connect(chan, cpu_to_le16(L2CAP_PSM_IPSP), 0,
+ addr, dst_type, L2CAP_CONN_TIMEOUT);
+
+ BT_DBG("chan %p err %d", chan, err);
+ if (err < 0)
+ l2cap_chan_put(chan);
+
+ return err;
+}
+
+static int bt_6lowpan_disconnect(struct l2cap_conn *conn, u8 dst_type)
+{
+ struct lowpan_peer *peer;
+
+ BT_DBG("conn %p dst type %u", conn, dst_type);
+
+ peer = lookup_peer(conn);
+ if (!peer)
+ return -ENOENT;
+
+ BT_DBG("peer %p chan %p", peer, peer->chan);
+
+ l2cap_chan_lock(peer->chan);
+ l2cap_chan_close(peer->chan, ENOENT);
+ l2cap_chan_unlock(peer->chan);
+
+ return 0;
+}
+
+static struct l2cap_chan *bt_6lowpan_listen(void)
+{
+ bdaddr_t *addr = BDADDR_ANY;
+ struct l2cap_chan *chan;
+ int err;
+
+ if (!enable_6lowpan)
+ return NULL;
+
+ chan = chan_create();
+ if (!chan)
+ return NULL;
+
+ chan->ops = &bt_6lowpan_chan_ops;
+ chan->state = BT_LISTEN;
+ chan->src_type = BDADDR_LE_PUBLIC;
+
+ atomic_set(&chan->nesting, L2CAP_NESTING_PARENT);
+
+ BT_DBG("chan %p src type %u", chan, chan->src_type);
+
+ err = l2cap_add_psm(chan, addr, cpu_to_le16(L2CAP_PSM_IPSP));
+ if (err) {
+ l2cap_chan_put(chan);
+ BT_ERR("psm cannot be added err %d", err);
+ return NULL;
+ }
+
+ return chan;
+}
+
+static int get_l2cap_conn(char *buf, bdaddr_t *addr, u8 *addr_type,
+ struct l2cap_conn **conn, bool disconnect)
+{
+ struct hci_conn *hcon;
+ struct hci_dev *hdev;
+ int le_addr_type;
+ int n;
+
+ n = sscanf(buf, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx %hhu",
+ &addr->b[5], &addr->b[4], &addr->b[3],
+ &addr->b[2], &addr->b[1], &addr->b[0],
+ addr_type);
+
+ if (n < 7)
+ return -EINVAL;
+
+ if (disconnect) {
+ /* The "disconnect" debugfs command has used different address
+ * type constants than "connect" since 2015. Let's retain that
+ * for now even though it's obviously buggy...
+ */
+ *addr_type += 1;
+ }
+
+ switch (*addr_type) {
+ case BDADDR_LE_PUBLIC:
+ le_addr_type = ADDR_LE_DEV_PUBLIC;
+ break;
+ case BDADDR_LE_RANDOM:
+ le_addr_type = ADDR_LE_DEV_RANDOM;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ /* The LE_PUBLIC address type is ignored because of BDADDR_ANY */
+ hdev = hci_get_route(addr, BDADDR_ANY, BDADDR_LE_PUBLIC);
+ if (!hdev)
+ return -ENOENT;
+
+ hci_dev_lock(hdev);
+ hcon = hci_conn_hash_lookup_le(hdev, addr, le_addr_type);
+ hci_dev_unlock(hdev);
+ hci_dev_put(hdev);
+
+ if (!hcon)
+ return -ENOENT;
+
+ *conn = (struct l2cap_conn *)hcon->l2cap_data;
+
+ BT_DBG("conn %p dst %pMR type %u", *conn, &hcon->dst, hcon->dst_type);
+
+ return 0;
+}
+
+static void disconnect_all_peers(void)
+{
+ struct lowpan_btle_dev *entry;
+ struct lowpan_peer *peer;
+ int nchans;
+
+ /* l2cap_chan_close() cannot be called from RCU, and lock ordering
+ * chan->lock > devices_lock prevents taking write side lock, so copy
+ * then close.
+ */
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(entry, &bt_6lowpan_devices, list)
+ list_for_each_entry_rcu(peer, &entry->peers, list)
+ clear_bit(LOWPAN_PEER_CLOSING, peer->flags);
+ rcu_read_unlock();
+
+ do {
+ struct l2cap_chan *chans[32];
+ int i;
+
+ nchans = 0;
+
+ spin_lock(&devices_lock);
+
+ list_for_each_entry_rcu(entry, &bt_6lowpan_devices, list) {
+ list_for_each_entry_rcu(peer, &entry->peers, list) {
+ if (test_and_set_bit(LOWPAN_PEER_CLOSING,
+ peer->flags))
+ continue;
+
+ l2cap_chan_hold(peer->chan);
+ chans[nchans++] = peer->chan;
+
+ if (nchans >= ARRAY_SIZE(chans))
+ goto done;
+ }
+ }
+
+done:
+ spin_unlock(&devices_lock);
+
+ for (i = 0; i < nchans; ++i) {
+ l2cap_chan_lock(chans[i]);
+ l2cap_chan_close(chans[i], ENOENT);
+ l2cap_chan_unlock(chans[i]);
+ l2cap_chan_put(chans[i]);
+ }
+ } while (nchans);
+}
+
+struct set_enable {
+ struct work_struct work;
+ bool flag;
+};
+
+static void do_enable_set(struct work_struct *work)
+{
+ struct set_enable *set_enable = container_of(work,
+ struct set_enable, work);
+
+ if (!set_enable->flag || enable_6lowpan != set_enable->flag)
+ /* Disconnect existing connections if 6lowpan is
+ * disabled
+ */
+ disconnect_all_peers();
+
+ enable_6lowpan = set_enable->flag;
+
+ mutex_lock(&set_lock);
+ if (listen_chan) {
+ l2cap_chan_lock(listen_chan);
+ l2cap_chan_close(listen_chan, 0);
+ l2cap_chan_unlock(listen_chan);
+ l2cap_chan_put(listen_chan);
+ }
+
+ listen_chan = bt_6lowpan_listen();
+ mutex_unlock(&set_lock);
+
+ kfree(set_enable);
+}
+
+static int lowpan_enable_set(void *data, u64 val)
+{
+ struct set_enable *set_enable;
+
+ set_enable = kzalloc(sizeof(*set_enable), GFP_KERNEL);
+ if (!set_enable)
+ return -ENOMEM;
+
+ set_enable->flag = !!val;
+ INIT_WORK(&set_enable->work, do_enable_set);
+
+ schedule_work(&set_enable->work);
+
+ return 0;
+}
+
+static int lowpan_enable_get(void *data, u64 *val)
+{
+ *val = enable_6lowpan;
+ return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(lowpan_enable_fops, lowpan_enable_get,
+ lowpan_enable_set, "%llu\n");
+
+static ssize_t lowpan_control_write(struct file *fp,
+ const char __user *user_buffer,
+ size_t count,
+ loff_t *position)
+{
+ char buf[32];
+ size_t buf_size = min(count, sizeof(buf) - 1);
+ int ret;
+ bdaddr_t addr;
+ u8 addr_type;
+ struct l2cap_conn *conn = NULL;
+
+ if (copy_from_user(buf, user_buffer, buf_size))
+ return -EFAULT;
+
+ buf[buf_size] = '\0';
+
+ if (memcmp(buf, "connect ", 8) == 0) {
+ ret = get_l2cap_conn(&buf[8], &addr, &addr_type, &conn, false);
+ if (ret == -EINVAL)
+ return ret;
+
+ mutex_lock(&set_lock);
+ if (listen_chan) {
+ l2cap_chan_lock(listen_chan);
+ l2cap_chan_close(listen_chan, 0);
+ l2cap_chan_unlock(listen_chan);
+ l2cap_chan_put(listen_chan);
+ listen_chan = NULL;
+ }
+ mutex_unlock(&set_lock);
+
+ if (conn) {
+ struct lowpan_peer *peer;
+
+ if (!is_bt_6lowpan(conn->hcon))
+ return -EINVAL;
+
+ peer = lookup_peer(conn);
+ if (peer) {
+ BT_DBG("6LoWPAN connection already exists");
+ return -EALREADY;
+ }
+
+ BT_DBG("conn %p dst %pMR type %d user %u", conn,
+ &conn->hcon->dst, conn->hcon->dst_type,
+ addr_type);
+ }
+
+ ret = bt_6lowpan_connect(&addr, addr_type);
+ if (ret < 0)
+ return ret;
+
+ return count;
+ }
+
+ if (memcmp(buf, "disconnect ", 11) == 0) {
+ ret = get_l2cap_conn(&buf[11], &addr, &addr_type, &conn, true);
+ if (ret < 0)
+ return ret;
+
+ ret = bt_6lowpan_disconnect(conn, addr_type);
+ if (ret < 0)
+ return ret;
+
+ return count;
+ }
+
+ return count;
+}
+
+static int lowpan_control_show(struct seq_file *f, void *ptr)
+{
+ struct lowpan_btle_dev *entry;
+ struct lowpan_peer *peer;
+
+ spin_lock(&devices_lock);
+
+ list_for_each_entry(entry, &bt_6lowpan_devices, list) {
+ list_for_each_entry(peer, &entry->peers, list)
+ seq_printf(f, "%pMR (type %u)\n",
+ &peer->chan->dst, peer->chan->dst_type);
+ }
+
+ spin_unlock(&devices_lock);
+
+ return 0;
+}
+
+static int lowpan_control_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, lowpan_control_show, inode->i_private);
+}
+
+static const struct file_operations lowpan_control_fops = {
+ .open = lowpan_control_open,
+ .read = seq_read,
+ .write = lowpan_control_write,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static void disconnect_devices(void)
+{
+ struct lowpan_btle_dev *entry, *tmp, *new_dev;
+ struct list_head devices;
+
+ INIT_LIST_HEAD(&devices);
+
+ /* We make a separate list of devices because the unregister_netdev()
+ * will call device_event() which will also want to modify the same
+ * devices list.
+ */
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(entry, &bt_6lowpan_devices, list) {
+ new_dev = kmalloc(sizeof(*new_dev), GFP_ATOMIC);
+ if (!new_dev)
+ break;
+
+ new_dev->netdev = entry->netdev;
+ INIT_LIST_HEAD(&new_dev->list);
+
+ list_add_rcu(&new_dev->list, &devices);
+ }
+
+ rcu_read_unlock();
+
+ list_for_each_entry_safe(entry, tmp, &devices, list) {
+ ifdown(entry->netdev);
+ BT_DBG("Unregistering netdev %s %p",
+ entry->netdev->name, entry->netdev);
+ lowpan_unregister_netdev(entry->netdev);
+ kfree(entry);
+ }
+}
+
+static int device_event(struct notifier_block *unused,
+ unsigned long event, void *ptr)
+{
+ struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
+ struct lowpan_btle_dev *entry;
+
+ if (netdev->type != ARPHRD_6LOWPAN)
+ return NOTIFY_DONE;
+
+ switch (event) {
+ case NETDEV_UNREGISTER:
+ spin_lock(&devices_lock);
+ list_for_each_entry(entry, &bt_6lowpan_devices, list) {
+ if (entry->netdev == netdev) {
+ BT_DBG("Unregistered netdev %s %p",
+ netdev->name, netdev);
+ list_del(&entry->list);
+ break;
+ }
+ }
+ spin_unlock(&devices_lock);
+ break;
+ }
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block bt_6lowpan_dev_notifier = {
+ .notifier_call = device_event,
+};
+
+static int __init bt_6lowpan_init(void)
+{
+ lowpan_enable_debugfs = debugfs_create_file_unsafe("6lowpan_enable",
+ 0644, bt_debugfs,
+ NULL,
+ &lowpan_enable_fops);
+ lowpan_control_debugfs = debugfs_create_file("6lowpan_control", 0644,
+ bt_debugfs, NULL,
+ &lowpan_control_fops);
+
+ return register_netdevice_notifier(&bt_6lowpan_dev_notifier);
+}
+
+static void __exit bt_6lowpan_exit(void)
+{
+ debugfs_remove(lowpan_enable_debugfs);
+ debugfs_remove(lowpan_control_debugfs);
+
+ if (listen_chan) {
+ l2cap_chan_lock(listen_chan);
+ l2cap_chan_close(listen_chan, 0);
+ l2cap_chan_unlock(listen_chan);
+ l2cap_chan_put(listen_chan);
+ }
+
+ disconnect_devices();
+
+ unregister_netdevice_notifier(&bt_6lowpan_dev_notifier);
+}
+
+module_init(bt_6lowpan_init);
+module_exit(bt_6lowpan_exit);
+
+MODULE_AUTHOR("Jukka Rissanen <jukka.rissanen@linux.intel.com>");
+MODULE_DESCRIPTION("Bluetooth 6LoWPAN");
+MODULE_VERSION(VERSION);
+MODULE_LICENSE("GPL");
diff --git a/net/bluetooth/Kconfig b/net/bluetooth/Kconfig
index 6929490d095a..6b2b65a66700 100644
--- a/net/bluetooth/Kconfig
+++ b/net/bluetooth/Kconfig
@@ -1,22 +1,36 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# Bluetooth subsystem configuration
#
menuconfig BT
- depends on NET
tristate "Bluetooth subsystem support"
+ depends on !S390
+ depends on RFKILL || !RFKILL
+ select CRC16
+ select CRYPTO
+ select CRYPTO_SKCIPHER
+ select CRYPTO_LIB_AES
+ imply CRYPTO_AES
+ select CRYPTO_CMAC
+ select CRYPTO_ECB
+ select CRYPTO_SHA256
+ select CRYPTO_ECDH
help
Bluetooth is low-cost, low-power, short-range wireless technology.
It was designed as a replacement for cables and other short-range
technologies like IrDA. Bluetooth operates in personal area range
that typically extends up to 10 meters. More information about
- Bluetooth can be found at <http://www.bluetooth.com/>.
+ Bluetooth can be found at <https://www.bluetooth.com/>.
Linux Bluetooth subsystem consist of several layers:
- Bluetooth Core (HCI device and connection manager, scheduler)
+ Bluetooth Core
+ HCI device and connection manager, scheduler
+ SCO audio links
+ L2CAP (Logical Link Control and Adaptation Protocol)
+ SMP (Security Manager Protocol) on LE (Low Energy) links
+ ISO isochronous links
HCI Device drivers (Interface to the hardware)
- SCO Module (SCO audio links)
- L2CAP Module (Logical Link Control and Adaptation Protocol)
RFCOMM Module (RFCOMM Protocol)
BNEP Module (Bluetooth Network Encapsulation Protocol)
CMTP Module (CAPI Message Transport Protocol)
@@ -26,38 +40,116 @@ menuconfig BT
compile it as module (bluetooth).
To use Linux Bluetooth subsystem, you will need several user-space
- utilities like hciconfig and hcid. These utilities and updates to
- Bluetooth kernel modules are provided in the BlueZ packages.
- For more information, see <http://www.bluez.org/>.
+ utilities like hciconfig and bluetoothd. These utilities and updates
+ to Bluetooth kernel modules are provided in the BlueZ packages. For
+ more information, see <http://www.bluez.org/>.
-config BT_L2CAP
- tristate "L2CAP protocol support"
+config BT_BREDR
+ bool "Bluetooth Classic (BR/EDR) features"
depends on BT
+ default y
help
- L2CAP (Logical Link Control and Adaptation Protocol) provides
- connection oriented and connection-less data transport. L2CAP
- support is required for most Bluetooth applications.
+ Bluetooth Classic includes support for Basic Rate (BR)
+ available with Bluetooth version 1.0b or later and support
+ for Enhanced Data Rate (EDR) available with Bluetooth
+ version 2.0 or later.
- Say Y here to compile L2CAP support into the kernel or say M to
- compile it as module (l2cap).
+source "net/bluetooth/rfcomm/Kconfig"
+
+source "net/bluetooth/bnep/Kconfig"
+
+source "net/bluetooth/cmtp/Kconfig"
+
+source "net/bluetooth/hidp/Kconfig"
-config BT_SCO
- tristate "SCO links support"
+config BT_LE
+ bool "Bluetooth Low Energy (LE) features"
depends on BT
+ default y
help
- SCO link provides voice transport over Bluetooth. SCO support is
- required for voice applications like Headset and Audio.
+ Bluetooth Low Energy includes support low-energy physical
+ layer available with Bluetooth version 4.0 or later.
- Say Y here to compile SCO support into the kernel or say M to
- compile it as module (sco).
+config BT_LE_L2CAP_ECRED
+ bool "Bluetooth L2CAP Enhanced Credit Flow Control"
+ depends on BT_LE
+ default y
+ help
+ Bluetooth Low Energy L2CAP Enhanced Credit Flow Control available with
+ Bluetooth version 5.2 or later.
-source "net/bluetooth/rfcomm/Kconfig"
+ This can be overridden by passing bluetooth.enable_ecred=[1|0]
+ on the kernel commandline.
-source "net/bluetooth/bnep/Kconfig"
+config BT_6LOWPAN
+ tristate "Bluetooth 6LoWPAN support"
+ depends on BT_LE && 6LOWPAN
+ help
+ IPv6 compression over Bluetooth Low Energy.
-source "net/bluetooth/cmtp/Kconfig"
+config BT_LEDS
+ bool "Enable LED triggers"
+ depends on BT
+ depends on LEDS_CLASS
+ select LEDS_TRIGGERS
+ help
+ This option selects a few LED triggers for different
+ Bluetooth events.
-source "net/bluetooth/hidp/Kconfig"
+config BT_MSFTEXT
+ bool "Enable Microsoft extensions"
+ depends on BT
+ help
+ This options enables support for the Microsoft defined HCI
+ vendor extensions.
-source "drivers/bluetooth/Kconfig"
+config BT_AOSPEXT
+ bool "Enable Android Open Source Project extensions"
+ depends on BT
+ help
+ This options enables support for the Android Open Source
+ Project defined HCI vendor extensions.
+
+config BT_DEBUGFS
+ bool "Export Bluetooth internals in debugfs"
+ depends on BT && DEBUG_FS
+ default y
+ help
+ Provide extensive information about internal Bluetooth states
+ in debugfs.
+
+config BT_SELFTEST
+ bool "Bluetooth self testing support"
+ depends on BT && DEBUG_KERNEL
+ help
+ Run self tests when initializing the Bluetooth subsystem. This
+ is a developer option and can cause significant delay when booting
+ the system.
+ When the Bluetooth subsystem is built as module, then the test
+ cases are run first thing at module load time. When the Bluetooth
+ subsystem is compiled into the kernel image, then the test cases
+ are run late in the initcall hierarchy.
+
+config BT_SELFTEST_ECDH
+ bool "ECDH test cases"
+ depends on BT_LE && BT_SELFTEST
+ help
+ Run test cases for ECDH cryptographic functionality used by the
+ Bluetooth Low Energy Secure Connections feature.
+
+config BT_SELFTEST_SMP
+ bool "SMP test cases"
+ depends on BT_LE && BT_SELFTEST
+ help
+ Run test cases for SMP cryptographic functionality, including both
+ legacy SMP as well as the Secure Connections features.
+
+config BT_FEATURE_DEBUG
+ bool "Enable runtime option for debugging statements"
+ depends on BT && !DYNAMIC_DEBUG
+ help
+ This provides an option to enable/disable debugging statements
+ at runtime via the experimental features interface.
+
+source "drivers/bluetooth/Kconfig"
diff --git a/net/bluetooth/Makefile b/net/bluetooth/Makefile
index d1e433f7d673..a7eede7616d8 100644
--- a/net/bluetooth/Makefile
+++ b/net/bluetooth/Makefile
@@ -1,13 +1,28 @@
+# SPDX-License-Identifier: GPL-2.0
#
# Makefile for the Linux Bluetooth subsystem.
#
obj-$(CONFIG_BT) += bluetooth.o
-obj-$(CONFIG_BT_L2CAP) += l2cap.o
-obj-$(CONFIG_BT_SCO) += sco.o
obj-$(CONFIG_BT_RFCOMM) += rfcomm/
obj-$(CONFIG_BT_BNEP) += bnep/
obj-$(CONFIG_BT_CMTP) += cmtp/
obj-$(CONFIG_BT_HIDP) += hidp/
+obj-$(CONFIG_BT_6LOWPAN) += bluetooth_6lowpan.o
-bluetooth-objs := af_bluetooth.o hci_core.o hci_conn.o hci_event.o hci_sock.o hci_sysfs.o lib.o
+bluetooth_6lowpan-y := 6lowpan.o
+
+bluetooth-y := af_bluetooth.o hci_core.o hci_conn.o hci_event.o mgmt.o \
+ hci_sock.o hci_sysfs.o l2cap_core.o l2cap_sock.o smp.o lib.o \
+ ecdh_helper.o mgmt_util.o mgmt_config.o hci_codec.o eir.o hci_sync.o \
+ hci_drv.o
+
+bluetooth-$(CONFIG_DEV_COREDUMP) += coredump.o
+
+bluetooth-$(CONFIG_BT_BREDR) += sco.o
+bluetooth-$(CONFIG_BT_LE) += iso.o
+bluetooth-$(CONFIG_BT_LEDS) += leds.o
+bluetooth-$(CONFIG_BT_MSFTEXT) += msft.o
+bluetooth-$(CONFIG_BT_AOSPEXT) += aosp.o
+bluetooth-$(CONFIG_BT_DEBUGFS) += hci_debugfs.o
+bluetooth-$(CONFIG_BT_SELFTEST) += selftest.o
diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c
index 67df99e2e5c8..2b94e2077203 100644
--- a/net/bluetooth/af_bluetooth.c
+++ b/net/bluetooth/af_bluetooth.c
@@ -1,4 +1,4 @@
-/*
+/*
BlueZ - Bluetooth protocol stack for Linux
Copyright (C) 2000-2001 Qualcomm Incorporated
@@ -12,50 +12,77 @@
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
- CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
- WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
- COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+ ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+ COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
SOFTWARE IS DISCLAIMED.
*/
/* Bluetooth address family and sockets. */
#include <linux/module.h>
+#include <linux/debugfs.h>
+#include <linux/stringify.h>
+#include <linux/sched/signal.h>
-#include <linux/types.h>
-#include <linux/list.h>
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/skbuff.h>
-#include <linux/init.h>
-#include <linux/poll.h>
-#include <net/sock.h>
-
-#if defined(CONFIG_KMOD)
-#include <linux/kmod.h>
-#endif
+#include <asm/ioctls.h>
#include <net/bluetooth/bluetooth.h>
+#include <linux/proc_fs.h>
-#ifndef CONFIG_BT_SOCK_DEBUG
-#undef BT_DBG
-#define BT_DBG(D...)
-#endif
+#include <linux/ethtool.h>
+#include <linux/sockios.h>
-#define VERSION "2.11"
+#include "leds.h"
+#include "selftest.h"
/* Bluetooth sockets */
-#define BT_MAX_PROTO 8
-static struct net_proto_family *bt_proto[BT_MAX_PROTO];
+#define BT_MAX_PROTO (BTPROTO_LAST + 1)
+static const struct net_proto_family *bt_proto[BT_MAX_PROTO];
static DEFINE_RWLOCK(bt_proto_lock);
-int bt_sock_register(int proto, struct net_proto_family *ops)
+static struct lock_class_key bt_lock_key[BT_MAX_PROTO];
+static const char *const bt_key_strings[BT_MAX_PROTO] = {
+ "sk_lock-AF_BLUETOOTH-BTPROTO_L2CAP",
+ "sk_lock-AF_BLUETOOTH-BTPROTO_HCI",
+ "sk_lock-AF_BLUETOOTH-BTPROTO_SCO",
+ "sk_lock-AF_BLUETOOTH-BTPROTO_RFCOMM",
+ "sk_lock-AF_BLUETOOTH-BTPROTO_BNEP",
+ "sk_lock-AF_BLUETOOTH-BTPROTO_CMTP",
+ "sk_lock-AF_BLUETOOTH-BTPROTO_HIDP",
+ "sk_lock-AF_BLUETOOTH-BTPROTO_AVDTP",
+ "sk_lock-AF_BLUETOOTH-BTPROTO_ISO",
+};
+
+static struct lock_class_key bt_slock_key[BT_MAX_PROTO];
+static const char *const bt_slock_key_strings[BT_MAX_PROTO] = {
+ "slock-AF_BLUETOOTH-BTPROTO_L2CAP",
+ "slock-AF_BLUETOOTH-BTPROTO_HCI",
+ "slock-AF_BLUETOOTH-BTPROTO_SCO",
+ "slock-AF_BLUETOOTH-BTPROTO_RFCOMM",
+ "slock-AF_BLUETOOTH-BTPROTO_BNEP",
+ "slock-AF_BLUETOOTH-BTPROTO_CMTP",
+ "slock-AF_BLUETOOTH-BTPROTO_HIDP",
+ "slock-AF_BLUETOOTH-BTPROTO_AVDTP",
+ "slock-AF_BLUETOOTH-BTPROTO_ISO",
+};
+
+void bt_sock_reclassify_lock(struct sock *sk, int proto)
+{
+ BUG_ON(!sk);
+ BUG_ON(!sock_allow_reclassification(sk));
+
+ sock_lock_init_class_and_name(sk,
+ bt_slock_key_strings[proto], &bt_slock_key[proto],
+ bt_key_strings[proto], &bt_lock_key[proto]);
+}
+EXPORT_SYMBOL(bt_sock_reclassify_lock);
+
+int bt_sock_register(int proto, const struct net_proto_family *ops)
{
int err = 0;
@@ -75,86 +102,162 @@ int bt_sock_register(int proto, struct net_proto_family *ops)
}
EXPORT_SYMBOL(bt_sock_register);
-int bt_sock_unregister(int proto)
+void bt_sock_unregister(int proto)
{
- int err = 0;
-
if (proto < 0 || proto >= BT_MAX_PROTO)
- return -EINVAL;
+ return;
write_lock(&bt_proto_lock);
-
- if (!bt_proto[proto])
- err = -ENOENT;
- else
- bt_proto[proto] = NULL;
-
+ bt_proto[proto] = NULL;
write_unlock(&bt_proto_lock);
-
- return err;
}
EXPORT_SYMBOL(bt_sock_unregister);
-static int bt_sock_create(struct socket *sock, int proto)
+static int bt_sock_create(struct net *net, struct socket *sock, int proto,
+ int kern)
{
int err;
+ if (net != &init_net)
+ return -EAFNOSUPPORT;
+
if (proto < 0 || proto >= BT_MAX_PROTO)
return -EINVAL;
-#if defined(CONFIG_KMOD)
- if (!bt_proto[proto]) {
+ if (!bt_proto[proto])
request_module("bt-proto-%d", proto);
- }
-#endif
err = -EPROTONOSUPPORT;
read_lock(&bt_proto_lock);
if (bt_proto[proto] && try_module_get(bt_proto[proto]->owner)) {
- err = bt_proto[proto]->create(sock, proto);
+ err = bt_proto[proto]->create(net, sock, proto, kern);
+ if (!err)
+ bt_sock_reclassify_lock(sock->sk, proto);
module_put(bt_proto[proto]->owner);
}
read_unlock(&bt_proto_lock);
- return err;
+ return err;
+}
+
+struct sock *bt_sock_alloc(struct net *net, struct socket *sock,
+ struct proto *prot, int proto, gfp_t prio, int kern)
+{
+ struct sock *sk;
+
+ sk = sk_alloc(net, PF_BLUETOOTH, prio, prot, kern);
+ if (!sk)
+ return NULL;
+
+ sock_init_data(sock, sk);
+ INIT_LIST_HEAD(&bt_sk(sk)->accept_q);
+
+ sock_reset_flag(sk, SOCK_ZAPPED);
+
+ sk->sk_protocol = proto;
+ sk->sk_state = BT_OPEN;
+
+ /* Init peer information so it can be properly monitored */
+ if (!kern) {
+ spin_lock(&sk->sk_peer_lock);
+ sk->sk_peer_pid = get_pid(task_tgid(current));
+ sk->sk_peer_cred = get_current_cred();
+ spin_unlock(&sk->sk_peer_lock);
+ }
+
+ return sk;
}
+EXPORT_SYMBOL(bt_sock_alloc);
void bt_sock_link(struct bt_sock_list *l, struct sock *sk)
{
- write_lock_bh(&l->lock);
+ write_lock(&l->lock);
sk_add_node(sk, &l->head);
- write_unlock_bh(&l->lock);
+ write_unlock(&l->lock);
}
EXPORT_SYMBOL(bt_sock_link);
void bt_sock_unlink(struct bt_sock_list *l, struct sock *sk)
{
- write_lock_bh(&l->lock);
+ write_lock(&l->lock);
sk_del_node_init(sk);
- write_unlock_bh(&l->lock);
+ write_unlock(&l->lock);
}
EXPORT_SYMBOL(bt_sock_unlink);
-void bt_accept_enqueue(struct sock *parent, struct sock *sk)
+bool bt_sock_linked(struct bt_sock_list *l, struct sock *s)
+{
+ struct sock *sk;
+
+ if (!l || !s)
+ return false;
+
+ read_lock(&l->lock);
+
+ sk_for_each(sk, &l->head) {
+ if (s == sk) {
+ read_unlock(&l->lock);
+ return true;
+ }
+ }
+
+ read_unlock(&l->lock);
+
+ return false;
+}
+EXPORT_SYMBOL(bt_sock_linked);
+
+void bt_accept_enqueue(struct sock *parent, struct sock *sk, bool bh)
{
+ const struct cred *old_cred;
+ struct pid *old_pid;
+
BT_DBG("parent %p, sk %p", parent, sk);
sock_hold(sk);
+
+ if (bh)
+ bh_lock_sock_nested(sk);
+ else
+ lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
+
list_add_tail(&bt_sk(sk)->accept_q, &bt_sk(parent)->accept_q);
bt_sk(sk)->parent = parent;
- parent->sk_ack_backlog++;
+
+ /* Copy credentials from parent since for incoming connections the
+ * socket is allocated by the kernel.
+ */
+ spin_lock(&sk->sk_peer_lock);
+ old_pid = sk->sk_peer_pid;
+ old_cred = sk->sk_peer_cred;
+ sk->sk_peer_pid = get_pid(parent->sk_peer_pid);
+ sk->sk_peer_cred = get_cred(parent->sk_peer_cred);
+ spin_unlock(&sk->sk_peer_lock);
+
+ put_pid(old_pid);
+ put_cred(old_cred);
+
+ if (bh)
+ bh_unlock_sock(sk);
+ else
+ release_sock(sk);
+
+ sk_acceptq_added(parent);
}
EXPORT_SYMBOL(bt_accept_enqueue);
+/* Calling function must hold the sk lock.
+ * bt_sk(sk)->parent must be non-NULL meaning sk is in the parent list.
+ */
void bt_accept_unlink(struct sock *sk)
{
BT_DBG("sk %p state %d", sk, sk->sk_state);
list_del_init(&bt_sk(sk)->accept_q);
- bt_sk(sk)->parent->sk_ack_backlog--;
+ sk_acceptq_removed(bt_sk(sk)->parent);
bt_sk(sk)->parent = NULL;
sock_put(sk);
}
@@ -162,130 +265,440 @@ EXPORT_SYMBOL(bt_accept_unlink);
struct sock *bt_accept_dequeue(struct sock *parent, struct socket *newsock)
{
- struct list_head *p, *n;
+ struct bt_sock *s, *n;
struct sock *sk;
BT_DBG("parent %p", parent);
- list_for_each_safe(p, n, &bt_sk(parent)->accept_q) {
- sk = (struct sock *) list_entry(p, struct bt_sock, accept_q);
+restart:
+ list_for_each_entry_safe(s, n, &bt_sk(parent)->accept_q, accept_q) {
+ sk = (struct sock *)s;
+ /* Prevent early freeing of sk due to unlink and sock_kill */
+ sock_hold(sk);
lock_sock(sk);
+ /* Check sk has not already been unlinked via
+ * bt_accept_unlink() due to serialisation caused by sk locking
+ */
+ if (!bt_sk(sk)->parent) {
+ BT_DBG("sk %p, already unlinked", sk);
+ release_sock(sk);
+ sock_put(sk);
+
+ /* Restart the loop as sk is no longer in the list
+ * and also avoid a potential infinite loop because
+ * list_for_each_entry_safe() is not thread safe.
+ */
+ goto restart;
+ }
+
+ /* sk is safely in the parent list so reduce reference count */
+ sock_put(sk);
+
/* FIXME: Is this check still needed */
if (sk->sk_state == BT_CLOSED) {
- release_sock(sk);
bt_accept_unlink(sk);
+ release_sock(sk);
continue;
}
- if (sk->sk_state == BT_CONNECTED || !newsock) {
+ if (sk->sk_state == BT_CONNECTED || !newsock ||
+ test_bit(BT_SK_DEFER_SETUP, &bt_sk(parent)->flags)) {
bt_accept_unlink(sk);
if (newsock)
sock_graft(sk, newsock);
+
release_sock(sk);
return sk;
}
release_sock(sk);
}
+
return NULL;
}
EXPORT_SYMBOL(bt_accept_dequeue);
-int bt_sock_recvmsg(struct kiocb *iocb, struct socket *sock,
- struct msghdr *msg, size_t len, int flags)
+int bt_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
+ int flags)
{
- int noblock = flags & MSG_DONTWAIT;
struct sock *sk = sock->sk;
struct sk_buff *skb;
size_t copied;
+ size_t skblen;
int err;
- BT_DBG("sock %p sk %p len %d", sock, sk, len);
+ BT_DBG("sock %p sk %p len %zu", sock, sk, len);
- if (flags & (MSG_OOB))
+ if (flags & MSG_OOB)
return -EOPNOTSUPP;
- if (!(skb = skb_recv_datagram(sk, flags, noblock, &err))) {
+ skb = skb_recv_datagram(sk, flags, &err);
+ if (!skb) {
if (sk->sk_shutdown & RCV_SHUTDOWN)
- return 0;
+ err = 0;
+
return err;
}
- msg->msg_namelen = 0;
-
+ skblen = skb->len;
copied = skb->len;
if (len < copied) {
msg->msg_flags |= MSG_TRUNC;
copied = len;
}
- skb->h.raw = skb->data;
- err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+ skb_reset_transport_header(skb);
+ err = skb_copy_datagram_msg(skb, 0, msg, copied);
+ if (err == 0) {
+ sock_recv_cmsgs(msg, sk, skb);
+
+ if (msg->msg_name && bt_sk(sk)->skb_msg_name)
+ bt_sk(sk)->skb_msg_name(skb, msg->msg_name,
+ &msg->msg_namelen);
+
+ if (test_bit(BT_SK_PKT_STATUS, &bt_sk(sk)->flags)) {
+ u8 pkt_status = hci_skb_pkt_status(skb);
+
+ put_cmsg(msg, SOL_BLUETOOTH, BT_SCM_PKT_STATUS,
+ sizeof(pkt_status), &pkt_status);
+ }
+
+ if (test_bit(BT_SK_PKT_SEQNUM, &bt_sk(sk)->flags)) {
+ u16 pkt_seqnum = hci_skb_pkt_seqnum(skb);
+
+ put_cmsg(msg, SOL_BLUETOOTH, BT_SCM_PKT_SEQNUM,
+ sizeof(pkt_seqnum), &pkt_seqnum);
+ }
+ }
skb_free_datagram(sk, skb);
+ if (flags & MSG_TRUNC)
+ copied = skblen;
+
return err ? : copied;
}
EXPORT_SYMBOL(bt_sock_recvmsg);
-static inline unsigned int bt_accept_poll(struct sock *parent)
+static long bt_sock_data_wait(struct sock *sk, long timeo)
+{
+ DECLARE_WAITQUEUE(wait, current);
+
+ add_wait_queue(sk_sleep(sk), &wait);
+ for (;;) {
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ if (!skb_queue_empty(&sk->sk_receive_queue))
+ break;
+
+ if (sk->sk_err || (sk->sk_shutdown & RCV_SHUTDOWN))
+ break;
+
+ if (signal_pending(current) || !timeo)
+ break;
+
+ sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+ release_sock(sk);
+ timeo = schedule_timeout(timeo);
+ lock_sock(sk);
+ sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+ }
+
+ __set_current_state(TASK_RUNNING);
+ remove_wait_queue(sk_sleep(sk), &wait);
+ return timeo;
+}
+
+int bt_sock_stream_recvmsg(struct socket *sock, struct msghdr *msg,
+ size_t size, int flags)
+{
+ struct sock *sk = sock->sk;
+ int err = 0;
+ size_t target, copied = 0;
+ long timeo;
+
+ if (flags & MSG_OOB)
+ return -EOPNOTSUPP;
+
+ BT_DBG("sk %p size %zu", sk, size);
+
+ lock_sock(sk);
+
+ target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
+ timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+
+ do {
+ struct sk_buff *skb;
+ int chunk;
+
+ skb = skb_dequeue(&sk->sk_receive_queue);
+ if (!skb) {
+ if (copied >= target)
+ break;
+
+ err = sock_error(sk);
+ if (err)
+ break;
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ break;
+
+ err = -EAGAIN;
+ if (!timeo)
+ break;
+
+ timeo = bt_sock_data_wait(sk, timeo);
+
+ if (signal_pending(current)) {
+ err = sock_intr_errno(timeo);
+ goto out;
+ }
+ continue;
+ }
+
+ chunk = min_t(unsigned int, skb->len, size);
+ if (skb_copy_datagram_msg(skb, 0, msg, chunk)) {
+ skb_queue_head(&sk->sk_receive_queue, skb);
+ if (!copied)
+ copied = -EFAULT;
+ break;
+ }
+ copied += chunk;
+ size -= chunk;
+
+ sock_recv_cmsgs(msg, sk, skb);
+
+ if (!(flags & MSG_PEEK)) {
+ int skb_len = skb_headlen(skb);
+
+ if (chunk <= skb_len) {
+ __skb_pull(skb, chunk);
+ } else {
+ struct sk_buff *frag;
+
+ __skb_pull(skb, skb_len);
+ chunk -= skb_len;
+
+ skb_walk_frags(skb, frag) {
+ if (chunk <= frag->len) {
+ /* Pulling partial data */
+ skb->len -= chunk;
+ skb->data_len -= chunk;
+ __skb_pull(frag, chunk);
+ break;
+ } else if (frag->len) {
+ /* Pulling all frag data */
+ chunk -= frag->len;
+ skb->len -= frag->len;
+ skb->data_len -= frag->len;
+ __skb_pull(frag, frag->len);
+ }
+ }
+ }
+
+ if (skb->len) {
+ skb_queue_head(&sk->sk_receive_queue, skb);
+ break;
+ }
+ kfree_skb(skb);
+
+ } else {
+ /* put message back and return */
+ skb_queue_head(&sk->sk_receive_queue, skb);
+ break;
+ }
+ } while (size);
+
+out:
+ release_sock(sk);
+ return copied ? : err;
+}
+EXPORT_SYMBOL(bt_sock_stream_recvmsg);
+
+static inline __poll_t bt_accept_poll(struct sock *parent)
{
- struct list_head *p, *n;
+ struct bt_sock *s, *n;
struct sock *sk;
- list_for_each_safe(p, n, &bt_sk(parent)->accept_q) {
- sk = (struct sock *) list_entry(p, struct bt_sock, accept_q);
- if (sk->sk_state == BT_CONNECTED)
- return POLLIN | POLLRDNORM;
+ list_for_each_entry_safe(s, n, &bt_sk(parent)->accept_q, accept_q) {
+ sk = (struct sock *)s;
+ if (sk->sk_state == BT_CONNECTED ||
+ (test_bit(BT_SK_DEFER_SETUP, &bt_sk(parent)->flags) &&
+ sk->sk_state == BT_CONNECT2))
+ return EPOLLIN | EPOLLRDNORM;
}
return 0;
}
-unsigned int bt_sock_poll(struct file * file, struct socket *sock, poll_table *wait)
+__poll_t bt_sock_poll(struct file *file, struct socket *sock,
+ poll_table *wait)
{
struct sock *sk = sock->sk;
- unsigned int mask = 0;
+ __poll_t mask = 0;
- BT_DBG("sock %p, sk %p", sock, sk);
-
- poll_wait(file, sk->sk_sleep, wait);
+ poll_wait(file, sk_sleep(sk), wait);
if (sk->sk_state == BT_LISTEN)
return bt_accept_poll(sk);
- if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
- mask |= POLLERR;
+ if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue))
+ mask |= EPOLLERR |
+ (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
if (sk->sk_shutdown & RCV_SHUTDOWN)
- mask |= POLLRDHUP;
+ mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
if (sk->sk_shutdown == SHUTDOWN_MASK)
- mask |= POLLHUP;
+ mask |= EPOLLHUP;
- if (!skb_queue_empty(&sk->sk_receive_queue) ||
- (sk->sk_shutdown & RCV_SHUTDOWN))
- mask |= POLLIN | POLLRDNORM;
+ if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
+ mask |= EPOLLIN | EPOLLRDNORM;
if (sk->sk_state == BT_CLOSED)
- mask |= POLLHUP;
+ mask |= EPOLLHUP;
if (sk->sk_state == BT_CONNECT ||
- sk->sk_state == BT_CONNECT2 ||
- sk->sk_state == BT_CONFIG)
+ sk->sk_state == BT_CONNECT2 ||
+ sk->sk_state == BT_CONFIG)
return mask;
- if (sock_writeable(sk))
- mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
+ if (!test_bit(BT_SK_SUSPEND, &bt_sk(sk)->flags) && sock_writeable(sk))
+ mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
else
- set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+ sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
return mask;
}
EXPORT_SYMBOL(bt_sock_poll);
+static int bt_ethtool_get_ts_info(struct sock *sk, unsigned int index,
+ void __user *useraddr)
+{
+ struct ethtool_ts_info info;
+ struct kernel_ethtool_ts_info ts_info = {};
+ int ret;
+
+ ret = hci_ethtool_ts_info(index, sk->sk_protocol, &ts_info);
+ if (ret == -ENODEV)
+ return ret;
+ else if (ret < 0)
+ return -EIO;
+
+ memset(&info, 0, sizeof(info));
+
+ info.cmd = ETHTOOL_GET_TS_INFO;
+ info.so_timestamping = ts_info.so_timestamping;
+ info.phc_index = ts_info.phc_index;
+ info.tx_types = ts_info.tx_types;
+ info.rx_filters = ts_info.rx_filters;
+
+ if (copy_to_user(useraddr, &info, sizeof(info)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int bt_ethtool(struct sock *sk, const struct ifreq *ifr,
+ void __user *useraddr)
+{
+ unsigned int index;
+ u32 ethcmd;
+ int n;
+
+ if (copy_from_user(&ethcmd, useraddr, sizeof(ethcmd)))
+ return -EFAULT;
+
+ if (sscanf(ifr->ifr_name, "hci%u%n", &index, &n) != 1 ||
+ n != strlen(ifr->ifr_name))
+ return -ENODEV;
+
+ switch (ethcmd) {
+ case ETHTOOL_GET_TS_INFO:
+ return bt_ethtool_get_ts_info(sk, index, useraddr);
+ }
+
+ return -EOPNOTSUPP;
+}
+
+static int bt_dev_ioctl(struct socket *sock, unsigned int cmd, void __user *arg)
+{
+ struct sock *sk = sock->sk;
+ struct ifreq ifr = {};
+ void __user *data;
+ char *colon;
+ int ret = -ENOIOCTLCMD;
+
+ if (get_user_ifreq(&ifr, &data, arg))
+ return -EFAULT;
+
+ ifr.ifr_name[IFNAMSIZ - 1] = 0;
+ colon = strchr(ifr.ifr_name, ':');
+ if (colon)
+ *colon = 0;
+
+ switch (cmd) {
+ case SIOCETHTOOL:
+ ret = bt_ethtool(sk, &ifr, data);
+ break;
+ }
+
+ if (colon)
+ *colon = ':';
+
+ if (put_user_ifreq(&ifr, arg))
+ return -EFAULT;
+
+ return ret;
+}
+
+int bt_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+ struct sock *sk = sock->sk;
+ struct sk_buff *skb;
+ long amount;
+ int err;
+
+ BT_DBG("sk %p cmd %x arg %lx", sk, cmd, arg);
+
+ switch (cmd) {
+ case TIOCOUTQ:
+ if (sk->sk_state == BT_LISTEN)
+ return -EINVAL;
+
+ amount = sk->sk_sndbuf - sk_wmem_alloc_get(sk);
+ if (amount < 0)
+ amount = 0;
+ err = put_user(amount, (int __user *)arg);
+ break;
+
+ case TIOCINQ:
+ if (sk->sk_state == BT_LISTEN)
+ return -EINVAL;
+
+ spin_lock(&sk->sk_receive_queue.lock);
+ skb = skb_peek(&sk->sk_receive_queue);
+ amount = skb ? skb->len : 0;
+ spin_unlock(&sk->sk_receive_queue.lock);
+
+ err = put_user(amount, (int __user *)arg);
+ break;
+
+ case SIOCETHTOOL:
+ err = bt_dev_ioctl(sock, cmd, (void __user *)arg);
+ break;
+
+ default:
+ err = -ENOIOCTLCMD;
+ break;
+ }
+
+ return err;
+}
+EXPORT_SYMBOL(bt_sock_ioctl);
+
+/* This function expects the sk lock to be held when called */
int bt_sock_wait_state(struct sock *sk, int state, unsigned long timeo)
{
DECLARE_WAITQUEUE(wait, current);
@@ -293,10 +706,9 @@ int bt_sock_wait_state(struct sock *sk, int state, unsigned long timeo)
BT_DBG("sk %p", sk);
- add_wait_queue(sk->sk_sleep, &wait);
+ add_wait_queue(sk_sleep(sk), &wait);
+ set_current_state(TASK_INTERRUPTIBLE);
while (sk->sk_state != state) {
- set_current_state(TASK_INTERRUPTIBLE);
-
if (!timeo) {
err = -EINPROGRESS;
break;
@@ -310,59 +722,254 @@ int bt_sock_wait_state(struct sock *sk, int state, unsigned long timeo)
release_sock(sk);
timeo = schedule_timeout(timeo);
lock_sock(sk);
+ set_current_state(TASK_INTERRUPTIBLE);
err = sock_error(sk);
if (err)
break;
}
- set_current_state(TASK_RUNNING);
- remove_wait_queue(sk->sk_sleep, &wait);
+ __set_current_state(TASK_RUNNING);
+ remove_wait_queue(sk_sleep(sk), &wait);
return err;
}
EXPORT_SYMBOL(bt_sock_wait_state);
-static struct net_proto_family bt_sock_family_ops = {
+/* This function expects the sk lock to be held when called */
+int bt_sock_wait_ready(struct sock *sk, unsigned int msg_flags)
+{
+ DECLARE_WAITQUEUE(wait, current);
+ unsigned long timeo;
+ int err = 0;
+
+ BT_DBG("sk %p", sk);
+
+ timeo = sock_sndtimeo(sk, !!(msg_flags & MSG_DONTWAIT));
+
+ add_wait_queue(sk_sleep(sk), &wait);
+ set_current_state(TASK_INTERRUPTIBLE);
+ while (test_bit(BT_SK_SUSPEND, &bt_sk(sk)->flags)) {
+ if (!timeo) {
+ err = -EAGAIN;
+ break;
+ }
+
+ if (signal_pending(current)) {
+ err = sock_intr_errno(timeo);
+ break;
+ }
+
+ release_sock(sk);
+ timeo = schedule_timeout(timeo);
+ lock_sock(sk);
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ err = sock_error(sk);
+ if (err)
+ break;
+ }
+ __set_current_state(TASK_RUNNING);
+ remove_wait_queue(sk_sleep(sk), &wait);
+
+ return err;
+}
+EXPORT_SYMBOL(bt_sock_wait_ready);
+
+#ifdef CONFIG_PROC_FS
+static void *bt_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(seq->private->l->lock)
+{
+ struct bt_sock_list *l = pde_data(file_inode(seq->file));
+
+ read_lock(&l->lock);
+ return seq_hlist_start_head(&l->head, *pos);
+}
+
+static void *bt_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct bt_sock_list *l = pde_data(file_inode(seq->file));
+
+ return seq_hlist_next(v, &l->head, pos);
+}
+
+static void bt_seq_stop(struct seq_file *seq, void *v)
+ __releases(seq->private->l->lock)
+{
+ struct bt_sock_list *l = pde_data(file_inode(seq->file));
+
+ read_unlock(&l->lock);
+}
+
+static int bt_seq_show(struct seq_file *seq, void *v)
+{
+ struct bt_sock_list *l = pde_data(file_inode(seq->file));
+
+ if (v == SEQ_START_TOKEN) {
+ seq_puts(seq, "sk RefCnt Rmem Wmem User Inode Parent");
+
+ if (l->custom_seq_show) {
+ seq_putc(seq, ' ');
+ l->custom_seq_show(seq, v);
+ }
+
+ seq_putc(seq, '\n');
+ } else {
+ struct sock *sk = sk_entry(v);
+ struct bt_sock *bt = bt_sk(sk);
+
+ seq_printf(seq,
+ "%pK %-6d %-6u %-6u %-6u %-6lu %-6lu",
+ sk,
+ refcount_read(&sk->sk_refcnt),
+ sk_rmem_alloc_get(sk),
+ sk_wmem_alloc_get(sk),
+ from_kuid(seq_user_ns(seq), sk_uid(sk)),
+ sock_i_ino(sk),
+ bt->parent ? sock_i_ino(bt->parent) : 0LU);
+
+ if (l->custom_seq_show) {
+ seq_putc(seq, ' ');
+ l->custom_seq_show(seq, v);
+ }
+
+ seq_putc(seq, '\n');
+ }
+ return 0;
+}
+
+static const struct seq_operations bt_seq_ops = {
+ .start = bt_seq_start,
+ .next = bt_seq_next,
+ .stop = bt_seq_stop,
+ .show = bt_seq_show,
+};
+
+int bt_procfs_init(struct net *net, const char *name,
+ struct bt_sock_list *sk_list,
+ int (*seq_show)(struct seq_file *, void *))
+{
+ sk_list->custom_seq_show = seq_show;
+
+ if (!proc_create_seq_data(name, 0, net->proc_net, &bt_seq_ops, sk_list))
+ return -ENOMEM;
+ return 0;
+}
+
+void bt_procfs_cleanup(struct net *net, const char *name)
+{
+ remove_proc_entry(name, net->proc_net);
+}
+#else
+int bt_procfs_init(struct net *net, const char *name,
+ struct bt_sock_list *sk_list,
+ int (*seq_show)(struct seq_file *, void *))
+{
+ return 0;
+}
+
+void bt_procfs_cleanup(struct net *net, const char *name)
+{
+}
+#endif
+EXPORT_SYMBOL(bt_procfs_init);
+EXPORT_SYMBOL(bt_procfs_cleanup);
+
+static const struct net_proto_family bt_sock_family_ops = {
.owner = THIS_MODULE,
.family = PF_BLUETOOTH,
.create = bt_sock_create,
};
+struct dentry *bt_debugfs;
+EXPORT_SYMBOL_GPL(bt_debugfs);
+
+#define VERSION __stringify(BT_SUBSYS_VERSION) "." \
+ __stringify(BT_SUBSYS_REVISION)
+
static int __init bt_init(void)
{
int err;
+ sock_skb_cb_check_size(sizeof(struct bt_skb_cb));
+
BT_INFO("Core ver %s", VERSION);
- err = bt_sysfs_init();
+ err = bt_selftest();
if (err < 0)
return err;
+ bt_debugfs = debugfs_create_dir("bluetooth", NULL);
+
+ bt_leds_init();
+
+ err = bt_sysfs_init();
+ if (err < 0)
+ goto cleanup_led;
+
err = sock_register(&bt_sock_family_ops);
- if (err < 0) {
- bt_sysfs_cleanup();
- return err;
- }
+ if (err)
+ goto cleanup_sysfs;
BT_INFO("HCI device and connection manager initialized");
- hci_sock_init();
+ err = hci_sock_init();
+ if (err)
+ goto unregister_socket;
+
+ err = l2cap_init();
+ if (err)
+ goto cleanup_socket;
+
+ err = sco_init();
+ if (err)
+ goto cleanup_cap;
+
+ err = mgmt_init();
+ if (err)
+ goto cleanup_sco;
return 0;
+
+cleanup_sco:
+ sco_exit();
+cleanup_cap:
+ l2cap_exit();
+cleanup_socket:
+ hci_sock_cleanup();
+unregister_socket:
+ sock_unregister(PF_BLUETOOTH);
+cleanup_sysfs:
+ bt_sysfs_cleanup();
+cleanup_led:
+ bt_leds_cleanup();
+ debugfs_remove_recursive(bt_debugfs);
+ return err;
}
static void __exit bt_exit(void)
{
+ iso_exit();
+
+ mgmt_exit();
+
+ sco_exit();
+
+ l2cap_exit();
+
hci_sock_cleanup();
sock_unregister(PF_BLUETOOTH);
bt_sysfs_cleanup();
+
+ bt_leds_cleanup();
+
+ debugfs_remove_recursive(bt_debugfs);
}
subsys_initcall(bt_init);
module_exit(bt_exit);
-MODULE_AUTHOR("Maxim Krasnyansky <maxk@qualcomm.com>, Marcel Holtmann <marcel@holtmann.org>");
+MODULE_AUTHOR("Marcel Holtmann <marcel@holtmann.org>");
MODULE_DESCRIPTION("Bluetooth Core ver " VERSION);
MODULE_VERSION(VERSION);
MODULE_LICENSE("GPL");
diff --git a/net/bluetooth/aosp.c b/net/bluetooth/aosp.c
new file mode 100644
index 000000000000..59025771af53
--- /dev/null
+++ b/net/bluetooth/aosp.c
@@ -0,0 +1,210 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2021 Intel Corporation
+ */
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+
+#include "aosp.h"
+
+/* Command complete parameters of LE_Get_Vendor_Capabilities_Command
+ * The parameters grow over time. The base version that declares the
+ * version_supported field is v0.95. Refer to
+ * https://cs.android.com/android/platform/superproject/+/master:system/
+ * bt/gd/hci/controller.cc;l=452?q=le_get_vendor_capabilities_handler
+ */
+struct aosp_rp_le_get_vendor_capa {
+ /* v0.95: 15 octets */
+ __u8 status;
+ __u8 max_advt_instances;
+ __u8 offloaded_resolution_of_private_address;
+ __le16 total_scan_results_storage;
+ __u8 max_irk_list_sz;
+ __u8 filtering_support;
+ __u8 max_filter;
+ __u8 activity_energy_info_support;
+ __le16 version_supported;
+ __le16 total_num_of_advt_tracked;
+ __u8 extended_scan_support;
+ __u8 debug_logging_supported;
+ /* v0.96: 16 octets */
+ __u8 le_address_generation_offloading_support;
+ /* v0.98: 21 octets */
+ __le32 a2dp_source_offload_capability_mask;
+ __u8 bluetooth_quality_report_support;
+ /* v1.00: 25 octets */
+ __le32 dynamic_audio_buffer_support;
+} __packed;
+
+#define VENDOR_CAPA_BASE_SIZE 15
+#define VENDOR_CAPA_0_98_SIZE 21
+
+void aosp_do_open(struct hci_dev *hdev)
+{
+ struct sk_buff *skb;
+ struct aosp_rp_le_get_vendor_capa *rp;
+ u16 version_supported;
+
+ if (!hdev->aosp_capable)
+ return;
+
+ bt_dev_dbg(hdev, "Initialize AOSP extension");
+
+ /* LE Get Vendor Capabilities Command */
+ skb = __hci_cmd_sync(hdev, hci_opcode_pack(0x3f, 0x153), 0, NULL,
+ HCI_CMD_TIMEOUT);
+ if (IS_ERR_OR_NULL(skb)) {
+ if (!skb)
+ skb = ERR_PTR(-EIO);
+
+ bt_dev_err(hdev, "AOSP get vendor capabilities (%ld)",
+ PTR_ERR(skb));
+ return;
+ }
+
+ /* A basic length check */
+ if (skb->len < VENDOR_CAPA_BASE_SIZE)
+ goto length_error;
+
+ rp = (struct aosp_rp_le_get_vendor_capa *)skb->data;
+
+ version_supported = le16_to_cpu(rp->version_supported);
+ /* AOSP displays the version number like v0.98, v1.00, etc. */
+ bt_dev_info(hdev, "AOSP extensions version v%u.%02u",
+ version_supported >> 8, version_supported & 0xff);
+
+ /* Do not support very old versions. */
+ if (version_supported < 95) {
+ bt_dev_warn(hdev, "AOSP capabilities version %u too old",
+ version_supported);
+ goto done;
+ }
+
+ if (version_supported < 98) {
+ bt_dev_warn(hdev, "AOSP quality report is not supported");
+ goto done;
+ }
+
+ if (skb->len < VENDOR_CAPA_0_98_SIZE)
+ goto length_error;
+
+ /* The bluetooth_quality_report_support is defined at version
+ * v0.98. Refer to
+ * https://cs.android.com/android/platform/superproject/+/
+ * master:system/bt/gd/hci/controller.cc;l=477
+ */
+ if (rp->bluetooth_quality_report_support) {
+ hdev->aosp_quality_report = true;
+ bt_dev_info(hdev, "AOSP quality report is supported");
+ }
+
+ goto done;
+
+length_error:
+ bt_dev_err(hdev, "AOSP capabilities length %d too short", skb->len);
+
+done:
+ kfree_skb(skb);
+}
+
+void aosp_do_close(struct hci_dev *hdev)
+{
+ if (!hdev->aosp_capable)
+ return;
+
+ bt_dev_dbg(hdev, "Cleanup of AOSP extension");
+}
+
+/* BQR command */
+#define BQR_OPCODE hci_opcode_pack(0x3f, 0x015e)
+
+/* BQR report action */
+#define REPORT_ACTION_ADD 0x00
+#define REPORT_ACTION_DELETE 0x01
+#define REPORT_ACTION_CLEAR 0x02
+
+/* BQR event masks */
+#define QUALITY_MONITORING BIT(0)
+#define APPRAOCHING_LSTO BIT(1)
+#define A2DP_AUDIO_CHOPPY BIT(2)
+#define SCO_VOICE_CHOPPY BIT(3)
+
+#define DEFAULT_BQR_EVENT_MASK (QUALITY_MONITORING | APPRAOCHING_LSTO | \
+ A2DP_AUDIO_CHOPPY | SCO_VOICE_CHOPPY)
+
+/* Reporting at milliseconds so as not to stress the controller too much.
+ * Range: 0 ~ 65535 ms
+ */
+#define DEFALUT_REPORT_INTERVAL_MS 5000
+
+struct aosp_bqr_cp {
+ __u8 report_action;
+ __u32 event_mask;
+ __u16 min_report_interval;
+} __packed;
+
+static int enable_quality_report(struct hci_dev *hdev)
+{
+ struct sk_buff *skb;
+ struct aosp_bqr_cp cp;
+
+ cp.report_action = REPORT_ACTION_ADD;
+ cp.event_mask = DEFAULT_BQR_EVENT_MASK;
+ cp.min_report_interval = DEFALUT_REPORT_INTERVAL_MS;
+
+ skb = __hci_cmd_sync(hdev, BQR_OPCODE, sizeof(cp), &cp,
+ HCI_CMD_TIMEOUT);
+ if (IS_ERR_OR_NULL(skb)) {
+ if (!skb)
+ skb = ERR_PTR(-EIO);
+
+ bt_dev_err(hdev, "Enabling Android BQR failed (%ld)",
+ PTR_ERR(skb));
+ return PTR_ERR(skb);
+ }
+
+ kfree_skb(skb);
+ return 0;
+}
+
+static int disable_quality_report(struct hci_dev *hdev)
+{
+ struct sk_buff *skb;
+ struct aosp_bqr_cp cp = { 0 };
+
+ cp.report_action = REPORT_ACTION_CLEAR;
+
+ skb = __hci_cmd_sync(hdev, BQR_OPCODE, sizeof(cp), &cp,
+ HCI_CMD_TIMEOUT);
+ if (IS_ERR_OR_NULL(skb)) {
+ if (!skb)
+ skb = ERR_PTR(-EIO);
+
+ bt_dev_err(hdev, "Disabling Android BQR failed (%ld)",
+ PTR_ERR(skb));
+ return PTR_ERR(skb);
+ }
+
+ kfree_skb(skb);
+ return 0;
+}
+
+bool aosp_has_quality_report(struct hci_dev *hdev)
+{
+ return hdev->aosp_quality_report;
+}
+
+int aosp_set_quality_report(struct hci_dev *hdev, bool enable)
+{
+ if (!aosp_has_quality_report(hdev))
+ return -EOPNOTSUPP;
+
+ bt_dev_dbg(hdev, "quality report enable %d", enable);
+
+ /* Enable or disable the quality report feature. */
+ if (enable)
+ return enable_quality_report(hdev);
+ else
+ return disable_quality_report(hdev);
+}
diff --git a/net/bluetooth/aosp.h b/net/bluetooth/aosp.h
new file mode 100644
index 000000000000..2fd8886d51b2
--- /dev/null
+++ b/net/bluetooth/aosp.h
@@ -0,0 +1,29 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2021 Intel Corporation
+ */
+
+#if IS_ENABLED(CONFIG_BT_AOSPEXT)
+
+void aosp_do_open(struct hci_dev *hdev);
+void aosp_do_close(struct hci_dev *hdev);
+
+bool aosp_has_quality_report(struct hci_dev *hdev);
+int aosp_set_quality_report(struct hci_dev *hdev, bool enable);
+
+#else
+
+static inline void aosp_do_open(struct hci_dev *hdev) {}
+static inline void aosp_do_close(struct hci_dev *hdev) {}
+
+static inline bool aosp_has_quality_report(struct hci_dev *hdev)
+{
+ return false;
+}
+
+static inline int aosp_set_quality_report(struct hci_dev *hdev, bool enable)
+{
+ return -EOPNOTSUPP;
+}
+
+#endif
diff --git a/net/bluetooth/bnep/Kconfig b/net/bluetooth/bnep/Kconfig
index 35158b036d54..aac02b5b0d17 100644
--- a/net/bluetooth/bnep/Kconfig
+++ b/net/bluetooth/bnep/Kconfig
@@ -1,6 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0-only
config BT_BNEP
tristate "BNEP protocol support"
- depends on BT && BT_L2CAP
+ depends on BT_BREDR
select CRC32
help
BNEP (Bluetooth Network Encapsulation Protocol) is Ethernet
diff --git a/net/bluetooth/bnep/Makefile b/net/bluetooth/bnep/Makefile
index c7821e76ca56..8af9d56bb012 100644
--- a/net/bluetooth/bnep/Makefile
+++ b/net/bluetooth/bnep/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# Makefile for the Linux Bluetooth BNEP layer.
#
diff --git a/net/bluetooth/bnep/bnep.h b/net/bluetooth/bnep/bnep.h
index 0b6cd0e2528d..9680473ed7ef 100644
--- a/net/bluetooth/bnep/bnep.h
+++ b/net/bluetooth/bnep/bnep.h
@@ -1,24 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
BNEP protocol definition for Linux Bluetooth stack (BlueZ).
Copyright (C) 2002 Maxim Krasnyansky <maxk@qualcomm.com>
-
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License, version 2, as
- published by the Free Software Foundation.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-*/
-/*
- * $Id: bnep.h,v 1.5 2002/08/04 21:23:58 maxk Exp $
- */
+*/
#ifndef _BNEP_H
#define _BNEP_H
@@ -27,101 +12,105 @@
#include <linux/crc32.h>
#include <net/bluetooth/bluetooth.h>
-// Limits
-#define BNEP_MAX_PROTO_FILTERS 5
-#define BNEP_MAX_MULTICAST_FILTERS 20
-
-// UUIDs
-#define BNEP_BASE_UUID 0x0000000000001000800000805F9B34FB
-#define BNEP_UUID16 0x02
-#define BNEP_UUID32 0x04
-#define BNEP_UUID128 0x16
-
-#define BNEP_SVC_PANU 0x1115
-#define BNEP_SVC_NAP 0x1116
-#define BNEP_SVC_GN 0x1117
-
-// Packet types
-#define BNEP_GENERAL 0x00
-#define BNEP_CONTROL 0x01
-#define BNEP_COMPRESSED 0x02
-#define BNEP_COMPRESSED_SRC_ONLY 0x03
-#define BNEP_COMPRESSED_DST_ONLY 0x04
-
-// Control types
-#define BNEP_CMD_NOT_UNDERSTOOD 0x00
-#define BNEP_SETUP_CONN_REQ 0x01
-#define BNEP_SETUP_CONN_RSP 0x02
-#define BNEP_FILTER_NET_TYPE_SET 0x03
-#define BNEP_FILTER_NET_TYPE_RSP 0x04
-#define BNEP_FILTER_MULTI_ADDR_SET 0x05
-#define BNEP_FILTER_MULTI_ADDR_RSP 0x06
-
-// Extension types
-#define BNEP_EXT_CONTROL 0x00
-
-// Response messages
-#define BNEP_SUCCESS 0x00
-
-#define BNEP_CONN_INVALID_DST 0x01
-#define BNEP_CONN_INVALID_SRC 0x02
-#define BNEP_CONN_INVALID_SVC 0x03
-#define BNEP_CONN_NOT_ALLOWED 0x04
-
-#define BNEP_FILTER_UNSUPPORTED_REQ 0x01
-#define BNEP_FILTER_INVALID_RANGE 0x02
-#define BNEP_FILTER_INVALID_MCADDR 0x02
-#define BNEP_FILTER_LIMIT_REACHED 0x03
-#define BNEP_FILTER_DENIED_SECURITY 0x04
-
-// L2CAP settings
-#define BNEP_MTU 1691
-#define BNEP_PSM 0x0f
-#define BNEP_FLUSH_TO 0xffff
-#define BNEP_CONNECT_TO 15
-#define BNEP_FILTER_TO 15
-
-// Headers
-#define BNEP_TYPE_MASK 0x7f
-#define BNEP_EXT_HEADER 0x80
+/* Limits */
+#define BNEP_MAX_PROTO_FILTERS 5
+#define BNEP_MAX_MULTICAST_FILTERS 20
+
+/* UUIDs */
+#define BNEP_BASE_UUID 0x0000000000001000800000805F9B34FB
+#define BNEP_UUID16 0x02
+#define BNEP_UUID32 0x04
+#define BNEP_UUID128 0x16
+
+#define BNEP_SVC_PANU 0x1115
+#define BNEP_SVC_NAP 0x1116
+#define BNEP_SVC_GN 0x1117
+
+/* Packet types */
+#define BNEP_GENERAL 0x00
+#define BNEP_CONTROL 0x01
+#define BNEP_COMPRESSED 0x02
+#define BNEP_COMPRESSED_SRC_ONLY 0x03
+#define BNEP_COMPRESSED_DST_ONLY 0x04
+
+/* Control types */
+#define BNEP_CMD_NOT_UNDERSTOOD 0x00
+#define BNEP_SETUP_CONN_REQ 0x01
+#define BNEP_SETUP_CONN_RSP 0x02
+#define BNEP_FILTER_NET_TYPE_SET 0x03
+#define BNEP_FILTER_NET_TYPE_RSP 0x04
+#define BNEP_FILTER_MULTI_ADDR_SET 0x05
+#define BNEP_FILTER_MULTI_ADDR_RSP 0x06
+
+/* Extension types */
+#define BNEP_EXT_CONTROL 0x00
+
+/* Response messages */
+#define BNEP_SUCCESS 0x00
+
+#define BNEP_CONN_INVALID_DST 0x01
+#define BNEP_CONN_INVALID_SRC 0x02
+#define BNEP_CONN_INVALID_SVC 0x03
+#define BNEP_CONN_NOT_ALLOWED 0x04
+
+#define BNEP_FILTER_UNSUPPORTED_REQ 0x01
+#define BNEP_FILTER_INVALID_RANGE 0x02
+#define BNEP_FILTER_INVALID_MCADDR 0x02
+#define BNEP_FILTER_LIMIT_REACHED 0x03
+#define BNEP_FILTER_DENIED_SECURITY 0x04
+
+/* L2CAP settings */
+#define BNEP_MTU 1691
+#define BNEP_PSM 0x0f
+#define BNEP_FLUSH_TO 0xffff
+#define BNEP_CONNECT_TO 15
+#define BNEP_FILTER_TO 15
+
+/* Headers */
+#define BNEP_TYPE_MASK 0x7f
+#define BNEP_EXT_HEADER 0x80
struct bnep_setup_conn_req {
- __u8 type;
- __u8 ctrl;
- __u8 uuid_size;
- __u8 service[0];
-} __attribute__((packed));
+ __u8 type;
+ __u8 ctrl;
+ __u8 uuid_size;
+ __u8 service[];
+} __packed;
struct bnep_set_filter_req {
- __u8 type;
- __u8 ctrl;
+ __u8 type;
+ __u8 ctrl;
__be16 len;
- __u8 list[0];
-} __attribute__((packed));
+ __u8 list[];
+} __packed;
struct bnep_control_rsp {
- __u8 type;
- __u8 ctrl;
+ __u8 type;
+ __u8 ctrl;
__be16 resp;
-} __attribute__((packed));
+} __packed;
struct bnep_ext_hdr {
- __u8 type;
- __u8 len;
- __u8 data[0];
-} __attribute__((packed));
+ __u8 type;
+ __u8 len;
+ __u8 data[];
+} __packed;
/* BNEP ioctl defines */
#define BNEPCONNADD _IOW('B', 200, int)
#define BNEPCONNDEL _IOW('B', 201, int)
#define BNEPGETCONNLIST _IOR('B', 210, int)
#define BNEPGETCONNINFO _IOR('B', 211, int)
+#define BNEPGETSUPPFEAT _IOR('B', 212, int)
+
+#define BNEP_SETUP_RESPONSE 0
+#define BNEP_SETUP_RSP_SENT 10
struct bnep_connadd_req {
- int sock; // Connected socket
+ int sock; /* Connected socket */
__u32 flags;
__u16 role;
- char device[16]; // Name of the Ethernet device
+ char device[16]; /* Name of the Ethernet device */
};
struct bnep_conndel_req {
@@ -132,7 +121,7 @@ struct bnep_conndel_req {
struct bnep_conninfo {
__u32 flags;
__u16 role;
- __u16 state;
+ __u16 state;
__u8 dst[ETH_ALEN];
char device[16];
};
@@ -152,33 +141,33 @@ int bnep_del_connection(struct bnep_conndel_req *req);
int bnep_get_connlist(struct bnep_connlist_req *req);
int bnep_get_conninfo(struct bnep_conninfo *ci);
-// BNEP sessions
+/* BNEP sessions */
struct bnep_session {
struct list_head list;
-
+
unsigned int role;
- unsigned long state;
- unsigned long flags;
- atomic_t killed;
+ unsigned long state;
+ unsigned long flags;
+ atomic_t terminate;
+ struct task_struct *task;
struct ethhdr eh;
struct msghdr msg;
struct bnep_proto_filter proto_filter[BNEP_MAX_PROTO_FILTERS];
- u64 mc_filter;
-
+ unsigned long long mc_filter;
+
struct socket *sock;
struct net_device *dev;
- struct net_device_stats stats;
};
void bnep_net_setup(struct net_device *dev);
int bnep_sock_init(void);
-int bnep_sock_cleanup(void);
+void bnep_sock_cleanup(void);
static inline int bnep_mc_hash(__u8 *addr)
{
- return (crc32_be(~0, addr, ETH_ALEN) >> 26);
+ return crc32_be(~0, addr, ETH_ALEN) >> 26;
}
#endif
diff --git a/net/bluetooth/bnep/core.c b/net/bluetooth/bnep/core.c
index 7ba6470dc507..d44987d4515c 100644
--- a/net/bluetooth/bnep/core.c
+++ b/net/bluetooth/bnep/core.c
@@ -1,8 +1,8 @@
-/*
+/*
BNEP implementation for Linux Bluetooth stack (BlueZ).
Copyright (C) 2001-2002 Inventel Systemes
Written 2001-2002 by
- Clément Moreau <clement.moreau@inventel.fr>
+ Clément Moreau <clement.moreau@inventel.fr>
David Libault <david.libault@inventel.fr>
Copyright (C) 2002 Maxim Krasnyansky <maxk@qualcomm.com>
@@ -15,53 +15,32 @@
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
- CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
- WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
- COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+ ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+ COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
SOFTWARE IS DISCLAIMED.
*/
-/*
- * $Id: core.c,v 1.20 2002/08/04 21:23:58 maxk Exp $
- */
-
#include <linux/module.h>
-
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/signal.h>
-#include <linux/init.h>
-#include <linux/wait.h>
-#include <linux/errno.h>
-#include <linux/smp_lock.h>
-#include <linux/net.h>
-#include <net/sock.h>
-
-#include <linux/socket.h>
+#include <linux/kthread.h>
#include <linux/file.h>
-
-#include <linux/netdevice.h>
#include <linux/etherdevice.h>
-#include <linux/skbuff.h>
-
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include <net/bluetooth/bluetooth.h>
-#include <net/bluetooth/hci_core.h>
#include <net/bluetooth/l2cap.h>
+#include <net/bluetooth/hci_core.h>
#include "bnep.h"
-#ifndef CONFIG_BT_BNEP_DEBUG
-#undef BT_DBG
-#define BT_DBG(D...)
-#endif
+#define VERSION "1.3"
-#define VERSION "1.2"
+static bool compress_src = true;
+static bool compress_dst = true;
static LIST_HEAD(bnep_session_list);
static DECLARE_RWSEM(bnep_session_sem);
@@ -69,31 +48,24 @@ static DECLARE_RWSEM(bnep_session_sem);
static struct bnep_session *__bnep_get_session(u8 *dst)
{
struct bnep_session *s;
- struct list_head *p;
BT_DBG("");
- list_for_each(p, &bnep_session_list) {
- s = list_entry(p, struct bnep_session, list);
- if (!compare_ether_addr(dst, s->eh.h_source))
+ list_for_each_entry(s, &bnep_session_list, list)
+ if (ether_addr_equal(dst, s->eh.h_source))
return s;
- }
+
return NULL;
}
static void __bnep_link_session(struct bnep_session *s)
{
- /* It's safe to call __module_get() here because sessions are added
- by the socket layer which has to hold the refference to this module.
- */
- __module_get(THIS_MODULE);
- list_add(&s->list, &bnep_session_list);
+ list_add(&s->list, &bnep_session_list);
}
static void __bnep_unlink_session(struct bnep_session *s)
{
list_del(&s->list);
- module_put(THIS_MODULE);
}
static int bnep_send(struct bnep_session *s, void *data, size_t len)
@@ -135,8 +107,9 @@ static int bnep_ctrl_set_netfilter(struct bnep_session *s, __be16 *data, int len
if (len < 2)
return -EILSEQ;
- n = ntohs(get_unaligned(data));
- data++; len -= 2;
+ n = get_unaligned_be16(data);
+ data++;
+ len -= 2;
if (len < n)
return -EILSEQ;
@@ -150,11 +123,11 @@ static int bnep_ctrl_set_netfilter(struct bnep_session *s, __be16 *data, int len
int i;
for (i = 0; i < n; i++) {
- f[i].start = ntohs(get_unaligned(data++));
- f[i].end = ntohs(get_unaligned(data++));
+ f[i].start = get_unaligned_be16(data++);
+ f[i].end = get_unaligned_be16(data++);
- BT_DBG("proto filter start %d end %d",
- f[i].start, f[i].end);
+ BT_DBG("proto filter start %u end %u",
+ f[i].start, f[i].end);
}
if (i < BNEP_MAX_PROTO_FILTERS)
@@ -180,8 +153,9 @@ static int bnep_ctrl_set_mcfilter(struct bnep_session *s, u8 *data, int len)
if (len < 2)
return -EILSEQ;
- n = ntohs(get_unaligned((__be16 *) data));
- data += 2; len -= 2;
+ n = get_unaligned_be16(data);
+ data += 2;
+ len -= 2;
if (len < n)
return -EILSEQ;
@@ -192,6 +166,8 @@ static int bnep_ctrl_set_mcfilter(struct bnep_session *s, u8 *data, int len)
n /= (ETH_ALEN * 2);
if (n > 0) {
+ int i;
+
s->mc_filter = 0;
/* Always send broadcast */
@@ -201,18 +177,21 @@ static int bnep_ctrl_set_mcfilter(struct bnep_session *s, u8 *data, int len)
for (; n > 0; n--) {
u8 a1[6], *a2;
- memcpy(a1, data, ETH_ALEN); data += ETH_ALEN;
- a2 = data; data += ETH_ALEN;
-
- BT_DBG("mc filter %s -> %s",
- batostr((void *) a1), batostr((void *) a2));
+ memcpy(a1, data, ETH_ALEN);
+ data += ETH_ALEN;
+ a2 = data;
+ data += ETH_ALEN;
- #define INCA(a) { int i = 5; while (i >=0 && ++a[i--] == 0); }
+ BT_DBG("mc filter %pMR -> %pMR", a1, a2);
/* Iterate from a1 to a2 */
set_bit(bnep_mc_hash(a1), (ulong *) &s->mc_filter);
while (memcmp(a1, a2, 6) < 0 && s->mc_filter != ~0LL) {
- INCA(a1);
+ /* Increment a1 */
+ i = 5;
+ while (i >= 0 && ++a1[i--] == 0)
+ ;
+
set_bit(bnep_mc_hash(a1), (ulong *) &s->mc_filter);
}
}
@@ -232,11 +211,11 @@ static int bnep_rx_control(struct bnep_session *s, void *data, int len)
u8 cmd = *(u8 *)data;
int err = 0;
- data++; len--;
+ data++;
+ len--;
switch (cmd) {
case BNEP_CMD_NOT_UNDERSTOOD:
- case BNEP_SETUP_CONN_REQ:
case BNEP_SETUP_CONN_RSP:
case BNEP_FILTER_NET_TYPE_RSP:
case BNEP_FILTER_MULTI_ADDR_RSP:
@@ -251,12 +230,23 @@ static int bnep_rx_control(struct bnep_session *s, void *data, int len)
err = bnep_ctrl_set_mcfilter(s, data, len);
break;
+ case BNEP_SETUP_CONN_REQ:
+ /* Successful response should be sent only once */
+ if (test_bit(BNEP_SETUP_RESPONSE, &s->flags) &&
+ !test_and_set_bit(BNEP_SETUP_RSP_SENT, &s->flags))
+ err = bnep_send_rsp(s, BNEP_SETUP_CONN_RSP,
+ BNEP_SUCCESS);
+ else
+ err = bnep_send_rsp(s, BNEP_SETUP_CONN_RSP,
+ BNEP_CONN_NOT_ALLOWED);
+ break;
+
default: {
u8 pkt[3];
pkt[0] = BNEP_CONTROL;
pkt[1] = BNEP_CMD_NOT_UNDERSTOOD;
pkt[2] = cmd;
- bnep_send(s, pkt, sizeof(pkt));
+ err = bnep_send(s, pkt, sizeof(pkt));
}
break;
}
@@ -276,8 +266,8 @@ static int bnep_rx_extension(struct bnep_session *s, struct sk_buff *skb)
break;
}
- BT_DBG("type 0x%x len %d", h->type, h->len);
-
+ BT_DBG("type 0x%x len %u", h->type, h->len);
+
switch (h->type & BNEP_TYPE_MASK) {
case BNEP_EXT_CONTROL:
bnep_rx_control(s, skb->data, skb->len);
@@ -293,7 +283,7 @@ static int bnep_rx_extension(struct bnep_session *s, struct sk_buff *skb)
break;
}
} while (!err && (h->type & BNEP_EXT_HEADER));
-
+
return err;
}
@@ -304,35 +294,60 @@ static u8 __bnep_rx_hlen[] = {
ETH_ALEN + 2, /* BNEP_COMPRESSED_SRC_ONLY */
ETH_ALEN + 2 /* BNEP_COMPRESSED_DST_ONLY */
};
-#define BNEP_RX_TYPES (sizeof(__bnep_rx_hlen) - 1)
-static inline int bnep_rx_frame(struct bnep_session *s, struct sk_buff *skb)
+static int bnep_rx_frame(struct bnep_session *s, struct sk_buff *skb)
{
struct net_device *dev = s->dev;
struct sk_buff *nskb;
- u8 type;
+ u8 type, ctrl_type;
- dev->last_rx = jiffies;
- s->stats.rx_bytes += skb->len;
+ dev->stats.rx_bytes += skb->len;
- type = *(u8 *) skb->data; skb_pull(skb, 1);
+ type = *(u8 *) skb->data;
+ skb_pull(skb, 1);
+ ctrl_type = *(u8 *)skb->data;
- if ((type & BNEP_TYPE_MASK) > BNEP_RX_TYPES)
+ if ((type & BNEP_TYPE_MASK) >= sizeof(__bnep_rx_hlen))
goto badframe;
-
+
if ((type & BNEP_TYPE_MASK) == BNEP_CONTROL) {
- bnep_rx_control(s, skb->data, skb->len);
- kfree_skb(skb);
- return 0;
- }
+ if (bnep_rx_control(s, skb->data, skb->len) < 0) {
+ dev->stats.tx_errors++;
+ kfree_skb(skb);
+ return 0;
+ }
- skb->mac.raw = skb->data;
+ if (!(type & BNEP_EXT_HEADER)) {
+ kfree_skb(skb);
+ return 0;
+ }
- /* Verify and pull out header */
- if (!skb_pull(skb, __bnep_rx_hlen[type & BNEP_TYPE_MASK]))
- goto badframe;
+ /* Verify and pull ctrl message since it's already processed */
+ switch (ctrl_type) {
+ case BNEP_SETUP_CONN_REQ:
+ /* Pull: ctrl type (1 b), len (1 b), data (len bytes) */
+ if (!skb_pull(skb, 2 + *(u8 *)(skb->data + 1) * 2))
+ goto badframe;
+ break;
+ case BNEP_FILTER_MULTI_ADDR_SET:
+ case BNEP_FILTER_NET_TYPE_SET:
+ /* Pull: ctrl type (1 b), len (2 b), data (len bytes) */
+ if (!skb_pull(skb, 3 + *(u16 *)(skb->data + 1) * 2))
+ goto badframe;
+ break;
+ default:
+ kfree_skb(skb);
+ return 0;
+ }
+ } else {
+ skb_reset_mac_header(skb);
+
+ /* Verify and pull out header */
+ if (!skb_pull(skb, __bnep_rx_hlen[type & BNEP_TYPE_MASK]))
+ goto badframe;
- s->eh.h_proto = get_unaligned((__be16 *) (skb->data - 2));
+ s->eh.h_proto = get_unaligned((__be16 *) (skb->data - 2));
+ }
if (type & BNEP_EXT_HEADER) {
if (bnep_rx_extension(s, skb) < 0)
@@ -340,17 +355,17 @@ static inline int bnep_rx_frame(struct bnep_session *s, struct sk_buff *skb)
}
/* Strip 802.1p header */
- if (ntohs(s->eh.h_proto) == 0x8100) {
+ if (ntohs(s->eh.h_proto) == ETH_P_8021Q) {
if (!skb_pull(skb, 4))
goto badframe;
s->eh.h_proto = get_unaligned((__be16 *) (skb->data - 2));
}
-
+
/* We have to alloc new skb and copy data here :(. Because original skb
* may not be modified and because of the alignment requirements. */
nskb = alloc_skb(2 + ETH_HLEN + skb->len, GFP_KERNEL);
if (!nskb) {
- s->stats.rx_dropped++;
+ dev->stats.rx_dropped++;
kfree_skb(skb);
return -ENOMEM;
}
@@ -359,38 +374,38 @@ static inline int bnep_rx_frame(struct bnep_session *s, struct sk_buff *skb)
/* Decompress header and construct ether frame */
switch (type & BNEP_TYPE_MASK) {
case BNEP_COMPRESSED:
- memcpy(__skb_put(nskb, ETH_HLEN), &s->eh, ETH_HLEN);
+ __skb_put_data(nskb, &s->eh, ETH_HLEN);
break;
-
+
case BNEP_COMPRESSED_SRC_ONLY:
- memcpy(__skb_put(nskb, ETH_ALEN), s->eh.h_dest, ETH_ALEN);
- memcpy(__skb_put(nskb, ETH_ALEN), skb->mac.raw, ETH_ALEN);
+ __skb_put_data(nskb, s->eh.h_dest, ETH_ALEN);
+ __skb_put_data(nskb, skb_mac_header(skb), ETH_ALEN);
put_unaligned(s->eh.h_proto, (__be16 *) __skb_put(nskb, 2));
break;
case BNEP_COMPRESSED_DST_ONLY:
- memcpy(__skb_put(nskb, ETH_ALEN), skb->mac.raw, ETH_ALEN);
- memcpy(__skb_put(nskb, ETH_ALEN + 2), s->eh.h_source, ETH_ALEN + 2);
+ __skb_put_data(nskb, skb_mac_header(skb), ETH_ALEN);
+ __skb_put_data(nskb, s->eh.h_source, ETH_ALEN);
+ put_unaligned(s->eh.h_proto, (__be16 *)__skb_put(nskb, 2));
break;
case BNEP_GENERAL:
- memcpy(__skb_put(nskb, ETH_ALEN * 2), skb->mac.raw, ETH_ALEN * 2);
+ __skb_put_data(nskb, skb_mac_header(skb), ETH_ALEN * 2);
put_unaligned(s->eh.h_proto, (__be16 *) __skb_put(nskb, 2));
break;
}
- memcpy(__skb_put(nskb, skb->len), skb->data, skb->len);
+ skb_copy_from_linear_data(skb, __skb_put(nskb, skb->len), skb->len);
kfree_skb(skb);
-
- s->stats.rx_packets++;
- nskb->dev = dev;
+
+ dev->stats.rx_packets++;
nskb->ip_summed = CHECKSUM_NONE;
nskb->protocol = eth_type_trans(nskb, dev);
- netif_rx_ni(nskb);
+ netif_rx(nskb);
return 0;
badframe:
- s->stats.rx_errors++;
+ dev->stats.rx_errors++;
kfree_skb(skb);
return 0;
}
@@ -402,7 +417,7 @@ static u8 __bnep_tx_types[] = {
BNEP_COMPRESSED
};
-static inline int bnep_tx_frame(struct bnep_session *s, struct sk_buff *skb)
+static int bnep_tx_frame(struct bnep_session *s, struct sk_buff *skb)
{
struct ethhdr *eh = (void *) skb->data;
struct socket *sock = s->sock;
@@ -410,7 +425,7 @@ static inline int bnep_tx_frame(struct bnep_session *s, struct sk_buff *skb)
int len = 0, il = 0;
u8 type = 0;
- BT_DBG("skb %p dev %p type %d", skb, skb->dev, skb->pkt_type);
+ BT_DBG("skb %p dev %p type %u", skb, skb->dev, skb->pkt_type);
if (!skb->dev) {
/* Control frame sent by us */
@@ -420,10 +435,10 @@ static inline int bnep_tx_frame(struct bnep_session *s, struct sk_buff *skb)
iv[il++] = (struct kvec) { &type, 1 };
len++;
- if (!compare_ether_addr(eh->h_dest, s->eh.h_source))
+ if (compress_src && ether_addr_equal(eh->h_dest, s->eh.h_source))
type |= 0x01;
- if (!compare_ether_addr(eh->h_source, s->eh.h_dest))
+ if (compress_dst && ether_addr_equal(eh->h_source, s->eh.h_dest))
type |= 0x02;
if (type)
@@ -435,7 +450,7 @@ static inline int bnep_tx_frame(struct bnep_session *s, struct sk_buff *skb)
iv[il++] = (struct kvec) { eh->h_source, ETH_ALEN };
len += ETH_ALEN;
break;
-
+
case BNEP_COMPRESSED_DST_ONLY:
iv[il++] = (struct kvec) { eh->h_dest, ETH_ALEN };
len += ETH_ALEN;
@@ -445,7 +460,7 @@ static inline int bnep_tx_frame(struct bnep_session *s, struct sk_buff *skb)
send:
iv[il++] = (struct kvec) { skb->data, skb->len };
len += skb->len;
-
+
/* FIXME: linearize skb */
{
len = kernel_sendmsg(sock, &s->msg, iv, il, len);
@@ -453,8 +468,8 @@ send:
kfree_skb(skb);
if (len > 0) {
- s->stats.tx_bytes += len;
- s->stats.tx_packets++;
+ s->dev->stats.tx_bytes += len;
+ s->dev->stats.tx_packets++;
return 0;
}
@@ -467,38 +482,41 @@ static int bnep_session(void *arg)
struct net_device *dev = s->dev;
struct sock *sk = s->sock->sk;
struct sk_buff *skb;
- wait_queue_t wait;
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
BT_DBG("");
- daemonize("kbnepd %s", dev->name);
set_user_nice(current, -15);
- current->flags |= PF_NOFREEZE;
-
- init_waitqueue_entry(&wait, current);
- add_wait_queue(sk->sk_sleep, &wait);
- while (!atomic_read(&s->killed)) {
- set_current_state(TASK_INTERRUPTIBLE);
- // RX
+ add_wait_queue(sk_sleep(sk), &wait);
+ while (1) {
+ if (atomic_read(&s->terminate))
+ break;
+ /* RX */
while ((skb = skb_dequeue(&sk->sk_receive_queue))) {
skb_orphan(skb);
- bnep_rx_frame(s, skb);
+ if (!skb_linearize(skb))
+ bnep_rx_frame(s, skb);
+ else
+ kfree_skb(skb);
}
if (sk->sk_state != BT_CONNECTED)
break;
-
- // TX
+
+ /* TX */
while ((skb = skb_dequeue(&sk->sk_write_queue)))
if (bnep_tx_frame(s, skb))
break;
netif_wake_queue(dev);
-
- schedule();
+
+ /*
+ * wait_woken() performs the necessary memory barriers
+ * for us; see the header comment for this primitive.
+ */
+ wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
}
- set_current_state(TASK_RUNNING);
- remove_wait_queue(sk->sk_sleep, &wait);
+ remove_wait_queue(sk_sleep(sk), &wait);
/* Cleanup session */
down_write(&bnep_session_sem);
@@ -506,6 +524,11 @@ static int bnep_session(void *arg)
/* Delete network device */
unregister_netdev(dev);
+ /* Wakeup user-space polling for socket errors */
+ s->sock->sk->sk_err = EUNATCH;
+
+ wake_up_interruptible(sk_sleep(s->sock->sk));
+
/* Release the socket */
fput(s->sock->file);
@@ -513,29 +536,27 @@ static int bnep_session(void *arg)
up_write(&bnep_session_sem);
free_netdev(dev);
+ module_put_and_kthread_exit(0);
return 0;
}
static struct device *bnep_get_device(struct bnep_session *session)
{
- bdaddr_t *src = &bt_sk(session->sock->sk)->src;
- bdaddr_t *dst = &bt_sk(session->sock->sk)->dst;
- struct hci_dev *hdev;
- struct hci_conn *conn;
+ struct l2cap_conn *conn = l2cap_pi(session->sock->sk)->chan->conn;
- hdev = hci_get_route(dst, src);
- if (!hdev)
+ if (!conn || !conn->hcon)
return NULL;
- conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, dst);
-
- hci_dev_put(hdev);
-
- return conn ? &conn->dev : NULL;
+ return &conn->hcon->dev;
}
+static const struct device_type bnep_type = {
+ .name = "bluetooth",
+};
+
int bnep_add_connection(struct bnep_connadd_req *req, struct socket *sock)
{
+ u32 valid_flags = BIT(BNEP_SETUP_RESPONSE);
struct net_device *dev;
struct bnep_session *s, *ss;
u8 dst[ETH_ALEN], src[ETH_ALEN];
@@ -543,12 +564,19 @@ int bnep_add_connection(struct bnep_connadd_req *req, struct socket *sock)
BT_DBG("");
- baswap((void *) dst, &bt_sk(sock->sk)->dst);
- baswap((void *) src, &bt_sk(sock->sk)->src);
+ if (!l2cap_is_socket(sock))
+ return -EBADFD;
+
+ if (req->flags & ~valid_flags)
+ return -EINVAL;
+
+ baswap((void *) dst, &l2cap_pi(sock->sk)->chan->dst);
+ baswap((void *) src, &l2cap_pi(sock->sk)->chan->src);
/* session struct allocated as private part of net_device */
dev = alloc_netdev(sizeof(struct bnep_session),
(*req->device) ? req->device : "bnep%d",
+ NET_NAME_UNKNOWN,
bnep_net_setup);
if (!dev)
return -ENOMEM;
@@ -561,24 +589,28 @@ int bnep_add_connection(struct bnep_connadd_req *req, struct socket *sock)
goto failed;
}
- s = dev->priv;
+ s = netdev_priv(dev);
/* This is rx header therefore addresses are swapped.
- * ie eh.h_dest is our local address. */
+ * ie. eh.h_dest is our local address. */
memcpy(s->eh.h_dest, &src, ETH_ALEN);
memcpy(s->eh.h_source, &dst, ETH_ALEN);
- memcpy(dev->dev_addr, s->eh.h_dest, ETH_ALEN);
+ eth_hw_addr_set(dev, s->eh.h_dest);
s->dev = dev;
s->sock = sock;
s->role = req->role;
s->state = BT_CONNECTED;
-
+ s->flags = req->flags;
+
s->msg.msg_flags = MSG_NOSIGNAL;
#ifdef CONFIG_BT_BNEP_MC_FILTER
- /* Set default mc filter */
- set_bit(bnep_mc_hash(dev->broadcast), (ulong *) &s->mc_filter);
+ /* Set default mc filter to not filter out any mc addresses
+ * as defined in the BNEP specification (revision 0.95a)
+ * http://grouper.ieee.org/groups/802/15/Bluetooth/BNEP.pdf
+ */
+ s->mc_filter = ~0LL;
#endif
#ifdef CONFIG_BT_BNEP_PROTO_FILTER
@@ -587,19 +619,22 @@ int bnep_add_connection(struct bnep_connadd_req *req, struct socket *sock)
#endif
SET_NETDEV_DEV(dev, bnep_get_device(s));
+ SET_NETDEV_DEVTYPE(dev, &bnep_type);
err = register_netdev(dev);
- if (err) {
+ if (err)
goto failed;
- }
__bnep_link_session(s);
-
- err = kernel_thread(bnep_session, s, CLONE_KERNEL);
- if (err < 0) {
+
+ __module_get(THIS_MODULE);
+ s->task = kthread_run(bnep_session, s, "kbnepd %s", dev->name);
+ if (IS_ERR(s->task)) {
/* Session thread start failed, gotta cleanup. */
+ module_put(THIS_MODULE);
unregister_netdev(dev);
__bnep_unlink_session(s);
+ err = PTR_ERR(s->task);
goto failed;
}
@@ -615,22 +650,21 @@ failed:
int bnep_del_connection(struct bnep_conndel_req *req)
{
+ u32 valid_flags = 0;
struct bnep_session *s;
int err = 0;
BT_DBG("");
+ if (req->flags & ~valid_flags)
+ return -EINVAL;
+
down_read(&bnep_session_sem);
s = __bnep_get_session(req->dst);
if (s) {
- /* Wakeup user-space which is polling for socket errors.
- * This is temporary hack untill we have shutdown in L2CAP */
- s->sock->sk->sk_err = EUNATCH;
-
- /* Kill session thread */
- atomic_inc(&s->killed);
- wake_up_interruptible(s->sock->sk->sk_sleep);
+ atomic_inc(&s->terminate);
+ wake_up_interruptible(sk_sleep(s->sock->sk));
} else
err = -ENOENT;
@@ -640,28 +674,28 @@ int bnep_del_connection(struct bnep_conndel_req *req)
static void __bnep_copy_ci(struct bnep_conninfo *ci, struct bnep_session *s)
{
+ u32 valid_flags = BIT(BNEP_SETUP_RESPONSE);
+
+ memset(ci, 0, sizeof(*ci));
memcpy(ci->dst, s->eh.h_source, ETH_ALEN);
strcpy(ci->device, s->dev->name);
- ci->flags = s->flags;
+ ci->flags = s->flags & valid_flags;
ci->state = s->state;
ci->role = s->role;
}
int bnep_get_connlist(struct bnep_connlist_req *req)
{
- struct list_head *p;
+ struct bnep_session *s;
int err = 0, n = 0;
down_read(&bnep_session_sem);
- list_for_each(p, &bnep_session_list) {
- struct bnep_session *s;
+ list_for_each_entry(s, &bnep_session_list, list) {
struct bnep_conninfo ci;
- s = list_entry(p, struct bnep_session, list);
-
__bnep_copy_ci(&ci, s);
-
+
if (copy_to_user(req->ci, &ci, sizeof(ci))) {
err = -EFAULT;
break;
@@ -696,11 +730,9 @@ int bnep_get_conninfo(struct bnep_conninfo *ci)
}
static int __init bnep_init(void)
-{
+{
char flt[50] = "";
- l2cap_load();
-
#ifdef CONFIG_BT_BNEP_PROTO_FILTER
strcat(flt, "protocol ");
#endif
@@ -713,8 +745,7 @@ static int __init bnep_init(void)
if (flt[0])
BT_INFO("BNEP filters: %s", flt);
- bnep_sock_init();
- return 0;
+ return bnep_sock_init();
}
static void __exit bnep_exit(void)
@@ -725,7 +756,13 @@ static void __exit bnep_exit(void)
module_init(bnep_init);
module_exit(bnep_exit);
-MODULE_AUTHOR("David Libault <david.libault@inventel.fr>, Maxim Krasnyansky <maxk@qualcomm.com>");
+module_param(compress_src, bool, 0644);
+MODULE_PARM_DESC(compress_src, "Compress sources headers");
+
+module_param(compress_dst, bool, 0644);
+MODULE_PARM_DESC(compress_dst, "Compress destination headers");
+
+MODULE_AUTHOR("Marcel Holtmann <marcel@holtmann.org>");
MODULE_DESCRIPTION("Bluetooth BNEP ver " VERSION);
MODULE_VERSION(VERSION);
MODULE_LICENSE("GPL");
diff --git a/net/bluetooth/bnep/netdev.c b/net/bluetooth/bnep/netdev.c
index 67a002a9751a..cc1cff63194f 100644
--- a/net/bluetooth/bnep/netdev.c
+++ b/net/bluetooth/bnep/netdev.c
@@ -1,8 +1,8 @@
-/*
+/*
BNEP implementation for Linux Bluetooth stack (BlueZ).
Copyright (C) 2001-2002 Inventel Systemes
Written 2001-2002 by
- Clément Moreau <clement.moreau@inventel.fr>
+ Clément Moreau <clement.moreau@inventel.fr>
David Libault <david.libault@inventel.fr>
Copyright (C) 2002 Maxim Krasnyansky <maxk@qualcomm.com>
@@ -15,29 +15,17 @@
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
- CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
- WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
- COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+ ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+ COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
SOFTWARE IS DISCLAIMED.
*/
-/*
- * $Id: netdev.c,v 1.8 2002/08/04 21:23:58 maxk Exp $
- */
-
-#include <linux/module.h>
-
-#include <linux/socket.h>
-#include <linux/netdevice.h>
#include <linux/etherdevice.h>
-#include <linux/skbuff.h>
-#include <linux/wait.h>
-
-#include <asm/unaligned.h>
#include <net/bluetooth/bluetooth.h>
#include <net/bluetooth/hci_core.h>
@@ -45,11 +33,6 @@
#include "bnep.h"
-#ifndef CONFIG_BT_BNEP_DEBUG
-#undef BT_DBG
-#define BT_DBG( A... )
-#endif
-
#define BNEP_TX_QUEUE_LEN 20
static int bnep_net_open(struct net_device *dev)
@@ -64,22 +47,16 @@ static int bnep_net_close(struct net_device *dev)
return 0;
}
-static struct net_device_stats *bnep_net_get_stats(struct net_device *dev)
-{
- struct bnep_session *s = dev->priv;
- return &s->stats;
-}
-
static void bnep_net_set_mc_list(struct net_device *dev)
{
#ifdef CONFIG_BT_BNEP_MC_FILTER
- struct bnep_session *s = dev->priv;
+ struct bnep_session *s = netdev_priv(dev);
struct sock *sk = s->sock->sk;
struct bnep_set_filter_req *r;
struct sk_buff *skb;
int size;
- BT_DBG("%s mc_count %d", dev->name, dev->mc_count);
+ BT_DBG("%s mc_count %d", dev->name, netdev_mc_count(dev));
size = sizeof(*r) + (BNEP_MAX_MULTICAST_FILTERS + 1) * ETH_ALEN * 2;
skb = alloc_skb(size, GFP_ATOMIC);
@@ -94,34 +71,38 @@ static void bnep_net_set_mc_list(struct net_device *dev)
r->type = BNEP_CONTROL;
r->ctrl = BNEP_FILTER_MULTI_ADDR_SET;
- if (dev->flags & (IFF_PROMISC | IFF_ALLMULTI)) {
+ if (dev->flags & (IFF_PROMISC | IFF_ALLMULTI)) {
u8 start[ETH_ALEN] = { 0x01 };
/* Request all addresses */
- memcpy(__skb_put(skb, ETH_ALEN), start, ETH_ALEN);
- memcpy(__skb_put(skb, ETH_ALEN), dev->broadcast, ETH_ALEN);
+ __skb_put_data(skb, start, ETH_ALEN);
+ __skb_put_data(skb, dev->broadcast, ETH_ALEN);
r->len = htons(ETH_ALEN * 2);
} else {
- struct dev_mc_list *dmi = dev->mc_list;
+ struct netdev_hw_addr *ha;
int i, len = skb->len;
if (dev->flags & IFF_BROADCAST) {
- memcpy(__skb_put(skb, ETH_ALEN), dev->broadcast, ETH_ALEN);
- memcpy(__skb_put(skb, ETH_ALEN), dev->broadcast, ETH_ALEN);
- }
-
+ __skb_put_data(skb, dev->broadcast, ETH_ALEN);
+ __skb_put_data(skb, dev->broadcast, ETH_ALEN);
+ }
+
/* FIXME: We should group addresses here. */
- for (i = 0; i < dev->mc_count && i < BNEP_MAX_MULTICAST_FILTERS; i++) {
- memcpy(__skb_put(skb, ETH_ALEN), dmi->dmi_addr, ETH_ALEN);
- memcpy(__skb_put(skb, ETH_ALEN), dmi->dmi_addr, ETH_ALEN);
- dmi = dmi->next;
+ i = 0;
+ netdev_for_each_mc_addr(ha, dev) {
+ if (i == BNEP_MAX_MULTICAST_FILTERS)
+ break;
+ __skb_put_data(skb, ha->addr, ETH_ALEN);
+ __skb_put_data(skb, ha->addr, ETH_ALEN);
+
+ i++;
}
r->len = htons(skb->len - len);
}
skb_queue_tail(&sk->sk_write_queue, skb);
- wake_up_interruptible(sk->sk_sleep);
+ wake_up_interruptible(sk_sleep(sk));
#endif
}
@@ -131,19 +112,14 @@ static int bnep_net_set_mac_addr(struct net_device *dev, void *arg)
return 0;
}
-static void bnep_net_timeout(struct net_device *dev)
+static void bnep_net_timeout(struct net_device *dev, unsigned int txqueue)
{
BT_DBG("net_timeout");
netif_wake_queue(dev);
}
-static int bnep_net_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
-{
- return -EINVAL;
-}
-
#ifdef CONFIG_BT_BNEP_MC_FILTER
-static inline int bnep_net_mc_filter(struct sk_buff *skb, struct bnep_session *s)
+static int bnep_net_mc_filter(struct sk_buff *skb, struct bnep_session *s)
{
struct ethhdr *eh = (void *) skb->data;
@@ -155,26 +131,26 @@ static inline int bnep_net_mc_filter(struct sk_buff *skb, struct bnep_session *s
#ifdef CONFIG_BT_BNEP_PROTO_FILTER
/* Determine ether protocol. Based on eth_type_trans. */
-static inline u16 bnep_net_eth_proto(struct sk_buff *skb)
+static u16 bnep_net_eth_proto(struct sk_buff *skb)
{
struct ethhdr *eh = (void *) skb->data;
u16 proto = ntohs(eh->h_proto);
-
- if (proto >= 1536)
+
+ if (proto >= ETH_P_802_3_MIN)
return proto;
-
+
if (get_unaligned((__be16 *) skb->data) == htons(0xFFFF))
return ETH_P_802_3;
-
+
return ETH_P_802_2;
}
-static inline int bnep_net_proto_filter(struct sk_buff *skb, struct bnep_session *s)
+static int bnep_net_proto_filter(struct sk_buff *skb, struct bnep_session *s)
{
u16 proto = bnep_net_eth_proto(skb);
struct bnep_proto_filter *f = s->proto_filter;
int i;
-
+
for (i = 0; i < BNEP_MAX_PROTO_FILTERS && f[i].end; i++) {
if (proto >= f[i].start && proto <= f[i].end)
return 0;
@@ -185,9 +161,10 @@ static inline int bnep_net_proto_filter(struct sk_buff *skb, struct bnep_session
}
#endif
-static int bnep_net_xmit(struct sk_buff *skb, struct net_device *dev)
+static netdev_tx_t bnep_net_xmit(struct sk_buff *skb,
+ struct net_device *dev)
{
- struct bnep_session *s = dev->priv;
+ struct bnep_session *s = netdev_priv(dev);
struct sock *sk = s->sock->sk;
BT_DBG("skb %p, dev %p", skb, dev);
@@ -195,25 +172,25 @@ static int bnep_net_xmit(struct sk_buff *skb, struct net_device *dev)
#ifdef CONFIG_BT_BNEP_MC_FILTER
if (bnep_net_mc_filter(skb, s)) {
kfree_skb(skb);
- return 0;
+ return NETDEV_TX_OK;
}
#endif
-
+
#ifdef CONFIG_BT_BNEP_PROTO_FILTER
if (bnep_net_proto_filter(skb, s)) {
kfree_skb(skb);
- return 0;
+ return NETDEV_TX_OK;
}
#endif
-
+
/*
* We cannot send L2CAP packets from here as we are potentially in a bh.
* So we have to queue them and wake up session thread which is sleeping
- * on the sk->sk_sleep.
+ * on the sk_sleep(sk).
*/
- dev->trans_start = jiffies;
+ netif_trans_update(dev);
skb_queue_tail(&sk->sk_write_queue, skb);
- wake_up_interruptible(sk->sk_sleep);
+ wake_up_interruptible(sk_sleep(sk));
if (skb_queue_len(&sk->sk_write_queue) >= BNEP_TX_QUEUE_LEN) {
BT_DBG("tx queue is full");
@@ -223,25 +200,31 @@ static int bnep_net_xmit(struct sk_buff *skb, struct net_device *dev)
netif_stop_queue(dev);
}
- return 0;
+ return NETDEV_TX_OK;
}
+static const struct net_device_ops bnep_netdev_ops = {
+ .ndo_open = bnep_net_open,
+ .ndo_stop = bnep_net_close,
+ .ndo_start_xmit = bnep_net_xmit,
+ .ndo_validate_addr = eth_validate_addr,
+ .ndo_set_rx_mode = bnep_net_set_mc_list,
+ .ndo_set_mac_address = bnep_net_set_mac_addr,
+ .ndo_tx_timeout = bnep_net_timeout,
+
+};
+
void bnep_net_setup(struct net_device *dev)
{
- memset(dev->broadcast, 0xff, ETH_ALEN);
+ eth_broadcast_addr(dev->broadcast);
dev->addr_len = ETH_ALEN;
ether_setup(dev);
-
- dev->open = bnep_net_open;
- dev->stop = bnep_net_close;
- dev->hard_start_xmit = bnep_net_xmit;
- dev->get_stats = bnep_net_get_stats;
- dev->do_ioctl = bnep_net_ioctl;
- dev->set_mac_address = bnep_net_set_mac_addr;
- dev->set_multicast_list = bnep_net_set_mc_list;
+ dev->min_mtu = 0;
+ dev->max_mtu = ETH_MAX_MTU;
+ dev->priv_flags &= ~IFF_TX_SKB_SHARING;
+ dev->netdev_ops = &bnep_netdev_ops;
dev->watchdog_timeo = HZ * 2;
- dev->tx_timeout = bnep_net_timeout;
}
diff --git a/net/bluetooth/bnep/sock.c b/net/bluetooth/bnep/sock.c
index 5563db1bf526..00d47bcf4d7d 100644
--- a/net/bluetooth/bnep/sock.c
+++ b/net/bluetooth/bnep/sock.c
@@ -1,4 +1,4 @@
-/*
+/*
BNEP implementation for Linux Bluetooth stack (BlueZ).
Copyright (C) 2001-2002 Inventel Systemes
Written 2001-2002 by
@@ -14,47 +14,25 @@
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
- CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
- WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
- COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+ ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+ COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
SOFTWARE IS DISCLAIMED.
*/
-/*
- * $Id: sock.c,v 1.4 2002/08/04 21:23:58 maxk Exp $
- */
-
-#include <linux/module.h>
-
-#include <linux/types.h>
-#include <linux/capability.h>
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/poll.h>
-#include <linux/fcntl.h>
-#include <linux/skbuff.h>
-#include <linux/socket.h>
-#include <linux/ioctl.h>
-#include <linux/file.h>
-#include <linux/init.h>
#include <linux/compat.h>
-#include <net/sock.h>
-
-#include <asm/system.h>
-#include <asm/uaccess.h>
+#include <linux/export.h>
+#include <linux/file.h>
#include "bnep.h"
-#ifndef CONFIG_BT_BNEP_DEBUG
-#undef BT_DBG
-#define BT_DBG( A... )
-#endif
+static struct bt_sock_list bnep_sk_list = {
+ .lock = __RW_LOCK_UNLOCKED(bnep_sk_list.lock)
+};
static int bnep_sock_release(struct socket *sock)
{
@@ -65,56 +43,59 @@ static int bnep_sock_release(struct socket *sock)
if (!sk)
return 0;
+ bt_sock_unlink(&bnep_sk_list, sk);
+
sock_orphan(sk);
sock_put(sk);
return 0;
}
-static int bnep_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+static int do_bnep_sock_ioctl(struct socket *sock, unsigned int cmd, void __user *argp)
{
struct bnep_connlist_req cl;
struct bnep_connadd_req ca;
struct bnep_conndel_req cd;
struct bnep_conninfo ci;
struct socket *nsock;
- void __user *argp = (void __user *)arg;
+ __u32 supp_feat = BIT(BNEP_SETUP_RESPONSE);
int err;
- BT_DBG("cmd %x arg %lx", cmd, arg);
+ BT_DBG("cmd %x arg %p", cmd, argp);
switch (cmd) {
case BNEPCONNADD:
if (!capable(CAP_NET_ADMIN))
- return -EACCES;
+ return -EPERM;
if (copy_from_user(&ca, argp, sizeof(ca)))
return -EFAULT;
-
+
nsock = sockfd_lookup(ca.sock, &err);
if (!nsock)
return err;
if (nsock->sk->sk_state != BT_CONNECTED) {
- fput(nsock->file);
+ sockfd_put(nsock);
return -EBADFD;
}
+ ca.device[sizeof(ca.device)-1] = 0;
err = bnep_add_connection(&ca, nsock);
if (!err) {
- if (copy_to_user(argp, &ca, sizeof(ca)))
+ if (copy_to_user(argp, &ca, sizeof(ca)))
err = -EFAULT;
} else
- fput(nsock->file);
+ sockfd_put(nsock);
return err;
-
+
case BNEPCONNDEL:
if (!capable(CAP_NET_ADMIN))
- return -EACCES;
+ return -EPERM;
if (copy_from_user(&cd, argp, sizeof(cd)))
return -EFAULT;
-
+
return bnep_del_connection(&cd);
case BNEPGETCONNLIST:
@@ -123,7 +104,7 @@ static int bnep_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long
if (cl.cnum <= 0)
return -EINVAL;
-
+
err = bnep_get_connlist(&cl);
if (!err && copy_to_user(argp, &cl, sizeof(cl)))
return -EFAULT;
@@ -140,6 +121,12 @@ static int bnep_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long
return err;
+ case BNEPGETSUPPFEAT:
+ if (copy_to_user(argp, &supp_feat, sizeof(supp_feat)))
+ return -EFAULT;
+
+ return 0;
+
default:
return -EINVAL;
}
@@ -147,32 +134,38 @@ static int bnep_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long
return 0;
}
+static int bnep_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+ return do_bnep_sock_ioctl(sock, cmd, (void __user *)arg);
+}
+
#ifdef CONFIG_COMPAT
static int bnep_sock_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
+ void __user *argp = compat_ptr(arg);
if (cmd == BNEPGETCONNLIST) {
struct bnep_connlist_req cl;
- uint32_t uci;
+ unsigned __user *p = argp;
+ u32 uci;
int err;
- if (get_user(cl.cnum, (uint32_t __user *) arg) ||
- get_user(uci, (u32 __user *) (arg + 4)))
+ if (get_user(cl.cnum, p) || get_user(uci, p + 1))
return -EFAULT;
cl.ci = compat_ptr(uci);
if (cl.cnum <= 0)
return -EINVAL;
-
+
err = bnep_get_connlist(&cl);
- if (!err && put_user(cl.cnum, (uint32_t __user *) arg))
+ if (!err && put_user(cl.cnum, p))
err = -EFAULT;
return err;
}
- return bnep_sock_ioctl(sock, cmd, arg);
+ return do_bnep_sock_ioctl(sock, cmd, argp);
}
#endif
@@ -188,11 +181,8 @@ static const struct proto_ops bnep_sock_ops = {
.getname = sock_no_getname,
.sendmsg = sock_no_sendmsg,
.recvmsg = sock_no_recvmsg,
- .poll = sock_no_poll,
.listen = sock_no_listen,
.shutdown = sock_no_shutdown,
- .setsockopt = sock_no_setsockopt,
- .getsockopt = sock_no_getsockopt,
.connect = sock_no_connect,
.socketpair = sock_no_socketpair,
.accept = sock_no_accept,
@@ -205,7 +195,8 @@ static struct proto bnep_proto = {
.obj_size = sizeof(struct bt_sock)
};
-static int bnep_sock_create(struct socket *sock, int protocol)
+static int bnep_sock_create(struct net *net, struct socket *sock, int protocol,
+ int kern)
{
struct sock *sk;
@@ -214,25 +205,18 @@ static int bnep_sock_create(struct socket *sock, int protocol)
if (sock->type != SOCK_RAW)
return -ESOCKTNOSUPPORT;
- sk = sk_alloc(PF_BLUETOOTH, GFP_ATOMIC, &bnep_proto, 1);
+ sk = bt_sock_alloc(net, sock, &bnep_proto, protocol, GFP_ATOMIC, kern);
if (!sk)
return -ENOMEM;
- sock_init_data(sock, sk);
-
sock->ops = &bnep_sock_ops;
-
sock->state = SS_UNCONNECTED;
- sock_reset_flag(sk, SOCK_ZAPPED);
-
- sk->sk_protocol = protocol;
- sk->sk_state = BT_OPEN;
-
+ bt_sock_link(&bnep_sk_list, sk);
return 0;
}
-static struct net_proto_family bnep_sock_family_ops = {
+static const struct net_proto_family bnep_sock_family_ops = {
.family = PF_BLUETOOTH,
.owner = THIS_MODULE,
.create = bnep_sock_create
@@ -247,23 +231,30 @@ int __init bnep_sock_init(void)
return err;
err = bt_sock_register(BTPROTO_BNEP, &bnep_sock_family_ops);
- if (err < 0)
+ if (err < 0) {
+ BT_ERR("Can't register BNEP socket");
goto error;
+ }
+
+ err = bt_procfs_init(&init_net, "bnep", &bnep_sk_list, NULL);
+ if (err < 0) {
+ BT_ERR("Failed to create BNEP proc file");
+ bt_sock_unregister(BTPROTO_BNEP);
+ goto error;
+ }
+
+ BT_INFO("BNEP socket layer initialized");
return 0;
error:
- BT_ERR("Can't register BNEP socket");
proto_unregister(&bnep_proto);
return err;
}
-int __exit bnep_sock_cleanup(void)
+void __exit bnep_sock_cleanup(void)
{
- if (bt_sock_unregister(BTPROTO_BNEP) < 0)
- BT_ERR("Can't unregister BNEP socket");
-
+ bt_procfs_cleanup(&init_net, "bnep");
+ bt_sock_unregister(BTPROTO_BNEP);
proto_unregister(&bnep_proto);
-
- return 0;
}
diff --git a/net/bluetooth/cmtp/Kconfig b/net/bluetooth/cmtp/Kconfig
index d6b0382f6f3a..34e923466236 100644
--- a/net/bluetooth/cmtp/Kconfig
+++ b/net/bluetooth/cmtp/Kconfig
@@ -1,6 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0-only
config BT_CMTP
- tristate "CMTP protocol support"
- depends on BT && BT_L2CAP && ISDN_CAPI
+ tristate "CMTP protocol support (DEPRECATED)"
+ depends on BT_BREDR && ISDN_CAPI && DEPRECATED
help
CMTP (CAPI Message Transport Protocol) is a transport layer
for CAPI messages. CMTP is required for the Bluetooth Common
diff --git a/net/bluetooth/cmtp/Makefile b/net/bluetooth/cmtp/Makefile
index 890a9a5a6861..b2262ca97499 100644
--- a/net/bluetooth/cmtp/Makefile
+++ b/net/bluetooth/cmtp/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# Makefile for the Linux Bluetooth CMTP layer
#
diff --git a/net/bluetooth/cmtp/capi.c b/net/bluetooth/cmtp/capi.c
index be04e9fb11f6..884703fda979 100644
--- a/net/bluetooth/cmtp/capi.c
+++ b/net/bluetooth/cmtp/capi.c
@@ -1,4 +1,4 @@
-/*
+/*
CMTP implementation for Linux Bluetooth stack (BlueZ).
Copyright (C) 2002-2003 Marcel Holtmann <marcel@holtmann.org>
@@ -10,22 +10,23 @@
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
- CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
- WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
- COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+ ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+ COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
SOFTWARE IS DISCLAIMED.
*/
-#include <linux/module.h>
-
+#include <linux/export.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
#include <linux/types.h>
#include <linux/errno.h>
#include <linux/kernel.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/fcntl.h>
@@ -34,6 +35,7 @@
#include <linux/ioctl.h>
#include <linux/file.h>
#include <linux/wait.h>
+#include <linux/kthread.h>
#include <net/sock.h>
#include <linux/isdn/capilli.h>
@@ -42,11 +44,6 @@
#include "cmtp.h"
-#ifndef CONFIG_BT_CMTP_DEBUG
-#undef BT_DBG
-#define BT_DBG(D...)
-#endif
-
#define CAPI_INTEROPERABILITY 0x20
#define CAPI_INTEROPERABILITY_REQ CAPICMD(CAPI_INTEROPERABILITY, CAPI_REQ)
@@ -77,7 +74,7 @@ static struct cmtp_application *cmtp_application_add(struct cmtp_session *sessio
{
struct cmtp_application *app = kzalloc(sizeof(*app), GFP_KERNEL);
- BT_DBG("session %p application %p appl %d", session, app, appl);
+ BT_DBG("session %p application %p appl %u", session, app, appl);
if (!app)
return NULL;
@@ -103,10 +100,8 @@ static void cmtp_application_del(struct cmtp_session *session, struct cmtp_appli
static struct cmtp_application *cmtp_application_get(struct cmtp_session *session, int pattern, __u16 value)
{
struct cmtp_application *app;
- struct list_head *p, *n;
- list_for_each_safe(p, n, &session->applications) {
- app = list_entry(p, struct cmtp_application, list);
+ list_for_each_entry(app, &session->applications, list) {
switch (pattern) {
case CMTP_MSGNUM:
if (app->msgnum == value)
@@ -140,14 +135,14 @@ static void cmtp_send_capimsg(struct cmtp_session *session, struct sk_buff *skb)
{
struct cmtp_scb *scb = (void *) skb->cb;
- BT_DBG("session %p skb %p len %d", session, skb, skb->len);
+ BT_DBG("session %p skb %p len %u", session, skb, skb->len);
scb->id = -1;
scb->data = (CAPIMSG_COMMAND(skb->data) == CAPI_DATA_B3);
skb_queue_tail(&session->transmit, skb);
- cmtp_schedule(session);
+ wake_up_interruptible(sk_sleep(session->sock->sk));
}
static void cmtp_send_interopmsg(struct cmtp_session *session,
@@ -157,9 +152,10 @@ static void cmtp_send_interopmsg(struct cmtp_session *session,
struct sk_buff *skb;
unsigned char *s;
- BT_DBG("session %p subcmd 0x%02x appl %d msgnum %d", session, subcmd, appl, msgnum);
+ BT_DBG("session %p subcmd 0x%02x appl %u msgnum %u", session, subcmd, appl, msgnum);
- if (!(skb = alloc_skb(CAPI_MSG_BASELEN + 6 + len, GFP_ATOMIC))) {
+ skb = alloc_skb(CAPI_MSG_BASELEN + 6 + len, GFP_ATOMIC);
+ if (!skb) {
BT_ERR("Can't allocate memory for interoperability packet");
return;
}
@@ -192,10 +188,13 @@ static void cmtp_recv_interopmsg(struct cmtp_session *session, struct sk_buff *s
__u16 appl, msgnum, func, info;
__u32 controller;
- BT_DBG("session %p skb %p len %d", session, skb, skb->len);
+ BT_DBG("session %p skb %p len %u", session, skb, skb->len);
switch (CAPIMSG_SUBCOMMAND(skb->data)) {
case CAPI_CONF:
+ if (skb->len < CAPI_MSG_BASELEN + 10)
+ break;
+
func = CAPIMSG_U16(skb->data, CAPI_MSG_BASELEN + 5);
info = CAPIMSG_U16(skb->data, CAPI_MSG_BASELEN + 8);
@@ -226,6 +225,9 @@ static void cmtp_recv_interopmsg(struct cmtp_session *session, struct sk_buff *s
break;
case CAPI_FUNCTION_GET_PROFILE:
+ if (skb->len < CAPI_MSG_BASELEN + 11 + sizeof(capi_profile))
+ break;
+
controller = CAPIMSG_U16(skb->data, CAPI_MSG_BASELEN + 11);
msgnum = CAPIMSG_MSGID(skb->data);
@@ -246,18 +248,15 @@ static void cmtp_recv_interopmsg(struct cmtp_session *session, struct sk_buff *s
break;
case CAPI_FUNCTION_GET_MANUFACTURER:
- controller = CAPIMSG_U32(skb->data, CAPI_MSG_BASELEN + 10);
-
- if (!info && ctrl) {
- strncpy(ctrl->manu,
- skb->data + CAPI_MSG_BASELEN + 15,
- skb->data[CAPI_MSG_BASELEN + 14]);
- }
-
+ if (!info && ctrl && skb->len > CAPI_MSG_BASELEN + 14)
+ strscpy_pad(ctrl->manu,
+ skb->data + CAPI_MSG_BASELEN + 15,
+ skb->data[CAPI_MSG_BASELEN + 14]);
break;
case CAPI_FUNCTION_GET_VERSION:
- controller = CAPIMSG_U32(skb->data, CAPI_MSG_BASELEN + 12);
+ if (skb->len < CAPI_MSG_BASELEN + 32)
+ break;
if (!info && ctrl) {
ctrl->version.majorversion = CAPIMSG_U32(skb->data, CAPI_MSG_BASELEN + 16);
@@ -269,29 +268,28 @@ static void cmtp_recv_interopmsg(struct cmtp_session *session, struct sk_buff *s
break;
case CAPI_FUNCTION_GET_SERIAL_NUMBER:
- controller = CAPIMSG_U32(skb->data, CAPI_MSG_BASELEN + 12);
-
- if (!info && ctrl) {
- memset(ctrl->serial, 0, CAPI_SERIAL_LEN);
- strncpy(ctrl->serial,
- skb->data + CAPI_MSG_BASELEN + 17,
- skb->data[CAPI_MSG_BASELEN + 16]);
- }
-
+ if (!info && ctrl && skb->len > CAPI_MSG_BASELEN + 16)
+ strscpy_pad(ctrl->serial,
+ skb->data + CAPI_MSG_BASELEN + 17,
+ skb->data[CAPI_MSG_BASELEN + 16]);
break;
}
break;
case CAPI_IND:
+ if (skb->len < CAPI_MSG_BASELEN + 6)
+ break;
+
func = CAPIMSG_U16(skb->data, CAPI_MSG_BASELEN + 3);
if (func == CAPI_FUNCTION_LOOPBACK) {
+ int len = min_t(uint, skb->len - CAPI_MSG_BASELEN - 6,
+ skb->data[CAPI_MSG_BASELEN + 5]);
appl = CAPIMSG_APPID(skb->data);
msgnum = CAPIMSG_MSGID(skb->data);
cmtp_send_interopmsg(session, CAPI_RESP, appl, msgnum, func,
- skb->data + CAPI_MSG_BASELEN + 6,
- skb->data[CAPI_MSG_BASELEN + 5]);
+ skb->data + CAPI_MSG_BASELEN + 6, len);
}
break;
@@ -304,22 +302,24 @@ void cmtp_recv_capimsg(struct cmtp_session *session, struct sk_buff *skb)
{
struct capi_ctr *ctrl = &session->ctrl;
struct cmtp_application *application;
- __u16 cmd, appl;
+ __u16 appl;
__u32 contr;
- BT_DBG("session %p skb %p len %d", session, skb, skb->len);
+ BT_DBG("session %p skb %p len %u", session, skb, skb->len);
+
+ if (skb->len < CAPI_MSG_BASELEN)
+ return;
if (CAPIMSG_COMMAND(skb->data) == CAPI_INTEROPERABILITY) {
cmtp_recv_interopmsg(session, skb);
return;
}
- if (session->flags & (1 << CMTP_LOOPBACK)) {
+ if (session->flags & BIT(CMTP_LOOPBACK)) {
kfree_skb(skb);
return;
}
- cmd = CAPICMD(CAPIMSG_COMMAND(skb->data), CAPIMSG_SUBCOMMAND(skb->data));
appl = CAPIMSG_APPID(skb->data);
contr = CAPIMSG_CONTROL(skb->data);
@@ -328,7 +328,7 @@ void cmtp_recv_capimsg(struct cmtp_session *session, struct sk_buff *skb)
appl = application->appl;
CAPIMSG_SETAPPID(skb->data, appl);
} else {
- BT_ERR("Can't find application with id %d", appl);
+ BT_ERR("Can't find application with id %u", appl);
kfree_skb(skb);
return;
}
@@ -338,12 +338,6 @@ void cmtp_recv_capimsg(struct cmtp_session *session, struct sk_buff *skb)
CAPIMSG_SETCONTROL(skb->data, contr);
}
- if (!ctrl) {
- BT_ERR("Can't find controller %d for message", session->num);
- kfree_skb(skb);
- return;
- }
-
capi_ctr_handle_message(ctrl, appl, skb);
}
@@ -360,10 +354,10 @@ static void cmtp_reset_ctr(struct capi_ctr *ctrl)
BT_DBG("ctrl %p", ctrl);
- capi_ctr_reseted(ctrl);
+ capi_ctr_down(ctrl);
atomic_inc(&session->terminate);
- cmtp_schedule(session);
+ wake_up_process(session->task);
}
static void cmtp_register_appl(struct capi_ctr *ctrl, __u16 appl, capi_register_params *rp)
@@ -375,8 +369,8 @@ static void cmtp_register_appl(struct capi_ctr *ctrl, __u16 appl, capi_register_
unsigned char buf[8];
int err = 0, nconn, want = rp->level3cnt;
- BT_DBG("ctrl %p appl %d level3cnt %d datablkcnt %d datablklen %d",
- ctrl, appl, rp->level3cnt, rp->datablkcnt, rp->datablklen);
+ BT_DBG("ctrl %p appl %u level3cnt %u datablkcnt %u datablklen %u",
+ ctrl, appl, rp->level3cnt, rp->datablkcnt, rp->datablklen);
application = cmtp_application_add(session, appl);
if (!application) {
@@ -440,7 +434,7 @@ static void cmtp_release_appl(struct capi_ctr *ctrl, __u16 appl)
struct cmtp_session *session = ctrl->driverdata;
struct cmtp_application *application;
- BT_DBG("ctrl %p appl %d", ctrl, appl);
+ BT_DBG("ctrl %p appl %u", ctrl, appl);
application = cmtp_application_get(session, CMTP_APPLID, appl);
if (!application) {
@@ -473,7 +467,7 @@ static u16 cmtp_send_message(struct capi_ctr *ctrl, struct sk_buff *skb)
application = cmtp_application_get(session, CMTP_APPLID, appl);
if ((!application) || (application->state != BT_CONNECTED)) {
- BT_ERR("Can't find application with id %d", appl);
+ BT_ERR("Can't find application with id %u", appl);
return CAPI_ILLAPPNR;
}
@@ -494,34 +488,23 @@ static char *cmtp_procinfo(struct capi_ctr *ctrl)
return "CAPI Message Transport Protocol";
}
-static int cmtp_ctr_read_proc(char *page, char **start, off_t off, int count, int *eof, struct capi_ctr *ctrl)
+static int cmtp_proc_show(struct seq_file *m, void *v)
{
+ struct capi_ctr *ctrl = m->private;
struct cmtp_session *session = ctrl->driverdata;
struct cmtp_application *app;
- struct list_head *p, *n;
- int len = 0;
- len += sprintf(page + len, "%s\n\n", cmtp_procinfo(ctrl));
- len += sprintf(page + len, "addr %s\n", session->name);
- len += sprintf(page + len, "ctrl %d\n", session->num);
+ seq_printf(m, "%s\n\n", cmtp_procinfo(ctrl));
+ seq_printf(m, "addr %s\n", session->name);
+ seq_printf(m, "ctrl %d\n", session->num);
- list_for_each_safe(p, n, &session->applications) {
- app = list_entry(p, struct cmtp_application, list);
- len += sprintf(page + len, "appl %d -> %d\n", app->appl, app->mapping);
+ list_for_each_entry(app, &session->applications, list) {
+ seq_printf(m, "appl %u -> %u\n", app->appl, app->mapping);
}
- if (off + count >= len)
- *eof = 1;
-
- if (len < off)
- return 0;
-
- *start = page + off;
-
- return ((count < len - off) ? count : len - off);
+ return 0;
}
-
int cmtp_attach_device(struct cmtp_session *session)
{
unsigned char buf[4];
@@ -536,7 +519,7 @@ int cmtp_attach_device(struct cmtp_session *session)
ret = wait_event_interruptible_timeout(session->wait,
session->ncontroller, CMTP_INTEROP_TIMEOUT);
-
+
BT_INFO("Found %d CAPI controller(s) on device %s", session->ncontroller, session->name);
if (!ret)
@@ -560,7 +543,7 @@ int cmtp_attach_device(struct cmtp_session *session)
session->ctrl.send_message = cmtp_send_message;
session->ctrl.procinfo = cmtp_procinfo;
- session->ctrl.ctr_read_proc = cmtp_ctr_read_proc;
+ session->ctrl.proc_show = cmtp_proc_show;
if (attach_capi_ctr(&session->ctrl) < 0) {
BT_ERR("Can't attach new controller");
diff --git a/net/bluetooth/cmtp/cmtp.h b/net/bluetooth/cmtp/cmtp.h
index 40e3dfec0cc8..f6b9dc4e408f 100644
--- a/net/bluetooth/cmtp/cmtp.h
+++ b/net/bluetooth/cmtp/cmtp.h
@@ -1,4 +1,4 @@
-/*
+/*
CMTP implementation for Linux Bluetooth stack (BlueZ).
Copyright (C) 2002-2003 Marcel Holtmann <marcel@holtmann.org>
@@ -10,13 +10,13 @@
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
- CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
- WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
- COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+ ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+ COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
SOFTWARE IS DISCLAIMED.
*/
@@ -26,7 +26,7 @@
#include <linux/types.h>
#include <net/bluetooth/bluetooth.h>
-#define BTNAMSIZ 18
+#define BTNAMSIZ 21
/* CMTP ioctl defines */
#define CMTPCONNADD _IOW('C', 200, int)
@@ -37,7 +37,7 @@
#define CMTP_LOOPBACK 0
struct cmtp_connadd_req {
- int sock; // Connected socket
+ int sock; /* Connected socket */
__u32 flags;
};
@@ -82,6 +82,7 @@ struct cmtp_session {
char name[BTNAMSIZ];
atomic_t terminate;
+ struct task_struct *task;
wait_queue_head_t wait;
@@ -121,13 +122,6 @@ void cmtp_detach_device(struct cmtp_session *session);
void cmtp_recv_capimsg(struct cmtp_session *session, struct sk_buff *skb);
-static inline void cmtp_schedule(struct cmtp_session *session)
-{
- struct sock *sk = session->sock->sk;
-
- wake_up_interruptible(sk->sk_sleep);
-}
-
/* CMTP init defines */
int cmtp_init_sockets(void);
void cmtp_cleanup_sockets(void);
diff --git a/net/bluetooth/cmtp/core.c b/net/bluetooth/cmtp/core.c
index b81a01c64aea..90d130588a3e 100644
--- a/net/bluetooth/cmtp/core.c
+++ b/net/bluetooth/cmtp/core.c
@@ -1,4 +1,4 @@
-/*
+/*
CMTP implementation for Linux Bluetooth stack (BlueZ).
Copyright (C) 2002-2003 Marcel Holtmann <marcel@holtmann.org>
@@ -10,13 +10,13 @@
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
- CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
- WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
- COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+ ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+ COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
SOFTWARE IS DISCLAIMED.
*/
@@ -29,11 +29,13 @@
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/fcntl.h>
+#include <linux/freezer.h>
#include <linux/skbuff.h>
#include <linux/socket.h>
#include <linux/ioctl.h>
#include <linux/file.h>
#include <linux/init.h>
+#include <linux/kthread.h>
#include <net/sock.h>
#include <linux/isdn/capilli.h>
@@ -43,11 +45,6 @@
#include "cmtp.h"
-#ifndef CONFIG_BT_CMTP_DEBUG
-#undef BT_DBG
-#define BT_DBG(D...)
-#endif
-
#define VERSION "1.0"
static DECLARE_RWSEM(cmtp_session_sem);
@@ -56,35 +53,33 @@ static LIST_HEAD(cmtp_session_list);
static struct cmtp_session *__cmtp_get_session(bdaddr_t *bdaddr)
{
struct cmtp_session *session;
- struct list_head *p;
BT_DBG("");
- list_for_each(p, &cmtp_session_list) {
- session = list_entry(p, struct cmtp_session, list);
+ list_for_each_entry(session, &cmtp_session_list, list)
if (!bacmp(bdaddr, &session->bdaddr))
return session;
- }
+
return NULL;
}
static void __cmtp_link_session(struct cmtp_session *session)
{
- __module_get(THIS_MODULE);
list_add(&session->list, &cmtp_session_list);
}
static void __cmtp_unlink_session(struct cmtp_session *session)
{
list_del(&session->list);
- module_put(THIS_MODULE);
}
static void __cmtp_copy_session(struct cmtp_session *session, struct cmtp_conninfo *ci)
{
+ u32 valid_flags = BIT(CMTP_LOOPBACK);
+ memset(ci, 0, sizeof(*ci));
bacpy(&ci->bdaddr, &session->bdaddr);
- ci->flags = session->flags;
+ ci->flags = session->flags & valid_flags;
ci->state = session->state;
ci->num = session->num;
@@ -118,20 +113,20 @@ static inline void cmtp_add_msgpart(struct cmtp_session *session, int id, const
size = (skb) ? skb->len + count : count;
- if (!(nskb = alloc_skb(size, GFP_ATOMIC))) {
+ nskb = alloc_skb(size, GFP_ATOMIC);
+ if (!nskb) {
BT_ERR("Can't allocate memory for CAPI message");
return;
}
if (skb && (skb->len > 0))
- memcpy(skb_put(nskb, skb->len), skb->data, skb->len);
+ skb_copy_from_linear_data(skb, skb_put(nskb, skb->len), skb->len);
- memcpy(skb_put(nskb, count), buf, count);
+ skb_put_data(nskb, buf, count);
session->reassembly[id] = nskb;
- if (skb)
- kfree_skb(skb);
+ kfree_skb(skb);
}
static inline int cmtp_recv_frame(struct cmtp_session *session, struct sk_buff *skb)
@@ -183,8 +178,7 @@ static inline int cmtp_recv_frame(struct cmtp_session *session, struct sk_buff *
cmtp_add_msgpart(session, id, skb->data + hdrlen, len);
break;
default:
- if (session->reassembly[id] != NULL)
- kfree_skb(session->reassembly[id]);
+ kfree_skb(session->reassembly[id]);
session->reassembly[id] = NULL;
break;
}
@@ -220,7 +214,8 @@ static void cmtp_process_transmit(struct cmtp_session *session)
BT_DBG("session %p", session);
- if (!(nskb = alloc_skb(session->mtu, GFP_ATOMIC))) {
+ nskb = alloc_skb(session->mtu, GFP_ATOMIC);
+ if (!nskb) {
BT_ERR("Can't allocate memory for new frame");
return;
}
@@ -228,7 +223,8 @@ static void cmtp_process_transmit(struct cmtp_session *session)
while ((skb = skb_dequeue(&session->transmit))) {
struct cmtp_scb *scb = (void *) skb->cb;
- if ((tail = (session->mtu - nskb->len)) < 5) {
+ tail = session->mtu - nskb->len;
+ if (tail < 5) {
cmtp_send_frame(session, nskb->data, nskb->len);
skb_trim(nskb, 0);
tail = session->mtu;
@@ -236,9 +232,12 @@ static void cmtp_process_transmit(struct cmtp_session *session)
size = min_t(uint, ((tail < 258) ? (tail - 2) : (tail - 3)), skb->len);
- if ((scb->id < 0) && ((scb->id = cmtp_alloc_block_id(session)) < 0)) {
- skb_queue_head(&session->transmit, skb);
- break;
+ if (scb->id < 0) {
+ scb->id = cmtp_alloc_block_id(session);
+ if (scb->id < 0) {
+ skb_queue_head(&session->transmit, skb);
+ break;
+ }
}
if (size < 256) {
@@ -256,7 +255,7 @@ static void cmtp_process_transmit(struct cmtp_session *session)
hdr[2] = size >> 8;
}
- memcpy(skb_put(nskb, size), skb->data, size);
+ skb_copy_from_linear_data(skb, skb_put(nskb, size), size);
skb_pull(skb, size);
if (skb->len > 0) {
@@ -281,37 +280,40 @@ static int cmtp_session(void *arg)
struct cmtp_session *session = arg;
struct sock *sk = session->sock->sk;
struct sk_buff *skb;
- wait_queue_t wait;
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
BT_DBG("session %p", session);
- daemonize("kcmtpd_ctr_%d", session->num);
set_user_nice(current, -15);
- current->flags |= PF_NOFREEZE;
-
- init_waitqueue_entry(&wait, current);
- add_wait_queue(sk->sk_sleep, &wait);
- while (!atomic_read(&session->terminate)) {
- set_current_state(TASK_INTERRUPTIBLE);
+ add_wait_queue(sk_sleep(sk), &wait);
+ while (1) {
+ if (atomic_read(&session->terminate))
+ break;
if (sk->sk_state != BT_CONNECTED)
break;
while ((skb = skb_dequeue(&sk->sk_receive_queue))) {
skb_orphan(skb);
- cmtp_recv_frame(session, skb);
+ if (!skb_linearize(skb))
+ cmtp_recv_frame(session, skb);
+ else
+ kfree_skb(skb);
}
cmtp_process_transmit(session);
- schedule();
+ /*
+ * wait_woken() performs the necessary memory barriers
+ * for us; see the header comment for this primitive.
+ */
+ wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
}
- set_current_state(TASK_RUNNING);
- remove_wait_queue(sk->sk_sleep, &wait);
+ remove_wait_queue(sk_sleep(sk), &wait);
down_write(&cmtp_session_sem);
- if (!(session->flags & (1 << CMTP_LOOPBACK)))
+ if (!(session->flags & BIT(CMTP_LOOPBACK)))
cmtp_detach_device(session);
fput(session->sock->file);
@@ -321,39 +323,44 @@ static int cmtp_session(void *arg)
up_write(&cmtp_session_sem);
kfree(session);
+ module_put_and_kthread_exit(0);
return 0;
}
int cmtp_add_connection(struct cmtp_connadd_req *req, struct socket *sock)
{
+ u32 valid_flags = BIT(CMTP_LOOPBACK);
struct cmtp_session *session, *s;
- bdaddr_t src, dst;
int i, err;
BT_DBG("");
- baswap(&src, &bt_sk(sock->sk)->src);
- baswap(&dst, &bt_sk(sock->sk)->dst);
+ if (!l2cap_is_socket(sock))
+ return -EBADFD;
+
+ if (req->flags & ~valid_flags)
+ return -EINVAL;
session = kzalloc(sizeof(struct cmtp_session), GFP_KERNEL);
- if (!session)
+ if (!session)
return -ENOMEM;
down_write(&cmtp_session_sem);
- s = __cmtp_get_session(&bt_sk(sock->sk)->dst);
+ s = __cmtp_get_session(&l2cap_pi(sock->sk)->chan->dst);
if (s && s->state == BT_CONNECTED) {
err = -EEXIST;
goto failed;
}
- bacpy(&session->bdaddr, &bt_sk(sock->sk)->dst);
+ bacpy(&session->bdaddr, &l2cap_pi(sock->sk)->chan->dst);
- session->mtu = min_t(uint, l2cap_pi(sock->sk)->omtu, l2cap_pi(sock->sk)->imtu);
+ session->mtu = min_t(uint, l2cap_pi(sock->sk)->chan->omtu,
+ l2cap_pi(sock->sk)->chan->imtu);
BT_DBG("mtu %d", session->mtu);
- sprintf(session->name, "%s", batostr(&dst));
+ sprintf(session->name, "%pMR", &session->bdaddr);
session->sock = sock;
session->state = BT_CONFIG;
@@ -373,22 +380,33 @@ int cmtp_add_connection(struct cmtp_connadd_req *req, struct socket *sock)
__cmtp_link_session(session);
- err = kernel_thread(cmtp_session, session, CLONE_KERNEL);
- if (err < 0)
+ __module_get(THIS_MODULE);
+ session->task = kthread_run(cmtp_session, session, "kcmtpd_ctr_%d",
+ session->num);
+ if (IS_ERR(session->task)) {
+ module_put(THIS_MODULE);
+ err = PTR_ERR(session->task);
goto unlink;
+ }
- if (!(session->flags & (1 << CMTP_LOOPBACK))) {
+ if (!(session->flags & BIT(CMTP_LOOPBACK))) {
err = cmtp_attach_device(session);
- if (err < 0)
- goto detach;
+ if (err < 0) {
+ /* Caller will call fput in case of failure, and so
+ * will cmtp_session kthread.
+ */
+ get_file(session->sock->file);
+
+ atomic_inc(&session->terminate);
+ wake_up_interruptible(sk_sleep(session->sock->sk));
+ up_write(&cmtp_session_sem);
+ return err;
+ }
}
up_write(&cmtp_session_sem);
return 0;
-detach:
- cmtp_detach_device(session);
-
unlink:
__cmtp_unlink_session(session);
@@ -400,11 +418,15 @@ failed:
int cmtp_del_connection(struct cmtp_conndel_req *req)
{
+ u32 valid_flags = 0;
struct cmtp_session *session;
int err = 0;
BT_DBG("");
+ if (req->flags & ~valid_flags)
+ return -EINVAL;
+
down_read(&cmtp_session_sem);
session = __cmtp_get_session(&req->bdaddr);
@@ -412,9 +434,14 @@ int cmtp_del_connection(struct cmtp_conndel_req *req)
/* Flush the transmit queue */
skb_queue_purge(&session->transmit);
- /* Kill session thread */
+ /* Stop session thread */
atomic_inc(&session->terminate);
- cmtp_schedule(session);
+
+ /*
+ * See the comment preceding the call to wait_woken()
+ * in cmtp_session().
+ */
+ wake_up_interruptible(sk_sleep(session->sock->sk));
} else
err = -ENOENT;
@@ -424,19 +451,16 @@ int cmtp_del_connection(struct cmtp_conndel_req *req)
int cmtp_get_connlist(struct cmtp_connlist_req *req)
{
- struct list_head *p;
+ struct cmtp_session *session;
int err = 0, n = 0;
BT_DBG("");
down_read(&cmtp_session_sem);
- list_for_each(p, &cmtp_session_list) {
- struct cmtp_session *session;
+ list_for_each_entry(session, &cmtp_session_list, list) {
struct cmtp_conninfo ci;
- session = list_entry(p, struct cmtp_session, list);
-
__cmtp_copy_session(session, &ci);
if (copy_to_user(req->ci, &ci, sizeof(ci))) {
@@ -475,13 +499,9 @@ int cmtp_get_conninfo(struct cmtp_conninfo *ci)
static int __init cmtp_init(void)
{
- l2cap_load();
-
BT_INFO("CMTP (CAPI Emulation) ver %s", VERSION);
- cmtp_init_sockets();
-
- return 0;
+ return cmtp_init_sockets();
}
static void __exit cmtp_exit(void)
diff --git a/net/bluetooth/cmtp/sock.c b/net/bluetooth/cmtp/sock.c
index 53295d33dc5c..96d49d9fae96 100644
--- a/net/bluetooth/cmtp/sock.c
+++ b/net/bluetooth/cmtp/sock.c
@@ -1,4 +1,4 @@
-/*
+/*
CMTP implementation for Linux Bluetooth stack (BlueZ).
Copyright (C) 2002-2003 Marcel Holtmann <marcel@holtmann.org>
@@ -10,24 +10,22 @@
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
- CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
- WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
- COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+ ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+ COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
SOFTWARE IS DISCLAIMED.
*/
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/types.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/fcntl.h>
#include <linux/skbuff.h>
@@ -35,19 +33,18 @@
#include <linux/ioctl.h>
#include <linux/file.h>
#include <linux/compat.h>
+#include <linux/gfp.h>
+#include <linux/uaccess.h>
#include <net/sock.h>
#include <linux/isdn/capilli.h>
-#include <asm/system.h>
-#include <asm/uaccess.h>
#include "cmtp.h"
-#ifndef CONFIG_BT_CMTP_DEBUG
-#undef BT_DBG
-#define BT_DBG(D...)
-#endif
+static struct bt_sock_list cmtp_sk_list = {
+ .lock = __RW_LOCK_UNLOCKED(cmtp_sk_list.lock)
+};
static int cmtp_sock_release(struct socket *sock)
{
@@ -58,28 +55,29 @@ static int cmtp_sock_release(struct socket *sock)
if (!sk)
return 0;
+ bt_sock_unlink(&cmtp_sk_list, sk);
+
sock_orphan(sk);
sock_put(sk);
return 0;
}
-static int cmtp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+static int do_cmtp_sock_ioctl(struct socket *sock, unsigned int cmd, void __user *argp)
{
struct cmtp_connadd_req ca;
struct cmtp_conndel_req cd;
struct cmtp_connlist_req cl;
struct cmtp_conninfo ci;
struct socket *nsock;
- void __user *argp = (void __user *)arg;
int err;
- BT_DBG("cmd %x arg %lx", cmd, arg);
+ BT_DBG("cmd %x arg %p", cmd, argp);
switch (cmd) {
case CMTPCONNADD:
if (!capable(CAP_NET_ADMIN))
- return -EACCES;
+ return -EPERM;
if (copy_from_user(&ca, argp, sizeof(ca)))
return -EFAULT;
@@ -89,7 +87,7 @@ static int cmtp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long
return err;
if (nsock->sk->sk_state != BT_CONNECTED) {
- fput(nsock->file);
+ sockfd_put(nsock);
return -EBADFD;
}
@@ -98,13 +96,13 @@ static int cmtp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long
if (copy_to_user(argp, &ca, sizeof(ca)))
err = -EFAULT;
} else
- fput(nsock->file);
+ sockfd_put(nsock);
return err;
case CMTPCONNDEL:
if (!capable(CAP_NET_ADMIN))
- return -EACCES;
+ return -EPERM;
if (copy_from_user(&cd, argp, sizeof(cd)))
return -EFAULT;
@@ -138,32 +136,38 @@ static int cmtp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long
return -EINVAL;
}
+static int cmtp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+ return do_cmtp_sock_ioctl(sock, cmd, (void __user *)arg);
+}
+
#ifdef CONFIG_COMPAT
static int cmtp_sock_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
+ void __user *argp = compat_ptr(arg);
if (cmd == CMTPGETCONNLIST) {
struct cmtp_connlist_req cl;
- uint32_t uci;
+ u32 __user *p = argp;
+ u32 uci;
int err;
- if (get_user(cl.cnum, (uint32_t __user *) arg) ||
- get_user(uci, (u32 __user *) (arg + 4)))
+ if (get_user(cl.cnum, p) || get_user(uci, p + 1))
return -EFAULT;
cl.ci = compat_ptr(uci);
if (cl.cnum <= 0)
return -EINVAL;
-
+
err = cmtp_get_connlist(&cl);
- if (!err && put_user(cl.cnum, (uint32_t __user *) arg))
+ if (!err && put_user(cl.cnum, p))
err = -EFAULT;
return err;
}
- return cmtp_sock_ioctl(sock, cmd, arg);
+ return do_cmtp_sock_ioctl(sock, cmd, argp);
}
#endif
@@ -179,11 +183,8 @@ static const struct proto_ops cmtp_sock_ops = {
.getname = sock_no_getname,
.sendmsg = sock_no_sendmsg,
.recvmsg = sock_no_recvmsg,
- .poll = sock_no_poll,
.listen = sock_no_listen,
.shutdown = sock_no_shutdown,
- .setsockopt = sock_no_setsockopt,
- .getsockopt = sock_no_getsockopt,
.connect = sock_no_connect,
.socketpair = sock_no_socketpair,
.accept = sock_no_accept,
@@ -196,7 +197,8 @@ static struct proto cmtp_proto = {
.obj_size = sizeof(struct bt_sock)
};
-static int cmtp_sock_create(struct socket *sock, int protocol)
+static int cmtp_sock_create(struct net *net, struct socket *sock, int protocol,
+ int kern)
{
struct sock *sk;
@@ -205,7 +207,7 @@ static int cmtp_sock_create(struct socket *sock, int protocol)
if (sock->type != SOCK_RAW)
return -ESOCKTNOSUPPORT;
- sk = sk_alloc(PF_BLUETOOTH, GFP_ATOMIC, &cmtp_proto, 1);
+ sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &cmtp_proto, kern);
if (!sk)
return -ENOMEM;
@@ -220,10 +222,12 @@ static int cmtp_sock_create(struct socket *sock, int protocol)
sk->sk_protocol = protocol;
sk->sk_state = BT_OPEN;
+ bt_sock_link(&cmtp_sk_list, sk);
+
return 0;
}
-static struct net_proto_family cmtp_sock_family_ops = {
+static const struct net_proto_family cmtp_sock_family_ops = {
.family = PF_BLUETOOTH,
.owner = THIS_MODULE,
.create = cmtp_sock_create
@@ -238,21 +242,30 @@ int cmtp_init_sockets(void)
return err;
err = bt_sock_register(BTPROTO_CMTP, &cmtp_sock_family_ops);
- if (err < 0)
+ if (err < 0) {
+ BT_ERR("Can't register CMTP socket");
+ goto error;
+ }
+
+ err = bt_procfs_init(&init_net, "cmtp", &cmtp_sk_list, NULL);
+ if (err < 0) {
+ BT_ERR("Failed to create CMTP proc file");
+ bt_sock_unregister(BTPROTO_HIDP);
goto error;
+ }
+
+ BT_INFO("CMTP socket layer initialized");
return 0;
error:
- BT_ERR("Can't register CMTP socket");
proto_unregister(&cmtp_proto);
return err;
}
void cmtp_cleanup_sockets(void)
{
- if (bt_sock_unregister(BTPROTO_CMTP) < 0)
- BT_ERR("Can't unregister CMTP socket");
-
+ bt_procfs_cleanup(&init_net, "cmtp");
+ bt_sock_unregister(BTPROTO_CMTP);
proto_unregister(&cmtp_proto);
}
diff --git a/net/bluetooth/coredump.c b/net/bluetooth/coredump.c
new file mode 100644
index 000000000000..720cb79adf96
--- /dev/null
+++ b/net/bluetooth/coredump.c
@@ -0,0 +1,553 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2023 Google Corporation
+ */
+
+#include <linux/devcoredump.h>
+
+#include <linux/unaligned.h>
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+
+enum hci_devcoredump_pkt_type {
+ HCI_DEVCOREDUMP_PKT_INIT,
+ HCI_DEVCOREDUMP_PKT_SKB,
+ HCI_DEVCOREDUMP_PKT_PATTERN,
+ HCI_DEVCOREDUMP_PKT_COMPLETE,
+ HCI_DEVCOREDUMP_PKT_ABORT,
+};
+
+struct hci_devcoredump_skb_cb {
+ u16 pkt_type;
+};
+
+struct hci_devcoredump_skb_pattern {
+ u8 pattern;
+ u32 len;
+} __packed;
+
+#define hci_dmp_cb(skb) ((struct hci_devcoredump_skb_cb *)((skb)->cb))
+
+#define DBG_UNEXPECTED_STATE() \
+ bt_dev_dbg(hdev, \
+ "Unexpected packet (%d) for state (%d). ", \
+ hci_dmp_cb(skb)->pkt_type, hdev->dump.state)
+
+#define MAX_DEVCOREDUMP_HDR_SIZE 512 /* bytes */
+
+static int hci_devcd_update_hdr_state(char *buf, size_t size, int state)
+{
+ int len = 0;
+
+ if (!buf)
+ return 0;
+
+ len = scnprintf(buf, size, "Bluetooth devcoredump\nState: %d\n", state);
+
+ return len + 1; /* scnprintf adds \0 at the end upon state rewrite */
+}
+
+/* Call with hci_dev_lock only. */
+static int hci_devcd_update_state(struct hci_dev *hdev, int state)
+{
+ bt_dev_dbg(hdev, "Updating devcoredump state from %d to %d.",
+ hdev->dump.state, state);
+
+ hdev->dump.state = state;
+
+ return hci_devcd_update_hdr_state(hdev->dump.head,
+ hdev->dump.alloc_size, state);
+}
+
+static int hci_devcd_mkheader(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ char dump_start[] = "--- Start dump ---\n";
+ char hdr[80];
+ int hdr_len;
+
+ hdr_len = hci_devcd_update_hdr_state(hdr, sizeof(hdr),
+ HCI_DEVCOREDUMP_IDLE);
+ skb_put_data(skb, hdr, hdr_len);
+
+ if (hdev->dump.dmp_hdr)
+ hdev->dump.dmp_hdr(hdev, skb);
+
+ skb_put_data(skb, dump_start, strlen(dump_start));
+
+ return skb->len;
+}
+
+/* Do not call with hci_dev_lock since this calls driver code. */
+static void hci_devcd_notify(struct hci_dev *hdev, int state)
+{
+ if (hdev->dump.notify_change)
+ hdev->dump.notify_change(hdev, state);
+}
+
+/* Call with hci_dev_lock only. */
+void hci_devcd_reset(struct hci_dev *hdev)
+{
+ hdev->dump.head = NULL;
+ hdev->dump.tail = NULL;
+ hdev->dump.alloc_size = 0;
+
+ hci_devcd_update_state(hdev, HCI_DEVCOREDUMP_IDLE);
+
+ cancel_delayed_work(&hdev->dump.dump_timeout);
+ skb_queue_purge(&hdev->dump.dump_q);
+}
+
+/* Call with hci_dev_lock only. */
+static void hci_devcd_free(struct hci_dev *hdev)
+{
+ vfree(hdev->dump.head);
+
+ hci_devcd_reset(hdev);
+}
+
+/* Call with hci_dev_lock only. */
+static int hci_devcd_alloc(struct hci_dev *hdev, u32 size)
+{
+ hdev->dump.head = vmalloc(size);
+ if (!hdev->dump.head)
+ return -ENOMEM;
+
+ hdev->dump.alloc_size = size;
+ hdev->dump.tail = hdev->dump.head;
+ hdev->dump.end = hdev->dump.head + size;
+
+ hci_devcd_update_state(hdev, HCI_DEVCOREDUMP_IDLE);
+
+ return 0;
+}
+
+/* Call with hci_dev_lock only. */
+static bool hci_devcd_copy(struct hci_dev *hdev, char *buf, u32 size)
+{
+ if (hdev->dump.tail + size > hdev->dump.end)
+ return false;
+
+ memcpy(hdev->dump.tail, buf, size);
+ hdev->dump.tail += size;
+
+ return true;
+}
+
+/* Call with hci_dev_lock only. */
+static bool hci_devcd_memset(struct hci_dev *hdev, u8 pattern, u32 len)
+{
+ if (hdev->dump.tail + len > hdev->dump.end)
+ return false;
+
+ memset(hdev->dump.tail, pattern, len);
+ hdev->dump.tail += len;
+
+ return true;
+}
+
+/* Call with hci_dev_lock only. */
+static int hci_devcd_prepare(struct hci_dev *hdev, u32 dump_size)
+{
+ struct sk_buff *skb;
+ int dump_hdr_size;
+ int err = 0;
+
+ skb = alloc_skb(MAX_DEVCOREDUMP_HDR_SIZE, GFP_ATOMIC);
+ if (!skb)
+ return -ENOMEM;
+
+ dump_hdr_size = hci_devcd_mkheader(hdev, skb);
+
+ if (hci_devcd_alloc(hdev, dump_hdr_size + dump_size)) {
+ err = -ENOMEM;
+ goto hdr_free;
+ }
+
+ /* Insert the device header */
+ if (!hci_devcd_copy(hdev, skb->data, skb->len)) {
+ bt_dev_err(hdev, "Failed to insert header");
+ hci_devcd_free(hdev);
+
+ err = -ENOMEM;
+ goto hdr_free;
+ }
+
+hdr_free:
+ kfree_skb(skb);
+
+ return err;
+}
+
+static void hci_devcd_handle_pkt_init(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ u32 dump_size;
+
+ if (hdev->dump.state != HCI_DEVCOREDUMP_IDLE) {
+ DBG_UNEXPECTED_STATE();
+ return;
+ }
+
+ if (skb->len != sizeof(dump_size)) {
+ bt_dev_dbg(hdev, "Invalid dump init pkt");
+ return;
+ }
+
+ dump_size = get_unaligned_le32(skb_pull_data(skb, 4));
+ if (!dump_size) {
+ bt_dev_err(hdev, "Zero size dump init pkt");
+ return;
+ }
+
+ if (hci_devcd_prepare(hdev, dump_size)) {
+ bt_dev_err(hdev, "Failed to prepare for dump");
+ return;
+ }
+
+ hci_devcd_update_state(hdev, HCI_DEVCOREDUMP_ACTIVE);
+ queue_delayed_work(hdev->workqueue, &hdev->dump.dump_timeout,
+ hdev->dump.timeout);
+}
+
+static void hci_devcd_handle_pkt_skb(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ if (hdev->dump.state != HCI_DEVCOREDUMP_ACTIVE) {
+ DBG_UNEXPECTED_STATE();
+ return;
+ }
+
+ if (!hci_devcd_copy(hdev, skb->data, skb->len))
+ bt_dev_dbg(hdev, "Failed to insert skb");
+}
+
+static void hci_devcd_handle_pkt_pattern(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct hci_devcoredump_skb_pattern *pattern;
+
+ if (hdev->dump.state != HCI_DEVCOREDUMP_ACTIVE) {
+ DBG_UNEXPECTED_STATE();
+ return;
+ }
+
+ if (skb->len != sizeof(*pattern)) {
+ bt_dev_dbg(hdev, "Invalid pattern skb");
+ return;
+ }
+
+ pattern = skb_pull_data(skb, sizeof(*pattern));
+
+ if (!hci_devcd_memset(hdev, pattern->pattern, pattern->len))
+ bt_dev_dbg(hdev, "Failed to set pattern");
+}
+
+static void hci_devcd_dump(struct hci_dev *hdev)
+{
+ struct sk_buff *skb;
+ u32 size;
+
+ bt_dev_dbg(hdev, "state %d", hdev->dump.state);
+
+ size = hdev->dump.tail - hdev->dump.head;
+
+ /* Send a copy to monitor as a diagnostic packet */
+ skb = bt_skb_alloc(size, GFP_ATOMIC);
+ if (skb) {
+ skb_put_data(skb, hdev->dump.head, size);
+ hci_recv_diag(hdev, skb);
+ }
+
+ /* Emit a devcoredump with the available data */
+ dev_coredumpv(&hdev->dev, hdev->dump.head, size, GFP_KERNEL);
+}
+
+static void hci_devcd_handle_pkt_complete(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ u32 dump_size;
+
+ if (hdev->dump.state != HCI_DEVCOREDUMP_ACTIVE) {
+ DBG_UNEXPECTED_STATE();
+ return;
+ }
+
+ hci_devcd_update_state(hdev, HCI_DEVCOREDUMP_DONE);
+ dump_size = hdev->dump.tail - hdev->dump.head;
+
+ bt_dev_dbg(hdev, "complete with size %u (expect %zu)", dump_size,
+ hdev->dump.alloc_size);
+
+ hci_devcd_dump(hdev);
+}
+
+static void hci_devcd_handle_pkt_abort(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ u32 dump_size;
+
+ if (hdev->dump.state != HCI_DEVCOREDUMP_ACTIVE) {
+ DBG_UNEXPECTED_STATE();
+ return;
+ }
+
+ hci_devcd_update_state(hdev, HCI_DEVCOREDUMP_ABORT);
+ dump_size = hdev->dump.tail - hdev->dump.head;
+
+ bt_dev_dbg(hdev, "aborted with size %u (expect %zu)", dump_size,
+ hdev->dump.alloc_size);
+
+ hci_devcd_dump(hdev);
+}
+
+/* Bluetooth devcoredump state machine.
+ *
+ * Devcoredump states:
+ *
+ * HCI_DEVCOREDUMP_IDLE: The default state.
+ *
+ * HCI_DEVCOREDUMP_ACTIVE: A devcoredump will be in this state once it has
+ * been initialized using hci_devcd_init(). Once active, the driver
+ * can append data using hci_devcd_append() or insert a pattern
+ * using hci_devcd_append_pattern().
+ *
+ * HCI_DEVCOREDUMP_DONE: Once the dump collection is complete, the drive
+ * can signal the completion using hci_devcd_complete(). A
+ * devcoredump is generated indicating the completion event and
+ * then the state machine is reset to the default state.
+ *
+ * HCI_DEVCOREDUMP_ABORT: The driver can cancel ongoing dump collection in
+ * case of any error using hci_devcd_abort(). A devcoredump is
+ * still generated with the available data indicating the abort
+ * event and then the state machine is reset to the default state.
+ *
+ * HCI_DEVCOREDUMP_TIMEOUT: A timeout timer for HCI_DEVCOREDUMP_TIMEOUT sec
+ * is started during devcoredump initialization. Once the timeout
+ * occurs, the driver is notified, a devcoredump is generated with
+ * the available data indicating the timeout event and then the
+ * state machine is reset to the default state.
+ *
+ * The driver must register using hci_devcd_register() before using the hci
+ * devcoredump APIs.
+ */
+void hci_devcd_rx(struct work_struct *work)
+{
+ struct hci_dev *hdev = container_of(work, struct hci_dev, dump.dump_rx);
+ struct sk_buff *skb;
+ int start_state;
+
+ while ((skb = skb_dequeue(&hdev->dump.dump_q))) {
+ /* Return if timeout occurs. The timeout handler function
+ * hci_devcd_timeout() will report the available dump data.
+ */
+ if (hdev->dump.state == HCI_DEVCOREDUMP_TIMEOUT) {
+ kfree_skb(skb);
+ return;
+ }
+
+ hci_dev_lock(hdev);
+ start_state = hdev->dump.state;
+
+ switch (hci_dmp_cb(skb)->pkt_type) {
+ case HCI_DEVCOREDUMP_PKT_INIT:
+ hci_devcd_handle_pkt_init(hdev, skb);
+ break;
+
+ case HCI_DEVCOREDUMP_PKT_SKB:
+ hci_devcd_handle_pkt_skb(hdev, skb);
+ break;
+
+ case HCI_DEVCOREDUMP_PKT_PATTERN:
+ hci_devcd_handle_pkt_pattern(hdev, skb);
+ break;
+
+ case HCI_DEVCOREDUMP_PKT_COMPLETE:
+ hci_devcd_handle_pkt_complete(hdev, skb);
+ break;
+
+ case HCI_DEVCOREDUMP_PKT_ABORT:
+ hci_devcd_handle_pkt_abort(hdev, skb);
+ break;
+
+ default:
+ bt_dev_dbg(hdev, "Unknown packet (%d) for state (%d). ",
+ hci_dmp_cb(skb)->pkt_type, hdev->dump.state);
+ break;
+ }
+
+ hci_dev_unlock(hdev);
+ kfree_skb(skb);
+
+ /* Notify the driver about any state changes before resetting
+ * the state machine
+ */
+ if (start_state != hdev->dump.state)
+ hci_devcd_notify(hdev, hdev->dump.state);
+
+ /* Reset the state machine if the devcoredump is complete */
+ hci_dev_lock(hdev);
+ if (hdev->dump.state == HCI_DEVCOREDUMP_DONE ||
+ hdev->dump.state == HCI_DEVCOREDUMP_ABORT)
+ hci_devcd_reset(hdev);
+ hci_dev_unlock(hdev);
+ }
+}
+EXPORT_SYMBOL(hci_devcd_rx);
+
+void hci_devcd_timeout(struct work_struct *work)
+{
+ struct hci_dev *hdev = container_of(work, struct hci_dev,
+ dump.dump_timeout.work);
+ u32 dump_size;
+
+ hci_devcd_notify(hdev, HCI_DEVCOREDUMP_TIMEOUT);
+
+ hci_dev_lock(hdev);
+
+ cancel_work(&hdev->dump.dump_rx);
+
+ hci_devcd_update_state(hdev, HCI_DEVCOREDUMP_TIMEOUT);
+
+ dump_size = hdev->dump.tail - hdev->dump.head;
+ bt_dev_dbg(hdev, "timeout with size %u (expect %zu)", dump_size,
+ hdev->dump.alloc_size);
+
+ hci_devcd_dump(hdev);
+
+ hci_devcd_reset(hdev);
+
+ hci_dev_unlock(hdev);
+}
+EXPORT_SYMBOL(hci_devcd_timeout);
+
+int hci_devcd_register(struct hci_dev *hdev, coredump_t coredump,
+ dmp_hdr_t dmp_hdr, notify_change_t notify_change)
+{
+ /* Driver must implement coredump() and dmp_hdr() functions for
+ * bluetooth devcoredump. The coredump() should trigger a coredump
+ * event on the controller when the device's coredump sysfs entry is
+ * written to. The dmp_hdr() should create a dump header to identify
+ * the controller/fw/driver info.
+ */
+ if (!coredump || !dmp_hdr)
+ return -EINVAL;
+
+ hci_dev_lock(hdev);
+ hdev->dump.coredump = coredump;
+ hdev->dump.dmp_hdr = dmp_hdr;
+ hdev->dump.notify_change = notify_change;
+ hdev->dump.supported = true;
+ hdev->dump.timeout = DEVCOREDUMP_TIMEOUT;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+EXPORT_SYMBOL(hci_devcd_register);
+
+static inline bool hci_devcd_enabled(struct hci_dev *hdev)
+{
+ return hdev->dump.supported;
+}
+
+int hci_devcd_init(struct hci_dev *hdev, u32 dump_size)
+{
+ struct sk_buff *skb;
+
+ if (!hci_devcd_enabled(hdev))
+ return -EOPNOTSUPP;
+
+ skb = alloc_skb(sizeof(dump_size), GFP_ATOMIC);
+ if (!skb)
+ return -ENOMEM;
+
+ hci_dmp_cb(skb)->pkt_type = HCI_DEVCOREDUMP_PKT_INIT;
+ put_unaligned_le32(dump_size, skb_put(skb, 4));
+
+ skb_queue_tail(&hdev->dump.dump_q, skb);
+ queue_work(hdev->workqueue, &hdev->dump.dump_rx);
+
+ return 0;
+}
+EXPORT_SYMBOL(hci_devcd_init);
+
+int hci_devcd_append(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ if (!skb)
+ return -ENOMEM;
+
+ if (!hci_devcd_enabled(hdev)) {
+ kfree_skb(skb);
+ return -EOPNOTSUPP;
+ }
+
+ hci_dmp_cb(skb)->pkt_type = HCI_DEVCOREDUMP_PKT_SKB;
+
+ skb_queue_tail(&hdev->dump.dump_q, skb);
+ queue_work(hdev->workqueue, &hdev->dump.dump_rx);
+
+ return 0;
+}
+EXPORT_SYMBOL(hci_devcd_append);
+
+int hci_devcd_append_pattern(struct hci_dev *hdev, u8 pattern, u32 len)
+{
+ struct hci_devcoredump_skb_pattern p;
+ struct sk_buff *skb;
+
+ if (!hci_devcd_enabled(hdev))
+ return -EOPNOTSUPP;
+
+ skb = alloc_skb(sizeof(p), GFP_ATOMIC);
+ if (!skb)
+ return -ENOMEM;
+
+ p.pattern = pattern;
+ p.len = len;
+
+ hci_dmp_cb(skb)->pkt_type = HCI_DEVCOREDUMP_PKT_PATTERN;
+ skb_put_data(skb, &p, sizeof(p));
+
+ skb_queue_tail(&hdev->dump.dump_q, skb);
+ queue_work(hdev->workqueue, &hdev->dump.dump_rx);
+
+ return 0;
+}
+EXPORT_SYMBOL(hci_devcd_append_pattern);
+
+int hci_devcd_complete(struct hci_dev *hdev)
+{
+ struct sk_buff *skb;
+
+ if (!hci_devcd_enabled(hdev))
+ return -EOPNOTSUPP;
+
+ skb = alloc_skb(0, GFP_ATOMIC);
+ if (!skb)
+ return -ENOMEM;
+
+ hci_dmp_cb(skb)->pkt_type = HCI_DEVCOREDUMP_PKT_COMPLETE;
+
+ skb_queue_tail(&hdev->dump.dump_q, skb);
+ queue_work(hdev->workqueue, &hdev->dump.dump_rx);
+
+ return 0;
+}
+EXPORT_SYMBOL(hci_devcd_complete);
+
+int hci_devcd_abort(struct hci_dev *hdev)
+{
+ struct sk_buff *skb;
+
+ if (!hci_devcd_enabled(hdev))
+ return -EOPNOTSUPP;
+
+ skb = alloc_skb(0, GFP_ATOMIC);
+ if (!skb)
+ return -ENOMEM;
+
+ hci_dmp_cb(skb)->pkt_type = HCI_DEVCOREDUMP_PKT_ABORT;
+
+ skb_queue_tail(&hdev->dump.dump_q, skb);
+ queue_work(hdev->workqueue, &hdev->dump.dump_rx);
+
+ return 0;
+}
+EXPORT_SYMBOL(hci_devcd_abort);
diff --git a/net/bluetooth/ecdh_helper.c b/net/bluetooth/ecdh_helper.c
new file mode 100644
index 000000000000..0efc93fdae8a
--- /dev/null
+++ b/net/bluetooth/ecdh_helper.c
@@ -0,0 +1,203 @@
+/*
+ * ECDH helper functions - KPP wrappings
+ *
+ * Copyright (C) 2017 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation;
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+ * CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ * ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+ * COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+ * SOFTWARE IS DISCLAIMED.
+ */
+#include "ecdh_helper.h"
+
+#include <linux/scatterlist.h>
+#include <crypto/ecdh.h>
+
+static inline void swap_digits(u64 *in, u64 *out, unsigned int ndigits)
+{
+ int i;
+
+ for (i = 0; i < ndigits; i++)
+ out[i] = __swab64(in[ndigits - 1 - i]);
+}
+
+/* compute_ecdh_secret() - function assumes that the private key was
+ * already set.
+ * @tfm: KPP tfm handle allocated with crypto_alloc_kpp().
+ * @public_key: pair's ecc public key.
+ * secret: memory where the ecdh computed shared secret will be saved.
+ *
+ * Return: zero on success; error code in case of error.
+ */
+int compute_ecdh_secret(struct crypto_kpp *tfm, const u8 public_key[64],
+ u8 secret[32])
+{
+ DECLARE_CRYPTO_WAIT(result);
+ struct kpp_request *req;
+ u8 *tmp;
+ struct scatterlist src, dst;
+ int err;
+
+ tmp = kmalloc(64, GFP_KERNEL);
+ if (!tmp)
+ return -ENOMEM;
+
+ req = kpp_request_alloc(tfm, GFP_KERNEL);
+ if (!req) {
+ err = -ENOMEM;
+ goto free_tmp;
+ }
+
+ swap_digits((u64 *)public_key, (u64 *)tmp, 4); /* x */
+ swap_digits((u64 *)&public_key[32], (u64 *)&tmp[32], 4); /* y */
+
+ sg_init_one(&src, tmp, 64);
+ sg_init_one(&dst, secret, 32);
+ kpp_request_set_input(req, &src, 64);
+ kpp_request_set_output(req, &dst, 32);
+ kpp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
+ crypto_req_done, &result);
+ err = crypto_kpp_compute_shared_secret(req);
+ err = crypto_wait_req(err, &result);
+ if (err < 0) {
+ pr_err("alg: ecdh: compute shared secret failed. err %d\n",
+ err);
+ goto free_all;
+ }
+
+ swap_digits((u64 *)secret, (u64 *)tmp, 4);
+ memcpy(secret, tmp, 32);
+
+free_all:
+ kpp_request_free(req);
+free_tmp:
+ kfree_sensitive(tmp);
+ return err;
+}
+
+/* set_ecdh_privkey() - set or generate ecc private key.
+ *
+ * Function generates an ecc private key in the crypto subsystem when receiving
+ * a NULL private key or sets the received key when not NULL.
+ *
+ * @tfm: KPP tfm handle allocated with crypto_alloc_kpp().
+ * @private_key: user's ecc private key. When not NULL, the key is expected
+ * in little endian format.
+ *
+ * Return: zero on success; error code in case of error.
+ */
+int set_ecdh_privkey(struct crypto_kpp *tfm, const u8 private_key[32])
+{
+ u8 *buf, *tmp = NULL;
+ unsigned int buf_len;
+ int err;
+ struct ecdh p = {0};
+
+ if (private_key) {
+ tmp = kmalloc(32, GFP_KERNEL);
+ if (!tmp)
+ return -ENOMEM;
+ swap_digits((u64 *)private_key, (u64 *)tmp, 4);
+ p.key = tmp;
+ p.key_size = 32;
+ }
+
+ buf_len = crypto_ecdh_key_len(&p);
+ buf = kmalloc(buf_len, GFP_KERNEL);
+ if (!buf) {
+ err = -ENOMEM;
+ goto free_tmp;
+ }
+
+ err = crypto_ecdh_encode_key(buf, buf_len, &p);
+ if (err)
+ goto free_all;
+
+ err = crypto_kpp_set_secret(tfm, buf, buf_len);
+ /* fall through */
+free_all:
+ kfree_sensitive(buf);
+free_tmp:
+ kfree_sensitive(tmp);
+ return err;
+}
+
+/* generate_ecdh_public_key() - function assumes that the private key was
+ * already set.
+ *
+ * @tfm: KPP tfm handle allocated with crypto_alloc_kpp().
+ * @public_key: memory where the computed ecc public key will be saved.
+ *
+ * Return: zero on success; error code in case of error.
+ */
+int generate_ecdh_public_key(struct crypto_kpp *tfm, u8 public_key[64])
+{
+ DECLARE_CRYPTO_WAIT(result);
+ struct kpp_request *req;
+ u8 *tmp;
+ struct scatterlist dst;
+ int err;
+
+ tmp = kmalloc(64, GFP_KERNEL);
+ if (!tmp)
+ return -ENOMEM;
+
+ req = kpp_request_alloc(tfm, GFP_KERNEL);
+ if (!req) {
+ err = -ENOMEM;
+ goto free_tmp;
+ }
+
+ sg_init_one(&dst, tmp, 64);
+ kpp_request_set_input(req, NULL, 0);
+ kpp_request_set_output(req, &dst, 64);
+ kpp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
+ crypto_req_done, &result);
+
+ err = crypto_kpp_generate_public_key(req);
+ err = crypto_wait_req(err, &result);
+ if (err < 0)
+ goto free_all;
+
+ /* The public key is handed back in little endian as expected by
+ * the Security Manager Protocol.
+ */
+ swap_digits((u64 *)tmp, (u64 *)public_key, 4); /* x */
+ swap_digits((u64 *)&tmp[32], (u64 *)&public_key[32], 4); /* y */
+
+free_all:
+ kpp_request_free(req);
+free_tmp:
+ kfree(tmp);
+ return err;
+}
+
+/* generate_ecdh_keys() - generate ecc key pair.
+ *
+ * @tfm: KPP tfm handle allocated with crypto_alloc_kpp().
+ * @public_key: memory where the computed ecc public key will be saved.
+ *
+ * Return: zero on success; error code in case of error.
+ */
+int generate_ecdh_keys(struct crypto_kpp *tfm, u8 public_key[64])
+{
+ int err;
+
+ err = set_ecdh_privkey(tfm, NULL);
+ if (err)
+ return err;
+
+ return generate_ecdh_public_key(tfm, public_key);
+}
diff --git a/net/bluetooth/ecdh_helper.h b/net/bluetooth/ecdh_helper.h
new file mode 100644
index 000000000000..830723971cf8
--- /dev/null
+++ b/net/bluetooth/ecdh_helper.h
@@ -0,0 +1,30 @@
+/*
+ * ECDH helper functions - KPP wrappings
+ *
+ * Copyright (C) 2017 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation;
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+ * CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ * ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+ * COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+ * SOFTWARE IS DISCLAIMED.
+ */
+#include <crypto/kpp.h>
+#include <linux/types.h>
+
+int compute_ecdh_secret(struct crypto_kpp *tfm, const u8 pair_public_key[64],
+ u8 secret[32]);
+int set_ecdh_privkey(struct crypto_kpp *tfm, const u8 private_key[32]);
+int generate_ecdh_public_key(struct crypto_kpp *tfm, u8 public_key[64]);
+int generate_ecdh_keys(struct crypto_kpp *tfm, u8 public_key[64]);
diff --git a/net/bluetooth/eir.c b/net/bluetooth/eir.c
new file mode 100644
index 000000000000..3f72111ba651
--- /dev/null
+++ b/net/bluetooth/eir.c
@@ -0,0 +1,386 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * BlueZ - Bluetooth protocol stack for Linux
+ *
+ * Copyright (C) 2021 Intel Corporation
+ */
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+#include <net/bluetooth/mgmt.h>
+
+#include "eir.h"
+
+#define PNP_INFO_SVCLASS_ID 0x1200
+
+u8 eir_append_local_name(struct hci_dev *hdev, u8 *ptr, u8 ad_len)
+{
+ size_t short_len;
+ size_t complete_len;
+
+ /* no space left for name (+ type + len) */
+ if ((max_adv_len(hdev) - ad_len) < HCI_MAX_SHORT_NAME_LENGTH + 2)
+ return ad_len;
+
+ /* use complete name if present and fits */
+ complete_len = strnlen(hdev->dev_name, sizeof(hdev->dev_name));
+ if (complete_len && complete_len <= HCI_MAX_SHORT_NAME_LENGTH)
+ return eir_append_data(ptr, ad_len, EIR_NAME_COMPLETE,
+ hdev->dev_name, complete_len);
+
+ /* use short name if present */
+ short_len = strnlen(hdev->short_name, sizeof(hdev->short_name));
+ if (short_len)
+ return eir_append_data(ptr, ad_len, EIR_NAME_SHORT,
+ hdev->short_name,
+ short_len);
+
+ /* use shortened full name if present, we already know that name
+ * is longer then HCI_MAX_SHORT_NAME_LENGTH
+ */
+ if (complete_len)
+ return eir_append_data(ptr, ad_len, EIR_NAME_SHORT,
+ hdev->dev_name,
+ HCI_MAX_SHORT_NAME_LENGTH);
+
+ return ad_len;
+}
+
+u8 eir_append_appearance(struct hci_dev *hdev, u8 *ptr, u8 ad_len)
+{
+ return eir_append_le16(ptr, ad_len, EIR_APPEARANCE, hdev->appearance);
+}
+
+u8 eir_append_service_data(u8 *eir, u16 eir_len, u16 uuid, u8 *data,
+ u8 data_len)
+{
+ eir[eir_len++] = sizeof(u8) + sizeof(uuid) + data_len;
+ eir[eir_len++] = EIR_SERVICE_DATA;
+ put_unaligned_le16(uuid, &eir[eir_len]);
+ eir_len += sizeof(uuid);
+ memcpy(&eir[eir_len], data, data_len);
+ eir_len += data_len;
+
+ return eir_len;
+}
+
+static u8 *create_uuid16_list(struct hci_dev *hdev, u8 *data, ptrdiff_t len)
+{
+ u8 *ptr = data, *uuids_start = NULL;
+ struct bt_uuid *uuid;
+
+ if (len < 4)
+ return ptr;
+
+ list_for_each_entry(uuid, &hdev->uuids, list) {
+ u16 uuid16;
+
+ if (uuid->size != 16)
+ continue;
+
+ uuid16 = get_unaligned_le16(&uuid->uuid[12]);
+ if (uuid16 < 0x1100)
+ continue;
+
+ if (uuid16 == PNP_INFO_SVCLASS_ID)
+ continue;
+
+ if (!uuids_start) {
+ uuids_start = ptr;
+ uuids_start[0] = 1;
+ uuids_start[1] = EIR_UUID16_ALL;
+ ptr += 2;
+ }
+
+ /* Stop if not enough space to put next UUID */
+ if ((ptr - data) + sizeof(u16) > len) {
+ uuids_start[1] = EIR_UUID16_SOME;
+ break;
+ }
+
+ *ptr++ = (uuid16 & 0x00ff);
+ *ptr++ = (uuid16 & 0xff00) >> 8;
+ uuids_start[0] += sizeof(uuid16);
+ }
+
+ return ptr;
+}
+
+static u8 *create_uuid32_list(struct hci_dev *hdev, u8 *data, ptrdiff_t len)
+{
+ u8 *ptr = data, *uuids_start = NULL;
+ struct bt_uuid *uuid;
+
+ if (len < 6)
+ return ptr;
+
+ list_for_each_entry(uuid, &hdev->uuids, list) {
+ if (uuid->size != 32)
+ continue;
+
+ if (!uuids_start) {
+ uuids_start = ptr;
+ uuids_start[0] = 1;
+ uuids_start[1] = EIR_UUID32_ALL;
+ ptr += 2;
+ }
+
+ /* Stop if not enough space to put next UUID */
+ if ((ptr - data) + sizeof(u32) > len) {
+ uuids_start[1] = EIR_UUID32_SOME;
+ break;
+ }
+
+ memcpy(ptr, &uuid->uuid[12], sizeof(u32));
+ ptr += sizeof(u32);
+ uuids_start[0] += sizeof(u32);
+ }
+
+ return ptr;
+}
+
+static u8 *create_uuid128_list(struct hci_dev *hdev, u8 *data, ptrdiff_t len)
+{
+ u8 *ptr = data, *uuids_start = NULL;
+ struct bt_uuid *uuid;
+
+ if (len < 18)
+ return ptr;
+
+ list_for_each_entry(uuid, &hdev->uuids, list) {
+ if (uuid->size != 128)
+ continue;
+
+ if (!uuids_start) {
+ uuids_start = ptr;
+ uuids_start[0] = 1;
+ uuids_start[1] = EIR_UUID128_ALL;
+ ptr += 2;
+ }
+
+ /* Stop if not enough space to put next UUID */
+ if ((ptr - data) + 16 > len) {
+ uuids_start[1] = EIR_UUID128_SOME;
+ break;
+ }
+
+ memcpy(ptr, uuid->uuid, 16);
+ ptr += 16;
+ uuids_start[0] += 16;
+ }
+
+ return ptr;
+}
+
+void eir_create(struct hci_dev *hdev, u8 *data)
+{
+ u8 *ptr = data;
+ size_t name_len;
+
+ name_len = strnlen(hdev->dev_name, sizeof(hdev->dev_name));
+
+ if (name_len > 0) {
+ /* EIR Data type */
+ if (name_len > 48) {
+ name_len = 48;
+ ptr[1] = EIR_NAME_SHORT;
+ } else {
+ ptr[1] = EIR_NAME_COMPLETE;
+ }
+
+ /* EIR Data length */
+ ptr[0] = name_len + 1;
+
+ memcpy(ptr + 2, hdev->dev_name, name_len);
+
+ ptr += (name_len + 2);
+ }
+
+ if (hdev->inq_tx_power != HCI_TX_POWER_INVALID) {
+ ptr[0] = 2;
+ ptr[1] = EIR_TX_POWER;
+ ptr[2] = (u8)hdev->inq_tx_power;
+
+ ptr += 3;
+ }
+
+ if (hdev->devid_source > 0) {
+ ptr[0] = 9;
+ ptr[1] = EIR_DEVICE_ID;
+
+ put_unaligned_le16(hdev->devid_source, ptr + 2);
+ put_unaligned_le16(hdev->devid_vendor, ptr + 4);
+ put_unaligned_le16(hdev->devid_product, ptr + 6);
+ put_unaligned_le16(hdev->devid_version, ptr + 8);
+
+ ptr += 10;
+ }
+
+ ptr = create_uuid16_list(hdev, ptr, HCI_MAX_EIR_LENGTH - (ptr - data));
+ ptr = create_uuid32_list(hdev, ptr, HCI_MAX_EIR_LENGTH - (ptr - data));
+ ptr = create_uuid128_list(hdev, ptr, HCI_MAX_EIR_LENGTH - (ptr - data));
+}
+
+u8 eir_create_per_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr)
+{
+ struct adv_info *adv = NULL;
+ u8 ad_len = 0;
+
+ /* Return 0 when the current instance identifier is invalid. */
+ if (instance) {
+ adv = hci_find_adv_instance(hdev, instance);
+ if (!adv)
+ return 0;
+ }
+
+ if (adv) {
+ memcpy(ptr, adv->per_adv_data, adv->per_adv_data_len);
+ ad_len += adv->per_adv_data_len;
+ ptr += adv->per_adv_data_len;
+ }
+
+ return ad_len;
+}
+
+u8 eir_create_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr, u8 size)
+{
+ struct adv_info *adv = NULL;
+ u8 ad_len = 0, flags = 0;
+ u32 instance_flags;
+
+ /* Return 0 when the current instance identifier is invalid. */
+ if (instance) {
+ adv = hci_find_adv_instance(hdev, instance);
+ if (!adv)
+ return 0;
+ }
+
+ instance_flags = hci_adv_instance_flags(hdev, instance);
+
+ /* If instance already has the flags set skip adding it once
+ * again.
+ */
+ if (adv && eir_get_data(adv->adv_data, adv->adv_data_len, EIR_FLAGS,
+ NULL))
+ goto skip_flags;
+
+ /* The Add Advertising command allows userspace to set both the general
+ * and limited discoverable flags.
+ */
+ if (instance_flags & MGMT_ADV_FLAG_DISCOV)
+ flags |= LE_AD_GENERAL;
+
+ if (instance_flags & MGMT_ADV_FLAG_LIMITED_DISCOV)
+ flags |= LE_AD_LIMITED;
+
+ if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED))
+ flags |= LE_AD_NO_BREDR;
+
+ if (flags || (instance_flags & MGMT_ADV_FLAG_MANAGED_FLAGS)) {
+ /* If a discovery flag wasn't provided, simply use the global
+ * settings.
+ */
+ if (!flags)
+ flags |= mgmt_get_adv_discov_flags(hdev);
+
+ /* If flags would still be empty, then there is no need to
+ * include the "Flags" AD field".
+ */
+ if (flags && (ad_len + eir_precalc_len(1) <= size)) {
+ ptr[0] = 0x02;
+ ptr[1] = EIR_FLAGS;
+ ptr[2] = flags;
+
+ ad_len += 3;
+ ptr += 3;
+ }
+ }
+
+skip_flags:
+ if (adv) {
+ memcpy(ptr, adv->adv_data, adv->adv_data_len);
+ ad_len += adv->adv_data_len;
+ ptr += adv->adv_data_len;
+ }
+
+ if (instance_flags & MGMT_ADV_FLAG_TX_POWER) {
+ s8 adv_tx_power;
+
+ if (ext_adv_capable(hdev)) {
+ if (adv)
+ adv_tx_power = adv->tx_power;
+ else
+ adv_tx_power = hdev->adv_tx_power;
+ } else {
+ adv_tx_power = hdev->adv_tx_power;
+ }
+
+ /* Provide Tx Power only if we can provide a valid value for it */
+ if (adv_tx_power != HCI_TX_POWER_INVALID &&
+ (ad_len + eir_precalc_len(1) <= size)) {
+ ptr[0] = 0x02;
+ ptr[1] = EIR_TX_POWER;
+ ptr[2] = (u8)adv_tx_power;
+
+ ad_len += 3;
+ ptr += 3;
+ }
+ }
+
+ return ad_len;
+}
+
+static u8 create_default_scan_rsp(struct hci_dev *hdev, u8 *ptr)
+{
+ u8 scan_rsp_len = 0;
+
+ if (hdev->appearance)
+ scan_rsp_len = eir_append_appearance(hdev, ptr, scan_rsp_len);
+
+ return eir_append_local_name(hdev, ptr, scan_rsp_len);
+}
+
+u8 eir_create_scan_rsp(struct hci_dev *hdev, u8 instance, u8 *ptr)
+{
+ struct adv_info *adv;
+ u8 scan_rsp_len = 0;
+
+ if (!instance)
+ return create_default_scan_rsp(hdev, ptr);
+
+ adv = hci_find_adv_instance(hdev, instance);
+ if (!adv)
+ return 0;
+
+ if ((adv->flags & MGMT_ADV_FLAG_APPEARANCE) && hdev->appearance)
+ scan_rsp_len = eir_append_appearance(hdev, ptr, scan_rsp_len);
+
+ memcpy(&ptr[scan_rsp_len], adv->scan_rsp_data, adv->scan_rsp_len);
+
+ scan_rsp_len += adv->scan_rsp_len;
+
+ if (adv->flags & MGMT_ADV_FLAG_LOCAL_NAME)
+ scan_rsp_len = eir_append_local_name(hdev, ptr, scan_rsp_len);
+
+ return scan_rsp_len;
+}
+
+void *eir_get_service_data(u8 *eir, size_t eir_len, u16 uuid, size_t *len)
+{
+ size_t dlen;
+
+ while ((eir = eir_get_data(eir, eir_len, EIR_SERVICE_DATA, &dlen))) {
+ u16 value = get_unaligned_le16(eir);
+
+ if (uuid == value) {
+ if (len)
+ *len = dlen - 2;
+ return &eir[2];
+ }
+
+ eir += dlen;
+ eir_len -= dlen;
+ }
+
+ return NULL;
+}
diff --git a/net/bluetooth/eir.h b/net/bluetooth/eir.h
new file mode 100644
index 000000000000..9372db83f912
--- /dev/null
+++ b/net/bluetooth/eir.h
@@ -0,0 +1,99 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * BlueZ - Bluetooth protocol stack for Linux
+ *
+ * Copyright (C) 2021 Intel Corporation
+ */
+
+#include <linux/unaligned.h>
+
+void eir_create(struct hci_dev *hdev, u8 *data);
+
+u8 eir_create_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr, u8 size);
+u8 eir_create_scan_rsp(struct hci_dev *hdev, u8 instance, u8 *ptr);
+u8 eir_create_per_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr);
+
+u8 eir_append_local_name(struct hci_dev *hdev, u8 *eir, u8 ad_len);
+u8 eir_append_appearance(struct hci_dev *hdev, u8 *ptr, u8 ad_len);
+u8 eir_append_service_data(u8 *eir, u16 eir_len, u16 uuid, u8 *data,
+ u8 data_len);
+
+static inline u16 eir_precalc_len(u8 data_len)
+{
+ return sizeof(u8) * 2 + data_len;
+}
+
+static inline u16 eir_append_data(u8 *eir, u16 eir_len, u8 type,
+ u8 *data, u8 data_len)
+{
+ eir[eir_len++] = sizeof(type) + data_len;
+ eir[eir_len++] = type;
+ memcpy(&eir[eir_len], data, data_len);
+ eir_len += data_len;
+
+ return eir_len;
+}
+
+static inline u16 eir_append_le16(u8 *eir, u16 eir_len, u8 type, u16 data)
+{
+ eir[eir_len++] = sizeof(type) + sizeof(data);
+ eir[eir_len++] = type;
+ put_unaligned_le16(data, &eir[eir_len]);
+ eir_len += sizeof(data);
+
+ return eir_len;
+}
+
+static inline u16 eir_skb_put_data(struct sk_buff *skb, u8 type, u8 *data, u8 data_len)
+{
+ u8 *eir;
+ u16 eir_len;
+
+ eir_len = eir_precalc_len(data_len);
+ eir = skb_put(skb, eir_len);
+ WARN_ON(sizeof(type) + data_len > U8_MAX);
+ eir[0] = sizeof(type) + data_len;
+ eir[1] = type;
+ memcpy(&eir[2], data, data_len);
+
+ return eir_len;
+}
+
+static inline void *eir_get_data(u8 *eir, size_t eir_len, u8 type,
+ size_t *data_len)
+{
+ size_t parsed = 0;
+
+ if (eir_len < 2)
+ return NULL;
+
+ while (parsed < eir_len - 1) {
+ u8 field_len = eir[0];
+
+ if (field_len == 0)
+ break;
+
+ parsed += field_len + 1;
+
+ if (parsed > eir_len)
+ break;
+
+ if (eir[1] != type) {
+ eir += field_len + 1;
+ continue;
+ }
+
+ /* Zero length data */
+ if (field_len == 1)
+ return NULL;
+
+ if (data_len)
+ *data_len = field_len - 1;
+
+ return &eir[2];
+ }
+
+ return NULL;
+}
+
+void *eir_get_service_data(u8 *eir, size_t eir_len, u16 uuid, size_t *len);
diff --git a/net/bluetooth/hci_codec.c b/net/bluetooth/hci_codec.c
new file mode 100644
index 000000000000..3cc135bb1d30
--- /dev/null
+++ b/net/bluetooth/hci_codec.c
@@ -0,0 +1,253 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/* Copyright (C) 2021 Intel Corporation */
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+#include "hci_codec.h"
+
+static int hci_codec_list_add(struct list_head *list,
+ struct hci_op_read_local_codec_caps *sent,
+ struct hci_rp_read_local_codec_caps *rp,
+ void *caps,
+ __u32 len)
+{
+ struct codec_list *entry;
+
+ entry = kzalloc(sizeof(*entry) + len, GFP_KERNEL);
+ if (!entry)
+ return -ENOMEM;
+
+ entry->id = sent->id;
+ if (sent->id == 0xFF) {
+ entry->cid = __le16_to_cpu(sent->cid);
+ entry->vid = __le16_to_cpu(sent->vid);
+ }
+ entry->transport = sent->transport;
+ entry->len = len;
+ entry->num_caps = 0;
+ if (rp) {
+ entry->num_caps = rp->num_caps;
+ memcpy(entry->caps, caps, len);
+ }
+ list_add(&entry->list, list);
+
+ return 0;
+}
+
+void hci_codec_list_clear(struct list_head *codec_list)
+{
+ struct codec_list *c, *n;
+
+ list_for_each_entry_safe(c, n, codec_list, list) {
+ list_del(&c->list);
+ kfree(c);
+ }
+}
+
+static void hci_read_codec_capabilities(struct hci_dev *hdev, __u8 transport,
+ struct hci_op_read_local_codec_caps
+ *cmd)
+{
+ __u8 i;
+
+ for (i = 0; i < TRANSPORT_TYPE_MAX; i++) {
+ if (transport & BIT(i)) {
+ struct hci_rp_read_local_codec_caps *rp;
+ struct hci_codec_caps *caps;
+ struct sk_buff *skb;
+ __u8 j;
+ __u32 len;
+
+ cmd->transport = i;
+
+ /* If Read_Codec_Capabilities command is not supported
+ * then just add codec to the list without caps
+ */
+ if (!(hdev->commands[45] & 0x08)) {
+ hci_dev_lock(hdev);
+ hci_codec_list_add(&hdev->local_codecs, cmd,
+ NULL, NULL, 0);
+ hci_dev_unlock(hdev);
+ continue;
+ }
+
+ skb = __hci_cmd_sync_sk(hdev, HCI_OP_READ_LOCAL_CODEC_CAPS,
+ sizeof(*cmd), cmd, 0, HCI_CMD_TIMEOUT, NULL);
+ if (IS_ERR(skb)) {
+ bt_dev_err(hdev, "Failed to read codec capabilities (%ld)",
+ PTR_ERR(skb));
+ continue;
+ }
+
+ if (skb->len < sizeof(*rp))
+ goto error;
+
+ rp = (void *)skb->data;
+
+ if (rp->status)
+ goto error;
+
+ if (!rp->num_caps) {
+ len = 0;
+ /* this codec doesn't have capabilities */
+ goto skip_caps_parse;
+ }
+
+ skb_pull(skb, sizeof(*rp));
+
+ for (j = 0, len = 0; j < rp->num_caps; j++) {
+ caps = (void *)skb->data;
+ if (skb->len < sizeof(*caps))
+ goto error;
+ if (skb->len < caps->len)
+ goto error;
+ len += sizeof(caps->len) + caps->len;
+ skb_pull(skb, sizeof(caps->len) + caps->len);
+ }
+
+skip_caps_parse:
+ hci_dev_lock(hdev);
+ hci_codec_list_add(&hdev->local_codecs, cmd, rp,
+ (__u8 *)rp + sizeof(*rp), len);
+ hci_dev_unlock(hdev);
+error:
+ kfree_skb(skb);
+ }
+ }
+}
+
+void hci_read_supported_codecs(struct hci_dev *hdev)
+{
+ struct sk_buff *skb;
+ struct hci_rp_read_local_supported_codecs *rp;
+ struct hci_std_codecs *std_codecs;
+ struct hci_vnd_codecs *vnd_codecs;
+ struct hci_op_read_local_codec_caps caps;
+ __u8 i;
+
+ skb = __hci_cmd_sync_sk(hdev, HCI_OP_READ_LOCAL_CODECS, 0, NULL,
+ 0, HCI_CMD_TIMEOUT, NULL);
+
+ if (IS_ERR(skb)) {
+ bt_dev_err(hdev, "Failed to read local supported codecs (%ld)",
+ PTR_ERR(skb));
+ return;
+ }
+
+ if (skb->len < sizeof(*rp))
+ goto error;
+
+ rp = (void *)skb->data;
+
+ if (rp->status)
+ goto error;
+
+ skb_pull(skb, sizeof(rp->status));
+
+ std_codecs = (void *)skb->data;
+
+ /* validate codecs length before accessing */
+ if (skb->len < flex_array_size(std_codecs, codec, std_codecs->num)
+ + sizeof(std_codecs->num))
+ goto error;
+
+ /* enumerate codec capabilities of standard codecs */
+ memset(&caps, 0, sizeof(caps));
+ for (i = 0; i < std_codecs->num; i++) {
+ caps.id = std_codecs->codec[i];
+ caps.direction = 0x00;
+ hci_read_codec_capabilities(hdev,
+ LOCAL_CODEC_ACL_MASK | LOCAL_CODEC_SCO_MASK, &caps);
+ }
+
+ skb_pull(skb, flex_array_size(std_codecs, codec, std_codecs->num)
+ + sizeof(std_codecs->num));
+
+ vnd_codecs = (void *)skb->data;
+
+ /* validate vendor codecs length before accessing */
+ if (skb->len <
+ flex_array_size(vnd_codecs, codec, vnd_codecs->num)
+ + sizeof(vnd_codecs->num))
+ goto error;
+
+ /* enumerate vendor codec capabilities */
+ for (i = 0; i < vnd_codecs->num; i++) {
+ caps.id = 0xFF;
+ caps.cid = vnd_codecs->codec[i].cid;
+ caps.vid = vnd_codecs->codec[i].vid;
+ caps.direction = 0x00;
+ hci_read_codec_capabilities(hdev,
+ LOCAL_CODEC_ACL_MASK | LOCAL_CODEC_SCO_MASK, &caps);
+ }
+
+error:
+ kfree_skb(skb);
+}
+
+void hci_read_supported_codecs_v2(struct hci_dev *hdev)
+{
+ struct sk_buff *skb;
+ struct hci_rp_read_local_supported_codecs_v2 *rp;
+ struct hci_std_codecs_v2 *std_codecs;
+ struct hci_vnd_codecs_v2 *vnd_codecs;
+ struct hci_op_read_local_codec_caps caps;
+ __u8 i;
+
+ skb = __hci_cmd_sync_sk(hdev, HCI_OP_READ_LOCAL_CODECS_V2, 0, NULL,
+ 0, HCI_CMD_TIMEOUT, NULL);
+
+ if (IS_ERR(skb)) {
+ bt_dev_err(hdev, "Failed to read local supported codecs (%ld)",
+ PTR_ERR(skb));
+ return;
+ }
+
+ if (skb->len < sizeof(*rp))
+ goto error;
+
+ rp = (void *)skb->data;
+
+ if (rp->status)
+ goto error;
+
+ skb_pull(skb, sizeof(rp->status));
+
+ std_codecs = (void *)skb->data;
+
+ /* check for payload data length before accessing */
+ if (skb->len < flex_array_size(std_codecs, codec, std_codecs->num)
+ + sizeof(std_codecs->num))
+ goto error;
+
+ memset(&caps, 0, sizeof(caps));
+
+ for (i = 0; i < std_codecs->num; i++) {
+ caps.id = std_codecs->codec[i].id;
+ hci_read_codec_capabilities(hdev, std_codecs->codec[i].transport,
+ &caps);
+ }
+
+ skb_pull(skb, flex_array_size(std_codecs, codec, std_codecs->num)
+ + sizeof(std_codecs->num));
+
+ vnd_codecs = (void *)skb->data;
+
+ /* check for payload data length before accessing */
+ if (skb->len <
+ flex_array_size(vnd_codecs, codec, vnd_codecs->num)
+ + sizeof(vnd_codecs->num))
+ goto error;
+
+ for (i = 0; i < vnd_codecs->num; i++) {
+ caps.id = 0xFF;
+ caps.cid = vnd_codecs->codec[i].cid;
+ caps.vid = vnd_codecs->codec[i].vid;
+ hci_read_codec_capabilities(hdev, vnd_codecs->codec[i].transport,
+ &caps);
+ }
+
+error:
+ kfree_skb(skb);
+}
diff --git a/net/bluetooth/hci_codec.h b/net/bluetooth/hci_codec.h
new file mode 100644
index 000000000000..a2751930f123
--- /dev/null
+++ b/net/bluetooth/hci_codec.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/* Copyright (C) 2014 Intel Corporation */
+
+void hci_read_supported_codecs(struct hci_dev *hdev);
+void hci_read_supported_codecs_v2(struct hci_dev *hdev);
+void hci_codec_list_clear(struct list_head *codec_list);
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index 6cd5711fa28a..c3f7828bf9d5 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -1,6 +1,7 @@
-/*
+/*
BlueZ - Bluetooth protocol stack for Linux
- Copyright (C) 2000-2001 Qualcomm Incorporated
+ Copyright (c) 2000-2001, 2010, Code Aurora Forum. All rights reserved.
+ Copyright 2023-2024 NXP
Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com>
@@ -12,273 +13,1273 @@
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
- CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
- WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
- COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+ ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+ COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
SOFTWARE IS DISCLAIMED.
*/
/* Bluetooth HCI connection handling. */
-#include <linux/module.h>
-
-#include <linux/types.h>
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/poll.h>
-#include <linux/fcntl.h>
-#include <linux/init.h>
-#include <linux/skbuff.h>
-#include <linux/interrupt.h>
-#include <linux/notifier.h>
-#include <net/sock.h>
-
-#include <asm/system.h>
-#include <asm/uaccess.h>
-#include <asm/unaligned.h>
+#include <linux/export.h>
+#include <linux/debugfs.h>
+#include <linux/errqueue.h>
#include <net/bluetooth/bluetooth.h>
#include <net/bluetooth/hci_core.h>
+#include <net/bluetooth/l2cap.h>
+#include <net/bluetooth/iso.h>
+#include <net/bluetooth/mgmt.h>
-#ifndef CONFIG_BT_HCI_CORE_DEBUG
-#undef BT_DBG
-#define BT_DBG(D...)
-#endif
+#include "smp.h"
+#include "eir.h"
-void hci_acl_connect(struct hci_conn *conn)
+struct sco_param {
+ u16 pkt_type;
+ u16 max_latency;
+ u8 retrans_effort;
+};
+
+struct conn_handle_t {
+ struct hci_conn *conn;
+ __u16 handle;
+};
+
+static const struct sco_param esco_param_cvsd[] = {
+ { EDR_ESCO_MASK & ~ESCO_2EV3, 0x000a, 0x01 }, /* S3 */
+ { EDR_ESCO_MASK & ~ESCO_2EV3, 0x0007, 0x01 }, /* S2 */
+ { EDR_ESCO_MASK | ESCO_EV3, 0x0007, 0x01 }, /* S1 */
+ { EDR_ESCO_MASK | ESCO_HV3, 0xffff, 0x01 }, /* D1 */
+ { EDR_ESCO_MASK | ESCO_HV1, 0xffff, 0x01 }, /* D0 */
+};
+
+static const struct sco_param sco_param_cvsd[] = {
+ { EDR_ESCO_MASK | ESCO_HV3, 0xffff, 0xff }, /* D1 */
+ { EDR_ESCO_MASK | ESCO_HV1, 0xffff, 0xff }, /* D0 */
+};
+
+static const struct sco_param esco_param_msbc[] = {
+ { EDR_ESCO_MASK & ~ESCO_2EV3, 0x000d, 0x02 }, /* T2 */
+ { EDR_ESCO_MASK | ESCO_EV3, 0x0008, 0x02 }, /* T1 */
+};
+
+/* This function requires the caller holds hdev->lock */
+void hci_connect_le_scan_cleanup(struct hci_conn *conn, u8 status)
{
+ struct hci_conn_params *params;
struct hci_dev *hdev = conn->hdev;
- struct inquiry_entry *ie;
- struct hci_cp_create_conn cp;
+ struct smp_irk *irk;
+ bdaddr_t *bdaddr;
+ u8 bdaddr_type;
+
+ bdaddr = &conn->dst;
+ bdaddr_type = conn->dst_type;
+
+ /* Check if we need to convert to identity address */
+ irk = hci_get_irk(hdev, bdaddr, bdaddr_type);
+ if (irk) {
+ bdaddr = &irk->bdaddr;
+ bdaddr_type = irk->addr_type;
+ }
- BT_DBG("%p", conn);
+ params = hci_pend_le_action_lookup(&hdev->pend_le_conns, bdaddr,
+ bdaddr_type);
+ if (!params)
+ return;
- conn->state = BT_CONNECT;
- conn->out = 1;
- conn->link_mode = HCI_LM_MASTER;
+ if (params->conn) {
+ hci_conn_drop(params->conn);
+ hci_conn_put(params->conn);
+ params->conn = NULL;
+ }
- conn->attempt++;
+ if (!params->explicit_connect)
+ return;
- memset(&cp, 0, sizeof(cp));
- bacpy(&cp.bdaddr, &conn->dst);
- cp.pscan_rep_mode = 0x02;
+ /* If the status indicates successful cancellation of
+ * the attempt (i.e. Unknown Connection Id) there's no point of
+ * notifying failure since we'll go back to keep trying to
+ * connect. The only exception is explicit connect requests
+ * where a timeout + cancel does indicate an actual failure.
+ */
+ if (status && status != HCI_ERROR_UNKNOWN_CONN_ID)
+ mgmt_connect_failed(hdev, conn, status);
+
+ /* The connection attempt was doing scan for new RPA, and is
+ * in scan phase. If params are not associated with any other
+ * autoconnect action, remove them completely. If they are, just unmark
+ * them as waiting for connection, by clearing explicit_connect field.
+ */
+ params->explicit_connect = false;
+
+ hci_pend_le_list_del_init(params);
+
+ switch (params->auto_connect) {
+ case HCI_AUTO_CONN_EXPLICIT:
+ hci_conn_params_del(hdev, bdaddr, bdaddr_type);
+ /* return instead of break to avoid duplicate scan update */
+ return;
+ case HCI_AUTO_CONN_DIRECT:
+ case HCI_AUTO_CONN_ALWAYS:
+ hci_pend_le_list_add(params, &hdev->pend_le_conns);
+ break;
+ case HCI_AUTO_CONN_REPORT:
+ hci_pend_le_list_add(params, &hdev->pend_le_reports);
+ break;
+ default:
+ break;
+ }
- if ((ie = hci_inquiry_cache_lookup(hdev, &conn->dst)) &&
- inquiry_entry_age(ie) <= INQUIRY_ENTRY_AGE_MAX) {
- cp.pscan_rep_mode = ie->data.pscan_rep_mode;
- cp.pscan_mode = ie->data.pscan_mode;
- cp.clock_offset = ie->data.clock_offset | __cpu_to_le16(0x8000);
- memcpy(conn->dev_class, ie->data.dev_class, 3);
+ hci_update_passive_scan(hdev);
+}
+
+static void hci_conn_cleanup(struct hci_conn *conn)
+{
+ struct hci_dev *hdev = conn->hdev;
+
+ if (test_bit(HCI_CONN_PARAM_REMOVAL_PEND, &conn->flags))
+ hci_conn_params_del(conn->hdev, &conn->dst, conn->dst_type);
+
+ if (test_and_clear_bit(HCI_CONN_FLUSH_KEY, &conn->flags))
+ hci_remove_link_key(hdev, &conn->dst);
+
+ hci_chan_list_flush(conn);
+
+ if (HCI_CONN_HANDLE_UNSET(conn->handle))
+ ida_free(&hdev->unset_handle_ida, conn->handle);
+
+ if (conn->cleanup)
+ conn->cleanup(conn);
+
+ if (conn->type == SCO_LINK || conn->type == ESCO_LINK) {
+ switch (conn->setting & SCO_AIRMODE_MASK) {
+ case SCO_AIRMODE_CVSD:
+ case SCO_AIRMODE_TRANSP:
+ if (hdev->notify)
+ hdev->notify(hdev, HCI_NOTIFY_DISABLE_SCO);
+ break;
+ }
+ } else {
+ if (hdev->notify)
+ hdev->notify(hdev, HCI_NOTIFY_CONN_DEL);
}
- cp.pkt_type = __cpu_to_le16(hdev->pkt_type & ACL_PTYPE_MASK);
- if (lmp_rswitch_capable(hdev) && !(hdev->link_mode & HCI_LM_MASTER))
- cp.role_switch = 0x01;
- else
- cp.role_switch = 0x00;
+ debugfs_remove_recursive(conn->debugfs);
- hci_send_cmd(hdev, OGF_LINK_CTL, OCF_CREATE_CONN, sizeof(cp), &cp);
+ hci_conn_del_sysfs(conn);
+
+ hci_dev_put(hdev);
}
-static void hci_acl_connect_cancel(struct hci_conn *conn)
+int hci_disconnect(struct hci_conn *conn, __u8 reason)
{
- struct hci_cp_create_conn_cancel cp;
+ BT_DBG("hcon %p", conn);
+
+ /* When we are central of an established connection and it enters
+ * the disconnect timeout, then go ahead and try to read the
+ * current clock offset. Processing of the result is done
+ * within the event handling and hci_clock_offset_evt function.
+ */
+ if (conn->type == ACL_LINK && conn->role == HCI_ROLE_MASTER &&
+ (conn->state == BT_CONNECTED || conn->state == BT_CONFIG)) {
+ struct hci_dev *hdev = conn->hdev;
+ struct hci_cp_read_clock_offset clkoff_cp;
+
+ clkoff_cp.handle = cpu_to_le16(conn->handle);
+ hci_send_cmd(hdev, HCI_OP_READ_CLOCK_OFFSET, sizeof(clkoff_cp),
+ &clkoff_cp);
+ }
- BT_DBG("%p", conn);
+ return hci_abort_conn(conn, reason);
+}
- if (conn->hdev->hci_ver < 2)
- return;
+static void hci_add_sco(struct hci_conn *conn, __u16 handle)
+{
+ struct hci_dev *hdev = conn->hdev;
+ struct hci_cp_add_sco cp;
- bacpy(&cp.bdaddr, &conn->dst);
- hci_send_cmd(conn->hdev, OGF_LINK_CTL,
- OCF_CREATE_CONN_CANCEL, sizeof(cp), &cp);
+ BT_DBG("hcon %p", conn);
+
+ conn->state = BT_CONNECT;
+ conn->out = true;
+
+ conn->attempt++;
+
+ cp.handle = cpu_to_le16(handle);
+ cp.pkt_type = cpu_to_le16(conn->pkt_type);
+
+ hci_send_cmd(hdev, HCI_OP_ADD_SCO, sizeof(cp), &cp);
}
-void hci_acl_disconn(struct hci_conn *conn, __u8 reason)
+static bool find_next_esco_param(struct hci_conn *conn,
+ const struct sco_param *esco_param, int size)
{
- struct hci_cp_disconnect cp;
+ if (!conn->parent)
+ return false;
- BT_DBG("%p", conn);
+ for (; conn->attempt <= size; conn->attempt++) {
+ if (lmp_esco_2m_capable(conn->parent) ||
+ (esco_param[conn->attempt - 1].pkt_type & ESCO_2EV3))
+ break;
+ BT_DBG("hcon %p skipped attempt %d, eSCO 2M not supported",
+ conn, conn->attempt);
+ }
+
+ return conn->attempt <= size;
+}
+
+static int configure_datapath_sync(struct hci_dev *hdev, struct bt_codec *codec)
+{
+ int err;
+ __u8 vnd_len, *vnd_data = NULL;
+ struct hci_op_configure_data_path *cmd = NULL;
+
+ /* Do not take below 2 checks as error since the 1st means user do not
+ * want to use HFP offload mode and the 2nd means the vendor controller
+ * do not need to send below HCI command for offload mode.
+ */
+ if (!codec->data_path || !hdev->get_codec_config_data)
+ return 0;
+
+ err = hdev->get_codec_config_data(hdev, ESCO_LINK, codec, &vnd_len,
+ &vnd_data);
+ if (err < 0)
+ goto error;
- conn->state = BT_DISCONN;
+ cmd = kzalloc(sizeof(*cmd) + vnd_len, GFP_KERNEL);
+ if (!cmd) {
+ err = -ENOMEM;
+ goto error;
+ }
+
+ err = hdev->get_data_path_id(hdev, &cmd->data_path_id);
+ if (err < 0)
+ goto error;
+
+ cmd->vnd_len = vnd_len;
+ memcpy(cmd->vnd_data, vnd_data, vnd_len);
+
+ cmd->direction = 0x00;
+ __hci_cmd_sync_status(hdev, HCI_CONFIGURE_DATA_PATH,
+ sizeof(*cmd) + vnd_len, cmd, HCI_CMD_TIMEOUT);
- cp.handle = __cpu_to_le16(conn->handle);
- cp.reason = reason;
- hci_send_cmd(conn->hdev, OGF_LINK_CTL,
- OCF_DISCONNECT, sizeof(cp), &cp);
+ cmd->direction = 0x01;
+ err = __hci_cmd_sync_status(hdev, HCI_CONFIGURE_DATA_PATH,
+ sizeof(*cmd) + vnd_len, cmd,
+ HCI_CMD_TIMEOUT);
+error:
+
+ kfree(cmd);
+ kfree(vnd_data);
+ return err;
}
-void hci_add_sco(struct hci_conn *conn, __u16 handle)
+static int hci_enhanced_setup_sync(struct hci_dev *hdev, void *data)
{
- struct hci_dev *hdev = conn->hdev;
- struct hci_cp_add_sco cp;
+ struct conn_handle_t *conn_handle = data;
+ struct hci_conn *conn = conn_handle->conn;
+ __u16 handle = conn_handle->handle;
+ struct hci_cp_enhanced_setup_sync_conn cp;
+ const struct sco_param *param;
- BT_DBG("%p", conn);
+ kfree(conn_handle);
+
+ if (!hci_conn_valid(hdev, conn))
+ return -ECANCELED;
+
+ bt_dev_dbg(hdev, "hcon %p", conn);
+
+ configure_datapath_sync(hdev, &conn->codec);
conn->state = BT_CONNECT;
- conn->out = 1;
+ conn->out = true;
+
+ conn->attempt++;
+
+ memset(&cp, 0x00, sizeof(cp));
+
+ cp.handle = cpu_to_le16(handle);
+
+ cp.tx_bandwidth = cpu_to_le32(0x00001f40);
+ cp.rx_bandwidth = cpu_to_le32(0x00001f40);
+
+ switch (conn->codec.id) {
+ case BT_CODEC_MSBC:
+ if (!find_next_esco_param(conn, esco_param_msbc,
+ ARRAY_SIZE(esco_param_msbc)))
+ return -EINVAL;
+
+ param = &esco_param_msbc[conn->attempt - 1];
+ cp.tx_coding_format.id = 0x05;
+ cp.rx_coding_format.id = 0x05;
+ cp.tx_codec_frame_size = __cpu_to_le16(60);
+ cp.rx_codec_frame_size = __cpu_to_le16(60);
+ cp.in_bandwidth = __cpu_to_le32(32000);
+ cp.out_bandwidth = __cpu_to_le32(32000);
+ cp.in_coding_format.id = 0x04;
+ cp.out_coding_format.id = 0x04;
+ cp.in_coded_data_size = __cpu_to_le16(16);
+ cp.out_coded_data_size = __cpu_to_le16(16);
+ cp.in_pcm_data_format = 2;
+ cp.out_pcm_data_format = 2;
+ cp.in_pcm_sample_payload_msb_pos = 0;
+ cp.out_pcm_sample_payload_msb_pos = 0;
+ cp.in_data_path = conn->codec.data_path;
+ cp.out_data_path = conn->codec.data_path;
+ cp.in_transport_unit_size = 1;
+ cp.out_transport_unit_size = 1;
+ break;
+
+ case BT_CODEC_TRANSPARENT:
+ if (!find_next_esco_param(conn, esco_param_msbc,
+ ARRAY_SIZE(esco_param_msbc)))
+ return -EINVAL;
+
+ param = &esco_param_msbc[conn->attempt - 1];
+ cp.tx_coding_format.id = 0x03;
+ cp.rx_coding_format.id = 0x03;
+ cp.tx_codec_frame_size = __cpu_to_le16(60);
+ cp.rx_codec_frame_size = __cpu_to_le16(60);
+ cp.in_bandwidth = __cpu_to_le32(0x1f40);
+ cp.out_bandwidth = __cpu_to_le32(0x1f40);
+ cp.in_coding_format.id = 0x03;
+ cp.out_coding_format.id = 0x03;
+ cp.in_coded_data_size = __cpu_to_le16(16);
+ cp.out_coded_data_size = __cpu_to_le16(16);
+ cp.in_pcm_data_format = 2;
+ cp.out_pcm_data_format = 2;
+ cp.in_pcm_sample_payload_msb_pos = 0;
+ cp.out_pcm_sample_payload_msb_pos = 0;
+ cp.in_data_path = conn->codec.data_path;
+ cp.out_data_path = conn->codec.data_path;
+ cp.in_transport_unit_size = 1;
+ cp.out_transport_unit_size = 1;
+ break;
+
+ case BT_CODEC_CVSD:
+ if (conn->parent && lmp_esco_capable(conn->parent)) {
+ if (!find_next_esco_param(conn, esco_param_cvsd,
+ ARRAY_SIZE(esco_param_cvsd)))
+ return -EINVAL;
+ param = &esco_param_cvsd[conn->attempt - 1];
+ } else {
+ if (conn->attempt > ARRAY_SIZE(sco_param_cvsd))
+ return -EINVAL;
+ param = &sco_param_cvsd[conn->attempt - 1];
+ }
+ cp.tx_coding_format.id = 2;
+ cp.rx_coding_format.id = 2;
+ cp.tx_codec_frame_size = __cpu_to_le16(60);
+ cp.rx_codec_frame_size = __cpu_to_le16(60);
+ cp.in_bandwidth = __cpu_to_le32(16000);
+ cp.out_bandwidth = __cpu_to_le32(16000);
+ cp.in_coding_format.id = 4;
+ cp.out_coding_format.id = 4;
+ cp.in_coded_data_size = __cpu_to_le16(16);
+ cp.out_coded_data_size = __cpu_to_le16(16);
+ cp.in_pcm_data_format = 2;
+ cp.out_pcm_data_format = 2;
+ cp.in_pcm_sample_payload_msb_pos = 0;
+ cp.out_pcm_sample_payload_msb_pos = 0;
+ cp.in_data_path = conn->codec.data_path;
+ cp.out_data_path = conn->codec.data_path;
+ cp.in_transport_unit_size = 16;
+ cp.out_transport_unit_size = 16;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ cp.retrans_effort = param->retrans_effort;
+ cp.pkt_type = __cpu_to_le16(param->pkt_type);
+ cp.max_latency = __cpu_to_le16(param->max_latency);
- cp.pkt_type = __cpu_to_le16(hdev->pkt_type & SCO_PTYPE_MASK);
- cp.handle = __cpu_to_le16(handle);
+ if (hci_send_cmd(hdev, HCI_OP_ENHANCED_SETUP_SYNC_CONN, sizeof(cp), &cp) < 0)
+ return -EIO;
- hci_send_cmd(hdev, OGF_LINK_CTL, OCF_ADD_SCO, sizeof(cp), &cp);
+ return 0;
}
-static void hci_conn_timeout(unsigned long arg)
+static bool hci_setup_sync_conn(struct hci_conn *conn, __u16 handle)
{
- struct hci_conn *conn = (void *) arg;
struct hci_dev *hdev = conn->hdev;
+ struct hci_cp_setup_sync_conn cp;
+ const struct sco_param *param;
- BT_DBG("conn %p state %d", conn, conn->state);
+ bt_dev_dbg(hdev, "hcon %p", conn);
- if (atomic_read(&conn->refcnt))
- return;
+ conn->state = BT_CONNECT;
+ conn->out = true;
- hci_dev_lock(hdev);
+ conn->attempt++;
+
+ cp.handle = cpu_to_le16(handle);
- switch (conn->state) {
- case BT_CONNECT:
- hci_acl_connect_cancel(conn);
+ cp.tx_bandwidth = cpu_to_le32(0x00001f40);
+ cp.rx_bandwidth = cpu_to_le32(0x00001f40);
+ cp.voice_setting = cpu_to_le16(conn->setting);
+
+ switch (conn->setting & SCO_AIRMODE_MASK) {
+ case SCO_AIRMODE_TRANSP:
+ if (!find_next_esco_param(conn, esco_param_msbc,
+ ARRAY_SIZE(esco_param_msbc)))
+ return false;
+ param = &esco_param_msbc[conn->attempt - 1];
break;
- case BT_CONNECTED:
- hci_acl_disconn(conn, 0x13);
+ case SCO_AIRMODE_CVSD:
+ if (conn->parent && lmp_esco_capable(conn->parent)) {
+ if (!find_next_esco_param(conn, esco_param_cvsd,
+ ARRAY_SIZE(esco_param_cvsd)))
+ return false;
+ param = &esco_param_cvsd[conn->attempt - 1];
+ } else {
+ if (conn->attempt > ARRAY_SIZE(sco_param_cvsd))
+ return false;
+ param = &sco_param_cvsd[conn->attempt - 1];
+ }
break;
default:
- conn->state = BT_CLOSED;
- break;
+ return false;
+ }
+
+ cp.retrans_effort = param->retrans_effort;
+ cp.pkt_type = __cpu_to_le16(param->pkt_type);
+ cp.max_latency = __cpu_to_le16(param->max_latency);
+
+ if (hci_send_cmd(hdev, HCI_OP_SETUP_SYNC_CONN, sizeof(cp), &cp) < 0)
+ return false;
+
+ return true;
+}
+
+bool hci_setup_sync(struct hci_conn *conn, __u16 handle)
+{
+ int result;
+ struct conn_handle_t *conn_handle;
+
+ if (enhanced_sync_conn_capable(conn->hdev)) {
+ conn_handle = kzalloc(sizeof(*conn_handle), GFP_KERNEL);
+
+ if (!conn_handle)
+ return false;
+
+ conn_handle->conn = conn;
+ conn_handle->handle = handle;
+ result = hci_cmd_sync_queue(conn->hdev, hci_enhanced_setup_sync,
+ conn_handle, NULL);
+ if (result < 0)
+ kfree(conn_handle);
+
+ return result == 0;
+ }
+
+ return hci_setup_sync_conn(conn, handle);
+}
+
+u8 hci_le_conn_update(struct hci_conn *conn, u16 min, u16 max, u16 latency,
+ u16 to_multiplier)
+{
+ struct hci_dev *hdev = conn->hdev;
+ struct hci_conn_params *params;
+ struct hci_cp_le_conn_update cp;
+
+ hci_dev_lock(hdev);
+
+ params = hci_conn_params_lookup(hdev, &conn->dst, conn->dst_type);
+ if (params) {
+ params->conn_min_interval = min;
+ params->conn_max_interval = max;
+ params->conn_latency = latency;
+ params->supervision_timeout = to_multiplier;
}
hci_dev_unlock(hdev);
+
+ memset(&cp, 0, sizeof(cp));
+ cp.handle = cpu_to_le16(conn->handle);
+ cp.conn_interval_min = cpu_to_le16(min);
+ cp.conn_interval_max = cpu_to_le16(max);
+ cp.conn_latency = cpu_to_le16(latency);
+ cp.supervision_timeout = cpu_to_le16(to_multiplier);
+ cp.min_ce_len = cpu_to_le16(0x0000);
+ cp.max_ce_len = cpu_to_le16(0x0000);
+
+ hci_send_cmd(hdev, HCI_OP_LE_CONN_UPDATE, sizeof(cp), &cp);
+
+ if (params)
+ return 0x01;
+
+ return 0x00;
}
-static void hci_conn_idle(unsigned long arg)
+void hci_le_start_enc(struct hci_conn *conn, __le16 ediv, __le64 rand,
+ __u8 ltk[16], __u8 key_size)
{
- struct hci_conn *conn = (void *) arg;
+ struct hci_dev *hdev = conn->hdev;
+ struct hci_cp_le_start_enc cp;
- BT_DBG("conn %p mode %d", conn, conn->mode);
+ BT_DBG("hcon %p", conn);
+
+ memset(&cp, 0, sizeof(cp));
- hci_conn_enter_sniff_mode(conn);
+ cp.handle = cpu_to_le16(conn->handle);
+ cp.rand = rand;
+ cp.ediv = ediv;
+ memcpy(cp.ltk, ltk, key_size);
+
+ hci_send_cmd(hdev, HCI_OP_LE_START_ENC, sizeof(cp), &cp);
}
-struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst)
+/* Device _must_ be locked */
+void hci_sco_setup(struct hci_conn *conn, __u8 status)
+{
+ struct hci_link *link;
+
+ link = list_first_entry_or_null(&conn->link_list, struct hci_link, list);
+ if (!link || !link->conn)
+ return;
+
+ BT_DBG("hcon %p", conn);
+
+ if (!status) {
+ if (lmp_esco_capable(conn->hdev))
+ hci_setup_sync(link->conn, conn->handle);
+ else
+ hci_add_sco(link->conn, conn->handle);
+ } else {
+ hci_connect_cfm(link->conn, status);
+ hci_conn_del(link->conn);
+ }
+}
+
+static void hci_conn_timeout(struct work_struct *work)
+{
+ struct hci_conn *conn = container_of(work, struct hci_conn,
+ disc_work.work);
+ int refcnt = atomic_read(&conn->refcnt);
+
+ BT_DBG("hcon %p state %s", conn, state_to_string(conn->state));
+
+ WARN_ON(refcnt < 0);
+
+ /* FIXME: It was observed that in pairing failed scenario, refcnt
+ * drops below 0. Probably this is because l2cap_conn_del calls
+ * l2cap_chan_del for each channel, and inside l2cap_chan_del conn is
+ * dropped. After that loop hci_chan_del is called which also drops
+ * conn. For now make sure that ACL is alive if refcnt is higher then 0,
+ * otherwise drop it.
+ */
+ if (refcnt > 0)
+ return;
+
+ hci_abort_conn(conn, hci_proto_disconn_ind(conn));
+}
+
+/* Enter sniff mode */
+static void hci_conn_idle(struct work_struct *work)
+{
+ struct hci_conn *conn = container_of(work, struct hci_conn,
+ idle_work.work);
+ struct hci_dev *hdev = conn->hdev;
+
+ BT_DBG("hcon %p mode %d", conn, conn->mode);
+
+ if (!lmp_sniff_capable(hdev) || !lmp_sniff_capable(conn))
+ return;
+
+ if (conn->mode != HCI_CM_ACTIVE || !(conn->link_policy & HCI_LP_SNIFF))
+ return;
+
+ if (lmp_sniffsubr_capable(hdev) && lmp_sniffsubr_capable(conn)) {
+ struct hci_cp_sniff_subrate cp;
+ cp.handle = cpu_to_le16(conn->handle);
+ cp.max_latency = cpu_to_le16(0);
+ cp.min_remote_timeout = cpu_to_le16(0);
+ cp.min_local_timeout = cpu_to_le16(0);
+ hci_send_cmd(hdev, HCI_OP_SNIFF_SUBRATE, sizeof(cp), &cp);
+ }
+
+ if (!test_and_set_bit(HCI_CONN_MODE_CHANGE_PEND, &conn->flags)) {
+ struct hci_cp_sniff_mode cp;
+ cp.handle = cpu_to_le16(conn->handle);
+ cp.max_interval = cpu_to_le16(hdev->sniff_max_interval);
+ cp.min_interval = cpu_to_le16(hdev->sniff_min_interval);
+ cp.attempt = cpu_to_le16(4);
+ cp.timeout = cpu_to_le16(1);
+ hci_send_cmd(hdev, HCI_OP_SNIFF_MODE, sizeof(cp), &cp);
+ }
+}
+
+static void hci_conn_auto_accept(struct work_struct *work)
+{
+ struct hci_conn *conn = container_of(work, struct hci_conn,
+ auto_accept_work.work);
+
+ hci_send_cmd(conn->hdev, HCI_OP_USER_CONFIRM_REPLY, sizeof(conn->dst),
+ &conn->dst);
+}
+
+static void le_disable_advertising(struct hci_dev *hdev)
+{
+ if (ext_adv_capable(hdev)) {
+ struct hci_cp_le_set_ext_adv_enable cp;
+
+ cp.enable = 0x00;
+ cp.num_of_sets = 0x00;
+
+ hci_send_cmd(hdev, HCI_OP_LE_SET_EXT_ADV_ENABLE, sizeof(cp),
+ &cp);
+ } else {
+ u8 enable = 0x00;
+ hci_send_cmd(hdev, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable),
+ &enable);
+ }
+}
+
+static void le_conn_timeout(struct work_struct *work)
+{
+ struct hci_conn *conn = container_of(work, struct hci_conn,
+ le_conn_timeout.work);
+ struct hci_dev *hdev = conn->hdev;
+
+ BT_DBG("");
+
+ /* We could end up here due to having done directed advertising,
+ * so clean up the state if necessary. This should however only
+ * happen with broken hardware or if low duty cycle was used
+ * (which doesn't have a timeout of its own).
+ */
+ if (conn->role == HCI_ROLE_SLAVE) {
+ /* Disable LE Advertising */
+ le_disable_advertising(hdev);
+ hci_dev_lock(hdev);
+ hci_conn_failed(conn, HCI_ERROR_ADVERTISING_TIMEOUT);
+ hci_dev_unlock(hdev);
+ return;
+ }
+
+ hci_abort_conn(conn, HCI_ERROR_REMOTE_USER_TERM);
+}
+
+struct iso_list_data {
+ union {
+ u8 cig;
+ u8 big;
+ };
+ union {
+ u8 cis;
+ u8 bis;
+ u16 sync_handle;
+ };
+ int count;
+ bool big_term;
+ bool pa_sync_term;
+ bool big_sync_term;
+};
+
+static void bis_list(struct hci_conn *conn, void *data)
+{
+ struct iso_list_data *d = data;
+
+ /* Skip if not broadcast/ANY address */
+ if (bacmp(&conn->dst, BDADDR_ANY))
+ return;
+
+ if (d->big != conn->iso_qos.bcast.big || d->bis == BT_ISO_QOS_BIS_UNSET ||
+ d->bis != conn->iso_qos.bcast.bis)
+ return;
+
+ d->count++;
+}
+
+static int terminate_big_sync(struct hci_dev *hdev, void *data)
+{
+ struct iso_list_data *d = data;
+
+ bt_dev_dbg(hdev, "big 0x%2.2x bis 0x%2.2x", d->big, d->bis);
+
+ hci_disable_per_advertising_sync(hdev, d->bis);
+ hci_remove_ext_adv_instance_sync(hdev, d->bis, NULL);
+
+ /* Only terminate BIG if it has been created */
+ if (!d->big_term)
+ return 0;
+
+ return hci_le_terminate_big_sync(hdev, d->big,
+ HCI_ERROR_LOCAL_HOST_TERM);
+}
+
+static void terminate_big_destroy(struct hci_dev *hdev, void *data, int err)
+{
+ kfree(data);
+}
+
+static int hci_le_terminate_big(struct hci_dev *hdev, struct hci_conn *conn)
+{
+ struct iso_list_data *d;
+ int ret;
+
+ bt_dev_dbg(hdev, "big 0x%2.2x bis 0x%2.2x", conn->iso_qos.bcast.big,
+ conn->iso_qos.bcast.bis);
+
+ d = kzalloc(sizeof(*d), GFP_KERNEL);
+ if (!d)
+ return -ENOMEM;
+
+ d->big = conn->iso_qos.bcast.big;
+ d->bis = conn->iso_qos.bcast.bis;
+ d->big_term = test_and_clear_bit(HCI_CONN_BIG_CREATED, &conn->flags);
+
+ ret = hci_cmd_sync_queue(hdev, terminate_big_sync, d,
+ terminate_big_destroy);
+ if (ret)
+ kfree(d);
+
+ return ret;
+}
+
+static int big_terminate_sync(struct hci_dev *hdev, void *data)
+{
+ struct iso_list_data *d = data;
+
+ bt_dev_dbg(hdev, "big 0x%2.2x sync_handle 0x%4.4x", d->big,
+ d->sync_handle);
+
+ if (d->big_sync_term)
+ hci_le_big_terminate_sync(hdev, d->big);
+
+ if (d->pa_sync_term)
+ return hci_le_pa_terminate_sync(hdev, d->sync_handle);
+
+ return 0;
+}
+
+static void find_bis(struct hci_conn *conn, void *data)
+{
+ struct iso_list_data *d = data;
+
+ /* Ignore if BIG doesn't match */
+ if (d->big != conn->iso_qos.bcast.big)
+ return;
+
+ d->count++;
+}
+
+static int hci_le_big_terminate(struct hci_dev *hdev, struct hci_conn *conn)
+{
+ struct iso_list_data *d;
+ int ret;
+
+ bt_dev_dbg(hdev, "hcon %p big 0x%2.2x sync_handle 0x%4.4x", conn,
+ conn->iso_qos.bcast.big, conn->sync_handle);
+
+ d = kzalloc(sizeof(*d), GFP_KERNEL);
+ if (!d)
+ return -ENOMEM;
+
+ d->big = conn->iso_qos.bcast.big;
+ d->sync_handle = conn->sync_handle;
+
+ if (conn->type == PA_LINK &&
+ test_and_clear_bit(HCI_CONN_PA_SYNC, &conn->flags)) {
+ hci_conn_hash_list_flag(hdev, find_bis, PA_LINK,
+ HCI_CONN_PA_SYNC, d);
+
+ if (!d->count)
+ d->pa_sync_term = true;
+
+ d->count = 0;
+ }
+
+ if (test_and_clear_bit(HCI_CONN_BIG_SYNC, &conn->flags)) {
+ hci_conn_hash_list_flag(hdev, find_bis, BIS_LINK,
+ HCI_CONN_BIG_SYNC, d);
+
+ if (!d->count)
+ d->big_sync_term = true;
+ }
+
+ if (!d->pa_sync_term && !d->big_sync_term)
+ return 0;
+
+ ret = hci_cmd_sync_queue(hdev, big_terminate_sync, d,
+ terminate_big_destroy);
+ if (ret)
+ kfree(d);
+
+ return ret;
+}
+
+/* Cleanup BIS connection
+ *
+ * Detects if there any BIS left connected in a BIG
+ * broadcaster: Remove advertising instance and terminate BIG.
+ * broadcaster receiver: Terminate BIG sync and terminate PA sync.
+ */
+static void bis_cleanup(struct hci_conn *conn)
+{
+ struct hci_dev *hdev = conn->hdev;
+ struct hci_conn *bis;
+
+ bt_dev_dbg(hdev, "conn %p", conn);
+
+ if (conn->role == HCI_ROLE_MASTER) {
+ if (!test_and_clear_bit(HCI_CONN_PER_ADV, &conn->flags))
+ return;
+
+ /* Check if ISO connection is a BIS and terminate advertising
+ * set and BIG if there are no other connections using it.
+ */
+ bis = hci_conn_hash_lookup_big_state(hdev,
+ conn->iso_qos.bcast.big,
+ BT_CONNECTED,
+ HCI_ROLE_MASTER);
+ if (bis)
+ return;
+
+ bis = hci_conn_hash_lookup_big_state(hdev,
+ conn->iso_qos.bcast.big,
+ BT_CONNECT,
+ HCI_ROLE_MASTER);
+ if (bis)
+ return;
+
+ bis = hci_conn_hash_lookup_big_state(hdev,
+ conn->iso_qos.bcast.big,
+ BT_OPEN,
+ HCI_ROLE_MASTER);
+ if (bis)
+ return;
+
+ hci_le_terminate_big(hdev, conn);
+ } else {
+ hci_le_big_terminate(hdev, conn);
+ }
+}
+
+static int remove_cig_sync(struct hci_dev *hdev, void *data)
+{
+ u8 handle = PTR_UINT(data);
+
+ return hci_le_remove_cig_sync(hdev, handle);
+}
+
+static int hci_le_remove_cig(struct hci_dev *hdev, u8 handle)
+{
+ bt_dev_dbg(hdev, "handle 0x%2.2x", handle);
+
+ return hci_cmd_sync_queue(hdev, remove_cig_sync, UINT_PTR(handle),
+ NULL);
+}
+
+static void find_cis(struct hci_conn *conn, void *data)
+{
+ struct iso_list_data *d = data;
+
+ /* Ignore broadcast or if CIG don't match */
+ if (!bacmp(&conn->dst, BDADDR_ANY) || d->cig != conn->iso_qos.ucast.cig)
+ return;
+
+ d->count++;
+}
+
+/* Cleanup CIS connection:
+ *
+ * Detects if there any CIS left connected in a CIG and remove it.
+ */
+static void cis_cleanup(struct hci_conn *conn)
+{
+ struct hci_dev *hdev = conn->hdev;
+ struct iso_list_data d;
+
+ if (conn->iso_qos.ucast.cig == BT_ISO_QOS_CIG_UNSET)
+ return;
+
+ memset(&d, 0, sizeof(d));
+ d.cig = conn->iso_qos.ucast.cig;
+
+ /* Check if ISO connection is a CIS and remove CIG if there are
+ * no other connections using it.
+ */
+ hci_conn_hash_list_state(hdev, find_cis, CIS_LINK, BT_BOUND, &d);
+ hci_conn_hash_list_state(hdev, find_cis, CIS_LINK, BT_CONNECT,
+ &d);
+ hci_conn_hash_list_state(hdev, find_cis, CIS_LINK, BT_CONNECTED,
+ &d);
+ if (d.count)
+ return;
+
+ hci_le_remove_cig(hdev, conn->iso_qos.ucast.cig);
+}
+
+static int hci_conn_hash_alloc_unset(struct hci_dev *hdev)
+{
+ return ida_alloc_range(&hdev->unset_handle_ida, HCI_CONN_HANDLE_MAX + 1,
+ U16_MAX, GFP_ATOMIC);
+}
+
+static struct hci_conn *__hci_conn_add(struct hci_dev *hdev, int type,
+ bdaddr_t *dst, u8 dst_type,
+ u8 role, u16 handle)
{
struct hci_conn *conn;
+ struct smp_irk *irk = NULL;
- BT_DBG("%s dst %s", hdev->name, batostr(dst));
+ switch (type) {
+ case ACL_LINK:
+ if (!hdev->acl_mtu)
+ return ERR_PTR(-ECONNREFUSED);
+ break;
+ case CIS_LINK:
+ case BIS_LINK:
+ case PA_LINK:
+ if (!hdev->iso_mtu)
+ return ERR_PTR(-ECONNREFUSED);
+ irk = hci_get_irk(hdev, dst, dst_type);
+ break;
+ case LE_LINK:
+ if (hdev->le_mtu && hdev->le_mtu < HCI_MIN_LE_MTU)
+ return ERR_PTR(-ECONNREFUSED);
+ if (!hdev->le_mtu && hdev->acl_mtu < HCI_MIN_LE_MTU)
+ return ERR_PTR(-ECONNREFUSED);
+ irk = hci_get_irk(hdev, dst, dst_type);
+ break;
+ case SCO_LINK:
+ case ESCO_LINK:
+ if (!hdev->sco_pkts)
+ /* Controller does not support SCO or eSCO over HCI */
+ return ERR_PTR(-ECONNREFUSED);
+ break;
+ default:
+ return ERR_PTR(-ECONNREFUSED);
+ }
+
+ bt_dev_dbg(hdev, "dst %pMR handle 0x%4.4x", dst, handle);
- conn = kzalloc(sizeof(struct hci_conn), GFP_ATOMIC);
+ conn = kzalloc(sizeof(*conn), GFP_KERNEL);
if (!conn)
- return NULL;
+ return ERR_PTR(-ENOMEM);
- bacpy(&conn->dst, dst);
- conn->hdev = hdev;
- conn->type = type;
- conn->mode = HCI_CM_ACTIVE;
- conn->state = BT_OPEN;
+ /* If and IRK exists use its identity address */
+ if (!irk) {
+ bacpy(&conn->dst, dst);
+ conn->dst_type = dst_type;
+ } else {
+ bacpy(&conn->dst, &irk->bdaddr);
+ conn->dst_type = irk->addr_type;
+ }
+
+ bacpy(&conn->src, &hdev->bdaddr);
+ conn->handle = handle;
+ conn->hdev = hdev;
+ conn->type = type;
+ conn->role = role;
+ conn->mode = HCI_CM_ACTIVE;
+ conn->state = BT_OPEN;
+ conn->auth_type = HCI_AT_GENERAL_BONDING;
+ conn->io_capability = hdev->io_capability;
+ conn->remote_auth = 0xff;
+ conn->key_type = 0xff;
+ conn->rssi = HCI_RSSI_INVALID;
+ conn->tx_power = HCI_TX_POWER_INVALID;
+ conn->max_tx_power = HCI_TX_POWER_INVALID;
+ conn->sync_handle = HCI_SYNC_HANDLE_INVALID;
+ conn->sid = HCI_SID_INVALID;
+
+ set_bit(HCI_CONN_POWER_SAVE, &conn->flags);
+ conn->disc_timeout = HCI_DISCONN_TIMEOUT;
+
+ /* Set Default Authenticated payload timeout to 30s */
+ conn->auth_payload_timeout = DEFAULT_AUTH_PAYLOAD_TIMEOUT;
+
+ if (conn->role == HCI_ROLE_MASTER)
+ conn->out = true;
+
+ switch (type) {
+ case ACL_LINK:
+ conn->pkt_type = hdev->pkt_type & ACL_PTYPE_MASK;
+ conn->mtu = hdev->acl_mtu;
+ break;
+ case LE_LINK:
+ /* conn->src should reflect the local identity address */
+ hci_copy_identity_address(hdev, &conn->src, &conn->src_type);
+ conn->mtu = hdev->le_mtu ? hdev->le_mtu : hdev->acl_mtu;
+ break;
+ case CIS_LINK:
+ /* conn->src should reflect the local identity address */
+ hci_copy_identity_address(hdev, &conn->src, &conn->src_type);
- conn->power_save = 1;
+ if (conn->role == HCI_ROLE_MASTER)
+ conn->cleanup = cis_cleanup;
+
+ conn->mtu = hdev->iso_mtu;
+ break;
+ case PA_LINK:
+ case BIS_LINK:
+ /* conn->src should reflect the local identity address */
+ hci_copy_identity_address(hdev, &conn->src, &conn->src_type);
+ conn->cleanup = bis_cleanup;
+ conn->mtu = hdev->iso_mtu;
+ break;
+ case SCO_LINK:
+ if (lmp_esco_capable(hdev))
+ conn->pkt_type = (hdev->esco_type & SCO_ESCO_MASK) |
+ (hdev->esco_type & EDR_ESCO_MASK);
+ else
+ conn->pkt_type = hdev->pkt_type & SCO_PTYPE_MASK;
+
+ conn->mtu = hdev->sco_mtu;
+ break;
+ case ESCO_LINK:
+ conn->pkt_type = hdev->esco_type & ~EDR_ESCO_MASK;
+ conn->mtu = hdev->sco_mtu;
+ break;
+ }
skb_queue_head_init(&conn->data_q);
+ skb_queue_head_init(&conn->tx_q.queue);
- init_timer(&conn->disc_timer);
- conn->disc_timer.function = hci_conn_timeout;
- conn->disc_timer.data = (unsigned long) conn;
+ INIT_LIST_HEAD(&conn->chan_list);
+ INIT_LIST_HEAD(&conn->link_list);
- init_timer(&conn->idle_timer);
- conn->idle_timer.function = hci_conn_idle;
- conn->idle_timer.data = (unsigned long) conn;
+ INIT_DELAYED_WORK(&conn->disc_work, hci_conn_timeout);
+ INIT_DELAYED_WORK(&conn->auto_accept_work, hci_conn_auto_accept);
+ INIT_DELAYED_WORK(&conn->idle_work, hci_conn_idle);
+ INIT_DELAYED_WORK(&conn->le_conn_timeout, le_conn_timeout);
atomic_set(&conn->refcnt, 0);
hci_dev_hold(hdev);
- tasklet_disable(&hdev->tx_task);
-
hci_conn_hash_add(hdev, conn);
- if (hdev->notify)
- hdev->notify(hdev, HCI_NOTIFY_CONN_ADD);
- hci_conn_add_sysfs(conn);
-
- tasklet_enable(&hdev->tx_task);
+ /* The SCO and eSCO connections will only be notified when their
+ * setup has been completed. This is different to ACL links which
+ * can be notified right away.
+ */
+ if (conn->type != SCO_LINK && conn->type != ESCO_LINK) {
+ if (hdev->notify)
+ hdev->notify(hdev, HCI_NOTIFY_CONN_ADD);
+ }
+ hci_conn_init_sysfs(conn);
return conn;
}
-int hci_conn_del(struct hci_conn *conn)
+struct hci_conn *hci_conn_add_unset(struct hci_dev *hdev, int type,
+ bdaddr_t *dst, u8 dst_type, u8 role)
+{
+ int handle;
+
+ bt_dev_dbg(hdev, "dst %pMR", dst);
+
+ handle = hci_conn_hash_alloc_unset(hdev);
+ if (unlikely(handle < 0))
+ return ERR_PTR(-ECONNREFUSED);
+
+ return __hci_conn_add(hdev, type, dst, dst_type, role, handle);
+}
+
+struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst,
+ u8 dst_type, u8 role, u16 handle)
+{
+ if (handle > HCI_CONN_HANDLE_MAX)
+ return ERR_PTR(-EINVAL);
+
+ return __hci_conn_add(hdev, type, dst, dst_type, role, handle);
+}
+
+static void hci_conn_cleanup_child(struct hci_conn *conn, u8 reason)
+{
+ if (!reason)
+ reason = HCI_ERROR_REMOTE_USER_TERM;
+
+ /* Due to race, SCO/ISO conn might be not established yet at this point,
+ * and nothing else will clean it up. In other cases it is done via HCI
+ * events.
+ */
+ switch (conn->type) {
+ case SCO_LINK:
+ case ESCO_LINK:
+ if (HCI_CONN_HANDLE_UNSET(conn->handle))
+ hci_conn_failed(conn, reason);
+ break;
+ case CIS_LINK:
+ case BIS_LINK:
+ case PA_LINK:
+ if ((conn->state != BT_CONNECTED &&
+ !test_bit(HCI_CONN_CREATE_CIS, &conn->flags)) ||
+ test_bit(HCI_CONN_BIG_CREATED, &conn->flags))
+ hci_conn_failed(conn, reason);
+ break;
+ }
+}
+
+static void hci_conn_unlink(struct hci_conn *conn)
{
struct hci_dev *hdev = conn->hdev;
- BT_DBG("%s conn %p handle %d", hdev->name, conn, conn->handle);
+ bt_dev_dbg(hdev, "hcon %p", conn);
- del_timer(&conn->idle_timer);
+ if (!conn->parent) {
+ struct hci_link *link, *t;
- del_timer(&conn->disc_timer);
+ list_for_each_entry_safe(link, t, &conn->link_list, list) {
+ struct hci_conn *child = link->conn;
- if (conn->type == SCO_LINK) {
- struct hci_conn *acl = conn->link;
- if (acl) {
- acl->link = NULL;
- hci_conn_put(acl);
+ hci_conn_unlink(child);
+
+ /* If hdev is down it means
+ * hci_dev_close_sync/hci_conn_hash_flush is in progress
+ * and links don't need to be cleanup as all connections
+ * would be cleanup.
+ */
+ if (!test_bit(HCI_UP, &hdev->flags))
+ continue;
+
+ hci_conn_cleanup_child(child, conn->abort_reason);
}
- } else {
- struct hci_conn *sco = conn->link;
- if (sco)
- sco->link = NULL;
- /* Unacked frames */
- hdev->acl_cnt += conn->sent;
+ return;
}
- tasklet_disable(&hdev->tx_task);
+ if (!conn->link)
+ return;
- hci_conn_del_sysfs(conn);
+ list_del_rcu(&conn->link->list);
+ synchronize_rcu();
+
+ hci_conn_drop(conn->parent);
+ hci_conn_put(conn->parent);
+ conn->parent = NULL;
+ kfree(conn->link);
+ conn->link = NULL;
+}
+
+void hci_conn_del(struct hci_conn *conn)
+{
+ struct hci_dev *hdev = conn->hdev;
+
+ BT_DBG("%s hcon %p handle %d", hdev->name, conn, conn->handle);
+
+ hci_conn_unlink(conn);
+
+ disable_delayed_work_sync(&conn->disc_work);
+ disable_delayed_work_sync(&conn->auto_accept_work);
+ disable_delayed_work_sync(&conn->idle_work);
+
+ /* Remove the connection from the list so unacked logic can detect when
+ * a certain pool is not being utilized.
+ */
hci_conn_hash_del(hdev, conn);
- if (hdev->notify)
- hdev->notify(hdev, HCI_NOTIFY_CONN_DEL);
- tasklet_enable(&hdev->tx_task);
+ /* Handle unacked frames:
+ *
+ * - In case there are no connection, or if restoring the buffers
+ * considered in transist would overflow, restore all buffers to the
+ * pool.
+ * - Otherwise restore just the buffers considered in transit for the
+ * hci_conn
+ */
+ switch (conn->type) {
+ case ACL_LINK:
+ if (!hci_conn_num(hdev, ACL_LINK) ||
+ hdev->acl_cnt + conn->sent > hdev->acl_pkts)
+ hdev->acl_cnt = hdev->acl_pkts;
+ else
+ hdev->acl_cnt += conn->sent;
+ break;
+ case LE_LINK:
+ cancel_delayed_work(&conn->le_conn_timeout);
+
+ if (hdev->le_pkts) {
+ if (!hci_conn_num(hdev, LE_LINK) ||
+ hdev->le_cnt + conn->sent > hdev->le_pkts)
+ hdev->le_cnt = hdev->le_pkts;
+ else
+ hdev->le_cnt += conn->sent;
+ } else {
+ if ((!hci_conn_num(hdev, LE_LINK) &&
+ !hci_conn_num(hdev, ACL_LINK)) ||
+ hdev->acl_cnt + conn->sent > hdev->acl_pkts)
+ hdev->acl_cnt = hdev->acl_pkts;
+ else
+ hdev->acl_cnt += conn->sent;
+ }
+ break;
+ case CIS_LINK:
+ case BIS_LINK:
+ case PA_LINK:
+ if (!hci_iso_count(hdev) ||
+ hdev->iso_cnt + conn->sent > hdev->iso_pkts)
+ hdev->iso_cnt = hdev->iso_pkts;
+ else
+ hdev->iso_cnt += conn->sent;
+ break;
+ }
skb_queue_purge(&conn->data_q);
+ skb_queue_purge(&conn->tx_q.queue);
- hci_dev_put(hdev);
-
- /* will free via device release */
- put_device(&conn->dev);
+ /* Remove the connection from the list and cleanup its remaining
+ * state. This is a separate function since for some cases like
+ * BT_CONNECT_SCAN we *only* want the cleanup part without the
+ * rest of hci_conn_del.
+ */
+ hci_conn_cleanup(conn);
- return 0;
+ /* Dequeue callbacks using connection pointer as data */
+ hci_cmd_sync_dequeue(hdev, NULL, conn, NULL);
}
-struct hci_dev *hci_get_route(bdaddr_t *dst, bdaddr_t *src)
+struct hci_dev *hci_get_route(bdaddr_t *dst, bdaddr_t *src, uint8_t src_type)
{
int use_src = bacmp(src, BDADDR_ANY);
- struct hci_dev *hdev = NULL;
- struct list_head *p;
-
- BT_DBG("%s -> %s", batostr(src), batostr(dst));
+ struct hci_dev *hdev = NULL, *d;
- read_lock_bh(&hci_dev_list_lock);
+ BT_DBG("%pMR -> %pMR", src, dst);
- list_for_each(p, &hci_dev_list) {
- struct hci_dev *d = list_entry(p, struct hci_dev, list);
+ read_lock(&hci_dev_list_lock);
- if (!test_bit(HCI_UP, &d->flags) || test_bit(HCI_RAW, &d->flags))
+ list_for_each_entry(d, &hci_dev_list, list) {
+ if (!test_bit(HCI_UP, &d->flags) ||
+ hci_dev_test_flag(d, HCI_USER_CHANNEL))
continue;
- /* Simple routing:
+ /* Simple routing:
* No source address - find interface with bdaddr != dst
* Source address - find interface with bdaddr == src
*/
if (use_src) {
- if (!bacmp(&d->bdaddr, src)) {
+ bdaddr_t id_addr;
+ u8 id_addr_type;
+
+ if (src_type == BDADDR_BREDR) {
+ if (!lmp_bredr_capable(d))
+ continue;
+ bacpy(&id_addr, &d->bdaddr);
+ id_addr_type = BDADDR_BREDR;
+ } else {
+ if (!lmp_le_capable(d))
+ continue;
+
+ hci_copy_identity_address(d, &id_addr,
+ &id_addr_type);
+
+ /* Convert from HCI to three-value type */
+ if (id_addr_type == ADDR_LE_DEV_PUBLIC)
+ id_addr_type = BDADDR_LE_PUBLIC;
+ else
+ id_addr_type = BDADDR_LE_RANDOM;
+ }
+
+ if (!bacmp(&id_addr, src) && id_addr_type == src_type) {
hdev = d; break;
}
} else {
@@ -291,216 +1292,1381 @@ struct hci_dev *hci_get_route(bdaddr_t *dst, bdaddr_t *src)
if (hdev)
hdev = hci_dev_hold(hdev);
- read_unlock_bh(&hci_dev_list_lock);
+ read_unlock(&hci_dev_list_lock);
return hdev;
}
EXPORT_SYMBOL(hci_get_route);
-/* Create SCO or ACL connection.
- * Device _must_ be locked */
-struct hci_conn * hci_connect(struct hci_dev *hdev, int type, bdaddr_t *dst)
+/* This function requires the caller holds hdev->lock */
+static void hci_le_conn_failed(struct hci_conn *conn, u8 status)
+{
+ struct hci_dev *hdev = conn->hdev;
+
+ hci_connect_le_scan_cleanup(conn, status);
+
+ /* Enable advertising in case this was a failed connection
+ * attempt as a peripheral.
+ */
+ hci_enable_advertising(hdev);
+}
+
+/* This function requires the caller holds hdev->lock */
+void hci_conn_failed(struct hci_conn *conn, u8 status)
+{
+ struct hci_dev *hdev = conn->hdev;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", status);
+
+ switch (conn->type) {
+ case LE_LINK:
+ hci_le_conn_failed(conn, status);
+ break;
+ case ACL_LINK:
+ mgmt_connect_failed(hdev, conn, status);
+ break;
+ }
+
+ /* In case of BIG/PA sync failed, clear conn flags so that
+ * the conns will be correctly cleaned up by ISO layer
+ */
+ test_and_clear_bit(HCI_CONN_BIG_SYNC_FAILED, &conn->flags);
+ test_and_clear_bit(HCI_CONN_PA_SYNC_FAILED, &conn->flags);
+
+ conn->state = BT_CLOSED;
+ hci_connect_cfm(conn, status);
+ hci_conn_del(conn);
+}
+
+/* This function requires the caller holds hdev->lock */
+u8 hci_conn_set_handle(struct hci_conn *conn, u16 handle)
+{
+ struct hci_dev *hdev = conn->hdev;
+
+ bt_dev_dbg(hdev, "hcon %p handle 0x%4.4x", conn, handle);
+
+ if (conn->handle == handle)
+ return 0;
+
+ if (handle > HCI_CONN_HANDLE_MAX) {
+ bt_dev_err(hdev, "Invalid handle: 0x%4.4x > 0x%4.4x",
+ handle, HCI_CONN_HANDLE_MAX);
+ return HCI_ERROR_INVALID_PARAMETERS;
+ }
+
+ /* If abort_reason has been sent it means the connection is being
+ * aborted and the handle shall not be changed.
+ */
+ if (conn->abort_reason)
+ return conn->abort_reason;
+
+ if (HCI_CONN_HANDLE_UNSET(conn->handle))
+ ida_free(&hdev->unset_handle_ida, conn->handle);
+
+ conn->handle = handle;
+
+ return 0;
+}
+
+struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst,
+ u8 dst_type, bool dst_resolved, u8 sec_level,
+ u16 conn_timeout, u8 role, u8 phy, u8 sec_phy)
+{
+ struct hci_conn *conn;
+ struct smp_irk *irk;
+ int err;
+
+ /* Let's make sure that le is enabled.*/
+ if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED)) {
+ if (lmp_le_capable(hdev))
+ return ERR_PTR(-ECONNREFUSED);
+
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+
+ /* Since the controller supports only one LE connection attempt at a
+ * time, we return -EBUSY if there is any connection attempt running.
+ */
+ if (hci_lookup_le_connect(hdev))
+ return ERR_PTR(-EBUSY);
+
+ /* If there's already a connection object but it's not in
+ * scanning state it means it must already be established, in
+ * which case we can't do anything else except report a failure
+ * to connect.
+ */
+ conn = hci_conn_hash_lookup_le(hdev, dst, dst_type);
+ if (conn && !test_bit(HCI_CONN_SCANNING, &conn->flags)) {
+ return ERR_PTR(-EBUSY);
+ }
+
+ /* Check if the destination address has been resolved by the controller
+ * since if it did then the identity address shall be used.
+ */
+ if (!dst_resolved) {
+ /* When given an identity address with existing identity
+ * resolving key, the connection needs to be established
+ * to a resolvable random address.
+ *
+ * Storing the resolvable random address is required here
+ * to handle connection failures. The address will later
+ * be resolved back into the original identity address
+ * from the connect request.
+ */
+ irk = hci_find_irk_by_addr(hdev, dst, dst_type);
+ if (irk && bacmp(&irk->rpa, BDADDR_ANY)) {
+ dst = &irk->rpa;
+ dst_type = ADDR_LE_DEV_RANDOM;
+ }
+ }
+
+ if (conn) {
+ bacpy(&conn->dst, dst);
+ } else {
+ conn = hci_conn_add_unset(hdev, LE_LINK, dst, dst_type, role);
+ if (IS_ERR(conn))
+ return conn;
+ hci_conn_hold(conn);
+ conn->pending_sec_level = sec_level;
+ }
+
+ conn->sec_level = BT_SECURITY_LOW;
+ conn->conn_timeout = conn_timeout;
+ conn->le_adv_phy = phy;
+ conn->le_adv_sec_phy = sec_phy;
+
+ err = hci_connect_le_sync(hdev, conn);
+ if (err) {
+ hci_conn_del(conn);
+ return ERR_PTR(err);
+ }
+
+ return conn;
+}
+
+static bool is_connected(struct hci_dev *hdev, bdaddr_t *addr, u8 type)
+{
+ struct hci_conn *conn;
+
+ conn = hci_conn_hash_lookup_le(hdev, addr, type);
+ if (!conn)
+ return false;
+
+ if (conn->state != BT_CONNECTED)
+ return false;
+
+ return true;
+}
+
+/* This function requires the caller holds hdev->lock */
+static int hci_explicit_conn_params_set(struct hci_dev *hdev,
+ bdaddr_t *addr, u8 addr_type)
+{
+ struct hci_conn_params *params;
+
+ if (is_connected(hdev, addr, addr_type))
+ return -EISCONN;
+
+ params = hci_conn_params_lookup(hdev, addr, addr_type);
+ if (!params) {
+ params = hci_conn_params_add(hdev, addr, addr_type);
+ if (!params)
+ return -ENOMEM;
+
+ /* If we created new params, mark them to be deleted in
+ * hci_connect_le_scan_cleanup. It's different case than
+ * existing disabled params, those will stay after cleanup.
+ */
+ params->auto_connect = HCI_AUTO_CONN_EXPLICIT;
+ }
+
+ /* We're trying to connect, so make sure params are at pend_le_conns */
+ if (params->auto_connect == HCI_AUTO_CONN_DISABLED ||
+ params->auto_connect == HCI_AUTO_CONN_REPORT ||
+ params->auto_connect == HCI_AUTO_CONN_EXPLICIT) {
+ hci_pend_le_list_del_init(params);
+ hci_pend_le_list_add(params, &hdev->pend_le_conns);
+ }
+
+ params->explicit_connect = true;
+
+ BT_DBG("addr %pMR (type %u) auto_connect %u", addr, addr_type,
+ params->auto_connect);
+
+ return 0;
+}
+
+static int qos_set_big(struct hci_dev *hdev, struct bt_iso_qos *qos)
+{
+ struct hci_conn *conn;
+ u8 big;
+
+ /* Allocate a BIG if not set */
+ if (qos->bcast.big == BT_ISO_QOS_BIG_UNSET) {
+ for (big = 0x00; big < 0xef; big++) {
+
+ conn = hci_conn_hash_lookup_big(hdev, big);
+ if (!conn)
+ break;
+ }
+
+ if (big == 0xef)
+ return -EADDRNOTAVAIL;
+
+ /* Update BIG */
+ qos->bcast.big = big;
+ }
+
+ return 0;
+}
+
+static int qos_set_bis(struct hci_dev *hdev, struct bt_iso_qos *qos)
+{
+ struct hci_conn *conn;
+ u8 bis;
+
+ /* Allocate BIS if not set */
+ if (qos->bcast.bis == BT_ISO_QOS_BIS_UNSET) {
+ if (qos->bcast.big != BT_ISO_QOS_BIG_UNSET) {
+ conn = hci_conn_hash_lookup_big(hdev, qos->bcast.big);
+
+ if (conn) {
+ /* If the BIG handle is already matched to an advertising
+ * handle, do not allocate a new one.
+ */
+ qos->bcast.bis = conn->iso_qos.bcast.bis;
+ return 0;
+ }
+ }
+
+ /* Find an unused adv set to advertise BIS, skip instance 0x00
+ * since it is reserved as general purpose set.
+ */
+ for (bis = 0x01; bis < hdev->le_num_of_adv_sets;
+ bis++) {
+
+ conn = hci_conn_hash_lookup_bis(hdev, BDADDR_ANY, bis);
+ if (!conn)
+ break;
+ }
+
+ if (bis == hdev->le_num_of_adv_sets)
+ return -EADDRNOTAVAIL;
+
+ /* Update BIS */
+ qos->bcast.bis = bis;
+ }
+
+ return 0;
+}
+
+/* This function requires the caller holds hdev->lock */
+static struct hci_conn *hci_add_bis(struct hci_dev *hdev, bdaddr_t *dst,
+ __u8 sid, struct bt_iso_qos *qos,
+ __u8 base_len, __u8 *base, u16 timeout)
+{
+ struct hci_conn *conn;
+ int err;
+
+ /* Let's make sure that le is enabled.*/
+ if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED)) {
+ if (lmp_le_capable(hdev))
+ return ERR_PTR(-ECONNREFUSED);
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+
+ err = qos_set_big(hdev, qos);
+ if (err)
+ return ERR_PTR(err);
+
+ err = qos_set_bis(hdev, qos);
+ if (err)
+ return ERR_PTR(err);
+
+ /* Check if the LE Create BIG command has already been sent */
+ conn = hci_conn_hash_lookup_per_adv_bis(hdev, dst, qos->bcast.big,
+ qos->bcast.big);
+ if (conn)
+ return ERR_PTR(-EADDRINUSE);
+
+ /* Check BIS settings against other bound BISes, since all
+ * BISes in a BIG must have the same value for all parameters
+ */
+ conn = hci_conn_hash_lookup_big(hdev, qos->bcast.big);
+
+ if (conn && (memcmp(qos, &conn->iso_qos, sizeof(*qos)) ||
+ base_len != conn->le_per_adv_data_len ||
+ memcmp(conn->le_per_adv_data, base, base_len)))
+ return ERR_PTR(-EADDRINUSE);
+
+ conn = hci_conn_add_unset(hdev, BIS_LINK, dst, 0, HCI_ROLE_MASTER);
+ if (IS_ERR(conn))
+ return conn;
+
+ conn->state = BT_CONNECT;
+ conn->sid = sid;
+ conn->conn_timeout = timeout;
+
+ hci_conn_hold(conn);
+ return conn;
+}
+
+/* This function requires the caller holds hdev->lock */
+struct hci_conn *hci_connect_le_scan(struct hci_dev *hdev, bdaddr_t *dst,
+ u8 dst_type, u8 sec_level,
+ u16 conn_timeout,
+ enum conn_reasons conn_reason)
+{
+ struct hci_conn *conn;
+
+ /* Let's make sure that le is enabled.*/
+ if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED)) {
+ if (lmp_le_capable(hdev))
+ return ERR_PTR(-ECONNREFUSED);
+
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+
+ /* Some devices send ATT messages as soon as the physical link is
+ * established. To be able to handle these ATT messages, the user-
+ * space first establishes the connection and then starts the pairing
+ * process.
+ *
+ * So if a hci_conn object already exists for the following connection
+ * attempt, we simply update pending_sec_level and auth_type fields
+ * and return the object found.
+ */
+ conn = hci_conn_hash_lookup_le(hdev, dst, dst_type);
+ if (conn) {
+ if (conn->pending_sec_level < sec_level)
+ conn->pending_sec_level = sec_level;
+ goto done;
+ }
+
+ BT_DBG("requesting refresh of dst_addr");
+
+ conn = hci_conn_add_unset(hdev, LE_LINK, dst, dst_type,
+ HCI_ROLE_MASTER);
+ if (IS_ERR(conn))
+ return conn;
+
+ if (hci_explicit_conn_params_set(hdev, dst, dst_type) < 0) {
+ hci_conn_del(conn);
+ return ERR_PTR(-EBUSY);
+ }
+
+ conn->state = BT_CONNECT;
+ set_bit(HCI_CONN_SCANNING, &conn->flags);
+ conn->sec_level = BT_SECURITY_LOW;
+ conn->pending_sec_level = sec_level;
+ conn->conn_timeout = conn_timeout;
+ conn->conn_reason = conn_reason;
+
+ hci_update_passive_scan(hdev);
+
+done:
+ hci_conn_hold(conn);
+ return conn;
+}
+
+struct hci_conn *hci_connect_acl(struct hci_dev *hdev, bdaddr_t *dst,
+ u8 sec_level, u8 auth_type,
+ enum conn_reasons conn_reason, u16 timeout)
{
struct hci_conn *acl;
- BT_DBG("%s dst %s", hdev->name, batostr(dst));
+ if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) {
+ if (lmp_bredr_capable(hdev))
+ return ERR_PTR(-ECONNREFUSED);
- if (!(acl = hci_conn_hash_lookup_ba(hdev, ACL_LINK, dst))) {
- if (!(acl = hci_conn_add(hdev, ACL_LINK, dst)))
- return NULL;
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+
+ /* Reject outgoing connection to device with same BD ADDR against
+ * CVE-2020-26555
+ */
+ if (!bacmp(&hdev->bdaddr, dst)) {
+ bt_dev_dbg(hdev, "Reject connection with same BD_ADDR %pMR\n",
+ dst);
+ return ERR_PTR(-ECONNREFUSED);
+ }
+
+ acl = hci_conn_hash_lookup_ba(hdev, ACL_LINK, dst);
+ if (!acl) {
+ acl = hci_conn_add_unset(hdev, ACL_LINK, dst, 0,
+ HCI_ROLE_MASTER);
+ if (IS_ERR(acl))
+ return acl;
}
hci_conn_hold(acl);
- if (acl->state == BT_OPEN || acl->state == BT_CLOSED)
- hci_acl_connect(acl);
+ acl->conn_reason = conn_reason;
+ if (acl->state == BT_OPEN || acl->state == BT_CLOSED) {
+ int err;
- if (type == SCO_LINK) {
- struct hci_conn *sco;
+ acl->sec_level = BT_SECURITY_LOW;
+ acl->pending_sec_level = sec_level;
+ acl->auth_type = auth_type;
+ acl->conn_timeout = timeout;
- if (!(sco = hci_conn_hash_lookup_ba(hdev, SCO_LINK, dst))) {
- if (!(sco = hci_conn_add(hdev, SCO_LINK, dst))) {
- hci_conn_put(acl);
- return NULL;
- }
+ err = hci_connect_acl_sync(hdev, acl);
+ if (err) {
+ hci_conn_del(acl);
+ return ERR_PTR(err);
}
- acl->link = sco;
- sco->link = acl;
+ }
- hci_conn_hold(sco);
+ return acl;
+}
+
+static struct hci_link *hci_conn_link(struct hci_conn *parent,
+ struct hci_conn *conn)
+{
+ struct hci_dev *hdev = parent->hdev;
+ struct hci_link *link;
- if (acl->state == BT_CONNECTED &&
- (sco->state == BT_OPEN || sco->state == BT_CLOSED))
- hci_add_sco(sco, acl->handle);
+ bt_dev_dbg(hdev, "parent %p hcon %p", parent, conn);
- return sco;
- } else {
+ if (conn->link)
+ return conn->link;
+
+ if (conn->parent)
+ return NULL;
+
+ link = kzalloc(sizeof(*link), GFP_KERNEL);
+ if (!link)
+ return NULL;
+
+ link->conn = hci_conn_hold(conn);
+ conn->link = link;
+ conn->parent = hci_conn_get(parent);
+
+ /* Use list_add_tail_rcu append to the list */
+ list_add_tail_rcu(&link->list, &parent->link_list);
+
+ return link;
+}
+
+struct hci_conn *hci_connect_sco(struct hci_dev *hdev, int type, bdaddr_t *dst,
+ __u16 setting, struct bt_codec *codec,
+ u16 timeout)
+{
+ struct hci_conn *acl;
+ struct hci_conn *sco;
+ struct hci_link *link;
+
+ acl = hci_connect_acl(hdev, dst, BT_SECURITY_LOW, HCI_AT_NO_BONDING,
+ CONN_REASON_SCO_CONNECT, timeout);
+ if (IS_ERR(acl))
return acl;
+
+ sco = hci_conn_hash_lookup_ba(hdev, type, dst);
+ if (!sco) {
+ sco = hci_conn_add_unset(hdev, type, dst, 0, HCI_ROLE_MASTER);
+ if (IS_ERR(sco)) {
+ hci_conn_drop(acl);
+ return sco;
+ }
+ }
+
+ link = hci_conn_link(acl, sco);
+ if (!link) {
+ hci_conn_drop(acl);
+ hci_conn_drop(sco);
+ return ERR_PTR(-ENOLINK);
+ }
+
+ sco->setting = setting;
+ sco->codec = *codec;
+
+ if (acl->state == BT_CONNECTED &&
+ (sco->state == BT_OPEN || sco->state == BT_CLOSED)) {
+ set_bit(HCI_CONN_POWER_SAVE, &acl->flags);
+ hci_conn_enter_active_mode(acl, BT_POWER_FORCE_ACTIVE_ON);
+
+ if (test_bit(HCI_CONN_MODE_CHANGE_PEND, &acl->flags)) {
+ /* defer SCO setup until mode change completed */
+ set_bit(HCI_CONN_SCO_SETUP_PEND, &acl->flags);
+ return sco;
+ }
+
+ hci_sco_setup(acl, 0x00);
+ }
+
+ return sco;
+}
+
+static int hci_le_create_big(struct hci_conn *conn, struct bt_iso_qos *qos)
+{
+ struct hci_dev *hdev = conn->hdev;
+ struct hci_cp_le_create_big cp;
+ struct iso_list_data data;
+
+ memset(&cp, 0, sizeof(cp));
+
+ data.big = qos->bcast.big;
+ data.bis = qos->bcast.bis;
+ data.count = 0;
+
+ /* Create a BIS for each bound connection */
+ hci_conn_hash_list_state(hdev, bis_list, BIS_LINK,
+ BT_BOUND, &data);
+
+ cp.handle = qos->bcast.big;
+ cp.adv_handle = qos->bcast.bis;
+ cp.num_bis = data.count;
+ hci_cpu_to_le24(qos->bcast.out.interval, cp.bis.sdu_interval);
+ cp.bis.sdu = cpu_to_le16(qos->bcast.out.sdu);
+ cp.bis.latency = cpu_to_le16(qos->bcast.out.latency);
+ cp.bis.rtn = qos->bcast.out.rtn;
+ cp.bis.phy = qos->bcast.out.phy;
+ cp.bis.packing = qos->bcast.packing;
+ cp.bis.framing = qos->bcast.framing;
+ cp.bis.encryption = qos->bcast.encryption;
+ memcpy(cp.bis.bcode, qos->bcast.bcode, sizeof(cp.bis.bcode));
+
+ return hci_send_cmd(hdev, HCI_OP_LE_CREATE_BIG, sizeof(cp), &cp);
+}
+
+static int set_cig_params_sync(struct hci_dev *hdev, void *data)
+{
+ DEFINE_FLEX(struct hci_cp_le_set_cig_params, pdu, cis, num_cis, 0x1f);
+ u8 cig_id = PTR_UINT(data);
+ struct hci_conn *conn;
+ struct bt_iso_qos *qos;
+ u8 aux_num_cis = 0;
+ u8 cis_id;
+
+ conn = hci_conn_hash_lookup_cig(hdev, cig_id);
+ if (!conn)
+ return 0;
+
+ qos = &conn->iso_qos;
+ pdu->cig_id = cig_id;
+ hci_cpu_to_le24(qos->ucast.out.interval, pdu->c_interval);
+ hci_cpu_to_le24(qos->ucast.in.interval, pdu->p_interval);
+ pdu->sca = qos->ucast.sca;
+ pdu->packing = qos->ucast.packing;
+ pdu->framing = qos->ucast.framing;
+ pdu->c_latency = cpu_to_le16(qos->ucast.out.latency);
+ pdu->p_latency = cpu_to_le16(qos->ucast.in.latency);
+
+ /* Reprogram all CIS(s) with the same CIG, valid range are:
+ * num_cis: 0x00 to 0x1F
+ * cis_id: 0x00 to 0xEF
+ */
+ for (cis_id = 0x00; cis_id < 0xf0 &&
+ aux_num_cis < pdu->num_cis; cis_id++) {
+ struct hci_cis_params *cis;
+
+ conn = hci_conn_hash_lookup_cis(hdev, NULL, 0, cig_id, cis_id);
+ if (!conn)
+ continue;
+
+ qos = &conn->iso_qos;
+
+ cis = &pdu->cis[aux_num_cis++];
+ cis->cis_id = cis_id;
+ cis->c_sdu = cpu_to_le16(conn->iso_qos.ucast.out.sdu);
+ cis->p_sdu = cpu_to_le16(conn->iso_qos.ucast.in.sdu);
+ cis->c_phy = qos->ucast.out.phy ? qos->ucast.out.phy :
+ qos->ucast.in.phy;
+ cis->p_phy = qos->ucast.in.phy ? qos->ucast.in.phy :
+ qos->ucast.out.phy;
+ cis->c_rtn = qos->ucast.out.rtn;
+ cis->p_rtn = qos->ucast.in.rtn;
+ }
+ pdu->num_cis = aux_num_cis;
+
+ if (!pdu->num_cis)
+ return 0;
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_CIG_PARAMS,
+ struct_size(pdu, cis, pdu->num_cis),
+ pdu, HCI_CMD_TIMEOUT);
+}
+
+static bool hci_le_set_cig_params(struct hci_conn *conn, struct bt_iso_qos *qos)
+{
+ struct hci_dev *hdev = conn->hdev;
+ struct iso_list_data data;
+
+ memset(&data, 0, sizeof(data));
+
+ /* Allocate first still reconfigurable CIG if not set */
+ if (qos->ucast.cig == BT_ISO_QOS_CIG_UNSET) {
+ for (data.cig = 0x00; data.cig < 0xf0; data.cig++) {
+ data.count = 0;
+
+ hci_conn_hash_list_state(hdev, find_cis, CIS_LINK,
+ BT_CONNECT, &data);
+ if (data.count)
+ continue;
+
+ hci_conn_hash_list_state(hdev, find_cis, CIS_LINK,
+ BT_CONNECTED, &data);
+ if (!data.count)
+ break;
+ }
+
+ if (data.cig == 0xf0)
+ return false;
+
+ /* Update CIG */
+ qos->ucast.cig = data.cig;
+ }
+
+ if (qos->ucast.cis != BT_ISO_QOS_CIS_UNSET) {
+ if (hci_conn_hash_lookup_cis(hdev, NULL, 0, qos->ucast.cig,
+ qos->ucast.cis))
+ return false;
+ goto done;
+ }
+
+ /* Allocate first available CIS if not set */
+ for (data.cig = qos->ucast.cig, data.cis = 0x00; data.cis < 0xf0;
+ data.cis++) {
+ if (!hci_conn_hash_lookup_cis(hdev, NULL, 0, data.cig,
+ data.cis)) {
+ /* Update CIS */
+ qos->ucast.cis = data.cis;
+ break;
+ }
+ }
+
+ if (qos->ucast.cis == BT_ISO_QOS_CIS_UNSET)
+ return false;
+
+done:
+ if (hci_cmd_sync_queue(hdev, set_cig_params_sync,
+ UINT_PTR(qos->ucast.cig), NULL) < 0)
+ return false;
+
+ return true;
+}
+
+struct hci_conn *hci_bind_cis(struct hci_dev *hdev, bdaddr_t *dst,
+ __u8 dst_type, struct bt_iso_qos *qos,
+ u16 timeout)
+{
+ struct hci_conn *cis;
+
+ cis = hci_conn_hash_lookup_cis(hdev, dst, dst_type, qos->ucast.cig,
+ qos->ucast.cis);
+ if (!cis) {
+ cis = hci_conn_add_unset(hdev, CIS_LINK, dst, dst_type,
+ HCI_ROLE_MASTER);
+ if (IS_ERR(cis))
+ return cis;
+ cis->cleanup = cis_cleanup;
+ cis->dst_type = dst_type;
+ cis->iso_qos.ucast.cig = BT_ISO_QOS_CIG_UNSET;
+ cis->iso_qos.ucast.cis = BT_ISO_QOS_CIS_UNSET;
+ cis->conn_timeout = timeout;
+ }
+
+ if (cis->state == BT_CONNECTED)
+ return cis;
+
+ /* Check if CIS has been set and the settings matches */
+ if (cis->state == BT_BOUND &&
+ !memcmp(&cis->iso_qos, qos, sizeof(*qos)))
+ return cis;
+
+ /* Update LINK PHYs according to QoS preference */
+ cis->le_tx_phy = qos->ucast.out.phy;
+ cis->le_rx_phy = qos->ucast.in.phy;
+
+ /* If output interval is not set use the input interval as it cannot be
+ * 0x000000.
+ */
+ if (!qos->ucast.out.interval)
+ qos->ucast.out.interval = qos->ucast.in.interval;
+
+ /* If input interval is not set use the output interval as it cannot be
+ * 0x000000.
+ */
+ if (!qos->ucast.in.interval)
+ qos->ucast.in.interval = qos->ucast.out.interval;
+
+ /* If output latency is not set use the input latency as it cannot be
+ * 0x0000.
+ */
+ if (!qos->ucast.out.latency)
+ qos->ucast.out.latency = qos->ucast.in.latency;
+
+ /* If input latency is not set use the output latency as it cannot be
+ * 0x0000.
+ */
+ if (!qos->ucast.in.latency)
+ qos->ucast.in.latency = qos->ucast.out.latency;
+
+ if (!hci_le_set_cig_params(cis, qos)) {
+ hci_conn_drop(cis);
+ return ERR_PTR(-EINVAL);
+ }
+
+ hci_conn_hold(cis);
+
+ cis->iso_qos = *qos;
+ cis->state = BT_BOUND;
+
+ return cis;
+}
+
+bool hci_iso_setup_path(struct hci_conn *conn)
+{
+ struct hci_dev *hdev = conn->hdev;
+ struct hci_cp_le_setup_iso_path cmd;
+
+ memset(&cmd, 0, sizeof(cmd));
+
+ if (conn->iso_qos.ucast.out.sdu) {
+ cmd.handle = cpu_to_le16(conn->handle);
+ cmd.direction = 0x00; /* Input (Host to Controller) */
+ cmd.path = 0x00; /* HCI path if enabled */
+ cmd.codec = 0x03; /* Transparent Data */
+
+ if (hci_send_cmd(hdev, HCI_OP_LE_SETUP_ISO_PATH, sizeof(cmd),
+ &cmd) < 0)
+ return false;
+ }
+
+ if (conn->iso_qos.ucast.in.sdu) {
+ cmd.handle = cpu_to_le16(conn->handle);
+ cmd.direction = 0x01; /* Output (Controller to Host) */
+ cmd.path = 0x00; /* HCI path if enabled */
+ cmd.codec = 0x03; /* Transparent Data */
+
+ if (hci_send_cmd(hdev, HCI_OP_LE_SETUP_ISO_PATH, sizeof(cmd),
+ &cmd) < 0)
+ return false;
+ }
+
+ return true;
+}
+
+int hci_conn_check_create_cis(struct hci_conn *conn)
+{
+ if (conn->type != CIS_LINK)
+ return -EINVAL;
+
+ if (!conn->parent || conn->parent->state != BT_CONNECTED ||
+ conn->state != BT_CONNECT || HCI_CONN_HANDLE_UNSET(conn->handle))
+ return 1;
+
+ return 0;
+}
+
+static int hci_create_cis_sync(struct hci_dev *hdev, void *data)
+{
+ return hci_le_create_cis_sync(hdev);
+}
+
+int hci_le_create_cis_pending(struct hci_dev *hdev)
+{
+ struct hci_conn *conn;
+ bool pending = false;
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(conn, &hdev->conn_hash.list, list) {
+ if (test_bit(HCI_CONN_CREATE_CIS, &conn->flags)) {
+ rcu_read_unlock();
+ return -EBUSY;
+ }
+
+ if (!hci_conn_check_create_cis(conn))
+ pending = true;
+ }
+
+ rcu_read_unlock();
+
+ if (!pending)
+ return 0;
+
+ /* Queue Create CIS */
+ return hci_cmd_sync_queue(hdev, hci_create_cis_sync, NULL, NULL);
+}
+
+static void hci_iso_qos_setup(struct hci_dev *hdev, struct hci_conn *conn,
+ struct bt_iso_io_qos *qos, __u8 phy)
+{
+ /* Only set MTU if PHY is enabled */
+ if (!qos->sdu && qos->phy)
+ qos->sdu = conn->mtu;
+
+ /* Use the same PHY as ACL if set to any */
+ if (qos->phy == BT_ISO_PHY_ANY)
+ qos->phy = phy;
+
+ /* Use LE ACL connection interval if not set */
+ if (!qos->interval)
+ /* ACL interval unit in 1.25 ms to us */
+ qos->interval = conn->le_conn_interval * 1250;
+
+ /* Use LE ACL connection latency if not set */
+ if (!qos->latency)
+ qos->latency = conn->le_conn_latency;
+}
+
+static int create_big_sync(struct hci_dev *hdev, void *data)
+{
+ struct hci_conn *conn = data;
+ struct bt_iso_qos *qos = &conn->iso_qos;
+ u16 interval, sync_interval = 0;
+ u32 flags = 0;
+ int err;
+
+ if (qos->bcast.out.phy == 0x02)
+ flags |= MGMT_ADV_FLAG_SEC_2M;
+
+ /* Align intervals */
+ interval = (qos->bcast.out.interval / 1250) * qos->bcast.sync_factor;
+
+ if (qos->bcast.bis)
+ sync_interval = interval * 4;
+
+ err = hci_start_per_adv_sync(hdev, qos->bcast.bis, conn->sid,
+ conn->le_per_adv_data_len,
+ conn->le_per_adv_data, flags, interval,
+ interval, sync_interval);
+ if (err)
+ return err;
+
+ return hci_le_create_big(conn, &conn->iso_qos);
+}
+
+struct hci_conn *hci_pa_create_sync(struct hci_dev *hdev, bdaddr_t *dst,
+ __u8 dst_type, __u8 sid,
+ struct bt_iso_qos *qos)
+{
+ struct hci_conn *conn;
+
+ bt_dev_dbg(hdev, "dst %pMR type %d sid %d", dst, dst_type, sid);
+
+ conn = hci_conn_add_unset(hdev, PA_LINK, dst, dst_type, HCI_ROLE_SLAVE);
+ if (IS_ERR(conn))
+ return conn;
+
+ conn->iso_qos = *qos;
+ conn->sid = sid;
+ conn->state = BT_LISTEN;
+ conn->conn_timeout = msecs_to_jiffies(qos->bcast.sync_timeout * 10);
+
+ hci_conn_hold(conn);
+
+ hci_connect_pa_sync(hdev, conn);
+
+ return conn;
+}
+
+int hci_conn_big_create_sync(struct hci_dev *hdev, struct hci_conn *hcon,
+ struct bt_iso_qos *qos, __u16 sync_handle,
+ __u8 num_bis, __u8 bis[])
+{
+ int err;
+
+ if (num_bis < 0x01 || num_bis > ISO_MAX_NUM_BIS)
+ return -EINVAL;
+
+ err = qos_set_big(hdev, qos);
+ if (err)
+ return err;
+
+ if (hcon) {
+ /* Update hcon QoS */
+ hcon->iso_qos = *qos;
+
+ hcon->num_bis = num_bis;
+ memcpy(hcon->bis, bis, num_bis);
+ hcon->conn_timeout = msecs_to_jiffies(qos->bcast.timeout * 10);
}
+
+ return hci_connect_big_sync(hdev, hcon);
+}
+
+static void create_big_complete(struct hci_dev *hdev, void *data, int err)
+{
+ struct hci_conn *conn = data;
+
+ bt_dev_dbg(hdev, "conn %p", conn);
+
+ if (err) {
+ bt_dev_err(hdev, "Unable to create BIG: %d", err);
+ hci_connect_cfm(conn, err);
+ hci_conn_del(conn);
+ }
+}
+
+struct hci_conn *hci_bind_bis(struct hci_dev *hdev, bdaddr_t *dst, __u8 sid,
+ struct bt_iso_qos *qos,
+ __u8 base_len, __u8 *base, u16 timeout)
+{
+ struct hci_conn *conn;
+ struct hci_conn *parent;
+ __u8 eir[HCI_MAX_PER_AD_LENGTH];
+ struct hci_link *link;
+
+ /* Look for any BIS that is open for rebinding */
+ conn = hci_conn_hash_lookup_big_state(hdev, qos->bcast.big, BT_OPEN,
+ HCI_ROLE_MASTER);
+ if (conn) {
+ memcpy(qos, &conn->iso_qos, sizeof(*qos));
+ conn->state = BT_CONNECTED;
+ return conn;
+ }
+
+ if (base_len && base)
+ base_len = eir_append_service_data(eir, 0, 0x1851,
+ base, base_len);
+
+ /* We need hci_conn object using the BDADDR_ANY as dst */
+ conn = hci_add_bis(hdev, dst, sid, qos, base_len, eir, timeout);
+ if (IS_ERR(conn))
+ return conn;
+
+ /* Update LINK PHYs according to QoS preference */
+ conn->le_tx_phy = qos->bcast.out.phy;
+ conn->le_tx_phy = qos->bcast.out.phy;
+
+ /* Add Basic Announcement into Peridic Adv Data if BASE is set */
+ if (base_len && base) {
+ memcpy(conn->le_per_adv_data, eir, sizeof(eir));
+ conn->le_per_adv_data_len = base_len;
+ }
+
+ hci_iso_qos_setup(hdev, conn, &qos->bcast.out,
+ conn->le_tx_phy ? conn->le_tx_phy :
+ hdev->le_tx_def_phys);
+
+ conn->iso_qos = *qos;
+ conn->state = BT_BOUND;
+
+ /* Link BISes together */
+ parent = hci_conn_hash_lookup_big(hdev,
+ conn->iso_qos.bcast.big);
+ if (parent && parent != conn) {
+ link = hci_conn_link(parent, conn);
+ hci_conn_drop(conn);
+ if (!link)
+ return ERR_PTR(-ENOLINK);
+ }
+
+ return conn;
+}
+
+int hci_past_bis(struct hci_conn *conn, bdaddr_t *dst, __u8 dst_type)
+{
+ struct hci_conn *le;
+
+ /* Lookup existing LE connection to rebind to */
+ le = hci_conn_hash_lookup_le(conn->hdev, dst, dst_type);
+ if (!le)
+ return -EINVAL;
+
+ return hci_past_sync(conn, le);
+}
+
+static void bis_mark_per_adv(struct hci_conn *conn, void *data)
+{
+ struct iso_list_data *d = data;
+
+ /* Skip if not broadcast/ANY address */
+ if (bacmp(&conn->dst, BDADDR_ANY))
+ return;
+
+ if (d->big != conn->iso_qos.bcast.big ||
+ d->bis == BT_ISO_QOS_BIS_UNSET ||
+ d->bis != conn->iso_qos.bcast.bis)
+ return;
+
+ set_bit(HCI_CONN_PER_ADV, &conn->flags);
+}
+
+struct hci_conn *hci_connect_bis(struct hci_dev *hdev, bdaddr_t *dst,
+ __u8 dst_type, __u8 sid,
+ struct bt_iso_qos *qos,
+ __u8 base_len, __u8 *base, u16 timeout)
+{
+ struct hci_conn *conn;
+ int err;
+ struct iso_list_data data;
+
+ conn = hci_bind_bis(hdev, dst, sid, qos, base_len, base, timeout);
+ if (IS_ERR(conn))
+ return conn;
+
+ if (conn->state == BT_CONNECTED)
+ return conn;
+
+ /* Check if SID needs to be allocated then search for the first
+ * available.
+ */
+ if (conn->sid == HCI_SID_INVALID) {
+ u8 sid;
+
+ for (sid = 0; sid <= 0x0f; sid++) {
+ if (!hci_find_adv_sid(hdev, sid)) {
+ conn->sid = sid;
+ break;
+ }
+ }
+ }
+
+ data.big = qos->bcast.big;
+ data.bis = qos->bcast.bis;
+
+ /* Set HCI_CONN_PER_ADV for all bound connections, to mark that
+ * the start periodic advertising and create BIG commands have
+ * been queued
+ */
+ hci_conn_hash_list_state(hdev, bis_mark_per_adv, BIS_LINK,
+ BT_BOUND, &data);
+
+ /* Queue start periodic advertising and create BIG */
+ err = hci_cmd_sync_queue(hdev, create_big_sync, conn,
+ create_big_complete);
+ if (err < 0) {
+ hci_conn_drop(conn);
+ return ERR_PTR(err);
+ }
+
+ return conn;
+}
+
+struct hci_conn *hci_connect_cis(struct hci_dev *hdev, bdaddr_t *dst,
+ __u8 dst_type, struct bt_iso_qos *qos,
+ u16 timeout)
+{
+ struct hci_conn *le;
+ struct hci_conn *cis;
+ struct hci_link *link;
+
+ if (hci_dev_test_flag(hdev, HCI_ADVERTISING))
+ le = hci_connect_le(hdev, dst, dst_type, false,
+ BT_SECURITY_LOW,
+ HCI_LE_CONN_TIMEOUT,
+ HCI_ROLE_SLAVE, 0, 0);
+ else
+ le = hci_connect_le_scan(hdev, dst, dst_type,
+ BT_SECURITY_LOW,
+ HCI_LE_CONN_TIMEOUT,
+ CONN_REASON_ISO_CONNECT);
+ if (IS_ERR(le))
+ return le;
+
+ hci_iso_qos_setup(hdev, le, &qos->ucast.out,
+ le->le_tx_phy ? le->le_tx_phy : hdev->le_tx_def_phys);
+ hci_iso_qos_setup(hdev, le, &qos->ucast.in,
+ le->le_rx_phy ? le->le_rx_phy : hdev->le_rx_def_phys);
+
+ cis = hci_bind_cis(hdev, dst, dst_type, qos, timeout);
+ if (IS_ERR(cis)) {
+ hci_conn_drop(le);
+ return cis;
+ }
+
+ link = hci_conn_link(le, cis);
+ hci_conn_drop(cis);
+ if (!link) {
+ hci_conn_drop(le);
+ return ERR_PTR(-ENOLINK);
+ }
+
+ cis->state = BT_CONNECT;
+
+ hci_le_create_cis_pending(hdev);
+
+ return cis;
+}
+
+/* Check link security requirement */
+int hci_conn_check_link_mode(struct hci_conn *conn)
+{
+ BT_DBG("hcon %p", conn);
+
+ /* In Secure Connections Only mode, it is required that Secure
+ * Connections is used and the link is encrypted with AES-CCM
+ * using a P-256 authenticated combination key.
+ */
+ if (hci_dev_test_flag(conn->hdev, HCI_SC_ONLY)) {
+ if (!hci_conn_sc_enabled(conn) ||
+ !test_bit(HCI_CONN_AES_CCM, &conn->flags) ||
+ conn->key_type != HCI_LK_AUTH_COMBINATION_P256)
+ return 0;
+ }
+
+ /* AES encryption is required for Level 4:
+ *
+ * BLUETOOTH CORE SPECIFICATION Version 5.2 | Vol 3, Part C
+ * page 1319:
+ *
+ * 128-bit equivalent strength for link and encryption keys
+ * required using FIPS approved algorithms (E0 not allowed,
+ * SAFER+ not allowed, and P-192 not allowed; encryption key
+ * not shortened)
+ */
+ if (conn->sec_level == BT_SECURITY_FIPS &&
+ !test_bit(HCI_CONN_AES_CCM, &conn->flags)) {
+ bt_dev_err(conn->hdev,
+ "Invalid security: Missing AES-CCM usage");
+ return 0;
+ }
+
+ if (hci_conn_ssp_enabled(conn) &&
+ !test_bit(HCI_CONN_ENCRYPT, &conn->flags))
+ return 0;
+
+ return 1;
}
-EXPORT_SYMBOL(hci_connect);
/* Authenticate remote device */
-int hci_conn_auth(struct hci_conn *conn)
+static int hci_conn_auth(struct hci_conn *conn, __u8 sec_level, __u8 auth_type)
{
- BT_DBG("conn %p", conn);
+ BT_DBG("hcon %p", conn);
- if (conn->link_mode & HCI_LM_AUTH)
+ if (conn->pending_sec_level > sec_level)
+ sec_level = conn->pending_sec_level;
+
+ if (sec_level > conn->sec_level)
+ conn->pending_sec_level = sec_level;
+ else if (test_bit(HCI_CONN_AUTH, &conn->flags))
return 1;
- if (!test_and_set_bit(HCI_CONN_AUTH_PEND, &conn->pend)) {
+ /* Make sure we preserve an existing MITM requirement*/
+ auth_type |= (conn->auth_type & 0x01);
+
+ conn->auth_type = auth_type;
+
+ if (!test_and_set_bit(HCI_CONN_AUTH_PEND, &conn->flags)) {
struct hci_cp_auth_requested cp;
- cp.handle = __cpu_to_le16(conn->handle);
- hci_send_cmd(conn->hdev, OGF_LINK_CTL, OCF_AUTH_REQUESTED, sizeof(cp), &cp);
+
+ cp.handle = cpu_to_le16(conn->handle);
+ hci_send_cmd(conn->hdev, HCI_OP_AUTH_REQUESTED,
+ sizeof(cp), &cp);
+
+ /* Set the ENCRYPT_PEND to trigger encryption after
+ * authentication.
+ */
+ if (!test_bit(HCI_CONN_ENCRYPT, &conn->flags))
+ set_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags);
}
+
return 0;
}
-EXPORT_SYMBOL(hci_conn_auth);
-/* Enable encryption */
-int hci_conn_encrypt(struct hci_conn *conn)
+/* Encrypt the link */
+static void hci_conn_encrypt(struct hci_conn *conn)
+{
+ BT_DBG("hcon %p", conn);
+
+ if (!test_and_set_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags)) {
+ struct hci_cp_set_conn_encrypt cp;
+ cp.handle = cpu_to_le16(conn->handle);
+ cp.encrypt = 0x01;
+ hci_send_cmd(conn->hdev, HCI_OP_SET_CONN_ENCRYPT, sizeof(cp),
+ &cp);
+ }
+}
+
+/* Enable security */
+int hci_conn_security(struct hci_conn *conn, __u8 sec_level, __u8 auth_type,
+ bool initiator)
{
- BT_DBG("conn %p", conn);
+ BT_DBG("hcon %p", conn);
+
+ if (conn->type == LE_LINK)
+ return smp_conn_security(conn, sec_level);
- if (conn->link_mode & HCI_LM_ENCRYPT)
+ /* For sdp we don't need the link key. */
+ if (sec_level == BT_SECURITY_SDP)
return 1;
- if (test_and_set_bit(HCI_CONN_ENCRYPT_PEND, &conn->pend))
+ /* For non 2.1 devices and low security level we don't need the link
+ key. */
+ if (sec_level == BT_SECURITY_LOW && !hci_conn_ssp_enabled(conn))
+ return 1;
+
+ /* For other security levels we need the link key. */
+ if (!test_bit(HCI_CONN_AUTH, &conn->flags))
+ goto auth;
+
+ switch (conn->key_type) {
+ case HCI_LK_AUTH_COMBINATION_P256:
+ /* An authenticated FIPS approved combination key has
+ * sufficient security for security level 4 or lower.
+ */
+ if (sec_level <= BT_SECURITY_FIPS)
+ goto encrypt;
+ break;
+ case HCI_LK_AUTH_COMBINATION_P192:
+ /* An authenticated combination key has sufficient security for
+ * security level 3 or lower.
+ */
+ if (sec_level <= BT_SECURITY_HIGH)
+ goto encrypt;
+ break;
+ case HCI_LK_UNAUTH_COMBINATION_P192:
+ case HCI_LK_UNAUTH_COMBINATION_P256:
+ /* An unauthenticated combination key has sufficient security
+ * for security level 2 or lower.
+ */
+ if (sec_level <= BT_SECURITY_MEDIUM)
+ goto encrypt;
+ break;
+ case HCI_LK_COMBINATION:
+ /* A combination key has always sufficient security for the
+ * security levels 2 or lower. High security level requires the
+ * combination key is generated using maximum PIN code length
+ * (16). For pre 2.1 units.
+ */
+ if (sec_level <= BT_SECURITY_MEDIUM || conn->pin_length == 16)
+ goto encrypt;
+ break;
+ default:
+ break;
+ }
+
+auth:
+ if (test_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags))
return 0;
- if (hci_conn_auth(conn)) {
- struct hci_cp_set_conn_encrypt cp;
- cp.handle = __cpu_to_le16(conn->handle);
- cp.encrypt = 1;
- hci_send_cmd(conn->hdev, OGF_LINK_CTL, OCF_SET_CONN_ENCRYPT, sizeof(cp), &cp);
+ if (initiator)
+ set_bit(HCI_CONN_AUTH_INITIATOR, &conn->flags);
+
+ if (!hci_conn_auth(conn, sec_level, auth_type))
+ return 0;
+
+encrypt:
+ if (test_bit(HCI_CONN_ENCRYPT, &conn->flags)) {
+ /* Ensure that the encryption key size has been read,
+ * otherwise stall the upper layer responses.
+ */
+ if (!conn->enc_key_size)
+ return 0;
+
+ /* Nothing else needed, all requirements are met */
+ return 1;
}
+
+ hci_conn_encrypt(conn);
return 0;
}
-EXPORT_SYMBOL(hci_conn_encrypt);
+EXPORT_SYMBOL(hci_conn_security);
-/* Change link key */
-int hci_conn_change_link_key(struct hci_conn *conn)
+/* Check secure link requirement */
+int hci_conn_check_secure(struct hci_conn *conn, __u8 sec_level)
{
- BT_DBG("conn %p", conn);
+ BT_DBG("hcon %p", conn);
- if (!test_and_set_bit(HCI_CONN_AUTH_PEND, &conn->pend)) {
- struct hci_cp_change_conn_link_key cp;
- cp.handle = __cpu_to_le16(conn->handle);
- hci_send_cmd(conn->hdev, OGF_LINK_CTL, OCF_CHANGE_CONN_LINK_KEY, sizeof(cp), &cp);
- }
+ /* Accept if non-secure or higher security level is required */
+ if (sec_level != BT_SECURITY_HIGH && sec_level != BT_SECURITY_FIPS)
+ return 1;
+
+ /* Accept if secure or higher security level is already present */
+ if (conn->sec_level == BT_SECURITY_HIGH ||
+ conn->sec_level == BT_SECURITY_FIPS)
+ return 1;
+
+ /* Reject not secure link */
return 0;
}
-EXPORT_SYMBOL(hci_conn_change_link_key);
+EXPORT_SYMBOL(hci_conn_check_secure);
/* Switch role */
-int hci_conn_switch_role(struct hci_conn *conn, uint8_t role)
+int hci_conn_switch_role(struct hci_conn *conn, __u8 role)
{
- BT_DBG("conn %p", conn);
+ BT_DBG("hcon %p", conn);
- if (!role && conn->link_mode & HCI_LM_MASTER)
+ if (role == conn->role)
return 1;
- if (!test_and_set_bit(HCI_CONN_RSWITCH_PEND, &conn->pend)) {
+ if (!test_and_set_bit(HCI_CONN_RSWITCH_PEND, &conn->flags)) {
struct hci_cp_switch_role cp;
bacpy(&cp.bdaddr, &conn->dst);
cp.role = role;
- hci_send_cmd(conn->hdev, OGF_LINK_POLICY, OCF_SWITCH_ROLE, sizeof(cp), &cp);
+ hci_send_cmd(conn->hdev, HCI_OP_SWITCH_ROLE, sizeof(cp), &cp);
}
+
return 0;
}
EXPORT_SYMBOL(hci_conn_switch_role);
/* Enter active mode */
-void hci_conn_enter_active_mode(struct hci_conn *conn)
+void hci_conn_enter_active_mode(struct hci_conn *conn, __u8 force_active)
{
struct hci_dev *hdev = conn->hdev;
- BT_DBG("conn %p mode %d", conn, conn->mode);
+ BT_DBG("hcon %p mode %d", conn, conn->mode);
- if (test_bit(HCI_RAW, &hdev->flags))
- return;
+ if (conn->mode != HCI_CM_SNIFF)
+ goto timer;
- if (conn->mode != HCI_CM_SNIFF || !conn->power_save)
+ if (!test_bit(HCI_CONN_POWER_SAVE, &conn->flags) && !force_active)
goto timer;
- if (!test_and_set_bit(HCI_CONN_MODE_CHANGE_PEND, &conn->pend)) {
+ if (!test_and_set_bit(HCI_CONN_MODE_CHANGE_PEND, &conn->flags)) {
struct hci_cp_exit_sniff_mode cp;
- cp.handle = __cpu_to_le16(conn->handle);
- hci_send_cmd(hdev, OGF_LINK_POLICY,
- OCF_EXIT_SNIFF_MODE, sizeof(cp), &cp);
+ cp.handle = cpu_to_le16(conn->handle);
+ hci_send_cmd(hdev, HCI_OP_EXIT_SNIFF_MODE, sizeof(cp), &cp);
}
timer:
if (hdev->idle_timeout > 0)
- mod_timer(&conn->idle_timer,
- jiffies + msecs_to_jiffies(hdev->idle_timeout));
+ queue_delayed_work(hdev->workqueue, &conn->idle_work,
+ msecs_to_jiffies(hdev->idle_timeout));
}
-/* Enter sniff mode */
-void hci_conn_enter_sniff_mode(struct hci_conn *conn)
+/* Drop all connection on the device */
+void hci_conn_hash_flush(struct hci_dev *hdev)
{
- struct hci_dev *hdev = conn->hdev;
-
- BT_DBG("conn %p mode %d", conn, conn->mode);
-
- if (test_bit(HCI_RAW, &hdev->flags))
- return;
-
- if (!lmp_sniff_capable(hdev) || !lmp_sniff_capable(conn))
- return;
-
- if (conn->mode != HCI_CM_ACTIVE || !(conn->link_policy & HCI_LP_SNIFF))
- return;
+ struct list_head *head = &hdev->conn_hash.list;
+ struct hci_conn *conn;
- if (lmp_sniffsubr_capable(hdev) && lmp_sniffsubr_capable(conn)) {
- struct hci_cp_sniff_subrate cp;
- cp.handle = __cpu_to_le16(conn->handle);
- cp.max_latency = __constant_cpu_to_le16(0);
- cp.min_remote_timeout = __constant_cpu_to_le16(0);
- cp.min_local_timeout = __constant_cpu_to_le16(0);
- hci_send_cmd(hdev, OGF_LINK_POLICY,
- OCF_SNIFF_SUBRATE, sizeof(cp), &cp);
- }
+ BT_DBG("hdev %s", hdev->name);
- if (!test_and_set_bit(HCI_CONN_MODE_CHANGE_PEND, &conn->pend)) {
- struct hci_cp_sniff_mode cp;
- cp.handle = __cpu_to_le16(conn->handle);
- cp.max_interval = __cpu_to_le16(hdev->sniff_max_interval);
- cp.min_interval = __cpu_to_le16(hdev->sniff_min_interval);
- cp.attempt = __constant_cpu_to_le16(4);
- cp.timeout = __constant_cpu_to_le16(1);
- hci_send_cmd(hdev, OGF_LINK_POLICY,
- OCF_SNIFF_MODE, sizeof(cp), &cp);
+ /* We should not traverse the list here, because hci_conn_del
+ * can remove extra links, which may cause the list traversal
+ * to hit items that have already been released.
+ */
+ while ((conn = list_first_entry_or_null(head,
+ struct hci_conn,
+ list)) != NULL) {
+ conn->state = BT_CLOSED;
+ hci_disconn_cfm(conn, HCI_ERROR_LOCAL_HOST_TERM);
+ hci_conn_del(conn);
}
}
-/* Drop all connection on the device */
-void hci_conn_hash_flush(struct hci_dev *hdev)
+static u32 get_link_mode(struct hci_conn *conn)
{
- struct hci_conn_hash *h = &hdev->conn_hash;
- struct list_head *p;
+ u32 link_mode = 0;
- BT_DBG("hdev %s", hdev->name);
+ if (conn->role == HCI_ROLE_MASTER)
+ link_mode |= HCI_LM_MASTER;
- p = h->list.next;
- while (p != &h->list) {
- struct hci_conn *c;
+ if (test_bit(HCI_CONN_ENCRYPT, &conn->flags))
+ link_mode |= HCI_LM_ENCRYPT;
- c = list_entry(p, struct hci_conn, list);
- p = p->next;
+ if (test_bit(HCI_CONN_AUTH, &conn->flags))
+ link_mode |= HCI_LM_AUTH;
- c->state = BT_CLOSED;
+ if (test_bit(HCI_CONN_SECURE, &conn->flags))
+ link_mode |= HCI_LM_SECURE;
- hci_proto_disconn_ind(c, 0x16);
- hci_conn_del(c);
- }
+ if (test_bit(HCI_CONN_FIPS, &conn->flags))
+ link_mode |= HCI_LM_FIPS;
+
+ return link_mode;
}
int hci_get_conn_list(void __user *arg)
{
+ struct hci_conn *c;
struct hci_conn_list_req req, *cl;
struct hci_conn_info *ci;
struct hci_dev *hdev;
- struct list_head *p;
int n = 0, size, err;
if (copy_from_user(&req, arg, sizeof(req)))
@@ -511,31 +2677,30 @@ int hci_get_conn_list(void __user *arg)
size = sizeof(req) + req.conn_num * sizeof(*ci);
- if (!(cl = kmalloc(size, GFP_KERNEL)))
+ cl = kmalloc(size, GFP_KERNEL);
+ if (!cl)
return -ENOMEM;
- if (!(hdev = hci_dev_get(req.dev_id))) {
+ hdev = hci_dev_get(req.dev_id);
+ if (!hdev) {
kfree(cl);
return -ENODEV;
}
ci = cl->conn_info;
- hci_dev_lock_bh(hdev);
- list_for_each(p, &hdev->conn_hash.list) {
- register struct hci_conn *c;
- c = list_entry(p, struct hci_conn, list);
-
+ hci_dev_lock(hdev);
+ list_for_each_entry(c, &hdev->conn_hash.list, list) {
bacpy(&(ci + n)->bdaddr, &c->dst);
(ci + n)->handle = c->handle;
(ci + n)->type = c->type;
(ci + n)->out = c->out;
(ci + n)->state = c->state;
- (ci + n)->link_mode = c->link_mode;
+ (ci + n)->link_mode = get_link_mode(c);
if (++n >= req.conn_num)
break;
}
- hci_dev_unlock_bh(hdev);
+ hci_dev_unlock(hdev);
cl->dev_id = hdev->id;
cl->conn_num = n;
@@ -559,7 +2724,7 @@ int hci_get_conn_info(struct hci_dev *hdev, void __user *arg)
if (copy_from_user(&req, arg, sizeof(req)))
return -EFAULT;
- hci_dev_lock_bh(hdev);
+ hci_dev_lock(hdev);
conn = hci_conn_hash_lookup_ba(hdev, req.type, &req.bdaddr);
if (conn) {
bacpy(&ci.bdaddr, &conn->dst);
@@ -567,12 +2732,456 @@ int hci_get_conn_info(struct hci_dev *hdev, void __user *arg)
ci.type = conn->type;
ci.out = conn->out;
ci.state = conn->state;
- ci.link_mode = conn->link_mode;
+ ci.link_mode = get_link_mode(conn);
}
- hci_dev_unlock_bh(hdev);
+ hci_dev_unlock(hdev);
if (!conn)
return -ENOENT;
return copy_to_user(ptr, &ci, sizeof(ci)) ? -EFAULT : 0;
}
+
+int hci_get_auth_info(struct hci_dev *hdev, void __user *arg)
+{
+ struct hci_auth_info_req req;
+ struct hci_conn *conn;
+
+ if (copy_from_user(&req, arg, sizeof(req)))
+ return -EFAULT;
+
+ hci_dev_lock(hdev);
+ conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &req.bdaddr);
+ if (conn)
+ req.type = conn->auth_type;
+ hci_dev_unlock(hdev);
+
+ if (!conn)
+ return -ENOENT;
+
+ return copy_to_user(arg, &req, sizeof(req)) ? -EFAULT : 0;
+}
+
+struct hci_chan *hci_chan_create(struct hci_conn *conn)
+{
+ struct hci_dev *hdev = conn->hdev;
+ struct hci_chan *chan;
+
+ BT_DBG("%s hcon %p", hdev->name, conn);
+
+ if (test_bit(HCI_CONN_DROP, &conn->flags)) {
+ BT_DBG("Refusing to create new hci_chan");
+ return NULL;
+ }
+
+ chan = kzalloc(sizeof(*chan), GFP_KERNEL);
+ if (!chan)
+ return NULL;
+
+ chan->conn = hci_conn_get(conn);
+ skb_queue_head_init(&chan->data_q);
+ chan->state = BT_CONNECTED;
+
+ list_add_rcu(&chan->list, &conn->chan_list);
+
+ return chan;
+}
+
+void hci_chan_del(struct hci_chan *chan)
+{
+ struct hci_conn *conn = chan->conn;
+ struct hci_dev *hdev = conn->hdev;
+
+ BT_DBG("%s hcon %p chan %p", hdev->name, conn, chan);
+
+ list_del_rcu(&chan->list);
+
+ synchronize_rcu();
+
+ /* Prevent new hci_chan's to be created for this hci_conn */
+ set_bit(HCI_CONN_DROP, &conn->flags);
+
+ hci_conn_put(conn);
+
+ skb_queue_purge(&chan->data_q);
+ kfree(chan);
+}
+
+void hci_chan_list_flush(struct hci_conn *conn)
+{
+ struct hci_chan *chan, *n;
+
+ BT_DBG("hcon %p", conn);
+
+ list_for_each_entry_safe(chan, n, &conn->chan_list, list)
+ hci_chan_del(chan);
+}
+
+static struct hci_chan *__hci_chan_lookup_handle(struct hci_conn *hcon,
+ __u16 handle)
+{
+ struct hci_chan *hchan;
+
+ list_for_each_entry(hchan, &hcon->chan_list, list) {
+ if (hchan->handle == handle)
+ return hchan;
+ }
+
+ return NULL;
+}
+
+struct hci_chan *hci_chan_lookup_handle(struct hci_dev *hdev, __u16 handle)
+{
+ struct hci_conn_hash *h = &hdev->conn_hash;
+ struct hci_conn *hcon;
+ struct hci_chan *hchan = NULL;
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(hcon, &h->list, list) {
+ hchan = __hci_chan_lookup_handle(hcon, handle);
+ if (hchan)
+ break;
+ }
+
+ rcu_read_unlock();
+
+ return hchan;
+}
+
+u32 hci_conn_get_phy(struct hci_conn *conn)
+{
+ u32 phys = 0;
+
+ /* BLUETOOTH CORE SPECIFICATION Version 5.2 | Vol 2, Part B page 471:
+ * Table 6.2: Packets defined for synchronous, asynchronous, and
+ * CPB logical transport types.
+ */
+ switch (conn->type) {
+ case SCO_LINK:
+ /* SCO logical transport (1 Mb/s):
+ * HV1, HV2, HV3 and DV.
+ */
+ phys |= BT_PHY_BR_1M_1SLOT;
+
+ break;
+
+ case ACL_LINK:
+ /* ACL logical transport (1 Mb/s) ptt=0:
+ * DH1, DM3, DH3, DM5 and DH5.
+ */
+ phys |= BT_PHY_BR_1M_1SLOT;
+
+ if (conn->pkt_type & (HCI_DM3 | HCI_DH3))
+ phys |= BT_PHY_BR_1M_3SLOT;
+
+ if (conn->pkt_type & (HCI_DM5 | HCI_DH5))
+ phys |= BT_PHY_BR_1M_5SLOT;
+
+ /* ACL logical transport (2 Mb/s) ptt=1:
+ * 2-DH1, 2-DH3 and 2-DH5.
+ */
+ if (!(conn->pkt_type & HCI_2DH1))
+ phys |= BT_PHY_EDR_2M_1SLOT;
+
+ if (!(conn->pkt_type & HCI_2DH3))
+ phys |= BT_PHY_EDR_2M_3SLOT;
+
+ if (!(conn->pkt_type & HCI_2DH5))
+ phys |= BT_PHY_EDR_2M_5SLOT;
+
+ /* ACL logical transport (3 Mb/s) ptt=1:
+ * 3-DH1, 3-DH3 and 3-DH5.
+ */
+ if (!(conn->pkt_type & HCI_3DH1))
+ phys |= BT_PHY_EDR_3M_1SLOT;
+
+ if (!(conn->pkt_type & HCI_3DH3))
+ phys |= BT_PHY_EDR_3M_3SLOT;
+
+ if (!(conn->pkt_type & HCI_3DH5))
+ phys |= BT_PHY_EDR_3M_5SLOT;
+
+ break;
+
+ case ESCO_LINK:
+ /* eSCO logical transport (1 Mb/s): EV3, EV4 and EV5 */
+ phys |= BT_PHY_BR_1M_1SLOT;
+
+ if (!(conn->pkt_type & (ESCO_EV4 | ESCO_EV5)))
+ phys |= BT_PHY_BR_1M_3SLOT;
+
+ /* eSCO logical transport (2 Mb/s): 2-EV3, 2-EV5 */
+ if (!(conn->pkt_type & ESCO_2EV3))
+ phys |= BT_PHY_EDR_2M_1SLOT;
+
+ if (!(conn->pkt_type & ESCO_2EV5))
+ phys |= BT_PHY_EDR_2M_3SLOT;
+
+ /* eSCO logical transport (3 Mb/s): 3-EV3, 3-EV5 */
+ if (!(conn->pkt_type & ESCO_3EV3))
+ phys |= BT_PHY_EDR_3M_1SLOT;
+
+ if (!(conn->pkt_type & ESCO_3EV5))
+ phys |= BT_PHY_EDR_3M_3SLOT;
+
+ break;
+
+ case LE_LINK:
+ if (conn->le_tx_phy & HCI_LE_SET_PHY_1M)
+ phys |= BT_PHY_LE_1M_TX;
+
+ if (conn->le_rx_phy & HCI_LE_SET_PHY_1M)
+ phys |= BT_PHY_LE_1M_RX;
+
+ if (conn->le_tx_phy & HCI_LE_SET_PHY_2M)
+ phys |= BT_PHY_LE_2M_TX;
+
+ if (conn->le_rx_phy & HCI_LE_SET_PHY_2M)
+ phys |= BT_PHY_LE_2M_RX;
+
+ if (conn->le_tx_phy & HCI_LE_SET_PHY_CODED)
+ phys |= BT_PHY_LE_CODED_TX;
+
+ if (conn->le_rx_phy & HCI_LE_SET_PHY_CODED)
+ phys |= BT_PHY_LE_CODED_RX;
+
+ break;
+ }
+
+ return phys;
+}
+
+static int abort_conn_sync(struct hci_dev *hdev, void *data)
+{
+ struct hci_conn *conn = data;
+
+ if (!hci_conn_valid(hdev, conn))
+ return -ECANCELED;
+
+ return hci_abort_conn_sync(hdev, conn, conn->abort_reason);
+}
+
+int hci_abort_conn(struct hci_conn *conn, u8 reason)
+{
+ struct hci_dev *hdev = conn->hdev;
+
+ /* If abort_reason has already been set it means the connection is
+ * already being aborted so don't attempt to overwrite it.
+ */
+ if (conn->abort_reason)
+ return 0;
+
+ bt_dev_dbg(hdev, "handle 0x%2.2x reason 0x%2.2x", conn->handle, reason);
+
+ conn->abort_reason = reason;
+
+ /* If the connection is pending check the command opcode since that
+ * might be blocking on hci_cmd_sync_work while waiting its respective
+ * event so we need to hci_cmd_sync_cancel to cancel it.
+ *
+ * hci_connect_le serializes the connection attempts so only one
+ * connection can be in BT_CONNECT at time.
+ */
+ if (conn->state == BT_CONNECT && hdev->req_status == HCI_REQ_PEND) {
+ switch (hci_skb_event(hdev->sent_cmd)) {
+ case HCI_EV_CONN_COMPLETE:
+ case HCI_EV_LE_CONN_COMPLETE:
+ case HCI_EV_LE_ENHANCED_CONN_COMPLETE:
+ case HCI_EVT_LE_CIS_ESTABLISHED:
+ hci_cmd_sync_cancel(hdev, ECANCELED);
+ break;
+ }
+ /* Cancel connect attempt if still queued/pending */
+ } else if (!hci_cancel_connect_sync(hdev, conn)) {
+ return 0;
+ }
+
+ /* Run immediately if on cmd_sync_work since this may be called
+ * as a result to MGMT_OP_DISCONNECT/MGMT_OP_UNPAIR which does
+ * already queue its callback on cmd_sync_work.
+ */
+ return hci_cmd_sync_run_once(hdev, abort_conn_sync, conn, NULL);
+}
+
+void hci_setup_tx_timestamp(struct sk_buff *skb, size_t key_offset,
+ const struct sockcm_cookie *sockc)
+{
+ struct sock *sk = skb ? skb->sk : NULL;
+ int key;
+
+ /* This shall be called on a single skb of those generated by user
+ * sendmsg(), and only when the sendmsg() does not return error to
+ * user. This is required for keeping the tskey that increments here in
+ * sync with possible sendmsg() counting by user.
+ *
+ * Stream sockets shall set key_offset to sendmsg() length in bytes
+ * and call with the last fragment, others to 1 and first fragment.
+ */
+
+ if (!skb || !sockc || !sk || !key_offset)
+ return;
+
+ sock_tx_timestamp(sk, sockc, &skb_shinfo(skb)->tx_flags);
+
+ if (sk->sk_type == SOCK_STREAM)
+ key = atomic_add_return(key_offset, &sk->sk_tskey);
+
+ if (sockc->tsflags & SOF_TIMESTAMPING_OPT_ID &&
+ sockc->tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK) {
+ if (sockc->tsflags & SOCKCM_FLAG_TS_OPT_ID) {
+ skb_shinfo(skb)->tskey = sockc->ts_opt_id;
+ } else {
+ if (sk->sk_type != SOCK_STREAM)
+ key = atomic_inc_return(&sk->sk_tskey);
+ skb_shinfo(skb)->tskey = key - 1;
+ }
+ }
+}
+
+void hci_conn_tx_queue(struct hci_conn *conn, struct sk_buff *skb)
+{
+ struct tx_queue *comp = &conn->tx_q;
+ bool track = false;
+
+ /* Emit SND now, ie. just before sending to driver */
+ if (skb_shinfo(skb)->tx_flags & SKBTX_SW_TSTAMP)
+ __skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SND);
+
+ /* COMPLETION tstamp is emitted for tracked skb later in Number of
+ * Completed Packets event. Available only for flow controlled cases.
+ *
+ * TODO: SCO support without flowctl (needs to be done in drivers)
+ */
+ switch (conn->type) {
+ case CIS_LINK:
+ case BIS_LINK:
+ case PA_LINK:
+ case ACL_LINK:
+ case LE_LINK:
+ break;
+ case SCO_LINK:
+ case ESCO_LINK:
+ if (!hci_dev_test_flag(conn->hdev, HCI_SCO_FLOWCTL))
+ return;
+ break;
+ default:
+ return;
+ }
+
+ if (skb->sk && (skb_shinfo(skb)->tx_flags & SKBTX_COMPLETION_TSTAMP))
+ track = true;
+
+ /* If nothing is tracked, just count extra skbs at the queue head */
+ if (!track && !comp->tracked) {
+ comp->extra++;
+ return;
+ }
+
+ if (track) {
+ skb = skb_clone_sk(skb);
+ if (!skb)
+ goto count_only;
+
+ comp->tracked++;
+ } else {
+ skb = skb_clone(skb, GFP_KERNEL);
+ if (!skb)
+ goto count_only;
+ }
+
+ skb_queue_tail(&comp->queue, skb);
+ return;
+
+count_only:
+ /* Stop tracking skbs, and only count. This will not emit timestamps for
+ * the packets, but if we get here something is more seriously wrong.
+ */
+ comp->tracked = 0;
+ comp->extra += skb_queue_len(&comp->queue) + 1;
+ skb_queue_purge(&comp->queue);
+}
+
+void hci_conn_tx_dequeue(struct hci_conn *conn)
+{
+ struct tx_queue *comp = &conn->tx_q;
+ struct sk_buff *skb;
+
+ /* If there are tracked skbs, the counted extra go before dequeuing real
+ * skbs, to keep ordering. When nothing is tracked, the ordering doesn't
+ * matter so dequeue real skbs first to get rid of them ASAP.
+ */
+ if (comp->extra && (comp->tracked || skb_queue_empty(&comp->queue))) {
+ comp->extra--;
+ return;
+ }
+
+ skb = skb_dequeue(&comp->queue);
+ if (!skb)
+ return;
+
+ if (skb->sk) {
+ comp->tracked--;
+ __skb_tstamp_tx(skb, NULL, NULL, skb->sk,
+ SCM_TSTAMP_COMPLETION);
+ }
+
+ kfree_skb(skb);
+}
+
+u8 *hci_conn_key_enc_size(struct hci_conn *conn)
+{
+ if (conn->type == ACL_LINK) {
+ struct link_key *key;
+
+ key = hci_find_link_key(conn->hdev, &conn->dst);
+ if (!key)
+ return NULL;
+
+ return &key->pin_len;
+ } else if (conn->type == LE_LINK) {
+ struct smp_ltk *ltk;
+
+ ltk = hci_find_ltk(conn->hdev, &conn->dst, conn->dst_type,
+ conn->role);
+ if (!ltk)
+ return NULL;
+
+ return &ltk->enc_size;
+ }
+
+ return NULL;
+}
+
+int hci_ethtool_ts_info(unsigned int index, int sk_proto,
+ struct kernel_ethtool_ts_info *info)
+{
+ struct hci_dev *hdev;
+
+ hdev = hci_dev_get(index);
+ if (!hdev)
+ return -ENODEV;
+
+ info->so_timestamping =
+ SOF_TIMESTAMPING_RX_SOFTWARE |
+ SOF_TIMESTAMPING_SOFTWARE;
+ info->phc_index = -1;
+ info->tx_types = BIT(HWTSTAMP_TX_OFF);
+ info->rx_filters = BIT(HWTSTAMP_FILTER_NONE);
+
+ switch (sk_proto) {
+ case BTPROTO_ISO:
+ case BTPROTO_L2CAP:
+ info->so_timestamping |= SOF_TIMESTAMPING_TX_SOFTWARE;
+ info->so_timestamping |= SOF_TIMESTAMPING_TX_COMPLETION;
+ break;
+ case BTPROTO_SCO:
+ info->so_timestamping |= SOF_TIMESTAMPING_TX_SOFTWARE;
+ if (hci_dev_test_flag(hdev, HCI_SCO_FLOWCTL))
+ info->so_timestamping |= SOF_TIMESTAMPING_TX_COMPLETION;
+ break;
+ }
+
+ hci_dev_put(hdev);
+ return 0;
+}
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index 338ae977a31b..8ccec73dce45 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -1,6 +1,7 @@
-/*
+/*
BlueZ - Bluetooth protocol stack for Linux
Copyright (C) 2000-2001 Qualcomm Incorporated
+ Copyright (C) 2011 ProFUSION Embedded Systems
Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com>
@@ -12,52 +13,43 @@
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
- CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
- WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
- COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+ ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+ COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
SOFTWARE IS DISCLAIMED.
*/
/* Bluetooth HCI core. */
-#include <linux/module.h>
-#include <linux/kmod.h>
-
-#include <linux/types.h>
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/poll.h>
-#include <linux/fcntl.h>
-#include <linux/init.h>
-#include <linux/skbuff.h>
-#include <linux/interrupt.h>
-#include <linux/notifier.h>
-#include <net/sock.h>
-
-#include <asm/system.h>
-#include <asm/uaccess.h>
-#include <asm/unaligned.h>
+#include <linux/export.h>
+#include <linux/rfkill.h>
+#include <linux/debugfs.h>
+#include <linux/crypto.h>
+#include <linux/kcov.h>
+#include <linux/property.h>
+#include <linux/suspend.h>
+#include <linux/wait.h>
+#include <linux/unaligned.h>
#include <net/bluetooth/bluetooth.h>
#include <net/bluetooth/hci_core.h>
+#include <net/bluetooth/l2cap.h>
+#include <net/bluetooth/mgmt.h>
-#ifndef CONFIG_BT_HCI_CORE_DEBUG
-#undef BT_DBG
-#define BT_DBG(D...)
-#endif
+#include "hci_debugfs.h"
+#include "smp.h"
+#include "leds.h"
+#include "msft.h"
+#include "aosp.h"
+#include "hci_codec.h"
-static void hci_cmd_task(unsigned long arg);
-static void hci_rx_task(unsigned long arg);
-static void hci_tx_task(unsigned long arg);
-static void hci_notify(struct hci_dev *hdev, int event);
-
-static DEFINE_RWLOCK(hci_task_lock);
+static void hci_rx_work(struct work_struct *work);
+static void hci_cmd_work(struct work_struct *work);
+static void hci_tx_work(struct work_struct *work);
/* HCI device list */
LIST_HEAD(hci_dev_list);
@@ -65,381 +57,359 @@ DEFINE_RWLOCK(hci_dev_list_lock);
/* HCI callback list */
LIST_HEAD(hci_cb_list);
-DEFINE_RWLOCK(hci_cb_list_lock);
+DEFINE_MUTEX(hci_cb_list_lock);
-/* HCI protocols */
-#define HCI_MAX_PROTO 2
-struct hci_proto *hci_proto[HCI_MAX_PROTO];
+/* HCI ID Numbering */
+static DEFINE_IDA(hci_index_ida);
-/* HCI notifiers list */
-static ATOMIC_NOTIFIER_HEAD(hci_notifier);
+/* Get HCI device by index.
+ * Device is held on return. */
+static struct hci_dev *__hci_dev_get(int index, int *srcu_index)
+{
+ struct hci_dev *hdev = NULL, *d;
-/* ---- HCI notifications ---- */
+ BT_DBG("%d", index);
-int hci_register_notifier(struct notifier_block *nb)
-{
- return atomic_notifier_chain_register(&hci_notifier, nb);
+ if (index < 0)
+ return NULL;
+
+ read_lock(&hci_dev_list_lock);
+ list_for_each_entry(d, &hci_dev_list, list) {
+ if (d->id == index) {
+ hdev = hci_dev_hold(d);
+ if (srcu_index)
+ *srcu_index = srcu_read_lock(&d->srcu);
+ break;
+ }
+ }
+ read_unlock(&hci_dev_list_lock);
+ return hdev;
}
-int hci_unregister_notifier(struct notifier_block *nb)
+struct hci_dev *hci_dev_get(int index)
{
- return atomic_notifier_chain_unregister(&hci_notifier, nb);
+ return __hci_dev_get(index, NULL);
}
-static void hci_notify(struct hci_dev *hdev, int event)
+static struct hci_dev *hci_dev_get_srcu(int index, int *srcu_index)
{
- atomic_notifier_call_chain(&hci_notifier, event, hdev);
+ return __hci_dev_get(index, srcu_index);
}
-/* ---- HCI requests ---- */
-
-void hci_req_complete(struct hci_dev *hdev, int result)
+static void hci_dev_put_srcu(struct hci_dev *hdev, int srcu_index)
{
- BT_DBG("%s result 0x%2.2x", hdev->name, result);
-
- if (hdev->req_status == HCI_REQ_PEND) {
- hdev->req_result = result;
- hdev->req_status = HCI_REQ_DONE;
- wake_up_interruptible(&hdev->req_wait_q);
- }
+ srcu_read_unlock(&hdev->srcu, srcu_index);
+ hci_dev_put(hdev);
}
-static void hci_req_cancel(struct hci_dev *hdev, int err)
+/* ---- Inquiry support ---- */
+
+bool hci_discovery_active(struct hci_dev *hdev)
{
- BT_DBG("%s err 0x%2.2x", hdev->name, err);
+ struct discovery_state *discov = &hdev->discovery;
- if (hdev->req_status == HCI_REQ_PEND) {
- hdev->req_result = err;
- hdev->req_status = HCI_REQ_CANCELED;
- wake_up_interruptible(&hdev->req_wait_q);
+ switch (discov->state) {
+ case DISCOVERY_FINDING:
+ case DISCOVERY_RESOLVING:
+ return true;
+
+ default:
+ return false;
}
}
-/* Execute request and wait for completion. */
-static int __hci_request(struct hci_dev *hdev, void (*req)(struct hci_dev *hdev, unsigned long opt),
- unsigned long opt, __u32 timeout)
+void hci_discovery_set_state(struct hci_dev *hdev, int state)
{
- DECLARE_WAITQUEUE(wait, current);
- int err = 0;
-
- BT_DBG("%s start", hdev->name);
-
- hdev->req_status = HCI_REQ_PEND;
-
- add_wait_queue(&hdev->req_wait_q, &wait);
- set_current_state(TASK_INTERRUPTIBLE);
+ int old_state = hdev->discovery.state;
- req(hdev, opt);
- schedule_timeout(timeout);
+ if (old_state == state)
+ return;
- remove_wait_queue(&hdev->req_wait_q, &wait);
+ hdev->discovery.state = state;
- if (signal_pending(current))
- return -EINTR;
+ switch (state) {
+ case DISCOVERY_STOPPED:
+ hci_update_passive_scan(hdev);
- switch (hdev->req_status) {
- case HCI_REQ_DONE:
- err = -bt_err(hdev->req_result);
+ if (old_state != DISCOVERY_STARTING)
+ mgmt_discovering(hdev, 0);
break;
-
- case HCI_REQ_CANCELED:
- err = -hdev->req_result;
+ case DISCOVERY_STARTING:
break;
-
- default:
- err = -ETIMEDOUT;
+ case DISCOVERY_FINDING:
+ mgmt_discovering(hdev, 1);
break;
- };
-
- hdev->req_status = hdev->req_result = 0;
-
- BT_DBG("%s end: err %d", hdev->name, err);
+ case DISCOVERY_RESOLVING:
+ break;
+ case DISCOVERY_STOPPING:
+ break;
+ }
- return err;
+ bt_dev_dbg(hdev, "state %u -> %u", old_state, state);
}
-static inline int hci_request(struct hci_dev *hdev, void (*req)(struct hci_dev *hdev, unsigned long opt),
- unsigned long opt, __u32 timeout)
+void hci_inquiry_cache_flush(struct hci_dev *hdev)
{
- int ret;
-
- /* Serialize all requests */
- hci_req_lock(hdev);
- ret = __hci_request(hdev, req, opt, timeout);
- hci_req_unlock(hdev);
+ struct discovery_state *cache = &hdev->discovery;
+ struct inquiry_entry *p, *n;
- return ret;
-}
-
-static void hci_reset_req(struct hci_dev *hdev, unsigned long opt)
-{
- BT_DBG("%s %ld", hdev->name, opt);
+ list_for_each_entry_safe(p, n, &cache->all, all) {
+ list_del(&p->all);
+ kfree(p);
+ }
- /* Reset device */
- hci_send_cmd(hdev, OGF_HOST_CTL, OCF_RESET, 0, NULL);
+ INIT_LIST_HEAD(&cache->unknown);
+ INIT_LIST_HEAD(&cache->resolve);
}
-static void hci_init_req(struct hci_dev *hdev, unsigned long opt)
+struct inquiry_entry *hci_inquiry_cache_lookup(struct hci_dev *hdev,
+ bdaddr_t *bdaddr)
{
- struct sk_buff *skb;
- __le16 param;
-
- BT_DBG("%s %ld", hdev->name, opt);
+ struct discovery_state *cache = &hdev->discovery;
+ struct inquiry_entry *e;
- /* Driver initialization */
+ BT_DBG("cache %p, %pMR", cache, bdaddr);
- /* Special commands */
- while ((skb = skb_dequeue(&hdev->driver_init))) {
- bt_cb(skb)->pkt_type = HCI_COMMAND_PKT;
- skb->dev = (void *) hdev;
- skb_queue_tail(&hdev->cmd_q, skb);
- hci_sched_cmd(hdev);
+ list_for_each_entry(e, &cache->all, all) {
+ if (!bacmp(&e->data.bdaddr, bdaddr))
+ return e;
}
- skb_queue_purge(&hdev->driver_init);
-
- /* Mandatory initialization */
-
- /* Reset */
- if (test_bit(HCI_QUIRK_RESET_ON_INIT, &hdev->quirks))
- hci_send_cmd(hdev, OGF_HOST_CTL, OCF_RESET, 0, NULL);
- /* Read Local Supported Features */
- hci_send_cmd(hdev, OGF_INFO_PARAM, OCF_READ_LOCAL_FEATURES, 0, NULL);
+ return NULL;
+}
- /* Read Local Version */
- hci_send_cmd(hdev, OGF_INFO_PARAM, OCF_READ_LOCAL_VERSION, 0, NULL);
+struct inquiry_entry *hci_inquiry_cache_lookup_unknown(struct hci_dev *hdev,
+ bdaddr_t *bdaddr)
+{
+ struct discovery_state *cache = &hdev->discovery;
+ struct inquiry_entry *e;
- /* Read Buffer Size (ACL mtu, max pkt, etc.) */
- hci_send_cmd(hdev, OGF_INFO_PARAM, OCF_READ_BUFFER_SIZE, 0, NULL);
+ BT_DBG("cache %p, %pMR", cache, bdaddr);
-#if 0
- /* Host buffer size */
- {
- struct hci_cp_host_buffer_size cp;
- cp.acl_mtu = __cpu_to_le16(HCI_MAX_ACL_SIZE);
- cp.sco_mtu = HCI_MAX_SCO_SIZE;
- cp.acl_max_pkt = __cpu_to_le16(0xffff);
- cp.sco_max_pkt = __cpu_to_le16(0xffff);
- hci_send_cmd(hdev, OGF_HOST_CTL, OCF_HOST_BUFFER_SIZE, sizeof(cp), &cp);
+ list_for_each_entry(e, &cache->unknown, list) {
+ if (!bacmp(&e->data.bdaddr, bdaddr))
+ return e;
}
-#endif
- /* Read BD Address */
- hci_send_cmd(hdev, OGF_INFO_PARAM, OCF_READ_BD_ADDR, 0, NULL);
+ return NULL;
+}
- /* Read Voice Setting */
- hci_send_cmd(hdev, OGF_HOST_CTL, OCF_READ_VOICE_SETTING, 0, NULL);
+struct inquiry_entry *hci_inquiry_cache_lookup_resolve(struct hci_dev *hdev,
+ bdaddr_t *bdaddr,
+ int state)
+{
+ struct discovery_state *cache = &hdev->discovery;
+ struct inquiry_entry *e;
- /* Optional initialization */
+ BT_DBG("cache %p bdaddr %pMR state %d", cache, bdaddr, state);
- /* Clear Event Filters */
- {
- struct hci_cp_set_event_flt cp;
- cp.flt_type = HCI_FLT_CLEAR_ALL;
- hci_send_cmd(hdev, OGF_HOST_CTL, OCF_SET_EVENT_FLT, sizeof(cp), &cp);
+ list_for_each_entry(e, &cache->resolve, list) {
+ if (!bacmp(bdaddr, BDADDR_ANY) && e->name_state == state)
+ return e;
+ if (!bacmp(&e->data.bdaddr, bdaddr))
+ return e;
}
- /* Page timeout ~20 secs */
- param = __cpu_to_le16(0x8000);
- hci_send_cmd(hdev, OGF_HOST_CTL, OCF_WRITE_PG_TIMEOUT, 2, &param);
-
- /* Connection accept timeout ~20 secs */
- param = __cpu_to_le16(0x7d00);
- hci_send_cmd(hdev, OGF_HOST_CTL, OCF_WRITE_CA_TIMEOUT, 2, &param);
+ return NULL;
}
-static void hci_scan_req(struct hci_dev *hdev, unsigned long opt)
+void hci_inquiry_cache_update_resolve(struct hci_dev *hdev,
+ struct inquiry_entry *ie)
{
- __u8 scan = opt;
+ struct discovery_state *cache = &hdev->discovery;
+ struct list_head *pos = &cache->resolve;
+ struct inquiry_entry *p;
- BT_DBG("%s %x", hdev->name, scan);
+ list_del(&ie->list);
- /* Inquiry and Page scans */
- hci_send_cmd(hdev, OGF_HOST_CTL, OCF_WRITE_SCAN_ENABLE, 1, &scan);
-}
-
-static void hci_auth_req(struct hci_dev *hdev, unsigned long opt)
-{
- __u8 auth = opt;
-
- BT_DBG("%s %x", hdev->name, auth);
+ list_for_each_entry(p, &cache->resolve, list) {
+ if (p->name_state != NAME_PENDING &&
+ abs(p->data.rssi) >= abs(ie->data.rssi))
+ break;
+ pos = &p->list;
+ }
- /* Authentication */
- hci_send_cmd(hdev, OGF_HOST_CTL, OCF_WRITE_AUTH_ENABLE, 1, &auth);
+ list_add(&ie->list, pos);
}
-static void hci_encrypt_req(struct hci_dev *hdev, unsigned long opt)
+u32 hci_inquiry_cache_update(struct hci_dev *hdev, struct inquiry_data *data,
+ bool name_known)
{
- __u8 encrypt = opt;
+ struct discovery_state *cache = &hdev->discovery;
+ struct inquiry_entry *ie;
+ u32 flags = 0;
- BT_DBG("%s %x", hdev->name, encrypt);
+ BT_DBG("cache %p, %pMR", cache, &data->bdaddr);
- /* Authentication */
- hci_send_cmd(hdev, OGF_HOST_CTL, OCF_WRITE_ENCRYPT_MODE, 1, &encrypt);
-}
+ hci_remove_remote_oob_data(hdev, &data->bdaddr, BDADDR_BREDR);
-/* Get HCI device by index.
- * Device is held on return. */
-struct hci_dev *hci_dev_get(int index)
-{
- struct hci_dev *hdev = NULL;
- struct list_head *p;
-
- BT_DBG("%d", index);
+ if (!data->ssp_mode)
+ flags |= MGMT_DEV_FOUND_LEGACY_PAIRING;
- if (index < 0)
- return NULL;
+ ie = hci_inquiry_cache_lookup(hdev, &data->bdaddr);
+ if (ie) {
+ if (!ie->data.ssp_mode)
+ flags |= MGMT_DEV_FOUND_LEGACY_PAIRING;
- read_lock(&hci_dev_list_lock);
- list_for_each(p, &hci_dev_list) {
- struct hci_dev *d = list_entry(p, struct hci_dev, list);
- if (d->id == index) {
- hdev = hci_dev_hold(d);
- break;
+ if (ie->name_state == NAME_NEEDED &&
+ data->rssi != ie->data.rssi) {
+ ie->data.rssi = data->rssi;
+ hci_inquiry_cache_update_resolve(hdev, ie);
}
- }
- read_unlock(&hci_dev_list_lock);
- return hdev;
-}
-
-/* ---- Inquiry support ---- */
-static void inquiry_cache_flush(struct hci_dev *hdev)
-{
- struct inquiry_cache *cache = &hdev->inq_cache;
- struct inquiry_entry *next = cache->list, *e;
- BT_DBG("cache %p", cache);
-
- cache->list = NULL;
- while ((e = next)) {
- next = e->next;
- kfree(e);
+ goto update;
}
-}
-
-struct inquiry_entry *hci_inquiry_cache_lookup(struct hci_dev *hdev, bdaddr_t *bdaddr)
-{
- struct inquiry_cache *cache = &hdev->inq_cache;
- struct inquiry_entry *e;
- BT_DBG("cache %p, %s", cache, batostr(bdaddr));
-
- for (e = cache->list; e; e = e->next)
- if (!bacmp(&e->data.bdaddr, bdaddr))
- break;
- return e;
-}
+ /* Entry not in the cache. Add new one. */
+ ie = kzalloc(sizeof(*ie), GFP_KERNEL);
+ if (!ie) {
+ flags |= MGMT_DEV_FOUND_CONFIRM_NAME;
+ goto done;
+ }
-void hci_inquiry_cache_update(struct hci_dev *hdev, struct inquiry_data *data)
-{
- struct inquiry_cache *cache = &hdev->inq_cache;
- struct inquiry_entry *e;
+ list_add(&ie->all, &cache->all);
- BT_DBG("cache %p, %s", cache, batostr(&data->bdaddr));
+ if (name_known) {
+ ie->name_state = NAME_KNOWN;
+ } else {
+ ie->name_state = NAME_NOT_KNOWN;
+ list_add(&ie->list, &cache->unknown);
+ }
- if (!(e = hci_inquiry_cache_lookup(hdev, &data->bdaddr))) {
- /* Entry not in the cache. Add new one. */
- if (!(e = kzalloc(sizeof(struct inquiry_entry), GFP_ATOMIC)))
- return;
- e->next = cache->list;
- cache->list = e;
+update:
+ if (name_known && ie->name_state != NAME_KNOWN &&
+ ie->name_state != NAME_PENDING) {
+ ie->name_state = NAME_KNOWN;
+ list_del(&ie->list);
}
- memcpy(&e->data, data, sizeof(*data));
- e->timestamp = jiffies;
+ memcpy(&ie->data, data, sizeof(*data));
+ ie->timestamp = jiffies;
cache->timestamp = jiffies;
+
+ if (ie->name_state == NAME_NOT_KNOWN)
+ flags |= MGMT_DEV_FOUND_CONFIRM_NAME;
+
+done:
+ return flags;
}
static int inquiry_cache_dump(struct hci_dev *hdev, int num, __u8 *buf)
{
- struct inquiry_cache *cache = &hdev->inq_cache;
+ struct discovery_state *cache = &hdev->discovery;
struct inquiry_info *info = (struct inquiry_info *) buf;
struct inquiry_entry *e;
int copied = 0;
- for (e = cache->list; e && copied < num; e = e->next, copied++) {
+ list_for_each_entry(e, &cache->all, all) {
struct inquiry_data *data = &e->data;
+
+ if (copied >= num)
+ break;
+
bacpy(&info->bdaddr, &data->bdaddr);
info->pscan_rep_mode = data->pscan_rep_mode;
info->pscan_period_mode = data->pscan_period_mode;
info->pscan_mode = data->pscan_mode;
memcpy(info->dev_class, data->dev_class, 3);
info->clock_offset = data->clock_offset;
+
info++;
+ copied++;
}
BT_DBG("cache %p, copied %d", cache, copied);
return copied;
}
-static void hci_inq_req(struct hci_dev *hdev, unsigned long opt)
-{
- struct hci_inquiry_req *ir = (struct hci_inquiry_req *) opt;
- struct hci_cp_inquiry cp;
-
- BT_DBG("%s", hdev->name);
-
- if (test_bit(HCI_INQUIRY, &hdev->flags))
- return;
-
- /* Start Inquiry */
- memcpy(&cp.lap, &ir->lap, 3);
- cp.length = ir->length;
- cp.num_rsp = ir->num_rsp;
- hci_send_cmd(hdev, OGF_LINK_CTL, OCF_INQUIRY, sizeof(cp), &cp);
-}
-
int hci_inquiry(void __user *arg)
{
__u8 __user *ptr = arg;
struct hci_inquiry_req ir;
struct hci_dev *hdev;
int err = 0, do_inquiry = 0, max_rsp;
- long timeo;
__u8 *buf;
if (copy_from_user(&ir, ptr, sizeof(ir)))
return -EFAULT;
- if (!(hdev = hci_dev_get(ir.dev_id)))
+ hdev = hci_dev_get(ir.dev_id);
+ if (!hdev)
return -ENODEV;
- hci_dev_lock_bh(hdev);
- if (inquiry_cache_age(hdev) > INQUIRY_CACHE_AGE_MAX ||
- inquiry_cache_empty(hdev) ||
- ir.flags & IREQ_CACHE_FLUSH) {
- inquiry_cache_flush(hdev);
- do_inquiry = 1;
+ if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) {
+ err = -EBUSY;
+ goto done;
}
- hci_dev_unlock_bh(hdev);
- timeo = ir.length * msecs_to_jiffies(2000);
- if (do_inquiry && (err = hci_request(hdev, hci_inq_req, (unsigned long)&ir, timeo)) < 0)
+ if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) {
+ err = -EOPNOTSUPP;
goto done;
+ }
+
+ if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) {
+ err = -EOPNOTSUPP;
+ goto done;
+ }
- /* for unlimited number of responses we will use buffer with 255 entries */
+ /* Restrict maximum inquiry length to 60 seconds */
+ if (ir.length > 60) {
+ err = -EINVAL;
+ goto done;
+ }
+
+ hci_dev_lock(hdev);
+ if (inquiry_cache_age(hdev) > INQUIRY_CACHE_AGE_MAX ||
+ inquiry_cache_empty(hdev) || ir.flags & IREQ_CACHE_FLUSH) {
+ hci_inquiry_cache_flush(hdev);
+ do_inquiry = 1;
+ }
+ hci_dev_unlock(hdev);
+
+ if (do_inquiry) {
+ hci_req_sync_lock(hdev);
+ err = hci_inquiry_sync(hdev, ir.length, ir.num_rsp);
+ hci_req_sync_unlock(hdev);
+
+ if (err < 0)
+ goto done;
+
+ /* Wait until Inquiry procedure finishes (HCI_INQUIRY flag is
+ * cleared). If it is interrupted by a signal, return -EINTR.
+ */
+ if (wait_on_bit(&hdev->flags, HCI_INQUIRY,
+ TASK_INTERRUPTIBLE)) {
+ err = -EINTR;
+ goto done;
+ }
+ }
+
+ /* for unlimited number of responses we will use buffer with
+ * 255 entries
+ */
max_rsp = (ir.num_rsp == 0) ? 255 : ir.num_rsp;
/* cache_dump can't sleep. Therefore we allocate temp buffer and then
* copy it to the user space.
*/
- if (!(buf = kmalloc(sizeof(struct inquiry_info) * max_rsp, GFP_KERNEL))) {
+ buf = kmalloc_array(max_rsp, sizeof(struct inquiry_info), GFP_KERNEL);
+ if (!buf) {
err = -ENOMEM;
goto done;
}
- hci_dev_lock_bh(hdev);
+ hci_dev_lock(hdev);
ir.num_rsp = inquiry_cache_dump(hdev, max_rsp, buf);
- hci_dev_unlock_bh(hdev);
+ hci_dev_unlock(hdev);
BT_DBG("num_rsp %d", ir.num_rsp);
if (!copy_to_user(ptr, &ir, sizeof(ir))) {
ptr += sizeof(ir);
if (copy_to_user(ptr, buf, sizeof(struct inquiry_info) *
- ir.num_rsp))
+ ir.num_rsp))
err = -EFAULT;
- } else
+ } else
err = -EFAULT;
kfree(buf);
@@ -449,188 +419,199 @@ done:
return err;
}
+static int hci_dev_do_open(struct hci_dev *hdev)
+{
+ int ret = 0;
+
+ BT_DBG("%s %p", hdev->name, hdev);
+
+ hci_req_sync_lock(hdev);
+
+ ret = hci_dev_open_sync(hdev);
+
+ hci_req_sync_unlock(hdev);
+ return ret;
+}
+
/* ---- HCI ioctl helpers ---- */
int hci_dev_open(__u16 dev)
{
struct hci_dev *hdev;
- int ret = 0;
+ int err;
- if (!(hdev = hci_dev_get(dev)))
+ hdev = hci_dev_get(dev);
+ if (!hdev)
return -ENODEV;
- BT_DBG("%s %p", hdev->name, hdev);
-
- hci_req_lock(hdev);
-
- if (test_bit(HCI_UP, &hdev->flags)) {
- ret = -EALREADY;
+ /* Devices that are marked as unconfigured can only be powered
+ * up as user channel. Trying to bring them up as normal devices
+ * will result into a failure. Only user channel operation is
+ * possible.
+ *
+ * When this function is called for a user channel, the flag
+ * HCI_USER_CHANNEL will be set first before attempting to
+ * open the device.
+ */
+ if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED) &&
+ !hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) {
+ err = -EOPNOTSUPP;
goto done;
}
- if (test_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks))
- set_bit(HCI_RAW, &hdev->flags);
+ /* We need to ensure that no other power on/off work is pending
+ * before proceeding to call hci_dev_do_open. This is
+ * particularly important if the setup procedure has not yet
+ * completed.
+ */
+ if (hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF))
+ cancel_delayed_work(&hdev->power_off);
- if (hdev->open(hdev)) {
- ret = -EIO;
- goto done;
- }
+ /* After this call it is guaranteed that the setup procedure
+ * has finished. This means that error conditions like RFKILL
+ * or no valid public or static random address apply.
+ */
+ flush_workqueue(hdev->req_workqueue);
- if (!test_bit(HCI_RAW, &hdev->flags)) {
- atomic_set(&hdev->cmd_cnt, 1);
- set_bit(HCI_INIT, &hdev->flags);
+ /* For controllers not using the management interface and that
+ * are brought up using legacy ioctl, set the HCI_BONDABLE bit
+ * so that pairing works for them. Once the management interface
+ * is in use this bit will be cleared again and userspace has
+ * to explicitly enable it.
+ */
+ if (!hci_dev_test_flag(hdev, HCI_USER_CHANNEL) &&
+ !hci_dev_test_flag(hdev, HCI_MGMT))
+ hci_dev_set_flag(hdev, HCI_BONDABLE);
- //__hci_request(hdev, hci_reset_req, 0, HZ);
- ret = __hci_request(hdev, hci_init_req, 0,
- msecs_to_jiffies(HCI_INIT_TIMEOUT));
+ err = hci_dev_do_open(hdev);
- clear_bit(HCI_INIT, &hdev->flags);
- }
+done:
+ hci_dev_put(hdev);
+ return err;
+}
- if (!ret) {
- hci_dev_hold(hdev);
- set_bit(HCI_UP, &hdev->flags);
- hci_notify(hdev, HCI_DEV_UP);
- } else {
- /* Init failed, cleanup */
- tasklet_kill(&hdev->rx_task);
- tasklet_kill(&hdev->tx_task);
- tasklet_kill(&hdev->cmd_task);
+int hci_dev_do_close(struct hci_dev *hdev)
+{
+ int err;
- skb_queue_purge(&hdev->cmd_q);
- skb_queue_purge(&hdev->rx_q);
+ BT_DBG("%s %p", hdev->name, hdev);
- if (hdev->flush)
- hdev->flush(hdev);
+ hci_req_sync_lock(hdev);
- if (hdev->sent_cmd) {
- kfree_skb(hdev->sent_cmd);
- hdev->sent_cmd = NULL;
- }
+ err = hci_dev_close_sync(hdev);
- hdev->close(hdev);
- hdev->flags = 0;
- }
+ hci_req_sync_unlock(hdev);
-done:
- hci_req_unlock(hdev);
- hci_dev_put(hdev);
- return ret;
+ return err;
}
-static int hci_dev_do_close(struct hci_dev *hdev)
+int hci_dev_close(__u16 dev)
{
- BT_DBG("%s %p", hdev->name, hdev);
+ struct hci_dev *hdev;
+ int err;
- hci_req_cancel(hdev, ENODEV);
- hci_req_lock(hdev);
+ hdev = hci_dev_get(dev);
+ if (!hdev)
+ return -ENODEV;
- if (!test_and_clear_bit(HCI_UP, &hdev->flags)) {
- hci_req_unlock(hdev);
- return 0;
+ if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) {
+ err = -EBUSY;
+ goto done;
}
- /* Kill RX and TX tasks */
- tasklet_kill(&hdev->rx_task);
- tasklet_kill(&hdev->tx_task);
+ cancel_work_sync(&hdev->power_on);
+ if (hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF))
+ cancel_delayed_work(&hdev->power_off);
- hci_dev_lock_bh(hdev);
- inquiry_cache_flush(hdev);
- hci_conn_hash_flush(hdev);
- hci_dev_unlock_bh(hdev);
+ err = hci_dev_do_close(hdev);
- hci_notify(hdev, HCI_DEV_DOWN);
+done:
+ hci_dev_put(hdev);
+ return err;
+}
- if (hdev->flush)
- hdev->flush(hdev);
+static int hci_dev_do_reset(struct hci_dev *hdev)
+{
+ int ret;
- /* Reset device */
- skb_queue_purge(&hdev->cmd_q);
- atomic_set(&hdev->cmd_cnt, 1);
- if (!test_bit(HCI_RAW, &hdev->flags)) {
- set_bit(HCI_INIT, &hdev->flags);
- __hci_request(hdev, hci_reset_req, 0,
- msecs_to_jiffies(250));
- clear_bit(HCI_INIT, &hdev->flags);
- }
+ BT_DBG("%s %p", hdev->name, hdev);
- /* Kill cmd task */
- tasklet_kill(&hdev->cmd_task);
+ hci_req_sync_lock(hdev);
/* Drop queues */
skb_queue_purge(&hdev->rx_q);
skb_queue_purge(&hdev->cmd_q);
- skb_queue_purge(&hdev->raw_q);
- /* Drop last sent command */
- if (hdev->sent_cmd) {
- kfree_skb(hdev->sent_cmd);
- hdev->sent_cmd = NULL;
- }
+ /* Cancel these to avoid queueing non-chained pending work */
+ hci_dev_set_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE);
+ /* Wait for
+ *
+ * if (!hci_dev_test_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE))
+ * queue_delayed_work(&hdev->{cmd,ncmd}_timer)
+ *
+ * inside RCU section to see the flag or complete scheduling.
+ */
+ synchronize_rcu();
+ /* Explicitly cancel works in case scheduled after setting the flag. */
+ cancel_delayed_work(&hdev->cmd_timer);
+ cancel_delayed_work(&hdev->ncmd_timer);
- /* After this point our queues are empty
- * and no tasks are scheduled. */
- hdev->close(hdev);
+ /* Avoid potential lockdep warnings from the *_flush() calls by
+ * ensuring the workqueue is empty up front.
+ */
+ drain_workqueue(hdev->workqueue);
+
+ hci_dev_lock(hdev);
+ hci_inquiry_cache_flush(hdev);
+ hci_conn_hash_flush(hdev);
+ hci_dev_unlock(hdev);
- /* Clear flags */
- hdev->flags = 0;
+ if (hdev->flush)
+ hdev->flush(hdev);
- hci_req_unlock(hdev);
+ hci_dev_clear_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE);
- hci_dev_put(hdev);
- return 0;
-}
+ atomic_set(&hdev->cmd_cnt, 1);
+ hdev->acl_cnt = 0;
+ hdev->sco_cnt = 0;
+ hdev->le_cnt = 0;
+ hdev->iso_cnt = 0;
-int hci_dev_close(__u16 dev)
-{
- struct hci_dev *hdev;
- int err;
+ ret = hci_reset_sync(hdev);
- if (!(hdev = hci_dev_get(dev)))
- return -ENODEV;
- err = hci_dev_do_close(hdev);
- hci_dev_put(hdev);
- return err;
+ hci_req_sync_unlock(hdev);
+ return ret;
}
int hci_dev_reset(__u16 dev)
{
struct hci_dev *hdev;
- int ret = 0;
+ int err, srcu_index;
- if (!(hdev = hci_dev_get(dev)))
+ hdev = hci_dev_get_srcu(dev, &srcu_index);
+ if (!hdev)
return -ENODEV;
- hci_req_lock(hdev);
- tasklet_disable(&hdev->tx_task);
-
- if (!test_bit(HCI_UP, &hdev->flags))
+ if (!test_bit(HCI_UP, &hdev->flags)) {
+ err = -ENETDOWN;
goto done;
+ }
- /* Drop queues */
- skb_queue_purge(&hdev->rx_q);
- skb_queue_purge(&hdev->cmd_q);
-
- hci_dev_lock_bh(hdev);
- inquiry_cache_flush(hdev);
- hci_conn_hash_flush(hdev);
- hci_dev_unlock_bh(hdev);
-
- if (hdev->flush)
- hdev->flush(hdev);
+ if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) {
+ err = -EBUSY;
+ goto done;
+ }
- atomic_set(&hdev->cmd_cnt, 1);
- hdev->acl_cnt = 0; hdev->sco_cnt = 0;
+ if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) {
+ err = -EOPNOTSUPP;
+ goto done;
+ }
- if (!test_bit(HCI_RAW, &hdev->flags))
- ret = __hci_request(hdev, hci_reset_req, 0,
- msecs_to_jiffies(HCI_INIT_TIMEOUT));
+ err = hci_dev_do_reset(hdev);
done:
- tasklet_enable(&hdev->tx_task);
- hci_req_unlock(hdev);
- hci_dev_put(hdev);
- return ret;
+ hci_dev_put_srcu(hdev, srcu_index);
+ return err;
}
int hci_dev_reset_stat(__u16 dev)
@@ -638,32 +619,96 @@ int hci_dev_reset_stat(__u16 dev)
struct hci_dev *hdev;
int ret = 0;
- if (!(hdev = hci_dev_get(dev)))
+ hdev = hci_dev_get(dev);
+ if (!hdev)
return -ENODEV;
+ if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) {
+ ret = -EBUSY;
+ goto done;
+ }
+
+ if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) {
+ ret = -EOPNOTSUPP;
+ goto done;
+ }
+
memset(&hdev->stat, 0, sizeof(struct hci_dev_stats));
+done:
hci_dev_put(hdev);
-
return ret;
}
+static void hci_update_passive_scan_state(struct hci_dev *hdev, u8 scan)
+{
+ bool conn_changed, discov_changed;
+
+ BT_DBG("%s scan 0x%02x", hdev->name, scan);
+
+ if ((scan & SCAN_PAGE))
+ conn_changed = !hci_dev_test_and_set_flag(hdev,
+ HCI_CONNECTABLE);
+ else
+ conn_changed = hci_dev_test_and_clear_flag(hdev,
+ HCI_CONNECTABLE);
+
+ if ((scan & SCAN_INQUIRY)) {
+ discov_changed = !hci_dev_test_and_set_flag(hdev,
+ HCI_DISCOVERABLE);
+ } else {
+ hci_dev_clear_flag(hdev, HCI_LIMITED_DISCOVERABLE);
+ discov_changed = hci_dev_test_and_clear_flag(hdev,
+ HCI_DISCOVERABLE);
+ }
+
+ if (!hci_dev_test_flag(hdev, HCI_MGMT))
+ return;
+
+ if (conn_changed || discov_changed) {
+ /* In case this was disabled through mgmt */
+ hci_dev_set_flag(hdev, HCI_BREDR_ENABLED);
+
+ if (hci_dev_test_flag(hdev, HCI_LE_ENABLED))
+ hci_update_adv_data(hdev, hdev->cur_adv_instance);
+
+ mgmt_new_settings(hdev);
+ }
+}
+
int hci_dev_cmd(unsigned int cmd, void __user *arg)
{
struct hci_dev *hdev;
struct hci_dev_req dr;
+ __le16 policy;
int err = 0;
if (copy_from_user(&dr, arg, sizeof(dr)))
return -EFAULT;
- if (!(hdev = hci_dev_get(dr.dev_id)))
+ hdev = hci_dev_get(dr.dev_id);
+ if (!hdev)
return -ENODEV;
+ if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) {
+ err = -EBUSY;
+ goto done;
+ }
+
+ if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) {
+ err = -EOPNOTSUPP;
+ goto done;
+ }
+
+ if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) {
+ err = -EOPNOTSUPP;
+ goto done;
+ }
+
switch (cmd) {
case HCISETAUTH:
- err = hci_request(hdev, hci_auth_req, dr.dev_opt,
- msecs_to_jiffies(HCI_INIT_TIMEOUT));
+ err = hci_cmd_sync_status(hdev, HCI_OP_WRITE_AUTH_ENABLE,
+ 1, &dr.dev_opt, HCI_CMD_TIMEOUT);
break;
case HCISETENCRYPT:
@@ -674,57 +719,75 @@ int hci_dev_cmd(unsigned int cmd, void __user *arg)
if (!test_bit(HCI_AUTH, &hdev->flags)) {
/* Auth must be enabled first */
- err = hci_request(hdev, hci_auth_req, dr.dev_opt,
- msecs_to_jiffies(HCI_INIT_TIMEOUT));
+ err = hci_cmd_sync_status(hdev,
+ HCI_OP_WRITE_AUTH_ENABLE,
+ 1, &dr.dev_opt,
+ HCI_CMD_TIMEOUT);
if (err)
break;
}
- err = hci_request(hdev, hci_encrypt_req, dr.dev_opt,
- msecs_to_jiffies(HCI_INIT_TIMEOUT));
+ err = hci_cmd_sync_status(hdev, HCI_OP_WRITE_ENCRYPT_MODE,
+ 1, &dr.dev_opt, HCI_CMD_TIMEOUT);
break;
case HCISETSCAN:
- err = hci_request(hdev, hci_scan_req, dr.dev_opt,
- msecs_to_jiffies(HCI_INIT_TIMEOUT));
- break;
-
- case HCISETPTYPE:
- hdev->pkt_type = (__u16) dr.dev_opt;
+ err = hci_cmd_sync_status(hdev, HCI_OP_WRITE_SCAN_ENABLE,
+ 1, &dr.dev_opt, HCI_CMD_TIMEOUT);
+
+ /* Ensure that the connectable and discoverable states
+ * get correctly modified as this was a non-mgmt change.
+ */
+ if (!err)
+ hci_update_passive_scan_state(hdev, dr.dev_opt);
break;
case HCISETLINKPOL:
- hdev->link_policy = (__u16) dr.dev_opt;
+ policy = cpu_to_le16(dr.dev_opt);
+
+ err = hci_cmd_sync_status(hdev, HCI_OP_WRITE_DEF_LINK_POLICY,
+ 2, &policy, HCI_CMD_TIMEOUT);
break;
case HCISETLINKMODE:
- hdev->link_mode = ((__u16) dr.dev_opt) & (HCI_LM_MASTER | HCI_LM_ACCEPT);
+ hdev->link_mode = ((__u16) dr.dev_opt) &
+ (HCI_LM_MASTER | HCI_LM_ACCEPT);
+ break;
+
+ case HCISETPTYPE:
+ if (hdev->pkt_type == (__u16) dr.dev_opt)
+ break;
+
+ hdev->pkt_type = (__u16) dr.dev_opt;
+ mgmt_phy_configuration_changed(hdev, NULL);
break;
case HCISETACLMTU:
- hdev->acl_mtu = *((__u16 *)&dr.dev_opt + 1);
- hdev->acl_pkts = *((__u16 *)&dr.dev_opt + 0);
+ hdev->acl_mtu = *((__u16 *) &dr.dev_opt + 1);
+ hdev->acl_pkts = *((__u16 *) &dr.dev_opt + 0);
break;
case HCISETSCOMTU:
- hdev->sco_mtu = *((__u16 *)&dr.dev_opt + 1);
- hdev->sco_pkts = *((__u16 *)&dr.dev_opt + 0);
+ hdev->sco_mtu = *((__u16 *) &dr.dev_opt + 1);
+ hdev->sco_pkts = *((__u16 *) &dr.dev_opt + 0);
break;
default:
err = -EINVAL;
break;
}
+
+done:
hci_dev_put(hdev);
return err;
}
int hci_get_dev_list(void __user *arg)
{
+ struct hci_dev *hdev;
struct hci_dev_list_req *dl;
struct hci_dev_req *dr;
- struct list_head *p;
- int n = 0, size, err;
+ int n = 0, err;
__u16 dev_num;
if (get_user(dev_num, (__u16 __user *) arg))
@@ -733,28 +796,34 @@ int hci_get_dev_list(void __user *arg)
if (!dev_num || dev_num > (PAGE_SIZE * 2) / sizeof(*dr))
return -EINVAL;
- size = sizeof(*dl) + dev_num * sizeof(*dr);
-
- if (!(dl = kmalloc(size, GFP_KERNEL)))
+ dl = kzalloc(struct_size(dl, dev_req, dev_num), GFP_KERNEL);
+ if (!dl)
return -ENOMEM;
+ dl->dev_num = dev_num;
dr = dl->dev_req;
- read_lock_bh(&hci_dev_list_lock);
- list_for_each(p, &hci_dev_list) {
- struct hci_dev *hdev;
- hdev = list_entry(p, struct hci_dev, list);
- (dr + n)->dev_id = hdev->id;
- (dr + n)->dev_opt = hdev->flags;
+ read_lock(&hci_dev_list_lock);
+ list_for_each_entry(hdev, &hci_dev_list, list) {
+ unsigned long flags = hdev->flags;
+
+ /* When the auto-off is configured it means the transport
+ * is running, but in that case still indicate that the
+ * device is actually down.
+ */
+ if (hci_dev_test_flag(hdev, HCI_AUTO_OFF))
+ flags &= ~BIT(HCI_UP);
+
+ dr[n].dev_id = hdev->id;
+ dr[n].dev_opt = flags;
+
if (++n >= dev_num)
break;
}
- read_unlock_bh(&hci_dev_list_lock);
+ read_unlock(&hci_dev_list_lock);
dl->dev_num = n;
- size = sizeof(*dl) + n * sizeof(*dr);
-
- err = copy_to_user(arg, dl, size);
+ err = copy_to_user(arg, dl, struct_size(dl, dev_req, n));
kfree(dl);
return err ? -EFAULT : 0;
@@ -764,23 +833,41 @@ int hci_get_dev_info(void __user *arg)
{
struct hci_dev *hdev;
struct hci_dev_info di;
+ unsigned long flags;
int err = 0;
if (copy_from_user(&di, arg, sizeof(di)))
return -EFAULT;
- if (!(hdev = hci_dev_get(di.dev_id)))
+ hdev = hci_dev_get(di.dev_id);
+ if (!hdev)
return -ENODEV;
- strcpy(di.name, hdev->name);
+ /* When the auto-off is configured it means the transport
+ * is running, but in that case still indicate that the
+ * device is actually down.
+ */
+ if (hci_dev_test_flag(hdev, HCI_AUTO_OFF))
+ flags = hdev->flags & ~BIT(HCI_UP);
+ else
+ flags = hdev->flags;
+
+ strscpy(di.name, hdev->name, sizeof(di.name));
di.bdaddr = hdev->bdaddr;
- di.type = hdev->type;
- di.flags = hdev->flags;
+ di.type = (hdev->bus & 0x0f);
+ di.flags = flags;
di.pkt_type = hdev->pkt_type;
- di.acl_mtu = hdev->acl_mtu;
- di.acl_pkts = hdev->acl_pkts;
- di.sco_mtu = hdev->sco_mtu;
- di.sco_pkts = hdev->sco_pkts;
+ if (lmp_bredr_capable(hdev)) {
+ di.acl_mtu = hdev->acl_mtu;
+ di.acl_pkts = hdev->acl_pkts;
+ di.sco_mtu = hdev->sco_mtu;
+ di.sco_pkts = hdev->sco_pkts;
+ } else {
+ di.acl_mtu = hdev->le_mtu;
+ di.acl_pkts = hdev->le_pkts;
+ di.sco_mtu = 0;
+ di.sco_pkts = 0;
+ }
di.link_policy = hdev->link_policy;
di.link_mode = hdev->link_mode;
@@ -797,186 +884,2140 @@ int hci_get_dev_info(void __user *arg)
/* ---- Interface to HCI drivers ---- */
-/* Alloc HCI device */
-struct hci_dev *hci_alloc_dev(void)
+static int hci_dev_do_poweroff(struct hci_dev *hdev)
{
- struct hci_dev *hdev;
+ int err;
- hdev = kzalloc(sizeof(struct hci_dev), GFP_KERNEL);
- if (!hdev)
+ BT_DBG("%s %p", hdev->name, hdev);
+
+ hci_req_sync_lock(hdev);
+
+ err = hci_set_powered_sync(hdev, false);
+
+ hci_req_sync_unlock(hdev);
+
+ return err;
+}
+
+static int hci_rfkill_set_block(void *data, bool blocked)
+{
+ struct hci_dev *hdev = data;
+ int err;
+
+ BT_DBG("%p name %s blocked %d", hdev, hdev->name, blocked);
+
+ if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL))
+ return -EBUSY;
+
+ if (blocked == hci_dev_test_flag(hdev, HCI_RFKILLED))
+ return 0;
+
+ if (blocked) {
+ hci_dev_set_flag(hdev, HCI_RFKILLED);
+
+ if (!hci_dev_test_flag(hdev, HCI_SETUP) &&
+ !hci_dev_test_flag(hdev, HCI_CONFIG)) {
+ err = hci_dev_do_poweroff(hdev);
+ if (err) {
+ bt_dev_err(hdev, "Error when powering off device on rfkill (%d)",
+ err);
+
+ /* Make sure the device is still closed even if
+ * anything during power off sequence (eg.
+ * disconnecting devices) failed.
+ */
+ hci_dev_do_close(hdev);
+ }
+ }
+ } else {
+ hci_dev_clear_flag(hdev, HCI_RFKILLED);
+ }
+
+ return 0;
+}
+
+static const struct rfkill_ops hci_rfkill_ops = {
+ .set_block = hci_rfkill_set_block,
+};
+
+static void hci_power_on(struct work_struct *work)
+{
+ struct hci_dev *hdev = container_of(work, struct hci_dev, power_on);
+ int err;
+
+ BT_DBG("%s", hdev->name);
+
+ if (test_bit(HCI_UP, &hdev->flags) &&
+ hci_dev_test_flag(hdev, HCI_MGMT) &&
+ hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF)) {
+ cancel_delayed_work(&hdev->power_off);
+ err = hci_powered_update_sync(hdev);
+ mgmt_power_on(hdev, err);
+ return;
+ }
+
+ err = hci_dev_do_open(hdev);
+ if (err < 0) {
+ hci_dev_lock(hdev);
+ mgmt_set_powered_failed(hdev, err);
+ hci_dev_unlock(hdev);
+ return;
+ }
+
+ /* During the HCI setup phase, a few error conditions are
+ * ignored and they need to be checked now. If they are still
+ * valid, it is important to turn the device back off.
+ */
+ if (hci_dev_test_flag(hdev, HCI_RFKILLED) ||
+ hci_dev_test_flag(hdev, HCI_UNCONFIGURED) ||
+ (!bacmp(&hdev->bdaddr, BDADDR_ANY) &&
+ !bacmp(&hdev->static_addr, BDADDR_ANY))) {
+ hci_dev_clear_flag(hdev, HCI_AUTO_OFF);
+ hci_dev_do_close(hdev);
+ } else if (hci_dev_test_flag(hdev, HCI_AUTO_OFF)) {
+ queue_delayed_work(hdev->req_workqueue, &hdev->power_off,
+ HCI_AUTO_OFF_TIMEOUT);
+ }
+
+ if (hci_dev_test_and_clear_flag(hdev, HCI_SETUP)) {
+ /* For unconfigured devices, set the HCI_RAW flag
+ * so that userspace can easily identify them.
+ */
+ if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED))
+ set_bit(HCI_RAW, &hdev->flags);
+
+ /* For fully configured devices, this will send
+ * the Index Added event. For unconfigured devices,
+ * it will send Unconfigued Index Added event.
+ *
+ * Devices with HCI_QUIRK_RAW_DEVICE are ignored
+ * and no event will be send.
+ */
+ mgmt_index_added(hdev);
+ } else if (hci_dev_test_and_clear_flag(hdev, HCI_CONFIG)) {
+ /* When the controller is now configured, then it
+ * is important to clear the HCI_RAW flag.
+ */
+ if (!hci_dev_test_flag(hdev, HCI_UNCONFIGURED))
+ clear_bit(HCI_RAW, &hdev->flags);
+
+ /* Powering on the controller with HCI_CONFIG set only
+ * happens with the transition from unconfigured to
+ * configured. This will send the Index Added event.
+ */
+ mgmt_index_added(hdev);
+ }
+}
+
+static void hci_power_off(struct work_struct *work)
+{
+ struct hci_dev *hdev = container_of(work, struct hci_dev,
+ power_off.work);
+
+ BT_DBG("%s", hdev->name);
+
+ hci_dev_do_close(hdev);
+}
+
+static void hci_error_reset(struct work_struct *work)
+{
+ struct hci_dev *hdev = container_of(work, struct hci_dev, error_reset);
+
+ hci_dev_hold(hdev);
+ BT_DBG("%s", hdev->name);
+
+ if (hdev->hw_error)
+ hdev->hw_error(hdev, hdev->hw_error_code);
+ else
+ bt_dev_err(hdev, "hardware error 0x%2.2x", hdev->hw_error_code);
+
+ if (!hci_dev_do_close(hdev))
+ hci_dev_do_open(hdev);
+
+ hci_dev_put(hdev);
+}
+
+void hci_uuids_clear(struct hci_dev *hdev)
+{
+ struct bt_uuid *uuid, *tmp;
+
+ list_for_each_entry_safe(uuid, tmp, &hdev->uuids, list) {
+ list_del(&uuid->list);
+ kfree(uuid);
+ }
+}
+
+void hci_link_keys_clear(struct hci_dev *hdev)
+{
+ struct link_key *key, *tmp;
+
+ list_for_each_entry_safe(key, tmp, &hdev->link_keys, list) {
+ list_del_rcu(&key->list);
+ kfree_rcu(key, rcu);
+ }
+}
+
+void hci_smp_ltks_clear(struct hci_dev *hdev)
+{
+ struct smp_ltk *k, *tmp;
+
+ list_for_each_entry_safe(k, tmp, &hdev->long_term_keys, list) {
+ list_del_rcu(&k->list);
+ kfree_rcu(k, rcu);
+ }
+}
+
+void hci_smp_irks_clear(struct hci_dev *hdev)
+{
+ struct smp_irk *k, *tmp;
+
+ list_for_each_entry_safe(k, tmp, &hdev->identity_resolving_keys, list) {
+ list_del_rcu(&k->list);
+ kfree_rcu(k, rcu);
+ }
+}
+
+void hci_blocked_keys_clear(struct hci_dev *hdev)
+{
+ struct blocked_key *b, *tmp;
+
+ list_for_each_entry_safe(b, tmp, &hdev->blocked_keys, list) {
+ list_del_rcu(&b->list);
+ kfree_rcu(b, rcu);
+ }
+}
+
+bool hci_is_blocked_key(struct hci_dev *hdev, u8 type, u8 val[16])
+{
+ bool blocked = false;
+ struct blocked_key *b;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(b, &hdev->blocked_keys, list) {
+ if (b->type == type && !memcmp(b->val, val, sizeof(b->val))) {
+ blocked = true;
+ break;
+ }
+ }
+
+ rcu_read_unlock();
+ return blocked;
+}
+
+struct link_key *hci_find_link_key(struct hci_dev *hdev, bdaddr_t *bdaddr)
+{
+ struct link_key *k;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(k, &hdev->link_keys, list) {
+ if (bacmp(bdaddr, &k->bdaddr) == 0) {
+ rcu_read_unlock();
+
+ if (hci_is_blocked_key(hdev,
+ HCI_BLOCKED_KEY_TYPE_LINKKEY,
+ k->val)) {
+ bt_dev_warn_ratelimited(hdev,
+ "Link key blocked for %pMR",
+ &k->bdaddr);
+ return NULL;
+ }
+
+ return k;
+ }
+ }
+ rcu_read_unlock();
+
+ return NULL;
+}
+
+static bool hci_persistent_key(struct hci_dev *hdev, struct hci_conn *conn,
+ u8 key_type, u8 old_key_type)
+{
+ /* Legacy key */
+ if (key_type < 0x03)
+ return true;
+
+ /* Debug keys are insecure so don't store them persistently */
+ if (key_type == HCI_LK_DEBUG_COMBINATION)
+ return false;
+
+ /* Changed combination key and there's no previous one */
+ if (key_type == HCI_LK_CHANGED_COMBINATION && old_key_type == 0xff)
+ return false;
+
+ /* Security mode 3 case */
+ if (!conn)
+ return true;
+
+ /* BR/EDR key derived using SC from an LE link */
+ if (conn->type == LE_LINK)
+ return true;
+
+ /* Neither local nor remote side had no-bonding as requirement */
+ if (conn->auth_type > 0x01 && conn->remote_auth > 0x01)
+ return true;
+
+ /* Local side had dedicated bonding as requirement */
+ if (conn->auth_type == 0x02 || conn->auth_type == 0x03)
+ return true;
+
+ /* Remote side had dedicated bonding as requirement */
+ if (conn->remote_auth == 0x02 || conn->remote_auth == 0x03)
+ return true;
+
+ /* If none of the above criteria match, then don't store the key
+ * persistently */
+ return false;
+}
+
+static u8 ltk_role(u8 type)
+{
+ if (type == SMP_LTK)
+ return HCI_ROLE_MASTER;
+
+ return HCI_ROLE_SLAVE;
+}
+
+struct smp_ltk *hci_find_ltk(struct hci_dev *hdev, bdaddr_t *bdaddr,
+ u8 addr_type, u8 role)
+{
+ struct smp_ltk *k;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(k, &hdev->long_term_keys, list) {
+ if (addr_type != k->bdaddr_type || bacmp(bdaddr, &k->bdaddr))
+ continue;
+
+ if (smp_ltk_is_sc(k) || ltk_role(k->type) == role) {
+ rcu_read_unlock();
+
+ if (hci_is_blocked_key(hdev, HCI_BLOCKED_KEY_TYPE_LTK,
+ k->val)) {
+ bt_dev_warn_ratelimited(hdev,
+ "LTK blocked for %pMR",
+ &k->bdaddr);
+ return NULL;
+ }
+
+ return k;
+ }
+ }
+ rcu_read_unlock();
+
+ return NULL;
+}
+
+struct smp_irk *hci_find_irk_by_rpa(struct hci_dev *hdev, bdaddr_t *rpa)
+{
+ struct smp_irk *irk_to_return = NULL;
+ struct smp_irk *irk;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(irk, &hdev->identity_resolving_keys, list) {
+ if (!bacmp(&irk->rpa, rpa)) {
+ irk_to_return = irk;
+ goto done;
+ }
+ }
+
+ list_for_each_entry_rcu(irk, &hdev->identity_resolving_keys, list) {
+ if (smp_irk_matches(hdev, irk->val, rpa)) {
+ bacpy(&irk->rpa, rpa);
+ irk_to_return = irk;
+ goto done;
+ }
+ }
+
+done:
+ if (irk_to_return && hci_is_blocked_key(hdev, HCI_BLOCKED_KEY_TYPE_IRK,
+ irk_to_return->val)) {
+ bt_dev_warn_ratelimited(hdev, "Identity key blocked for %pMR",
+ &irk_to_return->bdaddr);
+ irk_to_return = NULL;
+ }
+
+ rcu_read_unlock();
+
+ return irk_to_return;
+}
+
+struct smp_irk *hci_find_irk_by_addr(struct hci_dev *hdev, bdaddr_t *bdaddr,
+ u8 addr_type)
+{
+ struct smp_irk *irk_to_return = NULL;
+ struct smp_irk *irk;
+
+ /* Identity Address must be public or static random */
+ if (addr_type == ADDR_LE_DEV_RANDOM && (bdaddr->b[5] & 0xc0) != 0xc0)
return NULL;
- skb_queue_head_init(&hdev->driver_init);
+ rcu_read_lock();
+ list_for_each_entry_rcu(irk, &hdev->identity_resolving_keys, list) {
+ if (addr_type == irk->addr_type &&
+ bacmp(bdaddr, &irk->bdaddr) == 0) {
+ irk_to_return = irk;
+ break;
+ }
+ }
+
+ if (irk_to_return && hci_is_blocked_key(hdev, HCI_BLOCKED_KEY_TYPE_IRK,
+ irk_to_return->val)) {
+ bt_dev_warn_ratelimited(hdev, "Identity key blocked for %pMR",
+ &irk_to_return->bdaddr);
+ irk_to_return = NULL;
+ }
- return hdev;
+ rcu_read_unlock();
+
+ return irk_to_return;
}
-EXPORT_SYMBOL(hci_alloc_dev);
-/* Free HCI device */
-void hci_free_dev(struct hci_dev *hdev)
+struct link_key *hci_add_link_key(struct hci_dev *hdev, struct hci_conn *conn,
+ bdaddr_t *bdaddr, u8 *val, u8 type,
+ u8 pin_len, bool *persistent)
{
- skb_queue_purge(&hdev->driver_init);
+ struct link_key *key, *old_key;
+ u8 old_key_type;
- /* will free via device release */
- put_device(&hdev->dev);
+ old_key = hci_find_link_key(hdev, bdaddr);
+ if (old_key) {
+ old_key_type = old_key->type;
+ key = old_key;
+ } else {
+ old_key_type = conn ? conn->key_type : 0xff;
+ key = kzalloc(sizeof(*key), GFP_KERNEL);
+ if (!key)
+ return NULL;
+ list_add_rcu(&key->list, &hdev->link_keys);
+ }
+
+ BT_DBG("%s key for %pMR type %u", hdev->name, bdaddr, type);
+
+ /* Some buggy controller combinations generate a changed
+ * combination key for legacy pairing even when there's no
+ * previous key */
+ if (type == HCI_LK_CHANGED_COMBINATION &&
+ (!conn || conn->remote_auth == 0xff) && old_key_type == 0xff) {
+ type = HCI_LK_COMBINATION;
+ if (conn)
+ conn->key_type = type;
+ }
+
+ bacpy(&key->bdaddr, bdaddr);
+ memcpy(key->val, val, HCI_LINK_KEY_SIZE);
+ key->pin_len = pin_len;
+
+ if (type == HCI_LK_CHANGED_COMBINATION)
+ key->type = old_key_type;
+ else
+ key->type = type;
+
+ if (persistent)
+ *persistent = hci_persistent_key(hdev, conn, type,
+ old_key_type);
+
+ return key;
}
-EXPORT_SYMBOL(hci_free_dev);
-/* Register HCI device */
-int hci_register_dev(struct hci_dev *hdev)
+struct smp_ltk *hci_add_ltk(struct hci_dev *hdev, bdaddr_t *bdaddr,
+ u8 addr_type, u8 type, u8 authenticated,
+ u8 tk[16], u8 enc_size, __le16 ediv, __le64 rand)
+{
+ struct smp_ltk *key, *old_key;
+ u8 role = ltk_role(type);
+
+ old_key = hci_find_ltk(hdev, bdaddr, addr_type, role);
+ if (old_key)
+ key = old_key;
+ else {
+ key = kzalloc(sizeof(*key), GFP_KERNEL);
+ if (!key)
+ return NULL;
+ list_add_rcu(&key->list, &hdev->long_term_keys);
+ }
+
+ bacpy(&key->bdaddr, bdaddr);
+ key->bdaddr_type = addr_type;
+ memcpy(key->val, tk, sizeof(key->val));
+ key->authenticated = authenticated;
+ key->ediv = ediv;
+ key->rand = rand;
+ key->enc_size = enc_size;
+ key->type = type;
+
+ return key;
+}
+
+struct smp_irk *hci_add_irk(struct hci_dev *hdev, bdaddr_t *bdaddr,
+ u8 addr_type, u8 val[16], bdaddr_t *rpa)
+{
+ struct smp_irk *irk;
+
+ irk = hci_find_irk_by_addr(hdev, bdaddr, addr_type);
+ if (!irk) {
+ irk = kzalloc(sizeof(*irk), GFP_KERNEL);
+ if (!irk)
+ return NULL;
+
+ bacpy(&irk->bdaddr, bdaddr);
+ irk->addr_type = addr_type;
+
+ list_add_rcu(&irk->list, &hdev->identity_resolving_keys);
+ }
+
+ memcpy(irk->val, val, 16);
+ bacpy(&irk->rpa, rpa);
+
+ return irk;
+}
+
+int hci_remove_link_key(struct hci_dev *hdev, bdaddr_t *bdaddr)
+{
+ struct link_key *key;
+
+ key = hci_find_link_key(hdev, bdaddr);
+ if (!key)
+ return -ENOENT;
+
+ BT_DBG("%s removing %pMR", hdev->name, bdaddr);
+
+ list_del_rcu(&key->list);
+ kfree_rcu(key, rcu);
+
+ return 0;
+}
+
+int hci_remove_ltk(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 bdaddr_type)
+{
+ struct smp_ltk *k, *tmp;
+ int removed = 0;
+
+ list_for_each_entry_safe(k, tmp, &hdev->long_term_keys, list) {
+ if (bacmp(bdaddr, &k->bdaddr) || k->bdaddr_type != bdaddr_type)
+ continue;
+
+ BT_DBG("%s removing %pMR", hdev->name, bdaddr);
+
+ list_del_rcu(&k->list);
+ kfree_rcu(k, rcu);
+ removed++;
+ }
+
+ return removed ? 0 : -ENOENT;
+}
+
+void hci_remove_irk(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 addr_type)
+{
+ struct smp_irk *k, *tmp;
+
+ list_for_each_entry_safe(k, tmp, &hdev->identity_resolving_keys, list) {
+ if (bacmp(bdaddr, &k->bdaddr) || k->addr_type != addr_type)
+ continue;
+
+ BT_DBG("%s removing %pMR", hdev->name, bdaddr);
+
+ list_del_rcu(&k->list);
+ kfree_rcu(k, rcu);
+ }
+}
+
+bool hci_bdaddr_is_paired(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 type)
+{
+ struct smp_ltk *k;
+ struct smp_irk *irk;
+ u8 addr_type;
+
+ if (type == BDADDR_BREDR) {
+ if (hci_find_link_key(hdev, bdaddr))
+ return true;
+ return false;
+ }
+
+ /* Convert to HCI addr type which struct smp_ltk uses */
+ if (type == BDADDR_LE_PUBLIC)
+ addr_type = ADDR_LE_DEV_PUBLIC;
+ else
+ addr_type = ADDR_LE_DEV_RANDOM;
+
+ irk = hci_get_irk(hdev, bdaddr, addr_type);
+ if (irk) {
+ bdaddr = &irk->bdaddr;
+ addr_type = irk->addr_type;
+ }
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(k, &hdev->long_term_keys, list) {
+ if (k->bdaddr_type == addr_type && !bacmp(bdaddr, &k->bdaddr)) {
+ rcu_read_unlock();
+ return true;
+ }
+ }
+ rcu_read_unlock();
+
+ return false;
+}
+
+/* HCI command timer function */
+static void hci_cmd_timeout(struct work_struct *work)
+{
+ struct hci_dev *hdev = container_of(work, struct hci_dev,
+ cmd_timer.work);
+
+ if (hdev->req_skb) {
+ u16 opcode = hci_skb_opcode(hdev->req_skb);
+
+ bt_dev_err(hdev, "command 0x%4.4x tx timeout", opcode);
+
+ hci_cmd_sync_cancel_sync(hdev, ETIMEDOUT);
+ } else {
+ bt_dev_err(hdev, "command tx timeout");
+ }
+
+ if (hdev->reset)
+ hdev->reset(hdev);
+
+ atomic_set(&hdev->cmd_cnt, 1);
+ queue_work(hdev->workqueue, &hdev->cmd_work);
+}
+
+/* HCI ncmd timer function */
+static void hci_ncmd_timeout(struct work_struct *work)
+{
+ struct hci_dev *hdev = container_of(work, struct hci_dev,
+ ncmd_timer.work);
+
+ bt_dev_err(hdev, "Controller not accepting commands anymore: ncmd = 0");
+
+ /* During HCI_INIT phase no events can be injected if the ncmd timer
+ * triggers since the procedure has its own timeout handling.
+ */
+ if (test_bit(HCI_INIT, &hdev->flags))
+ return;
+
+ /* This is an irrecoverable state, inject hardware error event */
+ hci_reset_dev(hdev);
+}
+
+struct oob_data *hci_find_remote_oob_data(struct hci_dev *hdev,
+ bdaddr_t *bdaddr, u8 bdaddr_type)
+{
+ struct oob_data *data;
+
+ list_for_each_entry(data, &hdev->remote_oob_data, list) {
+ if (bacmp(bdaddr, &data->bdaddr) != 0)
+ continue;
+ if (data->bdaddr_type != bdaddr_type)
+ continue;
+ return data;
+ }
+
+ return NULL;
+}
+
+int hci_remove_remote_oob_data(struct hci_dev *hdev, bdaddr_t *bdaddr,
+ u8 bdaddr_type)
+{
+ struct oob_data *data;
+
+ data = hci_find_remote_oob_data(hdev, bdaddr, bdaddr_type);
+ if (!data)
+ return -ENOENT;
+
+ BT_DBG("%s removing %pMR (%u)", hdev->name, bdaddr, bdaddr_type);
+
+ list_del(&data->list);
+ kfree(data);
+
+ return 0;
+}
+
+void hci_remote_oob_data_clear(struct hci_dev *hdev)
+{
+ struct oob_data *data, *n;
+
+ list_for_each_entry_safe(data, n, &hdev->remote_oob_data, list) {
+ list_del(&data->list);
+ kfree(data);
+ }
+}
+
+int hci_add_remote_oob_data(struct hci_dev *hdev, bdaddr_t *bdaddr,
+ u8 bdaddr_type, u8 *hash192, u8 *rand192,
+ u8 *hash256, u8 *rand256)
+{
+ struct oob_data *data;
+
+ data = hci_find_remote_oob_data(hdev, bdaddr, bdaddr_type);
+ if (!data) {
+ data = kmalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+
+ bacpy(&data->bdaddr, bdaddr);
+ data->bdaddr_type = bdaddr_type;
+ list_add(&data->list, &hdev->remote_oob_data);
+ }
+
+ if (hash192 && rand192) {
+ memcpy(data->hash192, hash192, sizeof(data->hash192));
+ memcpy(data->rand192, rand192, sizeof(data->rand192));
+ if (hash256 && rand256)
+ data->present = 0x03;
+ } else {
+ memset(data->hash192, 0, sizeof(data->hash192));
+ memset(data->rand192, 0, sizeof(data->rand192));
+ if (hash256 && rand256)
+ data->present = 0x02;
+ else
+ data->present = 0x00;
+ }
+
+ if (hash256 && rand256) {
+ memcpy(data->hash256, hash256, sizeof(data->hash256));
+ memcpy(data->rand256, rand256, sizeof(data->rand256));
+ } else {
+ memset(data->hash256, 0, sizeof(data->hash256));
+ memset(data->rand256, 0, sizeof(data->rand256));
+ if (hash192 && rand192)
+ data->present = 0x01;
+ }
+
+ BT_DBG("%s for %pMR", hdev->name, bdaddr);
+
+ return 0;
+}
+
+/* This function requires the caller holds hdev->lock */
+struct adv_info *hci_find_adv_instance(struct hci_dev *hdev, u8 instance)
+{
+ struct adv_info *adv_instance;
+
+ list_for_each_entry(adv_instance, &hdev->adv_instances, list) {
+ if (adv_instance->instance == instance)
+ return adv_instance;
+ }
+
+ return NULL;
+}
+
+/* This function requires the caller holds hdev->lock */
+struct adv_info *hci_find_adv_sid(struct hci_dev *hdev, u8 sid)
+{
+ struct adv_info *adv;
+
+ list_for_each_entry(adv, &hdev->adv_instances, list) {
+ if (adv->sid == sid)
+ return adv;
+ }
+
+ return NULL;
+}
+
+/* This function requires the caller holds hdev->lock */
+struct adv_info *hci_get_next_instance(struct hci_dev *hdev, u8 instance)
+{
+ struct adv_info *cur_instance;
+
+ cur_instance = hci_find_adv_instance(hdev, instance);
+ if (!cur_instance)
+ return NULL;
+
+ if (cur_instance == list_last_entry(&hdev->adv_instances,
+ struct adv_info, list))
+ return list_first_entry(&hdev->adv_instances,
+ struct adv_info, list);
+ else
+ return list_next_entry(cur_instance, list);
+}
+
+/* This function requires the caller holds hdev->lock */
+int hci_remove_adv_instance(struct hci_dev *hdev, u8 instance)
+{
+ struct adv_info *adv_instance;
+
+ adv_instance = hci_find_adv_instance(hdev, instance);
+ if (!adv_instance)
+ return -ENOENT;
+
+ BT_DBG("%s removing %dMR", hdev->name, instance);
+
+ if (hdev->cur_adv_instance == instance) {
+ if (hdev->adv_instance_timeout) {
+ cancel_delayed_work(&hdev->adv_instance_expire);
+ hdev->adv_instance_timeout = 0;
+ }
+ hdev->cur_adv_instance = 0x00;
+ }
+
+ cancel_delayed_work_sync(&adv_instance->rpa_expired_cb);
+
+ list_del(&adv_instance->list);
+ kfree(adv_instance);
+
+ hdev->adv_instance_cnt--;
+
+ return 0;
+}
+
+void hci_adv_instances_set_rpa_expired(struct hci_dev *hdev, bool rpa_expired)
+{
+ struct adv_info *adv_instance, *n;
+
+ list_for_each_entry_safe(adv_instance, n, &hdev->adv_instances, list)
+ adv_instance->rpa_expired = rpa_expired;
+}
+
+/* This function requires the caller holds hdev->lock */
+void hci_adv_instances_clear(struct hci_dev *hdev)
+{
+ struct adv_info *adv_instance, *n;
+
+ if (hdev->adv_instance_timeout) {
+ disable_delayed_work(&hdev->adv_instance_expire);
+ hdev->adv_instance_timeout = 0;
+ }
+
+ list_for_each_entry_safe(adv_instance, n, &hdev->adv_instances, list) {
+ disable_delayed_work_sync(&adv_instance->rpa_expired_cb);
+ list_del(&adv_instance->list);
+ kfree(adv_instance);
+ }
+
+ hdev->adv_instance_cnt = 0;
+ hdev->cur_adv_instance = 0x00;
+}
+
+static void adv_instance_rpa_expired(struct work_struct *work)
+{
+ struct adv_info *adv_instance = container_of(work, struct adv_info,
+ rpa_expired_cb.work);
+
+ BT_DBG("");
+
+ adv_instance->rpa_expired = true;
+}
+
+/* This function requires the caller holds hdev->lock */
+struct adv_info *hci_add_adv_instance(struct hci_dev *hdev, u8 instance,
+ u32 flags, u16 adv_data_len, u8 *adv_data,
+ u16 scan_rsp_len, u8 *scan_rsp_data,
+ u16 timeout, u16 duration, s8 tx_power,
+ u32 min_interval, u32 max_interval,
+ u8 mesh_handle)
+{
+ struct adv_info *adv;
+
+ adv = hci_find_adv_instance(hdev, instance);
+ if (adv) {
+ memset(adv->adv_data, 0, sizeof(adv->adv_data));
+ memset(adv->scan_rsp_data, 0, sizeof(adv->scan_rsp_data));
+ memset(adv->per_adv_data, 0, sizeof(adv->per_adv_data));
+ } else {
+ if (hdev->adv_instance_cnt >= hdev->le_num_of_adv_sets ||
+ instance < 1 || instance > hdev->le_num_of_adv_sets + 1)
+ return ERR_PTR(-EOVERFLOW);
+
+ adv = kzalloc(sizeof(*adv), GFP_KERNEL);
+ if (!adv)
+ return ERR_PTR(-ENOMEM);
+
+ adv->pending = true;
+ adv->instance = instance;
+
+ /* If controller support only one set and the instance is set to
+ * 1 then there is no option other than using handle 0x00.
+ */
+ if (hdev->le_num_of_adv_sets == 1 && instance == 1)
+ adv->handle = 0x00;
+ else
+ adv->handle = instance;
+
+ list_add(&adv->list, &hdev->adv_instances);
+ hdev->adv_instance_cnt++;
+ }
+
+ adv->flags = flags;
+ adv->min_interval = min_interval;
+ adv->max_interval = max_interval;
+ adv->tx_power = tx_power;
+ /* Defining a mesh_handle changes the timing units to ms,
+ * rather than seconds, and ties the instance to the requested
+ * mesh_tx queue.
+ */
+ adv->mesh = mesh_handle;
+
+ hci_set_adv_instance_data(hdev, instance, adv_data_len, adv_data,
+ scan_rsp_len, scan_rsp_data);
+
+ adv->timeout = timeout;
+ adv->remaining_time = timeout;
+
+ if (duration == 0)
+ adv->duration = hdev->def_multi_adv_rotation_duration;
+ else
+ adv->duration = duration;
+
+ INIT_DELAYED_WORK(&adv->rpa_expired_cb, adv_instance_rpa_expired);
+
+ BT_DBG("%s for %dMR", hdev->name, instance);
+
+ return adv;
+}
+
+/* This function requires the caller holds hdev->lock */
+struct adv_info *hci_add_per_instance(struct hci_dev *hdev, u8 instance, u8 sid,
+ u32 flags, u8 data_len, u8 *data,
+ u32 min_interval, u32 max_interval)
+{
+ struct adv_info *adv;
+
+ adv = hci_add_adv_instance(hdev, instance, flags, 0, NULL, 0, NULL,
+ 0, 0, HCI_ADV_TX_POWER_NO_PREFERENCE,
+ min_interval, max_interval, 0);
+ if (IS_ERR(adv))
+ return adv;
+
+ adv->sid = sid;
+ adv->periodic = true;
+ adv->per_adv_data_len = data_len;
+
+ if (data)
+ memcpy(adv->per_adv_data, data, data_len);
+
+ return adv;
+}
+
+/* This function requires the caller holds hdev->lock */
+int hci_set_adv_instance_data(struct hci_dev *hdev, u8 instance,
+ u16 adv_data_len, u8 *adv_data,
+ u16 scan_rsp_len, u8 *scan_rsp_data)
+{
+ struct adv_info *adv;
+
+ adv = hci_find_adv_instance(hdev, instance);
+
+ /* If advertisement doesn't exist, we can't modify its data */
+ if (!adv)
+ return -ENOENT;
+
+ if (adv_data_len && ADV_DATA_CMP(adv, adv_data, adv_data_len)) {
+ memset(adv->adv_data, 0, sizeof(adv->adv_data));
+ memcpy(adv->adv_data, adv_data, adv_data_len);
+ adv->adv_data_len = adv_data_len;
+ adv->adv_data_changed = true;
+ }
+
+ if (scan_rsp_len && SCAN_RSP_CMP(adv, scan_rsp_data, scan_rsp_len)) {
+ memset(adv->scan_rsp_data, 0, sizeof(adv->scan_rsp_data));
+ memcpy(adv->scan_rsp_data, scan_rsp_data, scan_rsp_len);
+ adv->scan_rsp_len = scan_rsp_len;
+ adv->scan_rsp_changed = true;
+ }
+
+ /* Mark as changed if there are flags which would affect it */
+ if (((adv->flags & MGMT_ADV_FLAG_APPEARANCE) && hdev->appearance) ||
+ adv->flags & MGMT_ADV_FLAG_LOCAL_NAME)
+ adv->scan_rsp_changed = true;
+
+ return 0;
+}
+
+/* This function requires the caller holds hdev->lock */
+u32 hci_adv_instance_flags(struct hci_dev *hdev, u8 instance)
+{
+ u32 flags;
+ struct adv_info *adv;
+
+ if (instance == 0x00) {
+ /* Instance 0 always manages the "Tx Power" and "Flags"
+ * fields
+ */
+ flags = MGMT_ADV_FLAG_TX_POWER | MGMT_ADV_FLAG_MANAGED_FLAGS;
+
+ /* For instance 0, the HCI_ADVERTISING_CONNECTABLE setting
+ * corresponds to the "connectable" instance flag.
+ */
+ if (hci_dev_test_flag(hdev, HCI_ADVERTISING_CONNECTABLE))
+ flags |= MGMT_ADV_FLAG_CONNECTABLE;
+
+ if (hci_dev_test_flag(hdev, HCI_LIMITED_DISCOVERABLE))
+ flags |= MGMT_ADV_FLAG_LIMITED_DISCOV;
+ else if (hci_dev_test_flag(hdev, HCI_DISCOVERABLE))
+ flags |= MGMT_ADV_FLAG_DISCOV;
+
+ return flags;
+ }
+
+ adv = hci_find_adv_instance(hdev, instance);
+
+ /* Return 0 when we got an invalid instance identifier. */
+ if (!adv)
+ return 0;
+
+ return adv->flags;
+}
+
+bool hci_adv_instance_is_scannable(struct hci_dev *hdev, u8 instance)
+{
+ struct adv_info *adv;
+
+ /* Instance 0x00 always set local name */
+ if (instance == 0x00)
+ return true;
+
+ adv = hci_find_adv_instance(hdev, instance);
+ if (!adv)
+ return false;
+
+ if (adv->flags & MGMT_ADV_FLAG_APPEARANCE ||
+ adv->flags & MGMT_ADV_FLAG_LOCAL_NAME)
+ return true;
+
+ return adv->scan_rsp_len ? true : false;
+}
+
+/* This function requires the caller holds hdev->lock */
+void hci_adv_monitors_clear(struct hci_dev *hdev)
{
- struct list_head *head = &hci_dev_list, *p;
- int id = 0;
+ struct adv_monitor *monitor;
+ int handle;
+
+ idr_for_each_entry(&hdev->adv_monitors_idr, monitor, handle)
+ hci_free_adv_monitor(hdev, monitor);
- BT_DBG("%p name %s type %d owner %p", hdev, hdev->name, hdev->type, hdev->owner);
+ idr_destroy(&hdev->adv_monitors_idr);
+}
- if (!hdev->open || !hdev->close || !hdev->destruct)
+/* Frees the monitor structure and do some bookkeepings.
+ * This function requires the caller holds hdev->lock.
+ */
+void hci_free_adv_monitor(struct hci_dev *hdev, struct adv_monitor *monitor)
+{
+ struct adv_pattern *pattern;
+ struct adv_pattern *tmp;
+
+ if (!monitor)
+ return;
+
+ list_for_each_entry_safe(pattern, tmp, &monitor->patterns, list) {
+ list_del(&pattern->list);
+ kfree(pattern);
+ }
+
+ if (monitor->handle)
+ idr_remove(&hdev->adv_monitors_idr, monitor->handle);
+
+ if (monitor->state != ADV_MONITOR_STATE_NOT_REGISTERED)
+ hdev->adv_monitors_cnt--;
+
+ kfree(monitor);
+}
+
+/* Assigns handle to a monitor, and if offloading is supported and power is on,
+ * also attempts to forward the request to the controller.
+ * This function requires the caller holds hci_req_sync_lock.
+ */
+int hci_add_adv_monitor(struct hci_dev *hdev, struct adv_monitor *monitor)
+{
+ int min, max, handle;
+ int status = 0;
+
+ if (!monitor)
return -EINVAL;
- write_lock_bh(&hci_dev_list_lock);
+ hci_dev_lock(hdev);
+
+ min = HCI_MIN_ADV_MONITOR_HANDLE;
+ max = HCI_MIN_ADV_MONITOR_HANDLE + HCI_MAX_ADV_MONITOR_NUM_HANDLES;
+ handle = idr_alloc(&hdev->adv_monitors_idr, monitor, min, max,
+ GFP_KERNEL);
+
+ hci_dev_unlock(hdev);
+
+ if (handle < 0)
+ return handle;
- /* Find first available device id */
- list_for_each(p, &hci_dev_list) {
- if (list_entry(p, struct hci_dev, list)->id != id)
+ monitor->handle = handle;
+
+ if (!hdev_is_powered(hdev))
+ return status;
+
+ switch (hci_get_adv_monitor_offload_ext(hdev)) {
+ case HCI_ADV_MONITOR_EXT_NONE:
+ bt_dev_dbg(hdev, "add monitor %d status %d",
+ monitor->handle, status);
+ /* Message was not forwarded to controller - not an error */
+ break;
+
+ case HCI_ADV_MONITOR_EXT_MSFT:
+ status = msft_add_monitor_pattern(hdev, monitor);
+ bt_dev_dbg(hdev, "add monitor %d msft status %d",
+ handle, status);
+ break;
+ }
+
+ return status;
+}
+
+/* Attempts to tell the controller and free the monitor. If somehow the
+ * controller doesn't have a corresponding handle, remove anyway.
+ * This function requires the caller holds hci_req_sync_lock.
+ */
+static int hci_remove_adv_monitor(struct hci_dev *hdev,
+ struct adv_monitor *monitor)
+{
+ int status = 0;
+ int handle;
+
+ switch (hci_get_adv_monitor_offload_ext(hdev)) {
+ case HCI_ADV_MONITOR_EXT_NONE: /* also goes here when powered off */
+ bt_dev_dbg(hdev, "remove monitor %d status %d",
+ monitor->handle, status);
+ goto free_monitor;
+
+ case HCI_ADV_MONITOR_EXT_MSFT:
+ handle = monitor->handle;
+ status = msft_remove_monitor(hdev, monitor);
+ bt_dev_dbg(hdev, "remove monitor %d msft status %d",
+ handle, status);
+ break;
+ }
+
+ /* In case no matching handle registered, just free the monitor */
+ if (status == -ENOENT)
+ goto free_monitor;
+
+ return status;
+
+free_monitor:
+ if (status == -ENOENT)
+ bt_dev_warn(hdev, "Removing monitor with no matching handle %d",
+ monitor->handle);
+ hci_free_adv_monitor(hdev, monitor);
+
+ return status;
+}
+
+/* This function requires the caller holds hci_req_sync_lock */
+int hci_remove_single_adv_monitor(struct hci_dev *hdev, u16 handle)
+{
+ struct adv_monitor *monitor = idr_find(&hdev->adv_monitors_idr, handle);
+
+ if (!monitor)
+ return -EINVAL;
+
+ return hci_remove_adv_monitor(hdev, monitor);
+}
+
+/* This function requires the caller holds hci_req_sync_lock */
+int hci_remove_all_adv_monitor(struct hci_dev *hdev)
+{
+ struct adv_monitor *monitor;
+ int idr_next_id = 0;
+ int status = 0;
+
+ while (1) {
+ monitor = idr_get_next(&hdev->adv_monitors_idr, &idr_next_id);
+ if (!monitor)
break;
- head = p; id++;
+
+ status = hci_remove_adv_monitor(hdev, monitor);
+ if (status)
+ return status;
+
+ idr_next_id++;
}
-
- sprintf(hdev->name, "hci%d", id);
- hdev->id = id;
- list_add(&hdev->list, head);
- atomic_set(&hdev->refcnt, 1);
- spin_lock_init(&hdev->lock);
+ return status;
+}
+
+/* This function requires the caller holds hdev->lock */
+bool hci_is_adv_monitoring(struct hci_dev *hdev)
+{
+ return !idr_is_empty(&hdev->adv_monitors_idr);
+}
+
+int hci_get_adv_monitor_offload_ext(struct hci_dev *hdev)
+{
+ if (msft_monitor_supported(hdev))
+ return HCI_ADV_MONITOR_EXT_MSFT;
+
+ return HCI_ADV_MONITOR_EXT_NONE;
+}
+
+struct bdaddr_list *hci_bdaddr_list_lookup(struct list_head *bdaddr_list,
+ bdaddr_t *bdaddr, u8 type)
+{
+ struct bdaddr_list *b;
+
+ list_for_each_entry(b, bdaddr_list, list) {
+ if (!bacmp(&b->bdaddr, bdaddr) && b->bdaddr_type == type)
+ return b;
+ }
+
+ return NULL;
+}
+
+struct bdaddr_list_with_irk *hci_bdaddr_list_lookup_with_irk(
+ struct list_head *bdaddr_list, bdaddr_t *bdaddr,
+ u8 type)
+{
+ struct bdaddr_list_with_irk *b;
+
+ list_for_each_entry(b, bdaddr_list, list) {
+ if (!bacmp(&b->bdaddr, bdaddr) && b->bdaddr_type == type)
+ return b;
+ }
+
+ return NULL;
+}
+
+struct bdaddr_list_with_flags *
+hci_bdaddr_list_lookup_with_flags(struct list_head *bdaddr_list,
+ bdaddr_t *bdaddr, u8 type)
+{
+ struct bdaddr_list_with_flags *b;
+
+ list_for_each_entry(b, bdaddr_list, list) {
+ if (!bacmp(&b->bdaddr, bdaddr) && b->bdaddr_type == type)
+ return b;
+ }
+
+ return NULL;
+}
+
+void hci_bdaddr_list_clear(struct list_head *bdaddr_list)
+{
+ struct bdaddr_list *b, *n;
+
+ list_for_each_entry_safe(b, n, bdaddr_list, list) {
+ list_del(&b->list);
+ kfree(b);
+ }
+}
+
+int hci_bdaddr_list_add(struct list_head *list, bdaddr_t *bdaddr, u8 type)
+{
+ struct bdaddr_list *entry;
+
+ if (!bacmp(bdaddr, BDADDR_ANY))
+ return -EBADF;
+
+ if (hci_bdaddr_list_lookup(list, bdaddr, type))
+ return -EEXIST;
+
+ entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+ if (!entry)
+ return -ENOMEM;
+
+ bacpy(&entry->bdaddr, bdaddr);
+ entry->bdaddr_type = type;
+
+ list_add(&entry->list, list);
+
+ return 0;
+}
+
+int hci_bdaddr_list_add_with_irk(struct list_head *list, bdaddr_t *bdaddr,
+ u8 type, u8 *peer_irk, u8 *local_irk)
+{
+ struct bdaddr_list_with_irk *entry;
+
+ if (!bacmp(bdaddr, BDADDR_ANY))
+ return -EBADF;
+
+ if (hci_bdaddr_list_lookup(list, bdaddr, type))
+ return -EEXIST;
+
+ entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+ if (!entry)
+ return -ENOMEM;
+
+ bacpy(&entry->bdaddr, bdaddr);
+ entry->bdaddr_type = type;
+
+ if (peer_irk)
+ memcpy(entry->peer_irk, peer_irk, 16);
+
+ if (local_irk)
+ memcpy(entry->local_irk, local_irk, 16);
+
+ list_add(&entry->list, list);
+
+ return 0;
+}
+
+int hci_bdaddr_list_add_with_flags(struct list_head *list, bdaddr_t *bdaddr,
+ u8 type, u32 flags)
+{
+ struct bdaddr_list_with_flags *entry;
+
+ if (!bacmp(bdaddr, BDADDR_ANY))
+ return -EBADF;
+
+ if (hci_bdaddr_list_lookup(list, bdaddr, type))
+ return -EEXIST;
+
+ entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+ if (!entry)
+ return -ENOMEM;
+
+ bacpy(&entry->bdaddr, bdaddr);
+ entry->bdaddr_type = type;
+ entry->flags = flags;
+
+ list_add(&entry->list, list);
+
+ return 0;
+}
+
+int hci_bdaddr_list_del(struct list_head *list, bdaddr_t *bdaddr, u8 type)
+{
+ struct bdaddr_list *entry;
+
+ if (!bacmp(bdaddr, BDADDR_ANY)) {
+ hci_bdaddr_list_clear(list);
+ return 0;
+ }
+
+ entry = hci_bdaddr_list_lookup(list, bdaddr, type);
+ if (!entry)
+ return -ENOENT;
+
+ list_del(&entry->list);
+ kfree(entry);
+
+ return 0;
+}
+
+int hci_bdaddr_list_del_with_irk(struct list_head *list, bdaddr_t *bdaddr,
+ u8 type)
+{
+ struct bdaddr_list_with_irk *entry;
+
+ if (!bacmp(bdaddr, BDADDR_ANY)) {
+ hci_bdaddr_list_clear(list);
+ return 0;
+ }
+
+ entry = hci_bdaddr_list_lookup_with_irk(list, bdaddr, type);
+ if (!entry)
+ return -ENOENT;
+
+ list_del(&entry->list);
+ kfree(entry);
+
+ return 0;
+}
+
+/* This function requires the caller holds hdev->lock */
+struct hci_conn_params *hci_conn_params_lookup(struct hci_dev *hdev,
+ bdaddr_t *addr, u8 addr_type)
+{
+ struct hci_conn_params *params;
+
+ list_for_each_entry(params, &hdev->le_conn_params, list) {
+ if (bacmp(&params->addr, addr) == 0 &&
+ params->addr_type == addr_type) {
+ return params;
+ }
+ }
+
+ return NULL;
+}
+
+/* This function requires the caller holds hdev->lock or rcu_read_lock */
+struct hci_conn_params *hci_pend_le_action_lookup(struct list_head *list,
+ bdaddr_t *addr, u8 addr_type)
+{
+ struct hci_conn_params *param;
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(param, list, action) {
+ if (bacmp(&param->addr, addr) == 0 &&
+ param->addr_type == addr_type) {
+ rcu_read_unlock();
+ return param;
+ }
+ }
+
+ rcu_read_unlock();
+
+ return NULL;
+}
+
+/* This function requires the caller holds hdev->lock */
+void hci_pend_le_list_del_init(struct hci_conn_params *param)
+{
+ if (list_empty(&param->action))
+ return;
+
+ list_del_rcu(&param->action);
+ synchronize_rcu();
+ INIT_LIST_HEAD(&param->action);
+}
+
+/* This function requires the caller holds hdev->lock */
+void hci_pend_le_list_add(struct hci_conn_params *param,
+ struct list_head *list)
+{
+ list_add_rcu(&param->action, list);
+}
+
+/* This function requires the caller holds hdev->lock */
+struct hci_conn_params *hci_conn_params_add(struct hci_dev *hdev,
+ bdaddr_t *addr, u8 addr_type)
+{
+ struct hci_conn_params *params;
+
+ params = hci_conn_params_lookup(hdev, addr, addr_type);
+ if (params)
+ return params;
+
+ params = kzalloc(sizeof(*params), GFP_KERNEL);
+ if (!params) {
+ bt_dev_err(hdev, "out of memory");
+ return NULL;
+ }
+
+ bacpy(&params->addr, addr);
+ params->addr_type = addr_type;
+
+ list_add(&params->list, &hdev->le_conn_params);
+ INIT_LIST_HEAD(&params->action);
+
+ params->conn_min_interval = hdev->le_conn_min_interval;
+ params->conn_max_interval = hdev->le_conn_max_interval;
+ params->conn_latency = hdev->le_conn_latency;
+ params->supervision_timeout = hdev->le_supv_timeout;
+ params->auto_connect = HCI_AUTO_CONN_DISABLED;
+
+ BT_DBG("addr %pMR (type %u)", addr, addr_type);
+
+ return params;
+}
+
+void hci_conn_params_free(struct hci_conn_params *params)
+{
+ hci_pend_le_list_del_init(params);
+
+ if (params->conn) {
+ hci_conn_drop(params->conn);
+ hci_conn_put(params->conn);
+ }
+
+ list_del(&params->list);
+ kfree(params);
+}
+
+/* This function requires the caller holds hdev->lock */
+void hci_conn_params_del(struct hci_dev *hdev, bdaddr_t *addr, u8 addr_type)
+{
+ struct hci_conn_params *params;
+
+ params = hci_conn_params_lookup(hdev, addr, addr_type);
+ if (!params)
+ return;
+
+ hci_conn_params_free(params);
+
+ hci_update_passive_scan(hdev);
+
+ BT_DBG("addr %pMR (type %u)", addr, addr_type);
+}
+
+/* This function requires the caller holds hdev->lock */
+void hci_conn_params_clear_disabled(struct hci_dev *hdev)
+{
+ struct hci_conn_params *params, *tmp;
+
+ list_for_each_entry_safe(params, tmp, &hdev->le_conn_params, list) {
+ if (params->auto_connect != HCI_AUTO_CONN_DISABLED)
+ continue;
+
+ /* If trying to establish one time connection to disabled
+ * device, leave the params, but mark them as just once.
+ */
+ if (params->explicit_connect) {
+ params->auto_connect = HCI_AUTO_CONN_EXPLICIT;
+ continue;
+ }
+
+ hci_conn_params_free(params);
+ }
+
+ BT_DBG("All LE disabled connection parameters were removed");
+}
+
+/* This function requires the caller holds hdev->lock */
+static void hci_conn_params_clear_all(struct hci_dev *hdev)
+{
+ struct hci_conn_params *params, *tmp;
+
+ list_for_each_entry_safe(params, tmp, &hdev->le_conn_params, list)
+ hci_conn_params_free(params);
+
+ BT_DBG("All LE connection parameters were removed");
+}
+
+/* Copy the Identity Address of the controller.
+ *
+ * If the controller has a public BD_ADDR, then by default use that one.
+ * If this is a LE only controller without a public address, default to
+ * the static random address.
+ *
+ * For debugging purposes it is possible to force controllers with a
+ * public address to use the static random address instead.
+ *
+ * In case BR/EDR has been disabled on a dual-mode controller and
+ * userspace has configured a static address, then that address
+ * becomes the identity address instead of the public BR/EDR address.
+ */
+void hci_copy_identity_address(struct hci_dev *hdev, bdaddr_t *bdaddr,
+ u8 *bdaddr_type)
+{
+ if (hci_dev_test_flag(hdev, HCI_FORCE_STATIC_ADDR) ||
+ !bacmp(&hdev->bdaddr, BDADDR_ANY) ||
+ (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED) &&
+ bacmp(&hdev->static_addr, BDADDR_ANY))) {
+ bacpy(bdaddr, &hdev->static_addr);
+ *bdaddr_type = ADDR_LE_DEV_RANDOM;
+ } else {
+ bacpy(bdaddr, &hdev->bdaddr);
+ *bdaddr_type = ADDR_LE_DEV_PUBLIC;
+ }
+}
+
+static void hci_clear_wake_reason(struct hci_dev *hdev)
+{
+ hci_dev_lock(hdev);
+
+ hdev->wake_reason = 0;
+ bacpy(&hdev->wake_addr, BDADDR_ANY);
+ hdev->wake_addr_type = 0;
+
+ hci_dev_unlock(hdev);
+}
+
+static int hci_suspend_notifier(struct notifier_block *nb, unsigned long action,
+ void *data)
+{
+ struct hci_dev *hdev =
+ container_of(nb, struct hci_dev, suspend_notifier);
+ int ret = 0;
+
+ /* Userspace has full control of this device. Do nothing. */
+ if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL))
+ return NOTIFY_DONE;
+
+ /* To avoid a potential race with hci_unregister_dev. */
+ hci_dev_hold(hdev);
+
+ switch (action) {
+ case PM_HIBERNATION_PREPARE:
+ case PM_SUSPEND_PREPARE:
+ ret = hci_suspend_dev(hdev);
+ break;
+ case PM_POST_HIBERNATION:
+ case PM_POST_SUSPEND:
+ ret = hci_resume_dev(hdev);
+ break;
+ }
+
+ if (ret)
+ bt_dev_err(hdev, "Suspend notifier action (%lu) failed: %d",
+ action, ret);
+
+ hci_dev_put(hdev);
+ return NOTIFY_DONE;
+}
+
+/* Alloc HCI device */
+struct hci_dev *hci_alloc_dev_priv(int sizeof_priv)
+{
+ struct hci_dev *hdev;
+ unsigned int alloc_size;
+
+ alloc_size = sizeof(*hdev);
+ if (sizeof_priv) {
+ /* Fixme: May need ALIGN-ment? */
+ alloc_size += sizeof_priv;
+ }
+
+ hdev = kzalloc(alloc_size, GFP_KERNEL);
+ if (!hdev)
+ return NULL;
+
+ if (init_srcu_struct(&hdev->srcu)) {
+ kfree(hdev);
+ return NULL;
+ }
- hdev->flags = 0;
hdev->pkt_type = (HCI_DM1 | HCI_DH1 | HCI_HV1);
+ hdev->esco_type = (ESCO_HV1);
hdev->link_mode = (HCI_LM_ACCEPT);
+ hdev->num_iac = 0x01; /* One IAC support is mandatory */
+ hdev->io_capability = 0x03; /* No Input No Output */
+ hdev->manufacturer = 0xffff; /* Default to internal use */
+ hdev->inq_tx_power = HCI_TX_POWER_INVALID;
+ hdev->adv_tx_power = HCI_TX_POWER_INVALID;
+ hdev->adv_instance_cnt = 0;
+ hdev->cur_adv_instance = 0x00;
+ hdev->adv_instance_timeout = 0;
+
+ hdev->advmon_allowlist_duration = 300;
+ hdev->advmon_no_filter_duration = 500;
+ hdev->enable_advmon_interleave_scan = 0x00; /* Default to disable */
- hdev->idle_timeout = 0;
hdev->sniff_max_interval = 800;
hdev->sniff_min_interval = 80;
- tasklet_init(&hdev->cmd_task, hci_cmd_task,(unsigned long) hdev);
- tasklet_init(&hdev->rx_task, hci_rx_task, (unsigned long) hdev);
- tasklet_init(&hdev->tx_task, hci_tx_task, (unsigned long) hdev);
+ hdev->le_adv_channel_map = 0x07;
+ hdev->le_adv_min_interval = 0x0800;
+ hdev->le_adv_max_interval = 0x0800;
+ hdev->le_scan_interval = DISCOV_LE_SCAN_INT_FAST;
+ hdev->le_scan_window = DISCOV_LE_SCAN_WIN_FAST;
+ hdev->le_scan_int_suspend = DISCOV_LE_SCAN_INT_SLOW1;
+ hdev->le_scan_window_suspend = DISCOV_LE_SCAN_WIN_SLOW1;
+ hdev->le_scan_int_discovery = DISCOV_LE_SCAN_INT;
+ hdev->le_scan_window_discovery = DISCOV_LE_SCAN_WIN;
+ hdev->le_scan_int_adv_monitor = DISCOV_LE_SCAN_INT_FAST;
+ hdev->le_scan_window_adv_monitor = DISCOV_LE_SCAN_WIN_FAST;
+ hdev->le_scan_int_connect = DISCOV_LE_SCAN_INT_CONN;
+ hdev->le_scan_window_connect = DISCOV_LE_SCAN_WIN_CONN;
+ hdev->le_conn_min_interval = 0x0018;
+ hdev->le_conn_max_interval = 0x0028;
+ hdev->le_conn_latency = 0x0000;
+ hdev->le_supv_timeout = 0x002a;
+ hdev->le_def_tx_len = 0x001b;
+ hdev->le_def_tx_time = 0x0148;
+ hdev->le_max_tx_len = 0x001b;
+ hdev->le_max_tx_time = 0x0148;
+ hdev->le_max_rx_len = 0x001b;
+ hdev->le_max_rx_time = 0x0148;
+ hdev->le_max_key_size = SMP_MAX_ENC_KEY_SIZE;
+ hdev->le_min_key_size = SMP_MIN_ENC_KEY_SIZE;
+ hdev->le_tx_def_phys = HCI_LE_SET_PHY_1M;
+ hdev->le_rx_def_phys = HCI_LE_SET_PHY_1M;
+ hdev->le_num_of_adv_sets = HCI_MAX_ADV_INSTANCES;
+ hdev->def_multi_adv_rotation_duration = HCI_DEFAULT_ADV_DURATION;
+ hdev->def_le_autoconnect_timeout = HCI_LE_CONN_TIMEOUT;
+ hdev->min_le_tx_power = HCI_TX_POWER_INVALID;
+ hdev->max_le_tx_power = HCI_TX_POWER_INVALID;
+
+ hdev->rpa_timeout = HCI_DEFAULT_RPA_TIMEOUT;
+ hdev->discov_interleaved_timeout = DISCOV_INTERLEAVED_TIMEOUT;
+ hdev->conn_info_min_age = DEFAULT_CONN_INFO_MIN_AGE;
+ hdev->conn_info_max_age = DEFAULT_CONN_INFO_MAX_AGE;
+ hdev->auth_payload_timeout = DEFAULT_AUTH_PAYLOAD_TIMEOUT;
+ hdev->min_enc_key_size = HCI_MIN_ENC_KEY_SIZE;
+
+ /* default 1.28 sec page scan */
+ hdev->def_page_scan_type = PAGE_SCAN_TYPE_STANDARD;
+ hdev->def_page_scan_int = 0x0800;
+ hdev->def_page_scan_window = 0x0012;
+
+ mutex_init(&hdev->lock);
+ mutex_init(&hdev->req_lock);
+ mutex_init(&hdev->mgmt_pending_lock);
+
+ ida_init(&hdev->unset_handle_ida);
+
+ INIT_LIST_HEAD(&hdev->mesh_pending);
+ INIT_LIST_HEAD(&hdev->mgmt_pending);
+ INIT_LIST_HEAD(&hdev->reject_list);
+ INIT_LIST_HEAD(&hdev->accept_list);
+ INIT_LIST_HEAD(&hdev->uuids);
+ INIT_LIST_HEAD(&hdev->link_keys);
+ INIT_LIST_HEAD(&hdev->long_term_keys);
+ INIT_LIST_HEAD(&hdev->identity_resolving_keys);
+ INIT_LIST_HEAD(&hdev->remote_oob_data);
+ INIT_LIST_HEAD(&hdev->le_accept_list);
+ INIT_LIST_HEAD(&hdev->le_resolv_list);
+ INIT_LIST_HEAD(&hdev->le_conn_params);
+ INIT_LIST_HEAD(&hdev->pend_le_conns);
+ INIT_LIST_HEAD(&hdev->pend_le_reports);
+ INIT_LIST_HEAD(&hdev->conn_hash.list);
+ INIT_LIST_HEAD(&hdev->adv_instances);
+ INIT_LIST_HEAD(&hdev->blocked_keys);
+ INIT_LIST_HEAD(&hdev->monitored_devices);
+
+ INIT_LIST_HEAD(&hdev->local_codecs);
+ INIT_WORK(&hdev->rx_work, hci_rx_work);
+ INIT_WORK(&hdev->cmd_work, hci_cmd_work);
+ INIT_WORK(&hdev->tx_work, hci_tx_work);
+ INIT_WORK(&hdev->power_on, hci_power_on);
+ INIT_WORK(&hdev->error_reset, hci_error_reset);
+
+ hci_cmd_sync_init(hdev);
+
+ INIT_DELAYED_WORK(&hdev->power_off, hci_power_off);
skb_queue_head_init(&hdev->rx_q);
skb_queue_head_init(&hdev->cmd_q);
skb_queue_head_init(&hdev->raw_q);
init_waitqueue_head(&hdev->req_wait_q);
- init_MUTEX(&hdev->req_lock);
- inquiry_cache_init(hdev);
+ INIT_DELAYED_WORK(&hdev->cmd_timer, hci_cmd_timeout);
+ INIT_DELAYED_WORK(&hdev->ncmd_timer, hci_ncmd_timeout);
- hci_conn_hash_init(hdev);
+ hci_devcd_setup(hdev);
- memset(&hdev->stat, 0, sizeof(struct hci_dev_stats));
+ hci_init_sysfs(hdev);
+ discovery_init(hdev);
+
+ return hdev;
+}
+EXPORT_SYMBOL(hci_alloc_dev_priv);
+
+/* Free HCI device */
+void hci_free_dev(struct hci_dev *hdev)
+{
+ /* will free via device release */
+ put_device(&hdev->dev);
+}
+EXPORT_SYMBOL(hci_free_dev);
+
+/* Register HCI device */
+int hci_register_dev(struct hci_dev *hdev)
+{
+ int id, error;
+
+ if (!hdev->open || !hdev->close || !hdev->send)
+ return -EINVAL;
+
+ id = ida_alloc_max(&hci_index_ida, HCI_MAX_ID - 1, GFP_KERNEL);
+ if (id < 0)
+ return id;
+
+ error = dev_set_name(&hdev->dev, "hci%u", id);
+ if (error)
+ return error;
+
+ hdev->name = dev_name(&hdev->dev);
+ hdev->id = id;
+
+ BT_DBG("%p name %s bus %d", hdev, hdev->name, hdev->bus);
+
+ hdev->workqueue = alloc_ordered_workqueue("%s", WQ_HIGHPRI, hdev->name);
+ if (!hdev->workqueue) {
+ error = -ENOMEM;
+ goto err;
+ }
+
+ hdev->req_workqueue = alloc_ordered_workqueue("%s", WQ_HIGHPRI,
+ hdev->name);
+ if (!hdev->req_workqueue) {
+ destroy_workqueue(hdev->workqueue);
+ error = -ENOMEM;
+ goto err;
+ }
+
+ if (!IS_ERR_OR_NULL(bt_debugfs))
+ hdev->debugfs = debugfs_create_dir(hdev->name, bt_debugfs);
+
+ error = device_add(&hdev->dev);
+ if (error < 0)
+ goto err_wqueue;
+
+ hci_leds_init(hdev);
+
+ hdev->rfkill = rfkill_alloc(hdev->name, &hdev->dev,
+ RFKILL_TYPE_BLUETOOTH, &hci_rfkill_ops,
+ hdev);
+ if (hdev->rfkill) {
+ if (rfkill_register(hdev->rfkill) < 0) {
+ rfkill_destroy(hdev->rfkill);
+ hdev->rfkill = NULL;
+ }
+ }
+
+ if (hdev->rfkill && rfkill_blocked(hdev->rfkill))
+ hci_dev_set_flag(hdev, HCI_RFKILLED);
+
+ hci_dev_set_flag(hdev, HCI_SETUP);
+ hci_dev_set_flag(hdev, HCI_AUTO_OFF);
- atomic_set(&hdev->promisc, 0);
+ /* Assume BR/EDR support until proven otherwise (such as
+ * through reading supported features during init.
+ */
+ hci_dev_set_flag(hdev, HCI_BREDR_ENABLED);
- write_unlock_bh(&hci_dev_list_lock);
+ write_lock(&hci_dev_list_lock);
+ list_add(&hdev->list, &hci_dev_list);
+ write_unlock(&hci_dev_list_lock);
- hci_register_sysfs(hdev);
+ /* Devices that are marked for raw-only usage are unconfigured
+ * and should not be included in normal operation.
+ */
+ if (hci_test_quirk(hdev, HCI_QUIRK_RAW_DEVICE))
+ hci_dev_set_flag(hdev, HCI_UNCONFIGURED);
- hci_notify(hdev, HCI_DEV_REG);
+ /* Mark Remote Wakeup connection flag as supported if driver has wakeup
+ * callback.
+ */
+ if (hdev->wakeup)
+ hdev->conn_flags |= HCI_CONN_FLAG_REMOTE_WAKEUP;
+
+ hci_sock_dev_event(hdev, HCI_DEV_REG);
+ hci_dev_hold(hdev);
+
+ error = hci_register_suspend_notifier(hdev);
+ if (error)
+ BT_WARN("register suspend notifier failed error:%d\n", error);
+
+ queue_work(hdev->req_workqueue, &hdev->power_on);
+
+ idr_init(&hdev->adv_monitors_idr);
+ msft_register(hdev);
return id;
+
+err_wqueue:
+ debugfs_remove_recursive(hdev->debugfs);
+ destroy_workqueue(hdev->workqueue);
+ destroy_workqueue(hdev->req_workqueue);
+err:
+ ida_free(&hci_index_ida, hdev->id);
+
+ return error;
}
EXPORT_SYMBOL(hci_register_dev);
/* Unregister HCI device */
-int hci_unregister_dev(struct hci_dev *hdev)
+void hci_unregister_dev(struct hci_dev *hdev)
{
- BT_DBG("%p name %s type %d", hdev, hdev->name, hdev->type);
+ BT_DBG("%p name %s bus %d", hdev, hdev->name, hdev->bus);
- hci_unregister_sysfs(hdev);
+ mutex_lock(&hdev->unregister_lock);
+ hci_dev_set_flag(hdev, HCI_UNREGISTER);
+ mutex_unlock(&hdev->unregister_lock);
- write_lock_bh(&hci_dev_list_lock);
+ write_lock(&hci_dev_list_lock);
list_del(&hdev->list);
- write_unlock_bh(&hci_dev_list_lock);
+ write_unlock(&hci_dev_list_lock);
+
+ synchronize_srcu(&hdev->srcu);
+ cleanup_srcu_struct(&hdev->srcu);
+
+ disable_work_sync(&hdev->rx_work);
+ disable_work_sync(&hdev->cmd_work);
+ disable_work_sync(&hdev->tx_work);
+ disable_work_sync(&hdev->power_on);
+ disable_work_sync(&hdev->error_reset);
+
+ hci_cmd_sync_clear(hdev);
+
+ hci_unregister_suspend_notifier(hdev);
hci_dev_do_close(hdev);
- hci_notify(hdev, HCI_DEV_UNREG);
+ if (!test_bit(HCI_INIT, &hdev->flags) &&
+ !hci_dev_test_flag(hdev, HCI_SETUP) &&
+ !hci_dev_test_flag(hdev, HCI_CONFIG)) {
+ hci_dev_lock(hdev);
+ mgmt_index_removed(hdev);
+ hci_dev_unlock(hdev);
+ }
- __hci_dev_put(hdev);
- return 0;
+ /* mgmt_index_removed should take care of emptying the
+ * pending list */
+ BUG_ON(!list_empty(&hdev->mgmt_pending));
+
+ hci_sock_dev_event(hdev, HCI_DEV_UNREG);
+
+ if (hdev->rfkill) {
+ rfkill_unregister(hdev->rfkill);
+ rfkill_destroy(hdev->rfkill);
+ }
+
+ device_del(&hdev->dev);
+ /* Actual cleanup is deferred until hci_release_dev(). */
+ hci_dev_put(hdev);
}
EXPORT_SYMBOL(hci_unregister_dev);
+/* Release HCI device */
+void hci_release_dev(struct hci_dev *hdev)
+{
+ debugfs_remove_recursive(hdev->debugfs);
+ kfree_const(hdev->hw_info);
+ kfree_const(hdev->fw_info);
+
+ destroy_workqueue(hdev->workqueue);
+ destroy_workqueue(hdev->req_workqueue);
+
+ hci_dev_lock(hdev);
+ hci_bdaddr_list_clear(&hdev->reject_list);
+ hci_bdaddr_list_clear(&hdev->accept_list);
+ hci_uuids_clear(hdev);
+ hci_link_keys_clear(hdev);
+ hci_smp_ltks_clear(hdev);
+ hci_smp_irks_clear(hdev);
+ hci_remote_oob_data_clear(hdev);
+ hci_adv_instances_clear(hdev);
+ hci_adv_monitors_clear(hdev);
+ hci_bdaddr_list_clear(&hdev->le_accept_list);
+ hci_bdaddr_list_clear(&hdev->le_resolv_list);
+ hci_conn_params_clear_all(hdev);
+ hci_discovery_filter_clear(hdev);
+ hci_blocked_keys_clear(hdev);
+ hci_codec_list_clear(&hdev->local_codecs);
+ msft_release(hdev);
+ hci_dev_unlock(hdev);
+
+ ida_destroy(&hdev->unset_handle_ida);
+ ida_free(&hci_index_ida, hdev->id);
+ kfree_skb(hdev->sent_cmd);
+ kfree_skb(hdev->req_skb);
+ kfree_skb(hdev->recv_event);
+ kfree(hdev);
+}
+EXPORT_SYMBOL(hci_release_dev);
+
+int hci_register_suspend_notifier(struct hci_dev *hdev)
+{
+ int ret = 0;
+
+ if (!hdev->suspend_notifier.notifier_call &&
+ !hci_test_quirk(hdev, HCI_QUIRK_NO_SUSPEND_NOTIFIER)) {
+ hdev->suspend_notifier.notifier_call = hci_suspend_notifier;
+ ret = register_pm_notifier(&hdev->suspend_notifier);
+ }
+
+ return ret;
+}
+
+int hci_unregister_suspend_notifier(struct hci_dev *hdev)
+{
+ int ret = 0;
+
+ if (hdev->suspend_notifier.notifier_call) {
+ ret = unregister_pm_notifier(&hdev->suspend_notifier);
+ if (!ret)
+ hdev->suspend_notifier.notifier_call = NULL;
+ }
+
+ return ret;
+}
+
+/* Cancel ongoing command synchronously:
+ *
+ * - Cancel command timer
+ * - Reset command counter
+ * - Cancel command request
+ */
+static void hci_cancel_cmd_sync(struct hci_dev *hdev, int err)
+{
+ bt_dev_dbg(hdev, "err 0x%2.2x", err);
+
+ if (hci_dev_test_flag(hdev, HCI_UNREGISTER)) {
+ disable_delayed_work_sync(&hdev->cmd_timer);
+ disable_delayed_work_sync(&hdev->ncmd_timer);
+ } else {
+ cancel_delayed_work_sync(&hdev->cmd_timer);
+ cancel_delayed_work_sync(&hdev->ncmd_timer);
+ }
+
+ atomic_set(&hdev->cmd_cnt, 1);
+
+ hci_cmd_sync_cancel_sync(hdev, err);
+}
+
/* Suspend HCI device */
int hci_suspend_dev(struct hci_dev *hdev)
{
- hci_notify(hdev, HCI_DEV_SUSPEND);
- return 0;
+ int ret;
+
+ bt_dev_dbg(hdev, "");
+
+ /* Suspend should only act on when powered. */
+ if (!hdev_is_powered(hdev) ||
+ hci_dev_test_flag(hdev, HCI_UNREGISTER))
+ return 0;
+
+ /* If powering down don't attempt to suspend */
+ if (mgmt_powering_down(hdev))
+ return 0;
+
+ /* Cancel potentially blocking sync operation before suspend */
+ hci_cancel_cmd_sync(hdev, EHOSTDOWN);
+
+ hci_req_sync_lock(hdev);
+ ret = hci_suspend_sync(hdev);
+ hci_req_sync_unlock(hdev);
+
+ hci_clear_wake_reason(hdev);
+ mgmt_suspending(hdev, hdev->suspend_state);
+
+ hci_sock_dev_event(hdev, HCI_DEV_SUSPEND);
+ return ret;
}
EXPORT_SYMBOL(hci_suspend_dev);
/* Resume HCI device */
int hci_resume_dev(struct hci_dev *hdev)
{
- hci_notify(hdev, HCI_DEV_RESUME);
- return 0;
+ int ret;
+
+ bt_dev_dbg(hdev, "");
+
+ /* Resume should only act on when powered. */
+ if (!hdev_is_powered(hdev) ||
+ hci_dev_test_flag(hdev, HCI_UNREGISTER))
+ return 0;
+
+ /* If powering down don't attempt to resume */
+ if (mgmt_powering_down(hdev))
+ return 0;
+
+ hci_req_sync_lock(hdev);
+ ret = hci_resume_sync(hdev);
+ hci_req_sync_unlock(hdev);
+
+ mgmt_resuming(hdev, hdev->wake_reason, &hdev->wake_addr,
+ hdev->wake_addr_type);
+
+ hci_sock_dev_event(hdev, HCI_DEV_RESUME);
+ return ret;
}
EXPORT_SYMBOL(hci_resume_dev);
-/* ---- Interface to upper protocols ---- */
+/* Reset HCI device */
+int hci_reset_dev(struct hci_dev *hdev)
+{
+ static const u8 hw_err[] = { HCI_EV_HARDWARE_ERROR, 0x01, 0x00 };
+ struct sk_buff *skb;
+
+ skb = bt_skb_alloc(3, GFP_ATOMIC);
+ if (!skb)
+ return -ENOMEM;
+
+ hci_skb_pkt_type(skb) = HCI_EVENT_PKT;
+ skb_put_data(skb, hw_err, 3);
+
+ bt_dev_err(hdev, "Injecting HCI hardware error event");
+
+ /* Send Hardware Error to upper stack */
+ return hci_recv_frame(hdev, skb);
+}
+EXPORT_SYMBOL(hci_reset_dev);
-/* Register/Unregister protocols.
- * hci_task_lock is used to ensure that no tasks are running. */
-int hci_register_proto(struct hci_proto *hp)
+static u8 hci_dev_classify_pkt_type(struct hci_dev *hdev, struct sk_buff *skb)
{
- int err = 0;
+ if (hdev->classify_pkt_type)
+ return hdev->classify_pkt_type(hdev, skb);
+
+ return hci_skb_pkt_type(skb);
+}
+
+/* Receive frame from HCI drivers */
+int hci_recv_frame(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ u8 dev_pkt_type;
- BT_DBG("%p name %s id %d", hp, hp->name, hp->id);
+ if (!hdev || (!test_bit(HCI_UP, &hdev->flags)
+ && !test_bit(HCI_INIT, &hdev->flags))) {
+ kfree_skb(skb);
+ return -ENXIO;
+ }
+
+ /* Check if the driver agree with packet type classification */
+ dev_pkt_type = hci_dev_classify_pkt_type(hdev, skb);
+ if (hci_skb_pkt_type(skb) != dev_pkt_type) {
+ hci_skb_pkt_type(skb) = dev_pkt_type;
+ }
- if (hp->id >= HCI_MAX_PROTO)
+ switch (hci_skb_pkt_type(skb)) {
+ case HCI_EVENT_PKT:
+ break;
+ case HCI_ACLDATA_PKT:
+ /* Detect if ISO packet has been sent as ACL */
+ if (hci_conn_num(hdev, CIS_LINK) ||
+ hci_conn_num(hdev, BIS_LINK) ||
+ hci_conn_num(hdev, PA_LINK)) {
+ __u16 handle = __le16_to_cpu(hci_acl_hdr(skb)->handle);
+ __u8 type;
+
+ type = hci_conn_lookup_type(hdev, hci_handle(handle));
+ if (type == CIS_LINK || type == BIS_LINK ||
+ type == PA_LINK)
+ hci_skb_pkt_type(skb) = HCI_ISODATA_PKT;
+ }
+ break;
+ case HCI_SCODATA_PKT:
+ break;
+ case HCI_ISODATA_PKT:
+ break;
+ case HCI_DRV_PKT:
+ break;
+ default:
+ kfree_skb(skb);
return -EINVAL;
+ }
- write_lock_bh(&hci_task_lock);
+ /* Incoming skb */
+ bt_cb(skb)->incoming = 1;
- if (!hci_proto[hp->id])
- hci_proto[hp->id] = hp;
- else
- err = -EEXIST;
+ /* Time stamp */
+ __net_timestamp(skb);
- write_unlock_bh(&hci_task_lock);
+ skb_queue_tail(&hdev->rx_q, skb);
+ queue_work(hdev->workqueue, &hdev->rx_work);
- return err;
+ return 0;
}
-EXPORT_SYMBOL(hci_register_proto);
+EXPORT_SYMBOL(hci_recv_frame);
-int hci_unregister_proto(struct hci_proto *hp)
+/* Receive diagnostic message from HCI drivers */
+int hci_recv_diag(struct hci_dev *hdev, struct sk_buff *skb)
{
- int err = 0;
+ /* Mark as diagnostic packet */
+ hci_skb_pkt_type(skb) = HCI_DIAG_PKT;
- BT_DBG("%p name %s id %d", hp, hp->name, hp->id);
+ /* Time stamp */
+ __net_timestamp(skb);
- if (hp->id >= HCI_MAX_PROTO)
- return -EINVAL;
+ skb_queue_tail(&hdev->rx_q, skb);
+ queue_work(hdev->workqueue, &hdev->rx_work);
+
+ return 0;
+}
+EXPORT_SYMBOL(hci_recv_diag);
- write_lock_bh(&hci_task_lock);
+void hci_set_hw_info(struct hci_dev *hdev, const char *fmt, ...)
+{
+ va_list vargs;
- if (hci_proto[hp->id])
- hci_proto[hp->id] = NULL;
- else
- err = -ENOENT;
+ va_start(vargs, fmt);
+ kfree_const(hdev->hw_info);
+ hdev->hw_info = kvasprintf_const(GFP_KERNEL, fmt, vargs);
+ va_end(vargs);
+}
+EXPORT_SYMBOL(hci_set_hw_info);
- write_unlock_bh(&hci_task_lock);
+void hci_set_fw_info(struct hci_dev *hdev, const char *fmt, ...)
+{
+ va_list vargs;
- return err;
+ va_start(vargs, fmt);
+ kfree_const(hdev->fw_info);
+ hdev->fw_info = kvasprintf_const(GFP_KERNEL, fmt, vargs);
+ va_end(vargs);
}
-EXPORT_SYMBOL(hci_unregister_proto);
+EXPORT_SYMBOL(hci_set_fw_info);
+
+/* ---- Interface to upper protocols ---- */
int hci_register_cb(struct hci_cb *cb)
{
BT_DBG("%p name %s", cb, cb->name);
- write_lock_bh(&hci_cb_list_lock);
- list_add(&cb->list, &hci_cb_list);
- write_unlock_bh(&hci_cb_list_lock);
+ mutex_lock(&hci_cb_list_lock);
+ list_add_tail(&cb->list, &hci_cb_list);
+ mutex_unlock(&hci_cb_list_lock);
return 0;
}
@@ -986,86 +3027,181 @@ int hci_unregister_cb(struct hci_cb *cb)
{
BT_DBG("%p name %s", cb, cb->name);
- write_lock_bh(&hci_cb_list_lock);
+ mutex_lock(&hci_cb_list_lock);
list_del(&cb->list);
- write_unlock_bh(&hci_cb_list_lock);
+ mutex_unlock(&hci_cb_list_lock);
return 0;
}
EXPORT_SYMBOL(hci_unregister_cb);
-static int hci_send_frame(struct sk_buff *skb)
+static int hci_send_frame(struct hci_dev *hdev, struct sk_buff *skb)
{
- struct hci_dev *hdev = (struct hci_dev *) skb->dev;
+ int err;
- if (!hdev) {
- kfree_skb(skb);
- return -ENODEV;
- }
+ BT_DBG("%s type %d len %d", hdev->name, hci_skb_pkt_type(skb),
+ skb->len);
- BT_DBG("%s type %d len %d", hdev->name, bt_cb(skb)->pkt_type, skb->len);
+ /* Time stamp */
+ __net_timestamp(skb);
- if (atomic_read(&hdev->promisc)) {
- /* Time stamp */
- __net_timestamp(skb);
+ /* Send copy to monitor */
+ hci_send_to_monitor(hdev, skb);
+ if (atomic_read(&hdev->promisc)) {
+ /* Send copy to the sockets */
hci_send_to_sock(hdev, skb);
}
/* Get rid of skb owner, prior to sending to the driver. */
skb_orphan(skb);
- return hdev->send(skb);
+ if (!test_bit(HCI_RUNNING, &hdev->flags)) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
+ if (hci_skb_pkt_type(skb) == HCI_DRV_PKT) {
+ /* Intercept HCI Drv packet here and don't go with hdev->send
+ * callback.
+ */
+ err = hci_drv_process_cmd(hdev, skb);
+ kfree_skb(skb);
+ return err;
+ }
+
+ err = hdev->send(hdev, skb);
+ if (err < 0) {
+ bt_dev_err(hdev, "sending frame failed (%d)", err);
+ kfree_skb(skb);
+ return err;
+ }
+
+ return 0;
+}
+
+static int hci_send_conn_frame(struct hci_dev *hdev, struct hci_conn *conn,
+ struct sk_buff *skb)
+{
+ hci_conn_tx_queue(conn, skb);
+ return hci_send_frame(hdev, skb);
}
/* Send HCI command */
-int hci_send_cmd(struct hci_dev *hdev, __u16 ogf, __u16 ocf, __u32 plen, void *param)
+int hci_send_cmd(struct hci_dev *hdev, __u16 opcode, __u32 plen,
+ const void *param)
{
- int len = HCI_COMMAND_HDR_SIZE + plen;
- struct hci_command_hdr *hdr;
struct sk_buff *skb;
- BT_DBG("%s ogf 0x%x ocf 0x%x plen %d", hdev->name, ogf, ocf, plen);
+ BT_DBG("%s opcode 0x%4.4x plen %d", hdev->name, opcode, plen);
- skb = bt_skb_alloc(len, GFP_ATOMIC);
+ skb = hci_cmd_sync_alloc(hdev, opcode, plen, param, NULL);
if (!skb) {
- BT_ERR("%s Can't allocate memory for HCI command", hdev->name);
+ bt_dev_err(hdev, "no memory for command");
return -ENOMEM;
}
- hdr = (struct hci_command_hdr *) skb_put(skb, HCI_COMMAND_HDR_SIZE);
- hdr->opcode = __cpu_to_le16(hci_opcode_pack(ogf, ocf));
- hdr->plen = plen;
+ /* Stand-alone HCI commands must be flagged as
+ * single-command requests.
+ */
+ bt_cb(skb)->hci.req_flags |= HCI_REQ_START;
+
+ skb_queue_tail(&hdev->cmd_q, skb);
+ queue_work(hdev->workqueue, &hdev->cmd_work);
- if (plen)
- memcpy(skb_put(skb, plen), param, plen);
+ return 0;
+}
- BT_DBG("skb len %d", skb->len);
+int __hci_cmd_send(struct hci_dev *hdev, u16 opcode, u32 plen,
+ const void *param)
+{
+ struct sk_buff *skb;
- bt_cb(skb)->pkt_type = HCI_COMMAND_PKT;
- skb->dev = (void *) hdev;
- skb_queue_tail(&hdev->cmd_q, skb);
- hci_sched_cmd(hdev);
+ if (hci_opcode_ogf(opcode) != 0x3f) {
+ /* A controller receiving a command shall respond with either
+ * a Command Status Event or a Command Complete Event.
+ * Therefore, all standard HCI commands must be sent via the
+ * standard API, using hci_send_cmd or hci_cmd_sync helpers.
+ * Some vendors do not comply with this rule for vendor-specific
+ * commands and do not return any event. We want to support
+ * unresponded commands for such cases only.
+ */
+ bt_dev_err(hdev, "unresponded command not supported");
+ return -EINVAL;
+ }
+
+ skb = hci_cmd_sync_alloc(hdev, opcode, plen, param, NULL);
+ if (!skb) {
+ bt_dev_err(hdev, "no memory for command (opcode 0x%4.4x)",
+ opcode);
+ return -ENOMEM;
+ }
+
+ hci_send_frame(hdev, skb);
return 0;
}
+EXPORT_SYMBOL(__hci_cmd_send);
/* Get data from the previously sent command */
-void *hci_sent_cmd_data(struct hci_dev *hdev, __u16 ogf, __u16 ocf)
+static void *hci_cmd_data(struct sk_buff *skb, __u16 opcode)
{
struct hci_command_hdr *hdr;
- if (!hdev->sent_cmd)
+ if (!skb || skb->len < HCI_COMMAND_HDR_SIZE)
return NULL;
- hdr = (void *) hdev->sent_cmd->data;
+ hdr = (void *)skb->data;
- if (hdr->opcode != __cpu_to_le16(hci_opcode_pack(ogf, ocf)))
+ if (hdr->opcode != cpu_to_le16(opcode))
return NULL;
- BT_DBG("%s ogf 0x%x ocf 0x%x", hdev->name, ogf, ocf);
+ return skb->data + HCI_COMMAND_HDR_SIZE;
+}
+
+/* Get data from the previously sent command */
+void *hci_sent_cmd_data(struct hci_dev *hdev, __u16 opcode)
+{
+ void *data;
+
+ /* Check if opcode matches last sent command */
+ data = hci_cmd_data(hdev->sent_cmd, opcode);
+ if (!data)
+ /* Check if opcode matches last request */
+ data = hci_cmd_data(hdev->req_skb, opcode);
- return hdev->sent_cmd->data + HCI_COMMAND_HDR_SIZE;
+ return data;
+}
+
+/* Get data from last received event */
+void *hci_recv_event_data(struct hci_dev *hdev, __u8 event)
+{
+ struct hci_event_hdr *hdr;
+ int offset;
+
+ if (!hdev->recv_event)
+ return NULL;
+
+ hdr = (void *)hdev->recv_event->data;
+ offset = sizeof(*hdr);
+
+ if (hdr->evt != event) {
+ /* In case of LE metaevent check the subevent match */
+ if (hdr->evt == HCI_EV_LE_META) {
+ struct hci_ev_le_meta *ev;
+
+ ev = (void *)hdev->recv_event->data + offset;
+ offset += sizeof(*ev);
+ if (ev->subevent == event)
+ goto found;
+ }
+ return NULL;
+ }
+
+found:
+ bt_dev_dbg(hdev, "event 0x%2.2x", event);
+
+ return hdev->recv_event->data + offset;
}
/* Send ACL data */
@@ -1074,325 +3210,869 @@ static void hci_add_acl_hdr(struct sk_buff *skb, __u16 handle, __u16 flags)
struct hci_acl_hdr *hdr;
int len = skb->len;
- hdr = (struct hci_acl_hdr *) skb_push(skb, HCI_ACL_HDR_SIZE);
- hdr->handle = __cpu_to_le16(hci_handle_pack(handle, flags));
- hdr->dlen = __cpu_to_le16(len);
-
- skb->h.raw = (void *) hdr;
+ skb_push(skb, HCI_ACL_HDR_SIZE);
+ skb_reset_transport_header(skb);
+ hdr = (struct hci_acl_hdr *)skb_transport_header(skb);
+ hdr->handle = cpu_to_le16(hci_handle_pack(handle, flags));
+ hdr->dlen = cpu_to_le16(len);
}
-int hci_send_acl(struct hci_conn *conn, struct sk_buff *skb, __u16 flags)
+static void hci_queue_acl(struct hci_chan *chan, struct sk_buff_head *queue,
+ struct sk_buff *skb, __u16 flags)
{
+ struct hci_conn *conn = chan->conn;
struct hci_dev *hdev = conn->hdev;
struct sk_buff *list;
- BT_DBG("%s conn %p flags 0x%x", hdev->name, conn, flags);
+ skb->len = skb_headlen(skb);
+ skb->data_len = 0;
+
+ hci_skb_pkt_type(skb) = HCI_ACLDATA_PKT;
- skb->dev = (void *) hdev;
- bt_cb(skb)->pkt_type = HCI_ACLDATA_PKT;
- hci_add_acl_hdr(skb, conn->handle, flags | ACL_START);
+ hci_add_acl_hdr(skb, conn->handle, flags);
- if (!(list = skb_shinfo(skb)->frag_list)) {
+ list = skb_shinfo(skb)->frag_list;
+ if (!list) {
/* Non fragmented */
BT_DBG("%s nonfrag skb %p len %d", hdev->name, skb, skb->len);
- skb_queue_tail(&conn->data_q, skb);
+ skb_queue_tail(queue, skb);
} else {
/* Fragmented */
BT_DBG("%s frag %p len %d", hdev->name, skb, skb->len);
skb_shinfo(skb)->frag_list = NULL;
- /* Queue all fragments atomically */
- spin_lock_bh(&conn->data_q.lock);
+ /* Queue all fragments atomically. We need to use spin_lock_bh
+ * here because of 6LoWPAN links, as there this function is
+ * called from softirq and using normal spin lock could cause
+ * deadlocks.
+ */
+ spin_lock_bh(&queue->lock);
- __skb_queue_tail(&conn->data_q, skb);
+ __skb_queue_tail(queue, skb);
+
+ flags &= ~ACL_START;
+ flags |= ACL_CONT;
do {
skb = list; list = list->next;
-
- skb->dev = (void *) hdev;
- bt_cb(skb)->pkt_type = HCI_ACLDATA_PKT;
- hci_add_acl_hdr(skb, conn->handle, flags | ACL_CONT);
+
+ hci_skb_pkt_type(skb) = HCI_ACLDATA_PKT;
+ hci_add_acl_hdr(skb, conn->handle, flags);
BT_DBG("%s frag %p len %d", hdev->name, skb, skb->len);
- __skb_queue_tail(&conn->data_q, skb);
+ __skb_queue_tail(queue, skb);
} while (list);
- spin_unlock_bh(&conn->data_q.lock);
+ spin_unlock_bh(&queue->lock);
}
- hci_sched_tx(hdev);
- return 0;
+ bt_dev_dbg(hdev, "chan %p queued %d", chan, skb_queue_len(queue));
+}
+
+void hci_send_acl(struct hci_chan *chan, struct sk_buff *skb, __u16 flags)
+{
+ struct hci_dev *hdev = chan->conn->hdev;
+
+ BT_DBG("%s chan %p flags 0x%4.4x", hdev->name, chan, flags);
+
+ hci_queue_acl(chan, &chan->data_q, skb, flags);
+
+ queue_work(hdev->workqueue, &hdev->tx_work);
}
-EXPORT_SYMBOL(hci_send_acl);
/* Send SCO data */
-int hci_send_sco(struct hci_conn *conn, struct sk_buff *skb)
+void hci_send_sco(struct hci_conn *conn, struct sk_buff *skb)
{
struct hci_dev *hdev = conn->hdev;
struct hci_sco_hdr hdr;
BT_DBG("%s len %d", hdev->name, skb->len);
- if (skb->len > hdev->sco_mtu) {
- kfree_skb(skb);
- return -EINVAL;
- }
-
- hdr.handle = __cpu_to_le16(conn->handle);
+ hdr.handle = cpu_to_le16(conn->handle);
hdr.dlen = skb->len;
- skb->h.raw = skb_push(skb, HCI_SCO_HDR_SIZE);
- memcpy(skb->h.raw, &hdr, HCI_SCO_HDR_SIZE);
+ skb_push(skb, HCI_SCO_HDR_SIZE);
+ skb_reset_transport_header(skb);
+ memcpy(skb_transport_header(skb), &hdr, HCI_SCO_HDR_SIZE);
+
+ hci_skb_pkt_type(skb) = HCI_SCODATA_PKT;
- skb->dev = (void *) hdev;
- bt_cb(skb)->pkt_type = HCI_SCODATA_PKT;
skb_queue_tail(&conn->data_q, skb);
- hci_sched_tx(hdev);
- return 0;
+
+ bt_dev_dbg(hdev, "hcon %p queued %d", conn,
+ skb_queue_len(&conn->data_q));
+
+ queue_work(hdev->workqueue, &hdev->tx_work);
+}
+
+/* Send ISO data */
+static void hci_add_iso_hdr(struct sk_buff *skb, __u16 handle, __u8 flags)
+{
+ struct hci_iso_hdr *hdr;
+ int len = skb->len;
+
+ skb_push(skb, HCI_ISO_HDR_SIZE);
+ skb_reset_transport_header(skb);
+ hdr = (struct hci_iso_hdr *)skb_transport_header(skb);
+ hdr->handle = cpu_to_le16(hci_handle_pack(handle, flags));
+ hdr->dlen = cpu_to_le16(len);
+}
+
+static void hci_queue_iso(struct hci_conn *conn, struct sk_buff_head *queue,
+ struct sk_buff *skb)
+{
+ struct hci_dev *hdev = conn->hdev;
+ struct sk_buff *list;
+ __u16 flags;
+
+ skb->len = skb_headlen(skb);
+ skb->data_len = 0;
+
+ hci_skb_pkt_type(skb) = HCI_ISODATA_PKT;
+
+ list = skb_shinfo(skb)->frag_list;
+
+ flags = hci_iso_flags_pack(list ? ISO_START : ISO_SINGLE, 0x00);
+ hci_add_iso_hdr(skb, conn->handle, flags);
+
+ if (!list) {
+ /* Non fragmented */
+ BT_DBG("%s nonfrag skb %p len %d", hdev->name, skb, skb->len);
+
+ skb_queue_tail(queue, skb);
+ } else {
+ /* Fragmented */
+ BT_DBG("%s frag %p len %d", hdev->name, skb, skb->len);
+
+ skb_shinfo(skb)->frag_list = NULL;
+
+ __skb_queue_tail(queue, skb);
+
+ do {
+ skb = list; list = list->next;
+
+ hci_skb_pkt_type(skb) = HCI_ISODATA_PKT;
+ flags = hci_iso_flags_pack(list ? ISO_CONT : ISO_END,
+ 0x00);
+ hci_add_iso_hdr(skb, conn->handle, flags);
+
+ BT_DBG("%s frag %p len %d", hdev->name, skb, skb->len);
+
+ __skb_queue_tail(queue, skb);
+ } while (list);
+ }
+
+ bt_dev_dbg(hdev, "hcon %p queued %d", conn, skb_queue_len(queue));
+}
+
+void hci_send_iso(struct hci_conn *conn, struct sk_buff *skb)
+{
+ struct hci_dev *hdev = conn->hdev;
+
+ BT_DBG("%s len %d", hdev->name, skb->len);
+
+ hci_queue_iso(conn, &conn->data_q, skb);
+
+ queue_work(hdev->workqueue, &hdev->tx_work);
}
-EXPORT_SYMBOL(hci_send_sco);
/* ---- HCI TX task (outgoing data) ---- */
/* HCI Connection scheduler */
-static inline struct hci_conn *hci_low_sent(struct hci_dev *hdev, __u8 type, int *quote)
+static inline void hci_quote_sent(struct hci_conn *conn, int num, int *quote)
+{
+ struct hci_dev *hdev;
+ int cnt, q;
+
+ if (!conn) {
+ *quote = 0;
+ return;
+ }
+
+ hdev = conn->hdev;
+
+ switch (conn->type) {
+ case ACL_LINK:
+ cnt = hdev->acl_cnt;
+ break;
+ case SCO_LINK:
+ case ESCO_LINK:
+ cnt = hdev->sco_cnt;
+ break;
+ case LE_LINK:
+ cnt = hdev->le_mtu ? hdev->le_cnt : hdev->acl_cnt;
+ break;
+ case CIS_LINK:
+ case BIS_LINK:
+ case PA_LINK:
+ cnt = hdev->iso_cnt;
+ break;
+ default:
+ cnt = 0;
+ bt_dev_err(hdev, "unknown link type %d", conn->type);
+ }
+
+ q = cnt / num;
+ *quote = q ? q : 1;
+}
+
+static struct hci_conn *hci_low_sent(struct hci_dev *hdev, __u8 type,
+ int *quote)
{
struct hci_conn_hash *h = &hdev->conn_hash;
- struct hci_conn *conn = NULL;
- int num = 0, min = ~0;
- struct list_head *p;
+ struct hci_conn *conn = NULL, *c;
+ unsigned int num = 0, min = ~0;
- /* We don't have to lock device here. Connections are always
+ /* We don't have to lock device here. Connections are always
* added and removed with TX task disabled. */
- list_for_each(p, &h->list) {
- struct hci_conn *c;
- c = list_entry(p, struct hci_conn, list);
- if (c->type != type || c->state != BT_CONNECTED
- || skb_queue_empty(&c->data_q))
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(c, &h->list, list) {
+ if (c->type != type ||
+ skb_queue_empty(&c->data_q))
+ continue;
+
+ bt_dev_dbg(hdev, "hcon %p state %s queued %d", c,
+ state_to_string(c->state),
+ skb_queue_len(&c->data_q));
+
+ if (c->state != BT_CONNECTED && c->state != BT_CONFIG)
continue;
+
num++;
if (c->sent < min) {
min = c->sent;
conn = c;
}
+
+ if (hci_conn_num(hdev, type) == num)
+ break;
}
- if (conn) {
- int cnt = (type == ACL_LINK ? hdev->acl_cnt : hdev->sco_cnt);
- int q = cnt / num;
- *quote = q ? q : 1;
- } else
- *quote = 0;
+ rcu_read_unlock();
+
+ hci_quote_sent(conn, num, quote);
BT_DBG("conn %p quote %d", conn, *quote);
return conn;
}
-static inline void hci_acl_tx_to(struct hci_dev *hdev)
+static void hci_link_tx_to(struct hci_dev *hdev, __u8 type)
{
struct hci_conn_hash *h = &hdev->conn_hash;
- struct list_head *p;
- struct hci_conn *c;
+ struct hci_conn *c;
- BT_ERR("%s ACL tx timeout", hdev->name);
+ bt_dev_err(hdev, "link tx timeout");
+
+ hci_dev_lock(hdev);
/* Kill stalled connections */
- list_for_each(p, &h->list) {
- c = list_entry(p, struct hci_conn, list);
- if (c->type == ACL_LINK && c->sent) {
- BT_ERR("%s killing stalled ACL connection %s",
- hdev->name, batostr(&c->dst));
- hci_acl_disconn(c, 0x13);
+ list_for_each_entry(c, &h->list, list) {
+ if (c->type == type && c->sent) {
+ bt_dev_err(hdev, "killing stalled connection %pMR",
+ &c->dst);
+ hci_disconnect(c, HCI_ERROR_REMOTE_USER_TERM);
}
}
+
+ hci_dev_unlock(hdev);
}
-static inline void hci_sched_acl(struct hci_dev *hdev)
+static struct hci_chan *hci_chan_sent(struct hci_dev *hdev, __u8 type,
+ int *quote)
{
+ struct hci_conn_hash *h = &hdev->conn_hash;
+ struct hci_chan *chan = NULL;
+ unsigned int num = 0, min = ~0, cur_prio = 0;
struct hci_conn *conn;
- struct sk_buff *skb;
- int quote;
+ int conn_num = 0;
BT_DBG("%s", hdev->name);
- if (!test_bit(HCI_RAW, &hdev->flags)) {
- /* ACL tx timeout must be longer than maximum
- * link supervision timeout (40.9 seconds) */
- if (!hdev->acl_cnt && (jiffies - hdev->acl_last_tx) > (HZ * 45))
- hci_acl_tx_to(hdev);
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(conn, &h->list, list) {
+ struct hci_chan *tmp;
+
+ if (conn->type != type)
+ continue;
+
+ if (conn->state != BT_CONNECTED && conn->state != BT_CONFIG)
+ continue;
+
+ conn_num++;
+
+ list_for_each_entry_rcu(tmp, &conn->chan_list, list) {
+ struct sk_buff *skb;
+
+ if (skb_queue_empty(&tmp->data_q))
+ continue;
+
+ skb = skb_peek(&tmp->data_q);
+ if (skb->priority < cur_prio)
+ continue;
+
+ if (skb->priority > cur_prio) {
+ num = 0;
+ min = ~0;
+ cur_prio = skb->priority;
+ }
+
+ num++;
+
+ if (conn->sent < min) {
+ min = conn->sent;
+ chan = tmp;
+ }
+ }
+
+ if (hci_conn_num(hdev, type) == conn_num)
+ break;
}
- while (hdev->acl_cnt && (conn = hci_low_sent(hdev, ACL_LINK, &quote))) {
- while (quote-- && (skb = skb_dequeue(&conn->data_q))) {
- BT_DBG("skb %p len %d", skb, skb->len);
+ rcu_read_unlock();
- hci_conn_enter_active_mode(conn);
+ if (!chan)
+ return NULL;
- hci_send_frame(skb);
- hdev->acl_last_tx = jiffies;
+ hci_quote_sent(chan->conn, num, quote);
- hdev->acl_cnt--;
- conn->sent++;
+ BT_DBG("chan %p quote %d", chan, *quote);
+ return chan;
+}
+
+static void hci_prio_recalculate(struct hci_dev *hdev, __u8 type)
+{
+ struct hci_conn_hash *h = &hdev->conn_hash;
+ struct hci_conn *conn;
+ int num = 0;
+
+ BT_DBG("%s", hdev->name);
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(conn, &h->list, list) {
+ struct hci_chan *chan;
+
+ if (conn->type != type)
+ continue;
+
+ if (conn->state != BT_CONNECTED && conn->state != BT_CONFIG)
+ continue;
+
+ num++;
+
+ list_for_each_entry_rcu(chan, &conn->chan_list, list) {
+ struct sk_buff *skb;
+
+ if (chan->sent) {
+ chan->sent = 0;
+ continue;
+ }
+
+ if (skb_queue_empty(&chan->data_q))
+ continue;
+
+ skb = skb_peek(&chan->data_q);
+ if (skb->priority >= HCI_PRIO_MAX - 1)
+ continue;
+
+ skb->priority = HCI_PRIO_MAX - 1;
+
+ BT_DBG("chan %p skb %p promoted to %d", chan, skb,
+ skb->priority);
}
+
+ if (hci_conn_num(hdev, type) == num)
+ break;
+ }
+
+ rcu_read_unlock();
+
+}
+
+static void __check_timeout(struct hci_dev *hdev, unsigned int cnt, u8 type)
+{
+ unsigned long timeout;
+
+ if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED))
+ return;
+
+ switch (type) {
+ case ACL_LINK:
+ /* tx timeout must be longer than maximum link supervision
+ * timeout (40.9 seconds)
+ */
+ timeout = hdev->acl_last_tx + HCI_ACL_TX_TIMEOUT;
+ break;
+ case LE_LINK:
+ /* tx timeout must be longer than maximum link supervision
+ * timeout (40.9 seconds)
+ */
+ timeout = hdev->le_last_tx + HCI_ACL_TX_TIMEOUT;
+ break;
+ case CIS_LINK:
+ case BIS_LINK:
+ case PA_LINK:
+ /* tx timeout must be longer than the maximum transport latency
+ * (8.388607 seconds)
+ */
+ timeout = hdev->iso_last_tx + HCI_ISO_TX_TIMEOUT;
+ break;
+ default:
+ return;
}
+
+ if (!cnt && time_after(jiffies, timeout))
+ hci_link_tx_to(hdev, type);
}
/* Schedule SCO */
-static inline void hci_sched_sco(struct hci_dev *hdev)
+static void hci_sched_sco(struct hci_dev *hdev, __u8 type)
{
struct hci_conn *conn;
struct sk_buff *skb;
- int quote;
+ int quote, *cnt;
+ unsigned int pkts = hdev->sco_pkts;
- BT_DBG("%s", hdev->name);
+ bt_dev_dbg(hdev, "type %u", type);
- while (hdev->sco_cnt && (conn = hci_low_sent(hdev, SCO_LINK, &quote))) {
+ if (!hci_conn_num(hdev, type) || !pkts)
+ return;
+
+ /* Use sco_pkts if flow control has not been enabled which will limit
+ * the amount of buffer sent in a row.
+ */
+ if (!hci_dev_test_flag(hdev, HCI_SCO_FLOWCTL))
+ cnt = &pkts;
+ else
+ cnt = &hdev->sco_cnt;
+
+ while (*cnt && (conn = hci_low_sent(hdev, type, &quote))) {
while (quote-- && (skb = skb_dequeue(&conn->data_q))) {
BT_DBG("skb %p len %d", skb, skb->len);
- hci_send_frame(skb);
+ hci_send_conn_frame(hdev, conn, skb);
conn->sent++;
if (conn->sent == ~0)
conn->sent = 0;
+ (*cnt)--;
+ }
+ }
+
+ /* Rescheduled if all packets were sent and flow control is not enabled
+ * as there could be more packets queued that could not be sent and
+ * since no HCI_EV_NUM_COMP_PKTS event will be generated the reschedule
+ * needs to be forced.
+ */
+ if (!pkts && !hci_dev_test_flag(hdev, HCI_SCO_FLOWCTL))
+ queue_work(hdev->workqueue, &hdev->tx_work);
+}
+
+static void hci_sched_acl_pkt(struct hci_dev *hdev)
+{
+ unsigned int cnt = hdev->acl_cnt;
+ struct hci_chan *chan;
+ struct sk_buff *skb;
+ int quote;
+
+ __check_timeout(hdev, cnt, ACL_LINK);
+
+ while (hdev->acl_cnt &&
+ (chan = hci_chan_sent(hdev, ACL_LINK, &quote))) {
+ u32 priority = (skb_peek(&chan->data_q))->priority;
+ while (quote-- && (skb = skb_peek(&chan->data_q))) {
+ BT_DBG("chan %p skb %p len %d priority %u", chan, skb,
+ skb->len, skb->priority);
+
+ /* Stop if priority has changed */
+ if (skb->priority < priority)
+ break;
+
+ skb = skb_dequeue(&chan->data_q);
+
+ hci_conn_enter_active_mode(chan->conn,
+ bt_cb(skb)->force_active);
+
+ hci_send_conn_frame(hdev, chan->conn, skb);
+ hdev->acl_last_tx = jiffies;
+
+ hdev->acl_cnt--;
+ chan->sent++;
+ chan->conn->sent++;
+
+ /* Send pending SCO packets right away */
+ hci_sched_sco(hdev, SCO_LINK);
+ hci_sched_sco(hdev, ESCO_LINK);
}
}
+
+ if (cnt != hdev->acl_cnt)
+ hci_prio_recalculate(hdev, ACL_LINK);
+}
+
+static void hci_sched_acl(struct hci_dev *hdev)
+{
+ BT_DBG("%s", hdev->name);
+
+ /* No ACL link over BR/EDR controller */
+ if (!hci_conn_num(hdev, ACL_LINK))
+ return;
+
+ hci_sched_acl_pkt(hdev);
}
-static void hci_tx_task(unsigned long arg)
+static void hci_sched_le(struct hci_dev *hdev)
{
- struct hci_dev *hdev = (struct hci_dev *) arg;
+ struct hci_chan *chan;
struct sk_buff *skb;
+ int quote, *cnt, tmp;
- read_lock(&hci_task_lock);
+ BT_DBG("%s", hdev->name);
- BT_DBG("%s acl %d sco %d", hdev->name, hdev->acl_cnt, hdev->sco_cnt);
+ if (!hci_conn_num(hdev, LE_LINK))
+ return;
- /* Schedule queues and send stuff to HCI driver */
+ cnt = hdev->le_pkts ? &hdev->le_cnt : &hdev->acl_cnt;
- hci_sched_acl(hdev);
+ __check_timeout(hdev, *cnt, LE_LINK);
- hci_sched_sco(hdev);
+ tmp = *cnt;
+ while (*cnt && (chan = hci_chan_sent(hdev, LE_LINK, &quote))) {
+ u32 priority = (skb_peek(&chan->data_q))->priority;
+ while (quote-- && (skb = skb_peek(&chan->data_q))) {
+ BT_DBG("chan %p skb %p len %d priority %u", chan, skb,
+ skb->len, skb->priority);
+
+ /* Stop if priority has changed */
+ if (skb->priority < priority)
+ break;
+
+ skb = skb_dequeue(&chan->data_q);
+
+ hci_send_conn_frame(hdev, chan->conn, skb);
+ hdev->le_last_tx = jiffies;
+
+ (*cnt)--;
+ chan->sent++;
+ chan->conn->sent++;
+
+ /* Send pending SCO packets right away */
+ hci_sched_sco(hdev, SCO_LINK);
+ hci_sched_sco(hdev, ESCO_LINK);
+ }
+ }
+
+ if (*cnt != tmp)
+ hci_prio_recalculate(hdev, LE_LINK);
+}
+
+/* Schedule iso */
+static void hci_sched_iso(struct hci_dev *hdev, __u8 type)
+{
+ struct hci_conn *conn;
+ struct sk_buff *skb;
+ int quote, *cnt;
+
+ BT_DBG("%s", hdev->name);
+
+ if (!hci_conn_num(hdev, type))
+ return;
+
+ cnt = &hdev->iso_cnt;
+
+ __check_timeout(hdev, *cnt, type);
+
+ while (*cnt && (conn = hci_low_sent(hdev, type, &quote))) {
+ while (quote-- && (skb = skb_dequeue(&conn->data_q))) {
+ BT_DBG("skb %p len %d", skb, skb->len);
+
+ hci_send_conn_frame(hdev, conn, skb);
+ hdev->iso_last_tx = jiffies;
+
+ conn->sent++;
+ if (conn->sent == ~0)
+ conn->sent = 0;
+ (*cnt)--;
+ }
+ }
+}
+
+static void hci_tx_work(struct work_struct *work)
+{
+ struct hci_dev *hdev = container_of(work, struct hci_dev, tx_work);
+ struct sk_buff *skb;
+
+ BT_DBG("%s acl %d sco %d le %d iso %d", hdev->name, hdev->acl_cnt,
+ hdev->sco_cnt, hdev->le_cnt, hdev->iso_cnt);
+
+ if (!hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) {
+ /* Schedule queues and send stuff to HCI driver */
+ hci_sched_sco(hdev, SCO_LINK);
+ hci_sched_sco(hdev, ESCO_LINK);
+ hci_sched_iso(hdev, CIS_LINK);
+ hci_sched_iso(hdev, BIS_LINK);
+ hci_sched_iso(hdev, PA_LINK);
+ hci_sched_acl(hdev);
+ hci_sched_le(hdev);
+ }
/* Send next queued raw (unknown type) packet */
while ((skb = skb_dequeue(&hdev->raw_q)))
- hci_send_frame(skb);
-
- read_unlock(&hci_task_lock);
+ hci_send_frame(hdev, skb);
}
-/* ----- HCI RX task (incoming data proccessing) ----- */
+/* ----- HCI RX task (incoming data processing) ----- */
/* ACL data packet */
-static inline void hci_acldata_packet(struct hci_dev *hdev, struct sk_buff *skb)
+static void hci_acldata_packet(struct hci_dev *hdev, struct sk_buff *skb)
{
- struct hci_acl_hdr *hdr = (void *) skb->data;
- struct hci_conn *conn;
+ struct hci_acl_hdr *hdr;
__u16 handle, flags;
+ int err;
- skb_pull(skb, HCI_ACL_HDR_SIZE);
+ hdr = skb_pull_data(skb, sizeof(*hdr));
+ if (!hdr) {
+ bt_dev_err(hdev, "ACL packet too small");
+ kfree_skb(skb);
+ return;
+ }
handle = __le16_to_cpu(hdr->handle);
flags = hci_flags(handle);
handle = hci_handle(handle);
- BT_DBG("%s len %d handle 0x%x flags 0x%x", hdev->name, skb->len, handle, flags);
+ bt_dev_dbg(hdev, "len %d handle 0x%4.4x flags 0x%4.4x", skb->len,
+ handle, flags);
hdev->stat.acl_rx++;
- hci_dev_lock(hdev);
- conn = hci_conn_hash_lookup_handle(hdev, handle);
- hci_dev_unlock(hdev);
-
- if (conn) {
- register struct hci_proto *hp;
+ err = l2cap_recv_acldata(hdev, handle, skb, flags);
+ if (err == -ENOENT)
+ bt_dev_err(hdev, "ACL packet for unknown connection handle %d",
+ handle);
+ else if (err)
+ bt_dev_dbg(hdev, "ACL packet recv for handle %d failed: %d",
+ handle, err);
+}
- hci_conn_enter_active_mode(conn);
+/* SCO data packet */
+static void hci_scodata_packet(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct hci_sco_hdr *hdr;
+ __u16 handle, flags;
+ int err;
- /* Send to upper protocol */
- if ((hp = hci_proto[HCI_PROTO_L2CAP]) && hp->recv_acldata) {
- hp->recv_acldata(conn, skb, flags);
- return;
- }
- } else {
- BT_ERR("%s ACL packet for unknown connection handle %d",
- hdev->name, handle);
+ hdr = skb_pull_data(skb, sizeof(*hdr));
+ if (!hdr) {
+ bt_dev_err(hdev, "SCO packet too small");
+ kfree_skb(skb);
+ return;
}
- kfree_skb(skb);
+ handle = __le16_to_cpu(hdr->handle);
+ flags = hci_flags(handle);
+ handle = hci_handle(handle);
+
+ bt_dev_dbg(hdev, "len %d handle 0x%4.4x flags 0x%4.4x", skb->len,
+ handle, flags);
+
+ hdev->stat.sco_rx++;
+
+ hci_skb_pkt_status(skb) = flags & 0x03;
+
+ err = sco_recv_scodata(hdev, handle, skb);
+ if (err == -ENOENT)
+ bt_dev_err_ratelimited(hdev, "SCO packet for unknown connection handle %d",
+ handle);
+ else if (err)
+ bt_dev_dbg(hdev, "SCO packet recv for handle %d failed: %d",
+ handle, err);
}
-/* SCO data packet */
-static inline void hci_scodata_packet(struct hci_dev *hdev, struct sk_buff *skb)
+static void hci_isodata_packet(struct hci_dev *hdev, struct sk_buff *skb)
{
- struct hci_sco_hdr *hdr = (void *) skb->data;
- struct hci_conn *conn;
- __u16 handle;
+ struct hci_iso_hdr *hdr;
+ __u16 handle, flags;
+ int err;
- skb_pull(skb, HCI_SCO_HDR_SIZE);
+ hdr = skb_pull_data(skb, sizeof(*hdr));
+ if (!hdr) {
+ bt_dev_err(hdev, "ISO packet too small");
+ kfree_skb(skb);
+ return;
+ }
handle = __le16_to_cpu(hdr->handle);
+ flags = hci_flags(handle);
+ handle = hci_handle(handle);
- BT_DBG("%s len %d handle 0x%x", hdev->name, skb->len, handle);
+ bt_dev_dbg(hdev, "len %d handle 0x%4.4x flags 0x%4.4x", skb->len,
+ handle, flags);
- hdev->stat.sco_rx++;
+ err = iso_recv(hdev, handle, skb, flags);
+ if (err == -ENOENT)
+ bt_dev_err(hdev, "ISO packet for unknown connection handle %d",
+ handle);
+ else if (err)
+ bt_dev_dbg(hdev, "ISO packet recv for handle %d failed: %d",
+ handle, err);
+}
- hci_dev_lock(hdev);
- conn = hci_conn_hash_lookup_handle(hdev, handle);
- hci_dev_unlock(hdev);
+static bool hci_req_is_complete(struct hci_dev *hdev)
+{
+ struct sk_buff *skb;
- if (conn) {
- register struct hci_proto *hp;
+ skb = skb_peek(&hdev->cmd_q);
+ if (!skb)
+ return true;
- /* Send to upper protocol */
- if ((hp = hci_proto[HCI_PROTO_SCO]) && hp->recv_scodata) {
- hp->recv_scodata(conn, skb);
- return;
- }
- } else {
- BT_ERR("%s SCO packet for unknown connection handle %d",
- hdev->name, handle);
+ return (bt_cb(skb)->hci.req_flags & HCI_REQ_START);
+}
+
+static void hci_resend_last(struct hci_dev *hdev)
+{
+ struct hci_command_hdr *sent;
+ struct sk_buff *skb;
+ u16 opcode;
+
+ if (!hdev->sent_cmd)
+ return;
+
+ sent = (void *) hdev->sent_cmd->data;
+ opcode = __le16_to_cpu(sent->opcode);
+ if (opcode == HCI_OP_RESET)
+ return;
+
+ skb = skb_clone(hdev->sent_cmd, GFP_KERNEL);
+ if (!skb)
+ return;
+
+ skb_queue_head(&hdev->cmd_q, skb);
+ queue_work(hdev->workqueue, &hdev->cmd_work);
+}
+
+void hci_req_cmd_complete(struct hci_dev *hdev, u16 opcode, u8 status,
+ hci_req_complete_t *req_complete,
+ hci_req_complete_skb_t *req_complete_skb)
+{
+ struct sk_buff *skb;
+ unsigned long flags;
+
+ BT_DBG("opcode 0x%04x status 0x%02x", opcode, status);
+
+ /* If the completed command doesn't match the last one that was
+ * sent we need to do special handling of it.
+ */
+ if (!hci_sent_cmd_data(hdev, opcode)) {
+ /* Some CSR based controllers generate a spontaneous
+ * reset complete event during init and any pending
+ * command will never be completed. In such a case we
+ * need to resend whatever was the last sent
+ * command.
+ */
+ if (test_bit(HCI_INIT, &hdev->flags) && opcode == HCI_OP_RESET)
+ hci_resend_last(hdev);
+
+ return;
}
- kfree_skb(skb);
+ /* If we reach this point this event matches the last command sent */
+ hci_dev_clear_flag(hdev, HCI_CMD_PENDING);
+
+ /* If the command succeeded and there's still more commands in
+ * this request the request is not yet complete.
+ */
+ if (!status && !hci_req_is_complete(hdev))
+ return;
+
+ skb = hdev->req_skb;
+
+ /* If this was the last command in a request the complete
+ * callback would be found in hdev->req_skb instead of the
+ * command queue (hdev->cmd_q).
+ */
+ if (skb && bt_cb(skb)->hci.req_flags & HCI_REQ_SKB) {
+ *req_complete_skb = bt_cb(skb)->hci.req_complete_skb;
+ return;
+ }
+
+ if (skb && bt_cb(skb)->hci.req_complete) {
+ *req_complete = bt_cb(skb)->hci.req_complete;
+ return;
+ }
+
+ /* Remove all pending commands belonging to this request */
+ spin_lock_irqsave(&hdev->cmd_q.lock, flags);
+ while ((skb = __skb_dequeue(&hdev->cmd_q))) {
+ if (bt_cb(skb)->hci.req_flags & HCI_REQ_START) {
+ __skb_queue_head(&hdev->cmd_q, skb);
+ break;
+ }
+
+ if (bt_cb(skb)->hci.req_flags & HCI_REQ_SKB)
+ *req_complete_skb = bt_cb(skb)->hci.req_complete_skb;
+ else
+ *req_complete = bt_cb(skb)->hci.req_complete;
+ dev_kfree_skb_irq(skb);
+ }
+ spin_unlock_irqrestore(&hdev->cmd_q.lock, flags);
}
-static void hci_rx_task(unsigned long arg)
+static void hci_rx_work(struct work_struct *work)
{
- struct hci_dev *hdev = (struct hci_dev *) arg;
+ struct hci_dev *hdev = container_of(work, struct hci_dev, rx_work);
struct sk_buff *skb;
BT_DBG("%s", hdev->name);
- read_lock(&hci_task_lock);
+ /* The kcov_remote functions used for collecting packet parsing
+ * coverage information from this background thread and associate
+ * the coverage with the syscall's thread which originally injected
+ * the packet. This helps fuzzing the kernel.
+ */
+ for (; (skb = skb_dequeue(&hdev->rx_q)); kcov_remote_stop()) {
+ kcov_remote_start_common(skb_get_kcov_handle(skb));
+
+ /* Send copy to monitor */
+ hci_send_to_monitor(hdev, skb);
- while ((skb = skb_dequeue(&hdev->rx_q))) {
if (atomic_read(&hdev->promisc)) {
/* Send copy to the sockets */
hci_send_to_sock(hdev, skb);
}
- if (test_bit(HCI_RAW, &hdev->flags)) {
+ /* If the device has been opened in HCI_USER_CHANNEL,
+ * the userspace has exclusive access to device.
+ * When device is HCI_INIT, we still need to process
+ * the data packets to the driver in order
+ * to complete its setup().
+ */
+ if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL) &&
+ !test_bit(HCI_INIT, &hdev->flags)) {
kfree_skb(skb);
continue;
}
if (test_bit(HCI_INIT, &hdev->flags)) {
/* Don't process data packets in this states. */
- switch (bt_cb(skb)->pkt_type) {
+ switch (hci_skb_pkt_type(skb)) {
case HCI_ACLDATA_PKT:
case HCI_SCODATA_PKT:
+ case HCI_ISODATA_PKT:
kfree_skb(skb);
continue;
- };
+ }
}
/* Process frame */
- switch (bt_cb(skb)->pkt_type) {
+ switch (hci_skb_pkt_type(skb)) {
case HCI_EVENT_PKT:
+ BT_DBG("%s Event packet", hdev->name);
hci_event_packet(hdev, skb);
break;
@@ -1406,39 +4086,80 @@ static void hci_rx_task(unsigned long arg)
hci_scodata_packet(hdev, skb);
break;
+ case HCI_ISODATA_PKT:
+ BT_DBG("%s ISO data packet", hdev->name);
+ hci_isodata_packet(hdev, skb);
+ break;
+
default:
kfree_skb(skb);
break;
}
}
-
- read_unlock(&hci_task_lock);
}
-static void hci_cmd_task(unsigned long arg)
+static int hci_send_cmd_sync(struct hci_dev *hdev, struct sk_buff *skb)
{
- struct hci_dev *hdev = (struct hci_dev *) arg;
- struct sk_buff *skb;
+ int err;
- BT_DBG("%s cmd %d", hdev->name, atomic_read(&hdev->cmd_cnt));
+ bt_dev_dbg(hdev, "skb %p", skb);
- if (!atomic_read(&hdev->cmd_cnt) && (jiffies - hdev->cmd_last_tx) > HZ) {
- BT_ERR("%s command tx timeout", hdev->name);
- atomic_set(&hdev->cmd_cnt, 1);
+ kfree_skb(hdev->sent_cmd);
+
+ hdev->sent_cmd = skb_clone(skb, GFP_KERNEL);
+ if (!hdev->sent_cmd) {
+ skb_queue_head(&hdev->cmd_q, skb);
+ queue_work(hdev->workqueue, &hdev->cmd_work);
+ return -EINVAL;
}
- /* Send queued commands */
- if (atomic_read(&hdev->cmd_cnt) && (skb = skb_dequeue(&hdev->cmd_q))) {
- if (hdev->sent_cmd)
- kfree_skb(hdev->sent_cmd);
-
- if ((hdev->sent_cmd = skb_clone(skb, GFP_ATOMIC))) {
- atomic_dec(&hdev->cmd_cnt);
- hci_send_frame(skb);
- hdev->cmd_last_tx = jiffies;
- } else {
- skb_queue_head(&hdev->cmd_q, skb);
- hci_sched_cmd(hdev);
+ if (hci_skb_opcode(skb) != HCI_OP_NOP) {
+ err = hci_send_frame(hdev, skb);
+ if (err < 0) {
+ hci_cmd_sync_cancel_sync(hdev, -err);
+ return err;
}
+ atomic_dec(&hdev->cmd_cnt);
+ } else {
+ err = -ENODATA;
+ kfree_skb(skb);
+ }
+
+ if (hdev->req_status == HCI_REQ_PEND &&
+ !hci_dev_test_and_set_flag(hdev, HCI_CMD_PENDING)) {
+ kfree_skb(hdev->req_skb);
+ hdev->req_skb = skb_clone(hdev->sent_cmd, GFP_KERNEL);
+ }
+
+ return err;
+}
+
+static void hci_cmd_work(struct work_struct *work)
+{
+ struct hci_dev *hdev = container_of(work, struct hci_dev, cmd_work);
+ struct sk_buff *skb;
+ int err;
+
+ BT_DBG("%s cmd_cnt %d cmd queued %d", hdev->name,
+ atomic_read(&hdev->cmd_cnt), skb_queue_len(&hdev->cmd_q));
+
+ /* Send queued commands */
+ if (atomic_read(&hdev->cmd_cnt)) {
+ skb = skb_dequeue(&hdev->cmd_q);
+ if (!skb)
+ return;
+
+ err = hci_send_cmd_sync(hdev, skb);
+ if (err)
+ return;
+
+ rcu_read_lock();
+ if (test_bit(HCI_RESET, &hdev->flags) ||
+ hci_dev_test_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE))
+ cancel_delayed_work(&hdev->cmd_timer);
+ else
+ queue_delayed_work(hdev->workqueue, &hdev->cmd_timer,
+ HCI_CMD_TIMEOUT);
+ rcu_read_unlock();
}
}
diff --git a/net/bluetooth/hci_debugfs.c b/net/bluetooth/hci_debugfs.c
new file mode 100644
index 000000000000..99e2e9fc70e8
--- /dev/null
+++ b/net/bluetooth/hci_debugfs.c
@@ -0,0 +1,1395 @@
+/*
+ BlueZ - Bluetooth protocol stack for Linux
+
+ Copyright (C) 2014 Intel Corporation
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License version 2 as
+ published by the Free Software Foundation;
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+ IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+ CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+ ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+ COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+ SOFTWARE IS DISCLAIMED.
+*/
+
+#include <linux/debugfs.h>
+#include <linux/kstrtox.h>
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+
+#include "smp.h"
+#include "hci_debugfs.h"
+
+#define DEFINE_QUIRK_ATTRIBUTE(__name, __quirk) \
+static ssize_t __name ## _read(struct file *file, \
+ char __user *user_buf, \
+ size_t count, loff_t *ppos) \
+{ \
+ struct hci_dev *hdev = file->private_data; \
+ char buf[3]; \
+ \
+ buf[0] = test_bit(__quirk, hdev->quirk_flags) ? 'Y' : 'N'; \
+ buf[1] = '\n'; \
+ buf[2] = '\0'; \
+ return simple_read_from_buffer(user_buf, count, ppos, buf, 2); \
+} \
+ \
+static ssize_t __name ## _write(struct file *file, \
+ const char __user *user_buf, \
+ size_t count, loff_t *ppos) \
+{ \
+ struct hci_dev *hdev = file->private_data; \
+ bool enable; \
+ int err; \
+ \
+ if (test_bit(HCI_UP, &hdev->flags)) \
+ return -EBUSY; \
+ \
+ err = kstrtobool_from_user(user_buf, count, &enable); \
+ if (err) \
+ return err; \
+ \
+ if (enable == test_bit(__quirk, hdev->quirk_flags)) \
+ return -EALREADY; \
+ \
+ change_bit(__quirk, hdev->quirk_flags); \
+ \
+ return count; \
+} \
+ \
+static const struct file_operations __name ## _fops = { \
+ .open = simple_open, \
+ .read = __name ## _read, \
+ .write = __name ## _write, \
+ .llseek = default_llseek, \
+} \
+
+#define DEFINE_INFO_ATTRIBUTE(__name, __field) \
+static int __name ## _show(struct seq_file *f, void *ptr) \
+{ \
+ struct hci_dev *hdev = f->private; \
+ \
+ hci_dev_lock(hdev); \
+ seq_printf(f, "%s\n", hdev->__field ? : ""); \
+ hci_dev_unlock(hdev); \
+ \
+ return 0; \
+} \
+ \
+DEFINE_SHOW_ATTRIBUTE(__name)
+
+static int features_show(struct seq_file *f, void *ptr)
+{
+ struct hci_dev *hdev = f->private;
+ u8 p;
+
+ hci_dev_lock(hdev);
+ for (p = 0; p < HCI_MAX_PAGES && p <= hdev->max_page; p++)
+ seq_printf(f, "%2u: %8ph\n", p, hdev->features[p]);
+ if (lmp_le_capable(hdev))
+ seq_printf(f, "LE: %8ph\n", hdev->le_features);
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(features);
+
+static int device_id_show(struct seq_file *f, void *ptr)
+{
+ struct hci_dev *hdev = f->private;
+
+ hci_dev_lock(hdev);
+ seq_printf(f, "%4.4x:%4.4x:%4.4x:%4.4x\n", hdev->devid_source,
+ hdev->devid_vendor, hdev->devid_product, hdev->devid_version);
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(device_id);
+
+static int device_list_show(struct seq_file *f, void *ptr)
+{
+ struct hci_dev *hdev = f->private;
+ struct hci_conn_params *p;
+ struct bdaddr_list *b;
+
+ hci_dev_lock(hdev);
+ list_for_each_entry(b, &hdev->accept_list, list)
+ seq_printf(f, "%pMR (type %u)\n", &b->bdaddr, b->bdaddr_type);
+ list_for_each_entry(p, &hdev->le_conn_params, list) {
+ seq_printf(f, "%pMR (type %u) %u\n", &p->addr, p->addr_type,
+ p->auto_connect);
+ }
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(device_list);
+
+static int blacklist_show(struct seq_file *f, void *p)
+{
+ struct hci_dev *hdev = f->private;
+ struct bdaddr_list *b;
+
+ hci_dev_lock(hdev);
+ list_for_each_entry(b, &hdev->reject_list, list)
+ seq_printf(f, "%pMR (type %u)\n", &b->bdaddr, b->bdaddr_type);
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(blacklist);
+
+static int blocked_keys_show(struct seq_file *f, void *p)
+{
+ struct hci_dev *hdev = f->private;
+ struct blocked_key *key;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(key, &hdev->blocked_keys, list)
+ seq_printf(f, "%u %*phN\n", key->type, 16, key->val);
+ rcu_read_unlock();
+
+ return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(blocked_keys);
+
+static int uuids_show(struct seq_file *f, void *p)
+{
+ struct hci_dev *hdev = f->private;
+ struct bt_uuid *uuid;
+
+ hci_dev_lock(hdev);
+ list_for_each_entry(uuid, &hdev->uuids, list) {
+ u8 i, val[16];
+
+ /* The Bluetooth UUID values are stored in big endian,
+ * but with reversed byte order. So convert them into
+ * the right order for the %pUb modifier.
+ */
+ for (i = 0; i < 16; i++)
+ val[i] = uuid->uuid[15 - i];
+
+ seq_printf(f, "%pUb\n", val);
+ }
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(uuids);
+
+static int remote_oob_show(struct seq_file *f, void *ptr)
+{
+ struct hci_dev *hdev = f->private;
+ struct oob_data *data;
+
+ hci_dev_lock(hdev);
+ list_for_each_entry(data, &hdev->remote_oob_data, list) {
+ seq_printf(f, "%pMR (type %u) %u %*phN %*phN %*phN %*phN\n",
+ &data->bdaddr, data->bdaddr_type, data->present,
+ 16, data->hash192, 16, data->rand192,
+ 16, data->hash256, 16, data->rand256);
+ }
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(remote_oob);
+
+static int conn_info_min_age_set(void *data, u64 val)
+{
+ struct hci_dev *hdev = data;
+
+ hci_dev_lock(hdev);
+ if (val == 0 || val > hdev->conn_info_max_age) {
+ hci_dev_unlock(hdev);
+ return -EINVAL;
+ }
+
+ hdev->conn_info_min_age = val;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+static int conn_info_min_age_get(void *data, u64 *val)
+{
+ struct hci_dev *hdev = data;
+
+ hci_dev_lock(hdev);
+ *val = hdev->conn_info_min_age;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(conn_info_min_age_fops, conn_info_min_age_get,
+ conn_info_min_age_set, "%llu\n");
+
+static int conn_info_max_age_set(void *data, u64 val)
+{
+ struct hci_dev *hdev = data;
+
+ hci_dev_lock(hdev);
+ if (val == 0 || val < hdev->conn_info_min_age) {
+ hci_dev_unlock(hdev);
+ return -EINVAL;
+ }
+
+ hdev->conn_info_max_age = val;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+static int conn_info_max_age_get(void *data, u64 *val)
+{
+ struct hci_dev *hdev = data;
+
+ hci_dev_lock(hdev);
+ *val = hdev->conn_info_max_age;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(conn_info_max_age_fops, conn_info_max_age_get,
+ conn_info_max_age_set, "%llu\n");
+
+static ssize_t use_debug_keys_read(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct hci_dev *hdev = file->private_data;
+ char buf[3];
+
+ buf[0] = hci_dev_test_flag(hdev, HCI_USE_DEBUG_KEYS) ? 'Y' : 'N';
+ buf[1] = '\n';
+ buf[2] = '\0';
+ return simple_read_from_buffer(user_buf, count, ppos, buf, 2);
+}
+
+static const struct file_operations use_debug_keys_fops = {
+ .open = simple_open,
+ .read = use_debug_keys_read,
+ .llseek = default_llseek,
+};
+
+static ssize_t sc_only_mode_read(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct hci_dev *hdev = file->private_data;
+ char buf[3];
+
+ buf[0] = hci_dev_test_flag(hdev, HCI_SC_ONLY) ? 'Y' : 'N';
+ buf[1] = '\n';
+ buf[2] = '\0';
+ return simple_read_from_buffer(user_buf, count, ppos, buf, 2);
+}
+
+static const struct file_operations sc_only_mode_fops = {
+ .open = simple_open,
+ .read = sc_only_mode_read,
+ .llseek = default_llseek,
+};
+
+DEFINE_INFO_ATTRIBUTE(hardware_info, hw_info);
+DEFINE_INFO_ATTRIBUTE(firmware_info, fw_info);
+
+void hci_debugfs_create_common(struct hci_dev *hdev)
+{
+ debugfs_create_file("features", 0444, hdev->debugfs, hdev,
+ &features_fops);
+ debugfs_create_u16("manufacturer", 0444, hdev->debugfs,
+ &hdev->manufacturer);
+ debugfs_create_u8("hci_version", 0444, hdev->debugfs, &hdev->hci_ver);
+ debugfs_create_u16("hci_revision", 0444, hdev->debugfs, &hdev->hci_rev);
+ debugfs_create_u8("hardware_error", 0444, hdev->debugfs,
+ &hdev->hw_error_code);
+ debugfs_create_file("device_id", 0444, hdev->debugfs, hdev,
+ &device_id_fops);
+
+ debugfs_create_file("device_list", 0444, hdev->debugfs, hdev,
+ &device_list_fops);
+ debugfs_create_file("blacklist", 0444, hdev->debugfs, hdev,
+ &blacklist_fops);
+ debugfs_create_file("blocked_keys", 0444, hdev->debugfs, hdev,
+ &blocked_keys_fops);
+ debugfs_create_file("uuids", 0444, hdev->debugfs, hdev, &uuids_fops);
+ debugfs_create_file("remote_oob", 0400, hdev->debugfs, hdev,
+ &remote_oob_fops);
+
+ debugfs_create_file("conn_info_min_age", 0644, hdev->debugfs, hdev,
+ &conn_info_min_age_fops);
+ debugfs_create_file("conn_info_max_age", 0644, hdev->debugfs, hdev,
+ &conn_info_max_age_fops);
+
+ if (lmp_ssp_capable(hdev) || lmp_le_capable(hdev))
+ debugfs_create_file("use_debug_keys", 0444, hdev->debugfs,
+ hdev, &use_debug_keys_fops);
+
+ if (lmp_sc_capable(hdev) || lmp_le_capable(hdev))
+ debugfs_create_file("sc_only_mode", 0444, hdev->debugfs,
+ hdev, &sc_only_mode_fops);
+
+ if (hdev->hw_info)
+ debugfs_create_file("hardware_info", 0444, hdev->debugfs,
+ hdev, &hardware_info_fops);
+
+ if (hdev->fw_info)
+ debugfs_create_file("firmware_info", 0444, hdev->debugfs,
+ hdev, &firmware_info_fops);
+}
+
+static int inquiry_cache_show(struct seq_file *f, void *p)
+{
+ struct hci_dev *hdev = f->private;
+ struct discovery_state *cache = &hdev->discovery;
+ struct inquiry_entry *e;
+
+ hci_dev_lock(hdev);
+
+ list_for_each_entry(e, &cache->all, all) {
+ struct inquiry_data *data = &e->data;
+ seq_printf(f, "%pMR %d %d %d 0x%.2x%.2x%.2x 0x%.4x %d %d %u\n",
+ &data->bdaddr,
+ data->pscan_rep_mode, data->pscan_period_mode,
+ data->pscan_mode, data->dev_class[2],
+ data->dev_class[1], data->dev_class[0],
+ __le16_to_cpu(data->clock_offset),
+ data->rssi, data->ssp_mode, e->timestamp);
+ }
+
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(inquiry_cache);
+
+static int link_keys_show(struct seq_file *f, void *ptr)
+{
+ struct hci_dev *hdev = f->private;
+ struct link_key *key;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(key, &hdev->link_keys, list)
+ seq_printf(f, "%pMR %u %*phN %u\n", &key->bdaddr, key->type,
+ HCI_LINK_KEY_SIZE, key->val, key->pin_len);
+ rcu_read_unlock();
+
+ return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(link_keys);
+
+static int dev_class_show(struct seq_file *f, void *ptr)
+{
+ struct hci_dev *hdev = f->private;
+
+ hci_dev_lock(hdev);
+ seq_printf(f, "0x%.2x%.2x%.2x\n", hdev->dev_class[2],
+ hdev->dev_class[1], hdev->dev_class[0]);
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(dev_class);
+
+static int voice_setting_get(void *data, u64 *val)
+{
+ struct hci_dev *hdev = data;
+
+ hci_dev_lock(hdev);
+ *val = hdev->voice_setting;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(voice_setting_fops, voice_setting_get,
+ NULL, "0x%4.4llx\n");
+
+static ssize_t ssp_debug_mode_read(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct hci_dev *hdev = file->private_data;
+ char buf[3];
+
+ buf[0] = hdev->ssp_debug_mode ? 'Y' : 'N';
+ buf[1] = '\n';
+ buf[2] = '\0';
+ return simple_read_from_buffer(user_buf, count, ppos, buf, 2);
+}
+
+static const struct file_operations ssp_debug_mode_fops = {
+ .open = simple_open,
+ .read = ssp_debug_mode_read,
+ .llseek = default_llseek,
+};
+
+static int auto_accept_delay_set(void *data, u64 val)
+{
+ struct hci_dev *hdev = data;
+
+ hci_dev_lock(hdev);
+ hdev->auto_accept_delay = val;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+static int min_encrypt_key_size_set(void *data, u64 val)
+{
+ struct hci_dev *hdev = data;
+
+ if (val < 1 || val > 16)
+ return -EINVAL;
+
+ hci_dev_lock(hdev);
+ hdev->min_enc_key_size = val;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+static int min_encrypt_key_size_get(void *data, u64 *val)
+{
+ struct hci_dev *hdev = data;
+
+ hci_dev_lock(hdev);
+ *val = hdev->min_enc_key_size;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(min_encrypt_key_size_fops,
+ min_encrypt_key_size_get,
+ min_encrypt_key_size_set, "%llu\n");
+
+static int auto_accept_delay_get(void *data, u64 *val)
+{
+ struct hci_dev *hdev = data;
+
+ hci_dev_lock(hdev);
+ *val = hdev->auto_accept_delay;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(auto_accept_delay_fops, auto_accept_delay_get,
+ auto_accept_delay_set, "%llu\n");
+
+static ssize_t force_bredr_smp_read(struct file *file,
+ char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct hci_dev *hdev = file->private_data;
+ char buf[3];
+
+ buf[0] = hci_dev_test_flag(hdev, HCI_FORCE_BREDR_SMP) ? 'Y' : 'N';
+ buf[1] = '\n';
+ buf[2] = '\0';
+ return simple_read_from_buffer(user_buf, count, ppos, buf, 2);
+}
+
+static ssize_t force_bredr_smp_write(struct file *file,
+ const char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct hci_dev *hdev = file->private_data;
+ bool enable;
+ int err;
+
+ err = kstrtobool_from_user(user_buf, count, &enable);
+ if (err)
+ return err;
+
+ err = smp_force_bredr(hdev, enable);
+ if (err)
+ return err;
+
+ return count;
+}
+
+static const struct file_operations force_bredr_smp_fops = {
+ .open = simple_open,
+ .read = force_bredr_smp_read,
+ .write = force_bredr_smp_write,
+ .llseek = default_llseek,
+};
+
+static int idle_timeout_set(void *data, u64 val)
+{
+ struct hci_dev *hdev = data;
+
+ if (val != 0 && (val < 500 || val > 3600000))
+ return -EINVAL;
+
+ hci_dev_lock(hdev);
+ hdev->idle_timeout = val;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+static int idle_timeout_get(void *data, u64 *val)
+{
+ struct hci_dev *hdev = data;
+
+ hci_dev_lock(hdev);
+ *val = hdev->idle_timeout;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(idle_timeout_fops, idle_timeout_get,
+ idle_timeout_set, "%llu\n");
+
+static int sniff_min_interval_set(void *data, u64 val)
+{
+ struct hci_dev *hdev = data;
+
+ hci_dev_lock(hdev);
+ if (val == 0 || val % 2 || val > hdev->sniff_max_interval) {
+ hci_dev_unlock(hdev);
+ return -EINVAL;
+ }
+
+ hdev->sniff_min_interval = val;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+static int sniff_min_interval_get(void *data, u64 *val)
+{
+ struct hci_dev *hdev = data;
+
+ hci_dev_lock(hdev);
+ *val = hdev->sniff_min_interval;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(sniff_min_interval_fops, sniff_min_interval_get,
+ sniff_min_interval_set, "%llu\n");
+
+static int sniff_max_interval_set(void *data, u64 val)
+{
+ struct hci_dev *hdev = data;
+
+ hci_dev_lock(hdev);
+ if (val == 0 || val % 2 || val < hdev->sniff_min_interval) {
+ hci_dev_unlock(hdev);
+ return -EINVAL;
+ }
+
+ hdev->sniff_max_interval = val;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+static int sniff_max_interval_get(void *data, u64 *val)
+{
+ struct hci_dev *hdev = data;
+
+ hci_dev_lock(hdev);
+ *val = hdev->sniff_max_interval;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(sniff_max_interval_fops, sniff_max_interval_get,
+ sniff_max_interval_set, "%llu\n");
+
+void hci_debugfs_create_bredr(struct hci_dev *hdev)
+{
+ debugfs_create_file("inquiry_cache", 0444, hdev->debugfs, hdev,
+ &inquiry_cache_fops);
+ debugfs_create_file("link_keys", 0400, hdev->debugfs, hdev,
+ &link_keys_fops);
+ debugfs_create_file("dev_class", 0444, hdev->debugfs, hdev,
+ &dev_class_fops);
+ debugfs_create_file("voice_setting", 0444, hdev->debugfs, hdev,
+ &voice_setting_fops);
+
+ /* If the controller does not support BR/EDR Secure Connections
+ * feature, then the BR/EDR SMP channel shall not be present.
+ *
+ * To test this with Bluetooth 4.0 controllers, create a debugfs
+ * switch that allows forcing BR/EDR SMP support and accepting
+ * cross-transport pairing on non-AES encrypted connections.
+ */
+ if (!lmp_sc_capable(hdev))
+ debugfs_create_file("force_bredr_smp", 0644, hdev->debugfs,
+ hdev, &force_bredr_smp_fops);
+
+ if (lmp_ssp_capable(hdev)) {
+ debugfs_create_file("ssp_debug_mode", 0444, hdev->debugfs,
+ hdev, &ssp_debug_mode_fops);
+ debugfs_create_file("min_encrypt_key_size", 0644, hdev->debugfs,
+ hdev, &min_encrypt_key_size_fops);
+ debugfs_create_file("auto_accept_delay", 0644, hdev->debugfs,
+ hdev, &auto_accept_delay_fops);
+ }
+
+ if (lmp_sniff_capable(hdev)) {
+ debugfs_create_file("idle_timeout", 0644, hdev->debugfs,
+ hdev, &idle_timeout_fops);
+ debugfs_create_file("sniff_min_interval", 0644, hdev->debugfs,
+ hdev, &sniff_min_interval_fops);
+ debugfs_create_file("sniff_max_interval", 0644, hdev->debugfs,
+ hdev, &sniff_max_interval_fops);
+ }
+}
+
+static int identity_show(struct seq_file *f, void *p)
+{
+ struct hci_dev *hdev = f->private;
+ bdaddr_t addr;
+ u8 addr_type;
+
+ hci_dev_lock(hdev);
+
+ hci_copy_identity_address(hdev, &addr, &addr_type);
+
+ seq_printf(f, "%pMR (type %u) %*phN %pMR\n", &addr, addr_type,
+ 16, hdev->irk, &hdev->rpa);
+
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(identity);
+
+static int rpa_timeout_set(void *data, u64 val)
+{
+ struct hci_dev *hdev = data;
+
+ /* Require the RPA timeout to be at least 30 seconds and at most
+ * 24 hours.
+ */
+ if (val < 30 || val > (60 * 60 * 24))
+ return -EINVAL;
+
+ hci_dev_lock(hdev);
+ hdev->rpa_timeout = val;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+static int rpa_timeout_get(void *data, u64 *val)
+{
+ struct hci_dev *hdev = data;
+
+ hci_dev_lock(hdev);
+ *val = hdev->rpa_timeout;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(rpa_timeout_fops, rpa_timeout_get,
+ rpa_timeout_set, "%llu\n");
+
+static int random_address_show(struct seq_file *f, void *p)
+{
+ struct hci_dev *hdev = f->private;
+
+ hci_dev_lock(hdev);
+ seq_printf(f, "%pMR\n", &hdev->random_addr);
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(random_address);
+
+static int static_address_show(struct seq_file *f, void *p)
+{
+ struct hci_dev *hdev = f->private;
+
+ hci_dev_lock(hdev);
+ seq_printf(f, "%pMR\n", &hdev->static_addr);
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(static_address);
+
+static ssize_t force_static_address_read(struct file *file,
+ char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct hci_dev *hdev = file->private_data;
+ char buf[3];
+
+ buf[0] = hci_dev_test_flag(hdev, HCI_FORCE_STATIC_ADDR) ? 'Y' : 'N';
+ buf[1] = '\n';
+ buf[2] = '\0';
+ return simple_read_from_buffer(user_buf, count, ppos, buf, 2);
+}
+
+static ssize_t force_static_address_write(struct file *file,
+ const char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct hci_dev *hdev = file->private_data;
+ bool enable;
+ int err;
+
+ if (hdev_is_powered(hdev))
+ return -EBUSY;
+
+ err = kstrtobool_from_user(user_buf, count, &enable);
+ if (err)
+ return err;
+
+ if (enable == hci_dev_test_flag(hdev, HCI_FORCE_STATIC_ADDR))
+ return -EALREADY;
+
+ hci_dev_change_flag(hdev, HCI_FORCE_STATIC_ADDR);
+
+ return count;
+}
+
+static const struct file_operations force_static_address_fops = {
+ .open = simple_open,
+ .read = force_static_address_read,
+ .write = force_static_address_write,
+ .llseek = default_llseek,
+};
+
+static int white_list_show(struct seq_file *f, void *ptr)
+{
+ struct hci_dev *hdev = f->private;
+ struct bdaddr_list *b;
+
+ hci_dev_lock(hdev);
+ list_for_each_entry(b, &hdev->le_accept_list, list)
+ seq_printf(f, "%pMR (type %u)\n", &b->bdaddr, b->bdaddr_type);
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(white_list);
+
+static int resolv_list_show(struct seq_file *f, void *ptr)
+{
+ struct hci_dev *hdev = f->private;
+ struct bdaddr_list *b;
+
+ hci_dev_lock(hdev);
+ list_for_each_entry(b, &hdev->le_resolv_list, list)
+ seq_printf(f, "%pMR (type %u)\n", &b->bdaddr, b->bdaddr_type);
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(resolv_list);
+
+static int identity_resolving_keys_show(struct seq_file *f, void *ptr)
+{
+ struct hci_dev *hdev = f->private;
+ struct smp_irk *irk;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(irk, &hdev->identity_resolving_keys, list) {
+ seq_printf(f, "%pMR (type %u) %*phN %pMR\n",
+ &irk->bdaddr, irk->addr_type,
+ 16, irk->val, &irk->rpa);
+ }
+ rcu_read_unlock();
+
+ return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(identity_resolving_keys);
+
+static int long_term_keys_show(struct seq_file *f, void *ptr)
+{
+ struct hci_dev *hdev = f->private;
+ struct smp_ltk *ltk;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(ltk, &hdev->long_term_keys, list)
+ seq_printf(f, "%pMR (type %u) %u 0x%02x %u %.4x %.16llx %*phN\n",
+ &ltk->bdaddr, ltk->bdaddr_type, ltk->authenticated,
+ ltk->type, ltk->enc_size, __le16_to_cpu(ltk->ediv),
+ __le64_to_cpu(ltk->rand), 16, ltk->val);
+ rcu_read_unlock();
+
+ return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(long_term_keys);
+
+static int conn_min_interval_set(void *data, u64 val)
+{
+ struct hci_dev *hdev = data;
+
+ hci_dev_lock(hdev);
+ if (val < 0x0006 || val > 0x0c80 || val > hdev->le_conn_max_interval) {
+ hci_dev_unlock(hdev);
+ return -EINVAL;
+ }
+
+ hdev->le_conn_min_interval = val;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+static int conn_min_interval_get(void *data, u64 *val)
+{
+ struct hci_dev *hdev = data;
+
+ hci_dev_lock(hdev);
+ *val = hdev->le_conn_min_interval;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(conn_min_interval_fops, conn_min_interval_get,
+ conn_min_interval_set, "%llu\n");
+
+static int conn_max_interval_set(void *data, u64 val)
+{
+ struct hci_dev *hdev = data;
+
+ hci_dev_lock(hdev);
+ if (val < 0x0006 || val > 0x0c80 || val < hdev->le_conn_min_interval) {
+ hci_dev_unlock(hdev);
+ return -EINVAL;
+ }
+
+ hdev->le_conn_max_interval = val;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+static int conn_max_interval_get(void *data, u64 *val)
+{
+ struct hci_dev *hdev = data;
+
+ hci_dev_lock(hdev);
+ *val = hdev->le_conn_max_interval;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(conn_max_interval_fops, conn_max_interval_get,
+ conn_max_interval_set, "%llu\n");
+
+static int conn_latency_set(void *data, u64 val)
+{
+ struct hci_dev *hdev = data;
+
+ if (val > 0x01f3)
+ return -EINVAL;
+
+ hci_dev_lock(hdev);
+ hdev->le_conn_latency = val;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+static int conn_latency_get(void *data, u64 *val)
+{
+ struct hci_dev *hdev = data;
+
+ hci_dev_lock(hdev);
+ *val = hdev->le_conn_latency;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(conn_latency_fops, conn_latency_get,
+ conn_latency_set, "%llu\n");
+
+static int supervision_timeout_set(void *data, u64 val)
+{
+ struct hci_dev *hdev = data;
+
+ if (val < 0x000a || val > 0x0c80)
+ return -EINVAL;
+
+ hci_dev_lock(hdev);
+ hdev->le_supv_timeout = val;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+static int supervision_timeout_get(void *data, u64 *val)
+{
+ struct hci_dev *hdev = data;
+
+ hci_dev_lock(hdev);
+ *val = hdev->le_supv_timeout;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(supervision_timeout_fops, supervision_timeout_get,
+ supervision_timeout_set, "%llu\n");
+
+static int adv_channel_map_set(void *data, u64 val)
+{
+ struct hci_dev *hdev = data;
+
+ if (val < 0x01 || val > 0x07)
+ return -EINVAL;
+
+ hci_dev_lock(hdev);
+ hdev->le_adv_channel_map = val;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+static int adv_channel_map_get(void *data, u64 *val)
+{
+ struct hci_dev *hdev = data;
+
+ hci_dev_lock(hdev);
+ *val = hdev->le_adv_channel_map;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(adv_channel_map_fops, adv_channel_map_get,
+ adv_channel_map_set, "%llu\n");
+
+static int adv_min_interval_set(void *data, u64 val)
+{
+ struct hci_dev *hdev = data;
+
+ hci_dev_lock(hdev);
+ if (val < 0x0020 || val > 0x4000 || val > hdev->le_adv_max_interval) {
+ hci_dev_unlock(hdev);
+ return -EINVAL;
+ }
+
+ hdev->le_adv_min_interval = val;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+static int adv_min_interval_get(void *data, u64 *val)
+{
+ struct hci_dev *hdev = data;
+
+ hci_dev_lock(hdev);
+ *val = hdev->le_adv_min_interval;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(adv_min_interval_fops, adv_min_interval_get,
+ adv_min_interval_set, "%llu\n");
+
+static int adv_max_interval_set(void *data, u64 val)
+{
+ struct hci_dev *hdev = data;
+
+ hci_dev_lock(hdev);
+ if (val < 0x0020 || val > 0x4000 || val < hdev->le_adv_min_interval) {
+ hci_dev_unlock(hdev);
+ return -EINVAL;
+ }
+
+ hdev->le_adv_max_interval = val;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+static int adv_max_interval_get(void *data, u64 *val)
+{
+ struct hci_dev *hdev = data;
+
+ hci_dev_lock(hdev);
+ *val = hdev->le_adv_max_interval;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(adv_max_interval_fops, adv_max_interval_get,
+ adv_max_interval_set, "%llu\n");
+
+static int min_key_size_set(void *data, u64 val)
+{
+ struct hci_dev *hdev = data;
+
+ hci_dev_lock(hdev);
+ if (val > hdev->le_max_key_size || val < SMP_MIN_ENC_KEY_SIZE) {
+ hci_dev_unlock(hdev);
+ return -EINVAL;
+ }
+
+ hdev->le_min_key_size = val;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+static int min_key_size_get(void *data, u64 *val)
+{
+ struct hci_dev *hdev = data;
+
+ hci_dev_lock(hdev);
+ *val = hdev->le_min_key_size;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(min_key_size_fops, min_key_size_get,
+ min_key_size_set, "%llu\n");
+
+static int max_key_size_set(void *data, u64 val)
+{
+ struct hci_dev *hdev = data;
+
+ hci_dev_lock(hdev);
+ if (val > SMP_MAX_ENC_KEY_SIZE || val < hdev->le_min_key_size) {
+ hci_dev_unlock(hdev);
+ return -EINVAL;
+ }
+
+ hdev->le_max_key_size = val;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+static int max_key_size_get(void *data, u64 *val)
+{
+ struct hci_dev *hdev = data;
+
+ hci_dev_lock(hdev);
+ *val = hdev->le_max_key_size;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(max_key_size_fops, max_key_size_get,
+ max_key_size_set, "%llu\n");
+
+static int auth_payload_timeout_set(void *data, u64 val)
+{
+ struct hci_dev *hdev = data;
+
+ if (val < 0x0001 || val > 0xffff)
+ return -EINVAL;
+
+ hci_dev_lock(hdev);
+ hdev->auth_payload_timeout = val;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+static int auth_payload_timeout_get(void *data, u64 *val)
+{
+ struct hci_dev *hdev = data;
+
+ hci_dev_lock(hdev);
+ *val = hdev->auth_payload_timeout;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(auth_payload_timeout_fops,
+ auth_payload_timeout_get,
+ auth_payload_timeout_set, "%llu\n");
+
+static ssize_t force_no_mitm_read(struct file *file,
+ char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct hci_dev *hdev = file->private_data;
+ char buf[3];
+
+ buf[0] = hci_dev_test_flag(hdev, HCI_FORCE_NO_MITM) ? 'Y' : 'N';
+ buf[1] = '\n';
+ buf[2] = '\0';
+ return simple_read_from_buffer(user_buf, count, ppos, buf, 2);
+}
+
+static ssize_t force_no_mitm_write(struct file *file,
+ const char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct hci_dev *hdev = file->private_data;
+ char buf[32];
+ size_t buf_size = min(count, (sizeof(buf) - 1));
+ bool enable;
+
+ if (copy_from_user(buf, user_buf, buf_size))
+ return -EFAULT;
+
+ buf[buf_size] = '\0';
+ if (kstrtobool(buf, &enable))
+ return -EINVAL;
+
+ if (enable == hci_dev_test_flag(hdev, HCI_FORCE_NO_MITM))
+ return -EALREADY;
+
+ hci_dev_change_flag(hdev, HCI_FORCE_NO_MITM);
+
+ return count;
+}
+
+static const struct file_operations force_no_mitm_fops = {
+ .open = simple_open,
+ .read = force_no_mitm_read,
+ .write = force_no_mitm_write,
+ .llseek = default_llseek,
+};
+
+DEFINE_QUIRK_ATTRIBUTE(quirk_strict_duplicate_filter,
+ HCI_QUIRK_STRICT_DUPLICATE_FILTER);
+DEFINE_QUIRK_ATTRIBUTE(quirk_simultaneous_discovery,
+ HCI_QUIRK_SIMULTANEOUS_DISCOVERY);
+
+void hci_debugfs_create_le(struct hci_dev *hdev)
+{
+ debugfs_create_file("identity", 0400, hdev->debugfs, hdev,
+ &identity_fops);
+ debugfs_create_file("rpa_timeout", 0644, hdev->debugfs, hdev,
+ &rpa_timeout_fops);
+ debugfs_create_file("random_address", 0444, hdev->debugfs, hdev,
+ &random_address_fops);
+ debugfs_create_file("static_address", 0444, hdev->debugfs, hdev,
+ &static_address_fops);
+
+ /* For controllers with a public address, provide a debug
+ * option to force the usage of the configured static
+ * address. By default the public address is used.
+ */
+ if (bacmp(&hdev->bdaddr, BDADDR_ANY))
+ debugfs_create_file("force_static_address", 0644,
+ hdev->debugfs, hdev,
+ &force_static_address_fops);
+
+ debugfs_create_u8("white_list_size", 0444, hdev->debugfs,
+ &hdev->le_accept_list_size);
+ debugfs_create_file("white_list", 0444, hdev->debugfs, hdev,
+ &white_list_fops);
+ debugfs_create_u8("resolv_list_size", 0444, hdev->debugfs,
+ &hdev->le_resolv_list_size);
+ debugfs_create_file("resolv_list", 0444, hdev->debugfs, hdev,
+ &resolv_list_fops);
+ debugfs_create_file("identity_resolving_keys", 0400, hdev->debugfs,
+ hdev, &identity_resolving_keys_fops);
+ debugfs_create_file("long_term_keys", 0400, hdev->debugfs, hdev,
+ &long_term_keys_fops);
+ debugfs_create_file("conn_min_interval", 0644, hdev->debugfs, hdev,
+ &conn_min_interval_fops);
+ debugfs_create_file("conn_max_interval", 0644, hdev->debugfs, hdev,
+ &conn_max_interval_fops);
+ debugfs_create_file("conn_latency", 0644, hdev->debugfs, hdev,
+ &conn_latency_fops);
+ debugfs_create_file("supervision_timeout", 0644, hdev->debugfs, hdev,
+ &supervision_timeout_fops);
+ debugfs_create_file("adv_channel_map", 0644, hdev->debugfs, hdev,
+ &adv_channel_map_fops);
+ debugfs_create_file("adv_min_interval", 0644, hdev->debugfs, hdev,
+ &adv_min_interval_fops);
+ debugfs_create_file("adv_max_interval", 0644, hdev->debugfs, hdev,
+ &adv_max_interval_fops);
+ debugfs_create_u16("discov_interleaved_timeout", 0644, hdev->debugfs,
+ &hdev->discov_interleaved_timeout);
+ debugfs_create_file("min_key_size", 0644, hdev->debugfs, hdev,
+ &min_key_size_fops);
+ debugfs_create_file("max_key_size", 0644, hdev->debugfs, hdev,
+ &max_key_size_fops);
+ debugfs_create_file("auth_payload_timeout", 0644, hdev->debugfs, hdev,
+ &auth_payload_timeout_fops);
+ debugfs_create_file("force_no_mitm", 0644, hdev->debugfs, hdev,
+ &force_no_mitm_fops);
+
+ debugfs_create_file("quirk_strict_duplicate_filter", 0644,
+ hdev->debugfs, hdev,
+ &quirk_strict_duplicate_filter_fops);
+ debugfs_create_file("quirk_simultaneous_discovery", 0644,
+ hdev->debugfs, hdev,
+ &quirk_simultaneous_discovery_fops);
+}
+
+void hci_debugfs_create_conn(struct hci_conn *conn)
+{
+ struct hci_dev *hdev = conn->hdev;
+ char name[6];
+
+ if (IS_ERR_OR_NULL(hdev->debugfs) || conn->debugfs)
+ return;
+
+ snprintf(name, sizeof(name), "%u", conn->handle);
+ conn->debugfs = debugfs_create_dir(name, hdev->debugfs);
+}
+
+static ssize_t dut_mode_read(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct hci_dev *hdev = file->private_data;
+ char buf[3];
+
+ buf[0] = hci_dev_test_flag(hdev, HCI_DUT_MODE) ? 'Y' : 'N';
+ buf[1] = '\n';
+ buf[2] = '\0';
+ return simple_read_from_buffer(user_buf, count, ppos, buf, 2);
+}
+
+static ssize_t dut_mode_write(struct file *file, const char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct hci_dev *hdev = file->private_data;
+ struct sk_buff *skb;
+ bool enable;
+ int err;
+
+ if (!test_bit(HCI_UP, &hdev->flags))
+ return -ENETDOWN;
+
+ err = kstrtobool_from_user(user_buf, count, &enable);
+ if (err)
+ return err;
+
+ if (enable == hci_dev_test_flag(hdev, HCI_DUT_MODE))
+ return -EALREADY;
+
+ hci_req_sync_lock(hdev);
+ if (enable)
+ skb = __hci_cmd_sync(hdev, HCI_OP_ENABLE_DUT_MODE, 0, NULL,
+ HCI_CMD_TIMEOUT);
+ else
+ skb = __hci_cmd_sync(hdev, HCI_OP_RESET, 0, NULL,
+ HCI_CMD_TIMEOUT);
+ hci_req_sync_unlock(hdev);
+
+ if (IS_ERR(skb))
+ return PTR_ERR(skb);
+
+ kfree_skb(skb);
+
+ hci_dev_change_flag(hdev, HCI_DUT_MODE);
+
+ return count;
+}
+
+static const struct file_operations dut_mode_fops = {
+ .open = simple_open,
+ .read = dut_mode_read,
+ .write = dut_mode_write,
+ .llseek = default_llseek,
+};
+
+static ssize_t vendor_diag_read(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct hci_dev *hdev = file->private_data;
+ char buf[3];
+
+ buf[0] = hci_dev_test_flag(hdev, HCI_VENDOR_DIAG) ? 'Y' : 'N';
+ buf[1] = '\n';
+ buf[2] = '\0';
+ return simple_read_from_buffer(user_buf, count, ppos, buf, 2);
+}
+
+static ssize_t vendor_diag_write(struct file *file, const char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct hci_dev *hdev = file->private_data;
+ bool enable;
+ int err;
+
+ err = kstrtobool_from_user(user_buf, count, &enable);
+ if (err)
+ return err;
+
+ /* When the diagnostic flags are not persistent and the transport
+ * is not active or in user channel operation, then there is no need
+ * for the vendor callback. Instead just store the desired value and
+ * the setting will be programmed when the controller gets powered on.
+ */
+ if (hci_test_quirk(hdev, HCI_QUIRK_NON_PERSISTENT_DIAG) &&
+ (!test_bit(HCI_RUNNING, &hdev->flags) ||
+ hci_dev_test_flag(hdev, HCI_USER_CHANNEL)))
+ goto done;
+
+ hci_req_sync_lock(hdev);
+ err = hdev->set_diag(hdev, enable);
+ hci_req_sync_unlock(hdev);
+
+ if (err < 0)
+ return err;
+
+done:
+ if (enable)
+ hci_dev_set_flag(hdev, HCI_VENDOR_DIAG);
+ else
+ hci_dev_clear_flag(hdev, HCI_VENDOR_DIAG);
+
+ return count;
+}
+
+static const struct file_operations vendor_diag_fops = {
+ .open = simple_open,
+ .read = vendor_diag_read,
+ .write = vendor_diag_write,
+ .llseek = default_llseek,
+};
+
+void hci_debugfs_create_basic(struct hci_dev *hdev)
+{
+ debugfs_create_file("dut_mode", 0644, hdev->debugfs, hdev,
+ &dut_mode_fops);
+
+ if (hdev->set_diag)
+ debugfs_create_file("vendor_diag", 0644, hdev->debugfs, hdev,
+ &vendor_diag_fops);
+}
diff --git a/net/bluetooth/hci_debugfs.h b/net/bluetooth/hci_debugfs.h
new file mode 100644
index 000000000000..9a8a7c93bb12
--- /dev/null
+++ b/net/bluetooth/hci_debugfs.h
@@ -0,0 +1,53 @@
+/*
+ BlueZ - Bluetooth protocol stack for Linux
+ Copyright (C) 2014 Intel Corporation
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License version 2 as
+ published by the Free Software Foundation;
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+ IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+ CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+ ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+ COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+ SOFTWARE IS DISCLAIMED.
+*/
+
+#if IS_ENABLED(CONFIG_BT_DEBUGFS)
+
+void hci_debugfs_create_common(struct hci_dev *hdev);
+void hci_debugfs_create_bredr(struct hci_dev *hdev);
+void hci_debugfs_create_le(struct hci_dev *hdev);
+void hci_debugfs_create_conn(struct hci_conn *conn);
+void hci_debugfs_create_basic(struct hci_dev *hdev);
+
+#else
+
+static inline void hci_debugfs_create_common(struct hci_dev *hdev)
+{
+}
+
+static inline void hci_debugfs_create_bredr(struct hci_dev *hdev)
+{
+}
+
+static inline void hci_debugfs_create_le(struct hci_dev *hdev)
+{
+}
+
+static inline void hci_debugfs_create_conn(struct hci_conn *conn)
+{
+}
+
+static inline void hci_debugfs_create_basic(struct hci_dev *hdev)
+{
+}
+
+#endif
diff --git a/net/bluetooth/hci_drv.c b/net/bluetooth/hci_drv.c
new file mode 100644
index 000000000000..3dd2d8a006b9
--- /dev/null
+++ b/net/bluetooth/hci_drv.c
@@ -0,0 +1,105 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2025 Google Corporation
+ */
+
+#include <linux/skbuff.h>
+#include <linux/types.h>
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci.h>
+#include <net/bluetooth/hci_core.h>
+#include <net/bluetooth/hci_drv.h>
+
+int hci_drv_cmd_status(struct hci_dev *hdev, u16 cmd, u8 status)
+{
+ struct hci_drv_ev_hdr *hdr;
+ struct hci_drv_ev_cmd_status *ev;
+ struct sk_buff *skb;
+
+ skb = bt_skb_alloc(sizeof(*hdr) + sizeof(*ev), GFP_KERNEL);
+ if (!skb)
+ return -ENOMEM;
+
+ hdr = skb_put(skb, sizeof(*hdr));
+ hdr->opcode = __cpu_to_le16(HCI_DRV_EV_CMD_STATUS);
+ hdr->len = __cpu_to_le16(sizeof(*ev));
+
+ ev = skb_put(skb, sizeof(*ev));
+ ev->opcode = __cpu_to_le16(cmd);
+ ev->status = status;
+
+ hci_skb_pkt_type(skb) = HCI_DRV_PKT;
+
+ return hci_recv_frame(hdev, skb);
+}
+EXPORT_SYMBOL(hci_drv_cmd_status);
+
+int hci_drv_cmd_complete(struct hci_dev *hdev, u16 cmd, u8 status, void *rp,
+ size_t rp_len)
+{
+ struct hci_drv_ev_hdr *hdr;
+ struct hci_drv_ev_cmd_complete *ev;
+ struct sk_buff *skb;
+
+ skb = bt_skb_alloc(sizeof(*hdr) + sizeof(*ev) + rp_len, GFP_KERNEL);
+ if (!skb)
+ return -ENOMEM;
+
+ hdr = skb_put(skb, sizeof(*hdr));
+ hdr->opcode = __cpu_to_le16(HCI_DRV_EV_CMD_COMPLETE);
+ hdr->len = __cpu_to_le16(sizeof(*ev) + rp_len);
+
+ ev = skb_put(skb, sizeof(*ev));
+ ev->opcode = __cpu_to_le16(cmd);
+ ev->status = status;
+
+ skb_put_data(skb, rp, rp_len);
+
+ hci_skb_pkt_type(skb) = HCI_DRV_PKT;
+
+ return hci_recv_frame(hdev, skb);
+}
+EXPORT_SYMBOL(hci_drv_cmd_complete);
+
+int hci_drv_process_cmd(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct hci_drv_cmd_hdr *hdr;
+ const struct hci_drv_handler *handler = NULL;
+ u16 opcode, len, ogf, ocf;
+
+ hdr = skb_pull_data(skb, sizeof(*hdr));
+ if (!hdr)
+ return -EILSEQ;
+
+ opcode = __le16_to_cpu(hdr->opcode);
+ len = __le16_to_cpu(hdr->len);
+ if (len != skb->len)
+ return -EILSEQ;
+
+ ogf = hci_opcode_ogf(opcode);
+ ocf = hci_opcode_ocf(opcode);
+
+ if (!hdev->hci_drv)
+ return hci_drv_cmd_status(hdev, opcode,
+ HCI_DRV_STATUS_UNKNOWN_COMMAND);
+
+ if (ogf != HCI_DRV_OGF_DRIVER_SPECIFIC) {
+ if (opcode < hdev->hci_drv->common_handler_count)
+ handler = &hdev->hci_drv->common_handlers[opcode];
+ } else {
+ if (ocf < hdev->hci_drv->specific_handler_count)
+ handler = &hdev->hci_drv->specific_handlers[ocf];
+ }
+
+ if (!handler || !handler->func)
+ return hci_drv_cmd_status(hdev, opcode,
+ HCI_DRV_STATUS_UNKNOWN_COMMAND);
+
+ if (len != handler->data_len)
+ return hci_drv_cmd_status(hdev, opcode,
+ HCI_DRV_STATUS_INVALID_PARAMETERS);
+
+ return handler->func(hdev, skb->data, len);
+}
+EXPORT_SYMBOL(hci_drv_process_cmd);
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index bb94e6da223c..a9868f17ef40 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -1,6 +1,7 @@
-/*
+/*
BlueZ - Bluetooth protocol stack for Linux
- Copyright (C) 2000-2001 Qualcomm Incorporated
+ Copyright (c) 2000-2001, 2010, Code Aurora Forum. All rights reserved.
+ Copyright 2023-2024 NXP
Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com>
@@ -12,406 +13,2242 @@
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
- CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
- WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
- COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+ ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+ COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
SOFTWARE IS DISCLAIMED.
*/
/* Bluetooth HCI event handling. */
-#include <linux/module.h>
-
-#include <linux/types.h>
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/poll.h>
-#include <linux/fcntl.h>
-#include <linux/init.h>
-#include <linux/skbuff.h>
-#include <linux/interrupt.h>
-#include <linux/notifier.h>
-#include <net/sock.h>
-
-#include <asm/system.h>
-#include <asm/uaccess.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
+#include <linux/crypto.h>
+#include <crypto/algapi.h>
#include <net/bluetooth/bluetooth.h>
#include <net/bluetooth/hci_core.h>
+#include <net/bluetooth/mgmt.h>
-#ifndef CONFIG_BT_HCI_CORE_DEBUG
-#undef BT_DBG
-#define BT_DBG(D...)
-#endif
+#include "hci_debugfs.h"
+#include "hci_codec.h"
+#include "smp.h"
+#include "msft.h"
+#include "eir.h"
+
+#define ZERO_KEY "\x00\x00\x00\x00\x00\x00\x00\x00" \
+ "\x00\x00\x00\x00\x00\x00\x00\x00"
/* Handle HCI Event packets */
-/* Command Complete OGF LINK_CTL */
-static void hci_cc_link_ctl(struct hci_dev *hdev, __u16 ocf, struct sk_buff *skb)
+static void *hci_ev_skb_pull(struct hci_dev *hdev, struct sk_buff *skb,
+ u8 ev, size_t len)
{
- __u8 status;
- struct hci_conn *pend;
+ void *data;
- BT_DBG("%s ocf 0x%x", hdev->name, ocf);
+ data = skb_pull_data(skb, len);
+ if (!data)
+ bt_dev_err(hdev, "Malformed Event: 0x%2.2x", ev);
- switch (ocf) {
- case OCF_INQUIRY_CANCEL:
- case OCF_EXIT_PERIODIC_INQ:
- status = *((__u8 *) skb->data);
+ return data;
+}
- if (status) {
- BT_DBG("%s Inquiry cancel error: status 0x%x", hdev->name, status);
- } else {
- clear_bit(HCI_INQUIRY, &hdev->flags);
- hci_req_complete(hdev, status);
- }
+static void *hci_cc_skb_pull(struct hci_dev *hdev, struct sk_buff *skb,
+ u16 op, size_t len)
+{
+ void *data;
- hci_dev_lock(hdev);
+ data = skb_pull_data(skb, len);
+ if (!data)
+ bt_dev_err(hdev, "Malformed Command Complete: 0x%4.4x", op);
- pend = hci_conn_hash_lookup_state(hdev, ACL_LINK, BT_CONNECT2);
- if (pend)
- hci_acl_connect(pend);
+ return data;
+}
- hci_dev_unlock(hdev);
+static void *hci_le_ev_skb_pull(struct hci_dev *hdev, struct sk_buff *skb,
+ u8 ev, size_t len)
+{
+ void *data;
- break;
+ data = skb_pull_data(skb, len);
+ if (!data)
+ bt_dev_err(hdev, "Malformed LE Event: 0x%2.2x", ev);
- default:
- BT_DBG("%s Command complete: ogf LINK_CTL ocf %x", hdev->name, ocf);
- break;
+ return data;
+}
+
+static u8 hci_cc_inquiry_cancel(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_status *rp = data;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ /* It is possible that we receive Inquiry Complete event right
+ * before we receive Inquiry Cancel Command Complete event, in
+ * which case the latter event should have status of Command
+ * Disallowed. This should not be treated as error, since
+ * we actually achieve what Inquiry Cancel wants to achieve,
+ * which is to end the last Inquiry session.
+ */
+ if (rp->status == HCI_ERROR_COMMAND_DISALLOWED && !test_bit(HCI_INQUIRY, &hdev->flags)) {
+ bt_dev_warn(hdev, "Ignoring error of Inquiry Cancel command");
+ rp->status = 0x00;
}
+
+ if (rp->status)
+ return rp->status;
+
+ clear_bit(HCI_INQUIRY, &hdev->flags);
+ smp_mb__after_atomic(); /* wake_up_bit advises about this barrier */
+ wake_up_bit(&hdev->flags, HCI_INQUIRY);
+
+ hci_dev_lock(hdev);
+ /* Set discovery state to stopped if we're not doing LE active
+ * scanning.
+ */
+ if (!hci_dev_test_flag(hdev, HCI_LE_SCAN) ||
+ hdev->le_scan_type != LE_SCAN_ACTIVE)
+ hci_discovery_set_state(hdev, DISCOVERY_STOPPED);
+ hci_dev_unlock(hdev);
+
+ return rp->status;
+}
+
+static u8 hci_cc_periodic_inq(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_status *rp = data;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ if (rp->status)
+ return rp->status;
+
+ hci_dev_set_flag(hdev, HCI_PERIODIC_INQ);
+
+ return rp->status;
+}
+
+static u8 hci_cc_exit_periodic_inq(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_status *rp = data;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ if (rp->status)
+ return rp->status;
+
+ hci_dev_clear_flag(hdev, HCI_PERIODIC_INQ);
+
+ return rp->status;
+}
+
+static u8 hci_cc_remote_name_req_cancel(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_rp_remote_name_req_cancel *rp = data;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ return rp->status;
}
-/* Command Complete OGF LINK_POLICY */
-static void hci_cc_link_policy(struct hci_dev *hdev, __u16 ocf, struct sk_buff *skb)
+static u8 hci_cc_role_discovery(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
+ struct hci_rp_role_discovery *rp = data;
+ struct hci_conn *conn;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ if (rp->status)
+ return rp->status;
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(rp->handle));
+ if (conn)
+ conn->role = rp->role;
+
+ hci_dev_unlock(hdev);
+
+ return rp->status;
+}
+
+static u8 hci_cc_read_link_policy(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_rp_read_link_policy *rp = data;
+ struct hci_conn *conn;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ if (rp->status)
+ return rp->status;
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(rp->handle));
+ if (conn)
+ conn->link_policy = __le16_to_cpu(rp->policy);
+
+ hci_dev_unlock(hdev);
+
+ return rp->status;
+}
+
+static u8 hci_cc_write_link_policy(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_rp_write_link_policy *rp = data;
struct hci_conn *conn;
- struct hci_rp_role_discovery *rd;
- struct hci_rp_write_link_policy *lp;
void *sent;
- BT_DBG("%s ocf 0x%x", hdev->name, ocf);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
- switch (ocf) {
- case OCF_ROLE_DISCOVERY:
- rd = (void *) skb->data;
+ if (rp->status)
+ return rp->status;
- if (rd->status)
- break;
+ sent = hci_sent_cmd_data(hdev, HCI_OP_WRITE_LINK_POLICY);
+ if (!sent)
+ return rp->status;
- hci_dev_lock(hdev);
+ hci_dev_lock(hdev);
- conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(rd->handle));
- if (conn) {
- if (rd->role)
- conn->link_mode &= ~HCI_LM_MASTER;
- else
- conn->link_mode |= HCI_LM_MASTER;
- }
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(rp->handle));
+ if (conn)
+ conn->link_policy = get_unaligned_le16(sent + 2);
- hci_dev_unlock(hdev);
- break;
+ hci_dev_unlock(hdev);
- case OCF_WRITE_LINK_POLICY:
- sent = hci_sent_cmd_data(hdev, OGF_LINK_POLICY, OCF_WRITE_LINK_POLICY);
- if (!sent)
- break;
+ return rp->status;
+}
- lp = (struct hci_rp_write_link_policy *) skb->data;
+static u8 hci_cc_read_def_link_policy(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_rp_read_def_link_policy *rp = data;
- if (lp->status)
- break;
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
- hci_dev_lock(hdev);
+ if (rp->status)
+ return rp->status;
- conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(lp->handle));
- if (conn) {
- __le16 policy = get_unaligned((__le16 *) (sent + 2));
- conn->link_policy = __le16_to_cpu(policy);
- }
+ hdev->link_policy = __le16_to_cpu(rp->policy);
- hci_dev_unlock(hdev);
- break;
+ return rp->status;
+}
- default:
- BT_DBG("%s: Command complete: ogf LINK_POLICY ocf %x",
- hdev->name, ocf);
- break;
+static u8 hci_cc_write_def_link_policy(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_status *rp = data;
+ void *sent;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ if (rp->status)
+ return rp->status;
+
+ sent = hci_sent_cmd_data(hdev, HCI_OP_WRITE_DEF_LINK_POLICY);
+ if (!sent)
+ return rp->status;
+
+ hdev->link_policy = get_unaligned_le16(sent);
+
+ return rp->status;
+}
+
+static u8 hci_cc_reset(struct hci_dev *hdev, void *data, struct sk_buff *skb)
+{
+ struct hci_ev_status *rp = data;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ clear_bit(HCI_RESET, &hdev->flags);
+
+ if (rp->status)
+ return rp->status;
+
+ /* Reset all non-persistent flags */
+ hci_dev_clear_volatile_flags(hdev);
+
+ hci_discovery_set_state(hdev, DISCOVERY_STOPPED);
+
+ hdev->inq_tx_power = HCI_TX_POWER_INVALID;
+ hdev->adv_tx_power = HCI_TX_POWER_INVALID;
+
+ memset(hdev->adv_data, 0, sizeof(hdev->adv_data));
+ hdev->adv_data_len = 0;
+
+ memset(hdev->scan_rsp_data, 0, sizeof(hdev->scan_rsp_data));
+ hdev->scan_rsp_data_len = 0;
+
+ hdev->le_scan_type = LE_SCAN_PASSIVE;
+
+ hdev->ssp_debug_mode = 0;
+
+ hci_bdaddr_list_clear(&hdev->le_accept_list);
+ hci_bdaddr_list_clear(&hdev->le_resolv_list);
+
+ return rp->status;
+}
+
+static u8 hci_cc_read_stored_link_key(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_rp_read_stored_link_key *rp = data;
+ struct hci_cp_read_stored_link_key *sent;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ sent = hci_sent_cmd_data(hdev, HCI_OP_READ_STORED_LINK_KEY);
+ if (!sent)
+ return rp->status;
+
+ if (!rp->status && sent->read_all == 0x01) {
+ hdev->stored_max_keys = le16_to_cpu(rp->max_keys);
+ hdev->stored_num_keys = le16_to_cpu(rp->num_keys);
+ }
+
+ return rp->status;
+}
+
+static u8 hci_cc_delete_stored_link_key(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_rp_delete_stored_link_key *rp = data;
+ u16 num_keys;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ if (rp->status)
+ return rp->status;
+
+ num_keys = le16_to_cpu(rp->num_keys);
+
+ if (num_keys <= hdev->stored_num_keys)
+ hdev->stored_num_keys -= num_keys;
+ else
+ hdev->stored_num_keys = 0;
+
+ return rp->status;
+}
+
+static u8 hci_cc_write_local_name(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_status *rp = data;
+ void *sent;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ sent = hci_sent_cmd_data(hdev, HCI_OP_WRITE_LOCAL_NAME);
+ if (!sent)
+ return rp->status;
+
+ hci_dev_lock(hdev);
+
+ if (hci_dev_test_flag(hdev, HCI_MGMT))
+ mgmt_set_local_name_complete(hdev, sent, rp->status);
+ else if (!rp->status)
+ memcpy(hdev->dev_name, sent, HCI_MAX_NAME_LENGTH);
+
+ hci_dev_unlock(hdev);
+
+ return rp->status;
+}
+
+static u8 hci_cc_read_local_name(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_rp_read_local_name *rp = data;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ if (rp->status)
+ return rp->status;
+
+ if (hci_dev_test_flag(hdev, HCI_SETUP) ||
+ hci_dev_test_flag(hdev, HCI_CONFIG))
+ memcpy(hdev->dev_name, rp->name, HCI_MAX_NAME_LENGTH);
+
+ return rp->status;
+}
+
+static u8 hci_cc_write_auth_enable(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_status *rp = data;
+ void *sent;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ sent = hci_sent_cmd_data(hdev, HCI_OP_WRITE_AUTH_ENABLE);
+ if (!sent)
+ return rp->status;
+
+ hci_dev_lock(hdev);
+
+ if (!rp->status) {
+ __u8 param = *((__u8 *) sent);
+
+ if (param == AUTH_ENABLED)
+ set_bit(HCI_AUTH, &hdev->flags);
+ else
+ clear_bit(HCI_AUTH, &hdev->flags);
}
+
+ if (hci_dev_test_flag(hdev, HCI_MGMT))
+ mgmt_auth_enable_complete(hdev, rp->status);
+
+ hci_dev_unlock(hdev);
+
+ return rp->status;
}
-/* Command Complete OGF HOST_CTL */
-static void hci_cc_host_ctl(struct hci_dev *hdev, __u16 ocf, struct sk_buff *skb)
+static u8 hci_cc_write_encrypt_mode(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- __u8 status, param;
+ struct hci_ev_status *rp = data;
+ __u8 param;
+ void *sent;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ if (rp->status)
+ return rp->status;
+
+ sent = hci_sent_cmd_data(hdev, HCI_OP_WRITE_ENCRYPT_MODE);
+ if (!sent)
+ return rp->status;
+
+ param = *((__u8 *) sent);
+
+ if (param)
+ set_bit(HCI_ENCRYPT, &hdev->flags);
+ else
+ clear_bit(HCI_ENCRYPT, &hdev->flags);
+
+ return rp->status;
+}
+
+static u8 hci_cc_write_scan_enable(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_status *rp = data;
+ __u8 param;
+ void *sent;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ sent = hci_sent_cmd_data(hdev, HCI_OP_WRITE_SCAN_ENABLE);
+ if (!sent)
+ return rp->status;
+
+ param = *((__u8 *) sent);
+
+ hci_dev_lock(hdev);
+
+ if (rp->status) {
+ hdev->discov_timeout = 0;
+ goto done;
+ }
+
+ if (param & SCAN_INQUIRY)
+ set_bit(HCI_ISCAN, &hdev->flags);
+ else
+ clear_bit(HCI_ISCAN, &hdev->flags);
+
+ if (param & SCAN_PAGE)
+ set_bit(HCI_PSCAN, &hdev->flags);
+ else
+ clear_bit(HCI_PSCAN, &hdev->flags);
+
+done:
+ hci_dev_unlock(hdev);
+
+ return rp->status;
+}
+
+static u8 hci_cc_set_event_filter(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_status *rp = data;
+ struct hci_cp_set_event_filter *cp;
+ void *sent;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ if (rp->status)
+ return rp->status;
+
+ sent = hci_sent_cmd_data(hdev, HCI_OP_SET_EVENT_FLT);
+ if (!sent)
+ return rp->status;
+
+ cp = (struct hci_cp_set_event_filter *)sent;
+
+ if (cp->flt_type == HCI_FLT_CLEAR_ALL)
+ hci_dev_clear_flag(hdev, HCI_EVENT_FILTER_CONFIGURED);
+ else
+ hci_dev_set_flag(hdev, HCI_EVENT_FILTER_CONFIGURED);
+
+ return rp->status;
+}
+
+static u8 hci_cc_read_class_of_dev(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_rp_read_class_of_dev *rp = data;
+
+ if (WARN_ON(!hdev))
+ return HCI_ERROR_UNSPECIFIED;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ if (rp->status)
+ return rp->status;
+
+ memcpy(hdev->dev_class, rp->dev_class, 3);
+
+ bt_dev_dbg(hdev, "class 0x%.2x%.2x%.2x", hdev->dev_class[2],
+ hdev->dev_class[1], hdev->dev_class[0]);
+
+ return rp->status;
+}
+
+static u8 hci_cc_write_class_of_dev(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_status *rp = data;
+ void *sent;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ sent = hci_sent_cmd_data(hdev, HCI_OP_WRITE_CLASS_OF_DEV);
+ if (!sent)
+ return rp->status;
+
+ hci_dev_lock(hdev);
+
+ if (!rp->status)
+ memcpy(hdev->dev_class, sent, 3);
+
+ if (hci_dev_test_flag(hdev, HCI_MGMT))
+ mgmt_set_class_of_dev_complete(hdev, sent, rp->status);
+
+ hci_dev_unlock(hdev);
+
+ return rp->status;
+}
+
+static u8 hci_cc_read_voice_setting(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_rp_read_voice_setting *rp = data;
+ __u16 setting;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ if (rp->status)
+ return rp->status;
+
+ setting = __le16_to_cpu(rp->voice_setting);
+
+ if (hdev->voice_setting == setting)
+ return rp->status;
+
+ hdev->voice_setting = setting;
+
+ bt_dev_dbg(hdev, "voice setting 0x%4.4x", setting);
+
+ if (hdev->notify)
+ hdev->notify(hdev, HCI_NOTIFY_VOICE_SETTING);
+
+ return rp->status;
+}
+
+static u8 hci_cc_write_voice_setting(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_status *rp = data;
__u16 setting;
- struct hci_rp_read_voice_setting *vs;
void *sent;
- BT_DBG("%s ocf 0x%x", hdev->name, ocf);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
- switch (ocf) {
- case OCF_RESET:
- status = *((__u8 *) skb->data);
- hci_req_complete(hdev, status);
- break;
+ if (rp->status)
+ return rp->status;
- case OCF_SET_EVENT_FLT:
- status = *((__u8 *) skb->data);
- if (status) {
- BT_DBG("%s SET_EVENT_FLT failed %d", hdev->name, status);
- } else {
- BT_DBG("%s SET_EVENT_FLT succeseful", hdev->name);
- }
- break;
+ sent = hci_sent_cmd_data(hdev, HCI_OP_WRITE_VOICE_SETTING);
+ if (!sent)
+ return rp->status;
- case OCF_WRITE_AUTH_ENABLE:
- sent = hci_sent_cmd_data(hdev, OGF_HOST_CTL, OCF_WRITE_AUTH_ENABLE);
- if (!sent)
- break;
+ setting = get_unaligned_le16(sent);
- status = *((__u8 *) skb->data);
- param = *((__u8 *) sent);
+ if (hdev->voice_setting == setting)
+ return rp->status;
- if (!status) {
- if (param == AUTH_ENABLED)
- set_bit(HCI_AUTH, &hdev->flags);
- else
- clear_bit(HCI_AUTH, &hdev->flags);
- }
- hci_req_complete(hdev, status);
- break;
+ hdev->voice_setting = setting;
- case OCF_WRITE_ENCRYPT_MODE:
- sent = hci_sent_cmd_data(hdev, OGF_HOST_CTL, OCF_WRITE_ENCRYPT_MODE);
- if (!sent)
- break;
+ bt_dev_dbg(hdev, "voice setting 0x%4.4x", setting);
- status = *((__u8 *) skb->data);
- param = *((__u8 *) sent);
+ if (hdev->notify)
+ hdev->notify(hdev, HCI_NOTIFY_VOICE_SETTING);
- if (!status) {
- if (param)
- set_bit(HCI_ENCRYPT, &hdev->flags);
- else
- clear_bit(HCI_ENCRYPT, &hdev->flags);
- }
- hci_req_complete(hdev, status);
- break;
+ return rp->status;
+}
- case OCF_WRITE_CA_TIMEOUT:
- status = *((__u8 *) skb->data);
- if (status) {
- BT_DBG("%s OCF_WRITE_CA_TIMEOUT failed %d", hdev->name, status);
- } else {
- BT_DBG("%s OCF_WRITE_CA_TIMEOUT succeseful", hdev->name);
- }
- break;
+static u8 hci_cc_read_num_supported_iac(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_rp_read_num_supported_iac *rp = data;
- case OCF_WRITE_PG_TIMEOUT:
- status = *((__u8 *) skb->data);
- if (status) {
- BT_DBG("%s OCF_WRITE_PG_TIMEOUT failed %d", hdev->name, status);
- } else {
- BT_DBG("%s: OCF_WRITE_PG_TIMEOUT succeseful", hdev->name);
- }
- break;
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
- case OCF_WRITE_SCAN_ENABLE:
- sent = hci_sent_cmd_data(hdev, OGF_HOST_CTL, OCF_WRITE_SCAN_ENABLE);
- if (!sent)
- break;
+ if (rp->status)
+ return rp->status;
+
+ hdev->num_iac = rp->num_iac;
+
+ bt_dev_dbg(hdev, "num iac %d", hdev->num_iac);
+
+ return rp->status;
+}
+
+static u8 hci_cc_write_ssp_mode(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_status *rp = data;
+ struct hci_cp_write_ssp_mode *sent;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ sent = hci_sent_cmd_data(hdev, HCI_OP_WRITE_SSP_MODE);
+ if (!sent)
+ return rp->status;
+
+ hci_dev_lock(hdev);
+
+ if (!rp->status) {
+ if (sent->mode)
+ hdev->features[1][0] |= LMP_HOST_SSP;
+ else
+ hdev->features[1][0] &= ~LMP_HOST_SSP;
+ }
+
+ if (!rp->status) {
+ if (sent->mode)
+ hci_dev_set_flag(hdev, HCI_SSP_ENABLED);
+ else
+ hci_dev_clear_flag(hdev, HCI_SSP_ENABLED);
+ }
+
+ hci_dev_unlock(hdev);
+
+ return rp->status;
+}
- status = *((__u8 *) skb->data);
- param = *((__u8 *) sent);
+static u8 hci_cc_write_sc_support(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_status *rp = data;
+ struct hci_cp_write_sc_support *sent;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ sent = hci_sent_cmd_data(hdev, HCI_OP_WRITE_SC_SUPPORT);
+ if (!sent)
+ return rp->status;
- BT_DBG("param 0x%x", param);
+ hci_dev_lock(hdev);
- if (!status) {
- clear_bit(HCI_PSCAN, &hdev->flags);
- clear_bit(HCI_ISCAN, &hdev->flags);
- if (param & SCAN_INQUIRY)
- set_bit(HCI_ISCAN, &hdev->flags);
+ if (!rp->status) {
+ if (sent->support)
+ hdev->features[1][0] |= LMP_HOST_SC;
+ else
+ hdev->features[1][0] &= ~LMP_HOST_SC;
+ }
+
+ if (!hci_dev_test_flag(hdev, HCI_MGMT) && !rp->status) {
+ if (sent->support)
+ hci_dev_set_flag(hdev, HCI_SC_ENABLED);
+ else
+ hci_dev_clear_flag(hdev, HCI_SC_ENABLED);
+ }
+
+ hci_dev_unlock(hdev);
+
+ return rp->status;
+}
+
+static u8 hci_cc_read_local_version(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_rp_read_local_version *rp = data;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ if (rp->status)
+ return rp->status;
+
+ if (hci_dev_test_flag(hdev, HCI_SETUP) ||
+ hci_dev_test_flag(hdev, HCI_CONFIG)) {
+ hdev->hci_ver = rp->hci_ver;
+ hdev->hci_rev = __le16_to_cpu(rp->hci_rev);
+ hdev->lmp_ver = rp->lmp_ver;
+ hdev->manufacturer = __le16_to_cpu(rp->manufacturer);
+ hdev->lmp_subver = __le16_to_cpu(rp->lmp_subver);
+ }
+
+ return rp->status;
+}
- if (param & SCAN_PAGE)
- set_bit(HCI_PSCAN, &hdev->flags);
+static u8 hci_cc_read_enc_key_size(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_rp_read_enc_key_size *rp = data;
+ struct hci_conn *conn;
+ u16 handle;
+ u8 status = rp->status;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", status);
+
+ handle = le16_to_cpu(rp->handle);
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_handle(hdev, handle);
+ if (!conn) {
+ status = 0xFF;
+ goto done;
+ }
+
+ /* While unexpected, the read_enc_key_size command may fail. The most
+ * secure approach is to then assume the key size is 0 to force a
+ * disconnection.
+ */
+ if (status) {
+ bt_dev_err(hdev, "failed to read key size for handle %u",
+ handle);
+ conn->enc_key_size = 0;
+ } else {
+ u8 *key_enc_size = hci_conn_key_enc_size(conn);
+
+ conn->enc_key_size = rp->key_size;
+ status = 0;
+
+ /* Attempt to check if the key size is too small or if it has
+ * been downgraded from the last time it was stored as part of
+ * the link_key.
+ */
+ if (conn->enc_key_size < hdev->min_enc_key_size ||
+ (key_enc_size && conn->enc_key_size < *key_enc_size)) {
+ /* As slave role, the conn->state has been set to
+ * BT_CONNECTED and l2cap conn req might not be received
+ * yet, at this moment the l2cap layer almost does
+ * nothing with the non-zero status.
+ * So we also clear encrypt related bits, and then the
+ * handler of l2cap conn req will get the right secure
+ * state at a later time.
+ */
+ status = HCI_ERROR_AUTH_FAILURE;
+ clear_bit(HCI_CONN_ENCRYPT, &conn->flags);
+ clear_bit(HCI_CONN_AES_CCM, &conn->flags);
}
- hci_req_complete(hdev, status);
- break;
- case OCF_READ_VOICE_SETTING:
- vs = (struct hci_rp_read_voice_setting *) skb->data;
+ /* Update the key encryption size with the connection one */
+ if (key_enc_size && *key_enc_size != conn->enc_key_size)
+ *key_enc_size = conn->enc_key_size;
+ }
- if (vs->status) {
- BT_DBG("%s READ_VOICE_SETTING failed %d", hdev->name, vs->status);
- break;
+ hci_encrypt_cfm(conn, status);
+
+done:
+ hci_dev_unlock(hdev);
+
+ return status;
+}
+
+static u8 hci_cc_read_local_commands(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_rp_read_local_commands *rp = data;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ if (rp->status)
+ return rp->status;
+
+ if (hci_dev_test_flag(hdev, HCI_SETUP) ||
+ hci_dev_test_flag(hdev, HCI_CONFIG))
+ memcpy(hdev->commands, rp->commands, sizeof(hdev->commands));
+
+ return rp->status;
+}
+
+static u8 hci_cc_read_auth_payload_timeout(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_rp_read_auth_payload_to *rp = data;
+ struct hci_conn *conn;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ if (rp->status)
+ return rp->status;
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(rp->handle));
+ if (conn)
+ conn->auth_payload_timeout = __le16_to_cpu(rp->timeout);
+
+ hci_dev_unlock(hdev);
+
+ return rp->status;
+}
+
+static u8 hci_cc_write_auth_payload_timeout(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_rp_write_auth_payload_to *rp = data;
+ struct hci_conn *conn;
+ void *sent;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ sent = hci_sent_cmd_data(hdev, HCI_OP_WRITE_AUTH_PAYLOAD_TO);
+ if (!sent)
+ return rp->status;
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(rp->handle));
+ if (!conn) {
+ rp->status = 0xff;
+ goto unlock;
+ }
+
+ if (!rp->status)
+ conn->auth_payload_timeout = get_unaligned_le16(sent + 2);
+
+unlock:
+ hci_dev_unlock(hdev);
+
+ return rp->status;
+}
+
+static u8 hci_cc_read_local_features(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_rp_read_local_features *rp = data;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ if (rp->status)
+ return rp->status;
+
+ memcpy(hdev->features, rp->features, 8);
+
+ /* Adjust default settings according to features
+ * supported by device. */
+
+ if (hdev->features[0][0] & LMP_3SLOT)
+ hdev->pkt_type |= (HCI_DM3 | HCI_DH3);
+
+ if (hdev->features[0][0] & LMP_5SLOT)
+ hdev->pkt_type |= (HCI_DM5 | HCI_DH5);
+
+ if (hdev->features[0][1] & LMP_HV2) {
+ hdev->pkt_type |= (HCI_HV2);
+ hdev->esco_type |= (ESCO_HV2);
+ }
+
+ if (hdev->features[0][1] & LMP_HV3) {
+ hdev->pkt_type |= (HCI_HV3);
+ hdev->esco_type |= (ESCO_HV3);
+ }
+
+ if (lmp_esco_capable(hdev))
+ hdev->esco_type |= (ESCO_EV3);
+
+ if (hdev->features[0][4] & LMP_EV4)
+ hdev->esco_type |= (ESCO_EV4);
+
+ if (hdev->features[0][4] & LMP_EV5)
+ hdev->esco_type |= (ESCO_EV5);
+
+ if (hdev->features[0][5] & LMP_EDR_ESCO_2M)
+ hdev->esco_type |= (ESCO_2EV3);
+
+ if (hdev->features[0][5] & LMP_EDR_ESCO_3M)
+ hdev->esco_type |= (ESCO_3EV3);
+
+ if (hdev->features[0][5] & LMP_EDR_3S_ESCO)
+ hdev->esco_type |= (ESCO_2EV5 | ESCO_3EV5);
+
+ return rp->status;
+}
+
+static u8 hci_cc_read_local_ext_features(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_rp_read_local_ext_features *rp = data;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ if (rp->status)
+ return rp->status;
+
+ if (hdev->max_page < rp->max_page) {
+ if (hci_test_quirk(hdev,
+ HCI_QUIRK_BROKEN_LOCAL_EXT_FEATURES_PAGE_2))
+ bt_dev_warn(hdev, "broken local ext features page 2");
+ else
+ hdev->max_page = rp->max_page;
+ }
+
+ if (rp->page < HCI_MAX_PAGES)
+ memcpy(hdev->features[rp->page], rp->features, 8);
+
+ return rp->status;
+}
+
+static u8 hci_cc_read_buffer_size(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_rp_read_buffer_size *rp = data;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ if (rp->status)
+ return rp->status;
+
+ hdev->acl_mtu = __le16_to_cpu(rp->acl_mtu);
+ hdev->sco_mtu = rp->sco_mtu;
+ hdev->acl_pkts = __le16_to_cpu(rp->acl_max_pkt);
+ hdev->sco_pkts = __le16_to_cpu(rp->sco_max_pkt);
+
+ if (hci_test_quirk(hdev, HCI_QUIRK_FIXUP_BUFFER_SIZE)) {
+ hdev->sco_mtu = 64;
+ hdev->sco_pkts = 8;
+ }
+
+ if (!read_voice_setting_capable(hdev))
+ hdev->sco_pkts = 0;
+
+ hdev->acl_cnt = hdev->acl_pkts;
+ hdev->sco_cnt = hdev->sco_pkts;
+
+ BT_DBG("%s acl mtu %d:%d sco mtu %d:%d", hdev->name, hdev->acl_mtu,
+ hdev->acl_pkts, hdev->sco_mtu, hdev->sco_pkts);
+
+ if (!hdev->acl_mtu || !hdev->acl_pkts)
+ return HCI_ERROR_INVALID_PARAMETERS;
+
+ return rp->status;
+}
+
+static u8 hci_cc_read_bd_addr(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_rp_read_bd_addr *rp = data;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ if (rp->status)
+ return rp->status;
+
+ if (test_bit(HCI_INIT, &hdev->flags))
+ bacpy(&hdev->bdaddr, &rp->bdaddr);
+
+ if (hci_dev_test_flag(hdev, HCI_SETUP))
+ bacpy(&hdev->setup_addr, &rp->bdaddr);
+
+ return rp->status;
+}
+
+static u8 hci_cc_read_local_pairing_opts(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_rp_read_local_pairing_opts *rp = data;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ if (rp->status)
+ return rp->status;
+
+ if (hci_dev_test_flag(hdev, HCI_SETUP) ||
+ hci_dev_test_flag(hdev, HCI_CONFIG)) {
+ hdev->pairing_opts = rp->pairing_opts;
+ hdev->max_enc_key_size = rp->max_key_size;
+ }
+
+ return rp->status;
+}
+
+static u8 hci_cc_read_page_scan_activity(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_rp_read_page_scan_activity *rp = data;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ if (rp->status)
+ return rp->status;
+
+ if (test_bit(HCI_INIT, &hdev->flags)) {
+ hdev->page_scan_interval = __le16_to_cpu(rp->interval);
+ hdev->page_scan_window = __le16_to_cpu(rp->window);
+ }
+
+ return rp->status;
+}
+
+static u8 hci_cc_write_page_scan_activity(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_status *rp = data;
+ struct hci_cp_write_page_scan_activity *sent;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ if (rp->status)
+ return rp->status;
+
+ sent = hci_sent_cmd_data(hdev, HCI_OP_WRITE_PAGE_SCAN_ACTIVITY);
+ if (!sent)
+ return rp->status;
+
+ hdev->page_scan_interval = __le16_to_cpu(sent->interval);
+ hdev->page_scan_window = __le16_to_cpu(sent->window);
+
+ return rp->status;
+}
+
+static u8 hci_cc_read_page_scan_type(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_rp_read_page_scan_type *rp = data;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ if (rp->status)
+ return rp->status;
+
+ if (test_bit(HCI_INIT, &hdev->flags))
+ hdev->page_scan_type = rp->type;
+
+ return rp->status;
+}
+
+static u8 hci_cc_write_page_scan_type(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_status *rp = data;
+ u8 *type;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ if (rp->status)
+ return rp->status;
+
+ type = hci_sent_cmd_data(hdev, HCI_OP_WRITE_PAGE_SCAN_TYPE);
+ if (type)
+ hdev->page_scan_type = *type;
+
+ return rp->status;
+}
+
+static u8 hci_cc_read_clock(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_rp_read_clock *rp = data;
+ struct hci_cp_read_clock *cp;
+ struct hci_conn *conn;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ if (rp->status)
+ return rp->status;
+
+ hci_dev_lock(hdev);
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_READ_CLOCK);
+ if (!cp)
+ goto unlock;
+
+ if (cp->which == 0x00) {
+ hdev->clock = le32_to_cpu(rp->clock);
+ goto unlock;
+ }
+
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(rp->handle));
+ if (conn) {
+ conn->clock = le32_to_cpu(rp->clock);
+ conn->clock_accuracy = le16_to_cpu(rp->accuracy);
+ }
+
+unlock:
+ hci_dev_unlock(hdev);
+ return rp->status;
+}
+
+static u8 hci_cc_read_inq_rsp_tx_power(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_rp_read_inq_rsp_tx_power *rp = data;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ if (rp->status)
+ return rp->status;
+
+ hdev->inq_tx_power = rp->tx_power;
+
+ return rp->status;
+}
+
+static u8 hci_cc_read_def_err_data_reporting(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_rp_read_def_err_data_reporting *rp = data;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ if (rp->status)
+ return rp->status;
+
+ hdev->err_data_reporting = rp->err_data_reporting;
+
+ return rp->status;
+}
+
+static u8 hci_cc_write_def_err_data_reporting(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_status *rp = data;
+ struct hci_cp_write_def_err_data_reporting *cp;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ if (rp->status)
+ return rp->status;
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_WRITE_DEF_ERR_DATA_REPORTING);
+ if (!cp)
+ return rp->status;
+
+ hdev->err_data_reporting = cp->err_data_reporting;
+
+ return rp->status;
+}
+
+static u8 hci_cc_pin_code_reply(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_rp_pin_code_reply *rp = data;
+ struct hci_cp_pin_code_reply *cp;
+ struct hci_conn *conn;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ hci_dev_lock(hdev);
+
+ if (hci_dev_test_flag(hdev, HCI_MGMT))
+ mgmt_pin_code_reply_complete(hdev, &rp->bdaddr, rp->status);
+
+ if (rp->status)
+ goto unlock;
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_PIN_CODE_REPLY);
+ if (!cp)
+ goto unlock;
+
+ conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &cp->bdaddr);
+ if (conn)
+ conn->pin_length = cp->pin_len;
+
+unlock:
+ hci_dev_unlock(hdev);
+ return rp->status;
+}
+
+static u8 hci_cc_pin_code_neg_reply(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_rp_pin_code_neg_reply *rp = data;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ hci_dev_lock(hdev);
+
+ if (hci_dev_test_flag(hdev, HCI_MGMT))
+ mgmt_pin_code_neg_reply_complete(hdev, &rp->bdaddr,
+ rp->status);
+
+ hci_dev_unlock(hdev);
+
+ return rp->status;
+}
+
+static u8 hci_cc_le_read_buffer_size(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_rp_le_read_buffer_size *rp = data;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ if (rp->status)
+ return rp->status;
+
+ hdev->le_mtu = __le16_to_cpu(rp->le_mtu);
+ hdev->le_pkts = rp->le_max_pkt;
+
+ hdev->le_cnt = hdev->le_pkts;
+
+ BT_DBG("%s le mtu %d:%d", hdev->name, hdev->le_mtu, hdev->le_pkts);
+
+ if (hdev->le_mtu && hdev->le_mtu < HCI_MIN_LE_MTU)
+ return HCI_ERROR_INVALID_PARAMETERS;
+
+ return rp->status;
+}
+
+static u8 hci_cc_le_read_local_features(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_rp_le_read_local_features *rp = data;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+
+ if (rp->status)
+ return rp->status;
+
+ memcpy(hdev->le_features, rp->features, 8);
+
+ return rp->status;
+}
+
+static u8 hci_cc_le_read_adv_tx_power(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_rp_le_read_adv_tx_power *rp = data;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ if (rp->status)
+ return rp->status;
+
+ hdev->adv_tx_power = rp->tx_power;
+
+ return rp->status;
+}
+
+static u8 hci_cc_user_confirm_reply(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_rp_user_confirm_reply *rp = data;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ hci_dev_lock(hdev);
+
+ if (hci_dev_test_flag(hdev, HCI_MGMT))
+ mgmt_user_confirm_reply_complete(hdev, &rp->bdaddr, ACL_LINK, 0,
+ rp->status);
+
+ hci_dev_unlock(hdev);
+
+ return rp->status;
+}
+
+static u8 hci_cc_user_confirm_neg_reply(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_rp_user_confirm_reply *rp = data;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ hci_dev_lock(hdev);
+
+ if (hci_dev_test_flag(hdev, HCI_MGMT))
+ mgmt_user_confirm_neg_reply_complete(hdev, &rp->bdaddr,
+ ACL_LINK, 0, rp->status);
+
+ hci_dev_unlock(hdev);
+
+ return rp->status;
+}
+
+static u8 hci_cc_user_passkey_reply(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_rp_user_confirm_reply *rp = data;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ hci_dev_lock(hdev);
+
+ if (hci_dev_test_flag(hdev, HCI_MGMT))
+ mgmt_user_passkey_reply_complete(hdev, &rp->bdaddr, ACL_LINK,
+ 0, rp->status);
+
+ hci_dev_unlock(hdev);
+
+ return rp->status;
+}
+
+static u8 hci_cc_user_passkey_neg_reply(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_rp_user_confirm_reply *rp = data;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ hci_dev_lock(hdev);
+
+ if (hci_dev_test_flag(hdev, HCI_MGMT))
+ mgmt_user_passkey_neg_reply_complete(hdev, &rp->bdaddr,
+ ACL_LINK, 0, rp->status);
+
+ hci_dev_unlock(hdev);
+
+ return rp->status;
+}
+
+static u8 hci_cc_read_local_oob_data(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_rp_read_local_oob_data *rp = data;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ return rp->status;
+}
+
+static u8 hci_cc_read_local_oob_ext_data(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_rp_read_local_oob_ext_data *rp = data;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ return rp->status;
+}
+
+static u8 hci_cc_le_set_random_addr(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_status *rp = data;
+ bdaddr_t *sent;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ if (rp->status)
+ return rp->status;
+
+ sent = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_RANDOM_ADDR);
+ if (!sent)
+ return rp->status;
+
+ hci_dev_lock(hdev);
+
+ bacpy(&hdev->random_addr, sent);
+
+ if (!bacmp(&hdev->rpa, sent)) {
+ hci_dev_clear_flag(hdev, HCI_RPA_EXPIRED);
+ queue_delayed_work(hdev->workqueue, &hdev->rpa_expired,
+ secs_to_jiffies(hdev->rpa_timeout));
+ }
+
+ hci_dev_unlock(hdev);
+
+ return rp->status;
+}
+
+static u8 hci_cc_le_set_default_phy(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_status *rp = data;
+ struct hci_cp_le_set_default_phy *cp;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ if (rp->status)
+ return rp->status;
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_DEFAULT_PHY);
+ if (!cp)
+ return rp->status;
+
+ hci_dev_lock(hdev);
+
+ hdev->le_tx_def_phys = cp->tx_phys;
+ hdev->le_rx_def_phys = cp->rx_phys;
+
+ hci_dev_unlock(hdev);
+
+ return rp->status;
+}
+
+static u8 hci_cc_le_set_adv_set_random_addr(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_status *rp = data;
+ struct hci_cp_le_set_adv_set_rand_addr *cp;
+ struct adv_info *adv;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ if (rp->status)
+ return rp->status;
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_ADV_SET_RAND_ADDR);
+ /* Update only in case the adv instance since handle 0x00 shall be using
+ * HCI_OP_LE_SET_RANDOM_ADDR since that allows both extended and
+ * non-extended adverting.
+ */
+ if (!cp || !cp->handle)
+ return rp->status;
+
+ hci_dev_lock(hdev);
+
+ adv = hci_find_adv_instance(hdev, cp->handle);
+ if (adv) {
+ bacpy(&adv->random_addr, &cp->bdaddr);
+ if (!bacmp(&hdev->rpa, &cp->bdaddr)) {
+ adv->rpa_expired = false;
+ queue_delayed_work(hdev->workqueue,
+ &adv->rpa_expired_cb,
+ secs_to_jiffies(hdev->rpa_timeout));
}
+ }
+
+ hci_dev_unlock(hdev);
+
+ return rp->status;
+}
+
+static u8 hci_cc_le_remove_adv_set(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_status *rp = data;
+ u8 *instance;
+ int err;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ if (rp->status)
+ return rp->status;
+
+ instance = hci_sent_cmd_data(hdev, HCI_OP_LE_REMOVE_ADV_SET);
+ if (!instance)
+ return rp->status;
+
+ hci_dev_lock(hdev);
+
+ err = hci_remove_adv_instance(hdev, *instance);
+ if (!err)
+ mgmt_advertising_removed(hci_skb_sk(hdev->sent_cmd), hdev,
+ *instance);
+
+ hci_dev_unlock(hdev);
+
+ return rp->status;
+}
+
+static u8 hci_cc_le_clear_adv_sets(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_status *rp = data;
+ struct adv_info *adv, *n;
+ int err;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ if (rp->status)
+ return rp->status;
+
+ if (!hci_sent_cmd_data(hdev, HCI_OP_LE_CLEAR_ADV_SETS))
+ return rp->status;
+
+ hci_dev_lock(hdev);
+
+ list_for_each_entry_safe(adv, n, &hdev->adv_instances, list) {
+ u8 instance = adv->instance;
+
+ err = hci_remove_adv_instance(hdev, instance);
+ if (!err)
+ mgmt_advertising_removed(hci_skb_sk(hdev->sent_cmd),
+ hdev, instance);
+ }
+
+ hci_dev_unlock(hdev);
+
+ return rp->status;
+}
+
+static u8 hci_cc_le_read_transmit_power(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_rp_le_read_transmit_power *rp = data;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ if (rp->status)
+ return rp->status;
+
+ hdev->min_le_tx_power = rp->min_le_tx_power;
+ hdev->max_le_tx_power = rp->max_le_tx_power;
+
+ return rp->status;
+}
+
+static u8 hci_cc_le_set_privacy_mode(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_status *rp = data;
+ struct hci_cp_le_set_privacy_mode *cp;
+ struct hci_conn_params *params;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ if (rp->status)
+ return rp->status;
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_PRIVACY_MODE);
+ if (!cp)
+ return rp->status;
+
+ hci_dev_lock(hdev);
+
+ params = hci_conn_params_lookup(hdev, &cp->bdaddr, cp->bdaddr_type);
+ if (params)
+ WRITE_ONCE(params->privacy_mode, cp->mode);
+
+ hci_dev_unlock(hdev);
+
+ return rp->status;
+}
+
+static u8 hci_cc_le_set_adv_enable(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_status *rp = data;
+ __u8 *sent;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ if (rp->status)
+ return rp->status;
+
+ sent = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_ADV_ENABLE);
+ if (!sent)
+ return rp->status;
+
+ hci_dev_lock(hdev);
+
+ /* If we're doing connection initiation as peripheral. Set a
+ * timeout in case something goes wrong.
+ */
+ if (*sent) {
+ struct hci_conn *conn;
+
+ hci_dev_set_flag(hdev, HCI_LE_ADV);
- setting = __le16_to_cpu(vs->voice_setting);
+ conn = hci_lookup_le_connect(hdev);
+ if (conn)
+ queue_delayed_work(hdev->workqueue,
+ &conn->le_conn_timeout,
+ conn->conn_timeout);
+ } else {
+ hci_dev_clear_flag(hdev, HCI_LE_ADV);
+ }
- if (hdev->voice_setting != setting ) {
- hdev->voice_setting = setting;
+ hci_dev_unlock(hdev);
- BT_DBG("%s: voice setting 0x%04x", hdev->name, setting);
+ return rp->status;
+}
- if (hdev->notify) {
- tasklet_disable(&hdev->tx_task);
- hdev->notify(hdev, HCI_NOTIFY_VOICE_SETTING);
- tasklet_enable(&hdev->tx_task);
+static u8 hci_cc_le_set_ext_adv_enable(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_cp_le_set_ext_adv_enable *cp;
+ struct hci_cp_ext_adv_set *set;
+ struct adv_info *adv = NULL, *n;
+ struct hci_ev_status *rp = data;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ if (rp->status)
+ return rp->status;
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_EXT_ADV_ENABLE);
+ if (!cp)
+ return rp->status;
+
+ set = (void *)cp->data;
+
+ hci_dev_lock(hdev);
+
+ if (cp->num_of_sets)
+ adv = hci_find_adv_instance(hdev, set->handle);
+
+ if (cp->enable) {
+ struct hci_conn *conn;
+
+ hci_dev_set_flag(hdev, HCI_LE_ADV);
+
+ if (adv)
+ adv->enabled = true;
+ else if (!set->handle)
+ hci_dev_set_flag(hdev, HCI_LE_ADV_0);
+
+ conn = hci_lookup_le_connect(hdev);
+ if (conn)
+ queue_delayed_work(hdev->workqueue,
+ &conn->le_conn_timeout,
+ conn->conn_timeout);
+ } else {
+ if (cp->num_of_sets) {
+ if (adv)
+ adv->enabled = false;
+ else if (!set->handle)
+ hci_dev_clear_flag(hdev, HCI_LE_ADV_0);
+
+ /* If just one instance was disabled check if there are
+ * any other instance enabled before clearing HCI_LE_ADV
+ */
+ list_for_each_entry_safe(adv, n, &hdev->adv_instances,
+ list) {
+ if (adv->enabled)
+ goto unlock;
}
+ } else {
+ /* All instances shall be considered disabled */
+ list_for_each_entry_safe(adv, n, &hdev->adv_instances,
+ list)
+ adv->enabled = false;
}
- break;
- case OCF_WRITE_VOICE_SETTING:
- sent = hci_sent_cmd_data(hdev, OGF_HOST_CTL, OCF_WRITE_VOICE_SETTING);
- if (!sent)
- break;
+ hci_dev_clear_flag(hdev, HCI_LE_ADV);
+ }
- status = *((__u8 *) skb->data);
- setting = __le16_to_cpu(get_unaligned((__le16 *) sent));
+unlock:
+ hci_dev_unlock(hdev);
+ return rp->status;
+}
- if (!status && hdev->voice_setting != setting) {
- hdev->voice_setting = setting;
+static u8 hci_cc_le_set_scan_param(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_cp_le_set_scan_param *cp;
+ struct hci_ev_status *rp = data;
- BT_DBG("%s: voice setting 0x%04x", hdev->name, setting);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
- if (hdev->notify) {
- tasklet_disable(&hdev->tx_task);
- hdev->notify(hdev, HCI_NOTIFY_VOICE_SETTING);
- tasklet_enable(&hdev->tx_task);
- }
+ if (rp->status)
+ return rp->status;
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_SCAN_PARAM);
+ if (!cp)
+ return rp->status;
+
+ hci_dev_lock(hdev);
+
+ hdev->le_scan_type = cp->type;
+
+ hci_dev_unlock(hdev);
+
+ return rp->status;
+}
+
+static u8 hci_cc_le_set_ext_scan_param(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_cp_le_set_ext_scan_params *cp;
+ struct hci_ev_status *rp = data;
+ struct hci_cp_le_scan_phy_params *phy_param;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ if (rp->status)
+ return rp->status;
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_EXT_SCAN_PARAMS);
+ if (!cp)
+ return rp->status;
+
+ phy_param = (void *)cp->data;
+
+ hci_dev_lock(hdev);
+
+ hdev->le_scan_type = phy_param->type;
+
+ hci_dev_unlock(hdev);
+
+ return rp->status;
+}
+
+static bool has_pending_adv_report(struct hci_dev *hdev)
+{
+ struct discovery_state *d = &hdev->discovery;
+
+ return bacmp(&d->last_adv_addr, BDADDR_ANY);
+}
+
+static void clear_pending_adv_report(struct hci_dev *hdev)
+{
+ struct discovery_state *d = &hdev->discovery;
+
+ bacpy(&d->last_adv_addr, BDADDR_ANY);
+ d->last_adv_data_len = 0;
+}
+
+static void store_pending_adv_report(struct hci_dev *hdev, bdaddr_t *bdaddr,
+ u8 bdaddr_type, s8 rssi, u32 flags,
+ u8 *data, u8 len)
+{
+ struct discovery_state *d = &hdev->discovery;
+
+ if (len > max_adv_len(hdev))
+ return;
+
+ bacpy(&d->last_adv_addr, bdaddr);
+ d->last_adv_addr_type = bdaddr_type;
+ d->last_adv_rssi = rssi;
+ d->last_adv_flags = flags;
+ memcpy(d->last_adv_data, data, len);
+ d->last_adv_data_len = len;
+}
+
+static void le_set_scan_enable_complete(struct hci_dev *hdev, u8 enable)
+{
+ hci_dev_lock(hdev);
+
+ switch (enable) {
+ case LE_SCAN_ENABLE:
+ hci_dev_set_flag(hdev, HCI_LE_SCAN);
+ if (hdev->le_scan_type == LE_SCAN_ACTIVE) {
+ clear_pending_adv_report(hdev);
+ hci_discovery_set_state(hdev, DISCOVERY_FINDING);
}
- hci_req_complete(hdev, status);
break;
- case OCF_HOST_BUFFER_SIZE:
- status = *((__u8 *) skb->data);
- if (status) {
- BT_DBG("%s OCF_BUFFER_SIZE failed %d", hdev->name, status);
- hci_req_complete(hdev, status);
+ case LE_SCAN_DISABLE:
+ /* We do this here instead of when setting DISCOVERY_STOPPED
+ * since the latter would potentially require waiting for
+ * inquiry to stop too.
+ */
+ if (has_pending_adv_report(hdev)) {
+ struct discovery_state *d = &hdev->discovery;
+
+ mgmt_device_found(hdev, &d->last_adv_addr, LE_LINK,
+ d->last_adv_addr_type, NULL,
+ d->last_adv_rssi, d->last_adv_flags,
+ d->last_adv_data,
+ d->last_adv_data_len, NULL, 0, 0);
}
+
+ /* Cancel this timer so that we don't try to disable scanning
+ * when it's already disabled.
+ */
+ cancel_delayed_work(&hdev->le_scan_disable);
+
+ hci_dev_clear_flag(hdev, HCI_LE_SCAN);
+
+ /* The HCI_LE_SCAN_INTERRUPTED flag indicates that we
+ * interrupted scanning due to a connect request. Mark
+ * therefore discovery as stopped.
+ */
+ if (hci_dev_test_and_clear_flag(hdev, HCI_LE_SCAN_INTERRUPTED))
+ hci_discovery_set_state(hdev, DISCOVERY_STOPPED);
+ else if (!hci_dev_test_flag(hdev, HCI_LE_ADV) &&
+ hdev->discovery.state == DISCOVERY_FINDING)
+ queue_work(hdev->workqueue, &hdev->reenable_adv_work);
+
break;
default:
- BT_DBG("%s Command complete: ogf HOST_CTL ocf %x", hdev->name, ocf);
+ bt_dev_err(hdev, "use of reserved LE_Scan_Enable param %d",
+ enable);
break;
}
+
+ hci_dev_unlock(hdev);
}
-/* Command Complete OGF INFO_PARAM */
-static void hci_cc_info_param(struct hci_dev *hdev, __u16 ocf, struct sk_buff *skb)
+static u8 hci_cc_le_set_scan_enable(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_rp_read_loc_version *lv;
- struct hci_rp_read_local_features *lf;
- struct hci_rp_read_buffer_size *bs;
- struct hci_rp_read_bd_addr *ba;
+ struct hci_cp_le_set_scan_enable *cp;
+ struct hci_ev_status *rp = data;
- BT_DBG("%s ocf 0x%x", hdev->name, ocf);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
- switch (ocf) {
- case OCF_READ_LOCAL_VERSION:
- lv = (struct hci_rp_read_loc_version *) skb->data;
+ if (rp->status)
+ return rp->status;
- if (lv->status) {
- BT_DBG("%s READ_LOCAL_VERSION failed %d", hdev->name, lf->status);
- break;
- }
+ cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_SCAN_ENABLE);
+ if (!cp)
+ return rp->status;
- hdev->hci_ver = lv->hci_ver;
- hdev->hci_rev = btohs(lv->hci_rev);
- hdev->manufacturer = btohs(lv->manufacturer);
+ le_set_scan_enable_complete(hdev, cp->enable);
- BT_DBG("%s: manufacturer %d hci_ver %d hci_rev %d", hdev->name,
- hdev->manufacturer, hdev->hci_ver, hdev->hci_rev);
+ return rp->status;
+}
- break;
+static u8 hci_cc_le_set_ext_scan_enable(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_cp_le_set_ext_scan_enable *cp;
+ struct hci_ev_status *rp = data;
- case OCF_READ_LOCAL_FEATURES:
- lf = (struct hci_rp_read_local_features *) skb->data;
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
- if (lf->status) {
- BT_DBG("%s READ_LOCAL_FEATURES failed %d", hdev->name, lf->status);
- break;
- }
+ if (rp->status)
+ return rp->status;
- memcpy(hdev->features, lf->features, sizeof(hdev->features));
+ cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_EXT_SCAN_ENABLE);
+ if (!cp)
+ return rp->status;
- /* Adjust default settings according to features
- * supported by device. */
- if (hdev->features[0] & LMP_3SLOT)
- hdev->pkt_type |= (HCI_DM3 | HCI_DH3);
+ le_set_scan_enable_complete(hdev, cp->enable);
- if (hdev->features[0] & LMP_5SLOT)
- hdev->pkt_type |= (HCI_DM5 | HCI_DH5);
+ return rp->status;
+}
- if (hdev->features[1] & LMP_HV2)
- hdev->pkt_type |= (HCI_HV2);
+static u8 hci_cc_le_read_num_adv_sets(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_rp_le_read_num_supported_adv_sets *rp = data;
- if (hdev->features[1] & LMP_HV3)
- hdev->pkt_type |= (HCI_HV3);
+ bt_dev_dbg(hdev, "status 0x%2.2x No of Adv sets %u", rp->status,
+ rp->num_of_sets);
- BT_DBG("%s: features 0x%x 0x%x 0x%x", hdev->name,
- lf->features[0], lf->features[1], lf->features[2]);
+ if (rp->status)
+ return rp->status;
- break;
+ hdev->le_num_of_adv_sets = rp->num_of_sets;
- case OCF_READ_BUFFER_SIZE:
- bs = (struct hci_rp_read_buffer_size *) skb->data;
+ return rp->status;
+}
- if (bs->status) {
- BT_DBG("%s READ_BUFFER_SIZE failed %d", hdev->name, bs->status);
- hci_req_complete(hdev, bs->status);
- break;
- }
+static u8 hci_cc_le_read_accept_list_size(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_rp_le_read_accept_list_size *rp = data;
- hdev->acl_mtu = __le16_to_cpu(bs->acl_mtu);
- hdev->sco_mtu = bs->sco_mtu;
- hdev->acl_pkts = __le16_to_cpu(bs->acl_max_pkt);
- hdev->sco_pkts = __le16_to_cpu(bs->sco_max_pkt);
+ bt_dev_dbg(hdev, "status 0x%2.2x size %u", rp->status, rp->size);
- if (test_bit(HCI_QUIRK_FIXUP_BUFFER_SIZE, &hdev->quirks)) {
- hdev->sco_mtu = 64;
- hdev->sco_pkts = 8;
- }
+ if (rp->status)
+ return rp->status;
- hdev->acl_cnt = hdev->acl_pkts;
- hdev->sco_cnt = hdev->sco_pkts;
+ hdev->le_accept_list_size = rp->size;
- BT_DBG("%s mtu: acl %d, sco %d max_pkt: acl %d, sco %d", hdev->name,
- hdev->acl_mtu, hdev->sco_mtu, hdev->acl_pkts, hdev->sco_pkts);
- break;
+ return rp->status;
+}
- case OCF_READ_BD_ADDR:
- ba = (struct hci_rp_read_bd_addr *) skb->data;
+static u8 hci_cc_le_clear_accept_list(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_status *rp = data;
- if (!ba->status) {
- bacpy(&hdev->bdaddr, &ba->bdaddr);
- } else {
- BT_DBG("%s: READ_BD_ADDR failed %d", hdev->name, ba->status);
- }
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
- hci_req_complete(hdev, ba->status);
- break;
+ if (rp->status)
+ return rp->status;
- default:
- BT_DBG("%s Command complete: ogf INFO_PARAM ocf %x", hdev->name, ocf);
+ hci_dev_lock(hdev);
+ hci_bdaddr_list_clear(&hdev->le_accept_list);
+ hci_dev_unlock(hdev);
+
+ return rp->status;
+}
+
+static u8 hci_cc_le_add_to_accept_list(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_cp_le_add_to_accept_list *sent;
+ struct hci_ev_status *rp = data;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ if (rp->status)
+ return rp->status;
+
+ sent = hci_sent_cmd_data(hdev, HCI_OP_LE_ADD_TO_ACCEPT_LIST);
+ if (!sent)
+ return rp->status;
+
+ hci_dev_lock(hdev);
+ hci_bdaddr_list_add(&hdev->le_accept_list, &sent->bdaddr,
+ sent->bdaddr_type);
+ hci_dev_unlock(hdev);
+
+ return rp->status;
+}
+
+static u8 hci_cc_le_del_from_accept_list(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_cp_le_del_from_accept_list *sent;
+ struct hci_ev_status *rp = data;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ if (rp->status)
+ return rp->status;
+
+ sent = hci_sent_cmd_data(hdev, HCI_OP_LE_DEL_FROM_ACCEPT_LIST);
+ if (!sent)
+ return rp->status;
+
+ hci_dev_lock(hdev);
+ hci_bdaddr_list_del(&hdev->le_accept_list, &sent->bdaddr,
+ sent->bdaddr_type);
+ hci_dev_unlock(hdev);
+
+ return rp->status;
+}
+
+static u8 hci_cc_le_read_supported_states(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_rp_le_read_supported_states *rp = data;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ if (rp->status)
+ return rp->status;
+
+ memcpy(hdev->le_states, rp->le_states, 8);
+
+ return rp->status;
+}
+
+static u8 hci_cc_le_read_def_data_len(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_rp_le_read_def_data_len *rp = data;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ if (rp->status)
+ return rp->status;
+
+ hdev->le_def_tx_len = le16_to_cpu(rp->tx_len);
+ hdev->le_def_tx_time = le16_to_cpu(rp->tx_time);
+
+ return rp->status;
+}
+
+static u8 hci_cc_le_write_def_data_len(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_cp_le_write_def_data_len *sent;
+ struct hci_ev_status *rp = data;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ if (rp->status)
+ return rp->status;
+
+ sent = hci_sent_cmd_data(hdev, HCI_OP_LE_WRITE_DEF_DATA_LEN);
+ if (!sent)
+ return rp->status;
+
+ hdev->le_def_tx_len = le16_to_cpu(sent->tx_len);
+ hdev->le_def_tx_time = le16_to_cpu(sent->tx_time);
+
+ return rp->status;
+}
+
+static u8 hci_cc_le_add_to_resolv_list(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_cp_le_add_to_resolv_list *sent;
+ struct hci_ev_status *rp = data;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ if (rp->status)
+ return rp->status;
+
+ sent = hci_sent_cmd_data(hdev, HCI_OP_LE_ADD_TO_RESOLV_LIST);
+ if (!sent)
+ return rp->status;
+
+ hci_dev_lock(hdev);
+ hci_bdaddr_list_add_with_irk(&hdev->le_resolv_list, &sent->bdaddr,
+ sent->bdaddr_type, sent->peer_irk,
+ sent->local_irk);
+ hci_dev_unlock(hdev);
+
+ return rp->status;
+}
+
+static u8 hci_cc_le_del_from_resolv_list(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_cp_le_del_from_resolv_list *sent;
+ struct hci_ev_status *rp = data;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ if (rp->status)
+ return rp->status;
+
+ sent = hci_sent_cmd_data(hdev, HCI_OP_LE_DEL_FROM_RESOLV_LIST);
+ if (!sent)
+ return rp->status;
+
+ hci_dev_lock(hdev);
+ hci_bdaddr_list_del_with_irk(&hdev->le_resolv_list, &sent->bdaddr,
+ sent->bdaddr_type);
+ hci_dev_unlock(hdev);
+
+ return rp->status;
+}
+
+static u8 hci_cc_le_clear_resolv_list(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_status *rp = data;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ if (rp->status)
+ return rp->status;
+
+ hci_dev_lock(hdev);
+ hci_bdaddr_list_clear(&hdev->le_resolv_list);
+ hci_dev_unlock(hdev);
+
+ return rp->status;
+}
+
+static u8 hci_cc_le_read_resolv_list_size(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_rp_le_read_resolv_list_size *rp = data;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x size %u", rp->status, rp->size);
+
+ if (rp->status)
+ return rp->status;
+
+ hdev->le_resolv_list_size = rp->size;
+
+ return rp->status;
+}
+
+static u8 hci_cc_le_set_addr_resolution_enable(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_status *rp = data;
+ __u8 *sent;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ if (rp->status)
+ return rp->status;
+
+ sent = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_ADDR_RESOLV_ENABLE);
+ if (!sent)
+ return rp->status;
+
+ hci_dev_lock(hdev);
+
+ if (*sent)
+ hci_dev_set_flag(hdev, HCI_LL_RPA_RESOLUTION);
+ else
+ hci_dev_clear_flag(hdev, HCI_LL_RPA_RESOLUTION);
+
+ hci_dev_unlock(hdev);
+
+ return rp->status;
+}
+
+static u8 hci_cc_le_read_max_data_len(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_rp_le_read_max_data_len *rp = data;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ if (rp->status)
+ return rp->status;
+
+ hdev->le_max_tx_len = le16_to_cpu(rp->tx_len);
+ hdev->le_max_tx_time = le16_to_cpu(rp->tx_time);
+ hdev->le_max_rx_len = le16_to_cpu(rp->rx_len);
+ hdev->le_max_rx_time = le16_to_cpu(rp->rx_time);
+
+ return rp->status;
+}
+
+static u8 hci_cc_write_le_host_supported(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_cp_write_le_host_supported *sent;
+ struct hci_ev_status *rp = data;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ if (rp->status)
+ return rp->status;
+
+ sent = hci_sent_cmd_data(hdev, HCI_OP_WRITE_LE_HOST_SUPPORTED);
+ if (!sent)
+ return rp->status;
+
+ hci_dev_lock(hdev);
+
+ if (sent->le) {
+ hdev->features[1][0] |= LMP_HOST_LE;
+ hci_dev_set_flag(hdev, HCI_LE_ENABLED);
+ } else {
+ hdev->features[1][0] &= ~LMP_HOST_LE;
+ hci_dev_clear_flag(hdev, HCI_LE_ENABLED);
+ hci_dev_clear_flag(hdev, HCI_ADVERTISING);
+ }
+
+ if (sent->simul)
+ hdev->features[1][0] |= LMP_HOST_LE_BREDR;
+ else
+ hdev->features[1][0] &= ~LMP_HOST_LE_BREDR;
+
+ hci_dev_unlock(hdev);
+
+ return rp->status;
+}
+
+static u8 hci_cc_set_adv_param(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_cp_le_set_adv_param *cp;
+ struct hci_ev_status *rp = data;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ if (rp->status)
+ return rp->status;
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_ADV_PARAM);
+ if (!cp)
+ return rp->status;
+
+ hci_dev_lock(hdev);
+ hdev->adv_addr_type = cp->own_address_type;
+ hci_dev_unlock(hdev);
+
+ return rp->status;
+}
+
+static u8 hci_cc_read_rssi(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_rp_read_rssi *rp = data;
+ struct hci_conn *conn;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ if (rp->status)
+ return rp->status;
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(rp->handle));
+ if (conn)
+ conn->rssi = rp->rssi;
+
+ hci_dev_unlock(hdev);
+
+ return rp->status;
+}
+
+static u8 hci_cc_read_tx_power(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_cp_read_tx_power *sent;
+ struct hci_rp_read_tx_power *rp = data;
+ struct hci_conn *conn;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ if (rp->status)
+ return rp->status;
+
+ sent = hci_sent_cmd_data(hdev, HCI_OP_READ_TX_POWER);
+ if (!sent)
+ return rp->status;
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(rp->handle));
+ if (!conn)
+ goto unlock;
+
+ switch (sent->type) {
+ case 0x00:
+ conn->tx_power = rp->tx_power;
+ break;
+ case 0x01:
+ conn->max_tx_power = rp->tx_power;
break;
}
+
+unlock:
+ hci_dev_unlock(hdev);
+ return rp->status;
}
-/* Command Status OGF LINK_CTL */
-static inline void hci_cs_create_conn(struct hci_dev *hdev, __u8 status)
+static u8 hci_cc_write_ssp_debug_mode(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
+ struct hci_ev_status *rp = data;
+ u8 *mode;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ if (rp->status)
+ return rp->status;
+
+ mode = hci_sent_cmd_data(hdev, HCI_OP_WRITE_SSP_DEBUG_MODE);
+ if (mode)
+ hdev->ssp_debug_mode = *mode;
+
+ return rp->status;
+}
+
+static void hci_cs_inquiry(struct hci_dev *hdev, __u8 status)
+{
+ bt_dev_dbg(hdev, "status 0x%2.2x", status);
+
+ if (status)
+ return;
+
+ if (hci_sent_cmd_data(hdev, HCI_OP_INQUIRY))
+ set_bit(HCI_INQUIRY, &hdev->flags);
+}
+
+static void hci_cs_create_conn(struct hci_dev *hdev, __u8 status)
+{
+ struct hci_cp_create_conn *cp;
struct hci_conn *conn;
- struct hci_cp_create_conn *cp = hci_sent_cmd_data(hdev, OGF_LINK_CTL, OCF_CREATE_CONN);
+ bt_dev_dbg(hdev, "status 0x%2.2x", status);
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_CREATE_CONN);
if (!cp)
return;
@@ -419,662 +2256,2517 @@ static inline void hci_cs_create_conn(struct hci_dev *hdev, __u8 status)
conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &cp->bdaddr);
- BT_DBG("%s status 0x%x bdaddr %s conn %p", hdev->name,
- status, batostr(&cp->bdaddr), conn);
+ bt_dev_dbg(hdev, "bdaddr %pMR hcon %p", &cp->bdaddr, conn);
if (status) {
if (conn && conn->state == BT_CONNECT) {
- if (status != 0x0c || conn->attempt > 2) {
- conn->state = BT_CLOSED;
- hci_proto_connect_cfm(conn, status);
- hci_conn_del(conn);
- } else
- conn->state = BT_CONNECT2;
+ conn->state = BT_CLOSED;
+ hci_connect_cfm(conn, status);
+ hci_conn_del(conn);
}
} else {
if (!conn) {
- conn = hci_conn_add(hdev, ACL_LINK, &cp->bdaddr);
- if (conn) {
- conn->out = 1;
- conn->link_mode |= HCI_LM_MASTER;
- } else
- BT_ERR("No memmory for new connection");
+ conn = hci_conn_add_unset(hdev, ACL_LINK, &cp->bdaddr,
+ 0, HCI_ROLE_MASTER);
+ if (IS_ERR(conn))
+ bt_dev_err(hdev, "connection err: %ld", PTR_ERR(conn));
}
}
hci_dev_unlock(hdev);
}
-static void hci_cs_link_ctl(struct hci_dev *hdev, __u16 ocf, __u8 status)
+static void hci_cs_add_sco(struct hci_dev *hdev, __u8 status)
{
- BT_DBG("%s ocf 0x%x", hdev->name, ocf);
+ struct hci_cp_add_sco *cp;
+ struct hci_conn *acl;
+ struct hci_link *link;
+ __u16 handle;
- switch (ocf) {
- case OCF_CREATE_CONN:
- hci_cs_create_conn(hdev, status);
- break;
+ bt_dev_dbg(hdev, "status 0x%2.2x", status);
- case OCF_ADD_SCO:
- if (status) {
- struct hci_conn *acl, *sco;
- struct hci_cp_add_sco *cp = hci_sent_cmd_data(hdev, OGF_LINK_CTL, OCF_ADD_SCO);
- __u16 handle;
+ if (!status)
+ return;
- if (!cp)
- break;
+ cp = hci_sent_cmd_data(hdev, HCI_OP_ADD_SCO);
+ if (!cp)
+ return;
- handle = __le16_to_cpu(cp->handle);
+ handle = __le16_to_cpu(cp->handle);
- BT_DBG("%s Add SCO error: handle %d status 0x%x", hdev->name, handle, status);
+ bt_dev_dbg(hdev, "handle 0x%4.4x", handle);
- hci_dev_lock(hdev);
+ hci_dev_lock(hdev);
- acl = hci_conn_hash_lookup_handle(hdev, handle);
- if (acl && (sco = acl->link)) {
- sco->state = BT_CLOSED;
+ acl = hci_conn_hash_lookup_handle(hdev, handle);
+ if (acl) {
+ link = list_first_entry_or_null(&acl->link_list,
+ struct hci_link, list);
+ if (link && link->conn) {
+ link->conn->state = BT_CLOSED;
- hci_proto_connect_cfm(sco, status);
- hci_conn_del(sco);
- }
+ hci_connect_cfm(link->conn, status);
+ hci_conn_del(link->conn);
+ }
+ }
+
+ hci_dev_unlock(hdev);
+}
+
+static void hci_cs_auth_requested(struct hci_dev *hdev, __u8 status)
+{
+ struct hci_cp_auth_requested *cp;
+ struct hci_conn *conn;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", status);
- hci_dev_unlock(hdev);
+ if (!status)
+ return;
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_AUTH_REQUESTED);
+ if (!cp)
+ return;
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(cp->handle));
+ if (conn) {
+ if (conn->state == BT_CONFIG) {
+ hci_connect_cfm(conn, status);
+ hci_conn_drop(conn);
}
- break;
+ }
- case OCF_INQUIRY:
- if (status) {
- BT_DBG("%s Inquiry error: status 0x%x", hdev->name, status);
- hci_req_complete(hdev, status);
- } else {
- set_bit(HCI_INQUIRY, &hdev->flags);
+ hci_dev_unlock(hdev);
+}
+
+static void hci_cs_set_conn_encrypt(struct hci_dev *hdev, __u8 status)
+{
+ struct hci_cp_set_conn_encrypt *cp;
+ struct hci_conn *conn;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", status);
+
+ if (!status)
+ return;
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_SET_CONN_ENCRYPT);
+ if (!cp)
+ return;
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(cp->handle));
+ if (conn) {
+ if (conn->state == BT_CONFIG) {
+ hci_connect_cfm(conn, status);
+ hci_conn_drop(conn);
}
- break;
+ }
- default:
- BT_DBG("%s Command status: ogf LINK_CTL ocf %x status %d",
- hdev->name, ocf, status);
- break;
+ hci_dev_unlock(hdev);
+}
+
+static int hci_outgoing_auth_needed(struct hci_dev *hdev,
+ struct hci_conn *conn)
+{
+ if (conn->state != BT_CONFIG || !conn->out)
+ return 0;
+
+ if (conn->pending_sec_level == BT_SECURITY_SDP)
+ return 0;
+
+ /* Only request authentication for SSP connections or non-SSP
+ * devices with sec_level MEDIUM or HIGH or if MITM protection
+ * is requested.
+ */
+ if (!hci_conn_ssp_enabled(conn) && !(conn->auth_type & 0x01) &&
+ conn->pending_sec_level != BT_SECURITY_FIPS &&
+ conn->pending_sec_level != BT_SECURITY_HIGH &&
+ conn->pending_sec_level != BT_SECURITY_MEDIUM)
+ return 0;
+
+ return 1;
+}
+
+static int hci_resolve_name(struct hci_dev *hdev,
+ struct inquiry_entry *e)
+{
+ struct hci_cp_remote_name_req cp;
+
+ memset(&cp, 0, sizeof(cp));
+
+ bacpy(&cp.bdaddr, &e->data.bdaddr);
+ cp.pscan_rep_mode = e->data.pscan_rep_mode;
+ cp.pscan_mode = e->data.pscan_mode;
+ cp.clock_offset = e->data.clock_offset;
+
+ return hci_send_cmd(hdev, HCI_OP_REMOTE_NAME_REQ, sizeof(cp), &cp);
+}
+
+static bool hci_resolve_next_name(struct hci_dev *hdev)
+{
+ struct discovery_state *discov = &hdev->discovery;
+ struct inquiry_entry *e;
+
+ if (list_empty(&discov->resolve))
+ return false;
+
+ /* We should stop if we already spent too much time resolving names. */
+ if (time_after(jiffies, discov->name_resolve_timeout)) {
+ bt_dev_warn_ratelimited(hdev, "Name resolve takes too long.");
+ return false;
+ }
+
+ e = hci_inquiry_cache_lookup_resolve(hdev, BDADDR_ANY, NAME_NEEDED);
+ if (!e)
+ return false;
+
+ if (hci_resolve_name(hdev, e) == 0) {
+ e->name_state = NAME_PENDING;
+ return true;
+ }
+
+ return false;
+}
+
+static void hci_check_pending_name(struct hci_dev *hdev, struct hci_conn *conn,
+ bdaddr_t *bdaddr, u8 *name, u8 name_len)
+{
+ struct discovery_state *discov = &hdev->discovery;
+ struct inquiry_entry *e;
+
+ /* Update the mgmt connected state if necessary. Be careful with
+ * conn objects that exist but are not (yet) connected however.
+ * Only those in BT_CONFIG or BT_CONNECTED states can be
+ * considered connected.
+ */
+ if (conn && (conn->state == BT_CONFIG || conn->state == BT_CONNECTED))
+ mgmt_device_connected(hdev, conn, name, name_len);
+
+ if (discov->state == DISCOVERY_STOPPED)
+ return;
+
+ if (discov->state == DISCOVERY_STOPPING)
+ goto discov_complete;
+
+ if (discov->state != DISCOVERY_RESOLVING)
+ return;
+
+ e = hci_inquiry_cache_lookup_resolve(hdev, bdaddr, NAME_PENDING);
+ /* If the device was not found in a list of found devices names of which
+ * are pending. there is no need to continue resolving a next name as it
+ * will be done upon receiving another Remote Name Request Complete
+ * Event */
+ if (!e)
+ return;
+
+ list_del(&e->list);
+
+ e->name_state = name ? NAME_KNOWN : NAME_NOT_KNOWN;
+ mgmt_remote_name(hdev, bdaddr, ACL_LINK, 0x00, e->data.rssi,
+ name, name_len);
+
+ if (hci_resolve_next_name(hdev))
+ return;
+
+discov_complete:
+ hci_discovery_set_state(hdev, DISCOVERY_STOPPED);
+}
+
+static void hci_cs_remote_name_req(struct hci_dev *hdev, __u8 status)
+{
+ struct hci_cp_remote_name_req *cp;
+ struct hci_conn *conn;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", status);
+
+ /* If successful wait for the name req complete event before
+ * checking for the need to do authentication */
+ if (!status)
+ return;
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_REMOTE_NAME_REQ);
+ if (!cp)
+ return;
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &cp->bdaddr);
+
+ if (hci_dev_test_flag(hdev, HCI_MGMT))
+ hci_check_pending_name(hdev, conn, &cp->bdaddr, NULL, 0);
+
+ if (!conn)
+ goto unlock;
+
+ if (!hci_outgoing_auth_needed(hdev, conn))
+ goto unlock;
+
+ if (!test_and_set_bit(HCI_CONN_AUTH_PEND, &conn->flags)) {
+ struct hci_cp_auth_requested auth_cp;
+
+ set_bit(HCI_CONN_AUTH_INITIATOR, &conn->flags);
+
+ auth_cp.handle = __cpu_to_le16(conn->handle);
+ hci_send_cmd(hdev, HCI_OP_AUTH_REQUESTED,
+ sizeof(auth_cp), &auth_cp);
}
+
+unlock:
+ hci_dev_unlock(hdev);
}
-/* Command Status OGF LINK_POLICY */
-static void hci_cs_link_policy(struct hci_dev *hdev, __u16 ocf, __u8 status)
+static void hci_cs_read_remote_features(struct hci_dev *hdev, __u8 status)
{
- BT_DBG("%s ocf 0x%x", hdev->name, ocf);
+ struct hci_cp_read_remote_features *cp;
+ struct hci_conn *conn;
- switch (ocf) {
- case OCF_SNIFF_MODE:
- if (status) {
- struct hci_conn *conn;
- struct hci_cp_sniff_mode *cp = hci_sent_cmd_data(hdev, OGF_LINK_POLICY, OCF_SNIFF_MODE);
+ bt_dev_dbg(hdev, "status 0x%2.2x", status);
- if (!cp)
- break;
+ if (!status)
+ return;
- hci_dev_lock(hdev);
+ cp = hci_sent_cmd_data(hdev, HCI_OP_READ_REMOTE_FEATURES);
+ if (!cp)
+ return;
- conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(cp->handle));
- if (conn) {
- clear_bit(HCI_CONN_MODE_CHANGE_PEND, &conn->pend);
- }
+ hci_dev_lock(hdev);
- hci_dev_unlock(hdev);
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(cp->handle));
+ if (conn) {
+ if (conn->state == BT_CONFIG) {
+ hci_connect_cfm(conn, status);
+ hci_conn_drop(conn);
}
- break;
+ }
- case OCF_EXIT_SNIFF_MODE:
- if (status) {
- struct hci_conn *conn;
- struct hci_cp_exit_sniff_mode *cp = hci_sent_cmd_data(hdev, OGF_LINK_POLICY, OCF_EXIT_SNIFF_MODE);
+ hci_dev_unlock(hdev);
+}
- if (!cp)
- break;
+static void hci_cs_read_remote_ext_features(struct hci_dev *hdev, __u8 status)
+{
+ struct hci_cp_read_remote_ext_features *cp;
+ struct hci_conn *conn;
- hci_dev_lock(hdev);
+ bt_dev_dbg(hdev, "status 0x%2.2x", status);
- conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(cp->handle));
- if (conn) {
- clear_bit(HCI_CONN_MODE_CHANGE_PEND, &conn->pend);
- }
+ if (!status)
+ return;
- hci_dev_unlock(hdev);
- }
- break;
+ cp = hci_sent_cmd_data(hdev, HCI_OP_READ_REMOTE_EXT_FEATURES);
+ if (!cp)
+ return;
- default:
- BT_DBG("%s Command status: ogf LINK_POLICY ocf %x", hdev->name, ocf);
- break;
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(cp->handle));
+ if (conn) {
+ if (conn->state == BT_CONFIG) {
+ hci_connect_cfm(conn, status);
+ hci_conn_drop(conn);
+ }
}
+
+ hci_dev_unlock(hdev);
}
-/* Command Status OGF HOST_CTL */
-static void hci_cs_host_ctl(struct hci_dev *hdev, __u16 ocf, __u8 status)
+static void hci_setup_sync_conn_status(struct hci_dev *hdev, __u16 handle,
+ __u8 status)
{
- BT_DBG("%s ocf 0x%x", hdev->name, ocf);
+ struct hci_conn *acl;
+ struct hci_link *link;
- switch (ocf) {
- default:
- BT_DBG("%s Command status: ogf HOST_CTL ocf %x", hdev->name, ocf);
- break;
+ bt_dev_dbg(hdev, "handle 0x%4.4x status 0x%2.2x", handle, status);
+
+ hci_dev_lock(hdev);
+
+ acl = hci_conn_hash_lookup_handle(hdev, handle);
+ if (acl) {
+ link = list_first_entry_or_null(&acl->link_list,
+ struct hci_link, list);
+ if (link && link->conn) {
+ link->conn->state = BT_CLOSED;
+
+ hci_connect_cfm(link->conn, status);
+ hci_conn_del(link->conn);
+ }
}
+
+ hci_dev_unlock(hdev);
}
-/* Command Status OGF INFO_PARAM */
-static void hci_cs_info_param(struct hci_dev *hdev, __u16 ocf, __u8 status)
+static void hci_cs_setup_sync_conn(struct hci_dev *hdev, __u8 status)
{
- BT_DBG("%s: hci_cs_info_param: ocf 0x%x", hdev->name, ocf);
+ struct hci_cp_setup_sync_conn *cp;
- switch (ocf) {
- default:
- BT_DBG("%s Command status: ogf INFO_PARAM ocf %x", hdev->name, ocf);
- break;
- }
+ bt_dev_dbg(hdev, "status 0x%2.2x", status);
+
+ if (!status)
+ return;
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_SETUP_SYNC_CONN);
+ if (!cp)
+ return;
+
+ hci_setup_sync_conn_status(hdev, __le16_to_cpu(cp->handle), status);
}
-/* Inquiry Complete */
-static inline void hci_inquiry_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
+static void hci_cs_enhanced_setup_sync_conn(struct hci_dev *hdev, __u8 status)
{
- __u8 status = *((__u8 *) skb->data);
- struct hci_conn *pend;
+ struct hci_cp_enhanced_setup_sync_conn *cp;
- BT_DBG("%s status %d", hdev->name, status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", status);
- clear_bit(HCI_INQUIRY, &hdev->flags);
- hci_req_complete(hdev, status);
+ if (!status)
+ return;
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_ENHANCED_SETUP_SYNC_CONN);
+ if (!cp)
+ return;
+
+ hci_setup_sync_conn_status(hdev, __le16_to_cpu(cp->handle), status);
+}
+
+static void hci_cs_sniff_mode(struct hci_dev *hdev, __u8 status)
+{
+ struct hci_cp_sniff_mode *cp;
+ struct hci_conn *conn;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", status);
+
+ if (!status)
+ return;
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_SNIFF_MODE);
+ if (!cp)
+ return;
hci_dev_lock(hdev);
- pend = hci_conn_hash_lookup_state(hdev, ACL_LINK, BT_CONNECT2);
- if (pend)
- hci_acl_connect(pend);
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(cp->handle));
+ if (conn) {
+ clear_bit(HCI_CONN_MODE_CHANGE_PEND, &conn->flags);
+
+ if (test_and_clear_bit(HCI_CONN_SCO_SETUP_PEND, &conn->flags))
+ hci_sco_setup(conn, status);
+ }
hci_dev_unlock(hdev);
}
-/* Inquiry Result */
-static inline void hci_inquiry_result_evt(struct hci_dev *hdev, struct sk_buff *skb)
+static void hci_cs_exit_sniff_mode(struct hci_dev *hdev, __u8 status)
{
- struct inquiry_data data;
- struct inquiry_info *info = (struct inquiry_info *) (skb->data + 1);
- int num_rsp = *((__u8 *) skb->data);
+ struct hci_cp_exit_sniff_mode *cp;
+ struct hci_conn *conn;
- BT_DBG("%s num_rsp %d", hdev->name, num_rsp);
+ bt_dev_dbg(hdev, "status 0x%2.2x", status);
- if (!num_rsp)
+ if (!status)
+ return;
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_EXIT_SNIFF_MODE);
+ if (!cp)
return;
hci_dev_lock(hdev);
- for (; num_rsp; num_rsp--) {
- bacpy(&data.bdaddr, &info->bdaddr);
- data.pscan_rep_mode = info->pscan_rep_mode;
- data.pscan_period_mode = info->pscan_period_mode;
- data.pscan_mode = info->pscan_mode;
- memcpy(data.dev_class, info->dev_class, 3);
- data.clock_offset = info->clock_offset;
- data.rssi = 0x00;
- info++;
- hci_inquiry_cache_update(hdev, &data);
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(cp->handle));
+ if (conn) {
+ clear_bit(HCI_CONN_MODE_CHANGE_PEND, &conn->flags);
+
+ if (test_and_clear_bit(HCI_CONN_SCO_SETUP_PEND, &conn->flags))
+ hci_sco_setup(conn, status);
}
hci_dev_unlock(hdev);
}
-/* Inquiry Result With RSSI */
-static inline void hci_inquiry_result_with_rssi_evt(struct hci_dev *hdev, struct sk_buff *skb)
+static void hci_cs_disconnect(struct hci_dev *hdev, u8 status)
{
- struct inquiry_data data;
- int num_rsp = *((__u8 *) skb->data);
+ struct hci_cp_disconnect *cp;
+ struct hci_conn_params *params;
+ struct hci_conn *conn;
+ bool mgmt_conn;
- BT_DBG("%s num_rsp %d", hdev->name, num_rsp);
+ bt_dev_dbg(hdev, "status 0x%2.2x", status);
- if (!num_rsp)
+ /* Wait for HCI_EV_DISCONN_COMPLETE if status 0x00 and not suspended
+ * otherwise cleanup the connection immediately.
+ */
+ if (!status && !hdev->suspended)
+ return;
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_DISCONNECT);
+ if (!cp)
return;
hci_dev_lock(hdev);
- if ((skb->len - 1) / num_rsp != sizeof(struct inquiry_info_with_rssi)) {
- struct inquiry_info_with_rssi_and_pscan_mode *info =
- (struct inquiry_info_with_rssi_and_pscan_mode *) (skb->data + 1);
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(cp->handle));
+ if (!conn)
+ goto unlock;
- for (; num_rsp; num_rsp--) {
- bacpy(&data.bdaddr, &info->bdaddr);
- data.pscan_rep_mode = info->pscan_rep_mode;
- data.pscan_period_mode = info->pscan_period_mode;
- data.pscan_mode = info->pscan_mode;
- memcpy(data.dev_class, info->dev_class, 3);
- data.clock_offset = info->clock_offset;
- data.rssi = info->rssi;
- info++;
- hci_inquiry_cache_update(hdev, &data);
+ if (status && status != HCI_ERROR_UNKNOWN_CONN_ID) {
+ mgmt_disconnect_failed(hdev, &conn->dst, conn->type,
+ conn->dst_type, status);
+
+ if (conn->type == LE_LINK && conn->role == HCI_ROLE_SLAVE) {
+ hdev->cur_adv_instance = conn->adv_instance;
+ hci_enable_advertising(hdev);
}
- } else {
- struct inquiry_info_with_rssi *info =
- (struct inquiry_info_with_rssi *) (skb->data + 1);
- for (; num_rsp; num_rsp--) {
- bacpy(&data.bdaddr, &info->bdaddr);
- data.pscan_rep_mode = info->pscan_rep_mode;
- data.pscan_period_mode = info->pscan_period_mode;
- data.pscan_mode = 0x00;
- memcpy(data.dev_class, info->dev_class, 3);
- data.clock_offset = info->clock_offset;
- data.rssi = info->rssi;
- info++;
- hci_inquiry_cache_update(hdev, &data);
+ /* Inform sockets conn is gone before we delete it */
+ hci_disconn_cfm(conn, HCI_ERROR_UNSPECIFIED);
+
+ goto done;
+ }
+
+ /* During suspend, mark connection as closed immediately
+ * since we might not receive HCI_EV_DISCONN_COMPLETE
+ */
+ if (hdev->suspended)
+ conn->state = BT_CLOSED;
+
+ mgmt_conn = test_and_clear_bit(HCI_CONN_MGMT_CONNECTED, &conn->flags);
+
+ if (conn->type == ACL_LINK) {
+ if (test_and_clear_bit(HCI_CONN_FLUSH_KEY, &conn->flags))
+ hci_remove_link_key(hdev, &conn->dst);
+ }
+
+ params = hci_conn_params_lookup(hdev, &conn->dst, conn->dst_type);
+ if (params) {
+ switch (params->auto_connect) {
+ case HCI_AUTO_CONN_LINK_LOSS:
+ if (cp->reason != HCI_ERROR_CONNECTION_TIMEOUT)
+ break;
+ fallthrough;
+
+ case HCI_AUTO_CONN_DIRECT:
+ case HCI_AUTO_CONN_ALWAYS:
+ hci_pend_le_list_del_init(params);
+ hci_pend_le_list_add(params, &hdev->pend_le_conns);
+ break;
+
+ default:
+ break;
}
}
+ mgmt_device_disconnected(hdev, &conn->dst, conn->type, conn->dst_type,
+ cp->reason, mgmt_conn);
+
+ hci_disconn_cfm(conn, cp->reason);
+
+done:
+ /* If the disconnection failed for any reason, the upper layer
+ * does not retry to disconnect in current implementation.
+ * Hence, we need to do some basic cleanup here and re-enable
+ * advertising if necessary.
+ */
+ hci_conn_del(conn);
+unlock:
hci_dev_unlock(hdev);
}
-/* Extended Inquiry Result */
-static inline void hci_extended_inquiry_result_evt(struct hci_dev *hdev, struct sk_buff *skb)
+static u8 ev_bdaddr_type(struct hci_dev *hdev, u8 type, bool *resolved)
{
- struct inquiry_data data;
- struct extended_inquiry_info *info = (struct extended_inquiry_info *) (skb->data + 1);
- int num_rsp = *((__u8 *) skb->data);
+ /* When using controller based address resolution, then the new
+ * address types 0x02 and 0x03 are used. These types need to be
+ * converted back into either public address or random address type
+ */
+ switch (type) {
+ case ADDR_LE_DEV_PUBLIC_RESOLVED:
+ if (resolved)
+ *resolved = true;
+ return ADDR_LE_DEV_PUBLIC;
+ case ADDR_LE_DEV_RANDOM_RESOLVED:
+ if (resolved)
+ *resolved = true;
+ return ADDR_LE_DEV_RANDOM;
+ }
+
+ if (resolved)
+ *resolved = false;
+ return type;
+}
- BT_DBG("%s num_rsp %d", hdev->name, num_rsp);
+static void cs_le_create_conn(struct hci_dev *hdev, bdaddr_t *peer_addr,
+ u8 peer_addr_type, u8 own_address_type,
+ u8 filter_policy)
+{
+ struct hci_conn *conn;
- if (!num_rsp)
+ conn = hci_conn_hash_lookup_le(hdev, peer_addr,
+ peer_addr_type);
+ if (!conn)
+ return;
+
+ own_address_type = ev_bdaddr_type(hdev, own_address_type, NULL);
+
+ /* Store the initiator and responder address information which
+ * is needed for SMP. These values will not change during the
+ * lifetime of the connection.
+ */
+ conn->init_addr_type = own_address_type;
+ if (own_address_type == ADDR_LE_DEV_RANDOM)
+ bacpy(&conn->init_addr, &hdev->random_addr);
+ else
+ bacpy(&conn->init_addr, &hdev->bdaddr);
+
+ conn->resp_addr_type = peer_addr_type;
+ bacpy(&conn->resp_addr, peer_addr);
+}
+
+static void hci_cs_le_create_conn(struct hci_dev *hdev, u8 status)
+{
+ struct hci_cp_le_create_conn *cp;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", status);
+
+ /* All connection failure handling is taken care of by the
+ * hci_conn_failed function which is triggered by the HCI
+ * request completion callbacks used for connecting.
+ */
+ if (status)
+ return;
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_LE_CREATE_CONN);
+ if (!cp)
return;
hci_dev_lock(hdev);
- for (; num_rsp; num_rsp--) {
- bacpy(&data.bdaddr, &info->bdaddr);
- data.pscan_rep_mode = info->pscan_rep_mode;
- data.pscan_period_mode = info->pscan_period_mode;
- data.pscan_mode = 0x00;
- memcpy(data.dev_class, info->dev_class, 3);
- data.clock_offset = info->clock_offset;
- data.rssi = info->rssi;
- info++;
- hci_inquiry_cache_update(hdev, &data);
- }
+ cs_le_create_conn(hdev, &cp->peer_addr, cp->peer_addr_type,
+ cp->own_address_type, cp->filter_policy);
hci_dev_unlock(hdev);
}
-/* Connect Request */
-static inline void hci_conn_request_evt(struct hci_dev *hdev, struct sk_buff *skb)
+static void hci_cs_le_ext_create_conn(struct hci_dev *hdev, u8 status)
{
- struct hci_ev_conn_request *ev = (struct hci_ev_conn_request *) skb->data;
- int mask = hdev->link_mode;
+ struct hci_cp_le_ext_create_conn *cp;
- BT_DBG("%s Connection request: %s type 0x%x", hdev->name,
- batostr(&ev->bdaddr), ev->link_type);
+ bt_dev_dbg(hdev, "status 0x%2.2x", status);
- mask |= hci_proto_connect_ind(hdev, &ev->bdaddr, ev->link_type);
+ /* All connection failure handling is taken care of by the
+ * hci_conn_failed function which is triggered by the HCI
+ * request completion callbacks used for connecting.
+ */
+ if (status)
+ return;
- if (mask & HCI_LM_ACCEPT) {
- /* Connection accepted */
- struct hci_conn *conn;
- struct hci_cp_accept_conn_req cp;
+ cp = hci_sent_cmd_data(hdev, HCI_OP_LE_EXT_CREATE_CONN);
+ if (!cp)
+ return;
- hci_dev_lock(hdev);
- conn = hci_conn_hash_lookup_ba(hdev, ev->link_type, &ev->bdaddr);
- if (!conn) {
- if (!(conn = hci_conn_add(hdev, ev->link_type, &ev->bdaddr))) {
- BT_ERR("No memmory for new connection");
- hci_dev_unlock(hdev);
- return;
- }
- }
- memcpy(conn->dev_class, ev->dev_class, 3);
- conn->state = BT_CONNECT;
- hci_dev_unlock(hdev);
+ hci_dev_lock(hdev);
- bacpy(&cp.bdaddr, &ev->bdaddr);
+ cs_le_create_conn(hdev, &cp->peer_addr, cp->peer_addr_type,
+ cp->own_addr_type, cp->filter_policy);
- if (lmp_rswitch_capable(hdev) && (mask & HCI_LM_MASTER))
- cp.role = 0x00; /* Become master */
- else
- cp.role = 0x01; /* Remain slave */
+ hci_dev_unlock(hdev);
+}
+
+static void hci_cs_le_read_remote_features(struct hci_dev *hdev, u8 status)
+{
+ struct hci_cp_le_read_remote_features *cp;
+ struct hci_conn *conn;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", status);
+
+ if (!status)
+ return;
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_LE_READ_REMOTE_FEATURES);
+ if (!cp)
+ return;
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(cp->handle));
+ if (conn && conn->state == BT_CONFIG)
+ hci_connect_cfm(conn, status);
+
+ hci_dev_unlock(hdev);
+}
+
+static void hci_cs_le_start_enc(struct hci_dev *hdev, u8 status)
+{
+ struct hci_cp_le_start_enc *cp;
+ struct hci_conn *conn;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", status);
+
+ if (!status)
+ return;
+
+ hci_dev_lock(hdev);
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_LE_START_ENC);
+ if (!cp)
+ goto unlock;
+
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(cp->handle));
+ if (!conn)
+ goto unlock;
+
+ if (conn->state != BT_CONNECTED)
+ goto unlock;
+
+ hci_disconnect(conn, HCI_ERROR_AUTH_FAILURE);
+ hci_conn_drop(conn);
+
+unlock:
+ hci_dev_unlock(hdev);
+}
+
+static void hci_cs_switch_role(struct hci_dev *hdev, u8 status)
+{
+ struct hci_cp_switch_role *cp;
+ struct hci_conn *conn;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, status);
- hci_send_cmd(hdev, OGF_LINK_CTL,
- OCF_ACCEPT_CONN_REQ, sizeof(cp), &cp);
+ if (!status)
+ return;
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_SWITCH_ROLE);
+ if (!cp)
+ return;
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &cp->bdaddr);
+ if (conn)
+ clear_bit(HCI_CONN_RSWITCH_PEND, &conn->flags);
+
+ hci_dev_unlock(hdev);
+}
+
+static void hci_inquiry_complete_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_status *ev = data;
+ struct discovery_state *discov = &hdev->discovery;
+ struct inquiry_entry *e;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", ev->status);
+
+ if (!test_and_clear_bit(HCI_INQUIRY, &hdev->flags))
+ return;
+
+ smp_mb__after_atomic(); /* wake_up_bit advises about this barrier */
+ wake_up_bit(&hdev->flags, HCI_INQUIRY);
+
+ if (!hci_dev_test_flag(hdev, HCI_MGMT))
+ return;
+
+ hci_dev_lock(hdev);
+
+ if (discov->state != DISCOVERY_FINDING)
+ goto unlock;
+
+ if (list_empty(&discov->resolve)) {
+ /* When BR/EDR inquiry is active and no LE scanning is in
+ * progress, then change discovery state to indicate completion.
+ *
+ * When running LE scanning and BR/EDR inquiry simultaneously
+ * and the LE scan already finished, then change the discovery
+ * state to indicate completion.
+ */
+ if (!hci_dev_test_flag(hdev, HCI_LE_SCAN) ||
+ !hci_test_quirk(hdev, HCI_QUIRK_SIMULTANEOUS_DISCOVERY))
+ hci_discovery_set_state(hdev, DISCOVERY_STOPPED);
+ goto unlock;
+ }
+
+ e = hci_inquiry_cache_lookup_resolve(hdev, BDADDR_ANY, NAME_NEEDED);
+ if (e && hci_resolve_name(hdev, e) == 0) {
+ e->name_state = NAME_PENDING;
+ hci_discovery_set_state(hdev, DISCOVERY_RESOLVING);
+ discov->name_resolve_timeout = jiffies + NAME_RESOLVE_DURATION;
} else {
- /* Connection rejected */
- struct hci_cp_reject_conn_req cp;
+ /* When BR/EDR inquiry is active and no LE scanning is in
+ * progress, then change discovery state to indicate completion.
+ *
+ * When running LE scanning and BR/EDR inquiry simultaneously
+ * and the LE scan already finished, then change the discovery
+ * state to indicate completion.
+ */
+ if (!hci_dev_test_flag(hdev, HCI_LE_SCAN) ||
+ !hci_test_quirk(hdev, HCI_QUIRK_SIMULTANEOUS_DISCOVERY))
+ hci_discovery_set_state(hdev, DISCOVERY_STOPPED);
+ }
- bacpy(&cp.bdaddr, &ev->bdaddr);
- cp.reason = 0x0f;
- hci_send_cmd(hdev, OGF_LINK_CTL,
- OCF_REJECT_CONN_REQ, sizeof(cp), &cp);
+unlock:
+ hci_dev_unlock(hdev);
+}
+
+static void hci_inquiry_result_evt(struct hci_dev *hdev, void *edata,
+ struct sk_buff *skb)
+{
+ struct hci_ev_inquiry_result *ev = edata;
+ struct inquiry_data data;
+ int i;
+
+ if (!hci_ev_skb_pull(hdev, skb, HCI_EV_INQUIRY_RESULT,
+ flex_array_size(ev, info, ev->num)))
+ return;
+
+ bt_dev_dbg(hdev, "num %d", ev->num);
+
+ if (!ev->num)
+ return;
+
+ if (hci_dev_test_flag(hdev, HCI_PERIODIC_INQ))
+ return;
+
+ hci_dev_lock(hdev);
+
+ for (i = 0; i < ev->num; i++) {
+ struct inquiry_info *info = &ev->info[i];
+ u32 flags;
+
+ bacpy(&data.bdaddr, &info->bdaddr);
+ data.pscan_rep_mode = info->pscan_rep_mode;
+ data.pscan_period_mode = info->pscan_period_mode;
+ data.pscan_mode = info->pscan_mode;
+ memcpy(data.dev_class, info->dev_class, 3);
+ data.clock_offset = info->clock_offset;
+ data.rssi = HCI_RSSI_INVALID;
+ data.ssp_mode = 0x00;
+
+ flags = hci_inquiry_cache_update(hdev, &data, false);
+
+ mgmt_device_found(hdev, &info->bdaddr, ACL_LINK, 0x00,
+ info->dev_class, HCI_RSSI_INVALID,
+ flags, NULL, 0, NULL, 0, 0);
}
+
+ hci_dev_unlock(hdev);
}
-/* Connect Complete */
-static inline void hci_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
+static int hci_read_enc_key_size(struct hci_dev *hdev, struct hci_conn *conn)
{
- struct hci_ev_conn_complete *ev = (struct hci_ev_conn_complete *) skb->data;
- struct hci_conn *conn, *pend;
+ struct hci_cp_read_enc_key_size cp;
+ u8 *key_enc_size = hci_conn_key_enc_size(conn);
+
+ if (!read_key_size_capable(hdev)) {
+ conn->enc_key_size = HCI_LINK_KEY_SIZE;
+ return -EOPNOTSUPP;
+ }
+
+ bt_dev_dbg(hdev, "hcon %p", conn);
+
+ memset(&cp, 0, sizeof(cp));
+ cp.handle = cpu_to_le16(conn->handle);
+
+ /* If the key enc_size is already known, use it as conn->enc_key_size,
+ * otherwise use hdev->min_enc_key_size so the likes of
+ * l2cap_check_enc_key_size don't fail while waiting for
+ * HCI_OP_READ_ENC_KEY_SIZE response.
+ */
+ if (key_enc_size && *key_enc_size)
+ conn->enc_key_size = *key_enc_size;
+ else
+ conn->enc_key_size = hdev->min_enc_key_size;
+
+ return hci_send_cmd(hdev, HCI_OP_READ_ENC_KEY_SIZE, sizeof(cp), &cp);
+}
+
+static void hci_conn_complete_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_conn_complete *ev = data;
+ struct hci_conn *conn;
+ u8 status = ev->status;
- BT_DBG("%s", hdev->name);
+ bt_dev_dbg(hdev, "status 0x%2.2x", status);
hci_dev_lock(hdev);
+ /* Check for existing connection:
+ *
+ * 1. If it doesn't exist then it must be receiver/slave role.
+ * 2. If it does exist confirm that it is connecting/BT_CONNECT in case
+ * of initiator/master role since there could be a collision where
+ * either side is attempting to connect or something like a fuzzing
+ * testing is trying to play tricks to destroy the hcon object before
+ * it even attempts to connect (e.g. hcon->state == BT_OPEN).
+ */
conn = hci_conn_hash_lookup_ba(hdev, ev->link_type, &ev->bdaddr);
- if (!conn) {
- hci_dev_unlock(hdev);
- return;
+ if (!conn ||
+ (conn->role == HCI_ROLE_MASTER && conn->state != BT_CONNECT)) {
+ /* In case of error status and there is no connection pending
+ * just unlock as there is nothing to cleanup.
+ */
+ if (ev->status)
+ goto unlock;
+
+ /* Connection may not exist if auto-connected. Check the bredr
+ * allowlist to see if this device is allowed to auto connect.
+ * If link is an ACL type, create a connection class
+ * automatically.
+ *
+ * Auto-connect will only occur if the event filter is
+ * programmed with a given address. Right now, event filter is
+ * only used during suspend.
+ */
+ if (ev->link_type == ACL_LINK &&
+ hci_bdaddr_list_lookup_with_flags(&hdev->accept_list,
+ &ev->bdaddr,
+ BDADDR_BREDR)) {
+ conn = hci_conn_add_unset(hdev, ev->link_type,
+ &ev->bdaddr, 0,
+ HCI_ROLE_SLAVE);
+ if (IS_ERR(conn)) {
+ bt_dev_err(hdev, "connection err: %ld", PTR_ERR(conn));
+ goto unlock;
+ }
+ } else {
+ if (ev->link_type != SCO_LINK)
+ goto unlock;
+
+ conn = hci_conn_hash_lookup_ba(hdev, ESCO_LINK,
+ &ev->bdaddr);
+ if (!conn)
+ goto unlock;
+
+ conn->type = SCO_LINK;
+ }
}
- if (!ev->status) {
- conn->handle = __le16_to_cpu(ev->handle);
- conn->state = BT_CONNECTED;
+ /* The HCI_Connection_Complete event is only sent once per connection.
+ * Processing it more than once per connection can corrupt kernel memory.
+ *
+ * As the connection handle is set here for the first time, it indicates
+ * whether the connection is already set up.
+ */
+ if (!HCI_CONN_HANDLE_UNSET(conn->handle)) {
+ bt_dev_err(hdev, "Ignoring HCI_Connection_Complete for existing connection");
+ goto unlock;
+ }
+
+ if (!status) {
+ status = hci_conn_set_handle(conn, __le16_to_cpu(ev->handle));
+ if (status)
+ goto done;
+
+ if (conn->type == ACL_LINK) {
+ conn->state = BT_CONFIG;
+ hci_conn_hold(conn);
+
+ if (!conn->out && !hci_conn_ssp_enabled(conn) &&
+ !hci_find_link_key(hdev, &ev->bdaddr))
+ conn->disc_timeout = HCI_PAIRING_TIMEOUT;
+ else
+ conn->disc_timeout = HCI_DISCONN_TIMEOUT;
+ } else
+ conn->state = BT_CONNECTED;
+
+ hci_debugfs_create_conn(conn);
+ hci_conn_add_sysfs(conn);
if (test_bit(HCI_AUTH, &hdev->flags))
- conn->link_mode |= HCI_LM_AUTH;
+ set_bit(HCI_CONN_AUTH, &conn->flags);
if (test_bit(HCI_ENCRYPT, &hdev->flags))
- conn->link_mode |= HCI_LM_ENCRYPT;
+ set_bit(HCI_CONN_ENCRYPT, &conn->flags);
+
+ /* "Link key request" completed ahead of "connect request" completes */
+ if (ev->encr_mode == 1 && !test_bit(HCI_CONN_ENCRYPT, &conn->flags) &&
+ ev->link_type == ACL_LINK) {
+ struct link_key *key;
+
+ key = hci_find_link_key(hdev, &ev->bdaddr);
+ if (key) {
+ set_bit(HCI_CONN_ENCRYPT, &conn->flags);
+ hci_read_enc_key_size(hdev, conn);
+ hci_encrypt_cfm(conn, ev->status);
+ }
+ }
/* Get remote features */
if (conn->type == ACL_LINK) {
struct hci_cp_read_remote_features cp;
cp.handle = ev->handle;
- hci_send_cmd(hdev, OGF_LINK_CTL,
- OCF_READ_REMOTE_FEATURES, sizeof(cp), &cp);
- }
+ hci_send_cmd(hdev, HCI_OP_READ_REMOTE_FEATURES,
+ sizeof(cp), &cp);
- /* Set link policy */
- if (conn->type == ACL_LINK && hdev->link_policy) {
- struct hci_cp_write_link_policy cp;
- cp.handle = ev->handle;
- cp.policy = __cpu_to_le16(hdev->link_policy);
- hci_send_cmd(hdev, OGF_LINK_POLICY,
- OCF_WRITE_LINK_POLICY, sizeof(cp), &cp);
+ hci_update_scan(hdev);
}
/* Set packet type for incoming connection */
- if (!conn->out) {
+ if (!conn->out && hdev->hci_ver < BLUETOOTH_VER_2_0) {
struct hci_cp_change_conn_ptype cp;
cp.handle = ev->handle;
- cp.pkt_type = (conn->type == ACL_LINK) ?
- __cpu_to_le16(hdev->pkt_type & ACL_PTYPE_MASK):
- __cpu_to_le16(hdev->pkt_type & SCO_PTYPE_MASK);
-
- hci_send_cmd(hdev, OGF_LINK_CTL,
- OCF_CHANGE_CONN_PTYPE, sizeof(cp), &cp);
- } else {
- /* Update disconnect timer */
- hci_conn_hold(conn);
- hci_conn_put(conn);
+ cp.pkt_type = cpu_to_le16(conn->pkt_type);
+ hci_send_cmd(hdev, HCI_OP_CHANGE_CONN_PTYPE, sizeof(cp),
+ &cp);
}
- } else
- conn->state = BT_CLOSED;
+ }
- if (conn->type == ACL_LINK) {
- struct hci_conn *sco = conn->link;
- if (sco) {
- if (!ev->status)
- hci_add_sco(sco, conn->handle);
- else {
- hci_proto_connect_cfm(sco, ev->status);
- hci_conn_del(sco);
- }
+ if (conn->type == ACL_LINK)
+ hci_sco_setup(conn, ev->status);
+
+done:
+ if (status) {
+ hci_conn_failed(conn, status);
+ } else if (ev->link_type == SCO_LINK) {
+ switch (conn->setting & SCO_AIRMODE_MASK) {
+ case SCO_AIRMODE_CVSD:
+ if (hdev->notify)
+ hdev->notify(hdev, HCI_NOTIFY_ENABLE_SCO_CVSD);
+ break;
}
+
+ hci_connect_cfm(conn, status);
}
- hci_proto_connect_cfm(conn, ev->status);
- if (ev->status)
- hci_conn_del(conn);
+unlock:
+ hci_dev_unlock(hdev);
+}
- pend = hci_conn_hash_lookup_state(hdev, ACL_LINK, BT_CONNECT2);
- if (pend)
- hci_acl_connect(pend);
+static void hci_reject_conn(struct hci_dev *hdev, bdaddr_t *bdaddr)
+{
+ struct hci_cp_reject_conn_req cp;
- hci_dev_unlock(hdev);
+ bacpy(&cp.bdaddr, bdaddr);
+ cp.reason = HCI_ERROR_REJ_BAD_ADDR;
+ hci_send_cmd(hdev, HCI_OP_REJECT_CONN_REQ, sizeof(cp), &cp);
}
-/* Disconnect Complete */
-static inline void hci_disconn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
+static void hci_conn_request_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_ev_disconn_complete *ev = (struct hci_ev_disconn_complete *) skb->data;
+ struct hci_ev_conn_request *ev = data;
+ int mask = hdev->link_mode;
+ struct inquiry_entry *ie;
struct hci_conn *conn;
+ __u8 flags = 0;
- BT_DBG("%s status %d", hdev->name, ev->status);
+ bt_dev_dbg(hdev, "bdaddr %pMR type 0x%x", &ev->bdaddr, ev->link_type);
- if (ev->status)
+ /* Reject incoming connection from device with same BD ADDR against
+ * CVE-2020-26555
+ */
+ if (hdev && !bacmp(&hdev->bdaddr, &ev->bdaddr)) {
+ bt_dev_dbg(hdev, "Reject connection with same BD_ADDR %pMR\n",
+ &ev->bdaddr);
+ hci_reject_conn(hdev, &ev->bdaddr);
+ return;
+ }
+
+ mask |= hci_proto_connect_ind(hdev, &ev->bdaddr, ev->link_type,
+ &flags);
+
+ if (!(mask & HCI_LM_ACCEPT)) {
+ hci_reject_conn(hdev, &ev->bdaddr);
return;
+ }
hci_dev_lock(hdev);
- conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle));
- if (conn) {
- conn->state = BT_CLOSED;
- hci_proto_disconn_ind(conn, ev->reason);
- hci_conn_del(conn);
+ if (hci_bdaddr_list_lookup(&hdev->reject_list, &ev->bdaddr,
+ BDADDR_BREDR)) {
+ hci_reject_conn(hdev, &ev->bdaddr);
+ goto unlock;
+ }
+
+ /* Require HCI_CONNECTABLE or an accept list entry to accept the
+ * connection. These features are only touched through mgmt so
+ * only do the checks if HCI_MGMT is set.
+ */
+ if (hci_dev_test_flag(hdev, HCI_MGMT) &&
+ !hci_dev_test_flag(hdev, HCI_CONNECTABLE) &&
+ !hci_bdaddr_list_lookup_with_flags(&hdev->accept_list, &ev->bdaddr,
+ BDADDR_BREDR)) {
+ hci_reject_conn(hdev, &ev->bdaddr);
+ goto unlock;
+ }
+
+ /* Connection accepted */
+
+ ie = hci_inquiry_cache_lookup(hdev, &ev->bdaddr);
+ if (ie)
+ memcpy(ie->data.dev_class, ev->dev_class, 3);
+
+ conn = hci_conn_hash_lookup_ba(hdev, ev->link_type,
+ &ev->bdaddr);
+ if (!conn) {
+ conn = hci_conn_add_unset(hdev, ev->link_type, &ev->bdaddr, 0,
+ HCI_ROLE_SLAVE);
+ if (IS_ERR(conn)) {
+ bt_dev_err(hdev, "connection err: %ld", PTR_ERR(conn));
+ goto unlock;
+ }
+ }
+
+ memcpy(conn->dev_class, ev->dev_class, 3);
+
+ hci_dev_unlock(hdev);
+
+ if (ev->link_type == ACL_LINK ||
+ (!(flags & HCI_PROTO_DEFER) && !lmp_esco_capable(hdev))) {
+ struct hci_cp_accept_conn_req cp;
+ conn->state = BT_CONNECT;
+
+ bacpy(&cp.bdaddr, &ev->bdaddr);
+
+ if (lmp_rswitch_capable(hdev) && (mask & HCI_LM_MASTER))
+ cp.role = 0x00; /* Become central */
+ else
+ cp.role = 0x01; /* Remain peripheral */
+
+ hci_send_cmd(hdev, HCI_OP_ACCEPT_CONN_REQ, sizeof(cp), &cp);
+ } else if (!(flags & HCI_PROTO_DEFER)) {
+ struct hci_cp_accept_sync_conn_req cp;
+ conn->state = BT_CONNECT;
+
+ bacpy(&cp.bdaddr, &ev->bdaddr);
+ cp.pkt_type = cpu_to_le16(conn->pkt_type);
+
+ cp.tx_bandwidth = cpu_to_le32(0x00001f40);
+ cp.rx_bandwidth = cpu_to_le32(0x00001f40);
+ cp.max_latency = cpu_to_le16(0xffff);
+ cp.content_format = cpu_to_le16(hdev->voice_setting);
+ cp.retrans_effort = 0xff;
+
+ hci_send_cmd(hdev, HCI_OP_ACCEPT_SYNC_CONN_REQ, sizeof(cp),
+ &cp);
+ } else {
+ conn->state = BT_CONNECT2;
+ hci_connect_cfm(conn, 0);
}
+ return;
+unlock:
hci_dev_unlock(hdev);
}
-/* Number of completed packets */
-static inline void hci_num_comp_pkts_evt(struct hci_dev *hdev, struct sk_buff *skb)
+static u8 hci_to_mgmt_reason(u8 err)
{
- struct hci_ev_num_comp_pkts *ev = (struct hci_ev_num_comp_pkts *) skb->data;
- __le16 *ptr;
- int i;
+ switch (err) {
+ case HCI_ERROR_CONNECTION_TIMEOUT:
+ return MGMT_DEV_DISCONN_TIMEOUT;
+ case HCI_ERROR_REMOTE_USER_TERM:
+ case HCI_ERROR_REMOTE_LOW_RESOURCES:
+ case HCI_ERROR_REMOTE_POWER_OFF:
+ return MGMT_DEV_DISCONN_REMOTE;
+ case HCI_ERROR_LOCAL_HOST_TERM:
+ return MGMT_DEV_DISCONN_LOCAL_HOST;
+ default:
+ return MGMT_DEV_DISCONN_UNKNOWN;
+ }
+}
+
+static void hci_disconn_complete_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_disconn_complete *ev = data;
+ u8 reason;
+ struct hci_conn_params *params;
+ struct hci_conn *conn;
+ bool mgmt_connected;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", ev->status);
- skb_pull(skb, sizeof(*ev));
+ hci_dev_lock(hdev);
- BT_DBG("%s num_hndl %d", hdev->name, ev->num_hndl);
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle));
+ if (!conn)
+ goto unlock;
- if (skb->len < ev->num_hndl * 4) {
- BT_DBG("%s bad parameters", hdev->name);
- return;
+ if (ev->status) {
+ mgmt_disconnect_failed(hdev, &conn->dst, conn->type,
+ conn->dst_type, ev->status);
+ goto unlock;
}
- tasklet_disable(&hdev->tx_task);
+ conn->state = BT_CLOSED;
- for (i = 0, ptr = (__le16 *) skb->data; i < ev->num_hndl; i++) {
- struct hci_conn *conn;
- __u16 handle, count;
+ mgmt_connected = test_and_clear_bit(HCI_CONN_MGMT_CONNECTED, &conn->flags);
- handle = __le16_to_cpu(get_unaligned(ptr++));
- count = __le16_to_cpu(get_unaligned(ptr++));
+ if (test_bit(HCI_CONN_AUTH_FAILURE, &conn->flags))
+ reason = MGMT_DEV_DISCONN_AUTH_FAILURE;
+ else
+ reason = hci_to_mgmt_reason(ev->reason);
- conn = hci_conn_hash_lookup_handle(hdev, handle);
- if (conn) {
- conn->sent -= count;
+ mgmt_device_disconnected(hdev, &conn->dst, conn->type, conn->dst_type,
+ reason, mgmt_connected);
- if (conn->type == SCO_LINK) {
- if ((hdev->sco_cnt += count) > hdev->sco_pkts)
- hdev->sco_cnt = hdev->sco_pkts;
- } else {
- if ((hdev->acl_cnt += count) > hdev->acl_pkts)
- hdev->acl_cnt = hdev->acl_pkts;
+ if (conn->type == ACL_LINK) {
+ if (test_and_clear_bit(HCI_CONN_FLUSH_KEY, &conn->flags))
+ hci_remove_link_key(hdev, &conn->dst);
+
+ hci_update_scan(hdev);
+ }
+
+ /* Re-enable passive scanning if disconnected device is marked
+ * as auto-connectable.
+ */
+ if (conn->type == LE_LINK) {
+ params = hci_conn_params_lookup(hdev, &conn->dst,
+ conn->dst_type);
+ if (params) {
+ switch (params->auto_connect) {
+ case HCI_AUTO_CONN_LINK_LOSS:
+ if (ev->reason != HCI_ERROR_CONNECTION_TIMEOUT)
+ break;
+ fallthrough;
+
+ case HCI_AUTO_CONN_DIRECT:
+ case HCI_AUTO_CONN_ALWAYS:
+ hci_pend_le_list_del_init(params);
+ hci_pend_le_list_add(params,
+ &hdev->pend_le_conns);
+ hci_update_passive_scan(hdev);
+ break;
+
+ default:
+ break;
}
}
}
- hci_sched_tx(hdev);
- tasklet_enable(&hdev->tx_task);
+ hci_disconn_cfm(conn, ev->reason);
+
+ /* Re-enable advertising if necessary, since it might
+ * have been disabled by the connection. From the
+ * HCI_LE_Set_Advertise_Enable command description in
+ * the core specification (v4.0):
+ * "The Controller shall continue advertising until the Host
+ * issues an LE_Set_Advertise_Enable command with
+ * Advertising_Enable set to 0x00 (Advertising is disabled)
+ * or until a connection is created or until the Advertising
+ * is timed out due to Directed Advertising."
+ */
+ if (conn->type == LE_LINK && conn->role == HCI_ROLE_SLAVE) {
+ hdev->cur_adv_instance = conn->adv_instance;
+ hci_enable_advertising(hdev);
+ }
+
+ hci_conn_del(conn);
+
+unlock:
+ hci_dev_unlock(hdev);
}
-/* Role Change */
-static inline void hci_role_change_evt(struct hci_dev *hdev, struct sk_buff *skb)
+static void hci_auth_complete_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_ev_role_change *ev = (struct hci_ev_role_change *) skb->data;
+ struct hci_ev_auth_complete *ev = data;
struct hci_conn *conn;
- BT_DBG("%s status %d", hdev->name, ev->status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", ev->status);
hci_dev_lock(hdev);
- conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr);
- if (conn) {
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle));
+ if (!conn)
+ goto unlock;
+
+ if (!ev->status) {
+ clear_bit(HCI_CONN_AUTH_FAILURE, &conn->flags);
+ set_bit(HCI_CONN_AUTH, &conn->flags);
+ conn->sec_level = conn->pending_sec_level;
+ } else {
+ if (ev->status == HCI_ERROR_PIN_OR_KEY_MISSING)
+ set_bit(HCI_CONN_AUTH_FAILURE, &conn->flags);
+
+ mgmt_auth_failed(conn, ev->status);
+ }
+
+ clear_bit(HCI_CONN_AUTH_PEND, &conn->flags);
+
+ if (conn->state == BT_CONFIG) {
+ if (!ev->status && hci_conn_ssp_enabled(conn)) {
+ struct hci_cp_set_conn_encrypt cp;
+ cp.handle = ev->handle;
+ cp.encrypt = 0x01;
+ hci_send_cmd(hdev, HCI_OP_SET_CONN_ENCRYPT, sizeof(cp),
+ &cp);
+ } else {
+ conn->state = BT_CONNECTED;
+ hci_connect_cfm(conn, ev->status);
+ hci_conn_drop(conn);
+ }
+ } else {
+ hci_auth_cfm(conn, ev->status);
+
+ hci_conn_hold(conn);
+ conn->disc_timeout = HCI_DISCONN_TIMEOUT;
+ hci_conn_drop(conn);
+ }
+
+ if (test_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags)) {
if (!ev->status) {
- if (ev->role)
- conn->link_mode &= ~HCI_LM_MASTER;
- else
- conn->link_mode |= HCI_LM_MASTER;
+ struct hci_cp_set_conn_encrypt cp;
+ cp.handle = ev->handle;
+ cp.encrypt = 0x01;
+ hci_send_cmd(hdev, HCI_OP_SET_CONN_ENCRYPT, sizeof(cp),
+ &cp);
+ } else {
+ clear_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags);
+ hci_encrypt_cfm(conn, ev->status);
}
+ }
- clear_bit(HCI_CONN_RSWITCH_PEND, &conn->pend);
+unlock:
+ hci_dev_unlock(hdev);
+}
- hci_role_switch_cfm(conn, ev->status, ev->role);
+static void hci_remote_name_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_remote_name *ev = data;
+ struct hci_conn *conn;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", ev->status);
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr);
+
+ if (!hci_dev_test_flag(hdev, HCI_MGMT))
+ goto check_auth;
+
+ if (ev->status == 0)
+ hci_check_pending_name(hdev, conn, &ev->bdaddr, ev->name,
+ strnlen(ev->name, HCI_MAX_NAME_LENGTH));
+ else
+ hci_check_pending_name(hdev, conn, &ev->bdaddr, NULL, 0);
+
+check_auth:
+ if (!conn)
+ goto unlock;
+
+ if (!hci_outgoing_auth_needed(hdev, conn))
+ goto unlock;
+
+ if (!test_and_set_bit(HCI_CONN_AUTH_PEND, &conn->flags)) {
+ struct hci_cp_auth_requested cp;
+
+ set_bit(HCI_CONN_AUTH_INITIATOR, &conn->flags);
+
+ cp.handle = __cpu_to_le16(conn->handle);
+ hci_send_cmd(hdev, HCI_OP_AUTH_REQUESTED, sizeof(cp), &cp);
}
+unlock:
hci_dev_unlock(hdev);
}
-/* Mode Change */
-static inline void hci_mode_change_evt(struct hci_dev *hdev, struct sk_buff *skb)
+static void hci_encrypt_change_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_ev_mode_change *ev = (struct hci_ev_mode_change *) skb->data;
+ struct hci_ev_encrypt_change *ev = data;
struct hci_conn *conn;
- BT_DBG("%s status %d", hdev->name, ev->status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", ev->status);
hci_dev_lock(hdev);
conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle));
- if (conn) {
- conn->mode = ev->mode;
- conn->interval = __le16_to_cpu(ev->interval);
+ if (!conn)
+ goto unlock;
- if (!test_and_clear_bit(HCI_CONN_MODE_CHANGE_PEND, &conn->pend)) {
- if (conn->mode == HCI_CM_ACTIVE)
- conn->power_save = 1;
- else
- conn->power_save = 0;
+ if (!ev->status) {
+ if (ev->encrypt) {
+ /* Encryption implies authentication */
+ set_bit(HCI_CONN_AUTH, &conn->flags);
+ set_bit(HCI_CONN_ENCRYPT, &conn->flags);
+ conn->sec_level = conn->pending_sec_level;
+
+ /* P-256 authentication key implies FIPS */
+ if (conn->key_type == HCI_LK_AUTH_COMBINATION_P256)
+ set_bit(HCI_CONN_FIPS, &conn->flags);
+
+ if ((conn->type == ACL_LINK && ev->encrypt == 0x02) ||
+ conn->type == LE_LINK)
+ set_bit(HCI_CONN_AES_CCM, &conn->flags);
+ } else {
+ clear_bit(HCI_CONN_ENCRYPT, &conn->flags);
+ clear_bit(HCI_CONN_AES_CCM, &conn->flags);
}
}
+ /* We should disregard the current RPA and generate a new one
+ * whenever the encryption procedure fails.
+ */
+ if (ev->status && conn->type == LE_LINK) {
+ hci_dev_set_flag(hdev, HCI_RPA_EXPIRED);
+ hci_adv_instances_set_rpa_expired(hdev, true);
+ }
+
+ clear_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags);
+
+ /* Check link security requirements are met */
+ if (!hci_conn_check_link_mode(conn))
+ ev->status = HCI_ERROR_AUTH_FAILURE;
+
+ if (ev->status && conn->state == BT_CONNECTED) {
+ if (ev->status == HCI_ERROR_PIN_OR_KEY_MISSING)
+ set_bit(HCI_CONN_AUTH_FAILURE, &conn->flags);
+
+ /* Notify upper layers so they can cleanup before
+ * disconnecting.
+ */
+ hci_encrypt_cfm(conn, ev->status);
+ hci_disconnect(conn, HCI_ERROR_AUTH_FAILURE);
+ hci_conn_drop(conn);
+ goto unlock;
+ }
+
+ /* Try reading the encryption key size for encrypted ACL links */
+ if (!ev->status && ev->encrypt && conn->type == ACL_LINK) {
+ if (hci_read_enc_key_size(hdev, conn))
+ goto notify;
+
+ goto unlock;
+ }
+
+ /* We skip the WRITE_AUTH_PAYLOAD_TIMEOUT for ATS2851 based controllers
+ * to avoid unexpected SMP command errors when pairing.
+ */
+ if (hci_test_quirk(hdev, HCI_QUIRK_BROKEN_WRITE_AUTH_PAYLOAD_TIMEOUT))
+ goto notify;
+
+ /* Set the default Authenticated Payload Timeout after
+ * an LE Link is established. As per Core Spec v5.0, Vol 2, Part B
+ * Section 3.3, the HCI command WRITE_AUTH_PAYLOAD_TIMEOUT should be
+ * sent when the link is active and Encryption is enabled, the conn
+ * type can be either LE or ACL and controller must support LMP Ping.
+ * Ensure for AES-CCM encryption as well.
+ */
+ if (test_bit(HCI_CONN_ENCRYPT, &conn->flags) &&
+ test_bit(HCI_CONN_AES_CCM, &conn->flags) &&
+ ((conn->type == ACL_LINK && lmp_ping_capable(hdev)) ||
+ (conn->type == LE_LINK && (hdev->le_features[0] & HCI_LE_PING)))) {
+ struct hci_cp_write_auth_payload_to cp;
+
+ cp.handle = cpu_to_le16(conn->handle);
+ cp.timeout = cpu_to_le16(hdev->auth_payload_timeout);
+ if (hci_send_cmd(conn->hdev, HCI_OP_WRITE_AUTH_PAYLOAD_TO,
+ sizeof(cp), &cp))
+ bt_dev_err(hdev, "write auth payload timeout failed");
+ }
+
+notify:
+ hci_encrypt_cfm(conn, ev->status);
+
+unlock:
hci_dev_unlock(hdev);
}
-/* Authentication Complete */
-static inline void hci_auth_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
+static void hci_change_link_key_complete_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_ev_auth_complete *ev = (struct hci_ev_auth_complete *) skb->data;
+ struct hci_ev_change_link_key_complete *ev = data;
struct hci_conn *conn;
- BT_DBG("%s status %d", hdev->name, ev->status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", ev->status);
hci_dev_lock(hdev);
conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle));
if (conn) {
if (!ev->status)
- conn->link_mode |= HCI_LM_AUTH;
+ set_bit(HCI_CONN_SECURE, &conn->flags);
- clear_bit(HCI_CONN_AUTH_PEND, &conn->pend);
+ clear_bit(HCI_CONN_AUTH_PEND, &conn->flags);
- hci_auth_cfm(conn, ev->status);
+ hci_key_change_cfm(conn, ev->status);
+ }
- if (test_bit(HCI_CONN_ENCRYPT_PEND, &conn->pend)) {
- if (!ev->status) {
- struct hci_cp_set_conn_encrypt cp;
- cp.handle = __cpu_to_le16(conn->handle);
- cp.encrypt = 1;
- hci_send_cmd(conn->hdev, OGF_LINK_CTL,
- OCF_SET_CONN_ENCRYPT, sizeof(cp), &cp);
- } else {
- clear_bit(HCI_CONN_ENCRYPT_PEND, &conn->pend);
- hci_encrypt_cfm(conn, ev->status, 0x00);
- }
+ hci_dev_unlock(hdev);
+}
+
+static void hci_remote_features_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_remote_features *ev = data;
+ struct hci_conn *conn;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", ev->status);
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle));
+ if (!conn)
+ goto unlock;
+
+ if (!ev->status)
+ memcpy(conn->features[0], ev->features, 8);
+
+ if (conn->state != BT_CONFIG)
+ goto unlock;
+
+ if (!ev->status && lmp_ext_feat_capable(hdev) &&
+ lmp_ext_feat_capable(conn)) {
+ struct hci_cp_read_remote_ext_features cp;
+ cp.handle = ev->handle;
+ cp.page = 0x01;
+ hci_send_cmd(hdev, HCI_OP_READ_REMOTE_EXT_FEATURES,
+ sizeof(cp), &cp);
+ goto unlock;
+ }
+
+ if (!ev->status) {
+ struct hci_cp_remote_name_req cp;
+ memset(&cp, 0, sizeof(cp));
+ bacpy(&cp.bdaddr, &conn->dst);
+ cp.pscan_rep_mode = 0x02;
+ hci_send_cmd(hdev, HCI_OP_REMOTE_NAME_REQ, sizeof(cp), &cp);
+ } else {
+ mgmt_device_connected(hdev, conn, NULL, 0);
+ }
+
+ if (!hci_outgoing_auth_needed(hdev, conn)) {
+ conn->state = BT_CONNECTED;
+ hci_connect_cfm(conn, ev->status);
+ hci_conn_drop(conn);
+ }
+
+unlock:
+ hci_dev_unlock(hdev);
+}
+
+static inline void handle_cmd_cnt_and_timer(struct hci_dev *hdev, u8 ncmd)
+{
+ cancel_delayed_work(&hdev->cmd_timer);
+
+ rcu_read_lock();
+ if (!test_bit(HCI_RESET, &hdev->flags)) {
+ if (ncmd) {
+ cancel_delayed_work(&hdev->ncmd_timer);
+ atomic_set(&hdev->cmd_cnt, 1);
+ } else {
+ if (!hci_dev_test_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE))
+ queue_delayed_work(hdev->workqueue, &hdev->ncmd_timer,
+ HCI_NCMD_TIMEOUT);
}
}
+ rcu_read_unlock();
+}
+
+static u8 hci_cc_le_read_buffer_size_v2(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_rp_le_read_buffer_size_v2 *rp = data;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ if (rp->status)
+ return rp->status;
+
+ hdev->le_mtu = __le16_to_cpu(rp->acl_mtu);
+ hdev->le_pkts = rp->acl_max_pkt;
+ hdev->iso_mtu = __le16_to_cpu(rp->iso_mtu);
+ hdev->iso_pkts = rp->iso_max_pkt;
+
+ hdev->le_cnt = hdev->le_pkts;
+ hdev->iso_cnt = hdev->iso_pkts;
+
+ BT_DBG("%s acl mtu %d:%d iso mtu %d:%d", hdev->name, hdev->acl_mtu,
+ hdev->acl_pkts, hdev->iso_mtu, hdev->iso_pkts);
+
+ if (hdev->le_mtu && hdev->le_mtu < HCI_MIN_LE_MTU)
+ return HCI_ERROR_INVALID_PARAMETERS;
+
+ return rp->status;
+}
+
+static void hci_unbound_cis_failed(struct hci_dev *hdev, u8 cig, u8 status)
+{
+ struct hci_conn *conn, *tmp;
+
+ lockdep_assert_held(&hdev->lock);
+
+ list_for_each_entry_safe(conn, tmp, &hdev->conn_hash.list, list) {
+ if (conn->type != CIS_LINK ||
+ conn->state == BT_OPEN || conn->iso_qos.ucast.cig != cig)
+ continue;
+
+ if (HCI_CONN_HANDLE_UNSET(conn->handle))
+ hci_conn_failed(conn, status);
+ }
+}
+
+static u8 hci_cc_le_set_cig_params(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_rp_le_set_cig_params *rp = data;
+ struct hci_cp_le_set_cig_params *cp;
+ struct hci_conn *conn;
+ u8 status = rp->status;
+ bool pending = false;
+ int i;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_CIG_PARAMS);
+ if (!rp->status && (!cp || rp->num_handles != cp->num_cis ||
+ rp->cig_id != cp->cig_id)) {
+ bt_dev_err(hdev, "unexpected Set CIG Parameters response data");
+ status = HCI_ERROR_UNSPECIFIED;
+ }
+
+ hci_dev_lock(hdev);
+
+ /* BLUETOOTH CORE SPECIFICATION Version 5.4 | Vol 4, Part E page 2554
+ *
+ * If the Status return parameter is non-zero, then the state of the CIG
+ * and its CIS configurations shall not be changed by the command. If
+ * the CIG did not already exist, it shall not be created.
+ */
+ if (status) {
+ /* Keep current configuration, fail only the unbound CIS */
+ hci_unbound_cis_failed(hdev, rp->cig_id, status);
+ goto unlock;
+ }
+
+ /* BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E page 2553
+ *
+ * If the Status return parameter is zero, then the Controller shall
+ * set the Connection_Handle arrayed return parameter to the connection
+ * handle(s) corresponding to the CIS configurations specified in
+ * the CIS_IDs command parameter, in the same order.
+ */
+ for (i = 0; i < rp->num_handles; ++i) {
+ conn = hci_conn_hash_lookup_cis(hdev, NULL, 0, rp->cig_id,
+ cp->cis[i].cis_id);
+ if (!conn || !bacmp(&conn->dst, BDADDR_ANY))
+ continue;
+
+ if (conn->state != BT_BOUND && conn->state != BT_CONNECT)
+ continue;
+
+ if (hci_conn_set_handle(conn, __le16_to_cpu(rp->handle[i])))
+ continue;
+
+ if (conn->state == BT_CONNECT)
+ pending = true;
+ }
+
+unlock:
+ if (pending)
+ hci_le_create_cis_pending(hdev);
hci_dev_unlock(hdev);
+
+ return rp->status;
}
-/* Encryption Change */
-static inline void hci_encrypt_change_evt(struct hci_dev *hdev, struct sk_buff *skb)
+static u8 hci_cc_le_setup_iso_path(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_ev_encrypt_change *ev = (struct hci_ev_encrypt_change *) skb->data;
+ struct hci_rp_le_setup_iso_path *rp = data;
+ struct hci_cp_le_setup_iso_path *cp;
struct hci_conn *conn;
- BT_DBG("%s status %d", hdev->name, ev->status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SETUP_ISO_PATH);
+ if (!cp)
+ return rp->status;
hci_dev_lock(hdev);
- conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle));
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(cp->handle));
+ if (!conn)
+ goto unlock;
+
+ if (rp->status) {
+ hci_connect_cfm(conn, rp->status);
+ hci_conn_del(conn);
+ goto unlock;
+ }
+
+ switch (cp->direction) {
+ /* Input (Host to Controller) */
+ case 0x00:
+ /* Only confirm connection if output only */
+ if (conn->iso_qos.ucast.out.sdu && !conn->iso_qos.ucast.in.sdu)
+ hci_connect_cfm(conn, rp->status);
+ break;
+ /* Output (Controller to Host) */
+ case 0x01:
+ /* Confirm connection since conn->iso_qos is always configured
+ * last.
+ */
+ hci_connect_cfm(conn, rp->status);
+
+ /* Notify device connected in case it is a BIG Sync */
+ if (!rp->status && test_bit(HCI_CONN_BIG_SYNC, &conn->flags))
+ mgmt_device_connected(hdev, conn, NULL, 0);
+
+ break;
+ }
+
+unlock:
+ hci_dev_unlock(hdev);
+ return rp->status;
+}
+
+static u8 hci_cc_le_read_all_local_features(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_rp_le_read_all_local_features *rp = data;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ if (rp->status)
+ return rp->status;
+
+ memcpy(hdev->le_features, rp->features, 248);
+
+ return rp->status;
+}
+
+static void hci_cs_le_create_big(struct hci_dev *hdev, u8 status)
+{
+ bt_dev_dbg(hdev, "status 0x%2.2x", status);
+}
+
+static void hci_cs_le_read_all_remote_features(struct hci_dev *hdev, u8 status)
+{
+ struct hci_cp_le_read_remote_features *cp;
+ struct hci_conn *conn;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", status);
+
+ if (!status)
+ return;
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_LE_READ_ALL_REMOTE_FEATURES);
+ if (!cp)
+ return;
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(cp->handle));
+ if (conn && conn->state == BT_CONFIG)
+ hci_connect_cfm(conn, status);
+
+ hci_dev_unlock(hdev);
+}
+
+static u8 hci_cc_set_per_adv_param(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_status *rp = data;
+ struct hci_cp_le_set_per_adv_params *cp;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ if (rp->status)
+ return rp->status;
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_PER_ADV_PARAMS);
+ if (!cp)
+ return rp->status;
+
+ /* TODO: set the conn state */
+ return rp->status;
+}
+
+static u8 hci_cc_le_set_per_adv_enable(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_status *rp = data;
+ struct hci_cp_le_set_per_adv_enable *cp;
+ struct adv_info *adv = NULL, *n;
+ u8 per_adv_cnt = 0;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ if (rp->status)
+ return rp->status;
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_PER_ADV_ENABLE);
+ if (!cp)
+ return rp->status;
+
+ hci_dev_lock(hdev);
+
+ adv = hci_find_adv_instance(hdev, cp->handle);
+
+ if (cp->enable) {
+ hci_dev_set_flag(hdev, HCI_LE_PER_ADV);
+
+ if (adv)
+ adv->periodic_enabled = true;
+ } else {
+ if (adv)
+ adv->periodic_enabled = false;
+
+ /* If just one instance was disabled check if there are
+ * any other instance enabled before clearing HCI_LE_PER_ADV.
+ * The current periodic adv instance will be marked as
+ * disabled once extended advertising is also disabled.
+ */
+ list_for_each_entry_safe(adv, n, &hdev->adv_instances,
+ list) {
+ if (adv->periodic && adv->enabled)
+ per_adv_cnt++;
+ }
+
+ if (per_adv_cnt > 1)
+ goto unlock;
+
+ hci_dev_clear_flag(hdev, HCI_LE_PER_ADV);
+ }
+
+unlock:
+ hci_dev_unlock(hdev);
+
+ return rp->status;
+}
+
+#define HCI_CC_VL(_op, _func, _min, _max) \
+{ \
+ .op = _op, \
+ .func = _func, \
+ .min_len = _min, \
+ .max_len = _max, \
+}
+
+#define HCI_CC(_op, _func, _len) \
+ HCI_CC_VL(_op, _func, _len, _len)
+
+#define HCI_CC_STATUS(_op, _func) \
+ HCI_CC(_op, _func, sizeof(struct hci_ev_status))
+
+static const struct hci_cc {
+ u16 op;
+ u8 (*func)(struct hci_dev *hdev, void *data, struct sk_buff *skb);
+ u16 min_len;
+ u16 max_len;
+} hci_cc_table[] = {
+ HCI_CC_STATUS(HCI_OP_INQUIRY_CANCEL, hci_cc_inquiry_cancel),
+ HCI_CC_STATUS(HCI_OP_PERIODIC_INQ, hci_cc_periodic_inq),
+ HCI_CC_STATUS(HCI_OP_EXIT_PERIODIC_INQ, hci_cc_exit_periodic_inq),
+ HCI_CC(HCI_OP_REMOTE_NAME_REQ_CANCEL, hci_cc_remote_name_req_cancel,
+ sizeof(struct hci_rp_remote_name_req_cancel)),
+ HCI_CC(HCI_OP_ROLE_DISCOVERY, hci_cc_role_discovery,
+ sizeof(struct hci_rp_role_discovery)),
+ HCI_CC(HCI_OP_READ_LINK_POLICY, hci_cc_read_link_policy,
+ sizeof(struct hci_rp_read_link_policy)),
+ HCI_CC(HCI_OP_WRITE_LINK_POLICY, hci_cc_write_link_policy,
+ sizeof(struct hci_rp_write_link_policy)),
+ HCI_CC(HCI_OP_READ_DEF_LINK_POLICY, hci_cc_read_def_link_policy,
+ sizeof(struct hci_rp_read_def_link_policy)),
+ HCI_CC_STATUS(HCI_OP_WRITE_DEF_LINK_POLICY,
+ hci_cc_write_def_link_policy),
+ HCI_CC_STATUS(HCI_OP_RESET, hci_cc_reset),
+ HCI_CC(HCI_OP_READ_STORED_LINK_KEY, hci_cc_read_stored_link_key,
+ sizeof(struct hci_rp_read_stored_link_key)),
+ HCI_CC(HCI_OP_DELETE_STORED_LINK_KEY, hci_cc_delete_stored_link_key,
+ sizeof(struct hci_rp_delete_stored_link_key)),
+ HCI_CC_STATUS(HCI_OP_WRITE_LOCAL_NAME, hci_cc_write_local_name),
+ HCI_CC(HCI_OP_READ_LOCAL_NAME, hci_cc_read_local_name,
+ sizeof(struct hci_rp_read_local_name)),
+ HCI_CC_STATUS(HCI_OP_WRITE_AUTH_ENABLE, hci_cc_write_auth_enable),
+ HCI_CC_STATUS(HCI_OP_WRITE_ENCRYPT_MODE, hci_cc_write_encrypt_mode),
+ HCI_CC_STATUS(HCI_OP_WRITE_SCAN_ENABLE, hci_cc_write_scan_enable),
+ HCI_CC_STATUS(HCI_OP_SET_EVENT_FLT, hci_cc_set_event_filter),
+ HCI_CC(HCI_OP_READ_CLASS_OF_DEV, hci_cc_read_class_of_dev,
+ sizeof(struct hci_rp_read_class_of_dev)),
+ HCI_CC_STATUS(HCI_OP_WRITE_CLASS_OF_DEV, hci_cc_write_class_of_dev),
+ HCI_CC(HCI_OP_READ_VOICE_SETTING, hci_cc_read_voice_setting,
+ sizeof(struct hci_rp_read_voice_setting)),
+ HCI_CC_STATUS(HCI_OP_WRITE_VOICE_SETTING, hci_cc_write_voice_setting),
+ HCI_CC(HCI_OP_READ_NUM_SUPPORTED_IAC, hci_cc_read_num_supported_iac,
+ sizeof(struct hci_rp_read_num_supported_iac)),
+ HCI_CC_STATUS(HCI_OP_WRITE_SSP_MODE, hci_cc_write_ssp_mode),
+ HCI_CC_STATUS(HCI_OP_WRITE_SC_SUPPORT, hci_cc_write_sc_support),
+ HCI_CC(HCI_OP_READ_AUTH_PAYLOAD_TO, hci_cc_read_auth_payload_timeout,
+ sizeof(struct hci_rp_read_auth_payload_to)),
+ HCI_CC(HCI_OP_WRITE_AUTH_PAYLOAD_TO, hci_cc_write_auth_payload_timeout,
+ sizeof(struct hci_rp_write_auth_payload_to)),
+ HCI_CC(HCI_OP_READ_LOCAL_VERSION, hci_cc_read_local_version,
+ sizeof(struct hci_rp_read_local_version)),
+ HCI_CC(HCI_OP_READ_LOCAL_COMMANDS, hci_cc_read_local_commands,
+ sizeof(struct hci_rp_read_local_commands)),
+ HCI_CC(HCI_OP_READ_LOCAL_FEATURES, hci_cc_read_local_features,
+ sizeof(struct hci_rp_read_local_features)),
+ HCI_CC(HCI_OP_READ_LOCAL_EXT_FEATURES, hci_cc_read_local_ext_features,
+ sizeof(struct hci_rp_read_local_ext_features)),
+ HCI_CC(HCI_OP_READ_BUFFER_SIZE, hci_cc_read_buffer_size,
+ sizeof(struct hci_rp_read_buffer_size)),
+ HCI_CC(HCI_OP_READ_BD_ADDR, hci_cc_read_bd_addr,
+ sizeof(struct hci_rp_read_bd_addr)),
+ HCI_CC(HCI_OP_READ_LOCAL_PAIRING_OPTS, hci_cc_read_local_pairing_opts,
+ sizeof(struct hci_rp_read_local_pairing_opts)),
+ HCI_CC(HCI_OP_READ_PAGE_SCAN_ACTIVITY, hci_cc_read_page_scan_activity,
+ sizeof(struct hci_rp_read_page_scan_activity)),
+ HCI_CC_STATUS(HCI_OP_WRITE_PAGE_SCAN_ACTIVITY,
+ hci_cc_write_page_scan_activity),
+ HCI_CC(HCI_OP_READ_PAGE_SCAN_TYPE, hci_cc_read_page_scan_type,
+ sizeof(struct hci_rp_read_page_scan_type)),
+ HCI_CC_STATUS(HCI_OP_WRITE_PAGE_SCAN_TYPE, hci_cc_write_page_scan_type),
+ HCI_CC(HCI_OP_READ_CLOCK, hci_cc_read_clock,
+ sizeof(struct hci_rp_read_clock)),
+ HCI_CC(HCI_OP_READ_ENC_KEY_SIZE, hci_cc_read_enc_key_size,
+ sizeof(struct hci_rp_read_enc_key_size)),
+ HCI_CC(HCI_OP_READ_INQ_RSP_TX_POWER, hci_cc_read_inq_rsp_tx_power,
+ sizeof(struct hci_rp_read_inq_rsp_tx_power)),
+ HCI_CC(HCI_OP_READ_DEF_ERR_DATA_REPORTING,
+ hci_cc_read_def_err_data_reporting,
+ sizeof(struct hci_rp_read_def_err_data_reporting)),
+ HCI_CC_STATUS(HCI_OP_WRITE_DEF_ERR_DATA_REPORTING,
+ hci_cc_write_def_err_data_reporting),
+ HCI_CC(HCI_OP_PIN_CODE_REPLY, hci_cc_pin_code_reply,
+ sizeof(struct hci_rp_pin_code_reply)),
+ HCI_CC(HCI_OP_PIN_CODE_NEG_REPLY, hci_cc_pin_code_neg_reply,
+ sizeof(struct hci_rp_pin_code_neg_reply)),
+ HCI_CC(HCI_OP_READ_LOCAL_OOB_DATA, hci_cc_read_local_oob_data,
+ sizeof(struct hci_rp_read_local_oob_data)),
+ HCI_CC(HCI_OP_READ_LOCAL_OOB_EXT_DATA, hci_cc_read_local_oob_ext_data,
+ sizeof(struct hci_rp_read_local_oob_ext_data)),
+ HCI_CC(HCI_OP_LE_READ_BUFFER_SIZE, hci_cc_le_read_buffer_size,
+ sizeof(struct hci_rp_le_read_buffer_size)),
+ HCI_CC(HCI_OP_LE_READ_LOCAL_FEATURES, hci_cc_le_read_local_features,
+ sizeof(struct hci_rp_le_read_local_features)),
+ HCI_CC(HCI_OP_LE_READ_ADV_TX_POWER, hci_cc_le_read_adv_tx_power,
+ sizeof(struct hci_rp_le_read_adv_tx_power)),
+ HCI_CC(HCI_OP_USER_CONFIRM_REPLY, hci_cc_user_confirm_reply,
+ sizeof(struct hci_rp_user_confirm_reply)),
+ HCI_CC(HCI_OP_USER_CONFIRM_NEG_REPLY, hci_cc_user_confirm_neg_reply,
+ sizeof(struct hci_rp_user_confirm_reply)),
+ HCI_CC(HCI_OP_USER_PASSKEY_REPLY, hci_cc_user_passkey_reply,
+ sizeof(struct hci_rp_user_confirm_reply)),
+ HCI_CC(HCI_OP_USER_PASSKEY_NEG_REPLY, hci_cc_user_passkey_neg_reply,
+ sizeof(struct hci_rp_user_confirm_reply)),
+ HCI_CC_STATUS(HCI_OP_LE_SET_RANDOM_ADDR, hci_cc_le_set_random_addr),
+ HCI_CC_STATUS(HCI_OP_LE_SET_ADV_ENABLE, hci_cc_le_set_adv_enable),
+ HCI_CC_STATUS(HCI_OP_LE_SET_SCAN_PARAM, hci_cc_le_set_scan_param),
+ HCI_CC_STATUS(HCI_OP_LE_SET_SCAN_ENABLE, hci_cc_le_set_scan_enable),
+ HCI_CC(HCI_OP_LE_READ_ACCEPT_LIST_SIZE,
+ hci_cc_le_read_accept_list_size,
+ sizeof(struct hci_rp_le_read_accept_list_size)),
+ HCI_CC_STATUS(HCI_OP_LE_CLEAR_ACCEPT_LIST, hci_cc_le_clear_accept_list),
+ HCI_CC_STATUS(HCI_OP_LE_ADD_TO_ACCEPT_LIST,
+ hci_cc_le_add_to_accept_list),
+ HCI_CC_STATUS(HCI_OP_LE_DEL_FROM_ACCEPT_LIST,
+ hci_cc_le_del_from_accept_list),
+ HCI_CC(HCI_OP_LE_READ_SUPPORTED_STATES, hci_cc_le_read_supported_states,
+ sizeof(struct hci_rp_le_read_supported_states)),
+ HCI_CC(HCI_OP_LE_READ_DEF_DATA_LEN, hci_cc_le_read_def_data_len,
+ sizeof(struct hci_rp_le_read_def_data_len)),
+ HCI_CC_STATUS(HCI_OP_LE_WRITE_DEF_DATA_LEN,
+ hci_cc_le_write_def_data_len),
+ HCI_CC_STATUS(HCI_OP_LE_ADD_TO_RESOLV_LIST,
+ hci_cc_le_add_to_resolv_list),
+ HCI_CC_STATUS(HCI_OP_LE_DEL_FROM_RESOLV_LIST,
+ hci_cc_le_del_from_resolv_list),
+ HCI_CC_STATUS(HCI_OP_LE_CLEAR_RESOLV_LIST,
+ hci_cc_le_clear_resolv_list),
+ HCI_CC(HCI_OP_LE_READ_RESOLV_LIST_SIZE, hci_cc_le_read_resolv_list_size,
+ sizeof(struct hci_rp_le_read_resolv_list_size)),
+ HCI_CC_STATUS(HCI_OP_LE_SET_ADDR_RESOLV_ENABLE,
+ hci_cc_le_set_addr_resolution_enable),
+ HCI_CC(HCI_OP_LE_READ_MAX_DATA_LEN, hci_cc_le_read_max_data_len,
+ sizeof(struct hci_rp_le_read_max_data_len)),
+ HCI_CC_STATUS(HCI_OP_WRITE_LE_HOST_SUPPORTED,
+ hci_cc_write_le_host_supported),
+ HCI_CC_STATUS(HCI_OP_LE_SET_ADV_PARAM, hci_cc_set_adv_param),
+ HCI_CC(HCI_OP_READ_RSSI, hci_cc_read_rssi,
+ sizeof(struct hci_rp_read_rssi)),
+ HCI_CC(HCI_OP_READ_TX_POWER, hci_cc_read_tx_power,
+ sizeof(struct hci_rp_read_tx_power)),
+ HCI_CC_STATUS(HCI_OP_WRITE_SSP_DEBUG_MODE, hci_cc_write_ssp_debug_mode),
+ HCI_CC_STATUS(HCI_OP_LE_SET_EXT_SCAN_PARAMS,
+ hci_cc_le_set_ext_scan_param),
+ HCI_CC_STATUS(HCI_OP_LE_SET_EXT_SCAN_ENABLE,
+ hci_cc_le_set_ext_scan_enable),
+ HCI_CC_STATUS(HCI_OP_LE_SET_DEFAULT_PHY, hci_cc_le_set_default_phy),
+ HCI_CC(HCI_OP_LE_READ_NUM_SUPPORTED_ADV_SETS,
+ hci_cc_le_read_num_adv_sets,
+ sizeof(struct hci_rp_le_read_num_supported_adv_sets)),
+ HCI_CC_STATUS(HCI_OP_LE_SET_EXT_ADV_ENABLE,
+ hci_cc_le_set_ext_adv_enable),
+ HCI_CC_STATUS(HCI_OP_LE_SET_ADV_SET_RAND_ADDR,
+ hci_cc_le_set_adv_set_random_addr),
+ HCI_CC_STATUS(HCI_OP_LE_REMOVE_ADV_SET, hci_cc_le_remove_adv_set),
+ HCI_CC_STATUS(HCI_OP_LE_CLEAR_ADV_SETS, hci_cc_le_clear_adv_sets),
+ HCI_CC_STATUS(HCI_OP_LE_SET_PER_ADV_PARAMS, hci_cc_set_per_adv_param),
+ HCI_CC_STATUS(HCI_OP_LE_SET_PER_ADV_ENABLE,
+ hci_cc_le_set_per_adv_enable),
+ HCI_CC(HCI_OP_LE_READ_TRANSMIT_POWER, hci_cc_le_read_transmit_power,
+ sizeof(struct hci_rp_le_read_transmit_power)),
+ HCI_CC_STATUS(HCI_OP_LE_SET_PRIVACY_MODE, hci_cc_le_set_privacy_mode),
+ HCI_CC(HCI_OP_LE_READ_BUFFER_SIZE_V2, hci_cc_le_read_buffer_size_v2,
+ sizeof(struct hci_rp_le_read_buffer_size_v2)),
+ HCI_CC_VL(HCI_OP_LE_SET_CIG_PARAMS, hci_cc_le_set_cig_params,
+ sizeof(struct hci_rp_le_set_cig_params), HCI_MAX_EVENT_SIZE),
+ HCI_CC(HCI_OP_LE_SETUP_ISO_PATH, hci_cc_le_setup_iso_path,
+ sizeof(struct hci_rp_le_setup_iso_path)),
+ HCI_CC(HCI_OP_LE_READ_ALL_LOCAL_FEATURES,
+ hci_cc_le_read_all_local_features,
+ sizeof(struct hci_rp_le_read_all_local_features)),
+};
+
+static u8 hci_cc_func(struct hci_dev *hdev, const struct hci_cc *cc,
+ struct sk_buff *skb)
+{
+ void *data;
+
+ if (skb->len < cc->min_len) {
+ bt_dev_err(hdev, "unexpected cc 0x%4.4x length: %u < %u",
+ cc->op, skb->len, cc->min_len);
+ return HCI_ERROR_UNSPECIFIED;
+ }
+
+ /* Just warn if the length is over max_len size it still be possible to
+ * partially parse the cc so leave to callback to decide if that is
+ * acceptable.
+ */
+ if (skb->len > cc->max_len)
+ bt_dev_warn(hdev, "unexpected cc 0x%4.4x length: %u > %u",
+ cc->op, skb->len, cc->max_len);
+
+ data = hci_cc_skb_pull(hdev, skb, cc->op, cc->min_len);
+ if (!data)
+ return HCI_ERROR_UNSPECIFIED;
+
+ return cc->func(hdev, data, skb);
+}
+
+static void hci_cmd_complete_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb, u16 *opcode, u8 *status,
+ hci_req_complete_t *req_complete,
+ hci_req_complete_skb_t *req_complete_skb)
+{
+ struct hci_ev_cmd_complete *ev = data;
+ int i;
+
+ *opcode = __le16_to_cpu(ev->opcode);
+
+ bt_dev_dbg(hdev, "opcode 0x%4.4x", *opcode);
+
+ for (i = 0; i < ARRAY_SIZE(hci_cc_table); i++) {
+ if (hci_cc_table[i].op == *opcode) {
+ *status = hci_cc_func(hdev, &hci_cc_table[i], skb);
+ break;
+ }
+ }
+
+ if (i == ARRAY_SIZE(hci_cc_table)) {
+ if (!skb->len) {
+ bt_dev_err(hdev, "Unexpected cc 0x%4.4x with no status",
+ *opcode);
+ *status = HCI_ERROR_UNSPECIFIED;
+ return;
+ }
+
+ /* Unknown opcode, assume byte 0 contains the status, so
+ * that e.g. __hci_cmd_sync() properly returns errors
+ * for vendor specific commands send by HCI drivers.
+ * If a vendor doesn't actually follow this convention we may
+ * need to introduce a vendor CC table in order to properly set
+ * the status.
+ */
+ *status = skb->data[0];
+ }
+
+ handle_cmd_cnt_and_timer(hdev, ev->ncmd);
+
+ hci_req_cmd_complete(hdev, *opcode, *status, req_complete,
+ req_complete_skb);
+
+ if (hci_dev_test_flag(hdev, HCI_CMD_PENDING)) {
+ bt_dev_err(hdev,
+ "unexpected event for opcode 0x%4.4x", *opcode);
+ return;
+ }
+
+ if (atomic_read(&hdev->cmd_cnt) && !skb_queue_empty(&hdev->cmd_q))
+ queue_work(hdev->workqueue, &hdev->cmd_work);
+}
+
+static void hci_cs_le_create_cis(struct hci_dev *hdev, u8 status)
+{
+ struct hci_cp_le_create_cis *cp;
+ bool pending = false;
+ int i;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", status);
+
+ if (!status)
+ return;
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_LE_CREATE_CIS);
+ if (!cp)
+ return;
+
+ hci_dev_lock(hdev);
+
+ /* Remove connection if command failed */
+ for (i = 0; i < cp->num_cis; i++) {
+ struct hci_conn *conn;
+ u16 handle;
+
+ handle = __le16_to_cpu(cp->cis[i].cis_handle);
+
+ conn = hci_conn_hash_lookup_handle(hdev, handle);
+ if (conn) {
+ if (test_and_clear_bit(HCI_CONN_CREATE_CIS,
+ &conn->flags))
+ pending = true;
+ conn->state = BT_CLOSED;
+ hci_connect_cfm(conn, status);
+ hci_conn_del(conn);
+ }
+ }
+ cp->num_cis = 0;
+
+ if (pending)
+ hci_le_create_cis_pending(hdev);
+
+ hci_dev_unlock(hdev);
+}
+
+#define HCI_CS(_op, _func) \
+{ \
+ .op = _op, \
+ .func = _func, \
+}
+
+static const struct hci_cs {
+ u16 op;
+ void (*func)(struct hci_dev *hdev, __u8 status);
+} hci_cs_table[] = {
+ HCI_CS(HCI_OP_INQUIRY, hci_cs_inquiry),
+ HCI_CS(HCI_OP_CREATE_CONN, hci_cs_create_conn),
+ HCI_CS(HCI_OP_DISCONNECT, hci_cs_disconnect),
+ HCI_CS(HCI_OP_ADD_SCO, hci_cs_add_sco),
+ HCI_CS(HCI_OP_AUTH_REQUESTED, hci_cs_auth_requested),
+ HCI_CS(HCI_OP_SET_CONN_ENCRYPT, hci_cs_set_conn_encrypt),
+ HCI_CS(HCI_OP_REMOTE_NAME_REQ, hci_cs_remote_name_req),
+ HCI_CS(HCI_OP_READ_REMOTE_FEATURES, hci_cs_read_remote_features),
+ HCI_CS(HCI_OP_READ_REMOTE_EXT_FEATURES,
+ hci_cs_read_remote_ext_features),
+ HCI_CS(HCI_OP_SETUP_SYNC_CONN, hci_cs_setup_sync_conn),
+ HCI_CS(HCI_OP_ENHANCED_SETUP_SYNC_CONN,
+ hci_cs_enhanced_setup_sync_conn),
+ HCI_CS(HCI_OP_SNIFF_MODE, hci_cs_sniff_mode),
+ HCI_CS(HCI_OP_EXIT_SNIFF_MODE, hci_cs_exit_sniff_mode),
+ HCI_CS(HCI_OP_SWITCH_ROLE, hci_cs_switch_role),
+ HCI_CS(HCI_OP_LE_CREATE_CONN, hci_cs_le_create_conn),
+ HCI_CS(HCI_OP_LE_READ_REMOTE_FEATURES, hci_cs_le_read_remote_features),
+ HCI_CS(HCI_OP_LE_START_ENC, hci_cs_le_start_enc),
+ HCI_CS(HCI_OP_LE_EXT_CREATE_CONN, hci_cs_le_ext_create_conn),
+ HCI_CS(HCI_OP_LE_CREATE_CIS, hci_cs_le_create_cis),
+ HCI_CS(HCI_OP_LE_CREATE_BIG, hci_cs_le_create_big),
+ HCI_CS(HCI_OP_LE_READ_ALL_REMOTE_FEATURES,
+ hci_cs_le_read_all_remote_features),
+};
+
+static void hci_cmd_status_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb, u16 *opcode, u8 *status,
+ hci_req_complete_t *req_complete,
+ hci_req_complete_skb_t *req_complete_skb)
+{
+ struct hci_ev_cmd_status *ev = data;
+ int i;
+
+ *opcode = __le16_to_cpu(ev->opcode);
+ *status = ev->status;
+
+ bt_dev_dbg(hdev, "opcode 0x%4.4x", *opcode);
+
+ for (i = 0; i < ARRAY_SIZE(hci_cs_table); i++) {
+ if (hci_cs_table[i].op == *opcode) {
+ hci_cs_table[i].func(hdev, ev->status);
+ break;
+ }
+ }
+
+ handle_cmd_cnt_and_timer(hdev, ev->ncmd);
+
+ /* Indicate request completion if the command failed. Also, if
+ * we're not waiting for a special event and we get a success
+ * command status we should try to flag the request as completed
+ * (since for this kind of commands there will not be a command
+ * complete event).
+ */
+ if (ev->status || (hdev->req_skb && !hci_skb_event(hdev->req_skb))) {
+ hci_req_cmd_complete(hdev, *opcode, ev->status, req_complete,
+ req_complete_skb);
+ if (hci_dev_test_flag(hdev, HCI_CMD_PENDING)) {
+ bt_dev_err(hdev, "unexpected event for opcode 0x%4.4x",
+ *opcode);
+ return;
+ }
+ }
+
+ if (atomic_read(&hdev->cmd_cnt) && !skb_queue_empty(&hdev->cmd_q))
+ queue_work(hdev->workqueue, &hdev->cmd_work);
+}
+
+static void hci_hardware_error_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_hardware_error *ev = data;
+
+ bt_dev_dbg(hdev, "code 0x%2.2x", ev->code);
+
+ hdev->hw_error_code = ev->code;
+
+ queue_work(hdev->req_workqueue, &hdev->error_reset);
+}
+
+static void hci_role_change_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_role_change *ev = data;
+ struct hci_conn *conn;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", ev->status);
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr);
if (conn) {
- if (!ev->status) {
- if (ev->encrypt)
- conn->link_mode |= HCI_LM_ENCRYPT;
- else
- conn->link_mode &= ~HCI_LM_ENCRYPT;
+ if (!ev->status)
+ conn->role = ev->role;
+
+ clear_bit(HCI_CONN_RSWITCH_PEND, &conn->flags);
+
+ hci_role_switch_cfm(conn, ev->status, ev->role);
+ }
+
+ hci_dev_unlock(hdev);
+}
+
+static void hci_num_comp_pkts_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_num_comp_pkts *ev = data;
+ int i;
+
+ if (!hci_ev_skb_pull(hdev, skb, HCI_EV_NUM_COMP_PKTS,
+ flex_array_size(ev, handles, ev->num)))
+ return;
+
+ bt_dev_dbg(hdev, "num %d", ev->num);
+
+ hci_dev_lock(hdev);
+
+ for (i = 0; i < ev->num; i++) {
+ struct hci_comp_pkts_info *info = &ev->handles[i];
+ struct hci_conn *conn;
+ __u16 handle, count;
+ unsigned int i;
+
+ handle = __le16_to_cpu(info->handle);
+ count = __le16_to_cpu(info->count);
+
+ conn = hci_conn_hash_lookup_handle(hdev, handle);
+ if (!conn)
+ continue;
+
+ /* Check if there is really enough packets outstanding before
+ * attempting to decrease the sent counter otherwise it could
+ * underflow..
+ */
+ if (conn->sent >= count) {
+ conn->sent -= count;
+ } else {
+ bt_dev_warn(hdev, "hcon %p sent %u < count %u",
+ conn, conn->sent, count);
+ conn->sent = 0;
}
- clear_bit(HCI_CONN_ENCRYPT_PEND, &conn->pend);
+ for (i = 0; i < count; ++i)
+ hci_conn_tx_dequeue(conn);
+
+ switch (conn->type) {
+ case ACL_LINK:
+ hdev->acl_cnt += count;
+ if (hdev->acl_cnt > hdev->acl_pkts)
+ hdev->acl_cnt = hdev->acl_pkts;
+ break;
- hci_encrypt_cfm(conn, ev->status, ev->encrypt);
+ case LE_LINK:
+ if (hdev->le_pkts) {
+ hdev->le_cnt += count;
+ if (hdev->le_cnt > hdev->le_pkts)
+ hdev->le_cnt = hdev->le_pkts;
+ } else {
+ hdev->acl_cnt += count;
+ if (hdev->acl_cnt > hdev->acl_pkts)
+ hdev->acl_cnt = hdev->acl_pkts;
+ }
+ break;
+
+ case SCO_LINK:
+ case ESCO_LINK:
+ hdev->sco_cnt += count;
+ if (hdev->sco_cnt > hdev->sco_pkts)
+ hdev->sco_cnt = hdev->sco_pkts;
+
+ break;
+
+ case CIS_LINK:
+ case BIS_LINK:
+ case PA_LINK:
+ hdev->iso_cnt += count;
+ if (hdev->iso_cnt > hdev->iso_pkts)
+ hdev->iso_cnt = hdev->iso_pkts;
+ break;
+
+ default:
+ bt_dev_err(hdev, "unknown type %d conn %p",
+ conn->type, conn);
+ break;
+ }
}
+ queue_work(hdev->workqueue, &hdev->tx_work);
+
hci_dev_unlock(hdev);
}
-/* Change Connection Link Key Complete */
-static inline void hci_change_conn_link_key_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
+static void hci_mode_change_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_ev_change_conn_link_key_complete *ev = (struct hci_ev_change_conn_link_key_complete *) skb->data;
+ struct hci_ev_mode_change *ev = data;
struct hci_conn *conn;
- BT_DBG("%s status %d", hdev->name, ev->status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", ev->status);
hci_dev_lock(hdev);
conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle));
if (conn) {
- if (!ev->status)
- conn->link_mode |= HCI_LM_SECURE;
+ conn->mode = ev->mode;
- clear_bit(HCI_CONN_AUTH_PEND, &conn->pend);
+ if (!test_and_clear_bit(HCI_CONN_MODE_CHANGE_PEND,
+ &conn->flags)) {
+ if (conn->mode == HCI_CM_ACTIVE)
+ set_bit(HCI_CONN_POWER_SAVE, &conn->flags);
+ else
+ clear_bit(HCI_CONN_POWER_SAVE, &conn->flags);
+ }
- hci_key_change_cfm(conn, ev->status);
+ if (test_and_clear_bit(HCI_CONN_SCO_SETUP_PEND, &conn->flags))
+ hci_sco_setup(conn, ev->status);
}
hci_dev_unlock(hdev);
}
-/* Pin Code Request*/
-static inline void hci_pin_code_request_evt(struct hci_dev *hdev, struct sk_buff *skb)
+static void hci_pin_code_request_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
+ struct hci_ev_pin_code_req *ev = data;
+ struct hci_conn *conn;
+
+ bt_dev_dbg(hdev, "");
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr);
+ if (!conn)
+ goto unlock;
+
+ if (conn->state == BT_CONNECTED) {
+ hci_conn_hold(conn);
+ conn->disc_timeout = HCI_PAIRING_TIMEOUT;
+ hci_conn_drop(conn);
+ }
+
+ if (!hci_dev_test_flag(hdev, HCI_BONDABLE) &&
+ !test_bit(HCI_CONN_AUTH_INITIATOR, &conn->flags)) {
+ hci_send_cmd(hdev, HCI_OP_PIN_CODE_NEG_REPLY,
+ sizeof(ev->bdaddr), &ev->bdaddr);
+ } else if (hci_dev_test_flag(hdev, HCI_MGMT)) {
+ u8 secure;
+
+ if (conn->pending_sec_level == BT_SECURITY_HIGH)
+ secure = 1;
+ else
+ secure = 0;
+
+ mgmt_pin_code_request(hdev, &ev->bdaddr, secure);
+ }
+
+unlock:
+ hci_dev_unlock(hdev);
}
-/* Link Key Request */
-static inline void hci_link_key_request_evt(struct hci_dev *hdev, struct sk_buff *skb)
+static void conn_set_key(struct hci_conn *conn, u8 key_type, u8 pin_len)
{
+ if (key_type == HCI_LK_CHANGED_COMBINATION)
+ return;
+
+ conn->pin_length = pin_len;
+ conn->key_type = key_type;
+
+ switch (key_type) {
+ case HCI_LK_LOCAL_UNIT:
+ case HCI_LK_REMOTE_UNIT:
+ case HCI_LK_DEBUG_COMBINATION:
+ return;
+ case HCI_LK_COMBINATION:
+ if (pin_len == 16)
+ conn->pending_sec_level = BT_SECURITY_HIGH;
+ else
+ conn->pending_sec_level = BT_SECURITY_MEDIUM;
+ break;
+ case HCI_LK_UNAUTH_COMBINATION_P192:
+ case HCI_LK_UNAUTH_COMBINATION_P256:
+ conn->pending_sec_level = BT_SECURITY_MEDIUM;
+ break;
+ case HCI_LK_AUTH_COMBINATION_P192:
+ conn->pending_sec_level = BT_SECURITY_HIGH;
+ break;
+ case HCI_LK_AUTH_COMBINATION_P256:
+ conn->pending_sec_level = BT_SECURITY_FIPS;
+ break;
+ }
}
-/* Link Key Notification */
-static inline void hci_link_key_notify_evt(struct hci_dev *hdev, struct sk_buff *skb)
+static void hci_link_key_request_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
+ struct hci_ev_link_key_req *ev = data;
+ struct hci_cp_link_key_reply cp;
+ struct hci_conn *conn;
+ struct link_key *key;
+
+ bt_dev_dbg(hdev, "");
+
+ if (!hci_dev_test_flag(hdev, HCI_MGMT))
+ return;
+
+ hci_dev_lock(hdev);
+
+ key = hci_find_link_key(hdev, &ev->bdaddr);
+ if (!key) {
+ bt_dev_dbg(hdev, "link key not found for %pMR", &ev->bdaddr);
+ goto not_found;
+ }
+
+ bt_dev_dbg(hdev, "found key type %u for %pMR", key->type, &ev->bdaddr);
+
+ conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr);
+ if (conn) {
+ clear_bit(HCI_CONN_NEW_LINK_KEY, &conn->flags);
+
+ if ((key->type == HCI_LK_UNAUTH_COMBINATION_P192 ||
+ key->type == HCI_LK_UNAUTH_COMBINATION_P256) &&
+ conn->auth_type != 0xff && (conn->auth_type & 0x01)) {
+ bt_dev_dbg(hdev, "ignoring unauthenticated key");
+ goto not_found;
+ }
+
+ if (key->type == HCI_LK_COMBINATION && key->pin_len < 16 &&
+ (conn->pending_sec_level == BT_SECURITY_HIGH ||
+ conn->pending_sec_level == BT_SECURITY_FIPS)) {
+ bt_dev_dbg(hdev, "ignoring key unauthenticated for high security");
+ goto not_found;
+ }
+
+ conn_set_key(conn, key->type, key->pin_len);
+ }
+
+ bacpy(&cp.bdaddr, &ev->bdaddr);
+ memcpy(cp.link_key, key->val, HCI_LINK_KEY_SIZE);
+
+ hci_send_cmd(hdev, HCI_OP_LINK_KEY_REPLY, sizeof(cp), &cp);
+
+ hci_dev_unlock(hdev);
+
+ return;
+
+not_found:
+ hci_send_cmd(hdev, HCI_OP_LINK_KEY_NEG_REPLY, 6, &ev->bdaddr);
+ hci_dev_unlock(hdev);
}
-/* Remote Features */
-static inline void hci_remote_features_evt(struct hci_dev *hdev, struct sk_buff *skb)
+static void hci_link_key_notify_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_ev_remote_features *ev = (struct hci_ev_remote_features *) skb->data;
+ struct hci_ev_link_key_notify *ev = data;
struct hci_conn *conn;
+ struct link_key *key;
+ bool persistent;
+ u8 pin_len = 0;
- BT_DBG("%s status %d", hdev->name, ev->status);
+ bt_dev_dbg(hdev, "");
hci_dev_lock(hdev);
- conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle));
- if (conn && !ev->status) {
- memcpy(conn->features, ev->features, sizeof(conn->features));
+ conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr);
+ if (!conn)
+ goto unlock;
+
+ /* Ignore NULL link key against CVE-2020-26555 */
+ if (!crypto_memneq(ev->link_key, ZERO_KEY, HCI_LINK_KEY_SIZE)) {
+ bt_dev_dbg(hdev, "Ignore NULL link key (ZERO KEY) for %pMR",
+ &ev->bdaddr);
+ hci_disconnect(conn, HCI_ERROR_AUTH_FAILURE);
+ hci_conn_drop(conn);
+ goto unlock;
+ }
+
+ hci_conn_hold(conn);
+ conn->disc_timeout = HCI_DISCONN_TIMEOUT;
+ hci_conn_drop(conn);
+
+ set_bit(HCI_CONN_NEW_LINK_KEY, &conn->flags);
+ conn_set_key(conn, ev->key_type, conn->pin_length);
+
+ if (!hci_dev_test_flag(hdev, HCI_MGMT))
+ goto unlock;
+
+ key = hci_add_link_key(hdev, conn, &ev->bdaddr, ev->link_key,
+ ev->key_type, pin_len, &persistent);
+ if (!key)
+ goto unlock;
+
+ /* Update connection information since adding the key will have
+ * fixed up the type in the case of changed combination keys.
+ */
+ if (ev->key_type == HCI_LK_CHANGED_COMBINATION)
+ conn_set_key(conn, key->type, key->pin_len);
+
+ mgmt_new_link_key(hdev, key, persistent);
+
+ /* Keep debug keys around only if the HCI_KEEP_DEBUG_KEYS flag
+ * is set. If it's not set simply remove the key from the kernel
+ * list (we've still notified user space about it but with
+ * store_hint being 0).
+ */
+ if (key->type == HCI_LK_DEBUG_COMBINATION &&
+ !hci_dev_test_flag(hdev, HCI_KEEP_DEBUG_KEYS)) {
+ list_del_rcu(&key->list);
+ kfree_rcu(key, rcu);
+ goto unlock;
}
+ if (persistent)
+ clear_bit(HCI_CONN_FLUSH_KEY, &conn->flags);
+ else
+ set_bit(HCI_CONN_FLUSH_KEY, &conn->flags);
+
+unlock:
hci_dev_unlock(hdev);
}
-/* Clock Offset */
-static inline void hci_clock_offset_evt(struct hci_dev *hdev, struct sk_buff *skb)
+static void hci_clock_offset_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_ev_clock_offset *ev = (struct hci_ev_clock_offset *) skb->data;
+ struct hci_ev_clock_offset *ev = data;
struct hci_conn *conn;
- BT_DBG("%s status %d", hdev->name, ev->status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", ev->status);
hci_dev_lock(hdev);
@@ -1082,7 +4774,8 @@ static inline void hci_clock_offset_evt(struct hci_dev *hdev, struct sk_buff *sk
if (conn && !ev->status) {
struct inquiry_entry *ie;
- if ((ie = hci_inquiry_cache_lookup(hdev, &conn->dst))) {
+ ie = hci_inquiry_cache_lookup(hdev, &conn->dst);
+ if (ie) {
ie->data.clock_offset = ev->clock_offset;
ie->timestamp = jiffies;
}
@@ -1091,17 +4784,35 @@ static inline void hci_clock_offset_evt(struct hci_dev *hdev, struct sk_buff *sk
hci_dev_unlock(hdev);
}
-/* Page Scan Repetition Mode */
-static inline void hci_pscan_rep_mode_evt(struct hci_dev *hdev, struct sk_buff *skb)
+static void hci_pkt_type_change_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_ev_pscan_rep_mode *ev = (struct hci_ev_pscan_rep_mode *) skb->data;
+ struct hci_ev_pkt_type_change *ev = data;
+ struct hci_conn *conn;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", ev->status);
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle));
+ if (conn && !ev->status)
+ conn->pkt_type = __le16_to_cpu(ev->pkt_type);
+
+ hci_dev_unlock(hdev);
+}
+
+static void hci_pscan_rep_mode_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_pscan_rep_mode *ev = data;
struct inquiry_entry *ie;
- BT_DBG("%s", hdev->name);
+ bt_dev_dbg(hdev, "");
hci_dev_lock(hdev);
- if ((ie = hci_inquiry_cache_lookup(hdev, &ev->bdaddr))) {
+ ie = hci_inquiry_cache_lookup(hdev, &ev->bdaddr);
+ if (ie) {
ie->data.pscan_rep_mode = ev->pscan_rep_mode;
ie->timestamp = jiffies;
}
@@ -1109,218 +4820,2971 @@ static inline void hci_pscan_rep_mode_evt(struct hci_dev *hdev, struct sk_buff *
hci_dev_unlock(hdev);
}
-/* Sniff Subrate */
-static inline void hci_sniff_subrate_evt(struct hci_dev *hdev, struct sk_buff *skb)
+static void hci_inquiry_result_with_rssi_evt(struct hci_dev *hdev, void *edata,
+ struct sk_buff *skb)
+{
+ struct hci_ev_inquiry_result_rssi *ev = edata;
+ struct inquiry_data data;
+ int i;
+
+ bt_dev_dbg(hdev, "num_rsp %d", ev->num);
+
+ if (!ev->num)
+ return;
+
+ if (hci_dev_test_flag(hdev, HCI_PERIODIC_INQ))
+ return;
+
+ hci_dev_lock(hdev);
+
+ if (skb->len == array_size(ev->num,
+ sizeof(struct inquiry_info_rssi_pscan))) {
+ struct inquiry_info_rssi_pscan *info;
+
+ for (i = 0; i < ev->num; i++) {
+ u32 flags;
+
+ info = hci_ev_skb_pull(hdev, skb,
+ HCI_EV_INQUIRY_RESULT_WITH_RSSI,
+ sizeof(*info));
+ if (!info) {
+ bt_dev_err(hdev, "Malformed HCI Event: 0x%2.2x",
+ HCI_EV_INQUIRY_RESULT_WITH_RSSI);
+ goto unlock;
+ }
+
+ bacpy(&data.bdaddr, &info->bdaddr);
+ data.pscan_rep_mode = info->pscan_rep_mode;
+ data.pscan_period_mode = info->pscan_period_mode;
+ data.pscan_mode = info->pscan_mode;
+ memcpy(data.dev_class, info->dev_class, 3);
+ data.clock_offset = info->clock_offset;
+ data.rssi = info->rssi;
+ data.ssp_mode = 0x00;
+
+ flags = hci_inquiry_cache_update(hdev, &data, false);
+
+ mgmt_device_found(hdev, &info->bdaddr, ACL_LINK, 0x00,
+ info->dev_class, info->rssi,
+ flags, NULL, 0, NULL, 0, 0);
+ }
+ } else if (skb->len == array_size(ev->num,
+ sizeof(struct inquiry_info_rssi))) {
+ struct inquiry_info_rssi *info;
+
+ for (i = 0; i < ev->num; i++) {
+ u32 flags;
+
+ info = hci_ev_skb_pull(hdev, skb,
+ HCI_EV_INQUIRY_RESULT_WITH_RSSI,
+ sizeof(*info));
+ if (!info) {
+ bt_dev_err(hdev, "Malformed HCI Event: 0x%2.2x",
+ HCI_EV_INQUIRY_RESULT_WITH_RSSI);
+ goto unlock;
+ }
+
+ bacpy(&data.bdaddr, &info->bdaddr);
+ data.pscan_rep_mode = info->pscan_rep_mode;
+ data.pscan_period_mode = info->pscan_period_mode;
+ data.pscan_mode = 0x00;
+ memcpy(data.dev_class, info->dev_class, 3);
+ data.clock_offset = info->clock_offset;
+ data.rssi = info->rssi;
+ data.ssp_mode = 0x00;
+
+ flags = hci_inquiry_cache_update(hdev, &data, false);
+
+ mgmt_device_found(hdev, &info->bdaddr, ACL_LINK, 0x00,
+ info->dev_class, info->rssi,
+ flags, NULL, 0, NULL, 0, 0);
+ }
+ } else {
+ bt_dev_err(hdev, "Malformed HCI Event: 0x%2.2x",
+ HCI_EV_INQUIRY_RESULT_WITH_RSSI);
+ }
+unlock:
+ hci_dev_unlock(hdev);
+}
+
+static void hci_remote_ext_features_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_ev_sniff_subrate *ev = (struct hci_ev_sniff_subrate *) skb->data;
+ struct hci_ev_remote_ext_features *ev = data;
struct hci_conn *conn;
- BT_DBG("%s status %d", hdev->name, ev->status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", ev->status);
hci_dev_lock(hdev);
conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle));
- if (conn) {
+ if (!conn)
+ goto unlock;
+
+ if (ev->page < HCI_MAX_PAGES)
+ memcpy(conn->features[ev->page], ev->features, 8);
+
+ if (!ev->status && ev->page == 0x01) {
+ struct inquiry_entry *ie;
+
+ ie = hci_inquiry_cache_lookup(hdev, &conn->dst);
+ if (ie)
+ ie->data.ssp_mode = (ev->features[0] & LMP_HOST_SSP);
+
+ if (ev->features[0] & LMP_HOST_SSP) {
+ set_bit(HCI_CONN_SSP_ENABLED, &conn->flags);
+ } else {
+ /* It is mandatory by the Bluetooth specification that
+ * Extended Inquiry Results are only used when Secure
+ * Simple Pairing is enabled, but some devices violate
+ * this.
+ *
+ * To make these devices work, the internal SSP
+ * enabled flag needs to be cleared if the remote host
+ * features do not indicate SSP support */
+ clear_bit(HCI_CONN_SSP_ENABLED, &conn->flags);
+ }
+
+ if (ev->features[0] & LMP_HOST_SC)
+ set_bit(HCI_CONN_SC_ENABLED, &conn->flags);
}
+ if (conn->state != BT_CONFIG)
+ goto unlock;
+
+ if (!ev->status && !test_bit(HCI_CONN_MGMT_CONNECTED, &conn->flags)) {
+ struct hci_cp_remote_name_req cp;
+ memset(&cp, 0, sizeof(cp));
+ bacpy(&cp.bdaddr, &conn->dst);
+ cp.pscan_rep_mode = 0x02;
+ hci_send_cmd(hdev, HCI_OP_REMOTE_NAME_REQ, sizeof(cp), &cp);
+ } else {
+ mgmt_device_connected(hdev, conn, NULL, 0);
+ }
+
+ if (!hci_outgoing_auth_needed(hdev, conn)) {
+ conn->state = BT_CONNECTED;
+ hci_connect_cfm(conn, ev->status);
+ hci_conn_drop(conn);
+ }
+
+unlock:
hci_dev_unlock(hdev);
}
-void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb)
+static void hci_sync_conn_complete_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_event_hdr *hdr = (struct hci_event_hdr *) skb->data;
- struct hci_ev_cmd_complete *ec;
- struct hci_ev_cmd_status *cs;
- u16 opcode, ocf, ogf;
+ struct hci_ev_sync_conn_complete *ev = data;
+ struct hci_conn *conn;
+ u8 status = ev->status;
- skb_pull(skb, HCI_EVENT_HDR_SIZE);
+ switch (ev->link_type) {
+ case SCO_LINK:
+ case ESCO_LINK:
+ break;
+ default:
+ /* As per Core 5.3 Vol 4 Part E 7.7.35 (p.2219), Link_Type
+ * for HCI_Synchronous_Connection_Complete is limited to
+ * either SCO or eSCO
+ */
+ bt_dev_err(hdev, "Ignoring connect complete event for invalid link type");
+ return;
+ }
- BT_DBG("%s evt 0x%x", hdev->name, hdr->evt);
+ bt_dev_dbg(hdev, "status 0x%2.2x", status);
- switch (hdr->evt) {
- case HCI_EV_NUM_COMP_PKTS:
- hci_num_comp_pkts_evt(hdev, skb);
- break;
+ hci_dev_lock(hdev);
- case HCI_EV_INQUIRY_COMPLETE:
- hci_inquiry_complete_evt(hdev, skb);
- break;
+ conn = hci_conn_hash_lookup_ba(hdev, ev->link_type, &ev->bdaddr);
+ if (!conn) {
+ if (ev->link_type == ESCO_LINK)
+ goto unlock;
+
+ /* When the link type in the event indicates SCO connection
+ * and lookup of the connection object fails, then check
+ * if an eSCO connection object exists.
+ *
+ * The core limits the synchronous connections to either
+ * SCO or eSCO. The eSCO connection is preferred and tried
+ * to be setup first and until successfully established,
+ * the link type will be hinted as eSCO.
+ */
+ conn = hci_conn_hash_lookup_ba(hdev, ESCO_LINK, &ev->bdaddr);
+ if (!conn)
+ goto unlock;
+ }
- case HCI_EV_INQUIRY_RESULT:
- hci_inquiry_result_evt(hdev, skb);
- break;
+ /* The HCI_Synchronous_Connection_Complete event is only sent once per connection.
+ * Processing it more than once per connection can corrupt kernel memory.
+ *
+ * As the connection handle is set here for the first time, it indicates
+ * whether the connection is already set up.
+ */
+ if (!HCI_CONN_HANDLE_UNSET(conn->handle)) {
+ bt_dev_err(hdev, "Ignoring HCI_Sync_Conn_Complete event for existing connection");
+ goto unlock;
+ }
- case HCI_EV_INQUIRY_RESULT_WITH_RSSI:
- hci_inquiry_result_with_rssi_evt(hdev, skb);
- break;
+ switch (status) {
+ case 0x00:
+ status = hci_conn_set_handle(conn, __le16_to_cpu(ev->handle));
+ if (status) {
+ conn->state = BT_CLOSED;
+ break;
+ }
- case HCI_EV_EXTENDED_INQUIRY_RESULT:
- hci_extended_inquiry_result_evt(hdev, skb);
- break;
+ conn->state = BT_CONNECTED;
+ conn->type = ev->link_type;
- case HCI_EV_CONN_REQUEST:
- hci_conn_request_evt(hdev, skb);
+ hci_debugfs_create_conn(conn);
+ hci_conn_add_sysfs(conn);
break;
- case HCI_EV_CONN_COMPLETE:
- hci_conn_complete_evt(hdev, skb);
- break;
+ case 0x10: /* Connection Accept Timeout */
+ case 0x0d: /* Connection Rejected due to Limited Resources */
+ case 0x11: /* Unsupported Feature or Parameter Value */
+ case 0x1c: /* SCO interval rejected */
+ case 0x1a: /* Unsupported Remote Feature */
+ case 0x1e: /* Invalid LMP Parameters */
+ case 0x1f: /* Unspecified error */
+ case 0x20: /* Unsupported LMP Parameter value */
+ if (conn->out) {
+ conn->pkt_type = (hdev->esco_type & SCO_ESCO_MASK) |
+ (hdev->esco_type & EDR_ESCO_MASK);
+ if (hci_setup_sync(conn, conn->parent->handle))
+ goto unlock;
+ }
+ fallthrough;
- case HCI_EV_DISCONN_COMPLETE:
- hci_disconn_complete_evt(hdev, skb);
+ default:
+ conn->state = BT_CLOSED;
break;
+ }
- case HCI_EV_ROLE_CHANGE:
- hci_role_change_evt(hdev, skb);
- break;
+ bt_dev_dbg(hdev, "SCO connected with air mode: %02x", ev->air_mode);
+ /* Notify only in case of SCO over HCI transport data path which
+ * is zero and non-zero value shall be non-HCI transport data path
+ */
+ if (conn->codec.data_path == 0 && hdev->notify) {
+ switch (ev->air_mode) {
+ case 0x02:
+ hdev->notify(hdev, HCI_NOTIFY_ENABLE_SCO_CVSD);
+ break;
+ case 0x03:
+ hdev->notify(hdev, HCI_NOTIFY_ENABLE_SCO_TRANSP);
+ break;
+ }
+ }
- case HCI_EV_MODE_CHANGE:
- hci_mode_change_evt(hdev, skb);
- break;
+ hci_connect_cfm(conn, status);
+ if (status)
+ hci_conn_del(conn);
- case HCI_EV_AUTH_COMPLETE:
- hci_auth_complete_evt(hdev, skb);
- break;
+unlock:
+ hci_dev_unlock(hdev);
+}
- case HCI_EV_ENCRYPT_CHANGE:
- hci_encrypt_change_evt(hdev, skb);
- break;
+static inline size_t eir_get_length(u8 *eir, size_t eir_len)
+{
+ size_t parsed = 0;
- case HCI_EV_CHANGE_CONN_LINK_KEY_COMPLETE:
- hci_change_conn_link_key_complete_evt(hdev, skb);
- break;
+ while (parsed < eir_len) {
+ u8 field_len = eir[0];
- case HCI_EV_PIN_CODE_REQ:
- hci_pin_code_request_evt(hdev, skb);
- break;
+ if (field_len == 0)
+ return parsed;
- case HCI_EV_LINK_KEY_REQ:
- hci_link_key_request_evt(hdev, skb);
- break;
+ parsed += field_len + 1;
+ eir += field_len + 1;
+ }
- case HCI_EV_LINK_KEY_NOTIFY:
- hci_link_key_notify_evt(hdev, skb);
- break;
+ return eir_len;
+}
- case HCI_EV_REMOTE_FEATURES:
- hci_remote_features_evt(hdev, skb);
- break;
+static void hci_extended_inquiry_result_evt(struct hci_dev *hdev, void *edata,
+ struct sk_buff *skb)
+{
+ struct hci_ev_ext_inquiry_result *ev = edata;
+ struct inquiry_data data;
+ size_t eir_len;
+ int i;
+
+ if (!hci_ev_skb_pull(hdev, skb, HCI_EV_EXTENDED_INQUIRY_RESULT,
+ flex_array_size(ev, info, ev->num)))
+ return;
+
+ bt_dev_dbg(hdev, "num %d", ev->num);
+
+ if (!ev->num)
+ return;
+
+ if (hci_dev_test_flag(hdev, HCI_PERIODIC_INQ))
+ return;
+
+ hci_dev_lock(hdev);
+
+ for (i = 0; i < ev->num; i++) {
+ struct extended_inquiry_info *info = &ev->info[i];
+ u32 flags;
+ bool name_known;
+
+ bacpy(&data.bdaddr, &info->bdaddr);
+ data.pscan_rep_mode = info->pscan_rep_mode;
+ data.pscan_period_mode = info->pscan_period_mode;
+ data.pscan_mode = 0x00;
+ memcpy(data.dev_class, info->dev_class, 3);
+ data.clock_offset = info->clock_offset;
+ data.rssi = info->rssi;
+ data.ssp_mode = 0x01;
+
+ if (hci_dev_test_flag(hdev, HCI_MGMT))
+ name_known = eir_get_data(info->data,
+ sizeof(info->data),
+ EIR_NAME_COMPLETE, NULL);
+ else
+ name_known = true;
+
+ flags = hci_inquiry_cache_update(hdev, &data, name_known);
+
+ eir_len = eir_get_length(info->data, sizeof(info->data));
+
+ mgmt_device_found(hdev, &info->bdaddr, ACL_LINK, 0x00,
+ info->dev_class, info->rssi,
+ flags, info->data, eir_len, NULL, 0, 0);
+ }
+
+ hci_dev_unlock(hdev);
+}
+
+static void hci_key_refresh_complete_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_key_refresh_complete *ev = data;
+ struct hci_conn *conn;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x handle 0x%4.4x", ev->status,
+ __le16_to_cpu(ev->handle));
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle));
+ if (!conn)
+ goto unlock;
+
+ /* For BR/EDR the necessary steps are taken through the
+ * auth_complete event.
+ */
+ if (conn->type != LE_LINK)
+ goto unlock;
+
+ if (!ev->status)
+ conn->sec_level = conn->pending_sec_level;
+
+ clear_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags);
+
+ if (ev->status && conn->state == BT_CONNECTED) {
+ hci_disconnect(conn, HCI_ERROR_AUTH_FAILURE);
+ hci_conn_drop(conn);
+ goto unlock;
+ }
+
+ if (conn->state == BT_CONFIG) {
+ if (!ev->status)
+ conn->state = BT_CONNECTED;
+
+ hci_connect_cfm(conn, ev->status);
+ hci_conn_drop(conn);
+ } else {
+ hci_auth_cfm(conn, ev->status);
+
+ hci_conn_hold(conn);
+ conn->disc_timeout = HCI_DISCONN_TIMEOUT;
+ hci_conn_drop(conn);
+ }
+
+unlock:
+ hci_dev_unlock(hdev);
+}
+
+static u8 hci_get_auth_req(struct hci_conn *conn)
+{
+ /* If remote requests no-bonding follow that lead */
+ if (conn->remote_auth == HCI_AT_NO_BONDING ||
+ conn->remote_auth == HCI_AT_NO_BONDING_MITM)
+ return conn->remote_auth | (conn->auth_type & 0x01);
+
+ /* If both remote and local have enough IO capabilities, require
+ * MITM protection
+ */
+ if (conn->remote_cap != HCI_IO_NO_INPUT_OUTPUT &&
+ conn->io_capability != HCI_IO_NO_INPUT_OUTPUT)
+ return conn->remote_auth | 0x01;
+
+ /* No MITM protection possible so ignore remote requirement */
+ return (conn->remote_auth & ~0x01) | (conn->auth_type & 0x01);
+}
+
+static u8 bredr_oob_data_present(struct hci_conn *conn)
+{
+ struct hci_dev *hdev = conn->hdev;
+ struct oob_data *data;
+
+ data = hci_find_remote_oob_data(hdev, &conn->dst, BDADDR_BREDR);
+ if (!data)
+ return 0x00;
+
+ if (bredr_sc_enabled(hdev)) {
+ /* When Secure Connections is enabled, then just
+ * return the present value stored with the OOB
+ * data. The stored value contains the right present
+ * information. However it can only be trusted when
+ * not in Secure Connection Only mode.
+ */
+ if (!hci_dev_test_flag(hdev, HCI_SC_ONLY))
+ return data->present;
+
+ /* When Secure Connections Only mode is enabled, then
+ * the P-256 values are required. If they are not
+ * available, then do not declare that OOB data is
+ * present.
+ */
+ if (!crypto_memneq(data->rand256, ZERO_KEY, 16) ||
+ !crypto_memneq(data->hash256, ZERO_KEY, 16))
+ return 0x00;
+
+ return 0x02;
+ }
+
+ /* When Secure Connections is not enabled or actually
+ * not supported by the hardware, then check that if
+ * P-192 data values are present.
+ */
+ if (!crypto_memneq(data->rand192, ZERO_KEY, 16) ||
+ !crypto_memneq(data->hash192, ZERO_KEY, 16))
+ return 0x00;
+
+ return 0x01;
+}
+
+static void hci_io_capa_request_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_io_capa_request *ev = data;
+ struct hci_conn *conn;
+
+ bt_dev_dbg(hdev, "");
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr);
+ if (!conn || !hci_dev_test_flag(hdev, HCI_SSP_ENABLED))
+ goto unlock;
+
+ /* Assume remote supports SSP since it has triggered this event */
+ set_bit(HCI_CONN_SSP_ENABLED, &conn->flags);
+
+ hci_conn_hold(conn);
+
+ if (!hci_dev_test_flag(hdev, HCI_MGMT))
+ goto unlock;
+
+ /* Allow pairing if we're pairable, the initiators of the
+ * pairing or if the remote is not requesting bonding.
+ */
+ if (hci_dev_test_flag(hdev, HCI_BONDABLE) ||
+ test_bit(HCI_CONN_AUTH_INITIATOR, &conn->flags) ||
+ (conn->remote_auth & ~0x01) == HCI_AT_NO_BONDING) {
+ struct hci_cp_io_capability_reply cp;
+
+ bacpy(&cp.bdaddr, &ev->bdaddr);
+ /* Change the IO capability from KeyboardDisplay
+ * to DisplayYesNo as it is not supported by BT spec. */
+ cp.capability = (conn->io_capability == 0x04) ?
+ HCI_IO_DISPLAY_YESNO : conn->io_capability;
+
+ /* If we are initiators, there is no remote information yet */
+ if (conn->remote_auth == 0xff) {
+ /* Request MITM protection if our IO caps allow it
+ * except for the no-bonding case.
+ */
+ if (conn->io_capability != HCI_IO_NO_INPUT_OUTPUT &&
+ conn->auth_type != HCI_AT_NO_BONDING)
+ conn->auth_type |= 0x01;
+ } else {
+ conn->auth_type = hci_get_auth_req(conn);
+ }
+
+ /* If we're not bondable, force one of the non-bondable
+ * authentication requirement values.
+ */
+ if (!hci_dev_test_flag(hdev, HCI_BONDABLE))
+ conn->auth_type &= HCI_AT_NO_BONDING_MITM;
+
+ cp.authentication = conn->auth_type;
+ cp.oob_data = bredr_oob_data_present(conn);
+
+ hci_send_cmd(hdev, HCI_OP_IO_CAPABILITY_REPLY,
+ sizeof(cp), &cp);
+ } else {
+ struct hci_cp_io_capability_neg_reply cp;
+
+ bacpy(&cp.bdaddr, &ev->bdaddr);
+ cp.reason = HCI_ERROR_PAIRING_NOT_ALLOWED;
+
+ hci_send_cmd(hdev, HCI_OP_IO_CAPABILITY_NEG_REPLY,
+ sizeof(cp), &cp);
+ }
+
+unlock:
+ hci_dev_unlock(hdev);
+}
+
+static void hci_io_capa_reply_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_io_capa_reply *ev = data;
+ struct hci_conn *conn;
+
+ bt_dev_dbg(hdev, "");
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr);
+ if (!conn)
+ goto unlock;
+
+ conn->remote_cap = ev->capability;
+ conn->remote_auth = ev->authentication;
+
+unlock:
+ hci_dev_unlock(hdev);
+}
+
+static void hci_user_confirm_request_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_user_confirm_req *ev = data;
+ int loc_mitm, rem_mitm, confirm_hint = 0;
+ struct hci_conn *conn;
+
+ bt_dev_dbg(hdev, "");
+
+ hci_dev_lock(hdev);
+
+ if (!hci_dev_test_flag(hdev, HCI_MGMT))
+ goto unlock;
+
+ conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr);
+ if (!conn)
+ goto unlock;
+
+ loc_mitm = (conn->auth_type & 0x01);
+ rem_mitm = (conn->remote_auth & 0x01);
+
+ /* If we require MITM but the remote device can't provide that
+ * (it has NoInputNoOutput) then reject the confirmation
+ * request. We check the security level here since it doesn't
+ * necessarily match conn->auth_type.
+ */
+ if (conn->pending_sec_level > BT_SECURITY_MEDIUM &&
+ conn->remote_cap == HCI_IO_NO_INPUT_OUTPUT) {
+ bt_dev_dbg(hdev, "Rejecting request: remote device can't provide MITM");
+ hci_send_cmd(hdev, HCI_OP_USER_CONFIRM_NEG_REPLY,
+ sizeof(ev->bdaddr), &ev->bdaddr);
+ goto unlock;
+ }
+
+ /* If no side requires MITM protection; use JUST_CFM method */
+ if ((!loc_mitm || conn->remote_cap == HCI_IO_NO_INPUT_OUTPUT) &&
+ (!rem_mitm || conn->io_capability == HCI_IO_NO_INPUT_OUTPUT)) {
+
+ /* If we're not the initiator of request authorization and the
+ * local IO capability is not NoInputNoOutput, use JUST_WORKS
+ * method (mgmt_user_confirm with confirm_hint set to 1).
+ */
+ if (!test_bit(HCI_CONN_AUTH_PEND, &conn->flags) &&
+ conn->io_capability != HCI_IO_NO_INPUT_OUTPUT) {
+ bt_dev_dbg(hdev, "Confirming auto-accept as acceptor");
+ confirm_hint = 1;
+ goto confirm;
+ }
+
+ /* If there already exists link key in local host, leave the
+ * decision to user space since the remote device could be
+ * legitimate or malicious.
+ */
+ if (hci_find_link_key(hdev, &ev->bdaddr)) {
+ bt_dev_dbg(hdev, "Local host already has link key");
+ confirm_hint = 1;
+ goto confirm;
+ }
+
+ BT_DBG("Auto-accept of user confirmation with %ums delay",
+ hdev->auto_accept_delay);
+
+ if (hdev->auto_accept_delay > 0) {
+ int delay = msecs_to_jiffies(hdev->auto_accept_delay);
+ queue_delayed_work(conn->hdev->workqueue,
+ &conn->auto_accept_work, delay);
+ goto unlock;
+ }
- case HCI_EV_CLOCK_OFFSET:
- hci_clock_offset_evt(hdev, skb);
+ hci_send_cmd(hdev, HCI_OP_USER_CONFIRM_REPLY,
+ sizeof(ev->bdaddr), &ev->bdaddr);
+ goto unlock;
+ }
+
+confirm:
+ mgmt_user_confirm_request(hdev, &ev->bdaddr, ACL_LINK, 0,
+ le32_to_cpu(ev->passkey), confirm_hint);
+
+unlock:
+ hci_dev_unlock(hdev);
+}
+
+static void hci_user_passkey_request_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_user_passkey_req *ev = data;
+
+ bt_dev_dbg(hdev, "");
+
+ if (hci_dev_test_flag(hdev, HCI_MGMT))
+ mgmt_user_passkey_request(hdev, &ev->bdaddr, ACL_LINK, 0);
+}
+
+static void hci_user_passkey_notify_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_user_passkey_notify *ev = data;
+ struct hci_conn *conn;
+
+ bt_dev_dbg(hdev, "");
+
+ conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr);
+ if (!conn)
+ return;
+
+ conn->passkey_notify = __le32_to_cpu(ev->passkey);
+ conn->passkey_entered = 0;
+
+ if (hci_dev_test_flag(hdev, HCI_MGMT))
+ mgmt_user_passkey_notify(hdev, &conn->dst, conn->type,
+ conn->dst_type, conn->passkey_notify,
+ conn->passkey_entered);
+}
+
+static void hci_keypress_notify_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_keypress_notify *ev = data;
+ struct hci_conn *conn;
+
+ bt_dev_dbg(hdev, "");
+
+ conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr);
+ if (!conn)
+ return;
+
+ switch (ev->type) {
+ case HCI_KEYPRESS_STARTED:
+ conn->passkey_entered = 0;
+ return;
+
+ case HCI_KEYPRESS_ENTERED:
+ conn->passkey_entered++;
break;
- case HCI_EV_PSCAN_REP_MODE:
- hci_pscan_rep_mode_evt(hdev, skb);
+ case HCI_KEYPRESS_ERASED:
+ conn->passkey_entered--;
break;
- case HCI_EV_SNIFF_SUBRATE:
- hci_sniff_subrate_evt(hdev, skb);
+ case HCI_KEYPRESS_CLEARED:
+ conn->passkey_entered = 0;
break;
- case HCI_EV_CMD_STATUS:
- cs = (struct hci_ev_cmd_status *) skb->data;
- skb_pull(skb, sizeof(cs));
+ case HCI_KEYPRESS_COMPLETED:
+ return;
+ }
- opcode = __le16_to_cpu(cs->opcode);
- ogf = hci_opcode_ogf(opcode);
- ocf = hci_opcode_ocf(opcode);
+ if (hci_dev_test_flag(hdev, HCI_MGMT))
+ mgmt_user_passkey_notify(hdev, &conn->dst, conn->type,
+ conn->dst_type, conn->passkey_notify,
+ conn->passkey_entered);
+}
- switch (ogf) {
- case OGF_INFO_PARAM:
- hci_cs_info_param(hdev, ocf, cs->status);
- break;
+static void hci_simple_pair_complete_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_simple_pair_complete *ev = data;
+ struct hci_conn *conn;
- case OGF_HOST_CTL:
- hci_cs_host_ctl(hdev, ocf, cs->status);
- break;
+ bt_dev_dbg(hdev, "");
- case OGF_LINK_CTL:
- hci_cs_link_ctl(hdev, ocf, cs->status);
- break;
+ hci_dev_lock(hdev);
- case OGF_LINK_POLICY:
- hci_cs_link_policy(hdev, ocf, cs->status);
- break;
+ conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr);
+ if (!conn || !hci_conn_ssp_enabled(conn))
+ goto unlock;
- default:
- BT_DBG("%s Command Status OGF %x", hdev->name, ogf);
- break;
+ /* Reset the authentication requirement to unknown */
+ conn->remote_auth = 0xff;
+
+ /* To avoid duplicate auth_failed events to user space we check
+ * the HCI_CONN_AUTH_PEND flag which will be set if we
+ * initiated the authentication. A traditional auth_complete
+ * event gets always produced as initiator and is also mapped to
+ * the mgmt_auth_failed event */
+ if (!test_bit(HCI_CONN_AUTH_PEND, &conn->flags) && ev->status)
+ mgmt_auth_failed(conn, ev->status);
+
+ hci_conn_drop(conn);
+
+unlock:
+ hci_dev_unlock(hdev);
+}
+
+static void hci_remote_host_features_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_remote_host_features *ev = data;
+ struct inquiry_entry *ie;
+ struct hci_conn *conn;
+
+ bt_dev_dbg(hdev, "");
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr);
+ if (conn)
+ memcpy(conn->features[1], ev->features, 8);
+
+ ie = hci_inquiry_cache_lookup(hdev, &ev->bdaddr);
+ if (ie)
+ ie->data.ssp_mode = (ev->features[0] & LMP_HOST_SSP);
+
+ hci_dev_unlock(hdev);
+}
+
+static void hci_remote_oob_data_request_evt(struct hci_dev *hdev, void *edata,
+ struct sk_buff *skb)
+{
+ struct hci_ev_remote_oob_data_request *ev = edata;
+ struct oob_data *data;
+
+ bt_dev_dbg(hdev, "");
+
+ hci_dev_lock(hdev);
+
+ if (!hci_dev_test_flag(hdev, HCI_MGMT))
+ goto unlock;
+
+ data = hci_find_remote_oob_data(hdev, &ev->bdaddr, BDADDR_BREDR);
+ if (!data) {
+ struct hci_cp_remote_oob_data_neg_reply cp;
+
+ bacpy(&cp.bdaddr, &ev->bdaddr);
+ hci_send_cmd(hdev, HCI_OP_REMOTE_OOB_DATA_NEG_REPLY,
+ sizeof(cp), &cp);
+ goto unlock;
+ }
+
+ if (bredr_sc_enabled(hdev)) {
+ struct hci_cp_remote_oob_ext_data_reply cp;
+
+ bacpy(&cp.bdaddr, &ev->bdaddr);
+ if (hci_dev_test_flag(hdev, HCI_SC_ONLY)) {
+ memset(cp.hash192, 0, sizeof(cp.hash192));
+ memset(cp.rand192, 0, sizeof(cp.rand192));
+ } else {
+ memcpy(cp.hash192, data->hash192, sizeof(cp.hash192));
+ memcpy(cp.rand192, data->rand192, sizeof(cp.rand192));
}
+ memcpy(cp.hash256, data->hash256, sizeof(cp.hash256));
+ memcpy(cp.rand256, data->rand256, sizeof(cp.rand256));
- if (cs->ncmd) {
- atomic_set(&hdev->cmd_cnt, 1);
- if (!skb_queue_empty(&hdev->cmd_q))
- hci_sched_cmd(hdev);
+ hci_send_cmd(hdev, HCI_OP_REMOTE_OOB_EXT_DATA_REPLY,
+ sizeof(cp), &cp);
+ } else {
+ struct hci_cp_remote_oob_data_reply cp;
+
+ bacpy(&cp.bdaddr, &ev->bdaddr);
+ memcpy(cp.hash, data->hash192, sizeof(cp.hash));
+ memcpy(cp.rand, data->rand192, sizeof(cp.rand));
+
+ hci_send_cmd(hdev, HCI_OP_REMOTE_OOB_DATA_REPLY,
+ sizeof(cp), &cp);
+ }
+
+unlock:
+ hci_dev_unlock(hdev);
+}
+
+static void le_conn_update_addr(struct hci_conn *conn, bdaddr_t *bdaddr,
+ u8 bdaddr_type, bdaddr_t *local_rpa)
+{
+ if (conn->out) {
+ conn->dst_type = bdaddr_type;
+ conn->resp_addr_type = bdaddr_type;
+ bacpy(&conn->resp_addr, bdaddr);
+
+ /* Check if the controller has set a Local RPA then it must be
+ * used instead or hdev->rpa.
+ */
+ if (local_rpa && bacmp(local_rpa, BDADDR_ANY)) {
+ conn->init_addr_type = ADDR_LE_DEV_RANDOM;
+ bacpy(&conn->init_addr, local_rpa);
+ } else if (hci_dev_test_flag(conn->hdev, HCI_PRIVACY)) {
+ conn->init_addr_type = ADDR_LE_DEV_RANDOM;
+ bacpy(&conn->init_addr, &conn->hdev->rpa);
+ } else {
+ hci_copy_identity_address(conn->hdev, &conn->init_addr,
+ &conn->init_addr_type);
+ }
+ } else {
+ conn->resp_addr_type = conn->hdev->adv_addr_type;
+ /* Check if the controller has set a Local RPA then it must be
+ * used instead or hdev->rpa.
+ */
+ if (local_rpa && bacmp(local_rpa, BDADDR_ANY)) {
+ conn->resp_addr_type = ADDR_LE_DEV_RANDOM;
+ bacpy(&conn->resp_addr, local_rpa);
+ } else if (conn->hdev->adv_addr_type == ADDR_LE_DEV_RANDOM) {
+ /* In case of ext adv, resp_addr will be updated in
+ * Adv Terminated event.
+ */
+ if (!ext_adv_capable(conn->hdev))
+ bacpy(&conn->resp_addr,
+ &conn->hdev->random_addr);
+ } else {
+ bacpy(&conn->resp_addr, &conn->hdev->bdaddr);
+ }
+
+ conn->init_addr_type = bdaddr_type;
+ bacpy(&conn->init_addr, bdaddr);
+
+ /* For incoming connections, set the default minimum
+ * and maximum connection interval. They will be used
+ * to check if the parameters are in range and if not
+ * trigger the connection update procedure.
+ */
+ conn->le_conn_min_interval = conn->hdev->le_conn_min_interval;
+ conn->le_conn_max_interval = conn->hdev->le_conn_max_interval;
+ }
+}
+
+static void le_conn_complete_evt(struct hci_dev *hdev, u8 status,
+ bdaddr_t *bdaddr, u8 bdaddr_type,
+ bdaddr_t *local_rpa, u8 role, u16 handle,
+ u16 interval, u16 latency,
+ u16 supervision_timeout)
+{
+ struct hci_conn_params *params;
+ struct hci_conn *conn;
+ struct smp_irk *irk;
+ u8 addr_type;
+ int err;
+
+ hci_dev_lock(hdev);
+
+ /* All controllers implicitly stop advertising in the event of a
+ * connection, so ensure that the state bit is cleared.
+ */
+ hci_dev_clear_flag(hdev, HCI_LE_ADV);
+
+ /* Check for existing connection:
+ *
+ * 1. If it doesn't exist then use the role to create a new object.
+ * 2. If it does exist confirm that it is connecting/BT_CONNECT in case
+ * of initiator/master role since there could be a collision where
+ * either side is attempting to connect or something like a fuzzing
+ * testing is trying to play tricks to destroy the hcon object before
+ * it even attempts to connect (e.g. hcon->state == BT_OPEN).
+ */
+ conn = hci_conn_hash_lookup_role(hdev, LE_LINK, role, bdaddr);
+ if (!conn ||
+ (conn->role == HCI_ROLE_MASTER && conn->state != BT_CONNECT)) {
+ /* In case of error status and there is no connection pending
+ * just unlock as there is nothing to cleanup.
+ */
+ if (status)
+ goto unlock;
+
+ conn = hci_conn_add_unset(hdev, LE_LINK, bdaddr, bdaddr_type,
+ role);
+ if (IS_ERR(conn)) {
+ bt_dev_err(hdev, "connection err: %ld", PTR_ERR(conn));
+ goto unlock;
+ }
+
+ /* If we didn't have a hci_conn object previously
+ * but we're in central role this must be something
+ * initiated using an accept list. Since accept list based
+ * connections are not "first class citizens" we don't
+ * have full tracking of them. Therefore, we go ahead
+ * with a "best effort" approach of determining the
+ * initiator address based on the HCI_PRIVACY flag.
+ */
+ if (conn->out) {
+ conn->resp_addr_type = bdaddr_type;
+ bacpy(&conn->resp_addr, bdaddr);
+ if (hci_dev_test_flag(hdev, HCI_PRIVACY)) {
+ conn->init_addr_type = ADDR_LE_DEV_RANDOM;
+ bacpy(&conn->init_addr, &hdev->rpa);
+ } else {
+ hci_copy_identity_address(hdev,
+ &conn->init_addr,
+ &conn->init_addr_type);
+ }
+ }
+ } else {
+ cancel_delayed_work(&conn->le_conn_timeout);
+ }
+
+ /* The HCI_LE_Connection_Complete event is only sent once per connection.
+ * Processing it more than once per connection can corrupt kernel memory.
+ *
+ * As the connection handle is set here for the first time, it indicates
+ * whether the connection is already set up.
+ */
+ if (!HCI_CONN_HANDLE_UNSET(conn->handle)) {
+ bt_dev_err(hdev, "Ignoring HCI_Connection_Complete for existing connection");
+ goto unlock;
+ }
+
+ le_conn_update_addr(conn, bdaddr, bdaddr_type, local_rpa);
+
+ /* Lookup the identity address from the stored connection
+ * address and address type.
+ *
+ * When establishing connections to an identity address, the
+ * connection procedure will store the resolvable random
+ * address first. Now if it can be converted back into the
+ * identity address, start using the identity address from
+ * now on.
+ */
+ irk = hci_get_irk(hdev, &conn->dst, conn->dst_type);
+ if (irk) {
+ bacpy(&conn->dst, &irk->bdaddr);
+ conn->dst_type = irk->addr_type;
+ }
+
+ conn->dst_type = ev_bdaddr_type(hdev, conn->dst_type, NULL);
+
+ /* All connection failure handling is taken care of by the
+ * hci_conn_failed function which is triggered by the HCI
+ * request completion callbacks used for connecting.
+ */
+ if (status || hci_conn_set_handle(conn, handle))
+ goto unlock;
+
+ /* Drop the connection if it has been aborted */
+ if (test_bit(HCI_CONN_CANCEL, &conn->flags)) {
+ hci_conn_drop(conn);
+ goto unlock;
+ }
+
+ if (conn->dst_type == ADDR_LE_DEV_PUBLIC)
+ addr_type = BDADDR_LE_PUBLIC;
+ else
+ addr_type = BDADDR_LE_RANDOM;
+
+ /* Drop the connection if the device is blocked */
+ if (hci_bdaddr_list_lookup(&hdev->reject_list, &conn->dst, addr_type)) {
+ hci_conn_drop(conn);
+ goto unlock;
+ }
+
+ mgmt_device_connected(hdev, conn, NULL, 0);
+
+ conn->sec_level = BT_SECURITY_LOW;
+ conn->state = BT_CONFIG;
+
+ /* Store current advertising instance as connection advertising instance
+ * when software rotation is in use so it can be re-enabled when
+ * disconnected.
+ */
+ if (!ext_adv_capable(hdev))
+ conn->adv_instance = hdev->cur_adv_instance;
+
+ conn->le_conn_interval = interval;
+ conn->le_conn_latency = latency;
+ conn->le_supv_timeout = supervision_timeout;
+
+ hci_debugfs_create_conn(conn);
+ hci_conn_add_sysfs(conn);
+
+ err = hci_le_read_remote_features(conn);
+ if (err) {
+ conn->state = BT_CONNECTED;
+ hci_connect_cfm(conn, status);
+ }
+
+ params = hci_pend_le_action_lookup(&hdev->pend_le_conns, &conn->dst,
+ conn->dst_type);
+ if (params) {
+ hci_pend_le_list_del_init(params);
+ if (params->conn) {
+ hci_conn_drop(params->conn);
+ hci_conn_put(params->conn);
+ params->conn = NULL;
+ }
+ }
+
+unlock:
+ hci_update_passive_scan(hdev);
+ hci_dev_unlock(hdev);
+}
+
+static void hci_le_conn_complete_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_le_conn_complete *ev = data;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", ev->status);
+
+ le_conn_complete_evt(hdev, ev->status, &ev->bdaddr, ev->bdaddr_type,
+ NULL, ev->role, le16_to_cpu(ev->handle),
+ le16_to_cpu(ev->interval),
+ le16_to_cpu(ev->latency),
+ le16_to_cpu(ev->supervision_timeout));
+}
+
+static void hci_le_enh_conn_complete_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_le_enh_conn_complete *ev = data;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", ev->status);
+
+ le_conn_complete_evt(hdev, ev->status, &ev->bdaddr, ev->bdaddr_type,
+ &ev->local_rpa, ev->role, le16_to_cpu(ev->handle),
+ le16_to_cpu(ev->interval),
+ le16_to_cpu(ev->latency),
+ le16_to_cpu(ev->supervision_timeout));
+}
+
+static void hci_le_pa_sync_lost_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_le_pa_sync_lost *ev = data;
+ u16 handle = le16_to_cpu(ev->handle);
+ struct hci_conn *conn;
+
+ bt_dev_dbg(hdev, "sync handle 0x%4.4x", handle);
+
+ hci_dev_lock(hdev);
+
+ /* Delete the pa sync connection */
+ conn = hci_conn_hash_lookup_pa_sync_handle(hdev, handle);
+ if (conn) {
+ clear_bit(HCI_CONN_BIG_SYNC, &conn->flags);
+ clear_bit(HCI_CONN_PA_SYNC, &conn->flags);
+ hci_disconn_cfm(conn, HCI_ERROR_REMOTE_USER_TERM);
+ hci_conn_del(conn);
+ }
+
+ hci_dev_unlock(hdev);
+}
+
+static void hci_le_ext_adv_term_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_evt_le_ext_adv_set_term *ev = data;
+ struct hci_conn *conn;
+ struct adv_info *adv, *n;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", ev->status);
+
+ /* The Bluetooth Core 5.3 specification clearly states that this event
+ * shall not be sent when the Host disables the advertising set. So in
+ * case of HCI_ERROR_CANCELLED_BY_HOST, just ignore the event.
+ *
+ * When the Host disables an advertising set, all cleanup is done via
+ * its command callback and not needed to be duplicated here.
+ */
+ if (ev->status == HCI_ERROR_CANCELLED_BY_HOST) {
+ bt_dev_warn_ratelimited(hdev, "Unexpected advertising set terminated event");
+ return;
+ }
+
+ hci_dev_lock(hdev);
+
+ adv = hci_find_adv_instance(hdev, ev->handle);
+
+ if (ev->status) {
+ if (!adv)
+ goto unlock;
+
+ /* Remove advertising as it has been terminated */
+ hci_remove_adv_instance(hdev, ev->handle);
+ mgmt_advertising_removed(NULL, hdev, ev->handle);
+
+ list_for_each_entry_safe(adv, n, &hdev->adv_instances, list) {
+ if (adv->enabled)
+ goto unlock;
}
+
+ /* We are no longer advertising, clear HCI_LE_ADV */
+ hci_dev_clear_flag(hdev, HCI_LE_ADV);
+ goto unlock;
+ }
+
+ if (adv)
+ adv->enabled = false;
+
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->conn_handle));
+ if (conn) {
+ /* Store handle in the connection so the correct advertising
+ * instance can be re-enabled when disconnected.
+ */
+ conn->adv_instance = ev->handle;
+
+ if (hdev->adv_addr_type != ADDR_LE_DEV_RANDOM ||
+ bacmp(&conn->resp_addr, BDADDR_ANY))
+ goto unlock;
+
+ if (!ev->handle) {
+ bacpy(&conn->resp_addr, &hdev->random_addr);
+ goto unlock;
+ }
+
+ if (adv)
+ bacpy(&conn->resp_addr, &adv->random_addr);
+ }
+
+unlock:
+ hci_dev_unlock(hdev);
+}
+
+static int hci_le_pa_term_sync(struct hci_dev *hdev, __le16 handle)
+{
+ struct hci_cp_le_pa_term_sync cp;
+
+ memset(&cp, 0, sizeof(cp));
+ cp.handle = handle;
+
+ return hci_send_cmd(hdev, HCI_OP_LE_PA_TERM_SYNC, sizeof(cp), &cp);
+}
+
+static void hci_le_past_received_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_le_past_received *ev = data;
+ int mask = hdev->link_mode;
+ __u8 flags = 0;
+ struct hci_conn *pa_sync, *conn;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", ev->status);
+
+ hci_dev_lock(hdev);
+
+ hci_dev_clear_flag(hdev, HCI_PA_SYNC);
+
+ conn = hci_conn_hash_lookup_create_pa_sync(hdev);
+ if (!conn) {
+ bt_dev_err(hdev,
+ "Unable to find connection for dst %pMR sid 0x%2.2x",
+ &ev->bdaddr, ev->sid);
+ goto unlock;
+ }
+
+ conn->sync_handle = le16_to_cpu(ev->sync_handle);
+ conn->sid = HCI_SID_INVALID;
+
+ mask |= hci_proto_connect_ind(hdev, &ev->bdaddr, PA_LINK,
+ &flags);
+ if (!(mask & HCI_LM_ACCEPT)) {
+ hci_le_pa_term_sync(hdev, ev->sync_handle);
+ goto unlock;
+ }
+
+ if (!(flags & HCI_PROTO_DEFER))
+ goto unlock;
+
+ /* Add connection to indicate PA sync event */
+ pa_sync = hci_conn_add_unset(hdev, PA_LINK, BDADDR_ANY, 0,
+ HCI_ROLE_SLAVE);
+
+ if (IS_ERR(pa_sync))
+ goto unlock;
+
+ pa_sync->sync_handle = le16_to_cpu(ev->sync_handle);
+
+ if (ev->status) {
+ set_bit(HCI_CONN_PA_SYNC_FAILED, &pa_sync->flags);
+
+ /* Notify iso layer */
+ hci_connect_cfm(pa_sync, ev->status);
+ }
+
+unlock:
+ hci_dev_unlock(hdev);
+}
+
+static void hci_le_conn_update_complete_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_le_conn_update_complete *ev = data;
+ struct hci_conn *conn;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", ev->status);
+
+ if (ev->status)
+ return;
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle));
+ if (conn) {
+ conn->le_conn_interval = le16_to_cpu(ev->interval);
+ conn->le_conn_latency = le16_to_cpu(ev->latency);
+ conn->le_supv_timeout = le16_to_cpu(ev->supervision_timeout);
+ }
+
+ hci_dev_unlock(hdev);
+}
+
+/* This function requires the caller holds hdev->lock */
+static struct hci_conn *check_pending_le_conn(struct hci_dev *hdev,
+ bdaddr_t *addr,
+ u8 addr_type, bool addr_resolved,
+ u8 adv_type, u8 phy, u8 sec_phy)
+{
+ struct hci_conn *conn;
+ struct hci_conn_params *params;
+
+ /* If the event is not connectable don't proceed further */
+ if (adv_type != LE_ADV_IND && adv_type != LE_ADV_DIRECT_IND)
+ return NULL;
+
+ /* Ignore if the device is blocked or hdev is suspended */
+ if (hci_bdaddr_list_lookup(&hdev->reject_list, addr, addr_type) ||
+ hdev->suspended)
+ return NULL;
+
+ /* Most controller will fail if we try to create new connections
+ * while we have an existing one in peripheral role.
+ */
+ if (hdev->conn_hash.le_num_peripheral > 0 &&
+ (hci_test_quirk(hdev, HCI_QUIRK_BROKEN_LE_STATES) ||
+ !(hdev->le_states[3] & 0x10)))
+ return NULL;
+
+ /* If we're not connectable only connect devices that we have in
+ * our pend_le_conns list.
+ */
+ params = hci_pend_le_action_lookup(&hdev->pend_le_conns, addr,
+ addr_type);
+ if (!params)
+ return NULL;
+
+ if (!params->explicit_connect) {
+ switch (params->auto_connect) {
+ case HCI_AUTO_CONN_DIRECT:
+ /* Only devices advertising with ADV_DIRECT_IND are
+ * triggering a connection attempt. This is allowing
+ * incoming connections from peripheral devices.
+ */
+ if (adv_type != LE_ADV_DIRECT_IND)
+ return NULL;
+ break;
+ case HCI_AUTO_CONN_ALWAYS:
+ /* Devices advertising with ADV_IND or ADV_DIRECT_IND
+ * are triggering a connection attempt. This means
+ * that incoming connections from peripheral device are
+ * accepted and also outgoing connections to peripheral
+ * devices are established when found.
+ */
+ break;
+ default:
+ return NULL;
+ }
+ }
+
+ conn = hci_connect_le(hdev, addr, addr_type, addr_resolved,
+ BT_SECURITY_LOW, hdev->def_le_autoconnect_timeout,
+ HCI_ROLE_MASTER, phy, sec_phy);
+ if (!IS_ERR(conn)) {
+ /* If HCI_AUTO_CONN_EXPLICIT is set, conn is already owned
+ * by higher layer that tried to connect, if no then
+ * store the pointer since we don't really have any
+ * other owner of the object besides the params that
+ * triggered it. This way we can abort the connection if
+ * the parameters get removed and keep the reference
+ * count consistent once the connection is established.
+ */
+
+ if (!params->explicit_connect)
+ params->conn = hci_conn_get(conn);
+
+ return conn;
+ }
+
+ switch (PTR_ERR(conn)) {
+ case -EBUSY:
+ /* If hci_connect() returns -EBUSY it means there is already
+ * an LE connection attempt going on. Since controllers don't
+ * support more than one connection attempt at the time, we
+ * don't consider this an error case.
+ */
break;
+ default:
+ BT_DBG("Failed to connect: err %ld", PTR_ERR(conn));
+ return NULL;
+ }
- case HCI_EV_CMD_COMPLETE:
- ec = (struct hci_ev_cmd_complete *) skb->data;
- skb_pull(skb, sizeof(*ec));
+ return NULL;
+}
- opcode = __le16_to_cpu(ec->opcode);
- ogf = hci_opcode_ogf(opcode);
- ocf = hci_opcode_ocf(opcode);
+static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr,
+ u8 bdaddr_type, bdaddr_t *direct_addr,
+ u8 direct_addr_type, u8 phy, u8 sec_phy, s8 rssi,
+ u8 *data, u8 len, bool ext_adv, bool ctl_time,
+ u64 instant)
+{
+ struct discovery_state *d = &hdev->discovery;
+ struct smp_irk *irk;
+ struct hci_conn *conn;
+ bool match, bdaddr_resolved;
+ u32 flags;
+ u8 *ptr;
+
+ switch (type) {
+ case LE_ADV_IND:
+ case LE_ADV_DIRECT_IND:
+ case LE_ADV_SCAN_IND:
+ case LE_ADV_NONCONN_IND:
+ case LE_ADV_SCAN_RSP:
+ break;
+ default:
+ bt_dev_err_ratelimited(hdev, "unknown advertising packet "
+ "type: 0x%02x", type);
+ return;
+ }
- switch (ogf) {
- case OGF_INFO_PARAM:
- hci_cc_info_param(hdev, ocf, skb);
+ if (len > max_adv_len(hdev)) {
+ bt_dev_err_ratelimited(hdev,
+ "adv larger than maximum supported");
+ return;
+ }
+
+ /* Find the end of the data in case the report contains padded zero
+ * bytes at the end causing an invalid length value.
+ *
+ * When data is NULL, len is 0 so there is no need for extra ptr
+ * check as 'ptr < data + 0' is already false in such case.
+ */
+ for (ptr = data; ptr < data + len && *ptr; ptr += *ptr + 1) {
+ if (ptr + 1 + *ptr > data + len)
break;
+ }
+
+ /* Adjust for actual length. This handles the case when remote
+ * device is advertising with incorrect data length.
+ */
+ len = ptr - data;
+
+ /* If the direct address is present, then this report is from
+ * a LE Direct Advertising Report event. In that case it is
+ * important to see if the address is matching the local
+ * controller address.
+ *
+ * If local privacy is not enable the controller shall not be
+ * generating such event since according to its documentation it is only
+ * valid for filter_policy 0x02 and 0x03, but the fact that it did
+ * generate LE Direct Advertising Report means it is probably broken and
+ * won't generate any other event which can potentially break
+ * auto-connect logic so in case local privacy is not enable this
+ * ignores the direct_addr so it works as a regular report.
+ */
+ if (!hci_dev_test_flag(hdev, HCI_MESH) && direct_addr &&
+ hci_dev_test_flag(hdev, HCI_PRIVACY)) {
+ direct_addr_type = ev_bdaddr_type(hdev, direct_addr_type,
+ &bdaddr_resolved);
+
+ /* Only resolvable random addresses are valid for these
+ * kind of reports and others can be ignored.
+ */
+ if (!hci_bdaddr_is_rpa(direct_addr, direct_addr_type))
+ return;
+
+ /* If the local IRK of the controller does not match
+ * with the resolvable random address provided, then
+ * this report can be ignored.
+ */
+ if (!smp_irk_matches(hdev, hdev->irk, direct_addr))
+ return;
+ }
+
+ /* Check if we need to convert to identity address */
+ irk = hci_get_irk(hdev, bdaddr, bdaddr_type);
+ if (irk) {
+ bdaddr = &irk->bdaddr;
+ bdaddr_type = irk->addr_type;
+ }
+
+ bdaddr_type = ev_bdaddr_type(hdev, bdaddr_type, &bdaddr_resolved);
+
+ /* Check if we have been requested to connect to this device.
+ *
+ * direct_addr is set only for directed advertising reports (it is NULL
+ * for advertising reports) and is already verified to be RPA above.
+ */
+ conn = check_pending_le_conn(hdev, bdaddr, bdaddr_type, bdaddr_resolved,
+ type, phy, sec_phy);
+ if (!ext_adv && conn && type == LE_ADV_IND &&
+ len <= max_adv_len(hdev)) {
+ /* Store report for later inclusion by
+ * mgmt_device_connected
+ */
+ memcpy(conn->le_adv_data, data, len);
+ conn->le_adv_data_len = len;
+ }
+
+ if (type == LE_ADV_NONCONN_IND || type == LE_ADV_SCAN_IND)
+ flags = MGMT_DEV_FOUND_NOT_CONNECTABLE;
+ else
+ flags = 0;
- case OGF_HOST_CTL:
- hci_cc_host_ctl(hdev, ocf, skb);
+ /* All scan results should be sent up for Mesh systems */
+ if (hci_dev_test_flag(hdev, HCI_MESH)) {
+ mgmt_device_found(hdev, bdaddr, LE_LINK, bdaddr_type, NULL,
+ rssi, flags, data, len, NULL, 0, instant);
+ return;
+ }
+
+ /* Passive scanning shouldn't trigger any device found events,
+ * except for devices marked as CONN_REPORT for which we do send
+ * device found events, or advertisement monitoring requested.
+ */
+ if (hdev->le_scan_type == LE_SCAN_PASSIVE) {
+ if (type == LE_ADV_DIRECT_IND)
+ return;
+
+ if (!hci_pend_le_action_lookup(&hdev->pend_le_reports,
+ bdaddr, bdaddr_type) &&
+ idr_is_empty(&hdev->adv_monitors_idr))
+ return;
+
+ mgmt_device_found(hdev, bdaddr, LE_LINK, bdaddr_type, NULL,
+ rssi, flags, data, len, NULL, 0, 0);
+ return;
+ }
+
+ /* When receiving a scan response, then there is no way to
+ * know if the remote device is connectable or not. However
+ * since scan responses are merged with a previously seen
+ * advertising report, the flags field from that report
+ * will be used.
+ *
+ * In the unlikely case that a controller just sends a scan
+ * response event that doesn't match the pending report, then
+ * it is marked as a standalone SCAN_RSP.
+ */
+ if (type == LE_ADV_SCAN_RSP)
+ flags = MGMT_DEV_FOUND_SCAN_RSP;
+
+ /* If there's nothing pending either store the data from this
+ * event or send an immediate device found event if the data
+ * should not be stored for later.
+ */
+ if (!has_pending_adv_report(hdev)) {
+ /* If the report will trigger a SCAN_REQ store it for
+ * later merging.
+ */
+ if (!ext_adv && (type == LE_ADV_IND ||
+ type == LE_ADV_SCAN_IND)) {
+ store_pending_adv_report(hdev, bdaddr, bdaddr_type,
+ rssi, flags, data, len);
+ return;
+ }
+
+ mgmt_device_found(hdev, bdaddr, LE_LINK, bdaddr_type, NULL,
+ rssi, flags, data, len, NULL, 0, 0);
+ return;
+ }
+
+ /* Check if the pending report is for the same device as the new one */
+ match = (!bacmp(bdaddr, &d->last_adv_addr) &&
+ bdaddr_type == d->last_adv_addr_type);
+
+ /* If the pending data doesn't match this report or this isn't a
+ * scan response (e.g. we got a duplicate ADV_IND) then force
+ * sending of the pending data.
+ */
+ if (type != LE_ADV_SCAN_RSP || !match) {
+ /* Send out whatever is in the cache, but skip duplicates */
+ if (!match)
+ mgmt_device_found(hdev, &d->last_adv_addr, LE_LINK,
+ d->last_adv_addr_type, NULL,
+ d->last_adv_rssi, d->last_adv_flags,
+ d->last_adv_data,
+ d->last_adv_data_len, NULL, 0, 0);
+
+ /* If the new report will trigger a SCAN_REQ store it for
+ * later merging.
+ */
+ if (!ext_adv && (type == LE_ADV_IND ||
+ type == LE_ADV_SCAN_IND)) {
+ store_pending_adv_report(hdev, bdaddr, bdaddr_type,
+ rssi, flags, data, len);
+ return;
+ }
+
+ /* The advertising reports cannot be merged, so clear
+ * the pending report and send out a device found event.
+ */
+ clear_pending_adv_report(hdev);
+ mgmt_device_found(hdev, bdaddr, LE_LINK, bdaddr_type, NULL,
+ rssi, flags, data, len, NULL, 0, 0);
+ return;
+ }
+
+ /* If we get here we've got a pending ADV_IND or ADV_SCAN_IND and
+ * the new event is a SCAN_RSP. We can therefore proceed with
+ * sending a merged device found event.
+ */
+ mgmt_device_found(hdev, &d->last_adv_addr, LE_LINK,
+ d->last_adv_addr_type, NULL, rssi, d->last_adv_flags,
+ d->last_adv_data, d->last_adv_data_len, data, len, 0);
+ clear_pending_adv_report(hdev);
+}
+
+static void hci_le_adv_report_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_le_advertising_report *ev = data;
+ u64 instant = jiffies;
+
+ if (!ev->num)
+ return;
+
+ hci_dev_lock(hdev);
+
+ while (ev->num--) {
+ struct hci_ev_le_advertising_info *info;
+ s8 rssi;
+
+ info = hci_le_ev_skb_pull(hdev, skb,
+ HCI_EV_LE_ADVERTISING_REPORT,
+ sizeof(*info));
+ if (!info)
break;
- case OGF_LINK_CTL:
- hci_cc_link_ctl(hdev, ocf, skb);
+ if (!hci_le_ev_skb_pull(hdev, skb, HCI_EV_LE_ADVERTISING_REPORT,
+ info->length + 1))
break;
- case OGF_LINK_POLICY:
- hci_cc_link_policy(hdev, ocf, skb);
+ if (info->length <= max_adv_len(hdev)) {
+ rssi = info->data[info->length];
+ process_adv_report(hdev, info->type, &info->bdaddr,
+ info->bdaddr_type, NULL, 0,
+ HCI_ADV_PHY_1M, 0, rssi,
+ info->data, info->length, false,
+ false, instant);
+ } else {
+ bt_dev_err(hdev, "Dropping invalid advertising data");
+ }
+ }
+
+ hci_dev_unlock(hdev);
+}
+
+static u8 ext_evt_type_to_legacy(struct hci_dev *hdev, u16 evt_type)
+{
+ u16 pdu_type = evt_type & ~LE_EXT_ADV_DATA_STATUS_MASK;
+
+ if (!pdu_type)
+ return LE_ADV_NONCONN_IND;
+
+ if (evt_type & LE_EXT_ADV_LEGACY_PDU) {
+ switch (evt_type) {
+ case LE_LEGACY_ADV_IND:
+ return LE_ADV_IND;
+ case LE_LEGACY_ADV_DIRECT_IND:
+ return LE_ADV_DIRECT_IND;
+ case LE_LEGACY_ADV_SCAN_IND:
+ return LE_ADV_SCAN_IND;
+ case LE_LEGACY_NONCONN_IND:
+ return LE_ADV_NONCONN_IND;
+ case LE_LEGACY_SCAN_RSP_ADV:
+ case LE_LEGACY_SCAN_RSP_ADV_SCAN:
+ return LE_ADV_SCAN_RSP;
+ }
+
+ goto invalid;
+ }
+
+ if (evt_type & LE_EXT_ADV_CONN_IND) {
+ if (evt_type & LE_EXT_ADV_DIRECT_IND)
+ return LE_ADV_DIRECT_IND;
+
+ return LE_ADV_IND;
+ }
+
+ if (evt_type & LE_EXT_ADV_SCAN_RSP)
+ return LE_ADV_SCAN_RSP;
+
+ if (evt_type & LE_EXT_ADV_SCAN_IND)
+ return LE_ADV_SCAN_IND;
+
+ if (evt_type & LE_EXT_ADV_DIRECT_IND)
+ return LE_ADV_NONCONN_IND;
+
+invalid:
+ bt_dev_err_ratelimited(hdev, "Unknown advertising packet type: 0x%02x",
+ evt_type);
+
+ return LE_ADV_INVALID;
+}
+
+static void hci_le_ext_adv_report_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_le_ext_adv_report *ev = data;
+ u64 instant = jiffies;
+
+ if (!ev->num)
+ return;
+
+ hci_dev_lock(hdev);
+
+ while (ev->num--) {
+ struct hci_ev_le_ext_adv_info *info;
+ u8 legacy_evt_type;
+ u16 evt_type;
+
+ info = hci_le_ev_skb_pull(hdev, skb, HCI_EV_LE_EXT_ADV_REPORT,
+ sizeof(*info));
+ if (!info)
break;
- default:
- BT_DBG("%s Command Completed OGF %x", hdev->name, ogf);
+ if (!hci_le_ev_skb_pull(hdev, skb, HCI_EV_LE_EXT_ADV_REPORT,
+ info->length))
break;
+
+ evt_type = __le16_to_cpu(info->type) & LE_EXT_ADV_EVT_TYPE_MASK;
+ legacy_evt_type = ext_evt_type_to_legacy(hdev, evt_type);
+
+ if (hci_test_quirk(hdev,
+ HCI_QUIRK_FIXUP_LE_EXT_ADV_REPORT_PHY)) {
+ info->primary_phy &= 0x1f;
+ info->secondary_phy &= 0x1f;
}
- if (ec->ncmd) {
- atomic_set(&hdev->cmd_cnt, 1);
- if (!skb_queue_empty(&hdev->cmd_q))
- hci_sched_cmd(hdev);
+ /* Check if PA Sync is pending and if the hci_conn SID has not
+ * been set update it.
+ */
+ if (hci_dev_test_flag(hdev, HCI_PA_SYNC)) {
+ struct hci_conn *conn;
+
+ conn = hci_conn_hash_lookup_create_pa_sync(hdev);
+ if (conn && conn->sid == HCI_SID_INVALID)
+ conn->sid = info->sid;
+ }
+
+ if (legacy_evt_type != LE_ADV_INVALID) {
+ process_adv_report(hdev, legacy_evt_type, &info->bdaddr,
+ info->bdaddr_type, NULL, 0,
+ info->primary_phy,
+ info->secondary_phy,
+ info->rssi, info->data, info->length,
+ !(evt_type & LE_EXT_ADV_LEGACY_PDU),
+ false, instant);
}
+ }
+
+ hci_dev_unlock(hdev);
+}
+
+static void hci_le_pa_sync_established_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_le_pa_sync_established *ev = data;
+ int mask = hdev->link_mode;
+ __u8 flags = 0;
+ struct hci_conn *pa_sync, *conn;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", ev->status);
+
+ hci_dev_lock(hdev);
+
+ hci_dev_clear_flag(hdev, HCI_PA_SYNC);
+
+ conn = hci_conn_hash_lookup_create_pa_sync(hdev);
+ if (!conn) {
+ bt_dev_err(hdev,
+ "Unable to find connection for dst %pMR sid 0x%2.2x",
+ &ev->bdaddr, ev->sid);
+ goto unlock;
+ }
+
+ clear_bit(HCI_CONN_CREATE_PA_SYNC, &conn->flags);
+
+ conn->sync_handle = le16_to_cpu(ev->handle);
+ conn->sid = HCI_SID_INVALID;
+
+ mask |= hci_proto_connect_ind(hdev, &ev->bdaddr, PA_LINK,
+ &flags);
+ if (!(mask & HCI_LM_ACCEPT)) {
+ hci_le_pa_term_sync(hdev, ev->handle);
+ goto unlock;
+ }
+
+ if (!(flags & HCI_PROTO_DEFER))
+ goto unlock;
+
+ /* Add connection to indicate PA sync event */
+ pa_sync = hci_conn_add_unset(hdev, PA_LINK, BDADDR_ANY, 0,
+ HCI_ROLE_SLAVE);
+
+ if (IS_ERR(pa_sync))
+ goto unlock;
+
+ pa_sync->sync_handle = le16_to_cpu(ev->handle);
+
+ if (ev->status) {
+ set_bit(HCI_CONN_PA_SYNC_FAILED, &pa_sync->flags);
+
+ /* Notify iso layer */
+ hci_connect_cfm(pa_sync, ev->status);
+ }
+
+unlock:
+ hci_dev_unlock(hdev);
+}
+
+static void hci_le_per_adv_report_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_le_per_adv_report *ev = data;
+ int mask = hdev->link_mode;
+ __u8 flags = 0;
+ struct hci_conn *pa_sync;
+
+ bt_dev_dbg(hdev, "sync_handle 0x%4.4x", le16_to_cpu(ev->sync_handle));
+
+ hci_dev_lock(hdev);
+
+ mask |= hci_proto_connect_ind(hdev, BDADDR_ANY, PA_LINK, &flags);
+ if (!(mask & HCI_LM_ACCEPT))
+ goto unlock;
+
+ if (!(flags & HCI_PROTO_DEFER))
+ goto unlock;
+
+ pa_sync = hci_conn_hash_lookup_pa_sync_handle
+ (hdev,
+ le16_to_cpu(ev->sync_handle));
+
+ if (!pa_sync)
+ goto unlock;
+
+ if (ev->data_status == LE_PA_DATA_COMPLETE &&
+ !test_and_set_bit(HCI_CONN_PA_SYNC, &pa_sync->flags)) {
+ /* Notify iso layer */
+ hci_connect_cfm(pa_sync, 0);
+
+ /* Notify MGMT layer */
+ mgmt_device_connected(hdev, pa_sync, NULL, 0);
+ }
+
+unlock:
+ hci_dev_unlock(hdev);
+}
+
+static void hci_le_remote_feat_complete_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_le_remote_feat_complete *ev = data;
+ struct hci_conn *conn;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", ev->status);
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle));
+ if (conn) {
+ if (!ev->status)
+ memcpy(conn->features[0], ev->features, 8);
+
+ if (conn->state == BT_CONFIG) {
+ __u8 status;
+
+ /* If the local controller supports peripheral-initiated
+ * features exchange, but the remote controller does
+ * not, then it is possible that the error code 0x1a
+ * for unsupported remote feature gets returned.
+ *
+ * In this specific case, allow the connection to
+ * transition into connected state and mark it as
+ * successful.
+ */
+ if (!conn->out && ev->status == HCI_ERROR_UNSUPPORTED_REMOTE_FEATURE &&
+ (hdev->le_features[0] & HCI_LE_PERIPHERAL_FEATURES))
+ status = 0x00;
+ else
+ status = ev->status;
+
+ conn->state = BT_CONNECTED;
+ hci_connect_cfm(conn, status);
+ }
+ }
+
+ hci_dev_unlock(hdev);
+}
+
+static void hci_le_ltk_request_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_le_ltk_req *ev = data;
+ struct hci_cp_le_ltk_reply cp;
+ struct hci_cp_le_ltk_neg_reply neg;
+ struct hci_conn *conn;
+ struct smp_ltk *ltk;
+
+ bt_dev_dbg(hdev, "handle 0x%4.4x", __le16_to_cpu(ev->handle));
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle));
+ if (conn == NULL)
+ goto not_found;
+
+ ltk = hci_find_ltk(hdev, &conn->dst, conn->dst_type, conn->role);
+ if (!ltk)
+ goto not_found;
+
+ if (smp_ltk_is_sc(ltk)) {
+ /* With SC both EDiv and Rand are set to zero */
+ if (ev->ediv || ev->rand)
+ goto not_found;
+ } else {
+ /* For non-SC keys check that EDiv and Rand match */
+ if (ev->ediv != ltk->ediv || ev->rand != ltk->rand)
+ goto not_found;
+ }
+
+ memcpy(cp.ltk, ltk->val, ltk->enc_size);
+ memset(cp.ltk + ltk->enc_size, 0, sizeof(cp.ltk) - ltk->enc_size);
+ cp.handle = cpu_to_le16(conn->handle);
+
+ conn->pending_sec_level = smp_ltk_sec_level(ltk);
+
+ conn->enc_key_size = ltk->enc_size;
+
+ hci_send_cmd(hdev, HCI_OP_LE_LTK_REPLY, sizeof(cp), &cp);
+
+ /* Ref. Bluetooth Core SPEC pages 1975 and 2004. STK is a
+ * temporary key used to encrypt a connection following
+ * pairing. It is used during the Encrypted Session Setup to
+ * distribute the keys. Later, security can be re-established
+ * using a distributed LTK.
+ */
+ if (ltk->type == SMP_STK) {
+ set_bit(HCI_CONN_STK_ENCRYPT, &conn->flags);
+ list_del_rcu(&ltk->list);
+ kfree_rcu(ltk, rcu);
+ } else {
+ clear_bit(HCI_CONN_STK_ENCRYPT, &conn->flags);
+ }
+
+ hci_dev_unlock(hdev);
+
+ return;
+
+not_found:
+ neg.handle = ev->handle;
+ hci_send_cmd(hdev, HCI_OP_LE_LTK_NEG_REPLY, sizeof(neg), &neg);
+ hci_dev_unlock(hdev);
+}
+
+static void send_conn_param_neg_reply(struct hci_dev *hdev, u16 handle,
+ u8 reason)
+{
+ struct hci_cp_le_conn_param_req_neg_reply cp;
+
+ cp.handle = cpu_to_le16(handle);
+ cp.reason = reason;
+
+ hci_send_cmd(hdev, HCI_OP_LE_CONN_PARAM_REQ_NEG_REPLY, sizeof(cp),
+ &cp);
+}
+
+static void hci_le_remote_conn_param_req_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_le_remote_conn_param_req *ev = data;
+ struct hci_cp_le_conn_param_req_reply cp;
+ struct hci_conn *hcon;
+ u16 handle, min, max, latency, timeout;
+
+ bt_dev_dbg(hdev, "handle 0x%4.4x", __le16_to_cpu(ev->handle));
+
+ handle = le16_to_cpu(ev->handle);
+ min = le16_to_cpu(ev->interval_min);
+ max = le16_to_cpu(ev->interval_max);
+ latency = le16_to_cpu(ev->latency);
+ timeout = le16_to_cpu(ev->timeout);
+
+ hcon = hci_conn_hash_lookup_handle(hdev, handle);
+ if (!hcon || hcon->state != BT_CONNECTED)
+ return send_conn_param_neg_reply(hdev, handle,
+ HCI_ERROR_UNKNOWN_CONN_ID);
+
+ if (max > hcon->le_conn_max_interval)
+ return send_conn_param_neg_reply(hdev, handle,
+ HCI_ERROR_INVALID_LL_PARAMS);
+
+ if (hci_check_conn_params(min, max, latency, timeout))
+ return send_conn_param_neg_reply(hdev, handle,
+ HCI_ERROR_INVALID_LL_PARAMS);
+
+ if (hcon->role == HCI_ROLE_MASTER) {
+ struct hci_conn_params *params;
+ u8 store_hint;
+
+ hci_dev_lock(hdev);
+
+ params = hci_conn_params_lookup(hdev, &hcon->dst,
+ hcon->dst_type);
+ if (params) {
+ params->conn_min_interval = min;
+ params->conn_max_interval = max;
+ params->conn_latency = latency;
+ params->supervision_timeout = timeout;
+ store_hint = 0x01;
+ } else {
+ store_hint = 0x00;
+ }
+
+ hci_dev_unlock(hdev);
+
+ mgmt_new_conn_param(hdev, &hcon->dst, hcon->dst_type,
+ store_hint, min, max, latency, timeout);
+ }
+
+ cp.handle = ev->handle;
+ cp.interval_min = ev->interval_min;
+ cp.interval_max = ev->interval_max;
+ cp.latency = ev->latency;
+ cp.timeout = ev->timeout;
+ cp.min_ce_len = 0;
+ cp.max_ce_len = 0;
+
+ hci_send_cmd(hdev, HCI_OP_LE_CONN_PARAM_REQ_REPLY, sizeof(cp), &cp);
+}
+
+static void hci_le_direct_adv_report_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_le_direct_adv_report *ev = data;
+ u64 instant = jiffies;
+ int i;
+
+ if (!hci_le_ev_skb_pull(hdev, skb, HCI_EV_LE_DIRECT_ADV_REPORT,
+ flex_array_size(ev, info, ev->num)))
+ return;
+
+ if (!ev->num)
+ return;
+
+ hci_dev_lock(hdev);
+
+ for (i = 0; i < ev->num; i++) {
+ struct hci_ev_le_direct_adv_info *info = &ev->info[i];
+
+ process_adv_report(hdev, info->type, &info->bdaddr,
+ info->bdaddr_type, &info->direct_addr,
+ info->direct_addr_type, HCI_ADV_PHY_1M, 0,
+ info->rssi, NULL, 0, false, false, instant);
+ }
+
+ hci_dev_unlock(hdev);
+}
+
+static void hci_le_phy_update_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_le_phy_update_complete *ev = data;
+ struct hci_conn *conn;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", ev->status);
+
+ if (ev->status)
+ return;
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle));
+ if (!conn)
+ goto unlock;
+
+ conn->le_tx_phy = ev->tx_phy;
+ conn->le_rx_phy = ev->rx_phy;
+
+unlock:
+ hci_dev_unlock(hdev);
+}
+
+static void hci_le_cis_established_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_evt_le_cis_established *ev = data;
+ struct hci_conn *conn;
+ struct bt_iso_qos *qos;
+ bool pending = false;
+ u16 handle = __le16_to_cpu(ev->handle);
+ u32 c_sdu_interval, p_sdu_interval;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", ev->status);
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_handle(hdev, handle);
+ if (!conn) {
+ bt_dev_err(hdev,
+ "Unable to find connection with handle 0x%4.4x",
+ handle);
+ goto unlock;
+ }
+
+ if (conn->type != CIS_LINK) {
+ bt_dev_err(hdev,
+ "Invalid connection link type handle 0x%4.4x",
+ handle);
+ goto unlock;
+ }
+
+ qos = &conn->iso_qos;
+
+ pending = test_and_clear_bit(HCI_CONN_CREATE_CIS, &conn->flags);
+
+ /* BLUETOOTH CORE SPECIFICATION Version 5.4 | Vol 6, Part G
+ * page 3075:
+ * Transport_Latency_C_To_P = CIG_Sync_Delay + (FT_C_To_P) ×
+ * ISO_Interval + SDU_Interval_C_To_P
+ * ...
+ * SDU_Interval = (CIG_Sync_Delay + (FT) x ISO_Interval) -
+ * Transport_Latency
+ */
+ c_sdu_interval = (get_unaligned_le24(ev->cig_sync_delay) +
+ (ev->c_ft * le16_to_cpu(ev->interval) * 1250)) -
+ get_unaligned_le24(ev->c_latency);
+ p_sdu_interval = (get_unaligned_le24(ev->cig_sync_delay) +
+ (ev->p_ft * le16_to_cpu(ev->interval) * 1250)) -
+ get_unaligned_le24(ev->p_latency);
+
+ switch (conn->role) {
+ case HCI_ROLE_SLAVE:
+ qos->ucast.in.interval = c_sdu_interval;
+ qos->ucast.out.interval = p_sdu_interval;
+ /* Convert Transport Latency (us) to Latency (msec) */
+ qos->ucast.in.latency =
+ DIV_ROUND_CLOSEST(get_unaligned_le24(ev->c_latency),
+ 1000);
+ qos->ucast.out.latency =
+ DIV_ROUND_CLOSEST(get_unaligned_le24(ev->p_latency),
+ 1000);
+ qos->ucast.in.sdu = ev->c_bn ? le16_to_cpu(ev->c_mtu) : 0;
+ qos->ucast.out.sdu = ev->p_bn ? le16_to_cpu(ev->p_mtu) : 0;
+ qos->ucast.in.phy = ev->c_phy;
+ qos->ucast.out.phy = ev->p_phy;
+ break;
+ case HCI_ROLE_MASTER:
+ qos->ucast.in.interval = p_sdu_interval;
+ qos->ucast.out.interval = c_sdu_interval;
+ /* Convert Transport Latency (us) to Latency (msec) */
+ qos->ucast.out.latency =
+ DIV_ROUND_CLOSEST(get_unaligned_le24(ev->c_latency),
+ 1000);
+ qos->ucast.in.latency =
+ DIV_ROUND_CLOSEST(get_unaligned_le24(ev->p_latency),
+ 1000);
+ qos->ucast.out.sdu = ev->c_bn ? le16_to_cpu(ev->c_mtu) : 0;
+ qos->ucast.in.sdu = ev->p_bn ? le16_to_cpu(ev->p_mtu) : 0;
+ qos->ucast.out.phy = ev->c_phy;
+ qos->ucast.in.phy = ev->p_phy;
break;
}
- kfree_skb(skb);
- hdev->stat.evt_rx++;
+ if (!ev->status) {
+ conn->state = BT_CONNECTED;
+ hci_debugfs_create_conn(conn);
+ hci_conn_add_sysfs(conn);
+ hci_iso_setup_path(conn);
+ goto unlock;
+ }
+
+ conn->state = BT_CLOSED;
+ hci_connect_cfm(conn, ev->status);
+ hci_conn_del(conn);
+
+unlock:
+ if (pending)
+ hci_le_create_cis_pending(hdev);
+
+ hci_dev_unlock(hdev);
+}
+
+static void hci_le_reject_cis(struct hci_dev *hdev, __le16 handle)
+{
+ struct hci_cp_le_reject_cis cp;
+
+ memset(&cp, 0, sizeof(cp));
+ cp.handle = handle;
+ cp.reason = HCI_ERROR_REJ_BAD_ADDR;
+ hci_send_cmd(hdev, HCI_OP_LE_REJECT_CIS, sizeof(cp), &cp);
+}
+
+static void hci_le_accept_cis(struct hci_dev *hdev, __le16 handle)
+{
+ struct hci_cp_le_accept_cis cp;
+
+ memset(&cp, 0, sizeof(cp));
+ cp.handle = handle;
+ hci_send_cmd(hdev, HCI_OP_LE_ACCEPT_CIS, sizeof(cp), &cp);
+}
+
+static void hci_le_cis_req_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_evt_le_cis_req *ev = data;
+ u16 acl_handle, cis_handle;
+ struct hci_conn *acl, *cis;
+ int mask;
+ __u8 flags = 0;
+
+ acl_handle = __le16_to_cpu(ev->acl_handle);
+ cis_handle = __le16_to_cpu(ev->cis_handle);
+
+ bt_dev_dbg(hdev, "acl 0x%4.4x handle 0x%4.4x cig 0x%2.2x cis 0x%2.2x",
+ acl_handle, cis_handle, ev->cig_id, ev->cis_id);
+
+ hci_dev_lock(hdev);
+
+ acl = hci_conn_hash_lookup_handle(hdev, acl_handle);
+ if (!acl)
+ goto unlock;
+
+ mask = hci_proto_connect_ind(hdev, &acl->dst, CIS_LINK, &flags);
+ if (!(mask & HCI_LM_ACCEPT)) {
+ hci_le_reject_cis(hdev, ev->cis_handle);
+ goto unlock;
+ }
+
+ cis = hci_conn_hash_lookup_handle(hdev, cis_handle);
+ if (!cis) {
+ cis = hci_conn_add(hdev, CIS_LINK, &acl->dst, acl->dst_type,
+ HCI_ROLE_SLAVE, cis_handle);
+ if (IS_ERR(cis)) {
+ hci_le_reject_cis(hdev, ev->cis_handle);
+ goto unlock;
+ }
+ }
+
+ cis->iso_qos.ucast.cig = ev->cig_id;
+ cis->iso_qos.ucast.cis = ev->cis_id;
+
+ if (!(flags & HCI_PROTO_DEFER)) {
+ hci_le_accept_cis(hdev, ev->cis_handle);
+ } else {
+ cis->state = BT_CONNECT2;
+ hci_connect_cfm(cis, 0);
+ }
+
+unlock:
+ hci_dev_unlock(hdev);
}
-/* Generate internal stack event */
-void hci_si_event(struct hci_dev *hdev, int type, int dlen, void *data)
+static int hci_iso_term_big_sync(struct hci_dev *hdev, void *data)
{
+ u8 handle = PTR_UINT(data);
+
+ return hci_le_terminate_big_sync(hdev, handle,
+ HCI_ERROR_LOCAL_HOST_TERM);
+}
+
+static void hci_le_create_big_complete_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_evt_le_create_big_complete *ev = data;
+ struct hci_conn *conn;
+ __u8 i = 0;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, ev->status);
+
+ if (!hci_le_ev_skb_pull(hdev, skb, HCI_EVT_LE_CREATE_BIG_COMPLETE,
+ flex_array_size(ev, bis_handle, ev->num_bis)))
+ return;
+
+ hci_dev_lock(hdev);
+
+ /* Connect all BISes that are bound to the BIG */
+ while ((conn = hci_conn_hash_lookup_big_state(hdev, ev->handle,
+ BT_BOUND,
+ HCI_ROLE_MASTER))) {
+ if (ev->status) {
+ hci_connect_cfm(conn, ev->status);
+ hci_conn_del(conn);
+ continue;
+ }
+
+ if (hci_conn_set_handle(conn,
+ __le16_to_cpu(ev->bis_handle[i++])))
+ continue;
+
+ conn->state = BT_CONNECTED;
+ set_bit(HCI_CONN_BIG_CREATED, &conn->flags);
+ hci_debugfs_create_conn(conn);
+ hci_conn_add_sysfs(conn);
+ hci_iso_setup_path(conn);
+ }
+
+ if (!ev->status && !i)
+ /* If no BISes have been connected for the BIG,
+ * terminate. This is in case all bound connections
+ * have been closed before the BIG creation
+ * has completed.
+ */
+ hci_cmd_sync_queue(hdev, hci_iso_term_big_sync,
+ UINT_PTR(ev->handle), NULL);
+
+ hci_dev_unlock(hdev);
+}
+
+static void hci_le_big_sync_established_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_evt_le_big_sync_established *ev = data;
+ struct hci_conn *bis, *conn;
+ int i;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", ev->status);
+
+ if (!hci_le_ev_skb_pull(hdev, skb, HCI_EVT_LE_BIG_SYNC_ESTABLISHED,
+ flex_array_size(ev, bis, ev->num_bis)))
+ return;
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_big_sync_pend(hdev, ev->handle,
+ ev->num_bis);
+ if (!conn) {
+ bt_dev_err(hdev,
+ "Unable to find connection for big 0x%2.2x",
+ ev->handle);
+ goto unlock;
+ }
+
+ clear_bit(HCI_CONN_CREATE_BIG_SYNC, &conn->flags);
+
+ conn->num_bis = 0;
+ memset(conn->bis, 0, sizeof(conn->num_bis));
+
+ for (i = 0; i < ev->num_bis; i++) {
+ u16 handle = le16_to_cpu(ev->bis[i]);
+ __le32 interval;
+
+ bis = hci_conn_hash_lookup_handle(hdev, handle);
+ if (!bis) {
+ if (handle > HCI_CONN_HANDLE_MAX) {
+ bt_dev_dbg(hdev, "ignore too large handle %u", handle);
+ continue;
+ }
+ bis = hci_conn_add(hdev, BIS_LINK, BDADDR_ANY, 0,
+ HCI_ROLE_SLAVE, handle);
+ if (IS_ERR(bis))
+ continue;
+ }
+
+ if (ev->status != 0x42)
+ /* Mark PA sync as established */
+ set_bit(HCI_CONN_PA_SYNC, &bis->flags);
+
+ bis->sync_handle = conn->sync_handle;
+ bis->iso_qos.bcast.big = ev->handle;
+ memset(&interval, 0, sizeof(interval));
+ memcpy(&interval, ev->latency, sizeof(ev->latency));
+ bis->iso_qos.bcast.in.interval = le32_to_cpu(interval);
+ /* Convert ISO Interval (1.25 ms slots) to latency (ms) */
+ bis->iso_qos.bcast.in.latency = le16_to_cpu(ev->interval) * 125 / 100;
+ bis->iso_qos.bcast.in.sdu = le16_to_cpu(ev->max_pdu);
+
+ if (!ev->status) {
+ bis->state = BT_CONNECTED;
+ set_bit(HCI_CONN_BIG_SYNC, &bis->flags);
+ hci_debugfs_create_conn(bis);
+ hci_conn_add_sysfs(bis);
+ hci_iso_setup_path(bis);
+ }
+ }
+
+ /* In case BIG sync failed, notify each failed connection to
+ * the user after all hci connections have been added
+ */
+ if (ev->status)
+ for (i = 0; i < ev->num_bis; i++) {
+ u16 handle = le16_to_cpu(ev->bis[i]);
+
+ bis = hci_conn_hash_lookup_handle(hdev, handle);
+ if (!bis)
+ continue;
+
+ set_bit(HCI_CONN_BIG_SYNC_FAILED, &bis->flags);
+ hci_connect_cfm(bis, ev->status);
+ }
+
+unlock:
+ hci_dev_unlock(hdev);
+}
+
+static void hci_le_big_sync_lost_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_evt_le_big_sync_lost *ev = data;
+ struct hci_conn *bis;
+ bool mgmt_conn = false;
+
+ bt_dev_dbg(hdev, "big handle 0x%2.2x", ev->handle);
+
+ hci_dev_lock(hdev);
+
+ /* Delete each bis connection */
+ while ((bis = hci_conn_hash_lookup_big_state(hdev, ev->handle,
+ BT_CONNECTED,
+ HCI_ROLE_SLAVE))) {
+ if (!mgmt_conn) {
+ mgmt_conn = test_and_clear_bit(HCI_CONN_MGMT_CONNECTED,
+ &bis->flags);
+ mgmt_device_disconnected(hdev, &bis->dst, bis->type,
+ bis->dst_type, ev->reason,
+ mgmt_conn);
+ }
+
+ clear_bit(HCI_CONN_BIG_SYNC, &bis->flags);
+ hci_disconn_cfm(bis, ev->reason);
+ hci_conn_del(bis);
+ }
+
+ hci_dev_unlock(hdev);
+}
+
+static void hci_le_big_info_adv_report_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_evt_le_big_info_adv_report *ev = data;
+ int mask = hdev->link_mode;
+ __u8 flags = 0;
+ struct hci_conn *pa_sync;
+
+ bt_dev_dbg(hdev, "sync_handle 0x%4.4x", le16_to_cpu(ev->sync_handle));
+
+ hci_dev_lock(hdev);
+
+ mask |= hci_proto_connect_ind(hdev, BDADDR_ANY, BIS_LINK, &flags);
+ if (!(mask & HCI_LM_ACCEPT))
+ goto unlock;
+
+ if (!(flags & HCI_PROTO_DEFER))
+ goto unlock;
+
+ pa_sync = hci_conn_hash_lookup_pa_sync_handle
+ (hdev,
+ le16_to_cpu(ev->sync_handle));
+
+ if (!pa_sync)
+ goto unlock;
+
+ pa_sync->iso_qos.bcast.encryption = ev->encryption;
+
+ /* Notify iso layer */
+ hci_connect_cfm(pa_sync, 0);
+
+unlock:
+ hci_dev_unlock(hdev);
+}
+
+static void hci_le_read_all_remote_features_evt(struct hci_dev *hdev,
+ void *data, struct sk_buff *skb)
+{
+ struct hci_evt_le_read_all_remote_features_complete *ev = data;
+ struct hci_conn *conn;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", ev->status);
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle));
+ if (!conn)
+ goto unlock;
+
+ if (!ev->status)
+ memcpy(conn->le_features, ev->features, 248);
+
+ if (conn->state == BT_CONFIG) {
+ __u8 status;
+
+ /* If the local controller supports peripheral-initiated
+ * features exchange, but the remote controller does
+ * not, then it is possible that the error code 0x1a
+ * for unsupported remote feature gets returned.
+ *
+ * In this specific case, allow the connection to
+ * transition into connected state and mark it as
+ * successful.
+ */
+ if (!conn->out &&
+ ev->status == HCI_ERROR_UNSUPPORTED_REMOTE_FEATURE &&
+ (hdev->le_features[0] & HCI_LE_PERIPHERAL_FEATURES))
+ status = 0x00;
+ else
+ status = ev->status;
+
+ conn->state = BT_CONNECTED;
+ hci_connect_cfm(conn, status);
+ }
+
+unlock:
+ hci_dev_unlock(hdev);
+}
+
+#define HCI_LE_EV_VL(_op, _func, _min_len, _max_len) \
+[_op] = { \
+ .func = _func, \
+ .min_len = _min_len, \
+ .max_len = _max_len, \
+}
+
+#define HCI_LE_EV(_op, _func, _len) \
+ HCI_LE_EV_VL(_op, _func, _len, _len)
+
+#define HCI_LE_EV_STATUS(_op, _func) \
+ HCI_LE_EV(_op, _func, sizeof(struct hci_ev_status))
+
+/* Entries in this table shall have their position according to the subevent
+ * opcode they handle so the use of the macros above is recommend since it does
+ * attempt to initialize at its proper index using Designated Initializers that
+ * way events without a callback function can be omitted.
+ */
+static const struct hci_le_ev {
+ void (*func)(struct hci_dev *hdev, void *data, struct sk_buff *skb);
+ u16 min_len;
+ u16 max_len;
+} hci_le_ev_table[U8_MAX + 1] = {
+ /* [0x01 = HCI_EV_LE_CONN_COMPLETE] */
+ HCI_LE_EV(HCI_EV_LE_CONN_COMPLETE, hci_le_conn_complete_evt,
+ sizeof(struct hci_ev_le_conn_complete)),
+ /* [0x02 = HCI_EV_LE_ADVERTISING_REPORT] */
+ HCI_LE_EV_VL(HCI_EV_LE_ADVERTISING_REPORT, hci_le_adv_report_evt,
+ sizeof(struct hci_ev_le_advertising_report),
+ HCI_MAX_EVENT_SIZE),
+ /* [0x03 = HCI_EV_LE_CONN_UPDATE_COMPLETE] */
+ HCI_LE_EV(HCI_EV_LE_CONN_UPDATE_COMPLETE,
+ hci_le_conn_update_complete_evt,
+ sizeof(struct hci_ev_le_conn_update_complete)),
+ /* [0x04 = HCI_EV_LE_REMOTE_FEAT_COMPLETE] */
+ HCI_LE_EV(HCI_EV_LE_REMOTE_FEAT_COMPLETE,
+ hci_le_remote_feat_complete_evt,
+ sizeof(struct hci_ev_le_remote_feat_complete)),
+ /* [0x05 = HCI_EV_LE_LTK_REQ] */
+ HCI_LE_EV(HCI_EV_LE_LTK_REQ, hci_le_ltk_request_evt,
+ sizeof(struct hci_ev_le_ltk_req)),
+ /* [0x06 = HCI_EV_LE_REMOTE_CONN_PARAM_REQ] */
+ HCI_LE_EV(HCI_EV_LE_REMOTE_CONN_PARAM_REQ,
+ hci_le_remote_conn_param_req_evt,
+ sizeof(struct hci_ev_le_remote_conn_param_req)),
+ /* [0x0a = HCI_EV_LE_ENHANCED_CONN_COMPLETE] */
+ HCI_LE_EV(HCI_EV_LE_ENHANCED_CONN_COMPLETE,
+ hci_le_enh_conn_complete_evt,
+ sizeof(struct hci_ev_le_enh_conn_complete)),
+ /* [0x0b = HCI_EV_LE_DIRECT_ADV_REPORT] */
+ HCI_LE_EV_VL(HCI_EV_LE_DIRECT_ADV_REPORT, hci_le_direct_adv_report_evt,
+ sizeof(struct hci_ev_le_direct_adv_report),
+ HCI_MAX_EVENT_SIZE),
+ /* [0x0c = HCI_EV_LE_PHY_UPDATE_COMPLETE] */
+ HCI_LE_EV(HCI_EV_LE_PHY_UPDATE_COMPLETE, hci_le_phy_update_evt,
+ sizeof(struct hci_ev_le_phy_update_complete)),
+ /* [0x0d = HCI_EV_LE_EXT_ADV_REPORT] */
+ HCI_LE_EV_VL(HCI_EV_LE_EXT_ADV_REPORT, hci_le_ext_adv_report_evt,
+ sizeof(struct hci_ev_le_ext_adv_report),
+ HCI_MAX_EVENT_SIZE),
+ /* [0x0e = HCI_EV_LE_PA_SYNC_ESTABLISHED] */
+ HCI_LE_EV(HCI_EV_LE_PA_SYNC_ESTABLISHED,
+ hci_le_pa_sync_established_evt,
+ sizeof(struct hci_ev_le_pa_sync_established)),
+ /* [0x0f = HCI_EV_LE_PER_ADV_REPORT] */
+ HCI_LE_EV_VL(HCI_EV_LE_PER_ADV_REPORT,
+ hci_le_per_adv_report_evt,
+ sizeof(struct hci_ev_le_per_adv_report),
+ HCI_MAX_EVENT_SIZE),
+ /* [0x10 = HCI_EV_LE_PA_SYNC_LOST] */
+ HCI_LE_EV(HCI_EV_LE_PA_SYNC_LOST, hci_le_pa_sync_lost_evt,
+ sizeof(struct hci_ev_le_pa_sync_lost)),
+ /* [0x12 = HCI_EV_LE_EXT_ADV_SET_TERM] */
+ HCI_LE_EV(HCI_EV_LE_EXT_ADV_SET_TERM, hci_le_ext_adv_term_evt,
+ sizeof(struct hci_evt_le_ext_adv_set_term)),
+ /* [0x18 = HCI_EVT_LE_PAST_RECEIVED] */
+ HCI_LE_EV(HCI_EV_LE_PAST_RECEIVED,
+ hci_le_past_received_evt,
+ sizeof(struct hci_ev_le_past_received)),
+ /* [0x19 = HCI_EVT_LE_CIS_ESTABLISHED] */
+ HCI_LE_EV(HCI_EVT_LE_CIS_ESTABLISHED, hci_le_cis_established_evt,
+ sizeof(struct hci_evt_le_cis_established)),
+ /* [0x1a = HCI_EVT_LE_CIS_REQ] */
+ HCI_LE_EV(HCI_EVT_LE_CIS_REQ, hci_le_cis_req_evt,
+ sizeof(struct hci_evt_le_cis_req)),
+ /* [0x1b = HCI_EVT_LE_CREATE_BIG_COMPLETE] */
+ HCI_LE_EV_VL(HCI_EVT_LE_CREATE_BIG_COMPLETE,
+ hci_le_create_big_complete_evt,
+ sizeof(struct hci_evt_le_create_big_complete),
+ HCI_MAX_EVENT_SIZE),
+ /* [0x1d = HCI_EV_LE_BIG_SYNC_ESTABLISHED] */
+ HCI_LE_EV_VL(HCI_EVT_LE_BIG_SYNC_ESTABLISHED,
+ hci_le_big_sync_established_evt,
+ sizeof(struct hci_evt_le_big_sync_established),
+ HCI_MAX_EVENT_SIZE),
+ /* [0x1e = HCI_EVT_LE_BIG_SYNC_LOST] */
+ HCI_LE_EV_VL(HCI_EVT_LE_BIG_SYNC_LOST,
+ hci_le_big_sync_lost_evt,
+ sizeof(struct hci_evt_le_big_sync_lost),
+ HCI_MAX_EVENT_SIZE),
+ /* [0x22 = HCI_EVT_LE_BIG_INFO_ADV_REPORT] */
+ HCI_LE_EV_VL(HCI_EVT_LE_BIG_INFO_ADV_REPORT,
+ hci_le_big_info_adv_report_evt,
+ sizeof(struct hci_evt_le_big_info_adv_report),
+ HCI_MAX_EVENT_SIZE),
+ /* [0x2b = HCI_EVT_LE_ALL_REMOTE_FEATURES_COMPLETE] */
+ HCI_LE_EV_VL(HCI_EVT_LE_ALL_REMOTE_FEATURES_COMPLETE,
+ hci_le_read_all_remote_features_evt,
+ sizeof(struct
+ hci_evt_le_read_all_remote_features_complete),
+ HCI_MAX_EVENT_SIZE),
+};
+
+static void hci_le_meta_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb, u16 *opcode, u8 *status,
+ hci_req_complete_t *req_complete,
+ hci_req_complete_skb_t *req_complete_skb)
+{
+ struct hci_ev_le_meta *ev = data;
+ const struct hci_le_ev *subev;
+
+ bt_dev_dbg(hdev, "subevent 0x%2.2x", ev->subevent);
+
+ /* Only match event if command OGF is for LE */
+ if (hdev->req_skb &&
+ (hci_opcode_ogf(hci_skb_opcode(hdev->req_skb)) == 0x08 ||
+ hci_skb_opcode(hdev->req_skb) == HCI_OP_NOP) &&
+ hci_skb_event(hdev->req_skb) == ev->subevent) {
+ *opcode = hci_skb_opcode(hdev->req_skb);
+ hci_req_cmd_complete(hdev, *opcode, 0x00, req_complete,
+ req_complete_skb);
+ }
+
+ subev = &hci_le_ev_table[ev->subevent];
+ if (!subev->func)
+ return;
+
+ if (skb->len < subev->min_len) {
+ bt_dev_err(hdev, "unexpected subevent 0x%2.2x length: %u < %u",
+ ev->subevent, skb->len, subev->min_len);
+ return;
+ }
+
+ /* Just warn if the length is over max_len size it still be
+ * possible to partially parse the event so leave to callback to
+ * decide if that is acceptable.
+ */
+ if (skb->len > subev->max_len)
+ bt_dev_warn(hdev, "unexpected subevent 0x%2.2x length: %u > %u",
+ ev->subevent, skb->len, subev->max_len);
+ data = hci_le_ev_skb_pull(hdev, skb, ev->subevent, subev->min_len);
+ if (!data)
+ return;
+
+ subev->func(hdev, data, skb);
+}
+
+static bool hci_get_cmd_complete(struct hci_dev *hdev, u16 opcode,
+ u8 event, struct sk_buff *skb)
+{
+ struct hci_ev_cmd_complete *ev;
struct hci_event_hdr *hdr;
- struct hci_ev_stack_internal *ev;
- struct sk_buff *skb;
- skb = bt_skb_alloc(HCI_EVENT_HDR_SIZE + sizeof(*ev) + dlen, GFP_ATOMIC);
if (!skb)
+ return false;
+
+ hdr = hci_ev_skb_pull(hdev, skb, event, sizeof(*hdr));
+ if (!hdr)
+ return false;
+
+ if (event) {
+ if (hdr->evt != event)
+ return false;
+ return true;
+ }
+
+ /* Check if request ended in Command Status - no way to retrieve
+ * any extra parameters in this case.
+ */
+ if (hdr->evt == HCI_EV_CMD_STATUS)
+ return false;
+
+ if (hdr->evt != HCI_EV_CMD_COMPLETE) {
+ bt_dev_err(hdev, "last event is not cmd complete (0x%2.2x)",
+ hdr->evt);
+ return false;
+ }
+
+ ev = hci_cc_skb_pull(hdev, skb, opcode, sizeof(*ev));
+ if (!ev)
+ return false;
+
+ if (opcode != __le16_to_cpu(ev->opcode)) {
+ BT_DBG("opcode doesn't match (0x%2.2x != 0x%2.2x)", opcode,
+ __le16_to_cpu(ev->opcode));
+ return false;
+ }
+
+ return true;
+}
+
+static void hci_store_wake_reason(struct hci_dev *hdev, u8 event,
+ struct sk_buff *skb)
+{
+ struct hci_ev_le_advertising_info *adv;
+ struct hci_ev_le_direct_adv_info *direct_adv;
+ struct hci_ev_le_ext_adv_info *ext_adv;
+ const struct hci_ev_conn_complete *conn_complete = (void *)skb->data;
+ const struct hci_ev_conn_request *conn_request = (void *)skb->data;
+
+ hci_dev_lock(hdev);
+
+ /* If we are currently suspended and this is the first BT event seen,
+ * save the wake reason associated with the event.
+ */
+ if (!hdev->suspended || hdev->wake_reason)
+ goto unlock;
+
+ /* Default to remote wake. Values for wake_reason are documented in the
+ * Bluez mgmt api docs.
+ */
+ hdev->wake_reason = MGMT_WAKE_REASON_REMOTE_WAKE;
+
+ /* Once configured for remote wakeup, we should only wake up for
+ * reconnections. It's useful to see which device is waking us up so
+ * keep track of the bdaddr of the connection event that woke us up.
+ */
+ if (event == HCI_EV_CONN_REQUEST) {
+ bacpy(&hdev->wake_addr, &conn_request->bdaddr);
+ hdev->wake_addr_type = BDADDR_BREDR;
+ } else if (event == HCI_EV_CONN_COMPLETE) {
+ bacpy(&hdev->wake_addr, &conn_complete->bdaddr);
+ hdev->wake_addr_type = BDADDR_BREDR;
+ } else if (event == HCI_EV_LE_META) {
+ struct hci_ev_le_meta *le_ev = (void *)skb->data;
+ u8 subevent = le_ev->subevent;
+ u8 *ptr = &skb->data[sizeof(*le_ev)];
+ u8 num_reports = *ptr;
+
+ if ((subevent == HCI_EV_LE_ADVERTISING_REPORT ||
+ subevent == HCI_EV_LE_DIRECT_ADV_REPORT ||
+ subevent == HCI_EV_LE_EXT_ADV_REPORT) &&
+ num_reports) {
+ adv = (void *)(ptr + 1);
+ direct_adv = (void *)(ptr + 1);
+ ext_adv = (void *)(ptr + 1);
+
+ switch (subevent) {
+ case HCI_EV_LE_ADVERTISING_REPORT:
+ bacpy(&hdev->wake_addr, &adv->bdaddr);
+ hdev->wake_addr_type = adv->bdaddr_type;
+ break;
+ case HCI_EV_LE_DIRECT_ADV_REPORT:
+ bacpy(&hdev->wake_addr, &direct_adv->bdaddr);
+ hdev->wake_addr_type = direct_adv->bdaddr_type;
+ break;
+ case HCI_EV_LE_EXT_ADV_REPORT:
+ bacpy(&hdev->wake_addr, &ext_adv->bdaddr);
+ hdev->wake_addr_type = ext_adv->bdaddr_type;
+ break;
+ }
+ }
+ } else {
+ hdev->wake_reason = MGMT_WAKE_REASON_UNEXPECTED;
+ }
+
+unlock:
+ hci_dev_unlock(hdev);
+}
+
+#define HCI_EV_VL(_op, _func, _min_len, _max_len) \
+[_op] = { \
+ .req = false, \
+ .func = _func, \
+ .min_len = _min_len, \
+ .max_len = _max_len, \
+}
+
+#define HCI_EV(_op, _func, _len) \
+ HCI_EV_VL(_op, _func, _len, _len)
+
+#define HCI_EV_STATUS(_op, _func) \
+ HCI_EV(_op, _func, sizeof(struct hci_ev_status))
+
+#define HCI_EV_REQ_VL(_op, _func, _min_len, _max_len) \
+[_op] = { \
+ .req = true, \
+ .func_req = _func, \
+ .min_len = _min_len, \
+ .max_len = _max_len, \
+}
+
+#define HCI_EV_REQ(_op, _func, _len) \
+ HCI_EV_REQ_VL(_op, _func, _len, _len)
+
+/* Entries in this table shall have their position according to the event opcode
+ * they handle so the use of the macros above is recommend since it does attempt
+ * to initialize at its proper index using Designated Initializers that way
+ * events without a callback function don't have entered.
+ */
+static const struct hci_ev {
+ bool req;
+ union {
+ void (*func)(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb);
+ void (*func_req)(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb, u16 *opcode, u8 *status,
+ hci_req_complete_t *req_complete,
+ hci_req_complete_skb_t *req_complete_skb);
+ };
+ u16 min_len;
+ u16 max_len;
+} hci_ev_table[U8_MAX + 1] = {
+ /* [0x01 = HCI_EV_INQUIRY_COMPLETE] */
+ HCI_EV_STATUS(HCI_EV_INQUIRY_COMPLETE, hci_inquiry_complete_evt),
+ /* [0x02 = HCI_EV_INQUIRY_RESULT] */
+ HCI_EV_VL(HCI_EV_INQUIRY_RESULT, hci_inquiry_result_evt,
+ sizeof(struct hci_ev_inquiry_result), HCI_MAX_EVENT_SIZE),
+ /* [0x03 = HCI_EV_CONN_COMPLETE] */
+ HCI_EV(HCI_EV_CONN_COMPLETE, hci_conn_complete_evt,
+ sizeof(struct hci_ev_conn_complete)),
+ /* [0x04 = HCI_EV_CONN_REQUEST] */
+ HCI_EV(HCI_EV_CONN_REQUEST, hci_conn_request_evt,
+ sizeof(struct hci_ev_conn_request)),
+ /* [0x05 = HCI_EV_DISCONN_COMPLETE] */
+ HCI_EV(HCI_EV_DISCONN_COMPLETE, hci_disconn_complete_evt,
+ sizeof(struct hci_ev_disconn_complete)),
+ /* [0x06 = HCI_EV_AUTH_COMPLETE] */
+ HCI_EV(HCI_EV_AUTH_COMPLETE, hci_auth_complete_evt,
+ sizeof(struct hci_ev_auth_complete)),
+ /* [0x07 = HCI_EV_REMOTE_NAME] */
+ HCI_EV(HCI_EV_REMOTE_NAME, hci_remote_name_evt,
+ sizeof(struct hci_ev_remote_name)),
+ /* [0x08 = HCI_EV_ENCRYPT_CHANGE] */
+ HCI_EV(HCI_EV_ENCRYPT_CHANGE, hci_encrypt_change_evt,
+ sizeof(struct hci_ev_encrypt_change)),
+ /* [0x09 = HCI_EV_CHANGE_LINK_KEY_COMPLETE] */
+ HCI_EV(HCI_EV_CHANGE_LINK_KEY_COMPLETE,
+ hci_change_link_key_complete_evt,
+ sizeof(struct hci_ev_change_link_key_complete)),
+ /* [0x0b = HCI_EV_REMOTE_FEATURES] */
+ HCI_EV(HCI_EV_REMOTE_FEATURES, hci_remote_features_evt,
+ sizeof(struct hci_ev_remote_features)),
+ /* [0x0e = HCI_EV_CMD_COMPLETE] */
+ HCI_EV_REQ_VL(HCI_EV_CMD_COMPLETE, hci_cmd_complete_evt,
+ sizeof(struct hci_ev_cmd_complete), HCI_MAX_EVENT_SIZE),
+ /* [0x0f = HCI_EV_CMD_STATUS] */
+ HCI_EV_REQ(HCI_EV_CMD_STATUS, hci_cmd_status_evt,
+ sizeof(struct hci_ev_cmd_status)),
+ /* [0x10 = HCI_EV_CMD_STATUS] */
+ HCI_EV(HCI_EV_HARDWARE_ERROR, hci_hardware_error_evt,
+ sizeof(struct hci_ev_hardware_error)),
+ /* [0x12 = HCI_EV_ROLE_CHANGE] */
+ HCI_EV(HCI_EV_ROLE_CHANGE, hci_role_change_evt,
+ sizeof(struct hci_ev_role_change)),
+ /* [0x13 = HCI_EV_NUM_COMP_PKTS] */
+ HCI_EV_VL(HCI_EV_NUM_COMP_PKTS, hci_num_comp_pkts_evt,
+ sizeof(struct hci_ev_num_comp_pkts), HCI_MAX_EVENT_SIZE),
+ /* [0x14 = HCI_EV_MODE_CHANGE] */
+ HCI_EV(HCI_EV_MODE_CHANGE, hci_mode_change_evt,
+ sizeof(struct hci_ev_mode_change)),
+ /* [0x16 = HCI_EV_PIN_CODE_REQ] */
+ HCI_EV(HCI_EV_PIN_CODE_REQ, hci_pin_code_request_evt,
+ sizeof(struct hci_ev_pin_code_req)),
+ /* [0x17 = HCI_EV_LINK_KEY_REQ] */
+ HCI_EV(HCI_EV_LINK_KEY_REQ, hci_link_key_request_evt,
+ sizeof(struct hci_ev_link_key_req)),
+ /* [0x18 = HCI_EV_LINK_KEY_NOTIFY] */
+ HCI_EV(HCI_EV_LINK_KEY_NOTIFY, hci_link_key_notify_evt,
+ sizeof(struct hci_ev_link_key_notify)),
+ /* [0x1c = HCI_EV_CLOCK_OFFSET] */
+ HCI_EV(HCI_EV_CLOCK_OFFSET, hci_clock_offset_evt,
+ sizeof(struct hci_ev_clock_offset)),
+ /* [0x1d = HCI_EV_PKT_TYPE_CHANGE] */
+ HCI_EV(HCI_EV_PKT_TYPE_CHANGE, hci_pkt_type_change_evt,
+ sizeof(struct hci_ev_pkt_type_change)),
+ /* [0x20 = HCI_EV_PSCAN_REP_MODE] */
+ HCI_EV(HCI_EV_PSCAN_REP_MODE, hci_pscan_rep_mode_evt,
+ sizeof(struct hci_ev_pscan_rep_mode)),
+ /* [0x22 = HCI_EV_INQUIRY_RESULT_WITH_RSSI] */
+ HCI_EV_VL(HCI_EV_INQUIRY_RESULT_WITH_RSSI,
+ hci_inquiry_result_with_rssi_evt,
+ sizeof(struct hci_ev_inquiry_result_rssi),
+ HCI_MAX_EVENT_SIZE),
+ /* [0x23 = HCI_EV_REMOTE_EXT_FEATURES] */
+ HCI_EV(HCI_EV_REMOTE_EXT_FEATURES, hci_remote_ext_features_evt,
+ sizeof(struct hci_ev_remote_ext_features)),
+ /* [0x2c = HCI_EV_SYNC_CONN_COMPLETE] */
+ HCI_EV(HCI_EV_SYNC_CONN_COMPLETE, hci_sync_conn_complete_evt,
+ sizeof(struct hci_ev_sync_conn_complete)),
+ /* [0x2f = HCI_EV_EXTENDED_INQUIRY_RESULT] */
+ HCI_EV_VL(HCI_EV_EXTENDED_INQUIRY_RESULT,
+ hci_extended_inquiry_result_evt,
+ sizeof(struct hci_ev_ext_inquiry_result), HCI_MAX_EVENT_SIZE),
+ /* [0x30 = HCI_EV_KEY_REFRESH_COMPLETE] */
+ HCI_EV(HCI_EV_KEY_REFRESH_COMPLETE, hci_key_refresh_complete_evt,
+ sizeof(struct hci_ev_key_refresh_complete)),
+ /* [0x31 = HCI_EV_IO_CAPA_REQUEST] */
+ HCI_EV(HCI_EV_IO_CAPA_REQUEST, hci_io_capa_request_evt,
+ sizeof(struct hci_ev_io_capa_request)),
+ /* [0x32 = HCI_EV_IO_CAPA_REPLY] */
+ HCI_EV(HCI_EV_IO_CAPA_REPLY, hci_io_capa_reply_evt,
+ sizeof(struct hci_ev_io_capa_reply)),
+ /* [0x33 = HCI_EV_USER_CONFIRM_REQUEST] */
+ HCI_EV(HCI_EV_USER_CONFIRM_REQUEST, hci_user_confirm_request_evt,
+ sizeof(struct hci_ev_user_confirm_req)),
+ /* [0x34 = HCI_EV_USER_PASSKEY_REQUEST] */
+ HCI_EV(HCI_EV_USER_PASSKEY_REQUEST, hci_user_passkey_request_evt,
+ sizeof(struct hci_ev_user_passkey_req)),
+ /* [0x35 = HCI_EV_REMOTE_OOB_DATA_REQUEST] */
+ HCI_EV(HCI_EV_REMOTE_OOB_DATA_REQUEST, hci_remote_oob_data_request_evt,
+ sizeof(struct hci_ev_remote_oob_data_request)),
+ /* [0x36 = HCI_EV_SIMPLE_PAIR_COMPLETE] */
+ HCI_EV(HCI_EV_SIMPLE_PAIR_COMPLETE, hci_simple_pair_complete_evt,
+ sizeof(struct hci_ev_simple_pair_complete)),
+ /* [0x3b = HCI_EV_USER_PASSKEY_NOTIFY] */
+ HCI_EV(HCI_EV_USER_PASSKEY_NOTIFY, hci_user_passkey_notify_evt,
+ sizeof(struct hci_ev_user_passkey_notify)),
+ /* [0x3c = HCI_EV_KEYPRESS_NOTIFY] */
+ HCI_EV(HCI_EV_KEYPRESS_NOTIFY, hci_keypress_notify_evt,
+ sizeof(struct hci_ev_keypress_notify)),
+ /* [0x3d = HCI_EV_REMOTE_HOST_FEATURES] */
+ HCI_EV(HCI_EV_REMOTE_HOST_FEATURES, hci_remote_host_features_evt,
+ sizeof(struct hci_ev_remote_host_features)),
+ /* [0x3e = HCI_EV_LE_META] */
+ HCI_EV_REQ_VL(HCI_EV_LE_META, hci_le_meta_evt,
+ sizeof(struct hci_ev_le_meta), HCI_MAX_EVENT_SIZE),
+ /* [0xff = HCI_EV_VENDOR] */
+ HCI_EV_VL(HCI_EV_VENDOR, msft_vendor_evt, 0, HCI_MAX_EVENT_SIZE),
+};
+
+static void hci_event_func(struct hci_dev *hdev, u8 event, struct sk_buff *skb,
+ u16 *opcode, u8 *status,
+ hci_req_complete_t *req_complete,
+ hci_req_complete_skb_t *req_complete_skb)
+{
+ const struct hci_ev *ev = &hci_ev_table[event];
+ void *data;
+
+ if (!ev->func)
return;
- hdr = (void *) skb_put(skb, HCI_EVENT_HDR_SIZE);
- hdr->evt = HCI_EV_STACK_INTERNAL;
- hdr->plen = sizeof(*ev) + dlen;
+ if (skb->len < ev->min_len) {
+ bt_dev_err(hdev, "unexpected event 0x%2.2x length: %u < %u",
+ event, skb->len, ev->min_len);
+ return;
+ }
- ev = (void *) skb_put(skb, sizeof(*ev) + dlen);
- ev->type = type;
- memcpy(ev->data, data, dlen);
+ /* Just warn if the length is over max_len size it still be
+ * possible to partially parse the event so leave to callback to
+ * decide if that is acceptable.
+ */
+ if (skb->len > ev->max_len)
+ bt_dev_warn_ratelimited(hdev,
+ "unexpected event 0x%2.2x length: %u > %u",
+ event, skb->len, ev->max_len);
+
+ data = hci_ev_skb_pull(hdev, skb, event, ev->min_len);
+ if (!data)
+ return;
- bt_cb(skb)->incoming = 1;
- __net_timestamp(skb);
+ if (ev->req)
+ ev->func_req(hdev, data, skb, opcode, status, req_complete,
+ req_complete_skb);
+ else
+ ev->func(hdev, data, skb);
+}
- bt_cb(skb)->pkt_type = HCI_EVENT_PKT;
- skb->dev = (void *) hdev;
- hci_send_to_sock(hdev, skb);
+void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct hci_event_hdr *hdr = (void *) skb->data;
+ hci_req_complete_t req_complete = NULL;
+ hci_req_complete_skb_t req_complete_skb = NULL;
+ struct sk_buff *orig_skb = NULL;
+ u8 status = 0, event, req_evt = 0;
+ u16 opcode = HCI_OP_NOP;
+
+ if (skb->len < sizeof(*hdr)) {
+ bt_dev_err(hdev, "Malformed HCI Event");
+ goto done;
+ }
+
+ hci_dev_lock(hdev);
+ kfree_skb(hdev->recv_event);
+ hdev->recv_event = skb_clone(skb, GFP_KERNEL);
+ hci_dev_unlock(hdev);
+
+ event = hdr->evt;
+ if (!event) {
+ bt_dev_warn(hdev, "Received unexpected HCI Event 0x%2.2x",
+ event);
+ goto done;
+ }
+
+ /* Only match event if command OGF is not for LE */
+ if (hdev->req_skb &&
+ hci_opcode_ogf(hci_skb_opcode(hdev->req_skb)) != 0x08 &&
+ hci_skb_event(hdev->req_skb) == event) {
+ hci_req_cmd_complete(hdev, hci_skb_opcode(hdev->req_skb),
+ status, &req_complete, &req_complete_skb);
+ req_evt = event;
+ }
+
+ /* If it looks like we might end up having to call
+ * req_complete_skb, store a pristine copy of the skb since the
+ * various handlers may modify the original one through
+ * skb_pull() calls, etc.
+ */
+ if (req_complete_skb || event == HCI_EV_CMD_STATUS ||
+ event == HCI_EV_CMD_COMPLETE)
+ orig_skb = skb_clone(skb, GFP_KERNEL);
+
+ skb_pull(skb, HCI_EVENT_HDR_SIZE);
+
+ /* Store wake reason if we're suspended */
+ hci_store_wake_reason(hdev, event, skb);
+
+ bt_dev_dbg(hdev, "event 0x%2.2x", event);
+
+ hci_event_func(hdev, event, skb, &opcode, &status, &req_complete,
+ &req_complete_skb);
+
+ if (req_complete) {
+ req_complete(hdev, status, opcode);
+ } else if (req_complete_skb) {
+ if (!hci_get_cmd_complete(hdev, opcode, req_evt, orig_skb)) {
+ kfree_skb(orig_skb);
+ orig_skb = NULL;
+ }
+ req_complete_skb(hdev, status, opcode, orig_skb);
+ }
+
+done:
+ kfree_skb(orig_skb);
kfree_skb(skb);
+ hdev->stat.evt_rx++;
}
diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c
index 711a085eca5b..4e7bf63af9c5 100644
--- a/net/bluetooth/hci_sock.c
+++ b/net/bluetooth/hci_sock.c
@@ -1,4 +1,4 @@
-/*
+/*
BlueZ - Bluetooth protocol stack for Linux
Copyright (C) 2000-2001 Qualcomm Incorporated
@@ -12,57 +12,132 @@
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
- CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
- WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
- COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+ ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+ COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
SOFTWARE IS DISCLAIMED.
*/
/* Bluetooth HCI sockets. */
-
-#include <linux/module.h>
-
-#include <linux/types.h>
-#include <linux/capability.h>
-#include <linux/errno.h>
-#include <linux/kernel.h>
+#include <linux/compat.h>
+#include <linux/export.h>
+#include <linux/utsname.h>
#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/poll.h>
-#include <linux/fcntl.h>
-#include <linux/init.h>
-#include <linux/skbuff.h>
-#include <linux/workqueue.h>
-#include <linux/interrupt.h>
-#include <linux/socket.h>
-#include <linux/ioctl.h>
-#include <net/sock.h>
-
-#include <asm/system.h>
-#include <asm/uaccess.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include <net/bluetooth/bluetooth.h>
#include <net/bluetooth/hci_core.h>
+#include <net/bluetooth/hci_mon.h>
+#include <net/bluetooth/mgmt.h>
-#ifndef CONFIG_BT_HCI_SOCK_DEBUG
-#undef BT_DBG
-#define BT_DBG(D...)
-#endif
+#include "mgmt_util.h"
+
+static LIST_HEAD(mgmt_chan_list);
+static DEFINE_MUTEX(mgmt_chan_list_lock);
+
+static DEFINE_IDA(sock_cookie_ida);
+
+static atomic_t monitor_promisc = ATOMIC_INIT(0);
/* ----- HCI socket interface ----- */
-static inline int hci_test_bit(int nr, void *addr)
+/* Socket info */
+#define hci_pi(sk) ((struct hci_pinfo *) sk)
+
+struct hci_pinfo {
+ struct bt_sock bt;
+ struct hci_dev *hdev;
+ struct hci_filter filter;
+ __u8 cmsg_mask;
+ unsigned short channel;
+ unsigned long flags;
+ __u32 cookie;
+ char comm[TASK_COMM_LEN];
+ __u16 mtu;
+};
+
+static struct hci_dev *hci_hdev_from_sock(struct sock *sk)
+{
+ struct hci_dev *hdev = hci_pi(sk)->hdev;
+
+ if (!hdev)
+ return ERR_PTR(-EBADFD);
+ if (hci_dev_test_flag(hdev, HCI_UNREGISTER))
+ return ERR_PTR(-EPIPE);
+ return hdev;
+}
+
+void hci_sock_set_flag(struct sock *sk, int nr)
+{
+ set_bit(nr, &hci_pi(sk)->flags);
+}
+
+void hci_sock_clear_flag(struct sock *sk, int nr)
+{
+ clear_bit(nr, &hci_pi(sk)->flags);
+}
+
+int hci_sock_test_flag(struct sock *sk, int nr)
+{
+ return test_bit(nr, &hci_pi(sk)->flags);
+}
+
+unsigned short hci_sock_get_channel(struct sock *sk)
+{
+ return hci_pi(sk)->channel;
+}
+
+u32 hci_sock_get_cookie(struct sock *sk)
+{
+ return hci_pi(sk)->cookie;
+}
+
+static bool hci_sock_gen_cookie(struct sock *sk)
{
- return *((__u32 *) addr + (nr >> 5)) & ((__u32) 1 << (nr & 31));
+ int id = hci_pi(sk)->cookie;
+
+ if (!id) {
+ id = ida_alloc_min(&sock_cookie_ida, 1, GFP_KERNEL);
+ if (id < 0)
+ id = 0xffffffff;
+
+ hci_pi(sk)->cookie = id;
+ get_task_comm(hci_pi(sk)->comm, current);
+ return true;
+ }
+
+ return false;
+}
+
+static void hci_sock_free_cookie(struct sock *sk)
+{
+ int id = hci_pi(sk)->cookie;
+
+ if (id) {
+ hci_pi(sk)->cookie = 0;
+ ida_free(&sock_cookie_ida, id);
+ }
+}
+
+static inline int hci_test_bit(int nr, const void *addr)
+{
+ return *((const __u32 *) addr + (nr >> 5)) & ((__u32) 1 << (nr & 31));
}
/* Security filter */
-static struct hci_sec_filter hci_sec_filter = {
+#define HCI_SFLT_MAX_OGF 5
+
+struct hci_sec_filter {
+ __u32 type_mask;
+ __u32 event_mask[2];
+ __u32 ocf_mask[HCI_SFLT_MAX_OGF + 1][4];
+};
+
+static const struct hci_sec_filter hci_sec_filter = {
/* Packet types */
0x10,
/* Events */
@@ -71,33 +146,70 @@ static struct hci_sec_filter hci_sec_filter = {
{
{ 0x0 },
/* OGF_LINK_CTL */
- { 0xbe000006, 0x00000001, 0x000000, 0x00 },
+ { 0xbe000006, 0x00000001, 0x00000000, 0x00 },
/* OGF_LINK_POLICY */
- { 0x00005200, 0x00000000, 0x000000, 0x00 },
+ { 0x00005200, 0x00000000, 0x00000000, 0x00 },
/* OGF_HOST_CTL */
- { 0xaab00200, 0x2b402aaa, 0x020154, 0x00 },
+ { 0xaab00200, 0x2b402aaa, 0x05220154, 0x00 },
/* OGF_INFO_PARAM */
- { 0x000002be, 0x00000000, 0x000000, 0x00 },
+ { 0x000002be, 0x00000000, 0x00000000, 0x00 },
/* OGF_STATUS_PARAM */
- { 0x000000ea, 0x00000000, 0x000000, 0x00 }
+ { 0x000000ea, 0x00000000, 0x00000000, 0x00 }
}
};
static struct bt_sock_list hci_sk_list = {
- .lock = RW_LOCK_UNLOCKED
+ .lock = __RW_LOCK_UNLOCKED(hci_sk_list.lock)
};
+static bool is_filtered_packet(struct sock *sk, struct sk_buff *skb)
+{
+ struct hci_filter *flt;
+ int flt_type, flt_event;
+
+ /* Apply filter */
+ flt = &hci_pi(sk)->filter;
+
+ flt_type = hci_skb_pkt_type(skb) & HCI_FLT_TYPE_BITS;
+
+ if (!test_bit(flt_type, &flt->type_mask))
+ return true;
+
+ /* Extra filter for event packets only */
+ if (hci_skb_pkt_type(skb) != HCI_EVENT_PKT)
+ return false;
+
+ flt_event = (*(__u8 *)skb->data & HCI_FLT_EVENT_BITS);
+
+ if (!hci_test_bit(flt_event, &flt->event_mask))
+ return true;
+
+ /* Check filter only when opcode is set */
+ if (!flt->opcode)
+ return false;
+
+ if (flt_event == HCI_EV_CMD_COMPLETE &&
+ flt->opcode != get_unaligned((__le16 *)(skb->data + 3)))
+ return true;
+
+ if (flt_event == HCI_EV_CMD_STATUS &&
+ flt->opcode != get_unaligned((__le16 *)(skb->data + 4)))
+ return true;
+
+ return false;
+}
+
/* Send frame to RAW socket */
void hci_send_to_sock(struct hci_dev *hdev, struct sk_buff *skb)
{
struct sock *sk;
- struct hlist_node *node;
+ struct sk_buff *skb_copy = NULL;
BT_DBG("hdev %p len %d", hdev, skb->len);
read_lock(&hci_sk_list.lock);
- sk_for_each(sk, node, &hci_sk_list.head) {
- struct hci_filter *flt;
+
+ sk_for_each(sk, &hci_sk_list.head) {
struct sk_buff *nskb;
if (sk->sk_state != BT_BOUND || hci_pi(sk)->hdev != hdev)
@@ -107,121 +219,901 @@ void hci_send_to_sock(struct hci_dev *hdev, struct sk_buff *skb)
if (skb->sk == sk)
continue;
- /* Apply filter */
- flt = &hci_pi(sk)->filter;
-
- if (!test_bit((bt_cb(skb)->pkt_type == HCI_VENDOR_PKT) ?
- 0 : (bt_cb(skb)->pkt_type & HCI_FLT_TYPE_BITS), &flt->type_mask))
+ if (hci_pi(sk)->channel == HCI_CHANNEL_RAW) {
+ if (hci_skb_pkt_type(skb) != HCI_COMMAND_PKT &&
+ hci_skb_pkt_type(skb) != HCI_EVENT_PKT &&
+ hci_skb_pkt_type(skb) != HCI_ACLDATA_PKT &&
+ hci_skb_pkt_type(skb) != HCI_SCODATA_PKT &&
+ hci_skb_pkt_type(skb) != HCI_ISODATA_PKT)
+ continue;
+ if (is_filtered_packet(sk, skb))
+ continue;
+ } else if (hci_pi(sk)->channel == HCI_CHANNEL_USER) {
+ if (!bt_cb(skb)->incoming)
+ continue;
+ if (hci_skb_pkt_type(skb) != HCI_EVENT_PKT &&
+ hci_skb_pkt_type(skb) != HCI_ACLDATA_PKT &&
+ hci_skb_pkt_type(skb) != HCI_SCODATA_PKT &&
+ hci_skb_pkt_type(skb) != HCI_ISODATA_PKT &&
+ hci_skb_pkt_type(skb) != HCI_DRV_PKT)
+ continue;
+ } else {
+ /* Don't send frame to other channel types */
continue;
+ }
- if (bt_cb(skb)->pkt_type == HCI_EVENT_PKT) {
- register int evt = (*(__u8 *)skb->data & HCI_FLT_EVENT_BITS);
-
- if (!hci_test_bit(evt, &flt->event_mask))
+ if (!skb_copy) {
+ /* Create a private copy with headroom */
+ skb_copy = __pskb_copy_fclone(skb, 1, GFP_ATOMIC, true);
+ if (!skb_copy)
continue;
- if (flt->opcode &&
- ((evt == HCI_EV_CMD_COMPLETE &&
- flt->opcode !=
- get_unaligned((__u16 *)(skb->data + 3))) ||
- (evt == HCI_EV_CMD_STATUS &&
- flt->opcode !=
- get_unaligned((__u16 *)(skb->data + 4)))))
- continue;
+ /* Put type byte before the data */
+ memcpy(skb_push(skb_copy, 1), &hci_skb_pkt_type(skb), 1);
}
- if (!(nskb = skb_clone(skb, GFP_ATOMIC)))
+ nskb = skb_clone(skb_copy, GFP_ATOMIC);
+ if (!nskb)
continue;
- /* Put type byte before the data */
- memcpy(skb_push(nskb, 1), &bt_cb(nskb)->pkt_type, 1);
+ if (sock_queue_rcv_skb(sk, nskb))
+ kfree_skb(nskb);
+ }
+
+ read_unlock(&hci_sk_list.lock);
+
+ kfree_skb(skb_copy);
+}
+
+static void hci_sock_copy_creds(struct sock *sk, struct sk_buff *skb)
+{
+ struct scm_creds *creds;
+
+ if (!sk || WARN_ON(!skb))
+ return;
+
+ creds = &bt_cb(skb)->creds;
+
+ /* Check if peer credentials is set */
+ if (!sk->sk_peer_pid) {
+ /* Check if parent peer credentials is set */
+ if (bt_sk(sk)->parent && bt_sk(sk)->parent->sk_peer_pid)
+ sk = bt_sk(sk)->parent;
+ else
+ return;
+ }
+
+ /* Check if scm_creds already set */
+ if (creds->pid == pid_vnr(sk->sk_peer_pid))
+ return;
+
+ memset(creds, 0, sizeof(*creds));
+
+ creds->pid = pid_vnr(sk->sk_peer_pid);
+ if (sk->sk_peer_cred) {
+ creds->uid = sk->sk_peer_cred->uid;
+ creds->gid = sk->sk_peer_cred->gid;
+ }
+}
+
+static struct sk_buff *hci_skb_clone(struct sk_buff *skb)
+{
+ struct sk_buff *nskb;
+
+ if (!skb)
+ return NULL;
+
+ nskb = skb_clone(skb, GFP_ATOMIC);
+ if (!nskb)
+ return NULL;
+
+ hci_sock_copy_creds(skb->sk, nskb);
+
+ return nskb;
+}
+
+/* Send frame to sockets with specific channel */
+static void __hci_send_to_channel(unsigned short channel, struct sk_buff *skb,
+ int flag, struct sock *skip_sk)
+{
+ struct sock *sk;
+
+ BT_DBG("channel %u len %d", channel, skb->len);
+
+ sk_for_each(sk, &hci_sk_list.head) {
+ struct sk_buff *nskb;
+
+ /* Ignore socket without the flag set */
+ if (!hci_sock_test_flag(sk, flag))
+ continue;
+
+ /* Skip the original socket */
+ if (sk == skip_sk)
+ continue;
+
+ if (sk->sk_state != BT_BOUND)
+ continue;
+
+ if (hci_pi(sk)->channel != channel)
+ continue;
+
+ nskb = hci_skb_clone(skb);
+ if (!nskb)
+ continue;
if (sock_queue_rcv_skb(sk, nskb))
kfree_skb(nskb);
}
+
+}
+
+void hci_send_to_channel(unsigned short channel, struct sk_buff *skb,
+ int flag, struct sock *skip_sk)
+{
+ read_lock(&hci_sk_list.lock);
+ __hci_send_to_channel(channel, skb, flag, skip_sk);
+ read_unlock(&hci_sk_list.lock);
+}
+
+/* Send frame to monitor socket */
+void hci_send_to_monitor(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct sk_buff *skb_copy = NULL;
+ struct hci_mon_hdr *hdr;
+ __le16 opcode;
+
+ if (!atomic_read(&monitor_promisc))
+ return;
+
+ BT_DBG("hdev %p len %d", hdev, skb->len);
+
+ switch (hci_skb_pkt_type(skb)) {
+ case HCI_COMMAND_PKT:
+ opcode = cpu_to_le16(HCI_MON_COMMAND_PKT);
+ break;
+ case HCI_EVENT_PKT:
+ opcode = cpu_to_le16(HCI_MON_EVENT_PKT);
+ break;
+ case HCI_ACLDATA_PKT:
+ if (bt_cb(skb)->incoming)
+ opcode = cpu_to_le16(HCI_MON_ACL_RX_PKT);
+ else
+ opcode = cpu_to_le16(HCI_MON_ACL_TX_PKT);
+ break;
+ case HCI_SCODATA_PKT:
+ if (bt_cb(skb)->incoming)
+ opcode = cpu_to_le16(HCI_MON_SCO_RX_PKT);
+ else
+ opcode = cpu_to_le16(HCI_MON_SCO_TX_PKT);
+ break;
+ case HCI_ISODATA_PKT:
+ if (bt_cb(skb)->incoming)
+ opcode = cpu_to_le16(HCI_MON_ISO_RX_PKT);
+ else
+ opcode = cpu_to_le16(HCI_MON_ISO_TX_PKT);
+ break;
+ case HCI_DRV_PKT:
+ if (bt_cb(skb)->incoming)
+ opcode = cpu_to_le16(HCI_MON_DRV_RX_PKT);
+ else
+ opcode = cpu_to_le16(HCI_MON_DRV_TX_PKT);
+ break;
+ case HCI_DIAG_PKT:
+ opcode = cpu_to_le16(HCI_MON_VENDOR_DIAG);
+ break;
+ default:
+ return;
+ }
+
+ /* Create a private copy with headroom */
+ skb_copy = __pskb_copy_fclone(skb, HCI_MON_HDR_SIZE, GFP_ATOMIC, true);
+ if (!skb_copy)
+ return;
+
+ hci_sock_copy_creds(skb->sk, skb_copy);
+
+ /* Put header before the data */
+ hdr = skb_push(skb_copy, HCI_MON_HDR_SIZE);
+ hdr->opcode = opcode;
+ hdr->index = cpu_to_le16(hdev->id);
+ hdr->len = cpu_to_le16(skb->len);
+
+ hci_send_to_channel(HCI_CHANNEL_MONITOR, skb_copy,
+ HCI_SOCK_TRUSTED, NULL);
+ kfree_skb(skb_copy);
+}
+
+void hci_send_monitor_ctrl_event(struct hci_dev *hdev, u16 event,
+ void *data, u16 data_len, ktime_t tstamp,
+ int flag, struct sock *skip_sk)
+{
+ struct sock *sk;
+ __le16 index;
+
+ if (hdev)
+ index = cpu_to_le16(hdev->id);
+ else
+ index = cpu_to_le16(MGMT_INDEX_NONE);
+
+ read_lock(&hci_sk_list.lock);
+
+ sk_for_each(sk, &hci_sk_list.head) {
+ struct hci_mon_hdr *hdr;
+ struct sk_buff *skb;
+
+ if (hci_pi(sk)->channel != HCI_CHANNEL_CONTROL)
+ continue;
+
+ /* Ignore socket without the flag set */
+ if (!hci_sock_test_flag(sk, flag))
+ continue;
+
+ /* Skip the original socket */
+ if (sk == skip_sk)
+ continue;
+
+ skb = bt_skb_alloc(6 + data_len, GFP_ATOMIC);
+ if (!skb)
+ continue;
+
+ put_unaligned_le32(hci_pi(sk)->cookie, skb_put(skb, 4));
+ put_unaligned_le16(event, skb_put(skb, 2));
+
+ if (data)
+ skb_put_data(skb, data, data_len);
+
+ skb->tstamp = tstamp;
+
+ hdr = skb_push(skb, HCI_MON_HDR_SIZE);
+ hdr->opcode = cpu_to_le16(HCI_MON_CTRL_EVENT);
+ hdr->index = index;
+ hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE);
+
+ __hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
+ HCI_SOCK_TRUSTED, NULL);
+ kfree_skb(skb);
+ }
+
+ read_unlock(&hci_sk_list.lock);
+}
+
+static struct sk_buff *create_monitor_event(struct hci_dev *hdev, int event)
+{
+ struct hci_mon_hdr *hdr;
+ struct hci_mon_new_index *ni;
+ struct hci_mon_index_info *ii;
+ struct sk_buff *skb;
+ __le16 opcode;
+
+ switch (event) {
+ case HCI_DEV_REG:
+ skb = bt_skb_alloc(HCI_MON_NEW_INDEX_SIZE, GFP_ATOMIC);
+ if (!skb)
+ return NULL;
+
+ ni = skb_put(skb, HCI_MON_NEW_INDEX_SIZE);
+ ni->type = 0x00; /* Old hdev->dev_type */
+ ni->bus = hdev->bus;
+ bacpy(&ni->bdaddr, &hdev->bdaddr);
+ memcpy_and_pad(ni->name, sizeof(ni->name), hdev->name,
+ strnlen(hdev->name, sizeof(ni->name)), '\0');
+
+ opcode = cpu_to_le16(HCI_MON_NEW_INDEX);
+ break;
+
+ case HCI_DEV_UNREG:
+ skb = bt_skb_alloc(0, GFP_ATOMIC);
+ if (!skb)
+ return NULL;
+
+ opcode = cpu_to_le16(HCI_MON_DEL_INDEX);
+ break;
+
+ case HCI_DEV_SETUP:
+ if (hdev->manufacturer == 0xffff)
+ return NULL;
+ fallthrough;
+
+ case HCI_DEV_UP:
+ skb = bt_skb_alloc(HCI_MON_INDEX_INFO_SIZE, GFP_ATOMIC);
+ if (!skb)
+ return NULL;
+
+ ii = skb_put(skb, HCI_MON_INDEX_INFO_SIZE);
+ bacpy(&ii->bdaddr, &hdev->bdaddr);
+ ii->manufacturer = cpu_to_le16(hdev->manufacturer);
+
+ opcode = cpu_to_le16(HCI_MON_INDEX_INFO);
+ break;
+
+ case HCI_DEV_OPEN:
+ skb = bt_skb_alloc(0, GFP_ATOMIC);
+ if (!skb)
+ return NULL;
+
+ opcode = cpu_to_le16(HCI_MON_OPEN_INDEX);
+ break;
+
+ case HCI_DEV_CLOSE:
+ skb = bt_skb_alloc(0, GFP_ATOMIC);
+ if (!skb)
+ return NULL;
+
+ opcode = cpu_to_le16(HCI_MON_CLOSE_INDEX);
+ break;
+
+ default:
+ return NULL;
+ }
+
+ __net_timestamp(skb);
+
+ hdr = skb_push(skb, HCI_MON_HDR_SIZE);
+ hdr->opcode = opcode;
+ hdr->index = cpu_to_le16(hdev->id);
+ hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE);
+
+ return skb;
+}
+
+static struct sk_buff *create_monitor_ctrl_open(struct sock *sk)
+{
+ struct hci_mon_hdr *hdr;
+ struct sk_buff *skb;
+ u16 format;
+ u8 ver[3];
+ u32 flags;
+
+ /* No message needed when cookie is not present */
+ if (!hci_pi(sk)->cookie)
+ return NULL;
+
+ switch (hci_pi(sk)->channel) {
+ case HCI_CHANNEL_RAW:
+ format = 0x0000;
+ ver[0] = BT_SUBSYS_VERSION;
+ put_unaligned_le16(BT_SUBSYS_REVISION, ver + 1);
+ break;
+ case HCI_CHANNEL_USER:
+ format = 0x0001;
+ ver[0] = BT_SUBSYS_VERSION;
+ put_unaligned_le16(BT_SUBSYS_REVISION, ver + 1);
+ break;
+ case HCI_CHANNEL_CONTROL:
+ format = 0x0002;
+ mgmt_fill_version_info(ver);
+ break;
+ default:
+ /* No message for unsupported format */
+ return NULL;
+ }
+
+ skb = bt_skb_alloc(14 + TASK_COMM_LEN, GFP_ATOMIC);
+ if (!skb)
+ return NULL;
+
+ hci_sock_copy_creds(sk, skb);
+
+ flags = hci_sock_test_flag(sk, HCI_SOCK_TRUSTED) ? 0x1 : 0x0;
+
+ put_unaligned_le32(hci_pi(sk)->cookie, skb_put(skb, 4));
+ put_unaligned_le16(format, skb_put(skb, 2));
+ skb_put_data(skb, ver, sizeof(ver));
+ put_unaligned_le32(flags, skb_put(skb, 4));
+ skb_put_u8(skb, TASK_COMM_LEN);
+ skb_put_data(skb, hci_pi(sk)->comm, TASK_COMM_LEN);
+
+ __net_timestamp(skb);
+
+ hdr = skb_push(skb, HCI_MON_HDR_SIZE);
+ hdr->opcode = cpu_to_le16(HCI_MON_CTRL_OPEN);
+ if (hci_pi(sk)->hdev)
+ hdr->index = cpu_to_le16(hci_pi(sk)->hdev->id);
+ else
+ hdr->index = cpu_to_le16(HCI_DEV_NONE);
+ hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE);
+
+ return skb;
+}
+
+static struct sk_buff *create_monitor_ctrl_close(struct sock *sk)
+{
+ struct hci_mon_hdr *hdr;
+ struct sk_buff *skb;
+
+ /* No message needed when cookie is not present */
+ if (!hci_pi(sk)->cookie)
+ return NULL;
+
+ switch (hci_pi(sk)->channel) {
+ case HCI_CHANNEL_RAW:
+ case HCI_CHANNEL_USER:
+ case HCI_CHANNEL_CONTROL:
+ break;
+ default:
+ /* No message for unsupported format */
+ return NULL;
+ }
+
+ skb = bt_skb_alloc(4, GFP_ATOMIC);
+ if (!skb)
+ return NULL;
+
+ hci_sock_copy_creds(sk, skb);
+
+ put_unaligned_le32(hci_pi(sk)->cookie, skb_put(skb, 4));
+
+ __net_timestamp(skb);
+
+ hdr = skb_push(skb, HCI_MON_HDR_SIZE);
+ hdr->opcode = cpu_to_le16(HCI_MON_CTRL_CLOSE);
+ if (hci_pi(sk)->hdev)
+ hdr->index = cpu_to_le16(hci_pi(sk)->hdev->id);
+ else
+ hdr->index = cpu_to_le16(HCI_DEV_NONE);
+ hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE);
+
+ return skb;
+}
+
+static struct sk_buff *create_monitor_ctrl_command(struct sock *sk, u16 index,
+ u16 opcode, u16 len,
+ const void *buf)
+{
+ struct hci_mon_hdr *hdr;
+ struct sk_buff *skb;
+
+ skb = bt_skb_alloc(6 + len, GFP_ATOMIC);
+ if (!skb)
+ return NULL;
+
+ hci_sock_copy_creds(sk, skb);
+
+ put_unaligned_le32(hci_pi(sk)->cookie, skb_put(skb, 4));
+ put_unaligned_le16(opcode, skb_put(skb, 2));
+
+ if (buf)
+ skb_put_data(skb, buf, len);
+
+ __net_timestamp(skb);
+
+ hdr = skb_push(skb, HCI_MON_HDR_SIZE);
+ hdr->opcode = cpu_to_le16(HCI_MON_CTRL_COMMAND);
+ hdr->index = cpu_to_le16(index);
+ hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE);
+
+ return skb;
+}
+
+static void __printf(2, 3)
+send_monitor_note(struct sock *sk, const char *fmt, ...)
+{
+ size_t len;
+ struct hci_mon_hdr *hdr;
+ struct sk_buff *skb;
+ va_list args;
+
+ va_start(args, fmt);
+ len = vsnprintf(NULL, 0, fmt, args);
+ va_end(args);
+
+ skb = bt_skb_alloc(len + 1, GFP_ATOMIC);
+ if (!skb)
+ return;
+
+ hci_sock_copy_creds(sk, skb);
+
+ va_start(args, fmt);
+ vsprintf(skb_put(skb, len), fmt, args);
+ *(u8 *)skb_put(skb, 1) = 0;
+ va_end(args);
+
+ __net_timestamp(skb);
+
+ hdr = (void *)skb_push(skb, HCI_MON_HDR_SIZE);
+ hdr->opcode = cpu_to_le16(HCI_MON_SYSTEM_NOTE);
+ hdr->index = cpu_to_le16(HCI_DEV_NONE);
+ hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE);
+
+ if (sock_queue_rcv_skb(sk, skb))
+ kfree_skb(skb);
+}
+
+static void send_monitor_replay(struct sock *sk)
+{
+ struct hci_dev *hdev;
+
+ read_lock(&hci_dev_list_lock);
+
+ list_for_each_entry(hdev, &hci_dev_list, list) {
+ struct sk_buff *skb;
+
+ skb = create_monitor_event(hdev, HCI_DEV_REG);
+ if (!skb)
+ continue;
+
+ if (sock_queue_rcv_skb(sk, skb))
+ kfree_skb(skb);
+
+ if (!test_bit(HCI_RUNNING, &hdev->flags))
+ continue;
+
+ skb = create_monitor_event(hdev, HCI_DEV_OPEN);
+ if (!skb)
+ continue;
+
+ if (sock_queue_rcv_skb(sk, skb))
+ kfree_skb(skb);
+
+ if (test_bit(HCI_UP, &hdev->flags))
+ skb = create_monitor_event(hdev, HCI_DEV_UP);
+ else if (hci_dev_test_flag(hdev, HCI_SETUP))
+ skb = create_monitor_event(hdev, HCI_DEV_SETUP);
+ else
+ skb = NULL;
+
+ if (skb) {
+ if (sock_queue_rcv_skb(sk, skb))
+ kfree_skb(skb);
+ }
+ }
+
+ read_unlock(&hci_dev_list_lock);
+}
+
+static void send_monitor_control_replay(struct sock *mon_sk)
+{
+ struct sock *sk;
+
+ read_lock(&hci_sk_list.lock);
+
+ sk_for_each(sk, &hci_sk_list.head) {
+ struct sk_buff *skb;
+
+ skb = create_monitor_ctrl_open(sk);
+ if (!skb)
+ continue;
+
+ if (sock_queue_rcv_skb(mon_sk, skb))
+ kfree_skb(skb);
+ }
+
read_unlock(&hci_sk_list.lock);
}
+/* Generate internal stack event */
+static void hci_si_event(struct hci_dev *hdev, int type, int dlen, void *data)
+{
+ struct hci_event_hdr *hdr;
+ struct hci_ev_stack_internal *ev;
+ struct sk_buff *skb;
+
+ skb = bt_skb_alloc(HCI_EVENT_HDR_SIZE + sizeof(*ev) + dlen, GFP_ATOMIC);
+ if (!skb)
+ return;
+
+ hdr = skb_put(skb, HCI_EVENT_HDR_SIZE);
+ hdr->evt = HCI_EV_STACK_INTERNAL;
+ hdr->plen = sizeof(*ev) + dlen;
+
+ ev = skb_put(skb, sizeof(*ev) + dlen);
+ ev->type = type;
+ memcpy(ev->data, data, dlen);
+
+ bt_cb(skb)->incoming = 1;
+ __net_timestamp(skb);
+
+ hci_skb_pkt_type(skb) = HCI_EVENT_PKT;
+ hci_send_to_sock(hdev, skb);
+ kfree_skb(skb);
+}
+
+void hci_sock_dev_event(struct hci_dev *hdev, int event)
+{
+ BT_DBG("hdev %s event %d", hdev->name, event);
+
+ if (atomic_read(&monitor_promisc)) {
+ struct sk_buff *skb;
+
+ /* Send event to monitor */
+ skb = create_monitor_event(hdev, event);
+ if (skb) {
+ hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
+ HCI_SOCK_TRUSTED, NULL);
+ kfree_skb(skb);
+ }
+ }
+
+ if (event <= HCI_DEV_DOWN) {
+ struct hci_ev_si_device ev;
+
+ /* Send event to sockets */
+ ev.event = event;
+ ev.dev_id = hdev->id;
+ hci_si_event(NULL, HCI_EV_SI_DEVICE, sizeof(ev), &ev);
+ }
+
+ if (event == HCI_DEV_UNREG) {
+ struct sock *sk;
+
+ /* Wake up sockets using this dead device */
+ read_lock(&hci_sk_list.lock);
+ sk_for_each(sk, &hci_sk_list.head) {
+ if (hci_pi(sk)->hdev == hdev) {
+ sk->sk_err = EPIPE;
+ sk->sk_state_change(sk);
+ }
+ }
+ read_unlock(&hci_sk_list.lock);
+ }
+}
+
+static struct hci_mgmt_chan *__hci_mgmt_chan_find(unsigned short channel)
+{
+ struct hci_mgmt_chan *c;
+
+ list_for_each_entry(c, &mgmt_chan_list, list) {
+ if (c->channel == channel)
+ return c;
+ }
+
+ return NULL;
+}
+
+static struct hci_mgmt_chan *hci_mgmt_chan_find(unsigned short channel)
+{
+ struct hci_mgmt_chan *c;
+
+ mutex_lock(&mgmt_chan_list_lock);
+ c = __hci_mgmt_chan_find(channel);
+ mutex_unlock(&mgmt_chan_list_lock);
+
+ return c;
+}
+
+int hci_mgmt_chan_register(struct hci_mgmt_chan *c)
+{
+ if (c->channel < HCI_CHANNEL_CONTROL)
+ return -EINVAL;
+
+ mutex_lock(&mgmt_chan_list_lock);
+ if (__hci_mgmt_chan_find(c->channel)) {
+ mutex_unlock(&mgmt_chan_list_lock);
+ return -EALREADY;
+ }
+
+ list_add_tail(&c->list, &mgmt_chan_list);
+
+ mutex_unlock(&mgmt_chan_list_lock);
+
+ return 0;
+}
+EXPORT_SYMBOL(hci_mgmt_chan_register);
+
+void hci_mgmt_chan_unregister(struct hci_mgmt_chan *c)
+{
+ mutex_lock(&mgmt_chan_list_lock);
+ list_del(&c->list);
+ mutex_unlock(&mgmt_chan_list_lock);
+}
+EXPORT_SYMBOL(hci_mgmt_chan_unregister);
+
static int hci_sock_release(struct socket *sock)
{
struct sock *sk = sock->sk;
struct hci_dev *hdev;
+ struct sk_buff *skb;
BT_DBG("sock %p sk %p", sock, sk);
if (!sk)
return 0;
- hdev = hci_pi(sk)->hdev;
+ lock_sock(sk);
+
+ switch (hci_pi(sk)->channel) {
+ case HCI_CHANNEL_MONITOR:
+ atomic_dec(&monitor_promisc);
+ break;
+ case HCI_CHANNEL_RAW:
+ case HCI_CHANNEL_USER:
+ case HCI_CHANNEL_CONTROL:
+ /* Send event to monitor */
+ skb = create_monitor_ctrl_close(sk);
+ if (skb) {
+ hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
+ HCI_SOCK_TRUSTED, NULL);
+ kfree_skb(skb);
+ }
+
+ hci_sock_free_cookie(sk);
+ break;
+ }
bt_sock_unlink(&hci_sk_list, sk);
+ hdev = hci_pi(sk)->hdev;
if (hdev) {
+ if (hci_pi(sk)->channel == HCI_CHANNEL_USER &&
+ !hci_dev_test_flag(hdev, HCI_UNREGISTER)) {
+ /* When releasing a user channel exclusive access,
+ * call hci_dev_do_close directly instead of calling
+ * hci_dev_close to ensure the exclusive access will
+ * be released and the controller brought back down.
+ *
+ * The checking of HCI_AUTO_OFF is not needed in this
+ * case since it will have been cleared already when
+ * opening the user channel.
+ *
+ * Make sure to also check that we haven't already
+ * unregistered since all the cleanup will have already
+ * been complete and hdev will get released when we put
+ * below.
+ */
+ hci_dev_do_close(hdev);
+ hci_dev_clear_flag(hdev, HCI_USER_CHANNEL);
+ mgmt_index_added(hdev);
+ }
+
atomic_dec(&hdev->promisc);
hci_dev_put(hdev);
}
sock_orphan(sk);
-
- skb_queue_purge(&sk->sk_receive_queue);
- skb_queue_purge(&sk->sk_write_queue);
-
+ release_sock(sk);
sock_put(sk);
return 0;
}
-/* Ioctls that require bound socket */
-static inline int hci_sock_bound_ioctl(struct sock *sk, unsigned int cmd, unsigned long arg)
+static int hci_sock_reject_list_add(struct hci_dev *hdev, void __user *arg)
{
- struct hci_dev *hdev = hci_pi(sk)->hdev;
+ bdaddr_t bdaddr;
+ int err;
- if (!hdev)
- return -EBADFD;
+ if (copy_from_user(&bdaddr, arg, sizeof(bdaddr)))
+ return -EFAULT;
- switch (cmd) {
- case HCISETRAW:
- if (!capable(CAP_NET_ADMIN))
- return -EACCES;
+ hci_dev_lock(hdev);
- if (test_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks))
- return -EPERM;
+ err = hci_bdaddr_list_add(&hdev->reject_list, &bdaddr, BDADDR_BREDR);
- if (arg)
- set_bit(HCI_RAW, &hdev->flags);
- else
- clear_bit(HCI_RAW, &hdev->flags);
+ hci_dev_unlock(hdev);
- return 0;
+ return err;
+}
- case HCISETSECMGR:
- if (!capable(CAP_NET_ADMIN))
- return -EACCES;
+static int hci_sock_reject_list_del(struct hci_dev *hdev, void __user *arg)
+{
+ bdaddr_t bdaddr;
+ int err;
- if (arg)
- set_bit(HCI_SECMGR, &hdev->flags);
- else
- clear_bit(HCI_SECMGR, &hdev->flags);
+ if (copy_from_user(&bdaddr, arg, sizeof(bdaddr)))
+ return -EFAULT;
- return 0;
+ hci_dev_lock(hdev);
+
+ err = hci_bdaddr_list_del(&hdev->reject_list, &bdaddr, BDADDR_BREDR);
+
+ hci_dev_unlock(hdev);
+
+ return err;
+}
+
+/* Ioctls that require bound socket */
+static int hci_sock_bound_ioctl(struct sock *sk, unsigned int cmd,
+ unsigned long arg)
+{
+ struct hci_dev *hdev = hci_hdev_from_sock(sk);
+
+ if (IS_ERR(hdev))
+ return PTR_ERR(hdev);
+
+ if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL))
+ return -EBUSY;
+
+ if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED))
+ return -EOPNOTSUPP;
+
+ switch (cmd) {
+ case HCISETRAW:
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+ return -EOPNOTSUPP;
case HCIGETCONNINFO:
return hci_get_conn_info(hdev, (void __user *)arg);
- default:
- if (hdev->ioctl)
- return hdev->ioctl(hdev, cmd, arg);
- return -EINVAL;
+ case HCIGETAUTHINFO:
+ return hci_get_auth_info(hdev, (void __user *)arg);
+
+ case HCIBLOCKADDR:
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+ return hci_sock_reject_list_add(hdev, (void __user *)arg);
+
+ case HCIUNBLOCKADDR:
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+ return hci_sock_reject_list_del(hdev, (void __user *)arg);
}
+
+ return -ENOIOCTLCMD;
}
-static int hci_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+static int hci_sock_ioctl(struct socket *sock, unsigned int cmd,
+ unsigned long arg)
{
- struct sock *sk = sock->sk;
void __user *argp = (void __user *)arg;
+ struct sock *sk = sock->sk;
int err;
BT_DBG("cmd %x arg %lx", cmd, arg);
+ /* Make sure the cmd is valid before doing anything */
+ switch (cmd) {
+ case HCIGETDEVLIST:
+ case HCIGETDEVINFO:
+ case HCIGETCONNLIST:
+ case HCIDEVUP:
+ case HCIDEVDOWN:
+ case HCIDEVRESET:
+ case HCIDEVRESTAT:
+ case HCISETSCAN:
+ case HCISETAUTH:
+ case HCISETENCRYPT:
+ case HCISETPTYPE:
+ case HCISETLINKPOL:
+ case HCISETLINKMODE:
+ case HCISETACLMTU:
+ case HCISETSCOMTU:
+ case HCIINQUIRY:
+ case HCISETRAW:
+ case HCIGETCONNINFO:
+ case HCIGETAUTHINFO:
+ case HCIBLOCKADDR:
+ case HCIUNBLOCKADDR:
+ break;
+ default:
+ return -ENOIOCTLCMD;
+ }
+
+ lock_sock(sk);
+
+ if (hci_pi(sk)->channel != HCI_CHANNEL_RAW) {
+ err = -EBADFD;
+ goto done;
+ }
+
+ /* When calling an ioctl on an unbound raw socket, then ensure
+ * that the monitor gets informed. Ensure that the resulting event
+ * is only send once by checking if the cookie exists or not. The
+ * socket cookie will be only ever generated once for the lifetime
+ * of a given socket.
+ */
+ if (hci_sock_gen_cookie(sk)) {
+ struct sk_buff *skb;
+
+ /* Perform careful checks before setting the HCI_SOCK_TRUSTED
+ * flag. Make sure that not only the current task but also
+ * the socket opener has the required capability, since
+ * privileged programs can be tricked into making ioctl calls
+ * on HCI sockets, and the socket should not be marked as
+ * trusted simply because the ioctl caller is privileged.
+ */
+ if (sk_capable(sk, CAP_NET_ADMIN))
+ hci_sock_set_flag(sk, HCI_SOCK_TRUSTED);
+
+ /* Send event to monitor */
+ skb = create_monitor_ctrl_open(sk);
+ if (skb) {
+ hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
+ HCI_SOCK_TRUSTED, NULL);
+ kfree_skb(skb);
+ }
+ }
+
+ release_sock(sk);
+
switch (cmd) {
case HCIGETDEVLIST:
return hci_get_dev_list(argp);
@@ -234,22 +1126,22 @@ static int hci_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long a
case HCIDEVUP:
if (!capable(CAP_NET_ADMIN))
- return -EACCES;
+ return -EPERM;
return hci_dev_open(arg);
case HCIDEVDOWN:
if (!capable(CAP_NET_ADMIN))
- return -EACCES;
+ return -EPERM;
return hci_dev_close(arg);
case HCIDEVRESET:
if (!capable(CAP_NET_ADMIN))
- return -EACCES;
+ return -EPERM;
return hci_dev_reset(arg);
case HCIDEVRESTAT:
if (!capable(CAP_NET_ADMIN))
- return -EACCES;
+ return -EPERM;
return hci_dev_reset_stat(arg);
case HCISETSCAN:
@@ -261,49 +1153,333 @@ static int hci_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long a
case HCISETACLMTU:
case HCISETSCOMTU:
if (!capable(CAP_NET_ADMIN))
- return -EACCES;
+ return -EPERM;
return hci_dev_cmd(cmd, argp);
case HCIINQUIRY:
return hci_inquiry(argp);
+ }
- default:
- lock_sock(sk);
- err = hci_sock_bound_ioctl(sk, cmd, arg);
- release_sock(sk);
- return err;
+ lock_sock(sk);
+
+ err = hci_sock_bound_ioctl(sk, cmd, arg);
+
+done:
+ release_sock(sk);
+ return err;
+}
+
+#ifdef CONFIG_COMPAT
+static int hci_sock_compat_ioctl(struct socket *sock, unsigned int cmd,
+ unsigned long arg)
+{
+ switch (cmd) {
+ case HCIDEVUP:
+ case HCIDEVDOWN:
+ case HCIDEVRESET:
+ case HCIDEVRESTAT:
+ return hci_sock_ioctl(sock, cmd, arg);
}
+
+ return hci_sock_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
}
+#endif
-static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
+static int hci_sock_bind(struct socket *sock, struct sockaddr_unsized *addr,
+ int addr_len)
{
- struct sockaddr_hci *haddr = (struct sockaddr_hci *) addr;
+ struct sockaddr_hci haddr;
struct sock *sk = sock->sk;
struct hci_dev *hdev = NULL;
- int err = 0;
+ struct sk_buff *skb;
+ int len, err = 0;
BT_DBG("sock %p sk %p", sock, sk);
- if (!haddr || haddr->hci_family != AF_BLUETOOTH)
+ if (!addr)
+ return -EINVAL;
+
+ memset(&haddr, 0, sizeof(haddr));
+ len = min_t(unsigned int, sizeof(haddr), addr_len);
+ memcpy(&haddr, addr, len);
+
+ if (haddr.hci_family != AF_BLUETOOTH)
return -EINVAL;
lock_sock(sk);
- if (hci_pi(sk)->hdev) {
+ /* Allow detaching from dead device and attaching to alive device, if
+ * the caller wants to re-bind (instead of close) this socket in
+ * response to hci_sock_dev_event(HCI_DEV_UNREG) notification.
+ */
+ hdev = hci_pi(sk)->hdev;
+ if (hdev && hci_dev_test_flag(hdev, HCI_UNREGISTER)) {
+ hci_pi(sk)->hdev = NULL;
+ sk->sk_state = BT_OPEN;
+ hci_dev_put(hdev);
+ }
+ hdev = NULL;
+
+ if (sk->sk_state == BT_BOUND) {
err = -EALREADY;
goto done;
}
- if (haddr->hci_dev != HCI_DEV_NONE) {
- if (!(hdev = hci_dev_get(haddr->hci_dev))) {
+ switch (haddr.hci_channel) {
+ case HCI_CHANNEL_RAW:
+ if (hci_pi(sk)->hdev) {
+ err = -EALREADY;
+ goto done;
+ }
+
+ if (haddr.hci_dev != HCI_DEV_NONE) {
+ hdev = hci_dev_get(haddr.hci_dev);
+ if (!hdev) {
+ err = -ENODEV;
+ goto done;
+ }
+
+ atomic_inc(&hdev->promisc);
+ }
+
+ hci_pi(sk)->channel = haddr.hci_channel;
+
+ if (!hci_sock_gen_cookie(sk)) {
+ /* In the case when a cookie has already been assigned,
+ * then there has been already an ioctl issued against
+ * an unbound socket and with that triggered an open
+ * notification. Send a close notification first to
+ * allow the state transition to bounded.
+ */
+ skb = create_monitor_ctrl_close(sk);
+ if (skb) {
+ hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
+ HCI_SOCK_TRUSTED, NULL);
+ kfree_skb(skb);
+ }
+ }
+
+ if (capable(CAP_NET_ADMIN))
+ hci_sock_set_flag(sk, HCI_SOCK_TRUSTED);
+
+ hci_pi(sk)->hdev = hdev;
+
+ /* Send event to monitor */
+ skb = create_monitor_ctrl_open(sk);
+ if (skb) {
+ hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
+ HCI_SOCK_TRUSTED, NULL);
+ kfree_skb(skb);
+ }
+ break;
+
+ case HCI_CHANNEL_USER:
+ if (hci_pi(sk)->hdev) {
+ err = -EALREADY;
+ goto done;
+ }
+
+ if (haddr.hci_dev == HCI_DEV_NONE) {
+ err = -EINVAL;
+ goto done;
+ }
+
+ if (!capable(CAP_NET_ADMIN)) {
+ err = -EPERM;
+ goto done;
+ }
+
+ hdev = hci_dev_get(haddr.hci_dev);
+ if (!hdev) {
err = -ENODEV;
goto done;
}
+ if (test_bit(HCI_INIT, &hdev->flags) ||
+ hci_dev_test_flag(hdev, HCI_SETUP) ||
+ hci_dev_test_flag(hdev, HCI_CONFIG) ||
+ (!hci_dev_test_flag(hdev, HCI_AUTO_OFF) &&
+ test_bit(HCI_UP, &hdev->flags))) {
+ err = -EBUSY;
+ hci_dev_put(hdev);
+ goto done;
+ }
+
+ if (hci_dev_test_and_set_flag(hdev, HCI_USER_CHANNEL)) {
+ err = -EUSERS;
+ hci_dev_put(hdev);
+ goto done;
+ }
+
+ hci_dev_lock(hdev);
+ mgmt_index_removed(hdev);
+ hci_dev_unlock(hdev);
+
+ err = hci_dev_open(hdev->id);
+ if (err) {
+ if (err == -EALREADY) {
+ /* In case the transport is already up and
+ * running, clear the error here.
+ *
+ * This can happen when opening a user
+ * channel and HCI_AUTO_OFF grace period
+ * is still active.
+ */
+ err = 0;
+ } else {
+ hci_dev_clear_flag(hdev, HCI_USER_CHANNEL);
+ mgmt_index_added(hdev);
+ hci_dev_put(hdev);
+ goto done;
+ }
+ }
+
+ hci_pi(sk)->channel = haddr.hci_channel;
+
+ if (!hci_sock_gen_cookie(sk)) {
+ /* In the case when a cookie has already been assigned,
+ * this socket will transition from a raw socket into
+ * a user channel socket. For a clean transition, send
+ * the close notification first.
+ */
+ skb = create_monitor_ctrl_close(sk);
+ if (skb) {
+ hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
+ HCI_SOCK_TRUSTED, NULL);
+ kfree_skb(skb);
+ }
+ }
+
+ /* The user channel is restricted to CAP_NET_ADMIN
+ * capabilities and with that implicitly trusted.
+ */
+ hci_sock_set_flag(sk, HCI_SOCK_TRUSTED);
+
+ hci_pi(sk)->hdev = hdev;
+
+ /* Send event to monitor */
+ skb = create_monitor_ctrl_open(sk);
+ if (skb) {
+ hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
+ HCI_SOCK_TRUSTED, NULL);
+ kfree_skb(skb);
+ }
+
atomic_inc(&hdev->promisc);
+ break;
+
+ case HCI_CHANNEL_MONITOR:
+ if (haddr.hci_dev != HCI_DEV_NONE) {
+ err = -EINVAL;
+ goto done;
+ }
+
+ if (!capable(CAP_NET_RAW)) {
+ err = -EPERM;
+ goto done;
+ }
+
+ hci_pi(sk)->channel = haddr.hci_channel;
+
+ /* The monitor interface is restricted to CAP_NET_RAW
+ * capabilities and with that implicitly trusted.
+ */
+ hci_sock_set_flag(sk, HCI_SOCK_TRUSTED);
+
+ send_monitor_note(sk, "Linux version %s (%s)",
+ init_utsname()->release,
+ init_utsname()->machine);
+ send_monitor_note(sk, "Bluetooth subsystem version %u.%u",
+ BT_SUBSYS_VERSION, BT_SUBSYS_REVISION);
+ send_monitor_replay(sk);
+ send_monitor_control_replay(sk);
+
+ atomic_inc(&monitor_promisc);
+ break;
+
+ case HCI_CHANNEL_LOGGING:
+ if (haddr.hci_dev != HCI_DEV_NONE) {
+ err = -EINVAL;
+ goto done;
+ }
+
+ if (!capable(CAP_NET_ADMIN)) {
+ err = -EPERM;
+ goto done;
+ }
+
+ hci_pi(sk)->channel = haddr.hci_channel;
+ break;
+
+ default:
+ if (!hci_mgmt_chan_find(haddr.hci_channel)) {
+ err = -EINVAL;
+ goto done;
+ }
+
+ if (haddr.hci_dev != HCI_DEV_NONE) {
+ err = -EINVAL;
+ goto done;
+ }
+
+ /* Users with CAP_NET_ADMIN capabilities are allowed
+ * access to all management commands and events. For
+ * untrusted users the interface is restricted and
+ * also only untrusted events are sent.
+ */
+ if (capable(CAP_NET_ADMIN))
+ hci_sock_set_flag(sk, HCI_SOCK_TRUSTED);
+
+ hci_pi(sk)->channel = haddr.hci_channel;
+
+ /* At the moment the index and unconfigured index events
+ * are enabled unconditionally. Setting them on each
+ * socket when binding keeps this functionality. They
+ * however might be cleared later and then sending of these
+ * events will be disabled, but that is then intentional.
+ *
+ * This also enables generic events that are safe to be
+ * received by untrusted users. Example for such events
+ * are changes to settings, class of device, name etc.
+ */
+ if (hci_pi(sk)->channel == HCI_CHANNEL_CONTROL) {
+ if (!hci_sock_gen_cookie(sk)) {
+ /* In the case when a cookie has already been
+ * assigned, this socket will transition from
+ * a raw socket into a control socket. To
+ * allow for a clean transition, send the
+ * close notification first.
+ */
+ skb = create_monitor_ctrl_close(sk);
+ if (skb) {
+ hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
+ HCI_SOCK_TRUSTED, NULL);
+ kfree_skb(skb);
+ }
+ }
+
+ /* Send event to monitor */
+ skb = create_monitor_ctrl_open(sk);
+ if (skb) {
+ hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
+ HCI_SOCK_TRUSTED, NULL);
+ kfree_skb(skb);
+ }
+
+ hci_sock_set_flag(sk, HCI_MGMT_INDEX_EVENTS);
+ hci_sock_set_flag(sk, HCI_MGMT_UNCONF_INDEX_EVENTS);
+ hci_sock_set_flag(sk, HCI_MGMT_OPTION_EVENTS);
+ hci_sock_set_flag(sk, HCI_MGMT_SETTING_EVENTS);
+ hci_sock_set_flag(sk, HCI_MGMT_DEV_CLASS_EVENTS);
+ hci_sock_set_flag(sk, HCI_MGMT_LOCAL_NAME_EVENTS);
+ }
+ break;
}
- hci_pi(sk)->hdev = hdev;
+ /* Default MTU to HCI_MAX_FRAME_SIZE if not set */
+ if (!hci_pi(sk)->mtu)
+ hci_pi(sk)->mtu = HCI_MAX_FRAME_SIZE;
+
sk->sk_state = BT_BOUND;
done:
@@ -311,137 +1487,425 @@ done:
return err;
}
-static int hci_sock_getname(struct socket *sock, struct sockaddr *addr, int *addr_len, int peer)
+static int hci_sock_getname(struct socket *sock, struct sockaddr *addr,
+ int peer)
{
- struct sockaddr_hci *haddr = (struct sockaddr_hci *) addr;
+ struct sockaddr_hci *haddr = (struct sockaddr_hci *)addr;
struct sock *sk = sock->sk;
- struct hci_dev *hdev = hci_pi(sk)->hdev;
+ struct hci_dev *hdev;
+ int err = 0;
BT_DBG("sock %p sk %p", sock, sk);
- if (!hdev)
- return -EBADFD;
+ if (peer)
+ return -EOPNOTSUPP;
lock_sock(sk);
- *addr_len = sizeof(*haddr);
+ hdev = hci_hdev_from_sock(sk);
+ if (IS_ERR(hdev)) {
+ err = PTR_ERR(hdev);
+ goto done;
+ }
+
haddr->hci_family = AF_BLUETOOTH;
haddr->hci_dev = hdev->id;
+ haddr->hci_channel= hci_pi(sk)->channel;
+ err = sizeof(*haddr);
+done:
release_sock(sk);
- return 0;
+ return err;
}
-static inline void hci_sock_cmsg(struct sock *sk, struct msghdr *msg, struct sk_buff *skb)
+static void hci_sock_cmsg(struct sock *sk, struct msghdr *msg,
+ struct sk_buff *skb)
{
- __u32 mask = hci_pi(sk)->cmsg_mask;
+ __u8 mask = hci_pi(sk)->cmsg_mask;
if (mask & HCI_CMSG_DIR) {
int incoming = bt_cb(skb)->incoming;
- put_cmsg(msg, SOL_HCI, HCI_CMSG_DIR, sizeof(incoming), &incoming);
+ put_cmsg(msg, SOL_HCI, HCI_CMSG_DIR, sizeof(incoming),
+ &incoming);
}
if (mask & HCI_CMSG_TSTAMP) {
- struct timeval tv;
+#ifdef CONFIG_COMPAT
+ struct old_timeval32 ctv;
+#endif
+ struct __kernel_old_timeval tv;
+ void *data;
+ int len;
skb_get_timestamp(skb, &tv);
- put_cmsg(msg, SOL_HCI, HCI_CMSG_TSTAMP, sizeof(tv), &tv);
+
+ data = &tv;
+ len = sizeof(tv);
+#ifdef CONFIG_COMPAT
+ if (!COMPAT_USE_64BIT_TIME &&
+ (msg->msg_flags & MSG_CMSG_COMPAT)) {
+ ctv.tv_sec = tv.tv_sec;
+ ctv.tv_usec = tv.tv_usec;
+ data = &ctv;
+ len = sizeof(ctv);
+ }
+#endif
+
+ put_cmsg(msg, SOL_HCI, HCI_CMSG_TSTAMP, len, data);
}
}
-
-static int hci_sock_recvmsg(struct kiocb *iocb, struct socket *sock,
- struct msghdr *msg, size_t len, int flags)
+
+static int hci_sock_recvmsg(struct socket *sock, struct msghdr *msg,
+ size_t len, int flags)
{
- int noblock = flags & MSG_DONTWAIT;
+ struct scm_cookie scm;
struct sock *sk = sock->sk;
struct sk_buff *skb;
int copied, err;
+ unsigned int skblen;
BT_DBG("sock %p, sk %p", sock, sk);
- if (flags & (MSG_OOB))
+ if (flags & MSG_OOB)
+ return -EOPNOTSUPP;
+
+ if (hci_pi(sk)->channel == HCI_CHANNEL_LOGGING)
return -EOPNOTSUPP;
if (sk->sk_state == BT_CLOSED)
return 0;
- if (!(skb = skb_recv_datagram(sk, flags, noblock, &err)))
+ skb = skb_recv_datagram(sk, flags, &err);
+ if (!skb)
return err;
- msg->msg_namelen = 0;
-
+ skblen = skb->len;
copied = skb->len;
if (len < copied) {
msg->msg_flags |= MSG_TRUNC;
copied = len;
}
- skb->h.raw = skb->data;
- err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+ skb_reset_transport_header(skb);
+ err = skb_copy_datagram_msg(skb, 0, msg, copied);
- hci_sock_cmsg(sk, msg, skb);
+ switch (hci_pi(sk)->channel) {
+ case HCI_CHANNEL_RAW:
+ hci_sock_cmsg(sk, msg, skb);
+ break;
+ case HCI_CHANNEL_USER:
+ case HCI_CHANNEL_MONITOR:
+ sock_recv_timestamp(msg, sk, skb);
+ break;
+ default:
+ if (hci_mgmt_chan_find(hci_pi(sk)->channel))
+ sock_recv_timestamp(msg, sk, skb);
+ break;
+ }
+
+ memset(&scm, 0, sizeof(scm));
+ scm.creds = bt_cb(skb)->creds;
skb_free_datagram(sk, skb);
+ if (flags & MSG_TRUNC)
+ copied = skblen;
+
+ scm_recv(sock, msg, &scm, flags);
+
return err ? : copied;
}
-static int hci_sock_sendmsg(struct kiocb *iocb, struct socket *sock,
- struct msghdr *msg, size_t len)
+static int hci_mgmt_cmd(struct hci_mgmt_chan *chan, struct sock *sk,
+ struct sk_buff *skb)
+{
+ u8 *cp;
+ struct mgmt_hdr *hdr;
+ u16 opcode, index, len;
+ struct hci_dev *hdev = NULL;
+ const struct hci_mgmt_handler *handler;
+ bool var_len, no_hdev;
+ int err;
+
+ BT_DBG("got %d bytes", skb->len);
+
+ if (skb->len < sizeof(*hdr))
+ return -EINVAL;
+
+ hdr = (void *)skb->data;
+ opcode = __le16_to_cpu(hdr->opcode);
+ index = __le16_to_cpu(hdr->index);
+ len = __le16_to_cpu(hdr->len);
+
+ if (len != skb->len - sizeof(*hdr)) {
+ err = -EINVAL;
+ goto done;
+ }
+
+ if (chan->channel == HCI_CHANNEL_CONTROL) {
+ struct sk_buff *cmd;
+
+ /* Send event to monitor */
+ cmd = create_monitor_ctrl_command(sk, index, opcode, len,
+ skb->data + sizeof(*hdr));
+ if (cmd) {
+ hci_send_to_channel(HCI_CHANNEL_MONITOR, cmd,
+ HCI_SOCK_TRUSTED, NULL);
+ kfree_skb(cmd);
+ }
+ }
+
+ if (opcode >= chan->handler_count ||
+ chan->handlers[opcode].func == NULL) {
+ BT_DBG("Unknown op %u", opcode);
+ err = mgmt_cmd_status(sk, index, opcode,
+ MGMT_STATUS_UNKNOWN_COMMAND);
+ goto done;
+ }
+
+ handler = &chan->handlers[opcode];
+
+ if (!hci_sock_test_flag(sk, HCI_SOCK_TRUSTED) &&
+ !(handler->flags & HCI_MGMT_UNTRUSTED)) {
+ err = mgmt_cmd_status(sk, index, opcode,
+ MGMT_STATUS_PERMISSION_DENIED);
+ goto done;
+ }
+
+ if (index != MGMT_INDEX_NONE) {
+ hdev = hci_dev_get(index);
+ if (!hdev) {
+ err = mgmt_cmd_status(sk, index, opcode,
+ MGMT_STATUS_INVALID_INDEX);
+ goto done;
+ }
+
+ if (hci_dev_test_flag(hdev, HCI_SETUP) ||
+ hci_dev_test_flag(hdev, HCI_CONFIG) ||
+ hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) {
+ err = mgmt_cmd_status(sk, index, opcode,
+ MGMT_STATUS_INVALID_INDEX);
+ goto done;
+ }
+
+ if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED) &&
+ !(handler->flags & HCI_MGMT_UNCONFIGURED)) {
+ err = mgmt_cmd_status(sk, index, opcode,
+ MGMT_STATUS_INVALID_INDEX);
+ goto done;
+ }
+ }
+
+ if (!(handler->flags & HCI_MGMT_HDEV_OPTIONAL)) {
+ no_hdev = (handler->flags & HCI_MGMT_NO_HDEV);
+ if (no_hdev != !hdev) {
+ err = mgmt_cmd_status(sk, index, opcode,
+ MGMT_STATUS_INVALID_INDEX);
+ goto done;
+ }
+ }
+
+ var_len = (handler->flags & HCI_MGMT_VAR_LEN);
+ if ((var_len && len < handler->data_len) ||
+ (!var_len && len != handler->data_len)) {
+ err = mgmt_cmd_status(sk, index, opcode,
+ MGMT_STATUS_INVALID_PARAMS);
+ goto done;
+ }
+
+ if (hdev && chan->hdev_init)
+ chan->hdev_init(sk, hdev);
+
+ cp = skb->data + sizeof(*hdr);
+
+ err = handler->func(sk, hdev, cp, len);
+ if (err < 0)
+ goto done;
+
+ err = skb->len;
+
+done:
+ if (hdev)
+ hci_dev_put(hdev);
+
+ return err;
+}
+
+static int hci_logging_frame(struct sock *sk, struct sk_buff *skb,
+ unsigned int flags)
+{
+ struct hci_mon_hdr *hdr;
+ struct hci_dev *hdev;
+ u16 index;
+ int err;
+
+ /* The logging frame consists at minimum of the standard header,
+ * the priority byte, the ident length byte and at least one string
+ * terminator NUL byte. Anything shorter are invalid packets.
+ */
+ if (skb->len < sizeof(*hdr) + 3)
+ return -EINVAL;
+
+ hdr = (void *)skb->data;
+
+ if (__le16_to_cpu(hdr->len) != skb->len - sizeof(*hdr))
+ return -EINVAL;
+
+ if (__le16_to_cpu(hdr->opcode) == 0x0000) {
+ __u8 priority = skb->data[sizeof(*hdr)];
+ __u8 ident_len = skb->data[sizeof(*hdr) + 1];
+
+ /* Only the priorities 0-7 are valid and with that any other
+ * value results in an invalid packet.
+ *
+ * The priority byte is followed by an ident length byte and
+ * the NUL terminated ident string. Check that the ident
+ * length is not overflowing the packet and also that the
+ * ident string itself is NUL terminated. In case the ident
+ * length is zero, the length value actually doubles as NUL
+ * terminator identifier.
+ *
+ * The message follows the ident string (if present) and
+ * must be NUL terminated. Otherwise it is not a valid packet.
+ */
+ if (priority > 7 || skb->data[skb->len - 1] != 0x00 ||
+ ident_len > skb->len - sizeof(*hdr) - 3 ||
+ skb->data[sizeof(*hdr) + ident_len + 1] != 0x00)
+ return -EINVAL;
+ } else {
+ return -EINVAL;
+ }
+
+ index = __le16_to_cpu(hdr->index);
+
+ if (index != MGMT_INDEX_NONE) {
+ hdev = hci_dev_get(index);
+ if (!hdev)
+ return -ENODEV;
+ } else {
+ hdev = NULL;
+ }
+
+ hdr->opcode = cpu_to_le16(HCI_MON_USER_LOGGING);
+
+ hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, HCI_SOCK_TRUSTED, NULL);
+ err = skb->len;
+
+ if (hdev)
+ hci_dev_put(hdev);
+
+ return err;
+}
+
+static int hci_sock_sendmsg(struct socket *sock, struct msghdr *msg,
+ size_t len)
{
struct sock *sk = sock->sk;
+ struct hci_mgmt_chan *chan;
struct hci_dev *hdev;
struct sk_buff *skb;
int err;
+ const unsigned int flags = msg->msg_flags;
BT_DBG("sock %p sk %p", sock, sk);
- if (msg->msg_flags & MSG_OOB)
+ if (flags & MSG_OOB)
return -EOPNOTSUPP;
- if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_NOSIGNAL|MSG_ERRQUEUE))
+ if (flags & ~(MSG_DONTWAIT | MSG_NOSIGNAL | MSG_ERRQUEUE | MSG_CMSG_COMPAT))
return -EINVAL;
- if (len < 4 || len > HCI_MAX_FRAME_SIZE)
+ if (len < 4 || len > hci_pi(sk)->mtu)
return -EINVAL;
+ skb = bt_skb_sendmsg(sk, msg, len, len, 0, 0);
+ if (IS_ERR(skb))
+ return PTR_ERR(skb);
+
lock_sock(sk);
- if (!(hdev = hci_pi(sk)->hdev)) {
- err = -EBADFD;
- goto done;
+ switch (hci_pi(sk)->channel) {
+ case HCI_CHANNEL_RAW:
+ case HCI_CHANNEL_USER:
+ break;
+ case HCI_CHANNEL_MONITOR:
+ err = -EOPNOTSUPP;
+ goto drop;
+ case HCI_CHANNEL_LOGGING:
+ err = hci_logging_frame(sk, skb, flags);
+ goto drop;
+ default:
+ mutex_lock(&mgmt_chan_list_lock);
+ chan = __hci_mgmt_chan_find(hci_pi(sk)->channel);
+ if (chan)
+ err = hci_mgmt_cmd(chan, sk, skb);
+ else
+ err = -EINVAL;
+
+ mutex_unlock(&mgmt_chan_list_lock);
+ goto drop;
}
- if (!(skb = bt_skb_send_alloc(sk, len, msg->msg_flags & MSG_DONTWAIT, &err)))
- goto done;
+ hdev = hci_hdev_from_sock(sk);
+ if (IS_ERR(hdev)) {
+ err = PTR_ERR(hdev);
+ goto drop;
+ }
- if (memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len)) {
- err = -EFAULT;
+ if (!test_bit(HCI_UP, &hdev->flags)) {
+ err = -ENETDOWN;
goto drop;
}
- bt_cb(skb)->pkt_type = *((unsigned char *) skb->data);
+ hci_skb_pkt_type(skb) = skb->data[0];
skb_pull(skb, 1);
- skb->dev = (void *) hdev;
- if (bt_cb(skb)->pkt_type == HCI_COMMAND_PKT) {
- u16 opcode = __le16_to_cpu(get_unaligned((__le16 *) skb->data));
+ if (hci_pi(sk)->channel == HCI_CHANNEL_USER) {
+ /* No permission check is needed for user channel
+ * since that gets enforced when binding the socket.
+ *
+ * However check that the packet type is valid.
+ */
+ if (hci_skb_pkt_type(skb) != HCI_COMMAND_PKT &&
+ hci_skb_pkt_type(skb) != HCI_ACLDATA_PKT &&
+ hci_skb_pkt_type(skb) != HCI_SCODATA_PKT &&
+ hci_skb_pkt_type(skb) != HCI_ISODATA_PKT &&
+ hci_skb_pkt_type(skb) != HCI_DRV_PKT) {
+ err = -EINVAL;
+ goto drop;
+ }
+
+ skb_queue_tail(&hdev->raw_q, skb);
+ queue_work(hdev->workqueue, &hdev->tx_work);
+ } else if (hci_skb_pkt_type(skb) == HCI_COMMAND_PKT) {
+ u16 opcode = get_unaligned_le16(skb->data);
u16 ogf = hci_opcode_ogf(opcode);
u16 ocf = hci_opcode_ocf(opcode);
if (((ogf > HCI_SFLT_MAX_OGF) ||
- !hci_test_bit(ocf & HCI_FLT_OCF_BITS, &hci_sec_filter.ocf_mask[ogf])) &&
- !capable(CAP_NET_RAW)) {
+ !hci_test_bit(ocf & HCI_FLT_OCF_BITS,
+ &hci_sec_filter.ocf_mask[ogf])) &&
+ !capable(CAP_NET_RAW)) {
err = -EPERM;
goto drop;
}
- if (test_bit(HCI_RAW, &hdev->flags) || (ogf == OGF_VENDOR_CMD)) {
+ /* Since the opcode has already been extracted here, store
+ * a copy of the value for later use by the drivers.
+ */
+ hci_skb_opcode(skb) = opcode;
+
+ if (ogf == 0x3f) {
skb_queue_tail(&hdev->raw_q, skb);
- hci_sched_tx(hdev);
+ queue_work(hdev->workqueue, &hdev->tx_work);
} else {
+ /* Stand-alone HCI commands must be flagged as
+ * single-command requests.
+ */
+ bt_cb(skb)->hci.req_flags |= HCI_REQ_START;
+
skb_queue_tail(&hdev->cmd_q, skb);
- hci_sched_cmd(hdev);
+ queue_work(hdev->workqueue, &hdev->cmd_work);
}
} else {
if (!capable(CAP_NET_RAW)) {
@@ -449,8 +1913,15 @@ static int hci_sock_sendmsg(struct kiocb *iocb, struct socket *sock,
goto drop;
}
+ if (hci_skb_pkt_type(skb) != HCI_ACLDATA_PKT &&
+ hci_skb_pkt_type(skb) != HCI_SCODATA_PKT &&
+ hci_skb_pkt_type(skb) != HCI_ISODATA_PKT) {
+ err = -EINVAL;
+ goto drop;
+ }
+
skb_queue_tail(&hdev->raw_q, skb);
- hci_sched_tx(hdev);
+ queue_work(hdev->workqueue, &hdev->tx_work);
}
err = len;
@@ -464,7 +1935,8 @@ drop:
goto done;
}
-static int hci_sock_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int len)
+static int hci_sock_setsockopt_old(struct socket *sock, int level, int optname,
+ sockptr_t optval, unsigned int optlen)
{
struct hci_ufilter uf = { .opcode = 0 };
struct sock *sk = sock->sk;
@@ -474,12 +1946,16 @@ static int hci_sock_setsockopt(struct socket *sock, int level, int optname, char
lock_sock(sk);
+ if (hci_pi(sk)->channel != HCI_CHANNEL_RAW) {
+ err = -EBADFD;
+ goto done;
+ }
+
switch (optname) {
case HCI_DATA_DIR:
- if (get_user(opt, (int __user *)optval)) {
- err = -EFAULT;
+ err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen);
+ if (err)
break;
- }
if (opt)
hci_pi(sk)->cmsg_mask |= HCI_CMSG_DIR;
@@ -488,10 +1964,9 @@ static int hci_sock_setsockopt(struct socket *sock, int level, int optname, char
break;
case HCI_TIME_STAMP:
- if (get_user(opt, (int __user *)optval)) {
- err = -EFAULT;
+ err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen);
+ if (err)
break;
- }
if (opt)
hci_pi(sk)->cmsg_mask |= HCI_CMSG_TSTAMP;
@@ -500,12 +1975,19 @@ static int hci_sock_setsockopt(struct socket *sock, int level, int optname, char
break;
case HCI_FILTER:
- len = min_t(unsigned int, len, sizeof(uf));
- if (copy_from_user(&uf, optval, len)) {
- err = -EFAULT;
- break;
+ {
+ struct hci_filter *f = &hci_pi(sk)->filter;
+
+ uf.type_mask = f->type_mask;
+ uf.opcode = f->opcode;
+ uf.event_mask[0] = *((u32 *) f->event_mask + 0);
+ uf.event_mask[1] = *((u32 *) f->event_mask + 1);
}
+ err = copy_safe_from_sockptr(&uf, sizeof(uf), optval, optlen);
+ if (err)
+ break;
+
if (!capable(CAP_NET_RAW)) {
uf.type_mask &= hci_sec_filter.type_mask;
uf.event_mask[0] &= *((u32 *) hci_sec_filter.event_mask + 0);
@@ -520,51 +2002,111 @@ static int hci_sock_setsockopt(struct socket *sock, int level, int optname, char
*((u32 *) f->event_mask + 0) = uf.event_mask[0];
*((u32 *) f->event_mask + 1) = uf.event_mask[1];
}
- break;
+ break;
default:
err = -ENOPROTOOPT;
break;
}
+done:
release_sock(sk);
return err;
}
-static int hci_sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen)
+static int hci_sock_setsockopt(struct socket *sock, int level, int optname,
+ sockptr_t optval, unsigned int optlen)
+{
+ struct sock *sk = sock->sk;
+ int err = 0;
+ u16 opt;
+
+ BT_DBG("sk %p, opt %d", sk, optname);
+
+ if (level == SOL_HCI)
+ return hci_sock_setsockopt_old(sock, level, optname, optval,
+ optlen);
+
+ if (level != SOL_BLUETOOTH)
+ return -ENOPROTOOPT;
+
+ lock_sock(sk);
+
+ switch (optname) {
+ case BT_SNDMTU:
+ case BT_RCVMTU:
+ switch (hci_pi(sk)->channel) {
+ /* Don't allow changing MTU for channels that are meant for HCI
+ * traffic only.
+ */
+ case HCI_CHANNEL_RAW:
+ case HCI_CHANNEL_USER:
+ err = -ENOPROTOOPT;
+ goto done;
+ }
+
+ err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen);
+ if (err)
+ break;
+
+ hci_pi(sk)->mtu = opt;
+ break;
+
+ default:
+ err = -ENOPROTOOPT;
+ break;
+ }
+
+done:
+ release_sock(sk);
+ return err;
+}
+
+static int hci_sock_getsockopt_old(struct socket *sock, int level, int optname,
+ char __user *optval, int __user *optlen)
{
struct hci_ufilter uf;
struct sock *sk = sock->sk;
- int len, opt;
+ int len, opt, err = 0;
+
+ BT_DBG("sk %p, opt %d", sk, optname);
if (get_user(len, optlen))
return -EFAULT;
+ lock_sock(sk);
+
+ if (hci_pi(sk)->channel != HCI_CHANNEL_RAW) {
+ err = -EBADFD;
+ goto done;
+ }
+
switch (optname) {
case HCI_DATA_DIR:
if (hci_pi(sk)->cmsg_mask & HCI_CMSG_DIR)
opt = 1;
- else
+ else
opt = 0;
if (put_user(opt, optval))
- return -EFAULT;
+ err = -EFAULT;
break;
case HCI_TIME_STAMP:
if (hci_pi(sk)->cmsg_mask & HCI_CMSG_TSTAMP)
opt = 1;
- else
+ else
opt = 0;
if (put_user(opt, optval))
- return -EFAULT;
+ err = -EFAULT;
break;
case HCI_FILTER:
{
struct hci_filter *f = &hci_pi(sk)->filter;
+ memset(&uf, 0, sizeof(uf));
uf.type_mask = f->type_mask;
uf.opcode = f->opcode;
uf.event_mask[0] = *((u32 *) f->event_mask + 0);
@@ -573,15 +2115,57 @@ static int hci_sock_getsockopt(struct socket *sock, int level, int optname, char
len = min_t(unsigned int, len, sizeof(uf));
if (copy_to_user(optval, &uf, len))
- return -EFAULT;
+ err = -EFAULT;
break;
default:
+ err = -ENOPROTOOPT;
+ break;
+ }
+
+done:
+ release_sock(sk);
+ return err;
+}
+
+static int hci_sock_getsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ struct sock *sk = sock->sk;
+ int err = 0;
+
+ BT_DBG("sk %p, opt %d", sk, optname);
+
+ if (level == SOL_HCI)
+ return hci_sock_getsockopt_old(sock, level, optname, optval,
+ optlen);
+
+ if (level != SOL_BLUETOOTH)
return -ENOPROTOOPT;
+
+ lock_sock(sk);
+
+ switch (optname) {
+ case BT_SNDMTU:
+ case BT_RCVMTU:
+ if (put_user(hci_pi(sk)->mtu, (u16 __user *)optval))
+ err = -EFAULT;
+ break;
+
+ default:
+ err = -ENOPROTOOPT;
break;
}
- return 0;
+ release_sock(sk);
+ return err;
+}
+
+static void hci_sock_destruct(struct sock *sk)
+{
+ mgmt_cleanup(sk);
+ skb_queue_purge(&sk->sk_receive_queue);
+ skb_queue_purge(&sk->sk_write_queue);
}
static const struct proto_ops hci_sock_ops = {
@@ -593,6 +2177,9 @@ static const struct proto_ops hci_sock_ops = {
.sendmsg = hci_sock_sendmsg,
.recvmsg = hci_sock_recvmsg,
.ioctl = hci_sock_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = hci_sock_compat_ioctl,
+#endif
.poll = datagram_poll,
.listen = sock_no_listen,
.shutdown = sock_no_shutdown,
@@ -610,7 +2197,8 @@ static struct proto hci_sk_proto = {
.obj_size = sizeof(struct hci_pinfo)
};
-static int hci_sock_create(struct socket *sock, int protocol)
+static int hci_sock_create(struct net *net, struct socket *sock, int protocol,
+ int kern)
{
struct sock *sk;
@@ -621,101 +2209,59 @@ static int hci_sock_create(struct socket *sock, int protocol)
sock->ops = &hci_sock_ops;
- sk = sk_alloc(PF_BLUETOOTH, GFP_ATOMIC, &hci_sk_proto, 1);
+ sk = bt_sock_alloc(net, sock, &hci_sk_proto, protocol, GFP_ATOMIC,
+ kern);
if (!sk)
return -ENOMEM;
- sock_init_data(sock, sk);
-
- sock_reset_flag(sk, SOCK_ZAPPED);
-
- sk->sk_protocol = protocol;
-
sock->state = SS_UNCONNECTED;
- sk->sk_state = BT_OPEN;
+ sk->sk_destruct = hci_sock_destruct;
bt_sock_link(&hci_sk_list, sk);
return 0;
}
-static int hci_sock_dev_event(struct notifier_block *this, unsigned long event, void *ptr)
-{
- struct hci_dev *hdev = (struct hci_dev *) ptr;
- struct hci_ev_si_device ev;
-
- BT_DBG("hdev %s event %ld", hdev->name, event);
-
- /* Send event to sockets */
- ev.event = event;
- ev.dev_id = hdev->id;
- hci_si_event(NULL, HCI_EV_SI_DEVICE, sizeof(ev), &ev);
-
- if (event == HCI_DEV_UNREG) {
- struct sock *sk;
- struct hlist_node *node;
-
- /* Detach sockets from device */
- read_lock(&hci_sk_list.lock);
- sk_for_each(sk, node, &hci_sk_list.head) {
- bh_lock_sock(sk);
- if (hci_pi(sk)->hdev == hdev) {
- hci_pi(sk)->hdev = NULL;
- sk->sk_err = EPIPE;
- sk->sk_state = BT_OPEN;
- sk->sk_state_change(sk);
-
- hci_dev_put(hdev);
- }
- bh_unlock_sock(sk);
- }
- read_unlock(&hci_sk_list.lock);
- }
-
- return NOTIFY_DONE;
-}
-
-static struct net_proto_family hci_sock_family_ops = {
+static const struct net_proto_family hci_sock_family_ops = {
.family = PF_BLUETOOTH,
.owner = THIS_MODULE,
.create = hci_sock_create,
};
-static struct notifier_block hci_sock_nblock = {
- .notifier_call = hci_sock_dev_event
-};
-
int __init hci_sock_init(void)
{
int err;
+ BUILD_BUG_ON(sizeof(struct sockaddr_hci) > sizeof(struct sockaddr));
+
err = proto_register(&hci_sk_proto, 0);
if (err < 0)
return err;
err = bt_sock_register(BTPROTO_HCI, &hci_sock_family_ops);
- if (err < 0)
+ if (err < 0) {
+ BT_ERR("HCI socket registration failed");
goto error;
+ }
- hci_register_notifier(&hci_sock_nblock);
+ err = bt_procfs_init(&init_net, "hci", &hci_sk_list, NULL);
+ if (err < 0) {
+ BT_ERR("Failed to create HCI proc file");
+ bt_sock_unregister(BTPROTO_HCI);
+ goto error;
+ }
BT_INFO("HCI socket layer initialized");
return 0;
error:
- BT_ERR("HCI socket registration failed");
proto_unregister(&hci_sk_proto);
return err;
}
-int __exit hci_sock_cleanup(void)
+void hci_sock_cleanup(void)
{
- if (bt_sock_unregister(BTPROTO_HCI) < 0)
- BT_ERR("HCI socket unregistration failed");
-
- hci_unregister_notifier(&hci_sock_nblock);
-
+ bt_procfs_cleanup(&init_net, "hci");
+ bt_sock_unregister(BTPROTO_HCI);
proto_unregister(&hci_sk_proto);
-
- return 0;
}
diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
new file mode 100644
index 000000000000..a9f5b1a68356
--- /dev/null
+++ b/net/bluetooth/hci_sync.c
@@ -0,0 +1,7420 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * BlueZ - Bluetooth protocol stack for Linux
+ *
+ * Copyright (C) 2021 Intel Corporation
+ * Copyright 2023 NXP
+ */
+
+#include <linux/property.h>
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+#include <net/bluetooth/mgmt.h>
+
+#include "hci_codec.h"
+#include "hci_debugfs.h"
+#include "smp.h"
+#include "eir.h"
+#include "msft.h"
+#include "aosp.h"
+#include "leds.h"
+
+static void hci_cmd_sync_complete(struct hci_dev *hdev, u8 result, u16 opcode,
+ struct sk_buff *skb)
+{
+ bt_dev_dbg(hdev, "result 0x%2.2x", result);
+
+ if (hdev->req_status != HCI_REQ_PEND)
+ return;
+
+ hdev->req_result = result;
+ hdev->req_status = HCI_REQ_DONE;
+
+ /* Free the request command so it is not used as response */
+ kfree_skb(hdev->req_skb);
+ hdev->req_skb = NULL;
+
+ if (skb) {
+ struct sock *sk = hci_skb_sk(skb);
+
+ /* Drop sk reference if set */
+ if (sk)
+ sock_put(sk);
+
+ hdev->req_rsp = skb_get(skb);
+ }
+
+ wake_up_interruptible(&hdev->req_wait_q);
+}
+
+struct sk_buff *hci_cmd_sync_alloc(struct hci_dev *hdev, u16 opcode, u32 plen,
+ const void *param, struct sock *sk)
+{
+ int len = HCI_COMMAND_HDR_SIZE + plen;
+ struct hci_command_hdr *hdr;
+ struct sk_buff *skb;
+
+ skb = bt_skb_alloc(len, GFP_ATOMIC);
+ if (!skb)
+ return NULL;
+
+ hdr = skb_put(skb, HCI_COMMAND_HDR_SIZE);
+ hdr->opcode = cpu_to_le16(opcode);
+ hdr->plen = plen;
+
+ if (plen)
+ skb_put_data(skb, param, plen);
+
+ bt_dev_dbg(hdev, "skb len %d", skb->len);
+
+ hci_skb_pkt_type(skb) = HCI_COMMAND_PKT;
+ hci_skb_opcode(skb) = opcode;
+
+ /* Grab a reference if command needs to be associated with a sock (e.g.
+ * likely mgmt socket that initiated the command).
+ */
+ if (sk) {
+ hci_skb_sk(skb) = sk;
+ sock_hold(sk);
+ }
+
+ return skb;
+}
+
+static void hci_cmd_sync_add(struct hci_request *req, u16 opcode, u32 plen,
+ const void *param, u8 event, struct sock *sk)
+{
+ struct hci_dev *hdev = req->hdev;
+ struct sk_buff *skb;
+
+ bt_dev_dbg(hdev, "opcode 0x%4.4x plen %d", opcode, plen);
+
+ /* If an error occurred during request building, there is no point in
+ * queueing the HCI command. We can simply return.
+ */
+ if (req->err)
+ return;
+
+ skb = hci_cmd_sync_alloc(hdev, opcode, plen, param, sk);
+ if (!skb) {
+ bt_dev_err(hdev, "no memory for command (opcode 0x%4.4x)",
+ opcode);
+ req->err = -ENOMEM;
+ return;
+ }
+
+ if (skb_queue_empty(&req->cmd_q))
+ bt_cb(skb)->hci.req_flags |= HCI_REQ_START;
+
+ hci_skb_event(skb) = event;
+
+ skb_queue_tail(&req->cmd_q, skb);
+}
+
+static int hci_req_sync_run(struct hci_request *req)
+{
+ struct hci_dev *hdev = req->hdev;
+ struct sk_buff *skb;
+ unsigned long flags;
+
+ bt_dev_dbg(hdev, "length %u", skb_queue_len(&req->cmd_q));
+
+ /* If an error occurred during request building, remove all HCI
+ * commands queued on the HCI request queue.
+ */
+ if (req->err) {
+ skb_queue_purge(&req->cmd_q);
+ return req->err;
+ }
+
+ /* Do not allow empty requests */
+ if (skb_queue_empty(&req->cmd_q))
+ return -ENODATA;
+
+ skb = skb_peek_tail(&req->cmd_q);
+ bt_cb(skb)->hci.req_complete_skb = hci_cmd_sync_complete;
+ bt_cb(skb)->hci.req_flags |= HCI_REQ_SKB;
+
+ spin_lock_irqsave(&hdev->cmd_q.lock, flags);
+ skb_queue_splice_tail(&req->cmd_q, &hdev->cmd_q);
+ spin_unlock_irqrestore(&hdev->cmd_q.lock, flags);
+
+ queue_work(hdev->workqueue, &hdev->cmd_work);
+
+ return 0;
+}
+
+static void hci_request_init(struct hci_request *req, struct hci_dev *hdev)
+{
+ skb_queue_head_init(&req->cmd_q);
+ req->hdev = hdev;
+ req->err = 0;
+}
+
+/* This function requires the caller holds hdev->req_lock. */
+struct sk_buff *__hci_cmd_sync_sk(struct hci_dev *hdev, u16 opcode, u32 plen,
+ const void *param, u8 event, u32 timeout,
+ struct sock *sk)
+{
+ struct hci_request req;
+ struct sk_buff *skb;
+ int err = 0;
+
+ bt_dev_dbg(hdev, "Opcode 0x%4.4x", opcode);
+
+ hci_request_init(&req, hdev);
+
+ hci_cmd_sync_add(&req, opcode, plen, param, event, sk);
+
+ hdev->req_status = HCI_REQ_PEND;
+
+ err = hci_req_sync_run(&req);
+ if (err < 0)
+ return ERR_PTR(err);
+
+ err = wait_event_interruptible_timeout(hdev->req_wait_q,
+ hdev->req_status != HCI_REQ_PEND,
+ timeout);
+
+ if (err == -ERESTARTSYS)
+ return ERR_PTR(-EINTR);
+
+ switch (hdev->req_status) {
+ case HCI_REQ_DONE:
+ err = -bt_to_errno(hdev->req_result);
+ break;
+
+ case HCI_REQ_CANCELED:
+ err = -hdev->req_result;
+ break;
+
+ default:
+ err = -ETIMEDOUT;
+ break;
+ }
+
+ hdev->req_status = 0;
+ hdev->req_result = 0;
+ skb = hdev->req_rsp;
+ hdev->req_rsp = NULL;
+
+ bt_dev_dbg(hdev, "end: err %d", err);
+
+ if (err < 0) {
+ kfree_skb(skb);
+ return ERR_PTR(err);
+ }
+
+ /* If command return a status event skb will be set to NULL as there are
+ * no parameters.
+ */
+ if (!skb)
+ return ERR_PTR(-ENODATA);
+
+ return skb;
+}
+EXPORT_SYMBOL(__hci_cmd_sync_sk);
+
+/* This function requires the caller holds hdev->req_lock. */
+struct sk_buff *__hci_cmd_sync(struct hci_dev *hdev, u16 opcode, u32 plen,
+ const void *param, u32 timeout)
+{
+ return __hci_cmd_sync_sk(hdev, opcode, plen, param, 0, timeout, NULL);
+}
+EXPORT_SYMBOL(__hci_cmd_sync);
+
+/* Send HCI command and wait for command complete event */
+struct sk_buff *hci_cmd_sync(struct hci_dev *hdev, u16 opcode, u32 plen,
+ const void *param, u32 timeout)
+{
+ struct sk_buff *skb;
+
+ if (!test_bit(HCI_UP, &hdev->flags))
+ return ERR_PTR(-ENETDOWN);
+
+ bt_dev_dbg(hdev, "opcode 0x%4.4x plen %d", opcode, plen);
+
+ hci_req_sync_lock(hdev);
+ skb = __hci_cmd_sync(hdev, opcode, plen, param, timeout);
+ hci_req_sync_unlock(hdev);
+
+ return skb;
+}
+EXPORT_SYMBOL(hci_cmd_sync);
+
+/* This function requires the caller holds hdev->req_lock. */
+struct sk_buff *__hci_cmd_sync_ev(struct hci_dev *hdev, u16 opcode, u32 plen,
+ const void *param, u8 event, u32 timeout)
+{
+ return __hci_cmd_sync_sk(hdev, opcode, plen, param, event, timeout,
+ NULL);
+}
+EXPORT_SYMBOL(__hci_cmd_sync_ev);
+
+/* This function requires the caller holds hdev->req_lock. */
+int __hci_cmd_sync_status_sk(struct hci_dev *hdev, u16 opcode, u32 plen,
+ const void *param, u8 event, u32 timeout,
+ struct sock *sk)
+{
+ struct sk_buff *skb;
+ u8 status;
+
+ skb = __hci_cmd_sync_sk(hdev, opcode, plen, param, event, timeout, sk);
+
+ /* If command return a status event, skb will be set to -ENODATA */
+ if (skb == ERR_PTR(-ENODATA))
+ return 0;
+
+ if (IS_ERR(skb)) {
+ if (!event)
+ bt_dev_err(hdev, "Opcode 0x%4.4x failed: %ld", opcode,
+ PTR_ERR(skb));
+ return PTR_ERR(skb);
+ }
+
+ status = skb->data[0];
+
+ kfree_skb(skb);
+
+ return status;
+}
+EXPORT_SYMBOL(__hci_cmd_sync_status_sk);
+
+int __hci_cmd_sync_status(struct hci_dev *hdev, u16 opcode, u32 plen,
+ const void *param, u32 timeout)
+{
+ return __hci_cmd_sync_status_sk(hdev, opcode, plen, param, 0, timeout,
+ NULL);
+}
+EXPORT_SYMBOL(__hci_cmd_sync_status);
+
+int hci_cmd_sync_status(struct hci_dev *hdev, u16 opcode, u32 plen,
+ const void *param, u32 timeout)
+{
+ int err;
+
+ hci_req_sync_lock(hdev);
+ err = __hci_cmd_sync_status(hdev, opcode, plen, param, timeout);
+ hci_req_sync_unlock(hdev);
+
+ return err;
+}
+EXPORT_SYMBOL(hci_cmd_sync_status);
+
+static void hci_cmd_sync_work(struct work_struct *work)
+{
+ struct hci_dev *hdev = container_of(work, struct hci_dev, cmd_sync_work);
+
+ bt_dev_dbg(hdev, "");
+
+ /* Dequeue all entries and run them */
+ while (1) {
+ struct hci_cmd_sync_work_entry *entry;
+
+ mutex_lock(&hdev->cmd_sync_work_lock);
+ entry = list_first_entry_or_null(&hdev->cmd_sync_work_list,
+ struct hci_cmd_sync_work_entry,
+ list);
+ if (entry)
+ list_del(&entry->list);
+ mutex_unlock(&hdev->cmd_sync_work_lock);
+
+ if (!entry)
+ break;
+
+ bt_dev_dbg(hdev, "entry %p", entry);
+
+ if (entry->func) {
+ int err;
+
+ hci_req_sync_lock(hdev);
+ err = entry->func(hdev, entry->data);
+ if (entry->destroy)
+ entry->destroy(hdev, entry->data, err);
+ hci_req_sync_unlock(hdev);
+ }
+
+ kfree(entry);
+ }
+}
+
+static void hci_cmd_sync_cancel_work(struct work_struct *work)
+{
+ struct hci_dev *hdev = container_of(work, struct hci_dev, cmd_sync_cancel_work);
+
+ cancel_delayed_work_sync(&hdev->cmd_timer);
+ cancel_delayed_work_sync(&hdev->ncmd_timer);
+ atomic_set(&hdev->cmd_cnt, 1);
+
+ wake_up_interruptible(&hdev->req_wait_q);
+}
+
+static int hci_scan_disable_sync(struct hci_dev *hdev);
+static int scan_disable_sync(struct hci_dev *hdev, void *data)
+{
+ return hci_scan_disable_sync(hdev);
+}
+
+static int interleaved_inquiry_sync(struct hci_dev *hdev, void *data)
+{
+ return hci_inquiry_sync(hdev, DISCOV_INTERLEAVED_INQUIRY_LEN, 0);
+}
+
+static void le_scan_disable(struct work_struct *work)
+{
+ struct hci_dev *hdev = container_of(work, struct hci_dev,
+ le_scan_disable.work);
+ int status;
+
+ bt_dev_dbg(hdev, "");
+ hci_dev_lock(hdev);
+
+ if (!hci_dev_test_flag(hdev, HCI_LE_SCAN))
+ goto _return;
+
+ status = hci_cmd_sync_queue(hdev, scan_disable_sync, NULL, NULL);
+ if (status) {
+ bt_dev_err(hdev, "failed to disable LE scan: %d", status);
+ goto _return;
+ }
+
+ /* If we were running LE only scan, change discovery state. If
+ * we were running both LE and BR/EDR inquiry simultaneously,
+ * and BR/EDR inquiry is already finished, stop discovery,
+ * otherwise BR/EDR inquiry will stop discovery when finished.
+ * If we will resolve remote device name, do not change
+ * discovery state.
+ */
+
+ if (hdev->discovery.type == DISCOV_TYPE_LE)
+ goto discov_stopped;
+
+ if (hdev->discovery.type != DISCOV_TYPE_INTERLEAVED)
+ goto _return;
+
+ if (hci_test_quirk(hdev, HCI_QUIRK_SIMULTANEOUS_DISCOVERY)) {
+ if (!test_bit(HCI_INQUIRY, &hdev->flags) &&
+ hdev->discovery.state != DISCOVERY_RESOLVING)
+ goto discov_stopped;
+
+ goto _return;
+ }
+
+ status = hci_cmd_sync_queue(hdev, interleaved_inquiry_sync, NULL, NULL);
+ if (status) {
+ bt_dev_err(hdev, "inquiry failed: status %d", status);
+ goto discov_stopped;
+ }
+
+ goto _return;
+
+discov_stopped:
+ hci_discovery_set_state(hdev, DISCOVERY_STOPPED);
+
+_return:
+ hci_dev_unlock(hdev);
+}
+
+static int hci_le_set_scan_enable_sync(struct hci_dev *hdev, u8 val,
+ u8 filter_dup);
+
+static int reenable_adv_sync(struct hci_dev *hdev, void *data)
+{
+ bt_dev_dbg(hdev, "");
+
+ if (!hci_dev_test_flag(hdev, HCI_ADVERTISING) &&
+ list_empty(&hdev->adv_instances))
+ return 0;
+
+ if (hdev->cur_adv_instance) {
+ return hci_schedule_adv_instance_sync(hdev,
+ hdev->cur_adv_instance,
+ true);
+ } else {
+ if (ext_adv_capable(hdev)) {
+ hci_start_ext_adv_sync(hdev, 0x00);
+ } else {
+ hci_update_adv_data_sync(hdev, 0x00);
+ hci_update_scan_rsp_data_sync(hdev, 0x00);
+ hci_enable_advertising_sync(hdev);
+ }
+ }
+
+ return 0;
+}
+
+static void reenable_adv(struct work_struct *work)
+{
+ struct hci_dev *hdev = container_of(work, struct hci_dev,
+ reenable_adv_work);
+ int status;
+
+ bt_dev_dbg(hdev, "");
+
+ hci_dev_lock(hdev);
+
+ status = hci_cmd_sync_queue(hdev, reenable_adv_sync, NULL, NULL);
+ if (status)
+ bt_dev_err(hdev, "failed to reenable ADV: %d", status);
+
+ hci_dev_unlock(hdev);
+}
+
+static void cancel_adv_timeout(struct hci_dev *hdev)
+{
+ if (hdev->adv_instance_timeout) {
+ hdev->adv_instance_timeout = 0;
+ cancel_delayed_work(&hdev->adv_instance_expire);
+ }
+}
+
+/* For a single instance:
+ * - force == true: The instance will be removed even when its remaining
+ * lifetime is not zero.
+ * - force == false: the instance will be deactivated but kept stored unless
+ * the remaining lifetime is zero.
+ *
+ * For instance == 0x00:
+ * - force == true: All instances will be removed regardless of their timeout
+ * setting.
+ * - force == false: Only instances that have a timeout will be removed.
+ */
+int hci_clear_adv_instance_sync(struct hci_dev *hdev, struct sock *sk,
+ u8 instance, bool force)
+{
+ struct adv_info *adv_instance, *n, *next_instance = NULL;
+ int err;
+ u8 rem_inst;
+
+ /* Cancel any timeout concerning the removed instance(s). */
+ if (!instance || hdev->cur_adv_instance == instance)
+ cancel_adv_timeout(hdev);
+
+ /* Get the next instance to advertise BEFORE we remove
+ * the current one. This can be the same instance again
+ * if there is only one instance.
+ */
+ if (instance && hdev->cur_adv_instance == instance)
+ next_instance = hci_get_next_instance(hdev, instance);
+
+ if (instance == 0x00) {
+ list_for_each_entry_safe(adv_instance, n, &hdev->adv_instances,
+ list) {
+ if (!(force || adv_instance->timeout))
+ continue;
+
+ rem_inst = adv_instance->instance;
+ err = hci_remove_adv_instance(hdev, rem_inst);
+ if (!err)
+ mgmt_advertising_removed(sk, hdev, rem_inst);
+ }
+ } else {
+ adv_instance = hci_find_adv_instance(hdev, instance);
+
+ if (force || (adv_instance && adv_instance->timeout &&
+ !adv_instance->remaining_time)) {
+ /* Don't advertise a removed instance. */
+ if (next_instance &&
+ next_instance->instance == instance)
+ next_instance = NULL;
+
+ err = hci_remove_adv_instance(hdev, instance);
+ if (!err)
+ mgmt_advertising_removed(sk, hdev, instance);
+ }
+ }
+
+ if (!hdev_is_powered(hdev) || hci_dev_test_flag(hdev, HCI_ADVERTISING))
+ return 0;
+
+ if (next_instance && !ext_adv_capable(hdev))
+ return hci_schedule_adv_instance_sync(hdev,
+ next_instance->instance,
+ false);
+
+ return 0;
+}
+
+static int adv_timeout_expire_sync(struct hci_dev *hdev, void *data)
+{
+ u8 instance = *(u8 *)data;
+
+ kfree(data);
+
+ hci_clear_adv_instance_sync(hdev, NULL, instance, false);
+
+ if (list_empty(&hdev->adv_instances))
+ return hci_disable_advertising_sync(hdev);
+
+ return 0;
+}
+
+static void adv_timeout_expire(struct work_struct *work)
+{
+ u8 *inst_ptr;
+ struct hci_dev *hdev = container_of(work, struct hci_dev,
+ adv_instance_expire.work);
+
+ bt_dev_dbg(hdev, "");
+
+ hci_dev_lock(hdev);
+
+ hdev->adv_instance_timeout = 0;
+
+ if (hdev->cur_adv_instance == 0x00)
+ goto unlock;
+
+ inst_ptr = kmalloc(1, GFP_KERNEL);
+ if (!inst_ptr)
+ goto unlock;
+
+ *inst_ptr = hdev->cur_adv_instance;
+ hci_cmd_sync_queue(hdev, adv_timeout_expire_sync, inst_ptr, NULL);
+
+unlock:
+ hci_dev_unlock(hdev);
+}
+
+static bool is_interleave_scanning(struct hci_dev *hdev)
+{
+ return hdev->interleave_scan_state != INTERLEAVE_SCAN_NONE;
+}
+
+static int hci_passive_scan_sync(struct hci_dev *hdev);
+
+static void interleave_scan_work(struct work_struct *work)
+{
+ struct hci_dev *hdev = container_of(work, struct hci_dev,
+ interleave_scan.work);
+ unsigned long timeout;
+
+ if (hdev->interleave_scan_state == INTERLEAVE_SCAN_ALLOWLIST) {
+ timeout = msecs_to_jiffies(hdev->advmon_allowlist_duration);
+ } else if (hdev->interleave_scan_state == INTERLEAVE_SCAN_NO_FILTER) {
+ timeout = msecs_to_jiffies(hdev->advmon_no_filter_duration);
+ } else {
+ bt_dev_err(hdev, "unexpected error");
+ return;
+ }
+
+ hci_passive_scan_sync(hdev);
+
+ hci_dev_lock(hdev);
+
+ switch (hdev->interleave_scan_state) {
+ case INTERLEAVE_SCAN_ALLOWLIST:
+ bt_dev_dbg(hdev, "next state: allowlist");
+ hdev->interleave_scan_state = INTERLEAVE_SCAN_NO_FILTER;
+ break;
+ case INTERLEAVE_SCAN_NO_FILTER:
+ bt_dev_dbg(hdev, "next state: no filter");
+ hdev->interleave_scan_state = INTERLEAVE_SCAN_ALLOWLIST;
+ break;
+ case INTERLEAVE_SCAN_NONE:
+ bt_dev_err(hdev, "unexpected error");
+ }
+
+ hci_dev_unlock(hdev);
+
+ /* Don't continue interleaving if it was canceled */
+ if (is_interleave_scanning(hdev))
+ queue_delayed_work(hdev->req_workqueue,
+ &hdev->interleave_scan, timeout);
+}
+
+void hci_cmd_sync_init(struct hci_dev *hdev)
+{
+ INIT_WORK(&hdev->cmd_sync_work, hci_cmd_sync_work);
+ INIT_LIST_HEAD(&hdev->cmd_sync_work_list);
+ mutex_init(&hdev->cmd_sync_work_lock);
+ mutex_init(&hdev->unregister_lock);
+
+ INIT_WORK(&hdev->cmd_sync_cancel_work, hci_cmd_sync_cancel_work);
+ INIT_WORK(&hdev->reenable_adv_work, reenable_adv);
+ INIT_DELAYED_WORK(&hdev->le_scan_disable, le_scan_disable);
+ INIT_DELAYED_WORK(&hdev->adv_instance_expire, adv_timeout_expire);
+ INIT_DELAYED_WORK(&hdev->interleave_scan, interleave_scan_work);
+}
+
+static void _hci_cmd_sync_cancel_entry(struct hci_dev *hdev,
+ struct hci_cmd_sync_work_entry *entry,
+ int err)
+{
+ if (entry->destroy)
+ entry->destroy(hdev, entry->data, err);
+
+ list_del(&entry->list);
+ kfree(entry);
+}
+
+void hci_cmd_sync_clear(struct hci_dev *hdev)
+{
+ struct hci_cmd_sync_work_entry *entry, *tmp;
+
+ cancel_work_sync(&hdev->cmd_sync_work);
+ cancel_work_sync(&hdev->reenable_adv_work);
+
+ mutex_lock(&hdev->cmd_sync_work_lock);
+ list_for_each_entry_safe(entry, tmp, &hdev->cmd_sync_work_list, list)
+ _hci_cmd_sync_cancel_entry(hdev, entry, -ECANCELED);
+ mutex_unlock(&hdev->cmd_sync_work_lock);
+}
+
+void hci_cmd_sync_cancel(struct hci_dev *hdev, int err)
+{
+ bt_dev_dbg(hdev, "err 0x%2.2x", err);
+
+ if (hdev->req_status == HCI_REQ_PEND) {
+ hdev->req_result = err;
+ hdev->req_status = HCI_REQ_CANCELED;
+
+ queue_work(hdev->workqueue, &hdev->cmd_sync_cancel_work);
+ }
+}
+EXPORT_SYMBOL(hci_cmd_sync_cancel);
+
+/* Cancel ongoing command request synchronously:
+ *
+ * - Set result and mark status to HCI_REQ_CANCELED
+ * - Wakeup command sync thread
+ */
+void hci_cmd_sync_cancel_sync(struct hci_dev *hdev, int err)
+{
+ bt_dev_dbg(hdev, "err 0x%2.2x", err);
+
+ if (hdev->req_status == HCI_REQ_PEND) {
+ /* req_result is __u32 so error must be positive to be properly
+ * propagated.
+ */
+ hdev->req_result = err < 0 ? -err : err;
+ hdev->req_status = HCI_REQ_CANCELED;
+
+ wake_up_interruptible(&hdev->req_wait_q);
+ }
+}
+EXPORT_SYMBOL(hci_cmd_sync_cancel_sync);
+
+/* Submit HCI command to be run in as cmd_sync_work:
+ *
+ * - hdev must _not_ be unregistered
+ */
+int hci_cmd_sync_submit(struct hci_dev *hdev, hci_cmd_sync_work_func_t func,
+ void *data, hci_cmd_sync_work_destroy_t destroy)
+{
+ struct hci_cmd_sync_work_entry *entry;
+ int err = 0;
+
+ mutex_lock(&hdev->unregister_lock);
+ if (hci_dev_test_flag(hdev, HCI_UNREGISTER)) {
+ err = -ENODEV;
+ goto unlock;
+ }
+
+ entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+ if (!entry) {
+ err = -ENOMEM;
+ goto unlock;
+ }
+ entry->func = func;
+ entry->data = data;
+ entry->destroy = destroy;
+
+ mutex_lock(&hdev->cmd_sync_work_lock);
+ list_add_tail(&entry->list, &hdev->cmd_sync_work_list);
+ mutex_unlock(&hdev->cmd_sync_work_lock);
+
+ queue_work(hdev->req_workqueue, &hdev->cmd_sync_work);
+
+unlock:
+ mutex_unlock(&hdev->unregister_lock);
+ return err;
+}
+EXPORT_SYMBOL(hci_cmd_sync_submit);
+
+/* Queue HCI command:
+ *
+ * - hdev must be running
+ */
+int hci_cmd_sync_queue(struct hci_dev *hdev, hci_cmd_sync_work_func_t func,
+ void *data, hci_cmd_sync_work_destroy_t destroy)
+{
+ /* Only queue command if hdev is running which means it had been opened
+ * and is either on init phase or is already up.
+ */
+ if (!test_bit(HCI_RUNNING, &hdev->flags))
+ return -ENETDOWN;
+
+ return hci_cmd_sync_submit(hdev, func, data, destroy);
+}
+EXPORT_SYMBOL(hci_cmd_sync_queue);
+
+static struct hci_cmd_sync_work_entry *
+_hci_cmd_sync_lookup_entry(struct hci_dev *hdev, hci_cmd_sync_work_func_t func,
+ void *data, hci_cmd_sync_work_destroy_t destroy)
+{
+ struct hci_cmd_sync_work_entry *entry, *tmp;
+
+ list_for_each_entry_safe(entry, tmp, &hdev->cmd_sync_work_list, list) {
+ if (func && entry->func != func)
+ continue;
+
+ if (data && entry->data != data)
+ continue;
+
+ if (destroy && entry->destroy != destroy)
+ continue;
+
+ return entry;
+ }
+
+ return NULL;
+}
+
+/* Queue HCI command entry once:
+ *
+ * - Lookup if an entry already exist and only if it doesn't creates a new entry
+ * and queue it.
+ */
+int hci_cmd_sync_queue_once(struct hci_dev *hdev, hci_cmd_sync_work_func_t func,
+ void *data, hci_cmd_sync_work_destroy_t destroy)
+{
+ if (hci_cmd_sync_lookup_entry(hdev, func, data, destroy))
+ return 0;
+
+ return hci_cmd_sync_queue(hdev, func, data, destroy);
+}
+EXPORT_SYMBOL(hci_cmd_sync_queue_once);
+
+/* Run HCI command:
+ *
+ * - hdev must be running
+ * - if on cmd_sync_work then run immediately otherwise queue
+ */
+int hci_cmd_sync_run(struct hci_dev *hdev, hci_cmd_sync_work_func_t func,
+ void *data, hci_cmd_sync_work_destroy_t destroy)
+{
+ /* Only queue command if hdev is running which means it had been opened
+ * and is either on init phase or is already up.
+ */
+ if (!test_bit(HCI_RUNNING, &hdev->flags))
+ return -ENETDOWN;
+
+ /* If on cmd_sync_work then run immediately otherwise queue */
+ if (current_work() == &hdev->cmd_sync_work)
+ return func(hdev, data);
+
+ return hci_cmd_sync_submit(hdev, func, data, destroy);
+}
+EXPORT_SYMBOL(hci_cmd_sync_run);
+
+/* Run HCI command entry once:
+ *
+ * - Lookup if an entry already exist and only if it doesn't creates a new entry
+ * and run it.
+ * - if on cmd_sync_work then run immediately otherwise queue
+ */
+int hci_cmd_sync_run_once(struct hci_dev *hdev, hci_cmd_sync_work_func_t func,
+ void *data, hci_cmd_sync_work_destroy_t destroy)
+{
+ if (hci_cmd_sync_lookup_entry(hdev, func, data, destroy))
+ return 0;
+
+ return hci_cmd_sync_run(hdev, func, data, destroy);
+}
+EXPORT_SYMBOL(hci_cmd_sync_run_once);
+
+/* Lookup HCI command entry:
+ *
+ * - Return first entry that matches by function callback or data or
+ * destroy callback.
+ */
+struct hci_cmd_sync_work_entry *
+hci_cmd_sync_lookup_entry(struct hci_dev *hdev, hci_cmd_sync_work_func_t func,
+ void *data, hci_cmd_sync_work_destroy_t destroy)
+{
+ struct hci_cmd_sync_work_entry *entry;
+
+ mutex_lock(&hdev->cmd_sync_work_lock);
+ entry = _hci_cmd_sync_lookup_entry(hdev, func, data, destroy);
+ mutex_unlock(&hdev->cmd_sync_work_lock);
+
+ return entry;
+}
+EXPORT_SYMBOL(hci_cmd_sync_lookup_entry);
+
+/* Cancel HCI command entry */
+void hci_cmd_sync_cancel_entry(struct hci_dev *hdev,
+ struct hci_cmd_sync_work_entry *entry)
+{
+ mutex_lock(&hdev->cmd_sync_work_lock);
+ _hci_cmd_sync_cancel_entry(hdev, entry, -ECANCELED);
+ mutex_unlock(&hdev->cmd_sync_work_lock);
+}
+EXPORT_SYMBOL(hci_cmd_sync_cancel_entry);
+
+/* Dequeue one HCI command entry:
+ *
+ * - Lookup and cancel first entry that matches.
+ */
+bool hci_cmd_sync_dequeue_once(struct hci_dev *hdev,
+ hci_cmd_sync_work_func_t func,
+ void *data, hci_cmd_sync_work_destroy_t destroy)
+{
+ struct hci_cmd_sync_work_entry *entry;
+
+ mutex_lock(&hdev->cmd_sync_work_lock);
+
+ entry = _hci_cmd_sync_lookup_entry(hdev, func, data, destroy);
+ if (!entry) {
+ mutex_unlock(&hdev->cmd_sync_work_lock);
+ return false;
+ }
+
+ _hci_cmd_sync_cancel_entry(hdev, entry, -ECANCELED);
+
+ mutex_unlock(&hdev->cmd_sync_work_lock);
+
+ return true;
+}
+EXPORT_SYMBOL(hci_cmd_sync_dequeue_once);
+
+/* Dequeue HCI command entry:
+ *
+ * - Lookup and cancel any entry that matches by function callback or data or
+ * destroy callback.
+ */
+bool hci_cmd_sync_dequeue(struct hci_dev *hdev, hci_cmd_sync_work_func_t func,
+ void *data, hci_cmd_sync_work_destroy_t destroy)
+{
+ struct hci_cmd_sync_work_entry *entry;
+ bool ret = false;
+
+ mutex_lock(&hdev->cmd_sync_work_lock);
+ while ((entry = _hci_cmd_sync_lookup_entry(hdev, func, data,
+ destroy))) {
+ _hci_cmd_sync_cancel_entry(hdev, entry, -ECANCELED);
+ ret = true;
+ }
+ mutex_unlock(&hdev->cmd_sync_work_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL(hci_cmd_sync_dequeue);
+
+int hci_update_eir_sync(struct hci_dev *hdev)
+{
+ struct hci_cp_write_eir cp;
+
+ bt_dev_dbg(hdev, "");
+
+ if (!hdev_is_powered(hdev))
+ return 0;
+
+ if (!lmp_ext_inq_capable(hdev))
+ return 0;
+
+ if (!hci_dev_test_flag(hdev, HCI_SSP_ENABLED))
+ return 0;
+
+ if (hci_dev_test_flag(hdev, HCI_SERVICE_CACHE))
+ return 0;
+
+ memset(&cp, 0, sizeof(cp));
+
+ eir_create(hdev, cp.data);
+
+ if (memcmp(cp.data, hdev->eir, sizeof(cp.data)) == 0)
+ return 0;
+
+ memcpy(hdev->eir, cp.data, sizeof(cp.data));
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_EIR, sizeof(cp), &cp,
+ HCI_CMD_TIMEOUT);
+}
+
+static u8 get_service_classes(struct hci_dev *hdev)
+{
+ struct bt_uuid *uuid;
+ u8 val = 0;
+
+ list_for_each_entry(uuid, &hdev->uuids, list)
+ val |= uuid->svc_hint;
+
+ return val;
+}
+
+int hci_update_class_sync(struct hci_dev *hdev)
+{
+ u8 cod[3];
+
+ bt_dev_dbg(hdev, "");
+
+ if (!hdev_is_powered(hdev))
+ return 0;
+
+ if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED))
+ return 0;
+
+ if (hci_dev_test_flag(hdev, HCI_SERVICE_CACHE))
+ return 0;
+
+ cod[0] = hdev->minor_class;
+ cod[1] = hdev->major_class;
+ cod[2] = get_service_classes(hdev);
+
+ if (hci_dev_test_flag(hdev, HCI_LIMITED_DISCOVERABLE))
+ cod[1] |= 0x20;
+
+ if (memcmp(cod, hdev->dev_class, 3) == 0)
+ return 0;
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_CLASS_OF_DEV,
+ sizeof(cod), cod, HCI_CMD_TIMEOUT);
+}
+
+static bool is_advertising_allowed(struct hci_dev *hdev, bool connectable)
+{
+ /* If there is no connection we are OK to advertise. */
+ if (hci_conn_num(hdev, LE_LINK) == 0)
+ return true;
+
+ /* Check le_states if there is any connection in peripheral role. */
+ if (hdev->conn_hash.le_num_peripheral > 0) {
+ /* Peripheral connection state and non connectable mode
+ * bit 20.
+ */
+ if (!connectable && !(hdev->le_states[2] & 0x10))
+ return false;
+
+ /* Peripheral connection state and connectable mode bit 38
+ * and scannable bit 21.
+ */
+ if (connectable && (!(hdev->le_states[4] & 0x40) ||
+ !(hdev->le_states[2] & 0x20)))
+ return false;
+ }
+
+ /* Check le_states if there is any connection in central role. */
+ if (hci_conn_num(hdev, LE_LINK) != hdev->conn_hash.le_num_peripheral) {
+ /* Central connection state and non connectable mode bit 18. */
+ if (!connectable && !(hdev->le_states[2] & 0x02))
+ return false;
+
+ /* Central connection state and connectable mode bit 35 and
+ * scannable 19.
+ */
+ if (connectable && (!(hdev->le_states[4] & 0x08) ||
+ !(hdev->le_states[2] & 0x08)))
+ return false;
+ }
+
+ return true;
+}
+
+static bool adv_use_rpa(struct hci_dev *hdev, uint32_t flags)
+{
+ /* If privacy is not enabled don't use RPA */
+ if (!hci_dev_test_flag(hdev, HCI_PRIVACY))
+ return false;
+
+ /* If basic privacy mode is enabled use RPA */
+ if (!hci_dev_test_flag(hdev, HCI_LIMITED_PRIVACY))
+ return true;
+
+ /* If limited privacy mode is enabled don't use RPA if we're
+ * both discoverable and bondable.
+ */
+ if ((flags & MGMT_ADV_FLAG_DISCOV) &&
+ hci_dev_test_flag(hdev, HCI_BONDABLE))
+ return false;
+
+ /* We're neither bondable nor discoverable in the limited
+ * privacy mode, therefore use RPA.
+ */
+ return true;
+}
+
+static int hci_set_random_addr_sync(struct hci_dev *hdev, bdaddr_t *rpa)
+{
+ /* If a random_addr has been set we're advertising or initiating an LE
+ * connection we can't go ahead and change the random address at this
+ * time. This is because the eventual initiator address used for the
+ * subsequently created connection will be undefined (some
+ * controllers use the new address and others the one we had
+ * when the operation started).
+ *
+ * In this kind of scenario skip the update and let the random
+ * address be updated at the next cycle.
+ */
+ if (bacmp(&hdev->random_addr, BDADDR_ANY) &&
+ (hci_dev_test_flag(hdev, HCI_LE_ADV) ||
+ hci_lookup_le_connect(hdev))) {
+ bt_dev_dbg(hdev, "Deferring random address update");
+ hci_dev_set_flag(hdev, HCI_RPA_EXPIRED);
+ return 0;
+ }
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_RANDOM_ADDR,
+ 6, rpa, HCI_CMD_TIMEOUT);
+}
+
+int hci_update_random_address_sync(struct hci_dev *hdev, bool require_privacy,
+ bool rpa, u8 *own_addr_type)
+{
+ int err;
+
+ /* If privacy is enabled use a resolvable private address. If
+ * current RPA has expired or there is something else than
+ * the current RPA in use, then generate a new one.
+ */
+ if (rpa) {
+ /* If Controller supports LL Privacy use own address type is
+ * 0x03
+ */
+ if (ll_privacy_capable(hdev))
+ *own_addr_type = ADDR_LE_DEV_RANDOM_RESOLVED;
+ else
+ *own_addr_type = ADDR_LE_DEV_RANDOM;
+
+ /* Check if RPA is valid */
+ if (rpa_valid(hdev))
+ return 0;
+
+ err = smp_generate_rpa(hdev, hdev->irk, &hdev->rpa);
+ if (err < 0) {
+ bt_dev_err(hdev, "failed to generate new RPA");
+ return err;
+ }
+
+ err = hci_set_random_addr_sync(hdev, &hdev->rpa);
+ if (err)
+ return err;
+
+ return 0;
+ }
+
+ /* In case of required privacy without resolvable private address,
+ * use an non-resolvable private address. This is useful for active
+ * scanning and non-connectable advertising.
+ */
+ if (require_privacy) {
+ bdaddr_t nrpa;
+
+ while (true) {
+ /* The non-resolvable private address is generated
+ * from random six bytes with the two most significant
+ * bits cleared.
+ */
+ get_random_bytes(&nrpa, 6);
+ nrpa.b[5] &= 0x3f;
+
+ /* The non-resolvable private address shall not be
+ * equal to the public address.
+ */
+ if (bacmp(&hdev->bdaddr, &nrpa))
+ break;
+ }
+
+ *own_addr_type = ADDR_LE_DEV_RANDOM;
+
+ return hci_set_random_addr_sync(hdev, &nrpa);
+ }
+
+ /* If forcing static address is in use or there is no public
+ * address use the static address as random address (but skip
+ * the HCI command if the current random address is already the
+ * static one.
+ *
+ * In case BR/EDR has been disabled on a dual-mode controller
+ * and a static address has been configured, then use that
+ * address instead of the public BR/EDR address.
+ */
+ if (hci_dev_test_flag(hdev, HCI_FORCE_STATIC_ADDR) ||
+ !bacmp(&hdev->bdaddr, BDADDR_ANY) ||
+ (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED) &&
+ bacmp(&hdev->static_addr, BDADDR_ANY))) {
+ *own_addr_type = ADDR_LE_DEV_RANDOM;
+ if (bacmp(&hdev->static_addr, &hdev->random_addr))
+ return hci_set_random_addr_sync(hdev,
+ &hdev->static_addr);
+ return 0;
+ }
+
+ /* Neither privacy nor static address is being used so use a
+ * public address.
+ */
+ *own_addr_type = ADDR_LE_DEV_PUBLIC;
+
+ return 0;
+}
+
+static int hci_disable_ext_adv_instance_sync(struct hci_dev *hdev, u8 instance)
+{
+ struct hci_cp_le_set_ext_adv_enable *cp;
+ struct hci_cp_ext_adv_set *set;
+ u8 data[sizeof(*cp) + sizeof(*set) * 1];
+ u8 size;
+ struct adv_info *adv = NULL;
+
+ /* If request specifies an instance that doesn't exist, fail */
+ if (instance > 0) {
+ adv = hci_find_adv_instance(hdev, instance);
+ if (!adv)
+ return -EINVAL;
+
+ /* If not enabled there is nothing to do */
+ if (!adv->enabled)
+ return 0;
+ }
+
+ memset(data, 0, sizeof(data));
+
+ cp = (void *)data;
+ set = (void *)cp->data;
+
+ /* Instance 0x00 indicates all advertising instances will be disabled */
+ cp->num_of_sets = !!instance;
+ cp->enable = 0x00;
+
+ set->handle = adv ? adv->handle : instance;
+
+ size = sizeof(*cp) + sizeof(*set) * cp->num_of_sets;
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_EXT_ADV_ENABLE,
+ size, data, HCI_CMD_TIMEOUT);
+}
+
+static int hci_set_adv_set_random_addr_sync(struct hci_dev *hdev, u8 instance,
+ bdaddr_t *random_addr)
+{
+ struct hci_cp_le_set_adv_set_rand_addr cp;
+ int err;
+
+ if (!instance) {
+ /* Instance 0x00 doesn't have an adv_info, instead it uses
+ * hdev->random_addr to track its address so whenever it needs
+ * to be updated this also set the random address since
+ * hdev->random_addr is shared with scan state machine.
+ */
+ err = hci_set_random_addr_sync(hdev, random_addr);
+ if (err)
+ return err;
+ }
+
+ memset(&cp, 0, sizeof(cp));
+
+ cp.handle = instance;
+ bacpy(&cp.bdaddr, random_addr);
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_ADV_SET_RAND_ADDR,
+ sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+}
+
+static int
+hci_set_ext_adv_params_sync(struct hci_dev *hdev, struct adv_info *adv,
+ const struct hci_cp_le_set_ext_adv_params *cp,
+ struct hci_rp_le_set_ext_adv_params *rp)
+{
+ struct sk_buff *skb;
+
+ skb = __hci_cmd_sync(hdev, HCI_OP_LE_SET_EXT_ADV_PARAMS, sizeof(*cp),
+ cp, HCI_CMD_TIMEOUT);
+
+ /* If command return a status event, skb will be set to -ENODATA */
+ if (skb == ERR_PTR(-ENODATA))
+ return 0;
+
+ if (IS_ERR(skb)) {
+ bt_dev_err(hdev, "Opcode 0x%4.4x failed: %ld",
+ HCI_OP_LE_SET_EXT_ADV_PARAMS, PTR_ERR(skb));
+ return PTR_ERR(skb);
+ }
+
+ if (skb->len != sizeof(*rp)) {
+ bt_dev_err(hdev, "Invalid response length for 0x%4.4x: %u",
+ HCI_OP_LE_SET_EXT_ADV_PARAMS, skb->len);
+ kfree_skb(skb);
+ return -EIO;
+ }
+
+ memcpy(rp, skb->data, sizeof(*rp));
+ kfree_skb(skb);
+
+ if (!rp->status) {
+ hdev->adv_addr_type = cp->own_addr_type;
+ if (!cp->handle) {
+ /* Store in hdev for instance 0 */
+ hdev->adv_tx_power = rp->tx_power;
+ } else if (adv) {
+ adv->tx_power = rp->tx_power;
+ }
+ }
+
+ return rp->status;
+}
+
+static int hci_set_ext_adv_data_sync(struct hci_dev *hdev, u8 instance)
+{
+ DEFINE_FLEX(struct hci_cp_le_set_ext_adv_data, pdu, data, length,
+ HCI_MAX_EXT_AD_LENGTH);
+ u8 len;
+ struct adv_info *adv = NULL;
+ int err;
+
+ if (instance) {
+ adv = hci_find_adv_instance(hdev, instance);
+ if (!adv || !adv->adv_data_changed)
+ return 0;
+ }
+
+ len = eir_create_adv_data(hdev, instance, pdu->data,
+ HCI_MAX_EXT_AD_LENGTH);
+
+ pdu->length = len;
+ pdu->handle = adv ? adv->handle : instance;
+ pdu->operation = LE_SET_ADV_DATA_OP_COMPLETE;
+ pdu->frag_pref = LE_SET_ADV_DATA_NO_FRAG;
+
+ err = __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_EXT_ADV_DATA,
+ struct_size(pdu, data, len), pdu,
+ HCI_CMD_TIMEOUT);
+ if (err)
+ return err;
+
+ /* Update data if the command succeed */
+ if (adv) {
+ adv->adv_data_changed = false;
+ } else {
+ memcpy(hdev->adv_data, pdu->data, len);
+ hdev->adv_data_len = len;
+ }
+
+ return 0;
+}
+
+static int hci_set_adv_data_sync(struct hci_dev *hdev, u8 instance)
+{
+ struct hci_cp_le_set_adv_data cp;
+ u8 len;
+
+ memset(&cp, 0, sizeof(cp));
+
+ len = eir_create_adv_data(hdev, instance, cp.data, sizeof(cp.data));
+
+ /* There's nothing to do if the data hasn't changed */
+ if (hdev->adv_data_len == len &&
+ memcmp(cp.data, hdev->adv_data, len) == 0)
+ return 0;
+
+ memcpy(hdev->adv_data, cp.data, sizeof(cp.data));
+ hdev->adv_data_len = len;
+
+ cp.length = len;
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_ADV_DATA,
+ sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+}
+
+int hci_update_adv_data_sync(struct hci_dev *hdev, u8 instance)
+{
+ if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED))
+ return 0;
+
+ if (ext_adv_capable(hdev))
+ return hci_set_ext_adv_data_sync(hdev, instance);
+
+ return hci_set_adv_data_sync(hdev, instance);
+}
+
+int hci_setup_ext_adv_instance_sync(struct hci_dev *hdev, u8 instance)
+{
+ struct hci_cp_le_set_ext_adv_params cp;
+ struct hci_rp_le_set_ext_adv_params rp;
+ bool connectable, require_privacy;
+ u32 flags;
+ bdaddr_t random_addr;
+ u8 own_addr_type;
+ int err;
+ struct adv_info *adv;
+ bool secondary_adv;
+
+ if (instance > 0) {
+ adv = hci_find_adv_instance(hdev, instance);
+ if (!adv)
+ return -EINVAL;
+ } else {
+ adv = NULL;
+ }
+
+ /* Updating parameters of an active instance will return a
+ * Command Disallowed error, so we must first disable the
+ * instance if it is active.
+ */
+ if (adv) {
+ err = hci_disable_ext_adv_instance_sync(hdev, instance);
+ if (err)
+ return err;
+ }
+
+ flags = hci_adv_instance_flags(hdev, instance);
+
+ /* If the "connectable" instance flag was not set, then choose between
+ * ADV_IND and ADV_NONCONN_IND based on the global connectable setting.
+ */
+ connectable = (flags & MGMT_ADV_FLAG_CONNECTABLE) ||
+ mgmt_get_connectable(hdev);
+
+ if (!is_advertising_allowed(hdev, connectable))
+ return -EPERM;
+
+ /* Set require_privacy to true only when non-connectable
+ * advertising is used and it is not periodic.
+ * In that case it is fine to use a non-resolvable private address.
+ */
+ require_privacy = !connectable && !(adv && adv->periodic);
+
+ err = hci_get_random_address(hdev, require_privacy,
+ adv_use_rpa(hdev, flags), adv,
+ &own_addr_type, &random_addr);
+ if (err < 0)
+ return err;
+
+ memset(&cp, 0, sizeof(cp));
+
+ if (adv) {
+ hci_cpu_to_le24(adv->min_interval, cp.min_interval);
+ hci_cpu_to_le24(adv->max_interval, cp.max_interval);
+ cp.tx_power = adv->tx_power;
+ cp.sid = adv->sid;
+ } else {
+ hci_cpu_to_le24(hdev->le_adv_min_interval, cp.min_interval);
+ hci_cpu_to_le24(hdev->le_adv_max_interval, cp.max_interval);
+ cp.tx_power = HCI_ADV_TX_POWER_NO_PREFERENCE;
+ cp.sid = 0x00;
+ }
+
+ secondary_adv = (flags & MGMT_ADV_FLAG_SEC_MASK);
+
+ if (connectable) {
+ if (secondary_adv)
+ cp.evt_properties = cpu_to_le16(LE_EXT_ADV_CONN_IND);
+ else
+ cp.evt_properties = cpu_to_le16(LE_LEGACY_ADV_IND);
+ } else if (hci_adv_instance_is_scannable(hdev, instance) ||
+ (flags & MGMT_ADV_PARAM_SCAN_RSP)) {
+ if (secondary_adv)
+ cp.evt_properties = cpu_to_le16(LE_EXT_ADV_SCAN_IND);
+ else
+ cp.evt_properties = cpu_to_le16(LE_LEGACY_ADV_SCAN_IND);
+ } else {
+ if (secondary_adv)
+ cp.evt_properties = cpu_to_le16(LE_EXT_ADV_NON_CONN_IND);
+ else
+ cp.evt_properties = cpu_to_le16(LE_LEGACY_NONCONN_IND);
+ }
+
+ /* If Own_Address_Type equals 0x02 or 0x03, the Peer_Address parameter
+ * contains the peer’s Identity Address and the Peer_Address_Type
+ * parameter contains the peer’s Identity Type (i.e., 0x00 or 0x01).
+ * These parameters are used to locate the corresponding local IRK in
+ * the resolving list; this IRK is used to generate their own address
+ * used in the advertisement.
+ */
+ if (own_addr_type == ADDR_LE_DEV_RANDOM_RESOLVED)
+ hci_copy_identity_address(hdev, &cp.peer_addr,
+ &cp.peer_addr_type);
+
+ cp.own_addr_type = own_addr_type;
+ cp.channel_map = hdev->le_adv_channel_map;
+ cp.handle = adv ? adv->handle : instance;
+
+ if (flags & MGMT_ADV_FLAG_SEC_2M) {
+ cp.primary_phy = HCI_ADV_PHY_1M;
+ cp.secondary_phy = HCI_ADV_PHY_2M;
+ } else if (flags & MGMT_ADV_FLAG_SEC_CODED) {
+ cp.primary_phy = HCI_ADV_PHY_CODED;
+ cp.secondary_phy = HCI_ADV_PHY_CODED;
+ } else {
+ /* In all other cases use 1M */
+ cp.primary_phy = HCI_ADV_PHY_1M;
+ cp.secondary_phy = HCI_ADV_PHY_1M;
+ }
+
+ err = hci_set_ext_adv_params_sync(hdev, adv, &cp, &rp);
+ if (err)
+ return err;
+
+ /* Update adv data as tx power is known now */
+ err = hci_set_ext_adv_data_sync(hdev, cp.handle);
+ if (err)
+ return err;
+
+ if ((own_addr_type == ADDR_LE_DEV_RANDOM ||
+ own_addr_type == ADDR_LE_DEV_RANDOM_RESOLVED) &&
+ bacmp(&random_addr, BDADDR_ANY)) {
+ /* Check if random address need to be updated */
+ if (adv) {
+ if (!bacmp(&random_addr, &adv->random_addr))
+ return 0;
+ } else {
+ if (!bacmp(&random_addr, &hdev->random_addr))
+ return 0;
+ }
+
+ return hci_set_adv_set_random_addr_sync(hdev, instance,
+ &random_addr);
+ }
+
+ return 0;
+}
+
+static int hci_set_ext_scan_rsp_data_sync(struct hci_dev *hdev, u8 instance)
+{
+ DEFINE_FLEX(struct hci_cp_le_set_ext_scan_rsp_data, pdu, data, length,
+ HCI_MAX_EXT_AD_LENGTH);
+ u8 len;
+ struct adv_info *adv = NULL;
+ int err;
+
+ if (instance) {
+ adv = hci_find_adv_instance(hdev, instance);
+ if (!adv || !adv->scan_rsp_changed)
+ return 0;
+ }
+
+ len = eir_create_scan_rsp(hdev, instance, pdu->data);
+
+ pdu->handle = adv ? adv->handle : instance;
+ pdu->length = len;
+ pdu->operation = LE_SET_ADV_DATA_OP_COMPLETE;
+ pdu->frag_pref = LE_SET_ADV_DATA_NO_FRAG;
+
+ err = __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_EXT_SCAN_RSP_DATA,
+ struct_size(pdu, data, len), pdu,
+ HCI_CMD_TIMEOUT);
+ if (err)
+ return err;
+
+ if (adv) {
+ adv->scan_rsp_changed = false;
+ } else {
+ memcpy(hdev->scan_rsp_data, pdu->data, len);
+ hdev->scan_rsp_data_len = len;
+ }
+
+ return 0;
+}
+
+static int __hci_set_scan_rsp_data_sync(struct hci_dev *hdev, u8 instance)
+{
+ struct hci_cp_le_set_scan_rsp_data cp;
+ u8 len;
+
+ memset(&cp, 0, sizeof(cp));
+
+ len = eir_create_scan_rsp(hdev, instance, cp.data);
+
+ if (hdev->scan_rsp_data_len == len &&
+ !memcmp(cp.data, hdev->scan_rsp_data, len))
+ return 0;
+
+ memcpy(hdev->scan_rsp_data, cp.data, sizeof(cp.data));
+ hdev->scan_rsp_data_len = len;
+
+ cp.length = len;
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_SCAN_RSP_DATA,
+ sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+}
+
+int hci_update_scan_rsp_data_sync(struct hci_dev *hdev, u8 instance)
+{
+ if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED))
+ return 0;
+
+ if (ext_adv_capable(hdev))
+ return hci_set_ext_scan_rsp_data_sync(hdev, instance);
+
+ return __hci_set_scan_rsp_data_sync(hdev, instance);
+}
+
+int hci_enable_ext_advertising_sync(struct hci_dev *hdev, u8 instance)
+{
+ struct hci_cp_le_set_ext_adv_enable *cp;
+ struct hci_cp_ext_adv_set *set;
+ u8 data[sizeof(*cp) + sizeof(*set) * 1];
+ struct adv_info *adv;
+
+ if (instance > 0) {
+ adv = hci_find_adv_instance(hdev, instance);
+ if (!adv)
+ return -EINVAL;
+ /* If already enabled there is nothing to do */
+ if (adv->enabled)
+ return 0;
+ } else {
+ adv = NULL;
+ }
+
+ cp = (void *)data;
+ set = (void *)cp->data;
+
+ memset(cp, 0, sizeof(*cp));
+
+ cp->enable = 0x01;
+ cp->num_of_sets = 0x01;
+
+ memset(set, 0, sizeof(*set));
+
+ set->handle = adv ? adv->handle : instance;
+
+ /* Set duration per instance since controller is responsible for
+ * scheduling it.
+ */
+ if (adv && adv->timeout) {
+ u16 duration = adv->timeout * MSEC_PER_SEC;
+
+ /* Time = N * 10 ms */
+ set->duration = cpu_to_le16(duration / 10);
+ }
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_EXT_ADV_ENABLE,
+ sizeof(*cp) +
+ sizeof(*set) * cp->num_of_sets,
+ data, HCI_CMD_TIMEOUT);
+}
+
+int hci_start_ext_adv_sync(struct hci_dev *hdev, u8 instance)
+{
+ int err;
+
+ err = hci_setup_ext_adv_instance_sync(hdev, instance);
+ if (err)
+ return err;
+
+ err = hci_set_ext_scan_rsp_data_sync(hdev, instance);
+ if (err)
+ return err;
+
+ return hci_enable_ext_advertising_sync(hdev, instance);
+}
+
+int hci_disable_per_advertising_sync(struct hci_dev *hdev, u8 instance)
+{
+ struct hci_cp_le_set_per_adv_enable cp;
+ struct adv_info *adv = NULL;
+
+ /* If periodic advertising already disabled there is nothing to do. */
+ adv = hci_find_adv_instance(hdev, instance);
+ if (!adv || !adv->periodic_enabled)
+ return 0;
+
+ memset(&cp, 0, sizeof(cp));
+
+ cp.enable = 0x00;
+ cp.handle = instance;
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_PER_ADV_ENABLE,
+ sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+}
+
+static int hci_set_per_adv_params_sync(struct hci_dev *hdev, u8 instance,
+ u16 min_interval, u16 max_interval)
+{
+ struct hci_cp_le_set_per_adv_params cp;
+
+ memset(&cp, 0, sizeof(cp));
+
+ if (!min_interval)
+ min_interval = DISCOV_LE_PER_ADV_INT_MIN;
+
+ if (!max_interval)
+ max_interval = DISCOV_LE_PER_ADV_INT_MAX;
+
+ cp.handle = instance;
+ cp.min_interval = cpu_to_le16(min_interval);
+ cp.max_interval = cpu_to_le16(max_interval);
+ cp.periodic_properties = 0x0000;
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_PER_ADV_PARAMS,
+ sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+}
+
+static int hci_set_per_adv_data_sync(struct hci_dev *hdev, u8 instance)
+{
+ DEFINE_FLEX(struct hci_cp_le_set_per_adv_data, pdu, data, length,
+ HCI_MAX_PER_AD_LENGTH);
+ u8 len;
+ struct adv_info *adv = NULL;
+
+ if (instance) {
+ adv = hci_find_adv_instance(hdev, instance);
+ if (!adv || !adv->periodic)
+ return 0;
+ }
+
+ len = eir_create_per_adv_data(hdev, instance, pdu->data);
+
+ pdu->length = len;
+ pdu->handle = adv ? adv->handle : instance;
+ pdu->operation = LE_SET_ADV_DATA_OP_COMPLETE;
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_PER_ADV_DATA,
+ struct_size(pdu, data, len), pdu,
+ HCI_CMD_TIMEOUT);
+}
+
+static int hci_enable_per_advertising_sync(struct hci_dev *hdev, u8 instance)
+{
+ struct hci_cp_le_set_per_adv_enable cp;
+ struct adv_info *adv = NULL;
+
+ /* If periodic advertising already enabled there is nothing to do. */
+ adv = hci_find_adv_instance(hdev, instance);
+ if (adv && adv->periodic_enabled)
+ return 0;
+
+ memset(&cp, 0, sizeof(cp));
+
+ cp.enable = 0x01;
+ cp.handle = instance;
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_PER_ADV_ENABLE,
+ sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+}
+
+/* Checks if periodic advertising data contains a Basic Announcement and if it
+ * does generates a Broadcast ID and add Broadcast Announcement.
+ */
+static int hci_adv_bcast_annoucement(struct hci_dev *hdev, struct adv_info *adv)
+{
+ u8 bid[3];
+ u8 ad[HCI_MAX_EXT_AD_LENGTH];
+ u8 len;
+
+ /* Skip if NULL adv as instance 0x00 is used for general purpose
+ * advertising so it cannot used for the likes of Broadcast Announcement
+ * as it can be overwritten at any point.
+ */
+ if (!adv)
+ return 0;
+
+ /* Check if PA data doesn't contains a Basic Audio Announcement then
+ * there is nothing to do.
+ */
+ if (!eir_get_service_data(adv->per_adv_data, adv->per_adv_data_len,
+ 0x1851, NULL))
+ return 0;
+
+ /* Check if advertising data already has a Broadcast Announcement since
+ * the process may want to control the Broadcast ID directly and in that
+ * case the kernel shall no interfere.
+ */
+ if (eir_get_service_data(adv->adv_data, adv->adv_data_len, 0x1852,
+ NULL))
+ return 0;
+
+ /* Generate Broadcast ID */
+ get_random_bytes(bid, sizeof(bid));
+ len = eir_append_service_data(ad, 0, 0x1852, bid, sizeof(bid));
+ memcpy(ad + len, adv->adv_data, adv->adv_data_len);
+ hci_set_adv_instance_data(hdev, adv->instance, len + adv->adv_data_len,
+ ad, 0, NULL);
+
+ return hci_update_adv_data_sync(hdev, adv->instance);
+}
+
+int hci_start_per_adv_sync(struct hci_dev *hdev, u8 instance, u8 sid,
+ u8 data_len, u8 *data, u32 flags, u16 min_interval,
+ u16 max_interval, u16 sync_interval)
+{
+ struct adv_info *adv = NULL;
+ int err;
+ bool added = false;
+
+ hci_disable_per_advertising_sync(hdev, instance);
+
+ if (instance) {
+ adv = hci_find_adv_instance(hdev, instance);
+ if (adv) {
+ if (sid != HCI_SID_INVALID && adv->sid != sid) {
+ /* If the SID don't match attempt to find by
+ * SID.
+ */
+ adv = hci_find_adv_sid(hdev, sid);
+ if (!adv) {
+ bt_dev_err(hdev,
+ "Unable to find adv_info");
+ return -EINVAL;
+ }
+ }
+
+ /* Turn it into periodic advertising */
+ adv->periodic = true;
+ adv->per_adv_data_len = data_len;
+ if (data)
+ memcpy(adv->per_adv_data, data, data_len);
+ adv->flags = flags;
+ } else if (!adv) {
+ /* Create an instance if that could not be found */
+ adv = hci_add_per_instance(hdev, instance, sid, flags,
+ data_len, data,
+ sync_interval,
+ sync_interval);
+ if (IS_ERR(adv))
+ return PTR_ERR(adv);
+ adv->pending = false;
+ added = true;
+ }
+ }
+
+ /* Start advertising */
+ err = hci_start_ext_adv_sync(hdev, instance);
+ if (err < 0)
+ goto fail;
+
+ err = hci_adv_bcast_annoucement(hdev, adv);
+ if (err < 0)
+ goto fail;
+
+ err = hci_set_per_adv_params_sync(hdev, instance, min_interval,
+ max_interval);
+ if (err < 0)
+ goto fail;
+
+ err = hci_set_per_adv_data_sync(hdev, instance);
+ if (err < 0)
+ goto fail;
+
+ err = hci_enable_per_advertising_sync(hdev, instance);
+ if (err < 0)
+ goto fail;
+
+ return 0;
+
+fail:
+ if (added)
+ hci_remove_adv_instance(hdev, instance);
+
+ return err;
+}
+
+static int hci_start_adv_sync(struct hci_dev *hdev, u8 instance)
+{
+ int err;
+
+ if (ext_adv_capable(hdev))
+ return hci_start_ext_adv_sync(hdev, instance);
+
+ err = hci_update_adv_data_sync(hdev, instance);
+ if (err)
+ return err;
+
+ err = hci_update_scan_rsp_data_sync(hdev, instance);
+ if (err)
+ return err;
+
+ return hci_enable_advertising_sync(hdev);
+}
+
+int hci_enable_advertising_sync(struct hci_dev *hdev)
+{
+ struct adv_info *adv_instance;
+ struct hci_cp_le_set_adv_param cp;
+ u8 own_addr_type, enable = 0x01;
+ bool connectable;
+ u16 adv_min_interval, adv_max_interval;
+ u32 flags;
+ u8 status;
+
+ if (ext_adv_capable(hdev))
+ return hci_enable_ext_advertising_sync(hdev,
+ hdev->cur_adv_instance);
+
+ flags = hci_adv_instance_flags(hdev, hdev->cur_adv_instance);
+ adv_instance = hci_find_adv_instance(hdev, hdev->cur_adv_instance);
+
+ /* If the "connectable" instance flag was not set, then choose between
+ * ADV_IND and ADV_NONCONN_IND based on the global connectable setting.
+ */
+ connectable = (flags & MGMT_ADV_FLAG_CONNECTABLE) ||
+ mgmt_get_connectable(hdev);
+
+ if (!is_advertising_allowed(hdev, connectable))
+ return -EINVAL;
+
+ status = hci_disable_advertising_sync(hdev);
+ if (status)
+ return status;
+
+ /* Clear the HCI_LE_ADV bit temporarily so that the
+ * hci_update_random_address knows that it's safe to go ahead
+ * and write a new random address. The flag will be set back on
+ * as soon as the SET_ADV_ENABLE HCI command completes.
+ */
+ hci_dev_clear_flag(hdev, HCI_LE_ADV);
+
+ /* Set require_privacy to true only when non-connectable
+ * advertising is used. In that case it is fine to use a
+ * non-resolvable private address.
+ */
+ status = hci_update_random_address_sync(hdev, !connectable,
+ adv_use_rpa(hdev, flags),
+ &own_addr_type);
+ if (status)
+ return status;
+
+ memset(&cp, 0, sizeof(cp));
+
+ if (adv_instance) {
+ adv_min_interval = adv_instance->min_interval;
+ adv_max_interval = adv_instance->max_interval;
+ } else {
+ adv_min_interval = hdev->le_adv_min_interval;
+ adv_max_interval = hdev->le_adv_max_interval;
+ }
+
+ if (connectable) {
+ cp.type = LE_ADV_IND;
+ } else {
+ if (hci_adv_instance_is_scannable(hdev, hdev->cur_adv_instance))
+ cp.type = LE_ADV_SCAN_IND;
+ else
+ cp.type = LE_ADV_NONCONN_IND;
+
+ if (!hci_dev_test_flag(hdev, HCI_DISCOVERABLE) ||
+ hci_dev_test_flag(hdev, HCI_LIMITED_DISCOVERABLE)) {
+ adv_min_interval = DISCOV_LE_FAST_ADV_INT_MIN;
+ adv_max_interval = DISCOV_LE_FAST_ADV_INT_MAX;
+ }
+ }
+
+ cp.min_interval = cpu_to_le16(adv_min_interval);
+ cp.max_interval = cpu_to_le16(adv_max_interval);
+ cp.own_address_type = own_addr_type;
+ cp.channel_map = hdev->le_adv_channel_map;
+
+ status = __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_ADV_PARAM,
+ sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+ if (status)
+ return status;
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_ADV_ENABLE,
+ sizeof(enable), &enable, HCI_CMD_TIMEOUT);
+}
+
+static int enable_advertising_sync(struct hci_dev *hdev, void *data)
+{
+ return hci_enable_advertising_sync(hdev);
+}
+
+int hci_enable_advertising(struct hci_dev *hdev)
+{
+ if (!hci_dev_test_flag(hdev, HCI_ADVERTISING) &&
+ list_empty(&hdev->adv_instances))
+ return 0;
+
+ return hci_cmd_sync_queue(hdev, enable_advertising_sync, NULL, NULL);
+}
+
+int hci_remove_ext_adv_instance_sync(struct hci_dev *hdev, u8 instance,
+ struct sock *sk)
+{
+ int err;
+
+ if (!ext_adv_capable(hdev))
+ return 0;
+
+ err = hci_disable_ext_adv_instance_sync(hdev, instance);
+ if (err)
+ return err;
+
+ /* If request specifies an instance that doesn't exist, fail */
+ if (instance > 0 && !hci_find_adv_instance(hdev, instance))
+ return -EINVAL;
+
+ return __hci_cmd_sync_status_sk(hdev, HCI_OP_LE_REMOVE_ADV_SET,
+ sizeof(instance), &instance, 0,
+ HCI_CMD_TIMEOUT, sk);
+}
+
+int hci_le_terminate_big_sync(struct hci_dev *hdev, u8 handle, u8 reason)
+{
+ struct hci_cp_le_term_big cp;
+
+ memset(&cp, 0, sizeof(cp));
+ cp.handle = handle;
+ cp.reason = reason;
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_LE_TERM_BIG,
+ sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+}
+
+int hci_schedule_adv_instance_sync(struct hci_dev *hdev, u8 instance,
+ bool force)
+{
+ struct adv_info *adv = NULL;
+ u16 timeout;
+
+ if (hci_dev_test_flag(hdev, HCI_ADVERTISING) && !ext_adv_capable(hdev))
+ return -EPERM;
+
+ if (hdev->adv_instance_timeout)
+ return -EBUSY;
+
+ adv = hci_find_adv_instance(hdev, instance);
+ if (!adv)
+ return -ENOENT;
+
+ /* A zero timeout means unlimited advertising. As long as there is
+ * only one instance, duration should be ignored. We still set a timeout
+ * in case further instances are being added later on.
+ *
+ * If the remaining lifetime of the instance is more than the duration
+ * then the timeout corresponds to the duration, otherwise it will be
+ * reduced to the remaining instance lifetime.
+ */
+ if (adv->timeout == 0 || adv->duration <= adv->remaining_time)
+ timeout = adv->duration;
+ else
+ timeout = adv->remaining_time;
+
+ /* The remaining time is being reduced unless the instance is being
+ * advertised without time limit.
+ */
+ if (adv->timeout)
+ adv->remaining_time = adv->remaining_time - timeout;
+
+ /* Only use work for scheduling instances with legacy advertising */
+ if (!ext_adv_capable(hdev)) {
+ hdev->adv_instance_timeout = timeout;
+ queue_delayed_work(hdev->req_workqueue,
+ &hdev->adv_instance_expire,
+ secs_to_jiffies(timeout));
+ }
+
+ /* If we're just re-scheduling the same instance again then do not
+ * execute any HCI commands. This happens when a single instance is
+ * being advertised.
+ */
+ if (!force && hdev->cur_adv_instance == instance &&
+ hci_dev_test_flag(hdev, HCI_LE_ADV))
+ return 0;
+
+ hdev->cur_adv_instance = instance;
+
+ return hci_start_adv_sync(hdev, instance);
+}
+
+static int hci_clear_adv_sets_sync(struct hci_dev *hdev, struct sock *sk)
+{
+ int err;
+
+ if (!ext_adv_capable(hdev))
+ return 0;
+
+ /* Disable instance 0x00 to disable all instances */
+ err = hci_disable_ext_adv_instance_sync(hdev, 0x00);
+ if (err)
+ return err;
+
+ return __hci_cmd_sync_status_sk(hdev, HCI_OP_LE_CLEAR_ADV_SETS,
+ 0, NULL, 0, HCI_CMD_TIMEOUT, sk);
+}
+
+static int hci_clear_adv_sync(struct hci_dev *hdev, struct sock *sk, bool force)
+{
+ struct adv_info *adv, *n;
+
+ if (ext_adv_capable(hdev))
+ /* Remove all existing sets */
+ return hci_clear_adv_sets_sync(hdev, sk);
+
+ /* This is safe as long as there is no command send while the lock is
+ * held.
+ */
+ hci_dev_lock(hdev);
+
+ /* Cleanup non-ext instances */
+ list_for_each_entry_safe(adv, n, &hdev->adv_instances, list) {
+ u8 instance = adv->instance;
+ int err;
+
+ if (!(force || adv->timeout))
+ continue;
+
+ err = hci_remove_adv_instance(hdev, instance);
+ if (!err)
+ mgmt_advertising_removed(sk, hdev, instance);
+ }
+
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+static int hci_remove_adv_sync(struct hci_dev *hdev, u8 instance,
+ struct sock *sk)
+{
+ int err;
+
+ /* If we use extended advertising, instance has to be removed first. */
+ if (ext_adv_capable(hdev))
+ return hci_remove_ext_adv_instance_sync(hdev, instance, sk);
+
+ /* This is safe as long as there is no command send while the lock is
+ * held.
+ */
+ hci_dev_lock(hdev);
+
+ err = hci_remove_adv_instance(hdev, instance);
+ if (!err)
+ mgmt_advertising_removed(sk, hdev, instance);
+
+ hci_dev_unlock(hdev);
+
+ return err;
+}
+
+/* For a single instance:
+ * - force == true: The instance will be removed even when its remaining
+ * lifetime is not zero.
+ * - force == false: the instance will be deactivated but kept stored unless
+ * the remaining lifetime is zero.
+ *
+ * For instance == 0x00:
+ * - force == true: All instances will be removed regardless of their timeout
+ * setting.
+ * - force == false: Only instances that have a timeout will be removed.
+ */
+int hci_remove_advertising_sync(struct hci_dev *hdev, struct sock *sk,
+ u8 instance, bool force)
+{
+ struct adv_info *next = NULL;
+ int err;
+
+ /* Cancel any timeout concerning the removed instance(s). */
+ if (!instance || hdev->cur_adv_instance == instance)
+ cancel_adv_timeout(hdev);
+
+ /* Get the next instance to advertise BEFORE we remove
+ * the current one. This can be the same instance again
+ * if there is only one instance.
+ */
+ if (hdev->cur_adv_instance == instance)
+ next = hci_get_next_instance(hdev, instance);
+
+ if (!instance) {
+ err = hci_clear_adv_sync(hdev, sk, force);
+ if (err)
+ return err;
+ } else {
+ struct adv_info *adv = hci_find_adv_instance(hdev, instance);
+
+ if (force || (adv && adv->timeout && !adv->remaining_time)) {
+ /* Don't advertise a removed instance. */
+ if (next && next->instance == instance)
+ next = NULL;
+
+ err = hci_remove_adv_sync(hdev, instance, sk);
+ if (err)
+ return err;
+ }
+ }
+
+ if (!hdev_is_powered(hdev) || hci_dev_test_flag(hdev, HCI_ADVERTISING))
+ return 0;
+
+ if (next && !ext_adv_capable(hdev))
+ hci_schedule_adv_instance_sync(hdev, next->instance, false);
+
+ return 0;
+}
+
+int hci_read_rssi_sync(struct hci_dev *hdev, __le16 handle)
+{
+ struct hci_cp_read_rssi cp;
+
+ cp.handle = handle;
+ return __hci_cmd_sync_status(hdev, HCI_OP_READ_RSSI,
+ sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+}
+
+int hci_read_clock_sync(struct hci_dev *hdev, struct hci_cp_read_clock *cp)
+{
+ return __hci_cmd_sync_status(hdev, HCI_OP_READ_CLOCK,
+ sizeof(*cp), cp, HCI_CMD_TIMEOUT);
+}
+
+int hci_read_tx_power_sync(struct hci_dev *hdev, __le16 handle, u8 type)
+{
+ struct hci_cp_read_tx_power cp;
+
+ cp.handle = handle;
+ cp.type = type;
+ return __hci_cmd_sync_status(hdev, HCI_OP_READ_TX_POWER,
+ sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+}
+
+int hci_disable_advertising_sync(struct hci_dev *hdev)
+{
+ u8 enable = 0x00;
+
+ /* If controller is not advertising we are done. */
+ if (!hci_dev_test_flag(hdev, HCI_LE_ADV))
+ return 0;
+
+ if (ext_adv_capable(hdev))
+ return hci_disable_ext_adv_instance_sync(hdev, 0x00);
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_ADV_ENABLE,
+ sizeof(enable), &enable, HCI_CMD_TIMEOUT);
+}
+
+static int hci_le_set_ext_scan_enable_sync(struct hci_dev *hdev, u8 val,
+ u8 filter_dup)
+{
+ struct hci_cp_le_set_ext_scan_enable cp;
+
+ memset(&cp, 0, sizeof(cp));
+ cp.enable = val;
+
+ if (hci_dev_test_flag(hdev, HCI_MESH))
+ cp.filter_dup = LE_SCAN_FILTER_DUP_DISABLE;
+ else
+ cp.filter_dup = filter_dup;
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_EXT_SCAN_ENABLE,
+ sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+}
+
+static int hci_le_set_scan_enable_sync(struct hci_dev *hdev, u8 val,
+ u8 filter_dup)
+{
+ struct hci_cp_le_set_scan_enable cp;
+
+ if (use_ext_scan(hdev))
+ return hci_le_set_ext_scan_enable_sync(hdev, val, filter_dup);
+
+ memset(&cp, 0, sizeof(cp));
+ cp.enable = val;
+
+ if (val && hci_dev_test_flag(hdev, HCI_MESH))
+ cp.filter_dup = LE_SCAN_FILTER_DUP_DISABLE;
+ else
+ cp.filter_dup = filter_dup;
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_SCAN_ENABLE,
+ sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+}
+
+static int hci_le_set_addr_resolution_enable_sync(struct hci_dev *hdev, u8 val)
+{
+ if (!ll_privacy_capable(hdev))
+ return 0;
+
+ /* If controller is not/already resolving we are done. */
+ if (val == hci_dev_test_flag(hdev, HCI_LL_RPA_RESOLUTION))
+ return 0;
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_ADDR_RESOLV_ENABLE,
+ sizeof(val), &val, HCI_CMD_TIMEOUT);
+}
+
+static int hci_scan_disable_sync(struct hci_dev *hdev)
+{
+ int err;
+
+ /* If controller is not scanning we are done. */
+ if (!hci_dev_test_flag(hdev, HCI_LE_SCAN))
+ return 0;
+
+ if (hdev->scanning_paused) {
+ bt_dev_dbg(hdev, "Scanning is paused for suspend");
+ return 0;
+ }
+
+ err = hci_le_set_scan_enable_sync(hdev, LE_SCAN_DISABLE, 0x00);
+ if (err) {
+ bt_dev_err(hdev, "Unable to disable scanning: %d", err);
+ return err;
+ }
+
+ return err;
+}
+
+static bool scan_use_rpa(struct hci_dev *hdev)
+{
+ return hci_dev_test_flag(hdev, HCI_PRIVACY);
+}
+
+static void hci_start_interleave_scan(struct hci_dev *hdev)
+{
+ hdev->interleave_scan_state = INTERLEAVE_SCAN_NO_FILTER;
+ queue_delayed_work(hdev->req_workqueue,
+ &hdev->interleave_scan, 0);
+}
+
+static void cancel_interleave_scan(struct hci_dev *hdev)
+{
+ bt_dev_dbg(hdev, "cancelling interleave scan");
+
+ cancel_delayed_work_sync(&hdev->interleave_scan);
+
+ hdev->interleave_scan_state = INTERLEAVE_SCAN_NONE;
+}
+
+/* Return true if interleave_scan wasn't started until exiting this function,
+ * otherwise, return false
+ */
+static bool hci_update_interleaved_scan_sync(struct hci_dev *hdev)
+{
+ /* Do interleaved scan only if all of the following are true:
+ * - There is at least one ADV monitor
+ * - At least one pending LE connection or one device to be scanned for
+ * - Monitor offloading is not supported
+ * If so, we should alternate between allowlist scan and one without
+ * any filters to save power.
+ */
+ bool use_interleaving = hci_is_adv_monitoring(hdev) &&
+ !(list_empty(&hdev->pend_le_conns) &&
+ list_empty(&hdev->pend_le_reports)) &&
+ hci_get_adv_monitor_offload_ext(hdev) ==
+ HCI_ADV_MONITOR_EXT_NONE;
+ bool is_interleaving = is_interleave_scanning(hdev);
+
+ if (use_interleaving && !is_interleaving) {
+ hci_start_interleave_scan(hdev);
+ bt_dev_dbg(hdev, "starting interleave scan");
+ return true;
+ }
+
+ if (!use_interleaving && is_interleaving)
+ cancel_interleave_scan(hdev);
+
+ return false;
+}
+
+/* Removes connection to resolve list if needed.*/
+static int hci_le_del_resolve_list_sync(struct hci_dev *hdev,
+ bdaddr_t *bdaddr, u8 bdaddr_type)
+{
+ struct hci_cp_le_del_from_resolv_list cp;
+ struct bdaddr_list_with_irk *entry;
+
+ if (!ll_privacy_capable(hdev))
+ return 0;
+
+ /* Check if the IRK has been programmed */
+ entry = hci_bdaddr_list_lookup_with_irk(&hdev->le_resolv_list, bdaddr,
+ bdaddr_type);
+ if (!entry)
+ return 0;
+
+ cp.bdaddr_type = bdaddr_type;
+ bacpy(&cp.bdaddr, bdaddr);
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_LE_DEL_FROM_RESOLV_LIST,
+ sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+}
+
+static int hci_le_del_accept_list_sync(struct hci_dev *hdev,
+ bdaddr_t *bdaddr, u8 bdaddr_type)
+{
+ struct hci_cp_le_del_from_accept_list cp;
+ int err;
+
+ /* Check if device is on accept list before removing it */
+ if (!hci_bdaddr_list_lookup(&hdev->le_accept_list, bdaddr, bdaddr_type))
+ return 0;
+
+ cp.bdaddr_type = bdaddr_type;
+ bacpy(&cp.bdaddr, bdaddr);
+
+ /* Ignore errors when removing from resolving list as that is likely
+ * that the device was never added.
+ */
+ hci_le_del_resolve_list_sync(hdev, &cp.bdaddr, cp.bdaddr_type);
+
+ err = __hci_cmd_sync_status(hdev, HCI_OP_LE_DEL_FROM_ACCEPT_LIST,
+ sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+ if (err) {
+ bt_dev_err(hdev, "Unable to remove from allow list: %d", err);
+ return err;
+ }
+
+ bt_dev_dbg(hdev, "Remove %pMR (0x%x) from allow list", &cp.bdaddr,
+ cp.bdaddr_type);
+
+ return 0;
+}
+
+struct conn_params {
+ bdaddr_t addr;
+ u8 addr_type;
+ hci_conn_flags_t flags;
+ u8 privacy_mode;
+};
+
+/* Adds connection to resolve list if needed.
+ * Setting params to NULL programs local hdev->irk
+ */
+static int hci_le_add_resolve_list_sync(struct hci_dev *hdev,
+ struct conn_params *params)
+{
+ struct hci_cp_le_add_to_resolv_list cp;
+ struct smp_irk *irk;
+ struct bdaddr_list_with_irk *entry;
+ struct hci_conn_params *p;
+
+ if (!ll_privacy_capable(hdev))
+ return 0;
+
+ /* Attempt to program local identity address, type and irk if params is
+ * NULL.
+ */
+ if (!params) {
+ if (!hci_dev_test_flag(hdev, HCI_PRIVACY))
+ return 0;
+
+ hci_copy_identity_address(hdev, &cp.bdaddr, &cp.bdaddr_type);
+ memcpy(cp.peer_irk, hdev->irk, 16);
+ goto done;
+ } else if (!(params->flags & HCI_CONN_FLAG_ADDRESS_RESOLUTION))
+ return 0;
+
+ irk = hci_find_irk_by_addr(hdev, &params->addr, params->addr_type);
+ if (!irk)
+ return 0;
+
+ /* Check if the IK has _not_ been programmed yet. */
+ entry = hci_bdaddr_list_lookup_with_irk(&hdev->le_resolv_list,
+ &params->addr,
+ params->addr_type);
+ if (entry)
+ return 0;
+
+ cp.bdaddr_type = params->addr_type;
+ bacpy(&cp.bdaddr, &params->addr);
+ memcpy(cp.peer_irk, irk->val, 16);
+
+ /* Default privacy mode is always Network */
+ params->privacy_mode = HCI_NETWORK_PRIVACY;
+
+ rcu_read_lock();
+ p = hci_pend_le_action_lookup(&hdev->pend_le_conns,
+ &params->addr, params->addr_type);
+ if (!p)
+ p = hci_pend_le_action_lookup(&hdev->pend_le_reports,
+ &params->addr, params->addr_type);
+ if (p)
+ WRITE_ONCE(p->privacy_mode, HCI_NETWORK_PRIVACY);
+ rcu_read_unlock();
+
+done:
+ if (hci_dev_test_flag(hdev, HCI_PRIVACY))
+ memcpy(cp.local_irk, hdev->irk, 16);
+ else
+ memset(cp.local_irk, 0, 16);
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_LE_ADD_TO_RESOLV_LIST,
+ sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+}
+
+/* Set Device Privacy Mode. */
+static int hci_le_set_privacy_mode_sync(struct hci_dev *hdev,
+ struct conn_params *params)
+{
+ struct hci_cp_le_set_privacy_mode cp;
+ struct smp_irk *irk;
+
+ if (!ll_privacy_capable(hdev) ||
+ !(params->flags & HCI_CONN_FLAG_ADDRESS_RESOLUTION))
+ return 0;
+
+ /* If device privacy mode has already been set there is nothing to do */
+ if (params->privacy_mode == HCI_DEVICE_PRIVACY)
+ return 0;
+
+ /* Check if HCI_CONN_FLAG_DEVICE_PRIVACY has been set as it also
+ * indicates that LL Privacy has been enabled and
+ * HCI_OP_LE_SET_PRIVACY_MODE is supported.
+ */
+ if (!(params->flags & HCI_CONN_FLAG_DEVICE_PRIVACY))
+ return 0;
+
+ irk = hci_find_irk_by_addr(hdev, &params->addr, params->addr_type);
+ if (!irk)
+ return 0;
+
+ memset(&cp, 0, sizeof(cp));
+ cp.bdaddr_type = irk->addr_type;
+ bacpy(&cp.bdaddr, &irk->bdaddr);
+ cp.mode = HCI_DEVICE_PRIVACY;
+
+ /* Note: params->privacy_mode is not updated since it is a copy */
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_PRIVACY_MODE,
+ sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+}
+
+/* Adds connection to allow list if needed, if the device uses RPA (has IRK)
+ * this attempts to program the device in the resolving list as well and
+ * properly set the privacy mode.
+ */
+static int hci_le_add_accept_list_sync(struct hci_dev *hdev,
+ struct conn_params *params,
+ u8 *num_entries)
+{
+ struct hci_cp_le_add_to_accept_list cp;
+ int err;
+
+ /* During suspend, only wakeable devices can be in acceptlist */
+ if (hdev->suspended &&
+ !(params->flags & HCI_CONN_FLAG_REMOTE_WAKEUP)) {
+ hci_le_del_accept_list_sync(hdev, &params->addr,
+ params->addr_type);
+ return 0;
+ }
+
+ /* Select filter policy to accept all advertising */
+ if (*num_entries >= hdev->le_accept_list_size)
+ return -ENOSPC;
+
+ /* Attempt to program the device in the resolving list first to avoid
+ * having to rollback in case it fails since the resolving list is
+ * dynamic it can probably be smaller than the accept list.
+ */
+ err = hci_le_add_resolve_list_sync(hdev, params);
+ if (err) {
+ bt_dev_err(hdev, "Unable to add to resolve list: %d", err);
+ return err;
+ }
+
+ /* Set Privacy Mode */
+ err = hci_le_set_privacy_mode_sync(hdev, params);
+ if (err) {
+ bt_dev_err(hdev, "Unable to set privacy mode: %d", err);
+ return err;
+ }
+
+ /* Check if already in accept list */
+ if (hci_bdaddr_list_lookup(&hdev->le_accept_list, &params->addr,
+ params->addr_type))
+ return 0;
+
+ *num_entries += 1;
+ cp.bdaddr_type = params->addr_type;
+ bacpy(&cp.bdaddr, &params->addr);
+
+ err = __hci_cmd_sync_status(hdev, HCI_OP_LE_ADD_TO_ACCEPT_LIST,
+ sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+ if (err) {
+ bt_dev_err(hdev, "Unable to add to allow list: %d", err);
+ /* Rollback the device from the resolving list */
+ hci_le_del_resolve_list_sync(hdev, &cp.bdaddr, cp.bdaddr_type);
+ return err;
+ }
+
+ bt_dev_dbg(hdev, "Add %pMR (0x%x) to allow list", &cp.bdaddr,
+ cp.bdaddr_type);
+
+ return 0;
+}
+
+/* This function disables/pause all advertising instances */
+static int hci_pause_advertising_sync(struct hci_dev *hdev)
+{
+ int err;
+ int old_state;
+
+ /* If controller is not advertising we are done. */
+ if (!hci_dev_test_flag(hdev, HCI_LE_ADV))
+ return 0;
+
+ /* If already been paused there is nothing to do. */
+ if (hdev->advertising_paused)
+ return 0;
+
+ bt_dev_dbg(hdev, "Pausing directed advertising");
+
+ /* Stop directed advertising */
+ old_state = hci_dev_test_flag(hdev, HCI_ADVERTISING);
+ if (old_state) {
+ /* When discoverable timeout triggers, then just make sure
+ * the limited discoverable flag is cleared. Even in the case
+ * of a timeout triggered from general discoverable, it is
+ * safe to unconditionally clear the flag.
+ */
+ hci_dev_clear_flag(hdev, HCI_LIMITED_DISCOVERABLE);
+ hci_dev_clear_flag(hdev, HCI_DISCOVERABLE);
+ hdev->discov_timeout = 0;
+ }
+
+ bt_dev_dbg(hdev, "Pausing advertising instances");
+
+ /* Call to disable any advertisements active on the controller.
+ * This will succeed even if no advertisements are configured.
+ */
+ err = hci_disable_advertising_sync(hdev);
+ if (err)
+ return err;
+
+ /* If we are using software rotation, pause the loop */
+ if (!ext_adv_capable(hdev))
+ cancel_adv_timeout(hdev);
+
+ hdev->advertising_paused = true;
+ hdev->advertising_old_state = old_state;
+
+ return 0;
+}
+
+/* This function enables all user advertising instances */
+static int hci_resume_advertising_sync(struct hci_dev *hdev)
+{
+ struct adv_info *adv, *tmp;
+ int err;
+
+ /* If advertising has not been paused there is nothing to do. */
+ if (!hdev->advertising_paused)
+ return 0;
+
+ /* Resume directed advertising */
+ hdev->advertising_paused = false;
+ if (hdev->advertising_old_state) {
+ hci_dev_set_flag(hdev, HCI_ADVERTISING);
+ hdev->advertising_old_state = 0;
+ }
+
+ bt_dev_dbg(hdev, "Resuming advertising instances");
+
+ if (ext_adv_capable(hdev)) {
+ /* Call for each tracked instance to be re-enabled */
+ list_for_each_entry_safe(adv, tmp, &hdev->adv_instances, list) {
+ err = hci_enable_ext_advertising_sync(hdev,
+ adv->instance);
+ if (!err)
+ continue;
+
+ /* If the instance cannot be resumed remove it */
+ hci_remove_ext_adv_instance_sync(hdev, adv->instance,
+ NULL);
+ }
+
+ /* If current advertising instance is set to instance 0x00
+ * then we need to re-enable it.
+ */
+ if (hci_dev_test_and_clear_flag(hdev, HCI_LE_ADV_0))
+ err = hci_enable_ext_advertising_sync(hdev, 0x00);
+ } else {
+ /* Schedule for most recent instance to be restarted and begin
+ * the software rotation loop
+ */
+ err = hci_schedule_adv_instance_sync(hdev,
+ hdev->cur_adv_instance,
+ true);
+ }
+
+ hdev->advertising_paused = false;
+
+ return err;
+}
+
+static int hci_pause_addr_resolution(struct hci_dev *hdev)
+{
+ int err;
+
+ if (!ll_privacy_capable(hdev))
+ return 0;
+
+ if (!hci_dev_test_flag(hdev, HCI_LL_RPA_RESOLUTION))
+ return 0;
+
+ /* Cannot disable addr resolution if scanning is enabled or
+ * when initiating an LE connection.
+ */
+ if (hci_dev_test_flag(hdev, HCI_LE_SCAN) ||
+ hci_lookup_le_connect(hdev)) {
+ bt_dev_err(hdev, "Command not allowed when scan/LE connect");
+ return -EPERM;
+ }
+
+ /* Cannot disable addr resolution if advertising is enabled. */
+ err = hci_pause_advertising_sync(hdev);
+ if (err) {
+ bt_dev_err(hdev, "Pause advertising failed: %d", err);
+ return err;
+ }
+
+ err = hci_le_set_addr_resolution_enable_sync(hdev, 0x00);
+ if (err)
+ bt_dev_err(hdev, "Unable to disable Address Resolution: %d",
+ err);
+
+ /* Return if address resolution is disabled and RPA is not used. */
+ if (!err && scan_use_rpa(hdev))
+ return 0;
+
+ hci_resume_advertising_sync(hdev);
+ return err;
+}
+
+struct sk_buff *hci_read_local_oob_data_sync(struct hci_dev *hdev,
+ bool extended, struct sock *sk)
+{
+ u16 opcode = extended ? HCI_OP_READ_LOCAL_OOB_EXT_DATA :
+ HCI_OP_READ_LOCAL_OOB_DATA;
+
+ return __hci_cmd_sync_sk(hdev, opcode, 0, NULL, 0, HCI_CMD_TIMEOUT, sk);
+}
+
+static struct conn_params *conn_params_copy(struct list_head *list, size_t *n)
+{
+ struct hci_conn_params *params;
+ struct conn_params *p;
+ size_t i;
+
+ rcu_read_lock();
+
+ i = 0;
+ list_for_each_entry_rcu(params, list, action)
+ ++i;
+ *n = i;
+
+ rcu_read_unlock();
+
+ p = kvcalloc(*n, sizeof(struct conn_params), GFP_KERNEL);
+ if (!p)
+ return NULL;
+
+ rcu_read_lock();
+
+ i = 0;
+ list_for_each_entry_rcu(params, list, action) {
+ /* Racing adds are handled in next scan update */
+ if (i >= *n)
+ break;
+
+ /* No hdev->lock, but: addr, addr_type are immutable.
+ * privacy_mode is only written by us or in
+ * hci_cc_le_set_privacy_mode that we wait for.
+ * We should be idempotent so MGMT updating flags
+ * while we are processing is OK.
+ */
+ bacpy(&p[i].addr, &params->addr);
+ p[i].addr_type = params->addr_type;
+ p[i].flags = READ_ONCE(params->flags);
+ p[i].privacy_mode = READ_ONCE(params->privacy_mode);
+ ++i;
+ }
+
+ rcu_read_unlock();
+
+ *n = i;
+ return p;
+}
+
+/* Clear LE Accept List */
+static int hci_le_clear_accept_list_sync(struct hci_dev *hdev)
+{
+ if (!(hdev->commands[26] & 0x80))
+ return 0;
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_LE_CLEAR_ACCEPT_LIST, 0, NULL,
+ HCI_CMD_TIMEOUT);
+}
+
+/* Device must not be scanning when updating the accept list.
+ *
+ * Update is done using the following sequence:
+ *
+ * ll_privacy_capable((Disable Advertising) -> Disable Resolving List) ->
+ * Remove Devices From Accept List ->
+ * (has IRK && ll_privacy_capable(Remove Devices From Resolving List))->
+ * Add Devices to Accept List ->
+ * (has IRK && ll_privacy_capable(Remove Devices From Resolving List)) ->
+ * ll_privacy_capable(Enable Resolving List -> (Enable Advertising)) ->
+ * Enable Scanning
+ *
+ * In case of failure advertising shall be restored to its original state and
+ * return would disable accept list since either accept or resolving list could
+ * not be programmed.
+ *
+ */
+static u8 hci_update_accept_list_sync(struct hci_dev *hdev)
+{
+ struct conn_params *params;
+ struct bdaddr_list *b, *t;
+ u8 num_entries = 0;
+ bool pend_conn, pend_report;
+ u8 filter_policy;
+ size_t i, n;
+ int err;
+
+ /* Pause advertising if resolving list can be used as controllers
+ * cannot accept resolving list modifications while advertising.
+ */
+ if (ll_privacy_capable(hdev)) {
+ err = hci_pause_advertising_sync(hdev);
+ if (err) {
+ bt_dev_err(hdev, "pause advertising failed: %d", err);
+ return 0x00;
+ }
+ }
+
+ /* Disable address resolution while reprogramming accept list since
+ * devices that do have an IRK will be programmed in the resolving list
+ * when LL Privacy is enabled.
+ */
+ err = hci_le_set_addr_resolution_enable_sync(hdev, 0x00);
+ if (err) {
+ bt_dev_err(hdev, "Unable to disable LL privacy: %d", err);
+ goto done;
+ }
+
+ /* Force address filtering if PA Sync is in progress */
+ if (hci_dev_test_flag(hdev, HCI_PA_SYNC)) {
+ struct hci_conn *conn;
+
+ conn = hci_conn_hash_lookup_create_pa_sync(hdev);
+ if (conn) {
+ struct conn_params pa;
+
+ memset(&pa, 0, sizeof(pa));
+
+ bacpy(&pa.addr, &conn->dst);
+ pa.addr_type = conn->dst_type;
+
+ /* Clear first since there could be addresses left
+ * behind.
+ */
+ hci_le_clear_accept_list_sync(hdev);
+
+ num_entries = 1;
+ err = hci_le_add_accept_list_sync(hdev, &pa,
+ &num_entries);
+ goto done;
+ }
+ }
+
+ /* Go through the current accept list programmed into the
+ * controller one by one and check if that address is connected or is
+ * still in the list of pending connections or list of devices to
+ * report. If not present in either list, then remove it from
+ * the controller.
+ */
+ list_for_each_entry_safe(b, t, &hdev->le_accept_list, list) {
+ if (hci_conn_hash_lookup_le(hdev, &b->bdaddr, b->bdaddr_type))
+ continue;
+
+ /* Pointers not dereferenced, no locks needed */
+ pend_conn = hci_pend_le_action_lookup(&hdev->pend_le_conns,
+ &b->bdaddr,
+ b->bdaddr_type);
+ pend_report = hci_pend_le_action_lookup(&hdev->pend_le_reports,
+ &b->bdaddr,
+ b->bdaddr_type);
+
+ /* If the device is not likely to connect or report,
+ * remove it from the acceptlist.
+ */
+ if (!pend_conn && !pend_report) {
+ hci_le_del_accept_list_sync(hdev, &b->bdaddr,
+ b->bdaddr_type);
+ continue;
+ }
+
+ num_entries++;
+ }
+
+ /* Since all no longer valid accept list entries have been
+ * removed, walk through the list of pending connections
+ * and ensure that any new device gets programmed into
+ * the controller.
+ *
+ * If the list of the devices is larger than the list of
+ * available accept list entries in the controller, then
+ * just abort and return filer policy value to not use the
+ * accept list.
+ *
+ * The list and params may be mutated while we wait for events,
+ * so make a copy and iterate it.
+ */
+
+ params = conn_params_copy(&hdev->pend_le_conns, &n);
+ if (!params) {
+ err = -ENOMEM;
+ goto done;
+ }
+
+ for (i = 0; i < n; ++i) {
+ err = hci_le_add_accept_list_sync(hdev, &params[i],
+ &num_entries);
+ if (err) {
+ kvfree(params);
+ goto done;
+ }
+ }
+
+ kvfree(params);
+
+ /* After adding all new pending connections, walk through
+ * the list of pending reports and also add these to the
+ * accept list if there is still space. Abort if space runs out.
+ */
+
+ params = conn_params_copy(&hdev->pend_le_reports, &n);
+ if (!params) {
+ err = -ENOMEM;
+ goto done;
+ }
+
+ for (i = 0; i < n; ++i) {
+ err = hci_le_add_accept_list_sync(hdev, &params[i],
+ &num_entries);
+ if (err) {
+ kvfree(params);
+ goto done;
+ }
+ }
+
+ kvfree(params);
+
+ /* Use the allowlist unless the following conditions are all true:
+ * - We are not currently suspending
+ * - There are 1 or more ADV monitors registered and it's not offloaded
+ * - Interleaved scanning is not currently using the allowlist
+ */
+ if (!idr_is_empty(&hdev->adv_monitors_idr) && !hdev->suspended &&
+ hci_get_adv_monitor_offload_ext(hdev) == HCI_ADV_MONITOR_EXT_NONE &&
+ hdev->interleave_scan_state != INTERLEAVE_SCAN_ALLOWLIST)
+ err = -EINVAL;
+
+done:
+ filter_policy = err ? 0x00 : 0x01;
+
+ /* Enable address resolution when LL Privacy is enabled. */
+ err = hci_le_set_addr_resolution_enable_sync(hdev, 0x01);
+ if (err)
+ bt_dev_err(hdev, "Unable to enable LL privacy: %d", err);
+
+ /* Resume advertising if it was paused */
+ if (ll_privacy_capable(hdev))
+ hci_resume_advertising_sync(hdev);
+
+ /* Select filter policy to use accept list */
+ return filter_policy;
+}
+
+static void hci_le_scan_phy_params(struct hci_cp_le_scan_phy_params *cp,
+ u8 type, u16 interval, u16 window)
+{
+ cp->type = type;
+ cp->interval = cpu_to_le16(interval);
+ cp->window = cpu_to_le16(window);
+}
+
+static int hci_le_set_ext_scan_param_sync(struct hci_dev *hdev, u8 type,
+ u16 interval, u16 window,
+ u8 own_addr_type, u8 filter_policy)
+{
+ struct hci_cp_le_set_ext_scan_params *cp;
+ struct hci_cp_le_scan_phy_params *phy;
+ u8 data[sizeof(*cp) + sizeof(*phy) * 2];
+ u8 num_phy = 0x00;
+
+ cp = (void *)data;
+ phy = (void *)cp->data;
+
+ memset(data, 0, sizeof(data));
+
+ cp->own_addr_type = own_addr_type;
+ cp->filter_policy = filter_policy;
+
+ /* Check if PA Sync is in progress then select the PHY based on the
+ * hci_conn.iso_qos.
+ */
+ if (hci_dev_test_flag(hdev, HCI_PA_SYNC)) {
+ struct hci_cp_le_add_to_accept_list *sent;
+
+ sent = hci_sent_cmd_data(hdev, HCI_OP_LE_ADD_TO_ACCEPT_LIST);
+ if (sent) {
+ struct hci_conn *conn;
+
+ conn = hci_conn_hash_lookup_ba(hdev, PA_LINK,
+ &sent->bdaddr);
+ if (conn) {
+ struct bt_iso_qos *qos = &conn->iso_qos;
+
+ if (qos->bcast.in.phy & BT_ISO_PHY_1M ||
+ qos->bcast.in.phy & BT_ISO_PHY_2M) {
+ cp->scanning_phys |= LE_SCAN_PHY_1M;
+ hci_le_scan_phy_params(phy, type,
+ interval,
+ window);
+ num_phy++;
+ phy++;
+ }
+
+ if (qos->bcast.in.phy & BT_ISO_PHY_CODED) {
+ cp->scanning_phys |= LE_SCAN_PHY_CODED;
+ hci_le_scan_phy_params(phy, type,
+ interval * 3,
+ window * 3);
+ num_phy++;
+ phy++;
+ }
+
+ if (num_phy)
+ goto done;
+ }
+ }
+ }
+
+ if (scan_1m(hdev) || scan_2m(hdev)) {
+ cp->scanning_phys |= LE_SCAN_PHY_1M;
+ hci_le_scan_phy_params(phy, type, interval, window);
+ num_phy++;
+ phy++;
+ }
+
+ if (scan_coded(hdev)) {
+ cp->scanning_phys |= LE_SCAN_PHY_CODED;
+ hci_le_scan_phy_params(phy, type, interval * 3, window * 3);
+ num_phy++;
+ phy++;
+ }
+
+done:
+ if (!num_phy)
+ return -EINVAL;
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_EXT_SCAN_PARAMS,
+ sizeof(*cp) + sizeof(*phy) * num_phy,
+ data, HCI_CMD_TIMEOUT);
+}
+
+static int hci_le_set_scan_param_sync(struct hci_dev *hdev, u8 type,
+ u16 interval, u16 window,
+ u8 own_addr_type, u8 filter_policy)
+{
+ struct hci_cp_le_set_scan_param cp;
+
+ if (use_ext_scan(hdev))
+ return hci_le_set_ext_scan_param_sync(hdev, type, interval,
+ window, own_addr_type,
+ filter_policy);
+
+ memset(&cp, 0, sizeof(cp));
+ cp.type = type;
+ cp.interval = cpu_to_le16(interval);
+ cp.window = cpu_to_le16(window);
+ cp.own_address_type = own_addr_type;
+ cp.filter_policy = filter_policy;
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_SCAN_PARAM,
+ sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+}
+
+static int hci_start_scan_sync(struct hci_dev *hdev, u8 type, u16 interval,
+ u16 window, u8 own_addr_type, u8 filter_policy,
+ u8 filter_dup)
+{
+ int err;
+
+ if (hdev->scanning_paused) {
+ bt_dev_dbg(hdev, "Scanning is paused for suspend");
+ return 0;
+ }
+
+ err = hci_le_set_scan_param_sync(hdev, type, interval, window,
+ own_addr_type, filter_policy);
+ if (err)
+ return err;
+
+ return hci_le_set_scan_enable_sync(hdev, LE_SCAN_ENABLE, filter_dup);
+}
+
+static int hci_passive_scan_sync(struct hci_dev *hdev)
+{
+ u8 own_addr_type;
+ u8 filter_policy;
+ u16 window, interval;
+ u8 filter_dups = LE_SCAN_FILTER_DUP_ENABLE;
+ int err;
+
+ if (hdev->scanning_paused) {
+ bt_dev_dbg(hdev, "Scanning is paused for suspend");
+ return 0;
+ }
+
+ err = hci_scan_disable_sync(hdev);
+ if (err) {
+ bt_dev_err(hdev, "disable scanning failed: %d", err);
+ return err;
+ }
+
+ /* Set require_privacy to false since no SCAN_REQ are send
+ * during passive scanning. Not using an non-resolvable address
+ * here is important so that peer devices using direct
+ * advertising with our address will be correctly reported
+ * by the controller.
+ */
+ if (hci_update_random_address_sync(hdev, false, scan_use_rpa(hdev),
+ &own_addr_type))
+ return 0;
+
+ if (hdev->enable_advmon_interleave_scan &&
+ hci_update_interleaved_scan_sync(hdev))
+ return 0;
+
+ bt_dev_dbg(hdev, "interleave state %d", hdev->interleave_scan_state);
+
+ /* Adding or removing entries from the accept list must
+ * happen before enabling scanning. The controller does
+ * not allow accept list modification while scanning.
+ */
+ filter_policy = hci_update_accept_list_sync(hdev);
+
+ /* If suspended and filter_policy set to 0x00 (no acceptlist) then
+ * passive scanning cannot be started since that would require the host
+ * to be woken up to process the reports.
+ */
+ if (hdev->suspended && !filter_policy) {
+ /* Check if accept list is empty then there is no need to scan
+ * while suspended.
+ */
+ if (list_empty(&hdev->le_accept_list))
+ return 0;
+
+ /* If there are devices is the accept_list that means some
+ * devices could not be programmed which in non-suspended case
+ * means filter_policy needs to be set to 0x00 so the host needs
+ * to filter, but since this is treating suspended case we
+ * can ignore device needing host to filter to allow devices in
+ * the acceptlist to be able to wakeup the system.
+ */
+ filter_policy = 0x01;
+ }
+
+ /* When the controller is using random resolvable addresses and
+ * with that having LE privacy enabled, then controllers with
+ * Extended Scanner Filter Policies support can now enable support
+ * for handling directed advertising.
+ *
+ * So instead of using filter polices 0x00 (no acceptlist)
+ * and 0x01 (acceptlist enabled) use the new filter policies
+ * 0x02 (no acceptlist) and 0x03 (acceptlist enabled).
+ */
+ if (hci_dev_test_flag(hdev, HCI_PRIVACY) &&
+ (hdev->le_features[0] & HCI_LE_EXT_SCAN_POLICY))
+ filter_policy |= 0x02;
+
+ if (hdev->suspended) {
+ window = hdev->le_scan_window_suspend;
+ interval = hdev->le_scan_int_suspend;
+ } else if (hci_is_le_conn_scanning(hdev)) {
+ window = hdev->le_scan_window_connect;
+ interval = hdev->le_scan_int_connect;
+ } else if (hci_is_adv_monitoring(hdev)) {
+ window = hdev->le_scan_window_adv_monitor;
+ interval = hdev->le_scan_int_adv_monitor;
+
+ /* Disable duplicates filter when scanning for advertisement
+ * monitor for the following reasons.
+ *
+ * For HW pattern filtering (ex. MSFT), Realtek and Qualcomm
+ * controllers ignore RSSI_Sampling_Period when the duplicates
+ * filter is enabled.
+ *
+ * For SW pattern filtering, when we're not doing interleaved
+ * scanning, it is necessary to disable duplicates filter,
+ * otherwise hosts can only receive one advertisement and it's
+ * impossible to know if a peer is still in range.
+ */
+ filter_dups = LE_SCAN_FILTER_DUP_DISABLE;
+ } else {
+ window = hdev->le_scan_window;
+ interval = hdev->le_scan_interval;
+ }
+
+ /* Disable all filtering for Mesh */
+ if (hci_dev_test_flag(hdev, HCI_MESH)) {
+ filter_policy = 0;
+ filter_dups = LE_SCAN_FILTER_DUP_DISABLE;
+ }
+
+ bt_dev_dbg(hdev, "LE passive scan with acceptlist = %d", filter_policy);
+
+ return hci_start_scan_sync(hdev, LE_SCAN_PASSIVE, interval, window,
+ own_addr_type, filter_policy, filter_dups);
+}
+
+/* This function controls the passive scanning based on hdev->pend_le_conns
+ * list. If there are pending LE connection we start the background scanning,
+ * otherwise we stop it in the following sequence:
+ *
+ * If there are devices to scan:
+ *
+ * Disable Scanning -> Update Accept List ->
+ * ll_privacy_capable((Disable Advertising) -> Disable Resolving List ->
+ * Update Resolving List -> Enable Resolving List -> (Enable Advertising)) ->
+ * Enable Scanning
+ *
+ * Otherwise:
+ *
+ * Disable Scanning
+ */
+int hci_update_passive_scan_sync(struct hci_dev *hdev)
+{
+ int err;
+
+ if (!test_bit(HCI_UP, &hdev->flags) ||
+ test_bit(HCI_INIT, &hdev->flags) ||
+ hci_dev_test_flag(hdev, HCI_SETUP) ||
+ hci_dev_test_flag(hdev, HCI_CONFIG) ||
+ hci_dev_test_flag(hdev, HCI_AUTO_OFF) ||
+ hci_dev_test_flag(hdev, HCI_UNREGISTER))
+ return 0;
+
+ /* No point in doing scanning if LE support hasn't been enabled */
+ if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED))
+ return 0;
+
+ /* If discovery is active don't interfere with it */
+ if (hdev->discovery.state != DISCOVERY_STOPPED)
+ return 0;
+
+ /* Reset RSSI and UUID filters when starting background scanning
+ * since these filters are meant for service discovery only.
+ *
+ * The Start Discovery and Start Service Discovery operations
+ * ensure to set proper values for RSSI threshold and UUID
+ * filter list. So it is safe to just reset them here.
+ */
+ hci_discovery_filter_clear(hdev);
+
+ bt_dev_dbg(hdev, "ADV monitoring is %s",
+ hci_is_adv_monitoring(hdev) ? "on" : "off");
+
+ if (!hci_dev_test_flag(hdev, HCI_MESH) &&
+ list_empty(&hdev->pend_le_conns) &&
+ list_empty(&hdev->pend_le_reports) &&
+ !hci_is_adv_monitoring(hdev) &&
+ !hci_dev_test_flag(hdev, HCI_PA_SYNC)) {
+ /* If there is no pending LE connections or devices
+ * to be scanned for or no ADV monitors, we should stop the
+ * background scanning.
+ */
+
+ bt_dev_dbg(hdev, "stopping background scanning");
+
+ err = hci_scan_disable_sync(hdev);
+ if (err)
+ bt_dev_err(hdev, "stop background scanning failed: %d",
+ err);
+ } else {
+ /* If there is at least one pending LE connection, we should
+ * keep the background scan running.
+ */
+
+ /* If controller is connecting, we should not start scanning
+ * since some controllers are not able to scan and connect at
+ * the same time.
+ */
+ if (hci_lookup_le_connect(hdev))
+ return 0;
+
+ bt_dev_dbg(hdev, "start background scanning");
+
+ err = hci_passive_scan_sync(hdev);
+ if (err)
+ bt_dev_err(hdev, "start background scanning failed: %d",
+ err);
+ }
+
+ return err;
+}
+
+static int update_scan_sync(struct hci_dev *hdev, void *data)
+{
+ return hci_update_scan_sync(hdev);
+}
+
+int hci_update_scan(struct hci_dev *hdev)
+{
+ return hci_cmd_sync_queue(hdev, update_scan_sync, NULL, NULL);
+}
+
+static int update_passive_scan_sync(struct hci_dev *hdev, void *data)
+{
+ return hci_update_passive_scan_sync(hdev);
+}
+
+int hci_update_passive_scan(struct hci_dev *hdev)
+{
+ /* Only queue if it would have any effect */
+ if (!test_bit(HCI_UP, &hdev->flags) ||
+ test_bit(HCI_INIT, &hdev->flags) ||
+ hci_dev_test_flag(hdev, HCI_SETUP) ||
+ hci_dev_test_flag(hdev, HCI_CONFIG) ||
+ hci_dev_test_flag(hdev, HCI_AUTO_OFF) ||
+ hci_dev_test_flag(hdev, HCI_UNREGISTER))
+ return 0;
+
+ return hci_cmd_sync_queue_once(hdev, update_passive_scan_sync, NULL,
+ NULL);
+}
+
+int hci_write_sc_support_sync(struct hci_dev *hdev, u8 val)
+{
+ int err;
+
+ if (!bredr_sc_enabled(hdev) || lmp_host_sc_capable(hdev))
+ return 0;
+
+ err = __hci_cmd_sync_status(hdev, HCI_OP_WRITE_SC_SUPPORT,
+ sizeof(val), &val, HCI_CMD_TIMEOUT);
+
+ if (!err) {
+ if (val) {
+ hdev->features[1][0] |= LMP_HOST_SC;
+ hci_dev_set_flag(hdev, HCI_SC_ENABLED);
+ } else {
+ hdev->features[1][0] &= ~LMP_HOST_SC;
+ hci_dev_clear_flag(hdev, HCI_SC_ENABLED);
+ }
+ }
+
+ return err;
+}
+
+int hci_write_ssp_mode_sync(struct hci_dev *hdev, u8 mode)
+{
+ int err;
+
+ if (!hci_dev_test_flag(hdev, HCI_SSP_ENABLED) ||
+ lmp_host_ssp_capable(hdev))
+ return 0;
+
+ if (!mode && hci_dev_test_flag(hdev, HCI_USE_DEBUG_KEYS)) {
+ __hci_cmd_sync_status(hdev, HCI_OP_WRITE_SSP_DEBUG_MODE,
+ sizeof(mode), &mode, HCI_CMD_TIMEOUT);
+ }
+
+ err = __hci_cmd_sync_status(hdev, HCI_OP_WRITE_SSP_MODE,
+ sizeof(mode), &mode, HCI_CMD_TIMEOUT);
+ if (err)
+ return err;
+
+ return hci_write_sc_support_sync(hdev, 0x01);
+}
+
+int hci_write_le_host_supported_sync(struct hci_dev *hdev, u8 le, u8 simul)
+{
+ struct hci_cp_write_le_host_supported cp;
+
+ if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED) ||
+ !lmp_bredr_capable(hdev))
+ return 0;
+
+ /* Check first if we already have the right host state
+ * (host features set)
+ */
+ if (le == lmp_host_le_capable(hdev) &&
+ simul == lmp_host_le_br_capable(hdev))
+ return 0;
+
+ memset(&cp, 0, sizeof(cp));
+
+ cp.le = le;
+ cp.simul = simul;
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_LE_HOST_SUPPORTED,
+ sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+}
+
+static int hci_powered_update_adv_sync(struct hci_dev *hdev)
+{
+ struct adv_info *adv, *tmp;
+ int err;
+
+ if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED))
+ return 0;
+
+ /* If RPA Resolution has not been enable yet it means the
+ * resolving list is empty and we should attempt to program the
+ * local IRK in order to support using own_addr_type
+ * ADDR_LE_DEV_RANDOM_RESOLVED (0x03).
+ */
+ if (!hci_dev_test_flag(hdev, HCI_LL_RPA_RESOLUTION)) {
+ hci_le_add_resolve_list_sync(hdev, NULL);
+ hci_le_set_addr_resolution_enable_sync(hdev, 0x01);
+ }
+
+ /* Make sure the controller has a good default for
+ * advertising data. This also applies to the case
+ * where BR/EDR was toggled during the AUTO_OFF phase.
+ */
+ if (hci_dev_test_flag(hdev, HCI_ADVERTISING) &&
+ list_empty(&hdev->adv_instances)) {
+ if (ext_adv_capable(hdev)) {
+ err = hci_setup_ext_adv_instance_sync(hdev, 0x00);
+ if (!err)
+ hci_update_scan_rsp_data_sync(hdev, 0x00);
+ } else {
+ err = hci_update_adv_data_sync(hdev, 0x00);
+ if (!err)
+ hci_update_scan_rsp_data_sync(hdev, 0x00);
+ }
+
+ if (hci_dev_test_flag(hdev, HCI_ADVERTISING))
+ hci_enable_advertising_sync(hdev);
+ }
+
+ /* Call for each tracked instance to be scheduled */
+ list_for_each_entry_safe(adv, tmp, &hdev->adv_instances, list)
+ hci_schedule_adv_instance_sync(hdev, adv->instance, true);
+
+ return 0;
+}
+
+static int hci_write_auth_enable_sync(struct hci_dev *hdev)
+{
+ u8 link_sec;
+
+ link_sec = hci_dev_test_flag(hdev, HCI_LINK_SECURITY);
+ if (link_sec == test_bit(HCI_AUTH, &hdev->flags))
+ return 0;
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_AUTH_ENABLE,
+ sizeof(link_sec), &link_sec,
+ HCI_CMD_TIMEOUT);
+}
+
+int hci_write_fast_connectable_sync(struct hci_dev *hdev, bool enable)
+{
+ struct hci_cp_write_page_scan_activity cp;
+ u8 type;
+ int err = 0;
+
+ if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED))
+ return 0;
+
+ if (hdev->hci_ver < BLUETOOTH_VER_1_2)
+ return 0;
+
+ memset(&cp, 0, sizeof(cp));
+
+ if (enable) {
+ type = PAGE_SCAN_TYPE_INTERLACED;
+
+ /* 160 msec page scan interval */
+ cp.interval = cpu_to_le16(0x0100);
+ } else {
+ type = hdev->def_page_scan_type;
+ cp.interval = cpu_to_le16(hdev->def_page_scan_int);
+ }
+
+ cp.window = cpu_to_le16(hdev->def_page_scan_window);
+
+ if (__cpu_to_le16(hdev->page_scan_interval) != cp.interval ||
+ __cpu_to_le16(hdev->page_scan_window) != cp.window) {
+ err = __hci_cmd_sync_status(hdev,
+ HCI_OP_WRITE_PAGE_SCAN_ACTIVITY,
+ sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+ if (err)
+ return err;
+ }
+
+ if (hdev->page_scan_type != type)
+ err = __hci_cmd_sync_status(hdev,
+ HCI_OP_WRITE_PAGE_SCAN_TYPE,
+ sizeof(type), &type,
+ HCI_CMD_TIMEOUT);
+
+ return err;
+}
+
+static bool disconnected_accept_list_entries(struct hci_dev *hdev)
+{
+ struct bdaddr_list *b;
+
+ list_for_each_entry(b, &hdev->accept_list, list) {
+ struct hci_conn *conn;
+
+ conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &b->bdaddr);
+ if (!conn)
+ return true;
+
+ if (conn->state != BT_CONNECTED && conn->state != BT_CONFIG)
+ return true;
+ }
+
+ return false;
+}
+
+static int hci_write_scan_enable_sync(struct hci_dev *hdev, u8 val)
+{
+ return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_SCAN_ENABLE,
+ sizeof(val), &val,
+ HCI_CMD_TIMEOUT);
+}
+
+int hci_update_scan_sync(struct hci_dev *hdev)
+{
+ u8 scan;
+
+ if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED))
+ return 0;
+
+ if (!hdev_is_powered(hdev))
+ return 0;
+
+ if (mgmt_powering_down(hdev))
+ return 0;
+
+ if (hdev->scanning_paused)
+ return 0;
+
+ if (hci_dev_test_flag(hdev, HCI_CONNECTABLE) ||
+ disconnected_accept_list_entries(hdev))
+ scan = SCAN_PAGE;
+ else
+ scan = SCAN_DISABLED;
+
+ if (hci_dev_test_flag(hdev, HCI_DISCOVERABLE))
+ scan |= SCAN_INQUIRY;
+
+ if (test_bit(HCI_PSCAN, &hdev->flags) == !!(scan & SCAN_PAGE) &&
+ test_bit(HCI_ISCAN, &hdev->flags) == !!(scan & SCAN_INQUIRY))
+ return 0;
+
+ return hci_write_scan_enable_sync(hdev, scan);
+}
+
+int hci_update_name_sync(struct hci_dev *hdev, const u8 *name)
+{
+ struct hci_cp_write_local_name cp;
+
+ memset(&cp, 0, sizeof(cp));
+
+ memcpy(cp.name, name, sizeof(cp.name));
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_LOCAL_NAME,
+ sizeof(cp), &cp,
+ HCI_CMD_TIMEOUT);
+}
+
+/* This function perform powered update HCI command sequence after the HCI init
+ * sequence which end up resetting all states, the sequence is as follows:
+ *
+ * HCI_SSP_ENABLED(Enable SSP)
+ * HCI_LE_ENABLED(Enable LE)
+ * HCI_LE_ENABLED(ll_privacy_capable(Add local IRK to Resolving List) ->
+ * Update adv data)
+ * Enable Authentication
+ * lmp_bredr_capable(Set Fast Connectable -> Set Scan Type -> Set Class ->
+ * Set Name -> Set EIR)
+ * HCI_FORCE_STATIC_ADDR | BDADDR_ANY && !HCI_BREDR_ENABLED (Set Static Address)
+ */
+int hci_powered_update_sync(struct hci_dev *hdev)
+{
+ int err;
+
+ /* Register the available SMP channels (BR/EDR and LE) only when
+ * successfully powering on the controller. This late
+ * registration is required so that LE SMP can clearly decide if
+ * the public address or static address is used.
+ */
+ smp_register(hdev);
+
+ err = hci_write_ssp_mode_sync(hdev, 0x01);
+ if (err)
+ return err;
+
+ err = hci_write_le_host_supported_sync(hdev, 0x01, 0x00);
+ if (err)
+ return err;
+
+ err = hci_powered_update_adv_sync(hdev);
+ if (err)
+ return err;
+
+ err = hci_write_auth_enable_sync(hdev);
+ if (err)
+ return err;
+
+ if (lmp_bredr_capable(hdev)) {
+ if (hci_dev_test_flag(hdev, HCI_FAST_CONNECTABLE))
+ hci_write_fast_connectable_sync(hdev, true);
+ else
+ hci_write_fast_connectable_sync(hdev, false);
+ hci_update_scan_sync(hdev);
+ hci_update_class_sync(hdev);
+ hci_update_name_sync(hdev, hdev->dev_name);
+ hci_update_eir_sync(hdev);
+ }
+
+ /* If forcing static address is in use or there is no public
+ * address use the static address as random address (but skip
+ * the HCI command if the current random address is already the
+ * static one.
+ *
+ * In case BR/EDR has been disabled on a dual-mode controller
+ * and a static address has been configured, then use that
+ * address instead of the public BR/EDR address.
+ */
+ if (hci_dev_test_flag(hdev, HCI_FORCE_STATIC_ADDR) ||
+ (!bacmp(&hdev->bdaddr, BDADDR_ANY) &&
+ !hci_dev_test_flag(hdev, HCI_BREDR_ENABLED))) {
+ if (bacmp(&hdev->static_addr, BDADDR_ANY))
+ return hci_set_random_addr_sync(hdev,
+ &hdev->static_addr);
+ }
+
+ return 0;
+}
+
+/**
+ * hci_dev_get_bd_addr_from_property - Get the Bluetooth Device Address
+ * (BD_ADDR) for a HCI device from
+ * a firmware node property.
+ * @hdev: The HCI device
+ *
+ * Search the firmware node for 'local-bd-address'.
+ *
+ * All-zero BD addresses are rejected, because those could be properties
+ * that exist in the firmware tables, but were not updated by the firmware. For
+ * example, the DTS could define 'local-bd-address', with zero BD addresses.
+ */
+static void hci_dev_get_bd_addr_from_property(struct hci_dev *hdev)
+{
+ struct fwnode_handle *fwnode = dev_fwnode(hdev->dev.parent);
+ bdaddr_t ba;
+ int ret;
+
+ ret = fwnode_property_read_u8_array(fwnode, "local-bd-address",
+ (u8 *)&ba, sizeof(ba));
+ if (ret < 0 || !bacmp(&ba, BDADDR_ANY))
+ return;
+
+ if (hci_test_quirk(hdev, HCI_QUIRK_BDADDR_PROPERTY_BROKEN))
+ baswap(&hdev->public_addr, &ba);
+ else
+ bacpy(&hdev->public_addr, &ba);
+}
+
+struct hci_init_stage {
+ int (*func)(struct hci_dev *hdev);
+};
+
+/* Run init stage NULL terminated function table */
+static int hci_init_stage_sync(struct hci_dev *hdev,
+ const struct hci_init_stage *stage)
+{
+ size_t i;
+
+ for (i = 0; stage[i].func; i++) {
+ int err;
+
+ err = stage[i].func(hdev);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+/* Read Local Version */
+static int hci_read_local_version_sync(struct hci_dev *hdev)
+{
+ return __hci_cmd_sync_status(hdev, HCI_OP_READ_LOCAL_VERSION,
+ 0, NULL, HCI_CMD_TIMEOUT);
+}
+
+/* Read BD Address */
+static int hci_read_bd_addr_sync(struct hci_dev *hdev)
+{
+ return __hci_cmd_sync_status(hdev, HCI_OP_READ_BD_ADDR,
+ 0, NULL, HCI_CMD_TIMEOUT);
+}
+
+#define HCI_INIT(_func) \
+{ \
+ .func = _func, \
+}
+
+static const struct hci_init_stage hci_init0[] = {
+ /* HCI_OP_READ_LOCAL_VERSION */
+ HCI_INIT(hci_read_local_version_sync),
+ /* HCI_OP_READ_BD_ADDR */
+ HCI_INIT(hci_read_bd_addr_sync),
+ {}
+};
+
+int hci_reset_sync(struct hci_dev *hdev)
+{
+ int err;
+
+ set_bit(HCI_RESET, &hdev->flags);
+
+ err = __hci_cmd_sync_status(hdev, HCI_OP_RESET, 0, NULL,
+ HCI_CMD_TIMEOUT);
+ if (err)
+ return err;
+
+ return 0;
+}
+
+static int hci_init0_sync(struct hci_dev *hdev)
+{
+ int err;
+
+ bt_dev_dbg(hdev, "");
+
+ /* Reset */
+ if (!hci_test_quirk(hdev, HCI_QUIRK_RESET_ON_CLOSE)) {
+ err = hci_reset_sync(hdev);
+ if (err)
+ return err;
+ }
+
+ return hci_init_stage_sync(hdev, hci_init0);
+}
+
+static int hci_unconf_init_sync(struct hci_dev *hdev)
+{
+ int err;
+
+ if (hci_test_quirk(hdev, HCI_QUIRK_RAW_DEVICE))
+ return 0;
+
+ err = hci_init0_sync(hdev);
+ if (err < 0)
+ return err;
+
+ if (hci_dev_test_flag(hdev, HCI_SETUP))
+ hci_debugfs_create_basic(hdev);
+
+ return 0;
+}
+
+/* Read Local Supported Features. */
+static int hci_read_local_features_sync(struct hci_dev *hdev)
+{
+ return __hci_cmd_sync_status(hdev, HCI_OP_READ_LOCAL_FEATURES,
+ 0, NULL, HCI_CMD_TIMEOUT);
+}
+
+/* BR Controller init stage 1 command sequence */
+static const struct hci_init_stage br_init1[] = {
+ /* HCI_OP_READ_LOCAL_FEATURES */
+ HCI_INIT(hci_read_local_features_sync),
+ /* HCI_OP_READ_LOCAL_VERSION */
+ HCI_INIT(hci_read_local_version_sync),
+ /* HCI_OP_READ_BD_ADDR */
+ HCI_INIT(hci_read_bd_addr_sync),
+ {}
+};
+
+/* Read Local Commands */
+static int hci_read_local_cmds_sync(struct hci_dev *hdev)
+{
+ /* All Bluetooth 1.2 and later controllers should support the
+ * HCI command for reading the local supported commands.
+ *
+ * Unfortunately some controllers indicate Bluetooth 1.2 support,
+ * but do not have support for this command. If that is the case,
+ * the driver can quirk the behavior and skip reading the local
+ * supported commands.
+ */
+ if (hdev->hci_ver > BLUETOOTH_VER_1_1 &&
+ !hci_test_quirk(hdev, HCI_QUIRK_BROKEN_LOCAL_COMMANDS))
+ return __hci_cmd_sync_status(hdev, HCI_OP_READ_LOCAL_COMMANDS,
+ 0, NULL, HCI_CMD_TIMEOUT);
+
+ return 0;
+}
+
+static int hci_init1_sync(struct hci_dev *hdev)
+{
+ int err;
+
+ bt_dev_dbg(hdev, "");
+
+ /* Reset */
+ if (!hci_test_quirk(hdev, HCI_QUIRK_RESET_ON_CLOSE)) {
+ err = hci_reset_sync(hdev);
+ if (err)
+ return err;
+ }
+
+ return hci_init_stage_sync(hdev, br_init1);
+}
+
+/* Read Buffer Size (ACL mtu, max pkt, etc.) */
+static int hci_read_buffer_size_sync(struct hci_dev *hdev)
+{
+ return __hci_cmd_sync_status(hdev, HCI_OP_READ_BUFFER_SIZE,
+ 0, NULL, HCI_CMD_TIMEOUT);
+}
+
+/* Read Class of Device */
+static int hci_read_dev_class_sync(struct hci_dev *hdev)
+{
+ return __hci_cmd_sync_status(hdev, HCI_OP_READ_CLASS_OF_DEV,
+ 0, NULL, HCI_CMD_TIMEOUT);
+}
+
+/* Read Local Name */
+static int hci_read_local_name_sync(struct hci_dev *hdev)
+{
+ return __hci_cmd_sync_status(hdev, HCI_OP_READ_LOCAL_NAME,
+ 0, NULL, HCI_CMD_TIMEOUT);
+}
+
+/* Read Voice Setting */
+static int hci_read_voice_setting_sync(struct hci_dev *hdev)
+{
+ if (!read_voice_setting_capable(hdev))
+ return 0;
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_READ_VOICE_SETTING,
+ 0, NULL, HCI_CMD_TIMEOUT);
+}
+
+/* Read Number of Supported IAC */
+static int hci_read_num_supported_iac_sync(struct hci_dev *hdev)
+{
+ return __hci_cmd_sync_status(hdev, HCI_OP_READ_NUM_SUPPORTED_IAC,
+ 0, NULL, HCI_CMD_TIMEOUT);
+}
+
+/* Read Current IAC LAP */
+static int hci_read_current_iac_lap_sync(struct hci_dev *hdev)
+{
+ return __hci_cmd_sync_status(hdev, HCI_OP_READ_CURRENT_IAC_LAP,
+ 0, NULL, HCI_CMD_TIMEOUT);
+}
+
+static int hci_set_event_filter_sync(struct hci_dev *hdev, u8 flt_type,
+ u8 cond_type, bdaddr_t *bdaddr,
+ u8 auto_accept)
+{
+ struct hci_cp_set_event_filter cp;
+
+ if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED))
+ return 0;
+
+ if (hci_test_quirk(hdev, HCI_QUIRK_BROKEN_FILTER_CLEAR_ALL))
+ return 0;
+
+ memset(&cp, 0, sizeof(cp));
+ cp.flt_type = flt_type;
+
+ if (flt_type != HCI_FLT_CLEAR_ALL) {
+ cp.cond_type = cond_type;
+ bacpy(&cp.addr_conn_flt.bdaddr, bdaddr);
+ cp.addr_conn_flt.auto_accept = auto_accept;
+ }
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_SET_EVENT_FLT,
+ flt_type == HCI_FLT_CLEAR_ALL ?
+ sizeof(cp.flt_type) : sizeof(cp), &cp,
+ HCI_CMD_TIMEOUT);
+}
+
+static int hci_clear_event_filter_sync(struct hci_dev *hdev)
+{
+ if (!hci_dev_test_flag(hdev, HCI_EVENT_FILTER_CONFIGURED))
+ return 0;
+
+ /* In theory the state machine should not reach here unless
+ * a hci_set_event_filter_sync() call succeeds, but we do
+ * the check both for parity and as a future reminder.
+ */
+ if (hci_test_quirk(hdev, HCI_QUIRK_BROKEN_FILTER_CLEAR_ALL))
+ return 0;
+
+ return hci_set_event_filter_sync(hdev, HCI_FLT_CLEAR_ALL, 0x00,
+ BDADDR_ANY, 0x00);
+}
+
+/* Connection accept timeout ~20 secs */
+static int hci_write_ca_timeout_sync(struct hci_dev *hdev)
+{
+ __le16 param = cpu_to_le16(0x7d00);
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_CA_TIMEOUT,
+ sizeof(param), &param, HCI_CMD_TIMEOUT);
+}
+
+/* Enable SCO flow control if supported */
+static int hci_write_sync_flowctl_sync(struct hci_dev *hdev)
+{
+ struct hci_cp_write_sync_flowctl cp;
+ int err;
+
+ /* Check if the controller supports SCO and HCI_OP_WRITE_SYNC_FLOWCTL */
+ if (!lmp_sco_capable(hdev) || !(hdev->commands[10] & BIT(4)) ||
+ !hci_test_quirk(hdev, HCI_QUIRK_SYNC_FLOWCTL_SUPPORTED))
+ return 0;
+
+ memset(&cp, 0, sizeof(cp));
+ cp.enable = 0x01;
+
+ err = __hci_cmd_sync_status(hdev, HCI_OP_WRITE_SYNC_FLOWCTL,
+ sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+ if (!err)
+ hci_dev_set_flag(hdev, HCI_SCO_FLOWCTL);
+
+ return err;
+}
+
+/* BR Controller init stage 2 command sequence */
+static const struct hci_init_stage br_init2[] = {
+ /* HCI_OP_READ_BUFFER_SIZE */
+ HCI_INIT(hci_read_buffer_size_sync),
+ /* HCI_OP_READ_CLASS_OF_DEV */
+ HCI_INIT(hci_read_dev_class_sync),
+ /* HCI_OP_READ_LOCAL_NAME */
+ HCI_INIT(hci_read_local_name_sync),
+ /* HCI_OP_READ_VOICE_SETTING */
+ HCI_INIT(hci_read_voice_setting_sync),
+ /* HCI_OP_READ_NUM_SUPPORTED_IAC */
+ HCI_INIT(hci_read_num_supported_iac_sync),
+ /* HCI_OP_READ_CURRENT_IAC_LAP */
+ HCI_INIT(hci_read_current_iac_lap_sync),
+ /* HCI_OP_SET_EVENT_FLT */
+ HCI_INIT(hci_clear_event_filter_sync),
+ /* HCI_OP_WRITE_CA_TIMEOUT */
+ HCI_INIT(hci_write_ca_timeout_sync),
+ /* HCI_OP_WRITE_SYNC_FLOWCTL */
+ HCI_INIT(hci_write_sync_flowctl_sync),
+ {}
+};
+
+static int hci_write_ssp_mode_1_sync(struct hci_dev *hdev)
+{
+ u8 mode = 0x01;
+
+ if (!lmp_ssp_capable(hdev) || !hci_dev_test_flag(hdev, HCI_SSP_ENABLED))
+ return 0;
+
+ /* When SSP is available, then the host features page
+ * should also be available as well. However some
+ * controllers list the max_page as 0 as long as SSP
+ * has not been enabled. To achieve proper debugging
+ * output, force the minimum max_page to 1 at least.
+ */
+ hdev->max_page = 0x01;
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_SSP_MODE,
+ sizeof(mode), &mode, HCI_CMD_TIMEOUT);
+}
+
+static int hci_write_eir_sync(struct hci_dev *hdev)
+{
+ struct hci_cp_write_eir cp;
+
+ if (!lmp_ssp_capable(hdev) || hci_dev_test_flag(hdev, HCI_SSP_ENABLED))
+ return 0;
+
+ memset(hdev->eir, 0, sizeof(hdev->eir));
+ memset(&cp, 0, sizeof(cp));
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_EIR, sizeof(cp), &cp,
+ HCI_CMD_TIMEOUT);
+}
+
+static int hci_write_inquiry_mode_sync(struct hci_dev *hdev)
+{
+ u8 mode;
+
+ if (!lmp_inq_rssi_capable(hdev) &&
+ !hci_test_quirk(hdev, HCI_QUIRK_FIXUP_INQUIRY_MODE))
+ return 0;
+
+ /* If Extended Inquiry Result events are supported, then
+ * they are clearly preferred over Inquiry Result with RSSI
+ * events.
+ */
+ mode = lmp_ext_inq_capable(hdev) ? 0x02 : 0x01;
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_INQUIRY_MODE,
+ sizeof(mode), &mode, HCI_CMD_TIMEOUT);
+}
+
+static int hci_read_inq_rsp_tx_power_sync(struct hci_dev *hdev)
+{
+ if (!lmp_inq_tx_pwr_capable(hdev))
+ return 0;
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_READ_INQ_RSP_TX_POWER,
+ 0, NULL, HCI_CMD_TIMEOUT);
+}
+
+static int hci_read_local_ext_features_sync(struct hci_dev *hdev, u8 page)
+{
+ struct hci_cp_read_local_ext_features cp;
+
+ if (!lmp_ext_feat_capable(hdev))
+ return 0;
+
+ memset(&cp, 0, sizeof(cp));
+ cp.page = page;
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_READ_LOCAL_EXT_FEATURES,
+ sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+}
+
+static int hci_read_local_ext_features_1_sync(struct hci_dev *hdev)
+{
+ return hci_read_local_ext_features_sync(hdev, 0x01);
+}
+
+/* HCI Controller init stage 2 command sequence */
+static const struct hci_init_stage hci_init2[] = {
+ /* HCI_OP_READ_LOCAL_COMMANDS */
+ HCI_INIT(hci_read_local_cmds_sync),
+ /* HCI_OP_WRITE_SSP_MODE */
+ HCI_INIT(hci_write_ssp_mode_1_sync),
+ /* HCI_OP_WRITE_EIR */
+ HCI_INIT(hci_write_eir_sync),
+ /* HCI_OP_WRITE_INQUIRY_MODE */
+ HCI_INIT(hci_write_inquiry_mode_sync),
+ /* HCI_OP_READ_INQ_RSP_TX_POWER */
+ HCI_INIT(hci_read_inq_rsp_tx_power_sync),
+ /* HCI_OP_READ_LOCAL_EXT_FEATURES */
+ HCI_INIT(hci_read_local_ext_features_1_sync),
+ /* HCI_OP_WRITE_AUTH_ENABLE */
+ HCI_INIT(hci_write_auth_enable_sync),
+ {}
+};
+
+/* Read LE Buffer Size */
+static int hci_le_read_buffer_size_sync(struct hci_dev *hdev)
+{
+ /* Use Read LE Buffer Size V2 if supported */
+ if (iso_capable(hdev) && hdev->commands[41] & 0x20)
+ return __hci_cmd_sync_status(hdev,
+ HCI_OP_LE_READ_BUFFER_SIZE_V2,
+ 0, NULL, HCI_CMD_TIMEOUT);
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_LE_READ_BUFFER_SIZE,
+ 0, NULL, HCI_CMD_TIMEOUT);
+}
+
+/* Read LE Local Supported Features */
+static int hci_le_read_local_features_sync(struct hci_dev *hdev)
+{
+ int err;
+
+ err = __hci_cmd_sync_status(hdev, HCI_OP_LE_READ_LOCAL_FEATURES,
+ 0, NULL, HCI_CMD_TIMEOUT);
+ if (err)
+ return err;
+
+ if (ll_ext_feature_capable(hdev) && hdev->commands[47] & BIT(2))
+ return __hci_cmd_sync_status(hdev,
+ HCI_OP_LE_READ_ALL_LOCAL_FEATURES,
+ 0, NULL, HCI_CMD_TIMEOUT);
+
+ return err;
+}
+
+/* Read LE Supported States */
+static int hci_le_read_supported_states_sync(struct hci_dev *hdev)
+{
+ return __hci_cmd_sync_status(hdev, HCI_OP_LE_READ_SUPPORTED_STATES,
+ 0, NULL, HCI_CMD_TIMEOUT);
+}
+
+/* LE Controller init stage 2 command sequence */
+static const struct hci_init_stage le_init2[] = {
+ /* HCI_OP_LE_READ_LOCAL_FEATURES */
+ HCI_INIT(hci_le_read_local_features_sync),
+ /* HCI_OP_LE_READ_BUFFER_SIZE */
+ HCI_INIT(hci_le_read_buffer_size_sync),
+ /* HCI_OP_LE_READ_SUPPORTED_STATES */
+ HCI_INIT(hci_le_read_supported_states_sync),
+ {}
+};
+
+static int hci_init2_sync(struct hci_dev *hdev)
+{
+ int err;
+
+ bt_dev_dbg(hdev, "");
+
+ err = hci_init_stage_sync(hdev, hci_init2);
+ if (err)
+ return err;
+
+ if (lmp_bredr_capable(hdev)) {
+ err = hci_init_stage_sync(hdev, br_init2);
+ if (err)
+ return err;
+ } else {
+ hci_dev_clear_flag(hdev, HCI_BREDR_ENABLED);
+ }
+
+ if (lmp_le_capable(hdev)) {
+ err = hci_init_stage_sync(hdev, le_init2);
+ if (err)
+ return err;
+ /* LE-only controllers have LE implicitly enabled */
+ if (!lmp_bredr_capable(hdev))
+ hci_dev_set_flag(hdev, HCI_LE_ENABLED);
+ }
+
+ return 0;
+}
+
+static int hci_set_event_mask_sync(struct hci_dev *hdev)
+{
+ /* The second byte is 0xff instead of 0x9f (two reserved bits
+ * disabled) since a Broadcom 1.2 dongle doesn't respond to the
+ * command otherwise.
+ */
+ u8 events[8] = { 0xff, 0xff, 0xfb, 0xff, 0x00, 0x00, 0x00, 0x00 };
+
+ /* CSR 1.1 dongles does not accept any bitfield so don't try to set
+ * any event mask for pre 1.2 devices.
+ */
+ if (hdev->hci_ver < BLUETOOTH_VER_1_2)
+ return 0;
+
+ if (lmp_bredr_capable(hdev)) {
+ events[4] |= 0x01; /* Flow Specification Complete */
+
+ /* Don't set Disconnect Complete and mode change when
+ * suspended as that would wakeup the host when disconnecting
+ * due to suspend.
+ */
+ if (hdev->suspended) {
+ events[0] &= 0xef;
+ events[2] &= 0xf7;
+ }
+ } else {
+ /* Use a different default for LE-only devices */
+ memset(events, 0, sizeof(events));
+ events[1] |= 0x20; /* Command Complete */
+ events[1] |= 0x40; /* Command Status */
+ events[1] |= 0x80; /* Hardware Error */
+
+ /* If the controller supports the Disconnect command, enable
+ * the corresponding event. In addition enable packet flow
+ * control related events.
+ */
+ if (hdev->commands[0] & 0x20) {
+ /* Don't set Disconnect Complete when suspended as that
+ * would wakeup the host when disconnecting due to
+ * suspend.
+ */
+ if (!hdev->suspended)
+ events[0] |= 0x10; /* Disconnection Complete */
+ events[2] |= 0x04; /* Number of Completed Packets */
+ events[3] |= 0x02; /* Data Buffer Overflow */
+ }
+
+ /* If the controller supports the Read Remote Version
+ * Information command, enable the corresponding event.
+ */
+ if (hdev->commands[2] & 0x80)
+ events[1] |= 0x08; /* Read Remote Version Information
+ * Complete
+ */
+
+ if (hdev->le_features[0] & HCI_LE_ENCRYPTION) {
+ events[0] |= 0x80; /* Encryption Change */
+ events[5] |= 0x80; /* Encryption Key Refresh Complete */
+ }
+ }
+
+ if (lmp_inq_rssi_capable(hdev) ||
+ hci_test_quirk(hdev, HCI_QUIRK_FIXUP_INQUIRY_MODE))
+ events[4] |= 0x02; /* Inquiry Result with RSSI */
+
+ if (lmp_ext_feat_capable(hdev))
+ events[4] |= 0x04; /* Read Remote Extended Features Complete */
+
+ if (lmp_esco_capable(hdev)) {
+ events[5] |= 0x08; /* Synchronous Connection Complete */
+ events[5] |= 0x10; /* Synchronous Connection Changed */
+ }
+
+ if (lmp_sniffsubr_capable(hdev))
+ events[5] |= 0x20; /* Sniff Subrating */
+
+ if (lmp_pause_enc_capable(hdev))
+ events[5] |= 0x80; /* Encryption Key Refresh Complete */
+
+ if (lmp_ext_inq_capable(hdev))
+ events[5] |= 0x40; /* Extended Inquiry Result */
+
+ if (lmp_no_flush_capable(hdev))
+ events[7] |= 0x01; /* Enhanced Flush Complete */
+
+ if (lmp_lsto_capable(hdev))
+ events[6] |= 0x80; /* Link Supervision Timeout Changed */
+
+ if (lmp_ssp_capable(hdev)) {
+ events[6] |= 0x01; /* IO Capability Request */
+ events[6] |= 0x02; /* IO Capability Response */
+ events[6] |= 0x04; /* User Confirmation Request */
+ events[6] |= 0x08; /* User Passkey Request */
+ events[6] |= 0x10; /* Remote OOB Data Request */
+ events[6] |= 0x20; /* Simple Pairing Complete */
+ events[7] |= 0x04; /* User Passkey Notification */
+ events[7] |= 0x08; /* Keypress Notification */
+ events[7] |= 0x10; /* Remote Host Supported
+ * Features Notification
+ */
+ }
+
+ if (lmp_le_capable(hdev))
+ events[7] |= 0x20; /* LE Meta-Event */
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_SET_EVENT_MASK,
+ sizeof(events), events, HCI_CMD_TIMEOUT);
+}
+
+static int hci_read_stored_link_key_sync(struct hci_dev *hdev)
+{
+ struct hci_cp_read_stored_link_key cp;
+
+ if (!(hdev->commands[6] & 0x20) ||
+ hci_test_quirk(hdev, HCI_QUIRK_BROKEN_STORED_LINK_KEY))
+ return 0;
+
+ memset(&cp, 0, sizeof(cp));
+ bacpy(&cp.bdaddr, BDADDR_ANY);
+ cp.read_all = 0x01;
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_READ_STORED_LINK_KEY,
+ sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+}
+
+static int hci_setup_link_policy_sync(struct hci_dev *hdev)
+{
+ struct hci_cp_write_def_link_policy cp;
+ u16 link_policy = 0;
+
+ if (!(hdev->commands[5] & 0x10))
+ return 0;
+
+ memset(&cp, 0, sizeof(cp));
+
+ if (lmp_rswitch_capable(hdev))
+ link_policy |= HCI_LP_RSWITCH;
+ if (lmp_hold_capable(hdev))
+ link_policy |= HCI_LP_HOLD;
+ if (lmp_sniff_capable(hdev))
+ link_policy |= HCI_LP_SNIFF;
+ if (lmp_park_capable(hdev))
+ link_policy |= HCI_LP_PARK;
+
+ cp.policy = cpu_to_le16(link_policy);
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_DEF_LINK_POLICY,
+ sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+}
+
+static int hci_read_page_scan_activity_sync(struct hci_dev *hdev)
+{
+ if (!(hdev->commands[8] & 0x01))
+ return 0;
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_READ_PAGE_SCAN_ACTIVITY,
+ 0, NULL, HCI_CMD_TIMEOUT);
+}
+
+static int hci_read_def_err_data_reporting_sync(struct hci_dev *hdev)
+{
+ if (!(hdev->commands[18] & 0x04) ||
+ !(hdev->features[0][6] & LMP_ERR_DATA_REPORTING) ||
+ hci_test_quirk(hdev, HCI_QUIRK_BROKEN_ERR_DATA_REPORTING))
+ return 0;
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_READ_DEF_ERR_DATA_REPORTING,
+ 0, NULL, HCI_CMD_TIMEOUT);
+}
+
+static int hci_read_page_scan_type_sync(struct hci_dev *hdev)
+{
+ /* Some older Broadcom based Bluetooth 1.2 controllers do not
+ * support the Read Page Scan Type command. Check support for
+ * this command in the bit mask of supported commands.
+ */
+ if (!(hdev->commands[13] & 0x01) ||
+ hci_test_quirk(hdev, HCI_QUIRK_BROKEN_READ_PAGE_SCAN_TYPE))
+ return 0;
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_READ_PAGE_SCAN_TYPE,
+ 0, NULL, HCI_CMD_TIMEOUT);
+}
+
+/* Read features beyond page 1 if available */
+static int hci_read_local_ext_features_all_sync(struct hci_dev *hdev)
+{
+ u8 page;
+ int err;
+
+ if (!lmp_ext_feat_capable(hdev))
+ return 0;
+
+ for (page = 2; page < HCI_MAX_PAGES && page <= hdev->max_page;
+ page++) {
+ err = hci_read_local_ext_features_sync(hdev, page);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+/* HCI Controller init stage 3 command sequence */
+static const struct hci_init_stage hci_init3[] = {
+ /* HCI_OP_SET_EVENT_MASK */
+ HCI_INIT(hci_set_event_mask_sync),
+ /* HCI_OP_READ_STORED_LINK_KEY */
+ HCI_INIT(hci_read_stored_link_key_sync),
+ /* HCI_OP_WRITE_DEF_LINK_POLICY */
+ HCI_INIT(hci_setup_link_policy_sync),
+ /* HCI_OP_READ_PAGE_SCAN_ACTIVITY */
+ HCI_INIT(hci_read_page_scan_activity_sync),
+ /* HCI_OP_READ_DEF_ERR_DATA_REPORTING */
+ HCI_INIT(hci_read_def_err_data_reporting_sync),
+ /* HCI_OP_READ_PAGE_SCAN_TYPE */
+ HCI_INIT(hci_read_page_scan_type_sync),
+ /* HCI_OP_READ_LOCAL_EXT_FEATURES */
+ HCI_INIT(hci_read_local_ext_features_all_sync),
+ {}
+};
+
+static int hci_le_set_event_mask_sync(struct hci_dev *hdev)
+{
+ u8 events[8];
+
+ if (!lmp_le_capable(hdev))
+ return 0;
+
+ memset(events, 0, sizeof(events));
+
+ if (hdev->le_features[0] & HCI_LE_ENCRYPTION)
+ events[0] |= 0x10; /* LE Long Term Key Request */
+
+ /* If controller supports the Connection Parameters Request
+ * Link Layer Procedure, enable the corresponding event.
+ */
+ if (hdev->le_features[0] & HCI_LE_CONN_PARAM_REQ_PROC)
+ /* LE Remote Connection Parameter Request */
+ events[0] |= 0x20;
+
+ /* If the controller supports the Data Length Extension
+ * feature, enable the corresponding event.
+ */
+ if (hdev->le_features[0] & HCI_LE_DATA_LEN_EXT)
+ events[0] |= 0x40; /* LE Data Length Change */
+
+ /* If the controller supports LL Privacy feature or LE Extended Adv,
+ * enable the corresponding event.
+ */
+ if (use_enhanced_conn_complete(hdev))
+ events[1] |= 0x02; /* LE Enhanced Connection Complete */
+
+ /* Mark Device Privacy if Privacy Mode is supported */
+ if (privacy_mode_capable(hdev))
+ hdev->conn_flags |= HCI_CONN_FLAG_DEVICE_PRIVACY;
+
+ /* Mark Address Resolution if LL Privacy is supported */
+ if (ll_privacy_capable(hdev))
+ hdev->conn_flags |= HCI_CONN_FLAG_ADDRESS_RESOLUTION;
+
+ /* Mark PAST if supported */
+ if (past_capable(hdev))
+ hdev->conn_flags |= HCI_CONN_FLAG_PAST;
+
+ /* If the controller supports Extended Scanner Filter
+ * Policies, enable the corresponding event.
+ */
+ if (hdev->le_features[0] & HCI_LE_EXT_SCAN_POLICY)
+ events[1] |= 0x04; /* LE Direct Advertising Report */
+
+ /* If the controller supports Channel Selection Algorithm #2
+ * feature, enable the corresponding event.
+ */
+ if (hdev->le_features[1] & HCI_LE_CHAN_SEL_ALG2)
+ events[2] |= 0x08; /* LE Channel Selection Algorithm */
+
+ /* If the controller supports the LE Set Scan Enable command,
+ * enable the corresponding advertising report event.
+ */
+ if (hdev->commands[26] & 0x08)
+ events[0] |= 0x02; /* LE Advertising Report */
+
+ /* If the controller supports the LE Create Connection
+ * command, enable the corresponding event.
+ */
+ if (hdev->commands[26] & 0x10)
+ events[0] |= 0x01; /* LE Connection Complete */
+
+ /* If the controller supports the LE Connection Update
+ * command, enable the corresponding event.
+ */
+ if (hdev->commands[27] & 0x04)
+ events[0] |= 0x04; /* LE Connection Update Complete */
+
+ /* If the controller supports the LE Read Remote Used Features
+ * command, enable the corresponding event.
+ */
+ if (hdev->commands[27] & 0x20)
+ /* LE Read Remote Used Features Complete */
+ events[0] |= 0x08;
+
+ /* If the controller supports the LE Read Local P-256
+ * Public Key command, enable the corresponding event.
+ */
+ if (hdev->commands[34] & 0x02)
+ /* LE Read Local P-256 Public Key Complete */
+ events[0] |= 0x80;
+
+ /* If the controller supports the LE Generate DHKey
+ * command, enable the corresponding event.
+ */
+ if (hdev->commands[34] & 0x04)
+ events[1] |= 0x01; /* LE Generate DHKey Complete */
+
+ /* If the controller supports the LE Set Default PHY or
+ * LE Set PHY commands, enable the corresponding event.
+ */
+ if (hdev->commands[35] & (0x20 | 0x40))
+ events[1] |= 0x08; /* LE PHY Update Complete */
+
+ /* If the controller supports LE Set Extended Scan Parameters
+ * and LE Set Extended Scan Enable commands, enable the
+ * corresponding event.
+ */
+ if (use_ext_scan(hdev))
+ events[1] |= 0x10; /* LE Extended Advertising Report */
+
+ /* If the controller supports the LE Extended Advertising
+ * command, enable the corresponding event.
+ */
+ if (ext_adv_capable(hdev))
+ events[2] |= 0x02; /* LE Advertising Set Terminated */
+
+ if (past_receiver_capable(hdev))
+ events[2] |= 0x80; /* LE PAST Received */
+
+ if (cis_capable(hdev)) {
+ events[3] |= 0x01; /* LE CIS Established */
+ if (cis_peripheral_capable(hdev))
+ events[3] |= 0x02; /* LE CIS Request */
+ }
+
+ if (bis_capable(hdev)) {
+ events[1] |= 0x20; /* LE PA Report */
+ events[1] |= 0x40; /* LE PA Sync Established */
+ events[3] |= 0x04; /* LE Create BIG Complete */
+ events[3] |= 0x08; /* LE Terminate BIG Complete */
+ events[3] |= 0x10; /* LE BIG Sync Established */
+ events[3] |= 0x20; /* LE BIG Sync Loss */
+ events[4] |= 0x02; /* LE BIG Info Advertising Report */
+ }
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_EVENT_MASK,
+ sizeof(events), events, HCI_CMD_TIMEOUT);
+}
+
+/* Read LE Advertising Channel TX Power */
+static int hci_le_read_adv_tx_power_sync(struct hci_dev *hdev)
+{
+ if ((hdev->commands[25] & 0x40) && !ext_adv_capable(hdev)) {
+ /* HCI TS spec forbids mixing of legacy and extended
+ * advertising commands wherein READ_ADV_TX_POWER is
+ * also included. So do not call it if extended adv
+ * is supported otherwise controller will return
+ * COMMAND_DISALLOWED for extended commands.
+ */
+ return __hci_cmd_sync_status(hdev,
+ HCI_OP_LE_READ_ADV_TX_POWER,
+ 0, NULL, HCI_CMD_TIMEOUT);
+ }
+
+ return 0;
+}
+
+/* Read LE Min/Max Tx Power*/
+static int hci_le_read_tx_power_sync(struct hci_dev *hdev)
+{
+ if (!(hdev->commands[38] & 0x80) ||
+ hci_test_quirk(hdev, HCI_QUIRK_BROKEN_READ_TRANSMIT_POWER))
+ return 0;
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_LE_READ_TRANSMIT_POWER,
+ 0, NULL, HCI_CMD_TIMEOUT);
+}
+
+/* Read LE Accept List Size */
+static int hci_le_read_accept_list_size_sync(struct hci_dev *hdev)
+{
+ if (!(hdev->commands[26] & 0x40))
+ return 0;
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_LE_READ_ACCEPT_LIST_SIZE,
+ 0, NULL, HCI_CMD_TIMEOUT);
+}
+
+/* Read LE Resolving List Size */
+static int hci_le_read_resolv_list_size_sync(struct hci_dev *hdev)
+{
+ if (!(hdev->commands[34] & 0x40))
+ return 0;
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_LE_READ_RESOLV_LIST_SIZE,
+ 0, NULL, HCI_CMD_TIMEOUT);
+}
+
+/* Clear LE Resolving List */
+static int hci_le_clear_resolv_list_sync(struct hci_dev *hdev)
+{
+ if (!(hdev->commands[34] & 0x20))
+ return 0;
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_LE_CLEAR_RESOLV_LIST, 0, NULL,
+ HCI_CMD_TIMEOUT);
+}
+
+/* Set RPA timeout */
+static int hci_le_set_rpa_timeout_sync(struct hci_dev *hdev)
+{
+ __le16 timeout = cpu_to_le16(hdev->rpa_timeout);
+
+ if (!(hdev->commands[35] & 0x04) ||
+ hci_test_quirk(hdev, HCI_QUIRK_BROKEN_SET_RPA_TIMEOUT))
+ return 0;
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_RPA_TIMEOUT,
+ sizeof(timeout), &timeout,
+ HCI_CMD_TIMEOUT);
+}
+
+/* Read LE Maximum Data Length */
+static int hci_le_read_max_data_len_sync(struct hci_dev *hdev)
+{
+ if (!(hdev->le_features[0] & HCI_LE_DATA_LEN_EXT))
+ return 0;
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_LE_READ_MAX_DATA_LEN, 0, NULL,
+ HCI_CMD_TIMEOUT);
+}
+
+/* Read LE Suggested Default Data Length */
+static int hci_le_read_def_data_len_sync(struct hci_dev *hdev)
+{
+ if (!(hdev->le_features[0] & HCI_LE_DATA_LEN_EXT))
+ return 0;
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_LE_READ_DEF_DATA_LEN, 0, NULL,
+ HCI_CMD_TIMEOUT);
+}
+
+/* Read LE Number of Supported Advertising Sets */
+static int hci_le_read_num_support_adv_sets_sync(struct hci_dev *hdev)
+{
+ if (!ext_adv_capable(hdev))
+ return 0;
+
+ return __hci_cmd_sync_status(hdev,
+ HCI_OP_LE_READ_NUM_SUPPORTED_ADV_SETS,
+ 0, NULL, HCI_CMD_TIMEOUT);
+}
+
+/* Write LE Host Supported */
+static int hci_set_le_support_sync(struct hci_dev *hdev)
+{
+ struct hci_cp_write_le_host_supported cp;
+
+ /* LE-only devices do not support explicit enablement */
+ if (!lmp_bredr_capable(hdev))
+ return 0;
+
+ memset(&cp, 0, sizeof(cp));
+
+ if (hci_dev_test_flag(hdev, HCI_LE_ENABLED)) {
+ cp.le = 0x01;
+ cp.simul = 0x00;
+ }
+
+ if (cp.le == lmp_host_le_capable(hdev))
+ return 0;
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_LE_HOST_SUPPORTED,
+ sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+}
+
+/* LE Set Host Feature */
+static int hci_le_set_host_feature_sync(struct hci_dev *hdev)
+{
+ struct hci_cp_le_set_host_feature cp;
+
+ if (!iso_capable(hdev))
+ return 0;
+
+ memset(&cp, 0, sizeof(cp));
+
+ /* Connected Isochronous Channels (Host Support) */
+ cp.bit_number = 32;
+ cp.bit_value = iso_enabled(hdev) ? 0x01 : 0x00;
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_HOST_FEATURE,
+ sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+}
+
+/* LE Controller init stage 3 command sequence */
+static const struct hci_init_stage le_init3[] = {
+ /* HCI_OP_LE_SET_EVENT_MASK */
+ HCI_INIT(hci_le_set_event_mask_sync),
+ /* HCI_OP_LE_READ_ADV_TX_POWER */
+ HCI_INIT(hci_le_read_adv_tx_power_sync),
+ /* HCI_OP_LE_READ_TRANSMIT_POWER */
+ HCI_INIT(hci_le_read_tx_power_sync),
+ /* HCI_OP_LE_READ_ACCEPT_LIST_SIZE */
+ HCI_INIT(hci_le_read_accept_list_size_sync),
+ /* HCI_OP_LE_CLEAR_ACCEPT_LIST */
+ HCI_INIT(hci_le_clear_accept_list_sync),
+ /* HCI_OP_LE_READ_RESOLV_LIST_SIZE */
+ HCI_INIT(hci_le_read_resolv_list_size_sync),
+ /* HCI_OP_LE_CLEAR_RESOLV_LIST */
+ HCI_INIT(hci_le_clear_resolv_list_sync),
+ /* HCI_OP_LE_SET_RPA_TIMEOUT */
+ HCI_INIT(hci_le_set_rpa_timeout_sync),
+ /* HCI_OP_LE_READ_MAX_DATA_LEN */
+ HCI_INIT(hci_le_read_max_data_len_sync),
+ /* HCI_OP_LE_READ_DEF_DATA_LEN */
+ HCI_INIT(hci_le_read_def_data_len_sync),
+ /* HCI_OP_LE_READ_NUM_SUPPORTED_ADV_SETS */
+ HCI_INIT(hci_le_read_num_support_adv_sets_sync),
+ /* HCI_OP_WRITE_LE_HOST_SUPPORTED */
+ HCI_INIT(hci_set_le_support_sync),
+ /* HCI_OP_LE_SET_HOST_FEATURE */
+ HCI_INIT(hci_le_set_host_feature_sync),
+ {}
+};
+
+static int hci_init3_sync(struct hci_dev *hdev)
+{
+ int err;
+
+ bt_dev_dbg(hdev, "");
+
+ err = hci_init_stage_sync(hdev, hci_init3);
+ if (err)
+ return err;
+
+ if (lmp_le_capable(hdev))
+ return hci_init_stage_sync(hdev, le_init3);
+
+ return 0;
+}
+
+static int hci_delete_stored_link_key_sync(struct hci_dev *hdev)
+{
+ struct hci_cp_delete_stored_link_key cp;
+
+ /* Some Broadcom based Bluetooth controllers do not support the
+ * Delete Stored Link Key command. They are clearly indicating its
+ * absence in the bit mask of supported commands.
+ *
+ * Check the supported commands and only if the command is marked
+ * as supported send it. If not supported assume that the controller
+ * does not have actual support for stored link keys which makes this
+ * command redundant anyway.
+ *
+ * Some controllers indicate that they support handling deleting
+ * stored link keys, but they don't. The quirk lets a driver
+ * just disable this command.
+ */
+ if (!(hdev->commands[6] & 0x80) ||
+ hci_test_quirk(hdev, HCI_QUIRK_BROKEN_STORED_LINK_KEY))
+ return 0;
+
+ memset(&cp, 0, sizeof(cp));
+ bacpy(&cp.bdaddr, BDADDR_ANY);
+ cp.delete_all = 0x01;
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_DELETE_STORED_LINK_KEY,
+ sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+}
+
+static int hci_set_event_mask_page_2_sync(struct hci_dev *hdev)
+{
+ u8 events[8] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
+ bool changed = false;
+
+ /* Set event mask page 2 if the HCI command for it is supported */
+ if (!(hdev->commands[22] & 0x04))
+ return 0;
+
+ /* If Connectionless Peripheral Broadcast central role is supported
+ * enable all necessary events for it.
+ */
+ if (lmp_cpb_central_capable(hdev)) {
+ events[1] |= 0x40; /* Triggered Clock Capture */
+ events[1] |= 0x80; /* Synchronization Train Complete */
+ events[2] |= 0x08; /* Truncated Page Complete */
+ events[2] |= 0x20; /* CPB Channel Map Change */
+ changed = true;
+ }
+
+ /* If Connectionless Peripheral Broadcast peripheral role is supported
+ * enable all necessary events for it.
+ */
+ if (lmp_cpb_peripheral_capable(hdev)) {
+ events[2] |= 0x01; /* Synchronization Train Received */
+ events[2] |= 0x02; /* CPB Receive */
+ events[2] |= 0x04; /* CPB Timeout */
+ events[2] |= 0x10; /* Peripheral Page Response Timeout */
+ changed = true;
+ }
+
+ /* Enable Authenticated Payload Timeout Expired event if supported */
+ if (lmp_ping_capable(hdev) || hdev->le_features[0] & HCI_LE_PING) {
+ events[2] |= 0x80;
+ changed = true;
+ }
+
+ /* Some Broadcom based controllers indicate support for Set Event
+ * Mask Page 2 command, but then actually do not support it. Since
+ * the default value is all bits set to zero, the command is only
+ * required if the event mask has to be changed. In case no change
+ * to the event mask is needed, skip this command.
+ */
+ if (!changed)
+ return 0;
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_SET_EVENT_MASK_PAGE_2,
+ sizeof(events), events, HCI_CMD_TIMEOUT);
+}
+
+/* Read local codec list if the HCI command is supported */
+static int hci_read_local_codecs_sync(struct hci_dev *hdev)
+{
+ if (hdev->commands[45] & 0x04)
+ hci_read_supported_codecs_v2(hdev);
+ else if (hdev->commands[29] & 0x20)
+ hci_read_supported_codecs(hdev);
+
+ return 0;
+}
+
+/* Read local pairing options if the HCI command is supported */
+static int hci_read_local_pairing_opts_sync(struct hci_dev *hdev)
+{
+ if (!(hdev->commands[41] & 0x08))
+ return 0;
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_READ_LOCAL_PAIRING_OPTS,
+ 0, NULL, HCI_CMD_TIMEOUT);
+}
+
+/* Get MWS transport configuration if the HCI command is supported */
+static int hci_get_mws_transport_config_sync(struct hci_dev *hdev)
+{
+ if (!mws_transport_config_capable(hdev))
+ return 0;
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_GET_MWS_TRANSPORT_CONFIG,
+ 0, NULL, HCI_CMD_TIMEOUT);
+}
+
+/* Check for Synchronization Train support */
+static int hci_read_sync_train_params_sync(struct hci_dev *hdev)
+{
+ if (!lmp_sync_train_capable(hdev))
+ return 0;
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_READ_SYNC_TRAIN_PARAMS,
+ 0, NULL, HCI_CMD_TIMEOUT);
+}
+
+/* Enable Secure Connections if supported and configured */
+static int hci_write_sc_support_1_sync(struct hci_dev *hdev)
+{
+ u8 support = 0x01;
+
+ if (!hci_dev_test_flag(hdev, HCI_SSP_ENABLED) ||
+ !bredr_sc_enabled(hdev))
+ return 0;
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_SC_SUPPORT,
+ sizeof(support), &support,
+ HCI_CMD_TIMEOUT);
+}
+
+/* Set erroneous data reporting if supported to the wideband speech
+ * setting value
+ */
+static int hci_set_err_data_report_sync(struct hci_dev *hdev)
+{
+ struct hci_cp_write_def_err_data_reporting cp;
+ bool enabled = hci_dev_test_flag(hdev, HCI_WIDEBAND_SPEECH_ENABLED);
+
+ if (!(hdev->commands[18] & 0x08) ||
+ !(hdev->features[0][6] & LMP_ERR_DATA_REPORTING) ||
+ hci_test_quirk(hdev, HCI_QUIRK_BROKEN_ERR_DATA_REPORTING))
+ return 0;
+
+ if (enabled == hdev->err_data_reporting)
+ return 0;
+
+ memset(&cp, 0, sizeof(cp));
+ cp.err_data_reporting = enabled ? ERR_DATA_REPORTING_ENABLED :
+ ERR_DATA_REPORTING_DISABLED;
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_DEF_ERR_DATA_REPORTING,
+ sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+}
+
+static const struct hci_init_stage hci_init4[] = {
+ /* HCI_OP_DELETE_STORED_LINK_KEY */
+ HCI_INIT(hci_delete_stored_link_key_sync),
+ /* HCI_OP_SET_EVENT_MASK_PAGE_2 */
+ HCI_INIT(hci_set_event_mask_page_2_sync),
+ /* HCI_OP_READ_LOCAL_CODECS */
+ HCI_INIT(hci_read_local_codecs_sync),
+ /* HCI_OP_READ_LOCAL_PAIRING_OPTS */
+ HCI_INIT(hci_read_local_pairing_opts_sync),
+ /* HCI_OP_GET_MWS_TRANSPORT_CONFIG */
+ HCI_INIT(hci_get_mws_transport_config_sync),
+ /* HCI_OP_READ_SYNC_TRAIN_PARAMS */
+ HCI_INIT(hci_read_sync_train_params_sync),
+ /* HCI_OP_WRITE_SC_SUPPORT */
+ HCI_INIT(hci_write_sc_support_1_sync),
+ /* HCI_OP_WRITE_DEF_ERR_DATA_REPORTING */
+ HCI_INIT(hci_set_err_data_report_sync),
+ {}
+};
+
+/* Set Suggested Default Data Length to maximum if supported */
+static int hci_le_set_write_def_data_len_sync(struct hci_dev *hdev)
+{
+ struct hci_cp_le_write_def_data_len cp;
+
+ if (!(hdev->le_features[0] & HCI_LE_DATA_LEN_EXT))
+ return 0;
+
+ memset(&cp, 0, sizeof(cp));
+ cp.tx_len = cpu_to_le16(hdev->le_max_tx_len);
+ cp.tx_time = cpu_to_le16(hdev->le_max_tx_time);
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_LE_WRITE_DEF_DATA_LEN,
+ sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+}
+
+/* Set Default PHY parameters if command is supported, enables all supported
+ * PHYs according to the LE Features bits.
+ */
+static int hci_le_set_default_phy_sync(struct hci_dev *hdev)
+{
+ struct hci_cp_le_set_default_phy cp;
+
+ if (!(hdev->commands[35] & 0x20)) {
+ /* If the command is not supported it means only 1M PHY is
+ * supported.
+ */
+ hdev->le_tx_def_phys = HCI_LE_SET_PHY_1M;
+ hdev->le_rx_def_phys = HCI_LE_SET_PHY_1M;
+ return 0;
+ }
+
+ memset(&cp, 0, sizeof(cp));
+ cp.all_phys = 0x00;
+ cp.tx_phys = HCI_LE_SET_PHY_1M;
+ cp.rx_phys = HCI_LE_SET_PHY_1M;
+
+ /* Enables 2M PHY if supported */
+ if (le_2m_capable(hdev)) {
+ cp.tx_phys |= HCI_LE_SET_PHY_2M;
+ cp.rx_phys |= HCI_LE_SET_PHY_2M;
+ }
+
+ /* Enables Coded PHY if supported */
+ if (le_coded_capable(hdev)) {
+ cp.tx_phys |= HCI_LE_SET_PHY_CODED;
+ cp.rx_phys |= HCI_LE_SET_PHY_CODED;
+ }
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_DEFAULT_PHY,
+ sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+}
+
+static const struct hci_init_stage le_init4[] = {
+ /* HCI_OP_LE_WRITE_DEF_DATA_LEN */
+ HCI_INIT(hci_le_set_write_def_data_len_sync),
+ /* HCI_OP_LE_SET_DEFAULT_PHY */
+ HCI_INIT(hci_le_set_default_phy_sync),
+ {}
+};
+
+static int hci_init4_sync(struct hci_dev *hdev)
+{
+ int err;
+
+ bt_dev_dbg(hdev, "");
+
+ err = hci_init_stage_sync(hdev, hci_init4);
+ if (err)
+ return err;
+
+ if (lmp_le_capable(hdev))
+ return hci_init_stage_sync(hdev, le_init4);
+
+ return 0;
+}
+
+static int hci_init_sync(struct hci_dev *hdev)
+{
+ int err;
+
+ err = hci_init1_sync(hdev);
+ if (err < 0)
+ return err;
+
+ if (hci_dev_test_flag(hdev, HCI_SETUP))
+ hci_debugfs_create_basic(hdev);
+
+ err = hci_init2_sync(hdev);
+ if (err < 0)
+ return err;
+
+ err = hci_init3_sync(hdev);
+ if (err < 0)
+ return err;
+
+ err = hci_init4_sync(hdev);
+ if (err < 0)
+ return err;
+
+ /* This function is only called when the controller is actually in
+ * configured state. When the controller is marked as unconfigured,
+ * this initialization procedure is not run.
+ *
+ * It means that it is possible that a controller runs through its
+ * setup phase and then discovers missing settings. If that is the
+ * case, then this function will not be called. It then will only
+ * be called during the config phase.
+ *
+ * So only when in setup phase or config phase, create the debugfs
+ * entries and register the SMP channels.
+ */
+ if (!hci_dev_test_flag(hdev, HCI_SETUP) &&
+ !hci_dev_test_flag(hdev, HCI_CONFIG))
+ return 0;
+
+ if (hci_dev_test_and_set_flag(hdev, HCI_DEBUGFS_CREATED))
+ return 0;
+
+ hci_debugfs_create_common(hdev);
+
+ if (lmp_bredr_capable(hdev))
+ hci_debugfs_create_bredr(hdev);
+
+ if (lmp_le_capable(hdev))
+ hci_debugfs_create_le(hdev);
+
+ return 0;
+}
+
+#define HCI_QUIRK_BROKEN(_quirk, _desc) { HCI_QUIRK_BROKEN_##_quirk, _desc }
+
+static const struct {
+ unsigned long quirk;
+ const char *desc;
+} hci_broken_table[] = {
+ HCI_QUIRK_BROKEN(LOCAL_COMMANDS,
+ "HCI Read Local Supported Commands not supported"),
+ HCI_QUIRK_BROKEN(STORED_LINK_KEY,
+ "HCI Delete Stored Link Key command is advertised, "
+ "but not supported."),
+ HCI_QUIRK_BROKEN(ERR_DATA_REPORTING,
+ "HCI Read Default Erroneous Data Reporting command is "
+ "advertised, but not supported."),
+ HCI_QUIRK_BROKEN(READ_TRANSMIT_POWER,
+ "HCI Read Transmit Power Level command is advertised, "
+ "but not supported."),
+ HCI_QUIRK_BROKEN(FILTER_CLEAR_ALL,
+ "HCI Set Event Filter command not supported."),
+ HCI_QUIRK_BROKEN(ENHANCED_SETUP_SYNC_CONN,
+ "HCI Enhanced Setup Synchronous Connection command is "
+ "advertised, but not supported."),
+ HCI_QUIRK_BROKEN(SET_RPA_TIMEOUT,
+ "HCI LE Set Random Private Address Timeout command is "
+ "advertised, but not supported."),
+ HCI_QUIRK_BROKEN(EXT_CREATE_CONN,
+ "HCI LE Extended Create Connection command is "
+ "advertised, but not supported."),
+ HCI_QUIRK_BROKEN(WRITE_AUTH_PAYLOAD_TIMEOUT,
+ "HCI WRITE AUTH PAYLOAD TIMEOUT command leads "
+ "to unexpected SMP errors when pairing "
+ "and will not be used."),
+ HCI_QUIRK_BROKEN(LE_CODED,
+ "HCI LE Coded PHY feature bit is set, "
+ "but its usage is not supported.")
+};
+
+/* This function handles hdev setup stage:
+ *
+ * Calls hdev->setup
+ * Setup address if HCI_QUIRK_USE_BDADDR_PROPERTY is set.
+ */
+static int hci_dev_setup_sync(struct hci_dev *hdev)
+{
+ int ret = 0;
+ bool invalid_bdaddr;
+ size_t i;
+
+ if (!hci_dev_test_flag(hdev, HCI_SETUP) &&
+ !hci_test_quirk(hdev, HCI_QUIRK_NON_PERSISTENT_SETUP))
+ return 0;
+
+ bt_dev_dbg(hdev, "");
+
+ hci_sock_dev_event(hdev, HCI_DEV_SETUP);
+
+ if (hdev->setup)
+ ret = hdev->setup(hdev);
+
+ for (i = 0; i < ARRAY_SIZE(hci_broken_table); i++) {
+ if (hci_test_quirk(hdev, hci_broken_table[i].quirk))
+ bt_dev_warn(hdev, "%s", hci_broken_table[i].desc);
+ }
+
+ /* The transport driver can set the quirk to mark the
+ * BD_ADDR invalid before creating the HCI device or in
+ * its setup callback.
+ */
+ invalid_bdaddr = hci_test_quirk(hdev, HCI_QUIRK_INVALID_BDADDR) ||
+ hci_test_quirk(hdev, HCI_QUIRK_USE_BDADDR_PROPERTY);
+ if (!ret) {
+ if (hci_test_quirk(hdev, HCI_QUIRK_USE_BDADDR_PROPERTY) &&
+ !bacmp(&hdev->public_addr, BDADDR_ANY))
+ hci_dev_get_bd_addr_from_property(hdev);
+
+ if (invalid_bdaddr && bacmp(&hdev->public_addr, BDADDR_ANY) &&
+ hdev->set_bdaddr) {
+ ret = hdev->set_bdaddr(hdev, &hdev->public_addr);
+ if (!ret)
+ invalid_bdaddr = false;
+ }
+ }
+
+ /* The transport driver can set these quirks before
+ * creating the HCI device or in its setup callback.
+ *
+ * For the invalid BD_ADDR quirk it is possible that
+ * it becomes a valid address if the bootloader does
+ * provide it (see above).
+ *
+ * In case any of them is set, the controller has to
+ * start up as unconfigured.
+ */
+ if (hci_test_quirk(hdev, HCI_QUIRK_EXTERNAL_CONFIG) ||
+ invalid_bdaddr)
+ hci_dev_set_flag(hdev, HCI_UNCONFIGURED);
+
+ /* For an unconfigured controller it is required to
+ * read at least the version information provided by
+ * the Read Local Version Information command.
+ *
+ * If the set_bdaddr driver callback is provided, then
+ * also the original Bluetooth public device address
+ * will be read using the Read BD Address command.
+ */
+ if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED))
+ return hci_unconf_init_sync(hdev);
+
+ return ret;
+}
+
+/* This function handles hdev init stage:
+ *
+ * Calls hci_dev_setup_sync to perform setup stage
+ * Calls hci_init_sync to perform HCI command init sequence
+ */
+static int hci_dev_init_sync(struct hci_dev *hdev)
+{
+ int ret;
+
+ bt_dev_dbg(hdev, "");
+
+ atomic_set(&hdev->cmd_cnt, 1);
+ set_bit(HCI_INIT, &hdev->flags);
+
+ ret = hci_dev_setup_sync(hdev);
+
+ if (hci_dev_test_flag(hdev, HCI_CONFIG)) {
+ /* If public address change is configured, ensure that
+ * the address gets programmed. If the driver does not
+ * support changing the public address, fail the power
+ * on procedure.
+ */
+ if (bacmp(&hdev->public_addr, BDADDR_ANY) &&
+ hdev->set_bdaddr)
+ ret = hdev->set_bdaddr(hdev, &hdev->public_addr);
+ else
+ ret = -EADDRNOTAVAIL;
+ }
+
+ if (!ret) {
+ if (!hci_dev_test_flag(hdev, HCI_UNCONFIGURED) &&
+ !hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) {
+ ret = hci_init_sync(hdev);
+ if (!ret && hdev->post_init)
+ ret = hdev->post_init(hdev);
+ }
+ }
+
+ /* If the HCI Reset command is clearing all diagnostic settings,
+ * then they need to be reprogrammed after the init procedure
+ * completed.
+ */
+ if (hci_test_quirk(hdev, HCI_QUIRK_NON_PERSISTENT_DIAG) &&
+ !hci_dev_test_flag(hdev, HCI_USER_CHANNEL) &&
+ hci_dev_test_flag(hdev, HCI_VENDOR_DIAG) && hdev->set_diag)
+ ret = hdev->set_diag(hdev, true);
+
+ if (!hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) {
+ msft_do_open(hdev);
+ aosp_do_open(hdev);
+ }
+
+ clear_bit(HCI_INIT, &hdev->flags);
+
+ return ret;
+}
+
+int hci_dev_open_sync(struct hci_dev *hdev)
+{
+ int ret;
+
+ bt_dev_dbg(hdev, "");
+
+ if (hci_dev_test_flag(hdev, HCI_UNREGISTER)) {
+ ret = -ENODEV;
+ goto done;
+ }
+
+ if (!hci_dev_test_flag(hdev, HCI_SETUP) &&
+ !hci_dev_test_flag(hdev, HCI_CONFIG)) {
+ /* Check for rfkill but allow the HCI setup stage to
+ * proceed (which in itself doesn't cause any RF activity).
+ */
+ if (hci_dev_test_flag(hdev, HCI_RFKILLED)) {
+ ret = -ERFKILL;
+ goto done;
+ }
+
+ /* Check for valid public address or a configured static
+ * random address, but let the HCI setup proceed to
+ * be able to determine if there is a public address
+ * or not.
+ *
+ * In case of user channel usage, it is not important
+ * if a public address or static random address is
+ * available.
+ */
+ if (!hci_dev_test_flag(hdev, HCI_USER_CHANNEL) &&
+ !bacmp(&hdev->bdaddr, BDADDR_ANY) &&
+ !bacmp(&hdev->static_addr, BDADDR_ANY)) {
+ ret = -EADDRNOTAVAIL;
+ goto done;
+ }
+ }
+
+ if (test_bit(HCI_UP, &hdev->flags)) {
+ ret = -EALREADY;
+ goto done;
+ }
+
+ if (hdev->open(hdev)) {
+ ret = -EIO;
+ goto done;
+ }
+
+ hci_devcd_reset(hdev);
+
+ set_bit(HCI_RUNNING, &hdev->flags);
+ hci_sock_dev_event(hdev, HCI_DEV_OPEN);
+
+ ret = hci_dev_init_sync(hdev);
+ if (!ret) {
+ hci_dev_hold(hdev);
+ hci_dev_set_flag(hdev, HCI_RPA_EXPIRED);
+ hci_adv_instances_set_rpa_expired(hdev, true);
+ set_bit(HCI_UP, &hdev->flags);
+ hci_sock_dev_event(hdev, HCI_DEV_UP);
+ hci_leds_update_powered(hdev, true);
+ if (!hci_dev_test_flag(hdev, HCI_SETUP) &&
+ !hci_dev_test_flag(hdev, HCI_CONFIG) &&
+ !hci_dev_test_flag(hdev, HCI_UNCONFIGURED) &&
+ !hci_dev_test_flag(hdev, HCI_USER_CHANNEL) &&
+ hci_dev_test_flag(hdev, HCI_MGMT)) {
+ ret = hci_powered_update_sync(hdev);
+ mgmt_power_on(hdev, ret);
+ }
+ } else {
+ /* Init failed, cleanup */
+ flush_work(&hdev->tx_work);
+
+ /* Since hci_rx_work() is possible to awake new cmd_work
+ * it should be flushed first to avoid unexpected call of
+ * hci_cmd_work()
+ */
+ flush_work(&hdev->rx_work);
+ flush_work(&hdev->cmd_work);
+
+ skb_queue_purge(&hdev->cmd_q);
+ skb_queue_purge(&hdev->rx_q);
+
+ if (hdev->flush)
+ hdev->flush(hdev);
+
+ if (hdev->sent_cmd) {
+ cancel_delayed_work_sync(&hdev->cmd_timer);
+ kfree_skb(hdev->sent_cmd);
+ hdev->sent_cmd = NULL;
+ }
+
+ if (hdev->req_skb) {
+ kfree_skb(hdev->req_skb);
+ hdev->req_skb = NULL;
+ }
+
+ clear_bit(HCI_RUNNING, &hdev->flags);
+ hci_sock_dev_event(hdev, HCI_DEV_CLOSE);
+
+ hdev->close(hdev);
+ hdev->flags &= BIT(HCI_RAW);
+ }
+
+done:
+ return ret;
+}
+
+/* This function requires the caller holds hdev->lock */
+static void hci_pend_le_actions_clear(struct hci_dev *hdev)
+{
+ struct hci_conn_params *p;
+
+ list_for_each_entry(p, &hdev->le_conn_params, list) {
+ hci_pend_le_list_del_init(p);
+ if (p->conn) {
+ hci_conn_drop(p->conn);
+ hci_conn_put(p->conn);
+ p->conn = NULL;
+ }
+ }
+
+ BT_DBG("All LE pending actions cleared");
+}
+
+static int hci_dev_shutdown(struct hci_dev *hdev)
+{
+ int err = 0;
+ /* Similar to how we first do setup and then set the exclusive access
+ * bit for userspace, we must first unset userchannel and then clean up.
+ * Otherwise, the kernel can't properly use the hci channel to clean up
+ * the controller (some shutdown routines require sending additional
+ * commands to the controller for example).
+ */
+ bool was_userchannel =
+ hci_dev_test_and_clear_flag(hdev, HCI_USER_CHANNEL);
+
+ if (!hci_dev_test_flag(hdev, HCI_UNREGISTER) &&
+ test_bit(HCI_UP, &hdev->flags)) {
+ /* Execute vendor specific shutdown routine */
+ if (hdev->shutdown)
+ err = hdev->shutdown(hdev);
+ }
+
+ if (was_userchannel)
+ hci_dev_set_flag(hdev, HCI_USER_CHANNEL);
+
+ return err;
+}
+
+int hci_dev_close_sync(struct hci_dev *hdev)
+{
+ bool auto_off;
+ int err = 0;
+
+ bt_dev_dbg(hdev, "");
+
+ if (hci_dev_test_flag(hdev, HCI_UNREGISTER)) {
+ disable_delayed_work(&hdev->power_off);
+ disable_delayed_work(&hdev->ncmd_timer);
+ disable_delayed_work(&hdev->le_scan_disable);
+ } else {
+ cancel_delayed_work(&hdev->power_off);
+ cancel_delayed_work(&hdev->ncmd_timer);
+ cancel_delayed_work(&hdev->le_scan_disable);
+ }
+
+ hci_cmd_sync_cancel_sync(hdev, ENODEV);
+
+ cancel_interleave_scan(hdev);
+
+ if (hdev->adv_instance_timeout) {
+ cancel_delayed_work_sync(&hdev->adv_instance_expire);
+ hdev->adv_instance_timeout = 0;
+ }
+
+ err = hci_dev_shutdown(hdev);
+
+ if (!test_and_clear_bit(HCI_UP, &hdev->flags)) {
+ cancel_delayed_work_sync(&hdev->cmd_timer);
+ return err;
+ }
+
+ hci_leds_update_powered(hdev, false);
+
+ /* Flush RX and TX works */
+ flush_work(&hdev->tx_work);
+ flush_work(&hdev->rx_work);
+
+ if (hdev->discov_timeout > 0) {
+ hdev->discov_timeout = 0;
+ hci_dev_clear_flag(hdev, HCI_DISCOVERABLE);
+ hci_dev_clear_flag(hdev, HCI_LIMITED_DISCOVERABLE);
+ }
+
+ if (hci_dev_test_and_clear_flag(hdev, HCI_SERVICE_CACHE))
+ cancel_delayed_work(&hdev->service_cache);
+
+ if (hci_dev_test_flag(hdev, HCI_MGMT)) {
+ struct adv_info *adv_instance;
+
+ cancel_delayed_work_sync(&hdev->rpa_expired);
+
+ list_for_each_entry(adv_instance, &hdev->adv_instances, list)
+ cancel_delayed_work_sync(&adv_instance->rpa_expired_cb);
+ }
+
+ /* Avoid potential lockdep warnings from the *_flush() calls by
+ * ensuring the workqueue is empty up front.
+ */
+ drain_workqueue(hdev->workqueue);
+
+ hci_dev_lock(hdev);
+
+ hci_discovery_set_state(hdev, DISCOVERY_STOPPED);
+
+ auto_off = hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF);
+
+ if (!auto_off && !hci_dev_test_flag(hdev, HCI_USER_CHANNEL) &&
+ hci_dev_test_flag(hdev, HCI_MGMT))
+ __mgmt_power_off(hdev);
+
+ hci_inquiry_cache_flush(hdev);
+ hci_pend_le_actions_clear(hdev);
+ hci_conn_hash_flush(hdev);
+ /* Prevent data races on hdev->smp_data or hdev->smp_bredr_data */
+ smp_unregister(hdev);
+ hci_dev_unlock(hdev);
+
+ hci_sock_dev_event(hdev, HCI_DEV_DOWN);
+
+ if (!hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) {
+ aosp_do_close(hdev);
+ msft_do_close(hdev);
+ }
+
+ if (hdev->flush)
+ hdev->flush(hdev);
+
+ /* Reset device */
+ skb_queue_purge(&hdev->cmd_q);
+ atomic_set(&hdev->cmd_cnt, 1);
+ if (hci_test_quirk(hdev, HCI_QUIRK_RESET_ON_CLOSE) &&
+ !auto_off && !hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) {
+ set_bit(HCI_INIT, &hdev->flags);
+ hci_reset_sync(hdev);
+ clear_bit(HCI_INIT, &hdev->flags);
+ }
+
+ /* flush cmd work */
+ flush_work(&hdev->cmd_work);
+
+ /* Drop queues */
+ skb_queue_purge(&hdev->rx_q);
+ skb_queue_purge(&hdev->cmd_q);
+ skb_queue_purge(&hdev->raw_q);
+
+ /* Drop last sent command */
+ if (hdev->sent_cmd) {
+ cancel_delayed_work_sync(&hdev->cmd_timer);
+ kfree_skb(hdev->sent_cmd);
+ hdev->sent_cmd = NULL;
+ }
+
+ /* Drop last request */
+ if (hdev->req_skb) {
+ kfree_skb(hdev->req_skb);
+ hdev->req_skb = NULL;
+ }
+
+ clear_bit(HCI_RUNNING, &hdev->flags);
+ hci_sock_dev_event(hdev, HCI_DEV_CLOSE);
+
+ /* After this point our queues are empty and no tasks are scheduled. */
+ hdev->close(hdev);
+
+ /* Clear flags */
+ hdev->flags &= BIT(HCI_RAW);
+ hci_dev_clear_volatile_flags(hdev);
+
+ memset(hdev->eir, 0, sizeof(hdev->eir));
+ memset(hdev->dev_class, 0, sizeof(hdev->dev_class));
+ bacpy(&hdev->random_addr, BDADDR_ANY);
+ hci_codec_list_clear(&hdev->local_codecs);
+
+ hci_dev_put(hdev);
+ return err;
+}
+
+/* This function perform power on HCI command sequence as follows:
+ *
+ * If controller is already up (HCI_UP) performs hci_powered_update_sync
+ * sequence otherwise run hci_dev_open_sync which will follow with
+ * hci_powered_update_sync after the init sequence is completed.
+ */
+static int hci_power_on_sync(struct hci_dev *hdev)
+{
+ int err;
+
+ if (test_bit(HCI_UP, &hdev->flags) &&
+ hci_dev_test_flag(hdev, HCI_MGMT) &&
+ hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF)) {
+ cancel_delayed_work(&hdev->power_off);
+ return hci_powered_update_sync(hdev);
+ }
+
+ err = hci_dev_open_sync(hdev);
+ if (err < 0)
+ return err;
+
+ /* During the HCI setup phase, a few error conditions are
+ * ignored and they need to be checked now. If they are still
+ * valid, it is important to return the device back off.
+ */
+ if (hci_dev_test_flag(hdev, HCI_RFKILLED) ||
+ hci_dev_test_flag(hdev, HCI_UNCONFIGURED) ||
+ (!bacmp(&hdev->bdaddr, BDADDR_ANY) &&
+ !bacmp(&hdev->static_addr, BDADDR_ANY))) {
+ hci_dev_clear_flag(hdev, HCI_AUTO_OFF);
+ hci_dev_close_sync(hdev);
+ } else if (hci_dev_test_flag(hdev, HCI_AUTO_OFF)) {
+ queue_delayed_work(hdev->req_workqueue, &hdev->power_off,
+ HCI_AUTO_OFF_TIMEOUT);
+ }
+
+ if (hci_dev_test_and_clear_flag(hdev, HCI_SETUP)) {
+ /* For unconfigured devices, set the HCI_RAW flag
+ * so that userspace can easily identify them.
+ */
+ if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED))
+ set_bit(HCI_RAW, &hdev->flags);
+
+ /* For fully configured devices, this will send
+ * the Index Added event. For unconfigured devices,
+ * it will send Unconfigued Index Added event.
+ *
+ * Devices with HCI_QUIRK_RAW_DEVICE are ignored
+ * and no event will be send.
+ */
+ mgmt_index_added(hdev);
+ } else if (hci_dev_test_and_clear_flag(hdev, HCI_CONFIG)) {
+ /* When the controller is now configured, then it
+ * is important to clear the HCI_RAW flag.
+ */
+ if (!hci_dev_test_flag(hdev, HCI_UNCONFIGURED))
+ clear_bit(HCI_RAW, &hdev->flags);
+
+ /* Powering on the controller with HCI_CONFIG set only
+ * happens with the transition from unconfigured to
+ * configured. This will send the Index Added event.
+ */
+ mgmt_index_added(hdev);
+ }
+
+ return 0;
+}
+
+static int hci_remote_name_cancel_sync(struct hci_dev *hdev, bdaddr_t *addr)
+{
+ struct hci_cp_remote_name_req_cancel cp;
+
+ memset(&cp, 0, sizeof(cp));
+ bacpy(&cp.bdaddr, addr);
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_REMOTE_NAME_REQ_CANCEL,
+ sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+}
+
+int hci_stop_discovery_sync(struct hci_dev *hdev)
+{
+ struct discovery_state *d = &hdev->discovery;
+ struct inquiry_entry *e;
+ int err;
+
+ bt_dev_dbg(hdev, "state %u", hdev->discovery.state);
+
+ if (d->state == DISCOVERY_FINDING || d->state == DISCOVERY_STOPPING) {
+ if (test_bit(HCI_INQUIRY, &hdev->flags)) {
+ err = __hci_cmd_sync_status(hdev, HCI_OP_INQUIRY_CANCEL,
+ 0, NULL, HCI_CMD_TIMEOUT);
+ if (err)
+ return err;
+ }
+
+ if (hci_dev_test_flag(hdev, HCI_LE_SCAN)) {
+ cancel_delayed_work(&hdev->le_scan_disable);
+
+ err = hci_scan_disable_sync(hdev);
+ if (err)
+ return err;
+ }
+
+ } else {
+ err = hci_scan_disable_sync(hdev);
+ if (err)
+ return err;
+ }
+
+ /* Resume advertising if it was paused */
+ if (ll_privacy_capable(hdev))
+ hci_resume_advertising_sync(hdev);
+
+ /* No further actions needed for LE-only discovery */
+ if (d->type == DISCOV_TYPE_LE)
+ return 0;
+
+ if (d->state == DISCOVERY_RESOLVING || d->state == DISCOVERY_STOPPING) {
+ e = hci_inquiry_cache_lookup_resolve(hdev, BDADDR_ANY,
+ NAME_PENDING);
+ if (!e)
+ return 0;
+
+ /* Ignore cancel errors since it should interfere with stopping
+ * of the discovery.
+ */
+ hci_remote_name_cancel_sync(hdev, &e->data.bdaddr);
+ }
+
+ return 0;
+}
+
+static int hci_disconnect_sync(struct hci_dev *hdev, struct hci_conn *conn,
+ u8 reason)
+{
+ struct hci_cp_disconnect cp;
+
+ if (conn->type == BIS_LINK || conn->type == PA_LINK) {
+ /* This is a BIS connection, hci_conn_del will
+ * do the necessary cleanup.
+ */
+ hci_dev_lock(hdev);
+ hci_conn_failed(conn, reason);
+ hci_dev_unlock(hdev);
+
+ return 0;
+ }
+
+ memset(&cp, 0, sizeof(cp));
+ cp.handle = cpu_to_le16(conn->handle);
+ cp.reason = reason;
+
+ /* Wait for HCI_EV_DISCONN_COMPLETE, not HCI_EV_CMD_STATUS, when the
+ * reason is anything but HCI_ERROR_REMOTE_POWER_OFF. This reason is
+ * used when suspending or powering off, where we don't want to wait
+ * for the peer's response.
+ */
+ if (reason != HCI_ERROR_REMOTE_POWER_OFF)
+ return __hci_cmd_sync_status_sk(hdev, HCI_OP_DISCONNECT,
+ sizeof(cp), &cp,
+ HCI_EV_DISCONN_COMPLETE,
+ HCI_CMD_TIMEOUT, NULL);
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_DISCONNECT, sizeof(cp), &cp,
+ HCI_CMD_TIMEOUT);
+}
+
+static int hci_le_connect_cancel_sync(struct hci_dev *hdev,
+ struct hci_conn *conn, u8 reason)
+{
+ /* Return reason if scanning since the connection shall probably be
+ * cleanup directly.
+ */
+ if (test_bit(HCI_CONN_SCANNING, &conn->flags))
+ return reason;
+
+ if (conn->role == HCI_ROLE_SLAVE ||
+ test_and_set_bit(HCI_CONN_CANCEL, &conn->flags))
+ return 0;
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_LE_CREATE_CONN_CANCEL,
+ 0, NULL, HCI_CMD_TIMEOUT);
+}
+
+static int hci_connect_cancel_sync(struct hci_dev *hdev, struct hci_conn *conn,
+ u8 reason)
+{
+ if (conn->type == LE_LINK)
+ return hci_le_connect_cancel_sync(hdev, conn, reason);
+
+ if (conn->type == CIS_LINK) {
+ /* BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E
+ * page 1857:
+ *
+ * If this command is issued for a CIS on the Central and the
+ * CIS is successfully terminated before being established,
+ * then an HCI_LE_CIS_Established event shall also be sent for
+ * this CIS with the Status Operation Cancelled by Host (0x44).
+ */
+ if (test_bit(HCI_CONN_CREATE_CIS, &conn->flags))
+ return hci_disconnect_sync(hdev, conn, reason);
+
+ /* CIS with no Create CIS sent have nothing to cancel */
+ return HCI_ERROR_LOCAL_HOST_TERM;
+ }
+
+ if (conn->type == BIS_LINK || conn->type == PA_LINK) {
+ /* There is no way to cancel a BIS without terminating the BIG
+ * which is done later on connection cleanup.
+ */
+ return 0;
+ }
+
+ if (hdev->hci_ver < BLUETOOTH_VER_1_2)
+ return 0;
+
+ /* Wait for HCI_EV_CONN_COMPLETE, not HCI_EV_CMD_STATUS, when the
+ * reason is anything but HCI_ERROR_REMOTE_POWER_OFF. This reason is
+ * used when suspending or powering off, where we don't want to wait
+ * for the peer's response.
+ */
+ if (reason != HCI_ERROR_REMOTE_POWER_OFF)
+ return __hci_cmd_sync_status_sk(hdev, HCI_OP_CREATE_CONN_CANCEL,
+ 6, &conn->dst,
+ HCI_EV_CONN_COMPLETE,
+ HCI_CMD_TIMEOUT, NULL);
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_CREATE_CONN_CANCEL,
+ 6, &conn->dst, HCI_CMD_TIMEOUT);
+}
+
+static int hci_reject_sco_sync(struct hci_dev *hdev, struct hci_conn *conn,
+ u8 reason)
+{
+ struct hci_cp_reject_sync_conn_req cp;
+
+ memset(&cp, 0, sizeof(cp));
+ bacpy(&cp.bdaddr, &conn->dst);
+ cp.reason = reason;
+
+ /* SCO rejection has its own limited set of
+ * allowed error values (0x0D-0x0F).
+ */
+ if (reason < 0x0d || reason > 0x0f)
+ cp.reason = HCI_ERROR_REJ_LIMITED_RESOURCES;
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_REJECT_SYNC_CONN_REQ,
+ sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+}
+
+static int hci_le_reject_cis_sync(struct hci_dev *hdev, struct hci_conn *conn,
+ u8 reason)
+{
+ struct hci_cp_le_reject_cis cp;
+
+ memset(&cp, 0, sizeof(cp));
+ cp.handle = cpu_to_le16(conn->handle);
+ cp.reason = reason;
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_LE_REJECT_CIS,
+ sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+}
+
+static int hci_reject_conn_sync(struct hci_dev *hdev, struct hci_conn *conn,
+ u8 reason)
+{
+ struct hci_cp_reject_conn_req cp;
+
+ if (conn->type == CIS_LINK)
+ return hci_le_reject_cis_sync(hdev, conn, reason);
+
+ if (conn->type == BIS_LINK || conn->type == PA_LINK)
+ return -EINVAL;
+
+ if (conn->type == SCO_LINK || conn->type == ESCO_LINK)
+ return hci_reject_sco_sync(hdev, conn, reason);
+
+ memset(&cp, 0, sizeof(cp));
+ bacpy(&cp.bdaddr, &conn->dst);
+ cp.reason = reason;
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_REJECT_CONN_REQ,
+ sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+}
+
+int hci_abort_conn_sync(struct hci_dev *hdev, struct hci_conn *conn, u8 reason)
+{
+ int err = 0;
+ u16 handle = conn->handle;
+ bool disconnect = false;
+ struct hci_conn *c;
+
+ switch (conn->state) {
+ case BT_CONNECTED:
+ case BT_CONFIG:
+ err = hci_disconnect_sync(hdev, conn, reason);
+ break;
+ case BT_CONNECT:
+ err = hci_connect_cancel_sync(hdev, conn, reason);
+ break;
+ case BT_CONNECT2:
+ err = hci_reject_conn_sync(hdev, conn, reason);
+ break;
+ case BT_OPEN:
+ case BT_BOUND:
+ break;
+ default:
+ disconnect = true;
+ break;
+ }
+
+ hci_dev_lock(hdev);
+
+ /* Check if the connection has been cleaned up concurrently */
+ c = hci_conn_hash_lookup_handle(hdev, handle);
+ if (!c || c != conn) {
+ err = 0;
+ goto unlock;
+ }
+
+ /* Cleanup hci_conn object if it cannot be cancelled as it
+ * likely means the controller and host stack are out of sync
+ * or in case of LE it was still scanning so it can be cleanup
+ * safely.
+ */
+ if (disconnect) {
+ conn->state = BT_CLOSED;
+ hci_disconn_cfm(conn, reason);
+ hci_conn_del(conn);
+ } else {
+ hci_conn_failed(conn, reason);
+ }
+
+unlock:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static int hci_disconnect_all_sync(struct hci_dev *hdev, u8 reason)
+{
+ struct list_head *head = &hdev->conn_hash.list;
+ struct hci_conn *conn;
+
+ rcu_read_lock();
+ while ((conn = list_first_or_null_rcu(head, struct hci_conn, list))) {
+ /* Make sure the connection is not freed while unlocking */
+ conn = hci_conn_get(conn);
+ rcu_read_unlock();
+ /* Disregard possible errors since hci_conn_del shall have been
+ * called even in case of errors had occurred since it would
+ * then cause hci_conn_failed to be called which calls
+ * hci_conn_del internally.
+ */
+ hci_abort_conn_sync(hdev, conn, reason);
+ hci_conn_put(conn);
+ rcu_read_lock();
+ }
+ rcu_read_unlock();
+
+ return 0;
+}
+
+/* This function perform power off HCI command sequence as follows:
+ *
+ * Clear Advertising
+ * Stop Discovery
+ * Disconnect all connections
+ * hci_dev_close_sync
+ */
+static int hci_power_off_sync(struct hci_dev *hdev)
+{
+ int err;
+
+ /* If controller is already down there is nothing to do */
+ if (!test_bit(HCI_UP, &hdev->flags))
+ return 0;
+
+ hci_dev_set_flag(hdev, HCI_POWERING_DOWN);
+
+ if (test_bit(HCI_ISCAN, &hdev->flags) ||
+ test_bit(HCI_PSCAN, &hdev->flags)) {
+ err = hci_write_scan_enable_sync(hdev, 0x00);
+ if (err)
+ goto out;
+ }
+
+ err = hci_clear_adv_sync(hdev, NULL, false);
+ if (err)
+ goto out;
+
+ err = hci_stop_discovery_sync(hdev);
+ if (err)
+ goto out;
+
+ /* Terminated due to Power Off */
+ err = hci_disconnect_all_sync(hdev, HCI_ERROR_REMOTE_POWER_OFF);
+ if (err)
+ goto out;
+
+ err = hci_dev_close_sync(hdev);
+
+out:
+ hci_dev_clear_flag(hdev, HCI_POWERING_DOWN);
+ return err;
+}
+
+int hci_set_powered_sync(struct hci_dev *hdev, u8 val)
+{
+ if (val)
+ return hci_power_on_sync(hdev);
+
+ return hci_power_off_sync(hdev);
+}
+
+static int hci_write_iac_sync(struct hci_dev *hdev)
+{
+ struct hci_cp_write_current_iac_lap cp;
+
+ if (!hci_dev_test_flag(hdev, HCI_DISCOVERABLE))
+ return 0;
+
+ memset(&cp, 0, sizeof(cp));
+
+ if (hci_dev_test_flag(hdev, HCI_LIMITED_DISCOVERABLE)) {
+ /* Limited discoverable mode */
+ cp.num_iac = min_t(u8, hdev->num_iac, 2);
+ cp.iac_lap[0] = 0x00; /* LIAC */
+ cp.iac_lap[1] = 0x8b;
+ cp.iac_lap[2] = 0x9e;
+ cp.iac_lap[3] = 0x33; /* GIAC */
+ cp.iac_lap[4] = 0x8b;
+ cp.iac_lap[5] = 0x9e;
+ } else {
+ /* General discoverable mode */
+ cp.num_iac = 1;
+ cp.iac_lap[0] = 0x33; /* GIAC */
+ cp.iac_lap[1] = 0x8b;
+ cp.iac_lap[2] = 0x9e;
+ }
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_CURRENT_IAC_LAP,
+ (cp.num_iac * 3) + 1, &cp,
+ HCI_CMD_TIMEOUT);
+}
+
+int hci_update_discoverable_sync(struct hci_dev *hdev)
+{
+ int err = 0;
+
+ if (hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) {
+ err = hci_write_iac_sync(hdev);
+ if (err)
+ return err;
+
+ err = hci_update_scan_sync(hdev);
+ if (err)
+ return err;
+
+ err = hci_update_class_sync(hdev);
+ if (err)
+ return err;
+ }
+
+ /* Advertising instances don't use the global discoverable setting, so
+ * only update AD if advertising was enabled using Set Advertising.
+ */
+ if (hci_dev_test_flag(hdev, HCI_ADVERTISING)) {
+ err = hci_update_adv_data_sync(hdev, 0x00);
+ if (err)
+ return err;
+
+ /* Discoverable mode affects the local advertising
+ * address in limited privacy mode.
+ */
+ if (hci_dev_test_flag(hdev, HCI_LIMITED_PRIVACY)) {
+ if (ext_adv_capable(hdev))
+ err = hci_start_ext_adv_sync(hdev, 0x00);
+ else
+ err = hci_enable_advertising_sync(hdev);
+ }
+ }
+
+ return err;
+}
+
+static int update_discoverable_sync(struct hci_dev *hdev, void *data)
+{
+ return hci_update_discoverable_sync(hdev);
+}
+
+int hci_update_discoverable(struct hci_dev *hdev)
+{
+ /* Only queue if it would have any effect */
+ if (hdev_is_powered(hdev) &&
+ hci_dev_test_flag(hdev, HCI_ADVERTISING) &&
+ hci_dev_test_flag(hdev, HCI_DISCOVERABLE) &&
+ hci_dev_test_flag(hdev, HCI_LIMITED_PRIVACY))
+ return hci_cmd_sync_queue(hdev, update_discoverable_sync, NULL,
+ NULL);
+
+ return 0;
+}
+
+int hci_update_connectable_sync(struct hci_dev *hdev)
+{
+ int err;
+
+ err = hci_update_scan_sync(hdev);
+ if (err)
+ return err;
+
+ /* If BR/EDR is not enabled and we disable advertising as a
+ * by-product of disabling connectable, we need to update the
+ * advertising flags.
+ */
+ if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED))
+ err = hci_update_adv_data_sync(hdev, hdev->cur_adv_instance);
+
+ /* Update the advertising parameters if necessary */
+ if (hci_dev_test_flag(hdev, HCI_ADVERTISING) ||
+ !list_empty(&hdev->adv_instances)) {
+ if (ext_adv_capable(hdev))
+ err = hci_start_ext_adv_sync(hdev,
+ hdev->cur_adv_instance);
+ else
+ err = hci_enable_advertising_sync(hdev);
+
+ if (err)
+ return err;
+ }
+
+ return hci_update_passive_scan_sync(hdev);
+}
+
+int hci_inquiry_sync(struct hci_dev *hdev, u8 length, u8 num_rsp)
+{
+ const u8 giac[3] = { 0x33, 0x8b, 0x9e };
+ const u8 liac[3] = { 0x00, 0x8b, 0x9e };
+ struct hci_cp_inquiry cp;
+
+ bt_dev_dbg(hdev, "");
+
+ if (test_bit(HCI_INQUIRY, &hdev->flags))
+ return 0;
+
+ hci_dev_lock(hdev);
+ hci_inquiry_cache_flush(hdev);
+ hci_dev_unlock(hdev);
+
+ memset(&cp, 0, sizeof(cp));
+
+ if (hdev->discovery.limited)
+ memcpy(&cp.lap, liac, sizeof(cp.lap));
+ else
+ memcpy(&cp.lap, giac, sizeof(cp.lap));
+
+ cp.length = length;
+ cp.num_rsp = num_rsp;
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_INQUIRY,
+ sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+}
+
+static int hci_active_scan_sync(struct hci_dev *hdev, uint16_t interval)
+{
+ u8 own_addr_type;
+ /* Accept list is not used for discovery */
+ u8 filter_policy = 0x00;
+ /* Default is to enable duplicates filter */
+ u8 filter_dup = LE_SCAN_FILTER_DUP_ENABLE;
+ int err;
+
+ bt_dev_dbg(hdev, "");
+
+ /* If controller is scanning, it means the passive scanning is
+ * running. Thus, we should temporarily stop it in order to set the
+ * discovery scanning parameters.
+ */
+ err = hci_scan_disable_sync(hdev);
+ if (err) {
+ bt_dev_err(hdev, "Unable to disable scanning: %d", err);
+ return err;
+ }
+
+ cancel_interleave_scan(hdev);
+
+ /* Pause address resolution for active scan and stop advertising if
+ * privacy is enabled.
+ */
+ err = hci_pause_addr_resolution(hdev);
+ if (err)
+ goto failed;
+
+ /* All active scans will be done with either a resolvable private
+ * address (when privacy feature has been enabled) or non-resolvable
+ * private address.
+ */
+ err = hci_update_random_address_sync(hdev, true, scan_use_rpa(hdev),
+ &own_addr_type);
+ if (err < 0)
+ own_addr_type = ADDR_LE_DEV_PUBLIC;
+
+ if (hci_is_adv_monitoring(hdev) ||
+ (hci_test_quirk(hdev, HCI_QUIRK_STRICT_DUPLICATE_FILTER) &&
+ hdev->discovery.result_filtering)) {
+ /* Duplicate filter should be disabled when some advertisement
+ * monitor is activated, otherwise AdvMon can only receive one
+ * advertisement for one peer(*) during active scanning, and
+ * might report loss to these peers.
+ *
+ * If controller does strict duplicate filtering and the
+ * discovery requires result filtering disables controller based
+ * filtering since that can cause reports that would match the
+ * host filter to not be reported.
+ */
+ filter_dup = LE_SCAN_FILTER_DUP_DISABLE;
+ }
+
+ err = hci_start_scan_sync(hdev, LE_SCAN_ACTIVE, interval,
+ hdev->le_scan_window_discovery,
+ own_addr_type, filter_policy, filter_dup);
+ if (!err)
+ return err;
+
+failed:
+ /* Resume advertising if it was paused */
+ if (ll_privacy_capable(hdev))
+ hci_resume_advertising_sync(hdev);
+
+ /* Resume passive scanning */
+ hci_update_passive_scan_sync(hdev);
+ return err;
+}
+
+static int hci_start_interleaved_discovery_sync(struct hci_dev *hdev)
+{
+ int err;
+
+ bt_dev_dbg(hdev, "");
+
+ err = hci_active_scan_sync(hdev, hdev->le_scan_int_discovery * 2);
+ if (err)
+ return err;
+
+ return hci_inquiry_sync(hdev, DISCOV_BREDR_INQUIRY_LEN, 0);
+}
+
+int hci_start_discovery_sync(struct hci_dev *hdev)
+{
+ unsigned long timeout;
+ int err;
+
+ bt_dev_dbg(hdev, "type %u", hdev->discovery.type);
+
+ switch (hdev->discovery.type) {
+ case DISCOV_TYPE_BREDR:
+ return hci_inquiry_sync(hdev, DISCOV_BREDR_INQUIRY_LEN, 0);
+ case DISCOV_TYPE_INTERLEAVED:
+ /* When running simultaneous discovery, the LE scanning time
+ * should occupy the whole discovery time sine BR/EDR inquiry
+ * and LE scanning are scheduled by the controller.
+ *
+ * For interleaving discovery in comparison, BR/EDR inquiry
+ * and LE scanning are done sequentially with separate
+ * timeouts.
+ */
+ if (hci_test_quirk(hdev, HCI_QUIRK_SIMULTANEOUS_DISCOVERY)) {
+ timeout = msecs_to_jiffies(DISCOV_LE_TIMEOUT);
+ /* During simultaneous discovery, we double LE scan
+ * interval. We must leave some time for the controller
+ * to do BR/EDR inquiry.
+ */
+ err = hci_start_interleaved_discovery_sync(hdev);
+ break;
+ }
+
+ timeout = msecs_to_jiffies(hdev->discov_interleaved_timeout);
+ err = hci_active_scan_sync(hdev, hdev->le_scan_int_discovery);
+ break;
+ case DISCOV_TYPE_LE:
+ timeout = msecs_to_jiffies(DISCOV_LE_TIMEOUT);
+ err = hci_active_scan_sync(hdev, hdev->le_scan_int_discovery);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (err)
+ return err;
+
+ bt_dev_dbg(hdev, "timeout %u ms", jiffies_to_msecs(timeout));
+
+ queue_delayed_work(hdev->req_workqueue, &hdev->le_scan_disable,
+ timeout);
+ return 0;
+}
+
+static void hci_suspend_monitor_sync(struct hci_dev *hdev)
+{
+ switch (hci_get_adv_monitor_offload_ext(hdev)) {
+ case HCI_ADV_MONITOR_EXT_MSFT:
+ msft_suspend_sync(hdev);
+ break;
+ default:
+ return;
+ }
+}
+
+/* This function disables discovery and mark it as paused */
+static int hci_pause_discovery_sync(struct hci_dev *hdev)
+{
+ int old_state = hdev->discovery.state;
+ int err;
+
+ /* If discovery already stopped/stopping/paused there nothing to do */
+ if (old_state == DISCOVERY_STOPPED || old_state == DISCOVERY_STOPPING ||
+ hdev->discovery_paused)
+ return 0;
+
+ hci_discovery_set_state(hdev, DISCOVERY_STOPPING);
+ err = hci_stop_discovery_sync(hdev);
+ if (err)
+ return err;
+
+ hdev->discovery_paused = true;
+ hci_discovery_set_state(hdev, DISCOVERY_STOPPED);
+
+ return 0;
+}
+
+static int hci_update_event_filter_sync(struct hci_dev *hdev)
+{
+ struct bdaddr_list_with_flags *b;
+ u8 scan = SCAN_DISABLED;
+ bool scanning = test_bit(HCI_PSCAN, &hdev->flags);
+ int err;
+
+ if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED))
+ return 0;
+
+ /* Some fake CSR controllers lock up after setting this type of
+ * filter, so avoid sending the request altogether.
+ */
+ if (hci_test_quirk(hdev, HCI_QUIRK_BROKEN_FILTER_CLEAR_ALL))
+ return 0;
+
+ /* Always clear event filter when starting */
+ hci_clear_event_filter_sync(hdev);
+
+ list_for_each_entry(b, &hdev->accept_list, list) {
+ if (!(b->flags & HCI_CONN_FLAG_REMOTE_WAKEUP))
+ continue;
+
+ bt_dev_dbg(hdev, "Adding event filters for %pMR", &b->bdaddr);
+
+ err = hci_set_event_filter_sync(hdev, HCI_FLT_CONN_SETUP,
+ HCI_CONN_SETUP_ALLOW_BDADDR,
+ &b->bdaddr,
+ HCI_CONN_SETUP_AUTO_ON);
+ if (err)
+ bt_dev_err(hdev, "Failed to set event filter for %pMR",
+ &b->bdaddr);
+ else
+ scan = SCAN_PAGE;
+ }
+
+ if (scan && !scanning)
+ hci_write_scan_enable_sync(hdev, scan);
+ else if (!scan && scanning)
+ hci_write_scan_enable_sync(hdev, scan);
+
+ return 0;
+}
+
+/* This function disables scan (BR and LE) and mark it as paused */
+static int hci_pause_scan_sync(struct hci_dev *hdev)
+{
+ if (hdev->scanning_paused)
+ return 0;
+
+ /* Disable page scan if enabled */
+ if (test_bit(HCI_PSCAN, &hdev->flags))
+ hci_write_scan_enable_sync(hdev, SCAN_DISABLED);
+
+ hci_scan_disable_sync(hdev);
+
+ hdev->scanning_paused = true;
+
+ return 0;
+}
+
+/* This function performs the HCI suspend procedures in the follow order:
+ *
+ * Pause discovery (active scanning/inquiry)
+ * Pause Directed Advertising/Advertising
+ * Pause Scanning (passive scanning in case discovery was not active)
+ * Disconnect all connections
+ * Set suspend_status to BT_SUSPEND_DISCONNECT if hdev cannot wakeup
+ * otherwise:
+ * Update event mask (only set events that are allowed to wake up the host)
+ * Update event filter (with devices marked with HCI_CONN_FLAG_REMOTE_WAKEUP)
+ * Update passive scanning (lower duty cycle)
+ * Set suspend_status to BT_SUSPEND_CONFIGURE_WAKE
+ */
+int hci_suspend_sync(struct hci_dev *hdev)
+{
+ int err;
+
+ /* If marked as suspended there nothing to do */
+ if (hdev->suspended)
+ return 0;
+
+ /* Mark device as suspended */
+ hdev->suspended = true;
+
+ /* Pause discovery if not already stopped */
+ hci_pause_discovery_sync(hdev);
+
+ /* Pause other advertisements */
+ hci_pause_advertising_sync(hdev);
+
+ /* Suspend monitor filters */
+ hci_suspend_monitor_sync(hdev);
+
+ /* Prevent disconnects from causing scanning to be re-enabled */
+ hci_pause_scan_sync(hdev);
+
+ if (hci_conn_count(hdev)) {
+ /* Soft disconnect everything (power off) */
+ err = hci_disconnect_all_sync(hdev, HCI_ERROR_REMOTE_POWER_OFF);
+ if (err) {
+ /* Set state to BT_RUNNING so resume doesn't notify */
+ hdev->suspend_state = BT_RUNNING;
+ hci_resume_sync(hdev);
+ return err;
+ }
+
+ /* Update event mask so only the allowed event can wakeup the
+ * host.
+ */
+ hci_set_event_mask_sync(hdev);
+ }
+
+ /* Only configure accept list if disconnect succeeded and wake
+ * isn't being prevented.
+ */
+ if (!hdev->wakeup || !hdev->wakeup(hdev)) {
+ hdev->suspend_state = BT_SUSPEND_DISCONNECT;
+ return 0;
+ }
+
+ /* Unpause to take care of updating scanning params */
+ hdev->scanning_paused = false;
+
+ /* Enable event filter for paired devices */
+ hci_update_event_filter_sync(hdev);
+
+ /* Update LE passive scan if enabled */
+ hci_update_passive_scan_sync(hdev);
+
+ /* Pause scan changes again. */
+ hdev->scanning_paused = true;
+
+ hdev->suspend_state = BT_SUSPEND_CONFIGURE_WAKE;
+
+ return 0;
+}
+
+/* This function resumes discovery */
+static int hci_resume_discovery_sync(struct hci_dev *hdev)
+{
+ int err;
+
+ /* If discovery not paused there nothing to do */
+ if (!hdev->discovery_paused)
+ return 0;
+
+ hdev->discovery_paused = false;
+
+ hci_discovery_set_state(hdev, DISCOVERY_STARTING);
+
+ err = hci_start_discovery_sync(hdev);
+
+ hci_discovery_set_state(hdev, err ? DISCOVERY_STOPPED :
+ DISCOVERY_FINDING);
+
+ return err;
+}
+
+static void hci_resume_monitor_sync(struct hci_dev *hdev)
+{
+ switch (hci_get_adv_monitor_offload_ext(hdev)) {
+ case HCI_ADV_MONITOR_EXT_MSFT:
+ msft_resume_sync(hdev);
+ break;
+ default:
+ return;
+ }
+}
+
+/* This function resume scan and reset paused flag */
+static int hci_resume_scan_sync(struct hci_dev *hdev)
+{
+ if (!hdev->scanning_paused)
+ return 0;
+
+ hdev->scanning_paused = false;
+
+ hci_update_scan_sync(hdev);
+
+ /* Reset passive scanning to normal */
+ hci_update_passive_scan_sync(hdev);
+
+ return 0;
+}
+
+/* This function performs the HCI suspend procedures in the follow order:
+ *
+ * Restore event mask
+ * Clear event filter
+ * Update passive scanning (normal duty cycle)
+ * Resume Directed Advertising/Advertising
+ * Resume discovery (active scanning/inquiry)
+ */
+int hci_resume_sync(struct hci_dev *hdev)
+{
+ /* If not marked as suspended there nothing to do */
+ if (!hdev->suspended)
+ return 0;
+
+ hdev->suspended = false;
+
+ /* Restore event mask */
+ hci_set_event_mask_sync(hdev);
+
+ /* Clear any event filters and restore scan state */
+ hci_clear_event_filter_sync(hdev);
+
+ /* Resume scanning */
+ hci_resume_scan_sync(hdev);
+
+ /* Resume monitor filters */
+ hci_resume_monitor_sync(hdev);
+
+ /* Resume other advertisements */
+ hci_resume_advertising_sync(hdev);
+
+ /* Resume discovery */
+ hci_resume_discovery_sync(hdev);
+
+ return 0;
+}
+
+static bool conn_use_rpa(struct hci_conn *conn)
+{
+ struct hci_dev *hdev = conn->hdev;
+
+ return hci_dev_test_flag(hdev, HCI_PRIVACY);
+}
+
+static int hci_le_ext_directed_advertising_sync(struct hci_dev *hdev,
+ struct hci_conn *conn)
+{
+ struct hci_cp_le_set_ext_adv_params cp;
+ struct hci_rp_le_set_ext_adv_params rp;
+ int err;
+ bdaddr_t random_addr;
+ u8 own_addr_type;
+
+ err = hci_update_random_address_sync(hdev, false, conn_use_rpa(conn),
+ &own_addr_type);
+ if (err)
+ return err;
+
+ /* Set require_privacy to false so that the remote device has a
+ * chance of identifying us.
+ */
+ err = hci_get_random_address(hdev, false, conn_use_rpa(conn), NULL,
+ &own_addr_type, &random_addr);
+ if (err)
+ return err;
+
+ memset(&cp, 0, sizeof(cp));
+
+ cp.evt_properties = cpu_to_le16(LE_LEGACY_ADV_DIRECT_IND);
+ cp.channel_map = hdev->le_adv_channel_map;
+ cp.tx_power = HCI_TX_POWER_INVALID;
+ cp.primary_phy = HCI_ADV_PHY_1M;
+ cp.secondary_phy = HCI_ADV_PHY_1M;
+ cp.handle = 0x00; /* Use instance 0 for directed adv */
+ cp.own_addr_type = own_addr_type;
+ cp.peer_addr_type = conn->dst_type;
+ bacpy(&cp.peer_addr, &conn->dst);
+
+ /* As per Core Spec 5.2 Vol 2, PART E, Sec 7.8.53, for
+ * advertising_event_property LE_LEGACY_ADV_DIRECT_IND
+ * does not supports advertising data when the advertising set already
+ * contains some, the controller shall return erroc code 'Invalid
+ * HCI Command Parameters(0x12).
+ * So it is required to remove adv set for handle 0x00. since we use
+ * instance 0 for directed adv.
+ */
+ err = hci_remove_ext_adv_instance_sync(hdev, cp.handle, NULL);
+ if (err)
+ return err;
+
+ err = hci_set_ext_adv_params_sync(hdev, NULL, &cp, &rp);
+ if (err)
+ return err;
+
+ /* Update adv data as tx power is known now */
+ err = hci_set_ext_adv_data_sync(hdev, cp.handle);
+ if (err)
+ return err;
+
+ /* Check if random address need to be updated */
+ if (own_addr_type == ADDR_LE_DEV_RANDOM &&
+ bacmp(&random_addr, BDADDR_ANY) &&
+ bacmp(&random_addr, &hdev->random_addr)) {
+ err = hci_set_adv_set_random_addr_sync(hdev, 0x00,
+ &random_addr);
+ if (err)
+ return err;
+ }
+
+ return hci_enable_ext_advertising_sync(hdev, 0x00);
+}
+
+static int hci_le_directed_advertising_sync(struct hci_dev *hdev,
+ struct hci_conn *conn)
+{
+ struct hci_cp_le_set_adv_param cp;
+ u8 status;
+ u8 own_addr_type;
+ u8 enable;
+
+ if (ext_adv_capable(hdev))
+ return hci_le_ext_directed_advertising_sync(hdev, conn);
+
+ /* Clear the HCI_LE_ADV bit temporarily so that the
+ * hci_update_random_address knows that it's safe to go ahead
+ * and write a new random address. The flag will be set back on
+ * as soon as the SET_ADV_ENABLE HCI command completes.
+ */
+ hci_dev_clear_flag(hdev, HCI_LE_ADV);
+
+ /* Set require_privacy to false so that the remote device has a
+ * chance of identifying us.
+ */
+ status = hci_update_random_address_sync(hdev, false, conn_use_rpa(conn),
+ &own_addr_type);
+ if (status)
+ return status;
+
+ memset(&cp, 0, sizeof(cp));
+
+ /* Some controllers might reject command if intervals are not
+ * within range for undirected advertising.
+ * BCM20702A0 is known to be affected by this.
+ */
+ cp.min_interval = cpu_to_le16(0x0020);
+ cp.max_interval = cpu_to_le16(0x0020);
+
+ cp.type = LE_ADV_DIRECT_IND;
+ cp.own_address_type = own_addr_type;
+ cp.direct_addr_type = conn->dst_type;
+ bacpy(&cp.direct_addr, &conn->dst);
+ cp.channel_map = hdev->le_adv_channel_map;
+
+ status = __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_ADV_PARAM,
+ sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+ if (status)
+ return status;
+
+ enable = 0x01;
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_ADV_ENABLE,
+ sizeof(enable), &enable, HCI_CMD_TIMEOUT);
+}
+
+static void set_ext_conn_params(struct hci_conn *conn,
+ struct hci_cp_le_ext_conn_param *p)
+{
+ struct hci_dev *hdev = conn->hdev;
+
+ memset(p, 0, sizeof(*p));
+
+ p->scan_interval = cpu_to_le16(hdev->le_scan_int_connect);
+ p->scan_window = cpu_to_le16(hdev->le_scan_window_connect);
+ p->conn_interval_min = cpu_to_le16(conn->le_conn_min_interval);
+ p->conn_interval_max = cpu_to_le16(conn->le_conn_max_interval);
+ p->conn_latency = cpu_to_le16(conn->le_conn_latency);
+ p->supervision_timeout = cpu_to_le16(conn->le_supv_timeout);
+ p->min_ce_len = cpu_to_le16(0x0000);
+ p->max_ce_len = cpu_to_le16(0x0000);
+}
+
+static int hci_le_ext_create_conn_sync(struct hci_dev *hdev,
+ struct hci_conn *conn, u8 own_addr_type)
+{
+ struct hci_cp_le_ext_create_conn *cp;
+ struct hci_cp_le_ext_conn_param *p;
+ u8 data[sizeof(*cp) + sizeof(*p) * 3];
+ u32 plen;
+
+ cp = (void *)data;
+ p = (void *)cp->data;
+
+ memset(cp, 0, sizeof(*cp));
+
+ bacpy(&cp->peer_addr, &conn->dst);
+ cp->peer_addr_type = conn->dst_type;
+ cp->own_addr_type = own_addr_type;
+
+ plen = sizeof(*cp);
+
+ if (scan_1m(hdev) && (conn->le_adv_phy == HCI_ADV_PHY_1M ||
+ conn->le_adv_sec_phy == HCI_ADV_PHY_1M)) {
+ cp->phys |= LE_SCAN_PHY_1M;
+ set_ext_conn_params(conn, p);
+
+ p++;
+ plen += sizeof(*p);
+ }
+
+ if (scan_2m(hdev) && (conn->le_adv_phy == HCI_ADV_PHY_2M ||
+ conn->le_adv_sec_phy == HCI_ADV_PHY_2M)) {
+ cp->phys |= LE_SCAN_PHY_2M;
+ set_ext_conn_params(conn, p);
+
+ p++;
+ plen += sizeof(*p);
+ }
+
+ if (scan_coded(hdev) && (conn->le_adv_phy == HCI_ADV_PHY_CODED ||
+ conn->le_adv_sec_phy == HCI_ADV_PHY_CODED)) {
+ cp->phys |= LE_SCAN_PHY_CODED;
+ set_ext_conn_params(conn, p);
+
+ plen += sizeof(*p);
+ }
+
+ return __hci_cmd_sync_status_sk(hdev, HCI_OP_LE_EXT_CREATE_CONN,
+ plen, data,
+ HCI_EV_LE_ENHANCED_CONN_COMPLETE,
+ conn->conn_timeout, NULL);
+}
+
+static int hci_le_create_conn_sync(struct hci_dev *hdev, void *data)
+{
+ struct hci_cp_le_create_conn cp;
+ struct hci_conn_params *params;
+ u8 own_addr_type;
+ int err;
+ struct hci_conn *conn = data;
+
+ if (!hci_conn_valid(hdev, conn))
+ return -ECANCELED;
+
+ bt_dev_dbg(hdev, "conn %p", conn);
+
+ clear_bit(HCI_CONN_SCANNING, &conn->flags);
+ conn->state = BT_CONNECT;
+
+ /* If requested to connect as peripheral use directed advertising */
+ if (conn->role == HCI_ROLE_SLAVE) {
+ /* If we're active scanning and simultaneous roles is not
+ * enabled simply reject the attempt.
+ */
+ if (hci_dev_test_flag(hdev, HCI_LE_SCAN) &&
+ hdev->le_scan_type == LE_SCAN_ACTIVE &&
+ !hci_dev_test_flag(hdev, HCI_LE_SIMULTANEOUS_ROLES)) {
+ hci_conn_del(conn);
+ return -EBUSY;
+ }
+
+ /* Pause advertising while doing directed advertising. */
+ hci_pause_advertising_sync(hdev);
+
+ err = hci_le_directed_advertising_sync(hdev, conn);
+ goto done;
+ }
+
+ /* Disable advertising if simultaneous roles is not in use. */
+ if (!hci_dev_test_flag(hdev, HCI_LE_SIMULTANEOUS_ROLES))
+ hci_pause_advertising_sync(hdev);
+
+ params = hci_conn_params_lookup(hdev, &conn->dst, conn->dst_type);
+ if (params) {
+ conn->le_conn_min_interval = params->conn_min_interval;
+ conn->le_conn_max_interval = params->conn_max_interval;
+ conn->le_conn_latency = params->conn_latency;
+ conn->le_supv_timeout = params->supervision_timeout;
+ } else {
+ conn->le_conn_min_interval = hdev->le_conn_min_interval;
+ conn->le_conn_max_interval = hdev->le_conn_max_interval;
+ conn->le_conn_latency = hdev->le_conn_latency;
+ conn->le_supv_timeout = hdev->le_supv_timeout;
+ }
+
+ /* If controller is scanning, we stop it since some controllers are
+ * not able to scan and connect at the same time. Also set the
+ * HCI_LE_SCAN_INTERRUPTED flag so that the command complete
+ * handler for scan disabling knows to set the correct discovery
+ * state.
+ */
+ if (hci_dev_test_flag(hdev, HCI_LE_SCAN)) {
+ hci_scan_disable_sync(hdev);
+ hci_dev_set_flag(hdev, HCI_LE_SCAN_INTERRUPTED);
+ }
+
+ /* Update random address, but set require_privacy to false so
+ * that we never connect with an non-resolvable address.
+ */
+ err = hci_update_random_address_sync(hdev, false, conn_use_rpa(conn),
+ &own_addr_type);
+ if (err)
+ goto done;
+ /* Send command LE Extended Create Connection if supported */
+ if (use_ext_conn(hdev)) {
+ err = hci_le_ext_create_conn_sync(hdev, conn, own_addr_type);
+ goto done;
+ }
+
+ memset(&cp, 0, sizeof(cp));
+
+ cp.scan_interval = cpu_to_le16(hdev->le_scan_int_connect);
+ cp.scan_window = cpu_to_le16(hdev->le_scan_window_connect);
+
+ bacpy(&cp.peer_addr, &conn->dst);
+ cp.peer_addr_type = conn->dst_type;
+ cp.own_address_type = own_addr_type;
+ cp.conn_interval_min = cpu_to_le16(conn->le_conn_min_interval);
+ cp.conn_interval_max = cpu_to_le16(conn->le_conn_max_interval);
+ cp.conn_latency = cpu_to_le16(conn->le_conn_latency);
+ cp.supervision_timeout = cpu_to_le16(conn->le_supv_timeout);
+ cp.min_ce_len = cpu_to_le16(0x0000);
+ cp.max_ce_len = cpu_to_le16(0x0000);
+
+ /* BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E page 2261:
+ *
+ * If this event is unmasked and the HCI_LE_Connection_Complete event
+ * is unmasked, only the HCI_LE_Enhanced_Connection_Complete event is
+ * sent when a new connection has been created.
+ */
+ err = __hci_cmd_sync_status_sk(hdev, HCI_OP_LE_CREATE_CONN,
+ sizeof(cp), &cp,
+ use_enhanced_conn_complete(hdev) ?
+ HCI_EV_LE_ENHANCED_CONN_COMPLETE :
+ HCI_EV_LE_CONN_COMPLETE,
+ conn->conn_timeout, NULL);
+
+done:
+ if (err == -ETIMEDOUT)
+ hci_le_connect_cancel_sync(hdev, conn, 0x00);
+
+ /* Re-enable advertising after the connection attempt is finished. */
+ hci_resume_advertising_sync(hdev);
+ return err;
+}
+
+int hci_le_create_cis_sync(struct hci_dev *hdev)
+{
+ DEFINE_FLEX(struct hci_cp_le_create_cis, cmd, cis, num_cis, 0x1f);
+ size_t aux_num_cis = 0;
+ struct hci_conn *conn;
+ u8 cig = BT_ISO_QOS_CIG_UNSET;
+
+ /* The spec allows only one pending LE Create CIS command at a time. If
+ * the command is pending now, don't do anything. We check for pending
+ * connections after each CIS Established event.
+ *
+ * BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E
+ * page 2566:
+ *
+ * If the Host issues this command before all the
+ * HCI_LE_CIS_Established events from the previous use of the
+ * command have been generated, the Controller shall return the
+ * error code Command Disallowed (0x0C).
+ *
+ * BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E
+ * page 2567:
+ *
+ * When the Controller receives the HCI_LE_Create_CIS command, the
+ * Controller sends the HCI_Command_Status event to the Host. An
+ * HCI_LE_CIS_Established event will be generated for each CIS when it
+ * is established or if it is disconnected or considered lost before
+ * being established; until all the events are generated, the command
+ * remains pending.
+ */
+
+ hci_dev_lock(hdev);
+
+ rcu_read_lock();
+
+ /* Wait until previous Create CIS has completed */
+ list_for_each_entry_rcu(conn, &hdev->conn_hash.list, list) {
+ if (test_bit(HCI_CONN_CREATE_CIS, &conn->flags))
+ goto done;
+ }
+
+ /* Find CIG with all CIS ready */
+ list_for_each_entry_rcu(conn, &hdev->conn_hash.list, list) {
+ struct hci_conn *link;
+
+ if (hci_conn_check_create_cis(conn))
+ continue;
+
+ cig = conn->iso_qos.ucast.cig;
+
+ list_for_each_entry_rcu(link, &hdev->conn_hash.list, list) {
+ if (hci_conn_check_create_cis(link) > 0 &&
+ link->iso_qos.ucast.cig == cig &&
+ link->state != BT_CONNECTED) {
+ cig = BT_ISO_QOS_CIG_UNSET;
+ break;
+ }
+ }
+
+ if (cig != BT_ISO_QOS_CIG_UNSET)
+ break;
+ }
+
+ if (cig == BT_ISO_QOS_CIG_UNSET)
+ goto done;
+
+ list_for_each_entry_rcu(conn, &hdev->conn_hash.list, list) {
+ struct hci_cis *cis = &cmd->cis[aux_num_cis];
+
+ if (hci_conn_check_create_cis(conn) ||
+ conn->iso_qos.ucast.cig != cig)
+ continue;
+
+ set_bit(HCI_CONN_CREATE_CIS, &conn->flags);
+ cis->acl_handle = cpu_to_le16(conn->parent->handle);
+ cis->cis_handle = cpu_to_le16(conn->handle);
+ aux_num_cis++;
+
+ if (aux_num_cis >= cmd->num_cis)
+ break;
+ }
+ cmd->num_cis = aux_num_cis;
+
+done:
+ rcu_read_unlock();
+
+ hci_dev_unlock(hdev);
+
+ if (!aux_num_cis)
+ return 0;
+
+ /* Wait for HCI_LE_CIS_Established */
+ return __hci_cmd_sync_status_sk(hdev, HCI_OP_LE_CREATE_CIS,
+ struct_size(cmd, cis, cmd->num_cis),
+ cmd, HCI_EVT_LE_CIS_ESTABLISHED,
+ conn->conn_timeout, NULL);
+}
+
+int hci_le_remove_cig_sync(struct hci_dev *hdev, u8 handle)
+{
+ struct hci_cp_le_remove_cig cp;
+
+ memset(&cp, 0, sizeof(cp));
+ cp.cig_id = handle;
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_LE_REMOVE_CIG, sizeof(cp),
+ &cp, HCI_CMD_TIMEOUT);
+}
+
+int hci_le_big_terminate_sync(struct hci_dev *hdev, u8 handle)
+{
+ struct hci_cp_le_big_term_sync cp;
+
+ memset(&cp, 0, sizeof(cp));
+ cp.handle = handle;
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_LE_BIG_TERM_SYNC,
+ sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+}
+
+int hci_le_pa_terminate_sync(struct hci_dev *hdev, u16 handle)
+{
+ struct hci_cp_le_pa_term_sync cp;
+
+ memset(&cp, 0, sizeof(cp));
+ cp.handle = cpu_to_le16(handle);
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_LE_PA_TERM_SYNC,
+ sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+}
+
+int hci_get_random_address(struct hci_dev *hdev, bool require_privacy,
+ bool use_rpa, struct adv_info *adv_instance,
+ u8 *own_addr_type, bdaddr_t *rand_addr)
+{
+ int err;
+
+ bacpy(rand_addr, BDADDR_ANY);
+
+ /* If privacy is enabled use a resolvable private address. If
+ * current RPA has expired then generate a new one.
+ */
+ if (use_rpa) {
+ /* If Controller supports LL Privacy use own address type is
+ * 0x03
+ */
+ if (ll_privacy_capable(hdev))
+ *own_addr_type = ADDR_LE_DEV_RANDOM_RESOLVED;
+ else
+ *own_addr_type = ADDR_LE_DEV_RANDOM;
+
+ if (adv_instance) {
+ if (adv_rpa_valid(adv_instance))
+ return 0;
+ } else {
+ if (rpa_valid(hdev))
+ return 0;
+ }
+
+ err = smp_generate_rpa(hdev, hdev->irk, &hdev->rpa);
+ if (err < 0) {
+ bt_dev_err(hdev, "failed to generate new RPA");
+ return err;
+ }
+
+ bacpy(rand_addr, &hdev->rpa);
+
+ return 0;
+ }
+
+ /* In case of required privacy without resolvable private address,
+ * use an non-resolvable private address. This is useful for
+ * non-connectable advertising.
+ */
+ if (require_privacy) {
+ bdaddr_t nrpa;
+
+ while (true) {
+ /* The non-resolvable private address is generated
+ * from random six bytes with the two most significant
+ * bits cleared.
+ */
+ get_random_bytes(&nrpa, 6);
+ nrpa.b[5] &= 0x3f;
+
+ /* The non-resolvable private address shall not be
+ * equal to the public address.
+ */
+ if (bacmp(&hdev->bdaddr, &nrpa))
+ break;
+ }
+
+ *own_addr_type = ADDR_LE_DEV_RANDOM;
+ bacpy(rand_addr, &nrpa);
+
+ return 0;
+ }
+
+ /* No privacy, use the current address */
+ hci_copy_identity_address(hdev, rand_addr, own_addr_type);
+
+ return 0;
+}
+
+static int _update_adv_data_sync(struct hci_dev *hdev, void *data)
+{
+ u8 instance = PTR_UINT(data);
+
+ return hci_update_adv_data_sync(hdev, instance);
+}
+
+int hci_update_adv_data(struct hci_dev *hdev, u8 instance)
+{
+ return hci_cmd_sync_queue(hdev, _update_adv_data_sync,
+ UINT_PTR(instance), NULL);
+}
+
+static int hci_acl_create_conn_sync(struct hci_dev *hdev, void *data)
+{
+ struct hci_conn *conn = data;
+ struct inquiry_entry *ie;
+ struct hci_cp_create_conn cp;
+ int err;
+
+ if (!hci_conn_valid(hdev, conn))
+ return -ECANCELED;
+
+ /* Many controllers disallow HCI Create Connection while it is doing
+ * HCI Inquiry. So we cancel the Inquiry first before issuing HCI Create
+ * Connection. This may cause the MGMT discovering state to become false
+ * without user space's request but it is okay since the MGMT Discovery
+ * APIs do not promise that discovery should be done forever. Instead,
+ * the user space monitors the status of MGMT discovering and it may
+ * request for discovery again when this flag becomes false.
+ */
+ if (test_bit(HCI_INQUIRY, &hdev->flags)) {
+ err = __hci_cmd_sync_status(hdev, HCI_OP_INQUIRY_CANCEL, 0,
+ NULL, HCI_CMD_TIMEOUT);
+ if (err)
+ bt_dev_warn(hdev, "Failed to cancel inquiry %d", err);
+ }
+
+ conn->state = BT_CONNECT;
+ conn->out = true;
+ conn->role = HCI_ROLE_MASTER;
+
+ conn->attempt++;
+
+ conn->link_policy = hdev->link_policy;
+
+ memset(&cp, 0, sizeof(cp));
+ bacpy(&cp.bdaddr, &conn->dst);
+ cp.pscan_rep_mode = 0x02;
+
+ ie = hci_inquiry_cache_lookup(hdev, &conn->dst);
+ if (ie) {
+ if (inquiry_entry_age(ie) <= INQUIRY_ENTRY_AGE_MAX) {
+ cp.pscan_rep_mode = ie->data.pscan_rep_mode;
+ cp.pscan_mode = ie->data.pscan_mode;
+ cp.clock_offset = ie->data.clock_offset |
+ cpu_to_le16(0x8000);
+ }
+
+ memcpy(conn->dev_class, ie->data.dev_class, 3);
+ }
+
+ cp.pkt_type = cpu_to_le16(conn->pkt_type);
+ if (lmp_rswitch_capable(hdev) && !(hdev->link_mode & HCI_LM_MASTER))
+ cp.role_switch = 0x01;
+ else
+ cp.role_switch = 0x00;
+
+ return __hci_cmd_sync_status_sk(hdev, HCI_OP_CREATE_CONN,
+ sizeof(cp), &cp,
+ HCI_EV_CONN_COMPLETE,
+ conn->conn_timeout, NULL);
+}
+
+int hci_connect_acl_sync(struct hci_dev *hdev, struct hci_conn *conn)
+{
+ return hci_cmd_sync_queue_once(hdev, hci_acl_create_conn_sync, conn,
+ NULL);
+}
+
+static void create_le_conn_complete(struct hci_dev *hdev, void *data, int err)
+{
+ struct hci_conn *conn = data;
+
+ bt_dev_dbg(hdev, "err %d", err);
+
+ if (err == -ECANCELED)
+ return;
+
+ hci_dev_lock(hdev);
+
+ if (!hci_conn_valid(hdev, conn))
+ goto done;
+
+ if (!err) {
+ hci_connect_le_scan_cleanup(conn, 0x00);
+ goto done;
+ }
+
+ /* Check if connection is still pending */
+ if (conn != hci_lookup_le_connect(hdev))
+ goto done;
+
+ /* Flush to make sure we send create conn cancel command if needed */
+ flush_delayed_work(&conn->le_conn_timeout);
+ hci_conn_failed(conn, bt_status(err));
+
+done:
+ hci_dev_unlock(hdev);
+}
+
+int hci_connect_le_sync(struct hci_dev *hdev, struct hci_conn *conn)
+{
+ return hci_cmd_sync_queue_once(hdev, hci_le_create_conn_sync, conn,
+ create_le_conn_complete);
+}
+
+int hci_cancel_connect_sync(struct hci_dev *hdev, struct hci_conn *conn)
+{
+ if (conn->state != BT_OPEN)
+ return -EINVAL;
+
+ switch (conn->type) {
+ case ACL_LINK:
+ return !hci_cmd_sync_dequeue_once(hdev,
+ hci_acl_create_conn_sync,
+ conn, NULL);
+ case LE_LINK:
+ return !hci_cmd_sync_dequeue_once(hdev, hci_le_create_conn_sync,
+ conn, create_le_conn_complete);
+ }
+
+ return -ENOENT;
+}
+
+int hci_le_conn_update_sync(struct hci_dev *hdev, struct hci_conn *conn,
+ struct hci_conn_params *params)
+{
+ struct hci_cp_le_conn_update cp;
+
+ memset(&cp, 0, sizeof(cp));
+ cp.handle = cpu_to_le16(conn->handle);
+ cp.conn_interval_min = cpu_to_le16(params->conn_min_interval);
+ cp.conn_interval_max = cpu_to_le16(params->conn_max_interval);
+ cp.conn_latency = cpu_to_le16(params->conn_latency);
+ cp.supervision_timeout = cpu_to_le16(params->supervision_timeout);
+ cp.min_ce_len = cpu_to_le16(0x0000);
+ cp.max_ce_len = cpu_to_le16(0x0000);
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_LE_CONN_UPDATE,
+ sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+}
+
+static void create_pa_complete(struct hci_dev *hdev, void *data, int err)
+{
+ struct hci_conn *conn = data;
+ struct hci_conn *pa_sync;
+
+ bt_dev_dbg(hdev, "err %d", err);
+
+ if (err == -ECANCELED)
+ return;
+
+ hci_dev_lock(hdev);
+
+ if (hci_conn_valid(hdev, conn))
+ clear_bit(HCI_CONN_CREATE_PA_SYNC, &conn->flags);
+
+ if (!err)
+ goto unlock;
+
+ /* Add connection to indicate PA sync error */
+ pa_sync = hci_conn_add_unset(hdev, PA_LINK, BDADDR_ANY, 0,
+ HCI_ROLE_SLAVE);
+
+ if (IS_ERR(pa_sync))
+ goto unlock;
+
+ set_bit(HCI_CONN_PA_SYNC_FAILED, &pa_sync->flags);
+
+ /* Notify iso layer */
+ hci_connect_cfm(pa_sync, bt_status(err));
+
+unlock:
+ hci_dev_unlock(hdev);
+}
+
+static int hci_le_past_params_sync(struct hci_dev *hdev, struct hci_conn *conn,
+ struct hci_conn *acl, struct bt_iso_qos *qos)
+{
+ struct hci_cp_le_past_params cp;
+ int err;
+
+ memset(&cp, 0, sizeof(cp));
+ cp.handle = cpu_to_le16(acl->handle);
+ /* An HCI_LE_Periodic_Advertising_Sync_Transfer_Received event is sent
+ * to the Host. HCI_LE_Periodic_Advertising_Report events will be
+ * enabled with duplicate filtering enabled.
+ */
+ cp.mode = 0x03;
+ cp.skip = cpu_to_le16(qos->bcast.skip);
+ cp.sync_timeout = cpu_to_le16(qos->bcast.sync_timeout);
+ cp.cte_type = qos->bcast.sync_cte_type;
+
+ /* HCI_LE_PAST_PARAMS command returns a command complete event so it
+ * cannot wait for HCI_EV_LE_PAST_RECEIVED.
+ */
+ err = __hci_cmd_sync_status(hdev, HCI_OP_LE_PAST_PARAMS,
+ sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+ if (err)
+ return err;
+
+ /* Wait for HCI_EV_LE_PAST_RECEIVED event */
+ return __hci_cmd_sync_status_sk(hdev, HCI_OP_NOP, 0, NULL,
+ HCI_EV_LE_PAST_RECEIVED,
+ conn->conn_timeout, NULL);
+}
+
+static int hci_le_pa_create_sync(struct hci_dev *hdev, void *data)
+{
+ struct hci_cp_le_pa_create_sync cp;
+ struct hci_conn *conn = data, *le;
+ struct bt_iso_qos *qos = &conn->iso_qos;
+ int err;
+
+ if (!hci_conn_valid(hdev, conn))
+ return -ECANCELED;
+
+ if (conn->sync_handle != HCI_SYNC_HANDLE_INVALID)
+ return -EINVAL;
+
+ if (hci_dev_test_and_set_flag(hdev, HCI_PA_SYNC))
+ return -EBUSY;
+
+ /* Stop scanning if SID has not been set and active scanning is enabled
+ * so we use passive scanning which will be scanning using the allow
+ * list programmed to contain only the connection address.
+ */
+ if (conn->sid == HCI_SID_INVALID &&
+ hci_dev_test_flag(hdev, HCI_LE_SCAN)) {
+ hci_scan_disable_sync(hdev);
+ hci_dev_set_flag(hdev, HCI_LE_SCAN_INTERRUPTED);
+ hci_discovery_set_state(hdev, DISCOVERY_STOPPED);
+ }
+
+ /* Mark HCI_CONN_CREATE_PA_SYNC so hci_update_passive_scan_sync can
+ * program the address in the allow list so PA advertisements can be
+ * received.
+ */
+ set_bit(HCI_CONN_CREATE_PA_SYNC, &conn->flags);
+
+ hci_update_passive_scan_sync(hdev);
+
+ /* Check if PAST is possible:
+ *
+ * 1. Check if an ACL connection with the destination address exists
+ * 2. Check if that HCI_CONN_FLAG_PAST has been set which indicates that
+ * user really intended to use PAST.
+ */
+ le = hci_conn_hash_lookup_le(hdev, &conn->dst, conn->dst_type);
+ if (le) {
+ struct hci_conn_params *params;
+
+ params = hci_conn_params_lookup(hdev, &le->dst, le->dst_type);
+ if (params && params->flags & HCI_CONN_FLAG_PAST) {
+ err = hci_le_past_params_sync(hdev, conn, le, qos);
+ if (!err)
+ goto done;
+ }
+ }
+
+ /* SID has not been set listen for HCI_EV_LE_EXT_ADV_REPORT to update
+ * it.
+ */
+ if (conn->sid == HCI_SID_INVALID) {
+ err = __hci_cmd_sync_status_sk(hdev, HCI_OP_NOP, 0, NULL,
+ HCI_EV_LE_EXT_ADV_REPORT,
+ conn->conn_timeout, NULL);
+ if (err == -ETIMEDOUT)
+ goto done;
+ }
+
+ memset(&cp, 0, sizeof(cp));
+ cp.options = qos->bcast.options;
+ cp.sid = conn->sid;
+ cp.addr_type = conn->dst_type;
+ bacpy(&cp.addr, &conn->dst);
+ cp.skip = cpu_to_le16(qos->bcast.skip);
+ cp.sync_timeout = cpu_to_le16(qos->bcast.sync_timeout);
+ cp.sync_cte_type = qos->bcast.sync_cte_type;
+
+ /* The spec allows only one pending LE Periodic Advertising Create
+ * Sync command at a time so we forcefully wait for PA Sync Established
+ * event since cmd_work can only schedule one command at a time.
+ *
+ * BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E
+ * page 2493:
+ *
+ * If the Host issues this command when another HCI_LE_Periodic_
+ * Advertising_Create_Sync command is pending, the Controller shall
+ * return the error code Command Disallowed (0x0C).
+ */
+ err = __hci_cmd_sync_status_sk(hdev, HCI_OP_LE_PA_CREATE_SYNC,
+ sizeof(cp), &cp,
+ HCI_EV_LE_PA_SYNC_ESTABLISHED,
+ conn->conn_timeout, NULL);
+ if (err == -ETIMEDOUT)
+ __hci_cmd_sync_status(hdev, HCI_OP_LE_PA_CREATE_SYNC_CANCEL,
+ 0, NULL, HCI_CMD_TIMEOUT);
+
+done:
+ hci_dev_clear_flag(hdev, HCI_PA_SYNC);
+
+ /* Update passive scan since HCI_PA_SYNC flag has been cleared */
+ hci_update_passive_scan_sync(hdev);
+
+ return err;
+}
+
+int hci_connect_pa_sync(struct hci_dev *hdev, struct hci_conn *conn)
+{
+ return hci_cmd_sync_queue_once(hdev, hci_le_pa_create_sync, conn,
+ create_pa_complete);
+}
+
+static void create_big_complete(struct hci_dev *hdev, void *data, int err)
+{
+ struct hci_conn *conn = data;
+
+ bt_dev_dbg(hdev, "err %d", err);
+
+ if (err == -ECANCELED)
+ return;
+
+ if (hci_conn_valid(hdev, conn))
+ clear_bit(HCI_CONN_CREATE_BIG_SYNC, &conn->flags);
+}
+
+static int hci_le_big_create_sync(struct hci_dev *hdev, void *data)
+{
+ DEFINE_FLEX(struct hci_cp_le_big_create_sync, cp, bis, num_bis, 0x11);
+ struct hci_conn *conn = data;
+ struct bt_iso_qos *qos = &conn->iso_qos;
+ int err;
+
+ if (!hci_conn_valid(hdev, conn))
+ return -ECANCELED;
+
+ set_bit(HCI_CONN_CREATE_BIG_SYNC, &conn->flags);
+
+ memset(cp, 0, sizeof(*cp));
+ cp->handle = qos->bcast.big;
+ cp->sync_handle = cpu_to_le16(conn->sync_handle);
+ cp->encryption = qos->bcast.encryption;
+ memcpy(cp->bcode, qos->bcast.bcode, sizeof(cp->bcode));
+ cp->mse = qos->bcast.mse;
+ cp->timeout = cpu_to_le16(qos->bcast.timeout);
+ cp->num_bis = conn->num_bis;
+ memcpy(cp->bis, conn->bis, conn->num_bis);
+
+ /* The spec allows only one pending LE BIG Create Sync command at
+ * a time, so we forcefully wait for BIG Sync Established event since
+ * cmd_work can only schedule one command at a time.
+ *
+ * BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E
+ * page 2586:
+ *
+ * If the Host sends this command when the Controller is in the
+ * process of synchronizing to any BIG, i.e. the HCI_LE_BIG_Sync_
+ * Established event has not been generated, the Controller shall
+ * return the error code Command Disallowed (0x0C).
+ */
+ err = __hci_cmd_sync_status_sk(hdev, HCI_OP_LE_BIG_CREATE_SYNC,
+ struct_size(cp, bis, cp->num_bis), cp,
+ HCI_EVT_LE_BIG_SYNC_ESTABLISHED,
+ conn->conn_timeout, NULL);
+ if (err == -ETIMEDOUT)
+ hci_le_big_terminate_sync(hdev, cp->handle);
+
+ return err;
+}
+
+int hci_connect_big_sync(struct hci_dev *hdev, struct hci_conn *conn)
+{
+ return hci_cmd_sync_queue_once(hdev, hci_le_big_create_sync, conn,
+ create_big_complete);
+}
+
+struct past_data {
+ struct hci_conn *conn;
+ struct hci_conn *le;
+};
+
+static void past_complete(struct hci_dev *hdev, void *data, int err)
+{
+ struct past_data *past = data;
+
+ bt_dev_dbg(hdev, "err %d", err);
+
+ kfree(past);
+}
+
+static int hci_le_past_set_info_sync(struct hci_dev *hdev, void *data)
+{
+ struct past_data *past = data;
+ struct hci_cp_le_past_set_info cp;
+
+ hci_dev_lock(hdev);
+
+ if (!hci_conn_valid(hdev, past->conn) ||
+ !hci_conn_valid(hdev, past->le)) {
+ hci_dev_unlock(hdev);
+ return -ECANCELED;
+ }
+
+ memset(&cp, 0, sizeof(cp));
+ cp.handle = cpu_to_le16(past->le->handle);
+ cp.adv_handle = past->conn->iso_qos.bcast.bis;
+
+ hci_dev_unlock(hdev);
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_LE_PAST_SET_INFO,
+ sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+}
+
+static int hci_le_past_sync(struct hci_dev *hdev, void *data)
+{
+ struct past_data *past = data;
+ struct hci_cp_le_past cp;
+
+ hci_dev_lock(hdev);
+
+ if (!hci_conn_valid(hdev, past->conn) ||
+ !hci_conn_valid(hdev, past->le)) {
+ hci_dev_unlock(hdev);
+ return -ECANCELED;
+ }
+
+ memset(&cp, 0, sizeof(cp));
+ cp.handle = cpu_to_le16(past->le->handle);
+ cp.sync_handle = cpu_to_le16(past->conn->sync_handle);
+
+ hci_dev_unlock(hdev);
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_LE_PAST,
+ sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+}
+
+int hci_past_sync(struct hci_conn *conn, struct hci_conn *le)
+{
+ struct past_data *data;
+ int err;
+
+ if (conn->type != BIS_LINK && conn->type != PA_LINK)
+ return -EINVAL;
+
+ if (!past_sender_capable(conn->hdev))
+ return -EOPNOTSUPP;
+
+ data = kmalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+
+ data->conn = conn;
+ data->le = le;
+
+ if (conn->role == HCI_ROLE_MASTER)
+ err = hci_cmd_sync_queue_once(conn->hdev,
+ hci_le_past_set_info_sync, data,
+ past_complete);
+ else
+ err = hci_cmd_sync_queue_once(conn->hdev, hci_le_past_sync,
+ data, past_complete);
+
+ if (err)
+ kfree(data);
+
+ return err;
+}
+
+static void le_read_features_complete(struct hci_dev *hdev, void *data, int err)
+{
+ struct hci_conn *conn = data;
+
+ bt_dev_dbg(hdev, "err %d", err);
+
+ if (err == -ECANCELED)
+ return;
+
+ hci_conn_drop(conn);
+}
+
+static int hci_le_read_all_remote_features_sync(struct hci_dev *hdev,
+ void *data)
+{
+ struct hci_conn *conn = data;
+ struct hci_cp_le_read_all_remote_features cp;
+
+ memset(&cp, 0, sizeof(cp));
+ cp.handle = cpu_to_le16(conn->handle);
+ cp.pages = 10; /* Attempt to read all pages */
+
+ /* Wait for HCI_EVT_LE_ALL_REMOTE_FEATURES_COMPLETE event otherwise
+ * hci_conn_drop may run prematurely causing a disconnection.
+ */
+ return __hci_cmd_sync_status_sk(hdev,
+ HCI_OP_LE_READ_ALL_REMOTE_FEATURES,
+ sizeof(cp), &cp,
+ HCI_EVT_LE_ALL_REMOTE_FEATURES_COMPLETE,
+ HCI_CMD_TIMEOUT, NULL);
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_LE_READ_ALL_REMOTE_FEATURES,
+ sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+}
+
+static int hci_le_read_remote_features_sync(struct hci_dev *hdev, void *data)
+{
+ struct hci_conn *conn = data;
+ struct hci_cp_le_read_remote_features cp;
+
+ if (!hci_conn_valid(hdev, conn))
+ return -ECANCELED;
+
+ /* Check if LL Extended Feature Set is supported and
+ * HCI_OP_LE_READ_ALL_REMOTE_FEATURES is supported then use that to read
+ * all features.
+ */
+ if (ll_ext_feature_capable(hdev) && hdev->commands[47] & BIT(3))
+ return hci_le_read_all_remote_features_sync(hdev, data);
+
+ memset(&cp, 0, sizeof(cp));
+ cp.handle = cpu_to_le16(conn->handle);
+
+ /* Wait for HCI_EV_LE_REMOTE_FEAT_COMPLETE event otherwise
+ * hci_conn_drop may run prematurely causing a disconnection.
+ */
+ return __hci_cmd_sync_status_sk(hdev, HCI_OP_LE_READ_REMOTE_FEATURES,
+ sizeof(cp), &cp,
+ HCI_EV_LE_REMOTE_FEAT_COMPLETE,
+ HCI_CMD_TIMEOUT, NULL);
+}
+
+int hci_le_read_remote_features(struct hci_conn *conn)
+{
+ struct hci_dev *hdev = conn->hdev;
+ int err;
+
+ /* The remote features procedure is defined for central
+ * role only. So only in case of an initiated connection
+ * request the remote features.
+ *
+ * If the local controller supports peripheral-initiated features
+ * exchange, then requesting the remote features in peripheral
+ * role is possible. Otherwise just transition into the
+ * connected state without requesting the remote features.
+ */
+ if (conn->out || (hdev->le_features[0] & HCI_LE_PERIPHERAL_FEATURES))
+ err = hci_cmd_sync_queue_once(hdev,
+ hci_le_read_remote_features_sync,
+ hci_conn_hold(conn),
+ le_read_features_complete);
+ else
+ err = -EOPNOTSUPP;
+
+ return err;
+}
diff --git a/net/bluetooth/hci_sysfs.c b/net/bluetooth/hci_sysfs.c
index 3eeeb7a86e75..041ce9adc378 100644
--- a/net/bluetooth/hci_sysfs.c
+++ b/net/bluetooth/hci_sysfs.c
@@ -1,361 +1,136 @@
+// SPDX-License-Identifier: GPL-2.0
/* Bluetooth HCI driver model support. */
-#include <linux/kernel.h>
-#include <linux/init.h>
-
-#include <linux/platform_device.h>
+#include <linux/module.h>
#include <net/bluetooth/bluetooth.h>
#include <net/bluetooth/hci_core.h>
-#ifndef CONFIG_BT_HCI_CORE_DEBUG
-#undef BT_DBG
-#define BT_DBG(D...)
-#endif
-
-static inline char *typetostr(int type)
-{
- switch (type) {
- case HCI_VIRTUAL:
- return "VIRTUAL";
- case HCI_USB:
- return "USB";
- case HCI_PCCARD:
- return "PCCARD";
- case HCI_UART:
- return "UART";
- case HCI_RS232:
- return "RS232";
- case HCI_PCI:
- return "PCI";
- case HCI_SDIO:
- return "SDIO";
- default:
- return "UNKNOWN";
- }
-}
-
-static ssize_t show_type(struct device *dev, struct device_attribute *attr, char *buf)
-{
- struct hci_dev *hdev = dev_get_drvdata(dev);
- return sprintf(buf, "%s\n", typetostr(hdev->type));
-}
-
-static ssize_t show_address(struct device *dev, struct device_attribute *attr, char *buf)
-{
- struct hci_dev *hdev = dev_get_drvdata(dev);
- bdaddr_t bdaddr;
- baswap(&bdaddr, &hdev->bdaddr);
- return sprintf(buf, "%s\n", batostr(&bdaddr));
-}
+static const struct class bt_class = {
+ .name = "bluetooth",
+};
-static ssize_t show_manufacturer(struct device *dev, struct device_attribute *attr, char *buf)
+static void bt_link_release(struct device *dev)
{
- struct hci_dev *hdev = dev_get_drvdata(dev);
- return sprintf(buf, "%d\n", hdev->manufacturer);
+ struct hci_conn *conn = to_hci_conn(dev);
+ kfree(conn);
}
-static ssize_t show_hci_version(struct device *dev, struct device_attribute *attr, char *buf)
-{
- struct hci_dev *hdev = dev_get_drvdata(dev);
- return sprintf(buf, "%d\n", hdev->hci_ver);
-}
+static const struct device_type bt_link = {
+ .name = "link",
+ .release = bt_link_release,
+};
-static ssize_t show_hci_revision(struct device *dev, struct device_attribute *attr, char *buf)
+void hci_conn_init_sysfs(struct hci_conn *conn)
{
- struct hci_dev *hdev = dev_get_drvdata(dev);
- return sprintf(buf, "%d\n", hdev->hci_rev);
-}
+ struct hci_dev *hdev = conn->hdev;
-static ssize_t show_inquiry_cache(struct device *dev, struct device_attribute *attr, char *buf)
-{
- struct hci_dev *hdev = dev_get_drvdata(dev);
- struct inquiry_cache *cache = &hdev->inq_cache;
- struct inquiry_entry *e;
- int n = 0;
-
- hci_dev_lock_bh(hdev);
-
- for (e = cache->list; e; e = e->next) {
- struct inquiry_data *data = &e->data;
- bdaddr_t bdaddr;
- baswap(&bdaddr, &data->bdaddr);
- n += sprintf(buf + n, "%s %d %d %d 0x%.2x%.2x%.2x 0x%.4x %d %u\n",
- batostr(&bdaddr),
- data->pscan_rep_mode, data->pscan_period_mode, data->pscan_mode,
- data->dev_class[2], data->dev_class[1], data->dev_class[0],
- __le16_to_cpu(data->clock_offset), data->rssi, e->timestamp);
- }
+ bt_dev_dbg(hdev, "conn %p", conn);
- hci_dev_unlock_bh(hdev);
- return n;
-}
+ conn->dev.type = &bt_link;
+ conn->dev.class = &bt_class;
+ conn->dev.parent = &hdev->dev;
-static ssize_t show_idle_timeout(struct device *dev, struct device_attribute *attr, char *buf)
-{
- struct hci_dev *hdev = dev_get_drvdata(dev);
- return sprintf(buf, "%d\n", hdev->idle_timeout);
+ device_initialize(&conn->dev);
}
-static ssize_t store_idle_timeout(struct device *dev, struct device_attribute *attr, const char *buf, size_t count)
+void hci_conn_add_sysfs(struct hci_conn *conn)
{
- struct hci_dev *hdev = dev_get_drvdata(dev);
- char *ptr;
- __u32 val;
+ struct hci_dev *hdev = conn->hdev;
- val = simple_strtoul(buf, &ptr, 10);
- if (ptr == buf)
- return -EINVAL;
+ bt_dev_dbg(hdev, "conn %p", conn);
- if (val != 0 && (val < 500 || val > 3600000))
- return -EINVAL;
+ if (device_is_registered(&conn->dev))
+ return;
- hdev->idle_timeout = val;
+ dev_set_name(&conn->dev, "%s:%d", hdev->name, conn->handle);
- return count;
+ if (device_add(&conn->dev) < 0)
+ bt_dev_err(hdev, "failed to register connection device");
}
-static ssize_t show_sniff_max_interval(struct device *dev, struct device_attribute *attr, char *buf)
-{
- struct hci_dev *hdev = dev_get_drvdata(dev);
- return sprintf(buf, "%d\n", hdev->sniff_max_interval);
-}
-
-static ssize_t store_sniff_max_interval(struct device *dev, struct device_attribute *attr, const char *buf, size_t count)
+void hci_conn_del_sysfs(struct hci_conn *conn)
{
- struct hci_dev *hdev = dev_get_drvdata(dev);
- char *ptr;
- __u16 val;
-
- val = simple_strtoul(buf, &ptr, 10);
- if (ptr == buf)
- return -EINVAL;
+ struct hci_dev *hdev = conn->hdev;
- if (val < 0x0002 || val > 0xFFFE || val % 2)
- return -EINVAL;
+ bt_dev_dbg(hdev, "conn %p", conn);
- if (val < hdev->sniff_min_interval)
- return -EINVAL;
+ if (!device_is_registered(&conn->dev)) {
+ /* If device_add() has *not* succeeded, use *only* put_device()
+ * to drop the reference count.
+ */
+ put_device(&conn->dev);
+ return;
+ }
- hdev->sniff_max_interval = val;
+ /* If there are devices using the connection as parent reset it to NULL
+ * before unregistering the device.
+ */
+ while (1) {
+ struct device *dev;
+
+ dev = device_find_any_child(&conn->dev);
+ if (!dev)
+ break;
+ device_move(dev, NULL, DPM_ORDER_DEV_LAST);
+ put_device(dev);
+ }
- return count;
+ device_unregister(&conn->dev);
}
-static ssize_t show_sniff_min_interval(struct device *dev, struct device_attribute *attr, char *buf)
+static void bt_host_release(struct device *dev)
{
- struct hci_dev *hdev = dev_get_drvdata(dev);
- return sprintf(buf, "%d\n", hdev->sniff_min_interval);
+ struct hci_dev *hdev = to_hci_dev(dev);
+
+ if (hci_dev_test_flag(hdev, HCI_UNREGISTER))
+ hci_release_dev(hdev);
+ else
+ kfree(hdev);
+ module_put(THIS_MODULE);
}
-static ssize_t store_sniff_min_interval(struct device *dev, struct device_attribute *attr, const char *buf, size_t count)
+static ssize_t reset_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
{
- struct hci_dev *hdev = dev_get_drvdata(dev);
- char *ptr;
- __u16 val;
-
- val = simple_strtoul(buf, &ptr, 10);
- if (ptr == buf)
- return -EINVAL;
-
- if (val < 0x0002 || val > 0xFFFE || val % 2)
- return -EINVAL;
+ struct hci_dev *hdev = to_hci_dev(dev);
- if (val > hdev->sniff_max_interval)
- return -EINVAL;
-
- hdev->sniff_min_interval = val;
+ if (hdev->reset)
+ hdev->reset(hdev);
return count;
}
+static DEVICE_ATTR_WO(reset);
-static DEVICE_ATTR(type, S_IRUGO, show_type, NULL);
-static DEVICE_ATTR(address, S_IRUGO, show_address, NULL);
-static DEVICE_ATTR(manufacturer, S_IRUGO, show_manufacturer, NULL);
-static DEVICE_ATTR(hci_version, S_IRUGO, show_hci_version, NULL);
-static DEVICE_ATTR(hci_revision, S_IRUGO, show_hci_revision, NULL);
-static DEVICE_ATTR(inquiry_cache, S_IRUGO, show_inquiry_cache, NULL);
-
-static DEVICE_ATTR(idle_timeout, S_IRUGO | S_IWUSR,
- show_idle_timeout, store_idle_timeout);
-static DEVICE_ATTR(sniff_max_interval, S_IRUGO | S_IWUSR,
- show_sniff_max_interval, store_sniff_max_interval);
-static DEVICE_ATTR(sniff_min_interval, S_IRUGO | S_IWUSR,
- show_sniff_min_interval, store_sniff_min_interval);
-
-static struct device_attribute *bt_attrs[] = {
- &dev_attr_type,
- &dev_attr_address,
- &dev_attr_manufacturer,
- &dev_attr_hci_version,
- &dev_attr_hci_revision,
- &dev_attr_inquiry_cache,
- &dev_attr_idle_timeout,
- &dev_attr_sniff_max_interval,
- &dev_attr_sniff_min_interval,
- NULL
-};
-
-static ssize_t show_conn_type(struct device *dev, struct device_attribute *attr, char *buf)
-{
- struct hci_conn *conn = dev_get_drvdata(dev);
- return sprintf(buf, "%s\n", conn->type == ACL_LINK ? "ACL" : "SCO");
-}
-
-static ssize_t show_conn_address(struct device *dev, struct device_attribute *attr, char *buf)
-{
- struct hci_conn *conn = dev_get_drvdata(dev);
- bdaddr_t bdaddr;
- baswap(&bdaddr, &conn->dst);
- return sprintf(buf, "%s\n", batostr(&bdaddr));
-}
-
-#define CONN_ATTR(_name,_mode,_show,_store) \
-struct device_attribute conn_attr_##_name = __ATTR(_name,_mode,_show,_store)
-
-static CONN_ATTR(type, S_IRUGO, show_conn_type, NULL);
-static CONN_ATTR(address, S_IRUGO, show_conn_address, NULL);
-
-static struct device_attribute *conn_attrs[] = {
- &conn_attr_type,
- &conn_attr_address,
- NULL
+static struct attribute *bt_host_attrs[] = {
+ &dev_attr_reset.attr,
+ NULL,
};
+ATTRIBUTE_GROUPS(bt_host);
-struct class *bt_class = NULL;
-EXPORT_SYMBOL_GPL(bt_class);
-
-static struct bus_type bt_bus = {
- .name = "bluetooth",
+static const struct device_type bt_host = {
+ .name = "host",
+ .release = bt_host_release,
+ .groups = bt_host_groups,
};
-static struct platform_device *bt_platform;
-
-static void bt_release(struct device *dev)
-{
- void *data = dev_get_drvdata(dev);
- kfree(data);
-}
-
-static void add_conn(void *data)
-{
- struct hci_conn *conn = data;
- int i;
-
- if (device_register(&conn->dev) < 0) {
- BT_ERR("Failed to register connection device");
- return;
- }
-
- for (i = 0; conn_attrs[i]; i++)
- if (device_create_file(&conn->dev, conn_attrs[i]) < 0)
- BT_ERR("Failed to create connection attribute");
-}
-
-void hci_conn_add_sysfs(struct hci_conn *conn)
-{
- struct hci_dev *hdev = conn->hdev;
- bdaddr_t *ba = &conn->dst;
-
- BT_DBG("conn %p", conn);
-
- conn->dev.bus = &bt_bus;
- conn->dev.parent = &hdev->dev;
-
- conn->dev.release = bt_release;
-
- snprintf(conn->dev.bus_id, BUS_ID_SIZE,
- "%s%2.2X%2.2X%2.2X%2.2X%2.2X%2.2X",
- conn->type == ACL_LINK ? "acl" : "sco",
- ba->b[5], ba->b[4], ba->b[3],
- ba->b[2], ba->b[1], ba->b[0]);
-
- dev_set_drvdata(&conn->dev, conn);
-
- INIT_WORK(&conn->work, add_conn, (void *) conn);
-
- schedule_work(&conn->work);
-}
-
-static void del_conn(void *data)
-{
- struct hci_conn *conn = data;
- device_del(&conn->dev);
-}
-
-void hci_conn_del_sysfs(struct hci_conn *conn)
-{
- BT_DBG("conn %p", conn);
-
- INIT_WORK(&conn->work, del_conn, (void *) conn);
-
- schedule_work(&conn->work);
-}
-
-int hci_register_sysfs(struct hci_dev *hdev)
+void hci_init_sysfs(struct hci_dev *hdev)
{
struct device *dev = &hdev->dev;
- unsigned int i;
- int err;
-
- BT_DBG("%p name %s type %d", hdev, hdev->name, hdev->type);
-
- dev->class = bt_class;
- dev->parent = hdev->parent;
-
- strlcpy(dev->bus_id, hdev->name, BUS_ID_SIZE);
-
- dev->release = bt_release;
-
- dev_set_drvdata(dev, hdev);
-
- err = device_register(dev);
- if (err < 0)
- return err;
- for (i = 0; bt_attrs[i]; i++)
- if (device_create_file(dev, bt_attrs[i]) < 0)
- BT_ERR("Failed to create device attribute");
+ dev->type = &bt_host;
+ dev->class = &bt_class;
- return 0;
-}
-
-void hci_unregister_sysfs(struct hci_dev *hdev)
-{
- BT_DBG("%p name %s type %d", hdev, hdev->name, hdev->type);
-
- device_del(&hdev->dev);
+ __module_get(THIS_MODULE);
+ device_initialize(dev);
}
int __init bt_sysfs_init(void)
{
- int err;
-
- bt_platform = platform_device_register_simple("bluetooth", -1, NULL, 0);
- if (IS_ERR(bt_platform))
- return PTR_ERR(bt_platform);
-
- err = bus_register(&bt_bus);
- if (err < 0) {
- platform_device_unregister(bt_platform);
- return err;
- }
-
- bt_class = class_create(THIS_MODULE, "bluetooth");
- if (IS_ERR(bt_class)) {
- bus_unregister(&bt_bus);
- platform_device_unregister(bt_platform);
- return PTR_ERR(bt_class);
- }
-
- return 0;
+ return class_register(&bt_class);
}
void bt_sysfs_cleanup(void)
{
- class_destroy(bt_class);
-
- bus_unregister(&bt_bus);
-
- platform_device_unregister(bt_platform);
+ class_unregister(&bt_class);
}
diff --git a/net/bluetooth/hidp/Kconfig b/net/bluetooth/hidp/Kconfig
index c6abf2a5a932..e08aae35351a 100644
--- a/net/bluetooth/hidp/Kconfig
+++ b/net/bluetooth/hidp/Kconfig
@@ -1,6 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0-only
config BT_HIDP
tristate "HIDP protocol support"
- depends on BT && BT_L2CAP && INPUT
+ depends on BT_BREDR && HID
help
HIDP (Human Interface Device Protocol) is a transport layer
for HID reports. HIDP is required for the Bluetooth Human
diff --git a/net/bluetooth/hidp/Makefile b/net/bluetooth/hidp/Makefile
index a9ee115696ae..f41b0aa02b23 100644
--- a/net/bluetooth/hidp/Makefile
+++ b/net/bluetooth/hidp/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# Makefile for the Linux Bluetooth HIDP layer
#
diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c
index 66782010f82c..6724adce615b 100644
--- a/net/bluetooth/hidp/core.c
+++ b/net/bluetooth/hidp/core.c
@@ -1,6 +1,7 @@
-/*
+/*
HIDP implementation for Linux Bluetooth stack (BlueZ).
Copyright (C) 2003-2004 Marcel Holtmann <marcel@holtmann.org>
+ Copyright (C) 2013 David Herrmann <dh.herrmann@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License version 2 as
@@ -10,34 +11,21 @@
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
- CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
- WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
- COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+ ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+ COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
SOFTWARE IS DISCLAIMED.
*/
+#include <linux/kref.h>
#include <linux/module.h>
-
-#include <linux/types.h>
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/poll.h>
-#include <linux/fcntl.h>
-#include <linux/skbuff.h>
-#include <linux/socket.h>
-#include <linux/ioctl.h>
#include <linux/file.h>
-#include <linux/init.h>
-#include <linux/wait.h>
-#include <net/sock.h>
-
-#include <linux/input.h>
+#include <linux/kthread.h>
+#include <linux/hidraw.h>
#include <net/bluetooth/bluetooth.h>
#include <net/bluetooth/hci_core.h>
@@ -45,94 +33,126 @@
#include "hidp.h"
-#ifndef CONFIG_BT_HIDP_DEBUG
-#undef BT_DBG
-#define BT_DBG(D...)
-#endif
-
-#define VERSION "1.1"
+#define VERSION "1.2"
static DECLARE_RWSEM(hidp_session_sem);
+static DECLARE_WAIT_QUEUE_HEAD(hidp_session_wq);
static LIST_HEAD(hidp_session_list);
static unsigned char hidp_keycode[256] = {
- 0, 0, 0, 0, 30, 48, 46, 32, 18, 33, 34, 35, 23, 36, 37, 38,
- 50, 49, 24, 25, 16, 19, 31, 20, 22, 47, 17, 45, 21, 44, 2, 3,
- 4, 5, 6, 7, 8, 9, 10, 11, 28, 1, 14, 15, 57, 12, 13, 26,
- 27, 43, 43, 39, 40, 41, 51, 52, 53, 58, 59, 60, 61, 62, 63, 64,
- 65, 66, 67, 68, 87, 88, 99, 70,119,110,102,104,111,107,109,106,
- 105,108,103, 69, 98, 55, 74, 78, 96, 79, 80, 81, 75, 76, 77, 71,
- 72, 73, 82, 83, 86,127,116,117,183,184,185,186,187,188,189,190,
- 191,192,193,194,134,138,130,132,128,129,131,137,133,135,136,113,
- 115,114, 0, 0, 0,121, 0, 89, 93,124, 92, 94, 95, 0, 0, 0,
- 122,123, 90, 91, 85, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 29, 42, 56,125, 97, 54,100,126,164,166,165,163,161,115,114,113,
- 150,158,159,128,136,177,178,176,142,152,173,140
+ 0, 0, 0, 0, 30, 48, 46, 32, 18, 33, 34, 35, 23, 36,
+ 37, 38, 50, 49, 24, 25, 16, 19, 31, 20, 22, 47, 17, 45,
+ 21, 44, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 28, 1,
+ 14, 15, 57, 12, 13, 26, 27, 43, 43, 39, 40, 41, 51, 52,
+ 53, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 87, 88,
+ 99, 70, 119, 110, 102, 104, 111, 107, 109, 106, 105, 108, 103, 69,
+ 98, 55, 74, 78, 96, 79, 80, 81, 75, 76, 77, 71, 72, 73,
+ 82, 83, 86, 127, 116, 117, 183, 184, 185, 186, 187, 188, 189, 190,
+ 191, 192, 193, 194, 134, 138, 130, 132, 128, 129, 131, 137, 133, 135,
+ 136, 113, 115, 114, 0, 0, 0, 121, 0, 89, 93, 124, 92, 94,
+ 95, 0, 0, 0, 122, 123, 90, 91, 85, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 29, 42, 56, 125, 97, 54, 100, 126, 164, 166, 165, 163, 161, 115,
+ 114, 113, 150, 158, 159, 128, 136, 177, 178, 176, 142, 152, 173, 140
};
static unsigned char hidp_mkeyspat[] = { 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 };
-static struct hidp_session *__hidp_get_session(bdaddr_t *bdaddr)
-{
- struct hidp_session *session;
- struct list_head *p;
-
- BT_DBG("");
-
- list_for_each(p, &hidp_session_list) {
- session = list_entry(p, struct hidp_session, list);
- if (!bacmp(bdaddr, &session->bdaddr))
- return session;
- }
- return NULL;
-}
+static int hidp_session_probe(struct l2cap_conn *conn,
+ struct l2cap_user *user);
+static void hidp_session_remove(struct l2cap_conn *conn,
+ struct l2cap_user *user);
+static int hidp_session_thread(void *arg);
+static void hidp_session_terminate(struct hidp_session *s);
-static void __hidp_link_session(struct hidp_session *session)
-{
- __module_get(THIS_MODULE);
- list_add(&session->list, &hidp_session_list);
-}
-
-static void __hidp_unlink_session(struct hidp_session *session)
-{
- list_del(&session->list);
- module_put(THIS_MODULE);
-}
-
-static void __hidp_copy_session(struct hidp_session *session, struct hidp_conninfo *ci)
+static void hidp_copy_session(struct hidp_session *session, struct hidp_conninfo *ci)
{
+ u32 valid_flags = 0;
+ memset(ci, 0, sizeof(*ci));
bacpy(&ci->bdaddr, &session->bdaddr);
- ci->flags = session->flags;
- ci->state = session->state;
-
- ci->vendor = 0x0000;
- ci->product = 0x0000;
- ci->version = 0x0000;
- memset(ci->name, 0, 128);
+ ci->flags = session->flags & valid_flags;
+ ci->state = BT_CONNECTED;
if (session->input) {
ci->vendor = session->input->id.vendor;
ci->product = session->input->id.product;
ci->version = session->input->id.version;
if (session->input->name)
- strncpy(ci->name, session->input->name, 128);
+ strscpy(ci->name, session->input->name, 128);
else
- strncpy(ci->name, "HID Boot Device", 128);
+ strscpy(ci->name, "HID Boot Device", 128);
+ } else if (session->hid) {
+ ci->vendor = session->hid->vendor;
+ ci->product = session->hid->product;
+ ci->version = session->hid->version;
+ strscpy(ci->name, session->hid->name, 128);
}
}
-static int hidp_input_event(struct input_dev *dev, unsigned int type, unsigned int code, int value)
+/* assemble skb, queue message on @transmit and wake up the session thread */
+static int hidp_send_message(struct hidp_session *session, struct socket *sock,
+ struct sk_buff_head *transmit, unsigned char hdr,
+ const unsigned char *data, int size)
{
- struct hidp_session *session = dev->private;
struct sk_buff *skb;
+ struct sock *sk = sock->sk;
+ int ret;
+
+ BT_DBG("session %p data %p size %d", session, data, size);
+
+ if (atomic_read(&session->terminate))
+ return -EIO;
+
+ skb = alloc_skb(size + 1, GFP_ATOMIC);
+ if (!skb) {
+ BT_ERR("Can't allocate memory for new frame");
+ return -ENOMEM;
+ }
+
+ skb_put_u8(skb, hdr);
+ if (data && size > 0) {
+ skb_put_data(skb, data, size);
+ ret = size;
+ } else {
+ ret = 0;
+ }
+
+ skb_queue_tail(transmit, skb);
+ wake_up_interruptible(sk_sleep(sk));
+
+ return ret;
+}
+
+static int hidp_send_ctrl_message(struct hidp_session *session,
+ unsigned char hdr, const unsigned char *data,
+ int size)
+{
+ return hidp_send_message(session, session->ctrl_sock,
+ &session->ctrl_transmit, hdr, data, size);
+}
+
+static int hidp_send_intr_message(struct hidp_session *session,
+ unsigned char hdr, const unsigned char *data,
+ int size)
+{
+ return hidp_send_message(session, session->intr_sock,
+ &session->intr_transmit, hdr, data, size);
+}
+
+static int hidp_input_event(struct input_dev *dev, unsigned int type,
+ unsigned int code, int value)
+{
+ struct hidp_session *session = input_get_drvdata(dev);
unsigned char newleds;
+ unsigned char hdr, data[2];
- BT_DBG("input %p type %d code %d value %d", dev, type, code, value);
+ BT_DBG("session %p type %d code %d value %d",
+ session, type, code, value);
if (type != EV_LED)
return -1;
@@ -141,27 +161,18 @@ static int hidp_input_event(struct input_dev *dev, unsigned int type, unsigned i
(!!test_bit(LED_COMPOSE, dev->led) << 3) |
(!!test_bit(LED_SCROLLL, dev->led) << 2) |
(!!test_bit(LED_CAPSL, dev->led) << 1) |
- (!!test_bit(LED_NUML, dev->led));
+ (!!test_bit(LED_NUML, dev->led) << 0);
if (session->leds == newleds)
return 0;
session->leds = newleds;
- if (!(skb = alloc_skb(3, GFP_ATOMIC))) {
- BT_ERR("Can't allocate memory for new frame");
- return -ENOMEM;
- }
+ hdr = HIDP_TRANS_DATA | HIDP_DATA_RTYPE_OUPUT;
+ data[0] = 0x01;
+ data[1] = newleds;
- *skb_put(skb, 1) = HIDP_TRANS_DATA | HIDP_DATA_RTYPE_OUPUT;
- *skb_put(skb, 1) = 0x01;
- *skb_put(skb, 1) = newleds;
-
- skb_queue_tail(&session->intr_transmit, skb);
-
- hidp_schedule(session);
-
- return 0;
+ return hidp_send_intr_message(session, hdr, data, 2);
}
static void hidp_input_report(struct hidp_session *session, struct sk_buff *skb)
@@ -219,72 +230,241 @@ static void hidp_input_report(struct hidp_session *session, struct sk_buff *skb)
input_sync(dev);
}
-static void hidp_idle_timeout(unsigned long arg)
+static int hidp_get_raw_report(struct hid_device *hid,
+ unsigned char report_number,
+ unsigned char *data, size_t count,
+ unsigned char report_type)
{
- struct hidp_session *session = (struct hidp_session *) arg;
+ struct hidp_session *session = hid->driver_data;
+ struct sk_buff *skb;
+ size_t len;
+ int numbered_reports = hid->report_enum[report_type].numbered;
+ int ret;
- atomic_inc(&session->terminate);
- hidp_schedule(session);
-}
+ if (atomic_read(&session->terminate))
+ return -EIO;
-static inline void hidp_set_timer(struct hidp_session *session)
-{
- if (session->idle_to > 0)
- mod_timer(&session->timer, jiffies + HZ * session->idle_to);
-}
+ switch (report_type) {
+ case HID_FEATURE_REPORT:
+ report_type = HIDP_TRANS_GET_REPORT | HIDP_DATA_RTYPE_FEATURE;
+ break;
+ case HID_INPUT_REPORT:
+ report_type = HIDP_TRANS_GET_REPORT | HIDP_DATA_RTYPE_INPUT;
+ break;
+ case HID_OUTPUT_REPORT:
+ report_type = HIDP_TRANS_GET_REPORT | HIDP_DATA_RTYPE_OUPUT;
+ break;
+ default:
+ return -EINVAL;
+ }
-static inline void hidp_del_timer(struct hidp_session *session)
-{
- if (session->idle_to > 0)
- del_timer(&session->timer);
+ if (mutex_lock_interruptible(&session->report_mutex))
+ return -ERESTARTSYS;
+
+ /* Set up our wait, and send the report request to the device. */
+ session->waiting_report_type = report_type & HIDP_DATA_RTYPE_MASK;
+ session->waiting_report_number = numbered_reports ? report_number : -1;
+ set_bit(HIDP_WAITING_FOR_RETURN, &session->flags);
+ data[0] = report_number;
+ ret = hidp_send_ctrl_message(session, report_type, data, 1);
+ if (ret < 0)
+ goto err;
+
+ /* Wait for the return of the report. The returned report
+ gets put in session->report_return. */
+ while (test_bit(HIDP_WAITING_FOR_RETURN, &session->flags) &&
+ !atomic_read(&session->terminate)) {
+ int res;
+
+ res = wait_event_interruptible_timeout(session->report_queue,
+ !test_bit(HIDP_WAITING_FOR_RETURN, &session->flags)
+ || atomic_read(&session->terminate),
+ 5*HZ);
+ if (res == 0) {
+ /* timeout */
+ ret = -EIO;
+ goto err;
+ }
+ if (res < 0) {
+ /* signal */
+ ret = -ERESTARTSYS;
+ goto err;
+ }
+ }
+
+ skb = session->report_return;
+ if (skb) {
+ len = skb->len < count ? skb->len : count;
+ memcpy(data, skb->data, len);
+
+ kfree_skb(skb);
+ session->report_return = NULL;
+ } else {
+ /* Device returned a HANDSHAKE, indicating protocol error. */
+ len = -EIO;
+ }
+
+ clear_bit(HIDP_WAITING_FOR_RETURN, &session->flags);
+ mutex_unlock(&session->report_mutex);
+
+ return len;
+
+err:
+ clear_bit(HIDP_WAITING_FOR_RETURN, &session->flags);
+ mutex_unlock(&session->report_mutex);
+ return ret;
}
-static int __hidp_send_ctrl_message(struct hidp_session *session,
- unsigned char hdr, unsigned char *data, int size)
+static int hidp_set_raw_report(struct hid_device *hid, unsigned char reportnum,
+ unsigned char *data, size_t count,
+ unsigned char report_type)
{
- struct sk_buff *skb;
+ struct hidp_session *session = hid->driver_data;
+ int ret;
- BT_DBG("session %p data %p size %d", session, data, size);
+ switch (report_type) {
+ case HID_FEATURE_REPORT:
+ report_type = HIDP_TRANS_SET_REPORT | HIDP_DATA_RTYPE_FEATURE;
+ break;
+ case HID_INPUT_REPORT:
+ report_type = HIDP_TRANS_SET_REPORT | HIDP_DATA_RTYPE_INPUT;
+ break;
+ case HID_OUTPUT_REPORT:
+ report_type = HIDP_TRANS_SET_REPORT | HIDP_DATA_RTYPE_OUPUT;
+ break;
+ default:
+ return -EINVAL;
+ }
- if (!(skb = alloc_skb(size + 1, GFP_ATOMIC))) {
- BT_ERR("Can't allocate memory for new frame");
- return -ENOMEM;
+ if (mutex_lock_interruptible(&session->report_mutex))
+ return -ERESTARTSYS;
+
+ /* Set up our wait, and send the report request to the device. */
+ data[0] = reportnum;
+ set_bit(HIDP_WAITING_FOR_SEND_ACK, &session->flags);
+ ret = hidp_send_ctrl_message(session, report_type, data, count);
+ if (ret < 0)
+ goto err;
+
+ /* Wait for the ACK from the device. */
+ while (test_bit(HIDP_WAITING_FOR_SEND_ACK, &session->flags) &&
+ !atomic_read(&session->terminate)) {
+ int res;
+
+ res = wait_event_interruptible_timeout(session->report_queue,
+ !test_bit(HIDP_WAITING_FOR_SEND_ACK, &session->flags)
+ || atomic_read(&session->terminate),
+ 10*HZ);
+ if (res == 0) {
+ /* timeout */
+ ret = -EIO;
+ goto err;
+ }
+ if (res < 0) {
+ /* signal */
+ ret = -ERESTARTSYS;
+ goto err;
+ }
}
- *skb_put(skb, 1) = hdr;
- if (data && size > 0)
- memcpy(skb_put(skb, size), data, size);
+ if (!session->output_report_success) {
+ ret = -EIO;
+ goto err;
+ }
- skb_queue_tail(&session->ctrl_transmit, skb);
+ ret = count;
- return 0;
+err:
+ clear_bit(HIDP_WAITING_FOR_SEND_ACK, &session->flags);
+ mutex_unlock(&session->report_mutex);
+ return ret;
}
-static int inline hidp_send_ctrl_message(struct hidp_session *session,
- unsigned char hdr, unsigned char *data, int size)
+static int hidp_output_report(struct hid_device *hid, __u8 *data, size_t count)
{
- int err;
+ struct hidp_session *session = hid->driver_data;
+
+ return hidp_send_intr_message(session,
+ HIDP_TRANS_DATA | HIDP_DATA_RTYPE_OUPUT,
+ data, count);
+}
+
+static int hidp_raw_request(struct hid_device *hid, unsigned char reportnum,
+ __u8 *buf, size_t len, unsigned char rtype,
+ int reqtype)
+{
+ switch (reqtype) {
+ case HID_REQ_GET_REPORT:
+ return hidp_get_raw_report(hid, reportnum, buf, len, rtype);
+ case HID_REQ_SET_REPORT:
+ return hidp_set_raw_report(hid, reportnum, buf, len, rtype);
+ default:
+ return -EIO;
+ }
+}
- err = __hidp_send_ctrl_message(session, hdr, data, size);
+static void hidp_idle_timeout(struct timer_list *t)
+{
+ struct hidp_session *session = timer_container_of(session, t, timer);
+
+ /* The HIDP user-space API only contains calls to add and remove
+ * devices. There is no way to forward events of any kind. Therefore,
+ * we have to forcefully disconnect a device on idle-timeouts. This is
+ * unfortunate and weird API design, but it is spec-compliant and
+ * required for backwards-compatibility. Hence, on idle-timeout, we
+ * signal driver-detach events, so poll() will be woken up with an
+ * error-condition on both sockets.
+ */
+
+ session->intr_sock->sk->sk_err = EUNATCH;
+ session->ctrl_sock->sk->sk_err = EUNATCH;
+ wake_up_interruptible(sk_sleep(session->intr_sock->sk));
+ wake_up_interruptible(sk_sleep(session->ctrl_sock->sk));
+
+ hidp_session_terminate(session);
+}
- hidp_schedule(session);
+static void hidp_set_timer(struct hidp_session *session)
+{
+ if (session->idle_to > 0)
+ mod_timer(&session->timer, jiffies + HZ * session->idle_to);
+}
- return err;
+static void hidp_del_timer(struct hidp_session *session)
+{
+ if (session->idle_to > 0)
+ timer_delete_sync(&session->timer);
}
-static inline void hidp_process_handshake(struct hidp_session *session, unsigned char param)
+static void hidp_process_report(struct hidp_session *session, int type,
+ const u8 *data, unsigned int len, int intr)
+{
+ if (len > HID_MAX_BUFFER_SIZE)
+ len = HID_MAX_BUFFER_SIZE;
+
+ memcpy(session->input_buf, data, len);
+ hid_input_report(session->hid, type, session->input_buf, len, intr);
+}
+
+static void hidp_process_handshake(struct hidp_session *session,
+ unsigned char param)
{
BT_DBG("session %p param 0x%02x", session, param);
+ session->output_report_success = 0; /* default condition */
switch (param) {
case HIDP_HSHK_SUCCESSFUL:
/* FIXME: Call into SET_ GET_ handlers here */
+ session->output_report_success = 1;
break;
case HIDP_HSHK_NOT_READY:
case HIDP_HSHK_ERR_INVALID_REPORT_ID:
case HIDP_HSHK_ERR_UNSUPPORTED_REQUEST:
case HIDP_HSHK_ERR_INVALID_PARAMETER:
+ if (test_and_clear_bit(HIDP_WAITING_FOR_RETURN, &session->flags))
+ wake_up_interruptible(&session->report_queue);
+
/* FIXME: Call into SET_ GET_ handlers here */
break;
@@ -293,52 +473,42 @@ static inline void hidp_process_handshake(struct hidp_session *session, unsigned
case HIDP_HSHK_ERR_FATAL:
/* Device requests a reboot, as this is the only way this error
- * can be recovered. */
- __hidp_send_ctrl_message(session,
+ * can be recovered. */
+ hidp_send_ctrl_message(session,
HIDP_TRANS_HID_CONTROL | HIDP_CTRL_SOFT_RESET, NULL, 0);
break;
default:
- __hidp_send_ctrl_message(session,
+ hidp_send_ctrl_message(session,
HIDP_TRANS_HANDSHAKE | HIDP_HSHK_ERR_INVALID_PARAMETER, NULL, 0);
break;
}
+
+ /* Wake up the waiting thread. */
+ if (test_and_clear_bit(HIDP_WAITING_FOR_SEND_ACK, &session->flags))
+ wake_up_interruptible(&session->report_queue);
}
-static inline void hidp_process_hid_control(struct hidp_session *session, unsigned char param)
+static void hidp_process_hid_control(struct hidp_session *session,
+ unsigned char param)
{
BT_DBG("session %p param 0x%02x", session, param);
- switch (param) {
- case HIDP_CTRL_NOP:
- break;
-
- case HIDP_CTRL_VIRTUAL_CABLE_UNPLUG:
+ if (param == HIDP_CTRL_VIRTUAL_CABLE_UNPLUG) {
/* Flush the transmit queues */
skb_queue_purge(&session->ctrl_transmit);
skb_queue_purge(&session->intr_transmit);
- /* Kill session thread */
- atomic_inc(&session->terminate);
- break;
-
- case HIDP_CTRL_HARD_RESET:
- case HIDP_CTRL_SOFT_RESET:
- case HIDP_CTRL_SUSPEND:
- case HIDP_CTRL_EXIT_SUSPEND:
- /* FIXME: We have to parse these and return no error */
- break;
-
- default:
- __hidp_send_ctrl_message(session,
- HIDP_TRANS_HANDSHAKE | HIDP_HSHK_ERR_INVALID_PARAMETER, NULL, 0);
- break;
+ hidp_session_terminate(session);
}
}
-static inline void hidp_process_data(struct hidp_session *session, struct sk_buff *skb, unsigned char param)
+/* Returns true if the passed-in skb should be freed by the caller. */
+static int hidp_process_data(struct hidp_session *session, struct sk_buff *skb,
+ unsigned char param)
{
- BT_DBG("session %p skb %p len %d param 0x%02x", session, skb, skb->len, param);
+ int done_with_skb = 1;
+ BT_DBG("session %p skb %p len %u param 0x%02x", session, skb, skb->len, param);
switch (param) {
case HIDP_DATA_RTYPE_INPUT:
@@ -346,6 +516,10 @@ static inline void hidp_process_data(struct hidp_session *session, struct sk_buf
if (session->input)
hidp_input_report(session, skb);
+
+ if (session->hid)
+ hidp_process_report(session, HID_INPUT_REPORT,
+ skb->data, skb->len, 0);
break;
case HIDP_DATA_RTYPE_OTHER:
@@ -354,16 +528,32 @@ static inline void hidp_process_data(struct hidp_session *session, struct sk_buf
break;
default:
- __hidp_send_ctrl_message(session,
+ hidp_send_ctrl_message(session,
HIDP_TRANS_HANDSHAKE | HIDP_HSHK_ERR_INVALID_PARAMETER, NULL, 0);
}
+
+ if (test_bit(HIDP_WAITING_FOR_RETURN, &session->flags) &&
+ param == session->waiting_report_type) {
+ if (session->waiting_report_number < 0 ||
+ session->waiting_report_number == skb->data[0]) {
+ /* hidp_get_raw_report() is waiting on this report. */
+ session->report_return = skb;
+ done_with_skb = 0;
+ clear_bit(HIDP_WAITING_FOR_RETURN, &session->flags);
+ wake_up_interruptible(&session->report_queue);
+ }
+ }
+
+ return done_with_skb;
}
-static inline void hidp_recv_ctrl_frame(struct hidp_session *session, struct sk_buff *skb)
+static void hidp_recv_ctrl_frame(struct hidp_session *session,
+ struct sk_buff *skb)
{
unsigned char hdr, type, param;
+ int free_skb = 1;
- BT_DBG("session %p skb %p len %d", session, skb, skb->len);
+ BT_DBG("session %p skb %p len %u", session, skb, skb->len);
hdr = skb->data[0];
skb_pull(skb, 1);
@@ -381,31 +571,40 @@ static inline void hidp_recv_ctrl_frame(struct hidp_session *session, struct sk_
break;
case HIDP_TRANS_DATA:
- hidp_process_data(session, skb, param);
+ free_skb = hidp_process_data(session, skb, param);
break;
default:
- __hidp_send_ctrl_message(session,
+ hidp_send_ctrl_message(session,
HIDP_TRANS_HANDSHAKE | HIDP_HSHK_ERR_UNSUPPORTED_REQUEST, NULL, 0);
break;
}
- kfree_skb(skb);
+ if (free_skb)
+ kfree_skb(skb);
}
-static inline void hidp_recv_intr_frame(struct hidp_session *session, struct sk_buff *skb)
+static void hidp_recv_intr_frame(struct hidp_session *session,
+ struct sk_buff *skb)
{
unsigned char hdr;
- BT_DBG("session %p skb %p len %d", session, skb, skb->len);
+ BT_DBG("session %p skb %p len %u", session, skb, skb->len);
hdr = skb->data[0];
skb_pull(skb, 1);
if (hdr == (HIDP_TRANS_DATA | HIDP_DATA_RTYPE_INPUT)) {
hidp_set_timer(session);
+
if (session->input)
hidp_input_report(session, skb);
+
+ if (session->hid) {
+ hidp_process_report(session, HID_INPUT_REPORT,
+ skb->data, skb->len, 1);
+ BT_DBG("report len %d", skb->len);
+ }
} else {
BT_DBG("Unsupported protocol header 0x%02x", hdr);
}
@@ -428,306 +627,805 @@ static int hidp_send_frame(struct socket *sock, unsigned char *data, int len)
return kernel_sendmsg(sock, &msg, &iv, 1, len);
}
-static void hidp_process_transmit(struct hidp_session *session)
+/* dequeue message from @transmit and send via @sock */
+static void hidp_process_transmit(struct hidp_session *session,
+ struct sk_buff_head *transmit,
+ struct socket *sock)
{
struct sk_buff *skb;
+ int ret;
BT_DBG("session %p", session);
- while ((skb = skb_dequeue(&session->ctrl_transmit))) {
- if (hidp_send_frame(session->ctrl_sock, skb->data, skb->len) < 0) {
- skb_queue_head(&session->ctrl_transmit, skb);
+ while ((skb = skb_dequeue(transmit))) {
+ ret = hidp_send_frame(sock, skb->data, skb->len);
+ if (ret == -EAGAIN) {
+ skb_queue_head(transmit, skb);
+ break;
+ } else if (ret < 0) {
+ hidp_session_terminate(session);
+ kfree_skb(skb);
break;
}
hidp_set_timer(session);
kfree_skb(skb);
}
+}
- while ((skb = skb_dequeue(&session->intr_transmit))) {
- if (hidp_send_frame(session->intr_sock, skb->data, skb->len) < 0) {
- skb_queue_head(&session->intr_transmit, skb);
- break;
- }
+static int hidp_setup_input(struct hidp_session *session,
+ const struct hidp_connadd_req *req)
+{
+ struct input_dev *input;
+ int i;
- hidp_set_timer(session);
- kfree_skb(skb);
+ input = input_allocate_device();
+ if (!input)
+ return -ENOMEM;
+
+ session->input = input;
+
+ input_set_drvdata(input, session);
+
+ input->name = "Bluetooth HID Boot Protocol Device";
+
+ input->id.bustype = BUS_BLUETOOTH;
+ input->id.vendor = req->vendor;
+ input->id.product = req->product;
+ input->id.version = req->version;
+
+ if (req->subclass & 0x40) {
+ set_bit(EV_KEY, input->evbit);
+ set_bit(EV_LED, input->evbit);
+ set_bit(EV_REP, input->evbit);
+
+ set_bit(LED_NUML, input->ledbit);
+ set_bit(LED_CAPSL, input->ledbit);
+ set_bit(LED_SCROLLL, input->ledbit);
+ set_bit(LED_COMPOSE, input->ledbit);
+ set_bit(LED_KANA, input->ledbit);
+
+ for (i = 0; i < sizeof(hidp_keycode); i++)
+ set_bit(hidp_keycode[i], input->keybit);
+ clear_bit(0, input->keybit);
+ }
+
+ if (req->subclass & 0x80) {
+ input->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_REL);
+ input->keybit[BIT_WORD(BTN_MOUSE)] = BIT_MASK(BTN_LEFT) |
+ BIT_MASK(BTN_RIGHT) | BIT_MASK(BTN_MIDDLE);
+ input->relbit[0] = BIT_MASK(REL_X) | BIT_MASK(REL_Y);
+ input->keybit[BIT_WORD(BTN_MOUSE)] |= BIT_MASK(BTN_SIDE) |
+ BIT_MASK(BTN_EXTRA);
+ input->relbit[0] |= BIT_MASK(REL_WHEEL);
}
+
+ input->dev.parent = &session->conn->hcon->dev;
+
+ input->event = hidp_input_event;
+
+ return 0;
}
-static int hidp_session(void *arg)
+static int hidp_open(struct hid_device *hid)
{
- struct hidp_session *session = arg;
- struct sock *ctrl_sk = session->ctrl_sock->sk;
- struct sock *intr_sk = session->intr_sock->sk;
- struct sk_buff *skb;
- int vendor = 0x0000, product = 0x0000;
- wait_queue_t ctrl_wait, intr_wait;
+ return 0;
+}
- BT_DBG("session %p", session);
+static void hidp_close(struct hid_device *hid)
+{
+}
- if (session->input) {
- vendor = session->input->id.vendor;
- product = session->input->id.product;
- }
+static int hidp_parse(struct hid_device *hid)
+{
+ struct hidp_session *session = hid->driver_data;
- daemonize("khidpd_%04x%04x", vendor, product);
- set_user_nice(current, -15);
- current->flags |= PF_NOFREEZE;
+ return hid_parse_report(session->hid, session->rd_data,
+ session->rd_size);
+}
- init_waitqueue_entry(&ctrl_wait, current);
- init_waitqueue_entry(&intr_wait, current);
- add_wait_queue(ctrl_sk->sk_sleep, &ctrl_wait);
- add_wait_queue(intr_sk->sk_sleep, &intr_wait);
- while (!atomic_read(&session->terminate)) {
- set_current_state(TASK_INTERRUPTIBLE);
+static int hidp_start(struct hid_device *hid)
+{
+ return 0;
+}
- if (ctrl_sk->sk_state != BT_CONNECTED || intr_sk->sk_state != BT_CONNECTED)
- break;
+static void hidp_stop(struct hid_device *hid)
+{
+ struct hidp_session *session = hid->driver_data;
- while ((skb = skb_dequeue(&ctrl_sk->sk_receive_queue))) {
- skb_orphan(skb);
- hidp_recv_ctrl_frame(session, skb);
- }
+ skb_queue_purge(&session->ctrl_transmit);
+ skb_queue_purge(&session->intr_transmit);
- while ((skb = skb_dequeue(&intr_sk->sk_receive_queue))) {
- skb_orphan(skb);
- hidp_recv_intr_frame(session, skb);
- }
+ hid->claimed = 0;
+}
+
+static const struct hid_ll_driver hidp_hid_driver = {
+ .parse = hidp_parse,
+ .start = hidp_start,
+ .stop = hidp_stop,
+ .open = hidp_open,
+ .close = hidp_close,
+ .raw_request = hidp_raw_request,
+ .output_report = hidp_output_report,
+};
+
+/* This function sets up the hid device. It does not add it
+ to the HID system. That is done in hidp_add_connection(). */
+static int hidp_setup_hid(struct hidp_session *session,
+ const struct hidp_connadd_req *req)
+{
+ struct hid_device *hid;
+ int err;
+
+ session->rd_data = memdup_user(req->rd_data, req->rd_size);
+ if (IS_ERR(session->rd_data))
+ return PTR_ERR(session->rd_data);
- hidp_process_transmit(session);
+ session->rd_size = req->rd_size;
- schedule();
+ hid = hid_allocate_device();
+ if (IS_ERR(hid)) {
+ err = PTR_ERR(hid);
+ goto fault;
}
- set_current_state(TASK_RUNNING);
- remove_wait_queue(intr_sk->sk_sleep, &intr_wait);
- remove_wait_queue(ctrl_sk->sk_sleep, &ctrl_wait);
- down_write(&hidp_session_sem);
+ session->hid = hid;
- hidp_del_timer(session);
+ hid->driver_data = session;
- fput(session->intr_sock->file);
+ hid->bus = BUS_BLUETOOTH;
+ hid->vendor = req->vendor;
+ hid->product = req->product;
+ hid->version = req->version;
+ hid->country = req->country;
- wait_event_timeout(*(ctrl_sk->sk_sleep),
- (ctrl_sk->sk_state == BT_CLOSED), msecs_to_jiffies(500));
+ strscpy(hid->name, req->name, sizeof(hid->name));
- fput(session->ctrl_sock->file);
+ snprintf(hid->phys, sizeof(hid->phys), "%pMR",
+ &l2cap_pi(session->ctrl_sock->sk)->chan->src);
- __hidp_unlink_session(session);
+ /* NOTE: Some device modules depend on the dst address being stored in
+ * uniq. Please be aware of this before making changes to this behavior.
+ */
+ snprintf(hid->uniq, sizeof(hid->uniq), "%pMR",
+ &l2cap_pi(session->ctrl_sock->sk)->chan->dst);
- if (session->input) {
- input_unregister_device(session->input);
- session->input = NULL;
+ hid->dev.parent = &session->conn->hcon->dev;
+ hid->ll_driver = &hidp_hid_driver;
+
+ /* True if device is blocked in drivers/hid/hid-quirks.c */
+ if (hid_ignore(hid)) {
+ hid_destroy_device(session->hid);
+ session->hid = NULL;
+ return -ENODEV;
}
- up_write(&hidp_session_sem);
+ return 0;
+
+fault:
+ kfree(session->rd_data);
+ session->rd_data = NULL;
+
+ return err;
+}
+
+/* initialize session devices */
+static int hidp_session_dev_init(struct hidp_session *session,
+ const struct hidp_connadd_req *req)
+{
+ int ret;
+
+ if (req->rd_size > 0) {
+ ret = hidp_setup_hid(session, req);
+ if (ret && ret != -ENODEV)
+ return ret;
+ }
+
+ if (!session->hid) {
+ ret = hidp_setup_input(session, req);
+ if (ret < 0)
+ return ret;
+ }
- kfree(session);
return 0;
}
-static struct device *hidp_get_device(struct hidp_session *session)
+/* destroy session devices */
+static void hidp_session_dev_destroy(struct hidp_session *session)
{
- bdaddr_t *src = &bt_sk(session->ctrl_sock->sk)->src;
- bdaddr_t *dst = &bt_sk(session->ctrl_sock->sk)->dst;
- struct hci_dev *hdev;
- struct hci_conn *conn;
+ if (session->hid)
+ put_device(&session->hid->dev);
+ else if (session->input)
+ input_put_device(session->input);
- hdev = hci_get_route(dst, src);
- if (!hdev)
- return NULL;
+ kfree(session->rd_data);
+ session->rd_data = NULL;
+}
- conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, dst);
+/* add HID/input devices to their underlying bus systems */
+static int hidp_session_dev_add(struct hidp_session *session)
+{
+ int ret;
+
+ /* Both HID and input systems drop a ref-count when unregistering the
+ * device but they don't take a ref-count when registering them. Work
+ * around this by explicitly taking a refcount during registration
+ * which is dropped automatically by unregistering the devices. */
+
+ if (session->hid) {
+ ret = hid_add_device(session->hid);
+ if (ret)
+ return ret;
+ get_device(&session->hid->dev);
+ } else if (session->input) {
+ ret = input_register_device(session->input);
+ if (ret)
+ return ret;
+ input_get_device(session->input);
+ }
- hci_dev_put(hdev);
+ return 0;
+}
- return conn ? &conn->dev : NULL;
+/* remove HID/input devices from their bus systems */
+static void hidp_session_dev_del(struct hidp_session *session)
+{
+ if (session->hid)
+ hid_destroy_device(session->hid);
+ else if (session->input)
+ input_unregister_device(session->input);
}
-static inline void hidp_setup_input(struct hidp_session *session, struct hidp_connadd_req *req)
+/*
+ * Asynchronous device registration
+ * HID device drivers might want to perform I/O during initialization to
+ * detect device types. Therefore, call device registration in a separate
+ * worker so the HIDP thread can schedule I/O operations.
+ * Note that this must be called after the worker thread was initialized
+ * successfully. This will then add the devices and increase session state
+ * on success, otherwise it will terminate the session thread.
+ */
+static void hidp_session_dev_work(struct work_struct *work)
{
- struct input_dev *input = session->input;
- int i;
+ struct hidp_session *session = container_of(work,
+ struct hidp_session,
+ dev_init);
+ int ret;
+
+ ret = hidp_session_dev_add(session);
+ if (!ret)
+ atomic_inc(&session->state);
+ else
+ hidp_session_terminate(session);
+}
- input->private = session;
+/*
+ * Create new session object
+ * Allocate session object, initialize static fields, copy input data into the
+ * object and take a reference to all sub-objects.
+ * This returns 0 on success and puts a pointer to the new session object in
+ * \out. Otherwise, an error code is returned.
+ * The new session object has an initial ref-count of 1.
+ */
+static int hidp_session_new(struct hidp_session **out, const bdaddr_t *bdaddr,
+ struct socket *ctrl_sock,
+ struct socket *intr_sock,
+ const struct hidp_connadd_req *req,
+ struct l2cap_conn *conn)
+{
+ struct hidp_session *session;
+ int ret;
+ struct bt_sock *ctrl, *intr;
- input->name = "Bluetooth HID Boot Protocol Device";
+ ctrl = bt_sk(ctrl_sock->sk);
+ intr = bt_sk(intr_sock->sk);
- input->id.bustype = BUS_BLUETOOTH;
- input->id.vendor = req->vendor;
- input->id.product = req->product;
- input->id.version = req->version;
+ session = kzalloc(sizeof(*session), GFP_KERNEL);
+ if (!session)
+ return -ENOMEM;
- if (req->subclass & 0x40) {
- set_bit(EV_KEY, input->evbit);
- set_bit(EV_LED, input->evbit);
- set_bit(EV_REP, input->evbit);
+ /* object and runtime management */
+ kref_init(&session->ref);
+ atomic_set(&session->state, HIDP_SESSION_IDLING);
+ init_waitqueue_head(&session->state_queue);
+ session->flags = req->flags & BIT(HIDP_BLUETOOTH_VENDOR_ID);
+
+ /* connection management */
+ bacpy(&session->bdaddr, bdaddr);
+ session->conn = l2cap_conn_get(conn);
+ session->user.probe = hidp_session_probe;
+ session->user.remove = hidp_session_remove;
+ INIT_LIST_HEAD(&session->user.list);
+ session->ctrl_sock = ctrl_sock;
+ session->intr_sock = intr_sock;
+ skb_queue_head_init(&session->ctrl_transmit);
+ skb_queue_head_init(&session->intr_transmit);
+ session->ctrl_mtu = min_t(uint, l2cap_pi(ctrl)->chan->omtu,
+ l2cap_pi(ctrl)->chan->imtu);
+ session->intr_mtu = min_t(uint, l2cap_pi(intr)->chan->omtu,
+ l2cap_pi(intr)->chan->imtu);
+ session->idle_to = req->idle_to;
- set_bit(LED_NUML, input->ledbit);
- set_bit(LED_CAPSL, input->ledbit);
- set_bit(LED_SCROLLL, input->ledbit);
- set_bit(LED_COMPOSE, input->ledbit);
- set_bit(LED_KANA, input->ledbit);
+ /* device management */
+ INIT_WORK(&session->dev_init, hidp_session_dev_work);
+ timer_setup(&session->timer, hidp_idle_timeout, 0);
- for (i = 0; i < sizeof(hidp_keycode); i++)
- set_bit(hidp_keycode[i], input->keybit);
- clear_bit(0, input->keybit);
+ /* session data */
+ mutex_init(&session->report_mutex);
+ init_waitqueue_head(&session->report_queue);
+
+ ret = hidp_session_dev_init(session, req);
+ if (ret)
+ goto err_free;
+
+ get_file(session->intr_sock->file);
+ get_file(session->ctrl_sock->file);
+ *out = session;
+ return 0;
+
+err_free:
+ l2cap_conn_put(session->conn);
+ kfree(session);
+ return ret;
+}
+
+/* increase ref-count of the given session by one */
+static void hidp_session_get(struct hidp_session *session)
+{
+ kref_get(&session->ref);
+}
+
+/* release callback */
+static void session_free(struct kref *ref)
+{
+ struct hidp_session *session = container_of(ref, struct hidp_session,
+ ref);
+
+ hidp_session_dev_destroy(session);
+ skb_queue_purge(&session->ctrl_transmit);
+ skb_queue_purge(&session->intr_transmit);
+ fput(session->intr_sock->file);
+ fput(session->ctrl_sock->file);
+ l2cap_conn_put(session->conn);
+ kfree(session);
+}
+
+/* decrease ref-count of the given session by one */
+static void hidp_session_put(struct hidp_session *session)
+{
+ kref_put(&session->ref, session_free);
+}
+
+/*
+ * Search the list of active sessions for a session with target address
+ * \bdaddr. You must hold at least a read-lock on \hidp_session_sem. As long as
+ * you do not release this lock, the session objects cannot vanish and you can
+ * safely take a reference to the session yourself.
+ */
+static struct hidp_session *__hidp_session_find(const bdaddr_t *bdaddr)
+{
+ struct hidp_session *session;
+
+ list_for_each_entry(session, &hidp_session_list, list) {
+ if (!bacmp(bdaddr, &session->bdaddr))
+ return session;
}
- if (req->subclass & 0x80) {
- input->evbit[0] = BIT(EV_KEY) | BIT(EV_REL);
- input->keybit[LONG(BTN_MOUSE)] = BIT(BTN_LEFT) | BIT(BTN_RIGHT) | BIT(BTN_MIDDLE);
- input->relbit[0] = BIT(REL_X) | BIT(REL_Y);
- input->keybit[LONG(BTN_MOUSE)] |= BIT(BTN_SIDE) | BIT(BTN_EXTRA);
- input->relbit[0] |= BIT(REL_WHEEL);
+ return NULL;
+}
+
+/*
+ * Same as __hidp_session_find() but no locks must be held. This also takes a
+ * reference of the returned session (if non-NULL) so you must drop this
+ * reference if you no longer use the object.
+ */
+static struct hidp_session *hidp_session_find(const bdaddr_t *bdaddr)
+{
+ struct hidp_session *session;
+
+ down_read(&hidp_session_sem);
+
+ session = __hidp_session_find(bdaddr);
+ if (session)
+ hidp_session_get(session);
+
+ up_read(&hidp_session_sem);
+
+ return session;
+}
+
+/*
+ * Start session synchronously
+ * This starts a session thread and waits until initialization
+ * is done or returns an error if it couldn't be started.
+ * If this returns 0 the session thread is up and running. You must call
+ * hipd_session_stop_sync() before deleting any runtime resources.
+ */
+static int hidp_session_start_sync(struct hidp_session *session)
+{
+ unsigned int vendor, product;
+
+ if (session->hid) {
+ vendor = session->hid->vendor;
+ product = session->hid->product;
+ } else if (session->input) {
+ vendor = session->input->id.vendor;
+ product = session->input->id.product;
+ } else {
+ vendor = 0x0000;
+ product = 0x0000;
}
- input->cdev.dev = hidp_get_device(session);
+ session->task = kthread_run(hidp_session_thread, session,
+ "khidpd_%04x%04x", vendor, product);
+ if (IS_ERR(session->task))
+ return PTR_ERR(session->task);
- input->event = hidp_input_event;
+ while (atomic_read(&session->state) <= HIDP_SESSION_IDLING)
+ wait_event(session->state_queue,
+ atomic_read(&session->state) > HIDP_SESSION_IDLING);
- input_register_device(input);
+ return 0;
}
-int hidp_add_connection(struct hidp_connadd_req *req, struct socket *ctrl_sock, struct socket *intr_sock)
+/*
+ * Terminate session thread
+ * Wake up session thread and notify it to stop. This is asynchronous and
+ * returns immediately. Call this whenever a runtime error occurs and you want
+ * the session to stop.
+ * Note: wake_up_interruptible() performs any necessary memory-barriers for us.
+ */
+static void hidp_session_terminate(struct hidp_session *session)
{
- struct hidp_session *session, *s;
- int err;
+ atomic_inc(&session->terminate);
+ /*
+ * See the comment preceding the call to wait_woken()
+ * in hidp_session_run().
+ */
+ wake_up_interruptible(&hidp_session_wq);
+}
- BT_DBG("");
+/*
+ * Probe HIDP session
+ * This is called from the l2cap_conn core when our l2cap_user object is bound
+ * to the hci-connection. We get the session via the \user object and can now
+ * start the session thread, link it into the global session list and
+ * schedule HID/input device registration.
+ * The global session-list owns its own reference to the session object so you
+ * can drop your own reference after registering the l2cap_user object.
+ */
+static int hidp_session_probe(struct l2cap_conn *conn,
+ struct l2cap_user *user)
+{
+ struct hidp_session *session = container_of(user,
+ struct hidp_session,
+ user);
+ struct hidp_session *s;
+ int ret;
- if (bacmp(&bt_sk(ctrl_sock->sk)->src, &bt_sk(intr_sock->sk)->src) ||
- bacmp(&bt_sk(ctrl_sock->sk)->dst, &bt_sk(intr_sock->sk)->dst))
- return -ENOTUNIQ;
+ down_write(&hidp_session_sem);
- session = kzalloc(sizeof(struct hidp_session), GFP_KERNEL);
- if (!session)
- return -ENOMEM;
+ /* check that no other session for this device exists */
+ s = __hidp_session_find(&session->bdaddr);
+ if (s) {
+ ret = -EEXIST;
+ goto out_unlock;
+ }
- session->input = input_allocate_device();
- if (!session->input) {
- kfree(session);
- return -ENOMEM;
+ if (session->input) {
+ ret = hidp_session_dev_add(session);
+ if (ret)
+ goto out_unlock;
}
+ ret = hidp_session_start_sync(session);
+ if (ret)
+ goto out_del;
+
+ /* HID device registration is async to allow I/O during probe */
+ if (session->input)
+ atomic_inc(&session->state);
+ else
+ schedule_work(&session->dev_init);
+
+ hidp_session_get(session);
+ list_add(&session->list, &hidp_session_list);
+ ret = 0;
+ goto out_unlock;
+
+out_del:
+ if (session->input)
+ hidp_session_dev_del(session);
+out_unlock:
+ up_write(&hidp_session_sem);
+ return ret;
+}
+
+/*
+ * Remove HIDP session
+ * Called from the l2cap_conn core when either we explicitly unregistered
+ * the l2cap_user object or if the underlying connection is shut down.
+ * We signal the hidp-session thread to shut down, unregister the HID/input
+ * devices and unlink the session from the global list.
+ * This drops the reference to the session that is owned by the global
+ * session-list.
+ * Note: We _must_ not synchronosly wait for the session-thread to shut down.
+ * This is, because the session-thread might be waiting for an HCI lock that is
+ * held while we are called. Therefore, we only unregister the devices and
+ * notify the session-thread to terminate. The thread itself owns a reference
+ * to the session object so it can safely shut down.
+ */
+static void hidp_session_remove(struct l2cap_conn *conn,
+ struct l2cap_user *user)
+{
+ struct hidp_session *session = container_of(user,
+ struct hidp_session,
+ user);
+
down_write(&hidp_session_sem);
- s = __hidp_get_session(&bt_sk(ctrl_sock->sk)->dst);
- if (s && s->state == BT_CONNECTED) {
- err = -EEXIST;
- goto failed;
- }
+ hidp_session_terminate(session);
- bacpy(&session->bdaddr, &bt_sk(ctrl_sock->sk)->dst);
+ cancel_work_sync(&session->dev_init);
+ if (session->input ||
+ atomic_read(&session->state) > HIDP_SESSION_PREPARING)
+ hidp_session_dev_del(session);
- session->ctrl_mtu = min_t(uint, l2cap_pi(ctrl_sock->sk)->omtu, l2cap_pi(ctrl_sock->sk)->imtu);
- session->intr_mtu = min_t(uint, l2cap_pi(intr_sock->sk)->omtu, l2cap_pi(intr_sock->sk)->imtu);
+ list_del(&session->list);
- BT_DBG("ctrl mtu %d intr mtu %d", session->ctrl_mtu, session->intr_mtu);
+ up_write(&hidp_session_sem);
- session->ctrl_sock = ctrl_sock;
- session->intr_sock = intr_sock;
- session->state = BT_CONNECTED;
+ hidp_session_put(session);
+}
- init_timer(&session->timer);
+/*
+ * Session Worker
+ * This performs the actual main-loop of the HIDP worker. We first check
+ * whether the underlying connection is still alive, then parse all pending
+ * messages and finally send all outstanding messages.
+ */
+static void hidp_session_run(struct hidp_session *session)
+{
+ struct sock *ctrl_sk = session->ctrl_sock->sk;
+ struct sock *intr_sk = session->intr_sock->sk;
+ struct sk_buff *skb;
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+
+ add_wait_queue(&hidp_session_wq, &wait);
+ for (;;) {
+ /*
+ * This thread can be woken up two ways:
+ * - You call hidp_session_terminate() which sets the
+ * session->terminate flag and wakes this thread up.
+ * - Via modifying the socket state of ctrl/intr_sock. This
+ * thread is woken up by ->sk_state_changed().
+ */
+
+ if (atomic_read(&session->terminate))
+ break;
- session->timer.function = hidp_idle_timeout;
- session->timer.data = (unsigned long) session;
+ if (ctrl_sk->sk_state != BT_CONNECTED ||
+ intr_sk->sk_state != BT_CONNECTED)
+ break;
- skb_queue_head_init(&session->ctrl_transmit);
- skb_queue_head_init(&session->intr_transmit);
+ /* parse incoming intr-skbs */
+ while ((skb = skb_dequeue(&intr_sk->sk_receive_queue))) {
+ skb_orphan(skb);
+ if (!skb_linearize(skb))
+ hidp_recv_intr_frame(session, skb);
+ else
+ kfree_skb(skb);
+ }
- session->flags = req->flags & (1 << HIDP_BLUETOOTH_VENDOR_ID);
- session->idle_to = req->idle_to;
+ /* send pending intr-skbs */
+ hidp_process_transmit(session, &session->intr_transmit,
+ session->intr_sock);
- if (session->input)
- hidp_setup_input(session, req);
+ /* parse incoming ctrl-skbs */
+ while ((skb = skb_dequeue(&ctrl_sk->sk_receive_queue))) {
+ skb_orphan(skb);
+ if (!skb_linearize(skb))
+ hidp_recv_ctrl_frame(session, skb);
+ else
+ kfree_skb(skb);
+ }
- __hidp_link_session(session);
+ /* send pending ctrl-skbs */
+ hidp_process_transmit(session, &session->ctrl_transmit,
+ session->ctrl_sock);
+ /*
+ * wait_woken() performs the necessary memory barriers
+ * for us; see the header comment for this primitive.
+ */
+ wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
+ }
+ remove_wait_queue(&hidp_session_wq, &wait);
+
+ atomic_inc(&session->terminate);
+}
+
+static int hidp_session_wake_function(wait_queue_entry_t *wait,
+ unsigned int mode,
+ int sync, void *key)
+{
+ wake_up_interruptible(&hidp_session_wq);
+ return false;
+}
+
+/*
+ * HIDP session thread
+ * This thread runs the I/O for a single HIDP session. Startup is synchronous
+ * which allows us to take references to ourself here instead of doing that in
+ * the caller.
+ * When we are ready to run we notify the caller and call hidp_session_run().
+ */
+static int hidp_session_thread(void *arg)
+{
+ struct hidp_session *session = arg;
+ DEFINE_WAIT_FUNC(ctrl_wait, hidp_session_wake_function);
+ DEFINE_WAIT_FUNC(intr_wait, hidp_session_wake_function);
+
+ BT_DBG("session %p", session);
+
+ /* initialize runtime environment */
+ hidp_session_get(session);
+ __module_get(THIS_MODULE);
+ set_user_nice(current, -15);
hidp_set_timer(session);
- err = kernel_thread(hidp_session, session, CLONE_KERNEL);
- if (err < 0)
- goto unlink;
+ add_wait_queue(sk_sleep(session->ctrl_sock->sk), &ctrl_wait);
+ add_wait_queue(sk_sleep(session->intr_sock->sk), &intr_wait);
+ /* This memory barrier is paired with wq_has_sleeper(). See
+ * sock_poll_wait() for more information why this is needed. */
+ smp_mb__before_atomic();
- if (session->input) {
- hidp_send_ctrl_message(session,
- HIDP_TRANS_SET_PROTOCOL | HIDP_PROTO_BOOT, NULL, 0);
- session->flags |= (1 << HIDP_BOOT_PROTOCOL_MODE);
+ /* notify synchronous startup that we're ready */
+ atomic_inc(&session->state);
+ wake_up(&session->state_queue);
- session->leds = 0xff;
- hidp_input_event(session->input, EV_LED, 0, 0);
- }
+ /* run session */
+ hidp_session_run(session);
- up_write(&hidp_session_sem);
+ /* cleanup runtime environment */
+ remove_wait_queue(sk_sleep(session->intr_sock->sk), &intr_wait);
+ remove_wait_queue(sk_sleep(session->ctrl_sock->sk), &ctrl_wait);
+ wake_up_interruptible(&session->report_queue);
+ hidp_del_timer(session);
+
+ /*
+ * If we stopped ourself due to any internal signal, we should try to
+ * unregister our own session here to avoid having it linger until the
+ * parent l2cap_conn dies or user-space cleans it up.
+ * This does not deadlock as we don't do any synchronous shutdown.
+ * Instead, this call has the same semantics as if user-space tried to
+ * delete the session.
+ */
+ l2cap_unregister_user(session->conn, &session->user);
+ hidp_session_put(session);
+
+ module_put_and_kthread_exit(0);
return 0;
+}
-unlink:
- hidp_del_timer(session);
+static int hidp_verify_sockets(struct socket *ctrl_sock,
+ struct socket *intr_sock)
+{
+ struct l2cap_chan *ctrl_chan, *intr_chan;
+ struct bt_sock *ctrl, *intr;
+ struct hidp_session *session;
- __hidp_unlink_session(session);
+ if (!l2cap_is_socket(ctrl_sock) || !l2cap_is_socket(intr_sock))
+ return -EINVAL;
- if (session->input) {
- input_unregister_device(session->input);
- session->input = NULL; /* don't try to free it here */
+ ctrl_chan = l2cap_pi(ctrl_sock->sk)->chan;
+ intr_chan = l2cap_pi(intr_sock->sk)->chan;
+
+ if (bacmp(&ctrl_chan->src, &intr_chan->src) ||
+ bacmp(&ctrl_chan->dst, &intr_chan->dst))
+ return -ENOTUNIQ;
+
+ ctrl = bt_sk(ctrl_sock->sk);
+ intr = bt_sk(intr_sock->sk);
+
+ if (ctrl->sk.sk_state != BT_CONNECTED ||
+ intr->sk.sk_state != BT_CONNECTED)
+ return -EBADFD;
+
+ /* early session check, we check again during session registration */
+ session = hidp_session_find(&ctrl_chan->dst);
+ if (session) {
+ hidp_session_put(session);
+ return -EEXIST;
}
-failed:
- up_write(&hidp_session_sem);
+ return 0;
+}
- kfree(session->input);
- kfree(session);
- return err;
+int hidp_connection_add(const struct hidp_connadd_req *req,
+ struct socket *ctrl_sock,
+ struct socket *intr_sock)
+{
+ u32 valid_flags = BIT(HIDP_VIRTUAL_CABLE_UNPLUG) |
+ BIT(HIDP_BOOT_PROTOCOL_MODE);
+ struct hidp_session *session;
+ struct l2cap_conn *conn;
+ struct l2cap_chan *chan;
+ int ret;
+
+ ret = hidp_verify_sockets(ctrl_sock, intr_sock);
+ if (ret)
+ return ret;
+
+ if (req->flags & ~valid_flags)
+ return -EINVAL;
+
+ chan = l2cap_pi(ctrl_sock->sk)->chan;
+ conn = NULL;
+ l2cap_chan_lock(chan);
+ if (chan->conn)
+ conn = l2cap_conn_get(chan->conn);
+ l2cap_chan_unlock(chan);
+
+ if (!conn)
+ return -EBADFD;
+
+ ret = hidp_session_new(&session, &chan->dst, ctrl_sock,
+ intr_sock, req, conn);
+ if (ret)
+ goto out_conn;
+
+ ret = l2cap_register_user(conn, &session->user);
+ if (ret)
+ goto out_session;
+
+ ret = 0;
+
+out_session:
+ hidp_session_put(session);
+out_conn:
+ l2cap_conn_put(conn);
+ return ret;
}
-int hidp_del_connection(struct hidp_conndel_req *req)
+int hidp_connection_del(struct hidp_conndel_req *req)
{
+ u32 valid_flags = BIT(HIDP_VIRTUAL_CABLE_UNPLUG);
struct hidp_session *session;
- int err = 0;
- BT_DBG("");
+ if (req->flags & ~valid_flags)
+ return -EINVAL;
- down_read(&hidp_session_sem);
+ session = hidp_session_find(&req->bdaddr);
+ if (!session)
+ return -ENOENT;
- session = __hidp_get_session(&req->bdaddr);
- if (session) {
- if (req->flags & (1 << HIDP_VIRTUAL_CABLE_UNPLUG)) {
- hidp_send_ctrl_message(session,
- HIDP_TRANS_HID_CONTROL | HIDP_CTRL_VIRTUAL_CABLE_UNPLUG, NULL, 0);
- } else {
- /* Flush the transmit queues */
- skb_queue_purge(&session->ctrl_transmit);
- skb_queue_purge(&session->intr_transmit);
-
- /* Kill session thread */
- atomic_inc(&session->terminate);
- hidp_schedule(session);
- }
- } else
- err = -ENOENT;
+ if (req->flags & BIT(HIDP_VIRTUAL_CABLE_UNPLUG))
+ hidp_send_ctrl_message(session,
+ HIDP_TRANS_HID_CONTROL |
+ HIDP_CTRL_VIRTUAL_CABLE_UNPLUG,
+ NULL, 0);
+ else
+ l2cap_unregister_user(session->conn, &session->user);
- up_read(&hidp_session_sem);
- return err;
+ hidp_session_put(session);
+
+ return 0;
}
int hidp_get_connlist(struct hidp_connlist_req *req)
{
- struct list_head *p;
+ struct hidp_session *session;
int err = 0, n = 0;
BT_DBG("");
down_read(&hidp_session_sem);
- list_for_each(p, &hidp_session_list) {
- struct hidp_session *session;
+ list_for_each_entry(session, &hidp_session_list, list) {
struct hidp_conninfo ci;
- session = list_entry(p, struct hidp_session, list);
-
- __hidp_copy_session(session, &ci);
+ hidp_copy_session(session, &ci);
if (copy_to_user(req->ci, &ci, sizeof(ci))) {
err = -EFAULT;
@@ -748,24 +1446,18 @@ int hidp_get_connlist(struct hidp_connlist_req *req)
int hidp_get_conninfo(struct hidp_conninfo *ci)
{
struct hidp_session *session;
- int err = 0;
- down_read(&hidp_session_sem);
-
- session = __hidp_get_session(&ci->bdaddr);
- if (session)
- __hidp_copy_session(session, ci);
- else
- err = -ENOENT;
+ session = hidp_session_find(&ci->bdaddr);
+ if (session) {
+ hidp_copy_session(session, ci);
+ hidp_session_put(session);
+ }
- up_read(&hidp_session_sem);
- return err;
+ return session ? 0 : -ENOENT;
}
static int __init hidp_init(void)
{
- l2cap_load();
-
BT_INFO("HIDP (Human Interface Emulation) ver %s", VERSION);
return hidp_init_sockets();
@@ -780,6 +1472,7 @@ module_init(hidp_init);
module_exit(hidp_exit);
MODULE_AUTHOR("Marcel Holtmann <marcel@holtmann.org>");
+MODULE_AUTHOR("David Herrmann <dh.herrmann@gmail.com>");
MODULE_DESCRIPTION("Bluetooth HIDP ver " VERSION);
MODULE_VERSION(VERSION);
MODULE_LICENSE("GPL");
diff --git a/net/bluetooth/hidp/hidp.h b/net/bluetooth/hidp/hidp.h
index c2775f587d2e..6ef88d0a1919 100644
--- a/net/bluetooth/hidp/hidp.h
+++ b/net/bluetooth/hidp/hidp.h
@@ -1,4 +1,4 @@
-/*
+/*
HIDP implementation for Linux Bluetooth stack (BlueZ).
Copyright (C) 2003-2004 Marcel Holtmann <marcel@holtmann.org>
@@ -10,13 +10,13 @@
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
- CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
- WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
- COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+ ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+ COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
SOFTWARE IS DISCLAIMED.
*/
@@ -24,7 +24,10 @@
#define __HIDP_H
#include <linux/types.h>
+#include <linux/hid.h>
+#include <linux/kref.h>
#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/l2cap.h>
/* HIDP header masks */
#define HIDP_HEADER_TRANS_MASK 0xf0
@@ -80,13 +83,15 @@
#define HIDP_VIRTUAL_CABLE_UNPLUG 0
#define HIDP_BOOT_PROTOCOL_MODE 1
#define HIDP_BLUETOOTH_VENDOR_ID 9
+#define HIDP_WAITING_FOR_RETURN 10
+#define HIDP_WAITING_FOR_SEND_ACK 11
struct hidp_connadd_req {
- int ctrl_sock; // Connected control socket
- int intr_sock; // Connteted interrupt socket
+ int ctrl_sock; /* Connected control socket */
+ int intr_sock; /* Connected interrupt socket */
__u16 parser;
__u16 rd_size;
- __u8 *rd_data;
+ __u8 __user *rd_data;
__u8 country;
__u8 subclass;
__u16 vendor;
@@ -117,51 +122,71 @@ struct hidp_connlist_req {
struct hidp_conninfo __user *ci;
};
-int hidp_add_connection(struct hidp_connadd_req *req, struct socket *ctrl_sock, struct socket *intr_sock);
-int hidp_del_connection(struct hidp_conndel_req *req);
+int hidp_connection_add(const struct hidp_connadd_req *req, struct socket *ctrl_sock, struct socket *intr_sock);
+int hidp_connection_del(struct hidp_conndel_req *req);
int hidp_get_connlist(struct hidp_connlist_req *req);
int hidp_get_conninfo(struct hidp_conninfo *ci);
+enum hidp_session_state {
+ HIDP_SESSION_IDLING,
+ HIDP_SESSION_PREPARING,
+ HIDP_SESSION_RUNNING,
+};
+
/* HIDP session defines */
struct hidp_session {
struct list_head list;
+ struct kref ref;
- struct socket *ctrl_sock;
- struct socket *intr_sock;
-
- bdaddr_t bdaddr;
-
- unsigned long state;
+ /* runtime management */
+ atomic_t state;
+ wait_queue_head_t state_queue;
+ atomic_t terminate;
+ struct task_struct *task;
unsigned long flags;
- unsigned long idle_to;
+ /* connection management */
+ bdaddr_t bdaddr;
+ struct l2cap_conn *conn;
+ struct l2cap_user user;
+ struct socket *ctrl_sock;
+ struct socket *intr_sock;
+ struct sk_buff_head ctrl_transmit;
+ struct sk_buff_head intr_transmit;
uint ctrl_mtu;
uint intr_mtu;
+ unsigned long idle_to;
- atomic_t terminate;
+ /* device management */
+ struct work_struct dev_init;
+ struct input_dev *input;
+ struct hid_device *hid;
+ struct timer_list timer;
+
+ /* Report descriptor */
+ __u8 *rd_data;
+ uint rd_size;
+ /* session data */
unsigned char keys[8];
unsigned char leds;
- struct input_dev *input;
+ /* Used in hidp_get_raw_report() */
+ int waiting_report_type; /* HIDP_DATA_RTYPE_* */
+ int waiting_report_number; /* -1 for not numbered */
+ struct mutex report_mutex;
+ struct sk_buff *report_return;
+ wait_queue_head_t report_queue;
- struct timer_list timer;
+ /* Used in hidp_output_raw_report() */
+ int output_report_success; /* boolean */
- struct sk_buff_head ctrl_transmit;
- struct sk_buff_head intr_transmit;
+ /* temporary input buffer */
+ u8 input_buf[HID_MAX_BUFFER_SIZE];
};
-static inline void hidp_schedule(struct hidp_session *session)
-{
- struct sock *ctrl_sk = session->ctrl_sock->sk;
- struct sock *intr_sk = session->intr_sock->sk;
-
- wake_up_interruptible(ctrl_sk->sk_sleep);
- wake_up_interruptible(intr_sk->sk_sleep);
-}
-
/* HIDP init defines */
-extern int __init hidp_init_sockets(void);
-extern void __exit hidp_cleanup_sockets(void);
+int __init hidp_init_sockets(void);
+void __exit hidp_cleanup_sockets(void);
#endif /* __HIDP_H */
diff --git a/net/bluetooth/hidp/sock.c b/net/bluetooth/hidp/sock.c
index 407fba43c1b9..c93aaeb3a3fa 100644
--- a/net/bluetooth/hidp/sock.c
+++ b/net/bluetooth/hidp/sock.c
@@ -1,4 +1,4 @@
-/*
+/*
HIDP implementation for Linux Bluetooth stack (BlueZ).
Copyright (C) 2003-2004 Marcel Holtmann <marcel@holtmann.org>
@@ -10,40 +10,25 @@
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
- CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
- WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
- COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+ ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+ COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
SOFTWARE IS DISCLAIMED.
*/
-#include <linux/module.h>
-
-#include <linux/types.h>
-#include <linux/capability.h>
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/poll.h>
-#include <linux/fcntl.h>
-#include <linux/skbuff.h>
-#include <linux/socket.h>
-#include <linux/ioctl.h>
-#include <linux/file.h>
-#include <linux/init.h>
#include <linux/compat.h>
-#include <net/sock.h>
+#include <linux/export.h>
+#include <linux/file.h>
#include "hidp.h"
-#ifndef CONFIG_BT_HIDP_DEBUG
-#undef BT_DBG
-#define BT_DBG(D...)
-#endif
+static struct bt_sock_list hidp_sk_list = {
+ .lock = __RW_LOCK_UNLOCKED(hidp_sk_list.lock)
+};
static int hidp_sock_release(struct socket *sock)
{
@@ -54,15 +39,16 @@ static int hidp_sock_release(struct socket *sock)
if (!sk)
return 0;
+ bt_sock_unlink(&hidp_sk_list, sk);
+
sock_orphan(sk);
sock_put(sk);
return 0;
}
-static int hidp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+static int do_hidp_sock_ioctl(struct socket *sock, unsigned int cmd, void __user *argp)
{
- void __user *argp = (void __user *) arg;
struct hidp_connadd_req ca;
struct hidp_conndel_req cd;
struct hidp_connlist_req cl;
@@ -71,12 +57,12 @@ static int hidp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long
struct socket *isock;
int err;
- BT_DBG("cmd %x arg %lx", cmd, arg);
+ BT_DBG("cmd %x arg %p", cmd, argp);
switch (cmd) {
case HIDPCONNADD:
if (!capable(CAP_NET_ADMIN))
- return -EACCES;
+ return -EPERM;
if (copy_from_user(&ca, argp, sizeof(ca)))
return -EFAULT;
@@ -87,35 +73,28 @@ static int hidp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long
isock = sockfd_lookup(ca.intr_sock, &err);
if (!isock) {
- fput(csock->file);
+ sockfd_put(csock);
return err;
}
+ ca.name[sizeof(ca.name)-1] = 0;
- if (csock->sk->sk_state != BT_CONNECTED || isock->sk->sk_state != BT_CONNECTED) {
- fput(csock->file);
- fput(isock->file);
- return -EBADFD;
- }
+ err = hidp_connection_add(&ca, csock, isock);
+ if (!err && copy_to_user(argp, &ca, sizeof(ca)))
+ err = -EFAULT;
- err = hidp_add_connection(&ca, csock, isock);
- if (!err) {
- if (copy_to_user(argp, &ca, sizeof(ca)))
- err = -EFAULT;
- } else {
- fput(csock->file);
- fput(isock->file);
- }
+ sockfd_put(csock);
+ sockfd_put(isock);
return err;
case HIDPCONNDEL:
if (!capable(CAP_NET_ADMIN))
- return -EACCES;
+ return -EPERM;
if (copy_from_user(&cd, argp, sizeof(cd)))
return -EFAULT;
- return hidp_del_connection(&cd);
+ return hidp_connection_del(&cd);
case HIDPGETCONNLIST:
if (copy_from_user(&cl, argp, sizeof(cl)))
@@ -144,10 +123,15 @@ static int hidp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long
return -EINVAL;
}
+static int hidp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+ return do_hidp_sock_ioctl(sock, cmd, (void __user *)arg);
+}
+
#ifdef CONFIG_COMPAT
struct compat_hidp_connadd_req {
- int ctrl_sock; // Connected control socket
- int intr_sock; // Connteted interrupt socket
+ int ctrl_sock; /* Connected control socket */
+ int intr_sock; /* Connected interrupt socket */
__u16 parser;
__u16 rd_size;
compat_uptr_t rd_data;
@@ -163,13 +147,15 @@ struct compat_hidp_connadd_req {
static int hidp_sock_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
+ void __user *argp = compat_ptr(arg);
+ int err;
+
if (cmd == HIDPGETCONNLIST) {
struct hidp_connlist_req cl;
- uint32_t uci;
- int err;
+ u32 __user *p = argp;
+ u32 uci;
- if (get_user(cl.cnum, (uint32_t __user *) arg) ||
- get_user(uci, (u32 __user *) (arg + 4)))
+ if (get_user(cl.cnum, p) || get_user(uci, p + 1))
return -EFAULT;
cl.ci = compat_ptr(uci);
@@ -179,39 +165,55 @@ static int hidp_sock_compat_ioctl(struct socket *sock, unsigned int cmd, unsigne
err = hidp_get_connlist(&cl);
- if (!err && put_user(cl.cnum, (uint32_t __user *) arg))
+ if (!err && put_user(cl.cnum, p))
err = -EFAULT;
return err;
} else if (cmd == HIDPCONNADD) {
- struct compat_hidp_connadd_req ca;
- struct hidp_connadd_req __user *uca;
+ struct compat_hidp_connadd_req ca32;
+ struct hidp_connadd_req ca;
+ struct socket *csock;
+ struct socket *isock;
- uca = compat_alloc_user_space(sizeof(*uca));
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
- if (copy_from_user(&ca, (void *) arg, sizeof(ca)))
+ if (copy_from_user(&ca32, (void __user *) arg, sizeof(ca32)))
return -EFAULT;
- if (put_user(ca.ctrl_sock, &uca->ctrl_sock) ||
- put_user(ca.intr_sock, &uca->intr_sock) ||
- put_user(ca.parser, &uca->parser) ||
- put_user(ca.rd_size, &uca->parser) ||
- put_user(compat_ptr(ca.rd_data), &uca->rd_data) ||
- put_user(ca.country, &uca->country) ||
- put_user(ca.subclass, &uca->subclass) ||
- put_user(ca.vendor, &uca->vendor) ||
- put_user(ca.product, &uca->product) ||
- put_user(ca.version, &uca->version) ||
- put_user(ca.flags, &uca->flags) ||
- put_user(ca.idle_to, &uca->idle_to) ||
- copy_to_user(&uca->name[0], &ca.name[0], 128))
- return -EFAULT;
-
- arg = (unsigned long) uca;
+ ca.ctrl_sock = ca32.ctrl_sock;
+ ca.intr_sock = ca32.intr_sock;
+ ca.parser = ca32.parser;
+ ca.rd_size = ca32.rd_size;
+ ca.rd_data = compat_ptr(ca32.rd_data);
+ ca.country = ca32.country;
+ ca.subclass = ca32.subclass;
+ ca.vendor = ca32.vendor;
+ ca.product = ca32.product;
+ ca.version = ca32.version;
+ ca.flags = ca32.flags;
+ ca.idle_to = ca32.idle_to;
+ ca32.name[sizeof(ca32.name) - 1] = '\0';
+ memcpy(ca.name, ca32.name, 128);
+
+ csock = sockfd_lookup(ca.ctrl_sock, &err);
+ if (!csock)
+ return err;
+
+ isock = sockfd_lookup(ca.intr_sock, &err);
+ if (!isock) {
+ sockfd_put(csock);
+ return err;
+ }
+
+ err = hidp_connection_add(&ca, csock, isock);
+ if (!err && copy_to_user(argp, &ca32, sizeof(ca32)))
+ err = -EFAULT;
- /* Fall through. We don't actually write back any _changes_
- to the structure anyway, so there's no need to copy back
- into the original compat version */
+ sockfd_put(csock);
+ sockfd_put(isock);
+
+ return err;
}
return hidp_sock_ioctl(sock, cmd, arg);
@@ -230,11 +232,8 @@ static const struct proto_ops hidp_sock_ops = {
.getname = sock_no_getname,
.sendmsg = sock_no_sendmsg,
.recvmsg = sock_no_recvmsg,
- .poll = sock_no_poll,
.listen = sock_no_listen,
.shutdown = sock_no_shutdown,
- .setsockopt = sock_no_setsockopt,
- .getsockopt = sock_no_getsockopt,
.connect = sock_no_connect,
.socketpair = sock_no_socketpair,
.accept = sock_no_accept,
@@ -247,7 +246,8 @@ static struct proto hidp_proto = {
.obj_size = sizeof(struct bt_sock)
};
-static int hidp_sock_create(struct socket *sock, int protocol)
+static int hidp_sock_create(struct net *net, struct socket *sock, int protocol,
+ int kern)
{
struct sock *sk;
@@ -256,25 +256,19 @@ static int hidp_sock_create(struct socket *sock, int protocol)
if (sock->type != SOCK_RAW)
return -ESOCKTNOSUPPORT;
- sk = sk_alloc(PF_BLUETOOTH, GFP_ATOMIC, &hidp_proto, 1);
+ sk = bt_sock_alloc(net, sock, &hidp_proto, protocol, GFP_ATOMIC, kern);
if (!sk)
return -ENOMEM;
- sock_init_data(sock, sk);
-
sock->ops = &hidp_sock_ops;
-
sock->state = SS_UNCONNECTED;
- sock_reset_flag(sk, SOCK_ZAPPED);
-
- sk->sk_protocol = protocol;
- sk->sk_state = BT_OPEN;
+ bt_sock_link(&hidp_sk_list, sk);
return 0;
}
-static struct net_proto_family hidp_sock_family_ops = {
+static const struct net_proto_family hidp_sock_family_ops = {
.family = PF_BLUETOOTH,
.owner = THIS_MODULE,
.create = hidp_sock_create
@@ -289,21 +283,30 @@ int __init hidp_init_sockets(void)
return err;
err = bt_sock_register(BTPROTO_HIDP, &hidp_sock_family_ops);
- if (err < 0)
+ if (err < 0) {
+ BT_ERR("Can't register HIDP socket");
+ goto error;
+ }
+
+ err = bt_procfs_init(&init_net, "hidp", &hidp_sk_list, NULL);
+ if (err < 0) {
+ BT_ERR("Failed to create HIDP proc file");
+ bt_sock_unregister(BTPROTO_HIDP);
goto error;
+ }
+
+ BT_INFO("HIDP socket layer initialized");
return 0;
error:
- BT_ERR("Can't register HIDP socket");
proto_unregister(&hidp_proto);
return err;
}
void __exit hidp_cleanup_sockets(void)
{
- if (bt_sock_unregister(BTPROTO_HIDP) < 0)
- BT_ERR("Can't unregister HIDP socket");
-
+ bt_procfs_cleanup(&init_net, "hidp");
+ bt_sock_unregister(BTPROTO_HIDP);
proto_unregister(&hidp_proto);
}
diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c
new file mode 100644
index 000000000000..e36d24a9098b
--- /dev/null
+++ b/net/bluetooth/iso.c
@@ -0,0 +1,2734 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * BlueZ - Bluetooth protocol stack for Linux
+ *
+ * Copyright (C) 2022 Intel Corporation
+ * Copyright 2023-2024 NXP
+ */
+
+#include <linux/module.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/sched/signal.h>
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+#include <net/bluetooth/iso.h>
+#include "eir.h"
+
+static const struct proto_ops iso_sock_ops;
+
+static struct bt_sock_list iso_sk_list = {
+ .lock = __RW_LOCK_UNLOCKED(iso_sk_list.lock)
+};
+
+/* ---- ISO connections ---- */
+struct iso_conn {
+ struct hci_conn *hcon;
+
+ /* @lock: spinlock protecting changes to iso_conn fields */
+ spinlock_t lock;
+ struct sock *sk;
+
+ struct delayed_work timeout_work;
+
+ struct sk_buff *rx_skb;
+ __u32 rx_len;
+ __u16 tx_sn;
+ struct kref ref;
+};
+
+#define iso_conn_lock(c) spin_lock(&(c)->lock)
+#define iso_conn_unlock(c) spin_unlock(&(c)->lock)
+
+static void iso_sock_close(struct sock *sk);
+static void iso_sock_kill(struct sock *sk);
+
+/* ----- ISO socket info ----- */
+#define iso_pi(sk) ((struct iso_pinfo *)sk)
+
+#define EIR_SERVICE_DATA_LENGTH 4
+#define BASE_MAX_LENGTH (HCI_MAX_PER_AD_LENGTH - EIR_SERVICE_DATA_LENGTH)
+#define EIR_BAA_SERVICE_UUID 0x1851
+
+/* iso_pinfo flags values */
+enum {
+ BT_SK_BIG_SYNC,
+ BT_SK_PA_SYNC,
+};
+
+struct iso_pinfo {
+ struct bt_sock bt;
+ bdaddr_t src;
+ __u8 src_type;
+ bdaddr_t dst;
+ __u8 dst_type;
+ __u8 bc_sid;
+ __u8 bc_num_bis;
+ __u8 bc_bis[ISO_MAX_NUM_BIS];
+ __u16 sync_handle;
+ unsigned long flags;
+ struct bt_iso_qos qos;
+ bool qos_user_set;
+ __u8 base_len;
+ __u8 base[BASE_MAX_LENGTH];
+ struct iso_conn *conn;
+};
+
+static struct bt_iso_qos default_qos;
+
+static bool check_ucast_qos(struct bt_iso_qos *qos);
+static bool check_bcast_qos(struct bt_iso_qos *qos);
+static bool iso_match_sid(struct sock *sk, void *data);
+static bool iso_match_sid_past(struct sock *sk, void *data);
+static bool iso_match_sync_handle(struct sock *sk, void *data);
+static bool iso_match_sync_handle_pa_report(struct sock *sk, void *data);
+static void iso_sock_disconn(struct sock *sk);
+
+typedef bool (*iso_sock_match_t)(struct sock *sk, void *data);
+
+static struct sock *iso_get_sock(struct hci_dev *hdev, bdaddr_t *src,
+ bdaddr_t *dst, enum bt_sock_state state,
+ iso_sock_match_t match, void *data);
+
+/* ---- ISO timers ---- */
+#define ISO_CONN_TIMEOUT secs_to_jiffies(20)
+#define ISO_DISCONN_TIMEOUT secs_to_jiffies(2)
+
+static void iso_conn_free(struct kref *ref)
+{
+ struct iso_conn *conn = container_of(ref, struct iso_conn, ref);
+
+ BT_DBG("conn %p", conn);
+
+ if (conn->sk)
+ iso_pi(conn->sk)->conn = NULL;
+
+ if (conn->hcon) {
+ conn->hcon->iso_data = NULL;
+ hci_conn_drop(conn->hcon);
+ }
+
+ /* Ensure no more work items will run since hci_conn has been dropped */
+ disable_delayed_work_sync(&conn->timeout_work);
+
+ kfree_skb(conn->rx_skb);
+
+ kfree(conn);
+}
+
+static void iso_conn_put(struct iso_conn *conn)
+{
+ if (!conn)
+ return;
+
+ BT_DBG("conn %p refcnt %d", conn, kref_read(&conn->ref));
+
+ kref_put(&conn->ref, iso_conn_free);
+}
+
+static struct iso_conn *iso_conn_hold_unless_zero(struct iso_conn *conn)
+{
+ if (!conn)
+ return NULL;
+
+ BT_DBG("conn %p refcnt %u", conn, kref_read(&conn->ref));
+
+ if (!kref_get_unless_zero(&conn->ref))
+ return NULL;
+
+ return conn;
+}
+
+static struct sock *iso_sock_hold(struct iso_conn *conn)
+{
+ if (!conn || !bt_sock_linked(&iso_sk_list, conn->sk))
+ return NULL;
+
+ sock_hold(conn->sk);
+
+ return conn->sk;
+}
+
+static void iso_sock_timeout(struct work_struct *work)
+{
+ struct iso_conn *conn = container_of(work, struct iso_conn,
+ timeout_work.work);
+ struct sock *sk;
+
+ conn = iso_conn_hold_unless_zero(conn);
+ if (!conn)
+ return;
+
+ iso_conn_lock(conn);
+ sk = iso_sock_hold(conn);
+ iso_conn_unlock(conn);
+ iso_conn_put(conn);
+
+ if (!sk)
+ return;
+
+ BT_DBG("sock %p state %d", sk, sk->sk_state);
+
+ lock_sock(sk);
+ sk->sk_err = ETIMEDOUT;
+ sk->sk_state_change(sk);
+ release_sock(sk);
+ sock_put(sk);
+}
+
+static void iso_sock_set_timer(struct sock *sk, long timeout)
+{
+ if (!iso_pi(sk)->conn)
+ return;
+
+ BT_DBG("sock %p state %d timeout %ld", sk, sk->sk_state, timeout);
+ cancel_delayed_work(&iso_pi(sk)->conn->timeout_work);
+ schedule_delayed_work(&iso_pi(sk)->conn->timeout_work, timeout);
+}
+
+static void iso_sock_clear_timer(struct sock *sk)
+{
+ if (!iso_pi(sk)->conn)
+ return;
+
+ BT_DBG("sock %p state %d", sk, sk->sk_state);
+ cancel_delayed_work(&iso_pi(sk)->conn->timeout_work);
+}
+
+/* ---- ISO connections ---- */
+static struct iso_conn *iso_conn_add(struct hci_conn *hcon)
+{
+ struct iso_conn *conn = hcon->iso_data;
+
+ conn = iso_conn_hold_unless_zero(conn);
+ if (conn) {
+ if (!conn->hcon) {
+ iso_conn_lock(conn);
+ conn->hcon = hcon;
+ iso_conn_unlock(conn);
+ }
+ iso_conn_put(conn);
+ return conn;
+ }
+
+ conn = kzalloc(sizeof(*conn), GFP_KERNEL);
+ if (!conn)
+ return NULL;
+
+ kref_init(&conn->ref);
+ spin_lock_init(&conn->lock);
+ INIT_DELAYED_WORK(&conn->timeout_work, iso_sock_timeout);
+
+ hcon->iso_data = conn;
+ conn->hcon = hcon;
+ conn->tx_sn = 0;
+
+ BT_DBG("hcon %p conn %p", hcon, conn);
+
+ return conn;
+}
+
+/* Delete channel. Must be called on the locked socket. */
+static void iso_chan_del(struct sock *sk, int err)
+{
+ struct iso_conn *conn;
+ struct sock *parent;
+
+ conn = iso_pi(sk)->conn;
+ iso_pi(sk)->conn = NULL;
+
+ BT_DBG("sk %p, conn %p, err %d", sk, conn, err);
+
+ if (conn) {
+ iso_conn_lock(conn);
+ conn->sk = NULL;
+ iso_conn_unlock(conn);
+ iso_conn_put(conn);
+ }
+
+ sk->sk_state = BT_CLOSED;
+ sk->sk_err = err;
+
+ parent = bt_sk(sk)->parent;
+ if (parent) {
+ bt_accept_unlink(sk);
+ parent->sk_data_ready(parent);
+ } else {
+ sk->sk_state_change(sk);
+ }
+
+ sock_set_flag(sk, SOCK_ZAPPED);
+}
+
+static void iso_conn_del(struct hci_conn *hcon, int err)
+{
+ struct iso_conn *conn = hcon->iso_data;
+ struct sock *sk;
+
+ conn = iso_conn_hold_unless_zero(conn);
+ if (!conn)
+ return;
+
+ BT_DBG("hcon %p conn %p, err %d", hcon, conn, err);
+
+ /* Kill socket */
+ iso_conn_lock(conn);
+ sk = iso_sock_hold(conn);
+ iso_conn_unlock(conn);
+ iso_conn_put(conn);
+
+ if (!sk) {
+ iso_conn_put(conn);
+ return;
+ }
+
+ lock_sock(sk);
+ iso_sock_clear_timer(sk);
+ iso_chan_del(sk, err);
+ release_sock(sk);
+ sock_put(sk);
+}
+
+static int __iso_chan_add(struct iso_conn *conn, struct sock *sk,
+ struct sock *parent)
+{
+ BT_DBG("conn %p", conn);
+
+ if (iso_pi(sk)->conn == conn && conn->sk == sk)
+ return 0;
+
+ if (conn->sk) {
+ BT_ERR("conn->sk already set");
+ return -EBUSY;
+ }
+
+ iso_pi(sk)->conn = conn;
+ conn->sk = sk;
+
+ if (parent)
+ bt_accept_enqueue(parent, sk, true);
+
+ return 0;
+}
+
+static int iso_chan_add(struct iso_conn *conn, struct sock *sk,
+ struct sock *parent)
+{
+ int err;
+
+ iso_conn_lock(conn);
+ err = __iso_chan_add(conn, sk, parent);
+ iso_conn_unlock(conn);
+
+ return err;
+}
+
+static inline u8 le_addr_type(u8 bdaddr_type)
+{
+ if (bdaddr_type == BDADDR_LE_PUBLIC)
+ return ADDR_LE_DEV_PUBLIC;
+ else
+ return ADDR_LE_DEV_RANDOM;
+}
+
+static int iso_connect_bis(struct sock *sk)
+{
+ struct iso_conn *conn;
+ struct hci_conn *hcon;
+ struct hci_dev *hdev;
+ int err;
+
+ BT_DBG("%pMR (SID 0x%2.2x)", &iso_pi(sk)->src, iso_pi(sk)->bc_sid);
+
+ hdev = hci_get_route(&iso_pi(sk)->dst, &iso_pi(sk)->src,
+ iso_pi(sk)->src_type);
+ if (!hdev)
+ return -EHOSTUNREACH;
+
+ hci_dev_lock(hdev);
+
+ if (!bis_capable(hdev)) {
+ err = -EOPNOTSUPP;
+ goto unlock;
+ }
+
+ /* Fail if user set invalid QoS */
+ if (iso_pi(sk)->qos_user_set && !check_bcast_qos(&iso_pi(sk)->qos)) {
+ iso_pi(sk)->qos = default_qos;
+ err = -EINVAL;
+ goto unlock;
+ }
+
+ /* Fail if out PHYs are marked as disabled */
+ if (!iso_pi(sk)->qos.bcast.out.phy) {
+ err = -EINVAL;
+ goto unlock;
+ }
+
+ /* Just bind if DEFER_SETUP has been set */
+ if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) {
+ hcon = hci_bind_bis(hdev, &iso_pi(sk)->dst, iso_pi(sk)->bc_sid,
+ &iso_pi(sk)->qos, iso_pi(sk)->base_len,
+ iso_pi(sk)->base,
+ READ_ONCE(sk->sk_sndtimeo));
+ if (IS_ERR(hcon)) {
+ err = PTR_ERR(hcon);
+ goto unlock;
+ }
+ } else {
+ hcon = hci_connect_bis(hdev, &iso_pi(sk)->dst,
+ le_addr_type(iso_pi(sk)->dst_type),
+ iso_pi(sk)->bc_sid, &iso_pi(sk)->qos,
+ iso_pi(sk)->base_len, iso_pi(sk)->base,
+ READ_ONCE(sk->sk_sndtimeo));
+ if (IS_ERR(hcon)) {
+ err = PTR_ERR(hcon);
+ goto unlock;
+ }
+
+ /* Update SID if it was not set */
+ if (iso_pi(sk)->bc_sid == HCI_SID_INVALID)
+ iso_pi(sk)->bc_sid = hcon->sid;
+ }
+
+ conn = iso_conn_add(hcon);
+ if (!conn) {
+ hci_conn_drop(hcon);
+ err = -ENOMEM;
+ goto unlock;
+ }
+
+ lock_sock(sk);
+
+ err = iso_chan_add(conn, sk, NULL);
+ if (err) {
+ release_sock(sk);
+ goto unlock;
+ }
+
+ /* Update source addr of the socket */
+ bacpy(&iso_pi(sk)->src, &hcon->src);
+
+ if (hcon->state == BT_CONNECTED) {
+ iso_sock_clear_timer(sk);
+ sk->sk_state = BT_CONNECTED;
+ } else if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) {
+ iso_sock_clear_timer(sk);
+ sk->sk_state = BT_CONNECT;
+ } else {
+ sk->sk_state = BT_CONNECT;
+ iso_sock_set_timer(sk, READ_ONCE(sk->sk_sndtimeo));
+ }
+
+ release_sock(sk);
+
+unlock:
+ hci_dev_unlock(hdev);
+ hci_dev_put(hdev);
+ return err;
+}
+
+static int iso_connect_cis(struct sock *sk)
+{
+ struct iso_conn *conn;
+ struct hci_conn *hcon;
+ struct hci_dev *hdev;
+ int err;
+
+ BT_DBG("%pMR -> %pMR", &iso_pi(sk)->src, &iso_pi(sk)->dst);
+
+ hdev = hci_get_route(&iso_pi(sk)->dst, &iso_pi(sk)->src,
+ iso_pi(sk)->src_type);
+ if (!hdev)
+ return -EHOSTUNREACH;
+
+ hci_dev_lock(hdev);
+
+ if (!cis_central_capable(hdev)) {
+ err = -EOPNOTSUPP;
+ goto unlock;
+ }
+
+ /* Fail if user set invalid QoS */
+ if (iso_pi(sk)->qos_user_set && !check_ucast_qos(&iso_pi(sk)->qos)) {
+ iso_pi(sk)->qos = default_qos;
+ err = -EINVAL;
+ goto unlock;
+ }
+
+ /* Fail if either PHYs are marked as disabled */
+ if (!iso_pi(sk)->qos.ucast.in.phy && !iso_pi(sk)->qos.ucast.out.phy) {
+ err = -EINVAL;
+ goto unlock;
+ }
+
+ /* Check if there are available buffers for output/TX. */
+ if (iso_pi(sk)->qos.ucast.out.sdu && !hci_iso_count(hdev) &&
+ (hdev->iso_pkts && !hdev->iso_cnt)) {
+ err = -ENOBUFS;
+ goto unlock;
+ }
+
+ /* Just bind if DEFER_SETUP has been set */
+ if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) {
+ hcon = hci_bind_cis(hdev, &iso_pi(sk)->dst,
+ le_addr_type(iso_pi(sk)->dst_type),
+ &iso_pi(sk)->qos,
+ READ_ONCE(sk->sk_sndtimeo));
+ if (IS_ERR(hcon)) {
+ err = PTR_ERR(hcon);
+ goto unlock;
+ }
+ } else {
+ hcon = hci_connect_cis(hdev, &iso_pi(sk)->dst,
+ le_addr_type(iso_pi(sk)->dst_type),
+ &iso_pi(sk)->qos,
+ READ_ONCE(sk->sk_sndtimeo));
+ if (IS_ERR(hcon)) {
+ err = PTR_ERR(hcon);
+ goto unlock;
+ }
+ }
+
+ conn = iso_conn_add(hcon);
+ if (!conn) {
+ hci_conn_drop(hcon);
+ err = -ENOMEM;
+ goto unlock;
+ }
+
+ lock_sock(sk);
+
+ err = iso_chan_add(conn, sk, NULL);
+ if (err) {
+ release_sock(sk);
+ goto unlock;
+ }
+
+ /* Update source addr of the socket */
+ bacpy(&iso_pi(sk)->src, &hcon->src);
+
+ if (hcon->state == BT_CONNECTED) {
+ iso_sock_clear_timer(sk);
+ sk->sk_state = BT_CONNECTED;
+ } else if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) {
+ iso_sock_clear_timer(sk);
+ sk->sk_state = BT_CONNECT;
+ } else {
+ sk->sk_state = BT_CONNECT;
+ iso_sock_set_timer(sk, READ_ONCE(sk->sk_sndtimeo));
+ }
+
+ release_sock(sk);
+
+unlock:
+ hci_dev_unlock(hdev);
+ hci_dev_put(hdev);
+ return err;
+}
+
+static struct bt_iso_qos *iso_sock_get_qos(struct sock *sk)
+{
+ if (sk->sk_state == BT_CONNECTED || sk->sk_state == BT_CONNECT2)
+ return &iso_pi(sk)->conn->hcon->iso_qos;
+
+ return &iso_pi(sk)->qos;
+}
+
+static int iso_send_frame(struct sock *sk, struct sk_buff *skb,
+ const struct sockcm_cookie *sockc)
+{
+ struct iso_conn *conn = iso_pi(sk)->conn;
+ struct bt_iso_qos *qos = iso_sock_get_qos(sk);
+ struct hci_iso_data_hdr *hdr;
+ int len = 0;
+
+ BT_DBG("sk %p len %d", sk, skb->len);
+
+ if (skb->len > qos->ucast.out.sdu)
+ return -EMSGSIZE;
+
+ len = skb->len;
+
+ /* Push ISO data header */
+ hdr = skb_push(skb, HCI_ISO_DATA_HDR_SIZE);
+ hdr->sn = cpu_to_le16(conn->tx_sn++);
+ hdr->slen = cpu_to_le16(hci_iso_data_len_pack(len,
+ HCI_ISO_STATUS_VALID));
+
+ if (sk->sk_state == BT_CONNECTED) {
+ hci_setup_tx_timestamp(skb, 1, sockc);
+ hci_send_iso(conn->hcon, skb);
+ } else {
+ len = -ENOTCONN;
+ }
+
+ return len;
+}
+
+static void iso_recv_frame(struct iso_conn *conn, struct sk_buff *skb)
+{
+ struct sock *sk;
+
+ iso_conn_lock(conn);
+ sk = conn->sk;
+ iso_conn_unlock(conn);
+
+ if (!sk)
+ goto drop;
+
+ BT_DBG("sk %p len %d", sk, skb->len);
+
+ if (sk->sk_state != BT_CONNECTED)
+ goto drop;
+
+ if (!sock_queue_rcv_skb(sk, skb))
+ return;
+
+drop:
+ kfree_skb(skb);
+}
+
+/* -------- Socket interface ---------- */
+static struct sock *__iso_get_sock_listen_by_addr(bdaddr_t *src, bdaddr_t *dst)
+{
+ struct sock *sk;
+
+ sk_for_each(sk, &iso_sk_list.head) {
+ if (sk->sk_state != BT_LISTEN)
+ continue;
+
+ if (bacmp(&iso_pi(sk)->dst, dst))
+ continue;
+
+ if (!bacmp(&iso_pi(sk)->src, src))
+ return sk;
+ }
+
+ return NULL;
+}
+
+static struct sock *__iso_get_sock_listen_by_sid(bdaddr_t *ba, bdaddr_t *bc,
+ __u8 sid)
+{
+ struct sock *sk;
+
+ sk_for_each(sk, &iso_sk_list.head) {
+ if (sk->sk_state != BT_LISTEN)
+ continue;
+
+ if (bacmp(&iso_pi(sk)->src, ba))
+ continue;
+
+ if (bacmp(&iso_pi(sk)->dst, bc))
+ continue;
+
+ if (iso_pi(sk)->bc_sid == sid)
+ return sk;
+ }
+
+ return NULL;
+}
+
+/* Find socket in given state:
+ * source bdaddr (Unicast)
+ * destination bdaddr (Broadcast only)
+ * match func - pass NULL to ignore
+ * match func data - pass -1 to ignore
+ * Returns closest match.
+ */
+static struct sock *iso_get_sock(struct hci_dev *hdev, bdaddr_t *src,
+ bdaddr_t *dst, enum bt_sock_state state,
+ iso_sock_match_t match, void *data)
+{
+ struct sock *sk = NULL, *sk1 = NULL;
+
+ read_lock(&iso_sk_list.lock);
+
+ sk_for_each(sk, &iso_sk_list.head) {
+ if (sk->sk_state != state)
+ continue;
+
+ /* Match Broadcast destination */
+ if (bacmp(dst, BDADDR_ANY) && bacmp(&iso_pi(sk)->dst, dst)) {
+ struct smp_irk *irk1, *irk2;
+
+ /* Check if destination is an RPA that we can resolve */
+ irk1 = hci_find_irk_by_rpa(hdev, dst);
+ if (!irk1)
+ continue;
+
+ /* Match with identity address */
+ if (bacmp(&iso_pi(sk)->dst, &irk1->bdaddr)) {
+ /* Check if socket destination address is also
+ * an RPA and if the IRK matches.
+ */
+ irk2 = hci_find_irk_by_rpa(hdev,
+ &iso_pi(sk)->dst);
+ if (!irk2 || irk1 != irk2)
+ continue;
+ }
+ }
+
+ /* Use Match function if provided */
+ if (match && !match(sk, data))
+ continue;
+
+ /* Exact match. */
+ if (!bacmp(&iso_pi(sk)->src, src)) {
+ sock_hold(sk);
+ break;
+ }
+
+ /* Closest match */
+ if (!bacmp(&iso_pi(sk)->src, BDADDR_ANY)) {
+ if (sk1)
+ sock_put(sk1);
+
+ sk1 = sk;
+ sock_hold(sk1);
+ }
+ }
+
+ if (sk && sk1)
+ sock_put(sk1);
+
+ read_unlock(&iso_sk_list.lock);
+
+ return sk ? sk : sk1;
+}
+
+static struct sock *iso_get_sock_big(struct sock *match_sk, bdaddr_t *src,
+ bdaddr_t *dst, uint8_t big)
+{
+ struct sock *sk = NULL;
+
+ read_lock(&iso_sk_list.lock);
+
+ sk_for_each(sk, &iso_sk_list.head) {
+ if (match_sk == sk)
+ continue;
+
+ /* Look for sockets that have already been
+ * connected to the BIG
+ */
+ if (sk->sk_state != BT_CONNECTED &&
+ sk->sk_state != BT_CONNECT)
+ continue;
+
+ /* Match Broadcast destination */
+ if (bacmp(&iso_pi(sk)->dst, dst))
+ continue;
+
+ /* Match BIG handle */
+ if (iso_pi(sk)->qos.bcast.big != big)
+ continue;
+
+ /* Match source address */
+ if (bacmp(&iso_pi(sk)->src, src))
+ continue;
+
+ sock_hold(sk);
+ break;
+ }
+
+ read_unlock(&iso_sk_list.lock);
+
+ return sk;
+}
+
+static void iso_sock_destruct(struct sock *sk)
+{
+ BT_DBG("sk %p", sk);
+
+ iso_conn_put(iso_pi(sk)->conn);
+
+ skb_queue_purge(&sk->sk_receive_queue);
+ skb_queue_purge(&sk->sk_write_queue);
+}
+
+static void iso_sock_cleanup_listen(struct sock *parent)
+{
+ struct sock *sk;
+
+ BT_DBG("parent %p", parent);
+
+ /* Close not yet accepted channels */
+ while ((sk = bt_accept_dequeue(parent, NULL))) {
+ iso_sock_close(sk);
+ iso_sock_kill(sk);
+ }
+
+ /* If listening socket has a hcon, properly disconnect it */
+ if (iso_pi(parent)->conn && iso_pi(parent)->conn->hcon) {
+ iso_sock_disconn(parent);
+ return;
+ }
+
+ parent->sk_state = BT_CLOSED;
+ sock_set_flag(parent, SOCK_ZAPPED);
+}
+
+/* Kill socket (only if zapped and orphan)
+ * Must be called on unlocked socket.
+ */
+static void iso_sock_kill(struct sock *sk)
+{
+ if (!sock_flag(sk, SOCK_ZAPPED) || sk->sk_socket ||
+ sock_flag(sk, SOCK_DEAD))
+ return;
+
+ BT_DBG("sk %p state %d", sk, sk->sk_state);
+
+ /* Sock is dead, so set conn->sk to NULL to avoid possible UAF */
+ if (iso_pi(sk)->conn) {
+ iso_conn_lock(iso_pi(sk)->conn);
+ iso_pi(sk)->conn->sk = NULL;
+ iso_conn_unlock(iso_pi(sk)->conn);
+ }
+
+ /* Kill poor orphan */
+ bt_sock_unlink(&iso_sk_list, sk);
+ sock_set_flag(sk, SOCK_DEAD);
+ sock_put(sk);
+}
+
+static void iso_sock_disconn(struct sock *sk)
+{
+ struct sock *bis_sk;
+ struct hci_conn *hcon = iso_pi(sk)->conn->hcon;
+
+ if (test_bit(HCI_CONN_BIG_CREATED, &hcon->flags)) {
+ bis_sk = iso_get_sock_big(sk, &iso_pi(sk)->src,
+ &iso_pi(sk)->dst,
+ iso_pi(sk)->qos.bcast.big);
+
+ /* If there are any other connected sockets for the
+ * same BIG, just delete the sk and leave the bis
+ * hcon active, in case later rebinding is needed.
+ */
+ if (bis_sk) {
+ hcon->state = BT_OPEN;
+ hcon->iso_data = NULL;
+ iso_pi(sk)->conn->hcon = NULL;
+ iso_sock_clear_timer(sk);
+ iso_chan_del(sk, bt_to_errno(hcon->abort_reason));
+ sock_put(bis_sk);
+ return;
+ }
+ }
+
+ sk->sk_state = BT_DISCONN;
+ iso_conn_lock(iso_pi(sk)->conn);
+ hci_conn_drop(iso_pi(sk)->conn->hcon);
+ iso_pi(sk)->conn->hcon = NULL;
+ iso_conn_unlock(iso_pi(sk)->conn);
+}
+
+static void __iso_sock_close(struct sock *sk)
+{
+ BT_DBG("sk %p state %d socket %p", sk, sk->sk_state, sk->sk_socket);
+
+ switch (sk->sk_state) {
+ case BT_LISTEN:
+ iso_sock_cleanup_listen(sk);
+ break;
+
+ case BT_CONNECT:
+ case BT_CONNECTED:
+ case BT_CONFIG:
+ if (iso_pi(sk)->conn->hcon)
+ iso_sock_disconn(sk);
+ else
+ iso_chan_del(sk, ECONNRESET);
+ break;
+
+ case BT_CONNECT2:
+ if (iso_pi(sk)->conn->hcon &&
+ (test_bit(HCI_CONN_PA_SYNC, &iso_pi(sk)->conn->hcon->flags) ||
+ test_bit(HCI_CONN_PA_SYNC_FAILED, &iso_pi(sk)->conn->hcon->flags)))
+ iso_sock_disconn(sk);
+ else
+ iso_chan_del(sk, ECONNRESET);
+ break;
+ case BT_DISCONN:
+ iso_chan_del(sk, ECONNRESET);
+ break;
+
+ default:
+ sock_set_flag(sk, SOCK_ZAPPED);
+ break;
+ }
+}
+
+/* Must be called on unlocked socket. */
+static void iso_sock_close(struct sock *sk)
+{
+ iso_sock_clear_timer(sk);
+ lock_sock(sk);
+ __iso_sock_close(sk);
+ release_sock(sk);
+ iso_sock_kill(sk);
+}
+
+static void iso_sock_init(struct sock *sk, struct sock *parent)
+{
+ BT_DBG("sk %p", sk);
+
+ if (parent) {
+ sk->sk_type = parent->sk_type;
+ bt_sk(sk)->flags = bt_sk(parent)->flags;
+ security_sk_clone(parent, sk);
+ }
+}
+
+static struct proto iso_proto = {
+ .name = "ISO",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof(struct iso_pinfo)
+};
+
+#define DEFAULT_IO_QOS \
+{ \
+ .interval = 10000u, \
+ .latency = 10u, \
+ .sdu = 40u, \
+ .phy = BT_ISO_PHY_2M, \
+ .rtn = 2u, \
+}
+
+static struct bt_iso_qos default_qos = {
+ .bcast = {
+ .big = BT_ISO_QOS_BIG_UNSET,
+ .bis = BT_ISO_QOS_BIS_UNSET,
+ .sync_factor = 0x01,
+ .packing = 0x00,
+ .framing = 0x00,
+ .in = DEFAULT_IO_QOS,
+ .out = DEFAULT_IO_QOS,
+ .encryption = 0x00,
+ .bcode = {0x00},
+ .options = 0x00,
+ .skip = 0x0000,
+ .sync_timeout = BT_ISO_SYNC_TIMEOUT,
+ .sync_cte_type = 0x00,
+ .mse = 0x00,
+ .timeout = BT_ISO_SYNC_TIMEOUT,
+ },
+};
+
+static struct sock *iso_sock_alloc(struct net *net, struct socket *sock,
+ int proto, gfp_t prio, int kern)
+{
+ struct sock *sk;
+
+ sk = bt_sock_alloc(net, sock, &iso_proto, proto, prio, kern);
+ if (!sk)
+ return NULL;
+
+ sk->sk_destruct = iso_sock_destruct;
+ sk->sk_sndtimeo = ISO_CONN_TIMEOUT;
+
+ /* Set address type as public as default src address is BDADDR_ANY */
+ iso_pi(sk)->src_type = BDADDR_LE_PUBLIC;
+
+ iso_pi(sk)->qos = default_qos;
+ iso_pi(sk)->sync_handle = -1;
+
+ bt_sock_link(&iso_sk_list, sk);
+ return sk;
+}
+
+static int iso_sock_create(struct net *net, struct socket *sock, int protocol,
+ int kern)
+{
+ struct sock *sk;
+
+ BT_DBG("sock %p", sock);
+
+ sock->state = SS_UNCONNECTED;
+
+ if (sock->type != SOCK_SEQPACKET)
+ return -ESOCKTNOSUPPORT;
+
+ sock->ops = &iso_sock_ops;
+
+ sk = iso_sock_alloc(net, sock, protocol, GFP_ATOMIC, kern);
+ if (!sk)
+ return -ENOMEM;
+
+ iso_sock_init(sk, NULL);
+ return 0;
+}
+
+static int iso_sock_bind_bc(struct socket *sock, struct sockaddr_unsized *addr,
+ int addr_len)
+{
+ struct sockaddr_iso *sa = (struct sockaddr_iso *)addr;
+ struct sock *sk = sock->sk;
+ int i;
+
+ BT_DBG("sk %p bc_sid %u bc_num_bis %u", sk, sa->iso_bc->bc_sid,
+ sa->iso_bc->bc_num_bis);
+
+ if (addr_len != sizeof(*sa) + sizeof(*sa->iso_bc))
+ return -EINVAL;
+
+ bacpy(&iso_pi(sk)->dst, &sa->iso_bc->bc_bdaddr);
+
+ /* Check if the address type is of LE type */
+ if (!bdaddr_type_is_le(sa->iso_bc->bc_bdaddr_type))
+ return -EINVAL;
+
+ iso_pi(sk)->dst_type = sa->iso_bc->bc_bdaddr_type;
+
+ if (sa->iso_bc->bc_sid > 0x0f && sa->iso_bc->bc_sid != HCI_SID_INVALID)
+ return -EINVAL;
+
+ iso_pi(sk)->bc_sid = sa->iso_bc->bc_sid;
+
+ if (sa->iso_bc->bc_num_bis > ISO_MAX_NUM_BIS)
+ return -EINVAL;
+
+ iso_pi(sk)->bc_num_bis = sa->iso_bc->bc_num_bis;
+
+ for (i = 0; i < iso_pi(sk)->bc_num_bis; i++)
+ if (sa->iso_bc->bc_bis[i] < 0x01 ||
+ sa->iso_bc->bc_bis[i] > 0x1f)
+ return -EINVAL;
+
+ memcpy(iso_pi(sk)->bc_bis, sa->iso_bc->bc_bis,
+ iso_pi(sk)->bc_num_bis);
+
+ return 0;
+}
+
+/* Must be called on the locked socket. */
+static int iso_sock_rebind_bis(struct sock *sk, struct sockaddr_iso *sa,
+ int addr_len)
+{
+ int err = 0;
+
+ if (!test_bit(BT_SK_PA_SYNC, &iso_pi(sk)->flags))
+ return -EBADFD;
+
+ if (sa->iso_bc->bc_num_bis > ISO_MAX_NUM_BIS) {
+ err = -EINVAL;
+ goto done;
+ }
+
+ iso_pi(sk)->bc_num_bis = sa->iso_bc->bc_num_bis;
+
+ for (int i = 0; i < iso_pi(sk)->bc_num_bis; i++)
+ if (sa->iso_bc->bc_bis[i] < 0x01 ||
+ sa->iso_bc->bc_bis[i] > 0x1f) {
+ err = -EINVAL;
+ goto done;
+ }
+
+ memcpy(iso_pi(sk)->bc_bis, sa->iso_bc->bc_bis,
+ iso_pi(sk)->bc_num_bis);
+
+done:
+ return err;
+}
+
+static struct hci_dev *iso_conn_get_hdev(struct iso_conn *conn)
+{
+ struct hci_dev *hdev = NULL;
+
+ iso_conn_lock(conn);
+ if (conn->hcon)
+ hdev = hci_dev_hold(conn->hcon->hdev);
+ iso_conn_unlock(conn);
+
+ return hdev;
+}
+
+/* Must be called on the locked socket. */
+static int iso_sock_rebind_bc(struct sock *sk, struct sockaddr_iso *sa,
+ int addr_len)
+{
+ struct hci_dev *hdev;
+ struct hci_conn *bis;
+ int err;
+
+ if (sk->sk_type != SOCK_SEQPACKET || !iso_pi(sk)->conn)
+ return -EINVAL;
+
+ /* Check if it is really a Broadcast address being requested */
+ if (addr_len != sizeof(*sa) + sizeof(*sa->iso_bc))
+ return -EINVAL;
+
+ /* Check if the address hasn't changed then perhaps only the number of
+ * bis has changed.
+ */
+ if (!bacmp(&iso_pi(sk)->dst, &sa->iso_bc->bc_bdaddr) ||
+ !bacmp(&sa->iso_bc->bc_bdaddr, BDADDR_ANY))
+ return iso_sock_rebind_bis(sk, sa, addr_len);
+
+ /* Check if the address type is of LE type */
+ if (!bdaddr_type_is_le(sa->iso_bc->bc_bdaddr_type))
+ return -EINVAL;
+
+ hdev = iso_conn_get_hdev(iso_pi(sk)->conn);
+ if (!hdev)
+ return -EINVAL;
+
+ bis = iso_pi(sk)->conn->hcon;
+
+ /* Release the socket before lookups since that requires hci_dev_lock
+ * which shall not be acquired while holding sock_lock for proper
+ * ordering.
+ */
+ release_sock(sk);
+ hci_dev_lock(bis->hdev);
+ lock_sock(sk);
+
+ if (!iso_pi(sk)->conn || iso_pi(sk)->conn->hcon != bis) {
+ /* raced with iso_conn_del() or iso_disconn_sock() */
+ err = -ENOTCONN;
+ goto unlock;
+ }
+
+ BT_DBG("sk %p %pMR type %u", sk, &sa->iso_bc->bc_bdaddr,
+ sa->iso_bc->bc_bdaddr_type);
+
+ err = hci_past_bis(bis, &sa->iso_bc->bc_bdaddr,
+ le_addr_type(sa->iso_bc->bc_bdaddr_type));
+
+unlock:
+ hci_dev_unlock(hdev);
+ hci_dev_put(hdev);
+
+ return err;
+}
+
+static int iso_sock_bind(struct socket *sock, struct sockaddr_unsized *addr,
+ int addr_len)
+{
+ struct sockaddr_iso *sa = (struct sockaddr_iso *)addr;
+ struct sock *sk = sock->sk;
+ int err = 0;
+
+ BT_DBG("sk %p %pMR type %u", sk, &sa->iso_bdaddr, sa->iso_bdaddr_type);
+
+ if (!addr || addr_len < sizeof(struct sockaddr_iso) ||
+ addr->sa_family != AF_BLUETOOTH)
+ return -EINVAL;
+
+ lock_sock(sk);
+
+ if ((sk->sk_state == BT_CONNECT2 || sk->sk_state == BT_CONNECTED) &&
+ addr_len > sizeof(*sa)) {
+ /* Allow the user to rebind to a different address using
+ * PAST procedures.
+ */
+ err = iso_sock_rebind_bc(sk, sa, addr_len);
+ goto done;
+ }
+
+ if (sk->sk_state != BT_OPEN) {
+ err = -EBADFD;
+ goto done;
+ }
+
+ if (sk->sk_type != SOCK_SEQPACKET) {
+ err = -EINVAL;
+ goto done;
+ }
+
+ /* Check if the address type is of LE type */
+ if (!bdaddr_type_is_le(sa->iso_bdaddr_type)) {
+ err = -EINVAL;
+ goto done;
+ }
+
+ bacpy(&iso_pi(sk)->src, &sa->iso_bdaddr);
+ iso_pi(sk)->src_type = sa->iso_bdaddr_type;
+
+ /* Check for Broadcast address */
+ if (addr_len > sizeof(*sa)) {
+ err = iso_sock_bind_bc(sock, addr, addr_len);
+ if (err)
+ goto done;
+ }
+
+ sk->sk_state = BT_BOUND;
+
+done:
+ release_sock(sk);
+ return err;
+}
+
+static int iso_sock_connect(struct socket *sock, struct sockaddr_unsized *addr,
+ int alen, int flags)
+{
+ struct sockaddr_iso *sa = (struct sockaddr_iso *)addr;
+ struct sock *sk = sock->sk;
+ int err;
+
+ BT_DBG("sk %p", sk);
+
+ if (alen < sizeof(struct sockaddr_iso) ||
+ addr->sa_family != AF_BLUETOOTH)
+ return -EINVAL;
+
+ if (sk->sk_state != BT_OPEN && sk->sk_state != BT_BOUND)
+ return -EBADFD;
+
+ if (sk->sk_type != SOCK_SEQPACKET)
+ return -EINVAL;
+
+ /* Check if the address type is of LE type */
+ if (!bdaddr_type_is_le(sa->iso_bdaddr_type))
+ return -EINVAL;
+
+ lock_sock(sk);
+
+ bacpy(&iso_pi(sk)->dst, &sa->iso_bdaddr);
+ iso_pi(sk)->dst_type = sa->iso_bdaddr_type;
+
+ release_sock(sk);
+
+ if (bacmp(&iso_pi(sk)->dst, BDADDR_ANY))
+ err = iso_connect_cis(sk);
+ else
+ err = iso_connect_bis(sk);
+
+ if (err)
+ return err;
+
+ lock_sock(sk);
+
+ if (!test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) {
+ err = bt_sock_wait_state(sk, BT_CONNECTED,
+ sock_sndtimeo(sk, flags & O_NONBLOCK));
+ }
+
+ release_sock(sk);
+ return err;
+}
+
+static int iso_listen_bis(struct sock *sk)
+{
+ struct hci_dev *hdev;
+ int err = 0;
+ struct iso_conn *conn;
+ struct hci_conn *hcon;
+
+ BT_DBG("%pMR -> %pMR (SID 0x%2.2x)", &iso_pi(sk)->src,
+ &iso_pi(sk)->dst, iso_pi(sk)->bc_sid);
+
+ write_lock(&iso_sk_list.lock);
+
+ if (__iso_get_sock_listen_by_sid(&iso_pi(sk)->src, &iso_pi(sk)->dst,
+ iso_pi(sk)->bc_sid))
+ err = -EADDRINUSE;
+
+ write_unlock(&iso_sk_list.lock);
+
+ if (err)
+ return err;
+
+ hdev = hci_get_route(&iso_pi(sk)->dst, &iso_pi(sk)->src,
+ iso_pi(sk)->src_type);
+ if (!hdev)
+ return -EHOSTUNREACH;
+
+ hci_dev_lock(hdev);
+ lock_sock(sk);
+
+ /* Fail if user set invalid QoS */
+ if (iso_pi(sk)->qos_user_set && !check_bcast_qos(&iso_pi(sk)->qos)) {
+ iso_pi(sk)->qos = default_qos;
+ err = -EINVAL;
+ goto unlock;
+ }
+
+ hcon = hci_pa_create_sync(hdev, &iso_pi(sk)->dst,
+ le_addr_type(iso_pi(sk)->dst_type),
+ iso_pi(sk)->bc_sid, &iso_pi(sk)->qos);
+ if (IS_ERR(hcon)) {
+ err = PTR_ERR(hcon);
+ goto unlock;
+ }
+
+ conn = iso_conn_add(hcon);
+ if (!conn) {
+ hci_conn_drop(hcon);
+ err = -ENOMEM;
+ goto unlock;
+ }
+
+ err = iso_chan_add(conn, sk, NULL);
+ if (err) {
+ hci_conn_drop(hcon);
+ goto unlock;
+ }
+
+unlock:
+ release_sock(sk);
+ hci_dev_unlock(hdev);
+ hci_dev_put(hdev);
+ return err;
+}
+
+static int iso_listen_cis(struct sock *sk)
+{
+ int err = 0;
+
+ BT_DBG("%pMR", &iso_pi(sk)->src);
+
+ write_lock(&iso_sk_list.lock);
+
+ if (__iso_get_sock_listen_by_addr(&iso_pi(sk)->src, &iso_pi(sk)->dst))
+ err = -EADDRINUSE;
+
+ write_unlock(&iso_sk_list.lock);
+
+ return err;
+}
+
+static int iso_sock_listen(struct socket *sock, int backlog)
+{
+ struct sock *sk = sock->sk;
+ int err = 0;
+
+ BT_DBG("sk %p backlog %d", sk, backlog);
+
+ sock_hold(sk);
+ lock_sock(sk);
+
+ if (sk->sk_state != BT_BOUND) {
+ err = -EBADFD;
+ goto done;
+ }
+
+ if (sk->sk_type != SOCK_SEQPACKET) {
+ err = -EINVAL;
+ goto done;
+ }
+
+ if (!bacmp(&iso_pi(sk)->dst, BDADDR_ANY)) {
+ err = iso_listen_cis(sk);
+ } else {
+ /* Drop sock lock to avoid potential
+ * deadlock with the hdev lock.
+ */
+ release_sock(sk);
+ err = iso_listen_bis(sk);
+ lock_sock(sk);
+ }
+
+ if (err)
+ goto done;
+
+ sk->sk_max_ack_backlog = backlog;
+ sk->sk_ack_backlog = 0;
+
+ sk->sk_state = BT_LISTEN;
+
+done:
+ release_sock(sk);
+ sock_put(sk);
+ return err;
+}
+
+static int iso_sock_accept(struct socket *sock, struct socket *newsock,
+ struct proto_accept_arg *arg)
+{
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+ struct sock *sk = sock->sk, *ch;
+ long timeo;
+ int err = 0;
+
+ /* Use explicit nested locking to avoid lockdep warnings generated
+ * because the parent socket and the child socket are locked on the
+ * same thread.
+ */
+ lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
+
+ timeo = sock_rcvtimeo(sk, arg->flags & O_NONBLOCK);
+
+ BT_DBG("sk %p timeo %ld", sk, timeo);
+
+ /* Wait for an incoming connection. (wake-one). */
+ add_wait_queue_exclusive(sk_sleep(sk), &wait);
+ while (1) {
+ if (sk->sk_state != BT_LISTEN) {
+ err = -EBADFD;
+ break;
+ }
+
+ ch = bt_accept_dequeue(sk, newsock);
+ if (ch)
+ break;
+
+ if (!timeo) {
+ err = -EAGAIN;
+ break;
+ }
+
+ if (signal_pending(current)) {
+ err = sock_intr_errno(timeo);
+ break;
+ }
+
+ release_sock(sk);
+
+ timeo = wait_woken(&wait, TASK_INTERRUPTIBLE, timeo);
+ lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
+ }
+ remove_wait_queue(sk_sleep(sk), &wait);
+
+ if (err)
+ goto done;
+
+ newsock->state = SS_CONNECTED;
+
+ BT_DBG("new socket %p", ch);
+
+ /* A Broadcast Sink might require BIG sync to be terminated
+ * and re-established multiple times, while keeping the same
+ * PA sync handle active. To allow this, once all BIS
+ * connections have been accepted on a PA sync parent socket,
+ * "reset" socket state, to allow future BIG re-sync procedures.
+ */
+ if (test_bit(BT_SK_PA_SYNC, &iso_pi(sk)->flags)) {
+ /* Iterate through the list of bound BIS indices
+ * and clear each BIS as they are accepted by the
+ * user space, one by one.
+ */
+ for (int i = 0; i < iso_pi(sk)->bc_num_bis; i++) {
+ if (iso_pi(sk)->bc_bis[i] > 0) {
+ iso_pi(sk)->bc_bis[i] = 0;
+ iso_pi(sk)->bc_num_bis--;
+ break;
+ }
+ }
+
+ if (iso_pi(sk)->bc_num_bis == 0) {
+ /* Once the last BIS was accepted, reset parent
+ * socket parameters to mark that the listening
+ * process for BIS connections has been completed:
+ *
+ * 1. Reset the DEFER setup flag on the parent sk.
+ * 2. Clear the flag marking that the BIG create
+ * sync command is pending.
+ * 3. Transition socket state from BT_LISTEN to
+ * BT_CONNECTED.
+ */
+ set_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags);
+ clear_bit(BT_SK_BIG_SYNC, &iso_pi(sk)->flags);
+ sk->sk_state = BT_CONNECTED;
+ }
+ }
+
+done:
+ release_sock(sk);
+ return err;
+}
+
+static int iso_sock_getname(struct socket *sock, struct sockaddr *addr,
+ int peer)
+{
+ struct sockaddr_iso *sa = (struct sockaddr_iso *)addr;
+ struct sock *sk = sock->sk;
+ int len = sizeof(struct sockaddr_iso);
+
+ BT_DBG("sock %p, sk %p", sock, sk);
+
+ addr->sa_family = AF_BLUETOOTH;
+
+ if (peer) {
+ struct hci_conn *hcon = iso_pi(sk)->conn ?
+ iso_pi(sk)->conn->hcon : NULL;
+
+ bacpy(&sa->iso_bdaddr, &iso_pi(sk)->dst);
+ sa->iso_bdaddr_type = iso_pi(sk)->dst_type;
+
+ if (hcon && (hcon->type == BIS_LINK || hcon->type == PA_LINK)) {
+ sa->iso_bc->bc_sid = iso_pi(sk)->bc_sid;
+ sa->iso_bc->bc_num_bis = iso_pi(sk)->bc_num_bis;
+ memcpy(sa->iso_bc->bc_bis, iso_pi(sk)->bc_bis,
+ ISO_MAX_NUM_BIS);
+ len += sizeof(struct sockaddr_iso_bc);
+ }
+ } else {
+ bacpy(&sa->iso_bdaddr, &iso_pi(sk)->src);
+ sa->iso_bdaddr_type = iso_pi(sk)->src_type;
+ }
+
+ return len;
+}
+
+static int iso_sock_sendmsg(struct socket *sock, struct msghdr *msg,
+ size_t len)
+{
+ struct sock *sk = sock->sk;
+ struct sk_buff *skb, **frag;
+ struct sockcm_cookie sockc;
+ size_t mtu;
+ int err;
+
+ BT_DBG("sock %p, sk %p", sock, sk);
+
+ err = sock_error(sk);
+ if (err)
+ return err;
+
+ if (msg->msg_flags & MSG_OOB)
+ return -EOPNOTSUPP;
+
+ hci_sockcm_init(&sockc, sk);
+
+ if (msg->msg_controllen) {
+ err = sock_cmsg_send(sk, msg, &sockc);
+ if (err)
+ return err;
+ }
+
+ lock_sock(sk);
+
+ if (sk->sk_state != BT_CONNECTED) {
+ release_sock(sk);
+ return -ENOTCONN;
+ }
+
+ mtu = iso_pi(sk)->conn->hcon->mtu;
+
+ release_sock(sk);
+
+ skb = bt_skb_sendmsg(sk, msg, len, mtu, HCI_ISO_DATA_HDR_SIZE, 0);
+ if (IS_ERR(skb))
+ return PTR_ERR(skb);
+
+ len -= skb->len;
+
+ BT_DBG("skb %p len %d", sk, skb->len);
+
+ /* Continuation fragments */
+ frag = &skb_shinfo(skb)->frag_list;
+ while (len) {
+ struct sk_buff *tmp;
+
+ tmp = bt_skb_sendmsg(sk, msg, len, mtu, 0, 0);
+ if (IS_ERR(tmp)) {
+ kfree_skb(skb);
+ return PTR_ERR(tmp);
+ }
+
+ *frag = tmp;
+
+ len -= tmp->len;
+
+ skb->len += tmp->len;
+ skb->data_len += tmp->len;
+
+ BT_DBG("frag %p len %d", *frag, tmp->len);
+
+ frag = &(*frag)->next;
+ }
+
+ lock_sock(sk);
+
+ if (sk->sk_state == BT_CONNECTED)
+ err = iso_send_frame(sk, skb, &sockc);
+ else
+ err = -ENOTCONN;
+
+ release_sock(sk);
+
+ if (err < 0)
+ kfree_skb(skb);
+ return err;
+}
+
+static void iso_conn_defer_accept(struct hci_conn *conn)
+{
+ struct hci_cp_le_accept_cis cp;
+ struct hci_dev *hdev = conn->hdev;
+
+ BT_DBG("conn %p", conn);
+
+ conn->state = BT_CONFIG;
+
+ cp.handle = cpu_to_le16(conn->handle);
+
+ hci_send_cmd(hdev, HCI_OP_LE_ACCEPT_CIS, sizeof(cp), &cp);
+}
+
+static void iso_conn_big_sync(struct sock *sk)
+{
+ int err;
+ struct hci_dev *hdev;
+
+ hdev = hci_get_route(&iso_pi(sk)->dst, &iso_pi(sk)->src,
+ iso_pi(sk)->src_type);
+
+ if (!hdev)
+ return;
+
+ /* hci_le_big_create_sync requires hdev lock to be held, since
+ * it enqueues the HCI LE BIG Create Sync command via
+ * hci_cmd_sync_queue_once, which checks hdev flags that might
+ * change.
+ */
+ hci_dev_lock(hdev);
+ lock_sock(sk);
+
+ if (!test_and_set_bit(BT_SK_BIG_SYNC, &iso_pi(sk)->flags)) {
+ err = hci_conn_big_create_sync(hdev, iso_pi(sk)->conn->hcon,
+ &iso_pi(sk)->qos,
+ iso_pi(sk)->sync_handle,
+ iso_pi(sk)->bc_num_bis,
+ iso_pi(sk)->bc_bis);
+ if (err)
+ bt_dev_err(hdev, "hci_big_create_sync: %d", err);
+ }
+
+ release_sock(sk);
+ hci_dev_unlock(hdev);
+}
+
+static int iso_sock_recvmsg(struct socket *sock, struct msghdr *msg,
+ size_t len, int flags)
+{
+ struct sock *sk = sock->sk;
+ struct iso_pinfo *pi = iso_pi(sk);
+ bool early_ret = false;
+ int err = 0;
+
+ BT_DBG("sk %p", sk);
+
+ if (unlikely(flags & MSG_ERRQUEUE))
+ return sock_recv_errqueue(sk, msg, len, SOL_BLUETOOTH,
+ BT_SCM_ERROR);
+
+ if (test_and_clear_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) {
+ sock_hold(sk);
+ lock_sock(sk);
+
+ switch (sk->sk_state) {
+ case BT_CONNECT2:
+ if (test_bit(BT_SK_PA_SYNC, &pi->flags)) {
+ release_sock(sk);
+ iso_conn_big_sync(sk);
+ lock_sock(sk);
+
+ sk->sk_state = BT_LISTEN;
+ } else {
+ iso_conn_defer_accept(pi->conn->hcon);
+ sk->sk_state = BT_CONFIG;
+ }
+
+ early_ret = true;
+ break;
+ case BT_CONNECTED:
+ if (test_bit(BT_SK_PA_SYNC, &iso_pi(sk)->flags)) {
+ release_sock(sk);
+ iso_conn_big_sync(sk);
+ lock_sock(sk);
+
+ sk->sk_state = BT_LISTEN;
+ early_ret = true;
+ }
+
+ break;
+ case BT_CONNECT:
+ release_sock(sk);
+ err = iso_connect_cis(sk);
+ lock_sock(sk);
+
+ early_ret = true;
+ break;
+ default:
+ break;
+ }
+
+ release_sock(sk);
+ sock_put(sk);
+
+ if (early_ret)
+ return err;
+ }
+
+ return bt_sock_recvmsg(sock, msg, len, flags);
+}
+
+static bool check_io_qos(struct bt_iso_io_qos *qos)
+{
+ /* If no PHY is enable SDU must be 0 */
+ if (!qos->phy && qos->sdu)
+ return false;
+
+ if (qos->interval && (qos->interval < 0xff || qos->interval > 0xfffff))
+ return false;
+
+ if (qos->latency && (qos->latency < 0x05 || qos->latency > 0xfa0))
+ return false;
+
+ if (qos->phy > BT_ISO_PHY_ANY)
+ return false;
+
+ return true;
+}
+
+static bool check_ucast_qos(struct bt_iso_qos *qos)
+{
+ if (qos->ucast.cig > 0xef && qos->ucast.cig != BT_ISO_QOS_CIG_UNSET)
+ return false;
+
+ if (qos->ucast.cis > 0xef && qos->ucast.cis != BT_ISO_QOS_CIS_UNSET)
+ return false;
+
+ if (qos->ucast.sca > 0x07)
+ return false;
+
+ if (qos->ucast.packing > 0x01)
+ return false;
+
+ if (qos->ucast.framing > 0x01)
+ return false;
+
+ if (!check_io_qos(&qos->ucast.in))
+ return false;
+
+ if (!check_io_qos(&qos->ucast.out))
+ return false;
+
+ return true;
+}
+
+static bool check_bcast_qos(struct bt_iso_qos *qos)
+{
+ if (!qos->bcast.sync_factor)
+ qos->bcast.sync_factor = 0x01;
+
+ if (qos->bcast.packing > 0x01)
+ return false;
+
+ if (qos->bcast.framing > 0x01)
+ return false;
+
+ if (!check_io_qos(&qos->bcast.in))
+ return false;
+
+ if (!check_io_qos(&qos->bcast.out))
+ return false;
+
+ if (qos->bcast.encryption > 0x01)
+ return false;
+
+ if (qos->bcast.options > 0x07)
+ return false;
+
+ if (qos->bcast.skip > 0x01f3)
+ return false;
+
+ if (!qos->bcast.sync_timeout)
+ qos->bcast.sync_timeout = BT_ISO_SYNC_TIMEOUT;
+
+ if (qos->bcast.sync_timeout < 0x000a || qos->bcast.sync_timeout > 0x4000)
+ return false;
+
+ if (qos->bcast.sync_cte_type > 0x1f)
+ return false;
+
+ if (qos->bcast.mse > 0x1f)
+ return false;
+
+ if (!qos->bcast.timeout)
+ qos->bcast.sync_timeout = BT_ISO_SYNC_TIMEOUT;
+
+ if (qos->bcast.timeout < 0x000a || qos->bcast.timeout > 0x4000)
+ return false;
+
+ return true;
+}
+
+static int iso_sock_setsockopt(struct socket *sock, int level, int optname,
+ sockptr_t optval, unsigned int optlen)
+{
+ struct sock *sk = sock->sk;
+ int err = 0;
+ struct bt_iso_qos qos = default_qos;
+ u32 opt;
+
+ BT_DBG("sk %p", sk);
+
+ lock_sock(sk);
+
+ switch (optname) {
+ case BT_DEFER_SETUP:
+ if (sk->sk_state != BT_BOUND && sk->sk_state != BT_LISTEN) {
+ err = -EINVAL;
+ break;
+ }
+
+ err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen);
+ if (err)
+ break;
+
+ if (opt)
+ set_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags);
+ else
+ clear_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags);
+ break;
+
+ case BT_PKT_STATUS:
+ err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen);
+ if (err)
+ break;
+
+ if (opt)
+ set_bit(BT_SK_PKT_STATUS, &bt_sk(sk)->flags);
+ else
+ clear_bit(BT_SK_PKT_STATUS, &bt_sk(sk)->flags);
+ break;
+
+ case BT_PKT_SEQNUM:
+ err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen);
+ if (err)
+ break;
+
+ if (opt)
+ set_bit(BT_SK_PKT_SEQNUM, &bt_sk(sk)->flags);
+ else
+ clear_bit(BT_SK_PKT_SEQNUM, &bt_sk(sk)->flags);
+ break;
+
+ case BT_ISO_QOS:
+ if (sk->sk_state != BT_OPEN && sk->sk_state != BT_BOUND &&
+ sk->sk_state != BT_CONNECT2 &&
+ (!test_bit(BT_SK_PA_SYNC, &iso_pi(sk)->flags) ||
+ sk->sk_state != BT_CONNECTED)) {
+ err = -EINVAL;
+ break;
+ }
+
+ err = copy_safe_from_sockptr(&qos, sizeof(qos), optval, optlen);
+ if (err)
+ break;
+
+ iso_pi(sk)->qos = qos;
+ iso_pi(sk)->qos_user_set = true;
+
+ break;
+
+ case BT_ISO_BASE:
+ if (sk->sk_state != BT_OPEN && sk->sk_state != BT_BOUND &&
+ sk->sk_state != BT_CONNECT2) {
+ err = -EINVAL;
+ break;
+ }
+
+ if (optlen > sizeof(iso_pi(sk)->base)) {
+ err = -EINVAL;
+ break;
+ }
+
+ err = copy_safe_from_sockptr(iso_pi(sk)->base, optlen, optval,
+ optlen);
+ if (err)
+ break;
+
+ iso_pi(sk)->base_len = optlen;
+
+ break;
+
+ default:
+ err = -ENOPROTOOPT;
+ break;
+ }
+
+ release_sock(sk);
+ return err;
+}
+
+static int iso_sock_getsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ struct sock *sk = sock->sk;
+ int len, err = 0;
+ struct bt_iso_qos *qos;
+ u8 base_len;
+ u8 *base;
+
+ BT_DBG("sk %p", sk);
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ lock_sock(sk);
+
+ switch (optname) {
+ case BT_DEFER_SETUP:
+ if (sk->sk_state == BT_CONNECTED) {
+ err = -EINVAL;
+ break;
+ }
+
+ if (put_user(test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags),
+ (u32 __user *)optval))
+ err = -EFAULT;
+
+ break;
+
+ case BT_PKT_STATUS:
+ if (put_user(test_bit(BT_SK_PKT_STATUS, &bt_sk(sk)->flags),
+ (int __user *)optval))
+ err = -EFAULT;
+ break;
+
+ case BT_ISO_QOS:
+ qos = iso_sock_get_qos(sk);
+
+ len = min_t(unsigned int, len, sizeof(*qos));
+ if (copy_to_user(optval, qos, len))
+ err = -EFAULT;
+
+ break;
+
+ case BT_ISO_BASE:
+ if (sk->sk_state == BT_CONNECTED &&
+ !bacmp(&iso_pi(sk)->dst, BDADDR_ANY)) {
+ base_len = iso_pi(sk)->conn->hcon->le_per_adv_data_len;
+ base = iso_pi(sk)->conn->hcon->le_per_adv_data;
+ } else {
+ base_len = iso_pi(sk)->base_len;
+ base = iso_pi(sk)->base;
+ }
+
+ len = min_t(unsigned int, len, base_len);
+ if (copy_to_user(optval, base, len))
+ err = -EFAULT;
+ if (put_user(len, optlen))
+ err = -EFAULT;
+
+ break;
+
+ default:
+ err = -ENOPROTOOPT;
+ break;
+ }
+
+ release_sock(sk);
+ return err;
+}
+
+static int iso_sock_shutdown(struct socket *sock, int how)
+{
+ struct sock *sk = sock->sk;
+ int err = 0;
+
+ BT_DBG("sock %p, sk %p, how %d", sock, sk, how);
+
+ if (!sk)
+ return 0;
+
+ sock_hold(sk);
+ lock_sock(sk);
+
+ switch (how) {
+ case SHUT_RD:
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ goto unlock;
+ sk->sk_shutdown |= RCV_SHUTDOWN;
+ break;
+ case SHUT_WR:
+ if (sk->sk_shutdown & SEND_SHUTDOWN)
+ goto unlock;
+ sk->sk_shutdown |= SEND_SHUTDOWN;
+ break;
+ case SHUT_RDWR:
+ if (sk->sk_shutdown & SHUTDOWN_MASK)
+ goto unlock;
+ sk->sk_shutdown |= SHUTDOWN_MASK;
+ break;
+ }
+
+ iso_sock_clear_timer(sk);
+ __iso_sock_close(sk);
+
+ if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime &&
+ !(current->flags & PF_EXITING))
+ err = bt_sock_wait_state(sk, BT_CLOSED, sk->sk_lingertime);
+
+unlock:
+ release_sock(sk);
+ sock_put(sk);
+
+ return err;
+}
+
+static int iso_sock_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+ int err = 0;
+
+ BT_DBG("sock %p, sk %p", sock, sk);
+
+ if (!sk)
+ return 0;
+
+ iso_sock_close(sk);
+
+ if (sock_flag(sk, SOCK_LINGER) && READ_ONCE(sk->sk_lingertime) &&
+ !(current->flags & PF_EXITING)) {
+ lock_sock(sk);
+ err = bt_sock_wait_state(sk, BT_CLOSED, sk->sk_lingertime);
+ release_sock(sk);
+ }
+
+ sock_orphan(sk);
+ iso_sock_kill(sk);
+ return err;
+}
+
+static void iso_sock_ready(struct sock *sk)
+{
+ BT_DBG("sk %p", sk);
+
+ if (!sk)
+ return;
+
+ lock_sock(sk);
+ iso_sock_clear_timer(sk);
+ sk->sk_state = BT_CONNECTED;
+ sk->sk_state_change(sk);
+ release_sock(sk);
+}
+
+static bool iso_match_big(struct sock *sk, void *data)
+{
+ struct hci_evt_le_big_sync_established *ev = data;
+
+ return ev->handle == iso_pi(sk)->qos.bcast.big;
+}
+
+static bool iso_match_big_hcon(struct sock *sk, void *data)
+{
+ struct hci_conn *hcon = data;
+
+ return hcon->iso_qos.bcast.big == iso_pi(sk)->qos.bcast.big;
+}
+
+static bool iso_match_pa_sync_flag(struct sock *sk, void *data)
+{
+ return test_bit(BT_SK_PA_SYNC, &iso_pi(sk)->flags);
+}
+
+static bool iso_match_dst(struct sock *sk, void *data)
+{
+ return !bacmp(&iso_pi(sk)->dst, (bdaddr_t *)data);
+}
+
+static void iso_conn_ready(struct iso_conn *conn)
+{
+ struct sock *parent = NULL;
+ struct sock *sk = conn->sk;
+ struct hci_ev_le_big_sync_established *ev = NULL;
+ struct hci_ev_le_pa_sync_established *ev2 = NULL;
+ struct hci_ev_le_per_adv_report *ev3 = NULL;
+ struct hci_conn *hcon;
+ struct hci_dev *hdev;
+
+ BT_DBG("conn %p", conn);
+
+ if (sk) {
+ /* Attempt to update source address in case of BIS Sender if
+ * the advertisement is using a random address.
+ */
+ if (conn->hcon->type == BIS_LINK &&
+ conn->hcon->role == HCI_ROLE_MASTER &&
+ !bacmp(&conn->hcon->dst, BDADDR_ANY)) {
+ struct hci_conn *bis = conn->hcon;
+ struct adv_info *adv;
+
+ adv = hci_find_adv_instance(bis->hdev,
+ bis->iso_qos.bcast.bis);
+ if (adv && bacmp(&adv->random_addr, BDADDR_ANY)) {
+ lock_sock(sk);
+ iso_pi(sk)->src_type = BDADDR_LE_RANDOM;
+ bacpy(&iso_pi(sk)->src, &adv->random_addr);
+ release_sock(sk);
+ }
+ }
+
+ iso_sock_ready(conn->sk);
+ } else {
+ hcon = conn->hcon;
+ if (!hcon)
+ return;
+
+ hdev = hcon->hdev;
+
+ if (test_bit(HCI_CONN_BIG_SYNC, &hcon->flags)) {
+ /* A BIS slave hcon is notified to the ISO layer
+ * after the Command Complete for the LE Setup
+ * ISO Data Path command is received. Get the
+ * parent socket that matches the hcon BIG handle.
+ */
+ parent = iso_get_sock(hdev, &hcon->src, &hcon->dst,
+ BT_LISTEN, iso_match_big_hcon,
+ hcon);
+ } else if (test_bit(HCI_CONN_BIG_SYNC_FAILED, &hcon->flags)) {
+ ev = hci_recv_event_data(hcon->hdev,
+ HCI_EVT_LE_BIG_SYNC_ESTABLISHED);
+
+ /* Get reference to PA sync parent socket, if it exists */
+ parent = iso_get_sock(hdev, &hcon->src, &hcon->dst,
+ BT_LISTEN,
+ iso_match_pa_sync_flag,
+ NULL);
+ if (!parent && ev)
+ parent = iso_get_sock(hdev, &hcon->src,
+ &hcon->dst,
+ BT_LISTEN,
+ iso_match_big, ev);
+ } else if (test_bit(HCI_CONN_PA_SYNC_FAILED, &hcon->flags)) {
+ ev2 = hci_recv_event_data(hcon->hdev,
+ HCI_EV_LE_PA_SYNC_ESTABLISHED);
+ if (ev2)
+ parent = iso_get_sock(hdev, &hcon->src,
+ &hcon->dst,
+ BT_LISTEN,
+ iso_match_sid, ev2);
+ } else if (test_bit(HCI_CONN_PA_SYNC, &hcon->flags)) {
+ ev3 = hci_recv_event_data(hcon->hdev,
+ HCI_EV_LE_PER_ADV_REPORT);
+ if (ev3)
+ parent = iso_get_sock(hdev, &hcon->src,
+ &hcon->dst,
+ BT_LISTEN,
+ iso_match_sync_handle_pa_report,
+ ev3);
+ }
+
+ if (!parent)
+ parent = iso_get_sock(hdev, &hcon->src, BDADDR_ANY,
+ BT_LISTEN, iso_match_dst, BDADDR_ANY);
+
+ if (!parent)
+ return;
+
+ lock_sock(parent);
+
+ sk = iso_sock_alloc(sock_net(parent), NULL,
+ BTPROTO_ISO, GFP_ATOMIC, 0);
+ if (!sk) {
+ release_sock(parent);
+ return;
+ }
+
+ iso_sock_init(sk, parent);
+
+ bacpy(&iso_pi(sk)->src, &hcon->src);
+
+ /* Convert from HCI to three-value type */
+ if (hcon->src_type == ADDR_LE_DEV_PUBLIC)
+ iso_pi(sk)->src_type = BDADDR_LE_PUBLIC;
+ else
+ iso_pi(sk)->src_type = BDADDR_LE_RANDOM;
+
+ /* If hcon has no destination address (BDADDR_ANY) it means it
+ * was created by HCI_EV_LE_BIG_SYNC_ESTABILISHED or
+ * HCI_EV_LE_PA_SYNC_ESTABLISHED so we need to initialize using
+ * the parent socket destination address.
+ */
+ if (!bacmp(&hcon->dst, BDADDR_ANY)) {
+ bacpy(&hcon->dst, &iso_pi(parent)->dst);
+ hcon->dst_type = le_addr_type(iso_pi(parent)->dst_type);
+ }
+
+ if (test_bit(HCI_CONN_PA_SYNC, &hcon->flags)) {
+ iso_pi(sk)->qos = iso_pi(parent)->qos;
+ hcon->iso_qos = iso_pi(sk)->qos;
+ iso_pi(sk)->bc_sid = iso_pi(parent)->bc_sid;
+ iso_pi(sk)->bc_num_bis = iso_pi(parent)->bc_num_bis;
+ memcpy(iso_pi(sk)->bc_bis, iso_pi(parent)->bc_bis,
+ ISO_MAX_NUM_BIS);
+ set_bit(BT_SK_PA_SYNC, &iso_pi(sk)->flags);
+ }
+
+ bacpy(&iso_pi(sk)->dst, &hcon->dst);
+
+ /* Convert from HCI to three-value type */
+ if (hcon->dst_type == ADDR_LE_DEV_PUBLIC)
+ iso_pi(sk)->dst_type = BDADDR_LE_PUBLIC;
+ else
+ iso_pi(sk)->dst_type = BDADDR_LE_RANDOM;
+
+ iso_pi(sk)->sync_handle = iso_pi(parent)->sync_handle;
+ memcpy(iso_pi(sk)->base, iso_pi(parent)->base, iso_pi(parent)->base_len);
+ iso_pi(sk)->base_len = iso_pi(parent)->base_len;
+
+ hci_conn_hold(hcon);
+ iso_chan_add(conn, sk, parent);
+
+ if ((ev && ((struct hci_evt_le_big_sync_established *)ev)->status) ||
+ (ev2 && ev2->status)) {
+ /* Trigger error signal on child socket */
+ sk->sk_err = ECONNREFUSED;
+ sk->sk_error_report(sk);
+ }
+
+ if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(parent)->flags))
+ sk->sk_state = BT_CONNECT2;
+ else
+ sk->sk_state = BT_CONNECTED;
+
+ /* Wake up parent */
+ parent->sk_data_ready(parent);
+
+ release_sock(parent);
+ sock_put(parent);
+ }
+}
+
+static bool iso_match_sid(struct sock *sk, void *data)
+{
+ struct hci_ev_le_pa_sync_established *ev = data;
+
+ if (iso_pi(sk)->bc_sid == HCI_SID_INVALID)
+ return true;
+
+ return ev->sid == iso_pi(sk)->bc_sid;
+}
+
+static bool iso_match_sid_past(struct sock *sk, void *data)
+{
+ struct hci_ev_le_past_received *ev = data;
+
+ if (iso_pi(sk)->bc_sid == HCI_SID_INVALID)
+ return true;
+
+ return ev->sid == iso_pi(sk)->bc_sid;
+}
+
+static bool iso_match_sync_handle(struct sock *sk, void *data)
+{
+ struct hci_evt_le_big_info_adv_report *ev = data;
+
+ return le16_to_cpu(ev->sync_handle) == iso_pi(sk)->sync_handle;
+}
+
+static bool iso_match_sync_handle_pa_report(struct sock *sk, void *data)
+{
+ struct hci_ev_le_per_adv_report *ev = data;
+
+ return le16_to_cpu(ev->sync_handle) == iso_pi(sk)->sync_handle;
+}
+
+/* ----- ISO interface with lower layer (HCI) ----- */
+
+int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags)
+{
+ struct hci_ev_le_pa_sync_established *ev1;
+ struct hci_ev_le_past_received *ev1a;
+ struct hci_evt_le_big_info_adv_report *ev2;
+ struct hci_ev_le_per_adv_report *ev3;
+ struct sock *sk;
+
+ bt_dev_dbg(hdev, "bdaddr %pMR", bdaddr);
+
+ /* Broadcast receiver requires handling of some events before it can
+ * proceed to establishing a BIG sync:
+ *
+ * 1. HCI_EV_LE_PA_SYNC_ESTABLISHED: The socket may specify a specific
+ * SID to listen to and once sync is established its handle needs to
+ * be stored in iso_pi(sk)->sync_handle so it can be matched once
+ * receiving the BIG Info.
+ * 1a. HCI_EV_LE_PAST_RECEIVED: alternative to 1.
+ * 2. HCI_EVT_LE_BIG_INFO_ADV_REPORT: When connect_ind is triggered by a
+ * a BIG Info it attempts to check if there any listening socket with
+ * the same sync_handle and if it does then attempt to create a sync.
+ * 3. HCI_EV_LE_PER_ADV_REPORT: When a PA report is received, it is stored
+ * in iso_pi(sk)->base so it can be passed up to user, in the case of a
+ * broadcast sink.
+ */
+ ev1 = hci_recv_event_data(hdev, HCI_EV_LE_PA_SYNC_ESTABLISHED);
+ if (ev1) {
+ sk = iso_get_sock(hdev, &hdev->bdaddr, bdaddr, BT_LISTEN,
+ iso_match_sid, ev1);
+ if (sk && !ev1->status) {
+ iso_pi(sk)->sync_handle = le16_to_cpu(ev1->handle);
+ iso_pi(sk)->bc_sid = ev1->sid;
+ }
+
+ goto done;
+ }
+
+ ev1a = hci_recv_event_data(hdev, HCI_EV_LE_PAST_RECEIVED);
+ if (ev1a) {
+ sk = iso_get_sock(hdev, &hdev->bdaddr, bdaddr, BT_LISTEN,
+ iso_match_sid_past, ev1a);
+ if (sk && !ev1a->status) {
+ iso_pi(sk)->sync_handle = le16_to_cpu(ev1a->sync_handle);
+ iso_pi(sk)->bc_sid = ev1a->sid;
+ }
+
+ goto done;
+ }
+
+ ev2 = hci_recv_event_data(hdev, HCI_EVT_LE_BIG_INFO_ADV_REPORT);
+ if (ev2) {
+ /* Check if BIGInfo report has already been handled */
+ sk = iso_get_sock(hdev, &hdev->bdaddr, bdaddr, BT_CONNECTED,
+ iso_match_sync_handle, ev2);
+ if (sk) {
+ sock_put(sk);
+ sk = NULL;
+ goto done;
+ }
+
+ /* Try to get PA sync socket, if it exists */
+ sk = iso_get_sock(hdev, &hdev->bdaddr, bdaddr, BT_CONNECT2,
+ iso_match_sync_handle, ev2);
+ if (!sk)
+ sk = iso_get_sock(hdev, &hdev->bdaddr, bdaddr,
+ BT_LISTEN,
+ iso_match_sync_handle,
+ ev2);
+
+ if (sk) {
+ int err;
+ struct hci_conn *hcon = iso_pi(sk)->conn->hcon;
+
+ iso_pi(sk)->qos.bcast.encryption = ev2->encryption;
+
+ if (ev2->num_bis < iso_pi(sk)->bc_num_bis)
+ iso_pi(sk)->bc_num_bis = ev2->num_bis;
+
+ if (!test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags) &&
+ !test_and_set_bit(BT_SK_BIG_SYNC, &iso_pi(sk)->flags)) {
+ err = hci_conn_big_create_sync(hdev, hcon,
+ &iso_pi(sk)->qos,
+ iso_pi(sk)->sync_handle,
+ iso_pi(sk)->bc_num_bis,
+ iso_pi(sk)->bc_bis);
+ if (err) {
+ bt_dev_err(hdev, "hci_le_big_create_sync: %d",
+ err);
+ sock_put(sk);
+ sk = NULL;
+ }
+ }
+ }
+
+ goto done;
+ }
+
+ ev3 = hci_recv_event_data(hdev, HCI_EV_LE_PER_ADV_REPORT);
+ if (ev3) {
+ size_t base_len = 0;
+ u8 *base;
+ struct hci_conn *hcon;
+
+ sk = iso_get_sock(hdev, &hdev->bdaddr, bdaddr, BT_LISTEN,
+ iso_match_sync_handle_pa_report, ev3);
+ if (!sk)
+ goto done;
+
+ hcon = iso_pi(sk)->conn->hcon;
+ if (!hcon)
+ goto done;
+
+ if (ev3->data_status == LE_PA_DATA_TRUNCATED) {
+ /* The controller was unable to retrieve PA data. */
+ memset(hcon->le_per_adv_data, 0,
+ HCI_MAX_PER_AD_TOT_LEN);
+ hcon->le_per_adv_data_len = 0;
+ hcon->le_per_adv_data_offset = 0;
+ goto done;
+ }
+
+ if (hcon->le_per_adv_data_offset + ev3->length >
+ HCI_MAX_PER_AD_TOT_LEN)
+ goto done;
+
+ memcpy(hcon->le_per_adv_data + hcon->le_per_adv_data_offset,
+ ev3->data, ev3->length);
+ hcon->le_per_adv_data_offset += ev3->length;
+
+ if (ev3->data_status == LE_PA_DATA_COMPLETE) {
+ /* All PA data has been received. */
+ hcon->le_per_adv_data_len =
+ hcon->le_per_adv_data_offset;
+ hcon->le_per_adv_data_offset = 0;
+
+ /* Extract BASE */
+ base = eir_get_service_data(hcon->le_per_adv_data,
+ hcon->le_per_adv_data_len,
+ EIR_BAA_SERVICE_UUID,
+ &base_len);
+
+ if (!base || base_len > BASE_MAX_LENGTH)
+ goto done;
+
+ memcpy(iso_pi(sk)->base, base, base_len);
+ iso_pi(sk)->base_len = base_len;
+ } else {
+ /* This is a PA data fragment. Keep pa_data_len set to 0
+ * until all data has been reassembled.
+ */
+ hcon->le_per_adv_data_len = 0;
+ }
+ } else {
+ sk = iso_get_sock(hdev, &hdev->bdaddr, BDADDR_ANY,
+ BT_LISTEN, iso_match_dst, BDADDR_ANY);
+ }
+
+done:
+ if (!sk)
+ return 0;
+
+ if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags))
+ *flags |= HCI_PROTO_DEFER;
+
+ sock_put(sk);
+
+ return HCI_LM_ACCEPT;
+}
+
+static void iso_connect_cfm(struct hci_conn *hcon, __u8 status)
+{
+ if (hcon->type != CIS_LINK && hcon->type != BIS_LINK &&
+ hcon->type != PA_LINK) {
+ if (hcon->type != LE_LINK)
+ return;
+
+ /* Check if LE link has failed */
+ if (status) {
+ struct hci_link *link, *t;
+
+ list_for_each_entry_safe(link, t, &hcon->link_list,
+ list)
+ iso_conn_del(link->conn, bt_to_errno(status));
+
+ return;
+ }
+
+ /* Create CIS if pending */
+ hci_le_create_cis_pending(hcon->hdev);
+ return;
+ }
+
+ BT_DBG("hcon %p bdaddr %pMR status %d", hcon, &hcon->dst, status);
+
+ /* Similar to the success case, if HCI_CONN_BIG_SYNC_FAILED or
+ * HCI_CONN_PA_SYNC_FAILED is set, queue the failed connection
+ * into the accept queue of the listening socket and wake up
+ * userspace, to inform the user about the event.
+ */
+ if (!status || test_bit(HCI_CONN_BIG_SYNC_FAILED, &hcon->flags) ||
+ test_bit(HCI_CONN_PA_SYNC_FAILED, &hcon->flags)) {
+ struct iso_conn *conn;
+
+ conn = iso_conn_add(hcon);
+ if (conn)
+ iso_conn_ready(conn);
+ } else {
+ iso_conn_del(hcon, bt_to_errno(status));
+ }
+}
+
+static void iso_disconn_cfm(struct hci_conn *hcon, __u8 reason)
+{
+ if (hcon->type != CIS_LINK && hcon->type != BIS_LINK &&
+ hcon->type != PA_LINK)
+ return;
+
+ BT_DBG("hcon %p reason %d", hcon, reason);
+
+ iso_conn_del(hcon, bt_to_errno(reason));
+}
+
+int iso_recv(struct hci_dev *hdev, u16 handle, struct sk_buff *skb, u16 flags)
+{
+ struct hci_conn *hcon;
+ struct iso_conn *conn;
+ struct skb_shared_hwtstamps *hwts;
+ __u16 pb, ts, len, sn;
+
+ hci_dev_lock(hdev);
+
+ hcon = hci_conn_hash_lookup_handle(hdev, handle);
+ if (!hcon) {
+ hci_dev_unlock(hdev);
+ kfree_skb(skb);
+ return -ENOENT;
+ }
+
+ conn = iso_conn_hold_unless_zero(hcon->iso_data);
+ hcon = NULL;
+
+ hci_dev_unlock(hdev);
+
+ if (!conn) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
+ pb = hci_iso_flags_pb(flags);
+ ts = hci_iso_flags_ts(flags);
+
+ BT_DBG("conn %p len %d pb 0x%x ts 0x%x", conn, skb->len, pb, ts);
+
+ switch (pb) {
+ case ISO_START:
+ case ISO_SINGLE:
+ if (conn->rx_len) {
+ BT_ERR("Unexpected start frame (len %d)", skb->len);
+ kfree_skb(conn->rx_skb);
+ conn->rx_skb = NULL;
+ conn->rx_len = 0;
+ }
+
+ if (ts) {
+ struct hci_iso_ts_data_hdr *hdr;
+
+ hdr = skb_pull_data(skb, HCI_ISO_TS_DATA_HDR_SIZE);
+ if (!hdr) {
+ BT_ERR("Frame is too short (len %d)", skb->len);
+ goto drop;
+ }
+
+ /* Record the timestamp to skb */
+ hwts = skb_hwtstamps(skb);
+ hwts->hwtstamp = us_to_ktime(le32_to_cpu(hdr->ts));
+
+ sn = __le16_to_cpu(hdr->sn);
+ len = __le16_to_cpu(hdr->slen);
+ } else {
+ struct hci_iso_data_hdr *hdr;
+
+ hdr = skb_pull_data(skb, HCI_ISO_DATA_HDR_SIZE);
+ if (!hdr) {
+ BT_ERR("Frame is too short (len %d)", skb->len);
+ goto drop;
+ }
+
+ sn = __le16_to_cpu(hdr->sn);
+ len = __le16_to_cpu(hdr->slen);
+ }
+
+ flags = hci_iso_data_flags(len);
+ len = hci_iso_data_len(len);
+
+ BT_DBG("Start: total len %d, frag len %d flags 0x%4.4x sn %d",
+ len, skb->len, flags, sn);
+
+ if (len == skb->len) {
+ /* Complete frame received */
+ hci_skb_pkt_status(skb) = flags & 0x03;
+ hci_skb_pkt_seqnum(skb) = sn;
+ iso_recv_frame(conn, skb);
+ goto done;
+ }
+
+ if (pb == ISO_SINGLE) {
+ BT_ERR("Frame malformed (len %d, expected len %d)",
+ skb->len, len);
+ goto drop;
+ }
+
+ if (skb->len > len) {
+ BT_ERR("Frame is too long (len %d, expected len %d)",
+ skb->len, len);
+ goto drop;
+ }
+
+ /* Allocate skb for the complete frame (with header) */
+ conn->rx_skb = bt_skb_alloc(len, GFP_KERNEL);
+ if (!conn->rx_skb)
+ goto drop;
+
+ hci_skb_pkt_status(conn->rx_skb) = flags & 0x03;
+ hci_skb_pkt_seqnum(conn->rx_skb) = sn;
+ skb_copy_from_linear_data(skb, skb_put(conn->rx_skb, skb->len),
+ skb->len);
+ conn->rx_len = len - skb->len;
+
+ /* Copy hw timestamp from skb to rx_skb if present */
+ if (ts) {
+ hwts = skb_hwtstamps(conn->rx_skb);
+ hwts->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
+ }
+
+ break;
+
+ case ISO_CONT:
+ BT_DBG("Cont: frag len %d (expecting %d)", skb->len,
+ conn->rx_len);
+
+ if (!conn->rx_len) {
+ BT_ERR("Unexpected continuation frame (len %d)",
+ skb->len);
+ goto drop;
+ }
+
+ if (skb->len > conn->rx_len) {
+ BT_ERR("Fragment is too long (len %d, expected %d)",
+ skb->len, conn->rx_len);
+ kfree_skb(conn->rx_skb);
+ conn->rx_skb = NULL;
+ conn->rx_len = 0;
+ goto drop;
+ }
+
+ skb_copy_from_linear_data(skb, skb_put(conn->rx_skb, skb->len),
+ skb->len);
+ conn->rx_len -= skb->len;
+ break;
+
+ case ISO_END:
+ skb_copy_from_linear_data(skb, skb_put(conn->rx_skb, skb->len),
+ skb->len);
+ conn->rx_len -= skb->len;
+
+ if (!conn->rx_len) {
+ struct sk_buff *rx_skb = conn->rx_skb;
+
+ /* Complete frame received. iso_recv_frame
+ * takes ownership of the skb so set the global
+ * rx_skb pointer to NULL first.
+ */
+ conn->rx_skb = NULL;
+ iso_recv_frame(conn, rx_skb);
+ }
+ break;
+ }
+
+drop:
+ kfree_skb(skb);
+done:
+ iso_conn_put(conn);
+ return 0;
+}
+
+static struct hci_cb iso_cb = {
+ .name = "ISO",
+ .connect_cfm = iso_connect_cfm,
+ .disconn_cfm = iso_disconn_cfm,
+};
+
+static int iso_debugfs_show(struct seq_file *f, void *p)
+{
+ struct sock *sk;
+
+ read_lock(&iso_sk_list.lock);
+
+ sk_for_each(sk, &iso_sk_list.head) {
+ seq_printf(f, "%pMR %pMR %d\n", &iso_pi(sk)->src,
+ &iso_pi(sk)->dst, sk->sk_state);
+ }
+
+ read_unlock(&iso_sk_list.lock);
+
+ return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(iso_debugfs);
+
+static struct dentry *iso_debugfs;
+
+static const struct proto_ops iso_sock_ops = {
+ .family = PF_BLUETOOTH,
+ .owner = THIS_MODULE,
+ .release = iso_sock_release,
+ .bind = iso_sock_bind,
+ .connect = iso_sock_connect,
+ .listen = iso_sock_listen,
+ .accept = iso_sock_accept,
+ .getname = iso_sock_getname,
+ .sendmsg = iso_sock_sendmsg,
+ .recvmsg = iso_sock_recvmsg,
+ .poll = bt_sock_poll,
+ .ioctl = bt_sock_ioctl,
+ .mmap = sock_no_mmap,
+ .socketpair = sock_no_socketpair,
+ .shutdown = iso_sock_shutdown,
+ .setsockopt = iso_sock_setsockopt,
+ .getsockopt = iso_sock_getsockopt
+};
+
+static const struct net_proto_family iso_sock_family_ops = {
+ .family = PF_BLUETOOTH,
+ .owner = THIS_MODULE,
+ .create = iso_sock_create,
+};
+
+static bool inited;
+
+bool iso_inited(void)
+{
+ return inited;
+}
+
+int iso_init(void)
+{
+ int err;
+
+ BUILD_BUG_ON(sizeof(struct sockaddr_iso) > sizeof(struct sockaddr));
+
+ if (inited)
+ return -EALREADY;
+
+ err = proto_register(&iso_proto, 0);
+ if (err < 0)
+ return err;
+
+ err = bt_sock_register(BTPROTO_ISO, &iso_sock_family_ops);
+ if (err < 0) {
+ BT_ERR("ISO socket registration failed");
+ goto error;
+ }
+
+ err = bt_procfs_init(&init_net, "iso", &iso_sk_list, NULL);
+ if (err < 0) {
+ BT_ERR("Failed to create ISO proc file");
+ bt_sock_unregister(BTPROTO_ISO);
+ goto error;
+ }
+
+ BT_INFO("ISO socket layer initialized");
+
+ hci_register_cb(&iso_cb);
+
+ if (!IS_ERR_OR_NULL(bt_debugfs))
+ iso_debugfs = debugfs_create_file("iso", 0444, bt_debugfs,
+ NULL, &iso_debugfs_fops);
+
+ inited = true;
+
+ return 0;
+
+error:
+ proto_unregister(&iso_proto);
+ return err;
+}
+
+int iso_exit(void)
+{
+ if (!inited)
+ return -EALREADY;
+
+ bt_procfs_cleanup(&init_net, "iso");
+
+ debugfs_remove(iso_debugfs);
+ iso_debugfs = NULL;
+
+ hci_unregister_cb(&iso_cb);
+
+ bt_sock_unregister(BTPROTO_ISO);
+
+ proto_unregister(&iso_proto);
+
+ inited = false;
+
+ return 0;
+}
diff --git a/net/bluetooth/l2cap.c b/net/bluetooth/l2cap.c
deleted file mode 100644
index 29a8fa4d3728..000000000000
--- a/net/bluetooth/l2cap.c
+++ /dev/null
@@ -1,2264 +0,0 @@
-/*
- BlueZ - Bluetooth protocol stack for Linux
- Copyright (C) 2000-2001 Qualcomm Incorporated
-
- Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com>
-
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License version 2 as
- published by the Free Software Foundation;
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
- IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
- CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
- WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
- OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
-
- ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
- COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
- SOFTWARE IS DISCLAIMED.
-*/
-
-/* Bluetooth L2CAP core and sockets. */
-
-#include <linux/module.h>
-
-#include <linux/types.h>
-#include <linux/capability.h>
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/poll.h>
-#include <linux/fcntl.h>
-#include <linux/init.h>
-#include <linux/interrupt.h>
-#include <linux/socket.h>
-#include <linux/skbuff.h>
-#include <linux/list.h>
-#include <linux/device.h>
-#include <net/sock.h>
-
-#include <asm/system.h>
-#include <asm/uaccess.h>
-#include <asm/unaligned.h>
-
-#include <net/bluetooth/bluetooth.h>
-#include <net/bluetooth/hci_core.h>
-#include <net/bluetooth/l2cap.h>
-
-#ifndef CONFIG_BT_L2CAP_DEBUG
-#undef BT_DBG
-#define BT_DBG(D...)
-#endif
-
-#define VERSION "2.8"
-
-static const struct proto_ops l2cap_sock_ops;
-
-static struct bt_sock_list l2cap_sk_list = {
- .lock = RW_LOCK_UNLOCKED
-};
-
-static void __l2cap_sock_close(struct sock *sk, int reason);
-static void l2cap_sock_close(struct sock *sk);
-static void l2cap_sock_kill(struct sock *sk);
-
-static struct sk_buff *l2cap_build_cmd(struct l2cap_conn *conn,
- u8 code, u8 ident, u16 dlen, void *data);
-
-/* ---- L2CAP timers ---- */
-static void l2cap_sock_timeout(unsigned long arg)
-{
- struct sock *sk = (struct sock *) arg;
-
- BT_DBG("sock %p state %d", sk, sk->sk_state);
-
- bh_lock_sock(sk);
- __l2cap_sock_close(sk, ETIMEDOUT);
- bh_unlock_sock(sk);
-
- l2cap_sock_kill(sk);
- sock_put(sk);
-}
-
-static void l2cap_sock_set_timer(struct sock *sk, long timeout)
-{
- BT_DBG("sk %p state %d timeout %ld", sk, sk->sk_state, timeout);
- sk_reset_timer(sk, &sk->sk_timer, jiffies + timeout);
-}
-
-static void l2cap_sock_clear_timer(struct sock *sk)
-{
- BT_DBG("sock %p state %d", sk, sk->sk_state);
- sk_stop_timer(sk, &sk->sk_timer);
-}
-
-static void l2cap_sock_init_timer(struct sock *sk)
-{
- init_timer(&sk->sk_timer);
- sk->sk_timer.function = l2cap_sock_timeout;
- sk->sk_timer.data = (unsigned long)sk;
-}
-
-/* ---- L2CAP channels ---- */
-static struct sock *__l2cap_get_chan_by_dcid(struct l2cap_chan_list *l, u16 cid)
-{
- struct sock *s;
- for (s = l->head; s; s = l2cap_pi(s)->next_c) {
- if (l2cap_pi(s)->dcid == cid)
- break;
- }
- return s;
-}
-
-static struct sock *__l2cap_get_chan_by_scid(struct l2cap_chan_list *l, u16 cid)
-{
- struct sock *s;
- for (s = l->head; s; s = l2cap_pi(s)->next_c) {
- if (l2cap_pi(s)->scid == cid)
- break;
- }
- return s;
-}
-
-/* Find channel with given SCID.
- * Returns locked socket */
-static inline struct sock *l2cap_get_chan_by_scid(struct l2cap_chan_list *l, u16 cid)
-{
- struct sock *s;
- read_lock(&l->lock);
- s = __l2cap_get_chan_by_scid(l, cid);
- if (s) bh_lock_sock(s);
- read_unlock(&l->lock);
- return s;
-}
-
-static struct sock *__l2cap_get_chan_by_ident(struct l2cap_chan_list *l, u8 ident)
-{
- struct sock *s;
- for (s = l->head; s; s = l2cap_pi(s)->next_c) {
- if (l2cap_pi(s)->ident == ident)
- break;
- }
- return s;
-}
-
-static inline struct sock *l2cap_get_chan_by_ident(struct l2cap_chan_list *l, u8 ident)
-{
- struct sock *s;
- read_lock(&l->lock);
- s = __l2cap_get_chan_by_ident(l, ident);
- if (s) bh_lock_sock(s);
- read_unlock(&l->lock);
- return s;
-}
-
-static u16 l2cap_alloc_cid(struct l2cap_chan_list *l)
-{
- u16 cid = 0x0040;
-
- for (; cid < 0xffff; cid++) {
- if(!__l2cap_get_chan_by_scid(l, cid))
- return cid;
- }
-
- return 0;
-}
-
-static inline void __l2cap_chan_link(struct l2cap_chan_list *l, struct sock *sk)
-{
- sock_hold(sk);
-
- if (l->head)
- l2cap_pi(l->head)->prev_c = sk;
-
- l2cap_pi(sk)->next_c = l->head;
- l2cap_pi(sk)->prev_c = NULL;
- l->head = sk;
-}
-
-static inline void l2cap_chan_unlink(struct l2cap_chan_list *l, struct sock *sk)
-{
- struct sock *next = l2cap_pi(sk)->next_c, *prev = l2cap_pi(sk)->prev_c;
-
- write_lock_bh(&l->lock);
- if (sk == l->head)
- l->head = next;
-
- if (next)
- l2cap_pi(next)->prev_c = prev;
- if (prev)
- l2cap_pi(prev)->next_c = next;
- write_unlock_bh(&l->lock);
-
- __sock_put(sk);
-}
-
-static void __l2cap_chan_add(struct l2cap_conn *conn, struct sock *sk, struct sock *parent)
-{
- struct l2cap_chan_list *l = &conn->chan_list;
-
- BT_DBG("conn %p, psm 0x%2.2x, dcid 0x%4.4x", conn, l2cap_pi(sk)->psm, l2cap_pi(sk)->dcid);
-
- l2cap_pi(sk)->conn = conn;
-
- if (sk->sk_type == SOCK_SEQPACKET) {
- /* Alloc CID for connection-oriented socket */
- l2cap_pi(sk)->scid = l2cap_alloc_cid(l);
- } else if (sk->sk_type == SOCK_DGRAM) {
- /* Connectionless socket */
- l2cap_pi(sk)->scid = 0x0002;
- l2cap_pi(sk)->dcid = 0x0002;
- l2cap_pi(sk)->omtu = L2CAP_DEFAULT_MTU;
- } else {
- /* Raw socket can send/recv signalling messages only */
- l2cap_pi(sk)->scid = 0x0001;
- l2cap_pi(sk)->dcid = 0x0001;
- l2cap_pi(sk)->omtu = L2CAP_DEFAULT_MTU;
- }
-
- __l2cap_chan_link(l, sk);
-
- if (parent)
- bt_accept_enqueue(parent, sk);
-}
-
-/* Delete channel.
- * Must be called on the locked socket. */
-static void l2cap_chan_del(struct sock *sk, int err)
-{
- struct l2cap_conn *conn = l2cap_pi(sk)->conn;
- struct sock *parent = bt_sk(sk)->parent;
-
- l2cap_sock_clear_timer(sk);
-
- BT_DBG("sk %p, conn %p, err %d", sk, conn, err);
-
- if (conn) {
- /* Unlink from channel list */
- l2cap_chan_unlink(&conn->chan_list, sk);
- l2cap_pi(sk)->conn = NULL;
- hci_conn_put(conn->hcon);
- }
-
- sk->sk_state = BT_CLOSED;
- sock_set_flag(sk, SOCK_ZAPPED);
-
- if (err)
- sk->sk_err = err;
-
- if (parent) {
- bt_accept_unlink(sk);
- parent->sk_data_ready(parent, 0);
- } else
- sk->sk_state_change(sk);
-}
-
-/* ---- L2CAP connections ---- */
-static struct l2cap_conn *l2cap_conn_add(struct hci_conn *hcon, u8 status)
-{
- struct l2cap_conn *conn = hcon->l2cap_data;
-
- if (conn || status)
- return conn;
-
- conn = kzalloc(sizeof(struct l2cap_conn), GFP_ATOMIC);
- if (!conn)
- return NULL;
-
- hcon->l2cap_data = conn;
- conn->hcon = hcon;
-
- BT_DBG("hcon %p conn %p", hcon, conn);
-
- conn->mtu = hcon->hdev->acl_mtu;
- conn->src = &hcon->hdev->bdaddr;
- conn->dst = &hcon->dst;
-
- spin_lock_init(&conn->lock);
- rwlock_init(&conn->chan_list.lock);
-
- return conn;
-}
-
-static void l2cap_conn_del(struct hci_conn *hcon, int err)
-{
- struct l2cap_conn *conn = hcon->l2cap_data;
- struct sock *sk;
-
- if (!conn)
- return;
-
- BT_DBG("hcon %p conn %p, err %d", hcon, conn, err);
-
- if (conn->rx_skb)
- kfree_skb(conn->rx_skb);
-
- /* Kill channels */
- while ((sk = conn->chan_list.head)) {
- bh_lock_sock(sk);
- l2cap_chan_del(sk, err);
- bh_unlock_sock(sk);
- l2cap_sock_kill(sk);
- }
-
- hcon->l2cap_data = NULL;
- kfree(conn);
-}
-
-static inline void l2cap_chan_add(struct l2cap_conn *conn, struct sock *sk, struct sock *parent)
-{
- struct l2cap_chan_list *l = &conn->chan_list;
- write_lock_bh(&l->lock);
- __l2cap_chan_add(conn, sk, parent);
- write_unlock_bh(&l->lock);
-}
-
-static inline u8 l2cap_get_ident(struct l2cap_conn *conn)
-{
- u8 id;
-
- /* Get next available identificator.
- * 1 - 128 are used by kernel.
- * 129 - 199 are reserved.
- * 200 - 254 are used by utilities like l2ping, etc.
- */
-
- spin_lock_bh(&conn->lock);
-
- if (++conn->tx_ident > 128)
- conn->tx_ident = 1;
-
- id = conn->tx_ident;
-
- spin_unlock_bh(&conn->lock);
-
- return id;
-}
-
-static inline int l2cap_send_cmd(struct l2cap_conn *conn, u8 ident, u8 code, u16 len, void *data)
-{
- struct sk_buff *skb = l2cap_build_cmd(conn, code, ident, len, data);
-
- BT_DBG("code 0x%2.2x", code);
-
- if (!skb)
- return -ENOMEM;
-
- return hci_send_acl(conn->hcon, skb, 0);
-}
-
-/* ---- Socket interface ---- */
-static struct sock *__l2cap_get_sock_by_addr(u16 psm, bdaddr_t *src)
-{
- struct sock *sk;
- struct hlist_node *node;
- sk_for_each(sk, node, &l2cap_sk_list.head)
- if (l2cap_pi(sk)->sport == psm && !bacmp(&bt_sk(sk)->src, src))
- goto found;
- sk = NULL;
-found:
- return sk;
-}
-
-/* Find socket with psm and source bdaddr.
- * Returns closest match.
- */
-static struct sock *__l2cap_get_sock_by_psm(int state, u16 psm, bdaddr_t *src)
-{
- struct sock *sk = NULL, *sk1 = NULL;
- struct hlist_node *node;
-
- sk_for_each(sk, node, &l2cap_sk_list.head) {
- if (state && sk->sk_state != state)
- continue;
-
- if (l2cap_pi(sk)->psm == psm) {
- /* Exact match. */
- if (!bacmp(&bt_sk(sk)->src, src))
- break;
-
- /* Closest match */
- if (!bacmp(&bt_sk(sk)->src, BDADDR_ANY))
- sk1 = sk;
- }
- }
- return node ? sk : sk1;
-}
-
-/* Find socket with given address (psm, src).
- * Returns locked socket */
-static inline struct sock *l2cap_get_sock_by_psm(int state, u16 psm, bdaddr_t *src)
-{
- struct sock *s;
- read_lock(&l2cap_sk_list.lock);
- s = __l2cap_get_sock_by_psm(state, psm, src);
- if (s) bh_lock_sock(s);
- read_unlock(&l2cap_sk_list.lock);
- return s;
-}
-
-static void l2cap_sock_destruct(struct sock *sk)
-{
- BT_DBG("sk %p", sk);
-
- skb_queue_purge(&sk->sk_receive_queue);
- skb_queue_purge(&sk->sk_write_queue);
-}
-
-static void l2cap_sock_cleanup_listen(struct sock *parent)
-{
- struct sock *sk;
-
- BT_DBG("parent %p", parent);
-
- /* Close not yet accepted channels */
- while ((sk = bt_accept_dequeue(parent, NULL)))
- l2cap_sock_close(sk);
-
- parent->sk_state = BT_CLOSED;
- sock_set_flag(parent, SOCK_ZAPPED);
-}
-
-/* Kill socket (only if zapped and orphan)
- * Must be called on unlocked socket.
- */
-static void l2cap_sock_kill(struct sock *sk)
-{
- if (!sock_flag(sk, SOCK_ZAPPED) || sk->sk_socket)
- return;
-
- BT_DBG("sk %p state %d", sk, sk->sk_state);
-
- /* Kill poor orphan */
- bt_sock_unlink(&l2cap_sk_list, sk);
- sock_set_flag(sk, SOCK_DEAD);
- sock_put(sk);
-}
-
-static void __l2cap_sock_close(struct sock *sk, int reason)
-{
- BT_DBG("sk %p state %d socket %p", sk, sk->sk_state, sk->sk_socket);
-
- switch (sk->sk_state) {
- case BT_LISTEN:
- l2cap_sock_cleanup_listen(sk);
- break;
-
- case BT_CONNECTED:
- case BT_CONFIG:
- case BT_CONNECT2:
- if (sk->sk_type == SOCK_SEQPACKET) {
- struct l2cap_conn *conn = l2cap_pi(sk)->conn;
- struct l2cap_disconn_req req;
-
- sk->sk_state = BT_DISCONN;
- l2cap_sock_set_timer(sk, sk->sk_sndtimeo);
-
- req.dcid = __cpu_to_le16(l2cap_pi(sk)->dcid);
- req.scid = __cpu_to_le16(l2cap_pi(sk)->scid);
- l2cap_send_cmd(conn, l2cap_get_ident(conn),
- L2CAP_DISCONN_REQ, sizeof(req), &req);
- } else {
- l2cap_chan_del(sk, reason);
- }
- break;
-
- case BT_CONNECT:
- case BT_DISCONN:
- l2cap_chan_del(sk, reason);
- break;
-
- default:
- sock_set_flag(sk, SOCK_ZAPPED);
- break;
- }
-}
-
-/* Must be called on unlocked socket. */
-static void l2cap_sock_close(struct sock *sk)
-{
- l2cap_sock_clear_timer(sk);
- lock_sock(sk);
- __l2cap_sock_close(sk, ECONNRESET);
- release_sock(sk);
- l2cap_sock_kill(sk);
-}
-
-static void l2cap_sock_init(struct sock *sk, struct sock *parent)
-{
- struct l2cap_pinfo *pi = l2cap_pi(sk);
-
- BT_DBG("sk %p", sk);
-
- if (parent) {
- sk->sk_type = parent->sk_type;
- pi->imtu = l2cap_pi(parent)->imtu;
- pi->omtu = l2cap_pi(parent)->omtu;
- pi->link_mode = l2cap_pi(parent)->link_mode;
- } else {
- pi->imtu = L2CAP_DEFAULT_MTU;
- pi->omtu = 0;
- pi->link_mode = 0;
- }
-
- /* Default config options */
- pi->conf_mtu = L2CAP_DEFAULT_MTU;
- pi->flush_to = L2CAP_DEFAULT_FLUSH_TO;
-}
-
-static struct proto l2cap_proto = {
- .name = "L2CAP",
- .owner = THIS_MODULE,
- .obj_size = sizeof(struct l2cap_pinfo)
-};
-
-static struct sock *l2cap_sock_alloc(struct socket *sock, int proto, gfp_t prio)
-{
- struct sock *sk;
-
- sk = sk_alloc(PF_BLUETOOTH, prio, &l2cap_proto, 1);
- if (!sk)
- return NULL;
-
- sock_init_data(sock, sk);
- INIT_LIST_HEAD(&bt_sk(sk)->accept_q);
-
- sk->sk_destruct = l2cap_sock_destruct;
- sk->sk_sndtimeo = L2CAP_CONN_TIMEOUT;
-
- sock_reset_flag(sk, SOCK_ZAPPED);
-
- sk->sk_protocol = proto;
- sk->sk_state = BT_OPEN;
-
- l2cap_sock_init_timer(sk);
-
- bt_sock_link(&l2cap_sk_list, sk);
- return sk;
-}
-
-static int l2cap_sock_create(struct socket *sock, int protocol)
-{
- struct sock *sk;
-
- BT_DBG("sock %p", sock);
-
- sock->state = SS_UNCONNECTED;
-
- if (sock->type != SOCK_SEQPACKET &&
- sock->type != SOCK_DGRAM && sock->type != SOCK_RAW)
- return -ESOCKTNOSUPPORT;
-
- if (sock->type == SOCK_RAW && !capable(CAP_NET_RAW))
- return -EPERM;
-
- sock->ops = &l2cap_sock_ops;
-
- sk = l2cap_sock_alloc(sock, protocol, GFP_ATOMIC);
- if (!sk)
- return -ENOMEM;
-
- l2cap_sock_init(sk, NULL);
- return 0;
-}
-
-static int l2cap_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
-{
- struct sockaddr_l2 *la = (struct sockaddr_l2 *) addr;
- struct sock *sk = sock->sk;
- int err = 0;
-
- BT_DBG("sk %p, %s %d", sk, batostr(&la->l2_bdaddr), la->l2_psm);
-
- if (!addr || addr->sa_family != AF_BLUETOOTH)
- return -EINVAL;
-
- lock_sock(sk);
-
- if (sk->sk_state != BT_OPEN) {
- err = -EBADFD;
- goto done;
- }
-
- write_lock_bh(&l2cap_sk_list.lock);
-
- if (la->l2_psm && __l2cap_get_sock_by_addr(la->l2_psm, &la->l2_bdaddr)) {
- err = -EADDRINUSE;
- } else {
- /* Save source address */
- bacpy(&bt_sk(sk)->src, &la->l2_bdaddr);
- l2cap_pi(sk)->psm = la->l2_psm;
- l2cap_pi(sk)->sport = la->l2_psm;
- sk->sk_state = BT_BOUND;
- }
-
- write_unlock_bh(&l2cap_sk_list.lock);
-
-done:
- release_sock(sk);
- return err;
-}
-
-static int l2cap_do_connect(struct sock *sk)
-{
- bdaddr_t *src = &bt_sk(sk)->src;
- bdaddr_t *dst = &bt_sk(sk)->dst;
- struct l2cap_conn *conn;
- struct hci_conn *hcon;
- struct hci_dev *hdev;
- int err = 0;
-
- BT_DBG("%s -> %s psm 0x%2.2x", batostr(src), batostr(dst), l2cap_pi(sk)->psm);
-
- if (!(hdev = hci_get_route(dst, src)))
- return -EHOSTUNREACH;
-
- hci_dev_lock_bh(hdev);
-
- err = -ENOMEM;
-
- hcon = hci_connect(hdev, ACL_LINK, dst);
- if (!hcon)
- goto done;
-
- conn = l2cap_conn_add(hcon, 0);
- if (!conn) {
- hci_conn_put(hcon);
- goto done;
- }
-
- err = 0;
-
- /* Update source addr of the socket */
- bacpy(src, conn->src);
-
- l2cap_chan_add(conn, sk, NULL);
-
- sk->sk_state = BT_CONNECT;
- l2cap_sock_set_timer(sk, sk->sk_sndtimeo);
-
- if (hcon->state == BT_CONNECTED) {
- if (sk->sk_type == SOCK_SEQPACKET) {
- struct l2cap_conn_req req;
- l2cap_pi(sk)->ident = l2cap_get_ident(conn);
- req.scid = __cpu_to_le16(l2cap_pi(sk)->scid);
- req.psm = l2cap_pi(sk)->psm;
- l2cap_send_cmd(conn, l2cap_pi(sk)->ident,
- L2CAP_CONN_REQ, sizeof(req), &req);
- } else {
- l2cap_sock_clear_timer(sk);
- sk->sk_state = BT_CONNECTED;
- }
- }
-
-done:
- hci_dev_unlock_bh(hdev);
- hci_dev_put(hdev);
- return err;
-}
-
-static int l2cap_sock_connect(struct socket *sock, struct sockaddr *addr, int alen, int flags)
-{
- struct sockaddr_l2 *la = (struct sockaddr_l2 *) addr;
- struct sock *sk = sock->sk;
- int err = 0;
-
- lock_sock(sk);
-
- BT_DBG("sk %p", sk);
-
- if (addr->sa_family != AF_BLUETOOTH || alen < sizeof(struct sockaddr_l2)) {
- err = -EINVAL;
- goto done;
- }
-
- if (sk->sk_type == SOCK_SEQPACKET && !la->l2_psm) {
- err = -EINVAL;
- goto done;
- }
-
- switch(sk->sk_state) {
- case BT_CONNECT:
- case BT_CONNECT2:
- case BT_CONFIG:
- /* Already connecting */
- goto wait;
-
- case BT_CONNECTED:
- /* Already connected */
- goto done;
-
- case BT_OPEN:
- case BT_BOUND:
- /* Can connect */
- break;
-
- default:
- err = -EBADFD;
- goto done;
- }
-
- /* Set destination address and psm */
- bacpy(&bt_sk(sk)->dst, &la->l2_bdaddr);
- l2cap_pi(sk)->psm = la->l2_psm;
-
- if ((err = l2cap_do_connect(sk)))
- goto done;
-
-wait:
- err = bt_sock_wait_state(sk, BT_CONNECTED,
- sock_sndtimeo(sk, flags & O_NONBLOCK));
-done:
- release_sock(sk);
- return err;
-}
-
-static int l2cap_sock_listen(struct socket *sock, int backlog)
-{
- struct sock *sk = sock->sk;
- int err = 0;
-
- BT_DBG("sk %p backlog %d", sk, backlog);
-
- lock_sock(sk);
-
- if (sk->sk_state != BT_BOUND || sock->type != SOCK_SEQPACKET) {
- err = -EBADFD;
- goto done;
- }
-
- if (!l2cap_pi(sk)->psm) {
- bdaddr_t *src = &bt_sk(sk)->src;
- u16 psm;
-
- err = -EINVAL;
-
- write_lock_bh(&l2cap_sk_list.lock);
-
- for (psm = 0x1001; psm < 0x1100; psm += 2)
- if (!__l2cap_get_sock_by_addr(psm, src)) {
- l2cap_pi(sk)->psm = htobs(psm);
- l2cap_pi(sk)->sport = htobs(psm);
- err = 0;
- break;
- }
-
- write_unlock_bh(&l2cap_sk_list.lock);
-
- if (err < 0)
- goto done;
- }
-
- sk->sk_max_ack_backlog = backlog;
- sk->sk_ack_backlog = 0;
- sk->sk_state = BT_LISTEN;
-
-done:
- release_sock(sk);
- return err;
-}
-
-static int l2cap_sock_accept(struct socket *sock, struct socket *newsock, int flags)
-{
- DECLARE_WAITQUEUE(wait, current);
- struct sock *sk = sock->sk, *nsk;
- long timeo;
- int err = 0;
-
- lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
-
- if (sk->sk_state != BT_LISTEN) {
- err = -EBADFD;
- goto done;
- }
-
- timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
-
- BT_DBG("sk %p timeo %ld", sk, timeo);
-
- /* Wait for an incoming connection. (wake-one). */
- add_wait_queue_exclusive(sk->sk_sleep, &wait);
- while (!(nsk = bt_accept_dequeue(sk, newsock))) {
- set_current_state(TASK_INTERRUPTIBLE);
- if (!timeo) {
- err = -EAGAIN;
- break;
- }
-
- release_sock(sk);
- timeo = schedule_timeout(timeo);
- lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
-
- if (sk->sk_state != BT_LISTEN) {
- err = -EBADFD;
- break;
- }
-
- if (signal_pending(current)) {
- err = sock_intr_errno(timeo);
- break;
- }
- }
- set_current_state(TASK_RUNNING);
- remove_wait_queue(sk->sk_sleep, &wait);
-
- if (err)
- goto done;
-
- newsock->state = SS_CONNECTED;
-
- BT_DBG("new socket %p", nsk);
-
-done:
- release_sock(sk);
- return err;
-}
-
-static int l2cap_sock_getname(struct socket *sock, struct sockaddr *addr, int *len, int peer)
-{
- struct sockaddr_l2 *la = (struct sockaddr_l2 *) addr;
- struct sock *sk = sock->sk;
-
- BT_DBG("sock %p, sk %p", sock, sk);
-
- addr->sa_family = AF_BLUETOOTH;
- *len = sizeof(struct sockaddr_l2);
-
- if (peer)
- bacpy(&la->l2_bdaddr, &bt_sk(sk)->dst);
- else
- bacpy(&la->l2_bdaddr, &bt_sk(sk)->src);
-
- la->l2_psm = l2cap_pi(sk)->psm;
- return 0;
-}
-
-static inline int l2cap_do_send(struct sock *sk, struct msghdr *msg, int len)
-{
- struct l2cap_conn *conn = l2cap_pi(sk)->conn;
- struct sk_buff *skb, **frag;
- int err, hlen, count, sent=0;
- struct l2cap_hdr *lh;
-
- BT_DBG("sk %p len %d", sk, len);
-
- /* First fragment (with L2CAP header) */
- if (sk->sk_type == SOCK_DGRAM)
- hlen = L2CAP_HDR_SIZE + 2;
- else
- hlen = L2CAP_HDR_SIZE;
-
- count = min_t(unsigned int, (conn->mtu - hlen), len);
-
- skb = bt_skb_send_alloc(sk, hlen + count,
- msg->msg_flags & MSG_DONTWAIT, &err);
- if (!skb)
- return err;
-
- /* Create L2CAP header */
- lh = (struct l2cap_hdr *) skb_put(skb, L2CAP_HDR_SIZE);
- lh->cid = __cpu_to_le16(l2cap_pi(sk)->dcid);
- lh->len = __cpu_to_le16(len + (hlen - L2CAP_HDR_SIZE));
-
- if (sk->sk_type == SOCK_DGRAM)
- put_unaligned(l2cap_pi(sk)->psm, (u16 *) skb_put(skb, 2));
-
- if (memcpy_fromiovec(skb_put(skb, count), msg->msg_iov, count)) {
- err = -EFAULT;
- goto fail;
- }
-
- sent += count;
- len -= count;
-
- /* Continuation fragments (no L2CAP header) */
- frag = &skb_shinfo(skb)->frag_list;
- while (len) {
- count = min_t(unsigned int, conn->mtu, len);
-
- *frag = bt_skb_send_alloc(sk, count, msg->msg_flags & MSG_DONTWAIT, &err);
- if (!*frag)
- goto fail;
-
- if (memcpy_fromiovec(skb_put(*frag, count), msg->msg_iov, count)) {
- err = -EFAULT;
- goto fail;
- }
-
- sent += count;
- len -= count;
-
- frag = &(*frag)->next;
- }
-
- if ((err = hci_send_acl(conn->hcon, skb, 0)) < 0)
- goto fail;
-
- return sent;
-
-fail:
- kfree_skb(skb);
- return err;
-}
-
-static int l2cap_sock_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, size_t len)
-{
- struct sock *sk = sock->sk;
- int err = 0;
-
- BT_DBG("sock %p, sk %p", sock, sk);
-
- err = sock_error(sk);
- if (err)
- return err;
-
- if (msg->msg_flags & MSG_OOB)
- return -EOPNOTSUPP;
-
- /* Check outgoing MTU */
- if (sk->sk_type != SOCK_RAW && len > l2cap_pi(sk)->omtu)
- return -EINVAL;
-
- lock_sock(sk);
-
- if (sk->sk_state == BT_CONNECTED)
- err = l2cap_do_send(sk, msg, len);
- else
- err = -ENOTCONN;
-
- release_sock(sk);
- return err;
-}
-
-static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen)
-{
- struct sock *sk = sock->sk;
- struct l2cap_options opts;
- int err = 0, len;
- u32 opt;
-
- BT_DBG("sk %p", sk);
-
- lock_sock(sk);
-
- switch (optname) {
- case L2CAP_OPTIONS:
- len = min_t(unsigned int, sizeof(opts), optlen);
- if (copy_from_user((char *) &opts, optval, len)) {
- err = -EFAULT;
- break;
- }
- l2cap_pi(sk)->imtu = opts.imtu;
- l2cap_pi(sk)->omtu = opts.omtu;
- break;
-
- case L2CAP_LM:
- if (get_user(opt, (u32 __user *) optval)) {
- err = -EFAULT;
- break;
- }
-
- l2cap_pi(sk)->link_mode = opt;
- break;
-
- default:
- err = -ENOPROTOOPT;
- break;
- }
-
- release_sock(sk);
- return err;
-}
-
-static int l2cap_sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen)
-{
- struct sock *sk = sock->sk;
- struct l2cap_options opts;
- struct l2cap_conninfo cinfo;
- int len, err = 0;
-
- BT_DBG("sk %p", sk);
-
- if (get_user(len, optlen))
- return -EFAULT;
-
- lock_sock(sk);
-
- switch (optname) {
- case L2CAP_OPTIONS:
- opts.imtu = l2cap_pi(sk)->imtu;
- opts.omtu = l2cap_pi(sk)->omtu;
- opts.flush_to = l2cap_pi(sk)->flush_to;
- opts.mode = 0x00;
-
- len = min_t(unsigned int, len, sizeof(opts));
- if (copy_to_user(optval, (char *) &opts, len))
- err = -EFAULT;
-
- break;
-
- case L2CAP_LM:
- if (put_user(l2cap_pi(sk)->link_mode, (u32 __user *) optval))
- err = -EFAULT;
- break;
-
- case L2CAP_CONNINFO:
- if (sk->sk_state != BT_CONNECTED) {
- err = -ENOTCONN;
- break;
- }
-
- cinfo.hci_handle = l2cap_pi(sk)->conn->hcon->handle;
- memcpy(cinfo.dev_class, l2cap_pi(sk)->conn->hcon->dev_class, 3);
-
- len = min_t(unsigned int, len, sizeof(cinfo));
- if (copy_to_user(optval, (char *) &cinfo, len))
- err = -EFAULT;
-
- break;
-
- default:
- err = -ENOPROTOOPT;
- break;
- }
-
- release_sock(sk);
- return err;
-}
-
-static int l2cap_sock_shutdown(struct socket *sock, int how)
-{
- struct sock *sk = sock->sk;
- int err = 0;
-
- BT_DBG("sock %p, sk %p", sock, sk);
-
- if (!sk)
- return 0;
-
- lock_sock(sk);
- if (!sk->sk_shutdown) {
- sk->sk_shutdown = SHUTDOWN_MASK;
- l2cap_sock_clear_timer(sk);
- __l2cap_sock_close(sk, 0);
-
- if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime)
- err = bt_sock_wait_state(sk, BT_CLOSED, sk->sk_lingertime);
- }
- release_sock(sk);
- return err;
-}
-
-static int l2cap_sock_release(struct socket *sock)
-{
- struct sock *sk = sock->sk;
- int err;
-
- BT_DBG("sock %p, sk %p", sock, sk);
-
- if (!sk)
- return 0;
-
- err = l2cap_sock_shutdown(sock, 2);
-
- sock_orphan(sk);
- l2cap_sock_kill(sk);
- return err;
-}
-
-static void l2cap_conn_ready(struct l2cap_conn *conn)
-{
- struct l2cap_chan_list *l = &conn->chan_list;
- struct sock *sk;
-
- BT_DBG("conn %p", conn);
-
- read_lock(&l->lock);
-
- for (sk = l->head; sk; sk = l2cap_pi(sk)->next_c) {
- bh_lock_sock(sk);
-
- if (sk->sk_type != SOCK_SEQPACKET) {
- l2cap_sock_clear_timer(sk);
- sk->sk_state = BT_CONNECTED;
- sk->sk_state_change(sk);
- } else if (sk->sk_state == BT_CONNECT) {
- struct l2cap_conn_req req;
- l2cap_pi(sk)->ident = l2cap_get_ident(conn);
- req.scid = __cpu_to_le16(l2cap_pi(sk)->scid);
- req.psm = l2cap_pi(sk)->psm;
- l2cap_send_cmd(conn, l2cap_pi(sk)->ident, L2CAP_CONN_REQ, sizeof(req), &req);
- }
-
- bh_unlock_sock(sk);
- }
-
- read_unlock(&l->lock);
-}
-
-/* Notify sockets that we cannot guaranty reliability anymore */
-static void l2cap_conn_unreliable(struct l2cap_conn *conn, int err)
-{
- struct l2cap_chan_list *l = &conn->chan_list;
- struct sock *sk;
-
- BT_DBG("conn %p", conn);
-
- read_lock(&l->lock);
- for (sk = l->head; sk; sk = l2cap_pi(sk)->next_c) {
- if (l2cap_pi(sk)->link_mode & L2CAP_LM_RELIABLE)
- sk->sk_err = err;
- }
- read_unlock(&l->lock);
-}
-
-static void l2cap_chan_ready(struct sock *sk)
-{
- struct sock *parent = bt_sk(sk)->parent;
-
- BT_DBG("sk %p, parent %p", sk, parent);
-
- l2cap_pi(sk)->conf_state = 0;
- l2cap_sock_clear_timer(sk);
-
- if (!parent) {
- /* Outgoing channel.
- * Wake up socket sleeping on connect.
- */
- sk->sk_state = BT_CONNECTED;
- sk->sk_state_change(sk);
- } else {
- /* Incoming channel.
- * Wake up socket sleeping on accept.
- */
- parent->sk_data_ready(parent, 0);
- }
-}
-
-/* Copy frame to all raw sockets on that connection */
-static void l2cap_raw_recv(struct l2cap_conn *conn, struct sk_buff *skb)
-{
- struct l2cap_chan_list *l = &conn->chan_list;
- struct sk_buff *nskb;
- struct sock * sk;
-
- BT_DBG("conn %p", conn);
-
- read_lock(&l->lock);
- for (sk = l->head; sk; sk = l2cap_pi(sk)->next_c) {
- if (sk->sk_type != SOCK_RAW)
- continue;
-
- /* Don't send frame to the socket it came from */
- if (skb->sk == sk)
- continue;
-
- if (!(nskb = skb_clone(skb, GFP_ATOMIC)))
- continue;
-
- if (sock_queue_rcv_skb(sk, nskb))
- kfree_skb(nskb);
- }
- read_unlock(&l->lock);
-}
-
-/* ---- L2CAP signalling commands ---- */
-static struct sk_buff *l2cap_build_cmd(struct l2cap_conn *conn,
- u8 code, u8 ident, u16 dlen, void *data)
-{
- struct sk_buff *skb, **frag;
- struct l2cap_cmd_hdr *cmd;
- struct l2cap_hdr *lh;
- int len, count;
-
- BT_DBG("conn %p, code 0x%2.2x, ident 0x%2.2x, len %d", conn, code, ident, dlen);
-
- len = L2CAP_HDR_SIZE + L2CAP_CMD_HDR_SIZE + dlen;
- count = min_t(unsigned int, conn->mtu, len);
-
- skb = bt_skb_alloc(count, GFP_ATOMIC);
- if (!skb)
- return NULL;
-
- lh = (struct l2cap_hdr *) skb_put(skb, L2CAP_HDR_SIZE);
- lh->len = __cpu_to_le16(L2CAP_CMD_HDR_SIZE + dlen);
- lh->cid = __cpu_to_le16(0x0001);
-
- cmd = (struct l2cap_cmd_hdr *) skb_put(skb, L2CAP_CMD_HDR_SIZE);
- cmd->code = code;
- cmd->ident = ident;
- cmd->len = __cpu_to_le16(dlen);
-
- if (dlen) {
- count -= L2CAP_HDR_SIZE + L2CAP_CMD_HDR_SIZE;
- memcpy(skb_put(skb, count), data, count);
- data += count;
- }
-
- len -= skb->len;
-
- /* Continuation fragments (no L2CAP header) */
- frag = &skb_shinfo(skb)->frag_list;
- while (len) {
- count = min_t(unsigned int, conn->mtu, len);
-
- *frag = bt_skb_alloc(count, GFP_ATOMIC);
- if (!*frag)
- goto fail;
-
- memcpy(skb_put(*frag, count), data, count);
-
- len -= count;
- data += count;
-
- frag = &(*frag)->next;
- }
-
- return skb;
-
-fail:
- kfree_skb(skb);
- return NULL;
-}
-
-static inline int l2cap_get_conf_opt(void **ptr, int *type, int *olen, unsigned long *val)
-{
- struct l2cap_conf_opt *opt = *ptr;
- int len;
-
- len = L2CAP_CONF_OPT_SIZE + opt->len;
- *ptr += len;
-
- *type = opt->type;
- *olen = opt->len;
-
- switch (opt->len) {
- case 1:
- *val = *((u8 *) opt->val);
- break;
-
- case 2:
- *val = __le16_to_cpu(*((u16 *)opt->val));
- break;
-
- case 4:
- *val = __le32_to_cpu(*((u32 *)opt->val));
- break;
-
- default:
- *val = (unsigned long) opt->val;
- break;
- }
-
- BT_DBG("type 0x%2.2x len %d val 0x%lx", *type, opt->len, *val);
- return len;
-}
-
-static inline void l2cap_parse_conf_req(struct sock *sk, void *data, int len)
-{
- int type, hint, olen;
- unsigned long val;
- void *ptr = data;
-
- BT_DBG("sk %p len %d", sk, len);
-
- while (len >= L2CAP_CONF_OPT_SIZE) {
- len -= l2cap_get_conf_opt(&ptr, &type, &olen, &val);
-
- hint = type & 0x80;
- type &= 0x7f;
-
- switch (type) {
- case L2CAP_CONF_MTU:
- l2cap_pi(sk)->conf_mtu = val;
- break;
-
- case L2CAP_CONF_FLUSH_TO:
- l2cap_pi(sk)->flush_to = val;
- break;
-
- case L2CAP_CONF_QOS:
- break;
-
- default:
- if (hint)
- break;
-
- /* FIXME: Reject unknown option */
- break;
- }
- }
-}
-
-static void l2cap_add_conf_opt(void **ptr, u8 type, u8 len, unsigned long val)
-{
- struct l2cap_conf_opt *opt = *ptr;
-
- BT_DBG("type 0x%2.2x len %d val 0x%lx", type, len, val);
-
- opt->type = type;
- opt->len = len;
-
- switch (len) {
- case 1:
- *((u8 *) opt->val) = val;
- break;
-
- case 2:
- *((u16 *) opt->val) = __cpu_to_le16(val);
- break;
-
- case 4:
- *((u32 *) opt->val) = __cpu_to_le32(val);
- break;
-
- default:
- memcpy(opt->val, (void *) val, len);
- break;
- }
-
- *ptr += L2CAP_CONF_OPT_SIZE + len;
-}
-
-static int l2cap_build_conf_req(struct sock *sk, void *data)
-{
- struct l2cap_pinfo *pi = l2cap_pi(sk);
- struct l2cap_conf_req *req = data;
- void *ptr = req->data;
-
- BT_DBG("sk %p", sk);
-
- if (pi->imtu != L2CAP_DEFAULT_MTU)
- l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, pi->imtu);
-
- /* FIXME: Need actual value of the flush timeout */
- //if (flush_to != L2CAP_DEFAULT_FLUSH_TO)
- // l2cap_add_conf_opt(&ptr, L2CAP_CONF_FLUSH_TO, 2, pi->flush_to);
-
- req->dcid = __cpu_to_le16(pi->dcid);
- req->flags = __cpu_to_le16(0);
-
- return ptr - data;
-}
-
-static inline int l2cap_conf_output(struct sock *sk, void **ptr)
-{
- struct l2cap_pinfo *pi = l2cap_pi(sk);
- int result = 0;
-
- /* Configure output options and let the other side know
- * which ones we don't like. */
- if (pi->conf_mtu < pi->omtu)
- result = L2CAP_CONF_UNACCEPT;
- else
- pi->omtu = pi->conf_mtu;
-
- l2cap_add_conf_opt(ptr, L2CAP_CONF_MTU, 2, pi->omtu);
-
- BT_DBG("sk %p result %d", sk, result);
- return result;
-}
-
-static int l2cap_build_conf_rsp(struct sock *sk, void *data, int *result)
-{
- struct l2cap_conf_rsp *rsp = data;
- void *ptr = rsp->data;
- u16 flags = 0;
-
- BT_DBG("sk %p complete %d", sk, result ? 1 : 0);
-
- if (result)
- *result = l2cap_conf_output(sk, &ptr);
- else
- flags = 0x0001;
-
- rsp->scid = __cpu_to_le16(l2cap_pi(sk)->dcid);
- rsp->result = __cpu_to_le16(result ? *result : 0);
- rsp->flags = __cpu_to_le16(flags);
-
- return ptr - data;
-}
-
-static inline int l2cap_connect_req(struct l2cap_conn *conn, struct l2cap_cmd_hdr *cmd, u8 *data)
-{
- struct l2cap_chan_list *list = &conn->chan_list;
- struct l2cap_conn_req *req = (struct l2cap_conn_req *) data;
- struct l2cap_conn_rsp rsp;
- struct sock *sk, *parent;
- int result = 0, status = 0;
-
- u16 dcid = 0, scid = __le16_to_cpu(req->scid);
- u16 psm = req->psm;
-
- BT_DBG("psm 0x%2.2x scid 0x%4.4x", psm, scid);
-
- /* Check if we have socket listening on psm */
- parent = l2cap_get_sock_by_psm(BT_LISTEN, psm, conn->src);
- if (!parent) {
- result = L2CAP_CR_BAD_PSM;
- goto sendresp;
- }
-
- result = L2CAP_CR_NO_MEM;
-
- /* Check for backlog size */
- if (sk_acceptq_is_full(parent)) {
- BT_DBG("backlog full %d", parent->sk_ack_backlog);
- goto response;
- }
-
- sk = l2cap_sock_alloc(NULL, BTPROTO_L2CAP, GFP_ATOMIC);
- if (!sk)
- goto response;
-
- write_lock_bh(&list->lock);
-
- /* Check if we already have channel with that dcid */
- if (__l2cap_get_chan_by_dcid(list, scid)) {
- write_unlock_bh(&list->lock);
- sock_set_flag(sk, SOCK_ZAPPED);
- l2cap_sock_kill(sk);
- goto response;
- }
-
- hci_conn_hold(conn->hcon);
-
- l2cap_sock_init(sk, parent);
- bacpy(&bt_sk(sk)->src, conn->src);
- bacpy(&bt_sk(sk)->dst, conn->dst);
- l2cap_pi(sk)->psm = psm;
- l2cap_pi(sk)->dcid = scid;
-
- __l2cap_chan_add(conn, sk, parent);
- dcid = l2cap_pi(sk)->scid;
-
- l2cap_sock_set_timer(sk, sk->sk_sndtimeo);
-
- /* Service level security */
- result = L2CAP_CR_PEND;
- status = L2CAP_CS_AUTHEN_PEND;
- sk->sk_state = BT_CONNECT2;
- l2cap_pi(sk)->ident = cmd->ident;
-
- if ((l2cap_pi(sk)->link_mode & L2CAP_LM_ENCRYPT) ||
- (l2cap_pi(sk)->link_mode & L2CAP_LM_SECURE)) {
- if (!hci_conn_encrypt(conn->hcon))
- goto done;
- } else if (l2cap_pi(sk)->link_mode & L2CAP_LM_AUTH) {
- if (!hci_conn_auth(conn->hcon))
- goto done;
- }
-
- sk->sk_state = BT_CONFIG;
- result = status = 0;
-
-done:
- write_unlock_bh(&list->lock);
-
-response:
- bh_unlock_sock(parent);
-
-sendresp:
- rsp.scid = __cpu_to_le16(scid);
- rsp.dcid = __cpu_to_le16(dcid);
- rsp.result = __cpu_to_le16(result);
- rsp.status = __cpu_to_le16(status);
- l2cap_send_cmd(conn, cmd->ident, L2CAP_CONN_RSP, sizeof(rsp), &rsp);
- return 0;
-}
-
-static inline int l2cap_connect_rsp(struct l2cap_conn *conn, struct l2cap_cmd_hdr *cmd, u8 *data)
-{
- struct l2cap_conn_rsp *rsp = (struct l2cap_conn_rsp *) data;
- u16 scid, dcid, result, status;
- struct sock *sk;
- u8 req[128];
-
- scid = __le16_to_cpu(rsp->scid);
- dcid = __le16_to_cpu(rsp->dcid);
- result = __le16_to_cpu(rsp->result);
- status = __le16_to_cpu(rsp->status);
-
- BT_DBG("dcid 0x%4.4x scid 0x%4.4x result 0x%2.2x status 0x%2.2x", dcid, scid, result, status);
-
- if (scid) {
- if (!(sk = l2cap_get_chan_by_scid(&conn->chan_list, scid)))
- return 0;
- } else {
- if (!(sk = l2cap_get_chan_by_ident(&conn->chan_list, cmd->ident)))
- return 0;
- }
-
- switch (result) {
- case L2CAP_CR_SUCCESS:
- sk->sk_state = BT_CONFIG;
- l2cap_pi(sk)->ident = 0;
- l2cap_pi(sk)->dcid = dcid;
- l2cap_pi(sk)->conf_state |= L2CAP_CONF_REQ_SENT;
-
- l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ,
- l2cap_build_conf_req(sk, req), req);
- break;
-
- case L2CAP_CR_PEND:
- break;
-
- default:
- l2cap_chan_del(sk, ECONNREFUSED);
- break;
- }
-
- bh_unlock_sock(sk);
- return 0;
-}
-
-static inline int l2cap_config_req(struct l2cap_conn *conn, struct l2cap_cmd_hdr *cmd, u8 *data)
-{
- struct l2cap_conf_req *req = (struct l2cap_conf_req *) data;
- u16 dcid, flags;
- u8 rsp[64];
- struct sock *sk;
- int result;
-
- dcid = __le16_to_cpu(req->dcid);
- flags = __le16_to_cpu(req->flags);
-
- BT_DBG("dcid 0x%4.4x flags 0x%2.2x", dcid, flags);
-
- if (!(sk = l2cap_get_chan_by_scid(&conn->chan_list, dcid)))
- return -ENOENT;
-
- if (sk->sk_state == BT_DISCONN)
- goto unlock;
-
- l2cap_parse_conf_req(sk, req->data, cmd->len - sizeof(*req));
-
- if (flags & 0x0001) {
- /* Incomplete config. Send empty response. */
- l2cap_send_cmd(conn, cmd->ident, L2CAP_CONF_RSP,
- l2cap_build_conf_rsp(sk, rsp, NULL), rsp);
- goto unlock;
- }
-
- /* Complete config. */
- l2cap_send_cmd(conn, cmd->ident, L2CAP_CONF_RSP,
- l2cap_build_conf_rsp(sk, rsp, &result), rsp);
-
- if (result)
- goto unlock;
-
- /* Output config done */
- l2cap_pi(sk)->conf_state |= L2CAP_CONF_OUTPUT_DONE;
-
- if (l2cap_pi(sk)->conf_state & L2CAP_CONF_INPUT_DONE) {
- sk->sk_state = BT_CONNECTED;
- l2cap_chan_ready(sk);
- } else if (!(l2cap_pi(sk)->conf_state & L2CAP_CONF_REQ_SENT)) {
- u8 req[64];
- l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ,
- l2cap_build_conf_req(sk, req), req);
- }
-
-unlock:
- bh_unlock_sock(sk);
- return 0;
-}
-
-static inline int l2cap_config_rsp(struct l2cap_conn *conn, struct l2cap_cmd_hdr *cmd, u8 *data)
-{
- struct l2cap_conf_rsp *rsp = (struct l2cap_conf_rsp *)data;
- u16 scid, flags, result;
- struct sock *sk;
-
- scid = __le16_to_cpu(rsp->scid);
- flags = __le16_to_cpu(rsp->flags);
- result = __le16_to_cpu(rsp->result);
-
- BT_DBG("scid 0x%4.4x flags 0x%2.2x result 0x%2.2x", scid, flags, result);
-
- if (!(sk = l2cap_get_chan_by_scid(&conn->chan_list, scid)))
- return 0;
-
- switch (result) {
- case L2CAP_CONF_SUCCESS:
- break;
-
- case L2CAP_CONF_UNACCEPT:
- if (++l2cap_pi(sk)->conf_retry < L2CAP_CONF_MAX_RETRIES) {
- char req[128];
- /* It does not make sense to adjust L2CAP parameters
- * that are currently defined in the spec. We simply
- * resend config request that we sent earlier. It is
- * stupid, but it helps qualification testing which
- * expects at least some response from us. */
- l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ,
- l2cap_build_conf_req(sk, req), req);
- goto done;
- }
-
- default:
- sk->sk_state = BT_DISCONN;
- sk->sk_err = ECONNRESET;
- l2cap_sock_set_timer(sk, HZ * 5);
- {
- struct l2cap_disconn_req req;
- req.dcid = __cpu_to_le16(l2cap_pi(sk)->dcid);
- req.scid = __cpu_to_le16(l2cap_pi(sk)->scid);
- l2cap_send_cmd(conn, l2cap_get_ident(conn),
- L2CAP_DISCONN_REQ, sizeof(req), &req);
- }
- goto done;
- }
-
- if (flags & 0x01)
- goto done;
-
- /* Input config done */
- l2cap_pi(sk)->conf_state |= L2CAP_CONF_INPUT_DONE;
-
- if (l2cap_pi(sk)->conf_state & L2CAP_CONF_OUTPUT_DONE) {
- sk->sk_state = BT_CONNECTED;
- l2cap_chan_ready(sk);
- }
-
-done:
- bh_unlock_sock(sk);
- return 0;
-}
-
-static inline int l2cap_disconnect_req(struct l2cap_conn *conn, struct l2cap_cmd_hdr *cmd, u8 *data)
-{
- struct l2cap_disconn_req *req = (struct l2cap_disconn_req *) data;
- struct l2cap_disconn_rsp rsp;
- u16 dcid, scid;
- struct sock *sk;
-
- scid = __le16_to_cpu(req->scid);
- dcid = __le16_to_cpu(req->dcid);
-
- BT_DBG("scid 0x%4.4x dcid 0x%4.4x", scid, dcid);
-
- if (!(sk = l2cap_get_chan_by_scid(&conn->chan_list, dcid)))
- return 0;
-
- rsp.dcid = __cpu_to_le16(l2cap_pi(sk)->scid);
- rsp.scid = __cpu_to_le16(l2cap_pi(sk)->dcid);
- l2cap_send_cmd(conn, cmd->ident, L2CAP_DISCONN_RSP, sizeof(rsp), &rsp);
-
- sk->sk_shutdown = SHUTDOWN_MASK;
-
- l2cap_chan_del(sk, ECONNRESET);
- bh_unlock_sock(sk);
-
- l2cap_sock_kill(sk);
- return 0;
-}
-
-static inline int l2cap_disconnect_rsp(struct l2cap_conn *conn, struct l2cap_cmd_hdr *cmd, u8 *data)
-{
- struct l2cap_disconn_rsp *rsp = (struct l2cap_disconn_rsp *) data;
- u16 dcid, scid;
- struct sock *sk;
-
- scid = __le16_to_cpu(rsp->scid);
- dcid = __le16_to_cpu(rsp->dcid);
-
- BT_DBG("dcid 0x%4.4x scid 0x%4.4x", dcid, scid);
-
- if (!(sk = l2cap_get_chan_by_scid(&conn->chan_list, scid)))
- return 0;
-
- l2cap_chan_del(sk, 0);
- bh_unlock_sock(sk);
-
- l2cap_sock_kill(sk);
- return 0;
-}
-
-static inline int l2cap_information_req(struct l2cap_conn *conn, struct l2cap_cmd_hdr *cmd, u8 *data)
-{
- struct l2cap_info_req *req = (struct l2cap_info_req *) data;
- struct l2cap_info_rsp rsp;
- u16 type;
-
- type = __le16_to_cpu(req->type);
-
- BT_DBG("type 0x%4.4x", type);
-
- rsp.type = __cpu_to_le16(type);
- rsp.result = __cpu_to_le16(L2CAP_IR_NOTSUPP);
- l2cap_send_cmd(conn, cmd->ident, L2CAP_INFO_RSP, sizeof(rsp), &rsp);
-
- return 0;
-}
-
-static inline int l2cap_information_rsp(struct l2cap_conn *conn, struct l2cap_cmd_hdr *cmd, u8 *data)
-{
- struct l2cap_info_rsp *rsp = (struct l2cap_info_rsp *) data;
- u16 type, result;
-
- type = __le16_to_cpu(rsp->type);
- result = __le16_to_cpu(rsp->result);
-
- BT_DBG("type 0x%4.4x result 0x%2.2x", type, result);
-
- return 0;
-}
-
-static inline void l2cap_sig_channel(struct l2cap_conn *conn, struct sk_buff *skb)
-{
- u8 *data = skb->data;
- int len = skb->len;
- struct l2cap_cmd_hdr cmd;
- int err = 0;
-
- l2cap_raw_recv(conn, skb);
-
- while (len >= L2CAP_CMD_HDR_SIZE) {
- memcpy(&cmd, data, L2CAP_CMD_HDR_SIZE);
- data += L2CAP_CMD_HDR_SIZE;
- len -= L2CAP_CMD_HDR_SIZE;
-
- cmd.len = __le16_to_cpu(cmd.len);
-
- BT_DBG("code 0x%2.2x len %d id 0x%2.2x", cmd.code, cmd.len, cmd.ident);
-
- if (cmd.len > len || !cmd.ident) {
- BT_DBG("corrupted command");
- break;
- }
-
- switch (cmd.code) {
- case L2CAP_COMMAND_REJ:
- /* FIXME: We should process this */
- break;
-
- case L2CAP_CONN_REQ:
- err = l2cap_connect_req(conn, &cmd, data);
- break;
-
- case L2CAP_CONN_RSP:
- err = l2cap_connect_rsp(conn, &cmd, data);
- break;
-
- case L2CAP_CONF_REQ:
- err = l2cap_config_req(conn, &cmd, data);
- break;
-
- case L2CAP_CONF_RSP:
- err = l2cap_config_rsp(conn, &cmd, data);
- break;
-
- case L2CAP_DISCONN_REQ:
- err = l2cap_disconnect_req(conn, &cmd, data);
- break;
-
- case L2CAP_DISCONN_RSP:
- err = l2cap_disconnect_rsp(conn, &cmd, data);
- break;
-
- case L2CAP_ECHO_REQ:
- l2cap_send_cmd(conn, cmd.ident, L2CAP_ECHO_RSP, cmd.len, data);
- break;
-
- case L2CAP_ECHO_RSP:
- break;
-
- case L2CAP_INFO_REQ:
- err = l2cap_information_req(conn, &cmd, data);
- break;
-
- case L2CAP_INFO_RSP:
- err = l2cap_information_rsp(conn, &cmd, data);
- break;
-
- default:
- BT_ERR("Unknown signaling command 0x%2.2x", cmd.code);
- err = -EINVAL;
- break;
- }
-
- if (err) {
- struct l2cap_cmd_rej rej;
- BT_DBG("error %d", err);
-
- /* FIXME: Map err to a valid reason */
- rej.reason = __cpu_to_le16(0);
- l2cap_send_cmd(conn, cmd.ident, L2CAP_COMMAND_REJ, sizeof(rej), &rej);
- }
-
- data += cmd.len;
- len -= cmd.len;
- }
-
- kfree_skb(skb);
-}
-
-static inline int l2cap_data_channel(struct l2cap_conn *conn, u16 cid, struct sk_buff *skb)
-{
- struct sock *sk;
-
- sk = l2cap_get_chan_by_scid(&conn->chan_list, cid);
- if (!sk) {
- BT_DBG("unknown cid 0x%4.4x", cid);
- goto drop;
- }
-
- BT_DBG("sk %p, len %d", sk, skb->len);
-
- if (sk->sk_state != BT_CONNECTED)
- goto drop;
-
- if (l2cap_pi(sk)->imtu < skb->len)
- goto drop;
-
- /* If socket recv buffers overflows we drop data here
- * which is *bad* because L2CAP has to be reliable.
- * But we don't have any other choice. L2CAP doesn't
- * provide flow control mechanism. */
-
- if (!sock_queue_rcv_skb(sk, skb))
- goto done;
-
-drop:
- kfree_skb(skb);
-
-done:
- if (sk)
- bh_unlock_sock(sk);
-
- return 0;
-}
-
-static inline int l2cap_conless_channel(struct l2cap_conn *conn, u16 psm, struct sk_buff *skb)
-{
- struct sock *sk;
-
- sk = l2cap_get_sock_by_psm(0, psm, conn->src);
- if (!sk)
- goto drop;
-
- BT_DBG("sk %p, len %d", sk, skb->len);
-
- if (sk->sk_state != BT_BOUND && sk->sk_state != BT_CONNECTED)
- goto drop;
-
- if (l2cap_pi(sk)->imtu < skb->len)
- goto drop;
-
- if (!sock_queue_rcv_skb(sk, skb))
- goto done;
-
-drop:
- kfree_skb(skb);
-
-done:
- if (sk) bh_unlock_sock(sk);
- return 0;
-}
-
-static void l2cap_recv_frame(struct l2cap_conn *conn, struct sk_buff *skb)
-{
- struct l2cap_hdr *lh = (void *) skb->data;
- u16 cid, psm, len;
-
- skb_pull(skb, L2CAP_HDR_SIZE);
- cid = __le16_to_cpu(lh->cid);
- len = __le16_to_cpu(lh->len);
-
- BT_DBG("len %d, cid 0x%4.4x", len, cid);
-
- switch (cid) {
- case 0x0001:
- l2cap_sig_channel(conn, skb);
- break;
-
- case 0x0002:
- psm = get_unaligned((u16 *) skb->data);
- skb_pull(skb, 2);
- l2cap_conless_channel(conn, psm, skb);
- break;
-
- default:
- l2cap_data_channel(conn, cid, skb);
- break;
- }
-}
-
-/* ---- L2CAP interface with lower layer (HCI) ---- */
-
-static int l2cap_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 type)
-{
- int exact = 0, lm1 = 0, lm2 = 0;
- register struct sock *sk;
- struct hlist_node *node;
-
- if (type != ACL_LINK)
- return 0;
-
- BT_DBG("hdev %s, bdaddr %s", hdev->name, batostr(bdaddr));
-
- /* Find listening sockets and check their link_mode */
- read_lock(&l2cap_sk_list.lock);
- sk_for_each(sk, node, &l2cap_sk_list.head) {
- if (sk->sk_state != BT_LISTEN)
- continue;
-
- if (!bacmp(&bt_sk(sk)->src, &hdev->bdaddr)) {
- lm1 |= (HCI_LM_ACCEPT | l2cap_pi(sk)->link_mode);
- exact++;
- } else if (!bacmp(&bt_sk(sk)->src, BDADDR_ANY))
- lm2 |= (HCI_LM_ACCEPT | l2cap_pi(sk)->link_mode);
- }
- read_unlock(&l2cap_sk_list.lock);
-
- return exact ? lm1 : lm2;
-}
-
-static int l2cap_connect_cfm(struct hci_conn *hcon, u8 status)
-{
- struct l2cap_conn *conn;
-
- BT_DBG("hcon %p bdaddr %s status %d", hcon, batostr(&hcon->dst), status);
-
- if (hcon->type != ACL_LINK)
- return 0;
-
- if (!status) {
- conn = l2cap_conn_add(hcon, status);
- if (conn)
- l2cap_conn_ready(conn);
- } else
- l2cap_conn_del(hcon, bt_err(status));
-
- return 0;
-}
-
-static int l2cap_disconn_ind(struct hci_conn *hcon, u8 reason)
-{
- BT_DBG("hcon %p reason %d", hcon, reason);
-
- if (hcon->type != ACL_LINK)
- return 0;
-
- l2cap_conn_del(hcon, bt_err(reason));
-
- return 0;
-}
-
-static int l2cap_auth_cfm(struct hci_conn *hcon, u8 status)
-{
- struct l2cap_chan_list *l;
- struct l2cap_conn *conn = conn = hcon->l2cap_data;
- struct l2cap_conn_rsp rsp;
- struct sock *sk;
- int result;
-
- if (!conn)
- return 0;
-
- l = &conn->chan_list;
-
- BT_DBG("conn %p", conn);
-
- read_lock(&l->lock);
-
- for (sk = l->head; sk; sk = l2cap_pi(sk)->next_c) {
- bh_lock_sock(sk);
-
- if (sk->sk_state != BT_CONNECT2 ||
- (l2cap_pi(sk)->link_mode & L2CAP_LM_ENCRYPT) ||
- (l2cap_pi(sk)->link_mode & L2CAP_LM_SECURE)) {
- bh_unlock_sock(sk);
- continue;
- }
-
- if (!status) {
- sk->sk_state = BT_CONFIG;
- result = 0;
- } else {
- sk->sk_state = BT_DISCONN;
- l2cap_sock_set_timer(sk, HZ/10);
- result = L2CAP_CR_SEC_BLOCK;
- }
-
- rsp.scid = __cpu_to_le16(l2cap_pi(sk)->dcid);
- rsp.dcid = __cpu_to_le16(l2cap_pi(sk)->scid);
- rsp.result = __cpu_to_le16(result);
- rsp.status = __cpu_to_le16(0);
- l2cap_send_cmd(conn, l2cap_pi(sk)->ident,
- L2CAP_CONN_RSP, sizeof(rsp), &rsp);
-
- bh_unlock_sock(sk);
- }
-
- read_unlock(&l->lock);
- return 0;
-}
-
-static int l2cap_encrypt_cfm(struct hci_conn *hcon, u8 status)
-{
- struct l2cap_chan_list *l;
- struct l2cap_conn *conn = hcon->l2cap_data;
- struct l2cap_conn_rsp rsp;
- struct sock *sk;
- int result;
-
- if (!conn)
- return 0;
-
- l = &conn->chan_list;
-
- BT_DBG("conn %p", conn);
-
- read_lock(&l->lock);
-
- for (sk = l->head; sk; sk = l2cap_pi(sk)->next_c) {
- bh_lock_sock(sk);
-
- if (sk->sk_state != BT_CONNECT2) {
- bh_unlock_sock(sk);
- continue;
- }
-
- if (!status) {
- sk->sk_state = BT_CONFIG;
- result = 0;
- } else {
- sk->sk_state = BT_DISCONN;
- l2cap_sock_set_timer(sk, HZ/10);
- result = L2CAP_CR_SEC_BLOCK;
- }
-
- rsp.scid = __cpu_to_le16(l2cap_pi(sk)->dcid);
- rsp.dcid = __cpu_to_le16(l2cap_pi(sk)->scid);
- rsp.result = __cpu_to_le16(result);
- rsp.status = __cpu_to_le16(0);
- l2cap_send_cmd(conn, l2cap_pi(sk)->ident,
- L2CAP_CONN_RSP, sizeof(rsp), &rsp);
-
- if (l2cap_pi(sk)->link_mode & L2CAP_LM_SECURE)
- hci_conn_change_link_key(hcon);
-
- bh_unlock_sock(sk);
- }
-
- read_unlock(&l->lock);
- return 0;
-}
-
-static int l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags)
-{
- struct l2cap_conn *conn = hcon->l2cap_data;
-
- if (!conn && !(conn = l2cap_conn_add(hcon, 0)))
- goto drop;
-
- BT_DBG("conn %p len %d flags 0x%x", conn, skb->len, flags);
-
- if (flags & ACL_START) {
- struct l2cap_hdr *hdr;
- int len;
-
- if (conn->rx_len) {
- BT_ERR("Unexpected start frame (len %d)", skb->len);
- kfree_skb(conn->rx_skb);
- conn->rx_skb = NULL;
- conn->rx_len = 0;
- l2cap_conn_unreliable(conn, ECOMM);
- }
-
- if (skb->len < 2) {
- BT_ERR("Frame is too short (len %d)", skb->len);
- l2cap_conn_unreliable(conn, ECOMM);
- goto drop;
- }
-
- hdr = (struct l2cap_hdr *) skb->data;
- len = __le16_to_cpu(hdr->len) + L2CAP_HDR_SIZE;
-
- if (len == skb->len) {
- /* Complete frame received */
- l2cap_recv_frame(conn, skb);
- return 0;
- }
-
- BT_DBG("Start: total len %d, frag len %d", len, skb->len);
-
- if (skb->len > len) {
- BT_ERR("Frame is too long (len %d, expected len %d)",
- skb->len, len);
- l2cap_conn_unreliable(conn, ECOMM);
- goto drop;
- }
-
- /* Allocate skb for the complete frame (with header) */
- if (!(conn->rx_skb = bt_skb_alloc(len, GFP_ATOMIC)))
- goto drop;
-
- memcpy(skb_put(conn->rx_skb, skb->len), skb->data, skb->len);
- conn->rx_len = len - skb->len;
- } else {
- BT_DBG("Cont: frag len %d (expecting %d)", skb->len, conn->rx_len);
-
- if (!conn->rx_len) {
- BT_ERR("Unexpected continuation frame (len %d)", skb->len);
- l2cap_conn_unreliable(conn, ECOMM);
- goto drop;
- }
-
- if (skb->len > conn->rx_len) {
- BT_ERR("Fragment is too long (len %d, expected %d)",
- skb->len, conn->rx_len);
- kfree_skb(conn->rx_skb);
- conn->rx_skb = NULL;
- conn->rx_len = 0;
- l2cap_conn_unreliable(conn, ECOMM);
- goto drop;
- }
-
- memcpy(skb_put(conn->rx_skb, skb->len), skb->data, skb->len);
- conn->rx_len -= skb->len;
-
- if (!conn->rx_len) {
- /* Complete frame received */
- l2cap_recv_frame(conn, conn->rx_skb);
- conn->rx_skb = NULL;
- }
- }
-
-drop:
- kfree_skb(skb);
- return 0;
-}
-
-static ssize_t l2cap_sysfs_show(struct class *dev, char *buf)
-{
- struct sock *sk;
- struct hlist_node *node;
- char *str = buf;
-
- read_lock_bh(&l2cap_sk_list.lock);
-
- sk_for_each(sk, node, &l2cap_sk_list.head) {
- struct l2cap_pinfo *pi = l2cap_pi(sk);
-
- str += sprintf(str, "%s %s %d %d 0x%4.4x 0x%4.4x %d %d 0x%x\n",
- batostr(&bt_sk(sk)->src), batostr(&bt_sk(sk)->dst),
- sk->sk_state, pi->psm, pi->scid, pi->dcid, pi->imtu,
- pi->omtu, pi->link_mode);
- }
-
- read_unlock_bh(&l2cap_sk_list.lock);
-
- return (str - buf);
-}
-
-static CLASS_ATTR(l2cap, S_IRUGO, l2cap_sysfs_show, NULL);
-
-static const struct proto_ops l2cap_sock_ops = {
- .family = PF_BLUETOOTH,
- .owner = THIS_MODULE,
- .release = l2cap_sock_release,
- .bind = l2cap_sock_bind,
- .connect = l2cap_sock_connect,
- .listen = l2cap_sock_listen,
- .accept = l2cap_sock_accept,
- .getname = l2cap_sock_getname,
- .sendmsg = l2cap_sock_sendmsg,
- .recvmsg = bt_sock_recvmsg,
- .poll = bt_sock_poll,
- .mmap = sock_no_mmap,
- .socketpair = sock_no_socketpair,
- .ioctl = sock_no_ioctl,
- .shutdown = l2cap_sock_shutdown,
- .setsockopt = l2cap_sock_setsockopt,
- .getsockopt = l2cap_sock_getsockopt
-};
-
-static struct net_proto_family l2cap_sock_family_ops = {
- .family = PF_BLUETOOTH,
- .owner = THIS_MODULE,
- .create = l2cap_sock_create,
-};
-
-static struct hci_proto l2cap_hci_proto = {
- .name = "L2CAP",
- .id = HCI_PROTO_L2CAP,
- .connect_ind = l2cap_connect_ind,
- .connect_cfm = l2cap_connect_cfm,
- .disconn_ind = l2cap_disconn_ind,
- .auth_cfm = l2cap_auth_cfm,
- .encrypt_cfm = l2cap_encrypt_cfm,
- .recv_acldata = l2cap_recv_acldata
-};
-
-static int __init l2cap_init(void)
-{
- int err;
-
- err = proto_register(&l2cap_proto, 0);
- if (err < 0)
- return err;
-
- err = bt_sock_register(BTPROTO_L2CAP, &l2cap_sock_family_ops);
- if (err < 0) {
- BT_ERR("L2CAP socket registration failed");
- goto error;
- }
-
- err = hci_register_proto(&l2cap_hci_proto);
- if (err < 0) {
- BT_ERR("L2CAP protocol registration failed");
- bt_sock_unregister(BTPROTO_L2CAP);
- goto error;
- }
-
- if (class_create_file(bt_class, &class_attr_l2cap) < 0)
- BT_ERR("Failed to create L2CAP info file");
-
- BT_INFO("L2CAP ver %s", VERSION);
- BT_INFO("L2CAP socket layer initialized");
-
- return 0;
-
-error:
- proto_unregister(&l2cap_proto);
- return err;
-}
-
-static void __exit l2cap_exit(void)
-{
- class_remove_file(bt_class, &class_attr_l2cap);
-
- if (bt_sock_unregister(BTPROTO_L2CAP) < 0)
- BT_ERR("L2CAP socket unregistration failed");
-
- if (hci_unregister_proto(&l2cap_hci_proto) < 0)
- BT_ERR("L2CAP protocol unregistration failed");
-
- proto_unregister(&l2cap_proto);
-}
-
-void l2cap_load(void)
-{
- /* Dummy function to trigger automatic L2CAP module loading by
- * other modules that use L2CAP sockets but don't use any other
- * symbols from it. */
- return;
-}
-EXPORT_SYMBOL(l2cap_load);
-
-module_init(l2cap_init);
-module_exit(l2cap_exit);
-
-MODULE_AUTHOR("Maxim Krasnyansky <maxk@qualcomm.com>, Marcel Holtmann <marcel@holtmann.org>");
-MODULE_DESCRIPTION("Bluetooth L2CAP ver " VERSION);
-MODULE_VERSION(VERSION);
-MODULE_LICENSE("GPL");
-MODULE_ALIAS("bt-proto-0");
diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
new file mode 100644
index 000000000000..07b493331fd7
--- /dev/null
+++ b/net/bluetooth/l2cap_core.c
@@ -0,0 +1,7721 @@
+/*
+ BlueZ - Bluetooth protocol stack for Linux
+ Copyright (C) 2000-2001 Qualcomm Incorporated
+ Copyright (C) 2009-2010 Gustavo F. Padovan <gustavo@padovan.org>
+ Copyright (C) 2010 Google Inc.
+ Copyright (C) 2011 ProFUSION Embedded Systems
+ Copyright (c) 2012 Code Aurora Forum. All rights reserved.
+
+ Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License version 2 as
+ published by the Free Software Foundation;
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+ IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+ CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+ ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+ COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+ SOFTWARE IS DISCLAIMED.
+*/
+
+/* Bluetooth L2CAP core. */
+
+#include <linux/module.h>
+
+#include <linux/debugfs.h>
+#include <linux/crc16.h>
+#include <linux/filter.h>
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+#include <net/bluetooth/l2cap.h>
+
+#include "smp.h"
+
+#define LE_FLOWCTL_MAX_CREDITS 65535
+
+bool disable_ertm;
+bool enable_ecred = IS_ENABLED(CONFIG_BT_LE_L2CAP_ECRED);
+
+static u32 l2cap_feat_mask = L2CAP_FEAT_FIXED_CHAN | L2CAP_FEAT_UCD;
+
+static LIST_HEAD(chan_list);
+static DEFINE_RWLOCK(chan_list_lock);
+
+static struct sk_buff *l2cap_build_cmd(struct l2cap_conn *conn,
+ u8 code, u8 ident, u16 dlen, void *data);
+static void l2cap_send_cmd(struct l2cap_conn *conn, u8 ident, u8 code, u16 len,
+ void *data);
+static int l2cap_build_conf_req(struct l2cap_chan *chan, void *data, size_t data_size);
+static void l2cap_send_disconn_req(struct l2cap_chan *chan, int err);
+
+static void l2cap_tx(struct l2cap_chan *chan, struct l2cap_ctrl *control,
+ struct sk_buff_head *skbs, u8 event);
+static void l2cap_retrans_timeout(struct work_struct *work);
+static void l2cap_monitor_timeout(struct work_struct *work);
+static void l2cap_ack_timeout(struct work_struct *work);
+
+static inline u8 bdaddr_type(u8 link_type, u8 bdaddr_type)
+{
+ if (link_type == LE_LINK) {
+ if (bdaddr_type == ADDR_LE_DEV_PUBLIC)
+ return BDADDR_LE_PUBLIC;
+ else
+ return BDADDR_LE_RANDOM;
+ }
+
+ return BDADDR_BREDR;
+}
+
+static inline u8 bdaddr_src_type(struct hci_conn *hcon)
+{
+ return bdaddr_type(hcon->type, hcon->src_type);
+}
+
+static inline u8 bdaddr_dst_type(struct hci_conn *hcon)
+{
+ return bdaddr_type(hcon->type, hcon->dst_type);
+}
+
+/* ---- L2CAP channels ---- */
+
+static struct l2cap_chan *__l2cap_get_chan_by_dcid(struct l2cap_conn *conn,
+ u16 cid)
+{
+ struct l2cap_chan *c;
+
+ list_for_each_entry(c, &conn->chan_l, list) {
+ if (c->dcid == cid)
+ return c;
+ }
+ return NULL;
+}
+
+static struct l2cap_chan *__l2cap_get_chan_by_scid(struct l2cap_conn *conn,
+ u16 cid)
+{
+ struct l2cap_chan *c;
+
+ list_for_each_entry(c, &conn->chan_l, list) {
+ if (c->scid == cid)
+ return c;
+ }
+ return NULL;
+}
+
+/* Find channel with given SCID.
+ * Returns a reference locked channel.
+ */
+static struct l2cap_chan *l2cap_get_chan_by_scid(struct l2cap_conn *conn,
+ u16 cid)
+{
+ struct l2cap_chan *c;
+
+ c = __l2cap_get_chan_by_scid(conn, cid);
+ if (c) {
+ /* Only lock if chan reference is not 0 */
+ c = l2cap_chan_hold_unless_zero(c);
+ if (c)
+ l2cap_chan_lock(c);
+ }
+
+ return c;
+}
+
+/* Find channel with given DCID.
+ * Returns a reference locked channel.
+ */
+static struct l2cap_chan *l2cap_get_chan_by_dcid(struct l2cap_conn *conn,
+ u16 cid)
+{
+ struct l2cap_chan *c;
+
+ c = __l2cap_get_chan_by_dcid(conn, cid);
+ if (c) {
+ /* Only lock if chan reference is not 0 */
+ c = l2cap_chan_hold_unless_zero(c);
+ if (c)
+ l2cap_chan_lock(c);
+ }
+
+ return c;
+}
+
+static struct l2cap_chan *__l2cap_get_chan_by_ident(struct l2cap_conn *conn,
+ u8 ident)
+{
+ struct l2cap_chan *c;
+
+ list_for_each_entry(c, &conn->chan_l, list) {
+ if (c->ident == ident)
+ return c;
+ }
+ return NULL;
+}
+
+static struct l2cap_chan *__l2cap_global_chan_by_addr(__le16 psm, bdaddr_t *src,
+ u8 src_type)
+{
+ struct l2cap_chan *c;
+
+ list_for_each_entry(c, &chan_list, global_l) {
+ if (src_type == BDADDR_BREDR && c->src_type != BDADDR_BREDR)
+ continue;
+
+ if (src_type != BDADDR_BREDR && c->src_type == BDADDR_BREDR)
+ continue;
+
+ if (c->sport == psm && !bacmp(&c->src, src))
+ return c;
+ }
+ return NULL;
+}
+
+int l2cap_add_psm(struct l2cap_chan *chan, bdaddr_t *src, __le16 psm)
+{
+ int err;
+
+ write_lock(&chan_list_lock);
+
+ if (psm && __l2cap_global_chan_by_addr(psm, src, chan->src_type)) {
+ err = -EADDRINUSE;
+ goto done;
+ }
+
+ if (psm) {
+ chan->psm = psm;
+ chan->sport = psm;
+ err = 0;
+ } else {
+ u16 p, start, end, incr;
+
+ if (chan->src_type == BDADDR_BREDR) {
+ start = L2CAP_PSM_DYN_START;
+ end = L2CAP_PSM_AUTO_END;
+ incr = 2;
+ } else {
+ start = L2CAP_PSM_LE_DYN_START;
+ end = L2CAP_PSM_LE_DYN_END;
+ incr = 1;
+ }
+
+ err = -EINVAL;
+ for (p = start; p <= end; p += incr)
+ if (!__l2cap_global_chan_by_addr(cpu_to_le16(p), src,
+ chan->src_type)) {
+ chan->psm = cpu_to_le16(p);
+ chan->sport = cpu_to_le16(p);
+ err = 0;
+ break;
+ }
+ }
+
+done:
+ write_unlock(&chan_list_lock);
+ return err;
+}
+EXPORT_SYMBOL_GPL(l2cap_add_psm);
+
+int l2cap_add_scid(struct l2cap_chan *chan, __u16 scid)
+{
+ write_lock(&chan_list_lock);
+
+ /* Override the defaults (which are for conn-oriented) */
+ chan->omtu = L2CAP_DEFAULT_MTU;
+ chan->chan_type = L2CAP_CHAN_FIXED;
+
+ chan->scid = scid;
+
+ write_unlock(&chan_list_lock);
+
+ return 0;
+}
+
+static u16 l2cap_alloc_cid(struct l2cap_conn *conn)
+{
+ u16 cid, dyn_end;
+
+ if (conn->hcon->type == LE_LINK)
+ dyn_end = L2CAP_CID_LE_DYN_END;
+ else
+ dyn_end = L2CAP_CID_DYN_END;
+
+ for (cid = L2CAP_CID_DYN_START; cid <= dyn_end; cid++) {
+ if (!__l2cap_get_chan_by_scid(conn, cid))
+ return cid;
+ }
+
+ return 0;
+}
+
+static void l2cap_state_change(struct l2cap_chan *chan, int state)
+{
+ BT_DBG("chan %p %s -> %s", chan, state_to_string(chan->state),
+ state_to_string(state));
+
+ chan->state = state;
+ chan->ops->state_change(chan, state, 0);
+}
+
+static inline void l2cap_state_change_and_error(struct l2cap_chan *chan,
+ int state, int err)
+{
+ chan->state = state;
+ chan->ops->state_change(chan, chan->state, err);
+}
+
+static inline void l2cap_chan_set_err(struct l2cap_chan *chan, int err)
+{
+ chan->ops->state_change(chan, chan->state, err);
+}
+
+static void __set_retrans_timer(struct l2cap_chan *chan)
+{
+ if (!delayed_work_pending(&chan->monitor_timer) &&
+ chan->retrans_timeout) {
+ l2cap_set_timer(chan, &chan->retrans_timer,
+ msecs_to_jiffies(chan->retrans_timeout));
+ }
+}
+
+static void __set_monitor_timer(struct l2cap_chan *chan)
+{
+ __clear_retrans_timer(chan);
+ if (chan->monitor_timeout) {
+ l2cap_set_timer(chan, &chan->monitor_timer,
+ msecs_to_jiffies(chan->monitor_timeout));
+ }
+}
+
+static struct sk_buff *l2cap_ertm_seq_in_queue(struct sk_buff_head *head,
+ u16 seq)
+{
+ struct sk_buff *skb;
+
+ skb_queue_walk(head, skb) {
+ if (bt_cb(skb)->l2cap.txseq == seq)
+ return skb;
+ }
+
+ return NULL;
+}
+
+/* ---- L2CAP sequence number lists ---- */
+
+/* For ERTM, ordered lists of sequence numbers must be tracked for
+ * SREJ requests that are received and for frames that are to be
+ * retransmitted. These seq_list functions implement a singly-linked
+ * list in an array, where membership in the list can also be checked
+ * in constant time. Items can also be added to the tail of the list
+ * and removed from the head in constant time, without further memory
+ * allocs or frees.
+ */
+
+static int l2cap_seq_list_init(struct l2cap_seq_list *seq_list, u16 size)
+{
+ size_t alloc_size, i;
+
+ /* Allocated size is a power of 2 to map sequence numbers
+ * (which may be up to 14 bits) in to a smaller array that is
+ * sized for the negotiated ERTM transmit windows.
+ */
+ alloc_size = roundup_pow_of_two(size);
+
+ seq_list->list = kmalloc_array(alloc_size, sizeof(u16), GFP_KERNEL);
+ if (!seq_list->list)
+ return -ENOMEM;
+
+ seq_list->mask = alloc_size - 1;
+ seq_list->head = L2CAP_SEQ_LIST_CLEAR;
+ seq_list->tail = L2CAP_SEQ_LIST_CLEAR;
+ for (i = 0; i < alloc_size; i++)
+ seq_list->list[i] = L2CAP_SEQ_LIST_CLEAR;
+
+ return 0;
+}
+
+static inline void l2cap_seq_list_free(struct l2cap_seq_list *seq_list)
+{
+ kfree(seq_list->list);
+}
+
+static inline bool l2cap_seq_list_contains(struct l2cap_seq_list *seq_list,
+ u16 seq)
+{
+ /* Constant-time check for list membership */
+ return seq_list->list[seq & seq_list->mask] != L2CAP_SEQ_LIST_CLEAR;
+}
+
+static inline u16 l2cap_seq_list_pop(struct l2cap_seq_list *seq_list)
+{
+ u16 seq = seq_list->head;
+ u16 mask = seq_list->mask;
+
+ seq_list->head = seq_list->list[seq & mask];
+ seq_list->list[seq & mask] = L2CAP_SEQ_LIST_CLEAR;
+
+ if (seq_list->head == L2CAP_SEQ_LIST_TAIL) {
+ seq_list->head = L2CAP_SEQ_LIST_CLEAR;
+ seq_list->tail = L2CAP_SEQ_LIST_CLEAR;
+ }
+
+ return seq;
+}
+
+static void l2cap_seq_list_clear(struct l2cap_seq_list *seq_list)
+{
+ u16 i;
+
+ if (seq_list->head == L2CAP_SEQ_LIST_CLEAR)
+ return;
+
+ for (i = 0; i <= seq_list->mask; i++)
+ seq_list->list[i] = L2CAP_SEQ_LIST_CLEAR;
+
+ seq_list->head = L2CAP_SEQ_LIST_CLEAR;
+ seq_list->tail = L2CAP_SEQ_LIST_CLEAR;
+}
+
+static void l2cap_seq_list_append(struct l2cap_seq_list *seq_list, u16 seq)
+{
+ u16 mask = seq_list->mask;
+
+ /* All appends happen in constant time */
+
+ if (seq_list->list[seq & mask] != L2CAP_SEQ_LIST_CLEAR)
+ return;
+
+ if (seq_list->tail == L2CAP_SEQ_LIST_CLEAR)
+ seq_list->head = seq;
+ else
+ seq_list->list[seq_list->tail & mask] = seq;
+
+ seq_list->tail = seq;
+ seq_list->list[seq & mask] = L2CAP_SEQ_LIST_TAIL;
+}
+
+static void l2cap_chan_timeout(struct work_struct *work)
+{
+ struct l2cap_chan *chan = container_of(work, struct l2cap_chan,
+ chan_timer.work);
+ struct l2cap_conn *conn = chan->conn;
+ int reason;
+
+ BT_DBG("chan %p state %s", chan, state_to_string(chan->state));
+
+ if (!conn)
+ return;
+
+ mutex_lock(&conn->lock);
+ /* __set_chan_timer() calls l2cap_chan_hold(chan) while scheduling
+ * this work. No need to call l2cap_chan_hold(chan) here again.
+ */
+ l2cap_chan_lock(chan);
+
+ if (chan->state == BT_CONNECTED || chan->state == BT_CONFIG)
+ reason = ECONNREFUSED;
+ else if (chan->state == BT_CONNECT &&
+ chan->sec_level != BT_SECURITY_SDP)
+ reason = ECONNREFUSED;
+ else
+ reason = ETIMEDOUT;
+
+ l2cap_chan_close(chan, reason);
+
+ chan->ops->close(chan);
+
+ l2cap_chan_unlock(chan);
+ l2cap_chan_put(chan);
+
+ mutex_unlock(&conn->lock);
+}
+
+struct l2cap_chan *l2cap_chan_create(void)
+{
+ struct l2cap_chan *chan;
+
+ chan = kzalloc(sizeof(*chan), GFP_ATOMIC);
+ if (!chan)
+ return NULL;
+
+ skb_queue_head_init(&chan->tx_q);
+ skb_queue_head_init(&chan->srej_q);
+ mutex_init(&chan->lock);
+
+ /* Set default lock nesting level */
+ atomic_set(&chan->nesting, L2CAP_NESTING_NORMAL);
+
+ /* Available receive buffer space is initially unknown */
+ chan->rx_avail = -1;
+
+ write_lock(&chan_list_lock);
+ list_add(&chan->global_l, &chan_list);
+ write_unlock(&chan_list_lock);
+
+ INIT_DELAYED_WORK(&chan->chan_timer, l2cap_chan_timeout);
+ INIT_DELAYED_WORK(&chan->retrans_timer, l2cap_retrans_timeout);
+ INIT_DELAYED_WORK(&chan->monitor_timer, l2cap_monitor_timeout);
+ INIT_DELAYED_WORK(&chan->ack_timer, l2cap_ack_timeout);
+
+ chan->state = BT_OPEN;
+
+ kref_init(&chan->kref);
+
+ /* This flag is cleared in l2cap_chan_ready() */
+ set_bit(CONF_NOT_COMPLETE, &chan->conf_state);
+
+ BT_DBG("chan %p", chan);
+
+ return chan;
+}
+EXPORT_SYMBOL_GPL(l2cap_chan_create);
+
+static void l2cap_chan_destroy(struct kref *kref)
+{
+ struct l2cap_chan *chan = container_of(kref, struct l2cap_chan, kref);
+
+ BT_DBG("chan %p", chan);
+
+ write_lock(&chan_list_lock);
+ list_del(&chan->global_l);
+ write_unlock(&chan_list_lock);
+
+ kfree(chan);
+}
+
+void l2cap_chan_hold(struct l2cap_chan *c)
+{
+ BT_DBG("chan %p orig refcnt %u", c, kref_read(&c->kref));
+
+ kref_get(&c->kref);
+}
+EXPORT_SYMBOL_GPL(l2cap_chan_hold);
+
+struct l2cap_chan *l2cap_chan_hold_unless_zero(struct l2cap_chan *c)
+{
+ BT_DBG("chan %p orig refcnt %u", c, kref_read(&c->kref));
+
+ if (!kref_get_unless_zero(&c->kref))
+ return NULL;
+
+ return c;
+}
+
+void l2cap_chan_put(struct l2cap_chan *c)
+{
+ BT_DBG("chan %p orig refcnt %u", c, kref_read(&c->kref));
+
+ kref_put(&c->kref, l2cap_chan_destroy);
+}
+EXPORT_SYMBOL_GPL(l2cap_chan_put);
+
+void l2cap_chan_set_defaults(struct l2cap_chan *chan)
+{
+ chan->fcs = L2CAP_FCS_CRC16;
+ chan->max_tx = L2CAP_DEFAULT_MAX_TX;
+ chan->tx_win = L2CAP_DEFAULT_TX_WINDOW;
+ chan->tx_win_max = L2CAP_DEFAULT_TX_WINDOW;
+ chan->remote_max_tx = chan->max_tx;
+ chan->remote_tx_win = chan->tx_win;
+ chan->ack_win = L2CAP_DEFAULT_TX_WINDOW;
+ chan->sec_level = BT_SECURITY_LOW;
+ chan->flush_to = L2CAP_DEFAULT_FLUSH_TO;
+ chan->retrans_timeout = L2CAP_DEFAULT_RETRANS_TO;
+ chan->monitor_timeout = L2CAP_DEFAULT_MONITOR_TO;
+
+ chan->conf_state = 0;
+ set_bit(CONF_NOT_COMPLETE, &chan->conf_state);
+
+ set_bit(FLAG_FORCE_ACTIVE, &chan->flags);
+}
+EXPORT_SYMBOL_GPL(l2cap_chan_set_defaults);
+
+static __u16 l2cap_le_rx_credits(struct l2cap_chan *chan)
+{
+ size_t sdu_len = chan->sdu ? chan->sdu->len : 0;
+
+ if (chan->mps == 0)
+ return 0;
+
+ /* If we don't know the available space in the receiver buffer, give
+ * enough credits for a full packet.
+ */
+ if (chan->rx_avail == -1)
+ return (chan->imtu / chan->mps) + 1;
+
+ /* If we know how much space is available in the receive buffer, give
+ * out as many credits as would fill the buffer.
+ */
+ if (chan->rx_avail <= sdu_len)
+ return 0;
+
+ return DIV_ROUND_UP(chan->rx_avail - sdu_len, chan->mps);
+}
+
+static void l2cap_le_flowctl_init(struct l2cap_chan *chan, u16 tx_credits)
+{
+ chan->sdu = NULL;
+ chan->sdu_last_frag = NULL;
+ chan->sdu_len = 0;
+ chan->tx_credits = tx_credits;
+ /* Derive MPS from connection MTU to stop HCI fragmentation */
+ chan->mps = min_t(u16, chan->imtu, chan->conn->mtu - L2CAP_HDR_SIZE);
+ chan->rx_credits = l2cap_le_rx_credits(chan);
+
+ skb_queue_head_init(&chan->tx_q);
+}
+
+static void l2cap_ecred_init(struct l2cap_chan *chan, u16 tx_credits)
+{
+ l2cap_le_flowctl_init(chan, tx_credits);
+
+ /* L2CAP implementations shall support a minimum MPS of 64 octets */
+ if (chan->mps < L2CAP_ECRED_MIN_MPS) {
+ chan->mps = L2CAP_ECRED_MIN_MPS;
+ chan->rx_credits = l2cap_le_rx_credits(chan);
+ }
+}
+
+void __l2cap_chan_add(struct l2cap_conn *conn, struct l2cap_chan *chan)
+{
+ BT_DBG("conn %p, psm 0x%2.2x, dcid 0x%4.4x", conn,
+ __le16_to_cpu(chan->psm), chan->dcid);
+
+ conn->disc_reason = HCI_ERROR_REMOTE_USER_TERM;
+
+ chan->conn = conn;
+
+ switch (chan->chan_type) {
+ case L2CAP_CHAN_CONN_ORIENTED:
+ /* Alloc CID for connection-oriented socket */
+ chan->scid = l2cap_alloc_cid(conn);
+ if (conn->hcon->type == ACL_LINK)
+ chan->omtu = L2CAP_DEFAULT_MTU;
+ break;
+
+ case L2CAP_CHAN_CONN_LESS:
+ /* Connectionless socket */
+ chan->scid = L2CAP_CID_CONN_LESS;
+ chan->dcid = L2CAP_CID_CONN_LESS;
+ chan->omtu = L2CAP_DEFAULT_MTU;
+ break;
+
+ case L2CAP_CHAN_FIXED:
+ /* Caller will set CID and CID specific MTU values */
+ break;
+
+ default:
+ /* Raw socket can send/recv signalling messages only */
+ chan->scid = L2CAP_CID_SIGNALING;
+ chan->dcid = L2CAP_CID_SIGNALING;
+ chan->omtu = L2CAP_DEFAULT_MTU;
+ }
+
+ chan->local_id = L2CAP_BESTEFFORT_ID;
+ chan->local_stype = L2CAP_SERV_BESTEFFORT;
+ chan->local_msdu = L2CAP_DEFAULT_MAX_SDU_SIZE;
+ chan->local_sdu_itime = L2CAP_DEFAULT_SDU_ITIME;
+ chan->local_acc_lat = L2CAP_DEFAULT_ACC_LAT;
+ chan->local_flush_to = L2CAP_EFS_DEFAULT_FLUSH_TO;
+
+ l2cap_chan_hold(chan);
+
+ /* Only keep a reference for fixed channels if they requested it */
+ if (chan->chan_type != L2CAP_CHAN_FIXED ||
+ test_bit(FLAG_HOLD_HCI_CONN, &chan->flags))
+ hci_conn_hold(conn->hcon);
+
+ /* Append to the list since the order matters for ECRED */
+ list_add_tail(&chan->list, &conn->chan_l);
+}
+
+void l2cap_chan_add(struct l2cap_conn *conn, struct l2cap_chan *chan)
+{
+ mutex_lock(&conn->lock);
+ __l2cap_chan_add(conn, chan);
+ mutex_unlock(&conn->lock);
+}
+
+void l2cap_chan_del(struct l2cap_chan *chan, int err)
+{
+ struct l2cap_conn *conn = chan->conn;
+
+ __clear_chan_timer(chan);
+
+ BT_DBG("chan %p, conn %p, err %d, state %s", chan, conn, err,
+ state_to_string(chan->state));
+
+ chan->ops->teardown(chan, err);
+
+ if (conn) {
+ /* Delete from channel list */
+ list_del(&chan->list);
+
+ l2cap_chan_put(chan);
+
+ chan->conn = NULL;
+
+ /* Reference was only held for non-fixed channels or
+ * fixed channels that explicitly requested it using the
+ * FLAG_HOLD_HCI_CONN flag.
+ */
+ if (chan->chan_type != L2CAP_CHAN_FIXED ||
+ test_bit(FLAG_HOLD_HCI_CONN, &chan->flags))
+ hci_conn_drop(conn->hcon);
+ }
+
+ if (test_bit(CONF_NOT_COMPLETE, &chan->conf_state))
+ return;
+
+ switch (chan->mode) {
+ case L2CAP_MODE_BASIC:
+ break;
+
+ case L2CAP_MODE_LE_FLOWCTL:
+ case L2CAP_MODE_EXT_FLOWCTL:
+ skb_queue_purge(&chan->tx_q);
+ break;
+
+ case L2CAP_MODE_ERTM:
+ __clear_retrans_timer(chan);
+ __clear_monitor_timer(chan);
+ __clear_ack_timer(chan);
+
+ skb_queue_purge(&chan->srej_q);
+
+ l2cap_seq_list_free(&chan->srej_list);
+ l2cap_seq_list_free(&chan->retrans_list);
+ fallthrough;
+
+ case L2CAP_MODE_STREAMING:
+ skb_queue_purge(&chan->tx_q);
+ break;
+ }
+}
+EXPORT_SYMBOL_GPL(l2cap_chan_del);
+
+static void __l2cap_chan_list_id(struct l2cap_conn *conn, u16 id,
+ l2cap_chan_func_t func, void *data)
+{
+ struct l2cap_chan *chan, *l;
+
+ list_for_each_entry_safe(chan, l, &conn->chan_l, list) {
+ if (chan->ident == id)
+ func(chan, data);
+ }
+}
+
+static void __l2cap_chan_list(struct l2cap_conn *conn, l2cap_chan_func_t func,
+ void *data)
+{
+ struct l2cap_chan *chan;
+
+ list_for_each_entry(chan, &conn->chan_l, list) {
+ func(chan, data);
+ }
+}
+
+void l2cap_chan_list(struct l2cap_conn *conn, l2cap_chan_func_t func,
+ void *data)
+{
+ if (!conn)
+ return;
+
+ mutex_lock(&conn->lock);
+ __l2cap_chan_list(conn, func, data);
+ mutex_unlock(&conn->lock);
+}
+
+EXPORT_SYMBOL_GPL(l2cap_chan_list);
+
+static void l2cap_conn_update_id_addr(struct work_struct *work)
+{
+ struct l2cap_conn *conn = container_of(work, struct l2cap_conn,
+ id_addr_timer.work);
+ struct hci_conn *hcon = conn->hcon;
+ struct l2cap_chan *chan;
+
+ mutex_lock(&conn->lock);
+
+ list_for_each_entry(chan, &conn->chan_l, list) {
+ l2cap_chan_lock(chan);
+ bacpy(&chan->dst, &hcon->dst);
+ chan->dst_type = bdaddr_dst_type(hcon);
+ l2cap_chan_unlock(chan);
+ }
+
+ mutex_unlock(&conn->lock);
+}
+
+static void l2cap_chan_le_connect_reject(struct l2cap_chan *chan)
+{
+ struct l2cap_conn *conn = chan->conn;
+ struct l2cap_le_conn_rsp rsp;
+ u16 result;
+
+ if (test_bit(FLAG_DEFER_SETUP, &chan->flags))
+ result = L2CAP_CR_LE_AUTHORIZATION;
+ else
+ result = L2CAP_CR_LE_BAD_PSM;
+
+ l2cap_state_change(chan, BT_DISCONN);
+
+ rsp.dcid = cpu_to_le16(chan->scid);
+ rsp.mtu = cpu_to_le16(chan->imtu);
+ rsp.mps = cpu_to_le16(chan->mps);
+ rsp.credits = cpu_to_le16(chan->rx_credits);
+ rsp.result = cpu_to_le16(result);
+
+ l2cap_send_cmd(conn, chan->ident, L2CAP_LE_CONN_RSP, sizeof(rsp),
+ &rsp);
+}
+
+static void l2cap_chan_ecred_connect_reject(struct l2cap_chan *chan)
+{
+ l2cap_state_change(chan, BT_DISCONN);
+
+ __l2cap_ecred_conn_rsp_defer(chan);
+}
+
+static void l2cap_chan_connect_reject(struct l2cap_chan *chan)
+{
+ struct l2cap_conn *conn = chan->conn;
+ struct l2cap_conn_rsp rsp;
+ u16 result;
+
+ if (test_bit(FLAG_DEFER_SETUP, &chan->flags))
+ result = L2CAP_CR_SEC_BLOCK;
+ else
+ result = L2CAP_CR_BAD_PSM;
+
+ l2cap_state_change(chan, BT_DISCONN);
+
+ rsp.scid = cpu_to_le16(chan->dcid);
+ rsp.dcid = cpu_to_le16(chan->scid);
+ rsp.result = cpu_to_le16(result);
+ rsp.status = cpu_to_le16(L2CAP_CS_NO_INFO);
+
+ l2cap_send_cmd(conn, chan->ident, L2CAP_CONN_RSP, sizeof(rsp), &rsp);
+}
+
+void l2cap_chan_close(struct l2cap_chan *chan, int reason)
+{
+ struct l2cap_conn *conn = chan->conn;
+
+ BT_DBG("chan %p state %s", chan, state_to_string(chan->state));
+
+ switch (chan->state) {
+ case BT_LISTEN:
+ chan->ops->teardown(chan, 0);
+ break;
+
+ case BT_CONNECTED:
+ case BT_CONFIG:
+ if (chan->chan_type == L2CAP_CHAN_CONN_ORIENTED) {
+ __set_chan_timer(chan, chan->ops->get_sndtimeo(chan));
+ l2cap_send_disconn_req(chan, reason);
+ } else
+ l2cap_chan_del(chan, reason);
+ break;
+
+ case BT_CONNECT2:
+ if (chan->chan_type == L2CAP_CHAN_CONN_ORIENTED) {
+ if (conn->hcon->type == ACL_LINK)
+ l2cap_chan_connect_reject(chan);
+ else if (conn->hcon->type == LE_LINK) {
+ switch (chan->mode) {
+ case L2CAP_MODE_LE_FLOWCTL:
+ l2cap_chan_le_connect_reject(chan);
+ break;
+ case L2CAP_MODE_EXT_FLOWCTL:
+ l2cap_chan_ecred_connect_reject(chan);
+ return;
+ }
+ }
+ }
+
+ l2cap_chan_del(chan, reason);
+ break;
+
+ case BT_CONNECT:
+ case BT_DISCONN:
+ l2cap_chan_del(chan, reason);
+ break;
+
+ default:
+ chan->ops->teardown(chan, 0);
+ break;
+ }
+}
+EXPORT_SYMBOL(l2cap_chan_close);
+
+static inline u8 l2cap_get_auth_type(struct l2cap_chan *chan)
+{
+ switch (chan->chan_type) {
+ case L2CAP_CHAN_RAW:
+ switch (chan->sec_level) {
+ case BT_SECURITY_HIGH:
+ case BT_SECURITY_FIPS:
+ return HCI_AT_DEDICATED_BONDING_MITM;
+ case BT_SECURITY_MEDIUM:
+ return HCI_AT_DEDICATED_BONDING;
+ default:
+ return HCI_AT_NO_BONDING;
+ }
+ break;
+ case L2CAP_CHAN_CONN_LESS:
+ if (chan->psm == cpu_to_le16(L2CAP_PSM_3DSP)) {
+ if (chan->sec_level == BT_SECURITY_LOW)
+ chan->sec_level = BT_SECURITY_SDP;
+ }
+ if (chan->sec_level == BT_SECURITY_HIGH ||
+ chan->sec_level == BT_SECURITY_FIPS)
+ return HCI_AT_NO_BONDING_MITM;
+ else
+ return HCI_AT_NO_BONDING;
+ break;
+ case L2CAP_CHAN_CONN_ORIENTED:
+ if (chan->psm == cpu_to_le16(L2CAP_PSM_SDP)) {
+ if (chan->sec_level == BT_SECURITY_LOW)
+ chan->sec_level = BT_SECURITY_SDP;
+
+ if (chan->sec_level == BT_SECURITY_HIGH ||
+ chan->sec_level == BT_SECURITY_FIPS)
+ return HCI_AT_NO_BONDING_MITM;
+ else
+ return HCI_AT_NO_BONDING;
+ }
+ fallthrough;
+
+ default:
+ switch (chan->sec_level) {
+ case BT_SECURITY_HIGH:
+ case BT_SECURITY_FIPS:
+ return HCI_AT_GENERAL_BONDING_MITM;
+ case BT_SECURITY_MEDIUM:
+ return HCI_AT_GENERAL_BONDING;
+ default:
+ return HCI_AT_NO_BONDING;
+ }
+ break;
+ }
+}
+
+/* Service level security */
+int l2cap_chan_check_security(struct l2cap_chan *chan, bool initiator)
+{
+ struct l2cap_conn *conn = chan->conn;
+ __u8 auth_type;
+
+ if (conn->hcon->type == LE_LINK)
+ return smp_conn_security(conn->hcon, chan->sec_level);
+
+ auth_type = l2cap_get_auth_type(chan);
+
+ return hci_conn_security(conn->hcon, chan->sec_level, auth_type,
+ initiator);
+}
+
+static u8 l2cap_get_ident(struct l2cap_conn *conn)
+{
+ u8 id;
+
+ /* Get next available identificator.
+ * 1 - 128 are used by kernel.
+ * 129 - 199 are reserved.
+ * 200 - 254 are used by utilities like l2ping, etc.
+ */
+
+ mutex_lock(&conn->ident_lock);
+
+ if (++conn->tx_ident > 128)
+ conn->tx_ident = 1;
+
+ id = conn->tx_ident;
+
+ mutex_unlock(&conn->ident_lock);
+
+ return id;
+}
+
+static void l2cap_send_acl(struct l2cap_conn *conn, struct sk_buff *skb,
+ u8 flags)
+{
+ /* Check if the hcon still valid before attempting to send */
+ if (hci_conn_valid(conn->hcon->hdev, conn->hcon))
+ hci_send_acl(conn->hchan, skb, flags);
+ else
+ kfree_skb(skb);
+}
+
+static void l2cap_send_cmd(struct l2cap_conn *conn, u8 ident, u8 code, u16 len,
+ void *data)
+{
+ struct sk_buff *skb = l2cap_build_cmd(conn, code, ident, len, data);
+ u8 flags;
+
+ BT_DBG("code 0x%2.2x", code);
+
+ if (!skb)
+ return;
+
+ /* Use NO_FLUSH if supported or we have an LE link (which does
+ * not support auto-flushing packets) */
+ if (lmp_no_flush_capable(conn->hcon->hdev) ||
+ conn->hcon->type == LE_LINK)
+ flags = ACL_START_NO_FLUSH;
+ else
+ flags = ACL_START;
+
+ bt_cb(skb)->force_active = BT_POWER_FORCE_ACTIVE_ON;
+ skb->priority = HCI_PRIO_MAX;
+
+ l2cap_send_acl(conn, skb, flags);
+}
+
+static void l2cap_do_send(struct l2cap_chan *chan, struct sk_buff *skb)
+{
+ struct hci_conn *hcon = chan->conn->hcon;
+ u16 flags;
+
+ BT_DBG("chan %p, skb %p len %d priority %u", chan, skb, skb->len,
+ skb->priority);
+
+ /* Use NO_FLUSH for LE links (where this is the only option) or
+ * if the BR/EDR link supports it and flushing has not been
+ * explicitly requested (through FLAG_FLUSHABLE).
+ */
+ if (hcon->type == LE_LINK ||
+ (!test_bit(FLAG_FLUSHABLE, &chan->flags) &&
+ lmp_no_flush_capable(hcon->hdev)))
+ flags = ACL_START_NO_FLUSH;
+ else
+ flags = ACL_START;
+
+ bt_cb(skb)->force_active = test_bit(FLAG_FORCE_ACTIVE, &chan->flags);
+ hci_send_acl(chan->conn->hchan, skb, flags);
+}
+
+static void __unpack_enhanced_control(u16 enh, struct l2cap_ctrl *control)
+{
+ control->reqseq = (enh & L2CAP_CTRL_REQSEQ) >> L2CAP_CTRL_REQSEQ_SHIFT;
+ control->final = (enh & L2CAP_CTRL_FINAL) >> L2CAP_CTRL_FINAL_SHIFT;
+
+ if (enh & L2CAP_CTRL_FRAME_TYPE) {
+ /* S-Frame */
+ control->sframe = 1;
+ control->poll = (enh & L2CAP_CTRL_POLL) >> L2CAP_CTRL_POLL_SHIFT;
+ control->super = (enh & L2CAP_CTRL_SUPERVISE) >> L2CAP_CTRL_SUPER_SHIFT;
+
+ control->sar = 0;
+ control->txseq = 0;
+ } else {
+ /* I-Frame */
+ control->sframe = 0;
+ control->sar = (enh & L2CAP_CTRL_SAR) >> L2CAP_CTRL_SAR_SHIFT;
+ control->txseq = (enh & L2CAP_CTRL_TXSEQ) >> L2CAP_CTRL_TXSEQ_SHIFT;
+
+ control->poll = 0;
+ control->super = 0;
+ }
+}
+
+static void __unpack_extended_control(u32 ext, struct l2cap_ctrl *control)
+{
+ control->reqseq = (ext & L2CAP_EXT_CTRL_REQSEQ) >> L2CAP_EXT_CTRL_REQSEQ_SHIFT;
+ control->final = (ext & L2CAP_EXT_CTRL_FINAL) >> L2CAP_EXT_CTRL_FINAL_SHIFT;
+
+ if (ext & L2CAP_EXT_CTRL_FRAME_TYPE) {
+ /* S-Frame */
+ control->sframe = 1;
+ control->poll = (ext & L2CAP_EXT_CTRL_POLL) >> L2CAP_EXT_CTRL_POLL_SHIFT;
+ control->super = (ext & L2CAP_EXT_CTRL_SUPERVISE) >> L2CAP_EXT_CTRL_SUPER_SHIFT;
+
+ control->sar = 0;
+ control->txseq = 0;
+ } else {
+ /* I-Frame */
+ control->sframe = 0;
+ control->sar = (ext & L2CAP_EXT_CTRL_SAR) >> L2CAP_EXT_CTRL_SAR_SHIFT;
+ control->txseq = (ext & L2CAP_EXT_CTRL_TXSEQ) >> L2CAP_EXT_CTRL_TXSEQ_SHIFT;
+
+ control->poll = 0;
+ control->super = 0;
+ }
+}
+
+static inline void __unpack_control(struct l2cap_chan *chan,
+ struct sk_buff *skb)
+{
+ if (test_bit(FLAG_EXT_CTRL, &chan->flags)) {
+ __unpack_extended_control(get_unaligned_le32(skb->data),
+ &bt_cb(skb)->l2cap);
+ skb_pull(skb, L2CAP_EXT_CTRL_SIZE);
+ } else {
+ __unpack_enhanced_control(get_unaligned_le16(skb->data),
+ &bt_cb(skb)->l2cap);
+ skb_pull(skb, L2CAP_ENH_CTRL_SIZE);
+ }
+}
+
+static u32 __pack_extended_control(struct l2cap_ctrl *control)
+{
+ u32 packed;
+
+ packed = control->reqseq << L2CAP_EXT_CTRL_REQSEQ_SHIFT;
+ packed |= control->final << L2CAP_EXT_CTRL_FINAL_SHIFT;
+
+ if (control->sframe) {
+ packed |= control->poll << L2CAP_EXT_CTRL_POLL_SHIFT;
+ packed |= control->super << L2CAP_EXT_CTRL_SUPER_SHIFT;
+ packed |= L2CAP_EXT_CTRL_FRAME_TYPE;
+ } else {
+ packed |= control->sar << L2CAP_EXT_CTRL_SAR_SHIFT;
+ packed |= control->txseq << L2CAP_EXT_CTRL_TXSEQ_SHIFT;
+ }
+
+ return packed;
+}
+
+static u16 __pack_enhanced_control(struct l2cap_ctrl *control)
+{
+ u16 packed;
+
+ packed = control->reqseq << L2CAP_CTRL_REQSEQ_SHIFT;
+ packed |= control->final << L2CAP_CTRL_FINAL_SHIFT;
+
+ if (control->sframe) {
+ packed |= control->poll << L2CAP_CTRL_POLL_SHIFT;
+ packed |= control->super << L2CAP_CTRL_SUPER_SHIFT;
+ packed |= L2CAP_CTRL_FRAME_TYPE;
+ } else {
+ packed |= control->sar << L2CAP_CTRL_SAR_SHIFT;
+ packed |= control->txseq << L2CAP_CTRL_TXSEQ_SHIFT;
+ }
+
+ return packed;
+}
+
+static inline void __pack_control(struct l2cap_chan *chan,
+ struct l2cap_ctrl *control,
+ struct sk_buff *skb)
+{
+ if (test_bit(FLAG_EXT_CTRL, &chan->flags)) {
+ put_unaligned_le32(__pack_extended_control(control),
+ skb->data + L2CAP_HDR_SIZE);
+ } else {
+ put_unaligned_le16(__pack_enhanced_control(control),
+ skb->data + L2CAP_HDR_SIZE);
+ }
+}
+
+static inline unsigned int __ertm_hdr_size(struct l2cap_chan *chan)
+{
+ if (test_bit(FLAG_EXT_CTRL, &chan->flags))
+ return L2CAP_EXT_HDR_SIZE;
+ else
+ return L2CAP_ENH_HDR_SIZE;
+}
+
+static struct sk_buff *l2cap_create_sframe_pdu(struct l2cap_chan *chan,
+ u32 control)
+{
+ struct sk_buff *skb;
+ struct l2cap_hdr *lh;
+ int hlen = __ertm_hdr_size(chan);
+
+ if (chan->fcs == L2CAP_FCS_CRC16)
+ hlen += L2CAP_FCS_SIZE;
+
+ skb = bt_skb_alloc(hlen, GFP_KERNEL);
+
+ if (!skb)
+ return ERR_PTR(-ENOMEM);
+
+ lh = skb_put(skb, L2CAP_HDR_SIZE);
+ lh->len = cpu_to_le16(hlen - L2CAP_HDR_SIZE);
+ lh->cid = cpu_to_le16(chan->dcid);
+
+ if (test_bit(FLAG_EXT_CTRL, &chan->flags))
+ put_unaligned_le32(control, skb_put(skb, L2CAP_EXT_CTRL_SIZE));
+ else
+ put_unaligned_le16(control, skb_put(skb, L2CAP_ENH_CTRL_SIZE));
+
+ if (chan->fcs == L2CAP_FCS_CRC16) {
+ u16 fcs = crc16(0, (u8 *)skb->data, skb->len);
+ put_unaligned_le16(fcs, skb_put(skb, L2CAP_FCS_SIZE));
+ }
+
+ skb->priority = HCI_PRIO_MAX;
+ return skb;
+}
+
+static void l2cap_send_sframe(struct l2cap_chan *chan,
+ struct l2cap_ctrl *control)
+{
+ struct sk_buff *skb;
+ u32 control_field;
+
+ BT_DBG("chan %p, control %p", chan, control);
+
+ if (!control->sframe)
+ return;
+
+ if (test_and_clear_bit(CONN_SEND_FBIT, &chan->conn_state) &&
+ !control->poll)
+ control->final = 1;
+
+ if (control->super == L2CAP_SUPER_RR)
+ clear_bit(CONN_RNR_SENT, &chan->conn_state);
+ else if (control->super == L2CAP_SUPER_RNR)
+ set_bit(CONN_RNR_SENT, &chan->conn_state);
+
+ if (control->super != L2CAP_SUPER_SREJ) {
+ chan->last_acked_seq = control->reqseq;
+ __clear_ack_timer(chan);
+ }
+
+ BT_DBG("reqseq %d, final %d, poll %d, super %d", control->reqseq,
+ control->final, control->poll, control->super);
+
+ if (test_bit(FLAG_EXT_CTRL, &chan->flags))
+ control_field = __pack_extended_control(control);
+ else
+ control_field = __pack_enhanced_control(control);
+
+ skb = l2cap_create_sframe_pdu(chan, control_field);
+ if (!IS_ERR(skb))
+ l2cap_do_send(chan, skb);
+}
+
+static void l2cap_send_rr_or_rnr(struct l2cap_chan *chan, bool poll)
+{
+ struct l2cap_ctrl control;
+
+ BT_DBG("chan %p, poll %d", chan, poll);
+
+ memset(&control, 0, sizeof(control));
+ control.sframe = 1;
+ control.poll = poll;
+
+ if (test_bit(CONN_LOCAL_BUSY, &chan->conn_state))
+ control.super = L2CAP_SUPER_RNR;
+ else
+ control.super = L2CAP_SUPER_RR;
+
+ control.reqseq = chan->buffer_seq;
+ l2cap_send_sframe(chan, &control);
+}
+
+static inline int __l2cap_no_conn_pending(struct l2cap_chan *chan)
+{
+ if (chan->chan_type != L2CAP_CHAN_CONN_ORIENTED)
+ return true;
+
+ return !test_bit(CONF_CONNECT_PEND, &chan->conf_state);
+}
+
+void l2cap_send_conn_req(struct l2cap_chan *chan)
+{
+ struct l2cap_conn *conn = chan->conn;
+ struct l2cap_conn_req req;
+
+ req.scid = cpu_to_le16(chan->scid);
+ req.psm = chan->psm;
+
+ chan->ident = l2cap_get_ident(conn);
+
+ set_bit(CONF_CONNECT_PEND, &chan->conf_state);
+
+ l2cap_send_cmd(conn, chan->ident, L2CAP_CONN_REQ, sizeof(req), &req);
+}
+
+static void l2cap_chan_ready(struct l2cap_chan *chan)
+{
+ /* The channel may have already been flagged as connected in
+ * case of receiving data before the L2CAP info req/rsp
+ * procedure is complete.
+ */
+ if (chan->state == BT_CONNECTED)
+ return;
+
+ /* This clears all conf flags, including CONF_NOT_COMPLETE */
+ chan->conf_state = 0;
+ __clear_chan_timer(chan);
+
+ switch (chan->mode) {
+ case L2CAP_MODE_LE_FLOWCTL:
+ case L2CAP_MODE_EXT_FLOWCTL:
+ if (!chan->tx_credits)
+ chan->ops->suspend(chan);
+ break;
+ }
+
+ chan->state = BT_CONNECTED;
+
+ chan->ops->ready(chan);
+}
+
+static void l2cap_le_connect(struct l2cap_chan *chan)
+{
+ struct l2cap_conn *conn = chan->conn;
+ struct l2cap_le_conn_req req;
+
+ if (test_and_set_bit(FLAG_LE_CONN_REQ_SENT, &chan->flags))
+ return;
+
+ if (!chan->imtu)
+ chan->imtu = chan->conn->mtu;
+
+ l2cap_le_flowctl_init(chan, 0);
+
+ memset(&req, 0, sizeof(req));
+ req.psm = chan->psm;
+ req.scid = cpu_to_le16(chan->scid);
+ req.mtu = cpu_to_le16(chan->imtu);
+ req.mps = cpu_to_le16(chan->mps);
+ req.credits = cpu_to_le16(chan->rx_credits);
+
+ chan->ident = l2cap_get_ident(conn);
+
+ l2cap_send_cmd(conn, chan->ident, L2CAP_LE_CONN_REQ,
+ sizeof(req), &req);
+}
+
+struct l2cap_ecred_conn_data {
+ struct {
+ struct l2cap_ecred_conn_req_hdr req;
+ __le16 scid[5];
+ } __packed pdu;
+ struct l2cap_chan *chan;
+ struct pid *pid;
+ int count;
+};
+
+static void l2cap_ecred_defer_connect(struct l2cap_chan *chan, void *data)
+{
+ struct l2cap_ecred_conn_data *conn = data;
+ struct pid *pid;
+
+ if (chan == conn->chan)
+ return;
+
+ if (!test_and_clear_bit(FLAG_DEFER_SETUP, &chan->flags))
+ return;
+
+ pid = chan->ops->get_peer_pid(chan);
+
+ /* Only add deferred channels with the same PID/PSM */
+ if (conn->pid != pid || chan->psm != conn->chan->psm || chan->ident ||
+ chan->mode != L2CAP_MODE_EXT_FLOWCTL || chan->state != BT_CONNECT)
+ return;
+
+ if (test_and_set_bit(FLAG_ECRED_CONN_REQ_SENT, &chan->flags))
+ return;
+
+ l2cap_ecred_init(chan, 0);
+
+ /* Set the same ident so we can match on the rsp */
+ chan->ident = conn->chan->ident;
+
+ /* Include all channels deferred */
+ conn->pdu.scid[conn->count] = cpu_to_le16(chan->scid);
+
+ conn->count++;
+}
+
+static void l2cap_ecred_connect(struct l2cap_chan *chan)
+{
+ struct l2cap_conn *conn = chan->conn;
+ struct l2cap_ecred_conn_data data;
+
+ if (test_bit(FLAG_DEFER_SETUP, &chan->flags))
+ return;
+
+ if (test_and_set_bit(FLAG_ECRED_CONN_REQ_SENT, &chan->flags))
+ return;
+
+ l2cap_ecred_init(chan, 0);
+
+ memset(&data, 0, sizeof(data));
+ data.pdu.req.psm = chan->psm;
+ data.pdu.req.mtu = cpu_to_le16(chan->imtu);
+ data.pdu.req.mps = cpu_to_le16(chan->mps);
+ data.pdu.req.credits = cpu_to_le16(chan->rx_credits);
+ data.pdu.scid[0] = cpu_to_le16(chan->scid);
+
+ chan->ident = l2cap_get_ident(conn);
+
+ data.count = 1;
+ data.chan = chan;
+ data.pid = chan->ops->get_peer_pid(chan);
+
+ __l2cap_chan_list(conn, l2cap_ecred_defer_connect, &data);
+
+ l2cap_send_cmd(conn, chan->ident, L2CAP_ECRED_CONN_REQ,
+ sizeof(data.pdu.req) + data.count * sizeof(__le16),
+ &data.pdu);
+}
+
+static void l2cap_le_start(struct l2cap_chan *chan)
+{
+ struct l2cap_conn *conn = chan->conn;
+
+ if (!smp_conn_security(conn->hcon, chan->sec_level))
+ return;
+
+ if (!chan->psm) {
+ l2cap_chan_ready(chan);
+ return;
+ }
+
+ if (chan->state == BT_CONNECT) {
+ if (chan->mode == L2CAP_MODE_EXT_FLOWCTL)
+ l2cap_ecred_connect(chan);
+ else
+ l2cap_le_connect(chan);
+ }
+}
+
+static void l2cap_start_connection(struct l2cap_chan *chan)
+{
+ if (chan->conn->hcon->type == LE_LINK) {
+ l2cap_le_start(chan);
+ } else {
+ l2cap_send_conn_req(chan);
+ }
+}
+
+static void l2cap_request_info(struct l2cap_conn *conn)
+{
+ struct l2cap_info_req req;
+
+ if (conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_SENT)
+ return;
+
+ req.type = cpu_to_le16(L2CAP_IT_FEAT_MASK);
+
+ conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_SENT;
+ conn->info_ident = l2cap_get_ident(conn);
+
+ schedule_delayed_work(&conn->info_timer, L2CAP_INFO_TIMEOUT);
+
+ l2cap_send_cmd(conn, conn->info_ident, L2CAP_INFO_REQ,
+ sizeof(req), &req);
+}
+
+static bool l2cap_check_enc_key_size(struct hci_conn *hcon,
+ struct l2cap_chan *chan)
+{
+ /* The minimum encryption key size needs to be enforced by the
+ * host stack before establishing any L2CAP connections. The
+ * specification in theory allows a minimum of 1, but to align
+ * BR/EDR and LE transports, a minimum of 7 is chosen.
+ *
+ * This check might also be called for unencrypted connections
+ * that have no key size requirements. Ensure that the link is
+ * actually encrypted before enforcing a key size.
+ */
+ int min_key_size = hcon->hdev->min_enc_key_size;
+
+ /* On FIPS security level, key size must be 16 bytes */
+ if (chan->sec_level == BT_SECURITY_FIPS)
+ min_key_size = 16;
+
+ return (!test_bit(HCI_CONN_ENCRYPT, &hcon->flags) ||
+ hcon->enc_key_size >= min_key_size);
+}
+
+static void l2cap_do_start(struct l2cap_chan *chan)
+{
+ struct l2cap_conn *conn = chan->conn;
+
+ if (conn->hcon->type == LE_LINK) {
+ l2cap_le_start(chan);
+ return;
+ }
+
+ if (!(conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_SENT)) {
+ l2cap_request_info(conn);
+ return;
+ }
+
+ if (!(conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_DONE))
+ return;
+
+ if (!l2cap_chan_check_security(chan, true) ||
+ !__l2cap_no_conn_pending(chan))
+ return;
+
+ if (l2cap_check_enc_key_size(conn->hcon, chan))
+ l2cap_start_connection(chan);
+ else
+ __set_chan_timer(chan, L2CAP_DISC_TIMEOUT);
+}
+
+static inline int l2cap_mode_supported(__u8 mode, __u32 feat_mask)
+{
+ u32 local_feat_mask = l2cap_feat_mask;
+ if (!disable_ertm)
+ local_feat_mask |= L2CAP_FEAT_ERTM | L2CAP_FEAT_STREAMING;
+
+ switch (mode) {
+ case L2CAP_MODE_ERTM:
+ return L2CAP_FEAT_ERTM & feat_mask & local_feat_mask;
+ case L2CAP_MODE_STREAMING:
+ return L2CAP_FEAT_STREAMING & feat_mask & local_feat_mask;
+ default:
+ return 0x00;
+ }
+}
+
+static void l2cap_send_disconn_req(struct l2cap_chan *chan, int err)
+{
+ struct l2cap_conn *conn = chan->conn;
+ struct l2cap_disconn_req req;
+
+ if (!conn)
+ return;
+
+ if (chan->mode == L2CAP_MODE_ERTM && chan->state == BT_CONNECTED) {
+ __clear_retrans_timer(chan);
+ __clear_monitor_timer(chan);
+ __clear_ack_timer(chan);
+ }
+
+ req.dcid = cpu_to_le16(chan->dcid);
+ req.scid = cpu_to_le16(chan->scid);
+ l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_DISCONN_REQ,
+ sizeof(req), &req);
+
+ l2cap_state_change_and_error(chan, BT_DISCONN, err);
+}
+
+/* ---- L2CAP connections ---- */
+static void l2cap_conn_start(struct l2cap_conn *conn)
+{
+ struct l2cap_chan *chan, *tmp;
+
+ BT_DBG("conn %p", conn);
+
+ list_for_each_entry_safe(chan, tmp, &conn->chan_l, list) {
+ l2cap_chan_lock(chan);
+
+ if (chan->chan_type != L2CAP_CHAN_CONN_ORIENTED) {
+ l2cap_chan_ready(chan);
+ l2cap_chan_unlock(chan);
+ continue;
+ }
+
+ if (chan->state == BT_CONNECT) {
+ if (!l2cap_chan_check_security(chan, true) ||
+ !__l2cap_no_conn_pending(chan)) {
+ l2cap_chan_unlock(chan);
+ continue;
+ }
+
+ if (!l2cap_mode_supported(chan->mode, conn->feat_mask)
+ && test_bit(CONF_STATE2_DEVICE,
+ &chan->conf_state)) {
+ l2cap_chan_close(chan, ECONNRESET);
+ l2cap_chan_unlock(chan);
+ continue;
+ }
+
+ if (l2cap_check_enc_key_size(conn->hcon, chan))
+ l2cap_start_connection(chan);
+ else
+ l2cap_chan_close(chan, ECONNREFUSED);
+
+ } else if (chan->state == BT_CONNECT2) {
+ struct l2cap_conn_rsp rsp;
+ char buf[128];
+ rsp.scid = cpu_to_le16(chan->dcid);
+ rsp.dcid = cpu_to_le16(chan->scid);
+
+ if (l2cap_chan_check_security(chan, false)) {
+ if (test_bit(FLAG_DEFER_SETUP, &chan->flags)) {
+ rsp.result = cpu_to_le16(L2CAP_CR_PEND);
+ rsp.status = cpu_to_le16(L2CAP_CS_AUTHOR_PEND);
+ chan->ops->defer(chan);
+
+ } else {
+ l2cap_state_change(chan, BT_CONFIG);
+ rsp.result = cpu_to_le16(L2CAP_CR_SUCCESS);
+ rsp.status = cpu_to_le16(L2CAP_CS_NO_INFO);
+ }
+ } else {
+ rsp.result = cpu_to_le16(L2CAP_CR_PEND);
+ rsp.status = cpu_to_le16(L2CAP_CS_AUTHEN_PEND);
+ }
+
+ l2cap_send_cmd(conn, chan->ident, L2CAP_CONN_RSP,
+ sizeof(rsp), &rsp);
+
+ if (test_bit(CONF_REQ_SENT, &chan->conf_state) ||
+ rsp.result != L2CAP_CR_SUCCESS) {
+ l2cap_chan_unlock(chan);
+ continue;
+ }
+
+ set_bit(CONF_REQ_SENT, &chan->conf_state);
+ l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ,
+ l2cap_build_conf_req(chan, buf, sizeof(buf)), buf);
+ chan->num_conf_req++;
+ }
+
+ l2cap_chan_unlock(chan);
+ }
+}
+
+static void l2cap_le_conn_ready(struct l2cap_conn *conn)
+{
+ struct hci_conn *hcon = conn->hcon;
+ struct hci_dev *hdev = hcon->hdev;
+
+ BT_DBG("%s conn %p", hdev->name, conn);
+
+ /* For outgoing pairing which doesn't necessarily have an
+ * associated socket (e.g. mgmt_pair_device).
+ */
+ if (hcon->out)
+ smp_conn_security(hcon, hcon->pending_sec_level);
+
+ /* For LE peripheral connections, make sure the connection interval
+ * is in the range of the minimum and maximum interval that has
+ * been configured for this connection. If not, then trigger
+ * the connection update procedure.
+ */
+ if (hcon->role == HCI_ROLE_SLAVE &&
+ (hcon->le_conn_interval < hcon->le_conn_min_interval ||
+ hcon->le_conn_interval > hcon->le_conn_max_interval)) {
+ struct l2cap_conn_param_update_req req;
+
+ req.min = cpu_to_le16(hcon->le_conn_min_interval);
+ req.max = cpu_to_le16(hcon->le_conn_max_interval);
+ req.latency = cpu_to_le16(hcon->le_conn_latency);
+ req.to_multiplier = cpu_to_le16(hcon->le_supv_timeout);
+
+ l2cap_send_cmd(conn, l2cap_get_ident(conn),
+ L2CAP_CONN_PARAM_UPDATE_REQ, sizeof(req), &req);
+ }
+}
+
+static void l2cap_conn_ready(struct l2cap_conn *conn)
+{
+ struct l2cap_chan *chan;
+ struct hci_conn *hcon = conn->hcon;
+
+ BT_DBG("conn %p", conn);
+
+ if (hcon->type == ACL_LINK)
+ l2cap_request_info(conn);
+
+ mutex_lock(&conn->lock);
+
+ list_for_each_entry(chan, &conn->chan_l, list) {
+
+ l2cap_chan_lock(chan);
+
+ if (hcon->type == LE_LINK) {
+ l2cap_le_start(chan);
+ } else if (chan->chan_type != L2CAP_CHAN_CONN_ORIENTED) {
+ if (conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_DONE)
+ l2cap_chan_ready(chan);
+ } else if (chan->state == BT_CONNECT) {
+ l2cap_do_start(chan);
+ }
+
+ l2cap_chan_unlock(chan);
+ }
+
+ mutex_unlock(&conn->lock);
+
+ if (hcon->type == LE_LINK)
+ l2cap_le_conn_ready(conn);
+
+ queue_work(hcon->hdev->workqueue, &conn->pending_rx_work);
+}
+
+/* Notify sockets that we cannot guaranty reliability anymore */
+static void l2cap_conn_unreliable(struct l2cap_conn *conn, int err)
+{
+ struct l2cap_chan *chan;
+
+ BT_DBG("conn %p", conn);
+
+ list_for_each_entry(chan, &conn->chan_l, list) {
+ if (test_bit(FLAG_FORCE_RELIABLE, &chan->flags))
+ l2cap_chan_set_err(chan, err);
+ }
+}
+
+static void l2cap_info_timeout(struct work_struct *work)
+{
+ struct l2cap_conn *conn = container_of(work, struct l2cap_conn,
+ info_timer.work);
+
+ conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_DONE;
+ conn->info_ident = 0;
+
+ mutex_lock(&conn->lock);
+ l2cap_conn_start(conn);
+ mutex_unlock(&conn->lock);
+}
+
+/*
+ * l2cap_user
+ * External modules can register l2cap_user objects on l2cap_conn. The ->probe
+ * callback is called during registration. The ->remove callback is called
+ * during unregistration.
+ * An l2cap_user object can either be explicitly unregistered or when the
+ * underlying l2cap_conn object is deleted. This guarantees that l2cap->hcon,
+ * l2cap->hchan, .. are valid as long as the remove callback hasn't been called.
+ * External modules must own a reference to the l2cap_conn object if they intend
+ * to call l2cap_unregister_user(). The l2cap_conn object might get destroyed at
+ * any time if they don't.
+ */
+
+int l2cap_register_user(struct l2cap_conn *conn, struct l2cap_user *user)
+{
+ struct hci_dev *hdev = conn->hcon->hdev;
+ int ret;
+
+ /* We need to check whether l2cap_conn is registered. If it is not, we
+ * must not register the l2cap_user. l2cap_conn_del() is unregisters
+ * l2cap_conn objects, but doesn't provide its own locking. Instead, it
+ * relies on the parent hci_conn object to be locked. This itself relies
+ * on the hci_dev object to be locked. So we must lock the hci device
+ * here, too. */
+
+ hci_dev_lock(hdev);
+
+ if (!list_empty(&user->list)) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ /* conn->hchan is NULL after l2cap_conn_del() was called */
+ if (!conn->hchan) {
+ ret = -ENODEV;
+ goto out_unlock;
+ }
+
+ ret = user->probe(conn, user);
+ if (ret)
+ goto out_unlock;
+
+ list_add(&user->list, &conn->users);
+ ret = 0;
+
+out_unlock:
+ hci_dev_unlock(hdev);
+ return ret;
+}
+EXPORT_SYMBOL(l2cap_register_user);
+
+void l2cap_unregister_user(struct l2cap_conn *conn, struct l2cap_user *user)
+{
+ struct hci_dev *hdev = conn->hcon->hdev;
+
+ hci_dev_lock(hdev);
+
+ if (list_empty(&user->list))
+ goto out_unlock;
+
+ list_del_init(&user->list);
+ user->remove(conn, user);
+
+out_unlock:
+ hci_dev_unlock(hdev);
+}
+EXPORT_SYMBOL(l2cap_unregister_user);
+
+static void l2cap_unregister_all_users(struct l2cap_conn *conn)
+{
+ struct l2cap_user *user;
+
+ while (!list_empty(&conn->users)) {
+ user = list_first_entry(&conn->users, struct l2cap_user, list);
+ list_del_init(&user->list);
+ user->remove(conn, user);
+ }
+}
+
+static void l2cap_conn_del(struct hci_conn *hcon, int err)
+{
+ struct l2cap_conn *conn = hcon->l2cap_data;
+ struct l2cap_chan *chan, *l;
+
+ if (!conn)
+ return;
+
+ BT_DBG("hcon %p conn %p, err %d", hcon, conn, err);
+
+ mutex_lock(&conn->lock);
+
+ kfree_skb(conn->rx_skb);
+
+ skb_queue_purge(&conn->pending_rx);
+
+ /* We can not call flush_work(&conn->pending_rx_work) here since we
+ * might block if we are running on a worker from the same workqueue
+ * pending_rx_work is waiting on.
+ */
+ if (work_pending(&conn->pending_rx_work))
+ cancel_work_sync(&conn->pending_rx_work);
+
+ cancel_delayed_work_sync(&conn->id_addr_timer);
+
+ l2cap_unregister_all_users(conn);
+
+ /* Force the connection to be immediately dropped */
+ hcon->disc_timeout = 0;
+
+ /* Kill channels */
+ list_for_each_entry_safe(chan, l, &conn->chan_l, list) {
+ l2cap_chan_hold(chan);
+ l2cap_chan_lock(chan);
+
+ l2cap_chan_del(chan, err);
+
+ chan->ops->close(chan);
+
+ l2cap_chan_unlock(chan);
+ l2cap_chan_put(chan);
+ }
+
+ if (conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_SENT)
+ cancel_delayed_work_sync(&conn->info_timer);
+
+ hci_chan_del(conn->hchan);
+ conn->hchan = NULL;
+
+ hcon->l2cap_data = NULL;
+ mutex_unlock(&conn->lock);
+ l2cap_conn_put(conn);
+}
+
+static void l2cap_conn_free(struct kref *ref)
+{
+ struct l2cap_conn *conn = container_of(ref, struct l2cap_conn, ref);
+
+ hci_conn_put(conn->hcon);
+ kfree(conn);
+}
+
+struct l2cap_conn *l2cap_conn_get(struct l2cap_conn *conn)
+{
+ kref_get(&conn->ref);
+ return conn;
+}
+EXPORT_SYMBOL(l2cap_conn_get);
+
+void l2cap_conn_put(struct l2cap_conn *conn)
+{
+ kref_put(&conn->ref, l2cap_conn_free);
+}
+EXPORT_SYMBOL(l2cap_conn_put);
+
+/* ---- Socket interface ---- */
+
+/* Find socket with psm and source / destination bdaddr.
+ * Returns closest match.
+ */
+static struct l2cap_chan *l2cap_global_chan_by_psm(int state, __le16 psm,
+ bdaddr_t *src,
+ bdaddr_t *dst,
+ u8 link_type)
+{
+ struct l2cap_chan *c, *tmp, *c1 = NULL;
+
+ read_lock(&chan_list_lock);
+
+ list_for_each_entry_safe(c, tmp, &chan_list, global_l) {
+ if (state && c->state != state)
+ continue;
+
+ if (link_type == ACL_LINK && c->src_type != BDADDR_BREDR)
+ continue;
+
+ if (link_type == LE_LINK && c->src_type == BDADDR_BREDR)
+ continue;
+
+ if (c->chan_type != L2CAP_CHAN_FIXED && c->psm == psm) {
+ int src_match, dst_match;
+ int src_any, dst_any;
+
+ /* Exact match. */
+ src_match = !bacmp(&c->src, src);
+ dst_match = !bacmp(&c->dst, dst);
+ if (src_match && dst_match) {
+ if (!l2cap_chan_hold_unless_zero(c))
+ continue;
+
+ read_unlock(&chan_list_lock);
+ return c;
+ }
+
+ /* Closest match */
+ src_any = !bacmp(&c->src, BDADDR_ANY);
+ dst_any = !bacmp(&c->dst, BDADDR_ANY);
+ if ((src_match && dst_any) || (src_any && dst_match) ||
+ (src_any && dst_any))
+ c1 = c;
+ }
+ }
+
+ if (c1)
+ c1 = l2cap_chan_hold_unless_zero(c1);
+
+ read_unlock(&chan_list_lock);
+
+ return c1;
+}
+
+static void l2cap_monitor_timeout(struct work_struct *work)
+{
+ struct l2cap_chan *chan = container_of(work, struct l2cap_chan,
+ monitor_timer.work);
+
+ BT_DBG("chan %p", chan);
+
+ l2cap_chan_lock(chan);
+
+ if (!chan->conn) {
+ l2cap_chan_unlock(chan);
+ l2cap_chan_put(chan);
+ return;
+ }
+
+ l2cap_tx(chan, NULL, NULL, L2CAP_EV_MONITOR_TO);
+
+ l2cap_chan_unlock(chan);
+ l2cap_chan_put(chan);
+}
+
+static void l2cap_retrans_timeout(struct work_struct *work)
+{
+ struct l2cap_chan *chan = container_of(work, struct l2cap_chan,
+ retrans_timer.work);
+
+ BT_DBG("chan %p", chan);
+
+ l2cap_chan_lock(chan);
+
+ if (!chan->conn) {
+ l2cap_chan_unlock(chan);
+ l2cap_chan_put(chan);
+ return;
+ }
+
+ l2cap_tx(chan, NULL, NULL, L2CAP_EV_RETRANS_TO);
+ l2cap_chan_unlock(chan);
+ l2cap_chan_put(chan);
+}
+
+static void l2cap_streaming_send(struct l2cap_chan *chan,
+ struct sk_buff_head *skbs)
+{
+ struct sk_buff *skb;
+ struct l2cap_ctrl *control;
+
+ BT_DBG("chan %p, skbs %p", chan, skbs);
+
+ skb_queue_splice_tail_init(skbs, &chan->tx_q);
+
+ while (!skb_queue_empty(&chan->tx_q)) {
+
+ skb = skb_dequeue(&chan->tx_q);
+
+ bt_cb(skb)->l2cap.retries = 1;
+ control = &bt_cb(skb)->l2cap;
+
+ control->reqseq = 0;
+ control->txseq = chan->next_tx_seq;
+
+ __pack_control(chan, control, skb);
+
+ if (chan->fcs == L2CAP_FCS_CRC16) {
+ u16 fcs = crc16(0, (u8 *) skb->data, skb->len);
+ put_unaligned_le16(fcs, skb_put(skb, L2CAP_FCS_SIZE));
+ }
+
+ l2cap_do_send(chan, skb);
+
+ BT_DBG("Sent txseq %u", control->txseq);
+
+ chan->next_tx_seq = __next_seq(chan, chan->next_tx_seq);
+ chan->frames_sent++;
+ }
+}
+
+static int l2cap_ertm_send(struct l2cap_chan *chan)
+{
+ struct sk_buff *skb, *tx_skb;
+ struct l2cap_ctrl *control;
+ int sent = 0;
+
+ BT_DBG("chan %p", chan);
+
+ if (chan->state != BT_CONNECTED)
+ return -ENOTCONN;
+
+ if (test_bit(CONN_REMOTE_BUSY, &chan->conn_state))
+ return 0;
+
+ while (chan->tx_send_head &&
+ chan->unacked_frames < chan->remote_tx_win &&
+ chan->tx_state == L2CAP_TX_STATE_XMIT) {
+
+ skb = chan->tx_send_head;
+
+ bt_cb(skb)->l2cap.retries = 1;
+ control = &bt_cb(skb)->l2cap;
+
+ if (test_and_clear_bit(CONN_SEND_FBIT, &chan->conn_state))
+ control->final = 1;
+
+ control->reqseq = chan->buffer_seq;
+ chan->last_acked_seq = chan->buffer_seq;
+ control->txseq = chan->next_tx_seq;
+
+ __pack_control(chan, control, skb);
+
+ if (chan->fcs == L2CAP_FCS_CRC16) {
+ u16 fcs = crc16(0, (u8 *) skb->data, skb->len);
+ put_unaligned_le16(fcs, skb_put(skb, L2CAP_FCS_SIZE));
+ }
+
+ /* Clone after data has been modified. Data is assumed to be
+ read-only (for locking purposes) on cloned sk_buffs.
+ */
+ tx_skb = skb_clone(skb, GFP_KERNEL);
+
+ if (!tx_skb)
+ break;
+
+ __set_retrans_timer(chan);
+
+ chan->next_tx_seq = __next_seq(chan, chan->next_tx_seq);
+ chan->unacked_frames++;
+ chan->frames_sent++;
+ sent++;
+
+ if (skb_queue_is_last(&chan->tx_q, skb))
+ chan->tx_send_head = NULL;
+ else
+ chan->tx_send_head = skb_queue_next(&chan->tx_q, skb);
+
+ l2cap_do_send(chan, tx_skb);
+ BT_DBG("Sent txseq %u", control->txseq);
+ }
+
+ BT_DBG("Sent %d, %u unacked, %u in ERTM queue", sent,
+ chan->unacked_frames, skb_queue_len(&chan->tx_q));
+
+ return sent;
+}
+
+static void l2cap_ertm_resend(struct l2cap_chan *chan)
+{
+ struct l2cap_ctrl control;
+ struct sk_buff *skb;
+ struct sk_buff *tx_skb;
+ u16 seq;
+
+ BT_DBG("chan %p", chan);
+
+ if (test_bit(CONN_REMOTE_BUSY, &chan->conn_state))
+ return;
+
+ while (chan->retrans_list.head != L2CAP_SEQ_LIST_CLEAR) {
+ seq = l2cap_seq_list_pop(&chan->retrans_list);
+
+ skb = l2cap_ertm_seq_in_queue(&chan->tx_q, seq);
+ if (!skb) {
+ BT_DBG("Error: Can't retransmit seq %d, frame missing",
+ seq);
+ continue;
+ }
+
+ bt_cb(skb)->l2cap.retries++;
+ control = bt_cb(skb)->l2cap;
+
+ if (chan->max_tx != 0 &&
+ bt_cb(skb)->l2cap.retries > chan->max_tx) {
+ BT_DBG("Retry limit exceeded (%d)", chan->max_tx);
+ l2cap_send_disconn_req(chan, ECONNRESET);
+ l2cap_seq_list_clear(&chan->retrans_list);
+ break;
+ }
+
+ control.reqseq = chan->buffer_seq;
+ if (test_and_clear_bit(CONN_SEND_FBIT, &chan->conn_state))
+ control.final = 1;
+ else
+ control.final = 0;
+
+ if (skb_cloned(skb)) {
+ /* Cloned sk_buffs are read-only, so we need a
+ * writeable copy
+ */
+ tx_skb = skb_copy(skb, GFP_KERNEL);
+ } else {
+ tx_skb = skb_clone(skb, GFP_KERNEL);
+ }
+
+ if (!tx_skb) {
+ l2cap_seq_list_clear(&chan->retrans_list);
+ break;
+ }
+
+ /* Update skb contents */
+ if (test_bit(FLAG_EXT_CTRL, &chan->flags)) {
+ put_unaligned_le32(__pack_extended_control(&control),
+ tx_skb->data + L2CAP_HDR_SIZE);
+ } else {
+ put_unaligned_le16(__pack_enhanced_control(&control),
+ tx_skb->data + L2CAP_HDR_SIZE);
+ }
+
+ /* Update FCS */
+ if (chan->fcs == L2CAP_FCS_CRC16) {
+ u16 fcs = crc16(0, (u8 *) tx_skb->data,
+ tx_skb->len - L2CAP_FCS_SIZE);
+ put_unaligned_le16(fcs, skb_tail_pointer(tx_skb) -
+ L2CAP_FCS_SIZE);
+ }
+
+ l2cap_do_send(chan, tx_skb);
+
+ BT_DBG("Resent txseq %d", control.txseq);
+
+ chan->last_acked_seq = chan->buffer_seq;
+ }
+}
+
+static void l2cap_retransmit(struct l2cap_chan *chan,
+ struct l2cap_ctrl *control)
+{
+ BT_DBG("chan %p, control %p", chan, control);
+
+ l2cap_seq_list_append(&chan->retrans_list, control->reqseq);
+ l2cap_ertm_resend(chan);
+}
+
+static void l2cap_retransmit_all(struct l2cap_chan *chan,
+ struct l2cap_ctrl *control)
+{
+ struct sk_buff *skb;
+
+ BT_DBG("chan %p, control %p", chan, control);
+
+ if (control->poll)
+ set_bit(CONN_SEND_FBIT, &chan->conn_state);
+
+ l2cap_seq_list_clear(&chan->retrans_list);
+
+ if (test_bit(CONN_REMOTE_BUSY, &chan->conn_state))
+ return;
+
+ if (chan->unacked_frames) {
+ skb_queue_walk(&chan->tx_q, skb) {
+ if (bt_cb(skb)->l2cap.txseq == control->reqseq ||
+ skb == chan->tx_send_head)
+ break;
+ }
+
+ skb_queue_walk_from(&chan->tx_q, skb) {
+ if (skb == chan->tx_send_head)
+ break;
+
+ l2cap_seq_list_append(&chan->retrans_list,
+ bt_cb(skb)->l2cap.txseq);
+ }
+
+ l2cap_ertm_resend(chan);
+ }
+}
+
+static void l2cap_send_ack(struct l2cap_chan *chan)
+{
+ struct l2cap_ctrl control;
+ u16 frames_to_ack = __seq_offset(chan, chan->buffer_seq,
+ chan->last_acked_seq);
+ int threshold;
+
+ BT_DBG("chan %p last_acked_seq %d buffer_seq %d",
+ chan, chan->last_acked_seq, chan->buffer_seq);
+
+ memset(&control, 0, sizeof(control));
+ control.sframe = 1;
+
+ if (test_bit(CONN_LOCAL_BUSY, &chan->conn_state) &&
+ chan->rx_state == L2CAP_RX_STATE_RECV) {
+ __clear_ack_timer(chan);
+ control.super = L2CAP_SUPER_RNR;
+ control.reqseq = chan->buffer_seq;
+ l2cap_send_sframe(chan, &control);
+ } else {
+ if (!test_bit(CONN_REMOTE_BUSY, &chan->conn_state)) {
+ l2cap_ertm_send(chan);
+ /* If any i-frames were sent, they included an ack */
+ if (chan->buffer_seq == chan->last_acked_seq)
+ frames_to_ack = 0;
+ }
+
+ /* Ack now if the window is 3/4ths full.
+ * Calculate without mul or div
+ */
+ threshold = chan->ack_win;
+ threshold += threshold << 1;
+ threshold >>= 2;
+
+ BT_DBG("frames_to_ack %u, threshold %d", frames_to_ack,
+ threshold);
+
+ if (frames_to_ack >= threshold) {
+ __clear_ack_timer(chan);
+ control.super = L2CAP_SUPER_RR;
+ control.reqseq = chan->buffer_seq;
+ l2cap_send_sframe(chan, &control);
+ frames_to_ack = 0;
+ }
+
+ if (frames_to_ack)
+ __set_ack_timer(chan);
+ }
+}
+
+static inline int l2cap_skbuff_fromiovec(struct l2cap_chan *chan,
+ struct msghdr *msg, int len,
+ int count, struct sk_buff *skb)
+{
+ struct l2cap_conn *conn = chan->conn;
+ struct sk_buff **frag;
+ int sent = 0;
+
+ if (!copy_from_iter_full(skb_put(skb, count), count, &msg->msg_iter))
+ return -EFAULT;
+
+ sent += count;
+ len -= count;
+
+ /* Continuation fragments (no L2CAP header) */
+ frag = &skb_shinfo(skb)->frag_list;
+ while (len) {
+ struct sk_buff *tmp;
+
+ count = min_t(unsigned int, conn->mtu, len);
+
+ tmp = chan->ops->alloc_skb(chan, 0, count,
+ msg->msg_flags & MSG_DONTWAIT);
+ if (IS_ERR(tmp))
+ return PTR_ERR(tmp);
+
+ *frag = tmp;
+
+ if (!copy_from_iter_full(skb_put(*frag, count), count,
+ &msg->msg_iter))
+ return -EFAULT;
+
+ sent += count;
+ len -= count;
+
+ skb->len += (*frag)->len;
+ skb->data_len += (*frag)->len;
+
+ frag = &(*frag)->next;
+ }
+
+ return sent;
+}
+
+static struct sk_buff *l2cap_create_connless_pdu(struct l2cap_chan *chan,
+ struct msghdr *msg, size_t len)
+{
+ struct l2cap_conn *conn = chan->conn;
+ struct sk_buff *skb;
+ int err, count, hlen = L2CAP_HDR_SIZE + L2CAP_PSMLEN_SIZE;
+ struct l2cap_hdr *lh;
+
+ BT_DBG("chan %p psm 0x%2.2x len %zu", chan,
+ __le16_to_cpu(chan->psm), len);
+
+ count = min_t(unsigned int, (conn->mtu - hlen), len);
+
+ skb = chan->ops->alloc_skb(chan, hlen, count,
+ msg->msg_flags & MSG_DONTWAIT);
+ if (IS_ERR(skb))
+ return skb;
+
+ /* Create L2CAP header */
+ lh = skb_put(skb, L2CAP_HDR_SIZE);
+ lh->cid = cpu_to_le16(chan->dcid);
+ lh->len = cpu_to_le16(len + L2CAP_PSMLEN_SIZE);
+ put_unaligned(chan->psm, (__le16 *) skb_put(skb, L2CAP_PSMLEN_SIZE));
+
+ err = l2cap_skbuff_fromiovec(chan, msg, len, count, skb);
+ if (unlikely(err < 0)) {
+ kfree_skb(skb);
+ return ERR_PTR(err);
+ }
+ return skb;
+}
+
+static struct sk_buff *l2cap_create_basic_pdu(struct l2cap_chan *chan,
+ struct msghdr *msg, size_t len)
+{
+ struct l2cap_conn *conn = chan->conn;
+ struct sk_buff *skb;
+ int err, count;
+ struct l2cap_hdr *lh;
+
+ BT_DBG("chan %p len %zu", chan, len);
+
+ count = min_t(unsigned int, (conn->mtu - L2CAP_HDR_SIZE), len);
+
+ skb = chan->ops->alloc_skb(chan, L2CAP_HDR_SIZE, count,
+ msg->msg_flags & MSG_DONTWAIT);
+ if (IS_ERR(skb))
+ return skb;
+
+ /* Create L2CAP header */
+ lh = skb_put(skb, L2CAP_HDR_SIZE);
+ lh->cid = cpu_to_le16(chan->dcid);
+ lh->len = cpu_to_le16(len);
+
+ err = l2cap_skbuff_fromiovec(chan, msg, len, count, skb);
+ if (unlikely(err < 0)) {
+ kfree_skb(skb);
+ return ERR_PTR(err);
+ }
+ return skb;
+}
+
+static struct sk_buff *l2cap_create_iframe_pdu(struct l2cap_chan *chan,
+ struct msghdr *msg, size_t len,
+ u16 sdulen)
+{
+ struct l2cap_conn *conn = chan->conn;
+ struct sk_buff *skb;
+ int err, count, hlen;
+ struct l2cap_hdr *lh;
+
+ BT_DBG("chan %p len %zu", chan, len);
+
+ if (!conn)
+ return ERR_PTR(-ENOTCONN);
+
+ hlen = __ertm_hdr_size(chan);
+
+ if (sdulen)
+ hlen += L2CAP_SDULEN_SIZE;
+
+ if (chan->fcs == L2CAP_FCS_CRC16)
+ hlen += L2CAP_FCS_SIZE;
+
+ count = min_t(unsigned int, (conn->mtu - hlen), len);
+
+ skb = chan->ops->alloc_skb(chan, hlen, count,
+ msg->msg_flags & MSG_DONTWAIT);
+ if (IS_ERR(skb))
+ return skb;
+
+ /* Create L2CAP header */
+ lh = skb_put(skb, L2CAP_HDR_SIZE);
+ lh->cid = cpu_to_le16(chan->dcid);
+ lh->len = cpu_to_le16(len + (hlen - L2CAP_HDR_SIZE));
+
+ /* Control header is populated later */
+ if (test_bit(FLAG_EXT_CTRL, &chan->flags))
+ put_unaligned_le32(0, skb_put(skb, L2CAP_EXT_CTRL_SIZE));
+ else
+ put_unaligned_le16(0, skb_put(skb, L2CAP_ENH_CTRL_SIZE));
+
+ if (sdulen)
+ put_unaligned_le16(sdulen, skb_put(skb, L2CAP_SDULEN_SIZE));
+
+ err = l2cap_skbuff_fromiovec(chan, msg, len, count, skb);
+ if (unlikely(err < 0)) {
+ kfree_skb(skb);
+ return ERR_PTR(err);
+ }
+
+ bt_cb(skb)->l2cap.fcs = chan->fcs;
+ bt_cb(skb)->l2cap.retries = 0;
+ return skb;
+}
+
+static int l2cap_segment_sdu(struct l2cap_chan *chan,
+ struct sk_buff_head *seg_queue,
+ struct msghdr *msg, size_t len)
+{
+ struct sk_buff *skb;
+ u16 sdu_len;
+ size_t pdu_len;
+ u8 sar;
+
+ BT_DBG("chan %p, msg %p, len %zu", chan, msg, len);
+
+ /* It is critical that ERTM PDUs fit in a single HCI fragment,
+ * so fragmented skbs are not used. The HCI layer's handling
+ * of fragmented skbs is not compatible with ERTM's queueing.
+ */
+
+ /* PDU size is derived from the HCI MTU */
+ pdu_len = chan->conn->mtu;
+
+ /* Constrain PDU size for BR/EDR connections */
+ pdu_len = min_t(size_t, pdu_len, L2CAP_BREDR_MAX_PAYLOAD);
+
+ /* Adjust for largest possible L2CAP overhead. */
+ if (chan->fcs)
+ pdu_len -= L2CAP_FCS_SIZE;
+
+ pdu_len -= __ertm_hdr_size(chan);
+
+ /* Remote device may have requested smaller PDUs */
+ pdu_len = min_t(size_t, pdu_len, chan->remote_mps);
+
+ if (len <= pdu_len) {
+ sar = L2CAP_SAR_UNSEGMENTED;
+ sdu_len = 0;
+ pdu_len = len;
+ } else {
+ sar = L2CAP_SAR_START;
+ sdu_len = len;
+ }
+
+ while (len > 0) {
+ skb = l2cap_create_iframe_pdu(chan, msg, pdu_len, sdu_len);
+
+ if (IS_ERR(skb)) {
+ __skb_queue_purge(seg_queue);
+ return PTR_ERR(skb);
+ }
+
+ bt_cb(skb)->l2cap.sar = sar;
+ __skb_queue_tail(seg_queue, skb);
+
+ len -= pdu_len;
+ if (sdu_len)
+ sdu_len = 0;
+
+ if (len <= pdu_len) {
+ sar = L2CAP_SAR_END;
+ pdu_len = len;
+ } else {
+ sar = L2CAP_SAR_CONTINUE;
+ }
+ }
+
+ return 0;
+}
+
+static struct sk_buff *l2cap_create_le_flowctl_pdu(struct l2cap_chan *chan,
+ struct msghdr *msg,
+ size_t len, u16 sdulen)
+{
+ struct l2cap_conn *conn = chan->conn;
+ struct sk_buff *skb;
+ int err, count, hlen;
+ struct l2cap_hdr *lh;
+
+ BT_DBG("chan %p len %zu", chan, len);
+
+ if (!conn)
+ return ERR_PTR(-ENOTCONN);
+
+ hlen = L2CAP_HDR_SIZE;
+
+ if (sdulen)
+ hlen += L2CAP_SDULEN_SIZE;
+
+ count = min_t(unsigned int, (conn->mtu - hlen), len);
+
+ skb = chan->ops->alloc_skb(chan, hlen, count,
+ msg->msg_flags & MSG_DONTWAIT);
+ if (IS_ERR(skb))
+ return skb;
+
+ /* Create L2CAP header */
+ lh = skb_put(skb, L2CAP_HDR_SIZE);
+ lh->cid = cpu_to_le16(chan->dcid);
+ lh->len = cpu_to_le16(len + (hlen - L2CAP_HDR_SIZE));
+
+ if (sdulen)
+ put_unaligned_le16(sdulen, skb_put(skb, L2CAP_SDULEN_SIZE));
+
+ err = l2cap_skbuff_fromiovec(chan, msg, len, count, skb);
+ if (unlikely(err < 0)) {
+ kfree_skb(skb);
+ return ERR_PTR(err);
+ }
+
+ return skb;
+}
+
+static int l2cap_segment_le_sdu(struct l2cap_chan *chan,
+ struct sk_buff_head *seg_queue,
+ struct msghdr *msg, size_t len)
+{
+ struct sk_buff *skb;
+ size_t pdu_len;
+ u16 sdu_len;
+
+ BT_DBG("chan %p, msg %p, len %zu", chan, msg, len);
+
+ sdu_len = len;
+ pdu_len = chan->remote_mps - L2CAP_SDULEN_SIZE;
+
+ while (len > 0) {
+ if (len <= pdu_len)
+ pdu_len = len;
+
+ skb = l2cap_create_le_flowctl_pdu(chan, msg, pdu_len, sdu_len);
+ if (IS_ERR(skb)) {
+ __skb_queue_purge(seg_queue);
+ return PTR_ERR(skb);
+ }
+
+ __skb_queue_tail(seg_queue, skb);
+
+ len -= pdu_len;
+
+ if (sdu_len) {
+ sdu_len = 0;
+ pdu_len += L2CAP_SDULEN_SIZE;
+ }
+ }
+
+ return 0;
+}
+
+static void l2cap_le_flowctl_send(struct l2cap_chan *chan)
+{
+ int sent = 0;
+
+ BT_DBG("chan %p", chan);
+
+ while (chan->tx_credits && !skb_queue_empty(&chan->tx_q)) {
+ l2cap_do_send(chan, skb_dequeue(&chan->tx_q));
+ chan->tx_credits--;
+ sent++;
+ }
+
+ BT_DBG("Sent %d credits %u queued %u", sent, chan->tx_credits,
+ skb_queue_len(&chan->tx_q));
+}
+
+static void l2cap_tx_timestamp(struct sk_buff *skb,
+ const struct sockcm_cookie *sockc,
+ size_t len)
+{
+ struct sock *sk = skb ? skb->sk : NULL;
+
+ if (sk && sk->sk_type == SOCK_STREAM)
+ hci_setup_tx_timestamp(skb, len, sockc);
+ else
+ hci_setup_tx_timestamp(skb, 1, sockc);
+}
+
+static void l2cap_tx_timestamp_seg(struct sk_buff_head *queue,
+ const struct sockcm_cookie *sockc,
+ size_t len)
+{
+ struct sk_buff *skb = skb_peek(queue);
+ struct sock *sk = skb ? skb->sk : NULL;
+
+ if (sk && sk->sk_type == SOCK_STREAM)
+ l2cap_tx_timestamp(skb_peek_tail(queue), sockc, len);
+ else
+ l2cap_tx_timestamp(skb, sockc, len);
+}
+
+int l2cap_chan_send(struct l2cap_chan *chan, struct msghdr *msg, size_t len,
+ const struct sockcm_cookie *sockc)
+{
+ struct sk_buff *skb;
+ int err;
+ struct sk_buff_head seg_queue;
+
+ if (!chan->conn)
+ return -ENOTCONN;
+
+ /* Connectionless channel */
+ if (chan->chan_type == L2CAP_CHAN_CONN_LESS) {
+ skb = l2cap_create_connless_pdu(chan, msg, len);
+ if (IS_ERR(skb))
+ return PTR_ERR(skb);
+
+ l2cap_tx_timestamp(skb, sockc, len);
+
+ l2cap_do_send(chan, skb);
+ return len;
+ }
+
+ switch (chan->mode) {
+ case L2CAP_MODE_LE_FLOWCTL:
+ case L2CAP_MODE_EXT_FLOWCTL:
+ /* Check outgoing MTU */
+ if (len > chan->omtu)
+ return -EMSGSIZE;
+
+ __skb_queue_head_init(&seg_queue);
+
+ err = l2cap_segment_le_sdu(chan, &seg_queue, msg, len);
+
+ if (chan->state != BT_CONNECTED) {
+ __skb_queue_purge(&seg_queue);
+ err = -ENOTCONN;
+ }
+
+ if (err)
+ return err;
+
+ l2cap_tx_timestamp_seg(&seg_queue, sockc, len);
+
+ skb_queue_splice_tail_init(&seg_queue, &chan->tx_q);
+
+ l2cap_le_flowctl_send(chan);
+
+ if (!chan->tx_credits)
+ chan->ops->suspend(chan);
+
+ err = len;
+
+ break;
+
+ case L2CAP_MODE_BASIC:
+ /* Check outgoing MTU */
+ if (len > chan->omtu)
+ return -EMSGSIZE;
+
+ /* Create a basic PDU */
+ skb = l2cap_create_basic_pdu(chan, msg, len);
+ if (IS_ERR(skb))
+ return PTR_ERR(skb);
+
+ l2cap_tx_timestamp(skb, sockc, len);
+
+ l2cap_do_send(chan, skb);
+ err = len;
+ break;
+
+ case L2CAP_MODE_ERTM:
+ case L2CAP_MODE_STREAMING:
+ /* Check outgoing MTU */
+ if (len > chan->omtu) {
+ err = -EMSGSIZE;
+ break;
+ }
+
+ __skb_queue_head_init(&seg_queue);
+
+ /* Do segmentation before calling in to the state machine,
+ * since it's possible to block while waiting for memory
+ * allocation.
+ */
+ err = l2cap_segment_sdu(chan, &seg_queue, msg, len);
+
+ if (err)
+ break;
+
+ if (chan->mode == L2CAP_MODE_ERTM) {
+ /* TODO: ERTM mode timestamping */
+ l2cap_tx(chan, NULL, &seg_queue, L2CAP_EV_DATA_REQUEST);
+ } else {
+ l2cap_tx_timestamp_seg(&seg_queue, sockc, len);
+ l2cap_streaming_send(chan, &seg_queue);
+ }
+
+ err = len;
+
+ /* If the skbs were not queued for sending, they'll still be in
+ * seg_queue and need to be purged.
+ */
+ __skb_queue_purge(&seg_queue);
+ break;
+
+ default:
+ BT_DBG("bad state %1.1x", chan->mode);
+ err = -EBADFD;
+ }
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(l2cap_chan_send);
+
+static void l2cap_send_srej(struct l2cap_chan *chan, u16 txseq)
+{
+ struct l2cap_ctrl control;
+ u16 seq;
+
+ BT_DBG("chan %p, txseq %u", chan, txseq);
+
+ memset(&control, 0, sizeof(control));
+ control.sframe = 1;
+ control.super = L2CAP_SUPER_SREJ;
+
+ for (seq = chan->expected_tx_seq; seq != txseq;
+ seq = __next_seq(chan, seq)) {
+ if (!l2cap_ertm_seq_in_queue(&chan->srej_q, seq)) {
+ control.reqseq = seq;
+ l2cap_send_sframe(chan, &control);
+ l2cap_seq_list_append(&chan->srej_list, seq);
+ }
+ }
+
+ chan->expected_tx_seq = __next_seq(chan, txseq);
+}
+
+static void l2cap_send_srej_tail(struct l2cap_chan *chan)
+{
+ struct l2cap_ctrl control;
+
+ BT_DBG("chan %p", chan);
+
+ if (chan->srej_list.tail == L2CAP_SEQ_LIST_CLEAR)
+ return;
+
+ memset(&control, 0, sizeof(control));
+ control.sframe = 1;
+ control.super = L2CAP_SUPER_SREJ;
+ control.reqseq = chan->srej_list.tail;
+ l2cap_send_sframe(chan, &control);
+}
+
+static void l2cap_send_srej_list(struct l2cap_chan *chan, u16 txseq)
+{
+ struct l2cap_ctrl control;
+ u16 initial_head;
+ u16 seq;
+
+ BT_DBG("chan %p, txseq %u", chan, txseq);
+
+ memset(&control, 0, sizeof(control));
+ control.sframe = 1;
+ control.super = L2CAP_SUPER_SREJ;
+
+ /* Capture initial list head to allow only one pass through the list. */
+ initial_head = chan->srej_list.head;
+
+ do {
+ seq = l2cap_seq_list_pop(&chan->srej_list);
+ if (seq == txseq || seq == L2CAP_SEQ_LIST_CLEAR)
+ break;
+
+ control.reqseq = seq;
+ l2cap_send_sframe(chan, &control);
+ l2cap_seq_list_append(&chan->srej_list, seq);
+ } while (chan->srej_list.head != initial_head);
+}
+
+static void l2cap_process_reqseq(struct l2cap_chan *chan, u16 reqseq)
+{
+ struct sk_buff *acked_skb;
+ u16 ackseq;
+
+ BT_DBG("chan %p, reqseq %u", chan, reqseq);
+
+ if (chan->unacked_frames == 0 || reqseq == chan->expected_ack_seq)
+ return;
+
+ BT_DBG("expected_ack_seq %u, unacked_frames %u",
+ chan->expected_ack_seq, chan->unacked_frames);
+
+ for (ackseq = chan->expected_ack_seq; ackseq != reqseq;
+ ackseq = __next_seq(chan, ackseq)) {
+
+ acked_skb = l2cap_ertm_seq_in_queue(&chan->tx_q, ackseq);
+ if (acked_skb) {
+ skb_unlink(acked_skb, &chan->tx_q);
+ kfree_skb(acked_skb);
+ chan->unacked_frames--;
+ }
+ }
+
+ chan->expected_ack_seq = reqseq;
+
+ if (chan->unacked_frames == 0)
+ __clear_retrans_timer(chan);
+
+ BT_DBG("unacked_frames %u", chan->unacked_frames);
+}
+
+static void l2cap_abort_rx_srej_sent(struct l2cap_chan *chan)
+{
+ BT_DBG("chan %p", chan);
+
+ chan->expected_tx_seq = chan->buffer_seq;
+ l2cap_seq_list_clear(&chan->srej_list);
+ skb_queue_purge(&chan->srej_q);
+ chan->rx_state = L2CAP_RX_STATE_RECV;
+}
+
+static void l2cap_tx_state_xmit(struct l2cap_chan *chan,
+ struct l2cap_ctrl *control,
+ struct sk_buff_head *skbs, u8 event)
+{
+ BT_DBG("chan %p, control %p, skbs %p, event %d", chan, control, skbs,
+ event);
+
+ switch (event) {
+ case L2CAP_EV_DATA_REQUEST:
+ if (chan->tx_send_head == NULL)
+ chan->tx_send_head = skb_peek(skbs);
+
+ skb_queue_splice_tail_init(skbs, &chan->tx_q);
+ l2cap_ertm_send(chan);
+ break;
+ case L2CAP_EV_LOCAL_BUSY_DETECTED:
+ BT_DBG("Enter LOCAL_BUSY");
+ set_bit(CONN_LOCAL_BUSY, &chan->conn_state);
+
+ if (chan->rx_state == L2CAP_RX_STATE_SREJ_SENT) {
+ /* The SREJ_SENT state must be aborted if we are to
+ * enter the LOCAL_BUSY state.
+ */
+ l2cap_abort_rx_srej_sent(chan);
+ }
+
+ l2cap_send_ack(chan);
+
+ break;
+ case L2CAP_EV_LOCAL_BUSY_CLEAR:
+ BT_DBG("Exit LOCAL_BUSY");
+ clear_bit(CONN_LOCAL_BUSY, &chan->conn_state);
+
+ if (test_bit(CONN_RNR_SENT, &chan->conn_state)) {
+ struct l2cap_ctrl local_control;
+
+ memset(&local_control, 0, sizeof(local_control));
+ local_control.sframe = 1;
+ local_control.super = L2CAP_SUPER_RR;
+ local_control.poll = 1;
+ local_control.reqseq = chan->buffer_seq;
+ l2cap_send_sframe(chan, &local_control);
+
+ chan->retry_count = 1;
+ __set_monitor_timer(chan);
+ chan->tx_state = L2CAP_TX_STATE_WAIT_F;
+ }
+ break;
+ case L2CAP_EV_RECV_REQSEQ_AND_FBIT:
+ l2cap_process_reqseq(chan, control->reqseq);
+ break;
+ case L2CAP_EV_EXPLICIT_POLL:
+ l2cap_send_rr_or_rnr(chan, 1);
+ chan->retry_count = 1;
+ __set_monitor_timer(chan);
+ __clear_ack_timer(chan);
+ chan->tx_state = L2CAP_TX_STATE_WAIT_F;
+ break;
+ case L2CAP_EV_RETRANS_TO:
+ l2cap_send_rr_or_rnr(chan, 1);
+ chan->retry_count = 1;
+ __set_monitor_timer(chan);
+ chan->tx_state = L2CAP_TX_STATE_WAIT_F;
+ break;
+ case L2CAP_EV_RECV_FBIT:
+ /* Nothing to process */
+ break;
+ default:
+ break;
+ }
+}
+
+static void l2cap_tx_state_wait_f(struct l2cap_chan *chan,
+ struct l2cap_ctrl *control,
+ struct sk_buff_head *skbs, u8 event)
+{
+ BT_DBG("chan %p, control %p, skbs %p, event %d", chan, control, skbs,
+ event);
+
+ switch (event) {
+ case L2CAP_EV_DATA_REQUEST:
+ if (chan->tx_send_head == NULL)
+ chan->tx_send_head = skb_peek(skbs);
+ /* Queue data, but don't send. */
+ skb_queue_splice_tail_init(skbs, &chan->tx_q);
+ break;
+ case L2CAP_EV_LOCAL_BUSY_DETECTED:
+ BT_DBG("Enter LOCAL_BUSY");
+ set_bit(CONN_LOCAL_BUSY, &chan->conn_state);
+
+ if (chan->rx_state == L2CAP_RX_STATE_SREJ_SENT) {
+ /* The SREJ_SENT state must be aborted if we are to
+ * enter the LOCAL_BUSY state.
+ */
+ l2cap_abort_rx_srej_sent(chan);
+ }
+
+ l2cap_send_ack(chan);
+
+ break;
+ case L2CAP_EV_LOCAL_BUSY_CLEAR:
+ BT_DBG("Exit LOCAL_BUSY");
+ clear_bit(CONN_LOCAL_BUSY, &chan->conn_state);
+
+ if (test_bit(CONN_RNR_SENT, &chan->conn_state)) {
+ struct l2cap_ctrl local_control;
+ memset(&local_control, 0, sizeof(local_control));
+ local_control.sframe = 1;
+ local_control.super = L2CAP_SUPER_RR;
+ local_control.poll = 1;
+ local_control.reqseq = chan->buffer_seq;
+ l2cap_send_sframe(chan, &local_control);
+
+ chan->retry_count = 1;
+ __set_monitor_timer(chan);
+ chan->tx_state = L2CAP_TX_STATE_WAIT_F;
+ }
+ break;
+ case L2CAP_EV_RECV_REQSEQ_AND_FBIT:
+ l2cap_process_reqseq(chan, control->reqseq);
+ fallthrough;
+
+ case L2CAP_EV_RECV_FBIT:
+ if (control && control->final) {
+ __clear_monitor_timer(chan);
+ if (chan->unacked_frames > 0)
+ __set_retrans_timer(chan);
+ chan->retry_count = 0;
+ chan->tx_state = L2CAP_TX_STATE_XMIT;
+ BT_DBG("recv fbit tx_state 0x2.2%x", chan->tx_state);
+ }
+ break;
+ case L2CAP_EV_EXPLICIT_POLL:
+ /* Ignore */
+ break;
+ case L2CAP_EV_MONITOR_TO:
+ if (chan->max_tx == 0 || chan->retry_count < chan->max_tx) {
+ l2cap_send_rr_or_rnr(chan, 1);
+ __set_monitor_timer(chan);
+ chan->retry_count++;
+ } else {
+ l2cap_send_disconn_req(chan, ECONNABORTED);
+ }
+ break;
+ default:
+ break;
+ }
+}
+
+static void l2cap_tx(struct l2cap_chan *chan, struct l2cap_ctrl *control,
+ struct sk_buff_head *skbs, u8 event)
+{
+ BT_DBG("chan %p, control %p, skbs %p, event %d, state %d",
+ chan, control, skbs, event, chan->tx_state);
+
+ switch (chan->tx_state) {
+ case L2CAP_TX_STATE_XMIT:
+ l2cap_tx_state_xmit(chan, control, skbs, event);
+ break;
+ case L2CAP_TX_STATE_WAIT_F:
+ l2cap_tx_state_wait_f(chan, control, skbs, event);
+ break;
+ default:
+ /* Ignore event */
+ break;
+ }
+}
+
+static void l2cap_pass_to_tx(struct l2cap_chan *chan,
+ struct l2cap_ctrl *control)
+{
+ BT_DBG("chan %p, control %p", chan, control);
+ l2cap_tx(chan, control, NULL, L2CAP_EV_RECV_REQSEQ_AND_FBIT);
+}
+
+static void l2cap_pass_to_tx_fbit(struct l2cap_chan *chan,
+ struct l2cap_ctrl *control)
+{
+ BT_DBG("chan %p, control %p", chan, control);
+ l2cap_tx(chan, control, NULL, L2CAP_EV_RECV_FBIT);
+}
+
+/* Copy frame to all raw sockets on that connection */
+static void l2cap_raw_recv(struct l2cap_conn *conn, struct sk_buff *skb)
+{
+ struct sk_buff *nskb;
+ struct l2cap_chan *chan;
+
+ BT_DBG("conn %p", conn);
+
+ list_for_each_entry(chan, &conn->chan_l, list) {
+ if (chan->chan_type != L2CAP_CHAN_RAW)
+ continue;
+
+ /* Don't send frame to the channel it came from */
+ if (bt_cb(skb)->l2cap.chan == chan)
+ continue;
+
+ nskb = skb_clone(skb, GFP_KERNEL);
+ if (!nskb)
+ continue;
+ if (chan->ops->recv(chan, nskb))
+ kfree_skb(nskb);
+ }
+}
+
+/* ---- L2CAP signalling commands ---- */
+static struct sk_buff *l2cap_build_cmd(struct l2cap_conn *conn, u8 code,
+ u8 ident, u16 dlen, void *data)
+{
+ struct sk_buff *skb, **frag;
+ struct l2cap_cmd_hdr *cmd;
+ struct l2cap_hdr *lh;
+ int len, count;
+
+ BT_DBG("conn %p, code 0x%2.2x, ident 0x%2.2x, len %u",
+ conn, code, ident, dlen);
+
+ if (conn->mtu < L2CAP_HDR_SIZE + L2CAP_CMD_HDR_SIZE)
+ return NULL;
+
+ len = L2CAP_HDR_SIZE + L2CAP_CMD_HDR_SIZE + dlen;
+ count = min_t(unsigned int, conn->mtu, len);
+
+ skb = bt_skb_alloc(count, GFP_KERNEL);
+ if (!skb)
+ return NULL;
+
+ lh = skb_put(skb, L2CAP_HDR_SIZE);
+ lh->len = cpu_to_le16(L2CAP_CMD_HDR_SIZE + dlen);
+
+ if (conn->hcon->type == LE_LINK)
+ lh->cid = cpu_to_le16(L2CAP_CID_LE_SIGNALING);
+ else
+ lh->cid = cpu_to_le16(L2CAP_CID_SIGNALING);
+
+ cmd = skb_put(skb, L2CAP_CMD_HDR_SIZE);
+ cmd->code = code;
+ cmd->ident = ident;
+ cmd->len = cpu_to_le16(dlen);
+
+ if (dlen) {
+ count -= L2CAP_HDR_SIZE + L2CAP_CMD_HDR_SIZE;
+ skb_put_data(skb, data, count);
+ data += count;
+ }
+
+ len -= skb->len;
+
+ /* Continuation fragments (no L2CAP header) */
+ frag = &skb_shinfo(skb)->frag_list;
+ while (len) {
+ count = min_t(unsigned int, conn->mtu, len);
+
+ *frag = bt_skb_alloc(count, GFP_KERNEL);
+ if (!*frag)
+ goto fail;
+
+ skb_put_data(*frag, data, count);
+
+ len -= count;
+ data += count;
+
+ frag = &(*frag)->next;
+ }
+
+ return skb;
+
+fail:
+ kfree_skb(skb);
+ return NULL;
+}
+
+static inline int l2cap_get_conf_opt(void **ptr, int *type, int *olen,
+ unsigned long *val)
+{
+ struct l2cap_conf_opt *opt = *ptr;
+ int len;
+
+ len = L2CAP_CONF_OPT_SIZE + opt->len;
+ *ptr += len;
+
+ *type = opt->type;
+ *olen = opt->len;
+
+ switch (opt->len) {
+ case 1:
+ *val = *((u8 *) opt->val);
+ break;
+
+ case 2:
+ *val = get_unaligned_le16(opt->val);
+ break;
+
+ case 4:
+ *val = get_unaligned_le32(opt->val);
+ break;
+
+ default:
+ *val = (unsigned long) opt->val;
+ break;
+ }
+
+ BT_DBG("type 0x%2.2x len %u val 0x%lx", *type, opt->len, *val);
+ return len;
+}
+
+static void l2cap_add_conf_opt(void **ptr, u8 type, u8 len, unsigned long val, size_t size)
+{
+ struct l2cap_conf_opt *opt = *ptr;
+
+ BT_DBG("type 0x%2.2x len %u val 0x%lx", type, len, val);
+
+ if (size < L2CAP_CONF_OPT_SIZE + len)
+ return;
+
+ opt->type = type;
+ opt->len = len;
+
+ switch (len) {
+ case 1:
+ *((u8 *) opt->val) = val;
+ break;
+
+ case 2:
+ put_unaligned_le16(val, opt->val);
+ break;
+
+ case 4:
+ put_unaligned_le32(val, opt->val);
+ break;
+
+ default:
+ memcpy(opt->val, (void *) val, len);
+ break;
+ }
+
+ *ptr += L2CAP_CONF_OPT_SIZE + len;
+}
+
+static void l2cap_add_opt_efs(void **ptr, struct l2cap_chan *chan, size_t size)
+{
+ struct l2cap_conf_efs efs;
+
+ switch (chan->mode) {
+ case L2CAP_MODE_ERTM:
+ efs.id = chan->local_id;
+ efs.stype = chan->local_stype;
+ efs.msdu = cpu_to_le16(chan->local_msdu);
+ efs.sdu_itime = cpu_to_le32(chan->local_sdu_itime);
+ efs.acc_lat = cpu_to_le32(L2CAP_DEFAULT_ACC_LAT);
+ efs.flush_to = cpu_to_le32(L2CAP_EFS_DEFAULT_FLUSH_TO);
+ break;
+
+ case L2CAP_MODE_STREAMING:
+ efs.id = 1;
+ efs.stype = L2CAP_SERV_BESTEFFORT;
+ efs.msdu = cpu_to_le16(chan->local_msdu);
+ efs.sdu_itime = cpu_to_le32(chan->local_sdu_itime);
+ efs.acc_lat = 0;
+ efs.flush_to = 0;
+ break;
+
+ default:
+ return;
+ }
+
+ l2cap_add_conf_opt(ptr, L2CAP_CONF_EFS, sizeof(efs),
+ (unsigned long) &efs, size);
+}
+
+static void l2cap_ack_timeout(struct work_struct *work)
+{
+ struct l2cap_chan *chan = container_of(work, struct l2cap_chan,
+ ack_timer.work);
+ u16 frames_to_ack;
+
+ BT_DBG("chan %p", chan);
+
+ l2cap_chan_lock(chan);
+
+ frames_to_ack = __seq_offset(chan, chan->buffer_seq,
+ chan->last_acked_seq);
+
+ if (frames_to_ack)
+ l2cap_send_rr_or_rnr(chan, 0);
+
+ l2cap_chan_unlock(chan);
+ l2cap_chan_put(chan);
+}
+
+int l2cap_ertm_init(struct l2cap_chan *chan)
+{
+ int err;
+
+ chan->next_tx_seq = 0;
+ chan->expected_tx_seq = 0;
+ chan->expected_ack_seq = 0;
+ chan->unacked_frames = 0;
+ chan->buffer_seq = 0;
+ chan->frames_sent = 0;
+ chan->last_acked_seq = 0;
+ chan->sdu = NULL;
+ chan->sdu_last_frag = NULL;
+ chan->sdu_len = 0;
+
+ skb_queue_head_init(&chan->tx_q);
+
+ if (chan->mode != L2CAP_MODE_ERTM)
+ return 0;
+
+ chan->rx_state = L2CAP_RX_STATE_RECV;
+ chan->tx_state = L2CAP_TX_STATE_XMIT;
+
+ skb_queue_head_init(&chan->srej_q);
+
+ err = l2cap_seq_list_init(&chan->srej_list, chan->tx_win);
+ if (err < 0)
+ return err;
+
+ err = l2cap_seq_list_init(&chan->retrans_list, chan->remote_tx_win);
+ if (err < 0)
+ l2cap_seq_list_free(&chan->srej_list);
+
+ return err;
+}
+
+static inline __u8 l2cap_select_mode(__u8 mode, __u16 remote_feat_mask)
+{
+ switch (mode) {
+ case L2CAP_MODE_STREAMING:
+ case L2CAP_MODE_ERTM:
+ if (l2cap_mode_supported(mode, remote_feat_mask))
+ return mode;
+ fallthrough;
+ default:
+ return L2CAP_MODE_BASIC;
+ }
+}
+
+static inline bool __l2cap_ews_supported(struct l2cap_conn *conn)
+{
+ return (conn->feat_mask & L2CAP_FEAT_EXT_WINDOW);
+}
+
+static inline bool __l2cap_efs_supported(struct l2cap_conn *conn)
+{
+ return (conn->feat_mask & L2CAP_FEAT_EXT_FLOW);
+}
+
+static void __l2cap_set_ertm_timeouts(struct l2cap_chan *chan,
+ struct l2cap_conf_rfc *rfc)
+{
+ rfc->retrans_timeout = cpu_to_le16(L2CAP_DEFAULT_RETRANS_TO);
+ rfc->monitor_timeout = cpu_to_le16(L2CAP_DEFAULT_MONITOR_TO);
+}
+
+static inline void l2cap_txwin_setup(struct l2cap_chan *chan)
+{
+ if (chan->tx_win > L2CAP_DEFAULT_TX_WINDOW &&
+ __l2cap_ews_supported(chan->conn)) {
+ /* use extended control field */
+ set_bit(FLAG_EXT_CTRL, &chan->flags);
+ chan->tx_win_max = L2CAP_DEFAULT_EXT_WINDOW;
+ } else {
+ chan->tx_win = min_t(u16, chan->tx_win,
+ L2CAP_DEFAULT_TX_WINDOW);
+ chan->tx_win_max = L2CAP_DEFAULT_TX_WINDOW;
+ }
+ chan->ack_win = chan->tx_win;
+}
+
+static void l2cap_mtu_auto(struct l2cap_chan *chan)
+{
+ struct hci_conn *conn = chan->conn->hcon;
+
+ chan->imtu = L2CAP_DEFAULT_MIN_MTU;
+
+ /* The 2-DH1 packet has between 2 and 56 information bytes
+ * (including the 2-byte payload header)
+ */
+ if (!(conn->pkt_type & HCI_2DH1))
+ chan->imtu = 54;
+
+ /* The 3-DH1 packet has between 2 and 85 information bytes
+ * (including the 2-byte payload header)
+ */
+ if (!(conn->pkt_type & HCI_3DH1))
+ chan->imtu = 83;
+
+ /* The 2-DH3 packet has between 2 and 369 information bytes
+ * (including the 2-byte payload header)
+ */
+ if (!(conn->pkt_type & HCI_2DH3))
+ chan->imtu = 367;
+
+ /* The 3-DH3 packet has between 2 and 554 information bytes
+ * (including the 2-byte payload header)
+ */
+ if (!(conn->pkt_type & HCI_3DH3))
+ chan->imtu = 552;
+
+ /* The 2-DH5 packet has between 2 and 681 information bytes
+ * (including the 2-byte payload header)
+ */
+ if (!(conn->pkt_type & HCI_2DH5))
+ chan->imtu = 679;
+
+ /* The 3-DH5 packet has between 2 and 1023 information bytes
+ * (including the 2-byte payload header)
+ */
+ if (!(conn->pkt_type & HCI_3DH5))
+ chan->imtu = 1021;
+}
+
+static int l2cap_build_conf_req(struct l2cap_chan *chan, void *data, size_t data_size)
+{
+ struct l2cap_conf_req *req = data;
+ struct l2cap_conf_rfc rfc = { .mode = chan->mode };
+ void *ptr = req->data;
+ void *endptr = data + data_size;
+ u16 size;
+
+ BT_DBG("chan %p", chan);
+
+ if (chan->num_conf_req || chan->num_conf_rsp)
+ goto done;
+
+ switch (chan->mode) {
+ case L2CAP_MODE_STREAMING:
+ case L2CAP_MODE_ERTM:
+ if (test_bit(CONF_STATE2_DEVICE, &chan->conf_state))
+ break;
+
+ if (__l2cap_efs_supported(chan->conn))
+ set_bit(FLAG_EFS_ENABLE, &chan->flags);
+
+ fallthrough;
+ default:
+ chan->mode = l2cap_select_mode(rfc.mode, chan->conn->feat_mask);
+ break;
+ }
+
+done:
+ if (chan->imtu != L2CAP_DEFAULT_MTU) {
+ if (!chan->imtu)
+ l2cap_mtu_auto(chan);
+ l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, chan->imtu,
+ endptr - ptr);
+ }
+
+ switch (chan->mode) {
+ case L2CAP_MODE_BASIC:
+ if (disable_ertm)
+ break;
+
+ if (!(chan->conn->feat_mask & L2CAP_FEAT_ERTM) &&
+ !(chan->conn->feat_mask & L2CAP_FEAT_STREAMING))
+ break;
+
+ rfc.mode = L2CAP_MODE_BASIC;
+ rfc.txwin_size = 0;
+ rfc.max_transmit = 0;
+ rfc.retrans_timeout = 0;
+ rfc.monitor_timeout = 0;
+ rfc.max_pdu_size = 0;
+
+ l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC, sizeof(rfc),
+ (unsigned long) &rfc, endptr - ptr);
+ break;
+
+ case L2CAP_MODE_ERTM:
+ rfc.mode = L2CAP_MODE_ERTM;
+ rfc.max_transmit = chan->max_tx;
+
+ __l2cap_set_ertm_timeouts(chan, &rfc);
+
+ size = min_t(u16, L2CAP_DEFAULT_MAX_PDU_SIZE, chan->conn->mtu -
+ L2CAP_EXT_HDR_SIZE - L2CAP_SDULEN_SIZE -
+ L2CAP_FCS_SIZE);
+ rfc.max_pdu_size = cpu_to_le16(size);
+
+ l2cap_txwin_setup(chan);
+
+ rfc.txwin_size = min_t(u16, chan->tx_win,
+ L2CAP_DEFAULT_TX_WINDOW);
+
+ l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC, sizeof(rfc),
+ (unsigned long) &rfc, endptr - ptr);
+
+ if (test_bit(FLAG_EFS_ENABLE, &chan->flags))
+ l2cap_add_opt_efs(&ptr, chan, endptr - ptr);
+
+ if (test_bit(FLAG_EXT_CTRL, &chan->flags))
+ l2cap_add_conf_opt(&ptr, L2CAP_CONF_EWS, 2,
+ chan->tx_win, endptr - ptr);
+
+ if (chan->conn->feat_mask & L2CAP_FEAT_FCS)
+ if (chan->fcs == L2CAP_FCS_NONE ||
+ test_bit(CONF_RECV_NO_FCS, &chan->conf_state)) {
+ chan->fcs = L2CAP_FCS_NONE;
+ l2cap_add_conf_opt(&ptr, L2CAP_CONF_FCS, 1,
+ chan->fcs, endptr - ptr);
+ }
+ break;
+
+ case L2CAP_MODE_STREAMING:
+ l2cap_txwin_setup(chan);
+ rfc.mode = L2CAP_MODE_STREAMING;
+ rfc.txwin_size = 0;
+ rfc.max_transmit = 0;
+ rfc.retrans_timeout = 0;
+ rfc.monitor_timeout = 0;
+
+ size = min_t(u16, L2CAP_DEFAULT_MAX_PDU_SIZE, chan->conn->mtu -
+ L2CAP_EXT_HDR_SIZE - L2CAP_SDULEN_SIZE -
+ L2CAP_FCS_SIZE);
+ rfc.max_pdu_size = cpu_to_le16(size);
+
+ l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC, sizeof(rfc),
+ (unsigned long) &rfc, endptr - ptr);
+
+ if (test_bit(FLAG_EFS_ENABLE, &chan->flags))
+ l2cap_add_opt_efs(&ptr, chan, endptr - ptr);
+
+ if (chan->conn->feat_mask & L2CAP_FEAT_FCS)
+ if (chan->fcs == L2CAP_FCS_NONE ||
+ test_bit(CONF_RECV_NO_FCS, &chan->conf_state)) {
+ chan->fcs = L2CAP_FCS_NONE;
+ l2cap_add_conf_opt(&ptr, L2CAP_CONF_FCS, 1,
+ chan->fcs, endptr - ptr);
+ }
+ break;
+ }
+
+ req->dcid = cpu_to_le16(chan->dcid);
+ req->flags = cpu_to_le16(0);
+
+ return ptr - data;
+}
+
+static int l2cap_parse_conf_req(struct l2cap_chan *chan, void *data, size_t data_size)
+{
+ struct l2cap_conf_rsp *rsp = data;
+ void *ptr = rsp->data;
+ void *endptr = data + data_size;
+ void *req = chan->conf_req;
+ int len = chan->conf_len;
+ int type, hint, olen;
+ unsigned long val;
+ struct l2cap_conf_rfc rfc = { .mode = L2CAP_MODE_BASIC };
+ struct l2cap_conf_efs efs;
+ u8 remote_efs = 0;
+ u16 mtu = 0;
+ u16 result = L2CAP_CONF_SUCCESS;
+ u16 size;
+
+ BT_DBG("chan %p", chan);
+
+ while (len >= L2CAP_CONF_OPT_SIZE) {
+ len -= l2cap_get_conf_opt(&req, &type, &olen, &val);
+ if (len < 0)
+ break;
+
+ hint = type & L2CAP_CONF_HINT;
+ type &= L2CAP_CONF_MASK;
+
+ switch (type) {
+ case L2CAP_CONF_MTU:
+ if (olen != 2)
+ break;
+ mtu = val;
+ break;
+
+ case L2CAP_CONF_FLUSH_TO:
+ if (olen != 2)
+ break;
+ chan->flush_to = val;
+ break;
+
+ case L2CAP_CONF_QOS:
+ break;
+
+ case L2CAP_CONF_RFC:
+ if (olen != sizeof(rfc))
+ break;
+ memcpy(&rfc, (void *) val, olen);
+ break;
+
+ case L2CAP_CONF_FCS:
+ if (olen != 1)
+ break;
+ if (val == L2CAP_FCS_NONE)
+ set_bit(CONF_RECV_NO_FCS, &chan->conf_state);
+ break;
+
+ case L2CAP_CONF_EFS:
+ if (olen != sizeof(efs))
+ break;
+ remote_efs = 1;
+ memcpy(&efs, (void *) val, olen);
+ break;
+
+ case L2CAP_CONF_EWS:
+ if (olen != 2)
+ break;
+ return -ECONNREFUSED;
+
+ default:
+ if (hint)
+ break;
+ result = L2CAP_CONF_UNKNOWN;
+ l2cap_add_conf_opt(&ptr, (u8)type, sizeof(u8), type, endptr - ptr);
+ break;
+ }
+ }
+
+ if (chan->num_conf_rsp || chan->num_conf_req > 1)
+ goto done;
+
+ switch (chan->mode) {
+ case L2CAP_MODE_STREAMING:
+ case L2CAP_MODE_ERTM:
+ if (!test_bit(CONF_STATE2_DEVICE, &chan->conf_state)) {
+ chan->mode = l2cap_select_mode(rfc.mode,
+ chan->conn->feat_mask);
+ break;
+ }
+
+ if (remote_efs) {
+ if (__l2cap_efs_supported(chan->conn))
+ set_bit(FLAG_EFS_ENABLE, &chan->flags);
+ else
+ return -ECONNREFUSED;
+ }
+
+ if (chan->mode != rfc.mode)
+ return -ECONNREFUSED;
+
+ break;
+ }
+
+done:
+ if (chan->mode != rfc.mode) {
+ result = L2CAP_CONF_UNACCEPT;
+ rfc.mode = chan->mode;
+
+ if (chan->num_conf_rsp == 1)
+ return -ECONNREFUSED;
+
+ l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC, sizeof(rfc),
+ (unsigned long) &rfc, endptr - ptr);
+ }
+
+ if (result == L2CAP_CONF_SUCCESS) {
+ /* Configure output options and let the other side know
+ * which ones we don't like. */
+
+ /* If MTU is not provided in configure request, try adjusting it
+ * to the current output MTU if it has been set
+ *
+ * Bluetooth Core 6.1, Vol 3, Part A, Section 4.5
+ *
+ * Each configuration parameter value (if any is present) in an
+ * L2CAP_CONFIGURATION_RSP packet reflects an ‘adjustment’ to a
+ * configuration parameter value that has been sent (or, in case
+ * of default values, implied) in the corresponding
+ * L2CAP_CONFIGURATION_REQ packet.
+ */
+ if (!mtu) {
+ /* Only adjust for ERTM channels as for older modes the
+ * remote stack may not be able to detect that the
+ * adjustment causing it to silently drop packets.
+ */
+ if (chan->mode == L2CAP_MODE_ERTM &&
+ chan->omtu && chan->omtu != L2CAP_DEFAULT_MTU)
+ mtu = chan->omtu;
+ else
+ mtu = L2CAP_DEFAULT_MTU;
+ }
+
+ if (mtu < L2CAP_DEFAULT_MIN_MTU)
+ result = L2CAP_CONF_UNACCEPT;
+ else {
+ chan->omtu = mtu;
+ set_bit(CONF_MTU_DONE, &chan->conf_state);
+ }
+ l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, chan->omtu, endptr - ptr);
+
+ if (remote_efs) {
+ if (chan->local_stype != L2CAP_SERV_NOTRAFIC &&
+ efs.stype != L2CAP_SERV_NOTRAFIC &&
+ efs.stype != chan->local_stype) {
+
+ result = L2CAP_CONF_UNACCEPT;
+
+ if (chan->num_conf_req >= 1)
+ return -ECONNREFUSED;
+
+ l2cap_add_conf_opt(&ptr, L2CAP_CONF_EFS,
+ sizeof(efs),
+ (unsigned long) &efs, endptr - ptr);
+ } else {
+ /* Send PENDING Conf Rsp */
+ result = L2CAP_CONF_PENDING;
+ set_bit(CONF_LOC_CONF_PEND, &chan->conf_state);
+ }
+ }
+
+ switch (rfc.mode) {
+ case L2CAP_MODE_BASIC:
+ chan->fcs = L2CAP_FCS_NONE;
+ set_bit(CONF_MODE_DONE, &chan->conf_state);
+ break;
+
+ case L2CAP_MODE_ERTM:
+ if (!test_bit(CONF_EWS_RECV, &chan->conf_state))
+ chan->remote_tx_win = rfc.txwin_size;
+ else
+ rfc.txwin_size = L2CAP_DEFAULT_TX_WINDOW;
+
+ chan->remote_max_tx = rfc.max_transmit;
+
+ size = min_t(u16, le16_to_cpu(rfc.max_pdu_size),
+ chan->conn->mtu - L2CAP_EXT_HDR_SIZE -
+ L2CAP_SDULEN_SIZE - L2CAP_FCS_SIZE);
+ rfc.max_pdu_size = cpu_to_le16(size);
+ chan->remote_mps = size;
+
+ __l2cap_set_ertm_timeouts(chan, &rfc);
+
+ set_bit(CONF_MODE_DONE, &chan->conf_state);
+
+ l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC,
+ sizeof(rfc), (unsigned long) &rfc, endptr - ptr);
+
+ if (remote_efs &&
+ test_bit(FLAG_EFS_ENABLE, &chan->flags)) {
+ chan->remote_id = efs.id;
+ chan->remote_stype = efs.stype;
+ chan->remote_msdu = le16_to_cpu(efs.msdu);
+ chan->remote_flush_to =
+ le32_to_cpu(efs.flush_to);
+ chan->remote_acc_lat =
+ le32_to_cpu(efs.acc_lat);
+ chan->remote_sdu_itime =
+ le32_to_cpu(efs.sdu_itime);
+ l2cap_add_conf_opt(&ptr, L2CAP_CONF_EFS,
+ sizeof(efs),
+ (unsigned long) &efs, endptr - ptr);
+ }
+ break;
+
+ case L2CAP_MODE_STREAMING:
+ size = min_t(u16, le16_to_cpu(rfc.max_pdu_size),
+ chan->conn->mtu - L2CAP_EXT_HDR_SIZE -
+ L2CAP_SDULEN_SIZE - L2CAP_FCS_SIZE);
+ rfc.max_pdu_size = cpu_to_le16(size);
+ chan->remote_mps = size;
+
+ set_bit(CONF_MODE_DONE, &chan->conf_state);
+
+ l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC, sizeof(rfc),
+ (unsigned long) &rfc, endptr - ptr);
+
+ break;
+
+ default:
+ result = L2CAP_CONF_UNACCEPT;
+
+ memset(&rfc, 0, sizeof(rfc));
+ rfc.mode = chan->mode;
+ }
+
+ if (result == L2CAP_CONF_SUCCESS)
+ set_bit(CONF_OUTPUT_DONE, &chan->conf_state);
+ }
+ rsp->scid = cpu_to_le16(chan->dcid);
+ rsp->result = cpu_to_le16(result);
+ rsp->flags = cpu_to_le16(0);
+
+ return ptr - data;
+}
+
+static int l2cap_parse_conf_rsp(struct l2cap_chan *chan, void *rsp, int len,
+ void *data, size_t size, u16 *result)
+{
+ struct l2cap_conf_req *req = data;
+ void *ptr = req->data;
+ void *endptr = data + size;
+ int type, olen;
+ unsigned long val;
+ struct l2cap_conf_rfc rfc = { .mode = L2CAP_MODE_BASIC };
+ struct l2cap_conf_efs efs;
+
+ BT_DBG("chan %p, rsp %p, len %d, req %p", chan, rsp, len, data);
+
+ while (len >= L2CAP_CONF_OPT_SIZE) {
+ len -= l2cap_get_conf_opt(&rsp, &type, &olen, &val);
+ if (len < 0)
+ break;
+
+ switch (type) {
+ case L2CAP_CONF_MTU:
+ if (olen != 2)
+ break;
+ if (val < L2CAP_DEFAULT_MIN_MTU) {
+ *result = L2CAP_CONF_UNACCEPT;
+ chan->imtu = L2CAP_DEFAULT_MIN_MTU;
+ } else
+ chan->imtu = val;
+ l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, chan->imtu,
+ endptr - ptr);
+ break;
+
+ case L2CAP_CONF_FLUSH_TO:
+ if (olen != 2)
+ break;
+ chan->flush_to = val;
+ l2cap_add_conf_opt(&ptr, L2CAP_CONF_FLUSH_TO, 2,
+ chan->flush_to, endptr - ptr);
+ break;
+
+ case L2CAP_CONF_RFC:
+ if (olen != sizeof(rfc))
+ break;
+ memcpy(&rfc, (void *)val, olen);
+ if (test_bit(CONF_STATE2_DEVICE, &chan->conf_state) &&
+ rfc.mode != chan->mode)
+ return -ECONNREFUSED;
+ chan->fcs = 0;
+ l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC, sizeof(rfc),
+ (unsigned long) &rfc, endptr - ptr);
+ break;
+
+ case L2CAP_CONF_EWS:
+ if (olen != 2)
+ break;
+ chan->ack_win = min_t(u16, val, chan->ack_win);
+ l2cap_add_conf_opt(&ptr, L2CAP_CONF_EWS, 2,
+ chan->tx_win, endptr - ptr);
+ break;
+
+ case L2CAP_CONF_EFS:
+ if (olen != sizeof(efs))
+ break;
+ memcpy(&efs, (void *)val, olen);
+ if (chan->local_stype != L2CAP_SERV_NOTRAFIC &&
+ efs.stype != L2CAP_SERV_NOTRAFIC &&
+ efs.stype != chan->local_stype)
+ return -ECONNREFUSED;
+ l2cap_add_conf_opt(&ptr, L2CAP_CONF_EFS, sizeof(efs),
+ (unsigned long) &efs, endptr - ptr);
+ break;
+
+ case L2CAP_CONF_FCS:
+ if (olen != 1)
+ break;
+ if (*result == L2CAP_CONF_PENDING)
+ if (val == L2CAP_FCS_NONE)
+ set_bit(CONF_RECV_NO_FCS,
+ &chan->conf_state);
+ break;
+ }
+ }
+
+ if (chan->mode == L2CAP_MODE_BASIC && chan->mode != rfc.mode)
+ return -ECONNREFUSED;
+
+ chan->mode = rfc.mode;
+
+ if (*result == L2CAP_CONF_SUCCESS || *result == L2CAP_CONF_PENDING) {
+ switch (rfc.mode) {
+ case L2CAP_MODE_ERTM:
+ chan->retrans_timeout = le16_to_cpu(rfc.retrans_timeout);
+ chan->monitor_timeout = le16_to_cpu(rfc.monitor_timeout);
+ chan->mps = le16_to_cpu(rfc.max_pdu_size);
+ if (!test_bit(FLAG_EXT_CTRL, &chan->flags))
+ chan->ack_win = min_t(u16, chan->ack_win,
+ rfc.txwin_size);
+
+ if (test_bit(FLAG_EFS_ENABLE, &chan->flags)) {
+ chan->local_msdu = le16_to_cpu(efs.msdu);
+ chan->local_sdu_itime =
+ le32_to_cpu(efs.sdu_itime);
+ chan->local_acc_lat = le32_to_cpu(efs.acc_lat);
+ chan->local_flush_to =
+ le32_to_cpu(efs.flush_to);
+ }
+ break;
+
+ case L2CAP_MODE_STREAMING:
+ chan->mps = le16_to_cpu(rfc.max_pdu_size);
+ }
+ }
+
+ req->dcid = cpu_to_le16(chan->dcid);
+ req->flags = cpu_to_le16(0);
+
+ return ptr - data;
+}
+
+static int l2cap_build_conf_rsp(struct l2cap_chan *chan, void *data,
+ u16 result, u16 flags)
+{
+ struct l2cap_conf_rsp *rsp = data;
+ void *ptr = rsp->data;
+
+ BT_DBG("chan %p", chan);
+
+ rsp->scid = cpu_to_le16(chan->dcid);
+ rsp->result = cpu_to_le16(result);
+ rsp->flags = cpu_to_le16(flags);
+
+ return ptr - data;
+}
+
+void __l2cap_le_connect_rsp_defer(struct l2cap_chan *chan)
+{
+ struct l2cap_le_conn_rsp rsp;
+ struct l2cap_conn *conn = chan->conn;
+
+ BT_DBG("chan %p", chan);
+
+ rsp.dcid = cpu_to_le16(chan->scid);
+ rsp.mtu = cpu_to_le16(chan->imtu);
+ rsp.mps = cpu_to_le16(chan->mps);
+ rsp.credits = cpu_to_le16(chan->rx_credits);
+ rsp.result = cpu_to_le16(L2CAP_CR_LE_SUCCESS);
+
+ l2cap_send_cmd(conn, chan->ident, L2CAP_LE_CONN_RSP, sizeof(rsp),
+ &rsp);
+}
+
+static void l2cap_ecred_list_defer(struct l2cap_chan *chan, void *data)
+{
+ int *result = data;
+
+ if (*result || test_bit(FLAG_ECRED_CONN_REQ_SENT, &chan->flags))
+ return;
+
+ switch (chan->state) {
+ case BT_CONNECT2:
+ /* If channel still pending accept add to result */
+ (*result)++;
+ return;
+ case BT_CONNECTED:
+ return;
+ default:
+ /* If not connected or pending accept it has been refused */
+ *result = -ECONNREFUSED;
+ return;
+ }
+}
+
+struct l2cap_ecred_rsp_data {
+ struct {
+ struct l2cap_ecred_conn_rsp_hdr rsp;
+ __le16 scid[L2CAP_ECRED_MAX_CID];
+ } __packed pdu;
+ int count;
+};
+
+static void l2cap_ecred_rsp_defer(struct l2cap_chan *chan, void *data)
+{
+ struct l2cap_ecred_rsp_data *rsp = data;
+ struct l2cap_ecred_conn_rsp *rsp_flex =
+ container_of(&rsp->pdu.rsp, struct l2cap_ecred_conn_rsp, hdr);
+
+ /* Check if channel for outgoing connection or if it wasn't deferred
+ * since in those cases it must be skipped.
+ */
+ if (test_bit(FLAG_ECRED_CONN_REQ_SENT, &chan->flags) ||
+ !test_and_clear_bit(FLAG_DEFER_SETUP, &chan->flags))
+ return;
+
+ /* Reset ident so only one response is sent */
+ chan->ident = 0;
+
+ /* Include all channels pending with the same ident */
+ if (!rsp->pdu.rsp.result)
+ rsp_flex->dcid[rsp->count++] = cpu_to_le16(chan->scid);
+ else
+ l2cap_chan_del(chan, ECONNRESET);
+}
+
+void __l2cap_ecred_conn_rsp_defer(struct l2cap_chan *chan)
+{
+ struct l2cap_conn *conn = chan->conn;
+ struct l2cap_ecred_rsp_data data;
+ u16 id = chan->ident;
+ int result = 0;
+
+ if (!id)
+ return;
+
+ BT_DBG("chan %p id %d", chan, id);
+
+ memset(&data, 0, sizeof(data));
+
+ data.pdu.rsp.mtu = cpu_to_le16(chan->imtu);
+ data.pdu.rsp.mps = cpu_to_le16(chan->mps);
+ data.pdu.rsp.credits = cpu_to_le16(chan->rx_credits);
+ data.pdu.rsp.result = cpu_to_le16(L2CAP_CR_LE_SUCCESS);
+
+ /* Verify that all channels are ready */
+ __l2cap_chan_list_id(conn, id, l2cap_ecred_list_defer, &result);
+
+ if (result > 0)
+ return;
+
+ if (result < 0)
+ data.pdu.rsp.result = cpu_to_le16(L2CAP_CR_LE_AUTHORIZATION);
+
+ /* Build response */
+ __l2cap_chan_list_id(conn, id, l2cap_ecred_rsp_defer, &data);
+
+ l2cap_send_cmd(conn, id, L2CAP_ECRED_CONN_RSP,
+ sizeof(data.pdu.rsp) + (data.count * sizeof(__le16)),
+ &data.pdu);
+}
+
+void __l2cap_connect_rsp_defer(struct l2cap_chan *chan)
+{
+ struct l2cap_conn_rsp rsp;
+ struct l2cap_conn *conn = chan->conn;
+ u8 buf[128];
+ u8 rsp_code;
+
+ rsp.scid = cpu_to_le16(chan->dcid);
+ rsp.dcid = cpu_to_le16(chan->scid);
+ rsp.result = cpu_to_le16(L2CAP_CR_SUCCESS);
+ rsp.status = cpu_to_le16(L2CAP_CS_NO_INFO);
+ rsp_code = L2CAP_CONN_RSP;
+
+ BT_DBG("chan %p rsp_code %u", chan, rsp_code);
+
+ l2cap_send_cmd(conn, chan->ident, rsp_code, sizeof(rsp), &rsp);
+
+ if (test_and_set_bit(CONF_REQ_SENT, &chan->conf_state))
+ return;
+
+ l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ,
+ l2cap_build_conf_req(chan, buf, sizeof(buf)), buf);
+ chan->num_conf_req++;
+}
+
+static void l2cap_conf_rfc_get(struct l2cap_chan *chan, void *rsp, int len)
+{
+ int type, olen;
+ unsigned long val;
+ /* Use sane default values in case a misbehaving remote device
+ * did not send an RFC or extended window size option.
+ */
+ u16 txwin_ext = chan->ack_win;
+ struct l2cap_conf_rfc rfc = {
+ .mode = chan->mode,
+ .retrans_timeout = cpu_to_le16(L2CAP_DEFAULT_RETRANS_TO),
+ .monitor_timeout = cpu_to_le16(L2CAP_DEFAULT_MONITOR_TO),
+ .max_pdu_size = cpu_to_le16(chan->imtu),
+ .txwin_size = min_t(u16, chan->ack_win, L2CAP_DEFAULT_TX_WINDOW),
+ };
+
+ BT_DBG("chan %p, rsp %p, len %d", chan, rsp, len);
+
+ if ((chan->mode != L2CAP_MODE_ERTM) && (chan->mode != L2CAP_MODE_STREAMING))
+ return;
+
+ while (len >= L2CAP_CONF_OPT_SIZE) {
+ len -= l2cap_get_conf_opt(&rsp, &type, &olen, &val);
+ if (len < 0)
+ break;
+
+ switch (type) {
+ case L2CAP_CONF_RFC:
+ if (olen != sizeof(rfc))
+ break;
+ memcpy(&rfc, (void *)val, olen);
+ break;
+ case L2CAP_CONF_EWS:
+ if (olen != 2)
+ break;
+ txwin_ext = val;
+ break;
+ }
+ }
+
+ switch (rfc.mode) {
+ case L2CAP_MODE_ERTM:
+ chan->retrans_timeout = le16_to_cpu(rfc.retrans_timeout);
+ chan->monitor_timeout = le16_to_cpu(rfc.monitor_timeout);
+ chan->mps = le16_to_cpu(rfc.max_pdu_size);
+ if (test_bit(FLAG_EXT_CTRL, &chan->flags))
+ chan->ack_win = min_t(u16, chan->ack_win, txwin_ext);
+ else
+ chan->ack_win = min_t(u16, chan->ack_win,
+ rfc.txwin_size);
+ break;
+ case L2CAP_MODE_STREAMING:
+ chan->mps = le16_to_cpu(rfc.max_pdu_size);
+ }
+}
+
+static inline int l2cap_command_rej(struct l2cap_conn *conn,
+ struct l2cap_cmd_hdr *cmd, u16 cmd_len,
+ u8 *data)
+{
+ struct l2cap_cmd_rej_unk *rej = (struct l2cap_cmd_rej_unk *) data;
+
+ if (cmd_len < sizeof(*rej))
+ return -EPROTO;
+
+ if (rej->reason != L2CAP_REJ_NOT_UNDERSTOOD)
+ return 0;
+
+ if ((conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_SENT) &&
+ cmd->ident == conn->info_ident) {
+ cancel_delayed_work(&conn->info_timer);
+
+ conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_DONE;
+ conn->info_ident = 0;
+
+ l2cap_conn_start(conn);
+ }
+
+ return 0;
+}
+
+static void l2cap_connect(struct l2cap_conn *conn, struct l2cap_cmd_hdr *cmd,
+ u8 *data, u8 rsp_code)
+{
+ struct l2cap_conn_req *req = (struct l2cap_conn_req *) data;
+ struct l2cap_conn_rsp rsp;
+ struct l2cap_chan *chan = NULL, *pchan = NULL;
+ int result, status = L2CAP_CS_NO_INFO;
+
+ u16 dcid = 0, scid = __le16_to_cpu(req->scid);
+ __le16 psm = req->psm;
+
+ BT_DBG("psm 0x%2.2x scid 0x%4.4x", __le16_to_cpu(psm), scid);
+
+ /* Check if we have socket listening on psm */
+ pchan = l2cap_global_chan_by_psm(BT_LISTEN, psm, &conn->hcon->src,
+ &conn->hcon->dst, ACL_LINK);
+ if (!pchan) {
+ result = L2CAP_CR_BAD_PSM;
+ goto response;
+ }
+
+ l2cap_chan_lock(pchan);
+
+ /* Check if the ACL is secure enough (if not SDP) */
+ if (psm != cpu_to_le16(L2CAP_PSM_SDP) &&
+ (!hci_conn_check_link_mode(conn->hcon) ||
+ !l2cap_check_enc_key_size(conn->hcon, pchan))) {
+ conn->disc_reason = HCI_ERROR_AUTH_FAILURE;
+ result = L2CAP_CR_SEC_BLOCK;
+ goto response;
+ }
+
+ result = L2CAP_CR_NO_MEM;
+
+ /* Check for valid dynamic CID range (as per Erratum 3253) */
+ if (scid < L2CAP_CID_DYN_START || scid > L2CAP_CID_DYN_END) {
+ result = L2CAP_CR_INVALID_SCID;
+ goto response;
+ }
+
+ /* Check if we already have channel with that dcid */
+ if (__l2cap_get_chan_by_dcid(conn, scid)) {
+ result = L2CAP_CR_SCID_IN_USE;
+ goto response;
+ }
+
+ chan = pchan->ops->new_connection(pchan);
+ if (!chan)
+ goto response;
+
+ /* For certain devices (ex: HID mouse), support for authentication,
+ * pairing and bonding is optional. For such devices, inorder to avoid
+ * the ACL alive for too long after L2CAP disconnection, reset the ACL
+ * disc_timeout back to HCI_DISCONN_TIMEOUT during L2CAP connect.
+ */
+ conn->hcon->disc_timeout = HCI_DISCONN_TIMEOUT;
+
+ bacpy(&chan->src, &conn->hcon->src);
+ bacpy(&chan->dst, &conn->hcon->dst);
+ chan->src_type = bdaddr_src_type(conn->hcon);
+ chan->dst_type = bdaddr_dst_type(conn->hcon);
+ chan->psm = psm;
+ chan->dcid = scid;
+
+ __l2cap_chan_add(conn, chan);
+
+ dcid = chan->scid;
+
+ __set_chan_timer(chan, chan->ops->get_sndtimeo(chan));
+
+ chan->ident = cmd->ident;
+
+ if (conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_DONE) {
+ if (l2cap_chan_check_security(chan, false)) {
+ if (test_bit(FLAG_DEFER_SETUP, &chan->flags)) {
+ l2cap_state_change(chan, BT_CONNECT2);
+ result = L2CAP_CR_PEND;
+ status = L2CAP_CS_AUTHOR_PEND;
+ chan->ops->defer(chan);
+ } else {
+ l2cap_state_change(chan, BT_CONFIG);
+ result = L2CAP_CR_SUCCESS;
+ status = L2CAP_CS_NO_INFO;
+ }
+ } else {
+ l2cap_state_change(chan, BT_CONNECT2);
+ result = L2CAP_CR_PEND;
+ status = L2CAP_CS_AUTHEN_PEND;
+ }
+ } else {
+ l2cap_state_change(chan, BT_CONNECT2);
+ result = L2CAP_CR_PEND;
+ status = L2CAP_CS_NO_INFO;
+ }
+
+response:
+ rsp.scid = cpu_to_le16(scid);
+ rsp.dcid = cpu_to_le16(dcid);
+ rsp.result = cpu_to_le16(result);
+ rsp.status = cpu_to_le16(status);
+ l2cap_send_cmd(conn, cmd->ident, rsp_code, sizeof(rsp), &rsp);
+
+ if (!pchan)
+ return;
+
+ if (result == L2CAP_CR_PEND && status == L2CAP_CS_NO_INFO) {
+ struct l2cap_info_req info;
+ info.type = cpu_to_le16(L2CAP_IT_FEAT_MASK);
+
+ conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_SENT;
+ conn->info_ident = l2cap_get_ident(conn);
+
+ schedule_delayed_work(&conn->info_timer, L2CAP_INFO_TIMEOUT);
+
+ l2cap_send_cmd(conn, conn->info_ident, L2CAP_INFO_REQ,
+ sizeof(info), &info);
+ }
+
+ if (chan && !test_bit(CONF_REQ_SENT, &chan->conf_state) &&
+ result == L2CAP_CR_SUCCESS) {
+ u8 buf[128];
+ set_bit(CONF_REQ_SENT, &chan->conf_state);
+ l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ,
+ l2cap_build_conf_req(chan, buf, sizeof(buf)), buf);
+ chan->num_conf_req++;
+ }
+
+ l2cap_chan_unlock(pchan);
+ l2cap_chan_put(pchan);
+}
+
+static int l2cap_connect_req(struct l2cap_conn *conn,
+ struct l2cap_cmd_hdr *cmd, u16 cmd_len, u8 *data)
+{
+ if (cmd_len < sizeof(struct l2cap_conn_req))
+ return -EPROTO;
+
+ l2cap_connect(conn, cmd, data, L2CAP_CONN_RSP);
+ return 0;
+}
+
+static int l2cap_connect_create_rsp(struct l2cap_conn *conn,
+ struct l2cap_cmd_hdr *cmd, u16 cmd_len,
+ u8 *data)
+{
+ struct l2cap_conn_rsp *rsp = (struct l2cap_conn_rsp *) data;
+ u16 scid, dcid, result, status;
+ struct l2cap_chan *chan;
+ u8 req[128];
+ int err;
+
+ if (cmd_len < sizeof(*rsp))
+ return -EPROTO;
+
+ scid = __le16_to_cpu(rsp->scid);
+ dcid = __le16_to_cpu(rsp->dcid);
+ result = __le16_to_cpu(rsp->result);
+ status = __le16_to_cpu(rsp->status);
+
+ if (result == L2CAP_CR_SUCCESS && (dcid < L2CAP_CID_DYN_START ||
+ dcid > L2CAP_CID_DYN_END))
+ return -EPROTO;
+
+ BT_DBG("dcid 0x%4.4x scid 0x%4.4x result 0x%2.2x status 0x%2.2x",
+ dcid, scid, result, status);
+
+ if (scid) {
+ chan = __l2cap_get_chan_by_scid(conn, scid);
+ if (!chan)
+ return -EBADSLT;
+ } else {
+ chan = __l2cap_get_chan_by_ident(conn, cmd->ident);
+ if (!chan)
+ return -EBADSLT;
+ }
+
+ chan = l2cap_chan_hold_unless_zero(chan);
+ if (!chan)
+ return -EBADSLT;
+
+ err = 0;
+
+ l2cap_chan_lock(chan);
+
+ switch (result) {
+ case L2CAP_CR_SUCCESS:
+ if (__l2cap_get_chan_by_dcid(conn, dcid)) {
+ err = -EBADSLT;
+ break;
+ }
+
+ l2cap_state_change(chan, BT_CONFIG);
+ chan->ident = 0;
+ chan->dcid = dcid;
+ clear_bit(CONF_CONNECT_PEND, &chan->conf_state);
+
+ if (test_and_set_bit(CONF_REQ_SENT, &chan->conf_state))
+ break;
+
+ l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ,
+ l2cap_build_conf_req(chan, req, sizeof(req)), req);
+ chan->num_conf_req++;
+ break;
+
+ case L2CAP_CR_PEND:
+ set_bit(CONF_CONNECT_PEND, &chan->conf_state);
+ break;
+
+ default:
+ l2cap_chan_del(chan, ECONNREFUSED);
+ break;
+ }
+
+ l2cap_chan_unlock(chan);
+ l2cap_chan_put(chan);
+
+ return err;
+}
+
+static inline void set_default_fcs(struct l2cap_chan *chan)
+{
+ /* FCS is enabled only in ERTM or streaming mode, if one or both
+ * sides request it.
+ */
+ if (chan->mode != L2CAP_MODE_ERTM && chan->mode != L2CAP_MODE_STREAMING)
+ chan->fcs = L2CAP_FCS_NONE;
+ else if (!test_bit(CONF_RECV_NO_FCS, &chan->conf_state))
+ chan->fcs = L2CAP_FCS_CRC16;
+}
+
+static void l2cap_send_efs_conf_rsp(struct l2cap_chan *chan, void *data,
+ u8 ident, u16 flags)
+{
+ struct l2cap_conn *conn = chan->conn;
+
+ BT_DBG("conn %p chan %p ident %d flags 0x%4.4x", conn, chan, ident,
+ flags);
+
+ clear_bit(CONF_LOC_CONF_PEND, &chan->conf_state);
+ set_bit(CONF_OUTPUT_DONE, &chan->conf_state);
+
+ l2cap_send_cmd(conn, ident, L2CAP_CONF_RSP,
+ l2cap_build_conf_rsp(chan, data,
+ L2CAP_CONF_SUCCESS, flags), data);
+}
+
+static void cmd_reject_invalid_cid(struct l2cap_conn *conn, u8 ident,
+ u16 scid, u16 dcid)
+{
+ struct l2cap_cmd_rej_cid rej;
+
+ rej.reason = cpu_to_le16(L2CAP_REJ_INVALID_CID);
+ rej.scid = __cpu_to_le16(scid);
+ rej.dcid = __cpu_to_le16(dcid);
+
+ l2cap_send_cmd(conn, ident, L2CAP_COMMAND_REJ, sizeof(rej), &rej);
+}
+
+static inline int l2cap_config_req(struct l2cap_conn *conn,
+ struct l2cap_cmd_hdr *cmd, u16 cmd_len,
+ u8 *data)
+{
+ struct l2cap_conf_req *req = (struct l2cap_conf_req *) data;
+ u16 dcid, flags;
+ u8 rsp[64];
+ struct l2cap_chan *chan;
+ int len, err = 0;
+
+ if (cmd_len < sizeof(*req))
+ return -EPROTO;
+
+ dcid = __le16_to_cpu(req->dcid);
+ flags = __le16_to_cpu(req->flags);
+
+ BT_DBG("dcid 0x%4.4x flags 0x%2.2x", dcid, flags);
+
+ chan = l2cap_get_chan_by_scid(conn, dcid);
+ if (!chan) {
+ cmd_reject_invalid_cid(conn, cmd->ident, dcid, 0);
+ return 0;
+ }
+
+ if (chan->state != BT_CONFIG && chan->state != BT_CONNECT2 &&
+ chan->state != BT_CONNECTED) {
+ cmd_reject_invalid_cid(conn, cmd->ident, chan->scid,
+ chan->dcid);
+ goto unlock;
+ }
+
+ /* Reject if config buffer is too small. */
+ len = cmd_len - sizeof(*req);
+ if (chan->conf_len + len > sizeof(chan->conf_req)) {
+ l2cap_send_cmd(conn, cmd->ident, L2CAP_CONF_RSP,
+ l2cap_build_conf_rsp(chan, rsp,
+ L2CAP_CONF_REJECT, flags), rsp);
+ goto unlock;
+ }
+
+ /* Store config. */
+ memcpy(chan->conf_req + chan->conf_len, req->data, len);
+ chan->conf_len += len;
+
+ if (flags & L2CAP_CONF_FLAG_CONTINUATION) {
+ /* Incomplete config. Send empty response. */
+ l2cap_send_cmd(conn, cmd->ident, L2CAP_CONF_RSP,
+ l2cap_build_conf_rsp(chan, rsp,
+ L2CAP_CONF_SUCCESS, flags), rsp);
+ goto unlock;
+ }
+
+ /* Complete config. */
+ len = l2cap_parse_conf_req(chan, rsp, sizeof(rsp));
+ if (len < 0) {
+ l2cap_send_disconn_req(chan, ECONNRESET);
+ goto unlock;
+ }
+
+ chan->ident = cmd->ident;
+ l2cap_send_cmd(conn, cmd->ident, L2CAP_CONF_RSP, len, rsp);
+ if (chan->num_conf_rsp < L2CAP_CONF_MAX_CONF_RSP)
+ chan->num_conf_rsp++;
+
+ /* Reset config buffer. */
+ chan->conf_len = 0;
+
+ if (!test_bit(CONF_OUTPUT_DONE, &chan->conf_state))
+ goto unlock;
+
+ if (test_bit(CONF_INPUT_DONE, &chan->conf_state)) {
+ set_default_fcs(chan);
+
+ if (chan->mode == L2CAP_MODE_ERTM ||
+ chan->mode == L2CAP_MODE_STREAMING)
+ err = l2cap_ertm_init(chan);
+
+ if (err < 0)
+ l2cap_send_disconn_req(chan, -err);
+ else
+ l2cap_chan_ready(chan);
+
+ goto unlock;
+ }
+
+ if (!test_and_set_bit(CONF_REQ_SENT, &chan->conf_state)) {
+ u8 buf[64];
+ l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ,
+ l2cap_build_conf_req(chan, buf, sizeof(buf)), buf);
+ chan->num_conf_req++;
+ }
+
+ /* Got Conf Rsp PENDING from remote side and assume we sent
+ Conf Rsp PENDING in the code above */
+ if (test_bit(CONF_REM_CONF_PEND, &chan->conf_state) &&
+ test_bit(CONF_LOC_CONF_PEND, &chan->conf_state)) {
+
+ /* check compatibility */
+
+ /* Send rsp for BR/EDR channel */
+ l2cap_send_efs_conf_rsp(chan, rsp, cmd->ident, flags);
+ }
+
+unlock:
+ l2cap_chan_unlock(chan);
+ l2cap_chan_put(chan);
+ return err;
+}
+
+static inline int l2cap_config_rsp(struct l2cap_conn *conn,
+ struct l2cap_cmd_hdr *cmd, u16 cmd_len,
+ u8 *data)
+{
+ struct l2cap_conf_rsp *rsp = (struct l2cap_conf_rsp *)data;
+ u16 scid, flags, result;
+ struct l2cap_chan *chan;
+ int len = cmd_len - sizeof(*rsp);
+ int err = 0;
+
+ if (cmd_len < sizeof(*rsp))
+ return -EPROTO;
+
+ scid = __le16_to_cpu(rsp->scid);
+ flags = __le16_to_cpu(rsp->flags);
+ result = __le16_to_cpu(rsp->result);
+
+ BT_DBG("scid 0x%4.4x flags 0x%2.2x result 0x%2.2x len %d", scid, flags,
+ result, len);
+
+ chan = l2cap_get_chan_by_scid(conn, scid);
+ if (!chan)
+ return 0;
+
+ switch (result) {
+ case L2CAP_CONF_SUCCESS:
+ l2cap_conf_rfc_get(chan, rsp->data, len);
+ clear_bit(CONF_REM_CONF_PEND, &chan->conf_state);
+ break;
+
+ case L2CAP_CONF_PENDING:
+ set_bit(CONF_REM_CONF_PEND, &chan->conf_state);
+
+ if (test_bit(CONF_LOC_CONF_PEND, &chan->conf_state)) {
+ char buf[64];
+
+ len = l2cap_parse_conf_rsp(chan, rsp->data, len,
+ buf, sizeof(buf), &result);
+ if (len < 0) {
+ l2cap_send_disconn_req(chan, ECONNRESET);
+ goto done;
+ }
+
+ l2cap_send_efs_conf_rsp(chan, buf, cmd->ident, 0);
+ }
+ goto done;
+
+ case L2CAP_CONF_UNKNOWN:
+ case L2CAP_CONF_UNACCEPT:
+ if (chan->num_conf_rsp <= L2CAP_CONF_MAX_CONF_RSP) {
+ char req[64];
+
+ if (len > sizeof(req) - sizeof(struct l2cap_conf_req)) {
+ l2cap_send_disconn_req(chan, ECONNRESET);
+ goto done;
+ }
+
+ /* throw out any old stored conf requests */
+ result = L2CAP_CONF_SUCCESS;
+ len = l2cap_parse_conf_rsp(chan, rsp->data, len,
+ req, sizeof(req), &result);
+ if (len < 0) {
+ l2cap_send_disconn_req(chan, ECONNRESET);
+ goto done;
+ }
+
+ l2cap_send_cmd(conn, l2cap_get_ident(conn),
+ L2CAP_CONF_REQ, len, req);
+ chan->num_conf_req++;
+ if (result != L2CAP_CONF_SUCCESS)
+ goto done;
+ break;
+ }
+ fallthrough;
+
+ default:
+ l2cap_chan_set_err(chan, ECONNRESET);
+
+ __set_chan_timer(chan, L2CAP_DISC_REJ_TIMEOUT);
+ l2cap_send_disconn_req(chan, ECONNRESET);
+ goto done;
+ }
+
+ if (flags & L2CAP_CONF_FLAG_CONTINUATION)
+ goto done;
+
+ set_bit(CONF_INPUT_DONE, &chan->conf_state);
+
+ if (test_bit(CONF_OUTPUT_DONE, &chan->conf_state)) {
+ set_default_fcs(chan);
+
+ if (chan->mode == L2CAP_MODE_ERTM ||
+ chan->mode == L2CAP_MODE_STREAMING)
+ err = l2cap_ertm_init(chan);
+
+ if (err < 0)
+ l2cap_send_disconn_req(chan, -err);
+ else
+ l2cap_chan_ready(chan);
+ }
+
+done:
+ l2cap_chan_unlock(chan);
+ l2cap_chan_put(chan);
+ return err;
+}
+
+static inline int l2cap_disconnect_req(struct l2cap_conn *conn,
+ struct l2cap_cmd_hdr *cmd, u16 cmd_len,
+ u8 *data)
+{
+ struct l2cap_disconn_req *req = (struct l2cap_disconn_req *) data;
+ struct l2cap_disconn_rsp rsp;
+ u16 dcid, scid;
+ struct l2cap_chan *chan;
+
+ if (cmd_len != sizeof(*req))
+ return -EPROTO;
+
+ scid = __le16_to_cpu(req->scid);
+ dcid = __le16_to_cpu(req->dcid);
+
+ BT_DBG("scid 0x%4.4x dcid 0x%4.4x", scid, dcid);
+
+ chan = l2cap_get_chan_by_scid(conn, dcid);
+ if (!chan) {
+ cmd_reject_invalid_cid(conn, cmd->ident, dcid, scid);
+ return 0;
+ }
+
+ rsp.dcid = cpu_to_le16(chan->scid);
+ rsp.scid = cpu_to_le16(chan->dcid);
+ l2cap_send_cmd(conn, cmd->ident, L2CAP_DISCONN_RSP, sizeof(rsp), &rsp);
+
+ chan->ops->set_shutdown(chan);
+
+ l2cap_chan_del(chan, ECONNRESET);
+
+ chan->ops->close(chan);
+
+ l2cap_chan_unlock(chan);
+ l2cap_chan_put(chan);
+
+ return 0;
+}
+
+static inline int l2cap_disconnect_rsp(struct l2cap_conn *conn,
+ struct l2cap_cmd_hdr *cmd, u16 cmd_len,
+ u8 *data)
+{
+ struct l2cap_disconn_rsp *rsp = (struct l2cap_disconn_rsp *) data;
+ u16 dcid, scid;
+ struct l2cap_chan *chan;
+
+ if (cmd_len != sizeof(*rsp))
+ return -EPROTO;
+
+ scid = __le16_to_cpu(rsp->scid);
+ dcid = __le16_to_cpu(rsp->dcid);
+
+ BT_DBG("dcid 0x%4.4x scid 0x%4.4x", dcid, scid);
+
+ chan = l2cap_get_chan_by_scid(conn, scid);
+ if (!chan) {
+ return 0;
+ }
+
+ if (chan->state != BT_DISCONN) {
+ l2cap_chan_unlock(chan);
+ l2cap_chan_put(chan);
+ return 0;
+ }
+
+ l2cap_chan_del(chan, 0);
+
+ chan->ops->close(chan);
+
+ l2cap_chan_unlock(chan);
+ l2cap_chan_put(chan);
+
+ return 0;
+}
+
+static inline int l2cap_information_req(struct l2cap_conn *conn,
+ struct l2cap_cmd_hdr *cmd, u16 cmd_len,
+ u8 *data)
+{
+ struct l2cap_info_req *req = (struct l2cap_info_req *) data;
+ u16 type;
+
+ if (cmd_len != sizeof(*req))
+ return -EPROTO;
+
+ type = __le16_to_cpu(req->type);
+
+ BT_DBG("type 0x%4.4x", type);
+
+ if (type == L2CAP_IT_FEAT_MASK) {
+ u8 buf[8];
+ u32 feat_mask = l2cap_feat_mask;
+ struct l2cap_info_rsp *rsp = (struct l2cap_info_rsp *) buf;
+ rsp->type = cpu_to_le16(L2CAP_IT_FEAT_MASK);
+ rsp->result = cpu_to_le16(L2CAP_IR_SUCCESS);
+ if (!disable_ertm)
+ feat_mask |= L2CAP_FEAT_ERTM | L2CAP_FEAT_STREAMING
+ | L2CAP_FEAT_FCS;
+
+ put_unaligned_le32(feat_mask, rsp->data);
+ l2cap_send_cmd(conn, cmd->ident, L2CAP_INFO_RSP, sizeof(buf),
+ buf);
+ } else if (type == L2CAP_IT_FIXED_CHAN) {
+ u8 buf[12];
+ struct l2cap_info_rsp *rsp = (struct l2cap_info_rsp *) buf;
+
+ rsp->type = cpu_to_le16(L2CAP_IT_FIXED_CHAN);
+ rsp->result = cpu_to_le16(L2CAP_IR_SUCCESS);
+ rsp->data[0] = conn->local_fixed_chan;
+ memset(rsp->data + 1, 0, 7);
+ l2cap_send_cmd(conn, cmd->ident, L2CAP_INFO_RSP, sizeof(buf),
+ buf);
+ } else {
+ struct l2cap_info_rsp rsp;
+ rsp.type = cpu_to_le16(type);
+ rsp.result = cpu_to_le16(L2CAP_IR_NOTSUPP);
+ l2cap_send_cmd(conn, cmd->ident, L2CAP_INFO_RSP, sizeof(rsp),
+ &rsp);
+ }
+
+ return 0;
+}
+
+static inline int l2cap_information_rsp(struct l2cap_conn *conn,
+ struct l2cap_cmd_hdr *cmd, u16 cmd_len,
+ u8 *data)
+{
+ struct l2cap_info_rsp *rsp = (struct l2cap_info_rsp *) data;
+ u16 type, result;
+
+ if (cmd_len < sizeof(*rsp))
+ return -EPROTO;
+
+ type = __le16_to_cpu(rsp->type);
+ result = __le16_to_cpu(rsp->result);
+
+ BT_DBG("type 0x%4.4x result 0x%2.2x", type, result);
+
+ /* L2CAP Info req/rsp are unbound to channels, add extra checks */
+ if (cmd->ident != conn->info_ident ||
+ conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_DONE)
+ return 0;
+
+ cancel_delayed_work(&conn->info_timer);
+
+ if (result != L2CAP_IR_SUCCESS) {
+ conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_DONE;
+ conn->info_ident = 0;
+
+ l2cap_conn_start(conn);
+
+ return 0;
+ }
+
+ switch (type) {
+ case L2CAP_IT_FEAT_MASK:
+ conn->feat_mask = get_unaligned_le32(rsp->data);
+
+ if (conn->feat_mask & L2CAP_FEAT_FIXED_CHAN) {
+ struct l2cap_info_req req;
+ req.type = cpu_to_le16(L2CAP_IT_FIXED_CHAN);
+
+ conn->info_ident = l2cap_get_ident(conn);
+
+ l2cap_send_cmd(conn, conn->info_ident,
+ L2CAP_INFO_REQ, sizeof(req), &req);
+ } else {
+ conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_DONE;
+ conn->info_ident = 0;
+
+ l2cap_conn_start(conn);
+ }
+ break;
+
+ case L2CAP_IT_FIXED_CHAN:
+ conn->remote_fixed_chan = rsp->data[0];
+ conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_DONE;
+ conn->info_ident = 0;
+
+ l2cap_conn_start(conn);
+ break;
+ }
+
+ return 0;
+}
+
+static inline int l2cap_conn_param_update_req(struct l2cap_conn *conn,
+ struct l2cap_cmd_hdr *cmd,
+ u16 cmd_len, u8 *data)
+{
+ struct hci_conn *hcon = conn->hcon;
+ struct l2cap_conn_param_update_req *req;
+ struct l2cap_conn_param_update_rsp rsp;
+ u16 min, max, latency, to_multiplier;
+ int err;
+
+ if (hcon->role != HCI_ROLE_MASTER)
+ return -EINVAL;
+
+ if (cmd_len != sizeof(struct l2cap_conn_param_update_req))
+ return -EPROTO;
+
+ req = (struct l2cap_conn_param_update_req *) data;
+ min = __le16_to_cpu(req->min);
+ max = __le16_to_cpu(req->max);
+ latency = __le16_to_cpu(req->latency);
+ to_multiplier = __le16_to_cpu(req->to_multiplier);
+
+ BT_DBG("min 0x%4.4x max 0x%4.4x latency: 0x%4.4x Timeout: 0x%4.4x",
+ min, max, latency, to_multiplier);
+
+ memset(&rsp, 0, sizeof(rsp));
+
+ err = hci_check_conn_params(min, max, latency, to_multiplier);
+ if (err)
+ rsp.result = cpu_to_le16(L2CAP_CONN_PARAM_REJECTED);
+ else
+ rsp.result = cpu_to_le16(L2CAP_CONN_PARAM_ACCEPTED);
+
+ l2cap_send_cmd(conn, cmd->ident, L2CAP_CONN_PARAM_UPDATE_RSP,
+ sizeof(rsp), &rsp);
+
+ if (!err) {
+ u8 store_hint;
+
+ store_hint = hci_le_conn_update(hcon, min, max, latency,
+ to_multiplier);
+ mgmt_new_conn_param(hcon->hdev, &hcon->dst, hcon->dst_type,
+ store_hint, min, max, latency,
+ to_multiplier);
+
+ }
+
+ return 0;
+}
+
+static int l2cap_le_connect_rsp(struct l2cap_conn *conn,
+ struct l2cap_cmd_hdr *cmd, u16 cmd_len,
+ u8 *data)
+{
+ struct l2cap_le_conn_rsp *rsp = (struct l2cap_le_conn_rsp *) data;
+ struct hci_conn *hcon = conn->hcon;
+ u16 dcid, mtu, mps, credits, result;
+ struct l2cap_chan *chan;
+ int err, sec_level;
+
+ if (cmd_len < sizeof(*rsp))
+ return -EPROTO;
+
+ dcid = __le16_to_cpu(rsp->dcid);
+ mtu = __le16_to_cpu(rsp->mtu);
+ mps = __le16_to_cpu(rsp->mps);
+ credits = __le16_to_cpu(rsp->credits);
+ result = __le16_to_cpu(rsp->result);
+
+ if (result == L2CAP_CR_LE_SUCCESS && (mtu < 23 || mps < 23 ||
+ dcid < L2CAP_CID_DYN_START ||
+ dcid > L2CAP_CID_LE_DYN_END))
+ return -EPROTO;
+
+ BT_DBG("dcid 0x%4.4x mtu %u mps %u credits %u result 0x%2.2x",
+ dcid, mtu, mps, credits, result);
+
+ chan = __l2cap_get_chan_by_ident(conn, cmd->ident);
+ if (!chan)
+ return -EBADSLT;
+
+ err = 0;
+
+ l2cap_chan_lock(chan);
+
+ switch (result) {
+ case L2CAP_CR_LE_SUCCESS:
+ if (__l2cap_get_chan_by_dcid(conn, dcid)) {
+ err = -EBADSLT;
+ break;
+ }
+
+ chan->ident = 0;
+ chan->dcid = dcid;
+ chan->omtu = mtu;
+ chan->remote_mps = mps;
+ chan->tx_credits = credits;
+ l2cap_chan_ready(chan);
+ break;
+
+ case L2CAP_CR_LE_AUTHENTICATION:
+ case L2CAP_CR_LE_ENCRYPTION:
+ /* If we already have MITM protection we can't do
+ * anything.
+ */
+ if (hcon->sec_level > BT_SECURITY_MEDIUM) {
+ l2cap_chan_del(chan, ECONNREFUSED);
+ break;
+ }
+
+ sec_level = hcon->sec_level + 1;
+ if (chan->sec_level < sec_level)
+ chan->sec_level = sec_level;
+
+ /* We'll need to send a new Connect Request */
+ clear_bit(FLAG_LE_CONN_REQ_SENT, &chan->flags);
+
+ smp_conn_security(hcon, chan->sec_level);
+ break;
+
+ default:
+ l2cap_chan_del(chan, ECONNREFUSED);
+ break;
+ }
+
+ l2cap_chan_unlock(chan);
+
+ return err;
+}
+
+static inline int l2cap_bredr_sig_cmd(struct l2cap_conn *conn,
+ struct l2cap_cmd_hdr *cmd, u16 cmd_len,
+ u8 *data)
+{
+ int err = 0;
+
+ switch (cmd->code) {
+ case L2CAP_COMMAND_REJ:
+ l2cap_command_rej(conn, cmd, cmd_len, data);
+ break;
+
+ case L2CAP_CONN_REQ:
+ err = l2cap_connect_req(conn, cmd, cmd_len, data);
+ break;
+
+ case L2CAP_CONN_RSP:
+ l2cap_connect_create_rsp(conn, cmd, cmd_len, data);
+ break;
+
+ case L2CAP_CONF_REQ:
+ err = l2cap_config_req(conn, cmd, cmd_len, data);
+ break;
+
+ case L2CAP_CONF_RSP:
+ l2cap_config_rsp(conn, cmd, cmd_len, data);
+ break;
+
+ case L2CAP_DISCONN_REQ:
+ err = l2cap_disconnect_req(conn, cmd, cmd_len, data);
+ break;
+
+ case L2CAP_DISCONN_RSP:
+ l2cap_disconnect_rsp(conn, cmd, cmd_len, data);
+ break;
+
+ case L2CAP_ECHO_REQ:
+ l2cap_send_cmd(conn, cmd->ident, L2CAP_ECHO_RSP, cmd_len, data);
+ break;
+
+ case L2CAP_ECHO_RSP:
+ break;
+
+ case L2CAP_INFO_REQ:
+ err = l2cap_information_req(conn, cmd, cmd_len, data);
+ break;
+
+ case L2CAP_INFO_RSP:
+ l2cap_information_rsp(conn, cmd, cmd_len, data);
+ break;
+
+ default:
+ BT_ERR("Unknown BR/EDR signaling command 0x%2.2x", cmd->code);
+ err = -EINVAL;
+ break;
+ }
+
+ return err;
+}
+
+static int l2cap_le_connect_req(struct l2cap_conn *conn,
+ struct l2cap_cmd_hdr *cmd, u16 cmd_len,
+ u8 *data)
+{
+ struct l2cap_le_conn_req *req = (struct l2cap_le_conn_req *) data;
+ struct l2cap_le_conn_rsp rsp;
+ struct l2cap_chan *chan, *pchan;
+ u16 dcid, scid, credits, mtu, mps;
+ __le16 psm;
+ u8 result;
+
+ if (cmd_len != sizeof(*req))
+ return -EPROTO;
+
+ scid = __le16_to_cpu(req->scid);
+ mtu = __le16_to_cpu(req->mtu);
+ mps = __le16_to_cpu(req->mps);
+ psm = req->psm;
+ dcid = 0;
+ credits = 0;
+
+ if (mtu < 23 || mps < 23)
+ return -EPROTO;
+
+ BT_DBG("psm 0x%2.2x scid 0x%4.4x mtu %u mps %u", __le16_to_cpu(psm),
+ scid, mtu, mps);
+
+ /* BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 3, Part A
+ * page 1059:
+ *
+ * Valid range: 0x0001-0x00ff
+ *
+ * Table 4.15: L2CAP_LE_CREDIT_BASED_CONNECTION_REQ SPSM ranges
+ */
+ if (!psm || __le16_to_cpu(psm) > L2CAP_PSM_LE_DYN_END) {
+ result = L2CAP_CR_LE_BAD_PSM;
+ chan = NULL;
+ goto response;
+ }
+
+ /* Check if we have socket listening on psm */
+ pchan = l2cap_global_chan_by_psm(BT_LISTEN, psm, &conn->hcon->src,
+ &conn->hcon->dst, LE_LINK);
+ if (!pchan) {
+ result = L2CAP_CR_LE_BAD_PSM;
+ chan = NULL;
+ goto response;
+ }
+
+ l2cap_chan_lock(pchan);
+
+ if (!smp_sufficient_security(conn->hcon, pchan->sec_level,
+ SMP_ALLOW_STK)) {
+ result = pchan->sec_level == BT_SECURITY_MEDIUM ?
+ L2CAP_CR_LE_ENCRYPTION : L2CAP_CR_LE_AUTHENTICATION;
+ chan = NULL;
+ goto response_unlock;
+ }
+
+ /* Check for valid dynamic CID range */
+ if (scid < L2CAP_CID_DYN_START || scid > L2CAP_CID_LE_DYN_END) {
+ result = L2CAP_CR_LE_INVALID_SCID;
+ chan = NULL;
+ goto response_unlock;
+ }
+
+ /* Check if we already have channel with that dcid */
+ if (__l2cap_get_chan_by_dcid(conn, scid)) {
+ result = L2CAP_CR_LE_SCID_IN_USE;
+ chan = NULL;
+ goto response_unlock;
+ }
+
+ chan = pchan->ops->new_connection(pchan);
+ if (!chan) {
+ result = L2CAP_CR_LE_NO_MEM;
+ goto response_unlock;
+ }
+
+ bacpy(&chan->src, &conn->hcon->src);
+ bacpy(&chan->dst, &conn->hcon->dst);
+ chan->src_type = bdaddr_src_type(conn->hcon);
+ chan->dst_type = bdaddr_dst_type(conn->hcon);
+ chan->psm = psm;
+ chan->dcid = scid;
+ chan->omtu = mtu;
+ chan->remote_mps = mps;
+
+ __l2cap_chan_add(conn, chan);
+
+ l2cap_le_flowctl_init(chan, __le16_to_cpu(req->credits));
+
+ dcid = chan->scid;
+ credits = chan->rx_credits;
+
+ __set_chan_timer(chan, chan->ops->get_sndtimeo(chan));
+
+ chan->ident = cmd->ident;
+
+ if (test_bit(FLAG_DEFER_SETUP, &chan->flags)) {
+ l2cap_state_change(chan, BT_CONNECT2);
+ /* The following result value is actually not defined
+ * for LE CoC but we use it to let the function know
+ * that it should bail out after doing its cleanup
+ * instead of sending a response.
+ */
+ result = L2CAP_CR_PEND;
+ chan->ops->defer(chan);
+ } else {
+ l2cap_chan_ready(chan);
+ result = L2CAP_CR_LE_SUCCESS;
+ }
+
+response_unlock:
+ l2cap_chan_unlock(pchan);
+ l2cap_chan_put(pchan);
+
+ if (result == L2CAP_CR_PEND)
+ return 0;
+
+response:
+ if (chan) {
+ rsp.mtu = cpu_to_le16(chan->imtu);
+ rsp.mps = cpu_to_le16(chan->mps);
+ } else {
+ rsp.mtu = 0;
+ rsp.mps = 0;
+ }
+
+ rsp.dcid = cpu_to_le16(dcid);
+ rsp.credits = cpu_to_le16(credits);
+ rsp.result = cpu_to_le16(result);
+
+ l2cap_send_cmd(conn, cmd->ident, L2CAP_LE_CONN_RSP, sizeof(rsp), &rsp);
+
+ return 0;
+}
+
+static inline int l2cap_le_credits(struct l2cap_conn *conn,
+ struct l2cap_cmd_hdr *cmd, u16 cmd_len,
+ u8 *data)
+{
+ struct l2cap_le_credits *pkt;
+ struct l2cap_chan *chan;
+ u16 cid, credits, max_credits;
+
+ if (cmd_len != sizeof(*pkt))
+ return -EPROTO;
+
+ pkt = (struct l2cap_le_credits *) data;
+ cid = __le16_to_cpu(pkt->cid);
+ credits = __le16_to_cpu(pkt->credits);
+
+ BT_DBG("cid 0x%4.4x credits 0x%4.4x", cid, credits);
+
+ chan = l2cap_get_chan_by_dcid(conn, cid);
+ if (!chan)
+ return -EBADSLT;
+
+ max_credits = LE_FLOWCTL_MAX_CREDITS - chan->tx_credits;
+ if (credits > max_credits) {
+ BT_ERR("LE credits overflow");
+ l2cap_send_disconn_req(chan, ECONNRESET);
+
+ /* Return 0 so that we don't trigger an unnecessary
+ * command reject packet.
+ */
+ goto unlock;
+ }
+
+ chan->tx_credits += credits;
+
+ /* Resume sending */
+ l2cap_le_flowctl_send(chan);
+
+ if (chan->tx_credits)
+ chan->ops->resume(chan);
+
+unlock:
+ l2cap_chan_unlock(chan);
+ l2cap_chan_put(chan);
+
+ return 0;
+}
+
+static inline int l2cap_ecred_conn_req(struct l2cap_conn *conn,
+ struct l2cap_cmd_hdr *cmd, u16 cmd_len,
+ u8 *data)
+{
+ struct l2cap_ecred_conn_req *req = (void *) data;
+ DEFINE_RAW_FLEX(struct l2cap_ecred_conn_rsp, pdu, dcid, L2CAP_ECRED_MAX_CID);
+ struct l2cap_chan *chan, *pchan;
+ u16 mtu, mps;
+ __le16 psm;
+ u8 result, len = 0;
+ int i, num_scid;
+ bool defer = false;
+
+ if (!enable_ecred)
+ return -EINVAL;
+
+ if (cmd_len < sizeof(*req) || (cmd_len - sizeof(*req)) % sizeof(u16)) {
+ result = L2CAP_CR_LE_INVALID_PARAMS;
+ goto response;
+ }
+
+ cmd_len -= sizeof(*req);
+ num_scid = cmd_len / sizeof(u16);
+
+ if (num_scid > L2CAP_ECRED_MAX_CID) {
+ result = L2CAP_CR_LE_INVALID_PARAMS;
+ goto response;
+ }
+
+ mtu = __le16_to_cpu(req->mtu);
+ mps = __le16_to_cpu(req->mps);
+
+ if (mtu < L2CAP_ECRED_MIN_MTU || mps < L2CAP_ECRED_MIN_MPS) {
+ result = L2CAP_CR_LE_UNACCEPT_PARAMS;
+ goto response;
+ }
+
+ psm = req->psm;
+
+ /* BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 3, Part A
+ * page 1059:
+ *
+ * Valid range: 0x0001-0x00ff
+ *
+ * Table 4.15: L2CAP_LE_CREDIT_BASED_CONNECTION_REQ SPSM ranges
+ */
+ if (!psm || __le16_to_cpu(psm) > L2CAP_PSM_LE_DYN_END) {
+ result = L2CAP_CR_LE_BAD_PSM;
+ goto response;
+ }
+
+ BT_DBG("psm 0x%2.2x mtu %u mps %u", __le16_to_cpu(psm), mtu, mps);
+
+ memset(pdu, 0, sizeof(*pdu));
+
+ /* Check if we have socket listening on psm */
+ pchan = l2cap_global_chan_by_psm(BT_LISTEN, psm, &conn->hcon->src,
+ &conn->hcon->dst, LE_LINK);
+ if (!pchan) {
+ result = L2CAP_CR_LE_BAD_PSM;
+ goto response;
+ }
+
+ l2cap_chan_lock(pchan);
+
+ if (!smp_sufficient_security(conn->hcon, pchan->sec_level,
+ SMP_ALLOW_STK)) {
+ result = L2CAP_CR_LE_AUTHENTICATION;
+ goto unlock;
+ }
+
+ result = L2CAP_CR_LE_SUCCESS;
+
+ for (i = 0; i < num_scid; i++) {
+ u16 scid = __le16_to_cpu(req->scid[i]);
+
+ BT_DBG("scid[%d] 0x%4.4x", i, scid);
+
+ pdu->dcid[i] = 0x0000;
+ len += sizeof(*pdu->dcid);
+
+ /* Check for valid dynamic CID range */
+ if (scid < L2CAP_CID_DYN_START || scid > L2CAP_CID_LE_DYN_END) {
+ result = L2CAP_CR_LE_INVALID_SCID;
+ continue;
+ }
+
+ /* Check if we already have channel with that dcid */
+ if (__l2cap_get_chan_by_dcid(conn, scid)) {
+ result = L2CAP_CR_LE_SCID_IN_USE;
+ continue;
+ }
+
+ chan = pchan->ops->new_connection(pchan);
+ if (!chan) {
+ result = L2CAP_CR_LE_NO_MEM;
+ continue;
+ }
+
+ bacpy(&chan->src, &conn->hcon->src);
+ bacpy(&chan->dst, &conn->hcon->dst);
+ chan->src_type = bdaddr_src_type(conn->hcon);
+ chan->dst_type = bdaddr_dst_type(conn->hcon);
+ chan->psm = psm;
+ chan->dcid = scid;
+ chan->omtu = mtu;
+ chan->remote_mps = mps;
+
+ __l2cap_chan_add(conn, chan);
+
+ l2cap_ecred_init(chan, __le16_to_cpu(req->credits));
+
+ /* Init response */
+ if (!pdu->credits) {
+ pdu->mtu = cpu_to_le16(chan->imtu);
+ pdu->mps = cpu_to_le16(chan->mps);
+ pdu->credits = cpu_to_le16(chan->rx_credits);
+ }
+
+ pdu->dcid[i] = cpu_to_le16(chan->scid);
+
+ __set_chan_timer(chan, chan->ops->get_sndtimeo(chan));
+
+ chan->ident = cmd->ident;
+ chan->mode = L2CAP_MODE_EXT_FLOWCTL;
+
+ if (test_bit(FLAG_DEFER_SETUP, &chan->flags)) {
+ l2cap_state_change(chan, BT_CONNECT2);
+ defer = true;
+ chan->ops->defer(chan);
+ } else {
+ l2cap_chan_ready(chan);
+ }
+ }
+
+unlock:
+ l2cap_chan_unlock(pchan);
+ l2cap_chan_put(pchan);
+
+response:
+ pdu->result = cpu_to_le16(result);
+
+ if (defer)
+ return 0;
+
+ l2cap_send_cmd(conn, cmd->ident, L2CAP_ECRED_CONN_RSP,
+ sizeof(*pdu) + len, pdu);
+
+ return 0;
+}
+
+static inline int l2cap_ecred_conn_rsp(struct l2cap_conn *conn,
+ struct l2cap_cmd_hdr *cmd, u16 cmd_len,
+ u8 *data)
+{
+ struct l2cap_ecred_conn_rsp *rsp = (void *) data;
+ struct hci_conn *hcon = conn->hcon;
+ u16 mtu, mps, credits, result;
+ struct l2cap_chan *chan, *tmp;
+ int err = 0, sec_level;
+ int i = 0;
+
+ if (cmd_len < sizeof(*rsp))
+ return -EPROTO;
+
+ mtu = __le16_to_cpu(rsp->mtu);
+ mps = __le16_to_cpu(rsp->mps);
+ credits = __le16_to_cpu(rsp->credits);
+ result = __le16_to_cpu(rsp->result);
+
+ BT_DBG("mtu %u mps %u credits %u result 0x%4.4x", mtu, mps, credits,
+ result);
+
+ cmd_len -= sizeof(*rsp);
+
+ list_for_each_entry_safe(chan, tmp, &conn->chan_l, list) {
+ u16 dcid;
+
+ if (chan->ident != cmd->ident ||
+ chan->mode != L2CAP_MODE_EXT_FLOWCTL ||
+ chan->state == BT_CONNECTED)
+ continue;
+
+ l2cap_chan_lock(chan);
+
+ /* Check that there is a dcid for each pending channel */
+ if (cmd_len < sizeof(dcid)) {
+ l2cap_chan_del(chan, ECONNREFUSED);
+ l2cap_chan_unlock(chan);
+ continue;
+ }
+
+ dcid = __le16_to_cpu(rsp->dcid[i++]);
+ cmd_len -= sizeof(u16);
+
+ BT_DBG("dcid[%d] 0x%4.4x", i, dcid);
+
+ /* Check if dcid is already in use */
+ if (dcid && __l2cap_get_chan_by_dcid(conn, dcid)) {
+ /* If a device receives a
+ * L2CAP_CREDIT_BASED_CONNECTION_RSP packet with an
+ * already-assigned Destination CID, then both the
+ * original channel and the new channel shall be
+ * immediately discarded and not used.
+ */
+ l2cap_chan_del(chan, ECONNREFUSED);
+ l2cap_chan_unlock(chan);
+ chan = __l2cap_get_chan_by_dcid(conn, dcid);
+ l2cap_chan_lock(chan);
+ l2cap_chan_del(chan, ECONNRESET);
+ l2cap_chan_unlock(chan);
+ continue;
+ }
+
+ switch (result) {
+ case L2CAP_CR_LE_AUTHENTICATION:
+ case L2CAP_CR_LE_ENCRYPTION:
+ /* If we already have MITM protection we can't do
+ * anything.
+ */
+ if (hcon->sec_level > BT_SECURITY_MEDIUM) {
+ l2cap_chan_del(chan, ECONNREFUSED);
+ break;
+ }
+
+ sec_level = hcon->sec_level + 1;
+ if (chan->sec_level < sec_level)
+ chan->sec_level = sec_level;
+
+ /* We'll need to send a new Connect Request */
+ clear_bit(FLAG_ECRED_CONN_REQ_SENT, &chan->flags);
+
+ smp_conn_security(hcon, chan->sec_level);
+ break;
+
+ case L2CAP_CR_LE_BAD_PSM:
+ l2cap_chan_del(chan, ECONNREFUSED);
+ break;
+
+ default:
+ /* If dcid was not set it means channels was refused */
+ if (!dcid) {
+ l2cap_chan_del(chan, ECONNREFUSED);
+ break;
+ }
+
+ chan->ident = 0;
+ chan->dcid = dcid;
+ chan->omtu = mtu;
+ chan->remote_mps = mps;
+ chan->tx_credits = credits;
+ l2cap_chan_ready(chan);
+ break;
+ }
+
+ l2cap_chan_unlock(chan);
+ }
+
+ return err;
+}
+
+static inline int l2cap_ecred_reconf_req(struct l2cap_conn *conn,
+ struct l2cap_cmd_hdr *cmd, u16 cmd_len,
+ u8 *data)
+{
+ struct l2cap_ecred_reconf_req *req = (void *) data;
+ struct l2cap_ecred_reconf_rsp rsp;
+ u16 mtu, mps, result;
+ struct l2cap_chan *chan;
+ int i, num_scid;
+
+ if (!enable_ecred)
+ return -EINVAL;
+
+ if (cmd_len < sizeof(*req) || cmd_len - sizeof(*req) % sizeof(u16)) {
+ result = L2CAP_CR_LE_INVALID_PARAMS;
+ goto respond;
+ }
+
+ mtu = __le16_to_cpu(req->mtu);
+ mps = __le16_to_cpu(req->mps);
+
+ BT_DBG("mtu %u mps %u", mtu, mps);
+
+ if (mtu < L2CAP_ECRED_MIN_MTU) {
+ result = L2CAP_RECONF_INVALID_MTU;
+ goto respond;
+ }
+
+ if (mps < L2CAP_ECRED_MIN_MPS) {
+ result = L2CAP_RECONF_INVALID_MPS;
+ goto respond;
+ }
+
+ cmd_len -= sizeof(*req);
+ num_scid = cmd_len / sizeof(u16);
+ result = L2CAP_RECONF_SUCCESS;
+
+ for (i = 0; i < num_scid; i++) {
+ u16 scid;
+
+ scid = __le16_to_cpu(req->scid[i]);
+ if (!scid)
+ return -EPROTO;
+
+ chan = __l2cap_get_chan_by_dcid(conn, scid);
+ if (!chan)
+ continue;
+
+ /* If the MTU value is decreased for any of the included
+ * channels, then the receiver shall disconnect all
+ * included channels.
+ */
+ if (chan->omtu > mtu) {
+ BT_ERR("chan %p decreased MTU %u -> %u", chan,
+ chan->omtu, mtu);
+ result = L2CAP_RECONF_INVALID_MTU;
+ }
+
+ chan->omtu = mtu;
+ chan->remote_mps = mps;
+ }
+
+respond:
+ rsp.result = cpu_to_le16(result);
+
+ l2cap_send_cmd(conn, cmd->ident, L2CAP_ECRED_RECONF_RSP, sizeof(rsp),
+ &rsp);
+
+ return 0;
+}
+
+static inline int l2cap_ecred_reconf_rsp(struct l2cap_conn *conn,
+ struct l2cap_cmd_hdr *cmd, u16 cmd_len,
+ u8 *data)
+{
+ struct l2cap_chan *chan, *tmp;
+ struct l2cap_ecred_conn_rsp *rsp = (void *) data;
+ u16 result;
+
+ if (cmd_len < sizeof(*rsp))
+ return -EPROTO;
+
+ result = __le16_to_cpu(rsp->result);
+
+ BT_DBG("result 0x%4.4x", rsp->result);
+
+ if (!result)
+ return 0;
+
+ list_for_each_entry_safe(chan, tmp, &conn->chan_l, list) {
+ if (chan->ident != cmd->ident)
+ continue;
+
+ l2cap_chan_del(chan, ECONNRESET);
+ }
+
+ return 0;
+}
+
+static inline int l2cap_le_command_rej(struct l2cap_conn *conn,
+ struct l2cap_cmd_hdr *cmd, u16 cmd_len,
+ u8 *data)
+{
+ struct l2cap_cmd_rej_unk *rej = (struct l2cap_cmd_rej_unk *) data;
+ struct l2cap_chan *chan;
+
+ if (cmd_len < sizeof(*rej))
+ return -EPROTO;
+
+ chan = __l2cap_get_chan_by_ident(conn, cmd->ident);
+ if (!chan)
+ goto done;
+
+ chan = l2cap_chan_hold_unless_zero(chan);
+ if (!chan)
+ goto done;
+
+ l2cap_chan_lock(chan);
+ l2cap_chan_del(chan, ECONNREFUSED);
+ l2cap_chan_unlock(chan);
+ l2cap_chan_put(chan);
+
+done:
+ return 0;
+}
+
+static inline int l2cap_le_sig_cmd(struct l2cap_conn *conn,
+ struct l2cap_cmd_hdr *cmd, u16 cmd_len,
+ u8 *data)
+{
+ int err = 0;
+
+ switch (cmd->code) {
+ case L2CAP_COMMAND_REJ:
+ l2cap_le_command_rej(conn, cmd, cmd_len, data);
+ break;
+
+ case L2CAP_CONN_PARAM_UPDATE_REQ:
+ err = l2cap_conn_param_update_req(conn, cmd, cmd_len, data);
+ break;
+
+ case L2CAP_CONN_PARAM_UPDATE_RSP:
+ break;
+
+ case L2CAP_LE_CONN_RSP:
+ l2cap_le_connect_rsp(conn, cmd, cmd_len, data);
+ break;
+
+ case L2CAP_LE_CONN_REQ:
+ err = l2cap_le_connect_req(conn, cmd, cmd_len, data);
+ break;
+
+ case L2CAP_LE_CREDITS:
+ err = l2cap_le_credits(conn, cmd, cmd_len, data);
+ break;
+
+ case L2CAP_ECRED_CONN_REQ:
+ err = l2cap_ecred_conn_req(conn, cmd, cmd_len, data);
+ break;
+
+ case L2CAP_ECRED_CONN_RSP:
+ err = l2cap_ecred_conn_rsp(conn, cmd, cmd_len, data);
+ break;
+
+ case L2CAP_ECRED_RECONF_REQ:
+ err = l2cap_ecred_reconf_req(conn, cmd, cmd_len, data);
+ break;
+
+ case L2CAP_ECRED_RECONF_RSP:
+ err = l2cap_ecred_reconf_rsp(conn, cmd, cmd_len, data);
+ break;
+
+ case L2CAP_DISCONN_REQ:
+ err = l2cap_disconnect_req(conn, cmd, cmd_len, data);
+ break;
+
+ case L2CAP_DISCONN_RSP:
+ l2cap_disconnect_rsp(conn, cmd, cmd_len, data);
+ break;
+
+ default:
+ BT_ERR("Unknown LE signaling command 0x%2.2x", cmd->code);
+ err = -EINVAL;
+ break;
+ }
+
+ return err;
+}
+
+static inline void l2cap_le_sig_channel(struct l2cap_conn *conn,
+ struct sk_buff *skb)
+{
+ struct hci_conn *hcon = conn->hcon;
+ struct l2cap_cmd_hdr *cmd;
+ u16 len;
+ int err;
+
+ if (hcon->type != LE_LINK)
+ goto drop;
+
+ if (skb->len < L2CAP_CMD_HDR_SIZE)
+ goto drop;
+
+ cmd = (void *) skb->data;
+ skb_pull(skb, L2CAP_CMD_HDR_SIZE);
+
+ len = le16_to_cpu(cmd->len);
+
+ BT_DBG("code 0x%2.2x len %d id 0x%2.2x", cmd->code, len, cmd->ident);
+
+ if (len != skb->len || !cmd->ident) {
+ BT_DBG("corrupted command");
+ goto drop;
+ }
+
+ err = l2cap_le_sig_cmd(conn, cmd, len, skb->data);
+ if (err) {
+ struct l2cap_cmd_rej_unk rej;
+
+ BT_ERR("Wrong link type (%d)", err);
+
+ rej.reason = cpu_to_le16(L2CAP_REJ_NOT_UNDERSTOOD);
+ l2cap_send_cmd(conn, cmd->ident, L2CAP_COMMAND_REJ,
+ sizeof(rej), &rej);
+ }
+
+drop:
+ kfree_skb(skb);
+}
+
+static inline void l2cap_sig_send_rej(struct l2cap_conn *conn, u16 ident)
+{
+ struct l2cap_cmd_rej_unk rej;
+
+ rej.reason = cpu_to_le16(L2CAP_REJ_NOT_UNDERSTOOD);
+ l2cap_send_cmd(conn, ident, L2CAP_COMMAND_REJ, sizeof(rej), &rej);
+}
+
+static inline void l2cap_sig_channel(struct l2cap_conn *conn,
+ struct sk_buff *skb)
+{
+ struct hci_conn *hcon = conn->hcon;
+ struct l2cap_cmd_hdr *cmd;
+ int err;
+
+ l2cap_raw_recv(conn, skb);
+
+ if (hcon->type != ACL_LINK)
+ goto drop;
+
+ while (skb->len >= L2CAP_CMD_HDR_SIZE) {
+ u16 len;
+
+ cmd = (void *) skb->data;
+ skb_pull(skb, L2CAP_CMD_HDR_SIZE);
+
+ len = le16_to_cpu(cmd->len);
+
+ BT_DBG("code 0x%2.2x len %d id 0x%2.2x", cmd->code, len,
+ cmd->ident);
+
+ if (len > skb->len || !cmd->ident) {
+ BT_DBG("corrupted command");
+ l2cap_sig_send_rej(conn, cmd->ident);
+ skb_pull(skb, len > skb->len ? skb->len : len);
+ continue;
+ }
+
+ err = l2cap_bredr_sig_cmd(conn, cmd, len, skb->data);
+ if (err) {
+ BT_ERR("Wrong link type (%d)", err);
+ l2cap_sig_send_rej(conn, cmd->ident);
+ }
+
+ skb_pull(skb, len);
+ }
+
+ if (skb->len > 0) {
+ BT_DBG("corrupted command");
+ l2cap_sig_send_rej(conn, 0);
+ }
+
+drop:
+ kfree_skb(skb);
+}
+
+static int l2cap_check_fcs(struct l2cap_chan *chan, struct sk_buff *skb)
+{
+ u16 our_fcs, rcv_fcs;
+ int hdr_size;
+
+ if (test_bit(FLAG_EXT_CTRL, &chan->flags))
+ hdr_size = L2CAP_EXT_HDR_SIZE;
+ else
+ hdr_size = L2CAP_ENH_HDR_SIZE;
+
+ if (chan->fcs == L2CAP_FCS_CRC16) {
+ skb_trim(skb, skb->len - L2CAP_FCS_SIZE);
+ rcv_fcs = get_unaligned_le16(skb->data + skb->len);
+ our_fcs = crc16(0, skb->data - hdr_size, skb->len + hdr_size);
+
+ if (our_fcs != rcv_fcs)
+ return -EBADMSG;
+ }
+ return 0;
+}
+
+static void l2cap_send_i_or_rr_or_rnr(struct l2cap_chan *chan)
+{
+ struct l2cap_ctrl control;
+
+ BT_DBG("chan %p", chan);
+
+ memset(&control, 0, sizeof(control));
+ control.sframe = 1;
+ control.final = 1;
+ control.reqseq = chan->buffer_seq;
+ set_bit(CONN_SEND_FBIT, &chan->conn_state);
+
+ if (test_bit(CONN_LOCAL_BUSY, &chan->conn_state)) {
+ control.super = L2CAP_SUPER_RNR;
+ l2cap_send_sframe(chan, &control);
+ }
+
+ if (test_and_clear_bit(CONN_REMOTE_BUSY, &chan->conn_state) &&
+ chan->unacked_frames > 0)
+ __set_retrans_timer(chan);
+
+ /* Send pending iframes */
+ l2cap_ertm_send(chan);
+
+ if (!test_bit(CONN_LOCAL_BUSY, &chan->conn_state) &&
+ test_bit(CONN_SEND_FBIT, &chan->conn_state)) {
+ /* F-bit wasn't sent in an s-frame or i-frame yet, so
+ * send it now.
+ */
+ control.super = L2CAP_SUPER_RR;
+ l2cap_send_sframe(chan, &control);
+ }
+}
+
+static void append_skb_frag(struct sk_buff *skb, struct sk_buff *new_frag,
+ struct sk_buff **last_frag)
+{
+ /* skb->len reflects data in skb as well as all fragments
+ * skb->data_len reflects only data in fragments
+ */
+ if (!skb_has_frag_list(skb))
+ skb_shinfo(skb)->frag_list = new_frag;
+
+ new_frag->next = NULL;
+
+ (*last_frag)->next = new_frag;
+ *last_frag = new_frag;
+
+ skb->len += new_frag->len;
+ skb->data_len += new_frag->len;
+ skb->truesize += new_frag->truesize;
+}
+
+static int l2cap_reassemble_sdu(struct l2cap_chan *chan, struct sk_buff *skb,
+ struct l2cap_ctrl *control)
+{
+ int err = -EINVAL;
+
+ switch (control->sar) {
+ case L2CAP_SAR_UNSEGMENTED:
+ if (chan->sdu)
+ break;
+
+ err = chan->ops->recv(chan, skb);
+ break;
+
+ case L2CAP_SAR_START:
+ if (chan->sdu)
+ break;
+
+ if (!pskb_may_pull(skb, L2CAP_SDULEN_SIZE))
+ break;
+
+ chan->sdu_len = get_unaligned_le16(skb->data);
+ skb_pull(skb, L2CAP_SDULEN_SIZE);
+
+ if (chan->sdu_len > chan->imtu) {
+ err = -EMSGSIZE;
+ break;
+ }
+
+ if (skb->len >= chan->sdu_len)
+ break;
+
+ chan->sdu = skb;
+ chan->sdu_last_frag = skb;
+
+ skb = NULL;
+ err = 0;
+ break;
+
+ case L2CAP_SAR_CONTINUE:
+ if (!chan->sdu)
+ break;
+
+ append_skb_frag(chan->sdu, skb,
+ &chan->sdu_last_frag);
+ skb = NULL;
+
+ if (chan->sdu->len >= chan->sdu_len)
+ break;
+
+ err = 0;
+ break;
+
+ case L2CAP_SAR_END:
+ if (!chan->sdu)
+ break;
+
+ append_skb_frag(chan->sdu, skb,
+ &chan->sdu_last_frag);
+ skb = NULL;
+
+ if (chan->sdu->len != chan->sdu_len)
+ break;
+
+ err = chan->ops->recv(chan, chan->sdu);
+
+ if (!err) {
+ /* Reassembly complete */
+ chan->sdu = NULL;
+ chan->sdu_last_frag = NULL;
+ chan->sdu_len = 0;
+ }
+ break;
+ }
+
+ if (err) {
+ kfree_skb(skb);
+ kfree_skb(chan->sdu);
+ chan->sdu = NULL;
+ chan->sdu_last_frag = NULL;
+ chan->sdu_len = 0;
+ }
+
+ return err;
+}
+
+static int l2cap_resegment(struct l2cap_chan *chan)
+{
+ /* Placeholder */
+ return 0;
+}
+
+void l2cap_chan_busy(struct l2cap_chan *chan, int busy)
+{
+ u8 event;
+
+ if (chan->mode != L2CAP_MODE_ERTM)
+ return;
+
+ event = busy ? L2CAP_EV_LOCAL_BUSY_DETECTED : L2CAP_EV_LOCAL_BUSY_CLEAR;
+ l2cap_tx(chan, NULL, NULL, event);
+}
+
+static int l2cap_rx_queued_iframes(struct l2cap_chan *chan)
+{
+ int err = 0;
+ /* Pass sequential frames to l2cap_reassemble_sdu()
+ * until a gap is encountered.
+ */
+
+ BT_DBG("chan %p", chan);
+
+ while (!test_bit(CONN_LOCAL_BUSY, &chan->conn_state)) {
+ struct sk_buff *skb;
+ BT_DBG("Searching for skb with txseq %d (queue len %d)",
+ chan->buffer_seq, skb_queue_len(&chan->srej_q));
+
+ skb = l2cap_ertm_seq_in_queue(&chan->srej_q, chan->buffer_seq);
+
+ if (!skb)
+ break;
+
+ skb_unlink(skb, &chan->srej_q);
+ chan->buffer_seq = __next_seq(chan, chan->buffer_seq);
+ err = l2cap_reassemble_sdu(chan, skb, &bt_cb(skb)->l2cap);
+ if (err)
+ break;
+ }
+
+ if (skb_queue_empty(&chan->srej_q)) {
+ chan->rx_state = L2CAP_RX_STATE_RECV;
+ l2cap_send_ack(chan);
+ }
+
+ return err;
+}
+
+static void l2cap_handle_srej(struct l2cap_chan *chan,
+ struct l2cap_ctrl *control)
+{
+ struct sk_buff *skb;
+
+ BT_DBG("chan %p, control %p", chan, control);
+
+ if (control->reqseq == chan->next_tx_seq) {
+ BT_DBG("Invalid reqseq %d, disconnecting", control->reqseq);
+ l2cap_send_disconn_req(chan, ECONNRESET);
+ return;
+ }
+
+ skb = l2cap_ertm_seq_in_queue(&chan->tx_q, control->reqseq);
+
+ if (skb == NULL) {
+ BT_DBG("Seq %d not available for retransmission",
+ control->reqseq);
+ return;
+ }
+
+ if (chan->max_tx != 0 && bt_cb(skb)->l2cap.retries >= chan->max_tx) {
+ BT_DBG("Retry limit exceeded (%d)", chan->max_tx);
+ l2cap_send_disconn_req(chan, ECONNRESET);
+ return;
+ }
+
+ clear_bit(CONN_REMOTE_BUSY, &chan->conn_state);
+
+ if (control->poll) {
+ l2cap_pass_to_tx(chan, control);
+
+ set_bit(CONN_SEND_FBIT, &chan->conn_state);
+ l2cap_retransmit(chan, control);
+ l2cap_ertm_send(chan);
+
+ if (chan->tx_state == L2CAP_TX_STATE_WAIT_F) {
+ set_bit(CONN_SREJ_ACT, &chan->conn_state);
+ chan->srej_save_reqseq = control->reqseq;
+ }
+ } else {
+ l2cap_pass_to_tx_fbit(chan, control);
+
+ if (control->final) {
+ if (chan->srej_save_reqseq != control->reqseq ||
+ !test_and_clear_bit(CONN_SREJ_ACT,
+ &chan->conn_state))
+ l2cap_retransmit(chan, control);
+ } else {
+ l2cap_retransmit(chan, control);
+ if (chan->tx_state == L2CAP_TX_STATE_WAIT_F) {
+ set_bit(CONN_SREJ_ACT, &chan->conn_state);
+ chan->srej_save_reqseq = control->reqseq;
+ }
+ }
+ }
+}
+
+static void l2cap_handle_rej(struct l2cap_chan *chan,
+ struct l2cap_ctrl *control)
+{
+ struct sk_buff *skb;
+
+ BT_DBG("chan %p, control %p", chan, control);
+
+ if (control->reqseq == chan->next_tx_seq) {
+ BT_DBG("Invalid reqseq %d, disconnecting", control->reqseq);
+ l2cap_send_disconn_req(chan, ECONNRESET);
+ return;
+ }
+
+ skb = l2cap_ertm_seq_in_queue(&chan->tx_q, control->reqseq);
+
+ if (chan->max_tx && skb &&
+ bt_cb(skb)->l2cap.retries >= chan->max_tx) {
+ BT_DBG("Retry limit exceeded (%d)", chan->max_tx);
+ l2cap_send_disconn_req(chan, ECONNRESET);
+ return;
+ }
+
+ clear_bit(CONN_REMOTE_BUSY, &chan->conn_state);
+
+ l2cap_pass_to_tx(chan, control);
+
+ if (control->final) {
+ if (!test_and_clear_bit(CONN_REJ_ACT, &chan->conn_state))
+ l2cap_retransmit_all(chan, control);
+ } else {
+ l2cap_retransmit_all(chan, control);
+ l2cap_ertm_send(chan);
+ if (chan->tx_state == L2CAP_TX_STATE_WAIT_F)
+ set_bit(CONN_REJ_ACT, &chan->conn_state);
+ }
+}
+
+static u8 l2cap_classify_txseq(struct l2cap_chan *chan, u16 txseq)
+{
+ BT_DBG("chan %p, txseq %d", chan, txseq);
+
+ BT_DBG("last_acked_seq %d, expected_tx_seq %d", chan->last_acked_seq,
+ chan->expected_tx_seq);
+
+ if (chan->rx_state == L2CAP_RX_STATE_SREJ_SENT) {
+ if (__seq_offset(chan, txseq, chan->last_acked_seq) >=
+ chan->tx_win) {
+ /* See notes below regarding "double poll" and
+ * invalid packets.
+ */
+ if (chan->tx_win <= ((chan->tx_win_max + 1) >> 1)) {
+ BT_DBG("Invalid/Ignore - after SREJ");
+ return L2CAP_TXSEQ_INVALID_IGNORE;
+ } else {
+ BT_DBG("Invalid - in window after SREJ sent");
+ return L2CAP_TXSEQ_INVALID;
+ }
+ }
+
+ if (chan->srej_list.head == txseq) {
+ BT_DBG("Expected SREJ");
+ return L2CAP_TXSEQ_EXPECTED_SREJ;
+ }
+
+ if (l2cap_ertm_seq_in_queue(&chan->srej_q, txseq)) {
+ BT_DBG("Duplicate SREJ - txseq already stored");
+ return L2CAP_TXSEQ_DUPLICATE_SREJ;
+ }
+
+ if (l2cap_seq_list_contains(&chan->srej_list, txseq)) {
+ BT_DBG("Unexpected SREJ - not requested");
+ return L2CAP_TXSEQ_UNEXPECTED_SREJ;
+ }
+ }
+
+ if (chan->expected_tx_seq == txseq) {
+ if (__seq_offset(chan, txseq, chan->last_acked_seq) >=
+ chan->tx_win) {
+ BT_DBG("Invalid - txseq outside tx window");
+ return L2CAP_TXSEQ_INVALID;
+ } else {
+ BT_DBG("Expected");
+ return L2CAP_TXSEQ_EXPECTED;
+ }
+ }
+
+ if (__seq_offset(chan, txseq, chan->last_acked_seq) <
+ __seq_offset(chan, chan->expected_tx_seq, chan->last_acked_seq)) {
+ BT_DBG("Duplicate - expected_tx_seq later than txseq");
+ return L2CAP_TXSEQ_DUPLICATE;
+ }
+
+ if (__seq_offset(chan, txseq, chan->last_acked_seq) >= chan->tx_win) {
+ /* A source of invalid packets is a "double poll" condition,
+ * where delays cause us to send multiple poll packets. If
+ * the remote stack receives and processes both polls,
+ * sequence numbers can wrap around in such a way that a
+ * resent frame has a sequence number that looks like new data
+ * with a sequence gap. This would trigger an erroneous SREJ
+ * request.
+ *
+ * Fortunately, this is impossible with a tx window that's
+ * less than half of the maximum sequence number, which allows
+ * invalid frames to be safely ignored.
+ *
+ * With tx window sizes greater than half of the tx window
+ * maximum, the frame is invalid and cannot be ignored. This
+ * causes a disconnect.
+ */
+
+ if (chan->tx_win <= ((chan->tx_win_max + 1) >> 1)) {
+ BT_DBG("Invalid/Ignore - txseq outside tx window");
+ return L2CAP_TXSEQ_INVALID_IGNORE;
+ } else {
+ BT_DBG("Invalid - txseq outside tx window");
+ return L2CAP_TXSEQ_INVALID;
+ }
+ } else {
+ BT_DBG("Unexpected - txseq indicates missing frames");
+ return L2CAP_TXSEQ_UNEXPECTED;
+ }
+}
+
+static int l2cap_rx_state_recv(struct l2cap_chan *chan,
+ struct l2cap_ctrl *control,
+ struct sk_buff *skb, u8 event)
+{
+ struct l2cap_ctrl local_control;
+ int err = 0;
+ bool skb_in_use = false;
+
+ BT_DBG("chan %p, control %p, skb %p, event %d", chan, control, skb,
+ event);
+
+ switch (event) {
+ case L2CAP_EV_RECV_IFRAME:
+ switch (l2cap_classify_txseq(chan, control->txseq)) {
+ case L2CAP_TXSEQ_EXPECTED:
+ l2cap_pass_to_tx(chan, control);
+
+ if (test_bit(CONN_LOCAL_BUSY, &chan->conn_state)) {
+ BT_DBG("Busy, discarding expected seq %d",
+ control->txseq);
+ break;
+ }
+
+ chan->expected_tx_seq = __next_seq(chan,
+ control->txseq);
+
+ chan->buffer_seq = chan->expected_tx_seq;
+ skb_in_use = true;
+
+ /* l2cap_reassemble_sdu may free skb, hence invalidate
+ * control, so make a copy in advance to use it after
+ * l2cap_reassemble_sdu returns and to avoid the race
+ * condition, for example:
+ *
+ * The current thread calls:
+ * l2cap_reassemble_sdu
+ * chan->ops->recv == l2cap_sock_recv_cb
+ * __sock_queue_rcv_skb
+ * Another thread calls:
+ * bt_sock_recvmsg
+ * skb_recv_datagram
+ * skb_free_datagram
+ * Then the current thread tries to access control, but
+ * it was freed by skb_free_datagram.
+ */
+ local_control = *control;
+ err = l2cap_reassemble_sdu(chan, skb, control);
+ if (err)
+ break;
+
+ if (local_control.final) {
+ if (!test_and_clear_bit(CONN_REJ_ACT,
+ &chan->conn_state)) {
+ local_control.final = 0;
+ l2cap_retransmit_all(chan, &local_control);
+ l2cap_ertm_send(chan);
+ }
+ }
+
+ if (!test_bit(CONN_LOCAL_BUSY, &chan->conn_state))
+ l2cap_send_ack(chan);
+ break;
+ case L2CAP_TXSEQ_UNEXPECTED:
+ l2cap_pass_to_tx(chan, control);
+
+ /* Can't issue SREJ frames in the local busy state.
+ * Drop this frame, it will be seen as missing
+ * when local busy is exited.
+ */
+ if (test_bit(CONN_LOCAL_BUSY, &chan->conn_state)) {
+ BT_DBG("Busy, discarding unexpected seq %d",
+ control->txseq);
+ break;
+ }
+
+ /* There was a gap in the sequence, so an SREJ
+ * must be sent for each missing frame. The
+ * current frame is stored for later use.
+ */
+ skb_queue_tail(&chan->srej_q, skb);
+ skb_in_use = true;
+ BT_DBG("Queued %p (queue len %d)", skb,
+ skb_queue_len(&chan->srej_q));
+
+ clear_bit(CONN_SREJ_ACT, &chan->conn_state);
+ l2cap_seq_list_clear(&chan->srej_list);
+ l2cap_send_srej(chan, control->txseq);
+
+ chan->rx_state = L2CAP_RX_STATE_SREJ_SENT;
+ break;
+ case L2CAP_TXSEQ_DUPLICATE:
+ l2cap_pass_to_tx(chan, control);
+ break;
+ case L2CAP_TXSEQ_INVALID_IGNORE:
+ break;
+ case L2CAP_TXSEQ_INVALID:
+ default:
+ l2cap_send_disconn_req(chan, ECONNRESET);
+ break;
+ }
+ break;
+ case L2CAP_EV_RECV_RR:
+ l2cap_pass_to_tx(chan, control);
+ if (control->final) {
+ clear_bit(CONN_REMOTE_BUSY, &chan->conn_state);
+
+ if (!test_and_clear_bit(CONN_REJ_ACT,
+ &chan->conn_state)) {
+ control->final = 0;
+ l2cap_retransmit_all(chan, control);
+ }
+
+ l2cap_ertm_send(chan);
+ } else if (control->poll) {
+ l2cap_send_i_or_rr_or_rnr(chan);
+ } else {
+ if (test_and_clear_bit(CONN_REMOTE_BUSY,
+ &chan->conn_state) &&
+ chan->unacked_frames)
+ __set_retrans_timer(chan);
+
+ l2cap_ertm_send(chan);
+ }
+ break;
+ case L2CAP_EV_RECV_RNR:
+ set_bit(CONN_REMOTE_BUSY, &chan->conn_state);
+ l2cap_pass_to_tx(chan, control);
+ if (control && control->poll) {
+ set_bit(CONN_SEND_FBIT, &chan->conn_state);
+ l2cap_send_rr_or_rnr(chan, 0);
+ }
+ __clear_retrans_timer(chan);
+ l2cap_seq_list_clear(&chan->retrans_list);
+ break;
+ case L2CAP_EV_RECV_REJ:
+ l2cap_handle_rej(chan, control);
+ break;
+ case L2CAP_EV_RECV_SREJ:
+ l2cap_handle_srej(chan, control);
+ break;
+ default:
+ break;
+ }
+
+ if (skb && !skb_in_use) {
+ BT_DBG("Freeing %p", skb);
+ kfree_skb(skb);
+ }
+
+ return err;
+}
+
+static int l2cap_rx_state_srej_sent(struct l2cap_chan *chan,
+ struct l2cap_ctrl *control,
+ struct sk_buff *skb, u8 event)
+{
+ int err = 0;
+ u16 txseq = control->txseq;
+ bool skb_in_use = false;
+
+ BT_DBG("chan %p, control %p, skb %p, event %d", chan, control, skb,
+ event);
+
+ switch (event) {
+ case L2CAP_EV_RECV_IFRAME:
+ switch (l2cap_classify_txseq(chan, txseq)) {
+ case L2CAP_TXSEQ_EXPECTED:
+ /* Keep frame for reassembly later */
+ l2cap_pass_to_tx(chan, control);
+ skb_queue_tail(&chan->srej_q, skb);
+ skb_in_use = true;
+ BT_DBG("Queued %p (queue len %d)", skb,
+ skb_queue_len(&chan->srej_q));
+
+ chan->expected_tx_seq = __next_seq(chan, txseq);
+ break;
+ case L2CAP_TXSEQ_EXPECTED_SREJ:
+ l2cap_seq_list_pop(&chan->srej_list);
+
+ l2cap_pass_to_tx(chan, control);
+ skb_queue_tail(&chan->srej_q, skb);
+ skb_in_use = true;
+ BT_DBG("Queued %p (queue len %d)", skb,
+ skb_queue_len(&chan->srej_q));
+
+ err = l2cap_rx_queued_iframes(chan);
+ if (err)
+ break;
+
+ break;
+ case L2CAP_TXSEQ_UNEXPECTED:
+ /* Got a frame that can't be reassembled yet.
+ * Save it for later, and send SREJs to cover
+ * the missing frames.
+ */
+ skb_queue_tail(&chan->srej_q, skb);
+ skb_in_use = true;
+ BT_DBG("Queued %p (queue len %d)", skb,
+ skb_queue_len(&chan->srej_q));
+
+ l2cap_pass_to_tx(chan, control);
+ l2cap_send_srej(chan, control->txseq);
+ break;
+ case L2CAP_TXSEQ_UNEXPECTED_SREJ:
+ /* This frame was requested with an SREJ, but
+ * some expected retransmitted frames are
+ * missing. Request retransmission of missing
+ * SREJ'd frames.
+ */
+ skb_queue_tail(&chan->srej_q, skb);
+ skb_in_use = true;
+ BT_DBG("Queued %p (queue len %d)", skb,
+ skb_queue_len(&chan->srej_q));
+
+ l2cap_pass_to_tx(chan, control);
+ l2cap_send_srej_list(chan, control->txseq);
+ break;
+ case L2CAP_TXSEQ_DUPLICATE_SREJ:
+ /* We've already queued this frame. Drop this copy. */
+ l2cap_pass_to_tx(chan, control);
+ break;
+ case L2CAP_TXSEQ_DUPLICATE:
+ /* Expecting a later sequence number, so this frame
+ * was already received. Ignore it completely.
+ */
+ break;
+ case L2CAP_TXSEQ_INVALID_IGNORE:
+ break;
+ case L2CAP_TXSEQ_INVALID:
+ default:
+ l2cap_send_disconn_req(chan, ECONNRESET);
+ break;
+ }
+ break;
+ case L2CAP_EV_RECV_RR:
+ l2cap_pass_to_tx(chan, control);
+ if (control->final) {
+ clear_bit(CONN_REMOTE_BUSY, &chan->conn_state);
+
+ if (!test_and_clear_bit(CONN_REJ_ACT,
+ &chan->conn_state)) {
+ control->final = 0;
+ l2cap_retransmit_all(chan, control);
+ }
+
+ l2cap_ertm_send(chan);
+ } else if (control->poll) {
+ if (test_and_clear_bit(CONN_REMOTE_BUSY,
+ &chan->conn_state) &&
+ chan->unacked_frames) {
+ __set_retrans_timer(chan);
+ }
+
+ set_bit(CONN_SEND_FBIT, &chan->conn_state);
+ l2cap_send_srej_tail(chan);
+ } else {
+ if (test_and_clear_bit(CONN_REMOTE_BUSY,
+ &chan->conn_state) &&
+ chan->unacked_frames)
+ __set_retrans_timer(chan);
+
+ l2cap_send_ack(chan);
+ }
+ break;
+ case L2CAP_EV_RECV_RNR:
+ set_bit(CONN_REMOTE_BUSY, &chan->conn_state);
+ l2cap_pass_to_tx(chan, control);
+ if (control->poll) {
+ l2cap_send_srej_tail(chan);
+ } else {
+ struct l2cap_ctrl rr_control;
+ memset(&rr_control, 0, sizeof(rr_control));
+ rr_control.sframe = 1;
+ rr_control.super = L2CAP_SUPER_RR;
+ rr_control.reqseq = chan->buffer_seq;
+ l2cap_send_sframe(chan, &rr_control);
+ }
+
+ break;
+ case L2CAP_EV_RECV_REJ:
+ l2cap_handle_rej(chan, control);
+ break;
+ case L2CAP_EV_RECV_SREJ:
+ l2cap_handle_srej(chan, control);
+ break;
+ }
+
+ if (skb && !skb_in_use) {
+ BT_DBG("Freeing %p", skb);
+ kfree_skb(skb);
+ }
+
+ return err;
+}
+
+static int l2cap_finish_move(struct l2cap_chan *chan)
+{
+ BT_DBG("chan %p", chan);
+
+ chan->rx_state = L2CAP_RX_STATE_RECV;
+ chan->conn->mtu = chan->conn->hcon->mtu;
+
+ return l2cap_resegment(chan);
+}
+
+static int l2cap_rx_state_wait_p(struct l2cap_chan *chan,
+ struct l2cap_ctrl *control,
+ struct sk_buff *skb, u8 event)
+{
+ int err;
+
+ BT_DBG("chan %p, control %p, skb %p, event %d", chan, control, skb,
+ event);
+
+ if (!control->poll)
+ return -EPROTO;
+
+ l2cap_process_reqseq(chan, control->reqseq);
+
+ if (!skb_queue_empty(&chan->tx_q))
+ chan->tx_send_head = skb_peek(&chan->tx_q);
+ else
+ chan->tx_send_head = NULL;
+
+ /* Rewind next_tx_seq to the point expected
+ * by the receiver.
+ */
+ chan->next_tx_seq = control->reqseq;
+ chan->unacked_frames = 0;
+
+ err = l2cap_finish_move(chan);
+ if (err)
+ return err;
+
+ set_bit(CONN_SEND_FBIT, &chan->conn_state);
+ l2cap_send_i_or_rr_or_rnr(chan);
+
+ if (event == L2CAP_EV_RECV_IFRAME)
+ return -EPROTO;
+
+ return l2cap_rx_state_recv(chan, control, NULL, event);
+}
+
+static int l2cap_rx_state_wait_f(struct l2cap_chan *chan,
+ struct l2cap_ctrl *control,
+ struct sk_buff *skb, u8 event)
+{
+ int err;
+
+ if (!control->final)
+ return -EPROTO;
+
+ clear_bit(CONN_REMOTE_BUSY, &chan->conn_state);
+
+ chan->rx_state = L2CAP_RX_STATE_RECV;
+ l2cap_process_reqseq(chan, control->reqseq);
+
+ if (!skb_queue_empty(&chan->tx_q))
+ chan->tx_send_head = skb_peek(&chan->tx_q);
+ else
+ chan->tx_send_head = NULL;
+
+ /* Rewind next_tx_seq to the point expected
+ * by the receiver.
+ */
+ chan->next_tx_seq = control->reqseq;
+ chan->unacked_frames = 0;
+ chan->conn->mtu = chan->conn->hcon->mtu;
+
+ err = l2cap_resegment(chan);
+
+ if (!err)
+ err = l2cap_rx_state_recv(chan, control, skb, event);
+
+ return err;
+}
+
+static bool __valid_reqseq(struct l2cap_chan *chan, u16 reqseq)
+{
+ /* Make sure reqseq is for a packet that has been sent but not acked */
+ u16 unacked;
+
+ unacked = __seq_offset(chan, chan->next_tx_seq, chan->expected_ack_seq);
+ return __seq_offset(chan, chan->next_tx_seq, reqseq) <= unacked;
+}
+
+static int l2cap_rx(struct l2cap_chan *chan, struct l2cap_ctrl *control,
+ struct sk_buff *skb, u8 event)
+{
+ int err = 0;
+
+ BT_DBG("chan %p, control %p, skb %p, event %d, state %d", chan,
+ control, skb, event, chan->rx_state);
+
+ if (__valid_reqseq(chan, control->reqseq)) {
+ switch (chan->rx_state) {
+ case L2CAP_RX_STATE_RECV:
+ err = l2cap_rx_state_recv(chan, control, skb, event);
+ break;
+ case L2CAP_RX_STATE_SREJ_SENT:
+ err = l2cap_rx_state_srej_sent(chan, control, skb,
+ event);
+ break;
+ case L2CAP_RX_STATE_WAIT_P:
+ err = l2cap_rx_state_wait_p(chan, control, skb, event);
+ break;
+ case L2CAP_RX_STATE_WAIT_F:
+ err = l2cap_rx_state_wait_f(chan, control, skb, event);
+ break;
+ default:
+ /* shut it down */
+ break;
+ }
+ } else {
+ BT_DBG("Invalid reqseq %d (next_tx_seq %d, expected_ack_seq %d",
+ control->reqseq, chan->next_tx_seq,
+ chan->expected_ack_seq);
+ l2cap_send_disconn_req(chan, ECONNRESET);
+ }
+
+ return err;
+}
+
+static int l2cap_stream_rx(struct l2cap_chan *chan, struct l2cap_ctrl *control,
+ struct sk_buff *skb)
+{
+ /* l2cap_reassemble_sdu may free skb, hence invalidate control, so store
+ * the txseq field in advance to use it after l2cap_reassemble_sdu
+ * returns and to avoid the race condition, for example:
+ *
+ * The current thread calls:
+ * l2cap_reassemble_sdu
+ * chan->ops->recv == l2cap_sock_recv_cb
+ * __sock_queue_rcv_skb
+ * Another thread calls:
+ * bt_sock_recvmsg
+ * skb_recv_datagram
+ * skb_free_datagram
+ * Then the current thread tries to access control, but it was freed by
+ * skb_free_datagram.
+ */
+ u16 txseq = control->txseq;
+
+ BT_DBG("chan %p, control %p, skb %p, state %d", chan, control, skb,
+ chan->rx_state);
+
+ if (l2cap_classify_txseq(chan, txseq) == L2CAP_TXSEQ_EXPECTED) {
+ l2cap_pass_to_tx(chan, control);
+
+ BT_DBG("buffer_seq %u->%u", chan->buffer_seq,
+ __next_seq(chan, chan->buffer_seq));
+
+ chan->buffer_seq = __next_seq(chan, chan->buffer_seq);
+
+ l2cap_reassemble_sdu(chan, skb, control);
+ } else {
+ if (chan->sdu) {
+ kfree_skb(chan->sdu);
+ chan->sdu = NULL;
+ }
+ chan->sdu_last_frag = NULL;
+ chan->sdu_len = 0;
+
+ if (skb) {
+ BT_DBG("Freeing %p", skb);
+ kfree_skb(skb);
+ }
+ }
+
+ chan->last_acked_seq = txseq;
+ chan->expected_tx_seq = __next_seq(chan, txseq);
+
+ return 0;
+}
+
+static int l2cap_data_rcv(struct l2cap_chan *chan, struct sk_buff *skb)
+{
+ struct l2cap_ctrl *control = &bt_cb(skb)->l2cap;
+ u16 len;
+ u8 event;
+
+ __unpack_control(chan, skb);
+
+ len = skb->len;
+
+ /*
+ * We can just drop the corrupted I-frame here.
+ * Receiver will miss it and start proper recovery
+ * procedures and ask for retransmission.
+ */
+ if (l2cap_check_fcs(chan, skb))
+ goto drop;
+
+ if (!control->sframe && control->sar == L2CAP_SAR_START)
+ len -= L2CAP_SDULEN_SIZE;
+
+ if (chan->fcs == L2CAP_FCS_CRC16)
+ len -= L2CAP_FCS_SIZE;
+
+ if (len > chan->mps) {
+ l2cap_send_disconn_req(chan, ECONNRESET);
+ goto drop;
+ }
+
+ if (chan->ops->filter) {
+ if (chan->ops->filter(chan, skb))
+ goto drop;
+ }
+
+ if (!control->sframe) {
+ int err;
+
+ BT_DBG("iframe sar %d, reqseq %d, final %d, txseq %d",
+ control->sar, control->reqseq, control->final,
+ control->txseq);
+
+ /* Validate F-bit - F=0 always valid, F=1 only
+ * valid in TX WAIT_F
+ */
+ if (control->final && chan->tx_state != L2CAP_TX_STATE_WAIT_F)
+ goto drop;
+
+ if (chan->mode != L2CAP_MODE_STREAMING) {
+ event = L2CAP_EV_RECV_IFRAME;
+ err = l2cap_rx(chan, control, skb, event);
+ } else {
+ err = l2cap_stream_rx(chan, control, skb);
+ }
+
+ if (err)
+ l2cap_send_disconn_req(chan, ECONNRESET);
+ } else {
+ const u8 rx_func_to_event[4] = {
+ L2CAP_EV_RECV_RR, L2CAP_EV_RECV_REJ,
+ L2CAP_EV_RECV_RNR, L2CAP_EV_RECV_SREJ
+ };
+
+ /* Only I-frames are expected in streaming mode */
+ if (chan->mode == L2CAP_MODE_STREAMING)
+ goto drop;
+
+ BT_DBG("sframe reqseq %d, final %d, poll %d, super %d",
+ control->reqseq, control->final, control->poll,
+ control->super);
+
+ if (len != 0) {
+ BT_ERR("Trailing bytes: %d in sframe", len);
+ l2cap_send_disconn_req(chan, ECONNRESET);
+ goto drop;
+ }
+
+ /* Validate F and P bits */
+ if (control->final && (control->poll ||
+ chan->tx_state != L2CAP_TX_STATE_WAIT_F))
+ goto drop;
+
+ event = rx_func_to_event[control->super];
+ if (l2cap_rx(chan, control, skb, event))
+ l2cap_send_disconn_req(chan, ECONNRESET);
+ }
+
+ return 0;
+
+drop:
+ kfree_skb(skb);
+ return 0;
+}
+
+static void l2cap_chan_le_send_credits(struct l2cap_chan *chan)
+{
+ struct l2cap_conn *conn = chan->conn;
+ struct l2cap_le_credits pkt;
+ u16 return_credits = l2cap_le_rx_credits(chan);
+
+ if (chan->rx_credits >= return_credits)
+ return;
+
+ return_credits -= chan->rx_credits;
+
+ BT_DBG("chan %p returning %u credits to sender", chan, return_credits);
+
+ chan->rx_credits += return_credits;
+
+ pkt.cid = cpu_to_le16(chan->scid);
+ pkt.credits = cpu_to_le16(return_credits);
+
+ chan->ident = l2cap_get_ident(conn);
+
+ l2cap_send_cmd(conn, chan->ident, L2CAP_LE_CREDITS, sizeof(pkt), &pkt);
+}
+
+void l2cap_chan_rx_avail(struct l2cap_chan *chan, ssize_t rx_avail)
+{
+ if (chan->rx_avail == rx_avail)
+ return;
+
+ BT_DBG("chan %p has %zd bytes avail for rx", chan, rx_avail);
+
+ chan->rx_avail = rx_avail;
+
+ if (chan->state == BT_CONNECTED)
+ l2cap_chan_le_send_credits(chan);
+}
+
+static int l2cap_ecred_recv(struct l2cap_chan *chan, struct sk_buff *skb)
+{
+ int err;
+
+ BT_DBG("SDU reassemble complete: chan %p skb->len %u", chan, skb->len);
+
+ /* Wait recv to confirm reception before updating the credits */
+ err = chan->ops->recv(chan, skb);
+
+ if (err < 0 && chan->rx_avail != -1) {
+ BT_ERR("Queueing received LE L2CAP data failed");
+ l2cap_send_disconn_req(chan, ECONNRESET);
+ return err;
+ }
+
+ /* Update credits whenever an SDU is received */
+ l2cap_chan_le_send_credits(chan);
+
+ return err;
+}
+
+static int l2cap_ecred_data_rcv(struct l2cap_chan *chan, struct sk_buff *skb)
+{
+ int err;
+
+ if (!chan->rx_credits) {
+ BT_ERR("No credits to receive LE L2CAP data");
+ l2cap_send_disconn_req(chan, ECONNRESET);
+ return -ENOBUFS;
+ }
+
+ if (chan->imtu < skb->len) {
+ BT_ERR("Too big LE L2CAP PDU");
+ return -ENOBUFS;
+ }
+
+ chan->rx_credits--;
+ BT_DBG("chan %p: rx_credits %u -> %u",
+ chan, chan->rx_credits + 1, chan->rx_credits);
+
+ /* Update if remote had run out of credits, this should only happens
+ * if the remote is not using the entire MPS.
+ */
+ if (!chan->rx_credits)
+ l2cap_chan_le_send_credits(chan);
+
+ err = 0;
+
+ if (!chan->sdu) {
+ u16 sdu_len;
+
+ sdu_len = get_unaligned_le16(skb->data);
+ skb_pull(skb, L2CAP_SDULEN_SIZE);
+
+ BT_DBG("Start of new SDU. sdu_len %u skb->len %u imtu %u",
+ sdu_len, skb->len, chan->imtu);
+
+ if (sdu_len > chan->imtu) {
+ BT_ERR("Too big LE L2CAP SDU length received");
+ err = -EMSGSIZE;
+ goto failed;
+ }
+
+ if (skb->len > sdu_len) {
+ BT_ERR("Too much LE L2CAP data received");
+ err = -EINVAL;
+ goto failed;
+ }
+
+ if (skb->len == sdu_len)
+ return l2cap_ecred_recv(chan, skb);
+
+ chan->sdu = skb;
+ chan->sdu_len = sdu_len;
+ chan->sdu_last_frag = skb;
+
+ /* Detect if remote is not able to use the selected MPS */
+ if (skb->len + L2CAP_SDULEN_SIZE < chan->mps) {
+ u16 mps_len = skb->len + L2CAP_SDULEN_SIZE;
+
+ /* Adjust the number of credits */
+ BT_DBG("chan->mps %u -> %u", chan->mps, mps_len);
+ chan->mps = mps_len;
+ l2cap_chan_le_send_credits(chan);
+ }
+
+ return 0;
+ }
+
+ BT_DBG("SDU fragment. chan->sdu->len %u skb->len %u chan->sdu_len %u",
+ chan->sdu->len, skb->len, chan->sdu_len);
+
+ if (chan->sdu->len + skb->len > chan->sdu_len) {
+ BT_ERR("Too much LE L2CAP data received");
+ err = -EINVAL;
+ goto failed;
+ }
+
+ append_skb_frag(chan->sdu, skb, &chan->sdu_last_frag);
+ skb = NULL;
+
+ if (chan->sdu->len == chan->sdu_len) {
+ err = l2cap_ecred_recv(chan, chan->sdu);
+ if (!err) {
+ chan->sdu = NULL;
+ chan->sdu_last_frag = NULL;
+ chan->sdu_len = 0;
+ }
+ }
+
+failed:
+ if (err) {
+ kfree_skb(skb);
+ kfree_skb(chan->sdu);
+ chan->sdu = NULL;
+ chan->sdu_last_frag = NULL;
+ chan->sdu_len = 0;
+ }
+
+ /* We can't return an error here since we took care of the skb
+ * freeing internally. An error return would cause the caller to
+ * do a double-free of the skb.
+ */
+ return 0;
+}
+
+static void l2cap_data_channel(struct l2cap_conn *conn, u16 cid,
+ struct sk_buff *skb)
+{
+ struct l2cap_chan *chan;
+
+ chan = l2cap_get_chan_by_scid(conn, cid);
+ if (!chan) {
+ BT_DBG("unknown cid 0x%4.4x", cid);
+ /* Drop packet and return */
+ kfree_skb(skb);
+ return;
+ }
+
+ BT_DBG("chan %p, len %d", chan, skb->len);
+
+ /* If we receive data on a fixed channel before the info req/rsp
+ * procedure is done simply assume that the channel is supported
+ * and mark it as ready.
+ */
+ if (chan->chan_type == L2CAP_CHAN_FIXED)
+ l2cap_chan_ready(chan);
+
+ if (chan->state != BT_CONNECTED)
+ goto drop;
+
+ switch (chan->mode) {
+ case L2CAP_MODE_LE_FLOWCTL:
+ case L2CAP_MODE_EXT_FLOWCTL:
+ if (l2cap_ecred_data_rcv(chan, skb) < 0)
+ goto drop;
+
+ goto done;
+
+ case L2CAP_MODE_BASIC:
+ /* If socket recv buffers overflows we drop data here
+ * which is *bad* because L2CAP has to be reliable.
+ * But we don't have any other choice. L2CAP doesn't
+ * provide flow control mechanism. */
+
+ if (chan->imtu < skb->len) {
+ BT_ERR("Dropping L2CAP data: receive buffer overflow");
+ goto drop;
+ }
+
+ if (!chan->ops->recv(chan, skb))
+ goto done;
+ break;
+
+ case L2CAP_MODE_ERTM:
+ case L2CAP_MODE_STREAMING:
+ l2cap_data_rcv(chan, skb);
+ goto done;
+
+ default:
+ BT_DBG("chan %p: bad mode 0x%2.2x", chan, chan->mode);
+ break;
+ }
+
+drop:
+ kfree_skb(skb);
+
+done:
+ l2cap_chan_unlock(chan);
+ l2cap_chan_put(chan);
+}
+
+static void l2cap_conless_channel(struct l2cap_conn *conn, __le16 psm,
+ struct sk_buff *skb)
+{
+ struct hci_conn *hcon = conn->hcon;
+ struct l2cap_chan *chan;
+
+ if (hcon->type != ACL_LINK)
+ goto free_skb;
+
+ chan = l2cap_global_chan_by_psm(0, psm, &hcon->src, &hcon->dst,
+ ACL_LINK);
+ if (!chan)
+ goto free_skb;
+
+ BT_DBG("chan %p, len %d", chan, skb->len);
+
+ l2cap_chan_lock(chan);
+
+ if (chan->state != BT_BOUND && chan->state != BT_CONNECTED)
+ goto drop;
+
+ if (chan->imtu < skb->len)
+ goto drop;
+
+ /* Store remote BD_ADDR and PSM for msg_name */
+ bacpy(&bt_cb(skb)->l2cap.bdaddr, &hcon->dst);
+ bt_cb(skb)->l2cap.psm = psm;
+
+ if (!chan->ops->recv(chan, skb)) {
+ l2cap_chan_unlock(chan);
+ l2cap_chan_put(chan);
+ return;
+ }
+
+drop:
+ l2cap_chan_unlock(chan);
+ l2cap_chan_put(chan);
+free_skb:
+ kfree_skb(skb);
+}
+
+static void l2cap_recv_frame(struct l2cap_conn *conn, struct sk_buff *skb)
+{
+ struct l2cap_hdr *lh = (void *) skb->data;
+ struct hci_conn *hcon = conn->hcon;
+ u16 cid, len;
+ __le16 psm;
+
+ if (hcon->state != BT_CONNECTED) {
+ BT_DBG("queueing pending rx skb");
+ skb_queue_tail(&conn->pending_rx, skb);
+ return;
+ }
+
+ skb_pull(skb, L2CAP_HDR_SIZE);
+ cid = __le16_to_cpu(lh->cid);
+ len = __le16_to_cpu(lh->len);
+
+ if (len != skb->len) {
+ kfree_skb(skb);
+ return;
+ }
+
+ /* Since we can't actively block incoming LE connections we must
+ * at least ensure that we ignore incoming data from them.
+ */
+ if (hcon->type == LE_LINK &&
+ hci_bdaddr_list_lookup(&hcon->hdev->reject_list, &hcon->dst,
+ bdaddr_dst_type(hcon))) {
+ kfree_skb(skb);
+ return;
+ }
+
+ BT_DBG("len %d, cid 0x%4.4x", len, cid);
+
+ switch (cid) {
+ case L2CAP_CID_SIGNALING:
+ l2cap_sig_channel(conn, skb);
+ break;
+
+ case L2CAP_CID_CONN_LESS:
+ psm = get_unaligned((__le16 *) skb->data);
+ skb_pull(skb, L2CAP_PSMLEN_SIZE);
+ l2cap_conless_channel(conn, psm, skb);
+ break;
+
+ case L2CAP_CID_LE_SIGNALING:
+ l2cap_le_sig_channel(conn, skb);
+ break;
+
+ default:
+ l2cap_data_channel(conn, cid, skb);
+ break;
+ }
+}
+
+static void process_pending_rx(struct work_struct *work)
+{
+ struct l2cap_conn *conn = container_of(work, struct l2cap_conn,
+ pending_rx_work);
+ struct sk_buff *skb;
+
+ BT_DBG("");
+
+ mutex_lock(&conn->lock);
+
+ while ((skb = skb_dequeue(&conn->pending_rx)))
+ l2cap_recv_frame(conn, skb);
+
+ mutex_unlock(&conn->lock);
+}
+
+static struct l2cap_conn *l2cap_conn_add(struct hci_conn *hcon)
+{
+ struct l2cap_conn *conn = hcon->l2cap_data;
+ struct hci_chan *hchan;
+
+ if (conn)
+ return conn;
+
+ hchan = hci_chan_create(hcon);
+ if (!hchan)
+ return NULL;
+
+ conn = kzalloc(sizeof(*conn), GFP_KERNEL);
+ if (!conn) {
+ hci_chan_del(hchan);
+ return NULL;
+ }
+
+ kref_init(&conn->ref);
+ hcon->l2cap_data = conn;
+ conn->hcon = hci_conn_get(hcon);
+ conn->hchan = hchan;
+
+ BT_DBG("hcon %p conn %p hchan %p", hcon, conn, hchan);
+
+ conn->mtu = hcon->mtu;
+ conn->feat_mask = 0;
+
+ conn->local_fixed_chan = L2CAP_FC_SIG_BREDR | L2CAP_FC_CONNLESS;
+
+ if (hci_dev_test_flag(hcon->hdev, HCI_LE_ENABLED) &&
+ (bredr_sc_enabled(hcon->hdev) ||
+ hci_dev_test_flag(hcon->hdev, HCI_FORCE_BREDR_SMP)))
+ conn->local_fixed_chan |= L2CAP_FC_SMP_BREDR;
+
+ mutex_init(&conn->ident_lock);
+ mutex_init(&conn->lock);
+
+ INIT_LIST_HEAD(&conn->chan_l);
+ INIT_LIST_HEAD(&conn->users);
+
+ INIT_DELAYED_WORK(&conn->info_timer, l2cap_info_timeout);
+
+ skb_queue_head_init(&conn->pending_rx);
+ INIT_WORK(&conn->pending_rx_work, process_pending_rx);
+ INIT_DELAYED_WORK(&conn->id_addr_timer, l2cap_conn_update_id_addr);
+
+ conn->disc_reason = HCI_ERROR_REMOTE_USER_TERM;
+
+ return conn;
+}
+
+static bool is_valid_psm(u16 psm, u8 dst_type)
+{
+ if (!psm)
+ return false;
+
+ if (bdaddr_type_is_le(dst_type))
+ return (psm <= 0x00ff);
+
+ /* PSM must be odd and lsb of upper byte must be 0 */
+ return ((psm & 0x0101) == 0x0001);
+}
+
+struct l2cap_chan_data {
+ struct l2cap_chan *chan;
+ struct pid *pid;
+ int count;
+};
+
+static void l2cap_chan_by_pid(struct l2cap_chan *chan, void *data)
+{
+ struct l2cap_chan_data *d = data;
+ struct pid *pid;
+
+ if (chan == d->chan)
+ return;
+
+ if (!test_bit(FLAG_DEFER_SETUP, &chan->flags))
+ return;
+
+ pid = chan->ops->get_peer_pid(chan);
+
+ /* Only count deferred channels with the same PID/PSM */
+ if (d->pid != pid || chan->psm != d->chan->psm || chan->ident ||
+ chan->mode != L2CAP_MODE_EXT_FLOWCTL || chan->state != BT_CONNECT)
+ return;
+
+ d->count++;
+}
+
+int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid,
+ bdaddr_t *dst, u8 dst_type, u16 timeout)
+{
+ struct l2cap_conn *conn;
+ struct hci_conn *hcon;
+ struct hci_dev *hdev;
+ int err;
+
+ BT_DBG("%pMR -> %pMR (type %u) psm 0x%4.4x mode 0x%2.2x", &chan->src,
+ dst, dst_type, __le16_to_cpu(psm), chan->mode);
+
+ hdev = hci_get_route(dst, &chan->src, chan->src_type);
+ if (!hdev)
+ return -EHOSTUNREACH;
+
+ hci_dev_lock(hdev);
+
+ if (!is_valid_psm(__le16_to_cpu(psm), dst_type) && !cid &&
+ chan->chan_type != L2CAP_CHAN_RAW) {
+ err = -EINVAL;
+ goto done;
+ }
+
+ if (chan->chan_type == L2CAP_CHAN_CONN_ORIENTED && !psm) {
+ err = -EINVAL;
+ goto done;
+ }
+
+ if (chan->chan_type == L2CAP_CHAN_FIXED && !cid) {
+ err = -EINVAL;
+ goto done;
+ }
+
+ switch (chan->mode) {
+ case L2CAP_MODE_BASIC:
+ break;
+ case L2CAP_MODE_LE_FLOWCTL:
+ break;
+ case L2CAP_MODE_EXT_FLOWCTL:
+ if (!enable_ecred) {
+ err = -EOPNOTSUPP;
+ goto done;
+ }
+ break;
+ case L2CAP_MODE_ERTM:
+ case L2CAP_MODE_STREAMING:
+ if (!disable_ertm)
+ break;
+ fallthrough;
+ default:
+ err = -EOPNOTSUPP;
+ goto done;
+ }
+
+ switch (chan->state) {
+ case BT_CONNECT:
+ case BT_CONNECT2:
+ case BT_CONFIG:
+ /* Already connecting */
+ err = 0;
+ goto done;
+
+ case BT_CONNECTED:
+ /* Already connected */
+ err = -EISCONN;
+ goto done;
+
+ case BT_OPEN:
+ case BT_BOUND:
+ /* Can connect */
+ break;
+
+ default:
+ err = -EBADFD;
+ goto done;
+ }
+
+ /* Set destination address and psm */
+ bacpy(&chan->dst, dst);
+ chan->dst_type = dst_type;
+
+ chan->psm = psm;
+ chan->dcid = cid;
+
+ if (bdaddr_type_is_le(dst_type)) {
+ /* Convert from L2CAP channel address type to HCI address type
+ */
+ if (dst_type == BDADDR_LE_PUBLIC)
+ dst_type = ADDR_LE_DEV_PUBLIC;
+ else
+ dst_type = ADDR_LE_DEV_RANDOM;
+
+ if (hci_dev_test_flag(hdev, HCI_ADVERTISING))
+ hcon = hci_connect_le(hdev, dst, dst_type, false,
+ chan->sec_level, timeout,
+ HCI_ROLE_SLAVE, 0, 0);
+ else
+ hcon = hci_connect_le_scan(hdev, dst, dst_type,
+ chan->sec_level, timeout,
+ CONN_REASON_L2CAP_CHAN);
+
+ } else {
+ u8 auth_type = l2cap_get_auth_type(chan);
+ hcon = hci_connect_acl(hdev, dst, chan->sec_level, auth_type,
+ CONN_REASON_L2CAP_CHAN, timeout);
+ }
+
+ if (IS_ERR(hcon)) {
+ err = PTR_ERR(hcon);
+ goto done;
+ }
+
+ conn = l2cap_conn_add(hcon);
+ if (!conn) {
+ hci_conn_drop(hcon);
+ err = -ENOMEM;
+ goto done;
+ }
+
+ if (chan->mode == L2CAP_MODE_EXT_FLOWCTL) {
+ struct l2cap_chan_data data;
+
+ data.chan = chan;
+ data.pid = chan->ops->get_peer_pid(chan);
+ data.count = 1;
+
+ l2cap_chan_list(conn, l2cap_chan_by_pid, &data);
+
+ /* Check if there isn't too many channels being connected */
+ if (data.count > L2CAP_ECRED_CONN_SCID_MAX) {
+ hci_conn_drop(hcon);
+ err = -EPROTO;
+ goto done;
+ }
+ }
+
+ mutex_lock(&conn->lock);
+ l2cap_chan_lock(chan);
+
+ if (cid && __l2cap_get_chan_by_dcid(conn, cid)) {
+ hci_conn_drop(hcon);
+ err = -EBUSY;
+ goto chan_unlock;
+ }
+
+ /* Update source addr of the socket */
+ bacpy(&chan->src, &hcon->src);
+ chan->src_type = bdaddr_src_type(hcon);
+
+ __l2cap_chan_add(conn, chan);
+
+ /* l2cap_chan_add takes its own ref so we can drop this one */
+ hci_conn_drop(hcon);
+
+ l2cap_state_change(chan, BT_CONNECT);
+ __set_chan_timer(chan, chan->ops->get_sndtimeo(chan));
+
+ /* Release chan->sport so that it can be reused by other
+ * sockets (as it's only used for listening sockets).
+ */
+ write_lock(&chan_list_lock);
+ chan->sport = 0;
+ write_unlock(&chan_list_lock);
+
+ if (hcon->state == BT_CONNECTED) {
+ if (chan->chan_type != L2CAP_CHAN_CONN_ORIENTED) {
+ __clear_chan_timer(chan);
+ if (l2cap_chan_check_security(chan, true))
+ l2cap_state_change(chan, BT_CONNECTED);
+ } else
+ l2cap_do_start(chan);
+ }
+
+ err = 0;
+
+chan_unlock:
+ l2cap_chan_unlock(chan);
+ mutex_unlock(&conn->lock);
+done:
+ hci_dev_unlock(hdev);
+ hci_dev_put(hdev);
+ return err;
+}
+EXPORT_SYMBOL_GPL(l2cap_chan_connect);
+
+static void l2cap_ecred_reconfigure(struct l2cap_chan *chan)
+{
+ struct l2cap_conn *conn = chan->conn;
+ DEFINE_RAW_FLEX(struct l2cap_ecred_reconf_req, pdu, scid, 1);
+
+ pdu->mtu = cpu_to_le16(chan->imtu);
+ pdu->mps = cpu_to_le16(chan->mps);
+ pdu->scid[0] = cpu_to_le16(chan->scid);
+
+ chan->ident = l2cap_get_ident(conn);
+
+ l2cap_send_cmd(conn, chan->ident, L2CAP_ECRED_RECONF_REQ,
+ sizeof(pdu), &pdu);
+}
+
+int l2cap_chan_reconfigure(struct l2cap_chan *chan, __u16 mtu)
+{
+ if (chan->imtu > mtu)
+ return -EINVAL;
+
+ BT_DBG("chan %p mtu 0x%4.4x", chan, mtu);
+
+ chan->imtu = mtu;
+
+ l2cap_ecred_reconfigure(chan);
+
+ return 0;
+}
+
+/* ---- L2CAP interface with lower layer (HCI) ---- */
+
+int l2cap_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr)
+{
+ int exact = 0, lm1 = 0, lm2 = 0;
+ struct l2cap_chan *c;
+
+ BT_DBG("hdev %s, bdaddr %pMR", hdev->name, bdaddr);
+
+ /* Find listening sockets and check their link_mode */
+ read_lock(&chan_list_lock);
+ list_for_each_entry(c, &chan_list, global_l) {
+ if (c->state != BT_LISTEN)
+ continue;
+
+ if (!bacmp(&c->src, &hdev->bdaddr)) {
+ lm1 |= HCI_LM_ACCEPT;
+ if (test_bit(FLAG_ROLE_SWITCH, &c->flags))
+ lm1 |= HCI_LM_MASTER;
+ exact++;
+ } else if (!bacmp(&c->src, BDADDR_ANY)) {
+ lm2 |= HCI_LM_ACCEPT;
+ if (test_bit(FLAG_ROLE_SWITCH, &c->flags))
+ lm2 |= HCI_LM_MASTER;
+ }
+ }
+ read_unlock(&chan_list_lock);
+
+ return exact ? lm1 : lm2;
+}
+
+/* Find the next fixed channel in BT_LISTEN state, continue iteration
+ * from an existing channel in the list or from the beginning of the
+ * global list (by passing NULL as first parameter).
+ */
+static struct l2cap_chan *l2cap_global_fixed_chan(struct l2cap_chan *c,
+ struct hci_conn *hcon)
+{
+ u8 src_type = bdaddr_src_type(hcon);
+
+ read_lock(&chan_list_lock);
+
+ if (c)
+ c = list_next_entry(c, global_l);
+ else
+ c = list_entry(chan_list.next, typeof(*c), global_l);
+
+ list_for_each_entry_from(c, &chan_list, global_l) {
+ if (c->chan_type != L2CAP_CHAN_FIXED)
+ continue;
+ if (c->state != BT_LISTEN)
+ continue;
+ if (bacmp(&c->src, &hcon->src) && bacmp(&c->src, BDADDR_ANY))
+ continue;
+ if (src_type != c->src_type)
+ continue;
+
+ c = l2cap_chan_hold_unless_zero(c);
+ read_unlock(&chan_list_lock);
+ return c;
+ }
+
+ read_unlock(&chan_list_lock);
+
+ return NULL;
+}
+
+static void l2cap_connect_cfm(struct hci_conn *hcon, u8 status)
+{
+ struct hci_dev *hdev = hcon->hdev;
+ struct l2cap_conn *conn;
+ struct l2cap_chan *pchan;
+ u8 dst_type;
+
+ if (hcon->type != ACL_LINK && hcon->type != LE_LINK)
+ return;
+
+ BT_DBG("hcon %p bdaddr %pMR status %d", hcon, &hcon->dst, status);
+
+ if (status) {
+ l2cap_conn_del(hcon, bt_to_errno(status));
+ return;
+ }
+
+ conn = l2cap_conn_add(hcon);
+ if (!conn)
+ return;
+
+ dst_type = bdaddr_dst_type(hcon);
+
+ /* If device is blocked, do not create channels for it */
+ if (hci_bdaddr_list_lookup(&hdev->reject_list, &hcon->dst, dst_type))
+ return;
+
+ /* Find fixed channels and notify them of the new connection. We
+ * use multiple individual lookups, continuing each time where
+ * we left off, because the list lock would prevent calling the
+ * potentially sleeping l2cap_chan_lock() function.
+ */
+ pchan = l2cap_global_fixed_chan(NULL, hcon);
+ while (pchan) {
+ struct l2cap_chan *chan, *next;
+
+ /* Client fixed channels should override server ones */
+ if (__l2cap_get_chan_by_dcid(conn, pchan->scid))
+ goto next;
+
+ l2cap_chan_lock(pchan);
+ chan = pchan->ops->new_connection(pchan);
+ if (chan) {
+ bacpy(&chan->src, &hcon->src);
+ bacpy(&chan->dst, &hcon->dst);
+ chan->src_type = bdaddr_src_type(hcon);
+ chan->dst_type = dst_type;
+
+ __l2cap_chan_add(conn, chan);
+ }
+
+ l2cap_chan_unlock(pchan);
+next:
+ next = l2cap_global_fixed_chan(pchan, hcon);
+ l2cap_chan_put(pchan);
+ pchan = next;
+ }
+
+ l2cap_conn_ready(conn);
+}
+
+int l2cap_disconn_ind(struct hci_conn *hcon)
+{
+ struct l2cap_conn *conn = hcon->l2cap_data;
+
+ BT_DBG("hcon %p", hcon);
+
+ if (!conn)
+ return HCI_ERROR_REMOTE_USER_TERM;
+ return conn->disc_reason;
+}
+
+static void l2cap_disconn_cfm(struct hci_conn *hcon, u8 reason)
+{
+ if (hcon->type != ACL_LINK && hcon->type != LE_LINK)
+ return;
+
+ BT_DBG("hcon %p reason %d", hcon, reason);
+
+ l2cap_conn_del(hcon, bt_to_errno(reason));
+}
+
+static inline void l2cap_check_encryption(struct l2cap_chan *chan, u8 encrypt)
+{
+ if (chan->chan_type != L2CAP_CHAN_CONN_ORIENTED)
+ return;
+
+ if (encrypt == 0x00) {
+ if (chan->sec_level == BT_SECURITY_MEDIUM) {
+ __set_chan_timer(chan, L2CAP_ENC_TIMEOUT);
+ } else if (chan->sec_level == BT_SECURITY_HIGH ||
+ chan->sec_level == BT_SECURITY_FIPS)
+ l2cap_chan_close(chan, ECONNREFUSED);
+ } else {
+ if (chan->sec_level == BT_SECURITY_MEDIUM)
+ __clear_chan_timer(chan);
+ }
+}
+
+static void l2cap_security_cfm(struct hci_conn *hcon, u8 status, u8 encrypt)
+{
+ struct l2cap_conn *conn = hcon->l2cap_data;
+ struct l2cap_chan *chan;
+
+ if (!conn)
+ return;
+
+ BT_DBG("conn %p status 0x%2.2x encrypt %u", conn, status, encrypt);
+
+ mutex_lock(&conn->lock);
+
+ list_for_each_entry(chan, &conn->chan_l, list) {
+ l2cap_chan_lock(chan);
+
+ BT_DBG("chan %p scid 0x%4.4x state %s", chan, chan->scid,
+ state_to_string(chan->state));
+
+ if (!status && encrypt)
+ chan->sec_level = hcon->sec_level;
+
+ if (!__l2cap_no_conn_pending(chan)) {
+ l2cap_chan_unlock(chan);
+ continue;
+ }
+
+ if (!status && (chan->state == BT_CONNECTED ||
+ chan->state == BT_CONFIG)) {
+ chan->ops->resume(chan);
+ l2cap_check_encryption(chan, encrypt);
+ l2cap_chan_unlock(chan);
+ continue;
+ }
+
+ if (chan->state == BT_CONNECT) {
+ if (!status && l2cap_check_enc_key_size(hcon, chan))
+ l2cap_start_connection(chan);
+ else
+ __set_chan_timer(chan, L2CAP_DISC_TIMEOUT);
+ } else if (chan->state == BT_CONNECT2 &&
+ !(chan->mode == L2CAP_MODE_EXT_FLOWCTL ||
+ chan->mode == L2CAP_MODE_LE_FLOWCTL)) {
+ struct l2cap_conn_rsp rsp;
+ __u16 res, stat;
+
+ if (!status && l2cap_check_enc_key_size(hcon, chan)) {
+ if (test_bit(FLAG_DEFER_SETUP, &chan->flags)) {
+ res = L2CAP_CR_PEND;
+ stat = L2CAP_CS_AUTHOR_PEND;
+ chan->ops->defer(chan);
+ } else {
+ l2cap_state_change(chan, BT_CONFIG);
+ res = L2CAP_CR_SUCCESS;
+ stat = L2CAP_CS_NO_INFO;
+ }
+ } else {
+ l2cap_state_change(chan, BT_DISCONN);
+ __set_chan_timer(chan, L2CAP_DISC_TIMEOUT);
+ res = L2CAP_CR_SEC_BLOCK;
+ stat = L2CAP_CS_NO_INFO;
+ }
+
+ rsp.scid = cpu_to_le16(chan->dcid);
+ rsp.dcid = cpu_to_le16(chan->scid);
+ rsp.result = cpu_to_le16(res);
+ rsp.status = cpu_to_le16(stat);
+ l2cap_send_cmd(conn, chan->ident, L2CAP_CONN_RSP,
+ sizeof(rsp), &rsp);
+
+ if (!test_bit(CONF_REQ_SENT, &chan->conf_state) &&
+ res == L2CAP_CR_SUCCESS) {
+ char buf[128];
+ set_bit(CONF_REQ_SENT, &chan->conf_state);
+ l2cap_send_cmd(conn, l2cap_get_ident(conn),
+ L2CAP_CONF_REQ,
+ l2cap_build_conf_req(chan, buf, sizeof(buf)),
+ buf);
+ chan->num_conf_req++;
+ }
+ }
+
+ l2cap_chan_unlock(chan);
+ }
+
+ mutex_unlock(&conn->lock);
+}
+
+/* Append fragment into frame respecting the maximum len of rx_skb */
+static int l2cap_recv_frag(struct l2cap_conn *conn, struct sk_buff *skb,
+ u16 len)
+{
+ if (!conn->rx_skb) {
+ /* Allocate skb for the complete frame (with header) */
+ conn->rx_skb = bt_skb_alloc(len, GFP_KERNEL);
+ if (!conn->rx_skb)
+ return -ENOMEM;
+ /* Init rx_len */
+ conn->rx_len = len;
+
+ skb_set_delivery_time(conn->rx_skb, skb->tstamp,
+ skb->tstamp_type);
+ }
+
+ /* Copy as much as the rx_skb can hold */
+ len = min_t(u16, len, skb->len);
+ skb_copy_from_linear_data(skb, skb_put(conn->rx_skb, len), len);
+ skb_pull(skb, len);
+ conn->rx_len -= len;
+
+ return len;
+}
+
+static int l2cap_recv_len(struct l2cap_conn *conn, struct sk_buff *skb)
+{
+ struct sk_buff *rx_skb;
+ int len;
+
+ /* Append just enough to complete the header */
+ len = l2cap_recv_frag(conn, skb, L2CAP_LEN_SIZE - conn->rx_skb->len);
+
+ /* If header could not be read just continue */
+ if (len < 0 || conn->rx_skb->len < L2CAP_LEN_SIZE)
+ return len;
+
+ rx_skb = conn->rx_skb;
+ len = get_unaligned_le16(rx_skb->data);
+
+ /* Check if rx_skb has enough space to received all fragments */
+ if (len + (L2CAP_HDR_SIZE - L2CAP_LEN_SIZE) <= skb_tailroom(rx_skb)) {
+ /* Update expected len */
+ conn->rx_len = len + (L2CAP_HDR_SIZE - L2CAP_LEN_SIZE);
+ return L2CAP_LEN_SIZE;
+ }
+
+ /* Reset conn->rx_skb since it will need to be reallocated in order to
+ * fit all fragments.
+ */
+ conn->rx_skb = NULL;
+
+ /* Reallocates rx_skb using the exact expected length */
+ len = l2cap_recv_frag(conn, rx_skb,
+ len + (L2CAP_HDR_SIZE - L2CAP_LEN_SIZE));
+ kfree_skb(rx_skb);
+
+ return len;
+}
+
+static void l2cap_recv_reset(struct l2cap_conn *conn)
+{
+ kfree_skb(conn->rx_skb);
+ conn->rx_skb = NULL;
+ conn->rx_len = 0;
+}
+
+struct l2cap_conn *l2cap_conn_hold_unless_zero(struct l2cap_conn *c)
+{
+ if (!c)
+ return NULL;
+
+ BT_DBG("conn %p orig refcnt %u", c, kref_read(&c->ref));
+
+ if (!kref_get_unless_zero(&c->ref))
+ return NULL;
+
+ return c;
+}
+
+int l2cap_recv_acldata(struct hci_dev *hdev, u16 handle,
+ struct sk_buff *skb, u16 flags)
+{
+ struct hci_conn *hcon;
+ struct l2cap_conn *conn;
+ int len;
+
+ /* Lock hdev for hci_conn, and race on l2cap_data vs. l2cap_conn_del */
+ hci_dev_lock(hdev);
+
+ hcon = hci_conn_hash_lookup_handle(hdev, handle);
+ if (!hcon) {
+ hci_dev_unlock(hdev);
+ kfree_skb(skb);
+ return -ENOENT;
+ }
+
+ hci_conn_enter_active_mode(hcon, BT_POWER_FORCE_ACTIVE_OFF);
+
+ conn = hcon->l2cap_data;
+
+ if (!conn)
+ conn = l2cap_conn_add(hcon);
+
+ conn = l2cap_conn_hold_unless_zero(conn);
+ hcon = NULL;
+
+ hci_dev_unlock(hdev);
+
+ if (!conn) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
+ BT_DBG("conn %p len %u flags 0x%x", conn, skb->len, flags);
+
+ mutex_lock(&conn->lock);
+
+ switch (flags) {
+ case ACL_START:
+ case ACL_START_NO_FLUSH:
+ case ACL_COMPLETE:
+ if (conn->rx_skb) {
+ BT_ERR("Unexpected start frame (len %d)", skb->len);
+ l2cap_recv_reset(conn);
+ l2cap_conn_unreliable(conn, ECOMM);
+ }
+
+ /* Start fragment may not contain the L2CAP length so just
+ * copy the initial byte when that happens and use conn->mtu as
+ * expected length.
+ */
+ if (skb->len < L2CAP_LEN_SIZE) {
+ l2cap_recv_frag(conn, skb, conn->mtu);
+ break;
+ }
+
+ len = get_unaligned_le16(skb->data) + L2CAP_HDR_SIZE;
+
+ if (len == skb->len) {
+ /* Complete frame received */
+ l2cap_recv_frame(conn, skb);
+ goto unlock;
+ }
+
+ BT_DBG("Start: total len %d, frag len %u", len, skb->len);
+
+ if (skb->len > len) {
+ BT_ERR("Frame is too long (len %u, expected len %d)",
+ skb->len, len);
+ /* PTS test cases L2CAP/COS/CED/BI-14-C and BI-15-C
+ * (Multiple Signaling Command in one PDU, Data
+ * Truncated, BR/EDR) send a C-frame to the IUT with
+ * PDU Length set to 8 and Channel ID set to the
+ * correct signaling channel for the logical link.
+ * The Information payload contains one L2CAP_ECHO_REQ
+ * packet with Data Length set to 0 with 0 octets of
+ * echo data and one invalid command packet due to
+ * data truncated in PDU but present in HCI packet.
+ *
+ * Shorter the socket buffer to the PDU length to
+ * allow to process valid commands from the PDU before
+ * setting the socket unreliable.
+ */
+ skb->len = len;
+ l2cap_recv_frame(conn, skb);
+ l2cap_conn_unreliable(conn, ECOMM);
+ goto unlock;
+ }
+
+ /* Append fragment into frame (with header) */
+ if (l2cap_recv_frag(conn, skb, len) < 0)
+ goto drop;
+
+ break;
+
+ case ACL_CONT:
+ BT_DBG("Cont: frag len %u (expecting %u)", skb->len, conn->rx_len);
+
+ if (!conn->rx_skb) {
+ BT_ERR("Unexpected continuation frame (len %d)", skb->len);
+ l2cap_conn_unreliable(conn, ECOMM);
+ goto drop;
+ }
+
+ /* Complete the L2CAP length if it has not been read */
+ if (conn->rx_skb->len < L2CAP_LEN_SIZE) {
+ if (l2cap_recv_len(conn, skb) < 0) {
+ l2cap_conn_unreliable(conn, ECOMM);
+ goto drop;
+ }
+
+ /* Header still could not be read just continue */
+ if (conn->rx_skb->len < L2CAP_LEN_SIZE)
+ break;
+ }
+
+ if (skb->len > conn->rx_len) {
+ BT_ERR("Fragment is too long (len %u, expected %u)",
+ skb->len, conn->rx_len);
+ l2cap_recv_reset(conn);
+ l2cap_conn_unreliable(conn, ECOMM);
+ goto drop;
+ }
+
+ /* Append fragment into frame (with header) */
+ l2cap_recv_frag(conn, skb, skb->len);
+
+ if (!conn->rx_len) {
+ /* Complete frame received. l2cap_recv_frame
+ * takes ownership of the skb so set the global
+ * rx_skb pointer to NULL first.
+ */
+ struct sk_buff *rx_skb = conn->rx_skb;
+ conn->rx_skb = NULL;
+ l2cap_recv_frame(conn, rx_skb);
+ }
+ break;
+ }
+
+drop:
+ kfree_skb(skb);
+unlock:
+ mutex_unlock(&conn->lock);
+ l2cap_conn_put(conn);
+ return 0;
+}
+
+static struct hci_cb l2cap_cb = {
+ .name = "L2CAP",
+ .connect_cfm = l2cap_connect_cfm,
+ .disconn_cfm = l2cap_disconn_cfm,
+ .security_cfm = l2cap_security_cfm,
+};
+
+static int l2cap_debugfs_show(struct seq_file *f, void *p)
+{
+ struct l2cap_chan *c;
+
+ read_lock(&chan_list_lock);
+
+ list_for_each_entry(c, &chan_list, global_l) {
+ seq_printf(f, "%pMR (%u) %pMR (%u) %d %d 0x%4.4x 0x%4.4x %d %d %d %d\n",
+ &c->src, c->src_type, &c->dst, c->dst_type,
+ c->state, __le16_to_cpu(c->psm),
+ c->scid, c->dcid, c->imtu, c->omtu,
+ c->sec_level, c->mode);
+ }
+
+ read_unlock(&chan_list_lock);
+
+ return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(l2cap_debugfs);
+
+static struct dentry *l2cap_debugfs;
+
+int __init l2cap_init(void)
+{
+ int err;
+
+ err = l2cap_init_sockets();
+ if (err < 0)
+ return err;
+
+ hci_register_cb(&l2cap_cb);
+
+ if (IS_ERR_OR_NULL(bt_debugfs))
+ return 0;
+
+ l2cap_debugfs = debugfs_create_file("l2cap", 0444, bt_debugfs,
+ NULL, &l2cap_debugfs_fops);
+
+ return 0;
+}
+
+void l2cap_exit(void)
+{
+ debugfs_remove(l2cap_debugfs);
+ hci_unregister_cb(&l2cap_cb);
+ l2cap_cleanup_sockets();
+}
+
+module_param(disable_ertm, bool, 0644);
+MODULE_PARM_DESC(disable_ertm, "Disable enhanced retransmission mode");
+
+module_param(enable_ecred, bool, 0644);
+MODULE_PARM_DESC(enable_ecred, "Enable enhanced credit flow control mode");
diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c
new file mode 100644
index 000000000000..9ee189c815d4
--- /dev/null
+++ b/net/bluetooth/l2cap_sock.c
@@ -0,0 +1,2013 @@
+/*
+ BlueZ - Bluetooth protocol stack for Linux
+ Copyright (C) 2000-2001 Qualcomm Incorporated
+ Copyright (C) 2009-2010 Gustavo F. Padovan <gustavo@padovan.org>
+ Copyright (C) 2010 Google Inc.
+ Copyright (C) 2011 ProFUSION Embedded Systems
+
+ Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License version 2 as
+ published by the Free Software Foundation;
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+ IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+ CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+ ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+ COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+ SOFTWARE IS DISCLAIMED.
+*/
+
+/* Bluetooth L2CAP sockets. */
+
+#include <linux/module.h>
+#include <linux/export.h>
+#include <linux/filter.h>
+#include <linux/sched/signal.h>
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+#include <net/bluetooth/l2cap.h>
+
+#include "smp.h"
+
+static struct bt_sock_list l2cap_sk_list = {
+ .lock = __RW_LOCK_UNLOCKED(l2cap_sk_list.lock)
+};
+
+static const struct proto_ops l2cap_sock_ops;
+static void l2cap_sock_init(struct sock *sk, struct sock *parent);
+static struct sock *l2cap_sock_alloc(struct net *net, struct socket *sock,
+ int proto, gfp_t prio, int kern);
+static void l2cap_sock_cleanup_listen(struct sock *parent);
+
+bool l2cap_is_socket(struct socket *sock)
+{
+ return sock && sock->ops == &l2cap_sock_ops;
+}
+EXPORT_SYMBOL(l2cap_is_socket);
+
+static int l2cap_validate_bredr_psm(u16 psm)
+{
+ /* PSM must be odd and lsb of upper byte must be 0 */
+ if ((psm & 0x0101) != 0x0001)
+ return -EINVAL;
+
+ /* Restrict usage of well-known PSMs */
+ if (psm < L2CAP_PSM_DYN_START && !capable(CAP_NET_BIND_SERVICE))
+ return -EACCES;
+
+ return 0;
+}
+
+static int l2cap_validate_le_psm(u16 psm)
+{
+ /* Valid LE_PSM ranges are defined only until 0x00ff */
+ if (psm > L2CAP_PSM_LE_DYN_END)
+ return -EINVAL;
+
+ /* Restrict fixed, SIG assigned PSM values to CAP_NET_BIND_SERVICE */
+ if (psm < L2CAP_PSM_LE_DYN_START && !capable(CAP_NET_BIND_SERVICE))
+ return -EACCES;
+
+ return 0;
+}
+
+static int l2cap_sock_bind(struct socket *sock, struct sockaddr_unsized *addr, int alen)
+{
+ struct sock *sk = sock->sk;
+ struct l2cap_chan *chan = l2cap_pi(sk)->chan;
+ struct sockaddr_l2 la;
+ int len, err = 0;
+
+ BT_DBG("sk %p", sk);
+
+ if (!addr || alen < offsetofend(struct sockaddr, sa_family) ||
+ addr->sa_family != AF_BLUETOOTH)
+ return -EINVAL;
+
+ memset(&la, 0, sizeof(la));
+ len = min_t(unsigned int, sizeof(la), alen);
+ memcpy(&la, addr, len);
+
+ if (la.l2_cid && la.l2_psm)
+ return -EINVAL;
+
+ if (!bdaddr_type_is_valid(la.l2_bdaddr_type))
+ return -EINVAL;
+
+ if (bdaddr_type_is_le(la.l2_bdaddr_type)) {
+ /* We only allow ATT user space socket */
+ if (la.l2_cid &&
+ la.l2_cid != cpu_to_le16(L2CAP_CID_ATT))
+ return -EINVAL;
+ }
+
+ lock_sock(sk);
+
+ if (sk->sk_state != BT_OPEN) {
+ err = -EBADFD;
+ goto done;
+ }
+
+ if (la.l2_psm) {
+ __u16 psm = __le16_to_cpu(la.l2_psm);
+
+ if (la.l2_bdaddr_type == BDADDR_BREDR)
+ err = l2cap_validate_bredr_psm(psm);
+ else
+ err = l2cap_validate_le_psm(psm);
+
+ if (err)
+ goto done;
+ }
+
+ bacpy(&chan->src, &la.l2_bdaddr);
+ chan->src_type = la.l2_bdaddr_type;
+
+ if (la.l2_cid)
+ err = l2cap_add_scid(chan, __le16_to_cpu(la.l2_cid));
+ else
+ err = l2cap_add_psm(chan, &la.l2_bdaddr, la.l2_psm);
+
+ if (err < 0)
+ goto done;
+
+ switch (chan->chan_type) {
+ case L2CAP_CHAN_CONN_LESS:
+ if (__le16_to_cpu(la.l2_psm) == L2CAP_PSM_3DSP)
+ chan->sec_level = BT_SECURITY_SDP;
+ break;
+ case L2CAP_CHAN_CONN_ORIENTED:
+ if (__le16_to_cpu(la.l2_psm) == L2CAP_PSM_SDP ||
+ __le16_to_cpu(la.l2_psm) == L2CAP_PSM_RFCOMM)
+ chan->sec_level = BT_SECURITY_SDP;
+ break;
+ case L2CAP_CHAN_RAW:
+ chan->sec_level = BT_SECURITY_SDP;
+ break;
+ case L2CAP_CHAN_FIXED:
+ /* Fixed channels default to the L2CAP core not holding a
+ * hci_conn reference for them. For fixed channels mapping to
+ * L2CAP sockets we do want to hold a reference so set the
+ * appropriate flag to request it.
+ */
+ set_bit(FLAG_HOLD_HCI_CONN, &chan->flags);
+ break;
+ }
+
+ /* Use L2CAP_MODE_LE_FLOWCTL (CoC) in case of LE address and
+ * L2CAP_MODE_EXT_FLOWCTL (ECRED) has not been set.
+ */
+ if (chan->psm && bdaddr_type_is_le(chan->src_type) &&
+ chan->mode != L2CAP_MODE_EXT_FLOWCTL)
+ chan->mode = L2CAP_MODE_LE_FLOWCTL;
+
+ chan->state = BT_BOUND;
+ sk->sk_state = BT_BOUND;
+
+done:
+ release_sock(sk);
+ return err;
+}
+
+static int l2cap_sock_connect(struct socket *sock, struct sockaddr_unsized *addr,
+ int alen, int flags)
+{
+ struct sock *sk = sock->sk;
+ struct l2cap_chan *chan = l2cap_pi(sk)->chan;
+ struct sockaddr_l2 la;
+ int len, err = 0;
+ bool zapped;
+
+ BT_DBG("sk %p", sk);
+
+ lock_sock(sk);
+ zapped = sock_flag(sk, SOCK_ZAPPED);
+ release_sock(sk);
+
+ if (zapped)
+ return -EINVAL;
+
+ if (!addr || alen < offsetofend(struct sockaddr, sa_family) ||
+ addr->sa_family != AF_BLUETOOTH)
+ return -EINVAL;
+
+ memset(&la, 0, sizeof(la));
+ len = min_t(unsigned int, sizeof(la), alen);
+ memcpy(&la, addr, len);
+
+ if (la.l2_cid && la.l2_psm)
+ return -EINVAL;
+
+ if (!bdaddr_type_is_valid(la.l2_bdaddr_type))
+ return -EINVAL;
+
+ /* Check that the socket wasn't bound to something that
+ * conflicts with the address given to connect(). If chan->src
+ * is BDADDR_ANY it means bind() was never used, in which case
+ * chan->src_type and la.l2_bdaddr_type do not need to match.
+ */
+ if (chan->src_type == BDADDR_BREDR && bacmp(&chan->src, BDADDR_ANY) &&
+ bdaddr_type_is_le(la.l2_bdaddr_type)) {
+ /* Old user space versions will try to incorrectly bind
+ * the ATT socket using BDADDR_BREDR. We need to accept
+ * this and fix up the source address type only when
+ * both the source CID and destination CID indicate
+ * ATT. Anything else is an invalid combination.
+ */
+ if (chan->scid != L2CAP_CID_ATT ||
+ la.l2_cid != cpu_to_le16(L2CAP_CID_ATT))
+ return -EINVAL;
+
+ /* We don't have the hdev available here to make a
+ * better decision on random vs public, but since all
+ * user space versions that exhibit this issue anyway do
+ * not support random local addresses assuming public
+ * here is good enough.
+ */
+ chan->src_type = BDADDR_LE_PUBLIC;
+ }
+
+ if (chan->src_type != BDADDR_BREDR && la.l2_bdaddr_type == BDADDR_BREDR)
+ return -EINVAL;
+
+ if (bdaddr_type_is_le(la.l2_bdaddr_type)) {
+ /* We only allow ATT user space socket */
+ if (la.l2_cid &&
+ la.l2_cid != cpu_to_le16(L2CAP_CID_ATT))
+ return -EINVAL;
+ }
+
+ /* Use L2CAP_MODE_LE_FLOWCTL (CoC) in case of LE address and
+ * L2CAP_MODE_EXT_FLOWCTL (ECRED) has not been set.
+ */
+ if (chan->psm && bdaddr_type_is_le(chan->src_type) &&
+ chan->mode != L2CAP_MODE_EXT_FLOWCTL)
+ chan->mode = L2CAP_MODE_LE_FLOWCTL;
+
+ err = l2cap_chan_connect(chan, la.l2_psm, __le16_to_cpu(la.l2_cid),
+ &la.l2_bdaddr, la.l2_bdaddr_type,
+ READ_ONCE(sk->sk_sndtimeo));
+ if (err)
+ return err;
+
+ lock_sock(sk);
+
+ err = bt_sock_wait_state(sk, BT_CONNECTED,
+ sock_sndtimeo(sk, flags & O_NONBLOCK));
+
+ release_sock(sk);
+
+ return err;
+}
+
+static int l2cap_sock_listen(struct socket *sock, int backlog)
+{
+ struct sock *sk = sock->sk;
+ struct l2cap_chan *chan = l2cap_pi(sk)->chan;
+ int err = 0;
+
+ BT_DBG("sk %p backlog %d", sk, backlog);
+
+ lock_sock(sk);
+
+ if (sk->sk_state != BT_BOUND) {
+ err = -EBADFD;
+ goto done;
+ }
+
+ if (sk->sk_type != SOCK_SEQPACKET && sk->sk_type != SOCK_STREAM) {
+ err = -EINVAL;
+ goto done;
+ }
+
+ switch (chan->mode) {
+ case L2CAP_MODE_BASIC:
+ case L2CAP_MODE_LE_FLOWCTL:
+ break;
+ case L2CAP_MODE_EXT_FLOWCTL:
+ if (!enable_ecred) {
+ err = -EOPNOTSUPP;
+ goto done;
+ }
+ break;
+ case L2CAP_MODE_ERTM:
+ case L2CAP_MODE_STREAMING:
+ if (!disable_ertm)
+ break;
+ fallthrough;
+ default:
+ err = -EOPNOTSUPP;
+ goto done;
+ }
+
+ sk->sk_max_ack_backlog = backlog;
+ sk->sk_ack_backlog = 0;
+
+ /* Listening channels need to use nested locking in order not to
+ * cause lockdep warnings when the created child channels end up
+ * being locked in the same thread as the parent channel.
+ */
+ atomic_set(&chan->nesting, L2CAP_NESTING_PARENT);
+
+ chan->state = BT_LISTEN;
+ sk->sk_state = BT_LISTEN;
+
+done:
+ release_sock(sk);
+ return err;
+}
+
+static int l2cap_sock_accept(struct socket *sock, struct socket *newsock,
+ struct proto_accept_arg *arg)
+{
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+ struct sock *sk = sock->sk, *nsk;
+ long timeo;
+ int err = 0;
+
+ lock_sock_nested(sk, L2CAP_NESTING_PARENT);
+
+ timeo = sock_rcvtimeo(sk, arg->flags & O_NONBLOCK);
+
+ BT_DBG("sk %p timeo %ld", sk, timeo);
+
+ /* Wait for an incoming connection. (wake-one). */
+ add_wait_queue_exclusive(sk_sleep(sk), &wait);
+ while (1) {
+ if (sk->sk_state != BT_LISTEN) {
+ err = -EBADFD;
+ break;
+ }
+
+ nsk = bt_accept_dequeue(sk, newsock);
+ if (nsk)
+ break;
+
+ if (!timeo) {
+ err = -EAGAIN;
+ break;
+ }
+
+ if (signal_pending(current)) {
+ err = sock_intr_errno(timeo);
+ break;
+ }
+
+ release_sock(sk);
+
+ timeo = wait_woken(&wait, TASK_INTERRUPTIBLE, timeo);
+
+ lock_sock_nested(sk, L2CAP_NESTING_PARENT);
+ }
+ remove_wait_queue(sk_sleep(sk), &wait);
+
+ if (err)
+ goto done;
+
+ newsock->state = SS_CONNECTED;
+
+ BT_DBG("new socket %p", nsk);
+
+done:
+ release_sock(sk);
+ return err;
+}
+
+static int l2cap_sock_getname(struct socket *sock, struct sockaddr *addr,
+ int peer)
+{
+ struct sockaddr_l2 *la = (struct sockaddr_l2 *) addr;
+ struct sock *sk = sock->sk;
+ struct l2cap_chan *chan = l2cap_pi(sk)->chan;
+
+ BT_DBG("sock %p, sk %p", sock, sk);
+
+ if (peer && sk->sk_state != BT_CONNECTED &&
+ sk->sk_state != BT_CONNECT && sk->sk_state != BT_CONNECT2 &&
+ sk->sk_state != BT_CONFIG)
+ return -ENOTCONN;
+
+ memset(la, 0, sizeof(struct sockaddr_l2));
+ addr->sa_family = AF_BLUETOOTH;
+
+ la->l2_psm = chan->psm;
+
+ if (peer) {
+ bacpy(&la->l2_bdaddr, &chan->dst);
+ la->l2_cid = cpu_to_le16(chan->dcid);
+ la->l2_bdaddr_type = chan->dst_type;
+ } else {
+ bacpy(&la->l2_bdaddr, &chan->src);
+ la->l2_cid = cpu_to_le16(chan->scid);
+ la->l2_bdaddr_type = chan->src_type;
+ }
+
+ return sizeof(struct sockaddr_l2);
+}
+
+static int l2cap_get_mode(struct l2cap_chan *chan)
+{
+ switch (chan->mode) {
+ case L2CAP_MODE_BASIC:
+ return BT_MODE_BASIC;
+ case L2CAP_MODE_ERTM:
+ return BT_MODE_ERTM;
+ case L2CAP_MODE_STREAMING:
+ return BT_MODE_STREAMING;
+ case L2CAP_MODE_LE_FLOWCTL:
+ return BT_MODE_LE_FLOWCTL;
+ case L2CAP_MODE_EXT_FLOWCTL:
+ return BT_MODE_EXT_FLOWCTL;
+ }
+
+ return -EINVAL;
+}
+
+static int l2cap_sock_getsockopt_old(struct socket *sock, int optname,
+ char __user *optval, int __user *optlen)
+{
+ struct sock *sk = sock->sk;
+ struct l2cap_chan *chan = l2cap_pi(sk)->chan;
+ struct l2cap_options opts;
+ struct l2cap_conninfo cinfo;
+ int err = 0;
+ size_t len;
+ u32 opt;
+
+ BT_DBG("sk %p", sk);
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ lock_sock(sk);
+
+ switch (optname) {
+ case L2CAP_OPTIONS:
+ /* LE sockets should use BT_SNDMTU/BT_RCVMTU, but since
+ * legacy ATT code depends on getsockopt for
+ * L2CAP_OPTIONS we need to let this pass.
+ */
+ if (bdaddr_type_is_le(chan->src_type) &&
+ chan->scid != L2CAP_CID_ATT) {
+ err = -EINVAL;
+ break;
+ }
+
+ /* Only BR/EDR modes are supported here */
+ switch (chan->mode) {
+ case L2CAP_MODE_BASIC:
+ case L2CAP_MODE_ERTM:
+ case L2CAP_MODE_STREAMING:
+ break;
+ default:
+ err = -EINVAL;
+ break;
+ }
+
+ if (err < 0)
+ break;
+
+ memset(&opts, 0, sizeof(opts));
+ opts.imtu = chan->imtu;
+ opts.omtu = chan->omtu;
+ opts.flush_to = chan->flush_to;
+ opts.mode = chan->mode;
+ opts.fcs = chan->fcs;
+ opts.max_tx = chan->max_tx;
+ opts.txwin_size = chan->tx_win;
+
+ BT_DBG("mode 0x%2.2x", chan->mode);
+
+ len = min(len, sizeof(opts));
+ if (copy_to_user(optval, (char *) &opts, len))
+ err = -EFAULT;
+
+ break;
+
+ case L2CAP_LM:
+ switch (chan->sec_level) {
+ case BT_SECURITY_LOW:
+ opt = L2CAP_LM_AUTH;
+ break;
+ case BT_SECURITY_MEDIUM:
+ opt = L2CAP_LM_AUTH | L2CAP_LM_ENCRYPT;
+ break;
+ case BT_SECURITY_HIGH:
+ opt = L2CAP_LM_AUTH | L2CAP_LM_ENCRYPT |
+ L2CAP_LM_SECURE;
+ break;
+ case BT_SECURITY_FIPS:
+ opt = L2CAP_LM_AUTH | L2CAP_LM_ENCRYPT |
+ L2CAP_LM_SECURE | L2CAP_LM_FIPS;
+ break;
+ default:
+ opt = 0;
+ break;
+ }
+
+ if (test_bit(FLAG_ROLE_SWITCH, &chan->flags))
+ opt |= L2CAP_LM_MASTER;
+
+ if (test_bit(FLAG_FORCE_RELIABLE, &chan->flags))
+ opt |= L2CAP_LM_RELIABLE;
+
+ if (put_user(opt, (u32 __user *) optval))
+ err = -EFAULT;
+
+ break;
+
+ case L2CAP_CONNINFO:
+ if (sk->sk_state != BT_CONNECTED &&
+ !(sk->sk_state == BT_CONNECT2 &&
+ test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags))) {
+ err = -ENOTCONN;
+ break;
+ }
+
+ memset(&cinfo, 0, sizeof(cinfo));
+ cinfo.hci_handle = chan->conn->hcon->handle;
+ memcpy(cinfo.dev_class, chan->conn->hcon->dev_class, 3);
+
+ len = min(len, sizeof(cinfo));
+ if (copy_to_user(optval, (char *) &cinfo, len))
+ err = -EFAULT;
+
+ break;
+
+ default:
+ err = -ENOPROTOOPT;
+ break;
+ }
+
+ release_sock(sk);
+ return err;
+}
+
+static int l2cap_sock_getsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ struct sock *sk = sock->sk;
+ struct l2cap_chan *chan = l2cap_pi(sk)->chan;
+ struct bt_security sec;
+ struct bt_power pwr;
+ u32 phys;
+ int len, mode, err = 0;
+
+ BT_DBG("sk %p", sk);
+
+ if (level == SOL_L2CAP)
+ return l2cap_sock_getsockopt_old(sock, optname, optval, optlen);
+
+ if (level != SOL_BLUETOOTH)
+ return -ENOPROTOOPT;
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ lock_sock(sk);
+
+ switch (optname) {
+ case BT_SECURITY:
+ if (chan->chan_type != L2CAP_CHAN_CONN_ORIENTED &&
+ chan->chan_type != L2CAP_CHAN_FIXED &&
+ chan->chan_type != L2CAP_CHAN_RAW) {
+ err = -EINVAL;
+ break;
+ }
+
+ memset(&sec, 0, sizeof(sec));
+ if (chan->conn) {
+ sec.level = chan->conn->hcon->sec_level;
+
+ if (sk->sk_state == BT_CONNECTED)
+ sec.key_size = chan->conn->hcon->enc_key_size;
+ } else {
+ sec.level = chan->sec_level;
+ }
+
+ len = min_t(unsigned int, len, sizeof(sec));
+ if (copy_to_user(optval, (char *) &sec, len))
+ err = -EFAULT;
+
+ break;
+
+ case BT_DEFER_SETUP:
+ if (sk->sk_state != BT_BOUND && sk->sk_state != BT_LISTEN) {
+ err = -EINVAL;
+ break;
+ }
+
+ if (put_user(test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags),
+ (u32 __user *) optval))
+ err = -EFAULT;
+
+ break;
+
+ case BT_FLUSHABLE:
+ if (put_user(test_bit(FLAG_FLUSHABLE, &chan->flags),
+ (u32 __user *) optval))
+ err = -EFAULT;
+
+ break;
+
+ case BT_POWER:
+ if (sk->sk_type != SOCK_SEQPACKET && sk->sk_type != SOCK_STREAM
+ && sk->sk_type != SOCK_RAW) {
+ err = -EINVAL;
+ break;
+ }
+
+ pwr.force_active = test_bit(FLAG_FORCE_ACTIVE, &chan->flags);
+
+ len = min_t(unsigned int, len, sizeof(pwr));
+ if (copy_to_user(optval, (char *) &pwr, len))
+ err = -EFAULT;
+
+ break;
+
+ case BT_CHANNEL_POLICY:
+ if (put_user(chan->chan_policy, (u32 __user *) optval))
+ err = -EFAULT;
+ break;
+
+ case BT_SNDMTU:
+ if (!bdaddr_type_is_le(chan->src_type)) {
+ err = -EINVAL;
+ break;
+ }
+
+ if (sk->sk_state != BT_CONNECTED) {
+ err = -ENOTCONN;
+ break;
+ }
+
+ if (put_user(chan->omtu, (u16 __user *) optval))
+ err = -EFAULT;
+ break;
+
+ case BT_RCVMTU:
+ if (!bdaddr_type_is_le(chan->src_type)) {
+ err = -EINVAL;
+ break;
+ }
+
+ if (put_user(chan->imtu, (u16 __user *) optval))
+ err = -EFAULT;
+ break;
+
+ case BT_PHY:
+ if (sk->sk_state != BT_CONNECTED) {
+ err = -ENOTCONN;
+ break;
+ }
+
+ phys = hci_conn_get_phy(chan->conn->hcon);
+
+ if (put_user(phys, (u32 __user *) optval))
+ err = -EFAULT;
+ break;
+
+ case BT_MODE:
+ if (!enable_ecred) {
+ err = -ENOPROTOOPT;
+ break;
+ }
+
+ if (chan->chan_type != L2CAP_CHAN_CONN_ORIENTED) {
+ err = -EINVAL;
+ break;
+ }
+
+ mode = l2cap_get_mode(chan);
+ if (mode < 0) {
+ err = mode;
+ break;
+ }
+
+ if (put_user(mode, (u8 __user *) optval))
+ err = -EFAULT;
+ break;
+
+ default:
+ err = -ENOPROTOOPT;
+ break;
+ }
+
+ release_sock(sk);
+ return err;
+}
+
+static bool l2cap_valid_mtu(struct l2cap_chan *chan, u16 mtu)
+{
+ switch (chan->scid) {
+ case L2CAP_CID_ATT:
+ if (mtu && mtu < L2CAP_LE_MIN_MTU)
+ return false;
+ break;
+
+ default:
+ if (mtu && mtu < L2CAP_DEFAULT_MIN_MTU)
+ return false;
+ }
+
+ return true;
+}
+
+static int l2cap_sock_setsockopt_old(struct socket *sock, int optname,
+ sockptr_t optval, unsigned int optlen)
+{
+ struct sock *sk = sock->sk;
+ struct l2cap_chan *chan = l2cap_pi(sk)->chan;
+ struct l2cap_options opts;
+ int err = 0;
+ u32 opt;
+
+ BT_DBG("sk %p", sk);
+
+ lock_sock(sk);
+
+ switch (optname) {
+ case L2CAP_OPTIONS:
+ if (bdaddr_type_is_le(chan->src_type)) {
+ err = -EINVAL;
+ break;
+ }
+
+ if (sk->sk_state == BT_CONNECTED) {
+ err = -EINVAL;
+ break;
+ }
+
+ opts.imtu = chan->imtu;
+ opts.omtu = chan->omtu;
+ opts.flush_to = chan->flush_to;
+ opts.mode = chan->mode;
+ opts.fcs = chan->fcs;
+ opts.max_tx = chan->max_tx;
+ opts.txwin_size = chan->tx_win;
+
+ err = copy_safe_from_sockptr(&opts, sizeof(opts), optval,
+ optlen);
+ if (err)
+ break;
+
+ if (opts.txwin_size > L2CAP_DEFAULT_EXT_WINDOW) {
+ err = -EINVAL;
+ break;
+ }
+
+ if (!l2cap_valid_mtu(chan, opts.imtu)) {
+ err = -EINVAL;
+ break;
+ }
+
+ /* Only BR/EDR modes are supported here */
+ switch (opts.mode) {
+ case L2CAP_MODE_BASIC:
+ clear_bit(CONF_STATE2_DEVICE, &chan->conf_state);
+ break;
+ case L2CAP_MODE_ERTM:
+ case L2CAP_MODE_STREAMING:
+ if (!disable_ertm)
+ break;
+ fallthrough;
+ default:
+ err = -EINVAL;
+ break;
+ }
+
+ if (err < 0)
+ break;
+
+ chan->mode = opts.mode;
+
+ BT_DBG("mode 0x%2.2x", chan->mode);
+
+ chan->imtu = opts.imtu;
+ chan->omtu = opts.omtu;
+ chan->fcs = opts.fcs;
+ chan->max_tx = opts.max_tx;
+ chan->tx_win = opts.txwin_size;
+ chan->flush_to = opts.flush_to;
+ break;
+
+ case L2CAP_LM:
+ err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen);
+ if (err)
+ break;
+
+ if (opt & L2CAP_LM_FIPS) {
+ err = -EINVAL;
+ break;
+ }
+
+ if (opt & L2CAP_LM_AUTH)
+ chan->sec_level = BT_SECURITY_LOW;
+ if (opt & L2CAP_LM_ENCRYPT)
+ chan->sec_level = BT_SECURITY_MEDIUM;
+ if (opt & L2CAP_LM_SECURE)
+ chan->sec_level = BT_SECURITY_HIGH;
+
+ if (opt & L2CAP_LM_MASTER)
+ set_bit(FLAG_ROLE_SWITCH, &chan->flags);
+ else
+ clear_bit(FLAG_ROLE_SWITCH, &chan->flags);
+
+ if (opt & L2CAP_LM_RELIABLE)
+ set_bit(FLAG_FORCE_RELIABLE, &chan->flags);
+ else
+ clear_bit(FLAG_FORCE_RELIABLE, &chan->flags);
+ break;
+
+ default:
+ err = -ENOPROTOOPT;
+ break;
+ }
+
+ release_sock(sk);
+ return err;
+}
+
+static int l2cap_set_mode(struct l2cap_chan *chan, u8 mode)
+{
+ switch (mode) {
+ case BT_MODE_BASIC:
+ if (bdaddr_type_is_le(chan->src_type))
+ return -EINVAL;
+ mode = L2CAP_MODE_BASIC;
+ clear_bit(CONF_STATE2_DEVICE, &chan->conf_state);
+ break;
+ case BT_MODE_ERTM:
+ if (!disable_ertm || bdaddr_type_is_le(chan->src_type))
+ return -EINVAL;
+ mode = L2CAP_MODE_ERTM;
+ break;
+ case BT_MODE_STREAMING:
+ if (!disable_ertm || bdaddr_type_is_le(chan->src_type))
+ return -EINVAL;
+ mode = L2CAP_MODE_STREAMING;
+ break;
+ case BT_MODE_LE_FLOWCTL:
+ if (!bdaddr_type_is_le(chan->src_type))
+ return -EINVAL;
+ mode = L2CAP_MODE_LE_FLOWCTL;
+ break;
+ case BT_MODE_EXT_FLOWCTL:
+ /* TODO: Add support for ECRED PDUs to BR/EDR */
+ if (!bdaddr_type_is_le(chan->src_type))
+ return -EINVAL;
+ mode = L2CAP_MODE_EXT_FLOWCTL;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ chan->mode = mode;
+
+ return 0;
+}
+
+static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname,
+ sockptr_t optval, unsigned int optlen)
+{
+ struct sock *sk = sock->sk;
+ struct l2cap_chan *chan = l2cap_pi(sk)->chan;
+ struct bt_security sec;
+ struct bt_power pwr;
+ struct l2cap_conn *conn;
+ int err = 0;
+ u32 opt;
+ u16 mtu;
+ u8 mode;
+
+ BT_DBG("sk %p", sk);
+
+ if (level == SOL_L2CAP)
+ return l2cap_sock_setsockopt_old(sock, optname, optval, optlen);
+
+ if (level != SOL_BLUETOOTH)
+ return -ENOPROTOOPT;
+
+ lock_sock(sk);
+
+ switch (optname) {
+ case BT_SECURITY:
+ if (chan->chan_type != L2CAP_CHAN_CONN_ORIENTED &&
+ chan->chan_type != L2CAP_CHAN_FIXED &&
+ chan->chan_type != L2CAP_CHAN_RAW) {
+ err = -EINVAL;
+ break;
+ }
+
+ sec.level = BT_SECURITY_LOW;
+
+ err = copy_safe_from_sockptr(&sec, sizeof(sec), optval, optlen);
+ if (err)
+ break;
+
+ if (sec.level < BT_SECURITY_LOW ||
+ sec.level > BT_SECURITY_FIPS) {
+ err = -EINVAL;
+ break;
+ }
+
+ chan->sec_level = sec.level;
+
+ if (!chan->conn)
+ break;
+
+ conn = chan->conn;
+
+ /* change security for LE channels */
+ if (chan->scid == L2CAP_CID_ATT) {
+ if (smp_conn_security(conn->hcon, sec.level)) {
+ err = -EINVAL;
+ break;
+ }
+
+ set_bit(FLAG_PENDING_SECURITY, &chan->flags);
+ sk->sk_state = BT_CONFIG;
+ chan->state = BT_CONFIG;
+
+ /* or for ACL link */
+ } else if ((sk->sk_state == BT_CONNECT2 &&
+ test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) ||
+ sk->sk_state == BT_CONNECTED) {
+ if (!l2cap_chan_check_security(chan, true))
+ set_bit(BT_SK_SUSPEND, &bt_sk(sk)->flags);
+ else
+ sk->sk_state_change(sk);
+ } else {
+ err = -EINVAL;
+ }
+ break;
+
+ case BT_DEFER_SETUP:
+ if (sk->sk_state != BT_BOUND && sk->sk_state != BT_LISTEN) {
+ err = -EINVAL;
+ break;
+ }
+
+ err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen);
+ if (err)
+ break;
+
+ if (opt) {
+ set_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags);
+ set_bit(FLAG_DEFER_SETUP, &chan->flags);
+ } else {
+ clear_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags);
+ clear_bit(FLAG_DEFER_SETUP, &chan->flags);
+ }
+ break;
+
+ case BT_FLUSHABLE:
+ err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen);
+ if (err)
+ break;
+
+ if (opt > BT_FLUSHABLE_ON) {
+ err = -EINVAL;
+ break;
+ }
+
+ if (opt == BT_FLUSHABLE_OFF) {
+ conn = chan->conn;
+ /* proceed further only when we have l2cap_conn and
+ No Flush support in the LM */
+ if (!conn || !lmp_no_flush_capable(conn->hcon->hdev)) {
+ err = -EINVAL;
+ break;
+ }
+ }
+
+ if (opt)
+ set_bit(FLAG_FLUSHABLE, &chan->flags);
+ else
+ clear_bit(FLAG_FLUSHABLE, &chan->flags);
+ break;
+
+ case BT_POWER:
+ if (chan->chan_type != L2CAP_CHAN_CONN_ORIENTED &&
+ chan->chan_type != L2CAP_CHAN_RAW) {
+ err = -EINVAL;
+ break;
+ }
+
+ pwr.force_active = BT_POWER_FORCE_ACTIVE_ON;
+
+ err = copy_safe_from_sockptr(&pwr, sizeof(pwr), optval, optlen);
+ if (err)
+ break;
+
+ if (pwr.force_active)
+ set_bit(FLAG_FORCE_ACTIVE, &chan->flags);
+ else
+ clear_bit(FLAG_FORCE_ACTIVE, &chan->flags);
+ break;
+
+ case BT_CHANNEL_POLICY:
+ err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen);
+ if (err)
+ break;
+
+ err = -EOPNOTSUPP;
+ break;
+
+ case BT_SNDMTU:
+ if (!bdaddr_type_is_le(chan->src_type)) {
+ err = -EINVAL;
+ break;
+ }
+
+ /* Setting is not supported as it's the remote side that
+ * decides this.
+ */
+ err = -EPERM;
+ break;
+
+ case BT_RCVMTU:
+ if (!bdaddr_type_is_le(chan->src_type)) {
+ err = -EINVAL;
+ break;
+ }
+
+ if (chan->mode == L2CAP_MODE_LE_FLOWCTL &&
+ sk->sk_state == BT_CONNECTED) {
+ err = -EISCONN;
+ break;
+ }
+
+ err = copy_safe_from_sockptr(&mtu, sizeof(mtu), optval, optlen);
+ if (err)
+ break;
+
+ if (chan->mode == L2CAP_MODE_EXT_FLOWCTL &&
+ sk->sk_state == BT_CONNECTED)
+ err = l2cap_chan_reconfigure(chan, mtu);
+ else
+ chan->imtu = mtu;
+
+ break;
+
+ case BT_MODE:
+ if (!enable_ecred) {
+ err = -ENOPROTOOPT;
+ break;
+ }
+
+ BT_DBG("sk->sk_state %u", sk->sk_state);
+
+ if (sk->sk_state != BT_BOUND) {
+ err = -EINVAL;
+ break;
+ }
+
+ if (chan->chan_type != L2CAP_CHAN_CONN_ORIENTED) {
+ err = -EINVAL;
+ break;
+ }
+
+ err = copy_safe_from_sockptr(&mode, sizeof(mode), optval,
+ optlen);
+ if (err)
+ break;
+
+ BT_DBG("mode %u", mode);
+
+ err = l2cap_set_mode(chan, mode);
+ if (err)
+ break;
+
+ BT_DBG("mode 0x%2.2x", chan->mode);
+
+ break;
+
+ default:
+ err = -ENOPROTOOPT;
+ break;
+ }
+
+ release_sock(sk);
+ return err;
+}
+
+static int l2cap_sock_sendmsg(struct socket *sock, struct msghdr *msg,
+ size_t len)
+{
+ struct sock *sk = sock->sk;
+ struct l2cap_chan *chan = l2cap_pi(sk)->chan;
+ struct sockcm_cookie sockc;
+ int err;
+
+ BT_DBG("sock %p, sk %p", sock, sk);
+
+ err = sock_error(sk);
+ if (err)
+ return err;
+
+ if (msg->msg_flags & MSG_OOB)
+ return -EOPNOTSUPP;
+
+ if (sk->sk_state != BT_CONNECTED)
+ return -ENOTCONN;
+
+ hci_sockcm_init(&sockc, sk);
+
+ if (msg->msg_controllen) {
+ err = sock_cmsg_send(sk, msg, &sockc);
+ if (err)
+ return err;
+ }
+
+ lock_sock(sk);
+ err = bt_sock_wait_ready(sk, msg->msg_flags);
+ release_sock(sk);
+ if (err)
+ return err;
+
+ l2cap_chan_lock(chan);
+ err = l2cap_chan_send(chan, msg, len, &sockc);
+ l2cap_chan_unlock(chan);
+
+ return err;
+}
+
+static void l2cap_publish_rx_avail(struct l2cap_chan *chan)
+{
+ struct sock *sk = chan->data;
+ ssize_t avail = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc);
+ int expected_skbs, skb_overhead;
+
+ if (avail <= 0) {
+ l2cap_chan_rx_avail(chan, 0);
+ return;
+ }
+
+ if (!chan->mps) {
+ l2cap_chan_rx_avail(chan, -1);
+ return;
+ }
+
+ /* Correct available memory by estimated sk_buff overhead.
+ * This is significant due to small transfer sizes. However, accept
+ * at least one full packet if receive space is non-zero.
+ */
+ expected_skbs = DIV_ROUND_UP(avail, chan->mps);
+ skb_overhead = expected_skbs * sizeof(struct sk_buff);
+ if (skb_overhead < avail)
+ l2cap_chan_rx_avail(chan, avail - skb_overhead);
+ else
+ l2cap_chan_rx_avail(chan, -1);
+}
+
+static int l2cap_sock_recvmsg(struct socket *sock, struct msghdr *msg,
+ size_t len, int flags)
+{
+ struct sock *sk = sock->sk;
+ struct l2cap_pinfo *pi = l2cap_pi(sk);
+ int err;
+
+ if (unlikely(flags & MSG_ERRQUEUE))
+ return sock_recv_errqueue(sk, msg, len, SOL_BLUETOOTH,
+ BT_SCM_ERROR);
+
+ lock_sock(sk);
+
+ if (sk->sk_state == BT_CONNECT2 && test_bit(BT_SK_DEFER_SETUP,
+ &bt_sk(sk)->flags)) {
+ if (pi->chan->mode == L2CAP_MODE_EXT_FLOWCTL) {
+ sk->sk_state = BT_CONNECTED;
+ pi->chan->state = BT_CONNECTED;
+ __l2cap_ecred_conn_rsp_defer(pi->chan);
+ } else if (bdaddr_type_is_le(pi->chan->src_type)) {
+ sk->sk_state = BT_CONNECTED;
+ pi->chan->state = BT_CONNECTED;
+ __l2cap_le_connect_rsp_defer(pi->chan);
+ } else {
+ sk->sk_state = BT_CONFIG;
+ pi->chan->state = BT_CONFIG;
+ __l2cap_connect_rsp_defer(pi->chan);
+ }
+
+ err = 0;
+ goto done;
+ }
+
+ release_sock(sk);
+
+ if (sock->type == SOCK_STREAM)
+ err = bt_sock_stream_recvmsg(sock, msg, len, flags);
+ else
+ err = bt_sock_recvmsg(sock, msg, len, flags);
+
+ if (pi->chan->mode != L2CAP_MODE_ERTM &&
+ pi->chan->mode != L2CAP_MODE_LE_FLOWCTL &&
+ pi->chan->mode != L2CAP_MODE_EXT_FLOWCTL)
+ return err;
+
+ lock_sock(sk);
+
+ l2cap_publish_rx_avail(pi->chan);
+
+ /* Attempt to put pending rx data in the socket buffer */
+ while (!list_empty(&pi->rx_busy)) {
+ struct l2cap_rx_busy *rx_busy =
+ list_first_entry(&pi->rx_busy,
+ struct l2cap_rx_busy,
+ list);
+ if (__sock_queue_rcv_skb(sk, rx_busy->skb) < 0)
+ goto done;
+ list_del(&rx_busy->list);
+ kfree(rx_busy);
+ }
+
+ /* Restore data flow when half of the receive buffer is
+ * available. This avoids resending large numbers of
+ * frames.
+ */
+ if (test_bit(CONN_LOCAL_BUSY, &pi->chan->conn_state) &&
+ atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf >> 1)
+ l2cap_chan_busy(pi->chan, 0);
+
+done:
+ release_sock(sk);
+ return err;
+}
+
+/* Kill socket (only if zapped and orphan)
+ * Must be called on unlocked socket, with l2cap channel lock.
+ */
+static void l2cap_sock_kill(struct sock *sk)
+{
+ if (!sock_flag(sk, SOCK_ZAPPED) || sk->sk_socket)
+ return;
+
+ BT_DBG("sk %p state %s", sk, state_to_string(sk->sk_state));
+
+ /* Sock is dead, so set chan data to NULL, avoid other task use invalid
+ * sock pointer.
+ */
+ l2cap_pi(sk)->chan->data = NULL;
+ /* Kill poor orphan */
+
+ l2cap_chan_put(l2cap_pi(sk)->chan);
+ sock_set_flag(sk, SOCK_DEAD);
+ sock_put(sk);
+}
+
+static int __l2cap_wait_ack(struct sock *sk, struct l2cap_chan *chan)
+{
+ DECLARE_WAITQUEUE(wait, current);
+ int err = 0;
+ int timeo = L2CAP_WAIT_ACK_POLL_PERIOD;
+ /* Timeout to prevent infinite loop */
+ unsigned long timeout = jiffies + L2CAP_WAIT_ACK_TIMEOUT;
+
+ add_wait_queue(sk_sleep(sk), &wait);
+ set_current_state(TASK_INTERRUPTIBLE);
+ do {
+ BT_DBG("Waiting for %d ACKs, timeout %04d ms",
+ chan->unacked_frames, time_after(jiffies, timeout) ? 0 :
+ jiffies_to_msecs(timeout - jiffies));
+
+ if (!timeo)
+ timeo = L2CAP_WAIT_ACK_POLL_PERIOD;
+
+ if (signal_pending(current)) {
+ err = sock_intr_errno(timeo);
+ break;
+ }
+
+ release_sock(sk);
+ timeo = schedule_timeout(timeo);
+ lock_sock(sk);
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ err = sock_error(sk);
+ if (err)
+ break;
+
+ if (time_after(jiffies, timeout)) {
+ err = -ENOLINK;
+ break;
+ }
+
+ } while (chan->unacked_frames > 0 &&
+ chan->state == BT_CONNECTED);
+
+ set_current_state(TASK_RUNNING);
+ remove_wait_queue(sk_sleep(sk), &wait);
+ return err;
+}
+
+static int l2cap_sock_shutdown(struct socket *sock, int how)
+{
+ struct sock *sk = sock->sk;
+ struct l2cap_chan *chan;
+ struct l2cap_conn *conn;
+ int err = 0;
+
+ BT_DBG("sock %p, sk %p, how %d", sock, sk, how);
+
+ /* 'how' parameter is mapped to sk_shutdown as follows:
+ * SHUT_RD (0) --> RCV_SHUTDOWN (1)
+ * SHUT_WR (1) --> SEND_SHUTDOWN (2)
+ * SHUT_RDWR (2) --> SHUTDOWN_MASK (3)
+ */
+ how++;
+
+ if (!sk)
+ return 0;
+
+ lock_sock(sk);
+
+ if ((sk->sk_shutdown & how) == how)
+ goto shutdown_already;
+
+ BT_DBG("Handling sock shutdown");
+
+ /* prevent sk structure from being freed whilst unlocked */
+ sock_hold(sk);
+
+ /* prevent chan structure from being freed whilst unlocked */
+ chan = l2cap_chan_hold_unless_zero(l2cap_pi(sk)->chan);
+ if (!chan)
+ goto shutdown_already;
+
+ BT_DBG("chan %p state %s", chan, state_to_string(chan->state));
+
+ if (chan->mode == L2CAP_MODE_ERTM &&
+ chan->unacked_frames > 0 &&
+ chan->state == BT_CONNECTED) {
+ err = __l2cap_wait_ack(sk, chan);
+
+ /* After waiting for ACKs, check whether shutdown
+ * has already been actioned to close the L2CAP
+ * link such as by l2cap_disconnection_req().
+ */
+ if ((sk->sk_shutdown & how) == how)
+ goto shutdown_matched;
+ }
+
+ /* Try setting the RCV_SHUTDOWN bit, return early if SEND_SHUTDOWN
+ * is already set
+ */
+ if ((how & RCV_SHUTDOWN) && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
+ sk->sk_shutdown |= RCV_SHUTDOWN;
+ if ((sk->sk_shutdown & how) == how)
+ goto shutdown_matched;
+ }
+
+ sk->sk_shutdown |= SEND_SHUTDOWN;
+ release_sock(sk);
+
+ l2cap_chan_lock(chan);
+ /* prevent conn structure from being freed */
+ conn = l2cap_conn_hold_unless_zero(chan->conn);
+ l2cap_chan_unlock(chan);
+
+ if (conn)
+ /* mutex lock must be taken before l2cap_chan_lock() */
+ mutex_lock(&conn->lock);
+
+ l2cap_chan_lock(chan);
+ l2cap_chan_close(chan, 0);
+ l2cap_chan_unlock(chan);
+
+ if (conn) {
+ mutex_unlock(&conn->lock);
+ l2cap_conn_put(conn);
+ }
+
+ lock_sock(sk);
+
+ if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime &&
+ !(current->flags & PF_EXITING))
+ err = bt_sock_wait_state(sk, BT_CLOSED,
+ sk->sk_lingertime);
+
+shutdown_matched:
+ l2cap_chan_put(chan);
+ sock_put(sk);
+
+shutdown_already:
+ if (!err && sk->sk_err)
+ err = -sk->sk_err;
+
+ release_sock(sk);
+
+ BT_DBG("Sock shutdown complete err: %d", err);
+
+ return err;
+}
+
+static int l2cap_sock_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+ int err;
+ struct l2cap_chan *chan;
+
+ BT_DBG("sock %p, sk %p", sock, sk);
+
+ if (!sk)
+ return 0;
+
+ lock_sock_nested(sk, L2CAP_NESTING_PARENT);
+ l2cap_sock_cleanup_listen(sk);
+ release_sock(sk);
+
+ bt_sock_unlink(&l2cap_sk_list, sk);
+
+ err = l2cap_sock_shutdown(sock, SHUT_RDWR);
+ chan = l2cap_pi(sk)->chan;
+
+ l2cap_chan_hold(chan);
+ l2cap_chan_lock(chan);
+
+ sock_orphan(sk);
+ l2cap_sock_kill(sk);
+
+ l2cap_chan_unlock(chan);
+ l2cap_chan_put(chan);
+
+ return err;
+}
+
+static void l2cap_sock_cleanup_listen(struct sock *parent)
+{
+ struct sock *sk;
+
+ BT_DBG("parent %p state %s", parent,
+ state_to_string(parent->sk_state));
+
+ /* Close not yet accepted channels */
+ while ((sk = bt_accept_dequeue(parent, NULL))) {
+ struct l2cap_chan *chan = l2cap_pi(sk)->chan;
+
+ BT_DBG("child chan %p state %s", chan,
+ state_to_string(chan->state));
+
+ l2cap_chan_hold(chan);
+ l2cap_chan_lock(chan);
+
+ __clear_chan_timer(chan);
+ l2cap_chan_close(chan, ECONNRESET);
+ l2cap_sock_kill(sk);
+
+ l2cap_chan_unlock(chan);
+ l2cap_chan_put(chan);
+ }
+}
+
+static struct l2cap_chan *l2cap_sock_new_connection_cb(struct l2cap_chan *chan)
+{
+ struct sock *sk, *parent = chan->data;
+
+ lock_sock(parent);
+
+ /* Check for backlog size */
+ if (sk_acceptq_is_full(parent)) {
+ BT_DBG("backlog full %d", parent->sk_ack_backlog);
+ release_sock(parent);
+ return NULL;
+ }
+
+ sk = l2cap_sock_alloc(sock_net(parent), NULL, BTPROTO_L2CAP,
+ GFP_ATOMIC, 0);
+ if (!sk) {
+ release_sock(parent);
+ return NULL;
+ }
+
+ bt_sock_reclassify_lock(sk, BTPROTO_L2CAP);
+
+ l2cap_sock_init(sk, parent);
+
+ bt_accept_enqueue(parent, sk, false);
+
+ release_sock(parent);
+
+ return l2cap_pi(sk)->chan;
+}
+
+static int l2cap_sock_recv_cb(struct l2cap_chan *chan, struct sk_buff *skb)
+{
+ struct sock *sk;
+ struct l2cap_pinfo *pi;
+ int err;
+
+ sk = chan->data;
+ if (!sk)
+ return -ENXIO;
+
+ pi = l2cap_pi(sk);
+ lock_sock(sk);
+ if (chan->mode == L2CAP_MODE_ERTM && !list_empty(&pi->rx_busy)) {
+ err = -ENOMEM;
+ goto done;
+ }
+
+ if (chan->mode != L2CAP_MODE_ERTM &&
+ chan->mode != L2CAP_MODE_STREAMING &&
+ chan->mode != L2CAP_MODE_LE_FLOWCTL &&
+ chan->mode != L2CAP_MODE_EXT_FLOWCTL) {
+ /* Even if no filter is attached, we could potentially
+ * get errors from security modules, etc.
+ */
+ err = sk_filter(sk, skb);
+ if (err)
+ goto done;
+ }
+
+ err = __sock_queue_rcv_skb(sk, skb);
+
+ l2cap_publish_rx_avail(chan);
+
+ /* For ERTM and LE, handle a skb that doesn't fit into the recv
+ * buffer. This is important to do because the data frames
+ * have already been acked, so the skb cannot be discarded.
+ *
+ * Notify the l2cap core that the buffer is full, so the
+ * LOCAL_BUSY state is entered and no more frames are
+ * acked and reassembled until there is buffer space
+ * available.
+ */
+ if (err < 0 &&
+ (chan->mode == L2CAP_MODE_ERTM ||
+ chan->mode == L2CAP_MODE_LE_FLOWCTL ||
+ chan->mode == L2CAP_MODE_EXT_FLOWCTL)) {
+ struct l2cap_rx_busy *rx_busy =
+ kmalloc(sizeof(*rx_busy), GFP_KERNEL);
+ if (!rx_busy) {
+ err = -ENOMEM;
+ goto done;
+ }
+ rx_busy->skb = skb;
+ list_add_tail(&rx_busy->list, &pi->rx_busy);
+ l2cap_chan_busy(chan, 1);
+ err = 0;
+ }
+
+done:
+ release_sock(sk);
+
+ return err;
+}
+
+static void l2cap_sock_close_cb(struct l2cap_chan *chan)
+{
+ struct sock *sk = chan->data;
+
+ if (!sk)
+ return;
+
+ l2cap_sock_kill(sk);
+}
+
+static void l2cap_sock_teardown_cb(struct l2cap_chan *chan, int err)
+{
+ struct sock *sk = chan->data;
+ struct sock *parent;
+
+ if (!sk)
+ return;
+
+ BT_DBG("chan %p state %s", chan, state_to_string(chan->state));
+
+ /* This callback can be called both for server (BT_LISTEN)
+ * sockets as well as "normal" ones. To avoid lockdep warnings
+ * with child socket locking (through l2cap_sock_cleanup_listen)
+ * we need separation into separate nesting levels. The simplest
+ * way to accomplish this is to inherit the nesting level used
+ * for the channel.
+ */
+ lock_sock_nested(sk, atomic_read(&chan->nesting));
+
+ parent = bt_sk(sk)->parent;
+
+ switch (chan->state) {
+ case BT_OPEN:
+ case BT_BOUND:
+ case BT_CLOSED:
+ break;
+ case BT_LISTEN:
+ l2cap_sock_cleanup_listen(sk);
+ sk->sk_state = BT_CLOSED;
+ chan->state = BT_CLOSED;
+
+ break;
+ default:
+ sk->sk_state = BT_CLOSED;
+ chan->state = BT_CLOSED;
+
+ sk->sk_err = err;
+
+ if (parent) {
+ bt_accept_unlink(sk);
+ parent->sk_data_ready(parent);
+ } else {
+ sk->sk_state_change(sk);
+ }
+
+ break;
+ }
+ release_sock(sk);
+
+ /* Only zap after cleanup to avoid use after free race */
+ sock_set_flag(sk, SOCK_ZAPPED);
+
+}
+
+static void l2cap_sock_state_change_cb(struct l2cap_chan *chan, int state,
+ int err)
+{
+ struct sock *sk = chan->data;
+
+ sk->sk_state = state;
+
+ if (err)
+ sk->sk_err = err;
+}
+
+static struct sk_buff *l2cap_sock_alloc_skb_cb(struct l2cap_chan *chan,
+ unsigned long hdr_len,
+ unsigned long len, int nb)
+{
+ struct sock *sk = chan->data;
+ struct sk_buff *skb;
+ int err;
+
+ l2cap_chan_unlock(chan);
+ skb = bt_skb_send_alloc(sk, hdr_len + len, nb, &err);
+ l2cap_chan_lock(chan);
+
+ if (!skb)
+ return ERR_PTR(err);
+
+ /* Channel lock is released before requesting new skb and then
+ * reacquired thus we need to recheck channel state.
+ */
+ if (chan->state != BT_CONNECTED) {
+ kfree_skb(skb);
+ return ERR_PTR(-ENOTCONN);
+ }
+
+ skb->priority = READ_ONCE(sk->sk_priority);
+
+ bt_cb(skb)->l2cap.chan = chan;
+
+ return skb;
+}
+
+static void l2cap_sock_ready_cb(struct l2cap_chan *chan)
+{
+ struct sock *sk = chan->data;
+ struct sock *parent;
+
+ lock_sock(sk);
+
+ parent = bt_sk(sk)->parent;
+
+ BT_DBG("sk %p, parent %p", sk, parent);
+
+ sk->sk_state = BT_CONNECTED;
+ sk->sk_state_change(sk);
+
+ if (parent)
+ parent->sk_data_ready(parent);
+
+ release_sock(sk);
+}
+
+static void l2cap_sock_defer_cb(struct l2cap_chan *chan)
+{
+ struct sock *parent, *sk = chan->data;
+
+ lock_sock(sk);
+
+ parent = bt_sk(sk)->parent;
+ if (parent)
+ parent->sk_data_ready(parent);
+
+ release_sock(sk);
+}
+
+static void l2cap_sock_resume_cb(struct l2cap_chan *chan)
+{
+ struct sock *sk = chan->data;
+
+ if (!sk)
+ return;
+
+ if (test_and_clear_bit(FLAG_PENDING_SECURITY, &chan->flags)) {
+ sk->sk_state = BT_CONNECTED;
+ chan->state = BT_CONNECTED;
+ }
+
+ clear_bit(BT_SK_SUSPEND, &bt_sk(sk)->flags);
+ sk->sk_state_change(sk);
+}
+
+static void l2cap_sock_set_shutdown_cb(struct l2cap_chan *chan)
+{
+ struct sock *sk = chan->data;
+
+ lock_sock(sk);
+ sk->sk_shutdown = SHUTDOWN_MASK;
+ release_sock(sk);
+}
+
+static long l2cap_sock_get_sndtimeo_cb(struct l2cap_chan *chan)
+{
+ struct sock *sk = chan->data;
+
+ return READ_ONCE(sk->sk_sndtimeo);
+}
+
+static struct pid *l2cap_sock_get_peer_pid_cb(struct l2cap_chan *chan)
+{
+ struct sock *sk = chan->data;
+
+ return sk->sk_peer_pid;
+}
+
+static void l2cap_sock_suspend_cb(struct l2cap_chan *chan)
+{
+ struct sock *sk = chan->data;
+
+ set_bit(BT_SK_SUSPEND, &bt_sk(sk)->flags);
+ sk->sk_state_change(sk);
+}
+
+static int l2cap_sock_filter(struct l2cap_chan *chan, struct sk_buff *skb)
+{
+ struct sock *sk = chan->data;
+
+ switch (chan->mode) {
+ case L2CAP_MODE_ERTM:
+ case L2CAP_MODE_STREAMING:
+ return sk_filter(sk, skb);
+ }
+
+ return 0;
+}
+
+static const struct l2cap_ops l2cap_chan_ops = {
+ .name = "L2CAP Socket Interface",
+ .new_connection = l2cap_sock_new_connection_cb,
+ .recv = l2cap_sock_recv_cb,
+ .close = l2cap_sock_close_cb,
+ .teardown = l2cap_sock_teardown_cb,
+ .state_change = l2cap_sock_state_change_cb,
+ .ready = l2cap_sock_ready_cb,
+ .defer = l2cap_sock_defer_cb,
+ .resume = l2cap_sock_resume_cb,
+ .suspend = l2cap_sock_suspend_cb,
+ .set_shutdown = l2cap_sock_set_shutdown_cb,
+ .get_sndtimeo = l2cap_sock_get_sndtimeo_cb,
+ .get_peer_pid = l2cap_sock_get_peer_pid_cb,
+ .alloc_skb = l2cap_sock_alloc_skb_cb,
+ .filter = l2cap_sock_filter,
+};
+
+static void l2cap_sock_destruct(struct sock *sk)
+{
+ struct l2cap_rx_busy *rx_busy, *next;
+
+ BT_DBG("sk %p", sk);
+
+ if (l2cap_pi(sk)->chan) {
+ l2cap_pi(sk)->chan->data = NULL;
+ l2cap_chan_put(l2cap_pi(sk)->chan);
+ }
+
+ list_for_each_entry_safe(rx_busy, next, &l2cap_pi(sk)->rx_busy, list) {
+ kfree_skb(rx_busy->skb);
+ list_del(&rx_busy->list);
+ kfree(rx_busy);
+ }
+
+ skb_queue_purge(&sk->sk_receive_queue);
+ skb_queue_purge(&sk->sk_write_queue);
+}
+
+static void l2cap_skb_msg_name(struct sk_buff *skb, void *msg_name,
+ int *msg_namelen)
+{
+ DECLARE_SOCKADDR(struct sockaddr_l2 *, la, msg_name);
+
+ memset(la, 0, sizeof(struct sockaddr_l2));
+ la->l2_family = AF_BLUETOOTH;
+ la->l2_psm = bt_cb(skb)->l2cap.psm;
+ bacpy(&la->l2_bdaddr, &bt_cb(skb)->l2cap.bdaddr);
+
+ *msg_namelen = sizeof(struct sockaddr_l2);
+}
+
+static void l2cap_sock_init(struct sock *sk, struct sock *parent)
+{
+ struct l2cap_chan *chan = l2cap_pi(sk)->chan;
+
+ BT_DBG("sk %p", sk);
+
+ if (parent) {
+ struct l2cap_chan *pchan = l2cap_pi(parent)->chan;
+
+ sk->sk_type = parent->sk_type;
+ bt_sk(sk)->flags = bt_sk(parent)->flags;
+
+ chan->chan_type = pchan->chan_type;
+ chan->imtu = pchan->imtu;
+ chan->omtu = pchan->omtu;
+ chan->conf_state = pchan->conf_state;
+ chan->mode = pchan->mode;
+ chan->fcs = pchan->fcs;
+ chan->max_tx = pchan->max_tx;
+ chan->tx_win = pchan->tx_win;
+ chan->tx_win_max = pchan->tx_win_max;
+ chan->sec_level = pchan->sec_level;
+ chan->flags = pchan->flags;
+ chan->tx_credits = pchan->tx_credits;
+ chan->rx_credits = pchan->rx_credits;
+
+ if (chan->chan_type == L2CAP_CHAN_FIXED) {
+ chan->scid = pchan->scid;
+ chan->dcid = pchan->scid;
+ }
+
+ security_sk_clone(parent, sk);
+ } else {
+ switch (sk->sk_type) {
+ case SOCK_RAW:
+ chan->chan_type = L2CAP_CHAN_RAW;
+ break;
+ case SOCK_DGRAM:
+ chan->chan_type = L2CAP_CHAN_CONN_LESS;
+ bt_sk(sk)->skb_msg_name = l2cap_skb_msg_name;
+ break;
+ case SOCK_SEQPACKET:
+ case SOCK_STREAM:
+ chan->chan_type = L2CAP_CHAN_CONN_ORIENTED;
+ break;
+ }
+
+ chan->imtu = L2CAP_DEFAULT_MTU;
+ chan->omtu = 0;
+ if (!disable_ertm && sk->sk_type == SOCK_STREAM) {
+ chan->mode = L2CAP_MODE_ERTM;
+ set_bit(CONF_STATE2_DEVICE, &chan->conf_state);
+ } else {
+ chan->mode = L2CAP_MODE_BASIC;
+ }
+
+ l2cap_chan_set_defaults(chan);
+ }
+
+ /* Default config options */
+ chan->flush_to = L2CAP_DEFAULT_FLUSH_TO;
+
+ chan->data = sk;
+ chan->ops = &l2cap_chan_ops;
+
+ l2cap_publish_rx_avail(chan);
+}
+
+static struct proto l2cap_proto = {
+ .name = "L2CAP",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof(struct l2cap_pinfo)
+};
+
+static struct sock *l2cap_sock_alloc(struct net *net, struct socket *sock,
+ int proto, gfp_t prio, int kern)
+{
+ struct sock *sk;
+ struct l2cap_chan *chan;
+
+ sk = bt_sock_alloc(net, sock, &l2cap_proto, proto, prio, kern);
+ if (!sk)
+ return NULL;
+
+ sk->sk_destruct = l2cap_sock_destruct;
+ sk->sk_sndtimeo = L2CAP_CONN_TIMEOUT;
+
+ INIT_LIST_HEAD(&l2cap_pi(sk)->rx_busy);
+
+ chan = l2cap_chan_create();
+ if (!chan) {
+ sk_free(sk);
+ if (sock)
+ sock->sk = NULL;
+ return NULL;
+ }
+
+ l2cap_chan_hold(chan);
+
+ l2cap_pi(sk)->chan = chan;
+
+ return sk;
+}
+
+static int l2cap_sock_create(struct net *net, struct socket *sock, int protocol,
+ int kern)
+{
+ struct sock *sk;
+
+ BT_DBG("sock %p", sock);
+
+ sock->state = SS_UNCONNECTED;
+
+ if (sock->type != SOCK_SEQPACKET && sock->type != SOCK_STREAM &&
+ sock->type != SOCK_DGRAM && sock->type != SOCK_RAW)
+ return -ESOCKTNOSUPPORT;
+
+ if (sock->type == SOCK_RAW && !kern && !capable(CAP_NET_RAW))
+ return -EPERM;
+
+ sock->ops = &l2cap_sock_ops;
+
+ sk = l2cap_sock_alloc(net, sock, protocol, GFP_ATOMIC, kern);
+ if (!sk)
+ return -ENOMEM;
+
+ l2cap_sock_init(sk, NULL);
+ bt_sock_link(&l2cap_sk_list, sk);
+ return 0;
+}
+
+static const struct proto_ops l2cap_sock_ops = {
+ .family = PF_BLUETOOTH,
+ .owner = THIS_MODULE,
+ .release = l2cap_sock_release,
+ .bind = l2cap_sock_bind,
+ .connect = l2cap_sock_connect,
+ .listen = l2cap_sock_listen,
+ .accept = l2cap_sock_accept,
+ .getname = l2cap_sock_getname,
+ .sendmsg = l2cap_sock_sendmsg,
+ .recvmsg = l2cap_sock_recvmsg,
+ .poll = bt_sock_poll,
+ .ioctl = bt_sock_ioctl,
+ .gettstamp = sock_gettstamp,
+ .mmap = sock_no_mmap,
+ .socketpair = sock_no_socketpair,
+ .shutdown = l2cap_sock_shutdown,
+ .setsockopt = l2cap_sock_setsockopt,
+ .getsockopt = l2cap_sock_getsockopt
+};
+
+static const struct net_proto_family l2cap_sock_family_ops = {
+ .family = PF_BLUETOOTH,
+ .owner = THIS_MODULE,
+ .create = l2cap_sock_create,
+};
+
+int __init l2cap_init_sockets(void)
+{
+ int err;
+
+ BUILD_BUG_ON(sizeof(struct sockaddr_l2) > sizeof(struct sockaddr));
+
+ err = proto_register(&l2cap_proto, 0);
+ if (err < 0)
+ return err;
+
+ err = bt_sock_register(BTPROTO_L2CAP, &l2cap_sock_family_ops);
+ if (err < 0) {
+ BT_ERR("L2CAP socket registration failed");
+ goto error;
+ }
+
+ err = bt_procfs_init(&init_net, "l2cap", &l2cap_sk_list,
+ NULL);
+ if (err < 0) {
+ BT_ERR("Failed to create L2CAP proc file");
+ bt_sock_unregister(BTPROTO_L2CAP);
+ goto error;
+ }
+
+ BT_INFO("L2CAP socket layer initialized");
+
+ return 0;
+
+error:
+ proto_unregister(&l2cap_proto);
+ return err;
+}
+
+void l2cap_cleanup_sockets(void)
+{
+ bt_procfs_cleanup(&init_net, "l2cap");
+ bt_sock_unregister(BTPROTO_L2CAP);
+ proto_unregister(&l2cap_proto);
+}
diff --git a/net/bluetooth/leds.c b/net/bluetooth/leds.c
new file mode 100644
index 000000000000..6e349704efe4
--- /dev/null
+++ b/net/bluetooth/leds.c
@@ -0,0 +1,100 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2015, Heiner Kallweit <hkallweit1@gmail.com>
+ */
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+
+#include "leds.h"
+
+DEFINE_LED_TRIGGER(bt_power_led_trigger);
+
+struct hci_basic_led_trigger {
+ struct led_trigger led_trigger;
+ struct hci_dev *hdev;
+};
+
+#define to_hci_basic_led_trigger(arg) container_of(arg, \
+ struct hci_basic_led_trigger, led_trigger)
+
+void hci_leds_update_powered(struct hci_dev *hdev, bool enabled)
+{
+ if (hdev->power_led)
+ led_trigger_event(hdev->power_led,
+ enabled ? LED_FULL : LED_OFF);
+
+ if (!enabled) {
+ struct hci_dev *d;
+
+ read_lock(&hci_dev_list_lock);
+
+ list_for_each_entry(d, &hci_dev_list, list) {
+ if (test_bit(HCI_UP, &d->flags))
+ enabled = true;
+ }
+
+ read_unlock(&hci_dev_list_lock);
+ }
+
+ led_trigger_event(bt_power_led_trigger, enabled ? LED_FULL : LED_OFF);
+}
+
+static int power_activate(struct led_classdev *led_cdev)
+{
+ struct hci_basic_led_trigger *htrig;
+ bool powered;
+
+ htrig = to_hci_basic_led_trigger(led_cdev->trigger);
+ powered = test_bit(HCI_UP, &htrig->hdev->flags);
+
+ led_set_brightness(led_cdev, powered ? LED_FULL : LED_OFF);
+
+ return 0;
+}
+
+static struct led_trigger *led_allocate_basic(struct hci_dev *hdev,
+ int (*activate)(struct led_classdev *led_cdev),
+ const char *name)
+{
+ struct hci_basic_led_trigger *htrig;
+
+ htrig = devm_kzalloc(&hdev->dev, sizeof(*htrig), GFP_KERNEL);
+ if (!htrig)
+ return NULL;
+
+ htrig->hdev = hdev;
+ htrig->led_trigger.activate = activate;
+ htrig->led_trigger.name = devm_kasprintf(&hdev->dev, GFP_KERNEL,
+ "%s-%s", hdev->name,
+ name);
+ if (!htrig->led_trigger.name)
+ goto err_alloc;
+
+ if (devm_led_trigger_register(&hdev->dev, &htrig->led_trigger))
+ goto err_register;
+
+ return &htrig->led_trigger;
+
+err_register:
+ devm_kfree(&hdev->dev, (void *)htrig->led_trigger.name);
+err_alloc:
+ devm_kfree(&hdev->dev, htrig);
+ return NULL;
+}
+
+void hci_leds_init(struct hci_dev *hdev)
+{
+ /* initialize power_led */
+ hdev->power_led = led_allocate_basic(hdev, power_activate, "power");
+}
+
+void bt_leds_init(void)
+{
+ led_trigger_register_simple("bluetooth-power", &bt_power_led_trigger);
+}
+
+void bt_leds_cleanup(void)
+{
+ led_trigger_unregister_simple(bt_power_led_trigger);
+}
diff --git a/net/bluetooth/leds.h b/net/bluetooth/leds.h
new file mode 100644
index 000000000000..bb5e09204436
--- /dev/null
+++ b/net/bluetooth/leds.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2015, Heiner Kallweit <hkallweit1@gmail.com>
+ */
+
+#if IS_ENABLED(CONFIG_BT_LEDS)
+
+void hci_leds_update_powered(struct hci_dev *hdev, bool enabled);
+void hci_leds_init(struct hci_dev *hdev);
+
+void bt_leds_init(void);
+void bt_leds_cleanup(void);
+
+#else
+
+static inline void hci_leds_update_powered(struct hci_dev *hdev,
+ bool enabled) {}
+static inline void hci_leds_init(struct hci_dev *hdev) {}
+
+static inline void bt_leds_init(void) {}
+static inline void bt_leds_cleanup(void) {}
+
+#endif
diff --git a/net/bluetooth/lib.c b/net/bluetooth/lib.c
index e5fd0cb70ae9..305044a84478 100644
--- a/net/bluetooth/lib.c
+++ b/net/bluetooth/lib.c
@@ -1,4 +1,4 @@
-/*
+/*
BlueZ - Bluetooth protocol stack for Linux
Copyright (C) 2000-2001 Qualcomm Incorporated
@@ -12,31 +12,37 @@
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
- CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
- WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
- COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+ ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+ COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
SOFTWARE IS DISCLAIMED.
*/
/* Bluetooth kernel library. */
-#include <linux/module.h>
+#define pr_fmt(fmt) "Bluetooth: " fmt
-#include <linux/kernel.h>
-#include <linux/stddef.h>
-#include <linux/string.h>
-#include <asm/errno.h>
+#include <linux/export.h>
#include <net/bluetooth/bluetooth.h>
-void baswap(bdaddr_t *dst, bdaddr_t *src)
+/**
+ * baswap() - Swaps the order of a bd address
+ * @dst: Pointer to a bdaddr_t struct that will store the swapped
+ * bd address.
+ * @src: Pointer to the bdaddr_t struct to be swapped.
+ *
+ * This function reverses the byte order of a Bluetooth device
+ * address.
+ */
+void baswap(bdaddr_t *dst, const bdaddr_t *src)
{
- unsigned char *d = (unsigned char *) dst;
- unsigned char *s = (unsigned char *) src;
+ const unsigned char *s = (const unsigned char *)src;
+ unsigned char *d = (unsigned char *)dst;
unsigned int i;
for (i = 0; i < 6; i++)
@@ -44,22 +50,20 @@ void baswap(bdaddr_t *dst, bdaddr_t *src)
}
EXPORT_SYMBOL(baswap);
-char *batostr(bdaddr_t *ba)
-{
- static char str[2][18];
- static int i = 1;
-
- i ^= 1;
- sprintf(str[i], "%2.2X:%2.2X:%2.2X:%2.2X:%2.2X:%2.2X",
- ba->b[0], ba->b[1], ba->b[2],
- ba->b[3], ba->b[4], ba->b[5]);
-
- return str[i];
-}
-EXPORT_SYMBOL(batostr);
-
-/* Bluetooth error codes to Unix errno mapping */
-int bt_err(__u16 code)
+/**
+ * bt_to_errno() - Bluetooth error codes to standard errno
+ * @code: Bluetooth error code to be converted
+ *
+ * This function takes a Bluetooth error code as input and converts
+ * it to an equivalent Unix/standard errno value.
+ *
+ * Return:
+ *
+ * If the bt error code is known, an equivalent Unix errno value
+ * is returned.
+ * If the given bt error code is not known, ENOSYS is returned.
+ */
+int bt_to_errno(__u16 code)
{
switch (code) {
case 0:
@@ -75,6 +79,7 @@ int bt_err(__u16 code)
return EIO;
case 0x04:
+ case 0x3c:
return EHOSTDOWN;
case 0x05:
@@ -149,4 +154,230 @@ int bt_err(__u16 code)
return ENOSYS;
}
}
+EXPORT_SYMBOL(bt_to_errno);
+
+/**
+ * bt_status() - Standard errno value to Bluetooth error code
+ * @err: Unix/standard errno value to be converted
+ *
+ * This function converts a standard/Unix errno value to an
+ * equivalent Bluetooth error code.
+ *
+ * Return: Bluetooth error code.
+ *
+ * If the given errno is not found, 0x1f is returned by default
+ * which indicates an unspecified error.
+ * For err >= 0, no conversion is performed, and the same value
+ * is immediately returned.
+ */
+__u8 bt_status(int err)
+{
+ if (err >= 0)
+ return err;
+
+ switch (err) {
+ case -EBADRQC:
+ return 0x01;
+
+ case -ENOTCONN:
+ return 0x02;
+
+ case -EIO:
+ return 0x03;
+
+ case -EHOSTDOWN:
+ return 0x04;
+
+ case -EACCES:
+ return 0x05;
+
+ case -EBADE:
+ return 0x06;
+
+ case -ENOMEM:
+ return 0x07;
+
+ case -ETIMEDOUT:
+ return 0x08;
+
+ case -EMLINK:
+ return 0x09;
+
+ case -EALREADY:
+ return 0x0b;
+
+ case -EBUSY:
+ return 0x0c;
+
+ case -ECONNREFUSED:
+ return 0x0d;
+
+ case -EOPNOTSUPP:
+ return 0x11;
+
+ case -EINVAL:
+ return 0x12;
+
+ case -ECONNRESET:
+ return 0x13;
+
+ case -ECONNABORTED:
+ return 0x16;
+
+ case -ELOOP:
+ return 0x17;
+
+ case -EPROTONOSUPPORT:
+ return 0x1a;
+
+ case -EPROTO:
+ return 0x19;
+
+ default:
+ return 0x1f;
+ }
+}
+EXPORT_SYMBOL(bt_status);
+
+/**
+ * bt_info() - Log Bluetooth information message
+ * @format: Message's format string
+ */
+void bt_info(const char *format, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, format);
+
+ vaf.fmt = format;
+ vaf.va = &args;
+
+ pr_info("%pV", &vaf);
+
+ va_end(args);
+}
+EXPORT_SYMBOL(bt_info);
+
+/**
+ * bt_warn() - Log Bluetooth warning message
+ * @format: Message's format string
+ */
+void bt_warn(const char *format, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, format);
+
+ vaf.fmt = format;
+ vaf.va = &args;
+
+ pr_warn("%pV", &vaf);
+
+ va_end(args);
+}
+EXPORT_SYMBOL(bt_warn);
+
+/**
+ * bt_err() - Log Bluetooth error message
+ * @format: Message's format string
+ */
+void bt_err(const char *format, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, format);
+
+ vaf.fmt = format;
+ vaf.va = &args;
+
+ pr_err("%pV", &vaf);
+
+ va_end(args);
+}
EXPORT_SYMBOL(bt_err);
+
+#ifdef CONFIG_BT_FEATURE_DEBUG
+static bool debug_enable;
+
+void bt_dbg_set(bool enable)
+{
+ debug_enable = enable;
+}
+
+bool bt_dbg_get(void)
+{
+ return debug_enable;
+}
+
+/**
+ * bt_dbg() - Log Bluetooth debugging message
+ * @format: Message's format string
+ */
+void bt_dbg(const char *format, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ if (likely(!debug_enable))
+ return;
+
+ va_start(args, format);
+
+ vaf.fmt = format;
+ vaf.va = &args;
+
+ printk(KERN_DEBUG pr_fmt("%pV"), &vaf);
+
+ va_end(args);
+}
+EXPORT_SYMBOL(bt_dbg);
+#endif
+
+/**
+ * bt_warn_ratelimited() - Log rate-limited Bluetooth warning message
+ * @format: Message's format string
+ *
+ * This functions works like bt_warn, but it uses rate limiting
+ * to prevent the message from being logged too often.
+ */
+void bt_warn_ratelimited(const char *format, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, format);
+
+ vaf.fmt = format;
+ vaf.va = &args;
+
+ pr_warn_ratelimited("%pV", &vaf);
+
+ va_end(args);
+}
+EXPORT_SYMBOL(bt_warn_ratelimited);
+
+/**
+ * bt_err_ratelimited() - Log rate-limited Bluetooth error message
+ * @format: Message's format string
+ *
+ * This functions works like bt_err, but it uses rate limiting
+ * to prevent the message from being logged too often.
+ */
+void bt_err_ratelimited(const char *format, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, format);
+
+ vaf.fmt = format;
+ vaf.va = &args;
+
+ pr_err_ratelimited("%pV", &vaf);
+
+ va_end(args);
+}
+EXPORT_SYMBOL(bt_err_ratelimited);
diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
new file mode 100644
index 000000000000..c11cdef42b6f
--- /dev/null
+++ b/net/bluetooth/mgmt.c
@@ -0,0 +1,10613 @@
+/*
+ BlueZ - Bluetooth protocol stack for Linux
+
+ Copyright (C) 2010 Nokia Corporation
+ Copyright (C) 2011-2012 Intel Corporation
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License version 2 as
+ published by the Free Software Foundation;
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+ IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+ CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+ ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+ COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+ SOFTWARE IS DISCLAIMED.
+*/
+
+/* Bluetooth HCI Management interface */
+
+#include <linux/module.h>
+#include <linux/unaligned.h>
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+#include <net/bluetooth/hci_sock.h>
+#include <net/bluetooth/l2cap.h>
+#include <net/bluetooth/mgmt.h>
+
+#include "smp.h"
+#include "mgmt_util.h"
+#include "mgmt_config.h"
+#include "msft.h"
+#include "eir.h"
+#include "aosp.h"
+
+#define MGMT_VERSION 1
+#define MGMT_REVISION 23
+
+static const u16 mgmt_commands[] = {
+ MGMT_OP_READ_INDEX_LIST,
+ MGMT_OP_READ_INFO,
+ MGMT_OP_SET_POWERED,
+ MGMT_OP_SET_DISCOVERABLE,
+ MGMT_OP_SET_CONNECTABLE,
+ MGMT_OP_SET_FAST_CONNECTABLE,
+ MGMT_OP_SET_BONDABLE,
+ MGMT_OP_SET_LINK_SECURITY,
+ MGMT_OP_SET_SSP,
+ MGMT_OP_SET_HS,
+ MGMT_OP_SET_LE,
+ MGMT_OP_SET_DEV_CLASS,
+ MGMT_OP_SET_LOCAL_NAME,
+ MGMT_OP_ADD_UUID,
+ MGMT_OP_REMOVE_UUID,
+ MGMT_OP_LOAD_LINK_KEYS,
+ MGMT_OP_LOAD_LONG_TERM_KEYS,
+ MGMT_OP_DISCONNECT,
+ MGMT_OP_GET_CONNECTIONS,
+ MGMT_OP_PIN_CODE_REPLY,
+ MGMT_OP_PIN_CODE_NEG_REPLY,
+ MGMT_OP_SET_IO_CAPABILITY,
+ MGMT_OP_PAIR_DEVICE,
+ MGMT_OP_CANCEL_PAIR_DEVICE,
+ MGMT_OP_UNPAIR_DEVICE,
+ MGMT_OP_USER_CONFIRM_REPLY,
+ MGMT_OP_USER_CONFIRM_NEG_REPLY,
+ MGMT_OP_USER_PASSKEY_REPLY,
+ MGMT_OP_USER_PASSKEY_NEG_REPLY,
+ MGMT_OP_READ_LOCAL_OOB_DATA,
+ MGMT_OP_ADD_REMOTE_OOB_DATA,
+ MGMT_OP_REMOVE_REMOTE_OOB_DATA,
+ MGMT_OP_START_DISCOVERY,
+ MGMT_OP_STOP_DISCOVERY,
+ MGMT_OP_CONFIRM_NAME,
+ MGMT_OP_BLOCK_DEVICE,
+ MGMT_OP_UNBLOCK_DEVICE,
+ MGMT_OP_SET_DEVICE_ID,
+ MGMT_OP_SET_ADVERTISING,
+ MGMT_OP_SET_BREDR,
+ MGMT_OP_SET_STATIC_ADDRESS,
+ MGMT_OP_SET_SCAN_PARAMS,
+ MGMT_OP_SET_SECURE_CONN,
+ MGMT_OP_SET_DEBUG_KEYS,
+ MGMT_OP_SET_PRIVACY,
+ MGMT_OP_LOAD_IRKS,
+ MGMT_OP_GET_CONN_INFO,
+ MGMT_OP_GET_CLOCK_INFO,
+ MGMT_OP_ADD_DEVICE,
+ MGMT_OP_REMOVE_DEVICE,
+ MGMT_OP_LOAD_CONN_PARAM,
+ MGMT_OP_READ_UNCONF_INDEX_LIST,
+ MGMT_OP_READ_CONFIG_INFO,
+ MGMT_OP_SET_EXTERNAL_CONFIG,
+ MGMT_OP_SET_PUBLIC_ADDRESS,
+ MGMT_OP_START_SERVICE_DISCOVERY,
+ MGMT_OP_READ_LOCAL_OOB_EXT_DATA,
+ MGMT_OP_READ_EXT_INDEX_LIST,
+ MGMT_OP_READ_ADV_FEATURES,
+ MGMT_OP_ADD_ADVERTISING,
+ MGMT_OP_REMOVE_ADVERTISING,
+ MGMT_OP_GET_ADV_SIZE_INFO,
+ MGMT_OP_START_LIMITED_DISCOVERY,
+ MGMT_OP_READ_EXT_INFO,
+ MGMT_OP_SET_APPEARANCE,
+ MGMT_OP_GET_PHY_CONFIGURATION,
+ MGMT_OP_SET_PHY_CONFIGURATION,
+ MGMT_OP_SET_BLOCKED_KEYS,
+ MGMT_OP_SET_WIDEBAND_SPEECH,
+ MGMT_OP_READ_CONTROLLER_CAP,
+ MGMT_OP_READ_EXP_FEATURES_INFO,
+ MGMT_OP_SET_EXP_FEATURE,
+ MGMT_OP_READ_DEF_SYSTEM_CONFIG,
+ MGMT_OP_SET_DEF_SYSTEM_CONFIG,
+ MGMT_OP_READ_DEF_RUNTIME_CONFIG,
+ MGMT_OP_SET_DEF_RUNTIME_CONFIG,
+ MGMT_OP_GET_DEVICE_FLAGS,
+ MGMT_OP_SET_DEVICE_FLAGS,
+ MGMT_OP_READ_ADV_MONITOR_FEATURES,
+ MGMT_OP_ADD_ADV_PATTERNS_MONITOR,
+ MGMT_OP_REMOVE_ADV_MONITOR,
+ MGMT_OP_ADD_EXT_ADV_PARAMS,
+ MGMT_OP_ADD_EXT_ADV_DATA,
+ MGMT_OP_ADD_ADV_PATTERNS_MONITOR_RSSI,
+ MGMT_OP_SET_MESH_RECEIVER,
+ MGMT_OP_MESH_READ_FEATURES,
+ MGMT_OP_MESH_SEND,
+ MGMT_OP_MESH_SEND_CANCEL,
+ MGMT_OP_HCI_CMD_SYNC,
+};
+
+static const u16 mgmt_events[] = {
+ MGMT_EV_CONTROLLER_ERROR,
+ MGMT_EV_INDEX_ADDED,
+ MGMT_EV_INDEX_REMOVED,
+ MGMT_EV_NEW_SETTINGS,
+ MGMT_EV_CLASS_OF_DEV_CHANGED,
+ MGMT_EV_LOCAL_NAME_CHANGED,
+ MGMT_EV_NEW_LINK_KEY,
+ MGMT_EV_NEW_LONG_TERM_KEY,
+ MGMT_EV_DEVICE_CONNECTED,
+ MGMT_EV_DEVICE_DISCONNECTED,
+ MGMT_EV_CONNECT_FAILED,
+ MGMT_EV_PIN_CODE_REQUEST,
+ MGMT_EV_USER_CONFIRM_REQUEST,
+ MGMT_EV_USER_PASSKEY_REQUEST,
+ MGMT_EV_AUTH_FAILED,
+ MGMT_EV_DEVICE_FOUND,
+ MGMT_EV_DISCOVERING,
+ MGMT_EV_DEVICE_BLOCKED,
+ MGMT_EV_DEVICE_UNBLOCKED,
+ MGMT_EV_DEVICE_UNPAIRED,
+ MGMT_EV_PASSKEY_NOTIFY,
+ MGMT_EV_NEW_IRK,
+ MGMT_EV_NEW_CSRK,
+ MGMT_EV_DEVICE_ADDED,
+ MGMT_EV_DEVICE_REMOVED,
+ MGMT_EV_NEW_CONN_PARAM,
+ MGMT_EV_UNCONF_INDEX_ADDED,
+ MGMT_EV_UNCONF_INDEX_REMOVED,
+ MGMT_EV_NEW_CONFIG_OPTIONS,
+ MGMT_EV_EXT_INDEX_ADDED,
+ MGMT_EV_EXT_INDEX_REMOVED,
+ MGMT_EV_LOCAL_OOB_DATA_UPDATED,
+ MGMT_EV_ADVERTISING_ADDED,
+ MGMT_EV_ADVERTISING_REMOVED,
+ MGMT_EV_EXT_INFO_CHANGED,
+ MGMT_EV_PHY_CONFIGURATION_CHANGED,
+ MGMT_EV_EXP_FEATURE_CHANGED,
+ MGMT_EV_DEVICE_FLAGS_CHANGED,
+ MGMT_EV_ADV_MONITOR_ADDED,
+ MGMT_EV_ADV_MONITOR_REMOVED,
+ MGMT_EV_CONTROLLER_SUSPEND,
+ MGMT_EV_CONTROLLER_RESUME,
+ MGMT_EV_ADV_MONITOR_DEVICE_FOUND,
+ MGMT_EV_ADV_MONITOR_DEVICE_LOST,
+};
+
+static const u16 mgmt_untrusted_commands[] = {
+ MGMT_OP_READ_INDEX_LIST,
+ MGMT_OP_READ_INFO,
+ MGMT_OP_READ_UNCONF_INDEX_LIST,
+ MGMT_OP_READ_CONFIG_INFO,
+ MGMT_OP_READ_EXT_INDEX_LIST,
+ MGMT_OP_READ_EXT_INFO,
+ MGMT_OP_READ_CONTROLLER_CAP,
+ MGMT_OP_READ_EXP_FEATURES_INFO,
+ MGMT_OP_READ_DEF_SYSTEM_CONFIG,
+ MGMT_OP_READ_DEF_RUNTIME_CONFIG,
+};
+
+static const u16 mgmt_untrusted_events[] = {
+ MGMT_EV_INDEX_ADDED,
+ MGMT_EV_INDEX_REMOVED,
+ MGMT_EV_NEW_SETTINGS,
+ MGMT_EV_CLASS_OF_DEV_CHANGED,
+ MGMT_EV_LOCAL_NAME_CHANGED,
+ MGMT_EV_UNCONF_INDEX_ADDED,
+ MGMT_EV_UNCONF_INDEX_REMOVED,
+ MGMT_EV_NEW_CONFIG_OPTIONS,
+ MGMT_EV_EXT_INDEX_ADDED,
+ MGMT_EV_EXT_INDEX_REMOVED,
+ MGMT_EV_EXT_INFO_CHANGED,
+ MGMT_EV_EXP_FEATURE_CHANGED,
+};
+
+#define CACHE_TIMEOUT secs_to_jiffies(2)
+
+#define ZERO_KEY "\x00\x00\x00\x00\x00\x00\x00\x00" \
+ "\x00\x00\x00\x00\x00\x00\x00\x00"
+
+/* HCI to MGMT error code conversion table */
+static const u8 mgmt_status_table[] = {
+ MGMT_STATUS_SUCCESS,
+ MGMT_STATUS_UNKNOWN_COMMAND, /* Unknown Command */
+ MGMT_STATUS_NOT_CONNECTED, /* No Connection */
+ MGMT_STATUS_FAILED, /* Hardware Failure */
+ MGMT_STATUS_CONNECT_FAILED, /* Page Timeout */
+ MGMT_STATUS_AUTH_FAILED, /* Authentication Failed */
+ MGMT_STATUS_AUTH_FAILED, /* PIN or Key Missing */
+ MGMT_STATUS_NO_RESOURCES, /* Memory Full */
+ MGMT_STATUS_TIMEOUT, /* Connection Timeout */
+ MGMT_STATUS_NO_RESOURCES, /* Max Number of Connections */
+ MGMT_STATUS_NO_RESOURCES, /* Max Number of SCO Connections */
+ MGMT_STATUS_ALREADY_CONNECTED, /* ACL Connection Exists */
+ MGMT_STATUS_BUSY, /* Command Disallowed */
+ MGMT_STATUS_NO_RESOURCES, /* Rejected Limited Resources */
+ MGMT_STATUS_REJECTED, /* Rejected Security */
+ MGMT_STATUS_REJECTED, /* Rejected Personal */
+ MGMT_STATUS_TIMEOUT, /* Host Timeout */
+ MGMT_STATUS_NOT_SUPPORTED, /* Unsupported Feature */
+ MGMT_STATUS_INVALID_PARAMS, /* Invalid Parameters */
+ MGMT_STATUS_DISCONNECTED, /* OE User Ended Connection */
+ MGMT_STATUS_NO_RESOURCES, /* OE Low Resources */
+ MGMT_STATUS_DISCONNECTED, /* OE Power Off */
+ MGMT_STATUS_DISCONNECTED, /* Connection Terminated */
+ MGMT_STATUS_BUSY, /* Repeated Attempts */
+ MGMT_STATUS_REJECTED, /* Pairing Not Allowed */
+ MGMT_STATUS_FAILED, /* Unknown LMP PDU */
+ MGMT_STATUS_NOT_SUPPORTED, /* Unsupported Remote Feature */
+ MGMT_STATUS_REJECTED, /* SCO Offset Rejected */
+ MGMT_STATUS_REJECTED, /* SCO Interval Rejected */
+ MGMT_STATUS_REJECTED, /* Air Mode Rejected */
+ MGMT_STATUS_INVALID_PARAMS, /* Invalid LMP Parameters */
+ MGMT_STATUS_FAILED, /* Unspecified Error */
+ MGMT_STATUS_NOT_SUPPORTED, /* Unsupported LMP Parameter Value */
+ MGMT_STATUS_FAILED, /* Role Change Not Allowed */
+ MGMT_STATUS_TIMEOUT, /* LMP Response Timeout */
+ MGMT_STATUS_FAILED, /* LMP Error Transaction Collision */
+ MGMT_STATUS_FAILED, /* LMP PDU Not Allowed */
+ MGMT_STATUS_REJECTED, /* Encryption Mode Not Accepted */
+ MGMT_STATUS_FAILED, /* Unit Link Key Used */
+ MGMT_STATUS_NOT_SUPPORTED, /* QoS Not Supported */
+ MGMT_STATUS_TIMEOUT, /* Instant Passed */
+ MGMT_STATUS_NOT_SUPPORTED, /* Pairing Not Supported */
+ MGMT_STATUS_FAILED, /* Transaction Collision */
+ MGMT_STATUS_FAILED, /* Reserved for future use */
+ MGMT_STATUS_INVALID_PARAMS, /* Unacceptable Parameter */
+ MGMT_STATUS_REJECTED, /* QoS Rejected */
+ MGMT_STATUS_NOT_SUPPORTED, /* Classification Not Supported */
+ MGMT_STATUS_REJECTED, /* Insufficient Security */
+ MGMT_STATUS_INVALID_PARAMS, /* Parameter Out Of Range */
+ MGMT_STATUS_FAILED, /* Reserved for future use */
+ MGMT_STATUS_BUSY, /* Role Switch Pending */
+ MGMT_STATUS_FAILED, /* Reserved for future use */
+ MGMT_STATUS_FAILED, /* Slot Violation */
+ MGMT_STATUS_FAILED, /* Role Switch Failed */
+ MGMT_STATUS_INVALID_PARAMS, /* EIR Too Large */
+ MGMT_STATUS_NOT_SUPPORTED, /* Simple Pairing Not Supported */
+ MGMT_STATUS_BUSY, /* Host Busy Pairing */
+ MGMT_STATUS_REJECTED, /* Rejected, No Suitable Channel */
+ MGMT_STATUS_BUSY, /* Controller Busy */
+ MGMT_STATUS_INVALID_PARAMS, /* Unsuitable Connection Interval */
+ MGMT_STATUS_TIMEOUT, /* Directed Advertising Timeout */
+ MGMT_STATUS_AUTH_FAILED, /* Terminated Due to MIC Failure */
+ MGMT_STATUS_CONNECT_FAILED, /* Connection Establishment Failed */
+ MGMT_STATUS_CONNECT_FAILED, /* MAC Connection Failed */
+};
+
+static u8 mgmt_errno_status(int err)
+{
+ switch (err) {
+ case 0:
+ return MGMT_STATUS_SUCCESS;
+ case -EPERM:
+ return MGMT_STATUS_REJECTED;
+ case -EINVAL:
+ return MGMT_STATUS_INVALID_PARAMS;
+ case -EOPNOTSUPP:
+ return MGMT_STATUS_NOT_SUPPORTED;
+ case -EBUSY:
+ return MGMT_STATUS_BUSY;
+ case -ETIMEDOUT:
+ return MGMT_STATUS_AUTH_FAILED;
+ case -ENOMEM:
+ return MGMT_STATUS_NO_RESOURCES;
+ case -EISCONN:
+ return MGMT_STATUS_ALREADY_CONNECTED;
+ case -ENOTCONN:
+ return MGMT_STATUS_DISCONNECTED;
+ }
+
+ return MGMT_STATUS_FAILED;
+}
+
+static u8 mgmt_status(int err)
+{
+ if (err < 0)
+ return mgmt_errno_status(err);
+
+ if (err < ARRAY_SIZE(mgmt_status_table))
+ return mgmt_status_table[err];
+
+ return MGMT_STATUS_FAILED;
+}
+
+static int mgmt_index_event(u16 event, struct hci_dev *hdev, void *data,
+ u16 len, int flag)
+{
+ return mgmt_send_event(event, hdev, HCI_CHANNEL_CONTROL, data, len,
+ flag, NULL);
+}
+
+static int mgmt_limited_event(u16 event, struct hci_dev *hdev, void *data,
+ u16 len, int flag, struct sock *skip_sk)
+{
+ return mgmt_send_event(event, hdev, HCI_CHANNEL_CONTROL, data, len,
+ flag, skip_sk);
+}
+
+static int mgmt_event(u16 event, struct hci_dev *hdev, void *data, u16 len,
+ struct sock *skip_sk)
+{
+ return mgmt_send_event(event, hdev, HCI_CHANNEL_CONTROL, data, len,
+ HCI_SOCK_TRUSTED, skip_sk);
+}
+
+static int mgmt_event_skb(struct sk_buff *skb, struct sock *skip_sk)
+{
+ return mgmt_send_event_skb(HCI_CHANNEL_CONTROL, skb, HCI_SOCK_TRUSTED,
+ skip_sk);
+}
+
+static u8 le_addr_type(u8 mgmt_addr_type)
+{
+ if (mgmt_addr_type == BDADDR_LE_PUBLIC)
+ return ADDR_LE_DEV_PUBLIC;
+ else
+ return ADDR_LE_DEV_RANDOM;
+}
+
+void mgmt_fill_version_info(void *ver)
+{
+ struct mgmt_rp_read_version *rp = ver;
+
+ rp->version = MGMT_VERSION;
+ rp->revision = cpu_to_le16(MGMT_REVISION);
+}
+
+static int read_version(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 data_len)
+{
+ struct mgmt_rp_read_version rp;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ mgmt_fill_version_info(&rp);
+
+ return mgmt_cmd_complete(sk, MGMT_INDEX_NONE, MGMT_OP_READ_VERSION, 0,
+ &rp, sizeof(rp));
+}
+
+static int read_commands(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 data_len)
+{
+ struct mgmt_rp_read_commands *rp;
+ u16 num_commands, num_events;
+ size_t rp_size;
+ int i, err;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ if (hci_sock_test_flag(sk, HCI_SOCK_TRUSTED)) {
+ num_commands = ARRAY_SIZE(mgmt_commands);
+ num_events = ARRAY_SIZE(mgmt_events);
+ } else {
+ num_commands = ARRAY_SIZE(mgmt_untrusted_commands);
+ num_events = ARRAY_SIZE(mgmt_untrusted_events);
+ }
+
+ rp_size = sizeof(*rp) + ((num_commands + num_events) * sizeof(u16));
+
+ rp = kmalloc(rp_size, GFP_KERNEL);
+ if (!rp)
+ return -ENOMEM;
+
+ rp->num_commands = cpu_to_le16(num_commands);
+ rp->num_events = cpu_to_le16(num_events);
+
+ if (hci_sock_test_flag(sk, HCI_SOCK_TRUSTED)) {
+ __le16 *opcode = rp->opcodes;
+
+ for (i = 0; i < num_commands; i++, opcode++)
+ put_unaligned_le16(mgmt_commands[i], opcode);
+
+ for (i = 0; i < num_events; i++, opcode++)
+ put_unaligned_le16(mgmt_events[i], opcode);
+ } else {
+ __le16 *opcode = rp->opcodes;
+
+ for (i = 0; i < num_commands; i++, opcode++)
+ put_unaligned_le16(mgmt_untrusted_commands[i], opcode);
+
+ for (i = 0; i < num_events; i++, opcode++)
+ put_unaligned_le16(mgmt_untrusted_events[i], opcode);
+ }
+
+ err = mgmt_cmd_complete(sk, MGMT_INDEX_NONE, MGMT_OP_READ_COMMANDS, 0,
+ rp, rp_size);
+ kfree(rp);
+
+ return err;
+}
+
+static int read_index_list(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 data_len)
+{
+ struct mgmt_rp_read_index_list *rp;
+ struct hci_dev *d;
+ size_t rp_len;
+ u16 count;
+ int err;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ read_lock(&hci_dev_list_lock);
+
+ count = 0;
+ list_for_each_entry(d, &hci_dev_list, list) {
+ if (!hci_dev_test_flag(d, HCI_UNCONFIGURED))
+ count++;
+ }
+
+ rp_len = sizeof(*rp) + (2 * count);
+ rp = kmalloc(rp_len, GFP_ATOMIC);
+ if (!rp) {
+ read_unlock(&hci_dev_list_lock);
+ return -ENOMEM;
+ }
+
+ count = 0;
+ list_for_each_entry(d, &hci_dev_list, list) {
+ if (hci_dev_test_flag(d, HCI_SETUP) ||
+ hci_dev_test_flag(d, HCI_CONFIG) ||
+ hci_dev_test_flag(d, HCI_USER_CHANNEL))
+ continue;
+
+ /* Devices marked as raw-only are neither configured
+ * nor unconfigured controllers.
+ */
+ if (hci_test_quirk(d, HCI_QUIRK_RAW_DEVICE))
+ continue;
+
+ if (!hci_dev_test_flag(d, HCI_UNCONFIGURED)) {
+ rp->index[count++] = cpu_to_le16(d->id);
+ bt_dev_dbg(hdev, "Added hci%u", d->id);
+ }
+ }
+
+ rp->num_controllers = cpu_to_le16(count);
+ rp_len = sizeof(*rp) + (2 * count);
+
+ read_unlock(&hci_dev_list_lock);
+
+ err = mgmt_cmd_complete(sk, MGMT_INDEX_NONE, MGMT_OP_READ_INDEX_LIST,
+ 0, rp, rp_len);
+
+ kfree(rp);
+
+ return err;
+}
+
+static int read_unconf_index_list(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 data_len)
+{
+ struct mgmt_rp_read_unconf_index_list *rp;
+ struct hci_dev *d;
+ size_t rp_len;
+ u16 count;
+ int err;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ read_lock(&hci_dev_list_lock);
+
+ count = 0;
+ list_for_each_entry(d, &hci_dev_list, list) {
+ if (hci_dev_test_flag(d, HCI_UNCONFIGURED))
+ count++;
+ }
+
+ rp_len = sizeof(*rp) + (2 * count);
+ rp = kmalloc(rp_len, GFP_ATOMIC);
+ if (!rp) {
+ read_unlock(&hci_dev_list_lock);
+ return -ENOMEM;
+ }
+
+ count = 0;
+ list_for_each_entry(d, &hci_dev_list, list) {
+ if (hci_dev_test_flag(d, HCI_SETUP) ||
+ hci_dev_test_flag(d, HCI_CONFIG) ||
+ hci_dev_test_flag(d, HCI_USER_CHANNEL))
+ continue;
+
+ /* Devices marked as raw-only are neither configured
+ * nor unconfigured controllers.
+ */
+ if (hci_test_quirk(d, HCI_QUIRK_RAW_DEVICE))
+ continue;
+
+ if (hci_dev_test_flag(d, HCI_UNCONFIGURED)) {
+ rp->index[count++] = cpu_to_le16(d->id);
+ bt_dev_dbg(hdev, "Added hci%u", d->id);
+ }
+ }
+
+ rp->num_controllers = cpu_to_le16(count);
+ rp_len = sizeof(*rp) + (2 * count);
+
+ read_unlock(&hci_dev_list_lock);
+
+ err = mgmt_cmd_complete(sk, MGMT_INDEX_NONE,
+ MGMT_OP_READ_UNCONF_INDEX_LIST, 0, rp, rp_len);
+
+ kfree(rp);
+
+ return err;
+}
+
+static int read_ext_index_list(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 data_len)
+{
+ struct mgmt_rp_read_ext_index_list *rp;
+ struct hci_dev *d;
+ u16 count;
+ int err;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ read_lock(&hci_dev_list_lock);
+
+ count = 0;
+ list_for_each_entry(d, &hci_dev_list, list)
+ count++;
+
+ rp = kmalloc(struct_size(rp, entry, count), GFP_ATOMIC);
+ if (!rp) {
+ read_unlock(&hci_dev_list_lock);
+ return -ENOMEM;
+ }
+
+ count = 0;
+ list_for_each_entry(d, &hci_dev_list, list) {
+ if (hci_dev_test_flag(d, HCI_SETUP) ||
+ hci_dev_test_flag(d, HCI_CONFIG) ||
+ hci_dev_test_flag(d, HCI_USER_CHANNEL))
+ continue;
+
+ /* Devices marked as raw-only are neither configured
+ * nor unconfigured controllers.
+ */
+ if (hci_test_quirk(d, HCI_QUIRK_RAW_DEVICE))
+ continue;
+
+ if (hci_dev_test_flag(d, HCI_UNCONFIGURED))
+ rp->entry[count].type = 0x01;
+ else
+ rp->entry[count].type = 0x00;
+
+ rp->entry[count].bus = d->bus;
+ rp->entry[count++].index = cpu_to_le16(d->id);
+ bt_dev_dbg(hdev, "Added hci%u", d->id);
+ }
+
+ rp->num_controllers = cpu_to_le16(count);
+
+ read_unlock(&hci_dev_list_lock);
+
+ /* If this command is called at least once, then all the
+ * default index and unconfigured index events are disabled
+ * and from now on only extended index events are used.
+ */
+ hci_sock_set_flag(sk, HCI_MGMT_EXT_INDEX_EVENTS);
+ hci_sock_clear_flag(sk, HCI_MGMT_INDEX_EVENTS);
+ hci_sock_clear_flag(sk, HCI_MGMT_UNCONF_INDEX_EVENTS);
+
+ err = mgmt_cmd_complete(sk, MGMT_INDEX_NONE,
+ MGMT_OP_READ_EXT_INDEX_LIST, 0, rp,
+ struct_size(rp, entry, count));
+
+ kfree(rp);
+
+ return err;
+}
+
+static bool is_configured(struct hci_dev *hdev)
+{
+ if (hci_test_quirk(hdev, HCI_QUIRK_EXTERNAL_CONFIG) &&
+ !hci_dev_test_flag(hdev, HCI_EXT_CONFIGURED))
+ return false;
+
+ if ((hci_test_quirk(hdev, HCI_QUIRK_INVALID_BDADDR) ||
+ hci_test_quirk(hdev, HCI_QUIRK_USE_BDADDR_PROPERTY)) &&
+ !bacmp(&hdev->public_addr, BDADDR_ANY))
+ return false;
+
+ return true;
+}
+
+static __le32 get_missing_options(struct hci_dev *hdev)
+{
+ u32 options = 0;
+
+ if (hci_test_quirk(hdev, HCI_QUIRK_EXTERNAL_CONFIG) &&
+ !hci_dev_test_flag(hdev, HCI_EXT_CONFIGURED))
+ options |= MGMT_OPTION_EXTERNAL_CONFIG;
+
+ if ((hci_test_quirk(hdev, HCI_QUIRK_INVALID_BDADDR) ||
+ hci_test_quirk(hdev, HCI_QUIRK_USE_BDADDR_PROPERTY)) &&
+ !bacmp(&hdev->public_addr, BDADDR_ANY))
+ options |= MGMT_OPTION_PUBLIC_ADDRESS;
+
+ return cpu_to_le32(options);
+}
+
+static int new_options(struct hci_dev *hdev, struct sock *skip)
+{
+ __le32 options = get_missing_options(hdev);
+
+ return mgmt_limited_event(MGMT_EV_NEW_CONFIG_OPTIONS, hdev, &options,
+ sizeof(options), HCI_MGMT_OPTION_EVENTS, skip);
+}
+
+static int send_options_rsp(struct sock *sk, u16 opcode, struct hci_dev *hdev)
+{
+ __le32 options = get_missing_options(hdev);
+
+ return mgmt_cmd_complete(sk, hdev->id, opcode, 0, &options,
+ sizeof(options));
+}
+
+static int read_config_info(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 data_len)
+{
+ struct mgmt_rp_read_config_info rp;
+ u32 options = 0;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ hci_dev_lock(hdev);
+
+ memset(&rp, 0, sizeof(rp));
+ rp.manufacturer = cpu_to_le16(hdev->manufacturer);
+
+ if (hci_test_quirk(hdev, HCI_QUIRK_EXTERNAL_CONFIG))
+ options |= MGMT_OPTION_EXTERNAL_CONFIG;
+
+ if (hdev->set_bdaddr)
+ options |= MGMT_OPTION_PUBLIC_ADDRESS;
+
+ rp.supported_options = cpu_to_le32(options);
+ rp.missing_options = get_missing_options(hdev);
+
+ hci_dev_unlock(hdev);
+
+ return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_READ_CONFIG_INFO, 0,
+ &rp, sizeof(rp));
+}
+
+static u32 get_supported_phys(struct hci_dev *hdev)
+{
+ u32 supported_phys = 0;
+
+ if (lmp_bredr_capable(hdev)) {
+ supported_phys |= MGMT_PHY_BR_1M_1SLOT;
+
+ if (hdev->features[0][0] & LMP_3SLOT)
+ supported_phys |= MGMT_PHY_BR_1M_3SLOT;
+
+ if (hdev->features[0][0] & LMP_5SLOT)
+ supported_phys |= MGMT_PHY_BR_1M_5SLOT;
+
+ if (lmp_edr_2m_capable(hdev)) {
+ supported_phys |= MGMT_PHY_EDR_2M_1SLOT;
+
+ if (lmp_edr_3slot_capable(hdev))
+ supported_phys |= MGMT_PHY_EDR_2M_3SLOT;
+
+ if (lmp_edr_5slot_capable(hdev))
+ supported_phys |= MGMT_PHY_EDR_2M_5SLOT;
+
+ if (lmp_edr_3m_capable(hdev)) {
+ supported_phys |= MGMT_PHY_EDR_3M_1SLOT;
+
+ if (lmp_edr_3slot_capable(hdev))
+ supported_phys |= MGMT_PHY_EDR_3M_3SLOT;
+
+ if (lmp_edr_5slot_capable(hdev))
+ supported_phys |= MGMT_PHY_EDR_3M_5SLOT;
+ }
+ }
+ }
+
+ if (lmp_le_capable(hdev)) {
+ supported_phys |= MGMT_PHY_LE_1M_TX;
+ supported_phys |= MGMT_PHY_LE_1M_RX;
+
+ if (hdev->le_features[1] & HCI_LE_PHY_2M) {
+ supported_phys |= MGMT_PHY_LE_2M_TX;
+ supported_phys |= MGMT_PHY_LE_2M_RX;
+ }
+
+ if (hdev->le_features[1] & HCI_LE_PHY_CODED) {
+ supported_phys |= MGMT_PHY_LE_CODED_TX;
+ supported_phys |= MGMT_PHY_LE_CODED_RX;
+ }
+ }
+
+ return supported_phys;
+}
+
+static u32 get_selected_phys(struct hci_dev *hdev)
+{
+ u32 selected_phys = 0;
+
+ if (lmp_bredr_capable(hdev)) {
+ selected_phys |= MGMT_PHY_BR_1M_1SLOT;
+
+ if (hdev->pkt_type & (HCI_DM3 | HCI_DH3))
+ selected_phys |= MGMT_PHY_BR_1M_3SLOT;
+
+ if (hdev->pkt_type & (HCI_DM5 | HCI_DH5))
+ selected_phys |= MGMT_PHY_BR_1M_5SLOT;
+
+ if (lmp_edr_2m_capable(hdev)) {
+ if (!(hdev->pkt_type & HCI_2DH1))
+ selected_phys |= MGMT_PHY_EDR_2M_1SLOT;
+
+ if (lmp_edr_3slot_capable(hdev) &&
+ !(hdev->pkt_type & HCI_2DH3))
+ selected_phys |= MGMT_PHY_EDR_2M_3SLOT;
+
+ if (lmp_edr_5slot_capable(hdev) &&
+ !(hdev->pkt_type & HCI_2DH5))
+ selected_phys |= MGMT_PHY_EDR_2M_5SLOT;
+
+ if (lmp_edr_3m_capable(hdev)) {
+ if (!(hdev->pkt_type & HCI_3DH1))
+ selected_phys |= MGMT_PHY_EDR_3M_1SLOT;
+
+ if (lmp_edr_3slot_capable(hdev) &&
+ !(hdev->pkt_type & HCI_3DH3))
+ selected_phys |= MGMT_PHY_EDR_3M_3SLOT;
+
+ if (lmp_edr_5slot_capable(hdev) &&
+ !(hdev->pkt_type & HCI_3DH5))
+ selected_phys |= MGMT_PHY_EDR_3M_5SLOT;
+ }
+ }
+ }
+
+ if (lmp_le_capable(hdev)) {
+ if (hdev->le_tx_def_phys & HCI_LE_SET_PHY_1M)
+ selected_phys |= MGMT_PHY_LE_1M_TX;
+
+ if (hdev->le_rx_def_phys & HCI_LE_SET_PHY_1M)
+ selected_phys |= MGMT_PHY_LE_1M_RX;
+
+ if (hdev->le_tx_def_phys & HCI_LE_SET_PHY_2M)
+ selected_phys |= MGMT_PHY_LE_2M_TX;
+
+ if (hdev->le_rx_def_phys & HCI_LE_SET_PHY_2M)
+ selected_phys |= MGMT_PHY_LE_2M_RX;
+
+ if (hdev->le_tx_def_phys & HCI_LE_SET_PHY_CODED)
+ selected_phys |= MGMT_PHY_LE_CODED_TX;
+
+ if (hdev->le_rx_def_phys & HCI_LE_SET_PHY_CODED)
+ selected_phys |= MGMT_PHY_LE_CODED_RX;
+ }
+
+ return selected_phys;
+}
+
+static u32 get_configurable_phys(struct hci_dev *hdev)
+{
+ return (get_supported_phys(hdev) & ~MGMT_PHY_BR_1M_1SLOT &
+ ~MGMT_PHY_LE_1M_TX & ~MGMT_PHY_LE_1M_RX);
+}
+
+static u32 get_supported_settings(struct hci_dev *hdev)
+{
+ u32 settings = 0;
+
+ settings |= MGMT_SETTING_POWERED;
+ settings |= MGMT_SETTING_BONDABLE;
+ settings |= MGMT_SETTING_DEBUG_KEYS;
+ settings |= MGMT_SETTING_CONNECTABLE;
+ settings |= MGMT_SETTING_DISCOVERABLE;
+
+ if (lmp_bredr_capable(hdev)) {
+ if (hdev->hci_ver >= BLUETOOTH_VER_1_2)
+ settings |= MGMT_SETTING_FAST_CONNECTABLE;
+ settings |= MGMT_SETTING_BREDR;
+ settings |= MGMT_SETTING_LINK_SECURITY;
+
+ if (lmp_ssp_capable(hdev)) {
+ settings |= MGMT_SETTING_SSP;
+ }
+
+ if (lmp_sc_capable(hdev))
+ settings |= MGMT_SETTING_SECURE_CONN;
+
+ if (hci_test_quirk(hdev, HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED))
+ settings |= MGMT_SETTING_WIDEBAND_SPEECH;
+ }
+
+ if (lmp_le_capable(hdev)) {
+ settings |= MGMT_SETTING_LE;
+ settings |= MGMT_SETTING_SECURE_CONN;
+ settings |= MGMT_SETTING_PRIVACY;
+ settings |= MGMT_SETTING_STATIC_ADDRESS;
+ settings |= MGMT_SETTING_ADVERTISING;
+ }
+
+ if (hci_test_quirk(hdev, HCI_QUIRK_EXTERNAL_CONFIG) || hdev->set_bdaddr)
+ settings |= MGMT_SETTING_CONFIGURATION;
+
+ if (cis_central_capable(hdev))
+ settings |= MGMT_SETTING_CIS_CENTRAL;
+
+ if (cis_peripheral_capable(hdev))
+ settings |= MGMT_SETTING_CIS_PERIPHERAL;
+
+ if (ll_privacy_capable(hdev))
+ settings |= MGMT_SETTING_LL_PRIVACY;
+
+ if (past_sender_capable(hdev))
+ settings |= MGMT_SETTING_PAST_SENDER;
+
+ if (past_receiver_capable(hdev))
+ settings |= MGMT_SETTING_PAST_RECEIVER;
+
+ settings |= MGMT_SETTING_PHY_CONFIGURATION;
+
+ return settings;
+}
+
+static u32 get_current_settings(struct hci_dev *hdev)
+{
+ u32 settings = 0;
+
+ if (hdev_is_powered(hdev))
+ settings |= MGMT_SETTING_POWERED;
+
+ if (hci_dev_test_flag(hdev, HCI_CONNECTABLE))
+ settings |= MGMT_SETTING_CONNECTABLE;
+
+ if (hci_dev_test_flag(hdev, HCI_FAST_CONNECTABLE))
+ settings |= MGMT_SETTING_FAST_CONNECTABLE;
+
+ if (hci_dev_test_flag(hdev, HCI_DISCOVERABLE))
+ settings |= MGMT_SETTING_DISCOVERABLE;
+
+ if (hci_dev_test_flag(hdev, HCI_BONDABLE))
+ settings |= MGMT_SETTING_BONDABLE;
+
+ if (hci_dev_test_flag(hdev, HCI_BREDR_ENABLED))
+ settings |= MGMT_SETTING_BREDR;
+
+ if (hci_dev_test_flag(hdev, HCI_LE_ENABLED))
+ settings |= MGMT_SETTING_LE;
+
+ if (hci_dev_test_flag(hdev, HCI_LINK_SECURITY))
+ settings |= MGMT_SETTING_LINK_SECURITY;
+
+ if (hci_dev_test_flag(hdev, HCI_SSP_ENABLED))
+ settings |= MGMT_SETTING_SSP;
+
+ if (hci_dev_test_flag(hdev, HCI_ADVERTISING))
+ settings |= MGMT_SETTING_ADVERTISING;
+
+ if (hci_dev_test_flag(hdev, HCI_SC_ENABLED))
+ settings |= MGMT_SETTING_SECURE_CONN;
+
+ if (hci_dev_test_flag(hdev, HCI_KEEP_DEBUG_KEYS))
+ settings |= MGMT_SETTING_DEBUG_KEYS;
+
+ if (hci_dev_test_flag(hdev, HCI_PRIVACY))
+ settings |= MGMT_SETTING_PRIVACY;
+
+ /* The current setting for static address has two purposes. The
+ * first is to indicate if the static address will be used and
+ * the second is to indicate if it is actually set.
+ *
+ * This means if the static address is not configured, this flag
+ * will never be set. If the address is configured, then if the
+ * address is actually used decides if the flag is set or not.
+ *
+ * For single mode LE only controllers and dual-mode controllers
+ * with BR/EDR disabled, the existence of the static address will
+ * be evaluated.
+ */
+ if (hci_dev_test_flag(hdev, HCI_FORCE_STATIC_ADDR) ||
+ !hci_dev_test_flag(hdev, HCI_BREDR_ENABLED) ||
+ !bacmp(&hdev->bdaddr, BDADDR_ANY)) {
+ if (bacmp(&hdev->static_addr, BDADDR_ANY))
+ settings |= MGMT_SETTING_STATIC_ADDRESS;
+ }
+
+ if (hci_dev_test_flag(hdev, HCI_WIDEBAND_SPEECH_ENABLED))
+ settings |= MGMT_SETTING_WIDEBAND_SPEECH;
+
+ if (cis_central_enabled(hdev))
+ settings |= MGMT_SETTING_CIS_CENTRAL;
+
+ if (cis_peripheral_enabled(hdev))
+ settings |= MGMT_SETTING_CIS_PERIPHERAL;
+
+ if (bis_enabled(hdev))
+ settings |= MGMT_SETTING_ISO_BROADCASTER;
+
+ if (sync_recv_enabled(hdev))
+ settings |= MGMT_SETTING_ISO_SYNC_RECEIVER;
+
+ if (ll_privacy_enabled(hdev))
+ settings |= MGMT_SETTING_LL_PRIVACY;
+
+ if (past_sender_enabled(hdev))
+ settings |= MGMT_SETTING_PAST_SENDER;
+
+ if (past_receiver_enabled(hdev))
+ settings |= MGMT_SETTING_PAST_RECEIVER;
+
+ return settings;
+}
+
+static struct mgmt_pending_cmd *pending_find(u16 opcode, struct hci_dev *hdev)
+{
+ return mgmt_pending_find(HCI_CHANNEL_CONTROL, opcode, hdev);
+}
+
+u8 mgmt_get_adv_discov_flags(struct hci_dev *hdev)
+{
+ struct mgmt_pending_cmd *cmd;
+
+ /* If there's a pending mgmt command the flags will not yet have
+ * their final values, so check for this first.
+ */
+ cmd = pending_find(MGMT_OP_SET_DISCOVERABLE, hdev);
+ if (cmd) {
+ struct mgmt_mode *cp = cmd->param;
+ if (cp->val == 0x01)
+ return LE_AD_GENERAL;
+ else if (cp->val == 0x02)
+ return LE_AD_LIMITED;
+ } else {
+ if (hci_dev_test_flag(hdev, HCI_LIMITED_DISCOVERABLE))
+ return LE_AD_LIMITED;
+ else if (hci_dev_test_flag(hdev, HCI_DISCOVERABLE))
+ return LE_AD_GENERAL;
+ }
+
+ return 0;
+}
+
+bool mgmt_get_connectable(struct hci_dev *hdev)
+{
+ struct mgmt_pending_cmd *cmd;
+
+ /* If there's a pending mgmt command the flag will not yet have
+ * it's final value, so check for this first.
+ */
+ cmd = pending_find(MGMT_OP_SET_CONNECTABLE, hdev);
+ if (cmd) {
+ struct mgmt_mode *cp = cmd->param;
+
+ return cp->val;
+ }
+
+ return hci_dev_test_flag(hdev, HCI_CONNECTABLE);
+}
+
+static int service_cache_sync(struct hci_dev *hdev, void *data)
+{
+ hci_update_eir_sync(hdev);
+ hci_update_class_sync(hdev);
+
+ return 0;
+}
+
+static void service_cache_off(struct work_struct *work)
+{
+ struct hci_dev *hdev = container_of(work, struct hci_dev,
+ service_cache.work);
+
+ if (!hci_dev_test_and_clear_flag(hdev, HCI_SERVICE_CACHE))
+ return;
+
+ hci_cmd_sync_queue(hdev, service_cache_sync, NULL, NULL);
+}
+
+static int rpa_expired_sync(struct hci_dev *hdev, void *data)
+{
+ /* The generation of a new RPA and programming it into the
+ * controller happens in the hci_req_enable_advertising()
+ * function.
+ */
+ if (ext_adv_capable(hdev))
+ return hci_start_ext_adv_sync(hdev, hdev->cur_adv_instance);
+ else
+ return hci_enable_advertising_sync(hdev);
+}
+
+static void rpa_expired(struct work_struct *work)
+{
+ struct hci_dev *hdev = container_of(work, struct hci_dev,
+ rpa_expired.work);
+
+ bt_dev_dbg(hdev, "");
+
+ hci_dev_set_flag(hdev, HCI_RPA_EXPIRED);
+
+ if (!hci_dev_test_flag(hdev, HCI_ADVERTISING))
+ return;
+
+ hci_cmd_sync_queue(hdev, rpa_expired_sync, NULL, NULL);
+}
+
+static int set_discoverable_sync(struct hci_dev *hdev, void *data);
+
+static void discov_off(struct work_struct *work)
+{
+ struct hci_dev *hdev = container_of(work, struct hci_dev,
+ discov_off.work);
+
+ bt_dev_dbg(hdev, "");
+
+ hci_dev_lock(hdev);
+
+ /* When discoverable timeout triggers, then just make sure
+ * the limited discoverable flag is cleared. Even in the case
+ * of a timeout triggered from general discoverable, it is
+ * safe to unconditionally clear the flag.
+ */
+ hci_dev_clear_flag(hdev, HCI_LIMITED_DISCOVERABLE);
+ hci_dev_clear_flag(hdev, HCI_DISCOVERABLE);
+ hdev->discov_timeout = 0;
+
+ hci_cmd_sync_queue(hdev, set_discoverable_sync, NULL, NULL);
+
+ mgmt_new_settings(hdev);
+
+ hci_dev_unlock(hdev);
+}
+
+static int send_settings_rsp(struct sock *sk, u16 opcode, struct hci_dev *hdev);
+
+static void mesh_send_complete(struct hci_dev *hdev,
+ struct mgmt_mesh_tx *mesh_tx, bool silent)
+{
+ u8 handle = mesh_tx->handle;
+
+ if (!silent)
+ mgmt_event(MGMT_EV_MESH_PACKET_CMPLT, hdev, &handle,
+ sizeof(handle), NULL);
+
+ mgmt_mesh_remove(mesh_tx);
+}
+
+static int mesh_send_done_sync(struct hci_dev *hdev, void *data)
+{
+ struct mgmt_mesh_tx *mesh_tx;
+
+ hci_dev_clear_flag(hdev, HCI_MESH_SENDING);
+ if (list_empty(&hdev->adv_instances))
+ hci_disable_advertising_sync(hdev);
+ mesh_tx = mgmt_mesh_next(hdev, NULL);
+
+ if (mesh_tx)
+ mesh_send_complete(hdev, mesh_tx, false);
+
+ return 0;
+}
+
+static int mesh_send_sync(struct hci_dev *hdev, void *data);
+static void mesh_send_start_complete(struct hci_dev *hdev, void *data, int err);
+static void mesh_next(struct hci_dev *hdev, void *data, int err)
+{
+ struct mgmt_mesh_tx *mesh_tx = mgmt_mesh_next(hdev, NULL);
+
+ if (!mesh_tx)
+ return;
+
+ err = hci_cmd_sync_queue(hdev, mesh_send_sync, mesh_tx,
+ mesh_send_start_complete);
+
+ if (err < 0)
+ mesh_send_complete(hdev, mesh_tx, false);
+ else
+ hci_dev_set_flag(hdev, HCI_MESH_SENDING);
+}
+
+static void mesh_send_done(struct work_struct *work)
+{
+ struct hci_dev *hdev = container_of(work, struct hci_dev,
+ mesh_send_done.work);
+
+ if (!hci_dev_test_flag(hdev, HCI_MESH_SENDING))
+ return;
+
+ hci_cmd_sync_queue(hdev, mesh_send_done_sync, NULL, mesh_next);
+}
+
+static void mgmt_init_hdev(struct sock *sk, struct hci_dev *hdev)
+{
+ if (hci_dev_test_flag(hdev, HCI_MGMT))
+ return;
+
+ BT_INFO("MGMT ver %d.%d", MGMT_VERSION, MGMT_REVISION);
+
+ INIT_DELAYED_WORK(&hdev->discov_off, discov_off);
+ INIT_DELAYED_WORK(&hdev->service_cache, service_cache_off);
+ INIT_DELAYED_WORK(&hdev->rpa_expired, rpa_expired);
+ INIT_DELAYED_WORK(&hdev->mesh_send_done, mesh_send_done);
+
+ /* Non-mgmt controlled devices get this bit set
+ * implicitly so that pairing works for them, however
+ * for mgmt we require user-space to explicitly enable
+ * it
+ */
+ hci_dev_clear_flag(hdev, HCI_BONDABLE);
+
+ hci_dev_set_flag(hdev, HCI_MGMT);
+}
+
+static int read_controller_info(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 data_len)
+{
+ struct mgmt_rp_read_info rp;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ hci_dev_lock(hdev);
+
+ memset(&rp, 0, sizeof(rp));
+
+ bacpy(&rp.bdaddr, &hdev->bdaddr);
+
+ rp.version = hdev->hci_ver;
+ rp.manufacturer = cpu_to_le16(hdev->manufacturer);
+
+ rp.supported_settings = cpu_to_le32(get_supported_settings(hdev));
+ rp.current_settings = cpu_to_le32(get_current_settings(hdev));
+
+ memcpy(rp.dev_class, hdev->dev_class, 3);
+
+ memcpy(rp.name, hdev->dev_name, sizeof(hdev->dev_name));
+ memcpy(rp.short_name, hdev->short_name, sizeof(hdev->short_name));
+
+ hci_dev_unlock(hdev);
+
+ return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_READ_INFO, 0, &rp,
+ sizeof(rp));
+}
+
+static u16 append_eir_data_to_buf(struct hci_dev *hdev, u8 *eir)
+{
+ u16 eir_len = 0;
+ size_t name_len;
+
+ if (hci_dev_test_flag(hdev, HCI_BREDR_ENABLED))
+ eir_len = eir_append_data(eir, eir_len, EIR_CLASS_OF_DEV,
+ hdev->dev_class, 3);
+
+ if (hci_dev_test_flag(hdev, HCI_LE_ENABLED))
+ eir_len = eir_append_le16(eir, eir_len, EIR_APPEARANCE,
+ hdev->appearance);
+
+ name_len = strnlen(hdev->dev_name, sizeof(hdev->dev_name));
+ eir_len = eir_append_data(eir, eir_len, EIR_NAME_COMPLETE,
+ hdev->dev_name, name_len);
+
+ name_len = strnlen(hdev->short_name, sizeof(hdev->short_name));
+ eir_len = eir_append_data(eir, eir_len, EIR_NAME_SHORT,
+ hdev->short_name, name_len);
+
+ return eir_len;
+}
+
+static int read_ext_controller_info(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 data_len)
+{
+ char buf[512];
+ struct mgmt_rp_read_ext_info *rp = (void *)buf;
+ u16 eir_len;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ memset(&buf, 0, sizeof(buf));
+
+ hci_dev_lock(hdev);
+
+ bacpy(&rp->bdaddr, &hdev->bdaddr);
+
+ rp->version = hdev->hci_ver;
+ rp->manufacturer = cpu_to_le16(hdev->manufacturer);
+
+ rp->supported_settings = cpu_to_le32(get_supported_settings(hdev));
+ rp->current_settings = cpu_to_le32(get_current_settings(hdev));
+
+
+ eir_len = append_eir_data_to_buf(hdev, rp->eir);
+ rp->eir_len = cpu_to_le16(eir_len);
+
+ hci_dev_unlock(hdev);
+
+ /* If this command is called at least once, then the events
+ * for class of device and local name changes are disabled
+ * and only the new extended controller information event
+ * is used.
+ */
+ hci_sock_set_flag(sk, HCI_MGMT_EXT_INFO_EVENTS);
+ hci_sock_clear_flag(sk, HCI_MGMT_DEV_CLASS_EVENTS);
+ hci_sock_clear_flag(sk, HCI_MGMT_LOCAL_NAME_EVENTS);
+
+ return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_READ_EXT_INFO, 0, rp,
+ sizeof(*rp) + eir_len);
+}
+
+static int ext_info_changed(struct hci_dev *hdev, struct sock *skip)
+{
+ char buf[512];
+ struct mgmt_ev_ext_info_changed *ev = (void *)buf;
+ u16 eir_len;
+
+ memset(buf, 0, sizeof(buf));
+
+ eir_len = append_eir_data_to_buf(hdev, ev->eir);
+ ev->eir_len = cpu_to_le16(eir_len);
+
+ return mgmt_limited_event(MGMT_EV_EXT_INFO_CHANGED, hdev, ev,
+ sizeof(*ev) + eir_len,
+ HCI_MGMT_EXT_INFO_EVENTS, skip);
+}
+
+static int send_settings_rsp(struct sock *sk, u16 opcode, struct hci_dev *hdev)
+{
+ __le32 settings = cpu_to_le32(get_current_settings(hdev));
+
+ return mgmt_cmd_complete(sk, hdev->id, opcode, 0, &settings,
+ sizeof(settings));
+}
+
+void mgmt_advertising_added(struct sock *sk, struct hci_dev *hdev, u8 instance)
+{
+ struct mgmt_ev_advertising_added ev;
+
+ ev.instance = instance;
+
+ mgmt_event(MGMT_EV_ADVERTISING_ADDED, hdev, &ev, sizeof(ev), sk);
+}
+
+void mgmt_advertising_removed(struct sock *sk, struct hci_dev *hdev,
+ u8 instance)
+{
+ struct mgmt_ev_advertising_removed ev;
+
+ ev.instance = instance;
+
+ mgmt_event(MGMT_EV_ADVERTISING_REMOVED, hdev, &ev, sizeof(ev), sk);
+}
+
+static void cancel_adv_timeout(struct hci_dev *hdev)
+{
+ if (hdev->adv_instance_timeout) {
+ hdev->adv_instance_timeout = 0;
+ cancel_delayed_work(&hdev->adv_instance_expire);
+ }
+}
+
+/* This function requires the caller holds hdev->lock */
+static void restart_le_actions(struct hci_dev *hdev)
+{
+ struct hci_conn_params *p;
+
+ list_for_each_entry(p, &hdev->le_conn_params, list) {
+ /* Needed for AUTO_OFF case where might not "really"
+ * have been powered off.
+ */
+ hci_pend_le_list_del_init(p);
+
+ switch (p->auto_connect) {
+ case HCI_AUTO_CONN_DIRECT:
+ case HCI_AUTO_CONN_ALWAYS:
+ hci_pend_le_list_add(p, &hdev->pend_le_conns);
+ break;
+ case HCI_AUTO_CONN_REPORT:
+ hci_pend_le_list_add(p, &hdev->pend_le_reports);
+ break;
+ default:
+ break;
+ }
+ }
+}
+
+static int new_settings(struct hci_dev *hdev, struct sock *skip)
+{
+ __le32 ev = cpu_to_le32(get_current_settings(hdev));
+
+ return mgmt_limited_event(MGMT_EV_NEW_SETTINGS, hdev, &ev,
+ sizeof(ev), HCI_MGMT_SETTING_EVENTS, skip);
+}
+
+static void mgmt_set_powered_complete(struct hci_dev *hdev, void *data, int err)
+{
+ struct mgmt_pending_cmd *cmd = data;
+ struct mgmt_mode *cp;
+
+ /* Make sure cmd still outstanding. */
+ if (err == -ECANCELED || !mgmt_pending_valid(hdev, cmd))
+ return;
+
+ cp = cmd->param;
+
+ bt_dev_dbg(hdev, "err %d", err);
+
+ if (!err) {
+ if (cp->val) {
+ hci_dev_lock(hdev);
+ restart_le_actions(hdev);
+ hci_update_passive_scan(hdev);
+ hci_dev_unlock(hdev);
+ }
+
+ send_settings_rsp(cmd->sk, cmd->opcode, hdev);
+
+ /* Only call new_setting for power on as power off is deferred
+ * to hdev->power_off work which does call hci_dev_do_close.
+ */
+ if (cp->val)
+ new_settings(hdev, cmd->sk);
+ } else {
+ mgmt_cmd_status(cmd->sk, hdev->id, MGMT_OP_SET_POWERED,
+ mgmt_status(err));
+ }
+
+ mgmt_pending_free(cmd);
+}
+
+static int set_powered_sync(struct hci_dev *hdev, void *data)
+{
+ struct mgmt_pending_cmd *cmd = data;
+ struct mgmt_mode cp;
+
+ mutex_lock(&hdev->mgmt_pending_lock);
+
+ /* Make sure cmd still outstanding. */
+ if (!__mgmt_pending_listed(hdev, cmd)) {
+ mutex_unlock(&hdev->mgmt_pending_lock);
+ return -ECANCELED;
+ }
+
+ memcpy(&cp, cmd->param, sizeof(cp));
+
+ mutex_unlock(&hdev->mgmt_pending_lock);
+
+ BT_DBG("%s", hdev->name);
+
+ return hci_set_powered_sync(hdev, cp.val);
+}
+
+static int set_powered(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 len)
+{
+ struct mgmt_mode *cp = data;
+ struct mgmt_pending_cmd *cmd;
+ int err;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ if (cp->val != 0x00 && cp->val != 0x01)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_POWERED,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ hci_dev_lock(hdev);
+
+ if (!cp->val) {
+ if (hci_dev_test_flag(hdev, HCI_POWERING_DOWN)) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_POWERED,
+ MGMT_STATUS_BUSY);
+ goto failed;
+ }
+ }
+
+ if (pending_find(MGMT_OP_SET_POWERED, hdev)) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_POWERED,
+ MGMT_STATUS_BUSY);
+ goto failed;
+ }
+
+ if (!!cp->val == hdev_is_powered(hdev)) {
+ err = send_settings_rsp(sk, MGMT_OP_SET_POWERED, hdev);
+ goto failed;
+ }
+
+ cmd = mgmt_pending_add(sk, MGMT_OP_SET_POWERED, hdev, data, len);
+ if (!cmd) {
+ err = -ENOMEM;
+ goto failed;
+ }
+
+ /* Cancel potentially blocking sync operation before power off */
+ if (cp->val == 0x00) {
+ hci_cmd_sync_cancel_sync(hdev, -EHOSTDOWN);
+ err = hci_cmd_sync_queue(hdev, set_powered_sync, cmd,
+ mgmt_set_powered_complete);
+ } else {
+ /* Use hci_cmd_sync_submit since hdev might not be running */
+ err = hci_cmd_sync_submit(hdev, set_powered_sync, cmd,
+ mgmt_set_powered_complete);
+ }
+
+ if (err < 0)
+ mgmt_pending_remove(cmd);
+
+failed:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+int mgmt_new_settings(struct hci_dev *hdev)
+{
+ return new_settings(hdev, NULL);
+}
+
+struct cmd_lookup {
+ struct sock *sk;
+ struct hci_dev *hdev;
+ u8 mgmt_status;
+};
+
+static void settings_rsp(struct mgmt_pending_cmd *cmd, void *data)
+{
+ struct cmd_lookup *match = data;
+
+ send_settings_rsp(cmd->sk, cmd->opcode, match->hdev);
+
+ if (match->sk == NULL) {
+ match->sk = cmd->sk;
+ sock_hold(match->sk);
+ }
+}
+
+static void cmd_status_rsp(struct mgmt_pending_cmd *cmd, void *data)
+{
+ u8 *status = data;
+
+ mgmt_cmd_status(cmd->sk, cmd->hdev->id, cmd->opcode, *status);
+}
+
+static void cmd_complete_rsp(struct mgmt_pending_cmd *cmd, void *data)
+{
+ struct cmd_lookup *match = data;
+
+ /* dequeue cmd_sync entries using cmd as data as that is about to be
+ * removed/freed.
+ */
+ hci_cmd_sync_dequeue(match->hdev, NULL, cmd, NULL);
+
+ if (cmd->cmd_complete) {
+ cmd->cmd_complete(cmd, match->mgmt_status);
+ return;
+ }
+
+ cmd_status_rsp(cmd, data);
+}
+
+static int generic_cmd_complete(struct mgmt_pending_cmd *cmd, u8 status)
+{
+ return mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode, status,
+ cmd->param, cmd->param_len);
+}
+
+static int addr_cmd_complete(struct mgmt_pending_cmd *cmd, u8 status)
+{
+ return mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode, status,
+ cmd->param, sizeof(struct mgmt_addr_info));
+}
+
+static u8 mgmt_bredr_support(struct hci_dev *hdev)
+{
+ if (!lmp_bredr_capable(hdev))
+ return MGMT_STATUS_NOT_SUPPORTED;
+ else if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED))
+ return MGMT_STATUS_REJECTED;
+ else
+ return MGMT_STATUS_SUCCESS;
+}
+
+static u8 mgmt_le_support(struct hci_dev *hdev)
+{
+ if (!lmp_le_capable(hdev))
+ return MGMT_STATUS_NOT_SUPPORTED;
+ else if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED))
+ return MGMT_STATUS_REJECTED;
+ else
+ return MGMT_STATUS_SUCCESS;
+}
+
+static void mgmt_set_discoverable_complete(struct hci_dev *hdev, void *data,
+ int err)
+{
+ struct mgmt_pending_cmd *cmd = data;
+
+ bt_dev_dbg(hdev, "err %d", err);
+
+ /* Make sure cmd still outstanding. */
+ if (err == -ECANCELED || !mgmt_pending_valid(hdev, cmd))
+ return;
+
+ hci_dev_lock(hdev);
+
+ if (err) {
+ u8 mgmt_err = mgmt_status(err);
+ mgmt_cmd_status(cmd->sk, cmd->hdev->id, cmd->opcode, mgmt_err);
+ hci_dev_clear_flag(hdev, HCI_LIMITED_DISCOVERABLE);
+ goto done;
+ }
+
+ if (hci_dev_test_flag(hdev, HCI_DISCOVERABLE) &&
+ hdev->discov_timeout > 0) {
+ int to = secs_to_jiffies(hdev->discov_timeout);
+ queue_delayed_work(hdev->req_workqueue, &hdev->discov_off, to);
+ }
+
+ send_settings_rsp(cmd->sk, MGMT_OP_SET_DISCOVERABLE, hdev);
+ new_settings(hdev, cmd->sk);
+
+done:
+ mgmt_pending_free(cmd);
+ hci_dev_unlock(hdev);
+}
+
+static int set_discoverable_sync(struct hci_dev *hdev, void *data)
+{
+ if (!mgmt_pending_listed(hdev, data))
+ return -ECANCELED;
+
+ BT_DBG("%s", hdev->name);
+
+ return hci_update_discoverable_sync(hdev);
+}
+
+static int set_discoverable(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 len)
+{
+ struct mgmt_cp_set_discoverable *cp = data;
+ struct mgmt_pending_cmd *cmd;
+ u16 timeout;
+ int err;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED) &&
+ !hci_dev_test_flag(hdev, HCI_BREDR_ENABLED))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_DISCOVERABLE,
+ MGMT_STATUS_REJECTED);
+
+ if (cp->val != 0x00 && cp->val != 0x01 && cp->val != 0x02)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_DISCOVERABLE,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ timeout = __le16_to_cpu(cp->timeout);
+
+ /* Disabling discoverable requires that no timeout is set,
+ * and enabling limited discoverable requires a timeout.
+ */
+ if ((cp->val == 0x00 && timeout > 0) ||
+ (cp->val == 0x02 && timeout == 0))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_DISCOVERABLE,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ hci_dev_lock(hdev);
+
+ if (!hdev_is_powered(hdev) && timeout > 0) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_DISCOVERABLE,
+ MGMT_STATUS_NOT_POWERED);
+ goto failed;
+ }
+
+ if (pending_find(MGMT_OP_SET_DISCOVERABLE, hdev) ||
+ pending_find(MGMT_OP_SET_CONNECTABLE, hdev)) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_DISCOVERABLE,
+ MGMT_STATUS_BUSY);
+ goto failed;
+ }
+
+ if (!hci_dev_test_flag(hdev, HCI_CONNECTABLE)) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_DISCOVERABLE,
+ MGMT_STATUS_REJECTED);
+ goto failed;
+ }
+
+ if (hdev->advertising_paused) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_DISCOVERABLE,
+ MGMT_STATUS_BUSY);
+ goto failed;
+ }
+
+ if (!hdev_is_powered(hdev)) {
+ bool changed = false;
+
+ /* Setting limited discoverable when powered off is
+ * not a valid operation since it requires a timeout
+ * and so no need to check HCI_LIMITED_DISCOVERABLE.
+ */
+ if (!!cp->val != hci_dev_test_flag(hdev, HCI_DISCOVERABLE)) {
+ hci_dev_change_flag(hdev, HCI_DISCOVERABLE);
+ changed = true;
+ }
+
+ err = send_settings_rsp(sk, MGMT_OP_SET_DISCOVERABLE, hdev);
+ if (err < 0)
+ goto failed;
+
+ if (changed)
+ err = new_settings(hdev, sk);
+
+ goto failed;
+ }
+
+ /* If the current mode is the same, then just update the timeout
+ * value with the new value. And if only the timeout gets updated,
+ * then no need for any HCI transactions.
+ */
+ if (!!cp->val == hci_dev_test_flag(hdev, HCI_DISCOVERABLE) &&
+ (cp->val == 0x02) == hci_dev_test_flag(hdev,
+ HCI_LIMITED_DISCOVERABLE)) {
+ cancel_delayed_work(&hdev->discov_off);
+ hdev->discov_timeout = timeout;
+
+ if (cp->val && hdev->discov_timeout > 0) {
+ int to = secs_to_jiffies(hdev->discov_timeout);
+ queue_delayed_work(hdev->req_workqueue,
+ &hdev->discov_off, to);
+ }
+
+ err = send_settings_rsp(sk, MGMT_OP_SET_DISCOVERABLE, hdev);
+ goto failed;
+ }
+
+ cmd = mgmt_pending_add(sk, MGMT_OP_SET_DISCOVERABLE, hdev, data, len);
+ if (!cmd) {
+ err = -ENOMEM;
+ goto failed;
+ }
+
+ /* Cancel any potential discoverable timeout that might be
+ * still active and store new timeout value. The arming of
+ * the timeout happens in the complete handler.
+ */
+ cancel_delayed_work(&hdev->discov_off);
+ hdev->discov_timeout = timeout;
+
+ if (cp->val)
+ hci_dev_set_flag(hdev, HCI_DISCOVERABLE);
+ else
+ hci_dev_clear_flag(hdev, HCI_DISCOVERABLE);
+
+ /* Limited discoverable mode */
+ if (cp->val == 0x02)
+ hci_dev_set_flag(hdev, HCI_LIMITED_DISCOVERABLE);
+ else
+ hci_dev_clear_flag(hdev, HCI_LIMITED_DISCOVERABLE);
+
+ err = hci_cmd_sync_queue(hdev, set_discoverable_sync, cmd,
+ mgmt_set_discoverable_complete);
+
+ if (err < 0)
+ mgmt_pending_remove(cmd);
+
+failed:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static void mgmt_set_connectable_complete(struct hci_dev *hdev, void *data,
+ int err)
+{
+ struct mgmt_pending_cmd *cmd = data;
+
+ bt_dev_dbg(hdev, "err %d", err);
+
+ /* Make sure cmd still outstanding. */
+ if (err == -ECANCELED || !mgmt_pending_valid(hdev, cmd))
+ return;
+
+ hci_dev_lock(hdev);
+
+ if (err) {
+ u8 mgmt_err = mgmt_status(err);
+ mgmt_cmd_status(cmd->sk, cmd->hdev->id, cmd->opcode, mgmt_err);
+ goto done;
+ }
+
+ send_settings_rsp(cmd->sk, MGMT_OP_SET_CONNECTABLE, hdev);
+ new_settings(hdev, cmd->sk);
+
+done:
+ mgmt_pending_free(cmd);
+
+ hci_dev_unlock(hdev);
+}
+
+static int set_connectable_update_settings(struct hci_dev *hdev,
+ struct sock *sk, u8 val)
+{
+ bool changed = false;
+ int err;
+
+ if (!!val != hci_dev_test_flag(hdev, HCI_CONNECTABLE))
+ changed = true;
+
+ if (val) {
+ hci_dev_set_flag(hdev, HCI_CONNECTABLE);
+ } else {
+ hci_dev_clear_flag(hdev, HCI_CONNECTABLE);
+ hci_dev_clear_flag(hdev, HCI_DISCOVERABLE);
+ }
+
+ err = send_settings_rsp(sk, MGMT_OP_SET_CONNECTABLE, hdev);
+ if (err < 0)
+ return err;
+
+ if (changed) {
+ hci_update_scan(hdev);
+ hci_update_passive_scan(hdev);
+ return new_settings(hdev, sk);
+ }
+
+ return 0;
+}
+
+static int set_connectable_sync(struct hci_dev *hdev, void *data)
+{
+ if (!mgmt_pending_listed(hdev, data))
+ return -ECANCELED;
+
+ BT_DBG("%s", hdev->name);
+
+ return hci_update_connectable_sync(hdev);
+}
+
+static int set_connectable(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 len)
+{
+ struct mgmt_mode *cp = data;
+ struct mgmt_pending_cmd *cmd;
+ int err;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED) &&
+ !hci_dev_test_flag(hdev, HCI_BREDR_ENABLED))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_CONNECTABLE,
+ MGMT_STATUS_REJECTED);
+
+ if (cp->val != 0x00 && cp->val != 0x01)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_CONNECTABLE,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ hci_dev_lock(hdev);
+
+ if (!hdev_is_powered(hdev)) {
+ err = set_connectable_update_settings(hdev, sk, cp->val);
+ goto failed;
+ }
+
+ if (pending_find(MGMT_OP_SET_DISCOVERABLE, hdev) ||
+ pending_find(MGMT_OP_SET_CONNECTABLE, hdev)) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_CONNECTABLE,
+ MGMT_STATUS_BUSY);
+ goto failed;
+ }
+
+ cmd = mgmt_pending_add(sk, MGMT_OP_SET_CONNECTABLE, hdev, data, len);
+ if (!cmd) {
+ err = -ENOMEM;
+ goto failed;
+ }
+
+ if (cp->val) {
+ hci_dev_set_flag(hdev, HCI_CONNECTABLE);
+ } else {
+ if (hdev->discov_timeout > 0)
+ cancel_delayed_work(&hdev->discov_off);
+
+ hci_dev_clear_flag(hdev, HCI_LIMITED_DISCOVERABLE);
+ hci_dev_clear_flag(hdev, HCI_DISCOVERABLE);
+ hci_dev_clear_flag(hdev, HCI_CONNECTABLE);
+ }
+
+ err = hci_cmd_sync_queue(hdev, set_connectable_sync, cmd,
+ mgmt_set_connectable_complete);
+
+ if (err < 0)
+ mgmt_pending_remove(cmd);
+
+failed:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static int set_bondable(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 len)
+{
+ struct mgmt_mode *cp = data;
+ bool changed;
+ int err;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ if (cp->val != 0x00 && cp->val != 0x01)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_BONDABLE,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ hci_dev_lock(hdev);
+
+ if (cp->val)
+ changed = !hci_dev_test_and_set_flag(hdev, HCI_BONDABLE);
+ else
+ changed = hci_dev_test_and_clear_flag(hdev, HCI_BONDABLE);
+
+ err = send_settings_rsp(sk, MGMT_OP_SET_BONDABLE, hdev);
+ if (err < 0)
+ goto unlock;
+
+ if (changed) {
+ /* In limited privacy mode the change of bondable mode
+ * may affect the local advertising address.
+ */
+ hci_update_discoverable(hdev);
+
+ err = new_settings(hdev, sk);
+ }
+
+unlock:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static int set_link_security(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 len)
+{
+ struct mgmt_mode *cp = data;
+ struct mgmt_pending_cmd *cmd;
+ u8 val, status;
+ int err;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ status = mgmt_bredr_support(hdev);
+ if (status)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_LINK_SECURITY,
+ status);
+
+ if (cp->val != 0x00 && cp->val != 0x01)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_LINK_SECURITY,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ hci_dev_lock(hdev);
+
+ if (!hdev_is_powered(hdev)) {
+ bool changed = false;
+
+ if (!!cp->val != hci_dev_test_flag(hdev, HCI_LINK_SECURITY)) {
+ hci_dev_change_flag(hdev, HCI_LINK_SECURITY);
+ changed = true;
+ }
+
+ err = send_settings_rsp(sk, MGMT_OP_SET_LINK_SECURITY, hdev);
+ if (err < 0)
+ goto failed;
+
+ if (changed)
+ err = new_settings(hdev, sk);
+
+ goto failed;
+ }
+
+ if (pending_find(MGMT_OP_SET_LINK_SECURITY, hdev)) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_LINK_SECURITY,
+ MGMT_STATUS_BUSY);
+ goto failed;
+ }
+
+ val = !!cp->val;
+
+ if (test_bit(HCI_AUTH, &hdev->flags) == val) {
+ err = send_settings_rsp(sk, MGMT_OP_SET_LINK_SECURITY, hdev);
+ goto failed;
+ }
+
+ cmd = mgmt_pending_add(sk, MGMT_OP_SET_LINK_SECURITY, hdev, data, len);
+ if (!cmd) {
+ err = -ENOMEM;
+ goto failed;
+ }
+
+ err = hci_send_cmd(hdev, HCI_OP_WRITE_AUTH_ENABLE, sizeof(val), &val);
+ if (err < 0) {
+ mgmt_pending_remove(cmd);
+ goto failed;
+ }
+
+failed:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static void set_ssp_complete(struct hci_dev *hdev, void *data, int err)
+{
+ struct cmd_lookup match = { NULL, hdev };
+ struct mgmt_pending_cmd *cmd = data;
+ struct mgmt_mode *cp;
+ u8 enable;
+ bool changed;
+
+ /* Make sure cmd still outstanding. */
+ if (err == -ECANCELED || !mgmt_pending_valid(hdev, cmd))
+ return;
+
+ cp = cmd->param;
+ enable = cp->val;
+
+ if (err) {
+ u8 mgmt_err = mgmt_status(err);
+
+ if (enable && hci_dev_test_and_clear_flag(hdev,
+ HCI_SSP_ENABLED)) {
+ new_settings(hdev, NULL);
+ }
+
+ mgmt_cmd_status(cmd->sk, cmd->hdev->id, cmd->opcode, mgmt_err);
+ return;
+ }
+
+ if (enable) {
+ changed = !hci_dev_test_and_set_flag(hdev, HCI_SSP_ENABLED);
+ } else {
+ changed = hci_dev_test_and_clear_flag(hdev, HCI_SSP_ENABLED);
+ }
+
+ settings_rsp(cmd, &match);
+
+ if (changed)
+ new_settings(hdev, match.sk);
+
+ if (match.sk)
+ sock_put(match.sk);
+
+ hci_update_eir_sync(hdev);
+}
+
+static int set_ssp_sync(struct hci_dev *hdev, void *data)
+{
+ struct mgmt_pending_cmd *cmd = data;
+ struct mgmt_mode cp;
+ bool changed = false;
+ int err;
+
+ mutex_lock(&hdev->mgmt_pending_lock);
+
+ if (!__mgmt_pending_listed(hdev, cmd)) {
+ mutex_unlock(&hdev->mgmt_pending_lock);
+ return -ECANCELED;
+ }
+
+ memcpy(&cp, cmd->param, sizeof(cp));
+
+ mutex_unlock(&hdev->mgmt_pending_lock);
+
+ if (cp.val)
+ changed = !hci_dev_test_and_set_flag(hdev, HCI_SSP_ENABLED);
+
+ err = hci_write_ssp_mode_sync(hdev, cp.val);
+
+ if (!err && changed)
+ hci_dev_clear_flag(hdev, HCI_SSP_ENABLED);
+
+ return err;
+}
+
+static int set_ssp(struct sock *sk, struct hci_dev *hdev, void *data, u16 len)
+{
+ struct mgmt_mode *cp = data;
+ struct mgmt_pending_cmd *cmd;
+ u8 status;
+ int err;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ status = mgmt_bredr_support(hdev);
+ if (status)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_SSP, status);
+
+ if (!lmp_ssp_capable(hdev))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_SSP,
+ MGMT_STATUS_NOT_SUPPORTED);
+
+ if (cp->val != 0x00 && cp->val != 0x01)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_SSP,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ hci_dev_lock(hdev);
+
+ if (!hdev_is_powered(hdev)) {
+ bool changed;
+
+ if (cp->val) {
+ changed = !hci_dev_test_and_set_flag(hdev,
+ HCI_SSP_ENABLED);
+ } else {
+ changed = hci_dev_test_and_clear_flag(hdev,
+ HCI_SSP_ENABLED);
+ }
+
+ err = send_settings_rsp(sk, MGMT_OP_SET_SSP, hdev);
+ if (err < 0)
+ goto failed;
+
+ if (changed)
+ err = new_settings(hdev, sk);
+
+ goto failed;
+ }
+
+ if (pending_find(MGMT_OP_SET_SSP, hdev)) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_SSP,
+ MGMT_STATUS_BUSY);
+ goto failed;
+ }
+
+ if (!!cp->val == hci_dev_test_flag(hdev, HCI_SSP_ENABLED)) {
+ err = send_settings_rsp(sk, MGMT_OP_SET_SSP, hdev);
+ goto failed;
+ }
+
+ cmd = mgmt_pending_add(sk, MGMT_OP_SET_SSP, hdev, data, len);
+ if (!cmd)
+ err = -ENOMEM;
+ else
+ err = hci_cmd_sync_queue(hdev, set_ssp_sync, cmd,
+ set_ssp_complete);
+
+ if (err < 0) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_SSP,
+ MGMT_STATUS_FAILED);
+
+ if (cmd)
+ mgmt_pending_remove(cmd);
+ }
+
+failed:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static int set_hs(struct sock *sk, struct hci_dev *hdev, void *data, u16 len)
+{
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_HS,
+ MGMT_STATUS_NOT_SUPPORTED);
+}
+
+static void set_le_complete(struct hci_dev *hdev, void *data, int err)
+{
+ struct mgmt_pending_cmd *cmd = data;
+ struct cmd_lookup match = { NULL, hdev };
+ u8 status = mgmt_status(err);
+
+ bt_dev_dbg(hdev, "err %d", err);
+
+ if (err == -ECANCELED || !mgmt_pending_valid(hdev, data))
+ return;
+
+ if (status) {
+ mgmt_cmd_status(cmd->sk, cmd->hdev->id, cmd->opcode, status);
+ goto done;
+ }
+
+ settings_rsp(cmd, &match);
+
+ new_settings(hdev, match.sk);
+
+ if (match.sk)
+ sock_put(match.sk);
+
+done:
+ mgmt_pending_free(cmd);
+}
+
+static int set_le_sync(struct hci_dev *hdev, void *data)
+{
+ struct mgmt_pending_cmd *cmd = data;
+ struct mgmt_mode cp;
+ u8 val;
+ int err;
+
+ mutex_lock(&hdev->mgmt_pending_lock);
+
+ if (!__mgmt_pending_listed(hdev, cmd)) {
+ mutex_unlock(&hdev->mgmt_pending_lock);
+ return -ECANCELED;
+ }
+
+ memcpy(&cp, cmd->param, sizeof(cp));
+ val = !!cp.val;
+
+ mutex_unlock(&hdev->mgmt_pending_lock);
+
+ if (!val) {
+ hci_clear_adv_instance_sync(hdev, NULL, 0x00, true);
+
+ if (hci_dev_test_flag(hdev, HCI_LE_ADV))
+ hci_disable_advertising_sync(hdev);
+
+ if (ext_adv_capable(hdev))
+ hci_remove_ext_adv_instance_sync(hdev, 0, cmd->sk);
+ } else {
+ hci_dev_set_flag(hdev, HCI_LE_ENABLED);
+ }
+
+ err = hci_write_le_host_supported_sync(hdev, val, 0);
+
+ /* Make sure the controller has a good default for
+ * advertising data. Restrict the update to when LE
+ * has actually been enabled. During power on, the
+ * update in powered_update_hci will take care of it.
+ */
+ if (!err && hci_dev_test_flag(hdev, HCI_LE_ENABLED)) {
+ if (ext_adv_capable(hdev)) {
+ int status;
+
+ status = hci_setup_ext_adv_instance_sync(hdev, 0x00);
+ if (!status)
+ hci_update_scan_rsp_data_sync(hdev, 0x00);
+ } else {
+ hci_update_adv_data_sync(hdev, 0x00);
+ hci_update_scan_rsp_data_sync(hdev, 0x00);
+ }
+
+ hci_update_passive_scan(hdev);
+ }
+
+ return err;
+}
+
+static void set_mesh_complete(struct hci_dev *hdev, void *data, int err)
+{
+ struct mgmt_pending_cmd *cmd = data;
+ u8 status = mgmt_status(err);
+ struct sock *sk;
+
+ if (err == -ECANCELED || !mgmt_pending_valid(hdev, cmd))
+ return;
+
+ sk = cmd->sk;
+
+ if (status) {
+ mgmt_cmd_status(cmd->sk, hdev->id, MGMT_OP_SET_MESH_RECEIVER,
+ status);
+ mgmt_pending_foreach(MGMT_OP_SET_MESH_RECEIVER, hdev, true,
+ cmd_status_rsp, &status);
+ goto done;
+ }
+
+ mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_MESH_RECEIVER, 0, NULL, 0);
+
+done:
+ mgmt_pending_free(cmd);
+}
+
+static int set_mesh_sync(struct hci_dev *hdev, void *data)
+{
+ struct mgmt_pending_cmd *cmd = data;
+ DEFINE_FLEX(struct mgmt_cp_set_mesh, cp, ad_types, num_ad_types,
+ sizeof(hdev->mesh_ad_types));
+ size_t len;
+
+ mutex_lock(&hdev->mgmt_pending_lock);
+
+ if (!__mgmt_pending_listed(hdev, cmd)) {
+ mutex_unlock(&hdev->mgmt_pending_lock);
+ return -ECANCELED;
+ }
+
+ len = cmd->param_len;
+ memcpy(cp, cmd->param, min(__struct_size(cp), len));
+
+ mutex_unlock(&hdev->mgmt_pending_lock);
+
+ memset(hdev->mesh_ad_types, 0, sizeof(hdev->mesh_ad_types));
+
+ if (cp->enable)
+ hci_dev_set_flag(hdev, HCI_MESH);
+ else
+ hci_dev_clear_flag(hdev, HCI_MESH);
+
+ hdev->le_scan_interval = __le16_to_cpu(cp->period);
+ hdev->le_scan_window = __le16_to_cpu(cp->window);
+
+ len -= sizeof(struct mgmt_cp_set_mesh);
+
+ /* If filters don't fit, forward all adv pkts */
+ if (len <= sizeof(hdev->mesh_ad_types))
+ memcpy(hdev->mesh_ad_types, cp->ad_types, len);
+
+ hci_update_passive_scan_sync(hdev);
+ return 0;
+}
+
+static int set_mesh(struct sock *sk, struct hci_dev *hdev, void *data, u16 len)
+{
+ struct mgmt_cp_set_mesh *cp = data;
+ struct mgmt_pending_cmd *cmd;
+ __u16 period, window;
+ int err = 0;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ if (!lmp_le_capable(hdev) ||
+ !hci_dev_test_flag(hdev, HCI_MESH_EXPERIMENTAL))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_MESH_RECEIVER,
+ MGMT_STATUS_NOT_SUPPORTED);
+
+ if (cp->enable != 0x00 && cp->enable != 0x01)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_MESH_RECEIVER,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ /* Keep allowed ranges in sync with set_scan_params() */
+ period = __le16_to_cpu(cp->period);
+
+ if (period < 0x0004 || period > 0x4000)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_MESH_RECEIVER,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ window = __le16_to_cpu(cp->window);
+
+ if (window < 0x0004 || window > 0x4000)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_MESH_RECEIVER,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ if (window > period)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_MESH_RECEIVER,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ hci_dev_lock(hdev);
+
+ cmd = mgmt_pending_add(sk, MGMT_OP_SET_MESH_RECEIVER, hdev, data, len);
+ if (!cmd)
+ err = -ENOMEM;
+ else
+ err = hci_cmd_sync_queue(hdev, set_mesh_sync, cmd,
+ set_mesh_complete);
+
+ if (err < 0) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_MESH_RECEIVER,
+ MGMT_STATUS_FAILED);
+
+ if (cmd)
+ mgmt_pending_remove(cmd);
+ }
+
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static void mesh_send_start_complete(struct hci_dev *hdev, void *data, int err)
+{
+ struct mgmt_mesh_tx *mesh_tx = data;
+ struct mgmt_cp_mesh_send *send = (void *)mesh_tx->param;
+ unsigned long mesh_send_interval;
+ u8 mgmt_err = mgmt_status(err);
+
+ /* Report any errors here, but don't report completion */
+
+ if (mgmt_err) {
+ hci_dev_clear_flag(hdev, HCI_MESH_SENDING);
+ /* Send Complete Error Code for handle */
+ mesh_send_complete(hdev, mesh_tx, false);
+ return;
+ }
+
+ mesh_send_interval = msecs_to_jiffies((send->cnt) * 25);
+ queue_delayed_work(hdev->req_workqueue, &hdev->mesh_send_done,
+ mesh_send_interval);
+}
+
+static int mesh_send_sync(struct hci_dev *hdev, void *data)
+{
+ struct mgmt_mesh_tx *mesh_tx = data;
+ struct mgmt_cp_mesh_send *send = (void *)mesh_tx->param;
+ struct adv_info *adv, *next_instance;
+ u8 instance = hdev->le_num_of_adv_sets + 1;
+ u16 timeout, duration;
+ int err = 0;
+
+ if (hdev->le_num_of_adv_sets <= hdev->adv_instance_cnt)
+ return MGMT_STATUS_BUSY;
+
+ timeout = 1000;
+ duration = send->cnt * INTERVAL_TO_MS(hdev->le_adv_max_interval);
+ adv = hci_add_adv_instance(hdev, instance, 0,
+ send->adv_data_len, send->adv_data,
+ 0, NULL,
+ timeout, duration,
+ HCI_ADV_TX_POWER_NO_PREFERENCE,
+ hdev->le_adv_min_interval,
+ hdev->le_adv_max_interval,
+ mesh_tx->handle);
+
+ if (!IS_ERR(adv))
+ mesh_tx->instance = instance;
+ else
+ err = PTR_ERR(adv);
+
+ if (hdev->cur_adv_instance == instance) {
+ /* If the currently advertised instance is being changed then
+ * cancel the current advertising and schedule the next
+ * instance. If there is only one instance then the overridden
+ * advertising data will be visible right away.
+ */
+ cancel_adv_timeout(hdev);
+
+ next_instance = hci_get_next_instance(hdev, instance);
+ if (next_instance)
+ instance = next_instance->instance;
+ else
+ instance = 0;
+ } else if (hdev->adv_instance_timeout) {
+ /* Immediately advertise the new instance if no other, or
+ * let it go naturally from queue if ADV is already happening
+ */
+ instance = 0;
+ }
+
+ if (instance)
+ return hci_schedule_adv_instance_sync(hdev, instance, true);
+
+ return err;
+}
+
+static void send_count(struct mgmt_mesh_tx *mesh_tx, void *data)
+{
+ struct mgmt_rp_mesh_read_features *rp = data;
+
+ if (rp->used_handles >= rp->max_handles)
+ return;
+
+ rp->handles[rp->used_handles++] = mesh_tx->handle;
+}
+
+static int mesh_features(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 len)
+{
+ struct mgmt_rp_mesh_read_features rp;
+
+ if (!lmp_le_capable(hdev) ||
+ !hci_dev_test_flag(hdev, HCI_MESH_EXPERIMENTAL))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_MESH_READ_FEATURES,
+ MGMT_STATUS_NOT_SUPPORTED);
+
+ memset(&rp, 0, sizeof(rp));
+ rp.index = cpu_to_le16(hdev->id);
+ if (hci_dev_test_flag(hdev, HCI_LE_ENABLED))
+ rp.max_handles = MESH_HANDLES_MAX;
+
+ hci_dev_lock(hdev);
+
+ if (rp.max_handles)
+ mgmt_mesh_foreach(hdev, send_count, &rp, sk);
+
+ mgmt_cmd_complete(sk, hdev->id, MGMT_OP_MESH_READ_FEATURES, 0, &rp,
+ rp.used_handles + sizeof(rp) - MESH_HANDLES_MAX);
+
+ hci_dev_unlock(hdev);
+ return 0;
+}
+
+static int send_cancel(struct hci_dev *hdev, void *data)
+{
+ struct mgmt_pending_cmd *cmd = data;
+ struct mgmt_cp_mesh_send_cancel *cancel = (void *)cmd->param;
+ struct mgmt_mesh_tx *mesh_tx;
+
+ if (!cancel->handle) {
+ do {
+ mesh_tx = mgmt_mesh_next(hdev, cmd->sk);
+
+ if (mesh_tx)
+ mesh_send_complete(hdev, mesh_tx, false);
+ } while (mesh_tx);
+ } else {
+ mesh_tx = mgmt_mesh_find(hdev, cancel->handle);
+
+ if (mesh_tx && mesh_tx->sk == cmd->sk)
+ mesh_send_complete(hdev, mesh_tx, false);
+ }
+
+ mgmt_cmd_complete(cmd->sk, hdev->id, MGMT_OP_MESH_SEND_CANCEL,
+ 0, NULL, 0);
+ mgmt_pending_free(cmd);
+
+ return 0;
+}
+
+static int mesh_send_cancel(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 len)
+{
+ struct mgmt_pending_cmd *cmd;
+ int err;
+
+ if (!lmp_le_capable(hdev) ||
+ !hci_dev_test_flag(hdev, HCI_MESH_EXPERIMENTAL))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_MESH_SEND_CANCEL,
+ MGMT_STATUS_NOT_SUPPORTED);
+
+ if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_MESH_SEND_CANCEL,
+ MGMT_STATUS_REJECTED);
+
+ hci_dev_lock(hdev);
+ cmd = mgmt_pending_new(sk, MGMT_OP_MESH_SEND_CANCEL, hdev, data, len);
+ if (!cmd)
+ err = -ENOMEM;
+ else
+ err = hci_cmd_sync_queue(hdev, send_cancel, cmd, NULL);
+
+ if (err < 0) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_MESH_SEND_CANCEL,
+ MGMT_STATUS_FAILED);
+
+ if (cmd)
+ mgmt_pending_free(cmd);
+ }
+
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static int mesh_send(struct sock *sk, struct hci_dev *hdev, void *data, u16 len)
+{
+ struct mgmt_mesh_tx *mesh_tx;
+ struct mgmt_cp_mesh_send *send = data;
+ struct mgmt_rp_mesh_read_features rp;
+ bool sending;
+ int err = 0;
+
+ if (!lmp_le_capable(hdev) ||
+ !hci_dev_test_flag(hdev, HCI_MESH_EXPERIMENTAL))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_MESH_SEND,
+ MGMT_STATUS_NOT_SUPPORTED);
+ if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED) ||
+ len <= MGMT_MESH_SEND_SIZE ||
+ len > (MGMT_MESH_SEND_SIZE + 31))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_MESH_SEND,
+ MGMT_STATUS_REJECTED);
+
+ hci_dev_lock(hdev);
+
+ memset(&rp, 0, sizeof(rp));
+ rp.max_handles = MESH_HANDLES_MAX;
+
+ mgmt_mesh_foreach(hdev, send_count, &rp, sk);
+
+ if (rp.max_handles <= rp.used_handles) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_MESH_SEND,
+ MGMT_STATUS_BUSY);
+ goto done;
+ }
+
+ sending = hci_dev_test_flag(hdev, HCI_MESH_SENDING);
+ mesh_tx = mgmt_mesh_add(sk, hdev, send, len);
+
+ if (!mesh_tx)
+ err = -ENOMEM;
+ else if (!sending)
+ err = hci_cmd_sync_queue(hdev, mesh_send_sync, mesh_tx,
+ mesh_send_start_complete);
+
+ if (err < 0) {
+ bt_dev_err(hdev, "Send Mesh Failed %d", err);
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_MESH_SEND,
+ MGMT_STATUS_FAILED);
+
+ if (mesh_tx) {
+ if (sending)
+ mgmt_mesh_remove(mesh_tx);
+ }
+ } else {
+ hci_dev_set_flag(hdev, HCI_MESH_SENDING);
+
+ mgmt_cmd_complete(sk, hdev->id, MGMT_OP_MESH_SEND, 0,
+ &mesh_tx->handle, 1);
+ }
+
+done:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static int set_le(struct sock *sk, struct hci_dev *hdev, void *data, u16 len)
+{
+ struct mgmt_mode *cp = data;
+ struct mgmt_pending_cmd *cmd;
+ int err;
+ u8 val, enabled;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ if (!lmp_le_capable(hdev))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_LE,
+ MGMT_STATUS_NOT_SUPPORTED);
+
+ if (cp->val != 0x00 && cp->val != 0x01)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_LE,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ /* Bluetooth single mode LE only controllers or dual-mode
+ * controllers configured as LE only devices, do not allow
+ * switching LE off. These have either LE enabled explicitly
+ * or BR/EDR has been previously switched off.
+ *
+ * When trying to enable an already enabled LE, then gracefully
+ * send a positive response. Trying to disable it however will
+ * result into rejection.
+ */
+ if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) {
+ if (cp->val == 0x01)
+ return send_settings_rsp(sk, MGMT_OP_SET_LE, hdev);
+
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_LE,
+ MGMT_STATUS_REJECTED);
+ }
+
+ hci_dev_lock(hdev);
+
+ val = !!cp->val;
+ enabled = lmp_host_le_capable(hdev);
+
+ if (!hdev_is_powered(hdev) || val == enabled) {
+ bool changed = false;
+
+ if (val != hci_dev_test_flag(hdev, HCI_LE_ENABLED)) {
+ hci_dev_change_flag(hdev, HCI_LE_ENABLED);
+ changed = true;
+ }
+
+ if (!val && hci_dev_test_flag(hdev, HCI_ADVERTISING)) {
+ hci_dev_clear_flag(hdev, HCI_ADVERTISING);
+ changed = true;
+ }
+
+ err = send_settings_rsp(sk, MGMT_OP_SET_LE, hdev);
+ if (err < 0)
+ goto unlock;
+
+ if (changed)
+ err = new_settings(hdev, sk);
+
+ goto unlock;
+ }
+
+ if (pending_find(MGMT_OP_SET_LE, hdev) ||
+ pending_find(MGMT_OP_SET_ADVERTISING, hdev)) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_LE,
+ MGMT_STATUS_BUSY);
+ goto unlock;
+ }
+
+ cmd = mgmt_pending_add(sk, MGMT_OP_SET_LE, hdev, data, len);
+ if (!cmd)
+ err = -ENOMEM;
+ else
+ err = hci_cmd_sync_queue(hdev, set_le_sync, cmd,
+ set_le_complete);
+
+ if (err < 0) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_LE,
+ MGMT_STATUS_FAILED);
+
+ if (cmd)
+ mgmt_pending_remove(cmd);
+ }
+
+unlock:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static int send_hci_cmd_sync(struct hci_dev *hdev, void *data)
+{
+ struct mgmt_pending_cmd *cmd = data;
+ struct mgmt_cp_hci_cmd_sync *cp = cmd->param;
+ struct sk_buff *skb;
+
+ skb = __hci_cmd_sync_ev(hdev, le16_to_cpu(cp->opcode),
+ le16_to_cpu(cp->params_len), cp->params,
+ cp->event, cp->timeout ?
+ secs_to_jiffies(cp->timeout) :
+ HCI_CMD_TIMEOUT);
+ if (IS_ERR(skb)) {
+ mgmt_cmd_status(cmd->sk, hdev->id, MGMT_OP_HCI_CMD_SYNC,
+ mgmt_status(PTR_ERR(skb)));
+ goto done;
+ }
+
+ mgmt_cmd_complete(cmd->sk, hdev->id, MGMT_OP_HCI_CMD_SYNC, 0,
+ skb->data, skb->len);
+
+ kfree_skb(skb);
+
+done:
+ mgmt_pending_free(cmd);
+
+ return 0;
+}
+
+static int mgmt_hci_cmd_sync(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 len)
+{
+ struct mgmt_cp_hci_cmd_sync *cp = data;
+ struct mgmt_pending_cmd *cmd;
+ int err;
+
+ if (len != (offsetof(struct mgmt_cp_hci_cmd_sync, params) +
+ le16_to_cpu(cp->params_len)))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_HCI_CMD_SYNC,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ hci_dev_lock(hdev);
+ cmd = mgmt_pending_new(sk, MGMT_OP_HCI_CMD_SYNC, hdev, data, len);
+ if (!cmd)
+ err = -ENOMEM;
+ else
+ err = hci_cmd_sync_queue(hdev, send_hci_cmd_sync, cmd, NULL);
+
+ if (err < 0) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_HCI_CMD_SYNC,
+ MGMT_STATUS_FAILED);
+
+ if (cmd)
+ mgmt_pending_free(cmd);
+ }
+
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+/* This is a helper function to test for pending mgmt commands that can
+ * cause CoD or EIR HCI commands. We can only allow one such pending
+ * mgmt command at a time since otherwise we cannot easily track what
+ * the current values are, will be, and based on that calculate if a new
+ * HCI command needs to be sent and if yes with what value.
+ */
+static bool pending_eir_or_class(struct hci_dev *hdev)
+{
+ struct mgmt_pending_cmd *cmd;
+
+ list_for_each_entry(cmd, &hdev->mgmt_pending, list) {
+ switch (cmd->opcode) {
+ case MGMT_OP_ADD_UUID:
+ case MGMT_OP_REMOVE_UUID:
+ case MGMT_OP_SET_DEV_CLASS:
+ case MGMT_OP_SET_POWERED:
+ return true;
+ }
+ }
+
+ return false;
+}
+
+static const u8 bluetooth_base_uuid[] = {
+ 0xfb, 0x34, 0x9b, 0x5f, 0x80, 0x00, 0x00, 0x80,
+ 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+};
+
+static u8 get_uuid_size(const u8 *uuid)
+{
+ u32 val;
+
+ if (memcmp(uuid, bluetooth_base_uuid, 12))
+ return 128;
+
+ val = get_unaligned_le32(&uuid[12]);
+ if (val > 0xffff)
+ return 32;
+
+ return 16;
+}
+
+static void mgmt_class_complete(struct hci_dev *hdev, void *data, int err)
+{
+ struct mgmt_pending_cmd *cmd = data;
+
+ bt_dev_dbg(hdev, "err %d", err);
+
+ mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode,
+ mgmt_status(err), hdev->dev_class, 3);
+
+ mgmt_pending_free(cmd);
+}
+
+static int add_uuid_sync(struct hci_dev *hdev, void *data)
+{
+ int err;
+
+ err = hci_update_class_sync(hdev);
+ if (err)
+ return err;
+
+ return hci_update_eir_sync(hdev);
+}
+
+static int add_uuid(struct sock *sk, struct hci_dev *hdev, void *data, u16 len)
+{
+ struct mgmt_cp_add_uuid *cp = data;
+ struct mgmt_pending_cmd *cmd;
+ struct bt_uuid *uuid;
+ int err;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ hci_dev_lock(hdev);
+
+ if (pending_eir_or_class(hdev)) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_UUID,
+ MGMT_STATUS_BUSY);
+ goto failed;
+ }
+
+ uuid = kmalloc(sizeof(*uuid), GFP_KERNEL);
+ if (!uuid) {
+ err = -ENOMEM;
+ goto failed;
+ }
+
+ memcpy(uuid->uuid, cp->uuid, 16);
+ uuid->svc_hint = cp->svc_hint;
+ uuid->size = get_uuid_size(cp->uuid);
+
+ list_add_tail(&uuid->list, &hdev->uuids);
+
+ cmd = mgmt_pending_new(sk, MGMT_OP_ADD_UUID, hdev, data, len);
+ if (!cmd) {
+ err = -ENOMEM;
+ goto failed;
+ }
+
+ /* MGMT_OP_ADD_UUID don't require adapter the UP/Running so use
+ * hci_cmd_sync_submit instead of hci_cmd_sync_queue.
+ */
+ err = hci_cmd_sync_submit(hdev, add_uuid_sync, cmd,
+ mgmt_class_complete);
+ if (err < 0) {
+ mgmt_pending_free(cmd);
+ goto failed;
+ }
+
+failed:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static bool enable_service_cache(struct hci_dev *hdev)
+{
+ if (!hdev_is_powered(hdev))
+ return false;
+
+ if (!hci_dev_test_and_set_flag(hdev, HCI_SERVICE_CACHE)) {
+ queue_delayed_work(hdev->workqueue, &hdev->service_cache,
+ CACHE_TIMEOUT);
+ return true;
+ }
+
+ return false;
+}
+
+static int remove_uuid_sync(struct hci_dev *hdev, void *data)
+{
+ int err;
+
+ err = hci_update_class_sync(hdev);
+ if (err)
+ return err;
+
+ return hci_update_eir_sync(hdev);
+}
+
+static int remove_uuid(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 len)
+{
+ struct mgmt_cp_remove_uuid *cp = data;
+ struct mgmt_pending_cmd *cmd;
+ struct bt_uuid *match, *tmp;
+ static const u8 bt_uuid_any[] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ };
+ int err, found;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ hci_dev_lock(hdev);
+
+ if (pending_eir_or_class(hdev)) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_REMOVE_UUID,
+ MGMT_STATUS_BUSY);
+ goto unlock;
+ }
+
+ if (memcmp(cp->uuid, bt_uuid_any, 16) == 0) {
+ hci_uuids_clear(hdev);
+
+ if (enable_service_cache(hdev)) {
+ err = mgmt_cmd_complete(sk, hdev->id,
+ MGMT_OP_REMOVE_UUID,
+ 0, hdev->dev_class, 3);
+ goto unlock;
+ }
+
+ goto update_class;
+ }
+
+ found = 0;
+
+ list_for_each_entry_safe(match, tmp, &hdev->uuids, list) {
+ if (memcmp(match->uuid, cp->uuid, 16) != 0)
+ continue;
+
+ list_del(&match->list);
+ kfree(match);
+ found++;
+ }
+
+ if (found == 0) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_REMOVE_UUID,
+ MGMT_STATUS_INVALID_PARAMS);
+ goto unlock;
+ }
+
+update_class:
+ cmd = mgmt_pending_new(sk, MGMT_OP_REMOVE_UUID, hdev, data, len);
+ if (!cmd) {
+ err = -ENOMEM;
+ goto unlock;
+ }
+
+ /* MGMT_OP_REMOVE_UUID don't require adapter the UP/Running so use
+ * hci_cmd_sync_submit instead of hci_cmd_sync_queue.
+ */
+ err = hci_cmd_sync_submit(hdev, remove_uuid_sync, cmd,
+ mgmt_class_complete);
+ if (err < 0)
+ mgmt_pending_free(cmd);
+
+unlock:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static int set_class_sync(struct hci_dev *hdev, void *data)
+{
+ int err = 0;
+
+ if (hci_dev_test_and_clear_flag(hdev, HCI_SERVICE_CACHE)) {
+ cancel_delayed_work_sync(&hdev->service_cache);
+ err = hci_update_eir_sync(hdev);
+ }
+
+ if (err)
+ return err;
+
+ return hci_update_class_sync(hdev);
+}
+
+static int set_dev_class(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 len)
+{
+ struct mgmt_cp_set_dev_class *cp = data;
+ struct mgmt_pending_cmd *cmd;
+ int err;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ if (!lmp_bredr_capable(hdev))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_DEV_CLASS,
+ MGMT_STATUS_NOT_SUPPORTED);
+
+ hci_dev_lock(hdev);
+
+ if (pending_eir_or_class(hdev)) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_DEV_CLASS,
+ MGMT_STATUS_BUSY);
+ goto unlock;
+ }
+
+ if ((cp->minor & 0x03) != 0 || (cp->major & 0xe0) != 0) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_DEV_CLASS,
+ MGMT_STATUS_INVALID_PARAMS);
+ goto unlock;
+ }
+
+ hdev->major_class = cp->major;
+ hdev->minor_class = cp->minor;
+
+ if (!hdev_is_powered(hdev)) {
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_DEV_CLASS, 0,
+ hdev->dev_class, 3);
+ goto unlock;
+ }
+
+ cmd = mgmt_pending_new(sk, MGMT_OP_SET_DEV_CLASS, hdev, data, len);
+ if (!cmd) {
+ err = -ENOMEM;
+ goto unlock;
+ }
+
+ /* MGMT_OP_SET_DEV_CLASS don't require adapter the UP/Running so use
+ * hci_cmd_sync_submit instead of hci_cmd_sync_queue.
+ */
+ err = hci_cmd_sync_submit(hdev, set_class_sync, cmd,
+ mgmt_class_complete);
+ if (err < 0)
+ mgmt_pending_free(cmd);
+
+unlock:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static int load_link_keys(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 len)
+{
+ struct mgmt_cp_load_link_keys *cp = data;
+ const u16 max_key_count = ((U16_MAX - sizeof(*cp)) /
+ sizeof(struct mgmt_link_key_info));
+ u16 key_count, expected_len;
+ bool changed;
+ int i;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ if (!lmp_bredr_capable(hdev))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_LINK_KEYS,
+ MGMT_STATUS_NOT_SUPPORTED);
+
+ key_count = __le16_to_cpu(cp->key_count);
+ if (key_count > max_key_count) {
+ bt_dev_err(hdev, "load_link_keys: too big key_count value %u",
+ key_count);
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_LINK_KEYS,
+ MGMT_STATUS_INVALID_PARAMS);
+ }
+
+ expected_len = struct_size(cp, keys, key_count);
+ if (expected_len != len) {
+ bt_dev_err(hdev, "load_link_keys: expected %u bytes, got %u bytes",
+ expected_len, len);
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_LINK_KEYS,
+ MGMT_STATUS_INVALID_PARAMS);
+ }
+
+ if (cp->debug_keys != 0x00 && cp->debug_keys != 0x01)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_LINK_KEYS,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ bt_dev_dbg(hdev, "debug_keys %u key_count %u", cp->debug_keys,
+ key_count);
+
+ hci_dev_lock(hdev);
+
+ hci_link_keys_clear(hdev);
+
+ if (cp->debug_keys)
+ changed = !hci_dev_test_and_set_flag(hdev, HCI_KEEP_DEBUG_KEYS);
+ else
+ changed = hci_dev_test_and_clear_flag(hdev,
+ HCI_KEEP_DEBUG_KEYS);
+
+ if (changed)
+ new_settings(hdev, NULL);
+
+ for (i = 0; i < key_count; i++) {
+ struct mgmt_link_key_info *key = &cp->keys[i];
+
+ if (hci_is_blocked_key(hdev,
+ HCI_BLOCKED_KEY_TYPE_LINKKEY,
+ key->val)) {
+ bt_dev_warn(hdev, "Skipping blocked link key for %pMR",
+ &key->addr.bdaddr);
+ continue;
+ }
+
+ if (key->addr.type != BDADDR_BREDR) {
+ bt_dev_warn(hdev,
+ "Invalid link address type %u for %pMR",
+ key->addr.type, &key->addr.bdaddr);
+ continue;
+ }
+
+ if (key->type > 0x08) {
+ bt_dev_warn(hdev, "Invalid link key type %u for %pMR",
+ key->type, &key->addr.bdaddr);
+ continue;
+ }
+
+ /* Always ignore debug keys and require a new pairing if
+ * the user wants to use them.
+ */
+ if (key->type == HCI_LK_DEBUG_COMBINATION)
+ continue;
+
+ hci_add_link_key(hdev, NULL, &key->addr.bdaddr, key->val,
+ key->type, key->pin_len, NULL);
+ }
+
+ mgmt_cmd_complete(sk, hdev->id, MGMT_OP_LOAD_LINK_KEYS, 0, NULL, 0);
+
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+static int device_unpaired(struct hci_dev *hdev, bdaddr_t *bdaddr,
+ u8 addr_type, struct sock *skip_sk)
+{
+ struct mgmt_ev_device_unpaired ev;
+
+ bacpy(&ev.addr.bdaddr, bdaddr);
+ ev.addr.type = addr_type;
+
+ return mgmt_event(MGMT_EV_DEVICE_UNPAIRED, hdev, &ev, sizeof(ev),
+ skip_sk);
+}
+
+static void unpair_device_complete(struct hci_dev *hdev, void *data, int err)
+{
+ struct mgmt_pending_cmd *cmd = data;
+ struct mgmt_cp_unpair_device *cp = cmd->param;
+
+ if (!err)
+ device_unpaired(hdev, &cp->addr.bdaddr, cp->addr.type, cmd->sk);
+
+ cmd->cmd_complete(cmd, err);
+ mgmt_pending_free(cmd);
+}
+
+static int unpair_device_sync(struct hci_dev *hdev, void *data)
+{
+ struct mgmt_pending_cmd *cmd = data;
+ struct mgmt_cp_unpair_device *cp = cmd->param;
+ struct hci_conn *conn;
+
+ if (cp->addr.type == BDADDR_BREDR)
+ conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK,
+ &cp->addr.bdaddr);
+ else
+ conn = hci_conn_hash_lookup_le(hdev, &cp->addr.bdaddr,
+ le_addr_type(cp->addr.type));
+
+ if (!conn)
+ return 0;
+
+ /* Disregard any possible error since the likes of hci_abort_conn_sync
+ * will clean up the connection no matter the error.
+ */
+ hci_abort_conn(conn, HCI_ERROR_REMOTE_USER_TERM);
+
+ return 0;
+}
+
+static int unpair_device(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 len)
+{
+ struct mgmt_cp_unpair_device *cp = data;
+ struct mgmt_rp_unpair_device rp;
+ struct hci_conn_params *params;
+ struct mgmt_pending_cmd *cmd;
+ struct hci_conn *conn;
+ u8 addr_type;
+ int err;
+
+ memset(&rp, 0, sizeof(rp));
+ bacpy(&rp.addr.bdaddr, &cp->addr.bdaddr);
+ rp.addr.type = cp->addr.type;
+
+ if (!bdaddr_type_is_valid(cp->addr.type))
+ return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_UNPAIR_DEVICE,
+ MGMT_STATUS_INVALID_PARAMS,
+ &rp, sizeof(rp));
+
+ if (cp->disconnect != 0x00 && cp->disconnect != 0x01)
+ return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_UNPAIR_DEVICE,
+ MGMT_STATUS_INVALID_PARAMS,
+ &rp, sizeof(rp));
+
+ hci_dev_lock(hdev);
+
+ if (!hdev_is_powered(hdev)) {
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_UNPAIR_DEVICE,
+ MGMT_STATUS_NOT_POWERED, &rp,
+ sizeof(rp));
+ goto unlock;
+ }
+
+ if (cp->addr.type == BDADDR_BREDR) {
+ /* If disconnection is requested, then look up the
+ * connection. If the remote device is connected, it
+ * will be later used to terminate the link.
+ *
+ * Setting it to NULL explicitly will cause no
+ * termination of the link.
+ */
+ if (cp->disconnect)
+ conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK,
+ &cp->addr.bdaddr);
+ else
+ conn = NULL;
+
+ err = hci_remove_link_key(hdev, &cp->addr.bdaddr);
+ if (err < 0) {
+ err = mgmt_cmd_complete(sk, hdev->id,
+ MGMT_OP_UNPAIR_DEVICE,
+ MGMT_STATUS_NOT_PAIRED, &rp,
+ sizeof(rp));
+ goto unlock;
+ }
+
+ goto done;
+ }
+
+ /* LE address type */
+ addr_type = le_addr_type(cp->addr.type);
+
+ /* Abort any ongoing SMP pairing. Removes ltk and irk if they exist. */
+ err = smp_cancel_and_remove_pairing(hdev, &cp->addr.bdaddr, addr_type);
+ if (err < 0) {
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_UNPAIR_DEVICE,
+ MGMT_STATUS_NOT_PAIRED, &rp,
+ sizeof(rp));
+ goto unlock;
+ }
+
+ conn = hci_conn_hash_lookup_le(hdev, &cp->addr.bdaddr, addr_type);
+ if (!conn) {
+ hci_conn_params_del(hdev, &cp->addr.bdaddr, addr_type);
+ goto done;
+ }
+
+
+ /* Defer clearing up the connection parameters until closing to
+ * give a chance of keeping them if a repairing happens.
+ */
+ set_bit(HCI_CONN_PARAM_REMOVAL_PEND, &conn->flags);
+
+ /* Disable auto-connection parameters if present */
+ params = hci_conn_params_lookup(hdev, &cp->addr.bdaddr, addr_type);
+ if (params) {
+ if (params->explicit_connect)
+ params->auto_connect = HCI_AUTO_CONN_EXPLICIT;
+ else
+ params->auto_connect = HCI_AUTO_CONN_DISABLED;
+ }
+
+ /* If disconnection is not requested, then clear the connection
+ * variable so that the link is not terminated.
+ */
+ if (!cp->disconnect)
+ conn = NULL;
+
+done:
+ /* If the connection variable is set, then termination of the
+ * link is requested.
+ */
+ if (!conn) {
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_UNPAIR_DEVICE, 0,
+ &rp, sizeof(rp));
+ device_unpaired(hdev, &cp->addr.bdaddr, cp->addr.type, sk);
+ goto unlock;
+ }
+
+ cmd = mgmt_pending_new(sk, MGMT_OP_UNPAIR_DEVICE, hdev, cp,
+ sizeof(*cp));
+ if (!cmd) {
+ err = -ENOMEM;
+ goto unlock;
+ }
+
+ cmd->cmd_complete = addr_cmd_complete;
+
+ err = hci_cmd_sync_queue(hdev, unpair_device_sync, cmd,
+ unpair_device_complete);
+ if (err < 0)
+ mgmt_pending_free(cmd);
+
+unlock:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static void disconnect_complete(struct hci_dev *hdev, void *data, int err)
+{
+ struct mgmt_pending_cmd *cmd = data;
+
+ cmd->cmd_complete(cmd, mgmt_status(err));
+ mgmt_pending_free(cmd);
+}
+
+static int disconnect_sync(struct hci_dev *hdev, void *data)
+{
+ struct mgmt_pending_cmd *cmd = data;
+ struct mgmt_cp_disconnect *cp = cmd->param;
+ struct hci_conn *conn;
+
+ if (cp->addr.type == BDADDR_BREDR)
+ conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK,
+ &cp->addr.bdaddr);
+ else
+ conn = hci_conn_hash_lookup_le(hdev, &cp->addr.bdaddr,
+ le_addr_type(cp->addr.type));
+
+ if (!conn)
+ return -ENOTCONN;
+
+ /* Disregard any possible error since the likes of hci_abort_conn_sync
+ * will clean up the connection no matter the error.
+ */
+ hci_abort_conn(conn, HCI_ERROR_REMOTE_USER_TERM);
+
+ return 0;
+}
+
+static int disconnect(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 len)
+{
+ struct mgmt_cp_disconnect *cp = data;
+ struct mgmt_rp_disconnect rp;
+ struct mgmt_pending_cmd *cmd;
+ int err;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ memset(&rp, 0, sizeof(rp));
+ bacpy(&rp.addr.bdaddr, &cp->addr.bdaddr);
+ rp.addr.type = cp->addr.type;
+
+ if (!bdaddr_type_is_valid(cp->addr.type))
+ return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_DISCONNECT,
+ MGMT_STATUS_INVALID_PARAMS,
+ &rp, sizeof(rp));
+
+ hci_dev_lock(hdev);
+
+ if (!test_bit(HCI_UP, &hdev->flags)) {
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_DISCONNECT,
+ MGMT_STATUS_NOT_POWERED, &rp,
+ sizeof(rp));
+ goto failed;
+ }
+
+ cmd = mgmt_pending_new(sk, MGMT_OP_DISCONNECT, hdev, data, len);
+ if (!cmd) {
+ err = -ENOMEM;
+ goto failed;
+ }
+
+ cmd->cmd_complete = generic_cmd_complete;
+
+ err = hci_cmd_sync_queue(hdev, disconnect_sync, cmd,
+ disconnect_complete);
+ if (err < 0)
+ mgmt_pending_free(cmd);
+
+failed:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static u8 link_to_bdaddr(u8 link_type, u8 addr_type)
+{
+ switch (link_type) {
+ case CIS_LINK:
+ case BIS_LINK:
+ case PA_LINK:
+ case LE_LINK:
+ switch (addr_type) {
+ case ADDR_LE_DEV_PUBLIC:
+ return BDADDR_LE_PUBLIC;
+
+ default:
+ /* Fallback to LE Random address type */
+ return BDADDR_LE_RANDOM;
+ }
+
+ default:
+ /* Fallback to BR/EDR type */
+ return BDADDR_BREDR;
+ }
+}
+
+static int get_connections(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 data_len)
+{
+ struct mgmt_rp_get_connections *rp;
+ struct hci_conn *c;
+ int err;
+ u16 i;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ hci_dev_lock(hdev);
+
+ if (!hdev_is_powered(hdev)) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_GET_CONNECTIONS,
+ MGMT_STATUS_NOT_POWERED);
+ goto unlock;
+ }
+
+ i = 0;
+ list_for_each_entry(c, &hdev->conn_hash.list, list) {
+ if (test_bit(HCI_CONN_MGMT_CONNECTED, &c->flags))
+ i++;
+ }
+
+ rp = kmalloc(struct_size(rp, addr, i), GFP_KERNEL);
+ if (!rp) {
+ err = -ENOMEM;
+ goto unlock;
+ }
+
+ i = 0;
+ list_for_each_entry(c, &hdev->conn_hash.list, list) {
+ if (!test_bit(HCI_CONN_MGMT_CONNECTED, &c->flags))
+ continue;
+ bacpy(&rp->addr[i].bdaddr, &c->dst);
+ rp->addr[i].type = link_to_bdaddr(c->type, c->dst_type);
+ if (c->type == SCO_LINK || c->type == ESCO_LINK)
+ continue;
+ i++;
+ }
+
+ rp->conn_count = cpu_to_le16(i);
+
+ /* Recalculate length in case of filtered SCO connections, etc */
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_GET_CONNECTIONS, 0, rp,
+ struct_size(rp, addr, i));
+
+ kfree(rp);
+
+unlock:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static int send_pin_code_neg_reply(struct sock *sk, struct hci_dev *hdev,
+ struct mgmt_cp_pin_code_neg_reply *cp)
+{
+ struct mgmt_pending_cmd *cmd;
+ int err;
+
+ cmd = mgmt_pending_add(sk, MGMT_OP_PIN_CODE_NEG_REPLY, hdev, cp,
+ sizeof(*cp));
+ if (!cmd)
+ return -ENOMEM;
+
+ cmd->cmd_complete = addr_cmd_complete;
+
+ err = hci_send_cmd(hdev, HCI_OP_PIN_CODE_NEG_REPLY,
+ sizeof(cp->addr.bdaddr), &cp->addr.bdaddr);
+ if (err < 0)
+ mgmt_pending_remove(cmd);
+
+ return err;
+}
+
+static int pin_code_reply(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 len)
+{
+ struct hci_conn *conn;
+ struct mgmt_cp_pin_code_reply *cp = data;
+ struct hci_cp_pin_code_reply reply;
+ struct mgmt_pending_cmd *cmd;
+ int err;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ hci_dev_lock(hdev);
+
+ if (!hdev_is_powered(hdev)) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_PIN_CODE_REPLY,
+ MGMT_STATUS_NOT_POWERED);
+ goto failed;
+ }
+
+ conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &cp->addr.bdaddr);
+ if (!conn) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_PIN_CODE_REPLY,
+ MGMT_STATUS_NOT_CONNECTED);
+ goto failed;
+ }
+
+ if (conn->pending_sec_level == BT_SECURITY_HIGH && cp->pin_len != 16) {
+ struct mgmt_cp_pin_code_neg_reply ncp;
+
+ memcpy(&ncp.addr, &cp->addr, sizeof(ncp.addr));
+
+ bt_dev_err(hdev, "PIN code is not 16 bytes long");
+
+ err = send_pin_code_neg_reply(sk, hdev, &ncp);
+ if (err >= 0)
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_PIN_CODE_REPLY,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ goto failed;
+ }
+
+ cmd = mgmt_pending_add(sk, MGMT_OP_PIN_CODE_REPLY, hdev, data, len);
+ if (!cmd) {
+ err = -ENOMEM;
+ goto failed;
+ }
+
+ cmd->cmd_complete = addr_cmd_complete;
+
+ bacpy(&reply.bdaddr, &cp->addr.bdaddr);
+ reply.pin_len = cp->pin_len;
+ memcpy(reply.pin_code, cp->pin_code, sizeof(reply.pin_code));
+
+ err = hci_send_cmd(hdev, HCI_OP_PIN_CODE_REPLY, sizeof(reply), &reply);
+ if (err < 0)
+ mgmt_pending_remove(cmd);
+
+failed:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static int set_io_capability(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 len)
+{
+ struct mgmt_cp_set_io_capability *cp = data;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ if (cp->io_capability > SMP_IO_KEYBOARD_DISPLAY)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_IO_CAPABILITY,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ hci_dev_lock(hdev);
+
+ hdev->io_capability = cp->io_capability;
+
+ bt_dev_dbg(hdev, "IO capability set to 0x%02x", hdev->io_capability);
+
+ hci_dev_unlock(hdev);
+
+ return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_IO_CAPABILITY, 0,
+ NULL, 0);
+}
+
+static struct mgmt_pending_cmd *find_pairing(struct hci_conn *conn)
+{
+ struct hci_dev *hdev = conn->hdev;
+ struct mgmt_pending_cmd *cmd;
+
+ list_for_each_entry(cmd, &hdev->mgmt_pending, list) {
+ if (cmd->opcode != MGMT_OP_PAIR_DEVICE)
+ continue;
+
+ if (cmd->user_data != conn)
+ continue;
+
+ return cmd;
+ }
+
+ return NULL;
+}
+
+static int pairing_complete(struct mgmt_pending_cmd *cmd, u8 status)
+{
+ struct mgmt_rp_pair_device rp;
+ struct hci_conn *conn = cmd->user_data;
+ int err;
+
+ bacpy(&rp.addr.bdaddr, &conn->dst);
+ rp.addr.type = link_to_bdaddr(conn->type, conn->dst_type);
+
+ err = mgmt_cmd_complete(cmd->sk, cmd->hdev->id, MGMT_OP_PAIR_DEVICE,
+ status, &rp, sizeof(rp));
+
+ /* So we don't get further callbacks for this connection */
+ conn->connect_cfm_cb = NULL;
+ conn->security_cfm_cb = NULL;
+ conn->disconn_cfm_cb = NULL;
+
+ hci_conn_drop(conn);
+
+ /* The device is paired so there is no need to remove
+ * its connection parameters anymore.
+ */
+ clear_bit(HCI_CONN_PARAM_REMOVAL_PEND, &conn->flags);
+
+ hci_conn_put(conn);
+
+ return err;
+}
+
+void mgmt_smp_complete(struct hci_conn *conn, bool complete)
+{
+ u8 status = complete ? MGMT_STATUS_SUCCESS : MGMT_STATUS_FAILED;
+ struct mgmt_pending_cmd *cmd;
+
+ cmd = find_pairing(conn);
+ if (cmd) {
+ cmd->cmd_complete(cmd, status);
+ mgmt_pending_remove(cmd);
+ }
+}
+
+static void pairing_complete_cb(struct hci_conn *conn, u8 status)
+{
+ struct mgmt_pending_cmd *cmd;
+
+ BT_DBG("status %u", status);
+
+ cmd = find_pairing(conn);
+ if (!cmd) {
+ BT_DBG("Unable to find a pending command");
+ return;
+ }
+
+ cmd->cmd_complete(cmd, mgmt_status(status));
+ mgmt_pending_remove(cmd);
+}
+
+static void le_pairing_complete_cb(struct hci_conn *conn, u8 status)
+{
+ struct mgmt_pending_cmd *cmd;
+
+ BT_DBG("status %u", status);
+
+ if (!status)
+ return;
+
+ cmd = find_pairing(conn);
+ if (!cmd) {
+ BT_DBG("Unable to find a pending command");
+ return;
+ }
+
+ cmd->cmd_complete(cmd, mgmt_status(status));
+ mgmt_pending_remove(cmd);
+}
+
+static int pair_device(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 len)
+{
+ struct mgmt_cp_pair_device *cp = data;
+ struct mgmt_rp_pair_device rp;
+ struct mgmt_pending_cmd *cmd;
+ u8 sec_level, auth_type;
+ struct hci_conn *conn;
+ int err;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ memset(&rp, 0, sizeof(rp));
+ bacpy(&rp.addr.bdaddr, &cp->addr.bdaddr);
+ rp.addr.type = cp->addr.type;
+
+ if (!bdaddr_type_is_valid(cp->addr.type))
+ return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_PAIR_DEVICE,
+ MGMT_STATUS_INVALID_PARAMS,
+ &rp, sizeof(rp));
+
+ if (cp->io_cap > SMP_IO_KEYBOARD_DISPLAY)
+ return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_PAIR_DEVICE,
+ MGMT_STATUS_INVALID_PARAMS,
+ &rp, sizeof(rp));
+
+ hci_dev_lock(hdev);
+
+ if (!hdev_is_powered(hdev)) {
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_PAIR_DEVICE,
+ MGMT_STATUS_NOT_POWERED, &rp,
+ sizeof(rp));
+ goto unlock;
+ }
+
+ if (hci_bdaddr_is_paired(hdev, &cp->addr.bdaddr, cp->addr.type)) {
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_PAIR_DEVICE,
+ MGMT_STATUS_ALREADY_PAIRED, &rp,
+ sizeof(rp));
+ goto unlock;
+ }
+
+ sec_level = BT_SECURITY_MEDIUM;
+ auth_type = HCI_AT_DEDICATED_BONDING;
+
+ if (cp->addr.type == BDADDR_BREDR) {
+ conn = hci_connect_acl(hdev, &cp->addr.bdaddr, sec_level,
+ auth_type, CONN_REASON_PAIR_DEVICE,
+ HCI_ACL_CONN_TIMEOUT);
+ } else {
+ u8 addr_type = le_addr_type(cp->addr.type);
+ struct hci_conn_params *p;
+
+ /* When pairing a new device, it is expected to remember
+ * this device for future connections. Adding the connection
+ * parameter information ahead of time allows tracking
+ * of the peripheral preferred values and will speed up any
+ * further connection establishment.
+ *
+ * If connection parameters already exist, then they
+ * will be kept and this function does nothing.
+ */
+ p = hci_conn_params_add(hdev, &cp->addr.bdaddr, addr_type);
+ if (!p) {
+ err = -EIO;
+ goto unlock;
+ }
+
+ if (p->auto_connect == HCI_AUTO_CONN_EXPLICIT)
+ p->auto_connect = HCI_AUTO_CONN_DISABLED;
+
+ conn = hci_connect_le_scan(hdev, &cp->addr.bdaddr, addr_type,
+ sec_level, HCI_LE_CONN_TIMEOUT,
+ CONN_REASON_PAIR_DEVICE);
+ }
+
+ if (IS_ERR(conn)) {
+ int status;
+
+ if (PTR_ERR(conn) == -EBUSY)
+ status = MGMT_STATUS_BUSY;
+ else if (PTR_ERR(conn) == -EOPNOTSUPP)
+ status = MGMT_STATUS_NOT_SUPPORTED;
+ else if (PTR_ERR(conn) == -ECONNREFUSED)
+ status = MGMT_STATUS_REJECTED;
+ else
+ status = MGMT_STATUS_CONNECT_FAILED;
+
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_PAIR_DEVICE,
+ status, &rp, sizeof(rp));
+ goto unlock;
+ }
+
+ if (conn->connect_cfm_cb) {
+ hci_conn_drop(conn);
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_PAIR_DEVICE,
+ MGMT_STATUS_BUSY, &rp, sizeof(rp));
+ goto unlock;
+ }
+
+ cmd = mgmt_pending_add(sk, MGMT_OP_PAIR_DEVICE, hdev, data, len);
+ if (!cmd) {
+ err = -ENOMEM;
+ hci_conn_drop(conn);
+ goto unlock;
+ }
+
+ cmd->cmd_complete = pairing_complete;
+
+ /* For LE, just connecting isn't a proof that the pairing finished */
+ if (cp->addr.type == BDADDR_BREDR) {
+ conn->connect_cfm_cb = pairing_complete_cb;
+ conn->security_cfm_cb = pairing_complete_cb;
+ conn->disconn_cfm_cb = pairing_complete_cb;
+ } else {
+ conn->connect_cfm_cb = le_pairing_complete_cb;
+ conn->security_cfm_cb = le_pairing_complete_cb;
+ conn->disconn_cfm_cb = le_pairing_complete_cb;
+ }
+
+ conn->io_capability = cp->io_cap;
+ cmd->user_data = hci_conn_get(conn);
+
+ if ((conn->state == BT_CONNECTED || conn->state == BT_CONFIG) &&
+ hci_conn_security(conn, sec_level, auth_type, true)) {
+ cmd->cmd_complete(cmd, 0);
+ mgmt_pending_remove(cmd);
+ }
+
+ err = 0;
+
+unlock:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static int cancel_pair_device(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 len)
+{
+ struct mgmt_addr_info *addr = data;
+ struct mgmt_pending_cmd *cmd;
+ struct hci_conn *conn;
+ int err;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ hci_dev_lock(hdev);
+
+ if (!hdev_is_powered(hdev)) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_CANCEL_PAIR_DEVICE,
+ MGMT_STATUS_NOT_POWERED);
+ goto unlock;
+ }
+
+ cmd = pending_find(MGMT_OP_PAIR_DEVICE, hdev);
+ if (!cmd) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_CANCEL_PAIR_DEVICE,
+ MGMT_STATUS_INVALID_PARAMS);
+ goto unlock;
+ }
+
+ conn = cmd->user_data;
+
+ if (bacmp(&addr->bdaddr, &conn->dst) != 0) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_CANCEL_PAIR_DEVICE,
+ MGMT_STATUS_INVALID_PARAMS);
+ goto unlock;
+ }
+
+ cmd->cmd_complete(cmd, MGMT_STATUS_CANCELLED);
+ mgmt_pending_remove(cmd);
+
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_CANCEL_PAIR_DEVICE, 0,
+ addr, sizeof(*addr));
+
+ /* Since user doesn't want to proceed with the connection, abort any
+ * ongoing pairing and then terminate the link if it was created
+ * because of the pair device action.
+ */
+ if (addr->type == BDADDR_BREDR)
+ hci_remove_link_key(hdev, &addr->bdaddr);
+ else
+ smp_cancel_and_remove_pairing(hdev, &addr->bdaddr,
+ le_addr_type(addr->type));
+
+ if (conn->conn_reason == CONN_REASON_PAIR_DEVICE)
+ hci_abort_conn(conn, HCI_ERROR_REMOTE_USER_TERM);
+
+unlock:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static int user_pairing_resp(struct sock *sk, struct hci_dev *hdev,
+ struct mgmt_addr_info *addr, u16 mgmt_op,
+ u16 hci_op, __le32 passkey)
+{
+ struct mgmt_pending_cmd *cmd;
+ struct hci_conn *conn;
+ int err;
+
+ hci_dev_lock(hdev);
+
+ if (!hdev_is_powered(hdev)) {
+ err = mgmt_cmd_complete(sk, hdev->id, mgmt_op,
+ MGMT_STATUS_NOT_POWERED, addr,
+ sizeof(*addr));
+ goto done;
+ }
+
+ if (addr->type == BDADDR_BREDR)
+ conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &addr->bdaddr);
+ else
+ conn = hci_conn_hash_lookup_le(hdev, &addr->bdaddr,
+ le_addr_type(addr->type));
+
+ if (!conn) {
+ err = mgmt_cmd_complete(sk, hdev->id, mgmt_op,
+ MGMT_STATUS_NOT_CONNECTED, addr,
+ sizeof(*addr));
+ goto done;
+ }
+
+ if (addr->type == BDADDR_LE_PUBLIC || addr->type == BDADDR_LE_RANDOM) {
+ err = smp_user_confirm_reply(conn, mgmt_op, passkey);
+ if (!err)
+ err = mgmt_cmd_complete(sk, hdev->id, mgmt_op,
+ MGMT_STATUS_SUCCESS, addr,
+ sizeof(*addr));
+ else
+ err = mgmt_cmd_complete(sk, hdev->id, mgmt_op,
+ MGMT_STATUS_FAILED, addr,
+ sizeof(*addr));
+
+ goto done;
+ }
+
+ cmd = mgmt_pending_add(sk, mgmt_op, hdev, addr, sizeof(*addr));
+ if (!cmd) {
+ err = -ENOMEM;
+ goto done;
+ }
+
+ cmd->cmd_complete = addr_cmd_complete;
+
+ /* Continue with pairing via HCI */
+ if (hci_op == HCI_OP_USER_PASSKEY_REPLY) {
+ struct hci_cp_user_passkey_reply cp;
+
+ bacpy(&cp.bdaddr, &addr->bdaddr);
+ cp.passkey = passkey;
+ err = hci_send_cmd(hdev, hci_op, sizeof(cp), &cp);
+ } else
+ err = hci_send_cmd(hdev, hci_op, sizeof(addr->bdaddr),
+ &addr->bdaddr);
+
+ if (err < 0)
+ mgmt_pending_remove(cmd);
+
+done:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static int pin_code_neg_reply(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 len)
+{
+ struct mgmt_cp_pin_code_neg_reply *cp = data;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ return user_pairing_resp(sk, hdev, &cp->addr,
+ MGMT_OP_PIN_CODE_NEG_REPLY,
+ HCI_OP_PIN_CODE_NEG_REPLY, 0);
+}
+
+static int user_confirm_reply(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 len)
+{
+ struct mgmt_cp_user_confirm_reply *cp = data;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ if (len != sizeof(*cp))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_USER_CONFIRM_REPLY,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ return user_pairing_resp(sk, hdev, &cp->addr,
+ MGMT_OP_USER_CONFIRM_REPLY,
+ HCI_OP_USER_CONFIRM_REPLY, 0);
+}
+
+static int user_confirm_neg_reply(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 len)
+{
+ struct mgmt_cp_user_confirm_neg_reply *cp = data;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ return user_pairing_resp(sk, hdev, &cp->addr,
+ MGMT_OP_USER_CONFIRM_NEG_REPLY,
+ HCI_OP_USER_CONFIRM_NEG_REPLY, 0);
+}
+
+static int user_passkey_reply(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 len)
+{
+ struct mgmt_cp_user_passkey_reply *cp = data;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ return user_pairing_resp(sk, hdev, &cp->addr,
+ MGMT_OP_USER_PASSKEY_REPLY,
+ HCI_OP_USER_PASSKEY_REPLY, cp->passkey);
+}
+
+static int user_passkey_neg_reply(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 len)
+{
+ struct mgmt_cp_user_passkey_neg_reply *cp = data;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ return user_pairing_resp(sk, hdev, &cp->addr,
+ MGMT_OP_USER_PASSKEY_NEG_REPLY,
+ HCI_OP_USER_PASSKEY_NEG_REPLY, 0);
+}
+
+static int adv_expire_sync(struct hci_dev *hdev, u32 flags)
+{
+ struct adv_info *adv_instance;
+
+ adv_instance = hci_find_adv_instance(hdev, hdev->cur_adv_instance);
+ if (!adv_instance)
+ return 0;
+
+ /* stop if current instance doesn't need to be changed */
+ if (!(adv_instance->flags & flags))
+ return 0;
+
+ cancel_adv_timeout(hdev);
+
+ adv_instance = hci_get_next_instance(hdev, adv_instance->instance);
+ if (!adv_instance)
+ return 0;
+
+ hci_schedule_adv_instance_sync(hdev, adv_instance->instance, true);
+
+ return 0;
+}
+
+static int name_changed_sync(struct hci_dev *hdev, void *data)
+{
+ return adv_expire_sync(hdev, MGMT_ADV_FLAG_LOCAL_NAME);
+}
+
+static void set_name_complete(struct hci_dev *hdev, void *data, int err)
+{
+ struct mgmt_pending_cmd *cmd = data;
+ struct mgmt_cp_set_local_name *cp;
+ u8 status = mgmt_status(err);
+
+ bt_dev_dbg(hdev, "err %d", err);
+
+ if (err == -ECANCELED || !mgmt_pending_valid(hdev, cmd))
+ return;
+
+ cp = cmd->param;
+
+ if (status) {
+ mgmt_cmd_status(cmd->sk, hdev->id, MGMT_OP_SET_LOCAL_NAME,
+ status);
+ } else {
+ mgmt_cmd_complete(cmd->sk, hdev->id, MGMT_OP_SET_LOCAL_NAME, 0,
+ cp, sizeof(*cp));
+
+ if (hci_dev_test_flag(hdev, HCI_LE_ADV))
+ hci_cmd_sync_queue(hdev, name_changed_sync, NULL, NULL);
+ }
+
+ mgmt_pending_free(cmd);
+}
+
+static int set_name_sync(struct hci_dev *hdev, void *data)
+{
+ struct mgmt_pending_cmd *cmd = data;
+ struct mgmt_cp_set_local_name cp;
+
+ mutex_lock(&hdev->mgmt_pending_lock);
+
+ if (!__mgmt_pending_listed(hdev, cmd)) {
+ mutex_unlock(&hdev->mgmt_pending_lock);
+ return -ECANCELED;
+ }
+
+ memcpy(&cp, cmd->param, sizeof(cp));
+
+ mutex_unlock(&hdev->mgmt_pending_lock);
+
+ if (lmp_bredr_capable(hdev)) {
+ hci_update_name_sync(hdev, cp.name);
+ hci_update_eir_sync(hdev);
+ }
+
+ /* The name is stored in the scan response data and so
+ * no need to update the advertising data here.
+ */
+ if (lmp_le_capable(hdev) && hci_dev_test_flag(hdev, HCI_ADVERTISING))
+ hci_update_scan_rsp_data_sync(hdev, hdev->cur_adv_instance);
+
+ return 0;
+}
+
+static int set_local_name(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 len)
+{
+ struct mgmt_cp_set_local_name *cp = data;
+ struct mgmt_pending_cmd *cmd;
+ int err;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ hci_dev_lock(hdev);
+
+ /* If the old values are the same as the new ones just return a
+ * direct command complete event.
+ */
+ if (!memcmp(hdev->dev_name, cp->name, sizeof(hdev->dev_name)) &&
+ !memcmp(hdev->short_name, cp->short_name,
+ sizeof(hdev->short_name))) {
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_LOCAL_NAME, 0,
+ data, len);
+ goto failed;
+ }
+
+ memcpy(hdev->short_name, cp->short_name, sizeof(hdev->short_name));
+
+ if (!hdev_is_powered(hdev)) {
+ memcpy(hdev->dev_name, cp->name, sizeof(hdev->dev_name));
+
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_LOCAL_NAME, 0,
+ data, len);
+ if (err < 0)
+ goto failed;
+
+ err = mgmt_limited_event(MGMT_EV_LOCAL_NAME_CHANGED, hdev, data,
+ len, HCI_MGMT_LOCAL_NAME_EVENTS, sk);
+ ext_info_changed(hdev, sk);
+
+ goto failed;
+ }
+
+ cmd = mgmt_pending_add(sk, MGMT_OP_SET_LOCAL_NAME, hdev, data, len);
+ if (!cmd)
+ err = -ENOMEM;
+ else
+ err = hci_cmd_sync_queue(hdev, set_name_sync, cmd,
+ set_name_complete);
+
+ if (err < 0) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_LOCAL_NAME,
+ MGMT_STATUS_FAILED);
+
+ if (cmd)
+ mgmt_pending_remove(cmd);
+
+ goto failed;
+ }
+
+ memcpy(hdev->dev_name, cp->name, sizeof(hdev->dev_name));
+
+failed:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static int appearance_changed_sync(struct hci_dev *hdev, void *data)
+{
+ return adv_expire_sync(hdev, MGMT_ADV_FLAG_APPEARANCE);
+}
+
+static int set_appearance(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 len)
+{
+ struct mgmt_cp_set_appearance *cp = data;
+ u16 appearance;
+ int err;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ if (!lmp_le_capable(hdev))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_APPEARANCE,
+ MGMT_STATUS_NOT_SUPPORTED);
+
+ appearance = le16_to_cpu(cp->appearance);
+
+ hci_dev_lock(hdev);
+
+ if (hdev->appearance != appearance) {
+ hdev->appearance = appearance;
+
+ if (hci_dev_test_flag(hdev, HCI_LE_ADV))
+ hci_cmd_sync_queue(hdev, appearance_changed_sync, NULL,
+ NULL);
+
+ ext_info_changed(hdev, sk);
+ }
+
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_APPEARANCE, 0, NULL,
+ 0);
+
+ hci_dev_unlock(hdev);
+
+ return err;
+}
+
+static int get_phy_configuration(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 len)
+{
+ struct mgmt_rp_get_phy_configuration rp;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ hci_dev_lock(hdev);
+
+ memset(&rp, 0, sizeof(rp));
+
+ rp.supported_phys = cpu_to_le32(get_supported_phys(hdev));
+ rp.selected_phys = cpu_to_le32(get_selected_phys(hdev));
+ rp.configurable_phys = cpu_to_le32(get_configurable_phys(hdev));
+
+ hci_dev_unlock(hdev);
+
+ return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_GET_PHY_CONFIGURATION, 0,
+ &rp, sizeof(rp));
+}
+
+int mgmt_phy_configuration_changed(struct hci_dev *hdev, struct sock *skip)
+{
+ struct mgmt_ev_phy_configuration_changed ev;
+
+ memset(&ev, 0, sizeof(ev));
+
+ ev.selected_phys = cpu_to_le32(get_selected_phys(hdev));
+
+ return mgmt_event(MGMT_EV_PHY_CONFIGURATION_CHANGED, hdev, &ev,
+ sizeof(ev), skip);
+}
+
+static void set_default_phy_complete(struct hci_dev *hdev, void *data, int err)
+{
+ struct mgmt_pending_cmd *cmd = data;
+ struct sk_buff *skb;
+ u8 status = mgmt_status(err);
+
+ skb = cmd->skb;
+
+ if (!status) {
+ if (!skb)
+ status = MGMT_STATUS_FAILED;
+ else if (IS_ERR(skb))
+ status = mgmt_status(PTR_ERR(skb));
+ else
+ status = mgmt_status(skb->data[0]);
+ }
+
+ bt_dev_dbg(hdev, "status %d", status);
+
+ if (status) {
+ mgmt_cmd_status(cmd->sk, hdev->id,
+ MGMT_OP_SET_PHY_CONFIGURATION, status);
+ } else {
+ mgmt_cmd_complete(cmd->sk, hdev->id,
+ MGMT_OP_SET_PHY_CONFIGURATION, 0,
+ NULL, 0);
+
+ mgmt_phy_configuration_changed(hdev, cmd->sk);
+ }
+
+ if (skb && !IS_ERR(skb))
+ kfree_skb(skb);
+
+ mgmt_pending_free(cmd);
+}
+
+static int set_default_phy_sync(struct hci_dev *hdev, void *data)
+{
+ struct mgmt_pending_cmd *cmd = data;
+ struct mgmt_cp_set_phy_configuration *cp = cmd->param;
+ struct hci_cp_le_set_default_phy cp_phy;
+ u32 selected_phys;
+
+ selected_phys = __le32_to_cpu(cp->selected_phys);
+
+ memset(&cp_phy, 0, sizeof(cp_phy));
+
+ if (!(selected_phys & MGMT_PHY_LE_TX_MASK))
+ cp_phy.all_phys |= 0x01;
+
+ if (!(selected_phys & MGMT_PHY_LE_RX_MASK))
+ cp_phy.all_phys |= 0x02;
+
+ if (selected_phys & MGMT_PHY_LE_1M_TX)
+ cp_phy.tx_phys |= HCI_LE_SET_PHY_1M;
+
+ if (selected_phys & MGMT_PHY_LE_2M_TX)
+ cp_phy.tx_phys |= HCI_LE_SET_PHY_2M;
+
+ if (selected_phys & MGMT_PHY_LE_CODED_TX)
+ cp_phy.tx_phys |= HCI_LE_SET_PHY_CODED;
+
+ if (selected_phys & MGMT_PHY_LE_1M_RX)
+ cp_phy.rx_phys |= HCI_LE_SET_PHY_1M;
+
+ if (selected_phys & MGMT_PHY_LE_2M_RX)
+ cp_phy.rx_phys |= HCI_LE_SET_PHY_2M;
+
+ if (selected_phys & MGMT_PHY_LE_CODED_RX)
+ cp_phy.rx_phys |= HCI_LE_SET_PHY_CODED;
+
+ cmd->skb = __hci_cmd_sync(hdev, HCI_OP_LE_SET_DEFAULT_PHY,
+ sizeof(cp_phy), &cp_phy, HCI_CMD_TIMEOUT);
+
+ return 0;
+}
+
+static int set_phy_configuration(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 len)
+{
+ struct mgmt_cp_set_phy_configuration *cp = data;
+ struct mgmt_pending_cmd *cmd;
+ u32 selected_phys, configurable_phys, supported_phys, unconfigure_phys;
+ u16 pkt_type = (HCI_DH1 | HCI_DM1);
+ bool changed = false;
+ int err;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ configurable_phys = get_configurable_phys(hdev);
+ supported_phys = get_supported_phys(hdev);
+ selected_phys = __le32_to_cpu(cp->selected_phys);
+
+ if (selected_phys & ~supported_phys)
+ return mgmt_cmd_status(sk, hdev->id,
+ MGMT_OP_SET_PHY_CONFIGURATION,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ unconfigure_phys = supported_phys & ~configurable_phys;
+
+ if ((selected_phys & unconfigure_phys) != unconfigure_phys)
+ return mgmt_cmd_status(sk, hdev->id,
+ MGMT_OP_SET_PHY_CONFIGURATION,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ if (selected_phys == get_selected_phys(hdev))
+ return mgmt_cmd_complete(sk, hdev->id,
+ MGMT_OP_SET_PHY_CONFIGURATION,
+ 0, NULL, 0);
+
+ hci_dev_lock(hdev);
+
+ if (!hdev_is_powered(hdev)) {
+ err = mgmt_cmd_status(sk, hdev->id,
+ MGMT_OP_SET_PHY_CONFIGURATION,
+ MGMT_STATUS_REJECTED);
+ goto unlock;
+ }
+
+ if (pending_find(MGMT_OP_SET_PHY_CONFIGURATION, hdev)) {
+ err = mgmt_cmd_status(sk, hdev->id,
+ MGMT_OP_SET_PHY_CONFIGURATION,
+ MGMT_STATUS_BUSY);
+ goto unlock;
+ }
+
+ if (selected_phys & MGMT_PHY_BR_1M_3SLOT)
+ pkt_type |= (HCI_DH3 | HCI_DM3);
+ else
+ pkt_type &= ~(HCI_DH3 | HCI_DM3);
+
+ if (selected_phys & MGMT_PHY_BR_1M_5SLOT)
+ pkt_type |= (HCI_DH5 | HCI_DM5);
+ else
+ pkt_type &= ~(HCI_DH5 | HCI_DM5);
+
+ if (selected_phys & MGMT_PHY_EDR_2M_1SLOT)
+ pkt_type &= ~HCI_2DH1;
+ else
+ pkt_type |= HCI_2DH1;
+
+ if (selected_phys & MGMT_PHY_EDR_2M_3SLOT)
+ pkt_type &= ~HCI_2DH3;
+ else
+ pkt_type |= HCI_2DH3;
+
+ if (selected_phys & MGMT_PHY_EDR_2M_5SLOT)
+ pkt_type &= ~HCI_2DH5;
+ else
+ pkt_type |= HCI_2DH5;
+
+ if (selected_phys & MGMT_PHY_EDR_3M_1SLOT)
+ pkt_type &= ~HCI_3DH1;
+ else
+ pkt_type |= HCI_3DH1;
+
+ if (selected_phys & MGMT_PHY_EDR_3M_3SLOT)
+ pkt_type &= ~HCI_3DH3;
+ else
+ pkt_type |= HCI_3DH3;
+
+ if (selected_phys & MGMT_PHY_EDR_3M_5SLOT)
+ pkt_type &= ~HCI_3DH5;
+ else
+ pkt_type |= HCI_3DH5;
+
+ if (pkt_type != hdev->pkt_type) {
+ hdev->pkt_type = pkt_type;
+ changed = true;
+ }
+
+ if ((selected_phys & MGMT_PHY_LE_MASK) ==
+ (get_selected_phys(hdev) & MGMT_PHY_LE_MASK)) {
+ if (changed)
+ mgmt_phy_configuration_changed(hdev, sk);
+
+ err = mgmt_cmd_complete(sk, hdev->id,
+ MGMT_OP_SET_PHY_CONFIGURATION,
+ 0, NULL, 0);
+
+ goto unlock;
+ }
+
+ cmd = mgmt_pending_new(sk, MGMT_OP_SET_PHY_CONFIGURATION, hdev, data,
+ len);
+ if (!cmd)
+ err = -ENOMEM;
+ else
+ err = hci_cmd_sync_queue(hdev, set_default_phy_sync, cmd,
+ set_default_phy_complete);
+
+ if (err < 0) {
+ err = mgmt_cmd_status(sk, hdev->id,
+ MGMT_OP_SET_PHY_CONFIGURATION,
+ MGMT_STATUS_FAILED);
+
+ if (cmd)
+ mgmt_pending_remove(cmd);
+ }
+
+unlock:
+ hci_dev_unlock(hdev);
+
+ return err;
+}
+
+static int set_blocked_keys(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 len)
+{
+ int err = MGMT_STATUS_SUCCESS;
+ struct mgmt_cp_set_blocked_keys *keys = data;
+ const u16 max_key_count = ((U16_MAX - sizeof(*keys)) /
+ sizeof(struct mgmt_blocked_key_info));
+ u16 key_count, expected_len;
+ int i;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ key_count = __le16_to_cpu(keys->key_count);
+ if (key_count > max_key_count) {
+ bt_dev_err(hdev, "too big key_count value %u", key_count);
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_BLOCKED_KEYS,
+ MGMT_STATUS_INVALID_PARAMS);
+ }
+
+ expected_len = struct_size(keys, keys, key_count);
+ if (expected_len != len) {
+ bt_dev_err(hdev, "expected %u bytes, got %u bytes",
+ expected_len, len);
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_BLOCKED_KEYS,
+ MGMT_STATUS_INVALID_PARAMS);
+ }
+
+ hci_dev_lock(hdev);
+
+ hci_blocked_keys_clear(hdev);
+
+ for (i = 0; i < key_count; ++i) {
+ struct blocked_key *b = kzalloc(sizeof(*b), GFP_KERNEL);
+
+ if (!b) {
+ err = MGMT_STATUS_NO_RESOURCES;
+ break;
+ }
+
+ b->type = keys->keys[i].type;
+ memcpy(b->val, keys->keys[i].val, sizeof(b->val));
+ list_add_rcu(&b->list, &hdev->blocked_keys);
+ }
+ hci_dev_unlock(hdev);
+
+ return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_BLOCKED_KEYS,
+ err, NULL, 0);
+}
+
+static int set_wideband_speech(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 len)
+{
+ struct mgmt_mode *cp = data;
+ int err;
+ bool changed = false;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ if (!hci_test_quirk(hdev, HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED))
+ return mgmt_cmd_status(sk, hdev->id,
+ MGMT_OP_SET_WIDEBAND_SPEECH,
+ MGMT_STATUS_NOT_SUPPORTED);
+
+ if (cp->val != 0x00 && cp->val != 0x01)
+ return mgmt_cmd_status(sk, hdev->id,
+ MGMT_OP_SET_WIDEBAND_SPEECH,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ hci_dev_lock(hdev);
+
+ if (hdev_is_powered(hdev) &&
+ !!cp->val != hci_dev_test_flag(hdev,
+ HCI_WIDEBAND_SPEECH_ENABLED)) {
+ err = mgmt_cmd_status(sk, hdev->id,
+ MGMT_OP_SET_WIDEBAND_SPEECH,
+ MGMT_STATUS_REJECTED);
+ goto unlock;
+ }
+
+ if (cp->val)
+ changed = !hci_dev_test_and_set_flag(hdev,
+ HCI_WIDEBAND_SPEECH_ENABLED);
+ else
+ changed = hci_dev_test_and_clear_flag(hdev,
+ HCI_WIDEBAND_SPEECH_ENABLED);
+
+ err = send_settings_rsp(sk, MGMT_OP_SET_WIDEBAND_SPEECH, hdev);
+ if (err < 0)
+ goto unlock;
+
+ if (changed)
+ err = new_settings(hdev, sk);
+
+unlock:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static int read_controller_cap(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 data_len)
+{
+ char buf[20];
+ struct mgmt_rp_read_controller_cap *rp = (void *)buf;
+ u16 cap_len = 0;
+ u8 flags = 0;
+ u8 tx_power_range[2];
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ memset(&buf, 0, sizeof(buf));
+
+ hci_dev_lock(hdev);
+
+ /* When the Read Simple Pairing Options command is supported, then
+ * the remote public key validation is supported.
+ *
+ * Alternatively, when Microsoft extensions are available, they can
+ * indicate support for public key validation as well.
+ */
+ if ((hdev->commands[41] & 0x08) || msft_curve_validity(hdev))
+ flags |= 0x01; /* Remote public key validation (BR/EDR) */
+
+ flags |= 0x02; /* Remote public key validation (LE) */
+
+ /* When the Read Encryption Key Size command is supported, then the
+ * encryption key size is enforced.
+ */
+ if (hdev->commands[20] & 0x10)
+ flags |= 0x04; /* Encryption key size enforcement (BR/EDR) */
+
+ flags |= 0x08; /* Encryption key size enforcement (LE) */
+
+ cap_len = eir_append_data(rp->cap, cap_len, MGMT_CAP_SEC_FLAGS,
+ &flags, 1);
+
+ /* When the Read Simple Pairing Options command is supported, then
+ * also max encryption key size information is provided.
+ */
+ if (hdev->commands[41] & 0x08)
+ cap_len = eir_append_le16(rp->cap, cap_len,
+ MGMT_CAP_MAX_ENC_KEY_SIZE,
+ hdev->max_enc_key_size);
+
+ cap_len = eir_append_le16(rp->cap, cap_len,
+ MGMT_CAP_SMP_MAX_ENC_KEY_SIZE,
+ SMP_MAX_ENC_KEY_SIZE);
+
+ /* Append the min/max LE tx power parameters if we were able to fetch
+ * it from the controller
+ */
+ if (hdev->commands[38] & 0x80) {
+ memcpy(&tx_power_range[0], &hdev->min_le_tx_power, 1);
+ memcpy(&tx_power_range[1], &hdev->max_le_tx_power, 1);
+ cap_len = eir_append_data(rp->cap, cap_len, MGMT_CAP_LE_TX_PWR,
+ tx_power_range, 2);
+ }
+
+ rp->cap_len = cpu_to_le16(cap_len);
+
+ hci_dev_unlock(hdev);
+
+ return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_READ_CONTROLLER_CAP, 0,
+ rp, sizeof(*rp) + cap_len);
+}
+
+#ifdef CONFIG_BT_FEATURE_DEBUG
+/* d4992530-b9ec-469f-ab01-6c481c47da1c */
+static const u8 debug_uuid[16] = {
+ 0x1c, 0xda, 0x47, 0x1c, 0x48, 0x6c, 0x01, 0xab,
+ 0x9f, 0x46, 0xec, 0xb9, 0x30, 0x25, 0x99, 0xd4,
+};
+#endif
+
+/* 330859bc-7506-492d-9370-9a6f0614037f */
+static const u8 quality_report_uuid[16] = {
+ 0x7f, 0x03, 0x14, 0x06, 0x6f, 0x9a, 0x70, 0x93,
+ 0x2d, 0x49, 0x06, 0x75, 0xbc, 0x59, 0x08, 0x33,
+};
+
+/* a6695ace-ee7f-4fb9-881a-5fac66c629af */
+static const u8 offload_codecs_uuid[16] = {
+ 0xaf, 0x29, 0xc6, 0x66, 0xac, 0x5f, 0x1a, 0x88,
+ 0xb9, 0x4f, 0x7f, 0xee, 0xce, 0x5a, 0x69, 0xa6,
+};
+
+/* 671b10b5-42c0-4696-9227-eb28d1b049d6 */
+static const u8 le_simultaneous_roles_uuid[16] = {
+ 0xd6, 0x49, 0xb0, 0xd1, 0x28, 0xeb, 0x27, 0x92,
+ 0x96, 0x46, 0xc0, 0x42, 0xb5, 0x10, 0x1b, 0x67,
+};
+
+/* 6fbaf188-05e0-496a-9885-d6ddfdb4e03e */
+static const u8 iso_socket_uuid[16] = {
+ 0x3e, 0xe0, 0xb4, 0xfd, 0xdd, 0xd6, 0x85, 0x98,
+ 0x6a, 0x49, 0xe0, 0x05, 0x88, 0xf1, 0xba, 0x6f,
+};
+
+/* 2ce463d7-7a03-4d8d-bf05-5f24e8f36e76 */
+static const u8 mgmt_mesh_uuid[16] = {
+ 0x76, 0x6e, 0xf3, 0xe8, 0x24, 0x5f, 0x05, 0xbf,
+ 0x8d, 0x4d, 0x03, 0x7a, 0xd7, 0x63, 0xe4, 0x2c,
+};
+
+static int read_exp_features_info(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 data_len)
+{
+ struct mgmt_rp_read_exp_features_info *rp;
+ size_t len;
+ u16 idx = 0;
+ u32 flags;
+ int status;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ /* Enough space for 7 features */
+ len = sizeof(*rp) + (sizeof(rp->features[0]) * 7);
+ rp = kzalloc(len, GFP_KERNEL);
+ if (!rp)
+ return -ENOMEM;
+
+#ifdef CONFIG_BT_FEATURE_DEBUG
+ flags = bt_dbg_get() ? BIT(0) : 0;
+
+ memcpy(rp->features[idx].uuid, debug_uuid, 16);
+ rp->features[idx].flags = cpu_to_le32(flags);
+ idx++;
+#endif
+
+ if (hdev && hci_dev_le_state_simultaneous(hdev)) {
+ if (hci_dev_test_flag(hdev, HCI_LE_SIMULTANEOUS_ROLES))
+ flags = BIT(0);
+ else
+ flags = 0;
+
+ memcpy(rp->features[idx].uuid, le_simultaneous_roles_uuid, 16);
+ rp->features[idx].flags = cpu_to_le32(flags);
+ idx++;
+ }
+
+ if (hdev && (aosp_has_quality_report(hdev) ||
+ hdev->set_quality_report)) {
+ if (hci_dev_test_flag(hdev, HCI_QUALITY_REPORT))
+ flags = BIT(0);
+ else
+ flags = 0;
+
+ memcpy(rp->features[idx].uuid, quality_report_uuid, 16);
+ rp->features[idx].flags = cpu_to_le32(flags);
+ idx++;
+ }
+
+ if (hdev && hdev->get_data_path_id) {
+ if (hci_dev_test_flag(hdev, HCI_OFFLOAD_CODECS_ENABLED))
+ flags = BIT(0);
+ else
+ flags = 0;
+
+ memcpy(rp->features[idx].uuid, offload_codecs_uuid, 16);
+ rp->features[idx].flags = cpu_to_le32(flags);
+ idx++;
+ }
+
+ if (IS_ENABLED(CONFIG_BT_LE)) {
+ flags = iso_inited() ? BIT(0) : 0;
+ memcpy(rp->features[idx].uuid, iso_socket_uuid, 16);
+ rp->features[idx].flags = cpu_to_le32(flags);
+ idx++;
+ }
+
+ if (hdev && lmp_le_capable(hdev)) {
+ if (hci_dev_test_flag(hdev, HCI_MESH_EXPERIMENTAL))
+ flags = BIT(0);
+ else
+ flags = 0;
+
+ memcpy(rp->features[idx].uuid, mgmt_mesh_uuid, 16);
+ rp->features[idx].flags = cpu_to_le32(flags);
+ idx++;
+ }
+
+ rp->feature_count = cpu_to_le16(idx);
+
+ /* After reading the experimental features information, enable
+ * the events to update client on any future change.
+ */
+ hci_sock_set_flag(sk, HCI_MGMT_EXP_FEATURE_EVENTS);
+
+ status = mgmt_cmd_complete(sk, hdev ? hdev->id : MGMT_INDEX_NONE,
+ MGMT_OP_READ_EXP_FEATURES_INFO,
+ 0, rp, sizeof(*rp) + (20 * idx));
+
+ kfree(rp);
+ return status;
+}
+
+static int exp_feature_changed(struct hci_dev *hdev, const u8 *uuid,
+ bool enabled, struct sock *skip)
+{
+ struct mgmt_ev_exp_feature_changed ev;
+
+ memset(&ev, 0, sizeof(ev));
+ memcpy(ev.uuid, uuid, 16);
+ ev.flags = cpu_to_le32(enabled ? BIT(0) : 0);
+
+ return mgmt_limited_event(MGMT_EV_EXP_FEATURE_CHANGED, hdev,
+ &ev, sizeof(ev),
+ HCI_MGMT_EXP_FEATURE_EVENTS, skip);
+}
+
+#define EXP_FEAT(_uuid, _set_func) \
+{ \
+ .uuid = _uuid, \
+ .set_func = _set_func, \
+}
+
+/* The zero key uuid is special. Multiple exp features are set through it. */
+static int set_zero_key_func(struct sock *sk, struct hci_dev *hdev,
+ struct mgmt_cp_set_exp_feature *cp, u16 data_len)
+{
+ struct mgmt_rp_set_exp_feature rp;
+
+ memset(rp.uuid, 0, 16);
+ rp.flags = cpu_to_le32(0);
+
+#ifdef CONFIG_BT_FEATURE_DEBUG
+ if (!hdev) {
+ bool changed = bt_dbg_get();
+
+ bt_dbg_set(false);
+
+ if (changed)
+ exp_feature_changed(NULL, ZERO_KEY, false, sk);
+ }
+#endif
+
+ hci_sock_set_flag(sk, HCI_MGMT_EXP_FEATURE_EVENTS);
+
+ return mgmt_cmd_complete(sk, hdev ? hdev->id : MGMT_INDEX_NONE,
+ MGMT_OP_SET_EXP_FEATURE, 0,
+ &rp, sizeof(rp));
+}
+
+#ifdef CONFIG_BT_FEATURE_DEBUG
+static int set_debug_func(struct sock *sk, struct hci_dev *hdev,
+ struct mgmt_cp_set_exp_feature *cp, u16 data_len)
+{
+ struct mgmt_rp_set_exp_feature rp;
+
+ bool val, changed;
+ int err;
+
+ /* Command requires to use the non-controller index */
+ if (hdev)
+ return mgmt_cmd_status(sk, hdev->id,
+ MGMT_OP_SET_EXP_FEATURE,
+ MGMT_STATUS_INVALID_INDEX);
+
+ /* Parameters are limited to a single octet */
+ if (data_len != MGMT_SET_EXP_FEATURE_SIZE + 1)
+ return mgmt_cmd_status(sk, MGMT_INDEX_NONE,
+ MGMT_OP_SET_EXP_FEATURE,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ /* Only boolean on/off is supported */
+ if (cp->param[0] != 0x00 && cp->param[0] != 0x01)
+ return mgmt_cmd_status(sk, MGMT_INDEX_NONE,
+ MGMT_OP_SET_EXP_FEATURE,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ val = !!cp->param[0];
+ changed = val ? !bt_dbg_get() : bt_dbg_get();
+ bt_dbg_set(val);
+
+ memcpy(rp.uuid, debug_uuid, 16);
+ rp.flags = cpu_to_le32(val ? BIT(0) : 0);
+
+ hci_sock_set_flag(sk, HCI_MGMT_EXP_FEATURE_EVENTS);
+
+ err = mgmt_cmd_complete(sk, MGMT_INDEX_NONE,
+ MGMT_OP_SET_EXP_FEATURE, 0,
+ &rp, sizeof(rp));
+
+ if (changed)
+ exp_feature_changed(hdev, debug_uuid, val, sk);
+
+ return err;
+}
+#endif
+
+static int set_mgmt_mesh_func(struct sock *sk, struct hci_dev *hdev,
+ struct mgmt_cp_set_exp_feature *cp, u16 data_len)
+{
+ struct mgmt_rp_set_exp_feature rp;
+ bool val, changed;
+ int err;
+
+ /* Command requires to use the controller index */
+ if (!hdev)
+ return mgmt_cmd_status(sk, MGMT_INDEX_NONE,
+ MGMT_OP_SET_EXP_FEATURE,
+ MGMT_STATUS_INVALID_INDEX);
+
+ /* Parameters are limited to a single octet */
+ if (data_len != MGMT_SET_EXP_FEATURE_SIZE + 1)
+ return mgmt_cmd_status(sk, hdev->id,
+ MGMT_OP_SET_EXP_FEATURE,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ /* Only boolean on/off is supported */
+ if (cp->param[0] != 0x00 && cp->param[0] != 0x01)
+ return mgmt_cmd_status(sk, hdev->id,
+ MGMT_OP_SET_EXP_FEATURE,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ val = !!cp->param[0];
+
+ if (val) {
+ changed = !hci_dev_test_and_set_flag(hdev,
+ HCI_MESH_EXPERIMENTAL);
+ } else {
+ hci_dev_clear_flag(hdev, HCI_MESH);
+ changed = hci_dev_test_and_clear_flag(hdev,
+ HCI_MESH_EXPERIMENTAL);
+ }
+
+ memcpy(rp.uuid, mgmt_mesh_uuid, 16);
+ rp.flags = cpu_to_le32(val ? BIT(0) : 0);
+
+ hci_sock_set_flag(sk, HCI_MGMT_EXP_FEATURE_EVENTS);
+
+ err = mgmt_cmd_complete(sk, hdev->id,
+ MGMT_OP_SET_EXP_FEATURE, 0,
+ &rp, sizeof(rp));
+
+ if (changed)
+ exp_feature_changed(hdev, mgmt_mesh_uuid, val, sk);
+
+ return err;
+}
+
+static int set_quality_report_func(struct sock *sk, struct hci_dev *hdev,
+ struct mgmt_cp_set_exp_feature *cp,
+ u16 data_len)
+{
+ struct mgmt_rp_set_exp_feature rp;
+ bool val, changed;
+ int err;
+
+ /* Command requires to use a valid controller index */
+ if (!hdev)
+ return mgmt_cmd_status(sk, MGMT_INDEX_NONE,
+ MGMT_OP_SET_EXP_FEATURE,
+ MGMT_STATUS_INVALID_INDEX);
+
+ /* Parameters are limited to a single octet */
+ if (data_len != MGMT_SET_EXP_FEATURE_SIZE + 1)
+ return mgmt_cmd_status(sk, hdev->id,
+ MGMT_OP_SET_EXP_FEATURE,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ /* Only boolean on/off is supported */
+ if (cp->param[0] != 0x00 && cp->param[0] != 0x01)
+ return mgmt_cmd_status(sk, hdev->id,
+ MGMT_OP_SET_EXP_FEATURE,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ hci_req_sync_lock(hdev);
+
+ val = !!cp->param[0];
+ changed = (val != hci_dev_test_flag(hdev, HCI_QUALITY_REPORT));
+
+ if (!aosp_has_quality_report(hdev) && !hdev->set_quality_report) {
+ err = mgmt_cmd_status(sk, hdev->id,
+ MGMT_OP_SET_EXP_FEATURE,
+ MGMT_STATUS_NOT_SUPPORTED);
+ goto unlock_quality_report;
+ }
+
+ if (changed) {
+ if (hdev->set_quality_report)
+ err = hdev->set_quality_report(hdev, val);
+ else
+ err = aosp_set_quality_report(hdev, val);
+
+ if (err) {
+ err = mgmt_cmd_status(sk, hdev->id,
+ MGMT_OP_SET_EXP_FEATURE,
+ MGMT_STATUS_FAILED);
+ goto unlock_quality_report;
+ }
+
+ if (val)
+ hci_dev_set_flag(hdev, HCI_QUALITY_REPORT);
+ else
+ hci_dev_clear_flag(hdev, HCI_QUALITY_REPORT);
+ }
+
+ bt_dev_dbg(hdev, "quality report enable %d changed %d", val, changed);
+
+ memcpy(rp.uuid, quality_report_uuid, 16);
+ rp.flags = cpu_to_le32(val ? BIT(0) : 0);
+ hci_sock_set_flag(sk, HCI_MGMT_EXP_FEATURE_EVENTS);
+
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_EXP_FEATURE, 0,
+ &rp, sizeof(rp));
+
+ if (changed)
+ exp_feature_changed(hdev, quality_report_uuid, val, sk);
+
+unlock_quality_report:
+ hci_req_sync_unlock(hdev);
+ return err;
+}
+
+static int set_offload_codec_func(struct sock *sk, struct hci_dev *hdev,
+ struct mgmt_cp_set_exp_feature *cp,
+ u16 data_len)
+{
+ bool val, changed;
+ int err;
+ struct mgmt_rp_set_exp_feature rp;
+
+ /* Command requires to use a valid controller index */
+ if (!hdev)
+ return mgmt_cmd_status(sk, MGMT_INDEX_NONE,
+ MGMT_OP_SET_EXP_FEATURE,
+ MGMT_STATUS_INVALID_INDEX);
+
+ /* Parameters are limited to a single octet */
+ if (data_len != MGMT_SET_EXP_FEATURE_SIZE + 1)
+ return mgmt_cmd_status(sk, hdev->id,
+ MGMT_OP_SET_EXP_FEATURE,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ /* Only boolean on/off is supported */
+ if (cp->param[0] != 0x00 && cp->param[0] != 0x01)
+ return mgmt_cmd_status(sk, hdev->id,
+ MGMT_OP_SET_EXP_FEATURE,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ val = !!cp->param[0];
+ changed = (val != hci_dev_test_flag(hdev, HCI_OFFLOAD_CODECS_ENABLED));
+
+ if (!hdev->get_data_path_id) {
+ return mgmt_cmd_status(sk, hdev->id,
+ MGMT_OP_SET_EXP_FEATURE,
+ MGMT_STATUS_NOT_SUPPORTED);
+ }
+
+ if (changed) {
+ if (val)
+ hci_dev_set_flag(hdev, HCI_OFFLOAD_CODECS_ENABLED);
+ else
+ hci_dev_clear_flag(hdev, HCI_OFFLOAD_CODECS_ENABLED);
+ }
+
+ bt_dev_info(hdev, "offload codecs enable %d changed %d",
+ val, changed);
+
+ memcpy(rp.uuid, offload_codecs_uuid, 16);
+ rp.flags = cpu_to_le32(val ? BIT(0) : 0);
+ hci_sock_set_flag(sk, HCI_MGMT_EXP_FEATURE_EVENTS);
+ err = mgmt_cmd_complete(sk, hdev->id,
+ MGMT_OP_SET_EXP_FEATURE, 0,
+ &rp, sizeof(rp));
+
+ if (changed)
+ exp_feature_changed(hdev, offload_codecs_uuid, val, sk);
+
+ return err;
+}
+
+static int set_le_simultaneous_roles_func(struct sock *sk, struct hci_dev *hdev,
+ struct mgmt_cp_set_exp_feature *cp,
+ u16 data_len)
+{
+ bool val, changed;
+ int err;
+ struct mgmt_rp_set_exp_feature rp;
+
+ /* Command requires to use a valid controller index */
+ if (!hdev)
+ return mgmt_cmd_status(sk, MGMT_INDEX_NONE,
+ MGMT_OP_SET_EXP_FEATURE,
+ MGMT_STATUS_INVALID_INDEX);
+
+ /* Parameters are limited to a single octet */
+ if (data_len != MGMT_SET_EXP_FEATURE_SIZE + 1)
+ return mgmt_cmd_status(sk, hdev->id,
+ MGMT_OP_SET_EXP_FEATURE,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ /* Only boolean on/off is supported */
+ if (cp->param[0] != 0x00 && cp->param[0] != 0x01)
+ return mgmt_cmd_status(sk, hdev->id,
+ MGMT_OP_SET_EXP_FEATURE,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ val = !!cp->param[0];
+ changed = (val != hci_dev_test_flag(hdev, HCI_LE_SIMULTANEOUS_ROLES));
+
+ if (!hci_dev_le_state_simultaneous(hdev)) {
+ return mgmt_cmd_status(sk, hdev->id,
+ MGMT_OP_SET_EXP_FEATURE,
+ MGMT_STATUS_NOT_SUPPORTED);
+ }
+
+ if (changed) {
+ if (val)
+ hci_dev_set_flag(hdev, HCI_LE_SIMULTANEOUS_ROLES);
+ else
+ hci_dev_clear_flag(hdev, HCI_LE_SIMULTANEOUS_ROLES);
+ }
+
+ bt_dev_info(hdev, "LE simultaneous roles enable %d changed %d",
+ val, changed);
+
+ memcpy(rp.uuid, le_simultaneous_roles_uuid, 16);
+ rp.flags = cpu_to_le32(val ? BIT(0) : 0);
+ hci_sock_set_flag(sk, HCI_MGMT_EXP_FEATURE_EVENTS);
+ err = mgmt_cmd_complete(sk, hdev->id,
+ MGMT_OP_SET_EXP_FEATURE, 0,
+ &rp, sizeof(rp));
+
+ if (changed)
+ exp_feature_changed(hdev, le_simultaneous_roles_uuid, val, sk);
+
+ return err;
+}
+
+#ifdef CONFIG_BT_LE
+static int set_iso_socket_func(struct sock *sk, struct hci_dev *hdev,
+ struct mgmt_cp_set_exp_feature *cp, u16 data_len)
+{
+ struct mgmt_rp_set_exp_feature rp;
+ bool val, changed = false;
+ int err;
+
+ /* Command requires to use the non-controller index */
+ if (hdev)
+ return mgmt_cmd_status(sk, hdev->id,
+ MGMT_OP_SET_EXP_FEATURE,
+ MGMT_STATUS_INVALID_INDEX);
+
+ /* Parameters are limited to a single octet */
+ if (data_len != MGMT_SET_EXP_FEATURE_SIZE + 1)
+ return mgmt_cmd_status(sk, MGMT_INDEX_NONE,
+ MGMT_OP_SET_EXP_FEATURE,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ /* Only boolean on/off is supported */
+ if (cp->param[0] != 0x00 && cp->param[0] != 0x01)
+ return mgmt_cmd_status(sk, MGMT_INDEX_NONE,
+ MGMT_OP_SET_EXP_FEATURE,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ val = cp->param[0] ? true : false;
+ if (val)
+ err = iso_init();
+ else
+ err = iso_exit();
+
+ if (!err)
+ changed = true;
+
+ memcpy(rp.uuid, iso_socket_uuid, 16);
+ rp.flags = cpu_to_le32(val ? BIT(0) : 0);
+
+ hci_sock_set_flag(sk, HCI_MGMT_EXP_FEATURE_EVENTS);
+
+ err = mgmt_cmd_complete(sk, MGMT_INDEX_NONE,
+ MGMT_OP_SET_EXP_FEATURE, 0,
+ &rp, sizeof(rp));
+
+ if (changed)
+ exp_feature_changed(hdev, iso_socket_uuid, val, sk);
+
+ return err;
+}
+#endif
+
+static const struct mgmt_exp_feature {
+ const u8 *uuid;
+ int (*set_func)(struct sock *sk, struct hci_dev *hdev,
+ struct mgmt_cp_set_exp_feature *cp, u16 data_len);
+} exp_features[] = {
+ EXP_FEAT(ZERO_KEY, set_zero_key_func),
+#ifdef CONFIG_BT_FEATURE_DEBUG
+ EXP_FEAT(debug_uuid, set_debug_func),
+#endif
+ EXP_FEAT(mgmt_mesh_uuid, set_mgmt_mesh_func),
+ EXP_FEAT(quality_report_uuid, set_quality_report_func),
+ EXP_FEAT(offload_codecs_uuid, set_offload_codec_func),
+ EXP_FEAT(le_simultaneous_roles_uuid, set_le_simultaneous_roles_func),
+#ifdef CONFIG_BT_LE
+ EXP_FEAT(iso_socket_uuid, set_iso_socket_func),
+#endif
+
+ /* end with a null feature */
+ EXP_FEAT(NULL, NULL)
+};
+
+static int set_exp_feature(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 data_len)
+{
+ struct mgmt_cp_set_exp_feature *cp = data;
+ size_t i = 0;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ for (i = 0; exp_features[i].uuid; i++) {
+ if (!memcmp(cp->uuid, exp_features[i].uuid, 16))
+ return exp_features[i].set_func(sk, hdev, cp, data_len);
+ }
+
+ return mgmt_cmd_status(sk, hdev ? hdev->id : MGMT_INDEX_NONE,
+ MGMT_OP_SET_EXP_FEATURE,
+ MGMT_STATUS_NOT_SUPPORTED);
+}
+
+static int get_device_flags(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 data_len)
+{
+ struct mgmt_cp_get_device_flags *cp = data;
+ struct mgmt_rp_get_device_flags rp;
+ struct bdaddr_list_with_flags *br_params;
+ struct hci_conn_params *params;
+ u32 supported_flags;
+ u32 current_flags = 0;
+ u8 status = MGMT_STATUS_INVALID_PARAMS;
+
+ bt_dev_dbg(hdev, "Get device flags %pMR (type 0x%x)\n",
+ &cp->addr.bdaddr, cp->addr.type);
+
+ hci_dev_lock(hdev);
+
+ supported_flags = hdev->conn_flags;
+
+ memset(&rp, 0, sizeof(rp));
+
+ if (cp->addr.type == BDADDR_BREDR) {
+ br_params = hci_bdaddr_list_lookup_with_flags(&hdev->accept_list,
+ &cp->addr.bdaddr,
+ cp->addr.type);
+ if (!br_params)
+ goto done;
+
+ current_flags = br_params->flags;
+ } else {
+ params = hci_conn_params_lookup(hdev, &cp->addr.bdaddr,
+ le_addr_type(cp->addr.type));
+ if (!params)
+ goto done;
+
+ current_flags = params->flags;
+ }
+
+ bacpy(&rp.addr.bdaddr, &cp->addr.bdaddr);
+ rp.addr.type = cp->addr.type;
+ rp.supported_flags = cpu_to_le32(supported_flags);
+ rp.current_flags = cpu_to_le32(current_flags);
+
+ status = MGMT_STATUS_SUCCESS;
+
+done:
+ hci_dev_unlock(hdev);
+
+ return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_GET_DEVICE_FLAGS, status,
+ &rp, sizeof(rp));
+}
+
+static void device_flags_changed(struct sock *sk, struct hci_dev *hdev,
+ bdaddr_t *bdaddr, u8 bdaddr_type,
+ u32 supported_flags, u32 current_flags)
+{
+ struct mgmt_ev_device_flags_changed ev;
+
+ bacpy(&ev.addr.bdaddr, bdaddr);
+ ev.addr.type = bdaddr_type;
+ ev.supported_flags = cpu_to_le32(supported_flags);
+ ev.current_flags = cpu_to_le32(current_flags);
+
+ mgmt_event(MGMT_EV_DEVICE_FLAGS_CHANGED, hdev, &ev, sizeof(ev), sk);
+}
+
+static bool is_connected(struct hci_dev *hdev, bdaddr_t *addr, u8 type)
+{
+ struct hci_conn *conn;
+
+ conn = hci_conn_hash_lookup_ba(hdev, LE_LINK, addr);
+ if (!conn)
+ return false;
+
+ if (conn->dst_type != type)
+ return false;
+
+ if (conn->state != BT_CONNECTED)
+ return false;
+
+ return true;
+}
+
+/* This function requires the caller holds hdev->lock */
+static struct hci_conn_params *hci_conn_params_set(struct hci_dev *hdev,
+ bdaddr_t *addr, u8 addr_type,
+ u8 auto_connect)
+{
+ struct hci_conn_params *params;
+
+ params = hci_conn_params_add(hdev, addr, addr_type);
+ if (!params)
+ return NULL;
+
+ if (params->auto_connect == auto_connect)
+ return params;
+
+ hci_pend_le_list_del_init(params);
+
+ switch (auto_connect) {
+ case HCI_AUTO_CONN_DISABLED:
+ case HCI_AUTO_CONN_LINK_LOSS:
+ /* If auto connect is being disabled when we're trying to
+ * connect to device, keep connecting.
+ */
+ if (params->explicit_connect)
+ hci_pend_le_list_add(params, &hdev->pend_le_conns);
+ break;
+ case HCI_AUTO_CONN_REPORT:
+ if (params->explicit_connect)
+ hci_pend_le_list_add(params, &hdev->pend_le_conns);
+ else
+ hci_pend_le_list_add(params, &hdev->pend_le_reports);
+ break;
+ case HCI_AUTO_CONN_DIRECT:
+ case HCI_AUTO_CONN_ALWAYS:
+ if (!is_connected(hdev, addr, addr_type))
+ hci_pend_le_list_add(params, &hdev->pend_le_conns);
+ break;
+ }
+
+ params->auto_connect = auto_connect;
+
+ bt_dev_dbg(hdev, "addr %pMR (type %u) auto_connect %u",
+ addr, addr_type, auto_connect);
+
+ return params;
+}
+
+static int set_device_flags(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 len)
+{
+ struct mgmt_cp_set_device_flags *cp = data;
+ struct bdaddr_list_with_flags *br_params;
+ struct hci_conn_params *params;
+ u8 status = MGMT_STATUS_INVALID_PARAMS;
+ u32 supported_flags;
+ u32 current_flags = __le32_to_cpu(cp->current_flags);
+
+ bt_dev_dbg(hdev, "Set device flags %pMR (type 0x%x) = 0x%x",
+ &cp->addr.bdaddr, cp->addr.type, current_flags);
+
+ // We should take hci_dev_lock() early, I think.. conn_flags can change
+ supported_flags = hdev->conn_flags;
+
+ if ((supported_flags | current_flags) != supported_flags) {
+ bt_dev_warn(hdev, "Bad flag given (0x%x) vs supported (0x%0x)",
+ current_flags, supported_flags);
+ goto done;
+ }
+
+ hci_dev_lock(hdev);
+
+ if (cp->addr.type == BDADDR_BREDR) {
+ br_params = hci_bdaddr_list_lookup_with_flags(&hdev->accept_list,
+ &cp->addr.bdaddr,
+ cp->addr.type);
+
+ if (br_params) {
+ br_params->flags = current_flags;
+ status = MGMT_STATUS_SUCCESS;
+ } else {
+ bt_dev_warn(hdev, "No such BR/EDR device %pMR (0x%x)",
+ &cp->addr.bdaddr, cp->addr.type);
+ }
+
+ goto unlock;
+ }
+
+ params = hci_conn_params_lookup(hdev, &cp->addr.bdaddr,
+ le_addr_type(cp->addr.type));
+ if (!params) {
+ /* Create a new hci_conn_params if it doesn't exist */
+ params = hci_conn_params_set(hdev, &cp->addr.bdaddr,
+ le_addr_type(cp->addr.type),
+ HCI_AUTO_CONN_DISABLED);
+ if (!params) {
+ bt_dev_warn(hdev, "No such LE device %pMR (0x%x)",
+ &cp->addr.bdaddr,
+ le_addr_type(cp->addr.type));
+ goto unlock;
+ }
+ }
+
+ supported_flags = hdev->conn_flags;
+
+ if ((supported_flags | current_flags) != supported_flags) {
+ bt_dev_warn(hdev, "Bad flag given (0x%x) vs supported (0x%0x)",
+ current_flags, supported_flags);
+ goto unlock;
+ }
+
+ WRITE_ONCE(params->flags, current_flags);
+ status = MGMT_STATUS_SUCCESS;
+
+ /* Update passive scan if HCI_CONN_FLAG_DEVICE_PRIVACY
+ * has been set.
+ */
+ if (params->flags & HCI_CONN_FLAG_DEVICE_PRIVACY)
+ hci_update_passive_scan(hdev);
+
+unlock:
+ hci_dev_unlock(hdev);
+
+done:
+ if (status == MGMT_STATUS_SUCCESS)
+ device_flags_changed(sk, hdev, &cp->addr.bdaddr, cp->addr.type,
+ supported_flags, current_flags);
+
+ return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_DEVICE_FLAGS, status,
+ &cp->addr, sizeof(cp->addr));
+}
+
+static void mgmt_adv_monitor_added(struct sock *sk, struct hci_dev *hdev,
+ u16 handle)
+{
+ struct mgmt_ev_adv_monitor_added ev;
+
+ ev.monitor_handle = cpu_to_le16(handle);
+
+ mgmt_event(MGMT_EV_ADV_MONITOR_ADDED, hdev, &ev, sizeof(ev), sk);
+}
+
+static void mgmt_adv_monitor_removed(struct sock *sk, struct hci_dev *hdev,
+ __le16 handle)
+{
+ struct mgmt_ev_adv_monitor_removed ev;
+
+ ev.monitor_handle = handle;
+
+ mgmt_event(MGMT_EV_ADV_MONITOR_REMOVED, hdev, &ev, sizeof(ev), sk);
+}
+
+static int read_adv_mon_features(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 len)
+{
+ struct adv_monitor *monitor = NULL;
+ struct mgmt_rp_read_adv_monitor_features *rp = NULL;
+ int handle, err;
+ size_t rp_size = 0;
+ __u32 supported = 0;
+ __u32 enabled = 0;
+ __u16 num_handles = 0;
+ __u16 handles[HCI_MAX_ADV_MONITOR_NUM_HANDLES];
+
+ BT_DBG("request for %s", hdev->name);
+
+ hci_dev_lock(hdev);
+
+ if (msft_monitor_supported(hdev))
+ supported |= MGMT_ADV_MONITOR_FEATURE_MASK_OR_PATTERNS;
+
+ idr_for_each_entry(&hdev->adv_monitors_idr, monitor, handle)
+ handles[num_handles++] = monitor->handle;
+
+ hci_dev_unlock(hdev);
+
+ rp_size = sizeof(*rp) + (num_handles * sizeof(u16));
+ rp = kmalloc(rp_size, GFP_KERNEL);
+ if (!rp)
+ return -ENOMEM;
+
+ /* All supported features are currently enabled */
+ enabled = supported;
+
+ rp->supported_features = cpu_to_le32(supported);
+ rp->enabled_features = cpu_to_le32(enabled);
+ rp->max_num_handles = cpu_to_le16(HCI_MAX_ADV_MONITOR_NUM_HANDLES);
+ rp->max_num_patterns = HCI_MAX_ADV_MONITOR_NUM_PATTERNS;
+ rp->num_handles = cpu_to_le16(num_handles);
+ if (num_handles)
+ memcpy(&rp->handles, &handles, (num_handles * sizeof(u16)));
+
+ err = mgmt_cmd_complete(sk, hdev->id,
+ MGMT_OP_READ_ADV_MONITOR_FEATURES,
+ MGMT_STATUS_SUCCESS, rp, rp_size);
+
+ kfree(rp);
+
+ return err;
+}
+
+static void mgmt_add_adv_patterns_monitor_complete(struct hci_dev *hdev,
+ void *data, int status)
+{
+ struct mgmt_rp_add_adv_patterns_monitor rp;
+ struct mgmt_pending_cmd *cmd = data;
+ struct adv_monitor *monitor;
+
+ /* This is likely the result of hdev being closed and mgmt_index_removed
+ * is attempting to clean up any pending command so
+ * hci_adv_monitors_clear is about to be called which will take care of
+ * freeing the adv_monitor instances.
+ */
+ if (status == -ECANCELED && !mgmt_pending_valid(hdev, cmd))
+ return;
+
+ monitor = cmd->user_data;
+
+ hci_dev_lock(hdev);
+
+ rp.monitor_handle = cpu_to_le16(monitor->handle);
+
+ if (!status) {
+ mgmt_adv_monitor_added(cmd->sk, hdev, monitor->handle);
+ hdev->adv_monitors_cnt++;
+ if (monitor->state == ADV_MONITOR_STATE_NOT_REGISTERED)
+ monitor->state = ADV_MONITOR_STATE_REGISTERED;
+ hci_update_passive_scan(hdev);
+ }
+
+ mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode,
+ mgmt_status(status), &rp, sizeof(rp));
+ mgmt_pending_remove(cmd);
+
+ hci_dev_unlock(hdev);
+ bt_dev_dbg(hdev, "add monitor %d complete, status %d",
+ rp.monitor_handle, status);
+}
+
+static int mgmt_add_adv_patterns_monitor_sync(struct hci_dev *hdev, void *data)
+{
+ struct mgmt_pending_cmd *cmd = data;
+ struct adv_monitor *mon;
+
+ mutex_lock(&hdev->mgmt_pending_lock);
+
+ if (!__mgmt_pending_listed(hdev, cmd)) {
+ mutex_unlock(&hdev->mgmt_pending_lock);
+ return -ECANCELED;
+ }
+
+ mon = cmd->user_data;
+
+ mutex_unlock(&hdev->mgmt_pending_lock);
+
+ return hci_add_adv_monitor(hdev, mon);
+}
+
+static int __add_adv_patterns_monitor(struct sock *sk, struct hci_dev *hdev,
+ struct adv_monitor *m, u8 status,
+ void *data, u16 len, u16 op)
+{
+ struct mgmt_pending_cmd *cmd;
+ int err;
+
+ hci_dev_lock(hdev);
+
+ if (status)
+ goto unlock;
+
+ if (pending_find(MGMT_OP_SET_LE, hdev) ||
+ pending_find(MGMT_OP_ADD_ADV_PATTERNS_MONITOR, hdev) ||
+ pending_find(MGMT_OP_ADD_ADV_PATTERNS_MONITOR_RSSI, hdev)) {
+ status = MGMT_STATUS_BUSY;
+ goto unlock;
+ }
+
+ cmd = mgmt_pending_add(sk, op, hdev, data, len);
+ if (!cmd) {
+ status = MGMT_STATUS_NO_RESOURCES;
+ goto unlock;
+ }
+
+ cmd->user_data = m;
+ err = hci_cmd_sync_queue(hdev, mgmt_add_adv_patterns_monitor_sync, cmd,
+ mgmt_add_adv_patterns_monitor_complete);
+ if (err) {
+ if (err == -ENOMEM)
+ status = MGMT_STATUS_NO_RESOURCES;
+ else
+ status = MGMT_STATUS_FAILED;
+
+ goto unlock;
+ }
+
+ hci_dev_unlock(hdev);
+
+ return 0;
+
+unlock:
+ hci_free_adv_monitor(hdev, m);
+ hci_dev_unlock(hdev);
+ return mgmt_cmd_status(sk, hdev->id, op, status);
+}
+
+static void parse_adv_monitor_rssi(struct adv_monitor *m,
+ struct mgmt_adv_rssi_thresholds *rssi)
+{
+ if (rssi) {
+ m->rssi.low_threshold = rssi->low_threshold;
+ m->rssi.low_threshold_timeout =
+ __le16_to_cpu(rssi->low_threshold_timeout);
+ m->rssi.high_threshold = rssi->high_threshold;
+ m->rssi.high_threshold_timeout =
+ __le16_to_cpu(rssi->high_threshold_timeout);
+ m->rssi.sampling_period = rssi->sampling_period;
+ } else {
+ /* Default values. These numbers are the least constricting
+ * parameters for MSFT API to work, so it behaves as if there
+ * are no rssi parameter to consider. May need to be changed
+ * if other API are to be supported.
+ */
+ m->rssi.low_threshold = -127;
+ m->rssi.low_threshold_timeout = 60;
+ m->rssi.high_threshold = -127;
+ m->rssi.high_threshold_timeout = 0;
+ m->rssi.sampling_period = 0;
+ }
+}
+
+static u8 parse_adv_monitor_pattern(struct adv_monitor *m, u8 pattern_count,
+ struct mgmt_adv_pattern *patterns)
+{
+ u8 offset = 0, length = 0;
+ struct adv_pattern *p = NULL;
+ int i;
+
+ for (i = 0; i < pattern_count; i++) {
+ offset = patterns[i].offset;
+ length = patterns[i].length;
+ if (offset >= HCI_MAX_AD_LENGTH ||
+ length > HCI_MAX_AD_LENGTH ||
+ (offset + length) > HCI_MAX_AD_LENGTH)
+ return MGMT_STATUS_INVALID_PARAMS;
+
+ p = kmalloc(sizeof(*p), GFP_KERNEL);
+ if (!p)
+ return MGMT_STATUS_NO_RESOURCES;
+
+ p->ad_type = patterns[i].ad_type;
+ p->offset = patterns[i].offset;
+ p->length = patterns[i].length;
+ memcpy(p->value, patterns[i].value, p->length);
+
+ INIT_LIST_HEAD(&p->list);
+ list_add(&p->list, &m->patterns);
+ }
+
+ return MGMT_STATUS_SUCCESS;
+}
+
+static int add_adv_patterns_monitor(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 len)
+{
+ struct mgmt_cp_add_adv_patterns_monitor *cp = data;
+ struct adv_monitor *m = NULL;
+ u8 status = MGMT_STATUS_SUCCESS;
+ size_t expected_size = sizeof(*cp);
+
+ BT_DBG("request for %s", hdev->name);
+
+ if (len <= sizeof(*cp)) {
+ status = MGMT_STATUS_INVALID_PARAMS;
+ goto done;
+ }
+
+ expected_size += cp->pattern_count * sizeof(struct mgmt_adv_pattern);
+ if (len != expected_size) {
+ status = MGMT_STATUS_INVALID_PARAMS;
+ goto done;
+ }
+
+ m = kzalloc(sizeof(*m), GFP_KERNEL);
+ if (!m) {
+ status = MGMT_STATUS_NO_RESOURCES;
+ goto done;
+ }
+
+ INIT_LIST_HEAD(&m->patterns);
+
+ parse_adv_monitor_rssi(m, NULL);
+ status = parse_adv_monitor_pattern(m, cp->pattern_count, cp->patterns);
+
+done:
+ return __add_adv_patterns_monitor(sk, hdev, m, status, data, len,
+ MGMT_OP_ADD_ADV_PATTERNS_MONITOR);
+}
+
+static int add_adv_patterns_monitor_rssi(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 len)
+{
+ struct mgmt_cp_add_adv_patterns_monitor_rssi *cp = data;
+ struct adv_monitor *m = NULL;
+ u8 status = MGMT_STATUS_SUCCESS;
+ size_t expected_size = sizeof(*cp);
+
+ BT_DBG("request for %s", hdev->name);
+
+ if (len <= sizeof(*cp)) {
+ status = MGMT_STATUS_INVALID_PARAMS;
+ goto done;
+ }
+
+ expected_size += cp->pattern_count * sizeof(struct mgmt_adv_pattern);
+ if (len != expected_size) {
+ status = MGMT_STATUS_INVALID_PARAMS;
+ goto done;
+ }
+
+ m = kzalloc(sizeof(*m), GFP_KERNEL);
+ if (!m) {
+ status = MGMT_STATUS_NO_RESOURCES;
+ goto done;
+ }
+
+ INIT_LIST_HEAD(&m->patterns);
+
+ parse_adv_monitor_rssi(m, &cp->rssi);
+ status = parse_adv_monitor_pattern(m, cp->pattern_count, cp->patterns);
+
+done:
+ return __add_adv_patterns_monitor(sk, hdev, m, status, data, len,
+ MGMT_OP_ADD_ADV_PATTERNS_MONITOR_RSSI);
+}
+
+static void mgmt_remove_adv_monitor_complete(struct hci_dev *hdev,
+ void *data, int status)
+{
+ struct mgmt_rp_remove_adv_monitor rp;
+ struct mgmt_pending_cmd *cmd = data;
+ struct mgmt_cp_remove_adv_monitor *cp;
+
+ if (status == -ECANCELED)
+ return;
+
+ hci_dev_lock(hdev);
+
+ cp = cmd->param;
+
+ rp.monitor_handle = cp->monitor_handle;
+
+ if (!status) {
+ mgmt_adv_monitor_removed(cmd->sk, hdev, cp->monitor_handle);
+ hci_update_passive_scan(hdev);
+ }
+
+ mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode,
+ mgmt_status(status), &rp, sizeof(rp));
+ mgmt_pending_free(cmd);
+
+ hci_dev_unlock(hdev);
+ bt_dev_dbg(hdev, "remove monitor %d complete, status %d",
+ rp.monitor_handle, status);
+}
+
+static int mgmt_remove_adv_monitor_sync(struct hci_dev *hdev, void *data)
+{
+ struct mgmt_pending_cmd *cmd = data;
+ struct mgmt_cp_remove_adv_monitor *cp = cmd->param;
+ u16 handle = __le16_to_cpu(cp->monitor_handle);
+
+ if (!handle)
+ return hci_remove_all_adv_monitor(hdev);
+
+ return hci_remove_single_adv_monitor(hdev, handle);
+}
+
+static int remove_adv_monitor(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 len)
+{
+ struct mgmt_pending_cmd *cmd;
+ int err, status;
+
+ hci_dev_lock(hdev);
+
+ if (pending_find(MGMT_OP_SET_LE, hdev) ||
+ pending_find(MGMT_OP_ADD_ADV_PATTERNS_MONITOR, hdev) ||
+ pending_find(MGMT_OP_ADD_ADV_PATTERNS_MONITOR_RSSI, hdev)) {
+ status = MGMT_STATUS_BUSY;
+ goto unlock;
+ }
+
+ cmd = mgmt_pending_new(sk, MGMT_OP_REMOVE_ADV_MONITOR, hdev, data, len);
+ if (!cmd) {
+ status = MGMT_STATUS_NO_RESOURCES;
+ goto unlock;
+ }
+
+ err = hci_cmd_sync_submit(hdev, mgmt_remove_adv_monitor_sync, cmd,
+ mgmt_remove_adv_monitor_complete);
+
+ if (err) {
+ mgmt_pending_free(cmd);
+
+ if (err == -ENOMEM)
+ status = MGMT_STATUS_NO_RESOURCES;
+ else
+ status = MGMT_STATUS_FAILED;
+
+ goto unlock;
+ }
+
+ hci_dev_unlock(hdev);
+
+ return 0;
+
+unlock:
+ hci_dev_unlock(hdev);
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_REMOVE_ADV_MONITOR,
+ status);
+}
+
+static void read_local_oob_data_complete(struct hci_dev *hdev, void *data,
+ int err)
+{
+ struct mgmt_rp_read_local_oob_data mgmt_rp;
+ size_t rp_size = sizeof(mgmt_rp);
+ struct mgmt_pending_cmd *cmd = data;
+ struct sk_buff *skb = cmd->skb;
+ u8 status = mgmt_status(err);
+
+ if (!status) {
+ if (!skb)
+ status = MGMT_STATUS_FAILED;
+ else if (IS_ERR(skb))
+ status = mgmt_status(PTR_ERR(skb));
+ else
+ status = mgmt_status(skb->data[0]);
+ }
+
+ bt_dev_dbg(hdev, "status %d", status);
+
+ if (status) {
+ mgmt_cmd_status(cmd->sk, hdev->id, MGMT_OP_READ_LOCAL_OOB_DATA,
+ status);
+ goto remove;
+ }
+
+ memset(&mgmt_rp, 0, sizeof(mgmt_rp));
+
+ if (!bredr_sc_enabled(hdev)) {
+ struct hci_rp_read_local_oob_data *rp = (void *) skb->data;
+
+ if (skb->len < sizeof(*rp)) {
+ mgmt_cmd_status(cmd->sk, hdev->id,
+ MGMT_OP_READ_LOCAL_OOB_DATA,
+ MGMT_STATUS_FAILED);
+ goto remove;
+ }
+
+ memcpy(mgmt_rp.hash192, rp->hash, sizeof(rp->hash));
+ memcpy(mgmt_rp.rand192, rp->rand, sizeof(rp->rand));
+
+ rp_size -= sizeof(mgmt_rp.hash256) + sizeof(mgmt_rp.rand256);
+ } else {
+ struct hci_rp_read_local_oob_ext_data *rp = (void *) skb->data;
+
+ if (skb->len < sizeof(*rp)) {
+ mgmt_cmd_status(cmd->sk, hdev->id,
+ MGMT_OP_READ_LOCAL_OOB_DATA,
+ MGMT_STATUS_FAILED);
+ goto remove;
+ }
+
+ memcpy(mgmt_rp.hash192, rp->hash192, sizeof(rp->hash192));
+ memcpy(mgmt_rp.rand192, rp->rand192, sizeof(rp->rand192));
+
+ memcpy(mgmt_rp.hash256, rp->hash256, sizeof(rp->hash256));
+ memcpy(mgmt_rp.rand256, rp->rand256, sizeof(rp->rand256));
+ }
+
+ mgmt_cmd_complete(cmd->sk, hdev->id, MGMT_OP_READ_LOCAL_OOB_DATA,
+ MGMT_STATUS_SUCCESS, &mgmt_rp, rp_size);
+
+remove:
+ if (skb && !IS_ERR(skb))
+ kfree_skb(skb);
+
+ mgmt_pending_free(cmd);
+}
+
+static int read_local_oob_data_sync(struct hci_dev *hdev, void *data)
+{
+ struct mgmt_pending_cmd *cmd = data;
+
+ if (bredr_sc_enabled(hdev))
+ cmd->skb = hci_read_local_oob_data_sync(hdev, true, cmd->sk);
+ else
+ cmd->skb = hci_read_local_oob_data_sync(hdev, false, cmd->sk);
+
+ if (IS_ERR(cmd->skb))
+ return PTR_ERR(cmd->skb);
+ else
+ return 0;
+}
+
+static int read_local_oob_data(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 data_len)
+{
+ struct mgmt_pending_cmd *cmd;
+ int err;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ hci_dev_lock(hdev);
+
+ if (!hdev_is_powered(hdev)) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_READ_LOCAL_OOB_DATA,
+ MGMT_STATUS_NOT_POWERED);
+ goto unlock;
+ }
+
+ if (!lmp_ssp_capable(hdev)) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_READ_LOCAL_OOB_DATA,
+ MGMT_STATUS_NOT_SUPPORTED);
+ goto unlock;
+ }
+
+ cmd = mgmt_pending_new(sk, MGMT_OP_READ_LOCAL_OOB_DATA, hdev, NULL, 0);
+ if (!cmd)
+ err = -ENOMEM;
+ else
+ err = hci_cmd_sync_queue(hdev, read_local_oob_data_sync, cmd,
+ read_local_oob_data_complete);
+
+ if (err < 0) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_READ_LOCAL_OOB_DATA,
+ MGMT_STATUS_FAILED);
+
+ if (cmd)
+ mgmt_pending_free(cmd);
+ }
+
+unlock:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static int add_remote_oob_data(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 len)
+{
+ struct mgmt_addr_info *addr = data;
+ int err;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ if (!bdaddr_type_is_valid(addr->type))
+ return mgmt_cmd_complete(sk, hdev->id,
+ MGMT_OP_ADD_REMOTE_OOB_DATA,
+ MGMT_STATUS_INVALID_PARAMS,
+ addr, sizeof(*addr));
+
+ hci_dev_lock(hdev);
+
+ if (len == MGMT_ADD_REMOTE_OOB_DATA_SIZE) {
+ struct mgmt_cp_add_remote_oob_data *cp = data;
+ u8 status;
+
+ if (cp->addr.type != BDADDR_BREDR) {
+ err = mgmt_cmd_complete(sk, hdev->id,
+ MGMT_OP_ADD_REMOTE_OOB_DATA,
+ MGMT_STATUS_INVALID_PARAMS,
+ &cp->addr, sizeof(cp->addr));
+ goto unlock;
+ }
+
+ err = hci_add_remote_oob_data(hdev, &cp->addr.bdaddr,
+ cp->addr.type, cp->hash,
+ cp->rand, NULL, NULL);
+ if (err < 0)
+ status = MGMT_STATUS_FAILED;
+ else
+ status = MGMT_STATUS_SUCCESS;
+
+ err = mgmt_cmd_complete(sk, hdev->id,
+ MGMT_OP_ADD_REMOTE_OOB_DATA, status,
+ &cp->addr, sizeof(cp->addr));
+ } else if (len == MGMT_ADD_REMOTE_OOB_EXT_DATA_SIZE) {
+ struct mgmt_cp_add_remote_oob_ext_data *cp = data;
+ u8 *rand192, *hash192, *rand256, *hash256;
+ u8 status;
+
+ if (bdaddr_type_is_le(cp->addr.type)) {
+ /* Enforce zero-valued 192-bit parameters as
+ * long as legacy SMP OOB isn't implemented.
+ */
+ if (memcmp(cp->rand192, ZERO_KEY, 16) ||
+ memcmp(cp->hash192, ZERO_KEY, 16)) {
+ err = mgmt_cmd_complete(sk, hdev->id,
+ MGMT_OP_ADD_REMOTE_OOB_DATA,
+ MGMT_STATUS_INVALID_PARAMS,
+ addr, sizeof(*addr));
+ goto unlock;
+ }
+
+ rand192 = NULL;
+ hash192 = NULL;
+ } else {
+ /* In case one of the P-192 values is set to zero,
+ * then just disable OOB data for P-192.
+ */
+ if (!memcmp(cp->rand192, ZERO_KEY, 16) ||
+ !memcmp(cp->hash192, ZERO_KEY, 16)) {
+ rand192 = NULL;
+ hash192 = NULL;
+ } else {
+ rand192 = cp->rand192;
+ hash192 = cp->hash192;
+ }
+ }
+
+ /* In case one of the P-256 values is set to zero, then just
+ * disable OOB data for P-256.
+ */
+ if (!memcmp(cp->rand256, ZERO_KEY, 16) ||
+ !memcmp(cp->hash256, ZERO_KEY, 16)) {
+ rand256 = NULL;
+ hash256 = NULL;
+ } else {
+ rand256 = cp->rand256;
+ hash256 = cp->hash256;
+ }
+
+ err = hci_add_remote_oob_data(hdev, &cp->addr.bdaddr,
+ cp->addr.type, hash192, rand192,
+ hash256, rand256);
+ if (err < 0)
+ status = MGMT_STATUS_FAILED;
+ else
+ status = MGMT_STATUS_SUCCESS;
+
+ err = mgmt_cmd_complete(sk, hdev->id,
+ MGMT_OP_ADD_REMOTE_OOB_DATA,
+ status, &cp->addr, sizeof(cp->addr));
+ } else {
+ bt_dev_err(hdev, "add_remote_oob_data: invalid len of %u bytes",
+ len);
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_REMOTE_OOB_DATA,
+ MGMT_STATUS_INVALID_PARAMS);
+ }
+
+unlock:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static int remove_remote_oob_data(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 len)
+{
+ struct mgmt_cp_remove_remote_oob_data *cp = data;
+ u8 status;
+ int err;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ if (cp->addr.type != BDADDR_BREDR)
+ return mgmt_cmd_complete(sk, hdev->id,
+ MGMT_OP_REMOVE_REMOTE_OOB_DATA,
+ MGMT_STATUS_INVALID_PARAMS,
+ &cp->addr, sizeof(cp->addr));
+
+ hci_dev_lock(hdev);
+
+ if (!bacmp(&cp->addr.bdaddr, BDADDR_ANY)) {
+ hci_remote_oob_data_clear(hdev);
+ status = MGMT_STATUS_SUCCESS;
+ goto done;
+ }
+
+ err = hci_remove_remote_oob_data(hdev, &cp->addr.bdaddr, cp->addr.type);
+ if (err < 0)
+ status = MGMT_STATUS_INVALID_PARAMS;
+ else
+ status = MGMT_STATUS_SUCCESS;
+
+done:
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_REMOVE_REMOTE_OOB_DATA,
+ status, &cp->addr, sizeof(cp->addr));
+
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static bool discovery_type_is_valid(struct hci_dev *hdev, uint8_t type,
+ uint8_t *mgmt_status)
+{
+ switch (type) {
+ case DISCOV_TYPE_LE:
+ *mgmt_status = mgmt_le_support(hdev);
+ if (*mgmt_status)
+ return false;
+ break;
+ case DISCOV_TYPE_INTERLEAVED:
+ *mgmt_status = mgmt_le_support(hdev);
+ if (*mgmt_status)
+ return false;
+ fallthrough;
+ case DISCOV_TYPE_BREDR:
+ *mgmt_status = mgmt_bredr_support(hdev);
+ if (*mgmt_status)
+ return false;
+ break;
+ default:
+ *mgmt_status = MGMT_STATUS_INVALID_PARAMS;
+ return false;
+ }
+
+ return true;
+}
+
+static void start_discovery_complete(struct hci_dev *hdev, void *data, int err)
+{
+ struct mgmt_pending_cmd *cmd = data;
+
+ bt_dev_dbg(hdev, "err %d", err);
+
+ if (err == -ECANCELED || !mgmt_pending_valid(hdev, cmd))
+ return;
+
+ mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode, mgmt_status(err),
+ cmd->param, 1);
+ mgmt_pending_free(cmd);
+
+ hci_discovery_set_state(hdev, err ? DISCOVERY_STOPPED:
+ DISCOVERY_FINDING);
+}
+
+static int start_discovery_sync(struct hci_dev *hdev, void *data)
+{
+ if (!mgmt_pending_listed(hdev, data))
+ return -ECANCELED;
+
+ return hci_start_discovery_sync(hdev);
+}
+
+static int start_discovery_internal(struct sock *sk, struct hci_dev *hdev,
+ u16 op, void *data, u16 len)
+{
+ struct mgmt_cp_start_discovery *cp = data;
+ struct mgmt_pending_cmd *cmd;
+ u8 status;
+ int err;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ hci_dev_lock(hdev);
+
+ if (!hdev_is_powered(hdev)) {
+ err = mgmt_cmd_complete(sk, hdev->id, op,
+ MGMT_STATUS_NOT_POWERED,
+ &cp->type, sizeof(cp->type));
+ goto failed;
+ }
+
+ if (hdev->discovery.state != DISCOVERY_STOPPED ||
+ hci_dev_test_flag(hdev, HCI_PERIODIC_INQ)) {
+ err = mgmt_cmd_complete(sk, hdev->id, op, MGMT_STATUS_BUSY,
+ &cp->type, sizeof(cp->type));
+ goto failed;
+ }
+
+ if (!discovery_type_is_valid(hdev, cp->type, &status)) {
+ err = mgmt_cmd_complete(sk, hdev->id, op, status,
+ &cp->type, sizeof(cp->type));
+ goto failed;
+ }
+
+ /* Can't start discovery when it is paused */
+ if (hdev->discovery_paused) {
+ err = mgmt_cmd_complete(sk, hdev->id, op, MGMT_STATUS_BUSY,
+ &cp->type, sizeof(cp->type));
+ goto failed;
+ }
+
+ /* Clear the discovery filter first to free any previously
+ * allocated memory for the UUID list.
+ */
+ hci_discovery_filter_clear(hdev);
+
+ hdev->discovery.type = cp->type;
+ hdev->discovery.report_invalid_rssi = false;
+ if (op == MGMT_OP_START_LIMITED_DISCOVERY)
+ hdev->discovery.limited = true;
+ else
+ hdev->discovery.limited = false;
+
+ cmd = mgmt_pending_add(sk, op, hdev, data, len);
+ if (!cmd) {
+ err = -ENOMEM;
+ goto failed;
+ }
+
+ err = hci_cmd_sync_queue(hdev, start_discovery_sync, cmd,
+ start_discovery_complete);
+ if (err < 0) {
+ mgmt_pending_remove(cmd);
+ goto failed;
+ }
+
+ hci_discovery_set_state(hdev, DISCOVERY_STARTING);
+
+failed:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static int start_discovery(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 len)
+{
+ return start_discovery_internal(sk, hdev, MGMT_OP_START_DISCOVERY,
+ data, len);
+}
+
+static int start_limited_discovery(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 len)
+{
+ return start_discovery_internal(sk, hdev,
+ MGMT_OP_START_LIMITED_DISCOVERY,
+ data, len);
+}
+
+static int start_service_discovery(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 len)
+{
+ struct mgmt_cp_start_service_discovery *cp = data;
+ struct mgmt_pending_cmd *cmd;
+ const u16 max_uuid_count = ((U16_MAX - sizeof(*cp)) / 16);
+ u16 uuid_count, expected_len;
+ u8 status;
+ int err;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ hci_dev_lock(hdev);
+
+ if (!hdev_is_powered(hdev)) {
+ err = mgmt_cmd_complete(sk, hdev->id,
+ MGMT_OP_START_SERVICE_DISCOVERY,
+ MGMT_STATUS_NOT_POWERED,
+ &cp->type, sizeof(cp->type));
+ goto failed;
+ }
+
+ if (hdev->discovery.state != DISCOVERY_STOPPED ||
+ hci_dev_test_flag(hdev, HCI_PERIODIC_INQ)) {
+ err = mgmt_cmd_complete(sk, hdev->id,
+ MGMT_OP_START_SERVICE_DISCOVERY,
+ MGMT_STATUS_BUSY, &cp->type,
+ sizeof(cp->type));
+ goto failed;
+ }
+
+ if (hdev->discovery_paused) {
+ err = mgmt_cmd_complete(sk, hdev->id,
+ MGMT_OP_START_SERVICE_DISCOVERY,
+ MGMT_STATUS_BUSY, &cp->type,
+ sizeof(cp->type));
+ goto failed;
+ }
+
+ uuid_count = __le16_to_cpu(cp->uuid_count);
+ if (uuid_count > max_uuid_count) {
+ bt_dev_err(hdev, "service_discovery: too big uuid_count value %u",
+ uuid_count);
+ err = mgmt_cmd_complete(sk, hdev->id,
+ MGMT_OP_START_SERVICE_DISCOVERY,
+ MGMT_STATUS_INVALID_PARAMS, &cp->type,
+ sizeof(cp->type));
+ goto failed;
+ }
+
+ expected_len = sizeof(*cp) + uuid_count * 16;
+ if (expected_len != len) {
+ bt_dev_err(hdev, "service_discovery: expected %u bytes, got %u bytes",
+ expected_len, len);
+ err = mgmt_cmd_complete(sk, hdev->id,
+ MGMT_OP_START_SERVICE_DISCOVERY,
+ MGMT_STATUS_INVALID_PARAMS, &cp->type,
+ sizeof(cp->type));
+ goto failed;
+ }
+
+ if (!discovery_type_is_valid(hdev, cp->type, &status)) {
+ err = mgmt_cmd_complete(sk, hdev->id,
+ MGMT_OP_START_SERVICE_DISCOVERY,
+ status, &cp->type, sizeof(cp->type));
+ goto failed;
+ }
+
+ cmd = mgmt_pending_add(sk, MGMT_OP_START_SERVICE_DISCOVERY,
+ hdev, data, len);
+ if (!cmd) {
+ err = -ENOMEM;
+ goto failed;
+ }
+
+ /* Clear the discovery filter first to free any previously
+ * allocated memory for the UUID list.
+ */
+ hci_discovery_filter_clear(hdev);
+
+ hdev->discovery.result_filtering = true;
+ hdev->discovery.type = cp->type;
+ hdev->discovery.rssi = cp->rssi;
+ hdev->discovery.uuid_count = uuid_count;
+
+ if (uuid_count > 0) {
+ hdev->discovery.uuids = kmemdup(cp->uuids, uuid_count * 16,
+ GFP_KERNEL);
+ if (!hdev->discovery.uuids) {
+ err = mgmt_cmd_complete(sk, hdev->id,
+ MGMT_OP_START_SERVICE_DISCOVERY,
+ MGMT_STATUS_FAILED,
+ &cp->type, sizeof(cp->type));
+ mgmt_pending_remove(cmd);
+ goto failed;
+ }
+ }
+
+ err = hci_cmd_sync_queue(hdev, start_discovery_sync, cmd,
+ start_discovery_complete);
+ if (err < 0) {
+ mgmt_pending_remove(cmd);
+ goto failed;
+ }
+
+ hci_discovery_set_state(hdev, DISCOVERY_STARTING);
+
+failed:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static void stop_discovery_complete(struct hci_dev *hdev, void *data, int err)
+{
+ struct mgmt_pending_cmd *cmd = data;
+
+ if (err == -ECANCELED || !mgmt_pending_valid(hdev, cmd))
+ return;
+
+ bt_dev_dbg(hdev, "err %d", err);
+
+ mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode, mgmt_status(err),
+ cmd->param, 1);
+ mgmt_pending_free(cmd);
+
+ if (!err)
+ hci_discovery_set_state(hdev, DISCOVERY_STOPPED);
+}
+
+static int stop_discovery_sync(struct hci_dev *hdev, void *data)
+{
+ if (!mgmt_pending_listed(hdev, data))
+ return -ECANCELED;
+
+ return hci_stop_discovery_sync(hdev);
+}
+
+static int stop_discovery(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 len)
+{
+ struct mgmt_cp_stop_discovery *mgmt_cp = data;
+ struct mgmt_pending_cmd *cmd;
+ int err;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ hci_dev_lock(hdev);
+
+ if (!hci_discovery_active(hdev)) {
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_STOP_DISCOVERY,
+ MGMT_STATUS_REJECTED, &mgmt_cp->type,
+ sizeof(mgmt_cp->type));
+ goto unlock;
+ }
+
+ if (hdev->discovery.type != mgmt_cp->type) {
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_STOP_DISCOVERY,
+ MGMT_STATUS_INVALID_PARAMS,
+ &mgmt_cp->type, sizeof(mgmt_cp->type));
+ goto unlock;
+ }
+
+ cmd = mgmt_pending_add(sk, MGMT_OP_STOP_DISCOVERY, hdev, data, len);
+ if (!cmd) {
+ err = -ENOMEM;
+ goto unlock;
+ }
+
+ err = hci_cmd_sync_queue(hdev, stop_discovery_sync, cmd,
+ stop_discovery_complete);
+ if (err < 0) {
+ mgmt_pending_remove(cmd);
+ goto unlock;
+ }
+
+ hci_discovery_set_state(hdev, DISCOVERY_STOPPING);
+
+unlock:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static int confirm_name(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 len)
+{
+ struct mgmt_cp_confirm_name *cp = data;
+ struct inquiry_entry *e;
+ int err;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ hci_dev_lock(hdev);
+
+ if (!hci_discovery_active(hdev)) {
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_CONFIRM_NAME,
+ MGMT_STATUS_FAILED, &cp->addr,
+ sizeof(cp->addr));
+ goto failed;
+ }
+
+ e = hci_inquiry_cache_lookup_unknown(hdev, &cp->addr.bdaddr);
+ if (!e) {
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_CONFIRM_NAME,
+ MGMT_STATUS_INVALID_PARAMS, &cp->addr,
+ sizeof(cp->addr));
+ goto failed;
+ }
+
+ if (cp->name_known) {
+ e->name_state = NAME_KNOWN;
+ list_del(&e->list);
+ } else {
+ e->name_state = NAME_NEEDED;
+ hci_inquiry_cache_update_resolve(hdev, e);
+ }
+
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_CONFIRM_NAME, 0,
+ &cp->addr, sizeof(cp->addr));
+
+failed:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static int block_device(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 len)
+{
+ struct mgmt_cp_block_device *cp = data;
+ u8 status;
+ int err;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ if (!bdaddr_type_is_valid(cp->addr.type))
+ return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_BLOCK_DEVICE,
+ MGMT_STATUS_INVALID_PARAMS,
+ &cp->addr, sizeof(cp->addr));
+
+ hci_dev_lock(hdev);
+
+ err = hci_bdaddr_list_add(&hdev->reject_list, &cp->addr.bdaddr,
+ cp->addr.type);
+ if (err < 0) {
+ status = MGMT_STATUS_FAILED;
+ goto done;
+ }
+
+ mgmt_event(MGMT_EV_DEVICE_BLOCKED, hdev, &cp->addr, sizeof(cp->addr),
+ sk);
+ status = MGMT_STATUS_SUCCESS;
+
+done:
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_BLOCK_DEVICE, status,
+ &cp->addr, sizeof(cp->addr));
+
+ hci_dev_unlock(hdev);
+
+ return err;
+}
+
+static int unblock_device(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 len)
+{
+ struct mgmt_cp_unblock_device *cp = data;
+ u8 status;
+ int err;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ if (!bdaddr_type_is_valid(cp->addr.type))
+ return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_UNBLOCK_DEVICE,
+ MGMT_STATUS_INVALID_PARAMS,
+ &cp->addr, sizeof(cp->addr));
+
+ hci_dev_lock(hdev);
+
+ err = hci_bdaddr_list_del(&hdev->reject_list, &cp->addr.bdaddr,
+ cp->addr.type);
+ if (err < 0) {
+ status = MGMT_STATUS_INVALID_PARAMS;
+ goto done;
+ }
+
+ mgmt_event(MGMT_EV_DEVICE_UNBLOCKED, hdev, &cp->addr, sizeof(cp->addr),
+ sk);
+ status = MGMT_STATUS_SUCCESS;
+
+done:
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_UNBLOCK_DEVICE, status,
+ &cp->addr, sizeof(cp->addr));
+
+ hci_dev_unlock(hdev);
+
+ return err;
+}
+
+static int set_device_id_sync(struct hci_dev *hdev, void *data)
+{
+ return hci_update_eir_sync(hdev);
+}
+
+static int set_device_id(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 len)
+{
+ struct mgmt_cp_set_device_id *cp = data;
+ int err;
+ __u16 source;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ source = __le16_to_cpu(cp->source);
+
+ if (source > 0x0002)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_DEVICE_ID,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ hci_dev_lock(hdev);
+
+ hdev->devid_source = source;
+ hdev->devid_vendor = __le16_to_cpu(cp->vendor);
+ hdev->devid_product = __le16_to_cpu(cp->product);
+ hdev->devid_version = __le16_to_cpu(cp->version);
+
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_DEVICE_ID, 0,
+ NULL, 0);
+
+ hci_cmd_sync_queue(hdev, set_device_id_sync, NULL, NULL);
+
+ hci_dev_unlock(hdev);
+
+ return err;
+}
+
+static void enable_advertising_instance(struct hci_dev *hdev, int err)
+{
+ if (err)
+ bt_dev_err(hdev, "failed to re-configure advertising %d", err);
+ else
+ bt_dev_dbg(hdev, "status %d", err);
+}
+
+static void set_advertising_complete(struct hci_dev *hdev, void *data, int err)
+{
+ struct mgmt_pending_cmd *cmd = data;
+ struct cmd_lookup match = { NULL, hdev };
+ u8 instance;
+ struct adv_info *adv_instance;
+ u8 status = mgmt_status(err);
+
+ if (err == -ECANCELED || !mgmt_pending_valid(hdev, data))
+ return;
+
+ if (status) {
+ mgmt_cmd_status(cmd->sk, cmd->hdev->id, cmd->opcode, status);
+ mgmt_pending_free(cmd);
+ return;
+ }
+
+ if (hci_dev_test_flag(hdev, HCI_LE_ADV))
+ hci_dev_set_flag(hdev, HCI_ADVERTISING);
+ else
+ hci_dev_clear_flag(hdev, HCI_ADVERTISING);
+
+ settings_rsp(cmd, &match);
+
+ new_settings(hdev, match.sk);
+
+ if (match.sk)
+ sock_put(match.sk);
+
+ /* If "Set Advertising" was just disabled and instance advertising was
+ * set up earlier, then re-enable multi-instance advertising.
+ */
+ if (hci_dev_test_flag(hdev, HCI_ADVERTISING) ||
+ list_empty(&hdev->adv_instances))
+ return;
+
+ instance = hdev->cur_adv_instance;
+ if (!instance) {
+ adv_instance = list_first_entry_or_null(&hdev->adv_instances,
+ struct adv_info, list);
+ if (!adv_instance)
+ return;
+
+ instance = adv_instance->instance;
+ }
+
+ err = hci_schedule_adv_instance_sync(hdev, instance, true);
+
+ enable_advertising_instance(hdev, err);
+}
+
+static int set_adv_sync(struct hci_dev *hdev, void *data)
+{
+ struct mgmt_pending_cmd *cmd = data;
+ struct mgmt_mode cp;
+ u8 val;
+
+ mutex_lock(&hdev->mgmt_pending_lock);
+
+ if (!__mgmt_pending_listed(hdev, cmd)) {
+ mutex_unlock(&hdev->mgmt_pending_lock);
+ return -ECANCELED;
+ }
+
+ memcpy(&cp, cmd->param, sizeof(cp));
+
+ mutex_unlock(&hdev->mgmt_pending_lock);
+
+ val = !!cp.val;
+
+ if (cp.val == 0x02)
+ hci_dev_set_flag(hdev, HCI_ADVERTISING_CONNECTABLE);
+ else
+ hci_dev_clear_flag(hdev, HCI_ADVERTISING_CONNECTABLE);
+
+ cancel_adv_timeout(hdev);
+
+ if (val) {
+ /* Switch to instance "0" for the Set Advertising setting.
+ * We cannot use update_[adv|scan_rsp]_data() here as the
+ * HCI_ADVERTISING flag is not yet set.
+ */
+ hdev->cur_adv_instance = 0x00;
+
+ if (ext_adv_capable(hdev)) {
+ hci_start_ext_adv_sync(hdev, 0x00);
+ } else {
+ hci_update_adv_data_sync(hdev, 0x00);
+ hci_update_scan_rsp_data_sync(hdev, 0x00);
+ hci_enable_advertising_sync(hdev);
+ }
+ } else {
+ hci_disable_advertising_sync(hdev);
+ }
+
+ return 0;
+}
+
+static int set_advertising(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 len)
+{
+ struct mgmt_mode *cp = data;
+ struct mgmt_pending_cmd *cmd;
+ u8 val, status;
+ int err;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ status = mgmt_le_support(hdev);
+ if (status)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_ADVERTISING,
+ status);
+
+ if (cp->val != 0x00 && cp->val != 0x01 && cp->val != 0x02)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_ADVERTISING,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ if (hdev->advertising_paused)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_ADVERTISING,
+ MGMT_STATUS_BUSY);
+
+ hci_dev_lock(hdev);
+
+ val = !!cp->val;
+
+ /* The following conditions are ones which mean that we should
+ * not do any HCI communication but directly send a mgmt
+ * response to user space (after toggling the flag if
+ * necessary).
+ */
+ if (!hdev_is_powered(hdev) ||
+ (val == hci_dev_test_flag(hdev, HCI_ADVERTISING) &&
+ (cp->val == 0x02) == hci_dev_test_flag(hdev, HCI_ADVERTISING_CONNECTABLE)) ||
+ hci_dev_test_flag(hdev, HCI_MESH) ||
+ hci_conn_num(hdev, LE_LINK) > 0 ||
+ (hci_dev_test_flag(hdev, HCI_LE_SCAN) &&
+ hdev->le_scan_type == LE_SCAN_ACTIVE)) {
+ bool changed;
+
+ if (cp->val) {
+ hdev->cur_adv_instance = 0x00;
+ changed = !hci_dev_test_and_set_flag(hdev, HCI_ADVERTISING);
+ if (cp->val == 0x02)
+ hci_dev_set_flag(hdev, HCI_ADVERTISING_CONNECTABLE);
+ else
+ hci_dev_clear_flag(hdev, HCI_ADVERTISING_CONNECTABLE);
+ } else {
+ changed = hci_dev_test_and_clear_flag(hdev, HCI_ADVERTISING);
+ hci_dev_clear_flag(hdev, HCI_ADVERTISING_CONNECTABLE);
+ }
+
+ err = send_settings_rsp(sk, MGMT_OP_SET_ADVERTISING, hdev);
+ if (err < 0)
+ goto unlock;
+
+ if (changed)
+ err = new_settings(hdev, sk);
+
+ goto unlock;
+ }
+
+ if (pending_find(MGMT_OP_SET_ADVERTISING, hdev) ||
+ pending_find(MGMT_OP_SET_LE, hdev)) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_ADVERTISING,
+ MGMT_STATUS_BUSY);
+ goto unlock;
+ }
+
+ cmd = mgmt_pending_add(sk, MGMT_OP_SET_ADVERTISING, hdev, data, len);
+ if (!cmd)
+ err = -ENOMEM;
+ else
+ err = hci_cmd_sync_queue(hdev, set_adv_sync, cmd,
+ set_advertising_complete);
+
+ if (err < 0 && cmd)
+ mgmt_pending_remove(cmd);
+
+unlock:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static int set_static_address(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 len)
+{
+ struct mgmt_cp_set_static_address *cp = data;
+ int err;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ if (!lmp_le_capable(hdev))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_STATIC_ADDRESS,
+ MGMT_STATUS_NOT_SUPPORTED);
+
+ if (hdev_is_powered(hdev))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_STATIC_ADDRESS,
+ MGMT_STATUS_REJECTED);
+
+ if (bacmp(&cp->bdaddr, BDADDR_ANY)) {
+ if (!bacmp(&cp->bdaddr, BDADDR_NONE))
+ return mgmt_cmd_status(sk, hdev->id,
+ MGMT_OP_SET_STATIC_ADDRESS,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ /* Two most significant bits shall be set */
+ if ((cp->bdaddr.b[5] & 0xc0) != 0xc0)
+ return mgmt_cmd_status(sk, hdev->id,
+ MGMT_OP_SET_STATIC_ADDRESS,
+ MGMT_STATUS_INVALID_PARAMS);
+ }
+
+ hci_dev_lock(hdev);
+
+ bacpy(&hdev->static_addr, &cp->bdaddr);
+
+ err = send_settings_rsp(sk, MGMT_OP_SET_STATIC_ADDRESS, hdev);
+ if (err < 0)
+ goto unlock;
+
+ err = new_settings(hdev, sk);
+
+unlock:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static int set_scan_params(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 len)
+{
+ struct mgmt_cp_set_scan_params *cp = data;
+ __u16 interval, window;
+ int err;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ if (!lmp_le_capable(hdev))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_SCAN_PARAMS,
+ MGMT_STATUS_NOT_SUPPORTED);
+
+ /* Keep allowed ranges in sync with set_mesh() */
+ interval = __le16_to_cpu(cp->interval);
+
+ if (interval < 0x0004 || interval > 0x4000)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_SCAN_PARAMS,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ window = __le16_to_cpu(cp->window);
+
+ if (window < 0x0004 || window > 0x4000)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_SCAN_PARAMS,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ if (window > interval)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_SCAN_PARAMS,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ hci_dev_lock(hdev);
+
+ hdev->le_scan_interval = interval;
+ hdev->le_scan_window = window;
+
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_SCAN_PARAMS, 0,
+ NULL, 0);
+
+ /* If background scan is running, restart it so new parameters are
+ * loaded.
+ */
+ if (hci_dev_test_flag(hdev, HCI_LE_SCAN) &&
+ hdev->discovery.state == DISCOVERY_STOPPED)
+ hci_update_passive_scan(hdev);
+
+ hci_dev_unlock(hdev);
+
+ return err;
+}
+
+static void fast_connectable_complete(struct hci_dev *hdev, void *data, int err)
+{
+ struct mgmt_pending_cmd *cmd = data;
+
+ bt_dev_dbg(hdev, "err %d", err);
+
+ if (err) {
+ mgmt_cmd_status(cmd->sk, hdev->id, MGMT_OP_SET_FAST_CONNECTABLE,
+ mgmt_status(err));
+ } else {
+ struct mgmt_mode *cp = cmd->param;
+
+ if (cp->val)
+ hci_dev_set_flag(hdev, HCI_FAST_CONNECTABLE);
+ else
+ hci_dev_clear_flag(hdev, HCI_FAST_CONNECTABLE);
+
+ send_settings_rsp(cmd->sk, MGMT_OP_SET_FAST_CONNECTABLE, hdev);
+ new_settings(hdev, cmd->sk);
+ }
+
+ mgmt_pending_free(cmd);
+}
+
+static int write_fast_connectable_sync(struct hci_dev *hdev, void *data)
+{
+ struct mgmt_pending_cmd *cmd = data;
+ struct mgmt_mode *cp = cmd->param;
+
+ return hci_write_fast_connectable_sync(hdev, cp->val);
+}
+
+static int set_fast_connectable(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 len)
+{
+ struct mgmt_mode *cp = data;
+ struct mgmt_pending_cmd *cmd;
+ int err;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED) ||
+ hdev->hci_ver < BLUETOOTH_VER_1_2)
+ return mgmt_cmd_status(sk, hdev->id,
+ MGMT_OP_SET_FAST_CONNECTABLE,
+ MGMT_STATUS_NOT_SUPPORTED);
+
+ if (cp->val != 0x00 && cp->val != 0x01)
+ return mgmt_cmd_status(sk, hdev->id,
+ MGMT_OP_SET_FAST_CONNECTABLE,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ hci_dev_lock(hdev);
+
+ if (!!cp->val == hci_dev_test_flag(hdev, HCI_FAST_CONNECTABLE)) {
+ err = send_settings_rsp(sk, MGMT_OP_SET_FAST_CONNECTABLE, hdev);
+ goto unlock;
+ }
+
+ if (!hdev_is_powered(hdev)) {
+ hci_dev_change_flag(hdev, HCI_FAST_CONNECTABLE);
+ err = send_settings_rsp(sk, MGMT_OP_SET_FAST_CONNECTABLE, hdev);
+ new_settings(hdev, sk);
+ goto unlock;
+ }
+
+ cmd = mgmt_pending_new(sk, MGMT_OP_SET_FAST_CONNECTABLE, hdev, data,
+ len);
+ if (!cmd)
+ err = -ENOMEM;
+ else
+ err = hci_cmd_sync_queue(hdev, write_fast_connectable_sync, cmd,
+ fast_connectable_complete);
+
+ if (err < 0) {
+ mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_FAST_CONNECTABLE,
+ MGMT_STATUS_FAILED);
+
+ if (cmd)
+ mgmt_pending_free(cmd);
+ }
+
+unlock:
+ hci_dev_unlock(hdev);
+
+ return err;
+}
+
+static void set_bredr_complete(struct hci_dev *hdev, void *data, int err)
+{
+ struct mgmt_pending_cmd *cmd = data;
+
+ bt_dev_dbg(hdev, "err %d", err);
+
+ if (err) {
+ u8 mgmt_err = mgmt_status(err);
+
+ /* We need to restore the flag if related HCI commands
+ * failed.
+ */
+ hci_dev_clear_flag(hdev, HCI_BREDR_ENABLED);
+
+ mgmt_cmd_status(cmd->sk, cmd->hdev->id, cmd->opcode, mgmt_err);
+ } else {
+ send_settings_rsp(cmd->sk, MGMT_OP_SET_BREDR, hdev);
+ new_settings(hdev, cmd->sk);
+ }
+
+ mgmt_pending_free(cmd);
+}
+
+static int set_bredr_sync(struct hci_dev *hdev, void *data)
+{
+ int status;
+
+ status = hci_write_fast_connectable_sync(hdev, false);
+
+ if (!status)
+ status = hci_update_scan_sync(hdev);
+
+ /* Since only the advertising data flags will change, there
+ * is no need to update the scan response data.
+ */
+ if (!status)
+ status = hci_update_adv_data_sync(hdev, hdev->cur_adv_instance);
+
+ return status;
+}
+
+static int set_bredr(struct sock *sk, struct hci_dev *hdev, void *data, u16 len)
+{
+ struct mgmt_mode *cp = data;
+ struct mgmt_pending_cmd *cmd;
+ int err;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ if (!lmp_bredr_capable(hdev) || !lmp_le_capable(hdev))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_BREDR,
+ MGMT_STATUS_NOT_SUPPORTED);
+
+ if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_BREDR,
+ MGMT_STATUS_REJECTED);
+
+ if (cp->val != 0x00 && cp->val != 0x01)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_BREDR,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ hci_dev_lock(hdev);
+
+ if (cp->val == hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) {
+ err = send_settings_rsp(sk, MGMT_OP_SET_BREDR, hdev);
+ goto unlock;
+ }
+
+ if (!hdev_is_powered(hdev)) {
+ if (!cp->val) {
+ hci_dev_clear_flag(hdev, HCI_DISCOVERABLE);
+ hci_dev_clear_flag(hdev, HCI_SSP_ENABLED);
+ hci_dev_clear_flag(hdev, HCI_LINK_SECURITY);
+ hci_dev_clear_flag(hdev, HCI_FAST_CONNECTABLE);
+ }
+
+ hci_dev_change_flag(hdev, HCI_BREDR_ENABLED);
+
+ err = send_settings_rsp(sk, MGMT_OP_SET_BREDR, hdev);
+ if (err < 0)
+ goto unlock;
+
+ err = new_settings(hdev, sk);
+ goto unlock;
+ }
+
+ /* Reject disabling when powered on */
+ if (!cp->val) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_BREDR,
+ MGMT_STATUS_REJECTED);
+ goto unlock;
+ } else {
+ /* When configuring a dual-mode controller to operate
+ * with LE only and using a static address, then switching
+ * BR/EDR back on is not allowed.
+ *
+ * Dual-mode controllers shall operate with the public
+ * address as its identity address for BR/EDR and LE. So
+ * reject the attempt to create an invalid configuration.
+ *
+ * The same restrictions applies when secure connections
+ * has been enabled. For BR/EDR this is a controller feature
+ * while for LE it is a host stack feature. This means that
+ * switching BR/EDR back on when secure connections has been
+ * enabled is not a supported transaction.
+ */
+ if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED) &&
+ (bacmp(&hdev->static_addr, BDADDR_ANY) ||
+ hci_dev_test_flag(hdev, HCI_SC_ENABLED))) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_BREDR,
+ MGMT_STATUS_REJECTED);
+ goto unlock;
+ }
+ }
+
+ cmd = mgmt_pending_new(sk, MGMT_OP_SET_BREDR, hdev, data, len);
+ if (!cmd)
+ err = -ENOMEM;
+ else
+ err = hci_cmd_sync_queue(hdev, set_bredr_sync, cmd,
+ set_bredr_complete);
+
+ if (err < 0) {
+ mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_BREDR,
+ MGMT_STATUS_FAILED);
+ if (cmd)
+ mgmt_pending_free(cmd);
+
+ goto unlock;
+ }
+
+ /* We need to flip the bit already here so that
+ * hci_req_update_adv_data generates the correct flags.
+ */
+ hci_dev_set_flag(hdev, HCI_BREDR_ENABLED);
+
+unlock:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static void set_secure_conn_complete(struct hci_dev *hdev, void *data, int err)
+{
+ struct mgmt_pending_cmd *cmd = data;
+ struct mgmt_mode *cp;
+
+ bt_dev_dbg(hdev, "err %d", err);
+
+ if (err) {
+ u8 mgmt_err = mgmt_status(err);
+
+ mgmt_cmd_status(cmd->sk, cmd->hdev->id, cmd->opcode, mgmt_err);
+ goto done;
+ }
+
+ cp = cmd->param;
+
+ switch (cp->val) {
+ case 0x00:
+ hci_dev_clear_flag(hdev, HCI_SC_ENABLED);
+ hci_dev_clear_flag(hdev, HCI_SC_ONLY);
+ break;
+ case 0x01:
+ hci_dev_set_flag(hdev, HCI_SC_ENABLED);
+ hci_dev_clear_flag(hdev, HCI_SC_ONLY);
+ break;
+ case 0x02:
+ hci_dev_set_flag(hdev, HCI_SC_ENABLED);
+ hci_dev_set_flag(hdev, HCI_SC_ONLY);
+ break;
+ }
+
+ send_settings_rsp(cmd->sk, cmd->opcode, hdev);
+ new_settings(hdev, cmd->sk);
+
+done:
+ mgmt_pending_free(cmd);
+}
+
+static int set_secure_conn_sync(struct hci_dev *hdev, void *data)
+{
+ struct mgmt_pending_cmd *cmd = data;
+ struct mgmt_mode *cp = cmd->param;
+ u8 val = !!cp->val;
+
+ /* Force write of val */
+ hci_dev_set_flag(hdev, HCI_SC_ENABLED);
+
+ return hci_write_sc_support_sync(hdev, val);
+}
+
+static int set_secure_conn(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 len)
+{
+ struct mgmt_mode *cp = data;
+ struct mgmt_pending_cmd *cmd;
+ u8 val;
+ int err;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ if (!lmp_sc_capable(hdev) &&
+ !hci_dev_test_flag(hdev, HCI_LE_ENABLED))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_SECURE_CONN,
+ MGMT_STATUS_NOT_SUPPORTED);
+
+ if (hci_dev_test_flag(hdev, HCI_BREDR_ENABLED) &&
+ lmp_sc_capable(hdev) &&
+ !hci_dev_test_flag(hdev, HCI_SSP_ENABLED))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_SECURE_CONN,
+ MGMT_STATUS_REJECTED);
+
+ if (cp->val != 0x00 && cp->val != 0x01 && cp->val != 0x02)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_SECURE_CONN,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ hci_dev_lock(hdev);
+
+ if (!hdev_is_powered(hdev) || !lmp_sc_capable(hdev) ||
+ !hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) {
+ bool changed;
+
+ if (cp->val) {
+ changed = !hci_dev_test_and_set_flag(hdev,
+ HCI_SC_ENABLED);
+ if (cp->val == 0x02)
+ hci_dev_set_flag(hdev, HCI_SC_ONLY);
+ else
+ hci_dev_clear_flag(hdev, HCI_SC_ONLY);
+ } else {
+ changed = hci_dev_test_and_clear_flag(hdev,
+ HCI_SC_ENABLED);
+ hci_dev_clear_flag(hdev, HCI_SC_ONLY);
+ }
+
+ err = send_settings_rsp(sk, MGMT_OP_SET_SECURE_CONN, hdev);
+ if (err < 0)
+ goto failed;
+
+ if (changed)
+ err = new_settings(hdev, sk);
+
+ goto failed;
+ }
+
+ val = !!cp->val;
+
+ if (val == hci_dev_test_flag(hdev, HCI_SC_ENABLED) &&
+ (cp->val == 0x02) == hci_dev_test_flag(hdev, HCI_SC_ONLY)) {
+ err = send_settings_rsp(sk, MGMT_OP_SET_SECURE_CONN, hdev);
+ goto failed;
+ }
+
+ cmd = mgmt_pending_new(sk, MGMT_OP_SET_SECURE_CONN, hdev, data, len);
+ if (!cmd)
+ err = -ENOMEM;
+ else
+ err = hci_cmd_sync_queue(hdev, set_secure_conn_sync, cmd,
+ set_secure_conn_complete);
+
+ if (err < 0) {
+ mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_SECURE_CONN,
+ MGMT_STATUS_FAILED);
+ if (cmd)
+ mgmt_pending_free(cmd);
+ }
+
+failed:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static int set_debug_keys(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 len)
+{
+ struct mgmt_mode *cp = data;
+ bool changed, use_changed;
+ int err;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ if (cp->val != 0x00 && cp->val != 0x01 && cp->val != 0x02)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_DEBUG_KEYS,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ hci_dev_lock(hdev);
+
+ if (cp->val)
+ changed = !hci_dev_test_and_set_flag(hdev, HCI_KEEP_DEBUG_KEYS);
+ else
+ changed = hci_dev_test_and_clear_flag(hdev,
+ HCI_KEEP_DEBUG_KEYS);
+
+ if (cp->val == 0x02)
+ use_changed = !hci_dev_test_and_set_flag(hdev,
+ HCI_USE_DEBUG_KEYS);
+ else
+ use_changed = hci_dev_test_and_clear_flag(hdev,
+ HCI_USE_DEBUG_KEYS);
+
+ if (hdev_is_powered(hdev) && use_changed &&
+ hci_dev_test_flag(hdev, HCI_SSP_ENABLED)) {
+ u8 mode = (cp->val == 0x02) ? 0x01 : 0x00;
+ hci_send_cmd(hdev, HCI_OP_WRITE_SSP_DEBUG_MODE,
+ sizeof(mode), &mode);
+ }
+
+ err = send_settings_rsp(sk, MGMT_OP_SET_DEBUG_KEYS, hdev);
+ if (err < 0)
+ goto unlock;
+
+ if (changed)
+ err = new_settings(hdev, sk);
+
+unlock:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static int set_privacy(struct sock *sk, struct hci_dev *hdev, void *cp_data,
+ u16 len)
+{
+ struct mgmt_cp_set_privacy *cp = cp_data;
+ bool changed;
+ int err;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ if (!lmp_le_capable(hdev))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_PRIVACY,
+ MGMT_STATUS_NOT_SUPPORTED);
+
+ if (cp->privacy != 0x00 && cp->privacy != 0x01 && cp->privacy != 0x02)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_PRIVACY,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ if (hdev_is_powered(hdev))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_PRIVACY,
+ MGMT_STATUS_REJECTED);
+
+ hci_dev_lock(hdev);
+
+ /* If user space supports this command it is also expected to
+ * handle IRKs. Therefore, set the HCI_RPA_RESOLVING flag.
+ */
+ hci_dev_set_flag(hdev, HCI_RPA_RESOLVING);
+
+ if (cp->privacy) {
+ changed = !hci_dev_test_and_set_flag(hdev, HCI_PRIVACY);
+ memcpy(hdev->irk, cp->irk, sizeof(hdev->irk));
+ hci_dev_set_flag(hdev, HCI_RPA_EXPIRED);
+ hci_adv_instances_set_rpa_expired(hdev, true);
+ if (cp->privacy == 0x02)
+ hci_dev_set_flag(hdev, HCI_LIMITED_PRIVACY);
+ else
+ hci_dev_clear_flag(hdev, HCI_LIMITED_PRIVACY);
+ } else {
+ changed = hci_dev_test_and_clear_flag(hdev, HCI_PRIVACY);
+ memset(hdev->irk, 0, sizeof(hdev->irk));
+ hci_dev_clear_flag(hdev, HCI_RPA_EXPIRED);
+ hci_adv_instances_set_rpa_expired(hdev, false);
+ hci_dev_clear_flag(hdev, HCI_LIMITED_PRIVACY);
+ }
+
+ err = send_settings_rsp(sk, MGMT_OP_SET_PRIVACY, hdev);
+ if (err < 0)
+ goto unlock;
+
+ if (changed)
+ err = new_settings(hdev, sk);
+
+unlock:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static bool irk_is_valid(struct mgmt_irk_info *irk)
+{
+ switch (irk->addr.type) {
+ case BDADDR_LE_PUBLIC:
+ return true;
+
+ case BDADDR_LE_RANDOM:
+ /* Two most significant bits shall be set */
+ if ((irk->addr.bdaddr.b[5] & 0xc0) != 0xc0)
+ return false;
+ return true;
+ }
+
+ return false;
+}
+
+static int load_irks(struct sock *sk, struct hci_dev *hdev, void *cp_data,
+ u16 len)
+{
+ struct mgmt_cp_load_irks *cp = cp_data;
+ const u16 max_irk_count = ((U16_MAX - sizeof(*cp)) /
+ sizeof(struct mgmt_irk_info));
+ u16 irk_count, expected_len;
+ int i, err;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ if (!lmp_le_capable(hdev))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_IRKS,
+ MGMT_STATUS_NOT_SUPPORTED);
+
+ irk_count = __le16_to_cpu(cp->irk_count);
+ if (irk_count > max_irk_count) {
+ bt_dev_err(hdev, "load_irks: too big irk_count value %u",
+ irk_count);
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_IRKS,
+ MGMT_STATUS_INVALID_PARAMS);
+ }
+
+ expected_len = struct_size(cp, irks, irk_count);
+ if (expected_len != len) {
+ bt_dev_err(hdev, "load_irks: expected %u bytes, got %u bytes",
+ expected_len, len);
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_IRKS,
+ MGMT_STATUS_INVALID_PARAMS);
+ }
+
+ bt_dev_dbg(hdev, "irk_count %u", irk_count);
+
+ for (i = 0; i < irk_count; i++) {
+ struct mgmt_irk_info *key = &cp->irks[i];
+
+ if (!irk_is_valid(key))
+ return mgmt_cmd_status(sk, hdev->id,
+ MGMT_OP_LOAD_IRKS,
+ MGMT_STATUS_INVALID_PARAMS);
+ }
+
+ hci_dev_lock(hdev);
+
+ hci_smp_irks_clear(hdev);
+
+ for (i = 0; i < irk_count; i++) {
+ struct mgmt_irk_info *irk = &cp->irks[i];
+
+ if (hci_is_blocked_key(hdev,
+ HCI_BLOCKED_KEY_TYPE_IRK,
+ irk->val)) {
+ bt_dev_warn(hdev, "Skipping blocked IRK for %pMR",
+ &irk->addr.bdaddr);
+ continue;
+ }
+
+ hci_add_irk(hdev, &irk->addr.bdaddr,
+ le_addr_type(irk->addr.type), irk->val,
+ BDADDR_ANY);
+ }
+
+ hci_dev_set_flag(hdev, HCI_RPA_RESOLVING);
+
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_LOAD_IRKS, 0, NULL, 0);
+
+ hci_dev_unlock(hdev);
+
+ return err;
+}
+
+static bool ltk_is_valid(struct mgmt_ltk_info *key)
+{
+ if (key->initiator != 0x00 && key->initiator != 0x01)
+ return false;
+
+ switch (key->addr.type) {
+ case BDADDR_LE_PUBLIC:
+ return true;
+
+ case BDADDR_LE_RANDOM:
+ /* Two most significant bits shall be set */
+ if ((key->addr.bdaddr.b[5] & 0xc0) != 0xc0)
+ return false;
+ return true;
+ }
+
+ return false;
+}
+
+static int load_long_term_keys(struct sock *sk, struct hci_dev *hdev,
+ void *cp_data, u16 len)
+{
+ struct mgmt_cp_load_long_term_keys *cp = cp_data;
+ const u16 max_key_count = ((U16_MAX - sizeof(*cp)) /
+ sizeof(struct mgmt_ltk_info));
+ u16 key_count, expected_len;
+ int i, err;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ if (!lmp_le_capable(hdev))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_LONG_TERM_KEYS,
+ MGMT_STATUS_NOT_SUPPORTED);
+
+ key_count = __le16_to_cpu(cp->key_count);
+ if (key_count > max_key_count) {
+ bt_dev_err(hdev, "load_ltks: too big key_count value %u",
+ key_count);
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_LONG_TERM_KEYS,
+ MGMT_STATUS_INVALID_PARAMS);
+ }
+
+ expected_len = struct_size(cp, keys, key_count);
+ if (expected_len != len) {
+ bt_dev_err(hdev, "load_keys: expected %u bytes, got %u bytes",
+ expected_len, len);
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_LONG_TERM_KEYS,
+ MGMT_STATUS_INVALID_PARAMS);
+ }
+
+ bt_dev_dbg(hdev, "key_count %u", key_count);
+
+ hci_dev_lock(hdev);
+
+ hci_smp_ltks_clear(hdev);
+
+ for (i = 0; i < key_count; i++) {
+ struct mgmt_ltk_info *key = &cp->keys[i];
+ u8 type, authenticated;
+
+ if (hci_is_blocked_key(hdev,
+ HCI_BLOCKED_KEY_TYPE_LTK,
+ key->val)) {
+ bt_dev_warn(hdev, "Skipping blocked LTK for %pMR",
+ &key->addr.bdaddr);
+ continue;
+ }
+
+ if (!ltk_is_valid(key)) {
+ bt_dev_warn(hdev, "Invalid LTK for %pMR",
+ &key->addr.bdaddr);
+ continue;
+ }
+
+ switch (key->type) {
+ case MGMT_LTK_UNAUTHENTICATED:
+ authenticated = 0x00;
+ type = key->initiator ? SMP_LTK : SMP_LTK_RESPONDER;
+ break;
+ case MGMT_LTK_AUTHENTICATED:
+ authenticated = 0x01;
+ type = key->initiator ? SMP_LTK : SMP_LTK_RESPONDER;
+ break;
+ case MGMT_LTK_P256_UNAUTH:
+ authenticated = 0x00;
+ type = SMP_LTK_P256;
+ break;
+ case MGMT_LTK_P256_AUTH:
+ authenticated = 0x01;
+ type = SMP_LTK_P256;
+ break;
+ case MGMT_LTK_P256_DEBUG:
+ authenticated = 0x00;
+ type = SMP_LTK_P256_DEBUG;
+ fallthrough;
+ default:
+ continue;
+ }
+
+ hci_add_ltk(hdev, &key->addr.bdaddr,
+ le_addr_type(key->addr.type), type, authenticated,
+ key->val, key->enc_size, key->ediv, key->rand);
+ }
+
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_LOAD_LONG_TERM_KEYS, 0,
+ NULL, 0);
+
+ hci_dev_unlock(hdev);
+
+ return err;
+}
+
+static void get_conn_info_complete(struct hci_dev *hdev, void *data, int err)
+{
+ struct mgmt_pending_cmd *cmd = data;
+ struct hci_conn *conn = cmd->user_data;
+ struct mgmt_cp_get_conn_info *cp = cmd->param;
+ struct mgmt_rp_get_conn_info rp;
+ u8 status;
+
+ bt_dev_dbg(hdev, "err %d", err);
+
+ memcpy(&rp.addr, &cp->addr, sizeof(rp.addr));
+
+ status = mgmt_status(err);
+ if (status == MGMT_STATUS_SUCCESS) {
+ rp.rssi = conn->rssi;
+ rp.tx_power = conn->tx_power;
+ rp.max_tx_power = conn->max_tx_power;
+ } else {
+ rp.rssi = HCI_RSSI_INVALID;
+ rp.tx_power = HCI_TX_POWER_INVALID;
+ rp.max_tx_power = HCI_TX_POWER_INVALID;
+ }
+
+ mgmt_cmd_complete(cmd->sk, cmd->hdev->id, MGMT_OP_GET_CONN_INFO, status,
+ &rp, sizeof(rp));
+
+ mgmt_pending_free(cmd);
+}
+
+static int get_conn_info_sync(struct hci_dev *hdev, void *data)
+{
+ struct mgmt_pending_cmd *cmd = data;
+ struct mgmt_cp_get_conn_info *cp = cmd->param;
+ struct hci_conn *conn;
+ int err;
+ __le16 handle;
+
+ /* Make sure we are still connected */
+ if (cp->addr.type == BDADDR_BREDR)
+ conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK,
+ &cp->addr.bdaddr);
+ else
+ conn = hci_conn_hash_lookup_ba(hdev, LE_LINK, &cp->addr.bdaddr);
+
+ if (!conn || conn->state != BT_CONNECTED)
+ return MGMT_STATUS_NOT_CONNECTED;
+
+ cmd->user_data = conn;
+ handle = cpu_to_le16(conn->handle);
+
+ /* Refresh RSSI each time */
+ err = hci_read_rssi_sync(hdev, handle);
+
+ /* For LE links TX power does not change thus we don't need to
+ * query for it once value is known.
+ */
+ if (!err && (!bdaddr_type_is_le(cp->addr.type) ||
+ conn->tx_power == HCI_TX_POWER_INVALID))
+ err = hci_read_tx_power_sync(hdev, handle, 0x00);
+
+ /* Max TX power needs to be read only once per connection */
+ if (!err && conn->max_tx_power == HCI_TX_POWER_INVALID)
+ err = hci_read_tx_power_sync(hdev, handle, 0x01);
+
+ return err;
+}
+
+static int get_conn_info(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 len)
+{
+ struct mgmt_cp_get_conn_info *cp = data;
+ struct mgmt_rp_get_conn_info rp;
+ struct hci_conn *conn;
+ unsigned long conn_info_age;
+ int err = 0;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ memset(&rp, 0, sizeof(rp));
+ bacpy(&rp.addr.bdaddr, &cp->addr.bdaddr);
+ rp.addr.type = cp->addr.type;
+
+ if (!bdaddr_type_is_valid(cp->addr.type))
+ return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_GET_CONN_INFO,
+ MGMT_STATUS_INVALID_PARAMS,
+ &rp, sizeof(rp));
+
+ hci_dev_lock(hdev);
+
+ if (!hdev_is_powered(hdev)) {
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_GET_CONN_INFO,
+ MGMT_STATUS_NOT_POWERED, &rp,
+ sizeof(rp));
+ goto unlock;
+ }
+
+ if (cp->addr.type == BDADDR_BREDR)
+ conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK,
+ &cp->addr.bdaddr);
+ else
+ conn = hci_conn_hash_lookup_ba(hdev, LE_LINK, &cp->addr.bdaddr);
+
+ if (!conn || conn->state != BT_CONNECTED) {
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_GET_CONN_INFO,
+ MGMT_STATUS_NOT_CONNECTED, &rp,
+ sizeof(rp));
+ goto unlock;
+ }
+
+ /* To avoid client trying to guess when to poll again for information we
+ * calculate conn info age as random value between min/max set in hdev.
+ */
+ conn_info_age = get_random_u32_inclusive(hdev->conn_info_min_age,
+ hdev->conn_info_max_age - 1);
+
+ /* Query controller to refresh cached values if they are too old or were
+ * never read.
+ */
+ if (time_after(jiffies, conn->conn_info_timestamp +
+ msecs_to_jiffies(conn_info_age)) ||
+ !conn->conn_info_timestamp) {
+ struct mgmt_pending_cmd *cmd;
+
+ cmd = mgmt_pending_new(sk, MGMT_OP_GET_CONN_INFO, hdev, data,
+ len);
+ if (!cmd) {
+ err = -ENOMEM;
+ } else {
+ err = hci_cmd_sync_queue(hdev, get_conn_info_sync,
+ cmd, get_conn_info_complete);
+ }
+
+ if (err < 0) {
+ mgmt_cmd_complete(sk, hdev->id, MGMT_OP_GET_CONN_INFO,
+ MGMT_STATUS_FAILED, &rp, sizeof(rp));
+
+ if (cmd)
+ mgmt_pending_free(cmd);
+
+ goto unlock;
+ }
+
+ conn->conn_info_timestamp = jiffies;
+ } else {
+ /* Cache is valid, just reply with values cached in hci_conn */
+ rp.rssi = conn->rssi;
+ rp.tx_power = conn->tx_power;
+ rp.max_tx_power = conn->max_tx_power;
+
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_GET_CONN_INFO,
+ MGMT_STATUS_SUCCESS, &rp, sizeof(rp));
+ }
+
+unlock:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static void get_clock_info_complete(struct hci_dev *hdev, void *data, int err)
+{
+ struct mgmt_pending_cmd *cmd = data;
+ struct mgmt_cp_get_clock_info *cp = cmd->param;
+ struct mgmt_rp_get_clock_info rp;
+ struct hci_conn *conn = cmd->user_data;
+ u8 status = mgmt_status(err);
+
+ bt_dev_dbg(hdev, "err %d", err);
+
+ memset(&rp, 0, sizeof(rp));
+ bacpy(&rp.addr.bdaddr, &cp->addr.bdaddr);
+ rp.addr.type = cp->addr.type;
+
+ if (err)
+ goto complete;
+
+ rp.local_clock = cpu_to_le32(hdev->clock);
+
+ if (conn) {
+ rp.piconet_clock = cpu_to_le32(conn->clock);
+ rp.accuracy = cpu_to_le16(conn->clock_accuracy);
+ }
+
+complete:
+ mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode, status, &rp,
+ sizeof(rp));
+
+ mgmt_pending_free(cmd);
+}
+
+static int get_clock_info_sync(struct hci_dev *hdev, void *data)
+{
+ struct mgmt_pending_cmd *cmd = data;
+ struct mgmt_cp_get_clock_info *cp = cmd->param;
+ struct hci_cp_read_clock hci_cp;
+ struct hci_conn *conn;
+
+ memset(&hci_cp, 0, sizeof(hci_cp));
+ hci_read_clock_sync(hdev, &hci_cp);
+
+ /* Make sure connection still exists */
+ conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &cp->addr.bdaddr);
+ if (!conn || conn->state != BT_CONNECTED)
+ return MGMT_STATUS_NOT_CONNECTED;
+
+ cmd->user_data = conn;
+ hci_cp.handle = cpu_to_le16(conn->handle);
+ hci_cp.which = 0x01; /* Piconet clock */
+
+ return hci_read_clock_sync(hdev, &hci_cp);
+}
+
+static int get_clock_info(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 len)
+{
+ struct mgmt_cp_get_clock_info *cp = data;
+ struct mgmt_rp_get_clock_info rp;
+ struct mgmt_pending_cmd *cmd;
+ struct hci_conn *conn;
+ int err;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ memset(&rp, 0, sizeof(rp));
+ bacpy(&rp.addr.bdaddr, &cp->addr.bdaddr);
+ rp.addr.type = cp->addr.type;
+
+ if (cp->addr.type != BDADDR_BREDR)
+ return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_GET_CLOCK_INFO,
+ MGMT_STATUS_INVALID_PARAMS,
+ &rp, sizeof(rp));
+
+ hci_dev_lock(hdev);
+
+ if (!hdev_is_powered(hdev)) {
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_GET_CLOCK_INFO,
+ MGMT_STATUS_NOT_POWERED, &rp,
+ sizeof(rp));
+ goto unlock;
+ }
+
+ if (bacmp(&cp->addr.bdaddr, BDADDR_ANY)) {
+ conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK,
+ &cp->addr.bdaddr);
+ if (!conn || conn->state != BT_CONNECTED) {
+ err = mgmt_cmd_complete(sk, hdev->id,
+ MGMT_OP_GET_CLOCK_INFO,
+ MGMT_STATUS_NOT_CONNECTED,
+ &rp, sizeof(rp));
+ goto unlock;
+ }
+ } else {
+ conn = NULL;
+ }
+
+ cmd = mgmt_pending_new(sk, MGMT_OP_GET_CLOCK_INFO, hdev, data, len);
+ if (!cmd)
+ err = -ENOMEM;
+ else
+ err = hci_cmd_sync_queue(hdev, get_clock_info_sync, cmd,
+ get_clock_info_complete);
+
+ if (err < 0) {
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_GET_CLOCK_INFO,
+ MGMT_STATUS_FAILED, &rp, sizeof(rp));
+
+ if (cmd)
+ mgmt_pending_free(cmd);
+ }
+
+
+unlock:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static void device_added(struct sock *sk, struct hci_dev *hdev,
+ bdaddr_t *bdaddr, u8 type, u8 action)
+{
+ struct mgmt_ev_device_added ev;
+
+ bacpy(&ev.addr.bdaddr, bdaddr);
+ ev.addr.type = type;
+ ev.action = action;
+
+ mgmt_event(MGMT_EV_DEVICE_ADDED, hdev, &ev, sizeof(ev), sk);
+}
+
+static void add_device_complete(struct hci_dev *hdev, void *data, int err)
+{
+ struct mgmt_pending_cmd *cmd = data;
+ struct mgmt_cp_add_device *cp = cmd->param;
+
+ if (!err) {
+ struct hci_conn_params *params;
+
+ params = hci_conn_params_lookup(hdev, &cp->addr.bdaddr,
+ le_addr_type(cp->addr.type));
+
+ device_added(cmd->sk, hdev, &cp->addr.bdaddr, cp->addr.type,
+ cp->action);
+ device_flags_changed(NULL, hdev, &cp->addr.bdaddr,
+ cp->addr.type, hdev->conn_flags,
+ params ? params->flags : 0);
+ }
+
+ mgmt_cmd_complete(cmd->sk, hdev->id, MGMT_OP_ADD_DEVICE,
+ mgmt_status(err), &cp->addr, sizeof(cp->addr));
+ mgmt_pending_free(cmd);
+}
+
+static int add_device_sync(struct hci_dev *hdev, void *data)
+{
+ return hci_update_passive_scan_sync(hdev);
+}
+
+static int add_device(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 len)
+{
+ struct mgmt_pending_cmd *cmd;
+ struct mgmt_cp_add_device *cp = data;
+ u8 auto_conn, addr_type;
+ struct hci_conn_params *params;
+ int err;
+ u32 current_flags = 0;
+ u32 supported_flags;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ if (!bdaddr_type_is_valid(cp->addr.type) ||
+ !bacmp(&cp->addr.bdaddr, BDADDR_ANY))
+ return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_ADD_DEVICE,
+ MGMT_STATUS_INVALID_PARAMS,
+ &cp->addr, sizeof(cp->addr));
+
+ if (cp->action != 0x00 && cp->action != 0x01 && cp->action != 0x02)
+ return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_ADD_DEVICE,
+ MGMT_STATUS_INVALID_PARAMS,
+ &cp->addr, sizeof(cp->addr));
+
+ hci_dev_lock(hdev);
+
+ if (cp->addr.type == BDADDR_BREDR) {
+ /* Only incoming connections action is supported for now */
+ if (cp->action != 0x01) {
+ err = mgmt_cmd_complete(sk, hdev->id,
+ MGMT_OP_ADD_DEVICE,
+ MGMT_STATUS_INVALID_PARAMS,
+ &cp->addr, sizeof(cp->addr));
+ goto unlock;
+ }
+
+ err = hci_bdaddr_list_add_with_flags(&hdev->accept_list,
+ &cp->addr.bdaddr,
+ cp->addr.type, 0);
+ if (err)
+ goto unlock;
+
+ hci_update_scan(hdev);
+
+ goto added;
+ }
+
+ addr_type = le_addr_type(cp->addr.type);
+
+ if (cp->action == 0x02)
+ auto_conn = HCI_AUTO_CONN_ALWAYS;
+ else if (cp->action == 0x01)
+ auto_conn = HCI_AUTO_CONN_DIRECT;
+ else
+ auto_conn = HCI_AUTO_CONN_REPORT;
+
+ /* Kernel internally uses conn_params with resolvable private
+ * address, but Add Device allows only identity addresses.
+ * Make sure it is enforced before calling
+ * hci_conn_params_lookup.
+ */
+ if (!hci_is_identity_address(&cp->addr.bdaddr, addr_type)) {
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_ADD_DEVICE,
+ MGMT_STATUS_INVALID_PARAMS,
+ &cp->addr, sizeof(cp->addr));
+ goto unlock;
+ }
+
+ /* If the connection parameters don't exist for this device,
+ * they will be created and configured with defaults.
+ */
+ params = hci_conn_params_set(hdev, &cp->addr.bdaddr, addr_type,
+ auto_conn);
+ if (!params) {
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_ADD_DEVICE,
+ MGMT_STATUS_FAILED, &cp->addr,
+ sizeof(cp->addr));
+ goto unlock;
+ }
+
+ cmd = mgmt_pending_new(sk, MGMT_OP_ADD_DEVICE, hdev, data, len);
+ if (!cmd) {
+ err = -ENOMEM;
+ goto unlock;
+ }
+
+ err = hci_cmd_sync_queue(hdev, add_device_sync, cmd,
+ add_device_complete);
+ if (err < 0) {
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_ADD_DEVICE,
+ MGMT_STATUS_FAILED, &cp->addr,
+ sizeof(cp->addr));
+ mgmt_pending_free(cmd);
+ }
+
+ goto unlock;
+
+added:
+ device_added(sk, hdev, &cp->addr.bdaddr, cp->addr.type, cp->action);
+ supported_flags = hdev->conn_flags;
+ device_flags_changed(NULL, hdev, &cp->addr.bdaddr, cp->addr.type,
+ supported_flags, current_flags);
+
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_ADD_DEVICE,
+ MGMT_STATUS_SUCCESS, &cp->addr,
+ sizeof(cp->addr));
+
+unlock:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static void device_removed(struct sock *sk, struct hci_dev *hdev,
+ bdaddr_t *bdaddr, u8 type)
+{
+ struct mgmt_ev_device_removed ev;
+
+ bacpy(&ev.addr.bdaddr, bdaddr);
+ ev.addr.type = type;
+
+ mgmt_event(MGMT_EV_DEVICE_REMOVED, hdev, &ev, sizeof(ev), sk);
+}
+
+static int remove_device_sync(struct hci_dev *hdev, void *data)
+{
+ return hci_update_passive_scan_sync(hdev);
+}
+
+static int remove_device(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 len)
+{
+ struct mgmt_cp_remove_device *cp = data;
+ int err;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ hci_dev_lock(hdev);
+
+ if (bacmp(&cp->addr.bdaddr, BDADDR_ANY)) {
+ struct hci_conn_params *params;
+ u8 addr_type;
+
+ if (!bdaddr_type_is_valid(cp->addr.type)) {
+ err = mgmt_cmd_complete(sk, hdev->id,
+ MGMT_OP_REMOVE_DEVICE,
+ MGMT_STATUS_INVALID_PARAMS,
+ &cp->addr, sizeof(cp->addr));
+ goto unlock;
+ }
+
+ if (cp->addr.type == BDADDR_BREDR) {
+ err = hci_bdaddr_list_del(&hdev->accept_list,
+ &cp->addr.bdaddr,
+ cp->addr.type);
+ if (err) {
+ err = mgmt_cmd_complete(sk, hdev->id,
+ MGMT_OP_REMOVE_DEVICE,
+ MGMT_STATUS_INVALID_PARAMS,
+ &cp->addr,
+ sizeof(cp->addr));
+ goto unlock;
+ }
+
+ hci_update_scan(hdev);
+
+ device_removed(sk, hdev, &cp->addr.bdaddr,
+ cp->addr.type);
+ goto complete;
+ }
+
+ addr_type = le_addr_type(cp->addr.type);
+
+ /* Kernel internally uses conn_params with resolvable private
+ * address, but Remove Device allows only identity addresses.
+ * Make sure it is enforced before calling
+ * hci_conn_params_lookup.
+ */
+ if (!hci_is_identity_address(&cp->addr.bdaddr, addr_type)) {
+ err = mgmt_cmd_complete(sk, hdev->id,
+ MGMT_OP_REMOVE_DEVICE,
+ MGMT_STATUS_INVALID_PARAMS,
+ &cp->addr, sizeof(cp->addr));
+ goto unlock;
+ }
+
+ params = hci_conn_params_lookup(hdev, &cp->addr.bdaddr,
+ addr_type);
+ if (!params) {
+ err = mgmt_cmd_complete(sk, hdev->id,
+ MGMT_OP_REMOVE_DEVICE,
+ MGMT_STATUS_INVALID_PARAMS,
+ &cp->addr, sizeof(cp->addr));
+ goto unlock;
+ }
+
+ if (params->auto_connect == HCI_AUTO_CONN_DISABLED ||
+ params->auto_connect == HCI_AUTO_CONN_EXPLICIT) {
+ err = mgmt_cmd_complete(sk, hdev->id,
+ MGMT_OP_REMOVE_DEVICE,
+ MGMT_STATUS_INVALID_PARAMS,
+ &cp->addr, sizeof(cp->addr));
+ goto unlock;
+ }
+
+ hci_conn_params_free(params);
+
+ device_removed(sk, hdev, &cp->addr.bdaddr, cp->addr.type);
+ } else {
+ struct hci_conn_params *p, *tmp;
+ struct bdaddr_list *b, *btmp;
+
+ if (cp->addr.type) {
+ err = mgmt_cmd_complete(sk, hdev->id,
+ MGMT_OP_REMOVE_DEVICE,
+ MGMT_STATUS_INVALID_PARAMS,
+ &cp->addr, sizeof(cp->addr));
+ goto unlock;
+ }
+
+ list_for_each_entry_safe(b, btmp, &hdev->accept_list, list) {
+ device_removed(sk, hdev, &b->bdaddr, b->bdaddr_type);
+ list_del(&b->list);
+ kfree(b);
+ }
+
+ hci_update_scan(hdev);
+
+ list_for_each_entry_safe(p, tmp, &hdev->le_conn_params, list) {
+ if (p->auto_connect == HCI_AUTO_CONN_DISABLED)
+ continue;
+ device_removed(sk, hdev, &p->addr, p->addr_type);
+ if (p->explicit_connect) {
+ p->auto_connect = HCI_AUTO_CONN_EXPLICIT;
+ continue;
+ }
+ hci_conn_params_free(p);
+ }
+
+ bt_dev_dbg(hdev, "All LE connection parameters were removed");
+ }
+
+ hci_cmd_sync_queue(hdev, remove_device_sync, NULL, NULL);
+
+complete:
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_REMOVE_DEVICE,
+ MGMT_STATUS_SUCCESS, &cp->addr,
+ sizeof(cp->addr));
+unlock:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static int conn_update_sync(struct hci_dev *hdev, void *data)
+{
+ struct hci_conn_params *params = data;
+ struct hci_conn *conn;
+
+ conn = hci_conn_hash_lookup_le(hdev, &params->addr, params->addr_type);
+ if (!conn)
+ return -ECANCELED;
+
+ return hci_le_conn_update_sync(hdev, conn, params);
+}
+
+static int load_conn_param(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 len)
+{
+ struct mgmt_cp_load_conn_param *cp = data;
+ const u16 max_param_count = ((U16_MAX - sizeof(*cp)) /
+ sizeof(struct mgmt_conn_param));
+ u16 param_count, expected_len;
+ int i;
+
+ if (!lmp_le_capable(hdev))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_CONN_PARAM,
+ MGMT_STATUS_NOT_SUPPORTED);
+
+ param_count = __le16_to_cpu(cp->param_count);
+ if (param_count > max_param_count) {
+ bt_dev_err(hdev, "load_conn_param: too big param_count value %u",
+ param_count);
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_CONN_PARAM,
+ MGMT_STATUS_INVALID_PARAMS);
+ }
+
+ expected_len = struct_size(cp, params, param_count);
+ if (expected_len != len) {
+ bt_dev_err(hdev, "load_conn_param: expected %u bytes, got %u bytes",
+ expected_len, len);
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_CONN_PARAM,
+ MGMT_STATUS_INVALID_PARAMS);
+ }
+
+ bt_dev_dbg(hdev, "param_count %u", param_count);
+
+ hci_dev_lock(hdev);
+
+ if (param_count > 1)
+ hci_conn_params_clear_disabled(hdev);
+
+ for (i = 0; i < param_count; i++) {
+ struct mgmt_conn_param *param = &cp->params[i];
+ struct hci_conn_params *hci_param;
+ u16 min, max, latency, timeout;
+ bool update = false;
+ u8 addr_type;
+
+ bt_dev_dbg(hdev, "Adding %pMR (type %u)", &param->addr.bdaddr,
+ param->addr.type);
+
+ if (param->addr.type == BDADDR_LE_PUBLIC) {
+ addr_type = ADDR_LE_DEV_PUBLIC;
+ } else if (param->addr.type == BDADDR_LE_RANDOM) {
+ addr_type = ADDR_LE_DEV_RANDOM;
+ } else {
+ bt_dev_err(hdev, "ignoring invalid connection parameters");
+ continue;
+ }
+
+ min = le16_to_cpu(param->min_interval);
+ max = le16_to_cpu(param->max_interval);
+ latency = le16_to_cpu(param->latency);
+ timeout = le16_to_cpu(param->timeout);
+
+ bt_dev_dbg(hdev, "min 0x%04x max 0x%04x latency 0x%04x timeout 0x%04x",
+ min, max, latency, timeout);
+
+ if (hci_check_conn_params(min, max, latency, timeout) < 0) {
+ bt_dev_err(hdev, "ignoring invalid connection parameters");
+ continue;
+ }
+
+ /* Detect when the loading is for an existing parameter then
+ * attempt to trigger the connection update procedure.
+ */
+ if (!i && param_count == 1) {
+ hci_param = hci_conn_params_lookup(hdev,
+ &param->addr.bdaddr,
+ addr_type);
+ if (hci_param)
+ update = true;
+ else
+ hci_conn_params_clear_disabled(hdev);
+ }
+
+ hci_param = hci_conn_params_add(hdev, &param->addr.bdaddr,
+ addr_type);
+ if (!hci_param) {
+ bt_dev_err(hdev, "failed to add connection parameters");
+ continue;
+ }
+
+ hci_param->conn_min_interval = min;
+ hci_param->conn_max_interval = max;
+ hci_param->conn_latency = latency;
+ hci_param->supervision_timeout = timeout;
+
+ /* Check if we need to trigger a connection update */
+ if (update) {
+ struct hci_conn *conn;
+
+ /* Lookup for existing connection as central and check
+ * if parameters match and if they don't then trigger
+ * a connection update.
+ */
+ conn = hci_conn_hash_lookup_le(hdev, &hci_param->addr,
+ addr_type);
+ if (conn && conn->role == HCI_ROLE_MASTER &&
+ (conn->le_conn_min_interval != min ||
+ conn->le_conn_max_interval != max ||
+ conn->le_conn_latency != latency ||
+ conn->le_supv_timeout != timeout))
+ hci_cmd_sync_queue(hdev, conn_update_sync,
+ hci_param, NULL);
+ }
+ }
+
+ hci_dev_unlock(hdev);
+
+ return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_LOAD_CONN_PARAM, 0,
+ NULL, 0);
+}
+
+static int set_external_config(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 len)
+{
+ struct mgmt_cp_set_external_config *cp = data;
+ bool changed;
+ int err;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ if (hdev_is_powered(hdev))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_EXTERNAL_CONFIG,
+ MGMT_STATUS_REJECTED);
+
+ if (cp->config != 0x00 && cp->config != 0x01)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_EXTERNAL_CONFIG,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ if (!hci_test_quirk(hdev, HCI_QUIRK_EXTERNAL_CONFIG))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_EXTERNAL_CONFIG,
+ MGMT_STATUS_NOT_SUPPORTED);
+
+ hci_dev_lock(hdev);
+
+ if (cp->config)
+ changed = !hci_dev_test_and_set_flag(hdev, HCI_EXT_CONFIGURED);
+ else
+ changed = hci_dev_test_and_clear_flag(hdev, HCI_EXT_CONFIGURED);
+
+ err = send_options_rsp(sk, MGMT_OP_SET_EXTERNAL_CONFIG, hdev);
+ if (err < 0)
+ goto unlock;
+
+ if (!changed)
+ goto unlock;
+
+ err = new_options(hdev, sk);
+
+ if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED) == is_configured(hdev)) {
+ mgmt_index_removed(hdev);
+
+ if (hci_dev_test_and_change_flag(hdev, HCI_UNCONFIGURED)) {
+ hci_dev_set_flag(hdev, HCI_CONFIG);
+ hci_dev_set_flag(hdev, HCI_AUTO_OFF);
+
+ queue_work(hdev->req_workqueue, &hdev->power_on);
+ } else {
+ set_bit(HCI_RAW, &hdev->flags);
+ mgmt_index_added(hdev);
+ }
+ }
+
+unlock:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static int set_public_address(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 len)
+{
+ struct mgmt_cp_set_public_address *cp = data;
+ bool changed;
+ int err;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ if (hdev_is_powered(hdev))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_PUBLIC_ADDRESS,
+ MGMT_STATUS_REJECTED);
+
+ if (!bacmp(&cp->bdaddr, BDADDR_ANY))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_PUBLIC_ADDRESS,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ if (!hdev->set_bdaddr)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_PUBLIC_ADDRESS,
+ MGMT_STATUS_NOT_SUPPORTED);
+
+ hci_dev_lock(hdev);
+
+ changed = !!bacmp(&hdev->public_addr, &cp->bdaddr);
+ bacpy(&hdev->public_addr, &cp->bdaddr);
+
+ err = send_options_rsp(sk, MGMT_OP_SET_PUBLIC_ADDRESS, hdev);
+ if (err < 0)
+ goto unlock;
+
+ if (!changed)
+ goto unlock;
+
+ if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED))
+ err = new_options(hdev, sk);
+
+ if (is_configured(hdev)) {
+ mgmt_index_removed(hdev);
+
+ hci_dev_clear_flag(hdev, HCI_UNCONFIGURED);
+
+ hci_dev_set_flag(hdev, HCI_CONFIG);
+ hci_dev_set_flag(hdev, HCI_AUTO_OFF);
+
+ queue_work(hdev->req_workqueue, &hdev->power_on);
+ }
+
+unlock:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static void read_local_oob_ext_data_complete(struct hci_dev *hdev, void *data,
+ int err)
+{
+ const struct mgmt_cp_read_local_oob_ext_data *mgmt_cp;
+ struct mgmt_rp_read_local_oob_ext_data *mgmt_rp;
+ u8 *h192, *r192, *h256, *r256;
+ struct mgmt_pending_cmd *cmd = data;
+ struct sk_buff *skb = cmd->skb;
+ u8 status = mgmt_status(err);
+ u16 eir_len;
+
+ if (!status) {
+ if (!skb)
+ status = MGMT_STATUS_FAILED;
+ else if (IS_ERR(skb))
+ status = mgmt_status(PTR_ERR(skb));
+ else
+ status = mgmt_status(skb->data[0]);
+ }
+
+ bt_dev_dbg(hdev, "status %u", status);
+
+ mgmt_cp = cmd->param;
+
+ if (status) {
+ status = mgmt_status(status);
+ eir_len = 0;
+
+ h192 = NULL;
+ r192 = NULL;
+ h256 = NULL;
+ r256 = NULL;
+ } else if (!bredr_sc_enabled(hdev)) {
+ struct hci_rp_read_local_oob_data *rp;
+
+ if (skb->len != sizeof(*rp)) {
+ status = MGMT_STATUS_FAILED;
+ eir_len = 0;
+ } else {
+ status = MGMT_STATUS_SUCCESS;
+ rp = (void *)skb->data;
+
+ eir_len = 5 + 18 + 18;
+ h192 = rp->hash;
+ r192 = rp->rand;
+ h256 = NULL;
+ r256 = NULL;
+ }
+ } else {
+ struct hci_rp_read_local_oob_ext_data *rp;
+
+ if (skb->len != sizeof(*rp)) {
+ status = MGMT_STATUS_FAILED;
+ eir_len = 0;
+ } else {
+ status = MGMT_STATUS_SUCCESS;
+ rp = (void *)skb->data;
+
+ if (hci_dev_test_flag(hdev, HCI_SC_ONLY)) {
+ eir_len = 5 + 18 + 18;
+ h192 = NULL;
+ r192 = NULL;
+ } else {
+ eir_len = 5 + 18 + 18 + 18 + 18;
+ h192 = rp->hash192;
+ r192 = rp->rand192;
+ }
+
+ h256 = rp->hash256;
+ r256 = rp->rand256;
+ }
+ }
+
+ mgmt_rp = kmalloc(sizeof(*mgmt_rp) + eir_len, GFP_KERNEL);
+ if (!mgmt_rp)
+ goto done;
+
+ if (eir_len == 0)
+ goto send_rsp;
+
+ eir_len = eir_append_data(mgmt_rp->eir, 0, EIR_CLASS_OF_DEV,
+ hdev->dev_class, 3);
+
+ if (h192 && r192) {
+ eir_len = eir_append_data(mgmt_rp->eir, eir_len,
+ EIR_SSP_HASH_C192, h192, 16);
+ eir_len = eir_append_data(mgmt_rp->eir, eir_len,
+ EIR_SSP_RAND_R192, r192, 16);
+ }
+
+ if (h256 && r256) {
+ eir_len = eir_append_data(mgmt_rp->eir, eir_len,
+ EIR_SSP_HASH_C256, h256, 16);
+ eir_len = eir_append_data(mgmt_rp->eir, eir_len,
+ EIR_SSP_RAND_R256, r256, 16);
+ }
+
+send_rsp:
+ mgmt_rp->type = mgmt_cp->type;
+ mgmt_rp->eir_len = cpu_to_le16(eir_len);
+
+ err = mgmt_cmd_complete(cmd->sk, hdev->id,
+ MGMT_OP_READ_LOCAL_OOB_EXT_DATA, status,
+ mgmt_rp, sizeof(*mgmt_rp) + eir_len);
+ if (err < 0 || status)
+ goto done;
+
+ hci_sock_set_flag(cmd->sk, HCI_MGMT_OOB_DATA_EVENTS);
+
+ err = mgmt_limited_event(MGMT_EV_LOCAL_OOB_DATA_UPDATED, hdev,
+ mgmt_rp, sizeof(*mgmt_rp) + eir_len,
+ HCI_MGMT_OOB_DATA_EVENTS, cmd->sk);
+done:
+ if (skb && !IS_ERR(skb))
+ kfree_skb(skb);
+
+ kfree(mgmt_rp);
+ mgmt_pending_free(cmd);
+}
+
+static int read_local_ssp_oob_req(struct hci_dev *hdev, struct sock *sk,
+ struct mgmt_cp_read_local_oob_ext_data *cp)
+{
+ struct mgmt_pending_cmd *cmd;
+ int err;
+
+ cmd = mgmt_pending_new(sk, MGMT_OP_READ_LOCAL_OOB_EXT_DATA, hdev,
+ cp, sizeof(*cp));
+ if (!cmd)
+ return -ENOMEM;
+
+ err = hci_cmd_sync_queue(hdev, read_local_oob_data_sync, cmd,
+ read_local_oob_ext_data_complete);
+
+ if (err < 0) {
+ mgmt_pending_remove(cmd);
+ return err;
+ }
+
+ return 0;
+}
+
+static int read_local_oob_ext_data(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 data_len)
+{
+ struct mgmt_cp_read_local_oob_ext_data *cp = data;
+ struct mgmt_rp_read_local_oob_ext_data *rp;
+ size_t rp_len;
+ u16 eir_len;
+ u8 status, flags, role, addr[7], hash[16], rand[16];
+ int err;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ if (hdev_is_powered(hdev)) {
+ switch (cp->type) {
+ case BIT(BDADDR_BREDR):
+ status = mgmt_bredr_support(hdev);
+ if (status)
+ eir_len = 0;
+ else
+ eir_len = 5;
+ break;
+ case (BIT(BDADDR_LE_PUBLIC) | BIT(BDADDR_LE_RANDOM)):
+ status = mgmt_le_support(hdev);
+ if (status)
+ eir_len = 0;
+ else
+ eir_len = 9 + 3 + 18 + 18 + 3;
+ break;
+ default:
+ status = MGMT_STATUS_INVALID_PARAMS;
+ eir_len = 0;
+ break;
+ }
+ } else {
+ status = MGMT_STATUS_NOT_POWERED;
+ eir_len = 0;
+ }
+
+ rp_len = sizeof(*rp) + eir_len;
+ rp = kmalloc(rp_len, GFP_ATOMIC);
+ if (!rp)
+ return -ENOMEM;
+
+ if (!status && !lmp_ssp_capable(hdev)) {
+ status = MGMT_STATUS_NOT_SUPPORTED;
+ eir_len = 0;
+ }
+
+ if (status)
+ goto complete;
+
+ hci_dev_lock(hdev);
+
+ eir_len = 0;
+ switch (cp->type) {
+ case BIT(BDADDR_BREDR):
+ if (hci_dev_test_flag(hdev, HCI_SSP_ENABLED)) {
+ err = read_local_ssp_oob_req(hdev, sk, cp);
+ hci_dev_unlock(hdev);
+ if (!err)
+ goto done;
+
+ status = MGMT_STATUS_FAILED;
+ goto complete;
+ } else {
+ eir_len = eir_append_data(rp->eir, eir_len,
+ EIR_CLASS_OF_DEV,
+ hdev->dev_class, 3);
+ }
+ break;
+ case (BIT(BDADDR_LE_PUBLIC) | BIT(BDADDR_LE_RANDOM)):
+ if (hci_dev_test_flag(hdev, HCI_SC_ENABLED) &&
+ smp_generate_oob(hdev, hash, rand) < 0) {
+ hci_dev_unlock(hdev);
+ status = MGMT_STATUS_FAILED;
+ goto complete;
+ }
+
+ /* This should return the active RPA, but since the RPA
+ * is only programmed on demand, it is really hard to fill
+ * this in at the moment. For now disallow retrieving
+ * local out-of-band data when privacy is in use.
+ *
+ * Returning the identity address will not help here since
+ * pairing happens before the identity resolving key is
+ * known and thus the connection establishment happens
+ * based on the RPA and not the identity address.
+ */
+ if (hci_dev_test_flag(hdev, HCI_PRIVACY)) {
+ hci_dev_unlock(hdev);
+ status = MGMT_STATUS_REJECTED;
+ goto complete;
+ }
+
+ if (hci_dev_test_flag(hdev, HCI_FORCE_STATIC_ADDR) ||
+ !bacmp(&hdev->bdaddr, BDADDR_ANY) ||
+ (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED) &&
+ bacmp(&hdev->static_addr, BDADDR_ANY))) {
+ memcpy(addr, &hdev->static_addr, 6);
+ addr[6] = 0x01;
+ } else {
+ memcpy(addr, &hdev->bdaddr, 6);
+ addr[6] = 0x00;
+ }
+
+ eir_len = eir_append_data(rp->eir, eir_len, EIR_LE_BDADDR,
+ addr, sizeof(addr));
+
+ if (hci_dev_test_flag(hdev, HCI_ADVERTISING))
+ role = 0x02;
+ else
+ role = 0x01;
+
+ eir_len = eir_append_data(rp->eir, eir_len, EIR_LE_ROLE,
+ &role, sizeof(role));
+
+ if (hci_dev_test_flag(hdev, HCI_SC_ENABLED)) {
+ eir_len = eir_append_data(rp->eir, eir_len,
+ EIR_LE_SC_CONFIRM,
+ hash, sizeof(hash));
+
+ eir_len = eir_append_data(rp->eir, eir_len,
+ EIR_LE_SC_RANDOM,
+ rand, sizeof(rand));
+ }
+
+ flags = mgmt_get_adv_discov_flags(hdev);
+
+ if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED))
+ flags |= LE_AD_NO_BREDR;
+
+ eir_len = eir_append_data(rp->eir, eir_len, EIR_FLAGS,
+ &flags, sizeof(flags));
+ break;
+ }
+
+ hci_dev_unlock(hdev);
+
+ hci_sock_set_flag(sk, HCI_MGMT_OOB_DATA_EVENTS);
+
+ status = MGMT_STATUS_SUCCESS;
+
+complete:
+ rp->type = cp->type;
+ rp->eir_len = cpu_to_le16(eir_len);
+
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_READ_LOCAL_OOB_EXT_DATA,
+ status, rp, sizeof(*rp) + eir_len);
+ if (err < 0 || status)
+ goto done;
+
+ err = mgmt_limited_event(MGMT_EV_LOCAL_OOB_DATA_UPDATED, hdev,
+ rp, sizeof(*rp) + eir_len,
+ HCI_MGMT_OOB_DATA_EVENTS, sk);
+
+done:
+ kfree(rp);
+
+ return err;
+}
+
+static u32 get_supported_adv_flags(struct hci_dev *hdev)
+{
+ u32 flags = 0;
+
+ flags |= MGMT_ADV_FLAG_CONNECTABLE;
+ flags |= MGMT_ADV_FLAG_DISCOV;
+ flags |= MGMT_ADV_FLAG_LIMITED_DISCOV;
+ flags |= MGMT_ADV_FLAG_MANAGED_FLAGS;
+ flags |= MGMT_ADV_FLAG_APPEARANCE;
+ flags |= MGMT_ADV_FLAG_LOCAL_NAME;
+ flags |= MGMT_ADV_PARAM_DURATION;
+ flags |= MGMT_ADV_PARAM_TIMEOUT;
+ flags |= MGMT_ADV_PARAM_INTERVALS;
+ flags |= MGMT_ADV_PARAM_TX_POWER;
+ flags |= MGMT_ADV_PARAM_SCAN_RSP;
+
+ /* In extended adv TX_POWER returned from Set Adv Param
+ * will be always valid.
+ */
+ if (hdev->adv_tx_power != HCI_TX_POWER_INVALID || ext_adv_capable(hdev))
+ flags |= MGMT_ADV_FLAG_TX_POWER;
+
+ if (ext_adv_capable(hdev)) {
+ flags |= MGMT_ADV_FLAG_SEC_1M;
+ flags |= MGMT_ADV_FLAG_HW_OFFLOAD;
+ flags |= MGMT_ADV_FLAG_CAN_SET_TX_POWER;
+
+ if (le_2m_capable(hdev))
+ flags |= MGMT_ADV_FLAG_SEC_2M;
+
+ if (le_coded_capable(hdev))
+ flags |= MGMT_ADV_FLAG_SEC_CODED;
+ }
+
+ return flags;
+}
+
+static int read_adv_features(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 data_len)
+{
+ struct mgmt_rp_read_adv_features *rp;
+ size_t rp_len;
+ int err;
+ struct adv_info *adv_instance;
+ u32 supported_flags;
+ u8 *instance;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ if (!lmp_le_capable(hdev))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_READ_ADV_FEATURES,
+ MGMT_STATUS_REJECTED);
+
+ hci_dev_lock(hdev);
+
+ rp_len = sizeof(*rp) + hdev->adv_instance_cnt;
+ rp = kmalloc(rp_len, GFP_ATOMIC);
+ if (!rp) {
+ hci_dev_unlock(hdev);
+ return -ENOMEM;
+ }
+
+ supported_flags = get_supported_adv_flags(hdev);
+
+ rp->supported_flags = cpu_to_le32(supported_flags);
+ rp->max_adv_data_len = max_adv_len(hdev);
+ rp->max_scan_rsp_len = max_adv_len(hdev);
+ rp->max_instances = hdev->le_num_of_adv_sets;
+ rp->num_instances = hdev->adv_instance_cnt;
+
+ instance = rp->instance;
+ list_for_each_entry(adv_instance, &hdev->adv_instances, list) {
+ /* Only instances 1-le_num_of_adv_sets are externally visible */
+ if (adv_instance->instance <= hdev->adv_instance_cnt) {
+ *instance = adv_instance->instance;
+ instance++;
+ } else {
+ rp->num_instances--;
+ rp_len--;
+ }
+ }
+
+ hci_dev_unlock(hdev);
+
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_READ_ADV_FEATURES,
+ MGMT_STATUS_SUCCESS, rp, rp_len);
+
+ kfree(rp);
+
+ return err;
+}
+
+static u8 calculate_name_len(struct hci_dev *hdev)
+{
+ u8 buf[HCI_MAX_SHORT_NAME_LENGTH + 2]; /* len + type + name */
+
+ return eir_append_local_name(hdev, buf, 0);
+}
+
+static u8 tlv_data_max_len(struct hci_dev *hdev, u32 adv_flags,
+ bool is_adv_data)
+{
+ u8 max_len = max_adv_len(hdev);
+
+ if (is_adv_data) {
+ if (adv_flags & (MGMT_ADV_FLAG_DISCOV |
+ MGMT_ADV_FLAG_LIMITED_DISCOV |
+ MGMT_ADV_FLAG_MANAGED_FLAGS))
+ max_len -= 3;
+
+ if (adv_flags & MGMT_ADV_FLAG_TX_POWER)
+ max_len -= 3;
+ } else {
+ if (adv_flags & MGMT_ADV_FLAG_LOCAL_NAME)
+ max_len -= calculate_name_len(hdev);
+
+ if (adv_flags & (MGMT_ADV_FLAG_APPEARANCE))
+ max_len -= 4;
+ }
+
+ return max_len;
+}
+
+static bool flags_managed(u32 adv_flags)
+{
+ return adv_flags & (MGMT_ADV_FLAG_DISCOV |
+ MGMT_ADV_FLAG_LIMITED_DISCOV |
+ MGMT_ADV_FLAG_MANAGED_FLAGS);
+}
+
+static bool tx_power_managed(u32 adv_flags)
+{
+ return adv_flags & MGMT_ADV_FLAG_TX_POWER;
+}
+
+static bool name_managed(u32 adv_flags)
+{
+ return adv_flags & MGMT_ADV_FLAG_LOCAL_NAME;
+}
+
+static bool appearance_managed(u32 adv_flags)
+{
+ return adv_flags & MGMT_ADV_FLAG_APPEARANCE;
+}
+
+static bool tlv_data_is_valid(struct hci_dev *hdev, u32 adv_flags, u8 *data,
+ u8 len, bool is_adv_data)
+{
+ int i, cur_len;
+ u8 max_len;
+
+ max_len = tlv_data_max_len(hdev, adv_flags, is_adv_data);
+
+ if (len > max_len)
+ return false;
+
+ /* Make sure that the data is correctly formatted. */
+ for (i = 0; i < len; i += (cur_len + 1)) {
+ cur_len = data[i];
+
+ if (!cur_len)
+ continue;
+
+ if (data[i + 1] == EIR_FLAGS &&
+ (!is_adv_data || flags_managed(adv_flags)))
+ return false;
+
+ if (data[i + 1] == EIR_TX_POWER && tx_power_managed(adv_flags))
+ return false;
+
+ if (data[i + 1] == EIR_NAME_COMPLETE && name_managed(adv_flags))
+ return false;
+
+ if (data[i + 1] == EIR_NAME_SHORT && name_managed(adv_flags))
+ return false;
+
+ if (data[i + 1] == EIR_APPEARANCE &&
+ appearance_managed(adv_flags))
+ return false;
+
+ /* If the current field length would exceed the total data
+ * length, then it's invalid.
+ */
+ if (i + cur_len >= len)
+ return false;
+ }
+
+ return true;
+}
+
+static bool requested_adv_flags_are_valid(struct hci_dev *hdev, u32 adv_flags)
+{
+ u32 supported_flags, phy_flags;
+
+ /* The current implementation only supports a subset of the specified
+ * flags. Also need to check mutual exclusiveness of sec flags.
+ */
+ supported_flags = get_supported_adv_flags(hdev);
+ phy_flags = adv_flags & MGMT_ADV_FLAG_SEC_MASK;
+ if (adv_flags & ~supported_flags ||
+ ((phy_flags && (phy_flags ^ (phy_flags & -phy_flags)))))
+ return false;
+
+ return true;
+}
+
+static bool adv_busy(struct hci_dev *hdev)
+{
+ return pending_find(MGMT_OP_SET_LE, hdev);
+}
+
+static void add_adv_complete(struct hci_dev *hdev, struct sock *sk, u8 instance,
+ int err)
+{
+ struct adv_info *adv, *n;
+
+ bt_dev_dbg(hdev, "err %d", err);
+
+ hci_dev_lock(hdev);
+
+ list_for_each_entry_safe(adv, n, &hdev->adv_instances, list) {
+ u8 instance;
+
+ if (!adv->pending)
+ continue;
+
+ if (!err) {
+ adv->pending = false;
+ continue;
+ }
+
+ instance = adv->instance;
+
+ if (hdev->cur_adv_instance == instance)
+ cancel_adv_timeout(hdev);
+
+ hci_remove_adv_instance(hdev, instance);
+ mgmt_advertising_removed(sk, hdev, instance);
+ }
+
+ hci_dev_unlock(hdev);
+}
+
+static void add_advertising_complete(struct hci_dev *hdev, void *data, int err)
+{
+ struct mgmt_pending_cmd *cmd = data;
+ struct mgmt_cp_add_advertising *cp = cmd->param;
+ struct mgmt_rp_add_advertising rp;
+
+ memset(&rp, 0, sizeof(rp));
+
+ rp.instance = cp->instance;
+
+ if (err)
+ mgmt_cmd_status(cmd->sk, cmd->hdev->id, cmd->opcode,
+ mgmt_status(err));
+ else
+ mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode,
+ mgmt_status(err), &rp, sizeof(rp));
+
+ add_adv_complete(hdev, cmd->sk, cp->instance, err);
+
+ mgmt_pending_free(cmd);
+}
+
+static int add_advertising_sync(struct hci_dev *hdev, void *data)
+{
+ struct mgmt_pending_cmd *cmd = data;
+ struct mgmt_cp_add_advertising *cp = cmd->param;
+
+ return hci_schedule_adv_instance_sync(hdev, cp->instance, true);
+}
+
+static int add_advertising(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 data_len)
+{
+ struct mgmt_cp_add_advertising *cp = data;
+ struct mgmt_rp_add_advertising rp;
+ u32 flags;
+ u8 status;
+ u16 timeout, duration;
+ unsigned int prev_instance_cnt;
+ u8 schedule_instance = 0;
+ struct adv_info *adv, *next_instance;
+ int err;
+ struct mgmt_pending_cmd *cmd;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ status = mgmt_le_support(hdev);
+ if (status)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING,
+ status);
+
+ if (cp->instance < 1 || cp->instance > hdev->le_num_of_adv_sets)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ if (data_len != sizeof(*cp) + cp->adv_data_len + cp->scan_rsp_len)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ flags = __le32_to_cpu(cp->flags);
+ timeout = __le16_to_cpu(cp->timeout);
+ duration = __le16_to_cpu(cp->duration);
+
+ if (!requested_adv_flags_are_valid(hdev, flags))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ hci_dev_lock(hdev);
+
+ if (timeout && !hdev_is_powered(hdev)) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING,
+ MGMT_STATUS_REJECTED);
+ goto unlock;
+ }
+
+ if (adv_busy(hdev)) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING,
+ MGMT_STATUS_BUSY);
+ goto unlock;
+ }
+
+ if (!tlv_data_is_valid(hdev, flags, cp->data, cp->adv_data_len, true) ||
+ !tlv_data_is_valid(hdev, flags, cp->data + cp->adv_data_len,
+ cp->scan_rsp_len, false)) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING,
+ MGMT_STATUS_INVALID_PARAMS);
+ goto unlock;
+ }
+
+ prev_instance_cnt = hdev->adv_instance_cnt;
+
+ adv = hci_add_adv_instance(hdev, cp->instance, flags,
+ cp->adv_data_len, cp->data,
+ cp->scan_rsp_len,
+ cp->data + cp->adv_data_len,
+ timeout, duration,
+ HCI_ADV_TX_POWER_NO_PREFERENCE,
+ hdev->le_adv_min_interval,
+ hdev->le_adv_max_interval, 0);
+ if (IS_ERR(adv)) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING,
+ MGMT_STATUS_FAILED);
+ goto unlock;
+ }
+
+ /* Only trigger an advertising added event if a new instance was
+ * actually added.
+ */
+ if (hdev->adv_instance_cnt > prev_instance_cnt)
+ mgmt_advertising_added(sk, hdev, cp->instance);
+
+ if (hdev->cur_adv_instance == cp->instance) {
+ /* If the currently advertised instance is being changed then
+ * cancel the current advertising and schedule the next
+ * instance. If there is only one instance then the overridden
+ * advertising data will be visible right away.
+ */
+ cancel_adv_timeout(hdev);
+
+ next_instance = hci_get_next_instance(hdev, cp->instance);
+ if (next_instance)
+ schedule_instance = next_instance->instance;
+ } else if (!hdev->adv_instance_timeout) {
+ /* Immediately advertise the new instance if no other
+ * instance is currently being advertised.
+ */
+ schedule_instance = cp->instance;
+ }
+
+ /* If the HCI_ADVERTISING flag is set or the device isn't powered or
+ * there is no instance to be advertised then we have no HCI
+ * communication to make. Simply return.
+ */
+ if (!hdev_is_powered(hdev) ||
+ hci_dev_test_flag(hdev, HCI_ADVERTISING) ||
+ !schedule_instance) {
+ rp.instance = cp->instance;
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_ADD_ADVERTISING,
+ MGMT_STATUS_SUCCESS, &rp, sizeof(rp));
+ goto unlock;
+ }
+
+ /* We're good to go, update advertising data, parameters, and start
+ * advertising.
+ */
+ cmd = mgmt_pending_new(sk, MGMT_OP_ADD_ADVERTISING, hdev, data,
+ data_len);
+ if (!cmd) {
+ err = -ENOMEM;
+ goto unlock;
+ }
+
+ cp->instance = schedule_instance;
+
+ err = hci_cmd_sync_queue(hdev, add_advertising_sync, cmd,
+ add_advertising_complete);
+ if (err < 0)
+ mgmt_pending_free(cmd);
+
+unlock:
+ hci_dev_unlock(hdev);
+
+ return err;
+}
+
+static void add_ext_adv_params_complete(struct hci_dev *hdev, void *data,
+ int err)
+{
+ struct mgmt_pending_cmd *cmd = data;
+ struct mgmt_cp_add_ext_adv_params *cp = cmd->param;
+ struct mgmt_rp_add_ext_adv_params rp;
+ struct adv_info *adv;
+ u32 flags;
+
+ BT_DBG("%s", hdev->name);
+
+ hci_dev_lock(hdev);
+
+ adv = hci_find_adv_instance(hdev, cp->instance);
+ if (!adv)
+ goto unlock;
+
+ rp.instance = cp->instance;
+ rp.tx_power = adv->tx_power;
+
+ /* While we're at it, inform userspace of the available space for this
+ * advertisement, given the flags that will be used.
+ */
+ flags = __le32_to_cpu(cp->flags);
+ rp.max_adv_data_len = tlv_data_max_len(hdev, flags, true);
+ rp.max_scan_rsp_len = tlv_data_max_len(hdev, flags, false);
+
+ if (err) {
+ /* If this advertisement was previously advertising and we
+ * failed to update it, we signal that it has been removed and
+ * delete its structure
+ */
+ if (!adv->pending)
+ mgmt_advertising_removed(cmd->sk, hdev, cp->instance);
+
+ hci_remove_adv_instance(hdev, cp->instance);
+
+ mgmt_cmd_status(cmd->sk, cmd->hdev->id, cmd->opcode,
+ mgmt_status(err));
+ } else {
+ mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode,
+ mgmt_status(err), &rp, sizeof(rp));
+ }
+
+unlock:
+ mgmt_pending_free(cmd);
+
+ hci_dev_unlock(hdev);
+}
+
+static int add_ext_adv_params_sync(struct hci_dev *hdev, void *data)
+{
+ struct mgmt_pending_cmd *cmd = data;
+ struct mgmt_cp_add_ext_adv_params *cp = cmd->param;
+
+ return hci_setup_ext_adv_instance_sync(hdev, cp->instance);
+}
+
+static int add_ext_adv_params(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 data_len)
+{
+ struct mgmt_cp_add_ext_adv_params *cp = data;
+ struct mgmt_rp_add_ext_adv_params rp;
+ struct mgmt_pending_cmd *cmd = NULL;
+ struct adv_info *adv;
+ u32 flags, min_interval, max_interval;
+ u16 timeout, duration;
+ u8 status;
+ s8 tx_power;
+ int err;
+
+ BT_DBG("%s", hdev->name);
+
+ status = mgmt_le_support(hdev);
+ if (status)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_EXT_ADV_PARAMS,
+ status);
+
+ if (cp->instance < 1 || cp->instance > hdev->le_num_of_adv_sets)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_EXT_ADV_PARAMS,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ /* The purpose of breaking add_advertising into two separate MGMT calls
+ * for params and data is to allow more parameters to be added to this
+ * structure in the future. For this reason, we verify that we have the
+ * bare minimum structure we know of when the interface was defined. Any
+ * extra parameters we don't know about will be ignored in this request.
+ */
+ if (data_len < MGMT_ADD_EXT_ADV_PARAMS_MIN_SIZE)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_EXT_ADV_PARAMS,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ flags = __le32_to_cpu(cp->flags);
+
+ if (!requested_adv_flags_are_valid(hdev, flags))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_EXT_ADV_PARAMS,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ hci_dev_lock(hdev);
+
+ /* In new interface, we require that we are powered to register */
+ if (!hdev_is_powered(hdev)) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_EXT_ADV_PARAMS,
+ MGMT_STATUS_REJECTED);
+ goto unlock;
+ }
+
+ if (adv_busy(hdev)) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_EXT_ADV_PARAMS,
+ MGMT_STATUS_BUSY);
+ goto unlock;
+ }
+
+ /* Parse defined parameters from request, use defaults otherwise */
+ timeout = (flags & MGMT_ADV_PARAM_TIMEOUT) ?
+ __le16_to_cpu(cp->timeout) : 0;
+
+ duration = (flags & MGMT_ADV_PARAM_DURATION) ?
+ __le16_to_cpu(cp->duration) :
+ hdev->def_multi_adv_rotation_duration;
+
+ min_interval = (flags & MGMT_ADV_PARAM_INTERVALS) ?
+ __le32_to_cpu(cp->min_interval) :
+ hdev->le_adv_min_interval;
+
+ max_interval = (flags & MGMT_ADV_PARAM_INTERVALS) ?
+ __le32_to_cpu(cp->max_interval) :
+ hdev->le_adv_max_interval;
+
+ tx_power = (flags & MGMT_ADV_PARAM_TX_POWER) ?
+ cp->tx_power :
+ HCI_ADV_TX_POWER_NO_PREFERENCE;
+
+ /* Create advertising instance with no advertising or response data */
+ adv = hci_add_adv_instance(hdev, cp->instance, flags, 0, NULL, 0, NULL,
+ timeout, duration, tx_power, min_interval,
+ max_interval, 0);
+
+ if (IS_ERR(adv)) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_EXT_ADV_PARAMS,
+ MGMT_STATUS_FAILED);
+ goto unlock;
+ }
+
+ /* Submit request for advertising params if ext adv available */
+ if (ext_adv_capable(hdev)) {
+ cmd = mgmt_pending_new(sk, MGMT_OP_ADD_EXT_ADV_PARAMS, hdev,
+ data, data_len);
+ if (!cmd) {
+ err = -ENOMEM;
+ hci_remove_adv_instance(hdev, cp->instance);
+ goto unlock;
+ }
+
+ err = hci_cmd_sync_queue(hdev, add_ext_adv_params_sync, cmd,
+ add_ext_adv_params_complete);
+ if (err < 0)
+ mgmt_pending_free(cmd);
+ } else {
+ rp.instance = cp->instance;
+ rp.tx_power = HCI_ADV_TX_POWER_NO_PREFERENCE;
+ rp.max_adv_data_len = tlv_data_max_len(hdev, flags, true);
+ rp.max_scan_rsp_len = tlv_data_max_len(hdev, flags, false);
+ err = mgmt_cmd_complete(sk, hdev->id,
+ MGMT_OP_ADD_EXT_ADV_PARAMS,
+ MGMT_STATUS_SUCCESS, &rp, sizeof(rp));
+ }
+
+unlock:
+ hci_dev_unlock(hdev);
+
+ return err;
+}
+
+static void add_ext_adv_data_complete(struct hci_dev *hdev, void *data, int err)
+{
+ struct mgmt_pending_cmd *cmd = data;
+ struct mgmt_cp_add_ext_adv_data *cp = cmd->param;
+ struct mgmt_rp_add_advertising rp;
+
+ add_adv_complete(hdev, cmd->sk, cp->instance, err);
+
+ memset(&rp, 0, sizeof(rp));
+
+ rp.instance = cp->instance;
+
+ if (err)
+ mgmt_cmd_status(cmd->sk, cmd->hdev->id, cmd->opcode,
+ mgmt_status(err));
+ else
+ mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode,
+ mgmt_status(err), &rp, sizeof(rp));
+
+ mgmt_pending_free(cmd);
+}
+
+static int add_ext_adv_data_sync(struct hci_dev *hdev, void *data)
+{
+ struct mgmt_pending_cmd *cmd = data;
+ struct mgmt_cp_add_ext_adv_data *cp = cmd->param;
+ int err;
+
+ if (ext_adv_capable(hdev)) {
+ err = hci_update_adv_data_sync(hdev, cp->instance);
+ if (err)
+ return err;
+
+ err = hci_update_scan_rsp_data_sync(hdev, cp->instance);
+ if (err)
+ return err;
+
+ return hci_enable_ext_advertising_sync(hdev, cp->instance);
+ }
+
+ return hci_schedule_adv_instance_sync(hdev, cp->instance, true);
+}
+
+static int add_ext_adv_data(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 data_len)
+{
+ struct mgmt_cp_add_ext_adv_data *cp = data;
+ struct mgmt_rp_add_ext_adv_data rp;
+ u8 schedule_instance = 0;
+ struct adv_info *next_instance;
+ struct adv_info *adv_instance;
+ int err = 0;
+ struct mgmt_pending_cmd *cmd;
+
+ BT_DBG("%s", hdev->name);
+
+ hci_dev_lock(hdev);
+
+ adv_instance = hci_find_adv_instance(hdev, cp->instance);
+
+ if (!adv_instance) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_EXT_ADV_DATA,
+ MGMT_STATUS_INVALID_PARAMS);
+ goto unlock;
+ }
+
+ /* In new interface, we require that we are powered to register */
+ if (!hdev_is_powered(hdev)) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_EXT_ADV_DATA,
+ MGMT_STATUS_REJECTED);
+ goto clear_new_instance;
+ }
+
+ if (adv_busy(hdev)) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_EXT_ADV_DATA,
+ MGMT_STATUS_BUSY);
+ goto clear_new_instance;
+ }
+
+ /* Validate new data */
+ if (!tlv_data_is_valid(hdev, adv_instance->flags, cp->data,
+ cp->adv_data_len, true) ||
+ !tlv_data_is_valid(hdev, adv_instance->flags, cp->data +
+ cp->adv_data_len, cp->scan_rsp_len, false)) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_EXT_ADV_DATA,
+ MGMT_STATUS_INVALID_PARAMS);
+ goto clear_new_instance;
+ }
+
+ /* Set the data in the advertising instance */
+ hci_set_adv_instance_data(hdev, cp->instance, cp->adv_data_len,
+ cp->data, cp->scan_rsp_len,
+ cp->data + cp->adv_data_len);
+
+ /* If using software rotation, determine next instance to use */
+ if (hdev->cur_adv_instance == cp->instance) {
+ /* If the currently advertised instance is being changed
+ * then cancel the current advertising and schedule the
+ * next instance. If there is only one instance then the
+ * overridden advertising data will be visible right
+ * away
+ */
+ cancel_adv_timeout(hdev);
+
+ next_instance = hci_get_next_instance(hdev, cp->instance);
+ if (next_instance)
+ schedule_instance = next_instance->instance;
+ } else if (!hdev->adv_instance_timeout) {
+ /* Immediately advertise the new instance if no other
+ * instance is currently being advertised.
+ */
+ schedule_instance = cp->instance;
+ }
+
+ /* If the HCI_ADVERTISING flag is set or there is no instance to
+ * be advertised then we have no HCI communication to make.
+ * Simply return.
+ */
+ if (hci_dev_test_flag(hdev, HCI_ADVERTISING) || !schedule_instance) {
+ if (adv_instance->pending) {
+ mgmt_advertising_added(sk, hdev, cp->instance);
+ adv_instance->pending = false;
+ }
+ rp.instance = cp->instance;
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_ADD_EXT_ADV_DATA,
+ MGMT_STATUS_SUCCESS, &rp, sizeof(rp));
+ goto unlock;
+ }
+
+ cmd = mgmt_pending_new(sk, MGMT_OP_ADD_EXT_ADV_DATA, hdev, data,
+ data_len);
+ if (!cmd) {
+ err = -ENOMEM;
+ goto clear_new_instance;
+ }
+
+ err = hci_cmd_sync_queue(hdev, add_ext_adv_data_sync, cmd,
+ add_ext_adv_data_complete);
+ if (err < 0) {
+ mgmt_pending_free(cmd);
+ goto clear_new_instance;
+ }
+
+ /* We were successful in updating data, so trigger advertising_added
+ * event if this is an instance that wasn't previously advertising. If
+ * a failure occurs in the requests we initiated, we will remove the
+ * instance again in add_advertising_complete
+ */
+ if (adv_instance->pending)
+ mgmt_advertising_added(sk, hdev, cp->instance);
+
+ goto unlock;
+
+clear_new_instance:
+ hci_remove_adv_instance(hdev, cp->instance);
+
+unlock:
+ hci_dev_unlock(hdev);
+
+ return err;
+}
+
+static void remove_advertising_complete(struct hci_dev *hdev, void *data,
+ int err)
+{
+ struct mgmt_pending_cmd *cmd = data;
+ struct mgmt_cp_remove_advertising *cp = cmd->param;
+ struct mgmt_rp_remove_advertising rp;
+
+ bt_dev_dbg(hdev, "err %d", err);
+
+ memset(&rp, 0, sizeof(rp));
+ rp.instance = cp->instance;
+
+ if (err)
+ mgmt_cmd_status(cmd->sk, cmd->hdev->id, cmd->opcode,
+ mgmt_status(err));
+ else
+ mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode,
+ MGMT_STATUS_SUCCESS, &rp, sizeof(rp));
+
+ mgmt_pending_free(cmd);
+}
+
+static int remove_advertising_sync(struct hci_dev *hdev, void *data)
+{
+ struct mgmt_pending_cmd *cmd = data;
+ struct mgmt_cp_remove_advertising *cp = cmd->param;
+ int err;
+
+ err = hci_remove_advertising_sync(hdev, cmd->sk, cp->instance, true);
+ if (err)
+ return err;
+
+ if (list_empty(&hdev->adv_instances))
+ err = hci_disable_advertising_sync(hdev);
+
+ return err;
+}
+
+static int remove_advertising(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 data_len)
+{
+ struct mgmt_cp_remove_advertising *cp = data;
+ struct mgmt_pending_cmd *cmd;
+ int err;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ hci_dev_lock(hdev);
+
+ if (cp->instance && !hci_find_adv_instance(hdev, cp->instance)) {
+ err = mgmt_cmd_status(sk, hdev->id,
+ MGMT_OP_REMOVE_ADVERTISING,
+ MGMT_STATUS_INVALID_PARAMS);
+ goto unlock;
+ }
+
+ if (pending_find(MGMT_OP_SET_LE, hdev)) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_REMOVE_ADVERTISING,
+ MGMT_STATUS_BUSY);
+ goto unlock;
+ }
+
+ if (list_empty(&hdev->adv_instances)) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_REMOVE_ADVERTISING,
+ MGMT_STATUS_INVALID_PARAMS);
+ goto unlock;
+ }
+
+ cmd = mgmt_pending_new(sk, MGMT_OP_REMOVE_ADVERTISING, hdev, data,
+ data_len);
+ if (!cmd) {
+ err = -ENOMEM;
+ goto unlock;
+ }
+
+ err = hci_cmd_sync_queue(hdev, remove_advertising_sync, cmd,
+ remove_advertising_complete);
+ if (err < 0)
+ mgmt_pending_free(cmd);
+
+unlock:
+ hci_dev_unlock(hdev);
+
+ return err;
+}
+
+static int get_adv_size_info(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 data_len)
+{
+ struct mgmt_cp_get_adv_size_info *cp = data;
+ struct mgmt_rp_get_adv_size_info rp;
+ u32 flags, supported_flags;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ if (!lmp_le_capable(hdev))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_GET_ADV_SIZE_INFO,
+ MGMT_STATUS_REJECTED);
+
+ if (cp->instance < 1 || cp->instance > hdev->le_num_of_adv_sets)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_GET_ADV_SIZE_INFO,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ flags = __le32_to_cpu(cp->flags);
+
+ /* The current implementation only supports a subset of the specified
+ * flags.
+ */
+ supported_flags = get_supported_adv_flags(hdev);
+ if (flags & ~supported_flags)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_GET_ADV_SIZE_INFO,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ rp.instance = cp->instance;
+ rp.flags = cp->flags;
+ rp.max_adv_data_len = tlv_data_max_len(hdev, flags, true);
+ rp.max_scan_rsp_len = tlv_data_max_len(hdev, flags, false);
+
+ return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_GET_ADV_SIZE_INFO,
+ MGMT_STATUS_SUCCESS, &rp, sizeof(rp));
+}
+
+static const struct hci_mgmt_handler mgmt_handlers[] = {
+ { NULL }, /* 0x0000 (no command) */
+ { read_version, MGMT_READ_VERSION_SIZE,
+ HCI_MGMT_NO_HDEV |
+ HCI_MGMT_UNTRUSTED },
+ { read_commands, MGMT_READ_COMMANDS_SIZE,
+ HCI_MGMT_NO_HDEV |
+ HCI_MGMT_UNTRUSTED },
+ { read_index_list, MGMT_READ_INDEX_LIST_SIZE,
+ HCI_MGMT_NO_HDEV |
+ HCI_MGMT_UNTRUSTED },
+ { read_controller_info, MGMT_READ_INFO_SIZE,
+ HCI_MGMT_UNTRUSTED },
+ { set_powered, MGMT_SETTING_SIZE },
+ { set_discoverable, MGMT_SET_DISCOVERABLE_SIZE },
+ { set_connectable, MGMT_SETTING_SIZE },
+ { set_fast_connectable, MGMT_SETTING_SIZE },
+ { set_bondable, MGMT_SETTING_SIZE },
+ { set_link_security, MGMT_SETTING_SIZE },
+ { set_ssp, MGMT_SETTING_SIZE },
+ { set_hs, MGMT_SETTING_SIZE },
+ { set_le, MGMT_SETTING_SIZE },
+ { set_dev_class, MGMT_SET_DEV_CLASS_SIZE },
+ { set_local_name, MGMT_SET_LOCAL_NAME_SIZE },
+ { add_uuid, MGMT_ADD_UUID_SIZE },
+ { remove_uuid, MGMT_REMOVE_UUID_SIZE },
+ { load_link_keys, MGMT_LOAD_LINK_KEYS_SIZE,
+ HCI_MGMT_VAR_LEN },
+ { load_long_term_keys, MGMT_LOAD_LONG_TERM_KEYS_SIZE,
+ HCI_MGMT_VAR_LEN },
+ { disconnect, MGMT_DISCONNECT_SIZE },
+ { get_connections, MGMT_GET_CONNECTIONS_SIZE },
+ { pin_code_reply, MGMT_PIN_CODE_REPLY_SIZE },
+ { pin_code_neg_reply, MGMT_PIN_CODE_NEG_REPLY_SIZE },
+ { set_io_capability, MGMT_SET_IO_CAPABILITY_SIZE },
+ { pair_device, MGMT_PAIR_DEVICE_SIZE },
+ { cancel_pair_device, MGMT_CANCEL_PAIR_DEVICE_SIZE },
+ { unpair_device, MGMT_UNPAIR_DEVICE_SIZE },
+ { user_confirm_reply, MGMT_USER_CONFIRM_REPLY_SIZE },
+ { user_confirm_neg_reply, MGMT_USER_CONFIRM_NEG_REPLY_SIZE },
+ { user_passkey_reply, MGMT_USER_PASSKEY_REPLY_SIZE },
+ { user_passkey_neg_reply, MGMT_USER_PASSKEY_NEG_REPLY_SIZE },
+ { read_local_oob_data, MGMT_READ_LOCAL_OOB_DATA_SIZE },
+ { add_remote_oob_data, MGMT_ADD_REMOTE_OOB_DATA_SIZE,
+ HCI_MGMT_VAR_LEN },
+ { remove_remote_oob_data, MGMT_REMOVE_REMOTE_OOB_DATA_SIZE },
+ { start_discovery, MGMT_START_DISCOVERY_SIZE },
+ { stop_discovery, MGMT_STOP_DISCOVERY_SIZE },
+ { confirm_name, MGMT_CONFIRM_NAME_SIZE },
+ { block_device, MGMT_BLOCK_DEVICE_SIZE },
+ { unblock_device, MGMT_UNBLOCK_DEVICE_SIZE },
+ { set_device_id, MGMT_SET_DEVICE_ID_SIZE },
+ { set_advertising, MGMT_SETTING_SIZE },
+ { set_bredr, MGMT_SETTING_SIZE },
+ { set_static_address, MGMT_SET_STATIC_ADDRESS_SIZE },
+ { set_scan_params, MGMT_SET_SCAN_PARAMS_SIZE },
+ { set_secure_conn, MGMT_SETTING_SIZE },
+ { set_debug_keys, MGMT_SETTING_SIZE },
+ { set_privacy, MGMT_SET_PRIVACY_SIZE },
+ { load_irks, MGMT_LOAD_IRKS_SIZE,
+ HCI_MGMT_VAR_LEN },
+ { get_conn_info, MGMT_GET_CONN_INFO_SIZE },
+ { get_clock_info, MGMT_GET_CLOCK_INFO_SIZE },
+ { add_device, MGMT_ADD_DEVICE_SIZE },
+ { remove_device, MGMT_REMOVE_DEVICE_SIZE },
+ { load_conn_param, MGMT_LOAD_CONN_PARAM_SIZE,
+ HCI_MGMT_VAR_LEN },
+ { read_unconf_index_list, MGMT_READ_UNCONF_INDEX_LIST_SIZE,
+ HCI_MGMT_NO_HDEV |
+ HCI_MGMT_UNTRUSTED },
+ { read_config_info, MGMT_READ_CONFIG_INFO_SIZE,
+ HCI_MGMT_UNCONFIGURED |
+ HCI_MGMT_UNTRUSTED },
+ { set_external_config, MGMT_SET_EXTERNAL_CONFIG_SIZE,
+ HCI_MGMT_UNCONFIGURED },
+ { set_public_address, MGMT_SET_PUBLIC_ADDRESS_SIZE,
+ HCI_MGMT_UNCONFIGURED },
+ { start_service_discovery, MGMT_START_SERVICE_DISCOVERY_SIZE,
+ HCI_MGMT_VAR_LEN },
+ { read_local_oob_ext_data, MGMT_READ_LOCAL_OOB_EXT_DATA_SIZE },
+ { read_ext_index_list, MGMT_READ_EXT_INDEX_LIST_SIZE,
+ HCI_MGMT_NO_HDEV |
+ HCI_MGMT_UNTRUSTED },
+ { read_adv_features, MGMT_READ_ADV_FEATURES_SIZE },
+ { add_advertising, MGMT_ADD_ADVERTISING_SIZE,
+ HCI_MGMT_VAR_LEN },
+ { remove_advertising, MGMT_REMOVE_ADVERTISING_SIZE },
+ { get_adv_size_info, MGMT_GET_ADV_SIZE_INFO_SIZE },
+ { start_limited_discovery, MGMT_START_DISCOVERY_SIZE },
+ { read_ext_controller_info,MGMT_READ_EXT_INFO_SIZE,
+ HCI_MGMT_UNTRUSTED },
+ { set_appearance, MGMT_SET_APPEARANCE_SIZE },
+ { get_phy_configuration, MGMT_GET_PHY_CONFIGURATION_SIZE },
+ { set_phy_configuration, MGMT_SET_PHY_CONFIGURATION_SIZE },
+ { set_blocked_keys, MGMT_OP_SET_BLOCKED_KEYS_SIZE,
+ HCI_MGMT_VAR_LEN },
+ { set_wideband_speech, MGMT_SETTING_SIZE },
+ { read_controller_cap, MGMT_READ_CONTROLLER_CAP_SIZE,
+ HCI_MGMT_UNTRUSTED },
+ { read_exp_features_info, MGMT_READ_EXP_FEATURES_INFO_SIZE,
+ HCI_MGMT_UNTRUSTED |
+ HCI_MGMT_HDEV_OPTIONAL },
+ { set_exp_feature, MGMT_SET_EXP_FEATURE_SIZE,
+ HCI_MGMT_VAR_LEN |
+ HCI_MGMT_HDEV_OPTIONAL },
+ { read_def_system_config, MGMT_READ_DEF_SYSTEM_CONFIG_SIZE,
+ HCI_MGMT_UNTRUSTED },
+ { set_def_system_config, MGMT_SET_DEF_SYSTEM_CONFIG_SIZE,
+ HCI_MGMT_VAR_LEN },
+ { read_def_runtime_config, MGMT_READ_DEF_RUNTIME_CONFIG_SIZE,
+ HCI_MGMT_UNTRUSTED },
+ { set_def_runtime_config, MGMT_SET_DEF_RUNTIME_CONFIG_SIZE,
+ HCI_MGMT_VAR_LEN },
+ { get_device_flags, MGMT_GET_DEVICE_FLAGS_SIZE },
+ { set_device_flags, MGMT_SET_DEVICE_FLAGS_SIZE },
+ { read_adv_mon_features, MGMT_READ_ADV_MONITOR_FEATURES_SIZE },
+ { add_adv_patterns_monitor,MGMT_ADD_ADV_PATTERNS_MONITOR_SIZE,
+ HCI_MGMT_VAR_LEN },
+ { remove_adv_monitor, MGMT_REMOVE_ADV_MONITOR_SIZE },
+ { add_ext_adv_params, MGMT_ADD_EXT_ADV_PARAMS_MIN_SIZE,
+ HCI_MGMT_VAR_LEN },
+ { add_ext_adv_data, MGMT_ADD_EXT_ADV_DATA_SIZE,
+ HCI_MGMT_VAR_LEN },
+ { add_adv_patterns_monitor_rssi,
+ MGMT_ADD_ADV_PATTERNS_MONITOR_RSSI_SIZE,
+ HCI_MGMT_VAR_LEN },
+ { set_mesh, MGMT_SET_MESH_RECEIVER_SIZE,
+ HCI_MGMT_VAR_LEN },
+ { mesh_features, MGMT_MESH_READ_FEATURES_SIZE },
+ { mesh_send, MGMT_MESH_SEND_SIZE,
+ HCI_MGMT_VAR_LEN },
+ { mesh_send_cancel, MGMT_MESH_SEND_CANCEL_SIZE },
+ { mgmt_hci_cmd_sync, MGMT_HCI_CMD_SYNC_SIZE, HCI_MGMT_VAR_LEN },
+};
+
+void mgmt_index_added(struct hci_dev *hdev)
+{
+ struct mgmt_ev_ext_index ev;
+
+ if (hci_test_quirk(hdev, HCI_QUIRK_RAW_DEVICE))
+ return;
+
+ if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) {
+ mgmt_index_event(MGMT_EV_UNCONF_INDEX_ADDED, hdev, NULL, 0,
+ HCI_MGMT_UNCONF_INDEX_EVENTS);
+ ev.type = 0x01;
+ } else {
+ mgmt_index_event(MGMT_EV_INDEX_ADDED, hdev, NULL, 0,
+ HCI_MGMT_INDEX_EVENTS);
+ ev.type = 0x00;
+ }
+
+ ev.bus = hdev->bus;
+
+ mgmt_index_event(MGMT_EV_EXT_INDEX_ADDED, hdev, &ev, sizeof(ev),
+ HCI_MGMT_EXT_INDEX_EVENTS);
+}
+
+void mgmt_index_removed(struct hci_dev *hdev)
+{
+ struct mgmt_ev_ext_index ev;
+ struct cmd_lookup match = { NULL, hdev, MGMT_STATUS_INVALID_INDEX };
+
+ if (hci_test_quirk(hdev, HCI_QUIRK_RAW_DEVICE))
+ return;
+
+ mgmt_pending_foreach(0, hdev, true, cmd_complete_rsp, &match);
+
+ if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) {
+ mgmt_index_event(MGMT_EV_UNCONF_INDEX_REMOVED, hdev, NULL, 0,
+ HCI_MGMT_UNCONF_INDEX_EVENTS);
+ ev.type = 0x01;
+ } else {
+ mgmt_index_event(MGMT_EV_INDEX_REMOVED, hdev, NULL, 0,
+ HCI_MGMT_INDEX_EVENTS);
+ ev.type = 0x00;
+ }
+
+ ev.bus = hdev->bus;
+
+ mgmt_index_event(MGMT_EV_EXT_INDEX_REMOVED, hdev, &ev, sizeof(ev),
+ HCI_MGMT_EXT_INDEX_EVENTS);
+
+ /* Cancel any remaining timed work */
+ if (!hci_dev_test_flag(hdev, HCI_MGMT))
+ return;
+ cancel_delayed_work_sync(&hdev->discov_off);
+ cancel_delayed_work_sync(&hdev->service_cache);
+ cancel_delayed_work_sync(&hdev->rpa_expired);
+ cancel_delayed_work_sync(&hdev->mesh_send_done);
+}
+
+void mgmt_power_on(struct hci_dev *hdev, int err)
+{
+ struct cmd_lookup match = { NULL, hdev };
+
+ bt_dev_dbg(hdev, "err %d", err);
+
+ hci_dev_lock(hdev);
+
+ if (!err) {
+ restart_le_actions(hdev);
+ hci_update_passive_scan(hdev);
+ }
+
+ mgmt_pending_foreach(MGMT_OP_SET_POWERED, hdev, true, settings_rsp,
+ &match);
+
+ new_settings(hdev, match.sk);
+
+ if (match.sk)
+ sock_put(match.sk);
+
+ hci_dev_unlock(hdev);
+}
+
+void __mgmt_power_off(struct hci_dev *hdev)
+{
+ struct cmd_lookup match = { NULL, hdev };
+ u8 zero_cod[] = { 0, 0, 0 };
+
+ mgmt_pending_foreach(MGMT_OP_SET_POWERED, hdev, true, settings_rsp,
+ &match);
+
+ /* If the power off is because of hdev unregistration let
+ * use the appropriate INVALID_INDEX status. Otherwise use
+ * NOT_POWERED. We cover both scenarios here since later in
+ * mgmt_index_removed() any hci_conn callbacks will have already
+ * been triggered, potentially causing misleading DISCONNECTED
+ * status responses.
+ */
+ if (hci_dev_test_flag(hdev, HCI_UNREGISTER))
+ match.mgmt_status = MGMT_STATUS_INVALID_INDEX;
+ else
+ match.mgmt_status = MGMT_STATUS_NOT_POWERED;
+
+ mgmt_pending_foreach(0, hdev, true, cmd_complete_rsp, &match);
+
+ if (memcmp(hdev->dev_class, zero_cod, sizeof(zero_cod)) != 0) {
+ mgmt_limited_event(MGMT_EV_CLASS_OF_DEV_CHANGED, hdev,
+ zero_cod, sizeof(zero_cod),
+ HCI_MGMT_DEV_CLASS_EVENTS, NULL);
+ ext_info_changed(hdev, NULL);
+ }
+
+ new_settings(hdev, match.sk);
+
+ if (match.sk)
+ sock_put(match.sk);
+}
+
+void mgmt_set_powered_failed(struct hci_dev *hdev, int err)
+{
+ struct mgmt_pending_cmd *cmd;
+ u8 status;
+
+ cmd = pending_find(MGMT_OP_SET_POWERED, hdev);
+ if (!cmd)
+ return;
+
+ if (err == -ERFKILL)
+ status = MGMT_STATUS_RFKILLED;
+ else
+ status = MGMT_STATUS_FAILED;
+
+ mgmt_cmd_status(cmd->sk, hdev->id, MGMT_OP_SET_POWERED, status);
+
+ mgmt_pending_remove(cmd);
+}
+
+void mgmt_new_link_key(struct hci_dev *hdev, struct link_key *key,
+ bool persistent)
+{
+ struct mgmt_ev_new_link_key ev;
+
+ memset(&ev, 0, sizeof(ev));
+
+ ev.store_hint = persistent;
+ bacpy(&ev.key.addr.bdaddr, &key->bdaddr);
+ ev.key.addr.type = BDADDR_BREDR;
+ ev.key.type = key->type;
+ memcpy(ev.key.val, key->val, HCI_LINK_KEY_SIZE);
+ ev.key.pin_len = key->pin_len;
+
+ mgmt_event(MGMT_EV_NEW_LINK_KEY, hdev, &ev, sizeof(ev), NULL);
+}
+
+static u8 mgmt_ltk_type(struct smp_ltk *ltk)
+{
+ switch (ltk->type) {
+ case SMP_LTK:
+ case SMP_LTK_RESPONDER:
+ if (ltk->authenticated)
+ return MGMT_LTK_AUTHENTICATED;
+ return MGMT_LTK_UNAUTHENTICATED;
+ case SMP_LTK_P256:
+ if (ltk->authenticated)
+ return MGMT_LTK_P256_AUTH;
+ return MGMT_LTK_P256_UNAUTH;
+ case SMP_LTK_P256_DEBUG:
+ return MGMT_LTK_P256_DEBUG;
+ }
+
+ return MGMT_LTK_UNAUTHENTICATED;
+}
+
+void mgmt_new_ltk(struct hci_dev *hdev, struct smp_ltk *key, bool persistent)
+{
+ struct mgmt_ev_new_long_term_key ev;
+
+ memset(&ev, 0, sizeof(ev));
+
+ /* Devices using resolvable or non-resolvable random addresses
+ * without providing an identity resolving key don't require
+ * to store long term keys. Their addresses will change the
+ * next time around.
+ *
+ * Only when a remote device provides an identity address
+ * make sure the long term key is stored. If the remote
+ * identity is known, the long term keys are internally
+ * mapped to the identity address. So allow static random
+ * and public addresses here.
+ */
+ if (key->bdaddr_type == ADDR_LE_DEV_RANDOM &&
+ (key->bdaddr.b[5] & 0xc0) != 0xc0)
+ ev.store_hint = 0x00;
+ else
+ ev.store_hint = persistent;
+
+ bacpy(&ev.key.addr.bdaddr, &key->bdaddr);
+ ev.key.addr.type = link_to_bdaddr(LE_LINK, key->bdaddr_type);
+ ev.key.type = mgmt_ltk_type(key);
+ ev.key.enc_size = key->enc_size;
+ ev.key.ediv = key->ediv;
+ ev.key.rand = key->rand;
+
+ if (key->type == SMP_LTK)
+ ev.key.initiator = 1;
+
+ /* Make sure we copy only the significant bytes based on the
+ * encryption key size, and set the rest of the value to zeroes.
+ */
+ memcpy(ev.key.val, key->val, key->enc_size);
+ memset(ev.key.val + key->enc_size, 0,
+ sizeof(ev.key.val) - key->enc_size);
+
+ mgmt_event(MGMT_EV_NEW_LONG_TERM_KEY, hdev, &ev, sizeof(ev), NULL);
+}
+
+void mgmt_new_irk(struct hci_dev *hdev, struct smp_irk *irk, bool persistent)
+{
+ struct mgmt_ev_new_irk ev;
+
+ memset(&ev, 0, sizeof(ev));
+
+ ev.store_hint = persistent;
+
+ bacpy(&ev.rpa, &irk->rpa);
+ bacpy(&ev.irk.addr.bdaddr, &irk->bdaddr);
+ ev.irk.addr.type = link_to_bdaddr(LE_LINK, irk->addr_type);
+ memcpy(ev.irk.val, irk->val, sizeof(irk->val));
+
+ mgmt_event(MGMT_EV_NEW_IRK, hdev, &ev, sizeof(ev), NULL);
+}
+
+void mgmt_new_csrk(struct hci_dev *hdev, struct smp_csrk *csrk,
+ bool persistent)
+{
+ struct mgmt_ev_new_csrk ev;
+
+ memset(&ev, 0, sizeof(ev));
+
+ /* Devices using resolvable or non-resolvable random addresses
+ * without providing an identity resolving key don't require
+ * to store signature resolving keys. Their addresses will change
+ * the next time around.
+ *
+ * Only when a remote device provides an identity address
+ * make sure the signature resolving key is stored. So allow
+ * static random and public addresses here.
+ */
+ if (csrk->bdaddr_type == ADDR_LE_DEV_RANDOM &&
+ (csrk->bdaddr.b[5] & 0xc0) != 0xc0)
+ ev.store_hint = 0x00;
+ else
+ ev.store_hint = persistent;
+
+ bacpy(&ev.key.addr.bdaddr, &csrk->bdaddr);
+ ev.key.addr.type = link_to_bdaddr(LE_LINK, csrk->bdaddr_type);
+ ev.key.type = csrk->type;
+ memcpy(ev.key.val, csrk->val, sizeof(csrk->val));
+
+ mgmt_event(MGMT_EV_NEW_CSRK, hdev, &ev, sizeof(ev), NULL);
+}
+
+void mgmt_new_conn_param(struct hci_dev *hdev, bdaddr_t *bdaddr,
+ u8 bdaddr_type, u8 store_hint, u16 min_interval,
+ u16 max_interval, u16 latency, u16 timeout)
+{
+ struct mgmt_ev_new_conn_param ev;
+
+ if (!hci_is_identity_address(bdaddr, bdaddr_type))
+ return;
+
+ memset(&ev, 0, sizeof(ev));
+ bacpy(&ev.addr.bdaddr, bdaddr);
+ ev.addr.type = link_to_bdaddr(LE_LINK, bdaddr_type);
+ ev.store_hint = store_hint;
+ ev.min_interval = cpu_to_le16(min_interval);
+ ev.max_interval = cpu_to_le16(max_interval);
+ ev.latency = cpu_to_le16(latency);
+ ev.timeout = cpu_to_le16(timeout);
+
+ mgmt_event(MGMT_EV_NEW_CONN_PARAM, hdev, &ev, sizeof(ev), NULL);
+}
+
+void mgmt_device_connected(struct hci_dev *hdev, struct hci_conn *conn,
+ u8 *name, u8 name_len)
+{
+ struct sk_buff *skb;
+ struct mgmt_ev_device_connected *ev;
+ u16 eir_len = 0;
+ u32 flags = 0;
+
+ if (test_and_set_bit(HCI_CONN_MGMT_CONNECTED, &conn->flags))
+ return;
+
+ /* allocate buff for LE or BR/EDR adv */
+ if (conn->le_adv_data_len > 0)
+ skb = mgmt_alloc_skb(hdev, MGMT_EV_DEVICE_CONNECTED,
+ sizeof(*ev) + conn->le_adv_data_len);
+ else
+ skb = mgmt_alloc_skb(hdev, MGMT_EV_DEVICE_CONNECTED,
+ sizeof(*ev) + (name ? eir_precalc_len(name_len) : 0) +
+ eir_precalc_len(sizeof(conn->dev_class)));
+
+ if (!skb)
+ return;
+
+ ev = skb_put(skb, sizeof(*ev));
+ bacpy(&ev->addr.bdaddr, &conn->dst);
+ ev->addr.type = link_to_bdaddr(conn->type, conn->dst_type);
+
+ if (conn->out)
+ flags |= MGMT_DEV_FOUND_INITIATED_CONN;
+
+ ev->flags = __cpu_to_le32(flags);
+
+ /* We must ensure that the EIR Data fields are ordered and
+ * unique. Keep it simple for now and avoid the problem by not
+ * adding any BR/EDR data to the LE adv.
+ */
+ if (conn->le_adv_data_len > 0) {
+ skb_put_data(skb, conn->le_adv_data, conn->le_adv_data_len);
+ eir_len = conn->le_adv_data_len;
+ } else {
+ if (name)
+ eir_len += eir_skb_put_data(skb, EIR_NAME_COMPLETE, name, name_len);
+
+ if (memcmp(conn->dev_class, "\0\0\0", sizeof(conn->dev_class)))
+ eir_len += eir_skb_put_data(skb, EIR_CLASS_OF_DEV,
+ conn->dev_class, sizeof(conn->dev_class));
+ }
+
+ ev->eir_len = cpu_to_le16(eir_len);
+
+ mgmt_event_skb(skb, NULL);
+}
+
+static void unpair_device_rsp(struct mgmt_pending_cmd *cmd, void *data)
+{
+ struct hci_dev *hdev = data;
+ struct mgmt_cp_unpair_device *cp = cmd->param;
+
+ device_unpaired(hdev, &cp->addr.bdaddr, cp->addr.type, cmd->sk);
+
+ cmd->cmd_complete(cmd, 0);
+}
+
+bool mgmt_powering_down(struct hci_dev *hdev)
+{
+ struct mgmt_pending_cmd *cmd;
+ struct mgmt_mode *cp;
+
+ if (hci_dev_test_flag(hdev, HCI_POWERING_DOWN))
+ return true;
+
+ cmd = pending_find(MGMT_OP_SET_POWERED, hdev);
+ if (!cmd)
+ return false;
+
+ cp = cmd->param;
+ if (!cp->val)
+ return true;
+
+ return false;
+}
+
+void mgmt_device_disconnected(struct hci_dev *hdev, bdaddr_t *bdaddr,
+ u8 link_type, u8 addr_type, u8 reason,
+ bool mgmt_connected)
+{
+ struct mgmt_ev_device_disconnected ev;
+ struct sock *sk = NULL;
+
+ if (!mgmt_connected)
+ return;
+
+ if (link_type != ACL_LINK &&
+ link_type != LE_LINK &&
+ link_type != BIS_LINK)
+ return;
+
+ bacpy(&ev.addr.bdaddr, bdaddr);
+ ev.addr.type = link_to_bdaddr(link_type, addr_type);
+ ev.reason = reason;
+
+ /* Report disconnects due to suspend */
+ if (hdev->suspended)
+ ev.reason = MGMT_DEV_DISCONN_LOCAL_HOST_SUSPEND;
+
+ mgmt_event(MGMT_EV_DEVICE_DISCONNECTED, hdev, &ev, sizeof(ev), sk);
+
+ if (sk)
+ sock_put(sk);
+}
+
+void mgmt_disconnect_failed(struct hci_dev *hdev, bdaddr_t *bdaddr,
+ u8 link_type, u8 addr_type, u8 status)
+{
+ u8 bdaddr_type = link_to_bdaddr(link_type, addr_type);
+ struct mgmt_cp_disconnect *cp;
+ struct mgmt_pending_cmd *cmd;
+
+ mgmt_pending_foreach(MGMT_OP_UNPAIR_DEVICE, hdev, true,
+ unpair_device_rsp, hdev);
+
+ cmd = pending_find(MGMT_OP_DISCONNECT, hdev);
+ if (!cmd)
+ return;
+
+ cp = cmd->param;
+
+ if (bacmp(bdaddr, &cp->addr.bdaddr))
+ return;
+
+ if (cp->addr.type != bdaddr_type)
+ return;
+
+ cmd->cmd_complete(cmd, mgmt_status(status));
+ mgmt_pending_remove(cmd);
+}
+
+void mgmt_connect_failed(struct hci_dev *hdev, struct hci_conn *conn, u8 status)
+{
+ struct mgmt_ev_connect_failed ev;
+
+ if (test_and_clear_bit(HCI_CONN_MGMT_CONNECTED, &conn->flags)) {
+ mgmt_device_disconnected(hdev, &conn->dst, conn->type,
+ conn->dst_type, status, true);
+ return;
+ }
+
+ bacpy(&ev.addr.bdaddr, &conn->dst);
+ ev.addr.type = link_to_bdaddr(conn->type, conn->dst_type);
+ ev.status = mgmt_status(status);
+
+ mgmt_event(MGMT_EV_CONNECT_FAILED, hdev, &ev, sizeof(ev), NULL);
+}
+
+void mgmt_pin_code_request(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 secure)
+{
+ struct mgmt_ev_pin_code_request ev;
+
+ bacpy(&ev.addr.bdaddr, bdaddr);
+ ev.addr.type = BDADDR_BREDR;
+ ev.secure = secure;
+
+ mgmt_event(MGMT_EV_PIN_CODE_REQUEST, hdev, &ev, sizeof(ev), NULL);
+}
+
+void mgmt_pin_code_reply_complete(struct hci_dev *hdev, bdaddr_t *bdaddr,
+ u8 status)
+{
+ struct mgmt_pending_cmd *cmd;
+
+ cmd = pending_find(MGMT_OP_PIN_CODE_REPLY, hdev);
+ if (!cmd)
+ return;
+
+ cmd->cmd_complete(cmd, mgmt_status(status));
+ mgmt_pending_remove(cmd);
+}
+
+void mgmt_pin_code_neg_reply_complete(struct hci_dev *hdev, bdaddr_t *bdaddr,
+ u8 status)
+{
+ struct mgmt_pending_cmd *cmd;
+
+ cmd = pending_find(MGMT_OP_PIN_CODE_NEG_REPLY, hdev);
+ if (!cmd)
+ return;
+
+ cmd->cmd_complete(cmd, mgmt_status(status));
+ mgmt_pending_remove(cmd);
+}
+
+int mgmt_user_confirm_request(struct hci_dev *hdev, bdaddr_t *bdaddr,
+ u8 link_type, u8 addr_type, u32 value,
+ u8 confirm_hint)
+{
+ struct mgmt_ev_user_confirm_request ev;
+
+ bt_dev_dbg(hdev, "bdaddr %pMR", bdaddr);
+
+ bacpy(&ev.addr.bdaddr, bdaddr);
+ ev.addr.type = link_to_bdaddr(link_type, addr_type);
+ ev.confirm_hint = confirm_hint;
+ ev.value = cpu_to_le32(value);
+
+ return mgmt_event(MGMT_EV_USER_CONFIRM_REQUEST, hdev, &ev, sizeof(ev),
+ NULL);
+}
+
+int mgmt_user_passkey_request(struct hci_dev *hdev, bdaddr_t *bdaddr,
+ u8 link_type, u8 addr_type)
+{
+ struct mgmt_ev_user_passkey_request ev;
+
+ bt_dev_dbg(hdev, "bdaddr %pMR", bdaddr);
+
+ bacpy(&ev.addr.bdaddr, bdaddr);
+ ev.addr.type = link_to_bdaddr(link_type, addr_type);
+
+ return mgmt_event(MGMT_EV_USER_PASSKEY_REQUEST, hdev, &ev, sizeof(ev),
+ NULL);
+}
+
+static int user_pairing_resp_complete(struct hci_dev *hdev, bdaddr_t *bdaddr,
+ u8 link_type, u8 addr_type, u8 status,
+ u8 opcode)
+{
+ struct mgmt_pending_cmd *cmd;
+
+ cmd = pending_find(opcode, hdev);
+ if (!cmd)
+ return -ENOENT;
+
+ cmd->cmd_complete(cmd, mgmt_status(status));
+ mgmt_pending_remove(cmd);
+
+ return 0;
+}
+
+int mgmt_user_confirm_reply_complete(struct hci_dev *hdev, bdaddr_t *bdaddr,
+ u8 link_type, u8 addr_type, u8 status)
+{
+ return user_pairing_resp_complete(hdev, bdaddr, link_type, addr_type,
+ status, MGMT_OP_USER_CONFIRM_REPLY);
+}
+
+int mgmt_user_confirm_neg_reply_complete(struct hci_dev *hdev, bdaddr_t *bdaddr,
+ u8 link_type, u8 addr_type, u8 status)
+{
+ return user_pairing_resp_complete(hdev, bdaddr, link_type, addr_type,
+ status,
+ MGMT_OP_USER_CONFIRM_NEG_REPLY);
+}
+
+int mgmt_user_passkey_reply_complete(struct hci_dev *hdev, bdaddr_t *bdaddr,
+ u8 link_type, u8 addr_type, u8 status)
+{
+ return user_pairing_resp_complete(hdev, bdaddr, link_type, addr_type,
+ status, MGMT_OP_USER_PASSKEY_REPLY);
+}
+
+int mgmt_user_passkey_neg_reply_complete(struct hci_dev *hdev, bdaddr_t *bdaddr,
+ u8 link_type, u8 addr_type, u8 status)
+{
+ return user_pairing_resp_complete(hdev, bdaddr, link_type, addr_type,
+ status,
+ MGMT_OP_USER_PASSKEY_NEG_REPLY);
+}
+
+int mgmt_user_passkey_notify(struct hci_dev *hdev, bdaddr_t *bdaddr,
+ u8 link_type, u8 addr_type, u32 passkey,
+ u8 entered)
+{
+ struct mgmt_ev_passkey_notify ev;
+
+ bt_dev_dbg(hdev, "bdaddr %pMR", bdaddr);
+
+ bacpy(&ev.addr.bdaddr, bdaddr);
+ ev.addr.type = link_to_bdaddr(link_type, addr_type);
+ ev.passkey = __cpu_to_le32(passkey);
+ ev.entered = entered;
+
+ return mgmt_event(MGMT_EV_PASSKEY_NOTIFY, hdev, &ev, sizeof(ev), NULL);
+}
+
+void mgmt_auth_failed(struct hci_conn *conn, u8 hci_status)
+{
+ struct mgmt_ev_auth_failed ev;
+ struct mgmt_pending_cmd *cmd;
+ u8 status = mgmt_status(hci_status);
+
+ bacpy(&ev.addr.bdaddr, &conn->dst);
+ ev.addr.type = link_to_bdaddr(conn->type, conn->dst_type);
+ ev.status = status;
+
+ cmd = find_pairing(conn);
+
+ mgmt_event(MGMT_EV_AUTH_FAILED, conn->hdev, &ev, sizeof(ev),
+ cmd ? cmd->sk : NULL);
+
+ if (cmd) {
+ cmd->cmd_complete(cmd, status);
+ mgmt_pending_remove(cmd);
+ }
+}
+
+void mgmt_auth_enable_complete(struct hci_dev *hdev, u8 status)
+{
+ struct cmd_lookup match = { NULL, hdev };
+ bool changed;
+
+ if (status) {
+ u8 mgmt_err = mgmt_status(status);
+ mgmt_pending_foreach(MGMT_OP_SET_LINK_SECURITY, hdev, true,
+ cmd_status_rsp, &mgmt_err);
+ return;
+ }
+
+ if (test_bit(HCI_AUTH, &hdev->flags))
+ changed = !hci_dev_test_and_set_flag(hdev, HCI_LINK_SECURITY);
+ else
+ changed = hci_dev_test_and_clear_flag(hdev, HCI_LINK_SECURITY);
+
+ mgmt_pending_foreach(MGMT_OP_SET_LINK_SECURITY, hdev, true,
+ settings_rsp, &match);
+
+ if (changed)
+ new_settings(hdev, match.sk);
+
+ if (match.sk)
+ sock_put(match.sk);
+}
+
+static void sk_lookup(struct mgmt_pending_cmd *cmd, void *data)
+{
+ struct cmd_lookup *match = data;
+
+ if (match->sk == NULL) {
+ match->sk = cmd->sk;
+ sock_hold(match->sk);
+ }
+}
+
+void mgmt_set_class_of_dev_complete(struct hci_dev *hdev, u8 *dev_class,
+ u8 status)
+{
+ struct cmd_lookup match = { NULL, hdev, mgmt_status(status) };
+
+ mgmt_pending_foreach(MGMT_OP_SET_DEV_CLASS, hdev, false, sk_lookup,
+ &match);
+ mgmt_pending_foreach(MGMT_OP_ADD_UUID, hdev, false, sk_lookup,
+ &match);
+ mgmt_pending_foreach(MGMT_OP_REMOVE_UUID, hdev, false, sk_lookup,
+ &match);
+
+ if (!status) {
+ mgmt_limited_event(MGMT_EV_CLASS_OF_DEV_CHANGED, hdev, dev_class,
+ 3, HCI_MGMT_DEV_CLASS_EVENTS, NULL);
+ ext_info_changed(hdev, NULL);
+ }
+
+ if (match.sk)
+ sock_put(match.sk);
+}
+
+void mgmt_set_local_name_complete(struct hci_dev *hdev, u8 *name, u8 status)
+{
+ struct mgmt_cp_set_local_name ev;
+ struct mgmt_pending_cmd *cmd;
+
+ if (status)
+ return;
+
+ memset(&ev, 0, sizeof(ev));
+ memcpy(ev.name, name, HCI_MAX_NAME_LENGTH);
+ memcpy(ev.short_name, hdev->short_name, HCI_MAX_SHORT_NAME_LENGTH);
+
+ cmd = pending_find(MGMT_OP_SET_LOCAL_NAME, hdev);
+ if (!cmd) {
+ memcpy(hdev->dev_name, name, sizeof(hdev->dev_name));
+
+ /* If this is a HCI command related to powering on the
+ * HCI dev don't send any mgmt signals.
+ */
+ if (hci_dev_test_flag(hdev, HCI_POWERING_DOWN))
+ return;
+
+ if (pending_find(MGMT_OP_SET_POWERED, hdev))
+ return;
+ }
+
+ mgmt_limited_event(MGMT_EV_LOCAL_NAME_CHANGED, hdev, &ev, sizeof(ev),
+ HCI_MGMT_LOCAL_NAME_EVENTS, cmd ? cmd->sk : NULL);
+ ext_info_changed(hdev, cmd ? cmd->sk : NULL);
+}
+
+static inline bool has_uuid(u8 *uuid, u16 uuid_count, u8 (*uuids)[16])
+{
+ int i;
+
+ for (i = 0; i < uuid_count; i++) {
+ if (!memcmp(uuid, uuids[i], 16))
+ return true;
+ }
+
+ return false;
+}
+
+static bool eir_has_uuids(u8 *eir, u16 eir_len, u16 uuid_count, u8 (*uuids)[16])
+{
+ u16 parsed = 0;
+
+ while (parsed < eir_len) {
+ u8 field_len = eir[0];
+ u8 uuid[16];
+ int i;
+
+ if (field_len == 0)
+ break;
+
+ if (eir_len - parsed < field_len + 1)
+ break;
+
+ switch (eir[1]) {
+ case EIR_UUID16_ALL:
+ case EIR_UUID16_SOME:
+ for (i = 0; i + 3 <= field_len; i += 2) {
+ memcpy(uuid, bluetooth_base_uuid, 16);
+ uuid[13] = eir[i + 3];
+ uuid[12] = eir[i + 2];
+ if (has_uuid(uuid, uuid_count, uuids))
+ return true;
+ }
+ break;
+ case EIR_UUID32_ALL:
+ case EIR_UUID32_SOME:
+ for (i = 0; i + 5 <= field_len; i += 4) {
+ memcpy(uuid, bluetooth_base_uuid, 16);
+ uuid[15] = eir[i + 5];
+ uuid[14] = eir[i + 4];
+ uuid[13] = eir[i + 3];
+ uuid[12] = eir[i + 2];
+ if (has_uuid(uuid, uuid_count, uuids))
+ return true;
+ }
+ break;
+ case EIR_UUID128_ALL:
+ case EIR_UUID128_SOME:
+ for (i = 0; i + 17 <= field_len; i += 16) {
+ memcpy(uuid, eir + i + 2, 16);
+ if (has_uuid(uuid, uuid_count, uuids))
+ return true;
+ }
+ break;
+ }
+
+ parsed += field_len + 1;
+ eir += field_len + 1;
+ }
+
+ return false;
+}
+
+static bool is_filter_match(struct hci_dev *hdev, s8 rssi, u8 *eir,
+ u16 eir_len, u8 *scan_rsp, u8 scan_rsp_len)
+{
+ /* If a RSSI threshold has been specified, and
+ * HCI_QUIRK_STRICT_DUPLICATE_FILTER is not set, then all results with
+ * a RSSI smaller than the RSSI threshold will be dropped. If the quirk
+ * is set, let it through for further processing, as we might need to
+ * restart the scan.
+ *
+ * For BR/EDR devices (pre 1.2) providing no RSSI during inquiry,
+ * the results are also dropped.
+ */
+ if (hdev->discovery.rssi != HCI_RSSI_INVALID &&
+ (rssi == HCI_RSSI_INVALID ||
+ (rssi < hdev->discovery.rssi &&
+ !hci_test_quirk(hdev, HCI_QUIRK_STRICT_DUPLICATE_FILTER))))
+ return false;
+
+ if (hdev->discovery.uuid_count != 0) {
+ /* If a list of UUIDs is provided in filter, results with no
+ * matching UUID should be dropped.
+ */
+ if (!eir_has_uuids(eir, eir_len, hdev->discovery.uuid_count,
+ hdev->discovery.uuids) &&
+ !eir_has_uuids(scan_rsp, scan_rsp_len,
+ hdev->discovery.uuid_count,
+ hdev->discovery.uuids))
+ return false;
+ }
+
+ /* If duplicate filtering does not report RSSI changes, then restart
+ * scanning to ensure updated result with updated RSSI values.
+ */
+ if (hci_test_quirk(hdev, HCI_QUIRK_STRICT_DUPLICATE_FILTER)) {
+ /* Validate RSSI value against the RSSI threshold once more. */
+ if (hdev->discovery.rssi != HCI_RSSI_INVALID &&
+ rssi < hdev->discovery.rssi)
+ return false;
+ }
+
+ return true;
+}
+
+void mgmt_adv_monitor_device_lost(struct hci_dev *hdev, u16 handle,
+ bdaddr_t *bdaddr, u8 addr_type)
+{
+ struct mgmt_ev_adv_monitor_device_lost ev;
+
+ ev.monitor_handle = cpu_to_le16(handle);
+ bacpy(&ev.addr.bdaddr, bdaddr);
+ ev.addr.type = addr_type;
+
+ mgmt_event(MGMT_EV_ADV_MONITOR_DEVICE_LOST, hdev, &ev, sizeof(ev),
+ NULL);
+}
+
+static void mgmt_send_adv_monitor_device_found(struct hci_dev *hdev,
+ struct sk_buff *skb,
+ struct sock *skip_sk,
+ u16 handle)
+{
+ struct sk_buff *advmon_skb;
+ size_t advmon_skb_len;
+ __le16 *monitor_handle;
+
+ if (!skb)
+ return;
+
+ advmon_skb_len = (sizeof(struct mgmt_ev_adv_monitor_device_found) -
+ sizeof(struct mgmt_ev_device_found)) + skb->len;
+ advmon_skb = mgmt_alloc_skb(hdev, MGMT_EV_ADV_MONITOR_DEVICE_FOUND,
+ advmon_skb_len);
+ if (!advmon_skb)
+ return;
+
+ /* ADV_MONITOR_DEVICE_FOUND is similar to DEVICE_FOUND event except
+ * that it also has 'monitor_handle'. Make a copy of DEVICE_FOUND and
+ * store monitor_handle of the matched monitor.
+ */
+ monitor_handle = skb_put(advmon_skb, sizeof(*monitor_handle));
+ *monitor_handle = cpu_to_le16(handle);
+ skb_put_data(advmon_skb, skb->data, skb->len);
+
+ mgmt_event_skb(advmon_skb, skip_sk);
+}
+
+static void mgmt_adv_monitor_device_found(struct hci_dev *hdev,
+ bdaddr_t *bdaddr, bool report_device,
+ struct sk_buff *skb,
+ struct sock *skip_sk)
+{
+ struct monitored_device *dev, *tmp;
+ bool matched = false;
+ bool notified = false;
+
+ /* We have received the Advertisement Report because:
+ * 1. the kernel has initiated active discovery
+ * 2. if not, we have pend_le_reports > 0 in which case we are doing
+ * passive scanning
+ * 3. if none of the above is true, we have one or more active
+ * Advertisement Monitor
+ *
+ * For case 1 and 2, report all advertisements via MGMT_EV_DEVICE_FOUND
+ * and report ONLY one advertisement per device for the matched Monitor
+ * via MGMT_EV_ADV_MONITOR_DEVICE_FOUND event.
+ *
+ * For case 3, since we are not active scanning and all advertisements
+ * received are due to a matched Advertisement Monitor, report all
+ * advertisements ONLY via MGMT_EV_ADV_MONITOR_DEVICE_FOUND event.
+ */
+ if (report_device && !hdev->advmon_pend_notify) {
+ mgmt_event_skb(skb, skip_sk);
+ return;
+ }
+
+ hdev->advmon_pend_notify = false;
+
+ list_for_each_entry_safe(dev, tmp, &hdev->monitored_devices, list) {
+ if (!bacmp(&dev->bdaddr, bdaddr)) {
+ matched = true;
+
+ if (!dev->notified) {
+ mgmt_send_adv_monitor_device_found(hdev, skb,
+ skip_sk,
+ dev->handle);
+ notified = true;
+ dev->notified = true;
+ }
+ }
+
+ if (!dev->notified)
+ hdev->advmon_pend_notify = true;
+ }
+
+ if (!report_device &&
+ ((matched && !notified) || !msft_monitor_supported(hdev))) {
+ /* Handle 0 indicates that we are not active scanning and this
+ * is a subsequent advertisement report for an already matched
+ * Advertisement Monitor or the controller offloading support
+ * is not available.
+ */
+ mgmt_send_adv_monitor_device_found(hdev, skb, skip_sk, 0);
+ }
+
+ if (report_device)
+ mgmt_event_skb(skb, skip_sk);
+ else
+ kfree_skb(skb);
+}
+
+static void mesh_device_found(struct hci_dev *hdev, bdaddr_t *bdaddr,
+ u8 addr_type, s8 rssi, u32 flags, u8 *eir,
+ u16 eir_len, u8 *scan_rsp, u8 scan_rsp_len,
+ u64 instant)
+{
+ struct sk_buff *skb;
+ struct mgmt_ev_mesh_device_found *ev;
+ int i, j;
+
+ if (!hdev->mesh_ad_types[0])
+ goto accepted;
+
+ /* Scan for requested AD types */
+ if (eir_len > 0) {
+ for (i = 0; i + 1 < eir_len; i += eir[i] + 1) {
+ for (j = 0; j < sizeof(hdev->mesh_ad_types); j++) {
+ if (!hdev->mesh_ad_types[j])
+ break;
+
+ if (hdev->mesh_ad_types[j] == eir[i + 1])
+ goto accepted;
+ }
+ }
+ }
+
+ if (scan_rsp_len > 0) {
+ for (i = 0; i + 1 < scan_rsp_len; i += scan_rsp[i] + 1) {
+ for (j = 0; j < sizeof(hdev->mesh_ad_types); j++) {
+ if (!hdev->mesh_ad_types[j])
+ break;
+
+ if (hdev->mesh_ad_types[j] == scan_rsp[i + 1])
+ goto accepted;
+ }
+ }
+ }
+
+ return;
+
+accepted:
+ skb = mgmt_alloc_skb(hdev, MGMT_EV_MESH_DEVICE_FOUND,
+ sizeof(*ev) + eir_len + scan_rsp_len);
+ if (!skb)
+ return;
+
+ ev = skb_put(skb, sizeof(*ev));
+
+ bacpy(&ev->addr.bdaddr, bdaddr);
+ ev->addr.type = link_to_bdaddr(LE_LINK, addr_type);
+ ev->rssi = rssi;
+ ev->flags = cpu_to_le32(flags);
+ ev->instant = cpu_to_le64(instant);
+
+ if (eir_len > 0)
+ /* Copy EIR or advertising data into event */
+ skb_put_data(skb, eir, eir_len);
+
+ if (scan_rsp_len > 0)
+ /* Append scan response data to event */
+ skb_put_data(skb, scan_rsp, scan_rsp_len);
+
+ ev->eir_len = cpu_to_le16(eir_len + scan_rsp_len);
+
+ mgmt_event_skb(skb, NULL);
+}
+
+void mgmt_device_found(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type,
+ u8 addr_type, u8 *dev_class, s8 rssi, u32 flags,
+ u8 *eir, u16 eir_len, u8 *scan_rsp, u8 scan_rsp_len,
+ u64 instant)
+{
+ struct sk_buff *skb;
+ struct mgmt_ev_device_found *ev;
+ bool report_device = hci_discovery_active(hdev);
+
+ if (hci_dev_test_flag(hdev, HCI_MESH) && link_type == LE_LINK)
+ mesh_device_found(hdev, bdaddr, addr_type, rssi, flags,
+ eir, eir_len, scan_rsp, scan_rsp_len,
+ instant);
+
+ /* Don't send events for a non-kernel initiated discovery. With
+ * LE one exception is if we have pend_le_reports > 0 in which
+ * case we're doing passive scanning and want these events.
+ */
+ if (!hci_discovery_active(hdev)) {
+ if (link_type == ACL_LINK)
+ return;
+ if (link_type == LE_LINK && !list_empty(&hdev->pend_le_reports))
+ report_device = true;
+ else if (!hci_is_adv_monitoring(hdev))
+ return;
+ }
+
+ if (hdev->discovery.result_filtering) {
+ /* We are using service discovery */
+ if (!is_filter_match(hdev, rssi, eir, eir_len, scan_rsp,
+ scan_rsp_len))
+ return;
+ }
+
+ if (hdev->discovery.limited) {
+ /* Check for limited discoverable bit */
+ if (dev_class) {
+ if (!(dev_class[1] & 0x20))
+ return;
+ } else {
+ u8 *flags = eir_get_data(eir, eir_len, EIR_FLAGS, NULL);
+ if (!flags || !(flags[0] & LE_AD_LIMITED))
+ return;
+ }
+ }
+
+ /* Allocate skb. The 5 extra bytes are for the potential CoD field */
+ skb = mgmt_alloc_skb(hdev, MGMT_EV_DEVICE_FOUND,
+ sizeof(*ev) + eir_len + scan_rsp_len + 5);
+ if (!skb)
+ return;
+
+ ev = skb_put(skb, sizeof(*ev));
+
+ /* In case of device discovery with BR/EDR devices (pre 1.2), the
+ * RSSI value was reported as 0 when not available. This behavior
+ * is kept when using device discovery. This is required for full
+ * backwards compatibility with the API.
+ *
+ * However when using service discovery, the value 127 will be
+ * returned when the RSSI is not available.
+ */
+ if (rssi == HCI_RSSI_INVALID && !hdev->discovery.report_invalid_rssi &&
+ link_type == ACL_LINK)
+ rssi = 0;
+
+ bacpy(&ev->addr.bdaddr, bdaddr);
+ ev->addr.type = link_to_bdaddr(link_type, addr_type);
+ ev->rssi = rssi;
+ ev->flags = cpu_to_le32(flags);
+
+ if (eir_len > 0)
+ /* Copy EIR or advertising data into event */
+ skb_put_data(skb, eir, eir_len);
+
+ if (dev_class && !eir_get_data(eir, eir_len, EIR_CLASS_OF_DEV, NULL)) {
+ u8 eir_cod[5];
+
+ eir_len += eir_append_data(eir_cod, 0, EIR_CLASS_OF_DEV,
+ dev_class, 3);
+ skb_put_data(skb, eir_cod, sizeof(eir_cod));
+ }
+
+ if (scan_rsp_len > 0)
+ /* Append scan response data to event */
+ skb_put_data(skb, scan_rsp, scan_rsp_len);
+
+ ev->eir_len = cpu_to_le16(eir_len + scan_rsp_len);
+
+ mgmt_adv_monitor_device_found(hdev, bdaddr, report_device, skb, NULL);
+}
+
+void mgmt_remote_name(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type,
+ u8 addr_type, s8 rssi, u8 *name, u8 name_len)
+{
+ struct sk_buff *skb;
+ struct mgmt_ev_device_found *ev;
+ u16 eir_len = 0;
+ u32 flags = 0;
+
+ skb = mgmt_alloc_skb(hdev, MGMT_EV_DEVICE_FOUND,
+ sizeof(*ev) + (name ? eir_precalc_len(name_len) : 0));
+ if (!skb)
+ return;
+
+ ev = skb_put(skb, sizeof(*ev));
+ bacpy(&ev->addr.bdaddr, bdaddr);
+ ev->addr.type = link_to_bdaddr(link_type, addr_type);
+ ev->rssi = rssi;
+
+ if (name)
+ eir_len += eir_skb_put_data(skb, EIR_NAME_COMPLETE, name, name_len);
+ else
+ flags = MGMT_DEV_FOUND_NAME_REQUEST_FAILED;
+
+ ev->eir_len = cpu_to_le16(eir_len);
+ ev->flags = cpu_to_le32(flags);
+
+ mgmt_event_skb(skb, NULL);
+}
+
+void mgmt_discovering(struct hci_dev *hdev, u8 discovering)
+{
+ struct mgmt_ev_discovering ev;
+
+ bt_dev_dbg(hdev, "discovering %u", discovering);
+
+ memset(&ev, 0, sizeof(ev));
+ ev.type = hdev->discovery.type;
+ ev.discovering = discovering;
+
+ mgmt_event(MGMT_EV_DISCOVERING, hdev, &ev, sizeof(ev), NULL);
+}
+
+void mgmt_suspending(struct hci_dev *hdev, u8 state)
+{
+ struct mgmt_ev_controller_suspend ev;
+
+ ev.suspend_state = state;
+ mgmt_event(MGMT_EV_CONTROLLER_SUSPEND, hdev, &ev, sizeof(ev), NULL);
+}
+
+void mgmt_resuming(struct hci_dev *hdev, u8 reason, bdaddr_t *bdaddr,
+ u8 addr_type)
+{
+ struct mgmt_ev_controller_resume ev;
+
+ ev.wake_reason = reason;
+ if (bdaddr) {
+ bacpy(&ev.addr.bdaddr, bdaddr);
+ ev.addr.type = addr_type;
+ } else {
+ memset(&ev.addr, 0, sizeof(ev.addr));
+ }
+
+ mgmt_event(MGMT_EV_CONTROLLER_RESUME, hdev, &ev, sizeof(ev), NULL);
+}
+
+static struct hci_mgmt_chan chan = {
+ .channel = HCI_CHANNEL_CONTROL,
+ .handler_count = ARRAY_SIZE(mgmt_handlers),
+ .handlers = mgmt_handlers,
+ .hdev_init = mgmt_init_hdev,
+};
+
+int mgmt_init(void)
+{
+ return hci_mgmt_chan_register(&chan);
+}
+
+void mgmt_exit(void)
+{
+ hci_mgmt_chan_unregister(&chan);
+}
+
+void mgmt_cleanup(struct sock *sk)
+{
+ struct mgmt_mesh_tx *mesh_tx;
+ struct hci_dev *hdev;
+
+ read_lock(&hci_dev_list_lock);
+
+ list_for_each_entry(hdev, &hci_dev_list, list) {
+ do {
+ mesh_tx = mgmt_mesh_next(hdev, sk);
+
+ if (mesh_tx)
+ mesh_send_complete(hdev, mesh_tx, true);
+ } while (mesh_tx);
+ }
+
+ read_unlock(&hci_dev_list_lock);
+}
diff --git a/net/bluetooth/mgmt_config.c b/net/bluetooth/mgmt_config.c
new file mode 100644
index 000000000000..c4063d200c0a
--- /dev/null
+++ b/net/bluetooth/mgmt_config.c
@@ -0,0 +1,346 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/*
+ * Copyright (C) 2020 Google Corporation
+ */
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+#include <net/bluetooth/mgmt.h>
+
+#include "mgmt_util.h"
+#include "mgmt_config.h"
+
+#define HDEV_PARAM_U16(_param_name_) \
+ struct {\
+ struct mgmt_tlv_hdr entry; \
+ __le16 value; \
+ } __packed _param_name_
+
+#define HDEV_PARAM_U8(_param_name_) \
+ struct {\
+ struct mgmt_tlv_hdr entry; \
+ __u8 value; \
+ } __packed _param_name_
+
+#define TLV_SET_U16(_param_code_, _param_name_) \
+ { \
+ { cpu_to_le16(_param_code_), sizeof(__u16) }, \
+ cpu_to_le16(hdev->_param_name_) \
+ }
+
+#define TLV_SET_U8(_param_code_, _param_name_) \
+ { \
+ { cpu_to_le16(_param_code_), sizeof(__u8) }, \
+ hdev->_param_name_ \
+ }
+
+#define TLV_SET_U16_JIFFIES_TO_MSECS(_param_code_, _param_name_) \
+ { \
+ { cpu_to_le16(_param_code_), sizeof(__u16) }, \
+ cpu_to_le16(jiffies_to_msecs(hdev->_param_name_)) \
+ }
+
+int read_def_system_config(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 data_len)
+{
+ int ret;
+ struct mgmt_rp_read_def_system_config {
+ /* Please see mgmt-api.txt for documentation of these values */
+ HDEV_PARAM_U16(def_page_scan_type);
+ HDEV_PARAM_U16(def_page_scan_int);
+ HDEV_PARAM_U16(def_page_scan_window);
+ HDEV_PARAM_U16(def_inq_scan_type);
+ HDEV_PARAM_U16(def_inq_scan_int);
+ HDEV_PARAM_U16(def_inq_scan_window);
+ HDEV_PARAM_U16(def_br_lsto);
+ HDEV_PARAM_U16(def_page_timeout);
+ HDEV_PARAM_U16(sniff_min_interval);
+ HDEV_PARAM_U16(sniff_max_interval);
+ HDEV_PARAM_U16(le_adv_min_interval);
+ HDEV_PARAM_U16(le_adv_max_interval);
+ HDEV_PARAM_U16(def_multi_adv_rotation_duration);
+ HDEV_PARAM_U16(le_scan_interval);
+ HDEV_PARAM_U16(le_scan_window);
+ HDEV_PARAM_U16(le_scan_int_suspend);
+ HDEV_PARAM_U16(le_scan_window_suspend);
+ HDEV_PARAM_U16(le_scan_int_discovery);
+ HDEV_PARAM_U16(le_scan_window_discovery);
+ HDEV_PARAM_U16(le_scan_int_adv_monitor);
+ HDEV_PARAM_U16(le_scan_window_adv_monitor);
+ HDEV_PARAM_U16(le_scan_int_connect);
+ HDEV_PARAM_U16(le_scan_window_connect);
+ HDEV_PARAM_U16(le_conn_min_interval);
+ HDEV_PARAM_U16(le_conn_max_interval);
+ HDEV_PARAM_U16(le_conn_latency);
+ HDEV_PARAM_U16(le_supv_timeout);
+ HDEV_PARAM_U16(def_le_autoconnect_timeout);
+ HDEV_PARAM_U16(advmon_allowlist_duration);
+ HDEV_PARAM_U16(advmon_no_filter_duration);
+ HDEV_PARAM_U8(enable_advmon_interleave_scan);
+ } __packed rp = {
+ TLV_SET_U16(0x0000, def_page_scan_type),
+ TLV_SET_U16(0x0001, def_page_scan_int),
+ TLV_SET_U16(0x0002, def_page_scan_window),
+ TLV_SET_U16(0x0003, def_inq_scan_type),
+ TLV_SET_U16(0x0004, def_inq_scan_int),
+ TLV_SET_U16(0x0005, def_inq_scan_window),
+ TLV_SET_U16(0x0006, def_br_lsto),
+ TLV_SET_U16(0x0007, def_page_timeout),
+ TLV_SET_U16(0x0008, sniff_min_interval),
+ TLV_SET_U16(0x0009, sniff_max_interval),
+ TLV_SET_U16(0x000a, le_adv_min_interval),
+ TLV_SET_U16(0x000b, le_adv_max_interval),
+ TLV_SET_U16(0x000c, def_multi_adv_rotation_duration),
+ TLV_SET_U16(0x000d, le_scan_interval),
+ TLV_SET_U16(0x000e, le_scan_window),
+ TLV_SET_U16(0x000f, le_scan_int_suspend),
+ TLV_SET_U16(0x0010, le_scan_window_suspend),
+ TLV_SET_U16(0x0011, le_scan_int_discovery),
+ TLV_SET_U16(0x0012, le_scan_window_discovery),
+ TLV_SET_U16(0x0013, le_scan_int_adv_monitor),
+ TLV_SET_U16(0x0014, le_scan_window_adv_monitor),
+ TLV_SET_U16(0x0015, le_scan_int_connect),
+ TLV_SET_U16(0x0016, le_scan_window_connect),
+ TLV_SET_U16(0x0017, le_conn_min_interval),
+ TLV_SET_U16(0x0018, le_conn_max_interval),
+ TLV_SET_U16(0x0019, le_conn_latency),
+ TLV_SET_U16(0x001a, le_supv_timeout),
+ TLV_SET_U16_JIFFIES_TO_MSECS(0x001b,
+ def_le_autoconnect_timeout),
+ TLV_SET_U16(0x001d, advmon_allowlist_duration),
+ TLV_SET_U16(0x001e, advmon_no_filter_duration),
+ TLV_SET_U8(0x001f, enable_advmon_interleave_scan),
+ };
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ ret = mgmt_cmd_complete(sk, hdev->id,
+ MGMT_OP_READ_DEF_SYSTEM_CONFIG,
+ 0, &rp, sizeof(rp));
+ return ret;
+}
+
+#define TO_TLV(x) ((struct mgmt_tlv *)(x))
+#define TLV_GET_LE16(tlv) le16_to_cpu(*((__le16 *)(TO_TLV(tlv)->value)))
+#define TLV_GET_U8(tlv) (*((__u8 *)(TO_TLV(tlv)->value)))
+
+int set_def_system_config(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 data_len)
+{
+ u16 buffer_left = data_len;
+ u8 *buffer = data;
+
+ if (buffer_left < sizeof(struct mgmt_tlv)) {
+ return mgmt_cmd_status(sk, hdev->id,
+ MGMT_OP_SET_DEF_SYSTEM_CONFIG,
+ MGMT_STATUS_INVALID_PARAMS);
+ }
+
+ /* First pass to validate the tlv */
+ while (buffer_left >= sizeof(struct mgmt_tlv)) {
+ const u8 len = TO_TLV(buffer)->length;
+ size_t exp_type_len;
+ const u16 exp_len = sizeof(struct mgmt_tlv) +
+ len;
+ const u16 type = le16_to_cpu(TO_TLV(buffer)->type);
+
+ if (buffer_left < exp_len) {
+ bt_dev_warn(hdev, "invalid len left %u, exp >= %u",
+ buffer_left, exp_len);
+
+ return mgmt_cmd_status(sk, hdev->id,
+ MGMT_OP_SET_DEF_SYSTEM_CONFIG,
+ MGMT_STATUS_INVALID_PARAMS);
+ }
+
+ /* Please see mgmt-api.txt for documentation of these values */
+ switch (type) {
+ case 0x0000:
+ case 0x0001:
+ case 0x0002:
+ case 0x0003:
+ case 0x0004:
+ case 0x0005:
+ case 0x0006:
+ case 0x0007:
+ case 0x0008:
+ case 0x0009:
+ case 0x000a:
+ case 0x000b:
+ case 0x000c:
+ case 0x000d:
+ case 0x000e:
+ case 0x000f:
+ case 0x0010:
+ case 0x0011:
+ case 0x0012:
+ case 0x0013:
+ case 0x0014:
+ case 0x0015:
+ case 0x0016:
+ case 0x0017:
+ case 0x0018:
+ case 0x0019:
+ case 0x001a:
+ case 0x001b:
+ case 0x001d:
+ case 0x001e:
+ exp_type_len = sizeof(u16);
+ break;
+ case 0x001f:
+ exp_type_len = sizeof(u8);
+ break;
+ default:
+ exp_type_len = 0;
+ bt_dev_warn(hdev, "unsupported parameter %u", type);
+ break;
+ }
+
+ if (exp_type_len && len != exp_type_len) {
+ bt_dev_warn(hdev, "invalid length %d, exp %zu for type %u",
+ len, exp_type_len, type);
+
+ return mgmt_cmd_status(sk, hdev->id,
+ MGMT_OP_SET_DEF_SYSTEM_CONFIG,
+ MGMT_STATUS_INVALID_PARAMS);
+ }
+
+ buffer_left -= exp_len;
+ buffer += exp_len;
+ }
+
+ buffer_left = data_len;
+ buffer = data;
+ while (buffer_left >= sizeof(struct mgmt_tlv)) {
+ const u8 len = TO_TLV(buffer)->length;
+ const u16 exp_len = sizeof(struct mgmt_tlv) +
+ len;
+ const u16 type = le16_to_cpu(TO_TLV(buffer)->type);
+
+ switch (type) {
+ case 0x0000:
+ hdev->def_page_scan_type = TLV_GET_LE16(buffer);
+ break;
+ case 0x0001:
+ hdev->def_page_scan_int = TLV_GET_LE16(buffer);
+ break;
+ case 0x0002:
+ hdev->def_page_scan_window = TLV_GET_LE16(buffer);
+ break;
+ case 0x0003:
+ hdev->def_inq_scan_type = TLV_GET_LE16(buffer);
+ break;
+ case 0x0004:
+ hdev->def_inq_scan_int = TLV_GET_LE16(buffer);
+ break;
+ case 0x0005:
+ hdev->def_inq_scan_window = TLV_GET_LE16(buffer);
+ break;
+ case 0x0006:
+ hdev->def_br_lsto = TLV_GET_LE16(buffer);
+ break;
+ case 0x0007:
+ hdev->def_page_timeout = TLV_GET_LE16(buffer);
+ break;
+ case 0x0008:
+ hdev->sniff_min_interval = TLV_GET_LE16(buffer);
+ break;
+ case 0x0009:
+ hdev->sniff_max_interval = TLV_GET_LE16(buffer);
+ break;
+ case 0x000a:
+ hdev->le_adv_min_interval = TLV_GET_LE16(buffer);
+ break;
+ case 0x000b:
+ hdev->le_adv_max_interval = TLV_GET_LE16(buffer);
+ break;
+ case 0x000c:
+ hdev->def_multi_adv_rotation_duration =
+ TLV_GET_LE16(buffer);
+ break;
+ case 0x000d:
+ hdev->le_scan_interval = TLV_GET_LE16(buffer);
+ break;
+ case 0x000e:
+ hdev->le_scan_window = TLV_GET_LE16(buffer);
+ break;
+ case 0x000f:
+ hdev->le_scan_int_suspend = TLV_GET_LE16(buffer);
+ break;
+ case 0x0010:
+ hdev->le_scan_window_suspend = TLV_GET_LE16(buffer);
+ break;
+ case 0x0011:
+ hdev->le_scan_int_discovery = TLV_GET_LE16(buffer);
+ break;
+ case 0x00012:
+ hdev->le_scan_window_discovery = TLV_GET_LE16(buffer);
+ break;
+ case 0x00013:
+ hdev->le_scan_int_adv_monitor = TLV_GET_LE16(buffer);
+ break;
+ case 0x00014:
+ hdev->le_scan_window_adv_monitor = TLV_GET_LE16(buffer);
+ break;
+ case 0x00015:
+ hdev->le_scan_int_connect = TLV_GET_LE16(buffer);
+ break;
+ case 0x00016:
+ hdev->le_scan_window_connect = TLV_GET_LE16(buffer);
+ break;
+ case 0x00017:
+ hdev->le_conn_min_interval = TLV_GET_LE16(buffer);
+ break;
+ case 0x00018:
+ hdev->le_conn_max_interval = TLV_GET_LE16(buffer);
+ break;
+ case 0x00019:
+ hdev->le_conn_latency = TLV_GET_LE16(buffer);
+ break;
+ case 0x0001a:
+ hdev->le_supv_timeout = TLV_GET_LE16(buffer);
+ break;
+ case 0x0001b:
+ hdev->def_le_autoconnect_timeout =
+ msecs_to_jiffies(TLV_GET_LE16(buffer));
+ break;
+ case 0x0001d:
+ hdev->advmon_allowlist_duration = TLV_GET_LE16(buffer);
+ break;
+ case 0x0001e:
+ hdev->advmon_no_filter_duration = TLV_GET_LE16(buffer);
+ break;
+ case 0x0001f:
+ hdev->enable_advmon_interleave_scan = TLV_GET_U8(buffer);
+ break;
+ default:
+ bt_dev_warn(hdev, "unsupported parameter %u", type);
+ break;
+ }
+
+ buffer_left -= exp_len;
+ buffer += exp_len;
+ }
+
+ return mgmt_cmd_complete(sk, hdev->id,
+ MGMT_OP_SET_DEF_SYSTEM_CONFIG, 0, NULL, 0);
+}
+
+int read_def_runtime_config(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 data_len)
+{
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ return mgmt_cmd_complete(sk, hdev->id,
+ MGMT_OP_READ_DEF_RUNTIME_CONFIG, 0, NULL, 0);
+}
+
+int set_def_runtime_config(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 data_len)
+{
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_DEF_SYSTEM_CONFIG,
+ MGMT_STATUS_INVALID_PARAMS);
+}
diff --git a/net/bluetooth/mgmt_config.h b/net/bluetooth/mgmt_config.h
new file mode 100644
index 000000000000..a4965f107891
--- /dev/null
+++ b/net/bluetooth/mgmt_config.h
@@ -0,0 +1,17 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/*
+ * Copyright (C) 2020 Google Corporation
+ */
+
+int read_def_system_config(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 data_len);
+
+int set_def_system_config(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 data_len);
+
+int read_def_runtime_config(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 data_len);
+
+int set_def_runtime_config(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 data_len);
diff --git a/net/bluetooth/mgmt_util.c b/net/bluetooth/mgmt_util.c
new file mode 100644
index 000000000000..aa7b5585cb26
--- /dev/null
+++ b/net/bluetooth/mgmt_util.c
@@ -0,0 +1,441 @@
+/*
+ BlueZ - Bluetooth protocol stack for Linux
+
+ Copyright (C) 2015 Intel Corporation
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License version 2 as
+ published by the Free Software Foundation;
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+ IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+ CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+ ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+ COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+ SOFTWARE IS DISCLAIMED.
+*/
+
+#include <linux/unaligned.h>
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+#include <net/bluetooth/hci_mon.h>
+#include <net/bluetooth/mgmt.h>
+
+#include "mgmt_util.h"
+
+static struct sk_buff *create_monitor_ctrl_event(__le16 index, u32 cookie,
+ u16 opcode, u16 len, void *buf)
+{
+ struct hci_mon_hdr *hdr;
+ struct sk_buff *skb;
+
+ skb = bt_skb_alloc(6 + len, GFP_ATOMIC);
+ if (!skb)
+ return NULL;
+
+ put_unaligned_le32(cookie, skb_put(skb, 4));
+ put_unaligned_le16(opcode, skb_put(skb, 2));
+
+ if (buf)
+ skb_put_data(skb, buf, len);
+
+ __net_timestamp(skb);
+
+ hdr = skb_push(skb, HCI_MON_HDR_SIZE);
+ hdr->opcode = cpu_to_le16(HCI_MON_CTRL_EVENT);
+ hdr->index = index;
+ hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE);
+
+ return skb;
+}
+
+struct sk_buff *mgmt_alloc_skb(struct hci_dev *hdev, u16 opcode,
+ unsigned int size)
+{
+ struct sk_buff *skb;
+
+ skb = alloc_skb(sizeof(struct mgmt_hdr) + size, GFP_KERNEL);
+ if (!skb)
+ return skb;
+
+ skb_reserve(skb, sizeof(struct mgmt_hdr));
+ bt_cb(skb)->mgmt.hdev = hdev;
+ bt_cb(skb)->mgmt.opcode = opcode;
+
+ return skb;
+}
+
+int mgmt_send_event_skb(unsigned short channel, struct sk_buff *skb, int flag,
+ struct sock *skip_sk)
+{
+ struct hci_dev *hdev;
+ struct mgmt_hdr *hdr;
+ int len;
+
+ if (!skb)
+ return -EINVAL;
+
+ len = skb->len;
+ hdev = bt_cb(skb)->mgmt.hdev;
+
+ /* Time stamp */
+ __net_timestamp(skb);
+
+ /* Send just the data, without headers, to the monitor */
+ if (channel == HCI_CHANNEL_CONTROL)
+ hci_send_monitor_ctrl_event(hdev, bt_cb(skb)->mgmt.opcode,
+ skb->data, skb->len,
+ skb_get_ktime(skb), flag, skip_sk);
+
+ hdr = skb_push(skb, sizeof(*hdr));
+ hdr->opcode = cpu_to_le16(bt_cb(skb)->mgmt.opcode);
+ if (hdev)
+ hdr->index = cpu_to_le16(hdev->id);
+ else
+ hdr->index = cpu_to_le16(MGMT_INDEX_NONE);
+ hdr->len = cpu_to_le16(len);
+
+ hci_send_to_channel(channel, skb, flag, skip_sk);
+
+ kfree_skb(skb);
+ return 0;
+}
+
+int mgmt_send_event(u16 event, struct hci_dev *hdev, unsigned short channel,
+ void *data, u16 data_len, int flag, struct sock *skip_sk)
+{
+ struct sk_buff *skb;
+
+ skb = mgmt_alloc_skb(hdev, event, data_len);
+ if (!skb)
+ return -ENOMEM;
+
+ if (data)
+ skb_put_data(skb, data, data_len);
+
+ return mgmt_send_event_skb(channel, skb, flag, skip_sk);
+}
+
+int mgmt_cmd_status(struct sock *sk, u16 index, u16 cmd, u8 status)
+{
+ struct sk_buff *skb, *mskb;
+ struct mgmt_hdr *hdr;
+ struct mgmt_ev_cmd_status *ev;
+ int err;
+
+ BT_DBG("sock %p, index %u, cmd %u, status %u", sk, index, cmd, status);
+
+ skb = alloc_skb(sizeof(*hdr) + sizeof(*ev), GFP_KERNEL);
+ if (!skb)
+ return -ENOMEM;
+
+ hdr = skb_put(skb, sizeof(*hdr));
+
+ hdr->opcode = cpu_to_le16(MGMT_EV_CMD_STATUS);
+ hdr->index = cpu_to_le16(index);
+ hdr->len = cpu_to_le16(sizeof(*ev));
+
+ ev = skb_put(skb, sizeof(*ev));
+ ev->status = status;
+ ev->opcode = cpu_to_le16(cmd);
+
+ mskb = create_monitor_ctrl_event(hdr->index, hci_sock_get_cookie(sk),
+ MGMT_EV_CMD_STATUS, sizeof(*ev), ev);
+ if (mskb)
+ skb->tstamp = mskb->tstamp;
+ else
+ __net_timestamp(skb);
+
+ err = sock_queue_rcv_skb(sk, skb);
+ if (err < 0)
+ kfree_skb(skb);
+
+ if (mskb) {
+ hci_send_to_channel(HCI_CHANNEL_MONITOR, mskb,
+ HCI_SOCK_TRUSTED, NULL);
+ kfree_skb(mskb);
+ }
+
+ return err;
+}
+
+int mgmt_cmd_complete(struct sock *sk, u16 index, u16 cmd, u8 status,
+ void *rp, size_t rp_len)
+{
+ struct sk_buff *skb, *mskb;
+ struct mgmt_hdr *hdr;
+ struct mgmt_ev_cmd_complete *ev;
+ int err;
+
+ BT_DBG("sock %p", sk);
+
+ skb = alloc_skb(sizeof(*hdr) + sizeof(*ev) + rp_len, GFP_KERNEL);
+ if (!skb)
+ return -ENOMEM;
+
+ hdr = skb_put(skb, sizeof(*hdr));
+
+ hdr->opcode = cpu_to_le16(MGMT_EV_CMD_COMPLETE);
+ hdr->index = cpu_to_le16(index);
+ hdr->len = cpu_to_le16(sizeof(*ev) + rp_len);
+
+ ev = skb_put(skb, sizeof(*ev) + rp_len);
+ ev->opcode = cpu_to_le16(cmd);
+ ev->status = status;
+
+ if (rp)
+ memcpy(ev->data, rp, rp_len);
+
+ mskb = create_monitor_ctrl_event(hdr->index, hci_sock_get_cookie(sk),
+ MGMT_EV_CMD_COMPLETE,
+ sizeof(*ev) + rp_len, ev);
+ if (mskb)
+ skb->tstamp = mskb->tstamp;
+ else
+ __net_timestamp(skb);
+
+ err = sock_queue_rcv_skb(sk, skb);
+ if (err < 0)
+ kfree_skb(skb);
+
+ if (mskb) {
+ hci_send_to_channel(HCI_CHANNEL_MONITOR, mskb,
+ HCI_SOCK_TRUSTED, NULL);
+ kfree_skb(mskb);
+ }
+
+ return err;
+}
+
+struct mgmt_pending_cmd *mgmt_pending_find(unsigned short channel, u16 opcode,
+ struct hci_dev *hdev)
+{
+ struct mgmt_pending_cmd *cmd, *tmp;
+
+ mutex_lock(&hdev->mgmt_pending_lock);
+
+ list_for_each_entry_safe(cmd, tmp, &hdev->mgmt_pending, list) {
+ if (hci_sock_get_channel(cmd->sk) != channel)
+ continue;
+
+ if (cmd->opcode == opcode) {
+ mutex_unlock(&hdev->mgmt_pending_lock);
+ return cmd;
+ }
+ }
+
+ mutex_unlock(&hdev->mgmt_pending_lock);
+
+ return NULL;
+}
+
+void mgmt_pending_foreach(u16 opcode, struct hci_dev *hdev, bool remove,
+ void (*cb)(struct mgmt_pending_cmd *cmd, void *data),
+ void *data)
+{
+ struct mgmt_pending_cmd *cmd, *tmp;
+
+ mutex_lock(&hdev->mgmt_pending_lock);
+
+ list_for_each_entry_safe(cmd, tmp, &hdev->mgmt_pending, list) {
+ if (opcode > 0 && cmd->opcode != opcode)
+ continue;
+
+ if (remove)
+ list_del(&cmd->list);
+
+ cb(cmd, data);
+
+ if (remove)
+ mgmt_pending_free(cmd);
+ }
+
+ mutex_unlock(&hdev->mgmt_pending_lock);
+}
+
+struct mgmt_pending_cmd *mgmt_pending_new(struct sock *sk, u16 opcode,
+ struct hci_dev *hdev,
+ void *data, u16 len)
+{
+ struct mgmt_pending_cmd *cmd;
+
+ cmd = kzalloc(sizeof(*cmd), GFP_KERNEL);
+ if (!cmd)
+ return NULL;
+
+ cmd->opcode = opcode;
+ cmd->hdev = hdev;
+
+ cmd->param = kmemdup(data, len, GFP_KERNEL);
+ if (!cmd->param) {
+ kfree(cmd);
+ return NULL;
+ }
+
+ cmd->param_len = len;
+
+ cmd->sk = sk;
+ sock_hold(sk);
+
+ return cmd;
+}
+
+struct mgmt_pending_cmd *mgmt_pending_add(struct sock *sk, u16 opcode,
+ struct hci_dev *hdev,
+ void *data, u16 len)
+{
+ struct mgmt_pending_cmd *cmd;
+
+ cmd = mgmt_pending_new(sk, opcode, hdev, data, len);
+ if (!cmd)
+ return NULL;
+
+ mutex_lock(&hdev->mgmt_pending_lock);
+ list_add_tail(&cmd->list, &hdev->mgmt_pending);
+ mutex_unlock(&hdev->mgmt_pending_lock);
+
+ return cmd;
+}
+
+void mgmt_pending_free(struct mgmt_pending_cmd *cmd)
+{
+ sock_put(cmd->sk);
+ kfree(cmd->param);
+ kfree(cmd);
+}
+
+void mgmt_pending_remove(struct mgmt_pending_cmd *cmd)
+{
+ mutex_lock(&cmd->hdev->mgmt_pending_lock);
+ list_del(&cmd->list);
+ mutex_unlock(&cmd->hdev->mgmt_pending_lock);
+
+ mgmt_pending_free(cmd);
+}
+
+bool __mgmt_pending_listed(struct hci_dev *hdev, struct mgmt_pending_cmd *cmd)
+{
+ struct mgmt_pending_cmd *tmp;
+
+ lockdep_assert_held(&hdev->mgmt_pending_lock);
+
+ if (!cmd)
+ return false;
+
+ list_for_each_entry(tmp, &hdev->mgmt_pending, list) {
+ if (cmd == tmp)
+ return true;
+ }
+
+ return false;
+}
+
+bool mgmt_pending_listed(struct hci_dev *hdev, struct mgmt_pending_cmd *cmd)
+{
+ bool listed;
+
+ mutex_lock(&hdev->mgmt_pending_lock);
+ listed = __mgmt_pending_listed(hdev, cmd);
+ mutex_unlock(&hdev->mgmt_pending_lock);
+
+ return listed;
+}
+
+bool mgmt_pending_valid(struct hci_dev *hdev, struct mgmt_pending_cmd *cmd)
+{
+ bool listed;
+
+ if (!cmd)
+ return false;
+
+ mutex_lock(&hdev->mgmt_pending_lock);
+
+ listed = __mgmt_pending_listed(hdev, cmd);
+ if (listed)
+ list_del(&cmd->list);
+
+ mutex_unlock(&hdev->mgmt_pending_lock);
+
+ return listed;
+}
+
+void mgmt_mesh_foreach(struct hci_dev *hdev,
+ void (*cb)(struct mgmt_mesh_tx *mesh_tx, void *data),
+ void *data, struct sock *sk)
+{
+ struct mgmt_mesh_tx *mesh_tx, *tmp;
+
+ list_for_each_entry_safe(mesh_tx, tmp, &hdev->mesh_pending, list) {
+ if (!sk || mesh_tx->sk == sk)
+ cb(mesh_tx, data);
+ }
+}
+
+struct mgmt_mesh_tx *mgmt_mesh_next(struct hci_dev *hdev, struct sock *sk)
+{
+ struct mgmt_mesh_tx *mesh_tx;
+
+ if (list_empty(&hdev->mesh_pending))
+ return NULL;
+
+ list_for_each_entry(mesh_tx, &hdev->mesh_pending, list) {
+ if (!sk || mesh_tx->sk == sk)
+ return mesh_tx;
+ }
+
+ return NULL;
+}
+
+struct mgmt_mesh_tx *mgmt_mesh_find(struct hci_dev *hdev, u8 handle)
+{
+ struct mgmt_mesh_tx *mesh_tx;
+
+ if (list_empty(&hdev->mesh_pending))
+ return NULL;
+
+ list_for_each_entry(mesh_tx, &hdev->mesh_pending, list) {
+ if (mesh_tx->handle == handle)
+ return mesh_tx;
+ }
+
+ return NULL;
+}
+
+struct mgmt_mesh_tx *mgmt_mesh_add(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 len)
+{
+ struct mgmt_mesh_tx *mesh_tx;
+
+ mesh_tx = kzalloc(sizeof(*mesh_tx), GFP_KERNEL);
+ if (!mesh_tx)
+ return NULL;
+
+ hdev->mesh_send_ref++;
+ if (!hdev->mesh_send_ref)
+ hdev->mesh_send_ref++;
+
+ mesh_tx->handle = hdev->mesh_send_ref;
+ mesh_tx->index = hdev->id;
+ memcpy(mesh_tx->param, data, len);
+ mesh_tx->param_len = len;
+ mesh_tx->sk = sk;
+ sock_hold(sk);
+
+ list_add_tail(&mesh_tx->list, &hdev->mesh_pending);
+
+ return mesh_tx;
+}
+
+void mgmt_mesh_remove(struct mgmt_mesh_tx *mesh_tx)
+{
+ list_del(&mesh_tx->list);
+ sock_put(mesh_tx->sk);
+ kfree(mesh_tx);
+}
diff --git a/net/bluetooth/mgmt_util.h b/net/bluetooth/mgmt_util.h
new file mode 100644
index 000000000000..bcba8c9d8952
--- /dev/null
+++ b/net/bluetooth/mgmt_util.h
@@ -0,0 +1,78 @@
+/*
+ BlueZ - Bluetooth protocol stack for Linux
+ Copyright (C) 2015 Intel Coropration
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License version 2 as
+ published by the Free Software Foundation;
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+ IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+ CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+ ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+ COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+ SOFTWARE IS DISCLAIMED.
+*/
+
+struct mgmt_mesh_tx {
+ struct list_head list;
+ int index;
+ size_t param_len;
+ struct sock *sk;
+ u8 handle;
+ u8 instance;
+ u8 param[sizeof(struct mgmt_cp_mesh_send) + 31];
+};
+
+struct mgmt_pending_cmd {
+ struct list_head list;
+ u16 opcode;
+ struct hci_dev *hdev;
+ void *param;
+ size_t param_len;
+ struct sock *sk;
+ struct sk_buff *skb;
+ void *user_data;
+ int (*cmd_complete)(struct mgmt_pending_cmd *cmd, u8 status);
+};
+
+struct sk_buff *mgmt_alloc_skb(struct hci_dev *hdev, u16 opcode,
+ unsigned int size);
+int mgmt_send_event_skb(unsigned short channel, struct sk_buff *skb, int flag,
+ struct sock *skip_sk);
+int mgmt_send_event(u16 event, struct hci_dev *hdev, unsigned short channel,
+ void *data, u16 data_len, int flag, struct sock *skip_sk);
+int mgmt_cmd_status(struct sock *sk, u16 index, u16 cmd, u8 status);
+int mgmt_cmd_complete(struct sock *sk, u16 index, u16 cmd, u8 status,
+ void *rp, size_t rp_len);
+
+struct mgmt_pending_cmd *mgmt_pending_find(unsigned short channel, u16 opcode,
+ struct hci_dev *hdev);
+void mgmt_pending_foreach(u16 opcode, struct hci_dev *hdev, bool remove,
+ void (*cb)(struct mgmt_pending_cmd *cmd, void *data),
+ void *data);
+struct mgmt_pending_cmd *mgmt_pending_add(struct sock *sk, u16 opcode,
+ struct hci_dev *hdev,
+ void *data, u16 len);
+struct mgmt_pending_cmd *mgmt_pending_new(struct sock *sk, u16 opcode,
+ struct hci_dev *hdev,
+ void *data, u16 len);
+void mgmt_pending_free(struct mgmt_pending_cmd *cmd);
+void mgmt_pending_remove(struct mgmt_pending_cmd *cmd);
+bool __mgmt_pending_listed(struct hci_dev *hdev, struct mgmt_pending_cmd *cmd);
+bool mgmt_pending_listed(struct hci_dev *hdev, struct mgmt_pending_cmd *cmd);
+bool mgmt_pending_valid(struct hci_dev *hdev, struct mgmt_pending_cmd *cmd);
+void mgmt_mesh_foreach(struct hci_dev *hdev,
+ void (*cb)(struct mgmt_mesh_tx *mesh_tx, void *data),
+ void *data, struct sock *sk);
+struct mgmt_mesh_tx *mgmt_mesh_find(struct hci_dev *hdev, u8 handle);
+struct mgmt_mesh_tx *mgmt_mesh_next(struct hci_dev *hdev, struct sock *sk);
+struct mgmt_mesh_tx *mgmt_mesh_add(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 len);
+void mgmt_mesh_remove(struct mgmt_mesh_tx *mesh_tx);
diff --git a/net/bluetooth/msft.c b/net/bluetooth/msft.c
new file mode 100644
index 000000000000..c560d8467669
--- /dev/null
+++ b/net/bluetooth/msft.c
@@ -0,0 +1,1201 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 Google Corporation
+ */
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+#include <net/bluetooth/mgmt.h>
+
+#include "mgmt_util.h"
+#include "msft.h"
+
+#define MSFT_RSSI_THRESHOLD_VALUE_MIN -127
+#define MSFT_RSSI_THRESHOLD_VALUE_MAX 20
+#define MSFT_RSSI_LOW_TIMEOUT_MAX 0x3C
+
+#define MSFT_OP_READ_SUPPORTED_FEATURES 0x00
+struct msft_cp_read_supported_features {
+ __u8 sub_opcode;
+} __packed;
+
+struct msft_rp_read_supported_features {
+ __u8 status;
+ __u8 sub_opcode;
+ __le64 features;
+ __u8 evt_prefix_len;
+ __u8 evt_prefix[];
+} __packed;
+
+#define MSFT_OP_LE_MONITOR_ADVERTISEMENT 0x03
+#define MSFT_MONITOR_ADVERTISEMENT_TYPE_PATTERN 0x01
+struct msft_le_monitor_advertisement_pattern {
+ __u8 length;
+ __u8 data_type;
+ __u8 start_byte;
+ __u8 pattern[];
+};
+
+struct msft_le_monitor_advertisement_pattern_data {
+ __u8 count;
+ __u8 data[];
+};
+
+struct msft_cp_le_monitor_advertisement {
+ __u8 sub_opcode;
+ __s8 rssi_high;
+ __s8 rssi_low;
+ __u8 rssi_low_interval;
+ __u8 rssi_sampling_period;
+ __u8 cond_type;
+ __u8 data[];
+} __packed;
+
+struct msft_rp_le_monitor_advertisement {
+ __u8 status;
+ __u8 sub_opcode;
+ __u8 handle;
+} __packed;
+
+#define MSFT_OP_LE_CANCEL_MONITOR_ADVERTISEMENT 0x04
+struct msft_cp_le_cancel_monitor_advertisement {
+ __u8 sub_opcode;
+ __u8 handle;
+} __packed;
+
+struct msft_rp_le_cancel_monitor_advertisement {
+ __u8 status;
+ __u8 sub_opcode;
+} __packed;
+
+#define MSFT_OP_LE_SET_ADVERTISEMENT_FILTER_ENABLE 0x05
+struct msft_cp_le_set_advertisement_filter_enable {
+ __u8 sub_opcode;
+ __u8 enable;
+} __packed;
+
+struct msft_rp_le_set_advertisement_filter_enable {
+ __u8 status;
+ __u8 sub_opcode;
+} __packed;
+
+#define MSFT_EV_LE_MONITOR_DEVICE 0x02
+struct msft_ev_le_monitor_device {
+ __u8 addr_type;
+ bdaddr_t bdaddr;
+ __u8 monitor_handle;
+ __u8 monitor_state;
+} __packed;
+
+struct msft_monitor_advertisement_handle_data {
+ __u8 msft_handle;
+ __u16 mgmt_handle;
+ __s8 rssi_high;
+ __s8 rssi_low;
+ __u8 rssi_low_interval;
+ __u8 rssi_sampling_period;
+ __u8 cond_type;
+ struct list_head list;
+};
+
+enum monitor_addr_filter_state {
+ AF_STATE_IDLE,
+ AF_STATE_ADDING,
+ AF_STATE_ADDED,
+ AF_STATE_REMOVING,
+};
+
+#define MSFT_MONITOR_ADVERTISEMENT_TYPE_ADDR 0x04
+struct msft_monitor_addr_filter_data {
+ __u8 msft_handle;
+ __u8 pattern_handle; /* address filters pertain to */
+ __u16 mgmt_handle;
+ int state;
+ __s8 rssi_high;
+ __s8 rssi_low;
+ __u8 rssi_low_interval;
+ __u8 rssi_sampling_period;
+ __u8 addr_type;
+ bdaddr_t bdaddr;
+ struct list_head list;
+};
+
+struct msft_data {
+ __u64 features;
+ __u8 evt_prefix_len;
+ __u8 *evt_prefix;
+ struct list_head handle_map;
+ struct list_head address_filters;
+ __u8 resuming;
+ __u8 suspending;
+ __u8 filter_enabled;
+ /* To synchronize add/remove address filter and monitor device event.*/
+ struct mutex filter_lock;
+};
+
+bool msft_monitor_supported(struct hci_dev *hdev)
+{
+ return !!(msft_get_features(hdev) & MSFT_FEATURE_MASK_LE_ADV_MONITOR);
+}
+
+static bool read_supported_features(struct hci_dev *hdev,
+ struct msft_data *msft)
+{
+ struct msft_cp_read_supported_features cp;
+ struct msft_rp_read_supported_features *rp;
+ struct sk_buff *skb;
+
+ cp.sub_opcode = MSFT_OP_READ_SUPPORTED_FEATURES;
+
+ skb = __hci_cmd_sync(hdev, hdev->msft_opcode, sizeof(cp), &cp,
+ HCI_CMD_TIMEOUT);
+ if (IS_ERR(skb)) {
+ bt_dev_err(hdev, "Failed to read MSFT supported features (%ld)",
+ PTR_ERR(skb));
+ return false;
+ }
+
+ if (skb->len < sizeof(*rp)) {
+ bt_dev_err(hdev, "MSFT supported features length mismatch");
+ goto failed;
+ }
+
+ rp = (struct msft_rp_read_supported_features *)skb->data;
+
+ if (rp->sub_opcode != MSFT_OP_READ_SUPPORTED_FEATURES)
+ goto failed;
+
+ if (rp->evt_prefix_len > 0) {
+ msft->evt_prefix = kmemdup(rp->evt_prefix, rp->evt_prefix_len,
+ GFP_KERNEL);
+ if (!msft->evt_prefix)
+ goto failed;
+ }
+
+ msft->evt_prefix_len = rp->evt_prefix_len;
+ msft->features = __le64_to_cpu(rp->features);
+
+ if (msft->features & MSFT_FEATURE_MASK_CURVE_VALIDITY)
+ hdev->msft_curve_validity = true;
+
+ kfree_skb(skb);
+ return true;
+
+failed:
+ kfree_skb(skb);
+ return false;
+}
+
+/* is_mgmt = true matches the handle exposed to userspace via mgmt.
+ * is_mgmt = false matches the handle used by the msft controller.
+ * This function requires the caller holds hdev->lock
+ */
+static struct msft_monitor_advertisement_handle_data *msft_find_handle_data
+ (struct hci_dev *hdev, u16 handle, bool is_mgmt)
+{
+ struct msft_monitor_advertisement_handle_data *entry;
+ struct msft_data *msft = hdev->msft_data;
+
+ list_for_each_entry(entry, &msft->handle_map, list) {
+ if (is_mgmt && entry->mgmt_handle == handle)
+ return entry;
+ if (!is_mgmt && entry->msft_handle == handle)
+ return entry;
+ }
+
+ return NULL;
+}
+
+/* This function requires the caller holds msft->filter_lock */
+static struct msft_monitor_addr_filter_data *msft_find_address_data
+ (struct hci_dev *hdev, u8 addr_type, bdaddr_t *addr,
+ u8 pattern_handle)
+{
+ struct msft_monitor_addr_filter_data *entry;
+ struct msft_data *msft = hdev->msft_data;
+
+ list_for_each_entry(entry, &msft->address_filters, list) {
+ if (entry->pattern_handle == pattern_handle &&
+ addr_type == entry->addr_type &&
+ !bacmp(addr, &entry->bdaddr))
+ return entry;
+ }
+
+ return NULL;
+}
+
+/* This function requires the caller holds hdev->lock */
+static int msft_monitor_device_del(struct hci_dev *hdev, __u16 mgmt_handle,
+ bdaddr_t *bdaddr, __u8 addr_type,
+ bool notify)
+{
+ struct monitored_device *dev, *tmp;
+ int count = 0;
+
+ list_for_each_entry_safe(dev, tmp, &hdev->monitored_devices, list) {
+ /* mgmt_handle == 0 indicates remove all devices, whereas,
+ * bdaddr == NULL indicates remove all devices matching the
+ * mgmt_handle.
+ */
+ if ((!mgmt_handle || dev->handle == mgmt_handle) &&
+ (!bdaddr || (!bacmp(bdaddr, &dev->bdaddr) &&
+ addr_type == dev->addr_type))) {
+ if (notify && dev->notified) {
+ mgmt_adv_monitor_device_lost(hdev, dev->handle,
+ &dev->bdaddr,
+ dev->addr_type);
+ }
+
+ list_del(&dev->list);
+ kfree(dev);
+ count++;
+ }
+ }
+
+ return count;
+}
+
+static int msft_le_monitor_advertisement_cb(struct hci_dev *hdev, u16 opcode,
+ struct adv_monitor *monitor,
+ struct sk_buff *skb)
+{
+ struct msft_rp_le_monitor_advertisement *rp;
+ struct msft_monitor_advertisement_handle_data *handle_data;
+ struct msft_data *msft = hdev->msft_data;
+ int status = 0;
+
+ hci_dev_lock(hdev);
+
+ rp = (struct msft_rp_le_monitor_advertisement *)skb->data;
+ if (skb->len < sizeof(*rp)) {
+ status = HCI_ERROR_UNSPECIFIED;
+ goto unlock;
+ }
+
+ status = rp->status;
+ if (status)
+ goto unlock;
+
+ handle_data = kmalloc(sizeof(*handle_data), GFP_KERNEL);
+ if (!handle_data) {
+ status = HCI_ERROR_UNSPECIFIED;
+ goto unlock;
+ }
+
+ handle_data->mgmt_handle = monitor->handle;
+ handle_data->msft_handle = rp->handle;
+ handle_data->cond_type = MSFT_MONITOR_ADVERTISEMENT_TYPE_PATTERN;
+ INIT_LIST_HEAD(&handle_data->list);
+ list_add(&handle_data->list, &msft->handle_map);
+
+ monitor->state = ADV_MONITOR_STATE_OFFLOADED;
+
+unlock:
+ if (status)
+ hci_free_adv_monitor(hdev, monitor);
+
+ hci_dev_unlock(hdev);
+
+ return status;
+}
+
+/* This function requires the caller holds hci_req_sync_lock */
+static void msft_remove_addr_filters_sync(struct hci_dev *hdev, u8 handle)
+{
+ struct msft_monitor_addr_filter_data *address_filter, *n;
+ struct msft_cp_le_cancel_monitor_advertisement cp;
+ struct msft_data *msft = hdev->msft_data;
+ struct list_head head;
+ struct sk_buff *skb;
+
+ INIT_LIST_HEAD(&head);
+
+ /* Cancel all corresponding address monitors */
+ mutex_lock(&msft->filter_lock);
+
+ list_for_each_entry_safe(address_filter, n, &msft->address_filters,
+ list) {
+ if (address_filter->pattern_handle != handle)
+ continue;
+
+ list_del(&address_filter->list);
+
+ /* Keep the address filter and let
+ * msft_add_address_filter_sync() remove and free the address
+ * filter.
+ */
+ if (address_filter->state == AF_STATE_ADDING) {
+ address_filter->state = AF_STATE_REMOVING;
+ continue;
+ }
+
+ /* Keep the address filter and let
+ * msft_cancel_address_filter_sync() remove and free the address
+ * filter
+ */
+ if (address_filter->state == AF_STATE_REMOVING)
+ continue;
+
+ list_add_tail(&address_filter->list, &head);
+ }
+
+ mutex_unlock(&msft->filter_lock);
+
+ list_for_each_entry_safe(address_filter, n, &head, list) {
+ list_del(&address_filter->list);
+
+ cp.sub_opcode = MSFT_OP_LE_CANCEL_MONITOR_ADVERTISEMENT;
+ cp.handle = address_filter->msft_handle;
+
+ skb = __hci_cmd_sync(hdev, hdev->msft_opcode, sizeof(cp), &cp,
+ HCI_CMD_TIMEOUT);
+ if (IS_ERR(skb)) {
+ kfree(address_filter);
+ continue;
+ }
+
+ kfree_skb(skb);
+
+ bt_dev_dbg(hdev, "MSFT: Canceled device %pMR address filter",
+ &address_filter->bdaddr);
+
+ kfree(address_filter);
+ }
+}
+
+static int msft_le_cancel_monitor_advertisement_cb(struct hci_dev *hdev,
+ u16 opcode,
+ struct adv_monitor *monitor,
+ struct sk_buff *skb)
+{
+ struct msft_rp_le_cancel_monitor_advertisement *rp;
+ struct msft_monitor_advertisement_handle_data *handle_data;
+ struct msft_data *msft = hdev->msft_data;
+ int status = 0;
+ u8 msft_handle;
+
+ rp = (struct msft_rp_le_cancel_monitor_advertisement *)skb->data;
+ if (skb->len < sizeof(*rp)) {
+ status = HCI_ERROR_UNSPECIFIED;
+ goto done;
+ }
+
+ status = rp->status;
+ if (status)
+ goto done;
+
+ hci_dev_lock(hdev);
+
+ handle_data = msft_find_handle_data(hdev, monitor->handle, true);
+
+ if (handle_data) {
+ if (monitor->state == ADV_MONITOR_STATE_OFFLOADED)
+ monitor->state = ADV_MONITOR_STATE_REGISTERED;
+
+ /* Do not free the monitor if it is being removed due to
+ * suspend. It will be re-monitored on resume.
+ */
+ if (!msft->suspending) {
+ hci_free_adv_monitor(hdev, monitor);
+
+ /* Clear any monitored devices by this Adv Monitor */
+ msft_monitor_device_del(hdev, handle_data->mgmt_handle,
+ NULL, 0, false);
+ }
+
+ msft_handle = handle_data->msft_handle;
+
+ list_del(&handle_data->list);
+ kfree(handle_data);
+
+ hci_dev_unlock(hdev);
+
+ msft_remove_addr_filters_sync(hdev, msft_handle);
+ } else {
+ hci_dev_unlock(hdev);
+ }
+
+done:
+ return status;
+}
+
+/* This function requires the caller holds hci_req_sync_lock */
+static int msft_remove_monitor_sync(struct hci_dev *hdev,
+ struct adv_monitor *monitor)
+{
+ struct msft_cp_le_cancel_monitor_advertisement cp;
+ struct msft_monitor_advertisement_handle_data *handle_data;
+ struct sk_buff *skb;
+
+ handle_data = msft_find_handle_data(hdev, monitor->handle, true);
+
+ /* If no matched handle, just remove without telling controller */
+ if (!handle_data)
+ return -ENOENT;
+
+ cp.sub_opcode = MSFT_OP_LE_CANCEL_MONITOR_ADVERTISEMENT;
+ cp.handle = handle_data->msft_handle;
+
+ skb = __hci_cmd_sync(hdev, hdev->msft_opcode, sizeof(cp), &cp,
+ HCI_CMD_TIMEOUT);
+ if (IS_ERR(skb))
+ return PTR_ERR(skb);
+
+ return msft_le_cancel_monitor_advertisement_cb(hdev, hdev->msft_opcode,
+ monitor, skb);
+}
+
+/* This function requires the caller holds hci_req_sync_lock */
+int msft_suspend_sync(struct hci_dev *hdev)
+{
+ struct msft_data *msft = hdev->msft_data;
+ struct adv_monitor *monitor;
+ int handle = 0;
+
+ if (!msft || !msft_monitor_supported(hdev))
+ return 0;
+
+ msft->suspending = true;
+
+ while (1) {
+ monitor = idr_get_next(&hdev->adv_monitors_idr, &handle);
+ if (!monitor)
+ break;
+
+ msft_remove_monitor_sync(hdev, monitor);
+
+ handle++;
+ }
+
+ /* All monitors have been removed */
+ msft->suspending = false;
+
+ return 0;
+}
+
+static bool msft_monitor_rssi_valid(struct adv_monitor *monitor)
+{
+ struct adv_rssi_thresholds *r = &monitor->rssi;
+
+ if (r->high_threshold < MSFT_RSSI_THRESHOLD_VALUE_MIN ||
+ r->high_threshold > MSFT_RSSI_THRESHOLD_VALUE_MAX ||
+ r->low_threshold < MSFT_RSSI_THRESHOLD_VALUE_MIN ||
+ r->low_threshold > MSFT_RSSI_THRESHOLD_VALUE_MAX)
+ return false;
+
+ /* High_threshold_timeout is not supported,
+ * once high_threshold is reached, events are immediately reported.
+ */
+ if (r->high_threshold_timeout != 0)
+ return false;
+
+ if (r->low_threshold_timeout > MSFT_RSSI_LOW_TIMEOUT_MAX)
+ return false;
+
+ /* Sampling period from 0x00 to 0xFF are all allowed */
+ return true;
+}
+
+static bool msft_monitor_pattern_valid(struct adv_monitor *monitor)
+{
+ return msft_monitor_rssi_valid(monitor);
+ /* No additional check needed for pattern-based monitor */
+}
+
+static int msft_add_monitor_sync(struct hci_dev *hdev,
+ struct adv_monitor *monitor)
+{
+ struct msft_cp_le_monitor_advertisement *cp;
+ struct msft_le_monitor_advertisement_pattern_data *pattern_data;
+ struct msft_monitor_advertisement_handle_data *handle_data;
+ struct msft_le_monitor_advertisement_pattern *pattern;
+ struct adv_pattern *entry;
+ size_t total_size = sizeof(*cp) + sizeof(*pattern_data);
+ ptrdiff_t offset = 0;
+ u8 pattern_count = 0;
+ struct sk_buff *skb;
+ int err;
+
+ if (!msft_monitor_pattern_valid(monitor))
+ return -EINVAL;
+
+ list_for_each_entry(entry, &monitor->patterns, list) {
+ pattern_count++;
+ total_size += sizeof(*pattern) + entry->length;
+ }
+
+ cp = kmalloc(total_size, GFP_KERNEL);
+ if (!cp)
+ return -ENOMEM;
+
+ cp->sub_opcode = MSFT_OP_LE_MONITOR_ADVERTISEMENT;
+ cp->rssi_high = monitor->rssi.high_threshold;
+ cp->rssi_low = monitor->rssi.low_threshold;
+ cp->rssi_low_interval = (u8)monitor->rssi.low_threshold_timeout;
+ cp->rssi_sampling_period = monitor->rssi.sampling_period;
+
+ cp->cond_type = MSFT_MONITOR_ADVERTISEMENT_TYPE_PATTERN;
+
+ pattern_data = (void *)cp->data;
+ pattern_data->count = pattern_count;
+
+ list_for_each_entry(entry, &monitor->patterns, list) {
+ pattern = (void *)(pattern_data->data + offset);
+ /* the length also includes data_type and offset */
+ pattern->length = entry->length + 2;
+ pattern->data_type = entry->ad_type;
+ pattern->start_byte = entry->offset;
+ memcpy(pattern->pattern, entry->value, entry->length);
+ offset += sizeof(*pattern) + entry->length;
+ }
+
+ skb = __hci_cmd_sync(hdev, hdev->msft_opcode, total_size, cp,
+ HCI_CMD_TIMEOUT);
+
+ if (IS_ERR(skb)) {
+ err = PTR_ERR(skb);
+ goto out_free;
+ }
+
+ err = msft_le_monitor_advertisement_cb(hdev, hdev->msft_opcode,
+ monitor, skb);
+ if (err)
+ goto out_free;
+
+ handle_data = msft_find_handle_data(hdev, monitor->handle, true);
+ if (!handle_data) {
+ err = -ENODATA;
+ goto out_free;
+ }
+
+ handle_data->rssi_high = cp->rssi_high;
+ handle_data->rssi_low = cp->rssi_low;
+ handle_data->rssi_low_interval = cp->rssi_low_interval;
+ handle_data->rssi_sampling_period = cp->rssi_sampling_period;
+
+out_free:
+ kfree(cp);
+ return err;
+}
+
+/* This function requires the caller holds hci_req_sync_lock */
+static void reregister_monitor(struct hci_dev *hdev)
+{
+ struct adv_monitor *monitor;
+ struct msft_data *msft = hdev->msft_data;
+ int handle = 0;
+
+ if (!msft)
+ return;
+
+ msft->resuming = true;
+
+ while (1) {
+ monitor = idr_get_next(&hdev->adv_monitors_idr, &handle);
+ if (!monitor)
+ break;
+
+ msft_add_monitor_sync(hdev, monitor);
+
+ handle++;
+ }
+
+ /* All monitors have been reregistered */
+ msft->resuming = false;
+}
+
+/* This function requires the caller holds hci_req_sync_lock */
+int msft_resume_sync(struct hci_dev *hdev)
+{
+ struct msft_data *msft = hdev->msft_data;
+
+ if (!msft || !msft_monitor_supported(hdev))
+ return 0;
+
+ hci_dev_lock(hdev);
+
+ /* Clear already tracked devices on resume. Once the monitors are
+ * reregistered, devices in range will be found again after resume.
+ */
+ hdev->advmon_pend_notify = false;
+ msft_monitor_device_del(hdev, 0, NULL, 0, true);
+
+ hci_dev_unlock(hdev);
+
+ reregister_monitor(hdev);
+
+ return 0;
+}
+
+/* This function requires the caller holds hci_req_sync_lock */
+void msft_do_open(struct hci_dev *hdev)
+{
+ struct msft_data *msft = hdev->msft_data;
+
+ if (hdev->msft_opcode == HCI_OP_NOP)
+ return;
+
+ if (!msft) {
+ bt_dev_err(hdev, "MSFT extension not registered");
+ return;
+ }
+
+ bt_dev_dbg(hdev, "Initialize MSFT extension");
+
+ /* Reset existing MSFT data before re-reading */
+ kfree(msft->evt_prefix);
+ msft->evt_prefix = NULL;
+ msft->evt_prefix_len = 0;
+ msft->features = 0;
+
+ if (!read_supported_features(hdev, msft)) {
+ hdev->msft_data = NULL;
+ kfree(msft);
+ return;
+ }
+
+ if (msft_monitor_supported(hdev)) {
+ msft->resuming = true;
+ msft_set_filter_enable(hdev, true);
+ /* Monitors get removed on power off, so we need to explicitly
+ * tell the controller to re-monitor.
+ */
+ reregister_monitor(hdev);
+ }
+}
+
+void msft_do_close(struct hci_dev *hdev)
+{
+ struct msft_data *msft = hdev->msft_data;
+ struct msft_monitor_advertisement_handle_data *handle_data, *tmp;
+ struct msft_monitor_addr_filter_data *address_filter, *n;
+ struct adv_monitor *monitor;
+
+ if (!msft)
+ return;
+
+ bt_dev_dbg(hdev, "Cleanup of MSFT extension");
+
+ /* The controller will silently remove all monitors on power off.
+ * Therefore, remove handle_data mapping and reset monitor state.
+ */
+ list_for_each_entry_safe(handle_data, tmp, &msft->handle_map, list) {
+ monitor = idr_find(&hdev->adv_monitors_idr,
+ handle_data->mgmt_handle);
+
+ if (monitor && monitor->state == ADV_MONITOR_STATE_OFFLOADED)
+ monitor->state = ADV_MONITOR_STATE_REGISTERED;
+
+ list_del(&handle_data->list);
+ kfree(handle_data);
+ }
+
+ mutex_lock(&msft->filter_lock);
+ list_for_each_entry_safe(address_filter, n, &msft->address_filters,
+ list) {
+ list_del(&address_filter->list);
+ kfree(address_filter);
+ }
+ mutex_unlock(&msft->filter_lock);
+
+ hci_dev_lock(hdev);
+
+ /* Clear any devices that are being monitored and notify device lost */
+ hdev->advmon_pend_notify = false;
+ msft_monitor_device_del(hdev, 0, NULL, 0, true);
+
+ hci_dev_unlock(hdev);
+}
+
+static int msft_cancel_address_filter_sync(struct hci_dev *hdev, void *data)
+{
+ struct msft_monitor_addr_filter_data *address_filter = data;
+ struct msft_cp_le_cancel_monitor_advertisement cp;
+ struct msft_data *msft = hdev->msft_data;
+ struct sk_buff *skb;
+ int err = 0;
+
+ if (!msft) {
+ bt_dev_err(hdev, "MSFT: msft data is freed");
+ return -EINVAL;
+ }
+
+ /* The address filter has been removed by hci dev close */
+ if (!test_bit(HCI_UP, &hdev->flags))
+ return 0;
+
+ mutex_lock(&msft->filter_lock);
+ list_del(&address_filter->list);
+ mutex_unlock(&msft->filter_lock);
+
+ cp.sub_opcode = MSFT_OP_LE_CANCEL_MONITOR_ADVERTISEMENT;
+ cp.handle = address_filter->msft_handle;
+
+ skb = __hci_cmd_sync(hdev, hdev->msft_opcode, sizeof(cp), &cp,
+ HCI_CMD_TIMEOUT);
+ if (IS_ERR(skb)) {
+ bt_dev_err(hdev, "MSFT: Failed to cancel address (%pMR) filter",
+ &address_filter->bdaddr);
+ err = PTR_ERR(skb);
+ goto done;
+ }
+ kfree_skb(skb);
+
+ bt_dev_dbg(hdev, "MSFT: Canceled device %pMR address filter",
+ &address_filter->bdaddr);
+
+done:
+ kfree(address_filter);
+
+ return err;
+}
+
+void msft_register(struct hci_dev *hdev)
+{
+ struct msft_data *msft = NULL;
+
+ bt_dev_dbg(hdev, "Register MSFT extension");
+
+ msft = kzalloc(sizeof(*msft), GFP_KERNEL);
+ if (!msft) {
+ bt_dev_err(hdev, "Failed to register MSFT extension");
+ return;
+ }
+
+ INIT_LIST_HEAD(&msft->handle_map);
+ INIT_LIST_HEAD(&msft->address_filters);
+ hdev->msft_data = msft;
+ mutex_init(&msft->filter_lock);
+}
+
+void msft_release(struct hci_dev *hdev)
+{
+ struct msft_data *msft = hdev->msft_data;
+
+ if (!msft)
+ return;
+
+ bt_dev_dbg(hdev, "Unregister MSFT extension");
+
+ hdev->msft_data = NULL;
+
+ kfree(msft->evt_prefix);
+ mutex_destroy(&msft->filter_lock);
+ kfree(msft);
+}
+
+/* This function requires the caller holds hdev->lock */
+static void msft_device_found(struct hci_dev *hdev, bdaddr_t *bdaddr,
+ __u8 addr_type, __u16 mgmt_handle)
+{
+ struct monitored_device *dev;
+
+ dev = kmalloc(sizeof(*dev), GFP_KERNEL);
+ if (!dev) {
+ bt_dev_err(hdev, "MSFT vendor event %u: no memory",
+ MSFT_EV_LE_MONITOR_DEVICE);
+ return;
+ }
+
+ bacpy(&dev->bdaddr, bdaddr);
+ dev->addr_type = addr_type;
+ dev->handle = mgmt_handle;
+ dev->notified = false;
+
+ INIT_LIST_HEAD(&dev->list);
+ list_add(&dev->list, &hdev->monitored_devices);
+ hdev->advmon_pend_notify = true;
+}
+
+/* This function requires the caller holds hdev->lock */
+static void msft_device_lost(struct hci_dev *hdev, bdaddr_t *bdaddr,
+ __u8 addr_type, __u16 mgmt_handle)
+{
+ if (!msft_monitor_device_del(hdev, mgmt_handle, bdaddr, addr_type,
+ true)) {
+ bt_dev_err(hdev, "MSFT vendor event %u: dev %pMR not in list",
+ MSFT_EV_LE_MONITOR_DEVICE, bdaddr);
+ }
+}
+
+static void *msft_skb_pull(struct hci_dev *hdev, struct sk_buff *skb,
+ u8 ev, size_t len)
+{
+ void *data;
+
+ data = skb_pull_data(skb, len);
+ if (!data)
+ bt_dev_err(hdev, "Malformed MSFT vendor event: 0x%02x", ev);
+
+ return data;
+}
+
+static int msft_add_address_filter_sync(struct hci_dev *hdev, void *data)
+{
+ struct msft_monitor_addr_filter_data *address_filter = data;
+ struct msft_rp_le_monitor_advertisement *rp;
+ struct msft_cp_le_monitor_advertisement *cp;
+ struct msft_data *msft = hdev->msft_data;
+ struct sk_buff *skb = NULL;
+ bool remove = false;
+ size_t size;
+
+ if (!msft) {
+ bt_dev_err(hdev, "MSFT: msft data is freed");
+ return -EINVAL;
+ }
+
+ /* The address filter has been removed by hci dev close */
+ if (!test_bit(HCI_UP, &hdev->flags))
+ return -ENODEV;
+
+ /* We are safe to use the address filter from now on.
+ * msft_monitor_device_evt() wouldn't delete this filter because it's
+ * not been added by now.
+ * And all other functions that requiring hci_req_sync_lock wouldn't
+ * touch this filter before this func completes because it's protected
+ * by hci_req_sync_lock.
+ */
+
+ if (address_filter->state == AF_STATE_REMOVING) {
+ mutex_lock(&msft->filter_lock);
+ list_del(&address_filter->list);
+ mutex_unlock(&msft->filter_lock);
+ kfree(address_filter);
+ return 0;
+ }
+
+ size = sizeof(*cp) +
+ sizeof(address_filter->addr_type) +
+ sizeof(address_filter->bdaddr);
+ cp = kzalloc(size, GFP_KERNEL);
+ if (!cp) {
+ bt_dev_err(hdev, "MSFT: Alloc cmd param err");
+ remove = true;
+ goto done;
+ }
+
+ cp->sub_opcode = MSFT_OP_LE_MONITOR_ADVERTISEMENT;
+ cp->rssi_high = address_filter->rssi_high;
+ cp->rssi_low = address_filter->rssi_low;
+ cp->rssi_low_interval = address_filter->rssi_low_interval;
+ cp->rssi_sampling_period = address_filter->rssi_sampling_period;
+ cp->cond_type = MSFT_MONITOR_ADVERTISEMENT_TYPE_ADDR;
+ cp->data[0] = address_filter->addr_type;
+ memcpy(&cp->data[1], &address_filter->bdaddr,
+ sizeof(address_filter->bdaddr));
+
+ skb = __hci_cmd_sync(hdev, hdev->msft_opcode, size, cp,
+ HCI_CMD_TIMEOUT);
+ kfree(cp);
+
+ if (IS_ERR(skb)) {
+ bt_dev_err(hdev, "Failed to enable address %pMR filter",
+ &address_filter->bdaddr);
+ skb = NULL;
+ remove = true;
+ goto done;
+ }
+
+ rp = skb_pull_data(skb, sizeof(*rp));
+ if (!rp || rp->sub_opcode != MSFT_OP_LE_MONITOR_ADVERTISEMENT ||
+ rp->status)
+ remove = true;
+
+done:
+ mutex_lock(&msft->filter_lock);
+
+ if (remove) {
+ bt_dev_warn(hdev, "MSFT: Remove address (%pMR) filter",
+ &address_filter->bdaddr);
+ list_del(&address_filter->list);
+ kfree(address_filter);
+ } else {
+ address_filter->state = AF_STATE_ADDED;
+ address_filter->msft_handle = rp->handle;
+ bt_dev_dbg(hdev, "MSFT: Address %pMR filter enabled",
+ &address_filter->bdaddr);
+ }
+ mutex_unlock(&msft->filter_lock);
+
+ kfree_skb(skb);
+
+ return 0;
+}
+
+/* This function requires the caller holds msft->filter_lock */
+static struct msft_monitor_addr_filter_data *msft_add_address_filter
+ (struct hci_dev *hdev, u8 addr_type, bdaddr_t *bdaddr,
+ struct msft_monitor_advertisement_handle_data *handle_data)
+{
+ struct msft_monitor_addr_filter_data *address_filter = NULL;
+ struct msft_data *msft = hdev->msft_data;
+ int err;
+
+ address_filter = kzalloc(sizeof(*address_filter), GFP_KERNEL);
+ if (!address_filter)
+ return NULL;
+
+ address_filter->state = AF_STATE_ADDING;
+ address_filter->msft_handle = 0xff;
+ address_filter->pattern_handle = handle_data->msft_handle;
+ address_filter->mgmt_handle = handle_data->mgmt_handle;
+ address_filter->rssi_high = handle_data->rssi_high;
+ address_filter->rssi_low = handle_data->rssi_low;
+ address_filter->rssi_low_interval = handle_data->rssi_low_interval;
+ address_filter->rssi_sampling_period = handle_data->rssi_sampling_period;
+ address_filter->addr_type = addr_type;
+ bacpy(&address_filter->bdaddr, bdaddr);
+
+ /* With the above AF_STATE_ADDING, duplicated address filter can be
+ * avoided when receiving monitor device event (found/lost) frequently
+ * for the same device.
+ */
+ list_add_tail(&address_filter->list, &msft->address_filters);
+
+ err = hci_cmd_sync_queue(hdev, msft_add_address_filter_sync,
+ address_filter, NULL);
+ if (err < 0) {
+ bt_dev_err(hdev, "MSFT: Add address %pMR filter err", bdaddr);
+ list_del(&address_filter->list);
+ kfree(address_filter);
+ return NULL;
+ }
+
+ bt_dev_dbg(hdev, "MSFT: Add device %pMR address filter",
+ &address_filter->bdaddr);
+
+ return address_filter;
+}
+
+/* This function requires the caller holds hdev->lock */
+static void msft_monitor_device_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct msft_monitor_addr_filter_data *n, *address_filter = NULL;
+ struct msft_ev_le_monitor_device *ev;
+ struct msft_monitor_advertisement_handle_data *handle_data;
+ struct msft_data *msft = hdev->msft_data;
+ u16 mgmt_handle = 0xffff;
+ u8 addr_type;
+
+ ev = msft_skb_pull(hdev, skb, MSFT_EV_LE_MONITOR_DEVICE, sizeof(*ev));
+ if (!ev)
+ return;
+
+ bt_dev_dbg(hdev,
+ "MSFT vendor event 0x%02x: handle 0x%04x state %d addr %pMR",
+ MSFT_EV_LE_MONITOR_DEVICE, ev->monitor_handle,
+ ev->monitor_state, &ev->bdaddr);
+
+ handle_data = msft_find_handle_data(hdev, ev->monitor_handle, false);
+
+ if (!hci_test_quirk(hdev, HCI_QUIRK_USE_MSFT_EXT_ADDRESS_FILTER)) {
+ if (!handle_data)
+ return;
+ mgmt_handle = handle_data->mgmt_handle;
+ goto report_state;
+ }
+
+ if (handle_data) {
+ /* Don't report any device found/lost event from pattern
+ * monitors. Pattern monitor always has its address filters for
+ * tracking devices.
+ */
+
+ address_filter = msft_find_address_data(hdev, ev->addr_type,
+ &ev->bdaddr,
+ handle_data->msft_handle);
+ if (address_filter)
+ return;
+
+ if (ev->monitor_state && handle_data->cond_type ==
+ MSFT_MONITOR_ADVERTISEMENT_TYPE_PATTERN)
+ msft_add_address_filter(hdev, ev->addr_type,
+ &ev->bdaddr, handle_data);
+
+ return;
+ }
+
+ /* This device event is not from pattern monitor.
+ * Report it if there is a corresponding address_filter for it.
+ */
+ list_for_each_entry(n, &msft->address_filters, list) {
+ if (n->state == AF_STATE_ADDED &&
+ n->msft_handle == ev->monitor_handle) {
+ mgmt_handle = n->mgmt_handle;
+ address_filter = n;
+ break;
+ }
+ }
+
+ if (!address_filter) {
+ bt_dev_warn(hdev, "MSFT: Unexpected device event %pMR, %u, %u",
+ &ev->bdaddr, ev->monitor_handle, ev->monitor_state);
+ return;
+ }
+
+report_state:
+ switch (ev->addr_type) {
+ case ADDR_LE_DEV_PUBLIC:
+ addr_type = BDADDR_LE_PUBLIC;
+ break;
+
+ case ADDR_LE_DEV_RANDOM:
+ addr_type = BDADDR_LE_RANDOM;
+ break;
+
+ default:
+ bt_dev_err(hdev,
+ "MSFT vendor event 0x%02x: unknown addr type 0x%02x",
+ MSFT_EV_LE_MONITOR_DEVICE, ev->addr_type);
+ return;
+ }
+
+ if (ev->monitor_state) {
+ msft_device_found(hdev, &ev->bdaddr, addr_type, mgmt_handle);
+ } else {
+ if (address_filter && address_filter->state == AF_STATE_ADDED) {
+ address_filter->state = AF_STATE_REMOVING;
+ hci_cmd_sync_queue(hdev,
+ msft_cancel_address_filter_sync,
+ address_filter,
+ NULL);
+ }
+ msft_device_lost(hdev, &ev->bdaddr, addr_type, mgmt_handle);
+ }
+}
+
+void msft_vendor_evt(struct hci_dev *hdev, void *data, struct sk_buff *skb)
+{
+ struct msft_data *msft = hdev->msft_data;
+ u8 *evt_prefix;
+ u8 *evt;
+
+ if (!msft)
+ return;
+
+ /* When the extension has defined an event prefix, check that it
+ * matches, and otherwise just return.
+ */
+ if (msft->evt_prefix_len > 0) {
+ evt_prefix = msft_skb_pull(hdev, skb, 0, msft->evt_prefix_len);
+ if (!evt_prefix)
+ return;
+
+ if (memcmp(evt_prefix, msft->evt_prefix, msft->evt_prefix_len))
+ return;
+ }
+
+ /* Every event starts at least with an event code and the rest of
+ * the data is variable and depends on the event code.
+ */
+ if (skb->len < 1)
+ return;
+
+ evt = msft_skb_pull(hdev, skb, 0, sizeof(*evt));
+ if (!evt)
+ return;
+
+ hci_dev_lock(hdev);
+
+ switch (*evt) {
+ case MSFT_EV_LE_MONITOR_DEVICE:
+ mutex_lock(&msft->filter_lock);
+ msft_monitor_device_evt(hdev, skb);
+ mutex_unlock(&msft->filter_lock);
+ break;
+
+ default:
+ bt_dev_dbg(hdev, "MSFT vendor event 0x%02x", *evt);
+ break;
+ }
+
+ hci_dev_unlock(hdev);
+}
+
+__u64 msft_get_features(struct hci_dev *hdev)
+{
+ struct msft_data *msft = hdev->msft_data;
+
+ return msft ? msft->features : 0;
+}
+
+static void msft_le_set_advertisement_filter_enable_cb(struct hci_dev *hdev,
+ void *user_data,
+ u8 status)
+{
+ struct msft_cp_le_set_advertisement_filter_enable *cp = user_data;
+ struct msft_data *msft = hdev->msft_data;
+
+ /* Error 0x0C would be returned if the filter enabled status is
+ * already set to whatever we were trying to set.
+ * Although the default state should be disabled, some controller set
+ * the initial value to enabled. Because there is no way to know the
+ * actual initial value before sending this command, here we also treat
+ * error 0x0C as success.
+ */
+ if (status != 0x00 && status != 0x0C)
+ return;
+
+ hci_dev_lock(hdev);
+
+ msft->filter_enabled = cp->enable;
+
+ if (status == 0x0C)
+ bt_dev_warn(hdev, "MSFT filter_enable is already %s",
+ cp->enable ? "on" : "off");
+
+ hci_dev_unlock(hdev);
+}
+
+/* This function requires the caller holds hci_req_sync_lock */
+int msft_add_monitor_pattern(struct hci_dev *hdev, struct adv_monitor *monitor)
+{
+ struct msft_data *msft = hdev->msft_data;
+
+ if (!msft)
+ return -EOPNOTSUPP;
+
+ if (msft->resuming || msft->suspending)
+ return -EBUSY;
+
+ return msft_add_monitor_sync(hdev, monitor);
+}
+
+/* This function requires the caller holds hci_req_sync_lock */
+int msft_remove_monitor(struct hci_dev *hdev, struct adv_monitor *monitor)
+{
+ struct msft_data *msft = hdev->msft_data;
+
+ if (!msft)
+ return -EOPNOTSUPP;
+
+ if (msft->resuming || msft->suspending)
+ return -EBUSY;
+
+ return msft_remove_monitor_sync(hdev, monitor);
+}
+
+int msft_set_filter_enable(struct hci_dev *hdev, bool enable)
+{
+ struct msft_cp_le_set_advertisement_filter_enable cp;
+ struct msft_data *msft = hdev->msft_data;
+ int err;
+
+ if (!msft)
+ return -EOPNOTSUPP;
+
+ cp.sub_opcode = MSFT_OP_LE_SET_ADVERTISEMENT_FILTER_ENABLE;
+ cp.enable = enable;
+ err = __hci_cmd_sync_status(hdev, hdev->msft_opcode, sizeof(cp), &cp,
+ HCI_CMD_TIMEOUT);
+
+ msft_le_set_advertisement_filter_enable_cb(hdev, &cp, err);
+
+ return 0;
+}
+
+bool msft_curve_validity(struct hci_dev *hdev)
+{
+ return hdev->msft_curve_validity;
+}
diff --git a/net/bluetooth/msft.h b/net/bluetooth/msft.h
new file mode 100644
index 000000000000..fe538e9c91c0
--- /dev/null
+++ b/net/bluetooth/msft.h
@@ -0,0 +1,78 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 Google Corporation
+ */
+
+#define MSFT_FEATURE_MASK_BREDR_RSSI_MONITOR BIT(0)
+#define MSFT_FEATURE_MASK_LE_CONN_RSSI_MONITOR BIT(1)
+#define MSFT_FEATURE_MASK_LE_ADV_RSSI_MONITOR BIT(2)
+#define MSFT_FEATURE_MASK_LE_ADV_MONITOR BIT(3)
+#define MSFT_FEATURE_MASK_CURVE_VALIDITY BIT(4)
+#define MSFT_FEATURE_MASK_CONCURRENT_ADV_MONITOR BIT(5)
+
+#if IS_ENABLED(CONFIG_BT_MSFTEXT)
+
+bool msft_monitor_supported(struct hci_dev *hdev);
+void msft_register(struct hci_dev *hdev);
+void msft_release(struct hci_dev *hdev);
+void msft_do_open(struct hci_dev *hdev);
+void msft_do_close(struct hci_dev *hdev);
+void msft_vendor_evt(struct hci_dev *hdev, void *data, struct sk_buff *skb);
+__u64 msft_get_features(struct hci_dev *hdev);
+int msft_add_monitor_pattern(struct hci_dev *hdev, struct adv_monitor *monitor);
+int msft_remove_monitor(struct hci_dev *hdev, struct adv_monitor *monitor);
+void msft_req_add_set_filter_enable(struct hci_request *req, bool enable);
+int msft_set_filter_enable(struct hci_dev *hdev, bool enable);
+int msft_suspend_sync(struct hci_dev *hdev);
+int msft_resume_sync(struct hci_dev *hdev);
+bool msft_curve_validity(struct hci_dev *hdev);
+
+#else
+
+static inline bool msft_monitor_supported(struct hci_dev *hdev)
+{
+ return false;
+}
+
+static inline void msft_register(struct hci_dev *hdev) {}
+static inline void msft_release(struct hci_dev *hdev) {}
+static inline void msft_do_open(struct hci_dev *hdev) {}
+static inline void msft_do_close(struct hci_dev *hdev) {}
+static inline void msft_vendor_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb) {}
+static inline __u64 msft_get_features(struct hci_dev *hdev) { return 0; }
+static inline int msft_add_monitor_pattern(struct hci_dev *hdev,
+ struct adv_monitor *monitor)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int msft_remove_monitor(struct hci_dev *hdev,
+ struct adv_monitor *monitor)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void msft_req_add_set_filter_enable(struct hci_request *req,
+ bool enable) {}
+static inline int msft_set_filter_enable(struct hci_dev *hdev, bool enable)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int msft_suspend_sync(struct hci_dev *hdev)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int msft_resume_sync(struct hci_dev *hdev)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline bool msft_curve_validity(struct hci_dev *hdev)
+{
+ return false;
+}
+
+#endif
diff --git a/net/bluetooth/rfcomm/Kconfig b/net/bluetooth/rfcomm/Kconfig
index 405a0e61e7dc..9b9953ebf4c0 100644
--- a/net/bluetooth/rfcomm/Kconfig
+++ b/net/bluetooth/rfcomm/Kconfig
@@ -1,6 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0-only
config BT_RFCOMM
tristate "RFCOMM protocol support"
- depends on BT && BT_L2CAP
+ depends on BT_BREDR
help
RFCOMM provides connection oriented stream transport. RFCOMM
support is required for Dialup Networking, OBEX and other Bluetooth
@@ -12,6 +13,7 @@ config BT_RFCOMM
config BT_RFCOMM_TTY
bool "RFCOMM TTY support"
depends on BT_RFCOMM
+ depends on TTY
help
This option enables TTY emulation support for RFCOMM channels.
diff --git a/net/bluetooth/rfcomm/Makefile b/net/bluetooth/rfcomm/Makefile
index fe07988a3705..593e5c48c131 100644
--- a/net/bluetooth/rfcomm/Makefile
+++ b/net/bluetooth/rfcomm/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# Makefile for the Linux Bluetooth RFCOMM layer.
#
diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c
index 278c8676906a..57b1dca8141f 100644
--- a/net/bluetooth/rfcomm/core.c
+++ b/net/bluetooth/rfcomm/core.c
@@ -1,4 +1,4 @@
-/*
+/*
RFCOMM implementation for Linux Bluetooth stack (BlueZ).
Copyright (C) 2002 Maxim Krasnyansky <maxk@qualcomm.com>
Copyright (C) 2002 Marcel Holtmann <marcel@holtmann.org>
@@ -11,52 +11,37 @@
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
- CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
- WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
- COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+ ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+ COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
SOFTWARE IS DISCLAIMED.
*/
/*
* Bluetooth RFCOMM core.
- *
- * $Id: core.c,v 1.42 2002/10/01 23:26:25 maxk Exp $
*/
#include <linux/module.h>
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/signal.h>
-#include <linux/init.h>
-#include <linux/wait.h>
-#include <linux/device.h>
-#include <linux/net.h>
-#include <linux/mutex.h>
-
-#include <net/sock.h>
-#include <asm/uaccess.h>
-#include <asm/unaligned.h>
+#include <linux/debugfs.h>
+#include <linux/kthread.h>
+#include <linux/unaligned.h>
#include <net/bluetooth/bluetooth.h>
#include <net/bluetooth/hci_core.h>
#include <net/bluetooth/l2cap.h>
#include <net/bluetooth/rfcomm.h>
-#ifndef CONFIG_BT_RFCOMM_DEBUG
-#undef BT_DBG
-#define BT_DBG(D...)
-#endif
+#include <trace/events/sock.h>
-#define VERSION "1.8"
+#define VERSION "1.11"
-static int disable_cfc = 0;
+static bool disable_cfc;
+static bool l2cap_ertm;
static int channel_mtu = -1;
-static unsigned int l2cap_mtu = RFCOMM_MAX_L2CAP_MTU;
static struct task_struct *rfcomm_thread;
@@ -64,10 +49,8 @@ static DEFINE_MUTEX(rfcomm_mutex);
#define rfcomm_lock() mutex_lock(&rfcomm_mutex)
#define rfcomm_unlock() mutex_unlock(&rfcomm_mutex)
-static unsigned long rfcomm_event;
static LIST_HEAD(session_list);
-static atomic_t terminate, running;
static int rfcomm_send_frame(struct rfcomm_session *s, u8 *data, int len);
static int rfcomm_send_sabm(struct rfcomm_session *s, u8 dlci);
@@ -82,25 +65,27 @@ static void rfcomm_make_uih(struct sk_buff *skb, u8 addr);
static void rfcomm_process_connect(struct rfcomm_session *s);
-static struct rfcomm_session *rfcomm_session_create(bdaddr_t *src, bdaddr_t *dst, int *err);
+static struct rfcomm_session *rfcomm_session_create(bdaddr_t *src,
+ bdaddr_t *dst,
+ u8 sec_level,
+ int *err);
static struct rfcomm_session *rfcomm_session_get(bdaddr_t *src, bdaddr_t *dst);
-static void rfcomm_session_del(struct rfcomm_session *s);
+static struct rfcomm_session *rfcomm_session_del(struct rfcomm_session *s);
/* ---- RFCOMM frame parsing macros ---- */
#define __get_dlci(b) ((b & 0xfc) >> 2)
-#define __get_channel(b) ((b & 0xf8) >> 3)
-#define __get_dir(b) ((b & 0x04) >> 2)
#define __get_type(b) ((b & 0xef))
#define __test_ea(b) ((b & 0x01))
-#define __test_cr(b) ((b & 0x02))
-#define __test_pf(b) ((b & 0x10))
+#define __test_cr(b) (!!(b & 0x02))
+#define __test_pf(b) (!!(b & 0x10))
+
+#define __session_dir(s) ((s)->initiator ? 0x00 : 0x01)
#define __addr(cr, dlci) (((dlci & 0x3f) << 2) | (cr << 1) | 0x01)
#define __ctrl(type, pf) (((type & 0xef) | (pf << 4)))
#define __dlci(dir, chn) (((chn & 0x1f) << 1) | dir)
#define __srv_channel(dlci) (dlci >> 1)
-#define __dir(dlci) (dlci & 0x01)
#define __len8(len) (((len) << 1) | 1)
#define __len16(len) ((len) << 1)
@@ -116,25 +101,17 @@ static void rfcomm_session_del(struct rfcomm_session *s);
#define __get_rpn_stop_bits(line) (((line) >> 2) & 0x1)
#define __get_rpn_parity(line) (((line) >> 3) & 0x7)
-static inline void rfcomm_schedule(uint event)
-{
- if (!rfcomm_thread)
- return;
- //set_bit(event, &rfcomm_event);
- set_bit(RFCOMM_SCHED_WAKEUP, &rfcomm_event);
- wake_up_process(rfcomm_thread);
-}
+static DECLARE_WAIT_QUEUE_HEAD(rfcomm_wq);
-static inline void rfcomm_session_put(struct rfcomm_session *s)
+static void rfcomm_schedule(void)
{
- if (atomic_dec_and_test(&s->refcnt))
- rfcomm_session_del(s);
+ wake_up_all(&rfcomm_wq);
}
/* ---- RFCOMM FCS computation ---- */
/* reversed, 8-bit, poly=0x07 */
-static unsigned char rfcomm_crc_table[256] = {
+static unsigned char rfcomm_crc_table[256] = {
0x00, 0x91, 0xe3, 0x72, 0x07, 0x96, 0xe4, 0x75,
0x0e, 0x9f, 0xed, 0x7c, 0x09, 0x98, 0xea, 0x7b,
0x1c, 0x8d, 0xff, 0x6e, 0x1b, 0x8a, 0xf8, 0x69,
@@ -179,16 +156,16 @@ static unsigned char rfcomm_crc_table[256] = {
/* CRC on 2 bytes */
#define __crc(data) (rfcomm_crc_table[rfcomm_crc_table[0xff ^ data[0]] ^ data[1]])
-/* FCS on 2 bytes */
+/* FCS on 2 bytes */
static inline u8 __fcs(u8 *data)
{
- return (0xff - __crc(data));
+ return 0xff - __crc(data);
}
-/* FCS on 3 bytes */
+/* FCS on 3 bytes */
static inline u8 __fcs2(u8 *data)
{
- return (0xff - rfcomm_crc_table[__crc(data) ^ data[2]]);
+ return 0xff - rfcomm_crc_table[__crc(data) ^ data[2]];
}
/* Check FCS */
@@ -206,13 +183,15 @@ static inline int __check_fcs(u8 *data, int type, u8 fcs)
static void rfcomm_l2state_change(struct sock *sk)
{
BT_DBG("%p state %d", sk, sk->sk_state);
- rfcomm_schedule(RFCOMM_SCHED_STATE);
+ rfcomm_schedule();
}
-static void rfcomm_l2data_ready(struct sock *sk, int bytes)
+static void rfcomm_l2data_ready(struct sock *sk)
{
- BT_DBG("%p bytes %d", sk, bytes);
- rfcomm_schedule(RFCOMM_SCHED_RX);
+ trace_sk_data_ready(sk);
+
+ BT_DBG("%p", sk);
+ rfcomm_schedule();
}
static int rfcomm_l2sock_create(struct socket **sock)
@@ -221,7 +200,7 @@ static int rfcomm_l2sock_create(struct socket **sock)
BT_DBG("");
- err = sock_create_kern(PF_BLUETOOTH, SOCK_SEQPACKET, BTPROTO_L2CAP, sock);
+ err = sock_create_kern(&init_net, PF_BLUETOOTH, SOCK_SEQPACKET, BTPROTO_L2CAP, sock);
if (!err) {
struct sock *sk = (*sock)->sk;
sk->sk_data_ready = rfcomm_l2data_ready;
@@ -230,16 +209,64 @@ static int rfcomm_l2sock_create(struct socket **sock)
return err;
}
+static int rfcomm_check_security(struct rfcomm_dlc *d)
+{
+ struct sock *sk = d->session->sock->sk;
+ struct l2cap_conn *conn = l2cap_pi(sk)->chan->conn;
+
+ __u8 auth_type;
+
+ switch (d->sec_level) {
+ case BT_SECURITY_HIGH:
+ case BT_SECURITY_FIPS:
+ auth_type = HCI_AT_GENERAL_BONDING_MITM;
+ break;
+ case BT_SECURITY_MEDIUM:
+ auth_type = HCI_AT_GENERAL_BONDING;
+ break;
+ default:
+ auth_type = HCI_AT_NO_BONDING;
+ break;
+ }
+
+ return hci_conn_security(conn->hcon, d->sec_level, auth_type,
+ d->out);
+}
+
+static void rfcomm_session_timeout(struct timer_list *t)
+{
+ struct rfcomm_session *s = timer_container_of(s, t, timer);
+
+ BT_DBG("session %p state %ld", s, s->state);
+
+ set_bit(RFCOMM_TIMED_OUT, &s->flags);
+ rfcomm_schedule();
+}
+
+static void rfcomm_session_set_timer(struct rfcomm_session *s, long timeout)
+{
+ BT_DBG("session %p state %ld timeout %ld", s, s->state, timeout);
+
+ mod_timer(&s->timer, jiffies + timeout);
+}
+
+static void rfcomm_session_clear_timer(struct rfcomm_session *s)
+{
+ BT_DBG("session %p state %ld", s, s->state);
+
+ timer_delete_sync(&s->timer);
+}
+
/* ---- RFCOMM DLCs ---- */
-static void rfcomm_dlc_timeout(unsigned long arg)
+static void rfcomm_dlc_timeout(struct timer_list *t)
{
- struct rfcomm_dlc *d = (void *) arg;
+ struct rfcomm_dlc *d = timer_container_of(d, t, timer);
BT_DBG("dlc %p state %ld", d, d->state);
set_bit(RFCOMM_TIMED_OUT, &d->flags);
rfcomm_dlc_put(d);
- rfcomm_schedule(RFCOMM_SCHED_TIMEO);
+ rfcomm_schedule();
}
static void rfcomm_dlc_set_timer(struct rfcomm_dlc *d, long timeout)
@@ -254,7 +281,7 @@ static void rfcomm_dlc_clear_timer(struct rfcomm_dlc *d)
{
BT_DBG("dlc %p state %ld", d, d->state);
- if (timer_pending(&d->timer) && del_timer(&d->timer))
+ if (timer_delete(&d->timer))
rfcomm_dlc_put(d);
}
@@ -265,6 +292,7 @@ static void rfcomm_dlc_clear_state(struct rfcomm_dlc *d)
d->state = BT_OPEN;
d->flags = 0;
d->mscex = 0;
+ d->sec_level = BT_SECURITY_LOW;
d->mtu = RFCOMM_DEFAULT_MTU;
d->v24_sig = RFCOMM_V24_RTC | RFCOMM_V24_RTR | RFCOMM_V24_DV;
@@ -279,16 +307,14 @@ struct rfcomm_dlc *rfcomm_dlc_alloc(gfp_t prio)
if (!d)
return NULL;
- init_timer(&d->timer);
- d->timer.function = rfcomm_dlc_timeout;
- d->timer.data = (unsigned long) d;
+ timer_setup(&d->timer, rfcomm_dlc_timeout, 0);
skb_queue_head_init(&d->tx_queue);
- spin_lock_init(&d->lock);
- atomic_set(&d->refcnt, 1);
+ mutex_init(&d->lock);
+ refcount_set(&d->refcnt, 1);
rfcomm_dlc_clear_state(d);
-
+
BT_DBG("%p", d);
return d;
@@ -306,8 +332,7 @@ static void rfcomm_dlc_link(struct rfcomm_session *s, struct rfcomm_dlc *d)
{
BT_DBG("dlc %p session %p", d, s);
- rfcomm_session_hold(s);
-
+ rfcomm_session_clear_timer(s);
rfcomm_dlc_hold(d);
list_add(&d->list, &s->dlcs);
d->session = s;
@@ -317,38 +342,42 @@ static void rfcomm_dlc_unlink(struct rfcomm_dlc *d)
{
struct rfcomm_session *s = d->session;
- BT_DBG("dlc %p refcnt %d session %p", d, atomic_read(&d->refcnt), s);
+ BT_DBG("dlc %p refcnt %d session %p", d, refcount_read(&d->refcnt), s);
list_del(&d->list);
d->session = NULL;
rfcomm_dlc_put(d);
- rfcomm_session_put(s);
+ if (list_empty(&s->dlcs))
+ rfcomm_session_set_timer(s, RFCOMM_IDLE_TIMEOUT);
}
static struct rfcomm_dlc *rfcomm_dlc_get(struct rfcomm_session *s, u8 dlci)
{
struct rfcomm_dlc *d;
- struct list_head *p;
- list_for_each(p, &s->dlcs) {
- d = list_entry(p, struct rfcomm_dlc, list);
+ list_for_each_entry(d, &s->dlcs, list)
if (d->dlci == dlci)
return d;
- }
+
return NULL;
}
+static int rfcomm_check_channel(u8 channel)
+{
+ return channel < 1 || channel > 30;
+}
+
static int __rfcomm_dlc_open(struct rfcomm_dlc *d, bdaddr_t *src, bdaddr_t *dst, u8 channel)
{
struct rfcomm_session *s;
int err = 0;
u8 dlci;
- BT_DBG("dlc %p state %ld %s %s channel %d",
- d, d->state, batostr(src), batostr(dst), channel);
+ BT_DBG("dlc %p state %ld %pMR -> %pMR channel %d",
+ d, d->state, src, dst, channel);
- if (channel < 1 || channel > 30)
+ if (rfcomm_check_channel(channel))
return -EINVAL;
if (d->state != BT_OPEN && d->state != BT_CLOSED)
@@ -356,12 +385,12 @@ static int __rfcomm_dlc_open(struct rfcomm_dlc *d, bdaddr_t *src, bdaddr_t *dst,
s = rfcomm_session_get(src, dst);
if (!s) {
- s = rfcomm_session_create(src, dst, &err);
+ s = rfcomm_session_create(src, dst, d->sec_level, &err);
if (!s)
return err;
}
- dlci = __dlci(!s->initiator, channel);
+ dlci = __dlci(__session_dir(s), channel);
/* Check if DLCI already exists */
if (rfcomm_dlc_get(s, dlci))
@@ -373,15 +402,23 @@ static int __rfcomm_dlc_open(struct rfcomm_dlc *d, bdaddr_t *src, bdaddr_t *dst,
d->addr = __addr(s->initiator, dlci);
d->priority = 7;
- d->state = BT_CONFIG;
+ d->state = BT_CONFIG;
rfcomm_dlc_link(s, d);
+ d->out = 1;
+
d->mtu = s->mtu;
d->cfc = (s->cfc == RFCOMM_CFC_UNKNOWN) ? 0 : s->cfc;
- if (s->state == BT_CONNECTED)
- rfcomm_send_pn(s, 1, d);
+ if (s->state == BT_CONNECTED) {
+ if (rfcomm_check_security(d))
+ rfcomm_send_pn(s, 1, d);
+ else
+ set_bit(RFCOMM_AUTH_PENDING, &d->flags);
+ }
+
rfcomm_dlc_set_timer(d, RFCOMM_CONN_TIMEOUT);
+
return 0;
}
@@ -397,6 +434,20 @@ int rfcomm_dlc_open(struct rfcomm_dlc *d, bdaddr_t *src, bdaddr_t *dst, u8 chann
return r;
}
+static void __rfcomm_dlc_disconn(struct rfcomm_dlc *d)
+{
+ struct rfcomm_session *s = d->session;
+
+ d->state = BT_DISCONN;
+ if (skb_queue_empty(&d->tx_queue)) {
+ rfcomm_send_disc(s, d->dlci);
+ rfcomm_dlc_set_timer(d, RFCOMM_DISC_TIMEOUT);
+ } else {
+ rfcomm_queue_disc(d);
+ rfcomm_dlc_set_timer(d, RFCOMM_DISC_TIMEOUT * 2);
+ }
+}
+
static int __rfcomm_dlc_close(struct rfcomm_dlc *d, int err)
{
struct rfcomm_session *s = d->session;
@@ -407,19 +458,33 @@ static int __rfcomm_dlc_close(struct rfcomm_dlc *d, int err)
d, d->state, d->dlci, err, s);
switch (d->state) {
- case BT_CONNECTED:
- case BT_CONFIG:
case BT_CONNECT:
- d->state = BT_DISCONN;
- if (skb_queue_empty(&d->tx_queue)) {
- rfcomm_send_disc(s, d->dlci);
- rfcomm_dlc_set_timer(d, RFCOMM_DISC_TIMEOUT);
- } else {
- rfcomm_queue_disc(d);
- rfcomm_dlc_set_timer(d, RFCOMM_DISC_TIMEOUT * 2);
+ case BT_CONFIG:
+ case BT_OPEN:
+ case BT_CONNECT2:
+ if (test_and_clear_bit(RFCOMM_DEFER_SETUP, &d->flags)) {
+ set_bit(RFCOMM_AUTH_REJECT, &d->flags);
+ rfcomm_schedule();
+ return 0;
}
+ }
+
+ switch (d->state) {
+ case BT_CONNECT:
+ case BT_CONNECTED:
+ __rfcomm_dlc_disconn(d);
break;
+ case BT_CONFIG:
+ if (s->state != BT_BOUND) {
+ __rfcomm_dlc_disconn(d);
+ break;
+ }
+ /* if closing a dlc in a session that hasn't been started,
+ * just close and unlink the dlc
+ */
+ fallthrough;
+
default:
rfcomm_dlc_clear_timer(d);
@@ -437,37 +502,128 @@ static int __rfcomm_dlc_close(struct rfcomm_dlc *d, int err)
int rfcomm_dlc_close(struct rfcomm_dlc *d, int err)
{
- int r;
+ int r = 0;
+ struct rfcomm_dlc *d_list;
+ struct rfcomm_session *s, *s_list;
+
+ BT_DBG("dlc %p state %ld dlci %d err %d", d, d->state, d->dlci, err);
rfcomm_lock();
- r = __rfcomm_dlc_close(d, err);
+ s = d->session;
+ if (!s)
+ goto no_session;
+
+ /* after waiting on the mutex check the session still exists
+ * then check the dlc still exists
+ */
+ list_for_each_entry(s_list, &session_list, list) {
+ if (s_list == s) {
+ list_for_each_entry(d_list, &s->dlcs, list) {
+ if (d_list == d) {
+ r = __rfcomm_dlc_close(d, err);
+ break;
+ }
+ }
+ break;
+ }
+ }
+no_session:
rfcomm_unlock();
return r;
}
-int rfcomm_dlc_send(struct rfcomm_dlc *d, struct sk_buff *skb)
+struct rfcomm_dlc *rfcomm_dlc_exists(bdaddr_t *src, bdaddr_t *dst, u8 channel)
{
- int len = skb->len;
+ struct rfcomm_session *s;
+ struct rfcomm_dlc *dlc = NULL;
+ u8 dlci;
- if (d->state != BT_CONNECTED)
- return -ENOTCONN;
+ if (rfcomm_check_channel(channel))
+ return ERR_PTR(-EINVAL);
+
+ rfcomm_lock();
+ s = rfcomm_session_get(src, dst);
+ if (s) {
+ dlci = __dlci(__session_dir(s), channel);
+ dlc = rfcomm_dlc_get(s, dlci);
+ }
+ rfcomm_unlock();
+ return dlc;
+}
+
+static int rfcomm_dlc_send_frag(struct rfcomm_dlc *d, struct sk_buff *frag)
+{
+ int len = frag->len;
BT_DBG("dlc %p mtu %d len %d", d, d->mtu, len);
if (len > d->mtu)
return -EINVAL;
+ rfcomm_make_uih(frag, d->addr);
+ __skb_queue_tail(&d->tx_queue, frag);
+
+ return len;
+}
+
+int rfcomm_dlc_send(struct rfcomm_dlc *d, struct sk_buff *skb)
+{
+ unsigned long flags;
+ struct sk_buff *frag, *next;
+ int len;
+
+ if (d->state != BT_CONNECTED)
+ return -ENOTCONN;
+
+ frag = skb_shinfo(skb)->frag_list;
+ skb_shinfo(skb)->frag_list = NULL;
+
+ /* Queue all fragments atomically. */
+ spin_lock_irqsave(&d->tx_queue.lock, flags);
+
+ len = rfcomm_dlc_send_frag(d, skb);
+ if (len < 0 || !frag)
+ goto unlock;
+
+ for (; frag; frag = next) {
+ int ret;
+
+ next = frag->next;
+
+ ret = rfcomm_dlc_send_frag(d, frag);
+ if (ret < 0) {
+ dev_kfree_skb_irq(frag);
+ goto unlock;
+ }
+
+ len += ret;
+ }
+
+unlock:
+ spin_unlock_irqrestore(&d->tx_queue.lock, flags);
+
+ if (len > 0 && !test_bit(RFCOMM_TX_THROTTLED, &d->flags))
+ rfcomm_schedule();
+ return len;
+}
+
+void rfcomm_dlc_send_noerror(struct rfcomm_dlc *d, struct sk_buff *skb)
+{
+ int len = skb->len;
+
+ BT_DBG("dlc %p mtu %d len %d", d, d->mtu, len);
+
rfcomm_make_uih(skb, d->addr);
skb_queue_tail(&d->tx_queue, skb);
- if (!test_bit(RFCOMM_TX_THROTTLED, &d->flags))
- rfcomm_schedule(RFCOMM_SCHED_TX);
- return len;
+ if (d->state == BT_CONNECTED &&
+ !test_bit(RFCOMM_TX_THROTTLED, &d->flags))
+ rfcomm_schedule();
}
-void fastcall __rfcomm_dlc_throttle(struct rfcomm_dlc *d)
+void __rfcomm_dlc_throttle(struct rfcomm_dlc *d)
{
BT_DBG("dlc %p state %ld", d, d->state);
@@ -475,10 +631,10 @@ void fastcall __rfcomm_dlc_throttle(struct rfcomm_dlc *d)
d->v24_sig |= RFCOMM_V24_FC;
set_bit(RFCOMM_MSC_PENDING, &d->flags);
}
- rfcomm_schedule(RFCOMM_SCHED_TX);
+ rfcomm_schedule();
}
-void fastcall __rfcomm_dlc_unthrottle(struct rfcomm_dlc *d)
+void __rfcomm_dlc_unthrottle(struct rfcomm_dlc *d)
{
BT_DBG("dlc %p state %ld", d, d->state);
@@ -486,35 +642,35 @@ void fastcall __rfcomm_dlc_unthrottle(struct rfcomm_dlc *d)
d->v24_sig &= ~RFCOMM_V24_FC;
set_bit(RFCOMM_MSC_PENDING, &d->flags);
}
- rfcomm_schedule(RFCOMM_SCHED_TX);
+ rfcomm_schedule();
}
-/*
+/*
Set/get modem status functions use _local_ status i.e. what we report
to the other side.
Remote status is provided by dlc->modem_status() callback.
*/
int rfcomm_dlc_set_modem_status(struct rfcomm_dlc *d, u8 v24_sig)
{
- BT_DBG("dlc %p state %ld v24_sig 0x%x",
+ BT_DBG("dlc %p state %ld v24_sig 0x%x",
d, d->state, v24_sig);
if (test_bit(RFCOMM_RX_THROTTLED, &d->flags))
v24_sig |= RFCOMM_V24_FC;
else
v24_sig &= ~RFCOMM_V24_FC;
-
+
d->v24_sig = v24_sig;
if (!test_and_set_bit(RFCOMM_MSC_PENDING, &d->flags))
- rfcomm_schedule(RFCOMM_SCHED_TX);
+ rfcomm_schedule();
return 0;
}
int rfcomm_dlc_get_modem_status(struct rfcomm_dlc *d, u8 *v24_sig)
{
- BT_DBG("dlc %p state %ld v24_sig 0x%x",
+ BT_DBG("dlc %p state %ld v24_sig 0x%x",
d, d->state, d->v24_sig);
*v24_sig = d->v24_sig;
@@ -531,6 +687,8 @@ static struct rfcomm_session *rfcomm_session_add(struct socket *sock, int state)
BT_DBG("session %p sock %p", s, sock);
+ timer_setup(&s->timer, rfcomm_session_timeout, 0);
+
INIT_LIST_HEAD(&s->dlcs);
s->state = state;
s->sock = sock;
@@ -551,7 +709,7 @@ static struct rfcomm_session *rfcomm_session_add(struct socket *sock, int state)
return s;
}
-static void rfcomm_session_del(struct rfcomm_session *s)
+static struct rfcomm_session *rfcomm_session_del(struct rfcomm_session *s)
{
int state = s->state;
@@ -559,61 +717,60 @@ static void rfcomm_session_del(struct rfcomm_session *s)
list_del(&s->list);
- if (state == BT_CONNECTED)
- rfcomm_send_disc(s, 0);
-
+ rfcomm_session_clear_timer(s);
sock_release(s->sock);
kfree(s);
if (state != BT_LISTEN)
module_put(THIS_MODULE);
+
+ return NULL;
}
static struct rfcomm_session *rfcomm_session_get(bdaddr_t *src, bdaddr_t *dst)
{
- struct rfcomm_session *s;
- struct list_head *p, *n;
- struct bt_sock *sk;
- list_for_each_safe(p, n, &session_list) {
- s = list_entry(p, struct rfcomm_session, list);
- sk = bt_sk(s->sock->sk);
-
- if ((!bacmp(src, BDADDR_ANY) || !bacmp(&sk->src, src)) &&
- !bacmp(&sk->dst, dst))
+ struct rfcomm_session *s, *n;
+ struct l2cap_chan *chan;
+ list_for_each_entry_safe(s, n, &session_list, list) {
+ chan = l2cap_pi(s->sock->sk)->chan;
+
+ if ((!bacmp(src, BDADDR_ANY) || !bacmp(&chan->src, src)) &&
+ !bacmp(&chan->dst, dst))
return s;
}
return NULL;
}
-static void rfcomm_session_close(struct rfcomm_session *s, int err)
+static struct rfcomm_session *rfcomm_session_close(struct rfcomm_session *s,
+ int err)
{
- struct rfcomm_dlc *d;
- struct list_head *p, *n;
-
- BT_DBG("session %p state %ld err %d", s, s->state, err);
-
- rfcomm_session_hold(s);
+ struct rfcomm_dlc *d, *n;
s->state = BT_CLOSED;
+ BT_DBG("session %p state %ld err %d", s, s->state, err);
+
/* Close all dlcs */
- list_for_each_safe(p, n, &s->dlcs) {
- d = list_entry(p, struct rfcomm_dlc, list);
+ list_for_each_entry_safe(d, n, &s->dlcs, list) {
d->state = BT_CLOSED;
__rfcomm_dlc_close(d, err);
}
- rfcomm_session_put(s);
+ rfcomm_session_clear_timer(s);
+ return rfcomm_session_del(s);
}
-static struct rfcomm_session *rfcomm_session_create(bdaddr_t *src, bdaddr_t *dst, int *err)
+static struct rfcomm_session *rfcomm_session_create(bdaddr_t *src,
+ bdaddr_t *dst,
+ u8 sec_level,
+ int *err)
{
struct rfcomm_session *s = NULL;
struct sockaddr_l2 addr;
struct socket *sock;
struct sock *sk;
- BT_DBG("%s %s", batostr(src), batostr(dst));
+ BT_DBG("%pMR -> %pMR", src, dst);
*err = rfcomm_l2sock_create(&sock);
if (*err < 0)
@@ -622,14 +779,20 @@ static struct rfcomm_session *rfcomm_session_create(bdaddr_t *src, bdaddr_t *dst
bacpy(&addr.l2_bdaddr, src);
addr.l2_family = AF_BLUETOOTH;
addr.l2_psm = 0;
- *err = sock->ops->bind(sock, (struct sockaddr *) &addr, sizeof(addr));
+ addr.l2_cid = 0;
+ addr.l2_bdaddr_type = BDADDR_BREDR;
+ *err = kernel_bind(sock, (struct sockaddr_unsized *)&addr, sizeof(addr));
if (*err < 0)
goto failed;
/* Set L2CAP options */
sk = sock->sk;
lock_sock(sk);
- l2cap_pi(sk)->imtu = l2cap_mtu;
+ /* Set MTU to 0 so L2CAP can auto select the MTU */
+ l2cap_pi(sk)->chan->imtu = 0;
+ l2cap_pi(sk)->chan->sec_level = sec_level;
+ if (l2cap_ertm)
+ l2cap_pi(sk)->chan->mode = L2CAP_MODE_ERTM;
release_sock(sk);
s = rfcomm_session_add(sock, BT_BOUND);
@@ -642,13 +805,14 @@ static struct rfcomm_session *rfcomm_session_create(bdaddr_t *src, bdaddr_t *dst
bacpy(&addr.l2_bdaddr, dst);
addr.l2_family = AF_BLUETOOTH;
- addr.l2_psm = htobs(RFCOMM_PSM);
- *err = sock->ops->connect(sock, (struct sockaddr *) &addr, sizeof(addr), O_NONBLOCK);
+ addr.l2_psm = cpu_to_le16(L2CAP_PSM_RFCOMM);
+ addr.l2_cid = 0;
+ addr.l2_bdaddr_type = BDADDR_BREDR;
+ *err = kernel_connect(sock, (struct sockaddr_unsized *)&addr, sizeof(addr), O_NONBLOCK);
if (*err == 0 || *err == -EINPROGRESS)
return s;
- rfcomm_session_del(s);
- return NULL;
+ return rfcomm_session_del(s);
failed:
sock_release(sock);
@@ -657,17 +821,16 @@ failed:
void rfcomm_session_getaddr(struct rfcomm_session *s, bdaddr_t *src, bdaddr_t *dst)
{
- struct sock *sk = s->sock->sk;
+ struct l2cap_chan *chan = l2cap_pi(s->sock->sk)->chan;
if (src)
- bacpy(src, &bt_sk(sk)->src);
+ bacpy(src, &chan->src);
if (dst)
- bacpy(dst, &bt_sk(sk)->dst);
+ bacpy(dst, &chan->dst);
}
/* ---- RFCOMM frame sending ---- */
static int rfcomm_send_frame(struct rfcomm_session *s, u8 *data, int len)
{
- struct socket *sock = s->sock;
struct kvec iv = { data, len };
struct msghdr msg;
@@ -675,7 +838,14 @@ static int rfcomm_send_frame(struct rfcomm_session *s, u8 *data, int len)
memset(&msg, 0, sizeof(msg));
- return kernel_sendmsg(sock, &msg, &iv, 1, len);
+ return kernel_sendmsg(s->sock, &msg, &iv, 1, len);
+}
+
+static int rfcomm_send_cmd(struct rfcomm_session *s, struct rfcomm_cmd *cmd)
+{
+ BT_DBG("%p cmd %u", s, cmd->ctrl);
+
+ return rfcomm_send_frame(s, (void *) cmd, sizeof(*cmd));
}
static int rfcomm_send_sabm(struct rfcomm_session *s, u8 dlci)
@@ -689,7 +859,7 @@ static int rfcomm_send_sabm(struct rfcomm_session *s, u8 dlci)
cmd.len = __len8(0);
cmd.fcs = __fcs2((u8 *) &cmd);
- return rfcomm_send_frame(s, (void *) &cmd, sizeof(cmd));
+ return rfcomm_send_cmd(s, &cmd);
}
static int rfcomm_send_ua(struct rfcomm_session *s, u8 dlci)
@@ -703,7 +873,7 @@ static int rfcomm_send_ua(struct rfcomm_session *s, u8 dlci)
cmd.len = __len8(0);
cmd.fcs = __fcs2((u8 *) &cmd);
- return rfcomm_send_frame(s, (void *) &cmd, sizeof(cmd));
+ return rfcomm_send_cmd(s, &cmd);
}
static int rfcomm_send_disc(struct rfcomm_session *s, u8 dlci)
@@ -717,7 +887,7 @@ static int rfcomm_send_disc(struct rfcomm_session *s, u8 dlci)
cmd.len = __len8(0);
cmd.fcs = __fcs2((u8 *) &cmd);
- return rfcomm_send_frame(s, (void *) &cmd, sizeof(cmd));
+ return rfcomm_send_cmd(s, &cmd);
}
static int rfcomm_queue_disc(struct rfcomm_dlc *d)
@@ -731,14 +901,14 @@ static int rfcomm_queue_disc(struct rfcomm_dlc *d)
if (!skb)
return -ENOMEM;
- cmd = (void *) __skb_put(skb, sizeof(*cmd));
+ cmd = __skb_put(skb, sizeof(*cmd));
cmd->addr = d->addr;
cmd->ctrl = __ctrl(RFCOMM_DISC, 1);
cmd->len = __len8(0);
cmd->fcs = __fcs2((u8 *) cmd);
skb_queue_tail(&d->tx_queue, skb);
- rfcomm_schedule(RFCOMM_SCHED_TX);
+ rfcomm_schedule();
return 0;
}
@@ -753,7 +923,7 @@ static int rfcomm_send_dm(struct rfcomm_session *s, u8 dlci)
cmd.len = __len8(0);
cmd.fcs = __fcs2((u8 *) &cmd);
- return rfcomm_send_frame(s, (void *) &cmd, sizeof(cmd));
+ return rfcomm_send_cmd(s, &cmd);
}
static int rfcomm_send_nsc(struct rfcomm_session *s, int cr, u8 type)
@@ -770,7 +940,7 @@ static int rfcomm_send_nsc(struct rfcomm_session *s, int cr, u8 type)
hdr->len = __len8(sizeof(*mcc) + 1);
mcc = (void *) ptr; ptr += sizeof(*mcc);
- mcc->type = __mcc_type(cr, RFCOMM_NSC);
+ mcc->type = __mcc_type(0, RFCOMM_NSC);
mcc->len = __len8(1);
/* Type that we didn't like */
@@ -814,9 +984,9 @@ static int rfcomm_send_pn(struct rfcomm_session *s, int cr, struct rfcomm_dlc *d
}
if (cr && channel_mtu >= 0)
- pn->mtu = htobs(channel_mtu);
+ pn->mtu = cpu_to_le16(channel_mtu);
else
- pn->mtu = htobs(d->mtu);
+ pn->mtu = cpu_to_le16(d->mtu);
*ptr = __fcs(buf); ptr++;
@@ -825,7 +995,7 @@ static int rfcomm_send_pn(struct rfcomm_session *s, int cr, struct rfcomm_dlc *d
int rfcomm_send_rpn(struct rfcomm_session *s, int cr, u8 dlci,
u8 bit_rate, u8 data_bits, u8 stop_bits,
- u8 parity, u8 flow_ctrl_settings,
+ u8 parity, u8 flow_ctrl_settings,
u8 xon_char, u8 xoff_char, u16 param_mask)
{
struct rfcomm_hdr *hdr;
@@ -834,8 +1004,8 @@ int rfcomm_send_rpn(struct rfcomm_session *s, int cr, u8 dlci,
u8 buf[16], *ptr = buf;
BT_DBG("%p cr %d dlci %d bit_r 0x%x data_b 0x%x stop_b 0x%x parity 0x%x"
- " flwc_s 0x%x xon_c 0x%x xoff_c 0x%x p_mask 0x%x",
- s, cr, dlci, bit_rate, data_bits, stop_bits, parity,
+ " flwc_s 0x%x xon_c 0x%x xoff_c 0x%x p_mask 0x%x",
+ s, cr, dlci, bit_rate, data_bits, stop_bits, parity,
flow_ctrl_settings, xon_char, xoff_char, param_mask);
hdr = (void *) ptr; ptr += sizeof(*hdr);
@@ -1017,10 +1187,10 @@ static void rfcomm_make_uih(struct sk_buff *skb, u8 addr)
u8 *crc;
if (len > 127) {
- hdr = (void *) skb_push(skb, 4);
- put_unaligned(htobs(__len16(len)), (__le16 *) &hdr->len);
+ hdr = skb_push(skb, 4);
+ put_unaligned(cpu_to_le16(__len16(len)), (__le16 *) &hdr->len);
} else {
- hdr = (void *) skb_push(skb, 3);
+ hdr = skb_push(skb, 3);
hdr->len = __len8(len);
}
hdr->addr = addr;
@@ -1031,7 +1201,7 @@ static void rfcomm_make_uih(struct sk_buff *skb, u8 addr)
}
/* ---- RFCOMM frame reception ---- */
-static int rfcomm_recv_ua(struct rfcomm_session *s, u8 dlci)
+static struct rfcomm_session *rfcomm_recv_ua(struct rfcomm_session *s, u8 dlci)
{
BT_DBG("session %p state %ld dlci %d", s, s->state, dlci);
@@ -1040,7 +1210,7 @@ static int rfcomm_recv_ua(struct rfcomm_session *s, u8 dlci)
struct rfcomm_dlc *d = rfcomm_dlc_get(s, dlci);
if (!d) {
rfcomm_send_dm(s, dlci);
- return 0;
+ return s;
}
switch (d->state) {
@@ -1058,6 +1228,13 @@ static int rfcomm_recv_ua(struct rfcomm_session *s, u8 dlci)
case BT_DISCONN:
d->state = BT_CLOSED;
__rfcomm_dlc_close(d, 0);
+
+ if (list_empty(&s->dlcs)) {
+ s->state = BT_DISCONN;
+ rfcomm_send_disc(s, 0);
+ rfcomm_session_clear_timer(s);
+ }
+
break;
}
} else {
@@ -1067,12 +1244,16 @@ static int rfcomm_recv_ua(struct rfcomm_session *s, u8 dlci)
s->state = BT_CONNECTED;
rfcomm_process_connect(s);
break;
+
+ case BT_DISCONN:
+ s = rfcomm_session_close(s, ECONNRESET);
+ break;
}
}
- return 0;
+ return s;
}
-static int rfcomm_recv_dm(struct rfcomm_session *s, u8 dlci)
+static struct rfcomm_session *rfcomm_recv_dm(struct rfcomm_session *s, u8 dlci)
{
int err = 0;
@@ -1096,13 +1277,13 @@ static int rfcomm_recv_dm(struct rfcomm_session *s, u8 dlci)
else
err = ECONNRESET;
- s->state = BT_CLOSED;
- rfcomm_session_close(s, err);
+ s = rfcomm_session_close(s, err);
}
- return 0;
+ return s;
}
-static int rfcomm_recv_disc(struct rfcomm_session *s, u8 dlci)
+static struct rfcomm_session *rfcomm_recv_disc(struct rfcomm_session *s,
+ u8 dlci)
{
int err = 0;
@@ -1120,9 +1301,9 @@ static int rfcomm_recv_disc(struct rfcomm_session *s, u8 dlci)
d->state = BT_CLOSED;
__rfcomm_dlc_close(d, err);
- } else
+ } else
rfcomm_send_dm(s, dlci);
-
+
} else {
rfcomm_send_ua(s, 0);
@@ -1131,47 +1312,52 @@ static int rfcomm_recv_disc(struct rfcomm_session *s, u8 dlci)
else
err = ECONNRESET;
- s->state = BT_CLOSED;
- rfcomm_session_close(s, err);
+ s = rfcomm_session_close(s, err);
}
-
- return 0;
-}
-
-static inline int rfcomm_check_link_mode(struct rfcomm_dlc *d)
-{
- struct sock *sk = d->session->sock->sk;
-
- if (d->link_mode & (RFCOMM_LM_ENCRYPT | RFCOMM_LM_SECURE)) {
- if (!hci_conn_encrypt(l2cap_pi(sk)->conn->hcon))
- return 1;
- } else if (d->link_mode & RFCOMM_LM_AUTH) {
- if (!hci_conn_auth(l2cap_pi(sk)->conn->hcon))
- return 1;
- }
-
- return 0;
+ return s;
}
-static void rfcomm_dlc_accept(struct rfcomm_dlc *d)
+void rfcomm_dlc_accept(struct rfcomm_dlc *d)
{
struct sock *sk = d->session->sock->sk;
+ struct l2cap_conn *conn = l2cap_pi(sk)->chan->conn;
BT_DBG("dlc %p", d);
rfcomm_send_ua(d->session, d->dlci);
+ rfcomm_dlc_clear_timer(d);
+
rfcomm_dlc_lock(d);
d->state = BT_CONNECTED;
d->state_change(d, 0);
rfcomm_dlc_unlock(d);
- if (d->link_mode & RFCOMM_LM_MASTER)
- hci_conn_switch_role(l2cap_pi(sk)->conn->hcon, 0x00);
+ if (d->role_switch)
+ hci_conn_switch_role(conn->hcon, 0x00);
rfcomm_send_msc(d->session, 1, d->dlci, d->v24_sig);
}
+static void rfcomm_check_accept(struct rfcomm_dlc *d)
+{
+ if (rfcomm_check_security(d)) {
+ if (d->defer_setup) {
+ set_bit(RFCOMM_DEFER_SETUP, &d->flags);
+ rfcomm_dlc_set_timer(d, RFCOMM_AUTH_TIMEOUT);
+
+ rfcomm_dlc_lock(d);
+ d->state = BT_CONNECT2;
+ d->state_change(d, 0);
+ rfcomm_dlc_unlock(d);
+ } else
+ rfcomm_dlc_accept(d);
+ } else {
+ set_bit(RFCOMM_AUTH_PENDING, &d->flags);
+ rfcomm_dlc_set_timer(d, RFCOMM_AUTH_TIMEOUT);
+ }
+}
+
static int rfcomm_recv_sabm(struct rfcomm_session *s, u8 dlci)
{
struct rfcomm_dlc *d;
@@ -1194,13 +1380,7 @@ static int rfcomm_recv_sabm(struct rfcomm_session *s, u8 dlci)
if (d) {
if (d->state == BT_OPEN) {
/* DLC was previously opened by PN request */
- if (rfcomm_check_link_mode(d)) {
- set_bit(RFCOMM_AUTH_PENDING, &d->flags);
- rfcomm_dlc_set_timer(d, RFCOMM_AUTH_TIMEOUT);
- return 0;
- }
-
- rfcomm_dlc_accept(d);
+ rfcomm_check_accept(d);
}
return 0;
}
@@ -1212,13 +1392,7 @@ static int rfcomm_recv_sabm(struct rfcomm_session *s, u8 dlci)
d->addr = __addr(s->initiator, dlci);
rfcomm_dlc_link(s, d);
- if (rfcomm_check_link_mode(d)) {
- set_bit(RFCOMM_AUTH_PENDING, &d->flags);
- rfcomm_dlc_set_timer(d, RFCOMM_AUTH_TIMEOUT);
- return 0;
- }
-
- rfcomm_dlc_accept(d);
+ rfcomm_check_accept(d);
} else {
rfcomm_send_dm(s, dlci);
}
@@ -1230,7 +1404,7 @@ static int rfcomm_apply_pn(struct rfcomm_dlc *d, int cr, struct rfcomm_pn *pn)
{
struct rfcomm_session *s = d->session;
- BT_DBG("dlc %p state %ld dlci %d mtu %d fc 0x%x credits %d",
+ BT_DBG("dlc %p state %ld dlci %d mtu %d fc 0x%x credits %d",
d, d->state, d->dlci, pn->mtu, pn->flow_ctrl, pn->credits);
if ((pn->flow_ctrl == 0xf0 && s->cfc != RFCOMM_CFC_DISABLED) ||
@@ -1247,7 +1421,7 @@ static int rfcomm_apply_pn(struct rfcomm_dlc *d, int cr, struct rfcomm_pn *pn)
d->priority = pn->priority;
- d->mtu = btohs(pn->mtu);
+ d->mtu = __le16_to_cpu(pn->mtu);
if (cr && d->mtu > s->mtu)
d->mtu = s->mtu;
@@ -1329,8 +1503,8 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_
return 0;
if (len == 1) {
- /* This is a request, return default settings */
- bit_rate = RFCOMM_RPN_BR_115200;
+ /* This is a request, return default (according to ETSI TS 07.10) settings */
+ bit_rate = RFCOMM_RPN_BR_9600;
data_bits = RFCOMM_RPN_DATA_8;
stop_bits = RFCOMM_RPN_STOP_1;
parity = RFCOMM_RPN_PARITY_NONE;
@@ -1345,9 +1519,9 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_
if (rpn->param_mask & cpu_to_le16(RFCOMM_RPN_PM_BITRATE)) {
bit_rate = rpn->bit_rate;
- if (bit_rate != RFCOMM_RPN_BR_115200) {
+ if (bit_rate > RFCOMM_RPN_BR_230400) {
BT_DBG("RPN bit rate mismatch 0x%x", bit_rate);
- bit_rate = RFCOMM_RPN_BR_115200;
+ bit_rate = RFCOMM_RPN_BR_9600;
rpn_mask ^= RFCOMM_RPN_PM_BITRATE;
}
}
@@ -1451,10 +1625,14 @@ static int rfcomm_recv_msc(struct rfcomm_session *s, int cr, struct sk_buff *skb
clear_bit(RFCOMM_TX_THROTTLED, &d->flags);
rfcomm_dlc_lock(d);
+
+ d->remote_v24_sig = msc->v24_sig;
+
if (d->modem_status)
d->modem_status(d, msc->v24_sig);
+
rfcomm_dlc_unlock(d);
-
+
rfcomm_send_msc(s, 0, dlci, msc->v24_sig);
d->mscex |= RFCOMM_MSCEX_RX;
@@ -1557,22 +1735,29 @@ drop:
return 0;
}
-static int rfcomm_recv_frame(struct rfcomm_session *s, struct sk_buff *skb)
+static struct rfcomm_session *rfcomm_recv_frame(struct rfcomm_session *s,
+ struct sk_buff *skb)
{
struct rfcomm_hdr *hdr = (void *) skb->data;
u8 type, dlci, fcs;
+ if (!s) {
+ /* no session, so free socket data */
+ kfree_skb(skb);
+ return s;
+ }
+
dlci = __get_dlci(hdr->addr);
type = __get_type(hdr->ctrl);
/* Trim FCS */
skb->len--; skb->tail--;
- fcs = *(u8 *) skb->tail;
+ fcs = *(u8 *)skb_tail_pointer(skb);
if (__check_fcs(skb->data, type, fcs)) {
BT_ERR("bad checksum in packet");
kfree_skb(skb);
- return -EILSEQ;
+ return s;
}
if (__test_ea(hdr->len))
@@ -1588,47 +1773,51 @@ static int rfcomm_recv_frame(struct rfcomm_session *s, struct sk_buff *skb)
case RFCOMM_DISC:
if (__test_pf(hdr->ctrl))
- rfcomm_recv_disc(s, dlci);
+ s = rfcomm_recv_disc(s, dlci);
break;
case RFCOMM_UA:
if (__test_pf(hdr->ctrl))
- rfcomm_recv_ua(s, dlci);
+ s = rfcomm_recv_ua(s, dlci);
break;
case RFCOMM_DM:
- rfcomm_recv_dm(s, dlci);
+ s = rfcomm_recv_dm(s, dlci);
break;
case RFCOMM_UIH:
- if (dlci)
- return rfcomm_recv_data(s, dlci, __test_pf(hdr->ctrl), skb);
-
+ if (dlci) {
+ rfcomm_recv_data(s, dlci, __test_pf(hdr->ctrl), skb);
+ return s;
+ }
rfcomm_recv_mcc(s, skb);
break;
default:
- BT_ERR("Unknown packet type 0x%02x\n", type);
+ BT_ERR("Unknown packet type 0x%02x", type);
break;
}
kfree_skb(skb);
- return 0;
+ return s;
}
/* ---- Connection and data processing ---- */
static void rfcomm_process_connect(struct rfcomm_session *s)
{
- struct rfcomm_dlc *d;
- struct list_head *p, *n;
+ struct rfcomm_dlc *d, *n;
BT_DBG("session %p state %ld", s, s->state);
- list_for_each_safe(p, n, &s->dlcs) {
- d = list_entry(p, struct rfcomm_dlc, list);
+ list_for_each_entry_safe(d, n, &s->dlcs, list) {
if (d->state == BT_CONFIG) {
d->mtu = s->mtu;
- rfcomm_send_pn(s, 1, d);
+ if (rfcomm_check_security(d)) {
+ rfcomm_send_pn(s, 1, d);
+ } else {
+ set_bit(RFCOMM_AUTH_PENDING, &d->flags);
+ rfcomm_dlc_set_timer(d, RFCOMM_AUTH_TIMEOUT);
+ }
}
}
}
@@ -1636,23 +1825,23 @@ static void rfcomm_process_connect(struct rfcomm_session *s)
/* Send data queued for the DLC.
* Return number of frames left in the queue.
*/
-static inline int rfcomm_process_tx(struct rfcomm_dlc *d)
+static int rfcomm_process_tx(struct rfcomm_dlc *d)
{
struct sk_buff *skb;
int err;
- BT_DBG("dlc %p state %ld cfc %d rx_credits %d tx_credits %d",
+ BT_DBG("dlc %p state %ld cfc %d rx_credits %d tx_credits %d",
d, d->state, d->cfc, d->rx_credits, d->tx_credits);
/* Send pending MSC */
if (test_and_clear_bit(RFCOMM_MSC_PENDING, &d->flags))
- rfcomm_send_msc(d->session, 1, d->dlci, d->v24_sig);
+ rfcomm_send_msc(d->session, 1, d->dlci, d->v24_sig);
if (d->cfc) {
- /* CFC enabled.
+ /* CFC enabled.
* Give them some credits */
if (!test_bit(RFCOMM_RX_THROTTLED, &d->flags) &&
- d->rx_credits <= (d->cfc >> 2)) {
+ d->rx_credits <= (d->cfc >> 2)) {
rfcomm_send_credits(d->session, d->addr, d->cfc - d->rx_credits);
d->rx_credits = d->cfc;
}
@@ -1684,46 +1873,64 @@ static inline int rfcomm_process_tx(struct rfcomm_dlc *d)
return skb_queue_len(&d->tx_queue);
}
-static inline void rfcomm_process_dlcs(struct rfcomm_session *s)
+static void rfcomm_process_dlcs(struct rfcomm_session *s)
{
- struct rfcomm_dlc *d;
- struct list_head *p, *n;
+ struct rfcomm_dlc *d, *n;
BT_DBG("session %p state %ld", s, s->state);
- list_for_each_safe(p, n, &s->dlcs) {
- d = list_entry(p, struct rfcomm_dlc, list);
-
+ list_for_each_entry_safe(d, n, &s->dlcs, list) {
if (test_bit(RFCOMM_TIMED_OUT, &d->flags)) {
__rfcomm_dlc_close(d, ETIMEDOUT);
continue;
}
+ if (test_bit(RFCOMM_ENC_DROP, &d->flags)) {
+ __rfcomm_dlc_close(d, ECONNREFUSED);
+ continue;
+ }
+
if (test_and_clear_bit(RFCOMM_AUTH_ACCEPT, &d->flags)) {
rfcomm_dlc_clear_timer(d);
- rfcomm_dlc_accept(d);
- if (d->link_mode & RFCOMM_LM_SECURE) {
- struct sock *sk = s->sock->sk;
- hci_conn_change_link_key(l2cap_pi(sk)->conn->hcon);
+ if (d->out) {
+ rfcomm_send_pn(s, 1, d);
+ rfcomm_dlc_set_timer(d, RFCOMM_CONN_TIMEOUT);
+ } else {
+ if (d->defer_setup) {
+ set_bit(RFCOMM_DEFER_SETUP, &d->flags);
+ rfcomm_dlc_set_timer(d, RFCOMM_AUTH_TIMEOUT);
+
+ rfcomm_dlc_lock(d);
+ d->state = BT_CONNECT2;
+ d->state_change(d, 0);
+ rfcomm_dlc_unlock(d);
+ } else
+ rfcomm_dlc_accept(d);
}
continue;
} else if (test_and_clear_bit(RFCOMM_AUTH_REJECT, &d->flags)) {
rfcomm_dlc_clear_timer(d);
- rfcomm_send_dm(s, d->dlci);
+ if (!d->out)
+ rfcomm_send_dm(s, d->dlci);
+ else
+ d->state = BT_CLOSED;
__rfcomm_dlc_close(d, ECONNREFUSED);
continue;
}
+ if (test_bit(RFCOMM_SEC_PENDING, &d->flags))
+ continue;
+
if (test_bit(RFCOMM_TX_THROTTLED, &s->flags))
continue;
if ((d->state == BT_CONNECTED || d->state == BT_DISCONN) &&
- d->mscex == RFCOMM_MSCEX_OK)
+ d->mscex == RFCOMM_MSCEX_OK)
rfcomm_process_tx(d);
}
}
-static inline void rfcomm_process_rx(struct rfcomm_session *s)
+static struct rfcomm_session *rfcomm_process_rx(struct rfcomm_session *s)
{
struct socket *sock = s->sock;
struct sock *sk = sock->sk;
@@ -1734,138 +1941,112 @@ static inline void rfcomm_process_rx(struct rfcomm_session *s)
/* Get data directly from socket receive queue without copying it. */
while ((skb = skb_dequeue(&sk->sk_receive_queue))) {
skb_orphan(skb);
- rfcomm_recv_frame(s, skb);
+ if (!skb_linearize(skb) && sk->sk_state != BT_CLOSED) {
+ s = rfcomm_recv_frame(s, skb);
+ if (!s)
+ break;
+ } else {
+ kfree_skb(skb);
+ }
}
- if (sk->sk_state == BT_CLOSED) {
- if (!s->initiator)
- rfcomm_session_put(s);
+ if (s && (sk->sk_state == BT_CLOSED))
+ s = rfcomm_session_close(s, sk->sk_err);
- rfcomm_session_close(s, sk->sk_err);
- }
+ return s;
}
-static inline void rfcomm_accept_connection(struct rfcomm_session *s)
+static void rfcomm_accept_connection(struct rfcomm_session *s)
{
struct socket *sock = s->sock, *nsock;
int err;
/* Fast check for a new connection.
- * Avoids unnesesary socket allocations. */
+ * Avoids unnecessary socket allocations.
+ */
if (list_empty(&bt_sk(sock->sk)->accept_q))
return;
BT_DBG("session %p", s);
- if (sock_create_lite(PF_BLUETOOTH, sock->type, BTPROTO_L2CAP, &nsock))
+ err = kernel_accept(sock, &nsock, O_NONBLOCK);
+ if (err < 0)
return;
- nsock->ops = sock->ops;
-
- __module_get(nsock->ops->owner);
-
- err = sock->ops->accept(sock, nsock, O_NONBLOCK);
- if (err < 0) {
- sock_release(nsock);
- return;
- }
-
/* Set our callbacks */
nsock->sk->sk_data_ready = rfcomm_l2data_ready;
nsock->sk->sk_state_change = rfcomm_l2state_change;
s = rfcomm_session_add(nsock, BT_OPEN);
if (s) {
- rfcomm_session_hold(s);
-
/* We should adjust MTU on incoming sessions.
* L2CAP MTU minus UIH header and FCS. */
- s->mtu = min(l2cap_pi(nsock->sk)->omtu, l2cap_pi(nsock->sk)->imtu) - 5;
+ s->mtu = min(l2cap_pi(nsock->sk)->chan->omtu,
+ l2cap_pi(nsock->sk)->chan->imtu) - 5;
- rfcomm_schedule(RFCOMM_SCHED_RX);
+ rfcomm_schedule();
} else
sock_release(nsock);
}
-static inline void rfcomm_check_connection(struct rfcomm_session *s)
+static struct rfcomm_session *rfcomm_check_connection(struct rfcomm_session *s)
{
struct sock *sk = s->sock->sk;
BT_DBG("%p state %ld", s, s->state);
- switch(sk->sk_state) {
+ switch (sk->sk_state) {
case BT_CONNECTED:
s->state = BT_CONNECT;
/* We can adjust MTU on outgoing sessions.
* L2CAP MTU minus UIH header and FCS. */
- s->mtu = min(l2cap_pi(sk)->omtu, l2cap_pi(sk)->imtu) - 5;
+ s->mtu = min(l2cap_pi(sk)->chan->omtu, l2cap_pi(sk)->chan->imtu) - 5;
rfcomm_send_sabm(s, 0);
break;
case BT_CLOSED:
- s->state = BT_CLOSED;
- rfcomm_session_close(s, sk->sk_err);
+ s = rfcomm_session_close(s, sk->sk_err);
break;
}
+ return s;
}
-static inline void rfcomm_process_sessions(void)
+static void rfcomm_process_sessions(void)
{
- struct list_head *p, *n;
+ struct rfcomm_session *s, *n;
rfcomm_lock();
- list_for_each_safe(p, n, &session_list) {
- struct rfcomm_session *s;
- s = list_entry(p, struct rfcomm_session, list);
-
- if (s->state == BT_LISTEN) {
- rfcomm_accept_connection(s);
+ list_for_each_entry_safe(s, n, &session_list, list) {
+ if (test_and_clear_bit(RFCOMM_TIMED_OUT, &s->flags)) {
+ s->state = BT_DISCONN;
+ rfcomm_send_disc(s, 0);
continue;
}
- rfcomm_session_hold(s);
-
switch (s->state) {
+ case BT_LISTEN:
+ rfcomm_accept_connection(s);
+ continue;
+
case BT_BOUND:
- rfcomm_check_connection(s);
+ s = rfcomm_check_connection(s);
break;
default:
- rfcomm_process_rx(s);
+ s = rfcomm_process_rx(s);
break;
}
- rfcomm_process_dlcs(s);
-
- rfcomm_session_put(s);
+ if (s)
+ rfcomm_process_dlcs(s);
}
rfcomm_unlock();
}
-static void rfcomm_worker(void)
-{
- BT_DBG("");
-
- while (!atomic_read(&terminate)) {
- if (!test_bit(RFCOMM_SCHED_WAKEUP, &rfcomm_event)) {
- /* No pending events. Let's sleep.
- * Incoming connections and data will wake us up. */
- set_current_state(TASK_INTERRUPTIBLE);
- schedule();
- }
-
- /* Process stuff */
- clear_bit(RFCOMM_SCHED_WAKEUP, &rfcomm_event);
- rfcomm_process_sessions();
- }
- set_current_state(TASK_RUNNING);
- return;
-}
-
static int rfcomm_add_listener(bdaddr_t *ba)
{
struct sockaddr_l2 addr;
@@ -1876,7 +2057,7 @@ static int rfcomm_add_listener(bdaddr_t *ba)
/* Create socket */
err = rfcomm_l2sock_create(&sock);
- if (err < 0) {
+ if (err < 0) {
BT_ERR("Create socket failed %d", err);
return err;
}
@@ -1884,8 +2065,10 @@ static int rfcomm_add_listener(bdaddr_t *ba)
/* Bind socket */
bacpy(&addr.l2_bdaddr, ba);
addr.l2_family = AF_BLUETOOTH;
- addr.l2_psm = htobs(RFCOMM_PSM);
- err = sock->ops->bind(sock, (struct sockaddr *) &addr, sizeof(addr));
+ addr.l2_psm = cpu_to_le16(L2CAP_PSM_RFCOMM);
+ addr.l2_cid = 0;
+ addr.l2_bdaddr_type = BDADDR_BREDR;
+ err = kernel_bind(sock, (struct sockaddr_unsized *)&addr, sizeof(addr));
if (err < 0) {
BT_ERR("Bind failed %d", err);
goto failed;
@@ -1894,11 +2077,12 @@ static int rfcomm_add_listener(bdaddr_t *ba)
/* Set L2CAP options */
sk = sock->sk;
lock_sock(sk);
- l2cap_pi(sk)->imtu = l2cap_mtu;
+ /* Set MTU to 0 so L2CAP can auto select the MTU */
+ l2cap_pi(sk)->chan->imtu = 0;
release_sock(sk);
/* Start listening on the socket */
- err = sock->ops->listen(sock, 10);
+ err = kernel_listen(sock, 10);
if (err) {
BT_ERR("Listen failed %d", err);
goto failed;
@@ -1906,10 +2090,11 @@ static int rfcomm_add_listener(bdaddr_t *ba)
/* Add listening session */
s = rfcomm_session_add(sock, BT_LISTEN);
- if (!s)
+ if (!s) {
+ err = -ENOMEM;
goto failed;
+ }
- rfcomm_session_hold(s);
return 0;
failed:
sock_release(sock);
@@ -1918,78 +2103,42 @@ failed:
static void rfcomm_kill_listener(void)
{
- struct rfcomm_session *s;
- struct list_head *p, *n;
+ struct rfcomm_session *s, *n;
BT_DBG("");
- list_for_each_safe(p, n, &session_list) {
- s = list_entry(p, struct rfcomm_session, list);
+ list_for_each_entry_safe(s, n, &session_list, list)
rfcomm_session_del(s);
- }
}
static int rfcomm_run(void *unused)
{
- rfcomm_thread = current;
-
- atomic_inc(&running);
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+ BT_DBG("");
- daemonize("krfcommd");
set_user_nice(current, -10);
- current->flags |= PF_NOFREEZE;
-
- BT_DBG("");
rfcomm_add_listener(BDADDR_ANY);
- rfcomm_worker();
-
- rfcomm_kill_listener();
-
- atomic_dec(&running);
- return 0;
-}
-
-static void rfcomm_auth_cfm(struct hci_conn *conn, u8 status)
-{
- struct rfcomm_session *s;
- struct rfcomm_dlc *d;
- struct list_head *p, *n;
-
- BT_DBG("conn %p status 0x%02x", conn, status);
-
- s = rfcomm_session_get(&conn->hdev->bdaddr, &conn->dst);
- if (!s)
- return;
+ add_wait_queue(&rfcomm_wq, &wait);
+ while (!kthread_should_stop()) {
- rfcomm_session_hold(s);
-
- list_for_each_safe(p, n, &s->dlcs) {
- d = list_entry(p, struct rfcomm_dlc, list);
-
- if (d->link_mode & (RFCOMM_LM_ENCRYPT | RFCOMM_LM_SECURE))
- continue;
-
- if (!test_and_clear_bit(RFCOMM_AUTH_PENDING, &d->flags))
- continue;
+ /* Process stuff */
+ rfcomm_process_sessions();
- if (!status)
- set_bit(RFCOMM_AUTH_ACCEPT, &d->flags);
- else
- set_bit(RFCOMM_AUTH_REJECT, &d->flags);
+ wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
}
+ remove_wait_queue(&rfcomm_wq, &wait);
- rfcomm_session_put(s);
+ rfcomm_kill_listener();
- rfcomm_schedule(RFCOMM_SCHED_AUTH);
+ return 0;
}
-static void rfcomm_encrypt_cfm(struct hci_conn *conn, u8 status, u8 encrypt)
+static void rfcomm_security_cfm(struct hci_conn *conn, u8 status, u8 encrypt)
{
struct rfcomm_session *s;
- struct rfcomm_dlc *d;
- struct list_head *p, *n;
+ struct rfcomm_dlc *d, *n;
BT_DBG("conn %p status 0x%02x encrypt 0x%02x", conn, status, encrypt);
@@ -1997,99 +2146,123 @@ static void rfcomm_encrypt_cfm(struct hci_conn *conn, u8 status, u8 encrypt)
if (!s)
return;
- rfcomm_session_hold(s);
+ list_for_each_entry_safe(d, n, &s->dlcs, list) {
+ if (test_and_clear_bit(RFCOMM_SEC_PENDING, &d->flags)) {
+ rfcomm_dlc_clear_timer(d);
+ if (status || encrypt == 0x00) {
+ set_bit(RFCOMM_ENC_DROP, &d->flags);
+ continue;
+ }
+ }
- list_for_each_safe(p, n, &s->dlcs) {
- d = list_entry(p, struct rfcomm_dlc, list);
+ if (d->state == BT_CONNECTED && !status && encrypt == 0x00) {
+ if (d->sec_level == BT_SECURITY_MEDIUM) {
+ set_bit(RFCOMM_SEC_PENDING, &d->flags);
+ rfcomm_dlc_set_timer(d, RFCOMM_AUTH_TIMEOUT);
+ continue;
+ } else if (d->sec_level == BT_SECURITY_HIGH ||
+ d->sec_level == BT_SECURITY_FIPS) {
+ set_bit(RFCOMM_ENC_DROP, &d->flags);
+ continue;
+ }
+ }
if (!test_and_clear_bit(RFCOMM_AUTH_PENDING, &d->flags))
continue;
- if (!status && encrypt)
+ if (!status && hci_conn_check_secure(conn, d->sec_level))
set_bit(RFCOMM_AUTH_ACCEPT, &d->flags);
else
set_bit(RFCOMM_AUTH_REJECT, &d->flags);
}
- rfcomm_session_put(s);
-
- rfcomm_schedule(RFCOMM_SCHED_AUTH);
+ rfcomm_schedule();
}
static struct hci_cb rfcomm_cb = {
.name = "RFCOMM",
- .auth_cfm = rfcomm_auth_cfm,
- .encrypt_cfm = rfcomm_encrypt_cfm
+ .security_cfm = rfcomm_security_cfm
};
-static ssize_t rfcomm_dlc_sysfs_show(struct class *dev, char *buf)
+static int rfcomm_dlc_debugfs_show(struct seq_file *f, void *x)
{
struct rfcomm_session *s;
- struct list_head *pp, *p;
- char *str = buf;
rfcomm_lock();
- list_for_each(p, &session_list) {
- s = list_entry(p, struct rfcomm_session, list);
- list_for_each(pp, &s->dlcs) {
- struct sock *sk = s->sock->sk;
- struct rfcomm_dlc *d = list_entry(pp, struct rfcomm_dlc, list);
-
- str += sprintf(str, "%s %s %ld %d %d %d %d\n",
- batostr(&bt_sk(sk)->src), batostr(&bt_sk(sk)->dst),
- d->state, d->dlci, d->mtu, d->rx_credits, d->tx_credits);
+ list_for_each_entry(s, &session_list, list) {
+ struct l2cap_chan *chan = l2cap_pi(s->sock->sk)->chan;
+ struct rfcomm_dlc *d;
+ list_for_each_entry(d, &s->dlcs, list) {
+ seq_printf(f, "%pMR %pMR %ld %d %d %d %d\n",
+ &chan->src, &chan->dst,
+ d->state, d->dlci, d->mtu,
+ d->rx_credits, d->tx_credits);
}
}
rfcomm_unlock();
- return (str - buf);
+ return 0;
}
-static CLASS_ATTR(rfcomm_dlc, S_IRUGO, rfcomm_dlc_sysfs_show, NULL);
+DEFINE_SHOW_ATTRIBUTE(rfcomm_dlc_debugfs);
+
+static struct dentry *rfcomm_dlc_debugfs;
/* ---- Initialization ---- */
static int __init rfcomm_init(void)
{
- l2cap_load();
+ int err;
hci_register_cb(&rfcomm_cb);
- kernel_thread(rfcomm_run, NULL, CLONE_KERNEL);
-
- if (class_create_file(bt_class, &class_attr_rfcomm_dlc) < 0)
- BT_ERR("Failed to create RFCOMM info file");
+ rfcomm_thread = kthread_run(rfcomm_run, NULL, "krfcommd");
+ if (IS_ERR(rfcomm_thread)) {
+ err = PTR_ERR(rfcomm_thread);
+ goto unregister;
+ }
- rfcomm_init_sockets();
+ err = rfcomm_init_ttys();
+ if (err < 0)
+ goto stop;
-#ifdef CONFIG_BT_RFCOMM_TTY
- rfcomm_init_ttys();
-#endif
+ err = rfcomm_init_sockets();
+ if (err < 0)
+ goto cleanup;
BT_INFO("RFCOMM ver %s", VERSION);
+ if (IS_ERR_OR_NULL(bt_debugfs))
+ return 0;
+
+ rfcomm_dlc_debugfs = debugfs_create_file("rfcomm_dlc", 0444,
+ bt_debugfs, NULL,
+ &rfcomm_dlc_debugfs_fops);
+
return 0;
+
+cleanup:
+ rfcomm_cleanup_ttys();
+
+stop:
+ kthread_stop(rfcomm_thread);
+
+unregister:
+ hci_unregister_cb(&rfcomm_cb);
+
+ return err;
}
static void __exit rfcomm_exit(void)
{
- class_remove_file(bt_class, &class_attr_rfcomm_dlc);
+ debugfs_remove(rfcomm_dlc_debugfs);
hci_unregister_cb(&rfcomm_cb);
- /* Terminate working thread.
- * ie. Set terminate flag and wake it up */
- atomic_inc(&terminate);
- rfcomm_schedule(RFCOMM_SCHED_STATE);
-
- /* Wait until thread is running */
- while (atomic_read(&running))
- schedule();
+ kthread_stop(rfcomm_thread);
-#ifdef CONFIG_BT_RFCOMM_TTY
rfcomm_cleanup_ttys();
-#endif
rfcomm_cleanup_sockets();
}
@@ -2103,10 +2276,10 @@ MODULE_PARM_DESC(disable_cfc, "Disable credit based flow control");
module_param(channel_mtu, int, 0644);
MODULE_PARM_DESC(channel_mtu, "Default MTU for the RFCOMM channel");
-module_param(l2cap_mtu, uint, 0644);
-MODULE_PARM_DESC(l2cap_mtu, "Default MTU for the L2CAP connection");
+module_param(l2cap_ertm, bool, 0644);
+MODULE_PARM_DESC(l2cap_ertm, "Use L2CAP ERTM mode for connection");
-MODULE_AUTHOR("Maxim Krasnyansky <maxk@qualcomm.com>, Marcel Holtmann <marcel@holtmann.org>");
+MODULE_AUTHOR("Marcel Holtmann <marcel@holtmann.org>");
MODULE_DESCRIPTION("Bluetooth RFCOMM ver " VERSION);
MODULE_VERSION(VERSION);
MODULE_LICENSE("GPL");
diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c
index 544d65b7baa7..be6639cd6f59 100644
--- a/net/bluetooth/rfcomm/sock.c
+++ b/net/bluetooth/rfcomm/sock.c
@@ -1,4 +1,4 @@
-/*
+/*
RFCOMM implementation for Linux Bluetooth stack (BlueZ).
Copyright (C) 2002 Maxim Krasnyansky <maxk@qualcomm.com>
Copyright (C) 2002 Marcel Holtmann <marcel@holtmann.org>
@@ -11,56 +11,33 @@
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
- CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
- WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
- COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+ ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+ COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
SOFTWARE IS DISCLAIMED.
*/
/*
* RFCOMM sockets.
- *
- * $Id: sock.c,v 1.24 2002/10/03 01:00:34 maxk Exp $
*/
-
-#include <linux/module.h>
-
-#include <linux/types.h>
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/poll.h>
-#include <linux/fcntl.h>
-#include <linux/init.h>
-#include <linux/interrupt.h>
-#include <linux/socket.h>
-#include <linux/skbuff.h>
-#include <linux/list.h>
-#include <linux/device.h>
-#include <net/sock.h>
-
-#include <asm/system.h>
-#include <asm/uaccess.h>
+#include <linux/compat.h>
+#include <linux/export.h>
+#include <linux/debugfs.h>
+#include <linux/sched/signal.h>
#include <net/bluetooth/bluetooth.h>
#include <net/bluetooth/hci_core.h>
#include <net/bluetooth/l2cap.h>
#include <net/bluetooth/rfcomm.h>
-#ifndef CONFIG_BT_RFCOMM_DEBUG
-#undef BT_DBG
-#define BT_DBG(D...)
-#endif
-
static const struct proto_ops rfcomm_sock_ops;
static struct bt_sock_list rfcomm_sk_list = {
- .lock = RW_LOCK_UNLOCKED
+ .lock = __RW_LOCK_UNLOCKED(rfcomm_sk_list.lock)
};
static void rfcomm_sock_close(struct sock *sk);
@@ -78,7 +55,7 @@ static void rfcomm_sk_data_ready(struct rfcomm_dlc *d, struct sk_buff *skb)
atomic_add(skb->len, &sk->sk_rmem_alloc);
skb_queue_tail(&sk->sk_receive_queue, skb);
- sk->sk_data_ready(sk, skb->len);
+ sk->sk_data_ready(sk);
if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
rfcomm_dlc_throttle(d);
@@ -87,12 +64,13 @@ static void rfcomm_sk_data_ready(struct rfcomm_dlc *d, struct sk_buff *skb)
static void rfcomm_sk_state_change(struct rfcomm_dlc *d, int err)
{
struct sock *sk = d->owner, *parent;
+
if (!sk)
return;
BT_DBG("dlc %p state %ld err %d", d, d->state, err);
- bh_lock_sock(sk);
+ lock_sock(sk);
if (err)
sk->sk_err = err;
@@ -105,14 +83,15 @@ static void rfcomm_sk_state_change(struct rfcomm_dlc *d, int err)
sock_set_flag(sk, SOCK_ZAPPED);
bt_accept_unlink(sk);
}
- parent->sk_data_ready(parent, 0);
+ parent->sk_data_ready(parent);
} else {
if (d->state == BT_CONNECTED)
- rfcomm_session_getaddr(d->session, &bt_sk(sk)->src, NULL);
+ rfcomm_session_getaddr(d->session,
+ &rfcomm_pi(sk)->src, NULL);
sk->sk_state_change(sk);
}
- bh_unlock_sock(sk);
+ release_sock(sk);
if (parent && sock_flag(sk, SOCK_ZAPPED)) {
/* We have to drop DLC lock here, otherwise
@@ -124,55 +103,51 @@ static void rfcomm_sk_state_change(struct rfcomm_dlc *d, int err)
}
/* ---- Socket functions ---- */
-static struct sock *__rfcomm_get_sock_by_addr(u8 channel, bdaddr_t *src)
+static struct sock *__rfcomm_get_listen_sock_by_addr(u8 channel, bdaddr_t *src)
{
struct sock *sk = NULL;
- struct hlist_node *node;
- sk_for_each(sk, node, &rfcomm_sk_list.head) {
- if (rfcomm_pi(sk)->channel == channel &&
- !bacmp(&bt_sk(sk)->src, src))
+ sk_for_each(sk, &rfcomm_sk_list.head) {
+ if (rfcomm_pi(sk)->channel != channel)
+ continue;
+
+ if (bacmp(&rfcomm_pi(sk)->src, src))
+ continue;
+
+ if (sk->sk_state == BT_BOUND || sk->sk_state == BT_LISTEN)
break;
}
- return node ? sk : NULL;
+ return sk ? sk : NULL;
}
/* Find socket with channel and source bdaddr.
* Returns closest match.
*/
-static struct sock *__rfcomm_get_sock_by_channel(int state, u8 channel, bdaddr_t *src)
+static struct sock *rfcomm_get_sock_by_channel(int state, u8 channel, bdaddr_t *src)
{
struct sock *sk = NULL, *sk1 = NULL;
- struct hlist_node *node;
- sk_for_each(sk, node, &rfcomm_sk_list.head) {
+ read_lock(&rfcomm_sk_list.lock);
+
+ sk_for_each(sk, &rfcomm_sk_list.head) {
if (state && sk->sk_state != state)
continue;
if (rfcomm_pi(sk)->channel == channel) {
/* Exact match. */
- if (!bacmp(&bt_sk(sk)->src, src))
+ if (!bacmp(&rfcomm_pi(sk)->src, src))
break;
/* Closest match */
- if (!bacmp(&bt_sk(sk)->src, BDADDR_ANY))
+ if (!bacmp(&rfcomm_pi(sk)->src, BDADDR_ANY))
sk1 = sk;
}
}
- return node ? sk : sk1;
-}
-/* Find socket with given address (channel, src).
- * Returns locked socket */
-static inline struct sock *rfcomm_get_sock_by_channel(int state, u8 channel, bdaddr_t *src)
-{
- struct sock *s;
- read_lock(&rfcomm_sk_list.lock);
- s = __rfcomm_get_sock_by_channel(state, channel, src);
- if (s) bh_lock_sock(s);
read_unlock(&rfcomm_sk_list.lock);
- return s;
+
+ return sk ? sk : sk1;
}
static void rfcomm_sock_destruct(struct sock *sk)
@@ -219,7 +194,7 @@ static void rfcomm_sock_kill(struct sock *sk)
if (!sock_flag(sk, SOCK_ZAPPED) || sk->sk_socket)
return;
- BT_DBG("sk %p state %d refcnt %d", sk, sk->sk_state, atomic_read(&sk->sk_refcnt));
+ BT_DBG("sk %p state %d refcnt %d", sk, sk->sk_state, refcount_read(&sk->sk_refcnt));
/* Kill poor orphan */
bt_sock_unlink(&rfcomm_sk_list, sk);
@@ -243,6 +218,7 @@ static void __rfcomm_sock_close(struct sock *sk)
case BT_CONFIG:
case BT_CONNECTED:
rfcomm_dlc_close(d, 0);
+ fallthrough;
default:
sock_set_flag(sk, SOCK_ZAPPED);
@@ -268,12 +244,22 @@ static void rfcomm_sock_init(struct sock *sk, struct sock *parent)
if (parent) {
sk->sk_type = parent->sk_type;
- pi->link_mode = rfcomm_pi(parent)->link_mode;
+ pi->dlc->defer_setup = test_bit(BT_SK_DEFER_SETUP,
+ &bt_sk(parent)->flags);
+
+ pi->sec_level = rfcomm_pi(parent)->sec_level;
+ pi->role_switch = rfcomm_pi(parent)->role_switch;
+
+ security_sk_clone(parent, sk);
} else {
- pi->link_mode = 0;
+ pi->dlc->defer_setup = 0;
+
+ pi->sec_level = BT_SECURITY_LOW;
+ pi->role_switch = 0;
}
- pi->dlc->link_mode = pi->link_mode;
+ pi->dlc->sec_level = pi->sec_level;
+ pi->dlc->role_switch = pi->role_switch;
}
static struct proto rfcomm_proto = {
@@ -282,21 +268,19 @@ static struct proto rfcomm_proto = {
.obj_size = sizeof(struct rfcomm_pinfo)
};
-static struct sock *rfcomm_sock_alloc(struct socket *sock, int proto, gfp_t prio)
+static struct sock *rfcomm_sock_alloc(struct net *net, struct socket *sock,
+ int proto, gfp_t prio, int kern)
{
struct rfcomm_dlc *d;
struct sock *sk;
- sk = sk_alloc(PF_BLUETOOTH, prio, &rfcomm_proto, 1);
- if (!sk)
+ d = rfcomm_dlc_alloc(prio);
+ if (!d)
return NULL;
- sock_init_data(sock, sk);
- INIT_LIST_HEAD(&bt_sk(sk)->accept_q);
-
- d = rfcomm_dlc_alloc(prio);
- if (!d) {
- sk_free(sk);
+ sk = bt_sock_alloc(net, sock, &rfcomm_proto, proto, prio, kern);
+ if (!sk) {
+ rfcomm_dlc_free(d);
return NULL;
}
@@ -309,13 +293,8 @@ static struct sock *rfcomm_sock_alloc(struct socket *sock, int proto, gfp_t prio
sk->sk_destruct = rfcomm_sock_destruct;
sk->sk_sndtimeo = RFCOMM_CONN_TIMEOUT;
- sk->sk_sndbuf = RFCOMM_MAX_CREDITS * RFCOMM_DEFAULT_MTU * 10;
- sk->sk_rcvbuf = RFCOMM_MAX_CREDITS * RFCOMM_DEFAULT_MTU * 10;
-
- sock_reset_flag(sk, SOCK_ZAPPED);
-
- sk->sk_protocol = proto;
- sk->sk_state = BT_OPEN;
+ sk->sk_sndbuf = RFCOMM_MAX_CREDITS * RFCOMM_DEFAULT_MTU * 10;
+ sk->sk_rcvbuf = RFCOMM_MAX_CREDITS * RFCOMM_DEFAULT_MTU * 10;
bt_sock_link(&rfcomm_sk_list, sk);
@@ -323,7 +302,8 @@ static struct sock *rfcomm_sock_alloc(struct socket *sock, int proto, gfp_t prio
return sk;
}
-static int rfcomm_sock_create(struct socket *sock, int protocol)
+static int rfcomm_sock_create(struct net *net, struct socket *sock,
+ int protocol, int kern)
{
struct sock *sk;
@@ -336,7 +316,7 @@ static int rfcomm_sock_create(struct socket *sock, int protocol)
sock->ops = &rfcomm_sock_ops;
- sk = rfcomm_sock_alloc(sock, protocol, GFP_ATOMIC);
+ sk = rfcomm_sock_alloc(net, sock, protocol, GFP_ATOMIC, kern);
if (!sk)
return -ENOMEM;
@@ -344,17 +324,22 @@ static int rfcomm_sock_create(struct socket *sock, int protocol)
return 0;
}
-static int rfcomm_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
+static int rfcomm_sock_bind(struct socket *sock, struct sockaddr_unsized *addr, int addr_len)
{
- struct sockaddr_rc *sa = (struct sockaddr_rc *) addr;
+ struct sockaddr_rc sa;
struct sock *sk = sock->sk;
- int err = 0;
-
- BT_DBG("sk %p %s", sk, batostr(&sa->rc_bdaddr));
+ int len, err = 0;
- if (!addr || addr->sa_family != AF_BLUETOOTH)
+ if (!addr || addr_len < offsetofend(struct sockaddr, sa_family) ||
+ addr->sa_family != AF_BLUETOOTH)
return -EINVAL;
+ memset(&sa, 0, sizeof(sa));
+ len = min_t(unsigned int, sizeof(sa), addr_len);
+ memcpy(&sa, addr, len);
+
+ BT_DBG("sk %p %pMR", sk, &sa.rc_bdaddr);
+
lock_sock(sk);
if (sk->sk_state != BT_OPEN) {
@@ -367,25 +352,27 @@ static int rfcomm_sock_bind(struct socket *sock, struct sockaddr *addr, int addr
goto done;
}
- write_lock_bh(&rfcomm_sk_list.lock);
+ write_lock(&rfcomm_sk_list.lock);
- if (sa->rc_channel && __rfcomm_get_sock_by_addr(sa->rc_channel, &sa->rc_bdaddr)) {
+ if (sa.rc_channel &&
+ __rfcomm_get_listen_sock_by_addr(sa.rc_channel, &sa.rc_bdaddr)) {
err = -EADDRINUSE;
} else {
/* Save source address */
- bacpy(&bt_sk(sk)->src, &sa->rc_bdaddr);
- rfcomm_pi(sk)->channel = sa->rc_channel;
+ bacpy(&rfcomm_pi(sk)->src, &sa.rc_bdaddr);
+ rfcomm_pi(sk)->channel = sa.rc_channel;
sk->sk_state = BT_BOUND;
}
- write_unlock_bh(&rfcomm_sk_list.lock);
+ write_unlock(&rfcomm_sk_list.lock);
done:
release_sock(sk);
return err;
}
-static int rfcomm_sock_connect(struct socket *sock, struct sockaddr *addr, int alen, int flags)
+static int rfcomm_sock_connect(struct socket *sock, struct sockaddr_unsized *addr,
+ int alen, int flags)
{
struct sockaddr_rc *sa = (struct sockaddr_rc *) addr;
struct sock *sk = sock->sk;
@@ -394,9 +381,11 @@ static int rfcomm_sock_connect(struct socket *sock, struct sockaddr *addr, int a
BT_DBG("sk %p", sk);
- if (addr->sa_family != AF_BLUETOOTH || alen < sizeof(struct sockaddr_rc))
+ if (alen < sizeof(struct sockaddr_rc) ||
+ addr->sa_family != AF_BLUETOOTH)
return -EINVAL;
+ sock_hold(sk);
lock_sock(sk);
if (sk->sk_state != BT_OPEN && sk->sk_state != BT_BOUND) {
@@ -410,16 +399,24 @@ static int rfcomm_sock_connect(struct socket *sock, struct sockaddr *addr, int a
}
sk->sk_state = BT_CONNECT;
- bacpy(&bt_sk(sk)->dst, &sa->rc_bdaddr);
+ bacpy(&rfcomm_pi(sk)->dst, &sa->rc_bdaddr);
rfcomm_pi(sk)->channel = sa->rc_channel;
- err = rfcomm_dlc_open(d, &bt_sk(sk)->src, &sa->rc_bdaddr, sa->rc_channel);
- if (!err)
+ d->sec_level = rfcomm_pi(sk)->sec_level;
+ d->role_switch = rfcomm_pi(sk)->role_switch;
+
+ /* Drop sock lock to avoid potential deadlock with the RFCOMM lock */
+ release_sock(sk);
+ err = rfcomm_dlc_open(d, &rfcomm_pi(sk)->src, &sa->rc_bdaddr,
+ sa->rc_channel);
+ lock_sock(sk);
+ if (!err && !sock_flag(sk, SOCK_ZAPPED))
err = bt_sock_wait_state(sk, BT_CONNECTED,
sock_sndtimeo(sk, flags & O_NONBLOCK));
done:
release_sock(sk);
+ sock_put(sk);
return err;
}
@@ -443,21 +440,21 @@ static int rfcomm_sock_listen(struct socket *sock, int backlog)
}
if (!rfcomm_pi(sk)->channel) {
- bdaddr_t *src = &bt_sk(sk)->src;
+ bdaddr_t *src = &rfcomm_pi(sk)->src;
u8 channel;
err = -EINVAL;
- write_lock_bh(&rfcomm_sk_list.lock);
+ write_lock(&rfcomm_sk_list.lock);
for (channel = 1; channel < 31; channel++)
- if (!__rfcomm_get_sock_by_addr(channel, src)) {
+ if (!__rfcomm_get_listen_sock_by_addr(channel, src)) {
rfcomm_pi(sk)->channel = channel;
err = 0;
break;
}
- write_unlock_bh(&rfcomm_sk_list.lock);
+ write_unlock(&rfcomm_sk_list.lock);
if (err < 0)
goto done;
@@ -472,44 +469,39 @@ done:
return err;
}
-static int rfcomm_sock_accept(struct socket *sock, struct socket *newsock, int flags)
+static int rfcomm_sock_accept(struct socket *sock, struct socket *newsock,
+ struct proto_accept_arg *arg)
{
- DECLARE_WAITQUEUE(wait, current);
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
struct sock *sk = sock->sk, *nsk;
long timeo;
int err = 0;
- lock_sock(sk);
-
- if (sk->sk_state != BT_LISTEN) {
- err = -EBADFD;
- goto done;
- }
+ lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
if (sk->sk_type != SOCK_STREAM) {
err = -EINVAL;
goto done;
}
- timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
+ timeo = sock_rcvtimeo(sk, arg->flags & O_NONBLOCK);
BT_DBG("sk %p timeo %ld", sk, timeo);
/* Wait for an incoming connection. (wake-one). */
- add_wait_queue_exclusive(sk->sk_sleep, &wait);
- while (!(nsk = bt_accept_dequeue(sk, newsock))) {
- set_current_state(TASK_INTERRUPTIBLE);
- if (!timeo) {
- err = -EAGAIN;
+ add_wait_queue_exclusive(sk_sleep(sk), &wait);
+ while (1) {
+ if (sk->sk_state != BT_LISTEN) {
+ err = -EBADFD;
break;
}
- release_sock(sk);
- timeo = schedule_timeout(timeo);
- lock_sock(sk);
+ nsk = bt_accept_dequeue(sk, newsock);
+ if (nsk)
+ break;
- if (sk->sk_state != BT_LISTEN) {
- err = -EBADFD;
+ if (!timeo) {
+ err = -EAGAIN;
break;
}
@@ -517,9 +509,14 @@ static int rfcomm_sock_accept(struct socket *sock, struct socket *newsock, int f
err = sock_intr_errno(timeo);
break;
}
+
+ release_sock(sk);
+
+ timeo = wait_woken(&wait, TASK_INTERRUPTIBLE, timeo);
+
+ lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
}
- set_current_state(TASK_RUNNING);
- remove_wait_queue(sk->sk_sleep, &wait);
+ remove_wait_queue(sk_sleep(sk), &wait);
if (err)
goto done;
@@ -533,32 +530,38 @@ done:
return err;
}
-static int rfcomm_sock_getname(struct socket *sock, struct sockaddr *addr, int *len, int peer)
+static int rfcomm_sock_getname(struct socket *sock, struct sockaddr *addr, int peer)
{
struct sockaddr_rc *sa = (struct sockaddr_rc *) addr;
struct sock *sk = sock->sk;
BT_DBG("sock %p, sk %p", sock, sk);
+ if (peer && sk->sk_state != BT_CONNECTED &&
+ sk->sk_state != BT_CONNECT && sk->sk_state != BT_CONNECT2)
+ return -ENOTCONN;
+
+ memset(sa, 0, sizeof(*sa));
sa->rc_family = AF_BLUETOOTH;
sa->rc_channel = rfcomm_pi(sk)->channel;
if (peer)
- bacpy(&sa->rc_bdaddr, &bt_sk(sk)->dst);
+ bacpy(&sa->rc_bdaddr, &rfcomm_pi(sk)->dst);
else
- bacpy(&sa->rc_bdaddr, &bt_sk(sk)->src);
+ bacpy(&sa->rc_bdaddr, &rfcomm_pi(sk)->src);
- *len = sizeof(struct sockaddr_rc);
- return 0;
+ return sizeof(struct sockaddr_rc);
}
-static int rfcomm_sock_sendmsg(struct kiocb *iocb, struct socket *sock,
- struct msghdr *msg, size_t len)
+static int rfcomm_sock_sendmsg(struct socket *sock, struct msghdr *msg,
+ size_t len)
{
struct sock *sk = sock->sk;
struct rfcomm_dlc *d = rfcomm_pi(sk)->dlc;
struct sk_buff *skb;
- int err;
- int sent = 0;
+ int sent;
+
+ if (test_bit(RFCOMM_DEFER_SETUP, &d->flags))
+ return -ENOTCONN;
if (msg->msg_flags & MSG_OOB)
return -EOPNOTSUPP;
@@ -570,164 +573,221 @@ static int rfcomm_sock_sendmsg(struct kiocb *iocb, struct socket *sock,
lock_sock(sk);
- while (len) {
- size_t size = min_t(size_t, len, d->mtu);
-
- skb = sock_alloc_send_skb(sk, size + RFCOMM_SKB_RESERVE,
- msg->msg_flags & MSG_DONTWAIT, &err);
- if (!skb)
- break;
- skb_reserve(skb, RFCOMM_SKB_HEAD_RESERVE);
+ sent = bt_sock_wait_ready(sk, msg->msg_flags);
- err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size);
- if (err) {
- kfree_skb(skb);
- sent = err;
- break;
- }
+ release_sock(sk);
- err = rfcomm_dlc_send(d, skb);
- if (err < 0) {
- kfree_skb(skb);
- break;
- }
+ if (sent)
+ return sent;
+
+ skb = bt_skb_sendmmsg(sk, msg, len, d->mtu, RFCOMM_SKB_HEAD_RESERVE,
+ RFCOMM_SKB_TAIL_RESERVE);
+ if (IS_ERR(skb))
+ return PTR_ERR(skb);
- sent += size;
- len -= size;
+ sent = rfcomm_dlc_send(d, skb);
+ if (sent < 0)
+ kfree_skb(skb);
+
+ return sent;
+}
+
+static int rfcomm_sock_recvmsg(struct socket *sock, struct msghdr *msg,
+ size_t size, int flags)
+{
+ struct sock *sk = sock->sk;
+ struct rfcomm_dlc *d = rfcomm_pi(sk)->dlc;
+ int len;
+
+ if (test_and_clear_bit(RFCOMM_DEFER_SETUP, &d->flags)) {
+ rfcomm_dlc_accept(d);
+ return 0;
}
+ len = bt_sock_stream_recvmsg(sock, msg, size, flags);
+
+ lock_sock(sk);
+ if (!(flags & MSG_PEEK) && len > 0)
+ atomic_sub(len, &sk->sk_rmem_alloc);
+
+ if (atomic_read(&sk->sk_rmem_alloc) <= (sk->sk_rcvbuf >> 2))
+ rfcomm_dlc_unthrottle(rfcomm_pi(sk)->dlc);
release_sock(sk);
- return sent ? sent : err;
+ return len;
}
-static long rfcomm_sock_data_wait(struct sock *sk, long timeo)
+static int rfcomm_sock_setsockopt_old(struct socket *sock, int optname,
+ sockptr_t optval, unsigned int optlen)
{
- DECLARE_WAITQUEUE(wait, current);
+ struct sock *sk = sock->sk;
+ int err = 0;
+ u32 opt;
- add_wait_queue(sk->sk_sleep, &wait);
- for (;;) {
- set_current_state(TASK_INTERRUPTIBLE);
+ BT_DBG("sk %p", sk);
- if (!skb_queue_empty(&sk->sk_receive_queue) ||
- sk->sk_err ||
- (sk->sk_shutdown & RCV_SHUTDOWN) ||
- signal_pending(current) ||
- !timeo)
+ lock_sock(sk);
+
+ switch (optname) {
+ case RFCOMM_LM:
+ err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen);
+ if (err)
break;
- set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
- release_sock(sk);
- timeo = schedule_timeout(timeo);
- lock_sock(sk);
- clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+ if (opt & RFCOMM_LM_FIPS) {
+ err = -EINVAL;
+ break;
+ }
+
+ if (opt & RFCOMM_LM_AUTH)
+ rfcomm_pi(sk)->sec_level = BT_SECURITY_LOW;
+ if (opt & RFCOMM_LM_ENCRYPT)
+ rfcomm_pi(sk)->sec_level = BT_SECURITY_MEDIUM;
+ if (opt & RFCOMM_LM_SECURE)
+ rfcomm_pi(sk)->sec_level = BT_SECURITY_HIGH;
+
+ rfcomm_pi(sk)->role_switch = (opt & RFCOMM_LM_MASTER);
+ break;
+
+ default:
+ err = -ENOPROTOOPT;
+ break;
}
- __set_current_state(TASK_RUNNING);
- remove_wait_queue(sk->sk_sleep, &wait);
- return timeo;
+ release_sock(sk);
+ return err;
}
-static int rfcomm_sock_recvmsg(struct kiocb *iocb, struct socket *sock,
- struct msghdr *msg, size_t size, int flags)
+static int rfcomm_sock_setsockopt(struct socket *sock, int level, int optname,
+ sockptr_t optval, unsigned int optlen)
{
struct sock *sk = sock->sk;
+ struct bt_security sec;
int err = 0;
- size_t target, copied = 0;
- long timeo;
+ u32 opt;
- if (flags & MSG_OOB)
- return -EOPNOTSUPP;
+ BT_DBG("sk %p", sk);
- msg->msg_namelen = 0;
+ if (level == SOL_RFCOMM)
+ return rfcomm_sock_setsockopt_old(sock, optname, optval, optlen);
- BT_DBG("sk %p size %d", sk, size);
+ if (level != SOL_BLUETOOTH)
+ return -ENOPROTOOPT;
lock_sock(sk);
- target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
- timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
-
- do {
- struct sk_buff *skb;
- int chunk;
-
- skb = skb_dequeue(&sk->sk_receive_queue);
- if (!skb) {
- if (copied >= target)
- break;
-
- if ((err = sock_error(sk)) != 0)
- break;
- if (sk->sk_shutdown & RCV_SHUTDOWN)
- break;
+ switch (optname) {
+ case BT_SECURITY:
+ if (sk->sk_type != SOCK_STREAM) {
+ err = -EINVAL;
+ break;
+ }
- err = -EAGAIN;
- if (!timeo)
- break;
+ sec.level = BT_SECURITY_LOW;
- timeo = rfcomm_sock_data_wait(sk, timeo);
+ err = copy_safe_from_sockptr(&sec, sizeof(sec), optval, optlen);
+ if (err)
+ break;
- if (signal_pending(current)) {
- err = sock_intr_errno(timeo);
- goto out;
- }
- continue;
+ if (sec.level > BT_SECURITY_HIGH) {
+ err = -EINVAL;
+ break;
}
- chunk = min_t(unsigned int, skb->len, size);
- if (memcpy_toiovec(msg->msg_iov, skb->data, chunk)) {
- skb_queue_head(&sk->sk_receive_queue, skb);
- if (!copied)
- copied = -EFAULT;
+ rfcomm_pi(sk)->sec_level = sec.level;
+ break;
+
+ case BT_DEFER_SETUP:
+ if (sk->sk_state != BT_BOUND && sk->sk_state != BT_LISTEN) {
+ err = -EINVAL;
break;
}
- copied += chunk;
- size -= chunk;
- if (!(flags & MSG_PEEK)) {
- atomic_sub(chunk, &sk->sk_rmem_alloc);
+ err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen);
+ if (err)
+ break;
- skb_pull(skb, chunk);
- if (skb->len) {
- skb_queue_head(&sk->sk_receive_queue, skb);
- break;
- }
- kfree_skb(skb);
+ if (opt)
+ set_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags);
+ else
+ clear_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags);
- } else {
- /* put message back and return */
- skb_queue_head(&sk->sk_receive_queue, skb);
- break;
- }
- } while (size);
+ break;
-out:
- if (atomic_read(&sk->sk_rmem_alloc) <= (sk->sk_rcvbuf >> 2))
- rfcomm_dlc_unthrottle(rfcomm_pi(sk)->dlc);
+ default:
+ err = -ENOPROTOOPT;
+ break;
+ }
release_sock(sk);
- return copied ? : err;
+ return err;
}
-static int rfcomm_sock_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen)
+static int rfcomm_sock_getsockopt_old(struct socket *sock, int optname, char __user *optval, int __user *optlen)
{
struct sock *sk = sock->sk;
+ struct sock *l2cap_sk;
+ struct l2cap_conn *conn;
+ struct rfcomm_conninfo cinfo;
int err = 0;
+ size_t len;
u32 opt;
BT_DBG("sk %p", sk);
+ if (get_user(len, optlen))
+ return -EFAULT;
+
lock_sock(sk);
switch (optname) {
case RFCOMM_LM:
- if (get_user(opt, (u32 __user *) optval)) {
+ switch (rfcomm_pi(sk)->sec_level) {
+ case BT_SECURITY_LOW:
+ opt = RFCOMM_LM_AUTH;
+ break;
+ case BT_SECURITY_MEDIUM:
+ opt = RFCOMM_LM_AUTH | RFCOMM_LM_ENCRYPT;
+ break;
+ case BT_SECURITY_HIGH:
+ opt = RFCOMM_LM_AUTH | RFCOMM_LM_ENCRYPT |
+ RFCOMM_LM_SECURE;
+ break;
+ case BT_SECURITY_FIPS:
+ opt = RFCOMM_LM_AUTH | RFCOMM_LM_ENCRYPT |
+ RFCOMM_LM_SECURE | RFCOMM_LM_FIPS;
+ break;
+ default:
+ opt = 0;
+ break;
+ }
+
+ if (rfcomm_pi(sk)->role_switch)
+ opt |= RFCOMM_LM_MASTER;
+
+ if (put_user(opt, (u32 __user *) optval))
err = -EFAULT;
+
+ break;
+
+ case RFCOMM_CONNINFO:
+ if (sk->sk_state != BT_CONNECTED &&
+ !rfcomm_pi(sk)->dlc->defer_setup) {
+ err = -ENOTCONN;
break;
}
- rfcomm_pi(sk)->link_mode = opt;
+ l2cap_sk = rfcomm_pi(sk)->dlc->session->sock->sk;
+ conn = l2cap_pi(l2cap_sk)->chan->conn;
+
+ memset(&cinfo, 0, sizeof(cinfo));
+ cinfo.hci_handle = conn->hcon->handle;
+ memcpy(cinfo.dev_class, conn->hcon->dev_class, 3);
+
+ len = min(len, sizeof(cinfo));
+ if (copy_to_user(optval, (char *) &cinfo, len))
+ err = -EFAULT;
+
break;
default:
@@ -742,36 +802,47 @@ static int rfcomm_sock_setsockopt(struct socket *sock, int level, int optname, c
static int rfcomm_sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen)
{
struct sock *sk = sock->sk;
- struct sock *l2cap_sk;
- struct rfcomm_conninfo cinfo;
- int len, err = 0;
+ struct bt_security sec;
+ int err = 0;
+ size_t len;
BT_DBG("sk %p", sk);
+ if (level == SOL_RFCOMM)
+ return rfcomm_sock_getsockopt_old(sock, optname, optval, optlen);
+
+ if (level != SOL_BLUETOOTH)
+ return -ENOPROTOOPT;
+
if (get_user(len, optlen))
return -EFAULT;
lock_sock(sk);
switch (optname) {
- case RFCOMM_LM:
- if (put_user(rfcomm_pi(sk)->link_mode, (u32 __user *) optval))
+ case BT_SECURITY:
+ if (sk->sk_type != SOCK_STREAM) {
+ err = -EINVAL;
+ break;
+ }
+
+ sec.level = rfcomm_pi(sk)->sec_level;
+ sec.key_size = 0;
+
+ len = min(len, sizeof(sec));
+ if (copy_to_user(optval, (char *) &sec, len))
err = -EFAULT;
+
break;
- case RFCOMM_CONNINFO:
- if (sk->sk_state != BT_CONNECTED) {
- err = -ENOTCONN;
+ case BT_DEFER_SETUP:
+ if (sk->sk_state != BT_BOUND && sk->sk_state != BT_LISTEN) {
+ err = -EINVAL;
break;
}
- l2cap_sk = rfcomm_pi(sk)->dlc->session->sock->sk;
-
- cinfo.hci_handle = l2cap_pi(l2cap_sk)->conn->hcon->handle;
- memcpy(cinfo.dev_class, l2cap_pi(l2cap_sk)->conn->hcon->dev_class, 3);
-
- len = min_t(unsigned int, len, sizeof(cinfo));
- if (copy_to_user(optval, (char *) &cinfo, len))
+ if (put_user(test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags),
+ (u32 __user *) optval))
err = -EFAULT;
break;
@@ -787,21 +858,31 @@ static int rfcomm_sock_getsockopt(struct socket *sock, int level, int optname, c
static int rfcomm_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
- struct sock *sk = sock->sk;
+ struct sock *sk __maybe_unused = sock->sk;
int err;
- lock_sock(sk);
+ BT_DBG("sk %p cmd %x arg %lx", sk, cmd, arg);
+
+ err = bt_sock_ioctl(sock, cmd, arg);
+ if (err == -ENOIOCTLCMD) {
#ifdef CONFIG_BT_RFCOMM_TTY
- err = rfcomm_dev_ioctl(sk, cmd, (void __user *)arg);
+ err = rfcomm_dev_ioctl(sk, cmd, (void __user *) arg);
#else
- err = -EOPNOTSUPP;
+ err = -EOPNOTSUPP;
#endif
+ }
- release_sock(sk);
return err;
}
+#ifdef CONFIG_COMPAT
+static int rfcomm_sock_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+ return rfcomm_sock_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
+}
+#endif
+
static int rfcomm_sock_shutdown(struct socket *sock, int how)
{
struct sock *sk = sock->sk;
@@ -809,14 +890,19 @@ static int rfcomm_sock_shutdown(struct socket *sock, int how)
BT_DBG("sock %p, sk %p", sock, sk);
- if (!sk) return 0;
+ if (!sk)
+ return 0;
lock_sock(sk);
if (!sk->sk_shutdown) {
sk->sk_shutdown = SHUTDOWN_MASK;
+
+ release_sock(sk);
__rfcomm_sock_close(sk);
+ lock_sock(sk);
- if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime)
+ if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime &&
+ !(current->flags & PF_EXITING))
err = bt_sock_wait_state(sk, BT_CLOSED, sk->sk_lingertime);
}
release_sock(sk);
@@ -840,7 +926,7 @@ static int rfcomm_sock_release(struct socket *sock)
return err;
}
-/* ---- RFCOMM core layer callbacks ----
+/* ---- RFCOMM core layer callbacks ----
*
* called under rfcomm_lock()
*/
@@ -859,53 +945,61 @@ int rfcomm_connect_ind(struct rfcomm_session *s, u8 channel, struct rfcomm_dlc *
if (!parent)
return 0;
+ lock_sock(parent);
+
/* Check for backlog size */
if (sk_acceptq_is_full(parent)) {
- BT_DBG("backlog full %d", parent->sk_ack_backlog);
+ BT_DBG("backlog full %d", parent->sk_ack_backlog);
goto done;
}
- sk = rfcomm_sock_alloc(NULL, BTPROTO_RFCOMM, GFP_ATOMIC);
+ sk = rfcomm_sock_alloc(sock_net(parent), NULL, BTPROTO_RFCOMM, GFP_ATOMIC, 0);
if (!sk)
goto done;
+ bt_sock_reclassify_lock(sk, BTPROTO_RFCOMM);
+
rfcomm_sock_init(sk, parent);
- bacpy(&bt_sk(sk)->src, &src);
- bacpy(&bt_sk(sk)->dst, &dst);
+ bacpy(&rfcomm_pi(sk)->src, &src);
+ bacpy(&rfcomm_pi(sk)->dst, &dst);
rfcomm_pi(sk)->channel = channel;
sk->sk_state = BT_CONFIG;
- bt_accept_enqueue(parent, sk);
+ bt_accept_enqueue(parent, sk, true);
/* Accept connection and return socket DLC */
*d = rfcomm_pi(sk)->dlc;
result = 1;
done:
- bh_unlock_sock(parent);
+ release_sock(parent);
+
+ if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(parent)->flags))
+ parent->sk_state_change(parent);
+
return result;
}
-static ssize_t rfcomm_sock_sysfs_show(struct class *dev, char *buf)
+static int rfcomm_sock_debugfs_show(struct seq_file *f, void *p)
{
struct sock *sk;
- struct hlist_node *node;
- char *str = buf;
- read_lock_bh(&rfcomm_sk_list.lock);
+ read_lock(&rfcomm_sk_list.lock);
- sk_for_each(sk, node, &rfcomm_sk_list.head) {
- str += sprintf(str, "%s %s %d %d\n",
- batostr(&bt_sk(sk)->src), batostr(&bt_sk(sk)->dst),
- sk->sk_state, rfcomm_pi(sk)->channel);
+ sk_for_each(sk, &rfcomm_sk_list.head) {
+ seq_printf(f, "%pMR %pMR %d %d\n",
+ &rfcomm_pi(sk)->src, &rfcomm_pi(sk)->dst,
+ sk->sk_state, rfcomm_pi(sk)->channel);
}
- read_unlock_bh(&rfcomm_sk_list.lock);
+ read_unlock(&rfcomm_sk_list.lock);
- return (str - buf);
+ return 0;
}
-static CLASS_ATTR(rfcomm, S_IRUGO, rfcomm_sock_sysfs_show, NULL);
+DEFINE_SHOW_ATTRIBUTE(rfcomm_sock_debugfs);
+
+static struct dentry *rfcomm_sock_debugfs;
static const struct proto_ops rfcomm_sock_ops = {
.family = PF_BLUETOOTH,
@@ -922,12 +1016,16 @@ static const struct proto_ops rfcomm_sock_ops = {
.setsockopt = rfcomm_sock_setsockopt,
.getsockopt = rfcomm_sock_getsockopt,
.ioctl = rfcomm_sock_ioctl,
+ .gettstamp = sock_gettstamp,
.poll = bt_sock_poll,
.socketpair = sock_no_socketpair,
- .mmap = sock_no_mmap
+ .mmap = sock_no_mmap,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = rfcomm_sock_compat_ioctl,
+#endif
};
-static struct net_proto_family rfcomm_sock_family_ops = {
+static const struct net_proto_family rfcomm_sock_family_ops = {
.family = PF_BLUETOOTH,
.owner = THIS_MODULE,
.create = rfcomm_sock_create
@@ -937,33 +1035,48 @@ int __init rfcomm_init_sockets(void)
{
int err;
+ BUILD_BUG_ON(sizeof(struct sockaddr_rc) > sizeof(struct sockaddr));
+
err = proto_register(&rfcomm_proto, 0);
if (err < 0)
return err;
err = bt_sock_register(BTPROTO_RFCOMM, &rfcomm_sock_family_ops);
- if (err < 0)
+ if (err < 0) {
+ BT_ERR("RFCOMM socket layer registration failed");
goto error;
+ }
- if (class_create_file(bt_class, &class_attr_rfcomm) < 0)
- BT_ERR("Failed to create RFCOMM info file");
+ err = bt_procfs_init(&init_net, "rfcomm", &rfcomm_sk_list, NULL);
+ if (err < 0) {
+ BT_ERR("Failed to create RFCOMM proc file");
+ bt_sock_unregister(BTPROTO_RFCOMM);
+ goto error;
+ }
BT_INFO("RFCOMM socket layer initialized");
+ if (IS_ERR_OR_NULL(bt_debugfs))
+ return 0;
+
+ rfcomm_sock_debugfs = debugfs_create_file("rfcomm", 0444,
+ bt_debugfs, NULL,
+ &rfcomm_sock_debugfs_fops);
+
return 0;
error:
- BT_ERR("RFCOMM socket layer registration failed");
proto_unregister(&rfcomm_proto);
return err;
}
void __exit rfcomm_cleanup_sockets(void)
{
- class_remove_file(bt_class, &class_attr_rfcomm);
+ bt_procfs_cleanup(&init_net, "rfcomm");
+
+ debugfs_remove(rfcomm_sock_debugfs);
- if (bt_sock_unregister(BTPROTO_RFCOMM) < 0)
- BT_ERR("RFCOMM socket layer unregistration failed");
+ bt_sock_unregister(BTPROTO_RFCOMM);
proto_unregister(&rfcomm_proto);
}
diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c
index 1fb5d42f37ae..b783526ab588 100644
--- a/net/bluetooth/rfcomm/tty.c
+++ b/net/bluetooth/rfcomm/tty.c
@@ -1,4 +1,4 @@
-/*
+/*
RFCOMM implementation for Linux Bluetooth stack (BlueZ).
Copyright (C) 2002 Maxim Krasnyansky <maxk@qualcomm.com>
Copyright (C) 2002 Marcel Holtmann <marcel@holtmann.org>
@@ -11,20 +11,18 @@
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
- CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
- WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
- COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+ ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+ COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
SOFTWARE IS DISCLAIMED.
*/
/*
* RFCOMM TTY.
- *
- * $Id: tty.c,v 1.24 2002/10/03 01:54:38 holtmann Exp $
*/
#include <linux/module.h>
@@ -33,62 +31,55 @@
#include <linux/tty_driver.h>
#include <linux/tty_flip.h>
-#include <linux/capability.h>
-#include <linux/slab.h>
-#include <linux/skbuff.h>
-
#include <net/bluetooth/bluetooth.h>
#include <net/bluetooth/hci_core.h>
#include <net/bluetooth/rfcomm.h>
-#ifndef CONFIG_BT_RFCOMM_DEBUG
-#undef BT_DBG
-#define BT_DBG(D...)
-#endif
-
-#define RFCOMM_TTY_MAGIC 0x6d02 /* magic number for rfcomm struct */
#define RFCOMM_TTY_PORTS RFCOMM_MAX_DEV /* whole lotta rfcomm devices */
#define RFCOMM_TTY_MAJOR 216 /* device node major id of the usb/bluetooth.c driver */
#define RFCOMM_TTY_MINOR 0
+static DEFINE_MUTEX(rfcomm_ioctl_mutex);
static struct tty_driver *rfcomm_tty_driver;
struct rfcomm_dev {
+ struct tty_port port;
struct list_head list;
- atomic_t refcnt;
char name[12];
int id;
unsigned long flags;
- int opened;
int err;
+ unsigned long status; /* don't export to userspace */
+
bdaddr_t src;
bdaddr_t dst;
- u8 channel;
+ u8 channel;
- uint modem_status;
+ uint modem_status;
struct rfcomm_dlc *dlc;
- struct tty_struct *tty;
- wait_queue_head_t wait;
- struct tasklet_struct wakeup_task;
- atomic_t wmem_alloc;
+ struct device *tty_dev;
+
+ atomic_t wmem_alloc;
+
+ struct sk_buff_head pending;
};
static LIST_HEAD(rfcomm_dev_list);
-static DEFINE_RWLOCK(rfcomm_dev_lock);
+static DEFINE_MUTEX(rfcomm_dev_lock);
static void rfcomm_dev_data_ready(struct rfcomm_dlc *dlc, struct sk_buff *skb);
static void rfcomm_dev_state_change(struct rfcomm_dlc *dlc, int err);
static void rfcomm_dev_modem_status(struct rfcomm_dlc *dlc, u8 v24_sig);
-static void rfcomm_tty_wakeup(unsigned long arg);
-
/* ---- Device functions ---- */
-static void rfcomm_dev_destruct(struct rfcomm_dev *dev)
+
+static void rfcomm_dev_destruct(struct tty_port *port)
{
+ struct rfcomm_dev *dev = container_of(port, struct rfcomm_dev, port);
struct rfcomm_dlc *dlc = dev->dlc;
BT_DBG("dev %p dlc %p", dev, dlc);
@@ -101,113 +92,155 @@ static void rfcomm_dev_destruct(struct rfcomm_dev *dev)
rfcomm_dlc_put(dlc);
- tty_unregister_device(rfcomm_tty_driver, dev->id);
+ if (dev->tty_dev)
+ tty_unregister_device(rfcomm_tty_driver, dev->id);
- /* Refcount should only hit zero when called from rfcomm_dev_del()
- which will have taken us off the list. Everything else are
- refcounting bugs. */
- BUG_ON(!list_empty(&dev->list));
+ mutex_lock(&rfcomm_dev_lock);
+ list_del(&dev->list);
+ mutex_unlock(&rfcomm_dev_lock);
kfree(dev);
- /* It's safe to call module_put() here because socket still
+ /* It's safe to call module_put() here because socket still
holds reference to this module. */
module_put(THIS_MODULE);
}
-static inline void rfcomm_dev_hold(struct rfcomm_dev *dev)
+/* device-specific initialization: open the dlc */
+static int rfcomm_dev_activate(struct tty_port *port, struct tty_struct *tty)
+{
+ struct rfcomm_dev *dev = container_of(port, struct rfcomm_dev, port);
+ int err;
+
+ err = rfcomm_dlc_open(dev->dlc, &dev->src, &dev->dst, dev->channel);
+ if (err)
+ set_bit(TTY_IO_ERROR, &tty->flags);
+ return err;
+}
+
+/* we block the open until the dlc->state becomes BT_CONNECTED */
+static bool rfcomm_dev_carrier_raised(struct tty_port *port)
{
- atomic_inc(&dev->refcnt);
+ struct rfcomm_dev *dev = container_of(port, struct rfcomm_dev, port);
+
+ return (dev->dlc->state == BT_CONNECTED);
}
-static inline void rfcomm_dev_put(struct rfcomm_dev *dev)
+/* device-specific cleanup: close the dlc */
+static void rfcomm_dev_shutdown(struct tty_port *port)
{
- /* The reason this isn't actually a race, as you no
- doubt have a little voice screaming at you in your
- head, is that the refcount should never actually
- reach zero unless the device has already been taken
- off the list, in rfcomm_dev_del(). And if that's not
- true, we'll hit the BUG() in rfcomm_dev_destruct()
- anyway. */
- if (atomic_dec_and_test(&dev->refcnt))
- rfcomm_dev_destruct(dev);
+ struct rfcomm_dev *dev = container_of(port, struct rfcomm_dev, port);
+
+ if (dev->tty_dev->parent)
+ device_move(dev->tty_dev, NULL, DPM_ORDER_DEV_LAST);
+
+ /* close the dlc */
+ rfcomm_dlc_close(dev->dlc, 0);
}
-static struct rfcomm_dev *__rfcomm_dev_get(int id)
+static const struct tty_port_operations rfcomm_port_ops = {
+ .destruct = rfcomm_dev_destruct,
+ .activate = rfcomm_dev_activate,
+ .shutdown = rfcomm_dev_shutdown,
+ .carrier_raised = rfcomm_dev_carrier_raised,
+};
+
+static struct rfcomm_dev *__rfcomm_dev_lookup(int id)
{
struct rfcomm_dev *dev;
- struct list_head *p;
- list_for_each(p, &rfcomm_dev_list) {
- dev = list_entry(p, struct rfcomm_dev, list);
+ list_for_each_entry(dev, &rfcomm_dev_list, list)
if (dev->id == id)
return dev;
- }
return NULL;
}
-static inline struct rfcomm_dev *rfcomm_dev_get(int id)
+static struct rfcomm_dev *rfcomm_dev_get(int id)
{
struct rfcomm_dev *dev;
- read_lock(&rfcomm_dev_lock);
+ mutex_lock(&rfcomm_dev_lock);
+
+ dev = __rfcomm_dev_lookup(id);
- dev = __rfcomm_dev_get(id);
- if (dev)
- rfcomm_dev_hold(dev);
+ if (dev && !tty_port_get(&dev->port))
+ dev = NULL;
- read_unlock(&rfcomm_dev_lock);
+ mutex_unlock(&rfcomm_dev_lock);
return dev;
}
-static struct device *rfcomm_get_device(struct rfcomm_dev *dev)
+static void rfcomm_reparent_device(struct rfcomm_dev *dev)
{
struct hci_dev *hdev;
struct hci_conn *conn;
- hdev = hci_get_route(&dev->dst, &dev->src);
+ hdev = hci_get_route(&dev->dst, &dev->src, BDADDR_BREDR);
if (!hdev)
- return NULL;
+ return;
+ /* The lookup results are unsafe to access without the
+ * hci device lock (FIXME: why is this not documented?)
+ */
+ hci_dev_lock(hdev);
conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &dev->dst);
+ /* Just because the acl link is in the hash table is no
+ * guarantee the sysfs device has been added ...
+ */
+ if (conn && device_is_registered(&conn->dev))
+ device_move(dev->tty_dev, &conn->dev, DPM_ORDER_DEV_AFTER_PARENT);
+
+ hci_dev_unlock(hdev);
hci_dev_put(hdev);
+}
+
+static ssize_t address_show(struct device *tty_dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct rfcomm_dev *dev = dev_get_drvdata(tty_dev);
+ return sysfs_emit(buf, "%pMR\n", &dev->dst);
+}
- return conn ? &conn->dev : NULL;
+static ssize_t channel_show(struct device *tty_dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct rfcomm_dev *dev = dev_get_drvdata(tty_dev);
+ return sysfs_emit(buf, "%d\n", dev->channel);
}
-static int rfcomm_dev_add(struct rfcomm_dev_req *req, struct rfcomm_dlc *dlc)
+static DEVICE_ATTR_RO(address);
+static DEVICE_ATTR_RO(channel);
+
+static struct rfcomm_dev *__rfcomm_dev_add(struct rfcomm_dev_req *req,
+ struct rfcomm_dlc *dlc)
{
- struct rfcomm_dev *dev;
- struct list_head *head = &rfcomm_dev_list, *p;
+ struct rfcomm_dev *dev, *entry;
+ struct list_head *head = &rfcomm_dev_list;
int err = 0;
- BT_DBG("id %d channel %d", req->dev_id, req->channel);
-
dev = kzalloc(sizeof(struct rfcomm_dev), GFP_KERNEL);
if (!dev)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
- write_lock_bh(&rfcomm_dev_lock);
+ mutex_lock(&rfcomm_dev_lock);
if (req->dev_id < 0) {
dev->id = 0;
- list_for_each(p, &rfcomm_dev_list) {
- if (list_entry(p, struct rfcomm_dev, list)->id != dev->id)
+ list_for_each_entry(entry, &rfcomm_dev_list, list) {
+ if (entry->id != dev->id)
break;
dev->id++;
- head = p;
+ head = &entry->list;
}
} else {
dev->id = req->dev_id;
- list_for_each(p, &rfcomm_dev_list) {
- struct rfcomm_dev *entry = list_entry(p, struct rfcomm_dev, list);
-
+ list_for_each_entry(entry, &rfcomm_dev_list, list) {
if (entry->id == dev->id) {
err = -EADDRINUSE;
goto out;
@@ -216,7 +249,7 @@ static int rfcomm_dev_add(struct rfcomm_dev_req *req, struct rfcomm_dlc *dlc)
if (entry->id > dev->id - 1)
break;
- head = p;
+ head = &entry->list;
}
}
@@ -228,97 +261,134 @@ static int rfcomm_dev_add(struct rfcomm_dev_req *req, struct rfcomm_dlc *dlc)
sprintf(dev->name, "rfcomm%d", dev->id);
list_add(&dev->list, head);
- atomic_set(&dev->refcnt, 1);
bacpy(&dev->src, &req->src);
bacpy(&dev->dst, &req->dst);
dev->channel = req->channel;
- dev->flags = req->flags &
+ dev->flags = req->flags &
((1 << RFCOMM_RELEASE_ONHUP) | (1 << RFCOMM_REUSE_DLC));
- init_waitqueue_head(&dev->wait);
- tasklet_init(&dev->wakeup_task, rfcomm_tty_wakeup, (unsigned long) dev);
+ tty_port_init(&dev->port);
+ dev->port.ops = &rfcomm_port_ops;
+
+ skb_queue_head_init(&dev->pending);
rfcomm_dlc_lock(dlc);
+
+ if (req->flags & (1 << RFCOMM_REUSE_DLC)) {
+ struct sock *sk = dlc->owner;
+ struct sk_buff *skb;
+
+ BUG_ON(!sk);
+
+ rfcomm_dlc_throttle(dlc);
+
+ while ((skb = skb_dequeue(&sk->sk_receive_queue))) {
+ skb_orphan(skb);
+ skb_queue_tail(&dev->pending, skb);
+ atomic_sub(skb->len, &sk->sk_rmem_alloc);
+ }
+ }
+
dlc->data_ready = rfcomm_dev_data_ready;
dlc->state_change = rfcomm_dev_state_change;
dlc->modem_status = rfcomm_dev_modem_status;
dlc->owner = dev;
dev->dlc = dlc;
+
+ rfcomm_dev_modem_status(dlc, dlc->remote_v24_sig);
+
rfcomm_dlc_unlock(dlc);
- /* It's safe to call __module_get() here because socket already
+ /* It's safe to call __module_get() here because socket already
holds reference to this module. */
__module_get(THIS_MODULE);
+ mutex_unlock(&rfcomm_dev_lock);
+ return dev;
+
out:
- write_unlock_bh(&rfcomm_dev_lock);
+ mutex_unlock(&rfcomm_dev_lock);
+ kfree(dev);
+ return ERR_PTR(err);
+}
- if (err) {
- kfree(dev);
- return err;
+static int rfcomm_dev_add(struct rfcomm_dev_req *req, struct rfcomm_dlc *dlc)
+{
+ struct rfcomm_dev *dev;
+ struct device *tty;
+
+ BT_DBG("id %d channel %d", req->dev_id, req->channel);
+
+ dev = __rfcomm_dev_add(req, dlc);
+ if (IS_ERR(dev)) {
+ rfcomm_dlc_put(dlc);
+ return PTR_ERR(dev);
}
- tty_register_device(rfcomm_tty_driver, dev->id, rfcomm_get_device(dev));
+ tty = tty_port_register_device(&dev->port, rfcomm_tty_driver,
+ dev->id, NULL);
+ if (IS_ERR(tty)) {
+ tty_port_put(&dev->port);
+ return PTR_ERR(tty);
+ }
- return dev->id;
-}
+ dev->tty_dev = tty;
+ rfcomm_reparent_device(dev);
+ dev_set_drvdata(dev->tty_dev, dev);
-static void rfcomm_dev_del(struct rfcomm_dev *dev)
-{
- BT_DBG("dev %p", dev);
+ if (device_create_file(dev->tty_dev, &dev_attr_address) < 0)
+ BT_ERR("Failed to create address attribute");
- write_lock_bh(&rfcomm_dev_lock);
- list_del_init(&dev->list);
- write_unlock_bh(&rfcomm_dev_lock);
+ if (device_create_file(dev->tty_dev, &dev_attr_channel) < 0)
+ BT_ERR("Failed to create channel attribute");
- rfcomm_dev_put(dev);
+ return dev->id;
}
/* ---- Send buffer ---- */
-static inline unsigned int rfcomm_room(struct rfcomm_dlc *dlc)
+static inline unsigned int rfcomm_room(struct rfcomm_dev *dev)
{
- /* We can't let it be zero, because we don't get a callback
- when tx_credits becomes nonzero, hence we'd never wake up */
- return dlc->mtu * (dlc->tx_credits?:1);
+ struct rfcomm_dlc *dlc = dev->dlc;
+
+ /* Limit the outstanding number of packets not yet sent to 40 */
+ int pending = 40 - atomic_read(&dev->wmem_alloc);
+
+ return max(0, pending) * dlc->mtu;
}
static void rfcomm_wfree(struct sk_buff *skb)
{
struct rfcomm_dev *dev = (void *) skb->sk;
- atomic_sub(skb->truesize, &dev->wmem_alloc);
+ atomic_dec(&dev->wmem_alloc);
if (test_bit(RFCOMM_TTY_ATTACHED, &dev->flags))
- tasklet_schedule(&dev->wakeup_task);
- rfcomm_dev_put(dev);
+ tty_port_tty_wakeup(&dev->port);
+ tty_port_put(&dev->port);
}
-static inline void rfcomm_set_owner_w(struct sk_buff *skb, struct rfcomm_dev *dev)
+static void rfcomm_set_owner_w(struct sk_buff *skb, struct rfcomm_dev *dev)
{
- rfcomm_dev_hold(dev);
- atomic_add(skb->truesize, &dev->wmem_alloc);
+ tty_port_get(&dev->port);
+ atomic_inc(&dev->wmem_alloc);
skb->sk = (void *) dev;
skb->destructor = rfcomm_wfree;
}
static struct sk_buff *rfcomm_wmalloc(struct rfcomm_dev *dev, unsigned long size, gfp_t priority)
{
- if (atomic_read(&dev->wmem_alloc) < rfcomm_room(dev->dlc)) {
- struct sk_buff *skb = alloc_skb(size, priority);
- if (skb) {
- rfcomm_set_owner_w(skb, dev);
- return skb;
- }
- }
- return NULL;
+ struct sk_buff *skb = alloc_skb(size, priority);
+ if (skb)
+ rfcomm_set_owner_w(skb, dev);
+ return skb;
}
/* ---- Device IOCTLs ---- */
#define NOCAP_FLAGS ((1 << RFCOMM_REUSE_DLC) | (1 << RFCOMM_RELEASE_ONHUP))
-static int rfcomm_create_dev(struct sock *sk, void __user *arg)
+static int __rfcomm_create_dev(struct sock *sk, void __user *arg)
{
struct rfcomm_dev_req req;
struct rfcomm_dlc *dlc;
@@ -327,7 +397,7 @@ static int rfcomm_create_dev(struct sock *sk, void __user *arg)
if (copy_from_user(&req, arg, sizeof(req)))
return -EFAULT;
- BT_DBG("sk %p dev_id %id flags 0x%x", sk, req.dev_id, req.flags);
+ BT_DBG("sk %p dev_id %d flags 0x%x", sk, req.dev_id, req.flags);
if (req.flags != NOCAP_FLAGS && !capable(CAP_NET_ADMIN))
return -EPERM;
@@ -340,16 +410,20 @@ static int rfcomm_create_dev(struct sock *sk, void __user *arg)
dlc = rfcomm_pi(sk)->dlc;
rfcomm_dlc_hold(dlc);
} else {
+ /* Validate the channel is unused */
+ dlc = rfcomm_dlc_exists(&req.src, &req.dst, req.channel);
+ if (IS_ERR(dlc))
+ return PTR_ERR(dlc);
+ if (dlc)
+ return -EBUSY;
dlc = rfcomm_dlc_alloc(GFP_KERNEL);
if (!dlc)
return -ENOMEM;
}
id = rfcomm_dev_add(&req, dlc);
- if (id < 0) {
- rfcomm_dlc_put(dlc);
+ if (id < 0)
return id;
- }
if (req.flags & (1 << RFCOMM_REUSE_DLC)) {
/* DLC is now used by device.
@@ -360,7 +434,7 @@ static int rfcomm_create_dev(struct sock *sk, void __user *arg)
return id;
}
-static int rfcomm_release_dev(void __user *arg)
+static int __rfcomm_release_dev(void __user *arg)
{
struct rfcomm_dev_req req;
struct rfcomm_dev *dev;
@@ -368,30 +442,64 @@ static int rfcomm_release_dev(void __user *arg)
if (copy_from_user(&req, arg, sizeof(req)))
return -EFAULT;
- BT_DBG("dev_id %id flags 0x%x", req.dev_id, req.flags);
+ BT_DBG("dev_id %d flags 0x%x", req.dev_id, req.flags);
- if (!(dev = rfcomm_dev_get(req.dev_id)))
+ dev = rfcomm_dev_get(req.dev_id);
+ if (!dev)
return -ENODEV;
if (dev->flags != NOCAP_FLAGS && !capable(CAP_NET_ADMIN)) {
- rfcomm_dev_put(dev);
+ tty_port_put(&dev->port);
return -EPERM;
}
+ /* only release once */
+ if (test_and_set_bit(RFCOMM_DEV_RELEASED, &dev->status)) {
+ tty_port_put(&dev->port);
+ return -EALREADY;
+ }
+
if (req.flags & (1 << RFCOMM_HANGUP_NOW))
rfcomm_dlc_close(dev->dlc, 0);
- rfcomm_dev_del(dev);
- rfcomm_dev_put(dev);
+ /* Shut down TTY synchronously before freeing rfcomm_dev */
+ tty_port_tty_vhangup(&dev->port);
+
+ if (!test_bit(RFCOMM_TTY_OWNED, &dev->status))
+ tty_port_put(&dev->port);
+
+ tty_port_put(&dev->port);
return 0;
}
+static int rfcomm_create_dev(struct sock *sk, void __user *arg)
+{
+ int ret;
+
+ mutex_lock(&rfcomm_ioctl_mutex);
+ ret = __rfcomm_create_dev(sk, arg);
+ mutex_unlock(&rfcomm_ioctl_mutex);
+
+ return ret;
+}
+
+static int rfcomm_release_dev(void __user *arg)
+{
+ int ret;
+
+ mutex_lock(&rfcomm_ioctl_mutex);
+ ret = __rfcomm_release_dev(arg);
+ mutex_unlock(&rfcomm_ioctl_mutex);
+
+ return ret;
+}
+
static int rfcomm_get_dev_list(void __user *arg)
{
+ struct rfcomm_dev *dev;
struct rfcomm_dev_list_req *dl;
struct rfcomm_dev_info *di;
- struct list_head *p;
- int n = 0, size, err;
+ int n = 0, err;
u16 dev_num;
BT_DBG("");
@@ -402,33 +510,33 @@ static int rfcomm_get_dev_list(void __user *arg)
if (!dev_num || dev_num > (PAGE_SIZE * 4) / sizeof(*di))
return -EINVAL;
- size = sizeof(*dl) + dev_num * sizeof(*di);
-
- if (!(dl = kmalloc(size, GFP_KERNEL)))
+ dl = kzalloc(struct_size(dl, dev_info, dev_num), GFP_KERNEL);
+ if (!dl)
return -ENOMEM;
+ dl->dev_num = dev_num;
di = dl->dev_info;
- read_lock_bh(&rfcomm_dev_lock);
-
- list_for_each(p, &rfcomm_dev_list) {
- struct rfcomm_dev *dev = list_entry(p, struct rfcomm_dev, list);
- (di + n)->id = dev->id;
- (di + n)->flags = dev->flags;
- (di + n)->state = dev->dlc->state;
- (di + n)->channel = dev->channel;
- bacpy(&(di + n)->src, &dev->src);
- bacpy(&(di + n)->dst, &dev->dst);
+ mutex_lock(&rfcomm_dev_lock);
+
+ list_for_each_entry(dev, &rfcomm_dev_list, list) {
+ if (!tty_port_get(&dev->port))
+ continue;
+ di[n].id = dev->id;
+ di[n].flags = dev->flags;
+ di[n].state = dev->dlc->state;
+ di[n].channel = dev->channel;
+ bacpy(&di[n].src, &dev->src);
+ bacpy(&di[n].dst, &dev->dst);
+ tty_port_put(&dev->port);
if (++n >= dev_num)
break;
}
- read_unlock_bh(&rfcomm_dev_lock);
+ mutex_unlock(&rfcomm_dev_lock);
dl->dev_num = n;
- size = sizeof(*dl) + n * sizeof(*di);
-
- err = copy_to_user(arg, dl, size);
+ err = copy_to_user(arg, dl, struct_size(dl, dev_info, n));
kfree(dl);
return err ? -EFAULT : 0;
@@ -445,7 +553,8 @@ static int rfcomm_get_dev_info(void __user *arg)
if (copy_from_user(&di, arg, sizeof(di)))
return -EFAULT;
- if (!(dev = rfcomm_dev_get(di.id)))
+ dev = rfcomm_dev_get(di.id);
+ if (!dev)
return -ENODEV;
di.flags = dev->flags;
@@ -457,7 +566,7 @@ static int rfcomm_get_dev_info(void __user *arg)
if (copy_to_user(arg, &di, sizeof(di)))
err = -EFAULT;
- rfcomm_dev_put(dev);
+ tty_port_put(&dev->port);
return err;
}
@@ -486,17 +595,21 @@ int rfcomm_dev_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
static void rfcomm_dev_data_ready(struct rfcomm_dlc *dlc, struct sk_buff *skb)
{
struct rfcomm_dev *dev = dlc->owner;
- struct tty_struct *tty;
-
- if (!dev || !(tty = dev->tty)) {
+
+ if (!dev) {
kfree_skb(skb);
return;
}
- BT_DBG("dlc %p tty %p len %d", dlc, tty, skb->len);
+ if (!skb_queue_empty(&dev->pending)) {
+ skb_queue_tail(&dev->pending, skb);
+ return;
+ }
+
+ BT_DBG("dlc %p len %d", dlc, skb->len);
- tty_insert_flip_string(tty, skb->data, skb->len);
- tty_flip_buffer_push(tty);
+ tty_insert_flip_string(&dev->port, skb->data, skb->len);
+ tty_flip_buffer_push(&dev->port);
kfree_skb(skb);
}
@@ -506,28 +619,16 @@ static void rfcomm_dev_state_change(struct rfcomm_dlc *dlc, int err)
struct rfcomm_dev *dev = dlc->owner;
if (!dev)
return;
-
+
BT_DBG("dlc %p dev %p err %d", dlc, dev, err);
dev->err = err;
- wake_up_interruptible(&dev->wait);
-
- if (dlc->state == BT_CLOSED) {
- if (!dev->tty) {
- if (test_bit(RFCOMM_RELEASE_ONHUP, &dev->flags)) {
- rfcomm_dev_hold(dev);
- rfcomm_dev_del(dev);
-
- /* We have to drop DLC lock here, otherwise
- rfcomm_dev_put() will dead lock if it's
- the last reference. */
- rfcomm_dlc_unlock(dlc);
- rfcomm_dev_put(dev);
- rfcomm_dlc_lock(dlc);
- }
- } else
- tty_hangup(dev->tty);
- }
+ if (dlc->state == BT_CONNECTED) {
+ rfcomm_reparent_device(dev);
+
+ wake_up_interruptible(&dev->port.open_wait);
+ } else if (dlc->state == BT_CLOSED)
+ tty_port_tty_hangup(&dev->port, false);
}
static void rfcomm_dev_modem_status(struct rfcomm_dlc *dlc, u8 v24_sig)
@@ -538,172 +639,182 @@ static void rfcomm_dev_modem_status(struct rfcomm_dlc *dlc, u8 v24_sig)
BT_DBG("dlc %p dev %p v24_sig 0x%02x", dlc, dev, v24_sig);
- if ((dev->modem_status & TIOCM_CD) && !(v24_sig & RFCOMM_V24_DV)) {
- if (dev->tty && !C_CLOCAL(dev->tty))
- tty_hangup(dev->tty);
- }
+ if ((dev->modem_status & TIOCM_CD) && !(v24_sig & RFCOMM_V24_DV))
+ tty_port_tty_hangup(&dev->port, true);
- dev->modem_status =
- ((v24_sig & RFCOMM_V24_RTC) ? (TIOCM_DSR | TIOCM_DTR) : 0) |
- ((v24_sig & RFCOMM_V24_RTR) ? (TIOCM_RTS | TIOCM_CTS) : 0) |
+ dev->modem_status =
+ ((v24_sig & RFCOMM_V24_RTC) ? TIOCM_DSR : 0) |
+ ((v24_sig & RFCOMM_V24_RTR) ? TIOCM_CTS : 0) |
((v24_sig & RFCOMM_V24_IC) ? TIOCM_RI : 0) |
((v24_sig & RFCOMM_V24_DV) ? TIOCM_CD : 0);
}
/* ---- TTY functions ---- */
-static void rfcomm_tty_wakeup(unsigned long arg)
+static void rfcomm_tty_copy_pending(struct rfcomm_dev *dev)
{
- struct rfcomm_dev *dev = (void *) arg;
- struct tty_struct *tty = dev->tty;
- if (!tty)
- return;
+ struct sk_buff *skb;
+ int inserted = 0;
+
+ BT_DBG("dev %p", dev);
- BT_DBG("dev %p tty %p", dev, tty);
+ rfcomm_dlc_lock(dev->dlc);
- if (test_bit(TTY_DO_WRITE_WAKEUP, &tty->flags) && tty->ldisc.write_wakeup)
- (tty->ldisc.write_wakeup)(tty);
+ while ((skb = skb_dequeue(&dev->pending))) {
+ inserted += tty_insert_flip_string(&dev->port, skb->data,
+ skb->len);
+ kfree_skb(skb);
+ }
+
+ rfcomm_dlc_unlock(dev->dlc);
- wake_up_interruptible(&tty->write_wait);
-#ifdef SERIAL_HAVE_POLL_WAIT
- wake_up_interruptible(&tty->poll_wait);
-#endif
+ if (inserted > 0)
+ tty_flip_buffer_push(&dev->port);
}
-static int rfcomm_tty_open(struct tty_struct *tty, struct file *filp)
+/* do the reverse of install, clearing the tty fields and releasing the
+ * reference to tty_port
+ */
+static void rfcomm_tty_cleanup(struct tty_struct *tty)
{
- DECLARE_WAITQUEUE(wait, current);
- struct rfcomm_dev *dev;
- struct rfcomm_dlc *dlc;
- int err, id;
+ struct rfcomm_dev *dev = tty->driver_data;
- id = tty->index;
+ clear_bit(RFCOMM_TTY_ATTACHED, &dev->flags);
- BT_DBG("tty %p id %d", tty, id);
+ rfcomm_dlc_lock(dev->dlc);
+ tty->driver_data = NULL;
+ rfcomm_dlc_unlock(dev->dlc);
- /* We don't leak this refcount. For reasons which are not entirely
- clear, the TTY layer will call our ->close() method even if the
- open fails. We decrease the refcount there, and decreasing it
- here too would cause breakage. */
- dev = rfcomm_dev_get(id);
- if (!dev)
- return -ENODEV;
+ /*
+ * purge the dlc->tx_queue to avoid circular dependencies
+ * between dev and dlc
+ */
+ skb_queue_purge(&dev->dlc->tx_queue);
- BT_DBG("dev %p dst %s channel %d opened %d", dev, batostr(&dev->dst), dev->channel, dev->opened);
+ tty_port_put(&dev->port);
+}
- if (dev->opened++ != 0)
- return 0;
+/* we acquire the tty_port reference since it's here the tty is first used
+ * by setting the termios. We also populate the driver_data field and install
+ * the tty port
+ */
+static int rfcomm_tty_install(struct tty_driver *driver, struct tty_struct *tty)
+{
+ struct rfcomm_dev *dev;
+ struct rfcomm_dlc *dlc;
+ int err;
+
+ dev = rfcomm_dev_get(tty->index);
+ if (!dev)
+ return -ENODEV;
dlc = dev->dlc;
/* Attach TTY and open DLC */
-
rfcomm_dlc_lock(dlc);
tty->driver_data = dev;
- dev->tty = tty;
rfcomm_dlc_unlock(dlc);
set_bit(RFCOMM_TTY_ATTACHED, &dev->flags);
- err = rfcomm_dlc_open(dlc, &dev->src, &dev->dst, dev->channel);
- if (err < 0)
+ /* install the tty_port */
+ err = tty_port_install(&dev->port, driver, tty);
+ if (err) {
+ rfcomm_tty_cleanup(tty);
return err;
+ }
- /* Wait for DLC to connect */
- add_wait_queue(&dev->wait, &wait);
- while (1) {
- set_current_state(TASK_INTERRUPTIBLE);
+ /* take over the tty_port reference if the port was created with the
+ * flag RFCOMM_RELEASE_ONHUP. This will force the release of the port
+ * when the last process closes the tty. The behaviour is expected by
+ * userspace.
+ */
+ if (test_bit(RFCOMM_RELEASE_ONHUP, &dev->flags)) {
+ set_bit(RFCOMM_TTY_OWNED, &dev->status);
+ tty_port_put(&dev->port);
+ }
- if (dlc->state == BT_CLOSED) {
- err = -dev->err;
- break;
- }
+ return 0;
+}
- if (dlc->state == BT_CONNECTED)
- break;
+static int rfcomm_tty_open(struct tty_struct *tty, struct file *filp)
+{
+ struct rfcomm_dev *dev = tty->driver_data;
+ int err;
- if (signal_pending(current)) {
- err = -EINTR;
- break;
- }
+ BT_DBG("tty %p id %d", tty, tty->index);
- schedule();
- }
- set_current_state(TASK_RUNNING);
- remove_wait_queue(&dev->wait, &wait);
+ BT_DBG("dev %p dst %pMR channel %d opened %d", dev, &dev->dst,
+ dev->channel, dev->port.count);
- return err;
-}
+ err = tty_port_open(&dev->port, tty, filp);
+ if (err)
+ return err;
-static void rfcomm_tty_close(struct tty_struct *tty, struct file *filp)
-{
- struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data;
- if (!dev)
- return;
+ /*
+ * FIXME: rfcomm should use proper flow control for
+ * received data. This hack will be unnecessary and can
+ * be removed when that's implemented
+ */
+ rfcomm_tty_copy_pending(dev);
- BT_DBG("tty %p dev %p dlc %p opened %d", tty, dev, dev->dlc, dev->opened);
+ rfcomm_dlc_unthrottle(dev->dlc);
- if (--dev->opened == 0) {
- /* Close DLC and dettach TTY */
- rfcomm_dlc_close(dev->dlc, 0);
+ return 0;
+}
- clear_bit(RFCOMM_TTY_ATTACHED, &dev->flags);
- tasklet_kill(&dev->wakeup_task);
+static void rfcomm_tty_close(struct tty_struct *tty, struct file *filp)
+{
+ struct rfcomm_dev *dev = tty->driver_data;
- rfcomm_dlc_lock(dev->dlc);
- tty->driver_data = NULL;
- dev->tty = NULL;
- rfcomm_dlc_unlock(dev->dlc);
- }
+ BT_DBG("tty %p dev %p dlc %p opened %d", tty, dev, dev->dlc,
+ dev->port.count);
- rfcomm_dev_put(dev);
+ tty_port_close(&dev->port, tty, filp);
}
-static int rfcomm_tty_write(struct tty_struct *tty, const unsigned char *buf, int count)
+static ssize_t rfcomm_tty_write(struct tty_struct *tty, const u8 *buf,
+ size_t count)
{
- struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data;
+ struct rfcomm_dev *dev = tty->driver_data;
struct rfcomm_dlc *dlc = dev->dlc;
struct sk_buff *skb;
- int err = 0, sent = 0, size;
+ size_t sent = 0, size;
- BT_DBG("tty %p count %d", tty, count);
+ BT_DBG("tty %p count %zu", tty, count);
while (count) {
- size = min_t(uint, count, dlc->mtu);
+ size = min_t(size_t, count, dlc->mtu);
skb = rfcomm_wmalloc(dev, size + RFCOMM_SKB_RESERVE, GFP_ATOMIC);
-
if (!skb)
break;
skb_reserve(skb, RFCOMM_SKB_HEAD_RESERVE);
- memcpy(skb_put(skb, size), buf + sent, size);
+ skb_put_data(skb, buf + sent, size);
- if ((err = rfcomm_dlc_send(dlc, skb)) < 0) {
- kfree_skb(skb);
- break;
- }
+ rfcomm_dlc_send_noerror(dlc, skb);
sent += size;
count -= size;
}
- return sent ? sent : err;
+ return sent;
}
-static int rfcomm_tty_write_room(struct tty_struct *tty)
+static unsigned int rfcomm_tty_write_room(struct tty_struct *tty)
{
- struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data;
- int room;
+ struct rfcomm_dev *dev = tty->driver_data;
+ int room = 0;
+
+ if (dev && dev->dlc)
+ room = rfcomm_room(dev);
- BT_DBG("tty %p", tty);
+ BT_DBG("tty %p room %d", tty, room);
- room = rfcomm_room(dev->dlc) - atomic_read(&dev->wmem_alloc);
- if (room < 0)
- room = 0;
return room;
}
-static int rfcomm_tty_ioctl(struct tty_struct *tty, struct file *filp, unsigned int cmd, unsigned long arg)
+static int rfcomm_tty_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg)
{
BT_DBG("tty %p cmd 0x%02x", tty, cmd);
@@ -720,22 +831,6 @@ static int rfcomm_tty_ioctl(struct tty_struct *tty, struct file *filp, unsigned
BT_DBG("TIOCMIWAIT");
break;
- case TIOCGICOUNT:
- BT_DBG("TIOCGICOUNT");
- break;
-
- case TIOCGSERIAL:
- BT_ERR("TIOCGSERIAL is not supported");
- return -ENOIOCTLCMD;
-
- case TIOCSSERIAL:
- BT_ERR("TIOCSSERIAL is not supported");
- return -ENOIOCTLCMD;
-
- case TIOCSERGSTRUCT:
- BT_ERR("TIOCSERGSTRUCT is not supported");
- return -ENOIOCTLCMD;
-
case TIOCSERGETLSR:
BT_ERR("TIOCSERGETLSR is not supported");
return -ENOIOCTLCMD;
@@ -752,16 +847,17 @@ static int rfcomm_tty_ioctl(struct tty_struct *tty, struct file *filp, unsigned
return -ENOIOCTLCMD;
}
-static void rfcomm_tty_set_termios(struct tty_struct *tty, struct termios *old)
+static void rfcomm_tty_set_termios(struct tty_struct *tty,
+ const struct ktermios *old)
{
- struct termios *new = (struct termios *) tty->termios;
+ struct ktermios *new = &tty->termios;
int old_baud_rate = tty_termios_baud_rate(old);
int new_baud_rate = tty_termios_baud_rate(new);
u8 baud, data_bits, stop_bits, parity, x_on, x_off;
u16 changes = 0;
- struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data;
+ struct rfcomm_dev *dev = tty->driver_data;
BT_DBG("tty %p termios %p", tty, old);
@@ -769,12 +865,12 @@ static void rfcomm_tty_set_termios(struct tty_struct *tty, struct termios *old)
return;
/* Handle turning off CRTSCTS */
- if ((old->c_cflag & CRTSCTS) && !(new->c_cflag & CRTSCTS))
+ if ((old->c_cflag & CRTSCTS) && !(new->c_cflag & CRTSCTS))
BT_DBG("Turning off CRTSCTS unsupported");
/* Parity on/off and when on, odd/even */
if (((old->c_cflag & PARENB) != (new->c_cflag & PARENB)) ||
- ((old->c_cflag & PARODD) != (new->c_cflag & PARODD)) ) {
+ ((old->c_cflag & PARODD) != (new->c_cflag & PARODD))) {
changes |= RFCOMM_RPN_PM_PARITY;
BT_DBG("Parity change detected.");
}
@@ -819,14 +915,13 @@ static void rfcomm_tty_set_termios(struct tty_struct *tty, struct termios *old)
/* POSIX does not support 1.5 stop bits and RFCOMM does not
* support 2 stop bits. So a request for 2 stop bits gets
* translated to 1.5 stop bits */
- if (new->c_cflag & CSTOPB) {
+ if (new->c_cflag & CSTOPB)
stop_bits = RFCOMM_RPN_STOP_15;
- } else {
+ else
stop_bits = RFCOMM_RPN_STOP_1;
- }
/* Handle number of data bits [5-8] */
- if ((old->c_cflag & CSIZE) != (new->c_cflag & CSIZE))
+ if ((old->c_cflag & CSIZE) != (new->c_cflag & CSIZE))
changes |= RFCOMM_RPN_PM_DATA;
switch (new->c_cflag & CSIZE) {
@@ -864,7 +959,7 @@ static void rfcomm_tty_set_termios(struct tty_struct *tty, struct termios *old)
case 9600:
baud = RFCOMM_RPN_BR_9600;
break;
- case 19200:
+ case 19200:
baud = RFCOMM_RPN_BR_19200;
break;
case 38400:
@@ -880,23 +975,21 @@ static void rfcomm_tty_set_termios(struct tty_struct *tty, struct termios *old)
baud = RFCOMM_RPN_BR_230400;
break;
default:
- /* 9600 is standard accordinag to the RFCOMM specification */
+ /* 9600 is standard according to the RFCOMM specification */
baud = RFCOMM_RPN_BR_9600;
break;
-
+
}
if (changes)
rfcomm_send_rpn(dev->dlc->session, 1, dev->dlc->dlci, baud,
data_bits, stop_bits, parity,
RFCOMM_RPN_FLOW_NONE, x_on, x_off, changes);
-
- return;
}
static void rfcomm_tty_throttle(struct tty_struct *tty)
{
- struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data;
+ struct rfcomm_dev *dev = tty->driver_data;
BT_DBG("tty %p dev %p", tty, dev);
@@ -905,41 +998,42 @@ static void rfcomm_tty_throttle(struct tty_struct *tty)
static void rfcomm_tty_unthrottle(struct tty_struct *tty)
{
- struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data;
+ struct rfcomm_dev *dev = tty->driver_data;
BT_DBG("tty %p dev %p", tty, dev);
rfcomm_dlc_unthrottle(dev->dlc);
}
-static int rfcomm_tty_chars_in_buffer(struct tty_struct *tty)
+static unsigned int rfcomm_tty_chars_in_buffer(struct tty_struct *tty)
{
- struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data;
- struct rfcomm_dlc *dlc = dev->dlc;
+ struct rfcomm_dev *dev = tty->driver_data;
BT_DBG("tty %p dev %p", tty, dev);
- if (!skb_queue_empty(&dlc->tx_queue))
- return dlc->mtu;
+ if (!dev || !dev->dlc)
+ return 0;
+
+ if (!skb_queue_empty(&dev->dlc->tx_queue))
+ return dev->dlc->mtu;
return 0;
}
static void rfcomm_tty_flush_buffer(struct tty_struct *tty)
{
- struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data;
- if (!dev)
- return;
+ struct rfcomm_dev *dev = tty->driver_data;
BT_DBG("tty %p dev %p", tty, dev);
- skb_queue_purge(&dev->dlc->tx_queue);
+ if (!dev || !dev->dlc)
+ return;
- if (test_bit(TTY_DO_WRITE_WAKEUP, &tty->flags) && tty->ldisc.write_wakeup)
- tty->ldisc.write_wakeup(tty);
+ skb_queue_purge(&dev->dlc->tx_queue);
+ tty_wakeup(tty);
}
-static void rfcomm_tty_send_xchar(struct tty_struct *tty, char ch)
+static void rfcomm_tty_send_xchar(struct tty_struct *tty, u8 ch)
{
BT_DBG("tty %p ch %c", tty, ch);
}
@@ -951,35 +1045,29 @@ static void rfcomm_tty_wait_until_sent(struct tty_struct *tty, int timeout)
static void rfcomm_tty_hangup(struct tty_struct *tty)
{
- struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data;
- if (!dev)
- return;
+ struct rfcomm_dev *dev = tty->driver_data;
BT_DBG("tty %p dev %p", tty, dev);
- rfcomm_tty_flush_buffer(tty);
-
- if (test_bit(RFCOMM_RELEASE_ONHUP, &dev->flags))
- rfcomm_dev_del(dev);
-}
-
-static int rfcomm_tty_read_proc(char *buf, char **start, off_t offset, int len, int *eof, void *unused)
-{
- return 0;
+ tty_port_hangup(&dev->port);
}
-static int rfcomm_tty_tiocmget(struct tty_struct *tty, struct file *filp)
+static int rfcomm_tty_tiocmget(struct tty_struct *tty)
{
- struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data;
+ struct rfcomm_dev *dev = tty->driver_data;
+ struct rfcomm_dlc *dlc = dev->dlc;
+ u8 v24_sig;
BT_DBG("tty %p dev %p", tty, dev);
- return dev->modem_status;
+ rfcomm_dlc_get_modem_status(dlc, &v24_sig);
+
+ return (v24_sig & (TIOCM_DTR | TIOCM_RTS)) | dev->modem_status;
}
-static int rfcomm_tty_tiocmset(struct tty_struct *tty, struct file *filp, unsigned int set, unsigned int clear)
+static int rfcomm_tty_tiocmset(struct tty_struct *tty, unsigned int set, unsigned int clear)
{
- struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data;
+ struct rfcomm_dev *dev = tty->driver_data;
struct rfcomm_dlc *dlc = dev->dlc;
u8 v24_sig;
@@ -987,23 +1075,15 @@ static int rfcomm_tty_tiocmset(struct tty_struct *tty, struct file *filp, unsign
rfcomm_dlc_get_modem_status(dlc, &v24_sig);
- if (set & TIOCM_DSR || set & TIOCM_DTR)
+ if (set & TIOCM_DTR)
v24_sig |= RFCOMM_V24_RTC;
- if (set & TIOCM_RTS || set & TIOCM_CTS)
+ if (set & TIOCM_RTS)
v24_sig |= RFCOMM_V24_RTR;
- if (set & TIOCM_RI)
- v24_sig |= RFCOMM_V24_IC;
- if (set & TIOCM_CD)
- v24_sig |= RFCOMM_V24_DV;
- if (clear & TIOCM_DSR || clear & TIOCM_DTR)
+ if (clear & TIOCM_DTR)
v24_sig &= ~RFCOMM_V24_RTC;
- if (clear & TIOCM_RTS || clear & TIOCM_CTS)
+ if (clear & TIOCM_RTS)
v24_sig &= ~RFCOMM_V24_RTR;
- if (clear & TIOCM_RI)
- v24_sig &= ~RFCOMM_V24_IC;
- if (clear & TIOCM_CD)
- v24_sig &= ~RFCOMM_V24_DV;
rfcomm_dlc_set_modem_status(dlc, v24_sig);
@@ -1026,33 +1106,37 @@ static const struct tty_operations rfcomm_ops = {
.send_xchar = rfcomm_tty_send_xchar,
.hangup = rfcomm_tty_hangup,
.wait_until_sent = rfcomm_tty_wait_until_sent,
- .read_proc = rfcomm_tty_read_proc,
.tiocmget = rfcomm_tty_tiocmget,
.tiocmset = rfcomm_tty_tiocmset,
+ .install = rfcomm_tty_install,
+ .cleanup = rfcomm_tty_cleanup,
};
-int rfcomm_init_ttys(void)
+int __init rfcomm_init_ttys(void)
{
- rfcomm_tty_driver = alloc_tty_driver(RFCOMM_TTY_PORTS);
- if (!rfcomm_tty_driver)
- return -1;
+ int error;
+
+ rfcomm_tty_driver = tty_alloc_driver(RFCOMM_TTY_PORTS,
+ TTY_DRIVER_REAL_RAW | TTY_DRIVER_DYNAMIC_DEV);
+ if (IS_ERR(rfcomm_tty_driver))
+ return PTR_ERR(rfcomm_tty_driver);
- rfcomm_tty_driver->owner = THIS_MODULE;
rfcomm_tty_driver->driver_name = "rfcomm";
rfcomm_tty_driver->name = "rfcomm";
rfcomm_tty_driver->major = RFCOMM_TTY_MAJOR;
rfcomm_tty_driver->minor_start = RFCOMM_TTY_MINOR;
rfcomm_tty_driver->type = TTY_DRIVER_TYPE_SERIAL;
rfcomm_tty_driver->subtype = SERIAL_TYPE_NORMAL;
- rfcomm_tty_driver->flags = TTY_DRIVER_REAL_RAW | TTY_DRIVER_DYNAMIC_DEV;
rfcomm_tty_driver->init_termios = tty_std_termios;
- rfcomm_tty_driver->init_termios.c_cflag = B9600 | CS8 | CREAD | HUPCL | CLOCAL;
+ rfcomm_tty_driver->init_termios.c_cflag = B9600 | CS8 | CREAD | HUPCL;
+ rfcomm_tty_driver->init_termios.c_lflag &= ~ICANON;
tty_set_operations(rfcomm_tty_driver, &rfcomm_ops);
- if (tty_register_driver(rfcomm_tty_driver)) {
+ error = tty_register_driver(rfcomm_tty_driver);
+ if (error) {
BT_ERR("Can't register RFCOMM TTY driver");
- put_tty_driver(rfcomm_tty_driver);
- return -1;
+ tty_driver_kref_put(rfcomm_tty_driver);
+ return error;
}
BT_INFO("RFCOMM TTY layer initialized");
@@ -1063,5 +1147,5 @@ int rfcomm_init_ttys(void)
void rfcomm_cleanup_ttys(void)
{
tty_unregister_driver(rfcomm_tty_driver);
- put_tty_driver(rfcomm_tty_driver);
+ tty_driver_kref_put(rfcomm_tty_driver);
}
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index 5d13d4f31753..87ba90336e80 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -1,4 +1,4 @@
-/*
+/*
BlueZ - Bluetooth protocol stack for Linux
Copyright (C) 2000-2001 Qualcomm Incorporated
@@ -12,121 +12,212 @@
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
- CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
- WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
- COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+ ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+ COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
SOFTWARE IS DISCLAIMED.
*/
/* Bluetooth SCO sockets. */
#include <linux/module.h>
-
-#include <linux/types.h>
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/poll.h>
-#include <linux/fcntl.h>
-#include <linux/init.h>
-#include <linux/interrupt.h>
-#include <linux/socket.h>
-#include <linux/skbuff.h>
-#include <linux/device.h>
-#include <linux/list.h>
-#include <net/sock.h>
-
-#include <asm/system.h>
-#include <asm/uaccess.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/sched/signal.h>
#include <net/bluetooth/bluetooth.h>
#include <net/bluetooth/hci_core.h>
#include <net/bluetooth/sco.h>
-#ifndef CONFIG_BT_SCO_DEBUG
-#undef BT_DBG
-#define BT_DBG(D...)
-#endif
-
-#define VERSION "0.5"
+static bool disable_esco;
static const struct proto_ops sco_sock_ops;
static struct bt_sock_list sco_sk_list = {
- .lock = RW_LOCK_UNLOCKED
+ .lock = __RW_LOCK_UNLOCKED(sco_sk_list.lock)
};
-static void __sco_chan_add(struct sco_conn *conn, struct sock *sk, struct sock *parent);
-static void sco_chan_del(struct sock *sk, int err);
+/* ---- SCO connections ---- */
+struct sco_conn {
+ struct hci_conn *hcon;
-static int sco_conn_del(struct hci_conn *conn, int err);
+ spinlock_t lock;
+ struct sock *sk;
+
+ struct delayed_work timeout_work;
+
+ unsigned int mtu;
+ struct kref ref;
+};
+
+#define sco_conn_lock(c) spin_lock(&c->lock)
+#define sco_conn_unlock(c) spin_unlock(&c->lock)
static void sco_sock_close(struct sock *sk);
static void sco_sock_kill(struct sock *sk);
+/* ----- SCO socket info ----- */
+#define sco_pi(sk) ((struct sco_pinfo *) sk)
+
+struct sco_pinfo {
+ struct bt_sock bt;
+ bdaddr_t src;
+ bdaddr_t dst;
+ __u32 flags;
+ __u16 setting;
+ struct bt_codec codec;
+ struct sco_conn *conn;
+};
+
/* ---- SCO timers ---- */
-static void sco_sock_timeout(unsigned long arg)
+#define SCO_CONN_TIMEOUT (HZ * 40)
+#define SCO_DISCONN_TIMEOUT (HZ * 2)
+
+static void sco_conn_free(struct kref *ref)
+{
+ struct sco_conn *conn = container_of(ref, struct sco_conn, ref);
+
+ BT_DBG("conn %p", conn);
+
+ if (conn->sk)
+ sco_pi(conn->sk)->conn = NULL;
+
+ if (conn->hcon) {
+ conn->hcon->sco_data = NULL;
+ hci_conn_drop(conn->hcon);
+ }
+
+ /* Ensure no more work items will run since hci_conn has been dropped */
+ disable_delayed_work_sync(&conn->timeout_work);
+
+ kfree(conn);
+}
+
+static void sco_conn_put(struct sco_conn *conn)
+{
+ if (!conn)
+ return;
+
+ BT_DBG("conn %p refcnt %d", conn, kref_read(&conn->ref));
+
+ kref_put(&conn->ref, sco_conn_free);
+}
+
+static struct sco_conn *sco_conn_hold(struct sco_conn *conn)
+{
+ BT_DBG("conn %p refcnt %u", conn, kref_read(&conn->ref));
+
+ kref_get(&conn->ref);
+ return conn;
+}
+
+static struct sco_conn *sco_conn_hold_unless_zero(struct sco_conn *conn)
+{
+ if (!conn)
+ return NULL;
+
+ BT_DBG("conn %p refcnt %u", conn, kref_read(&conn->ref));
+
+ if (!kref_get_unless_zero(&conn->ref))
+ return NULL;
+
+ return conn;
+}
+
+static struct sock *sco_sock_hold(struct sco_conn *conn)
{
- struct sock *sk = (struct sock *) arg;
+ if (!conn || !bt_sock_linked(&sco_sk_list, conn->sk))
+ return NULL;
+
+ sock_hold(conn->sk);
+
+ return conn->sk;
+}
+
+static void sco_sock_timeout(struct work_struct *work)
+{
+ struct sco_conn *conn = container_of(work, struct sco_conn,
+ timeout_work.work);
+ struct sock *sk;
+
+ conn = sco_conn_hold_unless_zero(conn);
+ if (!conn)
+ return;
+
+ sco_conn_lock(conn);
+ if (!conn->hcon) {
+ sco_conn_unlock(conn);
+ sco_conn_put(conn);
+ return;
+ }
+ sk = sco_sock_hold(conn);
+ sco_conn_unlock(conn);
+ sco_conn_put(conn);
+
+ if (!sk)
+ return;
BT_DBG("sock %p state %d", sk, sk->sk_state);
- bh_lock_sock(sk);
+ lock_sock(sk);
sk->sk_err = ETIMEDOUT;
sk->sk_state_change(sk);
- bh_unlock_sock(sk);
-
- sco_sock_kill(sk);
+ release_sock(sk);
sock_put(sk);
}
static void sco_sock_set_timer(struct sock *sk, long timeout)
{
+ if (!sco_pi(sk)->conn)
+ return;
+
BT_DBG("sock %p state %d timeout %ld", sk, sk->sk_state, timeout);
- sk_reset_timer(sk, &sk->sk_timer, jiffies + timeout);
+ cancel_delayed_work(&sco_pi(sk)->conn->timeout_work);
+ schedule_delayed_work(&sco_pi(sk)->conn->timeout_work, timeout);
}
static void sco_sock_clear_timer(struct sock *sk)
{
- BT_DBG("sock %p state %d", sk, sk->sk_state);
- sk_stop_timer(sk, &sk->sk_timer);
-}
+ if (!sco_pi(sk)->conn)
+ return;
-static void sco_sock_init_timer(struct sock *sk)
-{
- init_timer(&sk->sk_timer);
- sk->sk_timer.function = sco_sock_timeout;
- sk->sk_timer.data = (unsigned long)sk;
+ BT_DBG("sock %p state %d", sk, sk->sk_state);
+ cancel_delayed_work(&sco_pi(sk)->conn->timeout_work);
}
/* ---- SCO connections ---- */
-static struct sco_conn *sco_conn_add(struct hci_conn *hcon, __u8 status)
+static struct sco_conn *sco_conn_add(struct hci_conn *hcon)
{
- struct hci_dev *hdev = hcon->hdev;
struct sco_conn *conn = hcon->sco_data;
- if (conn || status)
+ conn = sco_conn_hold_unless_zero(conn);
+ if (conn) {
+ if (!conn->hcon) {
+ sco_conn_lock(conn);
+ conn->hcon = hcon;
+ sco_conn_unlock(conn);
+ }
return conn;
+ }
- conn = kzalloc(sizeof(struct sco_conn), GFP_ATOMIC);
+ conn = kzalloc(sizeof(struct sco_conn), GFP_KERNEL);
if (!conn)
return NULL;
+ kref_init(&conn->ref);
spin_lock_init(&conn->lock);
+ INIT_DELAYED_WORK(&conn->timeout_work, sco_sock_timeout);
hcon->sco_data = conn;
conn->hcon = hcon;
+ conn->mtu = hcon->mtu;
- conn->src = &hdev->bdaddr;
- conn->dst = &hcon->dst;
-
- if (hdev->sco_mtu > 0)
- conn->mtu = hdev->sco_mtu;
+ if (hcon->mtu > 0)
+ conn->mtu = hcon->mtu;
else
conn->mtu = 60;
@@ -135,106 +226,163 @@ static struct sco_conn *sco_conn_add(struct hci_conn *hcon, __u8 status)
return conn;
}
-static inline struct sock *sco_chan_get(struct sco_conn *conn)
+/* Delete channel.
+ * Must be called on the locked socket. */
+static void sco_chan_del(struct sock *sk, int err)
{
- struct sock *sk = NULL;
- sco_conn_lock(conn);
- sk = conn->sk;
- sco_conn_unlock(conn);
- return sk;
+ struct sco_conn *conn;
+
+ conn = sco_pi(sk)->conn;
+ sco_pi(sk)->conn = NULL;
+
+ BT_DBG("sk %p, conn %p, err %d", sk, conn, err);
+
+ if (conn) {
+ sco_conn_lock(conn);
+ conn->sk = NULL;
+ sco_conn_unlock(conn);
+ sco_conn_put(conn);
+ }
+
+ sk->sk_state = BT_CLOSED;
+ sk->sk_err = err;
+ sk->sk_state_change(sk);
+
+ sock_set_flag(sk, SOCK_ZAPPED);
}
-static int sco_conn_del(struct hci_conn *hcon, int err)
+static void sco_conn_del(struct hci_conn *hcon, int err)
{
- struct sco_conn *conn;
+ struct sco_conn *conn = hcon->sco_data;
struct sock *sk;
- if (!(conn = hcon->sco_data))
- return 0;
+ conn = sco_conn_hold_unless_zero(conn);
+ if (!conn)
+ return;
BT_DBG("hcon %p conn %p, err %d", hcon, conn, err);
- /* Kill socket */
- if ((sk = sco_chan_get(conn))) {
- bh_lock_sock(sk);
- sco_sock_clear_timer(sk);
- sco_chan_del(sk, err);
- bh_unlock_sock(sk);
- sco_sock_kill(sk);
+ sco_conn_lock(conn);
+ sk = sco_sock_hold(conn);
+ sco_conn_unlock(conn);
+ sco_conn_put(conn);
+
+ if (!sk) {
+ sco_conn_put(conn);
+ return;
}
- hcon->sco_data = NULL;
- kfree(conn);
- return 0;
+ /* Kill socket */
+ lock_sock(sk);
+ sco_sock_clear_timer(sk);
+ sco_chan_del(sk, err);
+ release_sock(sk);
+ sock_put(sk);
+}
+
+static void __sco_chan_add(struct sco_conn *conn, struct sock *sk,
+ struct sock *parent)
+{
+ BT_DBG("conn %p", conn);
+
+ sco_pi(sk)->conn = conn;
+ conn->sk = sk;
+
+ if (parent)
+ bt_accept_enqueue(parent, sk, true);
}
-static inline int sco_chan_add(struct sco_conn *conn, struct sock *sk, struct sock *parent)
+static int sco_chan_add(struct sco_conn *conn, struct sock *sk,
+ struct sock *parent)
{
int err = 0;
sco_conn_lock(conn);
- if (conn->sk) {
+ if (conn->sk)
err = -EBUSY;
- } else {
+ else
__sco_chan_add(conn, sk, parent);
- }
+
sco_conn_unlock(conn);
return err;
}
static int sco_connect(struct sock *sk)
{
- bdaddr_t *src = &bt_sk(sk)->src;
- bdaddr_t *dst = &bt_sk(sk)->dst;
struct sco_conn *conn;
struct hci_conn *hcon;
struct hci_dev *hdev;
- int err = 0;
+ int err, type;
- BT_DBG("%s -> %s", batostr(src), batostr(dst));
+ BT_DBG("%pMR -> %pMR", &sco_pi(sk)->src, &sco_pi(sk)->dst);
- if (!(hdev = hci_get_route(dst, src)))
+ hdev = hci_get_route(&sco_pi(sk)->dst, &sco_pi(sk)->src, BDADDR_BREDR);
+ if (!hdev)
return -EHOSTUNREACH;
- hci_dev_lock_bh(hdev);
+ hci_dev_lock(hdev);
- err = -ENOMEM;
+ if (lmp_esco_capable(hdev) && !disable_esco)
+ type = ESCO_LINK;
+ else
+ type = SCO_LINK;
- hcon = hci_connect(hdev, SCO_LINK, dst);
- if (!hcon)
- goto done;
+ switch (sco_pi(sk)->setting & SCO_AIRMODE_MASK) {
+ case SCO_AIRMODE_TRANSP:
+ if (!lmp_transp_capable(hdev) || !lmp_esco_capable(hdev)) {
+ err = -EOPNOTSUPP;
+ goto unlock;
+ }
+ break;
+ }
- conn = sco_conn_add(hcon, 0);
+ hcon = hci_connect_sco(hdev, type, &sco_pi(sk)->dst,
+ sco_pi(sk)->setting, &sco_pi(sk)->codec,
+ READ_ONCE(sk->sk_sndtimeo));
+ if (IS_ERR(hcon)) {
+ err = PTR_ERR(hcon);
+ goto unlock;
+ }
+
+ conn = sco_conn_add(hcon);
if (!conn) {
- hci_conn_put(hcon);
- goto done;
+ hci_conn_drop(hcon);
+ err = -ENOMEM;
+ goto unlock;
}
- /* Update source addr of the socket */
- bacpy(src, conn->src);
+ lock_sock(sk);
err = sco_chan_add(conn, sk, NULL);
- if (err)
- goto done;
+ if (err) {
+ release_sock(sk);
+ goto unlock;
+ }
+
+ /* Update source addr of the socket */
+ bacpy(&sco_pi(sk)->src, &hcon->src);
if (hcon->state == BT_CONNECTED) {
sco_sock_clear_timer(sk);
sk->sk_state = BT_CONNECTED;
} else {
sk->sk_state = BT_CONNECT;
- sco_sock_set_timer(sk, sk->sk_sndtimeo);
+ sco_sock_set_timer(sk, READ_ONCE(sk->sk_sndtimeo));
}
-done:
- hci_dev_unlock_bh(hdev);
+
+ release_sock(sk);
+
+unlock:
+ hci_dev_unlock(hdev);
hci_dev_put(hdev);
return err;
}
-static inline int sco_send_frame(struct sock *sk, struct msghdr *msg, int len)
+static int sco_send_frame(struct sock *sk, struct sk_buff *skb,
+ const struct sockcm_cookie *sockc)
{
struct sco_conn *conn = sco_pi(sk)->conn;
- struct sk_buff *skb;
- int err, count;
+ int len = skb->len;
/* Check outgoing MTU */
if (len > conn->mtu)
@@ -242,33 +390,24 @@ static inline int sco_send_frame(struct sock *sk, struct msghdr *msg, int len)
BT_DBG("sk %p len %d", sk, len);
- count = min_t(unsigned int, conn->mtu, len);
- if (!(skb = bt_skb_send_alloc(sk, count, msg->msg_flags & MSG_DONTWAIT, &err)))
- return err;
-
- if (memcpy_fromiovec(skb_put(skb, count), msg->msg_iov, count)) {
- err = -EFAULT;
- goto fail;
- }
-
- if ((err = hci_send_sco(conn->hcon, skb)) < 0)
- return err;
-
- return count;
+ hci_setup_tx_timestamp(skb, 1, sockc);
+ hci_send_sco(conn->hcon, skb);
-fail:
- kfree_skb(skb);
- return err;
+ return len;
}
-static inline void sco_recv_frame(struct sco_conn *conn, struct sk_buff *skb)
+static void sco_recv_frame(struct sco_conn *conn, struct sk_buff *skb)
{
- struct sock *sk = sco_chan_get(conn);
+ struct sock *sk;
+
+ sco_conn_lock(conn);
+ sk = conn->sk;
+ sco_conn_unlock(conn);
if (!sk)
goto drop;
- BT_DBG("sk %p len %d", sk, skb->len);
+ BT_DBG("sk %p len %u", sk, skb->len);
if (sk->sk_state != BT_CONNECTED)
goto drop;
@@ -278,21 +417,22 @@ static inline void sco_recv_frame(struct sco_conn *conn, struct sk_buff *skb)
drop:
kfree_skb(skb);
- return;
}
/* -------- Socket interface ---------- */
-static struct sock *__sco_get_sock_by_addr(bdaddr_t *ba)
+static struct sock *__sco_get_sock_listen_by_addr(bdaddr_t *ba)
{
struct sock *sk;
- struct hlist_node *node;
- sk_for_each(sk, node, &sco_sk_list.head)
- if (!bacmp(&bt_sk(sk)->src, ba))
- goto found;
- sk = NULL;
-found:
- return sk;
+ sk_for_each(sk, &sco_sk_list.head) {
+ if (sk->sk_state != BT_LISTEN)
+ continue;
+
+ if (!bacmp(&sco_pi(sk)->src, ba))
+ return sk;
+ }
+
+ return NULL;
}
/* Find socket listening on source bdaddr.
@@ -301,32 +441,33 @@ found:
static struct sock *sco_get_sock_listen(bdaddr_t *src)
{
struct sock *sk = NULL, *sk1 = NULL;
- struct hlist_node *node;
read_lock(&sco_sk_list.lock);
- sk_for_each(sk, node, &sco_sk_list.head) {
+ sk_for_each(sk, &sco_sk_list.head) {
if (sk->sk_state != BT_LISTEN)
continue;
/* Exact match. */
- if (!bacmp(&bt_sk(sk)->src, src))
+ if (!bacmp(&sco_pi(sk)->src, src))
break;
/* Closest match */
- if (!bacmp(&bt_sk(sk)->src, BDADDR_ANY))
+ if (!bacmp(&sco_pi(sk)->src, BDADDR_ANY))
sk1 = sk;
}
read_unlock(&sco_sk_list.lock);
- return node ? sk : sk1;
+ return sk ? sk : sk1;
}
static void sco_sock_destruct(struct sock *sk)
{
BT_DBG("sk %p", sk);
+ sco_conn_put(sco_pi(sk)->conn);
+
skb_queue_purge(&sk->sk_receive_queue);
skb_queue_purge(&sk->sk_write_queue);
}
@@ -357,26 +498,22 @@ static void sco_sock_kill(struct sock *sk)
BT_DBG("sk %p state %d", sk, sk->sk_state);
+ /* Sock is dead, so set conn->sk to NULL to avoid possible UAF */
+ if (sco_pi(sk)->conn) {
+ sco_conn_lock(sco_pi(sk)->conn);
+ sco_pi(sk)->conn->sk = NULL;
+ sco_conn_unlock(sco_pi(sk)->conn);
+ }
+
/* Kill poor orphan */
bt_sock_unlink(&sco_sk_list, sk);
sock_set_flag(sk, SOCK_DEAD);
sock_put(sk);
}
-/* Close socket.
- * Must be called on unlocked socket.
- */
-static void sco_sock_close(struct sock *sk)
+static void __sco_sock_close(struct sock *sk)
{
- struct sco_conn *conn;
-
- sco_sock_clear_timer(sk);
-
- lock_sock(sk);
-
- conn = sco_pi(sk)->conn;
-
- BT_DBG("sk %p state %d conn %p socket %p", sk, sk->sk_state, conn, sk->sk_socket);
+ BT_DBG("sk %p state %d socket %p", sk, sk->sk_state, sk->sk_socket);
switch (sk->sk_state) {
case BT_LISTEN:
@@ -385,6 +522,7 @@ static void sco_sock_close(struct sock *sk)
case BT_CONNECTED:
case BT_CONFIG:
+ case BT_CONNECT2:
case BT_CONNECT:
case BT_DISCONN:
sco_chan_del(sk, ECONNRESET);
@@ -393,19 +531,28 @@ static void sco_sock_close(struct sock *sk)
default:
sock_set_flag(sk, SOCK_ZAPPED);
break;
- };
+ }
- release_sock(sk);
+}
- sco_sock_kill(sk);
+/* Must be called on unlocked socket. */
+static void sco_sock_close(struct sock *sk)
+{
+ lock_sock(sk);
+ sco_sock_clear_timer(sk);
+ __sco_sock_close(sk);
+ release_sock(sk);
}
static void sco_sock_init(struct sock *sk, struct sock *parent)
{
BT_DBG("sk %p", sk);
- if (parent)
+ if (parent) {
sk->sk_type = parent->sk_type;
+ bt_sk(sk)->flags = bt_sk(parent)->flags;
+ security_sk_clone(parent, sk);
+ }
}
static struct proto sco_proto = {
@@ -414,32 +561,30 @@ static struct proto sco_proto = {
.obj_size = sizeof(struct sco_pinfo)
};
-static struct sock *sco_sock_alloc(struct socket *sock, int proto, gfp_t prio)
+static struct sock *sco_sock_alloc(struct net *net, struct socket *sock,
+ int proto, gfp_t prio, int kern)
{
struct sock *sk;
- sk = sk_alloc(PF_BLUETOOTH, prio, &sco_proto, 1);
+ sk = bt_sock_alloc(net, sock, &sco_proto, proto, prio, kern);
if (!sk)
return NULL;
- sock_init_data(sock, sk);
- INIT_LIST_HEAD(&bt_sk(sk)->accept_q);
-
sk->sk_destruct = sco_sock_destruct;
sk->sk_sndtimeo = SCO_CONN_TIMEOUT;
- sock_reset_flag(sk, SOCK_ZAPPED);
-
- sk->sk_protocol = proto;
- sk->sk_state = BT_OPEN;
-
- sco_sock_init_timer(sk);
+ sco_pi(sk)->setting = BT_VOICE_CVSD_16BIT;
+ sco_pi(sk)->codec.id = BT_CODEC_CVSD;
+ sco_pi(sk)->codec.cid = 0xffff;
+ sco_pi(sk)->codec.vid = 0xffff;
+ sco_pi(sk)->codec.data_path = 0x00;
bt_sock_link(&sco_sk_list, sk);
return sk;
}
-static int sco_sock_create(struct socket *sock, int protocol)
+static int sco_sock_create(struct net *net, struct socket *sock, int protocol,
+ int kern)
{
struct sock *sk;
@@ -452,7 +597,7 @@ static int sco_sock_create(struct socket *sock, int protocol)
sock->ops = &sco_sock_ops;
- sk = sco_sock_alloc(sock, protocol, GFP_ATOMIC);
+ sk = sco_sock_alloc(net, sock, protocol, GFP_ATOMIC, kern);
if (!sk)
return -ENOMEM;
@@ -460,18 +605,19 @@ static int sco_sock_create(struct socket *sock, int protocol)
return 0;
}
-static int sco_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
+static int sco_sock_bind(struct socket *sock, struct sockaddr_unsized *addr,
+ int addr_len)
{
struct sockaddr_sco *sa = (struct sockaddr_sco *) addr;
struct sock *sk = sock->sk;
- bdaddr_t *src = &sa->sco_bdaddr;
int err = 0;
- BT_DBG("sk %p %s", sk, batostr(&sa->sco_bdaddr));
-
- if (!addr || addr->sa_family != AF_BLUETOOTH)
+ if (!addr || addr_len < sizeof(struct sockaddr_sco) ||
+ addr->sa_family != AF_BLUETOOTH)
return -EINVAL;
+ BT_DBG("sk %p %pMR", sk, &sa->sco_bdaddr);
+
lock_sock(sk);
if (sk->sk_state != BT_OPEN) {
@@ -479,53 +625,52 @@ static int sco_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_le
goto done;
}
- write_lock_bh(&sco_sk_list.lock);
-
- if (bacmp(src, BDADDR_ANY) && __sco_get_sock_by_addr(src)) {
- err = -EADDRINUSE;
- } else {
- /* Save source address */
- bacpy(&bt_sk(sk)->src, &sa->sco_bdaddr);
- sk->sk_state = BT_BOUND;
+ if (sk->sk_type != SOCK_SEQPACKET) {
+ err = -EINVAL;
+ goto done;
}
- write_unlock_bh(&sco_sk_list.lock);
+ bacpy(&sco_pi(sk)->src, &sa->sco_bdaddr);
+
+ sk->sk_state = BT_BOUND;
done:
release_sock(sk);
return err;
}
-static int sco_sock_connect(struct socket *sock, struct sockaddr *addr, int alen, int flags)
+static int sco_sock_connect(struct socket *sock, struct sockaddr_unsized *addr, int alen, int flags)
{
struct sockaddr_sco *sa = (struct sockaddr_sco *) addr;
struct sock *sk = sock->sk;
- int err = 0;
-
+ int err;
BT_DBG("sk %p", sk);
- if (addr->sa_family != AF_BLUETOOTH || alen < sizeof(struct sockaddr_sco))
+ if (alen < sizeof(struct sockaddr_sco) ||
+ addr->sa_family != AF_BLUETOOTH)
return -EINVAL;
if (sk->sk_state != BT_OPEN && sk->sk_state != BT_BOUND)
return -EBADFD;
if (sk->sk_type != SOCK_SEQPACKET)
- return -EINVAL;
+ err = -EINVAL;
lock_sock(sk);
-
/* Set destination address and psm */
- bacpy(&bt_sk(sk)->dst, &sa->sco_bdaddr);
+ bacpy(&sco_pi(sk)->dst, &sa->sco_bdaddr);
+ release_sock(sk);
- if ((err = sco_connect(sk)))
- goto done;
+ err = sco_connect(sk);
+ if (err)
+ return err;
+
+ lock_sock(sk);
- err = bt_sock_wait_state(sk, BT_CONNECTED,
- sock_sndtimeo(sk, flags & O_NONBLOCK));
+ err = bt_sock_wait_state(sk, BT_CONNECTED,
+ sock_sndtimeo(sk, flags & O_NONBLOCK));
-done:
release_sock(sk);
return err;
}
@@ -533,59 +678,71 @@ done:
static int sco_sock_listen(struct socket *sock, int backlog)
{
struct sock *sk = sock->sk;
+ bdaddr_t *src = &sco_pi(sk)->src;
int err = 0;
BT_DBG("sk %p backlog %d", sk, backlog);
lock_sock(sk);
- if (sk->sk_state != BT_BOUND || sock->type != SOCK_SEQPACKET) {
+ if (sk->sk_state != BT_BOUND) {
err = -EBADFD;
goto done;
}
+ if (sk->sk_type != SOCK_SEQPACKET) {
+ err = -EINVAL;
+ goto done;
+ }
+
+ write_lock(&sco_sk_list.lock);
+
+ if (__sco_get_sock_listen_by_addr(src)) {
+ err = -EADDRINUSE;
+ goto unlock;
+ }
+
sk->sk_max_ack_backlog = backlog;
sk->sk_ack_backlog = 0;
+
sk->sk_state = BT_LISTEN;
+unlock:
+ write_unlock(&sco_sk_list.lock);
+
done:
release_sock(sk);
return err;
}
-static int sco_sock_accept(struct socket *sock, struct socket *newsock, int flags)
+static int sco_sock_accept(struct socket *sock, struct socket *newsock,
+ struct proto_accept_arg *arg)
{
- DECLARE_WAITQUEUE(wait, current);
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
struct sock *sk = sock->sk, *ch;
long timeo;
int err = 0;
lock_sock(sk);
- if (sk->sk_state != BT_LISTEN) {
- err = -EBADFD;
- goto done;
- }
-
- timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
+ timeo = sock_rcvtimeo(sk, arg->flags & O_NONBLOCK);
BT_DBG("sk %p timeo %ld", sk, timeo);
/* Wait for an incoming connection. (wake-one). */
- add_wait_queue_exclusive(sk->sk_sleep, &wait);
- while (!(ch = bt_accept_dequeue(sk, newsock))) {
- set_current_state(TASK_INTERRUPTIBLE);
- if (!timeo) {
- err = -EAGAIN;
+ add_wait_queue_exclusive(sk_sleep(sk), &wait);
+ while (1) {
+ if (sk->sk_state != BT_LISTEN) {
+ err = -EBADFD;
break;
}
- release_sock(sk);
- timeo = schedule_timeout(timeo);
- lock_sock(sk);
+ ch = bt_accept_dequeue(sk, newsock);
+ if (ch)
+ break;
- if (sk->sk_state != BT_LISTEN) {
- err = -EBADFD;
+ if (!timeo) {
+ err = -EAGAIN;
break;
}
@@ -593,9 +750,13 @@ static int sco_sock_accept(struct socket *sock, struct socket *newsock, int flag
err = sock_intr_errno(timeo);
break;
}
+
+ release_sock(sk);
+
+ timeo = wait_woken(&wait, TASK_INTERRUPTIBLE, timeo);
+ lock_sock(sk);
}
- set_current_state(TASK_RUNNING);
- remove_wait_queue(sk->sk_sleep, &wait);
+ remove_wait_queue(sk_sleep(sk), &wait);
if (err)
goto done;
@@ -609,7 +770,8 @@ done:
return err;
}
-static int sco_sock_getname(struct socket *sock, struct sockaddr *addr, int *len, int peer)
+static int sco_sock_getname(struct socket *sock, struct sockaddr *addr,
+ int peer)
{
struct sockaddr_sco *sa = (struct sockaddr_sco *) addr;
struct sock *sk = sock->sk;
@@ -617,21 +779,22 @@ static int sco_sock_getname(struct socket *sock, struct sockaddr *addr, int *len
BT_DBG("sock %p, sk %p", sock, sk);
addr->sa_family = AF_BLUETOOTH;
- *len = sizeof(struct sockaddr_sco);
if (peer)
- bacpy(&sa->sco_bdaddr, &bt_sk(sk)->dst);
+ bacpy(&sa->sco_bdaddr, &sco_pi(sk)->dst);
else
- bacpy(&sa->sco_bdaddr, &bt_sk(sk)->src);
+ bacpy(&sa->sco_bdaddr, &sco_pi(sk)->src);
- return 0;
+ return sizeof(struct sockaddr_sco);
}
-static int sco_sock_sendmsg(struct kiocb *iocb, struct socket *sock,
- struct msghdr *msg, size_t len)
+static int sco_sock_sendmsg(struct socket *sock, struct msghdr *msg,
+ size_t len)
{
struct sock *sk = sock->sk;
- int err = 0;
+ struct sk_buff *skb;
+ struct sockcm_cookie sockc;
+ int err;
BT_DBG("sock %p, sk %p", sock, sk);
@@ -642,27 +805,235 @@ static int sco_sock_sendmsg(struct kiocb *iocb, struct socket *sock,
if (msg->msg_flags & MSG_OOB)
return -EOPNOTSUPP;
+ hci_sockcm_init(&sockc, sk);
+
+ if (msg->msg_controllen) {
+ err = sock_cmsg_send(sk, msg, &sockc);
+ if (err)
+ return err;
+ }
+
+ skb = bt_skb_sendmsg(sk, msg, len, len, 0, 0);
+ if (IS_ERR(skb))
+ return PTR_ERR(skb);
+
lock_sock(sk);
if (sk->sk_state == BT_CONNECTED)
- err = sco_send_frame(sk, msg, len);
+ err = sco_send_frame(sk, skb, &sockc);
else
err = -ENOTCONN;
release_sock(sk);
+
+ if (err < 0)
+ kfree_skb(skb);
return err;
}
-static int sco_sock_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen)
+static void sco_conn_defer_accept(struct hci_conn *conn, u16 setting)
+{
+ struct hci_dev *hdev = conn->hdev;
+
+ BT_DBG("conn %p", conn);
+
+ conn->state = BT_CONFIG;
+
+ if (!lmp_esco_capable(hdev)) {
+ struct hci_cp_accept_conn_req cp;
+
+ bacpy(&cp.bdaddr, &conn->dst);
+ cp.role = 0x00; /* Ignored */
+
+ hci_send_cmd(hdev, HCI_OP_ACCEPT_CONN_REQ, sizeof(cp), &cp);
+ } else {
+ struct hci_cp_accept_sync_conn_req cp;
+
+ bacpy(&cp.bdaddr, &conn->dst);
+ cp.pkt_type = cpu_to_le16(conn->pkt_type);
+
+ cp.tx_bandwidth = cpu_to_le32(0x00001f40);
+ cp.rx_bandwidth = cpu_to_le32(0x00001f40);
+ cp.content_format = cpu_to_le16(setting);
+
+ switch (setting & SCO_AIRMODE_MASK) {
+ case SCO_AIRMODE_TRANSP:
+ if (conn->pkt_type & ESCO_2EV3)
+ cp.max_latency = cpu_to_le16(0x0008);
+ else
+ cp.max_latency = cpu_to_le16(0x000D);
+ cp.retrans_effort = 0x02;
+ break;
+ case SCO_AIRMODE_CVSD:
+ cp.max_latency = cpu_to_le16(0xffff);
+ cp.retrans_effort = 0xff;
+ break;
+ default:
+ /* use CVSD settings as fallback */
+ cp.max_latency = cpu_to_le16(0xffff);
+ cp.retrans_effort = 0xff;
+ break;
+ }
+
+ hci_send_cmd(hdev, HCI_OP_ACCEPT_SYNC_CONN_REQ,
+ sizeof(cp), &cp);
+ }
+}
+
+static int sco_sock_recvmsg(struct socket *sock, struct msghdr *msg,
+ size_t len, int flags)
+{
+ struct sock *sk = sock->sk;
+ struct sco_pinfo *pi = sco_pi(sk);
+
+ if (unlikely(flags & MSG_ERRQUEUE))
+ return sock_recv_errqueue(sk, msg, len, SOL_BLUETOOTH,
+ BT_SCM_ERROR);
+
+ lock_sock(sk);
+
+ if (sk->sk_state == BT_CONNECT2 &&
+ test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) {
+ sco_conn_defer_accept(pi->conn->hcon, pi->setting);
+ sk->sk_state = BT_CONFIG;
+
+ release_sock(sk);
+ return 0;
+ }
+
+ release_sock(sk);
+
+ return bt_sock_recvmsg(sock, msg, len, flags);
+}
+
+static int sco_sock_setsockopt(struct socket *sock, int level, int optname,
+ sockptr_t optval, unsigned int optlen)
{
struct sock *sk = sock->sk;
int err = 0;
+ struct bt_voice voice;
+ u32 opt;
+ struct bt_codecs *codecs;
+ struct hci_dev *hdev;
+ __u8 buffer[255];
BT_DBG("sk %p", sk);
lock_sock(sk);
switch (optname) {
+
+ case BT_DEFER_SETUP:
+ if (sk->sk_state != BT_BOUND && sk->sk_state != BT_LISTEN) {
+ err = -EINVAL;
+ break;
+ }
+
+ err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen);
+ if (err)
+ break;
+
+ if (opt)
+ set_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags);
+ else
+ clear_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags);
+ break;
+
+ case BT_VOICE:
+ if (sk->sk_state != BT_OPEN && sk->sk_state != BT_BOUND &&
+ sk->sk_state != BT_CONNECT2) {
+ err = -EINVAL;
+ break;
+ }
+
+ voice.setting = sco_pi(sk)->setting;
+
+ err = copy_safe_from_sockptr(&voice, sizeof(voice), optval,
+ optlen);
+ if (err)
+ break;
+
+ sco_pi(sk)->setting = voice.setting;
+ hdev = hci_get_route(&sco_pi(sk)->dst, &sco_pi(sk)->src,
+ BDADDR_BREDR);
+ if (!hdev) {
+ err = -EBADFD;
+ break;
+ }
+
+ switch (sco_pi(sk)->setting & SCO_AIRMODE_MASK) {
+ case SCO_AIRMODE_TRANSP:
+ if (enhanced_sync_conn_capable(hdev))
+ sco_pi(sk)->codec.id = BT_CODEC_TRANSPARENT;
+ break;
+ }
+
+ hci_dev_put(hdev);
+ break;
+
+ case BT_PKT_STATUS:
+ err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen);
+ if (err)
+ break;
+
+ if (opt)
+ set_bit(BT_SK_PKT_STATUS, &bt_sk(sk)->flags);
+ else
+ clear_bit(BT_SK_PKT_STATUS, &bt_sk(sk)->flags);
+ break;
+
+ case BT_CODEC:
+ if (sk->sk_state != BT_OPEN && sk->sk_state != BT_BOUND &&
+ sk->sk_state != BT_CONNECT2) {
+ err = -EINVAL;
+ break;
+ }
+
+ hdev = hci_get_route(&sco_pi(sk)->dst, &sco_pi(sk)->src,
+ BDADDR_BREDR);
+ if (!hdev) {
+ err = -EBADFD;
+ break;
+ }
+
+ if (!hci_dev_test_flag(hdev, HCI_OFFLOAD_CODECS_ENABLED)) {
+ hci_dev_put(hdev);
+ err = -EOPNOTSUPP;
+ break;
+ }
+
+ if (!hdev->get_data_path_id) {
+ hci_dev_put(hdev);
+ err = -EOPNOTSUPP;
+ break;
+ }
+
+ if (optlen < sizeof(struct bt_codecs) ||
+ optlen > sizeof(buffer)) {
+ hci_dev_put(hdev);
+ err = -EINVAL;
+ break;
+ }
+
+ err = copy_struct_from_sockptr(buffer, sizeof(buffer), optval,
+ optlen);
+ if (err) {
+ hci_dev_put(hdev);
+ break;
+ }
+
+ codecs = (void *)buffer;
+
+ if (codecs->num_codecs > 1) {
+ hci_dev_put(hdev);
+ err = -EINVAL;
+ break;
+ }
+
+ sco_pi(sk)->codec = codecs->codecs[0];
+ hci_dev_put(hdev);
+ break;
+
default:
err = -ENOPROTOOPT;
break;
@@ -672,12 +1043,14 @@ static int sco_sock_setsockopt(struct socket *sock, int level, int optname, char
return err;
}
-static int sco_sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen)
+static int sco_sock_getsockopt_old(struct socket *sock, int optname,
+ char __user *optval, int __user *optlen)
{
struct sock *sk = sock->sk;
struct sco_options opts;
struct sco_conninfo cinfo;
- int len, err = 0;
+ int err = 0;
+ size_t len;
BT_DBG("sk %p", sk);
@@ -688,31 +1061,36 @@ static int sco_sock_getsockopt(struct socket *sock, int level, int optname, char
switch (optname) {
case SCO_OPTIONS:
- if (sk->sk_state != BT_CONNECTED) {
+ if (sk->sk_state != BT_CONNECTED &&
+ !(sk->sk_state == BT_CONNECT2 &&
+ test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags))) {
err = -ENOTCONN;
break;
}
opts.mtu = sco_pi(sk)->conn->mtu;
- BT_DBG("mtu %d", opts.mtu);
+ BT_DBG("mtu %u", opts.mtu);
- len = min_t(unsigned int, len, sizeof(opts));
+ len = min(len, sizeof(opts));
if (copy_to_user(optval, (char *)&opts, len))
err = -EFAULT;
break;
case SCO_CONNINFO:
- if (sk->sk_state != BT_CONNECTED) {
+ if (sk->sk_state != BT_CONNECTED &&
+ !(sk->sk_state == BT_CONNECT2 &&
+ test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags))) {
err = -ENOTCONN;
break;
}
+ memset(&cinfo, 0, sizeof(cinfo));
cinfo.hci_handle = sco_pi(sk)->conn->hcon->handle;
memcpy(cinfo.dev_class, sco_pi(sk)->conn->hcon->dev_class, 3);
- len = min_t(unsigned int, len, sizeof(cinfo));
+ len = min(len, sizeof(cinfo));
if (copy_to_user(optval, (char *)&cinfo, len))
err = -EFAULT;
@@ -727,190 +1105,419 @@ static int sco_sock_getsockopt(struct socket *sock, int level, int optname, char
return err;
}
-static int sco_sock_release(struct socket *sock)
+static int sco_sock_getsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int __user *optlen)
{
struct sock *sk = sock->sk;
- int err = 0;
+ int len, err = 0;
+ struct bt_voice voice;
+ u32 phys;
+ int buf_len;
+ struct codec_list *c;
+ u8 num_codecs, i, __user *ptr;
+ struct hci_dev *hdev;
+ struct hci_codec_caps *caps;
+ struct bt_codec codec;
- BT_DBG("sock %p, sk %p", sock, sk);
+ BT_DBG("sk %p", sk);
- if (!sk)
- return 0;
+ if (level == SOL_SCO)
+ return sco_sock_getsockopt_old(sock, optname, optval, optlen);
- sco_sock_close(sk);
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ lock_sock(sk);
+
+ switch (optname) {
+
+ case BT_DEFER_SETUP:
+ if (sk->sk_state != BT_BOUND && sk->sk_state != BT_LISTEN) {
+ err = -EINVAL;
+ break;
+ }
+
+ if (put_user(test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags),
+ (u32 __user *)optval))
+ err = -EFAULT;
+
+ break;
+
+ case BT_VOICE:
+ voice.setting = sco_pi(sk)->setting;
+
+ len = min_t(unsigned int, len, sizeof(voice));
+ if (copy_to_user(optval, (char *)&voice, len))
+ err = -EFAULT;
+
+ break;
+
+ case BT_PHY:
+ if (sk->sk_state != BT_CONNECTED) {
+ err = -ENOTCONN;
+ break;
+ }
+
+ phys = hci_conn_get_phy(sco_pi(sk)->conn->hcon);
+
+ if (put_user(phys, (u32 __user *) optval))
+ err = -EFAULT;
+ break;
+
+ case BT_PKT_STATUS:
+ if (put_user(test_bit(BT_SK_PKT_STATUS, &bt_sk(sk)->flags),
+ (int __user *)optval))
+ err = -EFAULT;
+ break;
+
+ case BT_SNDMTU:
+ case BT_RCVMTU:
+ if (sk->sk_state != BT_CONNECTED) {
+ err = -ENOTCONN;
+ break;
+ }
+
+ if (put_user(sco_pi(sk)->conn->mtu, (u32 __user *)optval))
+ err = -EFAULT;
+ break;
+
+ case BT_CODEC:
+ num_codecs = 0;
+ buf_len = 0;
+
+ hdev = hci_get_route(&sco_pi(sk)->dst, &sco_pi(sk)->src, BDADDR_BREDR);
+ if (!hdev) {
+ err = -EBADFD;
+ break;
+ }
+
+ if (!hci_dev_test_flag(hdev, HCI_OFFLOAD_CODECS_ENABLED)) {
+ hci_dev_put(hdev);
+ err = -EOPNOTSUPP;
+ break;
+ }
+
+ if (!hdev->get_data_path_id) {
+ hci_dev_put(hdev);
+ err = -EOPNOTSUPP;
+ break;
+ }
- if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime) {
- lock_sock(sk);
- err = bt_sock_wait_state(sk, BT_CLOSED, sk->sk_lingertime);
release_sock(sk);
+
+ /* find total buffer size required to copy codec + caps */
+ hci_dev_lock(hdev);
+ list_for_each_entry(c, &hdev->local_codecs, list) {
+ if (c->transport != HCI_TRANSPORT_SCO_ESCO)
+ continue;
+ num_codecs++;
+ for (i = 0, caps = c->caps; i < c->num_caps; i++) {
+ buf_len += 1 + caps->len;
+ caps = (void *)&caps->data[caps->len];
+ }
+ buf_len += sizeof(struct bt_codec);
+ }
+ hci_dev_unlock(hdev);
+
+ buf_len += sizeof(struct bt_codecs);
+ if (buf_len > len) {
+ hci_dev_put(hdev);
+ return -ENOBUFS;
+ }
+ ptr = optval;
+
+ if (put_user(num_codecs, ptr)) {
+ hci_dev_put(hdev);
+ return -EFAULT;
+ }
+ ptr += sizeof(num_codecs);
+
+ /* Iterate all the codecs supported over SCO and populate
+ * codec data
+ */
+ hci_dev_lock(hdev);
+ list_for_each_entry(c, &hdev->local_codecs, list) {
+ if (c->transport != HCI_TRANSPORT_SCO_ESCO)
+ continue;
+
+ codec.id = c->id;
+ codec.cid = c->cid;
+ codec.vid = c->vid;
+ err = hdev->get_data_path_id(hdev, &codec.data_path);
+ if (err < 0)
+ break;
+ codec.num_caps = c->num_caps;
+ if (copy_to_user(ptr, &codec, sizeof(codec))) {
+ err = -EFAULT;
+ break;
+ }
+ ptr += sizeof(codec);
+
+ /* find codec capabilities data length */
+ len = 0;
+ for (i = 0, caps = c->caps; i < c->num_caps; i++) {
+ len += 1 + caps->len;
+ caps = (void *)&caps->data[caps->len];
+ }
+
+ /* copy codec capabilities data */
+ if (len && copy_to_user(ptr, c->caps, len)) {
+ err = -EFAULT;
+ break;
+ }
+ ptr += len;
+ }
+
+ hci_dev_unlock(hdev);
+ hci_dev_put(hdev);
+
+ lock_sock(sk);
+
+ if (!err && put_user(buf_len, optlen))
+ err = -EFAULT;
+
+ break;
+
+ default:
+ err = -ENOPROTOOPT;
+ break;
}
- sock_orphan(sk);
- sco_sock_kill(sk);
+ release_sock(sk);
return err;
}
-static void __sco_chan_add(struct sco_conn *conn, struct sock *sk, struct sock *parent)
+static int sco_sock_shutdown(struct socket *sock, int how)
{
- BT_DBG("conn %p", conn);
+ struct sock *sk = sock->sk;
+ int err = 0;
- sco_pi(sk)->conn = conn;
- conn->sk = sk;
+ BT_DBG("sock %p, sk %p", sock, sk);
- if (parent)
- bt_accept_enqueue(parent, sk);
+ if (!sk)
+ return 0;
+
+ sock_hold(sk);
+ lock_sock(sk);
+
+ if (!sk->sk_shutdown) {
+ sk->sk_shutdown = SHUTDOWN_MASK;
+ sco_sock_clear_timer(sk);
+ __sco_sock_close(sk);
+
+ if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime &&
+ !(current->flags & PF_EXITING))
+ err = bt_sock_wait_state(sk, BT_CLOSED,
+ sk->sk_lingertime);
+ }
+
+ release_sock(sk);
+ sock_put(sk);
+
+ return err;
}
-/* Delete channel.
- * Must be called on the locked socket. */
-static void sco_chan_del(struct sock *sk, int err)
+static int sco_sock_release(struct socket *sock)
{
- struct sco_conn *conn;
+ struct sock *sk = sock->sk;
+ int err = 0;
- conn = sco_pi(sk)->conn;
+ BT_DBG("sock %p, sk %p", sock, sk);
- BT_DBG("sk %p, conn %p, err %d", sk, conn, err);
+ if (!sk)
+ return 0;
- if (conn) {
- sco_conn_lock(conn);
- conn->sk = NULL;
- sco_pi(sk)->conn = NULL;
- sco_conn_unlock(conn);
- hci_conn_put(conn->hcon);
- }
+ sco_sock_close(sk);
- sk->sk_state = BT_CLOSED;
- sk->sk_err = err;
- sk->sk_state_change(sk);
+ if (sock_flag(sk, SOCK_LINGER) && READ_ONCE(sk->sk_lingertime) &&
+ !(current->flags & PF_EXITING)) {
+ lock_sock(sk);
+ err = bt_sock_wait_state(sk, BT_CLOSED, sk->sk_lingertime);
+ release_sock(sk);
+ }
- sock_set_flag(sk, SOCK_ZAPPED);
+ sock_orphan(sk);
+ sco_sock_kill(sk);
+ return err;
}
static void sco_conn_ready(struct sco_conn *conn)
{
- struct sock *parent, *sk;
+ struct sock *parent;
+ struct sock *sk = conn->sk;
BT_DBG("conn %p", conn);
- sco_conn_lock(conn);
-
- if ((sk = conn->sk)) {
+ if (sk) {
+ lock_sock(sk);
sco_sock_clear_timer(sk);
- bh_lock_sock(sk);
sk->sk_state = BT_CONNECTED;
sk->sk_state_change(sk);
- bh_unlock_sock(sk);
+ release_sock(sk);
} else {
- parent = sco_get_sock_listen(conn->src);
- if (!parent)
- goto done;
+ sco_conn_lock(conn);
- bh_lock_sock(parent);
+ if (!conn->hcon) {
+ sco_conn_unlock(conn);
+ return;
+ }
- sk = sco_sock_alloc(NULL, BTPROTO_SCO, GFP_ATOMIC);
+ parent = sco_get_sock_listen(&conn->hcon->src);
+ if (!parent) {
+ sco_conn_unlock(conn);
+ return;
+ }
+
+ lock_sock(parent);
+
+ sk = sco_sock_alloc(sock_net(parent), NULL,
+ BTPROTO_SCO, GFP_ATOMIC, 0);
if (!sk) {
- bh_unlock_sock(parent);
- goto done;
+ release_sock(parent);
+ sco_conn_unlock(conn);
+ return;
}
sco_sock_init(sk, parent);
- bacpy(&bt_sk(sk)->src, conn->src);
- bacpy(&bt_sk(sk)->dst, conn->dst);
+ bacpy(&sco_pi(sk)->src, &conn->hcon->src);
+ bacpy(&sco_pi(sk)->dst, &conn->hcon->dst);
+ sco_conn_hold(conn);
hci_conn_hold(conn->hcon);
__sco_chan_add(conn, sk, parent);
- sk->sk_state = BT_CONNECTED;
+ if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(parent)->flags))
+ sk->sk_state = BT_CONNECT2;
+ else
+ sk->sk_state = BT_CONNECTED;
/* Wake up parent */
- parent->sk_data_ready(parent, 1);
+ parent->sk_data_ready(parent);
- bh_unlock_sock(parent);
- }
+ release_sock(parent);
-done:
- sco_conn_unlock(conn);
+ sco_conn_unlock(conn);
+ }
}
/* ----- SCO interface with lower layer (HCI) ----- */
-static int sco_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 type)
+int sco_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags)
{
- BT_DBG("hdev %s, bdaddr %s", hdev->name, batostr(bdaddr));
+ struct sock *sk;
+ int lm = 0;
+
+ BT_DBG("hdev %s, bdaddr %pMR", hdev->name, bdaddr);
+
+ /* Find listening sockets */
+ read_lock(&sco_sk_list.lock);
+ sk_for_each(sk, &sco_sk_list.head) {
+ if (sk->sk_state != BT_LISTEN)
+ continue;
+
+ if (!bacmp(&sco_pi(sk)->src, &hdev->bdaddr) ||
+ !bacmp(&sco_pi(sk)->src, BDADDR_ANY)) {
+ lm |= HCI_LM_ACCEPT;
+
+ if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags))
+ *flags |= HCI_PROTO_DEFER;
+ break;
+ }
+ }
+ read_unlock(&sco_sk_list.lock);
- /* Always accept connection */
- return HCI_LM_ACCEPT;
+ return lm;
}
-static int sco_connect_cfm(struct hci_conn *hcon, __u8 status)
+static void sco_connect_cfm(struct hci_conn *hcon, __u8 status)
{
- BT_DBG("hcon %p bdaddr %s status %d", hcon, batostr(&hcon->dst), status);
+ if (hcon->type != SCO_LINK && hcon->type != ESCO_LINK)
+ return;
- if (hcon->type != SCO_LINK)
- return 0;
+ BT_DBG("hcon %p bdaddr %pMR status %u", hcon, &hcon->dst, status);
if (!status) {
struct sco_conn *conn;
- conn = sco_conn_add(hcon, status);
- if (conn)
+ conn = sco_conn_add(hcon);
+ if (conn) {
sco_conn_ready(conn);
- } else
- sco_conn_del(hcon, bt_err(status));
-
- return 0;
+ sco_conn_put(conn);
+ }
+ } else
+ sco_conn_del(hcon, bt_to_errno(status));
}
-static int sco_disconn_ind(struct hci_conn *hcon, __u8 reason)
+static void sco_disconn_cfm(struct hci_conn *hcon, __u8 reason)
{
- BT_DBG("hcon %p reason %d", hcon, reason);
+ if (hcon->type != SCO_LINK && hcon->type != ESCO_LINK)
+ return;
- if (hcon->type != SCO_LINK)
- return 0;
+ BT_DBG("hcon %p reason %d", hcon, reason);
- sco_conn_del(hcon, bt_err(reason));
- return 0;
+ sco_conn_del(hcon, bt_to_errno(reason));
}
-static int sco_recv_scodata(struct hci_conn *hcon, struct sk_buff *skb)
+int sco_recv_scodata(struct hci_dev *hdev, u16 handle, struct sk_buff *skb)
{
- struct sco_conn *conn = hcon->sco_data;
+ struct hci_conn *hcon;
+ struct sco_conn *conn;
- if (!conn)
- goto drop;
+ hci_dev_lock(hdev);
- BT_DBG("conn %p len %d", conn, skb->len);
+ hcon = hci_conn_hash_lookup_handle(hdev, handle);
+ if (!hcon) {
+ hci_dev_unlock(hdev);
+ kfree_skb(skb);
+ return -ENOENT;
+ }
- if (skb->len) {
- sco_recv_frame(conn, skb);
- return 0;
+ conn = sco_conn_hold_unless_zero(hcon->sco_data);
+ hcon = NULL;
+
+ hci_dev_unlock(hdev);
+
+ if (!conn) {
+ kfree_skb(skb);
+ return -EINVAL;
}
-drop:
- kfree_skb(skb);
+ BT_DBG("conn %p len %u", conn, skb->len);
+
+ if (skb->len)
+ sco_recv_frame(conn, skb);
+ else
+ kfree_skb(skb);
+
+ sco_conn_put(conn);
return 0;
}
-static ssize_t sco_sysfs_show(struct class *dev, char *buf)
+static struct hci_cb sco_cb = {
+ .name = "SCO",
+ .connect_cfm = sco_connect_cfm,
+ .disconn_cfm = sco_disconn_cfm,
+};
+
+static int sco_debugfs_show(struct seq_file *f, void *p)
{
struct sock *sk;
- struct hlist_node *node;
- char *str = buf;
- read_lock_bh(&sco_sk_list.lock);
+ read_lock(&sco_sk_list.lock);
- sk_for_each(sk, node, &sco_sk_list.head) {
- str += sprintf(str, "%s %s %d\n",
- batostr(&bt_sk(sk)->src), batostr(&bt_sk(sk)->dst),
- sk->sk_state);
+ sk_for_each(sk, &sco_sk_list.head) {
+ seq_printf(f, "%pMR %pMR %d\n", &sco_pi(sk)->src,
+ &sco_pi(sk)->dst, sk->sk_state);
}
- read_unlock_bh(&sco_sk_list.lock);
+ read_unlock(&sco_sk_list.lock);
- return (str - buf);
+ return 0;
}
-static CLASS_ATTR(sco, S_IRUGO, sco_sysfs_show, NULL);
+DEFINE_SHOW_ATTRIBUTE(sco_debugfs);
+
+static struct dentry *sco_debugfs;
static const struct proto_ops sco_sock_ops = {
.family = PF_BLUETOOTH,
@@ -922,35 +1529,29 @@ static const struct proto_ops sco_sock_ops = {
.accept = sco_sock_accept,
.getname = sco_sock_getname,
.sendmsg = sco_sock_sendmsg,
- .recvmsg = bt_sock_recvmsg,
+ .recvmsg = sco_sock_recvmsg,
.poll = bt_sock_poll,
- .ioctl = sock_no_ioctl,
+ .ioctl = bt_sock_ioctl,
+ .gettstamp = sock_gettstamp,
.mmap = sock_no_mmap,
.socketpair = sock_no_socketpair,
- .shutdown = sock_no_shutdown,
+ .shutdown = sco_sock_shutdown,
.setsockopt = sco_sock_setsockopt,
.getsockopt = sco_sock_getsockopt
};
-static struct net_proto_family sco_sock_family_ops = {
+static const struct net_proto_family sco_sock_family_ops = {
.family = PF_BLUETOOTH,
.owner = THIS_MODULE,
.create = sco_sock_create,
};
-static struct hci_proto sco_hci_proto = {
- .name = "SCO",
- .id = HCI_PROTO_SCO,
- .connect_ind = sco_connect_ind,
- .connect_cfm = sco_connect_cfm,
- .disconn_ind = sco_disconn_ind,
- .recv_scodata = sco_recv_scodata
-};
-
-static int __init sco_init(void)
+int __init sco_init(void)
{
int err;
+ BUILD_BUG_ON(sizeof(struct sockaddr_sco) > sizeof(struct sockaddr));
+
err = proto_register(&sco_proto, 0);
if (err < 0)
return err;
@@ -961,19 +1562,23 @@ static int __init sco_init(void)
goto error;
}
- err = hci_register_proto(&sco_hci_proto);
+ err = bt_procfs_init(&init_net, "sco", &sco_sk_list, NULL);
if (err < 0) {
- BT_ERR("SCO protocol registration failed");
+ BT_ERR("Failed to create SCO proc file");
bt_sock_unregister(BTPROTO_SCO);
goto error;
}
- if (class_create_file(bt_class, &class_attr_sco) < 0)
- BT_ERR("Failed to create SCO info file");
-
- BT_INFO("SCO (Voice Link) ver %s", VERSION);
BT_INFO("SCO socket layer initialized");
+ hci_register_cb(&sco_cb);
+
+ if (IS_ERR_OR_NULL(bt_debugfs))
+ return 0;
+
+ sco_debugfs = debugfs_create_file("sco", 0444, bt_debugfs,
+ NULL, &sco_debugfs_fops);
+
return 0;
error:
@@ -981,24 +1586,18 @@ error:
return err;
}
-static void __exit sco_exit(void)
+void sco_exit(void)
{
- class_remove_file(bt_class, &class_attr_sco);
+ bt_procfs_cleanup(&init_net, "sco");
- if (bt_sock_unregister(BTPROTO_SCO) < 0)
- BT_ERR("SCO socket unregistration failed");
+ debugfs_remove(sco_debugfs);
- if (hci_unregister_proto(&sco_hci_proto) < 0)
- BT_ERR("SCO protocol unregistration failed");
+ hci_unregister_cb(&sco_cb);
+
+ bt_sock_unregister(BTPROTO_SCO);
proto_unregister(&sco_proto);
}
-module_init(sco_init);
-module_exit(sco_exit);
-
-MODULE_AUTHOR("Maxim Krasnyansky <maxk@qualcomm.com>, Marcel Holtmann <marcel@holtmann.org>");
-MODULE_DESCRIPTION("Bluetooth SCO ver " VERSION);
-MODULE_VERSION(VERSION);
-MODULE_LICENSE("GPL");
-MODULE_ALIAS("bt-proto-2");
+module_param(disable_esco, bool, 0644);
+MODULE_PARM_DESC(disable_esco, "Disable eSCO connection creation");
diff --git a/net/bluetooth/selftest.c b/net/bluetooth/selftest.c
new file mode 100644
index 000000000000..f49604d44b87
--- /dev/null
+++ b/net/bluetooth/selftest.c
@@ -0,0 +1,309 @@
+/*
+ BlueZ - Bluetooth protocol stack for Linux
+
+ Copyright (C) 2014 Intel Corporation
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License version 2 as
+ published by the Free Software Foundation;
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+ IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+ CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+ ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+ COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+ SOFTWARE IS DISCLAIMED.
+*/
+
+#include <linux/debugfs.h>
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+
+#include "ecdh_helper.h"
+#include "smp.h"
+#include "selftest.h"
+
+#if IS_ENABLED(CONFIG_BT_SELFTEST_ECDH)
+
+static const u8 priv_a_1[32] __initconst = {
+ 0xbd, 0x1a, 0x3c, 0xcd, 0xa6, 0xb8, 0x99, 0x58,
+ 0x99, 0xb7, 0x40, 0xeb, 0x7b, 0x60, 0xff, 0x4a,
+ 0x50, 0x3f, 0x10, 0xd2, 0xe3, 0xb3, 0xc9, 0x74,
+ 0x38, 0x5f, 0xc5, 0xa3, 0xd4, 0xf6, 0x49, 0x3f,
+};
+static const u8 priv_b_1[32] __initconst = {
+ 0xfd, 0xc5, 0x7f, 0xf4, 0x49, 0xdd, 0x4f, 0x6b,
+ 0xfb, 0x7c, 0x9d, 0xf1, 0xc2, 0x9a, 0xcb, 0x59,
+ 0x2a, 0xe7, 0xd4, 0xee, 0xfb, 0xfc, 0x0a, 0x90,
+ 0x9a, 0xbb, 0xf6, 0x32, 0x3d, 0x8b, 0x18, 0x55,
+};
+static const u8 pub_a_1[64] __initconst = {
+ 0xe6, 0x9d, 0x35, 0x0e, 0x48, 0x01, 0x03, 0xcc,
+ 0xdb, 0xfd, 0xf4, 0xac, 0x11, 0x91, 0xf4, 0xef,
+ 0xb9, 0xa5, 0xf9, 0xe9, 0xa7, 0x83, 0x2c, 0x5e,
+ 0x2c, 0xbe, 0x97, 0xf2, 0xd2, 0x03, 0xb0, 0x20,
+
+ 0x8b, 0xd2, 0x89, 0x15, 0xd0, 0x8e, 0x1c, 0x74,
+ 0x24, 0x30, 0xed, 0x8f, 0xc2, 0x45, 0x63, 0x76,
+ 0x5c, 0x15, 0x52, 0x5a, 0xbf, 0x9a, 0x32, 0x63,
+ 0x6d, 0xeb, 0x2a, 0x65, 0x49, 0x9c, 0x80, 0xdc,
+};
+static const u8 pub_b_1[64] __initconst = {
+ 0x90, 0xa1, 0xaa, 0x2f, 0xb2, 0x77, 0x90, 0x55,
+ 0x9f, 0xa6, 0x15, 0x86, 0xfd, 0x8a, 0xb5, 0x47,
+ 0x00, 0x4c, 0x9e, 0xf1, 0x84, 0x22, 0x59, 0x09,
+ 0x96, 0x1d, 0xaf, 0x1f, 0xf0, 0xf0, 0xa1, 0x1e,
+
+ 0x4a, 0x21, 0xb1, 0x15, 0xf9, 0xaf, 0x89, 0x5f,
+ 0x76, 0x36, 0x8e, 0xe2, 0x30, 0x11, 0x2d, 0x47,
+ 0x60, 0x51, 0xb8, 0x9a, 0x3a, 0x70, 0x56, 0x73,
+ 0x37, 0xad, 0x9d, 0x42, 0x3e, 0xf3, 0x55, 0x4c,
+};
+static const u8 dhkey_1[32] __initconst = {
+ 0x98, 0xa6, 0xbf, 0x73, 0xf3, 0x34, 0x8d, 0x86,
+ 0xf1, 0x66, 0xf8, 0xb4, 0x13, 0x6b, 0x79, 0x99,
+ 0x9b, 0x7d, 0x39, 0x0a, 0xa6, 0x10, 0x10, 0x34,
+ 0x05, 0xad, 0xc8, 0x57, 0xa3, 0x34, 0x02, 0xec,
+};
+
+static const u8 priv_a_2[32] __initconst = {
+ 0x63, 0x76, 0x45, 0xd0, 0xf7, 0x73, 0xac, 0xb7,
+ 0xff, 0xdd, 0x03, 0x72, 0xb9, 0x72, 0x85, 0xb4,
+ 0x41, 0xb6, 0x5d, 0x0c, 0x5d, 0x54, 0x84, 0x60,
+ 0x1a, 0xa3, 0x9a, 0x3c, 0x69, 0x16, 0xa5, 0x06,
+};
+static const u8 priv_b_2[32] __initconst = {
+ 0xba, 0x30, 0x55, 0x50, 0x19, 0xa2, 0xca, 0xa3,
+ 0xa5, 0x29, 0x08, 0xc6, 0xb5, 0x03, 0x88, 0x7e,
+ 0x03, 0x2b, 0x50, 0x73, 0xd4, 0x2e, 0x50, 0x97,
+ 0x64, 0xcd, 0x72, 0x0d, 0x67, 0xa0, 0x9a, 0x52,
+};
+static const u8 pub_a_2[64] __initconst = {
+ 0xdd, 0x78, 0x5c, 0x74, 0x03, 0x9b, 0x7e, 0x98,
+ 0xcb, 0x94, 0x87, 0x4a, 0xad, 0xfa, 0xf8, 0xd5,
+ 0x43, 0x3e, 0x5c, 0xaf, 0xea, 0xb5, 0x4c, 0xf4,
+ 0x9e, 0x80, 0x79, 0x57, 0x7b, 0xa4, 0x31, 0x2c,
+
+ 0x4f, 0x5d, 0x71, 0x43, 0x77, 0x43, 0xf8, 0xea,
+ 0xd4, 0x3e, 0xbd, 0x17, 0x91, 0x10, 0x21, 0xd0,
+ 0x1f, 0x87, 0x43, 0x8e, 0x40, 0xe2, 0x52, 0xcd,
+ 0xbe, 0xdf, 0x98, 0x38, 0x18, 0x12, 0x95, 0x91,
+};
+static const u8 pub_b_2[64] __initconst = {
+ 0xcc, 0x00, 0x65, 0xe1, 0xf5, 0x6c, 0x0d, 0xcf,
+ 0xec, 0x96, 0x47, 0x20, 0x66, 0xc9, 0xdb, 0x84,
+ 0x81, 0x75, 0xa8, 0x4d, 0xc0, 0xdf, 0xc7, 0x9d,
+ 0x1b, 0x3f, 0x3d, 0xf2, 0x3f, 0xe4, 0x65, 0xf4,
+
+ 0x79, 0xb2, 0xec, 0xd8, 0xca, 0x55, 0xa1, 0xa8,
+ 0x43, 0x4d, 0x6b, 0xca, 0x10, 0xb0, 0xc2, 0x01,
+ 0xc2, 0x33, 0x4e, 0x16, 0x24, 0xc4, 0xef, 0xee,
+ 0x99, 0xd8, 0xbb, 0xbc, 0x48, 0xd0, 0x01, 0x02,
+};
+static const u8 dhkey_2[32] __initconst = {
+ 0x69, 0xeb, 0x21, 0x32, 0xf2, 0xc6, 0x05, 0x41,
+ 0x60, 0x19, 0xcd, 0x5e, 0x94, 0xe1, 0xe6, 0x5f,
+ 0x33, 0x07, 0xe3, 0x38, 0x4b, 0x68, 0xe5, 0x62,
+ 0x3f, 0x88, 0x6d, 0x2f, 0x3a, 0x84, 0x85, 0xab,
+};
+
+static const u8 priv_a_3[32] __initconst = {
+ 0xbd, 0x1a, 0x3c, 0xcd, 0xa6, 0xb8, 0x99, 0x58,
+ 0x99, 0xb7, 0x40, 0xeb, 0x7b, 0x60, 0xff, 0x4a,
+ 0x50, 0x3f, 0x10, 0xd2, 0xe3, 0xb3, 0xc9, 0x74,
+ 0x38, 0x5f, 0xc5, 0xa3, 0xd4, 0xf6, 0x49, 0x3f,
+};
+static const u8 pub_a_3[64] __initconst = {
+ 0xe6, 0x9d, 0x35, 0x0e, 0x48, 0x01, 0x03, 0xcc,
+ 0xdb, 0xfd, 0xf4, 0xac, 0x11, 0x91, 0xf4, 0xef,
+ 0xb9, 0xa5, 0xf9, 0xe9, 0xa7, 0x83, 0x2c, 0x5e,
+ 0x2c, 0xbe, 0x97, 0xf2, 0xd2, 0x03, 0xb0, 0x20,
+
+ 0x8b, 0xd2, 0x89, 0x15, 0xd0, 0x8e, 0x1c, 0x74,
+ 0x24, 0x30, 0xed, 0x8f, 0xc2, 0x45, 0x63, 0x76,
+ 0x5c, 0x15, 0x52, 0x5a, 0xbf, 0x9a, 0x32, 0x63,
+ 0x6d, 0xeb, 0x2a, 0x65, 0x49, 0x9c, 0x80, 0xdc,
+};
+static const u8 dhkey_3[32] __initconst = {
+ 0x2d, 0xab, 0x00, 0x48, 0xcb, 0xb3, 0x7b, 0xda,
+ 0x55, 0x7b, 0x8b, 0x72, 0xa8, 0x57, 0x87, 0xc3,
+ 0x87, 0x27, 0x99, 0x32, 0xfc, 0x79, 0x5f, 0xae,
+ 0x7c, 0x1c, 0xf9, 0x49, 0xe6, 0xd7, 0xaa, 0x70,
+};
+
+static int __init test_ecdh_sample(struct crypto_kpp *tfm, const u8 priv_a[32],
+ const u8 priv_b[32], const u8 pub_a[64],
+ const u8 pub_b[64], const u8 dhkey[32])
+{
+ u8 *tmp, *dhkey_a, *dhkey_b;
+ int ret;
+
+ tmp = kmalloc(64, GFP_KERNEL);
+ if (!tmp)
+ return -EINVAL;
+
+ dhkey_a = &tmp[0];
+ dhkey_b = &tmp[32];
+
+ ret = set_ecdh_privkey(tfm, priv_a);
+ if (ret)
+ goto out;
+
+ ret = compute_ecdh_secret(tfm, pub_b, dhkey_a);
+ if (ret)
+ goto out;
+
+ if (memcmp(dhkey_a, dhkey, 32)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = set_ecdh_privkey(tfm, priv_b);
+ if (ret)
+ goto out;
+
+ ret = compute_ecdh_secret(tfm, pub_a, dhkey_b);
+ if (ret)
+ goto out;
+
+ if (memcmp(dhkey_b, dhkey, 32))
+ ret = -EINVAL;
+ /* fall through*/
+out:
+ kfree(tmp);
+ return ret;
+}
+
+static char test_ecdh_buffer[32];
+
+static ssize_t test_ecdh_read(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ return simple_read_from_buffer(user_buf, count, ppos, test_ecdh_buffer,
+ strlen(test_ecdh_buffer));
+}
+
+static const struct file_operations test_ecdh_fops = {
+ .open = simple_open,
+ .read = test_ecdh_read,
+ .llseek = default_llseek,
+};
+
+static int __init test_ecdh(void)
+{
+ struct crypto_kpp *tfm;
+ ktime_t calltime, delta, rettime;
+ unsigned long long duration = 0;
+ int err;
+
+ calltime = ktime_get();
+
+ tfm = crypto_alloc_kpp("ecdh-nist-p256", 0, 0);
+ if (IS_ERR(tfm)) {
+ BT_ERR("Unable to create ECDH crypto context");
+ err = PTR_ERR(tfm);
+ goto done;
+ }
+
+ err = test_ecdh_sample(tfm, priv_a_1, priv_b_1, pub_a_1, pub_b_1,
+ dhkey_1);
+ if (err) {
+ BT_ERR("ECDH sample 1 failed");
+ goto done;
+ }
+
+ err = test_ecdh_sample(tfm, priv_a_2, priv_b_2, pub_a_2, pub_b_2,
+ dhkey_2);
+ if (err) {
+ BT_ERR("ECDH sample 2 failed");
+ goto done;
+ }
+
+ err = test_ecdh_sample(tfm, priv_a_3, priv_a_3, pub_a_3, pub_a_3,
+ dhkey_3);
+ if (err) {
+ BT_ERR("ECDH sample 3 failed");
+ goto done;
+ }
+
+ crypto_free_kpp(tfm);
+
+ rettime = ktime_get();
+ delta = ktime_sub(rettime, calltime);
+ duration = (unsigned long long) ktime_to_ns(delta) >> 10;
+
+ BT_INFO("ECDH test passed in %llu usecs", duration);
+
+done:
+ if (!err)
+ snprintf(test_ecdh_buffer, sizeof(test_ecdh_buffer),
+ "PASS (%llu usecs)\n", duration);
+ else
+ snprintf(test_ecdh_buffer, sizeof(test_ecdh_buffer), "FAIL\n");
+
+ debugfs_create_file("selftest_ecdh", 0444, bt_debugfs, NULL,
+ &test_ecdh_fops);
+
+ return err;
+}
+
+#else
+
+static inline int test_ecdh(void)
+{
+ return 0;
+}
+
+#endif
+
+static int __init run_selftest(void)
+{
+ int err;
+
+ BT_INFO("Starting self testing");
+
+ err = test_ecdh();
+ if (err)
+ goto done;
+
+ err = bt_selftest_smp();
+
+done:
+ BT_INFO("Finished self testing");
+
+ return err;
+}
+
+#if IS_MODULE(CONFIG_BT)
+
+/* This is run when CONFIG_BT_SELFTEST=y and CONFIG_BT=m and is just a
+ * wrapper to allow running this at module init.
+ *
+ * If CONFIG_BT_SELFTEST=n, then this code is not compiled at all.
+ */
+int __init bt_selftest(void)
+{
+ return run_selftest();
+}
+
+#else
+
+/* This is run when CONFIG_BT_SELFTEST=y and CONFIG_BT=y and is run
+ * via late_initcall() as last item in the initialization sequence.
+ *
+ * If CONFIG_BT_SELFTEST=n, then this code is not compiled at all.
+ */
+static int __init bt_selftest_init(void)
+{
+ return run_selftest();
+}
+late_initcall(bt_selftest_init);
+
+#endif
diff --git a/net/bluetooth/selftest.h b/net/bluetooth/selftest.h
new file mode 100644
index 000000000000..2aa0a346a913
--- /dev/null
+++ b/net/bluetooth/selftest.h
@@ -0,0 +1,45 @@
+/*
+ BlueZ - Bluetooth protocol stack for Linux
+ Copyright (C) 2014 Intel Corporation
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License version 2 as
+ published by the Free Software Foundation;
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+ IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+ CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+ ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+ COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+ SOFTWARE IS DISCLAIMED.
+*/
+
+#if IS_ENABLED(CONFIG_BT_SELFTEST) && IS_MODULE(CONFIG_BT)
+
+/* When CONFIG_BT_SELFTEST=y and the CONFIG_BT=m, then the self testing
+ * is run at module loading time.
+ */
+int bt_selftest(void);
+
+#else
+
+/* When CONFIG_BT_SELFTEST=y and CONFIG_BT=y, then the self testing
+ * is run via late_initcall() to make sure that subsys_initcall() of
+ * the Bluetooth subsystem and device_initcall() of the Crypto subsystem
+ * do not clash.
+ *
+ * When CONFIG_BT_SELFTEST=n, then this turns into an empty call that
+ * has no impact.
+ */
+static inline int bt_selftest(void)
+{
+ return 0;
+}
+
+#endif
diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c
new file mode 100644
index 000000000000..3a1ce04a7a53
--- /dev/null
+++ b/net/bluetooth/smp.c
@@ -0,0 +1,3847 @@
+/*
+ BlueZ - Bluetooth protocol stack for Linux
+ Copyright (C) 2011 Nokia Corporation and/or its subsidiary(-ies).
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License version 2 as
+ published by the Free Software Foundation;
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+ IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+ CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+ ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+ COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+ SOFTWARE IS DISCLAIMED.
+*/
+
+#include <linux/debugfs.h>
+#include <linux/scatterlist.h>
+#include <crypto/aes.h>
+#include <crypto/hash.h>
+#include <crypto/kpp.h>
+#include <crypto/utils.h>
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+#include <net/bluetooth/l2cap.h>
+#include <net/bluetooth/mgmt.h>
+
+#include "ecdh_helper.h"
+#include "smp.h"
+
+#define SMP_DEV(hdev) \
+ ((struct smp_dev *)((struct l2cap_chan *)((hdev)->smp_data))->data)
+
+/* Low-level debug macros to be used for stuff that we don't want
+ * accidentally in dmesg, i.e. the values of the various crypto keys
+ * and the inputs & outputs of crypto functions.
+ */
+#ifdef DEBUG
+#define SMP_DBG(fmt, ...) printk(KERN_DEBUG "%s: " fmt, __func__, \
+ ##__VA_ARGS__)
+#else
+#define SMP_DBG(fmt, ...) no_printk(KERN_DEBUG "%s: " fmt, __func__, \
+ ##__VA_ARGS__)
+#endif
+
+#define SMP_ALLOW_CMD(smp, code) set_bit(code, &smp->allow_cmd)
+
+/* Keys which are not distributed with Secure Connections */
+#define SMP_SC_NO_DIST (SMP_DIST_ENC_KEY | SMP_DIST_LINK_KEY)
+
+#define SMP_TIMEOUT secs_to_jiffies(30)
+
+#define ID_ADDR_TIMEOUT msecs_to_jiffies(200)
+
+#define AUTH_REQ_MASK(dev) (hci_dev_test_flag(dev, HCI_SC_ENABLED) ? \
+ 0x3f : 0x07)
+#define KEY_DIST_MASK 0x07
+
+/* Maximum message length that can be passed to aes_cmac */
+#define CMAC_MSG_MAX 80
+
+enum {
+ SMP_FLAG_TK_VALID,
+ SMP_FLAG_CFM_PENDING,
+ SMP_FLAG_MITM_AUTH,
+ SMP_FLAG_COMPLETE,
+ SMP_FLAG_INITIATOR,
+ SMP_FLAG_SC,
+ SMP_FLAG_REMOTE_PK,
+ SMP_FLAG_DEBUG_KEY,
+ SMP_FLAG_WAIT_USER,
+ SMP_FLAG_DHKEY_PENDING,
+ SMP_FLAG_REMOTE_OOB,
+ SMP_FLAG_LOCAL_OOB,
+ SMP_FLAG_CT2,
+};
+
+struct smp_dev {
+ /* Secure Connections OOB data */
+ bool local_oob;
+ u8 local_pk[64];
+ u8 local_rand[16];
+ bool debug_key;
+
+ struct crypto_shash *tfm_cmac;
+ struct crypto_kpp *tfm_ecdh;
+};
+
+struct smp_chan {
+ struct l2cap_conn *conn;
+ struct delayed_work security_timer;
+ unsigned long allow_cmd; /* Bitmask of allowed commands */
+
+ u8 preq[7]; /* SMP Pairing Request */
+ u8 prsp[7]; /* SMP Pairing Response */
+ u8 prnd[16]; /* SMP Pairing Random (local) */
+ u8 rrnd[16]; /* SMP Pairing Random (remote) */
+ u8 pcnf[16]; /* SMP Pairing Confirm */
+ u8 tk[16]; /* SMP Temporary Key */
+ u8 rr[16]; /* Remote OOB ra/rb value */
+ u8 lr[16]; /* Local OOB ra/rb value */
+ u8 enc_key_size;
+ u8 remote_key_dist;
+ bdaddr_t id_addr;
+ u8 id_addr_type;
+ u8 irk[16];
+ struct smp_csrk *csrk;
+ struct smp_csrk *responder_csrk;
+ struct smp_ltk *ltk;
+ struct smp_ltk *responder_ltk;
+ struct smp_irk *remote_irk;
+ u8 *link_key;
+ unsigned long flags;
+ u8 method;
+ u8 passkey_round;
+
+ /* Secure Connections variables */
+ u8 local_pk[64];
+ u8 remote_pk[64];
+ u8 dhkey[32];
+ u8 mackey[16];
+
+ struct crypto_shash *tfm_cmac;
+ struct crypto_kpp *tfm_ecdh;
+};
+
+/* These debug key values are defined in the SMP section of the core
+ * specification. debug_pk is the public debug key and debug_sk the
+ * private debug key.
+ */
+static const u8 debug_pk[64] = {
+ 0xe6, 0x9d, 0x35, 0x0e, 0x48, 0x01, 0x03, 0xcc,
+ 0xdb, 0xfd, 0xf4, 0xac, 0x11, 0x91, 0xf4, 0xef,
+ 0xb9, 0xa5, 0xf9, 0xe9, 0xa7, 0x83, 0x2c, 0x5e,
+ 0x2c, 0xbe, 0x97, 0xf2, 0xd2, 0x03, 0xb0, 0x20,
+
+ 0x8b, 0xd2, 0x89, 0x15, 0xd0, 0x8e, 0x1c, 0x74,
+ 0x24, 0x30, 0xed, 0x8f, 0xc2, 0x45, 0x63, 0x76,
+ 0x5c, 0x15, 0x52, 0x5a, 0xbf, 0x9a, 0x32, 0x63,
+ 0x6d, 0xeb, 0x2a, 0x65, 0x49, 0x9c, 0x80, 0xdc,
+};
+
+static const u8 debug_sk[32] = {
+ 0xbd, 0x1a, 0x3c, 0xcd, 0xa6, 0xb8, 0x99, 0x58,
+ 0x99, 0xb7, 0x40, 0xeb, 0x7b, 0x60, 0xff, 0x4a,
+ 0x50, 0x3f, 0x10, 0xd2, 0xe3, 0xb3, 0xc9, 0x74,
+ 0x38, 0x5f, 0xc5, 0xa3, 0xd4, 0xf6, 0x49, 0x3f,
+};
+
+static inline void swap_buf(const u8 *src, u8 *dst, size_t len)
+{
+ size_t i;
+
+ for (i = 0; i < len; i++)
+ dst[len - 1 - i] = src[i];
+}
+
+/* The following functions map to the LE SC SMP crypto functions
+ * AES-CMAC, f4, f5, f6, g2 and h6.
+ */
+
+static int aes_cmac(struct crypto_shash *tfm, const u8 k[16], const u8 *m,
+ size_t len, u8 mac[16])
+{
+ uint8_t tmp[16], mac_msb[16], msg_msb[CMAC_MSG_MAX];
+ int err;
+
+ if (len > CMAC_MSG_MAX)
+ return -EFBIG;
+
+ if (!tfm) {
+ BT_ERR("tfm %p", tfm);
+ return -EINVAL;
+ }
+
+ /* Swap key and message from LSB to MSB */
+ swap_buf(k, tmp, 16);
+ swap_buf(m, msg_msb, len);
+
+ SMP_DBG("msg (len %zu) %*phN", len, (int) len, m);
+ SMP_DBG("key %16phN", k);
+
+ err = crypto_shash_setkey(tfm, tmp, 16);
+ if (err) {
+ BT_ERR("cipher setkey failed: %d", err);
+ return err;
+ }
+
+ err = crypto_shash_tfm_digest(tfm, msg_msb, len, mac_msb);
+ if (err) {
+ BT_ERR("Hash computation error %d", err);
+ return err;
+ }
+
+ swap_buf(mac_msb, mac, 16);
+
+ SMP_DBG("mac %16phN", mac);
+
+ return 0;
+}
+
+static int smp_f4(struct crypto_shash *tfm_cmac, const u8 u[32],
+ const u8 v[32], const u8 x[16], u8 z, u8 res[16])
+{
+ u8 m[65];
+ int err;
+
+ SMP_DBG("u %32phN", u);
+ SMP_DBG("v %32phN", v);
+ SMP_DBG("x %16phN z %02x", x, z);
+
+ m[0] = z;
+ memcpy(m + 1, v, 32);
+ memcpy(m + 33, u, 32);
+
+ err = aes_cmac(tfm_cmac, x, m, sizeof(m), res);
+ if (err)
+ return err;
+
+ SMP_DBG("res %16phN", res);
+
+ return err;
+}
+
+static int smp_f5(struct crypto_shash *tfm_cmac, const u8 w[32],
+ const u8 n1[16], const u8 n2[16], const u8 a1[7],
+ const u8 a2[7], u8 mackey[16], u8 ltk[16])
+{
+ /* The btle, salt and length "magic" values are as defined in
+ * the SMP section of the Bluetooth core specification. In ASCII
+ * the btle value ends up being 'btle'. The salt is just a
+ * random number whereas length is the value 256 in little
+ * endian format.
+ */
+ const u8 btle[4] = { 0x65, 0x6c, 0x74, 0x62 };
+ const u8 salt[16] = { 0xbe, 0x83, 0x60, 0x5a, 0xdb, 0x0b, 0x37, 0x60,
+ 0x38, 0xa5, 0xf5, 0xaa, 0x91, 0x83, 0x88, 0x6c };
+ const u8 length[2] = { 0x00, 0x01 };
+ u8 m[53], t[16];
+ int err;
+
+ SMP_DBG("w %32phN", w);
+ SMP_DBG("n1 %16phN n2 %16phN", n1, n2);
+ SMP_DBG("a1 %7phN a2 %7phN", a1, a2);
+
+ err = aes_cmac(tfm_cmac, salt, w, 32, t);
+ if (err)
+ return err;
+
+ SMP_DBG("t %16phN", t);
+
+ memcpy(m, length, 2);
+ memcpy(m + 2, a2, 7);
+ memcpy(m + 9, a1, 7);
+ memcpy(m + 16, n2, 16);
+ memcpy(m + 32, n1, 16);
+ memcpy(m + 48, btle, 4);
+
+ m[52] = 0; /* Counter */
+
+ err = aes_cmac(tfm_cmac, t, m, sizeof(m), mackey);
+ if (err)
+ return err;
+
+ SMP_DBG("mackey %16phN", mackey);
+
+ m[52] = 1; /* Counter */
+
+ err = aes_cmac(tfm_cmac, t, m, sizeof(m), ltk);
+ if (err)
+ return err;
+
+ SMP_DBG("ltk %16phN", ltk);
+
+ return 0;
+}
+
+static int smp_f6(struct crypto_shash *tfm_cmac, const u8 w[16],
+ const u8 n1[16], const u8 n2[16], const u8 r[16],
+ const u8 io_cap[3], const u8 a1[7], const u8 a2[7],
+ u8 res[16])
+{
+ u8 m[65];
+ int err;
+
+ SMP_DBG("w %16phN", w);
+ SMP_DBG("n1 %16phN n2 %16phN", n1, n2);
+ SMP_DBG("r %16phN io_cap %3phN a1 %7phN a2 %7phN", r, io_cap, a1, a2);
+
+ memcpy(m, a2, 7);
+ memcpy(m + 7, a1, 7);
+ memcpy(m + 14, io_cap, 3);
+ memcpy(m + 17, r, 16);
+ memcpy(m + 33, n2, 16);
+ memcpy(m + 49, n1, 16);
+
+ err = aes_cmac(tfm_cmac, w, m, sizeof(m), res);
+ if (err)
+ return err;
+
+ SMP_DBG("res %16phN", res);
+
+ return err;
+}
+
+static int smp_g2(struct crypto_shash *tfm_cmac, const u8 u[32], const u8 v[32],
+ const u8 x[16], const u8 y[16], u32 *val)
+{
+ u8 m[80], tmp[16];
+ int err;
+
+ SMP_DBG("u %32phN", u);
+ SMP_DBG("v %32phN", v);
+ SMP_DBG("x %16phN y %16phN", x, y);
+
+ memcpy(m, y, 16);
+ memcpy(m + 16, v, 32);
+ memcpy(m + 48, u, 32);
+
+ err = aes_cmac(tfm_cmac, x, m, sizeof(m), tmp);
+ if (err)
+ return err;
+
+ *val = get_unaligned_le32(tmp);
+ *val %= 1000000;
+
+ SMP_DBG("val %06u", *val);
+
+ return 0;
+}
+
+static int smp_h6(struct crypto_shash *tfm_cmac, const u8 w[16],
+ const u8 key_id[4], u8 res[16])
+{
+ int err;
+
+ SMP_DBG("w %16phN key_id %4phN", w, key_id);
+
+ err = aes_cmac(tfm_cmac, w, key_id, 4, res);
+ if (err)
+ return err;
+
+ SMP_DBG("res %16phN", res);
+
+ return err;
+}
+
+static int smp_h7(struct crypto_shash *tfm_cmac, const u8 w[16],
+ const u8 salt[16], u8 res[16])
+{
+ int err;
+
+ SMP_DBG("w %16phN salt %16phN", w, salt);
+
+ err = aes_cmac(tfm_cmac, salt, w, 16, res);
+ if (err)
+ return err;
+
+ SMP_DBG("res %16phN", res);
+
+ return err;
+}
+
+/* The following functions map to the legacy SMP crypto functions e, c1,
+ * s1 and ah.
+ */
+
+static int smp_e(const u8 *k, u8 *r)
+{
+ struct crypto_aes_ctx ctx;
+ uint8_t tmp[16], data[16];
+ int err;
+
+ SMP_DBG("k %16phN r %16phN", k, r);
+
+ /* The most significant octet of key corresponds to k[0] */
+ swap_buf(k, tmp, 16);
+
+ err = aes_expandkey(&ctx, tmp, 16);
+ if (err) {
+ BT_ERR("cipher setkey failed: %d", err);
+ return err;
+ }
+
+ /* Most significant octet of plaintextData corresponds to data[0] */
+ swap_buf(r, data, 16);
+
+ aes_encrypt(&ctx, data, data);
+
+ /* Most significant octet of encryptedData corresponds to data[0] */
+ swap_buf(data, r, 16);
+
+ SMP_DBG("r %16phN", r);
+
+ memzero_explicit(&ctx, sizeof(ctx));
+ return err;
+}
+
+static int smp_c1(const u8 k[16],
+ const u8 r[16], const u8 preq[7], const u8 pres[7], u8 _iat,
+ const bdaddr_t *ia, u8 _rat, const bdaddr_t *ra, u8 res[16])
+{
+ u8 p1[16], p2[16];
+ int err;
+
+ SMP_DBG("k %16phN r %16phN", k, r);
+ SMP_DBG("iat %u ia %6phN rat %u ra %6phN", _iat, ia, _rat, ra);
+ SMP_DBG("preq %7phN pres %7phN", preq, pres);
+
+ memset(p1, 0, 16);
+
+ /* p1 = pres || preq || _rat || _iat */
+ p1[0] = _iat;
+ p1[1] = _rat;
+ memcpy(p1 + 2, preq, 7);
+ memcpy(p1 + 9, pres, 7);
+
+ SMP_DBG("p1 %16phN", p1);
+
+ /* res = r XOR p1 */
+ crypto_xor_cpy(res, r, p1, sizeof(p1));
+
+ /* res = e(k, res) */
+ err = smp_e(k, res);
+ if (err) {
+ BT_ERR("Encrypt data error");
+ return err;
+ }
+
+ /* p2 = padding || ia || ra */
+ memcpy(p2, ra, 6);
+ memcpy(p2 + 6, ia, 6);
+ memset(p2 + 12, 0, 4);
+
+ SMP_DBG("p2 %16phN", p2);
+
+ /* res = res XOR p2 */
+ crypto_xor(res, p2, sizeof(p2));
+
+ /* res = e(k, res) */
+ err = smp_e(k, res);
+ if (err)
+ BT_ERR("Encrypt data error");
+
+ return err;
+}
+
+static int smp_s1(const u8 k[16],
+ const u8 r1[16], const u8 r2[16], u8 _r[16])
+{
+ int err;
+
+ /* Just least significant octets from r1 and r2 are considered */
+ memcpy(_r, r2, 8);
+ memcpy(_r + 8, r1, 8);
+
+ err = smp_e(k, _r);
+ if (err)
+ BT_ERR("Encrypt data error");
+
+ return err;
+}
+
+static int smp_ah(const u8 irk[16], const u8 r[3], u8 res[3])
+{
+ u8 _res[16];
+ int err;
+
+ /* r' = padding || r */
+ memcpy(_res, r, 3);
+ memset(_res + 3, 0, 13);
+
+ err = smp_e(irk, _res);
+ if (err) {
+ BT_ERR("Encrypt error");
+ return err;
+ }
+
+ /* The output of the random address function ah is:
+ * ah(k, r) = e(k, r') mod 2^24
+ * The output of the security function e is then truncated to 24 bits
+ * by taking the least significant 24 bits of the output of e as the
+ * result of ah.
+ */
+ memcpy(res, _res, 3);
+
+ return 0;
+}
+
+bool smp_irk_matches(struct hci_dev *hdev, const u8 irk[16],
+ const bdaddr_t *bdaddr)
+{
+ struct l2cap_chan *chan = hdev->smp_data;
+ u8 hash[3];
+ int err;
+
+ if (!chan || !chan->data)
+ return false;
+
+ bt_dev_dbg(hdev, "RPA %pMR IRK %*phN", bdaddr, 16, irk);
+
+ err = smp_ah(irk, &bdaddr->b[3], hash);
+ if (err)
+ return false;
+
+ return !crypto_memneq(bdaddr->b, hash, 3);
+}
+
+int smp_generate_rpa(struct hci_dev *hdev, const u8 irk[16], bdaddr_t *rpa)
+{
+ struct l2cap_chan *chan = hdev->smp_data;
+ int err;
+
+ if (!chan || !chan->data)
+ return -EOPNOTSUPP;
+
+ get_random_bytes(&rpa->b[3], 3);
+
+ rpa->b[5] &= 0x3f; /* Clear two most significant bits */
+ rpa->b[5] |= 0x40; /* Set second most significant bit */
+
+ err = smp_ah(irk, &rpa->b[3], rpa->b);
+ if (err < 0)
+ return err;
+
+ bt_dev_dbg(hdev, "RPA %pMR", rpa);
+
+ return 0;
+}
+
+int smp_generate_oob(struct hci_dev *hdev, u8 hash[16], u8 rand[16])
+{
+ struct l2cap_chan *chan = hdev->smp_data;
+ struct smp_dev *smp;
+ int err;
+
+ if (!chan || !chan->data)
+ return -EOPNOTSUPP;
+
+ smp = chan->data;
+
+ if (hci_dev_test_flag(hdev, HCI_USE_DEBUG_KEYS)) {
+ bt_dev_dbg(hdev, "Using debug keys");
+ err = set_ecdh_privkey(smp->tfm_ecdh, debug_sk);
+ if (err)
+ return err;
+ memcpy(smp->local_pk, debug_pk, 64);
+ smp->debug_key = true;
+ } else {
+ while (true) {
+ /* Generate key pair for Secure Connections */
+ err = generate_ecdh_keys(smp->tfm_ecdh, smp->local_pk);
+ if (err)
+ return err;
+
+ /* This is unlikely, but we need to check that
+ * we didn't accidentally generate a debug key.
+ */
+ if (crypto_memneq(smp->local_pk, debug_pk, 64))
+ break;
+ }
+ smp->debug_key = false;
+ }
+
+ SMP_DBG("OOB Public Key X: %32phN", smp->local_pk);
+ SMP_DBG("OOB Public Key Y: %32phN", smp->local_pk + 32);
+
+ get_random_bytes(smp->local_rand, 16);
+
+ err = smp_f4(smp->tfm_cmac, smp->local_pk, smp->local_pk,
+ smp->local_rand, 0, hash);
+ if (err < 0)
+ return err;
+
+ memcpy(rand, smp->local_rand, 16);
+
+ smp->local_oob = true;
+
+ return 0;
+}
+
+static void smp_send_cmd(struct l2cap_conn *conn, u8 code, u16 len, void *data)
+{
+ struct l2cap_chan *chan = conn->smp;
+ struct smp_chan *smp;
+ struct kvec iv[2];
+ struct msghdr msg;
+
+ if (!chan)
+ return;
+
+ bt_dev_dbg(conn->hcon->hdev, "code 0x%2.2x", code);
+
+ iv[0].iov_base = &code;
+ iv[0].iov_len = 1;
+
+ iv[1].iov_base = data;
+ iv[1].iov_len = len;
+
+ memset(&msg, 0, sizeof(msg));
+
+ iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, iv, 2, 1 + len);
+
+ l2cap_chan_send(chan, &msg, 1 + len, NULL);
+
+ if (!chan->data)
+ return;
+
+ smp = chan->data;
+
+ cancel_delayed_work_sync(&smp->security_timer);
+ schedule_delayed_work(&smp->security_timer, SMP_TIMEOUT);
+}
+
+static u8 authreq_to_seclevel(u8 authreq)
+{
+ if (authreq & SMP_AUTH_MITM) {
+ if (authreq & SMP_AUTH_SC)
+ return BT_SECURITY_FIPS;
+ else
+ return BT_SECURITY_HIGH;
+ } else {
+ return BT_SECURITY_MEDIUM;
+ }
+}
+
+static __u8 seclevel_to_authreq(__u8 sec_level)
+{
+ switch (sec_level) {
+ case BT_SECURITY_FIPS:
+ case BT_SECURITY_HIGH:
+ return SMP_AUTH_MITM | SMP_AUTH_BONDING;
+ case BT_SECURITY_MEDIUM:
+ return SMP_AUTH_BONDING;
+ default:
+ return SMP_AUTH_NONE;
+ }
+}
+
+static void build_pairing_cmd(struct l2cap_conn *conn,
+ struct smp_cmd_pairing *req,
+ struct smp_cmd_pairing *rsp, __u8 authreq)
+{
+ struct l2cap_chan *chan = conn->smp;
+ struct smp_chan *smp = chan->data;
+ struct hci_conn *hcon = conn->hcon;
+ struct hci_dev *hdev = hcon->hdev;
+ u8 local_dist = 0, remote_dist = 0, oob_flag = SMP_OOB_NOT_PRESENT;
+
+ if (hci_dev_test_flag(hdev, HCI_BONDABLE)) {
+ local_dist = SMP_DIST_ENC_KEY | SMP_DIST_SIGN;
+ remote_dist = SMP_DIST_ENC_KEY | SMP_DIST_SIGN;
+ authreq |= SMP_AUTH_BONDING;
+ } else {
+ authreq &= ~SMP_AUTH_BONDING;
+ }
+
+ if (hci_dev_test_flag(hdev, HCI_RPA_RESOLVING))
+ remote_dist |= SMP_DIST_ID_KEY;
+
+ if (hci_dev_test_flag(hdev, HCI_PRIVACY))
+ local_dist |= SMP_DIST_ID_KEY;
+
+ if (hci_dev_test_flag(hdev, HCI_SC_ENABLED) &&
+ (authreq & SMP_AUTH_SC)) {
+ struct oob_data *oob_data;
+ u8 bdaddr_type;
+
+ if (hci_dev_test_flag(hdev, HCI_SSP_ENABLED)) {
+ local_dist |= SMP_DIST_LINK_KEY;
+ remote_dist |= SMP_DIST_LINK_KEY;
+ }
+
+ if (hcon->dst_type == ADDR_LE_DEV_PUBLIC)
+ bdaddr_type = BDADDR_LE_PUBLIC;
+ else
+ bdaddr_type = BDADDR_LE_RANDOM;
+
+ oob_data = hci_find_remote_oob_data(hdev, &hcon->dst,
+ bdaddr_type);
+ if (oob_data && oob_data->present) {
+ set_bit(SMP_FLAG_REMOTE_OOB, &smp->flags);
+ oob_flag = SMP_OOB_PRESENT;
+ memcpy(smp->rr, oob_data->rand256, 16);
+ memcpy(smp->pcnf, oob_data->hash256, 16);
+ SMP_DBG("OOB Remote Confirmation: %16phN", smp->pcnf);
+ SMP_DBG("OOB Remote Random: %16phN", smp->rr);
+ }
+
+ } else {
+ authreq &= ~SMP_AUTH_SC;
+ }
+
+ if (rsp == NULL) {
+ req->io_capability = conn->hcon->io_capability;
+ req->oob_flag = oob_flag;
+ req->max_key_size = hdev->le_max_key_size;
+ req->init_key_dist = local_dist;
+ req->resp_key_dist = remote_dist;
+ req->auth_req = (authreq & AUTH_REQ_MASK(hdev));
+
+ smp->remote_key_dist = remote_dist;
+ return;
+ }
+
+ rsp->io_capability = conn->hcon->io_capability;
+ rsp->oob_flag = oob_flag;
+ rsp->max_key_size = hdev->le_max_key_size;
+ rsp->init_key_dist = req->init_key_dist & remote_dist;
+ rsp->resp_key_dist = req->resp_key_dist & local_dist;
+ rsp->auth_req = (authreq & AUTH_REQ_MASK(hdev));
+
+ smp->remote_key_dist = rsp->init_key_dist;
+}
+
+static u8 check_enc_key_size(struct l2cap_conn *conn, __u8 max_key_size)
+{
+ struct l2cap_chan *chan = conn->smp;
+ struct hci_dev *hdev = conn->hcon->hdev;
+ struct smp_chan *smp = chan->data;
+
+ if (conn->hcon->pending_sec_level == BT_SECURITY_FIPS &&
+ max_key_size != SMP_MAX_ENC_KEY_SIZE)
+ return SMP_ENC_KEY_SIZE;
+
+ if (max_key_size > hdev->le_max_key_size ||
+ max_key_size < SMP_MIN_ENC_KEY_SIZE)
+ return SMP_ENC_KEY_SIZE;
+
+ smp->enc_key_size = max_key_size;
+
+ return 0;
+}
+
+static void smp_chan_destroy(struct l2cap_conn *conn)
+{
+ struct l2cap_chan *chan = conn->smp;
+ struct smp_chan *smp = chan->data;
+ struct hci_conn *hcon = conn->hcon;
+ bool complete;
+
+ BUG_ON(!smp);
+
+ cancel_delayed_work_sync(&smp->security_timer);
+
+ complete = test_bit(SMP_FLAG_COMPLETE, &smp->flags);
+ mgmt_smp_complete(hcon, complete);
+
+ kfree_sensitive(smp->csrk);
+ kfree_sensitive(smp->responder_csrk);
+ kfree_sensitive(smp->link_key);
+
+ crypto_free_shash(smp->tfm_cmac);
+ crypto_free_kpp(smp->tfm_ecdh);
+
+ /* Ensure that we don't leave any debug key around if debug key
+ * support hasn't been explicitly enabled.
+ */
+ if (smp->ltk && smp->ltk->type == SMP_LTK_P256_DEBUG &&
+ !hci_dev_test_flag(hcon->hdev, HCI_KEEP_DEBUG_KEYS)) {
+ list_del_rcu(&smp->ltk->list);
+ kfree_rcu(smp->ltk, rcu);
+ smp->ltk = NULL;
+ }
+
+ /* If pairing failed clean up any keys we might have */
+ if (!complete) {
+ if (smp->ltk) {
+ list_del_rcu(&smp->ltk->list);
+ kfree_rcu(smp->ltk, rcu);
+ }
+
+ if (smp->responder_ltk) {
+ list_del_rcu(&smp->responder_ltk->list);
+ kfree_rcu(smp->responder_ltk, rcu);
+ }
+
+ if (smp->remote_irk) {
+ list_del_rcu(&smp->remote_irk->list);
+ kfree_rcu(smp->remote_irk, rcu);
+ }
+ }
+
+ chan->data = NULL;
+ kfree_sensitive(smp);
+ hci_conn_drop(hcon);
+}
+
+static void smp_failure(struct l2cap_conn *conn, u8 reason)
+{
+ struct hci_conn *hcon = conn->hcon;
+ struct l2cap_chan *chan = conn->smp;
+
+ if (reason)
+ smp_send_cmd(conn, SMP_CMD_PAIRING_FAIL, sizeof(reason),
+ &reason);
+
+ mgmt_auth_failed(hcon, HCI_ERROR_AUTH_FAILURE);
+
+ if (chan->data)
+ smp_chan_destroy(conn);
+}
+
+#define JUST_WORKS 0x00
+#define JUST_CFM 0x01
+#define REQ_PASSKEY 0x02
+#define CFM_PASSKEY 0x03
+#define REQ_OOB 0x04
+#define DSP_PASSKEY 0x05
+#define OVERLAP 0xFF
+
+static const u8 gen_method[5][5] = {
+ { JUST_WORKS, JUST_CFM, REQ_PASSKEY, JUST_WORKS, REQ_PASSKEY },
+ { JUST_WORKS, JUST_CFM, REQ_PASSKEY, JUST_WORKS, REQ_PASSKEY },
+ { CFM_PASSKEY, CFM_PASSKEY, REQ_PASSKEY, JUST_WORKS, CFM_PASSKEY },
+ { JUST_WORKS, JUST_CFM, JUST_WORKS, JUST_WORKS, JUST_CFM },
+ { CFM_PASSKEY, CFM_PASSKEY, REQ_PASSKEY, JUST_WORKS, OVERLAP },
+};
+
+static const u8 sc_method[5][5] = {
+ { JUST_WORKS, JUST_CFM, REQ_PASSKEY, JUST_WORKS, REQ_PASSKEY },
+ { JUST_WORKS, CFM_PASSKEY, REQ_PASSKEY, JUST_WORKS, CFM_PASSKEY },
+ { DSP_PASSKEY, DSP_PASSKEY, REQ_PASSKEY, JUST_WORKS, DSP_PASSKEY },
+ { JUST_WORKS, JUST_CFM, JUST_WORKS, JUST_WORKS, JUST_CFM },
+ { DSP_PASSKEY, CFM_PASSKEY, REQ_PASSKEY, JUST_WORKS, CFM_PASSKEY },
+};
+
+static u8 get_auth_method(struct smp_chan *smp, u8 local_io, u8 remote_io)
+{
+ /* If either side has unknown io_caps, use JUST_CFM (which gets
+ * converted later to JUST_WORKS if we're initiators.
+ */
+ if (local_io > SMP_IO_KEYBOARD_DISPLAY ||
+ remote_io > SMP_IO_KEYBOARD_DISPLAY)
+ return JUST_CFM;
+
+ if (test_bit(SMP_FLAG_SC, &smp->flags))
+ return sc_method[remote_io][local_io];
+
+ return gen_method[remote_io][local_io];
+}
+
+static int tk_request(struct l2cap_conn *conn, u8 remote_oob, u8 auth,
+ u8 local_io, u8 remote_io)
+{
+ struct hci_conn *hcon = conn->hcon;
+ struct l2cap_chan *chan = conn->smp;
+ struct smp_chan *smp = chan->data;
+ u32 passkey = 0;
+ int ret;
+
+ /* Initialize key for JUST WORKS */
+ memset(smp->tk, 0, sizeof(smp->tk));
+ clear_bit(SMP_FLAG_TK_VALID, &smp->flags);
+
+ bt_dev_dbg(hcon->hdev, "auth:%u lcl:%u rem:%u", auth, local_io,
+ remote_io);
+
+ /* If neither side wants MITM, either "just" confirm an incoming
+ * request or use just-works for outgoing ones. The JUST_CFM
+ * will be converted to JUST_WORKS if necessary later in this
+ * function. If either side has MITM look up the method from the
+ * table.
+ */
+ if (!(auth & SMP_AUTH_MITM))
+ smp->method = JUST_CFM;
+ else
+ smp->method = get_auth_method(smp, local_io, remote_io);
+
+ /* Don't confirm locally initiated pairing attempts */
+ if (smp->method == JUST_CFM && test_bit(SMP_FLAG_INITIATOR,
+ &smp->flags))
+ smp->method = JUST_WORKS;
+
+ /* Don't bother user space with no IO capabilities */
+ if (smp->method == JUST_CFM &&
+ hcon->io_capability == HCI_IO_NO_INPUT_OUTPUT)
+ smp->method = JUST_WORKS;
+
+ /* If Just Works, Continue with Zero TK and ask user-space for
+ * confirmation */
+ if (smp->method == JUST_WORKS) {
+ ret = mgmt_user_confirm_request(hcon->hdev, &hcon->dst,
+ hcon->type,
+ hcon->dst_type,
+ passkey, 1);
+ if (ret)
+ return ret;
+ set_bit(SMP_FLAG_WAIT_USER, &smp->flags);
+ return 0;
+ }
+
+ /* If this function is used for SC -> legacy fallback we
+ * can only recover the just-works case.
+ */
+ if (test_bit(SMP_FLAG_SC, &smp->flags))
+ return -EINVAL;
+
+ /* Not Just Works/Confirm results in MITM Authentication */
+ if (smp->method != JUST_CFM) {
+ set_bit(SMP_FLAG_MITM_AUTH, &smp->flags);
+ if (hcon->pending_sec_level < BT_SECURITY_HIGH)
+ hcon->pending_sec_level = BT_SECURITY_HIGH;
+ }
+
+ /* If both devices have Keyboard-Display I/O, the initiator
+ * Confirms and the responder Enters the passkey.
+ */
+ if (smp->method == OVERLAP) {
+ if (test_bit(SMP_FLAG_INITIATOR, &smp->flags))
+ smp->method = CFM_PASSKEY;
+ else
+ smp->method = REQ_PASSKEY;
+ }
+
+ /* Generate random passkey. */
+ if (smp->method == CFM_PASSKEY) {
+ memset(smp->tk, 0, sizeof(smp->tk));
+ get_random_bytes(&passkey, sizeof(passkey));
+ passkey %= 1000000;
+ put_unaligned_le32(passkey, smp->tk);
+ bt_dev_dbg(hcon->hdev, "PassKey: %u", passkey);
+ set_bit(SMP_FLAG_TK_VALID, &smp->flags);
+ }
+
+ if (smp->method == REQ_PASSKEY)
+ ret = mgmt_user_passkey_request(hcon->hdev, &hcon->dst,
+ hcon->type, hcon->dst_type);
+ else if (smp->method == JUST_CFM)
+ ret = mgmt_user_confirm_request(hcon->hdev, &hcon->dst,
+ hcon->type, hcon->dst_type,
+ passkey, 1);
+ else
+ ret = mgmt_user_passkey_notify(hcon->hdev, &hcon->dst,
+ hcon->type, hcon->dst_type,
+ passkey, 0);
+
+ return ret;
+}
+
+static u8 smp_confirm(struct smp_chan *smp)
+{
+ struct l2cap_conn *conn = smp->conn;
+ struct smp_cmd_pairing_confirm cp;
+ int ret;
+
+ bt_dev_dbg(conn->hcon->hdev, "conn %p", conn);
+
+ ret = smp_c1(smp->tk, smp->prnd, smp->preq, smp->prsp,
+ conn->hcon->init_addr_type, &conn->hcon->init_addr,
+ conn->hcon->resp_addr_type, &conn->hcon->resp_addr,
+ cp.confirm_val);
+ if (ret)
+ return SMP_UNSPECIFIED;
+
+ clear_bit(SMP_FLAG_CFM_PENDING, &smp->flags);
+
+ smp_send_cmd(smp->conn, SMP_CMD_PAIRING_CONFIRM, sizeof(cp), &cp);
+
+ if (test_bit(SMP_FLAG_INITIATOR, &smp->flags))
+ SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_CONFIRM);
+ else
+ SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RANDOM);
+
+ return 0;
+}
+
+static u8 smp_random(struct smp_chan *smp)
+{
+ struct l2cap_conn *conn = smp->conn;
+ struct hci_conn *hcon = conn->hcon;
+ u8 confirm[16];
+ int ret;
+
+ bt_dev_dbg(conn->hcon->hdev, "conn %p %s", conn,
+ test_bit(SMP_FLAG_INITIATOR, &smp->flags) ? "initiator" :
+ "responder");
+
+ ret = smp_c1(smp->tk, smp->rrnd, smp->preq, smp->prsp,
+ hcon->init_addr_type, &hcon->init_addr,
+ hcon->resp_addr_type, &hcon->resp_addr, confirm);
+ if (ret)
+ return SMP_UNSPECIFIED;
+
+ if (crypto_memneq(smp->pcnf, confirm, sizeof(smp->pcnf))) {
+ bt_dev_err(hcon->hdev, "pairing failed "
+ "(confirmation values mismatch)");
+ return SMP_CONFIRM_FAILED;
+ }
+
+ if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) {
+ u8 stk[16];
+ __le64 rand = 0;
+ __le16 ediv = 0;
+
+ smp_s1(smp->tk, smp->rrnd, smp->prnd, stk);
+
+ if (test_and_set_bit(HCI_CONN_ENCRYPT_PEND, &hcon->flags))
+ return SMP_UNSPECIFIED;
+
+ hci_le_start_enc(hcon, ediv, rand, stk, smp->enc_key_size);
+ hcon->enc_key_size = smp->enc_key_size;
+ set_bit(HCI_CONN_STK_ENCRYPT, &hcon->flags);
+ } else {
+ u8 stk[16], auth;
+ __le64 rand = 0;
+ __le16 ediv = 0;
+
+ smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM, sizeof(smp->prnd),
+ smp->prnd);
+
+ smp_s1(smp->tk, smp->prnd, smp->rrnd, stk);
+
+ if (hcon->pending_sec_level == BT_SECURITY_HIGH)
+ auth = 1;
+ else
+ auth = 0;
+
+ /* Even though there's no _RESPONDER suffix this is the
+ * responder STK we're adding for later lookup (the initiator
+ * STK never needs to be stored).
+ */
+ hci_add_ltk(hcon->hdev, &hcon->dst, hcon->dst_type,
+ SMP_STK, auth, stk, smp->enc_key_size, ediv, rand);
+ }
+
+ return 0;
+}
+
+static void smp_notify_keys(struct l2cap_conn *conn)
+{
+ struct l2cap_chan *chan = conn->smp;
+ struct smp_chan *smp = chan->data;
+ struct hci_conn *hcon = conn->hcon;
+ struct hci_dev *hdev = hcon->hdev;
+ struct smp_cmd_pairing *req = (void *) &smp->preq[1];
+ struct smp_cmd_pairing *rsp = (void *) &smp->prsp[1];
+ bool persistent;
+
+ if (hcon->type == ACL_LINK) {
+ if (hcon->key_type == HCI_LK_DEBUG_COMBINATION)
+ persistent = false;
+ else
+ persistent = !test_bit(HCI_CONN_FLUSH_KEY,
+ &hcon->flags);
+ } else {
+ /* The LTKs, IRKs and CSRKs should be persistent only if
+ * both sides had the bonding bit set in their
+ * authentication requests.
+ */
+ persistent = !!((req->auth_req & rsp->auth_req) &
+ SMP_AUTH_BONDING);
+ }
+
+ if (smp->remote_irk) {
+ mgmt_new_irk(hdev, smp->remote_irk, persistent);
+
+ /* Now that user space can be considered to know the
+ * identity address track the connection based on it
+ * from now on (assuming this is an LE link).
+ */
+ if (hcon->type == LE_LINK) {
+ bacpy(&hcon->dst, &smp->remote_irk->bdaddr);
+ hcon->dst_type = smp->remote_irk->addr_type;
+ /* Use a short delay to make sure the new address is
+ * propagated _before_ the channels.
+ */
+ queue_delayed_work(hdev->workqueue,
+ &conn->id_addr_timer,
+ ID_ADDR_TIMEOUT);
+ }
+ }
+
+ if (smp->csrk) {
+ smp->csrk->bdaddr_type = hcon->dst_type;
+ bacpy(&smp->csrk->bdaddr, &hcon->dst);
+ mgmt_new_csrk(hdev, smp->csrk, persistent);
+ }
+
+ if (smp->responder_csrk) {
+ smp->responder_csrk->bdaddr_type = hcon->dst_type;
+ bacpy(&smp->responder_csrk->bdaddr, &hcon->dst);
+ mgmt_new_csrk(hdev, smp->responder_csrk, persistent);
+ }
+
+ if (smp->ltk) {
+ smp->ltk->bdaddr_type = hcon->dst_type;
+ bacpy(&smp->ltk->bdaddr, &hcon->dst);
+ mgmt_new_ltk(hdev, smp->ltk, persistent);
+ }
+
+ if (smp->responder_ltk) {
+ smp->responder_ltk->bdaddr_type = hcon->dst_type;
+ bacpy(&smp->responder_ltk->bdaddr, &hcon->dst);
+ mgmt_new_ltk(hdev, smp->responder_ltk, persistent);
+ }
+
+ if (smp->link_key) {
+ struct link_key *key;
+ u8 type;
+
+ if (test_bit(SMP_FLAG_DEBUG_KEY, &smp->flags))
+ type = HCI_LK_DEBUG_COMBINATION;
+ else if (hcon->sec_level == BT_SECURITY_FIPS)
+ type = HCI_LK_AUTH_COMBINATION_P256;
+ else
+ type = HCI_LK_UNAUTH_COMBINATION_P256;
+
+ key = hci_add_link_key(hdev, smp->conn->hcon, &hcon->dst,
+ smp->link_key, type, 0, &persistent);
+ if (key) {
+ mgmt_new_link_key(hdev, key, persistent);
+
+ /* Don't keep debug keys around if the relevant
+ * flag is not set.
+ */
+ if (!hci_dev_test_flag(hdev, HCI_KEEP_DEBUG_KEYS) &&
+ key->type == HCI_LK_DEBUG_COMBINATION) {
+ list_del_rcu(&key->list);
+ kfree_rcu(key, rcu);
+ }
+ }
+ }
+}
+
+static void sc_add_ltk(struct smp_chan *smp)
+{
+ struct hci_conn *hcon = smp->conn->hcon;
+ u8 key_type, auth;
+
+ if (test_bit(SMP_FLAG_DEBUG_KEY, &smp->flags))
+ key_type = SMP_LTK_P256_DEBUG;
+ else
+ key_type = SMP_LTK_P256;
+
+ if (hcon->pending_sec_level == BT_SECURITY_FIPS)
+ auth = 1;
+ else
+ auth = 0;
+
+ smp->ltk = hci_add_ltk(hcon->hdev, &hcon->dst, hcon->dst_type,
+ key_type, auth, smp->tk, smp->enc_key_size,
+ 0, 0);
+}
+
+static void sc_generate_link_key(struct smp_chan *smp)
+{
+ /* From core spec. Spells out in ASCII as 'lebr'. */
+ const u8 lebr[4] = { 0x72, 0x62, 0x65, 0x6c };
+
+ smp->link_key = kzalloc(16, GFP_KERNEL);
+ if (!smp->link_key)
+ return;
+
+ if (test_bit(SMP_FLAG_CT2, &smp->flags)) {
+ /* SALT = 0x000000000000000000000000746D7031 */
+ const u8 salt[16] = { 0x31, 0x70, 0x6d, 0x74 };
+
+ if (smp_h7(smp->tfm_cmac, smp->tk, salt, smp->link_key)) {
+ kfree_sensitive(smp->link_key);
+ smp->link_key = NULL;
+ return;
+ }
+ } else {
+ /* From core spec. Spells out in ASCII as 'tmp1'. */
+ const u8 tmp1[4] = { 0x31, 0x70, 0x6d, 0x74 };
+
+ if (smp_h6(smp->tfm_cmac, smp->tk, tmp1, smp->link_key)) {
+ kfree_sensitive(smp->link_key);
+ smp->link_key = NULL;
+ return;
+ }
+ }
+
+ if (smp_h6(smp->tfm_cmac, smp->link_key, lebr, smp->link_key)) {
+ kfree_sensitive(smp->link_key);
+ smp->link_key = NULL;
+ return;
+ }
+}
+
+static void smp_allow_key_dist(struct smp_chan *smp)
+{
+ /* Allow the first expected phase 3 PDU. The rest of the PDUs
+ * will be allowed in each PDU handler to ensure we receive
+ * them in the correct order.
+ */
+ if (smp->remote_key_dist & SMP_DIST_ENC_KEY)
+ SMP_ALLOW_CMD(smp, SMP_CMD_ENCRYPT_INFO);
+ else if (smp->remote_key_dist & SMP_DIST_ID_KEY)
+ SMP_ALLOW_CMD(smp, SMP_CMD_IDENT_INFO);
+ else if (smp->remote_key_dist & SMP_DIST_SIGN)
+ SMP_ALLOW_CMD(smp, SMP_CMD_SIGN_INFO);
+}
+
+static void sc_generate_ltk(struct smp_chan *smp)
+{
+ /* From core spec. Spells out in ASCII as 'brle'. */
+ const u8 brle[4] = { 0x65, 0x6c, 0x72, 0x62 };
+ struct hci_conn *hcon = smp->conn->hcon;
+ struct hci_dev *hdev = hcon->hdev;
+ struct link_key *key;
+
+ key = hci_find_link_key(hdev, &hcon->dst);
+ if (!key) {
+ bt_dev_err(hdev, "no Link Key found to generate LTK");
+ return;
+ }
+
+ if (key->type == HCI_LK_DEBUG_COMBINATION)
+ set_bit(SMP_FLAG_DEBUG_KEY, &smp->flags);
+
+ if (test_bit(SMP_FLAG_CT2, &smp->flags)) {
+ /* SALT = 0x000000000000000000000000746D7032 */
+ const u8 salt[16] = { 0x32, 0x70, 0x6d, 0x74 };
+
+ if (smp_h7(smp->tfm_cmac, key->val, salt, smp->tk))
+ return;
+ } else {
+ /* From core spec. Spells out in ASCII as 'tmp2'. */
+ const u8 tmp2[4] = { 0x32, 0x70, 0x6d, 0x74 };
+
+ if (smp_h6(smp->tfm_cmac, key->val, tmp2, smp->tk))
+ return;
+ }
+
+ if (smp_h6(smp->tfm_cmac, smp->tk, brle, smp->tk))
+ return;
+
+ sc_add_ltk(smp);
+}
+
+static void smp_distribute_keys(struct smp_chan *smp)
+{
+ struct smp_cmd_pairing *req, *rsp;
+ struct l2cap_conn *conn = smp->conn;
+ struct hci_conn *hcon = conn->hcon;
+ struct hci_dev *hdev = hcon->hdev;
+ __u8 *keydist;
+
+ bt_dev_dbg(hdev, "conn %p", conn);
+
+ rsp = (void *) &smp->prsp[1];
+
+ /* The responder sends its keys first */
+ if (test_bit(SMP_FLAG_INITIATOR, &smp->flags) &&
+ (smp->remote_key_dist & KEY_DIST_MASK)) {
+ smp_allow_key_dist(smp);
+ return;
+ }
+
+ req = (void *) &smp->preq[1];
+
+ if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) {
+ keydist = &rsp->init_key_dist;
+ *keydist &= req->init_key_dist;
+ } else {
+ keydist = &rsp->resp_key_dist;
+ *keydist &= req->resp_key_dist;
+ }
+
+ if (test_bit(SMP_FLAG_SC, &smp->flags)) {
+ if (hcon->type == LE_LINK && (*keydist & SMP_DIST_LINK_KEY))
+ sc_generate_link_key(smp);
+ if (hcon->type == ACL_LINK && (*keydist & SMP_DIST_ENC_KEY))
+ sc_generate_ltk(smp);
+
+ /* Clear the keys which are generated but not distributed */
+ *keydist &= ~SMP_SC_NO_DIST;
+ }
+
+ bt_dev_dbg(hdev, "keydist 0x%x", *keydist);
+
+ if (*keydist & SMP_DIST_ENC_KEY) {
+ struct smp_cmd_encrypt_info enc;
+ struct smp_cmd_initiator_ident ident;
+ struct smp_ltk *ltk;
+ u8 authenticated;
+ __le16 ediv;
+ __le64 rand;
+
+ /* Make sure we generate only the significant amount of
+ * bytes based on the encryption key size, and set the rest
+ * of the value to zeroes.
+ */
+ get_random_bytes(enc.ltk, smp->enc_key_size);
+ memset(enc.ltk + smp->enc_key_size, 0,
+ sizeof(enc.ltk) - smp->enc_key_size);
+
+ get_random_bytes(&ediv, sizeof(ediv));
+ get_random_bytes(&rand, sizeof(rand));
+
+ smp_send_cmd(conn, SMP_CMD_ENCRYPT_INFO, sizeof(enc), &enc);
+
+ authenticated = hcon->sec_level == BT_SECURITY_HIGH;
+ ltk = hci_add_ltk(hdev, &hcon->dst, hcon->dst_type,
+ SMP_LTK_RESPONDER, authenticated, enc.ltk,
+ smp->enc_key_size, ediv, rand);
+ smp->responder_ltk = ltk;
+
+ ident.ediv = ediv;
+ ident.rand = rand;
+
+ smp_send_cmd(conn, SMP_CMD_INITIATOR_IDENT, sizeof(ident),
+ &ident);
+
+ *keydist &= ~SMP_DIST_ENC_KEY;
+ }
+
+ if (*keydist & SMP_DIST_ID_KEY) {
+ struct smp_cmd_ident_addr_info addrinfo;
+ struct smp_cmd_ident_info idinfo;
+
+ memcpy(idinfo.irk, hdev->irk, sizeof(idinfo.irk));
+
+ smp_send_cmd(conn, SMP_CMD_IDENT_INFO, sizeof(idinfo), &idinfo);
+
+ /* The hci_conn contains the local identity address
+ * after the connection has been established.
+ *
+ * This is true even when the connection has been
+ * established using a resolvable random address.
+ */
+ bacpy(&addrinfo.bdaddr, &hcon->src);
+ addrinfo.addr_type = hcon->src_type;
+
+ smp_send_cmd(conn, SMP_CMD_IDENT_ADDR_INFO, sizeof(addrinfo),
+ &addrinfo);
+
+ *keydist &= ~SMP_DIST_ID_KEY;
+ }
+
+ if (*keydist & SMP_DIST_SIGN) {
+ struct smp_cmd_sign_info sign;
+ struct smp_csrk *csrk;
+
+ /* Generate a new random key */
+ get_random_bytes(sign.csrk, sizeof(sign.csrk));
+
+ csrk = kzalloc(sizeof(*csrk), GFP_KERNEL);
+ if (csrk) {
+ if (hcon->sec_level > BT_SECURITY_MEDIUM)
+ csrk->type = MGMT_CSRK_LOCAL_AUTHENTICATED;
+ else
+ csrk->type = MGMT_CSRK_LOCAL_UNAUTHENTICATED;
+ memcpy(csrk->val, sign.csrk, sizeof(csrk->val));
+ }
+ smp->responder_csrk = csrk;
+
+ smp_send_cmd(conn, SMP_CMD_SIGN_INFO, sizeof(sign), &sign);
+
+ *keydist &= ~SMP_DIST_SIGN;
+ }
+
+ /* If there are still keys to be received wait for them */
+ if (smp->remote_key_dist & KEY_DIST_MASK) {
+ smp_allow_key_dist(smp);
+ return;
+ }
+
+ set_bit(SMP_FLAG_COMPLETE, &smp->flags);
+ smp_notify_keys(conn);
+
+ smp_chan_destroy(conn);
+}
+
+static void smp_timeout(struct work_struct *work)
+{
+ struct smp_chan *smp = container_of(work, struct smp_chan,
+ security_timer.work);
+ struct l2cap_conn *conn = smp->conn;
+
+ bt_dev_dbg(conn->hcon->hdev, "conn %p", conn);
+
+ hci_disconnect(conn->hcon, HCI_ERROR_AUTH_FAILURE);
+}
+
+static struct smp_chan *smp_chan_create(struct l2cap_conn *conn)
+{
+ struct hci_conn *hcon = conn->hcon;
+ struct l2cap_chan *chan = conn->smp;
+ struct smp_chan *smp;
+
+ smp = kzalloc(sizeof(*smp), GFP_ATOMIC);
+ if (!smp)
+ return NULL;
+
+ smp->tfm_cmac = crypto_alloc_shash("cmac(aes)", 0, 0);
+ if (IS_ERR(smp->tfm_cmac)) {
+ bt_dev_err(hcon->hdev, "Unable to create CMAC crypto context");
+ goto zfree_smp;
+ }
+
+ smp->tfm_ecdh = crypto_alloc_kpp("ecdh-nist-p256", 0, 0);
+ if (IS_ERR(smp->tfm_ecdh)) {
+ bt_dev_err(hcon->hdev, "Unable to create ECDH crypto context");
+ goto free_shash;
+ }
+
+ smp->conn = conn;
+ chan->data = smp;
+
+ SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_FAIL);
+
+ INIT_DELAYED_WORK(&smp->security_timer, smp_timeout);
+
+ hci_conn_hold(hcon);
+
+ return smp;
+
+free_shash:
+ crypto_free_shash(smp->tfm_cmac);
+zfree_smp:
+ kfree_sensitive(smp);
+ return NULL;
+}
+
+static int sc_mackey_and_ltk(struct smp_chan *smp, u8 mackey[16], u8 ltk[16])
+{
+ struct hci_conn *hcon = smp->conn->hcon;
+ u8 *na, *nb, a[7], b[7];
+
+ if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) {
+ na = smp->prnd;
+ nb = smp->rrnd;
+ } else {
+ na = smp->rrnd;
+ nb = smp->prnd;
+ }
+
+ memcpy(a, &hcon->init_addr, 6);
+ memcpy(b, &hcon->resp_addr, 6);
+ a[6] = hcon->init_addr_type;
+ b[6] = hcon->resp_addr_type;
+
+ return smp_f5(smp->tfm_cmac, smp->dhkey, na, nb, a, b, mackey, ltk);
+}
+
+static void sc_dhkey_check(struct smp_chan *smp)
+{
+ struct hci_conn *hcon = smp->conn->hcon;
+ struct smp_cmd_dhkey_check check;
+ u8 a[7], b[7], *local_addr, *remote_addr;
+ u8 io_cap[3], r[16];
+
+ memcpy(a, &hcon->init_addr, 6);
+ memcpy(b, &hcon->resp_addr, 6);
+ a[6] = hcon->init_addr_type;
+ b[6] = hcon->resp_addr_type;
+
+ if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) {
+ local_addr = a;
+ remote_addr = b;
+ memcpy(io_cap, &smp->preq[1], 3);
+ } else {
+ local_addr = b;
+ remote_addr = a;
+ memcpy(io_cap, &smp->prsp[1], 3);
+ }
+
+ memset(r, 0, sizeof(r));
+
+ if (smp->method == REQ_PASSKEY || smp->method == DSP_PASSKEY)
+ put_unaligned_le32(hcon->passkey_notify, r);
+
+ if (smp->method == REQ_OOB)
+ memcpy(r, smp->rr, 16);
+
+ smp_f6(smp->tfm_cmac, smp->mackey, smp->prnd, smp->rrnd, r, io_cap,
+ local_addr, remote_addr, check.e);
+
+ smp_send_cmd(smp->conn, SMP_CMD_DHKEY_CHECK, sizeof(check), &check);
+}
+
+static u8 sc_passkey_send_confirm(struct smp_chan *smp)
+{
+ struct l2cap_conn *conn = smp->conn;
+ struct hci_conn *hcon = conn->hcon;
+ struct smp_cmd_pairing_confirm cfm;
+ u8 r;
+
+ r = ((hcon->passkey_notify >> smp->passkey_round) & 0x01);
+ r |= 0x80;
+
+ get_random_bytes(smp->prnd, sizeof(smp->prnd));
+
+ if (smp_f4(smp->tfm_cmac, smp->local_pk, smp->remote_pk, smp->prnd, r,
+ cfm.confirm_val))
+ return SMP_UNSPECIFIED;
+
+ smp_send_cmd(conn, SMP_CMD_PAIRING_CONFIRM, sizeof(cfm), &cfm);
+
+ return 0;
+}
+
+static u8 sc_passkey_round(struct smp_chan *smp, u8 smp_op)
+{
+ struct l2cap_conn *conn = smp->conn;
+ struct hci_conn *hcon = conn->hcon;
+ struct hci_dev *hdev = hcon->hdev;
+ u8 cfm[16], r;
+
+ /* Ignore the PDU if we've already done 20 rounds (0 - 19) */
+ if (smp->passkey_round >= 20)
+ return 0;
+
+ switch (smp_op) {
+ case SMP_CMD_PAIRING_RANDOM:
+ r = ((hcon->passkey_notify >> smp->passkey_round) & 0x01);
+ r |= 0x80;
+
+ if (smp_f4(smp->tfm_cmac, smp->remote_pk, smp->local_pk,
+ smp->rrnd, r, cfm))
+ return SMP_UNSPECIFIED;
+
+ if (crypto_memneq(smp->pcnf, cfm, 16))
+ return SMP_CONFIRM_FAILED;
+
+ smp->passkey_round++;
+
+ if (smp->passkey_round == 20) {
+ /* Generate MacKey and LTK */
+ if (sc_mackey_and_ltk(smp, smp->mackey, smp->tk))
+ return SMP_UNSPECIFIED;
+ }
+
+ /* The round is only complete when the initiator
+ * receives pairing random.
+ */
+ if (!test_bit(SMP_FLAG_INITIATOR, &smp->flags)) {
+ smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM,
+ sizeof(smp->prnd), smp->prnd);
+ if (smp->passkey_round == 20)
+ SMP_ALLOW_CMD(smp, SMP_CMD_DHKEY_CHECK);
+ else
+ SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_CONFIRM);
+ return 0;
+ }
+
+ /* Start the next round */
+ if (smp->passkey_round != 20)
+ return sc_passkey_round(smp, 0);
+
+ /* Passkey rounds are complete - start DHKey Check */
+ sc_dhkey_check(smp);
+ SMP_ALLOW_CMD(smp, SMP_CMD_DHKEY_CHECK);
+
+ break;
+
+ case SMP_CMD_PAIRING_CONFIRM:
+ if (test_bit(SMP_FLAG_WAIT_USER, &smp->flags)) {
+ set_bit(SMP_FLAG_CFM_PENDING, &smp->flags);
+ return 0;
+ }
+
+ SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RANDOM);
+
+ if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) {
+ smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM,
+ sizeof(smp->prnd), smp->prnd);
+ return 0;
+ }
+
+ return sc_passkey_send_confirm(smp);
+
+ case SMP_CMD_PUBLIC_KEY:
+ default:
+ /* Initiating device starts the round */
+ if (!test_bit(SMP_FLAG_INITIATOR, &smp->flags))
+ return 0;
+
+ bt_dev_dbg(hdev, "Starting passkey round %u",
+ smp->passkey_round + 1);
+
+ SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_CONFIRM);
+
+ return sc_passkey_send_confirm(smp);
+ }
+
+ return 0;
+}
+
+static int sc_user_reply(struct smp_chan *smp, u16 mgmt_op, __le32 passkey)
+{
+ struct l2cap_conn *conn = smp->conn;
+ struct hci_conn *hcon = conn->hcon;
+ u8 smp_op;
+
+ clear_bit(SMP_FLAG_WAIT_USER, &smp->flags);
+
+ switch (mgmt_op) {
+ case MGMT_OP_USER_PASSKEY_NEG_REPLY:
+ smp_failure(smp->conn, SMP_PASSKEY_ENTRY_FAILED);
+ return 0;
+ case MGMT_OP_USER_CONFIRM_NEG_REPLY:
+ smp_failure(smp->conn, SMP_NUMERIC_COMP_FAILED);
+ return 0;
+ case MGMT_OP_USER_PASSKEY_REPLY:
+ hcon->passkey_notify = le32_to_cpu(passkey);
+ smp->passkey_round = 0;
+
+ if (test_and_clear_bit(SMP_FLAG_CFM_PENDING, &smp->flags))
+ smp_op = SMP_CMD_PAIRING_CONFIRM;
+ else
+ smp_op = 0;
+
+ if (sc_passkey_round(smp, smp_op))
+ return -EIO;
+
+ return 0;
+ }
+
+ /* Initiator sends DHKey check first */
+ if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) {
+ sc_dhkey_check(smp);
+ SMP_ALLOW_CMD(smp, SMP_CMD_DHKEY_CHECK);
+ } else if (test_and_clear_bit(SMP_FLAG_DHKEY_PENDING, &smp->flags)) {
+ sc_dhkey_check(smp);
+ sc_add_ltk(smp);
+ }
+
+ return 0;
+}
+
+int smp_user_confirm_reply(struct hci_conn *hcon, u16 mgmt_op, __le32 passkey)
+{
+ struct l2cap_conn *conn = hcon->l2cap_data;
+ struct l2cap_chan *chan;
+ struct smp_chan *smp;
+ u32 value;
+ int err;
+
+ if (!conn)
+ return -ENOTCONN;
+
+ bt_dev_dbg(conn->hcon->hdev, "");
+
+ chan = conn->smp;
+ if (!chan)
+ return -ENOTCONN;
+
+ l2cap_chan_lock(chan);
+ if (!chan->data) {
+ err = -ENOTCONN;
+ goto unlock;
+ }
+
+ smp = chan->data;
+
+ if (test_bit(SMP_FLAG_SC, &smp->flags)) {
+ err = sc_user_reply(smp, mgmt_op, passkey);
+ goto unlock;
+ }
+
+ switch (mgmt_op) {
+ case MGMT_OP_USER_PASSKEY_REPLY:
+ value = le32_to_cpu(passkey);
+ memset(smp->tk, 0, sizeof(smp->tk));
+ bt_dev_dbg(conn->hcon->hdev, "PassKey: %u", value);
+ put_unaligned_le32(value, smp->tk);
+ fallthrough;
+ case MGMT_OP_USER_CONFIRM_REPLY:
+ set_bit(SMP_FLAG_TK_VALID, &smp->flags);
+ break;
+ case MGMT_OP_USER_PASSKEY_NEG_REPLY:
+ case MGMT_OP_USER_CONFIRM_NEG_REPLY:
+ smp_failure(conn, SMP_PASSKEY_ENTRY_FAILED);
+ err = 0;
+ goto unlock;
+ default:
+ smp_failure(conn, SMP_PASSKEY_ENTRY_FAILED);
+ err = -EOPNOTSUPP;
+ goto unlock;
+ }
+
+ err = 0;
+
+ /* If it is our turn to send Pairing Confirm, do so now */
+ if (test_bit(SMP_FLAG_CFM_PENDING, &smp->flags)) {
+ u8 rsp = smp_confirm(smp);
+ if (rsp)
+ smp_failure(conn, rsp);
+ }
+
+unlock:
+ l2cap_chan_unlock(chan);
+ return err;
+}
+
+static void build_bredr_pairing_cmd(struct smp_chan *smp,
+ struct smp_cmd_pairing *req,
+ struct smp_cmd_pairing *rsp)
+{
+ struct l2cap_conn *conn = smp->conn;
+ struct hci_dev *hdev = conn->hcon->hdev;
+ u8 local_dist = 0, remote_dist = 0;
+
+ if (hci_dev_test_flag(hdev, HCI_BONDABLE)) {
+ local_dist = SMP_DIST_ENC_KEY | SMP_DIST_SIGN;
+ remote_dist = SMP_DIST_ENC_KEY | SMP_DIST_SIGN;
+ }
+
+ if (hci_dev_test_flag(hdev, HCI_RPA_RESOLVING))
+ remote_dist |= SMP_DIST_ID_KEY;
+
+ if (hci_dev_test_flag(hdev, HCI_PRIVACY))
+ local_dist |= SMP_DIST_ID_KEY;
+
+ if (!rsp) {
+ memset(req, 0, sizeof(*req));
+
+ req->auth_req = SMP_AUTH_CT2;
+ req->init_key_dist = local_dist;
+ req->resp_key_dist = remote_dist;
+ req->max_key_size = conn->hcon->enc_key_size;
+
+ smp->remote_key_dist = remote_dist;
+
+ return;
+ }
+
+ memset(rsp, 0, sizeof(*rsp));
+
+ rsp->auth_req = SMP_AUTH_CT2;
+ rsp->max_key_size = conn->hcon->enc_key_size;
+ rsp->init_key_dist = req->init_key_dist & remote_dist;
+ rsp->resp_key_dist = req->resp_key_dist & local_dist;
+
+ smp->remote_key_dist = rsp->init_key_dist;
+}
+
+static u8 smp_cmd_pairing_req(struct l2cap_conn *conn, struct sk_buff *skb)
+{
+ struct smp_cmd_pairing rsp, *req = (void *) skb->data;
+ struct l2cap_chan *chan = conn->smp;
+ struct hci_dev *hdev = conn->hcon->hdev;
+ struct smp_chan *smp = chan->data;
+ u8 key_size, auth, sec_level;
+ int ret;
+
+ bt_dev_dbg(hdev, "conn %p", conn);
+
+ if (skb->len < sizeof(*req))
+ return SMP_INVALID_PARAMS;
+
+ if (smp && test_bit(SMP_FLAG_INITIATOR, &smp->flags))
+ return SMP_CMD_NOTSUPP;
+
+ if (!smp) {
+ smp = smp_chan_create(conn);
+ if (!smp)
+ return SMP_UNSPECIFIED;
+ }
+
+ /* We didn't start the pairing, so match remote */
+ auth = req->auth_req & AUTH_REQ_MASK(hdev);
+
+ if (!hci_dev_test_flag(hdev, HCI_BONDABLE) &&
+ (auth & SMP_AUTH_BONDING))
+ return SMP_PAIRING_NOTSUPP;
+
+ if (hci_dev_test_flag(hdev, HCI_SC_ONLY) && !(auth & SMP_AUTH_SC))
+ return SMP_AUTH_REQUIREMENTS;
+
+ smp->preq[0] = SMP_CMD_PAIRING_REQ;
+ memcpy(&smp->preq[1], req, sizeof(*req));
+ skb_pull(skb, sizeof(*req));
+
+ /* If the remote side's OOB flag is set it means it has
+ * successfully received our local OOB data - therefore set the
+ * flag to indicate that local OOB is in use.
+ */
+ if (req->oob_flag == SMP_OOB_PRESENT && SMP_DEV(hdev)->local_oob)
+ set_bit(SMP_FLAG_LOCAL_OOB, &smp->flags);
+
+ /* SMP over BR/EDR requires special treatment */
+ if (conn->hcon->type == ACL_LINK) {
+ /* We must have a BR/EDR SC link */
+ if (!test_bit(HCI_CONN_AES_CCM, &conn->hcon->flags) &&
+ !hci_dev_test_flag(hdev, HCI_FORCE_BREDR_SMP))
+ return SMP_CROSS_TRANSP_NOT_ALLOWED;
+
+ set_bit(SMP_FLAG_SC, &smp->flags);
+
+ build_bredr_pairing_cmd(smp, req, &rsp);
+
+ if (req->auth_req & SMP_AUTH_CT2)
+ set_bit(SMP_FLAG_CT2, &smp->flags);
+
+ key_size = min(req->max_key_size, rsp.max_key_size);
+ if (check_enc_key_size(conn, key_size))
+ return SMP_ENC_KEY_SIZE;
+
+ /* Clear bits which are generated but not distributed */
+ smp->remote_key_dist &= ~SMP_SC_NO_DIST;
+
+ smp->prsp[0] = SMP_CMD_PAIRING_RSP;
+ memcpy(&smp->prsp[1], &rsp, sizeof(rsp));
+ smp_send_cmd(conn, SMP_CMD_PAIRING_RSP, sizeof(rsp), &rsp);
+
+ smp_distribute_keys(smp);
+ return 0;
+ }
+
+ build_pairing_cmd(conn, req, &rsp, auth);
+
+ if (rsp.auth_req & SMP_AUTH_SC) {
+ set_bit(SMP_FLAG_SC, &smp->flags);
+
+ if (rsp.auth_req & SMP_AUTH_CT2)
+ set_bit(SMP_FLAG_CT2, &smp->flags);
+ }
+
+ if (conn->hcon->io_capability == HCI_IO_NO_INPUT_OUTPUT)
+ sec_level = BT_SECURITY_MEDIUM;
+ else
+ sec_level = authreq_to_seclevel(auth);
+
+ if (sec_level > conn->hcon->pending_sec_level)
+ conn->hcon->pending_sec_level = sec_level;
+
+ /* If we need MITM check that it can be achieved */
+ if (conn->hcon->pending_sec_level >= BT_SECURITY_HIGH) {
+ u8 method;
+
+ method = get_auth_method(smp, conn->hcon->io_capability,
+ req->io_capability);
+ if (method == JUST_WORKS || method == JUST_CFM)
+ return SMP_AUTH_REQUIREMENTS;
+ }
+
+ key_size = min(req->max_key_size, rsp.max_key_size);
+ if (check_enc_key_size(conn, key_size))
+ return SMP_ENC_KEY_SIZE;
+
+ get_random_bytes(smp->prnd, sizeof(smp->prnd));
+
+ smp->prsp[0] = SMP_CMD_PAIRING_RSP;
+ memcpy(&smp->prsp[1], &rsp, sizeof(rsp));
+
+ smp_send_cmd(conn, SMP_CMD_PAIRING_RSP, sizeof(rsp), &rsp);
+
+ clear_bit(SMP_FLAG_INITIATOR, &smp->flags);
+
+ /* Strictly speaking we shouldn't allow Pairing Confirm for the
+ * SC case, however some implementations incorrectly copy RFU auth
+ * req bits from our security request, which may create a false
+ * positive SC enablement.
+ */
+ SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_CONFIRM);
+
+ if (test_bit(SMP_FLAG_SC, &smp->flags)) {
+ SMP_ALLOW_CMD(smp, SMP_CMD_PUBLIC_KEY);
+ /* Clear bits which are generated but not distributed */
+ smp->remote_key_dist &= ~SMP_SC_NO_DIST;
+ /* Wait for Public Key from Initiating Device */
+ return 0;
+ }
+
+ /* Request setup of TK */
+ ret = tk_request(conn, 0, auth, rsp.io_capability, req->io_capability);
+ if (ret)
+ return SMP_UNSPECIFIED;
+
+ return 0;
+}
+
+static u8 sc_send_public_key(struct smp_chan *smp)
+{
+ struct hci_dev *hdev = smp->conn->hcon->hdev;
+
+ bt_dev_dbg(hdev, "");
+
+ if (test_bit(SMP_FLAG_LOCAL_OOB, &smp->flags)) {
+ struct l2cap_chan *chan = hdev->smp_data;
+ struct smp_dev *smp_dev;
+
+ if (!chan || !chan->data)
+ return SMP_UNSPECIFIED;
+
+ smp_dev = chan->data;
+
+ memcpy(smp->local_pk, smp_dev->local_pk, 64);
+ memcpy(smp->lr, smp_dev->local_rand, 16);
+
+ if (smp_dev->debug_key)
+ set_bit(SMP_FLAG_DEBUG_KEY, &smp->flags);
+
+ goto done;
+ }
+
+ if (hci_dev_test_flag(hdev, HCI_USE_DEBUG_KEYS)) {
+ bt_dev_dbg(hdev, "Using debug keys");
+ if (set_ecdh_privkey(smp->tfm_ecdh, debug_sk))
+ return SMP_UNSPECIFIED;
+ memcpy(smp->local_pk, debug_pk, 64);
+ set_bit(SMP_FLAG_DEBUG_KEY, &smp->flags);
+ } else {
+ while (true) {
+ /* Generate key pair for Secure Connections */
+ if (generate_ecdh_keys(smp->tfm_ecdh, smp->local_pk))
+ return SMP_UNSPECIFIED;
+
+ /* This is unlikely, but we need to check that
+ * we didn't accidentally generate a debug key.
+ */
+ if (crypto_memneq(smp->local_pk, debug_pk, 64))
+ break;
+ }
+ }
+
+done:
+ SMP_DBG("Local Public Key X: %32phN", smp->local_pk);
+ SMP_DBG("Local Public Key Y: %32phN", smp->local_pk + 32);
+
+ smp_send_cmd(smp->conn, SMP_CMD_PUBLIC_KEY, 64, smp->local_pk);
+
+ return 0;
+}
+
+static u8 smp_cmd_pairing_rsp(struct l2cap_conn *conn, struct sk_buff *skb)
+{
+ struct smp_cmd_pairing *req, *rsp = (void *) skb->data;
+ struct l2cap_chan *chan = conn->smp;
+ struct smp_chan *smp = chan->data;
+ struct hci_dev *hdev = conn->hcon->hdev;
+ u8 key_size, auth;
+ int ret;
+
+ bt_dev_dbg(hdev, "conn %p", conn);
+
+ if (skb->len < sizeof(*rsp))
+ return SMP_INVALID_PARAMS;
+
+ if (!test_bit(SMP_FLAG_INITIATOR, &smp->flags))
+ return SMP_CMD_NOTSUPP;
+
+ skb_pull(skb, sizeof(*rsp));
+
+ req = (void *) &smp->preq[1];
+
+ key_size = min(req->max_key_size, rsp->max_key_size);
+ if (check_enc_key_size(conn, key_size))
+ return SMP_ENC_KEY_SIZE;
+
+ auth = rsp->auth_req & AUTH_REQ_MASK(hdev);
+
+ if (hci_dev_test_flag(hdev, HCI_SC_ONLY) && !(auth & SMP_AUTH_SC))
+ return SMP_AUTH_REQUIREMENTS;
+
+ /* If the remote side's OOB flag is set it means it has
+ * successfully received our local OOB data - therefore set the
+ * flag to indicate that local OOB is in use.
+ */
+ if (rsp->oob_flag == SMP_OOB_PRESENT && SMP_DEV(hdev)->local_oob)
+ set_bit(SMP_FLAG_LOCAL_OOB, &smp->flags);
+
+ smp->prsp[0] = SMP_CMD_PAIRING_RSP;
+ memcpy(&smp->prsp[1], rsp, sizeof(*rsp));
+
+ /* Update remote key distribution in case the remote cleared
+ * some bits that we had enabled in our request.
+ */
+ smp->remote_key_dist &= rsp->resp_key_dist;
+
+ if ((req->auth_req & SMP_AUTH_CT2) && (auth & SMP_AUTH_CT2))
+ set_bit(SMP_FLAG_CT2, &smp->flags);
+
+ /* For BR/EDR this means we're done and can start phase 3 */
+ if (conn->hcon->type == ACL_LINK) {
+ /* Clear bits which are generated but not distributed */
+ smp->remote_key_dist &= ~SMP_SC_NO_DIST;
+ smp_distribute_keys(smp);
+ return 0;
+ }
+
+ if ((req->auth_req & SMP_AUTH_SC) && (auth & SMP_AUTH_SC))
+ set_bit(SMP_FLAG_SC, &smp->flags);
+ else if (conn->hcon->pending_sec_level > BT_SECURITY_HIGH)
+ conn->hcon->pending_sec_level = BT_SECURITY_HIGH;
+
+ /* If we need MITM check that it can be achieved */
+ if (conn->hcon->pending_sec_level >= BT_SECURITY_HIGH) {
+ u8 method;
+
+ method = get_auth_method(smp, req->io_capability,
+ rsp->io_capability);
+ if (method == JUST_WORKS || method == JUST_CFM)
+ return SMP_AUTH_REQUIREMENTS;
+ }
+
+ get_random_bytes(smp->prnd, sizeof(smp->prnd));
+
+ /* Update remote key distribution in case the remote cleared
+ * some bits that we had enabled in our request.
+ */
+ smp->remote_key_dist &= rsp->resp_key_dist;
+
+ if (test_bit(SMP_FLAG_SC, &smp->flags)) {
+ /* Clear bits which are generated but not distributed */
+ smp->remote_key_dist &= ~SMP_SC_NO_DIST;
+ SMP_ALLOW_CMD(smp, SMP_CMD_PUBLIC_KEY);
+ return sc_send_public_key(smp);
+ }
+
+ auth |= req->auth_req;
+
+ ret = tk_request(conn, 0, auth, req->io_capability, rsp->io_capability);
+ if (ret)
+ return SMP_UNSPECIFIED;
+
+ set_bit(SMP_FLAG_CFM_PENDING, &smp->flags);
+
+ /* Can't compose response until we have been confirmed */
+ if (test_bit(SMP_FLAG_TK_VALID, &smp->flags))
+ return smp_confirm(smp);
+
+ return 0;
+}
+
+static u8 sc_check_confirm(struct smp_chan *smp)
+{
+ struct l2cap_conn *conn = smp->conn;
+
+ bt_dev_dbg(conn->hcon->hdev, "");
+
+ if (smp->method == REQ_PASSKEY || smp->method == DSP_PASSKEY)
+ return sc_passkey_round(smp, SMP_CMD_PAIRING_CONFIRM);
+
+ if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) {
+ smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM, sizeof(smp->prnd),
+ smp->prnd);
+ SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RANDOM);
+ }
+
+ return 0;
+}
+
+/* Work-around for some implementations that incorrectly copy RFU bits
+ * from our security request and thereby create the impression that
+ * we're doing SC when in fact the remote doesn't support it.
+ */
+static int fixup_sc_false_positive(struct smp_chan *smp)
+{
+ struct l2cap_conn *conn = smp->conn;
+ struct hci_conn *hcon = conn->hcon;
+ struct hci_dev *hdev = hcon->hdev;
+ struct smp_cmd_pairing *req, *rsp;
+ u8 auth;
+
+ /* The issue is only observed when we're in responder role */
+ if (test_bit(SMP_FLAG_INITIATOR, &smp->flags))
+ return SMP_UNSPECIFIED;
+
+ if (hci_dev_test_flag(hdev, HCI_SC_ONLY)) {
+ bt_dev_err(hdev, "refusing legacy fallback in SC-only mode");
+ return SMP_UNSPECIFIED;
+ }
+
+ bt_dev_err(hdev, "trying to fall back to legacy SMP");
+
+ req = (void *) &smp->preq[1];
+ rsp = (void *) &smp->prsp[1];
+
+ /* Rebuild key dist flags which may have been cleared for SC */
+ smp->remote_key_dist = (req->init_key_dist & rsp->resp_key_dist);
+
+ auth = req->auth_req & AUTH_REQ_MASK(hdev);
+
+ if (tk_request(conn, 0, auth, rsp->io_capability, req->io_capability)) {
+ bt_dev_err(hdev, "failed to fall back to legacy SMP");
+ return SMP_UNSPECIFIED;
+ }
+
+ clear_bit(SMP_FLAG_SC, &smp->flags);
+
+ return 0;
+}
+
+static u8 smp_cmd_pairing_confirm(struct l2cap_conn *conn, struct sk_buff *skb)
+{
+ struct l2cap_chan *chan = conn->smp;
+ struct smp_chan *smp = chan->data;
+ struct hci_conn *hcon = conn->hcon;
+ struct hci_dev *hdev = hcon->hdev;
+
+ bt_dev_dbg(hdev, "conn %p %s", conn,
+ test_bit(SMP_FLAG_INITIATOR, &smp->flags) ? "initiator" :
+ "responder");
+
+ if (skb->len < sizeof(smp->pcnf))
+ return SMP_INVALID_PARAMS;
+
+ memcpy(smp->pcnf, skb->data, sizeof(smp->pcnf));
+ skb_pull(skb, sizeof(smp->pcnf));
+
+ if (test_bit(SMP_FLAG_SC, &smp->flags)) {
+ int ret;
+
+ /* Public Key exchange must happen before any other steps */
+ if (test_bit(SMP_FLAG_REMOTE_PK, &smp->flags))
+ return sc_check_confirm(smp);
+
+ bt_dev_err(hdev, "Unexpected SMP Pairing Confirm");
+
+ ret = fixup_sc_false_positive(smp);
+ if (ret)
+ return ret;
+ }
+
+ if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) {
+ smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM, sizeof(smp->prnd),
+ smp->prnd);
+ SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RANDOM);
+ return 0;
+ }
+
+ if (test_bit(SMP_FLAG_TK_VALID, &smp->flags))
+ return smp_confirm(smp);
+
+ set_bit(SMP_FLAG_CFM_PENDING, &smp->flags);
+
+ return 0;
+}
+
+static u8 smp_cmd_pairing_random(struct l2cap_conn *conn, struct sk_buff *skb)
+{
+ struct l2cap_chan *chan = conn->smp;
+ struct smp_chan *smp = chan->data;
+ struct hci_conn *hcon = conn->hcon;
+ u8 *pkax, *pkbx, *na, *nb, confirm_hint;
+ u32 passkey = 0;
+ int err;
+
+ bt_dev_dbg(hcon->hdev, "conn %p", conn);
+
+ if (skb->len < sizeof(smp->rrnd))
+ return SMP_INVALID_PARAMS;
+
+ memcpy(smp->rrnd, skb->data, sizeof(smp->rrnd));
+ skb_pull(skb, sizeof(smp->rrnd));
+
+ if (!test_bit(SMP_FLAG_SC, &smp->flags))
+ return smp_random(smp);
+
+ if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) {
+ pkax = smp->local_pk;
+ pkbx = smp->remote_pk;
+ na = smp->prnd;
+ nb = smp->rrnd;
+ } else {
+ pkax = smp->remote_pk;
+ pkbx = smp->local_pk;
+ na = smp->rrnd;
+ nb = smp->prnd;
+ }
+
+ if (smp->method == REQ_OOB) {
+ if (!test_bit(SMP_FLAG_INITIATOR, &smp->flags))
+ smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM,
+ sizeof(smp->prnd), smp->prnd);
+ SMP_ALLOW_CMD(smp, SMP_CMD_DHKEY_CHECK);
+ goto mackey_and_ltk;
+ }
+
+ /* Passkey entry has special treatment */
+ if (smp->method == REQ_PASSKEY || smp->method == DSP_PASSKEY)
+ return sc_passkey_round(smp, SMP_CMD_PAIRING_RANDOM);
+
+ if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) {
+ u8 cfm[16];
+
+ err = smp_f4(smp->tfm_cmac, smp->remote_pk, smp->local_pk,
+ smp->rrnd, 0, cfm);
+ if (err)
+ return SMP_UNSPECIFIED;
+
+ if (crypto_memneq(smp->pcnf, cfm, 16))
+ return SMP_CONFIRM_FAILED;
+ } else {
+ smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM, sizeof(smp->prnd),
+ smp->prnd);
+ SMP_ALLOW_CMD(smp, SMP_CMD_DHKEY_CHECK);
+ }
+
+mackey_and_ltk:
+ /* Generate MacKey and LTK */
+ err = sc_mackey_and_ltk(smp, smp->mackey, smp->tk);
+ if (err)
+ return SMP_UNSPECIFIED;
+
+ if (smp->method == REQ_OOB) {
+ if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) {
+ sc_dhkey_check(smp);
+ SMP_ALLOW_CMD(smp, SMP_CMD_DHKEY_CHECK);
+ }
+ return 0;
+ }
+
+ err = smp_g2(smp->tfm_cmac, pkax, pkbx, na, nb, &passkey);
+ if (err)
+ return SMP_UNSPECIFIED;
+
+ /* Always require user confirmation for Just-Works pairing to prevent
+ * impersonation attacks, or in case of a legitimate device that is
+ * repairing use the confirmation as acknowledgment to proceed with the
+ * creation of new keys.
+ */
+ confirm_hint = smp->method == JUST_WORKS ? 1 : 0;
+
+ err = mgmt_user_confirm_request(hcon->hdev, &hcon->dst, hcon->type,
+ hcon->dst_type, passkey, confirm_hint);
+ if (err)
+ return SMP_UNSPECIFIED;
+
+ set_bit(SMP_FLAG_WAIT_USER, &smp->flags);
+
+ return 0;
+}
+
+static bool smp_ltk_encrypt(struct l2cap_conn *conn, u8 sec_level)
+{
+ struct smp_ltk *key;
+ struct hci_conn *hcon = conn->hcon;
+
+ key = hci_find_ltk(hcon->hdev, &hcon->dst, hcon->dst_type, hcon->role);
+ if (!key)
+ return false;
+
+ if (smp_ltk_sec_level(key) < sec_level)
+ return false;
+
+ if (test_and_set_bit(HCI_CONN_ENCRYPT_PEND, &hcon->flags))
+ return true;
+
+ hci_le_start_enc(hcon, key->ediv, key->rand, key->val, key->enc_size);
+ hcon->enc_key_size = key->enc_size;
+
+ /* We never store STKs for initiator role, so clear this flag */
+ clear_bit(HCI_CONN_STK_ENCRYPT, &hcon->flags);
+
+ return true;
+}
+
+bool smp_sufficient_security(struct hci_conn *hcon, u8 sec_level,
+ enum smp_key_pref key_pref)
+{
+ if (sec_level == BT_SECURITY_LOW)
+ return true;
+
+ /* If we're encrypted with an STK but the caller prefers using
+ * LTK claim insufficient security. This way we allow the
+ * connection to be re-encrypted with an LTK, even if the LTK
+ * provides the same level of security. Only exception is if we
+ * don't have an LTK (e.g. because of key distribution bits).
+ */
+ if (key_pref == SMP_USE_LTK &&
+ test_bit(HCI_CONN_STK_ENCRYPT, &hcon->flags) &&
+ hci_find_ltk(hcon->hdev, &hcon->dst, hcon->dst_type, hcon->role))
+ return false;
+
+ if (hcon->sec_level >= sec_level)
+ return true;
+
+ return false;
+}
+
+static void smp_send_pairing_req(struct smp_chan *smp, __u8 auth)
+{
+ struct smp_cmd_pairing cp;
+
+ if (smp->conn->hcon->type == ACL_LINK)
+ build_bredr_pairing_cmd(smp, &cp, NULL);
+ else
+ build_pairing_cmd(smp->conn, &cp, NULL, auth);
+
+ smp->preq[0] = SMP_CMD_PAIRING_REQ;
+ memcpy(&smp->preq[1], &cp, sizeof(cp));
+
+ smp_send_cmd(smp->conn, SMP_CMD_PAIRING_REQ, sizeof(cp), &cp);
+ SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RSP);
+
+ set_bit(SMP_FLAG_INITIATOR, &smp->flags);
+}
+
+static u8 smp_cmd_security_req(struct l2cap_conn *conn, struct sk_buff *skb)
+{
+ struct smp_cmd_security_req *rp = (void *) skb->data;
+ struct hci_conn *hcon = conn->hcon;
+ struct hci_dev *hdev = hcon->hdev;
+ struct smp_chan *smp;
+ u8 sec_level, auth;
+
+ bt_dev_dbg(hdev, "conn %p", conn);
+
+ if (skb->len < sizeof(*rp))
+ return SMP_INVALID_PARAMS;
+
+ if (hcon->role != HCI_ROLE_MASTER)
+ return SMP_CMD_NOTSUPP;
+
+ auth = rp->auth_req & AUTH_REQ_MASK(hdev);
+
+ if (hci_dev_test_flag(hdev, HCI_SC_ONLY) && !(auth & SMP_AUTH_SC))
+ return SMP_AUTH_REQUIREMENTS;
+
+ if (hcon->io_capability == HCI_IO_NO_INPUT_OUTPUT)
+ sec_level = BT_SECURITY_MEDIUM;
+ else
+ sec_level = authreq_to_seclevel(auth);
+
+ if (smp_sufficient_security(hcon, sec_level, SMP_USE_LTK)) {
+ /* If link is already encrypted with sufficient security we
+ * still need refresh encryption as per Core Spec 5.0 Vol 3,
+ * Part H 2.4.6
+ */
+ smp_ltk_encrypt(conn, hcon->sec_level);
+ return 0;
+ }
+
+ if (sec_level > hcon->pending_sec_level)
+ hcon->pending_sec_level = sec_level;
+
+ if (smp_ltk_encrypt(conn, hcon->pending_sec_level))
+ return 0;
+
+ smp = smp_chan_create(conn);
+ if (!smp)
+ return SMP_UNSPECIFIED;
+
+ if (!hci_dev_test_flag(hdev, HCI_BONDABLE) &&
+ (auth & SMP_AUTH_BONDING))
+ return SMP_PAIRING_NOTSUPP;
+
+ skb_pull(skb, sizeof(*rp));
+
+ smp_send_pairing_req(smp, auth);
+
+ return 0;
+}
+
+static void smp_send_security_req(struct smp_chan *smp, __u8 auth)
+{
+ struct smp_cmd_security_req cp;
+
+ cp.auth_req = auth;
+ smp_send_cmd(smp->conn, SMP_CMD_SECURITY_REQ, sizeof(cp), &cp);
+ SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_REQ);
+
+ clear_bit(SMP_FLAG_INITIATOR, &smp->flags);
+}
+
+int smp_conn_security(struct hci_conn *hcon, __u8 sec_level)
+{
+ struct l2cap_conn *conn = hcon->l2cap_data;
+ struct l2cap_chan *chan;
+ struct smp_chan *smp;
+ __u8 authreq;
+ int ret;
+
+ bt_dev_dbg(hcon->hdev, "conn %p hcon %p level 0x%2.2x", conn, hcon,
+ sec_level);
+
+ /* This may be NULL if there's an unexpected disconnection */
+ if (!conn)
+ return 1;
+
+ if (!hci_dev_test_flag(hcon->hdev, HCI_LE_ENABLED))
+ return 1;
+
+ if (smp_sufficient_security(hcon, sec_level, SMP_USE_LTK))
+ return 1;
+
+ if (sec_level > hcon->pending_sec_level)
+ hcon->pending_sec_level = sec_level;
+
+ if (hcon->role == HCI_ROLE_MASTER)
+ if (smp_ltk_encrypt(conn, hcon->pending_sec_level))
+ return 0;
+
+ chan = conn->smp;
+ if (!chan) {
+ bt_dev_err(hcon->hdev, "security requested but not available");
+ return 1;
+ }
+
+ l2cap_chan_lock(chan);
+
+ /* If SMP is already in progress ignore this request */
+ if (chan->data) {
+ ret = 0;
+ goto unlock;
+ }
+
+ smp = smp_chan_create(conn);
+ if (!smp) {
+ ret = 1;
+ goto unlock;
+ }
+
+ authreq = seclevel_to_authreq(sec_level);
+
+ if (hci_dev_test_flag(hcon->hdev, HCI_SC_ENABLED)) {
+ authreq |= SMP_AUTH_SC;
+ if (hci_dev_test_flag(hcon->hdev, HCI_SSP_ENABLED))
+ authreq |= SMP_AUTH_CT2;
+ }
+
+ /* Don't attempt to set MITM if setting is overridden by debugfs
+ * Needed to pass certification test SM/MAS/PKE/BV-01-C
+ */
+ if (!hci_dev_test_flag(hcon->hdev, HCI_FORCE_NO_MITM)) {
+ /* Require MITM if IO Capability allows or the security level
+ * requires it.
+ */
+ if (hcon->io_capability != HCI_IO_NO_INPUT_OUTPUT ||
+ hcon->pending_sec_level > BT_SECURITY_MEDIUM)
+ authreq |= SMP_AUTH_MITM;
+ }
+
+ if (hcon->role == HCI_ROLE_MASTER)
+ smp_send_pairing_req(smp, authreq);
+ else
+ smp_send_security_req(smp, authreq);
+
+ ret = 0;
+
+unlock:
+ l2cap_chan_unlock(chan);
+ return ret;
+}
+
+int smp_cancel_and_remove_pairing(struct hci_dev *hdev, bdaddr_t *bdaddr,
+ u8 addr_type)
+{
+ struct hci_conn *hcon;
+ struct l2cap_conn *conn;
+ struct l2cap_chan *chan;
+ struct smp_chan *smp;
+ int err;
+
+ err = hci_remove_ltk(hdev, bdaddr, addr_type);
+ hci_remove_irk(hdev, bdaddr, addr_type);
+
+ hcon = hci_conn_hash_lookup_le(hdev, bdaddr, addr_type);
+ if (!hcon)
+ goto done;
+
+ conn = hcon->l2cap_data;
+ if (!conn)
+ goto done;
+
+ chan = conn->smp;
+ if (!chan)
+ goto done;
+
+ l2cap_chan_lock(chan);
+
+ smp = chan->data;
+ if (smp) {
+ /* Set keys to NULL to make sure smp_failure() does not try to
+ * remove and free already invalidated rcu list entries. */
+ smp->ltk = NULL;
+ smp->responder_ltk = NULL;
+ smp->remote_irk = NULL;
+
+ if (test_bit(SMP_FLAG_COMPLETE, &smp->flags))
+ smp_failure(conn, 0);
+ else
+ smp_failure(conn, SMP_UNSPECIFIED);
+ err = 0;
+ }
+
+ l2cap_chan_unlock(chan);
+
+done:
+ return err;
+}
+
+static int smp_cmd_encrypt_info(struct l2cap_conn *conn, struct sk_buff *skb)
+{
+ struct smp_cmd_encrypt_info *rp = (void *) skb->data;
+ struct l2cap_chan *chan = conn->smp;
+ struct smp_chan *smp = chan->data;
+
+ bt_dev_dbg(conn->hcon->hdev, "conn %p", conn);
+
+ if (skb->len < sizeof(*rp))
+ return SMP_INVALID_PARAMS;
+
+ /* Pairing is aborted if any blocked keys are distributed */
+ if (hci_is_blocked_key(conn->hcon->hdev, HCI_BLOCKED_KEY_TYPE_LTK,
+ rp->ltk)) {
+ bt_dev_warn_ratelimited(conn->hcon->hdev,
+ "LTK blocked for %pMR",
+ &conn->hcon->dst);
+ return SMP_INVALID_PARAMS;
+ }
+
+ SMP_ALLOW_CMD(smp, SMP_CMD_INITIATOR_IDENT);
+
+ skb_pull(skb, sizeof(*rp));
+
+ memcpy(smp->tk, rp->ltk, sizeof(smp->tk));
+
+ return 0;
+}
+
+static int smp_cmd_initiator_ident(struct l2cap_conn *conn, struct sk_buff *skb)
+{
+ struct smp_cmd_initiator_ident *rp = (void *)skb->data;
+ struct l2cap_chan *chan = conn->smp;
+ struct smp_chan *smp = chan->data;
+ struct hci_dev *hdev = conn->hcon->hdev;
+ struct hci_conn *hcon = conn->hcon;
+ struct smp_ltk *ltk;
+ u8 authenticated;
+
+ bt_dev_dbg(hdev, "conn %p", conn);
+
+ if (skb->len < sizeof(*rp))
+ return SMP_INVALID_PARAMS;
+
+ /* Mark the information as received */
+ smp->remote_key_dist &= ~SMP_DIST_ENC_KEY;
+
+ if (smp->remote_key_dist & SMP_DIST_ID_KEY)
+ SMP_ALLOW_CMD(smp, SMP_CMD_IDENT_INFO);
+ else if (smp->remote_key_dist & SMP_DIST_SIGN)
+ SMP_ALLOW_CMD(smp, SMP_CMD_SIGN_INFO);
+
+ skb_pull(skb, sizeof(*rp));
+
+ authenticated = (hcon->sec_level == BT_SECURITY_HIGH);
+ ltk = hci_add_ltk(hdev, &hcon->dst, hcon->dst_type, SMP_LTK,
+ authenticated, smp->tk, smp->enc_key_size,
+ rp->ediv, rp->rand);
+ smp->ltk = ltk;
+ if (!(smp->remote_key_dist & KEY_DIST_MASK))
+ smp_distribute_keys(smp);
+
+ return 0;
+}
+
+static int smp_cmd_ident_info(struct l2cap_conn *conn, struct sk_buff *skb)
+{
+ struct smp_cmd_ident_info *info = (void *) skb->data;
+ struct l2cap_chan *chan = conn->smp;
+ struct smp_chan *smp = chan->data;
+
+ bt_dev_dbg(conn->hcon->hdev, "");
+
+ if (skb->len < sizeof(*info))
+ return SMP_INVALID_PARAMS;
+
+ /* Pairing is aborted if any blocked keys are distributed */
+ if (hci_is_blocked_key(conn->hcon->hdev, HCI_BLOCKED_KEY_TYPE_IRK,
+ info->irk)) {
+ bt_dev_warn_ratelimited(conn->hcon->hdev,
+ "Identity key blocked for %pMR",
+ &conn->hcon->dst);
+ return SMP_INVALID_PARAMS;
+ }
+
+ SMP_ALLOW_CMD(smp, SMP_CMD_IDENT_ADDR_INFO);
+
+ skb_pull(skb, sizeof(*info));
+
+ memcpy(smp->irk, info->irk, 16);
+
+ return 0;
+}
+
+static int smp_cmd_ident_addr_info(struct l2cap_conn *conn,
+ struct sk_buff *skb)
+{
+ struct smp_cmd_ident_addr_info *info = (void *) skb->data;
+ struct l2cap_chan *chan = conn->smp;
+ struct smp_chan *smp = chan->data;
+ struct hci_conn *hcon = conn->hcon;
+ bdaddr_t rpa;
+
+ bt_dev_dbg(hcon->hdev, "");
+
+ if (skb->len < sizeof(*info))
+ return SMP_INVALID_PARAMS;
+
+ /* Mark the information as received */
+ smp->remote_key_dist &= ~SMP_DIST_ID_KEY;
+
+ if (smp->remote_key_dist & SMP_DIST_SIGN)
+ SMP_ALLOW_CMD(smp, SMP_CMD_SIGN_INFO);
+
+ skb_pull(skb, sizeof(*info));
+
+ /* Strictly speaking the Core Specification (4.1) allows sending
+ * an empty address which would force us to rely on just the IRK
+ * as "identity information". However, since such
+ * implementations are not known of and in order to not over
+ * complicate our implementation, simply pretend that we never
+ * received an IRK for such a device.
+ *
+ * The Identity Address must also be a Static Random or Public
+ * Address, which hci_is_identity_address() checks for.
+ */
+ if (!bacmp(&info->bdaddr, BDADDR_ANY) ||
+ !hci_is_identity_address(&info->bdaddr, info->addr_type)) {
+ bt_dev_err(hcon->hdev, "ignoring IRK with no identity address");
+ goto distribute;
+ }
+
+ /* Drop IRK if peer is using identity address during pairing but is
+ * providing different address as identity information.
+ *
+ * Microsoft Surface Precision Mouse is known to have this bug.
+ */
+ if (hci_is_identity_address(&hcon->dst, hcon->dst_type) &&
+ (bacmp(&info->bdaddr, &hcon->dst) ||
+ info->addr_type != hcon->dst_type)) {
+ bt_dev_err(hcon->hdev,
+ "ignoring IRK with invalid identity address");
+ goto distribute;
+ }
+
+ bacpy(&smp->id_addr, &info->bdaddr);
+ smp->id_addr_type = info->addr_type;
+
+ if (hci_bdaddr_is_rpa(&hcon->dst, hcon->dst_type))
+ bacpy(&rpa, &hcon->dst);
+ else
+ bacpy(&rpa, BDADDR_ANY);
+
+ smp->remote_irk = hci_add_irk(conn->hcon->hdev, &smp->id_addr,
+ smp->id_addr_type, smp->irk, &rpa);
+
+distribute:
+ if (!(smp->remote_key_dist & KEY_DIST_MASK))
+ smp_distribute_keys(smp);
+
+ return 0;
+}
+
+static int smp_cmd_sign_info(struct l2cap_conn *conn, struct sk_buff *skb)
+{
+ struct smp_cmd_sign_info *rp = (void *) skb->data;
+ struct l2cap_chan *chan = conn->smp;
+ struct smp_chan *smp = chan->data;
+ struct smp_csrk *csrk;
+
+ bt_dev_dbg(conn->hcon->hdev, "conn %p", conn);
+
+ if (skb->len < sizeof(*rp))
+ return SMP_INVALID_PARAMS;
+
+ /* Mark the information as received */
+ smp->remote_key_dist &= ~SMP_DIST_SIGN;
+
+ skb_pull(skb, sizeof(*rp));
+
+ csrk = kzalloc(sizeof(*csrk), GFP_KERNEL);
+ if (csrk) {
+ if (conn->hcon->sec_level > BT_SECURITY_MEDIUM)
+ csrk->type = MGMT_CSRK_REMOTE_AUTHENTICATED;
+ else
+ csrk->type = MGMT_CSRK_REMOTE_UNAUTHENTICATED;
+ memcpy(csrk->val, rp->csrk, sizeof(csrk->val));
+ }
+ smp->csrk = csrk;
+ smp_distribute_keys(smp);
+
+ return 0;
+}
+
+static u8 sc_select_method(struct smp_chan *smp)
+{
+ struct smp_cmd_pairing *local, *remote;
+ u8 local_mitm, remote_mitm, local_io, remote_io, method;
+
+ if (test_bit(SMP_FLAG_REMOTE_OOB, &smp->flags) ||
+ test_bit(SMP_FLAG_LOCAL_OOB, &smp->flags))
+ return REQ_OOB;
+
+ /* The preq/prsp contain the raw Pairing Request/Response PDUs
+ * which are needed as inputs to some crypto functions. To get
+ * the "struct smp_cmd_pairing" from them we need to skip the
+ * first byte which contains the opcode.
+ */
+ if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) {
+ local = (void *) &smp->preq[1];
+ remote = (void *) &smp->prsp[1];
+ } else {
+ local = (void *) &smp->prsp[1];
+ remote = (void *) &smp->preq[1];
+ }
+
+ local_io = local->io_capability;
+ remote_io = remote->io_capability;
+
+ local_mitm = (local->auth_req & SMP_AUTH_MITM);
+ remote_mitm = (remote->auth_req & SMP_AUTH_MITM);
+
+ /* If either side wants MITM, look up the method from the table,
+ * otherwise use JUST WORKS.
+ */
+ if (local_mitm || remote_mitm)
+ method = get_auth_method(smp, local_io, remote_io);
+ else
+ method = JUST_WORKS;
+
+ /* Don't confirm locally initiated pairing attempts */
+ if (method == JUST_CFM && test_bit(SMP_FLAG_INITIATOR, &smp->flags))
+ method = JUST_WORKS;
+
+ return method;
+}
+
+static int smp_cmd_public_key(struct l2cap_conn *conn, struct sk_buff *skb)
+{
+ struct smp_cmd_public_key *key = (void *) skb->data;
+ struct hci_conn *hcon = conn->hcon;
+ struct l2cap_chan *chan = conn->smp;
+ struct smp_chan *smp = chan->data;
+ struct hci_dev *hdev = hcon->hdev;
+ struct crypto_kpp *tfm_ecdh;
+ struct smp_cmd_pairing_confirm cfm;
+ int err;
+
+ bt_dev_dbg(hdev, "conn %p", conn);
+
+ if (skb->len < sizeof(*key))
+ return SMP_INVALID_PARAMS;
+
+ /* Check if remote and local public keys are the same and debug key is
+ * not in use.
+ */
+ if (!test_bit(SMP_FLAG_DEBUG_KEY, &smp->flags) &&
+ !crypto_memneq(key, smp->local_pk, 64)) {
+ bt_dev_err(hdev, "Remote and local public keys are identical");
+ return SMP_UNSPECIFIED;
+ }
+
+ memcpy(smp->remote_pk, key, 64);
+
+ if (test_bit(SMP_FLAG_REMOTE_OOB, &smp->flags)) {
+ err = smp_f4(smp->tfm_cmac, smp->remote_pk, smp->remote_pk,
+ smp->rr, 0, cfm.confirm_val);
+ if (err)
+ return SMP_UNSPECIFIED;
+
+ if (crypto_memneq(cfm.confirm_val, smp->pcnf, 16))
+ return SMP_CONFIRM_FAILED;
+ }
+
+ /* Non-initiating device sends its public key after receiving
+ * the key from the initiating device.
+ */
+ if (!test_bit(SMP_FLAG_INITIATOR, &smp->flags)) {
+ err = sc_send_public_key(smp);
+ if (err)
+ return err;
+ }
+
+ SMP_DBG("Remote Public Key X: %32phN", smp->remote_pk);
+ SMP_DBG("Remote Public Key Y: %32phN", smp->remote_pk + 32);
+
+ /* Compute the shared secret on the same crypto tfm on which the private
+ * key was set/generated.
+ */
+ if (test_bit(SMP_FLAG_LOCAL_OOB, &smp->flags)) {
+ struct l2cap_chan *hchan = hdev->smp_data;
+ struct smp_dev *smp_dev;
+
+ if (!hchan || !hchan->data)
+ return SMP_UNSPECIFIED;
+
+ smp_dev = hchan->data;
+
+ tfm_ecdh = smp_dev->tfm_ecdh;
+ } else {
+ tfm_ecdh = smp->tfm_ecdh;
+ }
+
+ if (compute_ecdh_secret(tfm_ecdh, smp->remote_pk, smp->dhkey))
+ return SMP_UNSPECIFIED;
+
+ SMP_DBG("DHKey %32phN", smp->dhkey);
+
+ set_bit(SMP_FLAG_REMOTE_PK, &smp->flags);
+
+ smp->method = sc_select_method(smp);
+
+ bt_dev_dbg(hdev, "selected method 0x%02x", smp->method);
+
+ /* JUST_WORKS and JUST_CFM result in an unauthenticated key */
+ if (smp->method == JUST_WORKS || smp->method == JUST_CFM)
+ hcon->pending_sec_level = BT_SECURITY_MEDIUM;
+ else
+ hcon->pending_sec_level = BT_SECURITY_FIPS;
+
+ if (!crypto_memneq(debug_pk, smp->remote_pk, 64))
+ set_bit(SMP_FLAG_DEBUG_KEY, &smp->flags);
+
+ if (smp->method == DSP_PASSKEY) {
+ get_random_bytes(&hcon->passkey_notify,
+ sizeof(hcon->passkey_notify));
+ hcon->passkey_notify %= 1000000;
+ hcon->passkey_entered = 0;
+ smp->passkey_round = 0;
+ if (mgmt_user_passkey_notify(hdev, &hcon->dst, hcon->type,
+ hcon->dst_type,
+ hcon->passkey_notify,
+ hcon->passkey_entered))
+ return SMP_UNSPECIFIED;
+ SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_CONFIRM);
+ return sc_passkey_round(smp, SMP_CMD_PUBLIC_KEY);
+ }
+
+ if (smp->method == REQ_OOB) {
+ if (test_bit(SMP_FLAG_INITIATOR, &smp->flags))
+ smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM,
+ sizeof(smp->prnd), smp->prnd);
+
+ SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RANDOM);
+
+ return 0;
+ }
+
+ if (test_bit(SMP_FLAG_INITIATOR, &smp->flags))
+ SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_CONFIRM);
+
+ if (smp->method == REQ_PASSKEY) {
+ if (mgmt_user_passkey_request(hdev, &hcon->dst, hcon->type,
+ hcon->dst_type))
+ return SMP_UNSPECIFIED;
+ SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_CONFIRM);
+ set_bit(SMP_FLAG_WAIT_USER, &smp->flags);
+ return 0;
+ }
+
+ /* The Initiating device waits for the non-initiating device to
+ * send the confirm value.
+ */
+ if (test_bit(SMP_FLAG_INITIATOR, &smp->flags))
+ return 0;
+
+ err = smp_f4(smp->tfm_cmac, smp->local_pk, smp->remote_pk, smp->prnd,
+ 0, cfm.confirm_val);
+ if (err)
+ return SMP_UNSPECIFIED;
+
+ smp_send_cmd(conn, SMP_CMD_PAIRING_CONFIRM, sizeof(cfm), &cfm);
+ SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RANDOM);
+
+ return 0;
+}
+
+static int smp_cmd_dhkey_check(struct l2cap_conn *conn, struct sk_buff *skb)
+{
+ struct smp_cmd_dhkey_check *check = (void *) skb->data;
+ struct l2cap_chan *chan = conn->smp;
+ struct hci_conn *hcon = conn->hcon;
+ struct smp_chan *smp = chan->data;
+ u8 a[7], b[7], *local_addr, *remote_addr;
+ u8 io_cap[3], r[16], e[16];
+ int err;
+
+ bt_dev_dbg(hcon->hdev, "conn %p", conn);
+
+ if (skb->len < sizeof(*check))
+ return SMP_INVALID_PARAMS;
+
+ memcpy(a, &hcon->init_addr, 6);
+ memcpy(b, &hcon->resp_addr, 6);
+ a[6] = hcon->init_addr_type;
+ b[6] = hcon->resp_addr_type;
+
+ if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) {
+ local_addr = a;
+ remote_addr = b;
+ memcpy(io_cap, &smp->prsp[1], 3);
+ } else {
+ local_addr = b;
+ remote_addr = a;
+ memcpy(io_cap, &smp->preq[1], 3);
+ }
+
+ memset(r, 0, sizeof(r));
+
+ if (smp->method == REQ_PASSKEY || smp->method == DSP_PASSKEY)
+ put_unaligned_le32(hcon->passkey_notify, r);
+ else if (smp->method == REQ_OOB)
+ memcpy(r, smp->lr, 16);
+
+ err = smp_f6(smp->tfm_cmac, smp->mackey, smp->rrnd, smp->prnd, r,
+ io_cap, remote_addr, local_addr, e);
+ if (err)
+ return SMP_UNSPECIFIED;
+
+ if (crypto_memneq(check->e, e, 16))
+ return SMP_DHKEY_CHECK_FAILED;
+
+ if (!test_bit(SMP_FLAG_INITIATOR, &smp->flags)) {
+ if (test_bit(SMP_FLAG_WAIT_USER, &smp->flags)) {
+ set_bit(SMP_FLAG_DHKEY_PENDING, &smp->flags);
+ return 0;
+ }
+
+ /* Responder sends DHKey check as response to initiator */
+ sc_dhkey_check(smp);
+ }
+
+ sc_add_ltk(smp);
+
+ if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) {
+ hci_le_start_enc(hcon, 0, 0, smp->tk, smp->enc_key_size);
+ hcon->enc_key_size = smp->enc_key_size;
+ }
+
+ return 0;
+}
+
+static int smp_cmd_keypress_notify(struct l2cap_conn *conn,
+ struct sk_buff *skb)
+{
+ struct smp_cmd_keypress_notify *kp = (void *) skb->data;
+
+ bt_dev_dbg(conn->hcon->hdev, "value 0x%02x", kp->value);
+
+ return 0;
+}
+
+static int smp_sig_channel(struct l2cap_chan *chan, struct sk_buff *skb)
+{
+ struct l2cap_conn *conn = chan->conn;
+ struct hci_conn *hcon = conn->hcon;
+ struct smp_chan *smp;
+ __u8 code, reason;
+ int err = 0;
+
+ if (skb->len < 1)
+ return -EILSEQ;
+
+ if (!hci_dev_test_flag(hcon->hdev, HCI_LE_ENABLED)) {
+ reason = SMP_PAIRING_NOTSUPP;
+ goto done;
+ }
+
+ code = skb->data[0];
+ skb_pull(skb, sizeof(code));
+
+ smp = chan->data;
+
+ if (code > SMP_CMD_MAX)
+ goto drop;
+
+ if (smp && !test_and_clear_bit(code, &smp->allow_cmd)) {
+ /* If there is a context and the command is not allowed consider
+ * it a failure so the session is cleanup properly.
+ */
+ switch (code) {
+ case SMP_CMD_IDENT_INFO:
+ case SMP_CMD_IDENT_ADDR_INFO:
+ case SMP_CMD_SIGN_INFO:
+ /* 3.6.1. Key distribution and generation
+ *
+ * A device may reject a distributed key by sending the
+ * Pairing Failed command with the reason set to
+ * "Key Rejected".
+ */
+ smp_failure(conn, SMP_KEY_REJECTED);
+ break;
+ }
+ goto drop;
+ }
+
+ /* If we don't have a context the only allowed commands are
+ * pairing request and security request.
+ */
+ if (!smp && code != SMP_CMD_PAIRING_REQ && code != SMP_CMD_SECURITY_REQ)
+ goto drop;
+
+ switch (code) {
+ case SMP_CMD_PAIRING_REQ:
+ reason = smp_cmd_pairing_req(conn, skb);
+ break;
+
+ case SMP_CMD_PAIRING_FAIL:
+ smp_failure(conn, 0);
+ err = -EPERM;
+ break;
+
+ case SMP_CMD_PAIRING_RSP:
+ reason = smp_cmd_pairing_rsp(conn, skb);
+ break;
+
+ case SMP_CMD_SECURITY_REQ:
+ reason = smp_cmd_security_req(conn, skb);
+ break;
+
+ case SMP_CMD_PAIRING_CONFIRM:
+ reason = smp_cmd_pairing_confirm(conn, skb);
+ break;
+
+ case SMP_CMD_PAIRING_RANDOM:
+ reason = smp_cmd_pairing_random(conn, skb);
+ break;
+
+ case SMP_CMD_ENCRYPT_INFO:
+ reason = smp_cmd_encrypt_info(conn, skb);
+ break;
+
+ case SMP_CMD_INITIATOR_IDENT:
+ reason = smp_cmd_initiator_ident(conn, skb);
+ break;
+
+ case SMP_CMD_IDENT_INFO:
+ reason = smp_cmd_ident_info(conn, skb);
+ break;
+
+ case SMP_CMD_IDENT_ADDR_INFO:
+ reason = smp_cmd_ident_addr_info(conn, skb);
+ break;
+
+ case SMP_CMD_SIGN_INFO:
+ reason = smp_cmd_sign_info(conn, skb);
+ break;
+
+ case SMP_CMD_PUBLIC_KEY:
+ reason = smp_cmd_public_key(conn, skb);
+ break;
+
+ case SMP_CMD_DHKEY_CHECK:
+ reason = smp_cmd_dhkey_check(conn, skb);
+ break;
+
+ case SMP_CMD_KEYPRESS_NOTIFY:
+ reason = smp_cmd_keypress_notify(conn, skb);
+ break;
+
+ default:
+ bt_dev_dbg(hcon->hdev, "Unknown command code 0x%2.2x", code);
+ reason = SMP_CMD_NOTSUPP;
+ goto done;
+ }
+
+done:
+ if (!err) {
+ if (reason)
+ smp_failure(conn, reason);
+ kfree_skb(skb);
+ }
+
+ return err;
+
+drop:
+ bt_dev_err(hcon->hdev, "unexpected SMP command 0x%02x from %pMR",
+ code, &hcon->dst);
+ kfree_skb(skb);
+ return 0;
+}
+
+static void smp_teardown_cb(struct l2cap_chan *chan, int err)
+{
+ struct l2cap_conn *conn = chan->conn;
+
+ bt_dev_dbg(conn->hcon->hdev, "chan %p", chan);
+
+ if (chan->data)
+ smp_chan_destroy(conn);
+
+ conn->smp = NULL;
+ l2cap_chan_put(chan);
+}
+
+static void bredr_pairing(struct l2cap_chan *chan)
+{
+ struct l2cap_conn *conn = chan->conn;
+ struct hci_conn *hcon = conn->hcon;
+ struct hci_dev *hdev = hcon->hdev;
+ struct smp_chan *smp;
+
+ bt_dev_dbg(hdev, "chan %p", chan);
+
+ /* Only new pairings are interesting */
+ if (!test_bit(HCI_CONN_NEW_LINK_KEY, &hcon->flags))
+ return;
+
+ /* Don't bother if we're not encrypted */
+ if (!test_bit(HCI_CONN_ENCRYPT, &hcon->flags))
+ return;
+
+ /* Only initiator may initiate SMP over BR/EDR */
+ if (hcon->role != HCI_ROLE_MASTER)
+ return;
+
+ /* Secure Connections support must be enabled */
+ if (!hci_dev_test_flag(hdev, HCI_SC_ENABLED))
+ return;
+
+ /* BR/EDR must use Secure Connections for SMP */
+ if (!test_bit(HCI_CONN_AES_CCM, &hcon->flags) &&
+ !hci_dev_test_flag(hdev, HCI_FORCE_BREDR_SMP))
+ return;
+
+ /* If our LE support is not enabled don't do anything */
+ if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED))
+ return;
+
+ /* Don't bother if remote LE support is not enabled */
+ if (!lmp_host_le_capable(hcon))
+ return;
+
+ /* Remote must support SMP fixed chan for BR/EDR */
+ if (!(conn->remote_fixed_chan & L2CAP_FC_SMP_BREDR))
+ return;
+
+ /* Don't bother if SMP is already ongoing */
+ if (chan->data)
+ return;
+
+ smp = smp_chan_create(conn);
+ if (!smp) {
+ bt_dev_err(hdev, "unable to create SMP context for BR/EDR");
+ return;
+ }
+
+ set_bit(SMP_FLAG_SC, &smp->flags);
+
+ bt_dev_dbg(hdev, "starting SMP over BR/EDR");
+
+ smp_send_pairing_req(smp, 0x00);
+}
+
+static void smp_resume_cb(struct l2cap_chan *chan)
+{
+ struct smp_chan *smp = chan->data;
+ struct l2cap_conn *conn = chan->conn;
+ struct hci_conn *hcon = conn->hcon;
+
+ bt_dev_dbg(hcon->hdev, "chan %p", chan);
+
+ if (hcon->type == ACL_LINK) {
+ bredr_pairing(chan);
+ return;
+ }
+
+ if (!smp)
+ return;
+
+ if (!test_bit(HCI_CONN_ENCRYPT, &hcon->flags))
+ return;
+
+ cancel_delayed_work(&smp->security_timer);
+
+ smp_distribute_keys(smp);
+}
+
+static void smp_ready_cb(struct l2cap_chan *chan)
+{
+ struct l2cap_conn *conn = chan->conn;
+ struct hci_conn *hcon = conn->hcon;
+
+ bt_dev_dbg(hcon->hdev, "chan %p", chan);
+
+ /* No need to call l2cap_chan_hold() here since we already own
+ * the reference taken in smp_new_conn_cb(). This is just the
+ * first time that we tie it to a specific pointer. The code in
+ * l2cap_core.c ensures that there's no risk this function won't
+ * get called if smp_new_conn_cb was previously called.
+ */
+ conn->smp = chan;
+
+ if (hcon->type == ACL_LINK && test_bit(HCI_CONN_ENCRYPT, &hcon->flags))
+ bredr_pairing(chan);
+}
+
+static int smp_recv_cb(struct l2cap_chan *chan, struct sk_buff *skb)
+{
+ int err;
+
+ bt_dev_dbg(chan->conn->hcon->hdev, "chan %p", chan);
+
+ err = smp_sig_channel(chan, skb);
+ if (err) {
+ struct smp_chan *smp = chan->data;
+
+ if (smp)
+ cancel_delayed_work_sync(&smp->security_timer);
+
+ hci_disconnect(chan->conn->hcon, HCI_ERROR_AUTH_FAILURE);
+ }
+
+ return err;
+}
+
+static struct sk_buff *smp_alloc_skb_cb(struct l2cap_chan *chan,
+ unsigned long hdr_len,
+ unsigned long len, int nb)
+{
+ struct sk_buff *skb;
+
+ skb = bt_skb_alloc(hdr_len + len, GFP_KERNEL);
+ if (!skb)
+ return ERR_PTR(-ENOMEM);
+
+ skb->priority = HCI_PRIO_MAX;
+ bt_cb(skb)->l2cap.chan = chan;
+
+ return skb;
+}
+
+static const struct l2cap_ops smp_chan_ops = {
+ .name = "Security Manager",
+ .ready = smp_ready_cb,
+ .recv = smp_recv_cb,
+ .alloc_skb = smp_alloc_skb_cb,
+ .teardown = smp_teardown_cb,
+ .resume = smp_resume_cb,
+
+ .new_connection = l2cap_chan_no_new_connection,
+ .state_change = l2cap_chan_no_state_change,
+ .close = l2cap_chan_no_close,
+ .defer = l2cap_chan_no_defer,
+ .suspend = l2cap_chan_no_suspend,
+ .set_shutdown = l2cap_chan_no_set_shutdown,
+ .get_sndtimeo = l2cap_chan_no_get_sndtimeo,
+};
+
+static inline struct l2cap_chan *smp_new_conn_cb(struct l2cap_chan *pchan)
+{
+ struct l2cap_chan *chan;
+
+ BT_DBG("pchan %p", pchan);
+
+ chan = l2cap_chan_create();
+ if (!chan)
+ return NULL;
+
+ chan->chan_type = pchan->chan_type;
+ chan->ops = &smp_chan_ops;
+ chan->scid = pchan->scid;
+ chan->dcid = chan->scid;
+ chan->imtu = pchan->imtu;
+ chan->omtu = pchan->omtu;
+ chan->mode = pchan->mode;
+
+ /* Other L2CAP channels may request SMP routines in order to
+ * change the security level. This means that the SMP channel
+ * lock must be considered in its own category to avoid lockdep
+ * warnings.
+ */
+ atomic_set(&chan->nesting, L2CAP_NESTING_SMP);
+
+ BT_DBG("created chan %p", chan);
+
+ return chan;
+}
+
+static const struct l2cap_ops smp_root_chan_ops = {
+ .name = "Security Manager Root",
+ .new_connection = smp_new_conn_cb,
+
+ /* None of these are implemented for the root channel */
+ .close = l2cap_chan_no_close,
+ .alloc_skb = l2cap_chan_no_alloc_skb,
+ .recv = l2cap_chan_no_recv,
+ .state_change = l2cap_chan_no_state_change,
+ .teardown = l2cap_chan_no_teardown,
+ .ready = l2cap_chan_no_ready,
+ .defer = l2cap_chan_no_defer,
+ .suspend = l2cap_chan_no_suspend,
+ .resume = l2cap_chan_no_resume,
+ .set_shutdown = l2cap_chan_no_set_shutdown,
+ .get_sndtimeo = l2cap_chan_no_get_sndtimeo,
+};
+
+static struct l2cap_chan *smp_add_cid(struct hci_dev *hdev, u16 cid)
+{
+ struct l2cap_chan *chan;
+ struct smp_dev *smp;
+ struct crypto_shash *tfm_cmac;
+ struct crypto_kpp *tfm_ecdh;
+
+ if (cid == L2CAP_CID_SMP_BREDR) {
+ smp = NULL;
+ goto create_chan;
+ }
+
+ smp = kzalloc(sizeof(*smp), GFP_KERNEL);
+ if (!smp)
+ return ERR_PTR(-ENOMEM);
+
+ tfm_cmac = crypto_alloc_shash("cmac(aes)", 0, 0);
+ if (IS_ERR(tfm_cmac)) {
+ bt_dev_err(hdev, "Unable to create CMAC crypto context");
+ kfree_sensitive(smp);
+ return ERR_CAST(tfm_cmac);
+ }
+
+ tfm_ecdh = crypto_alloc_kpp("ecdh-nist-p256", 0, 0);
+ if (IS_ERR(tfm_ecdh)) {
+ bt_dev_err(hdev, "Unable to create ECDH crypto context");
+ crypto_free_shash(tfm_cmac);
+ kfree_sensitive(smp);
+ return ERR_CAST(tfm_ecdh);
+ }
+
+ smp->local_oob = false;
+ smp->tfm_cmac = tfm_cmac;
+ smp->tfm_ecdh = tfm_ecdh;
+
+create_chan:
+ chan = l2cap_chan_create();
+ if (!chan) {
+ if (smp) {
+ crypto_free_shash(smp->tfm_cmac);
+ crypto_free_kpp(smp->tfm_ecdh);
+ kfree_sensitive(smp);
+ }
+ return ERR_PTR(-ENOMEM);
+ }
+
+ chan->data = smp;
+
+ l2cap_add_scid(chan, cid);
+
+ l2cap_chan_set_defaults(chan);
+
+ if (cid == L2CAP_CID_SMP) {
+ u8 bdaddr_type;
+
+ hci_copy_identity_address(hdev, &chan->src, &bdaddr_type);
+
+ if (bdaddr_type == ADDR_LE_DEV_PUBLIC)
+ chan->src_type = BDADDR_LE_PUBLIC;
+ else
+ chan->src_type = BDADDR_LE_RANDOM;
+ } else {
+ bacpy(&chan->src, &hdev->bdaddr);
+ chan->src_type = BDADDR_BREDR;
+ }
+
+ chan->state = BT_LISTEN;
+ chan->mode = L2CAP_MODE_BASIC;
+ chan->imtu = L2CAP_DEFAULT_MTU;
+ chan->ops = &smp_root_chan_ops;
+
+ /* Set correct nesting level for a parent/listening channel */
+ atomic_set(&chan->nesting, L2CAP_NESTING_PARENT);
+
+ return chan;
+}
+
+static void smp_del_chan(struct l2cap_chan *chan)
+{
+ struct smp_dev *smp;
+
+ BT_DBG("chan %p", chan);
+
+ smp = chan->data;
+ if (smp) {
+ chan->data = NULL;
+ crypto_free_shash(smp->tfm_cmac);
+ crypto_free_kpp(smp->tfm_ecdh);
+ kfree_sensitive(smp);
+ }
+
+ l2cap_chan_put(chan);
+}
+
+int smp_force_bredr(struct hci_dev *hdev, bool enable)
+{
+ if (enable == hci_dev_test_flag(hdev, HCI_FORCE_BREDR_SMP))
+ return -EALREADY;
+
+ if (enable) {
+ struct l2cap_chan *chan;
+
+ chan = smp_add_cid(hdev, L2CAP_CID_SMP_BREDR);
+ if (IS_ERR(chan))
+ return PTR_ERR(chan);
+
+ hdev->smp_bredr_data = chan;
+ } else {
+ struct l2cap_chan *chan;
+
+ chan = hdev->smp_bredr_data;
+ hdev->smp_bredr_data = NULL;
+ smp_del_chan(chan);
+ }
+
+ hci_dev_change_flag(hdev, HCI_FORCE_BREDR_SMP);
+
+ return 0;
+}
+
+int smp_register(struct hci_dev *hdev)
+{
+ struct l2cap_chan *chan;
+
+ bt_dev_dbg(hdev, "");
+
+ /* If the controller does not support Low Energy operation, then
+ * there is also no need to register any SMP channel.
+ */
+ if (!lmp_le_capable(hdev))
+ return 0;
+
+ if (WARN_ON(hdev->smp_data)) {
+ chan = hdev->smp_data;
+ hdev->smp_data = NULL;
+ smp_del_chan(chan);
+ }
+
+ chan = smp_add_cid(hdev, L2CAP_CID_SMP);
+ if (IS_ERR(chan))
+ return PTR_ERR(chan);
+
+ hdev->smp_data = chan;
+
+ if (!lmp_sc_capable(hdev)) {
+ /* Flag can be already set here (due to power toggle) */
+ if (!hci_dev_test_flag(hdev, HCI_FORCE_BREDR_SMP))
+ return 0;
+ }
+
+ if (WARN_ON(hdev->smp_bredr_data)) {
+ chan = hdev->smp_bredr_data;
+ hdev->smp_bredr_data = NULL;
+ smp_del_chan(chan);
+ }
+
+ chan = smp_add_cid(hdev, L2CAP_CID_SMP_BREDR);
+ if (IS_ERR(chan)) {
+ int err = PTR_ERR(chan);
+ chan = hdev->smp_data;
+ hdev->smp_data = NULL;
+ smp_del_chan(chan);
+ return err;
+ }
+
+ hdev->smp_bredr_data = chan;
+
+ return 0;
+}
+
+void smp_unregister(struct hci_dev *hdev)
+{
+ struct l2cap_chan *chan;
+
+ if (hdev->smp_bredr_data) {
+ chan = hdev->smp_bredr_data;
+ hdev->smp_bredr_data = NULL;
+ smp_del_chan(chan);
+ }
+
+ if (hdev->smp_data) {
+ chan = hdev->smp_data;
+ hdev->smp_data = NULL;
+ smp_del_chan(chan);
+ }
+}
+
+#if IS_ENABLED(CONFIG_BT_SELFTEST_SMP)
+
+static int __init test_debug_key(struct crypto_kpp *tfm_ecdh)
+{
+ u8 pk[64];
+ int err;
+
+ err = set_ecdh_privkey(tfm_ecdh, debug_sk);
+ if (err)
+ return err;
+
+ err = generate_ecdh_public_key(tfm_ecdh, pk);
+ if (err)
+ return err;
+
+ if (crypto_memneq(pk, debug_pk, 64))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int __init test_ah(void)
+{
+ const u8 irk[16] = {
+ 0x9b, 0x7d, 0x39, 0x0a, 0xa6, 0x10, 0x10, 0x34,
+ 0x05, 0xad, 0xc8, 0x57, 0xa3, 0x34, 0x02, 0xec };
+ const u8 r[3] = { 0x94, 0x81, 0x70 };
+ const u8 exp[3] = { 0xaa, 0xfb, 0x0d };
+ u8 res[3];
+ int err;
+
+ err = smp_ah(irk, r, res);
+ if (err)
+ return err;
+
+ if (crypto_memneq(res, exp, 3))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int __init test_c1(void)
+{
+ const u8 k[16] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
+ const u8 r[16] = {
+ 0xe0, 0x2e, 0x70, 0xc6, 0x4e, 0x27, 0x88, 0x63,
+ 0x0e, 0x6f, 0xad, 0x56, 0x21, 0xd5, 0x83, 0x57 };
+ const u8 preq[7] = { 0x01, 0x01, 0x00, 0x00, 0x10, 0x07, 0x07 };
+ const u8 pres[7] = { 0x02, 0x03, 0x00, 0x00, 0x08, 0x00, 0x05 };
+ const u8 _iat = 0x01;
+ const u8 _rat = 0x00;
+ const bdaddr_t ra = { { 0xb6, 0xb5, 0xb4, 0xb3, 0xb2, 0xb1 } };
+ const bdaddr_t ia = { { 0xa6, 0xa5, 0xa4, 0xa3, 0xa2, 0xa1 } };
+ const u8 exp[16] = {
+ 0x86, 0x3b, 0xf1, 0xbe, 0xc5, 0x4d, 0xa7, 0xd2,
+ 0xea, 0x88, 0x89, 0x87, 0xef, 0x3f, 0x1e, 0x1e };
+ u8 res[16];
+ int err;
+
+ err = smp_c1(k, r, preq, pres, _iat, &ia, _rat, &ra, res);
+ if (err)
+ return err;
+
+ if (crypto_memneq(res, exp, 16))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int __init test_s1(void)
+{
+ const u8 k[16] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
+ const u8 r1[16] = {
+ 0x88, 0x77, 0x66, 0x55, 0x44, 0x33, 0x22, 0x11 };
+ const u8 r2[16] = {
+ 0x00, 0xff, 0xee, 0xdd, 0xcc, 0xbb, 0xaa, 0x99 };
+ const u8 exp[16] = {
+ 0x62, 0xa0, 0x6d, 0x79, 0xae, 0x16, 0x42, 0x5b,
+ 0x9b, 0xf4, 0xb0, 0xe8, 0xf0, 0xe1, 0x1f, 0x9a };
+ u8 res[16];
+ int err;
+
+ err = smp_s1(k, r1, r2, res);
+ if (err)
+ return err;
+
+ if (crypto_memneq(res, exp, 16))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int __init test_f4(struct crypto_shash *tfm_cmac)
+{
+ const u8 u[32] = {
+ 0xe6, 0x9d, 0x35, 0x0e, 0x48, 0x01, 0x03, 0xcc,
+ 0xdb, 0xfd, 0xf4, 0xac, 0x11, 0x91, 0xf4, 0xef,
+ 0xb9, 0xa5, 0xf9, 0xe9, 0xa7, 0x83, 0x2c, 0x5e,
+ 0x2c, 0xbe, 0x97, 0xf2, 0xd2, 0x03, 0xb0, 0x20 };
+ const u8 v[32] = {
+ 0xfd, 0xc5, 0x7f, 0xf4, 0x49, 0xdd, 0x4f, 0x6b,
+ 0xfb, 0x7c, 0x9d, 0xf1, 0xc2, 0x9a, 0xcb, 0x59,
+ 0x2a, 0xe7, 0xd4, 0xee, 0xfb, 0xfc, 0x0a, 0x90,
+ 0x9a, 0xbb, 0xf6, 0x32, 0x3d, 0x8b, 0x18, 0x55 };
+ const u8 x[16] = {
+ 0xab, 0xae, 0x2b, 0x71, 0xec, 0xb2, 0xff, 0xff,
+ 0x3e, 0x73, 0x77, 0xd1, 0x54, 0x84, 0xcb, 0xd5 };
+ const u8 z = 0x00;
+ const u8 exp[16] = {
+ 0x2d, 0x87, 0x74, 0xa9, 0xbe, 0xa1, 0xed, 0xf1,
+ 0x1c, 0xbd, 0xa9, 0x07, 0xf1, 0x16, 0xc9, 0xf2 };
+ u8 res[16];
+ int err;
+
+ err = smp_f4(tfm_cmac, u, v, x, z, res);
+ if (err)
+ return err;
+
+ if (crypto_memneq(res, exp, 16))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int __init test_f5(struct crypto_shash *tfm_cmac)
+{
+ const u8 w[32] = {
+ 0x98, 0xa6, 0xbf, 0x73, 0xf3, 0x34, 0x8d, 0x86,
+ 0xf1, 0x66, 0xf8, 0xb4, 0x13, 0x6b, 0x79, 0x99,
+ 0x9b, 0x7d, 0x39, 0x0a, 0xa6, 0x10, 0x10, 0x34,
+ 0x05, 0xad, 0xc8, 0x57, 0xa3, 0x34, 0x02, 0xec };
+ const u8 n1[16] = {
+ 0xab, 0xae, 0x2b, 0x71, 0xec, 0xb2, 0xff, 0xff,
+ 0x3e, 0x73, 0x77, 0xd1, 0x54, 0x84, 0xcb, 0xd5 };
+ const u8 n2[16] = {
+ 0xcf, 0xc4, 0x3d, 0xff, 0xf7, 0x83, 0x65, 0x21,
+ 0x6e, 0x5f, 0xa7, 0x25, 0xcc, 0xe7, 0xe8, 0xa6 };
+ const u8 a1[7] = { 0xce, 0xbf, 0x37, 0x37, 0x12, 0x56, 0x00 };
+ const u8 a2[7] = { 0xc1, 0xcf, 0x2d, 0x70, 0x13, 0xa7, 0x00 };
+ const u8 exp_ltk[16] = {
+ 0x38, 0x0a, 0x75, 0x94, 0xb5, 0x22, 0x05, 0x98,
+ 0x23, 0xcd, 0xd7, 0x69, 0x11, 0x79, 0x86, 0x69 };
+ const u8 exp_mackey[16] = {
+ 0x20, 0x6e, 0x63, 0xce, 0x20, 0x6a, 0x3f, 0xfd,
+ 0x02, 0x4a, 0x08, 0xa1, 0x76, 0xf1, 0x65, 0x29 };
+ u8 mackey[16], ltk[16];
+ int err;
+
+ err = smp_f5(tfm_cmac, w, n1, n2, a1, a2, mackey, ltk);
+ if (err)
+ return err;
+
+ if (crypto_memneq(mackey, exp_mackey, 16))
+ return -EINVAL;
+
+ if (crypto_memneq(ltk, exp_ltk, 16))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int __init test_f6(struct crypto_shash *tfm_cmac)
+{
+ const u8 w[16] = {
+ 0x20, 0x6e, 0x63, 0xce, 0x20, 0x6a, 0x3f, 0xfd,
+ 0x02, 0x4a, 0x08, 0xa1, 0x76, 0xf1, 0x65, 0x29 };
+ const u8 n1[16] = {
+ 0xab, 0xae, 0x2b, 0x71, 0xec, 0xb2, 0xff, 0xff,
+ 0x3e, 0x73, 0x77, 0xd1, 0x54, 0x84, 0xcb, 0xd5 };
+ const u8 n2[16] = {
+ 0xcf, 0xc4, 0x3d, 0xff, 0xf7, 0x83, 0x65, 0x21,
+ 0x6e, 0x5f, 0xa7, 0x25, 0xcc, 0xe7, 0xe8, 0xa6 };
+ const u8 r[16] = {
+ 0xc8, 0x0f, 0x2d, 0x0c, 0xd2, 0x42, 0xda, 0x08,
+ 0x54, 0xbb, 0x53, 0xb4, 0x3b, 0x34, 0xa3, 0x12 };
+ const u8 io_cap[3] = { 0x02, 0x01, 0x01 };
+ const u8 a1[7] = { 0xce, 0xbf, 0x37, 0x37, 0x12, 0x56, 0x00 };
+ const u8 a2[7] = { 0xc1, 0xcf, 0x2d, 0x70, 0x13, 0xa7, 0x00 };
+ const u8 exp[16] = {
+ 0x61, 0x8f, 0x95, 0xda, 0x09, 0x0b, 0x6c, 0xd2,
+ 0xc5, 0xe8, 0xd0, 0x9c, 0x98, 0x73, 0xc4, 0xe3 };
+ u8 res[16];
+ int err;
+
+ err = smp_f6(tfm_cmac, w, n1, n2, r, io_cap, a1, a2, res);
+ if (err)
+ return err;
+
+ if (crypto_memneq(res, exp, 16))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int __init test_g2(struct crypto_shash *tfm_cmac)
+{
+ const u8 u[32] = {
+ 0xe6, 0x9d, 0x35, 0x0e, 0x48, 0x01, 0x03, 0xcc,
+ 0xdb, 0xfd, 0xf4, 0xac, 0x11, 0x91, 0xf4, 0xef,
+ 0xb9, 0xa5, 0xf9, 0xe9, 0xa7, 0x83, 0x2c, 0x5e,
+ 0x2c, 0xbe, 0x97, 0xf2, 0xd2, 0x03, 0xb0, 0x20 };
+ const u8 v[32] = {
+ 0xfd, 0xc5, 0x7f, 0xf4, 0x49, 0xdd, 0x4f, 0x6b,
+ 0xfb, 0x7c, 0x9d, 0xf1, 0xc2, 0x9a, 0xcb, 0x59,
+ 0x2a, 0xe7, 0xd4, 0xee, 0xfb, 0xfc, 0x0a, 0x90,
+ 0x9a, 0xbb, 0xf6, 0x32, 0x3d, 0x8b, 0x18, 0x55 };
+ const u8 x[16] = {
+ 0xab, 0xae, 0x2b, 0x71, 0xec, 0xb2, 0xff, 0xff,
+ 0x3e, 0x73, 0x77, 0xd1, 0x54, 0x84, 0xcb, 0xd5 };
+ const u8 y[16] = {
+ 0xcf, 0xc4, 0x3d, 0xff, 0xf7, 0x83, 0x65, 0x21,
+ 0x6e, 0x5f, 0xa7, 0x25, 0xcc, 0xe7, 0xe8, 0xa6 };
+ const u32 exp_val = 0x2f9ed5ba % 1000000;
+ u32 val;
+ int err;
+
+ err = smp_g2(tfm_cmac, u, v, x, y, &val);
+ if (err)
+ return err;
+
+ if (val != exp_val)
+ return -EINVAL;
+
+ return 0;
+}
+
+static int __init test_h6(struct crypto_shash *tfm_cmac)
+{
+ const u8 w[16] = {
+ 0x9b, 0x7d, 0x39, 0x0a, 0xa6, 0x10, 0x10, 0x34,
+ 0x05, 0xad, 0xc8, 0x57, 0xa3, 0x34, 0x02, 0xec };
+ const u8 key_id[4] = { 0x72, 0x62, 0x65, 0x6c };
+ const u8 exp[16] = {
+ 0x99, 0x63, 0xb1, 0x80, 0xe2, 0xa9, 0xd3, 0xe8,
+ 0x1c, 0xc9, 0x6d, 0xe7, 0x02, 0xe1, 0x9a, 0x2d };
+ u8 res[16];
+ int err;
+
+ err = smp_h6(tfm_cmac, w, key_id, res);
+ if (err)
+ return err;
+
+ if (crypto_memneq(res, exp, 16))
+ return -EINVAL;
+
+ return 0;
+}
+
+static char test_smp_buffer[32];
+
+static ssize_t test_smp_read(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ return simple_read_from_buffer(user_buf, count, ppos, test_smp_buffer,
+ strlen(test_smp_buffer));
+}
+
+static const struct file_operations test_smp_fops = {
+ .open = simple_open,
+ .read = test_smp_read,
+ .llseek = default_llseek,
+};
+
+static int __init run_selftests(struct crypto_shash *tfm_cmac,
+ struct crypto_kpp *tfm_ecdh)
+{
+ ktime_t calltime, delta, rettime;
+ unsigned long long duration;
+ int err;
+
+ calltime = ktime_get();
+
+ err = test_debug_key(tfm_ecdh);
+ if (err) {
+ BT_ERR("debug_key test failed");
+ goto done;
+ }
+
+ err = test_ah();
+ if (err) {
+ BT_ERR("smp_ah test failed");
+ goto done;
+ }
+
+ err = test_c1();
+ if (err) {
+ BT_ERR("smp_c1 test failed");
+ goto done;
+ }
+
+ err = test_s1();
+ if (err) {
+ BT_ERR("smp_s1 test failed");
+ goto done;
+ }
+
+ err = test_f4(tfm_cmac);
+ if (err) {
+ BT_ERR("smp_f4 test failed");
+ goto done;
+ }
+
+ err = test_f5(tfm_cmac);
+ if (err) {
+ BT_ERR("smp_f5 test failed");
+ goto done;
+ }
+
+ err = test_f6(tfm_cmac);
+ if (err) {
+ BT_ERR("smp_f6 test failed");
+ goto done;
+ }
+
+ err = test_g2(tfm_cmac);
+ if (err) {
+ BT_ERR("smp_g2 test failed");
+ goto done;
+ }
+
+ err = test_h6(tfm_cmac);
+ if (err) {
+ BT_ERR("smp_h6 test failed");
+ goto done;
+ }
+
+ rettime = ktime_get();
+ delta = ktime_sub(rettime, calltime);
+ duration = (unsigned long long) ktime_to_ns(delta) >> 10;
+
+ BT_INFO("SMP test passed in %llu usecs", duration);
+
+done:
+ if (!err)
+ snprintf(test_smp_buffer, sizeof(test_smp_buffer),
+ "PASS (%llu usecs)\n", duration);
+ else
+ snprintf(test_smp_buffer, sizeof(test_smp_buffer), "FAIL\n");
+
+ debugfs_create_file("selftest_smp", 0444, bt_debugfs, NULL,
+ &test_smp_fops);
+
+ return err;
+}
+
+int __init bt_selftest_smp(void)
+{
+ struct crypto_shash *tfm_cmac;
+ struct crypto_kpp *tfm_ecdh;
+ int err;
+
+ tfm_cmac = crypto_alloc_shash("cmac(aes)", 0, 0);
+ if (IS_ERR(tfm_cmac)) {
+ BT_ERR("Unable to create CMAC crypto context");
+ return PTR_ERR(tfm_cmac);
+ }
+
+ tfm_ecdh = crypto_alloc_kpp("ecdh-nist-p256", 0, 0);
+ if (IS_ERR(tfm_ecdh)) {
+ BT_ERR("Unable to create ECDH crypto context");
+ crypto_free_shash(tfm_cmac);
+ return PTR_ERR(tfm_ecdh);
+ }
+
+ err = run_selftests(tfm_cmac, tfm_ecdh);
+
+ crypto_free_shash(tfm_cmac);
+ crypto_free_kpp(tfm_ecdh);
+
+ return err;
+}
+
+#endif
diff --git a/net/bluetooth/smp.h b/net/bluetooth/smp.h
new file mode 100644
index 000000000000..c5da53dfab04
--- /dev/null
+++ b/net/bluetooth/smp.h
@@ -0,0 +1,215 @@
+/*
+ BlueZ - Bluetooth protocol stack for Linux
+ Copyright (C) 2011 Nokia Corporation and/or its subsidiary(-ies).
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License version 2 as
+ published by the Free Software Foundation;
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+ IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+ CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+ ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+ COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+ SOFTWARE IS DISCLAIMED.
+*/
+
+#ifndef __SMP_H
+#define __SMP_H
+
+struct smp_command_hdr {
+ __u8 code;
+} __packed;
+
+#define SMP_CMD_PAIRING_REQ 0x01
+#define SMP_CMD_PAIRING_RSP 0x02
+struct smp_cmd_pairing {
+ __u8 io_capability;
+ __u8 oob_flag;
+ __u8 auth_req;
+ __u8 max_key_size;
+ __u8 init_key_dist;
+ __u8 resp_key_dist;
+} __packed;
+
+#define SMP_IO_DISPLAY_ONLY 0x00
+#define SMP_IO_DISPLAY_YESNO 0x01
+#define SMP_IO_KEYBOARD_ONLY 0x02
+#define SMP_IO_NO_INPUT_OUTPUT 0x03
+#define SMP_IO_KEYBOARD_DISPLAY 0x04
+
+#define SMP_OOB_NOT_PRESENT 0x00
+#define SMP_OOB_PRESENT 0x01
+
+#define SMP_DIST_ENC_KEY 0x01
+#define SMP_DIST_ID_KEY 0x02
+#define SMP_DIST_SIGN 0x04
+#define SMP_DIST_LINK_KEY 0x08
+
+#define SMP_AUTH_NONE 0x00
+#define SMP_AUTH_BONDING 0x01
+#define SMP_AUTH_MITM 0x04
+#define SMP_AUTH_SC 0x08
+#define SMP_AUTH_KEYPRESS 0x10
+#define SMP_AUTH_CT2 0x20
+
+#define SMP_CMD_PAIRING_CONFIRM 0x03
+struct smp_cmd_pairing_confirm {
+ __u8 confirm_val[16];
+} __packed;
+
+#define SMP_CMD_PAIRING_RANDOM 0x04
+struct smp_cmd_pairing_random {
+ __u8 rand_val[16];
+} __packed;
+
+#define SMP_CMD_PAIRING_FAIL 0x05
+struct smp_cmd_pairing_fail {
+ __u8 reason;
+} __packed;
+
+#define SMP_CMD_ENCRYPT_INFO 0x06
+struct smp_cmd_encrypt_info {
+ __u8 ltk[16];
+} __packed;
+
+#define SMP_CMD_INITIATOR_IDENT 0x07
+struct smp_cmd_initiator_ident {
+ __le16 ediv;
+ __le64 rand;
+} __packed;
+
+#define SMP_CMD_IDENT_INFO 0x08
+struct smp_cmd_ident_info {
+ __u8 irk[16];
+} __packed;
+
+#define SMP_CMD_IDENT_ADDR_INFO 0x09
+struct smp_cmd_ident_addr_info {
+ __u8 addr_type;
+ bdaddr_t bdaddr;
+} __packed;
+
+#define SMP_CMD_SIGN_INFO 0x0a
+struct smp_cmd_sign_info {
+ __u8 csrk[16];
+} __packed;
+
+#define SMP_CMD_SECURITY_REQ 0x0b
+struct smp_cmd_security_req {
+ __u8 auth_req;
+} __packed;
+
+#define SMP_CMD_PUBLIC_KEY 0x0c
+struct smp_cmd_public_key {
+ __u8 x[32];
+ __u8 y[32];
+} __packed;
+
+#define SMP_CMD_DHKEY_CHECK 0x0d
+struct smp_cmd_dhkey_check {
+ __u8 e[16];
+} __packed;
+
+#define SMP_CMD_KEYPRESS_NOTIFY 0x0e
+struct smp_cmd_keypress_notify {
+ __u8 value;
+} __packed;
+
+#define SMP_CMD_MAX 0x0e
+
+#define SMP_PASSKEY_ENTRY_FAILED 0x01
+#define SMP_OOB_NOT_AVAIL 0x02
+#define SMP_AUTH_REQUIREMENTS 0x03
+#define SMP_CONFIRM_FAILED 0x04
+#define SMP_PAIRING_NOTSUPP 0x05
+#define SMP_ENC_KEY_SIZE 0x06
+#define SMP_CMD_NOTSUPP 0x07
+#define SMP_UNSPECIFIED 0x08
+#define SMP_REPEATED_ATTEMPTS 0x09
+#define SMP_INVALID_PARAMS 0x0a
+#define SMP_DHKEY_CHECK_FAILED 0x0b
+#define SMP_NUMERIC_COMP_FAILED 0x0c
+#define SMP_BREDR_PAIRING_IN_PROGRESS 0x0d
+#define SMP_CROSS_TRANSP_NOT_ALLOWED 0x0e
+#define SMP_KEY_REJECTED 0x0f
+
+#define SMP_MIN_ENC_KEY_SIZE 7
+#define SMP_MAX_ENC_KEY_SIZE 16
+
+/* LTK types used in internal storage (struct smp_ltk) */
+enum {
+ SMP_STK,
+ SMP_LTK,
+ SMP_LTK_RESPONDER,
+ SMP_LTK_P256,
+ SMP_LTK_P256_DEBUG,
+};
+
+static inline bool smp_ltk_is_sc(struct smp_ltk *key)
+{
+ switch (key->type) {
+ case SMP_LTK_P256:
+ case SMP_LTK_P256_DEBUG:
+ return true;
+ }
+
+ return false;
+}
+
+static inline u8 smp_ltk_sec_level(struct smp_ltk *key)
+{
+ if (key->authenticated) {
+ if (smp_ltk_is_sc(key))
+ return BT_SECURITY_FIPS;
+ else
+ return BT_SECURITY_HIGH;
+ }
+
+ return BT_SECURITY_MEDIUM;
+}
+
+/* Key preferences for smp_sufficient security */
+enum smp_key_pref {
+ SMP_ALLOW_STK,
+ SMP_USE_LTK,
+};
+
+/* SMP Commands */
+int smp_cancel_and_remove_pairing(struct hci_dev *hdev, bdaddr_t *bdaddr,
+ u8 addr_type);
+bool smp_sufficient_security(struct hci_conn *hcon, u8 sec_level,
+ enum smp_key_pref key_pref);
+int smp_conn_security(struct hci_conn *hcon, __u8 sec_level);
+int smp_user_confirm_reply(struct hci_conn *conn, u16 mgmt_op, __le32 passkey);
+
+bool smp_irk_matches(struct hci_dev *hdev, const u8 irk[16],
+ const bdaddr_t *bdaddr);
+int smp_generate_rpa(struct hci_dev *hdev, const u8 irk[16], bdaddr_t *rpa);
+int smp_generate_oob(struct hci_dev *hdev, u8 hash[16], u8 rand[16]);
+
+int smp_force_bredr(struct hci_dev *hdev, bool enable);
+
+int smp_register(struct hci_dev *hdev);
+void smp_unregister(struct hci_dev *hdev);
+
+#if IS_ENABLED(CONFIG_BT_SELFTEST_SMP)
+
+int bt_selftest_smp(void);
+
+#else
+
+static inline int bt_selftest_smp(void)
+{
+ return 0;
+}
+
+#endif
+
+#endif /* __SMP_H */
diff --git a/net/bpf/Makefile b/net/bpf/Makefile
new file mode 100644
index 000000000000..1ebe270bde23
--- /dev/null
+++ b/net/bpf/Makefile
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0-only
+obj-$(CONFIG_BPF_SYSCALL) := test_run.o
+ifeq ($(CONFIG_BPF_JIT),y)
+obj-$(CONFIG_BPF_SYSCALL) += bpf_dummy_struct_ops.o
+endif
diff --git a/net/bpf/bpf_dummy_struct_ops.c b/net/bpf/bpf_dummy_struct_ops.c
new file mode 100644
index 000000000000..812457819b5a
--- /dev/null
+++ b/net/bpf/bpf_dummy_struct_ops.c
@@ -0,0 +1,323 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2021. Huawei Technologies Co., Ltd
+ */
+#include <linux/kernel.h>
+#include <linux/bpf_verifier.h>
+#include <linux/bpf.h>
+#include <linux/btf.h>
+
+static struct bpf_struct_ops bpf_bpf_dummy_ops;
+
+/* A common type for test_N with return value in bpf_dummy_ops */
+typedef int (*dummy_ops_test_ret_fn)(struct bpf_dummy_ops_state *state, ...);
+
+static int dummy_ops_test_ret_function(struct bpf_dummy_ops_state *state, ...)
+{
+ return 0;
+}
+
+struct bpf_dummy_ops_test_args {
+ u64 args[MAX_BPF_FUNC_ARGS];
+ struct bpf_dummy_ops_state state;
+};
+
+static struct btf *bpf_dummy_ops_btf;
+
+static struct bpf_dummy_ops_test_args *
+dummy_ops_init_args(const union bpf_attr *kattr, unsigned int nr)
+{
+ __u32 size_in;
+ struct bpf_dummy_ops_test_args *args;
+ void __user *ctx_in;
+ void __user *u_state;
+
+ size_in = kattr->test.ctx_size_in;
+ if (size_in != sizeof(u64) * nr)
+ return ERR_PTR(-EINVAL);
+
+ args = kzalloc(sizeof(*args), GFP_KERNEL);
+ if (!args)
+ return ERR_PTR(-ENOMEM);
+
+ ctx_in = u64_to_user_ptr(kattr->test.ctx_in);
+ if (copy_from_user(args->args, ctx_in, size_in))
+ goto out;
+
+ /* args[0] is 0 means state argument of test_N will be NULL */
+ u_state = u64_to_user_ptr(args->args[0]);
+ if (u_state && copy_from_user(&args->state, u_state,
+ sizeof(args->state)))
+ goto out;
+
+ return args;
+out:
+ kfree(args);
+ return ERR_PTR(-EFAULT);
+}
+
+static int dummy_ops_copy_args(struct bpf_dummy_ops_test_args *args)
+{
+ void __user *u_state;
+
+ u_state = u64_to_user_ptr(args->args[0]);
+ if (u_state && copy_to_user(u_state, &args->state, sizeof(args->state)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int dummy_ops_call_op(void *image, struct bpf_dummy_ops_test_args *args)
+{
+ dummy_ops_test_ret_fn test = (void *)image + cfi_get_offset();
+ struct bpf_dummy_ops_state *state = NULL;
+
+ /* state needs to be NULL if args[0] is 0 */
+ if (args->args[0])
+ state = &args->state;
+ return test(state, args->args[1], args->args[2],
+ args->args[3], args->args[4]);
+}
+
+static const struct bpf_ctx_arg_aux *find_ctx_arg_info(struct bpf_prog_aux *aux, int offset)
+{
+ int i;
+
+ for (i = 0; i < aux->ctx_arg_info_size; i++)
+ if (aux->ctx_arg_info[i].offset == offset)
+ return &aux->ctx_arg_info[i];
+
+ return NULL;
+}
+
+/* There is only one check at the moment:
+ * - zero should not be passed for pointer parameters not marked as nullable.
+ */
+static int check_test_run_args(struct bpf_prog *prog, struct bpf_dummy_ops_test_args *args)
+{
+ const struct btf_type *func_proto = prog->aux->attach_func_proto;
+
+ for (u32 arg_no = 0; arg_no < btf_type_vlen(func_proto) ; ++arg_no) {
+ const struct btf_param *param = &btf_params(func_proto)[arg_no];
+ const struct bpf_ctx_arg_aux *info;
+ const struct btf_type *t;
+ int offset;
+
+ if (args->args[arg_no] != 0)
+ continue;
+
+ /* Program is validated already, so there is no need
+ * to check if t is NULL.
+ */
+ t = btf_type_skip_modifiers(bpf_dummy_ops_btf, param->type, NULL);
+ if (!btf_type_is_ptr(t))
+ continue;
+
+ offset = btf_ctx_arg_offset(bpf_dummy_ops_btf, func_proto, arg_no);
+ info = find_ctx_arg_info(prog->aux, offset);
+ if (info && type_may_be_null(info->reg_type))
+ continue;
+
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+extern const struct bpf_link_ops bpf_struct_ops_link_lops;
+
+int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr,
+ union bpf_attr __user *uattr)
+{
+ const struct bpf_struct_ops *st_ops = &bpf_bpf_dummy_ops;
+ const struct btf_type *func_proto;
+ struct bpf_dummy_ops_test_args *args;
+ struct bpf_tramp_links *tlinks = NULL;
+ struct bpf_tramp_link *link = NULL;
+ void *image = NULL;
+ unsigned int op_idx;
+ u32 image_off = 0;
+ int prog_ret;
+ s32 type_id;
+ int err;
+
+ type_id = btf_find_by_name_kind(bpf_dummy_ops_btf,
+ bpf_bpf_dummy_ops.name,
+ BTF_KIND_STRUCT);
+ if (type_id < 0)
+ return -EINVAL;
+ if (prog->aux->attach_btf_id != type_id)
+ return -EOPNOTSUPP;
+
+ func_proto = prog->aux->attach_func_proto;
+ args = dummy_ops_init_args(kattr, btf_type_vlen(func_proto));
+ if (IS_ERR(args))
+ return PTR_ERR(args);
+
+ err = check_test_run_args(prog, args);
+ if (err)
+ goto out;
+
+ tlinks = kcalloc(BPF_TRAMP_MAX, sizeof(*tlinks), GFP_KERNEL);
+ if (!tlinks) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ link = kzalloc(sizeof(*link), GFP_USER);
+ if (!link) {
+ err = -ENOMEM;
+ goto out;
+ }
+ /* prog doesn't take the ownership of the reference from caller */
+ bpf_prog_inc(prog);
+ bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_link_lops, prog,
+ prog->expected_attach_type);
+
+ op_idx = prog->expected_attach_type;
+ err = bpf_struct_ops_prepare_trampoline(tlinks, link,
+ &st_ops->func_models[op_idx],
+ &dummy_ops_test_ret_function,
+ &image, &image_off,
+ true);
+ if (err < 0)
+ goto out;
+
+ err = arch_protect_bpf_trampoline(image, PAGE_SIZE);
+ if (err)
+ goto out;
+ prog_ret = dummy_ops_call_op(image, args);
+
+ err = dummy_ops_copy_args(args);
+ if (err)
+ goto out;
+ if (put_user(prog_ret, &uattr->test.retval))
+ err = -EFAULT;
+out:
+ kfree(args);
+ bpf_struct_ops_image_free(image);
+ if (link)
+ bpf_link_put(&link->link);
+ kfree(tlinks);
+ return err;
+}
+
+static int bpf_dummy_init(struct btf *btf)
+{
+ bpf_dummy_ops_btf = btf;
+ return 0;
+}
+
+static bool bpf_dummy_ops_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ return bpf_tracing_btf_ctx_access(off, size, type, prog, info);
+}
+
+static int bpf_dummy_ops_check_member(const struct btf_type *t,
+ const struct btf_member *member,
+ const struct bpf_prog *prog)
+{
+ u32 moff = __btf_member_bit_offset(t, member) / 8;
+
+ switch (moff) {
+ case offsetof(struct bpf_dummy_ops, test_sleepable):
+ break;
+ default:
+ if (prog->sleepable)
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int bpf_dummy_ops_btf_struct_access(struct bpf_verifier_log *log,
+ const struct bpf_reg_state *reg,
+ int off, int size)
+{
+ const struct btf_type *state;
+ const struct btf_type *t;
+ s32 type_id;
+
+ type_id = btf_find_by_name_kind(reg->btf, "bpf_dummy_ops_state",
+ BTF_KIND_STRUCT);
+ if (type_id < 0)
+ return -EINVAL;
+
+ t = btf_type_by_id(reg->btf, reg->btf_id);
+ state = btf_type_by_id(reg->btf, type_id);
+ if (t != state) {
+ bpf_log(log, "only access to bpf_dummy_ops_state is supported\n");
+ return -EACCES;
+ }
+
+ if (off + size > sizeof(struct bpf_dummy_ops_state)) {
+ bpf_log(log, "write access at off %d with size %d\n", off, size);
+ return -EACCES;
+ }
+
+ return NOT_INIT;
+}
+
+static const struct bpf_verifier_ops bpf_dummy_verifier_ops = {
+ .is_valid_access = bpf_dummy_ops_is_valid_access,
+ .btf_struct_access = bpf_dummy_ops_btf_struct_access,
+};
+
+static int bpf_dummy_init_member(const struct btf_type *t,
+ const struct btf_member *member,
+ void *kdata, const void *udata)
+{
+ return -EOPNOTSUPP;
+}
+
+static int bpf_dummy_reg(void *kdata, struct bpf_link *link)
+{
+ return -EOPNOTSUPP;
+}
+
+static void bpf_dummy_unreg(void *kdata, struct bpf_link *link)
+{
+}
+
+static int bpf_dummy_ops__test_1(struct bpf_dummy_ops_state *cb__nullable)
+{
+ return 0;
+}
+
+static int bpf_dummy_test_2(struct bpf_dummy_ops_state *cb, int a1, unsigned short a2,
+ char a3, unsigned long a4)
+{
+ return 0;
+}
+
+static int bpf_dummy_test_sleepable(struct bpf_dummy_ops_state *cb)
+{
+ return 0;
+}
+
+static struct bpf_dummy_ops __bpf_bpf_dummy_ops = {
+ .test_1 = bpf_dummy_ops__test_1,
+ .test_2 = bpf_dummy_test_2,
+ .test_sleepable = bpf_dummy_test_sleepable,
+};
+
+static struct bpf_struct_ops bpf_bpf_dummy_ops = {
+ .verifier_ops = &bpf_dummy_verifier_ops,
+ .init = bpf_dummy_init,
+ .check_member = bpf_dummy_ops_check_member,
+ .init_member = bpf_dummy_init_member,
+ .reg = bpf_dummy_reg,
+ .unreg = bpf_dummy_unreg,
+ .name = "bpf_dummy_ops",
+ .cfi_stubs = &__bpf_bpf_dummy_ops,
+ .owner = THIS_MODULE,
+};
+
+static int __init bpf_dummy_struct_ops_init(void)
+{
+ return register_bpf_struct_ops(&bpf_bpf_dummy_ops, bpf_dummy_ops);
+}
+late_initcall(bpf_dummy_struct_ops_init);
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
new file mode 100644
index 000000000000..655efac6f133
--- /dev/null
+++ b/net/bpf/test_run.c
@@ -0,0 +1,1834 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2017 Facebook
+ */
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/btf_ids.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/vmalloc.h>
+#include <linux/etherdevice.h>
+#include <linux/filter.h>
+#include <linux/rcupdate_trace.h>
+#include <linux/sched/signal.h>
+#include <net/bpf_sk_storage.h>
+#include <net/hotdata.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <net/net_namespace.h>
+#include <net/page_pool/helpers.h>
+#include <linux/error-injection.h>
+#include <linux/smp.h>
+#include <linux/sock_diag.h>
+#include <linux/netfilter.h>
+#include <net/netdev_rx_queue.h>
+#include <net/xdp.h>
+#include <net/netfilter/nf_bpf_link.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/bpf_test_run.h>
+
+struct bpf_test_timer {
+ u32 i;
+ u64 time_start, time_spent;
+};
+
+static void bpf_test_timer_enter(struct bpf_test_timer *t)
+ __acquires(rcu)
+{
+ rcu_read_lock_dont_migrate();
+ t->time_start = ktime_get_ns();
+}
+
+static void bpf_test_timer_leave(struct bpf_test_timer *t)
+ __releases(rcu)
+{
+ t->time_start = 0;
+ rcu_read_unlock_migrate();
+}
+
+static bool bpf_test_timer_continue(struct bpf_test_timer *t, int iterations,
+ u32 repeat, int *err, u32 *duration)
+ __must_hold(rcu)
+{
+ t->i += iterations;
+ if (t->i >= repeat) {
+ /* We're done. */
+ t->time_spent += ktime_get_ns() - t->time_start;
+ do_div(t->time_spent, t->i);
+ *duration = t->time_spent > U32_MAX ? U32_MAX : (u32)t->time_spent;
+ *err = 0;
+ goto reset;
+ }
+
+ if (signal_pending(current)) {
+ /* During iteration: we've been cancelled, abort. */
+ *err = -EINTR;
+ goto reset;
+ }
+
+ if (need_resched()) {
+ /* During iteration: we need to reschedule between runs. */
+ t->time_spent += ktime_get_ns() - t->time_start;
+ bpf_test_timer_leave(t);
+ cond_resched();
+ bpf_test_timer_enter(t);
+ }
+
+ /* Do another round. */
+ return true;
+
+reset:
+ t->i = 0;
+ return false;
+}
+
+/* We put this struct at the head of each page with a context and frame
+ * initialised when the page is allocated, so we don't have to do this on each
+ * repetition of the test run.
+ */
+struct xdp_page_head {
+ struct xdp_buff orig_ctx;
+ struct xdp_buff ctx;
+ union {
+ /* ::data_hard_start starts here */
+ DECLARE_FLEX_ARRAY(struct xdp_frame, frame);
+ DECLARE_FLEX_ARRAY(u8, data);
+ };
+};
+
+struct xdp_test_data {
+ struct xdp_buff *orig_ctx;
+ struct xdp_rxq_info rxq;
+ struct net_device *dev;
+ struct page_pool *pp;
+ struct xdp_frame **frames;
+ struct sk_buff **skbs;
+ struct xdp_mem_info mem;
+ u32 batch_size;
+ u32 frame_cnt;
+};
+
+/* tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c:%MAX_PKT_SIZE
+ * must be updated accordingly this gets changed, otherwise BPF selftests
+ * will fail.
+ */
+#define TEST_XDP_FRAME_SIZE (PAGE_SIZE - sizeof(struct xdp_page_head))
+#define TEST_XDP_MAX_BATCH 256
+
+static void xdp_test_run_init_page(netmem_ref netmem, void *arg)
+{
+ struct xdp_page_head *head =
+ phys_to_virt(page_to_phys(netmem_to_page(netmem)));
+ struct xdp_buff *new_ctx, *orig_ctx;
+ u32 headroom = XDP_PACKET_HEADROOM;
+ struct xdp_test_data *xdp = arg;
+ size_t frm_len, meta_len;
+ struct xdp_frame *frm;
+ void *data;
+
+ orig_ctx = xdp->orig_ctx;
+ frm_len = orig_ctx->data_end - orig_ctx->data_meta;
+ meta_len = orig_ctx->data - orig_ctx->data_meta;
+ headroom -= meta_len;
+
+ new_ctx = &head->ctx;
+ frm = head->frame;
+ data = head->data;
+ memcpy(data + headroom, orig_ctx->data_meta, frm_len);
+
+ xdp_init_buff(new_ctx, TEST_XDP_FRAME_SIZE, &xdp->rxq);
+ xdp_prepare_buff(new_ctx, data, headroom, frm_len, true);
+ new_ctx->data = new_ctx->data_meta + meta_len;
+
+ xdp_update_frame_from_buff(new_ctx, frm);
+ frm->mem_type = new_ctx->rxq->mem.type;
+
+ memcpy(&head->orig_ctx, new_ctx, sizeof(head->orig_ctx));
+}
+
+static int xdp_test_run_setup(struct xdp_test_data *xdp, struct xdp_buff *orig_ctx)
+{
+ struct page_pool *pp;
+ int err = -ENOMEM;
+ struct page_pool_params pp_params = {
+ .order = 0,
+ .flags = 0,
+ .pool_size = xdp->batch_size,
+ .nid = NUMA_NO_NODE,
+ .init_callback = xdp_test_run_init_page,
+ .init_arg = xdp,
+ };
+
+ xdp->frames = kvmalloc_array(xdp->batch_size, sizeof(void *), GFP_KERNEL);
+ if (!xdp->frames)
+ return -ENOMEM;
+
+ xdp->skbs = kvmalloc_array(xdp->batch_size, sizeof(void *), GFP_KERNEL);
+ if (!xdp->skbs)
+ goto err_skbs;
+
+ pp = page_pool_create(&pp_params);
+ if (IS_ERR(pp)) {
+ err = PTR_ERR(pp);
+ goto err_pp;
+ }
+
+ /* will copy 'mem.id' into pp->xdp_mem_id */
+ err = xdp_reg_mem_model(&xdp->mem, MEM_TYPE_PAGE_POOL, pp);
+ if (err)
+ goto err_mmodel;
+
+ xdp->pp = pp;
+
+ /* We create a 'fake' RXQ referencing the original dev, but with an
+ * xdp_mem_info pointing to our page_pool
+ */
+ xdp_rxq_info_reg(&xdp->rxq, orig_ctx->rxq->dev, 0, 0);
+ xdp->rxq.mem.type = MEM_TYPE_PAGE_POOL;
+ xdp->rxq.mem.id = pp->xdp_mem_id;
+ xdp->dev = orig_ctx->rxq->dev;
+ xdp->orig_ctx = orig_ctx;
+
+ return 0;
+
+err_mmodel:
+ page_pool_destroy(pp);
+err_pp:
+ kvfree(xdp->skbs);
+err_skbs:
+ kvfree(xdp->frames);
+ return err;
+}
+
+static void xdp_test_run_teardown(struct xdp_test_data *xdp)
+{
+ xdp_unreg_mem_model(&xdp->mem);
+ page_pool_destroy(xdp->pp);
+ kfree(xdp->frames);
+ kfree(xdp->skbs);
+}
+
+static bool frame_was_changed(const struct xdp_page_head *head)
+{
+ /* xdp_scrub_frame() zeroes the data pointer, flags is the last field,
+ * i.e. has the highest chances to be overwritten. If those two are
+ * untouched, it's most likely safe to skip the context reset.
+ */
+ return head->frame->data != head->orig_ctx.data ||
+ head->frame->flags != head->orig_ctx.flags;
+}
+
+static bool ctx_was_changed(struct xdp_page_head *head)
+{
+ return head->orig_ctx.data != head->ctx.data ||
+ head->orig_ctx.data_meta != head->ctx.data_meta ||
+ head->orig_ctx.data_end != head->ctx.data_end;
+}
+
+static void reset_ctx(struct xdp_page_head *head)
+{
+ if (likely(!frame_was_changed(head) && !ctx_was_changed(head)))
+ return;
+
+ head->ctx.data = head->orig_ctx.data;
+ head->ctx.data_meta = head->orig_ctx.data_meta;
+ head->ctx.data_end = head->orig_ctx.data_end;
+ xdp_update_frame_from_buff(&head->ctx, head->frame);
+ head->frame->mem_type = head->orig_ctx.rxq->mem.type;
+}
+
+static int xdp_recv_frames(struct xdp_frame **frames, int nframes,
+ struct sk_buff **skbs,
+ struct net_device *dev)
+{
+ gfp_t gfp = __GFP_ZERO | GFP_ATOMIC;
+ int i, n;
+ LIST_HEAD(list);
+
+ n = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache, gfp, nframes,
+ (void **)skbs);
+ if (unlikely(n == 0)) {
+ for (i = 0; i < nframes; i++)
+ xdp_return_frame(frames[i]);
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < nframes; i++) {
+ struct xdp_frame *xdpf = frames[i];
+ struct sk_buff *skb = skbs[i];
+
+ skb = __xdp_build_skb_from_frame(xdpf, skb, dev);
+ if (!skb) {
+ xdp_return_frame(xdpf);
+ continue;
+ }
+
+ list_add_tail(&skb->list, &list);
+ }
+ netif_receive_skb_list(&list);
+
+ return 0;
+}
+
+static int xdp_test_run_batch(struct xdp_test_data *xdp, struct bpf_prog *prog,
+ u32 repeat)
+{
+ struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
+ int err = 0, act, ret, i, nframes = 0, batch_sz;
+ struct xdp_frame **frames = xdp->frames;
+ struct bpf_redirect_info *ri;
+ struct xdp_page_head *head;
+ struct xdp_frame *frm;
+ bool redirect = false;
+ struct xdp_buff *ctx;
+ struct page *page;
+
+ batch_sz = min_t(u32, repeat, xdp->batch_size);
+
+ local_bh_disable();
+ bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
+ ri = bpf_net_ctx_get_ri();
+ xdp_set_return_frame_no_direct();
+
+ for (i = 0; i < batch_sz; i++) {
+ page = page_pool_dev_alloc_pages(xdp->pp);
+ if (!page) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ head = phys_to_virt(page_to_phys(page));
+ reset_ctx(head);
+ ctx = &head->ctx;
+ frm = head->frame;
+ xdp->frame_cnt++;
+
+ act = bpf_prog_run_xdp(prog, ctx);
+
+ /* if program changed pkt bounds we need to update the xdp_frame */
+ if (unlikely(ctx_was_changed(head))) {
+ ret = xdp_update_frame_from_buff(ctx, frm);
+ if (ret) {
+ xdp_return_buff(ctx);
+ continue;
+ }
+ }
+
+ switch (act) {
+ case XDP_TX:
+ /* we can't do a real XDP_TX since we're not in the
+ * driver, so turn it into a REDIRECT back to the same
+ * index
+ */
+ ri->tgt_index = xdp->dev->ifindex;
+ ri->map_id = INT_MAX;
+ ri->map_type = BPF_MAP_TYPE_UNSPEC;
+ fallthrough;
+ case XDP_REDIRECT:
+ redirect = true;
+ ret = xdp_do_redirect_frame(xdp->dev, ctx, frm, prog);
+ if (ret)
+ xdp_return_buff(ctx);
+ break;
+ case XDP_PASS:
+ frames[nframes++] = frm;
+ break;
+ default:
+ bpf_warn_invalid_xdp_action(NULL, prog, act);
+ fallthrough;
+ case XDP_DROP:
+ xdp_return_buff(ctx);
+ break;
+ }
+ }
+
+out:
+ if (redirect)
+ xdp_do_flush();
+ if (nframes) {
+ ret = xdp_recv_frames(frames, nframes, xdp->skbs, xdp->dev);
+ if (ret)
+ err = ret;
+ }
+
+ xdp_clear_return_frame_no_direct();
+ bpf_net_ctx_clear(bpf_net_ctx);
+ local_bh_enable();
+ return err;
+}
+
+static int bpf_test_run_xdp_live(struct bpf_prog *prog, struct xdp_buff *ctx,
+ u32 repeat, u32 batch_size, u32 *time)
+
+{
+ struct xdp_test_data xdp = { .batch_size = batch_size };
+ struct bpf_test_timer t = {};
+ int ret;
+
+ if (!repeat)
+ repeat = 1;
+
+ ret = xdp_test_run_setup(&xdp, ctx);
+ if (ret)
+ return ret;
+
+ bpf_test_timer_enter(&t);
+ do {
+ xdp.frame_cnt = 0;
+ ret = xdp_test_run_batch(&xdp, prog, repeat - t.i);
+ if (unlikely(ret < 0))
+ break;
+ } while (bpf_test_timer_continue(&t, xdp.frame_cnt, repeat, &ret, time));
+ bpf_test_timer_leave(&t);
+
+ xdp_test_run_teardown(&xdp);
+ return ret;
+}
+
+static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat,
+ u32 *retval, u32 *time, bool xdp)
+{
+ struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
+ struct bpf_prog_array_item item = {.prog = prog};
+ struct bpf_run_ctx *old_ctx;
+ struct bpf_cg_run_ctx run_ctx;
+ struct bpf_test_timer t = {};
+ enum bpf_cgroup_storage_type stype;
+ int ret;
+
+ for_each_cgroup_storage_type(stype) {
+ item.cgroup_storage[stype] = bpf_cgroup_storage_alloc(prog, stype);
+ if (IS_ERR(item.cgroup_storage[stype])) {
+ item.cgroup_storage[stype] = NULL;
+ for_each_cgroup_storage_type(stype)
+ bpf_cgroup_storage_free(item.cgroup_storage[stype]);
+ return -ENOMEM;
+ }
+ }
+
+ if (!repeat)
+ repeat = 1;
+
+ bpf_test_timer_enter(&t);
+ old_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
+ do {
+ run_ctx.prog_item = &item;
+ local_bh_disable();
+ bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
+
+ if (xdp)
+ *retval = bpf_prog_run_xdp(prog, ctx);
+ else
+ *retval = bpf_prog_run(prog, ctx);
+
+ bpf_net_ctx_clear(bpf_net_ctx);
+ local_bh_enable();
+ } while (bpf_test_timer_continue(&t, 1, repeat, &ret, time));
+ bpf_reset_run_ctx(old_ctx);
+ bpf_test_timer_leave(&t);
+
+ for_each_cgroup_storage_type(stype)
+ bpf_cgroup_storage_free(item.cgroup_storage[stype]);
+
+ return ret;
+}
+
+static int bpf_test_finish(const union bpf_attr *kattr,
+ union bpf_attr __user *uattr, const void *data,
+ struct skb_shared_info *sinfo, u32 size, u32 frag_size,
+ u32 retval, u32 duration)
+{
+ void __user *data_out = u64_to_user_ptr(kattr->test.data_out);
+ int err = -EFAULT;
+ u32 copy_size = size;
+
+ /* Clamp copy if the user has provided a size hint, but copy the full
+ * buffer if not to retain old behaviour.
+ */
+ if (kattr->test.data_size_out &&
+ copy_size > kattr->test.data_size_out) {
+ copy_size = kattr->test.data_size_out;
+ err = -ENOSPC;
+ }
+
+ if (data_out) {
+ int len = sinfo ? copy_size - frag_size : copy_size;
+
+ if (len < 0) {
+ err = -ENOSPC;
+ goto out;
+ }
+
+ if (copy_to_user(data_out, data, len))
+ goto out;
+
+ if (sinfo) {
+ int i, offset = len;
+ u32 data_len;
+
+ for (i = 0; i < sinfo->nr_frags; i++) {
+ skb_frag_t *frag = &sinfo->frags[i];
+
+ if (offset >= copy_size) {
+ err = -ENOSPC;
+ break;
+ }
+
+ data_len = min_t(u32, copy_size - offset,
+ skb_frag_size(frag));
+
+ if (copy_to_user(data_out + offset,
+ skb_frag_address(frag),
+ data_len))
+ goto out;
+
+ offset += data_len;
+ }
+ }
+ }
+
+ if (copy_to_user(&uattr->test.data_size_out, &size, sizeof(size)))
+ goto out;
+ if (copy_to_user(&uattr->test.retval, &retval, sizeof(retval)))
+ goto out;
+ if (copy_to_user(&uattr->test.duration, &duration, sizeof(duration)))
+ goto out;
+ if (err != -ENOSPC)
+ err = 0;
+out:
+ trace_bpf_test_finish(&err);
+ return err;
+}
+
+/* Integer types of various sizes and pointer combinations cover variety of
+ * architecture dependent calling conventions. 7+ can be supported in the
+ * future.
+ */
+__bpf_kfunc_start_defs();
+
+__bpf_kfunc int bpf_fentry_test1(int a)
+{
+ return a + 1;
+}
+EXPORT_SYMBOL_GPL(bpf_fentry_test1);
+
+noinline int bpf_fentry_test2(int a, u64 b)
+{
+ return a + b;
+}
+
+noinline int bpf_fentry_test3(char a, int b, u64 c)
+{
+ return a + b + c;
+}
+
+noinline int bpf_fentry_test4(void *a, char b, int c, u64 d)
+{
+ return (long)a + b + c + d;
+}
+
+noinline int bpf_fentry_test5(u64 a, void *b, short c, int d, u64 e)
+{
+ return a + (long)b + c + d + e;
+}
+
+noinline int bpf_fentry_test6(u64 a, void *b, short c, int d, void *e, u64 f)
+{
+ return a + (long)b + c + d + (long)e + f;
+}
+
+struct bpf_fentry_test_t {
+ struct bpf_fentry_test_t *a;
+};
+
+noinline int bpf_fentry_test7(struct bpf_fentry_test_t *arg)
+{
+ asm volatile ("" : "+r"(arg));
+ return (long)arg;
+}
+
+noinline int bpf_fentry_test8(struct bpf_fentry_test_t *arg)
+{
+ return (long)arg->a;
+}
+
+__bpf_kfunc u32 bpf_fentry_test9(u32 *a)
+{
+ return *a;
+}
+
+noinline int bpf_fentry_test10(const void *a)
+{
+ return (long)a;
+}
+
+noinline void bpf_fentry_test_sinfo(struct skb_shared_info *sinfo)
+{
+}
+
+__bpf_kfunc int bpf_modify_return_test(int a, int *b)
+{
+ *b += 1;
+ return a + *b;
+}
+
+__bpf_kfunc int bpf_modify_return_test2(int a, int *b, short c, int d,
+ void *e, char f, int g)
+{
+ *b += 1;
+ return a + *b + c + d + (long)e + f + g;
+}
+
+__bpf_kfunc int bpf_modify_return_test_tp(int nonce)
+{
+ trace_bpf_trigger_tp(nonce);
+
+ return nonce;
+}
+
+noinline int bpf_fentry_shadow_test(int a)
+{
+ return a + 1;
+}
+
+struct prog_test_member1 {
+ int a;
+};
+
+struct prog_test_member {
+ struct prog_test_member1 m;
+ int c;
+};
+
+struct prog_test_ref_kfunc {
+ int a;
+ int b;
+ struct prog_test_member memb;
+ struct prog_test_ref_kfunc *next;
+ refcount_t cnt;
+};
+
+__bpf_kfunc void bpf_kfunc_call_test_release(struct prog_test_ref_kfunc *p)
+{
+ refcount_dec(&p->cnt);
+}
+
+__bpf_kfunc void bpf_kfunc_call_test_release_dtor(void *p)
+{
+ bpf_kfunc_call_test_release(p);
+}
+CFI_NOSEAL(bpf_kfunc_call_test_release_dtor);
+
+__bpf_kfunc void bpf_kfunc_call_memb_release(struct prog_test_member *p)
+{
+}
+
+__bpf_kfunc void bpf_kfunc_call_memb_release_dtor(void *p)
+{
+}
+CFI_NOSEAL(bpf_kfunc_call_memb_release_dtor);
+
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(bpf_test_modify_return_ids)
+BTF_ID_FLAGS(func, bpf_modify_return_test)
+BTF_ID_FLAGS(func, bpf_modify_return_test2)
+BTF_ID_FLAGS(func, bpf_modify_return_test_tp)
+BTF_ID_FLAGS(func, bpf_fentry_test1, KF_SLEEPABLE)
+BTF_KFUNCS_END(bpf_test_modify_return_ids)
+
+static const struct btf_kfunc_id_set bpf_test_modify_return_set = {
+ .owner = THIS_MODULE,
+ .set = &bpf_test_modify_return_ids,
+};
+
+BTF_KFUNCS_START(test_sk_check_kfunc_ids)
+BTF_ID_FLAGS(func, bpf_kfunc_call_test_release, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_kfunc_call_memb_release, KF_RELEASE)
+BTF_KFUNCS_END(test_sk_check_kfunc_ids)
+
+static void *bpf_test_init(const union bpf_attr *kattr, u32 user_size,
+ u32 size, u32 headroom, u32 tailroom)
+{
+ void __user *data_in = u64_to_user_ptr(kattr->test.data_in);
+ void *data;
+
+ if (user_size > PAGE_SIZE - headroom - tailroom)
+ return ERR_PTR(-EINVAL);
+
+ size = SKB_DATA_ALIGN(size);
+ data = kzalloc(size + headroom + tailroom, GFP_USER);
+ if (!data)
+ return ERR_PTR(-ENOMEM);
+
+ if (copy_from_user(data + headroom, data_in, user_size)) {
+ kfree(data);
+ return ERR_PTR(-EFAULT);
+ }
+
+ return data;
+}
+
+int bpf_prog_test_run_tracing(struct bpf_prog *prog,
+ const union bpf_attr *kattr,
+ union bpf_attr __user *uattr)
+{
+ struct bpf_fentry_test_t arg = {};
+ u16 side_effect = 0, ret = 0;
+ int b = 2, err = -EFAULT;
+ u32 retval = 0;
+
+ if (kattr->test.flags || kattr->test.cpu || kattr->test.batch_size)
+ return -EINVAL;
+
+ switch (prog->expected_attach_type) {
+ case BPF_TRACE_FENTRY:
+ case BPF_TRACE_FEXIT:
+ if (bpf_fentry_test1(1) != 2 ||
+ bpf_fentry_test2(2, 3) != 5 ||
+ bpf_fentry_test3(4, 5, 6) != 15 ||
+ bpf_fentry_test4((void *)7, 8, 9, 10) != 34 ||
+ bpf_fentry_test5(11, (void *)12, 13, 14, 15) != 65 ||
+ bpf_fentry_test6(16, (void *)17, 18, 19, (void *)20, 21) != 111 ||
+ bpf_fentry_test7((struct bpf_fentry_test_t *)0) != 0 ||
+ bpf_fentry_test8(&arg) != 0 ||
+ bpf_fentry_test9(&retval) != 0 ||
+ bpf_fentry_test10((void *)0) != 0)
+ goto out;
+ break;
+ case BPF_MODIFY_RETURN:
+ ret = bpf_modify_return_test(1, &b);
+ if (b != 2)
+ side_effect++;
+ b = 2;
+ ret += bpf_modify_return_test2(1, &b, 3, 4, (void *)5, 6, 7);
+ if (b != 2)
+ side_effect++;
+ break;
+ default:
+ goto out;
+ }
+
+ retval = ((u32)side_effect << 16) | ret;
+ if (copy_to_user(&uattr->test.retval, &retval, sizeof(retval)))
+ goto out;
+
+ err = 0;
+out:
+ trace_bpf_test_finish(&err);
+ return err;
+}
+
+struct bpf_raw_tp_test_run_info {
+ struct bpf_prog *prog;
+ void *ctx;
+ u32 retval;
+};
+
+static void
+__bpf_prog_test_run_raw_tp(void *data)
+{
+ struct bpf_raw_tp_test_run_info *info = data;
+ struct bpf_trace_run_ctx run_ctx = {};
+ struct bpf_run_ctx *old_run_ctx;
+
+ old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
+
+ rcu_read_lock();
+ info->retval = bpf_prog_run(info->prog, info->ctx);
+ rcu_read_unlock();
+
+ bpf_reset_run_ctx(old_run_ctx);
+}
+
+int bpf_prog_test_run_raw_tp(struct bpf_prog *prog,
+ const union bpf_attr *kattr,
+ union bpf_attr __user *uattr)
+{
+ void __user *ctx_in = u64_to_user_ptr(kattr->test.ctx_in);
+ __u32 ctx_size_in = kattr->test.ctx_size_in;
+ struct bpf_raw_tp_test_run_info info;
+ int cpu = kattr->test.cpu, err = 0;
+ int current_cpu;
+
+ /* doesn't support data_in/out, ctx_out, duration, or repeat */
+ if (kattr->test.data_in || kattr->test.data_out ||
+ kattr->test.ctx_out || kattr->test.duration ||
+ kattr->test.repeat || kattr->test.batch_size)
+ return -EINVAL;
+
+ if (ctx_size_in < prog->aux->max_ctx_offset ||
+ ctx_size_in > MAX_BPF_FUNC_ARGS * sizeof(u64))
+ return -EINVAL;
+
+ if ((kattr->test.flags & BPF_F_TEST_RUN_ON_CPU) == 0 && cpu != 0)
+ return -EINVAL;
+
+ if (ctx_size_in) {
+ info.ctx = memdup_user(ctx_in, ctx_size_in);
+ if (IS_ERR(info.ctx))
+ return PTR_ERR(info.ctx);
+ } else {
+ info.ctx = NULL;
+ }
+
+ info.prog = prog;
+
+ current_cpu = get_cpu();
+ if ((kattr->test.flags & BPF_F_TEST_RUN_ON_CPU) == 0 ||
+ cpu == current_cpu) {
+ __bpf_prog_test_run_raw_tp(&info);
+ } else if (cpu >= nr_cpu_ids || !cpu_online(cpu)) {
+ /* smp_call_function_single() also checks cpu_online()
+ * after csd_lock(). However, since cpu is from user
+ * space, let's do an extra quick check to filter out
+ * invalid value before smp_call_function_single().
+ */
+ err = -ENXIO;
+ } else {
+ err = smp_call_function_single(cpu, __bpf_prog_test_run_raw_tp,
+ &info, 1);
+ }
+ put_cpu();
+
+ if (!err &&
+ copy_to_user(&uattr->test.retval, &info.retval, sizeof(u32)))
+ err = -EFAULT;
+
+ kfree(info.ctx);
+ return err;
+}
+
+static void *bpf_ctx_init(const union bpf_attr *kattr, u32 max_size)
+{
+ void __user *data_in = u64_to_user_ptr(kattr->test.ctx_in);
+ void __user *data_out = u64_to_user_ptr(kattr->test.ctx_out);
+ u32 size = kattr->test.ctx_size_in;
+ void *data;
+ int err;
+
+ if (!data_in && !data_out)
+ return NULL;
+
+ data = kzalloc(max_size, GFP_USER);
+ if (!data)
+ return ERR_PTR(-ENOMEM);
+
+ if (data_in) {
+ err = bpf_check_uarg_tail_zero(USER_BPFPTR(data_in), max_size, size);
+ if (err) {
+ kfree(data);
+ return ERR_PTR(err);
+ }
+
+ size = min_t(u32, max_size, size);
+ if (copy_from_user(data, data_in, size)) {
+ kfree(data);
+ return ERR_PTR(-EFAULT);
+ }
+ }
+ return data;
+}
+
+static int bpf_ctx_finish(const union bpf_attr *kattr,
+ union bpf_attr __user *uattr, const void *data,
+ u32 size)
+{
+ void __user *data_out = u64_to_user_ptr(kattr->test.ctx_out);
+ int err = -EFAULT;
+ u32 copy_size = size;
+
+ if (!data || !data_out)
+ return 0;
+
+ if (copy_size > kattr->test.ctx_size_out) {
+ copy_size = kattr->test.ctx_size_out;
+ err = -ENOSPC;
+ }
+
+ if (copy_to_user(data_out, data, copy_size))
+ goto out;
+ if (copy_to_user(&uattr->test.ctx_size_out, &size, sizeof(size)))
+ goto out;
+ if (err != -ENOSPC)
+ err = 0;
+out:
+ return err;
+}
+
+/**
+ * range_is_zero - test whether buffer is initialized
+ * @buf: buffer to check
+ * @from: check from this position
+ * @to: check up until (excluding) this position
+ *
+ * This function returns true if the there is a non-zero byte
+ * in the buf in the range [from,to).
+ */
+static inline bool range_is_zero(void *buf, size_t from, size_t to)
+{
+ return !memchr_inv((u8 *)buf + from, 0, to - from);
+}
+
+static int convert___skb_to_skb(struct sk_buff *skb, struct __sk_buff *__skb)
+{
+ struct qdisc_skb_cb *cb = (struct qdisc_skb_cb *)skb->cb;
+
+ if (!__skb)
+ return 0;
+
+ /* make sure the fields we don't use are zeroed */
+ if (!range_is_zero(__skb, 0, offsetof(struct __sk_buff, mark)))
+ return -EINVAL;
+
+ /* mark is allowed */
+
+ if (!range_is_zero(__skb, offsetofend(struct __sk_buff, mark),
+ offsetof(struct __sk_buff, priority)))
+ return -EINVAL;
+
+ /* priority is allowed */
+ /* ingress_ifindex is allowed */
+ /* ifindex is allowed */
+
+ if (!range_is_zero(__skb, offsetofend(struct __sk_buff, ifindex),
+ offsetof(struct __sk_buff, cb)))
+ return -EINVAL;
+
+ /* cb is allowed */
+
+ if (!range_is_zero(__skb, offsetofend(struct __sk_buff, cb),
+ offsetof(struct __sk_buff, data_end)))
+ return -EINVAL;
+
+ /* data_end is allowed, but not copied to skb */
+
+ if (!range_is_zero(__skb, offsetofend(struct __sk_buff, data_end),
+ offsetof(struct __sk_buff, tstamp)))
+ return -EINVAL;
+
+ /* tstamp is allowed */
+ /* wire_len is allowed */
+ /* gso_segs is allowed */
+
+ if (!range_is_zero(__skb, offsetofend(struct __sk_buff, gso_segs),
+ offsetof(struct __sk_buff, gso_size)))
+ return -EINVAL;
+
+ /* gso_size is allowed */
+
+ if (!range_is_zero(__skb, offsetofend(struct __sk_buff, gso_size),
+ offsetof(struct __sk_buff, hwtstamp)))
+ return -EINVAL;
+
+ /* hwtstamp is allowed */
+
+ if (!range_is_zero(__skb, offsetofend(struct __sk_buff, hwtstamp),
+ sizeof(struct __sk_buff)))
+ return -EINVAL;
+
+ skb->mark = __skb->mark;
+ skb->priority = __skb->priority;
+ skb->skb_iif = __skb->ingress_ifindex;
+ skb->tstamp = __skb->tstamp;
+ memcpy(&cb->data, __skb->cb, QDISC_CB_PRIV_LEN);
+
+ if (__skb->wire_len == 0) {
+ cb->pkt_len = skb->len;
+ } else {
+ if (__skb->wire_len < skb->len ||
+ __skb->wire_len > GSO_LEGACY_MAX_SIZE)
+ return -EINVAL;
+ cb->pkt_len = __skb->wire_len;
+ }
+
+ if (__skb->gso_segs > GSO_MAX_SEGS)
+ return -EINVAL;
+
+ /* Currently GSO type is zero/unset. If this gets extended with
+ * a small list of accepted GSO types in future, the filter for
+ * an unset GSO type in bpf_clone_redirect() can be lifted.
+ */
+ skb_shinfo(skb)->gso_segs = __skb->gso_segs;
+ skb_shinfo(skb)->gso_size = __skb->gso_size;
+ skb_shinfo(skb)->hwtstamps.hwtstamp = __skb->hwtstamp;
+
+ return 0;
+}
+
+static void convert_skb_to___skb(struct sk_buff *skb, struct __sk_buff *__skb)
+{
+ struct qdisc_skb_cb *cb = (struct qdisc_skb_cb *)skb->cb;
+
+ if (!__skb)
+ return;
+
+ __skb->mark = skb->mark;
+ __skb->priority = skb->priority;
+ __skb->ingress_ifindex = skb->skb_iif;
+ __skb->ifindex = skb->dev->ifindex;
+ __skb->tstamp = skb->tstamp;
+ memcpy(__skb->cb, &cb->data, QDISC_CB_PRIV_LEN);
+ __skb->wire_len = cb->pkt_len;
+ __skb->gso_segs = skb_shinfo(skb)->gso_segs;
+ __skb->hwtstamp = skb_shinfo(skb)->hwtstamps.hwtstamp;
+}
+
+static struct proto bpf_dummy_proto = {
+ .name = "bpf_dummy",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof(struct sock),
+};
+
+int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
+ union bpf_attr __user *uattr)
+{
+ bool is_l2 = false, is_direct_pkt_access = false, is_lwt = false;
+ u32 tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+ struct net *net = current->nsproxy->net_ns;
+ struct net_device *dev = net->loopback_dev;
+ u32 headroom = NET_SKB_PAD + NET_IP_ALIGN;
+ u32 linear_sz = kattr->test.data_size_in;
+ u32 repeat = kattr->test.repeat;
+ struct __sk_buff *ctx = NULL;
+ struct sk_buff *skb = NULL;
+ struct sock *sk = NULL;
+ u32 retval, duration;
+ int hh_len = ETH_HLEN;
+ void *data = NULL;
+ int ret;
+
+ if ((kattr->test.flags & ~BPF_F_TEST_SKB_CHECKSUM_COMPLETE) ||
+ kattr->test.cpu || kattr->test.batch_size)
+ return -EINVAL;
+
+ if (kattr->test.data_size_in < ETH_HLEN)
+ return -EINVAL;
+
+ switch (prog->type) {
+ case BPF_PROG_TYPE_SCHED_CLS:
+ case BPF_PROG_TYPE_SCHED_ACT:
+ is_direct_pkt_access = true;
+ is_l2 = true;
+ break;
+ case BPF_PROG_TYPE_LWT_IN:
+ case BPF_PROG_TYPE_LWT_OUT:
+ case BPF_PROG_TYPE_LWT_XMIT:
+ is_lwt = true;
+ fallthrough;
+ case BPF_PROG_TYPE_CGROUP_SKB:
+ is_direct_pkt_access = true;
+ break;
+ default:
+ break;
+ }
+
+ ctx = bpf_ctx_init(kattr, sizeof(struct __sk_buff));
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ if (ctx) {
+ if (ctx->data_end > kattr->test.data_size_in || ctx->data || ctx->data_meta) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (ctx->data_end) {
+ /* Non-linear LWT test_run is unsupported for now. */
+ if (is_lwt) {
+ ret = -EINVAL;
+ goto out;
+ }
+ linear_sz = max(ETH_HLEN, ctx->data_end);
+ }
+ }
+
+ linear_sz = min_t(u32, linear_sz, PAGE_SIZE - headroom - tailroom);
+
+ data = bpf_test_init(kattr, linear_sz, linear_sz, headroom, tailroom);
+ if (IS_ERR(data)) {
+ ret = PTR_ERR(data);
+ data = NULL;
+ goto out;
+ }
+
+ sk = sk_alloc(net, AF_UNSPEC, GFP_USER, &bpf_dummy_proto, 1);
+ if (!sk) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ sock_init_data(NULL, sk);
+
+ skb = slab_build_skb(data);
+ if (!skb) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ skb->sk = sk;
+
+ data = NULL; /* data released via kfree_skb */
+
+ skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
+ __skb_put(skb, linear_sz);
+
+ if (unlikely(kattr->test.data_size_in > linear_sz)) {
+ void __user *data_in = u64_to_user_ptr(kattr->test.data_in);
+ struct skb_shared_info *sinfo = skb_shinfo(skb);
+ u32 copied = linear_sz;
+
+ while (copied < kattr->test.data_size_in) {
+ struct page *page;
+ u32 data_len;
+
+ if (sinfo->nr_frags == MAX_SKB_FRAGS) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ page = alloc_page(GFP_KERNEL);
+ if (!page) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ data_len = min_t(u32, kattr->test.data_size_in - copied,
+ PAGE_SIZE);
+ skb_fill_page_desc(skb, sinfo->nr_frags, page, 0, data_len);
+
+ if (copy_from_user(page_address(page), data_in + copied,
+ data_len)) {
+ ret = -EFAULT;
+ goto out;
+ }
+ skb->data_len += data_len;
+ skb->truesize += PAGE_SIZE;
+ skb->len += data_len;
+ copied += data_len;
+ }
+ }
+
+ if (ctx && ctx->ifindex > 1) {
+ dev = dev_get_by_index(net, ctx->ifindex);
+ if (!dev) {
+ ret = -ENODEV;
+ goto out;
+ }
+ }
+ skb->protocol = eth_type_trans(skb, dev);
+ skb_reset_network_header(skb);
+
+ switch (skb->protocol) {
+ case htons(ETH_P_IP):
+ sk->sk_family = AF_INET;
+ if (sizeof(struct iphdr) <= skb_headlen(skb)) {
+ sk->sk_rcv_saddr = ip_hdr(skb)->saddr;
+ sk->sk_daddr = ip_hdr(skb)->daddr;
+ }
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case htons(ETH_P_IPV6):
+ sk->sk_family = AF_INET6;
+ if (sizeof(struct ipv6hdr) <= skb_headlen(skb)) {
+ sk->sk_v6_rcv_saddr = ipv6_hdr(skb)->saddr;
+ sk->sk_v6_daddr = ipv6_hdr(skb)->daddr;
+ }
+ break;
+#endif
+ default:
+ break;
+ }
+
+ if (is_l2)
+ __skb_push(skb, hh_len);
+ if (is_direct_pkt_access)
+ bpf_compute_data_pointers(skb);
+
+ ret = convert___skb_to_skb(skb, ctx);
+ if (ret)
+ goto out;
+
+ if (kattr->test.flags & BPF_F_TEST_SKB_CHECKSUM_COMPLETE) {
+ const int off = skb_network_offset(skb);
+ int len = skb->len - off;
+
+ skb->csum = skb_checksum(skb, off, len, 0);
+ skb->ip_summed = CHECKSUM_COMPLETE;
+ }
+
+ ret = bpf_test_run(prog, skb, repeat, &retval, &duration, false);
+ if (ret)
+ goto out;
+ if (!is_l2) {
+ if (skb_headroom(skb) < hh_len) {
+ int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb));
+
+ if (pskb_expand_head(skb, nhead, 0, GFP_USER)) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+ memset(__skb_push(skb, hh_len), 0, hh_len);
+ }
+
+ if (kattr->test.flags & BPF_F_TEST_SKB_CHECKSUM_COMPLETE) {
+ const int off = skb_network_offset(skb);
+ int len = skb->len - off;
+ __wsum csum;
+
+ csum = skb_checksum(skb, off, len, 0);
+
+ if (csum_fold(skb->csum) != csum_fold(csum)) {
+ ret = -EBADMSG;
+ goto out;
+ }
+ }
+
+ convert_skb_to___skb(skb, ctx);
+
+ if (skb_is_nonlinear(skb))
+ /* bpf program can never convert linear skb to non-linear */
+ WARN_ON_ONCE(linear_sz == kattr->test.data_size_in);
+ ret = bpf_test_finish(kattr, uattr, skb->data, skb_shinfo(skb), skb->len,
+ skb->data_len, retval, duration);
+ if (!ret)
+ ret = bpf_ctx_finish(kattr, uattr, ctx,
+ sizeof(struct __sk_buff));
+out:
+ if (dev && dev != net->loopback_dev)
+ dev_put(dev);
+ kfree_skb(skb);
+ kfree(data);
+ if (sk)
+ sk_free(sk);
+ kfree(ctx);
+ return ret;
+}
+
+static int xdp_convert_md_to_buff(struct xdp_md *xdp_md, struct xdp_buff *xdp)
+{
+ unsigned int ingress_ifindex, rx_queue_index;
+ struct netdev_rx_queue *rxqueue;
+ struct net_device *device;
+
+ if (!xdp_md)
+ return 0;
+
+ if (xdp_md->egress_ifindex != 0)
+ return -EINVAL;
+
+ ingress_ifindex = xdp_md->ingress_ifindex;
+ rx_queue_index = xdp_md->rx_queue_index;
+
+ if (!ingress_ifindex && rx_queue_index)
+ return -EINVAL;
+
+ if (ingress_ifindex) {
+ device = dev_get_by_index(current->nsproxy->net_ns,
+ ingress_ifindex);
+ if (!device)
+ return -ENODEV;
+
+ if (rx_queue_index >= device->real_num_rx_queues)
+ goto free_dev;
+
+ rxqueue = __netif_get_rx_queue(device, rx_queue_index);
+
+ if (!xdp_rxq_info_is_reg(&rxqueue->xdp_rxq))
+ goto free_dev;
+
+ xdp->rxq = &rxqueue->xdp_rxq;
+ /* The device is now tracked in the xdp->rxq for later
+ * dev_put()
+ */
+ }
+
+ xdp->data = xdp->data_meta + xdp_md->data;
+ return 0;
+
+free_dev:
+ dev_put(device);
+ return -EINVAL;
+}
+
+static void xdp_convert_buff_to_md(struct xdp_buff *xdp, struct xdp_md *xdp_md)
+{
+ if (!xdp_md)
+ return;
+
+ xdp_md->data = xdp->data - xdp->data_meta;
+ xdp_md->data_end = xdp->data_end - xdp->data_meta;
+
+ if (xdp_md->ingress_ifindex)
+ dev_put(xdp->rxq->dev);
+}
+
+int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr,
+ union bpf_attr __user *uattr)
+{
+ bool do_live = (kattr->test.flags & BPF_F_TEST_XDP_LIVE_FRAMES);
+ u32 tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+ u32 retval = 0, meta_sz = 0, duration, max_linear_sz, size;
+ u32 linear_sz = kattr->test.data_size_in;
+ u32 batch_size = kattr->test.batch_size;
+ u32 headroom = XDP_PACKET_HEADROOM;
+ u32 repeat = kattr->test.repeat;
+ struct netdev_rx_queue *rxqueue;
+ struct skb_shared_info *sinfo;
+ struct xdp_buff xdp = {};
+ int i, ret = -EINVAL;
+ struct xdp_md *ctx;
+ void *data;
+
+ if (prog->expected_attach_type == BPF_XDP_DEVMAP ||
+ prog->expected_attach_type == BPF_XDP_CPUMAP)
+ return -EINVAL;
+
+ if (kattr->test.flags & ~BPF_F_TEST_XDP_LIVE_FRAMES)
+ return -EINVAL;
+
+ if (bpf_prog_is_dev_bound(prog->aux))
+ return -EINVAL;
+
+ if (do_live) {
+ if (!batch_size)
+ batch_size = NAPI_POLL_WEIGHT;
+ else if (batch_size > TEST_XDP_MAX_BATCH)
+ return -E2BIG;
+
+ headroom += sizeof(struct xdp_page_head);
+ } else if (batch_size) {
+ return -EINVAL;
+ }
+
+ ctx = bpf_ctx_init(kattr, sizeof(struct xdp_md));
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ if (ctx) {
+ /* There can't be user provided data before the meta data */
+ if (ctx->data_meta || ctx->data_end > kattr->test.data_size_in ||
+ ctx->data > ctx->data_end ||
+ unlikely(xdp_metalen_invalid(ctx->data)) ||
+ (do_live && (kattr->test.data_out || kattr->test.ctx_out)))
+ goto free_ctx;
+ /* Meta data is allocated from the headroom */
+ headroom -= ctx->data;
+
+ meta_sz = ctx->data;
+ linear_sz = ctx->data_end;
+ }
+
+ max_linear_sz = PAGE_SIZE - headroom - tailroom;
+ linear_sz = min_t(u32, linear_sz, max_linear_sz);
+
+ /* disallow live data mode for jumbo frames */
+ if (do_live && kattr->test.data_size_in > linear_sz)
+ goto free_ctx;
+
+ if (kattr->test.data_size_in - meta_sz < ETH_HLEN)
+ goto free_ctx;
+
+ data = bpf_test_init(kattr, linear_sz, max_linear_sz, headroom, tailroom);
+ if (IS_ERR(data)) {
+ ret = PTR_ERR(data);
+ goto free_ctx;
+ }
+
+ rxqueue = __netif_get_rx_queue(current->nsproxy->net_ns->loopback_dev, 0);
+ rxqueue->xdp_rxq.frag_size = PAGE_SIZE;
+ xdp_init_buff(&xdp, rxqueue->xdp_rxq.frag_size, &rxqueue->xdp_rxq);
+ xdp_prepare_buff(&xdp, data, headroom, linear_sz, true);
+ sinfo = xdp_get_shared_info_from_buff(&xdp);
+
+ ret = xdp_convert_md_to_buff(ctx, &xdp);
+ if (ret)
+ goto free_data;
+
+ size = linear_sz;
+ if (unlikely(kattr->test.data_size_in > size)) {
+ void __user *data_in = u64_to_user_ptr(kattr->test.data_in);
+
+ while (size < kattr->test.data_size_in) {
+ struct page *page;
+ skb_frag_t *frag;
+ u32 data_len;
+
+ if (sinfo->nr_frags == MAX_SKB_FRAGS) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ page = alloc_page(GFP_KERNEL);
+ if (!page) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ frag = &sinfo->frags[sinfo->nr_frags++];
+
+ data_len = min_t(u32, kattr->test.data_size_in - size,
+ PAGE_SIZE);
+ skb_frag_fill_page_desc(frag, page, 0, data_len);
+
+ if (copy_from_user(page_address(page), data_in + size,
+ data_len)) {
+ ret = -EFAULT;
+ goto out;
+ }
+ sinfo->xdp_frags_size += data_len;
+ size += data_len;
+ }
+ xdp_buff_set_frags_flag(&xdp);
+ }
+
+ if (repeat > 1)
+ bpf_prog_change_xdp(NULL, prog);
+
+ if (do_live)
+ ret = bpf_test_run_xdp_live(prog, &xdp, repeat, batch_size, &duration);
+ else
+ ret = bpf_test_run(prog, &xdp, repeat, &retval, &duration, true);
+ /* We convert the xdp_buff back to an xdp_md before checking the return
+ * code so the reference count of any held netdevice will be decremented
+ * even if the test run failed.
+ */
+ xdp_convert_buff_to_md(&xdp, ctx);
+ if (ret)
+ goto out;
+
+ size = xdp.data_end - xdp.data_meta + sinfo->xdp_frags_size;
+ ret = bpf_test_finish(kattr, uattr, xdp.data_meta, sinfo, size, sinfo->xdp_frags_size,
+ retval, duration);
+ if (!ret)
+ ret = bpf_ctx_finish(kattr, uattr, ctx,
+ sizeof(struct xdp_md));
+
+out:
+ if (repeat > 1)
+ bpf_prog_change_xdp(prog, NULL);
+free_data:
+ for (i = 0; i < sinfo->nr_frags; i++)
+ __free_page(skb_frag_page(&sinfo->frags[i]));
+ kfree(data);
+free_ctx:
+ kfree(ctx);
+ return ret;
+}
+
+static int verify_user_bpf_flow_keys(struct bpf_flow_keys *ctx)
+{
+ /* make sure the fields we don't use are zeroed */
+ if (!range_is_zero(ctx, 0, offsetof(struct bpf_flow_keys, flags)))
+ return -EINVAL;
+
+ /* flags is allowed */
+
+ if (!range_is_zero(ctx, offsetofend(struct bpf_flow_keys, flags),
+ sizeof(struct bpf_flow_keys)))
+ return -EINVAL;
+
+ return 0;
+}
+
+int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
+ const union bpf_attr *kattr,
+ union bpf_attr __user *uattr)
+{
+ struct bpf_test_timer t = {};
+ u32 size = kattr->test.data_size_in;
+ struct bpf_flow_dissector ctx = {};
+ u32 repeat = kattr->test.repeat;
+ struct bpf_flow_keys *user_ctx;
+ struct bpf_flow_keys flow_keys;
+ const struct ethhdr *eth;
+ unsigned int flags = 0;
+ u32 retval, duration;
+ void *data;
+ int ret;
+
+ if (kattr->test.flags || kattr->test.cpu || kattr->test.batch_size)
+ return -EINVAL;
+
+ if (size < ETH_HLEN)
+ return -EINVAL;
+
+ data = bpf_test_init(kattr, kattr->test.data_size_in, size, 0, 0);
+ if (IS_ERR(data))
+ return PTR_ERR(data);
+
+ eth = (struct ethhdr *)data;
+
+ if (!repeat)
+ repeat = 1;
+
+ user_ctx = bpf_ctx_init(kattr, sizeof(struct bpf_flow_keys));
+ if (IS_ERR(user_ctx)) {
+ kfree(data);
+ return PTR_ERR(user_ctx);
+ }
+ if (user_ctx) {
+ ret = verify_user_bpf_flow_keys(user_ctx);
+ if (ret)
+ goto out;
+ flags = user_ctx->flags;
+ }
+
+ ctx.flow_keys = &flow_keys;
+ ctx.data = data;
+ ctx.data_end = (__u8 *)data + size;
+
+ bpf_test_timer_enter(&t);
+ do {
+ retval = bpf_flow_dissect(prog, &ctx, eth->h_proto, ETH_HLEN,
+ size, flags);
+ } while (bpf_test_timer_continue(&t, 1, repeat, &ret, &duration));
+ bpf_test_timer_leave(&t);
+
+ if (ret < 0)
+ goto out;
+
+ ret = bpf_test_finish(kattr, uattr, &flow_keys, NULL,
+ sizeof(flow_keys), 0, retval, duration);
+ if (!ret)
+ ret = bpf_ctx_finish(kattr, uattr, user_ctx,
+ sizeof(struct bpf_flow_keys));
+
+out:
+ kfree(user_ctx);
+ kfree(data);
+ return ret;
+}
+
+int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, const union bpf_attr *kattr,
+ union bpf_attr __user *uattr)
+{
+ struct bpf_test_timer t = {};
+ struct bpf_prog_array *progs = NULL;
+ struct bpf_sk_lookup_kern ctx = {};
+ u32 repeat = kattr->test.repeat;
+ struct bpf_sk_lookup *user_ctx;
+ u32 retval, duration;
+ int ret = -EINVAL;
+
+ if (kattr->test.flags || kattr->test.cpu || kattr->test.batch_size)
+ return -EINVAL;
+
+ if (kattr->test.data_in || kattr->test.data_size_in || kattr->test.data_out ||
+ kattr->test.data_size_out)
+ return -EINVAL;
+
+ if (!repeat)
+ repeat = 1;
+
+ user_ctx = bpf_ctx_init(kattr, sizeof(*user_ctx));
+ if (IS_ERR(user_ctx))
+ return PTR_ERR(user_ctx);
+
+ if (!user_ctx)
+ return -EINVAL;
+
+ if (user_ctx->sk)
+ goto out;
+
+ if (!range_is_zero(user_ctx, offsetofend(typeof(*user_ctx), local_port), sizeof(*user_ctx)))
+ goto out;
+
+ if (user_ctx->local_port > U16_MAX) {
+ ret = -ERANGE;
+ goto out;
+ }
+
+ ctx.family = (u16)user_ctx->family;
+ ctx.protocol = (u16)user_ctx->protocol;
+ ctx.dport = (u16)user_ctx->local_port;
+ ctx.sport = user_ctx->remote_port;
+
+ switch (ctx.family) {
+ case AF_INET:
+ ctx.v4.daddr = (__force __be32)user_ctx->local_ip4;
+ ctx.v4.saddr = (__force __be32)user_ctx->remote_ip4;
+ break;
+
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6:
+ ctx.v6.daddr = (struct in6_addr *)user_ctx->local_ip6;
+ ctx.v6.saddr = (struct in6_addr *)user_ctx->remote_ip6;
+ break;
+#endif
+
+ default:
+ ret = -EAFNOSUPPORT;
+ goto out;
+ }
+
+ progs = bpf_prog_array_alloc(1, GFP_KERNEL);
+ if (!progs) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ progs->items[0].prog = prog;
+
+ bpf_test_timer_enter(&t);
+ do {
+ ctx.selected_sk = NULL;
+ retval = BPF_PROG_SK_LOOKUP_RUN_ARRAY(progs, ctx, bpf_prog_run);
+ } while (bpf_test_timer_continue(&t, 1, repeat, &ret, &duration));
+ bpf_test_timer_leave(&t);
+
+ if (ret < 0)
+ goto out;
+
+ user_ctx->cookie = 0;
+ if (ctx.selected_sk) {
+ if (ctx.selected_sk->sk_reuseport && !ctx.no_reuseport) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ user_ctx->cookie = sock_gen_cookie(ctx.selected_sk);
+ }
+
+ ret = bpf_test_finish(kattr, uattr, NULL, NULL, 0, 0, retval, duration);
+ if (!ret)
+ ret = bpf_ctx_finish(kattr, uattr, user_ctx, sizeof(*user_ctx));
+
+out:
+ bpf_prog_array_free(progs);
+ kfree(user_ctx);
+ return ret;
+}
+
+int bpf_prog_test_run_syscall(struct bpf_prog *prog,
+ const union bpf_attr *kattr,
+ union bpf_attr __user *uattr)
+{
+ void __user *ctx_in = u64_to_user_ptr(kattr->test.ctx_in);
+ __u32 ctx_size_in = kattr->test.ctx_size_in;
+ void *ctx = NULL;
+ u32 retval;
+ int err = 0;
+
+ /* doesn't support data_in/out, ctx_out, duration, or repeat or flags */
+ if (kattr->test.data_in || kattr->test.data_out ||
+ kattr->test.ctx_out || kattr->test.duration ||
+ kattr->test.repeat || kattr->test.flags ||
+ kattr->test.batch_size)
+ return -EINVAL;
+
+ if (ctx_size_in < prog->aux->max_ctx_offset ||
+ ctx_size_in > U16_MAX)
+ return -EINVAL;
+
+ if (ctx_size_in) {
+ ctx = memdup_user(ctx_in, ctx_size_in);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+ }
+
+ rcu_read_lock_trace();
+ retval = bpf_prog_run_pin_on_cpu(prog, ctx);
+ rcu_read_unlock_trace();
+
+ if (copy_to_user(&uattr->test.retval, &retval, sizeof(u32))) {
+ err = -EFAULT;
+ goto out;
+ }
+ if (ctx_size_in)
+ if (copy_to_user(ctx_in, ctx, ctx_size_in))
+ err = -EFAULT;
+out:
+ kfree(ctx);
+ return err;
+}
+
+static int verify_and_copy_hook_state(struct nf_hook_state *state,
+ const struct nf_hook_state *user,
+ struct net_device *dev)
+{
+ if (user->in || user->out)
+ return -EINVAL;
+
+ if (user->net || user->sk || user->okfn)
+ return -EINVAL;
+
+ switch (user->pf) {
+ case NFPROTO_IPV4:
+ case NFPROTO_IPV6:
+ switch (state->hook) {
+ case NF_INET_PRE_ROUTING:
+ state->in = dev;
+ break;
+ case NF_INET_LOCAL_IN:
+ state->in = dev;
+ break;
+ case NF_INET_FORWARD:
+ state->in = dev;
+ state->out = dev;
+ break;
+ case NF_INET_LOCAL_OUT:
+ state->out = dev;
+ break;
+ case NF_INET_POST_ROUTING:
+ state->out = dev;
+ break;
+ }
+
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ state->pf = user->pf;
+ state->hook = user->hook;
+
+ return 0;
+}
+
+static __be16 nfproto_eth(int nfproto)
+{
+ switch (nfproto) {
+ case NFPROTO_IPV4:
+ return htons(ETH_P_IP);
+ case NFPROTO_IPV6:
+ break;
+ }
+
+ return htons(ETH_P_IPV6);
+}
+
+int bpf_prog_test_run_nf(struct bpf_prog *prog,
+ const union bpf_attr *kattr,
+ union bpf_attr __user *uattr)
+{
+ struct net *net = current->nsproxy->net_ns;
+ struct net_device *dev = net->loopback_dev;
+ struct nf_hook_state *user_ctx, hook_state = {
+ .pf = NFPROTO_IPV4,
+ .hook = NF_INET_LOCAL_OUT,
+ };
+ u32 size = kattr->test.data_size_in;
+ u32 repeat = kattr->test.repeat;
+ struct bpf_nf_ctx ctx = {
+ .state = &hook_state,
+ };
+ struct sk_buff *skb = NULL;
+ u32 retval, duration;
+ void *data;
+ int ret;
+
+ if (kattr->test.flags || kattr->test.cpu || kattr->test.batch_size)
+ return -EINVAL;
+
+ if (size < sizeof(struct iphdr))
+ return -EINVAL;
+
+ data = bpf_test_init(kattr, kattr->test.data_size_in, size,
+ NET_SKB_PAD + NET_IP_ALIGN,
+ SKB_DATA_ALIGN(sizeof(struct skb_shared_info)));
+ if (IS_ERR(data))
+ return PTR_ERR(data);
+
+ if (!repeat)
+ repeat = 1;
+
+ user_ctx = bpf_ctx_init(kattr, sizeof(struct nf_hook_state));
+ if (IS_ERR(user_ctx)) {
+ kfree(data);
+ return PTR_ERR(user_ctx);
+ }
+
+ if (user_ctx) {
+ ret = verify_and_copy_hook_state(&hook_state, user_ctx, dev);
+ if (ret)
+ goto out;
+ }
+
+ skb = slab_build_skb(data);
+ if (!skb) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ data = NULL; /* data released via kfree_skb */
+
+ skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
+ __skb_put(skb, size);
+
+ ret = -EINVAL;
+
+ if (hook_state.hook != NF_INET_LOCAL_OUT) {
+ if (size < ETH_HLEN + sizeof(struct iphdr))
+ goto out;
+
+ skb->protocol = eth_type_trans(skb, dev);
+ switch (skb->protocol) {
+ case htons(ETH_P_IP):
+ if (hook_state.pf == NFPROTO_IPV4)
+ break;
+ goto out;
+ case htons(ETH_P_IPV6):
+ if (size < ETH_HLEN + sizeof(struct ipv6hdr))
+ goto out;
+ if (hook_state.pf == NFPROTO_IPV6)
+ break;
+ goto out;
+ default:
+ ret = -EPROTO;
+ goto out;
+ }
+
+ skb_reset_network_header(skb);
+ } else {
+ skb->protocol = nfproto_eth(hook_state.pf);
+ }
+
+ ctx.skb = skb;
+
+ ret = bpf_test_run(prog, &ctx, repeat, &retval, &duration, false);
+ if (ret)
+ goto out;
+
+ ret = bpf_test_finish(kattr, uattr, NULL, NULL, 0, 0, retval, duration);
+
+out:
+ kfree(user_ctx);
+ kfree_skb(skb);
+ kfree(data);
+ return ret;
+}
+
+static const struct btf_kfunc_id_set bpf_prog_test_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &test_sk_check_kfunc_ids,
+};
+
+BTF_ID_LIST(bpf_prog_test_dtor_kfunc_ids)
+BTF_ID(struct, prog_test_ref_kfunc)
+BTF_ID(func, bpf_kfunc_call_test_release_dtor)
+BTF_ID(struct, prog_test_member)
+BTF_ID(func, bpf_kfunc_call_memb_release_dtor)
+
+static int __init bpf_prog_test_run_init(void)
+{
+ const struct btf_id_dtor_kfunc bpf_prog_test_dtor_kfunc[] = {
+ {
+ .btf_id = bpf_prog_test_dtor_kfunc_ids[0],
+ .kfunc_btf_id = bpf_prog_test_dtor_kfunc_ids[1]
+ },
+ {
+ .btf_id = bpf_prog_test_dtor_kfunc_ids[2],
+ .kfunc_btf_id = bpf_prog_test_dtor_kfunc_ids[3],
+ },
+ };
+ int ret;
+
+ ret = register_btf_fmodret_id_set(&bpf_test_modify_return_set);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_prog_test_kfunc_set);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_prog_test_kfunc_set);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &bpf_prog_test_kfunc_set);
+ return ret ?: register_btf_id_dtor_kfuncs(bpf_prog_test_dtor_kfunc,
+ ARRAY_SIZE(bpf_prog_test_dtor_kfunc),
+ THIS_MODULE);
+}
+late_initcall(bpf_prog_test_run_init);
diff --git a/net/bridge/Kconfig b/net/bridge/Kconfig
index 12265aff7099..3c8ded7d3e84 100644
--- a/net/bridge/Kconfig
+++ b/net/bridge/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# 802.1d Ethernet Bridging
#
@@ -5,7 +6,9 @@
config BRIDGE
tristate "802.1d Ethernet Bridging"
select LLC
- ---help---
+ select STP
+ depends on IPV6 || IPV6=n
+ help
If you say Y here, then your Linux box will be able to act as an
Ethernet bridge, which means that the different Ethernet segments it
is connected to will appear as one Ethernet to the participants.
@@ -15,7 +18,7 @@ config BRIDGE
other third party bridge products.
In order to use the Ethernet bridge, you'll need the bridge
- configuration tools; see <file:Documentation/networking/bridge.txt>
+ configuration tools; see <file:Documentation/networking/bridge.rst>
for location. Please read the Bridge mini-HOWTO for more
information.
@@ -30,3 +33,54 @@ config BRIDGE
will be called bridge.
If unsure, say N.
+
+config BRIDGE_IGMP_SNOOPING
+ bool "IGMP/MLD snooping"
+ depends on BRIDGE
+ depends on INET
+ default y
+ help
+ If you say Y here, then the Ethernet bridge will be able selectively
+ forward multicast traffic based on IGMP/MLD traffic received from
+ each port.
+
+ Say N to exclude this support and reduce the binary size.
+
+ If unsure, say Y.
+
+config BRIDGE_VLAN_FILTERING
+ bool "VLAN filtering"
+ depends on BRIDGE
+ depends on VLAN_8021Q
+ default n
+ help
+ If you say Y here, then the Ethernet bridge will be able selectively
+ receive and forward traffic based on VLAN information in the packet
+ any VLAN information configured on the bridge port or bridge device.
+
+ Say N to exclude this support and reduce the binary size.
+
+ If unsure, say Y.
+
+config BRIDGE_MRP
+ bool "MRP protocol"
+ depends on BRIDGE
+ default n
+ help
+ If you say Y here, then the Ethernet bridge will be able to run MRP
+ protocol to detect loops
+
+ Say N to exclude this support and reduce the binary size.
+
+ If unsure, say N.
+
+config BRIDGE_CFM
+ bool "CFM protocol"
+ depends on BRIDGE
+ help
+ If you say Y here, then the Ethernet bridge will be able to run CFM
+ protocol according to 802.1Q section 12.14
+
+ Say N to exclude this support and reduce the binary size.
+
+ If unsure, say N.
diff --git a/net/bridge/Makefile b/net/bridge/Makefile
index f444c12cde5a..24bd1c0a9a5a 100644
--- a/net/bridge/Makefile
+++ b/net/bridge/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
#
# Makefile for the IEEE 802.1d ethernet bridging layer.
#
@@ -5,11 +6,26 @@
obj-$(CONFIG_BRIDGE) += bridge.o
bridge-y := br.o br_device.o br_fdb.o br_forward.o br_if.o br_input.o \
- br_ioctl.o br_notify.o br_stp.o br_stp_bpdu.o \
- br_stp_if.o br_stp_timer.o br_netlink.o
+ br_ioctl.o br_stp.o br_stp_bpdu.o \
+ br_stp_if.o br_stp_timer.o br_netlink.o \
+ br_netlink_tunnel.o br_arp_nd_proxy.o
bridge-$(CONFIG_SYSFS) += br_sysfs_if.o br_sysfs_br.o
-bridge-$(CONFIG_BRIDGE_NETFILTER) += br_netfilter.o
+bridge-$(subst m,y,$(CONFIG_BRIDGE_NETFILTER)) += br_nf_core.o
-obj-$(CONFIG_BRIDGE_NF_EBTABLES) += netfilter/
+br_netfilter-y := br_netfilter_hooks.o
+br_netfilter-$(subst m,y,$(CONFIG_IPV6)) += br_netfilter_ipv6.o
+obj-$(CONFIG_BRIDGE_NETFILTER) += br_netfilter.o
+
+bridge-$(CONFIG_BRIDGE_IGMP_SNOOPING) += br_multicast.o br_mdb.o br_multicast_eht.o
+
+bridge-$(CONFIG_BRIDGE_VLAN_FILTERING) += br_vlan.o br_vlan_tunnel.o br_vlan_options.o br_mst.o
+
+bridge-$(CONFIG_NET_SWITCHDEV) += br_switchdev.o
+
+obj-$(CONFIG_NETFILTER) += netfilter/
+
+bridge-$(CONFIG_BRIDGE_MRP) += br_mrp_switchdev.o br_mrp.o br_mrp_netlink.o
+
+bridge-$(CONFIG_BRIDGE_CFM) += br_cfm.o br_cfm_netlink.o
diff --git a/net/bridge/br.c b/net/bridge/br.c
index 2994387999a8..c37e52e2f29a 100644
--- a/net/bridge/br.c
+++ b/net/bridge/br.c
@@ -1,16 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Generic parts
* Linux ethernet bridge
*
* Authors:
* Lennert Buytenhek <buytenh@gnu.org>
- *
- * $Id: br.c,v 1.47 2001/12/24 00:56:41 davem Exp $
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/module.h>
@@ -20,73 +14,508 @@
#include <linux/init.h>
#include <linux/llc.h>
#include <net/llc.h>
+#include <net/stp.h>
+#include <net/switchdev.h>
#include "br_private.h"
-int (*br_should_route_hook) (struct sk_buff **pskb) = NULL;
+/*
+ * Handle changes in state of network devices enslaved to a bridge.
+ *
+ * Note: don't care about up/down if bridge itself is down, because
+ * port state is checked when bridge is brought up.
+ */
+static int br_device_event(struct notifier_block *unused, unsigned long event, void *ptr)
+{
+ struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(ptr);
+ struct netdev_notifier_pre_changeaddr_info *prechaddr_info;
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct net_bridge_port *p;
+ struct net_bridge *br;
+ bool notified = false;
+ bool changed_addr;
+ int err;
+
+ if (netif_is_bridge_master(dev)) {
+ struct net_bridge *br = netdev_priv(dev);
+
+ if (event == NETDEV_REGISTER)
+ br_fdb_change_mac_address(br, dev->dev_addr);
+
+ err = br_vlan_bridge_event(dev, event, ptr);
+ if (err)
+ return notifier_from_errno(err);
+
+ if (event == NETDEV_REGISTER) {
+ /* register of bridge completed, add sysfs entries */
+ err = br_sysfs_addbr(dev);
+ if (err)
+ return notifier_from_errno(err);
+
+ return NOTIFY_DONE;
+ }
+ }
+
+ if (is_vlan_dev(dev)) {
+ struct net_device *real_dev = vlan_dev_real_dev(dev);
+
+ if (netif_is_bridge_master(real_dev))
+ br_vlan_vlan_upper_event(real_dev, dev, event);
+ }
+
+ /* not a port of a bridge */
+ p = br_port_get_rtnl(dev);
+ if (!p)
+ return NOTIFY_DONE;
+
+ br = p->br;
+
+ switch (event) {
+ case NETDEV_CHANGEMTU:
+ br_mtu_auto_adjust(br);
+ break;
+
+ case NETDEV_PRE_CHANGEADDR:
+ if (br->dev->addr_assign_type == NET_ADDR_SET)
+ break;
+ prechaddr_info = ptr;
+ err = netif_pre_changeaddr_notify(br->dev,
+ prechaddr_info->dev_addr,
+ extack);
+ if (err)
+ return notifier_from_errno(err);
+ break;
+
+ case NETDEV_CHANGEADDR:
+ spin_lock_bh(&br->lock);
+ br_fdb_changeaddr(p, dev->dev_addr);
+ changed_addr = br_stp_recalculate_bridge_id(br);
+ spin_unlock_bh(&br->lock);
+
+ if (changed_addr)
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, br->dev);
+
+ break;
+
+ case NETDEV_CHANGE:
+ br_port_carrier_check(p, &notified);
+ break;
+
+ case NETDEV_FEAT_CHANGE:
+ netdev_update_features(br->dev);
+ break;
+
+ case NETDEV_DOWN:
+ spin_lock_bh(&br->lock);
+ if (br->dev->flags & IFF_UP) {
+ br_stp_disable_port(p);
+ notified = true;
+ }
+ spin_unlock_bh(&br->lock);
+ break;
+
+ case NETDEV_UP:
+ if (netif_running(br->dev) && netif_oper_up(dev)) {
+ spin_lock_bh(&br->lock);
+ br_stp_enable_port(p);
+ notified = true;
+ spin_unlock_bh(&br->lock);
+ }
+ break;
+
+ case NETDEV_UNREGISTER:
+ br_del_if(br, dev);
+ break;
+
+ case NETDEV_CHANGENAME:
+ err = br_sysfs_renameif(p);
+ if (err)
+ return notifier_from_errno(err);
+ break;
+
+ case NETDEV_PRE_TYPE_CHANGE:
+ /* Forbid underlying device to change its type. */
+ return NOTIFY_BAD;
+
+ case NETDEV_RESEND_IGMP:
+ /* Propagate to master device */
+ call_netdevice_notifiers(event, br->dev);
+ break;
+ }
+
+ if (event != NETDEV_UNREGISTER)
+ br_vlan_port_event(p, event);
+
+ /* Events that may cause spanning tree to refresh */
+ if (!notified && (event == NETDEV_CHANGEADDR || event == NETDEV_UP ||
+ event == NETDEV_CHANGE || event == NETDEV_DOWN))
+ br_ifinfo_notify(RTM_NEWLINK, NULL, p);
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block br_device_notifier = {
+ .notifier_call = br_device_event
+};
+
+/* called with RTNL or RCU */
+static int br_switchdev_event(struct notifier_block *unused,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = switchdev_notifier_info_to_dev(ptr);
+ struct net_bridge_port *p;
+ struct net_bridge *br;
+ struct switchdev_notifier_fdb_info *fdb_info;
+ int err = NOTIFY_DONE;
+
+ p = br_port_get_rtnl_rcu(dev);
+ if (!p)
+ goto out;
+
+ br = p->br;
+
+ switch (event) {
+ case SWITCHDEV_FDB_ADD_TO_BRIDGE:
+ fdb_info = ptr;
+ err = br_fdb_external_learn_add(br, p, fdb_info->addr,
+ fdb_info->vid,
+ fdb_info->locked, false);
+ if (err) {
+ err = notifier_from_errno(err);
+ break;
+ }
+ br_fdb_offloaded_set(br, p, fdb_info->addr,
+ fdb_info->vid, fdb_info->offloaded);
+ break;
+ case SWITCHDEV_FDB_DEL_TO_BRIDGE:
+ fdb_info = ptr;
+ err = br_fdb_external_learn_del(br, p, fdb_info->addr,
+ fdb_info->vid, false);
+ if (err)
+ err = notifier_from_errno(err);
+ break;
+ case SWITCHDEV_FDB_OFFLOADED:
+ fdb_info = ptr;
+ br_fdb_offloaded_set(br, p, fdb_info->addr,
+ fdb_info->vid, fdb_info->offloaded);
+ break;
+ case SWITCHDEV_FDB_FLUSH_TO_BRIDGE:
+ fdb_info = ptr;
+ /* Don't delete static entries */
+ br_fdb_delete_by_port(br, p, fdb_info->vid, 0);
+ break;
+ }
+
+out:
+ return err;
+}
+
+static struct notifier_block br_switchdev_notifier = {
+ .notifier_call = br_switchdev_event,
+};
+
+/* called under rtnl_mutex */
+static int br_switchdev_blocking_event(struct notifier_block *nb,
+ unsigned long event, void *ptr)
+{
+ struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(ptr);
+ struct net_device *dev = switchdev_notifier_info_to_dev(ptr);
+ struct switchdev_notifier_brport_info *brport_info;
+ const struct switchdev_brport *b;
+ struct net_bridge_port *p;
+ int err = NOTIFY_DONE;
+
+ p = br_port_get_rtnl(dev);
+ if (!p)
+ goto out;
+
+ switch (event) {
+ case SWITCHDEV_BRPORT_OFFLOADED:
+ brport_info = ptr;
+ b = &brport_info->brport;
+
+ err = br_switchdev_port_offload(p, b->dev, b->ctx,
+ b->atomic_nb, b->blocking_nb,
+ b->tx_fwd_offload, extack);
+ err = notifier_from_errno(err);
+ break;
+ case SWITCHDEV_BRPORT_UNOFFLOADED:
+ brport_info = ptr;
+ b = &brport_info->brport;
+
+ br_switchdev_port_unoffload(p, b->ctx, b->atomic_nb,
+ b->blocking_nb);
+ break;
+ case SWITCHDEV_BRPORT_REPLAY:
+ brport_info = ptr;
+ b = &brport_info->brport;
+
+ err = br_switchdev_port_replay(p, b->dev, b->ctx, b->atomic_nb,
+ b->blocking_nb, extack);
+ err = notifier_from_errno(err);
+ break;
+ }
+
+out:
+ return err;
+}
+
+static struct notifier_block br_switchdev_blocking_notifier = {
+ .notifier_call = br_switchdev_blocking_event,
+};
+
+static int
+br_toggle_fdb_local_vlan_0(struct net_bridge *br, bool on,
+ struct netlink_ext_ack *extack)
+{
+ int err;
+
+ if (br_opt_get(br, BROPT_FDB_LOCAL_VLAN_0) == on)
+ return 0;
+
+ err = br_fdb_toggle_local_vlan_0(br, on, extack);
+ if (err)
+ return err;
+
+ br_opt_toggle(br, BROPT_FDB_LOCAL_VLAN_0, on);
+ return 0;
+}
-static struct llc_sap *br_stp_sap;
+/* br_boolopt_toggle - change user-controlled boolean option
+ *
+ * @br: bridge device
+ * @opt: id of the option to change
+ * @on: new option value
+ * @extack: extack for error messages
+ *
+ * Changes the value of the respective boolean option to @on taking care of
+ * any internal option value mapping and configuration.
+ */
+int br_boolopt_toggle(struct net_bridge *br, enum br_boolopt_id opt, bool on,
+ struct netlink_ext_ack *extack)
+{
+ int err = 0;
+
+ switch (opt) {
+ case BR_BOOLOPT_NO_LL_LEARN:
+ br_opt_toggle(br, BROPT_NO_LL_LEARN, on);
+ break;
+ case BR_BOOLOPT_MCAST_VLAN_SNOOPING:
+ err = br_multicast_toggle_vlan_snooping(br, on, extack);
+ break;
+ case BR_BOOLOPT_MST_ENABLE:
+ err = br_mst_set_enabled(br, on, extack);
+ break;
+ case BR_BOOLOPT_MDB_OFFLOAD_FAIL_NOTIFICATION:
+ br_opt_toggle(br, BROPT_MDB_OFFLOAD_FAIL_NOTIFICATION, on);
+ break;
+ case BR_BOOLOPT_FDB_LOCAL_VLAN_0:
+ err = br_toggle_fdb_local_vlan_0(br, on, extack);
+ break;
+ default:
+ /* shouldn't be called with unsupported options */
+ WARN_ON(1);
+ break;
+ }
+
+ return err;
+}
+
+int br_boolopt_get(const struct net_bridge *br, enum br_boolopt_id opt)
+{
+ switch (opt) {
+ case BR_BOOLOPT_NO_LL_LEARN:
+ return br_opt_get(br, BROPT_NO_LL_LEARN);
+ case BR_BOOLOPT_MCAST_VLAN_SNOOPING:
+ return br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED);
+ case BR_BOOLOPT_MST_ENABLE:
+ return br_opt_get(br, BROPT_MST_ENABLED);
+ case BR_BOOLOPT_MDB_OFFLOAD_FAIL_NOTIFICATION:
+ return br_opt_get(br, BROPT_MDB_OFFLOAD_FAIL_NOTIFICATION);
+ case BR_BOOLOPT_FDB_LOCAL_VLAN_0:
+ return br_opt_get(br, BROPT_FDB_LOCAL_VLAN_0);
+ default:
+ /* shouldn't be called with unsupported options */
+ WARN_ON(1);
+ break;
+ }
+
+ return 0;
+}
+
+int br_boolopt_multi_toggle(struct net_bridge *br,
+ struct br_boolopt_multi *bm,
+ struct netlink_ext_ack *extack)
+{
+ unsigned long bitmap = bm->optmask;
+ int err = 0;
+ int opt_id;
+
+ opt_id = find_next_bit(&bitmap, BITS_PER_LONG, BR_BOOLOPT_MAX);
+ if (opt_id != BITS_PER_LONG) {
+ NL_SET_ERR_MSG_FMT_MOD(extack, "Unknown boolean option %d",
+ opt_id);
+ return -EINVAL;
+ }
+
+ for_each_set_bit(opt_id, &bitmap, BR_BOOLOPT_MAX) {
+ bool on = !!(bm->optval & BIT(opt_id));
+
+ err = br_boolopt_toggle(br, opt_id, on, extack);
+ if (err) {
+ br_debug(br, "boolopt multi-toggle error: option: %d current: %d new: %d error: %d\n",
+ opt_id, br_boolopt_get(br, opt_id), on, err);
+ break;
+ }
+ }
+
+ return err;
+}
+
+void br_boolopt_multi_get(const struct net_bridge *br,
+ struct br_boolopt_multi *bm)
+{
+ u32 optval = 0;
+ int opt_id;
+
+ for (opt_id = 0; opt_id < BR_BOOLOPT_MAX; opt_id++)
+ optval |= (br_boolopt_get(br, opt_id) << opt_id);
+
+ bm->optval = optval;
+ bm->optmask = GENMASK((BR_BOOLOPT_MAX - 1), 0);
+}
+
+/* private bridge options, controlled by the kernel */
+void br_opt_toggle(struct net_bridge *br, enum net_bridge_opts opt, bool on)
+{
+ bool cur = !!br_opt_get(br, opt);
+
+ br_debug(br, "toggle option: %d state: %d -> %d\n",
+ opt, cur, on);
+
+ if (cur == on)
+ return;
+
+ if (on)
+ set_bit(opt, &br->options);
+ else
+ clear_bit(opt, &br->options);
+}
+
+static void __net_exit br_net_exit_rtnl(struct net *net,
+ struct list_head *dev_to_kill)
+{
+ struct net_device *dev;
+
+ ASSERT_RTNL_NET(net);
+
+ for_each_netdev(net, dev)
+ if (netif_is_bridge_master(dev))
+ br_dev_delete(dev, dev_to_kill);
+}
+
+static struct pernet_operations br_net_ops = {
+ .exit_rtnl = br_net_exit_rtnl,
+};
+
+static const struct stp_proto br_stp_proto = {
+ .rcv = br_stp_rcv,
+};
static int __init br_init(void)
{
int err;
- br_stp_sap = llc_sap_open(LLC_SAP_BSPAN, br_stp_rcv);
- if (!br_stp_sap) {
- printk(KERN_ERR "bridge: can't register sap for STP\n");
- return -EADDRINUSE;
+ BUILD_BUG_ON(sizeof(struct br_input_skb_cb) > sizeof_field(struct sk_buff, cb));
+
+ err = stp_proto_register(&br_stp_proto);
+ if (err < 0) {
+ pr_err("bridge: can't register sap for STP\n");
+ return err;
}
- br_fdb_init();
+ err = br_fdb_init();
+ if (err)
+ goto err_out;
- err = br_netfilter_init();
+ err = register_pernet_subsys(&br_net_ops);
if (err)
goto err_out1;
- err = register_netdevice_notifier(&br_device_notifier);
+ err = br_nf_core_init();
if (err)
goto err_out2;
- br_netlink_init();
- brioctl_set(br_ioctl_deviceless_stub);
- br_handle_frame_hook = br_handle_frame;
+ err = register_netdevice_notifier(&br_device_notifier);
+ if (err)
+ goto err_out3;
+
+ err = register_switchdev_notifier(&br_switchdev_notifier);
+ if (err)
+ goto err_out4;
+
+ err = register_switchdev_blocking_notifier(&br_switchdev_blocking_notifier);
+ if (err)
+ goto err_out5;
+
+ err = br_netlink_init();
+ if (err)
+ goto err_out6;
- br_fdb_get_hook = br_fdb_get;
- br_fdb_put_hook = br_fdb_put;
+ brioctl_set(br_ioctl_stub);
+
+#if IS_ENABLED(CONFIG_ATM_LANE)
+ br_fdb_test_addr_hook = br_fdb_test_addr;
+#endif
+
+#if IS_MODULE(CONFIG_BRIDGE_NETFILTER)
+ pr_info("bridge: filtering via arp/ip/ip6tables is no longer available "
+ "by default. Update your scripts to load br_netfilter if you "
+ "need this.\n");
+#endif
return 0;
+err_out6:
+ unregister_switchdev_blocking_notifier(&br_switchdev_blocking_notifier);
+err_out5:
+ unregister_switchdev_notifier(&br_switchdev_notifier);
+err_out4:
+ unregister_netdevice_notifier(&br_device_notifier);
+err_out3:
+ br_nf_core_fini();
err_out2:
- br_netfilter_fini();
+ unregister_pernet_subsys(&br_net_ops);
err_out1:
- llc_sap_put(br_stp_sap);
+ br_fdb_fini();
+err_out:
+ stp_proto_unregister(&br_stp_proto);
return err;
}
static void __exit br_deinit(void)
{
- rcu_assign_pointer(br_stp_sap->rcv_func, NULL);
-
+ stp_proto_unregister(&br_stp_proto);
br_netlink_fini();
- br_netfilter_fini();
+ unregister_switchdev_blocking_notifier(&br_switchdev_blocking_notifier);
+ unregister_switchdev_notifier(&br_switchdev_notifier);
unregister_netdevice_notifier(&br_device_notifier);
brioctl_set(NULL);
+ unregister_pernet_subsys(&br_net_ops);
- br_cleanup_bridges();
-
- synchronize_net();
+ rcu_barrier(); /* Wait for completion of call_rcu()'s */
- llc_sap_put(br_stp_sap);
- br_fdb_get_hook = NULL;
- br_fdb_put_hook = NULL;
-
- br_handle_frame_hook = NULL;
+ br_nf_core_fini();
+#if IS_ENABLED(CONFIG_ATM_LANE)
+ br_fdb_test_addr_hook = NULL;
+#endif
br_fdb_fini();
}
-EXPORT_SYMBOL(br_should_route_hook);
-
module_init(br_init)
module_exit(br_deinit)
MODULE_LICENSE("GPL");
MODULE_VERSION(BR_VERSION);
+MODULE_ALIAS_RTNL_LINK("bridge");
+MODULE_DESCRIPTION("Ethernet bridge driver");
+MODULE_IMPORT_NS("NETDEV_INTERNAL");
diff --git a/net/bridge/br_arp_nd_proxy.c b/net/bridge/br_arp_nd_proxy.c
new file mode 100644
index 000000000000..1e2b51769eec
--- /dev/null
+++ b/net/bridge/br_arp_nd_proxy.c
@@ -0,0 +1,513 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Handle bridge arp/nd proxy/suppress
+ *
+ * Copyright (C) 2017 Cumulus Networks
+ * Copyright (c) 2017 Roopa Prabhu <roopa@cumulusnetworks.com>
+ *
+ * Authors:
+ * Roopa Prabhu <roopa@cumulusnetworks.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/neighbour.h>
+#include <net/arp.h>
+#include <linux/if_vlan.h>
+#include <linux/inetdevice.h>
+#include <net/addrconf.h>
+#include <net/ipv6_stubs.h>
+#if IS_ENABLED(CONFIG_IPV6)
+#include <net/ip6_checksum.h>
+#endif
+
+#include "br_private.h"
+
+void br_recalculate_neigh_suppress_enabled(struct net_bridge *br)
+{
+ struct net_bridge_port *p;
+ bool neigh_suppress = false;
+
+ list_for_each_entry(p, &br->port_list, list) {
+ if (p->flags & (BR_NEIGH_SUPPRESS | BR_NEIGH_VLAN_SUPPRESS)) {
+ neigh_suppress = true;
+ break;
+ }
+ }
+
+ br_opt_toggle(br, BROPT_NEIGH_SUPPRESS_ENABLED, neigh_suppress);
+}
+
+#if IS_ENABLED(CONFIG_INET)
+static void br_arp_send(struct net_bridge *br, struct net_bridge_port *p,
+ struct net_device *dev, __be32 dest_ip, __be32 src_ip,
+ const unsigned char *dest_hw,
+ const unsigned char *src_hw,
+ const unsigned char *target_hw,
+ __be16 vlan_proto, u16 vlan_tci)
+{
+ struct net_bridge_vlan_group *vg;
+ struct sk_buff *skb;
+ u16 pvid;
+
+ netdev_dbg(dev, "arp send dev %s dst %pI4 dst_hw %pM src %pI4 src_hw %pM\n",
+ dev->name, &dest_ip, dest_hw, &src_ip, src_hw);
+
+ if (!vlan_tci) {
+ arp_send(ARPOP_REPLY, ETH_P_ARP, dest_ip, dev, src_ip,
+ dest_hw, src_hw, target_hw);
+ return;
+ }
+
+ skb = arp_create(ARPOP_REPLY, ETH_P_ARP, dest_ip, dev, src_ip,
+ dest_hw, src_hw, target_hw);
+ if (!skb)
+ return;
+
+ if (p)
+ vg = nbp_vlan_group_rcu(p);
+ else
+ vg = br_vlan_group_rcu(br);
+ pvid = br_get_pvid(vg);
+ if (pvid == (vlan_tci & VLAN_VID_MASK))
+ vlan_tci = 0;
+
+ if (vlan_tci)
+ __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci);
+
+ if (p) {
+ arp_xmit(skb);
+ } else {
+ skb_reset_mac_header(skb);
+ __skb_pull(skb, skb_network_offset(skb));
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ skb->pkt_type = PACKET_HOST;
+
+ netif_rx(skb);
+ }
+}
+
+static int br_chk_addr_ip(struct net_device *dev,
+ struct netdev_nested_priv *priv)
+{
+ __be32 ip = *(__be32 *)priv->data;
+ struct in_device *in_dev;
+ __be32 addr = 0;
+
+ in_dev = __in_dev_get_rcu(dev);
+ if (in_dev)
+ addr = inet_confirm_addr(dev_net(dev), in_dev, 0, ip,
+ RT_SCOPE_HOST);
+
+ if (addr == ip)
+ return 1;
+
+ return 0;
+}
+
+static bool br_is_local_ip(struct net_device *dev, __be32 ip)
+{
+ struct netdev_nested_priv priv = {
+ .data = (void *)&ip,
+ };
+
+ if (br_chk_addr_ip(dev, &priv))
+ return true;
+
+ /* check if ip is configured on upper dev */
+ if (netdev_walk_all_upper_dev_rcu(dev, br_chk_addr_ip, &priv))
+ return true;
+
+ return false;
+}
+
+void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br,
+ u16 vid, struct net_bridge_port *p)
+{
+ struct net_device *dev = br->dev;
+ struct net_device *vlandev = dev;
+ struct neighbour *n;
+ struct arphdr *parp;
+ u8 *arpptr, *sha;
+ __be32 sip, tip;
+
+ BR_INPUT_SKB_CB(skb)->proxyarp_replied = 0;
+
+ if ((dev->flags & IFF_NOARP) ||
+ !pskb_may_pull(skb, arp_hdr_len(dev)))
+ return;
+
+ parp = arp_hdr(skb);
+
+ if (parp->ar_pro != htons(ETH_P_IP) ||
+ parp->ar_hln != dev->addr_len ||
+ parp->ar_pln != 4)
+ return;
+
+ arpptr = (u8 *)parp + sizeof(struct arphdr);
+ sha = arpptr;
+ arpptr += dev->addr_len; /* sha */
+ memcpy(&sip, arpptr, sizeof(sip));
+ arpptr += sizeof(sip);
+ arpptr += dev->addr_len; /* tha */
+ memcpy(&tip, arpptr, sizeof(tip));
+
+ if (ipv4_is_loopback(tip) ||
+ ipv4_is_multicast(tip))
+ return;
+
+ if (br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED)) {
+ if (br_is_neigh_suppress_enabled(p, vid))
+ return;
+ if (is_unicast_ether_addr(eth_hdr(skb)->h_dest) &&
+ parp->ar_op == htons(ARPOP_REQUEST))
+ return;
+ if (parp->ar_op != htons(ARPOP_RREQUEST) &&
+ parp->ar_op != htons(ARPOP_RREPLY) &&
+ (ipv4_is_zeronet(sip) || sip == tip)) {
+ /* prevent flooding to neigh suppress ports */
+ BR_INPUT_SKB_CB(skb)->proxyarp_replied = 1;
+ return;
+ }
+ }
+
+ if (parp->ar_op != htons(ARPOP_REQUEST))
+ return;
+
+ if (vid != 0) {
+ vlandev = __vlan_find_dev_deep_rcu(br->dev, skb->vlan_proto,
+ vid);
+ if (!vlandev)
+ return;
+ }
+
+ if (br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED) &&
+ br_is_local_ip(vlandev, tip)) {
+ /* its our local ip, so don't proxy reply
+ * and don't forward to neigh suppress ports
+ */
+ BR_INPUT_SKB_CB(skb)->proxyarp_replied = 1;
+ return;
+ }
+
+ n = neigh_lookup(&arp_tbl, &tip, vlandev);
+ if (n) {
+ struct net_bridge_fdb_entry *f;
+
+ if (!(READ_ONCE(n->nud_state) & NUD_VALID)) {
+ neigh_release(n);
+ return;
+ }
+
+ f = br_fdb_find_rcu(br, n->ha, vid);
+ if (f) {
+ bool replied = false;
+
+ if ((p && (p->flags & BR_PROXYARP)) ||
+ (f->dst && (f->dst->flags & BR_PROXYARP_WIFI)) ||
+ br_is_neigh_suppress_enabled(f->dst, vid)) {
+ if (!vid)
+ br_arp_send(br, p, skb->dev, sip, tip,
+ sha, n->ha, sha, 0, 0);
+ else
+ br_arp_send(br, p, skb->dev, sip, tip,
+ sha, n->ha, sha,
+ skb->vlan_proto,
+ skb_vlan_tag_get(skb));
+ replied = true;
+ }
+
+ /* If we have replied or as long as we know the
+ * mac, indicate to arp replied
+ */
+ if (replied ||
+ br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED))
+ BR_INPUT_SKB_CB(skb)->proxyarp_replied = 1;
+ }
+
+ neigh_release(n);
+ }
+}
+#endif
+
+#if IS_ENABLED(CONFIG_IPV6)
+struct nd_msg *br_is_nd_neigh_msg(const struct sk_buff *skb, struct nd_msg *msg)
+{
+ struct nd_msg *m;
+
+ m = skb_header_pointer(skb, skb_network_offset(skb) +
+ sizeof(struct ipv6hdr), sizeof(*msg), msg);
+ if (!m)
+ return NULL;
+
+ if (m->icmph.icmp6_code != 0 ||
+ (m->icmph.icmp6_type != NDISC_NEIGHBOUR_SOLICITATION &&
+ m->icmph.icmp6_type != NDISC_NEIGHBOUR_ADVERTISEMENT))
+ return NULL;
+
+ return m;
+}
+
+static void br_nd_send(struct net_bridge *br, struct net_bridge_port *p,
+ struct sk_buff *request, struct neighbour *n,
+ __be16 vlan_proto, u16 vlan_tci, struct nd_msg *ns)
+{
+ struct net_device *dev = request->dev;
+ struct net_bridge_vlan_group *vg;
+ struct sk_buff *reply;
+ struct nd_msg *na;
+ struct ipv6hdr *pip6;
+ int na_olen = 8; /* opt hdr + ETH_ALEN for target */
+ int ns_olen;
+ int i, len;
+ u8 *daddr;
+ u16 pvid;
+
+ if (!dev)
+ return;
+
+ len = LL_RESERVED_SPACE(dev) + sizeof(struct ipv6hdr) +
+ sizeof(*na) + na_olen + dev->needed_tailroom;
+
+ reply = alloc_skb(len, GFP_ATOMIC);
+ if (!reply)
+ return;
+
+ reply->protocol = htons(ETH_P_IPV6);
+ reply->dev = dev;
+ skb_reserve(reply, LL_RESERVED_SPACE(dev));
+ skb_push(reply, sizeof(struct ethhdr));
+ skb_set_mac_header(reply, 0);
+
+ daddr = eth_hdr(request)->h_source;
+
+ /* Do we need option processing ? */
+ ns_olen = request->len - (skb_network_offset(request) +
+ sizeof(struct ipv6hdr)) - sizeof(*ns);
+ for (i = 0; i < ns_olen - 1; i += (ns->opt[i + 1] << 3)) {
+ if (!ns->opt[i + 1]) {
+ kfree_skb(reply);
+ return;
+ }
+ if (ns->opt[i] == ND_OPT_SOURCE_LL_ADDR) {
+ daddr = ns->opt + i + sizeof(struct nd_opt_hdr);
+ break;
+ }
+ }
+
+ /* Ethernet header */
+ ether_addr_copy(eth_hdr(reply)->h_dest, daddr);
+ ether_addr_copy(eth_hdr(reply)->h_source, n->ha);
+ eth_hdr(reply)->h_proto = htons(ETH_P_IPV6);
+ reply->protocol = htons(ETH_P_IPV6);
+
+ skb_pull(reply, sizeof(struct ethhdr));
+ skb_set_network_header(reply, 0);
+ skb_put(reply, sizeof(struct ipv6hdr));
+
+ /* IPv6 header */
+ pip6 = ipv6_hdr(reply);
+ memset(pip6, 0, sizeof(struct ipv6hdr));
+ pip6->version = 6;
+ pip6->priority = ipv6_hdr(request)->priority;
+ pip6->nexthdr = IPPROTO_ICMPV6;
+ pip6->hop_limit = 255;
+ pip6->daddr = ipv6_hdr(request)->saddr;
+ pip6->saddr = *(struct in6_addr *)n->primary_key;
+
+ skb_pull(reply, sizeof(struct ipv6hdr));
+ skb_set_transport_header(reply, 0);
+
+ na = (struct nd_msg *)skb_put(reply, sizeof(*na) + na_olen);
+
+ /* Neighbor Advertisement */
+ memset(na, 0, sizeof(*na) + na_olen);
+ na->icmph.icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT;
+ na->icmph.icmp6_router = (n->flags & NTF_ROUTER) ? 1 : 0;
+ na->icmph.icmp6_override = 1;
+ na->icmph.icmp6_solicited = 1;
+ na->target = ns->target;
+ ether_addr_copy(&na->opt[2], n->ha);
+ na->opt[0] = ND_OPT_TARGET_LL_ADDR;
+ na->opt[1] = na_olen >> 3;
+
+ na->icmph.icmp6_cksum = csum_ipv6_magic(&pip6->saddr,
+ &pip6->daddr,
+ sizeof(*na) + na_olen,
+ IPPROTO_ICMPV6,
+ csum_partial(na, sizeof(*na) + na_olen, 0));
+
+ pip6->payload_len = htons(sizeof(*na) + na_olen);
+
+ skb_push(reply, sizeof(struct ipv6hdr));
+ skb_push(reply, sizeof(struct ethhdr));
+
+ reply->ip_summed = CHECKSUM_UNNECESSARY;
+
+ if (p)
+ vg = nbp_vlan_group_rcu(p);
+ else
+ vg = br_vlan_group_rcu(br);
+ pvid = br_get_pvid(vg);
+ if (pvid == (vlan_tci & VLAN_VID_MASK))
+ vlan_tci = 0;
+
+ if (vlan_tci)
+ __vlan_hwaccel_put_tag(reply, vlan_proto, vlan_tci);
+
+ netdev_dbg(dev, "nd send dev %s dst %pI6 dst_hw %pM src %pI6 src_hw %pM\n",
+ dev->name, &pip6->daddr, daddr, &pip6->saddr, n->ha);
+
+ if (p) {
+ dev_queue_xmit(reply);
+ } else {
+ skb_reset_mac_header(reply);
+ __skb_pull(reply, skb_network_offset(reply));
+ reply->ip_summed = CHECKSUM_UNNECESSARY;
+ reply->pkt_type = PACKET_HOST;
+
+ netif_rx(reply);
+ }
+}
+
+static int br_chk_addr_ip6(struct net_device *dev,
+ struct netdev_nested_priv *priv)
+{
+ struct in6_addr *addr = (struct in6_addr *)priv->data;
+
+ if (ipv6_chk_addr(dev_net(dev), addr, dev, 0))
+ return 1;
+
+ return 0;
+}
+
+static bool br_is_local_ip6(struct net_device *dev, struct in6_addr *addr)
+
+{
+ struct netdev_nested_priv priv = {
+ .data = (void *)addr,
+ };
+
+ if (br_chk_addr_ip6(dev, &priv))
+ return true;
+
+ /* check if ip is configured on upper dev */
+ if (netdev_walk_all_upper_dev_rcu(dev, br_chk_addr_ip6, &priv))
+ return true;
+
+ return false;
+}
+
+void br_do_suppress_nd(struct sk_buff *skb, struct net_bridge *br,
+ u16 vid, struct net_bridge_port *p, struct nd_msg *msg)
+{
+ struct net_device *dev = br->dev;
+ struct net_device *vlandev = NULL;
+ struct in6_addr *saddr, *daddr;
+ struct ipv6hdr *iphdr;
+ struct neighbour *n;
+
+ BR_INPUT_SKB_CB(skb)->proxyarp_replied = 0;
+
+ if (br_is_neigh_suppress_enabled(p, vid))
+ return;
+
+ if (is_unicast_ether_addr(eth_hdr(skb)->h_dest) &&
+ msg->icmph.icmp6_type == NDISC_NEIGHBOUR_SOLICITATION)
+ return;
+
+ if (msg->icmph.icmp6_type == NDISC_NEIGHBOUR_ADVERTISEMENT &&
+ !msg->icmph.icmp6_solicited) {
+ /* prevent flooding to neigh suppress ports */
+ BR_INPUT_SKB_CB(skb)->proxyarp_replied = 1;
+ return;
+ }
+
+ if (msg->icmph.icmp6_type != NDISC_NEIGHBOUR_SOLICITATION)
+ return;
+
+ iphdr = ipv6_hdr(skb);
+ saddr = &iphdr->saddr;
+ daddr = &iphdr->daddr;
+
+ if (ipv6_addr_any(saddr) || !ipv6_addr_cmp(saddr, daddr)) {
+ /* prevent flooding to neigh suppress ports */
+ BR_INPUT_SKB_CB(skb)->proxyarp_replied = 1;
+ return;
+ }
+
+ if (vid != 0) {
+ /* build neigh table lookup on the vlan device */
+ vlandev = __vlan_find_dev_deep_rcu(br->dev, skb->vlan_proto,
+ vid);
+ if (!vlandev)
+ return;
+ } else {
+ vlandev = dev;
+ }
+
+ if (br_is_local_ip6(vlandev, &msg->target)) {
+ /* its our own ip, so don't proxy reply
+ * and don't forward to arp suppress ports
+ */
+ BR_INPUT_SKB_CB(skb)->proxyarp_replied = 1;
+ return;
+ }
+
+ n = neigh_lookup(ipv6_stub->nd_tbl, &msg->target, vlandev);
+ if (n) {
+ struct net_bridge_fdb_entry *f;
+
+ if (!(READ_ONCE(n->nud_state) & NUD_VALID)) {
+ neigh_release(n);
+ return;
+ }
+
+ f = br_fdb_find_rcu(br, n->ha, vid);
+ if (f) {
+ bool replied = false;
+
+ if (br_is_neigh_suppress_enabled(f->dst, vid)) {
+ if (vid != 0)
+ br_nd_send(br, p, skb, n,
+ skb->vlan_proto,
+ skb_vlan_tag_get(skb), msg);
+ else
+ br_nd_send(br, p, skb, n, 0, 0, msg);
+ replied = true;
+ }
+
+ /* If we have replied or as long as we know the
+ * mac, indicate to NEIGH_SUPPRESS ports that we
+ * have replied
+ */
+ if (replied ||
+ br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED))
+ BR_INPUT_SKB_CB(skb)->proxyarp_replied = 1;
+ }
+ neigh_release(n);
+ }
+}
+#endif
+
+bool br_is_neigh_suppress_enabled(const struct net_bridge_port *p, u16 vid)
+{
+ if (!p)
+ return false;
+
+ if (!vid)
+ return !!(p->flags & BR_NEIGH_SUPPRESS);
+
+ if (p->flags & BR_NEIGH_VLAN_SUPPRESS) {
+ struct net_bridge_vlan_group *vg = nbp_vlan_group_rcu(p);
+ struct net_bridge_vlan *v;
+
+ v = br_vlan_find(vg, vid);
+ if (!v)
+ return false;
+ return !!(v->priv_flags & BR_VLFLAG_NEIGH_SUPPRESS_ENABLED);
+ } else {
+ return !!(p->flags & BR_NEIGH_SUPPRESS);
+ }
+}
diff --git a/net/bridge/br_cfm.c b/net/bridge/br_cfm.c
new file mode 100644
index 000000000000..c2c1c7d44c61
--- /dev/null
+++ b/net/bridge/br_cfm.c
@@ -0,0 +1,867 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/cfm_bridge.h>
+#include <uapi/linux/cfm_bridge.h>
+#include "br_private_cfm.h"
+
+static struct br_cfm_mep *br_mep_find(struct net_bridge *br, u32 instance)
+{
+ struct br_cfm_mep *mep;
+
+ hlist_for_each_entry(mep, &br->mep_list, head)
+ if (mep->instance == instance)
+ return mep;
+
+ return NULL;
+}
+
+static struct br_cfm_mep *br_mep_find_ifindex(struct net_bridge *br,
+ u32 ifindex)
+{
+ struct br_cfm_mep *mep;
+
+ hlist_for_each_entry_rcu(mep, &br->mep_list, head,
+ lockdep_rtnl_is_held())
+ if (mep->create.ifindex == ifindex)
+ return mep;
+
+ return NULL;
+}
+
+static struct br_cfm_peer_mep *br_peer_mep_find(struct br_cfm_mep *mep,
+ u32 mepid)
+{
+ struct br_cfm_peer_mep *peer_mep;
+
+ hlist_for_each_entry_rcu(peer_mep, &mep->peer_mep_list, head,
+ lockdep_rtnl_is_held())
+ if (peer_mep->mepid == mepid)
+ return peer_mep;
+
+ return NULL;
+}
+
+static struct net_bridge_port *br_mep_get_port(struct net_bridge *br,
+ u32 ifindex)
+{
+ struct net_bridge_port *port;
+
+ list_for_each_entry(port, &br->port_list, list)
+ if (port->dev->ifindex == ifindex)
+ return port;
+
+ return NULL;
+}
+
+/* Calculate the CCM interval in us. */
+static u32 interval_to_us(enum br_cfm_ccm_interval interval)
+{
+ switch (interval) {
+ case BR_CFM_CCM_INTERVAL_NONE:
+ return 0;
+ case BR_CFM_CCM_INTERVAL_3_3_MS:
+ return 3300;
+ case BR_CFM_CCM_INTERVAL_10_MS:
+ return 10 * 1000;
+ case BR_CFM_CCM_INTERVAL_100_MS:
+ return 100 * 1000;
+ case BR_CFM_CCM_INTERVAL_1_SEC:
+ return 1000 * 1000;
+ case BR_CFM_CCM_INTERVAL_10_SEC:
+ return 10 * 1000 * 1000;
+ case BR_CFM_CCM_INTERVAL_1_MIN:
+ return 60 * 1000 * 1000;
+ case BR_CFM_CCM_INTERVAL_10_MIN:
+ return 10 * 60 * 1000 * 1000;
+ }
+ return 0;
+}
+
+/* Convert the interface interval to CCM PDU value. */
+static u32 interval_to_pdu(enum br_cfm_ccm_interval interval)
+{
+ switch (interval) {
+ case BR_CFM_CCM_INTERVAL_NONE:
+ return 0;
+ case BR_CFM_CCM_INTERVAL_3_3_MS:
+ return 1;
+ case BR_CFM_CCM_INTERVAL_10_MS:
+ return 2;
+ case BR_CFM_CCM_INTERVAL_100_MS:
+ return 3;
+ case BR_CFM_CCM_INTERVAL_1_SEC:
+ return 4;
+ case BR_CFM_CCM_INTERVAL_10_SEC:
+ return 5;
+ case BR_CFM_CCM_INTERVAL_1_MIN:
+ return 6;
+ case BR_CFM_CCM_INTERVAL_10_MIN:
+ return 7;
+ }
+ return 0;
+}
+
+/* Convert the CCM PDU value to interval on interface. */
+static u32 pdu_to_interval(u32 value)
+{
+ switch (value) {
+ case 0:
+ return BR_CFM_CCM_INTERVAL_NONE;
+ case 1:
+ return BR_CFM_CCM_INTERVAL_3_3_MS;
+ case 2:
+ return BR_CFM_CCM_INTERVAL_10_MS;
+ case 3:
+ return BR_CFM_CCM_INTERVAL_100_MS;
+ case 4:
+ return BR_CFM_CCM_INTERVAL_1_SEC;
+ case 5:
+ return BR_CFM_CCM_INTERVAL_10_SEC;
+ case 6:
+ return BR_CFM_CCM_INTERVAL_1_MIN;
+ case 7:
+ return BR_CFM_CCM_INTERVAL_10_MIN;
+ }
+ return BR_CFM_CCM_INTERVAL_NONE;
+}
+
+static void ccm_rx_timer_start(struct br_cfm_peer_mep *peer_mep)
+{
+ u32 interval_us;
+
+ interval_us = interval_to_us(peer_mep->mep->cc_config.exp_interval);
+ /* Function ccm_rx_dwork must be called with 1/4
+ * of the configured CC 'expected_interval'
+ * in order to detect CCM defect after 3.25 interval.
+ */
+ queue_delayed_work(system_percpu_wq, &peer_mep->ccm_rx_dwork,
+ usecs_to_jiffies(interval_us / 4));
+}
+
+static void br_cfm_notify(int event, const struct net_bridge_port *port)
+{
+ u32 filter = RTEXT_FILTER_CFM_STATUS;
+
+ br_info_notify(event, port->br, NULL, filter);
+}
+
+static void cc_peer_enable(struct br_cfm_peer_mep *peer_mep)
+{
+ memset(&peer_mep->cc_status, 0, sizeof(peer_mep->cc_status));
+ peer_mep->ccm_rx_count_miss = 0;
+
+ ccm_rx_timer_start(peer_mep);
+}
+
+static void cc_peer_disable(struct br_cfm_peer_mep *peer_mep)
+{
+ cancel_delayed_work_sync(&peer_mep->ccm_rx_dwork);
+}
+
+static struct sk_buff *ccm_frame_build(struct br_cfm_mep *mep,
+ const struct br_cfm_cc_ccm_tx_info *const tx_info)
+
+{
+ struct br_cfm_common_hdr *common_hdr;
+ struct net_bridge_port *b_port;
+ struct br_cfm_maid *maid;
+ u8 *itu_reserved, *e_tlv;
+ struct ethhdr *eth_hdr;
+ struct sk_buff *skb;
+ __be32 *status_tlv;
+ __be32 *snumber;
+ __be16 *mepid;
+
+ skb = dev_alloc_skb(CFM_CCM_MAX_FRAME_LENGTH);
+ if (!skb)
+ return NULL;
+
+ rcu_read_lock();
+ b_port = rcu_dereference(mep->b_port);
+ if (!b_port) {
+ kfree_skb(skb);
+ rcu_read_unlock();
+ return NULL;
+ }
+ skb->dev = b_port->dev;
+ rcu_read_unlock();
+ /* The device cannot be deleted until the work_queue functions has
+ * completed. This function is called from ccm_tx_work_expired()
+ * that is a work_queue functions.
+ */
+
+ skb->protocol = htons(ETH_P_CFM);
+ skb->priority = CFM_FRAME_PRIO;
+
+ /* Ethernet header */
+ eth_hdr = skb_put(skb, sizeof(*eth_hdr));
+ ether_addr_copy(eth_hdr->h_dest, tx_info->dmac.addr);
+ ether_addr_copy(eth_hdr->h_source, mep->config.unicast_mac.addr);
+ eth_hdr->h_proto = htons(ETH_P_CFM);
+
+ /* Common CFM Header */
+ common_hdr = skb_put(skb, sizeof(*common_hdr));
+ common_hdr->mdlevel_version = mep->config.mdlevel << 5;
+ common_hdr->opcode = BR_CFM_OPCODE_CCM;
+ common_hdr->flags = (mep->rdi << 7) |
+ interval_to_pdu(mep->cc_config.exp_interval);
+ common_hdr->tlv_offset = CFM_CCM_TLV_OFFSET;
+
+ /* Sequence number */
+ snumber = skb_put(skb, sizeof(*snumber));
+ if (tx_info->seq_no_update) {
+ *snumber = cpu_to_be32(mep->ccm_tx_snumber);
+ mep->ccm_tx_snumber += 1;
+ } else {
+ *snumber = 0;
+ }
+
+ mepid = skb_put(skb, sizeof(*mepid));
+ *mepid = cpu_to_be16((u16)mep->config.mepid);
+
+ maid = skb_put(skb, sizeof(*maid));
+ memcpy(maid->data, mep->cc_config.exp_maid.data, sizeof(maid->data));
+
+ /* ITU reserved (CFM_CCM_ITU_RESERVED_SIZE octets) */
+ itu_reserved = skb_put(skb, CFM_CCM_ITU_RESERVED_SIZE);
+ memset(itu_reserved, 0, CFM_CCM_ITU_RESERVED_SIZE);
+
+ /* Generel CFM TLV format:
+ * TLV type: one byte
+ * TLV value length: two bytes
+ * TLV value: 'TLV value length' bytes
+ */
+
+ /* Port status TLV. The value length is 1. Total of 4 bytes. */
+ if (tx_info->port_tlv) {
+ status_tlv = skb_put(skb, sizeof(*status_tlv));
+ *status_tlv = cpu_to_be32((CFM_PORT_STATUS_TLV_TYPE << 24) |
+ (1 << 8) | /* Value length */
+ (tx_info->port_tlv_value & 0xFF));
+ }
+
+ /* Interface status TLV. The value length is 1. Total of 4 bytes. */
+ if (tx_info->if_tlv) {
+ status_tlv = skb_put(skb, sizeof(*status_tlv));
+ *status_tlv = cpu_to_be32((CFM_IF_STATUS_TLV_TYPE << 24) |
+ (1 << 8) | /* Value length */
+ (tx_info->if_tlv_value & 0xFF));
+ }
+
+ /* End TLV */
+ e_tlv = skb_put(skb, sizeof(*e_tlv));
+ *e_tlv = CFM_ENDE_TLV_TYPE;
+
+ return skb;
+}
+
+static void ccm_frame_tx(struct sk_buff *skb)
+{
+ skb_reset_network_header(skb);
+ dev_queue_xmit(skb);
+}
+
+/* This function is called with the configured CC 'expected_interval'
+ * in order to drive CCM transmission when enabled.
+ */
+static void ccm_tx_work_expired(struct work_struct *work)
+{
+ struct delayed_work *del_work;
+ struct br_cfm_mep *mep;
+ struct sk_buff *skb;
+ u32 interval_us;
+
+ del_work = to_delayed_work(work);
+ mep = container_of(del_work, struct br_cfm_mep, ccm_tx_dwork);
+
+ if (time_before_eq(mep->ccm_tx_end, jiffies)) {
+ /* Transmission period has ended */
+ mep->cc_ccm_tx_info.period = 0;
+ return;
+ }
+
+ skb = ccm_frame_build(mep, &mep->cc_ccm_tx_info);
+ if (skb)
+ ccm_frame_tx(skb);
+
+ interval_us = interval_to_us(mep->cc_config.exp_interval);
+ queue_delayed_work(system_percpu_wq, &mep->ccm_tx_dwork,
+ usecs_to_jiffies(interval_us));
+}
+
+/* This function is called with 1/4 of the configured CC 'expected_interval'
+ * in order to detect CCM defect after 3.25 interval.
+ */
+static void ccm_rx_work_expired(struct work_struct *work)
+{
+ struct br_cfm_peer_mep *peer_mep;
+ struct net_bridge_port *b_port;
+ struct delayed_work *del_work;
+
+ del_work = to_delayed_work(work);
+ peer_mep = container_of(del_work, struct br_cfm_peer_mep, ccm_rx_dwork);
+
+ /* After 13 counts (4 * 3,25) then 3.25 intervals are expired */
+ if (peer_mep->ccm_rx_count_miss < 13) {
+ /* 3.25 intervals are NOT expired without CCM reception */
+ peer_mep->ccm_rx_count_miss++;
+
+ /* Start timer again */
+ ccm_rx_timer_start(peer_mep);
+ } else {
+ /* 3.25 intervals are expired without CCM reception.
+ * CCM defect detected
+ */
+ peer_mep->cc_status.ccm_defect = true;
+
+ /* Change in CCM defect status - notify */
+ rcu_read_lock();
+ b_port = rcu_dereference(peer_mep->mep->b_port);
+ if (b_port)
+ br_cfm_notify(RTM_NEWLINK, b_port);
+ rcu_read_unlock();
+ }
+}
+
+static u32 ccm_tlv_extract(struct sk_buff *skb, u32 index,
+ struct br_cfm_peer_mep *peer_mep)
+{
+ __be32 *s_tlv;
+ __be32 _s_tlv;
+ u32 h_s_tlv;
+ u8 *e_tlv;
+ u8 _e_tlv;
+
+ e_tlv = skb_header_pointer(skb, index, sizeof(_e_tlv), &_e_tlv);
+ if (!e_tlv)
+ return 0;
+
+ /* TLV is present - get the status TLV */
+ s_tlv = skb_header_pointer(skb,
+ index,
+ sizeof(_s_tlv), &_s_tlv);
+ if (!s_tlv)
+ return 0;
+
+ h_s_tlv = ntohl(*s_tlv);
+ if ((h_s_tlv >> 24) == CFM_IF_STATUS_TLV_TYPE) {
+ /* Interface status TLV */
+ peer_mep->cc_status.tlv_seen = true;
+ peer_mep->cc_status.if_tlv_value = (h_s_tlv & 0xFF);
+ }
+
+ if ((h_s_tlv >> 24) == CFM_PORT_STATUS_TLV_TYPE) {
+ /* Port status TLV */
+ peer_mep->cc_status.tlv_seen = true;
+ peer_mep->cc_status.port_tlv_value = (h_s_tlv & 0xFF);
+ }
+
+ /* The Sender ID TLV is not handled */
+ /* The Organization-Specific TLV is not handled */
+
+ /* Return the length of this tlv.
+ * This is the length of the value field plus 3 bytes for size of type
+ * field and length field
+ */
+ return ((h_s_tlv >> 8) & 0xFFFF) + 3;
+}
+
+/* note: already called with rcu_read_lock */
+static int br_cfm_frame_rx(struct net_bridge_port *port, struct sk_buff *skb)
+{
+ u32 mdlevel, interval, size, index, max;
+ const struct br_cfm_common_hdr *hdr;
+ struct br_cfm_peer_mep *peer_mep;
+ const struct br_cfm_maid *maid;
+ struct br_cfm_common_hdr _hdr;
+ struct br_cfm_maid _maid;
+ struct br_cfm_mep *mep;
+ struct net_bridge *br;
+ __be32 *snumber;
+ __be32 _snumber;
+ __be16 *mepid;
+ __be16 _mepid;
+
+ if (port->state == BR_STATE_DISABLED)
+ return 0;
+
+ hdr = skb_header_pointer(skb, 0, sizeof(_hdr), &_hdr);
+ if (!hdr)
+ return 1;
+
+ br = port->br;
+ mep = br_mep_find_ifindex(br, port->dev->ifindex);
+ if (unlikely(!mep))
+ /* No MEP on this port - must be forwarded */
+ return 0;
+
+ mdlevel = hdr->mdlevel_version >> 5;
+ if (mdlevel > mep->config.mdlevel)
+ /* The level is above this MEP level - must be forwarded */
+ return 0;
+
+ if ((hdr->mdlevel_version & 0x1F) != 0) {
+ /* Invalid version */
+ mep->status.version_unexp_seen = true;
+ return 1;
+ }
+
+ if (mdlevel < mep->config.mdlevel) {
+ /* The level is below this MEP level */
+ mep->status.rx_level_low_seen = true;
+ return 1;
+ }
+
+ if (hdr->opcode == BR_CFM_OPCODE_CCM) {
+ /* CCM PDU received. */
+ /* MA ID is after common header + sequence number + MEP ID */
+ maid = skb_header_pointer(skb,
+ CFM_CCM_PDU_MAID_OFFSET,
+ sizeof(_maid), &_maid);
+ if (!maid)
+ return 1;
+ if (memcmp(maid->data, mep->cc_config.exp_maid.data,
+ sizeof(maid->data)))
+ /* MA ID not as expected */
+ return 1;
+
+ /* MEP ID is after common header + sequence number */
+ mepid = skb_header_pointer(skb,
+ CFM_CCM_PDU_MEPID_OFFSET,
+ sizeof(_mepid), &_mepid);
+ if (!mepid)
+ return 1;
+ peer_mep = br_peer_mep_find(mep, (u32)ntohs(*mepid));
+ if (!peer_mep)
+ return 1;
+
+ /* Interval is in common header flags */
+ interval = hdr->flags & 0x07;
+ if (mep->cc_config.exp_interval != pdu_to_interval(interval))
+ /* Interval not as expected */
+ return 1;
+
+ /* A valid CCM frame is received */
+ if (peer_mep->cc_status.ccm_defect) {
+ peer_mep->cc_status.ccm_defect = false;
+
+ /* Change in CCM defect status - notify */
+ br_cfm_notify(RTM_NEWLINK, port);
+
+ /* Start CCM RX timer */
+ ccm_rx_timer_start(peer_mep);
+ }
+
+ peer_mep->cc_status.seen = true;
+ peer_mep->ccm_rx_count_miss = 0;
+
+ /* RDI is in common header flags */
+ peer_mep->cc_status.rdi = (hdr->flags & 0x80) ? true : false;
+
+ /* Sequence number is after common header */
+ snumber = skb_header_pointer(skb,
+ CFM_CCM_PDU_SEQNR_OFFSET,
+ sizeof(_snumber), &_snumber);
+ if (!snumber)
+ return 1;
+ if (ntohl(*snumber) != (mep->ccm_rx_snumber + 1))
+ /* Unexpected sequence number */
+ peer_mep->cc_status.seq_unexp_seen = true;
+
+ mep->ccm_rx_snumber = ntohl(*snumber);
+
+ /* TLV end is after common header + sequence number + MEP ID +
+ * MA ID + ITU reserved
+ */
+ index = CFM_CCM_PDU_TLV_OFFSET;
+ max = 0;
+ do { /* Handle all TLVs */
+ size = ccm_tlv_extract(skb, index, peer_mep);
+ index += size;
+ max += 1;
+ } while (size != 0 && max < 4); /* Max four TLVs possible */
+
+ return 1;
+ }
+
+ mep->status.opcode_unexp_seen = true;
+
+ return 1;
+}
+
+static struct br_frame_type cfm_frame_type __read_mostly = {
+ .type = cpu_to_be16(ETH_P_CFM),
+ .frame_handler = br_cfm_frame_rx,
+};
+
+int br_cfm_mep_create(struct net_bridge *br,
+ const u32 instance,
+ struct br_cfm_mep_create *const create,
+ struct netlink_ext_ack *extack)
+{
+ struct net_bridge_port *p;
+ struct br_cfm_mep *mep;
+
+ ASSERT_RTNL();
+
+ if (create->domain == BR_CFM_VLAN) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "VLAN domain not supported");
+ return -EINVAL;
+ }
+ if (create->domain != BR_CFM_PORT) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Invalid domain value");
+ return -EINVAL;
+ }
+ if (create->direction == BR_CFM_MEP_DIRECTION_UP) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Up-MEP not supported");
+ return -EINVAL;
+ }
+ if (create->direction != BR_CFM_MEP_DIRECTION_DOWN) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Invalid direction value");
+ return -EINVAL;
+ }
+ p = br_mep_get_port(br, create->ifindex);
+ if (!p) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Port is not related to bridge");
+ return -EINVAL;
+ }
+ mep = br_mep_find(br, instance);
+ if (mep) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "MEP instance already exists");
+ return -EEXIST;
+ }
+
+ /* In PORT domain only one instance can be created per port */
+ if (create->domain == BR_CFM_PORT) {
+ mep = br_mep_find_ifindex(br, create->ifindex);
+ if (mep) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Only one Port MEP on a port allowed");
+ return -EINVAL;
+ }
+ }
+
+ mep = kzalloc(sizeof(*mep), GFP_KERNEL);
+ if (!mep)
+ return -ENOMEM;
+
+ mep->create = *create;
+ mep->instance = instance;
+ rcu_assign_pointer(mep->b_port, p);
+
+ INIT_HLIST_HEAD(&mep->peer_mep_list);
+ INIT_DELAYED_WORK(&mep->ccm_tx_dwork, ccm_tx_work_expired);
+
+ if (hlist_empty(&br->mep_list))
+ br_add_frame(br, &cfm_frame_type);
+
+ hlist_add_tail_rcu(&mep->head, &br->mep_list);
+
+ return 0;
+}
+
+static void mep_delete_implementation(struct net_bridge *br,
+ struct br_cfm_mep *mep)
+{
+ struct br_cfm_peer_mep *peer_mep;
+ struct hlist_node *n_store;
+
+ ASSERT_RTNL();
+
+ /* Empty and free peer MEP list */
+ hlist_for_each_entry_safe(peer_mep, n_store, &mep->peer_mep_list, head) {
+ cancel_delayed_work_sync(&peer_mep->ccm_rx_dwork);
+ hlist_del_rcu(&peer_mep->head);
+ kfree_rcu(peer_mep, rcu);
+ }
+
+ cancel_delayed_work_sync(&mep->ccm_tx_dwork);
+
+ RCU_INIT_POINTER(mep->b_port, NULL);
+ hlist_del_rcu(&mep->head);
+ kfree_rcu(mep, rcu);
+
+ if (hlist_empty(&br->mep_list))
+ br_del_frame(br, &cfm_frame_type);
+}
+
+int br_cfm_mep_delete(struct net_bridge *br,
+ const u32 instance,
+ struct netlink_ext_ack *extack)
+{
+ struct br_cfm_mep *mep;
+
+ ASSERT_RTNL();
+
+ mep = br_mep_find(br, instance);
+ if (!mep) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "MEP instance does not exists");
+ return -ENOENT;
+ }
+
+ mep_delete_implementation(br, mep);
+
+ return 0;
+}
+
+int br_cfm_mep_config_set(struct net_bridge *br,
+ const u32 instance,
+ const struct br_cfm_mep_config *const config,
+ struct netlink_ext_ack *extack)
+{
+ struct br_cfm_mep *mep;
+
+ ASSERT_RTNL();
+
+ mep = br_mep_find(br, instance);
+ if (!mep) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "MEP instance does not exists");
+ return -ENOENT;
+ }
+
+ mep->config = *config;
+
+ return 0;
+}
+
+int br_cfm_cc_config_set(struct net_bridge *br,
+ const u32 instance,
+ const struct br_cfm_cc_config *const config,
+ struct netlink_ext_ack *extack)
+{
+ struct br_cfm_peer_mep *peer_mep;
+ struct br_cfm_mep *mep;
+
+ ASSERT_RTNL();
+
+ mep = br_mep_find(br, instance);
+ if (!mep) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "MEP instance does not exists");
+ return -ENOENT;
+ }
+
+ /* Check for no change in configuration */
+ if (memcmp(config, &mep->cc_config, sizeof(*config)) == 0)
+ return 0;
+
+ if (config->enable && !mep->cc_config.enable)
+ /* CC is enabled */
+ hlist_for_each_entry(peer_mep, &mep->peer_mep_list, head)
+ cc_peer_enable(peer_mep);
+
+ if (!config->enable && mep->cc_config.enable)
+ /* CC is disabled */
+ hlist_for_each_entry(peer_mep, &mep->peer_mep_list, head)
+ cc_peer_disable(peer_mep);
+
+ mep->cc_config = *config;
+ mep->ccm_rx_snumber = 0;
+ mep->ccm_tx_snumber = 1;
+
+ return 0;
+}
+
+int br_cfm_cc_peer_mep_add(struct net_bridge *br, const u32 instance,
+ u32 mepid,
+ struct netlink_ext_ack *extack)
+{
+ struct br_cfm_peer_mep *peer_mep;
+ struct br_cfm_mep *mep;
+
+ ASSERT_RTNL();
+
+ mep = br_mep_find(br, instance);
+ if (!mep) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "MEP instance does not exists");
+ return -ENOENT;
+ }
+
+ peer_mep = br_peer_mep_find(mep, mepid);
+ if (peer_mep) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Peer MEP-ID already exists");
+ return -EEXIST;
+ }
+
+ peer_mep = kzalloc(sizeof(*peer_mep), GFP_KERNEL);
+ if (!peer_mep)
+ return -ENOMEM;
+
+ peer_mep->mepid = mepid;
+ peer_mep->mep = mep;
+ INIT_DELAYED_WORK(&peer_mep->ccm_rx_dwork, ccm_rx_work_expired);
+
+ if (mep->cc_config.enable)
+ cc_peer_enable(peer_mep);
+
+ hlist_add_tail_rcu(&peer_mep->head, &mep->peer_mep_list);
+
+ return 0;
+}
+
+int br_cfm_cc_peer_mep_remove(struct net_bridge *br, const u32 instance,
+ u32 mepid,
+ struct netlink_ext_ack *extack)
+{
+ struct br_cfm_peer_mep *peer_mep;
+ struct br_cfm_mep *mep;
+
+ ASSERT_RTNL();
+
+ mep = br_mep_find(br, instance);
+ if (!mep) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "MEP instance does not exists");
+ return -ENOENT;
+ }
+
+ peer_mep = br_peer_mep_find(mep, mepid);
+ if (!peer_mep) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Peer MEP-ID does not exists");
+ return -ENOENT;
+ }
+
+ cc_peer_disable(peer_mep);
+
+ hlist_del_rcu(&peer_mep->head);
+ kfree_rcu(peer_mep, rcu);
+
+ return 0;
+}
+
+int br_cfm_cc_rdi_set(struct net_bridge *br, const u32 instance,
+ const bool rdi, struct netlink_ext_ack *extack)
+{
+ struct br_cfm_mep *mep;
+
+ ASSERT_RTNL();
+
+ mep = br_mep_find(br, instance);
+ if (!mep) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "MEP instance does not exists");
+ return -ENOENT;
+ }
+
+ mep->rdi = rdi;
+
+ return 0;
+}
+
+int br_cfm_cc_ccm_tx(struct net_bridge *br, const u32 instance,
+ const struct br_cfm_cc_ccm_tx_info *const tx_info,
+ struct netlink_ext_ack *extack)
+{
+ struct br_cfm_mep *mep;
+
+ ASSERT_RTNL();
+
+ mep = br_mep_find(br, instance);
+ if (!mep) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "MEP instance does not exists");
+ return -ENOENT;
+ }
+
+ if (memcmp(tx_info, &mep->cc_ccm_tx_info, sizeof(*tx_info)) == 0) {
+ /* No change in tx_info. */
+ if (mep->cc_ccm_tx_info.period == 0)
+ /* Transmission is not enabled - just return */
+ return 0;
+
+ /* Transmission is ongoing, the end time is recalculated */
+ mep->ccm_tx_end = jiffies +
+ usecs_to_jiffies(tx_info->period * 1000000);
+ return 0;
+ }
+
+ if (tx_info->period == 0 && mep->cc_ccm_tx_info.period == 0)
+ /* Some change in info and transmission is not ongoing */
+ goto save;
+
+ if (tx_info->period != 0 && mep->cc_ccm_tx_info.period != 0) {
+ /* Some change in info and transmission is ongoing
+ * The end time is recalculated
+ */
+ mep->ccm_tx_end = jiffies +
+ usecs_to_jiffies(tx_info->period * 1000000);
+
+ goto save;
+ }
+
+ if (tx_info->period == 0 && mep->cc_ccm_tx_info.period != 0) {
+ cancel_delayed_work_sync(&mep->ccm_tx_dwork);
+ goto save;
+ }
+
+ /* Start delayed work to transmit CCM frames. It is done with zero delay
+ * to send first frame immediately
+ */
+ mep->ccm_tx_end = jiffies + usecs_to_jiffies(tx_info->period * 1000000);
+ queue_delayed_work(system_percpu_wq, &mep->ccm_tx_dwork, 0);
+
+save:
+ mep->cc_ccm_tx_info = *tx_info;
+
+ return 0;
+}
+
+int br_cfm_mep_count(struct net_bridge *br, u32 *count)
+{
+ struct br_cfm_mep *mep;
+
+ *count = 0;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(mep, &br->mep_list, head)
+ *count += 1;
+ rcu_read_unlock();
+
+ return 0;
+}
+
+int br_cfm_peer_mep_count(struct net_bridge *br, u32 *count)
+{
+ struct br_cfm_peer_mep *peer_mep;
+ struct br_cfm_mep *mep;
+
+ *count = 0;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(mep, &br->mep_list, head)
+ hlist_for_each_entry_rcu(peer_mep, &mep->peer_mep_list, head)
+ *count += 1;
+ rcu_read_unlock();
+
+ return 0;
+}
+
+bool br_cfm_created(struct net_bridge *br)
+{
+ return !hlist_empty(&br->mep_list);
+}
+
+/* Deletes the CFM instances on a specific bridge port
+ */
+void br_cfm_port_del(struct net_bridge *br, struct net_bridge_port *port)
+{
+ struct hlist_node *n_store;
+ struct br_cfm_mep *mep;
+
+ ASSERT_RTNL();
+
+ hlist_for_each_entry_safe(mep, n_store, &br->mep_list, head)
+ if (mep->create.ifindex == port->dev->ifindex)
+ mep_delete_implementation(br, mep);
+}
diff --git a/net/bridge/br_cfm_netlink.c b/net/bridge/br_cfm_netlink.c
new file mode 100644
index 000000000000..2faab44652e7
--- /dev/null
+++ b/net/bridge/br_cfm_netlink.c
@@ -0,0 +1,726 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <net/genetlink.h>
+
+#include "br_private.h"
+#include "br_private_cfm.h"
+
+static const struct nla_policy
+br_cfm_mep_create_policy[IFLA_BRIDGE_CFM_MEP_CREATE_MAX + 1] = {
+ [IFLA_BRIDGE_CFM_MEP_CREATE_UNSPEC] = { .type = NLA_REJECT },
+ [IFLA_BRIDGE_CFM_MEP_CREATE_INSTANCE] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_CFM_MEP_CREATE_DOMAIN] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_CFM_MEP_CREATE_DIRECTION] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_CFM_MEP_CREATE_IFINDEX] = { .type = NLA_U32 },
+};
+
+static const struct nla_policy
+br_cfm_mep_delete_policy[IFLA_BRIDGE_CFM_MEP_DELETE_MAX + 1] = {
+ [IFLA_BRIDGE_CFM_MEP_DELETE_UNSPEC] = { .type = NLA_REJECT },
+ [IFLA_BRIDGE_CFM_MEP_DELETE_INSTANCE] = { .type = NLA_U32 },
+};
+
+static const struct nla_policy
+br_cfm_mep_config_policy[IFLA_BRIDGE_CFM_MEP_CONFIG_MAX + 1] = {
+ [IFLA_BRIDGE_CFM_MEP_CONFIG_UNSPEC] = { .type = NLA_REJECT },
+ [IFLA_BRIDGE_CFM_MEP_CONFIG_INSTANCE] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_CFM_MEP_CONFIG_UNICAST_MAC] = NLA_POLICY_ETH_ADDR,
+ [IFLA_BRIDGE_CFM_MEP_CONFIG_MDLEVEL] = NLA_POLICY_MAX(NLA_U32, 7),
+ [IFLA_BRIDGE_CFM_MEP_CONFIG_MEPID] = NLA_POLICY_MAX(NLA_U32, 0x1FFF),
+};
+
+static const struct nla_policy
+br_cfm_cc_config_policy[IFLA_BRIDGE_CFM_CC_CONFIG_MAX + 1] = {
+ [IFLA_BRIDGE_CFM_CC_CONFIG_UNSPEC] = { .type = NLA_REJECT },
+ [IFLA_BRIDGE_CFM_CC_CONFIG_INSTANCE] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_CFM_CC_CONFIG_ENABLE] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_CFM_CC_CONFIG_EXP_INTERVAL] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_CFM_CC_CONFIG_EXP_MAID] = {
+ .type = NLA_BINARY, .len = CFM_MAID_LENGTH },
+};
+
+static const struct nla_policy
+br_cfm_cc_peer_mep_policy[IFLA_BRIDGE_CFM_CC_PEER_MEP_MAX + 1] = {
+ [IFLA_BRIDGE_CFM_CC_PEER_MEP_UNSPEC] = { .type = NLA_REJECT },
+ [IFLA_BRIDGE_CFM_CC_PEER_MEP_INSTANCE] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_CFM_CC_PEER_MEPID] = NLA_POLICY_MAX(NLA_U32, 0x1FFF),
+};
+
+static const struct nla_policy
+br_cfm_cc_rdi_policy[IFLA_BRIDGE_CFM_CC_RDI_MAX + 1] = {
+ [IFLA_BRIDGE_CFM_CC_RDI_UNSPEC] = { .type = NLA_REJECT },
+ [IFLA_BRIDGE_CFM_CC_RDI_INSTANCE] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_CFM_CC_RDI_RDI] = { .type = NLA_U32 },
+};
+
+static const struct nla_policy
+br_cfm_cc_ccm_tx_policy[IFLA_BRIDGE_CFM_CC_CCM_TX_MAX + 1] = {
+ [IFLA_BRIDGE_CFM_CC_CCM_TX_UNSPEC] = { .type = NLA_REJECT },
+ [IFLA_BRIDGE_CFM_CC_CCM_TX_INSTANCE] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_CFM_CC_CCM_TX_DMAC] = NLA_POLICY_ETH_ADDR,
+ [IFLA_BRIDGE_CFM_CC_CCM_TX_SEQ_NO_UPDATE] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_CFM_CC_CCM_TX_PERIOD] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_CFM_CC_CCM_TX_IF_TLV] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_CFM_CC_CCM_TX_IF_TLV_VALUE] = { .type = NLA_U8 },
+ [IFLA_BRIDGE_CFM_CC_CCM_TX_PORT_TLV] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_CFM_CC_CCM_TX_PORT_TLV_VALUE] = { .type = NLA_U8 },
+};
+
+static const struct nla_policy
+br_cfm_policy[IFLA_BRIDGE_CFM_MAX + 1] = {
+ [IFLA_BRIDGE_CFM_UNSPEC] = { .type = NLA_REJECT },
+ [IFLA_BRIDGE_CFM_MEP_CREATE] =
+ NLA_POLICY_NESTED(br_cfm_mep_create_policy),
+ [IFLA_BRIDGE_CFM_MEP_DELETE] =
+ NLA_POLICY_NESTED(br_cfm_mep_delete_policy),
+ [IFLA_BRIDGE_CFM_MEP_CONFIG] =
+ NLA_POLICY_NESTED(br_cfm_mep_config_policy),
+ [IFLA_BRIDGE_CFM_CC_CONFIG] =
+ NLA_POLICY_NESTED(br_cfm_cc_config_policy),
+ [IFLA_BRIDGE_CFM_CC_PEER_MEP_ADD] =
+ NLA_POLICY_NESTED(br_cfm_cc_peer_mep_policy),
+ [IFLA_BRIDGE_CFM_CC_PEER_MEP_REMOVE] =
+ NLA_POLICY_NESTED(br_cfm_cc_peer_mep_policy),
+ [IFLA_BRIDGE_CFM_CC_RDI] =
+ NLA_POLICY_NESTED(br_cfm_cc_rdi_policy),
+ [IFLA_BRIDGE_CFM_CC_CCM_TX] =
+ NLA_POLICY_NESTED(br_cfm_cc_ccm_tx_policy),
+};
+
+static int br_mep_create_parse(struct net_bridge *br, struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[IFLA_BRIDGE_CFM_MEP_CREATE_MAX + 1];
+ struct br_cfm_mep_create create;
+ u32 instance;
+ int err;
+
+ err = nla_parse_nested(tb, IFLA_BRIDGE_CFM_MEP_CREATE_MAX, attr,
+ br_cfm_mep_create_policy, extack);
+ if (err)
+ return err;
+
+ if (!tb[IFLA_BRIDGE_CFM_MEP_CREATE_INSTANCE]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing INSTANCE attribute");
+ return -EINVAL;
+ }
+ if (!tb[IFLA_BRIDGE_CFM_MEP_CREATE_DOMAIN]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing DOMAIN attribute");
+ return -EINVAL;
+ }
+ if (!tb[IFLA_BRIDGE_CFM_MEP_CREATE_DIRECTION]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing DIRECTION attribute");
+ return -EINVAL;
+ }
+ if (!tb[IFLA_BRIDGE_CFM_MEP_CREATE_IFINDEX]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing IFINDEX attribute");
+ return -EINVAL;
+ }
+
+ memset(&create, 0, sizeof(create));
+
+ instance = nla_get_u32(tb[IFLA_BRIDGE_CFM_MEP_CREATE_INSTANCE]);
+ create.domain = nla_get_u32(tb[IFLA_BRIDGE_CFM_MEP_CREATE_DOMAIN]);
+ create.direction = nla_get_u32(tb[IFLA_BRIDGE_CFM_MEP_CREATE_DIRECTION]);
+ create.ifindex = nla_get_u32(tb[IFLA_BRIDGE_CFM_MEP_CREATE_IFINDEX]);
+
+ return br_cfm_mep_create(br, instance, &create, extack);
+}
+
+static int br_mep_delete_parse(struct net_bridge *br, struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[IFLA_BRIDGE_CFM_MEP_DELETE_MAX + 1];
+ u32 instance;
+ int err;
+
+ err = nla_parse_nested(tb, IFLA_BRIDGE_CFM_MEP_DELETE_MAX, attr,
+ br_cfm_mep_delete_policy, extack);
+ if (err)
+ return err;
+
+ if (!tb[IFLA_BRIDGE_CFM_MEP_DELETE_INSTANCE]) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Missing INSTANCE attribute");
+ return -EINVAL;
+ }
+
+ instance = nla_get_u32(tb[IFLA_BRIDGE_CFM_MEP_DELETE_INSTANCE]);
+
+ return br_cfm_mep_delete(br, instance, extack);
+}
+
+static int br_mep_config_parse(struct net_bridge *br, struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[IFLA_BRIDGE_CFM_MEP_CONFIG_MAX + 1];
+ struct br_cfm_mep_config config;
+ u32 instance;
+ int err;
+
+ err = nla_parse_nested(tb, IFLA_BRIDGE_CFM_MEP_CONFIG_MAX, attr,
+ br_cfm_mep_config_policy, extack);
+ if (err)
+ return err;
+
+ if (!tb[IFLA_BRIDGE_CFM_MEP_CONFIG_INSTANCE]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing INSTANCE attribute");
+ return -EINVAL;
+ }
+ if (!tb[IFLA_BRIDGE_CFM_MEP_CONFIG_UNICAST_MAC]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing UNICAST_MAC attribute");
+ return -EINVAL;
+ }
+ if (!tb[IFLA_BRIDGE_CFM_MEP_CONFIG_MDLEVEL]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing MDLEVEL attribute");
+ return -EINVAL;
+ }
+ if (!tb[IFLA_BRIDGE_CFM_MEP_CONFIG_MEPID]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing MEPID attribute");
+ return -EINVAL;
+ }
+
+ memset(&config, 0, sizeof(config));
+
+ instance = nla_get_u32(tb[IFLA_BRIDGE_CFM_MEP_CONFIG_INSTANCE]);
+ nla_memcpy(&config.unicast_mac.addr,
+ tb[IFLA_BRIDGE_CFM_MEP_CONFIG_UNICAST_MAC],
+ sizeof(config.unicast_mac.addr));
+ config.mdlevel = nla_get_u32(tb[IFLA_BRIDGE_CFM_MEP_CONFIG_MDLEVEL]);
+ config.mepid = nla_get_u32(tb[IFLA_BRIDGE_CFM_MEP_CONFIG_MEPID]);
+
+ return br_cfm_mep_config_set(br, instance, &config, extack);
+}
+
+static int br_cc_config_parse(struct net_bridge *br, struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[IFLA_BRIDGE_CFM_CC_CONFIG_MAX + 1];
+ struct br_cfm_cc_config config;
+ u32 instance;
+ int err;
+
+ err = nla_parse_nested(tb, IFLA_BRIDGE_CFM_CC_CONFIG_MAX, attr,
+ br_cfm_cc_config_policy, extack);
+ if (err)
+ return err;
+
+ if (!tb[IFLA_BRIDGE_CFM_CC_CONFIG_INSTANCE]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing INSTANCE attribute");
+ return -EINVAL;
+ }
+ if (!tb[IFLA_BRIDGE_CFM_CC_CONFIG_ENABLE]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing ENABLE attribute");
+ return -EINVAL;
+ }
+ if (!tb[IFLA_BRIDGE_CFM_CC_CONFIG_EXP_INTERVAL]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing INTERVAL attribute");
+ return -EINVAL;
+ }
+ if (!tb[IFLA_BRIDGE_CFM_CC_CONFIG_EXP_MAID]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing MAID attribute");
+ return -EINVAL;
+ }
+
+ memset(&config, 0, sizeof(config));
+
+ instance = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_CONFIG_INSTANCE]);
+ config.enable = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_CONFIG_ENABLE]);
+ config.exp_interval = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_CONFIG_EXP_INTERVAL]);
+ nla_memcpy(&config.exp_maid.data, tb[IFLA_BRIDGE_CFM_CC_CONFIG_EXP_MAID],
+ sizeof(config.exp_maid.data));
+
+ return br_cfm_cc_config_set(br, instance, &config, extack);
+}
+
+static int br_cc_peer_mep_add_parse(struct net_bridge *br, struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_MAX + 1];
+ u32 instance, peer_mep_id;
+ int err;
+
+ err = nla_parse_nested(tb, IFLA_BRIDGE_CFM_CC_PEER_MEP_MAX, attr,
+ br_cfm_cc_peer_mep_policy, extack);
+ if (err)
+ return err;
+
+ if (!tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_INSTANCE]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing INSTANCE attribute");
+ return -EINVAL;
+ }
+ if (!tb[IFLA_BRIDGE_CFM_CC_PEER_MEPID]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing PEER_MEP_ID attribute");
+ return -EINVAL;
+ }
+
+ instance = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_INSTANCE]);
+ peer_mep_id = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_PEER_MEPID]);
+
+ return br_cfm_cc_peer_mep_add(br, instance, peer_mep_id, extack);
+}
+
+static int br_cc_peer_mep_remove_parse(struct net_bridge *br, struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_MAX + 1];
+ u32 instance, peer_mep_id;
+ int err;
+
+ err = nla_parse_nested(tb, IFLA_BRIDGE_CFM_CC_PEER_MEP_MAX, attr,
+ br_cfm_cc_peer_mep_policy, extack);
+ if (err)
+ return err;
+
+ if (!tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_INSTANCE]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing INSTANCE attribute");
+ return -EINVAL;
+ }
+ if (!tb[IFLA_BRIDGE_CFM_CC_PEER_MEPID]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing PEER_MEP_ID attribute");
+ return -EINVAL;
+ }
+
+ instance = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_INSTANCE]);
+ peer_mep_id = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_PEER_MEPID]);
+
+ return br_cfm_cc_peer_mep_remove(br, instance, peer_mep_id, extack);
+}
+
+static int br_cc_rdi_parse(struct net_bridge *br, struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[IFLA_BRIDGE_CFM_CC_RDI_MAX + 1];
+ u32 instance, rdi;
+ int err;
+
+ err = nla_parse_nested(tb, IFLA_BRIDGE_CFM_CC_RDI_MAX, attr,
+ br_cfm_cc_rdi_policy, extack);
+ if (err)
+ return err;
+
+ if (!tb[IFLA_BRIDGE_CFM_CC_RDI_INSTANCE]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing INSTANCE attribute");
+ return -EINVAL;
+ }
+ if (!tb[IFLA_BRIDGE_CFM_CC_RDI_RDI]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing RDI attribute");
+ return -EINVAL;
+ }
+
+ instance = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_RDI_INSTANCE]);
+ rdi = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_RDI_RDI]);
+
+ return br_cfm_cc_rdi_set(br, instance, rdi, extack);
+}
+
+static int br_cc_ccm_tx_parse(struct net_bridge *br, struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[IFLA_BRIDGE_CFM_CC_CCM_TX_MAX + 1];
+ struct br_cfm_cc_ccm_tx_info tx_info;
+ u32 instance;
+ int err;
+
+ err = nla_parse_nested(tb, IFLA_BRIDGE_CFM_CC_CCM_TX_MAX, attr,
+ br_cfm_cc_ccm_tx_policy, extack);
+ if (err)
+ return err;
+
+ if (!tb[IFLA_BRIDGE_CFM_CC_CCM_TX_INSTANCE]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing INSTANCE attribute");
+ return -EINVAL;
+ }
+ if (!tb[IFLA_BRIDGE_CFM_CC_CCM_TX_DMAC]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing DMAC attribute");
+ return -EINVAL;
+ }
+ if (!tb[IFLA_BRIDGE_CFM_CC_CCM_TX_SEQ_NO_UPDATE]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing SEQ_NO_UPDATE attribute");
+ return -EINVAL;
+ }
+ if (!tb[IFLA_BRIDGE_CFM_CC_CCM_TX_PERIOD]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing PERIOD attribute");
+ return -EINVAL;
+ }
+ if (!tb[IFLA_BRIDGE_CFM_CC_CCM_TX_IF_TLV]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing IF_TLV attribute");
+ return -EINVAL;
+ }
+ if (!tb[IFLA_BRIDGE_CFM_CC_CCM_TX_IF_TLV_VALUE]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing IF_TLV_VALUE attribute");
+ return -EINVAL;
+ }
+ if (!tb[IFLA_BRIDGE_CFM_CC_CCM_TX_PORT_TLV]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing PORT_TLV attribute");
+ return -EINVAL;
+ }
+ if (!tb[IFLA_BRIDGE_CFM_CC_CCM_TX_PORT_TLV_VALUE]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing PORT_TLV_VALUE attribute");
+ return -EINVAL;
+ }
+
+ memset(&tx_info, 0, sizeof(tx_info));
+
+ instance = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_CCM_TX_INSTANCE]);
+ nla_memcpy(&tx_info.dmac.addr,
+ tb[IFLA_BRIDGE_CFM_CC_CCM_TX_DMAC],
+ sizeof(tx_info.dmac.addr));
+ tx_info.seq_no_update = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_CCM_TX_SEQ_NO_UPDATE]);
+ tx_info.period = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_CCM_TX_PERIOD]);
+ tx_info.if_tlv = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_CCM_TX_IF_TLV]);
+ tx_info.if_tlv_value = nla_get_u8(tb[IFLA_BRIDGE_CFM_CC_CCM_TX_IF_TLV_VALUE]);
+ tx_info.port_tlv = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_CCM_TX_PORT_TLV]);
+ tx_info.port_tlv_value = nla_get_u8(tb[IFLA_BRIDGE_CFM_CC_CCM_TX_PORT_TLV_VALUE]);
+
+ return br_cfm_cc_ccm_tx(br, instance, &tx_info, extack);
+}
+
+int br_cfm_parse(struct net_bridge *br, struct net_bridge_port *p,
+ struct nlattr *attr, int cmd, struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[IFLA_BRIDGE_CFM_MAX + 1];
+ int err;
+
+ /* When this function is called for a port then the br pointer is
+ * invalid, therefor set the br to point correctly
+ */
+ if (p)
+ br = p->br;
+
+ err = nla_parse_nested(tb, IFLA_BRIDGE_CFM_MAX, attr,
+ br_cfm_policy, extack);
+ if (err)
+ return err;
+
+ if (tb[IFLA_BRIDGE_CFM_MEP_CREATE]) {
+ err = br_mep_create_parse(br, tb[IFLA_BRIDGE_CFM_MEP_CREATE],
+ extack);
+ if (err)
+ return err;
+ }
+
+ if (tb[IFLA_BRIDGE_CFM_MEP_DELETE]) {
+ err = br_mep_delete_parse(br, tb[IFLA_BRIDGE_CFM_MEP_DELETE],
+ extack);
+ if (err)
+ return err;
+ }
+
+ if (tb[IFLA_BRIDGE_CFM_MEP_CONFIG]) {
+ err = br_mep_config_parse(br, tb[IFLA_BRIDGE_CFM_MEP_CONFIG],
+ extack);
+ if (err)
+ return err;
+ }
+
+ if (tb[IFLA_BRIDGE_CFM_CC_CONFIG]) {
+ err = br_cc_config_parse(br, tb[IFLA_BRIDGE_CFM_CC_CONFIG],
+ extack);
+ if (err)
+ return err;
+ }
+
+ if (tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_ADD]) {
+ err = br_cc_peer_mep_add_parse(br, tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_ADD],
+ extack);
+ if (err)
+ return err;
+ }
+
+ if (tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_REMOVE]) {
+ err = br_cc_peer_mep_remove_parse(br, tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_REMOVE],
+ extack);
+ if (err)
+ return err;
+ }
+
+ if (tb[IFLA_BRIDGE_CFM_CC_RDI]) {
+ err = br_cc_rdi_parse(br, tb[IFLA_BRIDGE_CFM_CC_RDI],
+ extack);
+ if (err)
+ return err;
+ }
+
+ if (tb[IFLA_BRIDGE_CFM_CC_CCM_TX]) {
+ err = br_cc_ccm_tx_parse(br, tb[IFLA_BRIDGE_CFM_CC_CCM_TX],
+ extack);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+int br_cfm_config_fill_info(struct sk_buff *skb, struct net_bridge *br)
+{
+ struct br_cfm_peer_mep *peer_mep;
+ struct br_cfm_mep *mep;
+ struct nlattr *tb;
+
+ hlist_for_each_entry_rcu(mep, &br->mep_list, head) {
+ tb = nla_nest_start(skb, IFLA_BRIDGE_CFM_MEP_CREATE_INFO);
+ if (!tb)
+ goto nla_info_failure;
+
+ if (nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_CREATE_INSTANCE,
+ mep->instance))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_CREATE_DOMAIN,
+ mep->create.domain))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_CREATE_DIRECTION,
+ mep->create.direction))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_CREATE_IFINDEX,
+ mep->create.ifindex))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, tb);
+
+ tb = nla_nest_start(skb, IFLA_BRIDGE_CFM_MEP_CONFIG_INFO);
+
+ if (!tb)
+ goto nla_info_failure;
+
+ if (nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_CONFIG_INSTANCE,
+ mep->instance))
+ goto nla_put_failure;
+
+ if (nla_put(skb, IFLA_BRIDGE_CFM_MEP_CONFIG_UNICAST_MAC,
+ sizeof(mep->config.unicast_mac.addr),
+ mep->config.unicast_mac.addr))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_CONFIG_MDLEVEL,
+ mep->config.mdlevel))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_CONFIG_MEPID,
+ mep->config.mepid))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, tb);
+
+ tb = nla_nest_start(skb, IFLA_BRIDGE_CFM_CC_CONFIG_INFO);
+
+ if (!tb)
+ goto nla_info_failure;
+
+ if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_CONFIG_INSTANCE,
+ mep->instance))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_CONFIG_ENABLE,
+ mep->cc_config.enable))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_CONFIG_EXP_INTERVAL,
+ mep->cc_config.exp_interval))
+ goto nla_put_failure;
+
+ if (nla_put(skb, IFLA_BRIDGE_CFM_CC_CONFIG_EXP_MAID,
+ sizeof(mep->cc_config.exp_maid.data),
+ mep->cc_config.exp_maid.data))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, tb);
+
+ tb = nla_nest_start(skb, IFLA_BRIDGE_CFM_CC_RDI_INFO);
+
+ if (!tb)
+ goto nla_info_failure;
+
+ if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_RDI_INSTANCE,
+ mep->instance))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_RDI_RDI,
+ mep->rdi))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, tb);
+
+ tb = nla_nest_start(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_INFO);
+
+ if (!tb)
+ goto nla_info_failure;
+
+ if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_INSTANCE,
+ mep->instance))
+ goto nla_put_failure;
+
+ if (nla_put(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_DMAC,
+ sizeof(mep->cc_ccm_tx_info.dmac),
+ mep->cc_ccm_tx_info.dmac.addr))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_SEQ_NO_UPDATE,
+ mep->cc_ccm_tx_info.seq_no_update))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_PERIOD,
+ mep->cc_ccm_tx_info.period))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_IF_TLV,
+ mep->cc_ccm_tx_info.if_tlv))
+ goto nla_put_failure;
+
+ if (nla_put_u8(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_IF_TLV_VALUE,
+ mep->cc_ccm_tx_info.if_tlv_value))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_PORT_TLV,
+ mep->cc_ccm_tx_info.port_tlv))
+ goto nla_put_failure;
+
+ if (nla_put_u8(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_PORT_TLV_VALUE,
+ mep->cc_ccm_tx_info.port_tlv_value))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, tb);
+
+ hlist_for_each_entry_rcu(peer_mep, &mep->peer_mep_list, head) {
+ tb = nla_nest_start(skb,
+ IFLA_BRIDGE_CFM_CC_PEER_MEP_INFO);
+
+ if (!tb)
+ goto nla_info_failure;
+
+ if (nla_put_u32(skb,
+ IFLA_BRIDGE_CFM_CC_PEER_MEP_INSTANCE,
+ mep->instance))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_PEER_MEPID,
+ peer_mep->mepid))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, tb);
+ }
+ }
+
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, tb);
+
+nla_info_failure:
+ return -EMSGSIZE;
+}
+
+int br_cfm_status_fill_info(struct sk_buff *skb,
+ struct net_bridge *br,
+ bool getlink)
+{
+ struct br_cfm_peer_mep *peer_mep;
+ struct br_cfm_mep *mep;
+ struct nlattr *tb;
+
+ hlist_for_each_entry_rcu(mep, &br->mep_list, head) {
+ tb = nla_nest_start(skb, IFLA_BRIDGE_CFM_MEP_STATUS_INFO);
+ if (!tb)
+ goto nla_info_failure;
+
+ if (nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_STATUS_INSTANCE,
+ mep->instance))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb,
+ IFLA_BRIDGE_CFM_MEP_STATUS_OPCODE_UNEXP_SEEN,
+ mep->status.opcode_unexp_seen))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb,
+ IFLA_BRIDGE_CFM_MEP_STATUS_VERSION_UNEXP_SEEN,
+ mep->status.version_unexp_seen))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb,
+ IFLA_BRIDGE_CFM_MEP_STATUS_RX_LEVEL_LOW_SEEN,
+ mep->status.rx_level_low_seen))
+ goto nla_put_failure;
+
+ /* Only clear if this is a GETLINK */
+ if (getlink) {
+ /* Clear all 'seen' indications */
+ mep->status.opcode_unexp_seen = false;
+ mep->status.version_unexp_seen = false;
+ mep->status.rx_level_low_seen = false;
+ }
+
+ nla_nest_end(skb, tb);
+
+ hlist_for_each_entry_rcu(peer_mep, &mep->peer_mep_list, head) {
+ tb = nla_nest_start(skb,
+ IFLA_BRIDGE_CFM_CC_PEER_STATUS_INFO);
+ if (!tb)
+ goto nla_info_failure;
+
+ if (nla_put_u32(skb,
+ IFLA_BRIDGE_CFM_CC_PEER_STATUS_INSTANCE,
+ mep->instance))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb,
+ IFLA_BRIDGE_CFM_CC_PEER_STATUS_PEER_MEPID,
+ peer_mep->mepid))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb,
+ IFLA_BRIDGE_CFM_CC_PEER_STATUS_CCM_DEFECT,
+ peer_mep->cc_status.ccm_defect))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_PEER_STATUS_RDI,
+ peer_mep->cc_status.rdi))
+ goto nla_put_failure;
+
+ if (nla_put_u8(skb,
+ IFLA_BRIDGE_CFM_CC_PEER_STATUS_PORT_TLV_VALUE,
+ peer_mep->cc_status.port_tlv_value))
+ goto nla_put_failure;
+
+ if (nla_put_u8(skb,
+ IFLA_BRIDGE_CFM_CC_PEER_STATUS_IF_TLV_VALUE,
+ peer_mep->cc_status.if_tlv_value))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb,
+ IFLA_BRIDGE_CFM_CC_PEER_STATUS_SEEN,
+ peer_mep->cc_status.seen))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb,
+ IFLA_BRIDGE_CFM_CC_PEER_STATUS_TLV_SEEN,
+ peer_mep->cc_status.tlv_seen))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb,
+ IFLA_BRIDGE_CFM_CC_PEER_STATUS_SEQ_UNEXP_SEEN,
+ peer_mep->cc_status.seq_unexp_seen))
+ goto nla_put_failure;
+
+ if (getlink) { /* Only clear if this is a GETLINK */
+ /* Clear all 'seen' indications */
+ peer_mep->cc_status.seen = false;
+ peer_mep->cc_status.tlv_seen = false;
+ peer_mep->cc_status.seq_unexp_seen = false;
+ }
+
+ nla_nest_end(skb, tb);
+ }
+ }
+
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, tb);
+
+nla_info_failure:
+ return -EMSGSIZE;
+}
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index f8dbcee80eba..a818fdc22da9 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -1,62 +1,177 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Device handling code
* Linux ethernet bridge
*
* Authors:
* Lennert Buytenhek <buytenh@gnu.org>
- *
- * $Id: br_device.c,v 1.6 2001/12/24 00:59:55 davem Exp $
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/kernel.h>
#include <linux/netdevice.h>
+#include <linux/netpoll.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
+#include <linux/list.h>
+#include <linux/netfilter_bridge.h>
+
+#include <linux/uaccess.h>
+#include <net/netdev_lock.h>
-#include <asm/uaccess.h>
#include "br_private.h"
-static struct net_device_stats *br_dev_get_stats(struct net_device *dev)
+#define COMMON_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA | \
+ NETIF_F_GSO_MASK | NETIF_F_HW_CSUM)
+
+const struct nf_br_ops __rcu *nf_br_ops __read_mostly;
+EXPORT_SYMBOL_GPL(nf_br_ops);
+
+/* net device transmit always called with BH disabled */
+netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
{
+ enum skb_drop_reason reason = pskb_may_pull_reason(skb, ETH_HLEN);
+ struct net_bridge_mcast_port *pmctx_null = NULL;
struct net_bridge *br = netdev_priv(dev);
- return &br->statistics;
+ struct net_bridge_mcast *brmctx = &br->multicast_ctx;
+ struct net_bridge_fdb_entry *dst;
+ struct net_bridge_mdb_entry *mdst;
+ const struct nf_br_ops *nf_ops;
+ u8 state = BR_STATE_FORWARDING;
+ struct net_bridge_vlan *vlan;
+ const unsigned char *dest;
+ u16 vid = 0;
+
+ if (unlikely(reason != SKB_NOT_DROPPED_YET)) {
+ kfree_skb_reason(skb, reason);
+ return NETDEV_TX_OK;
+ }
+
+ memset(skb->cb, 0, sizeof(struct br_input_skb_cb));
+ br_tc_skb_miss_set(skb, false);
+
+ rcu_read_lock();
+ nf_ops = rcu_dereference(nf_br_ops);
+ if (nf_ops && nf_ops->br_dev_xmit_hook(skb)) {
+ rcu_read_unlock();
+ return NETDEV_TX_OK;
+ }
+
+ dev_sw_netstats_tx_add(dev, 1, skb->len);
+
+ br_switchdev_frame_unmark(skb);
+ BR_INPUT_SKB_CB(skb)->brdev = dev;
+ BR_INPUT_SKB_CB(skb)->frag_max_size = 0;
+
+ skb_reset_mac_header(skb);
+ skb_pull(skb, ETH_HLEN);
+
+ if (!br_allowed_ingress(br, br_vlan_group_rcu(br), skb, &vid,
+ &state, &vlan))
+ goto out;
+
+ if (IS_ENABLED(CONFIG_INET) &&
+ (eth_hdr(skb)->h_proto == htons(ETH_P_ARP) ||
+ eth_hdr(skb)->h_proto == htons(ETH_P_RARP)) &&
+ br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED)) {
+ br_do_proxy_suppress_arp(skb, br, vid, NULL);
+ } else if (IS_ENABLED(CONFIG_IPV6) &&
+ skb->protocol == htons(ETH_P_IPV6) &&
+ br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED) &&
+ pskb_may_pull(skb, sizeof(struct ipv6hdr) +
+ sizeof(struct nd_msg)) &&
+ ipv6_hdr(skb)->nexthdr == IPPROTO_ICMPV6) {
+ struct nd_msg *msg, _msg;
+
+ msg = br_is_nd_neigh_msg(skb, &_msg);
+ if (msg)
+ br_do_suppress_nd(skb, br, vid, NULL, msg);
+ }
+
+ dest = eth_hdr(skb)->h_dest;
+ if (is_broadcast_ether_addr(dest)) {
+ br_flood(br, skb, BR_PKT_BROADCAST, false, true, vid);
+ } else if (is_multicast_ether_addr(dest)) {
+ if (unlikely(netpoll_tx_running(dev))) {
+ br_flood(br, skb, BR_PKT_MULTICAST, false, true, vid);
+ goto out;
+ }
+ if (br_multicast_rcv(&brmctx, &pmctx_null, vlan, skb, vid)) {
+ kfree_skb(skb);
+ goto out;
+ }
+
+ mdst = br_mdb_entry_skb_get(brmctx, skb, vid);
+ if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) &&
+ br_multicast_querier_exists(brmctx, eth_hdr(skb), mdst))
+ br_multicast_flood(mdst, skb, brmctx, false, true);
+ else
+ br_flood(br, skb, BR_PKT_MULTICAST, false, true, vid);
+ } else if ((dst = br_fdb_find_rcu(br, dest, vid)) != NULL) {
+ br_forward(dst->dst, skb, false, true);
+ } else {
+ br_flood(br, skb, BR_PKT_UNICAST, false, true, vid);
+ }
+out:
+ rcu_read_unlock();
+ return NETDEV_TX_OK;
}
-/* net device transmit always called with no BH (preempt_disabled) */
-int br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
+static int br_dev_init(struct net_device *dev)
{
struct net_bridge *br = netdev_priv(dev);
- const unsigned char *dest = skb->data;
- struct net_bridge_fdb_entry *dst;
+ int err;
- br->statistics.tx_packets++;
- br->statistics.tx_bytes += skb->len;
+ err = br_fdb_hash_init(br);
+ if (err)
+ return err;
- skb->mac.raw = skb->data;
- skb_pull(skb, ETH_HLEN);
+ err = br_mdb_hash_init(br);
+ if (err) {
+ br_fdb_hash_fini(br);
+ return err;
+ }
+
+ err = br_vlan_init(br);
+ if (err) {
+ br_mdb_hash_fini(br);
+ br_fdb_hash_fini(br);
+ return err;
+ }
- if (dest[0] & 1)
- br_flood_deliver(br, skb, 0);
- else if ((dst = __br_fdb_get(br, dest)) != NULL)
- br_deliver(dst->dst, skb);
- else
- br_flood_deliver(br, skb, 0);
+ err = br_multicast_init_stats(br);
+ if (err) {
+ br_vlan_flush(br);
+ br_mdb_hash_fini(br);
+ br_fdb_hash_fini(br);
+ return err;
+ }
+ netdev_lockdep_set_classes(dev);
return 0;
}
+static void br_dev_uninit(struct net_device *dev)
+{
+ struct net_bridge *br = netdev_priv(dev);
+
+ br_multicast_dev_del(br);
+ br_multicast_uninit_stats(br);
+ br_vlan_flush(br);
+ br_mdb_hash_fini(br);
+ br_fdb_hash_fini(br);
+}
+
static int br_dev_open(struct net_device *dev)
{
struct net_bridge *br = netdev_priv(dev);
- br_features_recompute(br);
+ netdev_update_features(dev);
netif_start_queue(dev);
br_stp_enable_bridge(br);
+ br_multicast_open(br);
+
+ if (br_opt_get(br, BROPT_MULTICAST_ENABLED))
+ br_multicast_join_snoopers(br);
return 0;
}
@@ -65,9 +180,21 @@ static void br_dev_set_multicast_list(struct net_device *dev)
{
}
+static void br_dev_change_rx_flags(struct net_device *dev, int change)
+{
+ if (change & IFF_PROMISC)
+ br_manage_promisc(netdev_priv(dev));
+}
+
static int br_dev_stop(struct net_device *dev)
{
- br_stp_disable_bridge(netdev_priv(dev));
+ struct net_bridge *br = netdev_priv(dev);
+
+ br_stp_disable_bridge(br);
+ br_multicast_stop(br);
+
+ if (br_opt_get(br, BROPT_MULTICAST_ENABLED))
+ br_multicast_leave_snoopers(br);
netif_stop_queue(dev);
@@ -76,114 +203,333 @@ static int br_dev_stop(struct net_device *dev)
static int br_change_mtu(struct net_device *dev, int new_mtu)
{
- if (new_mtu < 68 || new_mtu > br_min_mtu(netdev_priv(dev)))
- return -EINVAL;
+ struct net_bridge *br = netdev_priv(dev);
+
+ WRITE_ONCE(dev->mtu, new_mtu);
+
+ /* this flag will be cleared if the MTU was automatically adjusted */
+ br_opt_toggle(br, BROPT_MTU_SET_BY_USER, true);
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+ /* remember the MTU in the rtable for PMTU */
+ dst_metric_set(&br->fake_rtable.dst, RTAX_MTU, new_mtu);
+#endif
- dev->mtu = new_mtu;
return 0;
}
-/* Allow setting mac address of pseudo-bridge to be same as
- * any of the bound interfaces
- */
+/* Allow setting mac address to any valid ethernet address. */
static int br_set_mac_address(struct net_device *dev, void *p)
{
struct net_bridge *br = netdev_priv(dev);
struct sockaddr *addr = p;
- struct net_bridge_port *port;
- int err = -EADDRNOTAVAIL;
+
+ if (!is_valid_ether_addr(addr->sa_data))
+ return -EADDRNOTAVAIL;
+
+ /* dev_set_mac_addr() can be called by a master device on bridge's
+ * NETDEV_UNREGISTER, but since it's being destroyed do nothing
+ */
+ if (dev->reg_state != NETREG_REGISTERED)
+ return -EBUSY;
spin_lock_bh(&br->lock);
- list_for_each_entry(port, &br->port_list, list) {
- if (!compare_ether_addr(port->dev->dev_addr, addr->sa_data)) {
- br_stp_change_bridge_id(br, addr->sa_data);
- err = 0;
- break;
- }
+ if (!ether_addr_equal(dev->dev_addr, addr->sa_data)) {
+ /* Mac address will be changed in br_stp_change_bridge_id(). */
+ br_stp_change_bridge_id(br, addr->sa_data);
}
spin_unlock_bh(&br->lock);
- return err;
+ return 0;
}
static void br_getinfo(struct net_device *dev, struct ethtool_drvinfo *info)
{
- strcpy(info->driver, "bridge");
- strcpy(info->version, BR_VERSION);
- strcpy(info->fw_version, "N/A");
- strcpy(info->bus_info, "N/A");
+ strscpy(info->driver, "bridge", sizeof(info->driver));
+ strscpy(info->version, BR_VERSION, sizeof(info->version));
+ strscpy(info->fw_version, "N/A", sizeof(info->fw_version));
+ strscpy(info->bus_info, "N/A", sizeof(info->bus_info));
}
-static int br_set_sg(struct net_device *dev, u32 data)
+static int br_get_link_ksettings(struct net_device *dev,
+ struct ethtool_link_ksettings *cmd)
{
struct net_bridge *br = netdev_priv(dev);
+ struct net_bridge_port *p;
+
+ cmd->base.duplex = DUPLEX_UNKNOWN;
+ cmd->base.port = PORT_OTHER;
+ cmd->base.speed = SPEED_UNKNOWN;
+
+ list_for_each_entry(p, &br->port_list, list) {
+ struct ethtool_link_ksettings ecmd;
+ struct net_device *pdev = p->dev;
+
+ if (!netif_running(pdev) || !netif_oper_up(pdev))
+ continue;
+
+ if (__ethtool_get_link_ksettings(pdev, &ecmd))
+ continue;
- if (data)
- br->feature_mask |= NETIF_F_SG;
- else
- br->feature_mask &= ~NETIF_F_SG;
+ if (ecmd.base.speed == (__u32)SPEED_UNKNOWN)
+ continue;
+
+ if (cmd->base.speed == (__u32)SPEED_UNKNOWN ||
+ cmd->base.speed < ecmd.base.speed)
+ cmd->base.speed = ecmd.base.speed;
+ }
- br_features_recompute(br);
return 0;
}
-static int br_set_tso(struct net_device *dev, u32 data)
+static netdev_features_t br_fix_features(struct net_device *dev,
+ netdev_features_t features)
{
struct net_bridge *br = netdev_priv(dev);
- if (data)
- br->feature_mask |= NETIF_F_TSO;
- else
- br->feature_mask &= ~NETIF_F_TSO;
+ return br_features_recompute(br, features);
+}
- br_features_recompute(br);
- return 0;
+#ifdef CONFIG_NET_POLL_CONTROLLER
+static void br_poll_controller(struct net_device *br_dev)
+{
+}
+
+static void br_netpoll_cleanup(struct net_device *dev)
+{
+ struct net_bridge *br = netdev_priv(dev);
+ struct net_bridge_port *p;
+
+ list_for_each_entry(p, &br->port_list, list)
+ br_netpoll_disable(p);
+}
+
+static int __br_netpoll_enable(struct net_bridge_port *p)
+{
+ struct netpoll *np;
+ int err;
+
+ np = kzalloc(sizeof(*p->np), GFP_KERNEL);
+ if (!np)
+ return -ENOMEM;
+
+ err = __netpoll_setup(np, p->dev);
+ if (err) {
+ kfree(np);
+ return err;
+ }
+
+ p->np = np;
+ return err;
+}
+
+int br_netpoll_enable(struct net_bridge_port *p)
+{
+ if (!p->br->dev->npinfo)
+ return 0;
+
+ return __br_netpoll_enable(p);
+}
+
+static int br_netpoll_setup(struct net_device *dev)
+{
+ struct net_bridge *br = netdev_priv(dev);
+ struct net_bridge_port *p;
+ int err = 0;
+
+ list_for_each_entry(p, &br->port_list, list) {
+ if (!p->dev)
+ continue;
+ err = __br_netpoll_enable(p);
+ if (err)
+ goto fail;
+ }
+
+out:
+ return err;
+
+fail:
+ br_netpoll_cleanup(dev);
+ goto out;
+}
+
+void br_netpoll_disable(struct net_bridge_port *p)
+{
+ struct netpoll *np = p->np;
+
+ if (!np)
+ return;
+
+ p->np = NULL;
+
+ __netpoll_free(np);
+}
+
+#endif
+
+static int br_add_slave(struct net_device *dev, struct net_device *slave_dev,
+ struct netlink_ext_ack *extack)
+
+{
+ struct net_bridge *br = netdev_priv(dev);
+
+ return br_add_if(br, slave_dev, extack);
}
-static int br_set_tx_csum(struct net_device *dev, u32 data)
+static int br_del_slave(struct net_device *dev, struct net_device *slave_dev)
{
struct net_bridge *br = netdev_priv(dev);
- if (data)
- br->feature_mask |= NETIF_F_NO_CSUM;
- else
- br->feature_mask &= ~NETIF_F_ALL_CSUM;
+ return br_del_if(br, slave_dev);
+}
+
+static int br_fill_forward_path(struct net_device_path_ctx *ctx,
+ struct net_device_path *path)
+{
+ struct net_bridge_fdb_entry *f;
+ struct net_bridge_port *dst;
+ struct net_bridge *br;
+
+ if (netif_is_bridge_port(ctx->dev))
+ return -1;
+
+ br = netdev_priv(ctx->dev);
+
+ br_vlan_fill_forward_path_pvid(br, ctx, path);
+
+ f = br_fdb_find_rcu(br, ctx->daddr, path->bridge.vlan_id);
+ if (!f)
+ return -1;
+
+ dst = READ_ONCE(f->dst);
+ if (!dst)
+ return -1;
+
+ if (br_vlan_fill_forward_path_mode(br, dst, path))
+ return -1;
+
+ path->type = DEV_PATH_BRIDGE;
+ path->dev = dst->br->dev;
+ ctx->dev = dst->dev;
+
+ switch (path->bridge.vlan_mode) {
+ case DEV_PATH_BR_VLAN_TAG:
+ if (ctx->num_vlans >= ARRAY_SIZE(ctx->vlan))
+ return -ENOSPC;
+ ctx->vlan[ctx->num_vlans].id = path->bridge.vlan_id;
+ ctx->vlan[ctx->num_vlans].proto = path->bridge.vlan_proto;
+ ctx->num_vlans++;
+ break;
+ case DEV_PATH_BR_VLAN_UNTAG_HW:
+ case DEV_PATH_BR_VLAN_UNTAG:
+ ctx->num_vlans--;
+ break;
+ case DEV_PATH_BR_VLAN_KEEP:
+ break;
+ }
- br_features_recompute(br);
return 0;
}
-static struct ethtool_ops br_ethtool_ops = {
- .get_drvinfo = br_getinfo,
- .get_link = ethtool_op_get_link,
- .get_sg = ethtool_op_get_sg,
- .set_sg = br_set_sg,
- .get_tx_csum = ethtool_op_get_tx_csum,
- .set_tx_csum = br_set_tx_csum,
- .get_tso = ethtool_op_get_tso,
- .set_tso = br_set_tso,
+static const struct ethtool_ops br_ethtool_ops = {
+ .get_drvinfo = br_getinfo,
+ .get_link = ethtool_op_get_link,
+ .get_link_ksettings = br_get_link_ksettings,
+};
+
+static const struct net_device_ops br_netdev_ops = {
+ .ndo_open = br_dev_open,
+ .ndo_stop = br_dev_stop,
+ .ndo_init = br_dev_init,
+ .ndo_uninit = br_dev_uninit,
+ .ndo_start_xmit = br_dev_xmit,
+ .ndo_get_stats64 = dev_get_tstats64,
+ .ndo_set_mac_address = br_set_mac_address,
+ .ndo_set_rx_mode = br_dev_set_multicast_list,
+ .ndo_change_rx_flags = br_dev_change_rx_flags,
+ .ndo_change_mtu = br_change_mtu,
+ .ndo_siocdevprivate = br_dev_siocdevprivate,
+#ifdef CONFIG_NET_POLL_CONTROLLER
+ .ndo_netpoll_setup = br_netpoll_setup,
+ .ndo_netpoll_cleanup = br_netpoll_cleanup,
+ .ndo_poll_controller = br_poll_controller,
+#endif
+ .ndo_add_slave = br_add_slave,
+ .ndo_del_slave = br_del_slave,
+ .ndo_fix_features = br_fix_features,
+ .ndo_fdb_add = br_fdb_add,
+ .ndo_fdb_del = br_fdb_delete,
+ .ndo_fdb_del_bulk = br_fdb_delete_bulk,
+ .ndo_fdb_dump = br_fdb_dump,
+ .ndo_fdb_get = br_fdb_get,
+ .ndo_mdb_add = br_mdb_add,
+ .ndo_mdb_del = br_mdb_del,
+ .ndo_mdb_del_bulk = br_mdb_del_bulk,
+ .ndo_mdb_dump = br_mdb_dump,
+ .ndo_mdb_get = br_mdb_get,
+ .ndo_bridge_getlink = br_getlink,
+ .ndo_bridge_setlink = br_setlink,
+ .ndo_bridge_dellink = br_dellink,
+ .ndo_features_check = passthru_features_check,
+ .ndo_fill_forward_path = br_fill_forward_path,
+};
+
+static const struct device_type br_type = {
+ .name = "bridge",
};
void br_dev_setup(struct net_device *dev)
{
- memset(dev->dev_addr, 0, ETH_ALEN);
+ struct net_bridge *br = netdev_priv(dev);
+ eth_hw_addr_random(dev);
ether_setup(dev);
- dev->do_ioctl = br_dev_ioctl;
- dev->get_stats = br_dev_get_stats;
- dev->hard_start_xmit = br_dev_xmit;
- dev->open = br_dev_open;
- dev->set_multicast_list = br_dev_set_multicast_list;
- dev->change_mtu = br_change_mtu;
- dev->destructor = free_netdev;
- SET_MODULE_OWNER(dev);
- SET_ETHTOOL_OPS(dev, &br_ethtool_ops);
- dev->stop = br_dev_stop;
- dev->tx_queue_len = 0;
- dev->set_mac_address = br_set_mac_address;
- dev->priv_flags = IFF_EBRIDGE;
-
- dev->features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA |
- NETIF_F_TSO | NETIF_F_NO_CSUM | NETIF_F_GSO_ROBUST;
+ dev->netdev_ops = &br_netdev_ops;
+ dev->needs_free_netdev = true;
+ dev->ethtool_ops = &br_ethtool_ops;
+ SET_NETDEV_DEVTYPE(dev, &br_type);
+ dev->priv_flags = IFF_EBRIDGE | IFF_NO_QUEUE;
+ dev->lltx = true;
+ dev->netns_immutable = true;
+
+ dev->features = COMMON_FEATURES | NETIF_F_HW_VLAN_CTAG_TX |
+ NETIF_F_HW_VLAN_STAG_TX;
+ dev->hw_features = COMMON_FEATURES | NETIF_F_HW_VLAN_CTAG_TX |
+ NETIF_F_HW_VLAN_STAG_TX;
+ dev->vlan_features = COMMON_FEATURES;
+ dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS;
+
+ br->dev = dev;
+ spin_lock_init(&br->lock);
+ INIT_LIST_HEAD(&br->port_list);
+ INIT_HLIST_HEAD(&br->fdb_list);
+ INIT_HLIST_HEAD(&br->frame_type_list);
+#if IS_ENABLED(CONFIG_BRIDGE_MRP)
+ INIT_HLIST_HEAD(&br->mrp_list);
+#endif
+#if IS_ENABLED(CONFIG_BRIDGE_CFM)
+ INIT_HLIST_HEAD(&br->mep_list);
+#endif
+ spin_lock_init(&br->hash_lock);
+
+ br->bridge_id.prio[0] = 0x80;
+ br->bridge_id.prio[1] = 0x00;
+
+ ether_addr_copy(br->group_addr, eth_stp_addr);
+
+ br->stp_enabled = BR_NO_STP;
+ br->group_fwd_mask = BR_GROUPFWD_DEFAULT;
+ br->group_fwd_mask_required = BR_GROUPFWD_DEFAULT;
+
+ br->designated_root = br->bridge_id;
+ br->bridge_max_age = br->max_age = 20 * HZ;
+ br->bridge_hello_time = br->hello_time = 2 * HZ;
+ br->bridge_forward_delay = br->forward_delay = 15 * HZ;
+ br->bridge_ageing_time = br->ageing_time = BR_DEFAULT_AGEING_TIME;
+ dev->max_mtu = ETH_MAX_MTU;
+
+ br_netfilter_rtable_init(br);
+ br_stp_timer_init(br);
+ br_multicast_init(br);
+ INIT_DELAYED_WORK(&br->gc_work, br_fdb_cleanup);
}
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index d9f04864d15d..58d22e2b85fc 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -1,371 +1,1650 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Forwarding database
* Linux ethernet bridge
*
* Authors:
* Lennert Buytenhek <buytenh@gnu.org>
- *
- * $Id: br_fdb.c,v 1.6 2002/01/17 00:57:07 davem Exp $
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/kernel.h>
#include <linux/init.h>
+#include <linux/rculist.h>
#include <linux/spinlock.h>
#include <linux/times.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/jhash.h>
-#include <asm/atomic.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+#include <linux/atomic.h>
+#include <linux/unaligned.h>
+#include <linux/if_vlan.h>
+#include <net/switchdev.h>
+#include <trace/events/bridge.h>
#include "br_private.h"
-static kmem_cache_t *br_fdb_cache __read_mostly;
-static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
- const unsigned char *addr);
+static const struct rhashtable_params br_fdb_rht_params = {
+ .head_offset = offsetof(struct net_bridge_fdb_entry, rhnode),
+ .key_offset = offsetof(struct net_bridge_fdb_entry, key),
+ .key_len = sizeof(struct net_bridge_fdb_key),
+ .automatic_shrinking = true,
+};
+
+static struct kmem_cache *br_fdb_cache __read_mostly;
-void __init br_fdb_init(void)
+int __init br_fdb_init(void)
{
- br_fdb_cache = kmem_cache_create("bridge_fdb_cache",
- sizeof(struct net_bridge_fdb_entry),
- 0,
- SLAB_HWCACHE_ALIGN, NULL, NULL);
+ br_fdb_cache = KMEM_CACHE(net_bridge_fdb_entry, SLAB_HWCACHE_ALIGN);
+ if (!br_fdb_cache)
+ return -ENOMEM;
+
+ return 0;
}
-void __exit br_fdb_fini(void)
+void br_fdb_fini(void)
{
kmem_cache_destroy(br_fdb_cache);
}
+int br_fdb_hash_init(struct net_bridge *br)
+{
+ return rhashtable_init(&br->fdb_hash_tbl, &br_fdb_rht_params);
+}
+
+void br_fdb_hash_fini(struct net_bridge *br)
+{
+ rhashtable_destroy(&br->fdb_hash_tbl);
+}
/* if topology_changing then use forward_delay (default 15 sec)
* otherwise keep longer (default 5 minutes)
*/
-static __inline__ unsigned long hold_time(const struct net_bridge *br)
+static inline unsigned long hold_time(const struct net_bridge *br)
{
return br->topology_change ? br->forward_delay : br->ageing_time;
}
-static __inline__ int has_expired(const struct net_bridge *br,
+static inline int has_expired(const struct net_bridge *br,
const struct net_bridge_fdb_entry *fdb)
{
- return !fdb->is_static
- && time_before_eq(fdb->ageing_timer + hold_time(br), jiffies);
+ return !test_bit(BR_FDB_STATIC, &fdb->flags) &&
+ !test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags) &&
+ time_before_eq(fdb->updated + hold_time(br), jiffies);
+}
+
+static int fdb_to_nud(const struct net_bridge *br,
+ const struct net_bridge_fdb_entry *fdb)
+{
+ if (test_bit(BR_FDB_LOCAL, &fdb->flags))
+ return NUD_PERMANENT;
+ else if (test_bit(BR_FDB_STATIC, &fdb->flags))
+ return NUD_NOARP;
+ else if (has_expired(br, fdb))
+ return NUD_STALE;
+ else
+ return NUD_REACHABLE;
+}
+
+static int fdb_fill_info(struct sk_buff *skb, const struct net_bridge *br,
+ const struct net_bridge_fdb_entry *fdb,
+ u32 portid, u32 seq, int type, unsigned int flags)
+{
+ const struct net_bridge_port *dst = READ_ONCE(fdb->dst);
+ unsigned long now = jiffies;
+ struct nda_cacheinfo ci;
+ struct nlmsghdr *nlh;
+ struct ndmsg *ndm;
+ u32 ext_flags = 0;
+
+ nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags);
+ if (nlh == NULL)
+ return -EMSGSIZE;
+
+ ndm = nlmsg_data(nlh);
+ ndm->ndm_family = AF_BRIDGE;
+ ndm->ndm_pad1 = 0;
+ ndm->ndm_pad2 = 0;
+ ndm->ndm_flags = 0;
+ ndm->ndm_type = 0;
+ ndm->ndm_ifindex = dst ? dst->dev->ifindex : br->dev->ifindex;
+ ndm->ndm_state = fdb_to_nud(br, fdb);
+
+ if (test_bit(BR_FDB_OFFLOADED, &fdb->flags))
+ ndm->ndm_flags |= NTF_OFFLOADED;
+ if (test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags))
+ ndm->ndm_flags |= NTF_EXT_LEARNED;
+ if (test_bit(BR_FDB_STICKY, &fdb->flags))
+ ndm->ndm_flags |= NTF_STICKY;
+ if (test_bit(BR_FDB_LOCKED, &fdb->flags))
+ ext_flags |= NTF_EXT_LOCKED;
+
+ if (nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->key.addr))
+ goto nla_put_failure;
+ if (nla_put_u32(skb, NDA_MASTER, br->dev->ifindex))
+ goto nla_put_failure;
+ if (nla_put_u32(skb, NDA_FLAGS_EXT, ext_flags))
+ goto nla_put_failure;
+
+ ci.ndm_used = jiffies_to_clock_t(now - fdb->used);
+ ci.ndm_confirmed = 0;
+ ci.ndm_updated = jiffies_to_clock_t(now - fdb->updated);
+ ci.ndm_refcnt = 0;
+ if (nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci))
+ goto nla_put_failure;
+
+ if (fdb->key.vlan_id && nla_put(skb, NDA_VLAN, sizeof(u16),
+ &fdb->key.vlan_id))
+ goto nla_put_failure;
+
+ if (test_bit(BR_FDB_NOTIFY, &fdb->flags)) {
+ struct nlattr *nest = nla_nest_start(skb, NDA_FDB_EXT_ATTRS);
+ u8 notify_bits = FDB_NOTIFY_BIT;
+
+ if (!nest)
+ goto nla_put_failure;
+ if (test_bit(BR_FDB_NOTIFY_INACTIVE, &fdb->flags))
+ notify_bits |= FDB_NOTIFY_INACTIVE_BIT;
+
+ if (nla_put_u8(skb, NFEA_ACTIVITY_NOTIFY, notify_bits)) {
+ nla_nest_cancel(skb, nest);
+ goto nla_put_failure;
+ }
+
+ nla_nest_end(skb, nest);
+ }
+
+ nlmsg_end(skb, nlh);
+ return 0;
+
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+static inline size_t fdb_nlmsg_size(void)
+{
+ return NLMSG_ALIGN(sizeof(struct ndmsg))
+ + nla_total_size(ETH_ALEN) /* NDA_LLADDR */
+ + nla_total_size(sizeof(u32)) /* NDA_MASTER */
+ + nla_total_size(sizeof(u32)) /* NDA_FLAGS_EXT */
+ + nla_total_size(sizeof(u16)) /* NDA_VLAN */
+ + nla_total_size(sizeof(struct nda_cacheinfo))
+ + nla_total_size(0) /* NDA_FDB_EXT_ATTRS */
+ + nla_total_size(sizeof(u8)); /* NFEA_ACTIVITY_NOTIFY */
+}
+
+static void fdb_notify(struct net_bridge *br,
+ const struct net_bridge_fdb_entry *fdb, int type,
+ bool swdev_notify)
+{
+ struct net *net = dev_net(br->dev);
+ struct sk_buff *skb;
+ int err = -ENOBUFS;
+
+ if (swdev_notify)
+ br_switchdev_fdb_notify(br, fdb, type);
+
+ skb = nlmsg_new(fdb_nlmsg_size(), GFP_ATOMIC);
+ if (skb == NULL)
+ goto errout;
+
+ err = fdb_fill_info(skb, br, fdb, 0, 0, type, 0);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in fdb_nlmsg_size() */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto errout;
+ }
+ rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
+ return;
+errout:
+ rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
+}
+
+static struct net_bridge_fdb_entry *fdb_find_rcu(struct rhashtable *tbl,
+ const unsigned char *addr,
+ __u16 vid)
+{
+ struct net_bridge_fdb_key key;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ key.vlan_id = vid;
+ memcpy(key.addr.addr, addr, sizeof(key.addr.addr));
+
+ return rhashtable_lookup(tbl, &key, br_fdb_rht_params);
+}
+
+/* requires bridge hash_lock */
+static struct net_bridge_fdb_entry *br_fdb_find(struct net_bridge *br,
+ const unsigned char *addr,
+ __u16 vid)
+{
+ struct net_bridge_fdb_entry *fdb;
+
+ lockdep_assert_held_once(&br->hash_lock);
+
+ rcu_read_lock();
+ fdb = fdb_find_rcu(&br->fdb_hash_tbl, addr, vid);
+ rcu_read_unlock();
+
+ return fdb;
+}
+
+struct net_device *br_fdb_find_port(const struct net_device *br_dev,
+ const unsigned char *addr,
+ __u16 vid)
+{
+ struct net_bridge_fdb_entry *f;
+ struct net_device *dev = NULL;
+ struct net_bridge *br;
+
+ ASSERT_RTNL();
+
+ if (!netif_is_bridge_master(br_dev))
+ return NULL;
+
+ br = netdev_priv(br_dev);
+ rcu_read_lock();
+ f = br_fdb_find_rcu(br, addr, vid);
+ if (f && f->dst)
+ dev = f->dst->dev;
+ rcu_read_unlock();
+
+ return dev;
+}
+EXPORT_SYMBOL_GPL(br_fdb_find_port);
+
+struct net_bridge_fdb_entry *br_fdb_find_rcu(struct net_bridge *br,
+ const unsigned char *addr,
+ __u16 vid)
+{
+ return fdb_find_rcu(&br->fdb_hash_tbl, addr, vid);
+}
+
+/* When a static FDB entry is added, the mac address from the entry is
+ * added to the bridge private HW address list and all required ports
+ * are then updated with the new information.
+ * Called under RTNL.
+ */
+static void fdb_add_hw_addr(struct net_bridge *br, const unsigned char *addr)
+{
+ int err;
+ struct net_bridge_port *p;
+
+ ASSERT_RTNL();
+
+ list_for_each_entry(p, &br->port_list, list) {
+ if (!br_promisc_port(p)) {
+ err = dev_uc_add(p->dev, addr);
+ if (err)
+ goto undo;
+ }
+ }
+
+ return;
+undo:
+ list_for_each_entry_continue_reverse(p, &br->port_list, list) {
+ if (!br_promisc_port(p))
+ dev_uc_del(p->dev, addr);
+ }
+}
+
+/* When a static FDB entry is deleted, the HW address from that entry is
+ * also removed from the bridge private HW address list and updates all
+ * the ports with needed information.
+ * Called under RTNL.
+ */
+static void fdb_del_hw_addr(struct net_bridge *br, const unsigned char *addr)
+{
+ struct net_bridge_port *p;
+
+ ASSERT_RTNL();
+
+ list_for_each_entry(p, &br->port_list, list) {
+ if (!br_promisc_port(p))
+ dev_uc_del(p->dev, addr);
+ }
+}
+
+static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f,
+ bool swdev_notify)
+{
+ trace_fdb_delete(br, f);
+
+ if (test_bit(BR_FDB_STATIC, &f->flags))
+ fdb_del_hw_addr(br, f->key.addr.addr);
+
+ hlist_del_init_rcu(&f->fdb_node);
+ rhashtable_remove_fast(&br->fdb_hash_tbl, &f->rhnode,
+ br_fdb_rht_params);
+ if (test_and_clear_bit(BR_FDB_DYNAMIC_LEARNED, &f->flags))
+ atomic_dec(&br->fdb_n_learned);
+ fdb_notify(br, f, RTM_DELNEIGH, swdev_notify);
+ kfree_rcu(f, rcu);
+}
+
+/* Delete a local entry if no other port had the same address.
+ *
+ * This function should only be called on entries with BR_FDB_LOCAL set,
+ * so even with BR_FDB_ADDED_BY_USER cleared we never need to increase
+ * the accounting for dynamically learned entries again.
+ */
+static void fdb_delete_local(struct net_bridge *br,
+ const struct net_bridge_port *p,
+ struct net_bridge_fdb_entry *f)
+{
+ const unsigned char *addr = f->key.addr.addr;
+ struct net_bridge_vlan_group *vg;
+ const struct net_bridge_vlan *v;
+ struct net_bridge_port *op;
+ u16 vid = f->key.vlan_id;
+
+ /* Maybe another port has same hw addr? */
+ list_for_each_entry(op, &br->port_list, list) {
+ vg = nbp_vlan_group(op);
+ if (op != p && ether_addr_equal(op->dev->dev_addr, addr) &&
+ (!vid || br_vlan_find(vg, vid))) {
+ f->dst = op;
+ clear_bit(BR_FDB_ADDED_BY_USER, &f->flags);
+ return;
+ }
+ }
+
+ vg = br_vlan_group(br);
+ v = br_vlan_find(vg, vid);
+ /* Maybe bridge device has same hw addr? */
+ if (p && ether_addr_equal(br->dev->dev_addr, addr) &&
+ (!vid || (v && br_vlan_should_use(v)))) {
+ f->dst = NULL;
+ clear_bit(BR_FDB_ADDED_BY_USER, &f->flags);
+ return;
+ }
+
+ fdb_delete(br, f, true);
}
-static __inline__ int br_mac_hash(const unsigned char *mac)
+void br_fdb_find_delete_local(struct net_bridge *br,
+ const struct net_bridge_port *p,
+ const unsigned char *addr, u16 vid)
{
- return jhash(mac, ETH_ALEN, 0) & (BR_HASH_SIZE - 1);
+ struct net_bridge_fdb_entry *f;
+
+ spin_lock_bh(&br->hash_lock);
+ f = br_fdb_find(br, addr, vid);
+ if (f && test_bit(BR_FDB_LOCAL, &f->flags) &&
+ !test_bit(BR_FDB_ADDED_BY_USER, &f->flags) && f->dst == p)
+ fdb_delete_local(br, p, f);
+ spin_unlock_bh(&br->hash_lock);
}
-static __inline__ void fdb_delete(struct net_bridge_fdb_entry *f)
+static struct net_bridge_fdb_entry *fdb_create(struct net_bridge *br,
+ struct net_bridge_port *source,
+ const unsigned char *addr,
+ __u16 vid,
+ unsigned long flags)
{
- hlist_del_rcu(&f->hlist);
- br_fdb_put(f);
+ bool learned = !test_bit(BR_FDB_ADDED_BY_USER, &flags) &&
+ !test_bit(BR_FDB_LOCAL, &flags);
+ u32 max_learned = READ_ONCE(br->fdb_max_learned);
+ struct net_bridge_fdb_entry *fdb;
+ int err;
+
+ if (likely(learned)) {
+ int n_learned = atomic_read(&br->fdb_n_learned);
+
+ if (unlikely(max_learned && n_learned >= max_learned))
+ return NULL;
+ __set_bit(BR_FDB_DYNAMIC_LEARNED, &flags);
+ }
+
+ fdb = kmem_cache_alloc(br_fdb_cache, GFP_ATOMIC);
+ if (!fdb)
+ return NULL;
+
+ memcpy(fdb->key.addr.addr, addr, ETH_ALEN);
+ WRITE_ONCE(fdb->dst, source);
+ fdb->key.vlan_id = vid;
+ fdb->flags = flags;
+ fdb->updated = fdb->used = jiffies;
+ err = rhashtable_lookup_insert_fast(&br->fdb_hash_tbl, &fdb->rhnode,
+ br_fdb_rht_params);
+ if (err) {
+ kmem_cache_free(br_fdb_cache, fdb);
+ return NULL;
+ }
+
+ if (likely(learned))
+ atomic_inc(&br->fdb_n_learned);
+
+ hlist_add_head_rcu(&fdb->fdb_node, &br->fdb_list);
+
+ return fdb;
+}
+
+static int fdb_add_local(struct net_bridge *br, struct net_bridge_port *source,
+ const unsigned char *addr, u16 vid)
+{
+ struct net_bridge_fdb_entry *fdb;
+
+ if (!is_valid_ether_addr(addr))
+ return -EINVAL;
+
+ fdb = br_fdb_find(br, addr, vid);
+ if (fdb) {
+ /* it is okay to have multiple ports with same
+ * address, just use the first one.
+ */
+ if (test_bit(BR_FDB_LOCAL, &fdb->flags))
+ return 0;
+ br_warn(br, "adding interface %s with same address as a received packet (addr:%pM, vlan:%u)\n",
+ source ? source->dev->name : br->dev->name, addr, vid);
+ fdb_delete(br, fdb, true);
+ }
+
+ fdb = fdb_create(br, source, addr, vid,
+ BIT(BR_FDB_LOCAL) | BIT(BR_FDB_STATIC));
+ if (!fdb)
+ return -ENOMEM;
+
+ fdb_add_hw_addr(br, addr);
+ fdb_notify(br, fdb, RTM_NEWNEIGH, true);
+ return 0;
}
void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr)
{
+ struct net_bridge_vlan_group *vg;
+ struct net_bridge_fdb_entry *f;
struct net_bridge *br = p->br;
- int i;
-
+ struct net_bridge_vlan *v;
+ bool local_vlan_0;
+
+ local_vlan_0 = br_opt_get(br, BROPT_FDB_LOCAL_VLAN_0);
+
spin_lock_bh(&br->hash_lock);
+ vg = nbp_vlan_group(p);
+ hlist_for_each_entry(f, &br->fdb_list, fdb_node) {
+ if (f->dst == p && test_bit(BR_FDB_LOCAL, &f->flags) &&
+ !test_bit(BR_FDB_ADDED_BY_USER, &f->flags)) {
+ /* delete old one */
+ fdb_delete_local(br, p, f);
- /* Search all chains since old address/hash is unknown */
- for (i = 0; i < BR_HASH_SIZE; i++) {
- struct hlist_node *h;
- hlist_for_each(h, &br->hash[i]) {
- struct net_bridge_fdb_entry *f;
-
- f = hlist_entry(h, struct net_bridge_fdb_entry, hlist);
- if (f->dst == p && f->is_local) {
- /* maybe another port has same hw addr? */
- struct net_bridge_port *op;
- list_for_each_entry(op, &br->port_list, list) {
- if (op != p &&
- !compare_ether_addr(op->dev->dev_addr,
- f->addr.addr)) {
- f->dst = op;
- goto insert;
- }
- }
-
- /* delete old one */
- fdb_delete(f);
+ /* if this port has no vlan information configured, or
+ * local entries are only kept on VLAN 0, we can safely
+ * be done at this point.
+ */
+ if (!vg || !vg->num_vlans || local_vlan_0)
goto insert;
- }
}
}
- insert:
+
+insert:
/* insert new address, may fail if invalid address or dup. */
- fdb_insert(br, p, newaddr);
+ fdb_add_local(br, p, newaddr, 0);
+ if (!vg || !vg->num_vlans || local_vlan_0)
+ goto done;
+
+ /* Now add entries for every VLAN configured on the port.
+ * This function runs under RTNL so the bitmap will not change
+ * from under us.
+ */
+ list_for_each_entry(v, &vg->vlan_list, vlist)
+ fdb_add_local(br, p, newaddr, v->vid);
+
+done:
spin_unlock_bh(&br->hash_lock);
}
-void br_fdb_cleanup(unsigned long _data)
+void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr)
{
- struct net_bridge *br = (struct net_bridge *)_data;
- unsigned long delay = hold_time(br);
- int i;
+ struct net_bridge_vlan_group *vg;
+ struct net_bridge_fdb_entry *f;
+ struct net_bridge_vlan *v;
+ bool local_vlan_0;
+
+ local_vlan_0 = br_opt_get(br, BROPT_FDB_LOCAL_VLAN_0);
spin_lock_bh(&br->hash_lock);
- for (i = 0; i < BR_HASH_SIZE; i++) {
- struct net_bridge_fdb_entry *f;
- struct hlist_node *h, *n;
-
- hlist_for_each_entry_safe(f, h, n, &br->hash[i], hlist) {
- if (!f->is_static &&
- time_before_eq(f->ageing_timer + delay, jiffies))
- fdb_delete(f);
- }
+
+ /* If old entry was unassociated with any port, then delete it. */
+ f = br_fdb_find(br, br->dev->dev_addr, 0);
+ if (f && test_bit(BR_FDB_LOCAL, &f->flags) &&
+ !f->dst && !test_bit(BR_FDB_ADDED_BY_USER, &f->flags))
+ fdb_delete_local(br, NULL, f);
+
+ fdb_add_local(br, NULL, newaddr, 0);
+ vg = br_vlan_group(br);
+ if (!vg || !vg->num_vlans || local_vlan_0)
+ goto out;
+ /* Now remove and add entries for every VLAN configured on the
+ * bridge. This function runs under RTNL so the bitmap will not
+ * change from under us.
+ */
+ list_for_each_entry(v, &vg->vlan_list, vlist) {
+ if (!br_vlan_should_use(v))
+ continue;
+ f = br_fdb_find(br, br->dev->dev_addr, v->vid);
+ if (f && test_bit(BR_FDB_LOCAL, &f->flags) &&
+ !f->dst && !test_bit(BR_FDB_ADDED_BY_USER, &f->flags))
+ fdb_delete_local(br, NULL, f);
+ fdb_add_local(br, NULL, newaddr, v->vid);
}
+out:
spin_unlock_bh(&br->hash_lock);
-
- mod_timer(&br->gc_timer, jiffies + HZ/10);
}
-
-void br_fdb_delete_by_port(struct net_bridge *br,
- const struct net_bridge_port *p,
- int do_all)
+void br_fdb_cleanup(struct work_struct *work)
{
- int i;
+ struct net_bridge *br = container_of(work, struct net_bridge,
+ gc_work.work);
+ struct net_bridge_fdb_entry *f = NULL;
+ unsigned long delay = hold_time(br);
+ unsigned long work_delay = delay;
+ unsigned long now = jiffies;
- spin_lock_bh(&br->hash_lock);
- for (i = 0; i < BR_HASH_SIZE; i++) {
- struct hlist_node *h, *g;
-
- hlist_for_each_safe(h, g, &br->hash[i]) {
- struct net_bridge_fdb_entry *f
- = hlist_entry(h, struct net_bridge_fdb_entry, hlist);
- if (f->dst != p)
- continue;
+ /* this part is tricky, in order to avoid blocking learning and
+ * consequently forwarding, we rely on rcu to delete objects with
+ * delayed freeing allowing us to continue traversing
+ */
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) {
+ unsigned long this_timer = f->updated + delay;
- if (f->is_static && !do_all)
- continue;
- /*
- * if multiple ports all have the same device address
- * then when one port is deleted, assign
- * the local entry to other port
- */
- if (f->is_local) {
- struct net_bridge_port *op;
- list_for_each_entry(op, &br->port_list, list) {
- if (op != p &&
- !compare_ether_addr(op->dev->dev_addr,
- f->addr.addr)) {
- f->dst = op;
- goto skip_delete;
- }
- }
+ if (test_bit(BR_FDB_STATIC, &f->flags) ||
+ test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &f->flags)) {
+ if (test_bit(BR_FDB_NOTIFY, &f->flags)) {
+ if (time_after(this_timer, now))
+ work_delay = min(work_delay,
+ this_timer - now);
+ else if (!test_and_set_bit(BR_FDB_NOTIFY_INACTIVE,
+ &f->flags))
+ fdb_notify(br, f, RTM_NEWNEIGH, false);
}
+ continue;
+ }
- fdb_delete(f);
- skip_delete: ;
+ if (time_after(this_timer, now)) {
+ work_delay = min(work_delay, this_timer - now);
+ } else {
+ spin_lock_bh(&br->hash_lock);
+ if (!hlist_unhashed(&f->fdb_node))
+ fdb_delete(br, f, true);
+ spin_unlock_bh(&br->hash_lock);
}
}
- spin_unlock_bh(&br->hash_lock);
+ rcu_read_unlock();
+
+ /* Cleanup minimum 10 milliseconds apart */
+ work_delay = max_t(unsigned long, work_delay, msecs_to_jiffies(10));
+ mod_delayed_work(system_long_wq, &br->gc_work, work_delay);
}
-/* No locking or refcounting, assumes caller has no preempt (rcu_read_lock) */
-struct net_bridge_fdb_entry *__br_fdb_get(struct net_bridge *br,
- const unsigned char *addr)
+static void br_fdb_delete_locals_per_vlan_port(struct net_bridge *br,
+ struct net_bridge_port *p)
{
- struct hlist_node *h;
- struct net_bridge_fdb_entry *fdb;
+ struct net_bridge_vlan_group *vg;
+ struct net_bridge_vlan *v;
+ struct net_device *dev;
- hlist_for_each_entry_rcu(fdb, h, &br->hash[br_mac_hash(addr)], hlist) {
- if (!compare_ether_addr(fdb->addr.addr, addr)) {
- if (unlikely(has_expired(br, fdb)))
- break;
- return fdb;
- }
+ if (p) {
+ vg = nbp_vlan_group(p);
+ dev = p->dev;
+ } else {
+ vg = br_vlan_group(br);
+ dev = br->dev;
}
- return NULL;
+ list_for_each_entry(v, &vg->vlan_list, vlist)
+ br_fdb_find_delete_local(br, p, dev->dev_addr, v->vid);
}
-/* Interface used by ATM hook that keeps a ref count */
-struct net_bridge_fdb_entry *br_fdb_get(struct net_bridge *br,
- unsigned char *addr)
+static void br_fdb_delete_locals_per_vlan(struct net_bridge *br)
{
- struct net_bridge_fdb_entry *fdb;
+ struct net_bridge_port *p;
+
+ ASSERT_RTNL();
+
+ list_for_each_entry(p, &br->port_list, list)
+ br_fdb_delete_locals_per_vlan_port(br, p);
+
+ br_fdb_delete_locals_per_vlan_port(br, NULL);
+}
+
+static int br_fdb_insert_locals_per_vlan_port(struct net_bridge *br,
+ struct net_bridge_port *p,
+ struct netlink_ext_ack *extack)
+{
+ struct net_bridge_vlan_group *vg;
+ struct net_bridge_vlan *v;
+ struct net_device *dev;
+ int err;
+
+ if (p) {
+ vg = nbp_vlan_group(p);
+ dev = p->dev;
+ } else {
+ vg = br_vlan_group(br);
+ dev = br->dev;
+ }
+
+ list_for_each_entry(v, &vg->vlan_list, vlist) {
+ if (!br_vlan_should_use(v))
+ continue;
+
+ err = br_fdb_add_local(br, p, dev->dev_addr, v->vid);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+static int br_fdb_insert_locals_per_vlan(struct net_bridge *br,
+ struct netlink_ext_ack *extack)
+{
+ struct net_bridge_port *p;
+ int err;
+
+ ASSERT_RTNL();
+
+ list_for_each_entry(p, &br->port_list, list) {
+ err = br_fdb_insert_locals_per_vlan_port(br, p, extack);
+ if (err)
+ goto rollback;
+ }
+
+ err = br_fdb_insert_locals_per_vlan_port(br, NULL, extack);
+ if (err)
+ goto rollback;
+
+ return 0;
+
+rollback:
+ NL_SET_ERR_MSG_MOD(extack, "fdb_local_vlan_0 toggle: FDB entry insertion failed");
+ br_fdb_delete_locals_per_vlan(br);
+ return err;
+}
+
+int br_fdb_toggle_local_vlan_0(struct net_bridge *br, bool on,
+ struct netlink_ext_ack *extack)
+{
+ if (!on)
+ return br_fdb_insert_locals_per_vlan(br, extack);
+
+ br_fdb_delete_locals_per_vlan(br);
+ return 0;
+}
+
+static bool __fdb_flush_matches(const struct net_bridge *br,
+ const struct net_bridge_fdb_entry *f,
+ const struct net_bridge_fdb_flush_desc *desc)
+{
+ const struct net_bridge_port *dst = READ_ONCE(f->dst);
+ int port_ifidx = dst ? dst->dev->ifindex : br->dev->ifindex;
+
+ if (desc->vlan_id && desc->vlan_id != f->key.vlan_id)
+ return false;
+ if (desc->port_ifindex && desc->port_ifindex != port_ifidx)
+ return false;
+ if (desc->flags_mask && (f->flags & desc->flags_mask) != desc->flags)
+ return false;
+
+ return true;
+}
+
+/* Flush forwarding database entries matching the description */
+void br_fdb_flush(struct net_bridge *br,
+ const struct net_bridge_fdb_flush_desc *desc)
+{
+ struct net_bridge_fdb_entry *f;
rcu_read_lock();
- fdb = __br_fdb_get(br, addr);
- if (fdb)
- atomic_inc(&fdb->use_count);
+ hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) {
+ if (!__fdb_flush_matches(br, f, desc))
+ continue;
+
+ spin_lock_bh(&br->hash_lock);
+ if (!hlist_unhashed(&f->fdb_node))
+ fdb_delete(br, f, true);
+ spin_unlock_bh(&br->hash_lock);
+ }
rcu_read_unlock();
- return fdb;
}
-static void fdb_rcu_free(struct rcu_head *head)
+static unsigned long __ndm_state_to_fdb_flags(u16 ndm_state)
{
- struct net_bridge_fdb_entry *ent
- = container_of(head, struct net_bridge_fdb_entry, rcu);
- kmem_cache_free(br_fdb_cache, ent);
+ unsigned long flags = 0;
+
+ if (ndm_state & NUD_PERMANENT)
+ __set_bit(BR_FDB_LOCAL, &flags);
+ if (ndm_state & NUD_NOARP)
+ __set_bit(BR_FDB_STATIC, &flags);
+
+ return flags;
}
-/* Set entry up for deletion with RCU */
-void br_fdb_put(struct net_bridge_fdb_entry *ent)
+static unsigned long __ndm_flags_to_fdb_flags(u8 ndm_flags)
{
- if (atomic_dec_and_test(&ent->use_count))
- call_rcu(&ent->rcu, fdb_rcu_free);
+ unsigned long flags = 0;
+
+ if (ndm_flags & NTF_USE)
+ __set_bit(BR_FDB_ADDED_BY_USER, &flags);
+ if (ndm_flags & NTF_EXT_LEARNED)
+ __set_bit(BR_FDB_ADDED_BY_EXT_LEARN, &flags);
+ if (ndm_flags & NTF_OFFLOADED)
+ __set_bit(BR_FDB_OFFLOADED, &flags);
+ if (ndm_flags & NTF_STICKY)
+ __set_bit(BR_FDB_STICKY, &flags);
+
+ return flags;
}
-/*
- * Fill buffer with forwarding table records in
- * the API format.
- */
-int br_fdb_fillbuf(struct net_bridge *br, void *buf,
- unsigned long maxnum, unsigned long skip)
+static int __fdb_flush_validate_ifindex(const struct net_bridge *br,
+ int ifindex,
+ struct netlink_ext_ack *extack)
{
- struct __fdb_entry *fe = buf;
- int i, num = 0;
- struct hlist_node *h;
- struct net_bridge_fdb_entry *f;
+ const struct net_device *dev;
- memset(buf, 0, maxnum*sizeof(struct __fdb_entry));
+ dev = __dev_get_by_index(dev_net(br->dev), ifindex);
+ if (!dev) {
+ NL_SET_ERR_MSG_MOD(extack, "Unknown flush device ifindex");
+ return -ENODEV;
+ }
+ if (!netif_is_bridge_master(dev) && !netif_is_bridge_port(dev)) {
+ NL_SET_ERR_MSG_MOD(extack, "Flush device is not a bridge or bridge port");
+ return -EINVAL;
+ }
+ if (netif_is_bridge_master(dev) && dev != br->dev) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Flush bridge device does not match target bridge device");
+ return -EINVAL;
+ }
+ if (netif_is_bridge_port(dev)) {
+ struct net_bridge_port *p = br_port_get_rtnl(dev);
- rcu_read_lock();
- for (i = 0; i < BR_HASH_SIZE; i++) {
- hlist_for_each_entry_rcu(f, h, &br->hash[i], hlist) {
- if (num >= maxnum)
- goto out;
+ if (p->br != br) {
+ NL_SET_ERR_MSG_MOD(extack, "Port belongs to a different bridge device");
+ return -EINVAL;
+ }
+ }
- if (has_expired(br, f))
- continue;
+ return 0;
+}
- if (skip) {
- --skip;
- continue;
- }
+static const struct nla_policy br_fdb_del_bulk_policy[NDA_MAX + 1] = {
+ [NDA_VLAN] = NLA_POLICY_RANGE(NLA_U16, 1, VLAN_N_VID - 2),
+ [NDA_IFINDEX] = NLA_POLICY_MIN(NLA_S32, 1),
+ [NDA_NDM_STATE_MASK] = { .type = NLA_U16 },
+ [NDA_NDM_FLAGS_MASK] = { .type = NLA_U8 },
+};
+
+int br_fdb_delete_bulk(struct nlmsghdr *nlh, struct net_device *dev,
+ struct netlink_ext_ack *extack)
+{
+ struct net_bridge_fdb_flush_desc desc = {};
+ struct ndmsg *ndm = nlmsg_data(nlh);
+ struct net_bridge_port *p = NULL;
+ struct nlattr *tb[NDA_MAX + 1];
+ struct net_bridge *br;
+ u8 ndm_flags;
+ int err;
+
+ ndm_flags = ndm->ndm_flags & ~FDB_FLUSH_IGNORED_NDM_FLAGS;
- /* convert from internal format to API */
- memcpy(fe->mac_addr, f->addr.addr, ETH_ALEN);
- fe->port_no = f->dst->port_no;
- fe->is_local = f->is_local;
- if (!f->is_static)
- fe->ageing_timer_value = jiffies_to_clock_t(jiffies - f->ageing_timer);
- ++fe;
- ++num;
+ err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX,
+ br_fdb_del_bulk_policy, extack);
+ if (err)
+ return err;
+
+ if (netif_is_bridge_master(dev)) {
+ br = netdev_priv(dev);
+ } else {
+ p = br_port_get_rtnl(dev);
+ if (!p) {
+ NL_SET_ERR_MSG_MOD(extack, "Device is not a bridge port");
+ return -EINVAL;
}
+ br = p->br;
}
- out:
- rcu_read_unlock();
+ if (tb[NDA_VLAN])
+ desc.vlan_id = nla_get_u16(tb[NDA_VLAN]);
- return num;
+ if (ndm_flags & ~FDB_FLUSH_ALLOWED_NDM_FLAGS) {
+ NL_SET_ERR_MSG(extack, "Unsupported fdb flush ndm flag bits set");
+ return -EINVAL;
+ }
+ if (ndm->ndm_state & ~FDB_FLUSH_ALLOWED_NDM_STATES) {
+ NL_SET_ERR_MSG(extack, "Unsupported fdb flush ndm state bits set");
+ return -EINVAL;
+ }
+
+ desc.flags |= __ndm_state_to_fdb_flags(ndm->ndm_state);
+ desc.flags |= __ndm_flags_to_fdb_flags(ndm_flags);
+ if (tb[NDA_NDM_STATE_MASK]) {
+ u16 ndm_state_mask = nla_get_u16(tb[NDA_NDM_STATE_MASK]);
+
+ desc.flags_mask |= __ndm_state_to_fdb_flags(ndm_state_mask);
+ }
+ if (tb[NDA_NDM_FLAGS_MASK]) {
+ u8 ndm_flags_mask = nla_get_u8(tb[NDA_NDM_FLAGS_MASK]);
+
+ desc.flags_mask |= __ndm_flags_to_fdb_flags(ndm_flags_mask);
+ }
+ if (tb[NDA_IFINDEX]) {
+ int ifidx = nla_get_s32(tb[NDA_IFINDEX]);
+
+ err = __fdb_flush_validate_ifindex(br, ifidx, extack);
+ if (err)
+ return err;
+ desc.port_ifindex = ifidx;
+ } else if (p) {
+ /* flush was invoked with port device and NTF_MASTER */
+ desc.port_ifindex = p->dev->ifindex;
+ }
+
+ br_debug(br, "flushing port ifindex: %d vlan id: %u flags: 0x%lx flags mask: 0x%lx\n",
+ desc.port_ifindex, desc.vlan_id, desc.flags, desc.flags_mask);
+
+ br_fdb_flush(br, &desc);
+
+ return 0;
}
-static inline struct net_bridge_fdb_entry *fdb_find(struct hlist_head *head,
- const unsigned char *addr)
+/* Flush all entries referring to a specific port.
+ * if do_all is set also flush static entries
+ * if vid is set delete all entries that match the vlan_id
+ */
+void br_fdb_delete_by_port(struct net_bridge *br,
+ const struct net_bridge_port *p,
+ u16 vid,
+ int do_all)
{
- struct hlist_node *h;
- struct net_bridge_fdb_entry *fdb;
+ struct net_bridge_fdb_entry *f;
+ struct hlist_node *tmp;
+
+ spin_lock_bh(&br->hash_lock);
+ hlist_for_each_entry_safe(f, tmp, &br->fdb_list, fdb_node) {
+ if (f->dst != p)
+ continue;
+
+ if (!do_all)
+ if (test_bit(BR_FDB_STATIC, &f->flags) ||
+ (test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &f->flags) &&
+ !test_bit(BR_FDB_OFFLOADED, &f->flags)) ||
+ (vid && f->key.vlan_id != vid))
+ continue;
- hlist_for_each_entry_rcu(fdb, h, head, hlist) {
- if (!compare_ether_addr(fdb->addr.addr, addr))
- return fdb;
+ if (test_bit(BR_FDB_LOCAL, &f->flags))
+ fdb_delete_local(br, p, f);
+ else
+ fdb_delete(br, f, true);
}
- return NULL;
+ spin_unlock_bh(&br->hash_lock);
}
-static struct net_bridge_fdb_entry *fdb_create(struct hlist_head *head,
- struct net_bridge_port *source,
- const unsigned char *addr,
- int is_local)
+#if IS_ENABLED(CONFIG_ATM_LANE)
+/* Interface used by ATM LANE hook to test
+ * if an addr is on some other bridge port */
+int br_fdb_test_addr(struct net_device *dev, unsigned char *addr)
{
struct net_bridge_fdb_entry *fdb;
+ struct net_bridge_port *port;
+ int ret;
- fdb = kmem_cache_alloc(br_fdb_cache, GFP_ATOMIC);
- if (fdb) {
- memcpy(fdb->addr.addr, addr, ETH_ALEN);
- atomic_set(&fdb->use_count, 1);
- hlist_add_head_rcu(&fdb->hlist, head);
+ rcu_read_lock();
+ port = br_port_get_rcu(dev);
+ if (!port)
+ ret = 0;
+ else {
+ const struct net_bridge_port *dst = NULL;
- fdb->dst = source;
- fdb->is_local = is_local;
- fdb->is_static = is_local;
- fdb->ageing_timer = jiffies;
+ fdb = br_fdb_find_rcu(port->br, addr, 0);
+ if (fdb)
+ dst = READ_ONCE(fdb->dst);
+
+ ret = dst && dst->dev != dev &&
+ dst->state == BR_STATE_FORWARDING;
}
- return fdb;
+ rcu_read_unlock();
+
+ return ret;
}
+#endif /* CONFIG_ATM_LANE */
-static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
- const unsigned char *addr)
+/*
+ * Fill buffer with forwarding table records in
+ * the API format.
+ */
+int br_fdb_fillbuf(struct net_bridge *br, void *buf,
+ unsigned long maxnum, unsigned long skip)
{
- struct hlist_head *head = &br->hash[br_mac_hash(addr)];
- struct net_bridge_fdb_entry *fdb;
+ struct net_bridge_fdb_entry *f;
+ struct __fdb_entry *fe = buf;
+ int num = 0;
- if (!is_valid_ether_addr(addr))
- return -EINVAL;
+ memset(buf, 0, maxnum*sizeof(struct __fdb_entry));
- fdb = fdb_find(head, addr);
- if (fdb) {
- /* it is okay to have multiple ports with same
- * address, just use the first one.
- */
- if (fdb->is_local)
- return 0;
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) {
+ if (num >= maxnum)
+ break;
- printk(KERN_WARNING "%s adding interface with same address "
- "as a received packet\n",
- source->dev->name);
- fdb_delete(fdb);
- }
+ if (has_expired(br, f))
+ continue;
- if (!fdb_create(head, source, addr, 1))
- return -ENOMEM;
+ /* ignore pseudo entry for local MAC address */
+ if (!f->dst)
+ continue;
- return 0;
+ if (skip) {
+ --skip;
+ continue;
+ }
+
+ /* convert from internal format to API */
+ memcpy(fe->mac_addr, f->key.addr.addr, ETH_ALEN);
+
+ /* due to ABI compat need to split into hi/lo */
+ fe->port_no = f->dst->port_no;
+ fe->port_hi = f->dst->port_no >> 8;
+
+ fe->is_local = test_bit(BR_FDB_LOCAL, &f->flags);
+ if (!test_bit(BR_FDB_STATIC, &f->flags))
+ fe->ageing_timer_value = jiffies_delta_to_clock_t(jiffies - f->updated);
+ ++fe;
+ ++num;
+ }
+ rcu_read_unlock();
+
+ return num;
}
-int br_fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
- const unsigned char *addr)
+/* Add entry for local address of interface */
+int br_fdb_add_local(struct net_bridge *br, struct net_bridge_port *source,
+ const unsigned char *addr, u16 vid)
{
int ret;
spin_lock_bh(&br->hash_lock);
- ret = fdb_insert(br, source, addr);
+ ret = fdb_add_local(br, source, addr, vid);
spin_unlock_bh(&br->hash_lock);
return ret;
}
+/* returns true if the fdb was modified */
+static bool __fdb_mark_active(struct net_bridge_fdb_entry *fdb)
+{
+ return !!(test_bit(BR_FDB_NOTIFY_INACTIVE, &fdb->flags) &&
+ test_and_clear_bit(BR_FDB_NOTIFY_INACTIVE, &fdb->flags));
+}
+
void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
- const unsigned char *addr)
+ const unsigned char *addr, u16 vid, unsigned long flags)
{
- struct hlist_head *head = &br->hash[br_mac_hash(addr)];
struct net_bridge_fdb_entry *fdb;
/* some users want to always flood. */
if (hold_time(br) == 0)
return;
- fdb = fdb_find(head, addr);
+ fdb = fdb_find_rcu(&br->fdb_hash_tbl, addr, vid);
if (likely(fdb)) {
/* attempt to update an entry for a local interface */
- if (unlikely(fdb->is_local)) {
- if (net_ratelimit())
- printk(KERN_WARNING "%s: received packet with "
- " own address as source address\n",
- source->dev->name);
+ if (unlikely(test_bit(BR_FDB_LOCAL, &fdb->flags))) {
+ if (net_ratelimit())
+ br_warn(br, "received packet on %s with own address as source address (addr:%pM, vlan:%u)\n",
+ source->dev->name, addr, vid);
} else {
+ unsigned long now = jiffies;
+ bool fdb_modified = false;
+
+ if (now != fdb->updated) {
+ fdb->updated = now;
+ fdb_modified = __fdb_mark_active(fdb);
+ }
+
/* fastpath: update of existing entry */
- fdb->dst = source;
- fdb->ageing_timer = jiffies;
+ if (unlikely(source != READ_ONCE(fdb->dst) &&
+ !test_bit(BR_FDB_STICKY, &fdb->flags))) {
+ br_switchdev_fdb_notify(br, fdb, RTM_DELNEIGH);
+ WRITE_ONCE(fdb->dst, source);
+ fdb_modified = true;
+ /* Take over HW learned entry */
+ if (unlikely(test_bit(BR_FDB_ADDED_BY_EXT_LEARN,
+ &fdb->flags)))
+ clear_bit(BR_FDB_ADDED_BY_EXT_LEARN,
+ &fdb->flags);
+ /* Clear locked flag when roaming to an
+ * unlocked port.
+ */
+ if (unlikely(test_bit(BR_FDB_LOCKED, &fdb->flags)))
+ clear_bit(BR_FDB_LOCKED, &fdb->flags);
+ }
+
+ if (unlikely(test_bit(BR_FDB_ADDED_BY_USER, &flags))) {
+ set_bit(BR_FDB_ADDED_BY_USER, &fdb->flags);
+ if (test_and_clear_bit(BR_FDB_DYNAMIC_LEARNED,
+ &fdb->flags))
+ atomic_dec(&br->fdb_n_learned);
+ }
+ if (unlikely(fdb_modified)) {
+ trace_br_fdb_update(br, source, addr, vid, flags);
+ fdb_notify(br, fdb, RTM_NEWNEIGH, true);
+ }
}
} else {
spin_lock(&br->hash_lock);
- if (!fdb_find(head, addr))
- fdb_create(head, source, addr, 0);
+ fdb = fdb_create(br, source, addr, vid, flags);
+ if (fdb) {
+ trace_br_fdb_update(br, source, addr, vid, flags);
+ fdb_notify(br, fdb, RTM_NEWNEIGH, true);
+ }
/* else we lose race and someone else inserts
* it first, don't bother updating
*/
spin_unlock(&br->hash_lock);
}
}
+
+/* Dump information about entries, in response to GETNEIGH */
+int br_fdb_dump(struct sk_buff *skb,
+ struct netlink_callback *cb,
+ struct net_device *dev,
+ struct net_device *filter_dev,
+ int *idx)
+{
+ struct ndo_fdb_dump_context *ctx = (void *)cb->ctx;
+ struct net_bridge *br = netdev_priv(dev);
+ struct net_bridge_fdb_entry *f;
+ int err = 0;
+
+ if (!netif_is_bridge_master(dev))
+ return err;
+
+ if (!filter_dev) {
+ err = ndo_dflt_fdb_dump(skb, cb, dev, NULL, idx);
+ if (err < 0)
+ return err;
+ }
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) {
+ if (*idx < ctx->fdb_idx)
+ goto skip;
+ if (filter_dev && (!f->dst || f->dst->dev != filter_dev)) {
+ if (filter_dev != dev)
+ goto skip;
+ /* !f->dst is a special case for bridge
+ * It means the MAC belongs to the bridge
+ * Therefore need a little more filtering
+ * we only want to dump the !f->dst case
+ */
+ if (f->dst)
+ goto skip;
+ }
+ if (!filter_dev && f->dst)
+ goto skip;
+
+ err = fdb_fill_info(skb, br, f,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWNEIGH,
+ NLM_F_MULTI);
+ if (err < 0)
+ break;
+skip:
+ *idx += 1;
+ }
+ rcu_read_unlock();
+
+ return err;
+}
+
+int br_fdb_get(struct sk_buff *skb,
+ struct nlattr *tb[],
+ struct net_device *dev,
+ const unsigned char *addr,
+ u16 vid, u32 portid, u32 seq,
+ struct netlink_ext_ack *extack)
+{
+ struct net_bridge *br = netdev_priv(dev);
+ struct net_bridge_fdb_entry *f;
+ int err = 0;
+
+ rcu_read_lock();
+ f = br_fdb_find_rcu(br, addr, vid);
+ if (!f) {
+ NL_SET_ERR_MSG(extack, "Fdb entry not found");
+ err = -ENOENT;
+ goto errout;
+ }
+
+ err = fdb_fill_info(skb, br, f, portid, seq,
+ RTM_NEWNEIGH, 0);
+errout:
+ rcu_read_unlock();
+ return err;
+}
+
+/* returns true if the fdb is modified */
+static bool fdb_handle_notify(struct net_bridge_fdb_entry *fdb, u8 notify)
+{
+ bool modified = false;
+
+ /* allow to mark an entry as inactive, usually done on creation */
+ if ((notify & FDB_NOTIFY_INACTIVE_BIT) &&
+ !test_and_set_bit(BR_FDB_NOTIFY_INACTIVE, &fdb->flags))
+ modified = true;
+
+ if ((notify & FDB_NOTIFY_BIT) &&
+ !test_and_set_bit(BR_FDB_NOTIFY, &fdb->flags)) {
+ /* enabled activity tracking */
+ modified = true;
+ } else if (!(notify & FDB_NOTIFY_BIT) &&
+ test_and_clear_bit(BR_FDB_NOTIFY, &fdb->flags)) {
+ /* disabled activity tracking, clear notify state */
+ clear_bit(BR_FDB_NOTIFY_INACTIVE, &fdb->flags);
+ modified = true;
+ }
+
+ return modified;
+}
+
+/* Update (create or replace) forwarding database entry */
+static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source,
+ const u8 *addr, struct ndmsg *ndm, u16 flags, u16 vid,
+ struct nlattr *nfea_tb[])
+{
+ bool is_sticky = !!(ndm->ndm_flags & NTF_STICKY);
+ bool refresh = !nfea_tb[NFEA_DONT_REFRESH];
+ struct net_bridge_fdb_entry *fdb;
+ u16 state = ndm->ndm_state;
+ bool modified = false;
+ u8 notify = 0;
+
+ /* If the port cannot learn allow only local and static entries */
+ if (source && !(state & NUD_PERMANENT) && !(state & NUD_NOARP) &&
+ !(source->state == BR_STATE_LEARNING ||
+ source->state == BR_STATE_FORWARDING))
+ return -EPERM;
+
+ if (!source && !(state & NUD_PERMANENT)) {
+ pr_info("bridge: RTM_NEWNEIGH %s without NUD_PERMANENT\n",
+ br->dev->name);
+ return -EINVAL;
+ }
+
+ if (is_sticky && (state & NUD_PERMANENT))
+ return -EINVAL;
+
+ if (nfea_tb[NFEA_ACTIVITY_NOTIFY]) {
+ notify = nla_get_u8(nfea_tb[NFEA_ACTIVITY_NOTIFY]);
+ if ((notify & ~BR_FDB_NOTIFY_SETTABLE_BITS) ||
+ (notify & BR_FDB_NOTIFY_SETTABLE_BITS) == FDB_NOTIFY_INACTIVE_BIT)
+ return -EINVAL;
+ }
+
+ fdb = br_fdb_find(br, addr, vid);
+ if (fdb == NULL) {
+ if (!(flags & NLM_F_CREATE))
+ return -ENOENT;
+
+ fdb = fdb_create(br, source, addr, vid,
+ BIT(BR_FDB_ADDED_BY_USER));
+ if (!fdb)
+ return -ENOMEM;
+
+ modified = true;
+ } else {
+ if (flags & NLM_F_EXCL)
+ return -EEXIST;
+
+ if (READ_ONCE(fdb->dst) != source) {
+ WRITE_ONCE(fdb->dst, source);
+ modified = true;
+ }
+
+ set_bit(BR_FDB_ADDED_BY_USER, &fdb->flags);
+ if (test_and_clear_bit(BR_FDB_DYNAMIC_LEARNED, &fdb->flags))
+ atomic_dec(&br->fdb_n_learned);
+ }
+
+ if (fdb_to_nud(br, fdb) != state) {
+ if (state & NUD_PERMANENT) {
+ set_bit(BR_FDB_LOCAL, &fdb->flags);
+ if (!test_and_set_bit(BR_FDB_STATIC, &fdb->flags))
+ fdb_add_hw_addr(br, addr);
+ } else if (state & NUD_NOARP) {
+ clear_bit(BR_FDB_LOCAL, &fdb->flags);
+ if (!test_and_set_bit(BR_FDB_STATIC, &fdb->flags))
+ fdb_add_hw_addr(br, addr);
+ } else {
+ clear_bit(BR_FDB_LOCAL, &fdb->flags);
+ if (test_and_clear_bit(BR_FDB_STATIC, &fdb->flags))
+ fdb_del_hw_addr(br, addr);
+ }
+
+ modified = true;
+ }
+
+ if (is_sticky != test_bit(BR_FDB_STICKY, &fdb->flags)) {
+ change_bit(BR_FDB_STICKY, &fdb->flags);
+ modified = true;
+ }
+
+ if (test_and_clear_bit(BR_FDB_LOCKED, &fdb->flags))
+ modified = true;
+
+ if (fdb_handle_notify(fdb, notify))
+ modified = true;
+
+ fdb->used = jiffies;
+ if (modified) {
+ if (refresh)
+ fdb->updated = jiffies;
+ fdb_notify(br, fdb, RTM_NEWNEIGH, true);
+ }
+
+ return 0;
+}
+
+static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge *br,
+ struct net_bridge_port *p, const unsigned char *addr,
+ u16 nlh_flags, u16 vid, struct nlattr *nfea_tb[],
+ bool *notified, struct netlink_ext_ack *extack)
+{
+ int err = 0;
+
+ if (ndm->ndm_flags & NTF_USE) {
+ if (!p) {
+ pr_info("bridge: RTM_NEWNEIGH %s with NTF_USE is not supported\n",
+ br->dev->name);
+ return -EINVAL;
+ }
+ if (!nbp_state_should_learn(p))
+ return 0;
+
+ local_bh_disable();
+ rcu_read_lock();
+ br_fdb_update(br, p, addr, vid, BIT(BR_FDB_ADDED_BY_USER));
+ rcu_read_unlock();
+ local_bh_enable();
+ } else if (ndm->ndm_flags & NTF_EXT_LEARNED) {
+ if (!p && !(ndm->ndm_state & NUD_PERMANENT)) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "FDB entry towards bridge must be permanent");
+ return -EINVAL;
+ }
+ err = br_fdb_external_learn_add(br, p, addr, vid, false, true);
+ } else {
+ spin_lock_bh(&br->hash_lock);
+ err = fdb_add_entry(br, p, addr, ndm, nlh_flags, vid, nfea_tb);
+ spin_unlock_bh(&br->hash_lock);
+ }
+
+ if (!err)
+ *notified = true;
+ return err;
+}
+
+static const struct nla_policy br_nda_fdb_pol[NFEA_MAX + 1] = {
+ [NFEA_ACTIVITY_NOTIFY] = { .type = NLA_U8 },
+ [NFEA_DONT_REFRESH] = { .type = NLA_FLAG },
+};
+
+/* Add new permanent fdb entry with RTM_NEWNEIGH */
+int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
+ struct net_device *dev,
+ const unsigned char *addr, u16 vid, u16 nlh_flags,
+ bool *notified, struct netlink_ext_ack *extack)
+{
+ struct nlattr *nfea_tb[NFEA_MAX + 1], *attr;
+ struct net_bridge_vlan_group *vg;
+ struct net_bridge_port *p = NULL;
+ struct net_bridge_vlan *v;
+ struct net_bridge *br = NULL;
+ u32 ext_flags = 0;
+ int err = 0;
+
+ trace_br_fdb_add(ndm, dev, addr, vid, nlh_flags);
+
+ if (!(ndm->ndm_state & (NUD_PERMANENT|NUD_NOARP|NUD_REACHABLE))) {
+ pr_info("bridge: RTM_NEWNEIGH with invalid state %#x\n", ndm->ndm_state);
+ return -EINVAL;
+ }
+
+ if (is_zero_ether_addr(addr)) {
+ pr_info("bridge: RTM_NEWNEIGH with invalid ether address\n");
+ return -EINVAL;
+ }
+
+ if (netif_is_bridge_master(dev)) {
+ br = netdev_priv(dev);
+ vg = br_vlan_group(br);
+ } else {
+ p = br_port_get_rtnl(dev);
+ if (!p) {
+ pr_info("bridge: RTM_NEWNEIGH %s not a bridge port\n",
+ dev->name);
+ return -EINVAL;
+ }
+ br = p->br;
+ vg = nbp_vlan_group(p);
+ }
+
+ if (tb[NDA_FLAGS_EXT])
+ ext_flags = nla_get_u32(tb[NDA_FLAGS_EXT]);
+
+ if (ext_flags & NTF_EXT_LOCKED) {
+ NL_SET_ERR_MSG_MOD(extack, "Cannot add FDB entry with \"locked\" flag set");
+ return -EINVAL;
+ }
+
+ if (tb[NDA_FDB_EXT_ATTRS]) {
+ attr = tb[NDA_FDB_EXT_ATTRS];
+ err = nla_parse_nested(nfea_tb, NFEA_MAX, attr,
+ br_nda_fdb_pol, extack);
+ if (err)
+ return err;
+ } else {
+ memset(nfea_tb, 0, sizeof(struct nlattr *) * (NFEA_MAX + 1));
+ }
+
+ if (vid) {
+ v = br_vlan_find(vg, vid);
+ if (!v || !br_vlan_should_use(v)) {
+ pr_info("bridge: RTM_NEWNEIGH with unconfigured vlan %d on %s\n", vid, dev->name);
+ return -EINVAL;
+ }
+
+ /* VID was specified, so use it. */
+ err = __br_fdb_add(ndm, br, p, addr, nlh_flags, vid, nfea_tb,
+ notified, extack);
+ } else {
+ err = __br_fdb_add(ndm, br, p, addr, nlh_flags, 0, nfea_tb,
+ notified, extack);
+ if (err || !vg || !vg->num_vlans)
+ goto out;
+
+ /* We have vlans configured on this port and user didn't
+ * specify a VLAN. To be nice, add/update entry for every
+ * vlan on this port.
+ */
+ list_for_each_entry(v, &vg->vlan_list, vlist) {
+ if (!br_vlan_should_use(v))
+ continue;
+ err = __br_fdb_add(ndm, br, p, addr, nlh_flags, v->vid,
+ nfea_tb, notified, extack);
+ if (err)
+ goto out;
+ }
+ }
+
+out:
+ return err;
+}
+
+static int fdb_delete_by_addr_and_port(struct net_bridge *br,
+ const struct net_bridge_port *p,
+ const u8 *addr, u16 vlan, bool *notified)
+{
+ struct net_bridge_fdb_entry *fdb;
+
+ fdb = br_fdb_find(br, addr, vlan);
+ if (!fdb || READ_ONCE(fdb->dst) != p)
+ return -ENOENT;
+
+ fdb_delete(br, fdb, true);
+ *notified = true;
+
+ return 0;
+}
+
+static int __br_fdb_delete(struct net_bridge *br,
+ const struct net_bridge_port *p,
+ const unsigned char *addr, u16 vid, bool *notified)
+{
+ int err;
+
+ spin_lock_bh(&br->hash_lock);
+ err = fdb_delete_by_addr_and_port(br, p, addr, vid, notified);
+ spin_unlock_bh(&br->hash_lock);
+
+ return err;
+}
+
+/* Remove neighbor entry with RTM_DELNEIGH */
+int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
+ struct net_device *dev,
+ const unsigned char *addr, u16 vid, bool *notified,
+ struct netlink_ext_ack *extack)
+{
+ struct net_bridge_vlan_group *vg;
+ struct net_bridge_port *p = NULL;
+ struct net_bridge *br;
+ int err;
+
+ if (netif_is_bridge_master(dev)) {
+ br = netdev_priv(dev);
+ vg = br_vlan_group(br);
+ } else {
+ p = br_port_get_rtnl(dev);
+ if (!p) {
+ pr_info("bridge: RTM_DELNEIGH %s not a bridge port\n",
+ dev->name);
+ return -EINVAL;
+ }
+ vg = nbp_vlan_group(p);
+ br = p->br;
+ }
+
+ if (vid) {
+ err = __br_fdb_delete(br, p, addr, vid, notified);
+ } else {
+ struct net_bridge_vlan *v;
+
+ err = -ENOENT;
+ err &= __br_fdb_delete(br, p, addr, 0, notified);
+ if (!vg || !vg->num_vlans)
+ return err;
+
+ list_for_each_entry(v, &vg->vlan_list, vlist) {
+ if (!br_vlan_should_use(v))
+ continue;
+ err &= __br_fdb_delete(br, p, addr, v->vid, notified);
+ }
+ }
+
+ return err;
+}
+
+int br_fdb_sync_static(struct net_bridge *br, struct net_bridge_port *p)
+{
+ struct net_bridge_fdb_entry *f, *tmp;
+ int err = 0;
+
+ ASSERT_RTNL();
+
+ /* the key here is that static entries change only under rtnl */
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) {
+ /* We only care for static entries */
+ if (!test_bit(BR_FDB_STATIC, &f->flags))
+ continue;
+ err = dev_uc_add(p->dev, f->key.addr.addr);
+ if (err)
+ goto rollback;
+ }
+done:
+ rcu_read_unlock();
+
+ return err;
+
+rollback:
+ hlist_for_each_entry_rcu(tmp, &br->fdb_list, fdb_node) {
+ /* We only care for static entries */
+ if (!test_bit(BR_FDB_STATIC, &tmp->flags))
+ continue;
+ if (tmp == f)
+ break;
+ dev_uc_del(p->dev, tmp->key.addr.addr);
+ }
+
+ goto done;
+}
+
+void br_fdb_unsync_static(struct net_bridge *br, struct net_bridge_port *p)
+{
+ struct net_bridge_fdb_entry *f;
+
+ ASSERT_RTNL();
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) {
+ /* We only care for static entries */
+ if (!test_bit(BR_FDB_STATIC, &f->flags))
+ continue;
+
+ dev_uc_del(p->dev, f->key.addr.addr);
+ }
+ rcu_read_unlock();
+}
+
+int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p,
+ const unsigned char *addr, u16 vid, bool locked,
+ bool swdev_notify)
+{
+ struct net_bridge_fdb_entry *fdb;
+ bool modified = false;
+ int err = 0;
+
+ trace_br_fdb_external_learn_add(br, p, addr, vid);
+
+ if (locked && (!p || !(p->flags & BR_PORT_MAB)))
+ return -EINVAL;
+
+ spin_lock_bh(&br->hash_lock);
+
+ fdb = br_fdb_find(br, addr, vid);
+ if (!fdb) {
+ unsigned long flags = BIT(BR_FDB_ADDED_BY_EXT_LEARN);
+
+ if (swdev_notify)
+ flags |= BIT(BR_FDB_ADDED_BY_USER);
+
+ if (!p)
+ flags |= BIT(BR_FDB_LOCAL);
+
+ if (locked)
+ flags |= BIT(BR_FDB_LOCKED);
+
+ fdb = fdb_create(br, p, addr, vid, flags);
+ if (!fdb) {
+ err = -ENOMEM;
+ goto err_unlock;
+ }
+ fdb_notify(br, fdb, RTM_NEWNEIGH, swdev_notify);
+ } else {
+ if (locked &&
+ (!test_bit(BR_FDB_LOCKED, &fdb->flags) ||
+ READ_ONCE(fdb->dst) != p)) {
+ err = -EINVAL;
+ goto err_unlock;
+ }
+
+ fdb->updated = jiffies;
+
+ if (READ_ONCE(fdb->dst) != p) {
+ WRITE_ONCE(fdb->dst, p);
+ modified = true;
+ }
+
+ if (test_and_set_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags)) {
+ /* Refresh entry */
+ fdb->used = jiffies;
+ } else {
+ modified = true;
+ }
+
+ if (locked != test_bit(BR_FDB_LOCKED, &fdb->flags)) {
+ change_bit(BR_FDB_LOCKED, &fdb->flags);
+ modified = true;
+ }
+
+ if (swdev_notify)
+ set_bit(BR_FDB_ADDED_BY_USER, &fdb->flags);
+
+ if (!p)
+ set_bit(BR_FDB_LOCAL, &fdb->flags);
+
+ if ((swdev_notify || !p) &&
+ test_and_clear_bit(BR_FDB_DYNAMIC_LEARNED, &fdb->flags))
+ atomic_dec(&br->fdb_n_learned);
+
+ if (modified)
+ fdb_notify(br, fdb, RTM_NEWNEIGH, swdev_notify);
+ }
+
+err_unlock:
+ spin_unlock_bh(&br->hash_lock);
+
+ return err;
+}
+
+int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p,
+ const unsigned char *addr, u16 vid,
+ bool swdev_notify)
+{
+ struct net_bridge_fdb_entry *fdb;
+ int err = 0;
+
+ spin_lock_bh(&br->hash_lock);
+
+ fdb = br_fdb_find(br, addr, vid);
+ if (fdb && test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags))
+ fdb_delete(br, fdb, swdev_notify);
+ else
+ err = -ENOENT;
+
+ spin_unlock_bh(&br->hash_lock);
+
+ return err;
+}
+
+void br_fdb_offloaded_set(struct net_bridge *br, struct net_bridge_port *p,
+ const unsigned char *addr, u16 vid, bool offloaded)
+{
+ struct net_bridge_fdb_entry *fdb;
+
+ spin_lock_bh(&br->hash_lock);
+
+ fdb = br_fdb_find(br, addr, vid);
+ if (fdb && offloaded != test_bit(BR_FDB_OFFLOADED, &fdb->flags))
+ change_bit(BR_FDB_OFFLOADED, &fdb->flags);
+
+ spin_unlock_bh(&br->hash_lock);
+}
+
+void br_fdb_clear_offload(const struct net_device *dev, u16 vid)
+{
+ struct net_bridge_fdb_entry *f;
+ struct net_bridge_port *p;
+
+ ASSERT_RTNL();
+
+ p = br_port_get_rtnl(dev);
+ if (!p)
+ return;
+
+ spin_lock_bh(&p->br->hash_lock);
+ hlist_for_each_entry(f, &p->br->fdb_list, fdb_node) {
+ if (f->dst == p && f->key.vlan_id == vid)
+ clear_bit(BR_FDB_OFFLOADED, &f->flags);
+ }
+ spin_unlock_bh(&p->br->hash_lock);
+}
+EXPORT_SYMBOL_GPL(br_fdb_clear_offload);
diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index 191b861e5e53..dea09096ad0f 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -1,160 +1,363 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Forwarding decision
* Linux ethernet bridge
*
* Authors:
* Lennert Buytenhek <buytenh@gnu.org>
- *
- * $Id: br_forward.c,v 1.4 2001/08/14 22:05:57 davem Exp $
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
+#include <linux/err.h>
+#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>
+#include <linux/netpoll.h>
#include <linux/skbuff.h>
#include <linux/if_vlan.h>
#include <linux/netfilter_bridge.h>
#include "br_private.h"
-/* Don't forward packets to originating port or forwarding diasabled */
-static inline int should_deliver(const struct net_bridge_port *p,
+/* Don't forward packets to originating port or forwarding disabled */
+static inline int should_deliver(const struct net_bridge_port *p,
const struct sk_buff *skb)
{
- return (skb->dev != p->dev && p->state == BR_STATE_FORWARDING);
-}
+ struct net_bridge_vlan_group *vg;
-static inline unsigned packet_length(const struct sk_buff *skb)
-{
- return skb->len - (skb->protocol == htons(ETH_P_8021Q) ? VLAN_HLEN : 0);
+ vg = nbp_vlan_group_rcu(p);
+ return ((p->flags & BR_HAIRPIN_MODE) || skb->dev != p->dev) &&
+ (br_mst_is_enabled(p) || p->state == BR_STATE_FORWARDING) &&
+ br_allowed_egress(vg, skb) && nbp_switchdev_allowed_egress(p, skb) &&
+ !br_skb_isolated(p, skb);
}
-int br_dev_queue_push_xmit(struct sk_buff *skb)
+int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- /* drop mtu oversized packets except gso */
- if (packet_length(skb) > skb->dev->mtu && !skb_is_gso(skb))
- kfree_skb(skb);
- else {
- /* ip_refrag calls ip_fragment, doesn't copy the MAC header. */
- if (nf_bridge_maybe_copy_header(skb))
- kfree_skb(skb);
- else {
- skb_push(skb, ETH_HLEN);
+ skb_push(skb, ETH_HLEN);
+ if (!is_skb_forwardable(skb->dev, skb))
+ goto drop;
- dev_queue_xmit(skb);
- }
+ br_drop_fake_rtable(skb);
+
+ if (skb->ip_summed == CHECKSUM_PARTIAL &&
+ eth_type_vlan(skb->protocol)) {
+ int depth;
+
+ if (!vlan_get_protocol_and_depth(skb, skb->protocol, &depth))
+ goto drop;
+
+ skb_set_network_header(skb, depth);
}
+ br_switchdev_frame_set_offload_fwd_mark(skb);
+
+ dev_queue_xmit(skb);
+
+ return 0;
+
+drop:
+ kfree_skb(skb);
return 0;
}
+EXPORT_SYMBOL_GPL(br_dev_queue_push_xmit);
-int br_forward_finish(struct sk_buff *skb)
+int br_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- return NF_HOOK(PF_BRIDGE, NF_BR_POST_ROUTING, skb, NULL, skb->dev,
+ skb_clear_tstamp(skb);
+ return NF_HOOK(NFPROTO_BRIDGE, NF_BR_POST_ROUTING,
+ net, sk, skb, NULL, skb->dev,
br_dev_queue_push_xmit);
}
+EXPORT_SYMBOL_GPL(br_forward_finish);
-static void __br_deliver(const struct net_bridge_port *to, struct sk_buff *skb)
-{
- skb->dev = to->dev;
- NF_HOOK(PF_BRIDGE, NF_BR_LOCAL_OUT, skb, NULL, skb->dev,
- br_forward_finish);
-}
-
-static void __br_forward(const struct net_bridge_port *to, struct sk_buff *skb)
+static void __br_forward(const struct net_bridge_port *to,
+ struct sk_buff *skb, bool local_orig)
{
+ struct net_bridge_vlan_group *vg;
struct net_device *indev;
+ struct net *net;
+ int br_hook;
+
+ /* Mark the skb for forwarding offload early so that br_handle_vlan()
+ * can know whether to pop the VLAN header on egress or keep it.
+ */
+ nbp_switchdev_frame_mark_tx_fwd_offload(to, skb);
+
+ vg = nbp_vlan_group_rcu(to);
+ skb = br_handle_vlan(to->br, to, vg, skb);
+ if (!skb)
+ return;
indev = skb->dev;
skb->dev = to->dev;
- skb->ip_summed = CHECKSUM_NONE;
+ if (!local_orig) {
+ if (skb_warn_if_lro(skb)) {
+ kfree_skb(skb);
+ return;
+ }
+ br_hook = NF_BR_FORWARD;
+ skb_forward_csum(skb);
+ net = dev_net(indev);
+ } else {
+ if (unlikely(netpoll_tx_running(to->br->dev))) {
+ skb_push(skb, ETH_HLEN);
+ if (!is_skb_forwardable(skb->dev, skb))
+ kfree_skb(skb);
+ else
+ br_netpoll_send_skb(to, skb);
+ return;
+ }
+ br_hook = NF_BR_LOCAL_OUT;
+ net = dev_net(skb->dev);
+ indev = NULL;
+ }
- NF_HOOK(PF_BRIDGE, NF_BR_FORWARD, skb, indev, skb->dev,
- br_forward_finish);
+ NF_HOOK(NFPROTO_BRIDGE, br_hook,
+ net, NULL, skb, indev, skb->dev,
+ br_forward_finish);
}
-/* called with rcu_read_lock */
-void br_deliver(const struct net_bridge_port *to, struct sk_buff *skb)
+static int deliver_clone(const struct net_bridge_port *prev,
+ struct sk_buff *skb, bool local_orig)
{
- if (should_deliver(to, skb)) {
- __br_deliver(to, skb);
- return;
+ struct net_device *dev = BR_INPUT_SKB_CB(skb)->brdev;
+
+ skb = skb_clone(skb, GFP_ATOMIC);
+ if (!skb) {
+ DEV_STATS_INC(dev, tx_dropped);
+ return -ENOMEM;
}
- kfree_skb(skb);
+ __br_forward(prev, skb, local_orig);
+ return 0;
}
-/* called with rcu_read_lock */
-void br_forward(const struct net_bridge_port *to, struct sk_buff *skb)
+/**
+ * br_forward - forward a packet to a specific port
+ * @to: destination port
+ * @skb: packet being forwarded
+ * @local_rcv: packet will be received locally after forwarding
+ * @local_orig: packet is locally originated
+ *
+ * Should be called with rcu_read_lock.
+ */
+void br_forward(const struct net_bridge_port *to,
+ struct sk_buff *skb, bool local_rcv, bool local_orig)
{
+ if (unlikely(!to))
+ goto out;
+
+ /* redirect to backup link if the destination port is down */
+ if (rcu_access_pointer(to->backup_port) &&
+ (!netif_carrier_ok(to->dev) || !netif_running(to->dev))) {
+ struct net_bridge_port *backup_port;
+
+ backup_port = rcu_dereference(to->backup_port);
+ if (unlikely(!backup_port))
+ goto out;
+ BR_INPUT_SKB_CB(skb)->backup_nhid = READ_ONCE(to->backup_nhid);
+ to = backup_port;
+ }
+
if (should_deliver(to, skb)) {
- __br_forward(to, skb);
+ if (local_rcv)
+ deliver_clone(to, skb, local_orig);
+ else
+ __br_forward(to, skb, local_orig);
return;
}
- kfree_skb(skb);
+out:
+ if (!local_rcv)
+ kfree_skb(skb);
}
+EXPORT_SYMBOL_GPL(br_forward);
-/* called under bridge lock */
-static void br_flood(struct net_bridge *br, struct sk_buff *skb, int clone,
- void (*__packet_hook)(const struct net_bridge_port *p,
- struct sk_buff *skb))
+static struct net_bridge_port *maybe_deliver(
+ struct net_bridge_port *prev, struct net_bridge_port *p,
+ struct sk_buff *skb, bool local_orig)
{
+ u8 igmp_type = br_multicast_igmp_type(skb);
+ int err;
+
+ if (!should_deliver(p, skb))
+ return prev;
+
+ nbp_switchdev_frame_mark_tx_fwd_to_hwdom(p, skb);
+
+ if (!prev)
+ goto out;
+
+ err = deliver_clone(prev, skb, local_orig);
+ if (err)
+ return ERR_PTR(err);
+out:
+ br_multicast_count(p->br, p, skb, igmp_type, BR_MCAST_DIR_TX);
+
+ return p;
+}
+
+/* called under rcu_read_lock */
+void br_flood(struct net_bridge *br, struct sk_buff *skb,
+ enum br_pkt_type pkt_type, bool local_rcv, bool local_orig,
+ u16 vid)
+{
+ enum skb_drop_reason reason = SKB_DROP_REASON_NO_TX_TARGET;
+ struct net_bridge_port *prev = NULL;
struct net_bridge_port *p;
- struct net_bridge_port *prev;
- if (clone) {
- struct sk_buff *skb2;
+ br_tc_skb_miss_set(skb, pkt_type != BR_PKT_BROADCAST);
- if ((skb2 = skb_clone(skb, GFP_ATOMIC)) == NULL) {
- br->statistics.tx_dropped++;
- return;
+ list_for_each_entry_rcu(p, &br->port_list, list) {
+ /* Do not flood unicast traffic to ports that turn it off, nor
+ * other traffic if flood off, except for traffic we originate
+ */
+ switch (pkt_type) {
+ case BR_PKT_UNICAST:
+ if (!(p->flags & BR_FLOOD))
+ continue;
+ break;
+ case BR_PKT_MULTICAST:
+ if (!(p->flags & BR_MCAST_FLOOD) && skb->dev != br->dev)
+ continue;
+ break;
+ case BR_PKT_BROADCAST:
+ if (!(p->flags & BR_BCAST_FLOOD) && skb->dev != br->dev)
+ continue;
+ break;
}
- skb = skb2;
+ /* Do not flood to ports that enable proxy ARP */
+ if (p->flags & BR_PROXYARP)
+ continue;
+ if (BR_INPUT_SKB_CB(skb)->proxyarp_replied &&
+ ((p->flags & BR_PROXYARP_WIFI) ||
+ br_is_neigh_suppress_enabled(p, vid)))
+ continue;
+
+ prev = maybe_deliver(prev, p, skb, local_orig);
+ if (IS_ERR(prev)) {
+ reason = PTR_ERR(prev) == -ENOMEM ? SKB_DROP_REASON_NOMEM :
+ SKB_DROP_REASON_NOT_SPECIFIED;
+ goto out;
+ }
}
- prev = NULL;
+ if (!prev)
+ goto out;
- list_for_each_entry_rcu(p, &br->port_list, list) {
- if (should_deliver(p, skb)) {
- if (prev != NULL) {
- struct sk_buff *skb2;
+ if (local_rcv)
+ deliver_clone(prev, skb, local_orig);
+ else
+ __br_forward(prev, skb, local_orig);
+ return;
- if ((skb2 = skb_clone(skb, GFP_ATOMIC)) == NULL) {
- br->statistics.tx_dropped++;
- kfree_skb(skb);
- return;
- }
+out:
+ if (!local_rcv)
+ kfree_skb_reason(skb, reason);
+}
- __packet_hook(prev, skb2);
- }
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+static void maybe_deliver_addr(struct net_bridge_port *p, struct sk_buff *skb,
+ const unsigned char *addr, bool local_orig)
+{
+ struct net_device *dev = BR_INPUT_SKB_CB(skb)->brdev;
+ const unsigned char *src = eth_hdr(skb)->h_source;
+ struct sk_buff *nskb;
- prev = p;
- }
- }
+ if (!should_deliver(p, skb))
+ return;
- if (prev != NULL) {
- __packet_hook(prev, skb);
+ /* Even with hairpin, no soliloquies - prevent breaking IPv6 DAD */
+ if (skb->dev == p->dev && ether_addr_equal(src, addr))
+ return;
+
+ __skb_push(skb, ETH_HLEN);
+ nskb = pskb_copy(skb, GFP_ATOMIC);
+ __skb_pull(skb, ETH_HLEN);
+ if (!nskb) {
+ DEV_STATS_INC(dev, tx_dropped);
return;
}
- kfree_skb(skb);
-}
+ skb = nskb;
+ __skb_pull(skb, ETH_HLEN);
+ if (!is_broadcast_ether_addr(addr))
+ memcpy(eth_hdr(skb)->h_dest, addr, ETH_ALEN);
+ __br_forward(p, skb, local_orig);
+}
/* called with rcu_read_lock */
-void br_flood_deliver(struct net_bridge *br, struct sk_buff *skb, int clone)
+void br_multicast_flood(struct net_bridge_mdb_entry *mdst,
+ struct sk_buff *skb,
+ struct net_bridge_mcast *brmctx,
+ bool local_rcv, bool local_orig)
{
- br_flood(br, skb, clone, __br_deliver);
-}
+ enum skb_drop_reason reason = SKB_DROP_REASON_NO_TX_TARGET;
+ struct net_bridge_port *prev = NULL;
+ struct net_bridge_port_group *p;
+ bool allow_mode_include = true;
+ struct hlist_node *rp;
-/* called under bridge lock */
-void br_flood_forward(struct net_bridge *br, struct sk_buff *skb, int clone)
-{
- br_flood(br, skb, clone, __br_forward);
+ rp = br_multicast_get_first_rport_node(brmctx, skb);
+
+ if (mdst) {
+ p = rcu_dereference(mdst->ports);
+ if (br_multicast_should_handle_mode(brmctx, mdst->addr.proto) &&
+ br_multicast_is_star_g(&mdst->addr))
+ allow_mode_include = false;
+ } else {
+ p = NULL;
+ br_tc_skb_miss_set(skb, true);
+ }
+
+ while (p || rp) {
+ struct net_bridge_port *port, *lport, *rport;
+
+ lport = p ? p->key.port : NULL;
+ rport = br_multicast_rport_from_node_skb(rp, skb);
+
+ if ((unsigned long)lport > (unsigned long)rport) {
+ port = lport;
+
+ if (port->flags & BR_MULTICAST_TO_UNICAST) {
+ maybe_deliver_addr(lport, skb, p->eth_addr,
+ local_orig);
+ goto delivered;
+ }
+ if ((!allow_mode_include &&
+ p->filter_mode == MCAST_INCLUDE) ||
+ (p->flags & MDB_PG_FLAGS_BLOCKED))
+ goto delivered;
+ } else {
+ port = rport;
+ }
+
+ prev = maybe_deliver(prev, port, skb, local_orig);
+ if (IS_ERR(prev)) {
+ reason = PTR_ERR(prev) == -ENOMEM ? SKB_DROP_REASON_NOMEM :
+ SKB_DROP_REASON_NOT_SPECIFIED;
+ goto out;
+ }
+delivered:
+ if ((unsigned long)lport >= (unsigned long)port)
+ p = rcu_dereference(p->next);
+ if ((unsigned long)rport >= (unsigned long)port)
+ rp = rcu_dereference(hlist_next_rcu(rp));
+ }
+
+ if (!prev)
+ goto out;
+
+ if (local_rcv)
+ deliver_clone(prev, skb, local_orig);
+ else
+ __br_forward(prev, skb, local_orig);
+ return;
+
+out:
+ if (!local_rcv)
+ kfree_skb_reason(skb, reason);
}
+#endif
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index f753c40c11d2..4c67a32745f6 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -1,27 +1,28 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Userspace interface
* Linux ethernet bridge
*
* Authors:
* Lennert Buytenhek <buytenh@gnu.org>
- *
- * $Id: br_if.c,v 1.7 2001/12/24 00:59:55 davem Exp $
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/kernel.h>
#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/netpoll.h>
#include <linux/ethtool.h>
#include <linux/if_arp.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/rtnetlink.h>
#include <linux/if_ether.h>
+#include <linux/slab.h>
+#include <net/dsa.h>
#include <net/sock.h>
+#include <linux/if_vlan.h>
+#include <net/switchdev.h>
+#include <net/net_namespace.h>
#include "br_private.h"
@@ -29,35 +30,31 @@
* Determine initial path cost based on speed.
* using recommendations from 802.1d standard
*
- * Need to simulate user ioctl because not all device's that support
- * ethtool, use ethtool_ops. Also, since driver might sleep need to
- * not be holding any locks.
+ * Since driver might sleep need to not be holding any locks.
*/
static int port_cost(struct net_device *dev)
{
- struct ethtool_cmd ecmd = { ETHTOOL_GSET };
- struct ifreq ifr;
- mm_segment_t old_fs;
- int err;
-
- strncpy(ifr.ifr_name, dev->name, IFNAMSIZ);
- ifr.ifr_data = (void __user *) &ecmd;
+ struct ethtool_link_ksettings ecmd;
- old_fs = get_fs();
- set_fs(KERNEL_DS);
- err = dev_ethtool(&ifr);
- set_fs(old_fs);
-
- if (!err) {
- switch(ecmd.speed) {
- case SPEED_100:
- return 19;
- case SPEED_1000:
- return 4;
+ if (!__ethtool_get_link_ksettings(dev, &ecmd)) {
+ switch (ecmd.base.speed) {
case SPEED_10000:
return 2;
+ case SPEED_5000:
+ return 3;
+ case SPEED_2500:
+ return 4;
+ case SPEED_1000:
+ return 5;
+ case SPEED_100:
+ return 19;
case SPEED_10:
return 100;
+ case SPEED_UNKNOWN:
+ return 100;
+ default:
+ if (ecmd.base.speed > SPEED_10000)
+ return 1;
}
}
@@ -72,39 +69,191 @@ static int port_cost(struct net_device *dev)
}
-/*
- * Check for port carrier transistions.
- * Called from work queue to allow for calling functions that
- * might sleep (such as speed check), and to debounce.
+/* Check for port carrier transitions. */
+void br_port_carrier_check(struct net_bridge_port *p, bool *notified)
+{
+ struct net_device *dev = p->dev;
+ struct net_bridge *br = p->br;
+
+ if (!(p->flags & BR_ADMIN_COST) &&
+ netif_running(dev) && netif_oper_up(dev))
+ p->path_cost = port_cost(dev);
+
+ *notified = false;
+ if (!netif_running(br->dev))
+ return;
+
+ spin_lock_bh(&br->lock);
+ if (netif_running(dev) && netif_oper_up(dev)) {
+ if (p->state == BR_STATE_DISABLED) {
+ br_stp_enable_port(p);
+ *notified = true;
+ }
+ } else {
+ if (p->state != BR_STATE_DISABLED) {
+ br_stp_disable_port(p);
+ *notified = true;
+ }
+ }
+ spin_unlock_bh(&br->lock);
+}
+
+static void br_port_set_promisc(struct net_bridge_port *p)
+{
+ int err = 0;
+
+ if (br_promisc_port(p))
+ return;
+
+ err = dev_set_promiscuity(p->dev, 1);
+ if (err)
+ return;
+
+ br_fdb_unsync_static(p->br, p);
+ p->flags |= BR_PROMISC;
+}
+
+static void br_port_clear_promisc(struct net_bridge_port *p)
+{
+ int err;
+
+ /* Check if the port is already non-promisc or if it doesn't
+ * support UNICAST filtering. Without unicast filtering support
+ * we'll end up re-enabling promisc mode anyway, so just check for
+ * it here.
+ */
+ if (!br_promisc_port(p) || !(p->dev->priv_flags & IFF_UNICAST_FLT))
+ return;
+
+ /* Since we'll be clearing the promisc mode, program the port
+ * first so that we don't have interruption in traffic.
+ */
+ err = br_fdb_sync_static(p->br, p);
+ if (err)
+ return;
+
+ dev_set_promiscuity(p->dev, -1);
+ p->flags &= ~BR_PROMISC;
+}
+
+/* When a port is added or removed or when certain port flags
+ * change, this function is called to automatically manage
+ * promiscuity setting of all the bridge ports. We are always called
+ * under RTNL so can skip using rcu primitives.
*/
-static void port_carrier_check(void *arg)
+void br_manage_promisc(struct net_bridge *br)
{
- struct net_device *dev = arg;
struct net_bridge_port *p;
- struct net_bridge *br;
-
- rtnl_lock();
- p = dev->br_port;
- if (!p)
- goto done;
- br = p->br;
+ bool set_all = false;
- if (netif_carrier_ok(dev))
- p->path_cost = port_cost(dev);
+ /* If vlan filtering is disabled or bridge interface is placed
+ * into promiscuous mode, place all ports in promiscuous mode.
+ */
+ if ((br->dev->flags & IFF_PROMISC) || !br_vlan_enabled(br->dev))
+ set_all = true;
- if (br->dev->flags & IFF_UP) {
- spin_lock_bh(&br->lock);
- if (netif_carrier_ok(dev)) {
- if (p->state == BR_STATE_DISABLED)
- br_stp_enable_port(p);
+ list_for_each_entry(p, &br->port_list, list) {
+ if (set_all) {
+ br_port_set_promisc(p);
} else {
- if (p->state != BR_STATE_DISABLED)
- br_stp_disable_port(p);
+ /* If the number of auto-ports is <= 1, then all other
+ * ports will have their output configuration
+ * statically specified through fdbs. Since ingress
+ * on the auto-port becomes forwarding/egress to other
+ * ports and egress configuration is statically known,
+ * we can say that ingress configuration of the
+ * auto-port is also statically known.
+ * This lets us disable promiscuous mode and write
+ * this config to hw.
+ */
+ if ((p->dev->priv_flags & IFF_UNICAST_FLT) &&
+ (br->auto_cnt == 0 ||
+ (br->auto_cnt == 1 && br_auto_port(p))))
+ br_port_clear_promisc(p);
+ else
+ br_port_set_promisc(p);
}
- spin_unlock_bh(&br->lock);
}
-done:
- rtnl_unlock();
+}
+
+int nbp_backup_change(struct net_bridge_port *p,
+ struct net_device *backup_dev)
+{
+ struct net_bridge_port *old_backup = rtnl_dereference(p->backup_port);
+ struct net_bridge_port *backup_p = NULL;
+
+ ASSERT_RTNL();
+
+ if (backup_dev) {
+ if (!netif_is_bridge_port(backup_dev))
+ return -ENOENT;
+
+ backup_p = br_port_get_rtnl(backup_dev);
+ if (backup_p->br != p->br)
+ return -EINVAL;
+ }
+
+ if (p == backup_p)
+ return -EINVAL;
+
+ if (old_backup == backup_p)
+ return 0;
+
+ /* if the backup link is already set, clear it */
+ if (old_backup)
+ old_backup->backup_redirected_cnt--;
+
+ if (backup_p)
+ backup_p->backup_redirected_cnt++;
+ rcu_assign_pointer(p->backup_port, backup_p);
+
+ return 0;
+}
+
+static void nbp_backup_clear(struct net_bridge_port *p)
+{
+ nbp_backup_change(p, NULL);
+ if (p->backup_redirected_cnt) {
+ struct net_bridge_port *cur_p;
+
+ list_for_each_entry(cur_p, &p->br->port_list, list) {
+ struct net_bridge_port *backup_p;
+
+ backup_p = rtnl_dereference(cur_p->backup_port);
+ if (backup_p == p)
+ nbp_backup_change(cur_p, NULL);
+ }
+ }
+
+ WARN_ON(rcu_access_pointer(p->backup_port) || p->backup_redirected_cnt);
+}
+
+static void nbp_update_port_count(struct net_bridge *br)
+{
+ struct net_bridge_port *p;
+ u32 cnt = 0;
+
+ list_for_each_entry(p, &br->port_list, list) {
+ if (br_auto_port(p))
+ cnt++;
+ }
+ if (br->auto_cnt != cnt) {
+ br->auto_cnt = cnt;
+ br_manage_promisc(br);
+ }
+}
+
+static void nbp_delete_promisc(struct net_bridge_port *p)
+{
+ /* If port is currently promiscuous, unset promiscuity.
+ * Otherwise, it is a static port so remove all addresses
+ * from it.
+ */
+ dev_set_allmulti(p->dev, -1);
+ if (br_promisc_port(p))
+ dev_set_promiscuity(p->dev, -1);
+ else
+ br_fdb_unsync_static(p->br, p);
}
static void release_nbp(struct kobject *kobj)
@@ -114,11 +263,19 @@ static void release_nbp(struct kobject *kobj)
kfree(p);
}
-static struct kobj_type brport_ktype = {
+static void brport_get_ownership(const struct kobject *kobj, kuid_t *uid, kgid_t *gid)
+{
+ struct net_bridge_port *p = kobj_to_brport(kobj);
+
+ net_ns_get_ownership(dev_net(p->dev), uid, gid);
+}
+
+static const struct kobj_type brport_ktype = {
#ifdef CONFIG_SYSFS
.sysfs_ops = &brport_sysfs_ops,
#endif
.release = release_nbp,
+ .get_ownership = brport_get_ownership,
};
static void destroy_nbp(struct net_bridge_port *p)
@@ -127,7 +284,7 @@ static void destroy_nbp(struct net_bridge_port *p)
p->br = NULL;
p->dev = NULL;
- dev_put(dev);
+ netdev_put(dev, &p->dev_tracker);
kobject_put(&p->kobj);
}
@@ -139,6 +296,31 @@ static void destroy_nbp_rcu(struct rcu_head *head)
destroy_nbp(p);
}
+static unsigned get_max_headroom(struct net_bridge *br)
+{
+ unsigned max_headroom = 0;
+ struct net_bridge_port *p;
+
+ list_for_each_entry(p, &br->port_list, list) {
+ unsigned dev_headroom = netdev_get_fwd_headroom(p->dev);
+
+ if (dev_headroom > max_headroom)
+ max_headroom = dev_headroom;
+ }
+
+ return max_headroom;
+}
+
+static void update_headroom(struct net_bridge *br, int new_hr)
+{
+ struct net_bridge_port *p;
+
+ list_for_each_entry(p, &br->port_list, list)
+ netdev_set_rx_headroom(p->dev, new_hr);
+
+ br->dev->needed_headroom = new_hr;
+}
+
/* Delete port(interface) from bridge is done in two steps.
* via RCU. First step, marks device as down. That deletes
* all the timers and stops new packets from flowing through.
@@ -153,82 +335,66 @@ static void del_nbp(struct net_bridge_port *p)
struct net_bridge *br = p->br;
struct net_device *dev = p->dev;
- sysfs_remove_link(&br->ifobj, dev->name);
-
- dev_set_promiscuity(dev, -1);
+ sysfs_remove_link(br->ifobj, p->dev->name);
- cancel_delayed_work(&p->carrier_check);
+ nbp_delete_promisc(p);
spin_lock_bh(&br->lock);
br_stp_disable_port(p);
spin_unlock_bh(&br->lock);
- br_fdb_delete_by_port(br, p, 1);
+ br_mrp_port_del(br, p);
+ br_cfm_port_del(br, p);
+
+ br_ifinfo_notify(RTM_DELLINK, NULL, p);
list_del_rcu(&p->list);
+ if (netdev_get_fwd_headroom(dev) == br->dev->needed_headroom)
+ update_headroom(br, get_max_headroom(br));
+ netdev_reset_rx_headroom(dev);
+
+ nbp_vlan_flush(p);
+ br_fdb_delete_by_port(br, p, 0, 1);
+ switchdev_deferred_process();
+ nbp_backup_clear(p);
+
+ nbp_update_port_count(br);
+
+ netdev_upper_dev_unlink(dev, br->dev);
+
+ dev->priv_flags &= ~IFF_BRIDGE_PORT;
- rcu_assign_pointer(dev->br_port, NULL);
+ netdev_rx_handler_unregister(dev);
+
+ br_multicast_del_port(p);
kobject_uevent(&p->kobj, KOBJ_REMOVE);
kobject_del(&p->kobj);
+ br_netpoll_disable(p);
+
call_rcu(&p->rcu, destroy_nbp_rcu);
}
-/* called with RTNL */
-static void del_br(struct net_bridge *br)
+/* Delete bridge device */
+void br_dev_delete(struct net_device *dev, struct list_head *head)
{
+ struct net_bridge *br = netdev_priv(dev);
struct net_bridge_port *p, *n;
list_for_each_entry_safe(p, n, &br->port_list, list) {
del_nbp(p);
}
- del_timer_sync(&br->gc_timer);
+ br_mst_uninit(br);
+ br_recalculate_neigh_suppress_enabled(br);
- br_sysfs_delbr(br->dev);
- unregister_netdevice(br->dev);
-}
+ br_fdb_delete_by_port(br, NULL, 0, 1);
-static struct net_device *new_bridge_dev(const char *name)
-{
- struct net_bridge *br;
- struct net_device *dev;
-
- dev = alloc_netdev(sizeof(struct net_bridge), name,
- br_dev_setup);
-
- if (!dev)
- return NULL;
-
- br = netdev_priv(dev);
- br->dev = dev;
-
- spin_lock_init(&br->lock);
- INIT_LIST_HEAD(&br->port_list);
- spin_lock_init(&br->hash_lock);
+ cancel_delayed_work_sync(&br->gc_work);
- br->bridge_id.prio[0] = 0x80;
- br->bridge_id.prio[1] = 0x00;
-
- memcpy(br->group_addr, br_group_address, ETH_ALEN);
-
- br->feature_mask = dev->features;
- br->stp_enabled = 0;
- br->designated_root = br->bridge_id;
- br->root_path_cost = 0;
- br->root_port = 0;
- br->bridge_max_age = br->max_age = 20 * HZ;
- br->bridge_hello_time = br->hello_time = 2 * HZ;
- br->bridge_forward_delay = br->forward_delay = 15 * HZ;
- br->topology_change = 0;
- br->topology_change_detected = 0;
- br->ageing_time = 300 * HZ;
- INIT_LIST_HEAD(&br->age_list);
-
- br_stp_timer_init(br);
-
- return dev;
+ br_sysfs_delbr(br->dev);
+ unregister_netdevice_queue(br->dev, head);
}
/* find an available port number */
@@ -238,28 +404,27 @@ static int find_portno(struct net_bridge *br)
struct net_bridge_port *p;
unsigned long *inuse;
- inuse = kcalloc(BITS_TO_LONGS(BR_MAX_PORTS), sizeof(unsigned long),
- GFP_KERNEL);
+ inuse = bitmap_zalloc(BR_MAX_PORTS, GFP_KERNEL);
if (!inuse)
return -ENOMEM;
- set_bit(0, inuse); /* zero is reserved */
- list_for_each_entry(p, &br->port_list, list) {
- set_bit(p->port_no, inuse);
- }
+ __set_bit(0, inuse); /* zero is reserved */
+ list_for_each_entry(p, &br->port_list, list)
+ __set_bit(p->port_no, inuse);
+
index = find_first_zero_bit(inuse, BR_MAX_PORTS);
- kfree(inuse);
+ bitmap_free(inuse);
return (index >= BR_MAX_PORTS) ? -EXFULL : index;
}
/* called with RTNL but without bridge lock */
-static struct net_bridge_port *new_nbp(struct net_bridge *br,
+static struct net_bridge_port *new_nbp(struct net_bridge *br,
struct net_device *dev)
{
- int index;
struct net_bridge_port *p;
-
+ int index, err;
+
index = find_portno(br);
if (index < 0)
return ERR_PTR(index);
@@ -269,66 +434,55 @@ static struct net_bridge_port *new_nbp(struct net_bridge *br,
return ERR_PTR(-ENOMEM);
p->br = br;
- dev_hold(dev);
+ netdev_hold(dev, &p->dev_tracker, GFP_KERNEL);
p->dev = dev;
p->path_cost = port_cost(dev);
- p->priority = 0x8000 >> BR_PORT_BITS;
+ p->priority = 0x8000 >> BR_PORT_BITS;
p->port_no = index;
+ p->flags = BR_LEARNING | BR_FLOOD | BR_MCAST_FLOOD | BR_BCAST_FLOOD;
br_init_port(p);
- p->state = BR_STATE_DISABLED;
- INIT_WORK(&p->carrier_check, port_carrier_check, dev);
+ br_set_state(p, BR_STATE_DISABLED);
br_stp_port_timer_init(p);
-
- kobject_init(&p->kobj);
- kobject_set_name(&p->kobj, SYSFS_BRIDGE_PORT_ATTR);
- p->kobj.ktype = &brport_ktype;
- p->kobj.parent = &(dev->class_dev.kobj);
- p->kobj.kset = NULL;
+ err = br_multicast_add_port(p);
+ if (err) {
+ netdev_put(dev, &p->dev_tracker);
+ kfree(p);
+ p = ERR_PTR(err);
+ }
return p;
}
-int br_add_bridge(const char *name)
+int br_add_bridge(struct net *net, const char *name)
{
struct net_device *dev;
- int ret;
+ int res;
- dev = new_bridge_dev(name);
- if (!dev)
- return -ENOMEM;
+ dev = alloc_netdev(sizeof(struct net_bridge), name, NET_NAME_UNKNOWN,
+ br_dev_setup);
- rtnl_lock();
- if (strchr(dev->name, '%')) {
- ret = dev_alloc_name(dev, dev->name);
- if (ret < 0) {
- free_netdev(dev);
- goto out;
- }
- }
+ if (!dev)
+ return -ENOMEM;
- ret = register_netdevice(dev);
- if (ret)
- goto out;
+ dev_net_set(dev, net);
+ dev->rtnl_link_ops = &br_link_ops;
- ret = br_sysfs_addbr(dev);
- if (ret)
- unregister_netdevice(dev);
- out:
- rtnl_unlock();
- return ret;
+ res = register_netdevice(dev);
+ if (res)
+ free_netdev(dev);
+ return res;
}
-int br_del_bridge(const char *name)
+int br_del_bridge(struct net *net, const char *name)
{
struct net_device *dev;
int ret = 0;
- rtnl_lock();
- dev = __dev_get_by_name(name);
- if (dev == NULL)
+ dev = __dev_get_by_name(net, name);
+ if (dev == NULL)
ret = -ENXIO; /* Could not find device */
- else if (!(dev->priv_flags & IFF_EBRIDGE)) {
+ else if (!netif_is_bridge_master(dev)) {
/* Attempt to delete non bridge device! */
ret = -EPERM;
}
@@ -336,154 +490,274 @@ int br_del_bridge(const char *name)
else if (dev->flags & IFF_UP) {
/* Not shutdown yet. */
ret = -EBUSY;
- }
+ }
- else
- del_br(netdev_priv(dev));
+ else
+ br_dev_delete(dev, NULL);
- rtnl_unlock();
return ret;
}
/* MTU of the bridge pseudo-device: ETH_DATA_LEN or the minimum of the ports */
-int br_min_mtu(const struct net_bridge *br)
+static int br_mtu_min(const struct net_bridge *br)
{
const struct net_bridge_port *p;
- int mtu = 0;
+ int ret_mtu = 0;
+
+ list_for_each_entry(p, &br->port_list, list)
+ if (!ret_mtu || ret_mtu > p->dev->mtu)
+ ret_mtu = p->dev->mtu;
+
+ return ret_mtu ? ret_mtu : ETH_DATA_LEN;
+}
+void br_mtu_auto_adjust(struct net_bridge *br)
+{
ASSERT_RTNL();
- if (list_empty(&br->port_list))
- mtu = ETH_DATA_LEN;
- else {
- list_for_each_entry(p, &br->port_list, list) {
- if (!mtu || p->dev->mtu < mtu)
- mtu = p->dev->mtu;
- }
- }
- return mtu;
+ /* if the bridge MTU was manually configured don't mess with it */
+ if (br_opt_get(br, BROPT_MTU_SET_BY_USER))
+ return;
+
+ /* change to the minimum MTU and clear the flag which was set by
+ * the bridge ndo_change_mtu callback
+ */
+ dev_set_mtu(br->dev, br_mtu_min(br));
+ br_opt_toggle(br, BROPT_MTU_SET_BY_USER, false);
}
/*
* Recomputes features using slave's features
*/
-void br_features_recompute(struct net_bridge *br)
+netdev_features_t br_features_recompute(struct net_bridge *br,
+ netdev_features_t features)
{
struct net_bridge_port *p;
- unsigned long features, checksum;
-
- checksum = br->feature_mask & NETIF_F_ALL_CSUM ? NETIF_F_NO_CSUM : 0;
- features = br->feature_mask & ~NETIF_F_ALL_CSUM;
-
- list_for_each_entry(p, &br->port_list, list) {
- unsigned long feature = p->dev->features;
+ netdev_features_t mask;
- if (checksum & NETIF_F_NO_CSUM && !(feature & NETIF_F_NO_CSUM))
- checksum ^= NETIF_F_NO_CSUM | NETIF_F_HW_CSUM;
- if (checksum & NETIF_F_HW_CSUM && !(feature & NETIF_F_HW_CSUM))
- checksum ^= NETIF_F_HW_CSUM | NETIF_F_IP_CSUM;
- if (!(feature & NETIF_F_IP_CSUM))
- checksum = 0;
+ if (list_empty(&br->port_list))
+ return features;
- if (feature & NETIF_F_GSO)
- feature |= NETIF_F_GSO_SOFTWARE;
- feature |= NETIF_F_GSO;
+ mask = features;
+ features &= ~NETIF_F_ONE_FOR_ALL;
- features &= feature;
+ list_for_each_entry(p, &br->port_list, list) {
+ features = netdev_increment_features(features,
+ p->dev->features, mask);
}
+ features = netdev_add_tso_features(features, mask);
- if (!(checksum & NETIF_F_ALL_CSUM))
- features &= ~NETIF_F_SG;
- if (!(features & NETIF_F_SG))
- features &= ~NETIF_F_GSO_MASK;
-
- br->dev->features = features | checksum | NETIF_F_LLTX |
- NETIF_F_GSO_ROBUST;
+ return features;
}
/* called with RTNL */
-int br_add_if(struct net_bridge *br, struct net_device *dev)
+int br_add_if(struct net_bridge *br, struct net_device *dev,
+ struct netlink_ext_ack *extack)
{
struct net_bridge_port *p;
int err = 0;
+ unsigned br_hr, dev_hr;
+ bool changed_addr, fdb_synced = false;
- if (dev->flags & IFF_LOOPBACK || dev->type != ARPHRD_ETHER)
+ /* Don't allow bridging non-ethernet like devices. */
+ if ((dev->flags & IFF_LOOPBACK) ||
+ dev->type != ARPHRD_ETHER || dev->addr_len != ETH_ALEN ||
+ !is_valid_ether_addr(dev->dev_addr))
return -EINVAL;
- if (dev->hard_start_xmit == br_dev_xmit)
+ /* No bridging of bridges */
+ if (dev->netdev_ops->ndo_start_xmit == br_dev_xmit) {
+ NL_SET_ERR_MSG(extack,
+ "Can not enslave a bridge to a bridge");
return -ELOOP;
+ }
- if (dev->br_port != NULL)
+ /* Device has master upper dev */
+ if (netdev_master_upper_dev_get(dev))
return -EBUSY;
+ /* No bridging devices that dislike that (e.g. wireless) */
+ if (dev->priv_flags & IFF_DONT_BRIDGE) {
+ NL_SET_ERR_MSG(extack,
+ "Device does not allow enslaving to a bridge");
+ return -EOPNOTSUPP;
+ }
+
p = new_nbp(br, dev);
if (IS_ERR(p))
return PTR_ERR(p);
- err = kobject_add(&p->kobj);
- if (err)
- goto err0;
+ call_netdevice_notifiers(NETDEV_JOIN, dev);
- err = br_fdb_insert(br, p, dev->dev_addr);
- if (err)
+ err = dev_set_allmulti(dev, 1);
+ if (err) {
+ br_multicast_del_port(p);
+ netdev_put(dev, &p->dev_tracker);
+ kfree(p); /* kobject not yet init'd, manually free */
goto err1;
+ }
+
+ err = kobject_init_and_add(&p->kobj, &brport_ktype, &(dev->dev.kobj),
+ SYSFS_BRIDGE_PORT_ATTR);
+ if (err)
+ goto err2;
err = br_sysfs_addif(p);
if (err)
goto err2;
- rcu_assign_pointer(dev->br_port, p);
- dev_set_promiscuity(dev, 1);
+ err = br_netpoll_enable(p);
+ if (err)
+ goto err3;
+
+ err = netdev_rx_handler_register(dev, br_get_rx_handler(dev), p);
+ if (err)
+ goto err4;
+
+ dev->priv_flags |= IFF_BRIDGE_PORT;
+
+ err = netdev_master_upper_dev_link(dev, br->dev, NULL, NULL, extack);
+ if (err)
+ goto err5;
+
+ dev_disable_lro(dev);
list_add_rcu(&p->list, &br->port_list);
+ nbp_update_port_count(br);
+ if (!br_promisc_port(p) && (p->dev->priv_flags & IFF_UNICAST_FLT)) {
+ /* When updating the port count we also update all ports'
+ * promiscuous mode.
+ * A port leaving promiscuous mode normally gets the bridge's
+ * fdb synced to the unicast filter (if supported), however,
+ * `br_port_clear_promisc` does not distinguish between
+ * non-promiscuous ports and *new* ports, so we need to
+ * sync explicitly here.
+ */
+ fdb_synced = br_fdb_sync_static(br, p) == 0;
+ if (!fdb_synced)
+ netdev_err(dev, "failed to sync bridge static fdb addresses to this port\n");
+ }
+
+ br_hr = br->dev->needed_headroom;
+ dev_hr = netdev_get_fwd_headroom(dev);
+ if (br_hr < dev_hr)
+ update_headroom(br, dev_hr);
+ else
+ netdev_set_rx_headroom(dev, br_hr);
+
+ if (br_fdb_add_local(br, p, dev->dev_addr, 0))
+ netdev_err(dev, "failed insert local address bridge forwarding table\n");
+
+ if (br->dev->addr_assign_type != NET_ADDR_SET) {
+ /* Ask for permission to use this MAC address now, even if we
+ * don't end up choosing it below.
+ */
+ err = netif_pre_changeaddr_notify(br->dev, dev->dev_addr,
+ extack);
+ if (err)
+ goto err6;
+ }
+
+ err = nbp_vlan_init(p, extack);
+ if (err) {
+ netdev_err(dev, "failed to initialize vlan filtering on this port\n");
+ goto err6;
+ }
+
spin_lock_bh(&br->lock);
- br_stp_recalculate_bridge_id(br);
- br_features_recompute(br);
- schedule_delayed_work(&p->carrier_check, BR_PORT_DEBOUNCE);
+ changed_addr = br_stp_recalculate_bridge_id(br);
+
+ if (netif_running(dev) && netif_oper_up(dev) &&
+ (br->dev->flags & IFF_UP))
+ br_stp_enable_port(p);
spin_unlock_bh(&br->lock);
- dev_set_mtu(br->dev, br_min_mtu(br));
+ br_ifinfo_notify(RTM_NEWLINK, NULL, p);
+
+ if (changed_addr)
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, br->dev);
+
+ br_mtu_auto_adjust(br);
+
+ netdev_compute_master_upper_features(br->dev, false);
+
kobject_uevent(&p->kobj, KOBJ_ADD);
return 0;
+
+err6:
+ if (fdb_synced)
+ br_fdb_unsync_static(br, p);
+ list_del_rcu(&p->list);
+ br_fdb_delete_by_port(br, p, 0, 1);
+ nbp_update_port_count(br);
+ netdev_upper_dev_unlink(dev, br->dev);
+err5:
+ dev->priv_flags &= ~IFF_BRIDGE_PORT;
+ netdev_rx_handler_unregister(dev);
+err4:
+ br_netpoll_disable(p);
+err3:
+ sysfs_remove_link(br->ifobj, p->dev->name);
err2:
- br_fdb_delete_by_port(br, p, 1);
-err1:
- kobject_del(&p->kobj);
-err0:
+ br_multicast_del_port(p);
+ netdev_put(dev, &p->dev_tracker);
kobject_put(&p->kobj);
+ dev_set_allmulti(dev, -1);
+err1:
return err;
}
/* called with RTNL */
int br_del_if(struct net_bridge *br, struct net_device *dev)
{
- struct net_bridge_port *p = dev->br_port;
-
- if (!p || p->br != br)
+ struct net_bridge_port *p;
+ bool changed_addr;
+
+ p = br_port_get_rtnl(dev);
+ if (!p || p->br != br)
return -EINVAL;
+ /* Since more than one interface can be attached to a bridge,
+ * there still maybe an alternate path for netconsole to use;
+ * therefore there is no reason for a NETDEV_RELEASE event.
+ */
del_nbp(p);
+ br_mtu_auto_adjust(br);
+
spin_lock_bh(&br->lock);
- br_stp_recalculate_bridge_id(br);
- br_features_recompute(br);
+ changed_addr = br_stp_recalculate_bridge_id(br);
spin_unlock_bh(&br->lock);
+ if (changed_addr)
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, br->dev);
+
+ netdev_compute_master_upper_features(br->dev, false);
+
return 0;
}
-void __exit br_cleanup_bridges(void)
+void br_port_flags_change(struct net_bridge_port *p, unsigned long mask)
{
- struct net_device *dev, *nxt;
+ struct net_bridge *br = p->br;
- rtnl_lock();
- for (dev = dev_base; dev; dev = nxt) {
- nxt = dev->next;
- if (dev->priv_flags & IFF_EBRIDGE)
- del_br(dev->priv);
- }
- rtnl_unlock();
+ if (mask & BR_AUTO_MASK)
+ nbp_update_port_count(br);
+
+ if (mask & (BR_NEIGH_SUPPRESS | BR_NEIGH_VLAN_SUPPRESS))
+ br_recalculate_neigh_suppress_enabled(br);
+}
+
+bool br_port_flag_is_set(const struct net_device *dev, unsigned long flag)
+{
+ struct net_bridge_port *p;
+
+ p = br_port_get_rtnl_rcu(dev);
+ if (!p)
+ return false;
+ return p->flags & flag;
}
+EXPORT_SYMBOL_GPL(br_port_flag_is_set);
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index bfa4d8c333f7..777fa869c1a1 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -1,157 +1,485 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Handle incoming frames
* Linux ethernet bridge
*
* Authors:
* Lennert Buytenhek <buytenh@gnu.org>
- *
- * $Id: br_input.c,v 1.10 2001/12/24 04:50:20 davem Exp $
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
+#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/netfilter_bridge.h>
+#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE
+#include <net/netfilter/nf_queue.h>
+#endif
+#include <linux/neighbour.h>
+#include <net/arp.h>
+#include <net/dsa.h>
+#include <linux/export.h>
+#include <linux/rculist.h>
#include "br_private.h"
+#include "br_private_tunnel.h"
-/* Bridge group multicast address 802.1d (pg 51). */
-const u8 br_group_address[ETH_ALEN] = { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x00 };
+static int
+br_netif_receive_skb(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ br_drop_fake_rtable(skb);
+ return netif_receive_skb(skb);
+}
-static void br_pass_frame_up(struct net_bridge *br, struct sk_buff *skb)
+static int br_pass_frame_up(struct sk_buff *skb, bool promisc)
{
- struct net_device *indev;
+ struct net_device *indev, *brdev = BR_INPUT_SKB_CB(skb)->brdev;
+ struct net_bridge *br = netdev_priv(brdev);
+ struct net_bridge_vlan_group *vg;
+
+ dev_sw_netstats_rx_add(brdev, skb->len);
+
+ vg = br_vlan_group_rcu(br);
- br->statistics.rx_packets++;
- br->statistics.rx_bytes += skb->len;
+ /* Reset the offload_fwd_mark because there could be a stacked
+ * bridge above, and it should not think this bridge it doing
+ * that bridge's work forwarding out its ports.
+ */
+ br_switchdev_frame_unmark(skb);
+
+ /* Bridge is just like any other port. Make sure the
+ * packet is allowed except in promisc mode when someone
+ * may be running packet capture.
+ */
+ if (!(brdev->flags & IFF_PROMISC) &&
+ !br_allowed_egress(vg, skb)) {
+ kfree_skb(skb);
+ return NET_RX_DROP;
+ }
indev = skb->dev;
- skb->dev = br->dev;
+ skb->dev = brdev;
+ skb = br_handle_vlan(br, NULL, vg, skb);
+ if (!skb)
+ return NET_RX_DROP;
+ /* update the multicast stats if the packet is IGMP/MLD */
+ br_multicast_count(br, NULL, skb, br_multicast_igmp_type(skb),
+ BR_MCAST_DIR_TX);
+
+ BR_INPUT_SKB_CB(skb)->promisc = promisc;
- NF_HOOK(PF_BRIDGE, NF_BR_LOCAL_IN, skb, indev, NULL,
- netif_receive_skb);
+ return NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN,
+ dev_net(indev), NULL, skb, indev, NULL,
+ br_netif_receive_skb);
}
-/* note: already called with rcu_read_lock (preempt_disabled) */
-int br_handle_frame_finish(struct sk_buff *skb)
+/* note: already called with rcu_read_lock */
+int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- const unsigned char *dest = eth_hdr(skb)->h_dest;
- struct net_bridge_port *p = rcu_dereference(skb->dev->br_port);
+ enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
+ struct net_bridge_port *p = br_port_get_rcu(skb->dev);
+ enum br_pkt_type pkt_type = BR_PKT_UNICAST;
+ struct net_bridge_fdb_entry *dst = NULL;
+ struct net_bridge_mcast_port *pmctx;
+ struct net_bridge_mdb_entry *mdst;
+ bool local_rcv, mcast_hit = false;
+ struct net_bridge_mcast *brmctx;
+ struct net_bridge_vlan *vlan;
struct net_bridge *br;
- struct net_bridge_fdb_entry *dst;
- int passedup = 0;
+ bool promisc;
+ u16 vid = 0;
+ u8 state;
- if (!p || p->state == BR_STATE_DISABLED)
+ if (!p)
goto drop;
- /* insert into forwarding database after filtering to avoid spoofing */
br = p->br;
- br_fdb_update(br, p, eth_hdr(skb)->h_source);
- if (p->state == BR_STATE_LEARNING)
- goto drop;
+ if (br_mst_is_enabled(p)) {
+ state = BR_STATE_FORWARDING;
+ } else {
+ if (p->state == BR_STATE_DISABLED) {
+ reason = SKB_DROP_REASON_BRIDGE_INGRESS_STP_STATE;
+ goto drop;
+ }
+
+ state = p->state;
+ }
+
+ brmctx = &p->br->multicast_ctx;
+ pmctx = &p->multicast_ctx;
+ if (!br_allowed_ingress(p->br, nbp_vlan_group_rcu(p), skb, &vid,
+ &state, &vlan))
+ goto out;
- if (br->dev->flags & IFF_PROMISC) {
- struct sk_buff *skb2;
+ if (p->flags & BR_PORT_LOCKED) {
+ struct net_bridge_fdb_entry *fdb_src =
+ br_fdb_find_rcu(br, eth_hdr(skb)->h_source, vid);
- skb2 = skb_clone(skb, GFP_ATOMIC);
- if (skb2 != NULL) {
- passedup = 1;
- br_pass_frame_up(br, skb2);
+ if (!fdb_src) {
+ /* FDB miss. Create locked FDB entry if MAB is enabled
+ * and drop the packet.
+ */
+ if (p->flags & BR_PORT_MAB)
+ br_fdb_update(br, p, eth_hdr(skb)->h_source,
+ vid, BIT(BR_FDB_LOCKED));
+ goto drop;
+ } else if (READ_ONCE(fdb_src->dst) != p ||
+ test_bit(BR_FDB_LOCAL, &fdb_src->flags)) {
+ /* FDB mismatch. Drop the packet without roaming. */
+ goto drop;
+ } else if (test_bit(BR_FDB_LOCKED, &fdb_src->flags)) {
+ /* FDB match, but entry is locked. Refresh it and drop
+ * the packet.
+ */
+ br_fdb_update(br, p, eth_hdr(skb)->h_source, vid,
+ BIT(BR_FDB_LOCKED));
+ goto drop;
}
}
- if (is_multicast_ether_addr(dest)) {
- br->statistics.multicast++;
- br_flood_forward(br, skb, !passedup);
- if (!passedup)
- br_pass_frame_up(br, skb);
- goto out;
+ nbp_switchdev_frame_mark(p, skb);
+
+ /* insert into forwarding database after filtering to avoid spoofing */
+ if (p->flags & BR_LEARNING)
+ br_fdb_update(br, p, eth_hdr(skb)->h_source, vid, 0);
+
+ promisc = !!(br->dev->flags & IFF_PROMISC);
+ local_rcv = promisc;
+
+ if (is_multicast_ether_addr(eth_hdr(skb)->h_dest)) {
+ /* by definition the broadcast is also a multicast address */
+ if (is_broadcast_ether_addr(eth_hdr(skb)->h_dest)) {
+ pkt_type = BR_PKT_BROADCAST;
+ local_rcv = true;
+ } else {
+ pkt_type = BR_PKT_MULTICAST;
+ if (br_multicast_rcv(&brmctx, &pmctx, vlan, skb, vid))
+ goto drop;
+ }
}
- dst = __br_fdb_get(br, dest);
- if (dst != NULL && dst->is_local) {
- if (!passedup)
- br_pass_frame_up(br, skb);
- else
- kfree_skb(skb);
- goto out;
+ if (state == BR_STATE_LEARNING) {
+ reason = SKB_DROP_REASON_BRIDGE_INGRESS_STP_STATE;
+ goto drop;
}
- if (dst != NULL) {
- br_forward(dst->dst, skb);
- goto out;
+ BR_INPUT_SKB_CB(skb)->brdev = br->dev;
+ BR_INPUT_SKB_CB(skb)->src_port_isolated = !!(p->flags & BR_ISOLATED);
+
+ if (IS_ENABLED(CONFIG_INET) &&
+ (skb->protocol == htons(ETH_P_ARP) ||
+ skb->protocol == htons(ETH_P_RARP))) {
+ br_do_proxy_suppress_arp(skb, br, vid, p);
+ } else if (IS_ENABLED(CONFIG_IPV6) &&
+ skb->protocol == htons(ETH_P_IPV6) &&
+ br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED) &&
+ pskb_may_pull(skb, sizeof(struct ipv6hdr) +
+ sizeof(struct nd_msg)) &&
+ ipv6_hdr(skb)->nexthdr == IPPROTO_ICMPV6) {
+ struct nd_msg *msg, _msg;
+
+ msg = br_is_nd_neigh_msg(skb, &_msg);
+ if (msg)
+ br_do_suppress_nd(skb, br, vid, p, msg);
+ }
+
+ switch (pkt_type) {
+ case BR_PKT_MULTICAST:
+ mdst = br_mdb_entry_skb_get(brmctx, skb, vid);
+ if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) &&
+ br_multicast_querier_exists(brmctx, eth_hdr(skb), mdst)) {
+ if ((mdst && mdst->host_joined) ||
+ br_multicast_is_router(brmctx, skb) ||
+ br->dev->flags & IFF_ALLMULTI) {
+ local_rcv = true;
+ DEV_STATS_INC(br->dev, multicast);
+ }
+ mcast_hit = true;
+ } else {
+ local_rcv = true;
+ DEV_STATS_INC(br->dev, multicast);
+ }
+ break;
+ case BR_PKT_UNICAST:
+ dst = br_fdb_find_rcu(br, eth_hdr(skb)->h_dest, vid);
+ if (unlikely(!dst && vid &&
+ br_opt_get(br, BROPT_FDB_LOCAL_VLAN_0))) {
+ dst = br_fdb_find_rcu(br, eth_hdr(skb)->h_dest, 0);
+ if (dst &&
+ (!test_bit(BR_FDB_LOCAL, &dst->flags) ||
+ test_bit(BR_FDB_ADDED_BY_USER, &dst->flags)))
+ dst = NULL;
+ }
+ break;
+ default:
+ break;
+ }
+
+ if (dst) {
+ unsigned long now = jiffies;
+
+ if (test_bit(BR_FDB_LOCAL, &dst->flags))
+ return br_pass_frame_up(skb, false);
+
+ if (now != dst->used)
+ dst->used = now;
+ br_forward(dst->dst, skb, local_rcv, false);
+ } else {
+ if (!mcast_hit)
+ br_flood(br, skb, pkt_type, local_rcv, false, vid);
+ else
+ br_multicast_flood(mdst, skb, brmctx, local_rcv, false);
}
- br_flood_forward(br, skb, 0);
+ if (local_rcv)
+ return br_pass_frame_up(skb, promisc);
out:
return 0;
drop:
- kfree_skb(skb);
+ kfree_skb_reason(skb, reason);
goto out;
}
+EXPORT_SYMBOL_GPL(br_handle_frame_finish);
-/* note: already called with rcu_read_lock (preempt_disabled) */
-static int br_handle_local_finish(struct sk_buff *skb)
+static void __br_handle_local_finish(struct sk_buff *skb)
{
- struct net_bridge_port *p = rcu_dereference(skb->dev->br_port);
+ struct net_bridge_port *p = br_port_get_rcu(skb->dev);
+ u16 vid = 0;
- if (p && p->state != BR_STATE_DISABLED)
- br_fdb_update(p->br, p, eth_hdr(skb)->h_source);
+ /* check if vlan is allowed, to avoid spoofing */
+ if ((p->flags & BR_LEARNING) &&
+ nbp_state_should_learn(p) &&
+ !br_opt_get(p->br, BROPT_NO_LL_LEARN) &&
+ br_should_learn(p, skb, &vid))
+ br_fdb_update(p->br, p, eth_hdr(skb)->h_source, vid, 0);
+}
+
+/* note: already called with rcu_read_lock */
+static int br_handle_local_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ __br_handle_local_finish(skb);
- return 0; /* process further */
+ /* return 1 to signal the okfn() was called so it's ok to use the skb */
+ return 1;
}
-/* Does address match the link local multicast address.
- * 01:80:c2:00:00:0X
+static int nf_hook_bridge_pre(struct sk_buff *skb, struct sk_buff **pskb)
+{
+#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE
+ struct nf_hook_entries *e = NULL;
+ struct nf_hook_state state;
+ unsigned int verdict, i;
+ struct net *net;
+ int ret;
+
+ net = dev_net(skb->dev);
+#ifdef HAVE_JUMP_LABEL
+ if (!static_key_false(&nf_hooks_needed[NFPROTO_BRIDGE][NF_BR_PRE_ROUTING]))
+ goto frame_finish;
+#endif
+
+ e = rcu_dereference(net->nf.hooks_bridge[NF_BR_PRE_ROUTING]);
+ if (!e)
+ goto frame_finish;
+
+ nf_hook_state_init(&state, NF_BR_PRE_ROUTING,
+ NFPROTO_BRIDGE, skb->dev, NULL, NULL,
+ net, br_handle_frame_finish);
+
+ for (i = 0; i < e->num_hook_entries; i++) {
+ verdict = nf_hook_entry_hookfn(&e->hooks[i], skb, &state);
+ switch (verdict & NF_VERDICT_MASK) {
+ case NF_ACCEPT:
+ if (BR_INPUT_SKB_CB(skb)->br_netfilter_broute) {
+ *pskb = skb;
+ return RX_HANDLER_PASS;
+ }
+ break;
+ case NF_DROP:
+ kfree_skb(skb);
+ return RX_HANDLER_CONSUMED;
+ case NF_QUEUE:
+ ret = nf_queue(skb, &state, i, verdict);
+ if (ret == 1)
+ continue;
+ return RX_HANDLER_CONSUMED;
+ default: /* STOLEN */
+ return RX_HANDLER_CONSUMED;
+ }
+ }
+frame_finish:
+ net = dev_net(skb->dev);
+ br_handle_frame_finish(net, NULL, skb);
+#else
+ br_handle_frame_finish(dev_net(skb->dev), NULL, skb);
+#endif
+ return RX_HANDLER_CONSUMED;
+}
+
+/* Return 0 if the frame was not processed otherwise 1
+ * note: already called with rcu_read_lock
*/
-static inline int is_link_local(const unsigned char *dest)
+static int br_process_frame_type(struct net_bridge_port *p,
+ struct sk_buff *skb)
{
- return memcmp(dest, br_group_address, 5) == 0 && (dest[5] & 0xf0) == 0;
+ struct br_frame_type *tmp;
+
+ hlist_for_each_entry_rcu(tmp, &p->br->frame_type_list, list)
+ if (unlikely(tmp->type == skb->protocol))
+ return tmp->frame_handler(p, skb);
+
+ return 0;
}
/*
- * Called via br_handle_frame_hook.
- * Return 0 if *pskb should be processed furthur
- * 1 if *pskb is handled
- * note: already called with rcu_read_lock (preempt_disabled)
+ * Return NULL if skb is handled
+ * note: already called with rcu_read_lock
*/
-int br_handle_frame(struct net_bridge_port *p, struct sk_buff **pskb)
+static rx_handler_result_t br_handle_frame(struct sk_buff **pskb)
{
+ enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
+ struct net_bridge_port *p;
struct sk_buff *skb = *pskb;
const unsigned char *dest = eth_hdr(skb)->h_dest;
- if (!is_valid_ether_addr(eth_hdr(skb)->h_source))
- goto err;
+ if (unlikely(skb->pkt_type == PACKET_LOOPBACK))
+ return RX_HANDLER_PASS;
- if (unlikely(is_link_local(dest))) {
- skb->pkt_type = PACKET_HOST;
- return NF_HOOK(PF_BRIDGE, NF_BR_LOCAL_IN, skb, skb->dev,
- NULL, br_handle_local_finish) != 0;
+ if (!is_valid_ether_addr(eth_hdr(skb)->h_source)) {
+ reason = SKB_DROP_REASON_MAC_INVALID_SOURCE;
+ goto drop;
}
- if (p->state == BR_STATE_FORWARDING || p->state == BR_STATE_LEARNING) {
- if (br_should_route_hook) {
- if (br_should_route_hook(pskb))
- return 0;
- skb = *pskb;
- dest = eth_hdr(skb)->h_dest;
+ skb = skb_share_check(skb, GFP_ATOMIC);
+ if (!skb)
+ return RX_HANDLER_CONSUMED;
+
+ memset(skb->cb, 0, sizeof(struct br_input_skb_cb));
+ br_tc_skb_miss_set(skb, false);
+
+ p = br_port_get_rcu(skb->dev);
+ if (p->flags & BR_VLAN_TUNNEL)
+ br_handle_ingress_vlan_tunnel(skb, p, nbp_vlan_group_rcu(p));
+
+ if (unlikely(is_link_local_ether_addr(dest))) {
+ u16 fwd_mask = p->br->group_fwd_mask_required;
+
+ /*
+ * See IEEE 802.1D Table 7-10 Reserved addresses
+ *
+ * Assignment Value
+ * Bridge Group Address 01-80-C2-00-00-00
+ * (MAC Control) 802.3 01-80-C2-00-00-01
+ * (Link Aggregation) 802.3 01-80-C2-00-00-02
+ * 802.1X PAE address 01-80-C2-00-00-03
+ *
+ * 802.1AB LLDP 01-80-C2-00-00-0E
+ *
+ * Others reserved for future standardization
+ */
+ fwd_mask |= p->group_fwd_mask;
+ switch (dest[5]) {
+ case 0x00: /* Bridge Group Address */
+ /* If STP is turned off,
+ then must forward to keep loop detection */
+ if (p->br->stp_enabled == BR_NO_STP ||
+ fwd_mask & (1u << dest[5]))
+ goto forward;
+ *pskb = skb;
+ __br_handle_local_finish(skb);
+ return RX_HANDLER_PASS;
+
+ case 0x01: /* IEEE MAC (Pause) */
+ reason = SKB_DROP_REASON_MAC_IEEE_MAC_CONTROL;
+ goto drop;
+
+ case 0x0E: /* 802.1AB LLDP */
+ fwd_mask |= p->br->group_fwd_mask;
+ if (fwd_mask & (1u << dest[5]))
+ goto forward;
+ *pskb = skb;
+ __br_handle_local_finish(skb);
+ return RX_HANDLER_PASS;
+
+ default:
+ /* Allow selective forwarding for most other protocols */
+ fwd_mask |= p->br->group_fwd_mask;
+ if (fwd_mask & (1u << dest[5]))
+ goto forward;
}
- if (!compare_ether_addr(p->br->dev->dev_addr, dest))
+ BR_INPUT_SKB_CB(skb)->promisc = false;
+
+ /* The else clause should be hit when nf_hook():
+ * - returns < 0 (drop/error)
+ * - returns = 0 (stolen/nf_queue)
+ * Thus return 1 from the okfn() to signal the skb is ok to pass
+ */
+ if (NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN,
+ dev_net(skb->dev), NULL, skb, skb->dev, NULL,
+ br_handle_local_finish) == 1) {
+ return RX_HANDLER_PASS;
+ } else {
+ return RX_HANDLER_CONSUMED;
+ }
+ }
+
+ if (unlikely(br_process_frame_type(p, skb)))
+ return RX_HANDLER_PASS;
+
+forward:
+ if (br_mst_is_enabled(p))
+ goto defer_stp_filtering;
+
+ switch (p->state) {
+ case BR_STATE_FORWARDING:
+ case BR_STATE_LEARNING:
+defer_stp_filtering:
+ if (ether_addr_equal(p->br->dev->dev_addr, dest))
skb->pkt_type = PACKET_HOST;
- NF_HOOK(PF_BRIDGE, NF_BR_PRE_ROUTING, skb, skb->dev, NULL,
- br_handle_frame_finish);
- return 1;
+ return nf_hook_bridge_pre(skb, pskb);
+ default:
+ reason = SKB_DROP_REASON_BRIDGE_INGRESS_STP_STATE;
+drop:
+ kfree_skb_reason(skb, reason);
}
+ return RX_HANDLER_CONSUMED;
+}
-err:
- kfree_skb(skb);
- return 1;
+/* This function has no purpose other than to appease the br_port_get_rcu/rtnl
+ * helpers which identify bridged ports according to the rx_handler installed
+ * on them (so there _needs_ to be a bridge rx_handler even if we don't need it
+ * to do anything useful). This bridge won't support traffic to/from the stack,
+ * but only hardware bridging. So return RX_HANDLER_PASS so we don't steal
+ * frames from the ETH_P_XDSA packet_type handler.
+ */
+static rx_handler_result_t br_handle_frame_dummy(struct sk_buff **pskb)
+{
+ return RX_HANDLER_PASS;
+}
+
+rx_handler_func_t *br_get_rx_handler(const struct net_device *dev)
+{
+ if (netdev_uses_dsa(dev))
+ return br_handle_frame_dummy;
+
+ return br_handle_frame;
+}
+
+void br_add_frame(struct net_bridge *br, struct br_frame_type *ft)
+{
+ hlist_add_head_rcu(&ft->list, &br->frame_type_list);
+}
+
+void br_del_frame(struct net_bridge *br, struct br_frame_type *ft)
+{
+ struct br_frame_type *tmp;
+
+ hlist_for_each_entry(tmp, &br->frame_type_list, list)
+ if (ft == tmp) {
+ hlist_del_rcu(&ft->list);
+ return;
+ }
}
diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c
index 4c61a7e0a86e..6bc0a11f2ed3 100644
--- a/net/bridge/br_ioctl.c
+++ b/net/bridge/br_ioctl.c
@@ -1,36 +1,36 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Ioctl handler
* Linux ethernet bridge
*
* Authors:
* Lennert Buytenhek <buytenh@gnu.org>
- *
- * $Id: br_ioctl.c,v 1.4 2000/11/08 05:16:40 davem Exp $
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/capability.h>
+#include <linux/compat.h>
#include <linux/kernel.h>
#include <linux/if_bridge.h>
#include <linux/netdevice.h>
+#include <linux/slab.h>
#include <linux/times.h>
-#include <asm/uaccess.h>
+#include <net/net_namespace.h>
+#include <linux/uaccess.h>
#include "br_private.h"
-/* called with RTNL */
-static int get_bridge_ifindices(int *indices, int num)
+static int get_bridge_ifindices(struct net *net, int *indices, int num)
{
struct net_device *dev;
int i = 0;
- for (dev = dev_base; dev && i < num; dev = dev->next) {
- if (dev->priv_flags & IFF_EBRIDGE)
+ rcu_read_lock();
+ for_each_netdev_rcu(net, dev) {
+ if (i >= num)
+ break;
+ if (netif_is_bridge_master(dev))
indices[i++] = dev->ifindex;
}
+ rcu_read_unlock();
return i;
}
@@ -53,7 +53,7 @@ static void get_port_ifindices(struct net_bridge *br, int *ifindices, int num)
* (limited to a page for sanity)
* offset -- number of records to skip
*/
-static int get_fdb_entries(struct net_bridge *br, void __user *userbuf,
+static int get_fdb_entries(struct net_bridge *br, void __user *userbuf,
unsigned long maxnum, unsigned long offset)
{
int num;
@@ -69,10 +69,11 @@ static int get_fdb_entries(struct net_bridge *br, void __user *userbuf,
buf = kmalloc(size, GFP_USER);
if (!buf)
return -ENOMEM;
-
+
num = br_fdb_fillbuf(br, buf, maxnum, offset);
if (num > 0) {
- if (copy_to_user(userbuf, buf, num*sizeof(struct __fdb_entry)))
+ if (copy_to_user(userbuf, buf,
+ array_size(num, sizeof(struct __fdb_entry))))
num = -EFAULT;
}
kfree(buf);
@@ -80,39 +81,78 @@ static int get_fdb_entries(struct net_bridge *br, void __user *userbuf,
return num;
}
+/* called with RTNL */
static int add_del_if(struct net_bridge *br, int ifindex, int isadd)
{
+ struct net *net = dev_net(br->dev);
struct net_device *dev;
int ret;
- if (!capable(CAP_NET_ADMIN))
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
return -EPERM;
- dev = dev_get_by_index(ifindex);
+ dev = __dev_get_by_index(net, ifindex);
if (dev == NULL)
return -EINVAL;
-
+
if (isadd)
- ret = br_add_if(br, dev);
+ ret = br_add_if(br, dev, NULL);
else
ret = br_del_if(br, dev);
- dev_put(dev);
return ret;
}
+#define BR_UARGS_MAX 4
+static int br_dev_read_uargs(unsigned long *args, size_t nr_args,
+ void __user **argp, void __user *data)
+{
+ int ret;
+
+ if (nr_args < 2 || nr_args > BR_UARGS_MAX)
+ return -EINVAL;
+
+ if (in_compat_syscall()) {
+ unsigned int cargs[BR_UARGS_MAX];
+ int i;
+
+ ret = copy_from_user(cargs, data, nr_args * sizeof(*cargs));
+ if (ret)
+ goto fault;
+
+ for (i = 0; i < nr_args; ++i)
+ args[i] = cargs[i];
+
+ *argp = compat_ptr(args[1]);
+ } else {
+ ret = copy_from_user(args, data, nr_args * sizeof(*args));
+ if (ret)
+ goto fault;
+ *argp = (void __user *)args[1];
+ }
+
+ return 0;
+fault:
+ return -EFAULT;
+}
+
/*
* Legacy ioctl's through SIOCDEVPRIVATE
- * This interface is deprecated because it was too difficult to
- * to do the translation for 32/64bit ioctl compatability.
+ * This interface is deprecated because it was too difficult
+ * to do the translation for 32/64bit ioctl compatibility.
*/
-static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
+int br_dev_siocdevprivate(struct net_device *dev, struct ifreq *rq,
+ void __user *data, int cmd)
{
struct net_bridge *br = netdev_priv(dev);
+ struct net_bridge_port *p = NULL;
unsigned long args[4];
-
- if (copy_from_user(args, rq->ifr_data, sizeof(args)))
- return -EFAULT;
+ void __user *argp;
+ int ret;
+
+ ret = br_dev_read_uargs(args, ARRAY_SIZE(args), &argp, data);
+ if (ret)
+ return ret;
switch (args[0]) {
case BRCTL_ADD_IF:
@@ -137,13 +177,14 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
b.topology_change = br->topology_change;
b.topology_change_detected = br->topology_change_detected;
b.root_port = br->root_port;
- b.stp_enabled = br->stp_enabled;
+
+ b.stp_enabled = (br->stp_enabled != BR_NO_STP);
b.ageing_time = jiffies_to_clock_t(br->ageing_time);
b.hello_timer_value = br_timer_value(&br->hello_timer);
b.tcn_timer_value = br_timer_value(&br->tcn_timer);
b.topology_change_timer_value = br_timer_value(&br->topology_change_timer);
- b.gc_timer_value = br_timer_value(&br->gc_timer);
- rcu_read_unlock();
+ b.gc_timer_value = br_timer_value(&br->gc_work.timer);
+ rcu_read_unlock();
if (copy_to_user((void __user *)args[1], &b, sizeof(b)))
return -EFAULT;
@@ -168,51 +209,39 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
return -ENOMEM;
get_port_ifindices(br, indices, num);
- if (copy_to_user((void __user *)args[1], indices, num*sizeof(int)))
+ if (copy_to_user(argp, indices, array_size(num, sizeof(int))))
num = -EFAULT;
kfree(indices);
return num;
}
case BRCTL_SET_BRIDGE_FORWARD_DELAY:
- if (!capable(CAP_NET_ADMIN))
+ if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN))
return -EPERM;
- spin_lock_bh(&br->lock);
- br->bridge_forward_delay = clock_t_to_jiffies(args[1]);
- if (br_is_root_bridge(br))
- br->forward_delay = br->bridge_forward_delay;
- spin_unlock_bh(&br->lock);
- return 0;
+ ret = br_set_forward_delay(br, args[1]);
+ break;
case BRCTL_SET_BRIDGE_HELLO_TIME:
- if (!capable(CAP_NET_ADMIN))
+ if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN))
return -EPERM;
- spin_lock_bh(&br->lock);
- br->bridge_hello_time = clock_t_to_jiffies(args[1]);
- if (br_is_root_bridge(br))
- br->hello_time = br->bridge_hello_time;
- spin_unlock_bh(&br->lock);
- return 0;
+ ret = br_set_hello_time(br, args[1]);
+ break;
case BRCTL_SET_BRIDGE_MAX_AGE:
- if (!capable(CAP_NET_ADMIN))
+ if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN))
return -EPERM;
- spin_lock_bh(&br->lock);
- br->bridge_max_age = clock_t_to_jiffies(args[1]);
- if (br_is_root_bridge(br))
- br->max_age = br->bridge_max_age;
- spin_unlock_bh(&br->lock);
- return 0;
+ ret = br_set_max_age(br, args[1]);
+ break;
case BRCTL_SET_AGEING_TIME:
- if (!capable(CAP_NET_ADMIN))
+ if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN))
return -EPERM;
- br->ageing_time = clock_t_to_jiffies(args[1]);
- return 0;
+ ret = br_set_ageing_time(br, args[1]);
+ break;
case BRCTL_GET_PORT_INFO:
{
@@ -241,79 +270,81 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
rcu_read_unlock();
- if (copy_to_user((void __user *)args[1], &p, sizeof(p)))
+ if (copy_to_user(argp, &p, sizeof(p)))
return -EFAULT;
return 0;
}
case BRCTL_SET_BRIDGE_STP_STATE:
- if (!capable(CAP_NET_ADMIN))
+ if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN))
return -EPERM;
- br->stp_enabled = args[1]?1:0;
- return 0;
+ ret = br_stp_set_enabled(br, args[1], NULL);
+ break;
case BRCTL_SET_BRIDGE_PRIORITY:
- if (!capable(CAP_NET_ADMIN))
+ if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN))
return -EPERM;
- spin_lock_bh(&br->lock);
br_stp_set_bridge_priority(br, args[1]);
- spin_unlock_bh(&br->lock);
- return 0;
+ ret = 0;
+ break;
case BRCTL_SET_PORT_PRIORITY:
{
- struct net_bridge_port *p;
- int ret = 0;
-
- if (!capable(CAP_NET_ADMIN))
+ if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN))
return -EPERM;
- if (args[2] >= (1<<(16-BR_PORT_BITS)))
- return -ERANGE;
-
spin_lock_bh(&br->lock);
- if ((p = br_get_port(br, args[1])) == NULL)
+ if ((p = br_get_port(br, args[1])) == NULL)
ret = -EINVAL;
else
- br_stp_set_port_priority(p, args[2]);
+ ret = br_stp_set_port_priority(p, args[2]);
spin_unlock_bh(&br->lock);
- return ret;
+ break;
}
case BRCTL_SET_PATH_COST:
{
- struct net_bridge_port *p;
- int ret = 0;
-
- if (!capable(CAP_NET_ADMIN))
+ if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN))
return -EPERM;
spin_lock_bh(&br->lock);
if ((p = br_get_port(br, args[1])) == NULL)
ret = -EINVAL;
else
- br_stp_set_path_cost(p, args[2]);
+ ret = br_stp_set_path_cost(p, args[2]);
spin_unlock_bh(&br->lock);
- return ret;
+ break;
}
case BRCTL_GET_FDB_ENTRIES:
- return get_fdb_entries(br, (void __user *)args[1],
- args[2], args[3]);
+ return get_fdb_entries(br, argp, args[2], args[3]);
+
+ default:
+ ret = -EOPNOTSUPP;
}
- return -EOPNOTSUPP;
+ if (!ret) {
+ if (p)
+ br_ifinfo_notify(RTM_NEWLINK, NULL, p);
+ else
+ netdev_state_change(br->dev);
+ }
+
+ return ret;
}
-static int old_deviceless(void __user *uarg)
+static int old_deviceless(struct net *net, void __user *data)
{
unsigned long args[3];
+ void __user *argp;
+ int ret;
- if (copy_from_user(args, uarg, sizeof(args)))
- return -EFAULT;
+ ret = br_dev_read_uargs(args, ARRAY_SIZE(args), &argp, data);
+ if (ret)
+ return ret;
switch (args[0]) {
case BRCTL_GET_VERSION:
@@ -330,9 +361,10 @@ static int old_deviceless(void __user *uarg)
if (indices == NULL)
return -ENOMEM;
- args[2] = get_bridge_ifindices(indices, args[2]);
+ args[2] = get_bridge_ifindices(net, indices, args[2]);
- ret = copy_to_user((void __user *)args[1], indices, args[2]*sizeof(int))
+ ret = copy_to_user(argp, indices,
+ array_size(args[2], sizeof(int)))
? -EFAULT : args[2];
kfree(indices);
@@ -344,66 +376,95 @@ static int old_deviceless(void __user *uarg)
{
char buf[IFNAMSIZ];
- if (!capable(CAP_NET_ADMIN))
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
return -EPERM;
- if (copy_from_user(buf, (void __user *)args[1], IFNAMSIZ))
+ if (copy_from_user(buf, argp, IFNAMSIZ))
return -EFAULT;
buf[IFNAMSIZ-1] = 0;
if (args[0] == BRCTL_ADD_BRIDGE)
- return br_add_bridge(buf);
+ return br_add_bridge(net, buf);
- return br_del_bridge(buf);
+ return br_del_bridge(net, buf);
}
}
return -EOPNOTSUPP;
}
-int br_ioctl_deviceless_stub(unsigned int cmd, void __user *uarg)
+int br_ioctl_stub(struct net *net, unsigned int cmd, void __user *uarg)
{
+ int ret = -EOPNOTSUPP;
+ struct ifreq ifr;
+
+ if (cmd == SIOCBRADDIF || cmd == SIOCBRDELIF) {
+ void __user *data;
+ char *colon;
+
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (get_user_ifreq(&ifr, &data, uarg))
+ return -EFAULT;
+
+ ifr.ifr_name[IFNAMSIZ - 1] = 0;
+ colon = strchr(ifr.ifr_name, ':');
+ if (colon)
+ *colon = 0;
+ }
+
+ rtnl_lock();
+
switch (cmd) {
case SIOCGIFBR:
case SIOCSIFBR:
- return old_deviceless(uarg);
-
+ ret = old_deviceless(net, uarg);
+ break;
case SIOCBRADDBR:
case SIOCBRDELBR:
{
char buf[IFNAMSIZ];
- if (!capable(CAP_NET_ADMIN))
- return -EPERM;
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) {
+ ret = -EPERM;
+ break;
+ }
- if (copy_from_user(buf, uarg, IFNAMSIZ))
- return -EFAULT;
+ if (copy_from_user(buf, uarg, IFNAMSIZ)) {
+ ret = -EFAULT;
+ break;
+ }
buf[IFNAMSIZ-1] = 0;
if (cmd == SIOCBRADDBR)
- return br_add_bridge(buf);
-
- return br_del_bridge(buf);
- }
+ ret = br_add_bridge(net, buf);
+ else
+ ret = br_del_bridge(net, buf);
}
- return -EOPNOTSUPP;
-}
-
-int br_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
-{
- struct net_bridge *br = netdev_priv(dev);
-
- switch(cmd) {
- case SIOCDEVPRIVATE:
- return old_dev_ioctl(dev, rq, cmd);
-
+ break;
case SIOCBRADDIF:
case SIOCBRDELIF:
- return add_del_if(br, rq->ifr_ifindex, cmd == SIOCBRADDIF);
+ {
+ struct net_device *dev;
+
+ dev = __dev_get_by_name(net, ifr.ifr_name);
+ if (!dev || !netif_device_present(dev)) {
+ ret = -ENODEV;
+ break;
+ }
+ if (!netif_is_bridge_master(dev)) {
+ ret = -EOPNOTSUPP;
+ break;
+ }
+ ret = add_del_if(netdev_priv(dev), ifr.ifr_ifindex, cmd == SIOCBRADDIF);
+ }
+ break;
}
- pr_debug("Bridge does not support ioctl 0x%x\n", cmd);
- return -EOPNOTSUPP;
+ rtnl_unlock();
+
+ return ret;
}
diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
new file mode 100644
index 000000000000..400eb872b403
--- /dev/null
+++ b/net/bridge/br_mdb.c
@@ -0,0 +1,1722 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/err.h>
+#include <linux/igmp.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/rculist.h>
+#include <linux/skbuff.h>
+#include <linux/if_ether.h>
+#include <net/ip.h>
+#include <net/netlink.h>
+#include <net/switchdev.h>
+#if IS_ENABLED(CONFIG_IPV6)
+#include <net/ipv6.h>
+#include <net/addrconf.h>
+#endif
+
+#include "br_private.h"
+
+static bool
+br_ip4_rports_get_timer(struct net_bridge_mcast_port *pmctx,
+ unsigned long *timer)
+{
+ *timer = br_timer_value(&pmctx->ip4_mc_router_timer);
+ return !hlist_unhashed(&pmctx->ip4_rlist);
+}
+
+static bool
+br_ip6_rports_get_timer(struct net_bridge_mcast_port *pmctx,
+ unsigned long *timer)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ *timer = br_timer_value(&pmctx->ip6_mc_router_timer);
+ return !hlist_unhashed(&pmctx->ip6_rlist);
+#else
+ *timer = 0;
+ return false;
+#endif
+}
+
+static size_t __br_rports_one_size(void)
+{
+ return nla_total_size(sizeof(u32)) + /* MDBA_ROUTER_PORT */
+ nla_total_size(sizeof(u32)) + /* MDBA_ROUTER_PATTR_TIMER */
+ nla_total_size(sizeof(u8)) + /* MDBA_ROUTER_PATTR_TYPE */
+ nla_total_size(sizeof(u32)) + /* MDBA_ROUTER_PATTR_INET_TIMER */
+ nla_total_size(sizeof(u32)) + /* MDBA_ROUTER_PATTR_INET6_TIMER */
+ nla_total_size(sizeof(u32)); /* MDBA_ROUTER_PATTR_VID */
+}
+
+size_t br_rports_size(const struct net_bridge_mcast *brmctx)
+{
+ struct net_bridge_mcast_port *pmctx;
+ size_t size = nla_total_size(0); /* MDBA_ROUTER */
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(pmctx, &brmctx->ip4_mc_router_list,
+ ip4_rlist)
+ size += __br_rports_one_size();
+
+#if IS_ENABLED(CONFIG_IPV6)
+ hlist_for_each_entry_rcu(pmctx, &brmctx->ip6_mc_router_list,
+ ip6_rlist)
+ size += __br_rports_one_size();
+#endif
+ rcu_read_unlock();
+
+ return size;
+}
+
+int br_rports_fill_info(struct sk_buff *skb,
+ const struct net_bridge_mcast *brmctx)
+{
+ u16 vid = brmctx->vlan ? brmctx->vlan->vid : 0;
+ bool have_ip4_mc_rtr, have_ip6_mc_rtr;
+ unsigned long ip4_timer, ip6_timer;
+ struct nlattr *nest, *port_nest;
+ struct net_bridge_port *p;
+
+ if (!brmctx->multicast_router || !br_rports_have_mc_router(brmctx))
+ return 0;
+
+ nest = nla_nest_start_noflag(skb, MDBA_ROUTER);
+ if (nest == NULL)
+ return -EMSGSIZE;
+
+ list_for_each_entry_rcu(p, &brmctx->br->port_list, list) {
+ struct net_bridge_mcast_port *pmctx;
+
+ if (vid) {
+ struct net_bridge_vlan *v;
+
+ v = br_vlan_find(nbp_vlan_group(p), vid);
+ if (!v)
+ continue;
+ pmctx = &v->port_mcast_ctx;
+ } else {
+ pmctx = &p->multicast_ctx;
+ }
+
+ have_ip4_mc_rtr = br_ip4_rports_get_timer(pmctx, &ip4_timer);
+ have_ip6_mc_rtr = br_ip6_rports_get_timer(pmctx, &ip6_timer);
+
+ if (!have_ip4_mc_rtr && !have_ip6_mc_rtr)
+ continue;
+
+ port_nest = nla_nest_start_noflag(skb, MDBA_ROUTER_PORT);
+ if (!port_nest)
+ goto fail;
+
+ if (nla_put_nohdr(skb, sizeof(u32), &p->dev->ifindex) ||
+ nla_put_u32(skb, MDBA_ROUTER_PATTR_TIMER,
+ max(ip4_timer, ip6_timer)) ||
+ nla_put_u8(skb, MDBA_ROUTER_PATTR_TYPE,
+ p->multicast_ctx.multicast_router) ||
+ (have_ip4_mc_rtr &&
+ nla_put_u32(skb, MDBA_ROUTER_PATTR_INET_TIMER,
+ ip4_timer)) ||
+ (have_ip6_mc_rtr &&
+ nla_put_u32(skb, MDBA_ROUTER_PATTR_INET6_TIMER,
+ ip6_timer)) ||
+ (vid && nla_put_u16(skb, MDBA_ROUTER_PATTR_VID, vid))) {
+ nla_nest_cancel(skb, port_nest);
+ goto fail;
+ }
+ nla_nest_end(skb, port_nest);
+ }
+
+ nla_nest_end(skb, nest);
+ return 0;
+fail:
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+}
+
+static void __mdb_entry_fill_flags(struct br_mdb_entry *e, unsigned char flags)
+{
+ e->state = flags & MDB_PG_FLAGS_PERMANENT;
+ e->flags = 0;
+ if (flags & MDB_PG_FLAGS_OFFLOAD)
+ e->flags |= MDB_FLAGS_OFFLOAD;
+ if (flags & MDB_PG_FLAGS_FAST_LEAVE)
+ e->flags |= MDB_FLAGS_FAST_LEAVE;
+ if (flags & MDB_PG_FLAGS_STAR_EXCL)
+ e->flags |= MDB_FLAGS_STAR_EXCL;
+ if (flags & MDB_PG_FLAGS_BLOCKED)
+ e->flags |= MDB_FLAGS_BLOCKED;
+ if (flags & MDB_PG_FLAGS_OFFLOAD_FAILED)
+ e->flags |= MDB_FLAGS_OFFLOAD_FAILED;
+}
+
+static void __mdb_entry_to_br_ip(struct br_mdb_entry *entry, struct br_ip *ip,
+ struct nlattr **mdb_attrs)
+{
+ memset(ip, 0, sizeof(struct br_ip));
+ ip->vid = entry->vid;
+ ip->proto = entry->addr.proto;
+ switch (ip->proto) {
+ case htons(ETH_P_IP):
+ ip->dst.ip4 = entry->addr.u.ip4;
+ if (mdb_attrs && mdb_attrs[MDBE_ATTR_SOURCE])
+ ip->src.ip4 = nla_get_in_addr(mdb_attrs[MDBE_ATTR_SOURCE]);
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case htons(ETH_P_IPV6):
+ ip->dst.ip6 = entry->addr.u.ip6;
+ if (mdb_attrs && mdb_attrs[MDBE_ATTR_SOURCE])
+ ip->src.ip6 = nla_get_in6_addr(mdb_attrs[MDBE_ATTR_SOURCE]);
+ break;
+#endif
+ default:
+ ether_addr_copy(ip->dst.mac_addr, entry->addr.u.mac_addr);
+ }
+
+}
+
+static int __mdb_fill_srcs(struct sk_buff *skb,
+ struct net_bridge_port_group *p)
+{
+ struct net_bridge_group_src *ent;
+ struct nlattr *nest, *nest_ent;
+
+ if (hlist_empty(&p->src_list))
+ return 0;
+
+ nest = nla_nest_start(skb, MDBA_MDB_EATTR_SRC_LIST);
+ if (!nest)
+ return -EMSGSIZE;
+
+ hlist_for_each_entry_rcu(ent, &p->src_list, node,
+ lockdep_is_held(&p->key.port->br->multicast_lock)) {
+ nest_ent = nla_nest_start(skb, MDBA_MDB_SRCLIST_ENTRY);
+ if (!nest_ent)
+ goto out_cancel_err;
+ switch (ent->addr.proto) {
+ case htons(ETH_P_IP):
+ if (nla_put_in_addr(skb, MDBA_MDB_SRCATTR_ADDRESS,
+ ent->addr.src.ip4)) {
+ nla_nest_cancel(skb, nest_ent);
+ goto out_cancel_err;
+ }
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case htons(ETH_P_IPV6):
+ if (nla_put_in6_addr(skb, MDBA_MDB_SRCATTR_ADDRESS,
+ &ent->addr.src.ip6)) {
+ nla_nest_cancel(skb, nest_ent);
+ goto out_cancel_err;
+ }
+ break;
+#endif
+ default:
+ nla_nest_cancel(skb, nest_ent);
+ continue;
+ }
+ if (nla_put_u32(skb, MDBA_MDB_SRCATTR_TIMER,
+ br_timer_value(&ent->timer))) {
+ nla_nest_cancel(skb, nest_ent);
+ goto out_cancel_err;
+ }
+ nla_nest_end(skb, nest_ent);
+ }
+
+ nla_nest_end(skb, nest);
+
+ return 0;
+
+out_cancel_err:
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+}
+
+static int __mdb_fill_info(struct sk_buff *skb,
+ struct net_bridge_mdb_entry *mp,
+ struct net_bridge_port_group *p)
+{
+ bool dump_srcs_mode = false;
+ struct timer_list *mtimer;
+ struct nlattr *nest_ent;
+ struct br_mdb_entry e;
+ u8 flags = 0;
+ int ifindex;
+
+ memset(&e, 0, sizeof(e));
+ if (p) {
+ ifindex = p->key.port->dev->ifindex;
+ mtimer = &p->timer;
+ flags = p->flags;
+ } else {
+ ifindex = mp->br->dev->ifindex;
+ mtimer = &mp->timer;
+ }
+
+ __mdb_entry_fill_flags(&e, flags);
+ e.ifindex = ifindex;
+ e.vid = mp->addr.vid;
+ if (mp->addr.proto == htons(ETH_P_IP)) {
+ e.addr.u.ip4 = mp->addr.dst.ip4;
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (mp->addr.proto == htons(ETH_P_IPV6)) {
+ e.addr.u.ip6 = mp->addr.dst.ip6;
+#endif
+ } else {
+ ether_addr_copy(e.addr.u.mac_addr, mp->addr.dst.mac_addr);
+ e.state = MDB_PERMANENT;
+ }
+ e.addr.proto = mp->addr.proto;
+ nest_ent = nla_nest_start_noflag(skb,
+ MDBA_MDB_ENTRY_INFO);
+ if (!nest_ent)
+ return -EMSGSIZE;
+
+ if (nla_put_nohdr(skb, sizeof(e), &e) ||
+ nla_put_u32(skb,
+ MDBA_MDB_EATTR_TIMER,
+ br_timer_value(mtimer)))
+ goto nest_err;
+
+ switch (mp->addr.proto) {
+ case htons(ETH_P_IP):
+ dump_srcs_mode = !!(mp->br->multicast_ctx.multicast_igmp_version == 3);
+ if (mp->addr.src.ip4) {
+ if (nla_put_in_addr(skb, MDBA_MDB_EATTR_SOURCE,
+ mp->addr.src.ip4))
+ goto nest_err;
+ break;
+ }
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case htons(ETH_P_IPV6):
+ dump_srcs_mode = !!(mp->br->multicast_ctx.multicast_mld_version == 2);
+ if (!ipv6_addr_any(&mp->addr.src.ip6)) {
+ if (nla_put_in6_addr(skb, MDBA_MDB_EATTR_SOURCE,
+ &mp->addr.src.ip6))
+ goto nest_err;
+ break;
+ }
+ break;
+#endif
+ default:
+ ether_addr_copy(e.addr.u.mac_addr, mp->addr.dst.mac_addr);
+ }
+ if (p) {
+ if (nla_put_u8(skb, MDBA_MDB_EATTR_RTPROT, p->rt_protocol))
+ goto nest_err;
+ if (dump_srcs_mode &&
+ (__mdb_fill_srcs(skb, p) ||
+ nla_put_u8(skb, MDBA_MDB_EATTR_GROUP_MODE,
+ p->filter_mode)))
+ goto nest_err;
+ }
+ nla_nest_end(skb, nest_ent);
+
+ return 0;
+
+nest_err:
+ nla_nest_cancel(skb, nest_ent);
+ return -EMSGSIZE;
+}
+
+static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
+ struct net_device *dev)
+{
+ int idx = 0, s_idx = cb->args[1], err = 0, pidx = 0, s_pidx = cb->args[2];
+ struct net_bridge *br = netdev_priv(dev);
+ struct net_bridge_mdb_entry *mp;
+ struct nlattr *nest, *nest2;
+
+ nest = nla_nest_start_noflag(skb, MDBA_MDB);
+ if (nest == NULL)
+ return -EMSGSIZE;
+
+ hlist_for_each_entry_rcu(mp, &br->mdb_list, mdb_node) {
+ struct net_bridge_port_group *p;
+ struct net_bridge_port_group __rcu **pp;
+
+ if (idx < s_idx)
+ goto skip;
+
+ nest2 = nla_nest_start_noflag(skb, MDBA_MDB_ENTRY);
+ if (!nest2) {
+ err = -EMSGSIZE;
+ break;
+ }
+
+ if (!s_pidx && mp->host_joined) {
+ err = __mdb_fill_info(skb, mp, NULL);
+ if (err) {
+ nla_nest_cancel(skb, nest2);
+ break;
+ }
+ }
+
+ for (pp = &mp->ports; (p = rcu_dereference(*pp)) != NULL;
+ pp = &p->next) {
+ if (!p->key.port)
+ continue;
+ if (pidx < s_pidx)
+ goto skip_pg;
+
+ err = __mdb_fill_info(skb, mp, p);
+ if (err) {
+ nla_nest_end(skb, nest2);
+ goto out;
+ }
+skip_pg:
+ pidx++;
+ }
+ pidx = 0;
+ s_pidx = 0;
+ nla_nest_end(skb, nest2);
+skip:
+ idx++;
+ }
+
+out:
+ cb->args[1] = idx;
+ cb->args[2] = pidx;
+ nla_nest_end(skb, nest);
+ return err;
+}
+
+int br_mdb_dump(struct net_device *dev, struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct net_bridge *br = netdev_priv(dev);
+ struct br_port_msg *bpm;
+ struct nlmsghdr *nlh;
+ int err;
+
+ nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, RTM_GETMDB, sizeof(*bpm),
+ NLM_F_MULTI);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ bpm = nlmsg_data(nlh);
+ memset(bpm, 0, sizeof(*bpm));
+ bpm->ifindex = dev->ifindex;
+
+ rcu_read_lock();
+
+ err = br_mdb_fill_info(skb, cb, dev);
+ if (err)
+ goto out;
+ err = br_rports_fill_info(skb, &br->multicast_ctx);
+ if (err)
+ goto out;
+
+out:
+ rcu_read_unlock();
+ nlmsg_end(skb, nlh);
+ return err;
+}
+
+static int nlmsg_populate_mdb_fill(struct sk_buff *skb,
+ struct net_device *dev,
+ struct net_bridge_mdb_entry *mp,
+ struct net_bridge_port_group *pg,
+ int type)
+{
+ struct nlmsghdr *nlh;
+ struct br_port_msg *bpm;
+ struct nlattr *nest, *nest2;
+
+ nlh = nlmsg_put(skb, 0, 0, type, sizeof(*bpm), 0);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ bpm = nlmsg_data(nlh);
+ memset(bpm, 0, sizeof(*bpm));
+ bpm->family = AF_BRIDGE;
+ bpm->ifindex = dev->ifindex;
+ nest = nla_nest_start_noflag(skb, MDBA_MDB);
+ if (nest == NULL)
+ goto cancel;
+ nest2 = nla_nest_start_noflag(skb, MDBA_MDB_ENTRY);
+ if (nest2 == NULL)
+ goto end;
+
+ if (__mdb_fill_info(skb, mp, pg))
+ goto end;
+
+ nla_nest_end(skb, nest2);
+ nla_nest_end(skb, nest);
+ nlmsg_end(skb, nlh);
+ return 0;
+
+end:
+ nla_nest_end(skb, nest);
+cancel:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+static size_t rtnl_mdb_nlmsg_pg_size(const struct net_bridge_port_group *pg)
+{
+ struct net_bridge_group_src *ent;
+ size_t nlmsg_size, addr_size = 0;
+
+ /* MDBA_MDB_ENTRY_INFO */
+ nlmsg_size = nla_total_size(sizeof(struct br_mdb_entry)) +
+ /* MDBA_MDB_EATTR_TIMER */
+ nla_total_size(sizeof(u32));
+
+ if (!pg)
+ goto out;
+
+ /* MDBA_MDB_EATTR_RTPROT */
+ nlmsg_size += nla_total_size(sizeof(u8));
+
+ switch (pg->key.addr.proto) {
+ case htons(ETH_P_IP):
+ /* MDBA_MDB_EATTR_SOURCE */
+ if (pg->key.addr.src.ip4)
+ nlmsg_size += nla_total_size(sizeof(__be32));
+ if (pg->key.port->br->multicast_ctx.multicast_igmp_version == 2)
+ goto out;
+ addr_size = sizeof(__be32);
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case htons(ETH_P_IPV6):
+ /* MDBA_MDB_EATTR_SOURCE */
+ if (!ipv6_addr_any(&pg->key.addr.src.ip6))
+ nlmsg_size += nla_total_size(sizeof(struct in6_addr));
+ if (pg->key.port->br->multicast_ctx.multicast_mld_version == 1)
+ goto out;
+ addr_size = sizeof(struct in6_addr);
+ break;
+#endif
+ }
+
+ /* MDBA_MDB_EATTR_GROUP_MODE */
+ nlmsg_size += nla_total_size(sizeof(u8));
+
+ /* MDBA_MDB_EATTR_SRC_LIST nested attr */
+ if (!hlist_empty(&pg->src_list))
+ nlmsg_size += nla_total_size(0);
+
+ hlist_for_each_entry(ent, &pg->src_list, node) {
+ /* MDBA_MDB_SRCLIST_ENTRY nested attr +
+ * MDBA_MDB_SRCATTR_ADDRESS + MDBA_MDB_SRCATTR_TIMER
+ */
+ nlmsg_size += nla_total_size(0) +
+ nla_total_size(addr_size) +
+ nla_total_size(sizeof(u32));
+ }
+out:
+ return nlmsg_size;
+}
+
+static size_t rtnl_mdb_nlmsg_size(const struct net_bridge_port_group *pg)
+{
+ return NLMSG_ALIGN(sizeof(struct br_port_msg)) +
+ /* MDBA_MDB */
+ nla_total_size(0) +
+ /* MDBA_MDB_ENTRY */
+ nla_total_size(0) +
+ /* Port group entry */
+ rtnl_mdb_nlmsg_pg_size(pg);
+}
+
+static void __br_mdb_notify(struct net_device *dev,
+ struct net_bridge_mdb_entry *mp,
+ struct net_bridge_port_group *pg,
+ int type, bool notify_switchdev)
+{
+ struct net *net = dev_net(dev);
+ struct sk_buff *skb;
+ int err = -ENOBUFS;
+
+ if (notify_switchdev)
+ br_switchdev_mdb_notify(dev, mp, pg, type);
+
+ skb = nlmsg_new(rtnl_mdb_nlmsg_size(pg), GFP_ATOMIC);
+ if (!skb)
+ goto errout;
+
+ err = nlmsg_populate_mdb_fill(skb, dev, mp, pg, type);
+ if (err < 0) {
+ kfree_skb(skb);
+ goto errout;
+ }
+
+ rtnl_notify(skb, net, 0, RTNLGRP_MDB, NULL, GFP_ATOMIC);
+ return;
+errout:
+ rtnl_set_sk_err(net, RTNLGRP_MDB, err);
+}
+
+void br_mdb_notify(struct net_device *dev,
+ struct net_bridge_mdb_entry *mp,
+ struct net_bridge_port_group *pg,
+ int type)
+{
+ __br_mdb_notify(dev, mp, pg, type, true);
+}
+
+void br_mdb_flag_change_notify(struct net_device *dev,
+ struct net_bridge_mdb_entry *mp,
+ struct net_bridge_port_group *pg)
+{
+ __br_mdb_notify(dev, mp, pg, RTM_NEWMDB, false);
+}
+
+static int nlmsg_populate_rtr_fill(struct sk_buff *skb,
+ struct net_device *dev,
+ int ifindex, u16 vid, u32 pid,
+ u32 seq, int type, unsigned int flags)
+{
+ struct nlattr *nest, *port_nest;
+ struct br_port_msg *bpm;
+ struct nlmsghdr *nlh;
+
+ nlh = nlmsg_put(skb, pid, seq, type, sizeof(*bpm), 0);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ bpm = nlmsg_data(nlh);
+ memset(bpm, 0, sizeof(*bpm));
+ bpm->family = AF_BRIDGE;
+ bpm->ifindex = dev->ifindex;
+ nest = nla_nest_start_noflag(skb, MDBA_ROUTER);
+ if (!nest)
+ goto cancel;
+
+ port_nest = nla_nest_start_noflag(skb, MDBA_ROUTER_PORT);
+ if (!port_nest)
+ goto end;
+ if (nla_put_nohdr(skb, sizeof(u32), &ifindex)) {
+ nla_nest_cancel(skb, port_nest);
+ goto end;
+ }
+ if (vid && nla_put_u16(skb, MDBA_ROUTER_PATTR_VID, vid)) {
+ nla_nest_cancel(skb, port_nest);
+ goto end;
+ }
+ nla_nest_end(skb, port_nest);
+
+ nla_nest_end(skb, nest);
+ nlmsg_end(skb, nlh);
+ return 0;
+
+end:
+ nla_nest_end(skb, nest);
+cancel:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+static inline size_t rtnl_rtr_nlmsg_size(void)
+{
+ return NLMSG_ALIGN(sizeof(struct br_port_msg))
+ + nla_total_size(sizeof(__u32))
+ + nla_total_size(sizeof(u16));
+}
+
+void br_rtr_notify(struct net_device *dev, struct net_bridge_mcast_port *pmctx,
+ int type)
+{
+ struct net *net = dev_net(dev);
+ struct sk_buff *skb;
+ int err = -ENOBUFS;
+ int ifindex;
+ u16 vid;
+
+ ifindex = pmctx ? pmctx->port->dev->ifindex : 0;
+ vid = pmctx && br_multicast_port_ctx_is_vlan(pmctx) ? pmctx->vlan->vid :
+ 0;
+ skb = nlmsg_new(rtnl_rtr_nlmsg_size(), GFP_ATOMIC);
+ if (!skb)
+ goto errout;
+
+ err = nlmsg_populate_rtr_fill(skb, dev, ifindex, vid, 0, 0, type,
+ NTF_SELF);
+ if (err < 0) {
+ kfree_skb(skb);
+ goto errout;
+ }
+
+ rtnl_notify(skb, net, 0, RTNLGRP_MDB, NULL, GFP_ATOMIC);
+ return;
+
+errout:
+ rtnl_set_sk_err(net, RTNLGRP_MDB, err);
+}
+
+static const struct nla_policy
+br_mdbe_src_list_entry_pol[MDBE_SRCATTR_MAX + 1] = {
+ [MDBE_SRCATTR_ADDRESS] = NLA_POLICY_RANGE(NLA_BINARY,
+ sizeof(struct in_addr),
+ sizeof(struct in6_addr)),
+};
+
+static const struct nla_policy
+br_mdbe_src_list_pol[MDBE_SRC_LIST_MAX + 1] = {
+ [MDBE_SRC_LIST_ENTRY] = NLA_POLICY_NESTED(br_mdbe_src_list_entry_pol),
+};
+
+static const struct nla_policy br_mdbe_attrs_pol[MDBE_ATTR_MAX + 1] = {
+ [MDBE_ATTR_SOURCE] = NLA_POLICY_RANGE(NLA_BINARY,
+ sizeof(struct in_addr),
+ sizeof(struct in6_addr)),
+ [MDBE_ATTR_GROUP_MODE] = NLA_POLICY_RANGE(NLA_U8, MCAST_EXCLUDE,
+ MCAST_INCLUDE),
+ [MDBE_ATTR_SRC_LIST] = NLA_POLICY_NESTED(br_mdbe_src_list_pol),
+ [MDBE_ATTR_RTPROT] = NLA_POLICY_MIN(NLA_U8, RTPROT_STATIC),
+};
+
+static bool is_valid_mdb_source(struct nlattr *attr, __be16 proto,
+ struct netlink_ext_ack *extack)
+{
+ switch (proto) {
+ case htons(ETH_P_IP):
+ if (nla_len(attr) != sizeof(struct in_addr)) {
+ NL_SET_ERR_MSG_MOD(extack, "IPv4 invalid source address length");
+ return false;
+ }
+ if (ipv4_is_multicast(nla_get_in_addr(attr))) {
+ NL_SET_ERR_MSG_MOD(extack, "IPv4 multicast source address is not allowed");
+ return false;
+ }
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case htons(ETH_P_IPV6): {
+ struct in6_addr src;
+
+ if (nla_len(attr) != sizeof(struct in6_addr)) {
+ NL_SET_ERR_MSG_MOD(extack, "IPv6 invalid source address length");
+ return false;
+ }
+ src = nla_get_in6_addr(attr);
+ if (ipv6_addr_is_multicast(&src)) {
+ NL_SET_ERR_MSG_MOD(extack, "IPv6 multicast source address is not allowed");
+ return false;
+ }
+ break;
+ }
+#endif
+ default:
+ NL_SET_ERR_MSG_MOD(extack, "Invalid protocol used with source address");
+ return false;
+ }
+
+ return true;
+}
+
+static struct net_bridge_mcast *
+__br_mdb_choose_context(struct net_bridge *br,
+ const struct br_mdb_entry *entry,
+ struct netlink_ext_ack *extack)
+{
+ struct net_bridge_mcast *brmctx = NULL;
+ struct net_bridge_vlan *v;
+
+ if (!br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) {
+ brmctx = &br->multicast_ctx;
+ goto out;
+ }
+
+ if (!entry->vid) {
+ NL_SET_ERR_MSG_MOD(extack, "Cannot add an entry without a vlan when vlan snooping is enabled");
+ goto out;
+ }
+
+ v = br_vlan_find(br_vlan_group(br), entry->vid);
+ if (!v) {
+ NL_SET_ERR_MSG_MOD(extack, "Vlan is not configured");
+ goto out;
+ }
+ if (br_multicast_ctx_vlan_global_disabled(&v->br_mcast_ctx)) {
+ NL_SET_ERR_MSG_MOD(extack, "Vlan's multicast processing is disabled");
+ goto out;
+ }
+ brmctx = &v->br_mcast_ctx;
+out:
+ return brmctx;
+}
+
+static int br_mdb_replace_group_sg(const struct br_mdb_config *cfg,
+ struct net_bridge_mdb_entry *mp,
+ struct net_bridge_port_group *pg,
+ struct net_bridge_mcast *brmctx,
+ unsigned char flags)
+{
+ unsigned long now = jiffies;
+
+ pg->flags = flags;
+ pg->rt_protocol = cfg->rt_protocol;
+ if (!(flags & MDB_PG_FLAGS_PERMANENT) && !cfg->src_entry)
+ mod_timer(&pg->timer,
+ now + brmctx->multicast_membership_interval);
+ else
+ timer_delete(&pg->timer);
+
+ br_mdb_notify(cfg->br->dev, mp, pg, RTM_NEWMDB);
+
+ return 0;
+}
+
+static int br_mdb_add_group_sg(const struct br_mdb_config *cfg,
+ struct net_bridge_mdb_entry *mp,
+ struct net_bridge_mcast *brmctx,
+ unsigned char flags,
+ struct netlink_ext_ack *extack)
+{
+ struct net_bridge_port_group __rcu **pp;
+ struct net_bridge_port_group *p;
+ unsigned long now = jiffies;
+
+ for (pp = &mp->ports;
+ (p = mlock_dereference(*pp, cfg->br)) != NULL;
+ pp = &p->next) {
+ if (p->key.port == cfg->p) {
+ if (!(cfg->nlflags & NLM_F_REPLACE)) {
+ NL_SET_ERR_MSG_MOD(extack, "(S, G) group is already joined by port");
+ return -EEXIST;
+ }
+ return br_mdb_replace_group_sg(cfg, mp, p, brmctx,
+ flags);
+ }
+ if ((unsigned long)p->key.port < (unsigned long)cfg->p)
+ break;
+ }
+
+ p = br_multicast_new_port_group(cfg->p, &cfg->group, *pp, flags, NULL,
+ MCAST_INCLUDE, cfg->rt_protocol, extack);
+ if (unlikely(!p))
+ return -ENOMEM;
+
+ rcu_assign_pointer(*pp, p);
+ if (!(flags & MDB_PG_FLAGS_PERMANENT) && !cfg->src_entry)
+ mod_timer(&p->timer,
+ now + brmctx->multicast_membership_interval);
+ br_mdb_notify(cfg->br->dev, mp, p, RTM_NEWMDB);
+
+ /* All of (*, G) EXCLUDE ports need to be added to the new (S, G) for
+ * proper replication.
+ */
+ if (br_multicast_should_handle_mode(brmctx, cfg->group.proto)) {
+ struct net_bridge_mdb_entry *star_mp;
+ struct br_ip star_group;
+
+ star_group = p->key.addr;
+ memset(&star_group.src, 0, sizeof(star_group.src));
+ star_mp = br_mdb_ip_get(cfg->br, &star_group);
+ if (star_mp)
+ br_multicast_sg_add_exclude_ports(star_mp, p);
+ }
+
+ return 0;
+}
+
+static int br_mdb_add_group_src_fwd(const struct br_mdb_config *cfg,
+ struct br_ip *src_ip,
+ struct net_bridge_mcast *brmctx,
+ struct netlink_ext_ack *extack)
+{
+ struct net_bridge_mdb_entry *sgmp;
+ struct br_mdb_config sg_cfg;
+ struct br_ip sg_ip;
+ u8 flags = 0;
+
+ sg_ip = cfg->group;
+ sg_ip.src = src_ip->src;
+ sgmp = br_multicast_new_group(cfg->br, &sg_ip);
+ if (IS_ERR(sgmp)) {
+ NL_SET_ERR_MSG_MOD(extack, "Failed to add (S, G) MDB entry");
+ return PTR_ERR(sgmp);
+ }
+
+ if (cfg->entry->state == MDB_PERMANENT)
+ flags |= MDB_PG_FLAGS_PERMANENT;
+ if (cfg->filter_mode == MCAST_EXCLUDE)
+ flags |= MDB_PG_FLAGS_BLOCKED;
+
+ memset(&sg_cfg, 0, sizeof(sg_cfg));
+ sg_cfg.br = cfg->br;
+ sg_cfg.p = cfg->p;
+ sg_cfg.entry = cfg->entry;
+ sg_cfg.group = sg_ip;
+ sg_cfg.src_entry = true;
+ sg_cfg.filter_mode = MCAST_INCLUDE;
+ sg_cfg.rt_protocol = cfg->rt_protocol;
+ sg_cfg.nlflags = cfg->nlflags;
+ return br_mdb_add_group_sg(&sg_cfg, sgmp, brmctx, flags, extack);
+}
+
+static int br_mdb_add_group_src(const struct br_mdb_config *cfg,
+ struct net_bridge_port_group *pg,
+ struct net_bridge_mcast *brmctx,
+ struct br_mdb_src_entry *src,
+ struct netlink_ext_ack *extack)
+{
+ struct net_bridge_group_src *ent;
+ unsigned long now = jiffies;
+ int err;
+
+ ent = br_multicast_find_group_src(pg, &src->addr);
+ if (!ent) {
+ ent = br_multicast_new_group_src(pg, &src->addr);
+ if (!ent) {
+ NL_SET_ERR_MSG_MOD(extack, "Failed to add new source entry");
+ return -ENOSPC;
+ }
+ } else if (!(cfg->nlflags & NLM_F_REPLACE)) {
+ NL_SET_ERR_MSG_MOD(extack, "Source entry already exists");
+ return -EEXIST;
+ }
+
+ if (cfg->filter_mode == MCAST_INCLUDE &&
+ cfg->entry->state == MDB_TEMPORARY)
+ mod_timer(&ent->timer, now + br_multicast_gmi(brmctx));
+ else
+ timer_delete(&ent->timer);
+
+ /* Install a (S, G) forwarding entry for the source. */
+ err = br_mdb_add_group_src_fwd(cfg, &src->addr, brmctx, extack);
+ if (err)
+ goto err_del_sg;
+
+ ent->flags = BR_SGRP_F_INSTALLED | BR_SGRP_F_USER_ADDED;
+
+ return 0;
+
+err_del_sg:
+ __br_multicast_del_group_src(ent);
+ return err;
+}
+
+static void br_mdb_del_group_src(struct net_bridge_port_group *pg,
+ struct br_mdb_src_entry *src)
+{
+ struct net_bridge_group_src *ent;
+
+ ent = br_multicast_find_group_src(pg, &src->addr);
+ if (WARN_ON_ONCE(!ent))
+ return;
+ br_multicast_del_group_src(ent, false);
+}
+
+static int br_mdb_add_group_srcs(const struct br_mdb_config *cfg,
+ struct net_bridge_port_group *pg,
+ struct net_bridge_mcast *brmctx,
+ struct netlink_ext_ack *extack)
+{
+ int i, err;
+
+ for (i = 0; i < cfg->num_src_entries; i++) {
+ err = br_mdb_add_group_src(cfg, pg, brmctx,
+ &cfg->src_entries[i], extack);
+ if (err)
+ goto err_del_group_srcs;
+ }
+
+ return 0;
+
+err_del_group_srcs:
+ for (i--; i >= 0; i--)
+ br_mdb_del_group_src(pg, &cfg->src_entries[i]);
+ return err;
+}
+
+static int br_mdb_replace_group_srcs(const struct br_mdb_config *cfg,
+ struct net_bridge_port_group *pg,
+ struct net_bridge_mcast *brmctx,
+ struct netlink_ext_ack *extack)
+{
+ struct net_bridge_group_src *ent;
+ struct hlist_node *tmp;
+ int err;
+
+ hlist_for_each_entry(ent, &pg->src_list, node)
+ ent->flags |= BR_SGRP_F_DELETE;
+
+ err = br_mdb_add_group_srcs(cfg, pg, brmctx, extack);
+ if (err)
+ goto err_clear_delete;
+
+ hlist_for_each_entry_safe(ent, tmp, &pg->src_list, node) {
+ if (ent->flags & BR_SGRP_F_DELETE)
+ br_multicast_del_group_src(ent, false);
+ }
+
+ return 0;
+
+err_clear_delete:
+ hlist_for_each_entry(ent, &pg->src_list, node)
+ ent->flags &= ~BR_SGRP_F_DELETE;
+ return err;
+}
+
+static int br_mdb_replace_group_star_g(const struct br_mdb_config *cfg,
+ struct net_bridge_mdb_entry *mp,
+ struct net_bridge_port_group *pg,
+ struct net_bridge_mcast *brmctx,
+ unsigned char flags,
+ struct netlink_ext_ack *extack)
+{
+ unsigned long now = jiffies;
+ int err;
+
+ err = br_mdb_replace_group_srcs(cfg, pg, brmctx, extack);
+ if (err)
+ return err;
+
+ pg->flags = flags;
+ pg->filter_mode = cfg->filter_mode;
+ pg->rt_protocol = cfg->rt_protocol;
+ if (!(flags & MDB_PG_FLAGS_PERMANENT) &&
+ cfg->filter_mode == MCAST_EXCLUDE)
+ mod_timer(&pg->timer,
+ now + brmctx->multicast_membership_interval);
+ else
+ timer_delete(&pg->timer);
+
+ br_mdb_notify(cfg->br->dev, mp, pg, RTM_NEWMDB);
+
+ if (br_multicast_should_handle_mode(brmctx, cfg->group.proto))
+ br_multicast_star_g_handle_mode(pg, cfg->filter_mode);
+
+ return 0;
+}
+
+static int br_mdb_add_group_star_g(const struct br_mdb_config *cfg,
+ struct net_bridge_mdb_entry *mp,
+ struct net_bridge_mcast *brmctx,
+ unsigned char flags,
+ struct netlink_ext_ack *extack)
+{
+ struct net_bridge_port_group __rcu **pp;
+ struct net_bridge_port_group *p;
+ unsigned long now = jiffies;
+ int err;
+
+ for (pp = &mp->ports;
+ (p = mlock_dereference(*pp, cfg->br)) != NULL;
+ pp = &p->next) {
+ if (p->key.port == cfg->p) {
+ if (!(cfg->nlflags & NLM_F_REPLACE)) {
+ NL_SET_ERR_MSG_MOD(extack, "(*, G) group is already joined by port");
+ return -EEXIST;
+ }
+ return br_mdb_replace_group_star_g(cfg, mp, p, brmctx,
+ flags, extack);
+ }
+ if ((unsigned long)p->key.port < (unsigned long)cfg->p)
+ break;
+ }
+
+ p = br_multicast_new_port_group(cfg->p, &cfg->group, *pp, flags, NULL,
+ cfg->filter_mode, cfg->rt_protocol,
+ extack);
+ if (unlikely(!p))
+ return -ENOMEM;
+
+ err = br_mdb_add_group_srcs(cfg, p, brmctx, extack);
+ if (err)
+ goto err_del_port_group;
+
+ rcu_assign_pointer(*pp, p);
+ if (!(flags & MDB_PG_FLAGS_PERMANENT) &&
+ cfg->filter_mode == MCAST_EXCLUDE)
+ mod_timer(&p->timer,
+ now + brmctx->multicast_membership_interval);
+ br_mdb_notify(cfg->br->dev, mp, p, RTM_NEWMDB);
+ /* If we are adding a new EXCLUDE port group (*, G), it needs to be
+ * also added to all (S, G) entries for proper replication.
+ */
+ if (br_multicast_should_handle_mode(brmctx, cfg->group.proto) &&
+ cfg->filter_mode == MCAST_EXCLUDE)
+ br_multicast_star_g_handle_mode(p, MCAST_EXCLUDE);
+
+ return 0;
+
+err_del_port_group:
+ br_multicast_del_port_group(p);
+ return err;
+}
+
+static int br_mdb_add_group(const struct br_mdb_config *cfg,
+ struct netlink_ext_ack *extack)
+{
+ struct br_mdb_entry *entry = cfg->entry;
+ struct net_bridge_port *port = cfg->p;
+ struct net_bridge_mdb_entry *mp;
+ struct net_bridge *br = cfg->br;
+ struct net_bridge_mcast *brmctx;
+ struct br_ip group = cfg->group;
+ unsigned char flags = 0;
+
+ brmctx = __br_mdb_choose_context(br, entry, extack);
+ if (!brmctx)
+ return -EINVAL;
+
+ mp = br_multicast_new_group(br, &group);
+ if (IS_ERR(mp))
+ return PTR_ERR(mp);
+
+ /* host join */
+ if (!port) {
+ if (mp->host_joined && !(cfg->nlflags & NLM_F_REPLACE)) {
+ NL_SET_ERR_MSG_MOD(extack, "Group is already joined by host");
+ return -EEXIST;
+ }
+
+ br_multicast_host_join(brmctx, mp, false);
+ br_mdb_notify(br->dev, mp, NULL, RTM_NEWMDB);
+
+ return 0;
+ }
+
+ if (entry->state == MDB_PERMANENT)
+ flags |= MDB_PG_FLAGS_PERMANENT;
+
+ if (br_multicast_is_star_g(&group))
+ return br_mdb_add_group_star_g(cfg, mp, brmctx, flags, extack);
+ else
+ return br_mdb_add_group_sg(cfg, mp, brmctx, flags, extack);
+}
+
+static int __br_mdb_add(const struct br_mdb_config *cfg,
+ struct netlink_ext_ack *extack)
+{
+ int ret;
+
+ spin_lock_bh(&cfg->br->multicast_lock);
+ ret = br_mdb_add_group(cfg, extack);
+ spin_unlock_bh(&cfg->br->multicast_lock);
+
+ return ret;
+}
+
+static int br_mdb_config_src_entry_init(struct nlattr *src_entry,
+ struct br_mdb_src_entry *src,
+ __be16 proto,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[MDBE_SRCATTR_MAX + 1];
+ int err;
+
+ err = nla_parse_nested(tb, MDBE_SRCATTR_MAX, src_entry,
+ br_mdbe_src_list_entry_pol, extack);
+ if (err)
+ return err;
+
+ if (NL_REQ_ATTR_CHECK(extack, src_entry, tb, MDBE_SRCATTR_ADDRESS))
+ return -EINVAL;
+
+ if (!is_valid_mdb_source(tb[MDBE_SRCATTR_ADDRESS], proto, extack))
+ return -EINVAL;
+
+ src->addr.proto = proto;
+ nla_memcpy(&src->addr.src, tb[MDBE_SRCATTR_ADDRESS],
+ nla_len(tb[MDBE_SRCATTR_ADDRESS]));
+
+ return 0;
+}
+
+static int br_mdb_config_src_list_init(struct nlattr *src_list,
+ struct br_mdb_config *cfg,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *src_entry;
+ int rem, err;
+ int i = 0;
+
+ nla_for_each_nested(src_entry, src_list, rem)
+ cfg->num_src_entries++;
+
+ if (cfg->num_src_entries >= PG_SRC_ENT_LIMIT) {
+ NL_SET_ERR_MSG_FMT_MOD(extack, "Exceeded maximum number of source entries (%u)",
+ PG_SRC_ENT_LIMIT - 1);
+ return -EINVAL;
+ }
+
+ cfg->src_entries = kcalloc(cfg->num_src_entries,
+ sizeof(struct br_mdb_src_entry), GFP_KERNEL);
+ if (!cfg->src_entries)
+ return -ENOMEM;
+
+ nla_for_each_nested(src_entry, src_list, rem) {
+ err = br_mdb_config_src_entry_init(src_entry,
+ &cfg->src_entries[i],
+ cfg->entry->addr.proto,
+ extack);
+ if (err)
+ goto err_src_entry_init;
+ i++;
+ }
+
+ return 0;
+
+err_src_entry_init:
+ kfree(cfg->src_entries);
+ return err;
+}
+
+static void br_mdb_config_src_list_fini(struct br_mdb_config *cfg)
+{
+ kfree(cfg->src_entries);
+}
+
+static int br_mdb_config_attrs_init(struct nlattr *set_attrs,
+ struct br_mdb_config *cfg,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *mdb_attrs[MDBE_ATTR_MAX + 1];
+ int err;
+
+ err = nla_parse_nested(mdb_attrs, MDBE_ATTR_MAX, set_attrs,
+ br_mdbe_attrs_pol, extack);
+ if (err)
+ return err;
+
+ if (mdb_attrs[MDBE_ATTR_SOURCE] &&
+ !is_valid_mdb_source(mdb_attrs[MDBE_ATTR_SOURCE],
+ cfg->entry->addr.proto, extack))
+ return -EINVAL;
+
+ __mdb_entry_to_br_ip(cfg->entry, &cfg->group, mdb_attrs);
+
+ if (mdb_attrs[MDBE_ATTR_GROUP_MODE]) {
+ if (!cfg->p) {
+ NL_SET_ERR_MSG_MOD(extack, "Filter mode cannot be set for host groups");
+ return -EINVAL;
+ }
+ if (!br_multicast_is_star_g(&cfg->group)) {
+ NL_SET_ERR_MSG_MOD(extack, "Filter mode can only be set for (*, G) entries");
+ return -EINVAL;
+ }
+ cfg->filter_mode = nla_get_u8(mdb_attrs[MDBE_ATTR_GROUP_MODE]);
+ } else {
+ cfg->filter_mode = MCAST_EXCLUDE;
+ }
+
+ if (mdb_attrs[MDBE_ATTR_SRC_LIST]) {
+ if (!cfg->p) {
+ NL_SET_ERR_MSG_MOD(extack, "Source list cannot be set for host groups");
+ return -EINVAL;
+ }
+ if (!br_multicast_is_star_g(&cfg->group)) {
+ NL_SET_ERR_MSG_MOD(extack, "Source list can only be set for (*, G) entries");
+ return -EINVAL;
+ }
+ if (!mdb_attrs[MDBE_ATTR_GROUP_MODE]) {
+ NL_SET_ERR_MSG_MOD(extack, "Source list cannot be set without filter mode");
+ return -EINVAL;
+ }
+ err = br_mdb_config_src_list_init(mdb_attrs[MDBE_ATTR_SRC_LIST],
+ cfg, extack);
+ if (err)
+ return err;
+ }
+
+ if (!cfg->num_src_entries && cfg->filter_mode == MCAST_INCLUDE) {
+ NL_SET_ERR_MSG_MOD(extack, "Cannot add (*, G) INCLUDE with an empty source list");
+ return -EINVAL;
+ }
+
+ if (mdb_attrs[MDBE_ATTR_RTPROT]) {
+ if (!cfg->p) {
+ NL_SET_ERR_MSG_MOD(extack, "Protocol cannot be set for host groups");
+ return -EINVAL;
+ }
+ cfg->rt_protocol = nla_get_u8(mdb_attrs[MDBE_ATTR_RTPROT]);
+ }
+
+ return 0;
+}
+
+static int br_mdb_config_init(struct br_mdb_config *cfg, struct net_device *dev,
+ struct nlattr *tb[], u16 nlmsg_flags,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = dev_net(dev);
+
+ memset(cfg, 0, sizeof(*cfg));
+ cfg->filter_mode = MCAST_EXCLUDE;
+ cfg->rt_protocol = RTPROT_STATIC;
+ cfg->nlflags = nlmsg_flags;
+
+ cfg->br = netdev_priv(dev);
+
+ if (!netif_running(cfg->br->dev)) {
+ NL_SET_ERR_MSG_MOD(extack, "Bridge device is not running");
+ return -EINVAL;
+ }
+
+ if (!br_opt_get(cfg->br, BROPT_MULTICAST_ENABLED)) {
+ NL_SET_ERR_MSG_MOD(extack, "Bridge's multicast processing is disabled");
+ return -EINVAL;
+ }
+
+ cfg->entry = nla_data(tb[MDBA_SET_ENTRY]);
+
+ if (cfg->entry->ifindex != cfg->br->dev->ifindex) {
+ struct net_device *pdev;
+
+ pdev = __dev_get_by_index(net, cfg->entry->ifindex);
+ if (!pdev) {
+ NL_SET_ERR_MSG_MOD(extack, "Port net device doesn't exist");
+ return -ENODEV;
+ }
+
+ cfg->p = br_port_get_rtnl(pdev);
+ if (!cfg->p) {
+ NL_SET_ERR_MSG_MOD(extack, "Net device is not a bridge port");
+ return -EINVAL;
+ }
+
+ if (cfg->p->br != cfg->br) {
+ NL_SET_ERR_MSG_MOD(extack, "Port belongs to a different bridge device");
+ return -EINVAL;
+ }
+ }
+
+ if (cfg->entry->addr.proto == htons(ETH_P_IP) &&
+ ipv4_is_zeronet(cfg->entry->addr.u.ip4)) {
+ NL_SET_ERR_MSG_MOD(extack, "IPv4 entry group address 0.0.0.0 is not allowed");
+ return -EINVAL;
+ }
+
+ if (tb[MDBA_SET_ENTRY_ATTRS])
+ return br_mdb_config_attrs_init(tb[MDBA_SET_ENTRY_ATTRS], cfg,
+ extack);
+ else
+ __mdb_entry_to_br_ip(cfg->entry, &cfg->group, NULL);
+
+ return 0;
+}
+
+static void br_mdb_config_fini(struct br_mdb_config *cfg)
+{
+ br_mdb_config_src_list_fini(cfg);
+}
+
+int br_mdb_add(struct net_device *dev, struct nlattr *tb[], u16 nlmsg_flags,
+ struct netlink_ext_ack *extack)
+{
+ struct net_bridge_vlan_group *vg;
+ struct net_bridge_vlan *v;
+ struct br_mdb_config cfg;
+ int err;
+
+ err = br_mdb_config_init(&cfg, dev, tb, nlmsg_flags, extack);
+ if (err)
+ return err;
+
+ err = -EINVAL;
+ /* host join errors which can happen before creating the group */
+ if (!cfg.p && !br_group_is_l2(&cfg.group)) {
+ /* don't allow any flags for host-joined IP groups */
+ if (cfg.entry->state) {
+ NL_SET_ERR_MSG_MOD(extack, "Flags are not allowed for host groups");
+ goto out;
+ }
+ if (!br_multicast_is_star_g(&cfg.group)) {
+ NL_SET_ERR_MSG_MOD(extack, "Groups with sources cannot be manually host joined");
+ goto out;
+ }
+ }
+
+ if (br_group_is_l2(&cfg.group) && cfg.entry->state != MDB_PERMANENT) {
+ NL_SET_ERR_MSG_MOD(extack, "Only permanent L2 entries allowed");
+ goto out;
+ }
+
+ if (cfg.p) {
+ if (cfg.p->state == BR_STATE_DISABLED && cfg.entry->state != MDB_PERMANENT) {
+ NL_SET_ERR_MSG_MOD(extack, "Port is in disabled state and entry is not permanent");
+ goto out;
+ }
+ vg = nbp_vlan_group(cfg.p);
+ } else {
+ vg = br_vlan_group(cfg.br);
+ }
+
+ /* If vlan filtering is enabled and VLAN is not specified
+ * install mdb entry on all vlans configured on the port.
+ */
+ if (br_vlan_enabled(cfg.br->dev) && vg && cfg.entry->vid == 0) {
+ list_for_each_entry(v, &vg->vlan_list, vlist) {
+ cfg.entry->vid = v->vid;
+ cfg.group.vid = v->vid;
+ err = __br_mdb_add(&cfg, extack);
+ if (err)
+ break;
+ }
+ } else {
+ err = __br_mdb_add(&cfg, extack);
+ }
+
+out:
+ br_mdb_config_fini(&cfg);
+ return err;
+}
+
+static int __br_mdb_del(const struct br_mdb_config *cfg)
+{
+ struct br_mdb_entry *entry = cfg->entry;
+ struct net_bridge *br = cfg->br;
+ struct net_bridge_mdb_entry *mp;
+ struct net_bridge_port_group *p;
+ struct net_bridge_port_group __rcu **pp;
+ struct br_ip ip = cfg->group;
+ int err = -EINVAL;
+
+ spin_lock_bh(&br->multicast_lock);
+ mp = br_mdb_ip_get(br, &ip);
+ if (!mp)
+ goto unlock;
+
+ /* host leave */
+ if (entry->ifindex == mp->br->dev->ifindex && mp->host_joined) {
+ br_multicast_host_leave(mp, false);
+ err = 0;
+ br_mdb_notify(br->dev, mp, NULL, RTM_DELMDB);
+ if (!mp->ports && netif_running(br->dev))
+ mod_timer(&mp->timer, jiffies);
+ goto unlock;
+ }
+
+ for (pp = &mp->ports;
+ (p = mlock_dereference(*pp, br)) != NULL;
+ pp = &p->next) {
+ if (!p->key.port || p->key.port->dev->ifindex != entry->ifindex)
+ continue;
+
+ br_multicast_del_pg(mp, p, pp);
+ err = 0;
+ break;
+ }
+
+unlock:
+ spin_unlock_bh(&br->multicast_lock);
+ return err;
+}
+
+int br_mdb_del(struct net_device *dev, struct nlattr *tb[],
+ struct netlink_ext_ack *extack)
+{
+ struct net_bridge_vlan_group *vg;
+ struct net_bridge_vlan *v;
+ struct br_mdb_config cfg;
+ int err;
+
+ err = br_mdb_config_init(&cfg, dev, tb, 0, extack);
+ if (err)
+ return err;
+
+ if (cfg.p)
+ vg = nbp_vlan_group(cfg.p);
+ else
+ vg = br_vlan_group(cfg.br);
+
+ /* If vlan filtering is enabled and VLAN is not specified
+ * delete mdb entry on all vlans configured on the port.
+ */
+ if (br_vlan_enabled(cfg.br->dev) && vg && cfg.entry->vid == 0) {
+ list_for_each_entry(v, &vg->vlan_list, vlist) {
+ cfg.entry->vid = v->vid;
+ cfg.group.vid = v->vid;
+ err = __br_mdb_del(&cfg);
+ }
+ } else {
+ err = __br_mdb_del(&cfg);
+ }
+
+ br_mdb_config_fini(&cfg);
+ return err;
+}
+
+struct br_mdb_flush_desc {
+ u32 port_ifindex;
+ u16 vid;
+ u8 rt_protocol;
+ u8 state;
+ u8 state_mask;
+};
+
+static const struct nla_policy br_mdbe_attrs_del_bulk_pol[MDBE_ATTR_MAX + 1] = {
+ [MDBE_ATTR_RTPROT] = NLA_POLICY_MIN(NLA_U8, RTPROT_STATIC),
+ [MDBE_ATTR_STATE_MASK] = NLA_POLICY_MASK(NLA_U8, MDB_PERMANENT),
+};
+
+static int br_mdb_flush_desc_init(struct br_mdb_flush_desc *desc,
+ struct nlattr *tb[],
+ struct netlink_ext_ack *extack)
+{
+ struct br_mdb_entry *entry = nla_data(tb[MDBA_SET_ENTRY]);
+ struct nlattr *mdbe_attrs[MDBE_ATTR_MAX + 1];
+ int err;
+
+ desc->port_ifindex = entry->ifindex;
+ desc->vid = entry->vid;
+ desc->state = entry->state;
+
+ if (!tb[MDBA_SET_ENTRY_ATTRS])
+ return 0;
+
+ err = nla_parse_nested(mdbe_attrs, MDBE_ATTR_MAX,
+ tb[MDBA_SET_ENTRY_ATTRS],
+ br_mdbe_attrs_del_bulk_pol, extack);
+ if (err)
+ return err;
+
+ if (mdbe_attrs[MDBE_ATTR_STATE_MASK])
+ desc->state_mask = nla_get_u8(mdbe_attrs[MDBE_ATTR_STATE_MASK]);
+
+ if (mdbe_attrs[MDBE_ATTR_RTPROT])
+ desc->rt_protocol = nla_get_u8(mdbe_attrs[MDBE_ATTR_RTPROT]);
+
+ return 0;
+}
+
+static void br_mdb_flush_host(struct net_bridge *br,
+ struct net_bridge_mdb_entry *mp,
+ const struct br_mdb_flush_desc *desc)
+{
+ u8 state;
+
+ if (desc->port_ifindex && desc->port_ifindex != br->dev->ifindex)
+ return;
+
+ if (desc->rt_protocol)
+ return;
+
+ state = br_group_is_l2(&mp->addr) ? MDB_PERMANENT : 0;
+ if (desc->state_mask && (state & desc->state_mask) != desc->state)
+ return;
+
+ br_multicast_host_leave(mp, true);
+ if (!mp->ports && netif_running(br->dev))
+ mod_timer(&mp->timer, jiffies);
+}
+
+static void br_mdb_flush_pgs(struct net_bridge *br,
+ struct net_bridge_mdb_entry *mp,
+ const struct br_mdb_flush_desc *desc)
+{
+ struct net_bridge_port_group __rcu **pp;
+ struct net_bridge_port_group *p;
+
+ for (pp = &mp->ports; (p = mlock_dereference(*pp, br)) != NULL;) {
+ u8 state;
+
+ if (desc->port_ifindex &&
+ desc->port_ifindex != p->key.port->dev->ifindex) {
+ pp = &p->next;
+ continue;
+ }
+
+ if (desc->rt_protocol && desc->rt_protocol != p->rt_protocol) {
+ pp = &p->next;
+ continue;
+ }
+
+ state = p->flags & MDB_PG_FLAGS_PERMANENT ? MDB_PERMANENT : 0;
+ if (desc->state_mask &&
+ (state & desc->state_mask) != desc->state) {
+ pp = &p->next;
+ continue;
+ }
+
+ br_multicast_del_pg(mp, p, pp);
+ }
+}
+
+static void br_mdb_flush(struct net_bridge *br,
+ const struct br_mdb_flush_desc *desc)
+{
+ struct net_bridge_mdb_entry *mp;
+
+ spin_lock_bh(&br->multicast_lock);
+
+ /* Safe variant is not needed because entries are removed from the list
+ * upon group timer expiration or bridge deletion.
+ */
+ hlist_for_each_entry(mp, &br->mdb_list, mdb_node) {
+ if (desc->vid && desc->vid != mp->addr.vid)
+ continue;
+
+ br_mdb_flush_host(br, mp, desc);
+ br_mdb_flush_pgs(br, mp, desc);
+ }
+
+ spin_unlock_bh(&br->multicast_lock);
+}
+
+int br_mdb_del_bulk(struct net_device *dev, struct nlattr *tb[],
+ struct netlink_ext_ack *extack)
+{
+ struct net_bridge *br = netdev_priv(dev);
+ struct br_mdb_flush_desc desc = {};
+ int err;
+
+ err = br_mdb_flush_desc_init(&desc, tb, extack);
+ if (err)
+ return err;
+
+ br_mdb_flush(br, &desc);
+
+ return 0;
+}
+
+static const struct nla_policy br_mdbe_attrs_get_pol[MDBE_ATTR_MAX + 1] = {
+ [MDBE_ATTR_SOURCE] = NLA_POLICY_RANGE(NLA_BINARY,
+ sizeof(struct in_addr),
+ sizeof(struct in6_addr)),
+};
+
+static int br_mdb_get_parse(struct net_device *dev, struct nlattr *tb[],
+ struct br_ip *group, struct netlink_ext_ack *extack)
+{
+ struct br_mdb_entry *entry = nla_data(tb[MDBA_GET_ENTRY]);
+ struct nlattr *mdbe_attrs[MDBE_ATTR_MAX + 1];
+ int err;
+
+ if (!tb[MDBA_GET_ENTRY_ATTRS]) {
+ __mdb_entry_to_br_ip(entry, group, NULL);
+ return 0;
+ }
+
+ err = nla_parse_nested(mdbe_attrs, MDBE_ATTR_MAX,
+ tb[MDBA_GET_ENTRY_ATTRS], br_mdbe_attrs_get_pol,
+ extack);
+ if (err)
+ return err;
+
+ if (mdbe_attrs[MDBE_ATTR_SOURCE] &&
+ !is_valid_mdb_source(mdbe_attrs[MDBE_ATTR_SOURCE],
+ entry->addr.proto, extack))
+ return -EINVAL;
+
+ __mdb_entry_to_br_ip(entry, group, mdbe_attrs);
+
+ return 0;
+}
+
+static struct sk_buff *
+br_mdb_get_reply_alloc(const struct net_bridge_mdb_entry *mp)
+{
+ struct net_bridge_port_group *pg;
+ size_t nlmsg_size;
+
+ nlmsg_size = NLMSG_ALIGN(sizeof(struct br_port_msg)) +
+ /* MDBA_MDB */
+ nla_total_size(0) +
+ /* MDBA_MDB_ENTRY */
+ nla_total_size(0);
+
+ if (mp->host_joined)
+ nlmsg_size += rtnl_mdb_nlmsg_pg_size(NULL);
+
+ for (pg = mlock_dereference(mp->ports, mp->br); pg;
+ pg = mlock_dereference(pg->next, mp->br))
+ nlmsg_size += rtnl_mdb_nlmsg_pg_size(pg);
+
+ return nlmsg_new(nlmsg_size, GFP_ATOMIC);
+}
+
+static int br_mdb_get_reply_fill(struct sk_buff *skb,
+ struct net_bridge_mdb_entry *mp, u32 portid,
+ u32 seq)
+{
+ struct nlattr *mdb_nest, *mdb_entry_nest;
+ struct net_bridge_port_group *pg;
+ struct br_port_msg *bpm;
+ struct nlmsghdr *nlh;
+ int err;
+
+ nlh = nlmsg_put(skb, portid, seq, RTM_NEWMDB, sizeof(*bpm), 0);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ bpm = nlmsg_data(nlh);
+ memset(bpm, 0, sizeof(*bpm));
+ bpm->family = AF_BRIDGE;
+ bpm->ifindex = mp->br->dev->ifindex;
+ mdb_nest = nla_nest_start_noflag(skb, MDBA_MDB);
+ if (!mdb_nest) {
+ err = -EMSGSIZE;
+ goto cancel;
+ }
+ mdb_entry_nest = nla_nest_start_noflag(skb, MDBA_MDB_ENTRY);
+ if (!mdb_entry_nest) {
+ err = -EMSGSIZE;
+ goto cancel;
+ }
+
+ if (mp->host_joined) {
+ err = __mdb_fill_info(skb, mp, NULL);
+ if (err)
+ goto cancel;
+ }
+
+ for (pg = mlock_dereference(mp->ports, mp->br); pg;
+ pg = mlock_dereference(pg->next, mp->br)) {
+ err = __mdb_fill_info(skb, mp, pg);
+ if (err)
+ goto cancel;
+ }
+
+ nla_nest_end(skb, mdb_entry_nest);
+ nla_nest_end(skb, mdb_nest);
+ nlmsg_end(skb, nlh);
+
+ return 0;
+
+cancel:
+ nlmsg_cancel(skb, nlh);
+ return err;
+}
+
+int br_mdb_get(struct net_device *dev, struct nlattr *tb[], u32 portid, u32 seq,
+ struct netlink_ext_ack *extack)
+{
+ struct net_bridge *br = netdev_priv(dev);
+ struct net_bridge_mdb_entry *mp;
+ struct sk_buff *skb;
+ struct br_ip group;
+ int err;
+
+ err = br_mdb_get_parse(dev, tb, &group, extack);
+ if (err)
+ return err;
+
+ /* Hold the multicast lock to ensure that the MDB entry does not change
+ * between the time the reply size is determined and when the reply is
+ * filled in.
+ */
+ spin_lock_bh(&br->multicast_lock);
+
+ mp = br_mdb_ip_get(br, &group);
+ if (!mp || (!mp->ports && !mp->host_joined)) {
+ NL_SET_ERR_MSG_MOD(extack, "MDB entry not found");
+ err = -ENOENT;
+ goto unlock;
+ }
+
+ skb = br_mdb_get_reply_alloc(mp);
+ if (!skb) {
+ err = -ENOMEM;
+ goto unlock;
+ }
+
+ err = br_mdb_get_reply_fill(skb, mp, portid, seq);
+ if (err) {
+ NL_SET_ERR_MSG_MOD(extack, "Failed to fill MDB get reply");
+ goto free;
+ }
+
+ spin_unlock_bh(&br->multicast_lock);
+
+ return rtnl_unicast(skb, dev_net(dev), portid);
+
+free:
+ kfree_skb(skb);
+unlock:
+ spin_unlock_bh(&br->multicast_lock);
+ return err;
+}
diff --git a/net/bridge/br_mrp.c b/net/bridge/br_mrp.c
new file mode 100644
index 000000000000..3c36fa24bc05
--- /dev/null
+++ b/net/bridge/br_mrp.c
@@ -0,0 +1,1260 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/mrp_bridge.h>
+#include "br_private_mrp.h"
+
+static const u8 mrp_test_dmac[ETH_ALEN] = { 0x1, 0x15, 0x4e, 0x0, 0x0, 0x1 };
+static const u8 mrp_in_test_dmac[ETH_ALEN] = { 0x1, 0x15, 0x4e, 0x0, 0x0, 0x3 };
+
+static int br_mrp_process(struct net_bridge_port *p, struct sk_buff *skb);
+
+static struct br_frame_type mrp_frame_type __read_mostly = {
+ .type = cpu_to_be16(ETH_P_MRP),
+ .frame_handler = br_mrp_process,
+};
+
+static bool br_mrp_is_ring_port(struct net_bridge_port *p_port,
+ struct net_bridge_port *s_port,
+ struct net_bridge_port *port)
+{
+ if (port == p_port ||
+ port == s_port)
+ return true;
+
+ return false;
+}
+
+static bool br_mrp_is_in_port(struct net_bridge_port *i_port,
+ struct net_bridge_port *port)
+{
+ if (port == i_port)
+ return true;
+
+ return false;
+}
+
+static struct net_bridge_port *br_mrp_get_port(struct net_bridge *br,
+ u32 ifindex)
+{
+ struct net_bridge_port *res = NULL;
+ struct net_bridge_port *port;
+
+ list_for_each_entry(port, &br->port_list, list) {
+ if (port->dev->ifindex == ifindex) {
+ res = port;
+ break;
+ }
+ }
+
+ return res;
+}
+
+static struct br_mrp *br_mrp_find_id(struct net_bridge *br, u32 ring_id)
+{
+ struct br_mrp *res = NULL;
+ struct br_mrp *mrp;
+
+ hlist_for_each_entry_rcu(mrp, &br->mrp_list, list,
+ lockdep_rtnl_is_held()) {
+ if (mrp->ring_id == ring_id) {
+ res = mrp;
+ break;
+ }
+ }
+
+ return res;
+}
+
+static struct br_mrp *br_mrp_find_in_id(struct net_bridge *br, u32 in_id)
+{
+ struct br_mrp *res = NULL;
+ struct br_mrp *mrp;
+
+ hlist_for_each_entry_rcu(mrp, &br->mrp_list, list,
+ lockdep_rtnl_is_held()) {
+ if (mrp->in_id == in_id) {
+ res = mrp;
+ break;
+ }
+ }
+
+ return res;
+}
+
+static bool br_mrp_unique_ifindex(struct net_bridge *br, u32 ifindex)
+{
+ struct br_mrp *mrp;
+
+ hlist_for_each_entry_rcu(mrp, &br->mrp_list, list,
+ lockdep_rtnl_is_held()) {
+ struct net_bridge_port *p;
+
+ p = rtnl_dereference(mrp->p_port);
+ if (p && p->dev->ifindex == ifindex)
+ return false;
+
+ p = rtnl_dereference(mrp->s_port);
+ if (p && p->dev->ifindex == ifindex)
+ return false;
+
+ p = rtnl_dereference(mrp->i_port);
+ if (p && p->dev->ifindex == ifindex)
+ return false;
+ }
+
+ return true;
+}
+
+static struct br_mrp *br_mrp_find_port(struct net_bridge *br,
+ struct net_bridge_port *p)
+{
+ struct br_mrp *res = NULL;
+ struct br_mrp *mrp;
+
+ hlist_for_each_entry_rcu(mrp, &br->mrp_list, list,
+ lockdep_rtnl_is_held()) {
+ if (rcu_access_pointer(mrp->p_port) == p ||
+ rcu_access_pointer(mrp->s_port) == p ||
+ rcu_access_pointer(mrp->i_port) == p) {
+ res = mrp;
+ break;
+ }
+ }
+
+ return res;
+}
+
+static int br_mrp_next_seq(struct br_mrp *mrp)
+{
+ mrp->seq_id++;
+ return mrp->seq_id;
+}
+
+static struct sk_buff *br_mrp_skb_alloc(struct net_bridge_port *p,
+ const u8 *src, const u8 *dst)
+{
+ struct ethhdr *eth_hdr;
+ struct sk_buff *skb;
+ __be16 *version;
+
+ skb = dev_alloc_skb(MRP_MAX_FRAME_LENGTH);
+ if (!skb)
+ return NULL;
+
+ skb->dev = p->dev;
+ skb->protocol = htons(ETH_P_MRP);
+ skb->priority = MRP_FRAME_PRIO;
+ skb_reserve(skb, sizeof(*eth_hdr));
+
+ eth_hdr = skb_push(skb, sizeof(*eth_hdr));
+ ether_addr_copy(eth_hdr->h_dest, dst);
+ ether_addr_copy(eth_hdr->h_source, src);
+ eth_hdr->h_proto = htons(ETH_P_MRP);
+
+ version = skb_put(skb, sizeof(*version));
+ *version = cpu_to_be16(MRP_VERSION);
+
+ return skb;
+}
+
+static void br_mrp_skb_tlv(struct sk_buff *skb,
+ enum br_mrp_tlv_header_type type,
+ u8 length)
+{
+ struct br_mrp_tlv_hdr *hdr;
+
+ hdr = skb_put(skb, sizeof(*hdr));
+ hdr->type = type;
+ hdr->length = length;
+}
+
+static void br_mrp_skb_common(struct sk_buff *skb, struct br_mrp *mrp)
+{
+ struct br_mrp_common_hdr *hdr;
+
+ br_mrp_skb_tlv(skb, BR_MRP_TLV_HEADER_COMMON, sizeof(*hdr));
+
+ hdr = skb_put(skb, sizeof(*hdr));
+ hdr->seq_id = cpu_to_be16(br_mrp_next_seq(mrp));
+ memset(hdr->domain, 0xff, MRP_DOMAIN_UUID_LENGTH);
+}
+
+static struct sk_buff *br_mrp_alloc_test_skb(struct br_mrp *mrp,
+ struct net_bridge_port *p,
+ enum br_mrp_port_role_type port_role)
+{
+ struct br_mrp_ring_test_hdr *hdr = NULL;
+ struct sk_buff *skb = NULL;
+
+ if (!p)
+ return NULL;
+
+ skb = br_mrp_skb_alloc(p, p->dev->dev_addr, mrp_test_dmac);
+ if (!skb)
+ return NULL;
+
+ br_mrp_skb_tlv(skb, BR_MRP_TLV_HEADER_RING_TEST, sizeof(*hdr));
+ hdr = skb_put(skb, sizeof(*hdr));
+
+ hdr->prio = cpu_to_be16(mrp->prio);
+ ether_addr_copy(hdr->sa, p->br->dev->dev_addr);
+ hdr->port_role = cpu_to_be16(port_role);
+ hdr->state = cpu_to_be16(mrp->ring_state);
+ hdr->transitions = cpu_to_be16(mrp->ring_transitions);
+ hdr->timestamp = cpu_to_be32(jiffies_to_msecs(jiffies));
+
+ br_mrp_skb_common(skb, mrp);
+
+ /* In case the node behaves as MRA then the Test frame needs to have
+ * an Option TLV which includes eventually a sub-option TLV that has
+ * the type AUTO_MGR
+ */
+ if (mrp->ring_role == BR_MRP_RING_ROLE_MRA) {
+ struct br_mrp_sub_option1_hdr *sub_opt = NULL;
+ struct br_mrp_tlv_hdr *sub_tlv = NULL;
+ struct br_mrp_oui_hdr *oui = NULL;
+ u8 length;
+
+ length = sizeof(*sub_opt) + sizeof(*sub_tlv) + sizeof(oui) +
+ MRP_OPT_PADDING;
+ br_mrp_skb_tlv(skb, BR_MRP_TLV_HEADER_OPTION, length);
+
+ oui = skb_put(skb, sizeof(*oui));
+ memset(oui, 0x0, sizeof(*oui));
+ sub_opt = skb_put(skb, sizeof(*sub_opt));
+ memset(sub_opt, 0x0, sizeof(*sub_opt));
+
+ sub_tlv = skb_put(skb, sizeof(*sub_tlv));
+ sub_tlv->type = BR_MRP_SUB_TLV_HEADER_TEST_AUTO_MGR;
+
+ /* 32 bit alligment shall be ensured therefore add 2 bytes */
+ skb_put(skb, MRP_OPT_PADDING);
+ }
+
+ br_mrp_skb_tlv(skb, BR_MRP_TLV_HEADER_END, 0x0);
+
+ return skb;
+}
+
+static struct sk_buff *br_mrp_alloc_in_test_skb(struct br_mrp *mrp,
+ struct net_bridge_port *p,
+ enum br_mrp_port_role_type port_role)
+{
+ struct br_mrp_in_test_hdr *hdr = NULL;
+ struct sk_buff *skb = NULL;
+
+ if (!p)
+ return NULL;
+
+ skb = br_mrp_skb_alloc(p, p->dev->dev_addr, mrp_in_test_dmac);
+ if (!skb)
+ return NULL;
+
+ br_mrp_skb_tlv(skb, BR_MRP_TLV_HEADER_IN_TEST, sizeof(*hdr));
+ hdr = skb_put(skb, sizeof(*hdr));
+
+ hdr->id = cpu_to_be16(mrp->in_id);
+ ether_addr_copy(hdr->sa, p->br->dev->dev_addr);
+ hdr->port_role = cpu_to_be16(port_role);
+ hdr->state = cpu_to_be16(mrp->in_state);
+ hdr->transitions = cpu_to_be16(mrp->in_transitions);
+ hdr->timestamp = cpu_to_be32(jiffies_to_msecs(jiffies));
+
+ br_mrp_skb_common(skb, mrp);
+ br_mrp_skb_tlv(skb, BR_MRP_TLV_HEADER_END, 0x0);
+
+ return skb;
+}
+
+/* This function is continuously called in the following cases:
+ * - when node role is MRM, in this case test_monitor is always set to false
+ * because it needs to notify the userspace that the ring is open and needs to
+ * send MRP_Test frames
+ * - when node role is MRA, there are 2 subcases:
+ * - when MRA behaves as MRM, in this case is similar with MRM role
+ * - when MRA behaves as MRC, in this case test_monitor is set to true,
+ * because it needs to detect when it stops seeing MRP_Test frames
+ * from MRM node but it doesn't need to send MRP_Test frames.
+ */
+static void br_mrp_test_work_expired(struct work_struct *work)
+{
+ struct delayed_work *del_work = to_delayed_work(work);
+ struct br_mrp *mrp = container_of(del_work, struct br_mrp, test_work);
+ struct net_bridge_port *p;
+ bool notify_open = false;
+ struct sk_buff *skb;
+
+ if (time_before_eq(mrp->test_end, jiffies))
+ return;
+
+ if (mrp->test_count_miss < mrp->test_max_miss) {
+ mrp->test_count_miss++;
+ } else {
+ /* Notify that the ring is open only if the ring state is
+ * closed, otherwise it would continue to notify at every
+ * interval.
+ * Also notify that the ring is open when the node has the
+ * role MRA and behaves as MRC. The reason is that the
+ * userspace needs to know when the MRM stopped sending
+ * MRP_Test frames so that the current node to try to take
+ * the role of a MRM.
+ */
+ if (mrp->ring_state == BR_MRP_RING_STATE_CLOSED ||
+ mrp->test_monitor)
+ notify_open = true;
+ }
+
+ rcu_read_lock();
+
+ p = rcu_dereference(mrp->p_port);
+ if (p) {
+ if (!mrp->test_monitor) {
+ skb = br_mrp_alloc_test_skb(mrp, p,
+ BR_MRP_PORT_ROLE_PRIMARY);
+ if (!skb)
+ goto out;
+
+ skb_reset_network_header(skb);
+ dev_queue_xmit(skb);
+ }
+
+ if (notify_open && !mrp->ring_role_offloaded)
+ br_mrp_ring_port_open(p->dev, true);
+ }
+
+ p = rcu_dereference(mrp->s_port);
+ if (p) {
+ if (!mrp->test_monitor) {
+ skb = br_mrp_alloc_test_skb(mrp, p,
+ BR_MRP_PORT_ROLE_SECONDARY);
+ if (!skb)
+ goto out;
+
+ skb_reset_network_header(skb);
+ dev_queue_xmit(skb);
+ }
+
+ if (notify_open && !mrp->ring_role_offloaded)
+ br_mrp_ring_port_open(p->dev, true);
+ }
+
+out:
+ rcu_read_unlock();
+
+ queue_delayed_work(system_percpu_wq, &mrp->test_work,
+ usecs_to_jiffies(mrp->test_interval));
+}
+
+/* This function is continuously called when the node has the interconnect role
+ * MIM. It would generate interconnect test frames and will send them on all 3
+ * ports. But will also check if it stop receiving interconnect test frames.
+ */
+static void br_mrp_in_test_work_expired(struct work_struct *work)
+{
+ struct delayed_work *del_work = to_delayed_work(work);
+ struct br_mrp *mrp = container_of(del_work, struct br_mrp, in_test_work);
+ struct net_bridge_port *p;
+ bool notify_open = false;
+ struct sk_buff *skb;
+
+ if (time_before_eq(mrp->in_test_end, jiffies))
+ return;
+
+ if (mrp->in_test_count_miss < mrp->in_test_max_miss) {
+ mrp->in_test_count_miss++;
+ } else {
+ /* Notify that the interconnect ring is open only if the
+ * interconnect ring state is closed, otherwise it would
+ * continue to notify at every interval.
+ */
+ if (mrp->in_state == BR_MRP_IN_STATE_CLOSED)
+ notify_open = true;
+ }
+
+ rcu_read_lock();
+
+ p = rcu_dereference(mrp->p_port);
+ if (p) {
+ skb = br_mrp_alloc_in_test_skb(mrp, p,
+ BR_MRP_PORT_ROLE_PRIMARY);
+ if (!skb)
+ goto out;
+
+ skb_reset_network_header(skb);
+ dev_queue_xmit(skb);
+
+ if (notify_open && !mrp->in_role_offloaded)
+ br_mrp_in_port_open(p->dev, true);
+ }
+
+ p = rcu_dereference(mrp->s_port);
+ if (p) {
+ skb = br_mrp_alloc_in_test_skb(mrp, p,
+ BR_MRP_PORT_ROLE_SECONDARY);
+ if (!skb)
+ goto out;
+
+ skb_reset_network_header(skb);
+ dev_queue_xmit(skb);
+
+ if (notify_open && !mrp->in_role_offloaded)
+ br_mrp_in_port_open(p->dev, true);
+ }
+
+ p = rcu_dereference(mrp->i_port);
+ if (p) {
+ skb = br_mrp_alloc_in_test_skb(mrp, p,
+ BR_MRP_PORT_ROLE_INTER);
+ if (!skb)
+ goto out;
+
+ skb_reset_network_header(skb);
+ dev_queue_xmit(skb);
+
+ if (notify_open && !mrp->in_role_offloaded)
+ br_mrp_in_port_open(p->dev, true);
+ }
+
+out:
+ rcu_read_unlock();
+
+ queue_delayed_work(system_percpu_wq, &mrp->in_test_work,
+ usecs_to_jiffies(mrp->in_test_interval));
+}
+
+/* Deletes the MRP instance.
+ * note: called under rtnl_lock
+ */
+static void br_mrp_del_impl(struct net_bridge *br, struct br_mrp *mrp)
+{
+ struct net_bridge_port *p;
+ u8 state;
+
+ /* Stop sending MRP_Test frames */
+ cancel_delayed_work_sync(&mrp->test_work);
+ br_mrp_switchdev_send_ring_test(br, mrp, 0, 0, 0, 0);
+
+ /* Stop sending MRP_InTest frames if has an interconnect role */
+ cancel_delayed_work_sync(&mrp->in_test_work);
+ br_mrp_switchdev_send_in_test(br, mrp, 0, 0, 0);
+
+ /* Disable the roles */
+ br_mrp_switchdev_set_ring_role(br, mrp, BR_MRP_RING_ROLE_DISABLED);
+ p = rtnl_dereference(mrp->i_port);
+ if (p)
+ br_mrp_switchdev_set_in_role(br, mrp, mrp->in_id, mrp->ring_id,
+ BR_MRP_IN_ROLE_DISABLED);
+
+ br_mrp_switchdev_del(br, mrp);
+
+ /* Reset the ports */
+ p = rtnl_dereference(mrp->p_port);
+ if (p) {
+ spin_lock_bh(&br->lock);
+ state = netif_running(br->dev) ?
+ BR_STATE_FORWARDING : BR_STATE_DISABLED;
+ p->state = state;
+ p->flags &= ~BR_MRP_AWARE;
+ spin_unlock_bh(&br->lock);
+ br_mrp_port_switchdev_set_state(p, state);
+ rcu_assign_pointer(mrp->p_port, NULL);
+ }
+
+ p = rtnl_dereference(mrp->s_port);
+ if (p) {
+ spin_lock_bh(&br->lock);
+ state = netif_running(br->dev) ?
+ BR_STATE_FORWARDING : BR_STATE_DISABLED;
+ p->state = state;
+ p->flags &= ~BR_MRP_AWARE;
+ spin_unlock_bh(&br->lock);
+ br_mrp_port_switchdev_set_state(p, state);
+ rcu_assign_pointer(mrp->s_port, NULL);
+ }
+
+ p = rtnl_dereference(mrp->i_port);
+ if (p) {
+ spin_lock_bh(&br->lock);
+ state = netif_running(br->dev) ?
+ BR_STATE_FORWARDING : BR_STATE_DISABLED;
+ p->state = state;
+ p->flags &= ~BR_MRP_AWARE;
+ spin_unlock_bh(&br->lock);
+ br_mrp_port_switchdev_set_state(p, state);
+ rcu_assign_pointer(mrp->i_port, NULL);
+ }
+
+ hlist_del_rcu(&mrp->list);
+ kfree_rcu(mrp, rcu);
+
+ if (hlist_empty(&br->mrp_list))
+ br_del_frame(br, &mrp_frame_type);
+}
+
+/* Adds a new MRP instance.
+ * note: called under rtnl_lock
+ */
+int br_mrp_add(struct net_bridge *br, struct br_mrp_instance *instance)
+{
+ struct net_bridge_port *p;
+ struct br_mrp *mrp;
+ int err;
+
+ /* If the ring exists, it is not possible to create another one with the
+ * same ring_id
+ */
+ mrp = br_mrp_find_id(br, instance->ring_id);
+ if (mrp)
+ return -EINVAL;
+
+ if (!br_mrp_get_port(br, instance->p_ifindex) ||
+ !br_mrp_get_port(br, instance->s_ifindex))
+ return -EINVAL;
+
+ /* It is not possible to have the same port part of multiple rings */
+ if (!br_mrp_unique_ifindex(br, instance->p_ifindex) ||
+ !br_mrp_unique_ifindex(br, instance->s_ifindex))
+ return -EINVAL;
+
+ mrp = kzalloc(sizeof(*mrp), GFP_KERNEL);
+ if (!mrp)
+ return -ENOMEM;
+
+ mrp->ring_id = instance->ring_id;
+ mrp->prio = instance->prio;
+
+ p = br_mrp_get_port(br, instance->p_ifindex);
+ spin_lock_bh(&br->lock);
+ p->state = BR_STATE_FORWARDING;
+ p->flags |= BR_MRP_AWARE;
+ spin_unlock_bh(&br->lock);
+ rcu_assign_pointer(mrp->p_port, p);
+
+ p = br_mrp_get_port(br, instance->s_ifindex);
+ spin_lock_bh(&br->lock);
+ p->state = BR_STATE_FORWARDING;
+ p->flags |= BR_MRP_AWARE;
+ spin_unlock_bh(&br->lock);
+ rcu_assign_pointer(mrp->s_port, p);
+
+ if (hlist_empty(&br->mrp_list))
+ br_add_frame(br, &mrp_frame_type);
+
+ INIT_DELAYED_WORK(&mrp->test_work, br_mrp_test_work_expired);
+ INIT_DELAYED_WORK(&mrp->in_test_work, br_mrp_in_test_work_expired);
+ hlist_add_tail_rcu(&mrp->list, &br->mrp_list);
+
+ err = br_mrp_switchdev_add(br, mrp);
+ if (err)
+ goto delete_mrp;
+
+ return 0;
+
+delete_mrp:
+ br_mrp_del_impl(br, mrp);
+
+ return err;
+}
+
+/* Deletes the MRP instance from which the port is part of
+ * note: called under rtnl_lock
+ */
+void br_mrp_port_del(struct net_bridge *br, struct net_bridge_port *p)
+{
+ struct br_mrp *mrp = br_mrp_find_port(br, p);
+
+ /* If the port is not part of a MRP instance just bail out */
+ if (!mrp)
+ return;
+
+ br_mrp_del_impl(br, mrp);
+}
+
+/* Deletes existing MRP instance based on ring_id
+ * note: called under rtnl_lock
+ */
+int br_mrp_del(struct net_bridge *br, struct br_mrp_instance *instance)
+{
+ struct br_mrp *mrp = br_mrp_find_id(br, instance->ring_id);
+
+ if (!mrp)
+ return -EINVAL;
+
+ br_mrp_del_impl(br, mrp);
+
+ return 0;
+}
+
+/* Set port state, port state can be forwarding, blocked or disabled
+ * note: already called with rtnl_lock
+ */
+int br_mrp_set_port_state(struct net_bridge_port *p,
+ enum br_mrp_port_state_type state)
+{
+ u32 port_state;
+
+ if (!p || !(p->flags & BR_MRP_AWARE))
+ return -EINVAL;
+
+ spin_lock_bh(&p->br->lock);
+
+ if (state == BR_MRP_PORT_STATE_FORWARDING)
+ port_state = BR_STATE_FORWARDING;
+ else
+ port_state = BR_STATE_BLOCKING;
+
+ p->state = port_state;
+ spin_unlock_bh(&p->br->lock);
+
+ br_mrp_port_switchdev_set_state(p, port_state);
+
+ return 0;
+}
+
+/* Set port role, port role can be primary or secondary
+ * note: already called with rtnl_lock
+ */
+int br_mrp_set_port_role(struct net_bridge_port *p,
+ enum br_mrp_port_role_type role)
+{
+ struct br_mrp *mrp;
+
+ if (!p || !(p->flags & BR_MRP_AWARE))
+ return -EINVAL;
+
+ mrp = br_mrp_find_port(p->br, p);
+
+ if (!mrp)
+ return -EINVAL;
+
+ switch (role) {
+ case BR_MRP_PORT_ROLE_PRIMARY:
+ rcu_assign_pointer(mrp->p_port, p);
+ break;
+ case BR_MRP_PORT_ROLE_SECONDARY:
+ rcu_assign_pointer(mrp->s_port, p);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ br_mrp_port_switchdev_set_role(p, role);
+
+ return 0;
+}
+
+/* Set ring state, ring state can be only Open or Closed
+ * note: already called with rtnl_lock
+ */
+int br_mrp_set_ring_state(struct net_bridge *br,
+ struct br_mrp_ring_state *state)
+{
+ struct br_mrp *mrp = br_mrp_find_id(br, state->ring_id);
+
+ if (!mrp)
+ return -EINVAL;
+
+ if (mrp->ring_state != state->ring_state)
+ mrp->ring_transitions++;
+
+ mrp->ring_state = state->ring_state;
+
+ br_mrp_switchdev_set_ring_state(br, mrp, state->ring_state);
+
+ return 0;
+}
+
+/* Set ring role, ring role can be only MRM(Media Redundancy Manager) or
+ * MRC(Media Redundancy Client).
+ * note: already called with rtnl_lock
+ */
+int br_mrp_set_ring_role(struct net_bridge *br,
+ struct br_mrp_ring_role *role)
+{
+ struct br_mrp *mrp = br_mrp_find_id(br, role->ring_id);
+ enum br_mrp_hw_support support;
+
+ if (!mrp)
+ return -EINVAL;
+
+ mrp->ring_role = role->ring_role;
+
+ /* If there is an error just bailed out */
+ support = br_mrp_switchdev_set_ring_role(br, mrp, role->ring_role);
+ if (support == BR_MRP_NONE)
+ return -EOPNOTSUPP;
+
+ /* Now detect if the HW actually applied the role or not. If the HW
+ * applied the role it means that the SW will not to do those operations
+ * anymore. For example if the role ir MRM then the HW will notify the
+ * SW when ring is open, but if the is not pushed to the HW the SW will
+ * need to detect when the ring is open
+ */
+ mrp->ring_role_offloaded = support == BR_MRP_SW ? 0 : 1;
+
+ return 0;
+}
+
+/* Start to generate or monitor MRP test frames, the frames are generated by
+ * HW and if it fails, they are generated by the SW.
+ * note: already called with rtnl_lock
+ */
+int br_mrp_start_test(struct net_bridge *br,
+ struct br_mrp_start_test *test)
+{
+ struct br_mrp *mrp = br_mrp_find_id(br, test->ring_id);
+ enum br_mrp_hw_support support;
+
+ if (!mrp)
+ return -EINVAL;
+
+ /* Try to push it to the HW and if it fails then continue with SW
+ * implementation and if that also fails then return error.
+ */
+ support = br_mrp_switchdev_send_ring_test(br, mrp, test->interval,
+ test->max_miss, test->period,
+ test->monitor);
+ if (support == BR_MRP_NONE)
+ return -EOPNOTSUPP;
+
+ if (support == BR_MRP_HW)
+ return 0;
+
+ mrp->test_interval = test->interval;
+ mrp->test_end = jiffies + usecs_to_jiffies(test->period);
+ mrp->test_max_miss = test->max_miss;
+ mrp->test_monitor = test->monitor;
+ mrp->test_count_miss = 0;
+ queue_delayed_work(system_percpu_wq, &mrp->test_work,
+ usecs_to_jiffies(test->interval));
+
+ return 0;
+}
+
+/* Set in state, int state can be only Open or Closed
+ * note: already called with rtnl_lock
+ */
+int br_mrp_set_in_state(struct net_bridge *br, struct br_mrp_in_state *state)
+{
+ struct br_mrp *mrp = br_mrp_find_in_id(br, state->in_id);
+
+ if (!mrp)
+ return -EINVAL;
+
+ if (mrp->in_state != state->in_state)
+ mrp->in_transitions++;
+
+ mrp->in_state = state->in_state;
+
+ br_mrp_switchdev_set_in_state(br, mrp, state->in_state);
+
+ return 0;
+}
+
+/* Set in role, in role can be only MIM(Media Interconnection Manager) or
+ * MIC(Media Interconnection Client).
+ * note: already called with rtnl_lock
+ */
+int br_mrp_set_in_role(struct net_bridge *br, struct br_mrp_in_role *role)
+{
+ struct br_mrp *mrp = br_mrp_find_id(br, role->ring_id);
+ enum br_mrp_hw_support support;
+ struct net_bridge_port *p;
+
+ if (!mrp)
+ return -EINVAL;
+
+ if (!br_mrp_get_port(br, role->i_ifindex))
+ return -EINVAL;
+
+ if (role->in_role == BR_MRP_IN_ROLE_DISABLED) {
+ u8 state;
+
+ /* It is not allowed to disable a port that doesn't exist */
+ p = rtnl_dereference(mrp->i_port);
+ if (!p)
+ return -EINVAL;
+
+ /* Stop the generating MRP_InTest frames */
+ cancel_delayed_work_sync(&mrp->in_test_work);
+ br_mrp_switchdev_send_in_test(br, mrp, 0, 0, 0);
+
+ /* Remove the port */
+ spin_lock_bh(&br->lock);
+ state = netif_running(br->dev) ?
+ BR_STATE_FORWARDING : BR_STATE_DISABLED;
+ p->state = state;
+ p->flags &= ~BR_MRP_AWARE;
+ spin_unlock_bh(&br->lock);
+ br_mrp_port_switchdev_set_state(p, state);
+ rcu_assign_pointer(mrp->i_port, NULL);
+
+ mrp->in_role = role->in_role;
+ mrp->in_id = 0;
+
+ return 0;
+ }
+
+ /* It is not possible to have the same port part of multiple rings */
+ if (!br_mrp_unique_ifindex(br, role->i_ifindex))
+ return -EINVAL;
+
+ /* It is not allowed to set a different interconnect port if the mrp
+ * instance has already one. First it needs to be disabled and after
+ * that set the new port
+ */
+ if (rcu_access_pointer(mrp->i_port))
+ return -EINVAL;
+
+ p = br_mrp_get_port(br, role->i_ifindex);
+ spin_lock_bh(&br->lock);
+ p->state = BR_STATE_FORWARDING;
+ p->flags |= BR_MRP_AWARE;
+ spin_unlock_bh(&br->lock);
+ rcu_assign_pointer(mrp->i_port, p);
+
+ mrp->in_role = role->in_role;
+ mrp->in_id = role->in_id;
+
+ /* If there is an error just bailed out */
+ support = br_mrp_switchdev_set_in_role(br, mrp, role->in_id,
+ role->ring_id, role->in_role);
+ if (support == BR_MRP_NONE)
+ return -EOPNOTSUPP;
+
+ /* Now detect if the HW actually applied the role or not. If the HW
+ * applied the role it means that the SW will not to do those operations
+ * anymore. For example if the role is MIM then the HW will notify the
+ * SW when interconnect ring is open, but if the is not pushed to the HW
+ * the SW will need to detect when the interconnect ring is open.
+ */
+ mrp->in_role_offloaded = support == BR_MRP_SW ? 0 : 1;
+
+ return 0;
+}
+
+/* Start to generate MRP_InTest frames, the frames are generated by
+ * HW and if it fails, they are generated by the SW.
+ * note: already called with rtnl_lock
+ */
+int br_mrp_start_in_test(struct net_bridge *br,
+ struct br_mrp_start_in_test *in_test)
+{
+ struct br_mrp *mrp = br_mrp_find_in_id(br, in_test->in_id);
+ enum br_mrp_hw_support support;
+
+ if (!mrp)
+ return -EINVAL;
+
+ if (mrp->in_role != BR_MRP_IN_ROLE_MIM)
+ return -EINVAL;
+
+ /* Try to push it to the HW and if it fails then continue with SW
+ * implementation and if that also fails then return error.
+ */
+ support = br_mrp_switchdev_send_in_test(br, mrp, in_test->interval,
+ in_test->max_miss,
+ in_test->period);
+ if (support == BR_MRP_NONE)
+ return -EOPNOTSUPP;
+
+ if (support == BR_MRP_HW)
+ return 0;
+
+ mrp->in_test_interval = in_test->interval;
+ mrp->in_test_end = jiffies + usecs_to_jiffies(in_test->period);
+ mrp->in_test_max_miss = in_test->max_miss;
+ mrp->in_test_count_miss = 0;
+ queue_delayed_work(system_percpu_wq, &mrp->in_test_work,
+ usecs_to_jiffies(in_test->interval));
+
+ return 0;
+}
+
+/* Determine if the frame type is a ring frame */
+static bool br_mrp_ring_frame(struct sk_buff *skb)
+{
+ const struct br_mrp_tlv_hdr *hdr;
+ struct br_mrp_tlv_hdr _hdr;
+
+ hdr = skb_header_pointer(skb, sizeof(uint16_t), sizeof(_hdr), &_hdr);
+ if (!hdr)
+ return false;
+
+ if (hdr->type == BR_MRP_TLV_HEADER_RING_TEST ||
+ hdr->type == BR_MRP_TLV_HEADER_RING_TOPO ||
+ hdr->type == BR_MRP_TLV_HEADER_RING_LINK_DOWN ||
+ hdr->type == BR_MRP_TLV_HEADER_RING_LINK_UP ||
+ hdr->type == BR_MRP_TLV_HEADER_OPTION)
+ return true;
+
+ return false;
+}
+
+/* Determine if the frame type is an interconnect frame */
+static bool br_mrp_in_frame(struct sk_buff *skb)
+{
+ const struct br_mrp_tlv_hdr *hdr;
+ struct br_mrp_tlv_hdr _hdr;
+
+ hdr = skb_header_pointer(skb, sizeof(uint16_t), sizeof(_hdr), &_hdr);
+ if (!hdr)
+ return false;
+
+ if (hdr->type == BR_MRP_TLV_HEADER_IN_TEST ||
+ hdr->type == BR_MRP_TLV_HEADER_IN_TOPO ||
+ hdr->type == BR_MRP_TLV_HEADER_IN_LINK_DOWN ||
+ hdr->type == BR_MRP_TLV_HEADER_IN_LINK_UP ||
+ hdr->type == BR_MRP_TLV_HEADER_IN_LINK_STATUS)
+ return true;
+
+ return false;
+}
+
+/* Process only MRP Test frame. All the other MRP frames are processed by
+ * userspace application
+ * note: already called with rcu_read_lock
+ */
+static void br_mrp_mrm_process(struct br_mrp *mrp, struct net_bridge_port *port,
+ struct sk_buff *skb)
+{
+ const struct br_mrp_tlv_hdr *hdr;
+ struct br_mrp_tlv_hdr _hdr;
+
+ /* Each MRP header starts with a version field which is 16 bits.
+ * Therefore skip the version and get directly the TLV header.
+ */
+ hdr = skb_header_pointer(skb, sizeof(uint16_t), sizeof(_hdr), &_hdr);
+ if (!hdr)
+ return;
+
+ if (hdr->type != BR_MRP_TLV_HEADER_RING_TEST)
+ return;
+
+ mrp->test_count_miss = 0;
+
+ /* Notify the userspace that the ring is closed only when the ring is
+ * not closed
+ */
+ if (mrp->ring_state != BR_MRP_RING_STATE_CLOSED)
+ br_mrp_ring_port_open(port->dev, false);
+}
+
+/* Determine if the test hdr has a better priority than the node */
+static bool br_mrp_test_better_than_own(struct br_mrp *mrp,
+ struct net_bridge *br,
+ const struct br_mrp_ring_test_hdr *hdr)
+{
+ u16 prio = be16_to_cpu(hdr->prio);
+
+ if (prio < mrp->prio ||
+ (prio == mrp->prio &&
+ ether_addr_to_u64(hdr->sa) < ether_addr_to_u64(br->dev->dev_addr)))
+ return true;
+
+ return false;
+}
+
+/* Process only MRP Test frame. All the other MRP frames are processed by
+ * userspace application
+ * note: already called with rcu_read_lock
+ */
+static void br_mrp_mra_process(struct br_mrp *mrp, struct net_bridge *br,
+ struct net_bridge_port *port,
+ struct sk_buff *skb)
+{
+ const struct br_mrp_ring_test_hdr *test_hdr;
+ struct br_mrp_ring_test_hdr _test_hdr;
+ const struct br_mrp_tlv_hdr *hdr;
+ struct br_mrp_tlv_hdr _hdr;
+
+ /* Each MRP header starts with a version field which is 16 bits.
+ * Therefore skip the version and get directly the TLV header.
+ */
+ hdr = skb_header_pointer(skb, sizeof(uint16_t), sizeof(_hdr), &_hdr);
+ if (!hdr)
+ return;
+
+ if (hdr->type != BR_MRP_TLV_HEADER_RING_TEST)
+ return;
+
+ test_hdr = skb_header_pointer(skb, sizeof(uint16_t) + sizeof(_hdr),
+ sizeof(_test_hdr), &_test_hdr);
+ if (!test_hdr)
+ return;
+
+ /* Only frames that have a better priority than the node will
+ * clear the miss counter because otherwise the node will need to behave
+ * as MRM.
+ */
+ if (br_mrp_test_better_than_own(mrp, br, test_hdr))
+ mrp->test_count_miss = 0;
+}
+
+/* Process only MRP InTest frame. All the other MRP frames are processed by
+ * userspace application
+ * note: already called with rcu_read_lock
+ */
+static bool br_mrp_mim_process(struct br_mrp *mrp, struct net_bridge_port *port,
+ struct sk_buff *skb)
+{
+ const struct br_mrp_in_test_hdr *in_hdr;
+ struct br_mrp_in_test_hdr _in_hdr;
+ const struct br_mrp_tlv_hdr *hdr;
+ struct br_mrp_tlv_hdr _hdr;
+
+ /* Each MRP header starts with a version field which is 16 bits.
+ * Therefore skip the version and get directly the TLV header.
+ */
+ hdr = skb_header_pointer(skb, sizeof(uint16_t), sizeof(_hdr), &_hdr);
+ if (!hdr)
+ return false;
+
+ /* The check for InTest frame type was already done */
+ in_hdr = skb_header_pointer(skb, sizeof(uint16_t) + sizeof(_hdr),
+ sizeof(_in_hdr), &_in_hdr);
+ if (!in_hdr)
+ return false;
+
+ /* It needs to process only it's own InTest frames. */
+ if (mrp->in_id != ntohs(in_hdr->id))
+ return false;
+
+ mrp->in_test_count_miss = 0;
+
+ /* Notify the userspace that the ring is closed only when the ring is
+ * not closed
+ */
+ if (mrp->in_state != BR_MRP_IN_STATE_CLOSED)
+ br_mrp_in_port_open(port->dev, false);
+
+ return true;
+}
+
+/* Get the MRP frame type
+ * note: already called with rcu_read_lock
+ */
+static u8 br_mrp_get_frame_type(struct sk_buff *skb)
+{
+ const struct br_mrp_tlv_hdr *hdr;
+ struct br_mrp_tlv_hdr _hdr;
+
+ /* Each MRP header starts with a version field which is 16 bits.
+ * Therefore skip the version and get directly the TLV header.
+ */
+ hdr = skb_header_pointer(skb, sizeof(uint16_t), sizeof(_hdr), &_hdr);
+ if (!hdr)
+ return 0xff;
+
+ return hdr->type;
+}
+
+static bool br_mrp_mrm_behaviour(struct br_mrp *mrp)
+{
+ if (mrp->ring_role == BR_MRP_RING_ROLE_MRM ||
+ (mrp->ring_role == BR_MRP_RING_ROLE_MRA && !mrp->test_monitor))
+ return true;
+
+ return false;
+}
+
+static bool br_mrp_mrc_behaviour(struct br_mrp *mrp)
+{
+ if (mrp->ring_role == BR_MRP_RING_ROLE_MRC ||
+ (mrp->ring_role == BR_MRP_RING_ROLE_MRA && mrp->test_monitor))
+ return true;
+
+ return false;
+}
+
+/* This will just forward the frame to the other mrp ring ports, depending on
+ * the frame type, ring role and interconnect role
+ * note: already called with rcu_read_lock
+ */
+static int br_mrp_rcv(struct net_bridge_port *p,
+ struct sk_buff *skb, struct net_device *dev)
+{
+ struct net_bridge_port *p_port, *s_port, *i_port = NULL;
+ struct net_bridge_port *p_dst, *s_dst, *i_dst = NULL;
+ struct net_bridge *br;
+ struct br_mrp *mrp;
+
+ /* If port is disabled don't accept any frames */
+ if (p->state == BR_STATE_DISABLED)
+ return 0;
+
+ br = p->br;
+ mrp = br_mrp_find_port(br, p);
+ if (unlikely(!mrp))
+ return 0;
+
+ p_port = rcu_dereference(mrp->p_port);
+ if (!p_port)
+ return 0;
+ p_dst = p_port;
+
+ s_port = rcu_dereference(mrp->s_port);
+ if (!s_port)
+ return 0;
+ s_dst = s_port;
+
+ /* If the frame is a ring frame then it is not required to check the
+ * interconnect role and ports to process or forward the frame
+ */
+ if (br_mrp_ring_frame(skb)) {
+ /* If the role is MRM then don't forward the frames */
+ if (mrp->ring_role == BR_MRP_RING_ROLE_MRM) {
+ br_mrp_mrm_process(mrp, p, skb);
+ goto no_forward;
+ }
+
+ /* If the role is MRA then don't forward the frames if it
+ * behaves as MRM node
+ */
+ if (mrp->ring_role == BR_MRP_RING_ROLE_MRA) {
+ if (!mrp->test_monitor) {
+ br_mrp_mrm_process(mrp, p, skb);
+ goto no_forward;
+ }
+
+ br_mrp_mra_process(mrp, br, p, skb);
+ }
+
+ goto forward;
+ }
+
+ if (br_mrp_in_frame(skb)) {
+ u8 in_type = br_mrp_get_frame_type(skb);
+
+ i_port = rcu_dereference(mrp->i_port);
+ i_dst = i_port;
+
+ /* If the ring port is in block state it should not forward
+ * In_Test frames
+ */
+ if (br_mrp_is_ring_port(p_port, s_port, p) &&
+ p->state == BR_STATE_BLOCKING &&
+ in_type == BR_MRP_TLV_HEADER_IN_TEST)
+ goto no_forward;
+
+ /* Nodes that behaves as MRM needs to stop forwarding the
+ * frames in case the ring is closed, otherwise will be a loop.
+ * In this case the frame is no forward between the ring ports.
+ */
+ if (br_mrp_mrm_behaviour(mrp) &&
+ br_mrp_is_ring_port(p_port, s_port, p) &&
+ (s_port->state != BR_STATE_FORWARDING ||
+ p_port->state != BR_STATE_FORWARDING)) {
+ p_dst = NULL;
+ s_dst = NULL;
+ }
+
+ /* A node that behaves as MRC and doesn't have a interconnect
+ * role then it should forward all frames between the ring ports
+ * because it doesn't have an interconnect port
+ */
+ if (br_mrp_mrc_behaviour(mrp) &&
+ mrp->in_role == BR_MRP_IN_ROLE_DISABLED)
+ goto forward;
+
+ if (mrp->in_role == BR_MRP_IN_ROLE_MIM) {
+ if (in_type == BR_MRP_TLV_HEADER_IN_TEST) {
+ /* MIM should not forward it's own InTest
+ * frames
+ */
+ if (br_mrp_mim_process(mrp, p, skb)) {
+ goto no_forward;
+ } else {
+ if (br_mrp_is_ring_port(p_port, s_port,
+ p))
+ i_dst = NULL;
+
+ if (br_mrp_is_in_port(i_port, p))
+ goto no_forward;
+ }
+ } else {
+ /* MIM should forward IntLinkChange/Status and
+ * IntTopoChange between ring ports but MIM
+ * should not forward IntLinkChange/Status and
+ * IntTopoChange if the frame was received at
+ * the interconnect port
+ */
+ if (br_mrp_is_ring_port(p_port, s_port, p))
+ i_dst = NULL;
+
+ if (br_mrp_is_in_port(i_port, p))
+ goto no_forward;
+ }
+ }
+
+ if (mrp->in_role == BR_MRP_IN_ROLE_MIC) {
+ /* MIC should forward InTest frames on all ports
+ * regardless of the received port
+ */
+ if (in_type == BR_MRP_TLV_HEADER_IN_TEST)
+ goto forward;
+
+ /* MIC should forward IntLinkChange frames only if they
+ * are received on ring ports to all the ports
+ */
+ if (br_mrp_is_ring_port(p_port, s_port, p) &&
+ (in_type == BR_MRP_TLV_HEADER_IN_LINK_UP ||
+ in_type == BR_MRP_TLV_HEADER_IN_LINK_DOWN))
+ goto forward;
+
+ /* MIC should forward IntLinkStatus frames only to
+ * interconnect port if it was received on a ring port.
+ * If it is received on interconnect port then, it
+ * should be forward on both ring ports
+ */
+ if (br_mrp_is_ring_port(p_port, s_port, p) &&
+ in_type == BR_MRP_TLV_HEADER_IN_LINK_STATUS) {
+ p_dst = NULL;
+ s_dst = NULL;
+ }
+
+ /* Should forward the InTopo frames only between the
+ * ring ports
+ */
+ if (in_type == BR_MRP_TLV_HEADER_IN_TOPO) {
+ i_dst = NULL;
+ goto forward;
+ }
+
+ /* In all the other cases don't forward the frames */
+ goto no_forward;
+ }
+ }
+
+forward:
+ if (p_dst)
+ br_forward(p_dst, skb, true, false);
+ if (s_dst)
+ br_forward(s_dst, skb, true, false);
+ if (i_dst)
+ br_forward(i_dst, skb, true, false);
+
+no_forward:
+ return 1;
+}
+
+/* Check if the frame was received on a port that is part of MRP ring
+ * and if the frame has MRP eth. In that case process the frame otherwise do
+ * normal forwarding.
+ * note: already called with rcu_read_lock
+ */
+static int br_mrp_process(struct net_bridge_port *p, struct sk_buff *skb)
+{
+ /* If there is no MRP instance do normal forwarding */
+ if (likely(!(p->flags & BR_MRP_AWARE)))
+ goto out;
+
+ return br_mrp_rcv(p, skb, p->dev);
+out:
+ return 0;
+}
+
+bool br_mrp_enabled(struct net_bridge *br)
+{
+ return !hlist_empty(&br->mrp_list);
+}
diff --git a/net/bridge/br_mrp_netlink.c b/net/bridge/br_mrp_netlink.c
new file mode 100644
index 000000000000..ce6f63c77cc0
--- /dev/null
+++ b/net/bridge/br_mrp_netlink.c
@@ -0,0 +1,571 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <net/genetlink.h>
+
+#include <uapi/linux/mrp_bridge.h>
+#include "br_private.h"
+#include "br_private_mrp.h"
+
+static const struct nla_policy br_mrp_policy[IFLA_BRIDGE_MRP_MAX + 1] = {
+ [IFLA_BRIDGE_MRP_UNSPEC] = { .type = NLA_REJECT },
+ [IFLA_BRIDGE_MRP_INSTANCE] = { .type = NLA_NESTED },
+ [IFLA_BRIDGE_MRP_PORT_STATE] = { .type = NLA_NESTED },
+ [IFLA_BRIDGE_MRP_PORT_ROLE] = { .type = NLA_NESTED },
+ [IFLA_BRIDGE_MRP_RING_STATE] = { .type = NLA_NESTED },
+ [IFLA_BRIDGE_MRP_RING_ROLE] = { .type = NLA_NESTED },
+ [IFLA_BRIDGE_MRP_START_TEST] = { .type = NLA_NESTED },
+ [IFLA_BRIDGE_MRP_IN_ROLE] = { .type = NLA_NESTED },
+ [IFLA_BRIDGE_MRP_IN_STATE] = { .type = NLA_NESTED },
+ [IFLA_BRIDGE_MRP_START_IN_TEST] = { .type = NLA_NESTED },
+};
+
+static const struct nla_policy
+br_mrp_instance_policy[IFLA_BRIDGE_MRP_INSTANCE_MAX + 1] = {
+ [IFLA_BRIDGE_MRP_INSTANCE_UNSPEC] = { .type = NLA_REJECT },
+ [IFLA_BRIDGE_MRP_INSTANCE_RING_ID] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_MRP_INSTANCE_P_IFINDEX] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_MRP_INSTANCE_S_IFINDEX] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_MRP_INSTANCE_PRIO] = { .type = NLA_U16 },
+};
+
+static int br_mrp_instance_parse(struct net_bridge *br, struct nlattr *attr,
+ int cmd, struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[IFLA_BRIDGE_MRP_INSTANCE_MAX + 1];
+ struct br_mrp_instance inst;
+ int err;
+
+ err = nla_parse_nested(tb, IFLA_BRIDGE_MRP_INSTANCE_MAX, attr,
+ br_mrp_instance_policy, extack);
+ if (err)
+ return err;
+
+ if (!tb[IFLA_BRIDGE_MRP_INSTANCE_RING_ID] ||
+ !tb[IFLA_BRIDGE_MRP_INSTANCE_P_IFINDEX] ||
+ !tb[IFLA_BRIDGE_MRP_INSTANCE_S_IFINDEX]) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Missing attribute: RING_ID or P_IFINDEX or S_IFINDEX");
+ return -EINVAL;
+ }
+
+ memset(&inst, 0, sizeof(inst));
+
+ inst.ring_id = nla_get_u32(tb[IFLA_BRIDGE_MRP_INSTANCE_RING_ID]);
+ inst.p_ifindex = nla_get_u32(tb[IFLA_BRIDGE_MRP_INSTANCE_P_IFINDEX]);
+ inst.s_ifindex = nla_get_u32(tb[IFLA_BRIDGE_MRP_INSTANCE_S_IFINDEX]);
+ inst.prio = MRP_DEFAULT_PRIO;
+
+ if (tb[IFLA_BRIDGE_MRP_INSTANCE_PRIO])
+ inst.prio = nla_get_u16(tb[IFLA_BRIDGE_MRP_INSTANCE_PRIO]);
+
+ if (cmd == RTM_SETLINK)
+ return br_mrp_add(br, &inst);
+ else
+ return br_mrp_del(br, &inst);
+
+ return 0;
+}
+
+static const struct nla_policy
+br_mrp_port_state_policy[IFLA_BRIDGE_MRP_PORT_STATE_MAX + 1] = {
+ [IFLA_BRIDGE_MRP_PORT_STATE_UNSPEC] = { .type = NLA_REJECT },
+ [IFLA_BRIDGE_MRP_PORT_STATE_STATE] = { .type = NLA_U32 },
+};
+
+static int br_mrp_port_state_parse(struct net_bridge_port *p,
+ struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[IFLA_BRIDGE_MRP_PORT_STATE_MAX + 1];
+ enum br_mrp_port_state_type state;
+ int err;
+
+ err = nla_parse_nested(tb, IFLA_BRIDGE_MRP_PORT_STATE_MAX, attr,
+ br_mrp_port_state_policy, extack);
+ if (err)
+ return err;
+
+ if (!tb[IFLA_BRIDGE_MRP_PORT_STATE_STATE]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing attribute: STATE");
+ return -EINVAL;
+ }
+
+ state = nla_get_u32(tb[IFLA_BRIDGE_MRP_PORT_STATE_STATE]);
+
+ return br_mrp_set_port_state(p, state);
+}
+
+static const struct nla_policy
+br_mrp_port_role_policy[IFLA_BRIDGE_MRP_PORT_ROLE_MAX + 1] = {
+ [IFLA_BRIDGE_MRP_PORT_ROLE_UNSPEC] = { .type = NLA_REJECT },
+ [IFLA_BRIDGE_MRP_PORT_ROLE_ROLE] = { .type = NLA_U32 },
+};
+
+static int br_mrp_port_role_parse(struct net_bridge_port *p,
+ struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[IFLA_BRIDGE_MRP_PORT_ROLE_MAX + 1];
+ enum br_mrp_port_role_type role;
+ int err;
+
+ err = nla_parse_nested(tb, IFLA_BRIDGE_MRP_PORT_ROLE_MAX, attr,
+ br_mrp_port_role_policy, extack);
+ if (err)
+ return err;
+
+ if (!tb[IFLA_BRIDGE_MRP_PORT_ROLE_ROLE]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing attribute: ROLE");
+ return -EINVAL;
+ }
+
+ role = nla_get_u32(tb[IFLA_BRIDGE_MRP_PORT_ROLE_ROLE]);
+
+ return br_mrp_set_port_role(p, role);
+}
+
+static const struct nla_policy
+br_mrp_ring_state_policy[IFLA_BRIDGE_MRP_RING_STATE_MAX + 1] = {
+ [IFLA_BRIDGE_MRP_RING_STATE_UNSPEC] = { .type = NLA_REJECT },
+ [IFLA_BRIDGE_MRP_RING_STATE_RING_ID] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_MRP_RING_STATE_STATE] = { .type = NLA_U32 },
+};
+
+static int br_mrp_ring_state_parse(struct net_bridge *br, struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[IFLA_BRIDGE_MRP_RING_STATE_MAX + 1];
+ struct br_mrp_ring_state state;
+ int err;
+
+ err = nla_parse_nested(tb, IFLA_BRIDGE_MRP_RING_STATE_MAX, attr,
+ br_mrp_ring_state_policy, extack);
+ if (err)
+ return err;
+
+ if (!tb[IFLA_BRIDGE_MRP_RING_STATE_RING_ID] ||
+ !tb[IFLA_BRIDGE_MRP_RING_STATE_STATE]) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Missing attribute: RING_ID or STATE");
+ return -EINVAL;
+ }
+
+ memset(&state, 0x0, sizeof(state));
+
+ state.ring_id = nla_get_u32(tb[IFLA_BRIDGE_MRP_RING_STATE_RING_ID]);
+ state.ring_state = nla_get_u32(tb[IFLA_BRIDGE_MRP_RING_STATE_STATE]);
+
+ return br_mrp_set_ring_state(br, &state);
+}
+
+static const struct nla_policy
+br_mrp_ring_role_policy[IFLA_BRIDGE_MRP_RING_ROLE_MAX + 1] = {
+ [IFLA_BRIDGE_MRP_RING_ROLE_UNSPEC] = { .type = NLA_REJECT },
+ [IFLA_BRIDGE_MRP_RING_ROLE_RING_ID] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_MRP_RING_ROLE_ROLE] = { .type = NLA_U32 },
+};
+
+static int br_mrp_ring_role_parse(struct net_bridge *br, struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[IFLA_BRIDGE_MRP_RING_ROLE_MAX + 1];
+ struct br_mrp_ring_role role;
+ int err;
+
+ err = nla_parse_nested(tb, IFLA_BRIDGE_MRP_RING_ROLE_MAX, attr,
+ br_mrp_ring_role_policy, extack);
+ if (err)
+ return err;
+
+ if (!tb[IFLA_BRIDGE_MRP_RING_ROLE_RING_ID] ||
+ !tb[IFLA_BRIDGE_MRP_RING_ROLE_ROLE]) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Missing attribute: RING_ID or ROLE");
+ return -EINVAL;
+ }
+
+ memset(&role, 0x0, sizeof(role));
+
+ role.ring_id = nla_get_u32(tb[IFLA_BRIDGE_MRP_RING_ROLE_RING_ID]);
+ role.ring_role = nla_get_u32(tb[IFLA_BRIDGE_MRP_RING_ROLE_ROLE]);
+
+ return br_mrp_set_ring_role(br, &role);
+}
+
+static const struct nla_policy
+br_mrp_start_test_policy[IFLA_BRIDGE_MRP_START_TEST_MAX + 1] = {
+ [IFLA_BRIDGE_MRP_START_TEST_UNSPEC] = { .type = NLA_REJECT },
+ [IFLA_BRIDGE_MRP_START_TEST_RING_ID] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_MRP_START_TEST_INTERVAL] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_MRP_START_TEST_MAX_MISS] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_MRP_START_TEST_PERIOD] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_MRP_START_TEST_MONITOR] = { .type = NLA_U32 },
+};
+
+static int br_mrp_start_test_parse(struct net_bridge *br, struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[IFLA_BRIDGE_MRP_START_TEST_MAX + 1];
+ struct br_mrp_start_test test;
+ int err;
+
+ err = nla_parse_nested(tb, IFLA_BRIDGE_MRP_START_TEST_MAX, attr,
+ br_mrp_start_test_policy, extack);
+ if (err)
+ return err;
+
+ if (!tb[IFLA_BRIDGE_MRP_START_TEST_RING_ID] ||
+ !tb[IFLA_BRIDGE_MRP_START_TEST_INTERVAL] ||
+ !tb[IFLA_BRIDGE_MRP_START_TEST_MAX_MISS] ||
+ !tb[IFLA_BRIDGE_MRP_START_TEST_PERIOD]) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Missing attribute: RING_ID or INTERVAL or MAX_MISS or PERIOD");
+ return -EINVAL;
+ }
+
+ memset(&test, 0x0, sizeof(test));
+
+ test.ring_id = nla_get_u32(tb[IFLA_BRIDGE_MRP_START_TEST_RING_ID]);
+ test.interval = nla_get_u32(tb[IFLA_BRIDGE_MRP_START_TEST_INTERVAL]);
+ test.max_miss = nla_get_u32(tb[IFLA_BRIDGE_MRP_START_TEST_MAX_MISS]);
+ test.period = nla_get_u32(tb[IFLA_BRIDGE_MRP_START_TEST_PERIOD]);
+ test.monitor = false;
+
+ if (tb[IFLA_BRIDGE_MRP_START_TEST_MONITOR])
+ test.monitor =
+ nla_get_u32(tb[IFLA_BRIDGE_MRP_START_TEST_MONITOR]);
+
+ return br_mrp_start_test(br, &test);
+}
+
+static const struct nla_policy
+br_mrp_in_state_policy[IFLA_BRIDGE_MRP_IN_STATE_MAX + 1] = {
+ [IFLA_BRIDGE_MRP_IN_STATE_UNSPEC] = { .type = NLA_REJECT },
+ [IFLA_BRIDGE_MRP_IN_STATE_IN_ID] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_MRP_IN_STATE_STATE] = { .type = NLA_U32 },
+};
+
+static int br_mrp_in_state_parse(struct net_bridge *br, struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[IFLA_BRIDGE_MRP_IN_STATE_MAX + 1];
+ struct br_mrp_in_state state;
+ int err;
+
+ err = nla_parse_nested(tb, IFLA_BRIDGE_MRP_IN_STATE_MAX, attr,
+ br_mrp_in_state_policy, extack);
+ if (err)
+ return err;
+
+ if (!tb[IFLA_BRIDGE_MRP_IN_STATE_IN_ID] ||
+ !tb[IFLA_BRIDGE_MRP_IN_STATE_STATE]) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Missing attribute: IN_ID or STATE");
+ return -EINVAL;
+ }
+
+ memset(&state, 0x0, sizeof(state));
+
+ state.in_id = nla_get_u32(tb[IFLA_BRIDGE_MRP_IN_STATE_IN_ID]);
+ state.in_state = nla_get_u32(tb[IFLA_BRIDGE_MRP_IN_STATE_STATE]);
+
+ return br_mrp_set_in_state(br, &state);
+}
+
+static const struct nla_policy
+br_mrp_in_role_policy[IFLA_BRIDGE_MRP_IN_ROLE_MAX + 1] = {
+ [IFLA_BRIDGE_MRP_IN_ROLE_UNSPEC] = { .type = NLA_REJECT },
+ [IFLA_BRIDGE_MRP_IN_ROLE_RING_ID] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_MRP_IN_ROLE_IN_ID] = { .type = NLA_U16 },
+ [IFLA_BRIDGE_MRP_IN_ROLE_ROLE] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_MRP_IN_ROLE_I_IFINDEX] = { .type = NLA_U32 },
+};
+
+static int br_mrp_in_role_parse(struct net_bridge *br, struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[IFLA_BRIDGE_MRP_IN_ROLE_MAX + 1];
+ struct br_mrp_in_role role;
+ int err;
+
+ err = nla_parse_nested(tb, IFLA_BRIDGE_MRP_IN_ROLE_MAX, attr,
+ br_mrp_in_role_policy, extack);
+ if (err)
+ return err;
+
+ if (!tb[IFLA_BRIDGE_MRP_IN_ROLE_RING_ID] ||
+ !tb[IFLA_BRIDGE_MRP_IN_ROLE_IN_ID] ||
+ !tb[IFLA_BRIDGE_MRP_IN_ROLE_I_IFINDEX] ||
+ !tb[IFLA_BRIDGE_MRP_IN_ROLE_ROLE]) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Missing attribute: RING_ID or ROLE or IN_ID or I_IFINDEX");
+ return -EINVAL;
+ }
+
+ memset(&role, 0x0, sizeof(role));
+
+ role.ring_id = nla_get_u32(tb[IFLA_BRIDGE_MRP_IN_ROLE_RING_ID]);
+ role.in_id = nla_get_u16(tb[IFLA_BRIDGE_MRP_IN_ROLE_IN_ID]);
+ role.i_ifindex = nla_get_u32(tb[IFLA_BRIDGE_MRP_IN_ROLE_I_IFINDEX]);
+ role.in_role = nla_get_u32(tb[IFLA_BRIDGE_MRP_IN_ROLE_ROLE]);
+
+ return br_mrp_set_in_role(br, &role);
+}
+
+static const struct nla_policy
+br_mrp_start_in_test_policy[IFLA_BRIDGE_MRP_START_IN_TEST_MAX + 1] = {
+ [IFLA_BRIDGE_MRP_START_IN_TEST_UNSPEC] = { .type = NLA_REJECT },
+ [IFLA_BRIDGE_MRP_START_IN_TEST_IN_ID] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_MRP_START_IN_TEST_INTERVAL] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_MRP_START_IN_TEST_MAX_MISS] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_MRP_START_IN_TEST_PERIOD] = { .type = NLA_U32 },
+};
+
+static int br_mrp_start_in_test_parse(struct net_bridge *br,
+ struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[IFLA_BRIDGE_MRP_START_IN_TEST_MAX + 1];
+ struct br_mrp_start_in_test test;
+ int err;
+
+ err = nla_parse_nested(tb, IFLA_BRIDGE_MRP_START_IN_TEST_MAX, attr,
+ br_mrp_start_in_test_policy, extack);
+ if (err)
+ return err;
+
+ if (!tb[IFLA_BRIDGE_MRP_START_IN_TEST_IN_ID] ||
+ !tb[IFLA_BRIDGE_MRP_START_IN_TEST_INTERVAL] ||
+ !tb[IFLA_BRIDGE_MRP_START_IN_TEST_MAX_MISS] ||
+ !tb[IFLA_BRIDGE_MRP_START_IN_TEST_PERIOD]) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Missing attribute: RING_ID or INTERVAL or MAX_MISS or PERIOD");
+ return -EINVAL;
+ }
+
+ memset(&test, 0x0, sizeof(test));
+
+ test.in_id = nla_get_u32(tb[IFLA_BRIDGE_MRP_START_IN_TEST_IN_ID]);
+ test.interval = nla_get_u32(tb[IFLA_BRIDGE_MRP_START_IN_TEST_INTERVAL]);
+ test.max_miss = nla_get_u32(tb[IFLA_BRIDGE_MRP_START_IN_TEST_MAX_MISS]);
+ test.period = nla_get_u32(tb[IFLA_BRIDGE_MRP_START_IN_TEST_PERIOD]);
+
+ return br_mrp_start_in_test(br, &test);
+}
+
+int br_mrp_parse(struct net_bridge *br, struct net_bridge_port *p,
+ struct nlattr *attr, int cmd, struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[IFLA_BRIDGE_MRP_MAX + 1];
+ int err;
+
+ /* When this function is called for a port then the br pointer is
+ * invalid, therefor set the br to point correctly
+ */
+ if (p)
+ br = p->br;
+
+ if (br->stp_enabled != BR_NO_STP) {
+ NL_SET_ERR_MSG_MOD(extack, "MRP can't be enabled if STP is already enabled");
+ return -EINVAL;
+ }
+
+ err = nla_parse_nested(tb, IFLA_BRIDGE_MRP_MAX, attr,
+ br_mrp_policy, extack);
+ if (err)
+ return err;
+
+ if (tb[IFLA_BRIDGE_MRP_INSTANCE]) {
+ err = br_mrp_instance_parse(br, tb[IFLA_BRIDGE_MRP_INSTANCE],
+ cmd, extack);
+ if (err)
+ return err;
+ }
+
+ if (tb[IFLA_BRIDGE_MRP_PORT_STATE]) {
+ err = br_mrp_port_state_parse(p, tb[IFLA_BRIDGE_MRP_PORT_STATE],
+ extack);
+ if (err)
+ return err;
+ }
+
+ if (tb[IFLA_BRIDGE_MRP_PORT_ROLE]) {
+ err = br_mrp_port_role_parse(p, tb[IFLA_BRIDGE_MRP_PORT_ROLE],
+ extack);
+ if (err)
+ return err;
+ }
+
+ if (tb[IFLA_BRIDGE_MRP_RING_STATE]) {
+ err = br_mrp_ring_state_parse(br,
+ tb[IFLA_BRIDGE_MRP_RING_STATE],
+ extack);
+ if (err)
+ return err;
+ }
+
+ if (tb[IFLA_BRIDGE_MRP_RING_ROLE]) {
+ err = br_mrp_ring_role_parse(br, tb[IFLA_BRIDGE_MRP_RING_ROLE],
+ extack);
+ if (err)
+ return err;
+ }
+
+ if (tb[IFLA_BRIDGE_MRP_START_TEST]) {
+ err = br_mrp_start_test_parse(br,
+ tb[IFLA_BRIDGE_MRP_START_TEST],
+ extack);
+ if (err)
+ return err;
+ }
+
+ if (tb[IFLA_BRIDGE_MRP_IN_STATE]) {
+ err = br_mrp_in_state_parse(br, tb[IFLA_BRIDGE_MRP_IN_STATE],
+ extack);
+ if (err)
+ return err;
+ }
+
+ if (tb[IFLA_BRIDGE_MRP_IN_ROLE]) {
+ err = br_mrp_in_role_parse(br, tb[IFLA_BRIDGE_MRP_IN_ROLE],
+ extack);
+ if (err)
+ return err;
+ }
+
+ if (tb[IFLA_BRIDGE_MRP_START_IN_TEST]) {
+ err = br_mrp_start_in_test_parse(br,
+ tb[IFLA_BRIDGE_MRP_START_IN_TEST],
+ extack);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+int br_mrp_fill_info(struct sk_buff *skb, struct net_bridge *br)
+{
+ struct nlattr *tb, *mrp_tb;
+ struct br_mrp *mrp;
+
+ mrp_tb = nla_nest_start_noflag(skb, IFLA_BRIDGE_MRP);
+ if (!mrp_tb)
+ return -EMSGSIZE;
+
+ hlist_for_each_entry_rcu(mrp, &br->mrp_list, list) {
+ struct net_bridge_port *p;
+
+ tb = nla_nest_start_noflag(skb, IFLA_BRIDGE_MRP_INFO);
+ if (!tb)
+ goto nla_info_failure;
+
+ if (nla_put_u32(skb, IFLA_BRIDGE_MRP_INFO_RING_ID,
+ mrp->ring_id))
+ goto nla_put_failure;
+
+ p = rcu_dereference(mrp->p_port);
+ if (p && nla_put_u32(skb, IFLA_BRIDGE_MRP_INFO_P_IFINDEX,
+ p->dev->ifindex))
+ goto nla_put_failure;
+
+ p = rcu_dereference(mrp->s_port);
+ if (p && nla_put_u32(skb, IFLA_BRIDGE_MRP_INFO_S_IFINDEX,
+ p->dev->ifindex))
+ goto nla_put_failure;
+
+ p = rcu_dereference(mrp->i_port);
+ if (p && nla_put_u32(skb, IFLA_BRIDGE_MRP_INFO_I_IFINDEX,
+ p->dev->ifindex))
+ goto nla_put_failure;
+
+ if (nla_put_u16(skb, IFLA_BRIDGE_MRP_INFO_PRIO,
+ mrp->prio))
+ goto nla_put_failure;
+ if (nla_put_u32(skb, IFLA_BRIDGE_MRP_INFO_RING_STATE,
+ mrp->ring_state))
+ goto nla_put_failure;
+ if (nla_put_u32(skb, IFLA_BRIDGE_MRP_INFO_RING_ROLE,
+ mrp->ring_role))
+ goto nla_put_failure;
+ if (nla_put_u32(skb, IFLA_BRIDGE_MRP_INFO_TEST_INTERVAL,
+ mrp->test_interval))
+ goto nla_put_failure;
+ if (nla_put_u32(skb, IFLA_BRIDGE_MRP_INFO_TEST_MAX_MISS,
+ mrp->test_max_miss))
+ goto nla_put_failure;
+ if (nla_put_u32(skb, IFLA_BRIDGE_MRP_INFO_TEST_MONITOR,
+ mrp->test_monitor))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, IFLA_BRIDGE_MRP_INFO_IN_STATE,
+ mrp->in_state))
+ goto nla_put_failure;
+ if (nla_put_u32(skb, IFLA_BRIDGE_MRP_INFO_IN_ROLE,
+ mrp->in_role))
+ goto nla_put_failure;
+ if (nla_put_u32(skb, IFLA_BRIDGE_MRP_INFO_IN_TEST_INTERVAL,
+ mrp->in_test_interval))
+ goto nla_put_failure;
+ if (nla_put_u32(skb, IFLA_BRIDGE_MRP_INFO_IN_TEST_MAX_MISS,
+ mrp->in_test_max_miss))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, tb);
+ }
+ nla_nest_end(skb, mrp_tb);
+
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, tb);
+
+nla_info_failure:
+ nla_nest_cancel(skb, mrp_tb);
+
+ return -EMSGSIZE;
+}
+
+int br_mrp_ring_port_open(struct net_device *dev, u8 loc)
+{
+ struct net_bridge_port *p;
+ int err = 0;
+
+ p = br_port_get_rcu(dev);
+ if (!p) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (loc)
+ p->flags |= BR_MRP_LOST_CONT;
+ else
+ p->flags &= ~BR_MRP_LOST_CONT;
+
+ br_ifinfo_notify(RTM_NEWLINK, NULL, p);
+
+out:
+ return err;
+}
+
+int br_mrp_in_port_open(struct net_device *dev, u8 loc)
+{
+ struct net_bridge_port *p;
+ int err = 0;
+
+ p = br_port_get_rcu(dev);
+ if (!p) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (loc)
+ p->flags |= BR_MRP_LOST_IN_CONT;
+ else
+ p->flags &= ~BR_MRP_LOST_IN_CONT;
+
+ br_ifinfo_notify(RTM_NEWLINK, NULL, p);
+
+out:
+ return err;
+}
diff --git a/net/bridge/br_mrp_switchdev.c b/net/bridge/br_mrp_switchdev.c
new file mode 100644
index 000000000000..cb54b324fa8c
--- /dev/null
+++ b/net/bridge/br_mrp_switchdev.c
@@ -0,0 +1,241 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <net/switchdev.h>
+
+#include "br_private_mrp.h"
+
+static enum br_mrp_hw_support
+br_mrp_switchdev_port_obj(struct net_bridge *br,
+ const struct switchdev_obj *obj, bool add)
+{
+ int err;
+
+ if (add)
+ err = switchdev_port_obj_add(br->dev, obj, NULL);
+ else
+ err = switchdev_port_obj_del(br->dev, obj);
+
+ /* In case of success just return and notify the SW that doesn't need
+ * to do anything
+ */
+ if (!err)
+ return BR_MRP_HW;
+
+ if (err != -EOPNOTSUPP)
+ return BR_MRP_NONE;
+
+ /* Continue with SW backup */
+ return BR_MRP_SW;
+}
+
+int br_mrp_switchdev_add(struct net_bridge *br, struct br_mrp *mrp)
+{
+ struct switchdev_obj_mrp mrp_obj = {
+ .obj.orig_dev = br->dev,
+ .obj.id = SWITCHDEV_OBJ_ID_MRP,
+ .p_port = rtnl_dereference(mrp->p_port)->dev,
+ .s_port = rtnl_dereference(mrp->s_port)->dev,
+ .ring_id = mrp->ring_id,
+ .prio = mrp->prio,
+ };
+
+ if (!IS_ENABLED(CONFIG_NET_SWITCHDEV))
+ return 0;
+
+ return switchdev_port_obj_add(br->dev, &mrp_obj.obj, NULL);
+}
+
+int br_mrp_switchdev_del(struct net_bridge *br, struct br_mrp *mrp)
+{
+ struct switchdev_obj_mrp mrp_obj = {
+ .obj.orig_dev = br->dev,
+ .obj.id = SWITCHDEV_OBJ_ID_MRP,
+ .p_port = NULL,
+ .s_port = NULL,
+ .ring_id = mrp->ring_id,
+ };
+
+ if (!IS_ENABLED(CONFIG_NET_SWITCHDEV))
+ return 0;
+
+ return switchdev_port_obj_del(br->dev, &mrp_obj.obj);
+}
+
+enum br_mrp_hw_support
+br_mrp_switchdev_set_ring_role(struct net_bridge *br, struct br_mrp *mrp,
+ enum br_mrp_ring_role_type role)
+{
+ struct switchdev_obj_ring_role_mrp mrp_role = {
+ .obj.orig_dev = br->dev,
+ .obj.id = SWITCHDEV_OBJ_ID_RING_ROLE_MRP,
+ .ring_role = role,
+ .ring_id = mrp->ring_id,
+ .sw_backup = false,
+ };
+ enum br_mrp_hw_support support;
+ int err;
+
+ if (!IS_ENABLED(CONFIG_NET_SWITCHDEV))
+ return BR_MRP_SW;
+
+ support = br_mrp_switchdev_port_obj(br, &mrp_role.obj,
+ role != BR_MRP_RING_ROLE_DISABLED);
+ if (support != BR_MRP_SW)
+ return support;
+
+ /* If the driver can't configure to run completely the protocol in HW,
+ * then try again to configure the HW so the SW can run the protocol.
+ */
+ mrp_role.sw_backup = true;
+ if (role != BR_MRP_RING_ROLE_DISABLED)
+ err = switchdev_port_obj_add(br->dev, &mrp_role.obj, NULL);
+ else
+ err = switchdev_port_obj_del(br->dev, &mrp_role.obj);
+
+ if (!err)
+ return BR_MRP_SW;
+
+ return BR_MRP_NONE;
+}
+
+enum br_mrp_hw_support
+br_mrp_switchdev_send_ring_test(struct net_bridge *br, struct br_mrp *mrp,
+ u32 interval, u8 max_miss, u32 period,
+ bool monitor)
+{
+ struct switchdev_obj_ring_test_mrp test = {
+ .obj.orig_dev = br->dev,
+ .obj.id = SWITCHDEV_OBJ_ID_RING_TEST_MRP,
+ .interval = interval,
+ .max_miss = max_miss,
+ .ring_id = mrp->ring_id,
+ .period = period,
+ .monitor = monitor,
+ };
+
+ if (!IS_ENABLED(CONFIG_NET_SWITCHDEV))
+ return BR_MRP_SW;
+
+ return br_mrp_switchdev_port_obj(br, &test.obj, interval != 0);
+}
+
+int br_mrp_switchdev_set_ring_state(struct net_bridge *br,
+ struct br_mrp *mrp,
+ enum br_mrp_ring_state_type state)
+{
+ struct switchdev_obj_ring_state_mrp mrp_state = {
+ .obj.orig_dev = br->dev,
+ .obj.id = SWITCHDEV_OBJ_ID_RING_STATE_MRP,
+ .ring_state = state,
+ .ring_id = mrp->ring_id,
+ };
+
+ if (!IS_ENABLED(CONFIG_NET_SWITCHDEV))
+ return 0;
+
+ return switchdev_port_obj_add(br->dev, &mrp_state.obj, NULL);
+}
+
+enum br_mrp_hw_support
+br_mrp_switchdev_set_in_role(struct net_bridge *br, struct br_mrp *mrp,
+ u16 in_id, u32 ring_id,
+ enum br_mrp_in_role_type role)
+{
+ struct switchdev_obj_in_role_mrp mrp_role = {
+ .obj.orig_dev = br->dev,
+ .obj.id = SWITCHDEV_OBJ_ID_IN_ROLE_MRP,
+ .in_role = role,
+ .in_id = mrp->in_id,
+ .ring_id = mrp->ring_id,
+ .i_port = rtnl_dereference(mrp->i_port)->dev,
+ .sw_backup = false,
+ };
+ enum br_mrp_hw_support support;
+ int err;
+
+ if (!IS_ENABLED(CONFIG_NET_SWITCHDEV))
+ return BR_MRP_SW;
+
+ support = br_mrp_switchdev_port_obj(br, &mrp_role.obj,
+ role != BR_MRP_IN_ROLE_DISABLED);
+ if (support != BR_MRP_NONE)
+ return support;
+
+ /* If the driver can't configure to run completely the protocol in HW,
+ * then try again to configure the HW so the SW can run the protocol.
+ */
+ mrp_role.sw_backup = true;
+ if (role != BR_MRP_IN_ROLE_DISABLED)
+ err = switchdev_port_obj_add(br->dev, &mrp_role.obj, NULL);
+ else
+ err = switchdev_port_obj_del(br->dev, &mrp_role.obj);
+
+ if (!err)
+ return BR_MRP_SW;
+
+ return BR_MRP_NONE;
+}
+
+int br_mrp_switchdev_set_in_state(struct net_bridge *br, struct br_mrp *mrp,
+ enum br_mrp_in_state_type state)
+{
+ struct switchdev_obj_in_state_mrp mrp_state = {
+ .obj.orig_dev = br->dev,
+ .obj.id = SWITCHDEV_OBJ_ID_IN_STATE_MRP,
+ .in_state = state,
+ .in_id = mrp->in_id,
+ };
+
+ if (!IS_ENABLED(CONFIG_NET_SWITCHDEV))
+ return 0;
+
+ return switchdev_port_obj_add(br->dev, &mrp_state.obj, NULL);
+}
+
+enum br_mrp_hw_support
+br_mrp_switchdev_send_in_test(struct net_bridge *br, struct br_mrp *mrp,
+ u32 interval, u8 max_miss, u32 period)
+{
+ struct switchdev_obj_in_test_mrp test = {
+ .obj.orig_dev = br->dev,
+ .obj.id = SWITCHDEV_OBJ_ID_IN_TEST_MRP,
+ .interval = interval,
+ .max_miss = max_miss,
+ .in_id = mrp->in_id,
+ .period = period,
+ };
+
+ if (!IS_ENABLED(CONFIG_NET_SWITCHDEV))
+ return BR_MRP_SW;
+
+ return br_mrp_switchdev_port_obj(br, &test.obj, interval != 0);
+}
+
+int br_mrp_port_switchdev_set_state(struct net_bridge_port *p, u32 state)
+{
+ struct switchdev_attr attr = {
+ .orig_dev = p->dev,
+ .id = SWITCHDEV_ATTR_ID_PORT_STP_STATE,
+ .u.stp_state = state,
+ };
+
+ if (!IS_ENABLED(CONFIG_NET_SWITCHDEV))
+ return 0;
+
+ return switchdev_port_attr_set(p->dev, &attr, NULL);
+}
+
+int br_mrp_port_switchdev_set_role(struct net_bridge_port *p,
+ enum br_mrp_port_role_type role)
+{
+ struct switchdev_attr attr = {
+ .orig_dev = p->dev,
+ .id = SWITCHDEV_ATTR_ID_MRP_PORT_ROLE,
+ .u.mrp_port_role = role,
+ };
+
+ if (!IS_ENABLED(CONFIG_NET_SWITCHDEV))
+ return 0;
+
+ return switchdev_port_attr_set(p->dev, &attr, NULL);
+}
diff --git a/net/bridge/br_mst.c b/net/bridge/br_mst.c
new file mode 100644
index 000000000000..43a300ae6bfa
--- /dev/null
+++ b/net/bridge/br_mst.c
@@ -0,0 +1,366 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Bridge Multiple Spanning Tree Support
+ *
+ * Authors:
+ * Tobias Waldekranz <tobias@waldekranz.com>
+ */
+
+#include <linux/kernel.h>
+#include <net/switchdev.h>
+
+#include "br_private.h"
+
+DEFINE_STATIC_KEY_FALSE(br_mst_used);
+
+bool br_mst_enabled(const struct net_device *dev)
+{
+ if (!netif_is_bridge_master(dev))
+ return false;
+
+ return br_opt_get(netdev_priv(dev), BROPT_MST_ENABLED);
+}
+EXPORT_SYMBOL_GPL(br_mst_enabled);
+
+void br_mst_uninit(struct net_bridge *br)
+{
+ if (br_opt_get(br, BROPT_MST_ENABLED))
+ static_branch_dec(&br_mst_used);
+}
+
+int br_mst_get_info(const struct net_device *dev, u16 msti, unsigned long *vids)
+{
+ const struct net_bridge_vlan_group *vg;
+ const struct net_bridge_vlan *v;
+ const struct net_bridge *br;
+
+ ASSERT_RTNL();
+
+ if (!netif_is_bridge_master(dev))
+ return -EINVAL;
+
+ br = netdev_priv(dev);
+ if (!br_opt_get(br, BROPT_MST_ENABLED))
+ return -EINVAL;
+
+ vg = br_vlan_group(br);
+
+ list_for_each_entry(v, &vg->vlan_list, vlist) {
+ if (v->msti == msti)
+ __set_bit(v->vid, vids);
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(br_mst_get_info);
+
+int br_mst_get_state(const struct net_device *dev, u16 msti, u8 *state)
+{
+ const struct net_bridge_port *p = NULL;
+ const struct net_bridge_vlan_group *vg;
+ const struct net_bridge_vlan *v;
+
+ ASSERT_RTNL();
+
+ p = br_port_get_check_rtnl(dev);
+ if (!p || !br_opt_get(p->br, BROPT_MST_ENABLED))
+ return -EINVAL;
+
+ vg = nbp_vlan_group(p);
+
+ list_for_each_entry(v, &vg->vlan_list, vlist) {
+ if (v->brvlan->msti == msti) {
+ *state = v->state;
+ return 0;
+ }
+ }
+
+ return -ENOENT;
+}
+EXPORT_SYMBOL_GPL(br_mst_get_state);
+
+static void br_mst_vlan_set_state(struct net_bridge_vlan_group *vg,
+ struct net_bridge_vlan *v,
+ u8 state)
+{
+ if (br_vlan_get_state(v) == state)
+ return;
+
+ if (v->vid == vg->pvid)
+ br_vlan_set_pvid_state(vg, state);
+
+ br_vlan_set_state(v, state);
+}
+
+int br_mst_set_state(struct net_bridge_port *p, u16 msti, u8 state,
+ struct netlink_ext_ack *extack)
+{
+ struct switchdev_attr attr = {
+ .id = SWITCHDEV_ATTR_ID_PORT_MST_STATE,
+ .orig_dev = p->dev,
+ .u.mst_state = {
+ .msti = msti,
+ .state = state,
+ },
+ };
+ struct net_bridge_vlan_group *vg;
+ struct net_bridge_vlan *v;
+ int err = 0;
+
+ rcu_read_lock();
+ vg = nbp_vlan_group_rcu(p);
+ if (!vg)
+ goto out;
+
+ /* MSTI 0 (CST) state changes are notified via the regular
+ * SWITCHDEV_ATTR_ID_PORT_STP_STATE.
+ */
+ if (msti) {
+ err = switchdev_port_attr_set(p->dev, &attr, extack);
+ if (err && err != -EOPNOTSUPP)
+ goto out;
+ }
+
+ err = 0;
+ list_for_each_entry_rcu(v, &vg->vlan_list, vlist) {
+ if (v->brvlan->msti != msti)
+ continue;
+
+ br_mst_vlan_set_state(vg, v, state);
+ }
+
+out:
+ rcu_read_unlock();
+ return err;
+}
+
+static void br_mst_vlan_sync_state(struct net_bridge_vlan *pv, u16 msti)
+{
+ struct net_bridge_vlan_group *vg = nbp_vlan_group(pv->port);
+ struct net_bridge_vlan *v;
+
+ list_for_each_entry(v, &vg->vlan_list, vlist) {
+ /* If this port already has a defined state in this
+ * MSTI (through some other VLAN membership), inherit
+ * it.
+ */
+ if (v != pv && v->brvlan->msti == msti) {
+ br_mst_vlan_set_state(vg, pv, v->state);
+ return;
+ }
+ }
+
+ /* Otherwise, start out in a new MSTI with all ports disabled. */
+ return br_mst_vlan_set_state(vg, pv, BR_STATE_DISABLED);
+}
+
+int br_mst_vlan_set_msti(struct net_bridge_vlan *mv, u16 msti)
+{
+ struct switchdev_attr attr = {
+ .id = SWITCHDEV_ATTR_ID_VLAN_MSTI,
+ .orig_dev = mv->br->dev,
+ .u.vlan_msti = {
+ .vid = mv->vid,
+ .msti = msti,
+ },
+ };
+ struct net_bridge_vlan_group *vg;
+ struct net_bridge_vlan *pv;
+ struct net_bridge_port *p;
+ int err;
+
+ if (mv->msti == msti)
+ return 0;
+
+ err = switchdev_port_attr_set(mv->br->dev, &attr, NULL);
+ if (err && err != -EOPNOTSUPP)
+ return err;
+
+ mv->msti = msti;
+
+ list_for_each_entry(p, &mv->br->port_list, list) {
+ vg = nbp_vlan_group(p);
+
+ pv = br_vlan_find(vg, mv->vid);
+ if (pv)
+ br_mst_vlan_sync_state(pv, msti);
+ }
+
+ return 0;
+}
+
+void br_mst_vlan_init_state(struct net_bridge_vlan *v)
+{
+ /* VLANs always start out in MSTI 0 (CST) */
+ v->msti = 0;
+
+ if (br_vlan_is_master(v))
+ v->state = BR_STATE_FORWARDING;
+ else
+ v->state = v->port->state;
+}
+
+int br_mst_set_enabled(struct net_bridge *br, bool on,
+ struct netlink_ext_ack *extack)
+{
+ struct switchdev_attr attr = {
+ .id = SWITCHDEV_ATTR_ID_BRIDGE_MST,
+ .orig_dev = br->dev,
+ .u.mst = on,
+ };
+ struct net_bridge_vlan_group *vg;
+ struct net_bridge_port *p;
+ int err;
+
+ list_for_each_entry(p, &br->port_list, list) {
+ vg = nbp_vlan_group(p);
+
+ if (!vg->num_vlans)
+ continue;
+
+ NL_SET_ERR_MSG(extack,
+ "MST mode can't be changed while VLANs exist");
+ return -EBUSY;
+ }
+
+ if (br_opt_get(br, BROPT_MST_ENABLED) == on)
+ return 0;
+
+ err = switchdev_port_attr_set(br->dev, &attr, extack);
+ if (err && err != -EOPNOTSUPP)
+ return err;
+
+ if (on)
+ static_branch_inc(&br_mst_used);
+ else
+ static_branch_dec(&br_mst_used);
+
+ br_opt_toggle(br, BROPT_MST_ENABLED, on);
+ return 0;
+}
+
+size_t br_mst_info_size(const struct net_bridge_vlan_group *vg)
+{
+ DECLARE_BITMAP(seen, VLAN_N_VID) = { 0 };
+ const struct net_bridge_vlan *v;
+ size_t sz;
+
+ /* IFLA_BRIDGE_MST */
+ sz = nla_total_size(0);
+
+ list_for_each_entry_rcu(v, &vg->vlan_list, vlist) {
+ if (test_bit(v->brvlan->msti, seen))
+ continue;
+
+ /* IFLA_BRIDGE_MST_ENTRY */
+ sz += nla_total_size(0) +
+ /* IFLA_BRIDGE_MST_ENTRY_MSTI */
+ nla_total_size(sizeof(u16)) +
+ /* IFLA_BRIDGE_MST_ENTRY_STATE */
+ nla_total_size(sizeof(u8));
+
+ __set_bit(v->brvlan->msti, seen);
+ }
+
+ return sz;
+}
+
+int br_mst_fill_info(struct sk_buff *skb,
+ const struct net_bridge_vlan_group *vg)
+{
+ DECLARE_BITMAP(seen, VLAN_N_VID) = { 0 };
+ const struct net_bridge_vlan *v;
+ struct nlattr *nest;
+ int err = 0;
+
+ list_for_each_entry(v, &vg->vlan_list, vlist) {
+ if (test_bit(v->brvlan->msti, seen))
+ continue;
+
+ nest = nla_nest_start_noflag(skb, IFLA_BRIDGE_MST_ENTRY);
+ if (!nest ||
+ nla_put_u16(skb, IFLA_BRIDGE_MST_ENTRY_MSTI, v->brvlan->msti) ||
+ nla_put_u8(skb, IFLA_BRIDGE_MST_ENTRY_STATE, v->state)) {
+ err = -EMSGSIZE;
+ break;
+ }
+ nla_nest_end(skb, nest);
+
+ __set_bit(v->brvlan->msti, seen);
+ }
+
+ return err;
+}
+
+static const struct nla_policy br_mst_nl_policy[IFLA_BRIDGE_MST_ENTRY_MAX + 1] = {
+ [IFLA_BRIDGE_MST_ENTRY_MSTI] = NLA_POLICY_RANGE(NLA_U16,
+ 1, /* 0 reserved for CST */
+ VLAN_N_VID - 1),
+ [IFLA_BRIDGE_MST_ENTRY_STATE] = NLA_POLICY_RANGE(NLA_U8,
+ BR_STATE_DISABLED,
+ BR_STATE_BLOCKING),
+};
+
+static int br_mst_process_one(struct net_bridge_port *p,
+ const struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[IFLA_BRIDGE_MST_ENTRY_MAX + 1];
+ u16 msti;
+ u8 state;
+ int err;
+
+ err = nla_parse_nested(tb, IFLA_BRIDGE_MST_ENTRY_MAX, attr,
+ br_mst_nl_policy, extack);
+ if (err)
+ return err;
+
+ if (!tb[IFLA_BRIDGE_MST_ENTRY_MSTI]) {
+ NL_SET_ERR_MSG_MOD(extack, "MSTI not specified");
+ return -EINVAL;
+ }
+
+ if (!tb[IFLA_BRIDGE_MST_ENTRY_STATE]) {
+ NL_SET_ERR_MSG_MOD(extack, "State not specified");
+ return -EINVAL;
+ }
+
+ msti = nla_get_u16(tb[IFLA_BRIDGE_MST_ENTRY_MSTI]);
+ state = nla_get_u8(tb[IFLA_BRIDGE_MST_ENTRY_STATE]);
+
+ return br_mst_set_state(p, msti, state, extack);
+}
+
+int br_mst_process(struct net_bridge_port *p, const struct nlattr *mst_attr,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *attr;
+ int err, msts = 0;
+ int rem;
+
+ if (!br_opt_get(p->br, BROPT_MST_ENABLED)) {
+ NL_SET_ERR_MSG_MOD(extack, "Can't modify MST state when MST is disabled");
+ return -EBUSY;
+ }
+
+ nla_for_each_nested(attr, mst_attr, rem) {
+ switch (nla_type(attr)) {
+ case IFLA_BRIDGE_MST_ENTRY:
+ err = br_mst_process_one(p, attr, extack);
+ break;
+ default:
+ continue;
+ }
+
+ msts++;
+ if (err)
+ break;
+ }
+
+ if (!msts) {
+ NL_SET_ERR_MSG_MOD(extack, "Found no MST entries to process");
+ err = -EINVAL;
+ }
+
+ return err;
+}
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
new file mode 100644
index 000000000000..d55a4ab87837
--- /dev/null
+++ b/net/bridge/br_multicast.c
@@ -0,0 +1,5247 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Bridge multicast support.
+ *
+ * Copyright (c) 2010 Herbert Xu <herbert@gondor.apana.org.au>
+ */
+
+#include <linux/err.h>
+#include <linux/export.h>
+#include <linux/if_ether.h>
+#include <linux/igmp.h>
+#include <linux/in.h>
+#include <linux/jhash.h>
+#include <linux/kernel.h>
+#include <linux/log2.h>
+#include <linux/netdevice.h>
+#include <linux/netfilter_bridge.h>
+#include <linux/random.h>
+#include <linux/rculist.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/timer.h>
+#include <linux/inetdevice.h>
+#include <linux/mroute.h>
+#include <net/ip.h>
+#include <net/switchdev.h>
+#if IS_ENABLED(CONFIG_IPV6)
+#include <linux/icmpv6.h>
+#include <net/ipv6.h>
+#include <net/mld.h>
+#include <net/ip6_checksum.h>
+#include <net/addrconf.h>
+#endif
+#include <trace/events/bridge.h>
+
+#include "br_private.h"
+#include "br_private_mcast_eht.h"
+
+static const struct rhashtable_params br_mdb_rht_params = {
+ .head_offset = offsetof(struct net_bridge_mdb_entry, rhnode),
+ .key_offset = offsetof(struct net_bridge_mdb_entry, addr),
+ .key_len = sizeof(struct br_ip),
+ .automatic_shrinking = true,
+};
+
+static const struct rhashtable_params br_sg_port_rht_params = {
+ .head_offset = offsetof(struct net_bridge_port_group, rhnode),
+ .key_offset = offsetof(struct net_bridge_port_group, key),
+ .key_len = sizeof(struct net_bridge_port_group_sg_key),
+ .automatic_shrinking = true,
+};
+
+static void br_multicast_start_querier(struct net_bridge_mcast *brmctx,
+ struct bridge_mcast_own_query *query);
+static void br_ip4_multicast_add_router(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx);
+static void br_ip4_multicast_leave_group(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
+ __be32 group,
+ __u16 vid,
+ const unsigned char *src);
+static void br_multicast_port_group_rexmit(struct timer_list *t);
+
+static void
+br_multicast_rport_del_notify(struct net_bridge_mcast_port *pmctx, bool deleted);
+static void br_ip6_multicast_add_router(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx);
+#if IS_ENABLED(CONFIG_IPV6)
+static void br_ip6_multicast_leave_group(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
+ const struct in6_addr *group,
+ __u16 vid, const unsigned char *src);
+#endif
+static struct net_bridge_port_group *
+__br_multicast_add_group(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
+ struct br_ip *group,
+ const unsigned char *src,
+ u8 filter_mode,
+ bool igmpv2_mldv1,
+ bool blocked);
+static void br_multicast_find_del_pg(struct net_bridge *br,
+ struct net_bridge_port_group *pg);
+static void __br_multicast_stop(struct net_bridge_mcast *brmctx);
+
+static int br_mc_disabled_update(struct net_device *dev, bool value,
+ struct netlink_ext_ack *extack);
+
+static struct net_bridge_port_group *
+br_sg_port_find(struct net_bridge *br,
+ struct net_bridge_port_group_sg_key *sg_p)
+{
+ lockdep_assert_held_once(&br->multicast_lock);
+
+ return rhashtable_lookup_fast(&br->sg_port_tbl, sg_p,
+ br_sg_port_rht_params);
+}
+
+static struct net_bridge_mdb_entry *br_mdb_ip_get_rcu(struct net_bridge *br,
+ struct br_ip *dst)
+{
+ return rhashtable_lookup(&br->mdb_hash_tbl, dst, br_mdb_rht_params);
+}
+
+struct net_bridge_mdb_entry *br_mdb_ip_get(struct net_bridge *br,
+ struct br_ip *dst)
+{
+ struct net_bridge_mdb_entry *ent;
+
+ lockdep_assert_held_once(&br->multicast_lock);
+
+ rcu_read_lock();
+ ent = rhashtable_lookup(&br->mdb_hash_tbl, dst, br_mdb_rht_params);
+ rcu_read_unlock();
+
+ return ent;
+}
+
+static struct net_bridge_mdb_entry *br_mdb_ip4_get(struct net_bridge *br,
+ __be32 dst, __u16 vid)
+{
+ struct br_ip br_dst;
+
+ memset(&br_dst, 0, sizeof(br_dst));
+ br_dst.dst.ip4 = dst;
+ br_dst.proto = htons(ETH_P_IP);
+ br_dst.vid = vid;
+
+ return br_mdb_ip_get(br, &br_dst);
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static struct net_bridge_mdb_entry *br_mdb_ip6_get(struct net_bridge *br,
+ const struct in6_addr *dst,
+ __u16 vid)
+{
+ struct br_ip br_dst;
+
+ memset(&br_dst, 0, sizeof(br_dst));
+ br_dst.dst.ip6 = *dst;
+ br_dst.proto = htons(ETH_P_IPV6);
+ br_dst.vid = vid;
+
+ return br_mdb_ip_get(br, &br_dst);
+}
+#endif
+
+struct net_bridge_mdb_entry *
+br_mdb_entry_skb_get(struct net_bridge_mcast *brmctx, struct sk_buff *skb,
+ u16 vid)
+{
+ struct net_bridge *br = brmctx->br;
+ struct br_ip ip;
+
+ if (!br_opt_get(br, BROPT_MULTICAST_ENABLED) ||
+ br_multicast_ctx_vlan_global_disabled(brmctx))
+ return NULL;
+
+ if (BR_INPUT_SKB_CB(skb)->igmp)
+ return NULL;
+
+ memset(&ip, 0, sizeof(ip));
+ ip.proto = skb->protocol;
+ ip.vid = vid;
+
+ switch (skb->protocol) {
+ case htons(ETH_P_IP):
+ ip.dst.ip4 = ip_hdr(skb)->daddr;
+ if (brmctx->multicast_igmp_version == 3) {
+ struct net_bridge_mdb_entry *mdb;
+
+ ip.src.ip4 = ip_hdr(skb)->saddr;
+ mdb = br_mdb_ip_get_rcu(br, &ip);
+ if (mdb)
+ return mdb;
+ ip.src.ip4 = 0;
+ }
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case htons(ETH_P_IPV6):
+ ip.dst.ip6 = ipv6_hdr(skb)->daddr;
+ if (brmctx->multicast_mld_version == 2) {
+ struct net_bridge_mdb_entry *mdb;
+
+ ip.src.ip6 = ipv6_hdr(skb)->saddr;
+ mdb = br_mdb_ip_get_rcu(br, &ip);
+ if (mdb)
+ return mdb;
+ memset(&ip.src.ip6, 0, sizeof(ip.src.ip6));
+ }
+ break;
+#endif
+ default:
+ ip.proto = 0;
+ ether_addr_copy(ip.dst.mac_addr, eth_hdr(skb)->h_dest);
+ }
+
+ return br_mdb_ip_get_rcu(br, &ip);
+}
+
+/* IMPORTANT: this function must be used only when the contexts cannot be
+ * passed down (e.g. timer) and must be used for read-only purposes because
+ * the vlan snooping option can change, so it can return any context
+ * (non-vlan or vlan). Its initial intended purpose is to read timer values
+ * from the *current* context based on the option. At worst that could lead
+ * to inconsistent timers when the contexts are changed, i.e. src timer
+ * which needs to re-arm with a specific delay taken from the old context
+ */
+static struct net_bridge_mcast_port *
+br_multicast_pg_to_port_ctx(const struct net_bridge_port_group *pg)
+{
+ struct net_bridge_mcast_port *pmctx = &pg->key.port->multicast_ctx;
+ struct net_bridge_vlan *vlan;
+
+ lockdep_assert_held_once(&pg->key.port->br->multicast_lock);
+
+ /* if vlan snooping is disabled use the port's multicast context */
+ if (!pg->key.addr.vid ||
+ !br_opt_get(pg->key.port->br, BROPT_MCAST_VLAN_SNOOPING_ENABLED))
+ goto out;
+
+ /* locking is tricky here, due to different rules for multicast and
+ * vlans we need to take rcu to find the vlan and make sure it has
+ * the BR_VLFLAG_MCAST_ENABLED flag set, it can only change under
+ * multicast_lock which must be already held here, so the vlan's pmctx
+ * can safely be used on return
+ */
+ rcu_read_lock();
+ vlan = br_vlan_find(nbp_vlan_group_rcu(pg->key.port), pg->key.addr.vid);
+ if (vlan && !br_multicast_port_ctx_vlan_disabled(&vlan->port_mcast_ctx))
+ pmctx = &vlan->port_mcast_ctx;
+ else
+ pmctx = NULL;
+ rcu_read_unlock();
+out:
+ return pmctx;
+}
+
+static struct net_bridge_mcast_port *
+br_multicast_port_vid_to_port_ctx(struct net_bridge_port *port, u16 vid)
+{
+ struct net_bridge_mcast_port *pmctx = NULL;
+ struct net_bridge_vlan *vlan;
+
+ lockdep_assert_held_once(&port->br->multicast_lock);
+
+ if (!br_opt_get(port->br, BROPT_MCAST_VLAN_SNOOPING_ENABLED))
+ return NULL;
+
+ /* Take RCU to access the vlan. */
+ rcu_read_lock();
+
+ vlan = br_vlan_find(nbp_vlan_group_rcu(port), vid);
+ if (vlan && !br_multicast_port_ctx_vlan_disabled(&vlan->port_mcast_ctx))
+ pmctx = &vlan->port_mcast_ctx;
+
+ rcu_read_unlock();
+
+ return pmctx;
+}
+
+/* when snooping we need to check if the contexts should be used
+ * in the following order:
+ * - if pmctx is non-NULL (port), check if it should be used
+ * - if pmctx is NULL (bridge), check if brmctx should be used
+ */
+static bool
+br_multicast_ctx_should_use(const struct net_bridge_mcast *brmctx,
+ const struct net_bridge_mcast_port *pmctx)
+{
+ if (!netif_running(brmctx->br->dev))
+ return false;
+
+ if (pmctx)
+ return !br_multicast_port_ctx_state_disabled(pmctx);
+ else
+ return !br_multicast_ctx_vlan_disabled(brmctx);
+}
+
+static bool br_port_group_equal(struct net_bridge_port_group *p,
+ struct net_bridge_port *port,
+ const unsigned char *src)
+{
+ if (p->key.port != port)
+ return false;
+
+ if (!(port->flags & BR_MULTICAST_TO_UNICAST))
+ return true;
+
+ return ether_addr_equal(src, p->eth_addr);
+}
+
+static void __fwd_add_star_excl(struct net_bridge_mcast_port *pmctx,
+ struct net_bridge_port_group *pg,
+ struct br_ip *sg_ip)
+{
+ struct net_bridge_port_group_sg_key sg_key;
+ struct net_bridge_port_group *src_pg;
+ struct net_bridge_mcast *brmctx;
+
+ memset(&sg_key, 0, sizeof(sg_key));
+ brmctx = br_multicast_port_ctx_get_global(pmctx);
+ sg_key.port = pg->key.port;
+ sg_key.addr = *sg_ip;
+ if (br_sg_port_find(brmctx->br, &sg_key))
+ return;
+
+ src_pg = __br_multicast_add_group(brmctx, pmctx,
+ sg_ip, pg->eth_addr,
+ MCAST_INCLUDE, false, false);
+ if (IS_ERR_OR_NULL(src_pg) ||
+ src_pg->rt_protocol != RTPROT_KERNEL)
+ return;
+
+ src_pg->flags |= MDB_PG_FLAGS_STAR_EXCL;
+}
+
+static void __fwd_del_star_excl(struct net_bridge_port_group *pg,
+ struct br_ip *sg_ip)
+{
+ struct net_bridge_port_group_sg_key sg_key;
+ struct net_bridge *br = pg->key.port->br;
+ struct net_bridge_port_group *src_pg;
+
+ memset(&sg_key, 0, sizeof(sg_key));
+ sg_key.port = pg->key.port;
+ sg_key.addr = *sg_ip;
+ src_pg = br_sg_port_find(br, &sg_key);
+ if (!src_pg || !(src_pg->flags & MDB_PG_FLAGS_STAR_EXCL) ||
+ src_pg->rt_protocol != RTPROT_KERNEL)
+ return;
+
+ br_multicast_find_del_pg(br, src_pg);
+}
+
+/* When a port group transitions to (or is added as) EXCLUDE we need to add it
+ * to all other ports' S,G entries which are not blocked by the current group
+ * for proper replication, the assumption is that any S,G blocked entries
+ * are already added so the S,G,port lookup should skip them.
+ * When a port group transitions from EXCLUDE -> INCLUDE mode or is being
+ * deleted we need to remove it from all ports' S,G entries where it was
+ * automatically installed before (i.e. where it's MDB_PG_FLAGS_STAR_EXCL).
+ */
+void br_multicast_star_g_handle_mode(struct net_bridge_port_group *pg,
+ u8 filter_mode)
+{
+ struct net_bridge *br = pg->key.port->br;
+ struct net_bridge_port_group *pg_lst;
+ struct net_bridge_mcast_port *pmctx;
+ struct net_bridge_mdb_entry *mp;
+ struct br_ip sg_ip;
+
+ if (WARN_ON(!br_multicast_is_star_g(&pg->key.addr)))
+ return;
+
+ mp = br_mdb_ip_get(br, &pg->key.addr);
+ if (!mp)
+ return;
+ pmctx = br_multicast_pg_to_port_ctx(pg);
+ if (!pmctx)
+ return;
+
+ memset(&sg_ip, 0, sizeof(sg_ip));
+ sg_ip = pg->key.addr;
+
+ for (pg_lst = mlock_dereference(mp->ports, br);
+ pg_lst;
+ pg_lst = mlock_dereference(pg_lst->next, br)) {
+ struct net_bridge_group_src *src_ent;
+
+ if (pg_lst == pg)
+ continue;
+ hlist_for_each_entry(src_ent, &pg_lst->src_list, node) {
+ if (!(src_ent->flags & BR_SGRP_F_INSTALLED))
+ continue;
+ sg_ip.src = src_ent->addr.src;
+ switch (filter_mode) {
+ case MCAST_INCLUDE:
+ __fwd_del_star_excl(pg, &sg_ip);
+ break;
+ case MCAST_EXCLUDE:
+ __fwd_add_star_excl(pmctx, pg, &sg_ip);
+ break;
+ }
+ }
+ }
+}
+
+/* called when adding a new S,G with host_joined == false by default */
+static void br_multicast_sg_host_state(struct net_bridge_mdb_entry *star_mp,
+ struct net_bridge_port_group *sg)
+{
+ struct net_bridge_mdb_entry *sg_mp;
+
+ if (WARN_ON(!br_multicast_is_star_g(&star_mp->addr)))
+ return;
+ if (!star_mp->host_joined)
+ return;
+
+ sg_mp = br_mdb_ip_get(star_mp->br, &sg->key.addr);
+ if (!sg_mp)
+ return;
+ sg_mp->host_joined = true;
+}
+
+/* set the host_joined state of all of *,G's S,G entries */
+static void br_multicast_star_g_host_state(struct net_bridge_mdb_entry *star_mp)
+{
+ struct net_bridge *br = star_mp->br;
+ struct net_bridge_mdb_entry *sg_mp;
+ struct net_bridge_port_group *pg;
+ struct br_ip sg_ip;
+
+ if (WARN_ON(!br_multicast_is_star_g(&star_mp->addr)))
+ return;
+
+ memset(&sg_ip, 0, sizeof(sg_ip));
+ sg_ip = star_mp->addr;
+ for (pg = mlock_dereference(star_mp->ports, br);
+ pg;
+ pg = mlock_dereference(pg->next, br)) {
+ struct net_bridge_group_src *src_ent;
+
+ hlist_for_each_entry(src_ent, &pg->src_list, node) {
+ if (!(src_ent->flags & BR_SGRP_F_INSTALLED))
+ continue;
+ sg_ip.src = src_ent->addr.src;
+ sg_mp = br_mdb_ip_get(br, &sg_ip);
+ if (!sg_mp)
+ continue;
+ sg_mp->host_joined = star_mp->host_joined;
+ }
+ }
+}
+
+static void br_multicast_sg_del_exclude_ports(struct net_bridge_mdb_entry *sgmp)
+{
+ struct net_bridge_port_group __rcu **pp;
+ struct net_bridge_port_group *p;
+
+ /* *,G exclude ports are only added to S,G entries */
+ if (WARN_ON(br_multicast_is_star_g(&sgmp->addr)))
+ return;
+
+ /* we need the STAR_EXCLUDE ports if there are non-STAR_EXCLUDE ports
+ * we should ignore perm entries since they're managed by user-space
+ */
+ for (pp = &sgmp->ports;
+ (p = mlock_dereference(*pp, sgmp->br)) != NULL;
+ pp = &p->next)
+ if (!(p->flags & (MDB_PG_FLAGS_STAR_EXCL |
+ MDB_PG_FLAGS_PERMANENT)))
+ return;
+
+ /* currently the host can only have joined the *,G which means
+ * we treat it as EXCLUDE {}, so for an S,G it's considered a
+ * STAR_EXCLUDE entry and we can safely leave it
+ */
+ sgmp->host_joined = false;
+
+ for (pp = &sgmp->ports;
+ (p = mlock_dereference(*pp, sgmp->br)) != NULL;) {
+ if (!(p->flags & MDB_PG_FLAGS_PERMANENT))
+ br_multicast_del_pg(sgmp, p, pp);
+ else
+ pp = &p->next;
+ }
+}
+
+void br_multicast_sg_add_exclude_ports(struct net_bridge_mdb_entry *star_mp,
+ struct net_bridge_port_group *sg)
+{
+ struct net_bridge_port_group_sg_key sg_key;
+ struct net_bridge *br = star_mp->br;
+ struct net_bridge_mcast_port *pmctx;
+ struct net_bridge_port_group *pg;
+ struct net_bridge_mcast *brmctx;
+
+ if (WARN_ON(br_multicast_is_star_g(&sg->key.addr)))
+ return;
+ if (WARN_ON(!br_multicast_is_star_g(&star_mp->addr)))
+ return;
+
+ br_multicast_sg_host_state(star_mp, sg);
+ memset(&sg_key, 0, sizeof(sg_key));
+ sg_key.addr = sg->key.addr;
+ /* we need to add all exclude ports to the S,G */
+ for (pg = mlock_dereference(star_mp->ports, br);
+ pg;
+ pg = mlock_dereference(pg->next, br)) {
+ struct net_bridge_port_group *src_pg;
+
+ if (pg == sg || pg->filter_mode == MCAST_INCLUDE)
+ continue;
+
+ sg_key.port = pg->key.port;
+ if (br_sg_port_find(br, &sg_key))
+ continue;
+
+ pmctx = br_multicast_pg_to_port_ctx(pg);
+ if (!pmctx)
+ continue;
+ brmctx = br_multicast_port_ctx_get_global(pmctx);
+
+ src_pg = __br_multicast_add_group(brmctx, pmctx,
+ &sg->key.addr,
+ sg->eth_addr,
+ MCAST_INCLUDE, false, false);
+ if (IS_ERR_OR_NULL(src_pg) ||
+ src_pg->rt_protocol != RTPROT_KERNEL)
+ continue;
+ src_pg->flags |= MDB_PG_FLAGS_STAR_EXCL;
+ }
+}
+
+static void br_multicast_fwd_src_add(struct net_bridge_group_src *src)
+{
+ struct net_bridge_mdb_entry *star_mp;
+ struct net_bridge_mcast_port *pmctx;
+ struct net_bridge_port_group *sg;
+ struct net_bridge_mcast *brmctx;
+ struct br_ip sg_ip;
+
+ if (src->flags & BR_SGRP_F_INSTALLED)
+ return;
+
+ memset(&sg_ip, 0, sizeof(sg_ip));
+ pmctx = br_multicast_pg_to_port_ctx(src->pg);
+ if (!pmctx)
+ return;
+ brmctx = br_multicast_port_ctx_get_global(pmctx);
+ sg_ip = src->pg->key.addr;
+ sg_ip.src = src->addr.src;
+
+ sg = __br_multicast_add_group(brmctx, pmctx, &sg_ip,
+ src->pg->eth_addr, MCAST_INCLUDE, false,
+ !timer_pending(&src->timer));
+ if (IS_ERR_OR_NULL(sg))
+ return;
+ src->flags |= BR_SGRP_F_INSTALLED;
+ sg->flags &= ~MDB_PG_FLAGS_STAR_EXCL;
+
+ /* if it was added by user-space as perm we can skip next steps */
+ if (sg->rt_protocol != RTPROT_KERNEL &&
+ (sg->flags & MDB_PG_FLAGS_PERMANENT))
+ return;
+
+ /* the kernel is now responsible for removing this S,G */
+ timer_delete(&sg->timer);
+ star_mp = br_mdb_ip_get(src->br, &src->pg->key.addr);
+ if (!star_mp)
+ return;
+
+ br_multicast_sg_add_exclude_ports(star_mp, sg);
+}
+
+static void br_multicast_fwd_src_remove(struct net_bridge_group_src *src,
+ bool fastleave)
+{
+ struct net_bridge_port_group *p, *pg = src->pg;
+ struct net_bridge_port_group __rcu **pp;
+ struct net_bridge_mdb_entry *mp;
+ struct br_ip sg_ip;
+
+ memset(&sg_ip, 0, sizeof(sg_ip));
+ sg_ip = pg->key.addr;
+ sg_ip.src = src->addr.src;
+
+ mp = br_mdb_ip_get(src->br, &sg_ip);
+ if (!mp)
+ return;
+
+ for (pp = &mp->ports;
+ (p = mlock_dereference(*pp, src->br)) != NULL;
+ pp = &p->next) {
+ if (!br_port_group_equal(p, pg->key.port, pg->eth_addr))
+ continue;
+
+ if (p->rt_protocol != RTPROT_KERNEL &&
+ (p->flags & MDB_PG_FLAGS_PERMANENT) &&
+ !(src->flags & BR_SGRP_F_USER_ADDED))
+ break;
+
+ if (fastleave)
+ p->flags |= MDB_PG_FLAGS_FAST_LEAVE;
+ br_multicast_del_pg(mp, p, pp);
+ break;
+ }
+ src->flags &= ~BR_SGRP_F_INSTALLED;
+}
+
+/* install S,G and based on src's timer enable or disable forwarding */
+static void br_multicast_fwd_src_handle(struct net_bridge_group_src *src)
+{
+ struct net_bridge_port_group_sg_key sg_key;
+ struct net_bridge_port_group *sg;
+ u8 old_flags;
+
+ br_multicast_fwd_src_add(src);
+
+ memset(&sg_key, 0, sizeof(sg_key));
+ sg_key.addr = src->pg->key.addr;
+ sg_key.addr.src = src->addr.src;
+ sg_key.port = src->pg->key.port;
+
+ sg = br_sg_port_find(src->br, &sg_key);
+ if (!sg || (sg->flags & MDB_PG_FLAGS_PERMANENT))
+ return;
+
+ old_flags = sg->flags;
+ if (timer_pending(&src->timer))
+ sg->flags &= ~MDB_PG_FLAGS_BLOCKED;
+ else
+ sg->flags |= MDB_PG_FLAGS_BLOCKED;
+
+ if (old_flags != sg->flags) {
+ struct net_bridge_mdb_entry *sg_mp;
+
+ sg_mp = br_mdb_ip_get(src->br, &sg_key.addr);
+ if (!sg_mp)
+ return;
+ br_mdb_notify(src->br->dev, sg_mp, sg, RTM_NEWMDB);
+ }
+}
+
+static void br_multicast_destroy_mdb_entry(struct net_bridge_mcast_gc *gc)
+{
+ struct net_bridge_mdb_entry *mp;
+
+ mp = container_of(gc, struct net_bridge_mdb_entry, mcast_gc);
+ WARN_ON(!hlist_unhashed(&mp->mdb_node));
+ WARN_ON(mp->ports);
+
+ timer_shutdown_sync(&mp->timer);
+ kfree_rcu(mp, rcu);
+}
+
+static void br_multicast_del_mdb_entry(struct net_bridge_mdb_entry *mp)
+{
+ struct net_bridge *br = mp->br;
+
+ rhashtable_remove_fast(&br->mdb_hash_tbl, &mp->rhnode,
+ br_mdb_rht_params);
+ hlist_del_init_rcu(&mp->mdb_node);
+ hlist_add_head(&mp->mcast_gc.gc_node, &br->mcast_gc_list);
+ queue_work(system_long_wq, &br->mcast_gc_work);
+}
+
+static void br_multicast_group_expired(struct timer_list *t)
+{
+ struct net_bridge_mdb_entry *mp = timer_container_of(mp, t, timer);
+ struct net_bridge *br = mp->br;
+
+ spin_lock(&br->multicast_lock);
+ if (hlist_unhashed(&mp->mdb_node) || !netif_running(br->dev) ||
+ timer_pending(&mp->timer))
+ goto out;
+
+ br_multicast_host_leave(mp, true);
+
+ if (mp->ports)
+ goto out;
+ br_multicast_del_mdb_entry(mp);
+out:
+ spin_unlock(&br->multicast_lock);
+}
+
+static void br_multicast_destroy_group_src(struct net_bridge_mcast_gc *gc)
+{
+ struct net_bridge_group_src *src;
+
+ src = container_of(gc, struct net_bridge_group_src, mcast_gc);
+ WARN_ON(!hlist_unhashed(&src->node));
+
+ timer_shutdown_sync(&src->timer);
+ kfree_rcu(src, rcu);
+}
+
+void __br_multicast_del_group_src(struct net_bridge_group_src *src)
+{
+ struct net_bridge *br = src->pg->key.port->br;
+
+ hlist_del_init_rcu(&src->node);
+ src->pg->src_ents--;
+ hlist_add_head(&src->mcast_gc.gc_node, &br->mcast_gc_list);
+ queue_work(system_long_wq, &br->mcast_gc_work);
+}
+
+void br_multicast_del_group_src(struct net_bridge_group_src *src,
+ bool fastleave)
+{
+ br_multicast_fwd_src_remove(src, fastleave);
+ __br_multicast_del_group_src(src);
+}
+
+static int
+br_multicast_port_ngroups_inc_one(struct net_bridge_mcast_port *pmctx,
+ struct netlink_ext_ack *extack,
+ const char *what)
+{
+ u32 max = READ_ONCE(pmctx->mdb_max_entries);
+ u32 n = READ_ONCE(pmctx->mdb_n_entries);
+
+ if (max && n >= max) {
+ NL_SET_ERR_MSG_FMT_MOD(extack, "%s is already in %u groups, and mcast_max_groups=%u",
+ what, n, max);
+ return -E2BIG;
+ }
+
+ WRITE_ONCE(pmctx->mdb_n_entries, n + 1);
+ return 0;
+}
+
+static void br_multicast_port_ngroups_dec_one(struct net_bridge_mcast_port *pmctx)
+{
+ u32 n = READ_ONCE(pmctx->mdb_n_entries);
+
+ WARN_ON_ONCE(n == 0);
+ WRITE_ONCE(pmctx->mdb_n_entries, n - 1);
+}
+
+static int br_multicast_port_ngroups_inc(struct net_bridge_port *port,
+ const struct br_ip *group,
+ struct netlink_ext_ack *extack)
+{
+ struct net_bridge_mcast_port *pmctx;
+ int err;
+
+ lockdep_assert_held_once(&port->br->multicast_lock);
+
+ /* Always count on the port context. */
+ err = br_multicast_port_ngroups_inc_one(&port->multicast_ctx, extack,
+ "Port");
+ if (err) {
+ trace_br_mdb_full(port->dev, group);
+ return err;
+ }
+
+ /* Only count on the VLAN context if VID is given, and if snooping on
+ * that VLAN is enabled.
+ */
+ if (!group->vid)
+ return 0;
+
+ pmctx = br_multicast_port_vid_to_port_ctx(port, group->vid);
+ if (!pmctx)
+ return 0;
+
+ err = br_multicast_port_ngroups_inc_one(pmctx, extack, "Port-VLAN");
+ if (err) {
+ trace_br_mdb_full(port->dev, group);
+ goto dec_one_out;
+ }
+
+ return 0;
+
+dec_one_out:
+ br_multicast_port_ngroups_dec_one(&port->multicast_ctx);
+ return err;
+}
+
+static void br_multicast_port_ngroups_dec(struct net_bridge_port *port, u16 vid)
+{
+ struct net_bridge_mcast_port *pmctx;
+
+ lockdep_assert_held_once(&port->br->multicast_lock);
+
+ if (vid) {
+ pmctx = br_multicast_port_vid_to_port_ctx(port, vid);
+ if (pmctx)
+ br_multicast_port_ngroups_dec_one(pmctx);
+ }
+ br_multicast_port_ngroups_dec_one(&port->multicast_ctx);
+}
+
+u32 br_multicast_ngroups_get(const struct net_bridge_mcast_port *pmctx)
+{
+ return READ_ONCE(pmctx->mdb_n_entries);
+}
+
+void br_multicast_ngroups_set_max(struct net_bridge_mcast_port *pmctx, u32 max)
+{
+ WRITE_ONCE(pmctx->mdb_max_entries, max);
+}
+
+u32 br_multicast_ngroups_get_max(const struct net_bridge_mcast_port *pmctx)
+{
+ return READ_ONCE(pmctx->mdb_max_entries);
+}
+
+static void br_multicast_destroy_port_group(struct net_bridge_mcast_gc *gc)
+{
+ struct net_bridge_port_group *pg;
+
+ pg = container_of(gc, struct net_bridge_port_group, mcast_gc);
+ WARN_ON(!hlist_unhashed(&pg->mglist));
+ WARN_ON(!hlist_empty(&pg->src_list));
+
+ timer_shutdown_sync(&pg->rexmit_timer);
+ timer_shutdown_sync(&pg->timer);
+ kfree_rcu(pg, rcu);
+}
+
+void br_multicast_del_pg(struct net_bridge_mdb_entry *mp,
+ struct net_bridge_port_group *pg,
+ struct net_bridge_port_group __rcu **pp)
+{
+ struct net_bridge *br = pg->key.port->br;
+ struct net_bridge_group_src *ent;
+ struct hlist_node *tmp;
+
+ rcu_assign_pointer(*pp, pg->next);
+ hlist_del_init(&pg->mglist);
+ br_multicast_eht_clean_sets(pg);
+ hlist_for_each_entry_safe(ent, tmp, &pg->src_list, node)
+ br_multicast_del_group_src(ent, false);
+ br_mdb_notify(br->dev, mp, pg, RTM_DELMDB);
+ if (!br_multicast_is_star_g(&mp->addr)) {
+ rhashtable_remove_fast(&br->sg_port_tbl, &pg->rhnode,
+ br_sg_port_rht_params);
+ br_multicast_sg_del_exclude_ports(mp);
+ } else {
+ br_multicast_star_g_handle_mode(pg, MCAST_INCLUDE);
+ }
+ br_multicast_port_ngroups_dec(pg->key.port, pg->key.addr.vid);
+ hlist_add_head(&pg->mcast_gc.gc_node, &br->mcast_gc_list);
+ queue_work(system_long_wq, &br->mcast_gc_work);
+
+ if (!mp->ports && !mp->host_joined && netif_running(br->dev))
+ mod_timer(&mp->timer, jiffies);
+}
+
+static void br_multicast_find_del_pg(struct net_bridge *br,
+ struct net_bridge_port_group *pg)
+{
+ struct net_bridge_port_group __rcu **pp;
+ struct net_bridge_mdb_entry *mp;
+ struct net_bridge_port_group *p;
+
+ mp = br_mdb_ip_get(br, &pg->key.addr);
+ if (WARN_ON(!mp))
+ return;
+
+ for (pp = &mp->ports;
+ (p = mlock_dereference(*pp, br)) != NULL;
+ pp = &p->next) {
+ if (p != pg)
+ continue;
+
+ br_multicast_del_pg(mp, pg, pp);
+ return;
+ }
+
+ WARN_ON(1);
+}
+
+static void br_multicast_port_group_expired(struct timer_list *t)
+{
+ struct net_bridge_port_group *pg = timer_container_of(pg, t, timer);
+ struct net_bridge_group_src *src_ent;
+ struct net_bridge *br = pg->key.port->br;
+ struct hlist_node *tmp;
+ bool changed;
+
+ spin_lock(&br->multicast_lock);
+ if (!netif_running(br->dev) || timer_pending(&pg->timer) ||
+ hlist_unhashed(&pg->mglist) || pg->flags & MDB_PG_FLAGS_PERMANENT)
+ goto out;
+
+ changed = !!(pg->filter_mode == MCAST_EXCLUDE);
+ pg->filter_mode = MCAST_INCLUDE;
+ hlist_for_each_entry_safe(src_ent, tmp, &pg->src_list, node) {
+ if (!timer_pending(&src_ent->timer)) {
+ br_multicast_del_group_src(src_ent, false);
+ changed = true;
+ }
+ }
+
+ if (hlist_empty(&pg->src_list)) {
+ br_multicast_find_del_pg(br, pg);
+ } else if (changed) {
+ struct net_bridge_mdb_entry *mp = br_mdb_ip_get(br, &pg->key.addr);
+
+ if (changed && br_multicast_is_star_g(&pg->key.addr))
+ br_multicast_star_g_handle_mode(pg, MCAST_INCLUDE);
+
+ if (WARN_ON(!mp))
+ goto out;
+ br_mdb_notify(br->dev, mp, pg, RTM_NEWMDB);
+ }
+out:
+ spin_unlock(&br->multicast_lock);
+}
+
+static void br_multicast_gc(struct hlist_head *head)
+{
+ struct net_bridge_mcast_gc *gcent;
+ struct hlist_node *tmp;
+
+ hlist_for_each_entry_safe(gcent, tmp, head, gc_node) {
+ hlist_del_init(&gcent->gc_node);
+ gcent->destroy(gcent);
+ }
+}
+
+static void __br_multicast_query_handle_vlan(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
+ struct sk_buff *skb)
+{
+ struct net_bridge_vlan *vlan = NULL;
+
+ if (pmctx && br_multicast_port_ctx_is_vlan(pmctx))
+ vlan = pmctx->vlan;
+ else if (br_multicast_ctx_is_vlan(brmctx))
+ vlan = brmctx->vlan;
+
+ if (vlan && !(vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED)) {
+ u16 vlan_proto;
+
+ if (br_vlan_get_proto(brmctx->br->dev, &vlan_proto) != 0)
+ return;
+ __vlan_hwaccel_put_tag(skb, htons(vlan_proto), vlan->vid);
+ }
+}
+
+static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
+ struct net_bridge_port_group *pg,
+ __be32 ip_dst, __be32 group,
+ bool with_srcs, bool over_lmqt,
+ u8 sflag, u8 *igmp_type,
+ bool *need_rexmit)
+{
+ struct net_bridge_port *p = pg ? pg->key.port : NULL;
+ struct net_bridge_group_src *ent;
+ size_t pkt_size, igmp_hdr_size;
+ unsigned long now = jiffies;
+ struct igmpv3_query *ihv3;
+ void *csum_start = NULL;
+ __sum16 *csum = NULL;
+ struct sk_buff *skb;
+ struct igmphdr *ih;
+ struct ethhdr *eth;
+ unsigned long lmqt;
+ struct iphdr *iph;
+ u16 lmqt_srcs = 0;
+
+ igmp_hdr_size = sizeof(*ih);
+ if (brmctx->multicast_igmp_version == 3) {
+ igmp_hdr_size = sizeof(*ihv3);
+ if (pg && with_srcs) {
+ lmqt = now + (brmctx->multicast_last_member_interval *
+ brmctx->multicast_last_member_count);
+ hlist_for_each_entry(ent, &pg->src_list, node) {
+ if (over_lmqt == time_after(ent->timer.expires,
+ lmqt) &&
+ ent->src_query_rexmit_cnt > 0)
+ lmqt_srcs++;
+ }
+
+ if (!lmqt_srcs)
+ return NULL;
+ igmp_hdr_size += lmqt_srcs * sizeof(__be32);
+ }
+ }
+
+ pkt_size = sizeof(*eth) + sizeof(*iph) + 4 + igmp_hdr_size;
+ if ((p && pkt_size > p->dev->mtu) ||
+ pkt_size > brmctx->br->dev->mtu)
+ return NULL;
+
+ skb = netdev_alloc_skb_ip_align(brmctx->br->dev, pkt_size);
+ if (!skb)
+ goto out;
+
+ __br_multicast_query_handle_vlan(brmctx, pmctx, skb);
+ skb->protocol = htons(ETH_P_IP);
+
+ skb_reset_mac_header(skb);
+ eth = eth_hdr(skb);
+
+ ether_addr_copy(eth->h_source, brmctx->br->dev->dev_addr);
+ ip_eth_mc_map(ip_dst, eth->h_dest);
+ eth->h_proto = htons(ETH_P_IP);
+ skb_put(skb, sizeof(*eth));
+
+ skb_set_network_header(skb, skb->len);
+ iph = ip_hdr(skb);
+ iph->tot_len = htons(pkt_size - sizeof(*eth));
+
+ iph->version = 4;
+ iph->ihl = 6;
+ iph->tos = 0xc0;
+ iph->id = 0;
+ iph->frag_off = htons(IP_DF);
+ iph->ttl = 1;
+ iph->protocol = IPPROTO_IGMP;
+ iph->saddr = br_opt_get(brmctx->br, BROPT_MULTICAST_QUERY_USE_IFADDR) ?
+ inet_select_addr(brmctx->br->dev, 0, RT_SCOPE_LINK) : 0;
+ iph->daddr = ip_dst;
+ ((u8 *)&iph[1])[0] = IPOPT_RA;
+ ((u8 *)&iph[1])[1] = 4;
+ ((u8 *)&iph[1])[2] = 0;
+ ((u8 *)&iph[1])[3] = 0;
+ ip_send_check(iph);
+ skb_put(skb, 24);
+
+ skb_set_transport_header(skb, skb->len);
+ *igmp_type = IGMP_HOST_MEMBERSHIP_QUERY;
+
+ switch (brmctx->multicast_igmp_version) {
+ case 2:
+ ih = igmp_hdr(skb);
+ ih->type = IGMP_HOST_MEMBERSHIP_QUERY;
+ ih->code = (group ? brmctx->multicast_last_member_interval :
+ brmctx->multicast_query_response_interval) /
+ (HZ / IGMP_TIMER_SCALE);
+ ih->group = group;
+ ih->csum = 0;
+ csum = &ih->csum;
+ csum_start = (void *)ih;
+ break;
+ case 3:
+ ihv3 = igmpv3_query_hdr(skb);
+ ihv3->type = IGMP_HOST_MEMBERSHIP_QUERY;
+ ihv3->code = (group ? brmctx->multicast_last_member_interval :
+ brmctx->multicast_query_response_interval) /
+ (HZ / IGMP_TIMER_SCALE);
+ ihv3->group = group;
+ ihv3->qqic = brmctx->multicast_query_interval / HZ;
+ ihv3->nsrcs = htons(lmqt_srcs);
+ ihv3->resv = 0;
+ ihv3->suppress = sflag;
+ ihv3->qrv = 2;
+ ihv3->csum = 0;
+ csum = &ihv3->csum;
+ csum_start = (void *)ihv3;
+ if (!pg || !with_srcs)
+ break;
+
+ lmqt_srcs = 0;
+ hlist_for_each_entry(ent, &pg->src_list, node) {
+ if (over_lmqt == time_after(ent->timer.expires,
+ lmqt) &&
+ ent->src_query_rexmit_cnt > 0) {
+ ihv3->srcs[lmqt_srcs++] = ent->addr.src.ip4;
+ ent->src_query_rexmit_cnt--;
+ if (need_rexmit && ent->src_query_rexmit_cnt)
+ *need_rexmit = true;
+ }
+ }
+ if (WARN_ON(lmqt_srcs != ntohs(ihv3->nsrcs))) {
+ kfree_skb(skb);
+ return NULL;
+ }
+ break;
+ }
+
+ if (WARN_ON(!csum || !csum_start)) {
+ kfree_skb(skb);
+ return NULL;
+ }
+
+ *csum = ip_compute_csum(csum_start, igmp_hdr_size);
+ skb_put(skb, igmp_hdr_size);
+ __skb_pull(skb, sizeof(*eth));
+
+out:
+ return skb;
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
+ struct net_bridge_port_group *pg,
+ const struct in6_addr *ip6_dst,
+ const struct in6_addr *group,
+ bool with_srcs, bool over_llqt,
+ u8 sflag, u8 *igmp_type,
+ bool *need_rexmit)
+{
+ struct net_bridge_port *p = pg ? pg->key.port : NULL;
+ struct net_bridge_group_src *ent;
+ size_t pkt_size, mld_hdr_size;
+ unsigned long now = jiffies;
+ struct mld2_query *mld2q;
+ void *csum_start = NULL;
+ unsigned long interval;
+ __sum16 *csum = NULL;
+ struct ipv6hdr *ip6h;
+ struct mld_msg *mldq;
+ struct sk_buff *skb;
+ unsigned long llqt;
+ struct ethhdr *eth;
+ u16 llqt_srcs = 0;
+ u8 *hopopt;
+
+ mld_hdr_size = sizeof(*mldq);
+ if (brmctx->multicast_mld_version == 2) {
+ mld_hdr_size = sizeof(*mld2q);
+ if (pg && with_srcs) {
+ llqt = now + (brmctx->multicast_last_member_interval *
+ brmctx->multicast_last_member_count);
+ hlist_for_each_entry(ent, &pg->src_list, node) {
+ if (over_llqt == time_after(ent->timer.expires,
+ llqt) &&
+ ent->src_query_rexmit_cnt > 0)
+ llqt_srcs++;
+ }
+
+ if (!llqt_srcs)
+ return NULL;
+ mld_hdr_size += llqt_srcs * sizeof(struct in6_addr);
+ }
+ }
+
+ pkt_size = sizeof(*eth) + sizeof(*ip6h) + 8 + mld_hdr_size;
+ if ((p && pkt_size > p->dev->mtu) ||
+ pkt_size > brmctx->br->dev->mtu)
+ return NULL;
+
+ skb = netdev_alloc_skb_ip_align(brmctx->br->dev, pkt_size);
+ if (!skb)
+ goto out;
+
+ __br_multicast_query_handle_vlan(brmctx, pmctx, skb);
+ skb->protocol = htons(ETH_P_IPV6);
+
+ /* Ethernet header */
+ skb_reset_mac_header(skb);
+ eth = eth_hdr(skb);
+
+ ether_addr_copy(eth->h_source, brmctx->br->dev->dev_addr);
+ eth->h_proto = htons(ETH_P_IPV6);
+ skb_put(skb, sizeof(*eth));
+
+ /* IPv6 header + HbH option */
+ skb_set_network_header(skb, skb->len);
+ ip6h = ipv6_hdr(skb);
+
+ *(__force __be32 *)ip6h = htonl(0x60000000);
+ ip6h->payload_len = htons(8 + mld_hdr_size);
+ ip6h->nexthdr = IPPROTO_HOPOPTS;
+ ip6h->hop_limit = 1;
+ ip6h->daddr = *ip6_dst;
+ if (ipv6_dev_get_saddr(dev_net(brmctx->br->dev), brmctx->br->dev,
+ &ip6h->daddr, 0, &ip6h->saddr)) {
+ kfree_skb(skb);
+ br_opt_toggle(brmctx->br, BROPT_HAS_IPV6_ADDR, false);
+ return NULL;
+ }
+
+ br_opt_toggle(brmctx->br, BROPT_HAS_IPV6_ADDR, true);
+ ipv6_eth_mc_map(&ip6h->daddr, eth->h_dest);
+
+ hopopt = (u8 *)(ip6h + 1);
+ hopopt[0] = IPPROTO_ICMPV6; /* next hdr */
+ hopopt[1] = 0; /* length of HbH */
+ hopopt[2] = IPV6_TLV_ROUTERALERT; /* Router Alert */
+ hopopt[3] = 2; /* Length of RA Option */
+ hopopt[4] = 0; /* Type = 0x0000 (MLD) */
+ hopopt[5] = 0;
+ hopopt[6] = IPV6_TLV_PAD1; /* Pad1 */
+ hopopt[7] = IPV6_TLV_PAD1; /* Pad1 */
+
+ skb_put(skb, sizeof(*ip6h) + 8);
+
+ /* ICMPv6 */
+ skb_set_transport_header(skb, skb->len);
+ interval = ipv6_addr_any(group) ?
+ brmctx->multicast_query_response_interval :
+ brmctx->multicast_last_member_interval;
+ *igmp_type = ICMPV6_MGM_QUERY;
+ switch (brmctx->multicast_mld_version) {
+ case 1:
+ mldq = (struct mld_msg *)icmp6_hdr(skb);
+ mldq->mld_type = ICMPV6_MGM_QUERY;
+ mldq->mld_code = 0;
+ mldq->mld_cksum = 0;
+ mldq->mld_maxdelay = htons((u16)jiffies_to_msecs(interval));
+ mldq->mld_reserved = 0;
+ mldq->mld_mca = *group;
+ csum = &mldq->mld_cksum;
+ csum_start = (void *)mldq;
+ break;
+ case 2:
+ mld2q = (struct mld2_query *)icmp6_hdr(skb);
+ mld2q->mld2q_mrc = htons((u16)jiffies_to_msecs(interval));
+ mld2q->mld2q_type = ICMPV6_MGM_QUERY;
+ mld2q->mld2q_code = 0;
+ mld2q->mld2q_cksum = 0;
+ mld2q->mld2q_resv1 = 0;
+ mld2q->mld2q_resv2 = 0;
+ mld2q->mld2q_suppress = sflag;
+ mld2q->mld2q_qrv = 2;
+ mld2q->mld2q_nsrcs = htons(llqt_srcs);
+ mld2q->mld2q_qqic = brmctx->multicast_query_interval / HZ;
+ mld2q->mld2q_mca = *group;
+ csum = &mld2q->mld2q_cksum;
+ csum_start = (void *)mld2q;
+ if (!pg || !with_srcs)
+ break;
+
+ llqt_srcs = 0;
+ hlist_for_each_entry(ent, &pg->src_list, node) {
+ if (over_llqt == time_after(ent->timer.expires,
+ llqt) &&
+ ent->src_query_rexmit_cnt > 0) {
+ mld2q->mld2q_srcs[llqt_srcs++] = ent->addr.src.ip6;
+ ent->src_query_rexmit_cnt--;
+ if (need_rexmit && ent->src_query_rexmit_cnt)
+ *need_rexmit = true;
+ }
+ }
+ if (WARN_ON(llqt_srcs != ntohs(mld2q->mld2q_nsrcs))) {
+ kfree_skb(skb);
+ return NULL;
+ }
+ break;
+ }
+
+ if (WARN_ON(!csum || !csum_start)) {
+ kfree_skb(skb);
+ return NULL;
+ }
+
+ *csum = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, mld_hdr_size,
+ IPPROTO_ICMPV6,
+ csum_partial(csum_start, mld_hdr_size, 0));
+ skb_put(skb, mld_hdr_size);
+ __skb_pull(skb, sizeof(*eth));
+
+out:
+ return skb;
+}
+#endif
+
+static struct sk_buff *br_multicast_alloc_query(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
+ struct net_bridge_port_group *pg,
+ struct br_ip *ip_dst,
+ struct br_ip *group,
+ bool with_srcs, bool over_lmqt,
+ u8 sflag, u8 *igmp_type,
+ bool *need_rexmit)
+{
+ __be32 ip4_dst;
+
+ switch (group->proto) {
+ case htons(ETH_P_IP):
+ ip4_dst = ip_dst ? ip_dst->dst.ip4 : htonl(INADDR_ALLHOSTS_GROUP);
+ return br_ip4_multicast_alloc_query(brmctx, pmctx, pg,
+ ip4_dst, group->dst.ip4,
+ with_srcs, over_lmqt,
+ sflag, igmp_type,
+ need_rexmit);
+#if IS_ENABLED(CONFIG_IPV6)
+ case htons(ETH_P_IPV6): {
+ struct in6_addr ip6_dst;
+
+ if (ip_dst)
+ ip6_dst = ip_dst->dst.ip6;
+ else
+ ipv6_addr_set(&ip6_dst, htonl(0xff020000), 0, 0,
+ htonl(1));
+
+ return br_ip6_multicast_alloc_query(brmctx, pmctx, pg,
+ &ip6_dst, &group->dst.ip6,
+ with_srcs, over_lmqt,
+ sflag, igmp_type,
+ need_rexmit);
+ }
+#endif
+ }
+ return NULL;
+}
+
+struct net_bridge_mdb_entry *br_multicast_new_group(struct net_bridge *br,
+ struct br_ip *group)
+{
+ struct net_bridge_mdb_entry *mp;
+ int err;
+
+ mp = br_mdb_ip_get(br, group);
+ if (mp)
+ return mp;
+
+ if (atomic_read(&br->mdb_hash_tbl.nelems) >= br->hash_max) {
+ trace_br_mdb_full(br->dev, group);
+ br_mc_disabled_update(br->dev, false, NULL);
+ br_opt_toggle(br, BROPT_MULTICAST_ENABLED, false);
+ return ERR_PTR(-E2BIG);
+ }
+
+ mp = kzalloc(sizeof(*mp), GFP_ATOMIC);
+ if (unlikely(!mp))
+ return ERR_PTR(-ENOMEM);
+
+ mp->br = br;
+ mp->addr = *group;
+ mp->mcast_gc.destroy = br_multicast_destroy_mdb_entry;
+ timer_setup(&mp->timer, br_multicast_group_expired, 0);
+ err = rhashtable_lookup_insert_fast(&br->mdb_hash_tbl, &mp->rhnode,
+ br_mdb_rht_params);
+ if (err) {
+ kfree(mp);
+ mp = ERR_PTR(err);
+ } else {
+ hlist_add_head_rcu(&mp->mdb_node, &br->mdb_list);
+ }
+
+ return mp;
+}
+
+static void br_multicast_group_src_expired(struct timer_list *t)
+{
+ struct net_bridge_group_src *src = timer_container_of(src, t, timer);
+ struct net_bridge_port_group *pg;
+ struct net_bridge *br = src->br;
+
+ spin_lock(&br->multicast_lock);
+ if (hlist_unhashed(&src->node) || !netif_running(br->dev) ||
+ timer_pending(&src->timer))
+ goto out;
+
+ pg = src->pg;
+ if (pg->filter_mode == MCAST_INCLUDE) {
+ br_multicast_del_group_src(src, false);
+ if (!hlist_empty(&pg->src_list))
+ goto out;
+ br_multicast_find_del_pg(br, pg);
+ } else {
+ br_multicast_fwd_src_handle(src);
+ }
+
+out:
+ spin_unlock(&br->multicast_lock);
+}
+
+struct net_bridge_group_src *
+br_multicast_find_group_src(struct net_bridge_port_group *pg, struct br_ip *ip)
+{
+ struct net_bridge_group_src *ent;
+
+ switch (ip->proto) {
+ case htons(ETH_P_IP):
+ hlist_for_each_entry(ent, &pg->src_list, node)
+ if (ip->src.ip4 == ent->addr.src.ip4)
+ return ent;
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case htons(ETH_P_IPV6):
+ hlist_for_each_entry(ent, &pg->src_list, node)
+ if (!ipv6_addr_cmp(&ent->addr.src.ip6, &ip->src.ip6))
+ return ent;
+ break;
+#endif
+ }
+
+ return NULL;
+}
+
+struct net_bridge_group_src *
+br_multicast_new_group_src(struct net_bridge_port_group *pg, struct br_ip *src_ip)
+{
+ struct net_bridge_group_src *grp_src;
+
+ if (unlikely(pg->src_ents >= PG_SRC_ENT_LIMIT))
+ return NULL;
+
+ switch (src_ip->proto) {
+ case htons(ETH_P_IP):
+ if (ipv4_is_zeronet(src_ip->src.ip4) ||
+ ipv4_is_multicast(src_ip->src.ip4))
+ return NULL;
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case htons(ETH_P_IPV6):
+ if (ipv6_addr_any(&src_ip->src.ip6) ||
+ ipv6_addr_is_multicast(&src_ip->src.ip6))
+ return NULL;
+ break;
+#endif
+ }
+
+ grp_src = kzalloc(sizeof(*grp_src), GFP_ATOMIC);
+ if (unlikely(!grp_src))
+ return NULL;
+
+ grp_src->pg = pg;
+ grp_src->br = pg->key.port->br;
+ grp_src->addr = *src_ip;
+ grp_src->mcast_gc.destroy = br_multicast_destroy_group_src;
+ timer_setup(&grp_src->timer, br_multicast_group_src_expired, 0);
+
+ hlist_add_head_rcu(&grp_src->node, &pg->src_list);
+ pg->src_ents++;
+
+ return grp_src;
+}
+
+struct net_bridge_port_group *br_multicast_new_port_group(
+ struct net_bridge_port *port,
+ const struct br_ip *group,
+ struct net_bridge_port_group __rcu *next,
+ unsigned char flags,
+ const unsigned char *src,
+ u8 filter_mode,
+ u8 rt_protocol,
+ struct netlink_ext_ack *extack)
+{
+ struct net_bridge_port_group *p;
+ int err;
+
+ err = br_multicast_port_ngroups_inc(port, group, extack);
+ if (err)
+ return NULL;
+
+ p = kzalloc(sizeof(*p), GFP_ATOMIC);
+ if (unlikely(!p)) {
+ NL_SET_ERR_MSG_MOD(extack, "Couldn't allocate new port group");
+ goto dec_out;
+ }
+
+ p->key.addr = *group;
+ p->key.port = port;
+ p->flags = flags;
+ p->filter_mode = filter_mode;
+ p->rt_protocol = rt_protocol;
+ p->eht_host_tree = RB_ROOT;
+ p->eht_set_tree = RB_ROOT;
+ p->mcast_gc.destroy = br_multicast_destroy_port_group;
+ INIT_HLIST_HEAD(&p->src_list);
+
+ if (!br_multicast_is_star_g(group) &&
+ rhashtable_lookup_insert_fast(&port->br->sg_port_tbl, &p->rhnode,
+ br_sg_port_rht_params)) {
+ NL_SET_ERR_MSG_MOD(extack, "Couldn't insert new port group");
+ goto free_out;
+ }
+
+ rcu_assign_pointer(p->next, next);
+ timer_setup(&p->timer, br_multicast_port_group_expired, 0);
+ timer_setup(&p->rexmit_timer, br_multicast_port_group_rexmit, 0);
+ hlist_add_head(&p->mglist, &port->mglist);
+
+ if (src)
+ memcpy(p->eth_addr, src, ETH_ALEN);
+ else
+ eth_broadcast_addr(p->eth_addr);
+
+ return p;
+
+free_out:
+ kfree(p);
+dec_out:
+ br_multicast_port_ngroups_dec(port, group->vid);
+ return NULL;
+}
+
+void br_multicast_del_port_group(struct net_bridge_port_group *p)
+{
+ struct net_bridge_port *port = p->key.port;
+ __u16 vid = p->key.addr.vid;
+
+ hlist_del_init(&p->mglist);
+ if (!br_multicast_is_star_g(&p->key.addr))
+ rhashtable_remove_fast(&port->br->sg_port_tbl, &p->rhnode,
+ br_sg_port_rht_params);
+ kfree(p);
+ br_multicast_port_ngroups_dec(port, vid);
+}
+
+void br_multicast_host_join(const struct net_bridge_mcast *brmctx,
+ struct net_bridge_mdb_entry *mp, bool notify)
+{
+ if (!mp->host_joined) {
+ mp->host_joined = true;
+ if (br_multicast_is_star_g(&mp->addr))
+ br_multicast_star_g_host_state(mp);
+ if (notify)
+ br_mdb_notify(mp->br->dev, mp, NULL, RTM_NEWMDB);
+ }
+
+ if (br_group_is_l2(&mp->addr))
+ return;
+
+ mod_timer(&mp->timer, jiffies + brmctx->multicast_membership_interval);
+}
+
+void br_multicast_host_leave(struct net_bridge_mdb_entry *mp, bool notify)
+{
+ if (!mp->host_joined)
+ return;
+
+ mp->host_joined = false;
+ if (br_multicast_is_star_g(&mp->addr))
+ br_multicast_star_g_host_state(mp);
+ if (notify)
+ br_mdb_notify(mp->br->dev, mp, NULL, RTM_DELMDB);
+}
+
+static struct net_bridge_port_group *
+__br_multicast_add_group(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
+ struct br_ip *group,
+ const unsigned char *src,
+ u8 filter_mode,
+ bool igmpv2_mldv1,
+ bool blocked)
+{
+ struct net_bridge_port_group __rcu **pp;
+ struct net_bridge_port_group *p = NULL;
+ struct net_bridge_mdb_entry *mp;
+ unsigned long now = jiffies;
+
+ if (!br_multicast_ctx_should_use(brmctx, pmctx))
+ goto out;
+
+ mp = br_multicast_new_group(brmctx->br, group);
+ if (IS_ERR(mp))
+ return ERR_CAST(mp);
+
+ if (!pmctx) {
+ br_multicast_host_join(brmctx, mp, true);
+ goto out;
+ }
+
+ for (pp = &mp->ports;
+ (p = mlock_dereference(*pp, brmctx->br)) != NULL;
+ pp = &p->next) {
+ if (br_port_group_equal(p, pmctx->port, src))
+ goto found;
+ if ((unsigned long)p->key.port < (unsigned long)pmctx->port)
+ break;
+ }
+
+ p = br_multicast_new_port_group(pmctx->port, group, *pp, 0, src,
+ filter_mode, RTPROT_KERNEL, NULL);
+ if (unlikely(!p)) {
+ p = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+ rcu_assign_pointer(*pp, p);
+ if (blocked)
+ p->flags |= MDB_PG_FLAGS_BLOCKED;
+ br_mdb_notify(brmctx->br->dev, mp, p, RTM_NEWMDB);
+
+found:
+ if (igmpv2_mldv1)
+ mod_timer(&p->timer,
+ now + brmctx->multicast_membership_interval);
+
+out:
+ return p;
+}
+
+static int br_multicast_add_group(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
+ struct br_ip *group,
+ const unsigned char *src,
+ u8 filter_mode,
+ bool igmpv2_mldv1)
+{
+ struct net_bridge_port_group *pg;
+ int err;
+
+ spin_lock(&brmctx->br->multicast_lock);
+ pg = __br_multicast_add_group(brmctx, pmctx, group, src, filter_mode,
+ igmpv2_mldv1, false);
+ /* NULL is considered valid for host joined groups */
+ err = PTR_ERR_OR_ZERO(pg);
+ spin_unlock(&brmctx->br->multicast_lock);
+
+ return err;
+}
+
+static int br_ip4_multicast_add_group(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
+ __be32 group,
+ __u16 vid,
+ const unsigned char *src,
+ bool igmpv2)
+{
+ struct br_ip br_group;
+ u8 filter_mode;
+
+ if (ipv4_is_local_multicast(group))
+ return 0;
+
+ memset(&br_group, 0, sizeof(br_group));
+ br_group.dst.ip4 = group;
+ br_group.proto = htons(ETH_P_IP);
+ br_group.vid = vid;
+ filter_mode = igmpv2 ? MCAST_EXCLUDE : MCAST_INCLUDE;
+
+ return br_multicast_add_group(brmctx, pmctx, &br_group, src,
+ filter_mode, igmpv2);
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static int br_ip6_multicast_add_group(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
+ const struct in6_addr *group,
+ __u16 vid,
+ const unsigned char *src,
+ bool mldv1)
+{
+ struct br_ip br_group;
+ u8 filter_mode;
+
+ if (ipv6_addr_is_ll_all_nodes(group))
+ return 0;
+
+ memset(&br_group, 0, sizeof(br_group));
+ br_group.dst.ip6 = *group;
+ br_group.proto = htons(ETH_P_IPV6);
+ br_group.vid = vid;
+ filter_mode = mldv1 ? MCAST_EXCLUDE : MCAST_INCLUDE;
+
+ return br_multicast_add_group(brmctx, pmctx, &br_group, src,
+ filter_mode, mldv1);
+}
+#endif
+
+static bool br_multicast_rport_del(struct hlist_node *rlist)
+{
+ if (hlist_unhashed(rlist))
+ return false;
+
+ hlist_del_init_rcu(rlist);
+ return true;
+}
+
+static bool br_ip4_multicast_rport_del(struct net_bridge_mcast_port *pmctx)
+{
+ return br_multicast_rport_del(&pmctx->ip4_rlist);
+}
+
+static bool br_ip6_multicast_rport_del(struct net_bridge_mcast_port *pmctx)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ return br_multicast_rport_del(&pmctx->ip6_rlist);
+#else
+ return false;
+#endif
+}
+
+static void br_multicast_router_expired(struct net_bridge_mcast_port *pmctx,
+ struct timer_list *t,
+ struct hlist_node *rlist)
+{
+ struct net_bridge *br = pmctx->port->br;
+ bool del;
+
+ spin_lock(&br->multicast_lock);
+ if (pmctx->multicast_router == MDB_RTR_TYPE_DISABLED ||
+ pmctx->multicast_router == MDB_RTR_TYPE_PERM ||
+ timer_pending(t))
+ goto out;
+
+ del = br_multicast_rport_del(rlist);
+ br_multicast_rport_del_notify(pmctx, del);
+out:
+ spin_unlock(&br->multicast_lock);
+}
+
+static void br_ip4_multicast_router_expired(struct timer_list *t)
+{
+ struct net_bridge_mcast_port *pmctx = timer_container_of(pmctx, t,
+ ip4_mc_router_timer);
+
+ br_multicast_router_expired(pmctx, t, &pmctx->ip4_rlist);
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static void br_ip6_multicast_router_expired(struct timer_list *t)
+{
+ struct net_bridge_mcast_port *pmctx = timer_container_of(pmctx, t,
+ ip6_mc_router_timer);
+
+ br_multicast_router_expired(pmctx, t, &pmctx->ip6_rlist);
+}
+#endif
+
+static void br_mc_router_state_change(struct net_bridge *p,
+ bool is_mc_router)
+{
+ struct switchdev_attr attr = {
+ .orig_dev = p->dev,
+ .id = SWITCHDEV_ATTR_ID_BRIDGE_MROUTER,
+ .flags = SWITCHDEV_F_DEFER,
+ .u.mrouter = is_mc_router,
+ };
+
+ switchdev_port_attr_set(p->dev, &attr, NULL);
+}
+
+static void br_multicast_local_router_expired(struct net_bridge_mcast *brmctx,
+ struct timer_list *timer)
+{
+ spin_lock(&brmctx->br->multicast_lock);
+ if (brmctx->multicast_router == MDB_RTR_TYPE_DISABLED ||
+ brmctx->multicast_router == MDB_RTR_TYPE_PERM ||
+ br_ip4_multicast_is_router(brmctx) ||
+ br_ip6_multicast_is_router(brmctx))
+ goto out;
+
+ br_mc_router_state_change(brmctx->br, false);
+out:
+ spin_unlock(&brmctx->br->multicast_lock);
+}
+
+static void br_ip4_multicast_local_router_expired(struct timer_list *t)
+{
+ struct net_bridge_mcast *brmctx = timer_container_of(brmctx, t,
+ ip4_mc_router_timer);
+
+ br_multicast_local_router_expired(brmctx, t);
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static void br_ip6_multicast_local_router_expired(struct timer_list *t)
+{
+ struct net_bridge_mcast *brmctx = timer_container_of(brmctx, t,
+ ip6_mc_router_timer);
+
+ br_multicast_local_router_expired(brmctx, t);
+}
+#endif
+
+static void br_multicast_querier_expired(struct net_bridge_mcast *brmctx,
+ struct bridge_mcast_own_query *query)
+{
+ spin_lock(&brmctx->br->multicast_lock);
+ if (!netif_running(brmctx->br->dev) ||
+ br_multicast_ctx_vlan_global_disabled(brmctx) ||
+ !br_opt_get(brmctx->br, BROPT_MULTICAST_ENABLED))
+ goto out;
+
+ br_multicast_start_querier(brmctx, query);
+
+out:
+ spin_unlock(&brmctx->br->multicast_lock);
+}
+
+static void br_ip4_multicast_querier_expired(struct timer_list *t)
+{
+ struct net_bridge_mcast *brmctx = timer_container_of(brmctx, t,
+ ip4_other_query.timer);
+
+ br_multicast_querier_expired(brmctx, &brmctx->ip4_own_query);
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static void br_ip6_multicast_querier_expired(struct timer_list *t)
+{
+ struct net_bridge_mcast *brmctx = timer_container_of(brmctx, t,
+ ip6_other_query.timer);
+
+ br_multicast_querier_expired(brmctx, &brmctx->ip6_own_query);
+}
+#endif
+
+static void br_multicast_query_delay_expired(struct timer_list *t)
+{
+}
+
+static void br_multicast_select_own_querier(struct net_bridge_mcast *brmctx,
+ struct br_ip *ip,
+ struct sk_buff *skb)
+{
+ if (ip->proto == htons(ETH_P_IP))
+ brmctx->ip4_querier.addr.src.ip4 = ip_hdr(skb)->saddr;
+#if IS_ENABLED(CONFIG_IPV6)
+ else
+ brmctx->ip6_querier.addr.src.ip6 = ipv6_hdr(skb)->saddr;
+#endif
+}
+
+static void __br_multicast_send_query(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
+ struct net_bridge_port_group *pg,
+ struct br_ip *ip_dst,
+ struct br_ip *group,
+ bool with_srcs,
+ u8 sflag,
+ bool *need_rexmit)
+{
+ bool over_lmqt = !!sflag;
+ struct sk_buff *skb;
+ u8 igmp_type;
+
+ if (!br_multicast_ctx_should_use(brmctx, pmctx) ||
+ !br_multicast_ctx_matches_vlan_snooping(brmctx))
+ return;
+
+again_under_lmqt:
+ skb = br_multicast_alloc_query(brmctx, pmctx, pg, ip_dst, group,
+ with_srcs, over_lmqt, sflag, &igmp_type,
+ need_rexmit);
+ if (!skb)
+ return;
+
+ if (pmctx) {
+ skb->dev = pmctx->port->dev;
+ br_multicast_count(brmctx->br, pmctx->port, skb, igmp_type,
+ BR_MCAST_DIR_TX);
+ NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT,
+ dev_net(pmctx->port->dev), NULL, skb, NULL, skb->dev,
+ br_dev_queue_push_xmit);
+
+ if (over_lmqt && with_srcs && sflag) {
+ over_lmqt = false;
+ goto again_under_lmqt;
+ }
+ } else {
+ br_multicast_select_own_querier(brmctx, group, skb);
+ br_multicast_count(brmctx->br, NULL, skb, igmp_type,
+ BR_MCAST_DIR_RX);
+ netif_rx(skb);
+ }
+}
+
+static void br_multicast_read_querier(const struct bridge_mcast_querier *querier,
+ struct bridge_mcast_querier *dest)
+{
+ unsigned int seq;
+
+ memset(dest, 0, sizeof(*dest));
+ do {
+ seq = read_seqcount_begin(&querier->seq);
+ dest->port_ifidx = querier->port_ifidx;
+ memcpy(&dest->addr, &querier->addr, sizeof(struct br_ip));
+ } while (read_seqcount_retry(&querier->seq, seq));
+}
+
+static void br_multicast_update_querier(struct net_bridge_mcast *brmctx,
+ struct bridge_mcast_querier *querier,
+ int ifindex,
+ struct br_ip *saddr)
+{
+ write_seqcount_begin(&querier->seq);
+ querier->port_ifidx = ifindex;
+ memcpy(&querier->addr, saddr, sizeof(*saddr));
+ write_seqcount_end(&querier->seq);
+}
+
+static void br_multicast_send_query(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
+ struct bridge_mcast_own_query *own_query)
+{
+ struct bridge_mcast_other_query *other_query = NULL;
+ struct bridge_mcast_querier *querier;
+ struct br_ip br_group;
+ unsigned long time;
+
+ if (!br_multicast_ctx_should_use(brmctx, pmctx) ||
+ !br_opt_get(brmctx->br, BROPT_MULTICAST_ENABLED) ||
+ !brmctx->multicast_querier)
+ return;
+
+ memset(&br_group.dst, 0, sizeof(br_group.dst));
+
+ if (pmctx ? (own_query == &pmctx->ip4_own_query) :
+ (own_query == &brmctx->ip4_own_query)) {
+ querier = &brmctx->ip4_querier;
+ other_query = &brmctx->ip4_other_query;
+ br_group.proto = htons(ETH_P_IP);
+#if IS_ENABLED(CONFIG_IPV6)
+ } else {
+ querier = &brmctx->ip6_querier;
+ other_query = &brmctx->ip6_other_query;
+ br_group.proto = htons(ETH_P_IPV6);
+#endif
+ }
+
+ if (!other_query || timer_pending(&other_query->timer))
+ return;
+
+ /* we're about to select ourselves as querier */
+ if (!pmctx && querier->port_ifidx) {
+ struct br_ip zeroip = {};
+
+ br_multicast_update_querier(brmctx, querier, 0, &zeroip);
+ }
+
+ __br_multicast_send_query(brmctx, pmctx, NULL, NULL, &br_group, false,
+ 0, NULL);
+
+ time = jiffies;
+ time += own_query->startup_sent < brmctx->multicast_startup_query_count ?
+ brmctx->multicast_startup_query_interval :
+ brmctx->multicast_query_interval;
+ mod_timer(&own_query->timer, time);
+}
+
+static void
+br_multicast_port_query_expired(struct net_bridge_mcast_port *pmctx,
+ struct bridge_mcast_own_query *query)
+{
+ struct net_bridge *br = pmctx->port->br;
+ struct net_bridge_mcast *brmctx;
+
+ spin_lock(&br->multicast_lock);
+ if (br_multicast_port_ctx_state_stopped(pmctx))
+ goto out;
+
+ brmctx = br_multicast_port_ctx_get_global(pmctx);
+ if (query->startup_sent < brmctx->multicast_startup_query_count)
+ query->startup_sent++;
+
+ br_multicast_send_query(brmctx, pmctx, query);
+
+out:
+ spin_unlock(&br->multicast_lock);
+}
+
+static void br_ip4_multicast_port_query_expired(struct timer_list *t)
+{
+ struct net_bridge_mcast_port *pmctx = timer_container_of(pmctx, t,
+ ip4_own_query.timer);
+
+ br_multicast_port_query_expired(pmctx, &pmctx->ip4_own_query);
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static void br_ip6_multicast_port_query_expired(struct timer_list *t)
+{
+ struct net_bridge_mcast_port *pmctx = timer_container_of(pmctx, t,
+ ip6_own_query.timer);
+
+ br_multicast_port_query_expired(pmctx, &pmctx->ip6_own_query);
+}
+#endif
+
+static void br_multicast_port_group_rexmit(struct timer_list *t)
+{
+ struct net_bridge_port_group *pg = timer_container_of(pg, t,
+ rexmit_timer);
+ struct bridge_mcast_other_query *other_query = NULL;
+ struct net_bridge *br = pg->key.port->br;
+ struct net_bridge_mcast_port *pmctx;
+ struct net_bridge_mcast *brmctx;
+ bool need_rexmit = false;
+
+ spin_lock(&br->multicast_lock);
+ if (!netif_running(br->dev) || hlist_unhashed(&pg->mglist) ||
+ !br_opt_get(br, BROPT_MULTICAST_ENABLED))
+ goto out;
+
+ pmctx = br_multicast_pg_to_port_ctx(pg);
+ if (!pmctx)
+ goto out;
+ brmctx = br_multicast_port_ctx_get_global(pmctx);
+ if (!brmctx->multicast_querier)
+ goto out;
+
+ if (pg->key.addr.proto == htons(ETH_P_IP))
+ other_query = &brmctx->ip4_other_query;
+#if IS_ENABLED(CONFIG_IPV6)
+ else
+ other_query = &brmctx->ip6_other_query;
+#endif
+
+ if (!other_query || timer_pending(&other_query->timer))
+ goto out;
+
+ if (pg->grp_query_rexmit_cnt) {
+ pg->grp_query_rexmit_cnt--;
+ __br_multicast_send_query(brmctx, pmctx, pg, &pg->key.addr,
+ &pg->key.addr, false, 1, NULL);
+ }
+ __br_multicast_send_query(brmctx, pmctx, pg, &pg->key.addr,
+ &pg->key.addr, true, 0, &need_rexmit);
+
+ if (pg->grp_query_rexmit_cnt || need_rexmit)
+ mod_timer(&pg->rexmit_timer, jiffies +
+ brmctx->multicast_last_member_interval);
+out:
+ spin_unlock(&br->multicast_lock);
+}
+
+static int br_mc_disabled_update(struct net_device *dev, bool value,
+ struct netlink_ext_ack *extack)
+{
+ struct switchdev_attr attr = {
+ .orig_dev = dev,
+ .id = SWITCHDEV_ATTR_ID_BRIDGE_MC_DISABLED,
+ .flags = SWITCHDEV_F_DEFER,
+ .u.mc_disabled = !value,
+ };
+
+ return switchdev_port_attr_set(dev, &attr, extack);
+}
+
+void br_multicast_port_ctx_init(struct net_bridge_port *port,
+ struct net_bridge_vlan *vlan,
+ struct net_bridge_mcast_port *pmctx)
+{
+ pmctx->port = port;
+ pmctx->vlan = vlan;
+ pmctx->multicast_router = MDB_RTR_TYPE_TEMP_QUERY;
+ timer_setup(&pmctx->ip4_mc_router_timer,
+ br_ip4_multicast_router_expired, 0);
+ timer_setup(&pmctx->ip4_own_query.timer,
+ br_ip4_multicast_port_query_expired, 0);
+#if IS_ENABLED(CONFIG_IPV6)
+ timer_setup(&pmctx->ip6_mc_router_timer,
+ br_ip6_multicast_router_expired, 0);
+ timer_setup(&pmctx->ip6_own_query.timer,
+ br_ip6_multicast_port_query_expired, 0);
+#endif
+}
+
+void br_multicast_port_ctx_deinit(struct net_bridge_mcast_port *pmctx)
+{
+ struct net_bridge *br = pmctx->port->br;
+ bool del = false;
+
+#if IS_ENABLED(CONFIG_IPV6)
+ timer_delete_sync(&pmctx->ip6_mc_router_timer);
+#endif
+ timer_delete_sync(&pmctx->ip4_mc_router_timer);
+
+ spin_lock_bh(&br->multicast_lock);
+ del |= br_ip6_multicast_rport_del(pmctx);
+ del |= br_ip4_multicast_rport_del(pmctx);
+ br_multicast_rport_del_notify(pmctx, del);
+ spin_unlock_bh(&br->multicast_lock);
+}
+
+int br_multicast_add_port(struct net_bridge_port *port)
+{
+ int err;
+
+ port->multicast_eht_hosts_limit = BR_MCAST_DEFAULT_EHT_HOSTS_LIMIT;
+ br_multicast_port_ctx_init(port, NULL, &port->multicast_ctx);
+
+ err = br_mc_disabled_update(port->dev,
+ br_opt_get(port->br,
+ BROPT_MULTICAST_ENABLED),
+ NULL);
+ if (err && err != -EOPNOTSUPP)
+ return err;
+
+ port->mcast_stats = netdev_alloc_pcpu_stats(struct bridge_mcast_stats);
+ if (!port->mcast_stats)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void br_multicast_del_port(struct net_bridge_port *port)
+{
+ struct net_bridge *br = port->br;
+ struct net_bridge_port_group *pg;
+ struct hlist_node *n;
+
+ /* Take care of the remaining groups, only perm ones should be left */
+ spin_lock_bh(&br->multicast_lock);
+ hlist_for_each_entry_safe(pg, n, &port->mglist, mglist)
+ br_multicast_find_del_pg(br, pg);
+ spin_unlock_bh(&br->multicast_lock);
+ flush_work(&br->mcast_gc_work);
+ br_multicast_port_ctx_deinit(&port->multicast_ctx);
+ free_percpu(port->mcast_stats);
+}
+
+static void br_multicast_enable(struct bridge_mcast_own_query *query)
+{
+ query->startup_sent = 0;
+
+ if (timer_delete_sync_try(&query->timer) >= 0 ||
+ timer_delete(&query->timer))
+ mod_timer(&query->timer, jiffies);
+}
+
+static void __br_multicast_enable_port_ctx(struct net_bridge_mcast_port *pmctx)
+{
+ struct net_bridge *br = pmctx->port->br;
+ struct net_bridge_mcast *brmctx;
+
+ brmctx = br_multicast_port_ctx_get_global(pmctx);
+ if (!br_opt_get(br, BROPT_MULTICAST_ENABLED) ||
+ !netif_running(br->dev))
+ return;
+
+ br_multicast_enable(&pmctx->ip4_own_query);
+#if IS_ENABLED(CONFIG_IPV6)
+ br_multicast_enable(&pmctx->ip6_own_query);
+#endif
+ if (pmctx->multicast_router == MDB_RTR_TYPE_PERM) {
+ br_ip4_multicast_add_router(brmctx, pmctx);
+ br_ip6_multicast_add_router(brmctx, pmctx);
+ }
+
+ if (br_multicast_port_ctx_is_vlan(pmctx)) {
+ struct net_bridge_port_group *pg;
+ u32 n = 0;
+
+ /* The mcast_n_groups counter might be wrong. First,
+ * BR_VLFLAG_MCAST_ENABLED is toggled before temporary entries
+ * are flushed, thus mcast_n_groups after the toggle does not
+ * reflect the true values. And second, permanent entries added
+ * while BR_VLFLAG_MCAST_ENABLED was disabled, are not reflected
+ * either. Thus we have to refresh the counter.
+ */
+
+ hlist_for_each_entry(pg, &pmctx->port->mglist, mglist) {
+ if (pg->key.addr.vid == pmctx->vlan->vid)
+ n++;
+ }
+ WRITE_ONCE(pmctx->mdb_n_entries, n);
+ }
+}
+
+static void br_multicast_enable_port_ctx(struct net_bridge_mcast_port *pmctx)
+{
+ struct net_bridge *br = pmctx->port->br;
+
+ spin_lock_bh(&br->multicast_lock);
+ if (br_multicast_port_ctx_is_vlan(pmctx) &&
+ !(pmctx->vlan->priv_flags & BR_VLFLAG_MCAST_ENABLED)) {
+ spin_unlock_bh(&br->multicast_lock);
+ return;
+ }
+ __br_multicast_enable_port_ctx(pmctx);
+ spin_unlock_bh(&br->multicast_lock);
+}
+
+static void __br_multicast_disable_port_ctx(struct net_bridge_mcast_port *pmctx)
+{
+ struct net_bridge_port_group *pg;
+ struct hlist_node *n;
+ bool del = false;
+
+ hlist_for_each_entry_safe(pg, n, &pmctx->port->mglist, mglist)
+ if (!(pg->flags & MDB_PG_FLAGS_PERMANENT) &&
+ (!br_multicast_port_ctx_is_vlan(pmctx) ||
+ pg->key.addr.vid == pmctx->vlan->vid))
+ br_multicast_find_del_pg(pmctx->port->br, pg);
+
+ del |= br_ip4_multicast_rport_del(pmctx);
+ timer_delete(&pmctx->ip4_mc_router_timer);
+ timer_delete(&pmctx->ip4_own_query.timer);
+ del |= br_ip6_multicast_rport_del(pmctx);
+#if IS_ENABLED(CONFIG_IPV6)
+ timer_delete(&pmctx->ip6_mc_router_timer);
+ timer_delete(&pmctx->ip6_own_query.timer);
+#endif
+ br_multicast_rport_del_notify(pmctx, del);
+}
+
+static void br_multicast_disable_port_ctx(struct net_bridge_mcast_port *pmctx)
+{
+ struct net_bridge *br = pmctx->port->br;
+
+ spin_lock_bh(&br->multicast_lock);
+ if (br_multicast_port_ctx_is_vlan(pmctx) &&
+ !(pmctx->vlan->priv_flags & BR_VLFLAG_MCAST_ENABLED)) {
+ spin_unlock_bh(&br->multicast_lock);
+ return;
+ }
+
+ __br_multicast_disable_port_ctx(pmctx);
+ spin_unlock_bh(&br->multicast_lock);
+}
+
+static void br_multicast_toggle_port(struct net_bridge_port *port, bool on)
+{
+#if IS_ENABLED(CONFIG_BRIDGE_VLAN_FILTERING)
+ if (br_opt_get(port->br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) {
+ struct net_bridge_vlan_group *vg;
+ struct net_bridge_vlan *vlan;
+
+ rcu_read_lock();
+ vg = nbp_vlan_group_rcu(port);
+ if (!vg) {
+ rcu_read_unlock();
+ return;
+ }
+
+ /* iterate each vlan, toggle vlan multicast context */
+ list_for_each_entry_rcu(vlan, &vg->vlan_list, vlist) {
+ struct net_bridge_mcast_port *pmctx =
+ &vlan->port_mcast_ctx;
+ u8 state = br_vlan_get_state(vlan);
+ /* enable vlan multicast context when state is
+ * LEARNING or FORWARDING
+ */
+ if (on && br_vlan_state_allowed(state, true))
+ br_multicast_enable_port_ctx(pmctx);
+ else
+ br_multicast_disable_port_ctx(pmctx);
+ }
+ rcu_read_unlock();
+ return;
+ }
+#endif
+ /* toggle port multicast context when vlan snooping is disabled */
+ if (on)
+ br_multicast_enable_port_ctx(&port->multicast_ctx);
+ else
+ br_multicast_disable_port_ctx(&port->multicast_ctx);
+}
+
+void br_multicast_enable_port(struct net_bridge_port *port)
+{
+ br_multicast_toggle_port(port, true);
+}
+
+void br_multicast_disable_port(struct net_bridge_port *port)
+{
+ br_multicast_toggle_port(port, false);
+}
+
+static int __grp_src_delete_marked(struct net_bridge_port_group *pg)
+{
+ struct net_bridge_group_src *ent;
+ struct hlist_node *tmp;
+ int deleted = 0;
+
+ hlist_for_each_entry_safe(ent, tmp, &pg->src_list, node)
+ if (ent->flags & BR_SGRP_F_DELETE) {
+ br_multicast_del_group_src(ent, false);
+ deleted++;
+ }
+
+ return deleted;
+}
+
+static void __grp_src_mod_timer(struct net_bridge_group_src *src,
+ unsigned long expires)
+{
+ mod_timer(&src->timer, expires);
+ br_multicast_fwd_src_handle(src);
+}
+
+static void __grp_src_query_marked_and_rexmit(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
+ struct net_bridge_port_group *pg)
+{
+ struct bridge_mcast_other_query *other_query = NULL;
+ u32 lmqc = brmctx->multicast_last_member_count;
+ unsigned long lmqt, lmi, now = jiffies;
+ struct net_bridge_group_src *ent;
+
+ if (!netif_running(brmctx->br->dev) ||
+ !br_opt_get(brmctx->br, BROPT_MULTICAST_ENABLED))
+ return;
+
+ if (pg->key.addr.proto == htons(ETH_P_IP))
+ other_query = &brmctx->ip4_other_query;
+#if IS_ENABLED(CONFIG_IPV6)
+ else
+ other_query = &brmctx->ip6_other_query;
+#endif
+
+ lmqt = now + br_multicast_lmqt(brmctx);
+ hlist_for_each_entry(ent, &pg->src_list, node) {
+ if (ent->flags & BR_SGRP_F_SEND) {
+ ent->flags &= ~BR_SGRP_F_SEND;
+ if (ent->timer.expires > lmqt) {
+ if (brmctx->multicast_querier &&
+ other_query &&
+ !timer_pending(&other_query->timer))
+ ent->src_query_rexmit_cnt = lmqc;
+ __grp_src_mod_timer(ent, lmqt);
+ }
+ }
+ }
+
+ if (!brmctx->multicast_querier ||
+ !other_query || timer_pending(&other_query->timer))
+ return;
+
+ __br_multicast_send_query(brmctx, pmctx, pg, &pg->key.addr,
+ &pg->key.addr, true, 1, NULL);
+
+ lmi = now + brmctx->multicast_last_member_interval;
+ if (!timer_pending(&pg->rexmit_timer) ||
+ time_after(pg->rexmit_timer.expires, lmi))
+ mod_timer(&pg->rexmit_timer, lmi);
+}
+
+static void __grp_send_query_and_rexmit(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
+ struct net_bridge_port_group *pg)
+{
+ struct bridge_mcast_other_query *other_query = NULL;
+ unsigned long now = jiffies, lmi;
+
+ if (!netif_running(brmctx->br->dev) ||
+ !br_opt_get(brmctx->br, BROPT_MULTICAST_ENABLED))
+ return;
+
+ if (pg->key.addr.proto == htons(ETH_P_IP))
+ other_query = &brmctx->ip4_other_query;
+#if IS_ENABLED(CONFIG_IPV6)
+ else
+ other_query = &brmctx->ip6_other_query;
+#endif
+
+ if (brmctx->multicast_querier &&
+ other_query && !timer_pending(&other_query->timer)) {
+ lmi = now + brmctx->multicast_last_member_interval;
+ pg->grp_query_rexmit_cnt = brmctx->multicast_last_member_count - 1;
+ __br_multicast_send_query(brmctx, pmctx, pg, &pg->key.addr,
+ &pg->key.addr, false, 0, NULL);
+ if (!timer_pending(&pg->rexmit_timer) ||
+ time_after(pg->rexmit_timer.expires, lmi))
+ mod_timer(&pg->rexmit_timer, lmi);
+ }
+
+ if (pg->filter_mode == MCAST_EXCLUDE &&
+ (!timer_pending(&pg->timer) ||
+ time_after(pg->timer.expires, now + br_multicast_lmqt(brmctx))))
+ mod_timer(&pg->timer, now + br_multicast_lmqt(brmctx));
+}
+
+/* State Msg type New state Actions
+ * INCLUDE (A) IS_IN (B) INCLUDE (A+B) (B)=GMI
+ * INCLUDE (A) ALLOW (B) INCLUDE (A+B) (B)=GMI
+ * EXCLUDE (X,Y) ALLOW (A) EXCLUDE (X+A,Y-A) (A)=GMI
+ */
+static bool br_multicast_isinc_allow(const struct net_bridge_mcast *brmctx,
+ struct net_bridge_port_group *pg, void *h_addr,
+ void *srcs, u32 nsrcs, size_t addr_size,
+ int grec_type)
+{
+ struct net_bridge_group_src *ent;
+ unsigned long now = jiffies;
+ bool changed = false;
+ struct br_ip src_ip;
+ u32 src_idx;
+
+ memset(&src_ip, 0, sizeof(src_ip));
+ src_ip.proto = pg->key.addr.proto;
+ for (src_idx = 0; src_idx < nsrcs; src_idx++) {
+ memcpy(&src_ip.src, srcs + (src_idx * addr_size), addr_size);
+ ent = br_multicast_find_group_src(pg, &src_ip);
+ if (!ent) {
+ ent = br_multicast_new_group_src(pg, &src_ip);
+ if (ent)
+ changed = true;
+ }
+
+ if (ent)
+ __grp_src_mod_timer(ent, now + br_multicast_gmi(brmctx));
+ }
+
+ if (br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size,
+ grec_type))
+ changed = true;
+
+ return changed;
+}
+
+/* State Msg type New state Actions
+ * INCLUDE (A) IS_EX (B) EXCLUDE (A*B,B-A) (B-A)=0
+ * Delete (A-B)
+ * Group Timer=GMI
+ */
+static void __grp_src_isexc_incl(const struct net_bridge_mcast *brmctx,
+ struct net_bridge_port_group *pg, void *h_addr,
+ void *srcs, u32 nsrcs, size_t addr_size,
+ int grec_type)
+{
+ struct net_bridge_group_src *ent;
+ struct br_ip src_ip;
+ u32 src_idx;
+
+ hlist_for_each_entry(ent, &pg->src_list, node)
+ ent->flags |= BR_SGRP_F_DELETE;
+
+ memset(&src_ip, 0, sizeof(src_ip));
+ src_ip.proto = pg->key.addr.proto;
+ for (src_idx = 0; src_idx < nsrcs; src_idx++) {
+ memcpy(&src_ip.src, srcs + (src_idx * addr_size), addr_size);
+ ent = br_multicast_find_group_src(pg, &src_ip);
+ if (ent)
+ ent->flags &= ~BR_SGRP_F_DELETE;
+ else
+ ent = br_multicast_new_group_src(pg, &src_ip);
+ if (ent)
+ br_multicast_fwd_src_handle(ent);
+ }
+
+ br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size,
+ grec_type);
+
+ __grp_src_delete_marked(pg);
+}
+
+/* State Msg type New state Actions
+ * EXCLUDE (X,Y) IS_EX (A) EXCLUDE (A-Y,Y*A) (A-X-Y)=GMI
+ * Delete (X-A)
+ * Delete (Y-A)
+ * Group Timer=GMI
+ */
+static bool __grp_src_isexc_excl(const struct net_bridge_mcast *brmctx,
+ struct net_bridge_port_group *pg, void *h_addr,
+ void *srcs, u32 nsrcs, size_t addr_size,
+ int grec_type)
+{
+ struct net_bridge_group_src *ent;
+ unsigned long now = jiffies;
+ bool changed = false;
+ struct br_ip src_ip;
+ u32 src_idx;
+
+ hlist_for_each_entry(ent, &pg->src_list, node)
+ ent->flags |= BR_SGRP_F_DELETE;
+
+ memset(&src_ip, 0, sizeof(src_ip));
+ src_ip.proto = pg->key.addr.proto;
+ for (src_idx = 0; src_idx < nsrcs; src_idx++) {
+ memcpy(&src_ip.src, srcs + (src_idx * addr_size), addr_size);
+ ent = br_multicast_find_group_src(pg, &src_ip);
+ if (ent) {
+ ent->flags &= ~BR_SGRP_F_DELETE;
+ } else {
+ ent = br_multicast_new_group_src(pg, &src_ip);
+ if (ent) {
+ __grp_src_mod_timer(ent,
+ now + br_multicast_gmi(brmctx));
+ changed = true;
+ }
+ }
+ }
+
+ if (br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size,
+ grec_type))
+ changed = true;
+
+ if (__grp_src_delete_marked(pg))
+ changed = true;
+
+ return changed;
+}
+
+static bool br_multicast_isexc(const struct net_bridge_mcast *brmctx,
+ struct net_bridge_port_group *pg, void *h_addr,
+ void *srcs, u32 nsrcs, size_t addr_size,
+ int grec_type)
+{
+ bool changed = false;
+
+ switch (pg->filter_mode) {
+ case MCAST_INCLUDE:
+ __grp_src_isexc_incl(brmctx, pg, h_addr, srcs, nsrcs, addr_size,
+ grec_type);
+ br_multicast_star_g_handle_mode(pg, MCAST_EXCLUDE);
+ changed = true;
+ break;
+ case MCAST_EXCLUDE:
+ changed = __grp_src_isexc_excl(brmctx, pg, h_addr, srcs, nsrcs,
+ addr_size, grec_type);
+ break;
+ }
+
+ pg->filter_mode = MCAST_EXCLUDE;
+ mod_timer(&pg->timer, jiffies + br_multicast_gmi(brmctx));
+
+ return changed;
+}
+
+/* State Msg type New state Actions
+ * INCLUDE (A) TO_IN (B) INCLUDE (A+B) (B)=GMI
+ * Send Q(G,A-B)
+ */
+static bool __grp_src_toin_incl(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
+ struct net_bridge_port_group *pg, void *h_addr,
+ void *srcs, u32 nsrcs, size_t addr_size,
+ int grec_type)
+{
+ u32 src_idx, to_send = pg->src_ents;
+ struct net_bridge_group_src *ent;
+ unsigned long now = jiffies;
+ bool changed = false;
+ struct br_ip src_ip;
+
+ hlist_for_each_entry(ent, &pg->src_list, node)
+ ent->flags |= BR_SGRP_F_SEND;
+
+ memset(&src_ip, 0, sizeof(src_ip));
+ src_ip.proto = pg->key.addr.proto;
+ for (src_idx = 0; src_idx < nsrcs; src_idx++) {
+ memcpy(&src_ip.src, srcs + (src_idx * addr_size), addr_size);
+ ent = br_multicast_find_group_src(pg, &src_ip);
+ if (ent) {
+ ent->flags &= ~BR_SGRP_F_SEND;
+ to_send--;
+ } else {
+ ent = br_multicast_new_group_src(pg, &src_ip);
+ if (ent)
+ changed = true;
+ }
+ if (ent)
+ __grp_src_mod_timer(ent, now + br_multicast_gmi(brmctx));
+ }
+
+ if (br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size,
+ grec_type))
+ changed = true;
+
+ if (to_send)
+ __grp_src_query_marked_and_rexmit(brmctx, pmctx, pg);
+
+ return changed;
+}
+
+/* State Msg type New state Actions
+ * EXCLUDE (X,Y) TO_IN (A) EXCLUDE (X+A,Y-A) (A)=GMI
+ * Send Q(G,X-A)
+ * Send Q(G)
+ */
+static bool __grp_src_toin_excl(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
+ struct net_bridge_port_group *pg, void *h_addr,
+ void *srcs, u32 nsrcs, size_t addr_size,
+ int grec_type)
+{
+ u32 src_idx, to_send = pg->src_ents;
+ struct net_bridge_group_src *ent;
+ unsigned long now = jiffies;
+ bool changed = false;
+ struct br_ip src_ip;
+
+ hlist_for_each_entry(ent, &pg->src_list, node)
+ if (timer_pending(&ent->timer))
+ ent->flags |= BR_SGRP_F_SEND;
+
+ memset(&src_ip, 0, sizeof(src_ip));
+ src_ip.proto = pg->key.addr.proto;
+ for (src_idx = 0; src_idx < nsrcs; src_idx++) {
+ memcpy(&src_ip.src, srcs + (src_idx * addr_size), addr_size);
+ ent = br_multicast_find_group_src(pg, &src_ip);
+ if (ent) {
+ if (timer_pending(&ent->timer)) {
+ ent->flags &= ~BR_SGRP_F_SEND;
+ to_send--;
+ }
+ } else {
+ ent = br_multicast_new_group_src(pg, &src_ip);
+ if (ent)
+ changed = true;
+ }
+ if (ent)
+ __grp_src_mod_timer(ent, now + br_multicast_gmi(brmctx));
+ }
+
+ if (br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size,
+ grec_type))
+ changed = true;
+
+ if (to_send)
+ __grp_src_query_marked_and_rexmit(brmctx, pmctx, pg);
+
+ __grp_send_query_and_rexmit(brmctx, pmctx, pg);
+
+ return changed;
+}
+
+static bool br_multicast_toin(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
+ struct net_bridge_port_group *pg, void *h_addr,
+ void *srcs, u32 nsrcs, size_t addr_size,
+ int grec_type)
+{
+ bool changed = false;
+
+ switch (pg->filter_mode) {
+ case MCAST_INCLUDE:
+ changed = __grp_src_toin_incl(brmctx, pmctx, pg, h_addr, srcs,
+ nsrcs, addr_size, grec_type);
+ break;
+ case MCAST_EXCLUDE:
+ changed = __grp_src_toin_excl(brmctx, pmctx, pg, h_addr, srcs,
+ nsrcs, addr_size, grec_type);
+ break;
+ }
+
+ if (br_multicast_eht_should_del_pg(pg)) {
+ pg->flags |= MDB_PG_FLAGS_FAST_LEAVE;
+ br_multicast_find_del_pg(pg->key.port->br, pg);
+ /* a notification has already been sent and we shouldn't
+ * access pg after the delete so we have to return false
+ */
+ changed = false;
+ }
+
+ return changed;
+}
+
+/* State Msg type New state Actions
+ * INCLUDE (A) TO_EX (B) EXCLUDE (A*B,B-A) (B-A)=0
+ * Delete (A-B)
+ * Send Q(G,A*B)
+ * Group Timer=GMI
+ */
+static void __grp_src_toex_incl(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
+ struct net_bridge_port_group *pg, void *h_addr,
+ void *srcs, u32 nsrcs, size_t addr_size,
+ int grec_type)
+{
+ struct net_bridge_group_src *ent;
+ u32 src_idx, to_send = 0;
+ struct br_ip src_ip;
+
+ hlist_for_each_entry(ent, &pg->src_list, node)
+ ent->flags = (ent->flags & ~BR_SGRP_F_SEND) | BR_SGRP_F_DELETE;
+
+ memset(&src_ip, 0, sizeof(src_ip));
+ src_ip.proto = pg->key.addr.proto;
+ for (src_idx = 0; src_idx < nsrcs; src_idx++) {
+ memcpy(&src_ip.src, srcs + (src_idx * addr_size), addr_size);
+ ent = br_multicast_find_group_src(pg, &src_ip);
+ if (ent) {
+ ent->flags = (ent->flags & ~BR_SGRP_F_DELETE) |
+ BR_SGRP_F_SEND;
+ to_send++;
+ } else {
+ ent = br_multicast_new_group_src(pg, &src_ip);
+ }
+ if (ent)
+ br_multicast_fwd_src_handle(ent);
+ }
+
+ br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size,
+ grec_type);
+
+ __grp_src_delete_marked(pg);
+ if (to_send)
+ __grp_src_query_marked_and_rexmit(brmctx, pmctx, pg);
+}
+
+/* State Msg type New state Actions
+ * EXCLUDE (X,Y) TO_EX (A) EXCLUDE (A-Y,Y*A) (A-X-Y)=Group Timer
+ * Delete (X-A)
+ * Delete (Y-A)
+ * Send Q(G,A-Y)
+ * Group Timer=GMI
+ */
+static bool __grp_src_toex_excl(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
+ struct net_bridge_port_group *pg, void *h_addr,
+ void *srcs, u32 nsrcs, size_t addr_size,
+ int grec_type)
+{
+ struct net_bridge_group_src *ent;
+ u32 src_idx, to_send = 0;
+ bool changed = false;
+ struct br_ip src_ip;
+
+ hlist_for_each_entry(ent, &pg->src_list, node)
+ ent->flags = (ent->flags & ~BR_SGRP_F_SEND) | BR_SGRP_F_DELETE;
+
+ memset(&src_ip, 0, sizeof(src_ip));
+ src_ip.proto = pg->key.addr.proto;
+ for (src_idx = 0; src_idx < nsrcs; src_idx++) {
+ memcpy(&src_ip.src, srcs + (src_idx * addr_size), addr_size);
+ ent = br_multicast_find_group_src(pg, &src_ip);
+ if (ent) {
+ ent->flags &= ~BR_SGRP_F_DELETE;
+ } else {
+ ent = br_multicast_new_group_src(pg, &src_ip);
+ if (ent) {
+ __grp_src_mod_timer(ent, pg->timer.expires);
+ changed = true;
+ }
+ }
+ if (ent && timer_pending(&ent->timer)) {
+ ent->flags |= BR_SGRP_F_SEND;
+ to_send++;
+ }
+ }
+
+ if (br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size,
+ grec_type))
+ changed = true;
+
+ if (__grp_src_delete_marked(pg))
+ changed = true;
+ if (to_send)
+ __grp_src_query_marked_and_rexmit(brmctx, pmctx, pg);
+
+ return changed;
+}
+
+static bool br_multicast_toex(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
+ struct net_bridge_port_group *pg, void *h_addr,
+ void *srcs, u32 nsrcs, size_t addr_size,
+ int grec_type)
+{
+ bool changed = false;
+
+ switch (pg->filter_mode) {
+ case MCAST_INCLUDE:
+ __grp_src_toex_incl(brmctx, pmctx, pg, h_addr, srcs, nsrcs,
+ addr_size, grec_type);
+ br_multicast_star_g_handle_mode(pg, MCAST_EXCLUDE);
+ changed = true;
+ break;
+ case MCAST_EXCLUDE:
+ changed = __grp_src_toex_excl(brmctx, pmctx, pg, h_addr, srcs,
+ nsrcs, addr_size, grec_type);
+ break;
+ }
+
+ pg->filter_mode = MCAST_EXCLUDE;
+ mod_timer(&pg->timer, jiffies + br_multicast_gmi(brmctx));
+
+ return changed;
+}
+
+/* State Msg type New state Actions
+ * INCLUDE (A) BLOCK (B) INCLUDE (A) Send Q(G,A*B)
+ */
+static bool __grp_src_block_incl(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
+ struct net_bridge_port_group *pg, void *h_addr,
+ void *srcs, u32 nsrcs, size_t addr_size, int grec_type)
+{
+ struct net_bridge_group_src *ent;
+ u32 src_idx, to_send = 0;
+ bool changed = false;
+ struct br_ip src_ip;
+
+ hlist_for_each_entry(ent, &pg->src_list, node)
+ ent->flags &= ~BR_SGRP_F_SEND;
+
+ memset(&src_ip, 0, sizeof(src_ip));
+ src_ip.proto = pg->key.addr.proto;
+ for (src_idx = 0; src_idx < nsrcs; src_idx++) {
+ memcpy(&src_ip.src, srcs + (src_idx * addr_size), addr_size);
+ ent = br_multicast_find_group_src(pg, &src_ip);
+ if (ent) {
+ ent->flags |= BR_SGRP_F_SEND;
+ to_send++;
+ }
+ }
+
+ if (br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size,
+ grec_type))
+ changed = true;
+
+ if (to_send)
+ __grp_src_query_marked_and_rexmit(brmctx, pmctx, pg);
+
+ return changed;
+}
+
+/* State Msg type New state Actions
+ * EXCLUDE (X,Y) BLOCK (A) EXCLUDE (X+(A-Y),Y) (A-X-Y)=Group Timer
+ * Send Q(G,A-Y)
+ */
+static bool __grp_src_block_excl(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
+ struct net_bridge_port_group *pg, void *h_addr,
+ void *srcs, u32 nsrcs, size_t addr_size, int grec_type)
+{
+ struct net_bridge_group_src *ent;
+ u32 src_idx, to_send = 0;
+ bool changed = false;
+ struct br_ip src_ip;
+
+ hlist_for_each_entry(ent, &pg->src_list, node)
+ ent->flags &= ~BR_SGRP_F_SEND;
+
+ memset(&src_ip, 0, sizeof(src_ip));
+ src_ip.proto = pg->key.addr.proto;
+ for (src_idx = 0; src_idx < nsrcs; src_idx++) {
+ memcpy(&src_ip.src, srcs + (src_idx * addr_size), addr_size);
+ ent = br_multicast_find_group_src(pg, &src_ip);
+ if (!ent) {
+ ent = br_multicast_new_group_src(pg, &src_ip);
+ if (ent) {
+ __grp_src_mod_timer(ent, pg->timer.expires);
+ changed = true;
+ }
+ }
+ if (ent && timer_pending(&ent->timer)) {
+ ent->flags |= BR_SGRP_F_SEND;
+ to_send++;
+ }
+ }
+
+ if (br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size,
+ grec_type))
+ changed = true;
+
+ if (to_send)
+ __grp_src_query_marked_and_rexmit(brmctx, pmctx, pg);
+
+ return changed;
+}
+
+static bool br_multicast_block(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
+ struct net_bridge_port_group *pg, void *h_addr,
+ void *srcs, u32 nsrcs, size_t addr_size, int grec_type)
+{
+ bool changed = false;
+
+ switch (pg->filter_mode) {
+ case MCAST_INCLUDE:
+ changed = __grp_src_block_incl(brmctx, pmctx, pg, h_addr, srcs,
+ nsrcs, addr_size, grec_type);
+ break;
+ case MCAST_EXCLUDE:
+ changed = __grp_src_block_excl(brmctx, pmctx, pg, h_addr, srcs,
+ nsrcs, addr_size, grec_type);
+ break;
+ }
+
+ if ((pg->filter_mode == MCAST_INCLUDE && hlist_empty(&pg->src_list)) ||
+ br_multicast_eht_should_del_pg(pg)) {
+ if (br_multicast_eht_should_del_pg(pg))
+ pg->flags |= MDB_PG_FLAGS_FAST_LEAVE;
+ br_multicast_find_del_pg(pg->key.port->br, pg);
+ /* a notification has already been sent and we shouldn't
+ * access pg after the delete so we have to return false
+ */
+ changed = false;
+ }
+
+ return changed;
+}
+
+static struct net_bridge_port_group *
+br_multicast_find_port(struct net_bridge_mdb_entry *mp,
+ struct net_bridge_port *p,
+ const unsigned char *src)
+{
+ struct net_bridge *br __maybe_unused = mp->br;
+ struct net_bridge_port_group *pg;
+
+ for (pg = mlock_dereference(mp->ports, br);
+ pg;
+ pg = mlock_dereference(pg->next, br))
+ if (br_port_group_equal(pg, p, src))
+ return pg;
+
+ return NULL;
+}
+
+static int br_ip4_multicast_igmp3_report(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
+ struct sk_buff *skb,
+ u16 vid)
+{
+ bool igmpv2 = brmctx->multicast_igmp_version == 2;
+ struct net_bridge_mdb_entry *mdst;
+ struct net_bridge_port_group *pg;
+ const unsigned char *src;
+ struct igmpv3_report *ih;
+ struct igmpv3_grec *grec;
+ int i, len, num, type;
+ __be32 group, *h_addr;
+ bool changed = false;
+ int err = 0;
+ u16 nsrcs;
+
+ ih = igmpv3_report_hdr(skb);
+ num = ntohs(ih->ngrec);
+ len = skb_transport_offset(skb) + sizeof(*ih);
+
+ for (i = 0; i < num; i++) {
+ len += sizeof(*grec);
+ if (!ip_mc_may_pull(skb, len))
+ return -EINVAL;
+
+ grec = (void *)(skb->data + len - sizeof(*grec));
+ group = grec->grec_mca;
+ type = grec->grec_type;
+ nsrcs = ntohs(grec->grec_nsrcs);
+
+ len += nsrcs * 4;
+ if (!ip_mc_may_pull(skb, len))
+ return -EINVAL;
+
+ switch (type) {
+ case IGMPV3_MODE_IS_INCLUDE:
+ case IGMPV3_MODE_IS_EXCLUDE:
+ case IGMPV3_CHANGE_TO_INCLUDE:
+ case IGMPV3_CHANGE_TO_EXCLUDE:
+ case IGMPV3_ALLOW_NEW_SOURCES:
+ case IGMPV3_BLOCK_OLD_SOURCES:
+ break;
+
+ default:
+ continue;
+ }
+
+ src = eth_hdr(skb)->h_source;
+ if (nsrcs == 0 &&
+ (type == IGMPV3_CHANGE_TO_INCLUDE ||
+ type == IGMPV3_MODE_IS_INCLUDE)) {
+ if (!pmctx || igmpv2) {
+ br_ip4_multicast_leave_group(brmctx, pmctx,
+ group, vid, src);
+ continue;
+ }
+ } else {
+ err = br_ip4_multicast_add_group(brmctx, pmctx, group,
+ vid, src, igmpv2);
+ if (err)
+ break;
+ }
+
+ if (!pmctx || igmpv2)
+ continue;
+
+ spin_lock(&brmctx->br->multicast_lock);
+ if (!br_multicast_ctx_should_use(brmctx, pmctx))
+ goto unlock_continue;
+
+ mdst = br_mdb_ip4_get(brmctx->br, group, vid);
+ if (!mdst)
+ goto unlock_continue;
+ pg = br_multicast_find_port(mdst, pmctx->port, src);
+ if (!pg || (pg->flags & MDB_PG_FLAGS_PERMANENT))
+ goto unlock_continue;
+ /* reload grec and host addr */
+ grec = (void *)(skb->data + len - sizeof(*grec) - (nsrcs * 4));
+ h_addr = &ip_hdr(skb)->saddr;
+ switch (type) {
+ case IGMPV3_ALLOW_NEW_SOURCES:
+ changed = br_multicast_isinc_allow(brmctx, pg, h_addr,
+ grec->grec_src,
+ nsrcs, sizeof(__be32), type);
+ break;
+ case IGMPV3_MODE_IS_INCLUDE:
+ changed = br_multicast_isinc_allow(brmctx, pg, h_addr,
+ grec->grec_src,
+ nsrcs, sizeof(__be32), type);
+ break;
+ case IGMPV3_MODE_IS_EXCLUDE:
+ changed = br_multicast_isexc(brmctx, pg, h_addr,
+ grec->grec_src,
+ nsrcs, sizeof(__be32), type);
+ break;
+ case IGMPV3_CHANGE_TO_INCLUDE:
+ changed = br_multicast_toin(brmctx, pmctx, pg, h_addr,
+ grec->grec_src,
+ nsrcs, sizeof(__be32), type);
+ break;
+ case IGMPV3_CHANGE_TO_EXCLUDE:
+ changed = br_multicast_toex(brmctx, pmctx, pg, h_addr,
+ grec->grec_src,
+ nsrcs, sizeof(__be32), type);
+ break;
+ case IGMPV3_BLOCK_OLD_SOURCES:
+ changed = br_multicast_block(brmctx, pmctx, pg, h_addr,
+ grec->grec_src,
+ nsrcs, sizeof(__be32), type);
+ break;
+ }
+ if (changed)
+ br_mdb_notify(brmctx->br->dev, mdst, pg, RTM_NEWMDB);
+unlock_continue:
+ spin_unlock(&brmctx->br->multicast_lock);
+ }
+
+ return err;
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static int br_ip6_multicast_mld2_report(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
+ struct sk_buff *skb,
+ u16 vid)
+{
+ bool mldv1 = brmctx->multicast_mld_version == 1;
+ struct net_bridge_mdb_entry *mdst;
+ struct net_bridge_port_group *pg;
+ unsigned int nsrcs_offset;
+ struct mld2_report *mld2r;
+ const unsigned char *src;
+ struct in6_addr *h_addr;
+ struct mld2_grec *grec;
+ unsigned int grec_len;
+ bool changed = false;
+ int i, len, num;
+ int err = 0;
+
+ if (!ipv6_mc_may_pull(skb, sizeof(*mld2r)))
+ return -EINVAL;
+
+ mld2r = (struct mld2_report *)icmp6_hdr(skb);
+ num = ntohs(mld2r->mld2r_ngrec);
+ len = skb_transport_offset(skb) + sizeof(*mld2r);
+
+ for (i = 0; i < num; i++) {
+ __be16 *_nsrcs, __nsrcs;
+ u16 nsrcs;
+
+ nsrcs_offset = len + offsetof(struct mld2_grec, grec_nsrcs);
+
+ if (skb_transport_offset(skb) + ipv6_transport_len(skb) <
+ nsrcs_offset + sizeof(__nsrcs))
+ return -EINVAL;
+
+ _nsrcs = skb_header_pointer(skb, nsrcs_offset,
+ sizeof(__nsrcs), &__nsrcs);
+ if (!_nsrcs)
+ return -EINVAL;
+
+ nsrcs = ntohs(*_nsrcs);
+ grec_len = struct_size(grec, grec_src, nsrcs);
+
+ if (!ipv6_mc_may_pull(skb, len + grec_len))
+ return -EINVAL;
+
+ grec = (struct mld2_grec *)(skb->data + len);
+ len += grec_len;
+
+ switch (grec->grec_type) {
+ case MLD2_MODE_IS_INCLUDE:
+ case MLD2_MODE_IS_EXCLUDE:
+ case MLD2_CHANGE_TO_INCLUDE:
+ case MLD2_CHANGE_TO_EXCLUDE:
+ case MLD2_ALLOW_NEW_SOURCES:
+ case MLD2_BLOCK_OLD_SOURCES:
+ break;
+
+ default:
+ continue;
+ }
+
+ src = eth_hdr(skb)->h_source;
+ if ((grec->grec_type == MLD2_CHANGE_TO_INCLUDE ||
+ grec->grec_type == MLD2_MODE_IS_INCLUDE) &&
+ nsrcs == 0) {
+ if (!pmctx || mldv1) {
+ br_ip6_multicast_leave_group(brmctx, pmctx,
+ &grec->grec_mca,
+ vid, src);
+ continue;
+ }
+ } else {
+ err = br_ip6_multicast_add_group(brmctx, pmctx,
+ &grec->grec_mca, vid,
+ src, mldv1);
+ if (err)
+ break;
+ }
+
+ if (!pmctx || mldv1)
+ continue;
+
+ spin_lock(&brmctx->br->multicast_lock);
+ if (!br_multicast_ctx_should_use(brmctx, pmctx))
+ goto unlock_continue;
+
+ mdst = br_mdb_ip6_get(brmctx->br, &grec->grec_mca, vid);
+ if (!mdst)
+ goto unlock_continue;
+ pg = br_multicast_find_port(mdst, pmctx->port, src);
+ if (!pg || (pg->flags & MDB_PG_FLAGS_PERMANENT))
+ goto unlock_continue;
+ h_addr = &ipv6_hdr(skb)->saddr;
+ switch (grec->grec_type) {
+ case MLD2_ALLOW_NEW_SOURCES:
+ changed = br_multicast_isinc_allow(brmctx, pg, h_addr,
+ grec->grec_src, nsrcs,
+ sizeof(struct in6_addr),
+ grec->grec_type);
+ break;
+ case MLD2_MODE_IS_INCLUDE:
+ changed = br_multicast_isinc_allow(brmctx, pg, h_addr,
+ grec->grec_src, nsrcs,
+ sizeof(struct in6_addr),
+ grec->grec_type);
+ break;
+ case MLD2_MODE_IS_EXCLUDE:
+ changed = br_multicast_isexc(brmctx, pg, h_addr,
+ grec->grec_src, nsrcs,
+ sizeof(struct in6_addr),
+ grec->grec_type);
+ break;
+ case MLD2_CHANGE_TO_INCLUDE:
+ changed = br_multicast_toin(brmctx, pmctx, pg, h_addr,
+ grec->grec_src, nsrcs,
+ sizeof(struct in6_addr),
+ grec->grec_type);
+ break;
+ case MLD2_CHANGE_TO_EXCLUDE:
+ changed = br_multicast_toex(brmctx, pmctx, pg, h_addr,
+ grec->grec_src, nsrcs,
+ sizeof(struct in6_addr),
+ grec->grec_type);
+ break;
+ case MLD2_BLOCK_OLD_SOURCES:
+ changed = br_multicast_block(brmctx, pmctx, pg, h_addr,
+ grec->grec_src, nsrcs,
+ sizeof(struct in6_addr),
+ grec->grec_type);
+ break;
+ }
+ if (changed)
+ br_mdb_notify(brmctx->br->dev, mdst, pg, RTM_NEWMDB);
+unlock_continue:
+ spin_unlock(&brmctx->br->multicast_lock);
+ }
+
+ return err;
+}
+#endif
+
+static bool br_multicast_select_querier(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
+ struct br_ip *saddr)
+{
+ int port_ifidx = pmctx ? pmctx->port->dev->ifindex : 0;
+ struct timer_list *own_timer, *other_timer;
+ struct bridge_mcast_querier *querier;
+
+ switch (saddr->proto) {
+ case htons(ETH_P_IP):
+ querier = &brmctx->ip4_querier;
+ own_timer = &brmctx->ip4_own_query.timer;
+ other_timer = &brmctx->ip4_other_query.timer;
+ if (!querier->addr.src.ip4 ||
+ ntohl(saddr->src.ip4) <= ntohl(querier->addr.src.ip4))
+ goto update;
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case htons(ETH_P_IPV6):
+ querier = &brmctx->ip6_querier;
+ own_timer = &brmctx->ip6_own_query.timer;
+ other_timer = &brmctx->ip6_other_query.timer;
+ if (ipv6_addr_cmp(&saddr->src.ip6, &querier->addr.src.ip6) <= 0)
+ goto update;
+ break;
+#endif
+ default:
+ return false;
+ }
+
+ if (!timer_pending(own_timer) && !timer_pending(other_timer))
+ goto update;
+
+ return false;
+
+update:
+ br_multicast_update_querier(brmctx, querier, port_ifidx, saddr);
+
+ return true;
+}
+
+static struct net_bridge_port *
+__br_multicast_get_querier_port(struct net_bridge *br,
+ const struct bridge_mcast_querier *querier)
+{
+ int port_ifidx = READ_ONCE(querier->port_ifidx);
+ struct net_bridge_port *p;
+ struct net_device *dev;
+
+ if (port_ifidx == 0)
+ return NULL;
+
+ dev = dev_get_by_index_rcu(dev_net(br->dev), port_ifidx);
+ if (!dev)
+ return NULL;
+ p = br_port_get_rtnl_rcu(dev);
+ if (!p || p->br != br)
+ return NULL;
+
+ return p;
+}
+
+size_t br_multicast_querier_state_size(void)
+{
+ return nla_total_size(0) + /* nest attribute */
+ nla_total_size(sizeof(__be32)) + /* BRIDGE_QUERIER_IP_ADDRESS */
+ nla_total_size(sizeof(int)) + /* BRIDGE_QUERIER_IP_PORT */
+ nla_total_size_64bit(sizeof(u64)) + /* BRIDGE_QUERIER_IP_OTHER_TIMER */
+#if IS_ENABLED(CONFIG_IPV6)
+ nla_total_size(sizeof(struct in6_addr)) + /* BRIDGE_QUERIER_IPV6_ADDRESS */
+ nla_total_size(sizeof(int)) + /* BRIDGE_QUERIER_IPV6_PORT */
+ nla_total_size_64bit(sizeof(u64)) + /* BRIDGE_QUERIER_IPV6_OTHER_TIMER */
+#endif
+ 0;
+}
+
+/* protected by rtnl or rcu */
+int br_multicast_dump_querier_state(struct sk_buff *skb,
+ const struct net_bridge_mcast *brmctx,
+ int nest_attr)
+{
+ struct bridge_mcast_querier querier = {};
+ struct net_bridge_port *p;
+ struct nlattr *nest;
+
+ if (!br_opt_get(brmctx->br, BROPT_MULTICAST_ENABLED) ||
+ br_multicast_ctx_vlan_global_disabled(brmctx))
+ return 0;
+
+ nest = nla_nest_start(skb, nest_attr);
+ if (!nest)
+ return -EMSGSIZE;
+
+ rcu_read_lock();
+ if (!brmctx->multicast_querier &&
+ !timer_pending(&brmctx->ip4_other_query.timer))
+ goto out_v6;
+
+ br_multicast_read_querier(&brmctx->ip4_querier, &querier);
+ if (nla_put_in_addr(skb, BRIDGE_QUERIER_IP_ADDRESS,
+ querier.addr.src.ip4)) {
+ rcu_read_unlock();
+ goto out_err;
+ }
+
+ p = __br_multicast_get_querier_port(brmctx->br, &querier);
+ if (timer_pending(&brmctx->ip4_other_query.timer) &&
+ (nla_put_u64_64bit(skb, BRIDGE_QUERIER_IP_OTHER_TIMER,
+ br_timer_value(&brmctx->ip4_other_query.timer),
+ BRIDGE_QUERIER_PAD) ||
+ (p && nla_put_u32(skb, BRIDGE_QUERIER_IP_PORT, p->dev->ifindex)))) {
+ rcu_read_unlock();
+ goto out_err;
+ }
+
+out_v6:
+#if IS_ENABLED(CONFIG_IPV6)
+ if (!brmctx->multicast_querier &&
+ !timer_pending(&brmctx->ip6_other_query.timer))
+ goto out;
+
+ br_multicast_read_querier(&brmctx->ip6_querier, &querier);
+ if (nla_put_in6_addr(skb, BRIDGE_QUERIER_IPV6_ADDRESS,
+ &querier.addr.src.ip6)) {
+ rcu_read_unlock();
+ goto out_err;
+ }
+
+ p = __br_multicast_get_querier_port(brmctx->br, &querier);
+ if (timer_pending(&brmctx->ip6_other_query.timer) &&
+ (nla_put_u64_64bit(skb, BRIDGE_QUERIER_IPV6_OTHER_TIMER,
+ br_timer_value(&brmctx->ip6_other_query.timer),
+ BRIDGE_QUERIER_PAD) ||
+ (p && nla_put_u32(skb, BRIDGE_QUERIER_IPV6_PORT,
+ p->dev->ifindex)))) {
+ rcu_read_unlock();
+ goto out_err;
+ }
+out:
+#endif
+ rcu_read_unlock();
+ nla_nest_end(skb, nest);
+ if (!nla_len(nest))
+ nla_nest_cancel(skb, nest);
+
+ return 0;
+
+out_err:
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+}
+
+static void
+br_multicast_update_query_timer(struct net_bridge_mcast *brmctx,
+ struct bridge_mcast_other_query *query,
+ unsigned long max_delay)
+{
+ if (!timer_pending(&query->timer))
+ mod_timer(&query->delay_timer, jiffies + max_delay);
+
+ mod_timer(&query->timer, jiffies + brmctx->multicast_querier_interval);
+}
+
+static void br_port_mc_router_state_change(struct net_bridge_port *p,
+ bool is_mc_router)
+{
+ struct switchdev_attr attr = {
+ .orig_dev = p->dev,
+ .id = SWITCHDEV_ATTR_ID_PORT_MROUTER,
+ .flags = SWITCHDEV_F_DEFER,
+ .u.mrouter = is_mc_router,
+ };
+
+ switchdev_port_attr_set(p->dev, &attr, NULL);
+}
+
+static struct net_bridge_port *
+br_multicast_rport_from_node(struct net_bridge_mcast *brmctx,
+ struct hlist_head *mc_router_list,
+ struct hlist_node *rlist)
+{
+ struct net_bridge_mcast_port *pmctx;
+
+#if IS_ENABLED(CONFIG_IPV6)
+ if (mc_router_list == &brmctx->ip6_mc_router_list)
+ pmctx = hlist_entry(rlist, struct net_bridge_mcast_port,
+ ip6_rlist);
+ else
+#endif
+ pmctx = hlist_entry(rlist, struct net_bridge_mcast_port,
+ ip4_rlist);
+
+ return pmctx->port;
+}
+
+static struct hlist_node *
+br_multicast_get_rport_slot(struct net_bridge_mcast *brmctx,
+ struct net_bridge_port *port,
+ struct hlist_head *mc_router_list)
+
+{
+ struct hlist_node *slot = NULL;
+ struct net_bridge_port *p;
+ struct hlist_node *rlist;
+
+ hlist_for_each(rlist, mc_router_list) {
+ p = br_multicast_rport_from_node(brmctx, mc_router_list, rlist);
+
+ if ((unsigned long)port >= (unsigned long)p)
+ break;
+
+ slot = rlist;
+ }
+
+ return slot;
+}
+
+static bool br_multicast_no_router_otherpf(struct net_bridge_mcast_port *pmctx,
+ struct hlist_node *rnode)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ if (rnode != &pmctx->ip6_rlist)
+ return hlist_unhashed(&pmctx->ip6_rlist);
+ else
+ return hlist_unhashed(&pmctx->ip4_rlist);
+#else
+ return true;
+#endif
+}
+
+/* Add port to router_list
+ * list is maintained ordered by pointer value
+ * and locked by br->multicast_lock and RCU
+ */
+static void br_multicast_add_router(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
+ struct hlist_node *rlist,
+ struct hlist_head *mc_router_list)
+{
+ struct hlist_node *slot;
+
+ if (!hlist_unhashed(rlist))
+ return;
+
+ slot = br_multicast_get_rport_slot(brmctx, pmctx->port, mc_router_list);
+
+ if (slot)
+ hlist_add_behind_rcu(rlist, slot);
+ else
+ hlist_add_head_rcu(rlist, mc_router_list);
+
+ /* For backwards compatibility for now, only notify if we
+ * switched from no IPv4/IPv6 multicast router to a new
+ * IPv4 or IPv6 multicast router.
+ */
+ if (br_multicast_no_router_otherpf(pmctx, rlist)) {
+ br_rtr_notify(pmctx->port->br->dev, pmctx, RTM_NEWMDB);
+ br_port_mc_router_state_change(pmctx->port, true);
+ }
+}
+
+/* Add port to router_list
+ * list is maintained ordered by pointer value
+ * and locked by br->multicast_lock and RCU
+ */
+static void br_ip4_multicast_add_router(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx)
+{
+ br_multicast_add_router(brmctx, pmctx, &pmctx->ip4_rlist,
+ &brmctx->ip4_mc_router_list);
+}
+
+/* Add port to router_list
+ * list is maintained ordered by pointer value
+ * and locked by br->multicast_lock and RCU
+ */
+static void br_ip6_multicast_add_router(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ br_multicast_add_router(brmctx, pmctx, &pmctx->ip6_rlist,
+ &brmctx->ip6_mc_router_list);
+#endif
+}
+
+static void br_multicast_mark_router(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
+ struct timer_list *timer,
+ struct hlist_node *rlist,
+ struct hlist_head *mc_router_list)
+{
+ unsigned long now = jiffies;
+
+ if (!br_multicast_ctx_should_use(brmctx, pmctx))
+ return;
+
+ if (!pmctx) {
+ if (brmctx->multicast_router == MDB_RTR_TYPE_TEMP_QUERY) {
+ if (!br_ip4_multicast_is_router(brmctx) &&
+ !br_ip6_multicast_is_router(brmctx))
+ br_mc_router_state_change(brmctx->br, true);
+ mod_timer(timer, now + brmctx->multicast_querier_interval);
+ }
+ return;
+ }
+
+ if (pmctx->multicast_router == MDB_RTR_TYPE_DISABLED ||
+ pmctx->multicast_router == MDB_RTR_TYPE_PERM)
+ return;
+
+ br_multicast_add_router(brmctx, pmctx, rlist, mc_router_list);
+ mod_timer(timer, now + brmctx->multicast_querier_interval);
+}
+
+static void br_ip4_multicast_mark_router(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx)
+{
+ struct timer_list *timer = &brmctx->ip4_mc_router_timer;
+ struct hlist_node *rlist = NULL;
+
+ if (pmctx) {
+ timer = &pmctx->ip4_mc_router_timer;
+ rlist = &pmctx->ip4_rlist;
+ }
+
+ br_multicast_mark_router(brmctx, pmctx, timer, rlist,
+ &brmctx->ip4_mc_router_list);
+}
+
+static void br_ip6_multicast_mark_router(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ struct timer_list *timer = &brmctx->ip6_mc_router_timer;
+ struct hlist_node *rlist = NULL;
+
+ if (pmctx) {
+ timer = &pmctx->ip6_mc_router_timer;
+ rlist = &pmctx->ip6_rlist;
+ }
+
+ br_multicast_mark_router(brmctx, pmctx, timer, rlist,
+ &brmctx->ip6_mc_router_list);
+#endif
+}
+
+static void
+br_ip4_multicast_query_received(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
+ struct bridge_mcast_other_query *query,
+ struct br_ip *saddr,
+ unsigned long max_delay)
+{
+ if (!br_multicast_select_querier(brmctx, pmctx, saddr))
+ return;
+
+ br_multicast_update_query_timer(brmctx, query, max_delay);
+ br_ip4_multicast_mark_router(brmctx, pmctx);
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static void
+br_ip6_multicast_query_received(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
+ struct bridge_mcast_other_query *query,
+ struct br_ip *saddr,
+ unsigned long max_delay)
+{
+ if (!br_multicast_select_querier(brmctx, pmctx, saddr))
+ return;
+
+ br_multicast_update_query_timer(brmctx, query, max_delay);
+ br_ip6_multicast_mark_router(brmctx, pmctx);
+}
+#endif
+
+static void br_ip4_multicast_query(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
+ struct sk_buff *skb,
+ u16 vid)
+{
+ unsigned int transport_len = ip_transport_len(skb);
+ const struct iphdr *iph = ip_hdr(skb);
+ struct igmphdr *ih = igmp_hdr(skb);
+ struct net_bridge_mdb_entry *mp;
+ struct igmpv3_query *ih3;
+ struct net_bridge_port_group *p;
+ struct net_bridge_port_group __rcu **pp;
+ struct br_ip saddr = {};
+ unsigned long max_delay;
+ unsigned long now = jiffies;
+ __be32 group;
+
+ spin_lock(&brmctx->br->multicast_lock);
+ if (!br_multicast_ctx_should_use(brmctx, pmctx))
+ goto out;
+
+ group = ih->group;
+
+ if (transport_len == sizeof(*ih)) {
+ max_delay = ih->code * (HZ / IGMP_TIMER_SCALE);
+
+ if (!max_delay) {
+ max_delay = 10 * HZ;
+ group = 0;
+ }
+ } else if (transport_len >= sizeof(*ih3)) {
+ ih3 = igmpv3_query_hdr(skb);
+ if (ih3->nsrcs ||
+ (brmctx->multicast_igmp_version == 3 && group &&
+ ih3->suppress))
+ goto out;
+
+ max_delay = ih3->code ?
+ IGMPV3_MRC(ih3->code) * (HZ / IGMP_TIMER_SCALE) : 1;
+ } else {
+ goto out;
+ }
+
+ if (!group) {
+ saddr.proto = htons(ETH_P_IP);
+ saddr.src.ip4 = iph->saddr;
+
+ br_ip4_multicast_query_received(brmctx, pmctx,
+ &brmctx->ip4_other_query,
+ &saddr, max_delay);
+ goto out;
+ }
+
+ mp = br_mdb_ip4_get(brmctx->br, group, vid);
+ if (!mp)
+ goto out;
+
+ max_delay *= brmctx->multicast_last_member_count;
+
+ if (mp->host_joined &&
+ (timer_pending(&mp->timer) ?
+ time_after(mp->timer.expires, now + max_delay) :
+ timer_delete_sync_try(&mp->timer) >= 0))
+ mod_timer(&mp->timer, now + max_delay);
+
+ for (pp = &mp->ports;
+ (p = mlock_dereference(*pp, brmctx->br)) != NULL;
+ pp = &p->next) {
+ if (timer_pending(&p->timer) ?
+ time_after(p->timer.expires, now + max_delay) :
+ timer_delete_sync_try(&p->timer) >= 0 &&
+ (brmctx->multicast_igmp_version == 2 ||
+ p->filter_mode == MCAST_EXCLUDE))
+ mod_timer(&p->timer, now + max_delay);
+ }
+
+out:
+ spin_unlock(&brmctx->br->multicast_lock);
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static int br_ip6_multicast_query(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
+ struct sk_buff *skb,
+ u16 vid)
+{
+ unsigned int transport_len = ipv6_transport_len(skb);
+ struct mld_msg *mld;
+ struct net_bridge_mdb_entry *mp;
+ struct mld2_query *mld2q;
+ struct net_bridge_port_group *p;
+ struct net_bridge_port_group __rcu **pp;
+ struct br_ip saddr = {};
+ unsigned long max_delay;
+ unsigned long now = jiffies;
+ unsigned int offset = skb_transport_offset(skb);
+ const struct in6_addr *group = NULL;
+ bool is_general_query;
+ int err = 0;
+
+ spin_lock(&brmctx->br->multicast_lock);
+ if (!br_multicast_ctx_should_use(brmctx, pmctx))
+ goto out;
+
+ if (transport_len == sizeof(*mld)) {
+ if (!pskb_may_pull(skb, offset + sizeof(*mld))) {
+ err = -EINVAL;
+ goto out;
+ }
+ mld = (struct mld_msg *) icmp6_hdr(skb);
+ max_delay = msecs_to_jiffies(ntohs(mld->mld_maxdelay));
+ if (max_delay)
+ group = &mld->mld_mca;
+ } else {
+ if (!pskb_may_pull(skb, offset + sizeof(*mld2q))) {
+ err = -EINVAL;
+ goto out;
+ }
+ mld2q = (struct mld2_query *)icmp6_hdr(skb);
+ if (!mld2q->mld2q_nsrcs)
+ group = &mld2q->mld2q_mca;
+ if (brmctx->multicast_mld_version == 2 &&
+ !ipv6_addr_any(&mld2q->mld2q_mca) &&
+ mld2q->mld2q_suppress)
+ goto out;
+
+ max_delay = max(msecs_to_jiffies(mldv2_mrc(mld2q)), 1UL);
+ }
+
+ is_general_query = group && ipv6_addr_any(group);
+
+ if (is_general_query) {
+ saddr.proto = htons(ETH_P_IPV6);
+ saddr.src.ip6 = ipv6_hdr(skb)->saddr;
+
+ br_ip6_multicast_query_received(brmctx, pmctx,
+ &brmctx->ip6_other_query,
+ &saddr, max_delay);
+ goto out;
+ } else if (!group) {
+ goto out;
+ }
+
+ mp = br_mdb_ip6_get(brmctx->br, group, vid);
+ if (!mp)
+ goto out;
+
+ max_delay *= brmctx->multicast_last_member_count;
+ if (mp->host_joined &&
+ (timer_pending(&mp->timer) ?
+ time_after(mp->timer.expires, now + max_delay) :
+ timer_delete_sync_try(&mp->timer) >= 0))
+ mod_timer(&mp->timer, now + max_delay);
+
+ for (pp = &mp->ports;
+ (p = mlock_dereference(*pp, brmctx->br)) != NULL;
+ pp = &p->next) {
+ if (timer_pending(&p->timer) ?
+ time_after(p->timer.expires, now + max_delay) :
+ timer_delete_sync_try(&p->timer) >= 0 &&
+ (brmctx->multicast_mld_version == 1 ||
+ p->filter_mode == MCAST_EXCLUDE))
+ mod_timer(&p->timer, now + max_delay);
+ }
+
+out:
+ spin_unlock(&brmctx->br->multicast_lock);
+ return err;
+}
+#endif
+
+static void
+br_multicast_leave_group(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
+ struct br_ip *group,
+ struct bridge_mcast_other_query *other_query,
+ struct bridge_mcast_own_query *own_query,
+ const unsigned char *src)
+{
+ struct net_bridge_mdb_entry *mp;
+ struct net_bridge_port_group *p;
+ unsigned long now;
+ unsigned long time;
+
+ spin_lock(&brmctx->br->multicast_lock);
+ if (!br_multicast_ctx_should_use(brmctx, pmctx))
+ goto out;
+
+ mp = br_mdb_ip_get(brmctx->br, group);
+ if (!mp)
+ goto out;
+
+ if (pmctx && (pmctx->port->flags & BR_MULTICAST_FAST_LEAVE)) {
+ struct net_bridge_port_group __rcu **pp;
+
+ for (pp = &mp->ports;
+ (p = mlock_dereference(*pp, brmctx->br)) != NULL;
+ pp = &p->next) {
+ if (!br_port_group_equal(p, pmctx->port, src))
+ continue;
+
+ if (p->flags & MDB_PG_FLAGS_PERMANENT)
+ break;
+
+ p->flags |= MDB_PG_FLAGS_FAST_LEAVE;
+ br_multicast_del_pg(mp, p, pp);
+ }
+ goto out;
+ }
+
+ if (timer_pending(&other_query->timer))
+ goto out;
+
+ if (brmctx->multicast_querier) {
+ __br_multicast_send_query(brmctx, pmctx, NULL, NULL, &mp->addr,
+ false, 0, NULL);
+
+ time = jiffies + brmctx->multicast_last_member_count *
+ brmctx->multicast_last_member_interval;
+
+ mod_timer(&own_query->timer, time);
+
+ for (p = mlock_dereference(mp->ports, brmctx->br);
+ p != NULL && pmctx != NULL;
+ p = mlock_dereference(p->next, brmctx->br)) {
+ if (!br_port_group_equal(p, pmctx->port, src))
+ continue;
+
+ if (!hlist_unhashed(&p->mglist) &&
+ (timer_pending(&p->timer) ?
+ time_after(p->timer.expires, time) :
+ timer_delete_sync_try(&p->timer) >= 0)) {
+ mod_timer(&p->timer, time);
+ }
+
+ break;
+ }
+ }
+
+ now = jiffies;
+ time = now + brmctx->multicast_last_member_count *
+ brmctx->multicast_last_member_interval;
+
+ if (!pmctx) {
+ if (mp->host_joined &&
+ (timer_pending(&mp->timer) ?
+ time_after(mp->timer.expires, time) :
+ timer_delete_sync_try(&mp->timer) >= 0)) {
+ mod_timer(&mp->timer, time);
+ }
+
+ goto out;
+ }
+
+ for (p = mlock_dereference(mp->ports, brmctx->br);
+ p != NULL;
+ p = mlock_dereference(p->next, brmctx->br)) {
+ if (p->key.port != pmctx->port)
+ continue;
+
+ if (!hlist_unhashed(&p->mglist) &&
+ (timer_pending(&p->timer) ?
+ time_after(p->timer.expires, time) :
+ timer_delete_sync_try(&p->timer) >= 0)) {
+ mod_timer(&p->timer, time);
+ }
+
+ break;
+ }
+out:
+ spin_unlock(&brmctx->br->multicast_lock);
+}
+
+static void br_ip4_multicast_leave_group(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
+ __be32 group,
+ __u16 vid,
+ const unsigned char *src)
+{
+ struct br_ip br_group;
+ struct bridge_mcast_own_query *own_query;
+
+ if (ipv4_is_local_multicast(group))
+ return;
+
+ own_query = pmctx ? &pmctx->ip4_own_query : &brmctx->ip4_own_query;
+
+ memset(&br_group, 0, sizeof(br_group));
+ br_group.dst.ip4 = group;
+ br_group.proto = htons(ETH_P_IP);
+ br_group.vid = vid;
+
+ br_multicast_leave_group(brmctx, pmctx, &br_group,
+ &brmctx->ip4_other_query,
+ own_query, src);
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static void br_ip6_multicast_leave_group(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
+ const struct in6_addr *group,
+ __u16 vid,
+ const unsigned char *src)
+{
+ struct br_ip br_group;
+ struct bridge_mcast_own_query *own_query;
+
+ if (ipv6_addr_is_ll_all_nodes(group))
+ return;
+
+ own_query = pmctx ? &pmctx->ip6_own_query : &brmctx->ip6_own_query;
+
+ memset(&br_group, 0, sizeof(br_group));
+ br_group.dst.ip6 = *group;
+ br_group.proto = htons(ETH_P_IPV6);
+ br_group.vid = vid;
+
+ br_multicast_leave_group(brmctx, pmctx, &br_group,
+ &brmctx->ip6_other_query,
+ own_query, src);
+}
+#endif
+
+static void br_multicast_err_count(const struct net_bridge *br,
+ const struct net_bridge_port *p,
+ __be16 proto)
+{
+ struct bridge_mcast_stats __percpu *stats;
+ struct bridge_mcast_stats *pstats;
+
+ if (!br_opt_get(br, BROPT_MULTICAST_STATS_ENABLED))
+ return;
+
+ if (p)
+ stats = p->mcast_stats;
+ else
+ stats = br->mcast_stats;
+ if (WARN_ON(!stats))
+ return;
+
+ pstats = this_cpu_ptr(stats);
+
+ u64_stats_update_begin(&pstats->syncp);
+ switch (proto) {
+ case htons(ETH_P_IP):
+ pstats->mstats.igmp_parse_errors++;
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case htons(ETH_P_IPV6):
+ pstats->mstats.mld_parse_errors++;
+ break;
+#endif
+ }
+ u64_stats_update_end(&pstats->syncp);
+}
+
+static void br_multicast_pim(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
+ const struct sk_buff *skb)
+{
+ unsigned int offset = skb_transport_offset(skb);
+ struct pimhdr *pimhdr, _pimhdr;
+
+ pimhdr = skb_header_pointer(skb, offset, sizeof(_pimhdr), &_pimhdr);
+ if (!pimhdr || pim_hdr_version(pimhdr) != PIM_VERSION ||
+ pim_hdr_type(pimhdr) != PIM_TYPE_HELLO)
+ return;
+
+ spin_lock(&brmctx->br->multicast_lock);
+ br_ip4_multicast_mark_router(brmctx, pmctx);
+ spin_unlock(&brmctx->br->multicast_lock);
+}
+
+static int br_ip4_multicast_mrd_rcv(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
+ struct sk_buff *skb)
+{
+ if (ip_hdr(skb)->protocol != IPPROTO_IGMP ||
+ igmp_hdr(skb)->type != IGMP_MRDISC_ADV)
+ return -ENOMSG;
+
+ spin_lock(&brmctx->br->multicast_lock);
+ br_ip4_multicast_mark_router(brmctx, pmctx);
+ spin_unlock(&brmctx->br->multicast_lock);
+
+ return 0;
+}
+
+static int br_multicast_ipv4_rcv(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
+ struct sk_buff *skb,
+ u16 vid)
+{
+ struct net_bridge_port *p = pmctx ? pmctx->port : NULL;
+ const unsigned char *src;
+ struct igmphdr *ih;
+ int err;
+
+ err = ip_mc_check_igmp(skb);
+
+ if (err == -ENOMSG) {
+ if (!ipv4_is_local_multicast(ip_hdr(skb)->daddr)) {
+ BR_INPUT_SKB_CB(skb)->mrouters_only = 1;
+ } else if (pim_ipv4_all_pim_routers(ip_hdr(skb)->daddr)) {
+ if (ip_hdr(skb)->protocol == IPPROTO_PIM)
+ br_multicast_pim(brmctx, pmctx, skb);
+ } else if (ipv4_is_all_snoopers(ip_hdr(skb)->daddr)) {
+ br_ip4_multicast_mrd_rcv(brmctx, pmctx, skb);
+ }
+
+ return 0;
+ } else if (err < 0) {
+ br_multicast_err_count(brmctx->br, p, skb->protocol);
+ return err;
+ }
+
+ ih = igmp_hdr(skb);
+ src = eth_hdr(skb)->h_source;
+ BR_INPUT_SKB_CB(skb)->igmp = ih->type;
+
+ switch (ih->type) {
+ case IGMP_HOST_MEMBERSHIP_REPORT:
+ case IGMPV2_HOST_MEMBERSHIP_REPORT:
+ BR_INPUT_SKB_CB(skb)->mrouters_only = 1;
+ err = br_ip4_multicast_add_group(brmctx, pmctx, ih->group, vid,
+ src, true);
+ break;
+ case IGMPV3_HOST_MEMBERSHIP_REPORT:
+ err = br_ip4_multicast_igmp3_report(brmctx, pmctx, skb, vid);
+ break;
+ case IGMP_HOST_MEMBERSHIP_QUERY:
+ br_ip4_multicast_query(brmctx, pmctx, skb, vid);
+ break;
+ case IGMP_HOST_LEAVE_MESSAGE:
+ br_ip4_multicast_leave_group(brmctx, pmctx, ih->group, vid, src);
+ break;
+ }
+
+ br_multicast_count(brmctx->br, p, skb, BR_INPUT_SKB_CB(skb)->igmp,
+ BR_MCAST_DIR_RX);
+
+ return err;
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static void br_ip6_multicast_mrd_rcv(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
+ struct sk_buff *skb)
+{
+ if (icmp6_hdr(skb)->icmp6_type != ICMPV6_MRDISC_ADV)
+ return;
+
+ spin_lock(&brmctx->br->multicast_lock);
+ br_ip6_multicast_mark_router(brmctx, pmctx);
+ spin_unlock(&brmctx->br->multicast_lock);
+}
+
+static int br_multicast_ipv6_rcv(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
+ struct sk_buff *skb,
+ u16 vid)
+{
+ struct net_bridge_port *p = pmctx ? pmctx->port : NULL;
+ const unsigned char *src;
+ struct mld_msg *mld;
+ int err;
+
+ err = ipv6_mc_check_mld(skb);
+
+ if (err == -ENOMSG || err == -ENODATA) {
+ if (!ipv6_addr_is_ll_all_nodes(&ipv6_hdr(skb)->daddr))
+ BR_INPUT_SKB_CB(skb)->mrouters_only = 1;
+ if (err == -ENODATA &&
+ ipv6_addr_is_all_snoopers(&ipv6_hdr(skb)->daddr))
+ br_ip6_multicast_mrd_rcv(brmctx, pmctx, skb);
+
+ return 0;
+ } else if (err < 0) {
+ br_multicast_err_count(brmctx->br, p, skb->protocol);
+ return err;
+ }
+
+ mld = (struct mld_msg *)skb_transport_header(skb);
+ BR_INPUT_SKB_CB(skb)->igmp = mld->mld_type;
+
+ switch (mld->mld_type) {
+ case ICMPV6_MGM_REPORT:
+ src = eth_hdr(skb)->h_source;
+ BR_INPUT_SKB_CB(skb)->mrouters_only = 1;
+ err = br_ip6_multicast_add_group(brmctx, pmctx, &mld->mld_mca,
+ vid, src, true);
+ break;
+ case ICMPV6_MLD2_REPORT:
+ err = br_ip6_multicast_mld2_report(brmctx, pmctx, skb, vid);
+ break;
+ case ICMPV6_MGM_QUERY:
+ err = br_ip6_multicast_query(brmctx, pmctx, skb, vid);
+ break;
+ case ICMPV6_MGM_REDUCTION:
+ src = eth_hdr(skb)->h_source;
+ br_ip6_multicast_leave_group(brmctx, pmctx, &mld->mld_mca, vid,
+ src);
+ break;
+ }
+
+ br_multicast_count(brmctx->br, p, skb, BR_INPUT_SKB_CB(skb)->igmp,
+ BR_MCAST_DIR_RX);
+
+ return err;
+}
+#endif
+
+int br_multicast_rcv(struct net_bridge_mcast **brmctx,
+ struct net_bridge_mcast_port **pmctx,
+ struct net_bridge_vlan *vlan,
+ struct sk_buff *skb, u16 vid)
+{
+ int ret = 0;
+
+ BR_INPUT_SKB_CB(skb)->igmp = 0;
+ BR_INPUT_SKB_CB(skb)->mrouters_only = 0;
+
+ if (!br_opt_get((*brmctx)->br, BROPT_MULTICAST_ENABLED))
+ return 0;
+
+ if (br_opt_get((*brmctx)->br, BROPT_MCAST_VLAN_SNOOPING_ENABLED) && vlan) {
+ const struct net_bridge_vlan *masterv;
+
+ /* the vlan has the master flag set only when transmitting
+ * through the bridge device
+ */
+ if (br_vlan_is_master(vlan)) {
+ masterv = vlan;
+ *brmctx = &vlan->br_mcast_ctx;
+ *pmctx = NULL;
+ } else {
+ masterv = vlan->brvlan;
+ *brmctx = &vlan->brvlan->br_mcast_ctx;
+ *pmctx = &vlan->port_mcast_ctx;
+ }
+
+ if (!(masterv->priv_flags & BR_VLFLAG_GLOBAL_MCAST_ENABLED))
+ return 0;
+ }
+
+ switch (skb->protocol) {
+ case htons(ETH_P_IP):
+ ret = br_multicast_ipv4_rcv(*brmctx, *pmctx, skb, vid);
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case htons(ETH_P_IPV6):
+ ret = br_multicast_ipv6_rcv(*brmctx, *pmctx, skb, vid);
+ break;
+#endif
+ }
+
+ return ret;
+}
+
+static void br_multicast_query_expired(struct net_bridge_mcast *brmctx,
+ struct bridge_mcast_own_query *query)
+{
+ spin_lock(&brmctx->br->multicast_lock);
+ if (br_multicast_ctx_vlan_disabled(brmctx))
+ goto out;
+
+ if (query->startup_sent < brmctx->multicast_startup_query_count)
+ query->startup_sent++;
+
+ br_multicast_send_query(brmctx, NULL, query);
+out:
+ spin_unlock(&brmctx->br->multicast_lock);
+}
+
+static void br_ip4_multicast_query_expired(struct timer_list *t)
+{
+ struct net_bridge_mcast *brmctx = timer_container_of(brmctx, t,
+ ip4_own_query.timer);
+
+ br_multicast_query_expired(brmctx, &brmctx->ip4_own_query);
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static void br_ip6_multicast_query_expired(struct timer_list *t)
+{
+ struct net_bridge_mcast *brmctx = timer_container_of(brmctx, t,
+ ip6_own_query.timer);
+
+ br_multicast_query_expired(brmctx, &brmctx->ip6_own_query);
+}
+#endif
+
+static void br_multicast_gc_work(struct work_struct *work)
+{
+ struct net_bridge *br = container_of(work, struct net_bridge,
+ mcast_gc_work);
+ HLIST_HEAD(deleted_head);
+
+ spin_lock_bh(&br->multicast_lock);
+ hlist_move_list(&br->mcast_gc_list, &deleted_head);
+ spin_unlock_bh(&br->multicast_lock);
+
+ br_multicast_gc(&deleted_head);
+}
+
+void br_multicast_ctx_init(struct net_bridge *br,
+ struct net_bridge_vlan *vlan,
+ struct net_bridge_mcast *brmctx)
+{
+ brmctx->br = br;
+ brmctx->vlan = vlan;
+ brmctx->multicast_router = MDB_RTR_TYPE_TEMP_QUERY;
+ brmctx->multicast_last_member_count = 2;
+ brmctx->multicast_startup_query_count = 2;
+
+ brmctx->multicast_last_member_interval = HZ;
+ brmctx->multicast_query_response_interval = 10 * HZ;
+ brmctx->multicast_startup_query_interval = 125 * HZ / 4;
+ brmctx->multicast_query_interval = 125 * HZ;
+ brmctx->multicast_querier_interval = 255 * HZ;
+ brmctx->multicast_membership_interval = 260 * HZ;
+
+ brmctx->ip4_querier.port_ifidx = 0;
+ seqcount_spinlock_init(&brmctx->ip4_querier.seq, &br->multicast_lock);
+ brmctx->multicast_igmp_version = 2;
+#if IS_ENABLED(CONFIG_IPV6)
+ brmctx->multicast_mld_version = 1;
+ brmctx->ip6_querier.port_ifidx = 0;
+ seqcount_spinlock_init(&brmctx->ip6_querier.seq, &br->multicast_lock);
+#endif
+
+ timer_setup(&brmctx->ip4_mc_router_timer,
+ br_ip4_multicast_local_router_expired, 0);
+ timer_setup(&brmctx->ip4_other_query.timer,
+ br_ip4_multicast_querier_expired, 0);
+ timer_setup(&brmctx->ip4_other_query.delay_timer,
+ br_multicast_query_delay_expired, 0);
+ timer_setup(&brmctx->ip4_own_query.timer,
+ br_ip4_multicast_query_expired, 0);
+#if IS_ENABLED(CONFIG_IPV6)
+ timer_setup(&brmctx->ip6_mc_router_timer,
+ br_ip6_multicast_local_router_expired, 0);
+ timer_setup(&brmctx->ip6_other_query.timer,
+ br_ip6_multicast_querier_expired, 0);
+ timer_setup(&brmctx->ip6_other_query.delay_timer,
+ br_multicast_query_delay_expired, 0);
+ timer_setup(&brmctx->ip6_own_query.timer,
+ br_ip6_multicast_query_expired, 0);
+#endif
+}
+
+void br_multicast_ctx_deinit(struct net_bridge_mcast *brmctx)
+{
+ __br_multicast_stop(brmctx);
+}
+
+void br_multicast_init(struct net_bridge *br)
+{
+ br->hash_max = BR_MULTICAST_DEFAULT_HASH_MAX;
+
+ br_multicast_ctx_init(br, NULL, &br->multicast_ctx);
+
+ br_opt_toggle(br, BROPT_MULTICAST_ENABLED, true);
+ br_opt_toggle(br, BROPT_HAS_IPV6_ADDR, true);
+
+ spin_lock_init(&br->multicast_lock);
+ INIT_HLIST_HEAD(&br->mdb_list);
+ INIT_HLIST_HEAD(&br->mcast_gc_list);
+ INIT_WORK(&br->mcast_gc_work, br_multicast_gc_work);
+}
+
+static void br_ip4_multicast_join_snoopers(struct net_bridge *br)
+{
+ struct in_device *in_dev = in_dev_get(br->dev);
+
+ if (!in_dev)
+ return;
+
+ __ip_mc_inc_group(in_dev, htonl(INADDR_ALLSNOOPERS_GROUP), GFP_ATOMIC);
+ in_dev_put(in_dev);
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static void br_ip6_multicast_join_snoopers(struct net_bridge *br)
+{
+ struct in6_addr addr;
+
+ ipv6_addr_set(&addr, htonl(0xff020000), 0, 0, htonl(0x6a));
+ ipv6_dev_mc_inc(br->dev, &addr);
+}
+#else
+static inline void br_ip6_multicast_join_snoopers(struct net_bridge *br)
+{
+}
+#endif
+
+void br_multicast_join_snoopers(struct net_bridge *br)
+{
+ br_ip4_multicast_join_snoopers(br);
+ br_ip6_multicast_join_snoopers(br);
+}
+
+static void br_ip4_multicast_leave_snoopers(struct net_bridge *br)
+{
+ struct in_device *in_dev = in_dev_get(br->dev);
+
+ if (WARN_ON(!in_dev))
+ return;
+
+ __ip_mc_dec_group(in_dev, htonl(INADDR_ALLSNOOPERS_GROUP), GFP_ATOMIC);
+ in_dev_put(in_dev);
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static void br_ip6_multicast_leave_snoopers(struct net_bridge *br)
+{
+ struct in6_addr addr;
+
+ ipv6_addr_set(&addr, htonl(0xff020000), 0, 0, htonl(0x6a));
+ ipv6_dev_mc_dec(br->dev, &addr);
+}
+#else
+static inline void br_ip6_multicast_leave_snoopers(struct net_bridge *br)
+{
+}
+#endif
+
+void br_multicast_leave_snoopers(struct net_bridge *br)
+{
+ br_ip4_multicast_leave_snoopers(br);
+ br_ip6_multicast_leave_snoopers(br);
+}
+
+static void __br_multicast_open_query(struct net_bridge *br,
+ struct bridge_mcast_own_query *query)
+{
+ query->startup_sent = 0;
+
+ if (!br_opt_get(br, BROPT_MULTICAST_ENABLED))
+ return;
+
+ mod_timer(&query->timer, jiffies);
+}
+
+static void __br_multicast_open(struct net_bridge_mcast *brmctx)
+{
+ __br_multicast_open_query(brmctx->br, &brmctx->ip4_own_query);
+#if IS_ENABLED(CONFIG_IPV6)
+ __br_multicast_open_query(brmctx->br, &brmctx->ip6_own_query);
+#endif
+}
+
+void br_multicast_open(struct net_bridge *br)
+{
+ ASSERT_RTNL();
+
+ if (br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) {
+ struct net_bridge_vlan_group *vg;
+ struct net_bridge_vlan *vlan;
+
+ vg = br_vlan_group(br);
+ if (vg) {
+ list_for_each_entry(vlan, &vg->vlan_list, vlist) {
+ struct net_bridge_mcast *brmctx;
+
+ brmctx = &vlan->br_mcast_ctx;
+ if (br_vlan_is_brentry(vlan) &&
+ !br_multicast_ctx_vlan_disabled(brmctx))
+ __br_multicast_open(&vlan->br_mcast_ctx);
+ }
+ }
+ } else {
+ __br_multicast_open(&br->multicast_ctx);
+ }
+}
+
+static void __br_multicast_stop(struct net_bridge_mcast *brmctx)
+{
+ timer_delete_sync(&brmctx->ip4_mc_router_timer);
+ timer_delete_sync(&brmctx->ip4_other_query.timer);
+ timer_delete_sync(&brmctx->ip4_other_query.delay_timer);
+ timer_delete_sync(&brmctx->ip4_own_query.timer);
+#if IS_ENABLED(CONFIG_IPV6)
+ timer_delete_sync(&brmctx->ip6_mc_router_timer);
+ timer_delete_sync(&brmctx->ip6_other_query.timer);
+ timer_delete_sync(&brmctx->ip6_other_query.delay_timer);
+ timer_delete_sync(&brmctx->ip6_own_query.timer);
+#endif
+}
+
+void br_multicast_update_vlan_mcast_ctx(struct net_bridge_vlan *v, u8 state)
+{
+#if IS_ENABLED(CONFIG_BRIDGE_VLAN_FILTERING)
+ struct net_bridge *br;
+
+ if (!br_vlan_should_use(v))
+ return;
+
+ if (br_vlan_is_master(v))
+ return;
+
+ br = v->port->br;
+
+ if (!br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED))
+ return;
+
+ if (br_vlan_state_allowed(state, true))
+ br_multicast_enable_port_ctx(&v->port_mcast_ctx);
+
+ /* Multicast is not disabled for the vlan when it goes in
+ * blocking state because the timers will expire and stop by
+ * themselves without sending more queries.
+ */
+#endif
+}
+
+void br_multicast_toggle_one_vlan(struct net_bridge_vlan *vlan, bool on)
+{
+ struct net_bridge *br;
+
+ /* it's okay to check for the flag without the multicast lock because it
+ * can only change under RTNL -> multicast_lock, we need the latter to
+ * sync with timers and packets
+ */
+ if (on == !!(vlan->priv_flags & BR_VLFLAG_MCAST_ENABLED))
+ return;
+
+ if (br_vlan_is_master(vlan)) {
+ br = vlan->br;
+
+ if (!br_vlan_is_brentry(vlan) ||
+ (on &&
+ br_multicast_ctx_vlan_global_disabled(&vlan->br_mcast_ctx)))
+ return;
+
+ spin_lock_bh(&br->multicast_lock);
+ vlan->priv_flags ^= BR_VLFLAG_MCAST_ENABLED;
+ spin_unlock_bh(&br->multicast_lock);
+
+ if (on)
+ __br_multicast_open(&vlan->br_mcast_ctx);
+ else
+ __br_multicast_stop(&vlan->br_mcast_ctx);
+ } else {
+ struct net_bridge_mcast *brmctx;
+
+ brmctx = br_multicast_port_ctx_get_global(&vlan->port_mcast_ctx);
+ if (on && br_multicast_ctx_vlan_global_disabled(brmctx))
+ return;
+
+ br = vlan->port->br;
+ spin_lock_bh(&br->multicast_lock);
+ vlan->priv_flags ^= BR_VLFLAG_MCAST_ENABLED;
+ if (on)
+ __br_multicast_enable_port_ctx(&vlan->port_mcast_ctx);
+ else
+ __br_multicast_disable_port_ctx(&vlan->port_mcast_ctx);
+ spin_unlock_bh(&br->multicast_lock);
+ }
+}
+
+static void br_multicast_toggle_vlan(struct net_bridge_vlan *vlan, bool on)
+{
+ struct net_bridge_port *p;
+
+ if (WARN_ON_ONCE(!br_vlan_is_master(vlan)))
+ return;
+
+ list_for_each_entry(p, &vlan->br->port_list, list) {
+ struct net_bridge_vlan *vport;
+
+ vport = br_vlan_find(nbp_vlan_group(p), vlan->vid);
+ if (!vport)
+ continue;
+ br_multicast_toggle_one_vlan(vport, on);
+ }
+
+ if (br_vlan_is_brentry(vlan))
+ br_multicast_toggle_one_vlan(vlan, on);
+}
+
+int br_multicast_toggle_vlan_snooping(struct net_bridge *br, bool on,
+ struct netlink_ext_ack *extack)
+{
+ struct net_bridge_vlan_group *vg;
+ struct net_bridge_vlan *vlan;
+ struct net_bridge_port *p;
+
+ if (br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED) == on)
+ return 0;
+
+ if (on && !br_opt_get(br, BROPT_VLAN_ENABLED)) {
+ NL_SET_ERR_MSG_MOD(extack, "Cannot enable multicast vlan snooping with vlan filtering disabled");
+ return -EINVAL;
+ }
+
+ vg = br_vlan_group(br);
+ if (!vg)
+ return 0;
+
+ br_opt_toggle(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED, on);
+
+ /* disable/enable non-vlan mcast contexts based on vlan snooping */
+ if (on)
+ __br_multicast_stop(&br->multicast_ctx);
+ else
+ __br_multicast_open(&br->multicast_ctx);
+ list_for_each_entry(p, &br->port_list, list) {
+ if (on)
+ br_multicast_disable_port_ctx(&p->multicast_ctx);
+ else
+ br_multicast_enable_port_ctx(&p->multicast_ctx);
+ }
+
+ list_for_each_entry(vlan, &vg->vlan_list, vlist)
+ br_multicast_toggle_vlan(vlan, on);
+
+ return 0;
+}
+
+bool br_multicast_toggle_global_vlan(struct net_bridge_vlan *vlan, bool on)
+{
+ ASSERT_RTNL();
+
+ /* BR_VLFLAG_GLOBAL_MCAST_ENABLED relies on eventual consistency and
+ * requires only RTNL to change
+ */
+ if (on == !!(vlan->priv_flags & BR_VLFLAG_GLOBAL_MCAST_ENABLED))
+ return false;
+
+ vlan->priv_flags ^= BR_VLFLAG_GLOBAL_MCAST_ENABLED;
+ br_multicast_toggle_vlan(vlan, on);
+
+ return true;
+}
+
+void br_multicast_stop(struct net_bridge *br)
+{
+ ASSERT_RTNL();
+
+ if (br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) {
+ struct net_bridge_vlan_group *vg;
+ struct net_bridge_vlan *vlan;
+
+ vg = br_vlan_group(br);
+ if (vg) {
+ list_for_each_entry(vlan, &vg->vlan_list, vlist) {
+ struct net_bridge_mcast *brmctx;
+
+ brmctx = &vlan->br_mcast_ctx;
+ if (br_vlan_is_brentry(vlan) &&
+ !br_multicast_ctx_vlan_disabled(brmctx))
+ __br_multicast_stop(&vlan->br_mcast_ctx);
+ }
+ }
+ } else {
+ __br_multicast_stop(&br->multicast_ctx);
+ }
+}
+
+void br_multicast_dev_del(struct net_bridge *br)
+{
+ struct net_bridge_mdb_entry *mp;
+ HLIST_HEAD(deleted_head);
+ struct hlist_node *tmp;
+
+ spin_lock_bh(&br->multicast_lock);
+ hlist_for_each_entry_safe(mp, tmp, &br->mdb_list, mdb_node)
+ br_multicast_del_mdb_entry(mp);
+ hlist_move_list(&br->mcast_gc_list, &deleted_head);
+ spin_unlock_bh(&br->multicast_lock);
+
+ br_multicast_ctx_deinit(&br->multicast_ctx);
+ br_multicast_gc(&deleted_head);
+ cancel_work_sync(&br->mcast_gc_work);
+
+ rcu_barrier();
+}
+
+int br_multicast_set_router(struct net_bridge_mcast *brmctx, unsigned long val)
+{
+ int err = -EINVAL;
+
+ spin_lock_bh(&brmctx->br->multicast_lock);
+
+ switch (val) {
+ case MDB_RTR_TYPE_DISABLED:
+ case MDB_RTR_TYPE_PERM:
+ br_mc_router_state_change(brmctx->br, val == MDB_RTR_TYPE_PERM);
+ timer_delete(&brmctx->ip4_mc_router_timer);
+#if IS_ENABLED(CONFIG_IPV6)
+ timer_delete(&brmctx->ip6_mc_router_timer);
+#endif
+ brmctx->multicast_router = val;
+ err = 0;
+ break;
+ case MDB_RTR_TYPE_TEMP_QUERY:
+ if (brmctx->multicast_router != MDB_RTR_TYPE_TEMP_QUERY)
+ br_mc_router_state_change(brmctx->br, false);
+ brmctx->multicast_router = val;
+ err = 0;
+ break;
+ }
+
+ spin_unlock_bh(&brmctx->br->multicast_lock);
+
+ return err;
+}
+
+static void
+br_multicast_rport_del_notify(struct net_bridge_mcast_port *pmctx, bool deleted)
+{
+ if (!deleted)
+ return;
+
+ /* For backwards compatibility for now, only notify if there is
+ * no multicast router anymore for both IPv4 and IPv6.
+ */
+ if (!hlist_unhashed(&pmctx->ip4_rlist))
+ return;
+#if IS_ENABLED(CONFIG_IPV6)
+ if (!hlist_unhashed(&pmctx->ip6_rlist))
+ return;
+#endif
+
+ br_rtr_notify(pmctx->port->br->dev, pmctx, RTM_DELMDB);
+ br_port_mc_router_state_change(pmctx->port, false);
+
+ /* don't allow timer refresh */
+ if (pmctx->multicast_router == MDB_RTR_TYPE_TEMP)
+ pmctx->multicast_router = MDB_RTR_TYPE_TEMP_QUERY;
+}
+
+int br_multicast_set_port_router(struct net_bridge_mcast_port *pmctx,
+ unsigned long val)
+{
+ struct net_bridge_mcast *brmctx;
+ unsigned long now = jiffies;
+ int err = -EINVAL;
+ bool del = false;
+
+ brmctx = br_multicast_port_ctx_get_global(pmctx);
+ spin_lock_bh(&brmctx->br->multicast_lock);
+ if (pmctx->multicast_router == val) {
+ /* Refresh the temp router port timer */
+ if (pmctx->multicast_router == MDB_RTR_TYPE_TEMP) {
+ mod_timer(&pmctx->ip4_mc_router_timer,
+ now + brmctx->multicast_querier_interval);
+#if IS_ENABLED(CONFIG_IPV6)
+ mod_timer(&pmctx->ip6_mc_router_timer,
+ now + brmctx->multicast_querier_interval);
+#endif
+ }
+ err = 0;
+ goto unlock;
+ }
+ switch (val) {
+ case MDB_RTR_TYPE_DISABLED:
+ pmctx->multicast_router = MDB_RTR_TYPE_DISABLED;
+ del |= br_ip4_multicast_rport_del(pmctx);
+ timer_delete(&pmctx->ip4_mc_router_timer);
+ del |= br_ip6_multicast_rport_del(pmctx);
+#if IS_ENABLED(CONFIG_IPV6)
+ timer_delete(&pmctx->ip6_mc_router_timer);
+#endif
+ br_multicast_rport_del_notify(pmctx, del);
+ break;
+ case MDB_RTR_TYPE_TEMP_QUERY:
+ pmctx->multicast_router = MDB_RTR_TYPE_TEMP_QUERY;
+ del |= br_ip4_multicast_rport_del(pmctx);
+ del |= br_ip6_multicast_rport_del(pmctx);
+ br_multicast_rport_del_notify(pmctx, del);
+ break;
+ case MDB_RTR_TYPE_PERM:
+ pmctx->multicast_router = MDB_RTR_TYPE_PERM;
+ timer_delete(&pmctx->ip4_mc_router_timer);
+ br_ip4_multicast_add_router(brmctx, pmctx);
+#if IS_ENABLED(CONFIG_IPV6)
+ timer_delete(&pmctx->ip6_mc_router_timer);
+#endif
+ br_ip6_multicast_add_router(brmctx, pmctx);
+ break;
+ case MDB_RTR_TYPE_TEMP:
+ pmctx->multicast_router = MDB_RTR_TYPE_TEMP;
+ br_ip4_multicast_mark_router(brmctx, pmctx);
+ br_ip6_multicast_mark_router(brmctx, pmctx);
+ break;
+ default:
+ goto unlock;
+ }
+ err = 0;
+unlock:
+ spin_unlock_bh(&brmctx->br->multicast_lock);
+
+ return err;
+}
+
+int br_multicast_set_vlan_router(struct net_bridge_vlan *v, u8 mcast_router)
+{
+ int err;
+
+ if (br_vlan_is_master(v))
+ err = br_multicast_set_router(&v->br_mcast_ctx, mcast_router);
+ else
+ err = br_multicast_set_port_router(&v->port_mcast_ctx,
+ mcast_router);
+
+ return err;
+}
+
+static void br_multicast_start_querier(struct net_bridge_mcast *brmctx,
+ struct bridge_mcast_own_query *query)
+{
+ struct net_bridge_port *port;
+
+ if (!br_multicast_ctx_matches_vlan_snooping(brmctx))
+ return;
+
+ __br_multicast_open_query(brmctx->br, query);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(port, &brmctx->br->port_list, list) {
+ struct bridge_mcast_own_query *ip4_own_query;
+#if IS_ENABLED(CONFIG_IPV6)
+ struct bridge_mcast_own_query *ip6_own_query;
+#endif
+
+ if (br_multicast_port_ctx_state_stopped(&port->multicast_ctx))
+ continue;
+
+ if (br_multicast_ctx_is_vlan(brmctx)) {
+ struct net_bridge_vlan *vlan;
+
+ vlan = br_vlan_find(nbp_vlan_group_rcu(port),
+ brmctx->vlan->vid);
+ if (!vlan ||
+ br_multicast_port_ctx_state_stopped(&vlan->port_mcast_ctx))
+ continue;
+
+ ip4_own_query = &vlan->port_mcast_ctx.ip4_own_query;
+#if IS_ENABLED(CONFIG_IPV6)
+ ip6_own_query = &vlan->port_mcast_ctx.ip6_own_query;
+#endif
+ } else {
+ ip4_own_query = &port->multicast_ctx.ip4_own_query;
+#if IS_ENABLED(CONFIG_IPV6)
+ ip6_own_query = &port->multicast_ctx.ip6_own_query;
+#endif
+ }
+
+ if (query == &brmctx->ip4_own_query)
+ br_multicast_enable(ip4_own_query);
+#if IS_ENABLED(CONFIG_IPV6)
+ else
+ br_multicast_enable(ip6_own_query);
+#endif
+ }
+ rcu_read_unlock();
+}
+
+static void br_multicast_del_grps(struct net_bridge *br)
+{
+ struct net_bridge_port *port;
+
+ list_for_each_entry(port, &br->port_list, list)
+ __br_multicast_disable_port_ctx(&port->multicast_ctx);
+}
+
+int br_multicast_toggle(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
+{
+ struct net_bridge_port *port;
+ bool change_snoopers = false;
+ int err = 0;
+
+ spin_lock_bh(&br->multicast_lock);
+ if (!!br_opt_get(br, BROPT_MULTICAST_ENABLED) == !!val)
+ goto unlock;
+
+ err = br_mc_disabled_update(br->dev, val, extack);
+ if (err == -EOPNOTSUPP)
+ err = 0;
+ if (err)
+ goto unlock;
+
+ br_opt_toggle(br, BROPT_MULTICAST_ENABLED, !!val);
+ if (!br_opt_get(br, BROPT_MULTICAST_ENABLED)) {
+ change_snoopers = true;
+ br_multicast_del_grps(br);
+ goto unlock;
+ }
+
+ if (!netif_running(br->dev))
+ goto unlock;
+
+ br_multicast_open(br);
+ list_for_each_entry(port, &br->port_list, list)
+ __br_multicast_enable_port_ctx(&port->multicast_ctx);
+
+ change_snoopers = true;
+
+unlock:
+ spin_unlock_bh(&br->multicast_lock);
+
+ /* br_multicast_join_snoopers has the potential to cause
+ * an MLD Report/Leave to be delivered to br_multicast_rcv,
+ * which would in turn call br_multicast_add_group, which would
+ * attempt to acquire multicast_lock. This function should be
+ * called after the lock has been released to avoid deadlocks on
+ * multicast_lock.
+ *
+ * br_multicast_leave_snoopers does not have the problem since
+ * br_multicast_rcv first checks BROPT_MULTICAST_ENABLED, and
+ * returns without calling br_multicast_ipv4/6_rcv if it's not
+ * enabled. Moved both functions out just for symmetry.
+ */
+ if (change_snoopers) {
+ if (br_opt_get(br, BROPT_MULTICAST_ENABLED))
+ br_multicast_join_snoopers(br);
+ else
+ br_multicast_leave_snoopers(br);
+ }
+
+ return err;
+}
+
+bool br_multicast_enabled(const struct net_device *dev)
+{
+ struct net_bridge *br = netdev_priv(dev);
+
+ return !!br_opt_get(br, BROPT_MULTICAST_ENABLED);
+}
+EXPORT_SYMBOL_GPL(br_multicast_enabled);
+
+bool br_multicast_router(const struct net_device *dev)
+{
+ struct net_bridge *br = netdev_priv(dev);
+ bool is_router;
+
+ spin_lock_bh(&br->multicast_lock);
+ is_router = br_multicast_is_router(&br->multicast_ctx, NULL);
+ spin_unlock_bh(&br->multicast_lock);
+ return is_router;
+}
+EXPORT_SYMBOL_GPL(br_multicast_router);
+
+int br_multicast_set_querier(struct net_bridge_mcast *brmctx, unsigned long val)
+{
+ unsigned long max_delay;
+
+ val = !!val;
+
+ spin_lock_bh(&brmctx->br->multicast_lock);
+ if (brmctx->multicast_querier == val)
+ goto unlock;
+
+ WRITE_ONCE(brmctx->multicast_querier, val);
+ if (!val)
+ goto unlock;
+
+ max_delay = brmctx->multicast_query_response_interval;
+
+ if (!timer_pending(&brmctx->ip4_other_query.timer))
+ mod_timer(&brmctx->ip4_other_query.delay_timer,
+ jiffies + max_delay);
+
+ br_multicast_start_querier(brmctx, &brmctx->ip4_own_query);
+
+#if IS_ENABLED(CONFIG_IPV6)
+ if (!timer_pending(&brmctx->ip6_other_query.timer))
+ mod_timer(&brmctx->ip6_other_query.delay_timer,
+ jiffies + max_delay);
+
+ br_multicast_start_querier(brmctx, &brmctx->ip6_own_query);
+#endif
+
+unlock:
+ spin_unlock_bh(&brmctx->br->multicast_lock);
+
+ return 0;
+}
+
+int br_multicast_set_igmp_version(struct net_bridge_mcast *brmctx,
+ unsigned long val)
+{
+ /* Currently we support only version 2 and 3 */
+ switch (val) {
+ case 2:
+ case 3:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ spin_lock_bh(&brmctx->br->multicast_lock);
+ brmctx->multicast_igmp_version = val;
+ spin_unlock_bh(&brmctx->br->multicast_lock);
+
+ return 0;
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+int br_multicast_set_mld_version(struct net_bridge_mcast *brmctx,
+ unsigned long val)
+{
+ /* Currently we support version 1 and 2 */
+ switch (val) {
+ case 1:
+ case 2:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ spin_lock_bh(&brmctx->br->multicast_lock);
+ brmctx->multicast_mld_version = val;
+ spin_unlock_bh(&brmctx->br->multicast_lock);
+
+ return 0;
+}
+#endif
+
+void br_multicast_set_query_intvl(struct net_bridge_mcast *brmctx,
+ unsigned long val)
+{
+ unsigned long intvl_jiffies = clock_t_to_jiffies(val);
+
+ if (intvl_jiffies < BR_MULTICAST_QUERY_INTVL_MIN) {
+ br_info(brmctx->br,
+ "trying to set multicast query interval below minimum, setting to %lu (%ums)\n",
+ jiffies_to_clock_t(BR_MULTICAST_QUERY_INTVL_MIN),
+ jiffies_to_msecs(BR_MULTICAST_QUERY_INTVL_MIN));
+ intvl_jiffies = BR_MULTICAST_QUERY_INTVL_MIN;
+ }
+
+ if (intvl_jiffies > BR_MULTICAST_QUERY_INTVL_MAX) {
+ br_info(brmctx->br,
+ "trying to set multicast query interval above maximum, setting to %lu (%ums)\n",
+ jiffies_to_clock_t(BR_MULTICAST_QUERY_INTVL_MAX),
+ jiffies_to_msecs(BR_MULTICAST_QUERY_INTVL_MAX));
+ intvl_jiffies = BR_MULTICAST_QUERY_INTVL_MAX;
+ }
+
+ brmctx->multicast_query_interval = intvl_jiffies;
+}
+
+void br_multicast_set_startup_query_intvl(struct net_bridge_mcast *brmctx,
+ unsigned long val)
+{
+ unsigned long intvl_jiffies = clock_t_to_jiffies(val);
+
+ if (intvl_jiffies < BR_MULTICAST_STARTUP_QUERY_INTVL_MIN) {
+ br_info(brmctx->br,
+ "trying to set multicast startup query interval below minimum, setting to %lu (%ums)\n",
+ jiffies_to_clock_t(BR_MULTICAST_STARTUP_QUERY_INTVL_MIN),
+ jiffies_to_msecs(BR_MULTICAST_STARTUP_QUERY_INTVL_MIN));
+ intvl_jiffies = BR_MULTICAST_STARTUP_QUERY_INTVL_MIN;
+ }
+
+ if (intvl_jiffies > BR_MULTICAST_STARTUP_QUERY_INTVL_MAX) {
+ br_info(brmctx->br,
+ "trying to set multicast startup query interval above maximum, setting to %lu (%ums)\n",
+ jiffies_to_clock_t(BR_MULTICAST_STARTUP_QUERY_INTVL_MAX),
+ jiffies_to_msecs(BR_MULTICAST_STARTUP_QUERY_INTVL_MAX));
+ intvl_jiffies = BR_MULTICAST_STARTUP_QUERY_INTVL_MAX;
+ }
+
+ brmctx->multicast_startup_query_interval = intvl_jiffies;
+}
+
+/**
+ * br_multicast_list_adjacent - Returns snooped multicast addresses
+ * @dev: The bridge port adjacent to which to retrieve addresses
+ * @br_ip_list: The list to store found, snooped multicast IP addresses in
+ *
+ * Creates a list of IP addresses (struct br_ip_list) sensed by the multicast
+ * snooping feature on all bridge ports of dev's bridge device, excluding
+ * the addresses from dev itself.
+ *
+ * Returns the number of items added to br_ip_list.
+ *
+ * Notes:
+ * - br_ip_list needs to be initialized by caller
+ * - br_ip_list might contain duplicates in the end
+ * (needs to be taken care of by caller)
+ * - br_ip_list needs to be freed by caller
+ */
+int br_multicast_list_adjacent(struct net_device *dev,
+ struct list_head *br_ip_list)
+{
+ struct net_bridge *br;
+ struct net_bridge_port *port;
+ struct net_bridge_port_group *group;
+ struct br_ip_list *entry;
+ int count = 0;
+
+ rcu_read_lock();
+ if (!br_ip_list || !netif_is_bridge_port(dev))
+ goto unlock;
+
+ port = br_port_get_rcu(dev);
+ if (!port || !port->br)
+ goto unlock;
+
+ br = port->br;
+
+ list_for_each_entry_rcu(port, &br->port_list, list) {
+ if (!port->dev || port->dev == dev)
+ continue;
+
+ hlist_for_each_entry_rcu(group, &port->mglist, mglist) {
+ entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
+ if (!entry)
+ goto unlock;
+
+ entry->addr = group->key.addr;
+ list_add(&entry->list, br_ip_list);
+ count++;
+ }
+ }
+
+unlock:
+ rcu_read_unlock();
+ return count;
+}
+EXPORT_SYMBOL_GPL(br_multicast_list_adjacent);
+
+/**
+ * br_multicast_has_querier_anywhere - Checks for a querier on a bridge
+ * @dev: The bridge port providing the bridge on which to check for a querier
+ * @proto: The protocol family to check for: IGMP -> ETH_P_IP, MLD -> ETH_P_IPV6
+ *
+ * Checks whether the given interface has a bridge on top and if so returns
+ * true if a valid querier exists anywhere on the bridged link layer.
+ * Otherwise returns false.
+ */
+bool br_multicast_has_querier_anywhere(struct net_device *dev, int proto)
+{
+ struct net_bridge *br;
+ struct net_bridge_port *port;
+ struct ethhdr eth;
+ bool ret = false;
+
+ rcu_read_lock();
+ if (!netif_is_bridge_port(dev))
+ goto unlock;
+
+ port = br_port_get_rcu(dev);
+ if (!port || !port->br)
+ goto unlock;
+
+ br = port->br;
+
+ memset(&eth, 0, sizeof(eth));
+ eth.h_proto = htons(proto);
+
+ ret = br_multicast_querier_exists(&br->multicast_ctx, &eth, NULL);
+
+unlock:
+ rcu_read_unlock();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(br_multicast_has_querier_anywhere);
+
+/**
+ * br_multicast_has_querier_adjacent - Checks for a querier behind a bridge port
+ * @dev: The bridge port adjacent to which to check for a querier
+ * @proto: The protocol family to check for: IGMP -> ETH_P_IP, MLD -> ETH_P_IPV6
+ *
+ * Checks whether the given interface has a bridge on top and if so returns
+ * true if a selected querier is behind one of the other ports of this
+ * bridge. Otherwise returns false.
+ */
+bool br_multicast_has_querier_adjacent(struct net_device *dev, int proto)
+{
+ struct net_bridge_mcast *brmctx;
+ struct net_bridge *br;
+ struct net_bridge_port *port;
+ bool ret = false;
+ int port_ifidx;
+
+ rcu_read_lock();
+ if (!netif_is_bridge_port(dev))
+ goto unlock;
+
+ port = br_port_get_rcu(dev);
+ if (!port || !port->br)
+ goto unlock;
+
+ br = port->br;
+ brmctx = &br->multicast_ctx;
+
+ switch (proto) {
+ case ETH_P_IP:
+ port_ifidx = brmctx->ip4_querier.port_ifidx;
+ if (!timer_pending(&brmctx->ip4_other_query.timer) ||
+ port_ifidx == port->dev->ifindex)
+ goto unlock;
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case ETH_P_IPV6:
+ port_ifidx = brmctx->ip6_querier.port_ifidx;
+ if (!timer_pending(&brmctx->ip6_other_query.timer) ||
+ port_ifidx == port->dev->ifindex)
+ goto unlock;
+ break;
+#endif
+ default:
+ goto unlock;
+ }
+
+ ret = true;
+unlock:
+ rcu_read_unlock();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(br_multicast_has_querier_adjacent);
+
+/**
+ * br_multicast_has_router_adjacent - Checks for a router behind a bridge port
+ * @dev: The bridge port adjacent to which to check for a multicast router
+ * @proto: The protocol family to check for: IGMP -> ETH_P_IP, MLD -> ETH_P_IPV6
+ *
+ * Checks whether the given interface has a bridge on top and if so returns
+ * true if a multicast router is behind one of the other ports of this
+ * bridge. Otherwise returns false.
+ */
+bool br_multicast_has_router_adjacent(struct net_device *dev, int proto)
+{
+ struct net_bridge_mcast_port *pmctx;
+ struct net_bridge_mcast *brmctx;
+ struct net_bridge_port *port;
+ bool ret = false;
+
+ rcu_read_lock();
+ port = br_port_get_check_rcu(dev);
+ if (!port)
+ goto unlock;
+
+ brmctx = &port->br->multicast_ctx;
+ switch (proto) {
+ case ETH_P_IP:
+ hlist_for_each_entry_rcu(pmctx, &brmctx->ip4_mc_router_list,
+ ip4_rlist) {
+ if (pmctx->port == port)
+ continue;
+
+ ret = true;
+ goto unlock;
+ }
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case ETH_P_IPV6:
+ hlist_for_each_entry_rcu(pmctx, &brmctx->ip6_mc_router_list,
+ ip6_rlist) {
+ if (pmctx->port == port)
+ continue;
+
+ ret = true;
+ goto unlock;
+ }
+ break;
+#endif
+ default:
+ /* when compiled without IPv6 support, be conservative and
+ * always assume presence of an IPv6 multicast router
+ */
+ ret = true;
+ }
+
+unlock:
+ rcu_read_unlock();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(br_multicast_has_router_adjacent);
+
+static void br_mcast_stats_add(struct bridge_mcast_stats __percpu *stats,
+ const struct sk_buff *skb, u8 type, u8 dir)
+{
+ struct bridge_mcast_stats *pstats = this_cpu_ptr(stats);
+ __be16 proto = skb->protocol;
+ unsigned int t_len;
+
+ u64_stats_update_begin(&pstats->syncp);
+ switch (proto) {
+ case htons(ETH_P_IP):
+ t_len = ntohs(ip_hdr(skb)->tot_len) - ip_hdrlen(skb);
+ switch (type) {
+ case IGMP_HOST_MEMBERSHIP_REPORT:
+ pstats->mstats.igmp_v1reports[dir]++;
+ break;
+ case IGMPV2_HOST_MEMBERSHIP_REPORT:
+ pstats->mstats.igmp_v2reports[dir]++;
+ break;
+ case IGMPV3_HOST_MEMBERSHIP_REPORT:
+ pstats->mstats.igmp_v3reports[dir]++;
+ break;
+ case IGMP_HOST_MEMBERSHIP_QUERY:
+ if (t_len != sizeof(struct igmphdr)) {
+ pstats->mstats.igmp_v3queries[dir]++;
+ } else {
+ unsigned int offset = skb_transport_offset(skb);
+ struct igmphdr *ih, _ihdr;
+
+ ih = skb_header_pointer(skb, offset,
+ sizeof(_ihdr), &_ihdr);
+ if (!ih)
+ break;
+ if (!ih->code)
+ pstats->mstats.igmp_v1queries[dir]++;
+ else
+ pstats->mstats.igmp_v2queries[dir]++;
+ }
+ break;
+ case IGMP_HOST_LEAVE_MESSAGE:
+ pstats->mstats.igmp_leaves[dir]++;
+ break;
+ }
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case htons(ETH_P_IPV6):
+ t_len = ntohs(ipv6_hdr(skb)->payload_len) +
+ sizeof(struct ipv6hdr);
+ t_len -= skb_network_header_len(skb);
+ switch (type) {
+ case ICMPV6_MGM_REPORT:
+ pstats->mstats.mld_v1reports[dir]++;
+ break;
+ case ICMPV6_MLD2_REPORT:
+ pstats->mstats.mld_v2reports[dir]++;
+ break;
+ case ICMPV6_MGM_QUERY:
+ if (t_len != sizeof(struct mld_msg))
+ pstats->mstats.mld_v2queries[dir]++;
+ else
+ pstats->mstats.mld_v1queries[dir]++;
+ break;
+ case ICMPV6_MGM_REDUCTION:
+ pstats->mstats.mld_leaves[dir]++;
+ break;
+ }
+ break;
+#endif /* CONFIG_IPV6 */
+ }
+ u64_stats_update_end(&pstats->syncp);
+}
+
+void br_multicast_count(struct net_bridge *br,
+ const struct net_bridge_port *p,
+ const struct sk_buff *skb, u8 type, u8 dir)
+{
+ struct bridge_mcast_stats __percpu *stats;
+
+ /* if multicast_disabled is true then igmp type can't be set */
+ if (!type || !br_opt_get(br, BROPT_MULTICAST_STATS_ENABLED))
+ return;
+
+ if (p)
+ stats = p->mcast_stats;
+ else
+ stats = br->mcast_stats;
+ if (WARN_ON(!stats))
+ return;
+
+ br_mcast_stats_add(stats, skb, type, dir);
+}
+
+int br_multicast_init_stats(struct net_bridge *br)
+{
+ br->mcast_stats = netdev_alloc_pcpu_stats(struct bridge_mcast_stats);
+ if (!br->mcast_stats)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void br_multicast_uninit_stats(struct net_bridge *br)
+{
+ free_percpu(br->mcast_stats);
+}
+
+/* noinline for https://llvm.org/pr45802#c9 */
+static noinline_for_stack void mcast_stats_add_dir(u64 *dst, u64 *src)
+{
+ dst[BR_MCAST_DIR_RX] += src[BR_MCAST_DIR_RX];
+ dst[BR_MCAST_DIR_TX] += src[BR_MCAST_DIR_TX];
+}
+
+void br_multicast_get_stats(const struct net_bridge *br,
+ const struct net_bridge_port *p,
+ struct br_mcast_stats *dest)
+{
+ struct bridge_mcast_stats __percpu *stats;
+ struct br_mcast_stats tdst;
+ int i;
+
+ memset(dest, 0, sizeof(*dest));
+ if (p)
+ stats = p->mcast_stats;
+ else
+ stats = br->mcast_stats;
+ if (WARN_ON(!stats))
+ return;
+
+ memset(&tdst, 0, sizeof(tdst));
+ for_each_possible_cpu(i) {
+ struct bridge_mcast_stats *cpu_stats = per_cpu_ptr(stats, i);
+ struct br_mcast_stats temp;
+ unsigned int start;
+
+ do {
+ start = u64_stats_fetch_begin(&cpu_stats->syncp);
+ memcpy(&temp, &cpu_stats->mstats, sizeof(temp));
+ } while (u64_stats_fetch_retry(&cpu_stats->syncp, start));
+
+ mcast_stats_add_dir(tdst.igmp_v1queries, temp.igmp_v1queries);
+ mcast_stats_add_dir(tdst.igmp_v2queries, temp.igmp_v2queries);
+ mcast_stats_add_dir(tdst.igmp_v3queries, temp.igmp_v3queries);
+ mcast_stats_add_dir(tdst.igmp_leaves, temp.igmp_leaves);
+ mcast_stats_add_dir(tdst.igmp_v1reports, temp.igmp_v1reports);
+ mcast_stats_add_dir(tdst.igmp_v2reports, temp.igmp_v2reports);
+ mcast_stats_add_dir(tdst.igmp_v3reports, temp.igmp_v3reports);
+ tdst.igmp_parse_errors += temp.igmp_parse_errors;
+
+ mcast_stats_add_dir(tdst.mld_v1queries, temp.mld_v1queries);
+ mcast_stats_add_dir(tdst.mld_v2queries, temp.mld_v2queries);
+ mcast_stats_add_dir(tdst.mld_leaves, temp.mld_leaves);
+ mcast_stats_add_dir(tdst.mld_v1reports, temp.mld_v1reports);
+ mcast_stats_add_dir(tdst.mld_v2reports, temp.mld_v2reports);
+ tdst.mld_parse_errors += temp.mld_parse_errors;
+ }
+ memcpy(dest, &tdst, sizeof(*dest));
+}
+
+int br_mdb_hash_init(struct net_bridge *br)
+{
+ int err;
+
+ err = rhashtable_init(&br->sg_port_tbl, &br_sg_port_rht_params);
+ if (err)
+ return err;
+
+ err = rhashtable_init(&br->mdb_hash_tbl, &br_mdb_rht_params);
+ if (err) {
+ rhashtable_destroy(&br->sg_port_tbl);
+ return err;
+ }
+
+ return 0;
+}
+
+void br_mdb_hash_fini(struct net_bridge *br)
+{
+ rhashtable_destroy(&br->sg_port_tbl);
+ rhashtable_destroy(&br->mdb_hash_tbl);
+}
diff --git a/net/bridge/br_multicast_eht.c b/net/bridge/br_multicast_eht.c
new file mode 100644
index 000000000000..adfd74102019
--- /dev/null
+++ b/net/bridge/br_multicast_eht.c
@@ -0,0 +1,822 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+// Copyright (c) 2020, Nikolay Aleksandrov <nikolay@nvidia.com>
+#include <linux/err.h>
+#include <linux/export.h>
+#include <linux/if_ether.h>
+#include <linux/igmp.h>
+#include <linux/in.h>
+#include <linux/jhash.h>
+#include <linux/kernel.h>
+#include <linux/log2.h>
+#include <linux/netdevice.h>
+#include <linux/netfilter_bridge.h>
+#include <linux/random.h>
+#include <linux/rculist.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/timer.h>
+#include <linux/inetdevice.h>
+#include <linux/mroute.h>
+#include <net/ip.h>
+#include <net/switchdev.h>
+#if IS_ENABLED(CONFIG_IPV6)
+#include <linux/icmpv6.h>
+#include <net/ipv6.h>
+#include <net/mld.h>
+#include <net/ip6_checksum.h>
+#include <net/addrconf.h>
+#endif
+
+#include "br_private.h"
+#include "br_private_mcast_eht.h"
+
+static bool br_multicast_del_eht_set_entry(struct net_bridge_port_group *pg,
+ union net_bridge_eht_addr *src_addr,
+ union net_bridge_eht_addr *h_addr);
+static void br_multicast_create_eht_set_entry(const struct net_bridge_mcast *brmctx,
+ struct net_bridge_port_group *pg,
+ union net_bridge_eht_addr *src_addr,
+ union net_bridge_eht_addr *h_addr,
+ int filter_mode,
+ bool allow_zero_src);
+
+static struct net_bridge_group_eht_host *
+br_multicast_eht_host_lookup(struct net_bridge_port_group *pg,
+ union net_bridge_eht_addr *h_addr)
+{
+ struct rb_node *node = pg->eht_host_tree.rb_node;
+
+ while (node) {
+ struct net_bridge_group_eht_host *this;
+ int result;
+
+ this = rb_entry(node, struct net_bridge_group_eht_host,
+ rb_node);
+ result = memcmp(h_addr, &this->h_addr, sizeof(*h_addr));
+ if (result < 0)
+ node = node->rb_left;
+ else if (result > 0)
+ node = node->rb_right;
+ else
+ return this;
+ }
+
+ return NULL;
+}
+
+static int br_multicast_eht_host_filter_mode(struct net_bridge_port_group *pg,
+ union net_bridge_eht_addr *h_addr)
+{
+ struct net_bridge_group_eht_host *eht_host;
+
+ eht_host = br_multicast_eht_host_lookup(pg, h_addr);
+ if (!eht_host)
+ return MCAST_INCLUDE;
+
+ return eht_host->filter_mode;
+}
+
+static struct net_bridge_group_eht_set_entry *
+br_multicast_eht_set_entry_lookup(struct net_bridge_group_eht_set *eht_set,
+ union net_bridge_eht_addr *h_addr)
+{
+ struct rb_node *node = eht_set->entry_tree.rb_node;
+
+ while (node) {
+ struct net_bridge_group_eht_set_entry *this;
+ int result;
+
+ this = rb_entry(node, struct net_bridge_group_eht_set_entry,
+ rb_node);
+ result = memcmp(h_addr, &this->h_addr, sizeof(*h_addr));
+ if (result < 0)
+ node = node->rb_left;
+ else if (result > 0)
+ node = node->rb_right;
+ else
+ return this;
+ }
+
+ return NULL;
+}
+
+static struct net_bridge_group_eht_set *
+br_multicast_eht_set_lookup(struct net_bridge_port_group *pg,
+ union net_bridge_eht_addr *src_addr)
+{
+ struct rb_node *node = pg->eht_set_tree.rb_node;
+
+ while (node) {
+ struct net_bridge_group_eht_set *this;
+ int result;
+
+ this = rb_entry(node, struct net_bridge_group_eht_set,
+ rb_node);
+ result = memcmp(src_addr, &this->src_addr, sizeof(*src_addr));
+ if (result < 0)
+ node = node->rb_left;
+ else if (result > 0)
+ node = node->rb_right;
+ else
+ return this;
+ }
+
+ return NULL;
+}
+
+static void __eht_destroy_host(struct net_bridge_group_eht_host *eht_host)
+{
+ WARN_ON(!hlist_empty(&eht_host->set_entries));
+
+ br_multicast_eht_hosts_dec(eht_host->pg);
+
+ rb_erase(&eht_host->rb_node, &eht_host->pg->eht_host_tree);
+ RB_CLEAR_NODE(&eht_host->rb_node);
+ kfree(eht_host);
+}
+
+static void br_multicast_destroy_eht_set_entry(struct net_bridge_mcast_gc *gc)
+{
+ struct net_bridge_group_eht_set_entry *set_h;
+
+ set_h = container_of(gc, struct net_bridge_group_eht_set_entry, mcast_gc);
+ WARN_ON(!RB_EMPTY_NODE(&set_h->rb_node));
+
+ timer_shutdown_sync(&set_h->timer);
+ kfree(set_h);
+}
+
+static void br_multicast_destroy_eht_set(struct net_bridge_mcast_gc *gc)
+{
+ struct net_bridge_group_eht_set *eht_set;
+
+ eht_set = container_of(gc, struct net_bridge_group_eht_set, mcast_gc);
+ WARN_ON(!RB_EMPTY_NODE(&eht_set->rb_node));
+ WARN_ON(!RB_EMPTY_ROOT(&eht_set->entry_tree));
+
+ timer_shutdown_sync(&eht_set->timer);
+ kfree(eht_set);
+}
+
+static void __eht_del_set_entry(struct net_bridge_group_eht_set_entry *set_h)
+{
+ struct net_bridge_group_eht_host *eht_host = set_h->h_parent;
+ union net_bridge_eht_addr zero_addr;
+
+ rb_erase(&set_h->rb_node, &set_h->eht_set->entry_tree);
+ RB_CLEAR_NODE(&set_h->rb_node);
+ hlist_del_init(&set_h->host_list);
+ memset(&zero_addr, 0, sizeof(zero_addr));
+ if (memcmp(&set_h->h_addr, &zero_addr, sizeof(zero_addr)))
+ eht_host->num_entries--;
+ hlist_add_head(&set_h->mcast_gc.gc_node, &set_h->br->mcast_gc_list);
+ queue_work(system_long_wq, &set_h->br->mcast_gc_work);
+
+ if (hlist_empty(&eht_host->set_entries))
+ __eht_destroy_host(eht_host);
+}
+
+static void br_multicast_del_eht_set(struct net_bridge_group_eht_set *eht_set)
+{
+ struct net_bridge_group_eht_set_entry *set_h;
+ struct rb_node *node;
+
+ while ((node = rb_first(&eht_set->entry_tree))) {
+ set_h = rb_entry(node, struct net_bridge_group_eht_set_entry,
+ rb_node);
+ __eht_del_set_entry(set_h);
+ }
+
+ rb_erase(&eht_set->rb_node, &eht_set->pg->eht_set_tree);
+ RB_CLEAR_NODE(&eht_set->rb_node);
+ hlist_add_head(&eht_set->mcast_gc.gc_node, &eht_set->br->mcast_gc_list);
+ queue_work(system_long_wq, &eht_set->br->mcast_gc_work);
+}
+
+void br_multicast_eht_clean_sets(struct net_bridge_port_group *pg)
+{
+ struct net_bridge_group_eht_set *eht_set;
+ struct rb_node *node;
+
+ while ((node = rb_first(&pg->eht_set_tree))) {
+ eht_set = rb_entry(node, struct net_bridge_group_eht_set,
+ rb_node);
+ br_multicast_del_eht_set(eht_set);
+ }
+}
+
+static void br_multicast_eht_set_entry_expired(struct timer_list *t)
+{
+ struct net_bridge_group_eht_set_entry *set_h = timer_container_of(set_h,
+ t,
+ timer);
+ struct net_bridge *br = set_h->br;
+
+ spin_lock(&br->multicast_lock);
+ if (RB_EMPTY_NODE(&set_h->rb_node) || timer_pending(&set_h->timer))
+ goto out;
+
+ br_multicast_del_eht_set_entry(set_h->eht_set->pg,
+ &set_h->eht_set->src_addr,
+ &set_h->h_addr);
+out:
+ spin_unlock(&br->multicast_lock);
+}
+
+static void br_multicast_eht_set_expired(struct timer_list *t)
+{
+ struct net_bridge_group_eht_set *eht_set = timer_container_of(eht_set,
+ t,
+ timer);
+ struct net_bridge *br = eht_set->br;
+
+ spin_lock(&br->multicast_lock);
+ if (RB_EMPTY_NODE(&eht_set->rb_node) || timer_pending(&eht_set->timer))
+ goto out;
+
+ br_multicast_del_eht_set(eht_set);
+out:
+ spin_unlock(&br->multicast_lock);
+}
+
+static struct net_bridge_group_eht_host *
+__eht_lookup_create_host(struct net_bridge_port_group *pg,
+ union net_bridge_eht_addr *h_addr,
+ unsigned char filter_mode)
+{
+ struct rb_node **link = &pg->eht_host_tree.rb_node, *parent = NULL;
+ struct net_bridge_group_eht_host *eht_host;
+
+ while (*link) {
+ struct net_bridge_group_eht_host *this;
+ int result;
+
+ this = rb_entry(*link, struct net_bridge_group_eht_host,
+ rb_node);
+ result = memcmp(h_addr, &this->h_addr, sizeof(*h_addr));
+ parent = *link;
+ if (result < 0)
+ link = &((*link)->rb_left);
+ else if (result > 0)
+ link = &((*link)->rb_right);
+ else
+ return this;
+ }
+
+ if (br_multicast_eht_hosts_over_limit(pg))
+ return NULL;
+
+ eht_host = kzalloc(sizeof(*eht_host), GFP_ATOMIC);
+ if (!eht_host)
+ return NULL;
+
+ memcpy(&eht_host->h_addr, h_addr, sizeof(*h_addr));
+ INIT_HLIST_HEAD(&eht_host->set_entries);
+ eht_host->pg = pg;
+ eht_host->filter_mode = filter_mode;
+
+ rb_link_node(&eht_host->rb_node, parent, link);
+ rb_insert_color(&eht_host->rb_node, &pg->eht_host_tree);
+
+ br_multicast_eht_hosts_inc(pg);
+
+ return eht_host;
+}
+
+static struct net_bridge_group_eht_set_entry *
+__eht_lookup_create_set_entry(struct net_bridge *br,
+ struct net_bridge_group_eht_set *eht_set,
+ struct net_bridge_group_eht_host *eht_host,
+ bool allow_zero_src)
+{
+ struct rb_node **link = &eht_set->entry_tree.rb_node, *parent = NULL;
+ struct net_bridge_group_eht_set_entry *set_h;
+
+ while (*link) {
+ struct net_bridge_group_eht_set_entry *this;
+ int result;
+
+ this = rb_entry(*link, struct net_bridge_group_eht_set_entry,
+ rb_node);
+ result = memcmp(&eht_host->h_addr, &this->h_addr,
+ sizeof(union net_bridge_eht_addr));
+ parent = *link;
+ if (result < 0)
+ link = &((*link)->rb_left);
+ else if (result > 0)
+ link = &((*link)->rb_right);
+ else
+ return this;
+ }
+
+ /* always allow auto-created zero entry */
+ if (!allow_zero_src && eht_host->num_entries >= PG_SRC_ENT_LIMIT)
+ return NULL;
+
+ set_h = kzalloc(sizeof(*set_h), GFP_ATOMIC);
+ if (!set_h)
+ return NULL;
+
+ memcpy(&set_h->h_addr, &eht_host->h_addr,
+ sizeof(union net_bridge_eht_addr));
+ set_h->mcast_gc.destroy = br_multicast_destroy_eht_set_entry;
+ set_h->eht_set = eht_set;
+ set_h->h_parent = eht_host;
+ set_h->br = br;
+ timer_setup(&set_h->timer, br_multicast_eht_set_entry_expired, 0);
+
+ hlist_add_head(&set_h->host_list, &eht_host->set_entries);
+ rb_link_node(&set_h->rb_node, parent, link);
+ rb_insert_color(&set_h->rb_node, &eht_set->entry_tree);
+ /* we must not count the auto-created zero entry otherwise we won't be
+ * able to track the full list of PG_SRC_ENT_LIMIT entries
+ */
+ if (!allow_zero_src)
+ eht_host->num_entries++;
+
+ return set_h;
+}
+
+static struct net_bridge_group_eht_set *
+__eht_lookup_create_set(struct net_bridge_port_group *pg,
+ union net_bridge_eht_addr *src_addr)
+{
+ struct rb_node **link = &pg->eht_set_tree.rb_node, *parent = NULL;
+ struct net_bridge_group_eht_set *eht_set;
+
+ while (*link) {
+ struct net_bridge_group_eht_set *this;
+ int result;
+
+ this = rb_entry(*link, struct net_bridge_group_eht_set,
+ rb_node);
+ result = memcmp(src_addr, &this->src_addr, sizeof(*src_addr));
+ parent = *link;
+ if (result < 0)
+ link = &((*link)->rb_left);
+ else if (result > 0)
+ link = &((*link)->rb_right);
+ else
+ return this;
+ }
+
+ eht_set = kzalloc(sizeof(*eht_set), GFP_ATOMIC);
+ if (!eht_set)
+ return NULL;
+
+ memcpy(&eht_set->src_addr, src_addr, sizeof(*src_addr));
+ eht_set->mcast_gc.destroy = br_multicast_destroy_eht_set;
+ eht_set->pg = pg;
+ eht_set->br = pg->key.port->br;
+ eht_set->entry_tree = RB_ROOT;
+ timer_setup(&eht_set->timer, br_multicast_eht_set_expired, 0);
+
+ rb_link_node(&eht_set->rb_node, parent, link);
+ rb_insert_color(&eht_set->rb_node, &pg->eht_set_tree);
+
+ return eht_set;
+}
+
+static void br_multicast_ip_src_to_eht_addr(const struct br_ip *src,
+ union net_bridge_eht_addr *dest)
+{
+ switch (src->proto) {
+ case htons(ETH_P_IP):
+ dest->ip4 = src->src.ip4;
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case htons(ETH_P_IPV6):
+ memcpy(&dest->ip6, &src->src.ip6, sizeof(struct in6_addr));
+ break;
+#endif
+ }
+}
+
+static void br_eht_convert_host_filter_mode(const struct net_bridge_mcast *brmctx,
+ struct net_bridge_port_group *pg,
+ union net_bridge_eht_addr *h_addr,
+ int filter_mode)
+{
+ struct net_bridge_group_eht_host *eht_host;
+ union net_bridge_eht_addr zero_addr;
+
+ eht_host = br_multicast_eht_host_lookup(pg, h_addr);
+ if (eht_host)
+ eht_host->filter_mode = filter_mode;
+
+ memset(&zero_addr, 0, sizeof(zero_addr));
+ switch (filter_mode) {
+ case MCAST_INCLUDE:
+ br_multicast_del_eht_set_entry(pg, &zero_addr, h_addr);
+ break;
+ case MCAST_EXCLUDE:
+ br_multicast_create_eht_set_entry(brmctx, pg, &zero_addr,
+ h_addr, MCAST_EXCLUDE,
+ true);
+ break;
+ }
+}
+
+static void br_multicast_create_eht_set_entry(const struct net_bridge_mcast *brmctx,
+ struct net_bridge_port_group *pg,
+ union net_bridge_eht_addr *src_addr,
+ union net_bridge_eht_addr *h_addr,
+ int filter_mode,
+ bool allow_zero_src)
+{
+ struct net_bridge_group_eht_set_entry *set_h;
+ struct net_bridge_group_eht_host *eht_host;
+ struct net_bridge *br = pg->key.port->br;
+ struct net_bridge_group_eht_set *eht_set;
+ union net_bridge_eht_addr zero_addr;
+
+ memset(&zero_addr, 0, sizeof(zero_addr));
+ if (!allow_zero_src && !memcmp(src_addr, &zero_addr, sizeof(zero_addr)))
+ return;
+
+ eht_set = __eht_lookup_create_set(pg, src_addr);
+ if (!eht_set)
+ return;
+
+ eht_host = __eht_lookup_create_host(pg, h_addr, filter_mode);
+ if (!eht_host)
+ goto fail_host;
+
+ set_h = __eht_lookup_create_set_entry(br, eht_set, eht_host,
+ allow_zero_src);
+ if (!set_h)
+ goto fail_set_entry;
+
+ mod_timer(&set_h->timer, jiffies + br_multicast_gmi(brmctx));
+ mod_timer(&eht_set->timer, jiffies + br_multicast_gmi(brmctx));
+
+ return;
+
+fail_set_entry:
+ if (hlist_empty(&eht_host->set_entries))
+ __eht_destroy_host(eht_host);
+fail_host:
+ if (RB_EMPTY_ROOT(&eht_set->entry_tree))
+ br_multicast_del_eht_set(eht_set);
+}
+
+static bool br_multicast_del_eht_set_entry(struct net_bridge_port_group *pg,
+ union net_bridge_eht_addr *src_addr,
+ union net_bridge_eht_addr *h_addr)
+{
+ struct net_bridge_group_eht_set_entry *set_h;
+ struct net_bridge_group_eht_set *eht_set;
+ bool set_deleted = false;
+
+ eht_set = br_multicast_eht_set_lookup(pg, src_addr);
+ if (!eht_set)
+ goto out;
+
+ set_h = br_multicast_eht_set_entry_lookup(eht_set, h_addr);
+ if (!set_h)
+ goto out;
+
+ __eht_del_set_entry(set_h);
+
+ if (RB_EMPTY_ROOT(&eht_set->entry_tree)) {
+ br_multicast_del_eht_set(eht_set);
+ set_deleted = true;
+ }
+
+out:
+ return set_deleted;
+}
+
+static void br_multicast_del_eht_host(struct net_bridge_port_group *pg,
+ union net_bridge_eht_addr *h_addr)
+{
+ struct net_bridge_group_eht_set_entry *set_h;
+ struct net_bridge_group_eht_host *eht_host;
+ struct hlist_node *tmp;
+
+ eht_host = br_multicast_eht_host_lookup(pg, h_addr);
+ if (!eht_host)
+ return;
+
+ hlist_for_each_entry_safe(set_h, tmp, &eht_host->set_entries, host_list)
+ br_multicast_del_eht_set_entry(set_h->eht_set->pg,
+ &set_h->eht_set->src_addr,
+ &set_h->h_addr);
+}
+
+/* create new set entries from reports */
+static void __eht_create_set_entries(const struct net_bridge_mcast *brmctx,
+ struct net_bridge_port_group *pg,
+ union net_bridge_eht_addr *h_addr,
+ void *srcs,
+ u32 nsrcs,
+ size_t addr_size,
+ int filter_mode)
+{
+ union net_bridge_eht_addr eht_src_addr;
+ u32 src_idx;
+
+ memset(&eht_src_addr, 0, sizeof(eht_src_addr));
+ for (src_idx = 0; src_idx < nsrcs; src_idx++) {
+ memcpy(&eht_src_addr, srcs + (src_idx * addr_size), addr_size);
+ br_multicast_create_eht_set_entry(brmctx, pg, &eht_src_addr,
+ h_addr, filter_mode,
+ false);
+ }
+}
+
+/* delete existing set entries and their (S,G) entries if they were the last */
+static bool __eht_del_set_entries(struct net_bridge_port_group *pg,
+ union net_bridge_eht_addr *h_addr,
+ void *srcs,
+ u32 nsrcs,
+ size_t addr_size)
+{
+ union net_bridge_eht_addr eht_src_addr;
+ struct net_bridge_group_src *src_ent;
+ bool changed = false;
+ struct br_ip src_ip;
+ u32 src_idx;
+
+ memset(&eht_src_addr, 0, sizeof(eht_src_addr));
+ memset(&src_ip, 0, sizeof(src_ip));
+ src_ip.proto = pg->key.addr.proto;
+ for (src_idx = 0; src_idx < nsrcs; src_idx++) {
+ memcpy(&eht_src_addr, srcs + (src_idx * addr_size), addr_size);
+ if (!br_multicast_del_eht_set_entry(pg, &eht_src_addr, h_addr))
+ continue;
+ memcpy(&src_ip, srcs + (src_idx * addr_size), addr_size);
+ src_ent = br_multicast_find_group_src(pg, &src_ip);
+ if (!src_ent)
+ continue;
+ br_multicast_del_group_src(src_ent, true);
+ changed = true;
+ }
+
+ return changed;
+}
+
+static bool br_multicast_eht_allow(const struct net_bridge_mcast *brmctx,
+ struct net_bridge_port_group *pg,
+ union net_bridge_eht_addr *h_addr,
+ void *srcs,
+ u32 nsrcs,
+ size_t addr_size)
+{
+ bool changed = false;
+
+ switch (br_multicast_eht_host_filter_mode(pg, h_addr)) {
+ case MCAST_INCLUDE:
+ __eht_create_set_entries(brmctx, pg, h_addr, srcs, nsrcs,
+ addr_size, MCAST_INCLUDE);
+ break;
+ case MCAST_EXCLUDE:
+ changed = __eht_del_set_entries(pg, h_addr, srcs, nsrcs,
+ addr_size);
+ break;
+ }
+
+ return changed;
+}
+
+static bool br_multicast_eht_block(const struct net_bridge_mcast *brmctx,
+ struct net_bridge_port_group *pg,
+ union net_bridge_eht_addr *h_addr,
+ void *srcs,
+ u32 nsrcs,
+ size_t addr_size)
+{
+ bool changed = false;
+
+ switch (br_multicast_eht_host_filter_mode(pg, h_addr)) {
+ case MCAST_INCLUDE:
+ changed = __eht_del_set_entries(pg, h_addr, srcs, nsrcs,
+ addr_size);
+ break;
+ case MCAST_EXCLUDE:
+ __eht_create_set_entries(brmctx, pg, h_addr, srcs, nsrcs, addr_size,
+ MCAST_EXCLUDE);
+ break;
+ }
+
+ return changed;
+}
+
+/* flush_entries is true when changing mode */
+static bool __eht_inc_exc(const struct net_bridge_mcast *brmctx,
+ struct net_bridge_port_group *pg,
+ union net_bridge_eht_addr *h_addr,
+ void *srcs,
+ u32 nsrcs,
+ size_t addr_size,
+ unsigned char filter_mode,
+ bool to_report)
+{
+ bool changed = false, flush_entries = to_report;
+ union net_bridge_eht_addr eht_src_addr;
+
+ if (br_multicast_eht_host_filter_mode(pg, h_addr) != filter_mode)
+ flush_entries = true;
+
+ memset(&eht_src_addr, 0, sizeof(eht_src_addr));
+ /* if we're changing mode del host and its entries */
+ if (flush_entries)
+ br_multicast_del_eht_host(pg, h_addr);
+ __eht_create_set_entries(brmctx, pg, h_addr, srcs, nsrcs, addr_size,
+ filter_mode);
+ /* we can be missing sets only if we've deleted some entries */
+ if (flush_entries) {
+ struct net_bridge_group_eht_set *eht_set;
+ struct net_bridge_group_src *src_ent;
+ struct hlist_node *tmp;
+
+ hlist_for_each_entry_safe(src_ent, tmp, &pg->src_list, node) {
+ br_multicast_ip_src_to_eht_addr(&src_ent->addr,
+ &eht_src_addr);
+ if (!br_multicast_eht_set_lookup(pg, &eht_src_addr)) {
+ br_multicast_del_group_src(src_ent, true);
+ changed = true;
+ continue;
+ }
+ /* this is an optimization for TO_INCLUDE where we lower
+ * the set's timeout to LMQT to catch timeout hosts:
+ * - host A (timing out): set entries X, Y
+ * - host B: set entry Z (new from current TO_INCLUDE)
+ * sends BLOCK Z after LMQT but host A's EHT
+ * entries still exist (unless lowered to LMQT
+ * so they can timeout with the S,Gs)
+ * => we wait another LMQT, when we can just delete the
+ * group immediately
+ */
+ if (!(src_ent->flags & BR_SGRP_F_SEND) ||
+ filter_mode != MCAST_INCLUDE ||
+ !to_report)
+ continue;
+ eht_set = br_multicast_eht_set_lookup(pg,
+ &eht_src_addr);
+ if (!eht_set)
+ continue;
+ mod_timer(&eht_set->timer, jiffies + br_multicast_lmqt(brmctx));
+ }
+ }
+
+ return changed;
+}
+
+static bool br_multicast_eht_inc(const struct net_bridge_mcast *brmctx,
+ struct net_bridge_port_group *pg,
+ union net_bridge_eht_addr *h_addr,
+ void *srcs,
+ u32 nsrcs,
+ size_t addr_size,
+ bool to_report)
+{
+ bool changed;
+
+ changed = __eht_inc_exc(brmctx, pg, h_addr, srcs, nsrcs, addr_size,
+ MCAST_INCLUDE, to_report);
+ br_eht_convert_host_filter_mode(brmctx, pg, h_addr, MCAST_INCLUDE);
+
+ return changed;
+}
+
+static bool br_multicast_eht_exc(const struct net_bridge_mcast *brmctx,
+ struct net_bridge_port_group *pg,
+ union net_bridge_eht_addr *h_addr,
+ void *srcs,
+ u32 nsrcs,
+ size_t addr_size,
+ bool to_report)
+{
+ bool changed;
+
+ changed = __eht_inc_exc(brmctx, pg, h_addr, srcs, nsrcs, addr_size,
+ MCAST_EXCLUDE, to_report);
+ br_eht_convert_host_filter_mode(brmctx, pg, h_addr, MCAST_EXCLUDE);
+
+ return changed;
+}
+
+static bool __eht_ip4_handle(const struct net_bridge_mcast *brmctx,
+ struct net_bridge_port_group *pg,
+ union net_bridge_eht_addr *h_addr,
+ void *srcs,
+ u32 nsrcs,
+ int grec_type)
+{
+ bool changed = false, to_report = false;
+
+ switch (grec_type) {
+ case IGMPV3_ALLOW_NEW_SOURCES:
+ br_multicast_eht_allow(brmctx, pg, h_addr, srcs, nsrcs,
+ sizeof(__be32));
+ break;
+ case IGMPV3_BLOCK_OLD_SOURCES:
+ changed = br_multicast_eht_block(brmctx, pg, h_addr, srcs, nsrcs,
+ sizeof(__be32));
+ break;
+ case IGMPV3_CHANGE_TO_INCLUDE:
+ to_report = true;
+ fallthrough;
+ case IGMPV3_MODE_IS_INCLUDE:
+ changed = br_multicast_eht_inc(brmctx, pg, h_addr, srcs, nsrcs,
+ sizeof(__be32), to_report);
+ break;
+ case IGMPV3_CHANGE_TO_EXCLUDE:
+ to_report = true;
+ fallthrough;
+ case IGMPV3_MODE_IS_EXCLUDE:
+ changed = br_multicast_eht_exc(brmctx, pg, h_addr, srcs, nsrcs,
+ sizeof(__be32), to_report);
+ break;
+ }
+
+ return changed;
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static bool __eht_ip6_handle(const struct net_bridge_mcast *brmctx,
+ struct net_bridge_port_group *pg,
+ union net_bridge_eht_addr *h_addr,
+ void *srcs,
+ u32 nsrcs,
+ int grec_type)
+{
+ bool changed = false, to_report = false;
+
+ switch (grec_type) {
+ case MLD2_ALLOW_NEW_SOURCES:
+ br_multicast_eht_allow(brmctx, pg, h_addr, srcs, nsrcs,
+ sizeof(struct in6_addr));
+ break;
+ case MLD2_BLOCK_OLD_SOURCES:
+ changed = br_multicast_eht_block(brmctx, pg, h_addr, srcs, nsrcs,
+ sizeof(struct in6_addr));
+ break;
+ case MLD2_CHANGE_TO_INCLUDE:
+ to_report = true;
+ fallthrough;
+ case MLD2_MODE_IS_INCLUDE:
+ changed = br_multicast_eht_inc(brmctx, pg, h_addr, srcs, nsrcs,
+ sizeof(struct in6_addr),
+ to_report);
+ break;
+ case MLD2_CHANGE_TO_EXCLUDE:
+ to_report = true;
+ fallthrough;
+ case MLD2_MODE_IS_EXCLUDE:
+ changed = br_multicast_eht_exc(brmctx, pg, h_addr, srcs, nsrcs,
+ sizeof(struct in6_addr),
+ to_report);
+ break;
+ }
+
+ return changed;
+}
+#endif
+
+/* true means an entry was deleted */
+bool br_multicast_eht_handle(const struct net_bridge_mcast *brmctx,
+ struct net_bridge_port_group *pg,
+ void *h_addr,
+ void *srcs,
+ u32 nsrcs,
+ size_t addr_size,
+ int grec_type)
+{
+ bool eht_enabled = !!(pg->key.port->flags & BR_MULTICAST_FAST_LEAVE);
+ union net_bridge_eht_addr eht_host_addr;
+ bool changed = false;
+
+ if (!eht_enabled)
+ goto out;
+
+ memset(&eht_host_addr, 0, sizeof(eht_host_addr));
+ memcpy(&eht_host_addr, h_addr, addr_size);
+ if (addr_size == sizeof(__be32))
+ changed = __eht_ip4_handle(brmctx, pg, &eht_host_addr, srcs,
+ nsrcs, grec_type);
+#if IS_ENABLED(CONFIG_IPV6)
+ else
+ changed = __eht_ip6_handle(brmctx, pg, &eht_host_addr, srcs,
+ nsrcs, grec_type);
+#endif
+
+out:
+ return changed;
+}
+
+int br_multicast_eht_set_hosts_limit(struct net_bridge_port *p,
+ u32 eht_hosts_limit)
+{
+ struct net_bridge *br = p->br;
+
+ if (!eht_hosts_limit)
+ return -EINVAL;
+
+ spin_lock_bh(&br->multicast_lock);
+ p->multicast_eht_hosts_limit = eht_hosts_limit;
+ spin_unlock_bh(&br->multicast_lock);
+
+ return 0;
+}
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
deleted file mode 100644
index ac47ba2ba028..000000000000
--- a/net/bridge/br_netfilter.c
+++ /dev/null
@@ -1,1129 +0,0 @@
-/*
- * Handle firewalling
- * Linux ethernet bridge
- *
- * Authors:
- * Lennert Buytenhek <buytenh@gnu.org>
- * Bart De Schuymer (maintainer) <bdschuym@pandora.be>
- *
- * Changes:
- * Apr 29 2003: physdev module support (bdschuym)
- * Jun 19 2003: let arptables see bridged ARP traffic (bdschuym)
- * Oct 06 2003: filter encapsulated IP/ARP VLAN traffic on untagged bridge
- * (bdschuym)
- * Sep 01 2004: add IPv6 filtering (bdschuym)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * Lennert dedicates this file to Kerstin Wurdinger.
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/ip.h>
-#include <linux/netdevice.h>
-#include <linux/skbuff.h>
-#include <linux/if_arp.h>
-#include <linux/if_ether.h>
-#include <linux/if_vlan.h>
-#include <linux/netfilter_bridge.h>
-#include <linux/netfilter_ipv4.h>
-#include <linux/netfilter_ipv6.h>
-#include <linux/netfilter_arp.h>
-#include <linux/in_route.h>
-
-#include <net/ip.h>
-#include <net/ipv6.h>
-#include <net/route.h>
-
-#include <asm/uaccess.h>
-#include "br_private.h"
-#ifdef CONFIG_SYSCTL
-#include <linux/sysctl.h>
-#endif
-
-#define skb_origaddr(skb) (((struct bridge_skb_cb *) \
- (skb->nf_bridge->data))->daddr.ipv4)
-#define store_orig_dstaddr(skb) (skb_origaddr(skb) = (skb)->nh.iph->daddr)
-#define dnat_took_place(skb) (skb_origaddr(skb) != (skb)->nh.iph->daddr)
-
-#ifdef CONFIG_SYSCTL
-static struct ctl_table_header *brnf_sysctl_header;
-static int brnf_call_iptables __read_mostly = 1;
-static int brnf_call_ip6tables __read_mostly = 1;
-static int brnf_call_arptables __read_mostly = 1;
-static int brnf_filter_vlan_tagged __read_mostly = 1;
-#else
-#define brnf_filter_vlan_tagged 1
-#endif
-
-int brnf_deferred_hooks;
-EXPORT_SYMBOL_GPL(brnf_deferred_hooks);
-
-static __be16 inline vlan_proto(const struct sk_buff *skb)
-{
- return vlan_eth_hdr(skb)->h_vlan_encapsulated_proto;
-}
-
-#define IS_VLAN_IP(skb) \
- (skb->protocol == htons(ETH_P_8021Q) && \
- vlan_proto(skb) == htons(ETH_P_IP) && \
- brnf_filter_vlan_tagged)
-
-#define IS_VLAN_IPV6(skb) \
- (skb->protocol == htons(ETH_P_8021Q) && \
- vlan_proto(skb) == htons(ETH_P_IPV6) &&\
- brnf_filter_vlan_tagged)
-
-#define IS_VLAN_ARP(skb) \
- (skb->protocol == htons(ETH_P_8021Q) && \
- vlan_proto(skb) == htons(ETH_P_ARP) && \
- brnf_filter_vlan_tagged)
-
-/* We need these fake structures to make netfilter happy --
- * lots of places assume that skb->dst != NULL, which isn't
- * all that unreasonable.
- *
- * Currently, we fill in the PMTU entry because netfilter
- * refragmentation needs it, and the rt_flags entry because
- * ipt_REJECT needs it. Future netfilter modules might
- * require us to fill additional fields. */
-static struct net_device __fake_net_device = {
- .hard_header_len = ETH_HLEN
-};
-
-static struct rtable __fake_rtable = {
- .u = {
- .dst = {
- .__refcnt = ATOMIC_INIT(1),
- .dev = &__fake_net_device,
- .path = &__fake_rtable.u.dst,
- .metrics = {[RTAX_MTU - 1] = 1500},
- .flags = DST_NOXFRM,
- }
- },
- .rt_flags = 0,
-};
-
-static inline struct net_device *bridge_parent(const struct net_device *dev)
-{
- struct net_bridge_port *port = rcu_dereference(dev->br_port);
-
- return port ? port->br->dev : NULL;
-}
-
-static inline struct nf_bridge_info *nf_bridge_alloc(struct sk_buff *skb)
-{
- skb->nf_bridge = kzalloc(sizeof(struct nf_bridge_info), GFP_ATOMIC);
- if (likely(skb->nf_bridge))
- atomic_set(&(skb->nf_bridge->use), 1);
-
- return skb->nf_bridge;
-}
-
-static inline void nf_bridge_save_header(struct sk_buff *skb)
-{
- int header_size = ETH_HLEN;
-
- if (skb->protocol == htons(ETH_P_8021Q))
- header_size += VLAN_HLEN;
-
- memcpy(skb->nf_bridge->data, skb->data - header_size, header_size);
-}
-
-/*
- * When forwarding bridge frames, we save a copy of the original
- * header before processing.
- */
-int nf_bridge_copy_header(struct sk_buff *skb)
-{
- int err;
- int header_size = ETH_HLEN;
-
- if (skb->protocol == htons(ETH_P_8021Q))
- header_size += VLAN_HLEN;
-
- err = skb_cow(skb, header_size);
- if (err)
- return err;
-
- memcpy(skb->data - header_size, skb->nf_bridge->data, header_size);
-
- if (skb->protocol == htons(ETH_P_8021Q))
- __skb_push(skb, VLAN_HLEN);
- return 0;
-}
-
-/* PF_BRIDGE/PRE_ROUTING *********************************************/
-/* Undo the changes made for ip6tables PREROUTING and continue the
- * bridge PRE_ROUTING hook. */
-static int br_nf_pre_routing_finish_ipv6(struct sk_buff *skb)
-{
- struct nf_bridge_info *nf_bridge = skb->nf_bridge;
-
- if (nf_bridge->mask & BRNF_PKT_TYPE) {
- skb->pkt_type = PACKET_OTHERHOST;
- nf_bridge->mask ^= BRNF_PKT_TYPE;
- }
- nf_bridge->mask ^= BRNF_NF_BRIDGE_PREROUTING;
-
- skb->dst = (struct dst_entry *)&__fake_rtable;
- dst_hold(skb->dst);
-
- skb->dev = nf_bridge->physindev;
- if (skb->protocol == htons(ETH_P_8021Q)) {
- skb_push(skb, VLAN_HLEN);
- skb->nh.raw -= VLAN_HLEN;
- }
- NF_HOOK_THRESH(PF_BRIDGE, NF_BR_PRE_ROUTING, skb, skb->dev, NULL,
- br_handle_frame_finish, 1);
-
- return 0;
-}
-
-static void __br_dnat_complain(void)
-{
- static unsigned long last_complaint;
-
- if (jiffies - last_complaint >= 5 * HZ) {
- printk(KERN_WARNING "Performing cross-bridge DNAT requires IP "
- "forwarding to be enabled\n");
- last_complaint = jiffies;
- }
-}
-
-/* This requires some explaining. If DNAT has taken place,
- * we will need to fix up the destination Ethernet address,
- * and this is a tricky process.
- *
- * There are two cases to consider:
- * 1. The packet was DNAT'ed to a device in the same bridge
- * port group as it was received on. We can still bridge
- * the packet.
- * 2. The packet was DNAT'ed to a different device, either
- * a non-bridged device or another bridge port group.
- * The packet will need to be routed.
- *
- * The correct way of distinguishing between these two cases is to
- * call ip_route_input() and to look at skb->dst->dev, which is
- * changed to the destination device if ip_route_input() succeeds.
- *
- * Let us first consider the case that ip_route_input() succeeds:
- *
- * If skb->dst->dev equals the logical bridge device the packet
- * came in on, we can consider this bridging. We then call
- * skb->dst->output() which will make the packet enter br_nf_local_out()
- * not much later. In that function it is assured that the iptables
- * FORWARD chain is traversed for the packet.
- *
- * Otherwise, the packet is considered to be routed and we just
- * change the destination MAC address so that the packet will
- * later be passed up to the IP stack to be routed.
- *
- * Let us now consider the case that ip_route_input() fails:
- *
- * After a "echo '0' > /proc/sys/net/ipv4/ip_forward" ip_route_input()
- * will fail, while __ip_route_output_key() will return success. The source
- * address for __ip_route_output_key() is set to zero, so __ip_route_output_key
- * thinks we're handling a locally generated packet and won't care
- * if IP forwarding is allowed. We send a warning message to the users's
- * log telling her to put IP forwarding on.
- *
- * ip_route_input() will also fail if there is no route available.
- * In that case we just drop the packet.
- *
- * --Lennert, 20020411
- * --Bart, 20020416 (updated)
- * --Bart, 20021007 (updated) */
-static int br_nf_pre_routing_finish_bridge(struct sk_buff *skb)
-{
- if (skb->pkt_type == PACKET_OTHERHOST) {
- skb->pkt_type = PACKET_HOST;
- skb->nf_bridge->mask |= BRNF_PKT_TYPE;
- }
- skb->nf_bridge->mask ^= BRNF_NF_BRIDGE_PREROUTING;
-
- skb->dev = bridge_parent(skb->dev);
- if (!skb->dev)
- kfree_skb(skb);
- else {
- if (skb->protocol == htons(ETH_P_8021Q)) {
- skb_pull(skb, VLAN_HLEN);
- skb->nh.raw += VLAN_HLEN;
- }
- skb->dst->output(skb);
- }
- return 0;
-}
-
-static int br_nf_pre_routing_finish(struct sk_buff *skb)
-{
- struct net_device *dev = skb->dev;
- struct iphdr *iph = skb->nh.iph;
- struct nf_bridge_info *nf_bridge = skb->nf_bridge;
-
- if (nf_bridge->mask & BRNF_PKT_TYPE) {
- skb->pkt_type = PACKET_OTHERHOST;
- nf_bridge->mask ^= BRNF_PKT_TYPE;
- }
- nf_bridge->mask ^= BRNF_NF_BRIDGE_PREROUTING;
-
- if (dnat_took_place(skb)) {
- if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev)) {
- struct rtable *rt;
- struct flowi fl = {
- .nl_u = {
- .ip4_u = {
- .daddr = iph->daddr,
- .saddr = 0,
- .tos = RT_TOS(iph->tos) },
- },
- .proto = 0,
- };
-
- if (!ip_route_output_key(&rt, &fl)) {
- /* - Bridged-and-DNAT'ed traffic doesn't
- * require ip_forwarding.
- * - Deal with redirected traffic. */
- if (((struct dst_entry *)rt)->dev == dev ||
- rt->rt_type == RTN_LOCAL) {
- skb->dst = (struct dst_entry *)rt;
- goto bridged_dnat;
- }
- __br_dnat_complain();
- dst_release((struct dst_entry *)rt);
- }
- kfree_skb(skb);
- return 0;
- } else {
- if (skb->dst->dev == dev) {
-bridged_dnat:
- /* Tell br_nf_local_out this is a
- * bridged frame */
- nf_bridge->mask |= BRNF_BRIDGED_DNAT;
- skb->dev = nf_bridge->physindev;
- if (skb->protocol ==
- htons(ETH_P_8021Q)) {
- skb_push(skb, VLAN_HLEN);
- skb->nh.raw -= VLAN_HLEN;
- }
- NF_HOOK_THRESH(PF_BRIDGE, NF_BR_PRE_ROUTING,
- skb, skb->dev, NULL,
- br_nf_pre_routing_finish_bridge,
- 1);
- return 0;
- }
- memcpy(eth_hdr(skb)->h_dest, dev->dev_addr, ETH_ALEN);
- skb->pkt_type = PACKET_HOST;
- }
- } else {
- skb->dst = (struct dst_entry *)&__fake_rtable;
- dst_hold(skb->dst);
- }
-
- skb->dev = nf_bridge->physindev;
- if (skb->protocol == htons(ETH_P_8021Q)) {
- skb_push(skb, VLAN_HLEN);
- skb->nh.raw -= VLAN_HLEN;
- }
- NF_HOOK_THRESH(PF_BRIDGE, NF_BR_PRE_ROUTING, skb, skb->dev, NULL,
- br_handle_frame_finish, 1);
-
- return 0;
-}
-
-/* Some common code for IPv4/IPv6 */
-static struct net_device *setup_pre_routing(struct sk_buff *skb)
-{
- struct nf_bridge_info *nf_bridge = skb->nf_bridge;
-
- if (skb->pkt_type == PACKET_OTHERHOST) {
- skb->pkt_type = PACKET_HOST;
- nf_bridge->mask |= BRNF_PKT_TYPE;
- }
-
- nf_bridge->mask |= BRNF_NF_BRIDGE_PREROUTING;
- nf_bridge->physindev = skb->dev;
- skb->dev = bridge_parent(skb->dev);
-
- return skb->dev;
-}
-
-/* We only check the length. A bridge shouldn't do any hop-by-hop stuff anyway */
-static int check_hbh_len(struct sk_buff *skb)
-{
- unsigned char *raw = (u8 *) (skb->nh.ipv6h + 1);
- u32 pkt_len;
- int off = raw - skb->nh.raw;
- int len = (raw[1] + 1) << 3;
-
- if ((raw + len) - skb->data > skb_headlen(skb))
- goto bad;
-
- off += 2;
- len -= 2;
-
- while (len > 0) {
- int optlen = skb->nh.raw[off + 1] + 2;
-
- switch (skb->nh.raw[off]) {
- case IPV6_TLV_PAD0:
- optlen = 1;
- break;
-
- case IPV6_TLV_PADN:
- break;
-
- case IPV6_TLV_JUMBO:
- if (skb->nh.raw[off + 1] != 4 || (off & 3) != 2)
- goto bad;
- pkt_len = ntohl(*(__be32 *) (skb->nh.raw + off + 2));
- if (pkt_len <= IPV6_MAXPLEN ||
- skb->nh.ipv6h->payload_len)
- goto bad;
- if (pkt_len > skb->len - sizeof(struct ipv6hdr))
- goto bad;
- if (pskb_trim_rcsum(skb,
- pkt_len + sizeof(struct ipv6hdr)))
- goto bad;
- break;
- default:
- if (optlen > len)
- goto bad;
- break;
- }
- off += optlen;
- len -= optlen;
- }
- if (len == 0)
- return 0;
-bad:
- return -1;
-
-}
-
-/* Replicate the checks that IPv6 does on packet reception and pass the packet
- * to ip6tables, which doesn't support NAT, so things are fairly simple. */
-static unsigned int br_nf_pre_routing_ipv6(unsigned int hook,
- struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
-{
- struct ipv6hdr *hdr;
- u32 pkt_len;
-
- if (skb->len < sizeof(struct ipv6hdr))
- goto inhdr_error;
-
- if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
- goto inhdr_error;
-
- hdr = skb->nh.ipv6h;
-
- if (hdr->version != 6)
- goto inhdr_error;
-
- pkt_len = ntohs(hdr->payload_len);
-
- if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) {
- if (pkt_len + sizeof(struct ipv6hdr) > skb->len)
- goto inhdr_error;
- if (pskb_trim_rcsum(skb, pkt_len + sizeof(struct ipv6hdr)))
- goto inhdr_error;
- }
- if (hdr->nexthdr == NEXTHDR_HOP && check_hbh_len(skb))
- goto inhdr_error;
-
- nf_bridge_put(skb->nf_bridge);
- if (!nf_bridge_alloc(skb))
- return NF_DROP;
- if (!setup_pre_routing(skb))
- return NF_DROP;
-
- NF_HOOK(PF_INET6, NF_IP6_PRE_ROUTING, skb, skb->dev, NULL,
- br_nf_pre_routing_finish_ipv6);
-
- return NF_STOLEN;
-
-inhdr_error:
- return NF_DROP;
-}
-
-/* Direct IPv6 traffic to br_nf_pre_routing_ipv6.
- * Replicate the checks that IPv4 does on packet reception.
- * Set skb->dev to the bridge device (i.e. parent of the
- * receiving device) to make netfilter happy, the REDIRECT
- * target in particular. Save the original destination IP
- * address to be able to detect DNAT afterwards. */
-static unsigned int br_nf_pre_routing(unsigned int hook, struct sk_buff **pskb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
-{
- struct iphdr *iph;
- __u32 len;
- struct sk_buff *skb = *pskb;
-
- if (skb->protocol == htons(ETH_P_IPV6) || IS_VLAN_IPV6(skb)) {
-#ifdef CONFIG_SYSCTL
- if (!brnf_call_ip6tables)
- return NF_ACCEPT;
-#endif
- if ((skb = skb_share_check(*pskb, GFP_ATOMIC)) == NULL)
- goto out;
-
- if (skb->protocol == htons(ETH_P_8021Q)) {
- skb_pull_rcsum(skb, VLAN_HLEN);
- skb->nh.raw += VLAN_HLEN;
- }
- return br_nf_pre_routing_ipv6(hook, skb, in, out, okfn);
- }
-#ifdef CONFIG_SYSCTL
- if (!brnf_call_iptables)
- return NF_ACCEPT;
-#endif
-
- if (skb->protocol != htons(ETH_P_IP) && !IS_VLAN_IP(skb))
- return NF_ACCEPT;
-
- if ((skb = skb_share_check(*pskb, GFP_ATOMIC)) == NULL)
- goto out;
-
- if (skb->protocol == htons(ETH_P_8021Q)) {
- skb_pull_rcsum(skb, VLAN_HLEN);
- skb->nh.raw += VLAN_HLEN;
- }
-
- if (!pskb_may_pull(skb, sizeof(struct iphdr)))
- goto inhdr_error;
-
- iph = skb->nh.iph;
- if (iph->ihl < 5 || iph->version != 4)
- goto inhdr_error;
-
- if (!pskb_may_pull(skb, 4 * iph->ihl))
- goto inhdr_error;
-
- iph = skb->nh.iph;
- if (ip_fast_csum((__u8 *) iph, iph->ihl) != 0)
- goto inhdr_error;
-
- len = ntohs(iph->tot_len);
- if (skb->len < len || len < 4 * iph->ihl)
- goto inhdr_error;
-
- pskb_trim_rcsum(skb, len);
-
- nf_bridge_put(skb->nf_bridge);
- if (!nf_bridge_alloc(skb))
- return NF_DROP;
- if (!setup_pre_routing(skb))
- return NF_DROP;
- store_orig_dstaddr(skb);
-
- NF_HOOK(PF_INET, NF_IP_PRE_ROUTING, skb, skb->dev, NULL,
- br_nf_pre_routing_finish);
-
- return NF_STOLEN;
-
-inhdr_error:
-// IP_INC_STATS_BH(IpInHdrErrors);
-out:
- return NF_DROP;
-}
-
-
-/* PF_BRIDGE/LOCAL_IN ************************************************/
-/* The packet is locally destined, which requires a real
- * dst_entry, so detach the fake one. On the way up, the
- * packet would pass through PRE_ROUTING again (which already
- * took place when the packet entered the bridge), but we
- * register an IPv4 PRE_ROUTING 'sabotage' hook that will
- * prevent this from happening. */
-static unsigned int br_nf_local_in(unsigned int hook, struct sk_buff **pskb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
-{
- struct sk_buff *skb = *pskb;
-
- if (skb->dst == (struct dst_entry *)&__fake_rtable) {
- dst_release(skb->dst);
- skb->dst = NULL;
- }
-
- return NF_ACCEPT;
-}
-
-/* PF_BRIDGE/FORWARD *************************************************/
-static int br_nf_forward_finish(struct sk_buff *skb)
-{
- struct nf_bridge_info *nf_bridge = skb->nf_bridge;
- struct net_device *in;
-
- if (skb->protocol != htons(ETH_P_ARP) && !IS_VLAN_ARP(skb)) {
- in = nf_bridge->physindev;
- if (nf_bridge->mask & BRNF_PKT_TYPE) {
- skb->pkt_type = PACKET_OTHERHOST;
- nf_bridge->mask ^= BRNF_PKT_TYPE;
- }
- } else {
- in = *((struct net_device **)(skb->cb));
- }
- if (skb->protocol == htons(ETH_P_8021Q)) {
- skb_push(skb, VLAN_HLEN);
- skb->nh.raw -= VLAN_HLEN;
- }
- NF_HOOK_THRESH(PF_BRIDGE, NF_BR_FORWARD, skb, in,
- skb->dev, br_forward_finish, 1);
- return 0;
-}
-
-/* This is the 'purely bridged' case. For IP, we pass the packet to
- * netfilter with indev and outdev set to the bridge device,
- * but we are still able to filter on the 'real' indev/outdev
- * because of the physdev module. For ARP, indev and outdev are the
- * bridge ports. */
-static unsigned int br_nf_forward_ip(unsigned int hook, struct sk_buff **pskb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
-{
- struct sk_buff *skb = *pskb;
- struct nf_bridge_info *nf_bridge;
- struct net_device *parent;
- int pf;
-
- if (!skb->nf_bridge)
- return NF_ACCEPT;
-
- parent = bridge_parent(out);
- if (!parent)
- return NF_DROP;
-
- if (skb->protocol == htons(ETH_P_IP) || IS_VLAN_IP(skb))
- pf = PF_INET;
- else
- pf = PF_INET6;
-
- if (skb->protocol == htons(ETH_P_8021Q)) {
- skb_pull(*pskb, VLAN_HLEN);
- (*pskb)->nh.raw += VLAN_HLEN;
- }
-
- nf_bridge = skb->nf_bridge;
- if (skb->pkt_type == PACKET_OTHERHOST) {
- skb->pkt_type = PACKET_HOST;
- nf_bridge->mask |= BRNF_PKT_TYPE;
- }
-
- /* The physdev module checks on this */
- nf_bridge->mask |= BRNF_BRIDGED;
- nf_bridge->physoutdev = skb->dev;
-
- NF_HOOK(pf, NF_IP_FORWARD, skb, bridge_parent(in), parent,
- br_nf_forward_finish);
-
- return NF_STOLEN;
-}
-
-static unsigned int br_nf_forward_arp(unsigned int hook, struct sk_buff **pskb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
-{
- struct sk_buff *skb = *pskb;
- struct net_device **d = (struct net_device **)(skb->cb);
-
-#ifdef CONFIG_SYSCTL
- if (!brnf_call_arptables)
- return NF_ACCEPT;
-#endif
-
- if (skb->protocol != htons(ETH_P_ARP)) {
- if (!IS_VLAN_ARP(skb))
- return NF_ACCEPT;
- skb_pull(*pskb, VLAN_HLEN);
- (*pskb)->nh.raw += VLAN_HLEN;
- }
-
- if (skb->nh.arph->ar_pln != 4) {
- if (IS_VLAN_ARP(skb)) {
- skb_push(*pskb, VLAN_HLEN);
- (*pskb)->nh.raw -= VLAN_HLEN;
- }
- return NF_ACCEPT;
- }
- *d = (struct net_device *)in;
- NF_HOOK(NF_ARP, NF_ARP_FORWARD, skb, (struct net_device *)in,
- (struct net_device *)out, br_nf_forward_finish);
-
- return NF_STOLEN;
-}
-
-/* PF_BRIDGE/LOCAL_OUT ***********************************************/
-static int br_nf_local_out_finish(struct sk_buff *skb)
-{
- if (skb->protocol == htons(ETH_P_8021Q)) {
- skb_push(skb, VLAN_HLEN);
- skb->nh.raw -= VLAN_HLEN;
- }
-
- NF_HOOK_THRESH(PF_BRIDGE, NF_BR_LOCAL_OUT, skb, NULL, skb->dev,
- br_forward_finish, NF_BR_PRI_FIRST + 1);
-
- return 0;
-}
-
-/* This function sees both locally originated IP packets and forwarded
- * IP packets (in both cases the destination device is a bridge
- * device). It also sees bridged-and-DNAT'ed packets.
- * To be able to filter on the physical bridge devices (with the physdev
- * module), we steal packets destined to a bridge device away from the
- * PF_INET/FORWARD and PF_INET/OUTPUT hook functions, and give them back later,
- * when we have determined the real output device. This is done in here.
- *
- * If (nf_bridge->mask & BRNF_BRIDGED_DNAT) then the packet is bridged
- * and we fake the PF_BRIDGE/FORWARD hook. The function br_nf_forward()
- * will then fake the PF_INET/FORWARD hook. br_nf_local_out() has priority
- * NF_BR_PRI_FIRST, so no relevant PF_BRIDGE/INPUT functions have been nor
- * will be executed.
- * Otherwise, if nf_bridge->physindev is NULL, the bridge-nf code never touched
- * this packet before, and so the packet was locally originated. We fake
- * the PF_INET/LOCAL_OUT hook.
- * Finally, if nf_bridge->physindev isn't NULL, then the packet was IP routed,
- * so we fake the PF_INET/FORWARD hook. ip_sabotage_out() makes sure
- * even routed packets that didn't arrive on a bridge interface have their
- * nf_bridge->physindev set. */
-static unsigned int br_nf_local_out(unsigned int hook, struct sk_buff **pskb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
-{
- struct net_device *realindev, *realoutdev;
- struct sk_buff *skb = *pskb;
- struct nf_bridge_info *nf_bridge;
- int pf;
-
- if (!skb->nf_bridge)
- return NF_ACCEPT;
-
- if (skb->protocol == htons(ETH_P_IP) || IS_VLAN_IP(skb))
- pf = PF_INET;
- else
- pf = PF_INET6;
-
- nf_bridge = skb->nf_bridge;
- nf_bridge->physoutdev = skb->dev;
- realindev = nf_bridge->physindev;
-
- /* Bridged, take PF_BRIDGE/FORWARD.
- * (see big note in front of br_nf_pre_routing_finish) */
- if (nf_bridge->mask & BRNF_BRIDGED_DNAT) {
- if (nf_bridge->mask & BRNF_PKT_TYPE) {
- skb->pkt_type = PACKET_OTHERHOST;
- nf_bridge->mask ^= BRNF_PKT_TYPE;
- }
- if (skb->protocol == htons(ETH_P_8021Q)) {
- skb_push(skb, VLAN_HLEN);
- skb->nh.raw -= VLAN_HLEN;
- }
-
- NF_HOOK(PF_BRIDGE, NF_BR_FORWARD, skb, realindev,
- skb->dev, br_forward_finish);
- goto out;
- }
- realoutdev = bridge_parent(skb->dev);
- if (!realoutdev)
- return NF_DROP;
-
-#if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE)
- /* iptables should match -o br0.x */
- if (nf_bridge->netoutdev)
- realoutdev = nf_bridge->netoutdev;
-#endif
- if (skb->protocol == htons(ETH_P_8021Q)) {
- skb_pull(skb, VLAN_HLEN);
- (*pskb)->nh.raw += VLAN_HLEN;
- }
- /* IP forwarded traffic has a physindev, locally
- * generated traffic hasn't. */
- if (realindev != NULL) {
- if (!(nf_bridge->mask & BRNF_DONT_TAKE_PARENT)) {
- struct net_device *parent = bridge_parent(realindev);
- if (parent)
- realindev = parent;
- }
-
- NF_HOOK_THRESH(pf, NF_IP_FORWARD, skb, realindev,
- realoutdev, br_nf_local_out_finish,
- NF_IP_PRI_BRIDGE_SABOTAGE_FORWARD + 1);
- } else {
- NF_HOOK_THRESH(pf, NF_IP_LOCAL_OUT, skb, realindev,
- realoutdev, br_nf_local_out_finish,
- NF_IP_PRI_BRIDGE_SABOTAGE_LOCAL_OUT + 1);
- }
-
-out:
- return NF_STOLEN;
-}
-
-static int br_nf_dev_queue_xmit(struct sk_buff *skb)
-{
- if (skb->protocol == htons(ETH_P_IP) &&
- skb->len > skb->dev->mtu &&
- !skb_is_gso(skb))
- return ip_fragment(skb, br_dev_queue_push_xmit);
- else
- return br_dev_queue_push_xmit(skb);
-}
-
-/* PF_BRIDGE/POST_ROUTING ********************************************/
-static unsigned int br_nf_post_routing(unsigned int hook, struct sk_buff **pskb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
-{
- struct sk_buff *skb = *pskb;
- struct nf_bridge_info *nf_bridge = (*pskb)->nf_bridge;
- struct net_device *realoutdev = bridge_parent(skb->dev);
- int pf;
-
-#ifdef CONFIG_NETFILTER_DEBUG
- /* Be very paranoid. This probably won't happen anymore, but let's
- * keep the check just to be sure... */
- if (skb->mac.raw < skb->head || skb->mac.raw + ETH_HLEN > skb->data) {
- printk(KERN_CRIT "br_netfilter: Argh!! br_nf_post_routing: "
- "bad mac.raw pointer.\n");
- goto print_error;
- }
-#endif
-
- if (!nf_bridge)
- return NF_ACCEPT;
-
- if (!realoutdev)
- return NF_DROP;
-
- if (skb->protocol == htons(ETH_P_IP) || IS_VLAN_IP(skb))
- pf = PF_INET;
- else
- pf = PF_INET6;
-
-#ifdef CONFIG_NETFILTER_DEBUG
- if (skb->dst == NULL) {
- printk(KERN_INFO "br_netfilter post_routing: skb->dst == NULL\n");
- goto print_error;
- }
-#endif
-
- /* We assume any code from br_dev_queue_push_xmit onwards doesn't care
- * about the value of skb->pkt_type. */
- if (skb->pkt_type == PACKET_OTHERHOST) {
- skb->pkt_type = PACKET_HOST;
- nf_bridge->mask |= BRNF_PKT_TYPE;
- }
-
- if (skb->protocol == htons(ETH_P_8021Q)) {
- skb_pull(skb, VLAN_HLEN);
- skb->nh.raw += VLAN_HLEN;
- }
-
- nf_bridge_save_header(skb);
-
-#if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE)
- if (nf_bridge->netoutdev)
- realoutdev = nf_bridge->netoutdev;
-#endif
- NF_HOOK(pf, NF_IP_POST_ROUTING, skb, NULL, realoutdev,
- br_nf_dev_queue_xmit);
-
- return NF_STOLEN;
-
-#ifdef CONFIG_NETFILTER_DEBUG
-print_error:
- if (skb->dev != NULL) {
- printk("[%s]", skb->dev->name);
- if (realoutdev)
- printk("[%s]", realoutdev->name);
- }
- printk(" head:%p, raw:%p, data:%p\n", skb->head, skb->mac.raw,
- skb->data);
- dump_stack();
- return NF_ACCEPT;
-#endif
-}
-
-/* IP/SABOTAGE *****************************************************/
-/* Don't hand locally destined packets to PF_INET(6)/PRE_ROUTING
- * for the second time. */
-static unsigned int ip_sabotage_in(unsigned int hook, struct sk_buff **pskb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
-{
- if ((*pskb)->nf_bridge &&
- !((*pskb)->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING)) {
- return NF_STOP;
- }
-
- return NF_ACCEPT;
-}
-
-/* Postpone execution of PF_INET(6)/FORWARD, PF_INET(6)/LOCAL_OUT
- * and PF_INET(6)/POST_ROUTING until we have done the forwarding
- * decision in the bridge code and have determined nf_bridge->physoutdev. */
-static unsigned int ip_sabotage_out(unsigned int hook, struct sk_buff **pskb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
-{
- struct sk_buff *skb = *pskb;
-
- if ((out->hard_start_xmit == br_dev_xmit &&
- okfn != br_nf_forward_finish &&
- okfn != br_nf_local_out_finish && okfn != br_nf_dev_queue_xmit)
-#if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE)
- || ((out->priv_flags & IFF_802_1Q_VLAN) &&
- VLAN_DEV_INFO(out)->real_dev->hard_start_xmit == br_dev_xmit)
-#endif
- ) {
- struct nf_bridge_info *nf_bridge;
-
- if (!skb->nf_bridge) {
-#ifdef CONFIG_SYSCTL
- /* This code is executed while in the IP(v6) stack,
- the version should be 4 or 6. We can't use
- skb->protocol because that isn't set on
- PF_INET(6)/LOCAL_OUT. */
- struct iphdr *ip = skb->nh.iph;
-
- if (ip->version == 4 && !brnf_call_iptables)
- return NF_ACCEPT;
- else if (ip->version == 6 && !brnf_call_ip6tables)
- return NF_ACCEPT;
- else if (!brnf_deferred_hooks)
- return NF_ACCEPT;
-#endif
- if (hook == NF_IP_POST_ROUTING)
- return NF_ACCEPT;
- if (!nf_bridge_alloc(skb))
- return NF_DROP;
- }
-
- nf_bridge = skb->nf_bridge;
-
- /* This frame will arrive on PF_BRIDGE/LOCAL_OUT and we
- * will need the indev then. For a brouter, the real indev
- * can be a bridge port, so we make sure br_nf_local_out()
- * doesn't use the bridge parent of the indev by using
- * the BRNF_DONT_TAKE_PARENT mask. */
- if (hook == NF_IP_FORWARD && nf_bridge->physindev == NULL) {
- nf_bridge->mask |= BRNF_DONT_TAKE_PARENT;
- nf_bridge->physindev = (struct net_device *)in;
- }
-#if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE)
- /* the iptables outdev is br0.x, not br0 */
- if (out->priv_flags & IFF_802_1Q_VLAN)
- nf_bridge->netoutdev = (struct net_device *)out;
-#endif
- return NF_STOP;
- }
-
- return NF_ACCEPT;
-}
-
-/* For br_nf_local_out we need (prio = NF_BR_PRI_FIRST), to insure that innocent
- * PF_BRIDGE/NF_BR_LOCAL_OUT functions don't get bridged traffic as input.
- * For br_nf_post_routing, we need (prio = NF_BR_PRI_LAST), because
- * ip_refrag() can return NF_STOLEN. */
-static struct nf_hook_ops br_nf_ops[] = {
- { .hook = br_nf_pre_routing,
- .owner = THIS_MODULE,
- .pf = PF_BRIDGE,
- .hooknum = NF_BR_PRE_ROUTING,
- .priority = NF_BR_PRI_BRNF, },
- { .hook = br_nf_local_in,
- .owner = THIS_MODULE,
- .pf = PF_BRIDGE,
- .hooknum = NF_BR_LOCAL_IN,
- .priority = NF_BR_PRI_BRNF, },
- { .hook = br_nf_forward_ip,
- .owner = THIS_MODULE,
- .pf = PF_BRIDGE,
- .hooknum = NF_BR_FORWARD,
- .priority = NF_BR_PRI_BRNF - 1, },
- { .hook = br_nf_forward_arp,
- .owner = THIS_MODULE,
- .pf = PF_BRIDGE,
- .hooknum = NF_BR_FORWARD,
- .priority = NF_BR_PRI_BRNF, },
- { .hook = br_nf_local_out,
- .owner = THIS_MODULE,
- .pf = PF_BRIDGE,
- .hooknum = NF_BR_LOCAL_OUT,
- .priority = NF_BR_PRI_FIRST, },
- { .hook = br_nf_post_routing,
- .owner = THIS_MODULE,
- .pf = PF_BRIDGE,
- .hooknum = NF_BR_POST_ROUTING,
- .priority = NF_BR_PRI_LAST, },
- { .hook = ip_sabotage_in,
- .owner = THIS_MODULE,
- .pf = PF_INET,
- .hooknum = NF_IP_PRE_ROUTING,
- .priority = NF_IP_PRI_FIRST, },
- { .hook = ip_sabotage_in,
- .owner = THIS_MODULE,
- .pf = PF_INET6,
- .hooknum = NF_IP6_PRE_ROUTING,
- .priority = NF_IP6_PRI_FIRST, },
- { .hook = ip_sabotage_out,
- .owner = THIS_MODULE,
- .pf = PF_INET,
- .hooknum = NF_IP_FORWARD,
- .priority = NF_IP_PRI_BRIDGE_SABOTAGE_FORWARD, },
- { .hook = ip_sabotage_out,
- .owner = THIS_MODULE,
- .pf = PF_INET6,
- .hooknum = NF_IP6_FORWARD,
- .priority = NF_IP6_PRI_BRIDGE_SABOTAGE_FORWARD, },
- { .hook = ip_sabotage_out,
- .owner = THIS_MODULE,
- .pf = PF_INET,
- .hooknum = NF_IP_LOCAL_OUT,
- .priority = NF_IP_PRI_BRIDGE_SABOTAGE_LOCAL_OUT, },
- { .hook = ip_sabotage_out,
- .owner = THIS_MODULE,
- .pf = PF_INET6,
- .hooknum = NF_IP6_LOCAL_OUT,
- .priority = NF_IP6_PRI_BRIDGE_SABOTAGE_LOCAL_OUT, },
- { .hook = ip_sabotage_out,
- .owner = THIS_MODULE,
- .pf = PF_INET,
- .hooknum = NF_IP_POST_ROUTING,
- .priority = NF_IP_PRI_FIRST, },
- { .hook = ip_sabotage_out,
- .owner = THIS_MODULE,
- .pf = PF_INET6,
- .hooknum = NF_IP6_POST_ROUTING,
- .priority = NF_IP6_PRI_FIRST, },
-};
-
-#ifdef CONFIG_SYSCTL
-static
-int brnf_sysctl_call_tables(ctl_table * ctl, int write, struct file *filp,
- void __user * buffer, size_t * lenp, loff_t * ppos)
-{
- int ret;
-
- ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
-
- if (write && *(int *)(ctl->data))
- *(int *)(ctl->data) = 1;
- return ret;
-}
-
-static ctl_table brnf_table[] = {
- {
- .ctl_name = NET_BRIDGE_NF_CALL_ARPTABLES,
- .procname = "bridge-nf-call-arptables",
- .data = &brnf_call_arptables,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &brnf_sysctl_call_tables,
- },
- {
- .ctl_name = NET_BRIDGE_NF_CALL_IPTABLES,
- .procname = "bridge-nf-call-iptables",
- .data = &brnf_call_iptables,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &brnf_sysctl_call_tables,
- },
- {
- .ctl_name = NET_BRIDGE_NF_CALL_IP6TABLES,
- .procname = "bridge-nf-call-ip6tables",
- .data = &brnf_call_ip6tables,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &brnf_sysctl_call_tables,
- },
- {
- .ctl_name = NET_BRIDGE_NF_FILTER_VLAN_TAGGED,
- .procname = "bridge-nf-filter-vlan-tagged",
- .data = &brnf_filter_vlan_tagged,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &brnf_sysctl_call_tables,
- },
- { .ctl_name = 0 }
-};
-
-static ctl_table brnf_bridge_table[] = {
- {
- .ctl_name = NET_BRIDGE,
- .procname = "bridge",
- .mode = 0555,
- .child = brnf_table,
- },
- { .ctl_name = 0 }
-};
-
-static ctl_table brnf_net_table[] = {
- {
- .ctl_name = CTL_NET,
- .procname = "net",
- .mode = 0555,
- .child = brnf_bridge_table,
- },
- { .ctl_name = 0 }
-};
-#endif
-
-int br_netfilter_init(void)
-{
- int i;
-
- for (i = 0; i < ARRAY_SIZE(br_nf_ops); i++) {
- int ret;
-
- if ((ret = nf_register_hook(&br_nf_ops[i])) >= 0)
- continue;
-
- while (i--)
- nf_unregister_hook(&br_nf_ops[i]);
-
- return ret;
- }
-
-#ifdef CONFIG_SYSCTL
- brnf_sysctl_header = register_sysctl_table(brnf_net_table, 0);
- if (brnf_sysctl_header == NULL) {
- printk(KERN_WARNING
- "br_netfilter: can't register to sysctl.\n");
- for (i = 0; i < ARRAY_SIZE(br_nf_ops); i++)
- nf_unregister_hook(&br_nf_ops[i]);
- return -EFAULT;
- }
-#endif
-
- printk(KERN_NOTICE "Bridge firewalling registered\n");
-
- return 0;
-}
-
-void br_netfilter_fini(void)
-{
- int i;
-
- for (i = ARRAY_SIZE(br_nf_ops) - 1; i >= 0; i--)
- nf_unregister_hook(&br_nf_ops[i]);
-#ifdef CONFIG_SYSCTL
- unregister_sysctl_table(brnf_sysctl_header);
-#endif
-}
diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
new file mode 100644
index 000000000000..083e2fe96441
--- /dev/null
+++ b/net/bridge/br_netfilter_hooks.c
@@ -0,0 +1,1340 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Handle firewalling
+ * Linux ethernet bridge
+ *
+ * Authors:
+ * Lennert Buytenhek <buytenh@gnu.org>
+ * Bart De Schuymer <bdschuym@pandora.be>
+ *
+ * Lennert dedicates this file to Kerstin Wurdinger.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/ip.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/if_arp.h>
+#include <linux/if_ether.h>
+#include <linux/if_vlan.h>
+#include <linux/if_pppox.h>
+#include <linux/ppp_defs.h>
+#include <linux/netfilter_bridge.h>
+#include <uapi/linux/netfilter_bridge.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter_arp.h>
+#include <linux/in_route.h>
+#include <linux/rculist.h>
+#include <linux/inetdevice.h>
+
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/addrconf.h>
+#include <net/dst_metadata.h>
+#include <net/route.h>
+#include <net/netfilter/br_netfilter.h>
+#include <net/netns/generic.h>
+#include <net/inet_dscp.h>
+
+#include <linux/uaccess.h>
+#include "br_private.h"
+#ifdef CONFIG_SYSCTL
+#include <linux/sysctl.h>
+#endif
+
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+#include <net/netfilter/nf_conntrack_core.h>
+#endif
+
+static unsigned int brnf_net_id __read_mostly;
+
+struct brnf_net {
+ bool enabled;
+
+#ifdef CONFIG_SYSCTL
+ struct ctl_table_header *ctl_hdr;
+#endif
+
+ /* default value is 1 */
+ int call_iptables;
+ int call_ip6tables;
+ int call_arptables;
+
+ /* default value is 0 */
+ int filter_vlan_tagged;
+ int filter_pppoe_tagged;
+ int pass_vlan_indev;
+};
+
+#define IS_IP(skb) \
+ (!skb_vlan_tag_present(skb) && skb->protocol == htons(ETH_P_IP))
+
+#define IS_IPV6(skb) \
+ (!skb_vlan_tag_present(skb) && skb->protocol == htons(ETH_P_IPV6))
+
+#define IS_ARP(skb) \
+ (!skb_vlan_tag_present(skb) && skb->protocol == htons(ETH_P_ARP))
+
+static inline __be16 vlan_proto(const struct sk_buff *skb)
+{
+ if (skb_vlan_tag_present(skb))
+ return skb->protocol;
+ else if (skb->protocol == htons(ETH_P_8021Q))
+ return vlan_eth_hdr(skb)->h_vlan_encapsulated_proto;
+ else
+ return 0;
+}
+
+static inline bool is_vlan_ip(const struct sk_buff *skb, const struct net *net)
+{
+ struct brnf_net *brnet = net_generic(net, brnf_net_id);
+
+ return vlan_proto(skb) == htons(ETH_P_IP) && brnet->filter_vlan_tagged;
+}
+
+static inline bool is_vlan_ipv6(const struct sk_buff *skb,
+ const struct net *net)
+{
+ struct brnf_net *brnet = net_generic(net, brnf_net_id);
+
+ return vlan_proto(skb) == htons(ETH_P_IPV6) &&
+ brnet->filter_vlan_tagged;
+}
+
+static inline bool is_vlan_arp(const struct sk_buff *skb, const struct net *net)
+{
+ struct brnf_net *brnet = net_generic(net, brnf_net_id);
+
+ return vlan_proto(skb) == htons(ETH_P_ARP) && brnet->filter_vlan_tagged;
+}
+
+static inline __be16 pppoe_proto(const struct sk_buff *skb)
+{
+ return *((__be16 *)(skb_mac_header(skb) + ETH_HLEN +
+ sizeof(struct pppoe_hdr)));
+}
+
+static inline bool is_pppoe_ip(const struct sk_buff *skb, const struct net *net)
+{
+ struct brnf_net *brnet = net_generic(net, brnf_net_id);
+
+ return skb->protocol == htons(ETH_P_PPP_SES) &&
+ pppoe_proto(skb) == htons(PPP_IP) && brnet->filter_pppoe_tagged;
+}
+
+static inline bool is_pppoe_ipv6(const struct sk_buff *skb,
+ const struct net *net)
+{
+ struct brnf_net *brnet = net_generic(net, brnf_net_id);
+
+ return skb->protocol == htons(ETH_P_PPP_SES) &&
+ pppoe_proto(skb) == htons(PPP_IPV6) &&
+ brnet->filter_pppoe_tagged;
+}
+
+/* largest possible L2 header, see br_nf_dev_queue_xmit() */
+#define NF_BRIDGE_MAX_MAC_HEADER_LENGTH (PPPOE_SES_HLEN + ETH_HLEN)
+
+struct brnf_frag_data {
+ local_lock_t bh_lock;
+ char mac[NF_BRIDGE_MAX_MAC_HEADER_LENGTH];
+ u8 encap_size;
+ u8 size;
+ u16 vlan_tci;
+ __be16 vlan_proto;
+};
+
+static DEFINE_PER_CPU(struct brnf_frag_data, brnf_frag_data_storage) = {
+ .bh_lock = INIT_LOCAL_LOCK(bh_lock),
+};
+
+static void nf_bridge_info_free(struct sk_buff *skb)
+{
+ skb_ext_del(skb, SKB_EXT_BRIDGE_NF);
+}
+
+static inline struct net_device *bridge_parent(const struct net_device *dev)
+{
+ struct net_bridge_port *port;
+
+ port = br_port_get_rcu(dev);
+ return port ? port->br->dev : NULL;
+}
+
+static inline struct nf_bridge_info *nf_bridge_unshare(struct sk_buff *skb)
+{
+ return skb_ext_add(skb, SKB_EXT_BRIDGE_NF);
+}
+
+unsigned int nf_bridge_encap_header_len(const struct sk_buff *skb)
+{
+ switch (skb->protocol) {
+ case __cpu_to_be16(ETH_P_8021Q):
+ return VLAN_HLEN;
+ case __cpu_to_be16(ETH_P_PPP_SES):
+ return PPPOE_SES_HLEN;
+ default:
+ return 0;
+ }
+}
+
+static inline void nf_bridge_pull_encap_header(struct sk_buff *skb)
+{
+ unsigned int len = nf_bridge_encap_header_len(skb);
+
+ skb_pull(skb, len);
+ skb->network_header += len;
+}
+
+static inline void nf_bridge_pull_encap_header_rcsum(struct sk_buff *skb)
+{
+ unsigned int len = nf_bridge_encap_header_len(skb);
+
+ skb_pull_rcsum(skb, len);
+ skb->network_header += len;
+}
+
+/* When handing a packet over to the IP layer
+ * check whether we have a skb that is in the
+ * expected format
+ */
+
+static int br_validate_ipv4(struct net *net, struct sk_buff *skb)
+{
+ const struct iphdr *iph;
+ u32 len;
+
+ if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+ goto inhdr_error;
+
+ iph = ip_hdr(skb);
+
+ /* Basic sanity checks */
+ if (iph->ihl < 5 || iph->version != 4)
+ goto inhdr_error;
+
+ if (!pskb_may_pull(skb, iph->ihl*4))
+ goto inhdr_error;
+
+ iph = ip_hdr(skb);
+ if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
+ goto csum_error;
+
+ len = skb_ip_totlen(skb);
+ if (skb->len < len) {
+ __IP_INC_STATS(net, IPSTATS_MIB_INTRUNCATEDPKTS);
+ goto drop;
+ } else if (len < (iph->ihl*4))
+ goto inhdr_error;
+
+ if (pskb_trim_rcsum(skb, len)) {
+ __IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS);
+ goto drop;
+ }
+
+ memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
+ /* We should really parse IP options here but until
+ * somebody who actually uses IP options complains to
+ * us we'll just silently ignore the options because
+ * we're lazy!
+ */
+ return 0;
+
+csum_error:
+ __IP_INC_STATS(net, IPSTATS_MIB_CSUMERRORS);
+inhdr_error:
+ __IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS);
+drop:
+ return -1;
+}
+
+void nf_bridge_update_protocol(struct sk_buff *skb)
+{
+ const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
+
+ switch (nf_bridge->orig_proto) {
+ case BRNF_PROTO_8021Q:
+ skb->protocol = htons(ETH_P_8021Q);
+ break;
+ case BRNF_PROTO_PPPOE:
+ skb->protocol = htons(ETH_P_PPP_SES);
+ break;
+ case BRNF_PROTO_UNCHANGED:
+ break;
+ }
+}
+
+/* Obtain the correct destination MAC address, while preserving the original
+ * source MAC address. If we already know this address, we just copy it. If we
+ * don't, we use the neighbour framework to find out. In both cases, we make
+ * sure that br_handle_frame_finish() is called afterwards.
+ */
+int br_nf_pre_routing_finish_bridge(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ struct neighbour *neigh;
+ struct dst_entry *dst;
+
+ skb->dev = bridge_parent(skb->dev);
+ if (!skb->dev)
+ goto free_skb;
+ dst = skb_dst(skb);
+ neigh = dst_neigh_lookup_skb(dst, skb);
+ if (neigh) {
+ struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
+ int ret;
+
+ if ((READ_ONCE(neigh->nud_state) & NUD_CONNECTED) &&
+ READ_ONCE(neigh->hh.hh_len)) {
+ struct net_device *br_indev;
+
+ br_indev = nf_bridge_get_physindev(skb, net);
+ if (!br_indev) {
+ neigh_release(neigh);
+ goto free_skb;
+ }
+
+ neigh_hh_bridge(&neigh->hh, skb);
+ skb->dev = br_indev;
+
+ ret = br_handle_frame_finish(net, sk, skb);
+ } else {
+ /* the neighbour function below overwrites the complete
+ * MAC header, so we save the Ethernet source address and
+ * protocol number.
+ */
+ skb_copy_from_linear_data_offset(skb,
+ -(ETH_HLEN-ETH_ALEN),
+ nf_bridge->neigh_header,
+ ETH_HLEN-ETH_ALEN);
+ /* tell br_dev_xmit to continue with forwarding */
+ nf_bridge->bridged_dnat = 1;
+ /* FIXME Need to refragment */
+ ret = READ_ONCE(neigh->output)(neigh, skb);
+ }
+ neigh_release(neigh);
+ return ret;
+ }
+free_skb:
+ kfree_skb(skb);
+ return 0;
+}
+
+static inline bool
+br_nf_ipv4_daddr_was_changed(const struct sk_buff *skb,
+ const struct nf_bridge_info *nf_bridge)
+{
+ return ip_hdr(skb)->daddr != nf_bridge->ipv4_daddr;
+}
+
+/* This requires some explaining. If DNAT has taken place,
+ * we will need to fix up the destination Ethernet address.
+ * This is also true when SNAT takes place (for the reply direction).
+ *
+ * There are two cases to consider:
+ * 1. The packet was DNAT'ed to a device in the same bridge
+ * port group as it was received on. We can still bridge
+ * the packet.
+ * 2. The packet was DNAT'ed to a different device, either
+ * a non-bridged device or another bridge port group.
+ * The packet will need to be routed.
+ *
+ * The correct way of distinguishing between these two cases is to
+ * call ip_route_input() and to look at skb->dst->dev, which is
+ * changed to the destination device if ip_route_input() succeeds.
+ *
+ * Let's first consider the case that ip_route_input() succeeds:
+ *
+ * If the output device equals the logical bridge device the packet
+ * came in on, we can consider this bridging. The corresponding MAC
+ * address will be obtained in br_nf_pre_routing_finish_bridge.
+ * Otherwise, the packet is considered to be routed and we just
+ * change the destination MAC address so that the packet will
+ * later be passed up to the IP stack to be routed. For a redirected
+ * packet, ip_route_input() will give back the localhost as output device,
+ * which differs from the bridge device.
+ *
+ * Let's now consider the case that ip_route_input() fails:
+ *
+ * This can be because the destination address is martian, in which case
+ * the packet will be dropped.
+ * If IP forwarding is disabled, ip_route_input() will fail, while
+ * ip_route_output_key() can return success. The source
+ * address for ip_route_output_key() is set to zero, so ip_route_output_key()
+ * thinks we're handling a locally generated packet and won't care
+ * if IP forwarding is enabled. If the output device equals the logical bridge
+ * device, we proceed as if ip_route_input() succeeded. If it differs from the
+ * logical bridge port or if ip_route_output_key() fails we drop the packet.
+ */
+static int br_nf_pre_routing_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
+ struct net_device *dev = skb->dev, *br_indev;
+ const struct iphdr *iph = ip_hdr(skb);
+ enum skb_drop_reason reason;
+ struct rtable *rt;
+
+ br_indev = nf_bridge_get_physindev(skb, net);
+ if (!br_indev) {
+ kfree_skb(skb);
+ return 0;
+ }
+
+ nf_bridge->frag_max_size = IPCB(skb)->frag_max_size;
+
+ if (nf_bridge->pkt_otherhost) {
+ skb->pkt_type = PACKET_OTHERHOST;
+ nf_bridge->pkt_otherhost = false;
+ }
+ nf_bridge->in_prerouting = 0;
+ if (br_nf_ipv4_daddr_was_changed(skb, nf_bridge)) {
+ reason = ip_route_input(skb, iph->daddr, iph->saddr,
+ ip4h_dscp(iph), dev);
+ if (reason) {
+ kfree_skb_reason(skb, reason);
+ return 0;
+ } else {
+ if (skb_dst(skb)->dev == dev) {
+ skb->dev = br_indev;
+ nf_bridge_update_protocol(skb);
+ nf_bridge_push_encap_header(skb);
+ br_nf_hook_thresh(NF_BR_PRE_ROUTING,
+ net, sk, skb, skb->dev,
+ NULL,
+ br_nf_pre_routing_finish_bridge);
+ return 0;
+ }
+ ether_addr_copy(eth_hdr(skb)->h_dest, dev->dev_addr);
+ skb->pkt_type = PACKET_HOST;
+ }
+ } else {
+ rt = bridge_parent_rtable(br_indev);
+ if (!rt) {
+ kfree_skb(skb);
+ return 0;
+ }
+ skb_dst_drop(skb);
+ skb_dst_set_noref(skb, &rt->dst);
+ }
+
+ skb->dev = br_indev;
+ nf_bridge_update_protocol(skb);
+ nf_bridge_push_encap_header(skb);
+ br_nf_hook_thresh(NF_BR_PRE_ROUTING, net, sk, skb, skb->dev, NULL,
+ br_handle_frame_finish);
+ return 0;
+}
+
+static struct net_device *brnf_get_logical_dev(struct sk_buff *skb,
+ const struct net_device *dev,
+ const struct net *net)
+{
+ struct net_device *vlan, *br;
+ struct brnf_net *brnet = net_generic(net, brnf_net_id);
+
+ br = bridge_parent(dev);
+
+ if (brnet->pass_vlan_indev == 0 || !skb_vlan_tag_present(skb))
+ return br;
+
+ vlan = __vlan_find_dev_deep_rcu(br, skb->vlan_proto,
+ skb_vlan_tag_get(skb) & VLAN_VID_MASK);
+
+ return vlan ? vlan : br;
+}
+
+/* Some common code for IPv4/IPv6 */
+struct net_device *setup_pre_routing(struct sk_buff *skb, const struct net *net)
+{
+ struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
+
+ if (skb->pkt_type == PACKET_OTHERHOST) {
+ skb->pkt_type = PACKET_HOST;
+ nf_bridge->pkt_otherhost = true;
+ }
+
+ nf_bridge->in_prerouting = 1;
+ nf_bridge->physinif = skb->dev->ifindex;
+ skb->dev = brnf_get_logical_dev(skb, skb->dev, net);
+
+ if (skb->protocol == htons(ETH_P_8021Q))
+ nf_bridge->orig_proto = BRNF_PROTO_8021Q;
+ else if (skb->protocol == htons(ETH_P_PPP_SES))
+ nf_bridge->orig_proto = BRNF_PROTO_PPPOE;
+
+ /* Must drop socket now because of tproxy. */
+ skb_orphan(skb);
+ return skb->dev;
+}
+
+/* Direct IPv6 traffic to br_nf_pre_routing_ipv6.
+ * Replicate the checks that IPv4 does on packet reception.
+ * Set skb->dev to the bridge device (i.e. parent of the
+ * receiving device) to make netfilter happy, the REDIRECT
+ * target in particular. Save the original destination IP
+ * address to be able to detect DNAT afterwards. */
+static unsigned int br_nf_pre_routing(void *priv,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct nf_bridge_info *nf_bridge;
+ struct net_bridge_port *p;
+ struct net_bridge *br;
+ __u32 len = nf_bridge_encap_header_len(skb);
+ struct brnf_net *brnet;
+
+ if (unlikely(!pskb_may_pull(skb, len)))
+ return NF_DROP_REASON(skb, SKB_DROP_REASON_PKT_TOO_SMALL, 0);
+
+ p = br_port_get_rcu(state->in);
+ if (p == NULL)
+ return NF_DROP_REASON(skb, SKB_DROP_REASON_DEV_READY, 0);
+ br = p->br;
+
+ brnet = net_generic(state->net, brnf_net_id);
+ if (IS_IPV6(skb) || is_vlan_ipv6(skb, state->net) ||
+ is_pppoe_ipv6(skb, state->net)) {
+ if (!brnet->call_ip6tables &&
+ !br_opt_get(br, BROPT_NF_CALL_IP6TABLES))
+ return NF_ACCEPT;
+ if (!ipv6_mod_enabled()) {
+ pr_warn_once("Module ipv6 is disabled, so call_ip6tables is not supported.");
+ return NF_DROP_REASON(skb, SKB_DROP_REASON_IPV6DISABLED, 0);
+ }
+
+ nf_bridge_pull_encap_header_rcsum(skb);
+ return br_nf_pre_routing_ipv6(priv, skb, state);
+ }
+
+ if (!brnet->call_iptables && !br_opt_get(br, BROPT_NF_CALL_IPTABLES))
+ return NF_ACCEPT;
+
+ if (!IS_IP(skb) && !is_vlan_ip(skb, state->net) &&
+ !is_pppoe_ip(skb, state->net))
+ return NF_ACCEPT;
+
+ nf_bridge_pull_encap_header_rcsum(skb);
+
+ if (br_validate_ipv4(state->net, skb))
+ return NF_DROP_REASON(skb, SKB_DROP_REASON_IP_INHDR, 0);
+
+ if (!nf_bridge_alloc(skb))
+ return NF_DROP_REASON(skb, SKB_DROP_REASON_NOMEM, 0);
+ if (!setup_pre_routing(skb, state->net))
+ return NF_DROP_REASON(skb, SKB_DROP_REASON_DEV_READY, 0);
+
+ nf_bridge = nf_bridge_info_get(skb);
+ nf_bridge->ipv4_daddr = ip_hdr(skb)->daddr;
+
+ skb->protocol = htons(ETH_P_IP);
+ skb->transport_header = skb->network_header + ip_hdr(skb)->ihl * 4;
+
+ NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, state->net, state->sk, skb,
+ skb->dev, NULL,
+ br_nf_pre_routing_finish);
+
+ return NF_STOLEN;
+}
+
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+/* conntracks' nf_confirm logic cannot handle cloned skbs referencing
+ * the same nf_conn entry, which will happen for multicast (broadcast)
+ * Frames on bridges.
+ *
+ * Example:
+ * macvlan0
+ * br0
+ * ethX ethY
+ *
+ * ethX (or Y) receives multicast or broadcast packet containing
+ * an IP packet, not yet in conntrack table.
+ *
+ * 1. skb passes through bridge and fake-ip (br_netfilter)Prerouting.
+ * -> skb->_nfct now references a unconfirmed entry
+ * 2. skb is broad/mcast packet. bridge now passes clones out on each bridge
+ * interface.
+ * 3. skb gets passed up the stack.
+ * 4. In macvlan case, macvlan driver retains clone(s) of the mcast skb
+ * and schedules a work queue to send them out on the lower devices.
+ *
+ * The clone skb->_nfct is not a copy, it is the same entry as the
+ * original skb. The macvlan rx handler then returns RX_HANDLER_PASS.
+ * 5. Normal conntrack hooks (in NF_INET_LOCAL_IN) confirm the orig skb.
+ *
+ * The Macvlan broadcast worker and normal confirm path will race.
+ *
+ * This race will not happen if step 2 already confirmed a clone. In that
+ * case later steps perform skb_clone() with skb->_nfct already confirmed (in
+ * hash table). This works fine.
+ *
+ * But such confirmation won't happen when eb/ip/nftables rules dropped the
+ * packets before they reached the nf_confirm step in postrouting.
+ *
+ * Work around this problem by explicit confirmation of the entry at
+ * LOCAL_IN time, before upper layer has a chance to clone the unconfirmed
+ * entry.
+ *
+ */
+static unsigned int br_nf_local_in(void *priv,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ bool promisc = BR_INPUT_SKB_CB(skb)->promisc;
+ struct nf_conntrack *nfct = skb_nfct(skb);
+ const struct nf_ct_hook *ct_hook;
+ struct nf_conn *ct;
+ int ret;
+
+ if (promisc) {
+ nf_reset_ct(skb);
+ return NF_ACCEPT;
+ }
+
+ if (!nfct || skb->pkt_type == PACKET_HOST)
+ return NF_ACCEPT;
+
+ ct = container_of(nfct, struct nf_conn, ct_general);
+ if (likely(nf_ct_is_confirmed(ct)))
+ return NF_ACCEPT;
+
+ if (WARN_ON_ONCE(refcount_read(&nfct->use) != 1)) {
+ nf_reset_ct(skb);
+ return NF_ACCEPT;
+ }
+
+ WARN_ON_ONCE(skb_shared(skb));
+
+ /* We can't call nf_confirm here, it would create a dependency
+ * on nf_conntrack module.
+ */
+ ct_hook = rcu_dereference(nf_ct_hook);
+ if (!ct_hook) {
+ skb->_nfct = 0ul;
+ nf_conntrack_put(nfct);
+ return NF_ACCEPT;
+ }
+
+ nf_bridge_pull_encap_header(skb);
+ ret = ct_hook->confirm(skb);
+ switch (ret & NF_VERDICT_MASK) {
+ case NF_STOLEN:
+ return NF_STOLEN;
+ default:
+ nf_bridge_push_encap_header(skb);
+ break;
+ }
+
+ return ret;
+}
+#endif
+
+/* PF_BRIDGE/FORWARD *************************************************/
+static int br_nf_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
+ struct net_device *in;
+
+ if (!IS_ARP(skb) && !is_vlan_arp(skb, net)) {
+
+ if (skb->protocol == htons(ETH_P_IP))
+ nf_bridge->frag_max_size = IPCB(skb)->frag_max_size;
+
+ if (skb->protocol == htons(ETH_P_IPV6))
+ nf_bridge->frag_max_size = IP6CB(skb)->frag_max_size;
+
+ in = nf_bridge_get_physindev(skb, net);
+ if (!in) {
+ kfree_skb(skb);
+ return 0;
+ }
+ if (nf_bridge->pkt_otherhost) {
+ skb->pkt_type = PACKET_OTHERHOST;
+ nf_bridge->pkt_otherhost = false;
+ }
+ nf_bridge_update_protocol(skb);
+ } else {
+ in = *((struct net_device **)(skb->cb));
+ }
+ nf_bridge_push_encap_header(skb);
+
+ br_nf_hook_thresh(NF_BR_FORWARD, net, sk, skb, in, skb->dev,
+ br_forward_finish);
+ return 0;
+}
+
+
+static unsigned int br_nf_forward_ip(struct sk_buff *skb,
+ const struct nf_hook_state *state,
+ u8 pf)
+{
+ struct nf_bridge_info *nf_bridge;
+ struct net_device *parent;
+
+ nf_bridge = nf_bridge_info_get(skb);
+ if (!nf_bridge)
+ return NF_ACCEPT;
+
+ /* Need exclusive nf_bridge_info since we might have multiple
+ * different physoutdevs. */
+ if (!nf_bridge_unshare(skb))
+ return NF_DROP_REASON(skb, SKB_DROP_REASON_NOMEM, 0);
+
+ nf_bridge = nf_bridge_info_get(skb);
+ if (!nf_bridge)
+ return NF_DROP_REASON(skb, SKB_DROP_REASON_NOMEM, 0);
+
+ parent = bridge_parent(state->out);
+ if (!parent)
+ return NF_DROP_REASON(skb, SKB_DROP_REASON_DEV_READY, 0);
+
+ nf_bridge_pull_encap_header(skb);
+
+ if (skb->pkt_type == PACKET_OTHERHOST) {
+ skb->pkt_type = PACKET_HOST;
+ nf_bridge->pkt_otherhost = true;
+ }
+
+ if (pf == NFPROTO_IPV4) {
+ if (br_validate_ipv4(state->net, skb))
+ return NF_DROP_REASON(skb, SKB_DROP_REASON_IP_INHDR, 0);
+ IPCB(skb)->frag_max_size = nf_bridge->frag_max_size;
+ skb->protocol = htons(ETH_P_IP);
+ } else if (pf == NFPROTO_IPV6) {
+ if (br_validate_ipv6(state->net, skb))
+ return NF_DROP_REASON(skb, SKB_DROP_REASON_IP_INHDR, 0);
+ IP6CB(skb)->frag_max_size = nf_bridge->frag_max_size;
+ skb->protocol = htons(ETH_P_IPV6);
+ } else {
+ WARN_ON_ONCE(1);
+ return NF_DROP;
+ }
+
+ nf_bridge->physoutdev = skb->dev;
+
+ NF_HOOK(pf, NF_INET_FORWARD, state->net, NULL, skb,
+ brnf_get_logical_dev(skb, state->in, state->net),
+ parent, br_nf_forward_finish);
+
+ return NF_STOLEN;
+}
+
+static unsigned int br_nf_forward_arp(struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct net_bridge_port *p;
+ struct net_bridge *br;
+ struct net_device **d = (struct net_device **)(skb->cb);
+ struct brnf_net *brnet;
+
+ p = br_port_get_rcu(state->out);
+ if (p == NULL)
+ return NF_ACCEPT;
+ br = p->br;
+
+ brnet = net_generic(state->net, brnf_net_id);
+ if (!brnet->call_arptables && !br_opt_get(br, BROPT_NF_CALL_ARPTABLES))
+ return NF_ACCEPT;
+
+ if (is_vlan_arp(skb, state->net))
+ nf_bridge_pull_encap_header(skb);
+
+ if (unlikely(!pskb_may_pull(skb, sizeof(struct arphdr))))
+ return NF_DROP_REASON(skb, SKB_DROP_REASON_PKT_TOO_SMALL, 0);
+
+ if (arp_hdr(skb)->ar_pln != 4) {
+ if (is_vlan_arp(skb, state->net))
+ nf_bridge_push_encap_header(skb);
+ return NF_ACCEPT;
+ }
+ *d = state->in;
+ NF_HOOK(NFPROTO_ARP, NF_ARP_FORWARD, state->net, state->sk, skb,
+ state->in, state->out, br_nf_forward_finish);
+
+ return NF_STOLEN;
+}
+
+/* This is the 'purely bridged' case. For IP, we pass the packet to
+ * netfilter with indev and outdev set to the bridge device,
+ * but we are still able to filter on the 'real' indev/outdev
+ * because of the physdev module. For ARP, indev and outdev are the
+ * bridge ports.
+ */
+static unsigned int br_nf_forward(void *priv,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ if (IS_IP(skb) || is_vlan_ip(skb, state->net) ||
+ is_pppoe_ip(skb, state->net))
+ return br_nf_forward_ip(skb, state, NFPROTO_IPV4);
+ if (IS_IPV6(skb) || is_vlan_ipv6(skb, state->net) ||
+ is_pppoe_ipv6(skb, state->net))
+ return br_nf_forward_ip(skb, state, NFPROTO_IPV6);
+ if (IS_ARP(skb) || is_vlan_arp(skb, state->net))
+ return br_nf_forward_arp(skb, state);
+
+ return NF_ACCEPT;
+}
+
+static int br_nf_push_frag_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ struct brnf_frag_data *data;
+ int err;
+
+ data = this_cpu_ptr(&brnf_frag_data_storage);
+ err = skb_cow_head(skb, data->size);
+
+ if (err) {
+ kfree_skb(skb);
+ return 0;
+ }
+
+ if (data->vlan_proto)
+ __vlan_hwaccel_put_tag(skb, data->vlan_proto, data->vlan_tci);
+
+ skb_copy_to_linear_data_offset(skb, -data->size, data->mac, data->size);
+ __skb_push(skb, data->encap_size);
+
+ nf_bridge_info_free(skb);
+ return br_dev_queue_push_xmit(net, sk, skb);
+}
+
+static int
+br_nf_ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
+ int (*output)(struct net *, struct sock *, struct sk_buff *))
+{
+ unsigned int mtu = ip_skb_dst_mtu(sk, skb);
+ struct iphdr *iph = ip_hdr(skb);
+
+ if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) ||
+ (IPCB(skb)->frag_max_size &&
+ IPCB(skb)->frag_max_size > mtu))) {
+ IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
+ kfree_skb(skb);
+ return -EMSGSIZE;
+ }
+
+ return ip_do_fragment(net, sk, skb, output);
+}
+
+static unsigned int nf_bridge_mtu_reduction(const struct sk_buff *skb)
+{
+ const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
+
+ if (nf_bridge->orig_proto == BRNF_PROTO_PPPOE)
+ return PPPOE_SES_HLEN;
+ return 0;
+}
+
+static int br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
+ unsigned int mtu, mtu_reserved;
+ int ret;
+
+ mtu_reserved = nf_bridge_mtu_reduction(skb);
+ mtu = skb->dev->mtu;
+
+ if (nf_bridge->pkt_otherhost) {
+ skb->pkt_type = PACKET_OTHERHOST;
+ nf_bridge->pkt_otherhost = false;
+ }
+
+ if (nf_bridge->frag_max_size && nf_bridge->frag_max_size < mtu)
+ mtu = nf_bridge->frag_max_size;
+
+ nf_bridge_update_protocol(skb);
+ nf_bridge_push_encap_header(skb);
+
+ if (skb_is_gso(skb) || skb->len + mtu_reserved <= mtu) {
+ nf_bridge_info_free(skb);
+ return br_dev_queue_push_xmit(net, sk, skb);
+ }
+
+ /* Fragmentation on metadata/template dst is not supported */
+ if (unlikely(!skb_valid_dst(skb)))
+ goto drop;
+
+ /* This is wrong! We should preserve the original fragment
+ * boundaries by preserving frag_list rather than refragmenting.
+ */
+ if (IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) &&
+ skb->protocol == htons(ETH_P_IP)) {
+ struct brnf_frag_data *data;
+
+ if (br_validate_ipv4(net, skb))
+ goto drop;
+
+ IPCB(skb)->frag_max_size = nf_bridge->frag_max_size;
+
+ local_lock_nested_bh(&brnf_frag_data_storage.bh_lock);
+ data = this_cpu_ptr(&brnf_frag_data_storage);
+
+ if (skb_vlan_tag_present(skb)) {
+ data->vlan_tci = skb->vlan_tci;
+ data->vlan_proto = skb->vlan_proto;
+ } else {
+ data->vlan_proto = 0;
+ }
+
+ data->encap_size = nf_bridge_encap_header_len(skb);
+ data->size = ETH_HLEN + data->encap_size;
+
+ skb_copy_from_linear_data_offset(skb, -data->size, data->mac,
+ data->size);
+
+ ret = br_nf_ip_fragment(net, sk, skb, br_nf_push_frag_xmit);
+ local_unlock_nested_bh(&brnf_frag_data_storage.bh_lock);
+ return ret;
+ }
+ if (IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) &&
+ skb->protocol == htons(ETH_P_IPV6)) {
+ const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops();
+ struct brnf_frag_data *data;
+
+ if (br_validate_ipv6(net, skb))
+ goto drop;
+
+ IP6CB(skb)->frag_max_size = nf_bridge->frag_max_size;
+
+ local_lock_nested_bh(&brnf_frag_data_storage.bh_lock);
+ data = this_cpu_ptr(&brnf_frag_data_storage);
+ data->encap_size = nf_bridge_encap_header_len(skb);
+ data->size = ETH_HLEN + data->encap_size;
+
+ skb_copy_from_linear_data_offset(skb, -data->size, data->mac,
+ data->size);
+
+ if (v6ops) {
+ ret = v6ops->fragment(net, sk, skb, br_nf_push_frag_xmit);
+ local_unlock_nested_bh(&brnf_frag_data_storage.bh_lock);
+ return ret;
+ }
+ local_unlock_nested_bh(&brnf_frag_data_storage.bh_lock);
+
+ kfree_skb(skb);
+ return -EMSGSIZE;
+ }
+ nf_bridge_info_free(skb);
+ return br_dev_queue_push_xmit(net, sk, skb);
+ drop:
+ kfree_skb(skb);
+ return 0;
+}
+
+/* PF_BRIDGE/POST_ROUTING ********************************************/
+static unsigned int br_nf_post_routing(void *priv,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
+ struct net_device *realoutdev = bridge_parent(skb->dev);
+ u_int8_t pf;
+
+ /* if nf_bridge is set, but ->physoutdev is NULL, this packet came in
+ * on a bridge, but was delivered locally and is now being routed:
+ *
+ * POST_ROUTING was already invoked from the ip stack.
+ */
+ if (!nf_bridge || !nf_bridge->physoutdev)
+ return NF_ACCEPT;
+
+ if (!realoutdev)
+ return NF_DROP_REASON(skb, SKB_DROP_REASON_DEV_READY, 0);
+
+ if (IS_IP(skb) || is_vlan_ip(skb, state->net) ||
+ is_pppoe_ip(skb, state->net))
+ pf = NFPROTO_IPV4;
+ else if (IS_IPV6(skb) || is_vlan_ipv6(skb, state->net) ||
+ is_pppoe_ipv6(skb, state->net))
+ pf = NFPROTO_IPV6;
+ else
+ return NF_ACCEPT;
+
+ if (skb->pkt_type == PACKET_OTHERHOST) {
+ skb->pkt_type = PACKET_HOST;
+ nf_bridge->pkt_otherhost = true;
+ }
+
+ nf_bridge_pull_encap_header(skb);
+ if (pf == NFPROTO_IPV4)
+ skb->protocol = htons(ETH_P_IP);
+ else
+ skb->protocol = htons(ETH_P_IPV6);
+
+ NF_HOOK(pf, NF_INET_POST_ROUTING, state->net, state->sk, skb,
+ NULL, realoutdev,
+ br_nf_dev_queue_xmit);
+
+ return NF_STOLEN;
+}
+
+/* IP/SABOTAGE *****************************************************/
+/* Don't hand locally destined packets to PF_INET(6)/PRE_ROUTING
+ * for the second time. */
+static unsigned int ip_sabotage_in(void *priv,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
+
+ if (nf_bridge) {
+ if (nf_bridge->sabotage_in_done)
+ return NF_ACCEPT;
+
+ if (!nf_bridge->in_prerouting &&
+ !netif_is_l3_master(skb->dev) &&
+ !netif_is_l3_slave(skb->dev)) {
+ nf_bridge->sabotage_in_done = 1;
+ state->okfn(state->net, state->sk, skb);
+ return NF_STOLEN;
+ }
+ }
+
+ return NF_ACCEPT;
+}
+
+/* This is called when br_netfilter has called into iptables/netfilter,
+ * and DNAT has taken place on a bridge-forwarded packet.
+ *
+ * neigh->output has created a new MAC header, with local br0 MAC
+ * as saddr.
+ *
+ * This restores the original MAC saddr of the bridged packet
+ * before invoking bridge forward logic to transmit the packet.
+ */
+static void br_nf_pre_routing_finish_bridge_slow(struct sk_buff *skb)
+{
+ struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
+ struct net_device *br_indev;
+
+ br_indev = nf_bridge_get_physindev(skb, dev_net(skb->dev));
+ if (!br_indev) {
+ kfree_skb(skb);
+ return;
+ }
+
+ skb_pull(skb, ETH_HLEN);
+ nf_bridge->bridged_dnat = 0;
+
+ BUILD_BUG_ON(sizeof(nf_bridge->neigh_header) != (ETH_HLEN - ETH_ALEN));
+
+ skb_copy_to_linear_data_offset(skb, -(ETH_HLEN - ETH_ALEN),
+ nf_bridge->neigh_header,
+ ETH_HLEN - ETH_ALEN);
+ skb->dev = br_indev;
+
+ nf_bridge->physoutdev = NULL;
+ br_handle_frame_finish(dev_net(skb->dev), NULL, skb);
+}
+
+static int br_nf_dev_xmit(struct sk_buff *skb)
+{
+ const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
+
+ if (nf_bridge && nf_bridge->bridged_dnat) {
+ br_nf_pre_routing_finish_bridge_slow(skb);
+ return 1;
+ }
+ return 0;
+}
+
+static const struct nf_br_ops br_ops = {
+ .br_dev_xmit_hook = br_nf_dev_xmit,
+};
+
+/* For br_nf_post_routing, we need (prio = NF_BR_PRI_LAST), because
+ * br_dev_queue_push_xmit is called afterwards */
+static const struct nf_hook_ops br_nf_ops[] = {
+ {
+ .hook = br_nf_pre_routing,
+ .pf = NFPROTO_BRIDGE,
+ .hooknum = NF_BR_PRE_ROUTING,
+ .priority = NF_BR_PRI_BRNF,
+ },
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+ {
+ .hook = br_nf_local_in,
+ .pf = NFPROTO_BRIDGE,
+ .hooknum = NF_BR_LOCAL_IN,
+ .priority = NF_BR_PRI_LAST,
+ },
+#endif
+ {
+ .hook = br_nf_forward,
+ .pf = NFPROTO_BRIDGE,
+ .hooknum = NF_BR_FORWARD,
+ .priority = NF_BR_PRI_BRNF,
+ },
+ {
+ .hook = br_nf_post_routing,
+ .pf = NFPROTO_BRIDGE,
+ .hooknum = NF_BR_POST_ROUTING,
+ .priority = NF_BR_PRI_LAST,
+ },
+ {
+ .hook = ip_sabotage_in,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_PRE_ROUTING,
+ .priority = NF_IP_PRI_FIRST,
+ },
+ {
+ .hook = ip_sabotage_in,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_PRE_ROUTING,
+ .priority = NF_IP6_PRI_FIRST,
+ },
+};
+
+static int brnf_device_event(struct notifier_block *unused, unsigned long event,
+ void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct brnf_net *brnet;
+ struct net *net;
+ int ret;
+
+ if (event != NETDEV_REGISTER || !netif_is_bridge_master(dev))
+ return NOTIFY_DONE;
+
+ ASSERT_RTNL();
+
+ net = dev_net(dev);
+ brnet = net_generic(net, brnf_net_id);
+ if (brnet->enabled)
+ return NOTIFY_OK;
+
+ ret = nf_register_net_hooks(net, br_nf_ops, ARRAY_SIZE(br_nf_ops));
+ if (ret)
+ return NOTIFY_BAD;
+
+ brnet->enabled = true;
+ return NOTIFY_OK;
+}
+
+static struct notifier_block brnf_notifier __read_mostly = {
+ .notifier_call = brnf_device_event,
+};
+
+/* recursively invokes nf_hook_slow (again), skipping already-called
+ * hooks (< NF_BR_PRI_BRNF).
+ *
+ * Called with rcu read lock held.
+ */
+int br_nf_hook_thresh(unsigned int hook, struct net *net,
+ struct sock *sk, struct sk_buff *skb,
+ struct net_device *indev,
+ struct net_device *outdev,
+ int (*okfn)(struct net *, struct sock *,
+ struct sk_buff *))
+{
+ const struct nf_hook_entries *e;
+ struct nf_hook_state state;
+ struct nf_hook_ops **ops;
+ unsigned int i;
+ int ret;
+
+ e = rcu_dereference(net->nf.hooks_bridge[hook]);
+ if (!e)
+ return okfn(net, sk, skb);
+
+ ops = nf_hook_entries_get_hook_ops(e);
+ for (i = 0; i < e->num_hook_entries; i++) {
+ /* These hooks have already been called */
+ if (ops[i]->priority < NF_BR_PRI_BRNF)
+ continue;
+
+ /* These hooks have not been called yet, run them. */
+ if (ops[i]->priority > NF_BR_PRI_BRNF)
+ break;
+
+ /* take a closer look at NF_BR_PRI_BRNF. */
+ if (ops[i]->hook == br_nf_pre_routing) {
+ /* This hook diverted the skb to this function,
+ * hooks after this have not been run yet.
+ */
+ i++;
+ break;
+ }
+ }
+
+ nf_hook_state_init(&state, hook, NFPROTO_BRIDGE, indev, outdev,
+ sk, net, okfn);
+
+ ret = nf_hook_slow(skb, &state, e, i);
+ if (ret == 1)
+ ret = okfn(net, sk, skb);
+
+ return ret;
+}
+
+#ifdef CONFIG_SYSCTL
+static
+int brnf_sysctl_call_tables(const struct ctl_table *ctl, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ int ret;
+
+ ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
+
+ if (write && *(int *)(ctl->data))
+ *(int *)(ctl->data) = 1;
+ return ret;
+}
+
+static struct ctl_table brnf_table[] = {
+ {
+ .procname = "bridge-nf-call-arptables",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = brnf_sysctl_call_tables,
+ },
+ {
+ .procname = "bridge-nf-call-iptables",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = brnf_sysctl_call_tables,
+ },
+ {
+ .procname = "bridge-nf-call-ip6tables",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = brnf_sysctl_call_tables,
+ },
+ {
+ .procname = "bridge-nf-filter-vlan-tagged",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = brnf_sysctl_call_tables,
+ },
+ {
+ .procname = "bridge-nf-filter-pppoe-tagged",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = brnf_sysctl_call_tables,
+ },
+ {
+ .procname = "bridge-nf-pass-vlan-input-dev",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = brnf_sysctl_call_tables,
+ },
+};
+
+static inline void br_netfilter_sysctl_default(struct brnf_net *brnf)
+{
+ brnf->call_iptables = 1;
+ brnf->call_ip6tables = 1;
+ brnf->call_arptables = 1;
+ brnf->filter_vlan_tagged = 0;
+ brnf->filter_pppoe_tagged = 0;
+ brnf->pass_vlan_indev = 0;
+}
+
+static int br_netfilter_sysctl_init_net(struct net *net)
+{
+ struct ctl_table *table = brnf_table;
+ struct brnf_net *brnet;
+
+ if (!net_eq(net, &init_net)) {
+ table = kmemdup(table, sizeof(brnf_table), GFP_KERNEL);
+ if (!table)
+ return -ENOMEM;
+ }
+
+ brnet = net_generic(net, brnf_net_id);
+ table[0].data = &brnet->call_arptables;
+ table[1].data = &brnet->call_iptables;
+ table[2].data = &brnet->call_ip6tables;
+ table[3].data = &brnet->filter_vlan_tagged;
+ table[4].data = &brnet->filter_pppoe_tagged;
+ table[5].data = &brnet->pass_vlan_indev;
+
+ br_netfilter_sysctl_default(brnet);
+
+ brnet->ctl_hdr = register_net_sysctl_sz(net, "net/bridge", table,
+ ARRAY_SIZE(brnf_table));
+ if (!brnet->ctl_hdr) {
+ if (!net_eq(net, &init_net))
+ kfree(table);
+
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void br_netfilter_sysctl_exit_net(struct net *net,
+ struct brnf_net *brnet)
+{
+ const struct ctl_table *table = brnet->ctl_hdr->ctl_table_arg;
+
+ unregister_net_sysctl_table(brnet->ctl_hdr);
+ if (!net_eq(net, &init_net))
+ kfree(table);
+}
+
+static int __net_init brnf_init_net(struct net *net)
+{
+ return br_netfilter_sysctl_init_net(net);
+}
+#endif
+
+static void __net_exit brnf_exit_net(struct net *net)
+{
+ struct brnf_net *brnet;
+
+ brnet = net_generic(net, brnf_net_id);
+ if (brnet->enabled) {
+ nf_unregister_net_hooks(net, br_nf_ops, ARRAY_SIZE(br_nf_ops));
+ brnet->enabled = false;
+ }
+
+#ifdef CONFIG_SYSCTL
+ br_netfilter_sysctl_exit_net(net, brnet);
+#endif
+}
+
+static struct pernet_operations brnf_net_ops __read_mostly = {
+#ifdef CONFIG_SYSCTL
+ .init = brnf_init_net,
+#endif
+ .exit = brnf_exit_net,
+ .id = &brnf_net_id,
+ .size = sizeof(struct brnf_net),
+};
+
+static int __init br_netfilter_init(void)
+{
+ int ret;
+
+ ret = register_pernet_subsys(&brnf_net_ops);
+ if (ret < 0)
+ return ret;
+
+ ret = register_netdevice_notifier(&brnf_notifier);
+ if (ret < 0) {
+ unregister_pernet_subsys(&brnf_net_ops);
+ return ret;
+ }
+
+ RCU_INIT_POINTER(nf_br_ops, &br_ops);
+ printk(KERN_NOTICE "Bridge firewalling registered\n");
+ return 0;
+}
+
+static void __exit br_netfilter_fini(void)
+{
+ RCU_INIT_POINTER(nf_br_ops, NULL);
+ unregister_netdevice_notifier(&brnf_notifier);
+ unregister_pernet_subsys(&brnf_net_ops);
+}
+
+module_init(br_netfilter_init);
+module_exit(br_netfilter_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Lennert Buytenhek <buytenh@gnu.org>");
+MODULE_AUTHOR("Bart De Schuymer <bdschuym@pandora.be>");
+MODULE_DESCRIPTION("Linux ethernet netfilter firewall bridge");
diff --git a/net/bridge/br_netfilter_ipv6.c b/net/bridge/br_netfilter_ipv6.c
new file mode 100644
index 000000000000..e0421eaa3abc
--- /dev/null
+++ b/net/bridge/br_netfilter_ipv6.c
@@ -0,0 +1,189 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Handle firewalling
+ * Linux ethernet bridge
+ *
+ * Authors:
+ * Lennert Buytenhek <buytenh@gnu.org>
+ * Bart De Schuymer <bdschuym@pandora.be>
+ *
+ * Lennert dedicates this file to Kerstin Wurdinger.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/ip.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/if_arp.h>
+#include <linux/if_ether.h>
+#include <linux/if_vlan.h>
+#include <linux/if_pppox.h>
+#include <linux/ppp_defs.h>
+#include <linux/netfilter_bridge.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter_arp.h>
+#include <linux/in_route.h>
+#include <linux/inetdevice.h>
+
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/addrconf.h>
+#include <net/route.h>
+#include <net/netfilter/br_netfilter.h>
+
+#include <linux/uaccess.h>
+#include "br_private.h"
+#ifdef CONFIG_SYSCTL
+#include <linux/sysctl.h>
+#endif
+
+int br_validate_ipv6(struct net *net, struct sk_buff *skb)
+{
+ const struct ipv6hdr *hdr;
+ struct inet6_dev *idev = __in6_dev_get(skb->dev);
+ u32 pkt_len;
+ u8 ip6h_len = sizeof(struct ipv6hdr);
+
+ if (!pskb_may_pull(skb, ip6h_len))
+ goto inhdr_error;
+
+ if (skb->len < ip6h_len)
+ goto drop;
+
+ hdr = ipv6_hdr(skb);
+
+ if (hdr->version != 6)
+ goto inhdr_error;
+
+ pkt_len = ntohs(hdr->payload_len);
+ if (hdr->nexthdr == NEXTHDR_HOP && nf_ip6_check_hbh_len(skb, &pkt_len))
+ goto drop;
+
+ if (pkt_len + ip6h_len > skb->len) {
+ __IP6_INC_STATS(net, idev,
+ IPSTATS_MIB_INTRUNCATEDPKTS);
+ goto drop;
+ }
+ if (pskb_trim_rcsum(skb, pkt_len + ip6h_len)) {
+ __IP6_INC_STATS(net, idev,
+ IPSTATS_MIB_INDISCARDS);
+ goto drop;
+ }
+
+ memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm));
+ /* No IP options in IPv6 header; however it should be
+ * checked if some next headers need special treatment
+ */
+ return 0;
+
+inhdr_error:
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
+drop:
+ return -1;
+}
+
+static inline bool
+br_nf_ipv6_daddr_was_changed(const struct sk_buff *skb,
+ const struct nf_bridge_info *nf_bridge)
+{
+ return memcmp(&nf_bridge->ipv6_daddr, &ipv6_hdr(skb)->daddr,
+ sizeof(ipv6_hdr(skb)->daddr)) != 0;
+}
+
+/* PF_BRIDGE/PRE_ROUTING: Undo the changes made for ip6tables
+ * PREROUTING and continue the bridge PRE_ROUTING hook. See comment
+ * for br_nf_pre_routing_finish(), same logic is used here but
+ * equivalent IPv6 function ip6_route_input() called indirectly.
+ */
+static int br_nf_pre_routing_finish_ipv6(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
+ struct rtable *rt;
+ struct net_device *dev = skb->dev, *br_indev;
+ const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops();
+
+ br_indev = nf_bridge_get_physindev(skb, net);
+ if (!br_indev) {
+ kfree_skb(skb);
+ return 0;
+ }
+
+ nf_bridge->frag_max_size = IP6CB(skb)->frag_max_size;
+
+ if (nf_bridge->pkt_otherhost) {
+ skb->pkt_type = PACKET_OTHERHOST;
+ nf_bridge->pkt_otherhost = false;
+ }
+ nf_bridge->in_prerouting = 0;
+ if (br_nf_ipv6_daddr_was_changed(skb, nf_bridge)) {
+ skb_dst_drop(skb);
+ v6ops->route_input(skb);
+
+ if (skb_dst(skb)->error) {
+ kfree_skb(skb);
+ return 0;
+ }
+
+ if (skb_dst(skb)->dev == dev) {
+ skb->dev = br_indev;
+ nf_bridge_update_protocol(skb);
+ nf_bridge_push_encap_header(skb);
+ br_nf_hook_thresh(NF_BR_PRE_ROUTING,
+ net, sk, skb, skb->dev, NULL,
+ br_nf_pre_routing_finish_bridge);
+ return 0;
+ }
+ ether_addr_copy(eth_hdr(skb)->h_dest, dev->dev_addr);
+ skb->pkt_type = PACKET_HOST;
+ } else {
+ rt = bridge_parent_rtable(br_indev);
+ if (!rt) {
+ kfree_skb(skb);
+ return 0;
+ }
+ skb_dst_drop(skb);
+ skb_dst_set_noref(skb, &rt->dst);
+ }
+
+ skb->dev = br_indev;
+ nf_bridge_update_protocol(skb);
+ nf_bridge_push_encap_header(skb);
+ br_nf_hook_thresh(NF_BR_PRE_ROUTING, net, sk, skb,
+ skb->dev, NULL, br_handle_frame_finish);
+
+ return 0;
+}
+
+/* Replicate the checks that IPv6 does on packet reception and pass the packet
+ * to ip6tables.
+ */
+unsigned int br_nf_pre_routing_ipv6(void *priv,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct nf_bridge_info *nf_bridge;
+
+ if (br_validate_ipv6(state->net, skb))
+ return NF_DROP_REASON(skb, SKB_DROP_REASON_IP_INHDR, 0);
+
+ nf_bridge = nf_bridge_alloc(skb);
+ if (!nf_bridge)
+ return NF_DROP_REASON(skb, SKB_DROP_REASON_NOMEM, 0);
+ if (!setup_pre_routing(skb, state->net))
+ return NF_DROP_REASON(skb, SKB_DROP_REASON_DEV_READY, 0);
+
+ nf_bridge = nf_bridge_info_get(skb);
+ nf_bridge->ipv6_daddr = ipv6_hdr(skb)->daddr;
+
+ skb->protocol = htons(ETH_P_IPV6);
+ skb->transport_header = skb->network_header + sizeof(struct ipv6hdr);
+
+ NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, state->net, state->sk, skb,
+ skb->dev, NULL,
+ br_nf_pre_routing_finish_ipv6);
+
+ return NF_STOLEN;
+}
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index a9139682c49b..0264730938f4 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -1,192 +1,1952 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Bridge netlink control interface
*
* Authors:
* Stephen Hemminger <shemminger@osdl.org>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/kernel.h>
-#include <linux/rtnetlink.h>
-#include <net/netlink.h>
+#include <linux/slab.h>
+#include <linux/etherdevice.h>
+#include <net/rtnetlink.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <uapi/linux/if_bridge.h>
+
#include "br_private.h"
+#include "br_private_stp.h"
+#include "br_private_cfm.h"
+#include "br_private_tunnel.h"
+#include "br_private_mcast_eht.h"
+
+static int __get_num_vlan_infos(struct net_bridge_vlan_group *vg,
+ u32 filter_mask)
+{
+ struct net_bridge_vlan *v;
+ u16 vid_range_start = 0, vid_range_end = 0, vid_range_flags = 0;
+ u16 flags, pvid;
+ int num_vlans = 0;
+
+ if (!(filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED))
+ return 0;
+
+ pvid = br_get_pvid(vg);
+ /* Count number of vlan infos */
+ list_for_each_entry_rcu(v, &vg->vlan_list, vlist) {
+ flags = 0;
+ /* only a context, bridge vlan not activated */
+ if (!br_vlan_should_use(v))
+ continue;
+ if (v->vid == pvid)
+ flags |= BRIDGE_VLAN_INFO_PVID;
+
+ if (v->flags & BRIDGE_VLAN_INFO_UNTAGGED)
+ flags |= BRIDGE_VLAN_INFO_UNTAGGED;
-static inline size_t br_nlmsg_size(void)
+ if (vid_range_start == 0) {
+ goto initvars;
+ } else if ((v->vid - vid_range_end) == 1 &&
+ flags == vid_range_flags) {
+ vid_range_end = v->vid;
+ continue;
+ } else {
+ if ((vid_range_end - vid_range_start) > 0)
+ num_vlans += 2;
+ else
+ num_vlans += 1;
+ }
+initvars:
+ vid_range_start = v->vid;
+ vid_range_end = v->vid;
+ vid_range_flags = flags;
+ }
+
+ if (vid_range_start != 0) {
+ if ((vid_range_end - vid_range_start) > 0)
+ num_vlans += 2;
+ else
+ num_vlans += 1;
+ }
+
+ return num_vlans;
+}
+
+static int br_get_num_vlan_infos(struct net_bridge_vlan_group *vg,
+ u32 filter_mask)
+{
+ int num_vlans;
+
+ if (!vg)
+ return 0;
+
+ if (filter_mask & RTEXT_FILTER_BRVLAN)
+ return vg->num_vlans;
+
+ rcu_read_lock();
+ num_vlans = __get_num_vlan_infos(vg, filter_mask);
+ rcu_read_unlock();
+
+ return num_vlans;
+}
+
+static size_t br_get_link_af_size_filtered(const struct net_device *dev,
+ u32 filter_mask)
+{
+ struct net_bridge_vlan_group *vg = NULL;
+ struct net_bridge_port *p = NULL;
+ struct net_bridge *br = NULL;
+ u32 num_cfm_peer_mep_infos;
+ u32 num_cfm_mep_infos;
+ size_t vinfo_sz = 0;
+ int num_vlan_infos;
+
+ rcu_read_lock();
+ if (netif_is_bridge_port(dev)) {
+ p = br_port_get_check_rcu(dev);
+ if (p)
+ vg = nbp_vlan_group_rcu(p);
+ } else if (netif_is_bridge_master(dev)) {
+ br = netdev_priv(dev);
+ vg = br_vlan_group_rcu(br);
+ }
+ num_vlan_infos = br_get_num_vlan_infos(vg, filter_mask);
+ rcu_read_unlock();
+
+ if (p && (p->flags & BR_VLAN_TUNNEL))
+ vinfo_sz += br_get_vlan_tunnel_info_size(vg);
+
+ /* Each VLAN is returned in bridge_vlan_info along with flags */
+ vinfo_sz += num_vlan_infos * nla_total_size(sizeof(struct bridge_vlan_info));
+
+ if (p && vg && (filter_mask & RTEXT_FILTER_MST))
+ vinfo_sz += br_mst_info_size(vg);
+
+ if (!(filter_mask & RTEXT_FILTER_CFM_STATUS))
+ return vinfo_sz;
+
+ if (!br)
+ return vinfo_sz;
+
+ /* CFM status info must be added */
+ br_cfm_mep_count(br, &num_cfm_mep_infos);
+ br_cfm_peer_mep_count(br, &num_cfm_peer_mep_infos);
+
+ vinfo_sz += nla_total_size(0); /* IFLA_BRIDGE_CFM */
+ /* For each status struct the MEP instance (u32) is added */
+ /* MEP instance (u32) + br_cfm_mep_status */
+ vinfo_sz += num_cfm_mep_infos *
+ /*IFLA_BRIDGE_CFM_MEP_STATUS_INSTANCE */
+ (nla_total_size(sizeof(u32))
+ /* IFLA_BRIDGE_CFM_MEP_STATUS_OPCODE_UNEXP_SEEN */
+ + nla_total_size(sizeof(u32))
+ /* IFLA_BRIDGE_CFM_MEP_STATUS_VERSION_UNEXP_SEEN */
+ + nla_total_size(sizeof(u32))
+ /* IFLA_BRIDGE_CFM_MEP_STATUS_RX_LEVEL_LOW_SEEN */
+ + nla_total_size(sizeof(u32)));
+ /* MEP instance (u32) + br_cfm_cc_peer_status */
+ vinfo_sz += num_cfm_peer_mep_infos *
+ /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_INSTANCE */
+ (nla_total_size(sizeof(u32))
+ /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_PEER_MEPID */
+ + nla_total_size(sizeof(u32))
+ /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_CCM_DEFECT */
+ + nla_total_size(sizeof(u32))
+ /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_RDI */
+ + nla_total_size(sizeof(u32))
+ /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_PORT_TLV_VALUE */
+ + nla_total_size(sizeof(u8))
+ /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_IF_TLV_VALUE */
+ + nla_total_size(sizeof(u8))
+ /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_SEEN */
+ + nla_total_size(sizeof(u32))
+ /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_TLV_SEEN */
+ + nla_total_size(sizeof(u32))
+ /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_SEQ_UNEXP_SEEN */
+ + nla_total_size(sizeof(u32)));
+
+ return vinfo_sz;
+}
+
+static inline size_t br_port_info_size(void)
+{
+ return nla_total_size(1) /* IFLA_BRPORT_STATE */
+ + nla_total_size(2) /* IFLA_BRPORT_PRIORITY */
+ + nla_total_size(4) /* IFLA_BRPORT_COST */
+ + nla_total_size(1) /* IFLA_BRPORT_MODE */
+ + nla_total_size(1) /* IFLA_BRPORT_GUARD */
+ + nla_total_size(1) /* IFLA_BRPORT_PROTECT */
+ + nla_total_size(1) /* IFLA_BRPORT_FAST_LEAVE */
+ + nla_total_size(1) /* IFLA_BRPORT_MCAST_TO_UCAST */
+ + nla_total_size(1) /* IFLA_BRPORT_LEARNING */
+ + nla_total_size(1) /* IFLA_BRPORT_UNICAST_FLOOD */
+ + nla_total_size(1) /* IFLA_BRPORT_MCAST_FLOOD */
+ + nla_total_size(1) /* IFLA_BRPORT_BCAST_FLOOD */
+ + nla_total_size(1) /* IFLA_BRPORT_PROXYARP */
+ + nla_total_size(1) /* IFLA_BRPORT_PROXYARP_WIFI */
+ + nla_total_size(1) /* IFLA_BRPORT_VLAN_TUNNEL */
+ + nla_total_size(1) /* IFLA_BRPORT_NEIGH_SUPPRESS */
+ + nla_total_size(1) /* IFLA_BRPORT_ISOLATED */
+ + nla_total_size(1) /* IFLA_BRPORT_LOCKED */
+ + nla_total_size(1) /* IFLA_BRPORT_MAB */
+ + nla_total_size(1) /* IFLA_BRPORT_NEIGH_VLAN_SUPPRESS */
+ + nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_ROOT_ID */
+ + nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_BRIDGE_ID */
+ + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_DESIGNATED_PORT */
+ + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_DESIGNATED_COST */
+ + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_ID */
+ + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_NO */
+ + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_TOPOLOGY_CHANGE_ACK */
+ + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_CONFIG_PENDING */
+ + nla_total_size_64bit(sizeof(u64)) /* IFLA_BRPORT_MESSAGE_AGE_TIMER */
+ + nla_total_size_64bit(sizeof(u64)) /* IFLA_BRPORT_FORWARD_DELAY_TIMER */
+ + nla_total_size_64bit(sizeof(u64)) /* IFLA_BRPORT_HOLD_TIMER */
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+ + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_MULTICAST_ROUTER */
+ + nla_total_size(sizeof(u32)) /* IFLA_BRPORT_MCAST_N_GROUPS */
+ + nla_total_size(sizeof(u32)) /* IFLA_BRPORT_MCAST_MAX_GROUPS */
+#endif
+ + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_GROUP_FWD_MASK */
+ + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_MRP_RING_OPEN */
+ + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_MRP_IN_OPEN */
+ + nla_total_size(sizeof(u32)) /* IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT */
+ + nla_total_size(sizeof(u32)) /* IFLA_BRPORT_MCAST_EHT_HOSTS_CNT */
+ + nla_total_size(sizeof(u32)) /* IFLA_BRPORT_BACKUP_NHID */
+ + 0;
+}
+
+static inline size_t br_nlmsg_size(struct net_device *dev, u32 filter_mask)
{
return NLMSG_ALIGN(sizeof(struct ifinfomsg))
- + nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */
- + nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */
- + nla_total_size(4) /* IFLA_MASTER */
- + nla_total_size(4) /* IFLA_MTU */
- + nla_total_size(4) /* IFLA_LINK */
- + nla_total_size(1) /* IFLA_OPERSTATE */
- + nla_total_size(1); /* IFLA_PROTINFO */
+ + nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */
+ + nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */
+ + nla_total_size(4) /* IFLA_MASTER */
+ + nla_total_size(4) /* IFLA_MTU */
+ + nla_total_size(4) /* IFLA_LINK */
+ + nla_total_size(1) /* IFLA_OPERSTATE */
+ + nla_total_size(br_port_info_size()) /* IFLA_PROTINFO */
+ + nla_total_size(br_get_link_af_size_filtered(dev,
+ filter_mask)) /* IFLA_AF_SPEC */
+ + nla_total_size(4); /* IFLA_BRPORT_BACKUP_PORT */
+}
+
+static int br_port_fill_attrs(struct sk_buff *skb,
+ const struct net_bridge_port *p)
+{
+ u8 mode = !!(p->flags & BR_HAIRPIN_MODE);
+ struct net_bridge_port *backup_p;
+ u64 timerval;
+
+ if (nla_put_u8(skb, IFLA_BRPORT_STATE, p->state) ||
+ nla_put_u16(skb, IFLA_BRPORT_PRIORITY, p->priority) ||
+ nla_put_u32(skb, IFLA_BRPORT_COST, p->path_cost) ||
+ nla_put_u8(skb, IFLA_BRPORT_MODE, mode) ||
+ nla_put_u8(skb, IFLA_BRPORT_GUARD, !!(p->flags & BR_BPDU_GUARD)) ||
+ nla_put_u8(skb, IFLA_BRPORT_PROTECT,
+ !!(p->flags & BR_ROOT_BLOCK)) ||
+ nla_put_u8(skb, IFLA_BRPORT_FAST_LEAVE,
+ !!(p->flags & BR_MULTICAST_FAST_LEAVE)) ||
+ nla_put_u8(skb, IFLA_BRPORT_MCAST_TO_UCAST,
+ !!(p->flags & BR_MULTICAST_TO_UNICAST)) ||
+ nla_put_u8(skb, IFLA_BRPORT_LEARNING, !!(p->flags & BR_LEARNING)) ||
+ nla_put_u8(skb, IFLA_BRPORT_UNICAST_FLOOD,
+ !!(p->flags & BR_FLOOD)) ||
+ nla_put_u8(skb, IFLA_BRPORT_MCAST_FLOOD,
+ !!(p->flags & BR_MCAST_FLOOD)) ||
+ nla_put_u8(skb, IFLA_BRPORT_BCAST_FLOOD,
+ !!(p->flags & BR_BCAST_FLOOD)) ||
+ nla_put_u8(skb, IFLA_BRPORT_PROXYARP, !!(p->flags & BR_PROXYARP)) ||
+ nla_put_u8(skb, IFLA_BRPORT_PROXYARP_WIFI,
+ !!(p->flags & BR_PROXYARP_WIFI)) ||
+ nla_put(skb, IFLA_BRPORT_ROOT_ID, sizeof(struct ifla_bridge_id),
+ &p->designated_root) ||
+ nla_put(skb, IFLA_BRPORT_BRIDGE_ID, sizeof(struct ifla_bridge_id),
+ &p->designated_bridge) ||
+ nla_put_u16(skb, IFLA_BRPORT_DESIGNATED_PORT, p->designated_port) ||
+ nla_put_u16(skb, IFLA_BRPORT_DESIGNATED_COST, p->designated_cost) ||
+ nla_put_u16(skb, IFLA_BRPORT_ID, p->port_id) ||
+ nla_put_u16(skb, IFLA_BRPORT_NO, p->port_no) ||
+ nla_put_u8(skb, IFLA_BRPORT_TOPOLOGY_CHANGE_ACK,
+ p->topology_change_ack) ||
+ nla_put_u8(skb, IFLA_BRPORT_CONFIG_PENDING, p->config_pending) ||
+ nla_put_u8(skb, IFLA_BRPORT_VLAN_TUNNEL, !!(p->flags &
+ BR_VLAN_TUNNEL)) ||
+ nla_put_u16(skb, IFLA_BRPORT_GROUP_FWD_MASK, p->group_fwd_mask) ||
+ nla_put_u8(skb, IFLA_BRPORT_NEIGH_SUPPRESS,
+ !!(p->flags & BR_NEIGH_SUPPRESS)) ||
+ nla_put_u8(skb, IFLA_BRPORT_MRP_RING_OPEN, !!(p->flags &
+ BR_MRP_LOST_CONT)) ||
+ nla_put_u8(skb, IFLA_BRPORT_MRP_IN_OPEN,
+ !!(p->flags & BR_MRP_LOST_IN_CONT)) ||
+ nla_put_u8(skb, IFLA_BRPORT_ISOLATED, !!(p->flags & BR_ISOLATED)) ||
+ nla_put_u8(skb, IFLA_BRPORT_LOCKED, !!(p->flags & BR_PORT_LOCKED)) ||
+ nla_put_u8(skb, IFLA_BRPORT_MAB, !!(p->flags & BR_PORT_MAB)) ||
+ nla_put_u8(skb, IFLA_BRPORT_NEIGH_VLAN_SUPPRESS,
+ !!(p->flags & BR_NEIGH_VLAN_SUPPRESS)))
+ return -EMSGSIZE;
+
+ timerval = br_timer_value(&p->message_age_timer);
+ if (nla_put_u64_64bit(skb, IFLA_BRPORT_MESSAGE_AGE_TIMER, timerval,
+ IFLA_BRPORT_PAD))
+ return -EMSGSIZE;
+ timerval = br_timer_value(&p->forward_delay_timer);
+ if (nla_put_u64_64bit(skb, IFLA_BRPORT_FORWARD_DELAY_TIMER, timerval,
+ IFLA_BRPORT_PAD))
+ return -EMSGSIZE;
+ timerval = br_timer_value(&p->hold_timer);
+ if (nla_put_u64_64bit(skb, IFLA_BRPORT_HOLD_TIMER, timerval,
+ IFLA_BRPORT_PAD))
+ return -EMSGSIZE;
+
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+ if (nla_put_u8(skb, IFLA_BRPORT_MULTICAST_ROUTER,
+ p->multicast_ctx.multicast_router) ||
+ nla_put_u32(skb, IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT,
+ p->multicast_eht_hosts_limit) ||
+ nla_put_u32(skb, IFLA_BRPORT_MCAST_EHT_HOSTS_CNT,
+ p->multicast_eht_hosts_cnt) ||
+ nla_put_u32(skb, IFLA_BRPORT_MCAST_N_GROUPS,
+ br_multicast_ngroups_get(&p->multicast_ctx)) ||
+ nla_put_u32(skb, IFLA_BRPORT_MCAST_MAX_GROUPS,
+ br_multicast_ngroups_get_max(&p->multicast_ctx)))
+ return -EMSGSIZE;
+#endif
+
+ /* we might be called only with br->lock */
+ rcu_read_lock();
+ backup_p = rcu_dereference(p->backup_port);
+ if (backup_p)
+ nla_put_u32(skb, IFLA_BRPORT_BACKUP_PORT,
+ backup_p->dev->ifindex);
+ rcu_read_unlock();
+
+ if (p->backup_nhid &&
+ nla_put_u32(skb, IFLA_BRPORT_BACKUP_NHID, p->backup_nhid))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static int br_fill_ifvlaninfo_range(struct sk_buff *skb, u16 vid_start,
+ u16 vid_end, u16 flags)
+{
+ struct bridge_vlan_info vinfo;
+
+ if ((vid_end - vid_start) > 0) {
+ /* add range to skb */
+ vinfo.vid = vid_start;
+ vinfo.flags = flags | BRIDGE_VLAN_INFO_RANGE_BEGIN;
+ if (nla_put(skb, IFLA_BRIDGE_VLAN_INFO,
+ sizeof(vinfo), &vinfo))
+ goto nla_put_failure;
+
+ vinfo.vid = vid_end;
+ vinfo.flags = flags | BRIDGE_VLAN_INFO_RANGE_END;
+ if (nla_put(skb, IFLA_BRIDGE_VLAN_INFO,
+ sizeof(vinfo), &vinfo))
+ goto nla_put_failure;
+ } else {
+ vinfo.vid = vid_start;
+ vinfo.flags = flags;
+ if (nla_put(skb, IFLA_BRIDGE_VLAN_INFO,
+ sizeof(vinfo), &vinfo))
+ goto nla_put_failure;
+ }
+
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+static int br_fill_ifvlaninfo_compressed(struct sk_buff *skb,
+ struct net_bridge_vlan_group *vg)
+{
+ struct net_bridge_vlan *v;
+ u16 vid_range_start = 0, vid_range_end = 0, vid_range_flags = 0;
+ u16 flags, pvid;
+ int err = 0;
+
+ /* Pack IFLA_BRIDGE_VLAN_INFO's for every vlan
+ * and mark vlan info with begin and end flags
+ * if vlaninfo represents a range
+ */
+ pvid = br_get_pvid(vg);
+ list_for_each_entry_rcu(v, &vg->vlan_list, vlist) {
+ flags = 0;
+ if (!br_vlan_should_use(v))
+ continue;
+ if (v->vid == pvid)
+ flags |= BRIDGE_VLAN_INFO_PVID;
+
+ if (v->flags & BRIDGE_VLAN_INFO_UNTAGGED)
+ flags |= BRIDGE_VLAN_INFO_UNTAGGED;
+
+ if (vid_range_start == 0) {
+ goto initvars;
+ } else if ((v->vid - vid_range_end) == 1 &&
+ flags == vid_range_flags) {
+ vid_range_end = v->vid;
+ continue;
+ } else {
+ err = br_fill_ifvlaninfo_range(skb, vid_range_start,
+ vid_range_end,
+ vid_range_flags);
+ if (err)
+ return err;
+ }
+
+initvars:
+ vid_range_start = v->vid;
+ vid_range_end = v->vid;
+ vid_range_flags = flags;
+ }
+
+ if (vid_range_start != 0) {
+ /* Call it once more to send any left over vlans */
+ err = br_fill_ifvlaninfo_range(skb, vid_range_start,
+ vid_range_end,
+ vid_range_flags);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+static int br_fill_ifvlaninfo(struct sk_buff *skb,
+ struct net_bridge_vlan_group *vg)
+{
+ struct bridge_vlan_info vinfo;
+ struct net_bridge_vlan *v;
+ u16 pvid;
+
+ pvid = br_get_pvid(vg);
+ list_for_each_entry_rcu(v, &vg->vlan_list, vlist) {
+ if (!br_vlan_should_use(v))
+ continue;
+
+ vinfo.vid = v->vid;
+ vinfo.flags = 0;
+ if (v->vid == pvid)
+ vinfo.flags |= BRIDGE_VLAN_INFO_PVID;
+
+ if (v->flags & BRIDGE_VLAN_INFO_UNTAGGED)
+ vinfo.flags |= BRIDGE_VLAN_INFO_UNTAGGED;
+
+ if (nla_put(skb, IFLA_BRIDGE_VLAN_INFO,
+ sizeof(vinfo), &vinfo))
+ goto nla_put_failure;
+ }
+
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
}
/*
* Create one netlink message for one interface
* Contains port and master info as well as carrier and bridge state.
*/
-static int br_fill_ifinfo(struct sk_buff *skb, const struct net_bridge_port *port,
- u32 pid, u32 seq, int event, unsigned int flags)
+static int br_fill_ifinfo(struct sk_buff *skb,
+ const struct net_bridge_port *port,
+ u32 pid, u32 seq, int event, unsigned int flags,
+ u32 filter_mask, const struct net_device *dev,
+ bool getlink)
{
- const struct net_bridge *br = port->br;
- const struct net_device *dev = port->dev;
+ u8 operstate = netif_running(dev) ? READ_ONCE(dev->operstate) :
+ IF_OPER_DOWN;
+ struct nlattr *af = NULL;
+ struct net_bridge *br;
struct ifinfomsg *hdr;
struct nlmsghdr *nlh;
- u8 operstate = netif_running(dev) ? dev->operstate : IF_OPER_DOWN;
- pr_debug("br_fill_info event %d port %s master %s\n",
- event, dev->name, br->dev->name);
+ if (port)
+ br = port->br;
+ else
+ br = netdev_priv(dev);
+
+ br_debug(br, "br_fill_ifinfo event %d port %s master %s\n",
+ event, dev->name, br->dev->name);
nlh = nlmsg_put(skb, pid, seq, event, sizeof(*hdr), flags);
if (nlh == NULL)
- return -ENOBUFS;
+ return -EMSGSIZE;
hdr = nlmsg_data(nlh);
hdr->ifi_family = AF_BRIDGE;
hdr->__ifi_pad = 0;
hdr->ifi_type = dev->type;
hdr->ifi_index = dev->ifindex;
- hdr->ifi_flags = dev_get_flags(dev);
+ hdr->ifi_flags = netif_get_flags(dev);
hdr->ifi_change = 0;
- NLA_PUT_STRING(skb, IFLA_IFNAME, dev->name);
- NLA_PUT_U32(skb, IFLA_MASTER, br->dev->ifindex);
- NLA_PUT_U32(skb, IFLA_MTU, dev->mtu);
- NLA_PUT_U8(skb, IFLA_OPERSTATE, operstate);
+ if (nla_put_string(skb, IFLA_IFNAME, dev->name) ||
+ nla_put_u32(skb, IFLA_MASTER, br->dev->ifindex) ||
+ nla_put_u32(skb, IFLA_MTU, dev->mtu) ||
+ nla_put_u8(skb, IFLA_OPERSTATE, operstate) ||
+ (dev->addr_len &&
+ nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr)) ||
+ (dev->ifindex != dev_get_iflink(dev) &&
+ nla_put_u32(skb, IFLA_LINK, dev_get_iflink(dev))))
+ goto nla_put_failure;
+
+ if (event == RTM_NEWLINK && port) {
+ struct nlattr *nest;
+
+ nest = nla_nest_start(skb, IFLA_PROTINFO);
+ if (nest == NULL || br_port_fill_attrs(skb, port) < 0)
+ goto nla_put_failure;
+ nla_nest_end(skb, nest);
+ }
+
+ if (filter_mask & (RTEXT_FILTER_BRVLAN |
+ RTEXT_FILTER_BRVLAN_COMPRESSED |
+ RTEXT_FILTER_MRP |
+ RTEXT_FILTER_CFM_CONFIG |
+ RTEXT_FILTER_CFM_STATUS |
+ RTEXT_FILTER_MST)) {
+ af = nla_nest_start_noflag(skb, IFLA_AF_SPEC);
+ if (!af)
+ goto nla_put_failure;
+ }
+
+ /* Check if the VID information is requested */
+ if ((filter_mask & RTEXT_FILTER_BRVLAN) ||
+ (filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED)) {
+ struct net_bridge_vlan_group *vg;
+ int err;
+
+ /* RCU needed because of the VLAN locking rules (rcu || rtnl) */
+ rcu_read_lock();
+ if (port)
+ vg = nbp_vlan_group_rcu(port);
+ else
+ vg = br_vlan_group_rcu(br);
+
+ if (!vg || !vg->num_vlans) {
+ rcu_read_unlock();
+ goto done;
+ }
+ if (filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED)
+ err = br_fill_ifvlaninfo_compressed(skb, vg);
+ else
+ err = br_fill_ifvlaninfo(skb, vg);
+
+ if (port && (port->flags & BR_VLAN_TUNNEL))
+ err = br_fill_vlan_tunnel_info(skb, vg);
+ rcu_read_unlock();
+ if (err)
+ goto nla_put_failure;
+ }
+
+ if (filter_mask & RTEXT_FILTER_MRP) {
+ int err;
+
+ if (!br_mrp_enabled(br) || port)
+ goto done;
+
+ rcu_read_lock();
+ err = br_mrp_fill_info(skb, br);
+ rcu_read_unlock();
+
+ if (err)
+ goto nla_put_failure;
+ }
+
+ if (filter_mask & (RTEXT_FILTER_CFM_CONFIG | RTEXT_FILTER_CFM_STATUS)) {
+ struct nlattr *cfm_nest = NULL;
+ int err;
+
+ if (!br_cfm_created(br) || port)
+ goto done;
+
+ cfm_nest = nla_nest_start(skb, IFLA_BRIDGE_CFM);
+ if (!cfm_nest)
+ goto nla_put_failure;
+
+ if (filter_mask & RTEXT_FILTER_CFM_CONFIG) {
+ rcu_read_lock();
+ err = br_cfm_config_fill_info(skb, br);
+ rcu_read_unlock();
+ if (err)
+ goto nla_put_failure;
+ }
+
+ if (filter_mask & RTEXT_FILTER_CFM_STATUS) {
+ rcu_read_lock();
+ err = br_cfm_status_fill_info(skb, br, getlink);
+ rcu_read_unlock();
+ if (err)
+ goto nla_put_failure;
+ }
+
+ nla_nest_end(skb, cfm_nest);
+ }
+
+ if ((filter_mask & RTEXT_FILTER_MST) &&
+ br_opt_get(br, BROPT_MST_ENABLED) && port) {
+ const struct net_bridge_vlan_group *vg = nbp_vlan_group(port);
+ struct nlattr *mst_nest;
+ int err;
+
+ if (!vg || !vg->num_vlans)
+ goto done;
- if (dev->addr_len)
- NLA_PUT(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr);
+ mst_nest = nla_nest_start(skb, IFLA_BRIDGE_MST);
+ if (!mst_nest)
+ goto nla_put_failure;
- if (dev->ifindex != dev->iflink)
- NLA_PUT_U32(skb, IFLA_LINK, dev->iflink);
+ err = br_mst_fill_info(skb, vg);
+ if (err)
+ goto nla_put_failure;
- if (event == RTM_NEWLINK)
- NLA_PUT_U8(skb, IFLA_PROTINFO, port->state);
+ nla_nest_end(skb, mst_nest);
+ }
+
+done:
+ if (af) {
+ if (nlmsg_get_pos(skb) - (void *)af > nla_attr_size(0))
+ nla_nest_end(skb, af);
+ else
+ nla_nest_cancel(skb, af);
+ }
- return nlmsg_end(skb, nlh);
+ nlmsg_end(skb, nlh);
+ return 0;
nla_put_failure:
- return nlmsg_cancel(skb, nlh);
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
}
-/*
- * Notify listeners of a change in port information
- */
-void br_ifinfo_notify(int event, struct net_bridge_port *port)
+void br_info_notify(int event, const struct net_bridge *br,
+ const struct net_bridge_port *port, u32 filter)
{
+ struct net_device *dev;
struct sk_buff *skb;
int err = -ENOBUFS;
+ struct net *net;
+ u16 port_no = 0;
- pr_debug("bridge notify event=%d\n", event);
- skb = nlmsg_new(br_nlmsg_size(), GFP_ATOMIC);
+ if (WARN_ON(!port && !br))
+ return;
+
+ if (port) {
+ dev = port->dev;
+ br = port->br;
+ port_no = port->port_no;
+ } else {
+ dev = br->dev;
+ }
+
+ net = dev_net(dev);
+ br_debug(br, "port %u(%s) event %d\n", port_no, dev->name, event);
+
+ skb = nlmsg_new(br_nlmsg_size(dev, filter), GFP_ATOMIC);
if (skb == NULL)
goto errout;
- err = br_fill_ifinfo(skb, port, 0, 0, event, 0);
- /* failure implies BUG in br_nlmsg_size() */
- BUG_ON(err < 0);
-
- err = rtnl_notify(skb, 0, RTNLGRP_LINK, NULL, GFP_ATOMIC);
+ err = br_fill_ifinfo(skb, port, 0, 0, event, 0, filter, dev, false);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in br_nlmsg_size() */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto errout;
+ }
+ rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_ATOMIC);
+ return;
errout:
- if (err < 0)
- rtnl_set_sk_err(RTNLGRP_LINK, err);
+ rtnl_set_sk_err(net, RTNLGRP_LINK, err);
+}
+
+/* Notify listeners of a change in bridge or port information */
+void br_ifinfo_notify(int event, const struct net_bridge *br,
+ const struct net_bridge_port *port)
+{
+ u32 filter = RTEXT_FILTER_BRVLAN_COMPRESSED;
+
+ br_info_notify(event, br, port, filter);
}
/*
* Dump information about all ports, in response to GETLINK
*/
-static int br_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
+int br_getlink(struct sk_buff *skb, u32 pid, u32 seq,
+ struct net_device *dev, u32 filter_mask, int nlflags)
{
- struct net_device *dev;
- int idx;
+ struct net_bridge_port *port = br_port_get_rtnl(dev);
- read_lock(&dev_base_lock);
- for (dev = dev_base, idx = 0; dev; dev = dev->next) {
- /* not a bridge port */
- if (dev->br_port == NULL || idx < cb->args[0])
- goto skip;
+ if (!port && !(filter_mask & RTEXT_FILTER_BRVLAN) &&
+ !(filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED) &&
+ !(filter_mask & RTEXT_FILTER_MRP) &&
+ !(filter_mask & RTEXT_FILTER_CFM_CONFIG) &&
+ !(filter_mask & RTEXT_FILTER_CFM_STATUS))
+ return 0;
- if (br_fill_ifinfo(skb, dev->br_port, NETLINK_CB(cb->skb).pid,
- cb->nlh->nlmsg_seq, RTM_NEWLINK,
- NLM_F_MULTI) < 0)
- break;
-skip:
- ++idx;
- }
- read_unlock(&dev_base_lock);
+ return br_fill_ifinfo(skb, port, pid, seq, RTM_NEWLINK, nlflags,
+ filter_mask, dev, true);
+}
+
+static int br_vlan_info(struct net_bridge *br, struct net_bridge_port *p,
+ int cmd, struct bridge_vlan_info *vinfo, bool *changed,
+ struct netlink_ext_ack *extack)
+{
+ bool curr_change;
+ int err = 0;
- cb->args[0] = idx;
+ switch (cmd) {
+ case RTM_SETLINK:
+ if (p) {
+ /* if the MASTER flag is set this will act on the global
+ * per-VLAN entry as well
+ */
+ err = nbp_vlan_add(p, vinfo->vid, vinfo->flags,
+ &curr_change, extack);
+ } else {
+ vinfo->flags |= BRIDGE_VLAN_INFO_BRENTRY;
+ err = br_vlan_add(br, vinfo->vid, vinfo->flags,
+ &curr_change, extack);
+ }
+ if (curr_change)
+ *changed = true;
+ break;
+
+ case RTM_DELLINK:
+ if (p) {
+ if (!nbp_vlan_delete(p, vinfo->vid))
+ *changed = true;
+
+ if ((vinfo->flags & BRIDGE_VLAN_INFO_MASTER) &&
+ !br_vlan_delete(p->br, vinfo->vid))
+ *changed = true;
+ } else if (!br_vlan_delete(br, vinfo->vid)) {
+ *changed = true;
+ }
+ break;
+ }
- return skb->len;
+ return err;
}
-/*
- * Change state of port (ie from forwarding to blocking etc)
- * Used by spanning tree in user space.
- */
-static int br_rtm_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+int br_process_vlan_info(struct net_bridge *br,
+ struct net_bridge_port *p, int cmd,
+ struct bridge_vlan_info *vinfo_curr,
+ struct bridge_vlan_info **vinfo_last,
+ bool *changed,
+ struct netlink_ext_ack *extack)
{
- struct ifinfomsg *ifm;
- struct nlattr *protinfo;
- struct net_device *dev;
- struct net_bridge_port *p;
- u8 new_state;
+ int err, rtm_cmd;
- if (nlmsg_len(nlh) < sizeof(*ifm))
+ if (!br_vlan_valid_id(vinfo_curr->vid, extack))
return -EINVAL;
- ifm = nlmsg_data(nlh);
- if (ifm->ifi_family != AF_BRIDGE)
- return -EPFNOSUPPORT;
+ /* needed for vlan-only NEWVLAN/DELVLAN notifications */
+ rtm_cmd = br_afspec_cmd_to_rtm(cmd);
- protinfo = nlmsg_find_attr(nlh, sizeof(*ifm), IFLA_PROTINFO);
- if (!protinfo || nla_len(protinfo) < sizeof(u8))
- return -EINVAL;
+ if (vinfo_curr->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN) {
+ if (!br_vlan_valid_range(vinfo_curr, *vinfo_last, extack))
+ return -EINVAL;
+ *vinfo_last = vinfo_curr;
+ return 0;
+ }
- new_state = nla_get_u8(protinfo);
- if (new_state > BR_STATE_BLOCKING)
- return -EINVAL;
+ if (*vinfo_last) {
+ struct bridge_vlan_info tmp_vinfo;
+ int v, v_change_start = 0;
+
+ if (!br_vlan_valid_range(vinfo_curr, *vinfo_last, extack))
+ return -EINVAL;
+
+ memcpy(&tmp_vinfo, *vinfo_last,
+ sizeof(struct bridge_vlan_info));
+ for (v = (*vinfo_last)->vid; v <= vinfo_curr->vid; v++) {
+ bool curr_change = false;
+
+ tmp_vinfo.vid = v;
+ err = br_vlan_info(br, p, cmd, &tmp_vinfo, &curr_change,
+ extack);
+ if (err)
+ break;
+ if (curr_change) {
+ *changed = curr_change;
+ if (!v_change_start)
+ v_change_start = v;
+ } else {
+ /* nothing to notify yet */
+ if (!v_change_start)
+ continue;
+ br_vlan_notify(br, p, v_change_start,
+ v - 1, rtm_cmd);
+ v_change_start = 0;
+ }
+ cond_resched();
+ }
+ /* v_change_start is set only if the last/whole range changed */
+ if (v_change_start)
+ br_vlan_notify(br, p, v_change_start,
+ v - 1, rtm_cmd);
+
+ *vinfo_last = NULL;
+
+ return err;
+ }
+
+ err = br_vlan_info(br, p, cmd, vinfo_curr, changed, extack);
+ if (*changed)
+ br_vlan_notify(br, p, vinfo_curr->vid, 0, rtm_cmd);
- dev = __dev_get_by_index(ifm->ifi_index);
- if (!dev)
- return -ENODEV;
+ return err;
+}
- p = dev->br_port;
- if (!p)
+static int br_afspec(struct net_bridge *br,
+ struct net_bridge_port *p,
+ struct nlattr *af_spec,
+ int cmd, bool *changed,
+ struct netlink_ext_ack *extack)
+{
+ struct bridge_vlan_info *vinfo_curr = NULL;
+ struct bridge_vlan_info *vinfo_last = NULL;
+ struct nlattr *attr;
+ struct vtunnel_info tinfo_last = {};
+ struct vtunnel_info tinfo_curr = {};
+ int err = 0, rem;
+
+ nla_for_each_nested(attr, af_spec, rem) {
+ err = 0;
+ switch (nla_type(attr)) {
+ case IFLA_BRIDGE_VLAN_TUNNEL_INFO:
+ if (!p || !(p->flags & BR_VLAN_TUNNEL))
+ return -EINVAL;
+ err = br_parse_vlan_tunnel_info(attr, &tinfo_curr);
+ if (err)
+ return err;
+ err = br_process_vlan_tunnel_info(br, p, cmd,
+ &tinfo_curr,
+ &tinfo_last,
+ changed);
+ if (err)
+ return err;
+ break;
+ case IFLA_BRIDGE_VLAN_INFO:
+ if (nla_len(attr) != sizeof(struct bridge_vlan_info))
+ return -EINVAL;
+ vinfo_curr = nla_data(attr);
+ err = br_process_vlan_info(br, p, cmd, vinfo_curr,
+ &vinfo_last, changed,
+ extack);
+ if (err)
+ return err;
+ break;
+ case IFLA_BRIDGE_MRP:
+ err = br_mrp_parse(br, p, attr, cmd, extack);
+ if (err)
+ return err;
+ break;
+ case IFLA_BRIDGE_CFM:
+ err = br_cfm_parse(br, p, attr, cmd, extack);
+ if (err)
+ return err;
+ break;
+ case IFLA_BRIDGE_MST:
+ if (!p) {
+ NL_SET_ERR_MSG(extack,
+ "MST states can only be set on bridge ports");
+ return -EINVAL;
+ }
+
+ if (cmd != RTM_SETLINK) {
+ NL_SET_ERR_MSG(extack,
+ "MST states can only be set through RTM_SETLINK");
+ return -EINVAL;
+ }
+
+ err = br_mst_process(p, attr, extack);
+ if (err)
+ return err;
+ break;
+ }
+ }
+
+ return err;
+}
+
+static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = {
+ [IFLA_BRPORT_UNSPEC] = { .strict_start_type =
+ IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT + 1 },
+ [IFLA_BRPORT_STATE] = { .type = NLA_U8 },
+ [IFLA_BRPORT_COST] = { .type = NLA_U32 },
+ [IFLA_BRPORT_PRIORITY] = { .type = NLA_U16 },
+ [IFLA_BRPORT_MODE] = { .type = NLA_U8 },
+ [IFLA_BRPORT_GUARD] = { .type = NLA_U8 },
+ [IFLA_BRPORT_PROTECT] = { .type = NLA_U8 },
+ [IFLA_BRPORT_FAST_LEAVE]= { .type = NLA_U8 },
+ [IFLA_BRPORT_LEARNING] = { .type = NLA_U8 },
+ [IFLA_BRPORT_UNICAST_FLOOD] = { .type = NLA_U8 },
+ [IFLA_BRPORT_PROXYARP] = { .type = NLA_U8 },
+ [IFLA_BRPORT_PROXYARP_WIFI] = { .type = NLA_U8 },
+ [IFLA_BRPORT_MULTICAST_ROUTER] = { .type = NLA_U8 },
+ [IFLA_BRPORT_MCAST_TO_UCAST] = { .type = NLA_U8 },
+ [IFLA_BRPORT_MCAST_FLOOD] = { .type = NLA_U8 },
+ [IFLA_BRPORT_BCAST_FLOOD] = { .type = NLA_U8 },
+ [IFLA_BRPORT_VLAN_TUNNEL] = { .type = NLA_U8 },
+ [IFLA_BRPORT_GROUP_FWD_MASK] = { .type = NLA_U16 },
+ [IFLA_BRPORT_NEIGH_SUPPRESS] = { .type = NLA_U8 },
+ [IFLA_BRPORT_ISOLATED] = { .type = NLA_U8 },
+ [IFLA_BRPORT_LOCKED] = { .type = NLA_U8 },
+ [IFLA_BRPORT_MAB] = { .type = NLA_U8 },
+ [IFLA_BRPORT_BACKUP_PORT] = { .type = NLA_U32 },
+ [IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT] = { .type = NLA_U32 },
+ [IFLA_BRPORT_MCAST_N_GROUPS] = { .type = NLA_REJECT },
+ [IFLA_BRPORT_MCAST_MAX_GROUPS] = { .type = NLA_U32 },
+ [IFLA_BRPORT_NEIGH_VLAN_SUPPRESS] = NLA_POLICY_MAX(NLA_U8, 1),
+ [IFLA_BRPORT_BACKUP_NHID] = { .type = NLA_U32 },
+};
+
+/* Change the state of the port and notify spanning tree */
+static int br_set_port_state(struct net_bridge_port *p, u8 state)
+{
+ if (state > BR_STATE_BLOCKING)
return -EINVAL;
/* if kernel STP is running, don't allow changes */
- if (p->br->stp_enabled)
+ if (p->br->stp_enabled == BR_KERNEL_STP)
return -EBUSY;
- if (!netif_running(dev) ||
- (!netif_carrier_ok(dev) && new_state != BR_STATE_DISABLED))
+ /* if device is not up, change is not allowed
+ * if link is not present, only allowable state is disabled
+ */
+ if (!netif_running(p->dev) ||
+ (!netif_oper_up(p->dev) && state != BR_STATE_DISABLED))
return -ENETDOWN;
- p->state = new_state;
- br_log_state(p);
+ br_set_state(p, state);
+ br_port_state_selection(p->br);
+ return 0;
+}
+
+/* Set/clear or port flags based on attribute */
+static void br_set_port_flag(struct net_bridge_port *p, struct nlattr *tb[],
+ int attrtype, unsigned long mask)
+{
+ if (!tb[attrtype])
+ return;
+
+ if (nla_get_u8(tb[attrtype]))
+ p->flags |= mask;
+ else
+ p->flags &= ~mask;
+}
+
+/* Process bridge protocol info on port */
+static int br_setport(struct net_bridge_port *p, struct nlattr *tb[],
+ struct netlink_ext_ack *extack)
+{
+ unsigned long old_flags, changed_mask;
+ bool br_vlan_tunnel_old;
+ int err;
+
+ old_flags = p->flags;
+ br_vlan_tunnel_old = (old_flags & BR_VLAN_TUNNEL) ? true : false;
+
+ br_set_port_flag(p, tb, IFLA_BRPORT_MODE, BR_HAIRPIN_MODE);
+ br_set_port_flag(p, tb, IFLA_BRPORT_GUARD, BR_BPDU_GUARD);
+ br_set_port_flag(p, tb, IFLA_BRPORT_FAST_LEAVE,
+ BR_MULTICAST_FAST_LEAVE);
+ br_set_port_flag(p, tb, IFLA_BRPORT_PROTECT, BR_ROOT_BLOCK);
+ br_set_port_flag(p, tb, IFLA_BRPORT_LEARNING, BR_LEARNING);
+ br_set_port_flag(p, tb, IFLA_BRPORT_UNICAST_FLOOD, BR_FLOOD);
+ br_set_port_flag(p, tb, IFLA_BRPORT_MCAST_FLOOD, BR_MCAST_FLOOD);
+ br_set_port_flag(p, tb, IFLA_BRPORT_MCAST_TO_UCAST,
+ BR_MULTICAST_TO_UNICAST);
+ br_set_port_flag(p, tb, IFLA_BRPORT_BCAST_FLOOD, BR_BCAST_FLOOD);
+ br_set_port_flag(p, tb, IFLA_BRPORT_PROXYARP, BR_PROXYARP);
+ br_set_port_flag(p, tb, IFLA_BRPORT_PROXYARP_WIFI, BR_PROXYARP_WIFI);
+ br_set_port_flag(p, tb, IFLA_BRPORT_VLAN_TUNNEL, BR_VLAN_TUNNEL);
+ br_set_port_flag(p, tb, IFLA_BRPORT_NEIGH_SUPPRESS, BR_NEIGH_SUPPRESS);
+ br_set_port_flag(p, tb, IFLA_BRPORT_ISOLATED, BR_ISOLATED);
+ br_set_port_flag(p, tb, IFLA_BRPORT_LOCKED, BR_PORT_LOCKED);
+ br_set_port_flag(p, tb, IFLA_BRPORT_MAB, BR_PORT_MAB);
+ br_set_port_flag(p, tb, IFLA_BRPORT_NEIGH_VLAN_SUPPRESS,
+ BR_NEIGH_VLAN_SUPPRESS);
+
+ if ((p->flags & BR_PORT_MAB) &&
+ (!(p->flags & BR_PORT_LOCKED) || !(p->flags & BR_LEARNING))) {
+ NL_SET_ERR_MSG(extack, "Bridge port must be locked and have learning enabled when MAB is enabled");
+ p->flags = old_flags;
+ return -EINVAL;
+ } else if (!(p->flags & BR_PORT_MAB) && (old_flags & BR_PORT_MAB)) {
+ struct net_bridge_fdb_flush_desc desc = {
+ .flags = BIT(BR_FDB_LOCKED),
+ .flags_mask = BIT(BR_FDB_LOCKED),
+ .port_ifindex = p->dev->ifindex,
+ };
+
+ br_fdb_flush(p->br, &desc);
+ }
+
+ changed_mask = old_flags ^ p->flags;
+
+ err = br_switchdev_set_port_flag(p, p->flags, changed_mask, extack);
+ if (err) {
+ p->flags = old_flags;
+ return err;
+ }
+
+ if (br_vlan_tunnel_old && !(p->flags & BR_VLAN_TUNNEL))
+ nbp_vlan_tunnel_info_flush(p);
+
+ br_port_flags_change(p, changed_mask);
+
+ if (tb[IFLA_BRPORT_COST]) {
+ err = br_stp_set_path_cost(p, nla_get_u32(tb[IFLA_BRPORT_COST]));
+ if (err)
+ return err;
+ }
+
+ if (tb[IFLA_BRPORT_PRIORITY]) {
+ err = br_stp_set_port_priority(p, nla_get_u16(tb[IFLA_BRPORT_PRIORITY]));
+ if (err)
+ return err;
+ }
+
+ if (tb[IFLA_BRPORT_STATE]) {
+ err = br_set_port_state(p, nla_get_u8(tb[IFLA_BRPORT_STATE]));
+ if (err)
+ return err;
+ }
+
+ if (tb[IFLA_BRPORT_FLUSH])
+ br_fdb_delete_by_port(p->br, p, 0, 0);
+
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+ if (tb[IFLA_BRPORT_MULTICAST_ROUTER]) {
+ u8 mcast_router = nla_get_u8(tb[IFLA_BRPORT_MULTICAST_ROUTER]);
+
+ err = br_multicast_set_port_router(&p->multicast_ctx,
+ mcast_router);
+ if (err)
+ return err;
+ }
+
+ if (tb[IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT]) {
+ u32 hlimit;
+
+ hlimit = nla_get_u32(tb[IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT]);
+ err = br_multicast_eht_set_hosts_limit(p, hlimit);
+ if (err)
+ return err;
+ }
+
+ if (tb[IFLA_BRPORT_MCAST_MAX_GROUPS]) {
+ u32 max_groups;
+
+ max_groups = nla_get_u32(tb[IFLA_BRPORT_MCAST_MAX_GROUPS]);
+ br_multicast_ngroups_set_max(&p->multicast_ctx, max_groups);
+ }
+#endif
+
+ if (tb[IFLA_BRPORT_GROUP_FWD_MASK]) {
+ u16 fwd_mask = nla_get_u16(tb[IFLA_BRPORT_GROUP_FWD_MASK]);
+
+ if (fwd_mask & BR_GROUPFWD_MACPAUSE)
+ return -EINVAL;
+ p->group_fwd_mask = fwd_mask;
+ }
+
+ if (tb[IFLA_BRPORT_BACKUP_PORT]) {
+ struct net_device *backup_dev = NULL;
+ u32 backup_ifindex;
+
+ backup_ifindex = nla_get_u32(tb[IFLA_BRPORT_BACKUP_PORT]);
+ if (backup_ifindex) {
+ backup_dev = __dev_get_by_index(dev_net(p->dev),
+ backup_ifindex);
+ if (!backup_dev)
+ return -ENOENT;
+ }
+
+ err = nbp_backup_change(p, backup_dev);
+ if (err)
+ return err;
+ }
+
+ if (tb[IFLA_BRPORT_BACKUP_NHID]) {
+ u32 backup_nhid = nla_get_u32(tb[IFLA_BRPORT_BACKUP_NHID]);
+
+ WRITE_ONCE(p->backup_nhid, backup_nhid);
+ }
+
return 0;
}
+/* Change state and parameters on port. */
+int br_setlink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags,
+ struct netlink_ext_ack *extack)
+{
+ struct net_bridge *br = (struct net_bridge *)netdev_priv(dev);
+ struct nlattr *tb[IFLA_BRPORT_MAX + 1];
+ struct net_bridge_port *p;
+ struct nlattr *protinfo;
+ struct nlattr *afspec;
+ bool changed = false;
+ int err = 0;
+
+ protinfo = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_PROTINFO);
+ afspec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC);
+ if (!protinfo && !afspec)
+ return 0;
+
+ p = br_port_get_rtnl(dev);
+ /* We want to accept dev as bridge itself if the AF_SPEC
+ * is set to see if someone is setting vlan info on the bridge
+ */
+ if (!p && !afspec)
+ return -EINVAL;
+
+ if (p && protinfo) {
+ if (protinfo->nla_type & NLA_F_NESTED) {
+ err = nla_parse_nested_deprecated(tb, IFLA_BRPORT_MAX,
+ protinfo,
+ br_port_policy,
+ NULL);
+ if (err)
+ return err;
+
+ spin_lock_bh(&p->br->lock);
+ err = br_setport(p, tb, extack);
+ spin_unlock_bh(&p->br->lock);
+ } else {
+ /* Binary compatibility with old RSTP */
+ if (nla_len(protinfo) < sizeof(u8))
+ return -EINVAL;
+
+ spin_lock_bh(&p->br->lock);
+ err = br_set_port_state(p, nla_get_u8(protinfo));
+ spin_unlock_bh(&p->br->lock);
+ }
+ if (err)
+ goto out;
+ changed = true;
+ }
+
+ if (afspec)
+ err = br_afspec(br, p, afspec, RTM_SETLINK, &changed, extack);
+
+ if (changed)
+ br_ifinfo_notify(RTM_NEWLINK, br, p);
+out:
+ return err;
+}
+
+/* Delete port information */
+int br_dellink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags)
+{
+ struct net_bridge *br = (struct net_bridge *)netdev_priv(dev);
+ struct net_bridge_port *p;
+ struct nlattr *afspec;
+ bool changed = false;
+ int err = 0;
+
+ afspec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC);
+ if (!afspec)
+ return 0;
+
+ p = br_port_get_rtnl(dev);
+ /* We want to accept dev as bridge itself as well */
+ if (!p && !netif_is_bridge_master(dev))
+ return -EINVAL;
+
+ err = br_afspec(br, p, afspec, RTM_DELLINK, &changed, NULL);
+ if (changed)
+ /* Send RTM_NEWLINK because userspace
+ * expects RTM_NEWLINK for vlan dels
+ */
+ br_ifinfo_notify(RTM_NEWLINK, br, p);
+
+ return err;
+}
+
+static int br_validate(struct nlattr *tb[], struct nlattr *data[],
+ struct netlink_ext_ack *extack)
+{
+ if (tb[IFLA_ADDRESS]) {
+ if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
+ return -EINVAL;
+ if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
+ return -EADDRNOTAVAIL;
+ }
+
+ if (!data)
+ return 0;
+
+#ifdef CONFIG_BRIDGE_VLAN_FILTERING
+ if (data[IFLA_BR_VLAN_PROTOCOL] &&
+ !eth_type_vlan(nla_get_be16(data[IFLA_BR_VLAN_PROTOCOL])))
+ return -EPROTONOSUPPORT;
-static struct rtnetlink_link bridge_rtnetlink_table[RTM_NR_MSGTYPES] = {
- [RTM_GETLINK - RTM_BASE] = { .dumpit = br_dump_ifinfo, },
- [RTM_SETLINK - RTM_BASE] = { .doit = br_rtm_setlink, },
+ if (data[IFLA_BR_VLAN_DEFAULT_PVID]) {
+ __u16 defpvid = nla_get_u16(data[IFLA_BR_VLAN_DEFAULT_PVID]);
+
+ if (defpvid >= VLAN_VID_MASK)
+ return -EINVAL;
+ }
+#endif
+
+ return 0;
+}
+
+static int br_port_slave_changelink(struct net_device *brdev,
+ struct net_device *dev,
+ struct nlattr *tb[],
+ struct nlattr *data[],
+ struct netlink_ext_ack *extack)
+{
+ struct net_bridge *br = netdev_priv(brdev);
+ int ret;
+
+ if (!data)
+ return 0;
+
+ spin_lock_bh(&br->lock);
+ ret = br_setport(br_port_get_rtnl(dev), data, extack);
+ spin_unlock_bh(&br->lock);
+
+ return ret;
+}
+
+static int br_port_fill_slave_info(struct sk_buff *skb,
+ const struct net_device *brdev,
+ const struct net_device *dev)
+{
+ return br_port_fill_attrs(skb, br_port_get_rtnl(dev));
+}
+
+static size_t br_port_get_slave_size(const struct net_device *brdev,
+ const struct net_device *dev)
+{
+ return br_port_info_size();
+}
+
+static const struct nla_policy br_policy[IFLA_BR_MAX + 1] = {
+ [IFLA_BR_UNSPEC] = { .strict_start_type =
+ IFLA_BR_FDB_N_LEARNED },
+ [IFLA_BR_FORWARD_DELAY] = { .type = NLA_U32 },
+ [IFLA_BR_HELLO_TIME] = { .type = NLA_U32 },
+ [IFLA_BR_MAX_AGE] = { .type = NLA_U32 },
+ [IFLA_BR_AGEING_TIME] = { .type = NLA_U32 },
+ [IFLA_BR_STP_STATE] = { .type = NLA_U32 },
+ [IFLA_BR_PRIORITY] = { .type = NLA_U16 },
+ [IFLA_BR_VLAN_FILTERING] = { .type = NLA_U8 },
+ [IFLA_BR_VLAN_PROTOCOL] = { .type = NLA_U16 },
+ [IFLA_BR_GROUP_FWD_MASK] = { .type = NLA_U16 },
+ [IFLA_BR_GROUP_ADDR] = { .type = NLA_BINARY,
+ .len = ETH_ALEN },
+ [IFLA_BR_MCAST_ROUTER] = { .type = NLA_U8 },
+ [IFLA_BR_MCAST_SNOOPING] = { .type = NLA_U8 },
+ [IFLA_BR_MCAST_QUERY_USE_IFADDR] = { .type = NLA_U8 },
+ [IFLA_BR_MCAST_QUERIER] = { .type = NLA_U8 },
+ [IFLA_BR_MCAST_HASH_ELASTICITY] = { .type = NLA_U32 },
+ [IFLA_BR_MCAST_HASH_MAX] = { .type = NLA_U32 },
+ [IFLA_BR_MCAST_LAST_MEMBER_CNT] = { .type = NLA_U32 },
+ [IFLA_BR_MCAST_STARTUP_QUERY_CNT] = { .type = NLA_U32 },
+ [IFLA_BR_MCAST_LAST_MEMBER_INTVL] = { .type = NLA_U64 },
+ [IFLA_BR_MCAST_MEMBERSHIP_INTVL] = { .type = NLA_U64 },
+ [IFLA_BR_MCAST_QUERIER_INTVL] = { .type = NLA_U64 },
+ [IFLA_BR_MCAST_QUERY_INTVL] = { .type = NLA_U64 },
+ [IFLA_BR_MCAST_QUERY_RESPONSE_INTVL] = { .type = NLA_U64 },
+ [IFLA_BR_MCAST_STARTUP_QUERY_INTVL] = { .type = NLA_U64 },
+ [IFLA_BR_NF_CALL_IPTABLES] = { .type = NLA_U8 },
+ [IFLA_BR_NF_CALL_IP6TABLES] = { .type = NLA_U8 },
+ [IFLA_BR_NF_CALL_ARPTABLES] = { .type = NLA_U8 },
+ [IFLA_BR_VLAN_DEFAULT_PVID] = { .type = NLA_U16 },
+ [IFLA_BR_VLAN_STATS_ENABLED] = { .type = NLA_U8 },
+ [IFLA_BR_MCAST_STATS_ENABLED] = { .type = NLA_U8 },
+ [IFLA_BR_MCAST_IGMP_VERSION] = { .type = NLA_U8 },
+ [IFLA_BR_MCAST_MLD_VERSION] = { .type = NLA_U8 },
+ [IFLA_BR_VLAN_STATS_PER_PORT] = { .type = NLA_U8 },
+ [IFLA_BR_MULTI_BOOLOPT] =
+ NLA_POLICY_EXACT_LEN(sizeof(struct br_boolopt_multi)),
+ [IFLA_BR_FDB_N_LEARNED] = { .type = NLA_REJECT },
+ [IFLA_BR_FDB_MAX_LEARNED] = { .type = NLA_U32 },
};
-void __init br_netlink_init(void)
+static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
+ struct nlattr *data[],
+ struct netlink_ext_ack *extack)
+{
+ struct net_bridge *br = netdev_priv(brdev);
+ int err;
+
+ if (!data)
+ return 0;
+
+ if (data[IFLA_BR_FORWARD_DELAY]) {
+ err = br_set_forward_delay(br, nla_get_u32(data[IFLA_BR_FORWARD_DELAY]));
+ if (err)
+ return err;
+ }
+
+ if (data[IFLA_BR_HELLO_TIME]) {
+ err = br_set_hello_time(br, nla_get_u32(data[IFLA_BR_HELLO_TIME]));
+ if (err)
+ return err;
+ }
+
+ if (data[IFLA_BR_MAX_AGE]) {
+ err = br_set_max_age(br, nla_get_u32(data[IFLA_BR_MAX_AGE]));
+ if (err)
+ return err;
+ }
+
+ if (data[IFLA_BR_AGEING_TIME]) {
+ err = br_set_ageing_time(br, nla_get_u32(data[IFLA_BR_AGEING_TIME]));
+ if (err)
+ return err;
+ }
+
+ if (data[IFLA_BR_STP_STATE]) {
+ u32 stp_enabled = nla_get_u32(data[IFLA_BR_STP_STATE]);
+
+ err = br_stp_set_enabled(br, stp_enabled, extack);
+ if (err)
+ return err;
+ }
+
+ if (data[IFLA_BR_PRIORITY]) {
+ u32 priority = nla_get_u16(data[IFLA_BR_PRIORITY]);
+
+ br_stp_set_bridge_priority(br, priority);
+ }
+
+ if (data[IFLA_BR_VLAN_FILTERING]) {
+ u8 vlan_filter = nla_get_u8(data[IFLA_BR_VLAN_FILTERING]);
+
+ err = br_vlan_filter_toggle(br, vlan_filter, extack);
+ if (err)
+ return err;
+ }
+
+#ifdef CONFIG_BRIDGE_VLAN_FILTERING
+ if (data[IFLA_BR_VLAN_PROTOCOL]) {
+ __be16 vlan_proto = nla_get_be16(data[IFLA_BR_VLAN_PROTOCOL]);
+
+ err = __br_vlan_set_proto(br, vlan_proto, extack);
+ if (err)
+ return err;
+ }
+
+ if (data[IFLA_BR_VLAN_DEFAULT_PVID]) {
+ __u16 defpvid = nla_get_u16(data[IFLA_BR_VLAN_DEFAULT_PVID]);
+
+ err = __br_vlan_set_default_pvid(br, defpvid, extack);
+ if (err)
+ return err;
+ }
+
+ if (data[IFLA_BR_VLAN_STATS_ENABLED]) {
+ __u8 vlan_stats = nla_get_u8(data[IFLA_BR_VLAN_STATS_ENABLED]);
+
+ err = br_vlan_set_stats(br, vlan_stats);
+ if (err)
+ return err;
+ }
+
+ if (data[IFLA_BR_VLAN_STATS_PER_PORT]) {
+ __u8 per_port = nla_get_u8(data[IFLA_BR_VLAN_STATS_PER_PORT]);
+
+ err = br_vlan_set_stats_per_port(br, per_port);
+ if (err)
+ return err;
+ }
+#endif
+
+ if (data[IFLA_BR_GROUP_FWD_MASK]) {
+ u16 fwd_mask = nla_get_u16(data[IFLA_BR_GROUP_FWD_MASK]);
+
+ if (fwd_mask & BR_GROUPFWD_RESTRICTED)
+ return -EINVAL;
+ br->group_fwd_mask = fwd_mask;
+ }
+
+ if (data[IFLA_BR_GROUP_ADDR]) {
+ u8 new_addr[ETH_ALEN];
+
+ if (nla_len(data[IFLA_BR_GROUP_ADDR]) != ETH_ALEN)
+ return -EINVAL;
+ memcpy(new_addr, nla_data(data[IFLA_BR_GROUP_ADDR]), ETH_ALEN);
+ if (!is_link_local_ether_addr(new_addr))
+ return -EINVAL;
+ if (new_addr[5] == 1 || /* 802.3x Pause address */
+ new_addr[5] == 2 || /* 802.3ad Slow protocols */
+ new_addr[5] == 3) /* 802.1X PAE address */
+ return -EINVAL;
+ spin_lock_bh(&br->lock);
+ memcpy(br->group_addr, new_addr, sizeof(br->group_addr));
+ spin_unlock_bh(&br->lock);
+ br_opt_toggle(br, BROPT_GROUP_ADDR_SET, true);
+ br_recalculate_fwd_mask(br);
+ }
+
+ if (data[IFLA_BR_FDB_FLUSH]) {
+ struct net_bridge_fdb_flush_desc desc = {
+ .flags_mask = BIT(BR_FDB_STATIC)
+ };
+
+ br_fdb_flush(br, &desc);
+ }
+
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+ if (data[IFLA_BR_MCAST_ROUTER]) {
+ u8 multicast_router = nla_get_u8(data[IFLA_BR_MCAST_ROUTER]);
+
+ err = br_multicast_set_router(&br->multicast_ctx,
+ multicast_router);
+ if (err)
+ return err;
+ }
+
+ if (data[IFLA_BR_MCAST_SNOOPING]) {
+ u8 mcast_snooping = nla_get_u8(data[IFLA_BR_MCAST_SNOOPING]);
+
+ err = br_multicast_toggle(br, mcast_snooping, extack);
+ if (err)
+ return err;
+ }
+
+ if (data[IFLA_BR_MCAST_QUERY_USE_IFADDR]) {
+ u8 val;
+
+ val = nla_get_u8(data[IFLA_BR_MCAST_QUERY_USE_IFADDR]);
+ br_opt_toggle(br, BROPT_MULTICAST_QUERY_USE_IFADDR, !!val);
+ }
+
+ if (data[IFLA_BR_MCAST_QUERIER]) {
+ u8 mcast_querier = nla_get_u8(data[IFLA_BR_MCAST_QUERIER]);
+
+ err = br_multicast_set_querier(&br->multicast_ctx,
+ mcast_querier);
+ if (err)
+ return err;
+ }
+
+ if (data[IFLA_BR_MCAST_HASH_ELASTICITY])
+ br_warn(br, "the hash_elasticity option has been deprecated and is always %u\n",
+ RHT_ELASTICITY);
+
+ if (data[IFLA_BR_MCAST_HASH_MAX])
+ br->hash_max = nla_get_u32(data[IFLA_BR_MCAST_HASH_MAX]);
+
+ if (data[IFLA_BR_MCAST_LAST_MEMBER_CNT]) {
+ u32 val = nla_get_u32(data[IFLA_BR_MCAST_LAST_MEMBER_CNT]);
+
+ br->multicast_ctx.multicast_last_member_count = val;
+ }
+
+ if (data[IFLA_BR_MCAST_STARTUP_QUERY_CNT]) {
+ u32 val = nla_get_u32(data[IFLA_BR_MCAST_STARTUP_QUERY_CNT]);
+
+ br->multicast_ctx.multicast_startup_query_count = val;
+ }
+
+ if (data[IFLA_BR_MCAST_LAST_MEMBER_INTVL]) {
+ u64 val = nla_get_u64(data[IFLA_BR_MCAST_LAST_MEMBER_INTVL]);
+
+ br->multicast_ctx.multicast_last_member_interval = clock_t_to_jiffies(val);
+ }
+
+ if (data[IFLA_BR_MCAST_MEMBERSHIP_INTVL]) {
+ u64 val = nla_get_u64(data[IFLA_BR_MCAST_MEMBERSHIP_INTVL]);
+
+ br->multicast_ctx.multicast_membership_interval = clock_t_to_jiffies(val);
+ }
+
+ if (data[IFLA_BR_MCAST_QUERIER_INTVL]) {
+ u64 val = nla_get_u64(data[IFLA_BR_MCAST_QUERIER_INTVL]);
+
+ br->multicast_ctx.multicast_querier_interval = clock_t_to_jiffies(val);
+ }
+
+ if (data[IFLA_BR_MCAST_QUERY_INTVL]) {
+ u64 val = nla_get_u64(data[IFLA_BR_MCAST_QUERY_INTVL]);
+
+ br_multicast_set_query_intvl(&br->multicast_ctx, val);
+ }
+
+ if (data[IFLA_BR_MCAST_QUERY_RESPONSE_INTVL]) {
+ u64 val = nla_get_u64(data[IFLA_BR_MCAST_QUERY_RESPONSE_INTVL]);
+
+ br->multicast_ctx.multicast_query_response_interval = clock_t_to_jiffies(val);
+ }
+
+ if (data[IFLA_BR_MCAST_STARTUP_QUERY_INTVL]) {
+ u64 val = nla_get_u64(data[IFLA_BR_MCAST_STARTUP_QUERY_INTVL]);
+
+ br_multicast_set_startup_query_intvl(&br->multicast_ctx, val);
+ }
+
+ if (data[IFLA_BR_MCAST_STATS_ENABLED]) {
+ __u8 mcast_stats;
+
+ mcast_stats = nla_get_u8(data[IFLA_BR_MCAST_STATS_ENABLED]);
+ br_opt_toggle(br, BROPT_MULTICAST_STATS_ENABLED, !!mcast_stats);
+ }
+
+ if (data[IFLA_BR_MCAST_IGMP_VERSION]) {
+ __u8 igmp_version;
+
+ igmp_version = nla_get_u8(data[IFLA_BR_MCAST_IGMP_VERSION]);
+ err = br_multicast_set_igmp_version(&br->multicast_ctx,
+ igmp_version);
+ if (err)
+ return err;
+ }
+
+#if IS_ENABLED(CONFIG_IPV6)
+ if (data[IFLA_BR_MCAST_MLD_VERSION]) {
+ __u8 mld_version;
+
+ mld_version = nla_get_u8(data[IFLA_BR_MCAST_MLD_VERSION]);
+ err = br_multicast_set_mld_version(&br->multicast_ctx,
+ mld_version);
+ if (err)
+ return err;
+ }
+#endif
+#endif
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+ if (data[IFLA_BR_NF_CALL_IPTABLES]) {
+ u8 val = nla_get_u8(data[IFLA_BR_NF_CALL_IPTABLES]);
+
+ br_opt_toggle(br, BROPT_NF_CALL_IPTABLES, !!val);
+ }
+
+ if (data[IFLA_BR_NF_CALL_IP6TABLES]) {
+ u8 val = nla_get_u8(data[IFLA_BR_NF_CALL_IP6TABLES]);
+
+ br_opt_toggle(br, BROPT_NF_CALL_IP6TABLES, !!val);
+ }
+
+ if (data[IFLA_BR_NF_CALL_ARPTABLES]) {
+ u8 val = nla_get_u8(data[IFLA_BR_NF_CALL_ARPTABLES]);
+
+ br_opt_toggle(br, BROPT_NF_CALL_ARPTABLES, !!val);
+ }
+#endif
+
+ if (data[IFLA_BR_MULTI_BOOLOPT]) {
+ struct br_boolopt_multi *bm;
+
+ bm = nla_data(data[IFLA_BR_MULTI_BOOLOPT]);
+ err = br_boolopt_multi_toggle(br, bm, extack);
+ if (err)
+ return err;
+ }
+
+ if (data[IFLA_BR_FDB_MAX_LEARNED]) {
+ u32 val = nla_get_u32(data[IFLA_BR_FDB_MAX_LEARNED]);
+
+ WRITE_ONCE(br->fdb_max_learned, val);
+ }
+
+ return 0;
+}
+
+static int br_dev_newlink(struct net_device *dev,
+ struct rtnl_newlink_params *params,
+ struct netlink_ext_ack *extack)
{
- rtnetlink_links[PF_BRIDGE] = bridge_rtnetlink_table;
+ struct net_bridge *br = netdev_priv(dev);
+ struct nlattr **data = params->data;
+ struct nlattr **tb = params->tb;
+ int err;
+
+ err = register_netdevice(dev);
+ if (err)
+ return err;
+
+ if (tb[IFLA_ADDRESS]) {
+ spin_lock_bh(&br->lock);
+ br_stp_change_bridge_id(br, nla_data(tb[IFLA_ADDRESS]));
+ spin_unlock_bh(&br->lock);
+ }
+
+ err = br_changelink(dev, tb, data, extack);
+ if (err)
+ br_dev_delete(dev, NULL);
+
+ return err;
}
-void __exit br_netlink_fini(void)
+static size_t br_get_size(const struct net_device *brdev)
{
- rtnetlink_links[PF_BRIDGE] = NULL;
+ return nla_total_size(sizeof(u32)) + /* IFLA_BR_FORWARD_DELAY */
+ nla_total_size(sizeof(u32)) + /* IFLA_BR_HELLO_TIME */
+ nla_total_size(sizeof(u32)) + /* IFLA_BR_MAX_AGE */
+ nla_total_size(sizeof(u32)) + /* IFLA_BR_AGEING_TIME */
+ nla_total_size(sizeof(u32)) + /* IFLA_BR_STP_STATE */
+ nla_total_size(sizeof(u16)) + /* IFLA_BR_PRIORITY */
+ nla_total_size(sizeof(u8)) + /* IFLA_BR_VLAN_FILTERING */
+#ifdef CONFIG_BRIDGE_VLAN_FILTERING
+ nla_total_size(sizeof(__be16)) + /* IFLA_BR_VLAN_PROTOCOL */
+ nla_total_size(sizeof(u16)) + /* IFLA_BR_VLAN_DEFAULT_PVID */
+ nla_total_size(sizeof(u8)) + /* IFLA_BR_VLAN_STATS_ENABLED */
+ nla_total_size(sizeof(u8)) + /* IFLA_BR_VLAN_STATS_PER_PORT */
+#endif
+ nla_total_size(sizeof(u16)) + /* IFLA_BR_GROUP_FWD_MASK */
+ nla_total_size(sizeof(struct ifla_bridge_id)) + /* IFLA_BR_ROOT_ID */
+ nla_total_size(sizeof(struct ifla_bridge_id)) + /* IFLA_BR_BRIDGE_ID */
+ nla_total_size(sizeof(u16)) + /* IFLA_BR_ROOT_PORT */
+ nla_total_size(sizeof(u32)) + /* IFLA_BR_ROOT_PATH_COST */
+ nla_total_size(sizeof(u8)) + /* IFLA_BR_TOPOLOGY_CHANGE */
+ nla_total_size(sizeof(u8)) + /* IFLA_BR_TOPOLOGY_CHANGE_DETECTED */
+ nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_HELLO_TIMER */
+ nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_TCN_TIMER */
+ nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_TOPOLOGY_CHANGE_TIMER */
+ nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_GC_TIMER */
+ nla_total_size(ETH_ALEN) + /* IFLA_BR_GROUP_ADDR */
+ nla_total_size(sizeof(u32)) + /* IFLA_BR_FDB_N_LEARNED */
+ nla_total_size(sizeof(u32)) + /* IFLA_BR_FDB_MAX_LEARNED */
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+ nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_ROUTER */
+ nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_SNOOPING */
+ nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_QUERY_USE_IFADDR */
+ nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_QUERIER */
+ nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_STATS_ENABLED */
+ nla_total_size(sizeof(u32)) + /* IFLA_BR_MCAST_HASH_ELASTICITY */
+ nla_total_size(sizeof(u32)) + /* IFLA_BR_MCAST_HASH_MAX */
+ nla_total_size(sizeof(u32)) + /* IFLA_BR_MCAST_LAST_MEMBER_CNT */
+ nla_total_size(sizeof(u32)) + /* IFLA_BR_MCAST_STARTUP_QUERY_CNT */
+ nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_LAST_MEMBER_INTVL */
+ nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_MEMBERSHIP_INTVL */
+ nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_QUERIER_INTVL */
+ nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_QUERY_INTVL */
+ nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_QUERY_RESPONSE_INTVL */
+ nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_STARTUP_QUERY_INTVL */
+ nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_IGMP_VERSION */
+ nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_MLD_VERSION */
+ br_multicast_querier_state_size() + /* IFLA_BR_MCAST_QUERIER_STATE */
+#endif
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+ nla_total_size(sizeof(u8)) + /* IFLA_BR_NF_CALL_IPTABLES */
+ nla_total_size(sizeof(u8)) + /* IFLA_BR_NF_CALL_IP6TABLES */
+ nla_total_size(sizeof(u8)) + /* IFLA_BR_NF_CALL_ARPTABLES */
+#endif
+ nla_total_size(sizeof(struct br_boolopt_multi)) + /* IFLA_BR_MULTI_BOOLOPT */
+ 0;
}
+static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev)
+{
+ struct net_bridge *br = netdev_priv(brdev);
+ u32 forward_delay = jiffies_to_clock_t(br->forward_delay);
+ u32 hello_time = jiffies_to_clock_t(br->hello_time);
+ u32 age_time = jiffies_to_clock_t(br->max_age);
+ u32 ageing_time = jiffies_to_clock_t(br->ageing_time);
+ u32 stp_enabled = br->stp_enabled;
+ u16 priority = (br->bridge_id.prio[0] << 8) | br->bridge_id.prio[1];
+ u8 vlan_enabled = br_vlan_enabled(br->dev);
+ struct br_boolopt_multi bm;
+ u64 clockval;
+
+ clockval = br_timer_value(&br->hello_timer);
+ if (nla_put_u64_64bit(skb, IFLA_BR_HELLO_TIMER, clockval, IFLA_BR_PAD))
+ return -EMSGSIZE;
+ clockval = br_timer_value(&br->tcn_timer);
+ if (nla_put_u64_64bit(skb, IFLA_BR_TCN_TIMER, clockval, IFLA_BR_PAD))
+ return -EMSGSIZE;
+ clockval = br_timer_value(&br->topology_change_timer);
+ if (nla_put_u64_64bit(skb, IFLA_BR_TOPOLOGY_CHANGE_TIMER, clockval,
+ IFLA_BR_PAD))
+ return -EMSGSIZE;
+ clockval = br_timer_value(&br->gc_work.timer);
+ if (nla_put_u64_64bit(skb, IFLA_BR_GC_TIMER, clockval, IFLA_BR_PAD))
+ return -EMSGSIZE;
+
+ br_boolopt_multi_get(br, &bm);
+ if (nla_put_u32(skb, IFLA_BR_FORWARD_DELAY, forward_delay) ||
+ nla_put_u32(skb, IFLA_BR_HELLO_TIME, hello_time) ||
+ nla_put_u32(skb, IFLA_BR_MAX_AGE, age_time) ||
+ nla_put_u32(skb, IFLA_BR_AGEING_TIME, ageing_time) ||
+ nla_put_u32(skb, IFLA_BR_STP_STATE, stp_enabled) ||
+ nla_put_u16(skb, IFLA_BR_PRIORITY, priority) ||
+ nla_put_u8(skb, IFLA_BR_VLAN_FILTERING, vlan_enabled) ||
+ nla_put_u16(skb, IFLA_BR_GROUP_FWD_MASK, br->group_fwd_mask) ||
+ nla_put(skb, IFLA_BR_BRIDGE_ID, sizeof(struct ifla_bridge_id),
+ &br->bridge_id) ||
+ nla_put(skb, IFLA_BR_ROOT_ID, sizeof(struct ifla_bridge_id),
+ &br->designated_root) ||
+ nla_put_u16(skb, IFLA_BR_ROOT_PORT, br->root_port) ||
+ nla_put_u32(skb, IFLA_BR_ROOT_PATH_COST, br->root_path_cost) ||
+ nla_put_u8(skb, IFLA_BR_TOPOLOGY_CHANGE, br->topology_change) ||
+ nla_put_u8(skb, IFLA_BR_TOPOLOGY_CHANGE_DETECTED,
+ br->topology_change_detected) ||
+ nla_put(skb, IFLA_BR_GROUP_ADDR, ETH_ALEN, br->group_addr) ||
+ nla_put(skb, IFLA_BR_MULTI_BOOLOPT, sizeof(bm), &bm) ||
+ nla_put_u32(skb, IFLA_BR_FDB_N_LEARNED,
+ atomic_read(&br->fdb_n_learned)) ||
+ nla_put_u32(skb, IFLA_BR_FDB_MAX_LEARNED, br->fdb_max_learned))
+ return -EMSGSIZE;
+
+#ifdef CONFIG_BRIDGE_VLAN_FILTERING
+ if (nla_put_be16(skb, IFLA_BR_VLAN_PROTOCOL, br->vlan_proto) ||
+ nla_put_u16(skb, IFLA_BR_VLAN_DEFAULT_PVID, br->default_pvid) ||
+ nla_put_u8(skb, IFLA_BR_VLAN_STATS_ENABLED,
+ br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) ||
+ nla_put_u8(skb, IFLA_BR_VLAN_STATS_PER_PORT,
+ br_opt_get(br, BROPT_VLAN_STATS_PER_PORT)))
+ return -EMSGSIZE;
+#endif
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+ if (nla_put_u8(skb, IFLA_BR_MCAST_ROUTER,
+ br->multicast_ctx.multicast_router) ||
+ nla_put_u8(skb, IFLA_BR_MCAST_SNOOPING,
+ br_opt_get(br, BROPT_MULTICAST_ENABLED)) ||
+ nla_put_u8(skb, IFLA_BR_MCAST_QUERY_USE_IFADDR,
+ br_opt_get(br, BROPT_MULTICAST_QUERY_USE_IFADDR)) ||
+ nla_put_u8(skb, IFLA_BR_MCAST_QUERIER,
+ br->multicast_ctx.multicast_querier) ||
+ nla_put_u8(skb, IFLA_BR_MCAST_STATS_ENABLED,
+ br_opt_get(br, BROPT_MULTICAST_STATS_ENABLED)) ||
+ nla_put_u32(skb, IFLA_BR_MCAST_HASH_ELASTICITY, RHT_ELASTICITY) ||
+ nla_put_u32(skb, IFLA_BR_MCAST_HASH_MAX, br->hash_max) ||
+ nla_put_u32(skb, IFLA_BR_MCAST_LAST_MEMBER_CNT,
+ br->multicast_ctx.multicast_last_member_count) ||
+ nla_put_u32(skb, IFLA_BR_MCAST_STARTUP_QUERY_CNT,
+ br->multicast_ctx.multicast_startup_query_count) ||
+ nla_put_u8(skb, IFLA_BR_MCAST_IGMP_VERSION,
+ br->multicast_ctx.multicast_igmp_version) ||
+ br_multicast_dump_querier_state(skb, &br->multicast_ctx,
+ IFLA_BR_MCAST_QUERIER_STATE))
+ return -EMSGSIZE;
+#if IS_ENABLED(CONFIG_IPV6)
+ if (nla_put_u8(skb, IFLA_BR_MCAST_MLD_VERSION,
+ br->multicast_ctx.multicast_mld_version))
+ return -EMSGSIZE;
+#endif
+ clockval = jiffies_to_clock_t(br->multicast_ctx.multicast_last_member_interval);
+ if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_LAST_MEMBER_INTVL, clockval,
+ IFLA_BR_PAD))
+ return -EMSGSIZE;
+ clockval = jiffies_to_clock_t(br->multicast_ctx.multicast_membership_interval);
+ if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_MEMBERSHIP_INTVL, clockval,
+ IFLA_BR_PAD))
+ return -EMSGSIZE;
+ clockval = jiffies_to_clock_t(br->multicast_ctx.multicast_querier_interval);
+ if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_QUERIER_INTVL, clockval,
+ IFLA_BR_PAD))
+ return -EMSGSIZE;
+ clockval = jiffies_to_clock_t(br->multicast_ctx.multicast_query_interval);
+ if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_QUERY_INTVL, clockval,
+ IFLA_BR_PAD))
+ return -EMSGSIZE;
+ clockval = jiffies_to_clock_t(br->multicast_ctx.multicast_query_response_interval);
+ if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_QUERY_RESPONSE_INTVL, clockval,
+ IFLA_BR_PAD))
+ return -EMSGSIZE;
+ clockval = jiffies_to_clock_t(br->multicast_ctx.multicast_startup_query_interval);
+ if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_STARTUP_QUERY_INTVL, clockval,
+ IFLA_BR_PAD))
+ return -EMSGSIZE;
+#endif
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+ if (nla_put_u8(skb, IFLA_BR_NF_CALL_IPTABLES,
+ br_opt_get(br, BROPT_NF_CALL_IPTABLES) ? 1 : 0) ||
+ nla_put_u8(skb, IFLA_BR_NF_CALL_IP6TABLES,
+ br_opt_get(br, BROPT_NF_CALL_IP6TABLES) ? 1 : 0) ||
+ nla_put_u8(skb, IFLA_BR_NF_CALL_ARPTABLES,
+ br_opt_get(br, BROPT_NF_CALL_ARPTABLES) ? 1 : 0))
+ return -EMSGSIZE;
+#endif
+
+ return 0;
+}
+
+static size_t br_get_linkxstats_size(const struct net_device *dev, int attr)
+{
+ struct net_bridge_port *p = NULL;
+ struct net_bridge_vlan_group *vg;
+ struct net_bridge_vlan *v;
+ struct net_bridge *br;
+ int numvls = 0;
+
+ switch (attr) {
+ case IFLA_STATS_LINK_XSTATS:
+ br = netdev_priv(dev);
+ vg = br_vlan_group(br);
+ break;
+ case IFLA_STATS_LINK_XSTATS_SLAVE:
+ p = br_port_get_rtnl(dev);
+ if (!p)
+ return 0;
+ vg = nbp_vlan_group(p);
+ break;
+ default:
+ return 0;
+ }
+
+ if (vg) {
+ /* we need to count all, even placeholder entries */
+ list_for_each_entry(v, &vg->vlan_list, vlist)
+ numvls++;
+ }
+
+ return numvls * nla_total_size(sizeof(struct bridge_vlan_xstats)) +
+ nla_total_size_64bit(sizeof(struct br_mcast_stats)) +
+ (p ? nla_total_size_64bit(sizeof(p->stp_xstats)) : 0) +
+ nla_total_size(0);
+}
+
+static int br_fill_linkxstats(struct sk_buff *skb,
+ const struct net_device *dev,
+ int *prividx, int attr)
+{
+ struct nlattr *nla __maybe_unused;
+ struct net_bridge_port *p = NULL;
+ struct net_bridge_vlan_group *vg;
+ struct net_bridge_vlan *v;
+ struct net_bridge *br;
+ struct nlattr *nest;
+ int vl_idx = 0;
+
+ switch (attr) {
+ case IFLA_STATS_LINK_XSTATS:
+ br = netdev_priv(dev);
+ vg = br_vlan_group(br);
+ break;
+ case IFLA_STATS_LINK_XSTATS_SLAVE:
+ p = br_port_get_rtnl(dev);
+ if (!p)
+ return 0;
+ br = p->br;
+ vg = nbp_vlan_group(p);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ nest = nla_nest_start_noflag(skb, LINK_XSTATS_TYPE_BRIDGE);
+ if (!nest)
+ return -EMSGSIZE;
+
+ if (vg) {
+ u16 pvid;
+
+ pvid = br_get_pvid(vg);
+ list_for_each_entry(v, &vg->vlan_list, vlist) {
+ struct bridge_vlan_xstats vxi;
+ struct pcpu_sw_netstats stats;
+
+ if (++vl_idx < *prividx)
+ continue;
+ memset(&vxi, 0, sizeof(vxi));
+ vxi.vid = v->vid;
+ vxi.flags = v->flags;
+ if (v->vid == pvid)
+ vxi.flags |= BRIDGE_VLAN_INFO_PVID;
+ br_vlan_get_stats(v, &stats);
+ vxi.rx_bytes = u64_stats_read(&stats.rx_bytes);
+ vxi.rx_packets = u64_stats_read(&stats.rx_packets);
+ vxi.tx_bytes = u64_stats_read(&stats.tx_bytes);
+ vxi.tx_packets = u64_stats_read(&stats.tx_packets);
+
+ if (nla_put(skb, BRIDGE_XSTATS_VLAN, sizeof(vxi), &vxi))
+ goto nla_put_failure;
+ }
+ }
+
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+ if (++vl_idx >= *prividx) {
+ nla = nla_reserve_64bit(skb, BRIDGE_XSTATS_MCAST,
+ sizeof(struct br_mcast_stats),
+ BRIDGE_XSTATS_PAD);
+ if (!nla)
+ goto nla_put_failure;
+ br_multicast_get_stats(br, p, nla_data(nla));
+ }
+#endif
+
+ if (p) {
+ nla = nla_reserve_64bit(skb, BRIDGE_XSTATS_STP,
+ sizeof(p->stp_xstats),
+ BRIDGE_XSTATS_PAD);
+ if (!nla)
+ goto nla_put_failure;
+
+ spin_lock_bh(&br->lock);
+ memcpy(nla_data(nla), &p->stp_xstats, sizeof(p->stp_xstats));
+ spin_unlock_bh(&br->lock);
+ }
+
+ nla_nest_end(skb, nest);
+ *prividx = 0;
+
+ return 0;
+
+nla_put_failure:
+ nla_nest_end(skb, nest);
+ *prividx = vl_idx;
+
+ return -EMSGSIZE;
+}
+
+static struct rtnl_af_ops br_af_ops __read_mostly = {
+ .family = AF_BRIDGE,
+ .get_link_af_size = br_get_link_af_size_filtered,
+};
+
+struct rtnl_link_ops br_link_ops __read_mostly = {
+ .kind = "bridge",
+ .priv_size = sizeof(struct net_bridge),
+ .setup = br_dev_setup,
+ .maxtype = IFLA_BR_MAX,
+ .policy = br_policy,
+ .validate = br_validate,
+ .newlink = br_dev_newlink,
+ .changelink = br_changelink,
+ .dellink = br_dev_delete,
+ .get_size = br_get_size,
+ .fill_info = br_fill_info,
+ .fill_linkxstats = br_fill_linkxstats,
+ .get_linkxstats_size = br_get_linkxstats_size,
+
+ .slave_maxtype = IFLA_BRPORT_MAX,
+ .slave_policy = br_port_policy,
+ .slave_changelink = br_port_slave_changelink,
+ .get_slave_size = br_port_get_slave_size,
+ .fill_slave_info = br_port_fill_slave_info,
+};
+
+int __init br_netlink_init(void)
+{
+ int err;
+
+ err = br_vlan_rtnl_init();
+ if (err)
+ goto out;
+
+ err = rtnl_af_register(&br_af_ops);
+ if (err)
+ goto out_vlan;
+
+ err = rtnl_link_register(&br_link_ops);
+ if (err)
+ goto out_af;
+
+ return 0;
+
+out_af:
+ rtnl_af_unregister(&br_af_ops);
+out_vlan:
+ br_vlan_rtnl_uninit();
+out:
+ return err;
+}
+
+void br_netlink_fini(void)
+{
+ br_vlan_rtnl_uninit();
+ rtnl_af_unregister(&br_af_ops);
+ rtnl_link_unregister(&br_link_ops);
+}
diff --git a/net/bridge/br_netlink_tunnel.c b/net/bridge/br_netlink_tunnel.c
new file mode 100644
index 000000000000..71a12da30004
--- /dev/null
+++ b/net/bridge/br_netlink_tunnel.c
@@ -0,0 +1,342 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Bridge per vlan tunnel port dst_metadata netlink control interface
+ *
+ * Authors:
+ * Roopa Prabhu <roopa@cumulusnetworks.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/etherdevice.h>
+#include <net/rtnetlink.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <uapi/linux/if_bridge.h>
+#include <net/dst_metadata.h>
+
+#include "br_private.h"
+#include "br_private_tunnel.h"
+
+static size_t __get_vlan_tinfo_size(void)
+{
+ return nla_total_size(0) + /* nest IFLA_BRIDGE_VLAN_TUNNEL_INFO */
+ nla_total_size(sizeof(u32)) + /* IFLA_BRIDGE_VLAN_TUNNEL_ID */
+ nla_total_size(sizeof(u16)) + /* IFLA_BRIDGE_VLAN_TUNNEL_VID */
+ nla_total_size(sizeof(u16)); /* IFLA_BRIDGE_VLAN_TUNNEL_FLAGS */
+}
+
+bool vlan_tunid_inrange(const struct net_bridge_vlan *v_curr,
+ const struct net_bridge_vlan *v_last)
+{
+ __be32 tunid_curr = tunnel_id_to_key32(v_curr->tinfo.tunnel_id);
+ __be32 tunid_last = tunnel_id_to_key32(v_last->tinfo.tunnel_id);
+
+ return (be32_to_cpu(tunid_curr) - be32_to_cpu(tunid_last)) == 1;
+}
+
+static int __get_num_vlan_tunnel_infos(struct net_bridge_vlan_group *vg)
+{
+ struct net_bridge_vlan *v, *vtbegin = NULL, *vtend = NULL;
+ int num_tinfos = 0;
+
+ /* Count number of vlan infos */
+ list_for_each_entry_rcu(v, &vg->vlan_list, vlist) {
+ /* only a context, bridge vlan not activated */
+ if (!br_vlan_should_use(v) || !v->tinfo.tunnel_id)
+ continue;
+
+ if (!vtbegin) {
+ goto initvars;
+ } else if ((v->vid - vtend->vid) == 1 &&
+ vlan_tunid_inrange(v, vtend)) {
+ vtend = v;
+ continue;
+ } else {
+ if ((vtend->vid - vtbegin->vid) > 0)
+ num_tinfos += 2;
+ else
+ num_tinfos += 1;
+ }
+initvars:
+ vtbegin = v;
+ vtend = v;
+ }
+
+ if (vtbegin && vtend) {
+ if ((vtend->vid - vtbegin->vid) > 0)
+ num_tinfos += 2;
+ else
+ num_tinfos += 1;
+ }
+
+ return num_tinfos;
+}
+
+int br_get_vlan_tunnel_info_size(struct net_bridge_vlan_group *vg)
+{
+ int num_tinfos;
+
+ if (!vg)
+ return 0;
+
+ rcu_read_lock();
+ num_tinfos = __get_num_vlan_tunnel_infos(vg);
+ rcu_read_unlock();
+
+ return num_tinfos * __get_vlan_tinfo_size();
+}
+
+static int br_fill_vlan_tinfo(struct sk_buff *skb, u16 vid,
+ __be64 tunnel_id, u16 flags)
+{
+ __be32 tid = tunnel_id_to_key32(tunnel_id);
+ struct nlattr *tmap;
+
+ tmap = nla_nest_start_noflag(skb, IFLA_BRIDGE_VLAN_TUNNEL_INFO);
+ if (!tmap)
+ return -EMSGSIZE;
+ if (nla_put_u32(skb, IFLA_BRIDGE_VLAN_TUNNEL_ID,
+ be32_to_cpu(tid)))
+ goto nla_put_failure;
+ if (nla_put_u16(skb, IFLA_BRIDGE_VLAN_TUNNEL_VID,
+ vid))
+ goto nla_put_failure;
+ if (nla_put_u16(skb, IFLA_BRIDGE_VLAN_TUNNEL_FLAGS,
+ flags))
+ goto nla_put_failure;
+ nla_nest_end(skb, tmap);
+
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, tmap);
+
+ return -EMSGSIZE;
+}
+
+static int br_fill_vlan_tinfo_range(struct sk_buff *skb,
+ struct net_bridge_vlan *vtbegin,
+ struct net_bridge_vlan *vtend)
+{
+ int err;
+
+ if (vtend && (vtend->vid - vtbegin->vid) > 0) {
+ /* add range to skb */
+ err = br_fill_vlan_tinfo(skb, vtbegin->vid,
+ vtbegin->tinfo.tunnel_id,
+ BRIDGE_VLAN_INFO_RANGE_BEGIN);
+ if (err)
+ return err;
+
+ err = br_fill_vlan_tinfo(skb, vtend->vid,
+ vtend->tinfo.tunnel_id,
+ BRIDGE_VLAN_INFO_RANGE_END);
+ if (err)
+ return err;
+ } else {
+ err = br_fill_vlan_tinfo(skb, vtbegin->vid,
+ vtbegin->tinfo.tunnel_id,
+ 0);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+int br_fill_vlan_tunnel_info(struct sk_buff *skb,
+ struct net_bridge_vlan_group *vg)
+{
+ struct net_bridge_vlan *vtbegin = NULL;
+ struct net_bridge_vlan *vtend = NULL;
+ struct net_bridge_vlan *v;
+ int err;
+
+ /* Count number of vlan infos */
+ list_for_each_entry_rcu(v, &vg->vlan_list, vlist) {
+ /* only a context, bridge vlan not activated */
+ if (!br_vlan_should_use(v))
+ continue;
+
+ if (!v->tinfo.tunnel_dst)
+ continue;
+
+ if (!vtbegin) {
+ goto initvars;
+ } else if ((v->vid - vtend->vid) == 1 &&
+ vlan_tunid_inrange(v, vtend)) {
+ vtend = v;
+ continue;
+ } else {
+ err = br_fill_vlan_tinfo_range(skb, vtbegin, vtend);
+ if (err)
+ return err;
+ }
+initvars:
+ vtbegin = v;
+ vtend = v;
+ }
+
+ if (vtbegin) {
+ err = br_fill_vlan_tinfo_range(skb, vtbegin, vtend);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+static const struct nla_policy vlan_tunnel_policy[IFLA_BRIDGE_VLAN_TUNNEL_MAX + 1] = {
+ [IFLA_BRIDGE_VLAN_TUNNEL_UNSPEC] = {
+ .strict_start_type = IFLA_BRIDGE_VLAN_TUNNEL_FLAGS + 1
+ },
+ [IFLA_BRIDGE_VLAN_TUNNEL_ID] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_VLAN_TUNNEL_VID] = { .type = NLA_U16 },
+ [IFLA_BRIDGE_VLAN_TUNNEL_FLAGS] = { .type = NLA_U16 },
+};
+
+int br_vlan_tunnel_info(const struct net_bridge_port *p, int cmd,
+ u16 vid, u32 tun_id, bool *changed)
+{
+ int err = 0;
+
+ if (!p)
+ return -EINVAL;
+
+ switch (cmd) {
+ case RTM_SETLINK:
+ err = nbp_vlan_tunnel_info_add(p, vid, tun_id);
+ if (!err)
+ *changed = true;
+ break;
+ case RTM_DELLINK:
+ if (!nbp_vlan_tunnel_info_delete(p, vid))
+ *changed = true;
+ break;
+ }
+
+ return err;
+}
+
+int br_parse_vlan_tunnel_info(struct nlattr *attr,
+ struct vtunnel_info *tinfo)
+{
+ struct nlattr *tb[IFLA_BRIDGE_VLAN_TUNNEL_MAX + 1];
+ u32 tun_id;
+ u16 vid, flags = 0;
+ int err;
+
+ memset(tinfo, 0, sizeof(*tinfo));
+
+ err = nla_parse_nested_deprecated(tb, IFLA_BRIDGE_VLAN_TUNNEL_MAX,
+ attr, vlan_tunnel_policy, NULL);
+ if (err < 0)
+ return err;
+
+ if (!tb[IFLA_BRIDGE_VLAN_TUNNEL_ID] ||
+ !tb[IFLA_BRIDGE_VLAN_TUNNEL_VID])
+ return -EINVAL;
+
+ tun_id = nla_get_u32(tb[IFLA_BRIDGE_VLAN_TUNNEL_ID]);
+ vid = nla_get_u16(tb[IFLA_BRIDGE_VLAN_TUNNEL_VID]);
+ if (vid >= VLAN_VID_MASK)
+ return -ERANGE;
+
+ if (tb[IFLA_BRIDGE_VLAN_TUNNEL_FLAGS])
+ flags = nla_get_u16(tb[IFLA_BRIDGE_VLAN_TUNNEL_FLAGS]);
+
+ tinfo->tunid = tun_id;
+ tinfo->vid = vid;
+ tinfo->flags = flags;
+
+ return 0;
+}
+
+/* send a notification if v_curr can't enter the range and start a new one */
+static void __vlan_tunnel_handle_range(const struct net_bridge_port *p,
+ struct net_bridge_vlan **v_start,
+ struct net_bridge_vlan **v_end,
+ int v_curr, bool curr_change)
+{
+ struct net_bridge_vlan_group *vg;
+ struct net_bridge_vlan *v;
+
+ vg = nbp_vlan_group(p);
+ if (!vg)
+ return;
+
+ v = br_vlan_find(vg, v_curr);
+
+ if (!*v_start)
+ goto out_init;
+
+ if (v && curr_change && br_vlan_can_enter_range(v, *v_end)) {
+ *v_end = v;
+ return;
+ }
+
+ br_vlan_notify(p->br, p, (*v_start)->vid, (*v_end)->vid, RTM_NEWVLAN);
+out_init:
+ /* we start a range only if there are any changes to notify about */
+ *v_start = curr_change ? v : NULL;
+ *v_end = *v_start;
+}
+
+int br_process_vlan_tunnel_info(const struct net_bridge *br,
+ const struct net_bridge_port *p, int cmd,
+ struct vtunnel_info *tinfo_curr,
+ struct vtunnel_info *tinfo_last,
+ bool *changed)
+{
+ int err;
+
+ if (tinfo_curr->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN) {
+ if (tinfo_last->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN)
+ return -EINVAL;
+ memcpy(tinfo_last, tinfo_curr, sizeof(struct vtunnel_info));
+ } else if (tinfo_curr->flags & BRIDGE_VLAN_INFO_RANGE_END) {
+ struct net_bridge_vlan *v_start = NULL, *v_end = NULL;
+ int t, v;
+
+ if (!(tinfo_last->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN))
+ return -EINVAL;
+ if ((tinfo_curr->vid - tinfo_last->vid) !=
+ (tinfo_curr->tunid - tinfo_last->tunid))
+ return -EINVAL;
+ t = tinfo_last->tunid;
+ for (v = tinfo_last->vid; v <= tinfo_curr->vid; v++) {
+ bool curr_change = false;
+
+ err = br_vlan_tunnel_info(p, cmd, v, t, &curr_change);
+ if (err)
+ break;
+ t++;
+
+ if (curr_change)
+ *changed = curr_change;
+ __vlan_tunnel_handle_range(p, &v_start, &v_end, v,
+ curr_change);
+ }
+ if (v_start && v_end)
+ br_vlan_notify(br, p, v_start->vid, v_end->vid,
+ RTM_NEWVLAN);
+ if (err)
+ return err;
+
+ memset(tinfo_last, 0, sizeof(struct vtunnel_info));
+ memset(tinfo_curr, 0, sizeof(struct vtunnel_info));
+ } else {
+ if (tinfo_last->flags)
+ return -EINVAL;
+ err = br_vlan_tunnel_info(p, cmd, tinfo_curr->vid,
+ tinfo_curr->tunid, changed);
+ if (err)
+ return err;
+ br_vlan_notify(br, p, tinfo_curr->vid, 0, RTM_NEWVLAN);
+ memset(tinfo_last, 0, sizeof(struct vtunnel_info));
+ memset(tinfo_curr, 0, sizeof(struct vtunnel_info));
+ }
+
+ return 0;
+}
diff --git a/net/bridge/br_nf_core.c b/net/bridge/br_nf_core.c
new file mode 100644
index 000000000000..a8c67035e23c
--- /dev/null
+++ b/net/bridge/br_nf_core.c
@@ -0,0 +1,88 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Handle firewalling core
+ * Linux ethernet bridge
+ *
+ * Authors:
+ * Lennert Buytenhek <buytenh@gnu.org>
+ * Bart De Schuymer <bdschuym@pandora.be>
+ *
+ * Lennert dedicates this file to Kerstin Wurdinger.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/in_route.h>
+#include <linux/inetdevice.h>
+#include <net/route.h>
+
+#include "br_private.h"
+#ifdef CONFIG_SYSCTL
+#include <linux/sysctl.h>
+#endif
+
+static void fake_update_pmtu(struct dst_entry *dst, struct sock *sk,
+ struct sk_buff *skb, u32 mtu,
+ bool confirm_neigh)
+{
+}
+
+static void fake_redirect(struct dst_entry *dst, struct sock *sk,
+ struct sk_buff *skb)
+{
+}
+
+static u32 *fake_cow_metrics(struct dst_entry *dst, unsigned long old)
+{
+ return NULL;
+}
+
+static struct neighbour *fake_neigh_lookup(const struct dst_entry *dst,
+ struct sk_buff *skb,
+ const void *daddr)
+{
+ return NULL;
+}
+
+static unsigned int fake_mtu(const struct dst_entry *dst)
+{
+ return dst->dev->mtu;
+}
+
+static struct dst_ops fake_dst_ops = {
+ .family = AF_INET,
+ .update_pmtu = fake_update_pmtu,
+ .redirect = fake_redirect,
+ .cow_metrics = fake_cow_metrics,
+ .neigh_lookup = fake_neigh_lookup,
+ .mtu = fake_mtu,
+};
+
+/*
+ * Initialize bogus route table used to keep netfilter happy.
+ * Currently, we fill in the PMTU entry because netfilter
+ * refragmentation needs it, and the rt_flags entry because
+ * ipt_REJECT needs it. Future netfilter modules might
+ * require us to fill additional fields.
+ */
+void br_netfilter_rtable_init(struct net_bridge *br)
+{
+ struct rtable *rt = &br->fake_rtable;
+
+ rcuref_init(&rt->dst.__rcuref, 1);
+ rt->dst.dev = br->dev;
+ dst_init_metrics(&rt->dst, br->metrics, false);
+ dst_metric_set(&rt->dst, RTAX_MTU, br->dev->mtu);
+ rt->dst.flags = DST_NOXFRM | DST_FAKE_RTABLE;
+ rt->dst.ops = &fake_dst_ops;
+}
+
+int __init br_nf_core_init(void)
+{
+ return dst_entries_init(&fake_dst_ops);
+}
+
+void br_nf_core_fini(void)
+{
+ dst_entries_destroy(&fake_dst_ops);
+}
diff --git a/net/bridge/br_notify.c b/net/bridge/br_notify.c
deleted file mode 100644
index 20278494e4da..000000000000
--- a/net/bridge/br_notify.c
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Device event handling
- * Linux ethernet bridge
- *
- * Authors:
- * Lennert Buytenhek <buytenh@gnu.org>
- *
- * $Id: br_notify.c,v 1.2 2000/02/21 15:51:34 davem Exp $
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <linux/kernel.h>
-#include <linux/rtnetlink.h>
-
-#include "br_private.h"
-
-static int br_device_event(struct notifier_block *unused, unsigned long event, void *ptr);
-
-struct notifier_block br_device_notifier = {
- .notifier_call = br_device_event
-};
-
-/*
- * Handle changes in state of network devices enslaved to a bridge.
- *
- * Note: don't care about up/down if bridge itself is down, because
- * port state is checked when bridge is brought up.
- */
-static int br_device_event(struct notifier_block *unused, unsigned long event, void *ptr)
-{
- struct net_device *dev = ptr;
- struct net_bridge_port *p = dev->br_port;
- struct net_bridge *br;
-
- /* not a port of a bridge */
- if (p == NULL)
- return NOTIFY_DONE;
-
- br = p->br;
-
- spin_lock_bh(&br->lock);
- switch (event) {
- case NETDEV_CHANGEMTU:
- dev_set_mtu(br->dev, br_min_mtu(br));
- break;
-
- case NETDEV_CHANGEADDR:
- br_fdb_changeaddr(p, dev->dev_addr);
- br_ifinfo_notify(RTM_NEWLINK, p);
- br_stp_recalculate_bridge_id(br);
- break;
-
- case NETDEV_CHANGE:
- if (br->dev->flags & IFF_UP)
- schedule_delayed_work(&p->carrier_check, BR_PORT_DEBOUNCE);
- break;
-
- case NETDEV_FEAT_CHANGE:
- if (br->dev->flags & IFF_UP)
- br_features_recompute(br);
-
- /* could do recursive feature change notification
- * but who would care??
- */
- break;
-
- case NETDEV_DOWN:
- if (br->dev->flags & IFF_UP)
- br_stp_disable_port(p);
- break;
-
- case NETDEV_UP:
- if (netif_carrier_ok(dev) && (br->dev->flags & IFF_UP))
- br_stp_enable_port(p);
- break;
-
- case NETDEV_UNREGISTER:
- spin_unlock_bh(&br->lock);
- br_del_if(br, dev);
- goto done;
- }
- spin_unlock_bh(&br->lock);
-
- done:
- return NOTIFY_DONE;
-}
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 74258d86f256..7280c4e9305f 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -1,23 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* Linux ethernet bridge
*
* Authors:
* Lennert Buytenhek <buytenh@gnu.org>
- *
- * $Id: br_private.h,v 1.7 2001/12/24 00:59:55 davem Exp $
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#ifndef _BR_PRIVATE_H
#define _BR_PRIVATE_H
#include <linux/netdevice.h>
-#include <linux/miscdevice.h>
#include <linux/if_bridge.h>
+#include <linux/netpoll.h>
+#include <linux/u64_stats_sync.h>
+#include <net/route.h>
+#include <net/ip6_fib.h>
+#include <net/pkt_cls.h>
+#include <linux/if_vlan.h>
+#include <linux/rhashtable.h>
+#include <linux/refcount.h>
#define BR_HASH_BITS 8
#define BR_HASH_SIZE (1 << BR_HASH_BITS)
@@ -27,44 +28,373 @@
#define BR_PORT_BITS 10
#define BR_MAX_PORTS (1<<BR_PORT_BITS)
-#define BR_PORT_DEBOUNCE (HZ/10)
+#define BR_MULTICAST_DEFAULT_HASH_MAX 4096
+#define BR_MULTICAST_QUERY_INTVL_MIN msecs_to_jiffies(1000)
+#define BR_MULTICAST_STARTUP_QUERY_INTVL_MIN BR_MULTICAST_QUERY_INTVL_MIN
+#define BR_MULTICAST_QUERY_INTVL_MAX msecs_to_jiffies(86400000) /* 24 hours */
+#define BR_MULTICAST_STARTUP_QUERY_INTVL_MAX BR_MULTICAST_QUERY_INTVL_MAX
+
+#define BR_HWDOM_MAX BITS_PER_LONG
+
+#define BR_VERSION "2.3"
+
+/* Control of forwarding link local multicast */
+#define BR_GROUPFWD_DEFAULT 0
+/* Don't allow forwarding of control protocols like STP, MAC PAUSE and LACP */
+enum {
+ BR_GROUPFWD_STP = BIT(0),
+ BR_GROUPFWD_MACPAUSE = BIT(1),
+ BR_GROUPFWD_LACP = BIT(2),
+};
+
+#define BR_GROUPFWD_RESTRICTED (BR_GROUPFWD_STP | BR_GROUPFWD_MACPAUSE | \
+ BR_GROUPFWD_LACP)
+/* The Nearest Customer Bridge Group Address, 01-80-C2-00-00-[00,0B,0C,0D,0F] */
+#define BR_GROUPFWD_8021AD 0xB801u
+
+/* Path to usermode spanning tree program */
+#define BR_STP_PROG "/sbin/bridge-stp"
-#define BR_VERSION "2.2"
+#define BR_FDB_NOTIFY_SETTABLE_BITS (FDB_NOTIFY_BIT | FDB_NOTIFY_INACTIVE_BIT)
typedef struct bridge_id bridge_id;
typedef struct mac_addr mac_addr;
typedef __u16 port_id;
-struct bridge_id
-{
+struct bridge_id {
unsigned char prio[2];
- unsigned char addr[6];
+ unsigned char addr[ETH_ALEN];
};
-struct mac_addr
-{
- unsigned char addr[6];
+struct mac_addr {
+ unsigned char addr[ETH_ALEN];
};
-struct net_bridge_fdb_entry
-{
- struct hlist_node hlist;
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+/* our own querier */
+struct bridge_mcast_own_query {
+ struct timer_list timer;
+ u32 startup_sent;
+};
+
+/* other querier */
+struct bridge_mcast_other_query {
+ struct timer_list timer;
+ struct timer_list delay_timer;
+};
+
+/* selected querier */
+struct bridge_mcast_querier {
+ struct br_ip addr;
+ int port_ifidx;
+ seqcount_spinlock_t seq;
+};
+
+/* IGMP/MLD statistics */
+struct bridge_mcast_stats {
+ struct br_mcast_stats mstats;
+ struct u64_stats_sync syncp;
+};
+
+struct br_mdb_src_entry {
+ struct br_ip addr;
+};
+
+struct br_mdb_config {
+ struct net_bridge *br;
+ struct net_bridge_port *p;
+ struct br_mdb_entry *entry;
+ struct br_ip group;
+ bool src_entry;
+ u8 filter_mode;
+ u16 nlflags;
+ struct br_mdb_src_entry *src_entries;
+ int num_src_entries;
+ u8 rt_protocol;
+};
+#endif
+
+/* net_bridge_mcast_port must be always defined due to forwarding stubs */
+struct net_bridge_mcast_port {
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+ struct net_bridge_port *port;
+ struct net_bridge_vlan *vlan;
+
+ struct bridge_mcast_own_query ip4_own_query;
+ struct timer_list ip4_mc_router_timer;
+ struct hlist_node ip4_rlist;
+#if IS_ENABLED(CONFIG_IPV6)
+ struct bridge_mcast_own_query ip6_own_query;
+ struct timer_list ip6_mc_router_timer;
+ struct hlist_node ip6_rlist;
+#endif /* IS_ENABLED(CONFIG_IPV6) */
+ unsigned char multicast_router;
+ u32 mdb_n_entries;
+ u32 mdb_max_entries;
+#endif /* CONFIG_BRIDGE_IGMP_SNOOPING */
+};
+
+/* net_bridge_mcast must be always defined due to forwarding stubs */
+struct net_bridge_mcast {
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+ struct net_bridge *br;
+ struct net_bridge_vlan *vlan;
+
+ u32 multicast_last_member_count;
+ u32 multicast_startup_query_count;
+
+ u8 multicast_querier;
+ u8 multicast_igmp_version;
+ u8 multicast_router;
+#if IS_ENABLED(CONFIG_IPV6)
+ u8 multicast_mld_version;
+#endif
+ unsigned long multicast_last_member_interval;
+ unsigned long multicast_membership_interval;
+ unsigned long multicast_querier_interval;
+ unsigned long multicast_query_interval;
+ unsigned long multicast_query_response_interval;
+ unsigned long multicast_startup_query_interval;
+ struct hlist_head ip4_mc_router_list;
+ struct timer_list ip4_mc_router_timer;
+ struct bridge_mcast_other_query ip4_other_query;
+ struct bridge_mcast_own_query ip4_own_query;
+ struct bridge_mcast_querier ip4_querier;
+#if IS_ENABLED(CONFIG_IPV6)
+ struct hlist_head ip6_mc_router_list;
+ struct timer_list ip6_mc_router_timer;
+ struct bridge_mcast_other_query ip6_other_query;
+ struct bridge_mcast_own_query ip6_own_query;
+ struct bridge_mcast_querier ip6_querier;
+#endif /* IS_ENABLED(CONFIG_IPV6) */
+#endif /* CONFIG_BRIDGE_IGMP_SNOOPING */
+};
+
+struct br_tunnel_info {
+ __be64 tunnel_id;
+ struct metadata_dst __rcu *tunnel_dst;
+};
+
+/* private vlan flags */
+enum {
+ BR_VLFLAG_PER_PORT_STATS = BIT(0),
+ BR_VLFLAG_ADDED_BY_SWITCHDEV = BIT(1),
+ BR_VLFLAG_MCAST_ENABLED = BIT(2),
+ BR_VLFLAG_GLOBAL_MCAST_ENABLED = BIT(3),
+ BR_VLFLAG_NEIGH_SUPPRESS_ENABLED = BIT(4),
+};
+
+/**
+ * struct net_bridge_vlan - per-vlan entry
+ *
+ * @vnode: rhashtable member
+ * @tnode: rhashtable member
+ * @vid: VLAN id
+ * @flags: bridge vlan flags
+ * @priv_flags: private (in-kernel) bridge vlan flags
+ * @state: STP state (e.g. blocking, learning, forwarding)
+ * @stats: per-cpu VLAN statistics
+ * @br: if MASTER flag set, this points to a bridge struct
+ * @port: if MASTER flag unset, this points to a port struct
+ * @refcnt: if MASTER flag set, this is bumped for each port referencing it
+ * @brvlan: if MASTER flag unset, this points to the global per-VLAN context
+ * for this VLAN entry
+ * @tinfo: bridge tunnel info
+ * @br_mcast_ctx: if MASTER flag set, this is the global vlan multicast context
+ * @port_mcast_ctx: if MASTER flag unset, this is the per-port/vlan multicast
+ * context
+ * @msti: if MASTER flag set, this holds the VLANs MST instance
+ * @vlist: sorted list of VLAN entries
+ * @rcu: used for entry destruction
+ *
+ * This structure is shared between the global per-VLAN entries contained in
+ * the bridge rhashtable and the local per-port per-VLAN entries contained in
+ * the port's rhashtable. The union entries should be interpreted depending on
+ * the entry flags that are set.
+ */
+struct net_bridge_vlan {
+ struct rhash_head vnode;
+ struct rhash_head tnode;
+ u16 vid;
+ u16 flags;
+ u16 priv_flags;
+ u8 state;
+ struct pcpu_sw_netstats __percpu *stats;
+ union {
+ struct net_bridge *br;
+ struct net_bridge_port *port;
+ };
+ union {
+ refcount_t refcnt;
+ struct net_bridge_vlan *brvlan;
+ };
+
+ struct br_tunnel_info tinfo;
+
+ union {
+ struct net_bridge_mcast br_mcast_ctx;
+ struct net_bridge_mcast_port port_mcast_ctx;
+ };
+
+ u16 msti;
+
+ struct list_head vlist;
+
+ struct rcu_head rcu;
+};
+
+/**
+ * struct net_bridge_vlan_group
+ *
+ * @vlan_hash: VLAN entry rhashtable
+ * @vlan_list: sorted VLAN entry list
+ * @num_vlans: number of total VLAN entries
+ * @pvid: PVID VLAN id
+ * @pvid_state: PVID's STP state (e.g. forwarding, learning, blocking)
+ *
+ * IMPORTANT: Be careful when checking if there're VLAN entries using list
+ * primitives because the bridge can have entries in its list which
+ * are just for global context but not for filtering, i.e. they have
+ * the master flag set but not the brentry flag. If you have to check
+ * if there're "real" entries in the bridge please test @num_vlans
+ */
+struct net_bridge_vlan_group {
+ struct rhashtable vlan_hash;
+ struct rhashtable tunnel_hash;
+ struct list_head vlan_list;
+ u16 num_vlans;
+ u16 pvid;
+ u8 pvid_state;
+};
+
+/* bridge fdb flags */
+enum {
+ BR_FDB_LOCAL,
+ BR_FDB_STATIC,
+ BR_FDB_STICKY,
+ BR_FDB_ADDED_BY_USER,
+ BR_FDB_ADDED_BY_EXT_LEARN,
+ BR_FDB_OFFLOADED,
+ BR_FDB_NOTIFY,
+ BR_FDB_NOTIFY_INACTIVE,
+ BR_FDB_LOCKED,
+ BR_FDB_DYNAMIC_LEARNED,
+};
+
+struct net_bridge_fdb_key {
+ mac_addr addr;
+ u16 vlan_id;
+};
+
+struct net_bridge_fdb_entry {
+ struct rhash_head rhnode;
struct net_bridge_port *dst;
+ struct net_bridge_fdb_key key;
+ struct hlist_node fdb_node;
+ unsigned long flags;
+
+ /* write-heavy members should not affect lookups */
+ unsigned long updated ____cacheline_aligned_in_smp;
+ unsigned long used;
+
struct rcu_head rcu;
- atomic_t use_count;
- unsigned long ageing_timer;
- mac_addr addr;
- unsigned char is_local;
- unsigned char is_static;
};
-struct net_bridge_port
-{
+struct net_bridge_fdb_flush_desc {
+ unsigned long flags;
+ unsigned long flags_mask;
+ int port_ifindex;
+ u16 vlan_id;
+};
+
+#define MDB_PG_FLAGS_PERMANENT BIT(0)
+#define MDB_PG_FLAGS_OFFLOAD BIT(1)
+#define MDB_PG_FLAGS_FAST_LEAVE BIT(2)
+#define MDB_PG_FLAGS_STAR_EXCL BIT(3)
+#define MDB_PG_FLAGS_BLOCKED BIT(4)
+#define MDB_PG_FLAGS_OFFLOAD_FAILED BIT(5)
+
+#define PG_SRC_ENT_LIMIT 32
+
+#define BR_SGRP_F_DELETE BIT(0)
+#define BR_SGRP_F_SEND BIT(1)
+#define BR_SGRP_F_INSTALLED BIT(2)
+#define BR_SGRP_F_USER_ADDED BIT(3)
+
+struct net_bridge_mcast_gc {
+ struct hlist_node gc_node;
+ void (*destroy)(struct net_bridge_mcast_gc *gc);
+};
+
+struct net_bridge_group_src {
+ struct hlist_node node;
+
+ struct br_ip addr;
+ struct net_bridge_port_group *pg;
+ u8 flags;
+ u8 src_query_rexmit_cnt;
+ struct timer_list timer;
+
+ struct net_bridge *br;
+ struct net_bridge_mcast_gc mcast_gc;
+ struct rcu_head rcu;
+};
+
+struct net_bridge_port_group_sg_key {
+ struct net_bridge_port *port;
+ struct br_ip addr;
+};
+
+struct net_bridge_port_group {
+ struct net_bridge_port_group __rcu *next;
+ struct net_bridge_port_group_sg_key key;
+ unsigned char eth_addr[ETH_ALEN] __aligned(2);
+ unsigned char flags;
+ unsigned char filter_mode;
+ unsigned char grp_query_rexmit_cnt;
+ unsigned char rt_protocol;
+
+ struct hlist_head src_list;
+ unsigned int src_ents;
+ struct timer_list timer;
+ struct timer_list rexmit_timer;
+ struct hlist_node mglist;
+ struct rb_root eht_set_tree;
+ struct rb_root eht_host_tree;
+
+ struct rhash_head rhnode;
+ struct net_bridge_mcast_gc mcast_gc;
+ struct rcu_head rcu;
+};
+
+struct net_bridge_mdb_entry {
+ struct rhash_head rhnode;
+ struct net_bridge *br;
+ struct net_bridge_port_group __rcu *ports;
+ struct br_ip addr;
+ bool host_joined;
+
+ struct timer_list timer;
+ struct hlist_node mdb_node;
+
+ struct net_bridge_mcast_gc mcast_gc;
+ struct rcu_head rcu;
+};
+
+struct net_bridge_port {
struct net_bridge *br;
struct net_device *dev;
+ netdevice_tracker dev_tracker;
struct list_head list;
+ unsigned long flags;
+#ifdef CONFIG_BRIDGE_VLAN_FILTERING
+ struct net_bridge_vlan_group __rcu *vlgrp;
+#endif
+ struct net_bridge_port __rcu *backup_port;
+ u32 backup_nhid;
+
/* STP */
u8 priority;
u8 state;
@@ -77,53 +407,244 @@ struct net_bridge_port
bridge_id designated_bridge;
u32 path_cost;
u32 designated_cost;
+ unsigned long designated_age;
struct timer_list forward_delay_timer;
struct timer_list hold_timer;
struct timer_list message_age_timer;
struct kobject kobj;
- struct work_struct carrier_check;
struct rcu_head rcu;
+
+ struct net_bridge_mcast_port multicast_ctx;
+
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+ struct bridge_mcast_stats __percpu *mcast_stats;
+
+ u32 multicast_eht_hosts_limit;
+ u32 multicast_eht_hosts_cnt;
+ struct hlist_head mglist;
+#endif
+
+#ifdef CONFIG_SYSFS
+ char sysfs_name[IFNAMSIZ];
+#endif
+
+#ifdef CONFIG_NET_POLL_CONTROLLER
+ struct netpoll *np;
+#endif
+#ifdef CONFIG_NET_SWITCHDEV
+ /* Identifier used to group ports that share the same switchdev
+ * hardware domain.
+ */
+ int hwdom;
+ int offload_count;
+ struct netdev_phys_item_id ppid;
+#endif
+ u16 group_fwd_mask;
+ u16 backup_redirected_cnt;
+
+ struct bridge_stp_xstats stp_xstats;
};
-struct net_bridge
+#define kobj_to_brport(obj) container_of(obj, struct net_bridge_port, kobj)
+
+#define br_auto_port(p) ((p)->flags & BR_AUTO_MASK)
+#define br_promisc_port(p) ((p)->flags & BR_PROMISC)
+
+static inline struct net_bridge_port *br_port_get_rcu(const struct net_device *dev)
+{
+ return rcu_dereference(dev->rx_handler_data);
+}
+
+static inline struct net_bridge_port *br_port_get_rtnl(const struct net_device *dev)
{
+ return netif_is_bridge_port(dev) ?
+ rtnl_dereference(dev->rx_handler_data) : NULL;
+}
+
+static inline struct net_bridge_port *br_port_get_rtnl_rcu(const struct net_device *dev)
+{
+ return netif_is_bridge_port(dev) ?
+ rcu_dereference_rtnl(dev->rx_handler_data) : NULL;
+}
+
+enum net_bridge_opts {
+ BROPT_VLAN_ENABLED,
+ BROPT_VLAN_STATS_ENABLED,
+ BROPT_NF_CALL_IPTABLES,
+ BROPT_NF_CALL_IP6TABLES,
+ BROPT_NF_CALL_ARPTABLES,
+ BROPT_GROUP_ADDR_SET,
+ BROPT_MULTICAST_ENABLED,
+ BROPT_MULTICAST_QUERY_USE_IFADDR,
+ BROPT_MULTICAST_STATS_ENABLED,
+ BROPT_HAS_IPV6_ADDR,
+ BROPT_NEIGH_SUPPRESS_ENABLED,
+ BROPT_MTU_SET_BY_USER,
+ BROPT_VLAN_STATS_PER_PORT,
+ BROPT_NO_LL_LEARN,
+ BROPT_VLAN_BRIDGE_BINDING,
+ BROPT_MCAST_VLAN_SNOOPING_ENABLED,
+ BROPT_MST_ENABLED,
+ BROPT_MDB_OFFLOAD_FAIL_NOTIFICATION,
+ BROPT_FDB_LOCAL_VLAN_0,
+};
+
+struct net_bridge {
spinlock_t lock;
- struct list_head port_list;
- struct net_device *dev;
- struct net_device_stats statistics;
spinlock_t hash_lock;
- struct hlist_head hash[BR_HASH_SIZE];
- struct list_head age_list;
- unsigned long feature_mask;
+ struct hlist_head frame_type_list;
+ struct net_device *dev;
+ unsigned long options;
+ /* These fields are accessed on each packet */
+#ifdef CONFIG_BRIDGE_VLAN_FILTERING
+ __be16 vlan_proto;
+ u16 default_pvid;
+ struct net_bridge_vlan_group __rcu *vlgrp;
+#endif
+
+ struct rhashtable fdb_hash_tbl;
+ struct list_head port_list;
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+ union {
+ struct rtable fake_rtable;
+ struct rt6_info fake_rt6_info;
+ };
+ u32 metrics[RTAX_MAX];
+#endif
+ u16 group_fwd_mask;
+ u16 group_fwd_mask_required;
/* STP */
bridge_id designated_root;
bridge_id bridge_id;
- u32 root_path_cost;
+ unsigned char topology_change;
+ unsigned char topology_change_detected;
+ u16 root_port;
unsigned long max_age;
unsigned long hello_time;
unsigned long forward_delay;
- unsigned long bridge_max_age;
unsigned long ageing_time;
+ unsigned long bridge_max_age;
unsigned long bridge_hello_time;
unsigned long bridge_forward_delay;
+ unsigned long bridge_ageing_time;
+ u32 root_path_cost;
u8 group_addr[ETH_ALEN];
- u16 root_port;
- unsigned char stp_enabled;
- unsigned char topology_change;
- unsigned char topology_change_detected;
+
+ enum {
+ BR_NO_STP, /* no spanning tree */
+ BR_KERNEL_STP, /* old STP in kernel */
+ BR_USER_STP, /* new RSTP in userspace */
+ } stp_enabled;
+
+ struct net_bridge_mcast multicast_ctx;
+
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+ struct bridge_mcast_stats __percpu *mcast_stats;
+
+ u32 hash_max;
+
+ spinlock_t multicast_lock;
+
+ struct rhashtable mdb_hash_tbl;
+ struct rhashtable sg_port_tbl;
+
+ struct hlist_head mcast_gc_list;
+ struct hlist_head mdb_list;
+
+ struct work_struct mcast_gc_work;
+#endif
struct timer_list hello_timer;
struct timer_list tcn_timer;
struct timer_list topology_change_timer;
- struct timer_list gc_timer;
- struct kobject ifobj;
+ struct delayed_work gc_work;
+ struct kobject *ifobj;
+ u32 auto_cnt;
+
+ atomic_t fdb_n_learned;
+ u32 fdb_max_learned;
+
+#ifdef CONFIG_NET_SWITCHDEV
+ /* Counter used to make sure that hardware domains get unique
+ * identifiers in case a bridge spans multiple switchdev instances.
+ */
+ int last_hwdom;
+ /* Bit mask of hardware domain numbers in use */
+ unsigned long busy_hwdoms;
+#endif
+ struct hlist_head fdb_list;
+
+#if IS_ENABLED(CONFIG_BRIDGE_MRP)
+ struct hlist_head mrp_list;
+#endif
+#if IS_ENABLED(CONFIG_BRIDGE_CFM)
+ struct hlist_head mep_list;
+#endif
};
-extern struct notifier_block br_device_notifier;
-extern const u8 br_group_address[ETH_ALEN];
+struct br_input_skb_cb {
+ struct net_device *brdev;
+
+ u16 frag_max_size;
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+ u8 igmp;
+ u8 mrouters_only:1;
+#endif
+ u8 proxyarp_replied:1;
+ u8 src_port_isolated:1;
+ u8 promisc:1;
+#ifdef CONFIG_BRIDGE_VLAN_FILTERING
+ u8 vlan_filtered:1;
+#endif
+#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE
+ u8 br_netfilter_broute:1;
+#endif
+
+#ifdef CONFIG_NET_SWITCHDEV
+ /* Set if TX data plane offloading is used towards at least one
+ * hardware domain.
+ */
+ u8 tx_fwd_offload:1;
+ /* The switchdev hardware domain from which this packet was received.
+ * If skb->offload_fwd_mark was set, then this packet was already
+ * forwarded by hardware to the other ports in the source hardware
+ * domain, otherwise it wasn't.
+ */
+ int src_hwdom;
+ /* Bit mask of hardware domains towards this packet has already been
+ * transmitted using the TX data plane offload.
+ */
+ unsigned long fwd_hwdoms;
+#endif
+
+ u32 backup_nhid;
+};
+
+#define BR_INPUT_SKB_CB(__skb) ((struct br_input_skb_cb *)(__skb)->cb)
+
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+# define BR_INPUT_SKB_CB_MROUTERS_ONLY(__skb) (BR_INPUT_SKB_CB(__skb)->mrouters_only)
+#else
+# define BR_INPUT_SKB_CB_MROUTERS_ONLY(__skb) (0)
+#endif
+
+#define br_printk(level, br, format, args...) \
+ printk(level "%s: " format, (br)->dev->name, ##args)
+
+#define br_err(__br, format, args...) \
+ br_printk(KERN_ERR, __br, format, ##args)
+#define br_warn(__br, format, args...) \
+ br_printk(KERN_WARNING, __br, format, ##args)
+#define br_notice(__br, format, args...) \
+ br_printk(KERN_NOTICE, __br, format, ##args)
+#define br_info(__br, format, args...) \
+ br_printk(KERN_INFO, __br, format, ##args)
+
+#define br_debug(br, format, args...) \
+ pr_debug("%s: " format, (br)->dev->name, ##args)
/* called under bridge lock */
static inline int br_is_root_bridge(const struct net_bridge *br)
@@ -131,131 +652,1694 @@ static inline int br_is_root_bridge(const struct net_bridge *br)
return !memcmp(&br->bridge_id, &br->designated_root, 8);
}
+/* check if a VLAN entry is global */
+static inline bool br_vlan_is_master(const struct net_bridge_vlan *v)
+{
+ return v->flags & BRIDGE_VLAN_INFO_MASTER;
+}
+
+/* check if a VLAN entry is used by the bridge */
+static inline bool br_vlan_is_brentry(const struct net_bridge_vlan *v)
+{
+ return v->flags & BRIDGE_VLAN_INFO_BRENTRY;
+}
+
+/* check if we should use the vlan entry, returns false if it's only context */
+static inline bool br_vlan_should_use(const struct net_bridge_vlan *v)
+{
+ if (br_vlan_is_master(v)) {
+ if (br_vlan_is_brentry(v))
+ return true;
+ else
+ return false;
+ }
+
+ return true;
+}
+
+static inline bool nbp_state_should_learn(const struct net_bridge_port *p)
+{
+ return p->state == BR_STATE_LEARNING || p->state == BR_STATE_FORWARDING;
+}
+
+static inline bool br_vlan_valid_id(u16 vid, struct netlink_ext_ack *extack)
+{
+ bool ret = vid > 0 && vid < VLAN_VID_MASK;
+
+ if (!ret)
+ NL_SET_ERR_MSG_MOD(extack, "Vlan id is invalid");
+
+ return ret;
+}
+
+static inline bool br_vlan_valid_range(const struct bridge_vlan_info *cur,
+ const struct bridge_vlan_info *last,
+ struct netlink_ext_ack *extack)
+{
+ /* pvid flag is not allowed in ranges */
+ if (cur->flags & BRIDGE_VLAN_INFO_PVID) {
+ NL_SET_ERR_MSG_MOD(extack, "Pvid isn't allowed in a range");
+ return false;
+ }
+
+ /* when cur is the range end, check if:
+ * - it has range start flag
+ * - range ids are invalid (end is equal to or before start)
+ */
+ if (last) {
+ if (cur->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN) {
+ NL_SET_ERR_MSG_MOD(extack, "Found a new vlan range start while processing one");
+ return false;
+ } else if (!(cur->flags & BRIDGE_VLAN_INFO_RANGE_END)) {
+ NL_SET_ERR_MSG_MOD(extack, "Vlan range end flag is missing");
+ return false;
+ } else if (cur->vid <= last->vid) {
+ NL_SET_ERR_MSG_MOD(extack, "End vlan id is less than or equal to start vlan id");
+ return false;
+ }
+ }
+
+ /* check for required range flags */
+ if (!(cur->flags & (BRIDGE_VLAN_INFO_RANGE_BEGIN |
+ BRIDGE_VLAN_INFO_RANGE_END))) {
+ NL_SET_ERR_MSG_MOD(extack, "Both vlan range flags are missing");
+ return false;
+ }
+
+ return true;
+}
+
+static inline u8 br_vlan_multicast_router(const struct net_bridge_vlan *v)
+{
+ u8 mcast_router = MDB_RTR_TYPE_DISABLED;
+
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+ if (!br_vlan_is_master(v))
+ mcast_router = v->port_mcast_ctx.multicast_router;
+ else
+ mcast_router = v->br_mcast_ctx.multicast_router;
+#endif
+
+ return mcast_router;
+}
+
+static inline int br_afspec_cmd_to_rtm(int cmd)
+{
+ switch (cmd) {
+ case RTM_SETLINK:
+ return RTM_NEWVLAN;
+ case RTM_DELLINK:
+ return RTM_DELVLAN;
+ }
+
+ return 0;
+}
+
+static inline int br_opt_get(const struct net_bridge *br,
+ enum net_bridge_opts opt)
+{
+ return test_bit(opt, &br->options);
+}
+
+int br_boolopt_toggle(struct net_bridge *br, enum br_boolopt_id opt, bool on,
+ struct netlink_ext_ack *extack);
+int br_boolopt_get(const struct net_bridge *br, enum br_boolopt_id opt);
+int br_boolopt_multi_toggle(struct net_bridge *br,
+ struct br_boolopt_multi *bm,
+ struct netlink_ext_ack *extack);
+void br_boolopt_multi_get(const struct net_bridge *br,
+ struct br_boolopt_multi *bm);
+void br_opt_toggle(struct net_bridge *br, enum net_bridge_opts opt, bool on);
+
+#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
+static inline void br_tc_skb_miss_set(struct sk_buff *skb, bool miss)
+{
+ struct tc_skb_ext *ext;
+
+ if (!tc_skb_ext_tc_enabled())
+ return;
+
+ ext = skb_ext_find(skb, TC_SKB_EXT);
+ if (ext) {
+ ext->l2_miss = miss;
+ return;
+ }
+ if (!miss)
+ return;
+ ext = tc_skb_ext_alloc(skb);
+ if (!ext)
+ return;
+ ext->l2_miss = true;
+}
+#else
+static inline void br_tc_skb_miss_set(struct sk_buff *skb, bool miss)
+{
+}
+#endif
/* br_device.c */
-extern void br_dev_setup(struct net_device *dev);
-extern int br_dev_xmit(struct sk_buff *skb, struct net_device *dev);
+void br_dev_setup(struct net_device *dev);
+void br_dev_delete(struct net_device *dev, struct list_head *list);
+netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev);
+#ifdef CONFIG_NET_POLL_CONTROLLER
+static inline void br_netpoll_send_skb(const struct net_bridge_port *p,
+ struct sk_buff *skb)
+{
+ netpoll_send_skb(p->np, skb);
+}
+
+int br_netpoll_enable(struct net_bridge_port *p);
+void br_netpoll_disable(struct net_bridge_port *p);
+#else
+static inline void br_netpoll_send_skb(const struct net_bridge_port *p,
+ struct sk_buff *skb)
+{
+}
+
+static inline int br_netpoll_enable(struct net_bridge_port *p)
+{
+ return 0;
+}
+
+static inline void br_netpoll_disable(struct net_bridge_port *p)
+{
+}
+#endif
/* br_fdb.c */
-extern void br_fdb_init(void);
-extern void br_fdb_fini(void);
-extern void br_fdb_changeaddr(struct net_bridge_port *p,
- const unsigned char *newaddr);
-extern void br_fdb_cleanup(unsigned long arg);
-extern void br_fdb_delete_by_port(struct net_bridge *br,
- const struct net_bridge_port *p, int do_all);
-extern struct net_bridge_fdb_entry *__br_fdb_get(struct net_bridge *br,
- const unsigned char *addr);
-extern struct net_bridge_fdb_entry *br_fdb_get(struct net_bridge *br,
- unsigned char *addr);
-extern void br_fdb_put(struct net_bridge_fdb_entry *ent);
-extern int br_fdb_fillbuf(struct net_bridge *br, void *buf,
- unsigned long count, unsigned long off);
-extern int br_fdb_insert(struct net_bridge *br,
- struct net_bridge_port *source,
- const unsigned char *addr);
-extern void br_fdb_update(struct net_bridge *br,
- struct net_bridge_port *source,
- const unsigned char *addr);
+#define FDB_FLUSH_IGNORED_NDM_FLAGS (NTF_MASTER | NTF_SELF)
+#define FDB_FLUSH_ALLOWED_NDM_STATES (NUD_PERMANENT | NUD_NOARP)
+#define FDB_FLUSH_ALLOWED_NDM_FLAGS (NTF_USE | NTF_EXT_LEARNED | \
+ NTF_STICKY | NTF_OFFLOADED)
+
+int br_fdb_init(void);
+void br_fdb_fini(void);
+int br_fdb_hash_init(struct net_bridge *br);
+void br_fdb_hash_fini(struct net_bridge *br);
+void br_fdb_flush(struct net_bridge *br,
+ const struct net_bridge_fdb_flush_desc *desc);
+void br_fdb_find_delete_local(struct net_bridge *br,
+ const struct net_bridge_port *p,
+ const unsigned char *addr, u16 vid);
+void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr);
+void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr);
+void br_fdb_cleanup(struct work_struct *work);
+int br_fdb_toggle_local_vlan_0(struct net_bridge *br, bool on,
+ struct netlink_ext_ack *extack);
+void br_fdb_delete_by_port(struct net_bridge *br,
+ const struct net_bridge_port *p, u16 vid, int do_all);
+struct net_bridge_fdb_entry *br_fdb_find_rcu(struct net_bridge *br,
+ const unsigned char *addr,
+ __u16 vid);
+int br_fdb_test_addr(struct net_device *dev, unsigned char *addr);
+int br_fdb_fillbuf(struct net_bridge *br, void *buf, unsigned long count,
+ unsigned long off);
+int br_fdb_add_local(struct net_bridge *br, struct net_bridge_port *source,
+ const unsigned char *addr, u16 vid);
+void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
+ const unsigned char *addr, u16 vid, unsigned long flags);
+
+int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
+ struct net_device *dev, const unsigned char *addr, u16 vid,
+ bool *notified, struct netlink_ext_ack *extack);
+int br_fdb_delete_bulk(struct nlmsghdr *nlh, struct net_device *dev,
+ struct netlink_ext_ack *extack);
+int br_fdb_add(struct ndmsg *nlh, struct nlattr *tb[], struct net_device *dev,
+ const unsigned char *addr, u16 vid, u16 nlh_flags,
+ bool *notified, struct netlink_ext_ack *extack);
+int br_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
+ struct net_device *dev, struct net_device *fdev, int *idx);
+int br_fdb_get(struct sk_buff *skb, struct nlattr *tb[], struct net_device *dev,
+ const unsigned char *addr, u16 vid, u32 portid, u32 seq,
+ struct netlink_ext_ack *extack);
+int br_fdb_sync_static(struct net_bridge *br, struct net_bridge_port *p);
+void br_fdb_unsync_static(struct net_bridge *br, struct net_bridge_port *p);
+int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p,
+ const unsigned char *addr, u16 vid,
+ bool locked, bool swdev_notify);
+int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p,
+ const unsigned char *addr, u16 vid,
+ bool swdev_notify);
+void br_fdb_offloaded_set(struct net_bridge *br, struct net_bridge_port *p,
+ const unsigned char *addr, u16 vid, bool offloaded);
/* br_forward.c */
-extern void br_deliver(const struct net_bridge_port *to,
- struct sk_buff *skb);
-extern int br_dev_queue_push_xmit(struct sk_buff *skb);
-extern void br_forward(const struct net_bridge_port *to,
- struct sk_buff *skb);
-extern int br_forward_finish(struct sk_buff *skb);
-extern void br_flood_deliver(struct net_bridge *br,
- struct sk_buff *skb,
- int clone);
-extern void br_flood_forward(struct net_bridge *br,
- struct sk_buff *skb,
- int clone);
+enum br_pkt_type {
+ BR_PKT_UNICAST,
+ BR_PKT_MULTICAST,
+ BR_PKT_BROADCAST
+};
+int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb);
+void br_forward(const struct net_bridge_port *to, struct sk_buff *skb,
+ bool local_rcv, bool local_orig);
+int br_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb);
+void br_flood(struct net_bridge *br, struct sk_buff *skb,
+ enum br_pkt_type pkt_type, bool local_rcv, bool local_orig,
+ u16 vid);
+
+/* return true if both source port and dest port are isolated */
+static inline bool br_skb_isolated(const struct net_bridge_port *to,
+ const struct sk_buff *skb)
+{
+ return BR_INPUT_SKB_CB(skb)->src_port_isolated &&
+ (to->flags & BR_ISOLATED);
+}
/* br_if.c */
-extern int br_add_bridge(const char *name);
-extern int br_del_bridge(const char *name);
-extern void br_cleanup_bridges(void);
-extern int br_add_if(struct net_bridge *br,
- struct net_device *dev);
-extern int br_del_if(struct net_bridge *br,
- struct net_device *dev);
-extern int br_min_mtu(const struct net_bridge *br);
-extern void br_features_recompute(struct net_bridge *br);
+void br_port_carrier_check(struct net_bridge_port *p, bool *notified);
+int br_add_bridge(struct net *net, const char *name);
+int br_del_bridge(struct net *net, const char *name);
+int br_add_if(struct net_bridge *br, struct net_device *dev,
+ struct netlink_ext_ack *extack);
+int br_del_if(struct net_bridge *br, struct net_device *dev);
+void br_mtu_auto_adjust(struct net_bridge *br);
+netdev_features_t br_features_recompute(struct net_bridge *br,
+ netdev_features_t features);
+void br_port_flags_change(struct net_bridge_port *port, unsigned long mask);
+void br_manage_promisc(struct net_bridge *br);
+int nbp_backup_change(struct net_bridge_port *p, struct net_device *backup_dev);
/* br_input.c */
-extern int br_handle_frame_finish(struct sk_buff *skb);
-extern int br_handle_frame(struct net_bridge_port *p, struct sk_buff **pskb);
+int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb);
+rx_handler_func_t *br_get_rx_handler(const struct net_device *dev);
+
+struct br_frame_type {
+ __be16 type;
+ int (*frame_handler)(struct net_bridge_port *port,
+ struct sk_buff *skb);
+ struct hlist_node list;
+};
+
+void br_add_frame(struct net_bridge *br, struct br_frame_type *ft);
+void br_del_frame(struct net_bridge *br, struct br_frame_type *ft);
+
+static inline bool br_rx_handler_check_rcu(const struct net_device *dev)
+{
+ return rcu_dereference(dev->rx_handler) == br_get_rx_handler(dev);
+}
+
+static inline bool br_rx_handler_check_rtnl(const struct net_device *dev)
+{
+ return rcu_dereference_rtnl(dev->rx_handler) == br_get_rx_handler(dev);
+}
+
+static inline struct net_bridge_port *br_port_get_check_rcu(const struct net_device *dev)
+{
+ return br_rx_handler_check_rcu(dev) ? br_port_get_rcu(dev) : NULL;
+}
+
+static inline struct net_bridge_port *
+br_port_get_check_rtnl(const struct net_device *dev)
+{
+ return br_rx_handler_check_rtnl(dev) ? br_port_get_rtnl_rcu(dev) : NULL;
+}
/* br_ioctl.c */
-extern int br_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd);
-extern int br_ioctl_deviceless_stub(unsigned int cmd, void __user *arg);
+int br_dev_siocdevprivate(struct net_device *dev, struct ifreq *rq,
+ void __user *data, int cmd);
+int br_ioctl_stub(struct net *net, unsigned int cmd, void __user *uarg);
+
+/* br_multicast.c */
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+int br_multicast_rcv(struct net_bridge_mcast **brmctx,
+ struct net_bridge_mcast_port **pmctx,
+ struct net_bridge_vlan *vlan,
+ struct sk_buff *skb, u16 vid);
+struct net_bridge_mdb_entry *
+br_mdb_entry_skb_get(struct net_bridge_mcast *brmctx, struct sk_buff *skb,
+ u16 vid);
+int br_multicast_add_port(struct net_bridge_port *port);
+void br_multicast_del_port(struct net_bridge_port *port);
+void br_multicast_enable_port(struct net_bridge_port *port);
+void br_multicast_disable_port(struct net_bridge_port *port);
+void br_multicast_init(struct net_bridge *br);
+void br_multicast_join_snoopers(struct net_bridge *br);
+void br_multicast_leave_snoopers(struct net_bridge *br);
+void br_multicast_open(struct net_bridge *br);
+void br_multicast_stop(struct net_bridge *br);
+void br_multicast_dev_del(struct net_bridge *br);
+void br_multicast_flood(struct net_bridge_mdb_entry *mdst, struct sk_buff *skb,
+ struct net_bridge_mcast *brmctx,
+ bool local_rcv, bool local_orig);
+int br_multicast_set_router(struct net_bridge_mcast *brmctx, unsigned long val);
+int br_multicast_set_port_router(struct net_bridge_mcast_port *pmctx,
+ unsigned long val);
+int br_multicast_set_vlan_router(struct net_bridge_vlan *v, u8 mcast_router);
+int br_multicast_toggle(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack);
+int br_multicast_set_querier(struct net_bridge_mcast *brmctx, unsigned long val);
+int br_multicast_set_igmp_version(struct net_bridge_mcast *brmctx,
+ unsigned long val);
+#if IS_ENABLED(CONFIG_IPV6)
+int br_multicast_set_mld_version(struct net_bridge_mcast *brmctx,
+ unsigned long val);
+#endif
+struct net_bridge_mdb_entry *
+br_mdb_ip_get(struct net_bridge *br, struct br_ip *dst);
+struct net_bridge_mdb_entry *
+br_multicast_new_group(struct net_bridge *br, struct br_ip *group);
+struct net_bridge_port_group *
+br_multicast_new_port_group(struct net_bridge_port *port,
+ const struct br_ip *group,
+ struct net_bridge_port_group __rcu *next,
+ unsigned char flags, const unsigned char *src,
+ u8 filter_mode, u8 rt_protocol,
+ struct netlink_ext_ack *extack);
+void br_multicast_del_port_group(struct net_bridge_port_group *p);
+int br_mdb_hash_init(struct net_bridge *br);
+void br_mdb_hash_fini(struct net_bridge *br);
+void br_mdb_notify(struct net_device *dev, struct net_bridge_mdb_entry *mp,
+ struct net_bridge_port_group *pg, int type);
+void br_mdb_flag_change_notify(struct net_device *dev, struct net_bridge_mdb_entry *mp,
+ struct net_bridge_port_group *pg);
+void br_rtr_notify(struct net_device *dev, struct net_bridge_mcast_port *pmctx,
+ int type);
+void br_multicast_del_pg(struct net_bridge_mdb_entry *mp,
+ struct net_bridge_port_group *pg,
+ struct net_bridge_port_group __rcu **pp);
+void br_multicast_count(struct net_bridge *br,
+ const struct net_bridge_port *p,
+ const struct sk_buff *skb, u8 type, u8 dir);
+int br_multicast_init_stats(struct net_bridge *br);
+void br_multicast_uninit_stats(struct net_bridge *br);
+void br_multicast_get_stats(const struct net_bridge *br,
+ const struct net_bridge_port *p,
+ struct br_mcast_stats *dest);
+u32 br_multicast_ngroups_get(const struct net_bridge_mcast_port *pmctx);
+void br_multicast_ngroups_set_max(struct net_bridge_mcast_port *pmctx, u32 max);
+u32 br_multicast_ngroups_get_max(const struct net_bridge_mcast_port *pmctx);
+int br_mdb_add(struct net_device *dev, struct nlattr *tb[], u16 nlmsg_flags,
+ struct netlink_ext_ack *extack);
+int br_mdb_del(struct net_device *dev, struct nlattr *tb[],
+ struct netlink_ext_ack *extack);
+int br_mdb_del_bulk(struct net_device *dev, struct nlattr *tb[],
+ struct netlink_ext_ack *extack);
+int br_mdb_dump(struct net_device *dev, struct sk_buff *skb,
+ struct netlink_callback *cb);
+int br_mdb_get(struct net_device *dev, struct nlattr *tb[], u32 portid, u32 seq,
+ struct netlink_ext_ack *extack);
+void br_multicast_host_join(const struct net_bridge_mcast *brmctx,
+ struct net_bridge_mdb_entry *mp, bool notify);
+void br_multicast_host_leave(struct net_bridge_mdb_entry *mp, bool notify);
+void br_multicast_star_g_handle_mode(struct net_bridge_port_group *pg,
+ u8 filter_mode);
+void br_multicast_sg_add_exclude_ports(struct net_bridge_mdb_entry *star_mp,
+ struct net_bridge_port_group *sg);
+struct net_bridge_group_src *
+br_multicast_find_group_src(struct net_bridge_port_group *pg, struct br_ip *ip);
+struct net_bridge_group_src *
+br_multicast_new_group_src(struct net_bridge_port_group *pg,
+ struct br_ip *src_ip);
+void __br_multicast_del_group_src(struct net_bridge_group_src *src);
+void br_multicast_del_group_src(struct net_bridge_group_src *src,
+ bool fastleave);
+void br_multicast_ctx_init(struct net_bridge *br,
+ struct net_bridge_vlan *vlan,
+ struct net_bridge_mcast *brmctx);
+void br_multicast_ctx_deinit(struct net_bridge_mcast *brmctx);
+void br_multicast_port_ctx_init(struct net_bridge_port *port,
+ struct net_bridge_vlan *vlan,
+ struct net_bridge_mcast_port *pmctx);
+void br_multicast_port_ctx_deinit(struct net_bridge_mcast_port *pmctx);
+void br_multicast_update_vlan_mcast_ctx(struct net_bridge_vlan *v, u8 state);
+void br_multicast_toggle_one_vlan(struct net_bridge_vlan *vlan, bool on);
+int br_multicast_toggle_vlan_snooping(struct net_bridge *br, bool on,
+ struct netlink_ext_ack *extack);
+bool br_multicast_toggle_global_vlan(struct net_bridge_vlan *vlan, bool on);
+
+int br_rports_fill_info(struct sk_buff *skb,
+ const struct net_bridge_mcast *brmctx);
+int br_multicast_dump_querier_state(struct sk_buff *skb,
+ const struct net_bridge_mcast *brmctx,
+ int nest_attr);
+size_t br_multicast_querier_state_size(void);
+size_t br_rports_size(const struct net_bridge_mcast *brmctx);
+void br_multicast_set_query_intvl(struct net_bridge_mcast *brmctx,
+ unsigned long val);
+void br_multicast_set_startup_query_intvl(struct net_bridge_mcast *brmctx,
+ unsigned long val);
+
+static inline bool br_group_is_l2(const struct br_ip *group)
+{
+ return group->proto == 0;
+}
+
+#define mlock_dereference(X, br) \
+ rcu_dereference_protected(X, lockdep_is_held(&br->multicast_lock))
+
+static inline struct hlist_node *
+br_multicast_get_first_rport_node(struct net_bridge_mcast *brmctx,
+ struct sk_buff *skb)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ if (skb->protocol == htons(ETH_P_IPV6))
+ return rcu_dereference(hlist_first_rcu(&brmctx->ip6_mc_router_list));
+#endif
+ return rcu_dereference(hlist_first_rcu(&brmctx->ip4_mc_router_list));
+}
+
+static inline struct net_bridge_port *
+br_multicast_rport_from_node_skb(struct hlist_node *rp, struct sk_buff *skb)
+{
+ struct net_bridge_mcast_port *mctx;
+
+#if IS_ENABLED(CONFIG_IPV6)
+ if (skb->protocol == htons(ETH_P_IPV6))
+ mctx = hlist_entry_safe(rp, struct net_bridge_mcast_port,
+ ip6_rlist);
+ else
+#endif
+ mctx = hlist_entry_safe(rp, struct net_bridge_mcast_port,
+ ip4_rlist);
+
+ if (mctx)
+ return mctx->port;
+ else
+ return NULL;
+}
+
+static inline bool br_ip4_multicast_is_router(struct net_bridge_mcast *brmctx)
+{
+ return timer_pending(&brmctx->ip4_mc_router_timer);
+}
+
+static inline bool br_ip6_multicast_is_router(struct net_bridge_mcast *brmctx)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ return timer_pending(&brmctx->ip6_mc_router_timer);
+#else
+ return false;
+#endif
+}
+
+static inline bool
+br_multicast_is_router(struct net_bridge_mcast *brmctx, struct sk_buff *skb)
+{
+ switch (brmctx->multicast_router) {
+ case MDB_RTR_TYPE_PERM:
+ return true;
+ case MDB_RTR_TYPE_TEMP_QUERY:
+ if (skb) {
+ if (skb->protocol == htons(ETH_P_IP))
+ return br_ip4_multicast_is_router(brmctx);
+ else if (skb->protocol == htons(ETH_P_IPV6))
+ return br_ip6_multicast_is_router(brmctx);
+ } else {
+ return br_ip4_multicast_is_router(brmctx) ||
+ br_ip6_multicast_is_router(brmctx);
+ }
+ fallthrough;
+ default:
+ return false;
+ }
+}
+
+static inline bool
+__br_multicast_querier_exists(struct net_bridge_mcast *brmctx,
+ struct bridge_mcast_other_query *querier,
+ const bool is_ipv6)
+{
+ bool own_querier_enabled;
+
+ if (brmctx->multicast_querier) {
+ if (is_ipv6 && !br_opt_get(brmctx->br, BROPT_HAS_IPV6_ADDR))
+ own_querier_enabled = false;
+ else
+ own_querier_enabled = true;
+ } else {
+ own_querier_enabled = false;
+ }
+
+ return !timer_pending(&querier->delay_timer) &&
+ (own_querier_enabled || timer_pending(&querier->timer));
+}
+
+static inline bool br_multicast_querier_exists(struct net_bridge_mcast *brmctx,
+ struct ethhdr *eth,
+ const struct net_bridge_mdb_entry *mdb)
+{
+ switch (eth->h_proto) {
+ case (htons(ETH_P_IP)):
+ return __br_multicast_querier_exists(brmctx,
+ &brmctx->ip4_other_query, false);
+#if IS_ENABLED(CONFIG_IPV6)
+ case (htons(ETH_P_IPV6)):
+ return __br_multicast_querier_exists(brmctx,
+ &brmctx->ip6_other_query, true);
+#endif
+ default:
+ return !!mdb && br_group_is_l2(&mdb->addr);
+ }
+}
+
+static inline bool br_multicast_is_star_g(const struct br_ip *ip)
+{
+ switch (ip->proto) {
+ case htons(ETH_P_IP):
+ return ipv4_is_zeronet(ip->src.ip4);
+#if IS_ENABLED(CONFIG_IPV6)
+ case htons(ETH_P_IPV6):
+ return ipv6_addr_any(&ip->src.ip6);
+#endif
+ default:
+ return false;
+ }
+}
+
+static inline bool
+br_multicast_should_handle_mode(const struct net_bridge_mcast *brmctx,
+ __be16 proto)
+{
+ switch (proto) {
+ case htons(ETH_P_IP):
+ return !!(brmctx->multicast_igmp_version == 3);
+#if IS_ENABLED(CONFIG_IPV6)
+ case htons(ETH_P_IPV6):
+ return !!(brmctx->multicast_mld_version == 2);
+#endif
+ default:
+ return false;
+ }
+}
+
+static inline int br_multicast_igmp_type(const struct sk_buff *skb)
+{
+ return BR_INPUT_SKB_CB(skb)->igmp;
+}
+
+static inline unsigned long br_multicast_lmqt(const struct net_bridge_mcast *brmctx)
+{
+ return brmctx->multicast_last_member_interval *
+ brmctx->multicast_last_member_count;
+}
+
+static inline unsigned long br_multicast_gmi(const struct net_bridge_mcast *brmctx)
+{
+ return brmctx->multicast_membership_interval;
+}
+
+static inline bool
+br_multicast_ctx_is_vlan(const struct net_bridge_mcast *brmctx)
+{
+ return !!brmctx->vlan;
+}
+
+static inline bool
+br_multicast_port_ctx_is_vlan(const struct net_bridge_mcast_port *pmctx)
+{
+ return !!pmctx->vlan;
+}
+
+static inline struct net_bridge_mcast *
+br_multicast_port_ctx_get_global(const struct net_bridge_mcast_port *pmctx)
+{
+ if (!br_multicast_port_ctx_is_vlan(pmctx))
+ return &pmctx->port->br->multicast_ctx;
+ else
+ return &pmctx->vlan->brvlan->br_mcast_ctx;
+}
+
+static inline bool
+br_multicast_ctx_vlan_global_disabled(const struct net_bridge_mcast *brmctx)
+{
+ return br_multicast_ctx_is_vlan(brmctx) &&
+ (!br_opt_get(brmctx->br, BROPT_MCAST_VLAN_SNOOPING_ENABLED) ||
+ !(brmctx->vlan->priv_flags & BR_VLFLAG_GLOBAL_MCAST_ENABLED));
+}
+
+static inline bool
+br_multicast_ctx_vlan_disabled(const struct net_bridge_mcast *brmctx)
+{
+ return br_multicast_ctx_is_vlan(brmctx) &&
+ !(brmctx->vlan->priv_flags & BR_VLFLAG_MCAST_ENABLED);
+}
+
+static inline bool
+br_multicast_port_ctx_vlan_disabled(const struct net_bridge_mcast_port *pmctx)
+{
+ return br_multicast_port_ctx_is_vlan(pmctx) &&
+ !(pmctx->vlan->priv_flags & BR_VLFLAG_MCAST_ENABLED);
+}
+
+static inline bool
+br_multicast_port_ctx_state_disabled(const struct net_bridge_mcast_port *pmctx)
+{
+ return pmctx->port->state == BR_STATE_DISABLED ||
+ (br_multicast_port_ctx_is_vlan(pmctx) &&
+ (br_multicast_port_ctx_vlan_disabled(pmctx) ||
+ pmctx->vlan->state == BR_STATE_DISABLED));
+}
+
+static inline bool
+br_multicast_port_ctx_state_stopped(const struct net_bridge_mcast_port *pmctx)
+{
+ return br_multicast_port_ctx_state_disabled(pmctx) ||
+ pmctx->port->state == BR_STATE_BLOCKING ||
+ (br_multicast_port_ctx_is_vlan(pmctx) &&
+ pmctx->vlan->state == BR_STATE_BLOCKING);
+}
+
+static inline bool
+br_rports_have_mc_router(const struct net_bridge_mcast *brmctx)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ return !hlist_empty(&brmctx->ip4_mc_router_list) ||
+ !hlist_empty(&brmctx->ip6_mc_router_list);
+#else
+ return !hlist_empty(&brmctx->ip4_mc_router_list);
+#endif
+}
+
+static inline bool
+br_multicast_ctx_options_equal(const struct net_bridge_mcast *brmctx1,
+ const struct net_bridge_mcast *brmctx2)
+{
+ return brmctx1->multicast_igmp_version ==
+ brmctx2->multicast_igmp_version &&
+ brmctx1->multicast_last_member_count ==
+ brmctx2->multicast_last_member_count &&
+ brmctx1->multicast_startup_query_count ==
+ brmctx2->multicast_startup_query_count &&
+ brmctx1->multicast_last_member_interval ==
+ brmctx2->multicast_last_member_interval &&
+ brmctx1->multicast_membership_interval ==
+ brmctx2->multicast_membership_interval &&
+ brmctx1->multicast_querier_interval ==
+ brmctx2->multicast_querier_interval &&
+ brmctx1->multicast_query_interval ==
+ brmctx2->multicast_query_interval &&
+ brmctx1->multicast_query_response_interval ==
+ brmctx2->multicast_query_response_interval &&
+ brmctx1->multicast_startup_query_interval ==
+ brmctx2->multicast_startup_query_interval &&
+ brmctx1->multicast_querier == brmctx2->multicast_querier &&
+ brmctx1->multicast_router == brmctx2->multicast_router &&
+ !br_rports_have_mc_router(brmctx1) &&
+ !br_rports_have_mc_router(brmctx2) &&
+#if IS_ENABLED(CONFIG_IPV6)
+ brmctx1->multicast_mld_version ==
+ brmctx2->multicast_mld_version &&
+#endif
+ true;
+}
+
+static inline bool
+br_multicast_ctx_matches_vlan_snooping(const struct net_bridge_mcast *brmctx)
+{
+ bool vlan_snooping_enabled;
+
+ vlan_snooping_enabled = !!br_opt_get(brmctx->br,
+ BROPT_MCAST_VLAN_SNOOPING_ENABLED);
+
+ return !!(vlan_snooping_enabled == br_multicast_ctx_is_vlan(brmctx));
+}
+
+static inline void
+br_multicast_set_pg_offload_flags(struct net_bridge_port_group *p,
+ bool offloaded)
+{
+ p->flags &= ~(MDB_PG_FLAGS_OFFLOAD | MDB_PG_FLAGS_OFFLOAD_FAILED);
+ p->flags |= (offloaded ? MDB_PG_FLAGS_OFFLOAD :
+ MDB_PG_FLAGS_OFFLOAD_FAILED);
+}
+
+static inline bool
+br_mdb_should_notify(const struct net_bridge *br, u8 changed_flags)
+{
+ return br_opt_get(br, BROPT_MDB_OFFLOAD_FAIL_NOTIFICATION) &&
+ (changed_flags & MDB_PG_FLAGS_OFFLOAD_FAILED);
+}
+#else
+static inline int br_multicast_rcv(struct net_bridge_mcast **brmctx,
+ struct net_bridge_mcast_port **pmctx,
+ struct net_bridge_vlan *vlan,
+ struct sk_buff *skb,
+ u16 vid)
+{
+ return 0;
+}
+
+static inline struct net_bridge_mdb_entry *
+br_mdb_entry_skb_get(struct net_bridge_mcast *brmctx, struct sk_buff *skb,
+ u16 vid)
+{
+ return NULL;
+}
+
+static inline int br_multicast_add_port(struct net_bridge_port *port)
+{
+ return 0;
+}
+
+static inline void br_multicast_del_port(struct net_bridge_port *port)
+{
+}
+
+static inline void br_multicast_enable_port(struct net_bridge_port *port)
+{
+}
+
+static inline void br_multicast_disable_port(struct net_bridge_port *port)
+{
+}
+
+static inline void br_multicast_init(struct net_bridge *br)
+{
+}
+
+static inline void br_multicast_join_snoopers(struct net_bridge *br)
+{
+}
+
+static inline void br_multicast_leave_snoopers(struct net_bridge *br)
+{
+}
+
+static inline void br_multicast_open(struct net_bridge *br)
+{
+}
+
+static inline void br_multicast_stop(struct net_bridge *br)
+{
+}
+
+static inline void br_multicast_dev_del(struct net_bridge *br)
+{
+}
+
+static inline void br_multicast_flood(struct net_bridge_mdb_entry *mdst,
+ struct sk_buff *skb,
+ struct net_bridge_mcast *brmctx,
+ bool local_rcv, bool local_orig)
+{
+}
+
+static inline bool br_multicast_is_router(struct net_bridge_mcast *brmctx,
+ struct sk_buff *skb)
+{
+ return false;
+}
+
+static inline bool br_multicast_querier_exists(struct net_bridge_mcast *brmctx,
+ struct ethhdr *eth,
+ const struct net_bridge_mdb_entry *mdb)
+{
+ return false;
+}
+
+static inline int br_mdb_add(struct net_device *dev, struct nlattr *tb[],
+ u16 nlmsg_flags, struct netlink_ext_ack *extack)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int br_mdb_del(struct net_device *dev, struct nlattr *tb[],
+ struct netlink_ext_ack *extack)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int br_mdb_del_bulk(struct net_device *dev, struct nlattr *tb[],
+ struct netlink_ext_ack *extack)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int br_mdb_dump(struct net_device *dev, struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ return 0;
+}
+
+static inline int br_mdb_get(struct net_device *dev, struct nlattr *tb[],
+ u32 portid, u32 seq,
+ struct netlink_ext_ack *extack)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int br_mdb_hash_init(struct net_bridge *br)
+{
+ return 0;
+}
+
+static inline void br_mdb_hash_fini(struct net_bridge *br)
+{
+}
+
+static inline void br_multicast_count(struct net_bridge *br,
+ const struct net_bridge_port *p,
+ const struct sk_buff *skb,
+ u8 type, u8 dir)
+{
+}
+
+static inline int br_multicast_init_stats(struct net_bridge *br)
+{
+ return 0;
+}
+
+static inline void br_multicast_uninit_stats(struct net_bridge *br)
+{
+}
+
+static inline int br_multicast_igmp_type(const struct sk_buff *skb)
+{
+ return 0;
+}
+
+static inline void br_multicast_ctx_init(struct net_bridge *br,
+ struct net_bridge_vlan *vlan,
+ struct net_bridge_mcast *brmctx)
+{
+}
+
+static inline void br_multicast_ctx_deinit(struct net_bridge_mcast *brmctx)
+{
+}
+
+static inline void br_multicast_port_ctx_init(struct net_bridge_port *port,
+ struct net_bridge_vlan *vlan,
+ struct net_bridge_mcast_port *pmctx)
+{
+}
+
+static inline void br_multicast_port_ctx_deinit(struct net_bridge_mcast_port *pmctx)
+{
+}
+
+static inline void br_multicast_update_vlan_mcast_ctx(struct net_bridge_vlan *v,
+ u8 state)
+{
+}
+
+static inline void br_multicast_toggle_one_vlan(struct net_bridge_vlan *vlan,
+ bool on)
+{
+}
+
+static inline int br_multicast_toggle_vlan_snooping(struct net_bridge *br,
+ bool on,
+ struct netlink_ext_ack *extack)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline bool br_multicast_toggle_global_vlan(struct net_bridge_vlan *vlan,
+ bool on)
+{
+ return false;
+}
+
+static inline bool
+br_multicast_ctx_options_equal(const struct net_bridge_mcast *brmctx1,
+ const struct net_bridge_mcast *brmctx2)
+{
+ return true;
+}
+#endif
+
+/* br_vlan.c */
+#ifdef CONFIG_BRIDGE_VLAN_FILTERING
+bool br_allowed_ingress(const struct net_bridge *br,
+ struct net_bridge_vlan_group *vg, struct sk_buff *skb,
+ u16 *vid, u8 *state,
+ struct net_bridge_vlan **vlan);
+bool br_allowed_egress(struct net_bridge_vlan_group *vg,
+ const struct sk_buff *skb);
+bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid);
+struct sk_buff *br_handle_vlan(struct net_bridge *br,
+ const struct net_bridge_port *port,
+ struct net_bridge_vlan_group *vg,
+ struct sk_buff *skb);
+int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags,
+ bool *changed, struct netlink_ext_ack *extack);
+int br_vlan_delete(struct net_bridge *br, u16 vid);
+void br_vlan_flush(struct net_bridge *br);
+struct net_bridge_vlan *br_vlan_find(struct net_bridge_vlan_group *vg, u16 vid);
+void br_recalculate_fwd_mask(struct net_bridge *br);
+int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack);
+int __br_vlan_set_proto(struct net_bridge *br, __be16 proto,
+ struct netlink_ext_ack *extack);
+int br_vlan_set_proto(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack);
+int br_vlan_set_stats(struct net_bridge *br, unsigned long val);
+int br_vlan_set_stats_per_port(struct net_bridge *br, unsigned long val);
+int br_vlan_init(struct net_bridge *br);
+int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack);
+int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid,
+ struct netlink_ext_ack *extack);
+int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags,
+ bool *changed, struct netlink_ext_ack *extack);
+int nbp_vlan_delete(struct net_bridge_port *port, u16 vid);
+void nbp_vlan_flush(struct net_bridge_port *port);
+int nbp_vlan_init(struct net_bridge_port *port, struct netlink_ext_ack *extack);
+int nbp_get_num_vlan_infos(struct net_bridge_port *p, u32 filter_mask);
+void br_vlan_get_stats(const struct net_bridge_vlan *v,
+ struct pcpu_sw_netstats *stats);
+void br_vlan_port_event(struct net_bridge_port *p, unsigned long event);
+int br_vlan_bridge_event(struct net_device *dev, unsigned long event,
+ void *ptr);
+void br_vlan_vlan_upper_event(struct net_device *br_dev,
+ struct net_device *vlan_dev,
+ unsigned long event);
+int br_vlan_rtnl_init(void);
+void br_vlan_rtnl_uninit(void);
+void br_vlan_notify(const struct net_bridge *br,
+ const struct net_bridge_port *p,
+ u16 vid, u16 vid_range,
+ int cmd);
+bool br_vlan_can_enter_range(const struct net_bridge_vlan *v_curr,
+ const struct net_bridge_vlan *range_end);
+
+void br_vlan_fill_forward_path_pvid(struct net_bridge *br,
+ struct net_device_path_ctx *ctx,
+ struct net_device_path *path);
+int br_vlan_fill_forward_path_mode(struct net_bridge *br,
+ struct net_bridge_port *dst,
+ struct net_device_path *path);
+
+static inline struct net_bridge_vlan_group *br_vlan_group(
+ const struct net_bridge *br)
+{
+ return rtnl_dereference(br->vlgrp);
+}
+
+static inline struct net_bridge_vlan_group *nbp_vlan_group(
+ const struct net_bridge_port *p)
+{
+ return rtnl_dereference(p->vlgrp);
+}
+
+static inline struct net_bridge_vlan_group *br_vlan_group_rcu(
+ const struct net_bridge *br)
+{
+ return rcu_dereference(br->vlgrp);
+}
+
+static inline struct net_bridge_vlan_group *nbp_vlan_group_rcu(
+ const struct net_bridge_port *p)
+{
+ return rcu_dereference(p->vlgrp);
+}
+
+/* Since bridge now depends on 8021Q module, but the time bridge sees the
+ * skb, the vlan tag will always be present if the frame was tagged.
+ */
+static inline int br_vlan_get_tag(const struct sk_buff *skb, u16 *vid)
+{
+ int err = 0;
+
+ if (skb_vlan_tag_present(skb)) {
+ *vid = skb_vlan_tag_get_id(skb);
+ } else {
+ *vid = 0;
+ err = -EINVAL;
+ }
+
+ return err;
+}
+
+static inline u16 br_get_pvid(const struct net_bridge_vlan_group *vg)
+{
+ if (!vg)
+ return 0;
+
+ smp_rmb();
+ return vg->pvid;
+}
+
+static inline u16 br_vlan_flags(const struct net_bridge_vlan *v, u16 pvid)
+{
+ return v->vid == pvid ? v->flags | BRIDGE_VLAN_INFO_PVID : v->flags;
+}
+#else
+static inline bool br_allowed_ingress(const struct net_bridge *br,
+ struct net_bridge_vlan_group *vg,
+ struct sk_buff *skb,
+ u16 *vid, u8 *state,
+ struct net_bridge_vlan **vlan)
+
+{
+ *vlan = NULL;
+ return true;
+}
+
+static inline bool br_allowed_egress(struct net_bridge_vlan_group *vg,
+ const struct sk_buff *skb)
+{
+ return true;
+}
+
+static inline bool br_should_learn(struct net_bridge_port *p,
+ struct sk_buff *skb, u16 *vid)
+{
+ return true;
+}
+
+static inline struct sk_buff *br_handle_vlan(struct net_bridge *br,
+ const struct net_bridge_port *port,
+ struct net_bridge_vlan_group *vg,
+ struct sk_buff *skb)
+{
+ return skb;
+}
+
+static inline int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags,
+ bool *changed, struct netlink_ext_ack *extack)
+{
+ *changed = false;
+ return -EOPNOTSUPP;
+}
+
+static inline int br_vlan_delete(struct net_bridge *br, u16 vid)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void br_vlan_flush(struct net_bridge *br)
+{
+}
+
+static inline void br_recalculate_fwd_mask(struct net_bridge *br)
+{
+}
+
+static inline int br_vlan_init(struct net_bridge *br)
+{
+ return 0;
+}
+
+static inline int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags,
+ bool *changed, struct netlink_ext_ack *extack)
+{
+ *changed = false;
+ return -EOPNOTSUPP;
+}
+
+static inline int nbp_vlan_delete(struct net_bridge_port *port, u16 vid)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void nbp_vlan_flush(struct net_bridge_port *port)
+{
+}
+
+static inline struct net_bridge_vlan *br_vlan_find(struct net_bridge_vlan_group *vg,
+ u16 vid)
+{
+ return NULL;
+}
+
+static inline int nbp_vlan_init(struct net_bridge_port *port,
+ struct netlink_ext_ack *extack)
+{
+ return 0;
+}
+
+static inline u16 br_vlan_get_tag(const struct sk_buff *skb, u16 *tag)
+{
+ return 0;
+}
+
+static inline u16 br_get_pvid(const struct net_bridge_vlan_group *vg)
+{
+ return 0;
+}
+
+static inline int br_vlan_filter_toggle(struct net_bridge *br,
+ unsigned long val,
+ struct netlink_ext_ack *extack)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int nbp_get_num_vlan_infos(struct net_bridge_port *p,
+ u32 filter_mask)
+{
+ return 0;
+}
+
+static inline void br_vlan_fill_forward_path_pvid(struct net_bridge *br,
+ struct net_device_path_ctx *ctx,
+ struct net_device_path *path)
+{
+}
+
+static inline int br_vlan_fill_forward_path_mode(struct net_bridge *br,
+ struct net_bridge_port *dst,
+ struct net_device_path *path)
+{
+ return 0;
+}
+
+static inline struct net_bridge_vlan_group *br_vlan_group(
+ const struct net_bridge *br)
+{
+ return NULL;
+}
+
+static inline struct net_bridge_vlan_group *nbp_vlan_group(
+ const struct net_bridge_port *p)
+{
+ return NULL;
+}
+
+static inline struct net_bridge_vlan_group *br_vlan_group_rcu(
+ const struct net_bridge *br)
+{
+ return NULL;
+}
+
+static inline struct net_bridge_vlan_group *nbp_vlan_group_rcu(
+ const struct net_bridge_port *p)
+{
+ return NULL;
+}
+
+static inline void br_vlan_get_stats(const struct net_bridge_vlan *v,
+ struct pcpu_sw_netstats *stats)
+{
+}
+
+static inline void br_vlan_port_event(struct net_bridge_port *p,
+ unsigned long event)
+{
+}
+
+static inline int br_vlan_bridge_event(struct net_device *dev,
+ unsigned long event, void *ptr)
+{
+ return 0;
+}
+
+static inline void br_vlan_vlan_upper_event(struct net_device *br_dev,
+ struct net_device *vlan_dev,
+ unsigned long event)
+{
+}
+
+static inline int br_vlan_rtnl_init(void)
+{
+ return 0;
+}
+
+static inline void br_vlan_rtnl_uninit(void)
+{
+}
+
+static inline void br_vlan_notify(const struct net_bridge *br,
+ const struct net_bridge_port *p,
+ u16 vid, u16 vid_range,
+ int cmd)
+{
+}
+
+static inline bool br_vlan_can_enter_range(const struct net_bridge_vlan *v_curr,
+ const struct net_bridge_vlan *range_end)
+{
+ return true;
+}
+
+static inline u16 br_vlan_flags(const struct net_bridge_vlan *v, u16 pvid)
+{
+ return 0;
+}
+
+#endif
+
+/* br_vlan_options.c */
+#ifdef CONFIG_BRIDGE_VLAN_FILTERING
+bool br_vlan_opts_eq_range(const struct net_bridge_vlan *v_curr,
+ const struct net_bridge_vlan *range_end);
+bool br_vlan_opts_fill(struct sk_buff *skb, const struct net_bridge_vlan *v,
+ const struct net_bridge_port *p);
+size_t br_vlan_opts_nl_size(void);
+int br_vlan_process_options(const struct net_bridge *br,
+ const struct net_bridge_port *p,
+ struct net_bridge_vlan *range_start,
+ struct net_bridge_vlan *range_end,
+ struct nlattr **tb,
+ struct netlink_ext_ack *extack);
+int br_vlan_rtm_process_global_options(struct net_device *dev,
+ const struct nlattr *attr,
+ int cmd,
+ struct netlink_ext_ack *extack);
+bool br_vlan_global_opts_can_enter_range(const struct net_bridge_vlan *v_curr,
+ const struct net_bridge_vlan *r_end);
+bool br_vlan_global_opts_fill(struct sk_buff *skb, u16 vid, u16 vid_range,
+ const struct net_bridge_vlan *v_opts);
+
+/* vlan state manipulation helpers using *_ONCE to annotate lock-free access,
+ * while br_vlan_set_state() may access data protected by multicast_lock.
+ */
+static inline u8 br_vlan_get_state(const struct net_bridge_vlan *v)
+{
+ return READ_ONCE(v->state);
+}
+
+static inline void br_vlan_set_state(struct net_bridge_vlan *v, u8 state)
+{
+ WRITE_ONCE(v->state, state);
+ br_multicast_update_vlan_mcast_ctx(v, state);
+}
+
+static inline u8 br_vlan_get_pvid_state(const struct net_bridge_vlan_group *vg)
+{
+ return READ_ONCE(vg->pvid_state);
+}
+
+static inline void br_vlan_set_pvid_state(struct net_bridge_vlan_group *vg,
+ u8 state)
+{
+ WRITE_ONCE(vg->pvid_state, state);
+}
+
+/* learn_allow is true at ingress and false at egress */
+static inline bool br_vlan_state_allowed(u8 state, bool learn_allow)
+{
+ switch (state) {
+ case BR_STATE_LEARNING:
+ return learn_allow;
+ case BR_STATE_FORWARDING:
+ return true;
+ default:
+ return false;
+ }
+}
+#endif
+
+/* br_mst.c */
+#ifdef CONFIG_BRIDGE_VLAN_FILTERING
+DECLARE_STATIC_KEY_FALSE(br_mst_used);
+static inline bool br_mst_is_enabled(const struct net_bridge_port *p)
+{
+ /* check the port's vlan group to avoid racing with port deletion */
+ return static_branch_unlikely(&br_mst_used) &&
+ br_opt_get(p->br, BROPT_MST_ENABLED) &&
+ rcu_access_pointer(p->vlgrp);
+}
+
+int br_mst_set_state(struct net_bridge_port *p, u16 msti, u8 state,
+ struct netlink_ext_ack *extack);
+int br_mst_vlan_set_msti(struct net_bridge_vlan *v, u16 msti);
+void br_mst_vlan_init_state(struct net_bridge_vlan *v);
+int br_mst_set_enabled(struct net_bridge *br, bool on,
+ struct netlink_ext_ack *extack);
+size_t br_mst_info_size(const struct net_bridge_vlan_group *vg);
+int br_mst_fill_info(struct sk_buff *skb,
+ const struct net_bridge_vlan_group *vg);
+int br_mst_process(struct net_bridge_port *p, const struct nlattr *mst_attr,
+ struct netlink_ext_ack *extack);
+void br_mst_uninit(struct net_bridge *br);
+#else
+static inline bool br_mst_is_enabled(const struct net_bridge_port *p)
+{
+ return false;
+}
+
+static inline int br_mst_set_state(struct net_bridge_port *p, u16 msti,
+ u8 state, struct netlink_ext_ack *extack)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int br_mst_set_enabled(struct net_bridge *br, bool on,
+ struct netlink_ext_ack *extack)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline size_t br_mst_info_size(const struct net_bridge_vlan_group *vg)
+{
+ return 0;
+}
+
+static inline int br_mst_fill_info(struct sk_buff *skb,
+ const struct net_bridge_vlan_group *vg)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int br_mst_process(struct net_bridge_port *p,
+ const struct nlattr *mst_attr,
+ struct netlink_ext_ack *extack)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void br_mst_uninit(struct net_bridge *br)
+{
+}
+#endif
+
+struct nf_br_ops {
+ int (*br_dev_xmit_hook)(struct sk_buff *skb);
+};
+extern const struct nf_br_ops __rcu *nf_br_ops;
/* br_netfilter.c */
-#ifdef CONFIG_BRIDGE_NETFILTER
-extern int br_netfilter_init(void);
-extern void br_netfilter_fini(void);
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+int br_nf_core_init(void);
+void br_nf_core_fini(void);
+void br_netfilter_rtable_init(struct net_bridge *);
#else
-#define br_netfilter_init() (0)
-#define br_netfilter_fini() do { } while(0)
+static inline int br_nf_core_init(void) { return 0; }
+static inline void br_nf_core_fini(void) {}
+#define br_netfilter_rtable_init(x)
#endif
/* br_stp.c */
-extern void br_log_state(const struct net_bridge_port *p);
-extern struct net_bridge_port *br_get_port(struct net_bridge *br,
- u16 port_no);
-extern void br_init_port(struct net_bridge_port *p);
-extern void br_become_designated_port(struct net_bridge_port *p);
+void br_set_state(struct net_bridge_port *p, unsigned int state);
+struct net_bridge_port *br_get_port(struct net_bridge *br, u16 port_no);
+void br_init_port(struct net_bridge_port *p);
+void br_become_designated_port(struct net_bridge_port *p);
+
+void __br_set_forward_delay(struct net_bridge *br, unsigned long t);
+int br_set_forward_delay(struct net_bridge *br, unsigned long x);
+int br_set_hello_time(struct net_bridge *br, unsigned long x);
+int br_set_max_age(struct net_bridge *br, unsigned long x);
+int __set_ageing_time(struct net_device *dev, unsigned long t);
+int br_set_ageing_time(struct net_bridge *br, clock_t ageing_time);
+
/* br_stp_if.c */
-extern void br_stp_enable_bridge(struct net_bridge *br);
-extern void br_stp_disable_bridge(struct net_bridge *br);
-extern void br_stp_enable_port(struct net_bridge_port *p);
-extern void br_stp_disable_port(struct net_bridge_port *p);
-extern void br_stp_recalculate_bridge_id(struct net_bridge *br);
-extern void br_stp_change_bridge_id(struct net_bridge *br, const unsigned char *a);
-extern void br_stp_set_bridge_priority(struct net_bridge *br,
- u16 newprio);
-extern void br_stp_set_port_priority(struct net_bridge_port *p,
- u8 newprio);
-extern void br_stp_set_path_cost(struct net_bridge_port *p,
- u32 path_cost);
-extern ssize_t br_show_bridge_id(char *buf, const struct bridge_id *id);
+void br_stp_enable_bridge(struct net_bridge *br);
+void br_stp_disable_bridge(struct net_bridge *br);
+int br_stp_set_enabled(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack);
+void br_stp_enable_port(struct net_bridge_port *p);
+void br_stp_disable_port(struct net_bridge_port *p);
+bool br_stp_recalculate_bridge_id(struct net_bridge *br);
+void br_stp_change_bridge_id(struct net_bridge *br, const unsigned char *a);
+void br_stp_set_bridge_priority(struct net_bridge *br, u16 newprio);
+int br_stp_set_port_priority(struct net_bridge_port *p, unsigned long newprio);
+int br_stp_set_path_cost(struct net_bridge_port *p, unsigned long path_cost);
+ssize_t br_show_bridge_id(char *buf, const struct bridge_id *id);
/* br_stp_bpdu.c */
-extern int br_stp_rcv(struct sk_buff *skb, struct net_device *dev,
- struct packet_type *pt, struct net_device *orig_dev);
+struct stp_proto;
+void br_stp_rcv(const struct stp_proto *proto, struct sk_buff *skb,
+ struct net_device *dev);
/* br_stp_timer.c */
-extern void br_stp_timer_init(struct net_bridge *br);
-extern void br_stp_port_timer_init(struct net_bridge_port *p);
-extern unsigned long br_timer_value(const struct timer_list *timer);
+void br_stp_timer_init(struct net_bridge *br);
+void br_stp_port_timer_init(struct net_bridge_port *p);
+unsigned long br_timer_value(const struct timer_list *timer);
/* br.c */
-extern struct net_bridge_fdb_entry *(*br_fdb_get_hook)(struct net_bridge *br,
- unsigned char *addr);
-extern void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent);
+#if IS_ENABLED(CONFIG_ATM_LANE)
+extern int (*br_fdb_test_addr_hook)(struct net_device *dev, unsigned char *addr);
+#endif
+/* br_mrp.c */
+#if IS_ENABLED(CONFIG_BRIDGE_MRP)
+int br_mrp_parse(struct net_bridge *br, struct net_bridge_port *p,
+ struct nlattr *attr, int cmd, struct netlink_ext_ack *extack);
+bool br_mrp_enabled(struct net_bridge *br);
+void br_mrp_port_del(struct net_bridge *br, struct net_bridge_port *p);
+int br_mrp_fill_info(struct sk_buff *skb, struct net_bridge *br);
+#else
+static inline int br_mrp_parse(struct net_bridge *br, struct net_bridge_port *p,
+ struct nlattr *attr, int cmd,
+ struct netlink_ext_ack *extack)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline bool br_mrp_enabled(struct net_bridge *br)
+{
+ return false;
+}
+
+static inline void br_mrp_port_del(struct net_bridge *br,
+ struct net_bridge_port *p)
+{
+}
+
+static inline int br_mrp_fill_info(struct sk_buff *skb, struct net_bridge *br)
+{
+ return 0;
+}
+
+#endif
+
+/* br_cfm.c */
+#if IS_ENABLED(CONFIG_BRIDGE_CFM)
+int br_cfm_parse(struct net_bridge *br, struct net_bridge_port *p,
+ struct nlattr *attr, int cmd, struct netlink_ext_ack *extack);
+bool br_cfm_created(struct net_bridge *br);
+void br_cfm_port_del(struct net_bridge *br, struct net_bridge_port *p);
+int br_cfm_config_fill_info(struct sk_buff *skb, struct net_bridge *br);
+int br_cfm_status_fill_info(struct sk_buff *skb,
+ struct net_bridge *br,
+ bool getlink);
+int br_cfm_mep_count(struct net_bridge *br, u32 *count);
+int br_cfm_peer_mep_count(struct net_bridge *br, u32 *count);
+#else
+static inline int br_cfm_parse(struct net_bridge *br, struct net_bridge_port *p,
+ struct nlattr *attr, int cmd,
+ struct netlink_ext_ack *extack)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline bool br_cfm_created(struct net_bridge *br)
+{
+ return false;
+}
+
+static inline void br_cfm_port_del(struct net_bridge *br,
+ struct net_bridge_port *p)
+{
+}
+
+static inline int br_cfm_config_fill_info(struct sk_buff *skb, struct net_bridge *br)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int br_cfm_status_fill_info(struct sk_buff *skb,
+ struct net_bridge *br,
+ bool getlink)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int br_cfm_mep_count(struct net_bridge *br, u32 *count)
+{
+ *count = 0;
+ return -EOPNOTSUPP;
+}
+
+static inline int br_cfm_peer_mep_count(struct net_bridge *br, u32 *count)
+{
+ *count = 0;
+ return -EOPNOTSUPP;
+}
+#endif
/* br_netlink.c */
-extern void br_netlink_init(void);
-extern void br_netlink_fini(void);
-extern void br_ifinfo_notify(int event, struct net_bridge_port *port);
+extern struct rtnl_link_ops br_link_ops;
+int br_netlink_init(void);
+void br_netlink_fini(void);
+void br_ifinfo_notify(int event, const struct net_bridge *br,
+ const struct net_bridge_port *port);
+void br_info_notify(int event, const struct net_bridge *br,
+ const struct net_bridge_port *port, u32 filter);
+int br_setlink(struct net_device *dev, struct nlmsghdr *nlmsg, u16 flags,
+ struct netlink_ext_ack *extack);
+int br_dellink(struct net_device *dev, struct nlmsghdr *nlmsg, u16 flags);
+int br_getlink(struct sk_buff *skb, u32 pid, u32 seq, struct net_device *dev,
+ u32 filter_mask, int nlflags);
+int br_process_vlan_info(struct net_bridge *br,
+ struct net_bridge_port *p, int cmd,
+ struct bridge_vlan_info *vinfo_curr,
+ struct bridge_vlan_info **vinfo_last,
+ bool *changed,
+ struct netlink_ext_ack *extack);
#ifdef CONFIG_SYSFS
/* br_sysfs_if.c */
-extern struct sysfs_ops brport_sysfs_ops;
-extern int br_sysfs_addif(struct net_bridge_port *p);
+extern const struct sysfs_ops brport_sysfs_ops;
+int br_sysfs_addif(struct net_bridge_port *p);
+int br_sysfs_renameif(struct net_bridge_port *p);
/* br_sysfs_br.c */
-extern int br_sysfs_addbr(struct net_device *dev);
-extern void br_sysfs_delbr(struct net_device *dev);
+int br_sysfs_addbr(struct net_device *dev);
+void br_sysfs_delbr(struct net_device *dev);
#else
-#define br_sysfs_addif(p) (0)
-#define br_sysfs_addbr(dev) (0)
-#define br_sysfs_delbr(dev) do { } while(0)
+static inline int br_sysfs_addif(struct net_bridge_port *p) { return 0; }
+static inline int br_sysfs_renameif(struct net_bridge_port *p) { return 0; }
+static inline int br_sysfs_addbr(struct net_device *dev) { return 0; }
+static inline void br_sysfs_delbr(struct net_device *dev) { return; }
#endif /* CONFIG_SYSFS */
+/* br_switchdev.c */
+#ifdef CONFIG_NET_SWITCHDEV
+int br_switchdev_port_offload(struct net_bridge_port *p,
+ struct net_device *dev, const void *ctx,
+ struct notifier_block *atomic_nb,
+ struct notifier_block *blocking_nb,
+ bool tx_fwd_offload,
+ struct netlink_ext_ack *extack);
+
+void br_switchdev_port_unoffload(struct net_bridge_port *p, const void *ctx,
+ struct notifier_block *atomic_nb,
+ struct notifier_block *blocking_nb);
+
+int br_switchdev_port_replay(struct net_bridge_port *p,
+ struct net_device *dev, const void *ctx,
+ struct notifier_block *atomic_nb,
+ struct notifier_block *blocking_nb,
+ struct netlink_ext_ack *extack);
+
+bool br_switchdev_frame_uses_tx_fwd_offload(struct sk_buff *skb);
+
+void br_switchdev_frame_set_offload_fwd_mark(struct sk_buff *skb);
+
+void nbp_switchdev_frame_mark_tx_fwd_offload(const struct net_bridge_port *p,
+ struct sk_buff *skb);
+void nbp_switchdev_frame_mark_tx_fwd_to_hwdom(const struct net_bridge_port *p,
+ struct sk_buff *skb);
+void nbp_switchdev_frame_mark(const struct net_bridge_port *p,
+ struct sk_buff *skb);
+bool nbp_switchdev_allowed_egress(const struct net_bridge_port *p,
+ const struct sk_buff *skb);
+int br_switchdev_set_port_flag(struct net_bridge_port *p,
+ unsigned long flags,
+ unsigned long mask,
+ struct netlink_ext_ack *extack);
+void br_switchdev_fdb_notify(struct net_bridge *br,
+ const struct net_bridge_fdb_entry *fdb, int type);
+void br_switchdev_mdb_notify(struct net_device *dev,
+ struct net_bridge_mdb_entry *mp,
+ struct net_bridge_port_group *pg,
+ int type);
+int br_switchdev_port_vlan_add(struct net_device *dev, u16 vid, u16 flags,
+ bool changed, struct netlink_ext_ack *extack);
+int br_switchdev_port_vlan_del(struct net_device *dev, u16 vid);
+void br_switchdev_init(struct net_bridge *br);
+
+static inline void br_switchdev_frame_unmark(struct sk_buff *skb)
+{
+ skb->offload_fwd_mark = 0;
+}
+#else
+static inline int
+br_switchdev_port_offload(struct net_bridge_port *p,
+ struct net_device *dev, const void *ctx,
+ struct notifier_block *atomic_nb,
+ struct notifier_block *blocking_nb,
+ bool tx_fwd_offload,
+ struct netlink_ext_ack *extack)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void
+br_switchdev_port_unoffload(struct net_bridge_port *p, const void *ctx,
+ struct notifier_block *atomic_nb,
+ struct notifier_block *blocking_nb)
+{
+}
+
+static inline int
+br_switchdev_port_replay(struct net_bridge_port *p,
+ struct net_device *dev, const void *ctx,
+ struct notifier_block *atomic_nb,
+ struct notifier_block *blocking_nb,
+ struct netlink_ext_ack *extack)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline bool br_switchdev_frame_uses_tx_fwd_offload(struct sk_buff *skb)
+{
+ return false;
+}
+
+static inline void br_switchdev_frame_set_offload_fwd_mark(struct sk_buff *skb)
+{
+}
+
+static inline void
+nbp_switchdev_frame_mark_tx_fwd_offload(const struct net_bridge_port *p,
+ struct sk_buff *skb)
+{
+}
+
+static inline void
+nbp_switchdev_frame_mark_tx_fwd_to_hwdom(const struct net_bridge_port *p,
+ struct sk_buff *skb)
+{
+}
+
+static inline void nbp_switchdev_frame_mark(const struct net_bridge_port *p,
+ struct sk_buff *skb)
+{
+}
+
+static inline bool nbp_switchdev_allowed_egress(const struct net_bridge_port *p,
+ const struct sk_buff *skb)
+{
+ return true;
+}
+
+static inline int br_switchdev_set_port_flag(struct net_bridge_port *p,
+ unsigned long flags,
+ unsigned long mask,
+ struct netlink_ext_ack *extack)
+{
+ return 0;
+}
+
+static inline int br_switchdev_port_vlan_add(struct net_device *dev, u16 vid,
+ u16 flags, bool changed,
+ struct netlink_ext_ack *extack)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int br_switchdev_port_vlan_del(struct net_device *dev, u16 vid)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void
+br_switchdev_fdb_notify(struct net_bridge *br,
+ const struct net_bridge_fdb_entry *fdb, int type)
+{
+}
+
+static inline void br_switchdev_mdb_notify(struct net_device *dev,
+ struct net_bridge_mdb_entry *mp,
+ struct net_bridge_port_group *pg,
+ int type)
+{
+}
+
+static inline void br_switchdev_frame_unmark(struct sk_buff *skb)
+{
+}
+
+static inline void br_switchdev_init(struct net_bridge *br)
+{
+}
+
+#endif /* CONFIG_NET_SWITCHDEV */
+
+/* br_arp_nd_proxy.c */
+void br_recalculate_neigh_suppress_enabled(struct net_bridge *br);
+void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br,
+ u16 vid, struct net_bridge_port *p);
+void br_do_suppress_nd(struct sk_buff *skb, struct net_bridge *br,
+ u16 vid, struct net_bridge_port *p, struct nd_msg *msg);
+struct nd_msg *br_is_nd_neigh_msg(const struct sk_buff *skb, struct nd_msg *m);
+bool br_is_neigh_suppress_enabled(const struct net_bridge_port *p, u16 vid);
#endif
diff --git a/net/bridge/br_private_cfm.h b/net/bridge/br_private_cfm.h
new file mode 100644
index 000000000000..a43a5e7fa2c3
--- /dev/null
+++ b/net/bridge/br_private_cfm.h
@@ -0,0 +1,147 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifndef _BR_PRIVATE_CFM_H_
+#define _BR_PRIVATE_CFM_H_
+
+#include "br_private.h"
+#include <uapi/linux/cfm_bridge.h>
+
+struct br_cfm_mep_create {
+ enum br_cfm_domain domain; /* Domain for this MEP */
+ enum br_cfm_mep_direction direction; /* Up or Down MEP direction */
+ u32 ifindex; /* Residence port */
+};
+
+int br_cfm_mep_create(struct net_bridge *br,
+ const u32 instance,
+ struct br_cfm_mep_create *const create,
+ struct netlink_ext_ack *extack);
+
+int br_cfm_mep_delete(struct net_bridge *br,
+ const u32 instance,
+ struct netlink_ext_ack *extack);
+
+struct br_cfm_mep_config {
+ u32 mdlevel;
+ u32 mepid; /* MEPID for this MEP */
+ struct mac_addr unicast_mac; /* The MEP unicast MAC */
+};
+
+int br_cfm_mep_config_set(struct net_bridge *br,
+ const u32 instance,
+ const struct br_cfm_mep_config *const config,
+ struct netlink_ext_ack *extack);
+
+struct br_cfm_maid {
+ u8 data[CFM_MAID_LENGTH];
+};
+
+struct br_cfm_cc_config {
+ /* Expected received CCM PDU MAID. */
+ struct br_cfm_maid exp_maid;
+
+ /* Expected received CCM PDU interval. */
+ /* Transmitting CCM PDU interval when CCM tx is enabled. */
+ enum br_cfm_ccm_interval exp_interval;
+
+ bool enable; /* Enable/disable CCM PDU handling */
+};
+
+int br_cfm_cc_config_set(struct net_bridge *br,
+ const u32 instance,
+ const struct br_cfm_cc_config *const config,
+ struct netlink_ext_ack *extack);
+
+int br_cfm_cc_peer_mep_add(struct net_bridge *br, const u32 instance,
+ u32 peer_mep_id,
+ struct netlink_ext_ack *extack);
+int br_cfm_cc_peer_mep_remove(struct net_bridge *br, const u32 instance,
+ u32 peer_mep_id,
+ struct netlink_ext_ack *extack);
+
+/* Transmitted CCM Remote Defect Indication status set.
+ * This RDI is inserted in transmitted CCM PDUs if CCM transmission is enabled.
+ * See br_cfm_cc_ccm_tx() with interval != BR_CFM_CCM_INTERVAL_NONE
+ */
+int br_cfm_cc_rdi_set(struct net_bridge *br, const u32 instance,
+ const bool rdi, struct netlink_ext_ack *extack);
+
+/* OAM PDU Tx information */
+struct br_cfm_cc_ccm_tx_info {
+ struct mac_addr dmac;
+ /* The CCM will be transmitted for this period in seconds.
+ * Call br_cfm_cc_ccm_tx before timeout to keep transmission alive.
+ * When period is zero any ongoing transmission will be stopped.
+ */
+ u32 period;
+
+ bool seq_no_update; /* Update Tx CCM sequence number */
+ bool if_tlv; /* Insert Interface Status TLV */
+ u8 if_tlv_value; /* Interface Status TLV value */
+ bool port_tlv; /* Insert Port Status TLV */
+ u8 port_tlv_value; /* Port Status TLV value */
+ /* Sender ID TLV ??
+ * Organization-Specific TLV ??
+ */
+};
+
+int br_cfm_cc_ccm_tx(struct net_bridge *br, const u32 instance,
+ const struct br_cfm_cc_ccm_tx_info *const tx_info,
+ struct netlink_ext_ack *extack);
+
+struct br_cfm_mep_status {
+ /* Indications that an OAM PDU has been seen. */
+ bool opcode_unexp_seen; /* RX of OAM PDU with unexpected opcode */
+ bool version_unexp_seen; /* RX of OAM PDU with unexpected version */
+ bool rx_level_low_seen; /* Rx of OAM PDU with level low */
+};
+
+struct br_cfm_cc_peer_status {
+ /* This CCM related status is based on the latest received CCM PDU. */
+ u8 port_tlv_value; /* Port Status TLV value */
+ u8 if_tlv_value; /* Interface Status TLV value */
+
+ /* CCM has not been received for 3.25 intervals */
+ u8 ccm_defect:1;
+
+ /* (RDI == 1) for last received CCM PDU */
+ u8 rdi:1;
+
+ /* Indications that a CCM PDU has been seen. */
+ u8 seen:1; /* CCM PDU received */
+ u8 tlv_seen:1; /* CCM PDU with TLV received */
+ /* CCM PDU with unexpected sequence number received */
+ u8 seq_unexp_seen:1;
+};
+
+struct br_cfm_mep {
+ /* list header of MEP instances */
+ struct hlist_node head;
+ u32 instance;
+ struct br_cfm_mep_create create;
+ struct br_cfm_mep_config config;
+ struct br_cfm_cc_config cc_config;
+ struct br_cfm_cc_ccm_tx_info cc_ccm_tx_info;
+ /* List of multiple peer MEPs */
+ struct hlist_head peer_mep_list;
+ struct net_bridge_port __rcu *b_port;
+ unsigned long ccm_tx_end;
+ struct delayed_work ccm_tx_dwork;
+ u32 ccm_tx_snumber;
+ u32 ccm_rx_snumber;
+ struct br_cfm_mep_status status;
+ bool rdi;
+ struct rcu_head rcu;
+};
+
+struct br_cfm_peer_mep {
+ struct hlist_node head;
+ struct br_cfm_mep *mep;
+ struct delayed_work ccm_rx_dwork;
+ u32 mepid;
+ struct br_cfm_cc_peer_status cc_status;
+ u32 ccm_rx_count_miss;
+ struct rcu_head rcu;
+};
+
+#endif /* _BR_PRIVATE_CFM_H_ */
diff --git a/net/bridge/br_private_mcast_eht.h b/net/bridge/br_private_mcast_eht.h
new file mode 100644
index 000000000000..adf82a05515a
--- /dev/null
+++ b/net/bridge/br_private_mcast_eht.h
@@ -0,0 +1,94 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later
+ * Copyright (c) 2020, Nikolay Aleksandrov <nikolay@nvidia.com>
+ */
+#ifndef _BR_PRIVATE_MCAST_EHT_H_
+#define _BR_PRIVATE_MCAST_EHT_H_
+
+#define BR_MCAST_DEFAULT_EHT_HOSTS_LIMIT 512
+
+union net_bridge_eht_addr {
+ __be32 ip4;
+#if IS_ENABLED(CONFIG_IPV6)
+ struct in6_addr ip6;
+#endif
+};
+
+/* single host's list of set entries and filter_mode */
+struct net_bridge_group_eht_host {
+ struct rb_node rb_node;
+
+ union net_bridge_eht_addr h_addr;
+ struct hlist_head set_entries;
+ unsigned int num_entries;
+ unsigned char filter_mode;
+ struct net_bridge_port_group *pg;
+};
+
+/* (host, src entry) added to a per-src set and host's list */
+struct net_bridge_group_eht_set_entry {
+ struct rb_node rb_node;
+ struct hlist_node host_list;
+
+ union net_bridge_eht_addr h_addr;
+ struct timer_list timer;
+ struct net_bridge *br;
+ struct net_bridge_group_eht_set *eht_set;
+ struct net_bridge_group_eht_host *h_parent;
+ struct net_bridge_mcast_gc mcast_gc;
+};
+
+/* per-src set */
+struct net_bridge_group_eht_set {
+ struct rb_node rb_node;
+
+ union net_bridge_eht_addr src_addr;
+ struct rb_root entry_tree;
+ struct timer_list timer;
+ struct net_bridge_port_group *pg;
+ struct net_bridge *br;
+ struct net_bridge_mcast_gc mcast_gc;
+};
+
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+void br_multicast_eht_clean_sets(struct net_bridge_port_group *pg);
+bool br_multicast_eht_handle(const struct net_bridge_mcast *brmctx,
+ struct net_bridge_port_group *pg,
+ void *h_addr,
+ void *srcs,
+ u32 nsrcs,
+ size_t addr_size,
+ int grec_type);
+int br_multicast_eht_set_hosts_limit(struct net_bridge_port *p,
+ u32 eht_hosts_limit);
+
+static inline bool
+br_multicast_eht_should_del_pg(const struct net_bridge_port_group *pg)
+{
+ return !!((pg->key.port->flags & BR_MULTICAST_FAST_LEAVE) &&
+ RB_EMPTY_ROOT(&pg->eht_host_tree));
+}
+
+static inline bool
+br_multicast_eht_hosts_over_limit(const struct net_bridge_port_group *pg)
+{
+ const struct net_bridge_port *p = pg->key.port;
+
+ return !!(p->multicast_eht_hosts_cnt >= p->multicast_eht_hosts_limit);
+}
+
+static inline void br_multicast_eht_hosts_inc(struct net_bridge_port_group *pg)
+{
+ struct net_bridge_port *p = pg->key.port;
+
+ p->multicast_eht_hosts_cnt++;
+}
+
+static inline void br_multicast_eht_hosts_dec(struct net_bridge_port_group *pg)
+{
+ struct net_bridge_port *p = pg->key.port;
+
+ p->multicast_eht_hosts_cnt--;
+}
+#endif /* CONFIG_BRIDGE_IGMP_SNOOPING */
+
+#endif /* _BR_PRIVATE_MCAST_EHT_H_ */
diff --git a/net/bridge/br_private_mrp.h b/net/bridge/br_private_mrp.h
new file mode 100644
index 000000000000..bda8e1896712
--- /dev/null
+++ b/net/bridge/br_private_mrp.h
@@ -0,0 +1,148 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifndef _BR_PRIVATE_MRP_H_
+#define _BR_PRIVATE_MRP_H_
+
+#include "br_private.h"
+#include <uapi/linux/mrp_bridge.h>
+
+#define MRP_OPT_PADDING 0x2
+
+struct br_mrp {
+ /* list of mrp instances */
+ struct hlist_node list;
+
+ struct net_bridge_port __rcu *p_port;
+ struct net_bridge_port __rcu *s_port;
+ struct net_bridge_port __rcu *i_port;
+
+ u32 ring_id;
+ u16 in_id;
+ u16 prio;
+
+ enum br_mrp_ring_role_type ring_role;
+ u8 ring_role_offloaded;
+ enum br_mrp_ring_state_type ring_state;
+ u32 ring_transitions;
+
+ enum br_mrp_in_role_type in_role;
+ u8 in_role_offloaded;
+ enum br_mrp_in_state_type in_state;
+ u32 in_transitions;
+
+ struct delayed_work test_work;
+ u32 test_interval;
+ unsigned long test_end;
+ u32 test_count_miss;
+ u32 test_max_miss;
+ bool test_monitor;
+
+ struct delayed_work in_test_work;
+ u32 in_test_interval;
+ unsigned long in_test_end;
+ u32 in_test_count_miss;
+ u32 in_test_max_miss;
+
+ u32 seq_id;
+
+ struct rcu_head rcu;
+};
+
+/* This type is returned by br_mrp_switchdev functions that allow to have a SW
+ * backup in case the HW can't implement completely the protocol.
+ * BR_MRP_NONE - means the HW can't run at all the protocol, so the SW stops
+ * configuring the node anymore.
+ * BR_MRP_SW - the HW can help the SW to run the protocol, by redirecting MRP
+ * frames to CPU.
+ * BR_MRP_HW - the HW can implement completely the protocol.
+ */
+enum br_mrp_hw_support {
+ BR_MRP_NONE,
+ BR_MRP_SW,
+ BR_MRP_HW,
+};
+
+/* br_mrp.c */
+int br_mrp_add(struct net_bridge *br, struct br_mrp_instance *instance);
+int br_mrp_del(struct net_bridge *br, struct br_mrp_instance *instance);
+int br_mrp_set_port_state(struct net_bridge_port *p,
+ enum br_mrp_port_state_type state);
+int br_mrp_set_port_role(struct net_bridge_port *p,
+ enum br_mrp_port_role_type role);
+int br_mrp_set_ring_state(struct net_bridge *br,
+ struct br_mrp_ring_state *state);
+int br_mrp_set_ring_role(struct net_bridge *br, struct br_mrp_ring_role *role);
+int br_mrp_start_test(struct net_bridge *br, struct br_mrp_start_test *test);
+int br_mrp_set_in_state(struct net_bridge *br, struct br_mrp_in_state *state);
+int br_mrp_set_in_role(struct net_bridge *br, struct br_mrp_in_role *role);
+int br_mrp_start_in_test(struct net_bridge *br,
+ struct br_mrp_start_in_test *test);
+
+/* br_mrp_switchdev.c */
+int br_mrp_switchdev_add(struct net_bridge *br, struct br_mrp *mrp);
+int br_mrp_switchdev_del(struct net_bridge *br, struct br_mrp *mrp);
+enum br_mrp_hw_support
+br_mrp_switchdev_set_ring_role(struct net_bridge *br, struct br_mrp *mrp,
+ enum br_mrp_ring_role_type role);
+int br_mrp_switchdev_set_ring_state(struct net_bridge *br, struct br_mrp *mrp,
+ enum br_mrp_ring_state_type state);
+enum br_mrp_hw_support
+br_mrp_switchdev_send_ring_test(struct net_bridge *br, struct br_mrp *mrp,
+ u32 interval, u8 max_miss, u32 period,
+ bool monitor);
+int br_mrp_port_switchdev_set_state(struct net_bridge_port *p, u32 state);
+int br_mrp_port_switchdev_set_role(struct net_bridge_port *p,
+ enum br_mrp_port_role_type role);
+enum br_mrp_hw_support
+br_mrp_switchdev_set_in_role(struct net_bridge *br, struct br_mrp *mrp,
+ u16 in_id, u32 ring_id,
+ enum br_mrp_in_role_type role);
+int br_mrp_switchdev_set_in_state(struct net_bridge *br, struct br_mrp *mrp,
+ enum br_mrp_in_state_type state);
+enum br_mrp_hw_support
+br_mrp_switchdev_send_in_test(struct net_bridge *br, struct br_mrp *mrp,
+ u32 interval, u8 max_miss, u32 period);
+
+/* br_mrp_netlink.c */
+int br_mrp_ring_port_open(struct net_device *dev, u8 loc);
+int br_mrp_in_port_open(struct net_device *dev, u8 loc);
+
+/* MRP protocol data units */
+struct br_mrp_tlv_hdr {
+ __u8 type;
+ __u8 length;
+};
+
+struct br_mrp_common_hdr {
+ __be16 seq_id;
+ __u8 domain[MRP_DOMAIN_UUID_LENGTH];
+};
+
+struct br_mrp_ring_test_hdr {
+ __be16 prio;
+ __u8 sa[ETH_ALEN];
+ __be16 port_role;
+ __be16 state;
+ __be16 transitions;
+ __be32 timestamp;
+} __attribute__((__packed__));
+
+struct br_mrp_in_test_hdr {
+ __be16 id;
+ __u8 sa[ETH_ALEN];
+ __be16 port_role;
+ __be16 state;
+ __be16 transitions;
+ __be32 timestamp;
+} __attribute__((__packed__));
+
+struct br_mrp_oui_hdr {
+ __u8 oui[MRP_OUI_LENGTH];
+};
+
+struct br_mrp_sub_option1_hdr {
+ __u8 type;
+ __u8 data[MRP_MANUFACTURE_DATA_LENGTH];
+};
+
+#endif /* _BR_PRIVATE_MRP_H */
diff --git a/net/bridge/br_private_stp.h b/net/bridge/br_private_stp.h
index e29f01ac1adf..814cf1364cfb 100644
--- a/net/bridge/br_private_stp.h
+++ b/net/bridge/br_private_stp.h
@@ -1,15 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* Linux ethernet bridge
*
* Authors:
* Lennert Buytenhek <buytenh@gnu.org>
- *
- * $Id: br_private_stp.h,v 1.3 2001/02/05 06:03:47 davem Exp $
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#ifndef _BR_PRIVATE_STP_H
@@ -18,10 +12,22 @@
#define BPDU_TYPE_CONFIG 0
#define BPDU_TYPE_TCN 0x80
-struct br_config_bpdu
-{
- unsigned topology_change:1;
- unsigned topology_change_ack:1;
+/* IEEE 802.1D-1998 timer values */
+#define BR_MIN_HELLO_TIME (1*HZ)
+#define BR_MAX_HELLO_TIME (10*HZ)
+
+#define BR_MIN_FORWARD_DELAY (2*HZ)
+#define BR_MAX_FORWARD_DELAY (30*HZ)
+
+#define BR_MIN_MAX_AGE (6*HZ)
+#define BR_MAX_MAX_AGE (40*HZ)
+
+#define BR_MIN_PATH_COST 1
+#define BR_MAX_PATH_COST 65535
+
+struct br_config_bpdu {
+ unsigned int topology_change:1;
+ unsigned int topology_change_ack:1;
bridge_id root;
int root_path_cost;
bridge_id bridge_id;
@@ -41,18 +47,20 @@ static inline int br_is_designated_port(const struct net_bridge_port *p)
/* br_stp.c */
-extern void br_become_root_bridge(struct net_bridge *br);
-extern void br_config_bpdu_generation(struct net_bridge *);
-extern void br_configuration_update(struct net_bridge *);
-extern void br_port_state_selection(struct net_bridge *);
-extern void br_received_config_bpdu(struct net_bridge_port *p, struct br_config_bpdu *bpdu);
-extern void br_received_tcn_bpdu(struct net_bridge_port *p);
-extern void br_transmit_config(struct net_bridge_port *p);
-extern void br_transmit_tcn(struct net_bridge *br);
-extern void br_topology_change_detection(struct net_bridge *br);
+void br_become_root_bridge(struct net_bridge *br);
+void br_config_bpdu_generation(struct net_bridge *);
+void br_configuration_update(struct net_bridge *);
+void br_port_state_selection(struct net_bridge *);
+void br_received_config_bpdu(struct net_bridge_port *p,
+ const struct br_config_bpdu *bpdu);
+void br_received_tcn_bpdu(struct net_bridge_port *p);
+void br_transmit_config(struct net_bridge_port *p);
+void br_transmit_tcn(struct net_bridge *br);
+void br_topology_change_detection(struct net_bridge *br);
+void __br_set_topology_change(struct net_bridge *br, unsigned char val);
/* br_stp_bpdu.c */
-extern void br_send_config_bpdu(struct net_bridge_port *, struct br_config_bpdu *);
-extern void br_send_tcn_bpdu(struct net_bridge_port *);
+void br_send_config_bpdu(struct net_bridge_port *, struct br_config_bpdu *);
+void br_send_tcn_bpdu(struct net_bridge_port *);
#endif
diff --git a/net/bridge/br_private_tunnel.h b/net/bridge/br_private_tunnel.h
new file mode 100644
index 000000000000..efb096025151
--- /dev/null
+++ b/net/bridge/br_private_tunnel.h
@@ -0,0 +1,85 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Bridge per vlan tunnels
+ *
+ * Authors:
+ * Roopa Prabhu <roopa@cumulusnetworks.com>
+ */
+
+#ifndef _BR_PRIVATE_TUNNEL_H
+#define _BR_PRIVATE_TUNNEL_H
+
+struct vtunnel_info {
+ u32 tunid;
+ u16 vid;
+ u16 flags;
+};
+
+/* br_netlink_tunnel.c */
+int br_parse_vlan_tunnel_info(struct nlattr *attr,
+ struct vtunnel_info *tinfo);
+int br_process_vlan_tunnel_info(const struct net_bridge *br,
+ const struct net_bridge_port *p,
+ int cmd,
+ struct vtunnel_info *tinfo_curr,
+ struct vtunnel_info *tinfo_last,
+ bool *changed);
+int br_get_vlan_tunnel_info_size(struct net_bridge_vlan_group *vg);
+int br_fill_vlan_tunnel_info(struct sk_buff *skb,
+ struct net_bridge_vlan_group *vg);
+bool vlan_tunid_inrange(const struct net_bridge_vlan *v_curr,
+ const struct net_bridge_vlan *v_last);
+int br_vlan_tunnel_info(const struct net_bridge_port *p, int cmd,
+ u16 vid, u32 tun_id, bool *changed);
+
+#ifdef CONFIG_BRIDGE_VLAN_FILTERING
+/* br_vlan_tunnel.c */
+int vlan_tunnel_init(struct net_bridge_vlan_group *vg);
+void vlan_tunnel_deinit(struct net_bridge_vlan_group *vg);
+int nbp_vlan_tunnel_info_delete(const struct net_bridge_port *port, u16 vid);
+int nbp_vlan_tunnel_info_add(const struct net_bridge_port *port, u16 vid,
+ u32 tun_id);
+void nbp_vlan_tunnel_info_flush(struct net_bridge_port *port);
+void vlan_tunnel_info_del(struct net_bridge_vlan_group *vg,
+ struct net_bridge_vlan *vlan);
+void br_handle_ingress_vlan_tunnel(struct sk_buff *skb,
+ struct net_bridge_port *p,
+ struct net_bridge_vlan_group *vg);
+int br_handle_egress_vlan_tunnel(struct sk_buff *skb,
+ struct net_bridge_vlan *vlan);
+#else
+static inline int vlan_tunnel_init(struct net_bridge_vlan_group *vg)
+{
+ return 0;
+}
+
+static inline int nbp_vlan_tunnel_info_delete(const struct net_bridge_port *port,
+ u16 vid)
+{
+ return 0;
+}
+
+static inline int nbp_vlan_tunnel_info_add(const struct net_bridge_port *port,
+ u16 vid, u32 tun_id)
+{
+ return 0;
+}
+
+static inline void nbp_vlan_tunnel_info_flush(struct net_bridge_port *port)
+{
+}
+
+static inline void vlan_tunnel_info_del(struct net_bridge_vlan_group *vg,
+ struct net_bridge_vlan *vlan)
+{
+}
+
+static inline int br_handle_ingress_vlan_tunnel(struct sk_buff *skb,
+ struct net_bridge_port *p,
+ struct net_bridge_vlan_group *vg)
+{
+ return 0;
+}
+#endif
+
+#endif
diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c
index 04ca0639a95a..024210f95468 100644
--- a/net/bridge/br_stp.c
+++ b/net/bridge/br_stp.c
@@ -1,50 +1,96 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Spanning tree protocol; generic parts
* Linux ethernet bridge
*
* Authors:
* Lennert Buytenhek <buytenh@gnu.org>
- *
- * $Id: br_stp.c,v 1.4 2000/06/19 10:13:35 davem Exp $
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/kernel.h>
-#include <linux/smp_lock.h>
+#include <linux/rculist.h>
+#include <net/switchdev.h>
#include "br_private.h"
#include "br_private_stp.h"
/* since time values in bpdu are in jiffies and then scaled (1/256)
- * before sending, make sure that is at least one.
+ * before sending, make sure that is at least one STP tick.
*/
-#define MESSAGE_AGE_INCR ((HZ < 256) ? 1 : (HZ/256))
+#define MESSAGE_AGE_INCR ((HZ / 256) + 1)
-static const char *br_port_state_names[] = {
- [BR_STATE_DISABLED] = "disabled",
+static const char *const br_port_state_names[] = {
+ [BR_STATE_DISABLED] = "disabled",
[BR_STATE_LISTENING] = "listening",
- [BR_STATE_LEARNING] = "learning",
- [BR_STATE_FORWARDING] = "forwarding",
+ [BR_STATE_LEARNING] = "learning",
+ [BR_STATE_FORWARDING] = "forwarding",
[BR_STATE_BLOCKING] = "blocking",
};
-void br_log_state(const struct net_bridge_port *p)
+void br_set_state(struct net_bridge_port *p, unsigned int state)
{
- pr_info("%s: port %d(%s) entering %s state\n",
- p->br->dev->name, p->port_no, p->dev->name,
- br_port_state_names[p->state]);
+ struct switchdev_attr attr = {
+ .orig_dev = p->dev,
+ .id = SWITCHDEV_ATTR_ID_PORT_STP_STATE,
+ .flags = SWITCHDEV_F_DEFER,
+ .u.stp_state = state,
+ };
+ int err;
+
+ /* Don't change the state of the ports if they are driven by a different
+ * protocol.
+ */
+ if (p->flags & BR_MRP_AWARE)
+ return;
+
+ p->state = state;
+ if (br_opt_get(p->br, BROPT_MST_ENABLED)) {
+ err = br_mst_set_state(p, 0, state, NULL);
+ if (err)
+ br_warn(p->br, "error setting MST state on port %u(%s)\n",
+ p->port_no, netdev_name(p->dev));
+ }
+ err = switchdev_port_attr_set(p->dev, &attr, NULL);
+ if (err && err != -EOPNOTSUPP)
+ br_warn(p->br, "error setting offload STP state on port %u(%s)\n",
+ (unsigned int) p->port_no, p->dev->name);
+ else
+ br_info(p->br, "port %u(%s) entered %s state\n",
+ (unsigned int) p->port_no, p->dev->name,
+ br_port_state_names[p->state]);
+
+ if (p->br->stp_enabled == BR_KERNEL_STP) {
+ switch (p->state) {
+ case BR_STATE_BLOCKING:
+ p->stp_xstats.transition_blk++;
+ break;
+ case BR_STATE_FORWARDING:
+ p->stp_xstats.transition_fwd++;
+ break;
+ }
+ }
+}
+
+u8 br_port_get_stp_state(const struct net_device *dev)
+{
+ struct net_bridge_port *p;
+
+ ASSERT_RTNL();
+
+ p = br_port_get_rtnl(dev);
+ if (!p)
+ return BR_STATE_DISABLED;
+ return p->state;
}
+EXPORT_SYMBOL_GPL(br_port_get_stp_state);
/* called under bridge lock */
struct net_bridge_port *br_get_port(struct net_bridge *br, u16 port_no)
{
struct net_bridge_port *p;
- list_for_each_entry_rcu(p, &br->port_list, list) {
+ list_for_each_entry_rcu(p, &br->port_list, list,
+ lockdep_is_held(&br->lock)) {
if (p->port_no == port_no)
return p;
}
@@ -53,7 +99,7 @@ struct net_bridge_port *br_get_port(struct net_bridge *br, u16 port_no)
}
/* called under bridge lock */
-static int br_should_become_root_port(const struct net_bridge_port *p,
+static int br_should_become_root_port(const struct net_bridge_port *p,
u16 root_port)
{
struct net_bridge *br;
@@ -103,6 +149,20 @@ static int br_should_become_root_port(const struct net_bridge_port *p,
return 0;
}
+static void br_root_port_block(const struct net_bridge *br,
+ struct net_bridge_port *p)
+{
+
+ br_notice(br, "port %u(%s) tried to become root port (blocked)",
+ (unsigned int) p->port_no, p->dev->name);
+
+ br_set_state(p, BR_STATE_LISTENING);
+ br_ifinfo_notify(RTM_NEWLINK, NULL, p);
+
+ if (br->forward_delay > 0)
+ mod_timer(&p->forward_delay_timer, jiffies + br->forward_delay);
+}
+
/* called under bridge lock */
static void br_root_selection(struct net_bridge *br)
{
@@ -110,9 +170,13 @@ static void br_root_selection(struct net_bridge *br)
u16 root_port = 0;
list_for_each_entry(p, &br->port_list, list) {
- if (br_should_become_root_port(p, root_port))
- root_port = p->port_no;
+ if (!br_should_become_root_port(p, root_port))
+ continue;
+ if (p->flags & BR_ROOT_BLOCK)
+ br_root_port_block(br, p);
+ else
+ root_port = p->port_no;
}
br->root_port = root_port;
@@ -134,7 +198,7 @@ void br_become_root_bridge(struct net_bridge *br)
br->hello_time = br->bridge_hello_time;
br->forward_delay = br->bridge_forward_delay;
br_topology_change_detection(br);
- del_timer(&br->tcn_timer);
+ timer_delete(&br->tcn_timer);
if (br->dev->flags & IFF_UP) {
br_config_bpdu_generation(br);
@@ -148,7 +212,6 @@ void br_transmit_config(struct net_bridge_port *p)
struct br_config_bpdu bpdu;
struct net_bridge *br;
-
if (timer_pending(&p->hold_timer)) {
p->config_pending = 1;
return;
@@ -167,8 +230,7 @@ void br_transmit_config(struct net_bridge_port *p)
else {
struct net_bridge_port *root
= br_get_port(br, br->root_port);
- bpdu.message_age = br->max_age
- - (root->message_age_timer.expires - jiffies)
+ bpdu.message_age = (jiffies - root->designated_age)
+ MESSAGE_AGE_INCR;
}
bpdu.max_age = br->max_age;
@@ -179,37 +241,47 @@ void br_transmit_config(struct net_bridge_port *p)
br_send_config_bpdu(p, &bpdu);
p->topology_change_ack = 0;
p->config_pending = 0;
- mod_timer(&p->hold_timer, jiffies + BR_HOLD_TIME);
+ if (p->br->stp_enabled == BR_KERNEL_STP)
+ mod_timer(&p->hold_timer,
+ round_jiffies(jiffies + BR_HOLD_TIME));
}
}
/* called under bridge lock */
-static inline void br_record_config_information(struct net_bridge_port *p,
- const struct br_config_bpdu *bpdu)
+static void br_record_config_information(struct net_bridge_port *p,
+ const struct br_config_bpdu *bpdu)
{
p->designated_root = bpdu->root;
p->designated_cost = bpdu->root_path_cost;
p->designated_bridge = bpdu->bridge_id;
p->designated_port = bpdu->port_id;
+ p->designated_age = jiffies - bpdu->message_age;
- mod_timer(&p->message_age_timer, jiffies
- + (p->br->max_age - bpdu->message_age));
+ mod_timer(&p->message_age_timer, jiffies
+ + (bpdu->max_age - bpdu->message_age));
}
/* called under bridge lock */
-static inline void br_record_config_timeout_values(struct net_bridge *br,
+static void br_record_config_timeout_values(struct net_bridge *br,
const struct br_config_bpdu *bpdu)
{
br->max_age = bpdu->max_age;
br->hello_time = bpdu->hello_time;
br->forward_delay = bpdu->forward_delay;
- br->topology_change = bpdu->topology_change;
+ __br_set_topology_change(br, bpdu->topology_change);
}
/* called under bridge lock */
void br_transmit_tcn(struct net_bridge *br)
{
- br_send_tcn_bpdu(br_get_port(br, br->root_port));
+ struct net_bridge_port *p;
+
+ p = br_get_port(br, br->root_port);
+ if (p)
+ br_send_tcn_bpdu(p);
+ else
+ br_notice(br, "root port %u not found for topology notice\n",
+ br->root_port);
}
/* called under bridge lock */
@@ -256,7 +328,8 @@ static void br_designated_port_selection(struct net_bridge *br)
}
/* called under bridge lock */
-static int br_supersedes_port_info(struct net_bridge_port *p, struct br_config_bpdu *bpdu)
+static int br_supersedes_port_info(const struct net_bridge_port *p,
+ const struct br_config_bpdu *bpdu)
{
int t;
@@ -287,10 +360,10 @@ static int br_supersedes_port_info(struct net_bridge_port *p, struct br_config_b
}
/* called under bridge lock */
-static inline void br_topology_change_acknowledged(struct net_bridge *br)
+static void br_topology_change_acknowledged(struct net_bridge *br)
{
br->topology_change_detected = 0;
- del_timer(&br->tcn_timer);
+ timer_delete(&br->tcn_timer);
}
/* called under bridge lock */
@@ -298,11 +371,14 @@ void br_topology_change_detection(struct net_bridge *br)
{
int isroot = br_is_root_bridge(br);
- pr_info("%s: topology change detected, %s\n", br->dev->name,
+ if (br->stp_enabled != BR_KERNEL_STP)
+ return;
+
+ br_info(br, "topology change detected, %s\n",
isroot ? "propagating" : "sending tcn bpdu");
if (isroot) {
- br->topology_change = 1;
+ __br_set_topology_change(br, 1);
mod_timer(&br->topology_change_timer, jiffies
+ br->bridge_forward_delay + br->bridge_max_age);
} else if (!br->topology_change_detected) {
@@ -326,7 +402,7 @@ void br_config_bpdu_generation(struct net_bridge *br)
}
/* called under bridge lock */
-static inline void br_reply(struct net_bridge_port *p)
+static void br_reply(struct net_bridge_port *p)
{
br_transmit_config(p);
}
@@ -360,38 +436,54 @@ static void br_make_blocking(struct net_bridge_port *p)
p->state == BR_STATE_LEARNING)
br_topology_change_detection(p->br);
- p->state = BR_STATE_BLOCKING;
- br_log_state(p);
- del_timer(&p->forward_delay_timer);
+ br_set_state(p, BR_STATE_BLOCKING);
+ br_ifinfo_notify(RTM_NEWLINK, NULL, p);
+
+ timer_delete(&p->forward_delay_timer);
}
}
/* called under bridge lock */
static void br_make_forwarding(struct net_bridge_port *p)
{
- if (p->state == BR_STATE_BLOCKING) {
- if (p->br->stp_enabled) {
- p->state = BR_STATE_LISTENING;
- } else {
- p->state = BR_STATE_LEARNING;
- }
- br_log_state(p);
- mod_timer(&p->forward_delay_timer, jiffies + p->br->forward_delay); }
+ struct net_bridge *br = p->br;
+
+ if (p->state != BR_STATE_BLOCKING)
+ return;
+
+ if (br->stp_enabled == BR_NO_STP || br->forward_delay == 0) {
+ br_set_state(p, BR_STATE_FORWARDING);
+ br_topology_change_detection(br);
+ timer_delete(&p->forward_delay_timer);
+ } else if (br->stp_enabled == BR_KERNEL_STP)
+ br_set_state(p, BR_STATE_LISTENING);
+ else
+ br_set_state(p, BR_STATE_LEARNING);
+
+ br_ifinfo_notify(RTM_NEWLINK, NULL, p);
+
+ if (br->forward_delay != 0)
+ mod_timer(&p->forward_delay_timer, jiffies + br->forward_delay);
}
/* called under bridge lock */
void br_port_state_selection(struct net_bridge *br)
{
struct net_bridge_port *p;
+ unsigned int liveports = 0;
list_for_each_entry(p, &br->port_list, list) {
- if (p->state != BR_STATE_DISABLED) {
+ if (p->state == BR_STATE_DISABLED)
+ continue;
+
+ /* Don't change port states if userspace is handling STP */
+ if (br->stp_enabled != BR_USER_STP) {
if (p->port_no == br->root_port) {
p->config_pending = 0;
p->topology_change_ack = 0;
br_make_forwarding(p);
} else if (br_is_designated_port(p)) {
- del_timer(&p->message_age_timer);
+ timer_delete(&p->message_age_timer);
br_make_forwarding(p);
} else {
p->config_pending = 0;
@@ -400,22 +492,38 @@ void br_port_state_selection(struct net_bridge *br)
}
}
+ if (p->state != BR_STATE_BLOCKING)
+ br_multicast_enable_port(p);
+ /* Multicast is not disabled for the port when it goes in
+ * blocking state because the timers will expire and stop by
+ * themselves without sending more queries.
+ */
+ if (p->state == BR_STATE_FORWARDING)
+ ++liveports;
}
+
+ if (liveports == 0)
+ netif_carrier_off(br->dev);
+ else
+ netif_carrier_on(br->dev);
}
/* called under bridge lock */
-static inline void br_topology_change_acknowledge(struct net_bridge_port *p)
+static void br_topology_change_acknowledge(struct net_bridge_port *p)
{
p->topology_change_ack = 1;
br_transmit_config(p);
}
/* called under bridge lock */
-void br_received_config_bpdu(struct net_bridge_port *p, struct br_config_bpdu *bpdu)
+void br_received_config_bpdu(struct net_bridge_port *p,
+ const struct br_config_bpdu *bpdu)
{
struct net_bridge *br;
int was_root;
-
+
+ p->stp_xstats.rx_bpdu++;
+
br = p->br;
was_root = br_is_root_bridge(br);
@@ -425,12 +533,12 @@ void br_received_config_bpdu(struct net_bridge_port *p, struct br_config_bpdu *b
br_port_state_selection(br);
if (!br_is_root_bridge(br) && was_root) {
- del_timer(&br->hello_timer);
+ timer_delete(&br->hello_timer);
if (br->topology_change_detected) {
- del_timer(&br->topology_change_timer);
+ timer_delete(&br->topology_change_timer);
br_transmit_tcn(br);
- mod_timer(&br->tcn_timer,
+ mod_timer(&br->tcn_timer,
jiffies + br->bridge_hello_time);
}
}
@@ -441,19 +549,165 @@ void br_received_config_bpdu(struct net_bridge_port *p, struct br_config_bpdu *b
if (bpdu->topology_change_ack)
br_topology_change_acknowledged(br);
}
- } else if (br_is_designated_port(p)) {
- br_reply(p);
+ } else if (br_is_designated_port(p)) {
+ br_reply(p);
}
}
/* called under bridge lock */
void br_received_tcn_bpdu(struct net_bridge_port *p)
{
+ p->stp_xstats.rx_tcn++;
+
if (br_is_designated_port(p)) {
- pr_info("%s: received tcn bpdu on port %i(%s)\n",
- p->br->dev->name, p->port_no, p->dev->name);
+ br_info(p->br, "port %u(%s) received tcn bpdu\n",
+ (unsigned int) p->port_no, p->dev->name);
br_topology_change_detection(p->br);
br_topology_change_acknowledge(p);
}
}
+
+/* Change bridge STP parameter */
+int br_set_hello_time(struct net_bridge *br, unsigned long val)
+{
+ unsigned long t = clock_t_to_jiffies(val);
+
+ if (t < BR_MIN_HELLO_TIME || t > BR_MAX_HELLO_TIME)
+ return -ERANGE;
+
+ spin_lock_bh(&br->lock);
+ br->bridge_hello_time = t;
+ if (br_is_root_bridge(br))
+ br->hello_time = br->bridge_hello_time;
+ spin_unlock_bh(&br->lock);
+ return 0;
+}
+
+int br_set_max_age(struct net_bridge *br, unsigned long val)
+{
+ unsigned long t = clock_t_to_jiffies(val);
+
+ if (t < BR_MIN_MAX_AGE || t > BR_MAX_MAX_AGE)
+ return -ERANGE;
+
+ spin_lock_bh(&br->lock);
+ br->bridge_max_age = t;
+ if (br_is_root_bridge(br))
+ br->max_age = br->bridge_max_age;
+ spin_unlock_bh(&br->lock);
+ return 0;
+
+}
+
+/* called under bridge lock */
+int __set_ageing_time(struct net_device *dev, unsigned long t)
+{
+ struct switchdev_attr attr = {
+ .orig_dev = dev,
+ .id = SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME,
+ .flags = SWITCHDEV_F_SKIP_EOPNOTSUPP | SWITCHDEV_F_DEFER,
+ .u.ageing_time = jiffies_to_clock_t(t),
+ };
+ int err;
+
+ err = switchdev_port_attr_set(dev, &attr, NULL);
+ if (err && err != -EOPNOTSUPP)
+ return err;
+
+ return 0;
+}
+
+/* Set time interval that dynamic forwarding entries live
+ * For pure software bridge, allow values outside the 802.1
+ * standard specification for special cases:
+ * 0 - entry never ages (all permanent)
+ * 1 - entry disappears (no persistence)
+ *
+ * Offloaded switch entries maybe more restrictive
+ */
+int br_set_ageing_time(struct net_bridge *br, clock_t ageing_time)
+{
+ unsigned long t = clock_t_to_jiffies(ageing_time);
+ int err;
+
+ err = __set_ageing_time(br->dev, t);
+ if (err)
+ return err;
+
+ spin_lock_bh(&br->lock);
+ br->bridge_ageing_time = t;
+ br->ageing_time = t;
+ spin_unlock_bh(&br->lock);
+
+ mod_delayed_work(system_long_wq, &br->gc_work, 0);
+
+ return 0;
+}
+
+clock_t br_get_ageing_time(const struct net_device *br_dev)
+{
+ const struct net_bridge *br;
+
+ if (!netif_is_bridge_master(br_dev))
+ return 0;
+
+ br = netdev_priv(br_dev);
+
+ return jiffies_to_clock_t(br->ageing_time);
+}
+EXPORT_SYMBOL_GPL(br_get_ageing_time);
+
+/* called under bridge lock */
+void __br_set_topology_change(struct net_bridge *br, unsigned char val)
+{
+ unsigned long t;
+ int err;
+
+ if (br->stp_enabled == BR_KERNEL_STP && br->topology_change != val) {
+ /* On topology change, set the bridge ageing time to twice the
+ * forward delay. Otherwise, restore its default ageing time.
+ */
+
+ if (val) {
+ t = 2 * br->forward_delay;
+ br_debug(br, "decreasing ageing time to %lu\n", t);
+ } else {
+ t = br->bridge_ageing_time;
+ br_debug(br, "restoring ageing time to %lu\n", t);
+ }
+
+ err = __set_ageing_time(br->dev, t);
+ if (err)
+ br_warn(br, "error offloading ageing time\n");
+ else
+ br->ageing_time = t;
+ }
+
+ br->topology_change = val;
+}
+
+void __br_set_forward_delay(struct net_bridge *br, unsigned long t)
+{
+ br->bridge_forward_delay = t;
+ if (br_is_root_bridge(br))
+ br->forward_delay = br->bridge_forward_delay;
+}
+
+int br_set_forward_delay(struct net_bridge *br, unsigned long val)
+{
+ unsigned long t = clock_t_to_jiffies(val);
+ int err = -ERANGE;
+
+ spin_lock_bh(&br->lock);
+ if (br->stp_enabled != BR_NO_STP &&
+ (t < BR_MIN_FORWARD_DELAY || t > BR_MAX_FORWARD_DELAY))
+ goto unlock;
+
+ __br_set_forward_delay(br, t);
+ err = 0;
+
+unlock:
+ spin_unlock_bh(&br->lock);
+ return err;
+}
diff --git a/net/bridge/br_stp_bpdu.c b/net/bridge/br_stp_bpdu.c
index 068d8afbf0a7..7895489ac6fe 100644
--- a/net/bridge/br_stp_bpdu.c
+++ b/net/bridge/br_stp_bpdu.c
@@ -1,25 +1,23 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Spanning tree protocol; BPDU handling
* Linux ethernet bridge
*
* Authors:
* Lennert Buytenhek <buytenh@gnu.org>
- *
- * $Id: br_stp_bpdu.c,v 1.3 2001/11/10 02:35:25 davem Exp $
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/kernel.h>
#include <linux/netfilter_bridge.h>
#include <linux/etherdevice.h>
#include <linux/llc.h>
+#include <linux/slab.h>
+#include <linux/pkt_sched.h>
+#include <net/net_namespace.h>
#include <net/llc.h>
#include <net/llc_pdu.h>
-#include <asm/unaligned.h>
+#include <net/stp.h>
+#include <linux/unaligned.h>
#include "br_private.h"
#include "br_private_stp.h"
@@ -28,23 +26,27 @@
#define LLC_RESERVE sizeof(struct llc_pdu_un)
+static int br_send_bpdu_finish(struct net *net, struct sock *sk,
+ struct sk_buff *skb)
+{
+ return dev_queue_xmit(skb);
+}
+
static void br_send_bpdu(struct net_bridge_port *p,
- const unsigned char *data, int length)
+ const unsigned char *data, int length)
{
struct sk_buff *skb;
- if (!p->br->stp_enabled)
- return;
-
skb = dev_alloc_skb(length+LLC_RESERVE);
if (!skb)
return;
skb->dev = p->dev;
skb->protocol = htons(ETH_P_802_2);
+ skb->priority = TC_PRIO_CONTROL;
skb_reserve(skb, LLC_RESERVE);
- memcpy(__skb_put(skb, length), data, length);
+ __skb_put_data(skb, data, length);
llc_pdu_header_init(skb, LLC_PDU_TYPE_U, LLC_SAP_BSPAN,
LLC_SAP_BSPAN, LLC_PDU_CMD);
@@ -52,22 +54,25 @@ static void br_send_bpdu(struct net_bridge_port *p,
llc_mac_hdr_init(skb, p->dev->dev_addr, p->br->group_addr);
- NF_HOOK(PF_BRIDGE, NF_BR_LOCAL_OUT, skb, NULL, skb->dev,
- dev_queue_xmit);
+ skb_reset_mac_header(skb);
+
+ NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT,
+ dev_net(p->dev), NULL, skb, NULL, skb->dev,
+ br_send_bpdu_finish);
}
static inline void br_set_ticks(unsigned char *dest, int j)
{
unsigned long ticks = (STP_HZ * j)/ HZ;
- put_unaligned(htons(ticks), (__be16 *)dest);
+ put_unaligned_be16(ticks, dest);
}
static inline int br_get_ticks(const unsigned char *src)
{
- unsigned long ticks = ntohs(get_unaligned((__be16 *)src));
+ unsigned long ticks = get_unaligned_be16(src);
- return (ticks * HZ + STP_HZ - 1) / STP_HZ;
+ return DIV_ROUND_UP(ticks * HZ, STP_HZ);
}
/* called under bridge lock */
@@ -75,6 +80,9 @@ void br_send_config_bpdu(struct net_bridge_port *p, struct br_config_bpdu *bpdu)
{
unsigned char buf[35];
+ if (p->br->stp_enabled != BR_KERNEL_STP)
+ return;
+
buf[0] = 0;
buf[1] = 0;
buf[2] = 0;
@@ -110,6 +118,8 @@ void br_send_config_bpdu(struct net_bridge_port *p, struct br_config_bpdu *bpdu)
br_set_ticks(buf+33, bpdu->forward_delay);
br_send_bpdu(p, buf, 35);
+
+ p->stp_xstats.tx_bpdu++;
}
/* called under bridge lock */
@@ -117,35 +127,30 @@ void br_send_tcn_bpdu(struct net_bridge_port *p)
{
unsigned char buf[4];
+ if (p->br->stp_enabled != BR_KERNEL_STP)
+ return;
+
buf[0] = 0;
buf[1] = 0;
buf[2] = 0;
buf[3] = BPDU_TYPE_TCN;
br_send_bpdu(p, buf, 4);
+
+ p->stp_xstats.tx_tcn++;
}
/*
* Called from llc.
*
- * NO locks, but rcu_read_lock (preempt_disabled)
+ * NO locks, but rcu_read_lock
*/
-int br_stp_rcv(struct sk_buff *skb, struct net_device *dev,
- struct packet_type *pt, struct net_device *orig_dev)
+void br_stp_rcv(const struct stp_proto *proto, struct sk_buff *skb,
+ struct net_device *dev)
{
- const struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb);
- const unsigned char *dest = eth_hdr(skb)->h_dest;
- struct net_bridge_port *p = rcu_dereference(dev->br_port);
+ struct net_bridge_port *p;
struct net_bridge *br;
const unsigned char *buf;
- if (!p)
- goto err;
-
- if (pdu->ssap != LLC_SAP_BSPAN
- || pdu->dsap != LLC_SAP_BSPAN
- || pdu->ctrl_1 != LLC_PDU_TYPE_U)
- goto err;
-
if (!pskb_may_pull(skb, 4))
goto err;
@@ -154,16 +159,31 @@ int br_stp_rcv(struct sk_buff *skb, struct net_device *dev,
if (buf[0] != 0 || buf[1] != 0 || buf[2] != 0)
goto err;
+ p = br_port_get_check_rcu(dev);
+ if (!p)
+ goto err;
+
br = p->br;
spin_lock(&br->lock);
- if (p->state == BR_STATE_DISABLED
- || !br->stp_enabled
- || !(br->dev->flags & IFF_UP))
+ if (br->stp_enabled != BR_KERNEL_STP)
+ goto out;
+
+ if (!(br->dev->flags & IFF_UP))
+ goto out;
+
+ if (p->state == BR_STATE_DISABLED)
+ goto out;
+
+ if (!ether_addr_equal(eth_hdr(skb)->h_dest, br->group_addr))
goto out;
- if (compare_ether_addr(dest, br->group_addr) != 0)
+ if (p->flags & BR_BPDU_GUARD) {
+ br_notice(br, "BPDU received on blocked port %u(%s)\n",
+ (unsigned int) p->port_no, p->dev->name);
+ br_stp_disable_port(p);
goto out;
+ }
buf = skb_pull(skb, 3);
@@ -205,15 +225,23 @@ int br_stp_rcv(struct sk_buff *skb, struct net_device *dev,
bpdu.hello_time = br_get_ticks(buf+28);
bpdu.forward_delay = br_get_ticks(buf+30);
- br_received_config_bpdu(p, &bpdu);
- }
+ if (bpdu.message_age > bpdu.max_age) {
+ if (net_ratelimit())
+ br_notice(p->br,
+ "port %u config from %pM"
+ " (message_age %ul > max_age %ul)\n",
+ p->port_no,
+ eth_hdr(skb)->h_source,
+ bpdu.message_age, bpdu.max_age);
+ goto out;
+ }
- else if (buf[0] == BPDU_TYPE_TCN) {
+ br_received_config_bpdu(p, &bpdu);
+ } else if (buf[0] == BPDU_TYPE_TCN) {
br_received_tcn_bpdu(p);
}
out:
spin_unlock(&br->lock);
err:
kfree_skb(skb);
- return 0;
}
diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c
index d294224592db..c20a41bf253b 100644
--- a/net/bridge/br_stp_if.c
+++ b/net/bridge/br_stp_if.c
@@ -1,60 +1,64 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Spanning tree protocol; interface code
* Linux ethernet bridge
*
* Authors:
* Lennert Buytenhek <buytenh@gnu.org>
- *
- * $Id: br_stp_if.c,v 1.4 2001/04/14 21:14:39 davem Exp $
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/kernel.h>
-#include <linux/smp_lock.h>
+#include <linux/kmod.h>
#include <linux/etherdevice.h>
#include <linux/rtnetlink.h>
+#include <net/switchdev.h>
#include "br_private.h"
#include "br_private_stp.h"
/* Port id is composed of priority and port number.
- * NB: least significant bits of priority are dropped to
+ * NB: some bits of priority are dropped to
* make room for more ports.
*/
static inline port_id br_make_port_id(__u8 priority, __u16 port_no)
{
- return ((u16)priority << BR_PORT_BITS)
+ return ((u16)priority << BR_PORT_BITS)
| (port_no & ((1<<BR_PORT_BITS)-1));
}
+#define BR_MAX_PORT_PRIORITY ((u16)~0 >> BR_PORT_BITS)
+
/* called under bridge lock */
void br_init_port(struct net_bridge_port *p)
{
+ int err;
+
p->port_id = br_make_port_id(p->priority, p->port_no);
br_become_designated_port(p);
- p->state = BR_STATE_BLOCKING;
+ br_set_state(p, BR_STATE_BLOCKING);
p->topology_change_ack = 0;
p->config_pending = 0;
+
+ err = __set_ageing_time(p->dev, p->br->ageing_time);
+ if (err)
+ netdev_err(p->dev, "failed to offload ageing time\n");
}
-/* called under bridge lock */
+/* NO locks held */
void br_stp_enable_bridge(struct net_bridge *br)
{
struct net_bridge_port *p;
spin_lock_bh(&br->lock);
- mod_timer(&br->hello_timer, jiffies + br->hello_time);
- mod_timer(&br->gc_timer, jiffies + HZ/10);
-
+ if (br->stp_enabled == BR_KERNEL_STP)
+ mod_timer(&br->hello_timer, jiffies + br->hello_time);
+ mod_delayed_work(system_long_wq, &br->gc_work, HZ / 10);
+
br_config_bpdu_generation(br);
list_for_each_entry(p, &br->port_list, list) {
- if ((p->dev->flags & IFF_UP) && netif_carrier_ok(p->dev))
+ if (netif_running(p->dev) && netif_oper_up(p->dev))
br_stp_enable_port(p);
}
@@ -73,47 +77,45 @@ void br_stp_disable_bridge(struct net_bridge *br)
}
- br->topology_change = 0;
+ __br_set_topology_change(br, 0);
br->topology_change_detected = 0;
spin_unlock_bh(&br->lock);
- del_timer_sync(&br->hello_timer);
- del_timer_sync(&br->topology_change_timer);
- del_timer_sync(&br->tcn_timer);
- del_timer_sync(&br->gc_timer);
+ timer_delete_sync(&br->hello_timer);
+ timer_delete_sync(&br->topology_change_timer);
+ timer_delete_sync(&br->tcn_timer);
+ cancel_delayed_work_sync(&br->gc_work);
}
/* called under bridge lock */
void br_stp_enable_port(struct net_bridge_port *p)
{
br_init_port(p);
- br_ifinfo_notify(RTM_NEWLINK, p);
br_port_state_selection(p->br);
+ br_ifinfo_notify(RTM_NEWLINK, NULL, p);
}
/* called under bridge lock */
void br_stp_disable_port(struct net_bridge_port *p)
{
- struct net_bridge *br;
+ struct net_bridge *br = p->br;
int wasroot;
- br = p->br;
- printk(KERN_INFO "%s: port %i(%s) entering %s state\n",
- br->dev->name, p->port_no, p->dev->name, "disabled");
-
- br_ifinfo_notify(RTM_DELLINK, p);
-
wasroot = br_is_root_bridge(br);
br_become_designated_port(p);
- p->state = BR_STATE_DISABLED;
+ br_set_state(p, BR_STATE_DISABLED);
p->topology_change_ack = 0;
p->config_pending = 0;
- del_timer(&p->message_age_timer);
- del_timer(&p->forward_delay_timer);
- del_timer(&p->hold_timer);
+ br_ifinfo_notify(RTM_NEWLINK, NULL, p);
- br_fdb_delete_by_port(br, p, 0);
+ timer_delete(&p->message_age_timer);
+ timer_delete(&p->forward_delay_timer);
+ timer_delete(&p->hold_timer);
+
+ if (!rcu_access_pointer(p->backup_port))
+ br_fdb_delete_by_port(br, p, 0, 0);
+ br_multicast_disable_port(p);
br_configuration_update(br);
@@ -123,26 +125,122 @@ void br_stp_disable_port(struct net_bridge_port *p)
br_become_root_bridge(br);
}
+static int br_stp_call_user(struct net_bridge *br, char *arg)
+{
+ char *argv[] = { BR_STP_PROG, br->dev->name, arg, NULL };
+ char *envp[] = { NULL };
+ int rc;
+
+ /* call userspace STP and report program errors */
+ rc = call_usermodehelper(BR_STP_PROG, argv, envp, UMH_WAIT_PROC);
+ if (rc > 0) {
+ if (rc & 0xff)
+ br_debug(br, BR_STP_PROG " received signal %d\n",
+ rc & 0x7f);
+ else
+ br_debug(br, BR_STP_PROG " exited with code %d\n",
+ (rc >> 8) & 0xff);
+ }
+
+ return rc;
+}
+
+static void br_stp_start(struct net_bridge *br)
+{
+ int err = -ENOENT;
+
+ if (net_eq(dev_net(br->dev), &init_net))
+ err = br_stp_call_user(br, "start");
+
+ if (err && err != -ENOENT)
+ br_err(br, "failed to start userspace STP (%d)\n", err);
+
+ spin_lock_bh(&br->lock);
+
+ if (br->bridge_forward_delay < BR_MIN_FORWARD_DELAY)
+ __br_set_forward_delay(br, BR_MIN_FORWARD_DELAY);
+ else if (br->bridge_forward_delay > BR_MAX_FORWARD_DELAY)
+ __br_set_forward_delay(br, BR_MAX_FORWARD_DELAY);
+
+ if (!err) {
+ br->stp_enabled = BR_USER_STP;
+ br_debug(br, "userspace STP started\n");
+ } else {
+ br->stp_enabled = BR_KERNEL_STP;
+ br_debug(br, "using kernel STP\n");
+
+ /* To start timers on any ports left in blocking */
+ if (br->dev->flags & IFF_UP)
+ mod_timer(&br->hello_timer, jiffies + br->hello_time);
+ br_port_state_selection(br);
+ }
+
+ spin_unlock_bh(&br->lock);
+}
+
+static void br_stp_stop(struct net_bridge *br)
+{
+ int err;
+
+ if (br->stp_enabled == BR_USER_STP) {
+ err = br_stp_call_user(br, "stop");
+ if (err)
+ br_err(br, "failed to stop userspace STP (%d)\n", err);
+
+ /* To start timers on any ports left in blocking */
+ spin_lock_bh(&br->lock);
+ br_port_state_selection(br);
+ spin_unlock_bh(&br->lock);
+ }
+
+ br->stp_enabled = BR_NO_STP;
+}
+
+int br_stp_set_enabled(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
+{
+ ASSERT_RTNL();
+
+ if (br_mrp_enabled(br)) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "STP can't be enabled if MRP is already enabled");
+ return -EINVAL;
+ }
+
+ if (val) {
+ if (br->stp_enabled == BR_NO_STP)
+ br_stp_start(br);
+ } else {
+ if (br->stp_enabled != BR_NO_STP)
+ br_stp_stop(br);
+ }
+
+ return 0;
+}
+
/* called under bridge lock */
void br_stp_change_bridge_id(struct net_bridge *br, const unsigned char *addr)
{
- unsigned char oldaddr[6];
+ /* should be aligned on 2 bytes for ether_addr_equal() */
+ unsigned short oldaddr_aligned[ETH_ALEN >> 1];
+ unsigned char *oldaddr = (unsigned char *)oldaddr_aligned;
struct net_bridge_port *p;
int wasroot;
wasroot = br_is_root_bridge(br);
+ br_fdb_change_mac_address(br, addr);
+
memcpy(oldaddr, br->bridge_id.addr, ETH_ALEN);
memcpy(br->bridge_id.addr, addr, ETH_ALEN);
- memcpy(br->dev->dev_addr, addr, ETH_ALEN);
+ eth_hw_addr_set(br->dev, addr);
list_for_each_entry(p, &br->port_list, list) {
- if (!compare_ether_addr(p->designated_bridge.addr, oldaddr))
+ if (ether_addr_equal(p->designated_bridge.addr, oldaddr))
memcpy(p->designated_bridge.addr, addr, ETH_ALEN);
- if (!compare_ether_addr(p->designated_root.addr, oldaddr))
+ if (ether_addr_equal(p->designated_root.addr, oldaddr))
memcpy(p->designated_root.addr, addr, ETH_ALEN);
-
}
br_configuration_update(br);
@@ -151,14 +249,21 @@ void br_stp_change_bridge_id(struct net_bridge *br, const unsigned char *addr)
br_become_root_bridge(br);
}
-static const unsigned char br_mac_zero[6];
+/* should be aligned on 2 bytes for ether_addr_equal() */
+static const unsigned short br_mac_zero_aligned[ETH_ALEN >> 1];
/* called under bridge lock */
-void br_stp_recalculate_bridge_id(struct net_bridge *br)
+bool br_stp_recalculate_bridge_id(struct net_bridge *br)
{
+ const unsigned char *br_mac_zero =
+ (const unsigned char *)br_mac_zero_aligned;
const unsigned char *addr = br_mac_zero;
struct net_bridge_port *p;
+ /* user has chosen a value so keep it */
+ if (br->dev->addr_assign_type == NET_ADDR_SET)
+ return false;
+
list_for_each_entry(p, &br->port_list, list) {
if (addr == br_mac_zero ||
memcmp(p->dev->dev_addr, addr, ETH_ALEN) < 0)
@@ -166,16 +271,20 @@ void br_stp_recalculate_bridge_id(struct net_bridge *br)
}
- if (compare_ether_addr(br->bridge_id.addr, addr))
- br_stp_change_bridge_id(br, addr);
+ if (ether_addr_equal(br->bridge_id.addr, addr))
+ return false; /* no change */
+
+ br_stp_change_bridge_id(br, addr);
+ return true;
}
-/* called under bridge lock */
+/* Acquires and releases bridge lock */
void br_stp_set_bridge_priority(struct net_bridge *br, u16 newprio)
{
struct net_bridge_port *p;
int wasroot;
+ spin_lock_bh(&br->lock);
wasroot = br_is_root_bridge(br);
list_for_each_entry(p, &br->port_list, list) {
@@ -193,13 +302,18 @@ void br_stp_set_bridge_priority(struct net_bridge *br, u16 newprio)
br_port_state_selection(br);
if (br_is_root_bridge(br) && !wasroot)
br_become_root_bridge(br);
+ spin_unlock_bh(&br->lock);
}
/* called under bridge lock */
-void br_stp_set_port_priority(struct net_bridge_port *p, u8 newprio)
+int br_stp_set_port_priority(struct net_bridge_port *p, unsigned long newprio)
{
- port_id new_port_id = br_make_port_id(newprio, p->port_no);
+ port_id new_port_id;
+ if (newprio > BR_MAX_PORT_PRIORITY)
+ return -ERANGE;
+
+ new_port_id = br_make_port_id(newprio, p->port_no);
if (br_is_designated_port(p))
p->designated_port = new_port_id;
@@ -210,14 +324,22 @@ void br_stp_set_port_priority(struct net_bridge_port *p, u8 newprio)
br_become_designated_port(p);
br_port_state_selection(p->br);
}
+
+ return 0;
}
/* called under bridge lock */
-void br_stp_set_path_cost(struct net_bridge_port *p, u32 path_cost)
+int br_stp_set_path_cost(struct net_bridge_port *p, unsigned long path_cost)
{
+ if (path_cost < BR_MIN_PATH_COST ||
+ path_cost > BR_MAX_PATH_COST)
+ return -ERANGE;
+
+ p->flags |= BR_ADMIN_COST;
p->path_cost = path_cost;
br_configuration_update(p->br);
br_port_state_selection(p->br);
+ return 0;
}
ssize_t br_show_bridge_id(char *buf, const struct bridge_id *id)
diff --git a/net/bridge/br_stp_timer.c b/net/bridge/br_stp_timer.c
index d0fcde82c6fc..e5d453305381 100644
--- a/net/bridge/br_stp_timer.c
+++ b/net/bridge/br_stp_timer.c
@@ -1,21 +1,14 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Spanning tree protocol; timer-related code
* Linux ethernet bridge
*
* Authors:
* Lennert Buytenhek <buytenh@gnu.org>
- *
- * $Id: br_stp_timer.c,v 1.3 2000/05/05 02:17:17 davem Exp $
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/kernel.h>
#include <linux/times.h>
-#include <linux/smp_lock.h>
#include "br_private.h"
#include "br_private_stp.h"
@@ -27,30 +20,33 @@ static int br_is_designated_for_some_port(const struct net_bridge *br)
list_for_each_entry(p, &br->port_list, list) {
if (p->state != BR_STATE_DISABLED &&
- !memcmp(&p->designated_bridge, &br->bridge_id, 8))
+ !memcmp(&p->designated_bridge, &br->bridge_id, 8))
return 1;
}
return 0;
}
-static void br_hello_timer_expired(unsigned long arg)
+static void br_hello_timer_expired(struct timer_list *t)
{
- struct net_bridge *br = (struct net_bridge *)arg;
-
- pr_debug("%s: hello timer expired\n", br->dev->name);
+ struct net_bridge *br = timer_container_of(br, t, hello_timer);
+
+ br_debug(br, "hello timer expired\n");
spin_lock(&br->lock);
if (br->dev->flags & IFF_UP) {
br_config_bpdu_generation(br);
- mod_timer(&br->hello_timer, jiffies + br->hello_time);
+ if (br->stp_enabled == BR_KERNEL_STP)
+ mod_timer(&br->hello_timer,
+ round_jiffies(jiffies + br->hello_time));
}
spin_unlock(&br->lock);
}
-static void br_message_age_timer_expired(unsigned long arg)
+static void br_message_age_timer_expired(struct timer_list *t)
{
- struct net_bridge_port *p = (struct net_bridge_port *) arg;
+ struct net_bridge_port *p = timer_container_of(p, t,
+ message_age_timer);
struct net_bridge *br = p->br;
const bridge_id *id = &p->designated_bridge;
int was_root;
@@ -58,13 +54,9 @@ static void br_message_age_timer_expired(unsigned long arg)
if (p->state == BR_STATE_DISABLED)
return;
-
- pr_info("%s: neighbor %.2x%.2x.%.2x:%.2x:%.2x:%.2x:%.2x:%.2x lost on port %d(%s)\n",
- br->dev->name,
- id->prio[0], id->prio[1],
- id->addr[0], id->addr[1], id->addr[2],
- id->addr[3], id->addr[4], id->addr[5],
- p->port_no, p->dev->name);
+ br_info(br, "port %u(%s) neighbor %.2x%.2x.%pM lost\n",
+ (unsigned int) p->port_no, p->dev->name,
+ id->prio[0], id->prio[1], &id->addr);
/*
* According to the spec, the message age timer cannot be
@@ -85,58 +77,63 @@ static void br_message_age_timer_expired(unsigned long arg)
spin_unlock(&br->lock);
}
-static void br_forward_delay_timer_expired(unsigned long arg)
+static void br_forward_delay_timer_expired(struct timer_list *t)
{
- struct net_bridge_port *p = (struct net_bridge_port *) arg;
+ struct net_bridge_port *p = timer_container_of(p, t,
+ forward_delay_timer);
struct net_bridge *br = p->br;
- pr_debug("%s: %d(%s) forward delay timer\n",
- br->dev->name, p->port_no, p->dev->name);
+ br_debug(br, "port %u(%s) forward delay timer\n",
+ (unsigned int) p->port_no, p->dev->name);
spin_lock(&br->lock);
if (p->state == BR_STATE_LISTENING) {
- p->state = BR_STATE_LEARNING;
+ br_set_state(p, BR_STATE_LEARNING);
mod_timer(&p->forward_delay_timer,
jiffies + br->forward_delay);
} else if (p->state == BR_STATE_LEARNING) {
- p->state = BR_STATE_FORWARDING;
+ br_set_state(p, BR_STATE_FORWARDING);
if (br_is_designated_for_some_port(br))
br_topology_change_detection(br);
+ netif_carrier_on(br->dev);
}
- br_log_state(p);
+ rcu_read_lock();
+ br_ifinfo_notify(RTM_NEWLINK, NULL, p);
+ rcu_read_unlock();
spin_unlock(&br->lock);
}
-static void br_tcn_timer_expired(unsigned long arg)
+static void br_tcn_timer_expired(struct timer_list *t)
{
- struct net_bridge *br = (struct net_bridge *) arg;
+ struct net_bridge *br = timer_container_of(br, t, tcn_timer);
- pr_debug("%s: tcn timer expired\n", br->dev->name);
+ br_debug(br, "tcn timer expired\n");
spin_lock(&br->lock);
- if (br->dev->flags & IFF_UP) {
+ if (!br_is_root_bridge(br) && (br->dev->flags & IFF_UP)) {
br_transmit_tcn(br);
-
- mod_timer(&br->tcn_timer,jiffies + br->bridge_hello_time);
+
+ mod_timer(&br->tcn_timer, jiffies + br->bridge_hello_time);
}
spin_unlock(&br->lock);
}
-static void br_topology_change_timer_expired(unsigned long arg)
+static void br_topology_change_timer_expired(struct timer_list *t)
{
- struct net_bridge *br = (struct net_bridge *) arg;
+ struct net_bridge *br = timer_container_of(br, t,
+ topology_change_timer);
- pr_debug("%s: topo change timer expired\n", br->dev->name);
+ br_debug(br, "topo change timer expired\n");
spin_lock(&br->lock);
br->topology_change_detected = 0;
- br->topology_change = 0;
+ __br_set_topology_change(br, 0);
spin_unlock(&br->lock);
}
-static void br_hold_timer_expired(unsigned long arg)
+static void br_hold_timer_expired(struct timer_list *t)
{
- struct net_bridge_port *p = (struct net_bridge_port *) arg;
+ struct net_bridge_port *p = timer_container_of(p, t, hold_timer);
- pr_debug("%s: %d(%s) hold timer expired\n",
- p->br->dev->name, p->port_no, p->dev->name);
+ br_debug(p->br, "port %u(%s) hold timer expired\n",
+ (unsigned int) p->port_no, p->dev->name);
spin_lock(&p->br->lock);
if (p->config_pending)
@@ -146,34 +143,22 @@ static void br_hold_timer_expired(unsigned long arg)
void br_stp_timer_init(struct net_bridge *br)
{
- setup_timer(&br->hello_timer, br_hello_timer_expired,
- (unsigned long) br);
-
- setup_timer(&br->tcn_timer, br_tcn_timer_expired,
- (unsigned long) br);
-
- setup_timer(&br->topology_change_timer,
- br_topology_change_timer_expired,
- (unsigned long) br);
-
- setup_timer(&br->gc_timer, br_fdb_cleanup, (unsigned long) br);
+ timer_setup(&br->hello_timer, br_hello_timer_expired, 0);
+ timer_setup(&br->tcn_timer, br_tcn_timer_expired, 0);
+ timer_setup(&br->topology_change_timer,
+ br_topology_change_timer_expired, 0);
}
void br_stp_port_timer_init(struct net_bridge_port *p)
{
- setup_timer(&p->message_age_timer, br_message_age_timer_expired,
- (unsigned long) p);
-
- setup_timer(&p->forward_delay_timer, br_forward_delay_timer_expired,
- (unsigned long) p);
-
- setup_timer(&p->hold_timer, br_hold_timer_expired,
- (unsigned long) p);
-}
+ timer_setup(&p->message_age_timer, br_message_age_timer_expired, 0);
+ timer_setup(&p->forward_delay_timer, br_forward_delay_timer_expired, 0);
+ timer_setup(&p->hold_timer, br_hold_timer_expired, 0);
+}
/* Report ticks left (in USER_HZ) used for API */
unsigned long br_timer_value(const struct timer_list *timer)
{
return timer_pending(timer)
- ? jiffies_to_clock_t(timer->expires - jiffies) : 0;
+ ? jiffies_delta_to_clock_t(timer->expires - jiffies) : 0;
}
diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c
new file mode 100644
index 000000000000..fe3f7bbe86ee
--- /dev/null
+++ b/net/bridge/br_switchdev.c
@@ -0,0 +1,876 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/skbuff.h>
+#include <net/ip.h>
+#include <net/switchdev.h>
+
+#include "br_private.h"
+
+static struct static_key_false br_switchdev_tx_fwd_offload;
+
+static bool nbp_switchdev_can_offload_tx_fwd(const struct net_bridge_port *p,
+ const struct sk_buff *skb)
+{
+ if (!static_branch_unlikely(&br_switchdev_tx_fwd_offload))
+ return false;
+
+ if (br_multicast_igmp_type(skb))
+ return false;
+
+ return (p->flags & BR_TX_FWD_OFFLOAD) &&
+ (p->hwdom != BR_INPUT_SKB_CB(skb)->src_hwdom);
+}
+
+bool br_switchdev_frame_uses_tx_fwd_offload(struct sk_buff *skb)
+{
+ if (!static_branch_unlikely(&br_switchdev_tx_fwd_offload))
+ return false;
+
+ return BR_INPUT_SKB_CB(skb)->tx_fwd_offload;
+}
+
+void br_switchdev_frame_set_offload_fwd_mark(struct sk_buff *skb)
+{
+ skb->offload_fwd_mark = br_switchdev_frame_uses_tx_fwd_offload(skb);
+}
+
+/* Mark the frame for TX forwarding offload if this egress port supports it */
+void nbp_switchdev_frame_mark_tx_fwd_offload(const struct net_bridge_port *p,
+ struct sk_buff *skb)
+{
+ if (nbp_switchdev_can_offload_tx_fwd(p, skb))
+ BR_INPUT_SKB_CB(skb)->tx_fwd_offload = true;
+}
+
+/* Lazily adds the hwdom of the egress bridge port to the bit mask of hwdoms
+ * that the skb has been already forwarded to, to avoid further cloning to
+ * other ports in the same hwdom by making nbp_switchdev_allowed_egress()
+ * return false.
+ */
+void nbp_switchdev_frame_mark_tx_fwd_to_hwdom(const struct net_bridge_port *p,
+ struct sk_buff *skb)
+{
+ if (nbp_switchdev_can_offload_tx_fwd(p, skb))
+ set_bit(p->hwdom, &BR_INPUT_SKB_CB(skb)->fwd_hwdoms);
+}
+
+void nbp_switchdev_frame_mark(const struct net_bridge_port *p,
+ struct sk_buff *skb)
+{
+ if (p->hwdom)
+ BR_INPUT_SKB_CB(skb)->src_hwdom = p->hwdom;
+}
+
+bool nbp_switchdev_allowed_egress(const struct net_bridge_port *p,
+ const struct sk_buff *skb)
+{
+ struct br_input_skb_cb *cb = BR_INPUT_SKB_CB(skb);
+
+ return !test_bit(p->hwdom, &cb->fwd_hwdoms) &&
+ (!skb->offload_fwd_mark || cb->src_hwdom != p->hwdom);
+}
+
+/* Flags that can be offloaded to hardware */
+#define BR_PORT_FLAGS_HW_OFFLOAD (BR_LEARNING | BR_FLOOD | BR_PORT_MAB | \
+ BR_MCAST_FLOOD | BR_BCAST_FLOOD | BR_PORT_LOCKED | \
+ BR_HAIRPIN_MODE | BR_ISOLATED | BR_MULTICAST_TO_UNICAST)
+
+int br_switchdev_set_port_flag(struct net_bridge_port *p,
+ unsigned long flags,
+ unsigned long mask,
+ struct netlink_ext_ack *extack)
+{
+ struct switchdev_attr attr = {
+ .orig_dev = p->dev,
+ };
+ struct switchdev_notifier_port_attr_info info = {
+ .attr = &attr,
+ };
+ int err;
+
+ mask &= BR_PORT_FLAGS_HW_OFFLOAD;
+ if (!mask)
+ return 0;
+
+ attr.id = SWITCHDEV_ATTR_ID_PORT_PRE_BRIDGE_FLAGS;
+ attr.u.brport_flags.val = flags;
+ attr.u.brport_flags.mask = mask;
+
+ /* We run from atomic context here */
+ err = call_switchdev_notifiers(SWITCHDEV_PORT_ATTR_SET, p->dev,
+ &info.info, extack);
+ err = notifier_to_errno(err);
+ if (err == -EOPNOTSUPP)
+ return 0;
+
+ if (err) {
+ NL_SET_ERR_MSG_WEAK_MOD(extack,
+ "bridge flag offload is not supported");
+ return -EOPNOTSUPP;
+ }
+
+ attr.id = SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS;
+ attr.flags = SWITCHDEV_F_DEFER;
+
+ err = switchdev_port_attr_set(p->dev, &attr, extack);
+ if (err) {
+ NL_SET_ERR_MSG_WEAK_MOD(extack,
+ "error setting offload flag on port");
+ return err;
+ }
+
+ return 0;
+}
+
+static void br_switchdev_fdb_populate(struct net_bridge *br,
+ struct switchdev_notifier_fdb_info *item,
+ const struct net_bridge_fdb_entry *fdb,
+ const void *ctx)
+{
+ const struct net_bridge_port *p = READ_ONCE(fdb->dst);
+
+ item->addr = fdb->key.addr.addr;
+ item->vid = fdb->key.vlan_id;
+ item->added_by_user = test_bit(BR_FDB_ADDED_BY_USER, &fdb->flags);
+ item->offloaded = test_bit(BR_FDB_OFFLOADED, &fdb->flags);
+ item->is_local = test_bit(BR_FDB_LOCAL, &fdb->flags);
+ item->locked = false;
+ item->info.dev = (!p || item->is_local) ? br->dev : p->dev;
+ item->info.ctx = ctx;
+}
+
+void
+br_switchdev_fdb_notify(struct net_bridge *br,
+ const struct net_bridge_fdb_entry *fdb, int type)
+{
+ struct switchdev_notifier_fdb_info item;
+
+ if (test_bit(BR_FDB_LOCKED, &fdb->flags))
+ return;
+
+ /* Entries with these flags were created using ndm_state == NUD_REACHABLE,
+ * ndm_flags == NTF_MASTER( | NTF_STICKY), ext_flags == 0 by something
+ * equivalent to 'bridge fdb add ... master dynamic (sticky)'.
+ * Drivers don't know how to deal with these, so don't notify them to
+ * avoid confusing them.
+ */
+ if (test_bit(BR_FDB_ADDED_BY_USER, &fdb->flags) &&
+ !test_bit(BR_FDB_STATIC, &fdb->flags) &&
+ !test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags))
+ return;
+
+ br_switchdev_fdb_populate(br, &item, fdb, NULL);
+
+ switch (type) {
+ case RTM_DELNEIGH:
+ call_switchdev_notifiers(SWITCHDEV_FDB_DEL_TO_DEVICE,
+ item.info.dev, &item.info, NULL);
+ break;
+ case RTM_NEWNEIGH:
+ call_switchdev_notifiers(SWITCHDEV_FDB_ADD_TO_DEVICE,
+ item.info.dev, &item.info, NULL);
+ break;
+ }
+}
+
+int br_switchdev_port_vlan_add(struct net_device *dev, u16 vid, u16 flags,
+ bool changed, struct netlink_ext_ack *extack)
+{
+ struct switchdev_obj_port_vlan v = {
+ .obj.orig_dev = dev,
+ .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN,
+ .flags = flags,
+ .vid = vid,
+ .changed = changed,
+ };
+
+ return switchdev_port_obj_add(dev, &v.obj, extack);
+}
+
+int br_switchdev_port_vlan_del(struct net_device *dev, u16 vid)
+{
+ struct switchdev_obj_port_vlan v = {
+ .obj.orig_dev = dev,
+ .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN,
+ .vid = vid,
+ };
+
+ return switchdev_port_obj_del(dev, &v.obj);
+}
+
+static int nbp_switchdev_hwdom_set(struct net_bridge_port *joining)
+{
+ struct net_bridge *br = joining->br;
+ struct net_bridge_port *p;
+ int hwdom;
+
+ /* joining is yet to be added to the port list. */
+ list_for_each_entry(p, &br->port_list, list) {
+ if (netdev_phys_item_id_same(&joining->ppid, &p->ppid)) {
+ joining->hwdom = p->hwdom;
+ return 0;
+ }
+ }
+
+ hwdom = find_next_zero_bit(&br->busy_hwdoms, BR_HWDOM_MAX, 1);
+ if (hwdom >= BR_HWDOM_MAX)
+ return -EBUSY;
+
+ set_bit(hwdom, &br->busy_hwdoms);
+ joining->hwdom = hwdom;
+ return 0;
+}
+
+static void nbp_switchdev_hwdom_put(struct net_bridge_port *leaving)
+{
+ struct net_bridge *br = leaving->br;
+ struct net_bridge_port *p;
+
+ /* leaving is no longer in the port list. */
+ list_for_each_entry(p, &br->port_list, list) {
+ if (p->hwdom == leaving->hwdom)
+ return;
+ }
+
+ clear_bit(leaving->hwdom, &br->busy_hwdoms);
+}
+
+static int nbp_switchdev_add(struct net_bridge_port *p,
+ struct netdev_phys_item_id ppid,
+ bool tx_fwd_offload,
+ struct netlink_ext_ack *extack)
+{
+ int err;
+
+ if (p->offload_count) {
+ /* Prevent unsupported configurations such as a bridge port
+ * which is a bonding interface, and the member ports are from
+ * different hardware switches.
+ */
+ if (!netdev_phys_item_id_same(&p->ppid, &ppid)) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Same bridge port cannot be offloaded by two physical switches");
+ return -EBUSY;
+ }
+
+ /* Tolerate drivers that call switchdev_bridge_port_offload()
+ * more than once for the same bridge port, such as when the
+ * bridge port is an offloaded bonding/team interface.
+ */
+ p->offload_count++;
+
+ return 0;
+ }
+
+ p->ppid = ppid;
+ p->offload_count = 1;
+
+ err = nbp_switchdev_hwdom_set(p);
+ if (err)
+ return err;
+
+ if (tx_fwd_offload) {
+ p->flags |= BR_TX_FWD_OFFLOAD;
+ static_branch_inc(&br_switchdev_tx_fwd_offload);
+ }
+
+ return 0;
+}
+
+static void nbp_switchdev_del(struct net_bridge_port *p)
+{
+ if (WARN_ON(!p->offload_count))
+ return;
+
+ p->offload_count--;
+
+ if (p->offload_count)
+ return;
+
+ if (p->hwdom)
+ nbp_switchdev_hwdom_put(p);
+
+ if (p->flags & BR_TX_FWD_OFFLOAD) {
+ p->flags &= ~BR_TX_FWD_OFFLOAD;
+ static_branch_dec(&br_switchdev_tx_fwd_offload);
+ }
+}
+
+static int
+br_switchdev_fdb_replay_one(struct net_bridge *br, struct notifier_block *nb,
+ const struct net_bridge_fdb_entry *fdb,
+ unsigned long action, const void *ctx)
+{
+ struct switchdev_notifier_fdb_info item;
+ int err;
+
+ br_switchdev_fdb_populate(br, &item, fdb, ctx);
+
+ err = nb->notifier_call(nb, action, &item);
+ return notifier_to_errno(err);
+}
+
+static int
+br_switchdev_fdb_replay(const struct net_device *br_dev, const void *ctx,
+ bool adding, struct notifier_block *nb)
+{
+ struct net_bridge_fdb_entry *fdb;
+ struct net_bridge *br;
+ unsigned long action;
+ int err = 0;
+
+ if (!nb)
+ return 0;
+
+ if (!netif_is_bridge_master(br_dev))
+ return -EINVAL;
+
+ br = netdev_priv(br_dev);
+
+ if (adding)
+ action = SWITCHDEV_FDB_ADD_TO_DEVICE;
+ else
+ action = SWITCHDEV_FDB_DEL_TO_DEVICE;
+
+ rcu_read_lock();
+
+ hlist_for_each_entry_rcu(fdb, &br->fdb_list, fdb_node) {
+ err = br_switchdev_fdb_replay_one(br, nb, fdb, action, ctx);
+ if (err)
+ break;
+ }
+
+ rcu_read_unlock();
+
+ return err;
+}
+
+static int br_switchdev_vlan_attr_replay(struct net_device *br_dev,
+ const void *ctx,
+ struct notifier_block *nb,
+ struct netlink_ext_ack *extack)
+{
+ struct switchdev_notifier_port_attr_info attr_info = {
+ .info = {
+ .dev = br_dev,
+ .extack = extack,
+ .ctx = ctx,
+ },
+ };
+ struct net_bridge *br = netdev_priv(br_dev);
+ struct net_bridge_vlan_group *vg;
+ struct switchdev_attr attr;
+ struct net_bridge_vlan *v;
+ int err;
+
+ attr_info.attr = &attr;
+ attr.orig_dev = br_dev;
+
+ vg = br_vlan_group(br);
+ if (!vg)
+ return 0;
+
+ list_for_each_entry(v, &vg->vlan_list, vlist) {
+ if (v->msti) {
+ attr.id = SWITCHDEV_ATTR_ID_VLAN_MSTI;
+ attr.u.vlan_msti.vid = v->vid;
+ attr.u.vlan_msti.msti = v->msti;
+
+ err = nb->notifier_call(nb, SWITCHDEV_PORT_ATTR_SET,
+ &attr_info);
+ err = notifier_to_errno(err);
+ if (err)
+ return err;
+ }
+ }
+
+ return 0;
+}
+
+static int
+br_switchdev_vlan_replay_one(struct notifier_block *nb,
+ struct net_device *dev,
+ struct switchdev_obj_port_vlan *vlan,
+ const void *ctx, unsigned long action,
+ struct netlink_ext_ack *extack)
+{
+ struct switchdev_notifier_port_obj_info obj_info = {
+ .info = {
+ .dev = dev,
+ .extack = extack,
+ .ctx = ctx,
+ },
+ .obj = &vlan->obj,
+ };
+ int err;
+
+ err = nb->notifier_call(nb, action, &obj_info);
+ return notifier_to_errno(err);
+}
+
+static int br_switchdev_vlan_replay_group(struct notifier_block *nb,
+ struct net_device *dev,
+ struct net_bridge_vlan_group *vg,
+ const void *ctx, unsigned long action,
+ struct netlink_ext_ack *extack)
+{
+ struct net_bridge_vlan *v;
+ int err = 0;
+ u16 pvid;
+
+ if (!vg)
+ return 0;
+
+ pvid = br_get_pvid(vg);
+
+ list_for_each_entry(v, &vg->vlan_list, vlist) {
+ struct switchdev_obj_port_vlan vlan = {
+ .obj.orig_dev = dev,
+ .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN,
+ .flags = br_vlan_flags(v, pvid),
+ .vid = v->vid,
+ };
+
+ if (!br_vlan_should_use(v))
+ continue;
+
+ err = br_switchdev_vlan_replay_one(nb, dev, &vlan, ctx,
+ action, extack);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+static int br_switchdev_vlan_replay(struct net_device *br_dev,
+ const void *ctx, bool adding,
+ struct notifier_block *nb,
+ struct netlink_ext_ack *extack)
+{
+ struct net_bridge *br = netdev_priv(br_dev);
+ struct net_bridge_port *p;
+ unsigned long action;
+ int err;
+
+ ASSERT_RTNL();
+
+ if (!nb)
+ return 0;
+
+ if (!netif_is_bridge_master(br_dev))
+ return -EINVAL;
+
+ if (adding)
+ action = SWITCHDEV_PORT_OBJ_ADD;
+ else
+ action = SWITCHDEV_PORT_OBJ_DEL;
+
+ err = br_switchdev_vlan_replay_group(nb, br_dev, br_vlan_group(br),
+ ctx, action, extack);
+ if (err)
+ return err;
+
+ list_for_each_entry(p, &br->port_list, list) {
+ struct net_device *dev = p->dev;
+
+ err = br_switchdev_vlan_replay_group(nb, dev,
+ nbp_vlan_group(p),
+ ctx, action, extack);
+ if (err)
+ return err;
+ }
+
+ if (adding) {
+ err = br_switchdev_vlan_attr_replay(br_dev, ctx, nb, extack);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+struct br_switchdev_mdb_complete_info {
+ struct net_bridge_port *port;
+ struct br_ip ip;
+};
+
+static void br_switchdev_mdb_complete(struct net_device *dev, int err, void *priv)
+{
+ struct br_switchdev_mdb_complete_info *data = priv;
+ struct net_bridge_port_group __rcu **pp;
+ struct net_bridge_port_group *p;
+ struct net_bridge_mdb_entry *mp;
+ struct net_bridge_port *port = data->port;
+ struct net_bridge *br = port->br;
+ u8 old_flags;
+
+ if (err == -EOPNOTSUPP)
+ goto out_free;
+
+ spin_lock_bh(&br->multicast_lock);
+ mp = br_mdb_ip_get(br, &data->ip);
+ if (!mp)
+ goto out;
+ for (pp = &mp->ports; (p = mlock_dereference(*pp, br)) != NULL;
+ pp = &p->next) {
+ if (p->key.port != port)
+ continue;
+
+ old_flags = p->flags;
+ br_multicast_set_pg_offload_flags(p, !err);
+ if (br_mdb_should_notify(br, old_flags ^ p->flags))
+ br_mdb_flag_change_notify(br->dev, mp, p);
+ }
+out:
+ spin_unlock_bh(&br->multicast_lock);
+out_free:
+ kfree(priv);
+}
+
+static void br_switchdev_mdb_populate(struct switchdev_obj_port_mdb *mdb,
+ const struct net_bridge_mdb_entry *mp)
+{
+ if (mp->addr.proto == htons(ETH_P_IP))
+ ip_eth_mc_map(mp->addr.dst.ip4, mdb->addr);
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (mp->addr.proto == htons(ETH_P_IPV6))
+ ipv6_eth_mc_map(&mp->addr.dst.ip6, mdb->addr);
+#endif
+ else
+ ether_addr_copy(mdb->addr, mp->addr.dst.mac_addr);
+
+ mdb->vid = mp->addr.vid;
+}
+
+static void br_switchdev_host_mdb_one(struct net_device *dev,
+ struct net_device *lower_dev,
+ struct net_bridge_mdb_entry *mp,
+ int type)
+{
+ struct switchdev_obj_port_mdb mdb = {
+ .obj = {
+ .id = SWITCHDEV_OBJ_ID_HOST_MDB,
+ .flags = SWITCHDEV_F_DEFER,
+ .orig_dev = dev,
+ },
+ };
+
+ br_switchdev_mdb_populate(&mdb, mp);
+
+ switch (type) {
+ case RTM_NEWMDB:
+ switchdev_port_obj_add(lower_dev, &mdb.obj, NULL);
+ break;
+ case RTM_DELMDB:
+ switchdev_port_obj_del(lower_dev, &mdb.obj);
+ break;
+ }
+}
+
+static void br_switchdev_host_mdb(struct net_device *dev,
+ struct net_bridge_mdb_entry *mp, int type)
+{
+ struct net_device *lower_dev;
+ struct list_head *iter;
+
+ netdev_for_each_lower_dev(dev, lower_dev, iter)
+ br_switchdev_host_mdb_one(dev, lower_dev, mp, type);
+}
+
+static int
+br_switchdev_mdb_replay_one(struct notifier_block *nb, struct net_device *dev,
+ const struct switchdev_obj_port_mdb *mdb,
+ unsigned long action, const void *ctx,
+ struct netlink_ext_ack *extack)
+{
+ struct switchdev_notifier_port_obj_info obj_info = {
+ .info = {
+ .dev = dev,
+ .extack = extack,
+ .ctx = ctx,
+ },
+ .obj = &mdb->obj,
+ };
+ int err;
+
+ err = nb->notifier_call(nb, action, &obj_info);
+ return notifier_to_errno(err);
+}
+
+static int br_switchdev_mdb_queue_one(struct list_head *mdb_list,
+ struct net_device *dev,
+ unsigned long action,
+ enum switchdev_obj_id id,
+ const struct net_bridge_mdb_entry *mp,
+ struct net_device *orig_dev)
+{
+ struct switchdev_obj_port_mdb mdb = {
+ .obj = {
+ .id = id,
+ .orig_dev = orig_dev,
+ },
+ };
+ struct switchdev_obj_port_mdb *pmdb;
+
+ br_switchdev_mdb_populate(&mdb, mp);
+
+ if (action == SWITCHDEV_PORT_OBJ_ADD &&
+ switchdev_port_obj_act_is_deferred(dev, action, &mdb.obj)) {
+ /* This event is already in the deferred queue of
+ * events, so this replay must be elided, lest the
+ * driver receives duplicate events for it. This can
+ * only happen when replaying additions, since
+ * modifications are always immediately visible in
+ * br->mdb_list, whereas actual event delivery may be
+ * delayed.
+ */
+ return 0;
+ }
+
+ pmdb = kmemdup(&mdb, sizeof(mdb), GFP_ATOMIC);
+ if (!pmdb)
+ return -ENOMEM;
+
+ list_add_tail(&pmdb->obj.list, mdb_list);
+ return 0;
+}
+
+void br_switchdev_mdb_notify(struct net_device *dev,
+ struct net_bridge_mdb_entry *mp,
+ struct net_bridge_port_group *pg,
+ int type)
+{
+ struct br_switchdev_mdb_complete_info *complete_info;
+ struct switchdev_obj_port_mdb mdb = {
+ .obj = {
+ .id = SWITCHDEV_OBJ_ID_PORT_MDB,
+ .flags = SWITCHDEV_F_DEFER,
+ },
+ };
+
+ if (!pg)
+ return br_switchdev_host_mdb(dev, mp, type);
+
+ br_switchdev_mdb_populate(&mdb, mp);
+
+ mdb.obj.orig_dev = pg->key.port->dev;
+ switch (type) {
+ case RTM_NEWMDB:
+ complete_info = kmalloc(sizeof(*complete_info), GFP_ATOMIC);
+ if (!complete_info)
+ break;
+ complete_info->port = pg->key.port;
+ complete_info->ip = mp->addr;
+ mdb.obj.complete_priv = complete_info;
+ mdb.obj.complete = br_switchdev_mdb_complete;
+ if (switchdev_port_obj_add(pg->key.port->dev, &mdb.obj, NULL))
+ kfree(complete_info);
+ break;
+ case RTM_DELMDB:
+ switchdev_port_obj_del(pg->key.port->dev, &mdb.obj);
+ break;
+ }
+}
+#endif
+
+static int
+br_switchdev_mdb_replay(struct net_device *br_dev, struct net_device *dev,
+ const void *ctx, bool adding, struct notifier_block *nb,
+ struct netlink_ext_ack *extack)
+{
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+ const struct net_bridge_mdb_entry *mp;
+ struct switchdev_obj *obj, *tmp;
+ struct net_bridge *br;
+ unsigned long action;
+ LIST_HEAD(mdb_list);
+ int err = 0;
+
+ ASSERT_RTNL();
+
+ if (!nb)
+ return 0;
+
+ if (!netif_is_bridge_master(br_dev) || !netif_is_bridge_port(dev))
+ return -EINVAL;
+
+ br = netdev_priv(br_dev);
+
+ if (!br_opt_get(br, BROPT_MULTICAST_ENABLED))
+ return 0;
+
+ if (adding)
+ action = SWITCHDEV_PORT_OBJ_ADD;
+ else
+ action = SWITCHDEV_PORT_OBJ_DEL;
+
+ /* br_switchdev_mdb_queue_one() will take care to not queue a
+ * replay of an event that is already pending in the switchdev
+ * deferred queue. In order to safely determine that, there
+ * must be no new deferred MDB notifications enqueued for the
+ * duration of the MDB scan. Therefore, grab the write-side
+ * lock to avoid racing with any concurrent IGMP/MLD snooping.
+ */
+ spin_lock_bh(&br->multicast_lock);
+
+ hlist_for_each_entry(mp, &br->mdb_list, mdb_node) {
+ struct net_bridge_port_group __rcu * const *pp;
+ const struct net_bridge_port_group *p;
+
+ if (mp->host_joined) {
+ err = br_switchdev_mdb_queue_one(&mdb_list, dev, action,
+ SWITCHDEV_OBJ_ID_HOST_MDB,
+ mp, br_dev);
+ if (err) {
+ spin_unlock_bh(&br->multicast_lock);
+ goto out_free_mdb;
+ }
+ }
+
+ for (pp = &mp->ports; (p = mlock_dereference(*pp, br)) != NULL;
+ pp = &p->next) {
+ if (p->key.port->dev != dev)
+ continue;
+
+ err = br_switchdev_mdb_queue_one(&mdb_list, dev, action,
+ SWITCHDEV_OBJ_ID_PORT_MDB,
+ mp, dev);
+ if (err) {
+ spin_unlock_bh(&br->multicast_lock);
+ goto out_free_mdb;
+ }
+ }
+ }
+
+ spin_unlock_bh(&br->multicast_lock);
+
+ list_for_each_entry(obj, &mdb_list, list) {
+ err = br_switchdev_mdb_replay_one(nb, dev,
+ SWITCHDEV_OBJ_PORT_MDB(obj),
+ action, ctx, extack);
+ if (err == -EOPNOTSUPP)
+ err = 0;
+ if (err)
+ goto out_free_mdb;
+ }
+
+out_free_mdb:
+ list_for_each_entry_safe(obj, tmp, &mdb_list, list) {
+ list_del(&obj->list);
+ kfree(SWITCHDEV_OBJ_PORT_MDB(obj));
+ }
+
+ if (err)
+ return err;
+#endif
+
+ return 0;
+}
+
+static int nbp_switchdev_sync_objs(struct net_bridge_port *p, const void *ctx,
+ struct notifier_block *atomic_nb,
+ struct notifier_block *blocking_nb,
+ struct netlink_ext_ack *extack)
+{
+ struct net_device *br_dev = p->br->dev;
+ struct net_device *dev = p->dev;
+ int err;
+
+ err = br_switchdev_vlan_replay(br_dev, ctx, true, blocking_nb, extack);
+ if (err && err != -EOPNOTSUPP)
+ return err;
+
+ err = br_switchdev_mdb_replay(br_dev, dev, ctx, true, blocking_nb,
+ extack);
+ if (err) {
+ /* -EOPNOTSUPP not propagated from MDB replay. */
+ return err;
+ }
+
+ err = br_switchdev_fdb_replay(br_dev, ctx, true, atomic_nb);
+ if (err && err != -EOPNOTSUPP)
+ return err;
+
+ return 0;
+}
+
+static void nbp_switchdev_unsync_objs(struct net_bridge_port *p,
+ const void *ctx,
+ struct notifier_block *atomic_nb,
+ struct notifier_block *blocking_nb)
+{
+ struct net_device *br_dev = p->br->dev;
+ struct net_device *dev = p->dev;
+
+ br_switchdev_fdb_replay(br_dev, ctx, false, atomic_nb);
+
+ br_switchdev_mdb_replay(br_dev, dev, ctx, false, blocking_nb, NULL);
+
+ br_switchdev_vlan_replay(br_dev, ctx, false, blocking_nb, NULL);
+
+ /* Make sure that the device leaving this bridge has seen all
+ * relevant events before it is disassociated. In the normal
+ * case, when the device is directly attached to the bridge,
+ * this is covered by del_nbp(). If the association was indirect
+ * however, e.g. via a team or bond, and the device is leaving
+ * that intermediate device, then the bridge port remains in
+ * place.
+ */
+ switchdev_deferred_process();
+}
+
+/* Let the bridge know that this port is offloaded, so that it can assign a
+ * switchdev hardware domain to it.
+ */
+int br_switchdev_port_offload(struct net_bridge_port *p,
+ struct net_device *dev, const void *ctx,
+ struct notifier_block *atomic_nb,
+ struct notifier_block *blocking_nb,
+ bool tx_fwd_offload,
+ struct netlink_ext_ack *extack)
+{
+ struct netdev_phys_item_id ppid;
+ int err;
+
+ err = netif_get_port_parent_id(dev, &ppid, false);
+ if (err)
+ return err;
+
+ err = nbp_switchdev_add(p, ppid, tx_fwd_offload, extack);
+ if (err)
+ return err;
+
+ err = nbp_switchdev_sync_objs(p, ctx, atomic_nb, blocking_nb, extack);
+ if (err)
+ goto out_switchdev_del;
+
+ return 0;
+
+out_switchdev_del:
+ nbp_switchdev_del(p);
+
+ return err;
+}
+
+void br_switchdev_port_unoffload(struct net_bridge_port *p, const void *ctx,
+ struct notifier_block *atomic_nb,
+ struct notifier_block *blocking_nb)
+{
+ nbp_switchdev_unsync_objs(p, ctx, atomic_nb, blocking_nb);
+
+ nbp_switchdev_del(p);
+}
+
+int br_switchdev_port_replay(struct net_bridge_port *p,
+ struct net_device *dev, const void *ctx,
+ struct notifier_block *atomic_nb,
+ struct notifier_block *blocking_nb,
+ struct netlink_ext_ack *extack)
+{
+ return nbp_switchdev_sync_objs(p, ctx, atomic_nb, blocking_nb, extack);
+}
diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c
index de9d1a9473f2..cb4855ed9500 100644
--- a/net/bridge/br_sysfs_br.c
+++ b/net/bridge/br_sysfs_br.c
@@ -1,317 +1,996 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * Sysfs attributes of bridge ports
+ * Sysfs attributes of bridge
* Linux ethernet bridge
*
* Authors:
* Stephen Hemminger <shemminger@osdl.org>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/capability.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
#include <linux/if_bridge.h>
#include <linux/rtnetlink.h>
#include <linux/spinlock.h>
#include <linux/times.h>
+#include <linux/sched/signal.h>
#include "br_private.h"
-#define to_class_dev(obj) container_of(obj,struct class_device,kobj)
-#define to_net_dev(class) container_of(class, struct net_device, class_dev)
-#define to_bridge(cd) ((struct net_bridge *)(to_net_dev(cd)->priv))
+/* IMPORTANT: new bridge options must be added with netlink support only
+ * please do not add new sysfs entries
+ */
+
+#define to_bridge(cd) ((struct net_bridge *)netdev_priv(to_net_dev(cd)))
/*
* Common code for storing bridge parameters.
*/
-static ssize_t store_bridge_parm(struct class_device *cd,
+static ssize_t store_bridge_parm(struct device *d,
const char *buf, size_t len,
- void (*set)(struct net_bridge *, unsigned long))
+ int (*set)(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack))
{
- struct net_bridge *br = to_bridge(cd);
- char *endp;
+ struct net_bridge *br = to_bridge(d);
+ struct netlink_ext_ack extack = {0};
unsigned long val;
+ int err;
- if (!capable(CAP_NET_ADMIN))
+ if (!ns_capable(dev_net(br->dev)->user_ns, CAP_NET_ADMIN))
return -EPERM;
- val = simple_strtoul(buf, &endp, 0);
- if (endp == buf)
- return -EINVAL;
+ err = kstrtoul(buf, 0, &val);
+ if (err != 0)
+ return err;
+
+ if (!rtnl_trylock())
+ return restart_syscall();
+
+ err = (*set)(br, val, &extack);
+ if (!err)
+ netdev_state_change(br->dev);
+ if (extack._msg) {
+ if (err)
+ br_err(br, "%s\n", extack._msg);
+ else
+ br_warn(br, "%s\n", extack._msg);
+ }
+ rtnl_unlock();
- spin_lock_bh(&br->lock);
- (*set)(br, val);
- spin_unlock_bh(&br->lock);
- return len;
+ return err ? err : len;
}
-static ssize_t show_forward_delay(struct class_device *cd, char *buf)
+static ssize_t forward_delay_show(struct device *d,
+ struct device_attribute *attr, char *buf)
{
- struct net_bridge *br = to_bridge(cd);
+ struct net_bridge *br = to_bridge(d);
return sprintf(buf, "%lu\n", jiffies_to_clock_t(br->forward_delay));
}
-static void set_forward_delay(struct net_bridge *br, unsigned long val)
+static int set_forward_delay(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
{
- unsigned long delay = clock_t_to_jiffies(val);
- br->forward_delay = delay;
- if (br_is_root_bridge(br))
- br->bridge_forward_delay = delay;
+ return br_set_forward_delay(br, val);
}
-static ssize_t store_forward_delay(struct class_device *cd, const char *buf,
- size_t len)
+static ssize_t forward_delay_store(struct device *d,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
{
- return store_bridge_parm(cd, buf, len, set_forward_delay);
+ return store_bridge_parm(d, buf, len, set_forward_delay);
}
-static CLASS_DEVICE_ATTR(forward_delay, S_IRUGO | S_IWUSR,
- show_forward_delay, store_forward_delay);
+static DEVICE_ATTR_RW(forward_delay);
-static ssize_t show_hello_time(struct class_device *cd, char *buf)
+static ssize_t hello_time_show(struct device *d, struct device_attribute *attr,
+ char *buf)
{
return sprintf(buf, "%lu\n",
- jiffies_to_clock_t(to_bridge(cd)->hello_time));
+ jiffies_to_clock_t(to_bridge(d)->hello_time));
}
-static void set_hello_time(struct net_bridge *br, unsigned long val)
+static int set_hello_time(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
{
- unsigned long t = clock_t_to_jiffies(val);
- br->hello_time = t;
- if (br_is_root_bridge(br))
- br->bridge_hello_time = t;
+ return br_set_hello_time(br, val);
}
-static ssize_t store_hello_time(struct class_device *cd, const char *buf,
+static ssize_t hello_time_store(struct device *d,
+ struct device_attribute *attr, const char *buf,
size_t len)
{
- return store_bridge_parm(cd, buf, len, set_hello_time);
+ return store_bridge_parm(d, buf, len, set_hello_time);
}
+static DEVICE_ATTR_RW(hello_time);
-static CLASS_DEVICE_ATTR(hello_time, S_IRUGO | S_IWUSR, show_hello_time,
- store_hello_time);
-
-static ssize_t show_max_age(struct class_device *cd, char *buf)
+static ssize_t max_age_show(struct device *d, struct device_attribute *attr,
+ char *buf)
{
return sprintf(buf, "%lu\n",
- jiffies_to_clock_t(to_bridge(cd)->max_age));
+ jiffies_to_clock_t(to_bridge(d)->max_age));
}
-static void set_max_age(struct net_bridge *br, unsigned long val)
+static int set_max_age(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
{
- unsigned long t = clock_t_to_jiffies(val);
- br->max_age = t;
- if (br_is_root_bridge(br))
- br->bridge_max_age = t;
+ return br_set_max_age(br, val);
}
-static ssize_t store_max_age(struct class_device *cd, const char *buf,
- size_t len)
+static ssize_t max_age_store(struct device *d, struct device_attribute *attr,
+ const char *buf, size_t len)
{
- return store_bridge_parm(cd, buf, len, set_max_age);
+ return store_bridge_parm(d, buf, len, set_max_age);
}
+static DEVICE_ATTR_RW(max_age);
-static CLASS_DEVICE_ATTR(max_age, S_IRUGO | S_IWUSR, show_max_age,
- store_max_age);
-
-static ssize_t show_ageing_time(struct class_device *cd, char *buf)
+static ssize_t ageing_time_show(struct device *d,
+ struct device_attribute *attr, char *buf)
{
- struct net_bridge *br = to_bridge(cd);
+ struct net_bridge *br = to_bridge(d);
return sprintf(buf, "%lu\n", jiffies_to_clock_t(br->ageing_time));
}
-static void set_ageing_time(struct net_bridge *br, unsigned long val)
+static int set_ageing_time(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
{
- br->ageing_time = clock_t_to_jiffies(val);
+ return br_set_ageing_time(br, val);
}
-static ssize_t store_ageing_time(struct class_device *cd, const char *buf,
- size_t len)
+static ssize_t ageing_time_store(struct device *d,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
{
- return store_bridge_parm(cd, buf, len, set_ageing_time);
+ return store_bridge_parm(d, buf, len, set_ageing_time);
}
+static DEVICE_ATTR_RW(ageing_time);
-static CLASS_DEVICE_ATTR(ageing_time, S_IRUGO | S_IWUSR, show_ageing_time,
- store_ageing_time);
-static ssize_t show_stp_state(struct class_device *cd, char *buf)
+static ssize_t stp_state_show(struct device *d,
+ struct device_attribute *attr, char *buf)
{
- struct net_bridge *br = to_bridge(cd);
+ struct net_bridge *br = to_bridge(d);
return sprintf(buf, "%d\n", br->stp_enabled);
}
-static void set_stp_state(struct net_bridge *br, unsigned long val)
+
+static int set_stp_state(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
+{
+ return br_stp_set_enabled(br, val, extack);
+}
+
+static ssize_t stp_state_store(struct device *d,
+ struct device_attribute *attr, const char *buf,
+ size_t len)
+{
+ return store_bridge_parm(d, buf, len, set_stp_state);
+}
+static DEVICE_ATTR_RW(stp_state);
+
+static ssize_t group_fwd_mask_show(struct device *d,
+ struct device_attribute *attr,
+ char *buf)
{
- br->stp_enabled = val;
+ struct net_bridge *br = to_bridge(d);
+ return sprintf(buf, "%#x\n", br->group_fwd_mask);
}
-static ssize_t store_stp_state(struct class_device *cd,
- const char *buf, size_t len)
+static int set_group_fwd_mask(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
{
- return store_bridge_parm(cd, buf, len, set_stp_state);
+ if (val & BR_GROUPFWD_RESTRICTED)
+ return -EINVAL;
+
+ br->group_fwd_mask = val;
+
+ return 0;
}
-static CLASS_DEVICE_ATTR(stp_state, S_IRUGO | S_IWUSR, show_stp_state,
- store_stp_state);
+static ssize_t group_fwd_mask_store(struct device *d,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t len)
+{
+ return store_bridge_parm(d, buf, len, set_group_fwd_mask);
+}
+static DEVICE_ATTR_RW(group_fwd_mask);
-static ssize_t show_priority(struct class_device *cd, char *buf)
+static ssize_t priority_show(struct device *d, struct device_attribute *attr,
+ char *buf)
{
- struct net_bridge *br = to_bridge(cd);
+ struct net_bridge *br = to_bridge(d);
return sprintf(buf, "%d\n",
(br->bridge_id.prio[0] << 8) | br->bridge_id.prio[1]);
}
-static void set_priority(struct net_bridge *br, unsigned long val)
+static int set_priority(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
{
br_stp_set_bridge_priority(br, (u16) val);
+ return 0;
}
-static ssize_t store_priority(struct class_device *cd,
- const char *buf, size_t len)
+static ssize_t priority_store(struct device *d, struct device_attribute *attr,
+ const char *buf, size_t len)
{
- return store_bridge_parm(cd, buf, len, set_priority);
+ return store_bridge_parm(d, buf, len, set_priority);
}
-static CLASS_DEVICE_ATTR(priority, S_IRUGO | S_IWUSR, show_priority,
- store_priority);
+static DEVICE_ATTR_RW(priority);
-static ssize_t show_root_id(struct class_device *cd, char *buf)
+static ssize_t root_id_show(struct device *d, struct device_attribute *attr,
+ char *buf)
{
- return br_show_bridge_id(buf, &to_bridge(cd)->designated_root);
+ return br_show_bridge_id(buf, &to_bridge(d)->designated_root);
}
-static CLASS_DEVICE_ATTR(root_id, S_IRUGO, show_root_id, NULL);
+static DEVICE_ATTR_RO(root_id);
-static ssize_t show_bridge_id(struct class_device *cd, char *buf)
+static ssize_t bridge_id_show(struct device *d, struct device_attribute *attr,
+ char *buf)
{
- return br_show_bridge_id(buf, &to_bridge(cd)->bridge_id);
+ return br_show_bridge_id(buf, &to_bridge(d)->bridge_id);
}
-static CLASS_DEVICE_ATTR(bridge_id, S_IRUGO, show_bridge_id, NULL);
+static DEVICE_ATTR_RO(bridge_id);
-static ssize_t show_root_port(struct class_device *cd, char *buf)
+static ssize_t root_port_show(struct device *d, struct device_attribute *attr,
+ char *buf)
{
- return sprintf(buf, "%d\n", to_bridge(cd)->root_port);
+ return sprintf(buf, "%d\n", to_bridge(d)->root_port);
}
-static CLASS_DEVICE_ATTR(root_port, S_IRUGO, show_root_port, NULL);
+static DEVICE_ATTR_RO(root_port);
-static ssize_t show_root_path_cost(struct class_device *cd, char *buf)
+static ssize_t root_path_cost_show(struct device *d,
+ struct device_attribute *attr, char *buf)
{
- return sprintf(buf, "%d\n", to_bridge(cd)->root_path_cost);
+ return sprintf(buf, "%d\n", to_bridge(d)->root_path_cost);
}
-static CLASS_DEVICE_ATTR(root_path_cost, S_IRUGO, show_root_path_cost, NULL);
+static DEVICE_ATTR_RO(root_path_cost);
-static ssize_t show_topology_change(struct class_device *cd, char *buf)
+static ssize_t topology_change_show(struct device *d,
+ struct device_attribute *attr, char *buf)
{
- return sprintf(buf, "%d\n", to_bridge(cd)->topology_change);
+ return sprintf(buf, "%d\n", to_bridge(d)->topology_change);
}
-static CLASS_DEVICE_ATTR(topology_change, S_IRUGO, show_topology_change, NULL);
+static DEVICE_ATTR_RO(topology_change);
-static ssize_t show_topology_change_detected(struct class_device *cd, char *buf)
+static ssize_t topology_change_detected_show(struct device *d,
+ struct device_attribute *attr,
+ char *buf)
{
- struct net_bridge *br = to_bridge(cd);
+ struct net_bridge *br = to_bridge(d);
return sprintf(buf, "%d\n", br->topology_change_detected);
}
-static CLASS_DEVICE_ATTR(topology_change_detected, S_IRUGO, show_topology_change_detected, NULL);
+static DEVICE_ATTR_RO(topology_change_detected);
-static ssize_t show_hello_timer(struct class_device *cd, char *buf)
+static ssize_t hello_timer_show(struct device *d,
+ struct device_attribute *attr, char *buf)
{
- struct net_bridge *br = to_bridge(cd);
+ struct net_bridge *br = to_bridge(d);
return sprintf(buf, "%ld\n", br_timer_value(&br->hello_timer));
}
-static CLASS_DEVICE_ATTR(hello_timer, S_IRUGO, show_hello_timer, NULL);
+static DEVICE_ATTR_RO(hello_timer);
-static ssize_t show_tcn_timer(struct class_device *cd, char *buf)
+static ssize_t tcn_timer_show(struct device *d, struct device_attribute *attr,
+ char *buf)
{
- struct net_bridge *br = to_bridge(cd);
+ struct net_bridge *br = to_bridge(d);
return sprintf(buf, "%ld\n", br_timer_value(&br->tcn_timer));
}
-static CLASS_DEVICE_ATTR(tcn_timer, S_IRUGO, show_tcn_timer, NULL);
+static DEVICE_ATTR_RO(tcn_timer);
-static ssize_t show_topology_change_timer(struct class_device *cd, char *buf)
+static ssize_t topology_change_timer_show(struct device *d,
+ struct device_attribute *attr,
+ char *buf)
{
- struct net_bridge *br = to_bridge(cd);
+ struct net_bridge *br = to_bridge(d);
return sprintf(buf, "%ld\n", br_timer_value(&br->topology_change_timer));
}
-static CLASS_DEVICE_ATTR(topology_change_timer, S_IRUGO, show_topology_change_timer, NULL);
+static DEVICE_ATTR_RO(topology_change_timer);
-static ssize_t show_gc_timer(struct class_device *cd, char *buf)
+static ssize_t gc_timer_show(struct device *d, struct device_attribute *attr,
+ char *buf)
{
- struct net_bridge *br = to_bridge(cd);
- return sprintf(buf, "%ld\n", br_timer_value(&br->gc_timer));
+ struct net_bridge *br = to_bridge(d);
+ return sprintf(buf, "%ld\n", br_timer_value(&br->gc_work.timer));
}
-static CLASS_DEVICE_ATTR(gc_timer, S_IRUGO, show_gc_timer, NULL);
+static DEVICE_ATTR_RO(gc_timer);
-static ssize_t show_group_addr(struct class_device *cd, char *buf)
+static ssize_t group_addr_show(struct device *d,
+ struct device_attribute *attr, char *buf)
{
- struct net_bridge *br = to_bridge(cd);
- return sprintf(buf, "%x:%x:%x:%x:%x:%x\n",
- br->group_addr[0], br->group_addr[1],
- br->group_addr[2], br->group_addr[3],
- br->group_addr[4], br->group_addr[5]);
+ struct net_bridge *br = to_bridge(d);
+ return sprintf(buf, "%pM\n", br->group_addr);
}
-static ssize_t store_group_addr(struct class_device *cd, const char *buf,
- size_t len)
+static ssize_t group_addr_store(struct device *d,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
{
- struct net_bridge *br = to_bridge(cd);
- unsigned new_addr[6];
- int i;
+ struct net_bridge *br = to_bridge(d);
+ u8 new_addr[6];
- if (!capable(CAP_NET_ADMIN))
+ if (!ns_capable(dev_net(br->dev)->user_ns, CAP_NET_ADMIN))
return -EPERM;
- if (sscanf(buf, "%x:%x:%x:%x:%x:%x",
- &new_addr[0], &new_addr[1], &new_addr[2],
- &new_addr[3], &new_addr[4], &new_addr[5]) != 6)
+ if (!mac_pton(buf, new_addr))
return -EINVAL;
- /* Must be 01:80:c2:00:00:0X */
- for (i = 0; i < 5; i++)
- if (new_addr[i] != br_group_address[i])
- return -EINVAL;
-
- if (new_addr[5] & ~0xf)
+ if (!is_link_local_ether_addr(new_addr))
return -EINVAL;
- if (new_addr[5] == 1 /* 802.3x Pause address */
- || new_addr[5] == 2 /* 802.3ad Slow protocols */
- || new_addr[5] == 3) /* 802.1X PAE address */
+ if (new_addr[5] == 1 || /* 802.3x Pause address */
+ new_addr[5] == 2 || /* 802.3ad Slow protocols */
+ new_addr[5] == 3) /* 802.1X PAE address */
return -EINVAL;
+ if (!rtnl_trylock())
+ return restart_syscall();
+
spin_lock_bh(&br->lock);
- for (i = 0; i < 6; i++)
- br->group_addr[i] = new_addr[i];
+ ether_addr_copy(br->group_addr, new_addr);
spin_unlock_bh(&br->lock);
+
+ br_opt_toggle(br, BROPT_GROUP_ADDR_SET, true);
+ br_recalculate_fwd_mask(br);
+ netdev_state_change(br->dev);
+
+ rtnl_unlock();
+
return len;
}
-static CLASS_DEVICE_ATTR(group_addr, S_IRUGO | S_IWUSR,
- show_group_addr, store_group_addr);
+static DEVICE_ATTR_RW(group_addr);
+
+static int set_flush(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
+{
+ struct net_bridge_fdb_flush_desc desc = {
+ .flags_mask = BIT(BR_FDB_STATIC)
+ };
+
+ br_fdb_flush(br, &desc);
+ return 0;
+}
+
+static ssize_t flush_store(struct device *d,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return store_bridge_parm(d, buf, len, set_flush);
+}
+static DEVICE_ATTR_WO(flush);
+
+static ssize_t no_linklocal_learn_show(struct device *d,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+ return sprintf(buf, "%d\n", br_boolopt_get(br, BR_BOOLOPT_NO_LL_LEARN));
+}
+
+static int set_no_linklocal_learn(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
+{
+ return br_boolopt_toggle(br, BR_BOOLOPT_NO_LL_LEARN, !!val, extack);
+}
+
+static ssize_t no_linklocal_learn_store(struct device *d,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return store_bridge_parm(d, buf, len, set_no_linklocal_learn);
+}
+static DEVICE_ATTR_RW(no_linklocal_learn);
+
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+static ssize_t multicast_router_show(struct device *d,
+ struct device_attribute *attr, char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+ return sprintf(buf, "%d\n", br->multicast_ctx.multicast_router);
+}
+
+static int set_multicast_router(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
+{
+ return br_multicast_set_router(&br->multicast_ctx, val);
+}
+
+static ssize_t multicast_router_store(struct device *d,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return store_bridge_parm(d, buf, len, set_multicast_router);
+}
+static DEVICE_ATTR_RW(multicast_router);
+
+static ssize_t multicast_snooping_show(struct device *d,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+ return sprintf(buf, "%d\n", br_opt_get(br, BROPT_MULTICAST_ENABLED));
+}
+
+static ssize_t multicast_snooping_store(struct device *d,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return store_bridge_parm(d, buf, len, br_multicast_toggle);
+}
+static DEVICE_ATTR_RW(multicast_snooping);
+
+static ssize_t multicast_query_use_ifaddr_show(struct device *d,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+ return sprintf(buf, "%d\n",
+ br_opt_get(br, BROPT_MULTICAST_QUERY_USE_IFADDR));
+}
+
+static int set_query_use_ifaddr(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
+{
+ br_opt_toggle(br, BROPT_MULTICAST_QUERY_USE_IFADDR, !!val);
+ return 0;
+}
+
+static ssize_t
+multicast_query_use_ifaddr_store(struct device *d,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return store_bridge_parm(d, buf, len, set_query_use_ifaddr);
+}
+static DEVICE_ATTR_RW(multicast_query_use_ifaddr);
+
+static ssize_t multicast_querier_show(struct device *d,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+ return sprintf(buf, "%d\n", br->multicast_ctx.multicast_querier);
+}
+
+static int set_multicast_querier(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
+{
+ return br_multicast_set_querier(&br->multicast_ctx, val);
+}
+
+static ssize_t multicast_querier_store(struct device *d,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return store_bridge_parm(d, buf, len, set_multicast_querier);
+}
+static DEVICE_ATTR_RW(multicast_querier);
+
+static ssize_t hash_elasticity_show(struct device *d,
+ struct device_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%u\n", RHT_ELASTICITY);
+}
+
+static int set_elasticity(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
+{
+ /* 16 is RHT_ELASTICITY */
+ NL_SET_ERR_MSG_MOD(extack,
+ "the hash_elasticity option has been deprecated and is always 16");
+ return 0;
+}
+
+static ssize_t hash_elasticity_store(struct device *d,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return store_bridge_parm(d, buf, len, set_elasticity);
+}
+static DEVICE_ATTR_RW(hash_elasticity);
+
+static ssize_t hash_max_show(struct device *d, struct device_attribute *attr,
+ char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+ return sprintf(buf, "%u\n", br->hash_max);
+}
+
+static int set_hash_max(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
+{
+ br->hash_max = val;
+ return 0;
+}
+
+static ssize_t hash_max_store(struct device *d, struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return store_bridge_parm(d, buf, len, set_hash_max);
+}
+static DEVICE_ATTR_RW(hash_max);
+
+static ssize_t multicast_igmp_version_show(struct device *d,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+
+ return sprintf(buf, "%u\n", br->multicast_ctx.multicast_igmp_version);
+}
+
+static int set_multicast_igmp_version(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
+{
+ return br_multicast_set_igmp_version(&br->multicast_ctx, val);
+}
+
+static ssize_t multicast_igmp_version_store(struct device *d,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return store_bridge_parm(d, buf, len, set_multicast_igmp_version);
+}
+static DEVICE_ATTR_RW(multicast_igmp_version);
+
+static ssize_t multicast_last_member_count_show(struct device *d,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+ return sprintf(buf, "%u\n", br->multicast_ctx.multicast_last_member_count);
+}
+
+static int set_last_member_count(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
+{
+ br->multicast_ctx.multicast_last_member_count = val;
+ return 0;
+}
+
+static ssize_t multicast_last_member_count_store(struct device *d,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return store_bridge_parm(d, buf, len, set_last_member_count);
+}
+static DEVICE_ATTR_RW(multicast_last_member_count);
+
+static ssize_t multicast_startup_query_count_show(
+ struct device *d, struct device_attribute *attr, char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+ return sprintf(buf, "%u\n", br->multicast_ctx.multicast_startup_query_count);
+}
+
+static int set_startup_query_count(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
+{
+ br->multicast_ctx.multicast_startup_query_count = val;
+ return 0;
+}
+
+static ssize_t multicast_startup_query_count_store(
+ struct device *d, struct device_attribute *attr, const char *buf,
+ size_t len)
+{
+ return store_bridge_parm(d, buf, len, set_startup_query_count);
+}
+static DEVICE_ATTR_RW(multicast_startup_query_count);
+static ssize_t multicast_last_member_interval_show(
+ struct device *d, struct device_attribute *attr, char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+ return sprintf(buf, "%lu\n",
+ jiffies_to_clock_t(br->multicast_ctx.multicast_last_member_interval));
+}
+
+static int set_last_member_interval(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
+{
+ br->multicast_ctx.multicast_last_member_interval = clock_t_to_jiffies(val);
+ return 0;
+}
+
+static ssize_t multicast_last_member_interval_store(
+ struct device *d, struct device_attribute *attr, const char *buf,
+ size_t len)
+{
+ return store_bridge_parm(d, buf, len, set_last_member_interval);
+}
+static DEVICE_ATTR_RW(multicast_last_member_interval);
+
+static ssize_t multicast_membership_interval_show(
+ struct device *d, struct device_attribute *attr, char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+ return sprintf(buf, "%lu\n",
+ jiffies_to_clock_t(br->multicast_ctx.multicast_membership_interval));
+}
+
+static int set_membership_interval(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
+{
+ br->multicast_ctx.multicast_membership_interval = clock_t_to_jiffies(val);
+ return 0;
+}
+
+static ssize_t multicast_membership_interval_store(
+ struct device *d, struct device_attribute *attr, const char *buf,
+ size_t len)
+{
+ return store_bridge_parm(d, buf, len, set_membership_interval);
+}
+static DEVICE_ATTR_RW(multicast_membership_interval);
+
+static ssize_t multicast_querier_interval_show(struct device *d,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+ return sprintf(buf, "%lu\n",
+ jiffies_to_clock_t(br->multicast_ctx.multicast_querier_interval));
+}
+
+static int set_querier_interval(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
+{
+ br->multicast_ctx.multicast_querier_interval = clock_t_to_jiffies(val);
+ return 0;
+}
+
+static ssize_t multicast_querier_interval_store(struct device *d,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return store_bridge_parm(d, buf, len, set_querier_interval);
+}
+static DEVICE_ATTR_RW(multicast_querier_interval);
+
+static ssize_t multicast_query_interval_show(struct device *d,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+ return sprintf(buf, "%lu\n",
+ jiffies_to_clock_t(br->multicast_ctx.multicast_query_interval));
+}
+
+static int set_query_interval(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
+{
+ br_multicast_set_query_intvl(&br->multicast_ctx, val);
+ return 0;
+}
+
+static ssize_t multicast_query_interval_store(struct device *d,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return store_bridge_parm(d, buf, len, set_query_interval);
+}
+static DEVICE_ATTR_RW(multicast_query_interval);
+
+static ssize_t multicast_query_response_interval_show(
+ struct device *d, struct device_attribute *attr, char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+ return sprintf(
+ buf, "%lu\n",
+ jiffies_to_clock_t(br->multicast_ctx.multicast_query_response_interval));
+}
+
+static int set_query_response_interval(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
+{
+ br->multicast_ctx.multicast_query_response_interval = clock_t_to_jiffies(val);
+ return 0;
+}
+
+static ssize_t multicast_query_response_interval_store(
+ struct device *d, struct device_attribute *attr, const char *buf,
+ size_t len)
+{
+ return store_bridge_parm(d, buf, len, set_query_response_interval);
+}
+static DEVICE_ATTR_RW(multicast_query_response_interval);
+
+static ssize_t multicast_startup_query_interval_show(
+ struct device *d, struct device_attribute *attr, char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+ return sprintf(
+ buf, "%lu\n",
+ jiffies_to_clock_t(br->multicast_ctx.multicast_startup_query_interval));
+}
+
+static int set_startup_query_interval(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
+{
+ br_multicast_set_startup_query_intvl(&br->multicast_ctx, val);
+ return 0;
+}
+
+static ssize_t multicast_startup_query_interval_store(
+ struct device *d, struct device_attribute *attr, const char *buf,
+ size_t len)
+{
+ return store_bridge_parm(d, buf, len, set_startup_query_interval);
+}
+static DEVICE_ATTR_RW(multicast_startup_query_interval);
+
+static ssize_t multicast_stats_enabled_show(struct device *d,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+
+ return sprintf(buf, "%d\n",
+ br_opt_get(br, BROPT_MULTICAST_STATS_ENABLED));
+}
+
+static int set_stats_enabled(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
+{
+ br_opt_toggle(br, BROPT_MULTICAST_STATS_ENABLED, !!val);
+ return 0;
+}
+
+static ssize_t multicast_stats_enabled_store(struct device *d,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t len)
+{
+ return store_bridge_parm(d, buf, len, set_stats_enabled);
+}
+static DEVICE_ATTR_RW(multicast_stats_enabled);
+
+#if IS_ENABLED(CONFIG_IPV6)
+static ssize_t multicast_mld_version_show(struct device *d,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+
+ return sprintf(buf, "%u\n", br->multicast_ctx.multicast_mld_version);
+}
+
+static int set_multicast_mld_version(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
+{
+ return br_multicast_set_mld_version(&br->multicast_ctx, val);
+}
+
+static ssize_t multicast_mld_version_store(struct device *d,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return store_bridge_parm(d, buf, len, set_multicast_mld_version);
+}
+static DEVICE_ATTR_RW(multicast_mld_version);
+#endif
+#endif
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+static ssize_t nf_call_iptables_show(
+ struct device *d, struct device_attribute *attr, char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+ return sprintf(buf, "%u\n", br_opt_get(br, BROPT_NF_CALL_IPTABLES));
+}
+
+static int set_nf_call_iptables(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
+{
+ br_opt_toggle(br, BROPT_NF_CALL_IPTABLES, !!val);
+ return 0;
+}
+
+static ssize_t nf_call_iptables_store(
+ struct device *d, struct device_attribute *attr, const char *buf,
+ size_t len)
+{
+ return store_bridge_parm(d, buf, len, set_nf_call_iptables);
+}
+static DEVICE_ATTR_RW(nf_call_iptables);
+
+static ssize_t nf_call_ip6tables_show(
+ struct device *d, struct device_attribute *attr, char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+ return sprintf(buf, "%u\n", br_opt_get(br, BROPT_NF_CALL_IP6TABLES));
+}
+
+static int set_nf_call_ip6tables(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
+{
+ br_opt_toggle(br, BROPT_NF_CALL_IP6TABLES, !!val);
+ return 0;
+}
+
+static ssize_t nf_call_ip6tables_store(
+ struct device *d, struct device_attribute *attr, const char *buf,
+ size_t len)
+{
+ return store_bridge_parm(d, buf, len, set_nf_call_ip6tables);
+}
+static DEVICE_ATTR_RW(nf_call_ip6tables);
+
+static ssize_t nf_call_arptables_show(
+ struct device *d, struct device_attribute *attr, char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+ return sprintf(buf, "%u\n", br_opt_get(br, BROPT_NF_CALL_ARPTABLES));
+}
+
+static int set_nf_call_arptables(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
+{
+ br_opt_toggle(br, BROPT_NF_CALL_ARPTABLES, !!val);
+ return 0;
+}
+
+static ssize_t nf_call_arptables_store(
+ struct device *d, struct device_attribute *attr, const char *buf,
+ size_t len)
+{
+ return store_bridge_parm(d, buf, len, set_nf_call_arptables);
+}
+static DEVICE_ATTR_RW(nf_call_arptables);
+#endif
+#ifdef CONFIG_BRIDGE_VLAN_FILTERING
+static ssize_t vlan_filtering_show(struct device *d,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+ return sprintf(buf, "%d\n", br_opt_get(br, BROPT_VLAN_ENABLED));
+}
+
+static ssize_t vlan_filtering_store(struct device *d,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return store_bridge_parm(d, buf, len, br_vlan_filter_toggle);
+}
+static DEVICE_ATTR_RW(vlan_filtering);
+
+static ssize_t vlan_protocol_show(struct device *d,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+ return sprintf(buf, "%#06x\n", ntohs(br->vlan_proto));
+}
+
+static ssize_t vlan_protocol_store(struct device *d,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return store_bridge_parm(d, buf, len, br_vlan_set_proto);
+}
+static DEVICE_ATTR_RW(vlan_protocol);
+
+static ssize_t default_pvid_show(struct device *d,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+ return sprintf(buf, "%d\n", br->default_pvid);
+}
+
+static ssize_t default_pvid_store(struct device *d,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return store_bridge_parm(d, buf, len, br_vlan_set_default_pvid);
+}
+static DEVICE_ATTR_RW(default_pvid);
+
+static ssize_t vlan_stats_enabled_show(struct device *d,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+ return sprintf(buf, "%u\n", br_opt_get(br, BROPT_VLAN_STATS_ENABLED));
+}
+
+static int set_vlan_stats_enabled(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
+{
+ return br_vlan_set_stats(br, val);
+}
+
+static ssize_t vlan_stats_enabled_store(struct device *d,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return store_bridge_parm(d, buf, len, set_vlan_stats_enabled);
+}
+static DEVICE_ATTR_RW(vlan_stats_enabled);
+
+static ssize_t vlan_stats_per_port_show(struct device *d,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+ return sprintf(buf, "%u\n", br_opt_get(br, BROPT_VLAN_STATS_PER_PORT));
+}
+
+static int set_vlan_stats_per_port(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
+{
+ return br_vlan_set_stats_per_port(br, val);
+}
+
+static ssize_t vlan_stats_per_port_store(struct device *d,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return store_bridge_parm(d, buf, len, set_vlan_stats_per_port);
+}
+static DEVICE_ATTR_RW(vlan_stats_per_port);
+#endif
static struct attribute *bridge_attrs[] = {
- &class_device_attr_forward_delay.attr,
- &class_device_attr_hello_time.attr,
- &class_device_attr_max_age.attr,
- &class_device_attr_ageing_time.attr,
- &class_device_attr_stp_state.attr,
- &class_device_attr_priority.attr,
- &class_device_attr_bridge_id.attr,
- &class_device_attr_root_id.attr,
- &class_device_attr_root_path_cost.attr,
- &class_device_attr_root_port.attr,
- &class_device_attr_topology_change.attr,
- &class_device_attr_topology_change_detected.attr,
- &class_device_attr_hello_timer.attr,
- &class_device_attr_tcn_timer.attr,
- &class_device_attr_topology_change_timer.attr,
- &class_device_attr_gc_timer.attr,
- &class_device_attr_group_addr.attr,
+ &dev_attr_forward_delay.attr,
+ &dev_attr_hello_time.attr,
+ &dev_attr_max_age.attr,
+ &dev_attr_ageing_time.attr,
+ &dev_attr_stp_state.attr,
+ &dev_attr_group_fwd_mask.attr,
+ &dev_attr_priority.attr,
+ &dev_attr_bridge_id.attr,
+ &dev_attr_root_id.attr,
+ &dev_attr_root_path_cost.attr,
+ &dev_attr_root_port.attr,
+ &dev_attr_topology_change.attr,
+ &dev_attr_topology_change_detected.attr,
+ &dev_attr_hello_timer.attr,
+ &dev_attr_tcn_timer.attr,
+ &dev_attr_topology_change_timer.attr,
+ &dev_attr_gc_timer.attr,
+ &dev_attr_group_addr.attr,
+ &dev_attr_flush.attr,
+ &dev_attr_no_linklocal_learn.attr,
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+ &dev_attr_multicast_router.attr,
+ &dev_attr_multicast_snooping.attr,
+ &dev_attr_multicast_querier.attr,
+ &dev_attr_multicast_query_use_ifaddr.attr,
+ &dev_attr_hash_elasticity.attr,
+ &dev_attr_hash_max.attr,
+ &dev_attr_multicast_last_member_count.attr,
+ &dev_attr_multicast_startup_query_count.attr,
+ &dev_attr_multicast_last_member_interval.attr,
+ &dev_attr_multicast_membership_interval.attr,
+ &dev_attr_multicast_querier_interval.attr,
+ &dev_attr_multicast_query_interval.attr,
+ &dev_attr_multicast_query_response_interval.attr,
+ &dev_attr_multicast_startup_query_interval.attr,
+ &dev_attr_multicast_stats_enabled.attr,
+ &dev_attr_multicast_igmp_version.attr,
+#if IS_ENABLED(CONFIG_IPV6)
+ &dev_attr_multicast_mld_version.attr,
+#endif
+#endif
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+ &dev_attr_nf_call_iptables.attr,
+ &dev_attr_nf_call_ip6tables.attr,
+ &dev_attr_nf_call_arptables.attr,
+#endif
+#ifdef CONFIG_BRIDGE_VLAN_FILTERING
+ &dev_attr_vlan_filtering.attr,
+ &dev_attr_vlan_protocol.attr,
+ &dev_attr_default_pvid.attr,
+ &dev_attr_vlan_stats_enabled.attr,
+ &dev_attr_vlan_stats_per_port.attr,
+#endif
NULL
};
-static struct attribute_group bridge_group = {
+static const struct attribute_group bridge_group = {
.name = SYSFS_BRIDGE_ATTR,
.attrs = bridge_attrs,
};
@@ -322,31 +1001,31 @@ static struct attribute_group bridge_group = {
*
* Returns the number of bytes read.
*/
-static ssize_t brforward_read(struct kobject *kobj, char *buf,
- loff_t off, size_t count)
+static ssize_t brforward_read(struct file *filp, struct kobject *kobj,
+ const struct bin_attribute *bin_attr,
+ char *buf, loff_t off, size_t count)
{
- struct class_device *cdev = to_class_dev(kobj);
- struct net_bridge *br = to_bridge(cdev);
+ struct device *dev = kobj_to_dev(kobj);
+ struct net_bridge *br = to_bridge(dev);
int n;
/* must read whole records */
if (off % sizeof(struct __fdb_entry) != 0)
return -EINVAL;
- n = br_fdb_fillbuf(br, buf,
+ n = br_fdb_fillbuf(br, buf,
count / sizeof(struct __fdb_entry),
off / sizeof(struct __fdb_entry));
if (n > 0)
n *= sizeof(struct __fdb_entry);
-
+
return n;
}
-static struct bin_attribute bridge_forward = {
+static const struct bin_attribute bridge_forward = {
.attr = { .name = SYSFS_BRIDGE_FDB,
- .mode = S_IRUGO,
- .owner = THIS_MODULE, },
+ .mode = 0444, },
.read = brforward_read,
};
@@ -363,41 +1042,36 @@ static struct bin_attribute bridge_forward = {
*/
int br_sysfs_addbr(struct net_device *dev)
{
- struct kobject *brobj = &dev->class_dev.kobj;
+ struct kobject *brobj = &dev->dev.kobj;
struct net_bridge *br = netdev_priv(dev);
int err;
err = sysfs_create_group(brobj, &bridge_group);
if (err) {
pr_info("%s: can't create group %s/%s\n",
- __FUNCTION__, dev->name, bridge_group.name);
+ __func__, dev->name, bridge_group.name);
goto out1;
}
err = sysfs_create_bin_file(brobj, &bridge_forward);
if (err) {
pr_info("%s: can't create attribute file %s/%s\n",
- __FUNCTION__, dev->name, bridge_forward.attr.name);
+ __func__, dev->name, bridge_forward.attr.name);
goto out2;
}
-
- kobject_set_name(&br->ifobj, SYSFS_BRIDGE_PORT_SUBDIR);
- br->ifobj.ktype = NULL;
- br->ifobj.kset = NULL;
- br->ifobj.parent = brobj;
-
- err = kobject_register(&br->ifobj);
- if (err) {
+ br->ifobj = kobject_create_and_add(SYSFS_BRIDGE_PORT_SUBDIR, brobj);
+ if (!br->ifobj) {
pr_info("%s: can't add kobject (directory) %s/%s\n",
- __FUNCTION__, dev->name, br->ifobj.name);
+ __func__, dev->name, SYSFS_BRIDGE_PORT_SUBDIR);
+ err = -ENOMEM;
goto out3;
}
return 0;
out3:
- sysfs_remove_bin_file(&dev->class_dev.kobj, &bridge_forward);
+ sysfs_remove_bin_file(&dev->dev.kobj, &bridge_forward);
out2:
- sysfs_remove_group(&dev->class_dev.kobj, &bridge_group);
+ sysfs_remove_group(&dev->dev.kobj, &bridge_group);
out1:
return err;
@@ -405,10 +1079,10 @@ int br_sysfs_addbr(struct net_device *dev)
void br_sysfs_delbr(struct net_device *dev)
{
- struct kobject *kobj = &dev->class_dev.kobj;
+ struct kobject *kobj = &dev->dev.kobj;
struct net_bridge *br = netdev_priv(dev);
- kobject_unregister(&br->ifobj);
+ kobject_put(br->ifobj);
sysfs_remove_bin_file(kobj, &bridge_forward);
sysfs_remove_group(kobj, &bridge_group);
}
diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c
index c51c9e42aeb3..74fdd8105dca 100644
--- a/net/bridge/br_sysfs_if.c
+++ b/net/bridge/br_sysfs_if.c
@@ -1,14 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Sysfs attributes of bridge ports
* Linux ethernet bridge
*
* Authors:
* Stephen Hemminger <shemminger@osdl.org>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/capability.h>
@@ -17,127 +13,253 @@
#include <linux/if_bridge.h>
#include <linux/rtnetlink.h>
#include <linux/spinlock.h>
+#include <linux/sched/signal.h>
#include "br_private.h"
+/* IMPORTANT: new bridge port options must be added with netlink support only
+ * please do not add new sysfs entries
+ */
+
struct brport_attribute {
struct attribute attr;
ssize_t (*show)(struct net_bridge_port *, char *);
- ssize_t (*store)(struct net_bridge_port *, unsigned long);
+ int (*store)(struct net_bridge_port *, unsigned long);
+ int (*store_raw)(struct net_bridge_port *, char *);
+};
+
+#define BRPORT_ATTR_RAW(_name, _mode, _show, _store) \
+const struct brport_attribute brport_attr_##_name = { \
+ .attr = {.name = __stringify(_name), \
+ .mode = _mode }, \
+ .show = _show, \
+ .store_raw = _store, \
};
-#define BRPORT_ATTR(_name,_mode,_show,_store) \
-struct brport_attribute brport_attr_##_name = { \
+#define BRPORT_ATTR(_name, _mode, _show, _store) \
+const struct brport_attribute brport_attr_##_name = { \
.attr = {.name = __stringify(_name), \
- .mode = _mode, \
- .owner = THIS_MODULE, }, \
+ .mode = _mode }, \
.show = _show, \
.store = _store, \
};
-static ssize_t show_path_cost(struct net_bridge_port *p, char *buf)
+#define BRPORT_ATTR_FLAG(_name, _mask) \
+static ssize_t show_##_name(struct net_bridge_port *p, char *buf) \
+{ \
+ return sprintf(buf, "%d\n", !!(p->flags & _mask)); \
+} \
+static int store_##_name(struct net_bridge_port *p, unsigned long v) \
+{ \
+ return store_flag(p, v, _mask); \
+} \
+static BRPORT_ATTR(_name, 0644, \
+ show_##_name, store_##_name)
+
+static int store_flag(struct net_bridge_port *p, unsigned long v,
+ unsigned long mask)
{
- return sprintf(buf, "%d\n", p->path_cost);
+ struct netlink_ext_ack extack = {0};
+ unsigned long flags = p->flags;
+ int err;
+
+ if (v)
+ flags |= mask;
+ else
+ flags &= ~mask;
+
+ if (flags != p->flags) {
+ err = br_switchdev_set_port_flag(p, flags, mask, &extack);
+ if (err) {
+ netdev_err(p->dev, "%s\n", extack._msg);
+ return err;
+ }
+
+ p->flags = flags;
+ br_port_flags_change(p, mask);
+ }
+ return 0;
}
-static ssize_t store_path_cost(struct net_bridge_port *p, unsigned long v)
+
+static ssize_t show_path_cost(struct net_bridge_port *p, char *buf)
{
- br_stp_set_path_cost(p, v);
- return 0;
+ return sprintf(buf, "%d\n", p->path_cost);
}
-static BRPORT_ATTR(path_cost, S_IRUGO | S_IWUSR,
- show_path_cost, store_path_cost);
+
+static BRPORT_ATTR(path_cost, 0644,
+ show_path_cost, br_stp_set_path_cost);
static ssize_t show_priority(struct net_bridge_port *p, char *buf)
{
return sprintf(buf, "%d\n", p->priority);
}
-static ssize_t store_priority(struct net_bridge_port *p, unsigned long v)
-{
- if (v >= (1<<(16-BR_PORT_BITS)))
- return -ERANGE;
- br_stp_set_port_priority(p, v);
- return 0;
-}
-static BRPORT_ATTR(priority, S_IRUGO | S_IWUSR,
- show_priority, store_priority);
+
+static BRPORT_ATTR(priority, 0644,
+ show_priority, br_stp_set_port_priority);
static ssize_t show_designated_root(struct net_bridge_port *p, char *buf)
{
return br_show_bridge_id(buf, &p->designated_root);
}
-static BRPORT_ATTR(designated_root, S_IRUGO, show_designated_root, NULL);
+static BRPORT_ATTR(designated_root, 0444, show_designated_root, NULL);
static ssize_t show_designated_bridge(struct net_bridge_port *p, char *buf)
{
return br_show_bridge_id(buf, &p->designated_bridge);
}
-static BRPORT_ATTR(designated_bridge, S_IRUGO, show_designated_bridge, NULL);
+static BRPORT_ATTR(designated_bridge, 0444, show_designated_bridge, NULL);
static ssize_t show_designated_port(struct net_bridge_port *p, char *buf)
{
return sprintf(buf, "%d\n", p->designated_port);
}
-static BRPORT_ATTR(designated_port, S_IRUGO, show_designated_port, NULL);
+static BRPORT_ATTR(designated_port, 0444, show_designated_port, NULL);
static ssize_t show_designated_cost(struct net_bridge_port *p, char *buf)
{
return sprintf(buf, "%d\n", p->designated_cost);
}
-static BRPORT_ATTR(designated_cost, S_IRUGO, show_designated_cost, NULL);
+static BRPORT_ATTR(designated_cost, 0444, show_designated_cost, NULL);
static ssize_t show_port_id(struct net_bridge_port *p, char *buf)
{
return sprintf(buf, "0x%x\n", p->port_id);
}
-static BRPORT_ATTR(port_id, S_IRUGO, show_port_id, NULL);
+static BRPORT_ATTR(port_id, 0444, show_port_id, NULL);
static ssize_t show_port_no(struct net_bridge_port *p, char *buf)
{
return sprintf(buf, "0x%x\n", p->port_no);
}
-static BRPORT_ATTR(port_no, S_IRUGO, show_port_no, NULL);
+static BRPORT_ATTR(port_no, 0444, show_port_no, NULL);
static ssize_t show_change_ack(struct net_bridge_port *p, char *buf)
{
return sprintf(buf, "%d\n", p->topology_change_ack);
}
-static BRPORT_ATTR(change_ack, S_IRUGO, show_change_ack, NULL);
+static BRPORT_ATTR(change_ack, 0444, show_change_ack, NULL);
static ssize_t show_config_pending(struct net_bridge_port *p, char *buf)
{
return sprintf(buf, "%d\n", p->config_pending);
}
-static BRPORT_ATTR(config_pending, S_IRUGO, show_config_pending, NULL);
+static BRPORT_ATTR(config_pending, 0444, show_config_pending, NULL);
static ssize_t show_port_state(struct net_bridge_port *p, char *buf)
{
return sprintf(buf, "%d\n", p->state);
}
-static BRPORT_ATTR(state, S_IRUGO, show_port_state, NULL);
+static BRPORT_ATTR(state, 0444, show_port_state, NULL);
static ssize_t show_message_age_timer(struct net_bridge_port *p,
char *buf)
{
return sprintf(buf, "%ld\n", br_timer_value(&p->message_age_timer));
}
-static BRPORT_ATTR(message_age_timer, S_IRUGO, show_message_age_timer, NULL);
+static BRPORT_ATTR(message_age_timer, 0444, show_message_age_timer, NULL);
static ssize_t show_forward_delay_timer(struct net_bridge_port *p,
char *buf)
{
return sprintf(buf, "%ld\n", br_timer_value(&p->forward_delay_timer));
}
-static BRPORT_ATTR(forward_delay_timer, S_IRUGO, show_forward_delay_timer, NULL);
+static BRPORT_ATTR(forward_delay_timer, 0444, show_forward_delay_timer, NULL);
static ssize_t show_hold_timer(struct net_bridge_port *p,
char *buf)
{
return sprintf(buf, "%ld\n", br_timer_value(&p->hold_timer));
}
-static BRPORT_ATTR(hold_timer, S_IRUGO, show_hold_timer, NULL);
+static BRPORT_ATTR(hold_timer, 0444, show_hold_timer, NULL);
+
+static int store_flush(struct net_bridge_port *p, unsigned long v)
+{
+ br_fdb_delete_by_port(p->br, p, 0, 0); // Don't delete local entry
+ return 0;
+}
+static BRPORT_ATTR(flush, 0200, NULL, store_flush);
+
+static ssize_t show_group_fwd_mask(struct net_bridge_port *p, char *buf)
+{
+ return sprintf(buf, "%#x\n", p->group_fwd_mask);
+}
+
+static int store_group_fwd_mask(struct net_bridge_port *p,
+ unsigned long v)
+{
+ if (v & BR_GROUPFWD_MACPAUSE)
+ return -EINVAL;
+ p->group_fwd_mask = v;
+
+ return 0;
+}
+static BRPORT_ATTR(group_fwd_mask, 0644, show_group_fwd_mask,
+ store_group_fwd_mask);
+
+static ssize_t show_backup_port(struct net_bridge_port *p, char *buf)
+{
+ struct net_bridge_port *backup_p;
+ int ret = 0;
+
+ rcu_read_lock();
+ backup_p = rcu_dereference(p->backup_port);
+ if (backup_p)
+ ret = sprintf(buf, "%s\n", backup_p->dev->name);
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static int store_backup_port(struct net_bridge_port *p, char *buf)
+{
+ struct net_device *backup_dev = NULL;
+ char *nl = strchr(buf, '\n');
+
+ if (nl)
+ *nl = '\0';
+
+ if (strlen(buf) > 0) {
+ backup_dev = __dev_get_by_name(dev_net(p->dev), buf);
+ if (!backup_dev)
+ return -ENOENT;
+ }
+
+ return nbp_backup_change(p, backup_dev);
+}
+static BRPORT_ATTR_RAW(backup_port, 0644, show_backup_port, store_backup_port);
+
+BRPORT_ATTR_FLAG(hairpin_mode, BR_HAIRPIN_MODE);
+BRPORT_ATTR_FLAG(bpdu_guard, BR_BPDU_GUARD);
+BRPORT_ATTR_FLAG(root_block, BR_ROOT_BLOCK);
+BRPORT_ATTR_FLAG(learning, BR_LEARNING);
+BRPORT_ATTR_FLAG(unicast_flood, BR_FLOOD);
+BRPORT_ATTR_FLAG(proxyarp, BR_PROXYARP);
+BRPORT_ATTR_FLAG(proxyarp_wifi, BR_PROXYARP_WIFI);
+BRPORT_ATTR_FLAG(multicast_flood, BR_MCAST_FLOOD);
+BRPORT_ATTR_FLAG(broadcast_flood, BR_BCAST_FLOOD);
+BRPORT_ATTR_FLAG(neigh_suppress, BR_NEIGH_SUPPRESS);
+BRPORT_ATTR_FLAG(isolated, BR_ISOLATED);
+
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+static ssize_t show_multicast_router(struct net_bridge_port *p, char *buf)
+{
+ return sprintf(buf, "%d\n", p->multicast_ctx.multicast_router);
+}
+
+static int store_multicast_router(struct net_bridge_port *p,
+ unsigned long v)
+{
+ return br_multicast_set_port_router(&p->multicast_ctx, v);
+}
+static BRPORT_ATTR(multicast_router, 0644, show_multicast_router,
+ store_multicast_router);
-static struct brport_attribute *brport_attrs[] = {
+BRPORT_ATTR_FLAG(multicast_fast_leave, BR_MULTICAST_FAST_LEAVE);
+BRPORT_ATTR_FLAG(multicast_to_unicast, BR_MULTICAST_TO_UNICAST);
+#endif
+
+static const struct brport_attribute *brport_attrs[] = {
&brport_attr_path_cost,
&brport_attr_priority,
&brport_attr_port_id,
@@ -152,50 +274,90 @@ static struct brport_attribute *brport_attrs[] = {
&brport_attr_message_age_timer,
&brport_attr_forward_delay_timer,
&brport_attr_hold_timer,
+ &brport_attr_flush,
+ &brport_attr_hairpin_mode,
+ &brport_attr_bpdu_guard,
+ &brport_attr_root_block,
+ &brport_attr_learning,
+ &brport_attr_unicast_flood,
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+ &brport_attr_multicast_router,
+ &brport_attr_multicast_fast_leave,
+ &brport_attr_multicast_to_unicast,
+#endif
+ &brport_attr_proxyarp,
+ &brport_attr_proxyarp_wifi,
+ &brport_attr_multicast_flood,
+ &brport_attr_broadcast_flood,
+ &brport_attr_group_fwd_mask,
+ &brport_attr_neigh_suppress,
+ &brport_attr_isolated,
+ &brport_attr_backup_port,
NULL
};
#define to_brport_attr(_at) container_of(_at, struct brport_attribute, attr)
-#define to_brport(obj) container_of(obj, struct net_bridge_port, kobj)
-static ssize_t brport_show(struct kobject * kobj,
- struct attribute * attr, char * buf)
+static ssize_t brport_show(struct kobject *kobj,
+ struct attribute *attr, char *buf)
{
- struct brport_attribute * brport_attr = to_brport_attr(attr);
- struct net_bridge_port * p = to_brport(kobj);
+ struct brport_attribute *brport_attr = to_brport_attr(attr);
+ struct net_bridge_port *p = kobj_to_brport(kobj);
+
+ if (!brport_attr->show)
+ return -EINVAL;
return brport_attr->show(p, buf);
}
-static ssize_t brport_store(struct kobject * kobj,
- struct attribute * attr,
- const char * buf, size_t count)
+static ssize_t brport_store(struct kobject *kobj,
+ struct attribute *attr,
+ const char *buf, size_t count)
{
- struct brport_attribute * brport_attr = to_brport_attr(attr);
- struct net_bridge_port * p = to_brport(kobj);
+ struct brport_attribute *brport_attr = to_brport_attr(attr);
+ struct net_bridge_port *p = kobj_to_brport(kobj);
ssize_t ret = -EINVAL;
- char *endp;
unsigned long val;
+ char *endp;
- if (!capable(CAP_NET_ADMIN))
+ if (!ns_capable(dev_net(p->dev)->user_ns, CAP_NET_ADMIN))
return -EPERM;
- val = simple_strtoul(buf, &endp, 0);
- if (endp != buf) {
- rtnl_lock();
- if (p->dev && p->br && brport_attr->store) {
- spin_lock_bh(&p->br->lock);
- ret = brport_attr->store(p, val);
- spin_unlock_bh(&p->br->lock);
- if (ret == 0)
- ret = count;
+ if (!rtnl_trylock())
+ return restart_syscall();
+
+ if (brport_attr->store_raw) {
+ char *buf_copy;
+
+ buf_copy = kstrndup(buf, count, GFP_KERNEL);
+ if (!buf_copy) {
+ ret = -ENOMEM;
+ goto out_unlock;
}
- rtnl_unlock();
+ spin_lock_bh(&p->br->lock);
+ ret = brport_attr->store_raw(p, buf_copy);
+ spin_unlock_bh(&p->br->lock);
+ kfree(buf_copy);
+ } else if (brport_attr->store) {
+ val = simple_strtoul(buf, &endp, 0);
+ if (endp == buf)
+ goto out_unlock;
+ spin_lock_bh(&p->br->lock);
+ ret = brport_attr->store(p, val);
+ spin_unlock_bh(&p->br->lock);
+ }
+
+ if (!ret) {
+ br_ifinfo_notify(RTM_NEWLINK, NULL, p);
+ ret = count;
}
+out_unlock:
+ rtnl_unlock();
+
return ret;
}
-struct sysfs_ops brport_sysfs_ops = {
+const struct sysfs_ops brport_sysfs_ops = {
.show = brport_show,
.store = brport_store,
};
@@ -203,26 +365,48 @@ struct sysfs_ops brport_sysfs_ops = {
/*
* Add sysfs entries to ethernet device added to a bridge.
* Creates a brport subdirectory with bridge attributes.
- * Puts symlink in bridge's brport subdirectory
+ * Puts symlink in bridge's brif subdirectory
*/
int br_sysfs_addif(struct net_bridge_port *p)
{
struct net_bridge *br = p->br;
- struct brport_attribute **a;
+ const struct brport_attribute **a;
int err;
- err = sysfs_create_link(&p->kobj, &br->dev->class_dev.kobj,
+ err = sysfs_create_link(&p->kobj, &br->dev->dev.kobj,
SYSFS_BRIDGE_PORT_LINK);
if (err)
- goto out2;
+ return err;
for (a = brport_attrs; *a; ++a) {
err = sysfs_create_file(&p->kobj, &((*a)->attr));
if (err)
- goto out2;
+ return err;
}
- err= sysfs_create_link(&br->ifobj, &p->kobj, p->dev->name);
-out2:
+ strscpy(p->sysfs_name, p->dev->name, IFNAMSIZ);
+ return sysfs_create_link(br->ifobj, &p->kobj, p->sysfs_name);
+}
+
+/* Rename bridge's brif symlink */
+int br_sysfs_renameif(struct net_bridge_port *p)
+{
+ struct net_bridge *br = p->br;
+ int err;
+
+ /* If a rename fails, the rollback will cause another
+ * rename call with the existing name.
+ */
+ if (!strncmp(p->sysfs_name, p->dev->name, IFNAMSIZ))
+ return 0;
+
+ err = sysfs_rename_link(br->ifobj, &p->kobj,
+ p->sysfs_name, p->dev->name);
+ if (err)
+ netdev_notice(br->dev, "unable to rename link %s to %s",
+ p->sysfs_name, p->dev->name);
+ else
+ strscpy(p->sysfs_name, p->dev->name, IFNAMSIZ);
+
return err;
}
diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
new file mode 100644
index 000000000000..ce72b837ff8e
--- /dev/null
+++ b/net/bridge/br_vlan.c
@@ -0,0 +1,2349 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/slab.h>
+#include <net/switchdev.h>
+
+#include "br_private.h"
+#include "br_private_tunnel.h"
+
+static void nbp_vlan_set_vlan_dev_state(struct net_bridge_port *p, u16 vid);
+
+static inline int br_vlan_cmp(struct rhashtable_compare_arg *arg,
+ const void *ptr)
+{
+ const struct net_bridge_vlan *vle = ptr;
+ u16 vid = *(u16 *)arg->key;
+
+ return vle->vid != vid;
+}
+
+static const struct rhashtable_params br_vlan_rht_params = {
+ .head_offset = offsetof(struct net_bridge_vlan, vnode),
+ .key_offset = offsetof(struct net_bridge_vlan, vid),
+ .key_len = sizeof(u16),
+ .nelem_hint = 3,
+ .max_size = VLAN_N_VID,
+ .obj_cmpfn = br_vlan_cmp,
+ .automatic_shrinking = true,
+};
+
+static struct net_bridge_vlan *br_vlan_lookup(struct rhashtable *tbl, u16 vid)
+{
+ return rhashtable_lookup_fast(tbl, &vid, br_vlan_rht_params);
+}
+
+static void __vlan_add_pvid(struct net_bridge_vlan_group *vg,
+ const struct net_bridge_vlan *v)
+{
+ if (vg->pvid == v->vid)
+ return;
+
+ smp_wmb();
+ br_vlan_set_pvid_state(vg, v->state);
+ vg->pvid = v->vid;
+}
+
+static void __vlan_delete_pvid(struct net_bridge_vlan_group *vg, u16 vid)
+{
+ if (vg->pvid != vid)
+ return;
+
+ smp_wmb();
+ vg->pvid = 0;
+}
+
+/* Update the BRIDGE_VLAN_INFO_PVID and BRIDGE_VLAN_INFO_UNTAGGED flags of @v.
+ * If @commit is false, return just whether the BRIDGE_VLAN_INFO_PVID and
+ * BRIDGE_VLAN_INFO_UNTAGGED bits of @flags would produce any change onto @v.
+ */
+static bool __vlan_flags_update(struct net_bridge_vlan *v, u16 flags,
+ bool commit)
+{
+ struct net_bridge_vlan_group *vg;
+ bool change;
+
+ if (br_vlan_is_master(v))
+ vg = br_vlan_group(v->br);
+ else
+ vg = nbp_vlan_group(v->port);
+
+ /* check if anything would be changed on commit */
+ change = !!(flags & BRIDGE_VLAN_INFO_PVID) == !!(vg->pvid != v->vid) ||
+ ((flags ^ v->flags) & BRIDGE_VLAN_INFO_UNTAGGED);
+
+ if (!commit)
+ goto out;
+
+ if (flags & BRIDGE_VLAN_INFO_PVID)
+ __vlan_add_pvid(vg, v);
+ else
+ __vlan_delete_pvid(vg, v->vid);
+
+ if (flags & BRIDGE_VLAN_INFO_UNTAGGED)
+ v->flags |= BRIDGE_VLAN_INFO_UNTAGGED;
+ else
+ v->flags &= ~BRIDGE_VLAN_INFO_UNTAGGED;
+
+out:
+ return change;
+}
+
+static bool __vlan_flags_would_change(struct net_bridge_vlan *v, u16 flags)
+{
+ return __vlan_flags_update(v, flags, false);
+}
+
+static void __vlan_flags_commit(struct net_bridge_vlan *v, u16 flags)
+{
+ __vlan_flags_update(v, flags, true);
+}
+
+static int __vlan_vid_add(struct net_device *dev, struct net_bridge *br,
+ struct net_bridge_vlan *v, u16 flags,
+ struct netlink_ext_ack *extack)
+{
+ int err;
+
+ /* Try switchdev op first. In case it is not supported, fallback to
+ * 8021q add.
+ */
+ err = br_switchdev_port_vlan_add(dev, v->vid, flags, false, extack);
+ if (err == -EOPNOTSUPP)
+ return vlan_vid_add(dev, br->vlan_proto, v->vid);
+ v->priv_flags |= BR_VLFLAG_ADDED_BY_SWITCHDEV;
+ return err;
+}
+
+static void __vlan_add_list(struct net_bridge_vlan *v)
+{
+ struct net_bridge_vlan_group *vg;
+ struct list_head *headp, *hpos;
+ struct net_bridge_vlan *vent;
+
+ if (br_vlan_is_master(v))
+ vg = br_vlan_group(v->br);
+ else
+ vg = nbp_vlan_group(v->port);
+
+ headp = &vg->vlan_list;
+ list_for_each_prev(hpos, headp) {
+ vent = list_entry(hpos, struct net_bridge_vlan, vlist);
+ if (v->vid >= vent->vid)
+ break;
+ }
+ list_add_rcu(&v->vlist, hpos);
+}
+
+static void __vlan_del_list(struct net_bridge_vlan *v)
+{
+ list_del_rcu(&v->vlist);
+}
+
+static int __vlan_vid_del(struct net_device *dev, struct net_bridge *br,
+ const struct net_bridge_vlan *v)
+{
+ int err;
+
+ /* Try switchdev op first. In case it is not supported, fallback to
+ * 8021q del.
+ */
+ err = br_switchdev_port_vlan_del(dev, v->vid);
+ if (!(v->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV))
+ vlan_vid_del(dev, br->vlan_proto, v->vid);
+ return err == -EOPNOTSUPP ? 0 : err;
+}
+
+/* Returns a master vlan, if it didn't exist it gets created. In all cases
+ * a reference is taken to the master vlan before returning.
+ */
+static struct net_bridge_vlan *
+br_vlan_get_master(struct net_bridge *br, u16 vid,
+ struct netlink_ext_ack *extack)
+{
+ struct net_bridge_vlan_group *vg;
+ struct net_bridge_vlan *masterv;
+
+ vg = br_vlan_group(br);
+ masterv = br_vlan_find(vg, vid);
+ if (!masterv) {
+ bool changed;
+
+ /* missing global ctx, create it now */
+ if (br_vlan_add(br, vid, 0, &changed, extack))
+ return NULL;
+ masterv = br_vlan_find(vg, vid);
+ if (WARN_ON(!masterv))
+ return NULL;
+ refcount_set(&masterv->refcnt, 1);
+ return masterv;
+ }
+ refcount_inc(&masterv->refcnt);
+
+ return masterv;
+}
+
+static void br_master_vlan_rcu_free(struct rcu_head *rcu)
+{
+ struct net_bridge_vlan *v;
+
+ v = container_of(rcu, struct net_bridge_vlan, rcu);
+ WARN_ON(!br_vlan_is_master(v));
+ free_percpu(v->stats);
+ v->stats = NULL;
+ kfree(v);
+}
+
+static void br_vlan_put_master(struct net_bridge_vlan *masterv)
+{
+ struct net_bridge_vlan_group *vg;
+
+ if (!br_vlan_is_master(masterv))
+ return;
+
+ vg = br_vlan_group(masterv->br);
+ if (refcount_dec_and_test(&masterv->refcnt)) {
+ rhashtable_remove_fast(&vg->vlan_hash,
+ &masterv->vnode, br_vlan_rht_params);
+ __vlan_del_list(masterv);
+ br_multicast_toggle_one_vlan(masterv, false);
+ br_multicast_ctx_deinit(&masterv->br_mcast_ctx);
+ call_rcu(&masterv->rcu, br_master_vlan_rcu_free);
+ }
+}
+
+static void nbp_vlan_rcu_free(struct rcu_head *rcu)
+{
+ struct net_bridge_vlan *v;
+
+ v = container_of(rcu, struct net_bridge_vlan, rcu);
+ WARN_ON(br_vlan_is_master(v));
+ /* if we had per-port stats configured then free them here */
+ if (v->priv_flags & BR_VLFLAG_PER_PORT_STATS)
+ free_percpu(v->stats);
+ v->stats = NULL;
+ kfree(v);
+}
+
+static void br_vlan_init_state(struct net_bridge_vlan *v)
+{
+ struct net_bridge *br;
+
+ if (br_vlan_is_master(v))
+ br = v->br;
+ else
+ br = v->port->br;
+
+ if (br_opt_get(br, BROPT_MST_ENABLED)) {
+ br_mst_vlan_init_state(v);
+ return;
+ }
+
+ v->state = BR_STATE_FORWARDING;
+ v->msti = 0;
+}
+
+/* This is the shared VLAN add function which works for both ports and bridge
+ * devices. There are four possible calls to this function in terms of the
+ * vlan entry type:
+ * 1. vlan is being added on a port (no master flags, global entry exists)
+ * 2. vlan is being added on a bridge (both master and brentry flags)
+ * 3. vlan is being added on a port, but a global entry didn't exist which
+ * is being created right now (master flag set, brentry flag unset), the
+ * global entry is used for global per-vlan features, but not for filtering
+ * 4. same as 3 but with both master and brentry flags set so the entry
+ * will be used for filtering in both the port and the bridge
+ */
+static int __vlan_add(struct net_bridge_vlan *v, u16 flags,
+ struct netlink_ext_ack *extack)
+{
+ struct net_bridge_vlan *masterv = NULL;
+ struct net_bridge_port *p = NULL;
+ struct net_bridge_vlan_group *vg;
+ struct net_device *dev;
+ struct net_bridge *br;
+ int err;
+
+ if (br_vlan_is_master(v)) {
+ br = v->br;
+ dev = br->dev;
+ vg = br_vlan_group(br);
+ } else {
+ p = v->port;
+ br = p->br;
+ dev = p->dev;
+ vg = nbp_vlan_group(p);
+ }
+
+ if (p) {
+ /* Add VLAN to the device filter if it is supported.
+ * This ensures tagged traffic enters the bridge when
+ * promiscuous mode is disabled by br_manage_promisc().
+ */
+ err = __vlan_vid_add(dev, br, v, flags, extack);
+ if (err)
+ goto out;
+
+ /* need to work on the master vlan too */
+ if (flags & BRIDGE_VLAN_INFO_MASTER) {
+ bool changed;
+
+ err = br_vlan_add(br, v->vid,
+ flags | BRIDGE_VLAN_INFO_BRENTRY,
+ &changed, extack);
+ if (err)
+ goto out_filt;
+
+ if (changed)
+ br_vlan_notify(br, NULL, v->vid, 0,
+ RTM_NEWVLAN);
+ }
+
+ masterv = br_vlan_get_master(br, v->vid, extack);
+ if (!masterv) {
+ err = -ENOMEM;
+ goto out_filt;
+ }
+ v->brvlan = masterv;
+ if (br_opt_get(br, BROPT_VLAN_STATS_PER_PORT)) {
+ v->stats =
+ netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
+ if (!v->stats) {
+ err = -ENOMEM;
+ goto out_filt;
+ }
+ v->priv_flags |= BR_VLFLAG_PER_PORT_STATS;
+ } else {
+ v->stats = masterv->stats;
+ }
+ br_multicast_port_ctx_init(p, v, &v->port_mcast_ctx);
+ } else {
+ if (br_vlan_should_use(v)) {
+ err = br_switchdev_port_vlan_add(dev, v->vid, flags,
+ false, extack);
+ if (err && err != -EOPNOTSUPP)
+ goto out;
+ }
+ br_multicast_ctx_init(br, v, &v->br_mcast_ctx);
+ v->priv_flags |= BR_VLFLAG_GLOBAL_MCAST_ENABLED;
+ }
+
+ /* Add the dev mac and count the vlan only if it's usable */
+ if (br_vlan_should_use(v)) {
+ if (!br_opt_get(br, BROPT_FDB_LOCAL_VLAN_0)) {
+ err = br_fdb_add_local(br, p, dev->dev_addr, v->vid);
+ if (err) {
+ br_err(br, "failed insert local address into bridge forwarding table\n");
+ goto out_filt;
+ }
+ }
+ vg->num_vlans++;
+ }
+
+ /* set the state before publishing */
+ br_vlan_init_state(v);
+
+ err = rhashtable_lookup_insert_fast(&vg->vlan_hash, &v->vnode,
+ br_vlan_rht_params);
+ if (err)
+ goto out_fdb_insert;
+
+ __vlan_add_list(v);
+ __vlan_flags_commit(v, flags);
+ br_multicast_toggle_one_vlan(v, true);
+
+ if (p)
+ nbp_vlan_set_vlan_dev_state(p, v->vid);
+out:
+ return err;
+
+out_fdb_insert:
+ if (br_vlan_should_use(v)) {
+ br_fdb_find_delete_local(br, p, dev->dev_addr, v->vid);
+ vg->num_vlans--;
+ }
+
+out_filt:
+ if (p) {
+ __vlan_vid_del(dev, br, v);
+ if (masterv) {
+ if (v->stats && masterv->stats != v->stats)
+ free_percpu(v->stats);
+ v->stats = NULL;
+
+ br_vlan_put_master(masterv);
+ v->brvlan = NULL;
+ }
+ } else {
+ br_switchdev_port_vlan_del(dev, v->vid);
+ }
+
+ goto out;
+}
+
+static int __vlan_del(struct net_bridge_vlan *v)
+{
+ struct net_bridge_vlan *masterv = v;
+ struct net_bridge_vlan_group *vg;
+ struct net_bridge_port *p = NULL;
+ int err = 0;
+
+ if (br_vlan_is_master(v)) {
+ vg = br_vlan_group(v->br);
+ } else {
+ p = v->port;
+ vg = nbp_vlan_group(v->port);
+ masterv = v->brvlan;
+ }
+
+ __vlan_delete_pvid(vg, v->vid);
+ if (p) {
+ err = __vlan_vid_del(p->dev, p->br, v);
+ if (err)
+ goto out;
+ } else {
+ err = br_switchdev_port_vlan_del(v->br->dev, v->vid);
+ if (err && err != -EOPNOTSUPP)
+ goto out;
+ err = 0;
+ }
+
+ if (br_vlan_should_use(v)) {
+ v->flags &= ~BRIDGE_VLAN_INFO_BRENTRY;
+ vg->num_vlans--;
+ }
+
+ if (masterv != v) {
+ vlan_tunnel_info_del(vg, v);
+ rhashtable_remove_fast(&vg->vlan_hash, &v->vnode,
+ br_vlan_rht_params);
+ __vlan_del_list(v);
+ nbp_vlan_set_vlan_dev_state(p, v->vid);
+ br_multicast_toggle_one_vlan(v, false);
+ br_multicast_port_ctx_deinit(&v->port_mcast_ctx);
+ call_rcu(&v->rcu, nbp_vlan_rcu_free);
+ }
+
+ br_vlan_put_master(masterv);
+out:
+ return err;
+}
+
+static void __vlan_group_free(struct net_bridge_vlan_group *vg)
+{
+ WARN_ON(!list_empty(&vg->vlan_list));
+ rhashtable_destroy(&vg->vlan_hash);
+ vlan_tunnel_deinit(vg);
+ kfree(vg);
+}
+
+static void __vlan_flush(const struct net_bridge *br,
+ const struct net_bridge_port *p,
+ struct net_bridge_vlan_group *vg)
+{
+ struct net_bridge_vlan *vlan, *tmp;
+ u16 v_start = 0, v_end = 0;
+ int err;
+
+ __vlan_delete_pvid(vg, vg->pvid);
+ list_for_each_entry_safe(vlan, tmp, &vg->vlan_list, vlist) {
+ /* take care of disjoint ranges */
+ if (!v_start) {
+ v_start = vlan->vid;
+ } else if (vlan->vid - v_end != 1) {
+ /* found range end, notify and start next one */
+ br_vlan_notify(br, p, v_start, v_end, RTM_DELVLAN);
+ v_start = vlan->vid;
+ }
+ v_end = vlan->vid;
+
+ err = __vlan_del(vlan);
+ if (err) {
+ br_err(br,
+ "port %u(%s) failed to delete vlan %d: %pe\n",
+ (unsigned int) p->port_no, p->dev->name,
+ vlan->vid, ERR_PTR(err));
+ }
+ }
+
+ /* notify about the last/whole vlan range */
+ if (v_start)
+ br_vlan_notify(br, p, v_start, v_end, RTM_DELVLAN);
+}
+
+struct sk_buff *br_handle_vlan(struct net_bridge *br,
+ const struct net_bridge_port *p,
+ struct net_bridge_vlan_group *vg,
+ struct sk_buff *skb)
+{
+ struct pcpu_sw_netstats *stats;
+ struct net_bridge_vlan *v;
+ u16 vid;
+
+ /* If this packet was not filtered at input, let it pass */
+ if (!BR_INPUT_SKB_CB(skb)->vlan_filtered)
+ goto out;
+
+ /* At this point, we know that the frame was filtered and contains
+ * a valid vlan id. If the vlan id has untagged flag set,
+ * send untagged; otherwise, send tagged.
+ */
+ br_vlan_get_tag(skb, &vid);
+ v = br_vlan_find(vg, vid);
+ /* Vlan entry must be configured at this point. The
+ * only exception is the bridge is set in promisc mode and the
+ * packet is destined for the bridge device. In this case
+ * pass the packet as is.
+ */
+ if (!v || !br_vlan_should_use(v)) {
+ if ((br->dev->flags & IFF_PROMISC) && skb->dev == br->dev) {
+ goto out;
+ } else {
+ kfree_skb(skb);
+ return NULL;
+ }
+ }
+ if (br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) {
+ stats = this_cpu_ptr(v->stats);
+ u64_stats_update_begin(&stats->syncp);
+ u64_stats_add(&stats->tx_bytes, skb->len);
+ u64_stats_inc(&stats->tx_packets);
+ u64_stats_update_end(&stats->syncp);
+ }
+
+ /* If the skb will be sent using forwarding offload, the assumption is
+ * that the switchdev will inject the packet into hardware together
+ * with the bridge VLAN, so that it can be forwarded according to that
+ * VLAN. The switchdev should deal with popping the VLAN header in
+ * hardware on each egress port as appropriate. So only strip the VLAN
+ * header if forwarding offload is not being used.
+ */
+ if (v->flags & BRIDGE_VLAN_INFO_UNTAGGED &&
+ !br_switchdev_frame_uses_tx_fwd_offload(skb))
+ __vlan_hwaccel_clear_tag(skb);
+
+ if (p && (p->flags & BR_VLAN_TUNNEL) &&
+ br_handle_egress_vlan_tunnel(skb, v)) {
+ kfree_skb(skb);
+ return NULL;
+ }
+out:
+ return skb;
+}
+
+/* Called under RCU */
+static bool __allowed_ingress(const struct net_bridge *br,
+ struct net_bridge_vlan_group *vg,
+ struct sk_buff *skb, u16 *vid,
+ u8 *state,
+ struct net_bridge_vlan **vlan)
+{
+ struct pcpu_sw_netstats *stats;
+ struct net_bridge_vlan *v;
+ bool tagged;
+
+ BR_INPUT_SKB_CB(skb)->vlan_filtered = true;
+ /* If vlan tx offload is disabled on bridge device and frame was
+ * sent from vlan device on the bridge device, it does not have
+ * HW accelerated vlan tag.
+ */
+ if (unlikely(!skb_vlan_tag_present(skb) &&
+ skb->protocol == br->vlan_proto)) {
+ skb = skb_vlan_untag(skb);
+ if (unlikely(!skb))
+ return false;
+ }
+
+ if (!br_vlan_get_tag(skb, vid)) {
+ /* Tagged frame */
+ if (skb->vlan_proto != br->vlan_proto) {
+ /* Protocol-mismatch, empty out vlan_tci for new tag */
+ skb_push(skb, ETH_HLEN);
+ skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto,
+ skb_vlan_tag_get(skb));
+ if (unlikely(!skb))
+ return false;
+
+ skb_pull(skb, ETH_HLEN);
+ skb_reset_mac_len(skb);
+ *vid = 0;
+ tagged = false;
+ } else {
+ tagged = true;
+ }
+ } else {
+ /* Untagged frame */
+ tagged = false;
+ }
+
+ if (!*vid) {
+ u16 pvid = br_get_pvid(vg);
+
+ /* Frame had a tag with VID 0 or did not have a tag.
+ * See if pvid is set on this port. That tells us which
+ * vlan untagged or priority-tagged traffic belongs to.
+ */
+ if (!pvid)
+ goto drop;
+
+ /* PVID is set on this port. Any untagged or priority-tagged
+ * ingress frame is considered to belong to this vlan.
+ */
+ *vid = pvid;
+ if (likely(!tagged))
+ /* Untagged Frame. */
+ __vlan_hwaccel_put_tag(skb, br->vlan_proto, pvid);
+ else
+ /* Priority-tagged Frame.
+ * At this point, we know that skb->vlan_tci VID
+ * field was 0.
+ * We update only VID field and preserve PCP field.
+ */
+ skb->vlan_tci |= pvid;
+
+ /* if snooping and stats are disabled we can avoid the lookup */
+ if (!br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED) &&
+ !br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) {
+ if (*state == BR_STATE_FORWARDING) {
+ *state = br_vlan_get_pvid_state(vg);
+ if (!br_vlan_state_allowed(*state, true))
+ goto drop;
+ }
+ return true;
+ }
+ }
+ v = br_vlan_find(vg, *vid);
+ if (!v || !br_vlan_should_use(v))
+ goto drop;
+
+ if (*state == BR_STATE_FORWARDING) {
+ *state = br_vlan_get_state(v);
+ if (!br_vlan_state_allowed(*state, true))
+ goto drop;
+ }
+
+ if (br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) {
+ stats = this_cpu_ptr(v->stats);
+ u64_stats_update_begin(&stats->syncp);
+ u64_stats_add(&stats->rx_bytes, skb->len);
+ u64_stats_inc(&stats->rx_packets);
+ u64_stats_update_end(&stats->syncp);
+ }
+
+ *vlan = v;
+
+ return true;
+
+drop:
+ kfree_skb(skb);
+ return false;
+}
+
+bool br_allowed_ingress(const struct net_bridge *br,
+ struct net_bridge_vlan_group *vg, struct sk_buff *skb,
+ u16 *vid, u8 *state,
+ struct net_bridge_vlan **vlan)
+{
+ /* If VLAN filtering is disabled on the bridge, all packets are
+ * permitted.
+ */
+ *vlan = NULL;
+ if (!br_opt_get(br, BROPT_VLAN_ENABLED)) {
+ BR_INPUT_SKB_CB(skb)->vlan_filtered = false;
+ return true;
+ }
+
+ return __allowed_ingress(br, vg, skb, vid, state, vlan);
+}
+
+/* Called under RCU. */
+bool br_allowed_egress(struct net_bridge_vlan_group *vg,
+ const struct sk_buff *skb)
+{
+ const struct net_bridge_vlan *v;
+ u16 vid;
+
+ /* If this packet was not filtered at input, let it pass */
+ if (!BR_INPUT_SKB_CB(skb)->vlan_filtered)
+ return true;
+
+ br_vlan_get_tag(skb, &vid);
+ v = br_vlan_find(vg, vid);
+ if (v && br_vlan_should_use(v) &&
+ br_vlan_state_allowed(br_vlan_get_state(v), false))
+ return true;
+
+ return false;
+}
+
+/* Called under RCU */
+bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid)
+{
+ struct net_bridge_vlan_group *vg;
+ struct net_bridge *br = p->br;
+ struct net_bridge_vlan *v;
+
+ /* If filtering was disabled at input, let it pass. */
+ if (!br_opt_get(br, BROPT_VLAN_ENABLED))
+ return true;
+
+ vg = nbp_vlan_group_rcu(p);
+ if (!vg || !vg->num_vlans)
+ return false;
+
+ if (!br_vlan_get_tag(skb, vid) && skb->vlan_proto != br->vlan_proto)
+ *vid = 0;
+
+ if (!*vid) {
+ *vid = br_get_pvid(vg);
+ if (!*vid ||
+ !br_vlan_state_allowed(br_vlan_get_pvid_state(vg), true))
+ return false;
+
+ return true;
+ }
+
+ v = br_vlan_find(vg, *vid);
+ if (v && br_vlan_state_allowed(br_vlan_get_state(v), true))
+ return true;
+
+ return false;
+}
+
+static int br_vlan_add_existing(struct net_bridge *br,
+ struct net_bridge_vlan_group *vg,
+ struct net_bridge_vlan *vlan,
+ u16 flags, bool *changed,
+ struct netlink_ext_ack *extack)
+{
+ bool becomes_brentry = false;
+ bool would_change = false;
+ int err;
+
+ if (!br_vlan_is_brentry(vlan)) {
+ /* Trying to change flags of non-existent bridge vlan */
+ if (!(flags & BRIDGE_VLAN_INFO_BRENTRY))
+ return -EINVAL;
+
+ becomes_brentry = true;
+ } else {
+ would_change = __vlan_flags_would_change(vlan, flags);
+ }
+
+ /* Master VLANs that aren't brentries weren't notified before,
+ * time to notify them now.
+ */
+ if (becomes_brentry || would_change) {
+ err = br_switchdev_port_vlan_add(br->dev, vlan->vid, flags,
+ would_change, extack);
+ if (err && err != -EOPNOTSUPP)
+ return err;
+ }
+
+ if (becomes_brentry) {
+ /* It was only kept for port vlans, now make it real */
+ err = br_fdb_add_local(br, NULL, br->dev->dev_addr, vlan->vid);
+ if (err) {
+ br_err(br, "failed to insert local address into bridge forwarding table\n");
+ goto err_fdb_insert;
+ }
+
+ refcount_inc(&vlan->refcnt);
+ vlan->flags |= BRIDGE_VLAN_INFO_BRENTRY;
+ vg->num_vlans++;
+ *changed = true;
+ br_multicast_toggle_one_vlan(vlan, true);
+ }
+
+ __vlan_flags_commit(vlan, flags);
+ if (would_change)
+ *changed = true;
+
+ return 0;
+
+err_fdb_insert:
+ br_switchdev_port_vlan_del(br->dev, vlan->vid);
+ return err;
+}
+
+/* Must be protected by RTNL.
+ * Must be called with vid in range from 1 to 4094 inclusive.
+ * changed must be true only if the vlan was created or updated
+ */
+int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags, bool *changed,
+ struct netlink_ext_ack *extack)
+{
+ struct net_bridge_vlan_group *vg;
+ struct net_bridge_vlan *vlan;
+ int ret;
+
+ ASSERT_RTNL();
+
+ *changed = false;
+ vg = br_vlan_group(br);
+ vlan = br_vlan_find(vg, vid);
+ if (vlan)
+ return br_vlan_add_existing(br, vg, vlan, flags, changed,
+ extack);
+
+ vlan = kzalloc(sizeof(*vlan), GFP_KERNEL);
+ if (!vlan)
+ return -ENOMEM;
+
+ vlan->stats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
+ if (!vlan->stats) {
+ kfree(vlan);
+ return -ENOMEM;
+ }
+ vlan->vid = vid;
+ vlan->flags = flags | BRIDGE_VLAN_INFO_MASTER;
+ vlan->flags &= ~BRIDGE_VLAN_INFO_PVID;
+ vlan->br = br;
+ if (flags & BRIDGE_VLAN_INFO_BRENTRY)
+ refcount_set(&vlan->refcnt, 1);
+ ret = __vlan_add(vlan, flags, extack);
+ if (ret) {
+ free_percpu(vlan->stats);
+ kfree(vlan);
+ } else {
+ *changed = true;
+ }
+
+ return ret;
+}
+
+/* Must be protected by RTNL.
+ * Must be called with vid in range from 1 to 4094 inclusive.
+ */
+int br_vlan_delete(struct net_bridge *br, u16 vid)
+{
+ struct net_bridge_vlan_group *vg;
+ struct net_bridge_vlan *v;
+
+ ASSERT_RTNL();
+
+ vg = br_vlan_group(br);
+ v = br_vlan_find(vg, vid);
+ if (!v || !br_vlan_is_brentry(v))
+ return -ENOENT;
+
+ br_fdb_find_delete_local(br, NULL, br->dev->dev_addr, vid);
+ br_fdb_delete_by_port(br, NULL, vid, 0);
+
+ vlan_tunnel_info_del(vg, v);
+
+ return __vlan_del(v);
+}
+
+void br_vlan_flush(struct net_bridge *br)
+{
+ struct net_bridge_vlan_group *vg;
+
+ ASSERT_RTNL();
+
+ vg = br_vlan_group(br);
+ __vlan_flush(br, NULL, vg);
+ RCU_INIT_POINTER(br->vlgrp, NULL);
+ synchronize_net();
+ __vlan_group_free(vg);
+}
+
+struct net_bridge_vlan *br_vlan_find(struct net_bridge_vlan_group *vg, u16 vid)
+{
+ if (!vg)
+ return NULL;
+
+ return br_vlan_lookup(&vg->vlan_hash, vid);
+}
+
+/* Must be protected by RTNL. */
+static void recalculate_group_addr(struct net_bridge *br)
+{
+ if (br_opt_get(br, BROPT_GROUP_ADDR_SET))
+ return;
+
+ spin_lock_bh(&br->lock);
+ if (!br_opt_get(br, BROPT_VLAN_ENABLED) ||
+ br->vlan_proto == htons(ETH_P_8021Q)) {
+ /* Bridge Group Address */
+ br->group_addr[5] = 0x00;
+ } else { /* vlan_enabled && ETH_P_8021AD */
+ /* Provider Bridge Group Address */
+ br->group_addr[5] = 0x08;
+ }
+ spin_unlock_bh(&br->lock);
+}
+
+/* Must be protected by RTNL. */
+void br_recalculate_fwd_mask(struct net_bridge *br)
+{
+ if (!br_opt_get(br, BROPT_VLAN_ENABLED) ||
+ br->vlan_proto == htons(ETH_P_8021Q))
+ br->group_fwd_mask_required = BR_GROUPFWD_DEFAULT;
+ else /* vlan_enabled && ETH_P_8021AD */
+ br->group_fwd_mask_required = BR_GROUPFWD_8021AD &
+ ~(1u << br->group_addr[5]);
+}
+
+int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
+{
+ struct switchdev_attr attr = {
+ .orig_dev = br->dev,
+ .id = SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING,
+ .flags = SWITCHDEV_F_SKIP_EOPNOTSUPP,
+ .u.vlan_filtering = val,
+ };
+ int err;
+
+ if (br_opt_get(br, BROPT_VLAN_ENABLED) == !!val)
+ return 0;
+
+ br_opt_toggle(br, BROPT_VLAN_ENABLED, !!val);
+
+ err = switchdev_port_attr_set(br->dev, &attr, extack);
+ if (err && err != -EOPNOTSUPP) {
+ br_opt_toggle(br, BROPT_VLAN_ENABLED, !val);
+ return err;
+ }
+
+ br_manage_promisc(br);
+ recalculate_group_addr(br);
+ br_recalculate_fwd_mask(br);
+ if (!val && br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) {
+ br_info(br, "vlan filtering disabled, automatically disabling multicast vlan snooping\n");
+ br_multicast_toggle_vlan_snooping(br, false, NULL);
+ }
+
+ return 0;
+}
+
+bool br_vlan_enabled(const struct net_device *dev)
+{
+ struct net_bridge *br = netdev_priv(dev);
+
+ return br_opt_get(br, BROPT_VLAN_ENABLED);
+}
+EXPORT_SYMBOL_GPL(br_vlan_enabled);
+
+int br_vlan_get_proto(const struct net_device *dev, u16 *p_proto)
+{
+ struct net_bridge *br = netdev_priv(dev);
+
+ *p_proto = ntohs(br->vlan_proto);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(br_vlan_get_proto);
+
+int __br_vlan_set_proto(struct net_bridge *br, __be16 proto,
+ struct netlink_ext_ack *extack)
+{
+ struct switchdev_attr attr = {
+ .orig_dev = br->dev,
+ .id = SWITCHDEV_ATTR_ID_BRIDGE_VLAN_PROTOCOL,
+ .flags = SWITCHDEV_F_SKIP_EOPNOTSUPP,
+ .u.vlan_protocol = ntohs(proto),
+ };
+ int err = 0;
+ struct net_bridge_port *p;
+ struct net_bridge_vlan *vlan;
+ struct net_bridge_vlan_group *vg;
+ __be16 oldproto = br->vlan_proto;
+
+ if (br->vlan_proto == proto)
+ return 0;
+
+ err = switchdev_port_attr_set(br->dev, &attr, extack);
+ if (err && err != -EOPNOTSUPP)
+ return err;
+
+ /* Add VLANs for the new proto to the device filter. */
+ list_for_each_entry(p, &br->port_list, list) {
+ vg = nbp_vlan_group(p);
+ list_for_each_entry(vlan, &vg->vlan_list, vlist) {
+ if (vlan->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV)
+ continue;
+ err = vlan_vid_add(p->dev, proto, vlan->vid);
+ if (err)
+ goto err_filt;
+ }
+ }
+
+ br->vlan_proto = proto;
+
+ recalculate_group_addr(br);
+ br_recalculate_fwd_mask(br);
+
+ /* Delete VLANs for the old proto from the device filter. */
+ list_for_each_entry(p, &br->port_list, list) {
+ vg = nbp_vlan_group(p);
+ list_for_each_entry(vlan, &vg->vlan_list, vlist) {
+ if (vlan->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV)
+ continue;
+ vlan_vid_del(p->dev, oldproto, vlan->vid);
+ }
+ }
+
+ return 0;
+
+err_filt:
+ attr.u.vlan_protocol = ntohs(oldproto);
+ switchdev_port_attr_set(br->dev, &attr, NULL);
+
+ list_for_each_entry_continue_reverse(vlan, &vg->vlan_list, vlist) {
+ if (vlan->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV)
+ continue;
+ vlan_vid_del(p->dev, proto, vlan->vid);
+ }
+
+ list_for_each_entry_continue_reverse(p, &br->port_list, list) {
+ vg = nbp_vlan_group(p);
+ list_for_each_entry(vlan, &vg->vlan_list, vlist) {
+ if (vlan->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV)
+ continue;
+ vlan_vid_del(p->dev, proto, vlan->vid);
+ }
+ }
+
+ return err;
+}
+
+int br_vlan_set_proto(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
+{
+ if (!eth_type_vlan(htons(val)))
+ return -EPROTONOSUPPORT;
+
+ return __br_vlan_set_proto(br, htons(val), extack);
+}
+
+int br_vlan_set_stats(struct net_bridge *br, unsigned long val)
+{
+ switch (val) {
+ case 0:
+ case 1:
+ br_opt_toggle(br, BROPT_VLAN_STATS_ENABLED, !!val);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+int br_vlan_set_stats_per_port(struct net_bridge *br, unsigned long val)
+{
+ struct net_bridge_port *p;
+
+ /* allow to change the option if there are no port vlans configured */
+ list_for_each_entry(p, &br->port_list, list) {
+ struct net_bridge_vlan_group *vg = nbp_vlan_group(p);
+
+ if (vg->num_vlans)
+ return -EBUSY;
+ }
+
+ switch (val) {
+ case 0:
+ case 1:
+ br_opt_toggle(br, BROPT_VLAN_STATS_PER_PORT, !!val);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static bool vlan_default_pvid(struct net_bridge_vlan_group *vg, u16 vid)
+{
+ struct net_bridge_vlan *v;
+
+ if (vid != vg->pvid)
+ return false;
+
+ v = br_vlan_lookup(&vg->vlan_hash, vid);
+ if (v && br_vlan_should_use(v) &&
+ (v->flags & BRIDGE_VLAN_INFO_UNTAGGED))
+ return true;
+
+ return false;
+}
+
+static void br_vlan_disable_default_pvid(struct net_bridge *br)
+{
+ struct net_bridge_port *p;
+ u16 pvid = br->default_pvid;
+
+ /* Disable default_pvid on all ports where it is still
+ * configured.
+ */
+ if (vlan_default_pvid(br_vlan_group(br), pvid)) {
+ if (!br_vlan_delete(br, pvid))
+ br_vlan_notify(br, NULL, pvid, 0, RTM_DELVLAN);
+ }
+
+ list_for_each_entry(p, &br->port_list, list) {
+ if (vlan_default_pvid(nbp_vlan_group(p), pvid) &&
+ !nbp_vlan_delete(p, pvid))
+ br_vlan_notify(br, p, pvid, 0, RTM_DELVLAN);
+ }
+
+ br->default_pvid = 0;
+}
+
+int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid,
+ struct netlink_ext_ack *extack)
+{
+ const struct net_bridge_vlan *pvent;
+ struct net_bridge_vlan_group *vg;
+ struct net_bridge_port *p;
+ unsigned long *changed;
+ bool vlchange;
+ u16 old_pvid;
+ int err = 0;
+
+ if (!pvid) {
+ br_vlan_disable_default_pvid(br);
+ return 0;
+ }
+
+ changed = bitmap_zalloc(BR_MAX_PORTS, GFP_KERNEL);
+ if (!changed)
+ return -ENOMEM;
+
+ old_pvid = br->default_pvid;
+
+ /* Update default_pvid config only if we do not conflict with
+ * user configuration.
+ */
+ vg = br_vlan_group(br);
+ pvent = br_vlan_find(vg, pvid);
+ if ((!old_pvid || vlan_default_pvid(vg, old_pvid)) &&
+ (!pvent || !br_vlan_should_use(pvent))) {
+ err = br_vlan_add(br, pvid,
+ BRIDGE_VLAN_INFO_PVID |
+ BRIDGE_VLAN_INFO_UNTAGGED |
+ BRIDGE_VLAN_INFO_BRENTRY,
+ &vlchange, extack);
+ if (err)
+ goto out;
+
+ if (br_vlan_delete(br, old_pvid))
+ br_vlan_notify(br, NULL, old_pvid, 0, RTM_DELVLAN);
+ br_vlan_notify(br, NULL, pvid, 0, RTM_NEWVLAN);
+ __set_bit(0, changed);
+ }
+
+ list_for_each_entry(p, &br->port_list, list) {
+ /* Update default_pvid config only if we do not conflict with
+ * user configuration.
+ */
+ vg = nbp_vlan_group(p);
+ if ((old_pvid &&
+ !vlan_default_pvid(vg, old_pvid)) ||
+ br_vlan_find(vg, pvid))
+ continue;
+
+ err = nbp_vlan_add(p, pvid,
+ BRIDGE_VLAN_INFO_PVID |
+ BRIDGE_VLAN_INFO_UNTAGGED,
+ &vlchange, extack);
+ if (err)
+ goto err_port;
+ if (nbp_vlan_delete(p, old_pvid))
+ br_vlan_notify(br, p, old_pvid, 0, RTM_DELVLAN);
+ br_vlan_notify(p->br, p, pvid, 0, RTM_NEWVLAN);
+ __set_bit(p->port_no, changed);
+ }
+
+ br->default_pvid = pvid;
+
+out:
+ bitmap_free(changed);
+ return err;
+
+err_port:
+ list_for_each_entry_continue_reverse(p, &br->port_list, list) {
+ if (!test_bit(p->port_no, changed))
+ continue;
+
+ if (old_pvid) {
+ nbp_vlan_add(p, old_pvid,
+ BRIDGE_VLAN_INFO_PVID |
+ BRIDGE_VLAN_INFO_UNTAGGED,
+ &vlchange, NULL);
+ br_vlan_notify(p->br, p, old_pvid, 0, RTM_NEWVLAN);
+ }
+ nbp_vlan_delete(p, pvid);
+ br_vlan_notify(br, p, pvid, 0, RTM_DELVLAN);
+ }
+
+ if (test_bit(0, changed)) {
+ if (old_pvid) {
+ br_vlan_add(br, old_pvid,
+ BRIDGE_VLAN_INFO_PVID |
+ BRIDGE_VLAN_INFO_UNTAGGED |
+ BRIDGE_VLAN_INFO_BRENTRY,
+ &vlchange, NULL);
+ br_vlan_notify(br, NULL, old_pvid, 0, RTM_NEWVLAN);
+ }
+ br_vlan_delete(br, pvid);
+ br_vlan_notify(br, NULL, pvid, 0, RTM_DELVLAN);
+ }
+ goto out;
+}
+
+int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
+{
+ u16 pvid = val;
+ int err = 0;
+
+ if (val >= VLAN_VID_MASK)
+ return -EINVAL;
+
+ if (pvid == br->default_pvid)
+ goto out;
+
+ /* Only allow default pvid change when filtering is disabled */
+ if (br_opt_get(br, BROPT_VLAN_ENABLED)) {
+ pr_info_once("Please disable vlan filtering to change default_pvid\n");
+ err = -EPERM;
+ goto out;
+ }
+ err = __br_vlan_set_default_pvid(br, pvid, extack);
+out:
+ return err;
+}
+
+int br_vlan_init(struct net_bridge *br)
+{
+ struct net_bridge_vlan_group *vg;
+ int ret = -ENOMEM;
+
+ vg = kzalloc(sizeof(*vg), GFP_KERNEL);
+ if (!vg)
+ goto out;
+ ret = rhashtable_init(&vg->vlan_hash, &br_vlan_rht_params);
+ if (ret)
+ goto err_rhtbl;
+ ret = vlan_tunnel_init(vg);
+ if (ret)
+ goto err_tunnel_init;
+ INIT_LIST_HEAD(&vg->vlan_list);
+ br->vlan_proto = htons(ETH_P_8021Q);
+ br->default_pvid = 1;
+ rcu_assign_pointer(br->vlgrp, vg);
+
+out:
+ return ret;
+
+err_tunnel_init:
+ rhashtable_destroy(&vg->vlan_hash);
+err_rhtbl:
+ kfree(vg);
+
+ goto out;
+}
+
+int nbp_vlan_init(struct net_bridge_port *p, struct netlink_ext_ack *extack)
+{
+ struct switchdev_attr attr = {
+ .orig_dev = p->br->dev,
+ .id = SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING,
+ .flags = SWITCHDEV_F_SKIP_EOPNOTSUPP,
+ .u.vlan_filtering = br_opt_get(p->br, BROPT_VLAN_ENABLED),
+ };
+ struct net_bridge_vlan_group *vg;
+ int ret = -ENOMEM;
+
+ vg = kzalloc(sizeof(struct net_bridge_vlan_group), GFP_KERNEL);
+ if (!vg)
+ goto out;
+
+ ret = switchdev_port_attr_set(p->dev, &attr, extack);
+ if (ret && ret != -EOPNOTSUPP)
+ goto err_vlan_enabled;
+
+ ret = rhashtable_init(&vg->vlan_hash, &br_vlan_rht_params);
+ if (ret)
+ goto err_rhtbl;
+ ret = vlan_tunnel_init(vg);
+ if (ret)
+ goto err_tunnel_init;
+ INIT_LIST_HEAD(&vg->vlan_list);
+ rcu_assign_pointer(p->vlgrp, vg);
+ if (p->br->default_pvid) {
+ bool changed;
+
+ ret = nbp_vlan_add(p, p->br->default_pvid,
+ BRIDGE_VLAN_INFO_PVID |
+ BRIDGE_VLAN_INFO_UNTAGGED,
+ &changed, extack);
+ if (ret)
+ goto err_vlan_add;
+ br_vlan_notify(p->br, p, p->br->default_pvid, 0, RTM_NEWVLAN);
+ }
+out:
+ return ret;
+
+err_vlan_add:
+ RCU_INIT_POINTER(p->vlgrp, NULL);
+ synchronize_rcu();
+ vlan_tunnel_deinit(vg);
+err_tunnel_init:
+ rhashtable_destroy(&vg->vlan_hash);
+err_rhtbl:
+err_vlan_enabled:
+ kfree(vg);
+
+ goto out;
+}
+
+/* Must be protected by RTNL.
+ * Must be called with vid in range from 1 to 4094 inclusive.
+ * changed must be true only if the vlan was created or updated
+ */
+int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags,
+ bool *changed, struct netlink_ext_ack *extack)
+{
+ struct net_bridge_vlan *vlan;
+ int ret;
+
+ ASSERT_RTNL();
+
+ *changed = false;
+ vlan = br_vlan_find(nbp_vlan_group(port), vid);
+ if (vlan) {
+ bool would_change = __vlan_flags_would_change(vlan, flags);
+
+ if (would_change) {
+ /* Pass the flags to the hardware bridge */
+ ret = br_switchdev_port_vlan_add(port->dev, vid, flags,
+ true, extack);
+ if (ret && ret != -EOPNOTSUPP)
+ return ret;
+ }
+
+ __vlan_flags_commit(vlan, flags);
+ *changed = would_change;
+
+ return 0;
+ }
+
+ vlan = kzalloc(sizeof(*vlan), GFP_KERNEL);
+ if (!vlan)
+ return -ENOMEM;
+
+ vlan->vid = vid;
+ vlan->port = port;
+ ret = __vlan_add(vlan, flags, extack);
+ if (ret)
+ kfree(vlan);
+ else
+ *changed = true;
+
+ return ret;
+}
+
+/* Must be protected by RTNL.
+ * Must be called with vid in range from 1 to 4094 inclusive.
+ */
+int nbp_vlan_delete(struct net_bridge_port *port, u16 vid)
+{
+ struct net_bridge_vlan *v;
+
+ ASSERT_RTNL();
+
+ v = br_vlan_find(nbp_vlan_group(port), vid);
+ if (!v)
+ return -ENOENT;
+ br_fdb_find_delete_local(port->br, port, port->dev->dev_addr, vid);
+ br_fdb_delete_by_port(port->br, port, vid, 0);
+
+ return __vlan_del(v);
+}
+
+void nbp_vlan_flush(struct net_bridge_port *port)
+{
+ struct net_bridge_vlan_group *vg;
+
+ ASSERT_RTNL();
+
+ vg = nbp_vlan_group(port);
+ __vlan_flush(port->br, port, vg);
+ RCU_INIT_POINTER(port->vlgrp, NULL);
+ synchronize_net();
+ __vlan_group_free(vg);
+}
+
+void br_vlan_get_stats(const struct net_bridge_vlan *v,
+ struct pcpu_sw_netstats *stats)
+{
+ int i;
+
+ memset(stats, 0, sizeof(*stats));
+ for_each_possible_cpu(i) {
+ u64 rxpackets, rxbytes, txpackets, txbytes;
+ struct pcpu_sw_netstats *cpu_stats;
+ unsigned int start;
+
+ cpu_stats = per_cpu_ptr(v->stats, i);
+ do {
+ start = u64_stats_fetch_begin(&cpu_stats->syncp);
+ rxpackets = u64_stats_read(&cpu_stats->rx_packets);
+ rxbytes = u64_stats_read(&cpu_stats->rx_bytes);
+ txbytes = u64_stats_read(&cpu_stats->tx_bytes);
+ txpackets = u64_stats_read(&cpu_stats->tx_packets);
+ } while (u64_stats_fetch_retry(&cpu_stats->syncp, start));
+
+ u64_stats_add(&stats->rx_packets, rxpackets);
+ u64_stats_add(&stats->rx_bytes, rxbytes);
+ u64_stats_add(&stats->tx_bytes, txbytes);
+ u64_stats_add(&stats->tx_packets, txpackets);
+ }
+}
+
+int br_vlan_get_pvid(const struct net_device *dev, u16 *p_pvid)
+{
+ struct net_bridge_vlan_group *vg;
+ struct net_bridge_port *p;
+
+ ASSERT_RTNL();
+ p = br_port_get_check_rtnl(dev);
+ if (p)
+ vg = nbp_vlan_group(p);
+ else if (netif_is_bridge_master(dev))
+ vg = br_vlan_group(netdev_priv(dev));
+ else
+ return -EINVAL;
+
+ *p_pvid = br_get_pvid(vg);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(br_vlan_get_pvid);
+
+int br_vlan_get_pvid_rcu(const struct net_device *dev, u16 *p_pvid)
+{
+ struct net_bridge_vlan_group *vg;
+ struct net_bridge_port *p;
+
+ p = br_port_get_check_rcu(dev);
+ if (p)
+ vg = nbp_vlan_group_rcu(p);
+ else if (netif_is_bridge_master(dev))
+ vg = br_vlan_group_rcu(netdev_priv(dev));
+ else
+ return -EINVAL;
+
+ *p_pvid = br_get_pvid(vg);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(br_vlan_get_pvid_rcu);
+
+void br_vlan_fill_forward_path_pvid(struct net_bridge *br,
+ struct net_device_path_ctx *ctx,
+ struct net_device_path *path)
+{
+ struct net_bridge_vlan_group *vg;
+ int idx = ctx->num_vlans - 1;
+ u16 vid;
+
+ path->bridge.vlan_mode = DEV_PATH_BR_VLAN_KEEP;
+
+ if (!br_opt_get(br, BROPT_VLAN_ENABLED))
+ return;
+
+ vg = br_vlan_group_rcu(br);
+
+ if (idx >= 0 &&
+ ctx->vlan[idx].proto == br->vlan_proto) {
+ vid = ctx->vlan[idx].id;
+ } else {
+ path->bridge.vlan_mode = DEV_PATH_BR_VLAN_TAG;
+ vid = br_get_pvid(vg);
+ }
+
+ path->bridge.vlan_id = vid;
+ path->bridge.vlan_proto = br->vlan_proto;
+}
+
+int br_vlan_fill_forward_path_mode(struct net_bridge *br,
+ struct net_bridge_port *dst,
+ struct net_device_path *path)
+{
+ struct net_bridge_vlan_group *vg;
+ struct net_bridge_vlan *v;
+
+ if (!br_opt_get(br, BROPT_VLAN_ENABLED))
+ return 0;
+
+ vg = nbp_vlan_group_rcu(dst);
+ v = br_vlan_find(vg, path->bridge.vlan_id);
+ if (!v || !br_vlan_should_use(v))
+ return -EINVAL;
+
+ if (!(v->flags & BRIDGE_VLAN_INFO_UNTAGGED))
+ return 0;
+
+ if (path->bridge.vlan_mode == DEV_PATH_BR_VLAN_TAG)
+ path->bridge.vlan_mode = DEV_PATH_BR_VLAN_KEEP;
+ else if (v->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV)
+ path->bridge.vlan_mode = DEV_PATH_BR_VLAN_UNTAG_HW;
+ else
+ path->bridge.vlan_mode = DEV_PATH_BR_VLAN_UNTAG;
+
+ return 0;
+}
+
+int br_vlan_get_info(const struct net_device *dev, u16 vid,
+ struct bridge_vlan_info *p_vinfo)
+{
+ struct net_bridge_vlan_group *vg;
+ struct net_bridge_vlan *v;
+ struct net_bridge_port *p;
+
+ ASSERT_RTNL();
+ p = br_port_get_check_rtnl(dev);
+ if (p)
+ vg = nbp_vlan_group(p);
+ else if (netif_is_bridge_master(dev))
+ vg = br_vlan_group(netdev_priv(dev));
+ else
+ return -EINVAL;
+
+ v = br_vlan_find(vg, vid);
+ if (!v)
+ return -ENOENT;
+
+ p_vinfo->vid = vid;
+ p_vinfo->flags = v->flags;
+ if (vid == br_get_pvid(vg))
+ p_vinfo->flags |= BRIDGE_VLAN_INFO_PVID;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(br_vlan_get_info);
+
+int br_vlan_get_info_rcu(const struct net_device *dev, u16 vid,
+ struct bridge_vlan_info *p_vinfo)
+{
+ struct net_bridge_vlan_group *vg;
+ struct net_bridge_vlan *v;
+ struct net_bridge_port *p;
+
+ p = br_port_get_check_rcu(dev);
+ if (p)
+ vg = nbp_vlan_group_rcu(p);
+ else if (netif_is_bridge_master(dev))
+ vg = br_vlan_group_rcu(netdev_priv(dev));
+ else
+ return -EINVAL;
+
+ v = br_vlan_find(vg, vid);
+ if (!v)
+ return -ENOENT;
+
+ p_vinfo->vid = vid;
+ p_vinfo->flags = v->flags;
+ if (vid == br_get_pvid(vg))
+ p_vinfo->flags |= BRIDGE_VLAN_INFO_PVID;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(br_vlan_get_info_rcu);
+
+static int br_vlan_is_bind_vlan_dev(const struct net_device *dev)
+{
+ return is_vlan_dev(dev) &&
+ !!(vlan_dev_priv(dev)->flags & VLAN_FLAG_BRIDGE_BINDING);
+}
+
+static int br_vlan_is_bind_vlan_dev_fn(struct net_device *dev,
+ __always_unused struct netdev_nested_priv *priv)
+{
+ return br_vlan_is_bind_vlan_dev(dev);
+}
+
+static bool br_vlan_has_upper_bind_vlan_dev(struct net_device *dev)
+{
+ int found;
+
+ rcu_read_lock();
+ found = netdev_walk_all_upper_dev_rcu(dev, br_vlan_is_bind_vlan_dev_fn,
+ NULL);
+ rcu_read_unlock();
+
+ return !!found;
+}
+
+struct br_vlan_bind_walk_data {
+ u16 vid;
+ struct net_device *result;
+};
+
+static int br_vlan_match_bind_vlan_dev_fn(struct net_device *dev,
+ struct netdev_nested_priv *priv)
+{
+ struct br_vlan_bind_walk_data *data = priv->data;
+ int found = 0;
+
+ if (br_vlan_is_bind_vlan_dev(dev) &&
+ vlan_dev_priv(dev)->vlan_id == data->vid) {
+ data->result = dev;
+ found = 1;
+ }
+
+ return found;
+}
+
+static struct net_device *
+br_vlan_get_upper_bind_vlan_dev(struct net_device *dev, u16 vid)
+{
+ struct br_vlan_bind_walk_data data = {
+ .vid = vid,
+ };
+ struct netdev_nested_priv priv = {
+ .data = (void *)&data,
+ };
+
+ rcu_read_lock();
+ netdev_walk_all_upper_dev_rcu(dev, br_vlan_match_bind_vlan_dev_fn,
+ &priv);
+ rcu_read_unlock();
+
+ return data.result;
+}
+
+static bool br_vlan_is_dev_up(const struct net_device *dev)
+{
+ return !!(dev->flags & IFF_UP) && netif_oper_up(dev);
+}
+
+static void br_vlan_set_vlan_dev_state(const struct net_bridge *br,
+ struct net_device *vlan_dev)
+{
+ u16 vid = vlan_dev_priv(vlan_dev)->vlan_id;
+ struct net_bridge_vlan_group *vg;
+ struct net_bridge_port *p;
+ bool has_carrier = false;
+
+ if (!netif_carrier_ok(br->dev)) {
+ netif_carrier_off(vlan_dev);
+ return;
+ }
+
+ list_for_each_entry(p, &br->port_list, list) {
+ vg = nbp_vlan_group(p);
+ if (br_vlan_find(vg, vid) && br_vlan_is_dev_up(p->dev)) {
+ has_carrier = true;
+ break;
+ }
+ }
+
+ if (has_carrier)
+ netif_carrier_on(vlan_dev);
+ else
+ netif_carrier_off(vlan_dev);
+}
+
+static void br_vlan_set_all_vlan_dev_state(struct net_bridge_port *p)
+{
+ struct net_bridge_vlan_group *vg = nbp_vlan_group(p);
+ struct net_bridge_vlan *vlan;
+ struct net_device *vlan_dev;
+
+ list_for_each_entry(vlan, &vg->vlan_list, vlist) {
+ vlan_dev = br_vlan_get_upper_bind_vlan_dev(p->br->dev,
+ vlan->vid);
+ if (vlan_dev) {
+ if (br_vlan_is_dev_up(p->dev)) {
+ if (netif_carrier_ok(p->br->dev))
+ netif_carrier_on(vlan_dev);
+ } else {
+ br_vlan_set_vlan_dev_state(p->br, vlan_dev);
+ }
+ }
+ }
+}
+
+static void br_vlan_toggle_bridge_binding(struct net_device *br_dev,
+ bool enable)
+{
+ struct net_bridge *br = netdev_priv(br_dev);
+
+ if (enable)
+ br_opt_toggle(br, BROPT_VLAN_BRIDGE_BINDING, true);
+ else
+ br_opt_toggle(br, BROPT_VLAN_BRIDGE_BINDING,
+ br_vlan_has_upper_bind_vlan_dev(br_dev));
+}
+
+static void br_vlan_upper_change(struct net_device *dev,
+ struct net_device *upper_dev,
+ bool linking)
+{
+ struct net_bridge *br = netdev_priv(dev);
+
+ if (!br_vlan_is_bind_vlan_dev(upper_dev))
+ return;
+
+ br_vlan_toggle_bridge_binding(dev, linking);
+ if (linking)
+ br_vlan_set_vlan_dev_state(br, upper_dev);
+}
+
+struct br_vlan_link_state_walk_data {
+ struct net_bridge *br;
+};
+
+static int br_vlan_link_state_change_fn(struct net_device *vlan_dev,
+ struct netdev_nested_priv *priv)
+{
+ struct br_vlan_link_state_walk_data *data = priv->data;
+
+ if (br_vlan_is_bind_vlan_dev(vlan_dev))
+ br_vlan_set_vlan_dev_state(data->br, vlan_dev);
+
+ return 0;
+}
+
+static void br_vlan_link_state_change(struct net_device *dev,
+ struct net_bridge *br)
+{
+ struct br_vlan_link_state_walk_data data = {
+ .br = br
+ };
+ struct netdev_nested_priv priv = {
+ .data = (void *)&data,
+ };
+
+ rcu_read_lock();
+ netdev_walk_all_upper_dev_rcu(dev, br_vlan_link_state_change_fn,
+ &priv);
+ rcu_read_unlock();
+}
+
+/* Must be protected by RTNL. */
+static void nbp_vlan_set_vlan_dev_state(struct net_bridge_port *p, u16 vid)
+{
+ struct net_device *vlan_dev;
+
+ if (!br_opt_get(p->br, BROPT_VLAN_BRIDGE_BINDING))
+ return;
+
+ vlan_dev = br_vlan_get_upper_bind_vlan_dev(p->br->dev, vid);
+ if (vlan_dev)
+ br_vlan_set_vlan_dev_state(p->br, vlan_dev);
+}
+
+/* Must be protected by RTNL. */
+int br_vlan_bridge_event(struct net_device *dev, unsigned long event, void *ptr)
+{
+ struct netdev_notifier_changeupper_info *info;
+ struct net_bridge *br = netdev_priv(dev);
+ int vlcmd = 0, ret = 0;
+ bool changed = false;
+
+ switch (event) {
+ case NETDEV_REGISTER:
+ ret = br_vlan_add(br, br->default_pvid,
+ BRIDGE_VLAN_INFO_PVID |
+ BRIDGE_VLAN_INFO_UNTAGGED |
+ BRIDGE_VLAN_INFO_BRENTRY, &changed, NULL);
+ vlcmd = RTM_NEWVLAN;
+ break;
+ case NETDEV_UNREGISTER:
+ changed = !br_vlan_delete(br, br->default_pvid);
+ vlcmd = RTM_DELVLAN;
+ break;
+ case NETDEV_CHANGEUPPER:
+ info = ptr;
+ br_vlan_upper_change(dev, info->upper_dev, info->linking);
+ break;
+
+ case NETDEV_CHANGE:
+ case NETDEV_UP:
+ if (!br_opt_get(br, BROPT_VLAN_BRIDGE_BINDING))
+ break;
+ br_vlan_link_state_change(dev, br);
+ break;
+ }
+ if (changed)
+ br_vlan_notify(br, NULL, br->default_pvid, 0, vlcmd);
+
+ return ret;
+}
+
+void br_vlan_vlan_upper_event(struct net_device *br_dev,
+ struct net_device *vlan_dev,
+ unsigned long event)
+{
+ struct vlan_dev_priv *vlan = vlan_dev_priv(vlan_dev);
+ struct net_bridge *br = netdev_priv(br_dev);
+ bool bridge_binding;
+
+ switch (event) {
+ case NETDEV_CHANGE:
+ case NETDEV_UP:
+ break;
+ default:
+ return;
+ }
+
+ bridge_binding = vlan->flags & VLAN_FLAG_BRIDGE_BINDING;
+ br_vlan_toggle_bridge_binding(br_dev, bridge_binding);
+ if (bridge_binding)
+ br_vlan_set_vlan_dev_state(br, vlan_dev);
+ else if (!bridge_binding && netif_carrier_ok(br_dev))
+ netif_carrier_on(vlan_dev);
+}
+
+/* Must be protected by RTNL. */
+void br_vlan_port_event(struct net_bridge_port *p, unsigned long event)
+{
+ if (!br_opt_get(p->br, BROPT_VLAN_BRIDGE_BINDING))
+ return;
+
+ switch (event) {
+ case NETDEV_CHANGE:
+ case NETDEV_DOWN:
+ case NETDEV_UP:
+ br_vlan_set_all_vlan_dev_state(p);
+ break;
+ }
+}
+
+static bool br_vlan_stats_fill(struct sk_buff *skb,
+ const struct net_bridge_vlan *v)
+{
+ struct pcpu_sw_netstats stats;
+ struct nlattr *nest;
+
+ nest = nla_nest_start(skb, BRIDGE_VLANDB_ENTRY_STATS);
+ if (!nest)
+ return false;
+
+ br_vlan_get_stats(v, &stats);
+ if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_STATS_RX_BYTES,
+ u64_stats_read(&stats.rx_bytes),
+ BRIDGE_VLANDB_STATS_PAD) ||
+ nla_put_u64_64bit(skb, BRIDGE_VLANDB_STATS_RX_PACKETS,
+ u64_stats_read(&stats.rx_packets),
+ BRIDGE_VLANDB_STATS_PAD) ||
+ nla_put_u64_64bit(skb, BRIDGE_VLANDB_STATS_TX_BYTES,
+ u64_stats_read(&stats.tx_bytes),
+ BRIDGE_VLANDB_STATS_PAD) ||
+ nla_put_u64_64bit(skb, BRIDGE_VLANDB_STATS_TX_PACKETS,
+ u64_stats_read(&stats.tx_packets),
+ BRIDGE_VLANDB_STATS_PAD))
+ goto out_err;
+
+ nla_nest_end(skb, nest);
+
+ return true;
+
+out_err:
+ nla_nest_cancel(skb, nest);
+ return false;
+}
+
+/* v_opts is used to dump the options which must be equal in the whole range */
+static bool br_vlan_fill_vids(struct sk_buff *skb, u16 vid, u16 vid_range,
+ const struct net_bridge_vlan *v_opts,
+ const struct net_bridge_port *p,
+ u16 flags,
+ bool dump_stats)
+{
+ struct bridge_vlan_info info;
+ struct nlattr *nest;
+
+ nest = nla_nest_start(skb, BRIDGE_VLANDB_ENTRY);
+ if (!nest)
+ return false;
+
+ memset(&info, 0, sizeof(info));
+ info.vid = vid;
+ if (flags & BRIDGE_VLAN_INFO_UNTAGGED)
+ info.flags |= BRIDGE_VLAN_INFO_UNTAGGED;
+ if (flags & BRIDGE_VLAN_INFO_PVID)
+ info.flags |= BRIDGE_VLAN_INFO_PVID;
+
+ if (nla_put(skb, BRIDGE_VLANDB_ENTRY_INFO, sizeof(info), &info))
+ goto out_err;
+
+ if (vid_range && vid < vid_range &&
+ !(flags & BRIDGE_VLAN_INFO_PVID) &&
+ nla_put_u16(skb, BRIDGE_VLANDB_ENTRY_RANGE, vid_range))
+ goto out_err;
+
+ if (v_opts) {
+ if (!br_vlan_opts_fill(skb, v_opts, p))
+ goto out_err;
+
+ if (dump_stats && !br_vlan_stats_fill(skb, v_opts))
+ goto out_err;
+ }
+
+ nla_nest_end(skb, nest);
+
+ return true;
+
+out_err:
+ nla_nest_cancel(skb, nest);
+ return false;
+}
+
+static size_t rtnl_vlan_nlmsg_size(void)
+{
+ return NLMSG_ALIGN(sizeof(struct br_vlan_msg))
+ + nla_total_size(0) /* BRIDGE_VLANDB_ENTRY */
+ + nla_total_size(sizeof(u16)) /* BRIDGE_VLANDB_ENTRY_RANGE */
+ + nla_total_size(sizeof(struct bridge_vlan_info)) /* BRIDGE_VLANDB_ENTRY_INFO */
+ + br_vlan_opts_nl_size(); /* bridge vlan options */
+}
+
+void br_vlan_notify(const struct net_bridge *br,
+ const struct net_bridge_port *p,
+ u16 vid, u16 vid_range,
+ int cmd)
+{
+ struct net_bridge_vlan_group *vg;
+ struct net_bridge_vlan *v = NULL;
+ struct br_vlan_msg *bvm;
+ struct nlmsghdr *nlh;
+ struct sk_buff *skb;
+ int err = -ENOBUFS;
+ struct net *net;
+ u16 flags = 0;
+ int ifindex;
+
+ /* right now notifications are done only with rtnl held */
+ ASSERT_RTNL();
+
+ if (p) {
+ ifindex = p->dev->ifindex;
+ vg = nbp_vlan_group(p);
+ net = dev_net(p->dev);
+ } else {
+ ifindex = br->dev->ifindex;
+ vg = br_vlan_group(br);
+ net = dev_net(br->dev);
+ }
+
+ skb = nlmsg_new(rtnl_vlan_nlmsg_size(), GFP_KERNEL);
+ if (!skb)
+ goto out_err;
+
+ err = -EMSGSIZE;
+ nlh = nlmsg_put(skb, 0, 0, cmd, sizeof(*bvm), 0);
+ if (!nlh)
+ goto out_err;
+ bvm = nlmsg_data(nlh);
+ memset(bvm, 0, sizeof(*bvm));
+ bvm->family = AF_BRIDGE;
+ bvm->ifindex = ifindex;
+
+ switch (cmd) {
+ case RTM_NEWVLAN:
+ /* need to find the vlan due to flags/options */
+ v = br_vlan_find(vg, vid);
+ if (!v || !br_vlan_should_use(v))
+ goto out_kfree;
+
+ flags = v->flags;
+ if (br_get_pvid(vg) == v->vid)
+ flags |= BRIDGE_VLAN_INFO_PVID;
+ break;
+ case RTM_DELVLAN:
+ break;
+ default:
+ goto out_kfree;
+ }
+
+ if (!br_vlan_fill_vids(skb, vid, vid_range, v, p, flags, false))
+ goto out_err;
+
+ nlmsg_end(skb, nlh);
+ rtnl_notify(skb, net, 0, RTNLGRP_BRVLAN, NULL, GFP_KERNEL);
+ return;
+
+out_err:
+ rtnl_set_sk_err(net, RTNLGRP_BRVLAN, err);
+out_kfree:
+ kfree_skb(skb);
+}
+
+/* check if v_curr can enter a range ending in range_end */
+bool br_vlan_can_enter_range(const struct net_bridge_vlan *v_curr,
+ const struct net_bridge_vlan *range_end)
+{
+ return v_curr->vid - range_end->vid == 1 &&
+ range_end->flags == v_curr->flags &&
+ br_vlan_opts_eq_range(v_curr, range_end);
+}
+
+static int br_vlan_dump_dev(const struct net_device *dev,
+ struct sk_buff *skb,
+ struct netlink_callback *cb,
+ u32 dump_flags)
+{
+ struct net_bridge_vlan *v, *range_start = NULL, *range_end = NULL;
+ bool dump_global = !!(dump_flags & BRIDGE_VLANDB_DUMPF_GLOBAL);
+ bool dump_stats = !!(dump_flags & BRIDGE_VLANDB_DUMPF_STATS);
+ struct net_bridge_vlan_group *vg;
+ int idx = 0, s_idx = cb->args[1];
+ struct nlmsghdr *nlh = NULL;
+ struct net_bridge_port *p;
+ struct br_vlan_msg *bvm;
+ struct net_bridge *br;
+ int err = 0;
+ u16 pvid;
+
+ if (!netif_is_bridge_master(dev) && !netif_is_bridge_port(dev))
+ return -EINVAL;
+
+ if (netif_is_bridge_master(dev)) {
+ br = netdev_priv(dev);
+ vg = br_vlan_group_rcu(br);
+ p = NULL;
+ } else {
+ /* global options are dumped only for bridge devices */
+ if (dump_global)
+ return 0;
+
+ p = br_port_get_rcu(dev);
+ if (WARN_ON(!p))
+ return -EINVAL;
+ vg = nbp_vlan_group_rcu(p);
+ br = p->br;
+ }
+
+ if (!vg)
+ return 0;
+
+ nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ RTM_NEWVLAN, sizeof(*bvm), NLM_F_MULTI);
+ if (!nlh)
+ return -EMSGSIZE;
+ bvm = nlmsg_data(nlh);
+ memset(bvm, 0, sizeof(*bvm));
+ bvm->family = PF_BRIDGE;
+ bvm->ifindex = dev->ifindex;
+ pvid = br_get_pvid(vg);
+
+ /* idx must stay at range's beginning until it is filled in */
+ list_for_each_entry_rcu(v, &vg->vlan_list, vlist) {
+ if (!dump_global && !br_vlan_should_use(v))
+ continue;
+ if (idx < s_idx) {
+ idx++;
+ continue;
+ }
+
+ if (!range_start) {
+ range_start = v;
+ range_end = v;
+ continue;
+ }
+
+ if (dump_global) {
+ if (br_vlan_global_opts_can_enter_range(v, range_end))
+ goto update_end;
+ if (!br_vlan_global_opts_fill(skb, range_start->vid,
+ range_end->vid,
+ range_start)) {
+ err = -EMSGSIZE;
+ break;
+ }
+ /* advance number of filled vlans */
+ idx += range_end->vid - range_start->vid + 1;
+
+ range_start = v;
+ } else if (dump_stats || v->vid == pvid ||
+ !br_vlan_can_enter_range(v, range_end)) {
+ u16 vlan_flags = br_vlan_flags(range_start, pvid);
+
+ if (!br_vlan_fill_vids(skb, range_start->vid,
+ range_end->vid, range_start,
+ p, vlan_flags, dump_stats)) {
+ err = -EMSGSIZE;
+ break;
+ }
+ /* advance number of filled vlans */
+ idx += range_end->vid - range_start->vid + 1;
+
+ range_start = v;
+ }
+update_end:
+ range_end = v;
+ }
+
+ /* err will be 0 and range_start will be set in 3 cases here:
+ * - first vlan (range_start == range_end)
+ * - last vlan (range_start == range_end, not in range)
+ * - last vlan range (range_start != range_end, in range)
+ */
+ if (!err && range_start) {
+ if (dump_global &&
+ !br_vlan_global_opts_fill(skb, range_start->vid,
+ range_end->vid, range_start))
+ err = -EMSGSIZE;
+ else if (!dump_global &&
+ !br_vlan_fill_vids(skb, range_start->vid,
+ range_end->vid, range_start,
+ p, br_vlan_flags(range_start, pvid),
+ dump_stats))
+ err = -EMSGSIZE;
+ }
+
+ cb->args[1] = err ? idx : 0;
+
+ nlmsg_end(skb, nlh);
+
+ return err;
+}
+
+static const struct nla_policy br_vlan_db_dump_pol[BRIDGE_VLANDB_DUMP_MAX + 1] = {
+ [BRIDGE_VLANDB_DUMP_FLAGS] = { .type = NLA_U32 },
+};
+
+static int br_vlan_rtm_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct nlattr *dtb[BRIDGE_VLANDB_DUMP_MAX + 1];
+ int idx = 0, err = 0, s_idx = cb->args[0];
+ struct net *net = sock_net(skb->sk);
+ struct br_vlan_msg *bvm;
+ struct net_device *dev;
+ u32 dump_flags = 0;
+
+ err = nlmsg_parse(cb->nlh, sizeof(*bvm), dtb, BRIDGE_VLANDB_DUMP_MAX,
+ br_vlan_db_dump_pol, cb->extack);
+ if (err < 0)
+ return err;
+
+ bvm = nlmsg_data(cb->nlh);
+ if (dtb[BRIDGE_VLANDB_DUMP_FLAGS])
+ dump_flags = nla_get_u32(dtb[BRIDGE_VLANDB_DUMP_FLAGS]);
+
+ rcu_read_lock();
+ if (bvm->ifindex) {
+ dev = dev_get_by_index_rcu(net, bvm->ifindex);
+ if (!dev) {
+ err = -ENODEV;
+ goto out_err;
+ }
+ err = br_vlan_dump_dev(dev, skb, cb, dump_flags);
+ /* if the dump completed without an error we return 0 here */
+ if (err != -EMSGSIZE)
+ goto out_err;
+ } else {
+ for_each_netdev_rcu(net, dev) {
+ if (idx < s_idx)
+ goto skip;
+
+ err = br_vlan_dump_dev(dev, skb, cb, dump_flags);
+ if (err == -EMSGSIZE)
+ break;
+skip:
+ idx++;
+ }
+ }
+ cb->args[0] = idx;
+ rcu_read_unlock();
+
+ return skb->len;
+
+out_err:
+ rcu_read_unlock();
+
+ return err;
+}
+
+static const struct nla_policy br_vlan_db_policy[BRIDGE_VLANDB_ENTRY_MAX + 1] = {
+ [BRIDGE_VLANDB_ENTRY_INFO] =
+ NLA_POLICY_EXACT_LEN(sizeof(struct bridge_vlan_info)),
+ [BRIDGE_VLANDB_ENTRY_RANGE] = { .type = NLA_U16 },
+ [BRIDGE_VLANDB_ENTRY_STATE] = { .type = NLA_U8 },
+ [BRIDGE_VLANDB_ENTRY_TUNNEL_INFO] = { .type = NLA_NESTED },
+ [BRIDGE_VLANDB_ENTRY_MCAST_ROUTER] = { .type = NLA_U8 },
+ [BRIDGE_VLANDB_ENTRY_MCAST_N_GROUPS] = { .type = NLA_REJECT },
+ [BRIDGE_VLANDB_ENTRY_MCAST_MAX_GROUPS] = { .type = NLA_U32 },
+ [BRIDGE_VLANDB_ENTRY_NEIGH_SUPPRESS] = NLA_POLICY_MAX(NLA_U8, 1),
+};
+
+static int br_vlan_rtm_process_one(struct net_device *dev,
+ const struct nlattr *attr,
+ int cmd, struct netlink_ext_ack *extack)
+{
+ struct bridge_vlan_info *vinfo, vrange_end, *vinfo_last = NULL;
+ struct nlattr *tb[BRIDGE_VLANDB_ENTRY_MAX + 1];
+ bool changed = false, skip_processing = false;
+ struct net_bridge_vlan_group *vg;
+ struct net_bridge_port *p = NULL;
+ int err = 0, cmdmap = 0;
+ struct net_bridge *br;
+
+ if (netif_is_bridge_master(dev)) {
+ br = netdev_priv(dev);
+ vg = br_vlan_group(br);
+ } else {
+ p = br_port_get_rtnl(dev);
+ if (WARN_ON(!p))
+ return -ENODEV;
+ br = p->br;
+ vg = nbp_vlan_group(p);
+ }
+
+ if (WARN_ON(!vg))
+ return -ENODEV;
+
+ err = nla_parse_nested(tb, BRIDGE_VLANDB_ENTRY_MAX, attr,
+ br_vlan_db_policy, extack);
+ if (err)
+ return err;
+
+ if (!tb[BRIDGE_VLANDB_ENTRY_INFO]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing vlan entry info");
+ return -EINVAL;
+ }
+ memset(&vrange_end, 0, sizeof(vrange_end));
+
+ vinfo = nla_data(tb[BRIDGE_VLANDB_ENTRY_INFO]);
+ if (vinfo->flags & (BRIDGE_VLAN_INFO_RANGE_BEGIN |
+ BRIDGE_VLAN_INFO_RANGE_END)) {
+ NL_SET_ERR_MSG_MOD(extack, "Old-style vlan ranges are not allowed when using RTM vlan calls");
+ return -EINVAL;
+ }
+ if (!br_vlan_valid_id(vinfo->vid, extack))
+ return -EINVAL;
+
+ if (tb[BRIDGE_VLANDB_ENTRY_RANGE]) {
+ vrange_end.vid = nla_get_u16(tb[BRIDGE_VLANDB_ENTRY_RANGE]);
+ /* validate user-provided flags without RANGE_BEGIN */
+ vrange_end.flags = BRIDGE_VLAN_INFO_RANGE_END | vinfo->flags;
+ vinfo->flags |= BRIDGE_VLAN_INFO_RANGE_BEGIN;
+
+ /* vinfo_last is the range start, vinfo the range end */
+ vinfo_last = vinfo;
+ vinfo = &vrange_end;
+
+ if (!br_vlan_valid_id(vinfo->vid, extack) ||
+ !br_vlan_valid_range(vinfo, vinfo_last, extack))
+ return -EINVAL;
+ }
+
+ switch (cmd) {
+ case RTM_NEWVLAN:
+ cmdmap = RTM_SETLINK;
+ skip_processing = !!(vinfo->flags & BRIDGE_VLAN_INFO_ONLY_OPTS);
+ break;
+ case RTM_DELVLAN:
+ cmdmap = RTM_DELLINK;
+ break;
+ }
+
+ if (!skip_processing) {
+ struct bridge_vlan_info *tmp_last = vinfo_last;
+
+ /* br_process_vlan_info may overwrite vinfo_last */
+ err = br_process_vlan_info(br, p, cmdmap, vinfo, &tmp_last,
+ &changed, extack);
+
+ /* notify first if anything changed */
+ if (changed)
+ br_ifinfo_notify(cmdmap, br, p);
+
+ if (err)
+ return err;
+ }
+
+ /* deal with options */
+ if (cmd == RTM_NEWVLAN) {
+ struct net_bridge_vlan *range_start, *range_end;
+
+ if (vinfo_last) {
+ range_start = br_vlan_find(vg, vinfo_last->vid);
+ range_end = br_vlan_find(vg, vinfo->vid);
+ } else {
+ range_start = br_vlan_find(vg, vinfo->vid);
+ range_end = range_start;
+ }
+
+ err = br_vlan_process_options(br, p, range_start, range_end,
+ tb, extack);
+ }
+
+ return err;
+}
+
+static int br_vlan_rtm_process(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(skb->sk);
+ struct br_vlan_msg *bvm;
+ struct net_device *dev;
+ struct nlattr *attr;
+ int err, vlans = 0;
+ int rem;
+
+ /* this should validate the header and check for remaining bytes */
+ err = nlmsg_parse(nlh, sizeof(*bvm), NULL, BRIDGE_VLANDB_MAX, NULL,
+ extack);
+ if (err < 0)
+ return err;
+
+ bvm = nlmsg_data(nlh);
+ dev = __dev_get_by_index(net, bvm->ifindex);
+ if (!dev)
+ return -ENODEV;
+
+ if (!netif_is_bridge_master(dev) && !netif_is_bridge_port(dev)) {
+ NL_SET_ERR_MSG_MOD(extack, "The device is not a valid bridge or bridge port");
+ return -EINVAL;
+ }
+
+ nlmsg_for_each_attr(attr, nlh, sizeof(*bvm), rem) {
+ switch (nla_type(attr)) {
+ case BRIDGE_VLANDB_ENTRY:
+ err = br_vlan_rtm_process_one(dev, attr,
+ nlh->nlmsg_type,
+ extack);
+ break;
+ case BRIDGE_VLANDB_GLOBAL_OPTIONS:
+ err = br_vlan_rtm_process_global_options(dev, attr,
+ nlh->nlmsg_type,
+ extack);
+ break;
+ default:
+ continue;
+ }
+
+ vlans++;
+ if (err)
+ break;
+ }
+ if (!vlans) {
+ NL_SET_ERR_MSG_MOD(extack, "No vlans found to process");
+ err = -EINVAL;
+ }
+
+ return err;
+}
+
+static const struct rtnl_msg_handler br_vlan_rtnl_msg_handlers[] = {
+ {THIS_MODULE, PF_BRIDGE, RTM_NEWVLAN, br_vlan_rtm_process, NULL, 0},
+ {THIS_MODULE, PF_BRIDGE, RTM_DELVLAN, br_vlan_rtm_process, NULL, 0},
+ {THIS_MODULE, PF_BRIDGE, RTM_GETVLAN, NULL, br_vlan_rtm_dump, 0},
+};
+
+int br_vlan_rtnl_init(void)
+{
+ return rtnl_register_many(br_vlan_rtnl_msg_handlers);
+}
+
+void br_vlan_rtnl_uninit(void)
+{
+ rtnl_unregister_many(br_vlan_rtnl_msg_handlers);
+}
diff --git a/net/bridge/br_vlan_options.c b/net/bridge/br_vlan_options.c
new file mode 100644
index 000000000000..8fa89b04ee94
--- /dev/null
+++ b/net/bridge/br_vlan_options.c
@@ -0,0 +1,740 @@
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright (c) 2020, Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/slab.h>
+#include <net/ip_tunnels.h>
+
+#include "br_private.h"
+#include "br_private_tunnel.h"
+
+static bool __vlan_tun_put(struct sk_buff *skb, const struct net_bridge_vlan *v)
+{
+ __be32 tid = tunnel_id_to_key32(v->tinfo.tunnel_id);
+ struct nlattr *nest;
+
+ if (!v->tinfo.tunnel_dst)
+ return true;
+
+ nest = nla_nest_start(skb, BRIDGE_VLANDB_ENTRY_TUNNEL_INFO);
+ if (!nest)
+ return false;
+ if (nla_put_u32(skb, BRIDGE_VLANDB_TINFO_ID, be32_to_cpu(tid))) {
+ nla_nest_cancel(skb, nest);
+ return false;
+ }
+ nla_nest_end(skb, nest);
+
+ return true;
+}
+
+static bool __vlan_tun_can_enter_range(const struct net_bridge_vlan *v_curr,
+ const struct net_bridge_vlan *range_end)
+{
+ return (!v_curr->tinfo.tunnel_dst && !range_end->tinfo.tunnel_dst) ||
+ vlan_tunid_inrange(v_curr, range_end);
+}
+
+/* check if the options' state of v_curr allow it to enter the range */
+bool br_vlan_opts_eq_range(const struct net_bridge_vlan *v_curr,
+ const struct net_bridge_vlan *range_end)
+{
+ u8 range_mc_rtr = br_vlan_multicast_router(range_end);
+ u8 curr_mc_rtr = br_vlan_multicast_router(v_curr);
+
+ return v_curr->state == range_end->state &&
+ __vlan_tun_can_enter_range(v_curr, range_end) &&
+ curr_mc_rtr == range_mc_rtr;
+}
+
+bool br_vlan_opts_fill(struct sk_buff *skb, const struct net_bridge_vlan *v,
+ const struct net_bridge_port *p)
+{
+ if (nla_put_u8(skb, BRIDGE_VLANDB_ENTRY_STATE, br_vlan_get_state(v)) ||
+ !__vlan_tun_put(skb, v) ||
+ nla_put_u8(skb, BRIDGE_VLANDB_ENTRY_NEIGH_SUPPRESS,
+ !!(v->priv_flags & BR_VLFLAG_NEIGH_SUPPRESS_ENABLED)))
+ return false;
+
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+ if (nla_put_u8(skb, BRIDGE_VLANDB_ENTRY_MCAST_ROUTER,
+ br_vlan_multicast_router(v)))
+ return false;
+ if (p && !br_multicast_port_ctx_vlan_disabled(&v->port_mcast_ctx) &&
+ (nla_put_u32(skb, BRIDGE_VLANDB_ENTRY_MCAST_N_GROUPS,
+ br_multicast_ngroups_get(&v->port_mcast_ctx)) ||
+ nla_put_u32(skb, BRIDGE_VLANDB_ENTRY_MCAST_MAX_GROUPS,
+ br_multicast_ngroups_get_max(&v->port_mcast_ctx))))
+ return false;
+#endif
+
+ return true;
+}
+
+size_t br_vlan_opts_nl_size(void)
+{
+ return nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_ENTRY_STATE */
+ + nla_total_size(0) /* BRIDGE_VLANDB_ENTRY_TUNNEL_INFO */
+ + nla_total_size(sizeof(u32)) /* BRIDGE_VLANDB_TINFO_ID */
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+ + nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_ENTRY_MCAST_ROUTER */
+ + nla_total_size(sizeof(u32)) /* BRIDGE_VLANDB_ENTRY_MCAST_N_GROUPS */
+ + nla_total_size(sizeof(u32)) /* BRIDGE_VLANDB_ENTRY_MCAST_MAX_GROUPS */
+#endif
+ + nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_ENTRY_NEIGH_SUPPRESS */
+ + 0;
+}
+
+static int br_vlan_modify_state(struct net_bridge_vlan_group *vg,
+ struct net_bridge_vlan *v,
+ u8 state,
+ bool *changed,
+ struct netlink_ext_ack *extack)
+{
+ struct net_bridge *br;
+
+ ASSERT_RTNL();
+
+ if (state > BR_STATE_BLOCKING) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid vlan state");
+ return -EINVAL;
+ }
+
+ if (br_vlan_is_brentry(v))
+ br = v->br;
+ else
+ br = v->port->br;
+
+ if (br->stp_enabled == BR_KERNEL_STP) {
+ NL_SET_ERR_MSG_MOD(extack, "Can't modify vlan state when using kernel STP");
+ return -EBUSY;
+ }
+
+ if (br_opt_get(br, BROPT_MST_ENABLED)) {
+ NL_SET_ERR_MSG_MOD(extack, "Can't modify vlan state directly when MST is enabled");
+ return -EBUSY;
+ }
+
+ if (v->state == state)
+ return 0;
+
+ if (v->vid == br_get_pvid(vg))
+ br_vlan_set_pvid_state(vg, state);
+
+ br_vlan_set_state(v, state);
+ *changed = true;
+
+ return 0;
+}
+
+static const struct nla_policy br_vlandb_tinfo_pol[BRIDGE_VLANDB_TINFO_MAX + 1] = {
+ [BRIDGE_VLANDB_TINFO_ID] = { .type = NLA_U32 },
+ [BRIDGE_VLANDB_TINFO_CMD] = { .type = NLA_U32 },
+};
+
+static int br_vlan_modify_tunnel(const struct net_bridge_port *p,
+ struct net_bridge_vlan *v,
+ struct nlattr **tb,
+ bool *changed,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tun_tb[BRIDGE_VLANDB_TINFO_MAX + 1], *attr;
+ struct bridge_vlan_info *vinfo;
+ u32 tun_id = 0;
+ int cmd, err;
+
+ if (!p) {
+ NL_SET_ERR_MSG_MOD(extack, "Can't modify tunnel mapping of non-port vlans");
+ return -EINVAL;
+ }
+ if (!(p->flags & BR_VLAN_TUNNEL)) {
+ NL_SET_ERR_MSG_MOD(extack, "Port doesn't have tunnel flag set");
+ return -EINVAL;
+ }
+
+ attr = tb[BRIDGE_VLANDB_ENTRY_TUNNEL_INFO];
+ err = nla_parse_nested(tun_tb, BRIDGE_VLANDB_TINFO_MAX, attr,
+ br_vlandb_tinfo_pol, extack);
+ if (err)
+ return err;
+
+ if (!tun_tb[BRIDGE_VLANDB_TINFO_CMD]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing tunnel command attribute");
+ return -ENOENT;
+ }
+ cmd = nla_get_u32(tun_tb[BRIDGE_VLANDB_TINFO_CMD]);
+ switch (cmd) {
+ case RTM_SETLINK:
+ if (!tun_tb[BRIDGE_VLANDB_TINFO_ID]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing tunnel id attribute");
+ return -ENOENT;
+ }
+ /* when working on vlan ranges this is the starting tunnel id */
+ tun_id = nla_get_u32(tun_tb[BRIDGE_VLANDB_TINFO_ID]);
+ /* vlan info attr is guaranteed by br_vlan_rtm_process_one */
+ vinfo = nla_data(tb[BRIDGE_VLANDB_ENTRY_INFO]);
+ /* tunnel ids are mapped to each vlan in increasing order,
+ * the starting vlan is in BRIDGE_VLANDB_ENTRY_INFO and v is the
+ * current vlan, so we compute: tun_id + v - vinfo->vid
+ */
+ tun_id += v->vid - vinfo->vid;
+ break;
+ case RTM_DELLINK:
+ break;
+ default:
+ NL_SET_ERR_MSG_MOD(extack, "Unsupported tunnel command");
+ return -EINVAL;
+ }
+
+ return br_vlan_tunnel_info(p, cmd, v->vid, tun_id, changed);
+}
+
+static int br_vlan_process_one_opts(const struct net_bridge *br,
+ const struct net_bridge_port *p,
+ struct net_bridge_vlan_group *vg,
+ struct net_bridge_vlan *v,
+ struct nlattr **tb,
+ bool *changed,
+ struct netlink_ext_ack *extack)
+{
+ int err;
+
+ *changed = false;
+ if (tb[BRIDGE_VLANDB_ENTRY_STATE]) {
+ u8 state = nla_get_u8(tb[BRIDGE_VLANDB_ENTRY_STATE]);
+
+ err = br_vlan_modify_state(vg, v, state, changed, extack);
+ if (err)
+ return err;
+ }
+ if (tb[BRIDGE_VLANDB_ENTRY_TUNNEL_INFO]) {
+ err = br_vlan_modify_tunnel(p, v, tb, changed, extack);
+ if (err)
+ return err;
+ }
+
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+ if (tb[BRIDGE_VLANDB_ENTRY_MCAST_ROUTER]) {
+ u8 val;
+
+ val = nla_get_u8(tb[BRIDGE_VLANDB_ENTRY_MCAST_ROUTER]);
+ err = br_multicast_set_vlan_router(v, val);
+ if (err)
+ return err;
+ *changed = true;
+ }
+ if (tb[BRIDGE_VLANDB_ENTRY_MCAST_MAX_GROUPS]) {
+ u32 val;
+
+ if (!p) {
+ NL_SET_ERR_MSG_MOD(extack, "Can't set mcast_max_groups for non-port vlans");
+ return -EINVAL;
+ }
+ if (br_multicast_port_ctx_vlan_disabled(&v->port_mcast_ctx)) {
+ NL_SET_ERR_MSG_MOD(extack, "Multicast snooping disabled on this VLAN");
+ return -EINVAL;
+ }
+
+ val = nla_get_u32(tb[BRIDGE_VLANDB_ENTRY_MCAST_MAX_GROUPS]);
+ br_multicast_ngroups_set_max(&v->port_mcast_ctx, val);
+ *changed = true;
+ }
+#endif
+
+ if (tb[BRIDGE_VLANDB_ENTRY_NEIGH_SUPPRESS]) {
+ bool enabled = v->priv_flags & BR_VLFLAG_NEIGH_SUPPRESS_ENABLED;
+ bool val = nla_get_u8(tb[BRIDGE_VLANDB_ENTRY_NEIGH_SUPPRESS]);
+
+ if (!p) {
+ NL_SET_ERR_MSG_MOD(extack, "Can't set neigh_suppress for non-port vlans");
+ return -EINVAL;
+ }
+
+ if (val != enabled) {
+ v->priv_flags ^= BR_VLFLAG_NEIGH_SUPPRESS_ENABLED;
+ *changed = true;
+ }
+ }
+
+ return 0;
+}
+
+int br_vlan_process_options(const struct net_bridge *br,
+ const struct net_bridge_port *p,
+ struct net_bridge_vlan *range_start,
+ struct net_bridge_vlan *range_end,
+ struct nlattr **tb,
+ struct netlink_ext_ack *extack)
+{
+ struct net_bridge_vlan *v, *curr_start = NULL, *curr_end = NULL;
+ struct net_bridge_vlan_group *vg;
+ int vid, err = 0;
+ u16 pvid;
+
+ if (p)
+ vg = nbp_vlan_group(p);
+ else
+ vg = br_vlan_group(br);
+
+ if (!range_start || !br_vlan_should_use(range_start)) {
+ NL_SET_ERR_MSG_MOD(extack, "Vlan range start doesn't exist, can't process options");
+ return -ENOENT;
+ }
+ if (!range_end || !br_vlan_should_use(range_end)) {
+ NL_SET_ERR_MSG_MOD(extack, "Vlan range end doesn't exist, can't process options");
+ return -ENOENT;
+ }
+
+ pvid = br_get_pvid(vg);
+ for (vid = range_start->vid; vid <= range_end->vid; vid++) {
+ bool changed = false;
+
+ v = br_vlan_find(vg, vid);
+ if (!v || !br_vlan_should_use(v)) {
+ NL_SET_ERR_MSG_MOD(extack, "Vlan in range doesn't exist, can't process options");
+ err = -ENOENT;
+ break;
+ }
+
+ err = br_vlan_process_one_opts(br, p, vg, v, tb, &changed,
+ extack);
+ if (err)
+ break;
+
+ if (changed) {
+ /* vlan options changed, check for range */
+ if (!curr_start) {
+ curr_start = v;
+ curr_end = v;
+ continue;
+ }
+
+ if (v->vid == pvid ||
+ !br_vlan_can_enter_range(v, curr_end)) {
+ br_vlan_notify(br, p, curr_start->vid,
+ curr_end->vid, RTM_NEWVLAN);
+ curr_start = v;
+ }
+ curr_end = v;
+ } else {
+ /* nothing changed and nothing to notify yet */
+ if (!curr_start)
+ continue;
+
+ br_vlan_notify(br, p, curr_start->vid, curr_end->vid,
+ RTM_NEWVLAN);
+ curr_start = NULL;
+ curr_end = NULL;
+ }
+ }
+ if (curr_start)
+ br_vlan_notify(br, p, curr_start->vid, curr_end->vid,
+ RTM_NEWVLAN);
+
+ return err;
+}
+
+bool br_vlan_global_opts_can_enter_range(const struct net_bridge_vlan *v_curr,
+ const struct net_bridge_vlan *r_end)
+{
+ return v_curr->vid - r_end->vid == 1 &&
+ v_curr->msti == r_end->msti &&
+ ((v_curr->priv_flags ^ r_end->priv_flags) &
+ BR_VLFLAG_GLOBAL_MCAST_ENABLED) == 0 &&
+ br_multicast_ctx_options_equal(&v_curr->br_mcast_ctx,
+ &r_end->br_mcast_ctx);
+}
+
+bool br_vlan_global_opts_fill(struct sk_buff *skb, u16 vid, u16 vid_range,
+ const struct net_bridge_vlan *v_opts)
+{
+ struct nlattr *nest2 __maybe_unused;
+ u64 clockval __maybe_unused;
+ struct nlattr *nest;
+
+ nest = nla_nest_start(skb, BRIDGE_VLANDB_GLOBAL_OPTIONS);
+ if (!nest)
+ return false;
+
+ if (nla_put_u16(skb, BRIDGE_VLANDB_GOPTS_ID, vid))
+ goto out_err;
+
+ if (vid_range && vid < vid_range &&
+ nla_put_u16(skb, BRIDGE_VLANDB_GOPTS_RANGE, vid_range))
+ goto out_err;
+
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+ if (nla_put_u8(skb, BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING,
+ !!(v_opts->priv_flags & BR_VLFLAG_GLOBAL_MCAST_ENABLED)) ||
+ nla_put_u8(skb, BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION,
+ v_opts->br_mcast_ctx.multicast_igmp_version) ||
+ nla_put_u32(skb, BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT,
+ v_opts->br_mcast_ctx.multicast_last_member_count) ||
+ nla_put_u32(skb, BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT,
+ v_opts->br_mcast_ctx.multicast_startup_query_count) ||
+ nla_put_u8(skb, BRIDGE_VLANDB_GOPTS_MCAST_QUERIER,
+ v_opts->br_mcast_ctx.multicast_querier) ||
+ br_multicast_dump_querier_state(skb, &v_opts->br_mcast_ctx,
+ BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_STATE))
+ goto out_err;
+
+ clockval = jiffies_to_clock_t(v_opts->br_mcast_ctx.multicast_last_member_interval);
+ if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_INTVL,
+ clockval, BRIDGE_VLANDB_GOPTS_PAD))
+ goto out_err;
+ clockval = jiffies_to_clock_t(v_opts->br_mcast_ctx.multicast_membership_interval);
+ if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_GOPTS_MCAST_MEMBERSHIP_INTVL,
+ clockval, BRIDGE_VLANDB_GOPTS_PAD))
+ goto out_err;
+ clockval = jiffies_to_clock_t(v_opts->br_mcast_ctx.multicast_querier_interval);
+ if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_INTVL,
+ clockval, BRIDGE_VLANDB_GOPTS_PAD))
+ goto out_err;
+ clockval = jiffies_to_clock_t(v_opts->br_mcast_ctx.multicast_query_interval);
+ if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_GOPTS_MCAST_QUERY_INTVL,
+ clockval, BRIDGE_VLANDB_GOPTS_PAD))
+ goto out_err;
+ clockval = jiffies_to_clock_t(v_opts->br_mcast_ctx.multicast_query_response_interval);
+ if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL,
+ clockval, BRIDGE_VLANDB_GOPTS_PAD))
+ goto out_err;
+ clockval = jiffies_to_clock_t(v_opts->br_mcast_ctx.multicast_startup_query_interval);
+ if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_INTVL,
+ clockval, BRIDGE_VLANDB_GOPTS_PAD))
+ goto out_err;
+
+ if (br_rports_have_mc_router(&v_opts->br_mcast_ctx)) {
+ nest2 = nla_nest_start(skb,
+ BRIDGE_VLANDB_GOPTS_MCAST_ROUTER_PORTS);
+ if (!nest2)
+ goto out_err;
+
+ rcu_read_lock();
+ if (br_rports_fill_info(skb, &v_opts->br_mcast_ctx)) {
+ rcu_read_unlock();
+ nla_nest_cancel(skb, nest2);
+ goto out_err;
+ }
+ rcu_read_unlock();
+
+ nla_nest_end(skb, nest2);
+ }
+
+#if IS_ENABLED(CONFIG_IPV6)
+ if (nla_put_u8(skb, BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION,
+ v_opts->br_mcast_ctx.multicast_mld_version))
+ goto out_err;
+#endif
+#endif
+
+ if (nla_put_u16(skb, BRIDGE_VLANDB_GOPTS_MSTI, v_opts->msti))
+ goto out_err;
+
+ nla_nest_end(skb, nest);
+
+ return true;
+
+out_err:
+ nla_nest_cancel(skb, nest);
+ return false;
+}
+
+static size_t rtnl_vlan_global_opts_nlmsg_size(const struct net_bridge_vlan *v)
+{
+ return NLMSG_ALIGN(sizeof(struct br_vlan_msg))
+ + nla_total_size(0) /* BRIDGE_VLANDB_GLOBAL_OPTIONS */
+ + nla_total_size(sizeof(u16)) /* BRIDGE_VLANDB_GOPTS_ID */
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+ + nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING */
+ + nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION */
+ + nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION */
+ + nla_total_size(sizeof(u32)) /* BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT */
+ + nla_total_size(sizeof(u32)) /* BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT */
+ + nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_INTVL */
+ + nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_MEMBERSHIP_INTVL */
+ + nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_INTVL */
+ + nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_QUERY_INTVL */
+ + nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL */
+ + nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_INTVL */
+ + nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_GOPTS_MCAST_QUERIER */
+ + br_multicast_querier_state_size() /* BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_STATE */
+ + nla_total_size(0) /* BRIDGE_VLANDB_GOPTS_MCAST_ROUTER_PORTS */
+ + br_rports_size(&v->br_mcast_ctx) /* BRIDGE_VLANDB_GOPTS_MCAST_ROUTER_PORTS */
+#endif
+ + nla_total_size(sizeof(u16)) /* BRIDGE_VLANDB_GOPTS_MSTI */
+ + nla_total_size(sizeof(u16)); /* BRIDGE_VLANDB_GOPTS_RANGE */
+}
+
+static void br_vlan_global_opts_notify(const struct net_bridge *br,
+ u16 vid, u16 vid_range)
+{
+ struct net_bridge_vlan *v;
+ struct br_vlan_msg *bvm;
+ struct nlmsghdr *nlh;
+ struct sk_buff *skb;
+ int err = -ENOBUFS;
+
+ /* right now notifications are done only with rtnl held */
+ ASSERT_RTNL();
+
+ /* need to find the vlan due to flags/options */
+ v = br_vlan_find(br_vlan_group(br), vid);
+ if (!v)
+ return;
+
+ skb = nlmsg_new(rtnl_vlan_global_opts_nlmsg_size(v), GFP_KERNEL);
+ if (!skb)
+ goto out_err;
+
+ err = -EMSGSIZE;
+ nlh = nlmsg_put(skb, 0, 0, RTM_NEWVLAN, sizeof(*bvm), 0);
+ if (!nlh)
+ goto out_err;
+ bvm = nlmsg_data(nlh);
+ memset(bvm, 0, sizeof(*bvm));
+ bvm->family = AF_BRIDGE;
+ bvm->ifindex = br->dev->ifindex;
+
+ if (!br_vlan_global_opts_fill(skb, vid, vid_range, v))
+ goto out_err;
+
+ nlmsg_end(skb, nlh);
+ rtnl_notify(skb, dev_net(br->dev), 0, RTNLGRP_BRVLAN, NULL, GFP_KERNEL);
+ return;
+
+out_err:
+ rtnl_set_sk_err(dev_net(br->dev), RTNLGRP_BRVLAN, err);
+ kfree_skb(skb);
+}
+
+static int br_vlan_process_global_one_opts(const struct net_bridge *br,
+ struct net_bridge_vlan_group *vg,
+ struct net_bridge_vlan *v,
+ struct nlattr **tb,
+ bool *changed,
+ struct netlink_ext_ack *extack)
+{
+ int err __maybe_unused;
+
+ *changed = false;
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+ if (tb[BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING]) {
+ u8 mc_snooping;
+
+ mc_snooping = nla_get_u8(tb[BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING]);
+ if (br_multicast_toggle_global_vlan(v, !!mc_snooping))
+ *changed = true;
+ }
+ if (tb[BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION]) {
+ u8 ver;
+
+ ver = nla_get_u8(tb[BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION]);
+ err = br_multicast_set_igmp_version(&v->br_mcast_ctx, ver);
+ if (err)
+ return err;
+ *changed = true;
+ }
+ if (tb[BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT]) {
+ u32 cnt;
+
+ cnt = nla_get_u32(tb[BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT]);
+ v->br_mcast_ctx.multicast_last_member_count = cnt;
+ *changed = true;
+ }
+ if (tb[BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT]) {
+ u32 cnt;
+
+ cnt = nla_get_u32(tb[BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT]);
+ v->br_mcast_ctx.multicast_startup_query_count = cnt;
+ *changed = true;
+ }
+ if (tb[BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_INTVL]) {
+ u64 val;
+
+ val = nla_get_u64(tb[BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_INTVL]);
+ v->br_mcast_ctx.multicast_last_member_interval = clock_t_to_jiffies(val);
+ *changed = true;
+ }
+ if (tb[BRIDGE_VLANDB_GOPTS_MCAST_MEMBERSHIP_INTVL]) {
+ u64 val;
+
+ val = nla_get_u64(tb[BRIDGE_VLANDB_GOPTS_MCAST_MEMBERSHIP_INTVL]);
+ v->br_mcast_ctx.multicast_membership_interval = clock_t_to_jiffies(val);
+ *changed = true;
+ }
+ if (tb[BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_INTVL]) {
+ u64 val;
+
+ val = nla_get_u64(tb[BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_INTVL]);
+ v->br_mcast_ctx.multicast_querier_interval = clock_t_to_jiffies(val);
+ *changed = true;
+ }
+ if (tb[BRIDGE_VLANDB_GOPTS_MCAST_QUERY_INTVL]) {
+ u64 val;
+
+ val = nla_get_u64(tb[BRIDGE_VLANDB_GOPTS_MCAST_QUERY_INTVL]);
+ br_multicast_set_query_intvl(&v->br_mcast_ctx, val);
+ *changed = true;
+ }
+ if (tb[BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL]) {
+ u64 val;
+
+ val = nla_get_u64(tb[BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL]);
+ v->br_mcast_ctx.multicast_query_response_interval = clock_t_to_jiffies(val);
+ *changed = true;
+ }
+ if (tb[BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_INTVL]) {
+ u64 val;
+
+ val = nla_get_u64(tb[BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_INTVL]);
+ br_multicast_set_startup_query_intvl(&v->br_mcast_ctx, val);
+ *changed = true;
+ }
+ if (tb[BRIDGE_VLANDB_GOPTS_MCAST_QUERIER]) {
+ u8 val;
+
+ val = nla_get_u8(tb[BRIDGE_VLANDB_GOPTS_MCAST_QUERIER]);
+ err = br_multicast_set_querier(&v->br_mcast_ctx, val);
+ if (err)
+ return err;
+ *changed = true;
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ if (tb[BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION]) {
+ u8 ver;
+
+ ver = nla_get_u8(tb[BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION]);
+ err = br_multicast_set_mld_version(&v->br_mcast_ctx, ver);
+ if (err)
+ return err;
+ *changed = true;
+ }
+#endif
+#endif
+ if (tb[BRIDGE_VLANDB_GOPTS_MSTI]) {
+ u16 msti;
+
+ msti = nla_get_u16(tb[BRIDGE_VLANDB_GOPTS_MSTI]);
+ err = br_mst_vlan_set_msti(v, msti);
+ if (err)
+ return err;
+ *changed = true;
+ }
+
+ return 0;
+}
+
+static const struct nla_policy br_vlan_db_gpol[BRIDGE_VLANDB_GOPTS_MAX + 1] = {
+ [BRIDGE_VLANDB_GOPTS_ID] = { .type = NLA_U16 },
+ [BRIDGE_VLANDB_GOPTS_RANGE] = { .type = NLA_U16 },
+ [BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING] = { .type = NLA_U8 },
+ [BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION] = { .type = NLA_U8 },
+ [BRIDGE_VLANDB_GOPTS_MCAST_QUERY_INTVL] = { .type = NLA_U64 },
+ [BRIDGE_VLANDB_GOPTS_MCAST_QUERIER] = { .type = NLA_U8 },
+ [BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION] = { .type = NLA_U8 },
+ [BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT] = { .type = NLA_U32 },
+ [BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT] = { .type = NLA_U32 },
+ [BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_INTVL] = { .type = NLA_U64 },
+ [BRIDGE_VLANDB_GOPTS_MCAST_MEMBERSHIP_INTVL] = { .type = NLA_U64 },
+ [BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_INTVL] = { .type = NLA_U64 },
+ [BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_INTVL] = { .type = NLA_U64 },
+ [BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL] = { .type = NLA_U64 },
+ [BRIDGE_VLANDB_GOPTS_MSTI] = NLA_POLICY_MAX(NLA_U16, VLAN_N_VID - 1),
+};
+
+int br_vlan_rtm_process_global_options(struct net_device *dev,
+ const struct nlattr *attr,
+ int cmd,
+ struct netlink_ext_ack *extack)
+{
+ struct net_bridge_vlan *v, *curr_start = NULL, *curr_end = NULL;
+ struct nlattr *tb[BRIDGE_VLANDB_GOPTS_MAX + 1];
+ struct net_bridge_vlan_group *vg;
+ u16 vid, vid_range = 0;
+ struct net_bridge *br;
+ int err = 0;
+
+ if (cmd != RTM_NEWVLAN) {
+ NL_SET_ERR_MSG_MOD(extack, "Global vlan options support only set operation");
+ return -EINVAL;
+ }
+ if (!netif_is_bridge_master(dev)) {
+ NL_SET_ERR_MSG_MOD(extack, "Global vlan options can only be set on bridge device");
+ return -EINVAL;
+ }
+ br = netdev_priv(dev);
+ vg = br_vlan_group(br);
+ if (WARN_ON(!vg))
+ return -ENODEV;
+
+ err = nla_parse_nested(tb, BRIDGE_VLANDB_GOPTS_MAX, attr,
+ br_vlan_db_gpol, extack);
+ if (err)
+ return err;
+
+ if (!tb[BRIDGE_VLANDB_GOPTS_ID]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing vlan entry id");
+ return -EINVAL;
+ }
+ vid = nla_get_u16(tb[BRIDGE_VLANDB_GOPTS_ID]);
+ if (!br_vlan_valid_id(vid, extack))
+ return -EINVAL;
+
+ if (tb[BRIDGE_VLANDB_GOPTS_RANGE]) {
+ vid_range = nla_get_u16(tb[BRIDGE_VLANDB_GOPTS_RANGE]);
+ if (!br_vlan_valid_id(vid_range, extack))
+ return -EINVAL;
+ if (vid >= vid_range) {
+ NL_SET_ERR_MSG_MOD(extack, "End vlan id is less than or equal to start vlan id");
+ return -EINVAL;
+ }
+ } else {
+ vid_range = vid;
+ }
+
+ for (; vid <= vid_range; vid++) {
+ bool changed = false;
+
+ v = br_vlan_find(vg, vid);
+ if (!v) {
+ NL_SET_ERR_MSG_MOD(extack, "Vlan in range doesn't exist, can't process global options");
+ err = -ENOENT;
+ break;
+ }
+
+ err = br_vlan_process_global_one_opts(br, vg, v, tb, &changed,
+ extack);
+ if (err)
+ break;
+
+ if (changed) {
+ /* vlan options changed, check for range */
+ if (!curr_start) {
+ curr_start = v;
+ curr_end = v;
+ continue;
+ }
+
+ if (!br_vlan_global_opts_can_enter_range(v, curr_end)) {
+ br_vlan_global_opts_notify(br, curr_start->vid,
+ curr_end->vid);
+ curr_start = v;
+ }
+ curr_end = v;
+ } else {
+ /* nothing changed and nothing to notify yet */
+ if (!curr_start)
+ continue;
+
+ br_vlan_global_opts_notify(br, curr_start->vid,
+ curr_end->vid);
+ curr_start = NULL;
+ curr_end = NULL;
+ }
+ }
+ if (curr_start)
+ br_vlan_global_opts_notify(br, curr_start->vid, curr_end->vid);
+
+ return err;
+}
diff --git a/net/bridge/br_vlan_tunnel.c b/net/bridge/br_vlan_tunnel.c
new file mode 100644
index 000000000000..a966a6ec8263
--- /dev/null
+++ b/net/bridge/br_vlan_tunnel.c
@@ -0,0 +1,227 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Bridge per vlan tunnel port dst_metadata handling code
+ *
+ * Authors:
+ * Roopa Prabhu <roopa@cumulusnetworks.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/slab.h>
+#include <net/switchdev.h>
+#include <net/dst_metadata.h>
+
+#include "br_private.h"
+#include "br_private_tunnel.h"
+
+static inline int br_vlan_tunid_cmp(struct rhashtable_compare_arg *arg,
+ const void *ptr)
+{
+ const struct net_bridge_vlan *vle = ptr;
+ __be64 tunid = *(__be64 *)arg->key;
+
+ return vle->tinfo.tunnel_id != tunid;
+}
+
+static const struct rhashtable_params br_vlan_tunnel_rht_params = {
+ .head_offset = offsetof(struct net_bridge_vlan, tnode),
+ .key_offset = offsetof(struct net_bridge_vlan, tinfo.tunnel_id),
+ .key_len = sizeof(__be64),
+ .nelem_hint = 3,
+ .obj_cmpfn = br_vlan_tunid_cmp,
+ .automatic_shrinking = true,
+};
+
+static struct net_bridge_vlan *br_vlan_tunnel_lookup(struct rhashtable *tbl,
+ __be64 tunnel_id)
+{
+ return rhashtable_lookup_fast(tbl, &tunnel_id,
+ br_vlan_tunnel_rht_params);
+}
+
+static void vlan_tunnel_info_release(struct net_bridge_vlan *vlan)
+{
+ struct metadata_dst *tdst = rtnl_dereference(vlan->tinfo.tunnel_dst);
+
+ WRITE_ONCE(vlan->tinfo.tunnel_id, 0);
+ RCU_INIT_POINTER(vlan->tinfo.tunnel_dst, NULL);
+ dst_release(&tdst->dst);
+}
+
+void vlan_tunnel_info_del(struct net_bridge_vlan_group *vg,
+ struct net_bridge_vlan *vlan)
+{
+ if (!rcu_access_pointer(vlan->tinfo.tunnel_dst))
+ return;
+ rhashtable_remove_fast(&vg->tunnel_hash, &vlan->tnode,
+ br_vlan_tunnel_rht_params);
+ vlan_tunnel_info_release(vlan);
+}
+
+static int __vlan_tunnel_info_add(struct net_bridge_vlan_group *vg,
+ struct net_bridge_vlan *vlan, u32 tun_id)
+{
+ struct metadata_dst *metadata = rtnl_dereference(vlan->tinfo.tunnel_dst);
+ __be64 key = key32_to_tunnel_id(cpu_to_be32(tun_id));
+ IP_TUNNEL_DECLARE_FLAGS(flags) = { };
+ int err;
+
+ if (metadata)
+ return -EEXIST;
+
+ __set_bit(IP_TUNNEL_KEY_BIT, flags);
+ metadata = __ip_tun_set_dst(0, 0, 0, 0, 0, flags, key, 0);
+ if (!metadata)
+ return -EINVAL;
+
+ metadata->u.tun_info.mode |= IP_TUNNEL_INFO_TX | IP_TUNNEL_INFO_BRIDGE;
+ rcu_assign_pointer(vlan->tinfo.tunnel_dst, metadata);
+ WRITE_ONCE(vlan->tinfo.tunnel_id, key);
+
+ err = rhashtable_lookup_insert_fast(&vg->tunnel_hash, &vlan->tnode,
+ br_vlan_tunnel_rht_params);
+ if (err)
+ goto out;
+
+ return 0;
+out:
+ vlan_tunnel_info_release(vlan);
+
+ return err;
+}
+
+/* Must be protected by RTNL.
+ * Must be called with vid in range from 1 to 4094 inclusive.
+ */
+int nbp_vlan_tunnel_info_add(const struct net_bridge_port *port, u16 vid,
+ u32 tun_id)
+{
+ struct net_bridge_vlan_group *vg;
+ struct net_bridge_vlan *vlan;
+
+ ASSERT_RTNL();
+
+ vg = nbp_vlan_group(port);
+ vlan = br_vlan_find(vg, vid);
+ if (!vlan)
+ return -EINVAL;
+
+ return __vlan_tunnel_info_add(vg, vlan, tun_id);
+}
+
+/* Must be protected by RTNL.
+ * Must be called with vid in range from 1 to 4094 inclusive.
+ */
+int nbp_vlan_tunnel_info_delete(const struct net_bridge_port *port, u16 vid)
+{
+ struct net_bridge_vlan_group *vg;
+ struct net_bridge_vlan *v;
+
+ ASSERT_RTNL();
+
+ vg = nbp_vlan_group(port);
+ v = br_vlan_find(vg, vid);
+ if (!v)
+ return -ENOENT;
+
+ vlan_tunnel_info_del(vg, v);
+
+ return 0;
+}
+
+static void __vlan_tunnel_info_flush(struct net_bridge_vlan_group *vg)
+{
+ struct net_bridge_vlan *vlan, *tmp;
+
+ list_for_each_entry_safe(vlan, tmp, &vg->vlan_list, vlist)
+ vlan_tunnel_info_del(vg, vlan);
+}
+
+void nbp_vlan_tunnel_info_flush(struct net_bridge_port *port)
+{
+ struct net_bridge_vlan_group *vg;
+
+ ASSERT_RTNL();
+
+ vg = nbp_vlan_group(port);
+ __vlan_tunnel_info_flush(vg);
+}
+
+int vlan_tunnel_init(struct net_bridge_vlan_group *vg)
+{
+ return rhashtable_init(&vg->tunnel_hash, &br_vlan_tunnel_rht_params);
+}
+
+void vlan_tunnel_deinit(struct net_bridge_vlan_group *vg)
+{
+ rhashtable_destroy(&vg->tunnel_hash);
+}
+
+void br_handle_ingress_vlan_tunnel(struct sk_buff *skb,
+ struct net_bridge_port *p,
+ struct net_bridge_vlan_group *vg)
+{
+ struct ip_tunnel_info *tinfo = skb_tunnel_info(skb);
+ struct net_bridge_vlan *vlan;
+
+ if (!vg || !tinfo)
+ return;
+
+ /* if already tagged, ignore */
+ if (skb_vlan_tagged(skb))
+ return;
+
+ /* lookup vid, given tunnel id */
+ vlan = br_vlan_tunnel_lookup(&vg->tunnel_hash, tinfo->key.tun_id);
+ if (!vlan)
+ return;
+
+ skb_dst_drop(skb);
+
+ __vlan_hwaccel_put_tag(skb, p->br->vlan_proto, vlan->vid);
+}
+
+int br_handle_egress_vlan_tunnel(struct sk_buff *skb,
+ struct net_bridge_vlan *vlan)
+{
+ IP_TUNNEL_DECLARE_FLAGS(flags) = { };
+ struct metadata_dst *tunnel_dst;
+ __be64 tunnel_id;
+ int err;
+
+ if (!vlan)
+ return 0;
+
+ tunnel_id = READ_ONCE(vlan->tinfo.tunnel_id);
+ if (!tunnel_id || unlikely(!skb_vlan_tag_present(skb)))
+ return 0;
+
+ skb_dst_drop(skb);
+ err = skb_vlan_pop(skb);
+ if (err)
+ return err;
+
+ if (BR_INPUT_SKB_CB(skb)->backup_nhid) {
+ __set_bit(IP_TUNNEL_KEY_BIT, flags);
+ tunnel_dst = __ip_tun_set_dst(0, 0, 0, 0, 0, flags,
+ tunnel_id, 0);
+ if (!tunnel_dst)
+ return -ENOMEM;
+
+ tunnel_dst->u.tun_info.mode |= IP_TUNNEL_INFO_TX |
+ IP_TUNNEL_INFO_BRIDGE;
+ tunnel_dst->u.tun_info.key.nhid =
+ BR_INPUT_SKB_CB(skb)->backup_nhid;
+ skb_dst_set(skb, &tunnel_dst->dst);
+
+ return 0;
+ }
+
+ tunnel_dst = rcu_dereference(vlan->tinfo.tunnel_dst);
+ if (tunnel_dst && dst_hold_safe(&tunnel_dst->dst))
+ skb_dst_set(skb, &tunnel_dst->dst);
+
+ return 0;
+}
diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig
index b84fc6075fe1..4fd5a6ea26b4 100644
--- a/net/bridge/netfilter/Kconfig
+++ b/net/bridge/netfilter/Kconfig
@@ -1,22 +1,72 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# Bridge netfilter configuration
#
+#
+menuconfig NF_TABLES_BRIDGE
+ depends on BRIDGE && NETFILTER && NF_TABLES
+ select NETFILTER_FAMILY_BRIDGE
+ tristate "Ethernet Bridge nf_tables support"
+
+if NF_TABLES_BRIDGE
+
+config NFT_BRIDGE_META
+ tristate "Netfilter nf_table bridge meta support"
+ help
+ Add support for bridge dedicated meta key.
+
+config NFT_BRIDGE_REJECT
+ tristate "Netfilter nf_tables bridge reject support"
+ depends on NFT_REJECT
+ depends on NF_REJECT_IPV4
+ depends on NF_REJECT_IPV6
+ help
+ Add support to reject packets.
+
+endif # NF_TABLES_BRIDGE
-menu "Bridge: Netfilter Configuration"
- depends on BRIDGE && NETFILTER
+config NF_CONNTRACK_BRIDGE
+ tristate "IPv4/IPV6 bridge connection tracking support"
+ depends on NF_CONNTRACK
+ default n
+ help
+ Connection tracking keeps a record of what packets have passed
+ through your machine, in order to figure out how they are related
+ into connections. This is used to enhance packet filtering via
+ stateful policies. Enable this if you want native tracking from
+ the bridge. This provides a replacement for the `br_netfilter'
+ infrastructure.
+
+ To compile it as a module, choose M here. If unsure, say N.
-config BRIDGE_NF_EBTABLES
+# old sockopt interface and eval loop
+config BRIDGE_NF_EBTABLES_LEGACY
+ tristate "Legacy EBTABLES support"
+ depends on BRIDGE && NETFILTER_XTABLES_LEGACY
+ depends on NETFILTER_XTABLES
+ default n
+ help
+ Legacy ebtables packet/frame classifier.
+ This is not needed if you are using ebtables over nftables
+ (iptables-nft).
+
+menuconfig BRIDGE_NF_EBTABLES
tristate "Ethernet Bridge tables (ebtables) support"
+ depends on BRIDGE && NETFILTER && NETFILTER_XTABLES
+ select NETFILTER_FAMILY_BRIDGE
help
ebtables is a general, extensible frame/packet identification
framework. Say 'Y' or 'M' here if you want to do Ethernet
filtering/NAT/brouting on the Ethernet bridge.
+
+if BRIDGE_NF_EBTABLES
+
#
# tables
#
config BRIDGE_EBT_BROUTE
tristate "ebt: broute table support"
- depends on BRIDGE_NF_EBTABLES
+ depends on BRIDGE_NF_EBTABLES_LEGACY
help
The ebtables broute table is used to define rules that decide between
bridging and routing frames, giving Linux the functionality of a
@@ -27,7 +77,7 @@ config BRIDGE_EBT_BROUTE
config BRIDGE_EBT_T_FILTER
tristate "ebt: filter table support"
- depends on BRIDGE_NF_EBTABLES
+ depends on BRIDGE_NF_EBTABLES_LEGACY
help
The ebtables filter table is used to define frame filtering rules at
local input, forwarding and local output. See the man page for
@@ -37,7 +87,7 @@ config BRIDGE_EBT_T_FILTER
config BRIDGE_EBT_T_NAT
tristate "ebt: nat table support"
- depends on BRIDGE_NF_EBTABLES
+ depends on BRIDGE_NF_EBTABLES_LEGACY
help
The ebtables nat table is used to define rules that alter the MAC
source address (MAC SNAT) or the MAC destination address (MAC DNAT).
@@ -49,7 +99,6 @@ config BRIDGE_EBT_T_NAT
#
config BRIDGE_EBT_802_3
tristate "ebt: 802.3 filter support"
- depends on BRIDGE_NF_EBTABLES
help
This option adds matching support for 802.3 Ethernet frames.
@@ -57,7 +106,6 @@ config BRIDGE_EBT_802_3
config BRIDGE_EBT_AMONG
tristate "ebt: among filter support"
- depends on BRIDGE_NF_EBTABLES
help
This option adds the among match, which allows matching the MAC source
and/or destination address on a list of addresses. Optionally,
@@ -67,7 +115,6 @@ config BRIDGE_EBT_AMONG
config BRIDGE_EBT_ARP
tristate "ebt: ARP filter support"
- depends on BRIDGE_NF_EBTABLES
help
This option adds the ARP match, which allows ARP and RARP header field
filtering.
@@ -76,27 +123,33 @@ config BRIDGE_EBT_ARP
config BRIDGE_EBT_IP
tristate "ebt: IP filter support"
- depends on BRIDGE_NF_EBTABLES
help
This option adds the IP match, which allows basic IP header field
filtering.
To compile it as a module, choose M here. If unsure, say N.
+config BRIDGE_EBT_IP6
+ tristate "ebt: IP6 filter support"
+ depends on BRIDGE_NF_EBTABLES && IPV6
+ help
+ This option adds the IP6 match, which allows basic IPV6 header field
+ filtering.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
config BRIDGE_EBT_LIMIT
tristate "ebt: limit match support"
- depends on BRIDGE_NF_EBTABLES
help
This option adds the limit match, which allows you to control
the rate at which a rule can be matched. This match is the
equivalent of the iptables limit match.
If you want to compile it as a module, say M here and read
- <file:Documentation/kbuild/modules.txt>. If unsure, say `N'.
+ <file:Documentation/kbuild/modules.rst>. If unsure, say `N'.
config BRIDGE_EBT_MARK
tristate "ebt: mark filter support"
- depends on BRIDGE_NF_EBTABLES
help
This option adds the mark match, which allows matching frames based on
the 'nfmark' value in the frame. This can be set by the mark target.
@@ -107,7 +160,6 @@ config BRIDGE_EBT_MARK
config BRIDGE_EBT_PKTTYPE
tristate "ebt: packet type filter support"
- depends on BRIDGE_NF_EBTABLES
help
This option adds the packet type match, which allows matching on the
type of packet based on its Ethernet "class" (as determined by
@@ -118,7 +170,6 @@ config BRIDGE_EBT_PKTTYPE
config BRIDGE_EBT_STP
tristate "ebt: STP filter support"
- depends on BRIDGE_NF_EBTABLES
help
This option adds the Spanning Tree Protocol match, which
allows STP header field filtering.
@@ -127,7 +178,6 @@ config BRIDGE_EBT_STP
config BRIDGE_EBT_VLAN
tristate "ebt: 802.1Q VLAN filter support"
- depends on BRIDGE_NF_EBTABLES
help
This option adds the 802.1Q vlan match, which allows the filtering of
802.1Q vlan fields.
@@ -147,7 +197,6 @@ config BRIDGE_EBT_ARPREPLY
config BRIDGE_EBT_DNAT
tristate "ebt: dnat target support"
- depends on BRIDGE_NF_EBTABLES
help
This option adds the MAC DNAT target, which allows altering the MAC
destination address of frames.
@@ -156,7 +205,6 @@ config BRIDGE_EBT_DNAT
config BRIDGE_EBT_MARK_T
tristate "ebt: mark target support"
- depends on BRIDGE_NF_EBTABLES
help
This option adds the mark target, which allows marking frames by
setting the 'nfmark' value in the frame.
@@ -167,7 +215,6 @@ config BRIDGE_EBT_MARK_T
config BRIDGE_EBT_REDIRECT
tristate "ebt: redirect target support"
- depends on BRIDGE_NF_EBTABLES
help
This option adds the MAC redirect target, which allows altering the MAC
destination address of a frame to that of the device it arrived on.
@@ -176,7 +223,6 @@ config BRIDGE_EBT_REDIRECT
config BRIDGE_EBT_SNAT
tristate "ebt: snat target support"
- depends on BRIDGE_NF_EBTABLES
help
This option adds the MAC SNAT target, which allows altering the MAC
source address of frames.
@@ -187,7 +233,6 @@ config BRIDGE_EBT_SNAT
#
config BRIDGE_EBT_LOG
tristate "ebt: log support"
- depends on BRIDGE_NF_EBTABLES
help
This option adds the log watcher, that you can use in any rule
in any ebtables table. It records info about the frame header
@@ -195,21 +240,17 @@ config BRIDGE_EBT_LOG
To compile it as a module, choose M here. If unsure, say N.
-config BRIDGE_EBT_ULOG
- tristate "ebt: ulog support (OBSOLETE)"
- depends on BRIDGE_NF_EBTABLES
+config BRIDGE_EBT_NFLOG
+ tristate "ebt: nflog support"
help
- This option enables the old bridge-specific "ebt_ulog" implementation
- which has been obsoleted by the new "nfnetlink_log" code (see
- CONFIG_NETFILTER_NETLINK_LOG).
+ This option enables the nflog watcher, which allows to LOG
+ messages through the netfilter logging API, which can use
+ either the old LOG target, the old ULOG target or nfnetlink_log
+ as backend.
- This option adds the ulog watcher, that you can use in any rule
- in any ebtables table. The packet is passed to a userspace
- logging daemon using netlink multicast sockets. This differs
- from the log watcher in the sense that the complete packet is
- sent to userspace instead of a descriptive text and that
- netlink multicast sockets are used instead of the syslog.
+ This option adds the nflog watcher, that you can use in any rule
+ in any ebtables table.
To compile it as a module, choose M here. If unsure, say N.
-endmenu
+endif # BRIDGE_NF_EBTABLES
diff --git a/net/bridge/netfilter/Makefile b/net/bridge/netfilter/Makefile
index 905087e0d485..b9a1303da977 100644
--- a/net/bridge/netfilter/Makefile
+++ b/net/bridge/netfilter/Makefile
@@ -1,8 +1,15 @@
+# SPDX-License-Identifier: GPL-2.0
#
# Makefile for the netfilter modules for Link Layer filtering on a bridge.
#
-obj-$(CONFIG_BRIDGE_NF_EBTABLES) += ebtables.o
+obj-$(CONFIG_NFT_BRIDGE_META) += nft_meta_bridge.o
+obj-$(CONFIG_NFT_BRIDGE_REJECT) += nft_reject_bridge.o
+
+# connection tracking
+obj-$(CONFIG_NF_CONNTRACK_BRIDGE) += nf_conntrack_bridge.o
+
+obj-$(CONFIG_BRIDGE_NF_EBTABLES_LEGACY) += ebtables.o
# tables
obj-$(CONFIG_BRIDGE_EBT_BROUTE) += ebtable_broute.o
@@ -14,6 +21,7 @@ obj-$(CONFIG_BRIDGE_EBT_802_3) += ebt_802_3.o
obj-$(CONFIG_BRIDGE_EBT_AMONG) += ebt_among.o
obj-$(CONFIG_BRIDGE_EBT_ARP) += ebt_arp.o
obj-$(CONFIG_BRIDGE_EBT_IP) += ebt_ip.o
+obj-$(CONFIG_BRIDGE_EBT_IP6) += ebt_ip6.o
obj-$(CONFIG_BRIDGE_EBT_LIMIT) += ebt_limit.o
obj-$(CONFIG_BRIDGE_EBT_MARK) += ebt_mark_m.o
obj-$(CONFIG_BRIDGE_EBT_PKTTYPE) += ebt_pkttype.o
@@ -29,4 +37,4 @@ obj-$(CONFIG_BRIDGE_EBT_SNAT) += ebt_snat.o
# watchers
obj-$(CONFIG_BRIDGE_EBT_LOG) += ebt_log.o
-obj-$(CONFIG_BRIDGE_EBT_ULOG) += ebt_ulog.o
+obj-$(CONFIG_BRIDGE_EBT_NFLOG) += ebt_nflog.o
diff --git a/net/bridge/netfilter/ebt_802_3.c b/net/bridge/netfilter/ebt_802_3.c
index 9abbc09ccdc3..68c2519bdc52 100644
--- a/net/bridge/netfilter/ebt_802_3.c
+++ b/net/bridge/netfilter/ebt_802_3.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* 802_3
*
@@ -5,69 +6,74 @@
* Chris Vitale csv@bluetail.com
*
* May 2003
- *
+ *
*/
-
-#include <linux/netfilter_bridge/ebtables.h>
-#include <linux/netfilter_bridge/ebt_802_3.h>
#include <linux/module.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_bridge/ebtables.h>
+#include <linux/skbuff.h>
+#include <uapi/linux/netfilter_bridge/ebt_802_3.h>
-static int ebt_filter_802_3(const struct sk_buff *skb, const struct net_device *in,
- const struct net_device *out, const void *data, unsigned int datalen)
+static struct ebt_802_3_hdr *ebt_802_3_hdr(const struct sk_buff *skb)
{
- struct ebt_802_3_info *info = (struct ebt_802_3_info *)data;
- struct ebt_802_3_hdr *hdr = ebt_802_3_hdr(skb);
+ return (struct ebt_802_3_hdr *)skb_mac_header(skb);
+}
+
+static bool
+ebt_802_3_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct ebt_802_3_info *info = par->matchinfo;
+ const struct ebt_802_3_hdr *hdr = ebt_802_3_hdr(skb);
__be16 type = hdr->llc.ui.ctrl & IS_UI ? hdr->llc.ui.type : hdr->llc.ni.type;
if (info->bitmask & EBT_802_3_SAP) {
- if (FWINV(info->sap != hdr->llc.ui.ssap, EBT_802_3_SAP))
- return EBT_NOMATCH;
- if (FWINV(info->sap != hdr->llc.ui.dsap, EBT_802_3_SAP))
- return EBT_NOMATCH;
+ if (NF_INVF(info, EBT_802_3_SAP, info->sap != hdr->llc.ui.ssap))
+ return false;
+ if (NF_INVF(info, EBT_802_3_SAP, info->sap != hdr->llc.ui.dsap))
+ return false;
}
if (info->bitmask & EBT_802_3_TYPE) {
if (!(hdr->llc.ui.dsap == CHECK_TYPE && hdr->llc.ui.ssap == CHECK_TYPE))
- return EBT_NOMATCH;
- if (FWINV(info->type != type, EBT_802_3_TYPE))
- return EBT_NOMATCH;
+ return false;
+ if (NF_INVF(info, EBT_802_3_TYPE, info->type != type))
+ return false;
}
- return EBT_MATCH;
+ return true;
}
-static struct ebt_match filter_802_3;
-static int ebt_802_3_check(const char *tablename, unsigned int hookmask,
- const struct ebt_entry *e, void *data, unsigned int datalen)
+static int ebt_802_3_mt_check(const struct xt_mtchk_param *par)
{
- struct ebt_802_3_info *info = (struct ebt_802_3_info *)data;
+ const struct ebt_802_3_info *info = par->matchinfo;
- if (datalen < sizeof(struct ebt_802_3_info))
- return -EINVAL;
if (info->bitmask & ~EBT_802_3_MASK || info->invflags & ~EBT_802_3_MASK)
return -EINVAL;
return 0;
}
-static struct ebt_match filter_802_3 =
-{
- .name = EBT_802_3_MATCH,
- .match = ebt_filter_802_3,
- .check = ebt_802_3_check,
+static struct xt_match ebt_802_3_mt_reg __read_mostly = {
+ .name = "802_3",
+ .revision = 0,
+ .family = NFPROTO_BRIDGE,
+ .match = ebt_802_3_mt,
+ .checkentry = ebt_802_3_mt_check,
+ .matchsize = sizeof(struct ebt_802_3_info),
.me = THIS_MODULE,
};
static int __init ebt_802_3_init(void)
{
- return ebt_register_match(&filter_802_3);
+ return xt_register_match(&ebt_802_3_mt_reg);
}
static void __exit ebt_802_3_fini(void)
{
- ebt_unregister_match(&filter_802_3);
+ xt_unregister_match(&ebt_802_3_mt_reg);
}
module_init(ebt_802_3_init);
module_exit(ebt_802_3_fini);
+MODULE_DESCRIPTION("Ebtables: DSAP/SSAP field and SNAP type matching");
MODULE_LICENSE("GPL");
diff --git a/net/bridge/netfilter/ebt_among.c b/net/bridge/netfilter/ebt_among.c
index ce97c4285f9a..96f7243b6314 100644
--- a/net/bridge/netfilter/ebt_among.c
+++ b/net/bridge/netfilter/ebt_among.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* ebt_among
*
@@ -7,49 +8,46 @@
* August, 2003
*
*/
-
-#include <linux/netfilter_bridge/ebtables.h>
-#include <linux/netfilter_bridge/ebt_among.h>
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/ip.h>
#include <linux/if_arp.h>
#include <linux/module.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_bridge/ebtables.h>
+#include <linux/netfilter_bridge/ebt_among.h>
-static int ebt_mac_wormhash_contains(const struct ebt_mac_wormhash *wh,
- const char *mac, __be32 ip)
+static bool ebt_mac_wormhash_contains(const struct ebt_mac_wormhash *wh,
+ const char *mac, __be32 ip)
{
/* You may be puzzled as to how this code works.
- * Some tricks were used, refer to
+ * Some tricks were used, refer to
* include/linux/netfilter_bridge/ebt_among.h
* as there you can find a solution of this mystery.
*/
const struct ebt_mac_wormhash_tuple *p;
int start, limit, i;
uint32_t cmp[2] = { 0, 0 };
- int key = (const unsigned char) mac[5];
+ int key = ((const unsigned char *)mac)[5];
- memcpy(((char *) cmp) + 2, mac, 6);
+ ether_addr_copy(((char *) cmp) + 2, mac);
start = wh->table[key];
limit = wh->table[key + 1];
if (ip) {
for (i = start; i < limit; i++) {
p = &wh->pool[i];
- if (cmp[1] == p->cmp[1] && cmp[0] == p->cmp[0]) {
- if (p->ip == 0 || p->ip == ip) {
- return 1;
- }
- }
+ if (cmp[1] == p->cmp[1] && cmp[0] == p->cmp[0])
+ if (p->ip == 0 || p->ip == ip)
+ return true;
}
} else {
for (i = start; i < limit; i++) {
p = &wh->pool[i];
- if (cmp[1] == p->cmp[1] && cmp[0] == p->cmp[0]) {
- if (p->ip == 0) {
- return 1;
- }
- }
+ if (cmp[1] == p->cmp[1] && cmp[0] == p->cmp[0])
+ if (p->ip == 0)
+ return true;
}
}
- return 0;
+ return false;
}
static int ebt_mac_wormhash_check_integrity(const struct ebt_mac_wormhash
@@ -73,15 +71,18 @@ static int ebt_mac_wormhash_check_integrity(const struct ebt_mac_wormhash
static int get_ip_dst(const struct sk_buff *skb, __be32 *addr)
{
if (eth_hdr(skb)->h_proto == htons(ETH_P_IP)) {
- struct iphdr _iph, *ih;
+ const struct iphdr *ih;
+ struct iphdr _iph;
ih = skb_header_pointer(skb, 0, sizeof(_iph), &_iph);
if (ih == NULL)
return -1;
*addr = ih->daddr;
} else if (eth_hdr(skb)->h_proto == htons(ETH_P_ARP)) {
- struct arphdr _arph, *ah;
- __be32 buf, *bp;
+ const struct arphdr *ah;
+ struct arphdr _arph;
+ const __be32 *bp;
+ __be32 buf;
ah = skb_header_pointer(skb, 0, sizeof(_arph), &_arph);
if (ah == NULL ||
@@ -101,15 +102,18 @@ static int get_ip_dst(const struct sk_buff *skb, __be32 *addr)
static int get_ip_src(const struct sk_buff *skb, __be32 *addr)
{
if (eth_hdr(skb)->h_proto == htons(ETH_P_IP)) {
- struct iphdr _iph, *ih;
+ const struct iphdr *ih;
+ struct iphdr _iph;
ih = skb_header_pointer(skb, 0, sizeof(_iph), &_iph);
if (ih == NULL)
return -1;
*addr = ih->saddr;
} else if (eth_hdr(skb)->h_proto == htons(ETH_P_ARP)) {
- struct arphdr _arph, *ah;
- __be32 buf, *bp;
+ const struct arphdr *ah;
+ struct arphdr _arph;
+ const __be32 *bp;
+ __be32 buf;
ah = skb_header_pointer(skb, 0, sizeof(_arph), &_arph);
if (ah == NULL ||
@@ -125,12 +129,10 @@ static int get_ip_src(const struct sk_buff *skb, __be32 *addr)
return 0;
}
-static int ebt_filter_among(const struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out, const void *data,
- unsigned int datalen)
+static bool
+ebt_among_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
- struct ebt_among_info *info = (struct ebt_among_info *) data;
+ const struct ebt_among_info *info = par->matchinfo;
const char *dmac, *smac;
const struct ebt_mac_wormhash *wh_dst, *wh_src;
__be32 dip = 0, sip = 0;
@@ -141,88 +143,139 @@ static int ebt_filter_among(const struct sk_buff *skb,
if (wh_src) {
smac = eth_hdr(skb)->h_source;
if (get_ip_src(skb, &sip))
- return EBT_NOMATCH;
+ return false;
if (!(info->bitmask & EBT_AMONG_SRC_NEG)) {
/* we match only if it contains */
if (!ebt_mac_wormhash_contains(wh_src, smac, sip))
- return EBT_NOMATCH;
+ return false;
} else {
/* we match only if it DOES NOT contain */
if (ebt_mac_wormhash_contains(wh_src, smac, sip))
- return EBT_NOMATCH;
+ return false;
}
}
if (wh_dst) {
dmac = eth_hdr(skb)->h_dest;
if (get_ip_dst(skb, &dip))
- return EBT_NOMATCH;
+ return false;
if (!(info->bitmask & EBT_AMONG_DST_NEG)) {
/* we match only if it contains */
if (!ebt_mac_wormhash_contains(wh_dst, dmac, dip))
- return EBT_NOMATCH;
+ return false;
} else {
/* we match only if it DOES NOT contain */
if (ebt_mac_wormhash_contains(wh_dst, dmac, dip))
- return EBT_NOMATCH;
+ return false;
}
}
- return EBT_MATCH;
+ return true;
+}
+
+static bool poolsize_invalid(const struct ebt_mac_wormhash *w)
+{
+ return w && w->poolsize >= (INT_MAX / sizeof(struct ebt_mac_wormhash_tuple));
+}
+
+static bool wormhash_offset_invalid(int off, unsigned int len)
+{
+ if (off == 0) /* not present */
+ return false;
+
+ if (off < (int)sizeof(struct ebt_among_info) ||
+ off % __alignof__(struct ebt_mac_wormhash))
+ return true;
+
+ off += sizeof(struct ebt_mac_wormhash);
+
+ return off > len;
+}
+
+static bool wormhash_sizes_valid(const struct ebt_mac_wormhash *wh, int a, int b)
+{
+ if (a == 0)
+ a = sizeof(struct ebt_among_info);
+
+ return ebt_mac_wormhash_size(wh) + a == b;
}
-static int ebt_among_check(const char *tablename, unsigned int hookmask,
- const struct ebt_entry *e, void *data,
- unsigned int datalen)
+static int ebt_among_mt_check(const struct xt_mtchk_param *par)
{
- struct ebt_among_info *info = (struct ebt_among_info *) data;
- int expected_length = sizeof(struct ebt_among_info);
+ const struct ebt_among_info *info = par->matchinfo;
+ const struct ebt_entry_match *em =
+ container_of(par->matchinfo, const struct ebt_entry_match, data);
+ unsigned int expected_length = sizeof(struct ebt_among_info);
const struct ebt_mac_wormhash *wh_dst, *wh_src;
int err;
+ if (expected_length > em->match_size)
+ return -EINVAL;
+
+ if (wormhash_offset_invalid(info->wh_dst_ofs, em->match_size) ||
+ wormhash_offset_invalid(info->wh_src_ofs, em->match_size))
+ return -EINVAL;
+
wh_dst = ebt_among_wh_dst(info);
- wh_src = ebt_among_wh_src(info);
+ if (poolsize_invalid(wh_dst))
+ return -EINVAL;
+
expected_length += ebt_mac_wormhash_size(wh_dst);
+ if (expected_length > em->match_size)
+ return -EINVAL;
+
+ wh_src = ebt_among_wh_src(info);
+ if (poolsize_invalid(wh_src))
+ return -EINVAL;
+
+ if (info->wh_src_ofs < info->wh_dst_ofs) {
+ if (!wormhash_sizes_valid(wh_src, info->wh_src_ofs, info->wh_dst_ofs))
+ return -EINVAL;
+ } else {
+ if (!wormhash_sizes_valid(wh_dst, info->wh_dst_ofs, info->wh_src_ofs))
+ return -EINVAL;
+ }
+
expected_length += ebt_mac_wormhash_size(wh_src);
- if (datalen != EBT_ALIGN(expected_length)) {
- printk(KERN_WARNING
- "ebtables: among: wrong size: %d"
- "against expected %d, rounded to %Zd\n",
- datalen, expected_length,
- EBT_ALIGN(expected_length));
+ if (em->match_size != EBT_ALIGN(expected_length)) {
+ pr_err_ratelimited("wrong size: %d against expected %d, rounded to %zd\n",
+ em->match_size, expected_length,
+ EBT_ALIGN(expected_length));
return -EINVAL;
}
if (wh_dst && (err = ebt_mac_wormhash_check_integrity(wh_dst))) {
- printk(KERN_WARNING
- "ebtables: among: dst integrity fail: %x\n", -err);
+ pr_err_ratelimited("dst integrity fail: %x\n", -err);
return -EINVAL;
}
if (wh_src && (err = ebt_mac_wormhash_check_integrity(wh_src))) {
- printk(KERN_WARNING
- "ebtables: among: src integrity fail: %x\n", -err);
+ pr_err_ratelimited("src integrity fail: %x\n", -err);
return -EINVAL;
}
return 0;
}
-static struct ebt_match filter_among = {
- .name = EBT_AMONG_MATCH,
- .match = ebt_filter_among,
- .check = ebt_among_check,
+static struct xt_match ebt_among_mt_reg __read_mostly = {
+ .name = "among",
+ .revision = 0,
+ .family = NFPROTO_BRIDGE,
+ .match = ebt_among_mt,
+ .checkentry = ebt_among_mt_check,
+ .matchsize = -1, /* special case */
.me = THIS_MODULE,
};
static int __init ebt_among_init(void)
{
- return ebt_register_match(&filter_among);
+ return xt_register_match(&ebt_among_mt_reg);
}
static void __exit ebt_among_fini(void)
{
- ebt_unregister_match(&filter_among);
+ xt_unregister_match(&ebt_among_mt_reg);
}
module_init(ebt_among_init);
module_exit(ebt_among_fini);
+MODULE_DESCRIPTION("Ebtables: Combined MAC/IP address list matching");
MODULE_LICENSE("GPL");
diff --git a/net/bridge/netfilter/ebt_arp.c b/net/bridge/netfilter/ebt_arp.c
index 9c599800a900..0707cc00fe8f 100644
--- a/net/bridge/netfilter/ebt_arp.c
+++ b/net/bridge/netfilter/ebt_arp.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* ebt_arp
*
@@ -8,79 +9,77 @@
* April, 2002
*
*/
-
-#include <linux/netfilter_bridge/ebtables.h>
-#include <linux/netfilter_bridge/ebt_arp.h>
#include <linux/if_arp.h>
#include <linux/if_ether.h>
#include <linux/module.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_bridge/ebtables.h>
+#include <linux/netfilter_bridge/ebt_arp.h>
-static int ebt_filter_arp(const struct sk_buff *skb, const struct net_device *in,
- const struct net_device *out, const void *data, unsigned int datalen)
+static bool
+ebt_arp_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
- struct ebt_arp_info *info = (struct ebt_arp_info *)data;
- struct arphdr _arph, *ah;
+ const struct ebt_arp_info *info = par->matchinfo;
+ const struct arphdr *ah;
+ struct arphdr _arph;
ah = skb_header_pointer(skb, 0, sizeof(_arph), &_arph);
if (ah == NULL)
- return EBT_NOMATCH;
- if (info->bitmask & EBT_ARP_OPCODE && FWINV(info->opcode !=
- ah->ar_op, EBT_ARP_OPCODE))
- return EBT_NOMATCH;
- if (info->bitmask & EBT_ARP_HTYPE && FWINV(info->htype !=
- ah->ar_hrd, EBT_ARP_HTYPE))
- return EBT_NOMATCH;
- if (info->bitmask & EBT_ARP_PTYPE && FWINV(info->ptype !=
- ah->ar_pro, EBT_ARP_PTYPE))
- return EBT_NOMATCH;
-
- if (info->bitmask & (EBT_ARP_SRC_IP | EBT_ARP_DST_IP)) {
- __be32 _addr, *ap;
-
- /* IPv4 addresses are always 4 bytes */
- if (ah->ar_pln != sizeof(__be32))
- return EBT_NOMATCH;
- if (info->bitmask & EBT_ARP_SRC_IP) {
- ap = skb_header_pointer(skb, sizeof(struct arphdr) +
- ah->ar_hln, sizeof(_addr),
- &_addr);
- if (ap == NULL)
- return EBT_NOMATCH;
- if (FWINV(info->saddr != (*ap & info->smsk),
- EBT_ARP_SRC_IP))
- return EBT_NOMATCH;
- }
-
- if (info->bitmask & EBT_ARP_DST_IP) {
- ap = skb_header_pointer(skb, sizeof(struct arphdr) +
- 2*ah->ar_hln+sizeof(__be32),
- sizeof(_addr), &_addr);
- if (ap == NULL)
- return EBT_NOMATCH;
- if (FWINV(info->daddr != (*ap & info->dmsk),
- EBT_ARP_DST_IP))
- return EBT_NOMATCH;
- }
+ return false;
+ if ((info->bitmask & EBT_ARP_OPCODE) &&
+ NF_INVF(info, EBT_ARP_OPCODE, info->opcode != ah->ar_op))
+ return false;
+ if ((info->bitmask & EBT_ARP_HTYPE) &&
+ NF_INVF(info, EBT_ARP_HTYPE, info->htype != ah->ar_hrd))
+ return false;
+ if ((info->bitmask & EBT_ARP_PTYPE) &&
+ NF_INVF(info, EBT_ARP_PTYPE, info->ptype != ah->ar_pro))
+ return false;
+
+ if (info->bitmask & (EBT_ARP_SRC_IP | EBT_ARP_DST_IP | EBT_ARP_GRAT)) {
+ const __be32 *sap, *dap;
+ __be32 saddr, daddr;
+
+ if (ah->ar_pln != sizeof(__be32) || ah->ar_pro != htons(ETH_P_IP))
+ return false;
+ sap = skb_header_pointer(skb, sizeof(struct arphdr) +
+ ah->ar_hln, sizeof(saddr),
+ &saddr);
+ if (sap == NULL)
+ return false;
+ dap = skb_header_pointer(skb, sizeof(struct arphdr) +
+ 2*ah->ar_hln+sizeof(saddr),
+ sizeof(daddr), &daddr);
+ if (dap == NULL)
+ return false;
+ if ((info->bitmask & EBT_ARP_SRC_IP) &&
+ NF_INVF(info, EBT_ARP_SRC_IP,
+ info->saddr != (*sap & info->smsk)))
+ return false;
+ if ((info->bitmask & EBT_ARP_DST_IP) &&
+ NF_INVF(info, EBT_ARP_DST_IP,
+ info->daddr != (*dap & info->dmsk)))
+ return false;
+ if ((info->bitmask & EBT_ARP_GRAT) &&
+ NF_INVF(info, EBT_ARP_GRAT, *dap != *sap))
+ return false;
}
if (info->bitmask & (EBT_ARP_SRC_MAC | EBT_ARP_DST_MAC)) {
- unsigned char _mac[ETH_ALEN], *mp;
- uint8_t verdict, i;
+ const unsigned char *mp;
+ unsigned char _mac[ETH_ALEN];
- /* MAC addresses are 6 bytes */
- if (ah->ar_hln != ETH_ALEN)
- return EBT_NOMATCH;
+ if (ah->ar_hln != ETH_ALEN || ah->ar_hrd != htons(ARPHRD_ETHER))
+ return false;
if (info->bitmask & EBT_ARP_SRC_MAC) {
mp = skb_header_pointer(skb, sizeof(struct arphdr),
sizeof(_mac), &_mac);
if (mp == NULL)
- return EBT_NOMATCH;
- verdict = 0;
- for (i = 0; i < 6; i++)
- verdict |= (mp[i] ^ info->smaddr[i]) &
- info->smmsk[i];
- if (FWINV(verdict != 0, EBT_ARP_SRC_MAC))
- return EBT_NOMATCH;
+ return false;
+ if (NF_INVF(info, EBT_ARP_SRC_MAC,
+ !ether_addr_equal_masked(mp, info->smaddr,
+ info->smmsk)))
+ return false;
}
if (info->bitmask & EBT_ARP_DST_MAC) {
@@ -88,26 +87,22 @@ static int ebt_filter_arp(const struct sk_buff *skb, const struct net_device *in
ah->ar_hln + ah->ar_pln,
sizeof(_mac), &_mac);
if (mp == NULL)
- return EBT_NOMATCH;
- verdict = 0;
- for (i = 0; i < 6; i++)
- verdict |= (mp[i] ^ info->dmaddr[i]) &
- info->dmmsk[i];
- if (FWINV(verdict != 0, EBT_ARP_DST_MAC))
- return EBT_NOMATCH;
+ return false;
+ if (NF_INVF(info, EBT_ARP_DST_MAC,
+ !ether_addr_equal_masked(mp, info->dmaddr,
+ info->dmmsk)))
+ return false;
}
}
- return EBT_MATCH;
+ return true;
}
-static int ebt_arp_check(const char *tablename, unsigned int hookmask,
- const struct ebt_entry *e, void *data, unsigned int datalen)
+static int ebt_arp_mt_check(const struct xt_mtchk_param *par)
{
- struct ebt_arp_info *info = (struct ebt_arp_info *)data;
+ const struct ebt_arp_info *info = par->matchinfo;
+ const struct ebt_entry *e = par->entryinfo;
- if (datalen != EBT_ALIGN(sizeof(struct ebt_arp_info)))
- return -EINVAL;
if ((e->ethproto != htons(ETH_P_ARP) &&
e->ethproto != htons(ETH_P_RARP)) ||
e->invflags & EBT_IPROTO)
@@ -117,24 +112,27 @@ static int ebt_arp_check(const char *tablename, unsigned int hookmask,
return 0;
}
-static struct ebt_match filter_arp =
-{
- .name = EBT_ARP_MATCH,
- .match = ebt_filter_arp,
- .check = ebt_arp_check,
+static struct xt_match ebt_arp_mt_reg __read_mostly = {
+ .name = "arp",
+ .revision = 0,
+ .family = NFPROTO_BRIDGE,
+ .match = ebt_arp_mt,
+ .checkentry = ebt_arp_mt_check,
+ .matchsize = sizeof(struct ebt_arp_info),
.me = THIS_MODULE,
};
static int __init ebt_arp_init(void)
{
- return ebt_register_match(&filter_arp);
+ return xt_register_match(&ebt_arp_mt_reg);
}
static void __exit ebt_arp_fini(void)
{
- ebt_unregister_match(&filter_arp);
+ xt_unregister_match(&ebt_arp_mt_reg);
}
module_init(ebt_arp_init);
module_exit(ebt_arp_fini);
+MODULE_DESCRIPTION("Ebtables: ARP protocol packet match");
MODULE_LICENSE("GPL");
diff --git a/net/bridge/netfilter/ebt_arpreply.c b/net/bridge/netfilter/ebt_arpreply.c
index 0aa7b9910a86..d9e77e2500cd 100644
--- a/net/bridge/netfilter/ebt_arpreply.c
+++ b/net/bridge/netfilter/ebt_arpreply.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* ebt_arpreply
*
@@ -8,22 +9,23 @@
* August, 2003
*
*/
-
-#include <linux/netfilter_bridge/ebtables.h>
-#include <linux/netfilter_bridge/ebt_arpreply.h>
#include <linux/if_arp.h>
#include <net/arp.h>
#include <linux/module.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_bridge/ebtables.h>
+#include <linux/netfilter_bridge/ebt_arpreply.h>
-static int ebt_target_reply(struct sk_buff **pskb, unsigned int hooknr,
- const struct net_device *in, const struct net_device *out,
- const void *data, unsigned int datalen)
+static unsigned int
+ebt_arpreply_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
- struct ebt_arpreply_info *info = (struct ebt_arpreply_info *)data;
- __be32 _sip, *siptr, _dip, *diptr;
- struct arphdr _ah, *ap;
- unsigned char _sha[ETH_ALEN], *shp;
- struct sk_buff *skb = *pskb;
+ const struct ebt_arpreply_info *info = par->targinfo;
+ const __be32 *siptr, *diptr;
+ __be32 _sip, _dip;
+ const struct arphdr *ap;
+ struct arphdr _ah;
+ const unsigned char *shp;
+ unsigned char _sha[ETH_ALEN];
ap = skb_header_pointer(skb, 0, sizeof(_ah), &_ah);
if (ap == NULL)
@@ -50,48 +52,52 @@ static int ebt_target_reply(struct sk_buff **pskb, unsigned int hooknr,
if (diptr == NULL)
return EBT_DROP;
- arp_send(ARPOP_REPLY, ETH_P_ARP, *siptr, (struct net_device *)in,
- *diptr, shp, info->mac, shp);
+ arp_send(ARPOP_REPLY, ETH_P_ARP, *siptr,
+ (struct net_device *)xt_in(par),
+ *diptr, shp, info->mac, shp);
return info->target;
}
-static int ebt_target_reply_check(const char *tablename, unsigned int hookmask,
- const struct ebt_entry *e, void *data, unsigned int datalen)
+static int ebt_arpreply_tg_check(const struct xt_tgchk_param *par)
{
- struct ebt_arpreply_info *info = (struct ebt_arpreply_info *)data;
+ const struct ebt_arpreply_info *info = par->targinfo;
+ const struct ebt_entry *e = par->entryinfo;
- if (datalen != EBT_ALIGN(sizeof(struct ebt_arpreply_info)))
- return -EINVAL;
if (BASE_CHAIN && info->target == EBT_RETURN)
return -EINVAL;
if (e->ethproto != htons(ETH_P_ARP) ||
e->invflags & EBT_IPROTO)
return -EINVAL;
- CLEAR_BASE_CHAIN_BIT;
- if (strcmp(tablename, "nat") || hookmask & ~(1 << NF_BR_PRE_ROUTING))
+ if (ebt_invalid_target(info->target))
return -EINVAL;
+
return 0;
}
-static struct ebt_target reply_target =
-{
- .name = EBT_ARPREPLY_TARGET,
- .target = ebt_target_reply,
- .check = ebt_target_reply_check,
+static struct xt_target ebt_arpreply_tg_reg __read_mostly = {
+ .name = "arpreply",
+ .revision = 0,
+ .family = NFPROTO_BRIDGE,
+ .table = "nat",
+ .hooks = (1 << NF_BR_NUMHOOKS) | (1 << NF_BR_PRE_ROUTING),
+ .target = ebt_arpreply_tg,
+ .checkentry = ebt_arpreply_tg_check,
+ .targetsize = sizeof(struct ebt_arpreply_info),
.me = THIS_MODULE,
};
static int __init ebt_arpreply_init(void)
{
- return ebt_register_target(&reply_target);
+ return xt_register_target(&ebt_arpreply_tg_reg);
}
static void __exit ebt_arpreply_fini(void)
{
- ebt_unregister_target(&reply_target);
+ xt_unregister_target(&ebt_arpreply_tg_reg);
}
module_init(ebt_arpreply_init);
module_exit(ebt_arpreply_fini);
+MODULE_DESCRIPTION("Ebtables: ARP reply target");
MODULE_LICENSE("GPL");
diff --git a/net/bridge/netfilter/ebt_dnat.c b/net/bridge/netfilter/ebt_dnat.c
index 4582659dff0e..3fda71a8579d 100644
--- a/net/bridge/netfilter/ebt_dnat.c
+++ b/net/bridge/netfilter/ebt_dnat.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* ebt_dnat
*
@@ -7,70 +8,99 @@
* June, 2002
*
*/
-
-#include <linux/netfilter_bridge/ebtables.h>
-#include <linux/netfilter_bridge/ebt_nat.h>
#include <linux/module.h>
#include <net/sock.h>
+#include "../br_private.h"
+#include <linux/netfilter.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_bridge/ebtables.h>
+#include <linux/netfilter_bridge/ebt_nat.h>
-static int ebt_target_dnat(struct sk_buff **pskb, unsigned int hooknr,
- const struct net_device *in, const struct net_device *out,
- const void *data, unsigned int datalen)
+static unsigned int
+ebt_dnat_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
- struct ebt_nat_info *info = (struct ebt_nat_info *)data;
+ const struct ebt_nat_info *info = par->targinfo;
+
+ if (skb_ensure_writable(skb, 0))
+ return EBT_DROP;
+
+ ether_addr_copy(eth_hdr(skb)->h_dest, info->mac);
- if (skb_shared(*pskb) || skb_cloned(*pskb)) {
- struct sk_buff *nskb;
+ if (is_multicast_ether_addr(info->mac)) {
+ if (is_broadcast_ether_addr(info->mac))
+ skb->pkt_type = PACKET_BROADCAST;
+ else
+ skb->pkt_type = PACKET_MULTICAST;
+ } else {
+ const struct net_device *dev;
- nskb = skb_copy(*pskb, GFP_ATOMIC);
- if (!nskb)
- return NF_DROP;
- if ((*pskb)->sk)
- skb_set_owner_w(nskb, (*pskb)->sk);
- kfree_skb(*pskb);
- *pskb = nskb;
+ switch (xt_hooknum(par)) {
+ case NF_BR_BROUTING:
+ dev = xt_in(par);
+ break;
+ case NF_BR_PRE_ROUTING:
+ dev = br_port_get_rcu(xt_in(par))->br->dev;
+ break;
+ default:
+ dev = NULL;
+ break;
+ }
+
+ if (!dev) /* NF_BR_LOCAL_OUT */
+ return info->target;
+
+ if (ether_addr_equal(info->mac, dev->dev_addr))
+ skb->pkt_type = PACKET_HOST;
+ else
+ skb->pkt_type = PACKET_OTHERHOST;
}
- memcpy(eth_hdr(*pskb)->h_dest, info->mac, ETH_ALEN);
+
return info->target;
}
-static int ebt_target_dnat_check(const char *tablename, unsigned int hookmask,
- const struct ebt_entry *e, void *data, unsigned int datalen)
+static int ebt_dnat_tg_check(const struct xt_tgchk_param *par)
{
- struct ebt_nat_info *info = (struct ebt_nat_info *)data;
+ const struct ebt_nat_info *info = par->targinfo;
+ unsigned int hook_mask;
if (BASE_CHAIN && info->target == EBT_RETURN)
return -EINVAL;
- CLEAR_BASE_CHAIN_BIT;
- if ( (strcmp(tablename, "nat") ||
- (hookmask & ~((1 << NF_BR_PRE_ROUTING) | (1 << NF_BR_LOCAL_OUT)))) &&
- (strcmp(tablename, "broute") || hookmask & ~(1 << NF_BR_BROUTING)) )
- return -EINVAL;
- if (datalen != EBT_ALIGN(sizeof(struct ebt_nat_info)))
+
+ hook_mask = par->hook_mask & ~(1 << NF_BR_NUMHOOKS);
+ if ((strcmp(par->table, "nat") != 0 ||
+ (hook_mask & ~((1 << NF_BR_PRE_ROUTING) |
+ (1 << NF_BR_LOCAL_OUT)))) &&
+ (strcmp(par->table, "broute") != 0 ||
+ hook_mask & ~(1 << NF_BR_BROUTING)))
return -EINVAL;
- if (INVALID_TARGET)
+ if (ebt_invalid_target(info->target))
return -EINVAL;
return 0;
}
-static struct ebt_target dnat =
-{
- .name = EBT_DNAT_TARGET,
- .target = ebt_target_dnat,
- .check = ebt_target_dnat_check,
+static struct xt_target ebt_dnat_tg_reg __read_mostly = {
+ .name = "dnat",
+ .revision = 0,
+ .family = NFPROTO_BRIDGE,
+ .hooks = (1 << NF_BR_NUMHOOKS) | (1 << NF_BR_PRE_ROUTING) |
+ (1 << NF_BR_LOCAL_OUT) | (1 << NF_BR_BROUTING),
+ .target = ebt_dnat_tg,
+ .checkentry = ebt_dnat_tg_check,
+ .targetsize = sizeof(struct ebt_nat_info),
.me = THIS_MODULE,
};
static int __init ebt_dnat_init(void)
{
- return ebt_register_target(&dnat);
+ return xt_register_target(&ebt_dnat_tg_reg);
}
static void __exit ebt_dnat_fini(void)
{
- ebt_unregister_target(&dnat);
+ xt_unregister_target(&ebt_dnat_tg_reg);
}
module_init(ebt_dnat_init);
module_exit(ebt_dnat_fini);
+MODULE_DESCRIPTION("Ebtables: Destination MAC address translation");
MODULE_LICENSE("GPL");
diff --git a/net/bridge/netfilter/ebt_ip.c b/net/bridge/netfilter/ebt_ip.c
index e4c642448e1b..df372496c1c1 100644
--- a/net/bridge/netfilter/ebt_ip.c
+++ b/net/bridge/netfilter/ebt_ip.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* ebt_ip
*
@@ -11,78 +12,100 @@
* Innominate Security Technologies AG <mhopf@innominate.com>
* September, 2002
*/
-
-#include <linux/netfilter_bridge/ebtables.h>
-#include <linux/netfilter_bridge/ebt_ip.h>
#include <linux/ip.h>
#include <net/ip.h>
#include <linux/in.h>
#include <linux/module.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_bridge/ebtables.h>
+#include <linux/netfilter_bridge/ebt_ip.h>
-struct tcpudphdr {
- __be16 src;
- __be16 dst;
+union pkthdr {
+ struct {
+ __be16 src;
+ __be16 dst;
+ } tcpudphdr;
+ struct {
+ u8 type;
+ u8 code;
+ } icmphdr;
+ struct {
+ u8 type;
+ } igmphdr;
};
-static int ebt_filter_ip(const struct sk_buff *skb, const struct net_device *in,
- const struct net_device *out, const void *data,
- unsigned int datalen)
+static bool
+ebt_ip_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
- struct ebt_ip_info *info = (struct ebt_ip_info *)data;
- struct iphdr _iph, *ih;
- struct tcpudphdr _ports, *pptr;
+ const struct ebt_ip_info *info = par->matchinfo;
+ const struct iphdr *ih;
+ struct iphdr _iph;
+ const union pkthdr *pptr;
+ union pkthdr _pkthdr;
ih = skb_header_pointer(skb, 0, sizeof(_iph), &_iph);
if (ih == NULL)
- return EBT_NOMATCH;
- if (info->bitmask & EBT_IP_TOS &&
- FWINV(info->tos != ih->tos, EBT_IP_TOS))
- return EBT_NOMATCH;
- if (info->bitmask & EBT_IP_SOURCE &&
- FWINV((ih->saddr & info->smsk) !=
- info->saddr, EBT_IP_SOURCE))
- return EBT_NOMATCH;
+ return false;
+ if ((info->bitmask & EBT_IP_TOS) &&
+ NF_INVF(info, EBT_IP_TOS, info->tos != ih->tos))
+ return false;
+ if ((info->bitmask & EBT_IP_SOURCE) &&
+ NF_INVF(info, EBT_IP_SOURCE,
+ (ih->saddr & info->smsk) != info->saddr))
+ return false;
if ((info->bitmask & EBT_IP_DEST) &&
- FWINV((ih->daddr & info->dmsk) !=
- info->daddr, EBT_IP_DEST))
- return EBT_NOMATCH;
+ NF_INVF(info, EBT_IP_DEST,
+ (ih->daddr & info->dmsk) != info->daddr))
+ return false;
if (info->bitmask & EBT_IP_PROTO) {
- if (FWINV(info->protocol != ih->protocol, EBT_IP_PROTO))
- return EBT_NOMATCH;
- if (!(info->bitmask & EBT_IP_DPORT) &&
- !(info->bitmask & EBT_IP_SPORT))
- return EBT_MATCH;
+ if (NF_INVF(info, EBT_IP_PROTO, info->protocol != ih->protocol))
+ return false;
+ if (!(info->bitmask & (EBT_IP_DPORT | EBT_IP_SPORT |
+ EBT_IP_ICMP | EBT_IP_IGMP)))
+ return true;
if (ntohs(ih->frag_off) & IP_OFFSET)
- return EBT_NOMATCH;
+ return false;
+
+ /* min icmp/igmp headersize is 4, so sizeof(_pkthdr) is ok. */
pptr = skb_header_pointer(skb, ih->ihl*4,
- sizeof(_ports), &_ports);
+ sizeof(_pkthdr), &_pkthdr);
if (pptr == NULL)
- return EBT_NOMATCH;
+ return false;
if (info->bitmask & EBT_IP_DPORT) {
- u32 dst = ntohs(pptr->dst);
- if (FWINV(dst < info->dport[0] ||
- dst > info->dport[1],
- EBT_IP_DPORT))
- return EBT_NOMATCH;
+ u32 dst = ntohs(pptr->tcpudphdr.dst);
+ if (NF_INVF(info, EBT_IP_DPORT,
+ dst < info->dport[0] ||
+ dst > info->dport[1]))
+ return false;
}
if (info->bitmask & EBT_IP_SPORT) {
- u32 src = ntohs(pptr->src);
- if (FWINV(src < info->sport[0] ||
- src > info->sport[1],
- EBT_IP_SPORT))
- return EBT_NOMATCH;
+ u32 src = ntohs(pptr->tcpudphdr.src);
+ if (NF_INVF(info, EBT_IP_SPORT,
+ src < info->sport[0] ||
+ src > info->sport[1]))
+ return false;
}
+ if ((info->bitmask & EBT_IP_ICMP) &&
+ NF_INVF(info, EBT_IP_ICMP,
+ pptr->icmphdr.type < info->icmp_type[0] ||
+ pptr->icmphdr.type > info->icmp_type[1] ||
+ pptr->icmphdr.code < info->icmp_code[0] ||
+ pptr->icmphdr.code > info->icmp_code[1]))
+ return false;
+ if ((info->bitmask & EBT_IP_IGMP) &&
+ NF_INVF(info, EBT_IP_IGMP,
+ pptr->igmphdr.type < info->igmp_type[0] ||
+ pptr->igmphdr.type > info->igmp_type[1]))
+ return false;
}
- return EBT_MATCH;
+ return true;
}
-static int ebt_ip_check(const char *tablename, unsigned int hookmask,
- const struct ebt_entry *e, void *data, unsigned int datalen)
+static int ebt_ip_mt_check(const struct xt_mtchk_param *par)
{
- struct ebt_ip_info *info = (struct ebt_ip_info *)data;
+ const struct ebt_ip_info *info = par->matchinfo;
+ const struct ebt_entry *e = par->entryinfo;
- if (datalen != EBT_ALIGN(sizeof(struct ebt_ip_info)))
- return -EINVAL;
if (e->ethproto != htons(ETH_P_IP) ||
e->invflags & EBT_IPROTO)
return -EINVAL;
@@ -93,6 +116,7 @@ static int ebt_ip_check(const char *tablename, unsigned int hookmask,
return -EINVAL;
if (info->protocol != IPPROTO_TCP &&
info->protocol != IPPROTO_UDP &&
+ info->protocol != IPPROTO_UDPLITE &&
info->protocol != IPPROTO_SCTP &&
info->protocol != IPPROTO_DCCP)
return -EINVAL;
@@ -101,27 +125,45 @@ static int ebt_ip_check(const char *tablename, unsigned int hookmask,
return -EINVAL;
if (info->bitmask & EBT_IP_SPORT && info->sport[0] > info->sport[1])
return -EINVAL;
+ if (info->bitmask & EBT_IP_ICMP) {
+ if ((info->invflags & EBT_IP_PROTO) ||
+ info->protocol != IPPROTO_ICMP)
+ return -EINVAL;
+ if (info->icmp_type[0] > info->icmp_type[1] ||
+ info->icmp_code[0] > info->icmp_code[1])
+ return -EINVAL;
+ }
+ if (info->bitmask & EBT_IP_IGMP) {
+ if ((info->invflags & EBT_IP_PROTO) ||
+ info->protocol != IPPROTO_IGMP)
+ return -EINVAL;
+ if (info->igmp_type[0] > info->igmp_type[1])
+ return -EINVAL;
+ }
return 0;
}
-static struct ebt_match filter_ip =
-{
- .name = EBT_IP_MATCH,
- .match = ebt_filter_ip,
- .check = ebt_ip_check,
+static struct xt_match ebt_ip_mt_reg __read_mostly = {
+ .name = "ip",
+ .revision = 0,
+ .family = NFPROTO_BRIDGE,
+ .match = ebt_ip_mt,
+ .checkentry = ebt_ip_mt_check,
+ .matchsize = sizeof(struct ebt_ip_info),
.me = THIS_MODULE,
};
static int __init ebt_ip_init(void)
{
- return ebt_register_match(&filter_ip);
+ return xt_register_match(&ebt_ip_mt_reg);
}
static void __exit ebt_ip_fini(void)
{
- ebt_unregister_match(&filter_ip);
+ xt_unregister_match(&ebt_ip_mt_reg);
}
module_init(ebt_ip_init);
module_exit(ebt_ip_fini);
+MODULE_DESCRIPTION("Ebtables: IPv4 protocol packet match");
MODULE_LICENSE("GPL");
diff --git a/net/bridge/netfilter/ebt_ip6.c b/net/bridge/netfilter/ebt_ip6.c
new file mode 100644
index 000000000000..f3225bc31f6c
--- /dev/null
+++ b/net/bridge/netfilter/ebt_ip6.c
@@ -0,0 +1,164 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * ebt_ip6
+ *
+ * Authors:
+ * Manohar Castelino <manohar.r.castelino@intel.com>
+ * Kuo-Lang Tseng <kuo-lang.tseng@intel.com>
+ * Jan Engelhardt <jengelh@medozas.de>
+ *
+ * Summary:
+ * This is just a modification of the IPv4 code written by
+ * Bart De Schuymer <bdschuym@pandora.be>
+ * with the changes required to support IPv6
+ *
+ * Jan, 2008
+ */
+#include <linux/ipv6.h>
+#include <net/ipv6.h>
+#include <linux/in.h>
+#include <linux/module.h>
+#include <net/dsfield.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_bridge/ebtables.h>
+#include <linux/netfilter_bridge/ebt_ip6.h>
+
+union pkthdr {
+ struct {
+ __be16 src;
+ __be16 dst;
+ } tcpudphdr;
+ struct {
+ u8 type;
+ u8 code;
+ } icmphdr;
+};
+
+static bool
+ebt_ip6_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct ebt_ip6_info *info = par->matchinfo;
+ const struct ipv6hdr *ih6;
+ struct ipv6hdr _ip6h;
+ const union pkthdr *pptr;
+ union pkthdr _pkthdr;
+
+ ih6 = skb_header_pointer(skb, 0, sizeof(_ip6h), &_ip6h);
+ if (ih6 == NULL)
+ return false;
+ if ((info->bitmask & EBT_IP6_TCLASS) &&
+ NF_INVF(info, EBT_IP6_TCLASS,
+ info->tclass != ipv6_get_dsfield(ih6)))
+ return false;
+ if (((info->bitmask & EBT_IP6_SOURCE) &&
+ NF_INVF(info, EBT_IP6_SOURCE,
+ ipv6_masked_addr_cmp(&ih6->saddr, &info->smsk,
+ &info->saddr))) ||
+ ((info->bitmask & EBT_IP6_DEST) &&
+ NF_INVF(info, EBT_IP6_DEST,
+ ipv6_masked_addr_cmp(&ih6->daddr, &info->dmsk,
+ &info->daddr))))
+ return false;
+ if (info->bitmask & EBT_IP6_PROTO) {
+ uint8_t nexthdr = ih6->nexthdr;
+ __be16 frag_off;
+ int offset_ph;
+
+ offset_ph = ipv6_skip_exthdr(skb, sizeof(_ip6h), &nexthdr, &frag_off);
+ if (offset_ph == -1)
+ return false;
+ if (NF_INVF(info, EBT_IP6_PROTO, info->protocol != nexthdr))
+ return false;
+ if (!(info->bitmask & (EBT_IP6_DPORT |
+ EBT_IP6_SPORT | EBT_IP6_ICMP6)))
+ return true;
+
+ /* min icmpv6 headersize is 4, so sizeof(_pkthdr) is ok. */
+ pptr = skb_header_pointer(skb, offset_ph, sizeof(_pkthdr),
+ &_pkthdr);
+ if (pptr == NULL)
+ return false;
+ if (info->bitmask & EBT_IP6_DPORT) {
+ u16 dst = ntohs(pptr->tcpudphdr.dst);
+ if (NF_INVF(info, EBT_IP6_DPORT,
+ dst < info->dport[0] ||
+ dst > info->dport[1]))
+ return false;
+ }
+ if (info->bitmask & EBT_IP6_SPORT) {
+ u16 src = ntohs(pptr->tcpudphdr.src);
+ if (NF_INVF(info, EBT_IP6_SPORT,
+ src < info->sport[0] ||
+ src > info->sport[1]))
+ return false;
+ }
+ if ((info->bitmask & EBT_IP6_ICMP6) &&
+ NF_INVF(info, EBT_IP6_ICMP6,
+ pptr->icmphdr.type < info->icmpv6_type[0] ||
+ pptr->icmphdr.type > info->icmpv6_type[1] ||
+ pptr->icmphdr.code < info->icmpv6_code[0] ||
+ pptr->icmphdr.code > info->icmpv6_code[1]))
+ return false;
+ }
+ return true;
+}
+
+static int ebt_ip6_mt_check(const struct xt_mtchk_param *par)
+{
+ const struct ebt_entry *e = par->entryinfo;
+ struct ebt_ip6_info *info = par->matchinfo;
+
+ if (e->ethproto != htons(ETH_P_IPV6) || e->invflags & EBT_IPROTO)
+ return -EINVAL;
+ if (info->bitmask & ~EBT_IP6_MASK || info->invflags & ~EBT_IP6_MASK)
+ return -EINVAL;
+ if (info->bitmask & (EBT_IP6_DPORT | EBT_IP6_SPORT)) {
+ if (info->invflags & EBT_IP6_PROTO)
+ return -EINVAL;
+ if (info->protocol != IPPROTO_TCP &&
+ info->protocol != IPPROTO_UDP &&
+ info->protocol != IPPROTO_UDPLITE &&
+ info->protocol != IPPROTO_SCTP &&
+ info->protocol != IPPROTO_DCCP)
+ return -EINVAL;
+ }
+ if (info->bitmask & EBT_IP6_DPORT && info->dport[0] > info->dport[1])
+ return -EINVAL;
+ if (info->bitmask & EBT_IP6_SPORT && info->sport[0] > info->sport[1])
+ return -EINVAL;
+ if (info->bitmask & EBT_IP6_ICMP6) {
+ if ((info->invflags & EBT_IP6_PROTO) ||
+ info->protocol != IPPROTO_ICMPV6)
+ return -EINVAL;
+ if (info->icmpv6_type[0] > info->icmpv6_type[1] ||
+ info->icmpv6_code[0] > info->icmpv6_code[1])
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static struct xt_match ebt_ip6_mt_reg __read_mostly = {
+ .name = "ip6",
+ .revision = 0,
+ .family = NFPROTO_BRIDGE,
+ .match = ebt_ip6_mt,
+ .checkentry = ebt_ip6_mt_check,
+ .matchsize = sizeof(struct ebt_ip6_info),
+ .me = THIS_MODULE,
+};
+
+static int __init ebt_ip6_init(void)
+{
+ return xt_register_match(&ebt_ip6_mt_reg);
+}
+
+static void __exit ebt_ip6_fini(void)
+{
+ xt_unregister_match(&ebt_ip6_mt_reg);
+}
+
+module_init(ebt_ip6_init);
+module_exit(ebt_ip6_fini);
+MODULE_DESCRIPTION("Ebtables: IPv6 protocol packet match");
+MODULE_AUTHOR("Kuo-Lang Tseng <kuo-lang.tseng@intel.com>");
+MODULE_LICENSE("GPL");
diff --git a/net/bridge/netfilter/ebt_limit.c b/net/bridge/netfilter/ebt_limit.c
index d48fa5cb26cf..e16183bd1bb8 100644
--- a/net/bridge/netfilter/ebt_limit.c
+++ b/net/bridge/netfilter/ebt_limit.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* ebt_limit
*
@@ -10,13 +11,13 @@
* September, 2003
*
*/
-
-#include <linux/netfilter_bridge/ebtables.h>
-#include <linux/netfilter_bridge/ebt_limit.h>
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
-
#include <linux/netdevice.h>
#include <linux/spinlock.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_bridge/ebtables.h>
+#include <linux/netfilter_bridge/ebt_limit.h>
static DEFINE_SPINLOCK(limit_lock);
@@ -31,11 +32,10 @@ static DEFINE_SPINLOCK(limit_lock);
#define CREDITS_PER_JIFFY POW2_BELOW32(MAX_CPJ)
-static int ebt_limit_match(const struct sk_buff *skb,
- const struct net_device *in, const struct net_device *out,
- const void *data, unsigned int datalen)
+static bool
+ebt_limit_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
- struct ebt_limit_info *info = (struct ebt_limit_info *)data;
+ struct ebt_limit_info *info = (void *)par->matchinfo;
unsigned long now = jiffies;
spin_lock_bh(&limit_lock);
@@ -47,11 +47,11 @@ static int ebt_limit_match(const struct sk_buff *skb,
/* We're not limited. */
info->credit -= info->cost;
spin_unlock_bh(&limit_lock);
- return EBT_MATCH;
+ return true;
}
spin_unlock_bh(&limit_lock);
- return EBT_NOMATCH;
+ return false;
}
/* Precision saver. */
@@ -66,19 +66,15 @@ user2credits(u_int32_t user)
return (user * HZ * CREDITS_PER_JIFFY) / EBT_LIMIT_SCALE;
}
-static int ebt_limit_check(const char *tablename, unsigned int hookmask,
- const struct ebt_entry *e, void *data, unsigned int datalen)
+static int ebt_limit_mt_check(const struct xt_mtchk_param *par)
{
- struct ebt_limit_info *info = (struct ebt_limit_info *)data;
-
- if (datalen != EBT_ALIGN(sizeof(struct ebt_limit_info)))
- return -EINVAL;
+ struct ebt_limit_info *info = par->matchinfo;
/* Check for overflow. */
if (info->burst == 0 ||
user2credits(info->avg * info->burst) < user2credits(info->avg)) {
- printk("Overflow in ebt_limit, try lower: %u/%u\n",
- info->avg, info->burst);
+ pr_info_ratelimited("overflow, try lower: %u/%u\n",
+ info->avg, info->burst);
return -EINVAL;
}
@@ -90,24 +86,44 @@ static int ebt_limit_check(const char *tablename, unsigned int hookmask,
return 0;
}
-static struct ebt_match ebt_limit_reg =
-{
- .name = EBT_LIMIT_MATCH,
- .match = ebt_limit_match,
- .check = ebt_limit_check,
+
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+/*
+ * no conversion function needed --
+ * only avg/burst have meaningful values in userspace.
+ */
+struct ebt_compat_limit_info {
+ compat_uint_t avg, burst;
+ compat_ulong_t prev;
+ compat_uint_t credit, credit_cap, cost;
+};
+#endif
+
+static struct xt_match ebt_limit_mt_reg __read_mostly = {
+ .name = "limit",
+ .revision = 0,
+ .family = NFPROTO_BRIDGE,
+ .match = ebt_limit_mt,
+ .checkentry = ebt_limit_mt_check,
+ .matchsize = sizeof(struct ebt_limit_info),
+ .usersize = offsetof(struct ebt_limit_info, prev),
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+ .compatsize = sizeof(struct ebt_compat_limit_info),
+#endif
.me = THIS_MODULE,
};
static int __init ebt_limit_init(void)
{
- return ebt_register_match(&ebt_limit_reg);
+ return xt_register_match(&ebt_limit_mt_reg);
}
static void __exit ebt_limit_fini(void)
{
- ebt_unregister_match(&ebt_limit_reg);
+ xt_unregister_match(&ebt_limit_mt_reg);
}
module_init(ebt_limit_init);
module_exit(ebt_limit_fini);
+MODULE_DESCRIPTION("Ebtables: Rate-limit match");
MODULE_LICENSE("GPL");
diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c
index a184f879f253..e2eea1daaf8b 100644
--- a/net/bridge/netfilter/ebt_log.c
+++ b/net/bridge/netfilter/ebt_log.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* ebt_log
*
@@ -8,26 +9,26 @@
* April, 2002
*
*/
-
-#include <linux/in.h>
-#include <linux/netfilter_bridge/ebtables.h>
-#include <linux/netfilter_bridge/ebt_log.h>
-#include <linux/netfilter.h>
#include <linux/module.h>
#include <linux/ip.h>
#include <linux/in.h>
#include <linux/if_arp.h>
#include <linux/spinlock.h>
+#include <net/netfilter/nf_log.h>
+#include <linux/ipv6.h>
+#include <net/ipv6.h>
+#include <linux/in6.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_bridge/ebtables.h>
+#include <linux/netfilter_bridge/ebt_log.h>
+#include <linux/netfilter.h>
static DEFINE_SPINLOCK(ebt_log_lock);
-static int ebt_log_check(const char *tablename, unsigned int hookmask,
- const struct ebt_entry *e, void *data, unsigned int datalen)
+static int ebt_log_tg_check(const struct xt_tgchk_param *par)
{
- struct ebt_log_info *info = (struct ebt_log_info *)data;
+ struct ebt_log_info *info = par->targinfo;
- if (datalen != EBT_ALIGN(sizeof(struct ebt_log_info)))
- return -EINVAL;
if (info->bitmask & ~EBT_LOG_MASK)
return -EINVAL;
if (info->loglevel >= 8)
@@ -36,181 +37,190 @@ static int ebt_log_check(const char *tablename, unsigned int hookmask,
return 0;
}
-struct tcpudphdr
-{
+struct tcpudphdr {
__be16 src;
__be16 dst;
};
-struct arppayload
-{
+struct arppayload {
unsigned char mac_src[ETH_ALEN];
unsigned char ip_src[4];
unsigned char mac_dst[ETH_ALEN];
unsigned char ip_dst[4];
};
-static void print_MAC(unsigned char *p)
+static void
+print_ports(const struct sk_buff *skb, uint8_t protocol, int offset)
{
- int i;
-
- for (i = 0; i < ETH_ALEN; i++, p++)
- printk("%02x%c", *p, i == ETH_ALEN - 1 ? ' ':':');
+ if (protocol == IPPROTO_TCP ||
+ protocol == IPPROTO_UDP ||
+ protocol == IPPROTO_UDPLITE ||
+ protocol == IPPROTO_SCTP ||
+ protocol == IPPROTO_DCCP) {
+ const struct tcpudphdr *pptr;
+ struct tcpudphdr _ports;
+
+ pptr = skb_header_pointer(skb, offset,
+ sizeof(_ports), &_ports);
+ if (pptr == NULL) {
+ pr_cont(" INCOMPLETE TCP/UDP header");
+ return;
+ }
+ pr_cont(" SPT=%u DPT=%u", ntohs(pptr->src), ntohs(pptr->dst));
+ }
}
-#define myNIPQUAD(a) a[0], a[1], a[2], a[3]
static void
-ebt_log_packet(unsigned int pf, unsigned int hooknum,
- const struct sk_buff *skb, const struct net_device *in,
- const struct net_device *out, const struct nf_loginfo *loginfo,
- const char *prefix)
+ebt_log_packet(struct net *net, u_int8_t pf, unsigned int hooknum,
+ const struct sk_buff *skb, const struct net_device *in,
+ const struct net_device *out, const struct nf_loginfo *loginfo,
+ const char *prefix)
{
unsigned int bitmask;
- spin_lock_bh(&ebt_log_lock);
- printk("<%c>%s IN=%s OUT=%s MAC source = ", '0' + loginfo->u.log.level,
- prefix, in ? in->name : "", out ? out->name : "");
-
- print_MAC(eth_hdr(skb)->h_source);
- printk("MAC dest = ");
- print_MAC(eth_hdr(skb)->h_dest);
+ /* FIXME: Disabled from containers until syslog ns is supported */
+ if (!net_eq(net, &init_net) && !sysctl_nf_log_all_netns)
+ return;
- printk("proto = 0x%04x", ntohs(eth_hdr(skb)->h_proto));
+ spin_lock_bh(&ebt_log_lock);
+ printk(KERN_SOH "%c%s IN=%s OUT=%s MAC source = %pM MAC dest = %pM proto = 0x%04x",
+ '0' + loginfo->u.log.level, prefix,
+ in ? in->name : "", out ? out->name : "",
+ eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
+ ntohs(eth_hdr(skb)->h_proto));
if (loginfo->type == NF_LOG_TYPE_LOG)
bitmask = loginfo->u.log.logflags;
else
- bitmask = NF_LOG_MASK;
+ bitmask = NF_LOG_DEFAULT_MASK;
if ((bitmask & EBT_LOG_IP) && eth_hdr(skb)->h_proto ==
- htons(ETH_P_IP)){
- struct iphdr _iph, *ih;
+ htons(ETH_P_IP)) {
+ const struct iphdr *ih;
+ struct iphdr _iph;
ih = skb_header_pointer(skb, 0, sizeof(_iph), &_iph);
if (ih == NULL) {
- printk(" INCOMPLETE IP header");
+ pr_cont(" INCOMPLETE IP header");
goto out;
}
- printk(" IP SRC=%u.%u.%u.%u IP DST=%u.%u.%u.%u, IP "
- "tos=0x%02X, IP proto=%d", NIPQUAD(ih->saddr),
- NIPQUAD(ih->daddr), ih->tos, ih->protocol);
- if (ih->protocol == IPPROTO_TCP ||
- ih->protocol == IPPROTO_UDP ||
- ih->protocol == IPPROTO_SCTP ||
- ih->protocol == IPPROTO_DCCP) {
- struct tcpudphdr _ports, *pptr;
-
- pptr = skb_header_pointer(skb, ih->ihl*4,
- sizeof(_ports), &_ports);
- if (pptr == NULL) {
- printk(" INCOMPLETE TCP/UDP header");
- goto out;
- }
- printk(" SPT=%u DPT=%u", ntohs(pptr->src),
- ntohs(pptr->dst));
+ pr_cont(" IP SRC=%pI4 IP DST=%pI4, IP tos=0x%02X, IP proto=%d",
+ &ih->saddr, &ih->daddr, ih->tos, ih->protocol);
+ print_ports(skb, ih->protocol, ih->ihl*4);
+ goto out;
+ }
+
+#if IS_ENABLED(CONFIG_BRIDGE_EBT_IP6)
+ if ((bitmask & EBT_LOG_IP6) && eth_hdr(skb)->h_proto ==
+ htons(ETH_P_IPV6)) {
+ const struct ipv6hdr *ih;
+ struct ipv6hdr _iph;
+ uint8_t nexthdr;
+ __be16 frag_off;
+ int offset_ph;
+
+ ih = skb_header_pointer(skb, 0, sizeof(_iph), &_iph);
+ if (ih == NULL) {
+ pr_cont(" INCOMPLETE IPv6 header");
+ goto out;
}
+ pr_cont(" IPv6 SRC=%pI6 IPv6 DST=%pI6, IPv6 priority=0x%01X, Next Header=%d",
+ &ih->saddr, &ih->daddr, ih->priority, ih->nexthdr);
+ nexthdr = ih->nexthdr;
+ offset_ph = ipv6_skip_exthdr(skb, sizeof(_iph), &nexthdr, &frag_off);
+ if (offset_ph == -1)
+ goto out;
+ print_ports(skb, nexthdr, offset_ph);
goto out;
}
+#endif
if ((bitmask & EBT_LOG_ARP) &&
((eth_hdr(skb)->h_proto == htons(ETH_P_ARP)) ||
(eth_hdr(skb)->h_proto == htons(ETH_P_RARP)))) {
- struct arphdr _arph, *ah;
+ const struct arphdr *ah;
+ struct arphdr _arph;
ah = skb_header_pointer(skb, 0, sizeof(_arph), &_arph);
if (ah == NULL) {
- printk(" INCOMPLETE ARP header");
+ pr_cont(" INCOMPLETE ARP header");
goto out;
}
- printk(" ARP HTYPE=%d, PTYPE=0x%04x, OPCODE=%d",
- ntohs(ah->ar_hrd), ntohs(ah->ar_pro),
- ntohs(ah->ar_op));
+ pr_cont(" ARP HTYPE=%d, PTYPE=0x%04x, OPCODE=%d",
+ ntohs(ah->ar_hrd), ntohs(ah->ar_pro),
+ ntohs(ah->ar_op));
/* If it's for Ethernet and the lengths are OK,
- * then log the ARP payload */
+ * then log the ARP payload
+ */
if (ah->ar_hrd == htons(1) &&
ah->ar_hln == ETH_ALEN &&
ah->ar_pln == sizeof(__be32)) {
- struct arppayload _arpp, *ap;
+ const struct arppayload *ap;
+ struct arppayload _arpp;
ap = skb_header_pointer(skb, sizeof(_arph),
sizeof(_arpp), &_arpp);
if (ap == NULL) {
- printk(" INCOMPLETE ARP payload");
+ pr_cont(" INCOMPLETE ARP payload");
goto out;
}
- printk(" ARP MAC SRC=");
- print_MAC(ap->mac_src);
- printk(" ARP IP SRC=%u.%u.%u.%u",
- myNIPQUAD(ap->ip_src));
- printk(" ARP MAC DST=");
- print_MAC(ap->mac_dst);
- printk(" ARP IP DST=%u.%u.%u.%u",
- myNIPQUAD(ap->ip_dst));
+ pr_cont(" ARP MAC SRC=%pM ARP IP SRC=%pI4 ARP MAC DST=%pM ARP IP DST=%pI4",
+ ap->mac_src, ap->ip_src,
+ ap->mac_dst, ap->ip_dst);
}
}
out:
- printk("\n");
+ pr_cont("\n");
spin_unlock_bh(&ebt_log_lock);
-
}
-static void ebt_log(const struct sk_buff *skb, unsigned int hooknr,
- const struct net_device *in, const struct net_device *out,
- const void *data, unsigned int datalen)
+static unsigned int
+ebt_log_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
- struct ebt_log_info *info = (struct ebt_log_info *)data;
+ const struct ebt_log_info *info = par->targinfo;
struct nf_loginfo li;
+ struct net *net = xt_net(par);
li.type = NF_LOG_TYPE_LOG;
li.u.log.level = info->loglevel;
li.u.log.logflags = info->bitmask;
+ /* Remember that we have to use ebt_log_packet() not to break backward
+ * compatibility. We cannot use the default bridge packet logger via
+ * nf_log_packet() with NFT_LOG_TYPE_LOG here. --Pablo
+ */
if (info->bitmask & EBT_LOG_NFLOG)
- nf_log_packet(PF_BRIDGE, hooknr, skb, in, out, &li,
- "%s", info->prefix);
+ nf_log_packet(net, NFPROTO_BRIDGE, xt_hooknum(par), skb,
+ xt_in(par), xt_out(par), &li, "%s",
+ info->prefix);
else
- ebt_log_packet(PF_BRIDGE, hooknr, skb, in, out, &li,
- info->prefix);
+ ebt_log_packet(net, NFPROTO_BRIDGE, xt_hooknum(par), skb,
+ xt_in(par), xt_out(par), &li, info->prefix);
+ return EBT_CONTINUE;
}
-static struct ebt_watcher log =
-{
- .name = EBT_LOG_WATCHER,
- .watcher = ebt_log,
- .check = ebt_log_check,
- .me = THIS_MODULE,
-};
-
-static struct nf_logger ebt_log_logger = {
- .name = "ebt_log",
- .logfn = &ebt_log_packet,
+static struct xt_target ebt_log_tg_reg __read_mostly = {
+ .name = "log",
+ .revision = 0,
+ .family = NFPROTO_BRIDGE,
+ .target = ebt_log_tg,
+ .checkentry = ebt_log_tg_check,
+ .targetsize = sizeof(struct ebt_log_info),
.me = THIS_MODULE,
};
static int __init ebt_log_init(void)
{
- int ret;
-
- ret = ebt_register_watcher(&log);
- if (ret < 0)
- return ret;
- if (nf_log_register(PF_BRIDGE, &ebt_log_logger) < 0) {
- printk(KERN_WARNING "ebt_log: not logging via system console "
- "since somebody else already registered for PF_INET\n");
- /* we cannot make module load fail here, since otherwise
- * ebtables userspace would abort */
- }
-
- return 0;
+ return xt_register_target(&ebt_log_tg_reg);
}
static void __exit ebt_log_fini(void)
{
- nf_log_unregister_logger(&ebt_log_logger);
- ebt_unregister_watcher(&log);
+ xt_unregister_target(&ebt_log_tg_reg);
}
module_init(ebt_log_init);
module_exit(ebt_log_fini);
+MODULE_DESCRIPTION("Ebtables: Packet logging to syslog");
MODULE_LICENSE("GPL");
diff --git a/net/bridge/netfilter/ebt_mark.c b/net/bridge/netfilter/ebt_mark.c
index 2458638561cb..8cf653c72fd8 100644
--- a/net/bridge/netfilter/ebt_mark.c
+++ b/net/bridge/netfilter/ebt_mark.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* ebt_mark
*
@@ -13,68 +14,98 @@
* Marking a frame doesn't really change anything in the frame anyway.
*/
+#include <linux/module.h>
+#include <linux/netfilter/x_tables.h>
#include <linux/netfilter_bridge/ebtables.h>
#include <linux/netfilter_bridge/ebt_mark_t.h>
-#include <linux/module.h>
-static int ebt_target_mark(struct sk_buff **pskb, unsigned int hooknr,
- const struct net_device *in, const struct net_device *out,
- const void *data, unsigned int datalen)
+static unsigned int
+ebt_mark_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
- struct ebt_mark_t_info *info = (struct ebt_mark_t_info *)data;
+ const struct ebt_mark_t_info *info = par->targinfo;
int action = info->target & -16;
if (action == MARK_SET_VALUE)
- (*pskb)->mark = info->mark;
+ skb->mark = info->mark;
else if (action == MARK_OR_VALUE)
- (*pskb)->mark |= info->mark;
+ skb->mark |= info->mark;
else if (action == MARK_AND_VALUE)
- (*pskb)->mark &= info->mark;
+ skb->mark &= info->mark;
else
- (*pskb)->mark ^= info->mark;
+ skb->mark ^= info->mark;
- return info->target | -16;
+ return info->target | ~EBT_VERDICT_BITS;
}
-static int ebt_target_mark_check(const char *tablename, unsigned int hookmask,
- const struct ebt_entry *e, void *data, unsigned int datalen)
+static int ebt_mark_tg_check(const struct xt_tgchk_param *par)
{
- struct ebt_mark_t_info *info = (struct ebt_mark_t_info *)data;
+ const struct ebt_mark_t_info *info = par->targinfo;
int tmp;
- if (datalen != EBT_ALIGN(sizeof(struct ebt_mark_t_info)))
- return -EINVAL;
- tmp = info->target | -16;
+ tmp = info->target | ~EBT_VERDICT_BITS;
if (BASE_CHAIN && tmp == EBT_RETURN)
return -EINVAL;
- CLEAR_BASE_CHAIN_BIT;
- if (tmp < -NUM_STANDARD_TARGETS || tmp >= 0)
+ if (ebt_invalid_target(tmp))
return -EINVAL;
- tmp = info->target & -16;
+ tmp = info->target & ~EBT_VERDICT_BITS;
if (tmp != MARK_SET_VALUE && tmp != MARK_OR_VALUE &&
tmp != MARK_AND_VALUE && tmp != MARK_XOR_VALUE)
return -EINVAL;
return 0;
}
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+struct compat_ebt_mark_t_info {
+ compat_ulong_t mark;
+ compat_uint_t target;
+};
+
+static void mark_tg_compat_from_user(void *dst, const void *src)
+{
+ const struct compat_ebt_mark_t_info *user = src;
+ struct ebt_mark_t_info *kern = dst;
+
+ kern->mark = user->mark;
+ kern->target = user->target;
+}
-static struct ebt_target mark_target =
+static int mark_tg_compat_to_user(void __user *dst, const void *src)
{
- .name = EBT_MARK_TARGET,
- .target = ebt_target_mark,
- .check = ebt_target_mark_check,
+ struct compat_ebt_mark_t_info __user *user = dst;
+ const struct ebt_mark_t_info *kern = src;
+
+ if (put_user(kern->mark, &user->mark) ||
+ put_user(kern->target, &user->target))
+ return -EFAULT;
+ return 0;
+}
+#endif
+
+static struct xt_target ebt_mark_tg_reg __read_mostly = {
+ .name = "mark",
+ .revision = 0,
+ .family = NFPROTO_BRIDGE,
+ .target = ebt_mark_tg,
+ .checkentry = ebt_mark_tg_check,
+ .targetsize = sizeof(struct ebt_mark_t_info),
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+ .compatsize = sizeof(struct compat_ebt_mark_t_info),
+ .compat_from_user = mark_tg_compat_from_user,
+ .compat_to_user = mark_tg_compat_to_user,
+#endif
.me = THIS_MODULE,
};
static int __init ebt_mark_init(void)
{
- return ebt_register_target(&mark_target);
+ return xt_register_target(&ebt_mark_tg_reg);
}
static void __exit ebt_mark_fini(void)
{
- ebt_unregister_target(&mark_target);
+ xt_unregister_target(&ebt_mark_tg_reg);
}
module_init(ebt_mark_init);
module_exit(ebt_mark_fini);
+MODULE_DESCRIPTION("Ebtables: Packet mark modification");
MODULE_LICENSE("GPL");
diff --git a/net/bridge/netfilter/ebt_mark_m.c b/net/bridge/netfilter/ebt_mark_m.c
index 025869ee0b68..5872e73c741e 100644
--- a/net/bridge/netfilter/ebt_mark_m.c
+++ b/net/bridge/netfilter/ebt_mark_m.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* ebt_mark_m
*
@@ -7,29 +8,25 @@
* July, 2002
*
*/
-
+#include <linux/module.h>
+#include <linux/netfilter/x_tables.h>
#include <linux/netfilter_bridge/ebtables.h>
#include <linux/netfilter_bridge/ebt_mark_m.h>
-#include <linux/module.h>
-static int ebt_filter_mark(const struct sk_buff *skb,
- const struct net_device *in, const struct net_device *out, const void *data,
- unsigned int datalen)
+static bool
+ebt_mark_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
- struct ebt_mark_m_info *info = (struct ebt_mark_m_info *) data;
+ const struct ebt_mark_m_info *info = par->matchinfo;
if (info->bitmask & EBT_MARK_OR)
- return !(!!(skb->mark & info->mask) ^ info->invert);
- return !(((skb->mark & info->mask) == info->mark) ^ info->invert);
+ return !!(skb->mark & info->mask) ^ info->invert;
+ return ((skb->mark & info->mask) == info->mark) ^ info->invert;
}
-static int ebt_mark_check(const char *tablename, unsigned int hookmask,
- const struct ebt_entry *e, void *data, unsigned int datalen)
+static int ebt_mark_mt_check(const struct xt_mtchk_param *par)
{
- struct ebt_mark_m_info *info = (struct ebt_mark_m_info *) data;
+ const struct ebt_mark_m_info *info = par->matchinfo;
- if (datalen != EBT_ALIGN(sizeof(struct ebt_mark_m_info)))
- return -EINVAL;
if (info->bitmask & ~EBT_MARK_MASK)
return -EINVAL;
if ((info->bitmask & EBT_MARK_OR) && (info->bitmask & EBT_MARK_AND))
@@ -39,24 +36,64 @@ static int ebt_mark_check(const char *tablename, unsigned int hookmask,
return 0;
}
-static struct ebt_match filter_mark =
+
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+struct compat_ebt_mark_m_info {
+ compat_ulong_t mark, mask;
+ uint8_t invert, bitmask;
+};
+
+static void mark_mt_compat_from_user(void *dst, const void *src)
{
- .name = EBT_MARK_MATCH,
- .match = ebt_filter_mark,
- .check = ebt_mark_check,
+ const struct compat_ebt_mark_m_info *user = src;
+ struct ebt_mark_m_info *kern = dst;
+
+ kern->mark = user->mark;
+ kern->mask = user->mask;
+ kern->invert = user->invert;
+ kern->bitmask = user->bitmask;
+}
+
+static int mark_mt_compat_to_user(void __user *dst, const void *src)
+{
+ struct compat_ebt_mark_m_info __user *user = dst;
+ const struct ebt_mark_m_info *kern = src;
+
+ if (put_user(kern->mark, &user->mark) ||
+ put_user(kern->mask, &user->mask) ||
+ put_user(kern->invert, &user->invert) ||
+ put_user(kern->bitmask, &user->bitmask))
+ return -EFAULT;
+ return 0;
+}
+#endif
+
+static struct xt_match ebt_mark_mt_reg __read_mostly = {
+ .name = "mark_m",
+ .revision = 0,
+ .family = NFPROTO_BRIDGE,
+ .match = ebt_mark_mt,
+ .checkentry = ebt_mark_mt_check,
+ .matchsize = sizeof(struct ebt_mark_m_info),
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+ .compatsize = sizeof(struct compat_ebt_mark_m_info),
+ .compat_from_user = mark_mt_compat_from_user,
+ .compat_to_user = mark_mt_compat_to_user,
+#endif
.me = THIS_MODULE,
};
static int __init ebt_mark_m_init(void)
{
- return ebt_register_match(&filter_mark);
+ return xt_register_match(&ebt_mark_mt_reg);
}
static void __exit ebt_mark_m_fini(void)
{
- ebt_unregister_match(&filter_mark);
+ xt_unregister_match(&ebt_mark_mt_reg);
}
module_init(ebt_mark_m_init);
module_exit(ebt_mark_m_fini);
+MODULE_DESCRIPTION("Ebtables: Packet mark match");
MODULE_LICENSE("GPL");
diff --git a/net/bridge/netfilter/ebt_nflog.c b/net/bridge/netfilter/ebt_nflog.c
new file mode 100644
index 000000000000..61bf8f4465ab
--- /dev/null
+++ b/net/bridge/netfilter/ebt_nflog.c
@@ -0,0 +1,75 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * ebt_nflog
+ *
+ * Author:
+ * Peter Warasin <peter@endian.com>
+ *
+ * February, 2008
+ *
+ * Based on:
+ * xt_NFLOG.c, (C) 2006 by Patrick McHardy <kaber@trash.net>
+ * ebt_ulog.c, (C) 2004 by Bart De Schuymer <bdschuym@pandora.be>
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_bridge/ebtables.h>
+#include <linux/netfilter_bridge/ebt_nflog.h>
+#include <net/netfilter/nf_log.h>
+
+static unsigned int
+ebt_nflog_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct ebt_nflog_info *info = par->targinfo;
+ struct net *net = xt_net(par);
+ struct nf_loginfo li;
+
+ li.type = NF_LOG_TYPE_ULOG;
+ li.u.ulog.copy_len = info->len;
+ li.u.ulog.group = info->group;
+ li.u.ulog.qthreshold = info->threshold;
+ li.u.ulog.flags = 0;
+
+ nf_log_packet(net, PF_BRIDGE, xt_hooknum(par), skb, xt_in(par),
+ xt_out(par), &li, "%s", info->prefix);
+ return EBT_CONTINUE;
+}
+
+static int ebt_nflog_tg_check(const struct xt_tgchk_param *par)
+{
+ struct ebt_nflog_info *info = par->targinfo;
+
+ if (info->flags & ~EBT_NFLOG_MASK)
+ return -EINVAL;
+ info->prefix[EBT_NFLOG_PREFIX_SIZE - 1] = '\0';
+ return 0;
+}
+
+static struct xt_target ebt_nflog_tg_reg __read_mostly = {
+ .name = "nflog",
+ .revision = 0,
+ .family = NFPROTO_BRIDGE,
+ .target = ebt_nflog_tg,
+ .checkentry = ebt_nflog_tg_check,
+ .targetsize = sizeof(struct ebt_nflog_info),
+ .me = THIS_MODULE,
+};
+
+static int __init ebt_nflog_init(void)
+{
+ return xt_register_target(&ebt_nflog_tg_reg);
+}
+
+static void __exit ebt_nflog_fini(void)
+{
+ xt_unregister_target(&ebt_nflog_tg_reg);
+}
+
+module_init(ebt_nflog_init);
+module_exit(ebt_nflog_fini);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Peter Warasin <peter@endian.com>");
+MODULE_DESCRIPTION("ebtables NFLOG netfilter logging module");
diff --git a/net/bridge/netfilter/ebt_pkttype.c b/net/bridge/netfilter/ebt_pkttype.c
index 4fffd70e4da7..c9e306119ee3 100644
--- a/net/bridge/netfilter/ebt_pkttype.c
+++ b/net/bridge/netfilter/ebt_pkttype.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* ebt_pkttype
*
@@ -7,53 +8,50 @@
* April, 2003
*
*/
-
+#include <linux/module.h>
+#include <linux/netfilter/x_tables.h>
#include <linux/netfilter_bridge/ebtables.h>
#include <linux/netfilter_bridge/ebt_pkttype.h>
-#include <linux/module.h>
-static int ebt_filter_pkttype(const struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- const void *data,
- unsigned int datalen)
+static bool
+ebt_pkttype_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
- struct ebt_pkttype_info *info = (struct ebt_pkttype_info *)data;
+ const struct ebt_pkttype_info *info = par->matchinfo;
- return (skb->pkt_type != info->pkt_type) ^ info->invert;
+ return (skb->pkt_type == info->pkt_type) ^ info->invert;
}
-static int ebt_pkttype_check(const char *tablename, unsigned int hookmask,
- const struct ebt_entry *e, void *data, unsigned int datalen)
+static int ebt_pkttype_mt_check(const struct xt_mtchk_param *par)
{
- struct ebt_pkttype_info *info = (struct ebt_pkttype_info *)data;
+ const struct ebt_pkttype_info *info = par->matchinfo;
- if (datalen != EBT_ALIGN(sizeof(struct ebt_pkttype_info)))
- return -EINVAL;
if (info->invert != 0 && info->invert != 1)
return -EINVAL;
/* Allow any pkt_type value */
return 0;
}
-static struct ebt_match filter_pkttype =
-{
- .name = EBT_PKTTYPE_MATCH,
- .match = ebt_filter_pkttype,
- .check = ebt_pkttype_check,
+static struct xt_match ebt_pkttype_mt_reg __read_mostly = {
+ .name = "pkttype",
+ .revision = 0,
+ .family = NFPROTO_BRIDGE,
+ .match = ebt_pkttype_mt,
+ .checkentry = ebt_pkttype_mt_check,
+ .matchsize = sizeof(struct ebt_pkttype_info),
.me = THIS_MODULE,
};
static int __init ebt_pkttype_init(void)
{
- return ebt_register_match(&filter_pkttype);
+ return xt_register_match(&ebt_pkttype_mt_reg);
}
static void __exit ebt_pkttype_fini(void)
{
- ebt_unregister_match(&filter_pkttype);
+ xt_unregister_match(&ebt_pkttype_mt_reg);
}
module_init(ebt_pkttype_init);
module_exit(ebt_pkttype_fini);
+MODULE_DESCRIPTION("Ebtables: Link layer packet type match");
MODULE_LICENSE("GPL");
diff --git a/net/bridge/netfilter/ebt_redirect.c b/net/bridge/netfilter/ebt_redirect.c
index 9f378eab72d0..307790562b49 100644
--- a/net/bridge/netfilter/ebt_redirect.c
+++ b/net/bridge/netfilter/ebt_redirect.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* ebt_redirect
*
@@ -7,75 +8,74 @@
* April, 2002
*
*/
-
-#include <linux/netfilter_bridge/ebtables.h>
-#include <linux/netfilter_bridge/ebt_redirect.h>
#include <linux/module.h>
#include <net/sock.h>
#include "../br_private.h"
+#include <linux/netfilter.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_bridge/ebtables.h>
+#include <linux/netfilter_bridge/ebt_redirect.h>
-static int ebt_target_redirect(struct sk_buff **pskb, unsigned int hooknr,
- const struct net_device *in, const struct net_device *out,
- const void *data, unsigned int datalen)
+static unsigned int
+ebt_redirect_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
- struct ebt_redirect_info *info = (struct ebt_redirect_info *)data;
+ const struct ebt_redirect_info *info = par->targinfo;
- if (skb_shared(*pskb) || skb_cloned(*pskb)) {
- struct sk_buff *nskb;
+ if (skb_ensure_writable(skb, 0))
+ return EBT_DROP;
- nskb = skb_copy(*pskb, GFP_ATOMIC);
- if (!nskb)
- return NF_DROP;
- if ((*pskb)->sk)
- skb_set_owner_w(nskb, (*pskb)->sk);
- kfree_skb(*pskb);
- *pskb = nskb;
- }
- if (hooknr != NF_BR_BROUTING)
- memcpy(eth_hdr(*pskb)->h_dest,
- in->br_port->br->dev->dev_addr, ETH_ALEN);
+ if (xt_hooknum(par) != NF_BR_BROUTING)
+ /* rcu_read_lock()ed by nf_hook_thresh */
+ ether_addr_copy(eth_hdr(skb)->h_dest,
+ br_port_get_rcu(xt_in(par))->br->dev->dev_addr);
else
- memcpy(eth_hdr(*pskb)->h_dest, in->dev_addr, ETH_ALEN);
- (*pskb)->pkt_type = PACKET_HOST;
+ ether_addr_copy(eth_hdr(skb)->h_dest, xt_in(par)->dev_addr);
+ skb->pkt_type = PACKET_HOST;
return info->target;
}
-static int ebt_target_redirect_check(const char *tablename, unsigned int hookmask,
- const struct ebt_entry *e, void *data, unsigned int datalen)
+static int ebt_redirect_tg_check(const struct xt_tgchk_param *par)
{
- struct ebt_redirect_info *info = (struct ebt_redirect_info *)data;
+ const struct ebt_redirect_info *info = par->targinfo;
+ unsigned int hook_mask;
- if (datalen != EBT_ALIGN(sizeof(struct ebt_redirect_info)))
- return -EINVAL;
if (BASE_CHAIN && info->target == EBT_RETURN)
return -EINVAL;
- CLEAR_BASE_CHAIN_BIT;
- if ( (strcmp(tablename, "nat") || hookmask & ~(1 << NF_BR_PRE_ROUTING)) &&
- (strcmp(tablename, "broute") || hookmask & ~(1 << NF_BR_BROUTING)) )
+
+ hook_mask = par->hook_mask & ~(1 << NF_BR_NUMHOOKS);
+ if ((strcmp(par->table, "nat") != 0 ||
+ hook_mask & ~(1 << NF_BR_PRE_ROUTING)) &&
+ (strcmp(par->table, "broute") != 0 ||
+ hook_mask & ~(1 << NF_BR_BROUTING)))
return -EINVAL;
- if (INVALID_TARGET)
+ if (ebt_invalid_target(info->target))
return -EINVAL;
return 0;
}
-static struct ebt_target redirect_target =
-{
- .name = EBT_REDIRECT_TARGET,
- .target = ebt_target_redirect,
- .check = ebt_target_redirect_check,
+static struct xt_target ebt_redirect_tg_reg __read_mostly = {
+ .name = "redirect",
+ .revision = 0,
+ .family = NFPROTO_BRIDGE,
+ .hooks = (1 << NF_BR_NUMHOOKS) | (1 << NF_BR_PRE_ROUTING) |
+ (1 << NF_BR_BROUTING),
+ .target = ebt_redirect_tg,
+ .checkentry = ebt_redirect_tg_check,
+ .targetsize = sizeof(struct ebt_redirect_info),
.me = THIS_MODULE,
};
static int __init ebt_redirect_init(void)
{
- return ebt_register_target(&redirect_target);
+ return xt_register_target(&ebt_redirect_tg_reg);
}
static void __exit ebt_redirect_fini(void)
{
- ebt_unregister_target(&redirect_target);
+ xt_unregister_target(&ebt_redirect_tg_reg);
}
module_init(ebt_redirect_init);
module_exit(ebt_redirect_fini);
+MODULE_DESCRIPTION("Ebtables: Packet redirection to localhost");
MODULE_LICENSE("GPL");
diff --git a/net/bridge/netfilter/ebt_snat.c b/net/bridge/netfilter/ebt_snat.c
index cbb33e24ca8a..7dfbcdfc30e5 100644
--- a/net/bridge/netfilter/ebt_snat.c
+++ b/net/bridge/netfilter/ebt_snat.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* ebt_snat
*
@@ -7,70 +8,81 @@
* June, 2002
*
*/
-
-#include <linux/netfilter_bridge/ebtables.h>
-#include <linux/netfilter_bridge/ebt_nat.h>
#include <linux/module.h>
#include <net/sock.h>
+#include <linux/if_arp.h>
+#include <net/arp.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_bridge/ebtables.h>
+#include <linux/netfilter_bridge/ebt_nat.h>
-static int ebt_target_snat(struct sk_buff **pskb, unsigned int hooknr,
- const struct net_device *in, const struct net_device *out,
- const void *data, unsigned int datalen)
+static unsigned int
+ebt_snat_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
- struct ebt_nat_info *info = (struct ebt_nat_info *) data;
+ const struct ebt_nat_info *info = par->targinfo;
- if (skb_shared(*pskb) || skb_cloned(*pskb)) {
- struct sk_buff *nskb;
+ if (skb_ensure_writable(skb, 0))
+ return EBT_DROP;
- nskb = skb_copy(*pskb, GFP_ATOMIC);
- if (!nskb)
- return NF_DROP;
- if ((*pskb)->sk)
- skb_set_owner_w(nskb, (*pskb)->sk);
- kfree_skb(*pskb);
- *pskb = nskb;
+ ether_addr_copy(eth_hdr(skb)->h_source, info->mac);
+ if (!(info->target & NAT_ARP_BIT) &&
+ eth_hdr(skb)->h_proto == htons(ETH_P_ARP)) {
+ const struct arphdr *ap;
+ struct arphdr _ah;
+
+ ap = skb_header_pointer(skb, 0, sizeof(_ah), &_ah);
+ if (ap == NULL)
+ return EBT_DROP;
+ if (ap->ar_hln != ETH_ALEN)
+ goto out;
+ if (skb_store_bits(skb, sizeof(_ah), info->mac, ETH_ALEN))
+ return EBT_DROP;
}
- memcpy(eth_hdr(*pskb)->h_source, info->mac, ETH_ALEN);
- return info->target;
+out:
+ return info->target | ~EBT_VERDICT_BITS;
}
-static int ebt_target_snat_check(const char *tablename, unsigned int hookmask,
- const struct ebt_entry *e, void *data, unsigned int datalen)
+static int ebt_snat_tg_check(const struct xt_tgchk_param *par)
{
- struct ebt_nat_info *info = (struct ebt_nat_info *) data;
+ const struct ebt_nat_info *info = par->targinfo;
+ int tmp;
- if (datalen != EBT_ALIGN(sizeof(struct ebt_nat_info)))
- return -EINVAL;
- if (BASE_CHAIN && info->target == EBT_RETURN)
- return -EINVAL;
- CLEAR_BASE_CHAIN_BIT;
- if (strcmp(tablename, "nat"))
+ tmp = info->target | ~EBT_VERDICT_BITS;
+ if (BASE_CHAIN && tmp == EBT_RETURN)
return -EINVAL;
- if (hookmask & ~(1 << NF_BR_POST_ROUTING))
+
+ if (ebt_invalid_target(tmp))
return -EINVAL;
- if (INVALID_TARGET)
+ tmp = info->target | EBT_VERDICT_BITS;
+ if ((tmp & ~NAT_ARP_BIT) != ~NAT_ARP_BIT)
return -EINVAL;
return 0;
}
-static struct ebt_target snat =
-{
- .name = EBT_SNAT_TARGET,
- .target = ebt_target_snat,
- .check = ebt_target_snat_check,
+static struct xt_target ebt_snat_tg_reg __read_mostly = {
+ .name = "snat",
+ .revision = 0,
+ .family = NFPROTO_BRIDGE,
+ .table = "nat",
+ .hooks = (1 << NF_BR_NUMHOOKS) | (1 << NF_BR_POST_ROUTING),
+ .target = ebt_snat_tg,
+ .checkentry = ebt_snat_tg_check,
+ .targetsize = sizeof(struct ebt_nat_info),
.me = THIS_MODULE,
};
static int __init ebt_snat_init(void)
{
- return ebt_register_target(&snat);
+ return xt_register_target(&ebt_snat_tg_reg);
}
static void __exit ebt_snat_fini(void)
{
- ebt_unregister_target(&snat);
+ xt_unregister_target(&ebt_snat_tg_reg);
}
module_init(ebt_snat_init);
module_exit(ebt_snat_fini);
+MODULE_DESCRIPTION("Ebtables: Source MAC address translation");
MODULE_LICENSE("GPL");
diff --git a/net/bridge/netfilter/ebt_stp.c b/net/bridge/netfilter/ebt_stp.c
index a0bed82145ed..8f68afda5f81 100644
--- a/net/bridge/netfilter/ebt_stp.c
+++ b/net/bridge/netfilter/ebt_stp.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* ebt_stp
*
@@ -7,189 +8,187 @@
*
* July, 2003
*/
-
-#include <linux/netfilter_bridge/ebtables.h>
-#include <linux/netfilter_bridge/ebt_stp.h>
#include <linux/etherdevice.h>
#include <linux/module.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_bridge/ebtables.h>
+#include <linux/netfilter_bridge/ebt_stp.h>
#define BPDU_TYPE_CONFIG 0
-#define BPDU_TYPE_TCN 0x80
struct stp_header {
- uint8_t dsap;
- uint8_t ssap;
- uint8_t ctrl;
- uint8_t pid;
- uint8_t vers;
- uint8_t type;
+ u8 dsap;
+ u8 ssap;
+ u8 ctrl;
+ u8 pid;
+ u8 vers;
+ u8 type;
};
struct stp_config_pdu {
- uint8_t flags;
- uint8_t root[8];
- uint8_t root_cost[4];
- uint8_t sender[8];
- uint8_t port[2];
- uint8_t msg_age[2];
- uint8_t max_age[2];
- uint8_t hello_time[2];
- uint8_t forward_delay[2];
+ u8 flags;
+ u8 root[8];
+ u8 root_cost[4];
+ u8 sender[8];
+ u8 port[2];
+ u8 msg_age[2];
+ u8 max_age[2];
+ u8 hello_time[2];
+ u8 forward_delay[2];
};
#define NR16(p) (p[0] << 8 | p[1])
#define NR32(p) ((p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3])
-static int ebt_filter_config(struct ebt_stp_info *info,
- struct stp_config_pdu *stpc)
+static bool ebt_filter_config(const struct ebt_stp_info *info,
+ const struct stp_config_pdu *stpc)
{
- struct ebt_stp_config_info *c;
- uint16_t v16;
- uint32_t v32;
- int verdict, i;
+ const struct ebt_stp_config_info *c;
+ u16 v16;
+ u32 v32;
c = &info->config;
if ((info->bitmask & EBT_STP_FLAGS) &&
- FWINV(c->flags != stpc->flags, EBT_STP_FLAGS))
- return EBT_NOMATCH;
+ NF_INVF(info, EBT_STP_FLAGS, c->flags != stpc->flags))
+ return false;
if (info->bitmask & EBT_STP_ROOTPRIO) {
v16 = NR16(stpc->root);
- if (FWINV(v16 < c->root_priol ||
- v16 > c->root_priou, EBT_STP_ROOTPRIO))
- return EBT_NOMATCH;
+ if (NF_INVF(info, EBT_STP_ROOTPRIO,
+ v16 < c->root_priol || v16 > c->root_priou))
+ return false;
}
if (info->bitmask & EBT_STP_ROOTADDR) {
- verdict = 0;
- for (i = 0; i < 6; i++)
- verdict |= (stpc->root[2+i] ^ c->root_addr[i]) &
- c->root_addrmsk[i];
- if (FWINV(verdict != 0, EBT_STP_ROOTADDR))
- return EBT_NOMATCH;
+ if (NF_INVF(info, EBT_STP_ROOTADDR,
+ !ether_addr_equal_masked(&stpc->root[2],
+ c->root_addr,
+ c->root_addrmsk)))
+ return false;
}
if (info->bitmask & EBT_STP_ROOTCOST) {
v32 = NR32(stpc->root_cost);
- if (FWINV(v32 < c->root_costl ||
- v32 > c->root_costu, EBT_STP_ROOTCOST))
- return EBT_NOMATCH;
+ if (NF_INVF(info, EBT_STP_ROOTCOST,
+ v32 < c->root_costl || v32 > c->root_costu))
+ return false;
}
if (info->bitmask & EBT_STP_SENDERPRIO) {
v16 = NR16(stpc->sender);
- if (FWINV(v16 < c->sender_priol ||
- v16 > c->sender_priou, EBT_STP_SENDERPRIO))
- return EBT_NOMATCH;
+ if (NF_INVF(info, EBT_STP_SENDERPRIO,
+ v16 < c->sender_priol || v16 > c->sender_priou))
+ return false;
}
if (info->bitmask & EBT_STP_SENDERADDR) {
- verdict = 0;
- for (i = 0; i < 6; i++)
- verdict |= (stpc->sender[2+i] ^ c->sender_addr[i]) &
- c->sender_addrmsk[i];
- if (FWINV(verdict != 0, EBT_STP_SENDERADDR))
- return EBT_NOMATCH;
+ if (NF_INVF(info, EBT_STP_SENDERADDR,
+ !ether_addr_equal_masked(&stpc->sender[2],
+ c->sender_addr,
+ c->sender_addrmsk)))
+ return false;
}
if (info->bitmask & EBT_STP_PORT) {
v16 = NR16(stpc->port);
- if (FWINV(v16 < c->portl ||
- v16 > c->portu, EBT_STP_PORT))
- return EBT_NOMATCH;
+ if (NF_INVF(info, EBT_STP_PORT,
+ v16 < c->portl || v16 > c->portu))
+ return false;
}
if (info->bitmask & EBT_STP_MSGAGE) {
v16 = NR16(stpc->msg_age);
- if (FWINV(v16 < c->msg_agel ||
- v16 > c->msg_ageu, EBT_STP_MSGAGE))
- return EBT_NOMATCH;
+ if (NF_INVF(info, EBT_STP_MSGAGE,
+ v16 < c->msg_agel || v16 > c->msg_ageu))
+ return false;
}
if (info->bitmask & EBT_STP_MAXAGE) {
v16 = NR16(stpc->max_age);
- if (FWINV(v16 < c->max_agel ||
- v16 > c->max_ageu, EBT_STP_MAXAGE))
- return EBT_NOMATCH;
+ if (NF_INVF(info, EBT_STP_MAXAGE,
+ v16 < c->max_agel || v16 > c->max_ageu))
+ return false;
}
if (info->bitmask & EBT_STP_HELLOTIME) {
v16 = NR16(stpc->hello_time);
- if (FWINV(v16 < c->hello_timel ||
- v16 > c->hello_timeu, EBT_STP_HELLOTIME))
- return EBT_NOMATCH;
+ if (NF_INVF(info, EBT_STP_HELLOTIME,
+ v16 < c->hello_timel || v16 > c->hello_timeu))
+ return false;
}
if (info->bitmask & EBT_STP_FWDD) {
v16 = NR16(stpc->forward_delay);
- if (FWINV(v16 < c->forward_delayl ||
- v16 > c->forward_delayu, EBT_STP_FWDD))
- return EBT_NOMATCH;
+ if (NF_INVF(info, EBT_STP_FWDD,
+ v16 < c->forward_delayl || v16 > c->forward_delayu))
+ return false;
}
- return EBT_MATCH;
+ return true;
}
-static int ebt_filter_stp(const struct sk_buff *skb, const struct net_device *in,
- const struct net_device *out, const void *data, unsigned int datalen)
+static bool
+ebt_stp_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
- struct ebt_stp_info *info = (struct ebt_stp_info *)data;
- struct stp_header _stph, *sp;
- uint8_t header[6] = {0x42, 0x42, 0x03, 0x00, 0x00, 0x00};
+ const struct ebt_stp_info *info = par->matchinfo;
+ const struct stp_header *sp;
+ struct stp_header _stph;
+ const u8 header[6] = {0x42, 0x42, 0x03, 0x00, 0x00, 0x00};
sp = skb_header_pointer(skb, 0, sizeof(_stph), &_stph);
if (sp == NULL)
- return EBT_NOMATCH;
+ return false;
/* The stp code only considers these */
if (memcmp(sp, header, sizeof(header)))
- return EBT_NOMATCH;
+ return false;
- if (info->bitmask & EBT_STP_TYPE
- && FWINV(info->type != sp->type, EBT_STP_TYPE))
- return EBT_NOMATCH;
+ if ((info->bitmask & EBT_STP_TYPE) &&
+ NF_INVF(info, EBT_STP_TYPE, info->type != sp->type))
+ return false;
if (sp->type == BPDU_TYPE_CONFIG &&
info->bitmask & EBT_STP_CONFIG_MASK) {
- struct stp_config_pdu _stpc, *st;
+ const struct stp_config_pdu *st;
+ struct stp_config_pdu _stpc;
st = skb_header_pointer(skb, sizeof(_stph),
sizeof(_stpc), &_stpc);
if (st == NULL)
- return EBT_NOMATCH;
+ return false;
return ebt_filter_config(info, st);
}
- return EBT_MATCH;
+ return true;
}
-static int ebt_stp_check(const char *tablename, unsigned int hookmask,
- const struct ebt_entry *e, void *data, unsigned int datalen)
+static int ebt_stp_mt_check(const struct xt_mtchk_param *par)
{
- struct ebt_stp_info *info = (struct ebt_stp_info *)data;
- int len = EBT_ALIGN(sizeof(struct ebt_stp_info));
- uint8_t bridge_ula[6] = { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x00 };
- uint8_t msk[6] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
+ const struct ebt_stp_info *info = par->matchinfo;
+ const struct ebt_entry *e = par->entryinfo;
if (info->bitmask & ~EBT_STP_MASK || info->invflags & ~EBT_STP_MASK ||
!(info->bitmask & EBT_STP_MASK))
return -EINVAL;
- if (datalen != len)
- return -EINVAL;
/* Make sure the match only receives stp frames */
- if (compare_ether_addr(e->destmac, bridge_ula) ||
- compare_ether_addr(e->destmsk, msk) || !(e->bitmask & EBT_DESTMAC))
+ if (!par->nft_compat &&
+ (!ether_addr_equal(e->destmac, eth_stp_addr) ||
+ !(e->bitmask & EBT_DESTMAC) ||
+ !is_broadcast_ether_addr(e->destmsk)))
return -EINVAL;
return 0;
}
-static struct ebt_match filter_stp =
-{
- .name = EBT_STP_MATCH,
- .match = ebt_filter_stp,
- .check = ebt_stp_check,
+static struct xt_match ebt_stp_mt_reg __read_mostly = {
+ .name = "stp",
+ .revision = 0,
+ .family = NFPROTO_BRIDGE,
+ .match = ebt_stp_mt,
+ .checkentry = ebt_stp_mt_check,
+ .matchsize = sizeof(struct ebt_stp_info),
.me = THIS_MODULE,
};
static int __init ebt_stp_init(void)
{
- return ebt_register_match(&filter_stp);
+ return xt_register_match(&ebt_stp_mt_reg);
}
static void __exit ebt_stp_fini(void)
{
- ebt_unregister_match(&filter_stp);
+ xt_unregister_match(&ebt_stp_mt_reg);
}
module_init(ebt_stp_init);
module_exit(ebt_stp_fini);
+MODULE_DESCRIPTION("Ebtables: Spanning Tree Protocol packet match");
MODULE_LICENSE("GPL");
diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c
deleted file mode 100644
index c1af68b5a29c..000000000000
--- a/net/bridge/netfilter/ebt_ulog.c
+++ /dev/null
@@ -1,347 +0,0 @@
-/*
- * netfilter module for userspace bridged Ethernet frames logging daemons
- *
- * Authors:
- * Bart De Schuymer <bdschuym@pandora.be>
- * Harald Welte <laforge@netfilter.org>
- *
- * November, 2004
- *
- * Based on ipt_ULOG.c, which is
- * (C) 2000-2002 by Harald Welte <laforge@netfilter.org>
- *
- * This module accepts two parameters:
- *
- * nlbufsiz:
- * The parameter specifies how big the buffer for each netlink multicast
- * group is. e.g. If you say nlbufsiz=8192, up to eight kb of packets will
- * get accumulated in the kernel until they are sent to userspace. It is
- * NOT possible to allocate more than 128kB, and it is strongly discouraged,
- * because atomically allocating 128kB inside the network rx softirq is not
- * reliable. Please also keep in mind that this buffer size is allocated for
- * each nlgroup you are using, so the total kernel memory usage increases
- * by that factor.
- *
- * flushtimeout:
- * Specify, after how many hundredths of a second the queue should be
- * flushed even if it is not full yet.
- *
- */
-
-#include <linux/module.h>
-#include <linux/spinlock.h>
-#include <linux/socket.h>
-#include <linux/skbuff.h>
-#include <linux/kernel.h>
-#include <linux/timer.h>
-#include <linux/netlink.h>
-#include <linux/netdevice.h>
-#include <linux/module.h>
-#include <linux/netfilter_bridge/ebtables.h>
-#include <linux/netfilter_bridge/ebt_ulog.h>
-#include <net/sock.h>
-#include "../br_private.h"
-
-#define PRINTR(format, args...) do { if (net_ratelimit()) \
- printk(format , ## args); } while (0)
-
-static unsigned int nlbufsiz = NLMSG_GOODSIZE;
-module_param(nlbufsiz, uint, 0600);
-MODULE_PARM_DESC(nlbufsiz, "netlink buffer size (number of bytes) "
- "(defaults to 4096)");
-
-static unsigned int flushtimeout = 10;
-module_param(flushtimeout, uint, 0600);
-MODULE_PARM_DESC(flushtimeout, "buffer flush timeout (hundredths ofa second) "
- "(defaults to 10)");
-
-typedef struct {
- unsigned int qlen; /* number of nlmsgs' in the skb */
- struct nlmsghdr *lastnlh; /* netlink header of last msg in skb */
- struct sk_buff *skb; /* the pre-allocated skb */
- struct timer_list timer; /* the timer function */
- spinlock_t lock; /* the per-queue lock */
-} ebt_ulog_buff_t;
-
-static ebt_ulog_buff_t ulog_buffers[EBT_ULOG_MAXNLGROUPS];
-static struct sock *ebtulognl;
-
-/* send one ulog_buff_t to userspace */
-static void ulog_send(unsigned int nlgroup)
-{
- ebt_ulog_buff_t *ub = &ulog_buffers[nlgroup];
-
- if (timer_pending(&ub->timer))
- del_timer(&ub->timer);
-
- if (!ub->skb)
- return;
-
- /* last nlmsg needs NLMSG_DONE */
- if (ub->qlen > 1)
- ub->lastnlh->nlmsg_type = NLMSG_DONE;
-
- NETLINK_CB(ub->skb).dst_group = nlgroup + 1;
- netlink_broadcast(ebtulognl, ub->skb, 0, nlgroup + 1, GFP_ATOMIC);
-
- ub->qlen = 0;
- ub->skb = NULL;
-}
-
-/* timer function to flush queue in flushtimeout time */
-static void ulog_timer(unsigned long data)
-{
- spin_lock_bh(&ulog_buffers[data].lock);
- if (ulog_buffers[data].skb)
- ulog_send(data);
- spin_unlock_bh(&ulog_buffers[data].lock);
-}
-
-static struct sk_buff *ulog_alloc_skb(unsigned int size)
-{
- struct sk_buff *skb;
- unsigned int n;
-
- n = max(size, nlbufsiz);
- skb = alloc_skb(n, GFP_ATOMIC);
- if (!skb) {
- PRINTR(KERN_ERR "ebt_ulog: can't alloc whole buffer "
- "of size %ub!\n", n);
- if (n > size) {
- /* try to allocate only as much as we need for
- * current packet */
- skb = alloc_skb(size, GFP_ATOMIC);
- if (!skb)
- PRINTR(KERN_ERR "ebt_ulog: can't even allocate "
- "buffer of size %ub\n", size);
- }
- }
-
- return skb;
-}
-
-static void ebt_ulog_packet(unsigned int hooknr, const struct sk_buff *skb,
- const struct net_device *in, const struct net_device *out,
- const struct ebt_ulog_info *uloginfo, const char *prefix)
-{
- ebt_ulog_packet_msg_t *pm;
- size_t size, copy_len;
- struct nlmsghdr *nlh;
- unsigned int group = uloginfo->nlgroup;
- ebt_ulog_buff_t *ub = &ulog_buffers[group];
- spinlock_t *lock = &ub->lock;
-
- if ((uloginfo->cprange == 0) ||
- (uloginfo->cprange > skb->len + ETH_HLEN))
- copy_len = skb->len + ETH_HLEN;
- else
- copy_len = uloginfo->cprange;
-
- size = NLMSG_SPACE(sizeof(*pm) + copy_len);
- if (size > nlbufsiz) {
- PRINTR("ebt_ulog: Size %Zd needed, but nlbufsiz=%d\n",
- size, nlbufsiz);
- return;
- }
-
- spin_lock_bh(lock);
-
- if (!ub->skb) {
- if (!(ub->skb = ulog_alloc_skb(size)))
- goto alloc_failure;
- } else if (size > skb_tailroom(ub->skb)) {
- ulog_send(group);
-
- if (!(ub->skb = ulog_alloc_skb(size)))
- goto alloc_failure;
- }
-
- nlh = NLMSG_PUT(ub->skb, 0, ub->qlen, 0,
- size - NLMSG_ALIGN(sizeof(*nlh)));
- ub->qlen++;
-
- pm = NLMSG_DATA(nlh);
-
- /* Fill in the ulog data */
- pm->version = EBT_ULOG_VERSION;
- do_gettimeofday(&pm->stamp);
- if (ub->qlen == 1)
- skb_set_timestamp(ub->skb, &pm->stamp);
- pm->data_len = copy_len;
- pm->mark = skb->mark;
- pm->hook = hooknr;
- if (uloginfo->prefix != NULL)
- strcpy(pm->prefix, uloginfo->prefix);
- else
- *(pm->prefix) = '\0';
-
- if (in) {
- strcpy(pm->physindev, in->name);
- /* If in isn't a bridge, then physindev==indev */
- if (in->br_port)
- strcpy(pm->indev, in->br_port->br->dev->name);
- else
- strcpy(pm->indev, in->name);
- } else
- pm->indev[0] = pm->physindev[0] = '\0';
-
- if (out) {
- /* If out exists, then out is a bridge port */
- strcpy(pm->physoutdev, out->name);
- strcpy(pm->outdev, out->br_port->br->dev->name);
- } else
- pm->outdev[0] = pm->physoutdev[0] = '\0';
-
- if (skb_copy_bits(skb, -ETH_HLEN, pm->data, copy_len) < 0)
- BUG();
-
- if (ub->qlen > 1)
- ub->lastnlh->nlmsg_flags |= NLM_F_MULTI;
-
- ub->lastnlh = nlh;
-
- if (ub->qlen >= uloginfo->qthreshold)
- ulog_send(group);
- else if (!timer_pending(&ub->timer)) {
- ub->timer.expires = jiffies + flushtimeout * HZ / 100;
- add_timer(&ub->timer);
- }
-
-unlock:
- spin_unlock_bh(lock);
-
- return;
-
-nlmsg_failure:
- printk(KERN_CRIT "ebt_ulog: error during NLMSG_PUT. This should "
- "not happen, please report to author.\n");
- goto unlock;
-alloc_failure:
- goto unlock;
-}
-
-/* this function is registered with the netfilter core */
-static void ebt_log_packet(unsigned int pf, unsigned int hooknum,
- const struct sk_buff *skb, const struct net_device *in,
- const struct net_device *out, const struct nf_loginfo *li,
- const char *prefix)
-{
- struct ebt_ulog_info loginfo;
-
- if (!li || li->type != NF_LOG_TYPE_ULOG) {
- loginfo.nlgroup = EBT_ULOG_DEFAULT_NLGROUP;
- loginfo.cprange = 0;
- loginfo.qthreshold = EBT_ULOG_DEFAULT_QTHRESHOLD;
- loginfo.prefix[0] = '\0';
- } else {
- loginfo.nlgroup = li->u.ulog.group;
- loginfo.cprange = li->u.ulog.copy_len;
- loginfo.qthreshold = li->u.ulog.qthreshold;
- strlcpy(loginfo.prefix, prefix, sizeof(loginfo.prefix));
- }
-
- ebt_ulog_packet(hooknum, skb, in, out, &loginfo, prefix);
-}
-
-static void ebt_ulog(const struct sk_buff *skb, unsigned int hooknr,
- const struct net_device *in, const struct net_device *out,
- const void *data, unsigned int datalen)
-{
- struct ebt_ulog_info *uloginfo = (struct ebt_ulog_info *)data;
-
- ebt_ulog_packet(hooknr, skb, in, out, uloginfo, NULL);
-}
-
-
-static int ebt_ulog_check(const char *tablename, unsigned int hookmask,
- const struct ebt_entry *e, void *data, unsigned int datalen)
-{
- struct ebt_ulog_info *uloginfo = (struct ebt_ulog_info *)data;
-
- if (datalen != EBT_ALIGN(sizeof(struct ebt_ulog_info)) ||
- uloginfo->nlgroup > 31)
- return -EINVAL;
-
- uloginfo->prefix[EBT_ULOG_PREFIX_LEN - 1] = '\0';
-
- if (uloginfo->qthreshold > EBT_ULOG_MAX_QLEN)
- uloginfo->qthreshold = EBT_ULOG_MAX_QLEN;
-
- return 0;
-}
-
-static struct ebt_watcher ulog = {
- .name = EBT_ULOG_WATCHER,
- .watcher = ebt_ulog,
- .check = ebt_ulog_check,
- .me = THIS_MODULE,
-};
-
-static struct nf_logger ebt_ulog_logger = {
- .name = EBT_ULOG_WATCHER,
- .logfn = &ebt_log_packet,
- .me = THIS_MODULE,
-};
-
-static int __init ebt_ulog_init(void)
-{
- int i, ret = 0;
-
- if (nlbufsiz >= 128*1024) {
- printk(KERN_NOTICE "ebt_ulog: Netlink buffer has to be <= 128kB,"
- " please try a smaller nlbufsiz parameter.\n");
- return -EINVAL;
- }
-
- /* initialize ulog_buffers */
- for (i = 0; i < EBT_ULOG_MAXNLGROUPS; i++) {
- init_timer(&ulog_buffers[i].timer);
- ulog_buffers[i].timer.function = ulog_timer;
- ulog_buffers[i].timer.data = i;
- spin_lock_init(&ulog_buffers[i].lock);
- }
-
- ebtulognl = netlink_kernel_create(NETLINK_NFLOG, EBT_ULOG_MAXNLGROUPS,
- NULL, THIS_MODULE);
- if (!ebtulognl)
- ret = -ENOMEM;
- else if ((ret = ebt_register_watcher(&ulog)))
- sock_release(ebtulognl->sk_socket);
-
- if (nf_log_register(PF_BRIDGE, &ebt_ulog_logger) < 0) {
- printk(KERN_WARNING "ebt_ulog: not logging via ulog "
- "since somebody else already registered for PF_BRIDGE\n");
- /* we cannot make module load fail here, since otherwise
- * ebtables userspace would abort */
- }
-
- return ret;
-}
-
-static void __exit ebt_ulog_fini(void)
-{
- ebt_ulog_buff_t *ub;
- int i;
-
- nf_log_unregister_logger(&ebt_ulog_logger);
- ebt_unregister_watcher(&ulog);
- for (i = 0; i < EBT_ULOG_MAXNLGROUPS; i++) {
- ub = &ulog_buffers[i];
- if (timer_pending(&ub->timer))
- del_timer(&ub->timer);
- spin_lock_bh(&ub->lock);
- if (ub->skb) {
- kfree_skb(ub->skb);
- ub->skb = NULL;
- }
- spin_unlock_bh(&ub->lock);
- }
- sock_release(ebtulognl->sk_socket);
-}
-
-module_init(ebt_ulog_init);
-module_exit(ebt_ulog_fini);
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Bart De Schuymer <bdschuym@pandora.be>");
-MODULE_DESCRIPTION("ebtables userspace logging module for bridged Ethernet"
- " frames");
diff --git a/net/bridge/netfilter/ebt_vlan.c b/net/bridge/netfilter/ebt_vlan.c
index 7ee377622964..80ede370afed 100644
--- a/net/bridge/netfilter/ebt_vlan.c
+++ b/net/bridge/netfilter/ebt_vlan.c
@@ -1,55 +1,31 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Description: EBTables 802.1Q match extension kernelspace module.
* Authors: Nick Fedchik <nick@fedchik.org.ua>
* Bart De Schuymer <bdschuym@pandora.be>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
+#include <linux/netfilter/x_tables.h>
#include <linux/netfilter_bridge/ebtables.h>
#include <linux/netfilter_bridge/ebt_vlan.h>
-static int debug;
#define MODULE_VERS "0.6"
-module_param(debug, int, 0);
-MODULE_PARM_DESC(debug, "debug=1 is turn on debug messages");
MODULE_AUTHOR("Nick Fedchik <nick@fedchik.org.ua>");
-MODULE_DESCRIPTION("802.1Q match module (ebtables extension), v"
- MODULE_VERS);
+MODULE_DESCRIPTION("Ebtables: 802.1Q VLAN tag match");
MODULE_LICENSE("GPL");
-
-#define DEBUG_MSG(args...) if (debug) printk (KERN_DEBUG "ebt_vlan: " args)
-#define INV_FLAG(_inv_flag_) (info->invflags & _inv_flag_) ? "!" : ""
#define GET_BITMASK(_BIT_MASK_) info->bitmask & _BIT_MASK_
-#define SET_BITMASK(_BIT_MASK_) info->bitmask |= _BIT_MASK_
-#define EXIT_ON_MISMATCH(_MATCH_,_MASK_) {if (!((info->_MATCH_ == _MATCH_)^!!(info->invflags & _MASK_))) return EBT_NOMATCH;}
-
-static int
-ebt_filter_vlan(const struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- const void *data, unsigned int datalen)
+#define EXIT_ON_MISMATCH(_MATCH_,_MASK_) {if (!((info->_MATCH_ == _MATCH_)^!!(info->invflags & _MASK_))) return false; }
+
+static bool
+ebt_vlan_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
- struct ebt_vlan_info *info = (struct ebt_vlan_info *) data;
- struct vlan_hdr _frame, *fp;
+ const struct ebt_vlan_info *info = par->matchinfo;
unsigned short TCI; /* Whole TCI, given from parsed frame */
unsigned short id; /* VLAN ID, given from frame TCI */
@@ -57,9 +33,20 @@ ebt_filter_vlan(const struct sk_buff *skb,
/* VLAN encapsulated Type/Length field, given from orig frame */
__be16 encap;
- fp = skb_header_pointer(skb, 0, sizeof(_frame), &_frame);
- if (fp == NULL)
- return EBT_NOMATCH;
+ if (skb_vlan_tag_present(skb)) {
+ TCI = skb_vlan_tag_get(skb);
+ encap = skb->protocol;
+ } else {
+ const struct vlan_hdr *fp;
+ struct vlan_hdr _frame;
+
+ fp = skb_header_pointer(skb, 0, sizeof(_frame), &_frame);
+ if (fp == NULL)
+ return false;
+
+ TCI = ntohs(fp->h_vlan_TCI);
+ encap = fp->h_vlan_encapsulated_proto;
+ }
/* Tag Control Information (TCI) consists of the following elements:
* - User_priority. The user_priority field is three bits in length,
@@ -67,11 +54,10 @@ ebt_filter_vlan(const struct sk_buff *skb,
* - Canonical Format Indicator (CFI). The Canonical Format Indicator
* (CFI) is a single bit flag value. Currently ignored.
* - VLAN Identifier (VID). The VID is encoded as
- * an unsigned binary number. */
- TCI = ntohs(fp->h_vlan_TCI);
+ * an unsigned binary number.
+ */
id = TCI & VLAN_VID_MASK;
prio = (TCI >> 13) & 0x7;
- encap = fp->h_vlan_encapsulated_proto;
/* Checking VLAN Identifier (VID) */
if (GET_BITMASK(EBT_VLAN_ID))
@@ -85,65 +71,56 @@ ebt_filter_vlan(const struct sk_buff *skb,
if (GET_BITMASK(EBT_VLAN_ENCAP))
EXIT_ON_MISMATCH(encap, EBT_VLAN_ENCAP);
- return EBT_MATCH;
+ return true;
}
-static int
-ebt_check_vlan(const char *tablename,
- unsigned int hooknr,
- const struct ebt_entry *e, void *data, unsigned int datalen)
+static int ebt_vlan_mt_check(const struct xt_mtchk_param *par)
{
- struct ebt_vlan_info *info = (struct ebt_vlan_info *) data;
-
- /* Parameters buffer overflow check */
- if (datalen != EBT_ALIGN(sizeof(struct ebt_vlan_info))) {
- DEBUG_MSG
- ("passed size %d is not eq to ebt_vlan_info (%Zd)\n",
- datalen, sizeof(struct ebt_vlan_info));
- return -EINVAL;
- }
+ struct ebt_vlan_info *info = par->matchinfo;
+ const struct ebt_entry *e = par->entryinfo;
/* Is it 802.1Q frame checked? */
if (e->ethproto != htons(ETH_P_8021Q)) {
- DEBUG_MSG
- ("passed entry proto %2.4X is not 802.1Q (8100)\n",
- (unsigned short) ntohs(e->ethproto));
+ pr_debug("passed entry proto %2.4X is not 802.1Q (8100)\n",
+ ntohs(e->ethproto));
return -EINVAL;
}
/* Check for bitmask range
- * True if even one bit is out of mask */
+ * True if even one bit is out of mask
+ */
if (info->bitmask & ~EBT_VLAN_MASK) {
- DEBUG_MSG("bitmask %2X is out of mask (%2X)\n",
- info->bitmask, EBT_VLAN_MASK);
+ pr_debug("bitmask %2X is out of mask (%2X)\n",
+ info->bitmask, EBT_VLAN_MASK);
return -EINVAL;
}
/* Check for inversion flags range */
if (info->invflags & ~EBT_VLAN_MASK) {
- DEBUG_MSG("inversion flags %2X is out of mask (%2X)\n",
- info->invflags, EBT_VLAN_MASK);
+ pr_debug("inversion flags %2X is out of mask (%2X)\n",
+ info->invflags, EBT_VLAN_MASK);
return -EINVAL;
}
/* Reserved VLAN ID (VID) values
* -----------------------------
- * 0 - The null VLAN ID.
+ * 0 - The null VLAN ID.
* 1 - The default Port VID (PVID)
- * 0x0FFF - Reserved for implementation use.
- * if_vlan.h: VLAN_GROUP_ARRAY_LEN 4096. */
+ * 0x0FFF - Reserved for implementation use.
+ * if_vlan.h: VLAN_N_VID 4096.
+ */
if (GET_BITMASK(EBT_VLAN_ID)) {
if (!!info->id) { /* if id!=0 => check vid range */
- if (info->id > VLAN_GROUP_ARRAY_LEN) {
- DEBUG_MSG
- ("id %d is out of range (1-4096)\n",
- info->id);
+ if (info->id > VLAN_N_VID) {
+ pr_debug("id %d is out of range (1-4096)\n",
+ info->id);
return -EINVAL;
}
/* Note: This is valid VLAN-tagged frame point.
- * Any value of user_priority are acceptable,
+ * Any value of user_priority are acceptable,
* but should be ignored according to 802.1Q Std.
- * So we just drop the prio flag. */
+ * So we just drop the prio flag.
+ */
info->bitmask &= ~EBT_VLAN_PRIO;
}
/* Else, id=0 (null VLAN ID) => user_priority range (any?) */
@@ -151,19 +128,19 @@ ebt_check_vlan(const char *tablename,
if (GET_BITMASK(EBT_VLAN_PRIO)) {
if ((unsigned char) info->prio > 7) {
- DEBUG_MSG("prio %d is out of range (0-7)\n",
- info->prio);
+ pr_debug("prio %d is out of range (0-7)\n",
+ info->prio);
return -EINVAL;
}
}
/* Check for encapsulated proto range - it is possible to be
* any value for u_short range.
- * if_ether.h: ETH_ZLEN 60 - Min. octets in frame sans FCS */
+ * if_ether.h: ETH_ZLEN 60 - Min. octets in frame sans FCS
+ */
if (GET_BITMASK(EBT_VLAN_ENCAP)) {
if ((unsigned short) ntohs(info->encap) < ETH_ZLEN) {
- DEBUG_MSG
- ("encap frame length %d is less than minimal\n",
- ntohs(info->encap));
+ pr_debug("encap frame length %d is less than "
+ "minimal\n", ntohs(info->encap));
return -EINVAL;
}
}
@@ -171,24 +148,25 @@ ebt_check_vlan(const char *tablename,
return 0;
}
-static struct ebt_match filter_vlan = {
- .name = EBT_VLAN_MATCH,
- .match = ebt_filter_vlan,
- .check = ebt_check_vlan,
+static struct xt_match ebt_vlan_mt_reg __read_mostly = {
+ .name = "vlan",
+ .revision = 0,
+ .family = NFPROTO_BRIDGE,
+ .match = ebt_vlan_mt,
+ .checkentry = ebt_vlan_mt_check,
+ .matchsize = sizeof(struct ebt_vlan_info),
.me = THIS_MODULE,
};
static int __init ebt_vlan_init(void)
{
- DEBUG_MSG("ebtables 802.1Q extension module v"
- MODULE_VERS "\n");
- DEBUG_MSG("module debug=%d\n", !!debug);
- return ebt_register_match(&filter_vlan);
+ pr_debug("ebtables 802.1Q extension module v" MODULE_VERS "\n");
+ return xt_register_match(&ebt_vlan_mt_reg);
}
static void __exit ebt_vlan_fini(void)
{
- ebt_unregister_match(&filter_vlan);
+ xt_unregister_match(&ebt_vlan_mt_reg);
}
module_init(ebt_vlan_init);
diff --git a/net/bridge/netfilter/ebtable_broute.c b/net/bridge/netfilter/ebtable_broute.c
index 9a6e548e148b..741360219552 100644
--- a/net/bridge/netfilter/ebtable_broute.c
+++ b/net/bridge/netfilter/ebtable_broute.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* ebtable_broute
*
@@ -15,6 +16,8 @@
#include <linux/module.h>
#include <linux/if_bridge.h>
+#include "../br_private.h"
+
/* EBT_ACCEPT means the frame will be bridged
* EBT_DROP means the frame will be routed
*/
@@ -23,8 +26,7 @@ static struct ebt_entries initial_chain = {
.policy = EBT_ACCEPT,
};
-static struct ebt_replace initial_table =
-{
+static struct ebt_replace_kernel initial_table = {
.name = "broute",
.valid_hooks = 1 << NF_BR_BROUTING,
.entries_size = sizeof(struct ebt_entries),
@@ -34,53 +36,103 @@ static struct ebt_replace initial_table =
.entries = (char *)&initial_chain,
};
-static int check(const struct ebt_table_info *info, unsigned int valid_hooks)
-{
- if (valid_hooks & ~(1 << NF_BR_BROUTING))
- return -EINVAL;
- return 0;
-}
-
-static struct ebt_table broute_table =
-{
+static const struct ebt_table broute_table = {
.name = "broute",
.table = &initial_table,
.valid_hooks = 1 << NF_BR_BROUTING,
- .lock = RW_LOCK_UNLOCKED,
- .check = check,
.me = THIS_MODULE,
};
-static int ebt_broute(struct sk_buff **pskb)
+static unsigned int ebt_broute(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *s)
{
+ struct net_bridge_port *p = br_port_get_rcu(skb->dev);
+ struct nf_hook_state state;
+ unsigned char *dest;
int ret;
- ret = ebt_do_table(NF_BR_BROUTING, pskb, (*pskb)->dev, NULL,
- &broute_table);
- if (ret == NF_DROP)
- return 1; /* route it */
- return 0; /* bridge it */
+ if (!p || p->state != BR_STATE_FORWARDING)
+ return NF_ACCEPT;
+
+ nf_hook_state_init(&state, NF_BR_BROUTING,
+ NFPROTO_BRIDGE, s->in, NULL, NULL,
+ s->net, NULL);
+
+ ret = ebt_do_table(priv, skb, &state);
+ if (ret != NF_DROP)
+ return ret;
+
+ /* DROP in ebtables -t broute means that the
+ * skb should be routed, not bridged.
+ * This is awkward, but can't be changed for compatibility
+ * reasons.
+ *
+ * We map DROP to ACCEPT and set the ->br_netfilter_broute flag.
+ */
+ BR_INPUT_SKB_CB(skb)->br_netfilter_broute = 1;
+
+ /* undo PACKET_HOST mangling done in br_input in case the dst
+ * address matches the logical bridge but not the port.
+ */
+ dest = eth_hdr(skb)->h_dest;
+ if (skb->pkt_type == PACKET_HOST &&
+ !ether_addr_equal(skb->dev->dev_addr, dest) &&
+ ether_addr_equal(p->br->dev->dev_addr, dest))
+ skb->pkt_type = PACKET_OTHERHOST;
+
+ return NF_ACCEPT;
}
+static const struct nf_hook_ops ebt_ops_broute = {
+ .hook = ebt_broute,
+ .pf = NFPROTO_BRIDGE,
+ .hooknum = NF_BR_PRE_ROUTING,
+ .priority = NF_BR_PRI_FIRST,
+};
+
+static int broute_table_init(struct net *net)
+{
+ return ebt_register_table(net, &broute_table, &ebt_ops_broute);
+}
+
+static void __net_exit broute_net_pre_exit(struct net *net)
+{
+ ebt_unregister_table_pre_exit(net, "broute");
+}
+
+static void __net_exit broute_net_exit(struct net *net)
+{
+ ebt_unregister_table(net, "broute");
+}
+
+static struct pernet_operations broute_net_ops = {
+ .exit = broute_net_exit,
+ .pre_exit = broute_net_pre_exit,
+};
+
static int __init ebtable_broute_init(void)
{
- int ret;
+ int ret = ebt_register_template(&broute_table, broute_table_init);
- ret = ebt_register_table(&broute_table);
- if (ret < 0)
+ if (ret)
return ret;
- /* see br_input.c */
- br_should_route_hook = ebt_broute;
- return ret;
+
+ ret = register_pernet_subsys(&broute_net_ops);
+ if (ret) {
+ ebt_unregister_template(&broute_table);
+ return ret;
+ }
+
+ return 0;
}
static void __exit ebtable_broute_fini(void)
{
- br_should_route_hook = NULL;
- synchronize_net();
- ebt_unregister_table(&broute_table);
+ unregister_pernet_subsys(&broute_net_ops);
+ ebt_unregister_template(&broute_table);
}
module_init(ebtable_broute_init);
module_exit(ebtable_broute_fini);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Force packets to be routed instead of bridged");
diff --git a/net/bridge/netfilter/ebtable_filter.c b/net/bridge/netfilter/ebtable_filter.c
index 3d5bd44f2395..dacd81b12e62 100644
--- a/net/bridge/netfilter/ebtable_filter.c
+++ b/net/bridge/netfilter/ebtable_filter.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* ebtable_filter
*
@@ -9,13 +10,13 @@
*/
#include <linux/netfilter_bridge/ebtables.h>
+#include <uapi/linux/netfilter_bridge.h>
#include <linux/module.h>
#define FILTER_VALID_HOOKS ((1 << NF_BR_LOCAL_IN) | (1 << NF_BR_FORWARD) | \
- (1 << NF_BR_LOCAL_OUT))
+ (1 << NF_BR_LOCAL_OUT))
-static struct ebt_entries initial_chains[] =
-{
+static struct ebt_entries initial_chains[] = {
{
.name = "INPUT",
.policy = EBT_ACCEPT,
@@ -30,8 +31,7 @@ static struct ebt_entries initial_chains[] =
},
};
-static struct ebt_replace initial_table =
-{
+static struct ebt_replace_kernel initial_table = {
.name = "filter",
.valid_hooks = FILTER_VALID_HOOKS,
.entries_size = 3 * sizeof(struct ebt_entries),
@@ -43,81 +43,77 @@ static struct ebt_replace initial_table =
.entries = (char *)initial_chains,
};
-static int check(const struct ebt_table_info *info, unsigned int valid_hooks)
-{
- if (valid_hooks & ~FILTER_VALID_HOOKS)
- return -EINVAL;
- return 0;
-}
-
-static struct ebt_table frame_filter =
-{
+static const struct ebt_table frame_filter = {
.name = "filter",
.table = &initial_table,
- .valid_hooks = FILTER_VALID_HOOKS,
- .lock = RW_LOCK_UNLOCKED,
- .check = check,
+ .valid_hooks = FILTER_VALID_HOOKS,
.me = THIS_MODULE,
};
-static unsigned int
-ebt_hook (unsigned int hook, struct sk_buff **pskb, const struct net_device *in,
- const struct net_device *out, int (*okfn)(struct sk_buff *))
-{
- return ebt_do_table(hook, pskb, in, out, &frame_filter);
-}
-
-static struct nf_hook_ops ebt_ops_filter[] = {
+static const struct nf_hook_ops ebt_ops_filter[] = {
{
- .hook = ebt_hook,
- .owner = THIS_MODULE,
- .pf = PF_BRIDGE,
+ .hook = ebt_do_table,
+ .pf = NFPROTO_BRIDGE,
.hooknum = NF_BR_LOCAL_IN,
.priority = NF_BR_PRI_FILTER_BRIDGED,
},
{
- .hook = ebt_hook,
- .owner = THIS_MODULE,
- .pf = PF_BRIDGE,
+ .hook = ebt_do_table,
+ .pf = NFPROTO_BRIDGE,
.hooknum = NF_BR_FORWARD,
.priority = NF_BR_PRI_FILTER_BRIDGED,
},
{
- .hook = ebt_hook,
- .owner = THIS_MODULE,
- .pf = PF_BRIDGE,
+ .hook = ebt_do_table,
+ .pf = NFPROTO_BRIDGE,
.hooknum = NF_BR_LOCAL_OUT,
.priority = NF_BR_PRI_FILTER_OTHER,
},
};
+static int frame_filter_table_init(struct net *net)
+{
+ return ebt_register_table(net, &frame_filter, ebt_ops_filter);
+}
+
+static void __net_exit frame_filter_net_pre_exit(struct net *net)
+{
+ ebt_unregister_table_pre_exit(net, "filter");
+}
+
+static void __net_exit frame_filter_net_exit(struct net *net)
+{
+ ebt_unregister_table(net, "filter");
+}
+
+static struct pernet_operations frame_filter_net_ops = {
+ .exit = frame_filter_net_exit,
+ .pre_exit = frame_filter_net_pre_exit,
+};
+
static int __init ebtable_filter_init(void)
{
- int i, j, ret;
+ int ret = ebt_register_template(&frame_filter, frame_filter_table_init);
+
+ if (ret)
+ return ret;
- ret = ebt_register_table(&frame_filter);
- if (ret < 0)
+ ret = register_pernet_subsys(&frame_filter_net_ops);
+ if (ret) {
+ ebt_unregister_template(&frame_filter);
return ret;
- for (i = 0; i < ARRAY_SIZE(ebt_ops_filter); i++)
- if ((ret = nf_register_hook(&ebt_ops_filter[i])) < 0)
- goto cleanup;
- return ret;
-cleanup:
- for (j = 0; j < i; j++)
- nf_unregister_hook(&ebt_ops_filter[j]);
- ebt_unregister_table(&frame_filter);
- return ret;
+ }
+
+ return 0;
}
static void __exit ebtable_filter_fini(void)
{
- int i;
-
- for (i = 0; i < ARRAY_SIZE(ebt_ops_filter); i++)
- nf_unregister_hook(&ebt_ops_filter[i]);
- ebt_unregister_table(&frame_filter);
+ unregister_pernet_subsys(&frame_filter_net_ops);
+ ebt_unregister_template(&frame_filter);
}
module_init(ebtable_filter_init);
module_exit(ebtable_filter_fini);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("ebtables legacy filter table");
diff --git a/net/bridge/netfilter/ebtable_nat.c b/net/bridge/netfilter/ebtable_nat.c
index 04dd42efda1d..0f2a8c6118d4 100644
--- a/net/bridge/netfilter/ebtable_nat.c
+++ b/net/bridge/netfilter/ebtable_nat.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* ebtable_nat
*
@@ -9,13 +10,13 @@
*/
#include <linux/netfilter_bridge/ebtables.h>
+#include <uapi/linux/netfilter_bridge.h>
#include <linux/module.h>
#define NAT_VALID_HOOKS ((1 << NF_BR_PRE_ROUTING) | (1 << NF_BR_LOCAL_OUT) | \
- (1 << NF_BR_POST_ROUTING))
+ (1 << NF_BR_POST_ROUTING))
-static struct ebt_entries initial_chains[] =
-{
+static struct ebt_entries initial_chains[] = {
{
.name = "PREROUTING",
.policy = EBT_ACCEPT,
@@ -30,8 +31,7 @@ static struct ebt_entries initial_chains[] =
}
};
-static struct ebt_replace initial_table =
-{
+static struct ebt_replace_kernel initial_table = {
.name = "nat",
.valid_hooks = NAT_VALID_HOOKS,
.entries_size = 3 * sizeof(struct ebt_entries),
@@ -43,88 +43,77 @@ static struct ebt_replace initial_table =
.entries = (char *)initial_chains,
};
-static int check(const struct ebt_table_info *info, unsigned int valid_hooks)
-{
- if (valid_hooks & ~NAT_VALID_HOOKS)
- return -EINVAL;
- return 0;
-}
-
-static struct ebt_table frame_nat =
-{
+static const struct ebt_table frame_nat = {
.name = "nat",
.table = &initial_table,
.valid_hooks = NAT_VALID_HOOKS,
- .lock = RW_LOCK_UNLOCKED,
- .check = check,
.me = THIS_MODULE,
};
-static unsigned int
-ebt_nat_dst(unsigned int hook, struct sk_buff **pskb, const struct net_device *in
- , const struct net_device *out, int (*okfn)(struct sk_buff *))
-{
- return ebt_do_table(hook, pskb, in, out, &frame_nat);
-}
-
-static unsigned int
-ebt_nat_src(unsigned int hook, struct sk_buff **pskb, const struct net_device *in
- , const struct net_device *out, int (*okfn)(struct sk_buff *))
-{
- return ebt_do_table(hook, pskb, in, out, &frame_nat);
-}
-
-static struct nf_hook_ops ebt_ops_nat[] = {
+static const struct nf_hook_ops ebt_ops_nat[] = {
{
- .hook = ebt_nat_dst,
- .owner = THIS_MODULE,
- .pf = PF_BRIDGE,
+ .hook = ebt_do_table,
+ .pf = NFPROTO_BRIDGE,
.hooknum = NF_BR_LOCAL_OUT,
.priority = NF_BR_PRI_NAT_DST_OTHER,
},
{
- .hook = ebt_nat_src,
- .owner = THIS_MODULE,
- .pf = PF_BRIDGE,
+ .hook = ebt_do_table,
+ .pf = NFPROTO_BRIDGE,
.hooknum = NF_BR_POST_ROUTING,
.priority = NF_BR_PRI_NAT_SRC,
},
{
- .hook = ebt_nat_dst,
- .owner = THIS_MODULE,
- .pf = PF_BRIDGE,
+ .hook = ebt_do_table,
+ .pf = NFPROTO_BRIDGE,
.hooknum = NF_BR_PRE_ROUTING,
.priority = NF_BR_PRI_NAT_DST_BRIDGED,
},
};
+static int frame_nat_table_init(struct net *net)
+{
+ return ebt_register_table(net, &frame_nat, ebt_ops_nat);
+}
+
+static void __net_exit frame_nat_net_pre_exit(struct net *net)
+{
+ ebt_unregister_table_pre_exit(net, "nat");
+}
+
+static void __net_exit frame_nat_net_exit(struct net *net)
+{
+ ebt_unregister_table(net, "nat");
+}
+
+static struct pernet_operations frame_nat_net_ops = {
+ .exit = frame_nat_net_exit,
+ .pre_exit = frame_nat_net_pre_exit,
+};
+
static int __init ebtable_nat_init(void)
{
- int i, ret, j;
+ int ret = ebt_register_template(&frame_nat, frame_nat_table_init);
- ret = ebt_register_table(&frame_nat);
- if (ret < 0)
+ if (ret)
return ret;
- for (i = 0; i < ARRAY_SIZE(ebt_ops_nat); i++)
- if ((ret = nf_register_hook(&ebt_ops_nat[i])) < 0)
- goto cleanup;
- return ret;
-cleanup:
- for (j = 0; j < i; j++)
- nf_unregister_hook(&ebt_ops_nat[j]);
- ebt_unregister_table(&frame_nat);
+
+ ret = register_pernet_subsys(&frame_nat_net_ops);
+ if (ret) {
+ ebt_unregister_template(&frame_nat);
+ return ret;
+ }
+
return ret;
}
static void __exit ebtable_nat_fini(void)
{
- int i;
-
- for (i = 0; i < ARRAY_SIZE(ebt_ops_nat); i++)
- nf_unregister_hook(&ebt_ops_nat[i]);
- ebt_unregister_table(&frame_nat);
+ unregister_pernet_subsys(&frame_nat_net_ops);
+ ebt_unregister_template(&frame_nat);
}
module_init(ebtable_nat_init);
module_exit(ebtable_nat_fini);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("ebtables legacy stateless nat table");
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 9f85666f29f7..5697e3949a36 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* ebtables
*
@@ -6,43 +7,28 @@
*
* ebtables.c,v 2.0, July, 2002
*
- * This code is stongly inspired on the iptables code which is
+ * This code is strongly inspired by the iptables code which is
* Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
-
-/* used for print_string */
-#include <linux/sched.h>
-#include <linux/tty.h>
-
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kmod.h>
#include <linux/module.h>
#include <linux/vmalloc.h>
+#include <linux/netfilter/x_tables.h>
#include <linux/netfilter_bridge/ebtables.h>
#include <linux/spinlock.h>
#include <linux/mutex.h>
-#include <asm/uaccess.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
#include <linux/smp.h>
#include <linux/cpumask.h>
+#include <linux/audit.h>
#include <net/sock.h>
+#include <net/netns/generic.h>
/* needed for logical [in,out]-dev filtering */
#include "../br_private.h"
-#define BUGPRINT(format, args...) printk("kernel msg: ebtables bug: please "\
- "report to author: "format, ## args)
-/* #define BUGPRINT(format, args...) */
-#define MEMPRINT(format, args...) printk("kernel msg: ebtables "\
- ": out of memory: "format, ## args)
-/* #define MEMPRINT(format, args...) */
-
-
-
-/*
- * Each cpu has its own set of counters, so there is no need for write_lock in
+/* Each cpu has its own set of counters, so there is no need for write_lock in
* the softirq
* For reading or updating the counters, the user context needs to
* get a write_lock
@@ -52,110 +38,175 @@
#define SMP_ALIGN(x) (((x) + SMP_CACHE_BYTES-1) & ~(SMP_CACHE_BYTES-1))
#define COUNTER_OFFSET(n) (SMP_ALIGN(n * sizeof(struct ebt_counter)))
#define COUNTER_BASE(c, n, cpu) ((struct ebt_counter *)(((char *)c) + \
- COUNTER_OFFSET(n) * cpu))
+ COUNTER_OFFSET(n) * cpu))
+struct ebt_pernet {
+ struct list_head tables;
+};
+struct ebt_template {
+ struct list_head list;
+ char name[EBT_TABLE_MAXNAMELEN];
+ struct module *owner;
+ /* called when table is needed in the given netns */
+ int (*table_init)(struct net *net);
+};
+static unsigned int ebt_pernet_id __read_mostly;
+static LIST_HEAD(template_tables);
static DEFINE_MUTEX(ebt_mutex);
-static LIST_HEAD(ebt_tables);
-static LIST_HEAD(ebt_targets);
-static LIST_HEAD(ebt_matches);
-static LIST_HEAD(ebt_watchers);
-static struct ebt_target ebt_standard_target =
-{ {NULL, NULL}, EBT_STANDARD_TARGET, NULL, NULL, NULL, NULL};
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+static void ebt_standard_compat_from_user(void *dst, const void *src)
+{
+ int v = *(compat_int_t *)src;
-static inline int ebt_do_watcher (struct ebt_entry_watcher *w,
- const struct sk_buff *skb, unsigned int hooknr, const struct net_device *in,
- const struct net_device *out)
+ if (v >= 0)
+ v += xt_compat_calc_jump(NFPROTO_BRIDGE, v);
+ memcpy(dst, &v, sizeof(v));
+}
+
+static int ebt_standard_compat_to_user(void __user *dst, const void *src)
{
- w->u.watcher->watcher(skb, hooknr, in, out, w->data,
- w->watcher_size);
+ compat_int_t cv = *(int *)src;
+
+ if (cv >= 0)
+ cv -= xt_compat_calc_jump(NFPROTO_BRIDGE, cv);
+ return copy_to_user(dst, &cv, sizeof(cv)) ? -EFAULT : 0;
+}
+#endif
+
+
+static struct xt_target ebt_standard_target = {
+ .name = "standard",
+ .revision = 0,
+ .family = NFPROTO_BRIDGE,
+ .targetsize = sizeof(int),
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+ .compatsize = sizeof(compat_int_t),
+ .compat_from_user = ebt_standard_compat_from_user,
+ .compat_to_user = ebt_standard_compat_to_user,
+#endif
+};
+
+static inline int
+ebt_do_watcher(const struct ebt_entry_watcher *w, struct sk_buff *skb,
+ struct xt_action_param *par)
+{
+ par->target = w->u.watcher;
+ par->targinfo = w->data;
+ w->u.watcher->target(skb, par);
/* watchers don't give a verdict */
return 0;
}
-static inline int ebt_do_match (struct ebt_entry_match *m,
- const struct sk_buff *skb, const struct net_device *in,
- const struct net_device *out)
+static inline int
+ebt_do_match(struct ebt_entry_match *m, const struct sk_buff *skb,
+ struct xt_action_param *par)
{
- return m->u.match->match(skb, in, out, m->data,
- m->match_size);
+ par->match = m->u.match;
+ par->matchinfo = m->data;
+ return !m->u.match->match(skb, par);
}
-static inline int ebt_dev_check(char *entry, const struct net_device *device)
+static inline int
+ebt_dev_check(const char *entry, const struct net_device *device)
{
int i = 0;
- const char *devname = device->name;
+ const char *devname;
if (*entry == '\0')
return 0;
if (!device)
return 1;
+ devname = device->name;
/* 1 is the wildcard token */
while (entry[i] != '\0' && entry[i] != 1 && entry[i] == devname[i])
i++;
- return (devname[i] != entry[i] && entry[i] != 1);
+ return devname[i] != entry[i] && entry[i] != 1;
}
-#define FWINV2(bool,invflg) ((bool) ^ !!(e->invflags & invflg))
/* process standard matches */
-static inline int ebt_basic_match(struct ebt_entry *e, struct ethhdr *h,
- const struct net_device *in, const struct net_device *out)
+static inline int
+ebt_basic_match(const struct ebt_entry *e, const struct sk_buff *skb,
+ const struct net_device *in, const struct net_device *out)
{
- int verdict, i;
+ const struct ethhdr *h = eth_hdr(skb);
+ const struct net_bridge_port *p;
+ __be16 ethproto;
+
+ if (skb_vlan_tag_present(skb))
+ ethproto = htons(ETH_P_8021Q);
+ else
+ ethproto = h->h_proto;
if (e->bitmask & EBT_802_3) {
- if (FWINV2(ntohs(h->h_proto) >= 1536, EBT_IPROTO))
+ if (NF_INVF(e, EBT_IPROTO, eth_proto_is_802_3(ethproto)))
return 1;
} else if (!(e->bitmask & EBT_NOPROTO) &&
- FWINV2(e->ethproto != h->h_proto, EBT_IPROTO))
+ NF_INVF(e, EBT_IPROTO, e->ethproto != ethproto))
return 1;
- if (FWINV2(ebt_dev_check(e->in, in), EBT_IIN))
+ if (NF_INVF(e, EBT_IIN, ebt_dev_check(e->in, in)))
return 1;
- if (FWINV2(ebt_dev_check(e->out, out), EBT_IOUT))
+ if (NF_INVF(e, EBT_IOUT, ebt_dev_check(e->out, out)))
return 1;
- if ((!in || !in->br_port) ? 0 : FWINV2(ebt_dev_check(
- e->logical_in, in->br_port->br->dev), EBT_ILOGICALIN))
+ /* rcu_read_lock()ed by nf_hook_thresh */
+ if (in && (p = br_port_get_rcu(in)) != NULL &&
+ NF_INVF(e, EBT_ILOGICALIN,
+ ebt_dev_check(e->logical_in, p->br->dev)))
return 1;
- if ((!out || !out->br_port) ? 0 : FWINV2(ebt_dev_check(
- e->logical_out, out->br_port->br->dev), EBT_ILOGICALOUT))
+ if (out && (p = br_port_get_rcu(out)) != NULL &&
+ NF_INVF(e, EBT_ILOGICALOUT,
+ ebt_dev_check(e->logical_out, p->br->dev)))
return 1;
if (e->bitmask & EBT_SOURCEMAC) {
- verdict = 0;
- for (i = 0; i < 6; i++)
- verdict |= (h->h_source[i] ^ e->sourcemac[i]) &
- e->sourcemsk[i];
- if (FWINV2(verdict != 0, EBT_ISOURCE) )
+ if (NF_INVF(e, EBT_ISOURCE,
+ !ether_addr_equal_masked(h->h_source, e->sourcemac,
+ e->sourcemsk)))
return 1;
}
if (e->bitmask & EBT_DESTMAC) {
- verdict = 0;
- for (i = 0; i < 6; i++)
- verdict |= (h->h_dest[i] ^ e->destmac[i]) &
- e->destmsk[i];
- if (FWINV2(verdict != 0, EBT_IDEST) )
+ if (NF_INVF(e, EBT_IDEST,
+ !ether_addr_equal_masked(h->h_dest, e->destmac,
+ e->destmsk)))
return 1;
}
return 0;
}
+static inline
+struct ebt_entry *ebt_next_entry(const struct ebt_entry *entry)
+{
+ return (void *)entry + entry->next_offset;
+}
+
+static inline const struct ebt_entry_target *
+ebt_get_target_c(const struct ebt_entry *e)
+{
+ return ebt_get_target((struct ebt_entry *)e);
+}
+
/* Do some firewalling */
-unsigned int ebt_do_table (unsigned int hook, struct sk_buff **pskb,
- const struct net_device *in, const struct net_device *out,
- struct ebt_table *table)
+unsigned int ebt_do_table(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *state)
{
+ struct ebt_table *table = priv;
+ unsigned int hook = state->hook;
int i, nentries;
struct ebt_entry *point;
struct ebt_counter *counter_base, *cb_base;
- struct ebt_entry_target *t;
+ const struct ebt_entry_target *t;
int verdict, sp = 0;
struct ebt_chainstack *cs;
struct ebt_entries *chaininfo;
- char *base;
- struct ebt_table_info *private;
+ const char *base;
+ const struct ebt_table_info *private;
+ struct xt_action_param acpar;
+
+ acpar.state = state;
+ acpar.hotdrop = false;
read_lock_bh(&table->lock);
private = table->private;
@@ -173,29 +224,32 @@ unsigned int ebt_do_table (unsigned int hook, struct sk_buff **pskb,
base = private->entries;
i = 0;
while (i < nentries) {
- if (ebt_basic_match(point, eth_hdr(*pskb), in, out))
+ if (ebt_basic_match(point, skb, state->in, state->out))
goto letscontinue;
- if (EBT_MATCH_ITERATE(point, ebt_do_match, *pskb, in, out) != 0)
+ if (EBT_MATCH_ITERATE(point, ebt_do_match, skb, &acpar) != 0)
goto letscontinue;
+ if (acpar.hotdrop) {
+ read_unlock_bh(&table->lock);
+ return NF_DROP;
+ }
- /* increase counter */
- (*(counter_base + i)).pcnt++;
- (*(counter_base + i)).bcnt+=(**pskb).len;
+ ADD_COUNTER(*(counter_base + i), skb->len, 1);
/* these should only watch: not modify, nor tell us
- what to do with the packet */
- EBT_WATCHER_ITERATE(point, ebt_do_watcher, *pskb, hook, in,
- out);
+ * what to do with the packet
+ */
+ EBT_WATCHER_ITERATE(point, ebt_do_watcher, skb, &acpar);
- t = (struct ebt_entry_target *)
- (((char *)point) + point->target_offset);
+ t = ebt_get_target_c(point);
/* standard target */
if (!t->u.target->target)
verdict = ((struct ebt_standard_target *)t)->verdict;
- else
- verdict = t->u.target->target(pskb, hook,
- in, out, t->data, t->target_size);
+ else {
+ acpar.target = t->u.target;
+ acpar.targinfo = t->data;
+ verdict = t->u.target->target(skb, &acpar);
+ }
if (verdict == EBT_ACCEPT) {
read_unlock_bh(&table->lock);
return NF_ACCEPT;
@@ -206,13 +260,11 @@ unsigned int ebt_do_table (unsigned int hook, struct sk_buff **pskb,
}
if (verdict == EBT_RETURN) {
letsreturn:
-#ifdef CONFIG_NETFILTER_DEBUG
- if (sp == 0) {
- BUGPRINT("RETURN on base chain");
+ if (WARN(sp == 0, "RETURN on base chain")) {
/* act like this is EBT_CONTINUE */
goto letscontinue;
}
-#endif
+
sp--;
/* put all the local variables right */
i = cs[sp].n;
@@ -225,35 +277,31 @@ letsreturn:
}
if (verdict == EBT_CONTINUE)
goto letscontinue;
-#ifdef CONFIG_NETFILTER_DEBUG
- if (verdict < 0) {
- BUGPRINT("bogus standard verdict\n");
+
+ if (WARN(verdict < 0, "bogus standard verdict\n")) {
read_unlock_bh(&table->lock);
return NF_DROP;
}
-#endif
+
/* jump to a udc */
cs[sp].n = i + 1;
cs[sp].chaininfo = chaininfo;
- cs[sp].e = (struct ebt_entry *)
- (((char *)point) + point->next_offset);
+ cs[sp].e = ebt_next_entry(point);
i = 0;
chaininfo = (struct ebt_entries *) (base + verdict);
-#ifdef CONFIG_NETFILTER_DEBUG
- if (chaininfo->distinguisher) {
- BUGPRINT("jump to non-chain\n");
+
+ if (WARN(chaininfo->distinguisher, "jump to non-chain\n")) {
read_unlock_bh(&table->lock);
return NF_DROP;
}
-#endif
+
nentries = chaininfo->nentries;
point = (struct ebt_entry *)chaininfo->data;
counter_base = cb_base + chaininfo->counter_offset;
sp++;
continue;
letscontinue:
- point = (struct ebt_entry *)
- (((char *)point) + point->next_offset);
+ point = ebt_next_entry(point);
i++;
}
@@ -270,186 +318,250 @@ letscontinue:
/* If it succeeds, returns element and locks mutex */
static inline void *
-find_inlist_lock_noload(struct list_head *head, const char *name, int *error,
- struct mutex *mutex)
+find_inlist_lock_noload(struct net *net, const char *name, int *error,
+ struct mutex *mutex)
{
- struct {
- struct list_head list;
- char name[EBT_FUNCTION_MAXNAMELEN];
- } *e;
+ struct ebt_pernet *ebt_net = net_generic(net, ebt_pernet_id);
+ struct ebt_template *tmpl;
+ struct ebt_table *table;
+
+ mutex_lock(mutex);
+ list_for_each_entry(table, &ebt_net->tables, list) {
+ if (strcmp(table->name, name) == 0)
+ return table;
+ }
+
+ list_for_each_entry(tmpl, &template_tables, list) {
+ if (strcmp(name, tmpl->name) == 0) {
+ struct module *owner = tmpl->owner;
+
+ if (!try_module_get(owner))
+ goto out;
+
+ mutex_unlock(mutex);
+
+ *error = tmpl->table_init(net);
+ if (*error) {
+ module_put(owner);
+ return NULL;
+ }
- *error = mutex_lock_interruptible(mutex);
- if (*error != 0)
- return NULL;
+ mutex_lock(mutex);
+ module_put(owner);
+ break;
+ }
+ }
- list_for_each_entry(e, head, list) {
- if (strcmp(e->name, name) == 0)
- return e;
+ list_for_each_entry(table, &ebt_net->tables, list) {
+ if (strcmp(table->name, name) == 0)
+ return table;
}
+
+out:
*error = -ENOENT;
mutex_unlock(mutex);
return NULL;
}
-#ifndef CONFIG_KMOD
-#define find_inlist_lock(h,n,p,e,m) find_inlist_lock_noload((h),(n),(e),(m))
-#else
static void *
-find_inlist_lock(struct list_head *head, const char *name, const char *prefix,
- int *error, struct mutex *mutex)
+find_inlist_lock(struct net *net, const char *name, const char *prefix,
+ int *error, struct mutex *mutex)
{
- void *ret;
-
- ret = find_inlist_lock_noload(head, name, error, mutex);
- if (!ret) {
- request_module("%s%s", prefix, name);
- ret = find_inlist_lock_noload(head, name, error, mutex);
- }
- return ret;
+ return try_then_request_module(
+ find_inlist_lock_noload(net, name, error, mutex),
+ "%s%s", prefix, name);
}
-#endif
static inline struct ebt_table *
-find_table_lock(const char *name, int *error, struct mutex *mutex)
+find_table_lock(struct net *net, const char *name, int *error,
+ struct mutex *mutex)
{
- return find_inlist_lock(&ebt_tables, name, "ebtable_", error, mutex);
+ return find_inlist_lock(net, name, "ebtable_", error, mutex);
}
-static inline struct ebt_match *
-find_match_lock(const char *name, int *error, struct mutex *mutex)
+static inline void ebt_free_table_info(struct ebt_table_info *info)
{
- return find_inlist_lock(&ebt_matches, name, "ebt_", error, mutex);
-}
-
-static inline struct ebt_watcher *
-find_watcher_lock(const char *name, int *error, struct mutex *mutex)
-{
- return find_inlist_lock(&ebt_watchers, name, "ebt_", error, mutex);
-}
+ int i;
-static inline struct ebt_target *
-find_target_lock(const char *name, int *error, struct mutex *mutex)
-{
- return find_inlist_lock(&ebt_targets, name, "ebt_", error, mutex);
+ if (info->chainstack) {
+ for_each_possible_cpu(i)
+ vfree(info->chainstack[i]);
+ vfree(info->chainstack);
+ }
}
-
static inline int
-ebt_check_match(struct ebt_entry_match *m, struct ebt_entry *e,
- const char *name, unsigned int hookmask, unsigned int *cnt)
+ebt_check_match(struct ebt_entry_match *m, struct xt_mtchk_param *par,
+ unsigned int *cnt)
{
- struct ebt_match *match;
+ const struct ebt_entry *e = par->entryinfo;
+ struct xt_match *match;
+ size_t left = ((char *)e + e->watchers_offset) - (char *)m;
int ret;
- if (((char *)m) + m->match_size + sizeof(struct ebt_entry_match) >
- ((char *)e) + e->watchers_offset)
+ if (left < sizeof(struct ebt_entry_match) ||
+ left - sizeof(struct ebt_entry_match) < m->match_size)
return -EINVAL;
- match = find_match_lock(m->u.name, &ret, &ebt_mutex);
- if (!match)
- return ret;
- m->u.match = match;
- if (!try_module_get(match->me)) {
- mutex_unlock(&ebt_mutex);
- return -ENOENT;
+
+ match = xt_find_match(NFPROTO_BRIDGE, m->u.name, m->u.revision);
+ if (IS_ERR(match) || match->family != NFPROTO_BRIDGE) {
+ if (!IS_ERR(match))
+ module_put(match->me);
+ request_module("ebt_%s", m->u.name);
+ match = xt_find_match(NFPROTO_BRIDGE, m->u.name, m->u.revision);
}
- mutex_unlock(&ebt_mutex);
- if (match->check &&
- match->check(name, hookmask, e, m->data, m->match_size) != 0) {
- BUGPRINT("match->check failed\n");
+ if (IS_ERR(match))
+ return PTR_ERR(match);
+ m->u.match = match;
+
+ par->match = match;
+ par->matchinfo = m->data;
+ ret = xt_check_match(par, m->match_size,
+ ntohs(e->ethproto), e->invflags & EBT_IPROTO);
+ if (ret < 0) {
module_put(match->me);
- return -EINVAL;
+ return ret;
}
+
(*cnt)++;
return 0;
}
static inline int
-ebt_check_watcher(struct ebt_entry_watcher *w, struct ebt_entry *e,
- const char *name, unsigned int hookmask, unsigned int *cnt)
+ebt_check_watcher(struct ebt_entry_watcher *w, struct xt_tgchk_param *par,
+ unsigned int *cnt)
{
- struct ebt_watcher *watcher;
+ const struct ebt_entry *e = par->entryinfo;
+ struct xt_target *watcher;
+ size_t left = ((char *)e + e->target_offset) - (char *)w;
int ret;
- if (((char *)w) + w->watcher_size + sizeof(struct ebt_entry_watcher) >
- ((char *)e) + e->target_offset)
+ if (left < sizeof(struct ebt_entry_watcher) ||
+ left - sizeof(struct ebt_entry_watcher) < w->watcher_size)
return -EINVAL;
- watcher = find_watcher_lock(w->u.name, &ret, &ebt_mutex);
- if (!watcher)
- return ret;
- w->u.watcher = watcher;
- if (!try_module_get(watcher->me)) {
- mutex_unlock(&ebt_mutex);
+
+ watcher = xt_request_find_target(NFPROTO_BRIDGE, w->u.name, 0);
+ if (IS_ERR(watcher))
+ return PTR_ERR(watcher);
+
+ if (watcher->family != NFPROTO_BRIDGE) {
+ module_put(watcher->me);
return -ENOENT;
}
- mutex_unlock(&ebt_mutex);
- if (watcher->check &&
- watcher->check(name, hookmask, e, w->data, w->watcher_size) != 0) {
- BUGPRINT("watcher->check failed\n");
+
+ w->u.watcher = watcher;
+
+ par->target = watcher;
+ par->targinfo = w->data;
+ ret = xt_check_target(par, w->watcher_size,
+ ntohs(e->ethproto), e->invflags & EBT_IPROTO);
+ if (ret < 0) {
module_put(watcher->me);
- return -EINVAL;
+ return ret;
}
+
(*cnt)++;
return 0;
}
-/*
- * this one is very careful, as it is the first function
+static int ebt_verify_pointers(const struct ebt_replace *repl,
+ struct ebt_table_info *newinfo)
+{
+ unsigned int limit = repl->entries_size;
+ unsigned int valid_hooks = repl->valid_hooks;
+ unsigned int offset = 0;
+ int i;
+
+ for (i = 0; i < NF_BR_NUMHOOKS; i++)
+ newinfo->hook_entry[i] = NULL;
+
+ newinfo->entries_size = repl->entries_size;
+ newinfo->nentries = repl->nentries;
+
+ while (offset < limit) {
+ size_t left = limit - offset;
+ struct ebt_entry *e = (void *)newinfo->entries + offset;
+
+ if (left < sizeof(unsigned int))
+ break;
+
+ for (i = 0; i < NF_BR_NUMHOOKS; i++) {
+ if ((valid_hooks & (1 << i)) == 0)
+ continue;
+ if ((char __user *)repl->hook_entry[i] ==
+ repl->entries + offset)
+ break;
+ }
+
+ if (i != NF_BR_NUMHOOKS || !(e->bitmask & EBT_ENTRY_OR_ENTRIES)) {
+ if (e->bitmask != 0) {
+ /* we make userspace set this right,
+ * so there is no misunderstanding
+ */
+ return -EINVAL;
+ }
+ if (i != NF_BR_NUMHOOKS)
+ newinfo->hook_entry[i] = (struct ebt_entries *)e;
+ if (left < sizeof(struct ebt_entries))
+ break;
+ offset += sizeof(struct ebt_entries);
+ } else {
+ if (left < sizeof(struct ebt_entry))
+ break;
+ if (left < e->next_offset)
+ break;
+ if (e->next_offset < sizeof(struct ebt_entry))
+ return -EINVAL;
+ offset += e->next_offset;
+ }
+ }
+ if (offset != limit)
+ return -EINVAL;
+
+ /* check if all valid hooks have a chain */
+ for (i = 0; i < NF_BR_NUMHOOKS; i++) {
+ if (!newinfo->hook_entry[i] &&
+ (valid_hooks & (1 << i)))
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/* this one is very careful, as it is the first function
* to parse the userspace data
*/
static inline int
-ebt_check_entry_size_and_hooks(struct ebt_entry *e,
- struct ebt_table_info *newinfo, char *base, char *limit,
- struct ebt_entries **hook_entries, unsigned int *n, unsigned int *cnt,
- unsigned int *totalcnt, unsigned int *udc_cnt, unsigned int valid_hooks)
+ebt_check_entry_size_and_hooks(const struct ebt_entry *e,
+ const struct ebt_table_info *newinfo,
+ unsigned int *n, unsigned int *cnt,
+ unsigned int *totalcnt, unsigned int *udc_cnt)
{
int i;
for (i = 0; i < NF_BR_NUMHOOKS; i++) {
- if ((valid_hooks & (1 << i)) == 0)
- continue;
- if ( (char *)hook_entries[i] - base ==
- (char *)e - newinfo->entries)
+ if ((void *)e == (void *)newinfo->hook_entry[i])
break;
}
/* beginning of a new chain
- if i == NF_BR_NUMHOOKS it must be a user defined chain */
- if (i != NF_BR_NUMHOOKS || !(e->bitmask & EBT_ENTRY_OR_ENTRIES)) {
- if ((e->bitmask & EBT_ENTRY_OR_ENTRIES) != 0) {
- /* we make userspace set this right,
- so there is no misunderstanding */
- BUGPRINT("EBT_ENTRY_OR_ENTRIES shouldn't be set "
- "in distinguisher\n");
- return -EINVAL;
- }
+ * if i == NF_BR_NUMHOOKS it must be a user defined chain
+ */
+ if (i != NF_BR_NUMHOOKS || !e->bitmask) {
/* this checks if the previous chain has as many entries
- as it said it has */
- if (*n != *cnt) {
- BUGPRINT("nentries does not equal the nr of entries "
- "in the chain\n");
- return -EINVAL;
- }
- /* before we look at the struct, be sure it is not too big */
- if ((char *)hook_entries[i] + sizeof(struct ebt_entries)
- > limit) {
- BUGPRINT("entries_size too small\n");
+ * as it said it has
+ */
+ if (*n != *cnt)
return -EINVAL;
- }
+
if (((struct ebt_entries *)e)->policy != EBT_DROP &&
((struct ebt_entries *)e)->policy != EBT_ACCEPT) {
/* only RETURN from udc */
if (i != NF_BR_NUMHOOKS ||
- ((struct ebt_entries *)e)->policy != EBT_RETURN) {
- BUGPRINT("bad policy\n");
+ ((struct ebt_entries *)e)->policy != EBT_RETURN)
return -EINVAL;
- }
}
if (i == NF_BR_NUMHOOKS) /* it's a user defined chain */
(*udc_cnt)++;
- else
- newinfo->hook_entry[i] = (struct ebt_entries *)e;
- if (((struct ebt_entries *)e)->counter_offset != *totalcnt) {
- BUGPRINT("counter_offset != totalcnt");
+ if (((struct ebt_entries *)e)->counter_offset != *totalcnt)
return -EINVAL;
- }
*n = ((struct ebt_entries *)e)->nentries;
*cnt = 0;
return 0;
@@ -457,45 +569,37 @@ ebt_check_entry_size_and_hooks(struct ebt_entry *e,
/* a plain old entry, heh */
if (sizeof(struct ebt_entry) > e->watchers_offset ||
e->watchers_offset > e->target_offset ||
- e->target_offset >= e->next_offset) {
- BUGPRINT("entry offsets not in right order\n");
+ e->target_offset >= e->next_offset)
return -EINVAL;
- }
+
/* this is not checked anywhere else */
- if (e->next_offset - e->target_offset < sizeof(struct ebt_entry_target)) {
- BUGPRINT("target size too small\n");
+ if (e->next_offset - e->target_offset < sizeof(struct ebt_entry_target))
return -EINVAL;
- }
(*cnt)++;
(*totalcnt)++;
return 0;
}
-struct ebt_cl_stack
-{
+struct ebt_cl_stack {
struct ebt_chainstack cs;
int from;
unsigned int hookmask;
};
-/*
- * we need these positions to check that the jumps to a different part of the
+/* We need these positions to check that the jumps to a different part of the
* entries is a jump to the beginning of a new chain.
*/
static inline int
ebt_get_udc_positions(struct ebt_entry *e, struct ebt_table_info *newinfo,
- struct ebt_entries **hook_entries, unsigned int *n, unsigned int valid_hooks,
- struct ebt_cl_stack *udc)
+ unsigned int *n, struct ebt_cl_stack *udc)
{
int i;
/* we're only interested in chain starts */
- if (e->bitmask & EBT_ENTRY_OR_ENTRIES)
+ if (e->bitmask)
return 0;
for (i = 0; i < NF_BR_NUMHOOKS; i++) {
- if ((valid_hooks & (1 << i)) == 0)
- continue;
if (newinfo->hook_entry[i] == (struct ebt_entries *)e)
break;
}
@@ -513,78 +617,96 @@ ebt_get_udc_positions(struct ebt_entry *e, struct ebt_table_info *newinfo,
}
static inline int
-ebt_cleanup_match(struct ebt_entry_match *m, unsigned int *i)
+ebt_cleanup_match(struct ebt_entry_match *m, struct net *net, unsigned int *i)
{
+ struct xt_mtdtor_param par;
+
if (i && (*i)-- == 0)
return 1;
- if (m->u.match->destroy)
- m->u.match->destroy(m->data, m->match_size);
- module_put(m->u.match->me);
+ par.net = net;
+ par.match = m->u.match;
+ par.matchinfo = m->data;
+ par.family = NFPROTO_BRIDGE;
+ if (par.match->destroy != NULL)
+ par.match->destroy(&par);
+ module_put(par.match->me);
return 0;
}
static inline int
-ebt_cleanup_watcher(struct ebt_entry_watcher *w, unsigned int *i)
+ebt_cleanup_watcher(struct ebt_entry_watcher *w, struct net *net, unsigned int *i)
{
+ struct xt_tgdtor_param par;
+
if (i && (*i)-- == 0)
return 1;
- if (w->u.watcher->destroy)
- w->u.watcher->destroy(w->data, w->watcher_size);
- module_put(w->u.watcher->me);
+ par.net = net;
+ par.target = w->u.watcher;
+ par.targinfo = w->data;
+ par.family = NFPROTO_BRIDGE;
+ if (par.target->destroy != NULL)
+ par.target->destroy(&par);
+ module_put(par.target->me);
return 0;
}
static inline int
-ebt_cleanup_entry(struct ebt_entry *e, unsigned int *cnt)
+ebt_cleanup_entry(struct ebt_entry *e, struct net *net, unsigned int *cnt)
{
+ struct xt_tgdtor_param par;
struct ebt_entry_target *t;
- if ((e->bitmask & EBT_ENTRY_OR_ENTRIES) == 0)
+ if (e->bitmask == 0)
return 0;
/* we're done */
if (cnt && (*cnt)-- == 0)
return 1;
- EBT_WATCHER_ITERATE(e, ebt_cleanup_watcher, NULL);
- EBT_MATCH_ITERATE(e, ebt_cleanup_match, NULL);
- t = (struct ebt_entry_target *)(((char *)e) + e->target_offset);
- if (t->u.target->destroy)
- t->u.target->destroy(t->data, t->target_size);
- module_put(t->u.target->me);
-
+ EBT_WATCHER_ITERATE(e, ebt_cleanup_watcher, net, NULL);
+ EBT_MATCH_ITERATE(e, ebt_cleanup_match, net, NULL);
+ t = ebt_get_target(e);
+
+ par.net = net;
+ par.target = t->u.target;
+ par.targinfo = t->data;
+ par.family = NFPROTO_BRIDGE;
+ if (par.target->destroy != NULL)
+ par.target->destroy(&par);
+ module_put(par.target->me);
return 0;
}
static inline int
-ebt_check_entry(struct ebt_entry *e, struct ebt_table_info *newinfo,
- const char *name, unsigned int *cnt, unsigned int valid_hooks,
- struct ebt_cl_stack *cl_s, unsigned int udc_cnt)
+ebt_check_entry(struct ebt_entry *e, struct net *net,
+ const struct ebt_table_info *newinfo,
+ const char *name, unsigned int *cnt,
+ struct ebt_cl_stack *cl_s, unsigned int udc_cnt)
{
struct ebt_entry_target *t;
- struct ebt_target *target;
+ struct xt_target *target;
unsigned int i, j, hook = 0, hookmask = 0;
+ size_t gap;
int ret;
+ struct xt_mtchk_param mtpar;
+ struct xt_tgchk_param tgpar;
/* don't mess with the struct ebt_entries */
- if ((e->bitmask & EBT_ENTRY_OR_ENTRIES) == 0)
+ if (e->bitmask == 0)
return 0;
- if (e->bitmask & ~EBT_F_MASK) {
- BUGPRINT("Unknown flag for bitmask\n");
+ if (e->bitmask & ~EBT_F_MASK)
return -EINVAL;
- }
- if (e->invflags & ~EBT_INV_MASK) {
- BUGPRINT("Unknown flag for inv bitmask\n");
+
+ if (e->invflags & ~EBT_INV_MASK)
return -EINVAL;
- }
- if ( (e->bitmask & EBT_NOPROTO) && (e->bitmask & EBT_802_3) ) {
- BUGPRINT("NOPROTO & 802_3 not allowed\n");
+
+ if ((e->bitmask & EBT_NOPROTO) && (e->bitmask & EBT_802_3))
return -EINVAL;
- }
+
/* what hook do we belong to? */
for (i = 0; i < NF_BR_NUMHOOKS; i++) {
- if ((valid_hooks & (1 << i)) == 0)
+ if (!newinfo->hook_entry[i])
continue;
if ((char *)newinfo->hook_entry[i] < (char *)e)
hook = i;
@@ -592,7 +714,8 @@ ebt_check_entry(struct ebt_entry *e, struct ebt_table_info *newinfo,
break;
}
/* (1 << NF_BR_NUMHOOKS) tells the check functions the rule is on
- a base chain */
+ * a base chain
+ */
if (i < NF_BR_NUMHOOKS)
hookmask = (1 << hook) | (1 << NF_BR_NUMHOOKS);
else {
@@ -605,66 +728,81 @@ ebt_check_entry(struct ebt_entry *e, struct ebt_table_info *newinfo,
hookmask = cl_s[i - 1].hookmask;
}
i = 0;
- ret = EBT_MATCH_ITERATE(e, ebt_check_match, e, name, hookmask, &i);
+
+ memset(&mtpar, 0, sizeof(mtpar));
+ memset(&tgpar, 0, sizeof(tgpar));
+ mtpar.net = tgpar.net = net;
+ mtpar.table = tgpar.table = name;
+ mtpar.entryinfo = tgpar.entryinfo = e;
+ mtpar.hook_mask = tgpar.hook_mask = hookmask;
+ mtpar.family = tgpar.family = NFPROTO_BRIDGE;
+ ret = EBT_MATCH_ITERATE(e, ebt_check_match, &mtpar, &i);
if (ret != 0)
goto cleanup_matches;
j = 0;
- ret = EBT_WATCHER_ITERATE(e, ebt_check_watcher, e, name, hookmask, &j);
+ ret = EBT_WATCHER_ITERATE(e, ebt_check_watcher, &tgpar, &j);
if (ret != 0)
goto cleanup_watchers;
- t = (struct ebt_entry_target *)(((char *)e) + e->target_offset);
- target = find_target_lock(t->u.name, &ret, &ebt_mutex);
- if (!target)
+ t = ebt_get_target(e);
+ gap = e->next_offset - e->target_offset;
+
+ target = xt_request_find_target(NFPROTO_BRIDGE, t->u.name, 0);
+ if (IS_ERR(target)) {
+ ret = PTR_ERR(target);
goto cleanup_watchers;
- if (!try_module_get(target->me)) {
- mutex_unlock(&ebt_mutex);
+ }
+
+ /* Reject UNSPEC, xtables verdicts/return values are incompatible */
+ if (target->family != NFPROTO_BRIDGE) {
+ module_put(target->me);
ret = -ENOENT;
goto cleanup_watchers;
}
- mutex_unlock(&ebt_mutex);
t->u.target = target;
if (t->u.target == &ebt_standard_target) {
- if (e->target_offset + sizeof(struct ebt_standard_target) >
- e->next_offset) {
- BUGPRINT("Standard target size too big\n");
+ if (gap < sizeof(struct ebt_standard_target)) {
ret = -EFAULT;
goto cleanup_watchers;
}
if (((struct ebt_standard_target *)t)->verdict <
-NUM_STANDARD_TARGETS) {
- BUGPRINT("Invalid standard target\n");
ret = -EFAULT;
goto cleanup_watchers;
}
- } else if ((e->target_offset + t->target_size +
- sizeof(struct ebt_entry_target) > e->next_offset) ||
- (t->u.target->check &&
- t->u.target->check(name, hookmask, e, t->data, t->target_size) != 0)){
+ } else if (t->target_size > gap - sizeof(struct ebt_entry_target)) {
module_put(t->u.target->me);
ret = -EFAULT;
goto cleanup_watchers;
}
+
+ tgpar.target = target;
+ tgpar.targinfo = t->data;
+ ret = xt_check_target(&tgpar, t->target_size,
+ ntohs(e->ethproto), e->invflags & EBT_IPROTO);
+ if (ret < 0) {
+ module_put(target->me);
+ goto cleanup_watchers;
+ }
(*cnt)++;
return 0;
cleanup_watchers:
- EBT_WATCHER_ITERATE(e, ebt_cleanup_watcher, &j);
+ EBT_WATCHER_ITERATE(e, ebt_cleanup_watcher, net, &j);
cleanup_matches:
- EBT_MATCH_ITERATE(e, ebt_cleanup_match, &i);
+ EBT_MATCH_ITERATE(e, ebt_cleanup_match, net, &i);
return ret;
}
-/*
- * checks for loops and sets the hook mask for udc
+/* checks for loops and sets the hook mask for udc
* the hook mask for udc tells us from which base chains the udc can be
* accessed. This mask is a parameter to the check() functions of the extensions
*/
-static int check_chainloops(struct ebt_entries *chain, struct ebt_cl_stack *cl_s,
- unsigned int udc_cnt, unsigned int hooknr, char *base)
+static int check_chainloops(const struct ebt_entries *chain, struct ebt_cl_stack *cl_s,
+ unsigned int udc_cnt, unsigned int hooknr, char *base)
{
int i, chain_nr = -1, pos = 0, nentries = chain->nentries, verdict;
- struct ebt_entry *e = (struct ebt_entry *)chain->data;
- struct ebt_entry_target *t;
+ const struct ebt_entry *e = (struct ebt_entry *)chain->data;
+ const struct ebt_entry_target *t;
while (pos < nentries || chain_nr != -1) {
/* end of udc, go back one 'recursion' step */
@@ -683,15 +821,13 @@ static int check_chainloops(struct ebt_entries *chain, struct ebt_cl_stack *cl_s
if (pos == nentries)
continue;
}
- t = (struct ebt_entry_target *)
- (((char *)e) + e->target_offset);
+ t = ebt_get_target_c(e);
if (strcmp(t->u.name, EBT_STANDARD_TARGET))
goto letscontinue;
if (e->target_offset + sizeof(struct ebt_standard_target) >
- e->next_offset) {
- BUGPRINT("Standard target size too big\n");
+ e->next_offset)
return -1;
- }
+
verdict = ((struct ebt_standard_target *)t)->verdict;
if (verdict >= 0) { /* jump to another chain */
struct ebt_entries *hlp2 =
@@ -700,18 +836,18 @@ static int check_chainloops(struct ebt_entries *chain, struct ebt_cl_stack *cl_s
if (hlp2 == cl_s[i].cs.chaininfo)
break;
/* bad destination or loop */
- if (i == udc_cnt) {
- BUGPRINT("bad destination\n");
+ if (i == udc_cnt)
return -1;
- }
- if (cl_s[i].cs.n) {
- BUGPRINT("loop\n");
+
+ if (cl_s[i].cs.n)
return -1;
- }
- /* this can't be 0, so the above test is correct */
+
+ if (cl_s[i].hookmask & (1 << hooknr))
+ goto letscontinue;
+ /* this can't be 0, so the loop test is correct */
cl_s[i].cs.n = pos + 1;
pos = 0;
- cl_s[i].cs.e = ((void *)e + e->next_offset);
+ cl_s[i].cs.e = ebt_next_entry(e);
e = (struct ebt_entry *)(hlp2->data);
nentries = hlp2->nentries;
cl_s[i].from = chain_nr;
@@ -721,95 +857,78 @@ static int check_chainloops(struct ebt_entries *chain, struct ebt_cl_stack *cl_s
continue;
}
letscontinue:
- e = (void *)e + e->next_offset;
+ e = ebt_next_entry(e);
pos++;
}
return 0;
}
/* do the parsing of the table/chains/entries/matches/watchers/targets, heh */
-static int translate_table(struct ebt_replace *repl,
- struct ebt_table_info *newinfo)
+static int translate_table(struct net *net, const char *name,
+ struct ebt_table_info *newinfo)
{
unsigned int i, j, k, udc_cnt;
int ret;
struct ebt_cl_stack *cl_s = NULL; /* used in the checking for chain loops */
i = 0;
- while (i < NF_BR_NUMHOOKS && !(repl->valid_hooks & (1 << i)))
+ while (i < NF_BR_NUMHOOKS && !newinfo->hook_entry[i])
i++;
- if (i == NF_BR_NUMHOOKS) {
- BUGPRINT("No valid hooks specified\n");
+ if (i == NF_BR_NUMHOOKS)
return -EINVAL;
- }
- if (repl->hook_entry[i] != (struct ebt_entries *)repl->entries) {
- BUGPRINT("Chains don't start at beginning\n");
+
+ if (newinfo->hook_entry[i] != (struct ebt_entries *)newinfo->entries)
return -EINVAL;
- }
+
/* make sure chains are ordered after each other in same order
- as their corresponding hooks */
+ * as their corresponding hooks
+ */
for (j = i + 1; j < NF_BR_NUMHOOKS; j++) {
- if (!(repl->valid_hooks & (1 << j)))
+ if (!newinfo->hook_entry[j])
continue;
- if ( repl->hook_entry[j] <= repl->hook_entry[i] ) {
- BUGPRINT("Hook order must be followed\n");
+ if (newinfo->hook_entry[j] <= newinfo->hook_entry[i])
return -EINVAL;
- }
+
i = j;
}
- for (i = 0; i < NF_BR_NUMHOOKS; i++)
- newinfo->hook_entry[i] = NULL;
-
- newinfo->entries_size = repl->entries_size;
- newinfo->nentries = repl->nentries;
-
/* do some early checkings and initialize some things */
i = 0; /* holds the expected nr. of entries for the chain */
j = 0; /* holds the up to now counted entries for the chain */
k = 0; /* holds the total nr. of entries, should equal
- newinfo->nentries afterwards */
+ * newinfo->nentries afterwards
+ */
udc_cnt = 0; /* will hold the nr. of user defined chains (udc) */
ret = EBT_ENTRY_ITERATE(newinfo->entries, newinfo->entries_size,
- ebt_check_entry_size_and_hooks, newinfo, repl->entries,
- repl->entries + repl->entries_size, repl->hook_entry, &i, &j, &k,
- &udc_cnt, repl->valid_hooks);
+ ebt_check_entry_size_and_hooks, newinfo,
+ &i, &j, &k, &udc_cnt);
if (ret != 0)
return ret;
- if (i != j) {
- BUGPRINT("nentries does not equal the nr of entries in the "
- "(last) chain\n");
- return -EINVAL;
- }
- if (k != newinfo->nentries) {
- BUGPRINT("Total nentries is wrong\n");
+ if (i != j)
return -EINVAL;
- }
- /* check if all valid hooks have a chain */
- for (i = 0; i < NF_BR_NUMHOOKS; i++) {
- if (newinfo->hook_entry[i] == NULL &&
- (repl->valid_hooks & (1 << i))) {
- BUGPRINT("Valid hook without chain\n");
- return -EINVAL;
- }
- }
+ if (k != newinfo->nentries)
+ return -EINVAL;
/* get the location of the udc, put them in an array
- while we're at it, allocate the chainstack */
+ * while we're at it, allocate the chainstack
+ */
if (udc_cnt) {
/* this will get free'd in do_replace()/ebt_register_table()
- if an error occurs */
+ * if an error occurs
+ */
newinfo->chainstack =
- vmalloc((highest_possible_processor_id()+1)
- * sizeof(*(newinfo->chainstack)));
+ vmalloc_array(nr_cpu_ids,
+ sizeof(*(newinfo->chainstack)));
if (!newinfo->chainstack)
return -ENOMEM;
for_each_possible_cpu(i) {
newinfo->chainstack[i] =
- vmalloc(udc_cnt * sizeof(*(newinfo->chainstack[0])));
+ vmalloc_node(array_size(udc_cnt,
+ sizeof(*(newinfo->chainstack[0]))),
+ cpu_to_node(i));
if (!newinfo->chainstack[i]) {
while (i)
vfree(newinfo->chainstack[--i]);
@@ -819,16 +938,14 @@ static int translate_table(struct ebt_replace *repl,
}
}
- cl_s = vmalloc(udc_cnt * sizeof(*cl_s));
+ cl_s = vmalloc_array(udc_cnt, sizeof(*cl_s));
if (!cl_s)
return -ENOMEM;
i = 0; /* the i'th udc */
EBT_ENTRY_ITERATE(newinfo->entries, newinfo->entries_size,
- ebt_get_udc_positions, newinfo, repl->hook_entry, &i,
- repl->valid_hooks, cl_s);
+ ebt_get_udc_positions, newinfo, &i, cl_s);
/* sanity check */
if (i != udc_cnt) {
- BUGPRINT("i != udc_cnt\n");
vfree(cl_s);
return -EFAULT;
}
@@ -836,39 +953,39 @@ static int translate_table(struct ebt_replace *repl,
/* Check for loops */
for (i = 0; i < NF_BR_NUMHOOKS; i++)
- if (repl->valid_hooks & (1 << i))
+ if (newinfo->hook_entry[i])
if (check_chainloops(newinfo->hook_entry[i],
cl_s, udc_cnt, i, newinfo->entries)) {
vfree(cl_s);
return -EINVAL;
}
- /* we now know the following (along with E=mc²):
- - the nr of entries in each chain is right
- - the size of the allocated space is right
- - all valid hooks have a corresponding chain
- - there are no loops
- - wrong data can still be on the level of a single entry
- - could be there are jumps to places that are not the
- beginning of a chain. This can only occur in chains that
- are not accessible from any base chains, so we don't care. */
+ /* we now know the following (along with E=mc²):
+ * - the nr of entries in each chain is right
+ * - the size of the allocated space is right
+ * - all valid hooks have a corresponding chain
+ * - there are no loops
+ * - wrong data can still be on the level of a single entry
+ * - could be there are jumps to places that are not the
+ * beginning of a chain. This can only occur in chains that
+ * are not accessible from any base chains, so we don't care.
+ */
/* used to know what we need to clean up if something goes wrong */
i = 0;
ret = EBT_ENTRY_ITERATE(newinfo->entries, newinfo->entries_size,
- ebt_check_entry, newinfo, repl->name, &i, repl->valid_hooks,
- cl_s, udc_cnt);
+ ebt_check_entry, net, newinfo, name, &i, cl_s, udc_cnt);
if (ret != 0) {
EBT_ENTRY_ITERATE(newinfo->entries, newinfo->entries_size,
- ebt_cleanup_entry, &i);
+ ebt_cleanup_entry, net, &i);
}
vfree(cl_s);
return ret;
}
/* called under write_lock */
-static void get_counters(struct ebt_counter *oldcounters,
- struct ebt_counter *counters, unsigned int nentries)
+static void get_counters(const struct ebt_counter *oldcounters,
+ struct ebt_counter *counters, unsigned int nentries)
{
int i, cpu;
struct ebt_counter *counter_base;
@@ -882,95 +999,53 @@ static void get_counters(struct ebt_counter *oldcounters,
if (cpu == 0)
continue;
counter_base = COUNTER_BASE(oldcounters, nentries, cpu);
- for (i = 0; i < nentries; i++) {
- counters[i].pcnt += counter_base[i].pcnt;
- counters[i].bcnt += counter_base[i].bcnt;
- }
+ for (i = 0; i < nentries; i++)
+ ADD_COUNTER(counters[i], counter_base[i].bcnt,
+ counter_base[i].pcnt);
}
}
-/* replace the table */
-static int do_replace(void __user *user, unsigned int len)
+static int do_replace_finish(struct net *net, struct ebt_replace *repl,
+ struct ebt_table_info *newinfo)
{
- int ret, i, countersize;
- struct ebt_table_info *newinfo;
- struct ebt_replace tmp;
- struct ebt_table *t;
+ int ret;
struct ebt_counter *counterstmp = NULL;
/* used to be able to unlock earlier */
struct ebt_table_info *table;
-
- if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
- return -EFAULT;
-
- if (len != sizeof(tmp) + tmp.entries_size) {
- BUGPRINT("Wrong len argument\n");
- return -EINVAL;
- }
-
- if (tmp.entries_size == 0) {
- BUGPRINT("Entries_size never zero\n");
- return -EINVAL;
- }
- /* overflow check */
- if (tmp.nentries >= ((INT_MAX - sizeof(struct ebt_table_info)) / NR_CPUS -
- SMP_CACHE_BYTES) / sizeof(struct ebt_counter))
- return -ENOMEM;
- if (tmp.num_counters >= INT_MAX / sizeof(struct ebt_counter))
- return -ENOMEM;
-
- countersize = COUNTER_OFFSET(tmp.nentries) *
- (highest_possible_processor_id()+1);
- newinfo = vmalloc(sizeof(*newinfo) + countersize);
- if (!newinfo)
- return -ENOMEM;
-
- if (countersize)
- memset(newinfo->counters, 0, countersize);
-
- newinfo->entries = vmalloc(tmp.entries_size);
- if (!newinfo->entries) {
- ret = -ENOMEM;
- goto free_newinfo;
- }
- if (copy_from_user(
- newinfo->entries, tmp.entries, tmp.entries_size) != 0) {
- BUGPRINT("Couldn't copy entries from userspace\n");
- ret = -EFAULT;
- goto free_entries;
- }
+ struct ebt_table *t;
/* the user wants counters back
- the check on the size is done later, when we have the lock */
- if (tmp.num_counters) {
- counterstmp = vmalloc(tmp.num_counters * sizeof(*counterstmp));
- if (!counterstmp) {
- ret = -ENOMEM;
- goto free_entries;
- }
+ * the check on the size is done later, when we have the lock
+ */
+ if (repl->num_counters) {
+ counterstmp = vmalloc_array(repl->num_counters,
+ sizeof(*counterstmp));
+ if (!counterstmp)
+ return -ENOMEM;
}
- else
- counterstmp = NULL;
- /* this can get initialized by translate_table() */
newinfo->chainstack = NULL;
- ret = translate_table(&tmp, newinfo);
+ ret = ebt_verify_pointers(repl, newinfo);
+ if (ret != 0)
+ goto free_counterstmp;
+
+ ret = translate_table(net, repl->name, newinfo);
if (ret != 0)
goto free_counterstmp;
- t = find_table_lock(tmp.name, &ret, &ebt_mutex);
+ t = find_table_lock(net, repl->name, &ret, &ebt_mutex);
if (!t) {
ret = -ENOENT;
goto free_iterate;
}
- /* the table doesn't like it */
- if (t->check && (ret = t->check(newinfo, tmp.valid_hooks)))
+ if (repl->valid_hooks != t->valid_hooks) {
+ ret = -EINVAL;
goto free_unlock;
+ }
- if (tmp.num_counters && tmp.num_counters != t->private->nentries) {
- BUGPRINT("Wrong nr. of counters requested\n");
+ if (repl->num_counters && repl->num_counters != t->private->nentries) {
ret = -EINVAL;
goto free_unlock;
}
@@ -985,7 +1060,7 @@ static int do_replace(void __user *user, unsigned int len)
module_put(t->me);
/* we need an atomic snapshot of the counters */
write_lock_bh(&t->lock);
- if (tmp.num_counters)
+ if (repl->num_counters)
get_counters(t->private->counters, counterstmp,
t->private->nentries);
@@ -993,187 +1068,178 @@ static int do_replace(void __user *user, unsigned int len)
write_unlock_bh(&t->lock);
mutex_unlock(&ebt_mutex);
/* so, a user can change the chains while having messed up her counter
- allocation. Only reason why this is done is because this way the lock
- is held only once, while this doesn't bring the kernel into a
- dangerous state. */
- if (tmp.num_counters &&
- copy_to_user(tmp.counters, counterstmp,
- tmp.num_counters * sizeof(struct ebt_counter))) {
- BUGPRINT("Couldn't copy counters to userspace\n");
- ret = -EFAULT;
+ * allocation. Only reason why this is done is because this way the lock
+ * is held only once, while this doesn't bring the kernel into a
+ * dangerous state.
+ */
+ if (repl->num_counters &&
+ copy_to_user(repl->counters, counterstmp,
+ array_size(repl->num_counters, sizeof(struct ebt_counter)))) {
+ /* Silent error, can't fail, new table is already in place */
+ net_warn_ratelimited("ebtables: counters copy to user failed while replacing table\n");
}
- else
- ret = 0;
/* decrease module count and free resources */
EBT_ENTRY_ITERATE(table->entries, table->entries_size,
- ebt_cleanup_entry, NULL);
+ ebt_cleanup_entry, net, NULL);
vfree(table->entries);
- if (table->chainstack) {
- for_each_possible_cpu(i)
- vfree(table->chainstack[i]);
- vfree(table->chainstack);
- }
+ ebt_free_table_info(table);
vfree(table);
-
vfree(counterstmp);
- return ret;
+
+ audit_log_nfcfg(repl->name, AF_BRIDGE, repl->nentries,
+ AUDIT_XT_OP_REPLACE, GFP_KERNEL);
+ return 0;
free_unlock:
mutex_unlock(&ebt_mutex);
free_iterate:
EBT_ENTRY_ITERATE(newinfo->entries, newinfo->entries_size,
- ebt_cleanup_entry, NULL);
+ ebt_cleanup_entry, net, NULL);
free_counterstmp:
vfree(counterstmp);
/* can be initialized in translate_table() */
- if (newinfo->chainstack) {
- for_each_possible_cpu(i)
- vfree(newinfo->chainstack[i]);
- vfree(newinfo->chainstack);
- }
-free_entries:
- vfree(newinfo->entries);
-free_newinfo:
- vfree(newinfo);
+ ebt_free_table_info(newinfo);
return ret;
}
-int ebt_register_target(struct ebt_target *target)
+/* replace the table */
+static int do_replace(struct net *net, sockptr_t arg, unsigned int len)
{
- struct ebt_target *t;
- int ret;
-
- ret = mutex_lock_interruptible(&ebt_mutex);
- if (ret != 0)
- return ret;
- list_for_each_entry(t, &ebt_targets, list) {
- if (strcmp(t->name, target->name) == 0) {
- mutex_unlock(&ebt_mutex);
- return -EEXIST;
- }
- }
- list_add(&target->list, &ebt_targets);
- mutex_unlock(&ebt_mutex);
+ int ret, countersize;
+ struct ebt_table_info *newinfo;
+ struct ebt_replace tmp;
- return 0;
-}
+ if (len < sizeof(tmp))
+ return -EINVAL;
+ if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0)
+ return -EFAULT;
-void ebt_unregister_target(struct ebt_target *target)
-{
- mutex_lock(&ebt_mutex);
- list_del(&target->list);
- mutex_unlock(&ebt_mutex);
-}
+ if (len != sizeof(tmp) + tmp.entries_size)
+ return -EINVAL;
-int ebt_register_match(struct ebt_match *match)
-{
- struct ebt_match *m;
- int ret;
+ if (tmp.entries_size == 0)
+ return -EINVAL;
- ret = mutex_lock_interruptible(&ebt_mutex);
- if (ret != 0)
- return ret;
- list_for_each_entry(m, &ebt_matches, list) {
- if (strcmp(m->name, match->name) == 0) {
- mutex_unlock(&ebt_mutex);
- return -EEXIST;
- }
- }
- list_add(&match->list, &ebt_matches);
- mutex_unlock(&ebt_mutex);
+ /* overflow check */
+ if (tmp.nentries >= ((INT_MAX - sizeof(struct ebt_table_info)) /
+ NR_CPUS - SMP_CACHE_BYTES) / sizeof(struct ebt_counter))
+ return -ENOMEM;
+ if (tmp.num_counters >= INT_MAX / sizeof(struct ebt_counter))
+ return -ENOMEM;
- return 0;
-}
+ tmp.name[sizeof(tmp.name) - 1] = 0;
-void ebt_unregister_match(struct ebt_match *match)
-{
- mutex_lock(&ebt_mutex);
- list_del(&match->list);
- mutex_unlock(&ebt_mutex);
-}
+ countersize = COUNTER_OFFSET(tmp.nentries) * nr_cpu_ids;
+ newinfo = __vmalloc(sizeof(*newinfo) + countersize, GFP_KERNEL_ACCOUNT);
+ if (!newinfo)
+ return -ENOMEM;
-int ebt_register_watcher(struct ebt_watcher *watcher)
-{
- struct ebt_watcher *w;
- int ret;
+ if (countersize)
+ memset(newinfo->counters, 0, countersize);
- ret = mutex_lock_interruptible(&ebt_mutex);
- if (ret != 0)
- return ret;
- list_for_each_entry(w, &ebt_watchers, list) {
- if (strcmp(w->name, watcher->name) == 0) {
- mutex_unlock(&ebt_mutex);
- return -EEXIST;
- }
+ newinfo->entries = __vmalloc(tmp.entries_size, GFP_KERNEL_ACCOUNT);
+ if (!newinfo->entries) {
+ ret = -ENOMEM;
+ goto free_newinfo;
+ }
+ if (copy_from_user(
+ newinfo->entries, tmp.entries, tmp.entries_size) != 0) {
+ ret = -EFAULT;
+ goto free_entries;
}
- list_add(&watcher->list, &ebt_watchers);
- mutex_unlock(&ebt_mutex);
- return 0;
+ ret = do_replace_finish(net, &tmp, newinfo);
+ if (ret == 0)
+ return ret;
+free_entries:
+ vfree(newinfo->entries);
+free_newinfo:
+ vfree(newinfo);
+ return ret;
}
-void ebt_unregister_watcher(struct ebt_watcher *watcher)
+static void __ebt_unregister_table(struct net *net, struct ebt_table *table)
{
mutex_lock(&ebt_mutex);
- list_del(&watcher->list);
+ list_del(&table->list);
mutex_unlock(&ebt_mutex);
+ audit_log_nfcfg(table->name, AF_BRIDGE, table->private->nentries,
+ AUDIT_XT_OP_UNREGISTER, GFP_KERNEL);
+ EBT_ENTRY_ITERATE(table->private->entries, table->private->entries_size,
+ ebt_cleanup_entry, net, NULL);
+ if (table->private->nentries)
+ module_put(table->me);
+ vfree(table->private->entries);
+ ebt_free_table_info(table->private);
+ vfree(table->private);
+ kfree(table->ops);
+ kfree(table);
}
-int ebt_register_table(struct ebt_table *table)
+int ebt_register_table(struct net *net, const struct ebt_table *input_table,
+ const struct nf_hook_ops *template_ops)
{
+ struct ebt_pernet *ebt_net = net_generic(net, ebt_pernet_id);
struct ebt_table_info *newinfo;
- struct ebt_table *t;
+ struct ebt_table *t, *table;
+ struct nf_hook_ops *ops;
+ unsigned int num_ops;
+ struct ebt_replace_kernel *repl;
int ret, i, countersize;
+ void *p;
- if (!table || !table->table ||!table->table->entries ||
- table->table->entries_size == 0 ||
- table->table->counters || table->private) {
- BUGPRINT("Bad table data for ebt_register_table!!!\n");
+ if (input_table == NULL || (repl = input_table->table) == NULL ||
+ repl->entries == NULL || repl->entries_size == 0 ||
+ repl->counters != NULL || input_table->private != NULL)
return -EINVAL;
+
+ /* Don't add one table to multiple lists. */
+ table = kmemdup(input_table, sizeof(struct ebt_table), GFP_KERNEL);
+ if (!table) {
+ ret = -ENOMEM;
+ goto out;
}
- countersize = COUNTER_OFFSET(table->table->nentries) *
- (highest_possible_processor_id()+1);
+ countersize = COUNTER_OFFSET(repl->nentries) * nr_cpu_ids;
newinfo = vmalloc(sizeof(*newinfo) + countersize);
ret = -ENOMEM;
if (!newinfo)
- return -ENOMEM;
+ goto free_table;
- newinfo->entries = vmalloc(table->table->entries_size);
- if (!(newinfo->entries))
+ p = vmalloc(repl->entries_size);
+ if (!p)
goto free_newinfo;
- memcpy(newinfo->entries, table->table->entries,
- table->table->entries_size);
+ memcpy(p, repl->entries, repl->entries_size);
+ newinfo->entries = p;
+
+ newinfo->entries_size = repl->entries_size;
+ newinfo->nentries = repl->nentries;
if (countersize)
memset(newinfo->counters, 0, countersize);
/* fill in newinfo and parse the entries */
newinfo->chainstack = NULL;
- ret = translate_table(table->table, newinfo);
- if (ret != 0) {
- BUGPRINT("Translate_table failed\n");
- goto free_chainstack;
- }
-
- if (table->check && table->check(newinfo, table->valid_hooks)) {
- BUGPRINT("The table doesn't like its own initial data, lol\n");
- return -EINVAL;
+ for (i = 0; i < NF_BR_NUMHOOKS; i++) {
+ if ((repl->valid_hooks & (1 << i)) == 0)
+ newinfo->hook_entry[i] = NULL;
+ else
+ newinfo->hook_entry[i] = p +
+ ((char *)repl->hook_entry[i] - repl->entries);
}
-
- table->private = newinfo;
- rwlock_init(&table->lock);
- ret = mutex_lock_interruptible(&ebt_mutex);
+ ret = translate_table(net, repl->name, newinfo);
if (ret != 0)
goto free_chainstack;
- list_for_each_entry(t, &ebt_tables, list) {
+ table->private = newinfo;
+ rwlock_init(&table->lock);
+ mutex_lock(&ebt_mutex);
+ list_for_each_entry(t, &ebt_net->tables, list) {
if (strcmp(t->name, table->name) == 0) {
ret = -EEXIST;
- BUGPRINT("Table name already exists\n");
goto free_unlock;
}
}
@@ -1183,77 +1249,158 @@ int ebt_register_table(struct ebt_table *table)
ret = -ENOENT;
goto free_unlock;
}
- list_add(&table->list, &ebt_tables);
+
+ num_ops = hweight32(table->valid_hooks);
+ if (num_ops == 0) {
+ ret = -EINVAL;
+ goto free_unlock;
+ }
+
+ ops = kmemdup_array(template_ops, num_ops, sizeof(*ops), GFP_KERNEL);
+ if (!ops) {
+ ret = -ENOMEM;
+ if (newinfo->nentries)
+ module_put(table->me);
+ goto free_unlock;
+ }
+
+ for (i = 0; i < num_ops; i++)
+ ops[i].priv = table;
+
+ list_add(&table->list, &ebt_net->tables);
mutex_unlock(&ebt_mutex);
- return 0;
+
+ table->ops = ops;
+ ret = nf_register_net_hooks(net, ops, num_ops);
+ if (ret)
+ __ebt_unregister_table(net, table);
+
+ audit_log_nfcfg(repl->name, AF_BRIDGE, repl->nentries,
+ AUDIT_XT_OP_REGISTER, GFP_KERNEL);
+ return ret;
free_unlock:
mutex_unlock(&ebt_mutex);
free_chainstack:
- if (newinfo->chainstack) {
- for_each_possible_cpu(i)
- vfree(newinfo->chainstack[i]);
- vfree(newinfo->chainstack);
- }
+ ebt_free_table_info(newinfo);
vfree(newinfo->entries);
free_newinfo:
vfree(newinfo);
+free_table:
+ kfree(table);
+out:
return ret;
}
-void ebt_unregister_table(struct ebt_table *table)
+int ebt_register_template(const struct ebt_table *t, int (*table_init)(struct net *net))
{
- int i;
+ struct ebt_template *tmpl;
- if (!table) {
- BUGPRINT("Request to unregister NULL table!!!\n");
- return;
+ mutex_lock(&ebt_mutex);
+ list_for_each_entry(tmpl, &template_tables, list) {
+ if (WARN_ON_ONCE(strcmp(t->name, tmpl->name) == 0)) {
+ mutex_unlock(&ebt_mutex);
+ return -EEXIST;
+ }
}
+
+ tmpl = kzalloc(sizeof(*tmpl), GFP_KERNEL);
+ if (!tmpl) {
+ mutex_unlock(&ebt_mutex);
+ return -ENOMEM;
+ }
+
+ tmpl->table_init = table_init;
+ strscpy(tmpl->name, t->name, sizeof(tmpl->name));
+ tmpl->owner = t->me;
+ list_add(&tmpl->list, &template_tables);
+
+ mutex_unlock(&ebt_mutex);
+ return 0;
+}
+EXPORT_SYMBOL(ebt_register_template);
+
+void ebt_unregister_template(const struct ebt_table *t)
+{
+ struct ebt_template *tmpl;
+
mutex_lock(&ebt_mutex);
- list_del(&table->list);
+ list_for_each_entry(tmpl, &template_tables, list) {
+ if (strcmp(t->name, tmpl->name))
+ continue;
+
+ list_del(&tmpl->list);
+ mutex_unlock(&ebt_mutex);
+ kfree(tmpl);
+ return;
+ }
+
mutex_unlock(&ebt_mutex);
- vfree(table->private->entries);
- if (table->private->chainstack) {
- for_each_possible_cpu(i)
- vfree(table->private->chainstack[i]);
- vfree(table->private->chainstack);
+ WARN_ON_ONCE(1);
+}
+EXPORT_SYMBOL(ebt_unregister_template);
+
+static struct ebt_table *__ebt_find_table(struct net *net, const char *name)
+{
+ struct ebt_pernet *ebt_net = net_generic(net, ebt_pernet_id);
+ struct ebt_table *t;
+
+ mutex_lock(&ebt_mutex);
+
+ list_for_each_entry(t, &ebt_net->tables, list) {
+ if (strcmp(t->name, name) == 0) {
+ mutex_unlock(&ebt_mutex);
+ return t;
+ }
}
- vfree(table->private);
+
+ mutex_unlock(&ebt_mutex);
+ return NULL;
+}
+
+void ebt_unregister_table_pre_exit(struct net *net, const char *name)
+{
+ struct ebt_table *table = __ebt_find_table(net, name);
+
+ if (table)
+ nf_unregister_net_hooks(net, table->ops, hweight32(table->valid_hooks));
+}
+EXPORT_SYMBOL(ebt_unregister_table_pre_exit);
+
+void ebt_unregister_table(struct net *net, const char *name)
+{
+ struct ebt_table *table = __ebt_find_table(net, name);
+
+ if (table)
+ __ebt_unregister_table(net, table);
}
/* userspace just supplied us with counters */
-static int update_counters(void __user *user, unsigned int len)
+static int do_update_counters(struct net *net, const char *name,
+ struct ebt_counter __user *counters,
+ unsigned int num_counters, unsigned int len)
{
int i, ret;
struct ebt_counter *tmp;
- struct ebt_replace hlp;
struct ebt_table *t;
- if (copy_from_user(&hlp, user, sizeof(hlp)))
- return -EFAULT;
-
- if (len != sizeof(hlp) + hlp.num_counters * sizeof(struct ebt_counter))
- return -EINVAL;
- if (hlp.num_counters == 0)
+ if (num_counters == 0)
return -EINVAL;
- if (!(tmp = vmalloc(hlp.num_counters * sizeof(*tmp)))) {
- MEMPRINT("Update_counters && nomemory\n");
+ tmp = vmalloc_array(num_counters, sizeof(*tmp));
+ if (!tmp)
return -ENOMEM;
- }
- t = find_table_lock(hlp.name, &ret, &ebt_mutex);
+ t = find_table_lock(net, name, &ret, &ebt_mutex);
if (!t)
goto free_tmp;
- if (hlp.num_counters != t->private->nentries) {
- BUGPRINT("Wrong nr of counters\n");
+ if (num_counters != t->private->nentries) {
ret = -EINVAL;
goto unlock_mutex;
}
- if ( copy_from_user(tmp, hlp.counters,
- hlp.num_counters * sizeof(struct ebt_counter)) ) {
- BUGPRINT("Updata_counters && !cfu\n");
+ if (copy_from_user(tmp, counters,
+ array_size(num_counters, sizeof(*counters)))) {
ret = -EFAULT;
goto unlock_mutex;
}
@@ -1262,10 +1409,8 @@ static int update_counters(void __user *user, unsigned int len)
write_lock_bh(&t->lock);
/* we add to the counters of the first cpu */
- for (i = 0; i < hlp.num_counters; i++) {
- t->private->counters[i].pcnt += tmp[i].pcnt;
- t->private->counters[i].bcnt += tmp[i].bcnt;
- }
+ for (i = 0; i < num_counters; i++)
+ ADD_COUNTER(t->private->counters[i], tmp[i].bcnt, tmp[i].pcnt);
write_unlock_bh(&t->lock);
ret = 0;
@@ -1276,54 +1421,134 @@ free_tmp:
return ret;
}
-static inline int ebt_make_matchname(struct ebt_entry_match *m,
- char *base, char *ubase)
+static int update_counters(struct net *net, sockptr_t arg, unsigned int len)
{
- char *hlp = ubase - base + (char *)m;
- if (copy_to_user(hlp, m->u.match->name, EBT_FUNCTION_MAXNAMELEN))
+ struct ebt_replace hlp;
+
+ if (len < sizeof(hlp))
+ return -EINVAL;
+ if (copy_from_sockptr(&hlp, arg, sizeof(hlp)))
return -EFAULT;
- return 0;
+
+ if (len != sizeof(hlp) + hlp.num_counters * sizeof(struct ebt_counter))
+ return -EINVAL;
+
+ return do_update_counters(net, hlp.name, hlp.counters,
+ hlp.num_counters, len);
}
-static inline int ebt_make_watchername(struct ebt_entry_watcher *w,
- char *base, char *ubase)
+static inline int ebt_obj_to_user(char __user *um, const char *_name,
+ const char *data, int entrysize,
+ int usersize, int datasize, u8 revision)
{
- char *hlp = ubase - base + (char *)w;
- if (copy_to_user(hlp , w->u.watcher->name, EBT_FUNCTION_MAXNAMELEN))
+ char name[EBT_EXTENSION_MAXNAMELEN] = {0};
+
+ /* ebtables expects 31 bytes long names but xt_match names are 29 bytes
+ * long. Copy 29 bytes and fill remaining bytes with zeroes.
+ */
+ strscpy(name, _name, sizeof(name));
+ if (copy_to_user(um, name, EBT_EXTENSION_MAXNAMELEN) ||
+ put_user(revision, (u8 __user *)(um + EBT_EXTENSION_MAXNAMELEN)) ||
+ put_user(datasize, (int __user *)(um + EBT_EXTENSION_MAXNAMELEN + 1)) ||
+ xt_data_to_user(um + entrysize, data, usersize, datasize,
+ XT_ALIGN(datasize)))
return -EFAULT;
+
return 0;
}
-static inline int ebt_make_names(struct ebt_entry *e, char *base, char *ubase)
+static inline int ebt_match_to_user(const struct ebt_entry_match *m,
+ const char *base, char __user *ubase)
+{
+ return ebt_obj_to_user(ubase + ((char *)m - base),
+ m->u.match->name, m->data, sizeof(*m),
+ m->u.match->usersize, m->match_size,
+ m->u.match->revision);
+}
+
+static inline int ebt_watcher_to_user(const struct ebt_entry_watcher *w,
+ const char *base, char __user *ubase)
+{
+ return ebt_obj_to_user(ubase + ((char *)w - base),
+ w->u.watcher->name, w->data, sizeof(*w),
+ w->u.watcher->usersize, w->watcher_size,
+ w->u.watcher->revision);
+}
+
+static inline int ebt_entry_to_user(struct ebt_entry *e, const char *base,
+ char __user *ubase)
{
int ret;
- char *hlp;
- struct ebt_entry_target *t;
+ char __user *hlp;
+ const struct ebt_entry_target *t;
- if ((e->bitmask & EBT_ENTRY_OR_ENTRIES) == 0)
+ if (e->bitmask == 0) {
+ /* special case !EBT_ENTRY_OR_ENTRIES */
+ if (copy_to_user(ubase + ((char *)e - base), e,
+ sizeof(struct ebt_entries)))
+ return -EFAULT;
return 0;
+ }
+
+ if (copy_to_user(ubase + ((char *)e - base), e, sizeof(*e)))
+ return -EFAULT;
+
+ hlp = ubase + (((char *)e + e->target_offset) - base);
+ t = ebt_get_target_c(e);
- hlp = ubase - base + (char *)e + e->target_offset;
- t = (struct ebt_entry_target *)(((char *)e) + e->target_offset);
-
- ret = EBT_MATCH_ITERATE(e, ebt_make_matchname, base, ubase);
+ ret = EBT_MATCH_ITERATE(e, ebt_match_to_user, base, ubase);
if (ret != 0)
return ret;
- ret = EBT_WATCHER_ITERATE(e, ebt_make_watchername, base, ubase);
+ ret = EBT_WATCHER_ITERATE(e, ebt_watcher_to_user, base, ubase);
if (ret != 0)
return ret;
- if (copy_to_user(hlp, t->u.target->name, EBT_FUNCTION_MAXNAMELEN))
- return -EFAULT;
+ ret = ebt_obj_to_user(hlp, t->u.target->name, t->data, sizeof(*t),
+ t->u.target->usersize, t->target_size,
+ t->u.target->revision);
+ if (ret != 0)
+ return ret;
+
return 0;
}
+static int copy_counters_to_user(struct ebt_table *t,
+ const struct ebt_counter *oldcounters,
+ void __user *user, unsigned int num_counters,
+ unsigned int nentries)
+{
+ struct ebt_counter *counterstmp;
+ int ret = 0;
+
+ /* userspace might not need the counters */
+ if (num_counters == 0)
+ return 0;
+
+ if (num_counters != nentries)
+ return -EINVAL;
+
+ counterstmp = vmalloc_array(nentries, sizeof(*counterstmp));
+ if (!counterstmp)
+ return -ENOMEM;
+
+ write_lock_bh(&t->lock);
+ get_counters(oldcounters, counterstmp, nentries);
+ write_unlock_bh(&t->lock);
+
+ if (copy_to_user(user, counterstmp,
+ array_size(nentries, sizeof(struct ebt_counter))))
+ ret = -EFAULT;
+ vfree(counterstmp);
+ return ret;
+}
+
/* called with ebt_mutex locked */
static int copy_everything_to_user(struct ebt_table *t, void __user *user,
- int *len, int cmd)
+ const int *len, int cmd)
{
struct ebt_replace tmp;
- struct ebt_counter *counterstmp, *oldcounters;
+ const struct ebt_counter *oldcounters;
unsigned int entries_size, nentries;
+ int ret;
char *entries;
if (cmd == EBT_SO_GET_ENTRIES) {
@@ -1338,95 +1563,916 @@ static int copy_everything_to_user(struct ebt_table *t, void __user *user,
oldcounters = t->table->counters;
}
- if (copy_from_user(&tmp, user, sizeof(tmp))) {
- BUGPRINT("Cfu didn't work\n");
+ if (copy_from_user(&tmp, user, sizeof(tmp)))
return -EFAULT;
- }
if (*len != sizeof(struct ebt_replace) + entries_size +
- (tmp.num_counters? nentries * sizeof(struct ebt_counter): 0)) {
- BUGPRINT("Wrong size\n");
+ (tmp.num_counters ? nentries * sizeof(struct ebt_counter) : 0))
+ return -EINVAL;
+
+ if (tmp.nentries != nentries)
+ return -EINVAL;
+
+ if (tmp.entries_size != entries_size)
+ return -EINVAL;
+
+ ret = copy_counters_to_user(t, oldcounters, tmp.counters,
+ tmp.num_counters, nentries);
+ if (ret)
+ return ret;
+
+ /* set the match/watcher/target names right */
+ return EBT_ENTRY_ITERATE(entries, entries_size,
+ ebt_entry_to_user, entries, tmp.entries);
+}
+
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+/* 32 bit-userspace compatibility definitions. */
+struct compat_ebt_replace {
+ char name[EBT_TABLE_MAXNAMELEN];
+ compat_uint_t valid_hooks;
+ compat_uint_t nentries;
+ compat_uint_t entries_size;
+ /* start of the chains */
+ compat_uptr_t hook_entry[NF_BR_NUMHOOKS];
+ /* nr of counters userspace expects back */
+ compat_uint_t num_counters;
+ /* where the kernel will put the old counters. */
+ compat_uptr_t counters;
+ compat_uptr_t entries;
+};
+
+/* struct ebt_entry_match, _target and _watcher have same layout */
+struct compat_ebt_entry_mwt {
+ union {
+ struct {
+ char name[EBT_EXTENSION_MAXNAMELEN];
+ u8 revision;
+ };
+ compat_uptr_t ptr;
+ } u;
+ compat_uint_t match_size;
+ compat_uint_t data[] __aligned(__alignof__(struct compat_ebt_replace));
+};
+
+/* account for possible padding between match_size and ->data */
+static int ebt_compat_entry_padsize(void)
+{
+ BUILD_BUG_ON(sizeof(struct ebt_entry_match) <
+ sizeof(struct compat_ebt_entry_mwt));
+ return (int) sizeof(struct ebt_entry_match) -
+ sizeof(struct compat_ebt_entry_mwt);
+}
+
+static int ebt_compat_match_offset(const struct xt_match *match,
+ unsigned int userlen)
+{
+ /* ebt_among needs special handling. The kernel .matchsize is
+ * set to -1 at registration time; at runtime an EBT_ALIGN()ed
+ * value is expected.
+ * Example: userspace sends 4500, ebt_among.c wants 4504.
+ */
+ if (unlikely(match->matchsize == -1))
+ return XT_ALIGN(userlen) - COMPAT_XT_ALIGN(userlen);
+ return xt_compat_match_offset(match);
+}
+
+static int compat_match_to_user(struct ebt_entry_match *m, void __user **dstptr,
+ unsigned int *size)
+{
+ const struct xt_match *match = m->u.match;
+ struct compat_ebt_entry_mwt __user *cm = *dstptr;
+ int off = ebt_compat_match_offset(match, m->match_size);
+ compat_uint_t msize = m->match_size - off;
+
+ if (WARN_ON(off >= m->match_size))
+ return -EINVAL;
+
+ if (copy_to_user(cm->u.name, match->name, strlen(match->name) + 1) ||
+ put_user(match->revision, &cm->u.revision) ||
+ put_user(msize, &cm->match_size))
+ return -EFAULT;
+
+ if (match->compat_to_user) {
+ if (match->compat_to_user(cm->data, m->data))
+ return -EFAULT;
+ } else {
+ if (xt_data_to_user(cm->data, m->data, match->usersize, msize,
+ COMPAT_XT_ALIGN(msize)))
+ return -EFAULT;
+ }
+
+ *size -= ebt_compat_entry_padsize() + off;
+ *dstptr = cm->data;
+ *dstptr += msize;
+ return 0;
+}
+
+static int compat_target_to_user(struct ebt_entry_target *t,
+ void __user **dstptr,
+ unsigned int *size)
+{
+ const struct xt_target *target = t->u.target;
+ struct compat_ebt_entry_mwt __user *cm = *dstptr;
+ int off = xt_compat_target_offset(target);
+ compat_uint_t tsize = t->target_size - off;
+
+ if (WARN_ON(off >= t->target_size))
+ return -EINVAL;
+
+ if (copy_to_user(cm->u.name, target->name, strlen(target->name) + 1) ||
+ put_user(target->revision, &cm->u.revision) ||
+ put_user(tsize, &cm->match_size))
+ return -EFAULT;
+
+ if (target->compat_to_user) {
+ if (target->compat_to_user(cm->data, t->data))
+ return -EFAULT;
+ } else {
+ if (xt_data_to_user(cm->data, t->data, target->usersize, tsize,
+ COMPAT_XT_ALIGN(tsize)))
+ return -EFAULT;
+ }
+
+ *size -= ebt_compat_entry_padsize() + off;
+ *dstptr = cm->data;
+ *dstptr += tsize;
+ return 0;
+}
+
+static int compat_watcher_to_user(struct ebt_entry_watcher *w,
+ void __user **dstptr,
+ unsigned int *size)
+{
+ return compat_target_to_user((struct ebt_entry_target *)w,
+ dstptr, size);
+}
+
+static int compat_copy_entry_to_user(struct ebt_entry *e, void __user **dstptr,
+ unsigned int *size)
+{
+ struct ebt_entry_target *t;
+ struct ebt_entry __user *ce;
+ u32 watchers_offset, target_offset, next_offset;
+ compat_uint_t origsize;
+ int ret;
+
+ if (e->bitmask == 0) {
+ if (*size < sizeof(struct ebt_entries))
+ return -EINVAL;
+ if (copy_to_user(*dstptr, e, sizeof(struct ebt_entries)))
+ return -EFAULT;
+
+ *dstptr += sizeof(struct ebt_entries);
+ *size -= sizeof(struct ebt_entries);
+ return 0;
+ }
+
+ if (*size < sizeof(*ce))
return -EINVAL;
+
+ ce = *dstptr;
+ if (copy_to_user(ce, e, sizeof(*ce)))
+ return -EFAULT;
+
+ origsize = *size;
+ *dstptr += sizeof(*ce);
+
+ ret = EBT_MATCH_ITERATE(e, compat_match_to_user, dstptr, size);
+ if (ret)
+ return ret;
+ watchers_offset = e->watchers_offset - (origsize - *size);
+
+ ret = EBT_WATCHER_ITERATE(e, compat_watcher_to_user, dstptr, size);
+ if (ret)
+ return ret;
+ target_offset = e->target_offset - (origsize - *size);
+
+ t = ebt_get_target(e);
+
+ ret = compat_target_to_user(t, dstptr, size);
+ if (ret)
+ return ret;
+ next_offset = e->next_offset - (origsize - *size);
+
+ if (put_user(watchers_offset, &ce->watchers_offset) ||
+ put_user(target_offset, &ce->target_offset) ||
+ put_user(next_offset, &ce->next_offset))
+ return -EFAULT;
+
+ *size -= sizeof(*ce);
+ return 0;
+}
+
+static int compat_calc_match(struct ebt_entry_match *m, int *off)
+{
+ *off += ebt_compat_match_offset(m->u.match, m->match_size);
+ *off += ebt_compat_entry_padsize();
+ return 0;
+}
+
+static int compat_calc_watcher(struct ebt_entry_watcher *w, int *off)
+{
+ *off += xt_compat_target_offset(w->u.watcher);
+ *off += ebt_compat_entry_padsize();
+ return 0;
+}
+
+static int compat_calc_entry(const struct ebt_entry *e,
+ const struct ebt_table_info *info,
+ const void *base,
+ struct compat_ebt_replace *newinfo)
+{
+ const struct ebt_entry_target *t;
+ unsigned int entry_offset;
+ int off, ret, i;
+
+ if (e->bitmask == 0)
+ return 0;
+
+ off = 0;
+ entry_offset = (void *)e - base;
+
+ EBT_MATCH_ITERATE(e, compat_calc_match, &off);
+ EBT_WATCHER_ITERATE(e, compat_calc_watcher, &off);
+
+ t = ebt_get_target_c(e);
+
+ off += xt_compat_target_offset(t->u.target);
+ off += ebt_compat_entry_padsize();
+
+ newinfo->entries_size -= off;
+
+ ret = xt_compat_add_offset(NFPROTO_BRIDGE, entry_offset, off);
+ if (ret)
+ return ret;
+
+ for (i = 0; i < NF_BR_NUMHOOKS; i++) {
+ const void *hookptr = info->hook_entry[i];
+ if (info->hook_entry[i] &&
+ (e < (struct ebt_entry *)(base - hookptr))) {
+ newinfo->hook_entry[i] -= off;
+ pr_debug("0x%08X -> 0x%08X\n",
+ newinfo->hook_entry[i] + off,
+ newinfo->hook_entry[i]);
+ }
}
- if (tmp.nentries != nentries) {
- BUGPRINT("Nentries wrong\n");
+ return 0;
+}
+
+static int ebt_compat_init_offsets(unsigned int number)
+{
+ if (number > INT_MAX)
return -EINVAL;
+
+ /* also count the base chain policies */
+ number += NF_BR_NUMHOOKS;
+
+ return xt_compat_init_offsets(NFPROTO_BRIDGE, number);
+}
+
+static int compat_table_info(const struct ebt_table_info *info,
+ struct compat_ebt_replace *newinfo)
+{
+ unsigned int size = info->entries_size;
+ const void *entries = info->entries;
+ int ret;
+
+ newinfo->entries_size = size;
+ ret = ebt_compat_init_offsets(info->nentries);
+ if (ret)
+ return ret;
+
+ return EBT_ENTRY_ITERATE(entries, size, compat_calc_entry, info,
+ entries, newinfo);
+}
+
+static int compat_copy_everything_to_user(struct ebt_table *t,
+ void __user *user, int *len, int cmd)
+{
+ struct compat_ebt_replace repl, tmp;
+ struct ebt_counter *oldcounters;
+ struct ebt_table_info tinfo;
+ int ret;
+ void __user *pos;
+
+ memset(&tinfo, 0, sizeof(tinfo));
+
+ if (cmd == EBT_SO_GET_ENTRIES) {
+ tinfo.entries_size = t->private->entries_size;
+ tinfo.nentries = t->private->nentries;
+ tinfo.entries = t->private->entries;
+ oldcounters = t->private->counters;
+ } else {
+ tinfo.entries_size = t->table->entries_size;
+ tinfo.nentries = t->table->nentries;
+ tinfo.entries = t->table->entries;
+ oldcounters = t->table->counters;
}
- if (tmp.entries_size != entries_size) {
- BUGPRINT("Wrong size\n");
+ if (copy_from_user(&tmp, user, sizeof(tmp)))
+ return -EFAULT;
+
+ if (tmp.nentries != tinfo.nentries ||
+ (tmp.num_counters && tmp.num_counters != tinfo.nentries))
+ return -EINVAL;
+
+ memcpy(&repl, &tmp, sizeof(repl));
+ if (cmd == EBT_SO_GET_ENTRIES)
+ ret = compat_table_info(t->private, &repl);
+ else
+ ret = compat_table_info(&tinfo, &repl);
+ if (ret)
+ return ret;
+
+ if (*len != sizeof(tmp) + repl.entries_size +
+ (tmp.num_counters? tinfo.nentries * sizeof(struct ebt_counter): 0)) {
+ pr_err("wrong size: *len %d, entries_size %u, replsz %d\n",
+ *len, tinfo.entries_size, repl.entries_size);
return -EINVAL;
}
/* userspace might not need the counters */
- if (tmp.num_counters) {
- if (tmp.num_counters != nentries) {
- BUGPRINT("Num_counters wrong\n");
- return -EINVAL;
+ ret = copy_counters_to_user(t, oldcounters, compat_ptr(tmp.counters),
+ tmp.num_counters, tinfo.nentries);
+ if (ret)
+ return ret;
+
+ pos = compat_ptr(tmp.entries);
+ return EBT_ENTRY_ITERATE(tinfo.entries, tinfo.entries_size,
+ compat_copy_entry_to_user, &pos, &tmp.entries_size);
+}
+
+struct ebt_entries_buf_state {
+ char *buf_kern_start; /* kernel buffer to copy (translated) data to */
+ u32 buf_kern_len; /* total size of kernel buffer */
+ u32 buf_kern_offset; /* amount of data copied so far */
+ u32 buf_user_offset; /* read position in userspace buffer */
+};
+
+static int ebt_buf_count(struct ebt_entries_buf_state *state, unsigned int sz)
+{
+ state->buf_kern_offset += sz;
+ return state->buf_kern_offset >= sz ? 0 : -EINVAL;
+}
+
+static int ebt_buf_add(struct ebt_entries_buf_state *state,
+ const void *data, unsigned int sz)
+{
+ if (state->buf_kern_start == NULL)
+ goto count_only;
+
+ if (WARN_ON(state->buf_kern_offset + sz > state->buf_kern_len))
+ return -EINVAL;
+
+ memcpy(state->buf_kern_start + state->buf_kern_offset, data, sz);
+
+ count_only:
+ state->buf_user_offset += sz;
+ return ebt_buf_count(state, sz);
+}
+
+static int ebt_buf_add_pad(struct ebt_entries_buf_state *state, unsigned int sz)
+{
+ char *b = state->buf_kern_start;
+
+ if (WARN_ON(b && state->buf_kern_offset > state->buf_kern_len))
+ return -EINVAL;
+
+ if (b != NULL && sz > 0)
+ memset(b + state->buf_kern_offset, 0, sz);
+ /* do not adjust ->buf_user_offset here, we added kernel-side padding */
+ return ebt_buf_count(state, sz);
+}
+
+enum compat_mwt {
+ EBT_COMPAT_MATCH,
+ EBT_COMPAT_WATCHER,
+ EBT_COMPAT_TARGET,
+};
+
+static int compat_mtw_from_user(const struct compat_ebt_entry_mwt *mwt,
+ enum compat_mwt compat_mwt,
+ struct ebt_entries_buf_state *state,
+ const unsigned char *base)
+{
+ char name[EBT_EXTENSION_MAXNAMELEN];
+ struct xt_match *match;
+ struct xt_target *wt;
+ void *dst = NULL;
+ int off, pad = 0;
+ unsigned int size_kern, match_size = mwt->match_size;
+
+ if (strscpy(name, mwt->u.name, sizeof(name)) < 0)
+ return -EINVAL;
+
+ if (state->buf_kern_start)
+ dst = state->buf_kern_start + state->buf_kern_offset;
+
+ switch (compat_mwt) {
+ case EBT_COMPAT_MATCH:
+ match = xt_request_find_match(NFPROTO_BRIDGE, name,
+ mwt->u.revision);
+ if (IS_ERR(match))
+ return PTR_ERR(match);
+
+ off = ebt_compat_match_offset(match, match_size);
+ if (dst) {
+ if (match->compat_from_user)
+ match->compat_from_user(dst, mwt->data);
+ else
+ memcpy(dst, mwt->data, match_size);
}
- counterstmp = vmalloc(nentries * sizeof(*counterstmp));
- if (!counterstmp) {
- MEMPRINT("Couldn't copy counters, out of memory\n");
- return -ENOMEM;
+
+ size_kern = match->matchsize;
+ if (unlikely(size_kern == -1))
+ size_kern = match_size;
+ module_put(match->me);
+ break;
+ case EBT_COMPAT_WATCHER:
+ case EBT_COMPAT_TARGET:
+ wt = xt_request_find_target(NFPROTO_BRIDGE, name,
+ mwt->u.revision);
+ if (IS_ERR(wt))
+ return PTR_ERR(wt);
+ off = xt_compat_target_offset(wt);
+
+ if (dst) {
+ if (wt->compat_from_user)
+ wt->compat_from_user(dst, mwt->data);
+ else
+ memcpy(dst, mwt->data, match_size);
}
- write_lock_bh(&t->lock);
- get_counters(oldcounters, counterstmp, nentries);
- write_unlock_bh(&t->lock);
-
- if (copy_to_user(tmp.counters, counterstmp,
- nentries * sizeof(struct ebt_counter))) {
- BUGPRINT("Couldn't copy counters to userspace\n");
- vfree(counterstmp);
- return -EFAULT;
+
+ size_kern = wt->targetsize;
+ module_put(wt->me);
+ break;
+
+ default:
+ return -EINVAL;
+ }
+
+ state->buf_kern_offset += match_size + off;
+ state->buf_user_offset += match_size;
+ pad = XT_ALIGN(size_kern) - size_kern;
+
+ if (pad > 0 && dst) {
+ if (WARN_ON(state->buf_kern_len <= pad))
+ return -EINVAL;
+ if (WARN_ON(state->buf_kern_offset - (match_size + off) + size_kern > state->buf_kern_len - pad))
+ return -EINVAL;
+ memset(dst + size_kern, 0, pad);
+ }
+ return off + match_size;
+}
+
+/* return size of all matches, watchers or target, including necessary
+ * alignment and padding.
+ */
+static int ebt_size_mwt(const struct compat_ebt_entry_mwt *match32,
+ unsigned int size_left, enum compat_mwt type,
+ struct ebt_entries_buf_state *state, const void *base)
+{
+ const char *buf = (const char *)match32;
+ int growth = 0;
+
+ if (size_left == 0)
+ return 0;
+
+ do {
+ struct ebt_entry_match *match_kern;
+ int ret;
+
+ if (size_left < sizeof(*match32))
+ return -EINVAL;
+
+ match_kern = (struct ebt_entry_match *) state->buf_kern_start;
+ if (match_kern) {
+ char *tmp;
+ tmp = state->buf_kern_start + state->buf_kern_offset;
+ match_kern = (struct ebt_entry_match *) tmp;
+ }
+ ret = ebt_buf_add(state, buf, sizeof(*match32));
+ if (ret < 0)
+ return ret;
+ size_left -= sizeof(*match32);
+
+ /* add padding before match->data (if any) */
+ ret = ebt_buf_add_pad(state, ebt_compat_entry_padsize());
+ if (ret < 0)
+ return ret;
+
+ if (match32->match_size > size_left)
+ return -EINVAL;
+
+ size_left -= match32->match_size;
+
+ ret = compat_mtw_from_user(match32, type, state, base);
+ if (ret < 0)
+ return ret;
+
+ if (WARN_ON(ret < match32->match_size))
+ return -EINVAL;
+ growth += ret - match32->match_size;
+ growth += ebt_compat_entry_padsize();
+
+ buf += sizeof(*match32);
+ buf += match32->match_size;
+
+ if (match_kern)
+ match_kern->match_size = ret;
+
+ match32 = (struct compat_ebt_entry_mwt *) buf;
+ } while (size_left);
+
+ return growth;
+}
+
+/* called for all ebt_entry structures. */
+static int size_entry_mwt(const struct ebt_entry *entry, const unsigned char *base,
+ unsigned int *total,
+ struct ebt_entries_buf_state *state)
+{
+ unsigned int i, j, startoff, next_expected_off, new_offset = 0;
+ /* stores match/watchers/targets & offset of next struct ebt_entry: */
+ unsigned int offsets[4];
+ unsigned int *offsets_update = NULL;
+ int ret;
+ char *buf_start;
+
+ if (*total < sizeof(struct ebt_entries))
+ return -EINVAL;
+
+ if (!entry->bitmask) {
+ *total -= sizeof(struct ebt_entries);
+ return ebt_buf_add(state, entry, sizeof(struct ebt_entries));
+ }
+ if (*total < sizeof(*entry) || entry->next_offset < sizeof(*entry))
+ return -EINVAL;
+
+ startoff = state->buf_user_offset;
+ /* pull in most part of ebt_entry, it does not need to be changed. */
+ ret = ebt_buf_add(state, entry,
+ offsetof(struct ebt_entry, watchers_offset));
+ if (ret < 0)
+ return ret;
+
+ offsets[0] = sizeof(struct ebt_entry); /* matches come first */
+ memcpy(&offsets[1], &entry->offsets, sizeof(entry->offsets));
+
+ if (state->buf_kern_start) {
+ buf_start = state->buf_kern_start + state->buf_kern_offset;
+ offsets_update = (unsigned int *) buf_start;
+ }
+ ret = ebt_buf_add(state, &offsets[1],
+ sizeof(offsets) - sizeof(offsets[0]));
+ if (ret < 0)
+ return ret;
+ buf_start = (char *) entry;
+ /* 0: matches offset, always follows ebt_entry.
+ * 1: watchers offset, from ebt_entry structure
+ * 2: target offset, from ebt_entry structure
+ * 3: next ebt_entry offset, from ebt_entry structure
+ *
+ * offsets are relative to beginning of struct ebt_entry (i.e., 0).
+ */
+ for (i = 0; i < 4 ; ++i) {
+ if (offsets[i] > *total)
+ return -EINVAL;
+
+ if (i < 3 && offsets[i] == *total)
+ return -EINVAL;
+
+ if (i == 0)
+ continue;
+ if (offsets[i-1] > offsets[i])
+ return -EINVAL;
+ }
+
+ for (i = 0, j = 1 ; j < 4 ; j++, i++) {
+ struct compat_ebt_entry_mwt *match32;
+ unsigned int size;
+ char *buf = buf_start + offsets[i];
+
+ if (offsets[i] > offsets[j])
+ return -EINVAL;
+
+ match32 = (struct compat_ebt_entry_mwt *) buf;
+ size = offsets[j] - offsets[i];
+ ret = ebt_size_mwt(match32, size, i, state, base);
+ if (ret < 0)
+ return ret;
+ new_offset += ret;
+ if (offsets_update && new_offset) {
+ pr_debug("change offset %d to %d\n",
+ offsets_update[i], offsets[j] + new_offset);
+ offsets_update[i] = offsets[j] + new_offset;
}
- vfree(counterstmp);
}
- if (copy_to_user(tmp.entries, entries, entries_size)) {
- BUGPRINT("Couldn't copy entries to userspace\n");
+ if (state->buf_kern_start == NULL) {
+ unsigned int offset = buf_start - (char *) base;
+
+ ret = xt_compat_add_offset(NFPROTO_BRIDGE, offset, new_offset);
+ if (ret < 0)
+ return ret;
+ }
+
+ next_expected_off = state->buf_user_offset - startoff;
+ if (next_expected_off != entry->next_offset)
+ return -EINVAL;
+
+ if (*total < entry->next_offset)
+ return -EINVAL;
+ *total -= entry->next_offset;
+ return 0;
+}
+
+/* repl->entries_size is the size of the ebt_entry blob in userspace.
+ * It might need more memory when copied to a 64 bit kernel in case
+ * userspace is 32-bit. So, first task: find out how much memory is needed.
+ *
+ * Called before validation is performed.
+ */
+static int compat_copy_entries(unsigned char *data, unsigned int size_user,
+ struct ebt_entries_buf_state *state)
+{
+ unsigned int size_remaining = size_user;
+ int ret;
+
+ ret = EBT_ENTRY_ITERATE(data, size_user, size_entry_mwt, data,
+ &size_remaining, state);
+ if (ret < 0)
+ return ret;
+
+ if (size_remaining)
+ return -EINVAL;
+
+ return state->buf_kern_offset;
+}
+
+
+static int compat_copy_ebt_replace_from_user(struct ebt_replace *repl,
+ sockptr_t arg, unsigned int len)
+{
+ struct compat_ebt_replace tmp;
+ int i;
+
+ if (len < sizeof(tmp))
+ return -EINVAL;
+
+ if (copy_from_sockptr(&tmp, arg, sizeof(tmp)))
return -EFAULT;
+
+ if (len != sizeof(tmp) + tmp.entries_size)
+ return -EINVAL;
+
+ if (tmp.entries_size == 0)
+ return -EINVAL;
+
+ if (tmp.nentries >= ((INT_MAX - sizeof(struct ebt_table_info)) /
+ NR_CPUS - SMP_CACHE_BYTES) / sizeof(struct ebt_counter))
+ return -ENOMEM;
+ if (tmp.num_counters >= INT_MAX / sizeof(struct ebt_counter))
+ return -ENOMEM;
+
+ memcpy(repl, &tmp, offsetof(struct ebt_replace, hook_entry));
+
+ /* starting with hook_entry, 32 vs. 64 bit structures are different */
+ for (i = 0; i < NF_BR_NUMHOOKS; i++)
+ repl->hook_entry[i] = compat_ptr(tmp.hook_entry[i]);
+
+ repl->num_counters = tmp.num_counters;
+ repl->counters = compat_ptr(tmp.counters);
+ repl->entries = compat_ptr(tmp.entries);
+ return 0;
+}
+
+static int compat_do_replace(struct net *net, sockptr_t arg, unsigned int len)
+{
+ int ret, i, countersize, size64;
+ struct ebt_table_info *newinfo;
+ struct ebt_replace tmp;
+ struct ebt_entries_buf_state state;
+ void *entries_tmp;
+
+ ret = compat_copy_ebt_replace_from_user(&tmp, arg, len);
+ if (ret) {
+ /* try real handler in case userland supplied needed padding */
+ if (ret == -EINVAL && do_replace(net, arg, len) == 0)
+ ret = 0;
+ return ret;
}
- /* set the match/watcher/target names right */
- return EBT_ENTRY_ITERATE(entries, entries_size,
- ebt_make_names, entries, tmp.entries);
+
+ countersize = COUNTER_OFFSET(tmp.nentries) * nr_cpu_ids;
+ newinfo = vmalloc(sizeof(*newinfo) + countersize);
+ if (!newinfo)
+ return -ENOMEM;
+
+ if (countersize)
+ memset(newinfo->counters, 0, countersize);
+
+ memset(&state, 0, sizeof(state));
+
+ newinfo->entries = vmalloc(tmp.entries_size);
+ if (!newinfo->entries) {
+ ret = -ENOMEM;
+ goto free_newinfo;
+ }
+ if (copy_from_user(
+ newinfo->entries, tmp.entries, tmp.entries_size) != 0) {
+ ret = -EFAULT;
+ goto free_entries;
+ }
+
+ entries_tmp = newinfo->entries;
+
+ xt_compat_lock(NFPROTO_BRIDGE);
+
+ ret = ebt_compat_init_offsets(tmp.nentries);
+ if (ret < 0)
+ goto out_unlock;
+
+ ret = compat_copy_entries(entries_tmp, tmp.entries_size, &state);
+ if (ret < 0)
+ goto out_unlock;
+
+ pr_debug("tmp.entries_size %d, kern off %d, user off %d delta %d\n",
+ tmp.entries_size, state.buf_kern_offset, state.buf_user_offset,
+ xt_compat_calc_jump(NFPROTO_BRIDGE, tmp.entries_size));
+
+ size64 = ret;
+ newinfo->entries = vmalloc(size64);
+ if (!newinfo->entries) {
+ vfree(entries_tmp);
+ ret = -ENOMEM;
+ goto out_unlock;
+ }
+
+ memset(&state, 0, sizeof(state));
+ state.buf_kern_start = newinfo->entries;
+ state.buf_kern_len = size64;
+
+ ret = compat_copy_entries(entries_tmp, tmp.entries_size, &state);
+ if (WARN_ON(ret < 0)) {
+ vfree(entries_tmp);
+ goto out_unlock;
+ }
+
+ vfree(entries_tmp);
+ tmp.entries_size = size64;
+
+ for (i = 0; i < NF_BR_NUMHOOKS; i++) {
+ char __user *usrptr;
+ if (tmp.hook_entry[i]) {
+ unsigned int delta;
+ usrptr = (char __user *) tmp.hook_entry[i];
+ delta = usrptr - tmp.entries;
+ usrptr += xt_compat_calc_jump(NFPROTO_BRIDGE, delta);
+ tmp.hook_entry[i] = (struct ebt_entries __user *)usrptr;
+ }
+ }
+
+ xt_compat_flush_offsets(NFPROTO_BRIDGE);
+ xt_compat_unlock(NFPROTO_BRIDGE);
+
+ ret = do_replace_finish(net, &tmp, newinfo);
+ if (ret == 0)
+ return ret;
+free_entries:
+ vfree(newinfo->entries);
+free_newinfo:
+ vfree(newinfo);
+ return ret;
+out_unlock:
+ xt_compat_flush_offsets(NFPROTO_BRIDGE);
+ xt_compat_unlock(NFPROTO_BRIDGE);
+ goto free_entries;
+}
+
+static int compat_update_counters(struct net *net, sockptr_t arg,
+ unsigned int len)
+{
+ struct compat_ebt_replace hlp;
+
+ if (len < sizeof(hlp))
+ return -EINVAL;
+ if (copy_from_sockptr(&hlp, arg, sizeof(hlp)))
+ return -EFAULT;
+
+ /* try real handler in case userland supplied needed padding */
+ if (len != sizeof(hlp) + hlp.num_counters * sizeof(struct ebt_counter))
+ return update_counters(net, arg, len);
+
+ return do_update_counters(net, hlp.name, compat_ptr(hlp.counters),
+ hlp.num_counters, len);
}
-static int do_ebt_set_ctl(struct sock *sk,
- int cmd, void __user *user, unsigned int len)
+static int compat_do_ebt_get_ctl(struct sock *sk, int cmd,
+ void __user *user, int *len)
{
int ret;
+ struct compat_ebt_replace tmp;
+ struct ebt_table *t;
+ struct net *net = sock_net(sk);
- switch(cmd) {
- case EBT_SO_SET_ENTRIES:
- ret = do_replace(user, len);
+ if ((cmd == EBT_SO_GET_INFO || cmd == EBT_SO_GET_INIT_INFO) &&
+ *len != sizeof(struct compat_ebt_replace))
+ return -EINVAL;
+
+ if (copy_from_user(&tmp, user, sizeof(tmp)))
+ return -EFAULT;
+
+ tmp.name[sizeof(tmp.name) - 1] = '\0';
+
+ t = find_table_lock(net, tmp.name, &ret, &ebt_mutex);
+ if (!t)
+ return ret;
+
+ xt_compat_lock(NFPROTO_BRIDGE);
+ switch (cmd) {
+ case EBT_SO_GET_INFO:
+ tmp.nentries = t->private->nentries;
+ ret = compat_table_info(t->private, &tmp);
+ if (ret)
+ goto out;
+ tmp.valid_hooks = t->valid_hooks;
+
+ if (copy_to_user(user, &tmp, *len) != 0) {
+ ret = -EFAULT;
+ break;
+ }
+ ret = 0;
break;
- case EBT_SO_SET_COUNTERS:
- ret = update_counters(user, len);
+ case EBT_SO_GET_INIT_INFO:
+ tmp.nentries = t->table->nentries;
+ tmp.entries_size = t->table->entries_size;
+ tmp.valid_hooks = t->table->valid_hooks;
+
+ if (copy_to_user(user, &tmp, *len) != 0) {
+ ret = -EFAULT;
+ break;
+ }
+ ret = 0;
+ break;
+ case EBT_SO_GET_ENTRIES:
+ case EBT_SO_GET_INIT_ENTRIES:
+ /* try real handler first in case of userland-side padding.
+ * in case we are dealing with an 'ordinary' 32 bit binary
+ * without 64bit compatibility padding, this will fail right
+ * after copy_from_user when the *len argument is validated.
+ *
+ * the compat_ variant needs to do one pass over the kernel
+ * data set to adjust for size differences before it the check.
+ */
+ if (copy_everything_to_user(t, user, len, cmd) == 0)
+ ret = 0;
+ else
+ ret = compat_copy_everything_to_user(t, user, len, cmd);
break;
default:
ret = -EINVAL;
- }
+ }
+ out:
+ xt_compat_flush_offsets(NFPROTO_BRIDGE);
+ xt_compat_unlock(NFPROTO_BRIDGE);
+ mutex_unlock(&ebt_mutex);
return ret;
}
+#endif
static int do_ebt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
{
- int ret;
+ struct net *net = sock_net(sk);
struct ebt_replace tmp;
struct ebt_table *t;
+ int ret;
+
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+ /* try real handler in case userland supplied needed padding */
+ if (in_compat_syscall() &&
+ ((cmd != EBT_SO_GET_INFO && cmd != EBT_SO_GET_INIT_INFO) ||
+ *len != sizeof(tmp)))
+ return compat_do_ebt_get_ctl(sk, cmd, user, len);
+#endif
if (copy_from_user(&tmp, user, sizeof(tmp)))
return -EFAULT;
- t = find_table_lock(tmp.name, &ret, &ebt_mutex);
+ tmp.name[sizeof(tmp.name) - 1] = '\0';
+
+ t = find_table_lock(net, tmp.name, &ret, &ebt_mutex);
if (!t)
return ret;
- switch(cmd) {
+ switch (cmd) {
case EBT_SO_GET_INFO:
case EBT_SO_GET_INIT_INFO:
- if (*len != sizeof(struct ebt_replace)){
+ if (*len != sizeof(struct ebt_replace)) {
ret = -EINVAL;
mutex_unlock(&ebt_mutex);
break;
@@ -1441,8 +2487,7 @@ static int do_ebt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
tmp.valid_hooks = t->table->valid_hooks;
}
mutex_unlock(&ebt_mutex);
- if (copy_to_user(user, &tmp, *len) != 0){
- BUGPRINT("c2u Didn't work\n");
+ if (copy_to_user(user, &tmp, *len) != 0) {
ret = -EFAULT;
break;
}
@@ -1463,8 +2508,39 @@ static int do_ebt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
return ret;
}
-static struct nf_sockopt_ops ebt_sockopts =
+static int do_ebt_set_ctl(struct sock *sk, int cmd, sockptr_t arg,
+ unsigned int len)
{
+ struct net *net = sock_net(sk);
+ int ret;
+
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ switch (cmd) {
+ case EBT_SO_SET_ENTRIES:
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+ if (in_compat_syscall())
+ ret = compat_do_replace(net, arg, len);
+ else
+#endif
+ ret = do_replace(net, arg, len);
+ break;
+ case EBT_SO_SET_COUNTERS:
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+ if (in_compat_syscall())
+ ret = compat_update_counters(net, arg, len);
+ else
+#endif
+ ret = update_counters(net, arg, len);
+ break;
+ default:
+ ret = -EINVAL;
+ }
+ return ret;
+}
+
+static struct nf_sockopt_ops ebt_sockopts = {
.pf = PF_INET,
.set_optmin = EBT_BASE_CTL,
.set_optmax = EBT_SO_SET_MAX + 1,
@@ -1472,37 +2548,57 @@ static struct nf_sockopt_ops ebt_sockopts =
.get_optmin = EBT_BASE_CTL,
.get_optmax = EBT_SO_GET_MAX + 1,
.get = do_ebt_get_ctl,
+ .owner = THIS_MODULE,
+};
+
+static int __net_init ebt_pernet_init(struct net *net)
+{
+ struct ebt_pernet *ebt_net = net_generic(net, ebt_pernet_id);
+
+ INIT_LIST_HEAD(&ebt_net->tables);
+ return 0;
+}
+
+static struct pernet_operations ebt_net_ops = {
+ .init = ebt_pernet_init,
+ .id = &ebt_pernet_id,
+ .size = sizeof(struct ebt_pernet),
};
static int __init ebtables_init(void)
{
int ret;
- mutex_lock(&ebt_mutex);
- list_add(&ebt_standard_target.list, &ebt_targets);
- mutex_unlock(&ebt_mutex);
- if ((ret = nf_register_sockopt(&ebt_sockopts)) < 0)
+ ret = xt_register_target(&ebt_standard_target);
+ if (ret < 0)
+ return ret;
+ ret = nf_register_sockopt(&ebt_sockopts);
+ if (ret < 0) {
+ xt_unregister_target(&ebt_standard_target);
+ return ret;
+ }
+
+ ret = register_pernet_subsys(&ebt_net_ops);
+ if (ret < 0) {
+ nf_unregister_sockopt(&ebt_sockopts);
+ xt_unregister_target(&ebt_standard_target);
return ret;
+ }
- printk(KERN_NOTICE "Ebtables v2.0 registered\n");
return 0;
}
-static void __exit ebtables_fini(void)
+static void ebtables_fini(void)
{
nf_unregister_sockopt(&ebt_sockopts);
- printk(KERN_NOTICE "Ebtables v2.0 unregistered\n");
+ xt_unregister_target(&ebt_standard_target);
+ unregister_pernet_subsys(&ebt_net_ops);
}
EXPORT_SYMBOL(ebt_register_table);
EXPORT_SYMBOL(ebt_unregister_table);
-EXPORT_SYMBOL(ebt_register_match);
-EXPORT_SYMBOL(ebt_unregister_match);
-EXPORT_SYMBOL(ebt_register_watcher);
-EXPORT_SYMBOL(ebt_unregister_watcher);
-EXPORT_SYMBOL(ebt_register_target);
-EXPORT_SYMBOL(ebt_unregister_target);
EXPORT_SYMBOL(ebt_do_table);
module_init(ebtables_init);
module_exit(ebtables_fini);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("ebtables legacy core");
diff --git a/net/bridge/netfilter/nf_conntrack_bridge.c b/net/bridge/netfilter/nf_conntrack_bridge.c
new file mode 100644
index 000000000000..6482de4d8750
--- /dev/null
+++ b/net/bridge/netfilter/nf_conntrack_bridge.c
@@ -0,0 +1,455 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/types.h>
+#include <linux/ip.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter_bridge.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/icmp.h>
+#include <linux/sysctl.h>
+#include <net/route.h>
+#include <net/ip.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_bridge.h>
+
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+
+#include "../br_private.h"
+
+/* Best effort variant of ip_do_fragment which preserves geometry, unless skbuff
+ * has been linearized or cloned.
+ */
+static int nf_br_ip_fragment(struct net *net, struct sock *sk,
+ struct sk_buff *skb,
+ struct nf_bridge_frag_data *data,
+ int (*output)(struct net *, struct sock *sk,
+ const struct nf_bridge_frag_data *data,
+ struct sk_buff *))
+{
+ int frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size;
+ u8 tstamp_type = skb->tstamp_type;
+ unsigned int hlen, ll_rs, mtu;
+ ktime_t tstamp = skb->tstamp;
+ struct ip_frag_state state;
+ struct iphdr *iph;
+ int err = 0;
+
+ /* for offloaded checksums cleanup checksum before fragmentation */
+ if (skb->ip_summed == CHECKSUM_PARTIAL &&
+ (err = skb_checksum_help(skb)))
+ goto blackhole;
+
+ iph = ip_hdr(skb);
+
+ /*
+ * Setup starting values
+ */
+
+ hlen = iph->ihl * 4;
+ frag_max_size -= hlen;
+ ll_rs = LL_RESERVED_SPACE(skb->dev);
+ mtu = skb->dev->mtu;
+
+ if (skb_has_frag_list(skb)) {
+ unsigned int first_len = skb_pagelen(skb);
+ struct ip_fraglist_iter iter;
+ struct sk_buff *frag;
+
+ if (first_len - hlen > mtu)
+ goto blackhole;
+
+ if (skb_cloned(skb) ||
+ skb_headroom(skb) < ll_rs)
+ goto slow_path;
+
+ skb_walk_frags(skb, frag) {
+ if (frag->len > mtu)
+ goto blackhole;
+
+ if (skb_shared(frag) ||
+ skb_headroom(frag) < hlen + ll_rs)
+ goto slow_path;
+ }
+
+ ip_fraglist_init(skb, iph, hlen, &iter);
+
+ for (;;) {
+ if (iter.frag)
+ ip_fraglist_prepare(skb, &iter);
+
+ skb_set_delivery_time(skb, tstamp, tstamp_type);
+ err = output(net, sk, data, skb);
+ if (err || !iter.frag)
+ break;
+
+ skb = ip_fraglist_next(&iter);
+ }
+
+ if (!err)
+ return 0;
+
+ kfree_skb_list(iter.frag);
+
+ return err;
+ }
+slow_path:
+ /* This is a linearized skbuff, the original geometry is lost for us.
+ * This may also be a clone skbuff, we could preserve the geometry for
+ * the copies but probably not worth the effort.
+ */
+ ip_frag_init(skb, hlen, ll_rs, frag_max_size, false, &state);
+
+ while (state.left > 0) {
+ struct sk_buff *skb2;
+
+ skb2 = ip_frag_next(skb, &state);
+ if (IS_ERR(skb2)) {
+ err = PTR_ERR(skb2);
+ goto blackhole;
+ }
+
+ skb_set_delivery_time(skb2, tstamp, tstamp_type);
+ err = output(net, sk, data, skb2);
+ if (err)
+ goto blackhole;
+ }
+ consume_skb(skb);
+ return err;
+
+blackhole:
+ kfree_skb(skb);
+ return 0;
+}
+
+/* ip_defrag() expects IPCB() in place. */
+static void br_skb_cb_save(struct sk_buff *skb, struct br_input_skb_cb *cb,
+ size_t inet_skb_parm_size)
+{
+ memcpy(cb, skb->cb, sizeof(*cb));
+ memset(skb->cb, 0, inet_skb_parm_size);
+}
+
+static void br_skb_cb_restore(struct sk_buff *skb,
+ const struct br_input_skb_cb *cb,
+ u16 fragsz)
+{
+ memcpy(skb->cb, cb, sizeof(*cb));
+ BR_INPUT_SKB_CB(skb)->frag_max_size = fragsz;
+}
+
+static unsigned int nf_ct_br_defrag4(struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ u16 zone_id = NF_CT_DEFAULT_ZONE_ID;
+ enum ip_conntrack_info ctinfo;
+ struct br_input_skb_cb cb;
+ const struct nf_conn *ct;
+ int err;
+
+ if (!ip_is_fragment(ip_hdr(skb)))
+ return NF_ACCEPT;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (ct)
+ zone_id = nf_ct_zone_id(nf_ct_zone(ct), CTINFO2DIR(ctinfo));
+
+ br_skb_cb_save(skb, &cb, sizeof(struct inet_skb_parm));
+ local_bh_disable();
+ err = ip_defrag(state->net, skb,
+ IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone_id);
+ local_bh_enable();
+ if (!err) {
+ br_skb_cb_restore(skb, &cb, IPCB(skb)->frag_max_size);
+ skb->ignore_df = 1;
+ return NF_ACCEPT;
+ }
+
+ return NF_STOLEN;
+}
+
+static unsigned int nf_ct_br_defrag6(struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
+ u16 zone_id = NF_CT_DEFAULT_ZONE_ID;
+ enum ip_conntrack_info ctinfo;
+ struct br_input_skb_cb cb;
+ const struct nf_conn *ct;
+ int err;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (ct)
+ zone_id = nf_ct_zone_id(nf_ct_zone(ct), CTINFO2DIR(ctinfo));
+
+ br_skb_cb_save(skb, &cb, sizeof(struct inet6_skb_parm));
+
+ err = nf_ct_frag6_gather(state->net, skb,
+ IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone_id);
+ /* queued */
+ if (err == -EINPROGRESS)
+ return NF_STOLEN;
+
+ br_skb_cb_restore(skb, &cb, IP6CB(skb)->frag_max_size);
+ return err == 0 ? NF_ACCEPT : NF_DROP;
+#else
+ return NF_ACCEPT;
+#endif
+}
+
+static int nf_ct_br_ip_check(const struct sk_buff *skb)
+{
+ const struct iphdr *iph;
+ int nhoff, len;
+
+ nhoff = skb_network_offset(skb);
+ iph = ip_hdr(skb);
+ if (iph->ihl < 5 ||
+ iph->version != 4)
+ return -1;
+
+ len = skb_ip_totlen(skb);
+ if (skb->len < nhoff + len ||
+ len < (iph->ihl * 4))
+ return -1;
+
+ return 0;
+}
+
+static int nf_ct_br_ipv6_check(const struct sk_buff *skb)
+{
+ const struct ipv6hdr *hdr;
+ int nhoff, len;
+
+ nhoff = skb_network_offset(skb);
+ hdr = ipv6_hdr(skb);
+ if (hdr->version != 6)
+ return -1;
+
+ len = ntohs(hdr->payload_len) + sizeof(struct ipv6hdr) + nhoff;
+ if (skb->len < len)
+ return -1;
+
+ return 0;
+}
+
+static unsigned int nf_ct_bridge_pre(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct nf_hook_state bridge_state = *state;
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
+ u32 len;
+ int ret;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if ((ct && !nf_ct_is_template(ct)) ||
+ ctinfo == IP_CT_UNTRACKED)
+ return NF_ACCEPT;
+
+ switch (skb->protocol) {
+ case htons(ETH_P_IP):
+ if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+ return NF_ACCEPT;
+
+ len = skb_ip_totlen(skb);
+ if (pskb_trim_rcsum(skb, len))
+ return NF_ACCEPT;
+
+ if (nf_ct_br_ip_check(skb))
+ return NF_ACCEPT;
+
+ bridge_state.pf = NFPROTO_IPV4;
+ ret = nf_ct_br_defrag4(skb, &bridge_state);
+ break;
+ case htons(ETH_P_IPV6):
+ if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
+ return NF_ACCEPT;
+
+ len = sizeof(struct ipv6hdr) + ntohs(ipv6_hdr(skb)->payload_len);
+ if (pskb_trim_rcsum(skb, len))
+ return NF_ACCEPT;
+
+ if (nf_ct_br_ipv6_check(skb))
+ return NF_ACCEPT;
+
+ bridge_state.pf = NFPROTO_IPV6;
+ ret = nf_ct_br_defrag6(skb, &bridge_state);
+ break;
+ default:
+ nf_ct_set(skb, NULL, IP_CT_UNTRACKED);
+ return NF_ACCEPT;
+ }
+
+ if (ret != NF_ACCEPT)
+ return ret;
+
+ return nf_conntrack_in(skb, &bridge_state);
+}
+
+static unsigned int nf_ct_bridge_in(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ bool promisc = BR_INPUT_SKB_CB(skb)->promisc;
+ struct nf_conntrack *nfct = skb_nfct(skb);
+ struct nf_conn *ct;
+
+ if (promisc) {
+ nf_reset_ct(skb);
+ return NF_ACCEPT;
+ }
+
+ if (!nfct || skb->pkt_type == PACKET_HOST)
+ return NF_ACCEPT;
+
+ /* nf_conntrack_confirm() cannot handle concurrent clones,
+ * this happens for broad/multicast frames with e.g. macvlan on top
+ * of the bridge device.
+ */
+ ct = container_of(nfct, struct nf_conn, ct_general);
+ if (nf_ct_is_confirmed(ct) || nf_ct_is_template(ct))
+ return NF_ACCEPT;
+
+ /* let inet prerouting call conntrack again */
+ skb->_nfct = 0;
+ nf_ct_put(ct);
+
+ return NF_ACCEPT;
+}
+
+static void nf_ct_bridge_frag_save(struct sk_buff *skb,
+ struct nf_bridge_frag_data *data)
+{
+ if (skb_vlan_tag_present(skb)) {
+ data->vlan_present = true;
+ data->vlan_tci = skb->vlan_tci;
+ data->vlan_proto = skb->vlan_proto;
+ } else {
+ data->vlan_present = false;
+ }
+ skb_copy_from_linear_data_offset(skb, -ETH_HLEN, data->mac, ETH_HLEN);
+}
+
+static unsigned int
+nf_ct_bridge_refrag(struct sk_buff *skb, const struct nf_hook_state *state,
+ int (*output)(struct net *, struct sock *sk,
+ const struct nf_bridge_frag_data *data,
+ struct sk_buff *))
+{
+ struct nf_bridge_frag_data data;
+
+ if (!BR_INPUT_SKB_CB(skb)->frag_max_size)
+ return NF_ACCEPT;
+
+ nf_ct_bridge_frag_save(skb, &data);
+ switch (skb->protocol) {
+ case htons(ETH_P_IP):
+ nf_br_ip_fragment(state->net, state->sk, skb, &data, output);
+ break;
+ case htons(ETH_P_IPV6):
+ nf_br_ip6_fragment(state->net, state->sk, skb, &data, output);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return NF_DROP;
+ }
+
+ return NF_STOLEN;
+}
+
+/* Actually only slow path refragmentation needs this. */
+static int nf_ct_bridge_frag_restore(struct sk_buff *skb,
+ const struct nf_bridge_frag_data *data)
+{
+ int err;
+
+ err = skb_cow_head(skb, ETH_HLEN);
+ if (err) {
+ kfree_skb(skb);
+ return -ENOMEM;
+ }
+ if (data->vlan_present)
+ __vlan_hwaccel_put_tag(skb, data->vlan_proto, data->vlan_tci);
+ else if (skb_vlan_tag_present(skb))
+ __vlan_hwaccel_clear_tag(skb);
+
+ skb_copy_to_linear_data_offset(skb, -ETH_HLEN, data->mac, ETH_HLEN);
+ skb_reset_mac_header(skb);
+
+ return 0;
+}
+
+static int nf_ct_bridge_refrag_post(struct net *net, struct sock *sk,
+ const struct nf_bridge_frag_data *data,
+ struct sk_buff *skb)
+{
+ int err;
+
+ err = nf_ct_bridge_frag_restore(skb, data);
+ if (err < 0)
+ return err;
+
+ return br_dev_queue_push_xmit(net, sk, skb);
+}
+
+static unsigned int nf_ct_bridge_post(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ int ret;
+
+ ret = nf_confirm(priv, skb, state);
+ if (ret != NF_ACCEPT)
+ return ret;
+
+ return nf_ct_bridge_refrag(skb, state, nf_ct_bridge_refrag_post);
+}
+
+static struct nf_hook_ops nf_ct_bridge_hook_ops[] __read_mostly = {
+ {
+ .hook = nf_ct_bridge_pre,
+ .pf = NFPROTO_BRIDGE,
+ .hooknum = NF_BR_PRE_ROUTING,
+ .priority = NF_IP_PRI_CONNTRACK,
+ },
+ {
+ .hook = nf_ct_bridge_in,
+ .pf = NFPROTO_BRIDGE,
+ .hooknum = NF_BR_LOCAL_IN,
+ .priority = NF_IP_PRI_CONNTRACK_CONFIRM,
+ },
+ {
+ .hook = nf_ct_bridge_post,
+ .pf = NFPROTO_BRIDGE,
+ .hooknum = NF_BR_POST_ROUTING,
+ .priority = NF_IP_PRI_CONNTRACK_CONFIRM,
+ },
+};
+
+static struct nf_ct_bridge_info bridge_info = {
+ .ops = nf_ct_bridge_hook_ops,
+ .ops_size = ARRAY_SIZE(nf_ct_bridge_hook_ops),
+ .me = THIS_MODULE,
+};
+
+static int __init nf_conntrack_l3proto_bridge_init(void)
+{
+ nf_ct_bridge_register(&bridge_info);
+
+ return 0;
+}
+
+static void __exit nf_conntrack_l3proto_bridge_fini(void)
+{
+ nf_ct_bridge_unregister(&bridge_info);
+}
+
+module_init(nf_conntrack_l3proto_bridge_init);
+module_exit(nf_conntrack_l3proto_bridge_fini);
+
+MODULE_ALIAS("nf_conntrack-" __stringify(AF_BRIDGE));
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Bridge IPv4 and IPv6 connection tracking");
diff --git a/net/bridge/netfilter/nft_meta_bridge.c b/net/bridge/netfilter/nft_meta_bridge.c
new file mode 100644
index 000000000000..b7af36bbd306
--- /dev/null
+++ b/net/bridge/netfilter/nft_meta_bridge.c
@@ -0,0 +1,253 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nft_meta.h>
+#include <linux/if_bridge.h>
+#include <uapi/linux/netfilter_bridge.h> /* NF_BR_PRE_ROUTING */
+
+#include "../br_private.h"
+
+static const struct net_device *
+nft_meta_get_bridge(const struct net_device *dev)
+{
+ if (dev && netif_is_bridge_port(dev))
+ return netdev_master_upper_dev_get_rcu((struct net_device *)dev);
+
+ return NULL;
+}
+
+static void nft_meta_bridge_get_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_meta *priv = nft_expr_priv(expr);
+ const struct net_device *in = nft_in(pkt), *out = nft_out(pkt);
+ u32 *dest = &regs->data[priv->dreg];
+ const struct net_device *br_dev;
+
+ switch (priv->key) {
+ case NFT_META_BRI_IIFNAME:
+ br_dev = nft_meta_get_bridge(in);
+ break;
+ case NFT_META_BRI_OIFNAME:
+ br_dev = nft_meta_get_bridge(out);
+ break;
+ case NFT_META_BRI_IIFPVID: {
+ u16 p_pvid;
+
+ br_dev = nft_meta_get_bridge(in);
+ if (!br_dev || !br_vlan_enabled(br_dev))
+ goto err;
+
+ br_vlan_get_pvid_rcu(in, &p_pvid);
+ nft_reg_store16(dest, p_pvid);
+ return;
+ }
+ case NFT_META_BRI_IIFVPROTO: {
+ u16 p_proto;
+
+ br_dev = nft_meta_get_bridge(in);
+ if (!br_dev || !br_vlan_enabled(br_dev))
+ goto err;
+
+ br_vlan_get_proto(br_dev, &p_proto);
+ nft_reg_store_be16(dest, htons(p_proto));
+ return;
+ }
+ case NFT_META_BRI_IIFHWADDR:
+ br_dev = nft_meta_get_bridge(in);
+ if (!br_dev)
+ goto err;
+
+ memcpy(dest, br_dev->dev_addr, ETH_ALEN);
+ return;
+ default:
+ return nft_meta_get_eval(expr, regs, pkt);
+ }
+
+ strscpy_pad((char *)dest, br_dev ? br_dev->name : "", IFNAMSIZ);
+ return;
+err:
+ regs->verdict.code = NFT_BREAK;
+}
+
+static int nft_meta_bridge_get_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_meta *priv = nft_expr_priv(expr);
+ unsigned int len;
+
+ priv->key = ntohl(nla_get_be32(tb[NFTA_META_KEY]));
+ switch (priv->key) {
+ case NFT_META_BRI_IIFNAME:
+ case NFT_META_BRI_OIFNAME:
+ len = IFNAMSIZ;
+ break;
+ case NFT_META_BRI_IIFPVID:
+ case NFT_META_BRI_IIFVPROTO:
+ len = sizeof(u16);
+ break;
+ case NFT_META_BRI_IIFHWADDR:
+ len = ETH_ALEN;
+ break;
+ default:
+ return nft_meta_get_init(ctx, expr, tb);
+ }
+
+ priv->len = len;
+ return nft_parse_register_store(ctx, tb[NFTA_META_DREG], &priv->dreg,
+ NULL, NFT_DATA_VALUE, len);
+}
+
+static struct nft_expr_type nft_meta_bridge_type;
+static const struct nft_expr_ops nft_meta_bridge_get_ops = {
+ .type = &nft_meta_bridge_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)),
+ .eval = nft_meta_bridge_get_eval,
+ .init = nft_meta_bridge_get_init,
+ .dump = nft_meta_get_dump,
+ .reduce = nft_meta_get_reduce,
+};
+
+static void nft_meta_bridge_set_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_meta *meta = nft_expr_priv(expr);
+ u32 *sreg = &regs->data[meta->sreg];
+ struct sk_buff *skb = pkt->skb;
+ u8 value8;
+
+ switch (meta->key) {
+ case NFT_META_BRI_BROUTE:
+ value8 = nft_reg_load8(sreg);
+ BR_INPUT_SKB_CB(skb)->br_netfilter_broute = !!value8;
+ break;
+ default:
+ nft_meta_set_eval(expr, regs, pkt);
+ }
+}
+
+static int nft_meta_bridge_set_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_meta *priv = nft_expr_priv(expr);
+ unsigned int len;
+ int err;
+
+ priv->key = ntohl(nla_get_be32(tb[NFTA_META_KEY]));
+ switch (priv->key) {
+ case NFT_META_BRI_BROUTE:
+ len = sizeof(u8);
+ break;
+ default:
+ return nft_meta_set_init(ctx, expr, tb);
+ }
+
+ priv->len = len;
+ err = nft_parse_register_load(ctx, tb[NFTA_META_SREG], &priv->sreg, len);
+ if (err < 0)
+ return err;
+
+ return 0;
+}
+
+static bool nft_meta_bridge_set_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ int i;
+
+ for (i = 0; i < NFT_REG32_NUM; i++) {
+ if (!track->regs[i].selector)
+ continue;
+
+ if (track->regs[i].selector->ops != &nft_meta_bridge_get_ops)
+ continue;
+
+ __nft_reg_track_cancel(track, i);
+ }
+
+ return false;
+}
+
+static int nft_meta_bridge_set_validate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ struct nft_meta *priv = nft_expr_priv(expr);
+ unsigned int hooks;
+
+ switch (priv->key) {
+ case NFT_META_BRI_BROUTE:
+ case NFT_META_BRI_IIFHWADDR:
+ hooks = 1 << NF_BR_PRE_ROUTING;
+ break;
+ default:
+ return nft_meta_set_validate(ctx, expr);
+ }
+
+ return nft_chain_validate_hooks(ctx->chain, hooks);
+}
+
+static const struct nft_expr_ops nft_meta_bridge_set_ops = {
+ .type = &nft_meta_bridge_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)),
+ .eval = nft_meta_bridge_set_eval,
+ .init = nft_meta_bridge_set_init,
+ .destroy = nft_meta_set_destroy,
+ .dump = nft_meta_set_dump,
+ .reduce = nft_meta_bridge_set_reduce,
+ .validate = nft_meta_bridge_set_validate,
+};
+
+static const struct nft_expr_ops *
+nft_meta_bridge_select_ops(const struct nft_ctx *ctx,
+ const struct nlattr * const tb[])
+{
+ if (tb[NFTA_META_KEY] == NULL)
+ return ERR_PTR(-EINVAL);
+
+ if (tb[NFTA_META_DREG] && tb[NFTA_META_SREG])
+ return ERR_PTR(-EINVAL);
+
+ if (tb[NFTA_META_DREG])
+ return &nft_meta_bridge_get_ops;
+
+ if (tb[NFTA_META_SREG])
+ return &nft_meta_bridge_set_ops;
+
+ return ERR_PTR(-EINVAL);
+}
+
+static struct nft_expr_type nft_meta_bridge_type __read_mostly = {
+ .family = NFPROTO_BRIDGE,
+ .name = "meta",
+ .select_ops = nft_meta_bridge_select_ops,
+ .policy = nft_meta_policy,
+ .maxattr = NFTA_META_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_meta_bridge_module_init(void)
+{
+ return nft_register_expr(&nft_meta_bridge_type);
+}
+
+static void __exit nft_meta_bridge_module_exit(void)
+{
+ nft_unregister_expr(&nft_meta_bridge_type);
+}
+
+module_init(nft_meta_bridge_module_init);
+module_exit(nft_meta_bridge_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("wenxu <wenxu@ucloud.cn>");
+MODULE_ALIAS_NFT_AF_EXPR(AF_BRIDGE, "meta");
+MODULE_DESCRIPTION("Support for bridge dedicated meta key");
diff --git a/net/bridge/netfilter/nft_reject_bridge.c b/net/bridge/netfilter/nft_reject_bridge.c
new file mode 100644
index 000000000000..1cb5c16e97b7
--- /dev/null
+++ b/net/bridge/netfilter/nft_reject_bridge.c
@@ -0,0 +1,215 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2014 Pablo Neira Ayuso <pablo@netfilter.org>
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nft_reject.h>
+#include <net/netfilter/ipv4/nf_reject.h>
+#include <net/netfilter/ipv6/nf_reject.h>
+#include <linux/ip.h>
+#include <net/ip.h>
+#include <net/ip6_checksum.h>
+#include <linux/netfilter_bridge.h>
+#include <linux/netfilter_ipv6.h>
+#include "../br_private.h"
+
+static void nft_reject_br_push_etherhdr(struct sk_buff *oldskb,
+ struct sk_buff *nskb)
+{
+ struct ethhdr *eth;
+
+ eth = skb_push(nskb, ETH_HLEN);
+ skb_reset_mac_header(nskb);
+ ether_addr_copy(eth->h_source, eth_hdr(oldskb)->h_dest);
+ ether_addr_copy(eth->h_dest, eth_hdr(oldskb)->h_source);
+ eth->h_proto = eth_hdr(oldskb)->h_proto;
+ skb_pull(nskb, ETH_HLEN);
+
+ if (skb_vlan_tag_present(oldskb)) {
+ u16 vid = skb_vlan_tag_get(oldskb);
+
+ __vlan_hwaccel_put_tag(nskb, oldskb->vlan_proto, vid);
+ }
+}
+
+/* We cannot use oldskb->dev, it can be either bridge device (NF_BRIDGE INPUT)
+ * or the bridge port (NF_BRIDGE PREROUTING).
+ */
+static void nft_reject_br_send_v4_tcp_reset(struct net *net,
+ struct sk_buff *oldskb,
+ const struct net_device *dev,
+ int hook)
+{
+ struct sk_buff *nskb;
+
+ nskb = nf_reject_skb_v4_tcp_reset(net, oldskb, NULL, hook);
+ if (!nskb)
+ return;
+
+ nft_reject_br_push_etherhdr(oldskb, nskb);
+
+ br_forward(br_port_get_rcu(dev), nskb, false, true);
+}
+
+static void nft_reject_br_send_v4_unreach(struct net *net,
+ struct sk_buff *oldskb,
+ const struct net_device *dev,
+ int hook, u8 code)
+{
+ struct sk_buff *nskb;
+
+ nskb = nf_reject_skb_v4_unreach(net, oldskb, NULL, hook, code);
+ if (!nskb)
+ return;
+
+ nft_reject_br_push_etherhdr(oldskb, nskb);
+
+ br_forward(br_port_get_rcu(dev), nskb, false, true);
+}
+
+static void nft_reject_br_send_v6_tcp_reset(struct net *net,
+ struct sk_buff *oldskb,
+ const struct net_device *dev,
+ int hook)
+{
+ struct sk_buff *nskb;
+
+ nskb = nf_reject_skb_v6_tcp_reset(net, oldskb, NULL, hook);
+ if (!nskb)
+ return;
+
+ nft_reject_br_push_etherhdr(oldskb, nskb);
+
+ br_forward(br_port_get_rcu(dev), nskb, false, true);
+}
+
+
+static void nft_reject_br_send_v6_unreach(struct net *net,
+ struct sk_buff *oldskb,
+ const struct net_device *dev,
+ int hook, u8 code)
+{
+ struct sk_buff *nskb;
+
+ nskb = nf_reject_skb_v6_unreach(net, oldskb, NULL, hook, code);
+ if (!nskb)
+ return;
+
+ nft_reject_br_push_etherhdr(oldskb, nskb);
+
+ br_forward(br_port_get_rcu(dev), nskb, false, true);
+}
+
+static void nft_reject_bridge_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_reject *priv = nft_expr_priv(expr);
+ const unsigned char *dest = eth_hdr(pkt->skb)->h_dest;
+
+ if (is_broadcast_ether_addr(dest) ||
+ is_multicast_ether_addr(dest))
+ goto out;
+
+ switch (eth_hdr(pkt->skb)->h_proto) {
+ case htons(ETH_P_IP):
+ switch (priv->type) {
+ case NFT_REJECT_ICMP_UNREACH:
+ nft_reject_br_send_v4_unreach(nft_net(pkt), pkt->skb,
+ nft_in(pkt),
+ nft_hook(pkt),
+ priv->icmp_code);
+ break;
+ case NFT_REJECT_TCP_RST:
+ nft_reject_br_send_v4_tcp_reset(nft_net(pkt), pkt->skb,
+ nft_in(pkt),
+ nft_hook(pkt));
+ break;
+ case NFT_REJECT_ICMPX_UNREACH:
+ nft_reject_br_send_v4_unreach(nft_net(pkt), pkt->skb,
+ nft_in(pkt),
+ nft_hook(pkt),
+ nft_reject_icmp_code(priv->icmp_code));
+ break;
+ }
+ break;
+ case htons(ETH_P_IPV6):
+ switch (priv->type) {
+ case NFT_REJECT_ICMP_UNREACH:
+ nft_reject_br_send_v6_unreach(nft_net(pkt), pkt->skb,
+ nft_in(pkt),
+ nft_hook(pkt),
+ priv->icmp_code);
+ break;
+ case NFT_REJECT_TCP_RST:
+ nft_reject_br_send_v6_tcp_reset(nft_net(pkt), pkt->skb,
+ nft_in(pkt),
+ nft_hook(pkt));
+ break;
+ case NFT_REJECT_ICMPX_UNREACH:
+ nft_reject_br_send_v6_unreach(nft_net(pkt), pkt->skb,
+ nft_in(pkt),
+ nft_hook(pkt),
+ nft_reject_icmpv6_code(priv->icmp_code));
+ break;
+ }
+ break;
+ default:
+ /* No explicit way to reject this protocol, drop it. */
+ break;
+ }
+out:
+ regs->verdict.code = NF_DROP;
+}
+
+static int nft_reject_bridge_validate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ return nft_chain_validate_hooks(ctx->chain, (1 << NF_BR_PRE_ROUTING) |
+ (1 << NF_BR_LOCAL_IN));
+}
+
+static struct nft_expr_type nft_reject_bridge_type;
+static const struct nft_expr_ops nft_reject_bridge_ops = {
+ .type = &nft_reject_bridge_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_reject)),
+ .eval = nft_reject_bridge_eval,
+ .init = nft_reject_init,
+ .dump = nft_reject_dump,
+ .validate = nft_reject_bridge_validate,
+ .reduce = NFT_REDUCE_READONLY,
+};
+
+static struct nft_expr_type nft_reject_bridge_type __read_mostly = {
+ .family = NFPROTO_BRIDGE,
+ .name = "reject",
+ .ops = &nft_reject_bridge_ops,
+ .policy = nft_reject_policy,
+ .maxattr = NFTA_REJECT_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_reject_bridge_module_init(void)
+{
+ return nft_register_expr(&nft_reject_bridge_type);
+}
+
+static void __exit nft_reject_bridge_module_exit(void)
+{
+ nft_unregister_expr(&nft_reject_bridge_type);
+}
+
+module_init(nft_reject_bridge_module_init);
+module_exit(nft_reject_bridge_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_ALIAS_NFT_AF_EXPR(AF_BRIDGE, "reject");
+MODULE_DESCRIPTION("Reject packets from bridge via nftables");
diff --git a/net/caif/Kconfig b/net/caif/Kconfig
new file mode 100644
index 000000000000..87205251cc25
--- /dev/null
+++ b/net/caif/Kconfig
@@ -0,0 +1,54 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# CAIF net configurations
+#
+
+menuconfig CAIF
+ tristate "CAIF support"
+ select CRC_CCITT
+ default n
+ help
+ The "Communication CPU to Application CPU Interface" (CAIF) is a packet
+ based connection-oriented MUX protocol developed by ST-Ericsson for use
+ with its modems. It is accessed from user space as sockets (PF_CAIF).
+
+ Say Y (or M) here if you build for a phone product (e.g. Android or
+ MeeGo) that uses CAIF as transport. If unsure say N.
+
+ If you select to build it as module then CAIF_NETDEV also needs to be
+ built as a module. You will also need to say Y (or M) to any CAIF
+ physical devices that your platform requires.
+
+ See Documentation/networking/caif for a further explanation on how to
+ use and configure CAIF.
+
+config CAIF_DEBUG
+ bool "Enable Debug"
+ depends on CAIF
+ default n
+ help
+ Enable the inclusion of debug code in the CAIF stack.
+ Be aware that doing this will impact performance.
+ If unsure say N.
+
+config CAIF_NETDEV
+ tristate "CAIF GPRS Network device"
+ depends on CAIF
+ default CAIF
+ help
+ Say Y if you will be using a CAIF based GPRS network device.
+ This can be either built-in or a loadable module.
+ If you select to build it as a built-in then the main CAIF device must
+ also be a built-in.
+ If unsure say Y.
+
+config CAIF_USB
+ tristate "CAIF USB support"
+ depends on CAIF
+ default n
+ help
+ Say Y if you are using CAIF over USB CDC NCM.
+ This can be either built-in or a loadable module.
+ If you select to build it as a built-in then the main CAIF device must
+ also be a built-in.
+ If unsure say N.
diff --git a/net/caif/Makefile b/net/caif/Makefile
new file mode 100644
index 000000000000..4f6c0517cdfb
--- /dev/null
+++ b/net/caif/Makefile
@@ -0,0 +1,16 @@
+# SPDX-License-Identifier: GPL-2.0
+ccflags-$(CONFIG_CAIF_DEBUG) := -DDEBUG
+
+caif-y := caif_dev.o \
+ cfcnfg.o cfmuxl.o cfctrl.o \
+ cffrml.o cfveil.o cfdbgl.o\
+ cfserl.o cfdgml.o \
+ cfrfml.o cfvidl.o cfutill.o \
+ cfsrvl.o cfpkt_skbuff.o
+
+obj-$(CONFIG_CAIF) += caif.o
+obj-$(CONFIG_CAIF_NETDEV) += chnl_net.o
+obj-$(CONFIG_CAIF) += caif_socket.o
+obj-$(CONFIG_CAIF_USB) += caif_usb.o
+
+export-y := caif.o
diff --git a/net/caif/caif_dev.c b/net/caif/caif_dev.c
new file mode 100644
index 000000000000..24e85c5487ef
--- /dev/null
+++ b/net/caif/caif_dev.c
@@ -0,0 +1,586 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * CAIF Interface registration.
+ * Copyright (C) ST-Ericsson AB 2010
+ * Author: Sjur Brendeland
+ *
+ * Borrowed heavily from file: pn_dev.c. Thanks to Remi Denis-Courmont
+ * and Sakari Ailus <sakari.ailus@nokia.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
+#include <linux/kernel.h>
+#include <linux/if_arp.h>
+#include <linux/net.h>
+#include <linux/netdevice.h>
+#include <linux/mutex.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <net/netns/generic.h>
+#include <net/net_namespace.h>
+#include <net/pkt_sched.h>
+#include <net/caif/caif_device.h>
+#include <net/caif/caif_layer.h>
+#include <net/caif/caif_dev.h>
+#include <net/caif/cfpkt.h>
+#include <net/caif/cfcnfg.h>
+#include <net/caif/cfserl.h>
+
+MODULE_DESCRIPTION("ST-Ericsson CAIF modem protocol support");
+MODULE_LICENSE("GPL");
+
+/* Used for local tracking of the CAIF net devices */
+struct caif_device_entry {
+ struct cflayer layer;
+ struct list_head list;
+ struct net_device *netdev;
+ int __percpu *pcpu_refcnt;
+ spinlock_t flow_lock;
+ struct sk_buff *xoff_skb;
+ void (*xoff_skb_dtor)(struct sk_buff *skb);
+ bool xoff;
+};
+
+struct caif_device_entry_list {
+ struct list_head list;
+ /* Protects simulanous deletes in list */
+ struct mutex lock;
+};
+
+struct caif_net {
+ struct cfcnfg *cfg;
+ struct caif_device_entry_list caifdevs;
+};
+
+static unsigned int caif_net_id;
+static int q_high = 50; /* Percent */
+
+struct cfcnfg *get_cfcnfg(struct net *net)
+{
+ struct caif_net *caifn;
+ caifn = net_generic(net, caif_net_id);
+ return caifn->cfg;
+}
+EXPORT_SYMBOL(get_cfcnfg);
+
+static struct caif_device_entry_list *caif_device_list(struct net *net)
+{
+ struct caif_net *caifn;
+ caifn = net_generic(net, caif_net_id);
+ return &caifn->caifdevs;
+}
+
+static void caifd_put(struct caif_device_entry *e)
+{
+ this_cpu_dec(*e->pcpu_refcnt);
+}
+
+static void caifd_hold(struct caif_device_entry *e)
+{
+ this_cpu_inc(*e->pcpu_refcnt);
+}
+
+static int caifd_refcnt_read(struct caif_device_entry *e)
+{
+ int i, refcnt = 0;
+ for_each_possible_cpu(i)
+ refcnt += *per_cpu_ptr(e->pcpu_refcnt, i);
+ return refcnt;
+}
+
+/* Allocate new CAIF device. */
+static struct caif_device_entry *caif_device_alloc(struct net_device *dev)
+{
+ struct caif_device_entry *caifd;
+
+ caifd = kzalloc(sizeof(*caifd), GFP_KERNEL);
+ if (!caifd)
+ return NULL;
+ caifd->pcpu_refcnt = alloc_percpu(int);
+ if (!caifd->pcpu_refcnt) {
+ kfree(caifd);
+ return NULL;
+ }
+ caifd->netdev = dev;
+ dev_hold(dev);
+ return caifd;
+}
+
+static struct caif_device_entry *caif_get(struct net_device *dev)
+{
+ struct caif_device_entry_list *caifdevs =
+ caif_device_list(dev_net(dev));
+ struct caif_device_entry *caifd;
+
+ list_for_each_entry_rcu(caifd, &caifdevs->list, list,
+ lockdep_rtnl_is_held()) {
+ if (caifd->netdev == dev)
+ return caifd;
+ }
+ return NULL;
+}
+
+static void caif_flow_cb(struct sk_buff *skb)
+{
+ struct caif_device_entry *caifd;
+ void (*dtor)(struct sk_buff *skb) = NULL;
+ bool send_xoff;
+
+ WARN_ON(skb->dev == NULL);
+
+ rcu_read_lock();
+ caifd = caif_get(skb->dev);
+
+ WARN_ON(caifd == NULL);
+ if (!caifd) {
+ rcu_read_unlock();
+ return;
+ }
+
+ caifd_hold(caifd);
+ rcu_read_unlock();
+
+ spin_lock_bh(&caifd->flow_lock);
+ send_xoff = caifd->xoff;
+ caifd->xoff = false;
+ dtor = caifd->xoff_skb_dtor;
+
+ if (WARN_ON(caifd->xoff_skb != skb))
+ skb = NULL;
+
+ caifd->xoff_skb = NULL;
+ caifd->xoff_skb_dtor = NULL;
+
+ spin_unlock_bh(&caifd->flow_lock);
+
+ if (dtor && skb)
+ dtor(skb);
+
+ if (send_xoff)
+ caifd->layer.up->
+ ctrlcmd(caifd->layer.up,
+ _CAIF_CTRLCMD_PHYIF_FLOW_ON_IND,
+ caifd->layer.id);
+ caifd_put(caifd);
+}
+
+static int transmit(struct cflayer *layer, struct cfpkt *pkt)
+{
+ int err, high = 0, qlen = 0;
+ struct caif_device_entry *caifd =
+ container_of(layer, struct caif_device_entry, layer);
+ struct sk_buff *skb;
+ struct netdev_queue *txq;
+
+ rcu_read_lock_bh();
+
+ skb = cfpkt_tonative(pkt);
+ skb->dev = caifd->netdev;
+ skb_reset_network_header(skb);
+ skb->protocol = htons(ETH_P_CAIF);
+
+ /* Check if we need to handle xoff */
+ if (likely(caifd->netdev->priv_flags & IFF_NO_QUEUE))
+ goto noxoff;
+
+ if (unlikely(caifd->xoff))
+ goto noxoff;
+
+ if (likely(!netif_queue_stopped(caifd->netdev))) {
+ struct Qdisc *sch;
+
+ /* If we run with a TX queue, check if the queue is too long*/
+ txq = netdev_get_tx_queue(skb->dev, 0);
+ sch = rcu_dereference_bh(txq->qdisc);
+ if (likely(qdisc_is_empty(sch)))
+ goto noxoff;
+
+ /* can check for explicit qdisc len value only !NOLOCK,
+ * always set flow off otherwise
+ */
+ high = (caifd->netdev->tx_queue_len * q_high) / 100;
+ if (!(sch->flags & TCQ_F_NOLOCK) && likely(sch->q.qlen < high))
+ goto noxoff;
+ }
+
+ /* Hold lock while accessing xoff */
+ spin_lock_bh(&caifd->flow_lock);
+ if (caifd->xoff) {
+ spin_unlock_bh(&caifd->flow_lock);
+ goto noxoff;
+ }
+
+ /*
+ * Handle flow off, we do this by temporary hi-jacking this
+ * skb's destructor function, and replace it with our own
+ * flow-on callback. The callback will set flow-on and call
+ * the original destructor.
+ */
+
+ pr_debug("queue has stopped(%d) or is full (%d > %d)\n",
+ netif_queue_stopped(caifd->netdev),
+ qlen, high);
+ caifd->xoff = true;
+ caifd->xoff_skb = skb;
+ caifd->xoff_skb_dtor = skb->destructor;
+ skb->destructor = caif_flow_cb;
+ spin_unlock_bh(&caifd->flow_lock);
+
+ caifd->layer.up->ctrlcmd(caifd->layer.up,
+ _CAIF_CTRLCMD_PHYIF_FLOW_OFF_IND,
+ caifd->layer.id);
+noxoff:
+ rcu_read_unlock_bh();
+
+ err = dev_queue_xmit(skb);
+ if (err > 0)
+ err = -EIO;
+
+ return err;
+}
+
+/*
+ * Stuff received packets into the CAIF stack.
+ * On error, returns non-zero and releases the skb.
+ */
+static int receive(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *pkttype, struct net_device *orig_dev)
+{
+ struct cfpkt *pkt;
+ struct caif_device_entry *caifd;
+ int err;
+
+ pkt = cfpkt_fromnative(CAIF_DIR_IN, skb);
+
+ rcu_read_lock();
+ caifd = caif_get(dev);
+
+ if (!caifd || !caifd->layer.up || !caifd->layer.up->receive ||
+ !netif_oper_up(caifd->netdev)) {
+ rcu_read_unlock();
+ kfree_skb(skb);
+ return NET_RX_DROP;
+ }
+
+ /* Hold reference to netdevice while using CAIF stack */
+ caifd_hold(caifd);
+ rcu_read_unlock();
+
+ err = caifd->layer.up->receive(caifd->layer.up, pkt);
+
+ /* For -EILSEQ the packet is not freed so free it now */
+ if (err == -EILSEQ)
+ cfpkt_destroy(pkt);
+
+ /* Release reference to stack upwards */
+ caifd_put(caifd);
+
+ if (err != 0)
+ err = NET_RX_DROP;
+ return err;
+}
+
+static struct packet_type caif_packet_type __read_mostly = {
+ .type = cpu_to_be16(ETH_P_CAIF),
+ .func = receive,
+};
+
+static void dev_flowctrl(struct net_device *dev, int on)
+{
+ struct caif_device_entry *caifd;
+
+ rcu_read_lock();
+
+ caifd = caif_get(dev);
+ if (!caifd || !caifd->layer.up || !caifd->layer.up->ctrlcmd) {
+ rcu_read_unlock();
+ return;
+ }
+
+ caifd_hold(caifd);
+ rcu_read_unlock();
+
+ caifd->layer.up->ctrlcmd(caifd->layer.up,
+ on ?
+ _CAIF_CTRLCMD_PHYIF_FLOW_ON_IND :
+ _CAIF_CTRLCMD_PHYIF_FLOW_OFF_IND,
+ caifd->layer.id);
+ caifd_put(caifd);
+}
+
+int caif_enroll_dev(struct net_device *dev, struct caif_dev_common *caifdev,
+ struct cflayer *link_support, int head_room,
+ struct cflayer **layer,
+ int (**rcv_func)(struct sk_buff *, struct net_device *,
+ struct packet_type *,
+ struct net_device *))
+{
+ struct caif_device_entry *caifd;
+ enum cfcnfg_phy_preference pref;
+ struct cfcnfg *cfg = get_cfcnfg(dev_net(dev));
+ struct caif_device_entry_list *caifdevs;
+ int res;
+
+ caifdevs = caif_device_list(dev_net(dev));
+ caifd = caif_device_alloc(dev);
+ if (!caifd)
+ return -ENOMEM;
+ *layer = &caifd->layer;
+ spin_lock_init(&caifd->flow_lock);
+
+ switch (caifdev->link_select) {
+ case CAIF_LINK_HIGH_BANDW:
+ pref = CFPHYPREF_HIGH_BW;
+ break;
+ case CAIF_LINK_LOW_LATENCY:
+ pref = CFPHYPREF_LOW_LAT;
+ break;
+ default:
+ pref = CFPHYPREF_HIGH_BW;
+ break;
+ }
+ mutex_lock(&caifdevs->lock);
+ list_add_rcu(&caifd->list, &caifdevs->list);
+
+ strscpy(caifd->layer.name, dev->name,
+ sizeof(caifd->layer.name));
+ caifd->layer.transmit = transmit;
+ res = cfcnfg_add_phy_layer(cfg,
+ dev,
+ &caifd->layer,
+ pref,
+ link_support,
+ caifdev->use_fcs,
+ head_room);
+ mutex_unlock(&caifdevs->lock);
+ if (rcv_func)
+ *rcv_func = receive;
+ return res;
+}
+EXPORT_SYMBOL(caif_enroll_dev);
+
+/* notify Caif of device events */
+static int caif_device_notify(struct notifier_block *me, unsigned long what,
+ void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct caif_device_entry *caifd = NULL;
+ struct caif_dev_common *caifdev;
+ struct cfcnfg *cfg;
+ struct cflayer *layer, *link_support;
+ int head_room = 0;
+ struct caif_device_entry_list *caifdevs;
+ int res;
+
+ cfg = get_cfcnfg(dev_net(dev));
+ caifdevs = caif_device_list(dev_net(dev));
+
+ caifd = caif_get(dev);
+ if (caifd == NULL && dev->type != ARPHRD_CAIF)
+ return 0;
+
+ switch (what) {
+ case NETDEV_REGISTER:
+ if (caifd != NULL)
+ break;
+
+ caifdev = netdev_priv(dev);
+
+ link_support = NULL;
+ if (caifdev->use_frag) {
+ head_room = 1;
+ link_support = cfserl_create(dev->ifindex,
+ caifdev->use_stx);
+ if (!link_support) {
+ pr_warn("Out of memory\n");
+ break;
+ }
+ }
+ res = caif_enroll_dev(dev, caifdev, link_support, head_room,
+ &layer, NULL);
+ if (res)
+ cfserl_release(link_support);
+ caifdev->flowctrl = dev_flowctrl;
+ break;
+
+ case NETDEV_UP:
+ rcu_read_lock();
+
+ caifd = caif_get(dev);
+ if (caifd == NULL) {
+ rcu_read_unlock();
+ break;
+ }
+
+ caifd->xoff = false;
+ cfcnfg_set_phy_state(cfg, &caifd->layer, true);
+ rcu_read_unlock();
+
+ break;
+
+ case NETDEV_DOWN:
+ rcu_read_lock();
+
+ caifd = caif_get(dev);
+ if (!caifd || !caifd->layer.up || !caifd->layer.up->ctrlcmd) {
+ rcu_read_unlock();
+ return -EINVAL;
+ }
+
+ cfcnfg_set_phy_state(cfg, &caifd->layer, false);
+ caifd_hold(caifd);
+ rcu_read_unlock();
+
+ caifd->layer.up->ctrlcmd(caifd->layer.up,
+ _CAIF_CTRLCMD_PHYIF_DOWN_IND,
+ caifd->layer.id);
+
+ spin_lock_bh(&caifd->flow_lock);
+
+ /*
+ * Replace our xoff-destructor with original destructor.
+ * We trust that skb->destructor *always* is called before
+ * the skb reference is invalid. The hijacked SKB destructor
+ * takes the flow_lock so manipulating the skb->destructor here
+ * should be safe.
+ */
+ if (caifd->xoff_skb_dtor != NULL && caifd->xoff_skb != NULL)
+ caifd->xoff_skb->destructor = caifd->xoff_skb_dtor;
+
+ caifd->xoff = false;
+ caifd->xoff_skb_dtor = NULL;
+ caifd->xoff_skb = NULL;
+
+ spin_unlock_bh(&caifd->flow_lock);
+ caifd_put(caifd);
+ break;
+
+ case NETDEV_UNREGISTER:
+ mutex_lock(&caifdevs->lock);
+
+ caifd = caif_get(dev);
+ if (caifd == NULL) {
+ mutex_unlock(&caifdevs->lock);
+ break;
+ }
+ list_del_rcu(&caifd->list);
+
+ /*
+ * NETDEV_UNREGISTER is called repeatedly until all reference
+ * counts for the net-device are released. If references to
+ * caifd is taken, simply ignore NETDEV_UNREGISTER and wait for
+ * the next call to NETDEV_UNREGISTER.
+ *
+ * If any packets are in flight down the CAIF Stack,
+ * cfcnfg_del_phy_layer will return nonzero.
+ * If no packets are in flight, the CAIF Stack associated
+ * with the net-device un-registering is freed.
+ */
+
+ if (caifd_refcnt_read(caifd) != 0 ||
+ cfcnfg_del_phy_layer(cfg, &caifd->layer) != 0) {
+
+ pr_info("Wait for device inuse\n");
+ /* Enrole device if CAIF Stack is still in use */
+ list_add_rcu(&caifd->list, &caifdevs->list);
+ mutex_unlock(&caifdevs->lock);
+ break;
+ }
+
+ synchronize_rcu();
+ dev_put(caifd->netdev);
+ free_percpu(caifd->pcpu_refcnt);
+ kfree(caifd);
+
+ mutex_unlock(&caifdevs->lock);
+ break;
+ }
+ return 0;
+}
+
+static struct notifier_block caif_device_notifier = {
+ .notifier_call = caif_device_notify,
+ .priority = 0,
+};
+
+/* Per-namespace Caif devices handling */
+static int caif_init_net(struct net *net)
+{
+ struct caif_net *caifn = net_generic(net, caif_net_id);
+ INIT_LIST_HEAD(&caifn->caifdevs.list);
+ mutex_init(&caifn->caifdevs.lock);
+
+ caifn->cfg = cfcnfg_create();
+ if (!caifn->cfg)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static void caif_exit_net(struct net *net)
+{
+ struct caif_device_entry *caifd, *tmp;
+ struct caif_device_entry_list *caifdevs =
+ caif_device_list(net);
+ struct cfcnfg *cfg = get_cfcnfg(net);
+
+ rtnl_lock();
+ mutex_lock(&caifdevs->lock);
+
+ list_for_each_entry_safe(caifd, tmp, &caifdevs->list, list) {
+ int i = 0;
+ list_del_rcu(&caifd->list);
+ cfcnfg_set_phy_state(cfg, &caifd->layer, false);
+
+ while (i < 10 &&
+ (caifd_refcnt_read(caifd) != 0 ||
+ cfcnfg_del_phy_layer(cfg, &caifd->layer) != 0)) {
+
+ pr_info("Wait for device inuse\n");
+ msleep(250);
+ i++;
+ }
+ synchronize_rcu();
+ dev_put(caifd->netdev);
+ free_percpu(caifd->pcpu_refcnt);
+ kfree(caifd);
+ }
+ cfcnfg_remove(cfg);
+
+ mutex_unlock(&caifdevs->lock);
+ rtnl_unlock();
+}
+
+static struct pernet_operations caif_net_ops = {
+ .init = caif_init_net,
+ .exit = caif_exit_net,
+ .id = &caif_net_id,
+ .size = sizeof(struct caif_net),
+};
+
+/* Initialize Caif devices list */
+static int __init caif_device_init(void)
+{
+ int result;
+
+ result = register_pernet_subsys(&caif_net_ops);
+
+ if (result)
+ return result;
+
+ register_netdevice_notifier(&caif_device_notifier);
+ dev_add_pack(&caif_packet_type);
+
+ return result;
+}
+
+static void __exit caif_device_exit(void)
+{
+ unregister_netdevice_notifier(&caif_device_notifier);
+ dev_remove_pack(&caif_packet_type);
+ unregister_pernet_subsys(&caif_net_ops);
+}
+
+module_init(caif_device_init);
+module_exit(caif_device_exit);
diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c
new file mode 100644
index 000000000000..af218742af5a
--- /dev/null
+++ b/net/caif/caif_socket.c
@@ -0,0 +1,1114 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) ST-Ericsson AB 2010
+ * Author: Sjur Brendeland
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
+#include <linux/filter.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/sched/signal.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/wait.h>
+#include <linux/poll.h>
+#include <linux/tcp.h>
+#include <linux/uaccess.h>
+#include <linux/debugfs.h>
+#include <linux/caif/caif_socket.h>
+#include <linux/pkt_sched.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <net/caif/caif_layer.h>
+#include <net/caif/caif_dev.h>
+#include <net/caif/cfpkt.h>
+
+MODULE_DESCRIPTION("ST-Ericsson CAIF modem protocol socket support (AF_CAIF)");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NETPROTO(AF_CAIF);
+
+/*
+ * CAIF state is re-using the TCP socket states.
+ * caif_states stored in sk_state reflect the state as reported by
+ * the CAIF stack, while sk_socket->state is the state of the socket.
+ */
+enum caif_states {
+ CAIF_CONNECTED = TCP_ESTABLISHED,
+ CAIF_CONNECTING = TCP_SYN_SENT,
+ CAIF_DISCONNECTED = TCP_CLOSE
+};
+
+#define TX_FLOW_ON_BIT 1
+#define RX_FLOW_ON_BIT 2
+
+struct caifsock {
+ struct sock sk; /* must be first member */
+ struct cflayer layer;
+ unsigned long flow_state;
+ struct caif_connect_request conn_req;
+ struct mutex readlock;
+ struct dentry *debugfs_socket_dir;
+ int headroom, tailroom, maxframe;
+};
+
+static int rx_flow_is_on(struct caifsock *cf_sk)
+{
+ return test_bit(RX_FLOW_ON_BIT, &cf_sk->flow_state);
+}
+
+static int tx_flow_is_on(struct caifsock *cf_sk)
+{
+ return test_bit(TX_FLOW_ON_BIT, &cf_sk->flow_state);
+}
+
+static void set_rx_flow_off(struct caifsock *cf_sk)
+{
+ clear_bit(RX_FLOW_ON_BIT, &cf_sk->flow_state);
+}
+
+static void set_rx_flow_on(struct caifsock *cf_sk)
+{
+ set_bit(RX_FLOW_ON_BIT, &cf_sk->flow_state);
+}
+
+static void set_tx_flow_off(struct caifsock *cf_sk)
+{
+ clear_bit(TX_FLOW_ON_BIT, &cf_sk->flow_state);
+}
+
+static void set_tx_flow_on(struct caifsock *cf_sk)
+{
+ set_bit(TX_FLOW_ON_BIT, &cf_sk->flow_state);
+}
+
+static void caif_read_lock(struct sock *sk)
+{
+ struct caifsock *cf_sk;
+ cf_sk = container_of(sk, struct caifsock, sk);
+ mutex_lock(&cf_sk->readlock);
+}
+
+static void caif_read_unlock(struct sock *sk)
+{
+ struct caifsock *cf_sk;
+ cf_sk = container_of(sk, struct caifsock, sk);
+ mutex_unlock(&cf_sk->readlock);
+}
+
+static int sk_rcvbuf_lowwater(struct caifsock *cf_sk)
+{
+ /* A quarter of full buffer is used a low water mark */
+ return cf_sk->sk.sk_rcvbuf / 4;
+}
+
+static void caif_flow_ctrl(struct sock *sk, int mode)
+{
+ struct caifsock *cf_sk;
+ cf_sk = container_of(sk, struct caifsock, sk);
+ if (cf_sk->layer.dn && cf_sk->layer.dn->modemcmd)
+ cf_sk->layer.dn->modemcmd(cf_sk->layer.dn, mode);
+}
+
+/*
+ * Copied from sock.c:sock_queue_rcv_skb(), but changed so packets are
+ * not dropped, but CAIF is sending flow off instead.
+ */
+static void caif_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+ int err;
+ unsigned long flags;
+ struct sk_buff_head *list = &sk->sk_receive_queue;
+ struct caifsock *cf_sk = container_of(sk, struct caifsock, sk);
+ bool queued = false;
+
+ if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
+ (unsigned int)sk->sk_rcvbuf && rx_flow_is_on(cf_sk)) {
+ net_dbg_ratelimited("sending flow OFF (queue len = %d %d)\n",
+ atomic_read(&cf_sk->sk.sk_rmem_alloc),
+ sk_rcvbuf_lowwater(cf_sk));
+ set_rx_flow_off(cf_sk);
+ caif_flow_ctrl(sk, CAIF_MODEMCMD_FLOW_OFF_REQ);
+ }
+
+ err = sk_filter(sk, skb);
+ if (err)
+ goto out;
+
+ if (!sk_rmem_schedule(sk, skb, skb->truesize) && rx_flow_is_on(cf_sk)) {
+ set_rx_flow_off(cf_sk);
+ net_dbg_ratelimited("sending flow OFF due to rmem_schedule\n");
+ caif_flow_ctrl(sk, CAIF_MODEMCMD_FLOW_OFF_REQ);
+ }
+ skb->dev = NULL;
+ skb_set_owner_r(skb, sk);
+ spin_lock_irqsave(&list->lock, flags);
+ queued = !sock_flag(sk, SOCK_DEAD);
+ if (queued)
+ __skb_queue_tail(list, skb);
+ spin_unlock_irqrestore(&list->lock, flags);
+out:
+ if (queued)
+ sk->sk_data_ready(sk);
+ else
+ kfree_skb(skb);
+}
+
+/* Packet Receive Callback function called from CAIF Stack */
+static int caif_sktrecv_cb(struct cflayer *layr, struct cfpkt *pkt)
+{
+ struct caifsock *cf_sk;
+ struct sk_buff *skb;
+
+ cf_sk = container_of(layr, struct caifsock, layer);
+ skb = cfpkt_tonative(pkt);
+
+ if (unlikely(cf_sk->sk.sk_state != CAIF_CONNECTED)) {
+ kfree_skb(skb);
+ return 0;
+ }
+ caif_queue_rcv_skb(&cf_sk->sk, skb);
+ return 0;
+}
+
+static void cfsk_hold(struct cflayer *layr)
+{
+ struct caifsock *cf_sk = container_of(layr, struct caifsock, layer);
+ sock_hold(&cf_sk->sk);
+}
+
+static void cfsk_put(struct cflayer *layr)
+{
+ struct caifsock *cf_sk = container_of(layr, struct caifsock, layer);
+ sock_put(&cf_sk->sk);
+}
+
+/* Packet Control Callback function called from CAIF */
+static void caif_ctrl_cb(struct cflayer *layr,
+ enum caif_ctrlcmd flow,
+ int phyid)
+{
+ struct caifsock *cf_sk = container_of(layr, struct caifsock, layer);
+ switch (flow) {
+ case CAIF_CTRLCMD_FLOW_ON_IND:
+ /* OK from modem to start sending again */
+ set_tx_flow_on(cf_sk);
+ cf_sk->sk.sk_state_change(&cf_sk->sk);
+ break;
+
+ case CAIF_CTRLCMD_FLOW_OFF_IND:
+ /* Modem asks us to shut up */
+ set_tx_flow_off(cf_sk);
+ cf_sk->sk.sk_state_change(&cf_sk->sk);
+ break;
+
+ case CAIF_CTRLCMD_INIT_RSP:
+ /* We're now connected */
+ caif_client_register_refcnt(&cf_sk->layer,
+ cfsk_hold, cfsk_put);
+ cf_sk->sk.sk_state = CAIF_CONNECTED;
+ set_tx_flow_on(cf_sk);
+ cf_sk->sk.sk_shutdown = 0;
+ cf_sk->sk.sk_state_change(&cf_sk->sk);
+ break;
+
+ case CAIF_CTRLCMD_DEINIT_RSP:
+ /* We're now disconnected */
+ cf_sk->sk.sk_state = CAIF_DISCONNECTED;
+ cf_sk->sk.sk_state_change(&cf_sk->sk);
+ break;
+
+ case CAIF_CTRLCMD_INIT_FAIL_RSP:
+ /* Connect request failed */
+ cf_sk->sk.sk_err = ECONNREFUSED;
+ cf_sk->sk.sk_state = CAIF_DISCONNECTED;
+ cf_sk->sk.sk_shutdown = SHUTDOWN_MASK;
+ /*
+ * Socket "standards" seems to require POLLOUT to
+ * be set at connect failure.
+ */
+ set_tx_flow_on(cf_sk);
+ cf_sk->sk.sk_state_change(&cf_sk->sk);
+ break;
+
+ case CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND:
+ /* Modem has closed this connection, or device is down. */
+ cf_sk->sk.sk_shutdown = SHUTDOWN_MASK;
+ cf_sk->sk.sk_err = ECONNRESET;
+ set_rx_flow_on(cf_sk);
+ sk_error_report(&cf_sk->sk);
+ break;
+
+ default:
+ pr_debug("Unexpected flow command %d\n", flow);
+ }
+}
+
+static void caif_check_flow_release(struct sock *sk)
+{
+ struct caifsock *cf_sk = container_of(sk, struct caifsock, sk);
+
+ if (rx_flow_is_on(cf_sk))
+ return;
+
+ if (atomic_read(&sk->sk_rmem_alloc) <= sk_rcvbuf_lowwater(cf_sk)) {
+ set_rx_flow_on(cf_sk);
+ caif_flow_ctrl(sk, CAIF_MODEMCMD_FLOW_ON_REQ);
+ }
+}
+
+/*
+ * Copied from unix_dgram_recvmsg, but removed credit checks,
+ * changed locking, address handling and added MSG_TRUNC.
+ */
+static int caif_seqpkt_recvmsg(struct socket *sock, struct msghdr *m,
+ size_t len, int flags)
+
+{
+ struct sock *sk = sock->sk;
+ struct sk_buff *skb;
+ int ret;
+ int copylen;
+
+ ret = -EOPNOTSUPP;
+ if (flags & MSG_OOB)
+ goto read_error;
+
+ skb = skb_recv_datagram(sk, flags, &ret);
+ if (!skb)
+ goto read_error;
+ copylen = skb->len;
+ if (len < copylen) {
+ m->msg_flags |= MSG_TRUNC;
+ copylen = len;
+ }
+
+ ret = skb_copy_datagram_msg(skb, 0, m, copylen);
+ if (ret)
+ goto out_free;
+
+ ret = (flags & MSG_TRUNC) ? skb->len : copylen;
+out_free:
+ skb_free_datagram(sk, skb);
+ caif_check_flow_release(sk);
+ return ret;
+
+read_error:
+ return ret;
+}
+
+
+/* Copied from unix_stream_wait_data, identical except for lock call. */
+static long caif_stream_data_wait(struct sock *sk, long timeo)
+{
+ DEFINE_WAIT(wait);
+ lock_sock(sk);
+
+ for (;;) {
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+
+ if (!skb_queue_empty(&sk->sk_receive_queue) ||
+ sk->sk_err ||
+ sk->sk_state != CAIF_CONNECTED ||
+ sock_flag(sk, SOCK_DEAD) ||
+ (sk->sk_shutdown & RCV_SHUTDOWN) ||
+ signal_pending(current) ||
+ !timeo)
+ break;
+
+ sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+ release_sock(sk);
+ timeo = schedule_timeout(timeo);
+ lock_sock(sk);
+
+ if (sock_flag(sk, SOCK_DEAD))
+ break;
+
+ sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+ }
+
+ finish_wait(sk_sleep(sk), &wait);
+ release_sock(sk);
+ return timeo;
+}
+
+
+/*
+ * Copied from unix_stream_recvmsg, but removed credit checks,
+ * changed locking calls, changed address handling.
+ */
+static int caif_stream_recvmsg(struct socket *sock, struct msghdr *msg,
+ size_t size, int flags)
+{
+ struct sock *sk = sock->sk;
+ int copied = 0;
+ int target;
+ int err = 0;
+ long timeo;
+
+ err = -EOPNOTSUPP;
+ if (flags&MSG_OOB)
+ goto out;
+
+ /*
+ * Lock the socket to prevent queue disordering
+ * while sleeps in memcpy_tomsg
+ */
+ err = -EAGAIN;
+ if (sk->sk_state == CAIF_CONNECTING)
+ goto out;
+
+ caif_read_lock(sk);
+ target = sock_rcvlowat(sk, flags&MSG_WAITALL, size);
+ timeo = sock_rcvtimeo(sk, flags&MSG_DONTWAIT);
+
+ do {
+ int chunk;
+ struct sk_buff *skb;
+
+ lock_sock(sk);
+ if (sock_flag(sk, SOCK_DEAD)) {
+ err = -ECONNRESET;
+ goto unlock;
+ }
+ skb = skb_dequeue(&sk->sk_receive_queue);
+ caif_check_flow_release(sk);
+
+ if (skb == NULL) {
+ if (copied >= target)
+ goto unlock;
+ /*
+ * POSIX 1003.1g mandates this order.
+ */
+ err = sock_error(sk);
+ if (err)
+ goto unlock;
+ err = -ECONNRESET;
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ goto unlock;
+
+ err = -EPIPE;
+ if (sk->sk_state != CAIF_CONNECTED)
+ goto unlock;
+ if (sock_flag(sk, SOCK_DEAD))
+ goto unlock;
+
+ release_sock(sk);
+
+ err = -EAGAIN;
+ if (!timeo)
+ break;
+
+ caif_read_unlock(sk);
+
+ timeo = caif_stream_data_wait(sk, timeo);
+
+ if (signal_pending(current)) {
+ err = sock_intr_errno(timeo);
+ goto out;
+ }
+ caif_read_lock(sk);
+ continue;
+unlock:
+ release_sock(sk);
+ break;
+ }
+ release_sock(sk);
+ chunk = min_t(unsigned int, skb->len, size);
+ if (memcpy_to_msg(msg, skb->data, chunk)) {
+ skb_queue_head(&sk->sk_receive_queue, skb);
+ if (copied == 0)
+ copied = -EFAULT;
+ break;
+ }
+ copied += chunk;
+ size -= chunk;
+
+ /* Mark read part of skb as used */
+ if (!(flags & MSG_PEEK)) {
+ skb_pull(skb, chunk);
+
+ /* put the skb back if we didn't use it up. */
+ if (skb->len) {
+ skb_queue_head(&sk->sk_receive_queue, skb);
+ break;
+ }
+ kfree_skb(skb);
+
+ } else {
+ /*
+ * It is questionable, see note in unix_dgram_recvmsg.
+ */
+ /* put message back and return */
+ skb_queue_head(&sk->sk_receive_queue, skb);
+ break;
+ }
+ } while (size);
+ caif_read_unlock(sk);
+
+out:
+ return copied ? : err;
+}
+
+/*
+ * Copied from sock.c:sock_wait_for_wmem, but change to wait for
+ * CAIF flow-on and sock_writable.
+ */
+static long caif_wait_for_flow_on(struct caifsock *cf_sk,
+ int wait_writeable, long timeo, int *err)
+{
+ struct sock *sk = &cf_sk->sk;
+ DEFINE_WAIT(wait);
+ for (;;) {
+ *err = 0;
+ if (tx_flow_is_on(cf_sk) &&
+ (!wait_writeable || sock_writeable(&cf_sk->sk)))
+ break;
+ *err = -ETIMEDOUT;
+ if (!timeo)
+ break;
+ *err = -ERESTARTSYS;
+ if (signal_pending(current))
+ break;
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ *err = -ECONNRESET;
+ if (sk->sk_shutdown & SHUTDOWN_MASK)
+ break;
+ *err = -sk->sk_err;
+ if (sk->sk_err)
+ break;
+ *err = -EPIPE;
+ if (cf_sk->sk.sk_state != CAIF_CONNECTED)
+ break;
+ timeo = schedule_timeout(timeo);
+ }
+ finish_wait(sk_sleep(sk), &wait);
+ return timeo;
+}
+
+/*
+ * Transmit a SKB. The device may temporarily request re-transmission
+ * by returning EAGAIN.
+ */
+static int transmit_skb(struct sk_buff *skb, struct caifsock *cf_sk,
+ int noblock, long timeo)
+{
+ struct cfpkt *pkt;
+
+ pkt = cfpkt_fromnative(CAIF_DIR_OUT, skb);
+ memset(skb->cb, 0, sizeof(struct caif_payload_info));
+ cfpkt_set_prio(pkt, cf_sk->sk.sk_priority);
+
+ if (cf_sk->layer.dn == NULL) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
+ return cf_sk->layer.dn->transmit(cf_sk->layer.dn, pkt);
+}
+
+/* Copied from af_unix:unix_dgram_sendmsg, and adapted to CAIF */
+static int caif_seqpkt_sendmsg(struct socket *sock, struct msghdr *msg,
+ size_t len)
+{
+ struct sock *sk = sock->sk;
+ struct caifsock *cf_sk = container_of(sk, struct caifsock, sk);
+ int buffer_size;
+ int ret = 0;
+ struct sk_buff *skb = NULL;
+ int noblock;
+ long timeo;
+ caif_assert(cf_sk);
+ ret = sock_error(sk);
+ if (ret)
+ goto err;
+
+ ret = -EOPNOTSUPP;
+ if (msg->msg_flags&MSG_OOB)
+ goto err;
+
+ ret = -EOPNOTSUPP;
+ if (msg->msg_namelen)
+ goto err;
+
+ noblock = msg->msg_flags & MSG_DONTWAIT;
+
+ timeo = sock_sndtimeo(sk, noblock);
+ timeo = caif_wait_for_flow_on(container_of(sk, struct caifsock, sk),
+ 1, timeo, &ret);
+
+ if (ret)
+ goto err;
+ ret = -EPIPE;
+ if (cf_sk->sk.sk_state != CAIF_CONNECTED ||
+ sock_flag(sk, SOCK_DEAD) ||
+ (sk->sk_shutdown & RCV_SHUTDOWN))
+ goto err;
+
+ /* Error if trying to write more than maximum frame size. */
+ ret = -EMSGSIZE;
+ if (len > cf_sk->maxframe && cf_sk->sk.sk_protocol != CAIFPROTO_RFM)
+ goto err;
+
+ buffer_size = len + cf_sk->headroom + cf_sk->tailroom;
+
+ ret = -ENOMEM;
+ skb = sock_alloc_send_skb(sk, buffer_size, noblock, &ret);
+
+ if (!skb || skb_tailroom(skb) < buffer_size)
+ goto err;
+
+ skb_reserve(skb, cf_sk->headroom);
+
+ ret = memcpy_from_msg(skb_put(skb, len), msg, len);
+
+ if (ret)
+ goto err;
+ ret = transmit_skb(skb, cf_sk, noblock, timeo);
+ if (ret < 0)
+ /* skb is already freed */
+ return ret;
+
+ return len;
+err:
+ kfree_skb(skb);
+ return ret;
+}
+
+/*
+ * Copied from unix_stream_sendmsg and adapted to CAIF:
+ * Changed removed permission handling and added waiting for flow on
+ * and other minor adaptations.
+ */
+static int caif_stream_sendmsg(struct socket *sock, struct msghdr *msg,
+ size_t len)
+{
+ struct sock *sk = sock->sk;
+ struct caifsock *cf_sk = container_of(sk, struct caifsock, sk);
+ int err, size;
+ struct sk_buff *skb;
+ int sent = 0;
+ long timeo;
+
+ err = -EOPNOTSUPP;
+ if (unlikely(msg->msg_flags&MSG_OOB))
+ goto out_err;
+
+ if (unlikely(msg->msg_namelen))
+ goto out_err;
+
+ timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
+ timeo = caif_wait_for_flow_on(cf_sk, 1, timeo, &err);
+
+ if (unlikely(sk->sk_shutdown & SEND_SHUTDOWN))
+ goto pipe_err;
+
+ while (sent < len) {
+
+ size = len-sent;
+
+ if (size > cf_sk->maxframe)
+ size = cf_sk->maxframe;
+
+ /* If size is more than half of sndbuf, chop up message */
+ if (size > ((sk->sk_sndbuf >> 1) - 64))
+ size = (sk->sk_sndbuf >> 1) - 64;
+
+ if (size > SKB_MAX_ALLOC)
+ size = SKB_MAX_ALLOC;
+
+ skb = sock_alloc_send_skb(sk,
+ size + cf_sk->headroom +
+ cf_sk->tailroom,
+ msg->msg_flags&MSG_DONTWAIT,
+ &err);
+ if (skb == NULL)
+ goto out_err;
+
+ skb_reserve(skb, cf_sk->headroom);
+ /*
+ * If you pass two values to the sock_alloc_send_skb
+ * it tries to grab the large buffer with GFP_NOFS
+ * (which can fail easily), and if it fails grab the
+ * fallback size buffer which is under a page and will
+ * succeed. [Alan]
+ */
+ size = min_t(int, size, skb_tailroom(skb));
+
+ err = memcpy_from_msg(skb_put(skb, size), msg, size);
+ if (err) {
+ kfree_skb(skb);
+ goto out_err;
+ }
+ err = transmit_skb(skb, cf_sk,
+ msg->msg_flags&MSG_DONTWAIT, timeo);
+ if (err < 0)
+ /* skb is already freed */
+ goto pipe_err;
+
+ sent += size;
+ }
+
+ return sent;
+
+pipe_err:
+ if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
+ send_sig(SIGPIPE, current, 0);
+ err = -EPIPE;
+out_err:
+ return sent ? : err;
+}
+
+static int setsockopt(struct socket *sock, int lvl, int opt, sockptr_t ov,
+ unsigned int ol)
+{
+ struct sock *sk = sock->sk;
+ struct caifsock *cf_sk = container_of(sk, struct caifsock, sk);
+ int linksel;
+
+ if (cf_sk->sk.sk_socket->state != SS_UNCONNECTED)
+ return -ENOPROTOOPT;
+
+ switch (opt) {
+ case CAIFSO_LINK_SELECT:
+ if (ol < sizeof(int))
+ return -EINVAL;
+ if (lvl != SOL_CAIF)
+ goto bad_sol;
+ if (copy_from_sockptr(&linksel, ov, sizeof(int)))
+ return -EINVAL;
+ lock_sock(&(cf_sk->sk));
+ cf_sk->conn_req.link_selector = linksel;
+ release_sock(&cf_sk->sk);
+ return 0;
+
+ case CAIFSO_REQ_PARAM:
+ if (lvl != SOL_CAIF)
+ goto bad_sol;
+ if (cf_sk->sk.sk_protocol != CAIFPROTO_UTIL)
+ return -ENOPROTOOPT;
+ lock_sock(&(cf_sk->sk));
+ if (ol > sizeof(cf_sk->conn_req.param.data) ||
+ copy_from_sockptr(&cf_sk->conn_req.param.data, ov, ol)) {
+ release_sock(&cf_sk->sk);
+ return -EINVAL;
+ }
+ cf_sk->conn_req.param.size = ol;
+ release_sock(&cf_sk->sk);
+ return 0;
+
+ default:
+ return -ENOPROTOOPT;
+ }
+
+ return 0;
+bad_sol:
+ return -ENOPROTOOPT;
+
+}
+
+/*
+ * caif_connect() - Connect a CAIF Socket
+ * Copied and modified af_irda.c:irda_connect().
+ *
+ * Note : by consulting "errno", the user space caller may learn the cause
+ * of the failure. Most of them are visible in the function, others may come
+ * from subroutines called and are listed here :
+ * o -EAFNOSUPPORT: bad socket family or type.
+ * o -ESOCKTNOSUPPORT: bad socket type or protocol
+ * o -EINVAL: bad socket address, or CAIF link type
+ * o -ECONNREFUSED: remote end refused the connection.
+ * o -EINPROGRESS: connect request sent but timed out (or non-blocking)
+ * o -EISCONN: already connected.
+ * o -ETIMEDOUT: Connection timed out (send timeout)
+ * o -ENODEV: No link layer to send request
+ * o -ECONNRESET: Received Shutdown indication or lost link layer
+ * o -ENOMEM: Out of memory
+ *
+ * State Strategy:
+ * o sk_state: holds the CAIF_* protocol state, it's updated by
+ * caif_ctrl_cb.
+ * o sock->state: holds the SS_* socket state and is updated by connect and
+ * disconnect.
+ */
+static int caif_connect(struct socket *sock, struct sockaddr_unsized *uaddr,
+ int addr_len, int flags)
+{
+ struct sock *sk = sock->sk;
+ struct caifsock *cf_sk = container_of(sk, struct caifsock, sk);
+ long timeo;
+ int err;
+ int ifindex, headroom, tailroom;
+ unsigned int mtu;
+ struct net_device *dev;
+
+ lock_sock(sk);
+
+ err = -EINVAL;
+ if (addr_len < offsetofend(struct sockaddr, sa_family))
+ goto out;
+
+ err = -EAFNOSUPPORT;
+ if (uaddr->sa_family != AF_CAIF)
+ goto out;
+
+ switch (sock->state) {
+ case SS_UNCONNECTED:
+ /* Normal case, a fresh connect */
+ caif_assert(sk->sk_state == CAIF_DISCONNECTED);
+ break;
+ case SS_CONNECTING:
+ switch (sk->sk_state) {
+ case CAIF_CONNECTED:
+ sock->state = SS_CONNECTED;
+ err = -EISCONN;
+ goto out;
+ case CAIF_DISCONNECTED:
+ /* Reconnect allowed */
+ break;
+ case CAIF_CONNECTING:
+ err = -EALREADY;
+ if (flags & O_NONBLOCK)
+ goto out;
+ goto wait_connect;
+ }
+ break;
+ case SS_CONNECTED:
+ caif_assert(sk->sk_state == CAIF_CONNECTED ||
+ sk->sk_state == CAIF_DISCONNECTED);
+ if (sk->sk_shutdown & SHUTDOWN_MASK) {
+ /* Allow re-connect after SHUTDOWN_IND */
+ caif_disconnect_client(sock_net(sk), &cf_sk->layer);
+ caif_free_client(&cf_sk->layer);
+ break;
+ }
+ /* No reconnect on a seqpacket socket */
+ err = -EISCONN;
+ goto out;
+ case SS_DISCONNECTING:
+ case SS_FREE:
+ caif_assert(1); /*Should never happen */
+ break;
+ }
+ sk->sk_state = CAIF_DISCONNECTED;
+ sock->state = SS_UNCONNECTED;
+ sk_stream_kill_queues(&cf_sk->sk);
+
+ err = -EINVAL;
+ if (addr_len != sizeof(struct sockaddr_caif))
+ goto out;
+
+ memcpy(&cf_sk->conn_req.sockaddr, uaddr,
+ sizeof(struct sockaddr_caif));
+
+ /* Move to connecting socket, start sending Connect Requests */
+ sock->state = SS_CONNECTING;
+ sk->sk_state = CAIF_CONNECTING;
+
+ /* Check priority value comming from socket */
+ /* if priority value is out of range it will be ajusted */
+ if (cf_sk->sk.sk_priority > CAIF_PRIO_MAX)
+ cf_sk->conn_req.priority = CAIF_PRIO_MAX;
+ else if (cf_sk->sk.sk_priority < CAIF_PRIO_MIN)
+ cf_sk->conn_req.priority = CAIF_PRIO_MIN;
+ else
+ cf_sk->conn_req.priority = cf_sk->sk.sk_priority;
+
+ /*ifindex = id of the interface.*/
+ cf_sk->conn_req.ifindex = cf_sk->sk.sk_bound_dev_if;
+
+ cf_sk->layer.receive = caif_sktrecv_cb;
+
+ err = caif_connect_client(sock_net(sk), &cf_sk->conn_req,
+ &cf_sk->layer, &ifindex, &headroom, &tailroom);
+
+ if (err < 0) {
+ cf_sk->sk.sk_socket->state = SS_UNCONNECTED;
+ cf_sk->sk.sk_state = CAIF_DISCONNECTED;
+ goto out;
+ }
+
+ err = -ENODEV;
+ rcu_read_lock();
+ dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
+ if (!dev) {
+ rcu_read_unlock();
+ goto out;
+ }
+ cf_sk->headroom = LL_RESERVED_SPACE_EXTRA(dev, headroom);
+ mtu = dev->mtu;
+ rcu_read_unlock();
+
+ cf_sk->tailroom = tailroom;
+ cf_sk->maxframe = mtu - (headroom + tailroom);
+ if (cf_sk->maxframe < 1) {
+ pr_warn("CAIF Interface MTU too small (%d)\n", dev->mtu);
+ err = -ENODEV;
+ goto out;
+ }
+
+ err = -EINPROGRESS;
+wait_connect:
+
+ if (sk->sk_state != CAIF_CONNECTED && (flags & O_NONBLOCK))
+ goto out;
+
+ timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
+
+ release_sock(sk);
+ err = -ERESTARTSYS;
+ timeo = wait_event_interruptible_timeout(*sk_sleep(sk),
+ sk->sk_state != CAIF_CONNECTING,
+ timeo);
+ lock_sock(sk);
+ if (timeo < 0)
+ goto out; /* -ERESTARTSYS */
+
+ err = -ETIMEDOUT;
+ if (timeo == 0 && sk->sk_state != CAIF_CONNECTED)
+ goto out;
+ if (sk->sk_state != CAIF_CONNECTED) {
+ sock->state = SS_UNCONNECTED;
+ err = sock_error(sk);
+ if (!err)
+ err = -ECONNREFUSED;
+ goto out;
+ }
+ sock->state = SS_CONNECTED;
+ err = 0;
+out:
+ release_sock(sk);
+ return err;
+}
+
+/*
+ * caif_release() - Disconnect a CAIF Socket
+ * Copied and modified af_irda.c:irda_release().
+ */
+static int caif_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+ struct caifsock *cf_sk = container_of(sk, struct caifsock, sk);
+
+ if (!sk)
+ return 0;
+
+ set_tx_flow_off(cf_sk);
+
+ /*
+ * Ensure that packets are not queued after this point in time.
+ * caif_queue_rcv_skb checks SOCK_DEAD holding the queue lock,
+ * this ensures no packets when sock is dead.
+ */
+ spin_lock_bh(&sk->sk_receive_queue.lock);
+ sock_set_flag(sk, SOCK_DEAD);
+ spin_unlock_bh(&sk->sk_receive_queue.lock);
+ sock->sk = NULL;
+
+ WARN_ON(IS_ERR(cf_sk->debugfs_socket_dir));
+ debugfs_remove_recursive(cf_sk->debugfs_socket_dir);
+
+ lock_sock(&(cf_sk->sk));
+ sk->sk_state = CAIF_DISCONNECTED;
+ sk->sk_shutdown = SHUTDOWN_MASK;
+
+ caif_disconnect_client(sock_net(sk), &cf_sk->layer);
+ cf_sk->sk.sk_socket->state = SS_DISCONNECTING;
+ wake_up_interruptible_poll(sk_sleep(sk), EPOLLERR|EPOLLHUP);
+
+ sock_orphan(sk);
+ sk_stream_kill_queues(&cf_sk->sk);
+ release_sock(sk);
+ sock_put(sk);
+ return 0;
+}
+
+/* Copied from af_unix.c:unix_poll(), added CAIF tx_flow handling */
+static __poll_t caif_poll(struct file *file,
+ struct socket *sock, poll_table *wait)
+{
+ struct sock *sk = sock->sk;
+ __poll_t mask;
+ struct caifsock *cf_sk = container_of(sk, struct caifsock, sk);
+
+ sock_poll_wait(file, sock, wait);
+ mask = 0;
+
+ /* exceptional events? */
+ if (sk->sk_err)
+ mask |= EPOLLERR;
+ if (sk->sk_shutdown == SHUTDOWN_MASK)
+ mask |= EPOLLHUP;
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ mask |= EPOLLRDHUP;
+
+ /* readable? */
+ if (!skb_queue_empty_lockless(&sk->sk_receive_queue) ||
+ (sk->sk_shutdown & RCV_SHUTDOWN))
+ mask |= EPOLLIN | EPOLLRDNORM;
+
+ /*
+ * we set writable also when the other side has shut down the
+ * connection. This prevents stuck sockets.
+ */
+ if (sock_writeable(sk) && tx_flow_is_on(cf_sk))
+ mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
+
+ return mask;
+}
+
+static const struct proto_ops caif_seqpacket_ops = {
+ .family = PF_CAIF,
+ .owner = THIS_MODULE,
+ .release = caif_release,
+ .bind = sock_no_bind,
+ .connect = caif_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .getname = sock_no_getname,
+ .poll = caif_poll,
+ .ioctl = sock_no_ioctl,
+ .listen = sock_no_listen,
+ .shutdown = sock_no_shutdown,
+ .setsockopt = setsockopt,
+ .sendmsg = caif_seqpkt_sendmsg,
+ .recvmsg = caif_seqpkt_recvmsg,
+ .mmap = sock_no_mmap,
+};
+
+static const struct proto_ops caif_stream_ops = {
+ .family = PF_CAIF,
+ .owner = THIS_MODULE,
+ .release = caif_release,
+ .bind = sock_no_bind,
+ .connect = caif_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .getname = sock_no_getname,
+ .poll = caif_poll,
+ .ioctl = sock_no_ioctl,
+ .listen = sock_no_listen,
+ .shutdown = sock_no_shutdown,
+ .setsockopt = setsockopt,
+ .sendmsg = caif_stream_sendmsg,
+ .recvmsg = caif_stream_recvmsg,
+ .mmap = sock_no_mmap,
+};
+
+/* This function is called when a socket is finally destroyed. */
+static void caif_sock_destructor(struct sock *sk)
+{
+ struct caifsock *cf_sk = container_of(sk, struct caifsock, sk);
+ caif_assert(!refcount_read(&sk->sk_wmem_alloc));
+ caif_assert(sk_unhashed(sk));
+ caif_assert(!sk->sk_socket);
+ if (!sock_flag(sk, SOCK_DEAD)) {
+ pr_debug("Attempt to release alive CAIF socket: %p\n", sk);
+ return;
+ }
+ sk_stream_kill_queues(&cf_sk->sk);
+ WARN_ON_ONCE(sk->sk_forward_alloc);
+ caif_free_client(&cf_sk->layer);
+}
+
+static int caif_create(struct net *net, struct socket *sock, int protocol,
+ int kern)
+{
+ struct sock *sk = NULL;
+ struct caifsock *cf_sk = NULL;
+ static struct proto prot = {.name = "PF_CAIF",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof(struct caifsock),
+ .useroffset = offsetof(struct caifsock, conn_req.param),
+ .usersize = sizeof_field(struct caifsock, conn_req.param)
+ };
+
+ if (!capable(CAP_SYS_ADMIN) && !capable(CAP_NET_ADMIN))
+ return -EPERM;
+ /*
+ * The sock->type specifies the socket type to use.
+ * The CAIF socket is a packet stream in the sense
+ * that it is packet based. CAIF trusts the reliability
+ * of the link, no resending is implemented.
+ */
+ if (sock->type == SOCK_SEQPACKET)
+ sock->ops = &caif_seqpacket_ops;
+ else if (sock->type == SOCK_STREAM)
+ sock->ops = &caif_stream_ops;
+ else
+ return -ESOCKTNOSUPPORT;
+
+ if (protocol < 0 || protocol >= CAIFPROTO_MAX)
+ return -EPROTONOSUPPORT;
+ /*
+ * Set the socket state to unconnected. The socket state
+ * is really not used at all in the net/core or socket.c but the
+ * initialization makes sure that sock->state is not uninitialized.
+ */
+ sk = sk_alloc(net, PF_CAIF, GFP_KERNEL, &prot, kern);
+ if (!sk)
+ return -ENOMEM;
+
+ cf_sk = container_of(sk, struct caifsock, sk);
+
+ /* Store the protocol */
+ sk->sk_protocol = (unsigned char) protocol;
+
+ /* Initialize default priority for well-known cases */
+ switch (protocol) {
+ case CAIFPROTO_AT:
+ sk->sk_priority = TC_PRIO_CONTROL;
+ break;
+ case CAIFPROTO_RFM:
+ sk->sk_priority = TC_PRIO_INTERACTIVE_BULK;
+ break;
+ default:
+ sk->sk_priority = TC_PRIO_BESTEFFORT;
+ }
+
+ /*
+ * Lock in order to try to stop someone from opening the socket
+ * too early.
+ */
+ lock_sock(&(cf_sk->sk));
+
+ /* Initialize the nozero default sock structure data. */
+ sock_init_data(sock, sk);
+ sk->sk_destruct = caif_sock_destructor;
+
+ mutex_init(&cf_sk->readlock); /* single task reading lock */
+ cf_sk->layer.ctrlcmd = caif_ctrl_cb;
+ cf_sk->sk.sk_socket->state = SS_UNCONNECTED;
+ cf_sk->sk.sk_state = CAIF_DISCONNECTED;
+
+ set_tx_flow_off(cf_sk);
+ set_rx_flow_on(cf_sk);
+
+ /* Set default options on configuration */
+ cf_sk->conn_req.link_selector = CAIF_LINK_LOW_LATENCY;
+ cf_sk->conn_req.protocol = protocol;
+ release_sock(&cf_sk->sk);
+ return 0;
+}
+
+
+static const struct net_proto_family caif_family_ops = {
+ .family = PF_CAIF,
+ .create = caif_create,
+ .owner = THIS_MODULE,
+};
+
+static int __init caif_sktinit_module(void)
+{
+ return sock_register(&caif_family_ops);
+}
+
+static void __exit caif_sktexit_module(void)
+{
+ sock_unregister(PF_CAIF);
+}
+module_init(caif_sktinit_module);
+module_exit(caif_sktexit_module);
diff --git a/net/caif/caif_usb.c b/net/caif/caif_usb.c
new file mode 100644
index 000000000000..5dc05a1e3178
--- /dev/null
+++ b/net/caif/caif_usb.c
@@ -0,0 +1,216 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * CAIF USB handler
+ * Copyright (C) ST-Ericsson AB 2011
+ * Author: Sjur Brendeland
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/slab.h>
+#include <linux/mii.h>
+#include <linux/usb.h>
+#include <linux/usb/usbnet.h>
+#include <linux/etherdevice.h>
+#include <net/netns/generic.h>
+#include <net/caif/caif_dev.h>
+#include <net/caif/caif_layer.h>
+#include <net/caif/cfpkt.h>
+#include <net/caif/cfcnfg.h>
+
+MODULE_DESCRIPTION("ST-Ericsson CAIF modem protocol USB support");
+MODULE_LICENSE("GPL");
+
+#define CFUSB_PAD_DESCR_SZ 1 /* Alignment descriptor length */
+#define CFUSB_ALIGNMENT 4 /* Number of bytes to align. */
+#define CFUSB_MAX_HEADLEN (CFUSB_PAD_DESCR_SZ + CFUSB_ALIGNMENT-1)
+#define STE_USB_VID 0x04cc /* USB Product ID for ST-Ericsson */
+#define STE_USB_PID_CAIF 0x230f /* Product id for CAIF Modems */
+
+struct cfusbl {
+ struct cflayer layer;
+ u8 tx_eth_hdr[ETH_HLEN];
+};
+
+static bool pack_added;
+
+static int cfusbl_receive(struct cflayer *layr, struct cfpkt *pkt)
+{
+ u8 hpad;
+
+ /* Remove padding. */
+ cfpkt_extr_head(pkt, &hpad, 1);
+ cfpkt_extr_head(pkt, NULL, hpad);
+ return layr->up->receive(layr->up, pkt);
+}
+
+static int cfusbl_transmit(struct cflayer *layr, struct cfpkt *pkt)
+{
+ struct caif_payload_info *info;
+ u8 hpad;
+ u8 zeros[CFUSB_ALIGNMENT];
+ struct sk_buff *skb;
+ struct cfusbl *usbl = container_of(layr, struct cfusbl, layer);
+
+ skb = cfpkt_tonative(pkt);
+
+ skb_reset_network_header(skb);
+ skb->protocol = htons(ETH_P_IP);
+
+ info = cfpkt_info(pkt);
+ hpad = (info->hdr_len + CFUSB_PAD_DESCR_SZ) & (CFUSB_ALIGNMENT - 1);
+
+ if (skb_headroom(skb) < ETH_HLEN + CFUSB_PAD_DESCR_SZ + hpad) {
+ pr_warn("Headroom too small\n");
+ kfree_skb(skb);
+ return -EIO;
+ }
+ memset(zeros, 0, hpad);
+
+ cfpkt_add_head(pkt, zeros, hpad);
+ cfpkt_add_head(pkt, &hpad, 1);
+ cfpkt_add_head(pkt, usbl->tx_eth_hdr, sizeof(usbl->tx_eth_hdr));
+ return layr->dn->transmit(layr->dn, pkt);
+}
+
+static void cfusbl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl,
+ int phyid)
+{
+ if (layr->up && layr->up->ctrlcmd)
+ layr->up->ctrlcmd(layr->up, ctrl, layr->id);
+}
+
+static struct cflayer *cfusbl_create(int phyid, const u8 ethaddr[ETH_ALEN],
+ u8 braddr[ETH_ALEN])
+{
+ struct cfusbl *this = kmalloc(sizeof(struct cfusbl), GFP_ATOMIC);
+
+ if (!this)
+ return NULL;
+
+ caif_assert(offsetof(struct cfusbl, layer) == 0);
+
+ memset(&this->layer, 0, sizeof(this->layer));
+ this->layer.receive = cfusbl_receive;
+ this->layer.transmit = cfusbl_transmit;
+ this->layer.ctrlcmd = cfusbl_ctrlcmd;
+ snprintf(this->layer.name, CAIF_LAYER_NAME_SZ, "usb%d", phyid);
+ this->layer.id = phyid;
+
+ /*
+ * Construct TX ethernet header:
+ * 0-5 destination address
+ * 5-11 source address
+ * 12-13 protocol type
+ */
+ ether_addr_copy(&this->tx_eth_hdr[ETH_ALEN], braddr);
+ ether_addr_copy(&this->tx_eth_hdr[ETH_ALEN], ethaddr);
+ this->tx_eth_hdr[12] = cpu_to_be16(ETH_P_802_EX1) & 0xff;
+ this->tx_eth_hdr[13] = (cpu_to_be16(ETH_P_802_EX1) >> 8) & 0xff;
+ pr_debug("caif ethernet TX-header dst:%pM src:%pM type:%02x%02x\n",
+ this->tx_eth_hdr, this->tx_eth_hdr + ETH_ALEN,
+ this->tx_eth_hdr[12], this->tx_eth_hdr[13]);
+
+ return (struct cflayer *) this;
+}
+
+static void cfusbl_release(struct cflayer *layer)
+{
+ kfree(layer);
+}
+
+static struct packet_type caif_usb_type __read_mostly = {
+ .type = cpu_to_be16(ETH_P_802_EX1),
+};
+
+static int cfusbl_device_notify(struct notifier_block *me, unsigned long what,
+ void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct caif_dev_common common;
+ struct cflayer *layer, *link_support;
+ struct usbnet *usbnet;
+ struct usb_device *usbdev;
+ int res;
+
+ if (what == NETDEV_UNREGISTER && dev->reg_state >= NETREG_UNREGISTERED)
+ return 0;
+
+ /* Check whether we have a NCM device, and find its VID/PID. */
+ if (!(dev->dev.parent && dev->dev.parent->driver &&
+ strcmp(dev->dev.parent->driver->name, "cdc_ncm") == 0))
+ return 0;
+
+ usbnet = netdev_priv(dev);
+ usbdev = usbnet->udev;
+
+ pr_debug("USB CDC NCM device VID:0x%4x PID:0x%4x\n",
+ le16_to_cpu(usbdev->descriptor.idVendor),
+ le16_to_cpu(usbdev->descriptor.idProduct));
+
+ /* Check for VID/PID that supports CAIF */
+ if (!(le16_to_cpu(usbdev->descriptor.idVendor) == STE_USB_VID &&
+ le16_to_cpu(usbdev->descriptor.idProduct) == STE_USB_PID_CAIF))
+ return 0;
+
+ if (what == NETDEV_UNREGISTER)
+ module_put(THIS_MODULE);
+
+ if (what != NETDEV_REGISTER)
+ return 0;
+
+ __module_get(THIS_MODULE);
+
+ memset(&common, 0, sizeof(common));
+ common.use_frag = false;
+ common.use_fcs = false;
+ common.use_stx = false;
+ common.link_select = CAIF_LINK_HIGH_BANDW;
+ common.flowctrl = NULL;
+
+ link_support = cfusbl_create(dev->ifindex, dev->dev_addr,
+ dev->broadcast);
+
+ if (!link_support)
+ return -ENOMEM;
+
+ if (dev->num_tx_queues > 1)
+ pr_warn("USB device uses more than one tx queue\n");
+
+ res = caif_enroll_dev(dev, &common, link_support, CFUSB_MAX_HEADLEN,
+ &layer, &caif_usb_type.func);
+ if (res)
+ goto err;
+
+ if (!pack_added)
+ dev_add_pack(&caif_usb_type);
+ pack_added = true;
+
+ strscpy(layer->name, dev->name, sizeof(layer->name));
+
+ return 0;
+err:
+ cfusbl_release(link_support);
+ return res;
+}
+
+static struct notifier_block caif_device_notifier = {
+ .notifier_call = cfusbl_device_notify,
+ .priority = 0,
+};
+
+static int __init cfusbl_init(void)
+{
+ return register_netdevice_notifier(&caif_device_notifier);
+}
+
+static void __exit cfusbl_exit(void)
+{
+ unregister_netdevice_notifier(&caif_device_notifier);
+ dev_remove_pack(&caif_usb_type);
+}
+
+module_init(cfusbl_init);
+module_exit(cfusbl_exit);
diff --git a/net/caif/cfcnfg.c b/net/caif/cfcnfg.c
new file mode 100644
index 000000000000..52509e185960
--- /dev/null
+++ b/net/caif/cfcnfg.c
@@ -0,0 +1,612 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) ST-Ericsson AB 2010
+ * Author: Sjur Brendeland
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
+#include <linux/kernel.h>
+#include <linux/stddef.h>
+#include <linux/slab.h>
+#include <linux/netdevice.h>
+#include <linux/module.h>
+#include <net/caif/caif_layer.h>
+#include <net/caif/cfpkt.h>
+#include <net/caif/cfcnfg.h>
+#include <net/caif/cfctrl.h>
+#include <net/caif/cfmuxl.h>
+#include <net/caif/cffrml.h>
+#include <net/caif/cfserl.h>
+#include <net/caif/cfsrvl.h>
+#include <net/caif/caif_dev.h>
+
+#define container_obj(layr) container_of(layr, struct cfcnfg, layer)
+
+/* Information about CAIF physical interfaces held by Config Module in order
+ * to manage physical interfaces
+ */
+struct cfcnfg_phyinfo {
+ struct list_head node;
+ bool up;
+
+ /* Pointer to the layer below the MUX (framing layer) */
+ struct cflayer *frm_layer;
+ /* Pointer to the lowest actual physical layer */
+ struct cflayer *phy_layer;
+ /* Unique identifier of the physical interface */
+ unsigned int id;
+ /* Preference of the physical in interface */
+ enum cfcnfg_phy_preference pref;
+
+ /* Information about the physical device */
+ struct dev_info dev_info;
+
+ /* Interface index */
+ int ifindex;
+
+ /* Protocol head room added for CAIF link layer */
+ int head_room;
+
+ /* Use Start of frame checksum */
+ bool use_fcs;
+};
+
+struct cfcnfg {
+ struct cflayer layer;
+ struct cflayer *ctrl;
+ struct cflayer *mux;
+ struct list_head phys;
+ struct mutex lock;
+};
+
+static void cfcnfg_linkup_rsp(struct cflayer *layer, u8 channel_id,
+ enum cfctrl_srv serv, u8 phyid,
+ struct cflayer *adapt_layer);
+static void cfcnfg_linkdestroy_rsp(struct cflayer *layer, u8 channel_id);
+static void cfcnfg_reject_rsp(struct cflayer *layer, u8 channel_id,
+ struct cflayer *adapt_layer);
+static void cfctrl_resp_func(void);
+static void cfctrl_enum_resp(void);
+
+struct cfcnfg *cfcnfg_create(void)
+{
+ struct cfcnfg *this;
+ struct cfctrl_rsp *resp;
+
+ might_sleep();
+
+ /* Initiate this layer */
+ this = kzalloc(sizeof(struct cfcnfg), GFP_ATOMIC);
+ if (!this)
+ return NULL;
+ this->mux = cfmuxl_create();
+ if (!this->mux)
+ goto out_of_mem;
+ this->ctrl = cfctrl_create();
+ if (!this->ctrl)
+ goto out_of_mem;
+ /* Initiate response functions */
+ resp = cfctrl_get_respfuncs(this->ctrl);
+ resp->enum_rsp = cfctrl_enum_resp;
+ resp->linkerror_ind = cfctrl_resp_func;
+ resp->linkdestroy_rsp = cfcnfg_linkdestroy_rsp;
+ resp->sleep_rsp = cfctrl_resp_func;
+ resp->wake_rsp = cfctrl_resp_func;
+ resp->restart_rsp = cfctrl_resp_func;
+ resp->radioset_rsp = cfctrl_resp_func;
+ resp->linksetup_rsp = cfcnfg_linkup_rsp;
+ resp->reject_rsp = cfcnfg_reject_rsp;
+ INIT_LIST_HEAD(&this->phys);
+
+ cfmuxl_set_uplayer(this->mux, this->ctrl, 0);
+ layer_set_dn(this->ctrl, this->mux);
+ layer_set_up(this->ctrl, this);
+ mutex_init(&this->lock);
+
+ return this;
+out_of_mem:
+ synchronize_rcu();
+
+ kfree(this->mux);
+ kfree(this->ctrl);
+ kfree(this);
+ return NULL;
+}
+
+void cfcnfg_remove(struct cfcnfg *cfg)
+{
+ might_sleep();
+ if (cfg) {
+ synchronize_rcu();
+
+ kfree(cfg->mux);
+ cfctrl_remove(cfg->ctrl);
+ kfree(cfg);
+ }
+}
+
+static void cfctrl_resp_func(void)
+{
+}
+
+static struct cfcnfg_phyinfo *cfcnfg_get_phyinfo_rcu(struct cfcnfg *cnfg,
+ u8 phyid)
+{
+ struct cfcnfg_phyinfo *phy;
+
+ list_for_each_entry_rcu(phy, &cnfg->phys, node)
+ if (phy->id == phyid)
+ return phy;
+ return NULL;
+}
+
+static void cfctrl_enum_resp(void)
+{
+}
+
+static struct dev_info *cfcnfg_get_phyid(struct cfcnfg *cnfg,
+ enum cfcnfg_phy_preference phy_pref)
+{
+ /* Try to match with specified preference */
+ struct cfcnfg_phyinfo *phy;
+
+ list_for_each_entry_rcu(phy, &cnfg->phys, node) {
+ if (phy->up && phy->pref == phy_pref &&
+ phy->frm_layer != NULL)
+
+ return &phy->dev_info;
+ }
+
+ /* Otherwise just return something */
+ list_for_each_entry_rcu(phy, &cnfg->phys, node)
+ if (phy->up)
+ return &phy->dev_info;
+
+ return NULL;
+}
+
+static int cfcnfg_get_id_from_ifi(struct cfcnfg *cnfg, int ifi)
+{
+ struct cfcnfg_phyinfo *phy;
+
+ list_for_each_entry_rcu(phy, &cnfg->phys, node)
+ if (phy->ifindex == ifi && phy->up)
+ return phy->id;
+ return -ENODEV;
+}
+
+int caif_disconnect_client(struct net *net, struct cflayer *adap_layer)
+{
+ u8 channel_id;
+ struct cfcnfg *cfg = get_cfcnfg(net);
+
+ caif_assert(adap_layer != NULL);
+ cfctrl_cancel_req(cfg->ctrl, adap_layer);
+ channel_id = adap_layer->id;
+ if (channel_id != 0) {
+ struct cflayer *servl;
+ servl = cfmuxl_remove_uplayer(cfg->mux, channel_id);
+ cfctrl_linkdown_req(cfg->ctrl, channel_id, adap_layer);
+ if (servl != NULL)
+ layer_set_up(servl, NULL);
+ } else
+ pr_debug("nothing to disconnect\n");
+
+ /* Do RCU sync before initiating cleanup */
+ synchronize_rcu();
+ if (adap_layer->ctrlcmd != NULL)
+ adap_layer->ctrlcmd(adap_layer, CAIF_CTRLCMD_DEINIT_RSP, 0);
+ return 0;
+
+}
+EXPORT_SYMBOL(caif_disconnect_client);
+
+static void cfcnfg_linkdestroy_rsp(struct cflayer *layer, u8 channel_id)
+{
+}
+
+static const int protohead[CFCTRL_SRV_MASK] = {
+ [CFCTRL_SRV_VEI] = 4,
+ [CFCTRL_SRV_DATAGRAM] = 7,
+ [CFCTRL_SRV_UTIL] = 4,
+ [CFCTRL_SRV_RFM] = 3,
+ [CFCTRL_SRV_DBG] = 3,
+};
+
+
+static int caif_connect_req_to_link_param(struct cfcnfg *cnfg,
+ struct caif_connect_request *s,
+ struct cfctrl_link_param *l)
+{
+ struct dev_info *dev_info;
+ enum cfcnfg_phy_preference pref;
+ int res;
+
+ memset(l, 0, sizeof(*l));
+ /* In caif protocol low value is high priority */
+ l->priority = CAIF_PRIO_MAX - s->priority + 1;
+
+ if (s->ifindex != 0) {
+ res = cfcnfg_get_id_from_ifi(cnfg, s->ifindex);
+ if (res < 0)
+ return res;
+ l->phyid = res;
+ } else {
+ switch (s->link_selector) {
+ case CAIF_LINK_HIGH_BANDW:
+ pref = CFPHYPREF_HIGH_BW;
+ break;
+ case CAIF_LINK_LOW_LATENCY:
+ pref = CFPHYPREF_LOW_LAT;
+ break;
+ default:
+ return -EINVAL;
+ }
+ dev_info = cfcnfg_get_phyid(cnfg, pref);
+ if (dev_info == NULL)
+ return -ENODEV;
+ l->phyid = dev_info->id;
+ }
+ switch (s->protocol) {
+ case CAIFPROTO_AT:
+ l->linktype = CFCTRL_SRV_VEI;
+ l->endpoint = (s->sockaddr.u.at.type >> 2) & 0x3;
+ l->chtype = s->sockaddr.u.at.type & 0x3;
+ break;
+ case CAIFPROTO_DATAGRAM:
+ l->linktype = CFCTRL_SRV_DATAGRAM;
+ l->chtype = 0x00;
+ l->u.datagram.connid = s->sockaddr.u.dgm.connection_id;
+ break;
+ case CAIFPROTO_DATAGRAM_LOOP:
+ l->linktype = CFCTRL_SRV_DATAGRAM;
+ l->chtype = 0x03;
+ l->endpoint = 0x00;
+ l->u.datagram.connid = s->sockaddr.u.dgm.connection_id;
+ break;
+ case CAIFPROTO_RFM:
+ l->linktype = CFCTRL_SRV_RFM;
+ l->u.datagram.connid = s->sockaddr.u.rfm.connection_id;
+ strscpy(l->u.rfm.volume, s->sockaddr.u.rfm.volume,
+ sizeof(l->u.rfm.volume));
+ break;
+ case CAIFPROTO_UTIL:
+ l->linktype = CFCTRL_SRV_UTIL;
+ l->endpoint = 0x00;
+ l->chtype = 0x00;
+ strscpy(l->u.utility.name, s->sockaddr.u.util.service,
+ sizeof(l->u.utility.name));
+ caif_assert(sizeof(l->u.utility.name) > 10);
+ l->u.utility.paramlen = s->param.size;
+ if (l->u.utility.paramlen > sizeof(l->u.utility.params))
+ l->u.utility.paramlen = sizeof(l->u.utility.params);
+
+ memcpy(l->u.utility.params, s->param.data,
+ l->u.utility.paramlen);
+
+ break;
+ case CAIFPROTO_DEBUG:
+ l->linktype = CFCTRL_SRV_DBG;
+ l->endpoint = s->sockaddr.u.dbg.service;
+ l->chtype = s->sockaddr.u.dbg.type;
+ break;
+ default:
+ return -EINVAL;
+ }
+ return 0;
+}
+
+int caif_connect_client(struct net *net, struct caif_connect_request *conn_req,
+ struct cflayer *adap_layer, int *ifindex,
+ int *proto_head, int *proto_tail)
+{
+ struct cflayer *frml;
+ struct cfcnfg_phyinfo *phy;
+ int err;
+ struct cfctrl_link_param param;
+ struct cfcnfg *cfg = get_cfcnfg(net);
+
+ rcu_read_lock();
+ err = caif_connect_req_to_link_param(cfg, conn_req, &param);
+ if (err)
+ goto unlock;
+
+ phy = cfcnfg_get_phyinfo_rcu(cfg, param.phyid);
+ if (!phy) {
+ err = -ENODEV;
+ goto unlock;
+ }
+ err = -EINVAL;
+
+ if (adap_layer == NULL) {
+ pr_err("adap_layer is zero\n");
+ goto unlock;
+ }
+ if (adap_layer->receive == NULL) {
+ pr_err("adap_layer->receive is NULL\n");
+ goto unlock;
+ }
+ if (adap_layer->ctrlcmd == NULL) {
+ pr_err("adap_layer->ctrlcmd == NULL\n");
+ goto unlock;
+ }
+
+ err = -ENODEV;
+ frml = phy->frm_layer;
+ if (frml == NULL) {
+ pr_err("Specified PHY type does not exist!\n");
+ goto unlock;
+ }
+ caif_assert(param.phyid == phy->id);
+ caif_assert(phy->frm_layer->id ==
+ param.phyid);
+ caif_assert(phy->phy_layer->id ==
+ param.phyid);
+
+ *ifindex = phy->ifindex;
+ *proto_tail = 2;
+ *proto_head = protohead[param.linktype] + phy->head_room;
+
+ rcu_read_unlock();
+
+ /* FIXME: ENUMERATE INITIALLY WHEN ACTIVATING PHYSICAL INTERFACE */
+ cfctrl_enum_req(cfg->ctrl, param.phyid);
+ return cfctrl_linkup_request(cfg->ctrl, &param, adap_layer);
+
+unlock:
+ rcu_read_unlock();
+ return err;
+}
+EXPORT_SYMBOL(caif_connect_client);
+
+static void cfcnfg_reject_rsp(struct cflayer *layer, u8 channel_id,
+ struct cflayer *adapt_layer)
+{
+ if (adapt_layer != NULL && adapt_layer->ctrlcmd != NULL)
+ adapt_layer->ctrlcmd(adapt_layer,
+ CAIF_CTRLCMD_INIT_FAIL_RSP, 0);
+}
+
+static void
+cfcnfg_linkup_rsp(struct cflayer *layer, u8 channel_id, enum cfctrl_srv serv,
+ u8 phyid, struct cflayer *adapt_layer)
+{
+ struct cfcnfg *cnfg = container_obj(layer);
+ struct cflayer *servicel = NULL;
+ struct cfcnfg_phyinfo *phyinfo;
+ struct net_device *netdev;
+
+ if (channel_id == 0) {
+ pr_warn("received channel_id zero\n");
+ if (adapt_layer != NULL && adapt_layer->ctrlcmd != NULL)
+ adapt_layer->ctrlcmd(adapt_layer,
+ CAIF_CTRLCMD_INIT_FAIL_RSP, 0);
+ return;
+ }
+
+ rcu_read_lock();
+
+ if (adapt_layer == NULL) {
+ pr_debug("link setup response but no client exist, send linkdown back\n");
+ cfctrl_linkdown_req(cnfg->ctrl, channel_id, NULL);
+ goto unlock;
+ }
+
+ caif_assert(cnfg != NULL);
+ caif_assert(phyid != 0);
+
+ phyinfo = cfcnfg_get_phyinfo_rcu(cnfg, phyid);
+ if (phyinfo == NULL) {
+ pr_err("ERROR: Link Layer Device disappeared while connecting\n");
+ goto unlock;
+ }
+
+ caif_assert(phyinfo != NULL);
+ caif_assert(phyinfo->id == phyid);
+ caif_assert(phyinfo->phy_layer != NULL);
+ caif_assert(phyinfo->phy_layer->id == phyid);
+
+ adapt_layer->id = channel_id;
+
+ switch (serv) {
+ case CFCTRL_SRV_VEI:
+ servicel = cfvei_create(channel_id, &phyinfo->dev_info);
+ break;
+ case CFCTRL_SRV_DATAGRAM:
+ servicel = cfdgml_create(channel_id,
+ &phyinfo->dev_info);
+ break;
+ case CFCTRL_SRV_RFM:
+ netdev = phyinfo->dev_info.dev;
+ servicel = cfrfml_create(channel_id, &phyinfo->dev_info,
+ netdev->mtu);
+ break;
+ case CFCTRL_SRV_UTIL:
+ servicel = cfutill_create(channel_id, &phyinfo->dev_info);
+ break;
+ case CFCTRL_SRV_VIDEO:
+ servicel = cfvidl_create(channel_id, &phyinfo->dev_info);
+ break;
+ case CFCTRL_SRV_DBG:
+ servicel = cfdbgl_create(channel_id, &phyinfo->dev_info);
+ break;
+ default:
+ pr_err("Protocol error. Link setup response - unknown channel type\n");
+ goto unlock;
+ }
+ if (!servicel)
+ goto unlock;
+ layer_set_dn(servicel, cnfg->mux);
+ cfmuxl_set_uplayer(cnfg->mux, servicel, channel_id);
+ layer_set_up(servicel, adapt_layer);
+ layer_set_dn(adapt_layer, servicel);
+
+ rcu_read_unlock();
+
+ servicel->ctrlcmd(servicel, CAIF_CTRLCMD_INIT_RSP, 0);
+ return;
+unlock:
+ rcu_read_unlock();
+}
+
+int
+cfcnfg_add_phy_layer(struct cfcnfg *cnfg,
+ struct net_device *dev, struct cflayer *phy_layer,
+ enum cfcnfg_phy_preference pref,
+ struct cflayer *link_support,
+ bool fcs, int head_room)
+{
+ struct cflayer *frml;
+ struct cfcnfg_phyinfo *phyinfo = NULL;
+ int i, res = 0;
+ u8 phyid;
+
+ mutex_lock(&cnfg->lock);
+
+ /* CAIF protocol allow maximum 6 link-layers */
+ for (i = 0; i < 7; i++) {
+ phyid = (dev->ifindex + i) & 0x7;
+ if (phyid == 0)
+ continue;
+ if (cfcnfg_get_phyinfo_rcu(cnfg, phyid) == NULL)
+ goto got_phyid;
+ }
+ pr_warn("Too many CAIF Link Layers (max 6)\n");
+ res = -EEXIST;
+ goto out;
+
+got_phyid:
+ phyinfo = kzalloc(sizeof(struct cfcnfg_phyinfo), GFP_ATOMIC);
+ if (!phyinfo) {
+ res = -ENOMEM;
+ goto out;
+ }
+
+ phy_layer->id = phyid;
+ phyinfo->pref = pref;
+ phyinfo->id = phyid;
+ phyinfo->dev_info.id = phyid;
+ phyinfo->dev_info.dev = dev;
+ phyinfo->phy_layer = phy_layer;
+ phyinfo->ifindex = dev->ifindex;
+ phyinfo->head_room = head_room;
+ phyinfo->use_fcs = fcs;
+
+ frml = cffrml_create(phyid, fcs);
+
+ if (!frml) {
+ res = -ENOMEM;
+ goto out_err;
+ }
+ phyinfo->frm_layer = frml;
+ layer_set_up(frml, cnfg->mux);
+
+ if (link_support != NULL) {
+ link_support->id = phyid;
+ layer_set_dn(frml, link_support);
+ layer_set_up(link_support, frml);
+ layer_set_dn(link_support, phy_layer);
+ layer_set_up(phy_layer, link_support);
+ } else {
+ layer_set_dn(frml, phy_layer);
+ layer_set_up(phy_layer, frml);
+ }
+
+ list_add_rcu(&phyinfo->node, &cnfg->phys);
+out:
+ mutex_unlock(&cnfg->lock);
+ return res;
+
+out_err:
+ kfree(phyinfo);
+ mutex_unlock(&cnfg->lock);
+ return res;
+}
+EXPORT_SYMBOL(cfcnfg_add_phy_layer);
+
+int cfcnfg_set_phy_state(struct cfcnfg *cnfg, struct cflayer *phy_layer,
+ bool up)
+{
+ struct cfcnfg_phyinfo *phyinfo;
+
+ rcu_read_lock();
+ phyinfo = cfcnfg_get_phyinfo_rcu(cnfg, phy_layer->id);
+ if (phyinfo == NULL) {
+ rcu_read_unlock();
+ return -ENODEV;
+ }
+
+ if (phyinfo->up == up) {
+ rcu_read_unlock();
+ return 0;
+ }
+ phyinfo->up = up;
+
+ if (up) {
+ cffrml_hold(phyinfo->frm_layer);
+ cfmuxl_set_dnlayer(cnfg->mux, phyinfo->frm_layer,
+ phy_layer->id);
+ } else {
+ cfmuxl_remove_dnlayer(cnfg->mux, phy_layer->id);
+ cffrml_put(phyinfo->frm_layer);
+ }
+
+ rcu_read_unlock();
+ return 0;
+}
+EXPORT_SYMBOL(cfcnfg_set_phy_state);
+
+int cfcnfg_del_phy_layer(struct cfcnfg *cnfg, struct cflayer *phy_layer)
+{
+ struct cflayer *frml, *frml_dn;
+ u16 phyid;
+ struct cfcnfg_phyinfo *phyinfo;
+
+ might_sleep();
+
+ mutex_lock(&cnfg->lock);
+
+ phyid = phy_layer->id;
+ phyinfo = cfcnfg_get_phyinfo_rcu(cnfg, phyid);
+
+ if (phyinfo == NULL) {
+ mutex_unlock(&cnfg->lock);
+ return 0;
+ }
+ caif_assert(phyid == phyinfo->id);
+ caif_assert(phy_layer == phyinfo->phy_layer);
+ caif_assert(phy_layer->id == phyid);
+ caif_assert(phyinfo->frm_layer->id == phyid);
+
+ list_del_rcu(&phyinfo->node);
+ synchronize_rcu();
+
+ /* Fail if reference count is not zero */
+ if (cffrml_refcnt_read(phyinfo->frm_layer) != 0) {
+ pr_info("Wait for device inuse\n");
+ list_add_rcu(&phyinfo->node, &cnfg->phys);
+ mutex_unlock(&cnfg->lock);
+ return -EAGAIN;
+ }
+
+ frml = phyinfo->frm_layer;
+ frml_dn = frml->dn;
+ cffrml_set_uplayer(frml, NULL);
+ cffrml_set_dnlayer(frml, NULL);
+ if (phy_layer != frml_dn) {
+ layer_set_up(frml_dn, NULL);
+ layer_set_dn(frml_dn, NULL);
+ }
+ layer_set_up(phy_layer, NULL);
+
+ if (phyinfo->phy_layer != frml_dn)
+ kfree(frml_dn);
+
+ cffrml_free(frml);
+ kfree(phyinfo);
+ mutex_unlock(&cnfg->lock);
+
+ return 0;
+}
+EXPORT_SYMBOL(cfcnfg_del_phy_layer);
diff --git a/net/caif/cfctrl.c b/net/caif/cfctrl.c
new file mode 100644
index 000000000000..2aa1e7d46eb2
--- /dev/null
+++ b/net/caif/cfctrl.c
@@ -0,0 +1,631 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) ST-Ericsson AB 2010
+ * Author: Sjur Brendeland
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
+#include <linux/stddef.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/pkt_sched.h>
+#include <net/caif/caif_layer.h>
+#include <net/caif/cfpkt.h>
+#include <net/caif/cfctrl.h>
+
+#define container_obj(layr) container_of(layr, struct cfctrl, serv.layer)
+#define UTILITY_NAME_LENGTH 16
+#define CFPKT_CTRL_PKT_LEN 20
+
+#ifdef CAIF_NO_LOOP
+static int handle_loop(struct cfctrl *ctrl,
+ int cmd, struct cfpkt *pkt){
+ return -1;
+}
+#else
+static int handle_loop(struct cfctrl *ctrl,
+ int cmd, struct cfpkt *pkt);
+#endif
+static int cfctrl_recv(struct cflayer *layr, struct cfpkt *pkt);
+static void cfctrl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl,
+ int phyid);
+
+
+struct cflayer *cfctrl_create(void)
+{
+ struct dev_info dev_info;
+ struct cfctrl *this =
+ kzalloc(sizeof(struct cfctrl), GFP_ATOMIC);
+ if (!this)
+ return NULL;
+ caif_assert(offsetof(struct cfctrl, serv.layer) == 0);
+ memset(&dev_info, 0, sizeof(dev_info));
+ dev_info.id = 0xff;
+ cfsrvl_init(&this->serv, 0, &dev_info, false);
+ atomic_set(&this->req_seq_no, 1);
+ atomic_set(&this->rsp_seq_no, 1);
+ this->serv.layer.receive = cfctrl_recv;
+ sprintf(this->serv.layer.name, "ctrl");
+ this->serv.layer.ctrlcmd = cfctrl_ctrlcmd;
+#ifndef CAIF_NO_LOOP
+ spin_lock_init(&this->loop_linkid_lock);
+ this->loop_linkid = 1;
+#endif
+ spin_lock_init(&this->info_list_lock);
+ INIT_LIST_HEAD(&this->list);
+ return &this->serv.layer;
+}
+
+void cfctrl_remove(struct cflayer *layer)
+{
+ struct cfctrl_request_info *p, *tmp;
+ struct cfctrl *ctrl = container_obj(layer);
+
+ spin_lock_bh(&ctrl->info_list_lock);
+ list_for_each_entry_safe(p, tmp, &ctrl->list, list) {
+ list_del(&p->list);
+ kfree(p);
+ }
+ spin_unlock_bh(&ctrl->info_list_lock);
+ kfree(layer);
+}
+
+static bool param_eq(const struct cfctrl_link_param *p1,
+ const struct cfctrl_link_param *p2)
+{
+ bool eq =
+ p1->linktype == p2->linktype &&
+ p1->priority == p2->priority &&
+ p1->phyid == p2->phyid &&
+ p1->endpoint == p2->endpoint && p1->chtype == p2->chtype;
+
+ if (!eq)
+ return false;
+
+ switch (p1->linktype) {
+ case CFCTRL_SRV_VEI:
+ return true;
+ case CFCTRL_SRV_DATAGRAM:
+ return p1->u.datagram.connid == p2->u.datagram.connid;
+ case CFCTRL_SRV_RFM:
+ return
+ p1->u.rfm.connid == p2->u.rfm.connid &&
+ strcmp(p1->u.rfm.volume, p2->u.rfm.volume) == 0;
+ case CFCTRL_SRV_UTIL:
+ return
+ p1->u.utility.fifosize_kb == p2->u.utility.fifosize_kb
+ && p1->u.utility.fifosize_bufs ==
+ p2->u.utility.fifosize_bufs
+ && strcmp(p1->u.utility.name, p2->u.utility.name) == 0
+ && p1->u.utility.paramlen == p2->u.utility.paramlen
+ && memcmp(p1->u.utility.params, p2->u.utility.params,
+ p1->u.utility.paramlen) == 0;
+
+ case CFCTRL_SRV_VIDEO:
+ return p1->u.video.connid == p2->u.video.connid;
+ case CFCTRL_SRV_DBG:
+ return true;
+ case CFCTRL_SRV_DECM:
+ return false;
+ default:
+ return false;
+ }
+ return false;
+}
+
+static bool cfctrl_req_eq(const struct cfctrl_request_info *r1,
+ const struct cfctrl_request_info *r2)
+{
+ if (r1->cmd != r2->cmd)
+ return false;
+ if (r1->cmd == CFCTRL_CMD_LINK_SETUP)
+ return param_eq(&r1->param, &r2->param);
+ else
+ return r1->channel_id == r2->channel_id;
+}
+
+/* Insert request at the end */
+static void cfctrl_insert_req(struct cfctrl *ctrl,
+ struct cfctrl_request_info *req)
+{
+ spin_lock_bh(&ctrl->info_list_lock);
+ atomic_inc(&ctrl->req_seq_no);
+ req->sequence_no = atomic_read(&ctrl->req_seq_no);
+ list_add_tail(&req->list, &ctrl->list);
+ spin_unlock_bh(&ctrl->info_list_lock);
+}
+
+/* Compare and remove request */
+static struct cfctrl_request_info *cfctrl_remove_req(struct cfctrl *ctrl,
+ struct cfctrl_request_info *req)
+{
+ struct cfctrl_request_info *p, *tmp, *first;
+
+ first = list_first_entry(&ctrl->list, struct cfctrl_request_info, list);
+
+ list_for_each_entry_safe(p, tmp, &ctrl->list, list) {
+ if (cfctrl_req_eq(req, p)) {
+ if (p != first)
+ pr_warn("Requests are not received in order\n");
+
+ atomic_set(&ctrl->rsp_seq_no,
+ p->sequence_no);
+ list_del(&p->list);
+ goto out;
+ }
+ }
+ p = NULL;
+out:
+ return p;
+}
+
+struct cfctrl_rsp *cfctrl_get_respfuncs(struct cflayer *layer)
+{
+ struct cfctrl *this = container_obj(layer);
+ return &this->res;
+}
+
+static void init_info(struct caif_payload_info *info, struct cfctrl *cfctrl)
+{
+ info->hdr_len = 0;
+ info->channel_id = cfctrl->serv.layer.id;
+ info->dev_info = &cfctrl->serv.dev_info;
+}
+
+void cfctrl_enum_req(struct cflayer *layer, u8 physlinkid)
+{
+ struct cfpkt *pkt;
+ struct cfctrl *cfctrl = container_obj(layer);
+ struct cflayer *dn = cfctrl->serv.layer.dn;
+
+ if (!dn) {
+ pr_debug("not able to send enum request\n");
+ return;
+ }
+ pkt = cfpkt_create(CFPKT_CTRL_PKT_LEN);
+ if (!pkt)
+ return;
+ caif_assert(offsetof(struct cfctrl, serv.layer) == 0);
+ init_info(cfpkt_info(pkt), cfctrl);
+ cfpkt_info(pkt)->dev_info->id = physlinkid;
+ cfctrl->serv.dev_info.id = physlinkid;
+ cfpkt_addbdy(pkt, CFCTRL_CMD_ENUM);
+ cfpkt_addbdy(pkt, physlinkid);
+ cfpkt_set_prio(pkt, TC_PRIO_CONTROL);
+ dn->transmit(dn, pkt);
+}
+
+int cfctrl_linkup_request(struct cflayer *layer,
+ struct cfctrl_link_param *param,
+ struct cflayer *user_layer)
+{
+ struct cfctrl *cfctrl = container_obj(layer);
+ struct cflayer *dn = cfctrl->serv.layer.dn;
+ char utility_name[UTILITY_NAME_LENGTH];
+ struct cfctrl_request_info *req;
+ struct cfpkt *pkt;
+ u32 tmp32;
+ u16 tmp16;
+ u8 tmp8;
+ int ret;
+
+ if (!dn) {
+ pr_debug("not able to send linkup request\n");
+ return -ENODEV;
+ }
+
+ if (cfctrl_cancel_req(layer, user_layer) > 0) {
+ /* Slight Paranoia, check if already connecting */
+ pr_err("Duplicate connect request for same client\n");
+ WARN_ON(1);
+ return -EALREADY;
+ }
+
+ pkt = cfpkt_create(CFPKT_CTRL_PKT_LEN);
+ if (!pkt)
+ return -ENOMEM;
+ cfpkt_addbdy(pkt, CFCTRL_CMD_LINK_SETUP);
+ cfpkt_addbdy(pkt, (param->chtype << 4) | param->linktype);
+ cfpkt_addbdy(pkt, (param->priority << 3) | param->phyid);
+ cfpkt_addbdy(pkt, param->endpoint & 0x03);
+
+ switch (param->linktype) {
+ case CFCTRL_SRV_VEI:
+ break;
+ case CFCTRL_SRV_VIDEO:
+ cfpkt_addbdy(pkt, (u8) param->u.video.connid);
+ break;
+ case CFCTRL_SRV_DBG:
+ break;
+ case CFCTRL_SRV_DATAGRAM:
+ tmp32 = cpu_to_le32(param->u.datagram.connid);
+ cfpkt_add_body(pkt, &tmp32, 4);
+ break;
+ case CFCTRL_SRV_RFM:
+ /* Construct a frame, convert DatagramConnectionID to network
+ * format long and copy it out...
+ */
+ tmp32 = cpu_to_le32(param->u.rfm.connid);
+ cfpkt_add_body(pkt, &tmp32, 4);
+ /* Add volume name, including zero termination... */
+ cfpkt_add_body(pkt, param->u.rfm.volume,
+ strlen(param->u.rfm.volume) + 1);
+ break;
+ case CFCTRL_SRV_UTIL:
+ tmp16 = cpu_to_le16(param->u.utility.fifosize_kb);
+ cfpkt_add_body(pkt, &tmp16, 2);
+ tmp16 = cpu_to_le16(param->u.utility.fifosize_bufs);
+ cfpkt_add_body(pkt, &tmp16, 2);
+ strscpy_pad(utility_name, param->u.utility.name);
+ cfpkt_add_body(pkt, utility_name, UTILITY_NAME_LENGTH);
+ tmp8 = param->u.utility.paramlen;
+ cfpkt_add_body(pkt, &tmp8, 1);
+ cfpkt_add_body(pkt, param->u.utility.params,
+ param->u.utility.paramlen);
+ break;
+ default:
+ pr_warn("Request setup of bad link type = %d\n",
+ param->linktype);
+ cfpkt_destroy(pkt);
+ return -EINVAL;
+ }
+ req = kzalloc(sizeof(*req), GFP_KERNEL);
+ if (!req) {
+ cfpkt_destroy(pkt);
+ return -ENOMEM;
+ }
+
+ req->client_layer = user_layer;
+ req->cmd = CFCTRL_CMD_LINK_SETUP;
+ req->param = *param;
+ cfctrl_insert_req(cfctrl, req);
+ init_info(cfpkt_info(pkt), cfctrl);
+ /*
+ * NOTE:Always send linkup and linkdown request on the same
+ * device as the payload. Otherwise old queued up payload
+ * might arrive with the newly allocated channel ID.
+ */
+ cfpkt_info(pkt)->dev_info->id = param->phyid;
+ cfpkt_set_prio(pkt, TC_PRIO_CONTROL);
+ ret =
+ dn->transmit(dn, pkt);
+ if (ret < 0) {
+ int count;
+
+ count = cfctrl_cancel_req(&cfctrl->serv.layer,
+ user_layer);
+ if (count != 1) {
+ pr_err("Could not remove request (%d)", count);
+ return -ENODEV;
+ }
+ }
+ return 0;
+}
+
+int cfctrl_linkdown_req(struct cflayer *layer, u8 channelid,
+ struct cflayer *client)
+{
+ int ret;
+ struct cfpkt *pkt;
+ struct cfctrl *cfctrl = container_obj(layer);
+ struct cflayer *dn = cfctrl->serv.layer.dn;
+
+ if (!dn) {
+ pr_debug("not able to send link-down request\n");
+ return -ENODEV;
+ }
+ pkt = cfpkt_create(CFPKT_CTRL_PKT_LEN);
+ if (!pkt)
+ return -ENOMEM;
+ cfpkt_addbdy(pkt, CFCTRL_CMD_LINK_DESTROY);
+ cfpkt_addbdy(pkt, channelid);
+ init_info(cfpkt_info(pkt), cfctrl);
+ cfpkt_set_prio(pkt, TC_PRIO_CONTROL);
+ ret =
+ dn->transmit(dn, pkt);
+#ifndef CAIF_NO_LOOP
+ cfctrl->loop_linkused[channelid] = 0;
+#endif
+ return ret;
+}
+
+int cfctrl_cancel_req(struct cflayer *layr, struct cflayer *adap_layer)
+{
+ struct cfctrl_request_info *p, *tmp;
+ struct cfctrl *ctrl = container_obj(layr);
+ int found = 0;
+ spin_lock_bh(&ctrl->info_list_lock);
+
+ list_for_each_entry_safe(p, tmp, &ctrl->list, list) {
+ if (p->client_layer == adap_layer) {
+ list_del(&p->list);
+ kfree(p);
+ found++;
+ }
+ }
+
+ spin_unlock_bh(&ctrl->info_list_lock);
+ return found;
+}
+
+static int cfctrl_link_setup(struct cfctrl *cfctrl, struct cfpkt *pkt, u8 cmdrsp)
+{
+ u8 len;
+ u8 linkid = 0;
+ enum cfctrl_srv serv;
+ enum cfctrl_srv servtype;
+ u8 endpoint;
+ u8 physlinkid;
+ u8 prio;
+ u8 tmp;
+ u8 *cp;
+ int i;
+ struct cfctrl_link_param linkparam;
+ struct cfctrl_request_info rsp, *req;
+
+ memset(&linkparam, 0, sizeof(linkparam));
+
+ tmp = cfpkt_extr_head_u8(pkt);
+
+ serv = tmp & CFCTRL_SRV_MASK;
+ linkparam.linktype = serv;
+
+ servtype = tmp >> 4;
+ linkparam.chtype = servtype;
+
+ tmp = cfpkt_extr_head_u8(pkt);
+ physlinkid = tmp & 0x07;
+ prio = tmp >> 3;
+
+ linkparam.priority = prio;
+ linkparam.phyid = physlinkid;
+ endpoint = cfpkt_extr_head_u8(pkt);
+ linkparam.endpoint = endpoint & 0x03;
+
+ switch (serv) {
+ case CFCTRL_SRV_VEI:
+ case CFCTRL_SRV_DBG:
+ if (CFCTRL_ERR_BIT & cmdrsp)
+ break;
+ /* Link ID */
+ linkid = cfpkt_extr_head_u8(pkt);
+ break;
+ case CFCTRL_SRV_VIDEO:
+ tmp = cfpkt_extr_head_u8(pkt);
+ linkparam.u.video.connid = tmp;
+ if (CFCTRL_ERR_BIT & cmdrsp)
+ break;
+ /* Link ID */
+ linkid = cfpkt_extr_head_u8(pkt);
+ break;
+
+ case CFCTRL_SRV_DATAGRAM:
+ linkparam.u.datagram.connid = cfpkt_extr_head_u32(pkt);
+ if (CFCTRL_ERR_BIT & cmdrsp)
+ break;
+ /* Link ID */
+ linkid = cfpkt_extr_head_u8(pkt);
+ break;
+ case CFCTRL_SRV_RFM:
+ /* Construct a frame, convert
+ * DatagramConnectionID
+ * to network format long and copy it out...
+ */
+ linkparam.u.rfm.connid = cfpkt_extr_head_u32(pkt);
+ cp = (u8 *) linkparam.u.rfm.volume;
+ for (tmp = cfpkt_extr_head_u8(pkt);
+ cfpkt_more(pkt) && tmp != '\0';
+ tmp = cfpkt_extr_head_u8(pkt))
+ *cp++ = tmp;
+ *cp = '\0';
+
+ if (CFCTRL_ERR_BIT & cmdrsp)
+ break;
+ /* Link ID */
+ linkid = cfpkt_extr_head_u8(pkt);
+
+ break;
+ case CFCTRL_SRV_UTIL:
+ /* Construct a frame, convert
+ * DatagramConnectionID
+ * to network format long and copy it out...
+ */
+ /* Fifosize KB */
+ linkparam.u.utility.fifosize_kb = cfpkt_extr_head_u16(pkt);
+ /* Fifosize bufs */
+ linkparam.u.utility.fifosize_bufs = cfpkt_extr_head_u16(pkt);
+ /* name */
+ cp = (u8 *) linkparam.u.utility.name;
+ caif_assert(sizeof(linkparam.u.utility.name)
+ >= UTILITY_NAME_LENGTH);
+ for (i = 0; i < UTILITY_NAME_LENGTH && cfpkt_more(pkt); i++) {
+ tmp = cfpkt_extr_head_u8(pkt);
+ *cp++ = tmp;
+ }
+ /* Length */
+ len = cfpkt_extr_head_u8(pkt);
+ linkparam.u.utility.paramlen = len;
+ /* Param Data */
+ cp = linkparam.u.utility.params;
+ while (cfpkt_more(pkt) && len--) {
+ tmp = cfpkt_extr_head_u8(pkt);
+ *cp++ = tmp;
+ }
+ if (CFCTRL_ERR_BIT & cmdrsp)
+ break;
+ /* Link ID */
+ linkid = cfpkt_extr_head_u8(pkt);
+ /* Length */
+ len = cfpkt_extr_head_u8(pkt);
+ /* Param Data */
+ cfpkt_extr_head(pkt, NULL, len);
+ break;
+ default:
+ pr_warn("Request setup, invalid type (%d)\n", serv);
+ return -1;
+ }
+
+ rsp.cmd = CFCTRL_CMD_LINK_SETUP;
+ rsp.param = linkparam;
+ spin_lock_bh(&cfctrl->info_list_lock);
+ req = cfctrl_remove_req(cfctrl, &rsp);
+
+ if (CFCTRL_ERR_BIT == (CFCTRL_ERR_BIT & cmdrsp) ||
+ cfpkt_erroneous(pkt)) {
+ pr_err("Invalid O/E bit or parse error "
+ "on CAIF control channel\n");
+ cfctrl->res.reject_rsp(cfctrl->serv.layer.up, 0,
+ req ? req->client_layer : NULL);
+ } else {
+ cfctrl->res.linksetup_rsp(cfctrl->serv.layer.up, linkid,
+ serv, physlinkid,
+ req ? req->client_layer : NULL);
+ }
+
+ kfree(req);
+
+ spin_unlock_bh(&cfctrl->info_list_lock);
+
+ return 0;
+}
+
+static int cfctrl_recv(struct cflayer *layer, struct cfpkt *pkt)
+{
+ u8 cmdrsp;
+ u8 cmd;
+ int ret = 0;
+ u8 linkid = 0;
+ struct cfctrl *cfctrl = container_obj(layer);
+
+ cmdrsp = cfpkt_extr_head_u8(pkt);
+ cmd = cmdrsp & CFCTRL_CMD_MASK;
+ if (cmd != CFCTRL_CMD_LINK_ERR
+ && CFCTRL_RSP_BIT != (CFCTRL_RSP_BIT & cmdrsp)
+ && CFCTRL_ERR_BIT != (CFCTRL_ERR_BIT & cmdrsp)) {
+ if (handle_loop(cfctrl, cmd, pkt) != 0)
+ cmdrsp |= CFCTRL_ERR_BIT;
+ }
+
+ switch (cmd) {
+ case CFCTRL_CMD_LINK_SETUP:
+ ret = cfctrl_link_setup(cfctrl, pkt, cmdrsp);
+ break;
+ case CFCTRL_CMD_LINK_DESTROY:
+ linkid = cfpkt_extr_head_u8(pkt);
+ cfctrl->res.linkdestroy_rsp(cfctrl->serv.layer.up, linkid);
+ break;
+ case CFCTRL_CMD_LINK_ERR:
+ pr_err("Frame Error Indication received\n");
+ cfctrl->res.linkerror_ind();
+ break;
+ case CFCTRL_CMD_ENUM:
+ cfctrl->res.enum_rsp();
+ break;
+ case CFCTRL_CMD_SLEEP:
+ cfctrl->res.sleep_rsp();
+ break;
+ case CFCTRL_CMD_WAKE:
+ cfctrl->res.wake_rsp();
+ break;
+ case CFCTRL_CMD_LINK_RECONF:
+ cfctrl->res.restart_rsp();
+ break;
+ case CFCTRL_CMD_RADIO_SET:
+ cfctrl->res.radioset_rsp();
+ break;
+ default:
+ pr_err("Unrecognized Control Frame\n");
+ ret = -1;
+ goto error;
+ }
+error:
+ cfpkt_destroy(pkt);
+ return ret;
+}
+
+static void cfctrl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl,
+ int phyid)
+{
+ struct cfctrl *this = container_obj(layr);
+ switch (ctrl) {
+ case _CAIF_CTRLCMD_PHYIF_FLOW_OFF_IND:
+ case CAIF_CTRLCMD_FLOW_OFF_IND:
+ spin_lock_bh(&this->info_list_lock);
+ if (!list_empty(&this->list))
+ pr_debug("Received flow off in control layer\n");
+ spin_unlock_bh(&this->info_list_lock);
+ break;
+ case _CAIF_CTRLCMD_PHYIF_DOWN_IND: {
+ struct cfctrl_request_info *p, *tmp;
+
+ /* Find all connect request and report failure */
+ spin_lock_bh(&this->info_list_lock);
+ list_for_each_entry_safe(p, tmp, &this->list, list) {
+ if (p->param.phyid == phyid) {
+ list_del(&p->list);
+ p->client_layer->ctrlcmd(p->client_layer,
+ CAIF_CTRLCMD_INIT_FAIL_RSP,
+ phyid);
+ kfree(p);
+ }
+ }
+ spin_unlock_bh(&this->info_list_lock);
+ break;
+ }
+ default:
+ break;
+ }
+}
+
+#ifndef CAIF_NO_LOOP
+static int handle_loop(struct cfctrl *ctrl, int cmd, struct cfpkt *pkt)
+{
+ static int last_linkid;
+ static int dec;
+ u8 linkid, linktype, tmp;
+ switch (cmd) {
+ case CFCTRL_CMD_LINK_SETUP:
+ spin_lock_bh(&ctrl->loop_linkid_lock);
+ if (!dec) {
+ for (linkid = last_linkid + 1; linkid < 254; linkid++)
+ if (!ctrl->loop_linkused[linkid])
+ goto found;
+ }
+ dec = 1;
+ for (linkid = last_linkid - 1; linkid > 1; linkid--)
+ if (!ctrl->loop_linkused[linkid])
+ goto found;
+ spin_unlock_bh(&ctrl->loop_linkid_lock);
+ return -1;
+found:
+ if (linkid < 10)
+ dec = 0;
+
+ if (!ctrl->loop_linkused[linkid])
+ ctrl->loop_linkused[linkid] = 1;
+
+ last_linkid = linkid;
+
+ cfpkt_add_trail(pkt, &linkid, 1);
+ spin_unlock_bh(&ctrl->loop_linkid_lock);
+ cfpkt_peek_head(pkt, &linktype, 1);
+ if (linktype == CFCTRL_SRV_UTIL) {
+ tmp = 0x01;
+ cfpkt_add_trail(pkt, &tmp, 1);
+ cfpkt_add_trail(pkt, &tmp, 1);
+ }
+ break;
+
+ case CFCTRL_CMD_LINK_DESTROY:
+ spin_lock_bh(&ctrl->loop_linkid_lock);
+ cfpkt_peek_head(pkt, &linkid, 1);
+ ctrl->loop_linkused[linkid] = 0;
+ spin_unlock_bh(&ctrl->loop_linkid_lock);
+ break;
+ default:
+ break;
+ }
+ return 0;
+}
+#endif
diff --git a/net/caif/cfdbgl.c b/net/caif/cfdbgl.c
new file mode 100644
index 000000000000..77f428428b47
--- /dev/null
+++ b/net/caif/cfdbgl.c
@@ -0,0 +1,55 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) ST-Ericsson AB 2010
+ * Author: Sjur Brendeland
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
+#include <linux/stddef.h>
+#include <linux/slab.h>
+#include <net/caif/caif_layer.h>
+#include <net/caif/cfsrvl.h>
+#include <net/caif/cfpkt.h>
+
+#define container_obj(layr) ((struct cfsrvl *) layr)
+
+static int cfdbgl_receive(struct cflayer *layr, struct cfpkt *pkt);
+static int cfdbgl_transmit(struct cflayer *layr, struct cfpkt *pkt);
+
+struct cflayer *cfdbgl_create(u8 channel_id, struct dev_info *dev_info)
+{
+ struct cfsrvl *dbg = kzalloc(sizeof(struct cfsrvl), GFP_ATOMIC);
+ if (!dbg)
+ return NULL;
+ caif_assert(offsetof(struct cfsrvl, layer) == 0);
+ cfsrvl_init(dbg, channel_id, dev_info, false);
+ dbg->layer.receive = cfdbgl_receive;
+ dbg->layer.transmit = cfdbgl_transmit;
+ snprintf(dbg->layer.name, CAIF_LAYER_NAME_SZ, "dbg%d", channel_id);
+ return &dbg->layer;
+}
+
+static int cfdbgl_receive(struct cflayer *layr, struct cfpkt *pkt)
+{
+ return layr->up->receive(layr->up, pkt);
+}
+
+static int cfdbgl_transmit(struct cflayer *layr, struct cfpkt *pkt)
+{
+ struct cfsrvl *service = container_obj(layr);
+ struct caif_payload_info *info;
+ int ret;
+
+ if (!cfsrvl_ready(service, &ret)) {
+ cfpkt_destroy(pkt);
+ return ret;
+ }
+
+ /* Add info for MUX-layer to route the packet out */
+ info = cfpkt_info(pkt);
+ info->channel_id = service->layer.id;
+ info->dev_info = &service->dev_info;
+
+ return layr->dn->transmit(layr->dn, pkt);
+}
diff --git a/net/caif/cfdgml.c b/net/caif/cfdgml.c
new file mode 100644
index 000000000000..eb6f8ef47a79
--- /dev/null
+++ b/net/caif/cfdgml.c
@@ -0,0 +1,113 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) ST-Ericsson AB 2010
+ * Author: Sjur Brendeland
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
+#include <linux/stddef.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <net/caif/caif_layer.h>
+#include <net/caif/cfsrvl.h>
+#include <net/caif/cfpkt.h>
+
+
+#define container_obj(layr) ((struct cfsrvl *) layr)
+
+#define DGM_CMD_BIT 0x80
+#define DGM_FLOW_OFF 0x81
+#define DGM_FLOW_ON 0x80
+#define DGM_MTU 1500
+
+static int cfdgml_receive(struct cflayer *layr, struct cfpkt *pkt);
+static int cfdgml_transmit(struct cflayer *layr, struct cfpkt *pkt);
+
+struct cflayer *cfdgml_create(u8 channel_id, struct dev_info *dev_info)
+{
+ struct cfsrvl *dgm = kzalloc(sizeof(struct cfsrvl), GFP_ATOMIC);
+ if (!dgm)
+ return NULL;
+ caif_assert(offsetof(struct cfsrvl, layer) == 0);
+ cfsrvl_init(dgm, channel_id, dev_info, true);
+ dgm->layer.receive = cfdgml_receive;
+ dgm->layer.transmit = cfdgml_transmit;
+ snprintf(dgm->layer.name, CAIF_LAYER_NAME_SZ, "dgm%d", channel_id);
+ return &dgm->layer;
+}
+
+static int cfdgml_receive(struct cflayer *layr, struct cfpkt *pkt)
+{
+ u8 cmd = -1;
+ u8 dgmhdr[3];
+ int ret;
+ caif_assert(layr->up != NULL);
+ caif_assert(layr->receive != NULL);
+ caif_assert(layr->ctrlcmd != NULL);
+
+ if (cfpkt_extr_head(pkt, &cmd, 1) < 0) {
+ pr_err("Packet is erroneous!\n");
+ cfpkt_destroy(pkt);
+ return -EPROTO;
+ }
+
+ if ((cmd & DGM_CMD_BIT) == 0) {
+ if (cfpkt_extr_head(pkt, &dgmhdr, 3) < 0) {
+ pr_err("Packet is erroneous!\n");
+ cfpkt_destroy(pkt);
+ return -EPROTO;
+ }
+ ret = layr->up->receive(layr->up, pkt);
+ return ret;
+ }
+
+ switch (cmd) {
+ case DGM_FLOW_OFF: /* FLOW OFF */
+ layr->ctrlcmd(layr, CAIF_CTRLCMD_FLOW_OFF_IND, 0);
+ cfpkt_destroy(pkt);
+ return 0;
+ case DGM_FLOW_ON: /* FLOW ON */
+ layr->ctrlcmd(layr, CAIF_CTRLCMD_FLOW_ON_IND, 0);
+ cfpkt_destroy(pkt);
+ return 0;
+ default:
+ cfpkt_destroy(pkt);
+ pr_info("Unknown datagram control %d (0x%x)\n", cmd, cmd);
+ return -EPROTO;
+ }
+}
+
+static int cfdgml_transmit(struct cflayer *layr, struct cfpkt *pkt)
+{
+ u8 packet_type;
+ u32 zero = 0;
+ struct caif_payload_info *info;
+ struct cfsrvl *service = container_obj(layr);
+ int ret;
+
+ if (!cfsrvl_ready(service, &ret)) {
+ cfpkt_destroy(pkt);
+ return ret;
+ }
+
+ /* STE Modem cannot handle more than 1500 bytes datagrams */
+ if (cfpkt_getlen(pkt) > DGM_MTU) {
+ cfpkt_destroy(pkt);
+ return -EMSGSIZE;
+ }
+
+ cfpkt_add_head(pkt, &zero, 3);
+ packet_type = 0x08; /* B9 set - UNCLASSIFIED */
+ cfpkt_add_head(pkt, &packet_type, 1);
+
+ /* Add info for MUX-layer to route the packet out. */
+ info = cfpkt_info(pkt);
+ info->channel_id = service->layer.id;
+ /* To optimize alignment, we add up the size of CAIF header
+ * before payload.
+ */
+ info->hdr_len = 4;
+ info->dev_info = &service->dev_info;
+ return layr->dn->transmit(layr->dn, pkt);
+}
diff --git a/net/caif/cffrml.c b/net/caif/cffrml.c
new file mode 100644
index 000000000000..6651a8dc62e0
--- /dev/null
+++ b/net/caif/cffrml.c
@@ -0,0 +1,197 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * CAIF Framing Layer.
+ *
+ * Copyright (C) ST-Ericsson AB 2010
+ * Author: Sjur Brendeland
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
+#include <linux/stddef.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/crc-ccitt.h>
+#include <linux/netdevice.h>
+#include <net/caif/caif_layer.h>
+#include <net/caif/cfpkt.h>
+#include <net/caif/cffrml.h>
+
+#define container_obj(layr) container_of(layr, struct cffrml, layer)
+
+struct cffrml {
+ struct cflayer layer;
+ bool dofcs; /* !< FCS active */
+ int __percpu *pcpu_refcnt;
+};
+
+static int cffrml_receive(struct cflayer *layr, struct cfpkt *pkt);
+static int cffrml_transmit(struct cflayer *layr, struct cfpkt *pkt);
+static void cffrml_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl,
+ int phyid);
+
+static u32 cffrml_rcv_error;
+static u32 cffrml_rcv_checsum_error;
+struct cflayer *cffrml_create(u16 phyid, bool use_fcs)
+{
+ struct cffrml *this = kzalloc(sizeof(struct cffrml), GFP_ATOMIC);
+ if (!this)
+ return NULL;
+ this->pcpu_refcnt = alloc_percpu(int);
+ if (this->pcpu_refcnt == NULL) {
+ kfree(this);
+ return NULL;
+ }
+
+ caif_assert(offsetof(struct cffrml, layer) == 0);
+
+ this->layer.receive = cffrml_receive;
+ this->layer.transmit = cffrml_transmit;
+ this->layer.ctrlcmd = cffrml_ctrlcmd;
+ snprintf(this->layer.name, CAIF_LAYER_NAME_SZ, "frm%d", phyid);
+ this->dofcs = use_fcs;
+ this->layer.id = phyid;
+ return (struct cflayer *) this;
+}
+
+void cffrml_free(struct cflayer *layer)
+{
+ struct cffrml *this = container_obj(layer);
+ free_percpu(this->pcpu_refcnt);
+ kfree(layer);
+}
+
+void cffrml_set_uplayer(struct cflayer *this, struct cflayer *up)
+{
+ this->up = up;
+}
+
+void cffrml_set_dnlayer(struct cflayer *this, struct cflayer *dn)
+{
+ this->dn = dn;
+}
+
+static u16 cffrml_checksum(u16 chks, void *buf, u16 len)
+{
+ /* FIXME: FCS should be moved to glue in order to use OS-Specific
+ * solutions
+ */
+ return crc_ccitt(chks, buf, len);
+}
+
+static int cffrml_receive(struct cflayer *layr, struct cfpkt *pkt)
+{
+ u16 tmp;
+ u16 len;
+ u16 hdrchks;
+ int pktchks;
+ struct cffrml *this;
+ this = container_obj(layr);
+
+ cfpkt_extr_head(pkt, &tmp, 2);
+ len = le16_to_cpu(tmp);
+
+ /* Subtract for FCS on length if FCS is not used. */
+ if (!this->dofcs)
+ len -= 2;
+
+ if (cfpkt_setlen(pkt, len) < 0) {
+ ++cffrml_rcv_error;
+ pr_err("Framing length error (%d)\n", len);
+ cfpkt_destroy(pkt);
+ return -EPROTO;
+ }
+ /*
+ * Don't do extract if FCS is false, rather do setlen - then we don't
+ * get a cache-miss.
+ */
+ if (this->dofcs) {
+ cfpkt_extr_trail(pkt, &tmp, 2);
+ hdrchks = le16_to_cpu(tmp);
+ pktchks = cfpkt_iterate(pkt, cffrml_checksum, 0xffff);
+ if (pktchks != hdrchks) {
+ cfpkt_add_trail(pkt, &tmp, 2);
+ ++cffrml_rcv_error;
+ ++cffrml_rcv_checsum_error;
+ pr_info("Frame checksum error (0x%x != 0x%x)\n",
+ hdrchks, pktchks);
+ return -EILSEQ;
+ }
+ }
+ if (cfpkt_erroneous(pkt)) {
+ ++cffrml_rcv_error;
+ pr_err("Packet is erroneous!\n");
+ cfpkt_destroy(pkt);
+ return -EPROTO;
+ }
+
+ if (layr->up == NULL) {
+ pr_err("Layr up is missing!\n");
+ cfpkt_destroy(pkt);
+ return -EINVAL;
+ }
+
+ return layr->up->receive(layr->up, pkt);
+}
+
+static int cffrml_transmit(struct cflayer *layr, struct cfpkt *pkt)
+{
+ u16 chks;
+ u16 len;
+ __le16 data;
+
+ struct cffrml *this = container_obj(layr);
+ if (this->dofcs) {
+ chks = cfpkt_iterate(pkt, cffrml_checksum, 0xffff);
+ data = cpu_to_le16(chks);
+ cfpkt_add_trail(pkt, &data, 2);
+ } else {
+ cfpkt_pad_trail(pkt, 2);
+ }
+ len = cfpkt_getlen(pkt);
+ data = cpu_to_le16(len);
+ cfpkt_add_head(pkt, &data, 2);
+ cfpkt_info(pkt)->hdr_len += 2;
+ if (cfpkt_erroneous(pkt)) {
+ pr_err("Packet is erroneous!\n");
+ cfpkt_destroy(pkt);
+ return -EPROTO;
+ }
+
+ if (layr->dn == NULL) {
+ cfpkt_destroy(pkt);
+ return -ENODEV;
+
+ }
+ return layr->dn->transmit(layr->dn, pkt);
+}
+
+static void cffrml_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl,
+ int phyid)
+{
+ if (layr->up && layr->up->ctrlcmd)
+ layr->up->ctrlcmd(layr->up, ctrl, layr->id);
+}
+
+void cffrml_put(struct cflayer *layr)
+{
+ struct cffrml *this = container_obj(layr);
+ if (layr != NULL && this->pcpu_refcnt != NULL)
+ this_cpu_dec(*this->pcpu_refcnt);
+}
+
+void cffrml_hold(struct cflayer *layr)
+{
+ struct cffrml *this = container_obj(layr);
+ if (layr != NULL && this->pcpu_refcnt != NULL)
+ this_cpu_inc(*this->pcpu_refcnt);
+}
+
+int cffrml_refcnt_read(struct cflayer *layr)
+{
+ int i, refcnt = 0;
+ struct cffrml *this = container_obj(layr);
+ for_each_possible_cpu(i)
+ refcnt += *per_cpu_ptr(this->pcpu_refcnt, i);
+ return refcnt;
+}
diff --git a/net/caif/cfmuxl.c b/net/caif/cfmuxl.c
new file mode 100644
index 000000000000..4172b0d0db63
--- /dev/null
+++ b/net/caif/cfmuxl.c
@@ -0,0 +1,267 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) ST-Ericsson AB 2010
+ * Author: Sjur Brendeland
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
+#include <linux/stddef.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/rculist.h>
+#include <net/caif/cfpkt.h>
+#include <net/caif/cfmuxl.h>
+#include <net/caif/cfsrvl.h>
+#include <net/caif/cffrml.h>
+
+#define container_obj(layr) container_of(layr, struct cfmuxl, layer)
+
+#define CAIF_CTRL_CHANNEL 0
+#define UP_CACHE_SIZE 8
+#define DN_CACHE_SIZE 8
+
+struct cfmuxl {
+ struct cflayer layer;
+ struct list_head srvl_list;
+ struct list_head frml_list;
+ struct cflayer *up_cache[UP_CACHE_SIZE];
+ struct cflayer *dn_cache[DN_CACHE_SIZE];
+ /*
+ * Set when inserting or removing downwards layers.
+ */
+ spinlock_t transmit_lock;
+
+ /*
+ * Set when inserting or removing upwards layers.
+ */
+ spinlock_t receive_lock;
+
+};
+
+static int cfmuxl_receive(struct cflayer *layr, struct cfpkt *pkt);
+static int cfmuxl_transmit(struct cflayer *layr, struct cfpkt *pkt);
+static void cfmuxl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl,
+ int phyid);
+static struct cflayer *get_up(struct cfmuxl *muxl, u16 id);
+
+struct cflayer *cfmuxl_create(void)
+{
+ struct cfmuxl *this = kzalloc(sizeof(struct cfmuxl), GFP_ATOMIC);
+
+ if (!this)
+ return NULL;
+ this->layer.receive = cfmuxl_receive;
+ this->layer.transmit = cfmuxl_transmit;
+ this->layer.ctrlcmd = cfmuxl_ctrlcmd;
+ INIT_LIST_HEAD(&this->srvl_list);
+ INIT_LIST_HEAD(&this->frml_list);
+ spin_lock_init(&this->transmit_lock);
+ spin_lock_init(&this->receive_lock);
+ snprintf(this->layer.name, CAIF_LAYER_NAME_SZ, "mux");
+ return &this->layer;
+}
+
+int cfmuxl_set_dnlayer(struct cflayer *layr, struct cflayer *dn, u8 phyid)
+{
+ struct cfmuxl *muxl = (struct cfmuxl *) layr;
+
+ spin_lock_bh(&muxl->transmit_lock);
+ list_add_rcu(&dn->node, &muxl->frml_list);
+ spin_unlock_bh(&muxl->transmit_lock);
+ return 0;
+}
+
+static struct cflayer *get_from_id(struct list_head *list, u16 id)
+{
+ struct cflayer *lyr;
+ list_for_each_entry_rcu(lyr, list, node) {
+ if (lyr->id == id)
+ return lyr;
+ }
+
+ return NULL;
+}
+
+int cfmuxl_set_uplayer(struct cflayer *layr, struct cflayer *up, u8 linkid)
+{
+ struct cfmuxl *muxl = container_obj(layr);
+ struct cflayer *old;
+
+ spin_lock_bh(&muxl->receive_lock);
+
+ /* Two entries with same id is wrong, so remove old layer from mux */
+ old = get_from_id(&muxl->srvl_list, linkid);
+ if (old != NULL)
+ list_del_rcu(&old->node);
+
+ list_add_rcu(&up->node, &muxl->srvl_list);
+ spin_unlock_bh(&muxl->receive_lock);
+
+ return 0;
+}
+
+struct cflayer *cfmuxl_remove_dnlayer(struct cflayer *layr, u8 phyid)
+{
+ struct cfmuxl *muxl = container_obj(layr);
+ struct cflayer *dn;
+ int idx = phyid % DN_CACHE_SIZE;
+
+ spin_lock_bh(&muxl->transmit_lock);
+ RCU_INIT_POINTER(muxl->dn_cache[idx], NULL);
+ dn = get_from_id(&muxl->frml_list, phyid);
+ if (dn == NULL)
+ goto out;
+
+ list_del_rcu(&dn->node);
+ caif_assert(dn != NULL);
+out:
+ spin_unlock_bh(&muxl->transmit_lock);
+ return dn;
+}
+
+static struct cflayer *get_up(struct cfmuxl *muxl, u16 id)
+{
+ struct cflayer *up;
+ int idx = id % UP_CACHE_SIZE;
+ up = rcu_dereference(muxl->up_cache[idx]);
+ if (up == NULL || up->id != id) {
+ spin_lock_bh(&muxl->receive_lock);
+ up = get_from_id(&muxl->srvl_list, id);
+ rcu_assign_pointer(muxl->up_cache[idx], up);
+ spin_unlock_bh(&muxl->receive_lock);
+ }
+ return up;
+}
+
+static struct cflayer *get_dn(struct cfmuxl *muxl, struct dev_info *dev_info)
+{
+ struct cflayer *dn;
+ int idx = dev_info->id % DN_CACHE_SIZE;
+ dn = rcu_dereference(muxl->dn_cache[idx]);
+ if (dn == NULL || dn->id != dev_info->id) {
+ spin_lock_bh(&muxl->transmit_lock);
+ dn = get_from_id(&muxl->frml_list, dev_info->id);
+ rcu_assign_pointer(muxl->dn_cache[idx], dn);
+ spin_unlock_bh(&muxl->transmit_lock);
+ }
+ return dn;
+}
+
+struct cflayer *cfmuxl_remove_uplayer(struct cflayer *layr, u8 id)
+{
+ struct cflayer *up;
+ struct cfmuxl *muxl = container_obj(layr);
+ int idx = id % UP_CACHE_SIZE;
+
+ if (id == 0) {
+ pr_warn("Trying to remove control layer\n");
+ return NULL;
+ }
+
+ spin_lock_bh(&muxl->receive_lock);
+ up = get_from_id(&muxl->srvl_list, id);
+ if (up == NULL)
+ goto out;
+
+ RCU_INIT_POINTER(muxl->up_cache[idx], NULL);
+ list_del_rcu(&up->node);
+out:
+ spin_unlock_bh(&muxl->receive_lock);
+ return up;
+}
+
+static int cfmuxl_receive(struct cflayer *layr, struct cfpkt *pkt)
+{
+ int ret;
+ struct cfmuxl *muxl = container_obj(layr);
+ u8 id;
+ struct cflayer *up;
+ if (cfpkt_extr_head(pkt, &id, 1) < 0) {
+ pr_err("erroneous Caif Packet\n");
+ cfpkt_destroy(pkt);
+ return -EPROTO;
+ }
+ rcu_read_lock();
+ up = get_up(muxl, id);
+
+ if (up == NULL) {
+ pr_debug("Received data on unknown link ID = %d (0x%x)"
+ " up == NULL", id, id);
+ cfpkt_destroy(pkt);
+ /*
+ * Don't return ERROR, since modem misbehaves and sends out
+ * flow on before linksetup response.
+ */
+
+ rcu_read_unlock();
+ return /* CFGLU_EPROT; */ 0;
+ }
+
+ /* We can't hold rcu_lock during receive, so take a ref count instead */
+ cfsrvl_get(up);
+ rcu_read_unlock();
+
+ ret = up->receive(up, pkt);
+
+ cfsrvl_put(up);
+ return ret;
+}
+
+static int cfmuxl_transmit(struct cflayer *layr, struct cfpkt *pkt)
+{
+ struct cfmuxl *muxl = container_obj(layr);
+ int err;
+ u8 linkid;
+ struct cflayer *dn;
+ struct caif_payload_info *info = cfpkt_info(pkt);
+ BUG_ON(!info);
+
+ rcu_read_lock();
+
+ dn = get_dn(muxl, info->dev_info);
+ if (dn == NULL) {
+ pr_debug("Send data on unknown phy ID = %d (0x%x)\n",
+ info->dev_info->id, info->dev_info->id);
+ rcu_read_unlock();
+ cfpkt_destroy(pkt);
+ return -ENOTCONN;
+ }
+
+ info->hdr_len += 1;
+ linkid = info->channel_id;
+ cfpkt_add_head(pkt, &linkid, 1);
+
+ /* We can't hold rcu_lock during receive, so take a ref count instead */
+ cffrml_hold(dn);
+
+ rcu_read_unlock();
+
+ err = dn->transmit(dn, pkt);
+
+ cffrml_put(dn);
+ return err;
+}
+
+static void cfmuxl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl,
+ int phyid)
+{
+ struct cfmuxl *muxl = container_obj(layr);
+ struct cflayer *layer;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(layer, &muxl->srvl_list, node) {
+
+ if (cfsrvl_phyid_match(layer, phyid) && layer->ctrlcmd) {
+
+ if ((ctrl == _CAIF_CTRLCMD_PHYIF_DOWN_IND ||
+ ctrl == CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND) &&
+ layer->id != 0)
+ cfmuxl_remove_uplayer(layr, layer->id);
+
+ /* NOTE: ctrlcmd is not allowed to block */
+ layer->ctrlcmd(layer, ctrl, phyid);
+ }
+ }
+ rcu_read_unlock();
+}
diff --git a/net/caif/cfpkt_skbuff.c b/net/caif/cfpkt_skbuff.c
new file mode 100644
index 000000000000..96236d21b18e
--- /dev/null
+++ b/net/caif/cfpkt_skbuff.c
@@ -0,0 +1,373 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) ST-Ericsson AB 2010
+ * Author: Sjur Brendeland
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
+#include <linux/string.h>
+#include <linux/skbuff.h>
+#include <linux/export.h>
+#include <net/caif/cfpkt.h>
+
+#define PKT_PREFIX 48
+#define PKT_POSTFIX 2
+#define PKT_LEN_WHEN_EXTENDING 128
+#define PKT_ERROR(pkt, errmsg) \
+do { \
+ cfpkt_priv(pkt)->erronous = true; \
+ skb_reset_tail_pointer(&pkt->skb); \
+ pr_warn(errmsg); \
+} while (0)
+
+/*
+ * net/caif/ is generic and does not
+ * understand SKB, so we do this typecast
+ */
+struct cfpkt {
+ struct sk_buff skb;
+};
+
+/* Private data inside SKB */
+struct cfpkt_priv_data {
+ struct dev_info dev_info;
+ bool erronous;
+};
+
+static inline struct cfpkt_priv_data *cfpkt_priv(struct cfpkt *pkt)
+{
+ return (struct cfpkt_priv_data *) pkt->skb.cb;
+}
+
+static inline bool is_erronous(struct cfpkt *pkt)
+{
+ return cfpkt_priv(pkt)->erronous;
+}
+
+static inline struct sk_buff *pkt_to_skb(struct cfpkt *pkt)
+{
+ return &pkt->skb;
+}
+
+static inline struct cfpkt *skb_to_pkt(struct sk_buff *skb)
+{
+ return (struct cfpkt *) skb;
+}
+
+struct cfpkt *cfpkt_fromnative(enum caif_direction dir, void *nativepkt)
+{
+ struct cfpkt *pkt = skb_to_pkt(nativepkt);
+ cfpkt_priv(pkt)->erronous = false;
+ return pkt;
+}
+EXPORT_SYMBOL(cfpkt_fromnative);
+
+void *cfpkt_tonative(struct cfpkt *pkt)
+{
+ return (void *) pkt;
+}
+EXPORT_SYMBOL(cfpkt_tonative);
+
+static struct cfpkt *cfpkt_create_pfx(u16 len, u16 pfx)
+{
+ struct sk_buff *skb;
+
+ skb = alloc_skb(len + pfx, GFP_ATOMIC);
+ if (unlikely(skb == NULL))
+ return NULL;
+
+ skb_reserve(skb, pfx);
+ return skb_to_pkt(skb);
+}
+
+inline struct cfpkt *cfpkt_create(u16 len)
+{
+ return cfpkt_create_pfx(len + PKT_POSTFIX, PKT_PREFIX);
+}
+
+void cfpkt_destroy(struct cfpkt *pkt)
+{
+ struct sk_buff *skb = pkt_to_skb(pkt);
+ kfree_skb(skb);
+}
+
+inline bool cfpkt_more(struct cfpkt *pkt)
+{
+ struct sk_buff *skb = pkt_to_skb(pkt);
+ return skb->len > 0;
+}
+
+int cfpkt_peek_head(struct cfpkt *pkt, void *data, u16 len)
+{
+ struct sk_buff *skb = pkt_to_skb(pkt);
+ if (skb_headlen(skb) >= len) {
+ memcpy(data, skb->data, len);
+ return 0;
+ }
+ return !cfpkt_extr_head(pkt, data, len) &&
+ !cfpkt_add_head(pkt, data, len);
+}
+
+int cfpkt_extr_head(struct cfpkt *pkt, void *data, u16 len)
+{
+ struct sk_buff *skb = pkt_to_skb(pkt);
+ u8 *from;
+ if (unlikely(is_erronous(pkt)))
+ return -EPROTO;
+
+ if (unlikely(len > skb->len)) {
+ PKT_ERROR(pkt, "read beyond end of packet\n");
+ return -EPROTO;
+ }
+
+ if (unlikely(len > skb_headlen(skb))) {
+ if (unlikely(skb_linearize(skb) != 0)) {
+ PKT_ERROR(pkt, "linearize failed\n");
+ return -EPROTO;
+ }
+ }
+ from = skb_pull(skb, len);
+ from -= len;
+ if (data)
+ memcpy(data, from, len);
+ return 0;
+}
+EXPORT_SYMBOL(cfpkt_extr_head);
+
+int cfpkt_extr_trail(struct cfpkt *pkt, void *dta, u16 len)
+{
+ struct sk_buff *skb = pkt_to_skb(pkt);
+ u8 *data = dta;
+ u8 *from;
+ if (unlikely(is_erronous(pkt)))
+ return -EPROTO;
+
+ if (unlikely(skb_linearize(skb) != 0)) {
+ PKT_ERROR(pkt, "linearize failed\n");
+ return -EPROTO;
+ }
+ if (unlikely(skb->data + len > skb_tail_pointer(skb))) {
+ PKT_ERROR(pkt, "read beyond end of packet\n");
+ return -EPROTO;
+ }
+ from = skb_tail_pointer(skb) - len;
+ skb_trim(skb, skb->len - len);
+ memcpy(data, from, len);
+ return 0;
+}
+
+int cfpkt_pad_trail(struct cfpkt *pkt, u16 len)
+{
+ return cfpkt_add_body(pkt, NULL, len);
+}
+
+int cfpkt_add_body(struct cfpkt *pkt, const void *data, u16 len)
+{
+ struct sk_buff *skb = pkt_to_skb(pkt);
+ struct sk_buff *lastskb;
+ u8 *to;
+ u16 addlen = 0;
+
+
+ if (unlikely(is_erronous(pkt)))
+ return -EPROTO;
+
+ lastskb = skb;
+
+ /* Check whether we need to add space at the tail */
+ if (unlikely(skb_tailroom(skb) < len)) {
+ if (likely(len < PKT_LEN_WHEN_EXTENDING))
+ addlen = PKT_LEN_WHEN_EXTENDING;
+ else
+ addlen = len;
+ }
+
+ /* Check whether we need to change the SKB before writing to the tail */
+ if (unlikely((addlen > 0) || skb_cloned(skb) || skb_shared(skb))) {
+
+ /* Make sure data is writable */
+ if (unlikely(skb_cow_data(skb, addlen, &lastskb) < 0)) {
+ PKT_ERROR(pkt, "cow failed\n");
+ return -EPROTO;
+ }
+ }
+
+ /* All set to put the last SKB and optionally write data there. */
+ to = pskb_put(skb, lastskb, len);
+ if (likely(data))
+ memcpy(to, data, len);
+ return 0;
+}
+
+inline int cfpkt_addbdy(struct cfpkt *pkt, u8 data)
+{
+ return cfpkt_add_body(pkt, &data, 1);
+}
+
+int cfpkt_add_head(struct cfpkt *pkt, const void *data2, u16 len)
+{
+ struct sk_buff *skb = pkt_to_skb(pkt);
+ struct sk_buff *lastskb;
+ u8 *to;
+ const u8 *data = data2;
+ int ret;
+ if (unlikely(is_erronous(pkt)))
+ return -EPROTO;
+ if (unlikely(skb_headroom(skb) < len)) {
+ PKT_ERROR(pkt, "no headroom\n");
+ return -EPROTO;
+ }
+
+ /* Make sure data is writable */
+ ret = skb_cow_data(skb, 0, &lastskb);
+ if (unlikely(ret < 0)) {
+ PKT_ERROR(pkt, "cow failed\n");
+ return ret;
+ }
+
+ to = skb_push(skb, len);
+ memcpy(to, data, len);
+ return 0;
+}
+EXPORT_SYMBOL(cfpkt_add_head);
+
+inline int cfpkt_add_trail(struct cfpkt *pkt, const void *data, u16 len)
+{
+ return cfpkt_add_body(pkt, data, len);
+}
+
+inline u16 cfpkt_getlen(struct cfpkt *pkt)
+{
+ struct sk_buff *skb = pkt_to_skb(pkt);
+ return skb->len;
+}
+
+int cfpkt_iterate(struct cfpkt *pkt,
+ u16 (*iter_func)(u16, void *, u16),
+ u16 data)
+{
+ /*
+ * Don't care about the performance hit of linearizing,
+ * Checksum should not be used on high-speed interfaces anyway.
+ */
+ if (unlikely(is_erronous(pkt)))
+ return -EPROTO;
+ if (unlikely(skb_linearize(&pkt->skb) != 0)) {
+ PKT_ERROR(pkt, "linearize failed\n");
+ return -EPROTO;
+ }
+ return iter_func(data, pkt->skb.data, cfpkt_getlen(pkt));
+}
+
+int cfpkt_setlen(struct cfpkt *pkt, u16 len)
+{
+ struct sk_buff *skb = pkt_to_skb(pkt);
+
+
+ if (unlikely(is_erronous(pkt)))
+ return -EPROTO;
+
+ if (likely(len <= skb->len)) {
+ if (unlikely(skb->data_len))
+ ___pskb_trim(skb, len);
+ else
+ skb_trim(skb, len);
+
+ return cfpkt_getlen(pkt);
+ }
+
+ /* Need to expand SKB */
+ if (unlikely(!cfpkt_pad_trail(pkt, len - skb->len)))
+ PKT_ERROR(pkt, "skb_pad_trail failed\n");
+
+ return cfpkt_getlen(pkt);
+}
+
+struct cfpkt *cfpkt_append(struct cfpkt *dstpkt,
+ struct cfpkt *addpkt,
+ u16 expectlen)
+{
+ struct sk_buff *dst = pkt_to_skb(dstpkt);
+ struct sk_buff *add = pkt_to_skb(addpkt);
+ u16 addlen = skb_headlen(add);
+ u16 neededtailspace;
+ struct sk_buff *tmp;
+ u16 dstlen;
+ u16 createlen;
+ if (unlikely(is_erronous(dstpkt) || is_erronous(addpkt))) {
+ return dstpkt;
+ }
+
+ neededtailspace = max(expectlen, addlen);
+
+ if (dst->tail + neededtailspace > dst->end) {
+ /* Create a dumplicate of 'dst' with more tail space */
+ struct cfpkt *tmppkt;
+ dstlen = skb_headlen(dst);
+ createlen = dstlen + neededtailspace;
+ tmppkt = cfpkt_create(createlen + PKT_PREFIX + PKT_POSTFIX);
+ if (tmppkt == NULL)
+ return NULL;
+ tmp = pkt_to_skb(tmppkt);
+ skb_put_data(tmp, dst->data, dstlen);
+ cfpkt_destroy(dstpkt);
+ dst = tmp;
+ }
+ skb_put_data(dst, add->data, skb_headlen(add));
+ cfpkt_destroy(addpkt);
+ return skb_to_pkt(dst);
+}
+
+struct cfpkt *cfpkt_split(struct cfpkt *pkt, u16 pos)
+{
+ struct sk_buff *skb2;
+ struct sk_buff *skb = pkt_to_skb(pkt);
+ struct cfpkt *tmppkt;
+ u8 *split = skb->data + pos;
+ u16 len2nd = skb_tail_pointer(skb) - split;
+
+ if (unlikely(is_erronous(pkt)))
+ return NULL;
+
+ if (skb->data + pos > skb_tail_pointer(skb)) {
+ PKT_ERROR(pkt, "trying to split beyond end of packet\n");
+ return NULL;
+ }
+
+ /* Create a new packet for the second part of the data */
+ tmppkt = cfpkt_create_pfx(len2nd + PKT_PREFIX + PKT_POSTFIX,
+ PKT_PREFIX);
+ if (tmppkt == NULL)
+ return NULL;
+ skb2 = pkt_to_skb(tmppkt);
+
+
+ if (skb2 == NULL)
+ return NULL;
+
+ skb_put_data(skb2, split, len2nd);
+
+ /* Reduce the length of the original packet */
+ skb_trim(skb, pos);
+
+ skb2->priority = skb->priority;
+ return skb_to_pkt(skb2);
+}
+
+bool cfpkt_erroneous(struct cfpkt *pkt)
+{
+ return cfpkt_priv(pkt)->erronous;
+}
+
+struct caif_payload_info *cfpkt_info(struct cfpkt *pkt)
+{
+ return (struct caif_payload_info *)&pkt_to_skb(pkt)->cb;
+}
+EXPORT_SYMBOL(cfpkt_info);
+
+void cfpkt_set_prio(struct cfpkt *pkt, int prio)
+{
+ pkt_to_skb(pkt)->priority = prio;
+}
+EXPORT_SYMBOL(cfpkt_set_prio);
diff --git a/net/caif/cfrfml.c b/net/caif/cfrfml.c
new file mode 100644
index 000000000000..3c335057f255
--- /dev/null
+++ b/net/caif/cfrfml.c
@@ -0,0 +1,299 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) ST-Ericsson AB 2010
+ * Author: Sjur Brendeland
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
+#include <linux/stddef.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/unaligned.h>
+#include <net/caif/caif_layer.h>
+#include <net/caif/cfsrvl.h>
+#include <net/caif/cfpkt.h>
+
+#define container_obj(layr) container_of(layr, struct cfrfml, serv.layer)
+#define RFM_SEGMENTATION_BIT 0x01
+#define RFM_HEAD_SIZE 7
+
+static int cfrfml_receive(struct cflayer *layr, struct cfpkt *pkt);
+static int cfrfml_transmit(struct cflayer *layr, struct cfpkt *pkt);
+
+struct cfrfml {
+ struct cfsrvl serv;
+ struct cfpkt *incomplete_frm;
+ int fragment_size;
+ u8 seghead[6];
+ u16 pdu_size;
+ /* Protects serialized processing of packets */
+ spinlock_t sync;
+};
+
+static void cfrfml_release(struct cflayer *layer)
+{
+ struct cfsrvl *srvl = container_of(layer, struct cfsrvl, layer);
+ struct cfrfml *rfml = container_obj(&srvl->layer);
+
+ if (rfml->incomplete_frm)
+ cfpkt_destroy(rfml->incomplete_frm);
+
+ kfree(srvl);
+}
+
+struct cflayer *cfrfml_create(u8 channel_id, struct dev_info *dev_info,
+ int mtu_size)
+{
+ int tmp;
+ struct cfrfml *this = kzalloc(sizeof(struct cfrfml), GFP_ATOMIC);
+
+ if (!this)
+ return NULL;
+
+ cfsrvl_init(&this->serv, channel_id, dev_info, false);
+ this->serv.release = cfrfml_release;
+ this->serv.layer.receive = cfrfml_receive;
+ this->serv.layer.transmit = cfrfml_transmit;
+
+ /* Round down to closest multiple of 16 */
+ tmp = (mtu_size - RFM_HEAD_SIZE - 6) / 16;
+ tmp *= 16;
+
+ this->fragment_size = tmp;
+ spin_lock_init(&this->sync);
+ snprintf(this->serv.layer.name, CAIF_LAYER_NAME_SZ,
+ "rfm%d", channel_id);
+
+ return &this->serv.layer;
+}
+
+static struct cfpkt *rfm_append(struct cfrfml *rfml, char *seghead,
+ struct cfpkt *pkt, int *err)
+{
+ struct cfpkt *tmppkt;
+ *err = -EPROTO;
+ /* n-th but not last segment */
+
+ if (cfpkt_extr_head(pkt, seghead, 6) < 0)
+ return NULL;
+
+ /* Verify correct header */
+ if (memcmp(seghead, rfml->seghead, 6) != 0)
+ return NULL;
+
+ tmppkt = cfpkt_append(rfml->incomplete_frm, pkt,
+ rfml->pdu_size + RFM_HEAD_SIZE);
+
+ /* If cfpkt_append failes input pkts are not freed */
+ *err = -ENOMEM;
+ if (tmppkt == NULL)
+ return NULL;
+
+ *err = 0;
+ return tmppkt;
+}
+
+static int cfrfml_receive(struct cflayer *layr, struct cfpkt *pkt)
+{
+ u8 tmp;
+ bool segmented;
+ int err;
+ u8 seghead[6];
+ struct cfrfml *rfml;
+ struct cfpkt *tmppkt = NULL;
+
+ caif_assert(layr->up != NULL);
+ caif_assert(layr->receive != NULL);
+ rfml = container_obj(layr);
+ spin_lock(&rfml->sync);
+
+ err = -EPROTO;
+ if (cfpkt_extr_head(pkt, &tmp, 1) < 0)
+ goto out;
+ segmented = tmp & RFM_SEGMENTATION_BIT;
+
+ if (segmented) {
+ if (rfml->incomplete_frm == NULL) {
+ /* Initial Segment */
+ if (cfpkt_peek_head(pkt, rfml->seghead, 6) != 0)
+ goto out;
+
+ rfml->pdu_size = get_unaligned_le16(rfml->seghead+4);
+
+ if (cfpkt_erroneous(pkt))
+ goto out;
+ rfml->incomplete_frm = pkt;
+ pkt = NULL;
+ } else {
+
+ tmppkt = rfm_append(rfml, seghead, pkt, &err);
+ if (tmppkt == NULL)
+ goto out;
+
+ if (cfpkt_erroneous(tmppkt))
+ goto out;
+
+ rfml->incomplete_frm = tmppkt;
+
+
+ if (cfpkt_erroneous(tmppkt))
+ goto out;
+ }
+ err = 0;
+ goto out;
+ }
+
+ if (rfml->incomplete_frm) {
+
+ /* Last Segment */
+ tmppkt = rfm_append(rfml, seghead, pkt, &err);
+ if (tmppkt == NULL)
+ goto out;
+
+ if (cfpkt_erroneous(tmppkt))
+ goto out;
+
+ rfml->incomplete_frm = NULL;
+ pkt = tmppkt;
+ tmppkt = NULL;
+
+ /* Verify that length is correct */
+ err = -EPROTO;
+ if (rfml->pdu_size != cfpkt_getlen(pkt) - RFM_HEAD_SIZE + 1)
+ goto out;
+ }
+
+ err = rfml->serv.layer.up->receive(rfml->serv.layer.up, pkt);
+
+out:
+
+ if (err != 0) {
+ if (tmppkt)
+ cfpkt_destroy(tmppkt);
+ if (pkt)
+ cfpkt_destroy(pkt);
+ if (rfml->incomplete_frm)
+ cfpkt_destroy(rfml->incomplete_frm);
+ rfml->incomplete_frm = NULL;
+
+ pr_info("Connection error %d triggered on RFM link\n", err);
+
+ /* Trigger connection error upon failure.*/
+ layr->up->ctrlcmd(layr->up, CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND,
+ rfml->serv.dev_info.id);
+ }
+ spin_unlock(&rfml->sync);
+
+ if (unlikely(err == -EAGAIN))
+ /* It is not possible to recover after drop of a fragment */
+ err = -EIO;
+
+ return err;
+}
+
+
+static int cfrfml_transmit_segment(struct cfrfml *rfml, struct cfpkt *pkt)
+{
+ caif_assert(cfpkt_getlen(pkt) < rfml->fragment_size + RFM_HEAD_SIZE);
+
+ /* Add info for MUX-layer to route the packet out. */
+ cfpkt_info(pkt)->channel_id = rfml->serv.layer.id;
+
+ /*
+ * To optimize alignment, we add up the size of CAIF header before
+ * payload.
+ */
+ cfpkt_info(pkt)->hdr_len = RFM_HEAD_SIZE;
+ cfpkt_info(pkt)->dev_info = &rfml->serv.dev_info;
+
+ return rfml->serv.layer.dn->transmit(rfml->serv.layer.dn, pkt);
+}
+
+static int cfrfml_transmit(struct cflayer *layr, struct cfpkt *pkt)
+{
+ int err;
+ u8 seg;
+ u8 head[6];
+ struct cfpkt *rearpkt = NULL;
+ struct cfpkt *frontpkt = pkt;
+ struct cfrfml *rfml = container_obj(layr);
+
+ caif_assert(layr->dn != NULL);
+ caif_assert(layr->dn->transmit != NULL);
+
+ if (!cfsrvl_ready(&rfml->serv, &err))
+ goto out;
+
+ err = -EPROTO;
+ if (cfpkt_getlen(pkt) <= RFM_HEAD_SIZE-1)
+ goto out;
+
+ err = 0;
+ if (cfpkt_getlen(pkt) > rfml->fragment_size + RFM_HEAD_SIZE)
+ err = cfpkt_peek_head(pkt, head, 6);
+
+ if (err != 0)
+ goto out;
+
+ while (cfpkt_getlen(frontpkt) > rfml->fragment_size + RFM_HEAD_SIZE) {
+
+ seg = 1;
+ err = -EPROTO;
+
+ if (cfpkt_add_head(frontpkt, &seg, 1) < 0)
+ goto out;
+ /*
+ * On OOM error cfpkt_split returns NULL.
+ *
+ * NOTE: Segmented pdu is not correctly aligned.
+ * This has negative performance impact.
+ */
+
+ rearpkt = cfpkt_split(frontpkt, rfml->fragment_size);
+ if (rearpkt == NULL)
+ goto out;
+
+ err = cfrfml_transmit_segment(rfml, frontpkt);
+
+ if (err != 0) {
+ frontpkt = NULL;
+ goto out;
+ }
+
+ frontpkt = rearpkt;
+ rearpkt = NULL;
+
+ err = -EPROTO;
+ if (cfpkt_add_head(frontpkt, head, 6) < 0)
+ goto out;
+
+ }
+
+ seg = 0;
+ err = -EPROTO;
+
+ if (cfpkt_add_head(frontpkt, &seg, 1) < 0)
+ goto out;
+
+ err = cfrfml_transmit_segment(rfml, frontpkt);
+
+ frontpkt = NULL;
+out:
+
+ if (err != 0) {
+ pr_info("Connection error %d triggered on RFM link\n", err);
+ /* Trigger connection error upon failure.*/
+
+ layr->up->ctrlcmd(layr->up, CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND,
+ rfml->serv.dev_info.id);
+
+ if (rearpkt)
+ cfpkt_destroy(rearpkt);
+
+ if (frontpkt)
+ cfpkt_destroy(frontpkt);
+ }
+
+ return err;
+}
diff --git a/net/caif/cfserl.c b/net/caif/cfserl.c
new file mode 100644
index 000000000000..aee11c74d3c8
--- /dev/null
+++ b/net/caif/cfserl.c
@@ -0,0 +1,192 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) ST-Ericsson AB 2010
+ * Author: Sjur Brendeland
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
+#include <linux/stddef.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <net/caif/caif_layer.h>
+#include <net/caif/cfpkt.h>
+#include <net/caif/cfserl.h>
+
+#define container_obj(layr) ((struct cfserl *) layr)
+
+#define CFSERL_STX 0x02
+#define SERIAL_MINIUM_PACKET_SIZE 4
+#define SERIAL_MAX_FRAMESIZE 4096
+struct cfserl {
+ struct cflayer layer;
+ struct cfpkt *incomplete_frm;
+ /* Protects parallel processing of incoming packets */
+ spinlock_t sync;
+ bool usestx;
+};
+
+static int cfserl_receive(struct cflayer *layr, struct cfpkt *pkt);
+static int cfserl_transmit(struct cflayer *layr, struct cfpkt *pkt);
+static void cfserl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl,
+ int phyid);
+
+void cfserl_release(struct cflayer *layer)
+{
+ kfree(layer);
+}
+
+struct cflayer *cfserl_create(int instance, bool use_stx)
+{
+ struct cfserl *this = kzalloc(sizeof(struct cfserl), GFP_ATOMIC);
+ if (!this)
+ return NULL;
+ caif_assert(offsetof(struct cfserl, layer) == 0);
+ this->layer.receive = cfserl_receive;
+ this->layer.transmit = cfserl_transmit;
+ this->layer.ctrlcmd = cfserl_ctrlcmd;
+ this->usestx = use_stx;
+ spin_lock_init(&this->sync);
+ snprintf(this->layer.name, CAIF_LAYER_NAME_SZ, "ser1");
+ return &this->layer;
+}
+
+static int cfserl_receive(struct cflayer *l, struct cfpkt *newpkt)
+{
+ struct cfserl *layr = container_obj(l);
+ u16 pkt_len;
+ struct cfpkt *pkt = NULL;
+ struct cfpkt *tail_pkt = NULL;
+ u8 tmp8;
+ u16 tmp;
+ u8 stx = CFSERL_STX;
+ int ret;
+ u16 expectlen = 0;
+
+ caif_assert(newpkt != NULL);
+ spin_lock(&layr->sync);
+
+ if (layr->incomplete_frm != NULL) {
+ layr->incomplete_frm =
+ cfpkt_append(layr->incomplete_frm, newpkt, expectlen);
+ pkt = layr->incomplete_frm;
+ if (pkt == NULL) {
+ spin_unlock(&layr->sync);
+ return -ENOMEM;
+ }
+ } else {
+ pkt = newpkt;
+ }
+ layr->incomplete_frm = NULL;
+
+ do {
+ /* Search for STX at start of pkt if STX is used */
+ if (layr->usestx) {
+ cfpkt_extr_head(pkt, &tmp8, 1);
+ if (tmp8 != CFSERL_STX) {
+ while (cfpkt_more(pkt)
+ && tmp8 != CFSERL_STX) {
+ cfpkt_extr_head(pkt, &tmp8, 1);
+ }
+ if (!cfpkt_more(pkt)) {
+ cfpkt_destroy(pkt);
+ layr->incomplete_frm = NULL;
+ spin_unlock(&layr->sync);
+ return -EPROTO;
+ }
+ }
+ }
+
+ pkt_len = cfpkt_getlen(pkt);
+
+ /*
+ * pkt_len is the accumulated length of the packet data
+ * we have received so far.
+ * Exit if frame doesn't hold length.
+ */
+
+ if (pkt_len < 2) {
+ if (layr->usestx)
+ cfpkt_add_head(pkt, &stx, 1);
+ layr->incomplete_frm = pkt;
+ spin_unlock(&layr->sync);
+ return 0;
+ }
+
+ /*
+ * Find length of frame.
+ * expectlen is the length we need for a full frame.
+ */
+ cfpkt_peek_head(pkt, &tmp, 2);
+ expectlen = le16_to_cpu(tmp) + 2;
+ /*
+ * Frame error handling
+ */
+ if (expectlen < SERIAL_MINIUM_PACKET_SIZE
+ || expectlen > SERIAL_MAX_FRAMESIZE) {
+ if (!layr->usestx) {
+ if (pkt != NULL)
+ cfpkt_destroy(pkt);
+ layr->incomplete_frm = NULL;
+ spin_unlock(&layr->sync);
+ return -EPROTO;
+ }
+ continue;
+ }
+
+ if (pkt_len < expectlen) {
+ /* Too little received data */
+ if (layr->usestx)
+ cfpkt_add_head(pkt, &stx, 1);
+ layr->incomplete_frm = pkt;
+ spin_unlock(&layr->sync);
+ return 0;
+ }
+
+ /*
+ * Enough data for at least one frame.
+ * Split the frame, if too long
+ */
+ if (pkt_len > expectlen)
+ tail_pkt = cfpkt_split(pkt, expectlen);
+ else
+ tail_pkt = NULL;
+
+ /* Send the first part of packet upwards.*/
+ spin_unlock(&layr->sync);
+ ret = layr->layer.up->receive(layr->layer.up, pkt);
+ spin_lock(&layr->sync);
+ if (ret == -EILSEQ) {
+ if (layr->usestx) {
+ if (tail_pkt != NULL)
+ pkt = cfpkt_append(pkt, tail_pkt, 0);
+ /* Start search for next STX if frame failed */
+ continue;
+ } else {
+ cfpkt_destroy(pkt);
+ pkt = NULL;
+ }
+ }
+
+ pkt = tail_pkt;
+
+ } while (pkt != NULL);
+
+ spin_unlock(&layr->sync);
+ return 0;
+}
+
+static int cfserl_transmit(struct cflayer *layer, struct cfpkt *newpkt)
+{
+ struct cfserl *layr = container_obj(layer);
+ u8 tmp8 = CFSERL_STX;
+ if (layr->usestx)
+ cfpkt_add_head(newpkt, &tmp8, 1);
+ return layer->dn->transmit(layer->dn, newpkt);
+}
+
+static void cfserl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl,
+ int phyid)
+{
+ layr->up->ctrlcmd(layr->up, ctrl, phyid);
+}
diff --git a/net/caif/cfsrvl.c b/net/caif/cfsrvl.c
new file mode 100644
index 000000000000..171fa32ada85
--- /dev/null
+++ b/net/caif/cfsrvl.c
@@ -0,0 +1,214 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) ST-Ericsson AB 2010
+ * Author: Sjur Brendeland
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/pkt_sched.h>
+#include <net/caif/caif_layer.h>
+#include <net/caif/cfsrvl.h>
+#include <net/caif/cfpkt.h>
+#include <net/caif/caif_dev.h>
+
+#define SRVL_CTRL_PKT_SIZE 1
+#define SRVL_FLOW_OFF 0x81
+#define SRVL_FLOW_ON 0x80
+#define SRVL_SET_PIN 0x82
+
+#define container_obj(layr) container_of(layr, struct cfsrvl, layer)
+
+static void cfservl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl,
+ int phyid)
+{
+ struct cfsrvl *service = container_obj(layr);
+
+ if (layr->up == NULL || layr->up->ctrlcmd == NULL)
+ return;
+
+ switch (ctrl) {
+ case CAIF_CTRLCMD_INIT_RSP:
+ service->open = true;
+ layr->up->ctrlcmd(layr->up, ctrl, phyid);
+ break;
+ case CAIF_CTRLCMD_DEINIT_RSP:
+ case CAIF_CTRLCMD_INIT_FAIL_RSP:
+ service->open = false;
+ layr->up->ctrlcmd(layr->up, ctrl, phyid);
+ break;
+ case _CAIF_CTRLCMD_PHYIF_FLOW_OFF_IND:
+ if (phyid != service->dev_info.id)
+ break;
+ if (service->modem_flow_on)
+ layr->up->ctrlcmd(layr->up,
+ CAIF_CTRLCMD_FLOW_OFF_IND, phyid);
+ service->phy_flow_on = false;
+ break;
+ case _CAIF_CTRLCMD_PHYIF_FLOW_ON_IND:
+ if (phyid != service->dev_info.id)
+ return;
+ if (service->modem_flow_on) {
+ layr->up->ctrlcmd(layr->up,
+ CAIF_CTRLCMD_FLOW_ON_IND,
+ phyid);
+ }
+ service->phy_flow_on = true;
+ break;
+ case CAIF_CTRLCMD_FLOW_OFF_IND:
+ if (service->phy_flow_on) {
+ layr->up->ctrlcmd(layr->up,
+ CAIF_CTRLCMD_FLOW_OFF_IND, phyid);
+ }
+ service->modem_flow_on = false;
+ break;
+ case CAIF_CTRLCMD_FLOW_ON_IND:
+ if (service->phy_flow_on) {
+ layr->up->ctrlcmd(layr->up,
+ CAIF_CTRLCMD_FLOW_ON_IND, phyid);
+ }
+ service->modem_flow_on = true;
+ break;
+ case _CAIF_CTRLCMD_PHYIF_DOWN_IND:
+ /* In case interface is down, let's fake a remove shutdown */
+ layr->up->ctrlcmd(layr->up,
+ CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND, phyid);
+ break;
+ case CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND:
+ layr->up->ctrlcmd(layr->up, ctrl, phyid);
+ break;
+ default:
+ pr_warn("Unexpected ctrl in cfsrvl (%d)\n", ctrl);
+ /* We have both modem and phy flow on, send flow on */
+ layr->up->ctrlcmd(layr->up, ctrl, phyid);
+ service->phy_flow_on = true;
+ break;
+ }
+}
+
+static int cfservl_modemcmd(struct cflayer *layr, enum caif_modemcmd ctrl)
+{
+ struct cfsrvl *service = container_obj(layr);
+
+ caif_assert(layr != NULL);
+ caif_assert(layr->dn != NULL);
+ caif_assert(layr->dn->transmit != NULL);
+
+ if (!service->supports_flowctrl)
+ return 0;
+
+ switch (ctrl) {
+ case CAIF_MODEMCMD_FLOW_ON_REQ:
+ {
+ struct cfpkt *pkt;
+ struct caif_payload_info *info;
+ u8 flow_on = SRVL_FLOW_ON;
+ pkt = cfpkt_create(SRVL_CTRL_PKT_SIZE);
+ if (!pkt)
+ return -ENOMEM;
+
+ if (cfpkt_add_head(pkt, &flow_on, 1) < 0) {
+ pr_err("Packet is erroneous!\n");
+ cfpkt_destroy(pkt);
+ return -EPROTO;
+ }
+ info = cfpkt_info(pkt);
+ info->channel_id = service->layer.id;
+ info->hdr_len = 1;
+ info->dev_info = &service->dev_info;
+ cfpkt_set_prio(pkt, TC_PRIO_CONTROL);
+ return layr->dn->transmit(layr->dn, pkt);
+ }
+ case CAIF_MODEMCMD_FLOW_OFF_REQ:
+ {
+ struct cfpkt *pkt;
+ struct caif_payload_info *info;
+ u8 flow_off = SRVL_FLOW_OFF;
+ pkt = cfpkt_create(SRVL_CTRL_PKT_SIZE);
+ if (!pkt)
+ return -ENOMEM;
+
+ if (cfpkt_add_head(pkt, &flow_off, 1) < 0) {
+ pr_err("Packet is erroneous!\n");
+ cfpkt_destroy(pkt);
+ return -EPROTO;
+ }
+ info = cfpkt_info(pkt);
+ info->channel_id = service->layer.id;
+ info->hdr_len = 1;
+ info->dev_info = &service->dev_info;
+ cfpkt_set_prio(pkt, TC_PRIO_CONTROL);
+ return layr->dn->transmit(layr->dn, pkt);
+ }
+ default:
+ break;
+ }
+ return -EINVAL;
+}
+
+static void cfsrvl_release(struct cflayer *layer)
+{
+ struct cfsrvl *service = container_of(layer, struct cfsrvl, layer);
+ kfree(service);
+}
+
+void cfsrvl_init(struct cfsrvl *service,
+ u8 channel_id,
+ struct dev_info *dev_info,
+ bool supports_flowctrl)
+{
+ caif_assert(offsetof(struct cfsrvl, layer) == 0);
+ service->open = false;
+ service->modem_flow_on = true;
+ service->phy_flow_on = true;
+ service->layer.id = channel_id;
+ service->layer.ctrlcmd = cfservl_ctrlcmd;
+ service->layer.modemcmd = cfservl_modemcmd;
+ service->dev_info = *dev_info;
+ service->supports_flowctrl = supports_flowctrl;
+ service->release = cfsrvl_release;
+}
+
+bool cfsrvl_ready(struct cfsrvl *service, int *err)
+{
+ if (!service->open) {
+ *err = -ENOTCONN;
+ return false;
+ }
+ return true;
+}
+
+bool cfsrvl_phyid_match(struct cflayer *layer, int phyid)
+{
+ struct cfsrvl *servl = container_obj(layer);
+ return servl->dev_info.id == phyid;
+}
+
+void caif_free_client(struct cflayer *adap_layer)
+{
+ struct cfsrvl *servl;
+ if (adap_layer == NULL || adap_layer->dn == NULL)
+ return;
+ servl = container_obj(adap_layer->dn);
+ servl->release(&servl->layer);
+}
+EXPORT_SYMBOL(caif_free_client);
+
+void caif_client_register_refcnt(struct cflayer *adapt_layer,
+ void (*hold)(struct cflayer *lyr),
+ void (*put)(struct cflayer *lyr))
+{
+ struct cfsrvl *service;
+
+ if (WARN_ON(adapt_layer == NULL || adapt_layer->dn == NULL))
+ return;
+ service = container_of(adapt_layer->dn, struct cfsrvl, layer);
+ service->hold = hold;
+ service->put = put;
+}
+EXPORT_SYMBOL(caif_client_register_refcnt);
diff --git a/net/caif/cfutill.c b/net/caif/cfutill.c
new file mode 100644
index 000000000000..b2e47ede912f
--- /dev/null
+++ b/net/caif/cfutill.c
@@ -0,0 +1,104 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) ST-Ericsson AB 2010
+ * Author: Sjur Brendeland
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/errno.h>
+#include <net/caif/caif_layer.h>
+#include <net/caif/cfsrvl.h>
+#include <net/caif/cfpkt.h>
+
+#define container_obj(layr) ((struct cfsrvl *) layr)
+#define UTIL_PAYLOAD 0x00
+#define UTIL_CMD_BIT 0x80
+#define UTIL_REMOTE_SHUTDOWN 0x82
+#define UTIL_FLOW_OFF 0x81
+#define UTIL_FLOW_ON 0x80
+
+static int cfutill_receive(struct cflayer *layr, struct cfpkt *pkt);
+static int cfutill_transmit(struct cflayer *layr, struct cfpkt *pkt);
+
+struct cflayer *cfutill_create(u8 channel_id, struct dev_info *dev_info)
+{
+ struct cfsrvl *util = kzalloc(sizeof(struct cfsrvl), GFP_ATOMIC);
+ if (!util)
+ return NULL;
+ caif_assert(offsetof(struct cfsrvl, layer) == 0);
+ cfsrvl_init(util, channel_id, dev_info, true);
+ util->layer.receive = cfutill_receive;
+ util->layer.transmit = cfutill_transmit;
+ snprintf(util->layer.name, CAIF_LAYER_NAME_SZ, "util1");
+ return &util->layer;
+}
+
+static int cfutill_receive(struct cflayer *layr, struct cfpkt *pkt)
+{
+ u8 cmd = -1;
+ struct cfsrvl *service = container_obj(layr);
+ caif_assert(layr != NULL);
+ caif_assert(layr->up != NULL);
+ caif_assert(layr->up->receive != NULL);
+ caif_assert(layr->up->ctrlcmd != NULL);
+ if (cfpkt_extr_head(pkt, &cmd, 1) < 0) {
+ pr_err("Packet is erroneous!\n");
+ cfpkt_destroy(pkt);
+ return -EPROTO;
+ }
+
+ switch (cmd) {
+ case UTIL_PAYLOAD:
+ return layr->up->receive(layr->up, pkt);
+ case UTIL_FLOW_OFF:
+ layr->ctrlcmd(layr, CAIF_CTRLCMD_FLOW_OFF_IND, 0);
+ cfpkt_destroy(pkt);
+ return 0;
+ case UTIL_FLOW_ON:
+ layr->ctrlcmd(layr, CAIF_CTRLCMD_FLOW_ON_IND, 0);
+ cfpkt_destroy(pkt);
+ return 0;
+ case UTIL_REMOTE_SHUTDOWN: /* Remote Shutdown Request */
+ pr_err("REMOTE SHUTDOWN REQUEST RECEIVED\n");
+ layr->ctrlcmd(layr, CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND, 0);
+ service->open = false;
+ cfpkt_destroy(pkt);
+ return 0;
+ default:
+ cfpkt_destroy(pkt);
+ pr_warn("Unknown service control %d (0x%x)\n", cmd, cmd);
+ return -EPROTO;
+ }
+}
+
+static int cfutill_transmit(struct cflayer *layr, struct cfpkt *pkt)
+{
+ u8 zero = 0;
+ struct caif_payload_info *info;
+ int ret;
+ struct cfsrvl *service = container_obj(layr);
+ caif_assert(layr != NULL);
+ caif_assert(layr->dn != NULL);
+ caif_assert(layr->dn->transmit != NULL);
+
+ if (!cfsrvl_ready(service, &ret)) {
+ cfpkt_destroy(pkt);
+ return ret;
+ }
+
+ cfpkt_add_head(pkt, &zero, 1);
+ /* Add info for MUX-layer to route the packet out. */
+ info = cfpkt_info(pkt);
+ info->channel_id = service->layer.id;
+ /*
+ * To optimize alignment, we add up the size of CAIF header before
+ * payload.
+ */
+ info->hdr_len = 1;
+ info->dev_info = &service->dev_info;
+ return layr->dn->transmit(layr->dn, pkt);
+}
diff --git a/net/caif/cfveil.c b/net/caif/cfveil.c
new file mode 100644
index 000000000000..db2274b94a5d
--- /dev/null
+++ b/net/caif/cfveil.c
@@ -0,0 +1,101 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) ST-Ericsson AB 2010
+ * Author: Sjur Brendeland
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
+#include <linux/stddef.h>
+#include <linux/slab.h>
+#include <net/caif/caif_layer.h>
+#include <net/caif/cfsrvl.h>
+#include <net/caif/cfpkt.h>
+
+#define VEI_PAYLOAD 0x00
+#define VEI_CMD_BIT 0x80
+#define VEI_FLOW_OFF 0x81
+#define VEI_FLOW_ON 0x80
+#define VEI_SET_PIN 0x82
+
+#define container_obj(layr) container_of(layr, struct cfsrvl, layer)
+
+static int cfvei_receive(struct cflayer *layr, struct cfpkt *pkt);
+static int cfvei_transmit(struct cflayer *layr, struct cfpkt *pkt);
+
+struct cflayer *cfvei_create(u8 channel_id, struct dev_info *dev_info)
+{
+ struct cfsrvl *vei = kzalloc(sizeof(struct cfsrvl), GFP_ATOMIC);
+ if (!vei)
+ return NULL;
+ caif_assert(offsetof(struct cfsrvl, layer) == 0);
+ cfsrvl_init(vei, channel_id, dev_info, true);
+ vei->layer.receive = cfvei_receive;
+ vei->layer.transmit = cfvei_transmit;
+ snprintf(vei->layer.name, CAIF_LAYER_NAME_SZ, "vei%d", channel_id);
+ return &vei->layer;
+}
+
+static int cfvei_receive(struct cflayer *layr, struct cfpkt *pkt)
+{
+ u8 cmd;
+ int ret;
+ caif_assert(layr->up != NULL);
+ caif_assert(layr->receive != NULL);
+ caif_assert(layr->ctrlcmd != NULL);
+
+
+ if (cfpkt_extr_head(pkt, &cmd, 1) < 0) {
+ pr_err("Packet is erroneous!\n");
+ cfpkt_destroy(pkt);
+ return -EPROTO;
+ }
+ switch (cmd) {
+ case VEI_PAYLOAD:
+ ret = layr->up->receive(layr->up, pkt);
+ return ret;
+ case VEI_FLOW_OFF:
+ layr->ctrlcmd(layr, CAIF_CTRLCMD_FLOW_OFF_IND, 0);
+ cfpkt_destroy(pkt);
+ return 0;
+ case VEI_FLOW_ON:
+ layr->ctrlcmd(layr, CAIF_CTRLCMD_FLOW_ON_IND, 0);
+ cfpkt_destroy(pkt);
+ return 0;
+ case VEI_SET_PIN: /* SET RS232 PIN */
+ cfpkt_destroy(pkt);
+ return 0;
+ default: /* SET RS232 PIN */
+ pr_warn("Unknown VEI control packet %d (0x%x)!\n", cmd, cmd);
+ cfpkt_destroy(pkt);
+ return -EPROTO;
+ }
+}
+
+static int cfvei_transmit(struct cflayer *layr, struct cfpkt *pkt)
+{
+ u8 tmp = 0;
+ struct caif_payload_info *info;
+ int ret;
+ struct cfsrvl *service = container_obj(layr);
+ if (!cfsrvl_ready(service, &ret))
+ goto err;
+ caif_assert(layr->dn != NULL);
+ caif_assert(layr->dn->transmit != NULL);
+
+ if (cfpkt_add_head(pkt, &tmp, 1) < 0) {
+ pr_err("Packet is erroneous!\n");
+ ret = -EPROTO;
+ goto err;
+ }
+
+ /* Add info-> for MUX-layer to route the packet out. */
+ info = cfpkt_info(pkt);
+ info->channel_id = service->layer.id;
+ info->hdr_len = 1;
+ info->dev_info = &service->dev_info;
+ return layr->dn->transmit(layr->dn, pkt);
+err:
+ cfpkt_destroy(pkt);
+ return ret;
+}
diff --git a/net/caif/cfvidl.c b/net/caif/cfvidl.c
new file mode 100644
index 000000000000..134bad43196c
--- /dev/null
+++ b/net/caif/cfvidl.c
@@ -0,0 +1,65 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) ST-Ericsson AB 2010
+ * Author: Sjur Brendeland
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/errno.h>
+#include <net/caif/caif_layer.h>
+#include <net/caif/cfsrvl.h>
+#include <net/caif/cfpkt.h>
+
+#define container_obj(layr) ((struct cfsrvl *) layr)
+
+static int cfvidl_receive(struct cflayer *layr, struct cfpkt *pkt);
+static int cfvidl_transmit(struct cflayer *layr, struct cfpkt *pkt);
+
+struct cflayer *cfvidl_create(u8 channel_id, struct dev_info *dev_info)
+{
+ struct cfsrvl *vid = kzalloc(sizeof(struct cfsrvl), GFP_ATOMIC);
+ if (!vid)
+ return NULL;
+ caif_assert(offsetof(struct cfsrvl, layer) == 0);
+
+ cfsrvl_init(vid, channel_id, dev_info, false);
+ vid->layer.receive = cfvidl_receive;
+ vid->layer.transmit = cfvidl_transmit;
+ snprintf(vid->layer.name, CAIF_LAYER_NAME_SZ, "vid1");
+ return &vid->layer;
+}
+
+static int cfvidl_receive(struct cflayer *layr, struct cfpkt *pkt)
+{
+ u32 videoheader;
+ if (cfpkt_extr_head(pkt, &videoheader, 4) < 0) {
+ pr_err("Packet is erroneous!\n");
+ cfpkt_destroy(pkt);
+ return -EPROTO;
+ }
+ return layr->up->receive(layr->up, pkt);
+}
+
+static int cfvidl_transmit(struct cflayer *layr, struct cfpkt *pkt)
+{
+ struct cfsrvl *service = container_obj(layr);
+ struct caif_payload_info *info;
+ u32 videoheader = 0;
+ int ret;
+
+ if (!cfsrvl_ready(service, &ret)) {
+ cfpkt_destroy(pkt);
+ return ret;
+ }
+
+ cfpkt_add_head(pkt, &videoheader, 4);
+ /* Add info for MUX-layer to route the packet out */
+ info = cfpkt_info(pkt);
+ info->channel_id = service->layer.id;
+ info->dev_info = &service->dev_info;
+ return layr->dn->transmit(layr->dn, pkt);
+}
diff --git a/net/caif/chnl_net.c b/net/caif/chnl_net.c
new file mode 100644
index 000000000000..fa6a3c2634a8
--- /dev/null
+++ b/net/caif/chnl_net.c
@@ -0,0 +1,531 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) ST-Ericsson AB 2010
+ * Authors: Sjur Brendeland
+ * Daniel Martensson
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/if_ether.h>
+#include <linux/ip.h>
+#include <linux/sched.h>
+#include <linux/sockios.h>
+#include <linux/caif/if_caif.h>
+#include <net/rtnetlink.h>
+#include <net/caif/caif_layer.h>
+#include <net/caif/cfpkt.h>
+#include <net/caif/caif_dev.h>
+
+/* GPRS PDP connection has MTU to 1500 */
+#define GPRS_PDP_MTU 1500
+/* 5 sec. connect timeout */
+#define CONNECT_TIMEOUT (5 * HZ)
+#define CAIF_NET_DEFAULT_QUEUE_LEN 500
+#define UNDEF_CONNID 0xffffffff
+
+/*This list is protected by the rtnl lock. */
+static LIST_HEAD(chnl_net_list);
+
+MODULE_DESCRIPTION("ST-Ericsson CAIF modem protocol GPRS network device");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_RTNL_LINK("caif");
+
+enum caif_states {
+ CAIF_CONNECTED = 1,
+ CAIF_CONNECTING,
+ CAIF_DISCONNECTED,
+ CAIF_SHUTDOWN
+};
+
+struct chnl_net {
+ struct cflayer chnl;
+ struct caif_connect_request conn_req;
+ struct list_head list_field;
+ struct net_device *netdev;
+ wait_queue_head_t netmgmt_wq;
+ /* Flow status to remember and control the transmission. */
+ bool flowenabled;
+ enum caif_states state;
+};
+
+static int chnl_recv_cb(struct cflayer *layr, struct cfpkt *pkt)
+{
+ struct sk_buff *skb;
+ struct chnl_net *priv;
+ int pktlen;
+ const u8 *ip_version;
+ u8 buf;
+
+ priv = container_of(layr, struct chnl_net, chnl);
+
+ skb = (struct sk_buff *) cfpkt_tonative(pkt);
+
+ /* Get length of CAIF packet. */
+ pktlen = skb->len;
+
+ /* Pass some minimum information and
+ * send the packet to the net stack.
+ */
+ skb->dev = priv->netdev;
+
+ /* check the version of IP */
+ ip_version = skb_header_pointer(skb, 0, 1, &buf);
+ if (!ip_version) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
+ switch (*ip_version >> 4) {
+ case 4:
+ skb->protocol = htons(ETH_P_IP);
+ break;
+ case 6:
+ skb->protocol = htons(ETH_P_IPV6);
+ break;
+ default:
+ kfree_skb(skb);
+ priv->netdev->stats.rx_errors++;
+ return -EINVAL;
+ }
+
+ /* If we change the header in loop mode, the checksum is corrupted. */
+ if (priv->conn_req.protocol == CAIFPROTO_DATAGRAM_LOOP)
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ else
+ skb->ip_summed = CHECKSUM_NONE;
+
+ netif_rx(skb);
+
+ /* Update statistics. */
+ priv->netdev->stats.rx_packets++;
+ priv->netdev->stats.rx_bytes += pktlen;
+
+ return 0;
+}
+
+static int delete_device(struct chnl_net *dev)
+{
+ ASSERT_RTNL();
+ if (dev->netdev)
+ unregister_netdevice(dev->netdev);
+ return 0;
+}
+
+static void close_work(struct work_struct *work)
+{
+ struct chnl_net *dev = NULL;
+ struct list_head *list_node;
+ struct list_head *_tmp;
+
+ rtnl_lock();
+ list_for_each_safe(list_node, _tmp, &chnl_net_list) {
+ dev = list_entry(list_node, struct chnl_net, list_field);
+ if (dev->state == CAIF_SHUTDOWN)
+ dev_close(dev->netdev);
+ }
+ rtnl_unlock();
+}
+static DECLARE_WORK(close_worker, close_work);
+
+static void chnl_hold(struct cflayer *lyr)
+{
+ struct chnl_net *priv = container_of(lyr, struct chnl_net, chnl);
+ dev_hold(priv->netdev);
+}
+
+static void chnl_put(struct cflayer *lyr)
+{
+ struct chnl_net *priv = container_of(lyr, struct chnl_net, chnl);
+ dev_put(priv->netdev);
+}
+
+static void chnl_flowctrl_cb(struct cflayer *layr, enum caif_ctrlcmd flow,
+ int phyid)
+{
+ struct chnl_net *priv = container_of(layr, struct chnl_net, chnl);
+ pr_debug("NET flowctrl func called flow: %s\n",
+ flow == CAIF_CTRLCMD_FLOW_ON_IND ? "ON" :
+ flow == CAIF_CTRLCMD_INIT_RSP ? "INIT" :
+ flow == CAIF_CTRLCMD_FLOW_OFF_IND ? "OFF" :
+ flow == CAIF_CTRLCMD_DEINIT_RSP ? "CLOSE/DEINIT" :
+ flow == CAIF_CTRLCMD_INIT_FAIL_RSP ? "OPEN_FAIL" :
+ flow == CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND ?
+ "REMOTE_SHUTDOWN" : "UNKNOWN CTRL COMMAND");
+
+
+
+ switch (flow) {
+ case CAIF_CTRLCMD_FLOW_OFF_IND:
+ priv->flowenabled = false;
+ netif_stop_queue(priv->netdev);
+ break;
+ case CAIF_CTRLCMD_DEINIT_RSP:
+ priv->state = CAIF_DISCONNECTED;
+ break;
+ case CAIF_CTRLCMD_INIT_FAIL_RSP:
+ priv->state = CAIF_DISCONNECTED;
+ wake_up_interruptible(&priv->netmgmt_wq);
+ break;
+ case CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND:
+ priv->state = CAIF_SHUTDOWN;
+ netif_tx_disable(priv->netdev);
+ schedule_work(&close_worker);
+ break;
+ case CAIF_CTRLCMD_FLOW_ON_IND:
+ priv->flowenabled = true;
+ netif_wake_queue(priv->netdev);
+ break;
+ case CAIF_CTRLCMD_INIT_RSP:
+ caif_client_register_refcnt(&priv->chnl, chnl_hold, chnl_put);
+ priv->state = CAIF_CONNECTED;
+ priv->flowenabled = true;
+ netif_wake_queue(priv->netdev);
+ wake_up_interruptible(&priv->netmgmt_wq);
+ break;
+ default:
+ break;
+ }
+}
+
+static netdev_tx_t chnl_net_start_xmit(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ struct chnl_net *priv;
+ struct cfpkt *pkt = NULL;
+ int len;
+ int result = -1;
+ /* Get our private data. */
+ priv = netdev_priv(dev);
+
+ if (skb->len > priv->netdev->mtu) {
+ pr_warn("Size of skb exceeded MTU\n");
+ kfree_skb(skb);
+ dev->stats.tx_errors++;
+ return NETDEV_TX_OK;
+ }
+
+ if (!priv->flowenabled) {
+ pr_debug("dropping packets flow off\n");
+ kfree_skb(skb);
+ dev->stats.tx_dropped++;
+ return NETDEV_TX_OK;
+ }
+
+ if (priv->conn_req.protocol == CAIFPROTO_DATAGRAM_LOOP)
+ swap(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
+
+ /* Store original SKB length. */
+ len = skb->len;
+
+ pkt = cfpkt_fromnative(CAIF_DIR_OUT, (void *) skb);
+
+ /* Send the packet down the stack. */
+ result = priv->chnl.dn->transmit(priv->chnl.dn, pkt);
+ if (result) {
+ dev->stats.tx_dropped++;
+ return NETDEV_TX_OK;
+ }
+
+ /* Update statistics. */
+ dev->stats.tx_packets++;
+ dev->stats.tx_bytes += len;
+
+ return NETDEV_TX_OK;
+}
+
+static int chnl_net_open(struct net_device *dev)
+{
+ struct chnl_net *priv = NULL;
+ int result = -1;
+ int llifindex, headroom, tailroom, mtu;
+ struct net_device *lldev;
+ ASSERT_RTNL();
+ priv = netdev_priv(dev);
+ if (!priv) {
+ pr_debug("chnl_net_open: no priv\n");
+ return -ENODEV;
+ }
+
+ if (priv->state != CAIF_CONNECTING) {
+ priv->state = CAIF_CONNECTING;
+ result = caif_connect_client(dev_net(dev), &priv->conn_req,
+ &priv->chnl, &llifindex,
+ &headroom, &tailroom);
+ if (result != 0) {
+ pr_debug("err: "
+ "Unable to register and open device,"
+ " Err:%d\n",
+ result);
+ goto error;
+ }
+
+ lldev = __dev_get_by_index(dev_net(dev), llifindex);
+
+ if (lldev == NULL) {
+ pr_debug("no interface?\n");
+ result = -ENODEV;
+ goto error;
+ }
+
+ dev->needed_tailroom = tailroom + lldev->needed_tailroom;
+ dev->hard_header_len = headroom + lldev->hard_header_len +
+ lldev->needed_tailroom;
+
+ /*
+ * MTU, head-room etc is not know before we have a
+ * CAIF link layer device available. MTU calculation may
+ * override initial RTNL configuration.
+ * MTU is minimum of current mtu, link layer mtu pluss
+ * CAIF head and tail, and PDP GPRS contexts max MTU.
+ */
+ mtu = min_t(int, dev->mtu, lldev->mtu - (headroom + tailroom));
+ mtu = min_t(int, GPRS_PDP_MTU, mtu);
+ dev_set_mtu(dev, mtu);
+
+ if (mtu < 100) {
+ pr_warn("CAIF Interface MTU too small (%d)\n", mtu);
+ result = -ENODEV;
+ goto error;
+ }
+ }
+
+ rtnl_unlock(); /* Release RTNL lock during connect wait */
+
+ result = wait_event_interruptible_timeout(priv->netmgmt_wq,
+ priv->state != CAIF_CONNECTING,
+ CONNECT_TIMEOUT);
+
+ rtnl_lock();
+
+ if (result == -ERESTARTSYS) {
+ pr_debug("wait_event_interruptible woken by a signal\n");
+ result = -ERESTARTSYS;
+ goto error;
+ }
+
+ if (result == 0) {
+ pr_debug("connect timeout\n");
+ result = -ETIMEDOUT;
+ goto error;
+ }
+
+ if (priv->state != CAIF_CONNECTED) {
+ pr_debug("connect failed\n");
+ result = -ECONNREFUSED;
+ goto error;
+ }
+ pr_debug("CAIF Netdevice connected\n");
+ return 0;
+
+error:
+ caif_disconnect_client(dev_net(dev), &priv->chnl);
+ priv->state = CAIF_DISCONNECTED;
+ pr_debug("state disconnected\n");
+ return result;
+
+}
+
+static int chnl_net_stop(struct net_device *dev)
+{
+ struct chnl_net *priv;
+
+ ASSERT_RTNL();
+ priv = netdev_priv(dev);
+ priv->state = CAIF_DISCONNECTED;
+ caif_disconnect_client(dev_net(dev), &priv->chnl);
+ return 0;
+}
+
+static int chnl_net_init(struct net_device *dev)
+{
+ struct chnl_net *priv;
+ ASSERT_RTNL();
+ priv = netdev_priv(dev);
+ INIT_LIST_HEAD(&priv->list_field);
+ return 0;
+}
+
+static void chnl_net_uninit(struct net_device *dev)
+{
+ struct chnl_net *priv;
+ ASSERT_RTNL();
+ priv = netdev_priv(dev);
+ list_del_init(&priv->list_field);
+}
+
+static const struct net_device_ops netdev_ops = {
+ .ndo_open = chnl_net_open,
+ .ndo_stop = chnl_net_stop,
+ .ndo_init = chnl_net_init,
+ .ndo_uninit = chnl_net_uninit,
+ .ndo_start_xmit = chnl_net_start_xmit,
+};
+
+static void chnl_net_destructor(struct net_device *dev)
+{
+ struct chnl_net *priv = netdev_priv(dev);
+ caif_free_client(&priv->chnl);
+}
+
+static void ipcaif_net_setup(struct net_device *dev)
+{
+ struct chnl_net *priv;
+ dev->netdev_ops = &netdev_ops;
+ dev->needs_free_netdev = true;
+ dev->priv_destructor = chnl_net_destructor;
+ dev->flags |= IFF_NOARP;
+ dev->flags |= IFF_POINTOPOINT;
+ dev->mtu = GPRS_PDP_MTU;
+ dev->tx_queue_len = CAIF_NET_DEFAULT_QUEUE_LEN;
+
+ priv = netdev_priv(dev);
+ priv->chnl.receive = chnl_recv_cb;
+ priv->chnl.ctrlcmd = chnl_flowctrl_cb;
+ priv->netdev = dev;
+ priv->conn_req.protocol = CAIFPROTO_DATAGRAM;
+ priv->conn_req.link_selector = CAIF_LINK_HIGH_BANDW;
+ priv->conn_req.priority = CAIF_PRIO_LOW;
+ /* Insert illegal value */
+ priv->conn_req.sockaddr.u.dgm.connection_id = UNDEF_CONNID;
+ priv->flowenabled = false;
+
+ init_waitqueue_head(&priv->netmgmt_wq);
+}
+
+
+static int ipcaif_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{
+ struct chnl_net *priv;
+ u8 loop;
+ priv = netdev_priv(dev);
+ if (nla_put_u32(skb, IFLA_CAIF_IPV4_CONNID,
+ priv->conn_req.sockaddr.u.dgm.connection_id) ||
+ nla_put_u32(skb, IFLA_CAIF_IPV6_CONNID,
+ priv->conn_req.sockaddr.u.dgm.connection_id))
+ goto nla_put_failure;
+ loop = priv->conn_req.protocol == CAIFPROTO_DATAGRAM_LOOP;
+ if (nla_put_u8(skb, IFLA_CAIF_LOOPBACK, loop))
+ goto nla_put_failure;
+ return 0;
+nla_put_failure:
+ return -EMSGSIZE;
+
+}
+
+static void caif_netlink_parms(struct nlattr *data[],
+ struct caif_connect_request *conn_req)
+{
+ if (!data) {
+ pr_warn("no params data found\n");
+ return;
+ }
+ if (data[IFLA_CAIF_IPV4_CONNID])
+ conn_req->sockaddr.u.dgm.connection_id =
+ nla_get_u32(data[IFLA_CAIF_IPV4_CONNID]);
+ if (data[IFLA_CAIF_IPV6_CONNID])
+ conn_req->sockaddr.u.dgm.connection_id =
+ nla_get_u32(data[IFLA_CAIF_IPV6_CONNID]);
+ if (data[IFLA_CAIF_LOOPBACK]) {
+ if (nla_get_u8(data[IFLA_CAIF_LOOPBACK]))
+ conn_req->protocol = CAIFPROTO_DATAGRAM_LOOP;
+ else
+ conn_req->protocol = CAIFPROTO_DATAGRAM;
+ }
+}
+
+static int ipcaif_newlink(struct net_device *dev,
+ struct rtnl_newlink_params *params,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr **data = params->data;
+ int ret;
+ struct chnl_net *caifdev;
+ ASSERT_RTNL();
+ caifdev = netdev_priv(dev);
+ caif_netlink_parms(data, &caifdev->conn_req);
+
+ ret = register_netdevice(dev);
+ if (ret)
+ pr_warn("device rtml registration failed\n");
+ else
+ list_add(&caifdev->list_field, &chnl_net_list);
+
+ /* Use ifindex as connection id, and use loopback channel default. */
+ if (caifdev->conn_req.sockaddr.u.dgm.connection_id == UNDEF_CONNID) {
+ caifdev->conn_req.sockaddr.u.dgm.connection_id = dev->ifindex;
+ caifdev->conn_req.protocol = CAIFPROTO_DATAGRAM_LOOP;
+ }
+ return ret;
+}
+
+static int ipcaif_changelink(struct net_device *dev, struct nlattr *tb[],
+ struct nlattr *data[],
+ struct netlink_ext_ack *extack)
+{
+ struct chnl_net *caifdev;
+ ASSERT_RTNL();
+ caifdev = netdev_priv(dev);
+ caif_netlink_parms(data, &caifdev->conn_req);
+ netdev_state_change(dev);
+ return 0;
+}
+
+static size_t ipcaif_get_size(const struct net_device *dev)
+{
+ return
+ /* IFLA_CAIF_IPV4_CONNID */
+ nla_total_size(4) +
+ /* IFLA_CAIF_IPV6_CONNID */
+ nla_total_size(4) +
+ /* IFLA_CAIF_LOOPBACK */
+ nla_total_size(2) +
+ 0;
+}
+
+static const struct nla_policy ipcaif_policy[IFLA_CAIF_MAX + 1] = {
+ [IFLA_CAIF_IPV4_CONNID] = { .type = NLA_U32 },
+ [IFLA_CAIF_IPV6_CONNID] = { .type = NLA_U32 },
+ [IFLA_CAIF_LOOPBACK] = { .type = NLA_U8 }
+};
+
+
+static struct rtnl_link_ops ipcaif_link_ops __read_mostly = {
+ .kind = "caif",
+ .priv_size = sizeof(struct chnl_net),
+ .setup = ipcaif_net_setup,
+ .maxtype = IFLA_CAIF_MAX,
+ .policy = ipcaif_policy,
+ .newlink = ipcaif_newlink,
+ .changelink = ipcaif_changelink,
+ .get_size = ipcaif_get_size,
+ .fill_info = ipcaif_fill_info,
+
+};
+
+static int __init chnl_init_module(void)
+{
+ return rtnl_link_register(&ipcaif_link_ops);
+}
+
+static void __exit chnl_exit_module(void)
+{
+ struct chnl_net *dev = NULL;
+ struct list_head *list_node;
+ struct list_head *_tmp;
+ rtnl_link_unregister(&ipcaif_link_ops);
+ rtnl_lock();
+ list_for_each_safe(list_node, _tmp, &chnl_net_list) {
+ dev = list_entry(list_node, struct chnl_net, list_field);
+ list_del_init(list_node);
+ delete_device(dev);
+ }
+ rtnl_unlock();
+}
+
+module_init(chnl_init_module);
+module_exit(chnl_exit_module);
diff --git a/net/can/Kconfig b/net/can/Kconfig
new file mode 100644
index 000000000000..e4ccf731a24c
--- /dev/null
+++ b/net/can/Kconfig
@@ -0,0 +1,73 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# Controller Area Network (CAN) network layer core configuration
+#
+
+menuconfig CAN
+ tristate "CAN bus subsystem support"
+ select CAN_DEV
+ help
+ Controller Area Network (CAN) is a slow (up to 1Mbit/s) serial
+ communications protocol. Development of the CAN bus started in
+ 1983 at Robert Bosch GmbH, and the protocol was officially
+ released in 1986. The CAN bus was originally mainly for automotive,
+ but is now widely used in marine (NMEA2000), industrial, and medical
+ applications. More information on the CAN network protocol family
+ PF_CAN is contained in <Documentation/networking/can.rst>.
+
+ If you want CAN support you should say Y here and also to the
+ specific driver for your controller(s) under the Network device
+ support section.
+
+if CAN
+
+config CAN_RAW
+ tristate "Raw CAN Protocol (raw access with CAN-ID filtering)"
+ default y
+ help
+ The raw CAN protocol option offers access to the CAN bus via
+ the BSD socket API. You probably want to use the raw socket in
+ most cases where no higher level protocol is being used. The raw
+ socket has several filter options e.g. ID masking / error frames.
+ To receive/send raw CAN messages, use AF_CAN with protocol CAN_RAW.
+
+config CAN_BCM
+ tristate "Broadcast Manager CAN Protocol (with content filtering)"
+ default y
+ help
+ The Broadcast Manager offers content filtering, timeout monitoring,
+ sending of RTR frames, and cyclic CAN messages without permanent user
+ interaction. The BCM can be 'programmed' via the BSD socket API and
+ informs you on demand e.g. only on content updates / timeouts.
+ You probably want to use the bcm socket in most cases where cyclic
+ CAN messages are used on the bus (e.g. in automotive environments).
+ To use the Broadcast Manager, use AF_CAN with protocol CAN_BCM.
+
+config CAN_GW
+ tristate "CAN Gateway/Router (with netlink configuration)"
+ default y
+ help
+ The CAN Gateway/Router is used to route (and modify) CAN frames.
+ It is based on the PF_CAN core infrastructure for msg filtering and
+ msg sending and can optionally modify routed CAN frames on the fly.
+ CAN frames can be routed between CAN network interfaces (one hop).
+ They can be modified with AND/OR/XOR/SET operations as configured
+ by the netlink configuration interface known e.g. from iptables.
+
+source "net/can/j1939/Kconfig"
+
+config CAN_ISOTP
+ tristate "ISO 15765-2 CAN transport protocol"
+ help
+ CAN Transport Protocols offer support for segmented Point-to-Point
+ communication between CAN nodes via two defined CAN Identifiers.
+ This protocol driver implements segmented data transfers for CAN CC
+ (aka Classical CAN, CAN 2.0B) and CAN FD frame types which were
+ introduced with ISO 15765-2:2016.
+ As CAN frames can only transport a small amount of data bytes
+ (max. 8 bytes for CAN CC and max. 64 bytes for CAN FD) this
+ segmentation is needed to transport longer Protocol Data Units (PDU)
+ as needed e.g. for vehicle diagnosis (UDS, ISO 14229) or IP-over-CAN
+ traffic.
+
+endif
diff --git a/net/can/Makefile b/net/can/Makefile
new file mode 100644
index 000000000000..58f2c31c1ef3
--- /dev/null
+++ b/net/can/Makefile
@@ -0,0 +1,22 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for the Linux Controller Area Network core.
+#
+
+obj-$(CONFIG_CAN) += can.o
+can-y := af_can.o
+can-$(CONFIG_PROC_FS) += proc.o
+
+obj-$(CONFIG_CAN_RAW) += can-raw.o
+can-raw-y := raw.o
+
+obj-$(CONFIG_CAN_BCM) += can-bcm.o
+can-bcm-y := bcm.o
+
+obj-$(CONFIG_CAN_GW) += can-gw.o
+can-gw-y := gw.o
+
+obj-$(CONFIG_CAN_J1939) += j1939/
+
+obj-$(CONFIG_CAN_ISOTP) += can-isotp.o
+can-isotp-y := isotp.o
diff --git a/net/can/af_can.c b/net/can/af_can.c
new file mode 100644
index 000000000000..770173d8db42
--- /dev/null
+++ b/net/can/af_can.c
@@ -0,0 +1,922 @@
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
+/* af_can.c - Protocol family CAN core module
+ * (used by different CAN protocol modules)
+ *
+ * Copyright (c) 2002-2017 Volkswagen Group Electronic Research
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of Volkswagen nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * Alternatively, provided that this notice is retained in full, this
+ * software may be distributed under the terms of the GNU General
+ * Public License ("GPL") version 2, in which case the provisions of the
+ * GPL apply INSTEAD OF those given above.
+ *
+ * The provided data structures and external interfaces from this code
+ * are not restricted to be used by modules with a GPL compatible license.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/stddef.h>
+#include <linux/init.h>
+#include <linux/kmod.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/rcupdate.h>
+#include <linux/uaccess.h>
+#include <linux/net.h>
+#include <linux/netdevice.h>
+#include <linux/socket.h>
+#include <linux/if_ether.h>
+#include <linux/if_arp.h>
+#include <linux/skbuff.h>
+#include <linux/can.h>
+#include <linux/can/core.h>
+#include <linux/can/skb.h>
+#include <linux/can/can-ml.h>
+#include <linux/ratelimit.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+
+#include "af_can.h"
+
+MODULE_DESCRIPTION("Controller Area Network PF_CAN core");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_AUTHOR("Urs Thuermann <urs.thuermann@volkswagen.de>, "
+ "Oliver Hartkopp <oliver.hartkopp@volkswagen.de>");
+
+MODULE_ALIAS_NETPROTO(PF_CAN);
+
+static int stats_timer __read_mostly = 1;
+module_param(stats_timer, int, 0444);
+MODULE_PARM_DESC(stats_timer, "enable timer for statistics (default:on)");
+
+static struct kmem_cache *rcv_cache __read_mostly;
+
+/* table of registered CAN protocols */
+static const struct can_proto __rcu *proto_tab[CAN_NPROTO] __read_mostly;
+static DEFINE_MUTEX(proto_tab_lock);
+
+static atomic_t skbcounter = ATOMIC_INIT(0);
+
+/* af_can socket functions */
+
+void can_sock_destruct(struct sock *sk)
+{
+ skb_queue_purge(&sk->sk_receive_queue);
+ skb_queue_purge(&sk->sk_error_queue);
+}
+EXPORT_SYMBOL(can_sock_destruct);
+
+static const struct can_proto *can_get_proto(int protocol)
+{
+ const struct can_proto *cp;
+
+ rcu_read_lock();
+ cp = rcu_dereference(proto_tab[protocol]);
+ if (cp && !try_module_get(cp->prot->owner))
+ cp = NULL;
+ rcu_read_unlock();
+
+ return cp;
+}
+
+static inline void can_put_proto(const struct can_proto *cp)
+{
+ module_put(cp->prot->owner);
+}
+
+static int can_create(struct net *net, struct socket *sock, int protocol,
+ int kern)
+{
+ struct sock *sk;
+ const struct can_proto *cp;
+ int err = 0;
+
+ sock->state = SS_UNCONNECTED;
+
+ if (protocol < 0 || protocol >= CAN_NPROTO)
+ return -EINVAL;
+
+ cp = can_get_proto(protocol);
+
+#ifdef CONFIG_MODULES
+ if (!cp) {
+ /* try to load protocol module if kernel is modular */
+
+ err = request_module("can-proto-%d", protocol);
+
+ /* In case of error we only print a message but don't
+ * return the error code immediately. Below we will
+ * return -EPROTONOSUPPORT
+ */
+ if (err)
+ pr_err_ratelimited("can: request_module (can-proto-%d) failed.\n",
+ protocol);
+
+ cp = can_get_proto(protocol);
+ }
+#endif
+
+ /* check for available protocol and correct usage */
+
+ if (!cp)
+ return -EPROTONOSUPPORT;
+
+ if (cp->type != sock->type) {
+ err = -EPROTOTYPE;
+ goto errout;
+ }
+
+ sock->ops = cp->ops;
+
+ sk = sk_alloc(net, PF_CAN, GFP_KERNEL, cp->prot, kern);
+ if (!sk) {
+ err = -ENOMEM;
+ goto errout;
+ }
+
+ sock_init_data(sock, sk);
+ sk->sk_destruct = can_sock_destruct;
+
+ if (sk->sk_prot->init)
+ err = sk->sk_prot->init(sk);
+
+ if (err) {
+ /* release sk on errors */
+ sock_orphan(sk);
+ sock_put(sk);
+ sock->sk = NULL;
+ } else {
+ sock_prot_inuse_add(net, sk->sk_prot, 1);
+ }
+
+ errout:
+ can_put_proto(cp);
+ return err;
+}
+
+/* af_can tx path */
+
+/**
+ * can_send - transmit a CAN frame (optional with local loopback)
+ * @skb: pointer to socket buffer with CAN frame in data section
+ * @loop: loopback for listeners on local CAN sockets (recommended default!)
+ *
+ * Due to the loopback this routine must not be called from hardirq context.
+ *
+ * Return:
+ * 0 on success
+ * -ENETDOWN when the selected interface is down
+ * -ENOBUFS on full driver queue (see net_xmit_errno())
+ * -ENOMEM when local loopback failed at calling skb_clone()
+ * -EPERM when trying to send on a non-CAN interface
+ * -EMSGSIZE CAN frame size is bigger than CAN interface MTU
+ * -EINVAL when the skb->data does not contain a valid CAN frame
+ */
+int can_send(struct sk_buff *skb, int loop)
+{
+ struct sk_buff *newskb = NULL;
+ struct can_pkg_stats *pkg_stats = dev_net(skb->dev)->can.pkg_stats;
+ int err = -EINVAL;
+
+ if (can_is_canxl_skb(skb)) {
+ skb->protocol = htons(ETH_P_CANXL);
+ } else if (can_is_can_skb(skb)) {
+ skb->protocol = htons(ETH_P_CAN);
+ } else if (can_is_canfd_skb(skb)) {
+ struct canfd_frame *cfd = (struct canfd_frame *)skb->data;
+
+ skb->protocol = htons(ETH_P_CANFD);
+
+ /* set CAN FD flag for CAN FD frames by default */
+ cfd->flags |= CANFD_FDF;
+ } else {
+ goto inval_skb;
+ }
+
+ /* Make sure the CAN frame can pass the selected CAN netdevice. */
+ if (unlikely(skb->len > READ_ONCE(skb->dev->mtu))) {
+ err = -EMSGSIZE;
+ goto inval_skb;
+ }
+
+ if (unlikely(skb->dev->type != ARPHRD_CAN)) {
+ err = -EPERM;
+ goto inval_skb;
+ }
+
+ if (unlikely(!(skb->dev->flags & IFF_UP))) {
+ err = -ENETDOWN;
+ goto inval_skb;
+ }
+
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+ skb_reset_mac_header(skb);
+ skb_reset_network_header(skb);
+ skb_reset_transport_header(skb);
+
+ if (loop) {
+ /* local loopback of sent CAN frames */
+
+ /* indication for the CAN driver: do loopback */
+ skb->pkt_type = PACKET_LOOPBACK;
+
+ /* The reference to the originating sock may be required
+ * by the receiving socket to check whether the frame is
+ * its own. Example: can_raw sockopt CAN_RAW_RECV_OWN_MSGS
+ * Therefore we have to ensure that skb->sk remains the
+ * reference to the originating sock by restoring skb->sk
+ * after each skb_clone() or skb_orphan() usage.
+ */
+
+ if (!(skb->dev->flags & IFF_ECHO)) {
+ /* If the interface is not capable to do loopback
+ * itself, we do it here.
+ */
+ newskb = skb_clone(skb, GFP_ATOMIC);
+ if (!newskb) {
+ kfree_skb(skb);
+ return -ENOMEM;
+ }
+
+ can_skb_set_owner(newskb, skb->sk);
+ newskb->ip_summed = CHECKSUM_UNNECESSARY;
+ newskb->pkt_type = PACKET_BROADCAST;
+ }
+ } else {
+ /* indication for the CAN driver: no loopback required */
+ skb->pkt_type = PACKET_HOST;
+ }
+
+ /* send to netdevice */
+ err = dev_queue_xmit(skb);
+ if (err > 0)
+ err = net_xmit_errno(err);
+
+ if (err) {
+ kfree_skb(newskb);
+ return err;
+ }
+
+ if (newskb)
+ netif_rx(newskb);
+
+ /* update statistics */
+ atomic_long_inc(&pkg_stats->tx_frames);
+ atomic_long_inc(&pkg_stats->tx_frames_delta);
+
+ return 0;
+
+inval_skb:
+ kfree_skb(skb);
+ return err;
+}
+EXPORT_SYMBOL(can_send);
+
+/* af_can rx path */
+
+static struct can_dev_rcv_lists *can_dev_rcv_lists_find(struct net *net,
+ struct net_device *dev)
+{
+ if (dev) {
+ struct can_ml_priv *can_ml = can_get_ml_priv(dev);
+ return &can_ml->dev_rcv_lists;
+ } else {
+ return net->can.rx_alldev_list;
+ }
+}
+
+/**
+ * effhash - hash function for 29 bit CAN identifier reduction
+ * @can_id: 29 bit CAN identifier
+ *
+ * Description:
+ * To reduce the linear traversal in one linked list of _single_ EFF CAN
+ * frame subscriptions the 29 bit identifier is mapped to 10 bits.
+ * (see CAN_EFF_RCV_HASH_BITS definition)
+ *
+ * Return:
+ * Hash value from 0x000 - 0x3FF ( enforced by CAN_EFF_RCV_HASH_BITS mask )
+ */
+static unsigned int effhash(canid_t can_id)
+{
+ unsigned int hash;
+
+ hash = can_id;
+ hash ^= can_id >> CAN_EFF_RCV_HASH_BITS;
+ hash ^= can_id >> (2 * CAN_EFF_RCV_HASH_BITS);
+
+ return hash & ((1 << CAN_EFF_RCV_HASH_BITS) - 1);
+}
+
+/**
+ * can_rcv_list_find - determine optimal filterlist inside device filter struct
+ * @can_id: pointer to CAN identifier of a given can_filter
+ * @mask: pointer to CAN mask of a given can_filter
+ * @dev_rcv_lists: pointer to the device filter struct
+ *
+ * Description:
+ * Returns the optimal filterlist to reduce the filter handling in the
+ * receive path. This function is called by service functions that need
+ * to register or unregister a can_filter in the filter lists.
+ *
+ * A filter matches in general, when
+ *
+ * <received_can_id> & mask == can_id & mask
+ *
+ * so every bit set in the mask (even CAN_EFF_FLAG, CAN_RTR_FLAG) describe
+ * relevant bits for the filter.
+ *
+ * The filter can be inverted (CAN_INV_FILTER bit set in can_id) or it can
+ * filter for error messages (CAN_ERR_FLAG bit set in mask). For error msg
+ * frames there is a special filterlist and a special rx path filter handling.
+ *
+ * Return:
+ * Pointer to optimal filterlist for the given can_id/mask pair.
+ * Consistency checked mask.
+ * Reduced can_id to have a preprocessed filter compare value.
+ */
+static struct hlist_head *can_rcv_list_find(canid_t *can_id, canid_t *mask,
+ struct can_dev_rcv_lists *dev_rcv_lists)
+{
+ canid_t inv = *can_id & CAN_INV_FILTER; /* save flag before masking */
+
+ /* filter for error message frames in extra filterlist */
+ if (*mask & CAN_ERR_FLAG) {
+ /* clear CAN_ERR_FLAG in filter entry */
+ *mask &= CAN_ERR_MASK;
+ return &dev_rcv_lists->rx[RX_ERR];
+ }
+
+ /* with cleared CAN_ERR_FLAG we have a simple mask/value filterpair */
+
+#define CAN_EFF_RTR_FLAGS (CAN_EFF_FLAG | CAN_RTR_FLAG)
+
+ /* ensure valid values in can_mask for 'SFF only' frame filtering */
+ if ((*mask & CAN_EFF_FLAG) && !(*can_id & CAN_EFF_FLAG))
+ *mask &= (CAN_SFF_MASK | CAN_EFF_RTR_FLAGS);
+
+ /* reduce condition testing at receive time */
+ *can_id &= *mask;
+
+ /* inverse can_id/can_mask filter */
+ if (inv)
+ return &dev_rcv_lists->rx[RX_INV];
+
+ /* mask == 0 => no condition testing at receive time */
+ if (!(*mask))
+ return &dev_rcv_lists->rx[RX_ALL];
+
+ /* extra filterlists for the subscription of a single non-RTR can_id */
+ if (((*mask & CAN_EFF_RTR_FLAGS) == CAN_EFF_RTR_FLAGS) &&
+ !(*can_id & CAN_RTR_FLAG)) {
+ if (*can_id & CAN_EFF_FLAG) {
+ if (*mask == (CAN_EFF_MASK | CAN_EFF_RTR_FLAGS))
+ return &dev_rcv_lists->rx_eff[effhash(*can_id)];
+ } else {
+ if (*mask == (CAN_SFF_MASK | CAN_EFF_RTR_FLAGS))
+ return &dev_rcv_lists->rx_sff[*can_id];
+ }
+ }
+
+ /* default: filter via can_id/can_mask */
+ return &dev_rcv_lists->rx[RX_FIL];
+}
+
+/**
+ * can_rx_register - subscribe CAN frames from a specific interface
+ * @net: the applicable net namespace
+ * @dev: pointer to netdevice (NULL => subscribe from 'all' CAN devices list)
+ * @can_id: CAN identifier (see description)
+ * @mask: CAN mask (see description)
+ * @func: callback function on filter match
+ * @data: returned parameter for callback function
+ * @ident: string for calling module identification
+ * @sk: socket pointer (might be NULL)
+ *
+ * Description:
+ * Invokes the callback function with the received sk_buff and the given
+ * parameter 'data' on a matching receive filter. A filter matches, when
+ *
+ * <received_can_id> & mask == can_id & mask
+ *
+ * The filter can be inverted (CAN_INV_FILTER bit set in can_id) or it can
+ * filter for error message frames (CAN_ERR_FLAG bit set in mask).
+ *
+ * The provided pointer to the sk_buff is guaranteed to be valid as long as
+ * the callback function is running. The callback function must *not* free
+ * the given sk_buff while processing it's task. When the given sk_buff is
+ * needed after the end of the callback function it must be cloned inside
+ * the callback function with skb_clone().
+ *
+ * Return:
+ * 0 on success
+ * -ENOMEM on missing cache mem to create subscription entry
+ * -ENODEV unknown device
+ */
+int can_rx_register(struct net *net, struct net_device *dev, canid_t can_id,
+ canid_t mask, void (*func)(struct sk_buff *, void *),
+ void *data, char *ident, struct sock *sk)
+{
+ struct receiver *rcv;
+ struct hlist_head *rcv_list;
+ struct can_dev_rcv_lists *dev_rcv_lists;
+ struct can_rcv_lists_stats *rcv_lists_stats = net->can.rcv_lists_stats;
+
+ /* insert new receiver (dev,canid,mask) -> (func,data) */
+
+ if (dev && (dev->type != ARPHRD_CAN || !can_get_ml_priv(dev)))
+ return -ENODEV;
+
+ if (dev && !net_eq(net, dev_net(dev)))
+ return -ENODEV;
+
+ rcv = kmem_cache_alloc(rcv_cache, GFP_KERNEL);
+ if (!rcv)
+ return -ENOMEM;
+
+ spin_lock_bh(&net->can.rcvlists_lock);
+
+ dev_rcv_lists = can_dev_rcv_lists_find(net, dev);
+ rcv_list = can_rcv_list_find(&can_id, &mask, dev_rcv_lists);
+
+ rcv->can_id = can_id;
+ rcv->mask = mask;
+ rcv->matches = 0;
+ rcv->func = func;
+ rcv->data = data;
+ rcv->ident = ident;
+ rcv->sk = sk;
+
+ hlist_add_head_rcu(&rcv->list, rcv_list);
+ dev_rcv_lists->entries++;
+
+ rcv_lists_stats->rcv_entries++;
+ rcv_lists_stats->rcv_entries_max = max(rcv_lists_stats->rcv_entries_max,
+ rcv_lists_stats->rcv_entries);
+ spin_unlock_bh(&net->can.rcvlists_lock);
+
+ return 0;
+}
+EXPORT_SYMBOL(can_rx_register);
+
+/* can_rx_delete_receiver - rcu callback for single receiver entry removal */
+static void can_rx_delete_receiver(struct rcu_head *rp)
+{
+ struct receiver *rcv = container_of(rp, struct receiver, rcu);
+ struct sock *sk = rcv->sk;
+
+ kmem_cache_free(rcv_cache, rcv);
+ if (sk)
+ sock_put(sk);
+}
+
+/**
+ * can_rx_unregister - unsubscribe CAN frames from a specific interface
+ * @net: the applicable net namespace
+ * @dev: pointer to netdevice (NULL => unsubscribe from 'all' CAN devices list)
+ * @can_id: CAN identifier
+ * @mask: CAN mask
+ * @func: callback function on filter match
+ * @data: returned parameter for callback function
+ *
+ * Description:
+ * Removes subscription entry depending on given (subscription) values.
+ */
+void can_rx_unregister(struct net *net, struct net_device *dev, canid_t can_id,
+ canid_t mask, void (*func)(struct sk_buff *, void *),
+ void *data)
+{
+ struct receiver *rcv = NULL;
+ struct hlist_head *rcv_list;
+ struct can_rcv_lists_stats *rcv_lists_stats = net->can.rcv_lists_stats;
+ struct can_dev_rcv_lists *dev_rcv_lists;
+
+ if (dev && dev->type != ARPHRD_CAN)
+ return;
+
+ if (dev && !net_eq(net, dev_net(dev)))
+ return;
+
+ spin_lock_bh(&net->can.rcvlists_lock);
+
+ dev_rcv_lists = can_dev_rcv_lists_find(net, dev);
+ rcv_list = can_rcv_list_find(&can_id, &mask, dev_rcv_lists);
+
+ /* Search the receiver list for the item to delete. This should
+ * exist, since no receiver may be unregistered that hasn't
+ * been registered before.
+ */
+ hlist_for_each_entry_rcu(rcv, rcv_list, list) {
+ if (rcv->can_id == can_id && rcv->mask == mask &&
+ rcv->func == func && rcv->data == data)
+ break;
+ }
+
+ /* Check for bugs in CAN protocol implementations using af_can.c:
+ * 'rcv' will be NULL if no matching list item was found for removal.
+ * As this case may potentially happen when closing a socket while
+ * the notifier for removing the CAN netdev is running we just print
+ * a warning here.
+ */
+ if (!rcv) {
+ pr_warn("can: receive list entry not found for dev %s, id %03X, mask %03X\n",
+ DNAME(dev), can_id, mask);
+ goto out;
+ }
+
+ hlist_del_rcu(&rcv->list);
+ dev_rcv_lists->entries--;
+
+ if (rcv_lists_stats->rcv_entries > 0)
+ rcv_lists_stats->rcv_entries--;
+
+ out:
+ spin_unlock_bh(&net->can.rcvlists_lock);
+
+ /* schedule the receiver item for deletion */
+ if (rcv) {
+ if (rcv->sk)
+ sock_hold(rcv->sk);
+ call_rcu(&rcv->rcu, can_rx_delete_receiver);
+ }
+}
+EXPORT_SYMBOL(can_rx_unregister);
+
+static inline void deliver(struct sk_buff *skb, struct receiver *rcv)
+{
+ rcv->func(skb, rcv->data);
+ rcv->matches++;
+}
+
+static int can_rcv_filter(struct can_dev_rcv_lists *dev_rcv_lists, struct sk_buff *skb)
+{
+ struct receiver *rcv;
+ int matches = 0;
+ struct can_frame *cf = (struct can_frame *)skb->data;
+ canid_t can_id = cf->can_id;
+
+ if (dev_rcv_lists->entries == 0)
+ return 0;
+
+ if (can_id & CAN_ERR_FLAG) {
+ /* check for error message frame entries only */
+ hlist_for_each_entry_rcu(rcv, &dev_rcv_lists->rx[RX_ERR], list) {
+ if (can_id & rcv->mask) {
+ deliver(skb, rcv);
+ matches++;
+ }
+ }
+ return matches;
+ }
+
+ /* check for unfiltered entries */
+ hlist_for_each_entry_rcu(rcv, &dev_rcv_lists->rx[RX_ALL], list) {
+ deliver(skb, rcv);
+ matches++;
+ }
+
+ /* check for can_id/mask entries */
+ hlist_for_each_entry_rcu(rcv, &dev_rcv_lists->rx[RX_FIL], list) {
+ if ((can_id & rcv->mask) == rcv->can_id) {
+ deliver(skb, rcv);
+ matches++;
+ }
+ }
+
+ /* check for inverted can_id/mask entries */
+ hlist_for_each_entry_rcu(rcv, &dev_rcv_lists->rx[RX_INV], list) {
+ if ((can_id & rcv->mask) != rcv->can_id) {
+ deliver(skb, rcv);
+ matches++;
+ }
+ }
+
+ /* check filterlists for single non-RTR can_ids */
+ if (can_id & CAN_RTR_FLAG)
+ return matches;
+
+ if (can_id & CAN_EFF_FLAG) {
+ hlist_for_each_entry_rcu(rcv, &dev_rcv_lists->rx_eff[effhash(can_id)], list) {
+ if (rcv->can_id == can_id) {
+ deliver(skb, rcv);
+ matches++;
+ }
+ }
+ } else {
+ can_id &= CAN_SFF_MASK;
+ hlist_for_each_entry_rcu(rcv, &dev_rcv_lists->rx_sff[can_id], list) {
+ deliver(skb, rcv);
+ matches++;
+ }
+ }
+
+ return matches;
+}
+
+static void can_receive(struct sk_buff *skb, struct net_device *dev)
+{
+ struct can_dev_rcv_lists *dev_rcv_lists;
+ struct net *net = dev_net(dev);
+ struct can_pkg_stats *pkg_stats = net->can.pkg_stats;
+ int matches;
+
+ /* update statistics */
+ atomic_long_inc(&pkg_stats->rx_frames);
+ atomic_long_inc(&pkg_stats->rx_frames_delta);
+
+ /* create non-zero unique skb identifier together with *skb */
+ while (!(can_skb_prv(skb)->skbcnt))
+ can_skb_prv(skb)->skbcnt = atomic_inc_return(&skbcounter);
+
+ rcu_read_lock();
+
+ /* deliver the packet to sockets listening on all devices */
+ matches = can_rcv_filter(net->can.rx_alldev_list, skb);
+
+ /* find receive list for this device */
+ dev_rcv_lists = can_dev_rcv_lists_find(net, dev);
+ matches += can_rcv_filter(dev_rcv_lists, skb);
+
+ rcu_read_unlock();
+
+ /* consume the skbuff allocated by the netdevice driver */
+ consume_skb(skb);
+
+ if (matches > 0) {
+ atomic_long_inc(&pkg_stats->matches);
+ atomic_long_inc(&pkg_stats->matches_delta);
+ }
+}
+
+static int can_rcv(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *pt, struct net_device *orig_dev)
+{
+ if (unlikely(dev->type != ARPHRD_CAN || !can_get_ml_priv(dev) || !can_is_can_skb(skb))) {
+ pr_warn_once("PF_CAN: dropped non conform CAN skbuff: dev type %d, len %d\n",
+ dev->type, skb->len);
+
+ kfree_skb_reason(skb, SKB_DROP_REASON_CAN_RX_INVALID_FRAME);
+ return NET_RX_DROP;
+ }
+
+ can_receive(skb, dev);
+ return NET_RX_SUCCESS;
+}
+
+static int canfd_rcv(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *pt, struct net_device *orig_dev)
+{
+ if (unlikely(dev->type != ARPHRD_CAN || !can_get_ml_priv(dev) || !can_is_canfd_skb(skb))) {
+ pr_warn_once("PF_CAN: dropped non conform CAN FD skbuff: dev type %d, len %d\n",
+ dev->type, skb->len);
+
+ kfree_skb_reason(skb, SKB_DROP_REASON_CANFD_RX_INVALID_FRAME);
+ return NET_RX_DROP;
+ }
+
+ can_receive(skb, dev);
+ return NET_RX_SUCCESS;
+}
+
+static int canxl_rcv(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *pt, struct net_device *orig_dev)
+{
+ if (unlikely(dev->type != ARPHRD_CAN || !can_get_ml_priv(dev) || !can_is_canxl_skb(skb))) {
+ pr_warn_once("PF_CAN: dropped non conform CAN XL skbuff: dev type %d, len %d\n",
+ dev->type, skb->len);
+
+ kfree_skb_reason(skb, SKB_DROP_REASON_CANXL_RX_INVALID_FRAME);
+ return NET_RX_DROP;
+ }
+
+ can_receive(skb, dev);
+ return NET_RX_SUCCESS;
+}
+
+/* af_can protocol functions */
+
+/**
+ * can_proto_register - register CAN transport protocol
+ * @cp: pointer to CAN protocol structure
+ *
+ * Return:
+ * 0 on success
+ * -EINVAL invalid (out of range) protocol number
+ * -EBUSY protocol already in use
+ * -ENOBUF if proto_register() fails
+ */
+int can_proto_register(const struct can_proto *cp)
+{
+ int proto = cp->protocol;
+ int err = 0;
+
+ if (proto < 0 || proto >= CAN_NPROTO) {
+ pr_err("can: protocol number %d out of range\n", proto);
+ return -EINVAL;
+ }
+
+ err = proto_register(cp->prot, 0);
+ if (err < 0)
+ return err;
+
+ mutex_lock(&proto_tab_lock);
+
+ if (rcu_access_pointer(proto_tab[proto])) {
+ pr_err("can: protocol %d already registered\n", proto);
+ err = -EBUSY;
+ } else {
+ RCU_INIT_POINTER(proto_tab[proto], cp);
+ }
+
+ mutex_unlock(&proto_tab_lock);
+
+ if (err < 0)
+ proto_unregister(cp->prot);
+
+ return err;
+}
+EXPORT_SYMBOL(can_proto_register);
+
+/**
+ * can_proto_unregister - unregister CAN transport protocol
+ * @cp: pointer to CAN protocol structure
+ */
+void can_proto_unregister(const struct can_proto *cp)
+{
+ int proto = cp->protocol;
+
+ mutex_lock(&proto_tab_lock);
+ BUG_ON(rcu_access_pointer(proto_tab[proto]) != cp);
+ RCU_INIT_POINTER(proto_tab[proto], NULL);
+ mutex_unlock(&proto_tab_lock);
+
+ synchronize_rcu();
+
+ proto_unregister(cp->prot);
+}
+EXPORT_SYMBOL(can_proto_unregister);
+
+static int can_pernet_init(struct net *net)
+{
+ spin_lock_init(&net->can.rcvlists_lock);
+ net->can.rx_alldev_list =
+ kzalloc(sizeof(*net->can.rx_alldev_list), GFP_KERNEL);
+ if (!net->can.rx_alldev_list)
+ goto out;
+ net->can.pkg_stats = kzalloc(sizeof(*net->can.pkg_stats), GFP_KERNEL);
+ if (!net->can.pkg_stats)
+ goto out_free_rx_alldev_list;
+ net->can.rcv_lists_stats = kzalloc(sizeof(*net->can.rcv_lists_stats), GFP_KERNEL);
+ if (!net->can.rcv_lists_stats)
+ goto out_free_pkg_stats;
+
+ if (IS_ENABLED(CONFIG_PROC_FS)) {
+ /* the statistics are updated every second (timer triggered) */
+ if (stats_timer) {
+ timer_setup(&net->can.stattimer, can_stat_update,
+ 0);
+ mod_timer(&net->can.stattimer,
+ round_jiffies(jiffies + HZ));
+ }
+ net->can.pkg_stats->jiffies_init = jiffies;
+ can_init_proc(net);
+ }
+
+ return 0;
+
+ out_free_pkg_stats:
+ kfree(net->can.pkg_stats);
+ out_free_rx_alldev_list:
+ kfree(net->can.rx_alldev_list);
+ out:
+ return -ENOMEM;
+}
+
+static void can_pernet_exit(struct net *net)
+{
+ if (IS_ENABLED(CONFIG_PROC_FS)) {
+ can_remove_proc(net);
+ if (stats_timer)
+ timer_delete_sync(&net->can.stattimer);
+ }
+
+ kfree(net->can.rx_alldev_list);
+ kfree(net->can.pkg_stats);
+ kfree(net->can.rcv_lists_stats);
+}
+
+/* af_can module init/exit functions */
+
+static struct packet_type can_packet __read_mostly = {
+ .type = cpu_to_be16(ETH_P_CAN),
+ .func = can_rcv,
+};
+
+static struct packet_type canfd_packet __read_mostly = {
+ .type = cpu_to_be16(ETH_P_CANFD),
+ .func = canfd_rcv,
+};
+
+static struct packet_type canxl_packet __read_mostly = {
+ .type = cpu_to_be16(ETH_P_CANXL),
+ .func = canxl_rcv,
+};
+
+static const struct net_proto_family can_family_ops = {
+ .family = PF_CAN,
+ .create = can_create,
+ .owner = THIS_MODULE,
+};
+
+static struct pernet_operations can_pernet_ops __read_mostly = {
+ .init = can_pernet_init,
+ .exit = can_pernet_exit,
+};
+
+static __init int can_init(void)
+{
+ int err;
+
+ /* check for correct padding to be able to use the structs similarly */
+ BUILD_BUG_ON(offsetof(struct can_frame, len) !=
+ offsetof(struct canfd_frame, len) ||
+ offsetof(struct can_frame, len) !=
+ offsetof(struct canxl_frame, flags) ||
+ offsetof(struct can_frame, data) !=
+ offsetof(struct canfd_frame, data));
+
+ pr_info("can: controller area network core\n");
+
+ rcv_cache = kmem_cache_create("can_receiver", sizeof(struct receiver),
+ 0, 0, NULL);
+ if (!rcv_cache)
+ return -ENOMEM;
+
+ err = register_pernet_subsys(&can_pernet_ops);
+ if (err)
+ goto out_pernet;
+
+ /* protocol register */
+ err = sock_register(&can_family_ops);
+ if (err)
+ goto out_sock;
+
+ dev_add_pack(&can_packet);
+ dev_add_pack(&canfd_packet);
+ dev_add_pack(&canxl_packet);
+
+ return 0;
+
+out_sock:
+ unregister_pernet_subsys(&can_pernet_ops);
+out_pernet:
+ kmem_cache_destroy(rcv_cache);
+
+ return err;
+}
+
+static __exit void can_exit(void)
+{
+ /* protocol unregister */
+ dev_remove_pack(&canxl_packet);
+ dev_remove_pack(&canfd_packet);
+ dev_remove_pack(&can_packet);
+ sock_unregister(PF_CAN);
+
+ unregister_pernet_subsys(&can_pernet_ops);
+
+ rcu_barrier(); /* Wait for completion of call_rcu()'s */
+
+ kmem_cache_destroy(rcv_cache);
+}
+
+module_init(can_init);
+module_exit(can_exit);
diff --git a/net/can/af_can.h b/net/can/af_can.h
new file mode 100644
index 000000000000..22f3352c77fe
--- /dev/null
+++ b/net/can/af_can.h
@@ -0,0 +1,103 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
+/* Copyright (c) 2002-2007 Volkswagen Group Electronic Research
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of Volkswagen nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * Alternatively, provided that this notice is retained in full, this
+ * software may be distributed under the terms of the GNU General
+ * Public License ("GPL") version 2, in which case the provisions of the
+ * GPL apply INSTEAD OF those given above.
+ *
+ * The provided data structures and external interfaces from this code
+ * are not restricted to be used by modules with a GPL compatible license.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ */
+
+#ifndef AF_CAN_H
+#define AF_CAN_H
+
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/list.h>
+#include <linux/rcupdate.h>
+#include <linux/can.h>
+
+/* af_can rx dispatcher structures */
+
+struct receiver {
+ struct hlist_node list;
+ canid_t can_id;
+ canid_t mask;
+ unsigned long matches;
+ void (*func)(struct sk_buff *skb, void *data);
+ void *data;
+ char *ident;
+ struct sock *sk;
+ struct rcu_head rcu;
+};
+
+/* statistic structures */
+
+/* can be reset e.g. by can_init_stats() */
+struct can_pkg_stats {
+ unsigned long jiffies_init;
+
+ atomic_long_t rx_frames;
+ atomic_long_t tx_frames;
+ atomic_long_t matches;
+
+ unsigned long total_rx_rate;
+ unsigned long total_tx_rate;
+ unsigned long total_rx_match_ratio;
+
+ unsigned long current_rx_rate;
+ unsigned long current_tx_rate;
+ unsigned long current_rx_match_ratio;
+
+ unsigned long max_rx_rate;
+ unsigned long max_tx_rate;
+ unsigned long max_rx_match_ratio;
+
+ atomic_long_t rx_frames_delta;
+ atomic_long_t tx_frames_delta;
+ atomic_long_t matches_delta;
+};
+
+/* persistent statistics */
+struct can_rcv_lists_stats {
+ unsigned long stats_reset;
+ unsigned long user_reset;
+ unsigned long rcv_entries;
+ unsigned long rcv_entries_max;
+};
+
+/* function prototypes for the CAN networklayer procfs (proc.c) */
+void can_init_proc(struct net *net);
+void can_remove_proc(struct net *net);
+void can_stat_update(struct timer_list *t);
+
+#endif /* AF_CAN_H */
diff --git a/net/can/bcm.c b/net/can/bcm.c
new file mode 100644
index 000000000000..7eba8ae01a5b
--- /dev/null
+++ b/net/can/bcm.c
@@ -0,0 +1,1865 @@
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
+/*
+ * bcm.c - Broadcast Manager to filter/send (cyclic) CAN content
+ *
+ * Copyright (c) 2002-2017 Volkswagen Group Electronic Research
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of Volkswagen nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * Alternatively, provided that this notice is retained in full, this
+ * software may be distributed under the terms of the GNU General
+ * Public License ("GPL") version 2, in which case the provisions of the
+ * GPL apply INSTEAD OF those given above.
+ *
+ * The provided data structures and external interfaces from this code
+ * are not restricted to be used by modules with a GPL compatible license.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/hrtimer.h>
+#include <linux/list.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/uio.h>
+#include <linux/net.h>
+#include <linux/netdevice.h>
+#include <linux/socket.h>
+#include <linux/if_arp.h>
+#include <linux/skbuff.h>
+#include <linux/can.h>
+#include <linux/can/core.h>
+#include <linux/can/skb.h>
+#include <linux/can/bcm.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <net/sock.h>
+#include <net/net_namespace.h>
+
+/*
+ * To send multiple CAN frame content within TX_SETUP or to filter
+ * CAN messages with multiplex index within RX_SETUP, the number of
+ * different filters is limited to 256 due to the one byte index value.
+ */
+#define MAX_NFRAMES 256
+
+/* limit timers to 400 days for sending/timeouts */
+#define BCM_TIMER_SEC_MAX (400 * 24 * 60 * 60)
+
+/* use of last_frames[index].flags */
+#define RX_LOCAL 0x10 /* frame was created on the local host */
+#define RX_OWN 0x20 /* frame was sent via the socket it was received on */
+#define RX_RECV 0x40 /* received data for this element */
+#define RX_THR 0x80 /* element not been sent due to throttle feature */
+#define BCM_CAN_FLAGS_MASK 0x0F /* to clean private flags after usage */
+
+/* get best masking value for can_rx_register() for a given single can_id */
+#define REGMASK(id) ((id & CAN_EFF_FLAG) ? \
+ (CAN_EFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG) : \
+ (CAN_SFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG))
+
+MODULE_DESCRIPTION("PF_CAN broadcast manager protocol");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_AUTHOR("Oliver Hartkopp <oliver.hartkopp@volkswagen.de>");
+MODULE_ALIAS("can-proto-2");
+
+#define BCM_MIN_NAMELEN CAN_REQUIRED_SIZE(struct sockaddr_can, can_ifindex)
+
+/*
+ * easy access to the first 64 bit of can(fd)_frame payload. cp->data is
+ * 64 bit aligned so the offset has to be multiples of 8 which is ensured
+ * by the only callers in bcm_rx_cmp_to_index() bcm_rx_handler().
+ */
+static inline u64 get_u64(const struct canfd_frame *cp, int offset)
+{
+ return *(u64 *)(cp->data + offset);
+}
+
+struct bcm_op {
+ struct list_head list;
+ struct rcu_head rcu;
+ int ifindex;
+ canid_t can_id;
+ u32 flags;
+ unsigned long frames_abs, frames_filtered;
+ struct bcm_timeval ival1, ival2;
+ struct hrtimer timer, thrtimer;
+ ktime_t rx_stamp, kt_ival1, kt_ival2, kt_lastmsg;
+ int rx_ifindex;
+ int cfsiz;
+ u32 count;
+ u32 nframes;
+ u32 currframe;
+ /* void pointers to arrays of struct can[fd]_frame */
+ void *frames;
+ void *last_frames;
+ struct canfd_frame sframe;
+ struct canfd_frame last_sframe;
+ struct sock *sk;
+ struct net_device *rx_reg_dev;
+ spinlock_t bcm_tx_lock; /* protect currframe/count in runtime updates */
+};
+
+struct bcm_sock {
+ struct sock sk;
+ int bound;
+ int ifindex;
+ struct list_head notifier;
+ struct list_head rx_ops;
+ struct list_head tx_ops;
+ unsigned long dropped_usr_msgs;
+ struct proc_dir_entry *bcm_proc_read;
+ char procname [32]; /* inode number in decimal with \0 */
+};
+
+static LIST_HEAD(bcm_notifier_list);
+static DEFINE_SPINLOCK(bcm_notifier_lock);
+static struct bcm_sock *bcm_busy_notifier;
+
+/* Return pointer to store the extra msg flags for bcm_recvmsg().
+ * We use the space of one unsigned int beyond the 'struct sockaddr_can'
+ * in skb->cb.
+ */
+static inline unsigned int *bcm_flags(struct sk_buff *skb)
+{
+ /* return pointer after struct sockaddr_can */
+ return (unsigned int *)(&((struct sockaddr_can *)skb->cb)[1]);
+}
+
+static inline struct bcm_sock *bcm_sk(const struct sock *sk)
+{
+ return (struct bcm_sock *)sk;
+}
+
+static inline ktime_t bcm_timeval_to_ktime(struct bcm_timeval tv)
+{
+ return ktime_set(tv.tv_sec, tv.tv_usec * NSEC_PER_USEC);
+}
+
+/* check limitations for timeval provided by user */
+static bool bcm_is_invalid_tv(struct bcm_msg_head *msg_head)
+{
+ if ((msg_head->ival1.tv_sec < 0) ||
+ (msg_head->ival1.tv_sec > BCM_TIMER_SEC_MAX) ||
+ (msg_head->ival1.tv_usec < 0) ||
+ (msg_head->ival1.tv_usec >= USEC_PER_SEC) ||
+ (msg_head->ival2.tv_sec < 0) ||
+ (msg_head->ival2.tv_sec > BCM_TIMER_SEC_MAX) ||
+ (msg_head->ival2.tv_usec < 0) ||
+ (msg_head->ival2.tv_usec >= USEC_PER_SEC))
+ return true;
+
+ return false;
+}
+
+#define CFSIZ(flags) ((flags & CAN_FD_FRAME) ? CANFD_MTU : CAN_MTU)
+#define OPSIZ sizeof(struct bcm_op)
+#define MHSIZ sizeof(struct bcm_msg_head)
+
+/*
+ * procfs functions
+ */
+#if IS_ENABLED(CONFIG_PROC_FS)
+static char *bcm_proc_getifname(struct net *net, char *result, int ifindex)
+{
+ struct net_device *dev;
+
+ if (!ifindex)
+ return "any";
+
+ rcu_read_lock();
+ dev = dev_get_by_index_rcu(net, ifindex);
+ if (dev)
+ strcpy(result, dev->name);
+ else
+ strcpy(result, "???");
+ rcu_read_unlock();
+
+ return result;
+}
+
+static int bcm_proc_show(struct seq_file *m, void *v)
+{
+ char ifname[IFNAMSIZ];
+ struct net *net = m->private;
+ struct sock *sk = (struct sock *)pde_data(m->file->f_inode);
+ struct bcm_sock *bo = bcm_sk(sk);
+ struct bcm_op *op;
+
+ seq_printf(m, ">>> socket %pK", sk->sk_socket);
+ seq_printf(m, " / sk %pK", sk);
+ seq_printf(m, " / bo %pK", bo);
+ seq_printf(m, " / dropped %lu", bo->dropped_usr_msgs);
+ seq_printf(m, " / bound %s", bcm_proc_getifname(net, ifname, bo->ifindex));
+ seq_printf(m, " <<<\n");
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(op, &bo->rx_ops, list) {
+
+ unsigned long reduction;
+
+ /* print only active entries & prevent division by zero */
+ if (!op->frames_abs)
+ continue;
+
+ seq_printf(m, "rx_op: %03X %-5s ", op->can_id,
+ bcm_proc_getifname(net, ifname, op->ifindex));
+
+ if (op->flags & CAN_FD_FRAME)
+ seq_printf(m, "(%u)", op->nframes);
+ else
+ seq_printf(m, "[%u]", op->nframes);
+
+ seq_printf(m, "%c ", (op->flags & RX_CHECK_DLC) ? 'd' : ' ');
+
+ if (op->kt_ival1)
+ seq_printf(m, "timeo=%lld ",
+ (long long)ktime_to_us(op->kt_ival1));
+
+ if (op->kt_ival2)
+ seq_printf(m, "thr=%lld ",
+ (long long)ktime_to_us(op->kt_ival2));
+
+ seq_printf(m, "# recv %ld (%ld) => reduction: ",
+ op->frames_filtered, op->frames_abs);
+
+ reduction = 100 - (op->frames_filtered * 100) / op->frames_abs;
+
+ seq_printf(m, "%s%ld%%\n",
+ (reduction == 100) ? "near " : "", reduction);
+ }
+
+ list_for_each_entry(op, &bo->tx_ops, list) {
+
+ seq_printf(m, "tx_op: %03X %s ", op->can_id,
+ bcm_proc_getifname(net, ifname, op->ifindex));
+
+ if (op->flags & CAN_FD_FRAME)
+ seq_printf(m, "(%u) ", op->nframes);
+ else
+ seq_printf(m, "[%u] ", op->nframes);
+
+ if (op->kt_ival1)
+ seq_printf(m, "t1=%lld ",
+ (long long)ktime_to_us(op->kt_ival1));
+
+ if (op->kt_ival2)
+ seq_printf(m, "t2=%lld ",
+ (long long)ktime_to_us(op->kt_ival2));
+
+ seq_printf(m, "# sent %ld\n", op->frames_abs);
+ }
+ seq_putc(m, '\n');
+
+ rcu_read_unlock();
+
+ return 0;
+}
+#endif /* CONFIG_PROC_FS */
+
+/*
+ * bcm_can_tx - send the (next) CAN frame to the appropriate CAN interface
+ * of the given bcm tx op
+ */
+static void bcm_can_tx(struct bcm_op *op)
+{
+ struct sk_buff *skb;
+ struct net_device *dev;
+ struct canfd_frame *cf;
+ int err;
+
+ /* no target device? => exit */
+ if (!op->ifindex)
+ return;
+
+ /* read currframe under lock protection */
+ spin_lock_bh(&op->bcm_tx_lock);
+ cf = op->frames + op->cfsiz * op->currframe;
+ spin_unlock_bh(&op->bcm_tx_lock);
+
+ dev = dev_get_by_index(sock_net(op->sk), op->ifindex);
+ if (!dev) {
+ /* RFC: should this bcm_op remove itself here? */
+ return;
+ }
+
+ skb = alloc_skb(op->cfsiz + sizeof(struct can_skb_priv), gfp_any());
+ if (!skb)
+ goto out;
+
+ can_skb_reserve(skb);
+ can_skb_prv(skb)->ifindex = dev->ifindex;
+ can_skb_prv(skb)->skbcnt = 0;
+
+ skb_put_data(skb, cf, op->cfsiz);
+
+ /* send with loopback */
+ skb->dev = dev;
+ can_skb_set_owner(skb, op->sk);
+ err = can_send(skb, 1);
+
+ /* update currframe and count under lock protection */
+ spin_lock_bh(&op->bcm_tx_lock);
+
+ if (!err)
+ op->frames_abs++;
+
+ op->currframe++;
+
+ /* reached last frame? */
+ if (op->currframe >= op->nframes)
+ op->currframe = 0;
+
+ if (op->count > 0)
+ op->count--;
+
+ spin_unlock_bh(&op->bcm_tx_lock);
+out:
+ dev_put(dev);
+}
+
+/*
+ * bcm_send_to_user - send a BCM message to the userspace
+ * (consisting of bcm_msg_head + x CAN frames)
+ */
+static void bcm_send_to_user(struct bcm_op *op, struct bcm_msg_head *head,
+ struct canfd_frame *frames, int has_timestamp)
+{
+ struct sk_buff *skb;
+ struct canfd_frame *firstframe;
+ struct sockaddr_can *addr;
+ struct sock *sk = op->sk;
+ unsigned int datalen = head->nframes * op->cfsiz;
+ int err;
+ unsigned int *pflags;
+ enum skb_drop_reason reason;
+
+ skb = alloc_skb(sizeof(*head) + datalen, gfp_any());
+ if (!skb)
+ return;
+
+ skb_put_data(skb, head, sizeof(*head));
+
+ /* ensure space for sockaddr_can and msg flags */
+ sock_skb_cb_check_size(sizeof(struct sockaddr_can) +
+ sizeof(unsigned int));
+
+ /* initialize msg flags */
+ pflags = bcm_flags(skb);
+ *pflags = 0;
+
+ if (head->nframes) {
+ /* CAN frames starting here */
+ firstframe = (struct canfd_frame *)skb_tail_pointer(skb);
+
+ skb_put_data(skb, frames, datalen);
+
+ /*
+ * the BCM uses the flags-element of the canfd_frame
+ * structure for internal purposes. This is only
+ * relevant for updates that are generated by the
+ * BCM, where nframes is 1
+ */
+ if (head->nframes == 1) {
+ if (firstframe->flags & RX_LOCAL)
+ *pflags |= MSG_DONTROUTE;
+ if (firstframe->flags & RX_OWN)
+ *pflags |= MSG_CONFIRM;
+
+ firstframe->flags &= BCM_CAN_FLAGS_MASK;
+ }
+ }
+
+ if (has_timestamp) {
+ /* restore rx timestamp */
+ skb->tstamp = op->rx_stamp;
+ }
+
+ /*
+ * Put the datagram to the queue so that bcm_recvmsg() can
+ * get it from there. We need to pass the interface index to
+ * bcm_recvmsg(). We pass a whole struct sockaddr_can in skb->cb
+ * containing the interface index.
+ */
+
+ addr = (struct sockaddr_can *)skb->cb;
+ memset(addr, 0, sizeof(*addr));
+ addr->can_family = AF_CAN;
+ addr->can_ifindex = op->rx_ifindex;
+
+ err = sock_queue_rcv_skb_reason(sk, skb, &reason);
+ if (err < 0) {
+ struct bcm_sock *bo = bcm_sk(sk);
+
+ sk_skb_reason_drop(sk, skb, reason);
+ /* don't care about overflows in this statistic */
+ bo->dropped_usr_msgs++;
+ }
+}
+
+static bool bcm_tx_set_expiry(struct bcm_op *op, struct hrtimer *hrt)
+{
+ ktime_t ival;
+
+ if (op->kt_ival1 && op->count)
+ ival = op->kt_ival1;
+ else if (op->kt_ival2)
+ ival = op->kt_ival2;
+ else
+ return false;
+
+ hrtimer_set_expires(hrt, ktime_add(ktime_get(), ival));
+ return true;
+}
+
+static void bcm_tx_start_timer(struct bcm_op *op)
+{
+ if (bcm_tx_set_expiry(op, &op->timer))
+ hrtimer_start_expires(&op->timer, HRTIMER_MODE_ABS_SOFT);
+}
+
+/* bcm_tx_timeout_handler - performs cyclic CAN frame transmissions */
+static enum hrtimer_restart bcm_tx_timeout_handler(struct hrtimer *hrtimer)
+{
+ struct bcm_op *op = container_of(hrtimer, struct bcm_op, timer);
+ struct bcm_msg_head msg_head;
+
+ if (op->kt_ival1 && (op->count > 0)) {
+ bcm_can_tx(op);
+ if (!op->count && (op->flags & TX_COUNTEVT)) {
+
+ /* create notification to user */
+ memset(&msg_head, 0, sizeof(msg_head));
+ msg_head.opcode = TX_EXPIRED;
+ msg_head.flags = op->flags;
+ msg_head.count = op->count;
+ msg_head.ival1 = op->ival1;
+ msg_head.ival2 = op->ival2;
+ msg_head.can_id = op->can_id;
+ msg_head.nframes = 0;
+
+ bcm_send_to_user(op, &msg_head, NULL, 0);
+ }
+
+ } else if (op->kt_ival2) {
+ bcm_can_tx(op);
+ }
+
+ return bcm_tx_set_expiry(op, &op->timer) ?
+ HRTIMER_RESTART : HRTIMER_NORESTART;
+}
+
+/*
+ * bcm_rx_changed - create a RX_CHANGED notification due to changed content
+ */
+static void bcm_rx_changed(struct bcm_op *op, struct canfd_frame *data)
+{
+ struct bcm_msg_head head;
+
+ /* update statistics */
+ op->frames_filtered++;
+
+ /* prevent statistics overflow */
+ if (op->frames_filtered > ULONG_MAX/100)
+ op->frames_filtered = op->frames_abs = 0;
+
+ /* this element is not throttled anymore */
+ data->flags &= ~RX_THR;
+
+ memset(&head, 0, sizeof(head));
+ head.opcode = RX_CHANGED;
+ head.flags = op->flags;
+ head.count = op->count;
+ head.ival1 = op->ival1;
+ head.ival2 = op->ival2;
+ head.can_id = op->can_id;
+ head.nframes = 1;
+
+ bcm_send_to_user(op, &head, data, 1);
+}
+
+/*
+ * bcm_rx_update_and_send - process a detected relevant receive content change
+ * 1. update the last received data
+ * 2. send a notification to the user (if possible)
+ */
+static void bcm_rx_update_and_send(struct bcm_op *op,
+ struct canfd_frame *lastdata,
+ const struct canfd_frame *rxdata,
+ unsigned char traffic_flags)
+{
+ memcpy(lastdata, rxdata, op->cfsiz);
+
+ /* mark as used and throttled by default */
+ lastdata->flags |= (RX_RECV|RX_THR);
+
+ /* add own/local/remote traffic flags */
+ lastdata->flags |= traffic_flags;
+
+ /* throttling mode inactive ? */
+ if (!op->kt_ival2) {
+ /* send RX_CHANGED to the user immediately */
+ bcm_rx_changed(op, lastdata);
+ return;
+ }
+
+ /* with active throttling timer we are just done here */
+ if (hrtimer_active(&op->thrtimer))
+ return;
+
+ /* first reception with enabled throttling mode */
+ if (!op->kt_lastmsg)
+ goto rx_changed_settime;
+
+ /* got a second frame inside a potential throttle period? */
+ if (ktime_us_delta(ktime_get(), op->kt_lastmsg) <
+ ktime_to_us(op->kt_ival2)) {
+ /* do not send the saved data - only start throttle timer */
+ hrtimer_start(&op->thrtimer,
+ ktime_add(op->kt_lastmsg, op->kt_ival2),
+ HRTIMER_MODE_ABS_SOFT);
+ return;
+ }
+
+ /* the gap was that big, that throttling was not needed here */
+rx_changed_settime:
+ bcm_rx_changed(op, lastdata);
+ op->kt_lastmsg = ktime_get();
+}
+
+/*
+ * bcm_rx_cmp_to_index - (bit)compares the currently received data to formerly
+ * received data stored in op->last_frames[]
+ */
+static void bcm_rx_cmp_to_index(struct bcm_op *op, unsigned int index,
+ const struct canfd_frame *rxdata,
+ unsigned char traffic_flags)
+{
+ struct canfd_frame *cf = op->frames + op->cfsiz * index;
+ struct canfd_frame *lcf = op->last_frames + op->cfsiz * index;
+ int i;
+
+ /*
+ * no one uses the MSBs of flags for comparison,
+ * so we use it here to detect the first time of reception
+ */
+
+ if (!(lcf->flags & RX_RECV)) {
+ /* received data for the first time => send update to user */
+ bcm_rx_update_and_send(op, lcf, rxdata, traffic_flags);
+ return;
+ }
+
+ /* do a real check in CAN frame data section */
+ for (i = 0; i < rxdata->len; i += 8) {
+ if ((get_u64(cf, i) & get_u64(rxdata, i)) !=
+ (get_u64(cf, i) & get_u64(lcf, i))) {
+ bcm_rx_update_and_send(op, lcf, rxdata, traffic_flags);
+ return;
+ }
+ }
+
+ if (op->flags & RX_CHECK_DLC) {
+ /* do a real check in CAN frame length */
+ if (rxdata->len != lcf->len) {
+ bcm_rx_update_and_send(op, lcf, rxdata, traffic_flags);
+ return;
+ }
+ }
+}
+
+/*
+ * bcm_rx_starttimer - enable timeout monitoring for CAN frame reception
+ */
+static void bcm_rx_starttimer(struct bcm_op *op)
+{
+ if (op->flags & RX_NO_AUTOTIMER)
+ return;
+
+ if (op->kt_ival1)
+ hrtimer_start(&op->timer, op->kt_ival1, HRTIMER_MODE_REL_SOFT);
+}
+
+/* bcm_rx_timeout_handler - when the (cyclic) CAN frame reception timed out */
+static enum hrtimer_restart bcm_rx_timeout_handler(struct hrtimer *hrtimer)
+{
+ struct bcm_op *op = container_of(hrtimer, struct bcm_op, timer);
+ struct bcm_msg_head msg_head;
+
+ /* if user wants to be informed, when cyclic CAN-Messages come back */
+ if ((op->flags & RX_ANNOUNCE_RESUME) && op->last_frames) {
+ /* clear received CAN frames to indicate 'nothing received' */
+ memset(op->last_frames, 0, op->nframes * op->cfsiz);
+ }
+
+ /* create notification to user */
+ memset(&msg_head, 0, sizeof(msg_head));
+ msg_head.opcode = RX_TIMEOUT;
+ msg_head.flags = op->flags;
+ msg_head.count = op->count;
+ msg_head.ival1 = op->ival1;
+ msg_head.ival2 = op->ival2;
+ msg_head.can_id = op->can_id;
+ msg_head.nframes = 0;
+
+ bcm_send_to_user(op, &msg_head, NULL, 0);
+
+ return HRTIMER_NORESTART;
+}
+
+/*
+ * bcm_rx_do_flush - helper for bcm_rx_thr_flush
+ */
+static inline int bcm_rx_do_flush(struct bcm_op *op, unsigned int index)
+{
+ struct canfd_frame *lcf = op->last_frames + op->cfsiz * index;
+
+ if ((op->last_frames) && (lcf->flags & RX_THR)) {
+ bcm_rx_changed(op, lcf);
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * bcm_rx_thr_flush - Check for throttled data and send it to the userspace
+ */
+static int bcm_rx_thr_flush(struct bcm_op *op)
+{
+ int updated = 0;
+
+ if (op->nframes > 1) {
+ unsigned int i;
+
+ /* for MUX filter we start at index 1 */
+ for (i = 1; i < op->nframes; i++)
+ updated += bcm_rx_do_flush(op, i);
+
+ } else {
+ /* for RX_FILTER_ID and simple filter */
+ updated += bcm_rx_do_flush(op, 0);
+ }
+
+ return updated;
+}
+
+/*
+ * bcm_rx_thr_handler - the time for blocked content updates is over now:
+ * Check for throttled data and send it to the userspace
+ */
+static enum hrtimer_restart bcm_rx_thr_handler(struct hrtimer *hrtimer)
+{
+ struct bcm_op *op = container_of(hrtimer, struct bcm_op, thrtimer);
+
+ if (bcm_rx_thr_flush(op)) {
+ hrtimer_forward_now(hrtimer, op->kt_ival2);
+ return HRTIMER_RESTART;
+ } else {
+ /* rearm throttle handling */
+ op->kt_lastmsg = 0;
+ return HRTIMER_NORESTART;
+ }
+}
+
+/*
+ * bcm_rx_handler - handle a CAN frame reception
+ */
+static void bcm_rx_handler(struct sk_buff *skb, void *data)
+{
+ struct bcm_op *op = (struct bcm_op *)data;
+ const struct canfd_frame *rxframe = (struct canfd_frame *)skb->data;
+ unsigned int i;
+ unsigned char traffic_flags;
+
+ if (op->can_id != rxframe->can_id)
+ return;
+
+ /* make sure to handle the correct frame type (CAN / CAN FD) */
+ if (op->flags & CAN_FD_FRAME) {
+ if (!can_is_canfd_skb(skb))
+ return;
+ } else {
+ if (!can_is_can_skb(skb))
+ return;
+ }
+
+ /* disable timeout */
+ hrtimer_cancel(&op->timer);
+
+ /* save rx timestamp */
+ op->rx_stamp = skb->tstamp;
+ /* save originator for recvfrom() */
+ op->rx_ifindex = skb->dev->ifindex;
+ /* update statistics */
+ op->frames_abs++;
+
+ if (op->flags & RX_RTR_FRAME) {
+ /* send reply for RTR-request (placed in op->frames[0]) */
+ bcm_can_tx(op);
+ return;
+ }
+
+ /* compute flags to distinguish between own/local/remote CAN traffic */
+ traffic_flags = 0;
+ if (skb->sk) {
+ traffic_flags |= RX_LOCAL;
+ if (skb->sk == op->sk)
+ traffic_flags |= RX_OWN;
+ }
+
+ if (op->flags & RX_FILTER_ID) {
+ /* the easiest case */
+ bcm_rx_update_and_send(op, op->last_frames, rxframe,
+ traffic_flags);
+ goto rx_starttimer;
+ }
+
+ if (op->nframes == 1) {
+ /* simple compare with index 0 */
+ bcm_rx_cmp_to_index(op, 0, rxframe, traffic_flags);
+ goto rx_starttimer;
+ }
+
+ if (op->nframes > 1) {
+ /*
+ * multiplex compare
+ *
+ * find the first multiplex mask that fits.
+ * Remark: The MUX-mask is stored in index 0 - but only the
+ * first 64 bits of the frame data[] are relevant (CAN FD)
+ */
+
+ for (i = 1; i < op->nframes; i++) {
+ if ((get_u64(op->frames, 0) & get_u64(rxframe, 0)) ==
+ (get_u64(op->frames, 0) &
+ get_u64(op->frames + op->cfsiz * i, 0))) {
+ bcm_rx_cmp_to_index(op, i, rxframe,
+ traffic_flags);
+ break;
+ }
+ }
+ }
+
+rx_starttimer:
+ bcm_rx_starttimer(op);
+}
+
+/*
+ * helpers for bcm_op handling: find & delete bcm [rx|tx] op elements
+ */
+static struct bcm_op *bcm_find_op(struct list_head *ops,
+ struct bcm_msg_head *mh, int ifindex)
+{
+ struct bcm_op *op;
+
+ list_for_each_entry(op, ops, list) {
+ if ((op->can_id == mh->can_id) && (op->ifindex == ifindex) &&
+ (op->flags & CAN_FD_FRAME) == (mh->flags & CAN_FD_FRAME))
+ return op;
+ }
+
+ return NULL;
+}
+
+static void bcm_free_op_rcu(struct rcu_head *rcu_head)
+{
+ struct bcm_op *op = container_of(rcu_head, struct bcm_op, rcu);
+
+ if ((op->frames) && (op->frames != &op->sframe))
+ kfree(op->frames);
+
+ if ((op->last_frames) && (op->last_frames != &op->last_sframe))
+ kfree(op->last_frames);
+
+ kfree(op);
+}
+
+static void bcm_remove_op(struct bcm_op *op)
+{
+ hrtimer_cancel(&op->timer);
+ hrtimer_cancel(&op->thrtimer);
+
+ call_rcu(&op->rcu, bcm_free_op_rcu);
+}
+
+static void bcm_rx_unreg(struct net_device *dev, struct bcm_op *op)
+{
+ if (op->rx_reg_dev == dev) {
+ can_rx_unregister(dev_net(dev), dev, op->can_id,
+ REGMASK(op->can_id), bcm_rx_handler, op);
+
+ /* mark as removed subscription */
+ op->rx_reg_dev = NULL;
+ } else
+ printk(KERN_ERR "can-bcm: bcm_rx_unreg: registered device "
+ "mismatch %p %p\n", op->rx_reg_dev, dev);
+}
+
+/*
+ * bcm_delete_rx_op - find and remove a rx op (returns number of removed ops)
+ */
+static int bcm_delete_rx_op(struct list_head *ops, struct bcm_msg_head *mh,
+ int ifindex)
+{
+ struct bcm_op *op, *n;
+
+ list_for_each_entry_safe(op, n, ops, list) {
+ if ((op->can_id == mh->can_id) && (op->ifindex == ifindex) &&
+ (op->flags & CAN_FD_FRAME) == (mh->flags & CAN_FD_FRAME)) {
+
+ /* disable automatic timer on frame reception */
+ op->flags |= RX_NO_AUTOTIMER;
+
+ /*
+ * Don't care if we're bound or not (due to netdev
+ * problems) can_rx_unregister() is always a save
+ * thing to do here.
+ */
+ if (op->ifindex) {
+ /*
+ * Only remove subscriptions that had not
+ * been removed due to NETDEV_UNREGISTER
+ * in bcm_notifier()
+ */
+ if (op->rx_reg_dev) {
+ struct net_device *dev;
+
+ dev = dev_get_by_index(sock_net(op->sk),
+ op->ifindex);
+ if (dev) {
+ bcm_rx_unreg(dev, op);
+ dev_put(dev);
+ }
+ }
+ } else
+ can_rx_unregister(sock_net(op->sk), NULL,
+ op->can_id,
+ REGMASK(op->can_id),
+ bcm_rx_handler, op);
+
+ list_del_rcu(&op->list);
+ bcm_remove_op(op);
+ return 1; /* done */
+ }
+ }
+
+ return 0; /* not found */
+}
+
+/*
+ * bcm_delete_tx_op - find and remove a tx op (returns number of removed ops)
+ */
+static int bcm_delete_tx_op(struct list_head *ops, struct bcm_msg_head *mh,
+ int ifindex)
+{
+ struct bcm_op *op, *n;
+
+ list_for_each_entry_safe(op, n, ops, list) {
+ if ((op->can_id == mh->can_id) && (op->ifindex == ifindex) &&
+ (op->flags & CAN_FD_FRAME) == (mh->flags & CAN_FD_FRAME)) {
+ list_del_rcu(&op->list);
+ bcm_remove_op(op);
+ return 1; /* done */
+ }
+ }
+
+ return 0; /* not found */
+}
+
+/*
+ * bcm_read_op - read out a bcm_op and send it to the user (for bcm_sendmsg)
+ */
+static int bcm_read_op(struct list_head *ops, struct bcm_msg_head *msg_head,
+ int ifindex)
+{
+ struct bcm_op *op = bcm_find_op(ops, msg_head, ifindex);
+
+ if (!op)
+ return -EINVAL;
+
+ /* put current values into msg_head */
+ msg_head->flags = op->flags;
+ msg_head->count = op->count;
+ msg_head->ival1 = op->ival1;
+ msg_head->ival2 = op->ival2;
+ msg_head->nframes = op->nframes;
+
+ bcm_send_to_user(op, msg_head, op->frames, 0);
+
+ return MHSIZ;
+}
+
+/*
+ * bcm_tx_setup - create or update a bcm tx op (for bcm_sendmsg)
+ */
+static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
+ int ifindex, struct sock *sk)
+{
+ struct bcm_sock *bo = bcm_sk(sk);
+ struct bcm_op *op;
+ struct canfd_frame *cf;
+ unsigned int i;
+ int err;
+
+ /* we need a real device to send frames */
+ if (!ifindex)
+ return -ENODEV;
+
+ /* check nframes boundaries - we need at least one CAN frame */
+ if (msg_head->nframes < 1 || msg_head->nframes > MAX_NFRAMES)
+ return -EINVAL;
+
+ /* check timeval limitations */
+ if ((msg_head->flags & SETTIMER) && bcm_is_invalid_tv(msg_head))
+ return -EINVAL;
+
+ /* check the given can_id */
+ op = bcm_find_op(&bo->tx_ops, msg_head, ifindex);
+ if (op) {
+ /* update existing BCM operation */
+
+ /*
+ * Do we need more space for the CAN frames than currently
+ * allocated? -> This is a _really_ unusual use-case and
+ * therefore (complexity / locking) it is not supported.
+ */
+ if (msg_head->nframes > op->nframes)
+ return -E2BIG;
+
+ /* update CAN frames content */
+ for (i = 0; i < msg_head->nframes; i++) {
+
+ cf = op->frames + op->cfsiz * i;
+ err = memcpy_from_msg((u8 *)cf, msg, op->cfsiz);
+
+ if (op->flags & CAN_FD_FRAME) {
+ if (cf->len > 64)
+ err = -EINVAL;
+ } else {
+ if (cf->len > 8)
+ err = -EINVAL;
+ }
+
+ if (err < 0)
+ return err;
+
+ if (msg_head->flags & TX_CP_CAN_ID) {
+ /* copy can_id into frame */
+ cf->can_id = msg_head->can_id;
+ }
+ }
+ op->flags = msg_head->flags;
+
+ /* only lock for unlikely count/nframes/currframe changes */
+ if (op->nframes != msg_head->nframes ||
+ op->flags & TX_RESET_MULTI_IDX ||
+ op->flags & SETTIMER) {
+
+ spin_lock_bh(&op->bcm_tx_lock);
+
+ if (op->nframes != msg_head->nframes ||
+ op->flags & TX_RESET_MULTI_IDX) {
+ /* potentially update changed nframes */
+ op->nframes = msg_head->nframes;
+ /* restart multiple frame transmission */
+ op->currframe = 0;
+ }
+
+ if (op->flags & SETTIMER)
+ op->count = msg_head->count;
+
+ spin_unlock_bh(&op->bcm_tx_lock);
+ }
+
+ } else {
+ /* insert new BCM operation for the given can_id */
+
+ op = kzalloc(OPSIZ, GFP_KERNEL);
+ if (!op)
+ return -ENOMEM;
+
+ spin_lock_init(&op->bcm_tx_lock);
+ op->can_id = msg_head->can_id;
+ op->cfsiz = CFSIZ(msg_head->flags);
+ op->flags = msg_head->flags;
+ op->nframes = msg_head->nframes;
+
+ if (op->flags & SETTIMER)
+ op->count = msg_head->count;
+
+ /* create array for CAN frames and copy the data */
+ if (msg_head->nframes > 1) {
+ op->frames = kmalloc_array(msg_head->nframes,
+ op->cfsiz,
+ GFP_KERNEL);
+ if (!op->frames) {
+ kfree(op);
+ return -ENOMEM;
+ }
+ } else
+ op->frames = &op->sframe;
+
+ for (i = 0; i < msg_head->nframes; i++) {
+
+ cf = op->frames + op->cfsiz * i;
+ err = memcpy_from_msg((u8 *)cf, msg, op->cfsiz);
+ if (err < 0)
+ goto free_op;
+
+ if (op->flags & CAN_FD_FRAME) {
+ if (cf->len > 64)
+ err = -EINVAL;
+ } else {
+ if (cf->len > 8)
+ err = -EINVAL;
+ }
+
+ if (err < 0)
+ goto free_op;
+
+ if (msg_head->flags & TX_CP_CAN_ID) {
+ /* copy can_id into frame */
+ cf->can_id = msg_head->can_id;
+ }
+ }
+
+ /* tx_ops never compare with previous received messages */
+ op->last_frames = NULL;
+
+ /* bcm_can_tx / bcm_tx_timeout_handler needs this */
+ op->sk = sk;
+ op->ifindex = ifindex;
+
+ /* initialize uninitialized (kzalloc) structure */
+ hrtimer_setup(&op->timer, bcm_tx_timeout_handler, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL_SOFT);
+
+ /* currently unused in tx_ops */
+ hrtimer_setup(&op->thrtimer, hrtimer_dummy_timeout, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL_SOFT);
+
+ /* add this bcm_op to the list of the tx_ops */
+ list_add(&op->list, &bo->tx_ops);
+
+ } /* if ((op = bcm_find_op(&bo->tx_ops, msg_head->can_id, ifindex))) */
+
+ if (op->flags & SETTIMER) {
+ /* set timer values */
+ op->ival1 = msg_head->ival1;
+ op->ival2 = msg_head->ival2;
+ op->kt_ival1 = bcm_timeval_to_ktime(msg_head->ival1);
+ op->kt_ival2 = bcm_timeval_to_ktime(msg_head->ival2);
+
+ /* disable an active timer due to zero values? */
+ if (!op->kt_ival1 && !op->kt_ival2)
+ hrtimer_cancel(&op->timer);
+ }
+
+ if (op->flags & STARTTIMER) {
+ hrtimer_cancel(&op->timer);
+ /* spec: send CAN frame when starting timer */
+ op->flags |= TX_ANNOUNCE;
+ }
+
+ if (op->flags & TX_ANNOUNCE)
+ bcm_can_tx(op);
+
+ if (op->flags & STARTTIMER)
+ bcm_tx_start_timer(op);
+
+ return msg_head->nframes * op->cfsiz + MHSIZ;
+
+free_op:
+ if (op->frames != &op->sframe)
+ kfree(op->frames);
+ kfree(op);
+ return err;
+}
+
+/*
+ * bcm_rx_setup - create or update a bcm rx op (for bcm_sendmsg)
+ */
+static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
+ int ifindex, struct sock *sk)
+{
+ struct bcm_sock *bo = bcm_sk(sk);
+ struct bcm_op *op;
+ int do_rx_register;
+ int err = 0;
+
+ if ((msg_head->flags & RX_FILTER_ID) || (!(msg_head->nframes))) {
+ /* be robust against wrong usage ... */
+ msg_head->flags |= RX_FILTER_ID;
+ /* ignore trailing garbage */
+ msg_head->nframes = 0;
+ }
+
+ /* the first element contains the mux-mask => MAX_NFRAMES + 1 */
+ if (msg_head->nframes > MAX_NFRAMES + 1)
+ return -EINVAL;
+
+ if ((msg_head->flags & RX_RTR_FRAME) &&
+ ((msg_head->nframes != 1) ||
+ (!(msg_head->can_id & CAN_RTR_FLAG))))
+ return -EINVAL;
+
+ /* check timeval limitations */
+ if ((msg_head->flags & SETTIMER) && bcm_is_invalid_tv(msg_head))
+ return -EINVAL;
+
+ /* check the given can_id */
+ op = bcm_find_op(&bo->rx_ops, msg_head, ifindex);
+ if (op) {
+ /* update existing BCM operation */
+
+ /*
+ * Do we need more space for the CAN frames than currently
+ * allocated? -> This is a _really_ unusual use-case and
+ * therefore (complexity / locking) it is not supported.
+ */
+ if (msg_head->nframes > op->nframes)
+ return -E2BIG;
+
+ if (msg_head->nframes) {
+ /* update CAN frames content */
+ err = memcpy_from_msg(op->frames, msg,
+ msg_head->nframes * op->cfsiz);
+ if (err < 0)
+ return err;
+
+ /* clear last_frames to indicate 'nothing received' */
+ memset(op->last_frames, 0, msg_head->nframes * op->cfsiz);
+ }
+
+ op->nframes = msg_head->nframes;
+ op->flags = msg_head->flags;
+
+ /* Only an update -> do not call can_rx_register() */
+ do_rx_register = 0;
+
+ } else {
+ /* insert new BCM operation for the given can_id */
+ op = kzalloc(OPSIZ, GFP_KERNEL);
+ if (!op)
+ return -ENOMEM;
+
+ op->can_id = msg_head->can_id;
+ op->nframes = msg_head->nframes;
+ op->cfsiz = CFSIZ(msg_head->flags);
+ op->flags = msg_head->flags;
+
+ if (msg_head->nframes > 1) {
+ /* create array for CAN frames and copy the data */
+ op->frames = kmalloc_array(msg_head->nframes,
+ op->cfsiz,
+ GFP_KERNEL);
+ if (!op->frames) {
+ kfree(op);
+ return -ENOMEM;
+ }
+
+ /* create and init array for received CAN frames */
+ op->last_frames = kcalloc(msg_head->nframes,
+ op->cfsiz,
+ GFP_KERNEL);
+ if (!op->last_frames) {
+ kfree(op->frames);
+ kfree(op);
+ return -ENOMEM;
+ }
+
+ } else {
+ op->frames = &op->sframe;
+ op->last_frames = &op->last_sframe;
+ }
+
+ if (msg_head->nframes) {
+ err = memcpy_from_msg(op->frames, msg,
+ msg_head->nframes * op->cfsiz);
+ if (err < 0) {
+ if (op->frames != &op->sframe)
+ kfree(op->frames);
+ if (op->last_frames != &op->last_sframe)
+ kfree(op->last_frames);
+ kfree(op);
+ return err;
+ }
+ }
+
+ /* bcm_can_tx / bcm_tx_timeout_handler needs this */
+ op->sk = sk;
+ op->ifindex = ifindex;
+
+ /* ifindex for timeout events w/o previous frame reception */
+ op->rx_ifindex = ifindex;
+
+ /* initialize uninitialized (kzalloc) structure */
+ hrtimer_setup(&op->timer, bcm_rx_timeout_handler, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL_SOFT);
+ hrtimer_setup(&op->thrtimer, bcm_rx_thr_handler, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL_SOFT);
+
+ /* add this bcm_op to the list of the rx_ops */
+ list_add(&op->list, &bo->rx_ops);
+
+ /* call can_rx_register() */
+ do_rx_register = 1;
+
+ } /* if ((op = bcm_find_op(&bo->rx_ops, msg_head->can_id, ifindex))) */
+
+ /* check flags */
+
+ if (op->flags & RX_RTR_FRAME) {
+ struct canfd_frame *frame0 = op->frames;
+
+ /* no timers in RTR-mode */
+ hrtimer_cancel(&op->thrtimer);
+ hrtimer_cancel(&op->timer);
+
+ /*
+ * funny feature in RX(!)_SETUP only for RTR-mode:
+ * copy can_id into frame BUT without RTR-flag to
+ * prevent a full-load-loopback-test ... ;-]
+ */
+ if ((op->flags & TX_CP_CAN_ID) ||
+ (frame0->can_id == op->can_id))
+ frame0->can_id = op->can_id & ~CAN_RTR_FLAG;
+
+ } else {
+ if (op->flags & SETTIMER) {
+
+ /* set timer value */
+ op->ival1 = msg_head->ival1;
+ op->ival2 = msg_head->ival2;
+ op->kt_ival1 = bcm_timeval_to_ktime(msg_head->ival1);
+ op->kt_ival2 = bcm_timeval_to_ktime(msg_head->ival2);
+
+ /* disable an active timer due to zero value? */
+ if (!op->kt_ival1)
+ hrtimer_cancel(&op->timer);
+
+ /*
+ * In any case cancel the throttle timer, flush
+ * potentially blocked msgs and reset throttle handling
+ */
+ op->kt_lastmsg = 0;
+ hrtimer_cancel(&op->thrtimer);
+ bcm_rx_thr_flush(op);
+ }
+
+ if ((op->flags & STARTTIMER) && op->kt_ival1)
+ hrtimer_start(&op->timer, op->kt_ival1,
+ HRTIMER_MODE_REL_SOFT);
+ }
+
+ /* now we can register for can_ids, if we added a new bcm_op */
+ if (do_rx_register) {
+ if (ifindex) {
+ struct net_device *dev;
+
+ dev = dev_get_by_index(sock_net(sk), ifindex);
+ if (dev) {
+ err = can_rx_register(sock_net(sk), dev,
+ op->can_id,
+ REGMASK(op->can_id),
+ bcm_rx_handler, op,
+ "bcm", sk);
+
+ op->rx_reg_dev = dev;
+ dev_put(dev);
+ }
+
+ } else
+ err = can_rx_register(sock_net(sk), NULL, op->can_id,
+ REGMASK(op->can_id),
+ bcm_rx_handler, op, "bcm", sk);
+ if (err) {
+ /* this bcm rx op is broken -> remove it */
+ list_del_rcu(&op->list);
+ bcm_remove_op(op);
+ return err;
+ }
+ }
+
+ return msg_head->nframes * op->cfsiz + MHSIZ;
+}
+
+/*
+ * bcm_tx_send - send a single CAN frame to the CAN interface (for bcm_sendmsg)
+ */
+static int bcm_tx_send(struct msghdr *msg, int ifindex, struct sock *sk,
+ int cfsiz)
+{
+ struct sk_buff *skb;
+ struct net_device *dev;
+ int err;
+
+ /* we need a real device to send frames */
+ if (!ifindex)
+ return -ENODEV;
+
+ skb = alloc_skb(cfsiz + sizeof(struct can_skb_priv), GFP_KERNEL);
+ if (!skb)
+ return -ENOMEM;
+
+ can_skb_reserve(skb);
+
+ err = memcpy_from_msg(skb_put(skb, cfsiz), msg, cfsiz);
+ if (err < 0) {
+ kfree_skb(skb);
+ return err;
+ }
+
+ dev = dev_get_by_index(sock_net(sk), ifindex);
+ if (!dev) {
+ kfree_skb(skb);
+ return -ENODEV;
+ }
+
+ can_skb_prv(skb)->ifindex = dev->ifindex;
+ can_skb_prv(skb)->skbcnt = 0;
+ skb->dev = dev;
+ can_skb_set_owner(skb, sk);
+ err = can_send(skb, 1); /* send with loopback */
+ dev_put(dev);
+
+ if (err)
+ return err;
+
+ return cfsiz + MHSIZ;
+}
+
+/*
+ * bcm_sendmsg - process BCM commands (opcodes) from the userspace
+ */
+static int bcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
+{
+ struct sock *sk = sock->sk;
+ struct bcm_sock *bo = bcm_sk(sk);
+ int ifindex = bo->ifindex; /* default ifindex for this bcm_op */
+ struct bcm_msg_head msg_head;
+ int cfsiz;
+ int ret; /* read bytes or error codes as return value */
+
+ if (!bo->bound)
+ return -ENOTCONN;
+
+ /* check for valid message length from userspace */
+ if (size < MHSIZ)
+ return -EINVAL;
+
+ /* read message head information */
+ ret = memcpy_from_msg((u8 *)&msg_head, msg, MHSIZ);
+ if (ret < 0)
+ return ret;
+
+ cfsiz = CFSIZ(msg_head.flags);
+ if ((size - MHSIZ) % cfsiz)
+ return -EINVAL;
+
+ /* check for alternative ifindex for this bcm_op */
+
+ if (!ifindex && msg->msg_name) {
+ /* no bound device as default => check msg_name */
+ DECLARE_SOCKADDR(struct sockaddr_can *, addr, msg->msg_name);
+
+ if (msg->msg_namelen < BCM_MIN_NAMELEN)
+ return -EINVAL;
+
+ if (addr->can_family != AF_CAN)
+ return -EINVAL;
+
+ /* ifindex from sendto() */
+ ifindex = addr->can_ifindex;
+
+ if (ifindex) {
+ struct net_device *dev;
+
+ dev = dev_get_by_index(sock_net(sk), ifindex);
+ if (!dev)
+ return -ENODEV;
+
+ if (dev->type != ARPHRD_CAN) {
+ dev_put(dev);
+ return -ENODEV;
+ }
+
+ dev_put(dev);
+ }
+ }
+
+ lock_sock(sk);
+
+ switch (msg_head.opcode) {
+
+ case TX_SETUP:
+ ret = bcm_tx_setup(&msg_head, msg, ifindex, sk);
+ break;
+
+ case RX_SETUP:
+ ret = bcm_rx_setup(&msg_head, msg, ifindex, sk);
+ break;
+
+ case TX_DELETE:
+ if (bcm_delete_tx_op(&bo->tx_ops, &msg_head, ifindex))
+ ret = MHSIZ;
+ else
+ ret = -EINVAL;
+ break;
+
+ case RX_DELETE:
+ if (bcm_delete_rx_op(&bo->rx_ops, &msg_head, ifindex))
+ ret = MHSIZ;
+ else
+ ret = -EINVAL;
+ break;
+
+ case TX_READ:
+ /* reuse msg_head for the reply to TX_READ */
+ msg_head.opcode = TX_STATUS;
+ ret = bcm_read_op(&bo->tx_ops, &msg_head, ifindex);
+ break;
+
+ case RX_READ:
+ /* reuse msg_head for the reply to RX_READ */
+ msg_head.opcode = RX_STATUS;
+ ret = bcm_read_op(&bo->rx_ops, &msg_head, ifindex);
+ break;
+
+ case TX_SEND:
+ /* we need exactly one CAN frame behind the msg head */
+ if ((msg_head.nframes != 1) || (size != cfsiz + MHSIZ))
+ ret = -EINVAL;
+ else
+ ret = bcm_tx_send(msg, ifindex, sk, cfsiz);
+ break;
+
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ release_sock(sk);
+
+ return ret;
+}
+
+/*
+ * notification handler for netdevice status changes
+ */
+static void bcm_notify(struct bcm_sock *bo, unsigned long msg,
+ struct net_device *dev)
+{
+ struct sock *sk = &bo->sk;
+ struct bcm_op *op;
+ int notify_enodev = 0;
+
+ if (!net_eq(dev_net(dev), sock_net(sk)))
+ return;
+
+ switch (msg) {
+
+ case NETDEV_UNREGISTER:
+ lock_sock(sk);
+
+ /* remove device specific receive entries */
+ list_for_each_entry(op, &bo->rx_ops, list)
+ if (op->rx_reg_dev == dev)
+ bcm_rx_unreg(dev, op);
+
+ /* remove device reference, if this is our bound device */
+ if (bo->bound && bo->ifindex == dev->ifindex) {
+#if IS_ENABLED(CONFIG_PROC_FS)
+ if (sock_net(sk)->can.bcmproc_dir && bo->bcm_proc_read) {
+ remove_proc_entry(bo->procname, sock_net(sk)->can.bcmproc_dir);
+ bo->bcm_proc_read = NULL;
+ }
+#endif
+ bo->bound = 0;
+ bo->ifindex = 0;
+ notify_enodev = 1;
+ }
+
+ release_sock(sk);
+
+ if (notify_enodev) {
+ sk->sk_err = ENODEV;
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk_error_report(sk);
+ }
+ break;
+
+ case NETDEV_DOWN:
+ if (bo->bound && bo->ifindex == dev->ifindex) {
+ sk->sk_err = ENETDOWN;
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk_error_report(sk);
+ }
+ }
+}
+
+static int bcm_notifier(struct notifier_block *nb, unsigned long msg,
+ void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+
+ if (dev->type != ARPHRD_CAN)
+ return NOTIFY_DONE;
+ if (msg != NETDEV_UNREGISTER && msg != NETDEV_DOWN)
+ return NOTIFY_DONE;
+ if (unlikely(bcm_busy_notifier)) /* Check for reentrant bug. */
+ return NOTIFY_DONE;
+
+ spin_lock(&bcm_notifier_lock);
+ list_for_each_entry(bcm_busy_notifier, &bcm_notifier_list, notifier) {
+ spin_unlock(&bcm_notifier_lock);
+ bcm_notify(bcm_busy_notifier, msg, dev);
+ spin_lock(&bcm_notifier_lock);
+ }
+ bcm_busy_notifier = NULL;
+ spin_unlock(&bcm_notifier_lock);
+ return NOTIFY_DONE;
+}
+
+/*
+ * initial settings for all BCM sockets to be set at socket creation time
+ */
+static int bcm_init(struct sock *sk)
+{
+ struct bcm_sock *bo = bcm_sk(sk);
+
+ bo->bound = 0;
+ bo->ifindex = 0;
+ bo->dropped_usr_msgs = 0;
+ bo->bcm_proc_read = NULL;
+
+ INIT_LIST_HEAD(&bo->tx_ops);
+ INIT_LIST_HEAD(&bo->rx_ops);
+
+ /* set notifier */
+ spin_lock(&bcm_notifier_lock);
+ list_add_tail(&bo->notifier, &bcm_notifier_list);
+ spin_unlock(&bcm_notifier_lock);
+
+ return 0;
+}
+
+/*
+ * standard socket functions
+ */
+static int bcm_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+ struct net *net;
+ struct bcm_sock *bo;
+ struct bcm_op *op, *next;
+
+ if (!sk)
+ return 0;
+
+ net = sock_net(sk);
+ bo = bcm_sk(sk);
+
+ /* remove bcm_ops, timer, rx_unregister(), etc. */
+
+ spin_lock(&bcm_notifier_lock);
+ while (bcm_busy_notifier == bo) {
+ spin_unlock(&bcm_notifier_lock);
+ schedule_timeout_uninterruptible(1);
+ spin_lock(&bcm_notifier_lock);
+ }
+ list_del(&bo->notifier);
+ spin_unlock(&bcm_notifier_lock);
+
+ lock_sock(sk);
+
+#if IS_ENABLED(CONFIG_PROC_FS)
+ /* remove procfs entry */
+ if (net->can.bcmproc_dir && bo->bcm_proc_read)
+ remove_proc_entry(bo->procname, net->can.bcmproc_dir);
+#endif /* CONFIG_PROC_FS */
+
+ list_for_each_entry_safe(op, next, &bo->tx_ops, list)
+ bcm_remove_op(op);
+
+ list_for_each_entry_safe(op, next, &bo->rx_ops, list) {
+ /*
+ * Don't care if we're bound or not (due to netdev problems)
+ * can_rx_unregister() is always a save thing to do here.
+ */
+ if (op->ifindex) {
+ /*
+ * Only remove subscriptions that had not
+ * been removed due to NETDEV_UNREGISTER
+ * in bcm_notifier()
+ */
+ if (op->rx_reg_dev) {
+ struct net_device *dev;
+
+ dev = dev_get_by_index(net, op->ifindex);
+ if (dev) {
+ bcm_rx_unreg(dev, op);
+ dev_put(dev);
+ }
+ }
+ } else
+ can_rx_unregister(net, NULL, op->can_id,
+ REGMASK(op->can_id),
+ bcm_rx_handler, op);
+
+ }
+
+ synchronize_rcu();
+
+ list_for_each_entry_safe(op, next, &bo->rx_ops, list)
+ bcm_remove_op(op);
+
+ /* remove device reference */
+ if (bo->bound) {
+ bo->bound = 0;
+ bo->ifindex = 0;
+ }
+
+ sock_orphan(sk);
+ sock->sk = NULL;
+
+ release_sock(sk);
+ sock_prot_inuse_add(net, sk->sk_prot, -1);
+ sock_put(sk);
+
+ return 0;
+}
+
+static int bcm_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int len,
+ int flags)
+{
+ struct sockaddr_can *addr = (struct sockaddr_can *)uaddr;
+ struct sock *sk = sock->sk;
+ struct bcm_sock *bo = bcm_sk(sk);
+ struct net *net = sock_net(sk);
+ int ret = 0;
+
+ if (len < BCM_MIN_NAMELEN)
+ return -EINVAL;
+
+ lock_sock(sk);
+
+ if (bo->bound) {
+ ret = -EISCONN;
+ goto fail;
+ }
+
+ /* bind a device to this socket */
+ if (addr->can_ifindex) {
+ struct net_device *dev;
+
+ dev = dev_get_by_index(net, addr->can_ifindex);
+ if (!dev) {
+ ret = -ENODEV;
+ goto fail;
+ }
+ if (dev->type != ARPHRD_CAN) {
+ dev_put(dev);
+ ret = -ENODEV;
+ goto fail;
+ }
+
+ bo->ifindex = dev->ifindex;
+ dev_put(dev);
+
+ } else {
+ /* no interface reference for ifindex = 0 ('any' CAN device) */
+ bo->ifindex = 0;
+ }
+
+#if IS_ENABLED(CONFIG_PROC_FS)
+ if (net->can.bcmproc_dir) {
+ /* unique socket address as filename */
+ sprintf(bo->procname, "%lu", sock_i_ino(sk));
+ bo->bcm_proc_read = proc_create_net_single(bo->procname, 0644,
+ net->can.bcmproc_dir,
+ bcm_proc_show, sk);
+ if (!bo->bcm_proc_read) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+ }
+#endif /* CONFIG_PROC_FS */
+
+ bo->bound = 1;
+
+fail:
+ release_sock(sk);
+
+ return ret;
+}
+
+static int bcm_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
+ int flags)
+{
+ struct sock *sk = sock->sk;
+ struct sk_buff *skb;
+ int error = 0;
+ int err;
+
+ skb = skb_recv_datagram(sk, flags, &error);
+ if (!skb)
+ return error;
+
+ if (skb->len < size)
+ size = skb->len;
+
+ err = memcpy_to_msg(msg, skb->data, size);
+ if (err < 0) {
+ skb_free_datagram(sk, skb);
+ return err;
+ }
+
+ sock_recv_cmsgs(msg, sk, skb);
+
+ if (msg->msg_name) {
+ __sockaddr_check_size(BCM_MIN_NAMELEN);
+ msg->msg_namelen = BCM_MIN_NAMELEN;
+ memcpy(msg->msg_name, skb->cb, msg->msg_namelen);
+ }
+
+ /* assign the flags that have been recorded in bcm_send_to_user() */
+ msg->msg_flags |= *(bcm_flags(skb));
+
+ skb_free_datagram(sk, skb);
+
+ return size;
+}
+
+static int bcm_sock_no_ioctlcmd(struct socket *sock, unsigned int cmd,
+ unsigned long arg)
+{
+ /* no ioctls for socket layer -> hand it down to NIC layer */
+ return -ENOIOCTLCMD;
+}
+
+static const struct proto_ops bcm_ops = {
+ .family = PF_CAN,
+ .release = bcm_release,
+ .bind = sock_no_bind,
+ .connect = bcm_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .getname = sock_no_getname,
+ .poll = datagram_poll,
+ .ioctl = bcm_sock_no_ioctlcmd,
+ .gettstamp = sock_gettstamp,
+ .listen = sock_no_listen,
+ .shutdown = sock_no_shutdown,
+ .sendmsg = bcm_sendmsg,
+ .recvmsg = bcm_recvmsg,
+ .mmap = sock_no_mmap,
+};
+
+static struct proto bcm_proto __read_mostly = {
+ .name = "CAN_BCM",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof(struct bcm_sock),
+ .init = bcm_init,
+};
+
+static const struct can_proto bcm_can_proto = {
+ .type = SOCK_DGRAM,
+ .protocol = CAN_BCM,
+ .ops = &bcm_ops,
+ .prot = &bcm_proto,
+};
+
+static int canbcm_pernet_init(struct net *net)
+{
+#if IS_ENABLED(CONFIG_PROC_FS)
+ /* create /proc/net/can-bcm directory */
+ net->can.bcmproc_dir = proc_net_mkdir(net, "can-bcm", net->proc_net);
+#endif /* CONFIG_PROC_FS */
+
+ return 0;
+}
+
+static void canbcm_pernet_exit(struct net *net)
+{
+#if IS_ENABLED(CONFIG_PROC_FS)
+ /* remove /proc/net/can-bcm directory */
+ if (net->can.bcmproc_dir)
+ remove_proc_entry("can-bcm", net->proc_net);
+#endif /* CONFIG_PROC_FS */
+}
+
+static struct pernet_operations canbcm_pernet_ops __read_mostly = {
+ .init = canbcm_pernet_init,
+ .exit = canbcm_pernet_exit,
+};
+
+static struct notifier_block canbcm_notifier = {
+ .notifier_call = bcm_notifier
+};
+
+static int __init bcm_module_init(void)
+{
+ int err;
+
+ pr_info("can: broadcast manager protocol\n");
+
+ err = register_pernet_subsys(&canbcm_pernet_ops);
+ if (err)
+ return err;
+
+ err = register_netdevice_notifier(&canbcm_notifier);
+ if (err)
+ goto register_notifier_failed;
+
+ err = can_proto_register(&bcm_can_proto);
+ if (err < 0) {
+ printk(KERN_ERR "can: registration of bcm protocol failed\n");
+ goto register_proto_failed;
+ }
+
+ return 0;
+
+register_proto_failed:
+ unregister_netdevice_notifier(&canbcm_notifier);
+register_notifier_failed:
+ unregister_pernet_subsys(&canbcm_pernet_ops);
+ return err;
+}
+
+static void __exit bcm_module_exit(void)
+{
+ can_proto_unregister(&bcm_can_proto);
+ unregister_netdevice_notifier(&canbcm_notifier);
+ unregister_pernet_subsys(&canbcm_pernet_ops);
+}
+
+module_init(bcm_module_init);
+module_exit(bcm_module_exit);
diff --git a/net/can/gw.c b/net/can/gw.c
new file mode 100644
index 000000000000..55eccb1c7620
--- /dev/null
+++ b/net/can/gw.c
@@ -0,0 +1,1362 @@
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
+/* gw.c - CAN frame Gateway/Router/Bridge with netlink interface
+ *
+ * Copyright (c) 2019 Volkswagen Group Electronic Research
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of Volkswagen nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * Alternatively, provided that this notice is retained in full, this
+ * software may be distributed under the terms of the GNU General
+ * Public License ("GPL") version 2, in which case the provisions of the
+ * GPL apply INSTEAD OF those given above.
+ *
+ * The provided data structures and external interfaces from this code
+ * are not restricted to be used by modules with a GPL compatible license.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/rcupdate.h>
+#include <linux/rculist.h>
+#include <linux/net.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/skbuff.h>
+#include <linux/can.h>
+#include <linux/can/core.h>
+#include <linux/can/skb.h>
+#include <linux/can/gw.h>
+#include <net/rtnetlink.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+
+#define CAN_GW_NAME "can-gw"
+
+MODULE_DESCRIPTION("PF_CAN netlink gateway");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_AUTHOR("Oliver Hartkopp <oliver.hartkopp@volkswagen.de>");
+MODULE_ALIAS(CAN_GW_NAME);
+
+#define CGW_MIN_HOPS 1
+#define CGW_MAX_HOPS 6
+#define CGW_DEFAULT_HOPS 1
+
+static unsigned int max_hops __read_mostly = CGW_DEFAULT_HOPS;
+module_param(max_hops, uint, 0444);
+MODULE_PARM_DESC(max_hops,
+ "maximum " CAN_GW_NAME " routing hops for CAN frames "
+ "(valid values: " __stringify(CGW_MIN_HOPS) "-"
+ __stringify(CGW_MAX_HOPS) " hops, "
+ "default: " __stringify(CGW_DEFAULT_HOPS) ")");
+
+static struct notifier_block notifier;
+static struct kmem_cache *cgw_cache __read_mostly;
+
+/* structure that contains the (on-the-fly) CAN frame modifications */
+struct cf_mod {
+ struct {
+ struct canfd_frame and;
+ struct canfd_frame or;
+ struct canfd_frame xor;
+ struct canfd_frame set;
+ } modframe;
+ struct {
+ u8 and;
+ u8 or;
+ u8 xor;
+ u8 set;
+ } modtype;
+ void (*modfunc[MAX_MODFUNCTIONS])(struct canfd_frame *cf,
+ struct cf_mod *mod);
+
+ /* CAN frame checksum calculation after CAN frame modifications */
+ struct {
+ struct cgw_csum_xor xor;
+ struct cgw_csum_crc8 crc8;
+ } csum;
+ struct {
+ void (*xor)(struct canfd_frame *cf,
+ struct cgw_csum_xor *xor);
+ void (*crc8)(struct canfd_frame *cf,
+ struct cgw_csum_crc8 *crc8);
+ } csumfunc;
+ u32 uid;
+};
+
+/* So far we just support CAN -> CAN routing and frame modifications.
+ *
+ * The internal can_can_gw structure contains data and attributes for
+ * a CAN -> CAN gateway job.
+ */
+struct can_can_gw {
+ struct can_filter filter;
+ int src_idx;
+ int dst_idx;
+};
+
+/* list entry for CAN gateways jobs */
+struct cgw_job {
+ struct hlist_node list;
+ struct rcu_head rcu;
+ u32 handled_frames;
+ u32 dropped_frames;
+ u32 deleted_frames;
+ struct cf_mod __rcu *cf_mod;
+ union {
+ /* CAN frame data source */
+ struct net_device *dev;
+ } src;
+ union {
+ /* CAN frame data destination */
+ struct net_device *dev;
+ } dst;
+ union {
+ struct can_can_gw ccgw;
+ /* tbc */
+ };
+ u8 gwtype;
+ u8 limit_hops;
+ u16 flags;
+};
+
+/* modification functions that are invoked in the hot path in can_can_gw_rcv */
+
+#define MODFUNC(func, op) static void func(struct canfd_frame *cf, \
+ struct cf_mod *mod) { op ; }
+
+MODFUNC(mod_and_id, cf->can_id &= mod->modframe.and.can_id)
+MODFUNC(mod_and_len, cf->len &= mod->modframe.and.len)
+MODFUNC(mod_and_flags, cf->flags &= mod->modframe.and.flags)
+MODFUNC(mod_and_data, *(u64 *)cf->data &= *(u64 *)mod->modframe.and.data)
+MODFUNC(mod_or_id, cf->can_id |= mod->modframe.or.can_id)
+MODFUNC(mod_or_len, cf->len |= mod->modframe.or.len)
+MODFUNC(mod_or_flags, cf->flags |= mod->modframe.or.flags)
+MODFUNC(mod_or_data, *(u64 *)cf->data |= *(u64 *)mod->modframe.or.data)
+MODFUNC(mod_xor_id, cf->can_id ^= mod->modframe.xor.can_id)
+MODFUNC(mod_xor_len, cf->len ^= mod->modframe.xor.len)
+MODFUNC(mod_xor_flags, cf->flags ^= mod->modframe.xor.flags)
+MODFUNC(mod_xor_data, *(u64 *)cf->data ^= *(u64 *)mod->modframe.xor.data)
+MODFUNC(mod_set_id, cf->can_id = mod->modframe.set.can_id)
+MODFUNC(mod_set_len, cf->len = mod->modframe.set.len)
+MODFUNC(mod_set_flags, cf->flags = mod->modframe.set.flags)
+MODFUNC(mod_set_data, *(u64 *)cf->data = *(u64 *)mod->modframe.set.data)
+
+static void mod_and_fddata(struct canfd_frame *cf, struct cf_mod *mod)
+{
+ int i;
+
+ for (i = 0; i < CANFD_MAX_DLEN; i += 8)
+ *(u64 *)(cf->data + i) &= *(u64 *)(mod->modframe.and.data + i);
+}
+
+static void mod_or_fddata(struct canfd_frame *cf, struct cf_mod *mod)
+{
+ int i;
+
+ for (i = 0; i < CANFD_MAX_DLEN; i += 8)
+ *(u64 *)(cf->data + i) |= *(u64 *)(mod->modframe.or.data + i);
+}
+
+static void mod_xor_fddata(struct canfd_frame *cf, struct cf_mod *mod)
+{
+ int i;
+
+ for (i = 0; i < CANFD_MAX_DLEN; i += 8)
+ *(u64 *)(cf->data + i) ^= *(u64 *)(mod->modframe.xor.data + i);
+}
+
+static void mod_set_fddata(struct canfd_frame *cf, struct cf_mod *mod)
+{
+ memcpy(cf->data, mod->modframe.set.data, CANFD_MAX_DLEN);
+}
+
+/* retrieve valid CC DLC value and store it into 'len' */
+static void mod_retrieve_ccdlc(struct canfd_frame *cf)
+{
+ struct can_frame *ccf = (struct can_frame *)cf;
+
+ /* len8_dlc is only valid if len == CAN_MAX_DLEN */
+ if (ccf->len != CAN_MAX_DLEN)
+ return;
+
+ /* do we have a valid len8_dlc value from 9 .. 15 ? */
+ if (ccf->len8_dlc > CAN_MAX_DLEN && ccf->len8_dlc <= CAN_MAX_RAW_DLC)
+ ccf->len = ccf->len8_dlc;
+}
+
+/* convert valid CC DLC value in 'len' into struct can_frame elements */
+static void mod_store_ccdlc(struct canfd_frame *cf)
+{
+ struct can_frame *ccf = (struct can_frame *)cf;
+
+ /* clear potential leftovers */
+ ccf->len8_dlc = 0;
+
+ /* plain data length 0 .. 8 - that was easy */
+ if (ccf->len <= CAN_MAX_DLEN)
+ return;
+
+ /* potentially broken values are caught in can_can_gw_rcv() */
+ if (ccf->len > CAN_MAX_RAW_DLC)
+ return;
+
+ /* we have a valid dlc value from 9 .. 15 in ccf->len */
+ ccf->len8_dlc = ccf->len;
+ ccf->len = CAN_MAX_DLEN;
+}
+
+static void mod_and_ccdlc(struct canfd_frame *cf, struct cf_mod *mod)
+{
+ mod_retrieve_ccdlc(cf);
+ mod_and_len(cf, mod);
+ mod_store_ccdlc(cf);
+}
+
+static void mod_or_ccdlc(struct canfd_frame *cf, struct cf_mod *mod)
+{
+ mod_retrieve_ccdlc(cf);
+ mod_or_len(cf, mod);
+ mod_store_ccdlc(cf);
+}
+
+static void mod_xor_ccdlc(struct canfd_frame *cf, struct cf_mod *mod)
+{
+ mod_retrieve_ccdlc(cf);
+ mod_xor_len(cf, mod);
+ mod_store_ccdlc(cf);
+}
+
+static void mod_set_ccdlc(struct canfd_frame *cf, struct cf_mod *mod)
+{
+ mod_set_len(cf, mod);
+ mod_store_ccdlc(cf);
+}
+
+static void canframecpy(struct canfd_frame *dst, struct can_frame *src)
+{
+ /* Copy the struct members separately to ensure that no uninitialized
+ * data are copied in the 3 bytes hole of the struct. This is needed
+ * to make easy compares of the data in the struct cf_mod.
+ */
+
+ dst->can_id = src->can_id;
+ dst->len = src->len;
+ *(u64 *)dst->data = *(u64 *)src->data;
+}
+
+static void canfdframecpy(struct canfd_frame *dst, struct canfd_frame *src)
+{
+ /* Copy the struct members separately to ensure that no uninitialized
+ * data are copied in the 2 bytes hole of the struct. This is needed
+ * to make easy compares of the data in the struct cf_mod.
+ */
+
+ dst->can_id = src->can_id;
+ dst->flags = src->flags;
+ dst->len = src->len;
+ memcpy(dst->data, src->data, CANFD_MAX_DLEN);
+}
+
+static int cgw_chk_csum_parms(s8 fr, s8 to, s8 re, struct rtcanmsg *r)
+{
+ s8 dlen = CAN_MAX_DLEN;
+
+ if (r->flags & CGW_FLAGS_CAN_FD)
+ dlen = CANFD_MAX_DLEN;
+
+ /* absolute dlc values 0 .. 7 => 0 .. 7, e.g. data [0]
+ * relative to received dlc -1 .. -8 :
+ * e.g. for received dlc = 8
+ * -1 => index = 7 (data[7])
+ * -3 => index = 5 (data[5])
+ * -8 => index = 0 (data[0])
+ */
+
+ if (fr >= -dlen && fr < dlen &&
+ to >= -dlen && to < dlen &&
+ re >= -dlen && re < dlen)
+ return 0;
+ else
+ return -EINVAL;
+}
+
+static inline int calc_idx(int idx, int rx_len)
+{
+ if (idx < 0)
+ return rx_len + idx;
+ else
+ return idx;
+}
+
+static void cgw_csum_xor_rel(struct canfd_frame *cf, struct cgw_csum_xor *xor)
+{
+ int from = calc_idx(xor->from_idx, cf->len);
+ int to = calc_idx(xor->to_idx, cf->len);
+ int res = calc_idx(xor->result_idx, cf->len);
+ u8 val = xor->init_xor_val;
+ int i;
+
+ if (from < 0 || to < 0 || res < 0)
+ return;
+
+ if (from <= to) {
+ for (i = from; i <= to; i++)
+ val ^= cf->data[i];
+ } else {
+ for (i = from; i >= to; i--)
+ val ^= cf->data[i];
+ }
+
+ cf->data[res] = val;
+}
+
+static void cgw_csum_xor_pos(struct canfd_frame *cf, struct cgw_csum_xor *xor)
+{
+ u8 val = xor->init_xor_val;
+ int i;
+
+ for (i = xor->from_idx; i <= xor->to_idx; i++)
+ val ^= cf->data[i];
+
+ cf->data[xor->result_idx] = val;
+}
+
+static void cgw_csum_xor_neg(struct canfd_frame *cf, struct cgw_csum_xor *xor)
+{
+ u8 val = xor->init_xor_val;
+ int i;
+
+ for (i = xor->from_idx; i >= xor->to_idx; i--)
+ val ^= cf->data[i];
+
+ cf->data[xor->result_idx] = val;
+}
+
+static void cgw_csum_crc8_rel(struct canfd_frame *cf,
+ struct cgw_csum_crc8 *crc8)
+{
+ int from = calc_idx(crc8->from_idx, cf->len);
+ int to = calc_idx(crc8->to_idx, cf->len);
+ int res = calc_idx(crc8->result_idx, cf->len);
+ u8 crc = crc8->init_crc_val;
+ int i;
+
+ if (from < 0 || to < 0 || res < 0)
+ return;
+
+ if (from <= to) {
+ for (i = crc8->from_idx; i <= crc8->to_idx; i++)
+ crc = crc8->crctab[crc ^ cf->data[i]];
+ } else {
+ for (i = crc8->from_idx; i >= crc8->to_idx; i--)
+ crc = crc8->crctab[crc ^ cf->data[i]];
+ }
+
+ switch (crc8->profile) {
+ case CGW_CRC8PRF_1U8:
+ crc = crc8->crctab[crc ^ crc8->profile_data[0]];
+ break;
+
+ case CGW_CRC8PRF_16U8:
+ crc = crc8->crctab[crc ^ crc8->profile_data[cf->data[1] & 0xF]];
+ break;
+
+ case CGW_CRC8PRF_SFFID_XOR:
+ crc = crc8->crctab[crc ^ (cf->can_id & 0xFF) ^
+ (cf->can_id >> 8 & 0xFF)];
+ break;
+ }
+
+ cf->data[crc8->result_idx] = crc ^ crc8->final_xor_val;
+}
+
+static void cgw_csum_crc8_pos(struct canfd_frame *cf,
+ struct cgw_csum_crc8 *crc8)
+{
+ u8 crc = crc8->init_crc_val;
+ int i;
+
+ for (i = crc8->from_idx; i <= crc8->to_idx; i++)
+ crc = crc8->crctab[crc ^ cf->data[i]];
+
+ switch (crc8->profile) {
+ case CGW_CRC8PRF_1U8:
+ crc = crc8->crctab[crc ^ crc8->profile_data[0]];
+ break;
+
+ case CGW_CRC8PRF_16U8:
+ crc = crc8->crctab[crc ^ crc8->profile_data[cf->data[1] & 0xF]];
+ break;
+
+ case CGW_CRC8PRF_SFFID_XOR:
+ crc = crc8->crctab[crc ^ (cf->can_id & 0xFF) ^
+ (cf->can_id >> 8 & 0xFF)];
+ break;
+ }
+
+ cf->data[crc8->result_idx] = crc ^ crc8->final_xor_val;
+}
+
+static void cgw_csum_crc8_neg(struct canfd_frame *cf,
+ struct cgw_csum_crc8 *crc8)
+{
+ u8 crc = crc8->init_crc_val;
+ int i;
+
+ for (i = crc8->from_idx; i >= crc8->to_idx; i--)
+ crc = crc8->crctab[crc ^ cf->data[i]];
+
+ switch (crc8->profile) {
+ case CGW_CRC8PRF_1U8:
+ crc = crc8->crctab[crc ^ crc8->profile_data[0]];
+ break;
+
+ case CGW_CRC8PRF_16U8:
+ crc = crc8->crctab[crc ^ crc8->profile_data[cf->data[1] & 0xF]];
+ break;
+
+ case CGW_CRC8PRF_SFFID_XOR:
+ crc = crc8->crctab[crc ^ (cf->can_id & 0xFF) ^
+ (cf->can_id >> 8 & 0xFF)];
+ break;
+ }
+
+ cf->data[crc8->result_idx] = crc ^ crc8->final_xor_val;
+}
+
+/* the receive & process & send function */
+static void can_can_gw_rcv(struct sk_buff *skb, void *data)
+{
+ struct cgw_job *gwj = (struct cgw_job *)data;
+ struct canfd_frame *cf;
+ struct sk_buff *nskb;
+ struct cf_mod *mod;
+ int modidx = 0;
+
+ /* process strictly Classic CAN or CAN FD frames */
+ if (gwj->flags & CGW_FLAGS_CAN_FD) {
+ if (!can_is_canfd_skb(skb))
+ return;
+ } else {
+ if (!can_is_can_skb(skb))
+ return;
+ }
+
+ /* Do not handle CAN frames routed more than 'max_hops' times.
+ * In general we should never catch this delimiter which is intended
+ * to cover a misconfiguration protection (e.g. circular CAN routes).
+ *
+ * The Controller Area Network controllers only accept CAN frames with
+ * correct CRCs - which are not visible in the controller registers.
+ * According to skbuff.h documentation the csum_start element for IP
+ * checksums is undefined/unused when ip_summed == CHECKSUM_UNNECESSARY.
+ * Only CAN skbs can be processed here which already have this property.
+ */
+
+#define cgw_hops(skb) ((skb)->csum_start)
+
+ BUG_ON(skb->ip_summed != CHECKSUM_UNNECESSARY);
+
+ if (cgw_hops(skb) >= max_hops) {
+ /* indicate deleted frames due to misconfiguration */
+ gwj->deleted_frames++;
+ return;
+ }
+
+ if (!(gwj->dst.dev->flags & IFF_UP)) {
+ gwj->dropped_frames++;
+ return;
+ }
+
+ /* is sending the skb back to the incoming interface not allowed? */
+ if (!(gwj->flags & CGW_FLAGS_CAN_IIF_TX_OK) &&
+ can_skb_prv(skb)->ifindex == gwj->dst.dev->ifindex)
+ return;
+
+ /* clone the given skb, which has not been done in can_rcv()
+ *
+ * When there is at least one modification function activated,
+ * we need to copy the skb as we want to modify skb->data.
+ */
+ mod = rcu_dereference(gwj->cf_mod);
+ if (mod->modfunc[0])
+ nskb = skb_copy(skb, GFP_ATOMIC);
+ else
+ nskb = skb_clone(skb, GFP_ATOMIC);
+
+ if (!nskb) {
+ gwj->dropped_frames++;
+ return;
+ }
+
+ /* put the incremented hop counter in the cloned skb */
+ cgw_hops(nskb) = cgw_hops(skb) + 1;
+
+ /* first processing of this CAN frame -> adjust to private hop limit */
+ if (gwj->limit_hops && cgw_hops(nskb) == 1)
+ cgw_hops(nskb) = max_hops - gwj->limit_hops + 1;
+
+ nskb->dev = gwj->dst.dev;
+
+ /* pointer to modifiable CAN frame */
+ cf = (struct canfd_frame *)nskb->data;
+
+ /* perform preprocessed modification functions if there are any */
+ while (modidx < MAX_MODFUNCTIONS && mod->modfunc[modidx])
+ (*mod->modfunc[modidx++])(cf, mod);
+
+ /* Has the CAN frame been modified? */
+ if (modidx) {
+ /* get available space for the processed CAN frame type */
+ int max_len = nskb->len - offsetof(struct canfd_frame, data);
+
+ /* dlc may have changed, make sure it fits to the CAN frame */
+ if (cf->len > max_len) {
+ /* delete frame due to misconfiguration */
+ gwj->deleted_frames++;
+ kfree_skb(nskb);
+ return;
+ }
+
+ /* check for checksum updates */
+ if (mod->csumfunc.crc8)
+ (*mod->csumfunc.crc8)(cf, &mod->csum.crc8);
+
+ if (mod->csumfunc.xor)
+ (*mod->csumfunc.xor)(cf, &mod->csum.xor);
+ }
+
+ /* clear the skb timestamp if not configured the other way */
+ if (!(gwj->flags & CGW_FLAGS_CAN_SRC_TSTAMP))
+ nskb->tstamp = 0;
+
+ /* send to netdevice */
+ if (can_send(nskb, gwj->flags & CGW_FLAGS_CAN_ECHO))
+ gwj->dropped_frames++;
+ else
+ gwj->handled_frames++;
+}
+
+static inline int cgw_register_filter(struct net *net, struct cgw_job *gwj)
+{
+ return can_rx_register(net, gwj->src.dev, gwj->ccgw.filter.can_id,
+ gwj->ccgw.filter.can_mask, can_can_gw_rcv,
+ gwj, "gw", NULL);
+}
+
+static inline void cgw_unregister_filter(struct net *net, struct cgw_job *gwj)
+{
+ can_rx_unregister(net, gwj->src.dev, gwj->ccgw.filter.can_id,
+ gwj->ccgw.filter.can_mask, can_can_gw_rcv, gwj);
+}
+
+static void cgw_job_free_rcu(struct rcu_head *rcu_head)
+{
+ struct cgw_job *gwj = container_of(rcu_head, struct cgw_job, rcu);
+
+ /* cgw_job::cf_mod is always accessed from the same cgw_job object within
+ * the same RCU read section. Once cgw_job is scheduled for removal,
+ * cf_mod can also be removed without mandating an additional grace period.
+ */
+ kfree(rcu_access_pointer(gwj->cf_mod));
+ kmem_cache_free(cgw_cache, gwj);
+}
+
+/* Return cgw_job::cf_mod with RTNL protected section */
+static struct cf_mod *cgw_job_cf_mod(struct cgw_job *gwj)
+{
+ return rcu_dereference_protected(gwj->cf_mod, rtnl_is_locked());
+}
+
+static int cgw_notifier(struct notifier_block *nb,
+ unsigned long msg, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct net *net = dev_net(dev);
+
+ if (dev->type != ARPHRD_CAN)
+ return NOTIFY_DONE;
+
+ if (msg == NETDEV_UNREGISTER) {
+ struct cgw_job *gwj = NULL;
+ struct hlist_node *nx;
+
+ ASSERT_RTNL();
+
+ hlist_for_each_entry_safe(gwj, nx, &net->can.cgw_list, list) {
+ if (gwj->src.dev == dev || gwj->dst.dev == dev) {
+ hlist_del(&gwj->list);
+ cgw_unregister_filter(net, gwj);
+ call_rcu(&gwj->rcu, cgw_job_free_rcu);
+ }
+ }
+ }
+
+ return NOTIFY_DONE;
+}
+
+static int cgw_put_job(struct sk_buff *skb, struct cgw_job *gwj, int type,
+ u32 pid, u32 seq, int flags)
+{
+ struct rtcanmsg *rtcan;
+ struct nlmsghdr *nlh;
+ struct cf_mod *mod;
+
+ nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtcan), flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ rtcan = nlmsg_data(nlh);
+ rtcan->can_family = AF_CAN;
+ rtcan->gwtype = gwj->gwtype;
+ rtcan->flags = gwj->flags;
+
+ /* add statistics if available */
+
+ if (gwj->handled_frames) {
+ if (nla_put_u32(skb, CGW_HANDLED, gwj->handled_frames) < 0)
+ goto cancel;
+ }
+
+ if (gwj->dropped_frames) {
+ if (nla_put_u32(skb, CGW_DROPPED, gwj->dropped_frames) < 0)
+ goto cancel;
+ }
+
+ if (gwj->deleted_frames) {
+ if (nla_put_u32(skb, CGW_DELETED, gwj->deleted_frames) < 0)
+ goto cancel;
+ }
+
+ /* check non default settings of attributes */
+
+ if (gwj->limit_hops) {
+ if (nla_put_u8(skb, CGW_LIM_HOPS, gwj->limit_hops) < 0)
+ goto cancel;
+ }
+
+ mod = cgw_job_cf_mod(gwj);
+ if (gwj->flags & CGW_FLAGS_CAN_FD) {
+ struct cgw_fdframe_mod mb;
+
+ if (mod->modtype.and) {
+ memcpy(&mb.cf, &mod->modframe.and, sizeof(mb.cf));
+ mb.modtype = mod->modtype.and;
+ if (nla_put(skb, CGW_FDMOD_AND, sizeof(mb), &mb) < 0)
+ goto cancel;
+ }
+
+ if (mod->modtype.or) {
+ memcpy(&mb.cf, &mod->modframe.or, sizeof(mb.cf));
+ mb.modtype = mod->modtype.or;
+ if (nla_put(skb, CGW_FDMOD_OR, sizeof(mb), &mb) < 0)
+ goto cancel;
+ }
+
+ if (mod->modtype.xor) {
+ memcpy(&mb.cf, &mod->modframe.xor, sizeof(mb.cf));
+ mb.modtype = mod->modtype.xor;
+ if (nla_put(skb, CGW_FDMOD_XOR, sizeof(mb), &mb) < 0)
+ goto cancel;
+ }
+
+ if (mod->modtype.set) {
+ memcpy(&mb.cf, &mod->modframe.set, sizeof(mb.cf));
+ mb.modtype = mod->modtype.set;
+ if (nla_put(skb, CGW_FDMOD_SET, sizeof(mb), &mb) < 0)
+ goto cancel;
+ }
+ } else {
+ struct cgw_frame_mod mb;
+
+ if (mod->modtype.and) {
+ memcpy(&mb.cf, &mod->modframe.and, sizeof(mb.cf));
+ mb.modtype = mod->modtype.and;
+ if (nla_put(skb, CGW_MOD_AND, sizeof(mb), &mb) < 0)
+ goto cancel;
+ }
+
+ if (mod->modtype.or) {
+ memcpy(&mb.cf, &mod->modframe.or, sizeof(mb.cf));
+ mb.modtype = mod->modtype.or;
+ if (nla_put(skb, CGW_MOD_OR, sizeof(mb), &mb) < 0)
+ goto cancel;
+ }
+
+ if (mod->modtype.xor) {
+ memcpy(&mb.cf, &mod->modframe.xor, sizeof(mb.cf));
+ mb.modtype = mod->modtype.xor;
+ if (nla_put(skb, CGW_MOD_XOR, sizeof(mb), &mb) < 0)
+ goto cancel;
+ }
+
+ if (mod->modtype.set) {
+ memcpy(&mb.cf, &mod->modframe.set, sizeof(mb.cf));
+ mb.modtype = mod->modtype.set;
+ if (nla_put(skb, CGW_MOD_SET, sizeof(mb), &mb) < 0)
+ goto cancel;
+ }
+ }
+
+ if (mod->uid) {
+ if (nla_put_u32(skb, CGW_MOD_UID, mod->uid) < 0)
+ goto cancel;
+ }
+
+ if (mod->csumfunc.crc8) {
+ if (nla_put(skb, CGW_CS_CRC8, CGW_CS_CRC8_LEN,
+ &mod->csum.crc8) < 0)
+ goto cancel;
+ }
+
+ if (mod->csumfunc.xor) {
+ if (nla_put(skb, CGW_CS_XOR, CGW_CS_XOR_LEN,
+ &mod->csum.xor) < 0)
+ goto cancel;
+ }
+
+ if (gwj->gwtype == CGW_TYPE_CAN_CAN) {
+ if (gwj->ccgw.filter.can_id || gwj->ccgw.filter.can_mask) {
+ if (nla_put(skb, CGW_FILTER, sizeof(struct can_filter),
+ &gwj->ccgw.filter) < 0)
+ goto cancel;
+ }
+
+ if (nla_put_u32(skb, CGW_SRC_IF, gwj->ccgw.src_idx) < 0)
+ goto cancel;
+
+ if (nla_put_u32(skb, CGW_DST_IF, gwj->ccgw.dst_idx) < 0)
+ goto cancel;
+ }
+
+ nlmsg_end(skb, nlh);
+ return 0;
+
+cancel:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+/* Dump information about all CAN gateway jobs, in response to RTM_GETROUTE */
+static int cgw_dump_jobs(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ struct cgw_job *gwj = NULL;
+ int idx = 0;
+ int s_idx = cb->args[0];
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(gwj, &net->can.cgw_list, list) {
+ if (idx < s_idx)
+ goto cont;
+
+ if (cgw_put_job(skb, gwj, RTM_NEWROUTE,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI) < 0)
+ break;
+cont:
+ idx++;
+ }
+ rcu_read_unlock();
+
+ cb->args[0] = idx;
+
+ return skb->len;
+}
+
+static const struct nla_policy cgw_policy[CGW_MAX + 1] = {
+ [CGW_MOD_AND] = { .len = sizeof(struct cgw_frame_mod) },
+ [CGW_MOD_OR] = { .len = sizeof(struct cgw_frame_mod) },
+ [CGW_MOD_XOR] = { .len = sizeof(struct cgw_frame_mod) },
+ [CGW_MOD_SET] = { .len = sizeof(struct cgw_frame_mod) },
+ [CGW_CS_XOR] = { .len = sizeof(struct cgw_csum_xor) },
+ [CGW_CS_CRC8] = { .len = sizeof(struct cgw_csum_crc8) },
+ [CGW_SRC_IF] = { .type = NLA_U32 },
+ [CGW_DST_IF] = { .type = NLA_U32 },
+ [CGW_FILTER] = { .len = sizeof(struct can_filter) },
+ [CGW_LIM_HOPS] = { .type = NLA_U8 },
+ [CGW_MOD_UID] = { .type = NLA_U32 },
+ [CGW_FDMOD_AND] = { .len = sizeof(struct cgw_fdframe_mod) },
+ [CGW_FDMOD_OR] = { .len = sizeof(struct cgw_fdframe_mod) },
+ [CGW_FDMOD_XOR] = { .len = sizeof(struct cgw_fdframe_mod) },
+ [CGW_FDMOD_SET] = { .len = sizeof(struct cgw_fdframe_mod) },
+};
+
+/* check for common and gwtype specific attributes */
+static int cgw_parse_attr(struct nlmsghdr *nlh, struct cf_mod *mod,
+ u8 gwtype, void *gwtypeattr, u8 *limhops)
+{
+ struct nlattr *tb[CGW_MAX + 1];
+ struct rtcanmsg *r = nlmsg_data(nlh);
+ int modidx = 0;
+ int err = 0;
+
+ /* initialize modification & checksum data space */
+ memset(mod, 0, sizeof(*mod));
+
+ err = nlmsg_parse_deprecated(nlh, sizeof(struct rtcanmsg), tb,
+ CGW_MAX, cgw_policy, NULL);
+ if (err < 0)
+ return err;
+
+ if (tb[CGW_LIM_HOPS]) {
+ *limhops = nla_get_u8(tb[CGW_LIM_HOPS]);
+
+ if (*limhops < 1 || *limhops > max_hops)
+ return -EINVAL;
+ }
+
+ /* check for AND/OR/XOR/SET modifications */
+ if (r->flags & CGW_FLAGS_CAN_FD) {
+ struct cgw_fdframe_mod mb;
+
+ if (tb[CGW_FDMOD_AND]) {
+ nla_memcpy(&mb, tb[CGW_FDMOD_AND], CGW_FDMODATTR_LEN);
+
+ canfdframecpy(&mod->modframe.and, &mb.cf);
+ mod->modtype.and = mb.modtype;
+
+ if (mb.modtype & CGW_MOD_ID)
+ mod->modfunc[modidx++] = mod_and_id;
+
+ if (mb.modtype & CGW_MOD_LEN)
+ mod->modfunc[modidx++] = mod_and_len;
+
+ if (mb.modtype & CGW_MOD_FLAGS)
+ mod->modfunc[modidx++] = mod_and_flags;
+
+ if (mb.modtype & CGW_MOD_DATA)
+ mod->modfunc[modidx++] = mod_and_fddata;
+ }
+
+ if (tb[CGW_FDMOD_OR]) {
+ nla_memcpy(&mb, tb[CGW_FDMOD_OR], CGW_FDMODATTR_LEN);
+
+ canfdframecpy(&mod->modframe.or, &mb.cf);
+ mod->modtype.or = mb.modtype;
+
+ if (mb.modtype & CGW_MOD_ID)
+ mod->modfunc[modidx++] = mod_or_id;
+
+ if (mb.modtype & CGW_MOD_LEN)
+ mod->modfunc[modidx++] = mod_or_len;
+
+ if (mb.modtype & CGW_MOD_FLAGS)
+ mod->modfunc[modidx++] = mod_or_flags;
+
+ if (mb.modtype & CGW_MOD_DATA)
+ mod->modfunc[modidx++] = mod_or_fddata;
+ }
+
+ if (tb[CGW_FDMOD_XOR]) {
+ nla_memcpy(&mb, tb[CGW_FDMOD_XOR], CGW_FDMODATTR_LEN);
+
+ canfdframecpy(&mod->modframe.xor, &mb.cf);
+ mod->modtype.xor = mb.modtype;
+
+ if (mb.modtype & CGW_MOD_ID)
+ mod->modfunc[modidx++] = mod_xor_id;
+
+ if (mb.modtype & CGW_MOD_LEN)
+ mod->modfunc[modidx++] = mod_xor_len;
+
+ if (mb.modtype & CGW_MOD_FLAGS)
+ mod->modfunc[modidx++] = mod_xor_flags;
+
+ if (mb.modtype & CGW_MOD_DATA)
+ mod->modfunc[modidx++] = mod_xor_fddata;
+ }
+
+ if (tb[CGW_FDMOD_SET]) {
+ nla_memcpy(&mb, tb[CGW_FDMOD_SET], CGW_FDMODATTR_LEN);
+
+ canfdframecpy(&mod->modframe.set, &mb.cf);
+ mod->modtype.set = mb.modtype;
+
+ if (mb.modtype & CGW_MOD_ID)
+ mod->modfunc[modidx++] = mod_set_id;
+
+ if (mb.modtype & CGW_MOD_LEN)
+ mod->modfunc[modidx++] = mod_set_len;
+
+ if (mb.modtype & CGW_MOD_FLAGS)
+ mod->modfunc[modidx++] = mod_set_flags;
+
+ if (mb.modtype & CGW_MOD_DATA)
+ mod->modfunc[modidx++] = mod_set_fddata;
+ }
+ } else {
+ struct cgw_frame_mod mb;
+
+ if (tb[CGW_MOD_AND]) {
+ nla_memcpy(&mb, tb[CGW_MOD_AND], CGW_MODATTR_LEN);
+
+ canframecpy(&mod->modframe.and, &mb.cf);
+ mod->modtype.and = mb.modtype;
+
+ if (mb.modtype & CGW_MOD_ID)
+ mod->modfunc[modidx++] = mod_and_id;
+
+ if (mb.modtype & CGW_MOD_DLC)
+ mod->modfunc[modidx++] = mod_and_ccdlc;
+
+ if (mb.modtype & CGW_MOD_DATA)
+ mod->modfunc[modidx++] = mod_and_data;
+ }
+
+ if (tb[CGW_MOD_OR]) {
+ nla_memcpy(&mb, tb[CGW_MOD_OR], CGW_MODATTR_LEN);
+
+ canframecpy(&mod->modframe.or, &mb.cf);
+ mod->modtype.or = mb.modtype;
+
+ if (mb.modtype & CGW_MOD_ID)
+ mod->modfunc[modidx++] = mod_or_id;
+
+ if (mb.modtype & CGW_MOD_DLC)
+ mod->modfunc[modidx++] = mod_or_ccdlc;
+
+ if (mb.modtype & CGW_MOD_DATA)
+ mod->modfunc[modidx++] = mod_or_data;
+ }
+
+ if (tb[CGW_MOD_XOR]) {
+ nla_memcpy(&mb, tb[CGW_MOD_XOR], CGW_MODATTR_LEN);
+
+ canframecpy(&mod->modframe.xor, &mb.cf);
+ mod->modtype.xor = mb.modtype;
+
+ if (mb.modtype & CGW_MOD_ID)
+ mod->modfunc[modidx++] = mod_xor_id;
+
+ if (mb.modtype & CGW_MOD_DLC)
+ mod->modfunc[modidx++] = mod_xor_ccdlc;
+
+ if (mb.modtype & CGW_MOD_DATA)
+ mod->modfunc[modidx++] = mod_xor_data;
+ }
+
+ if (tb[CGW_MOD_SET]) {
+ nla_memcpy(&mb, tb[CGW_MOD_SET], CGW_MODATTR_LEN);
+
+ canframecpy(&mod->modframe.set, &mb.cf);
+ mod->modtype.set = mb.modtype;
+
+ if (mb.modtype & CGW_MOD_ID)
+ mod->modfunc[modidx++] = mod_set_id;
+
+ if (mb.modtype & CGW_MOD_DLC)
+ mod->modfunc[modidx++] = mod_set_ccdlc;
+
+ if (mb.modtype & CGW_MOD_DATA)
+ mod->modfunc[modidx++] = mod_set_data;
+ }
+ }
+
+ /* check for checksum operations after CAN frame modifications */
+ if (modidx) {
+ if (tb[CGW_CS_CRC8]) {
+ struct cgw_csum_crc8 *c = nla_data(tb[CGW_CS_CRC8]);
+
+ err = cgw_chk_csum_parms(c->from_idx, c->to_idx,
+ c->result_idx, r);
+ if (err)
+ return err;
+
+ nla_memcpy(&mod->csum.crc8, tb[CGW_CS_CRC8],
+ CGW_CS_CRC8_LEN);
+
+ /* select dedicated processing function to reduce
+ * runtime operations in receive hot path.
+ */
+ if (c->from_idx < 0 || c->to_idx < 0 ||
+ c->result_idx < 0)
+ mod->csumfunc.crc8 = cgw_csum_crc8_rel;
+ else if (c->from_idx <= c->to_idx)
+ mod->csumfunc.crc8 = cgw_csum_crc8_pos;
+ else
+ mod->csumfunc.crc8 = cgw_csum_crc8_neg;
+ }
+
+ if (tb[CGW_CS_XOR]) {
+ struct cgw_csum_xor *c = nla_data(tb[CGW_CS_XOR]);
+
+ err = cgw_chk_csum_parms(c->from_idx, c->to_idx,
+ c->result_idx, r);
+ if (err)
+ return err;
+
+ nla_memcpy(&mod->csum.xor, tb[CGW_CS_XOR],
+ CGW_CS_XOR_LEN);
+
+ /* select dedicated processing function to reduce
+ * runtime operations in receive hot path.
+ */
+ if (c->from_idx < 0 || c->to_idx < 0 ||
+ c->result_idx < 0)
+ mod->csumfunc.xor = cgw_csum_xor_rel;
+ else if (c->from_idx <= c->to_idx)
+ mod->csumfunc.xor = cgw_csum_xor_pos;
+ else
+ mod->csumfunc.xor = cgw_csum_xor_neg;
+ }
+
+ if (tb[CGW_MOD_UID])
+ nla_memcpy(&mod->uid, tb[CGW_MOD_UID], sizeof(u32));
+ }
+
+ if (gwtype == CGW_TYPE_CAN_CAN) {
+ /* check CGW_TYPE_CAN_CAN specific attributes */
+ struct can_can_gw *ccgw = (struct can_can_gw *)gwtypeattr;
+
+ memset(ccgw, 0, sizeof(*ccgw));
+
+ /* check for can_filter in attributes */
+ if (tb[CGW_FILTER])
+ nla_memcpy(&ccgw->filter, tb[CGW_FILTER],
+ sizeof(struct can_filter));
+
+ err = -ENODEV;
+
+ /* specifying two interfaces is mandatory */
+ if (!tb[CGW_SRC_IF] || !tb[CGW_DST_IF])
+ return err;
+
+ ccgw->src_idx = nla_get_u32(tb[CGW_SRC_IF]);
+ ccgw->dst_idx = nla_get_u32(tb[CGW_DST_IF]);
+
+ /* both indices set to 0 for flushing all routing entries */
+ if (!ccgw->src_idx && !ccgw->dst_idx)
+ return 0;
+
+ /* only one index set to 0 is an error */
+ if (!ccgw->src_idx || !ccgw->dst_idx)
+ return err;
+ }
+
+ /* add the checks for other gwtypes here */
+
+ return 0;
+}
+
+static int cgw_create_job(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(skb->sk);
+ struct rtcanmsg *r;
+ struct cgw_job *gwj;
+ struct cf_mod *mod;
+ struct can_can_gw ccgw;
+ u8 limhops = 0;
+ int err = 0;
+
+ if (!netlink_capable(skb, CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (nlmsg_len(nlh) < sizeof(*r))
+ return -EINVAL;
+
+ r = nlmsg_data(nlh);
+ if (r->can_family != AF_CAN)
+ return -EPFNOSUPPORT;
+
+ /* so far we only support CAN -> CAN routings */
+ if (r->gwtype != CGW_TYPE_CAN_CAN)
+ return -EINVAL;
+
+ mod = kmalloc(sizeof(*mod), GFP_KERNEL);
+ if (!mod)
+ return -ENOMEM;
+
+ err = cgw_parse_attr(nlh, mod, CGW_TYPE_CAN_CAN, &ccgw, &limhops);
+ if (err < 0)
+ goto out_free_cf;
+
+ if (mod->uid) {
+ ASSERT_RTNL();
+
+ /* check for updating an existing job with identical uid */
+ hlist_for_each_entry(gwj, &net->can.cgw_list, list) {
+ struct cf_mod *old_cf;
+
+ old_cf = cgw_job_cf_mod(gwj);
+ if (old_cf->uid != mod->uid)
+ continue;
+
+ /* interfaces & filters must be identical */
+ if (memcmp(&gwj->ccgw, &ccgw, sizeof(ccgw))) {
+ err = -EINVAL;
+ goto out_free_cf;
+ }
+
+ rcu_assign_pointer(gwj->cf_mod, mod);
+ kfree_rcu_mightsleep(old_cf);
+ return 0;
+ }
+ }
+
+ /* ifindex == 0 is not allowed for job creation */
+ if (!ccgw.src_idx || !ccgw.dst_idx) {
+ err = -ENODEV;
+ goto out_free_cf;
+ }
+
+ gwj = kmem_cache_alloc(cgw_cache, GFP_KERNEL);
+ if (!gwj) {
+ err = -ENOMEM;
+ goto out_free_cf;
+ }
+
+ gwj->handled_frames = 0;
+ gwj->dropped_frames = 0;
+ gwj->deleted_frames = 0;
+ gwj->flags = r->flags;
+ gwj->gwtype = r->gwtype;
+ gwj->limit_hops = limhops;
+
+ /* insert already parsed information */
+ RCU_INIT_POINTER(gwj->cf_mod, mod);
+ memcpy(&gwj->ccgw, &ccgw, sizeof(ccgw));
+
+ err = -ENODEV;
+
+ gwj->src.dev = __dev_get_by_index(net, gwj->ccgw.src_idx);
+
+ if (!gwj->src.dev)
+ goto out;
+
+ if (gwj->src.dev->type != ARPHRD_CAN)
+ goto out;
+
+ gwj->dst.dev = __dev_get_by_index(net, gwj->ccgw.dst_idx);
+
+ if (!gwj->dst.dev)
+ goto out;
+
+ if (gwj->dst.dev->type != ARPHRD_CAN)
+ goto out;
+
+ /* is sending the skb back to the incoming interface intended? */
+ if (gwj->src.dev == gwj->dst.dev &&
+ !(gwj->flags & CGW_FLAGS_CAN_IIF_TX_OK)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ ASSERT_RTNL();
+
+ err = cgw_register_filter(net, gwj);
+ if (!err)
+ hlist_add_head_rcu(&gwj->list, &net->can.cgw_list);
+out:
+ if (err) {
+ kmem_cache_free(cgw_cache, gwj);
+out_free_cf:
+ kfree(mod);
+ }
+ return err;
+}
+
+static void cgw_remove_all_jobs(struct net *net)
+{
+ struct cgw_job *gwj = NULL;
+ struct hlist_node *nx;
+
+ ASSERT_RTNL();
+
+ hlist_for_each_entry_safe(gwj, nx, &net->can.cgw_list, list) {
+ hlist_del(&gwj->list);
+ cgw_unregister_filter(net, gwj);
+ call_rcu(&gwj->rcu, cgw_job_free_rcu);
+ }
+}
+
+static int cgw_remove_job(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(skb->sk);
+ struct cgw_job *gwj = NULL;
+ struct hlist_node *nx;
+ struct rtcanmsg *r;
+ struct cf_mod mod;
+ struct can_can_gw ccgw;
+ u8 limhops = 0;
+ int err = 0;
+
+ if (!netlink_capable(skb, CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (nlmsg_len(nlh) < sizeof(*r))
+ return -EINVAL;
+
+ r = nlmsg_data(nlh);
+ if (r->can_family != AF_CAN)
+ return -EPFNOSUPPORT;
+
+ /* so far we only support CAN -> CAN routings */
+ if (r->gwtype != CGW_TYPE_CAN_CAN)
+ return -EINVAL;
+
+ err = cgw_parse_attr(nlh, &mod, CGW_TYPE_CAN_CAN, &ccgw, &limhops);
+ if (err < 0)
+ return err;
+
+ /* two interface indices both set to 0 => remove all entries */
+ if (!ccgw.src_idx && !ccgw.dst_idx) {
+ cgw_remove_all_jobs(net);
+ return 0;
+ }
+
+ err = -EINVAL;
+
+ ASSERT_RTNL();
+
+ /* remove only the first matching entry */
+ hlist_for_each_entry_safe(gwj, nx, &net->can.cgw_list, list) {
+ struct cf_mod *cf_mod;
+
+ if (gwj->flags != r->flags)
+ continue;
+
+ if (gwj->limit_hops != limhops)
+ continue;
+
+ cf_mod = cgw_job_cf_mod(gwj);
+ /* we have a match when uid is enabled and identical */
+ if (cf_mod->uid || mod.uid) {
+ if (cf_mod->uid != mod.uid)
+ continue;
+ } else {
+ /* no uid => check for identical modifications */
+ if (memcmp(cf_mod, &mod, sizeof(mod)))
+ continue;
+ }
+
+ /* if (r->gwtype == CGW_TYPE_CAN_CAN) - is made sure here */
+ if (memcmp(&gwj->ccgw, &ccgw, sizeof(ccgw)))
+ continue;
+
+ hlist_del(&gwj->list);
+ cgw_unregister_filter(net, gwj);
+ call_rcu(&gwj->rcu, cgw_job_free_rcu);
+ err = 0;
+ break;
+ }
+
+ return err;
+}
+
+static int __net_init cangw_pernet_init(struct net *net)
+{
+ INIT_HLIST_HEAD(&net->can.cgw_list);
+ return 0;
+}
+
+static void __net_exit cangw_pernet_exit_batch(struct list_head *net_list)
+{
+ struct net *net;
+
+ rtnl_lock();
+ list_for_each_entry(net, net_list, exit_list)
+ cgw_remove_all_jobs(net);
+ rtnl_unlock();
+}
+
+static struct pernet_operations cangw_pernet_ops = {
+ .init = cangw_pernet_init,
+ .exit_batch = cangw_pernet_exit_batch,
+};
+
+static const struct rtnl_msg_handler cgw_rtnl_msg_handlers[] __initconst_or_module = {
+ {.owner = THIS_MODULE, .protocol = PF_CAN, .msgtype = RTM_NEWROUTE,
+ .doit = cgw_create_job},
+ {.owner = THIS_MODULE, .protocol = PF_CAN, .msgtype = RTM_DELROUTE,
+ .doit = cgw_remove_job},
+ {.owner = THIS_MODULE, .protocol = PF_CAN, .msgtype = RTM_GETROUTE,
+ .dumpit = cgw_dump_jobs},
+};
+
+static __init int cgw_module_init(void)
+{
+ int ret;
+
+ /* sanitize given module parameter */
+ max_hops = clamp_t(unsigned int, max_hops, CGW_MIN_HOPS, CGW_MAX_HOPS);
+
+ pr_info("can: netlink gateway - max_hops=%d\n", max_hops);
+
+ ret = register_pernet_subsys(&cangw_pernet_ops);
+ if (ret)
+ return ret;
+
+ ret = -ENOMEM;
+ cgw_cache = kmem_cache_create("can_gw", sizeof(struct cgw_job),
+ 0, 0, NULL);
+ if (!cgw_cache)
+ goto out_cache_create;
+
+ /* set notifier */
+ notifier.notifier_call = cgw_notifier;
+ ret = register_netdevice_notifier(&notifier);
+ if (ret)
+ goto out_register_notifier;
+
+ ret = rtnl_register_many(cgw_rtnl_msg_handlers);
+ if (ret)
+ goto out_rtnl_register;
+
+ return 0;
+
+out_rtnl_register:
+ unregister_netdevice_notifier(&notifier);
+out_register_notifier:
+ kmem_cache_destroy(cgw_cache);
+out_cache_create:
+ unregister_pernet_subsys(&cangw_pernet_ops);
+
+ return ret;
+}
+
+static __exit void cgw_module_exit(void)
+{
+ rtnl_unregister_all(PF_CAN);
+
+ unregister_netdevice_notifier(&notifier);
+
+ unregister_pernet_subsys(&cangw_pernet_ops);
+ rcu_barrier(); /* Wait for completion of call_rcu()'s */
+
+ kmem_cache_destroy(cgw_cache);
+}
+
+module_init(cgw_module_init);
+module_exit(cgw_module_exit);
diff --git a/net/can/isotp.c b/net/can/isotp.c
new file mode 100644
index 000000000000..ce588b85665a
--- /dev/null
+++ b/net/can/isotp.c
@@ -0,0 +1,1739 @@
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
+/* isotp.c - ISO 15765-2 CAN transport protocol for protocol family CAN
+ *
+ * This implementation does not provide ISO-TP specific return values to the
+ * userspace.
+ *
+ * - RX path timeout of data reception leads to -ETIMEDOUT
+ * - RX path SN mismatch leads to -EILSEQ
+ * - RX path data reception with wrong padding leads to -EBADMSG
+ * - TX path flowcontrol reception timeout leads to -ECOMM
+ * - TX path flowcontrol reception overflow leads to -EMSGSIZE
+ * - TX path flowcontrol reception with wrong layout/padding leads to -EBADMSG
+ * - when a transfer (tx) is on the run the next write() blocks until it's done
+ * - use CAN_ISOTP_WAIT_TX_DONE flag to block the caller until the PDU is sent
+ * - as we have static buffers the check whether the PDU fits into the buffer
+ * is done at FF reception time (no support for sending 'wait frames')
+ *
+ * Copyright (c) 2020 Volkswagen Group Electronic Research
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of Volkswagen nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * Alternatively, provided that this notice is retained in full, this
+ * software may be distributed under the terms of the GNU General
+ * Public License ("GPL") version 2, in which case the provisions of the
+ * GPL apply INSTEAD OF those given above.
+ *
+ * The provided data structures and external interfaces from this code
+ * are not restricted to be used by modules with a GPL compatible license.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/spinlock.h>
+#include <linux/hrtimer.h>
+#include <linux/wait.h>
+#include <linux/uio.h>
+#include <linux/net.h>
+#include <linux/netdevice.h>
+#include <linux/socket.h>
+#include <linux/if_arp.h>
+#include <linux/skbuff.h>
+#include <linux/can.h>
+#include <linux/can/core.h>
+#include <linux/can/skb.h>
+#include <linux/can/isotp.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <net/net_namespace.h>
+
+MODULE_DESCRIPTION("PF_CAN ISO 15765-2 transport protocol");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_AUTHOR("Oliver Hartkopp <socketcan@hartkopp.net>");
+MODULE_ALIAS("can-proto-6");
+
+#define ISOTP_MIN_NAMELEN CAN_REQUIRED_SIZE(struct sockaddr_can, can_addr.tp)
+
+#define SINGLE_MASK(id) (((id) & CAN_EFF_FLAG) ? \
+ (CAN_EFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG) : \
+ (CAN_SFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG))
+
+/* Since ISO 15765-2:2016 the CAN isotp protocol supports more than 4095
+ * byte per ISO PDU as the FF_DL can take full 32 bit values (4 Gbyte).
+ * We would need some good concept to handle this between user space and
+ * kernel space. For now set the static buffer to something about 8 kbyte
+ * to be able to test this new functionality.
+ */
+#define DEFAULT_MAX_PDU_SIZE 8300
+
+/* maximum PDU size before ISO 15765-2:2016 extension was 4095 */
+#define MAX_12BIT_PDU_SIZE 4095
+
+/* limit the isotp pdu size from the optional module parameter to 1MByte */
+#define MAX_PDU_SIZE (1025 * 1024U)
+
+static unsigned int max_pdu_size __read_mostly = DEFAULT_MAX_PDU_SIZE;
+module_param(max_pdu_size, uint, 0444);
+MODULE_PARM_DESC(max_pdu_size, "maximum isotp pdu size (default "
+ __stringify(DEFAULT_MAX_PDU_SIZE) ")");
+
+/* N_PCI type values in bits 7-4 of N_PCI bytes */
+#define N_PCI_SF 0x00 /* single frame */
+#define N_PCI_FF 0x10 /* first frame */
+#define N_PCI_CF 0x20 /* consecutive frame */
+#define N_PCI_FC 0x30 /* flow control */
+
+#define N_PCI_SZ 1 /* size of the PCI byte #1 */
+#define SF_PCI_SZ4 1 /* size of SingleFrame PCI including 4 bit SF_DL */
+#define SF_PCI_SZ8 2 /* size of SingleFrame PCI including 8 bit SF_DL */
+#define FF_PCI_SZ12 2 /* size of FirstFrame PCI including 12 bit FF_DL */
+#define FF_PCI_SZ32 6 /* size of FirstFrame PCI including 32 bit FF_DL */
+#define FC_CONTENT_SZ 3 /* flow control content size in byte (FS/BS/STmin) */
+
+#define ISOTP_CHECK_PADDING (CAN_ISOTP_CHK_PAD_LEN | CAN_ISOTP_CHK_PAD_DATA)
+#define ISOTP_ALL_BC_FLAGS (CAN_ISOTP_SF_BROADCAST | CAN_ISOTP_CF_BROADCAST)
+
+/* Flow Status given in FC frame */
+#define ISOTP_FC_CTS 0 /* clear to send */
+#define ISOTP_FC_WT 1 /* wait */
+#define ISOTP_FC_OVFLW 2 /* overflow */
+
+#define ISOTP_FC_TIMEOUT 1 /* 1 sec */
+#define ISOTP_ECHO_TIMEOUT 2 /* 2 secs */
+
+enum {
+ ISOTP_IDLE = 0,
+ ISOTP_WAIT_FIRST_FC,
+ ISOTP_WAIT_FC,
+ ISOTP_WAIT_DATA,
+ ISOTP_SENDING,
+ ISOTP_SHUTDOWN,
+};
+
+struct tpcon {
+ u8 *buf;
+ unsigned int buflen;
+ unsigned int len;
+ unsigned int idx;
+ u32 state;
+ u8 bs;
+ u8 sn;
+ u8 ll_dl;
+ u8 sbuf[DEFAULT_MAX_PDU_SIZE];
+};
+
+struct isotp_sock {
+ struct sock sk;
+ int bound;
+ int ifindex;
+ canid_t txid;
+ canid_t rxid;
+ ktime_t tx_gap;
+ ktime_t lastrxcf_tstamp;
+ struct hrtimer rxtimer, txtimer, txfrtimer;
+ struct can_isotp_options opt;
+ struct can_isotp_fc_options rxfc, txfc;
+ struct can_isotp_ll_options ll;
+ u32 frame_txtime;
+ u32 force_tx_stmin;
+ u32 force_rx_stmin;
+ u32 cfecho; /* consecutive frame echo tag */
+ struct tpcon rx, tx;
+ struct list_head notifier;
+ wait_queue_head_t wait;
+ spinlock_t rx_lock; /* protect single thread state machine */
+};
+
+static LIST_HEAD(isotp_notifier_list);
+static DEFINE_SPINLOCK(isotp_notifier_lock);
+static struct isotp_sock *isotp_busy_notifier;
+
+static inline struct isotp_sock *isotp_sk(const struct sock *sk)
+{
+ return (struct isotp_sock *)sk;
+}
+
+static u32 isotp_bc_flags(struct isotp_sock *so)
+{
+ return so->opt.flags & ISOTP_ALL_BC_FLAGS;
+}
+
+static bool isotp_register_rxid(struct isotp_sock *so)
+{
+ /* no broadcast modes => register rx_id for FC frame reception */
+ return (isotp_bc_flags(so) == 0);
+}
+
+static enum hrtimer_restart isotp_rx_timer_handler(struct hrtimer *hrtimer)
+{
+ struct isotp_sock *so = container_of(hrtimer, struct isotp_sock,
+ rxtimer);
+ struct sock *sk = &so->sk;
+
+ if (so->rx.state == ISOTP_WAIT_DATA) {
+ /* we did not get new data frames in time */
+
+ /* report 'connection timed out' */
+ sk->sk_err = ETIMEDOUT;
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk_error_report(sk);
+
+ /* reset rx state */
+ so->rx.state = ISOTP_IDLE;
+ }
+
+ return HRTIMER_NORESTART;
+}
+
+static int isotp_send_fc(struct sock *sk, int ae, u8 flowstatus)
+{
+ struct net_device *dev;
+ struct sk_buff *nskb;
+ struct canfd_frame *ncf;
+ struct isotp_sock *so = isotp_sk(sk);
+ int can_send_ret;
+
+ nskb = alloc_skb(so->ll.mtu + sizeof(struct can_skb_priv), gfp_any());
+ if (!nskb)
+ return 1;
+
+ dev = dev_get_by_index(sock_net(sk), so->ifindex);
+ if (!dev) {
+ kfree_skb(nskb);
+ return 1;
+ }
+
+ can_skb_reserve(nskb);
+ can_skb_prv(nskb)->ifindex = dev->ifindex;
+ can_skb_prv(nskb)->skbcnt = 0;
+
+ nskb->dev = dev;
+ can_skb_set_owner(nskb, sk);
+ ncf = (struct canfd_frame *)nskb->data;
+ skb_put_zero(nskb, so->ll.mtu);
+
+ /* create & send flow control reply */
+ ncf->can_id = so->txid;
+
+ if (so->opt.flags & CAN_ISOTP_TX_PADDING) {
+ memset(ncf->data, so->opt.txpad_content, CAN_MAX_DLEN);
+ ncf->len = CAN_MAX_DLEN;
+ } else {
+ ncf->len = ae + FC_CONTENT_SZ;
+ }
+
+ ncf->data[ae] = N_PCI_FC | flowstatus;
+ ncf->data[ae + 1] = so->rxfc.bs;
+ ncf->data[ae + 2] = so->rxfc.stmin;
+
+ if (ae)
+ ncf->data[0] = so->opt.ext_address;
+
+ ncf->flags = so->ll.tx_flags;
+
+ can_send_ret = can_send(nskb, 1);
+ if (can_send_ret)
+ pr_notice_once("can-isotp: %s: can_send_ret %pe\n",
+ __func__, ERR_PTR(can_send_ret));
+
+ dev_put(dev);
+
+ /* reset blocksize counter */
+ so->rx.bs = 0;
+
+ /* reset last CF frame rx timestamp for rx stmin enforcement */
+ so->lastrxcf_tstamp = ktime_set(0, 0);
+
+ /* start rx timeout watchdog */
+ hrtimer_start(&so->rxtimer, ktime_set(ISOTP_FC_TIMEOUT, 0),
+ HRTIMER_MODE_REL_SOFT);
+ return 0;
+}
+
+static void isotp_rcv_skb(struct sk_buff *skb, struct sock *sk)
+{
+ struct sockaddr_can *addr = (struct sockaddr_can *)skb->cb;
+ enum skb_drop_reason reason;
+
+ BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct sockaddr_can));
+
+ memset(addr, 0, sizeof(*addr));
+ addr->can_family = AF_CAN;
+ addr->can_ifindex = skb->dev->ifindex;
+
+ if (sock_queue_rcv_skb_reason(sk, skb, &reason) < 0)
+ sk_skb_reason_drop(sk, skb, reason);
+}
+
+static u8 padlen(u8 datalen)
+{
+ static const u8 plen[] = {
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, /* 0 - 8 */
+ 12, 12, 12, 12, /* 9 - 12 */
+ 16, 16, 16, 16, /* 13 - 16 */
+ 20, 20, 20, 20, /* 17 - 20 */
+ 24, 24, 24, 24, /* 21 - 24 */
+ 32, 32, 32, 32, 32, 32, 32, 32, /* 25 - 32 */
+ 48, 48, 48, 48, 48, 48, 48, 48, /* 33 - 40 */
+ 48, 48, 48, 48, 48, 48, 48, 48 /* 41 - 48 */
+ };
+
+ if (datalen > 48)
+ return 64;
+
+ return plen[datalen];
+}
+
+/* check for length optimization and return 1/true when the check fails */
+static int check_optimized(struct canfd_frame *cf, int start_index)
+{
+ /* for CAN_DL <= 8 the start_index is equal to the CAN_DL as the
+ * padding would start at this point. E.g. if the padding would
+ * start at cf.data[7] cf->len has to be 7 to be optimal.
+ * Note: The data[] index starts with zero.
+ */
+ if (cf->len <= CAN_MAX_DLEN)
+ return (cf->len != start_index);
+
+ /* This relation is also valid in the non-linear DLC range, where
+ * we need to take care of the minimal next possible CAN_DL.
+ * The correct check would be (padlen(cf->len) != padlen(start_index)).
+ * But as cf->len can only take discrete values from 12, .., 64 at this
+ * point the padlen(cf->len) is always equal to cf->len.
+ */
+ return (cf->len != padlen(start_index));
+}
+
+/* check padding and return 1/true when the check fails */
+static int check_pad(struct isotp_sock *so, struct canfd_frame *cf,
+ int start_index, u8 content)
+{
+ int i;
+
+ /* no RX_PADDING value => check length of optimized frame length */
+ if (!(so->opt.flags & CAN_ISOTP_RX_PADDING)) {
+ if (so->opt.flags & CAN_ISOTP_CHK_PAD_LEN)
+ return check_optimized(cf, start_index);
+
+ /* no valid test against empty value => ignore frame */
+ return 1;
+ }
+
+ /* check datalength of correctly padded CAN frame */
+ if ((so->opt.flags & CAN_ISOTP_CHK_PAD_LEN) &&
+ cf->len != padlen(cf->len))
+ return 1;
+
+ /* check padding content */
+ if (so->opt.flags & CAN_ISOTP_CHK_PAD_DATA) {
+ for (i = start_index; i < cf->len; i++)
+ if (cf->data[i] != content)
+ return 1;
+ }
+ return 0;
+}
+
+static void isotp_send_cframe(struct isotp_sock *so);
+
+static int isotp_rcv_fc(struct isotp_sock *so, struct canfd_frame *cf, int ae)
+{
+ struct sock *sk = &so->sk;
+
+ if (so->tx.state != ISOTP_WAIT_FC &&
+ so->tx.state != ISOTP_WAIT_FIRST_FC)
+ return 0;
+
+ hrtimer_cancel(&so->txtimer);
+
+ if ((cf->len < ae + FC_CONTENT_SZ) ||
+ ((so->opt.flags & ISOTP_CHECK_PADDING) &&
+ check_pad(so, cf, ae + FC_CONTENT_SZ, so->opt.rxpad_content))) {
+ /* malformed PDU - report 'not a data message' */
+ sk->sk_err = EBADMSG;
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk_error_report(sk);
+
+ so->tx.state = ISOTP_IDLE;
+ wake_up_interruptible(&so->wait);
+ return 1;
+ }
+
+ /* get static/dynamic communication params from first/every FC frame */
+ if (so->tx.state == ISOTP_WAIT_FIRST_FC ||
+ so->opt.flags & CAN_ISOTP_DYN_FC_PARMS) {
+ so->txfc.bs = cf->data[ae + 1];
+ so->txfc.stmin = cf->data[ae + 2];
+
+ /* fix wrong STmin values according spec */
+ if (so->txfc.stmin > 0x7F &&
+ (so->txfc.stmin < 0xF1 || so->txfc.stmin > 0xF9))
+ so->txfc.stmin = 0x7F;
+
+ so->tx_gap = ktime_set(0, 0);
+ /* add transmission time for CAN frame N_As */
+ so->tx_gap = ktime_add_ns(so->tx_gap, so->frame_txtime);
+ /* add waiting time for consecutive frames N_Cs */
+ if (so->opt.flags & CAN_ISOTP_FORCE_TXSTMIN)
+ so->tx_gap = ktime_add_ns(so->tx_gap,
+ so->force_tx_stmin);
+ else if (so->txfc.stmin < 0x80)
+ so->tx_gap = ktime_add_ns(so->tx_gap,
+ so->txfc.stmin * 1000000);
+ else
+ so->tx_gap = ktime_add_ns(so->tx_gap,
+ (so->txfc.stmin - 0xF0)
+ * 100000);
+ so->tx.state = ISOTP_WAIT_FC;
+ }
+
+ switch (cf->data[ae] & 0x0F) {
+ case ISOTP_FC_CTS:
+ so->tx.bs = 0;
+ so->tx.state = ISOTP_SENDING;
+ /* send CF frame and enable echo timeout handling */
+ hrtimer_start(&so->txtimer, ktime_set(ISOTP_ECHO_TIMEOUT, 0),
+ HRTIMER_MODE_REL_SOFT);
+ isotp_send_cframe(so);
+ break;
+
+ case ISOTP_FC_WT:
+ /* start timer to wait for next FC frame */
+ hrtimer_start(&so->txtimer, ktime_set(ISOTP_FC_TIMEOUT, 0),
+ HRTIMER_MODE_REL_SOFT);
+ break;
+
+ case ISOTP_FC_OVFLW:
+ /* overflow on receiver side - report 'message too long' */
+ sk->sk_err = EMSGSIZE;
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk_error_report(sk);
+ fallthrough;
+
+ default:
+ /* stop this tx job */
+ so->tx.state = ISOTP_IDLE;
+ wake_up_interruptible(&so->wait);
+ }
+ return 0;
+}
+
+static int isotp_rcv_sf(struct sock *sk, struct canfd_frame *cf, int pcilen,
+ struct sk_buff *skb, int len)
+{
+ struct isotp_sock *so = isotp_sk(sk);
+ struct sk_buff *nskb;
+
+ hrtimer_cancel(&so->rxtimer);
+ so->rx.state = ISOTP_IDLE;
+
+ if (!len || len > cf->len - pcilen)
+ return 1;
+
+ if ((so->opt.flags & ISOTP_CHECK_PADDING) &&
+ check_pad(so, cf, pcilen + len, so->opt.rxpad_content)) {
+ /* malformed PDU - report 'not a data message' */
+ sk->sk_err = EBADMSG;
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk_error_report(sk);
+ return 1;
+ }
+
+ nskb = alloc_skb(len, gfp_any());
+ if (!nskb)
+ return 1;
+
+ memcpy(skb_put(nskb, len), &cf->data[pcilen], len);
+
+ nskb->tstamp = skb->tstamp;
+ nskb->dev = skb->dev;
+ isotp_rcv_skb(nskb, sk);
+ return 0;
+}
+
+static int isotp_rcv_ff(struct sock *sk, struct canfd_frame *cf, int ae)
+{
+ struct isotp_sock *so = isotp_sk(sk);
+ int i;
+ int off;
+ int ff_pci_sz;
+
+ hrtimer_cancel(&so->rxtimer);
+ so->rx.state = ISOTP_IDLE;
+
+ /* get the used sender LL_DL from the (first) CAN frame data length */
+ so->rx.ll_dl = padlen(cf->len);
+
+ /* the first frame has to use the entire frame up to LL_DL length */
+ if (cf->len != so->rx.ll_dl)
+ return 1;
+
+ /* get the FF_DL */
+ so->rx.len = (cf->data[ae] & 0x0F) << 8;
+ so->rx.len += cf->data[ae + 1];
+
+ /* Check for FF_DL escape sequence supporting 32 bit PDU length */
+ if (so->rx.len) {
+ ff_pci_sz = FF_PCI_SZ12;
+ } else {
+ /* FF_DL = 0 => get real length from next 4 bytes */
+ so->rx.len = cf->data[ae + 2] << 24;
+ so->rx.len += cf->data[ae + 3] << 16;
+ so->rx.len += cf->data[ae + 4] << 8;
+ so->rx.len += cf->data[ae + 5];
+ ff_pci_sz = FF_PCI_SZ32;
+ }
+
+ /* take care of a potential SF_DL ESC offset for TX_DL > 8 */
+ off = (so->rx.ll_dl > CAN_MAX_DLEN) ? 1 : 0;
+
+ if (so->rx.len + ae + off + ff_pci_sz < so->rx.ll_dl)
+ return 1;
+
+ /* PDU size > default => try max_pdu_size */
+ if (so->rx.len > so->rx.buflen && so->rx.buflen < max_pdu_size) {
+ u8 *newbuf = kmalloc(max_pdu_size, GFP_ATOMIC);
+
+ if (newbuf) {
+ so->rx.buf = newbuf;
+ so->rx.buflen = max_pdu_size;
+ }
+ }
+
+ if (so->rx.len > so->rx.buflen) {
+ /* send FC frame with overflow status */
+ isotp_send_fc(sk, ae, ISOTP_FC_OVFLW);
+ return 1;
+ }
+
+ /* copy the first received data bytes */
+ so->rx.idx = 0;
+ for (i = ae + ff_pci_sz; i < so->rx.ll_dl; i++)
+ so->rx.buf[so->rx.idx++] = cf->data[i];
+
+ /* initial setup for this pdu reception */
+ so->rx.sn = 1;
+ so->rx.state = ISOTP_WAIT_DATA;
+
+ /* no creation of flow control frames */
+ if (so->opt.flags & CAN_ISOTP_LISTEN_MODE)
+ return 0;
+
+ /* send our first FC frame */
+ isotp_send_fc(sk, ae, ISOTP_FC_CTS);
+ return 0;
+}
+
+static int isotp_rcv_cf(struct sock *sk, struct canfd_frame *cf, int ae,
+ struct sk_buff *skb)
+{
+ struct isotp_sock *so = isotp_sk(sk);
+ struct sk_buff *nskb;
+ int i;
+
+ if (so->rx.state != ISOTP_WAIT_DATA)
+ return 0;
+
+ /* drop if timestamp gap is less than force_rx_stmin nano secs */
+ if (so->opt.flags & CAN_ISOTP_FORCE_RXSTMIN) {
+ if (ktime_to_ns(ktime_sub(skb->tstamp, so->lastrxcf_tstamp)) <
+ so->force_rx_stmin)
+ return 0;
+
+ so->lastrxcf_tstamp = skb->tstamp;
+ }
+
+ hrtimer_cancel(&so->rxtimer);
+
+ /* CFs are never longer than the FF */
+ if (cf->len > so->rx.ll_dl)
+ return 1;
+
+ /* CFs have usually the LL_DL length */
+ if (cf->len < so->rx.ll_dl) {
+ /* this is only allowed for the last CF */
+ if (so->rx.len - so->rx.idx > so->rx.ll_dl - ae - N_PCI_SZ)
+ return 1;
+ }
+
+ if ((cf->data[ae] & 0x0F) != so->rx.sn) {
+ /* wrong sn detected - report 'illegal byte sequence' */
+ sk->sk_err = EILSEQ;
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk_error_report(sk);
+
+ /* reset rx state */
+ so->rx.state = ISOTP_IDLE;
+ return 1;
+ }
+ so->rx.sn++;
+ so->rx.sn %= 16;
+
+ for (i = ae + N_PCI_SZ; i < cf->len; i++) {
+ so->rx.buf[so->rx.idx++] = cf->data[i];
+ if (so->rx.idx >= so->rx.len)
+ break;
+ }
+
+ if (so->rx.idx >= so->rx.len) {
+ /* we are done */
+ so->rx.state = ISOTP_IDLE;
+
+ if ((so->opt.flags & ISOTP_CHECK_PADDING) &&
+ check_pad(so, cf, i + 1, so->opt.rxpad_content)) {
+ /* malformed PDU - report 'not a data message' */
+ sk->sk_err = EBADMSG;
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk_error_report(sk);
+ return 1;
+ }
+
+ nskb = alloc_skb(so->rx.len, gfp_any());
+ if (!nskb)
+ return 1;
+
+ memcpy(skb_put(nskb, so->rx.len), so->rx.buf,
+ so->rx.len);
+
+ nskb->tstamp = skb->tstamp;
+ nskb->dev = skb->dev;
+ isotp_rcv_skb(nskb, sk);
+ return 0;
+ }
+
+ /* perform blocksize handling, if enabled */
+ if (!so->rxfc.bs || ++so->rx.bs < so->rxfc.bs) {
+ /* start rx timeout watchdog */
+ hrtimer_start(&so->rxtimer, ktime_set(ISOTP_FC_TIMEOUT, 0),
+ HRTIMER_MODE_REL_SOFT);
+ return 0;
+ }
+
+ /* no creation of flow control frames */
+ if (so->opt.flags & CAN_ISOTP_LISTEN_MODE)
+ return 0;
+
+ /* we reached the specified blocksize so->rxfc.bs */
+ isotp_send_fc(sk, ae, ISOTP_FC_CTS);
+ return 0;
+}
+
+static void isotp_rcv(struct sk_buff *skb, void *data)
+{
+ struct sock *sk = (struct sock *)data;
+ struct isotp_sock *so = isotp_sk(sk);
+ struct canfd_frame *cf;
+ int ae = (so->opt.flags & CAN_ISOTP_EXTEND_ADDR) ? 1 : 0;
+ u8 n_pci_type, sf_dl;
+
+ /* Strictly receive only frames with the configured MTU size
+ * => clear separation of CAN2.0 / CAN FD transport channels
+ */
+ if (skb->len != so->ll.mtu)
+ return;
+
+ cf = (struct canfd_frame *)skb->data;
+
+ /* if enabled: check reception of my configured extended address */
+ if (ae && cf->data[0] != so->opt.rx_ext_address)
+ return;
+
+ n_pci_type = cf->data[ae] & 0xF0;
+
+ /* Make sure the state changes and data structures stay consistent at
+ * CAN frame reception time. This locking is not needed in real world
+ * use cases but the inconsistency can be triggered with syzkaller.
+ */
+ spin_lock(&so->rx_lock);
+
+ if (so->opt.flags & CAN_ISOTP_HALF_DUPLEX) {
+ /* check rx/tx path half duplex expectations */
+ if ((so->tx.state != ISOTP_IDLE && n_pci_type != N_PCI_FC) ||
+ (so->rx.state != ISOTP_IDLE && n_pci_type == N_PCI_FC))
+ goto out_unlock;
+ }
+
+ switch (n_pci_type) {
+ case N_PCI_FC:
+ /* tx path: flow control frame containing the FC parameters */
+ isotp_rcv_fc(so, cf, ae);
+ break;
+
+ case N_PCI_SF:
+ /* rx path: single frame
+ *
+ * As we do not have a rx.ll_dl configuration, we can only test
+ * if the CAN frames payload length matches the LL_DL == 8
+ * requirements - no matter if it's CAN 2.0 or CAN FD
+ */
+
+ /* get the SF_DL from the N_PCI byte */
+ sf_dl = cf->data[ae] & 0x0F;
+
+ if (cf->len <= CAN_MAX_DLEN) {
+ isotp_rcv_sf(sk, cf, SF_PCI_SZ4 + ae, skb, sf_dl);
+ } else {
+ if (can_is_canfd_skb(skb)) {
+ /* We have a CAN FD frame and CAN_DL is greater than 8:
+ * Only frames with the SF_DL == 0 ESC value are valid.
+ *
+ * If so take care of the increased SF PCI size
+ * (SF_PCI_SZ8) to point to the message content behind
+ * the extended SF PCI info and get the real SF_DL
+ * length value from the formerly first data byte.
+ */
+ if (sf_dl == 0)
+ isotp_rcv_sf(sk, cf, SF_PCI_SZ8 + ae, skb,
+ cf->data[SF_PCI_SZ4 + ae]);
+ }
+ }
+ break;
+
+ case N_PCI_FF:
+ /* rx path: first frame */
+ isotp_rcv_ff(sk, cf, ae);
+ break;
+
+ case N_PCI_CF:
+ /* rx path: consecutive frame */
+ isotp_rcv_cf(sk, cf, ae, skb);
+ break;
+ }
+
+out_unlock:
+ spin_unlock(&so->rx_lock);
+}
+
+static void isotp_fill_dataframe(struct canfd_frame *cf, struct isotp_sock *so,
+ int ae, int off)
+{
+ int pcilen = N_PCI_SZ + ae + off;
+ int space = so->tx.ll_dl - pcilen;
+ int num = min_t(int, so->tx.len - so->tx.idx, space);
+ int i;
+
+ cf->can_id = so->txid;
+ cf->len = num + pcilen;
+
+ if (num < space) {
+ if (so->opt.flags & CAN_ISOTP_TX_PADDING) {
+ /* user requested padding */
+ cf->len = padlen(cf->len);
+ memset(cf->data, so->opt.txpad_content, cf->len);
+ } else if (cf->len > CAN_MAX_DLEN) {
+ /* mandatory padding for CAN FD frames */
+ cf->len = padlen(cf->len);
+ memset(cf->data, CAN_ISOTP_DEFAULT_PAD_CONTENT,
+ cf->len);
+ }
+ }
+
+ for (i = 0; i < num; i++)
+ cf->data[pcilen + i] = so->tx.buf[so->tx.idx++];
+
+ if (ae)
+ cf->data[0] = so->opt.ext_address;
+}
+
+static void isotp_send_cframe(struct isotp_sock *so)
+{
+ struct sock *sk = &so->sk;
+ struct sk_buff *skb;
+ struct net_device *dev;
+ struct canfd_frame *cf;
+ int can_send_ret;
+ int ae = (so->opt.flags & CAN_ISOTP_EXTEND_ADDR) ? 1 : 0;
+
+ dev = dev_get_by_index(sock_net(sk), so->ifindex);
+ if (!dev)
+ return;
+
+ skb = alloc_skb(so->ll.mtu + sizeof(struct can_skb_priv), GFP_ATOMIC);
+ if (!skb) {
+ dev_put(dev);
+ return;
+ }
+
+ can_skb_reserve(skb);
+ can_skb_prv(skb)->ifindex = dev->ifindex;
+ can_skb_prv(skb)->skbcnt = 0;
+
+ cf = (struct canfd_frame *)skb->data;
+ skb_put_zero(skb, so->ll.mtu);
+
+ /* create consecutive frame */
+ isotp_fill_dataframe(cf, so, ae, 0);
+
+ /* place consecutive frame N_PCI in appropriate index */
+ cf->data[ae] = N_PCI_CF | so->tx.sn++;
+ so->tx.sn %= 16;
+ so->tx.bs++;
+
+ cf->flags = so->ll.tx_flags;
+
+ skb->dev = dev;
+ can_skb_set_owner(skb, sk);
+
+ /* cfecho should have been zero'ed by init/isotp_rcv_echo() */
+ if (so->cfecho)
+ pr_notice_once("can-isotp: cfecho is %08X != 0\n", so->cfecho);
+
+ /* set consecutive frame echo tag */
+ so->cfecho = *(u32 *)cf->data;
+
+ /* send frame with local echo enabled */
+ can_send_ret = can_send(skb, 1);
+ if (can_send_ret) {
+ pr_notice_once("can-isotp: %s: can_send_ret %pe\n",
+ __func__, ERR_PTR(can_send_ret));
+ if (can_send_ret == -ENOBUFS)
+ pr_notice_once("can-isotp: tx queue is full\n");
+ }
+ dev_put(dev);
+}
+
+static void isotp_create_fframe(struct canfd_frame *cf, struct isotp_sock *so,
+ int ae)
+{
+ int i;
+ int ff_pci_sz;
+
+ cf->can_id = so->txid;
+ cf->len = so->tx.ll_dl;
+ if (ae)
+ cf->data[0] = so->opt.ext_address;
+
+ /* create N_PCI bytes with 12/32 bit FF_DL data length */
+ if (so->tx.len > MAX_12BIT_PDU_SIZE) {
+ /* use 32 bit FF_DL notation */
+ cf->data[ae] = N_PCI_FF;
+ cf->data[ae + 1] = 0;
+ cf->data[ae + 2] = (u8)(so->tx.len >> 24) & 0xFFU;
+ cf->data[ae + 3] = (u8)(so->tx.len >> 16) & 0xFFU;
+ cf->data[ae + 4] = (u8)(so->tx.len >> 8) & 0xFFU;
+ cf->data[ae + 5] = (u8)so->tx.len & 0xFFU;
+ ff_pci_sz = FF_PCI_SZ32;
+ } else {
+ /* use 12 bit FF_DL notation */
+ cf->data[ae] = (u8)(so->tx.len >> 8) | N_PCI_FF;
+ cf->data[ae + 1] = (u8)so->tx.len & 0xFFU;
+ ff_pci_sz = FF_PCI_SZ12;
+ }
+
+ /* add first data bytes depending on ae */
+ for (i = ae + ff_pci_sz; i < so->tx.ll_dl; i++)
+ cf->data[i] = so->tx.buf[so->tx.idx++];
+
+ so->tx.sn = 1;
+}
+
+static void isotp_rcv_echo(struct sk_buff *skb, void *data)
+{
+ struct sock *sk = (struct sock *)data;
+ struct isotp_sock *so = isotp_sk(sk);
+ struct canfd_frame *cf = (struct canfd_frame *)skb->data;
+
+ /* only handle my own local echo CF/SF skb's (no FF!) */
+ if (skb->sk != sk || so->cfecho != *(u32 *)cf->data)
+ return;
+
+ /* cancel local echo timeout */
+ hrtimer_cancel(&so->txtimer);
+
+ /* local echo skb with consecutive frame has been consumed */
+ so->cfecho = 0;
+
+ if (so->tx.idx >= so->tx.len) {
+ /* we are done */
+ so->tx.state = ISOTP_IDLE;
+ wake_up_interruptible(&so->wait);
+ return;
+ }
+
+ if (so->txfc.bs && so->tx.bs >= so->txfc.bs) {
+ /* stop and wait for FC with timeout */
+ so->tx.state = ISOTP_WAIT_FC;
+ hrtimer_start(&so->txtimer, ktime_set(ISOTP_FC_TIMEOUT, 0),
+ HRTIMER_MODE_REL_SOFT);
+ return;
+ }
+
+ /* no gap between data frames needed => use burst mode */
+ if (!so->tx_gap) {
+ /* enable echo timeout handling */
+ hrtimer_start(&so->txtimer, ktime_set(ISOTP_ECHO_TIMEOUT, 0),
+ HRTIMER_MODE_REL_SOFT);
+ isotp_send_cframe(so);
+ return;
+ }
+
+ /* start timer to send next consecutive frame with correct delay */
+ hrtimer_start(&so->txfrtimer, so->tx_gap, HRTIMER_MODE_REL_SOFT);
+}
+
+static enum hrtimer_restart isotp_tx_timer_handler(struct hrtimer *hrtimer)
+{
+ struct isotp_sock *so = container_of(hrtimer, struct isotp_sock,
+ txtimer);
+ struct sock *sk = &so->sk;
+
+ /* don't handle timeouts in IDLE or SHUTDOWN state */
+ if (so->tx.state == ISOTP_IDLE || so->tx.state == ISOTP_SHUTDOWN)
+ return HRTIMER_NORESTART;
+
+ /* we did not get any flow control or echo frame in time */
+
+ /* report 'communication error on send' */
+ sk->sk_err = ECOMM;
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk_error_report(sk);
+
+ /* reset tx state */
+ so->tx.state = ISOTP_IDLE;
+ wake_up_interruptible(&so->wait);
+
+ return HRTIMER_NORESTART;
+}
+
+static enum hrtimer_restart isotp_txfr_timer_handler(struct hrtimer *hrtimer)
+{
+ struct isotp_sock *so = container_of(hrtimer, struct isotp_sock,
+ txfrtimer);
+
+ /* start echo timeout handling and cover below protocol error */
+ hrtimer_start(&so->txtimer, ktime_set(ISOTP_ECHO_TIMEOUT, 0),
+ HRTIMER_MODE_REL_SOFT);
+
+ /* cfecho should be consumed by isotp_rcv_echo() here */
+ if (so->tx.state == ISOTP_SENDING && !so->cfecho)
+ isotp_send_cframe(so);
+
+ return HRTIMER_NORESTART;
+}
+
+static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
+{
+ struct sock *sk = sock->sk;
+ struct isotp_sock *so = isotp_sk(sk);
+ struct sk_buff *skb;
+ struct net_device *dev;
+ struct canfd_frame *cf;
+ int ae = (so->opt.flags & CAN_ISOTP_EXTEND_ADDR) ? 1 : 0;
+ int wait_tx_done = (so->opt.flags & CAN_ISOTP_WAIT_TX_DONE) ? 1 : 0;
+ s64 hrtimer_sec = ISOTP_ECHO_TIMEOUT;
+ int off;
+ int err;
+
+ if (!so->bound || so->tx.state == ISOTP_SHUTDOWN)
+ return -EADDRNOTAVAIL;
+
+ while (cmpxchg(&so->tx.state, ISOTP_IDLE, ISOTP_SENDING) != ISOTP_IDLE) {
+ /* we do not support multiple buffers - for now */
+ if (msg->msg_flags & MSG_DONTWAIT)
+ return -EAGAIN;
+
+ if (so->tx.state == ISOTP_SHUTDOWN)
+ return -EADDRNOTAVAIL;
+
+ /* wait for complete transmission of current pdu */
+ err = wait_event_interruptible(so->wait, so->tx.state == ISOTP_IDLE);
+ if (err)
+ goto err_event_drop;
+ }
+
+ /* PDU size > default => try max_pdu_size */
+ if (size > so->tx.buflen && so->tx.buflen < max_pdu_size) {
+ u8 *newbuf = kmalloc(max_pdu_size, GFP_KERNEL);
+
+ if (newbuf) {
+ so->tx.buf = newbuf;
+ so->tx.buflen = max_pdu_size;
+ }
+ }
+
+ if (!size || size > so->tx.buflen) {
+ err = -EINVAL;
+ goto err_out_drop;
+ }
+
+ /* take care of a potential SF_DL ESC offset for TX_DL > 8 */
+ off = (so->tx.ll_dl > CAN_MAX_DLEN) ? 1 : 0;
+
+ /* does the given data fit into a single frame for SF_BROADCAST? */
+ if ((isotp_bc_flags(so) == CAN_ISOTP_SF_BROADCAST) &&
+ (size > so->tx.ll_dl - SF_PCI_SZ4 - ae - off)) {
+ err = -EINVAL;
+ goto err_out_drop;
+ }
+
+ err = memcpy_from_msg(so->tx.buf, msg, size);
+ if (err < 0)
+ goto err_out_drop;
+
+ dev = dev_get_by_index(sock_net(sk), so->ifindex);
+ if (!dev) {
+ err = -ENXIO;
+ goto err_out_drop;
+ }
+
+ skb = sock_alloc_send_skb(sk, so->ll.mtu + sizeof(struct can_skb_priv),
+ msg->msg_flags & MSG_DONTWAIT, &err);
+ if (!skb) {
+ dev_put(dev);
+ goto err_out_drop;
+ }
+
+ can_skb_reserve(skb);
+ can_skb_prv(skb)->ifindex = dev->ifindex;
+ can_skb_prv(skb)->skbcnt = 0;
+
+ so->tx.len = size;
+ so->tx.idx = 0;
+
+ cf = (struct canfd_frame *)skb->data;
+ skb_put_zero(skb, so->ll.mtu);
+
+ /* cfecho should have been zero'ed by init / former isotp_rcv_echo() */
+ if (so->cfecho)
+ pr_notice_once("can-isotp: uninit cfecho %08X\n", so->cfecho);
+
+ /* check for single frame transmission depending on TX_DL */
+ if (size <= so->tx.ll_dl - SF_PCI_SZ4 - ae - off) {
+ /* The message size generally fits into a SingleFrame - good.
+ *
+ * SF_DL ESC offset optimization:
+ *
+ * When TX_DL is greater 8 but the message would still fit
+ * into a 8 byte CAN frame, we can omit the offset.
+ * This prevents a protocol caused length extension from
+ * CAN_DL = 8 to CAN_DL = 12 due to the SF_SL ESC handling.
+ */
+ if (size <= CAN_MAX_DLEN - SF_PCI_SZ4 - ae)
+ off = 0;
+
+ isotp_fill_dataframe(cf, so, ae, off);
+
+ /* place single frame N_PCI w/o length in appropriate index */
+ cf->data[ae] = N_PCI_SF;
+
+ /* place SF_DL size value depending on the SF_DL ESC offset */
+ if (off)
+ cf->data[SF_PCI_SZ4 + ae] = size;
+ else
+ cf->data[ae] |= size;
+
+ /* set CF echo tag for isotp_rcv_echo() (SF-mode) */
+ so->cfecho = *(u32 *)cf->data;
+ } else {
+ /* send first frame */
+
+ isotp_create_fframe(cf, so, ae);
+
+ if (isotp_bc_flags(so) == CAN_ISOTP_CF_BROADCAST) {
+ /* set timer for FC-less operation (STmin = 0) */
+ if (so->opt.flags & CAN_ISOTP_FORCE_TXSTMIN)
+ so->tx_gap = ktime_set(0, so->force_tx_stmin);
+ else
+ so->tx_gap = ktime_set(0, so->frame_txtime);
+
+ /* disable wait for FCs due to activated block size */
+ so->txfc.bs = 0;
+
+ /* set CF echo tag for isotp_rcv_echo() (CF-mode) */
+ so->cfecho = *(u32 *)cf->data;
+ } else {
+ /* standard flow control check */
+ so->tx.state = ISOTP_WAIT_FIRST_FC;
+
+ /* start timeout for FC */
+ hrtimer_sec = ISOTP_FC_TIMEOUT;
+
+ /* no CF echo tag for isotp_rcv_echo() (FF-mode) */
+ so->cfecho = 0;
+ }
+ }
+
+ hrtimer_start(&so->txtimer, ktime_set(hrtimer_sec, 0),
+ HRTIMER_MODE_REL_SOFT);
+
+ /* send the first or only CAN frame */
+ cf->flags = so->ll.tx_flags;
+
+ skb->dev = dev;
+ skb->sk = sk;
+ err = can_send(skb, 1);
+ dev_put(dev);
+ if (err) {
+ pr_notice_once("can-isotp: %s: can_send_ret %pe\n",
+ __func__, ERR_PTR(err));
+
+ /* no transmission -> no timeout monitoring */
+ hrtimer_cancel(&so->txtimer);
+
+ /* reset consecutive frame echo tag */
+ so->cfecho = 0;
+
+ goto err_out_drop;
+ }
+
+ if (wait_tx_done) {
+ /* wait for complete transmission of current pdu */
+ err = wait_event_interruptible(so->wait, so->tx.state == ISOTP_IDLE);
+ if (err)
+ goto err_event_drop;
+
+ err = sock_error(sk);
+ if (err)
+ return err;
+ }
+
+ return size;
+
+err_event_drop:
+ /* got signal: force tx state machine to be idle */
+ so->tx.state = ISOTP_IDLE;
+ hrtimer_cancel(&so->txfrtimer);
+ hrtimer_cancel(&so->txtimer);
+err_out_drop:
+ /* drop this PDU and unlock a potential wait queue */
+ so->tx.state = ISOTP_IDLE;
+ wake_up_interruptible(&so->wait);
+
+ return err;
+}
+
+static int isotp_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
+ int flags)
+{
+ struct sock *sk = sock->sk;
+ struct sk_buff *skb;
+ struct isotp_sock *so = isotp_sk(sk);
+ int ret = 0;
+
+ if (flags & ~(MSG_DONTWAIT | MSG_TRUNC | MSG_PEEK | MSG_CMSG_COMPAT))
+ return -EINVAL;
+
+ if (!so->bound)
+ return -EADDRNOTAVAIL;
+
+ skb = skb_recv_datagram(sk, flags, &ret);
+ if (!skb)
+ return ret;
+
+ if (size < skb->len)
+ msg->msg_flags |= MSG_TRUNC;
+ else
+ size = skb->len;
+
+ ret = memcpy_to_msg(msg, skb->data, size);
+ if (ret < 0)
+ goto out_err;
+
+ sock_recv_cmsgs(msg, sk, skb);
+
+ if (msg->msg_name) {
+ __sockaddr_check_size(ISOTP_MIN_NAMELEN);
+ msg->msg_namelen = ISOTP_MIN_NAMELEN;
+ memcpy(msg->msg_name, skb->cb, msg->msg_namelen);
+ }
+
+ /* set length of return value */
+ ret = (flags & MSG_TRUNC) ? skb->len : size;
+
+out_err:
+ skb_free_datagram(sk, skb);
+
+ return ret;
+}
+
+static int isotp_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+ struct isotp_sock *so;
+ struct net *net;
+
+ if (!sk)
+ return 0;
+
+ so = isotp_sk(sk);
+ net = sock_net(sk);
+
+ /* wait for complete transmission of current pdu */
+ while (wait_event_interruptible(so->wait, so->tx.state == ISOTP_IDLE) == 0 &&
+ cmpxchg(&so->tx.state, ISOTP_IDLE, ISOTP_SHUTDOWN) != ISOTP_IDLE)
+ ;
+
+ /* force state machines to be idle also when a signal occurred */
+ so->tx.state = ISOTP_SHUTDOWN;
+ so->rx.state = ISOTP_IDLE;
+
+ spin_lock(&isotp_notifier_lock);
+ while (isotp_busy_notifier == so) {
+ spin_unlock(&isotp_notifier_lock);
+ schedule_timeout_uninterruptible(1);
+ spin_lock(&isotp_notifier_lock);
+ }
+ list_del(&so->notifier);
+ spin_unlock(&isotp_notifier_lock);
+
+ lock_sock(sk);
+
+ /* remove current filters & unregister */
+ if (so->bound) {
+ if (so->ifindex) {
+ struct net_device *dev;
+
+ dev = dev_get_by_index(net, so->ifindex);
+ if (dev) {
+ if (isotp_register_rxid(so))
+ can_rx_unregister(net, dev, so->rxid,
+ SINGLE_MASK(so->rxid),
+ isotp_rcv, sk);
+
+ can_rx_unregister(net, dev, so->txid,
+ SINGLE_MASK(so->txid),
+ isotp_rcv_echo, sk);
+ dev_put(dev);
+ synchronize_rcu();
+ }
+ }
+ }
+
+ hrtimer_cancel(&so->txfrtimer);
+ hrtimer_cancel(&so->txtimer);
+ hrtimer_cancel(&so->rxtimer);
+
+ so->ifindex = 0;
+ so->bound = 0;
+
+ if (so->rx.buf != so->rx.sbuf)
+ kfree(so->rx.buf);
+
+ if (so->tx.buf != so->tx.sbuf)
+ kfree(so->tx.buf);
+
+ sock_orphan(sk);
+ sock->sk = NULL;
+
+ release_sock(sk);
+ sock_prot_inuse_add(net, sk->sk_prot, -1);
+ sock_put(sk);
+
+ return 0;
+}
+
+static int isotp_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int len)
+{
+ struct sockaddr_can *addr = (struct sockaddr_can *)uaddr;
+ struct sock *sk = sock->sk;
+ struct isotp_sock *so = isotp_sk(sk);
+ struct net *net = sock_net(sk);
+ int ifindex;
+ struct net_device *dev;
+ canid_t tx_id = addr->can_addr.tp.tx_id;
+ canid_t rx_id = addr->can_addr.tp.rx_id;
+ int err = 0;
+ int notify_enetdown = 0;
+
+ if (len < ISOTP_MIN_NAMELEN)
+ return -EINVAL;
+
+ if (addr->can_family != AF_CAN)
+ return -EINVAL;
+
+ /* sanitize tx CAN identifier */
+ if (tx_id & CAN_EFF_FLAG)
+ tx_id &= (CAN_EFF_FLAG | CAN_EFF_MASK);
+ else
+ tx_id &= CAN_SFF_MASK;
+
+ /* give feedback on wrong CAN-ID value */
+ if (tx_id != addr->can_addr.tp.tx_id)
+ return -EINVAL;
+
+ /* sanitize rx CAN identifier (if needed) */
+ if (isotp_register_rxid(so)) {
+ if (rx_id & CAN_EFF_FLAG)
+ rx_id &= (CAN_EFF_FLAG | CAN_EFF_MASK);
+ else
+ rx_id &= CAN_SFF_MASK;
+
+ /* give feedback on wrong CAN-ID value */
+ if (rx_id != addr->can_addr.tp.rx_id)
+ return -EINVAL;
+ }
+
+ if (!addr->can_ifindex)
+ return -ENODEV;
+
+ lock_sock(sk);
+
+ if (so->bound) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ /* ensure different CAN IDs when the rx_id is to be registered */
+ if (isotp_register_rxid(so) && rx_id == tx_id) {
+ err = -EADDRNOTAVAIL;
+ goto out;
+ }
+
+ dev = dev_get_by_index(net, addr->can_ifindex);
+ if (!dev) {
+ err = -ENODEV;
+ goto out;
+ }
+ if (dev->type != ARPHRD_CAN) {
+ dev_put(dev);
+ err = -ENODEV;
+ goto out;
+ }
+ if (READ_ONCE(dev->mtu) < so->ll.mtu) {
+ dev_put(dev);
+ err = -EINVAL;
+ goto out;
+ }
+ if (!(dev->flags & IFF_UP))
+ notify_enetdown = 1;
+
+ ifindex = dev->ifindex;
+
+ if (isotp_register_rxid(so))
+ can_rx_register(net, dev, rx_id, SINGLE_MASK(rx_id),
+ isotp_rcv, sk, "isotp", sk);
+
+ /* no consecutive frame echo skb in flight */
+ so->cfecho = 0;
+
+ /* register for echo skb's */
+ can_rx_register(net, dev, tx_id, SINGLE_MASK(tx_id),
+ isotp_rcv_echo, sk, "isotpe", sk);
+
+ dev_put(dev);
+
+ /* switch to new settings */
+ so->ifindex = ifindex;
+ so->rxid = rx_id;
+ so->txid = tx_id;
+ so->bound = 1;
+
+out:
+ release_sock(sk);
+
+ if (notify_enetdown) {
+ sk->sk_err = ENETDOWN;
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk_error_report(sk);
+ }
+
+ return err;
+}
+
+static int isotp_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
+{
+ struct sockaddr_can *addr = (struct sockaddr_can *)uaddr;
+ struct sock *sk = sock->sk;
+ struct isotp_sock *so = isotp_sk(sk);
+
+ if (peer)
+ return -EOPNOTSUPP;
+
+ memset(addr, 0, ISOTP_MIN_NAMELEN);
+ addr->can_family = AF_CAN;
+ addr->can_ifindex = so->ifindex;
+ addr->can_addr.tp.rx_id = so->rxid;
+ addr->can_addr.tp.tx_id = so->txid;
+
+ return ISOTP_MIN_NAMELEN;
+}
+
+static int isotp_setsockopt_locked(struct socket *sock, int level, int optname,
+ sockptr_t optval, unsigned int optlen)
+{
+ struct sock *sk = sock->sk;
+ struct isotp_sock *so = isotp_sk(sk);
+ int ret = 0;
+
+ if (so->bound)
+ return -EISCONN;
+
+ switch (optname) {
+ case CAN_ISOTP_OPTS:
+ if (optlen != sizeof(struct can_isotp_options))
+ return -EINVAL;
+
+ if (copy_from_sockptr(&so->opt, optval, optlen))
+ return -EFAULT;
+
+ /* no separate rx_ext_address is given => use ext_address */
+ if (!(so->opt.flags & CAN_ISOTP_RX_EXT_ADDR))
+ so->opt.rx_ext_address = so->opt.ext_address;
+
+ /* these broadcast flags are not allowed together */
+ if (isotp_bc_flags(so) == ISOTP_ALL_BC_FLAGS) {
+ /* CAN_ISOTP_SF_BROADCAST is prioritized */
+ so->opt.flags &= ~CAN_ISOTP_CF_BROADCAST;
+
+ /* give user feedback on wrong config attempt */
+ ret = -EINVAL;
+ }
+
+ /* check for frame_txtime changes (0 => no changes) */
+ if (so->opt.frame_txtime) {
+ if (so->opt.frame_txtime == CAN_ISOTP_FRAME_TXTIME_ZERO)
+ so->frame_txtime = 0;
+ else
+ so->frame_txtime = so->opt.frame_txtime;
+ }
+ break;
+
+ case CAN_ISOTP_RECV_FC:
+ if (optlen != sizeof(struct can_isotp_fc_options))
+ return -EINVAL;
+
+ if (copy_from_sockptr(&so->rxfc, optval, optlen))
+ return -EFAULT;
+ break;
+
+ case CAN_ISOTP_TX_STMIN:
+ if (optlen != sizeof(u32))
+ return -EINVAL;
+
+ if (copy_from_sockptr(&so->force_tx_stmin, optval, optlen))
+ return -EFAULT;
+ break;
+
+ case CAN_ISOTP_RX_STMIN:
+ if (optlen != sizeof(u32))
+ return -EINVAL;
+
+ if (copy_from_sockptr(&so->force_rx_stmin, optval, optlen))
+ return -EFAULT;
+ break;
+
+ case CAN_ISOTP_LL_OPTS:
+ if (optlen == sizeof(struct can_isotp_ll_options)) {
+ struct can_isotp_ll_options ll;
+
+ if (copy_from_sockptr(&ll, optval, optlen))
+ return -EFAULT;
+
+ /* check for correct ISO 11898-1 DLC data length */
+ if (ll.tx_dl != padlen(ll.tx_dl))
+ return -EINVAL;
+
+ if (ll.mtu != CAN_MTU && ll.mtu != CANFD_MTU)
+ return -EINVAL;
+
+ if (ll.mtu == CAN_MTU &&
+ (ll.tx_dl > CAN_MAX_DLEN || ll.tx_flags != 0))
+ return -EINVAL;
+
+ memcpy(&so->ll, &ll, sizeof(ll));
+
+ /* set ll_dl for tx path to similar place as for rx */
+ so->tx.ll_dl = ll.tx_dl;
+ } else {
+ return -EINVAL;
+ }
+ break;
+
+ default:
+ ret = -ENOPROTOOPT;
+ }
+
+ return ret;
+}
+
+static int isotp_setsockopt(struct socket *sock, int level, int optname,
+ sockptr_t optval, unsigned int optlen)
+
+{
+ struct sock *sk = sock->sk;
+ int ret;
+
+ if (level != SOL_CAN_ISOTP)
+ return -EINVAL;
+
+ lock_sock(sk);
+ ret = isotp_setsockopt_locked(sock, level, optname, optval, optlen);
+ release_sock(sk);
+ return ret;
+}
+
+static int isotp_getsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ struct sock *sk = sock->sk;
+ struct isotp_sock *so = isotp_sk(sk);
+ int len;
+ void *val;
+
+ if (level != SOL_CAN_ISOTP)
+ return -EINVAL;
+ if (get_user(len, optlen))
+ return -EFAULT;
+ if (len < 0)
+ return -EINVAL;
+
+ switch (optname) {
+ case CAN_ISOTP_OPTS:
+ len = min_t(int, len, sizeof(struct can_isotp_options));
+ val = &so->opt;
+ break;
+
+ case CAN_ISOTP_RECV_FC:
+ len = min_t(int, len, sizeof(struct can_isotp_fc_options));
+ val = &so->rxfc;
+ break;
+
+ case CAN_ISOTP_TX_STMIN:
+ len = min_t(int, len, sizeof(u32));
+ val = &so->force_tx_stmin;
+ break;
+
+ case CAN_ISOTP_RX_STMIN:
+ len = min_t(int, len, sizeof(u32));
+ val = &so->force_rx_stmin;
+ break;
+
+ case CAN_ISOTP_LL_OPTS:
+ len = min_t(int, len, sizeof(struct can_isotp_ll_options));
+ val = &so->ll;
+ break;
+
+ default:
+ return -ENOPROTOOPT;
+ }
+
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, val, len))
+ return -EFAULT;
+ return 0;
+}
+
+static void isotp_notify(struct isotp_sock *so, unsigned long msg,
+ struct net_device *dev)
+{
+ struct sock *sk = &so->sk;
+
+ if (!net_eq(dev_net(dev), sock_net(sk)))
+ return;
+
+ if (so->ifindex != dev->ifindex)
+ return;
+
+ switch (msg) {
+ case NETDEV_UNREGISTER:
+ lock_sock(sk);
+ /* remove current filters & unregister */
+ if (so->bound) {
+ if (isotp_register_rxid(so))
+ can_rx_unregister(dev_net(dev), dev, so->rxid,
+ SINGLE_MASK(so->rxid),
+ isotp_rcv, sk);
+
+ can_rx_unregister(dev_net(dev), dev, so->txid,
+ SINGLE_MASK(so->txid),
+ isotp_rcv_echo, sk);
+ }
+
+ so->ifindex = 0;
+ so->bound = 0;
+ release_sock(sk);
+
+ sk->sk_err = ENODEV;
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk_error_report(sk);
+ break;
+
+ case NETDEV_DOWN:
+ sk->sk_err = ENETDOWN;
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk_error_report(sk);
+ break;
+ }
+}
+
+static int isotp_notifier(struct notifier_block *nb, unsigned long msg,
+ void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+
+ if (dev->type != ARPHRD_CAN)
+ return NOTIFY_DONE;
+ if (msg != NETDEV_UNREGISTER && msg != NETDEV_DOWN)
+ return NOTIFY_DONE;
+ if (unlikely(isotp_busy_notifier)) /* Check for reentrant bug. */
+ return NOTIFY_DONE;
+
+ spin_lock(&isotp_notifier_lock);
+ list_for_each_entry(isotp_busy_notifier, &isotp_notifier_list, notifier) {
+ spin_unlock(&isotp_notifier_lock);
+ isotp_notify(isotp_busy_notifier, msg, dev);
+ spin_lock(&isotp_notifier_lock);
+ }
+ isotp_busy_notifier = NULL;
+ spin_unlock(&isotp_notifier_lock);
+ return NOTIFY_DONE;
+}
+
+static int isotp_init(struct sock *sk)
+{
+ struct isotp_sock *so = isotp_sk(sk);
+
+ so->ifindex = 0;
+ so->bound = 0;
+
+ so->opt.flags = CAN_ISOTP_DEFAULT_FLAGS;
+ so->opt.ext_address = CAN_ISOTP_DEFAULT_EXT_ADDRESS;
+ so->opt.rx_ext_address = CAN_ISOTP_DEFAULT_EXT_ADDRESS;
+ so->opt.rxpad_content = CAN_ISOTP_DEFAULT_PAD_CONTENT;
+ so->opt.txpad_content = CAN_ISOTP_DEFAULT_PAD_CONTENT;
+ so->opt.frame_txtime = CAN_ISOTP_DEFAULT_FRAME_TXTIME;
+ so->frame_txtime = CAN_ISOTP_DEFAULT_FRAME_TXTIME;
+ so->rxfc.bs = CAN_ISOTP_DEFAULT_RECV_BS;
+ so->rxfc.stmin = CAN_ISOTP_DEFAULT_RECV_STMIN;
+ so->rxfc.wftmax = CAN_ISOTP_DEFAULT_RECV_WFTMAX;
+ so->ll.mtu = CAN_ISOTP_DEFAULT_LL_MTU;
+ so->ll.tx_dl = CAN_ISOTP_DEFAULT_LL_TX_DL;
+ so->ll.tx_flags = CAN_ISOTP_DEFAULT_LL_TX_FLAGS;
+
+ /* set ll_dl for tx path to similar place as for rx */
+ so->tx.ll_dl = so->ll.tx_dl;
+
+ so->rx.state = ISOTP_IDLE;
+ so->tx.state = ISOTP_IDLE;
+
+ so->rx.buf = so->rx.sbuf;
+ so->tx.buf = so->tx.sbuf;
+ so->rx.buflen = ARRAY_SIZE(so->rx.sbuf);
+ so->tx.buflen = ARRAY_SIZE(so->tx.sbuf);
+
+ hrtimer_setup(&so->rxtimer, isotp_rx_timer_handler, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT);
+ hrtimer_setup(&so->txtimer, isotp_tx_timer_handler, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT);
+ hrtimer_setup(&so->txfrtimer, isotp_txfr_timer_handler, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL_SOFT);
+
+ init_waitqueue_head(&so->wait);
+ spin_lock_init(&so->rx_lock);
+
+ spin_lock(&isotp_notifier_lock);
+ list_add_tail(&so->notifier, &isotp_notifier_list);
+ spin_unlock(&isotp_notifier_lock);
+
+ return 0;
+}
+
+static __poll_t isotp_poll(struct file *file, struct socket *sock, poll_table *wait)
+{
+ struct sock *sk = sock->sk;
+ struct isotp_sock *so = isotp_sk(sk);
+
+ __poll_t mask = datagram_poll(file, sock, wait);
+ poll_wait(file, &so->wait, wait);
+
+ /* Check for false positives due to TX state */
+ if ((mask & EPOLLWRNORM) && (so->tx.state != ISOTP_IDLE))
+ mask &= ~(EPOLLOUT | EPOLLWRNORM);
+
+ return mask;
+}
+
+static int isotp_sock_no_ioctlcmd(struct socket *sock, unsigned int cmd,
+ unsigned long arg)
+{
+ /* no ioctls for socket layer -> hand it down to NIC layer */
+ return -ENOIOCTLCMD;
+}
+
+static const struct proto_ops isotp_ops = {
+ .family = PF_CAN,
+ .release = isotp_release,
+ .bind = isotp_bind,
+ .connect = sock_no_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .getname = isotp_getname,
+ .poll = isotp_poll,
+ .ioctl = isotp_sock_no_ioctlcmd,
+ .gettstamp = sock_gettstamp,
+ .listen = sock_no_listen,
+ .shutdown = sock_no_shutdown,
+ .setsockopt = isotp_setsockopt,
+ .getsockopt = isotp_getsockopt,
+ .sendmsg = isotp_sendmsg,
+ .recvmsg = isotp_recvmsg,
+ .mmap = sock_no_mmap,
+};
+
+static struct proto isotp_proto __read_mostly = {
+ .name = "CAN_ISOTP",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof(struct isotp_sock),
+ .init = isotp_init,
+};
+
+static const struct can_proto isotp_can_proto = {
+ .type = SOCK_DGRAM,
+ .protocol = CAN_ISOTP,
+ .ops = &isotp_ops,
+ .prot = &isotp_proto,
+};
+
+static struct notifier_block canisotp_notifier = {
+ .notifier_call = isotp_notifier
+};
+
+static __init int isotp_module_init(void)
+{
+ int err;
+
+ max_pdu_size = max_t(unsigned int, max_pdu_size, MAX_12BIT_PDU_SIZE);
+ max_pdu_size = min_t(unsigned int, max_pdu_size, MAX_PDU_SIZE);
+
+ pr_info("can: isotp protocol (max_pdu_size %d)\n", max_pdu_size);
+
+ err = can_proto_register(&isotp_can_proto);
+ if (err < 0)
+ pr_err("can: registration of isotp protocol failed %pe\n", ERR_PTR(err));
+ else
+ register_netdevice_notifier(&canisotp_notifier);
+
+ return err;
+}
+
+static __exit void isotp_module_exit(void)
+{
+ can_proto_unregister(&isotp_can_proto);
+ unregister_netdevice_notifier(&canisotp_notifier);
+}
+
+module_init(isotp_module_init);
+module_exit(isotp_module_exit);
diff --git a/net/can/j1939/Kconfig b/net/can/j1939/Kconfig
new file mode 100644
index 000000000000..2998298b71ec
--- /dev/null
+++ b/net/can/j1939/Kconfig
@@ -0,0 +1,15 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# SAE J1939 network layer core configuration
+#
+
+config CAN_J1939
+ tristate "SAE J1939"
+ depends on CAN
+ help
+ SAE J1939
+ Say Y to have in-kernel support for j1939 socket type. This
+ allows communication according to SAE j1939.
+ The relevant parts in kernel are
+ SAE j1939-21 (datalink & transport protocol)
+ & SAE j1939-81 (network management).
diff --git a/net/can/j1939/Makefile b/net/can/j1939/Makefile
new file mode 100644
index 000000000000..19181bdae173
--- /dev/null
+++ b/net/can/j1939/Makefile
@@ -0,0 +1,10 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-$(CONFIG_CAN_J1939) += can-j1939.o
+
+can-j1939-objs := \
+ address-claim.o \
+ bus.o \
+ main.o \
+ socket.o \
+ transport.o
diff --git a/net/can/j1939/address-claim.c b/net/can/j1939/address-claim.c
new file mode 100644
index 000000000000..ca4ad6cdd5cb
--- /dev/null
+++ b/net/can/j1939/address-claim.c
@@ -0,0 +1,270 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2010-2011 EIA Electronics,
+// Kurt Van Dijck <kurt.van.dijck@eia.be>
+// Copyright (c) 2010-2011 EIA Electronics,
+// Pieter Beyens <pieter.beyens@eia.be>
+// Copyright (c) 2017-2019 Pengutronix,
+// Marc Kleine-Budde <kernel@pengutronix.de>
+// Copyright (c) 2017-2019 Pengutronix,
+// Oleksij Rempel <kernel@pengutronix.de>
+
+/* J1939 Address Claiming.
+ * Address Claiming in the kernel
+ * - keeps track of the AC states of ECU's,
+ * - resolves NAME<=>SA taking into account the AC states of ECU's.
+ *
+ * All Address Claim msgs (including host-originated msg) are processed
+ * at the receive path (a sent msg is always received again via CAN echo).
+ * As such, the processing of AC msgs is done in the order on which msgs
+ * are sent on the bus.
+ *
+ * This module doesn't send msgs itself (e.g. replies on Address Claims),
+ * this is the responsibility of a user space application or daemon.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+
+#include "j1939-priv.h"
+
+static inline name_t j1939_skb_to_name(const struct sk_buff *skb)
+{
+ return le64_to_cpup((__le64 *)skb->data);
+}
+
+static inline bool j1939_ac_msg_is_request(struct sk_buff *skb)
+{
+ struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb);
+ int req_pgn;
+
+ if (skb->len < 3 || skcb->addr.pgn != J1939_PGN_REQUEST)
+ return false;
+
+ req_pgn = skb->data[0] | (skb->data[1] << 8) | (skb->data[2] << 16);
+
+ return req_pgn == J1939_PGN_ADDRESS_CLAIMED;
+}
+
+static int j1939_ac_verify_outgoing(struct j1939_priv *priv,
+ struct sk_buff *skb)
+{
+ struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb);
+
+ if (skb->len != 8) {
+ netdev_notice(priv->ndev, "tx address claim with dlc %i\n",
+ skb->len);
+ return -EPROTO;
+ }
+
+ if (skcb->addr.src_name != j1939_skb_to_name(skb)) {
+ netdev_notice(priv->ndev, "tx address claim with different name\n");
+ return -EPROTO;
+ }
+
+ if (skcb->addr.sa == J1939_NO_ADDR) {
+ netdev_notice(priv->ndev, "tx address claim with broadcast sa\n");
+ return -EPROTO;
+ }
+
+ /* ac must always be a broadcast */
+ if (skcb->addr.dst_name || skcb->addr.da != J1939_NO_ADDR) {
+ netdev_notice(priv->ndev, "tx address claim with dest, not broadcast\n");
+ return -EPROTO;
+ }
+ return 0;
+}
+
+int j1939_ac_fixup(struct j1939_priv *priv, struct sk_buff *skb)
+{
+ struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb);
+ int ret;
+ u8 addr;
+
+ /* network mgmt: address claiming msgs */
+ if (skcb->addr.pgn == J1939_PGN_ADDRESS_CLAIMED) {
+ struct j1939_ecu *ecu;
+
+ ret = j1939_ac_verify_outgoing(priv, skb);
+ /* return both when failure & when successful */
+ if (ret < 0)
+ return ret;
+ ecu = j1939_ecu_get_by_name(priv, skcb->addr.src_name);
+ if (!ecu)
+ return -ENODEV;
+
+ if (ecu->addr != skcb->addr.sa)
+ /* hold further traffic for ecu, remove from parent */
+ j1939_ecu_unmap(ecu);
+ j1939_ecu_put(ecu);
+ } else if (skcb->addr.src_name) {
+ /* assign source address */
+ addr = j1939_name_to_addr(priv, skcb->addr.src_name);
+ if (!j1939_address_is_unicast(addr) &&
+ !j1939_ac_msg_is_request(skb)) {
+ netdev_notice(priv->ndev, "tx drop: invalid sa for name 0x%016llx\n",
+ skcb->addr.src_name);
+ return -EADDRNOTAVAIL;
+ }
+ skcb->addr.sa = addr;
+ }
+
+ /* assign destination address */
+ if (skcb->addr.dst_name) {
+ addr = j1939_name_to_addr(priv, skcb->addr.dst_name);
+ if (!j1939_address_is_unicast(addr)) {
+ netdev_notice(priv->ndev, "tx drop: invalid da for name 0x%016llx\n",
+ skcb->addr.dst_name);
+ return -EADDRNOTAVAIL;
+ }
+ skcb->addr.da = addr;
+ }
+ return 0;
+}
+
+static void j1939_ac_process(struct j1939_priv *priv, struct sk_buff *skb)
+{
+ struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb);
+ struct j1939_ecu *ecu, *prev;
+ name_t name;
+
+ if (skb->len != 8) {
+ netdev_notice(priv->ndev, "rx address claim with wrong dlc %i\n",
+ skb->len);
+ return;
+ }
+
+ name = j1939_skb_to_name(skb);
+ skcb->addr.src_name = name;
+ if (!name) {
+ netdev_notice(priv->ndev, "rx address claim without name\n");
+ return;
+ }
+
+ if (!j1939_address_is_valid(skcb->addr.sa)) {
+ netdev_notice(priv->ndev, "rx address claim with broadcast sa\n");
+ return;
+ }
+
+ write_lock_bh(&priv->lock);
+
+ /* Few words on the ECU ref counting:
+ *
+ * First we get an ECU handle, either with
+ * j1939_ecu_get_by_name_locked() (increments the ref counter)
+ * or j1939_ecu_create_locked() (initializes an ECU object
+ * with a ref counter of 1).
+ *
+ * j1939_ecu_unmap_locked() will decrement the ref counter,
+ * but only if the ECU was mapped before. So "ecu" still
+ * belongs to us.
+ *
+ * j1939_ecu_timer_start() will increment the ref counter
+ * before it starts the timer, so we can put the ecu when
+ * leaving this function.
+ */
+ ecu = j1939_ecu_get_by_name_locked(priv, name);
+
+ if (ecu && ecu->addr == skcb->addr.sa) {
+ /* The ISO 11783-5 standard, in "4.5.2 - Address claim
+ * requirements", states:
+ * d) No CF shall begin, or resume, transmission on the
+ * network until 250 ms after it has successfully claimed
+ * an address except when responding to a request for
+ * address-claimed.
+ *
+ * But "Figure 6" and "Figure 7" in "4.5.4.2 - Address-claim
+ * prioritization" show that the CF begins the transmission
+ * after 250 ms from the first AC (address-claimed) message
+ * even if it sends another AC message during that time window
+ * to resolve the address contention with another CF.
+ *
+ * As stated in "4.4.2.3 - Address-claimed message":
+ * In order to successfully claim an address, the CF sending
+ * an address claimed message shall not receive a contending
+ * claim from another CF for at least 250 ms.
+ *
+ * As stated in "4.4.3.2 - NAME management (NM) message":
+ * 1) A commanding CF can
+ * d) request that a CF with a specified NAME transmit
+ * the address-claimed message with its current NAME.
+ * 2) A target CF shall
+ * d) send an address-claimed message in response to a
+ * request for a matching NAME
+ *
+ * Taking the above arguments into account, the 250 ms wait is
+ * requested only during network initialization.
+ *
+ * Do not restart the timer on AC message if both the NAME and
+ * the address match and so if the address has already been
+ * claimed (timer has expired) or the AC message has been sent
+ * to resolve the contention with another CF (timer is still
+ * running).
+ */
+ goto out_ecu_put;
+ }
+
+ if (!ecu && j1939_address_is_unicast(skcb->addr.sa))
+ ecu = j1939_ecu_create_locked(priv, name);
+
+ if (IS_ERR_OR_NULL(ecu))
+ goto out_unlock_bh;
+
+ /* cancel pending (previous) address claim */
+ j1939_ecu_timer_cancel(ecu);
+
+ if (j1939_address_is_idle(skcb->addr.sa)) {
+ j1939_ecu_unmap_locked(ecu);
+ goto out_ecu_put;
+ }
+
+ /* save new addr */
+ if (ecu->addr != skcb->addr.sa)
+ j1939_ecu_unmap_locked(ecu);
+ ecu->addr = skcb->addr.sa;
+
+ prev = j1939_ecu_get_by_addr_locked(priv, skcb->addr.sa);
+ if (prev) {
+ if (ecu->name > prev->name) {
+ j1939_ecu_unmap_locked(ecu);
+ j1939_ecu_put(prev);
+ goto out_ecu_put;
+ } else {
+ /* kick prev if less or equal */
+ j1939_ecu_unmap_locked(prev);
+ j1939_ecu_put(prev);
+ }
+ }
+
+ j1939_ecu_timer_start(ecu);
+ out_ecu_put:
+ j1939_ecu_put(ecu);
+ out_unlock_bh:
+ write_unlock_bh(&priv->lock);
+}
+
+void j1939_ac_recv(struct j1939_priv *priv, struct sk_buff *skb)
+{
+ struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb);
+ struct j1939_ecu *ecu;
+
+ /* network mgmt */
+ if (skcb->addr.pgn == J1939_PGN_ADDRESS_CLAIMED) {
+ j1939_ac_process(priv, skb);
+ } else if (j1939_address_is_unicast(skcb->addr.sa)) {
+ /* assign source name */
+ ecu = j1939_ecu_get_by_addr(priv, skcb->addr.sa);
+ if (ecu) {
+ skcb->addr.src_name = ecu->name;
+ j1939_ecu_put(ecu);
+ }
+ }
+
+ /* assign destination name */
+ ecu = j1939_ecu_get_by_addr(priv, skcb->addr.da);
+ if (ecu) {
+ skcb->addr.dst_name = ecu->name;
+ j1939_ecu_put(ecu);
+ }
+}
diff --git a/net/can/j1939/bus.c b/net/can/j1939/bus.c
new file mode 100644
index 000000000000..797719cb227e
--- /dev/null
+++ b/net/can/j1939/bus.c
@@ -0,0 +1,336 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2010-2011 EIA Electronics,
+// Kurt Van Dijck <kurt.van.dijck@eia.be>
+// Copyright (c) 2017-2019 Pengutronix,
+// Marc Kleine-Budde <kernel@pengutronix.de>
+// Copyright (c) 2017-2019 Pengutronix,
+// Oleksij Rempel <kernel@pengutronix.de>
+
+/* bus for j1939 remote devices
+ * Since rtnetlink, no real bus is used.
+ */
+
+#include <net/sock.h>
+
+#include "j1939-priv.h"
+
+static void __j1939_ecu_release(struct kref *kref)
+{
+ struct j1939_ecu *ecu = container_of(kref, struct j1939_ecu, kref);
+ struct j1939_priv *priv = ecu->priv;
+
+ list_del(&ecu->list);
+ kfree(ecu);
+ j1939_priv_put(priv);
+}
+
+void j1939_ecu_put(struct j1939_ecu *ecu)
+{
+ kref_put(&ecu->kref, __j1939_ecu_release);
+}
+
+static void j1939_ecu_get(struct j1939_ecu *ecu)
+{
+ kref_get(&ecu->kref);
+}
+
+static bool j1939_ecu_is_mapped_locked(struct j1939_ecu *ecu)
+{
+ struct j1939_priv *priv = ecu->priv;
+
+ lockdep_assert_held(&priv->lock);
+
+ return j1939_ecu_find_by_addr_locked(priv, ecu->addr) == ecu;
+}
+
+/* ECU device interface */
+/* map ECU to a bus address space */
+static void j1939_ecu_map_locked(struct j1939_ecu *ecu)
+{
+ struct j1939_priv *priv = ecu->priv;
+ struct j1939_addr_ent *ent;
+
+ lockdep_assert_held(&priv->lock);
+
+ if (!j1939_address_is_unicast(ecu->addr))
+ return;
+
+ ent = &priv->ents[ecu->addr];
+
+ if (ent->ecu) {
+ netdev_warn(priv->ndev, "Trying to map already mapped ECU, addr: 0x%02x, name: 0x%016llx. Skip it.\n",
+ ecu->addr, ecu->name);
+ return;
+ }
+
+ j1939_ecu_get(ecu);
+ ent->ecu = ecu;
+ ent->nusers += ecu->nusers;
+}
+
+/* unmap ECU from a bus address space */
+void j1939_ecu_unmap_locked(struct j1939_ecu *ecu)
+{
+ struct j1939_priv *priv = ecu->priv;
+ struct j1939_addr_ent *ent;
+
+ lockdep_assert_held(&priv->lock);
+
+ if (!j1939_address_is_unicast(ecu->addr))
+ return;
+
+ if (!j1939_ecu_is_mapped_locked(ecu))
+ return;
+
+ ent = &priv->ents[ecu->addr];
+ ent->ecu = NULL;
+ ent->nusers -= ecu->nusers;
+ j1939_ecu_put(ecu);
+}
+
+void j1939_ecu_unmap(struct j1939_ecu *ecu)
+{
+ write_lock_bh(&ecu->priv->lock);
+ j1939_ecu_unmap_locked(ecu);
+ write_unlock_bh(&ecu->priv->lock);
+}
+
+void j1939_ecu_unmap_all(struct j1939_priv *priv)
+{
+ int i;
+
+ write_lock_bh(&priv->lock);
+ for (i = 0; i < ARRAY_SIZE(priv->ents); i++)
+ if (priv->ents[i].ecu)
+ j1939_ecu_unmap_locked(priv->ents[i].ecu);
+ write_unlock_bh(&priv->lock);
+}
+
+void j1939_ecu_timer_start(struct j1939_ecu *ecu)
+{
+ /* The ECU is held here and released in the
+ * j1939_ecu_timer_handler() or j1939_ecu_timer_cancel().
+ */
+ j1939_ecu_get(ecu);
+
+ /* Schedule timer in 250 msec to commit address change. */
+ hrtimer_start(&ecu->ac_timer, ms_to_ktime(250),
+ HRTIMER_MODE_REL_SOFT);
+}
+
+void j1939_ecu_timer_cancel(struct j1939_ecu *ecu)
+{
+ if (hrtimer_cancel(&ecu->ac_timer))
+ j1939_ecu_put(ecu);
+}
+
+static enum hrtimer_restart j1939_ecu_timer_handler(struct hrtimer *hrtimer)
+{
+ struct j1939_ecu *ecu =
+ container_of(hrtimer, struct j1939_ecu, ac_timer);
+ struct j1939_priv *priv = ecu->priv;
+
+ write_lock_bh(&priv->lock);
+ /* TODO: can we test if ecu->addr is unicast before starting
+ * the timer?
+ */
+ j1939_ecu_map_locked(ecu);
+
+ /* The corresponding j1939_ecu_get() is in
+ * j1939_ecu_timer_start().
+ */
+ j1939_ecu_put(ecu);
+ write_unlock_bh(&priv->lock);
+
+ return HRTIMER_NORESTART;
+}
+
+struct j1939_ecu *j1939_ecu_create_locked(struct j1939_priv *priv, name_t name)
+{
+ struct j1939_ecu *ecu;
+
+ lockdep_assert_held(&priv->lock);
+
+ ecu = kzalloc(sizeof(*ecu), gfp_any());
+ if (!ecu)
+ return ERR_PTR(-ENOMEM);
+ kref_init(&ecu->kref);
+ ecu->addr = J1939_IDLE_ADDR;
+ ecu->name = name;
+
+ hrtimer_setup(&ecu->ac_timer, j1939_ecu_timer_handler, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL_SOFT);
+ INIT_LIST_HEAD(&ecu->list);
+
+ j1939_priv_get(priv);
+ ecu->priv = priv;
+ list_add_tail(&ecu->list, &priv->ecus);
+
+ return ecu;
+}
+
+struct j1939_ecu *j1939_ecu_find_by_addr_locked(struct j1939_priv *priv,
+ u8 addr)
+{
+ lockdep_assert_held(&priv->lock);
+
+ return priv->ents[addr].ecu;
+}
+
+struct j1939_ecu *j1939_ecu_get_by_addr_locked(struct j1939_priv *priv, u8 addr)
+{
+ struct j1939_ecu *ecu;
+
+ lockdep_assert_held(&priv->lock);
+
+ if (!j1939_address_is_unicast(addr))
+ return NULL;
+
+ ecu = j1939_ecu_find_by_addr_locked(priv, addr);
+ if (ecu)
+ j1939_ecu_get(ecu);
+
+ return ecu;
+}
+
+struct j1939_ecu *j1939_ecu_get_by_addr(struct j1939_priv *priv, u8 addr)
+{
+ struct j1939_ecu *ecu;
+
+ read_lock_bh(&priv->lock);
+ ecu = j1939_ecu_get_by_addr_locked(priv, addr);
+ read_unlock_bh(&priv->lock);
+
+ return ecu;
+}
+
+/* get pointer to ecu without increasing ref counter */
+static struct j1939_ecu *j1939_ecu_find_by_name_locked(struct j1939_priv *priv,
+ name_t name)
+{
+ struct j1939_ecu *ecu;
+
+ lockdep_assert_held(&priv->lock);
+
+ list_for_each_entry(ecu, &priv->ecus, list) {
+ if (ecu->name == name)
+ return ecu;
+ }
+
+ return NULL;
+}
+
+struct j1939_ecu *j1939_ecu_get_by_name_locked(struct j1939_priv *priv,
+ name_t name)
+{
+ struct j1939_ecu *ecu;
+
+ lockdep_assert_held(&priv->lock);
+
+ if (!name)
+ return NULL;
+
+ ecu = j1939_ecu_find_by_name_locked(priv, name);
+ if (ecu)
+ j1939_ecu_get(ecu);
+
+ return ecu;
+}
+
+struct j1939_ecu *j1939_ecu_get_by_name(struct j1939_priv *priv, name_t name)
+{
+ struct j1939_ecu *ecu;
+
+ read_lock_bh(&priv->lock);
+ ecu = j1939_ecu_get_by_name_locked(priv, name);
+ read_unlock_bh(&priv->lock);
+
+ return ecu;
+}
+
+u8 j1939_name_to_addr(struct j1939_priv *priv, name_t name)
+{
+ struct j1939_ecu *ecu;
+ int addr = J1939_IDLE_ADDR;
+
+ if (!name)
+ return J1939_NO_ADDR;
+
+ read_lock_bh(&priv->lock);
+ ecu = j1939_ecu_find_by_name_locked(priv, name);
+ if (ecu && j1939_ecu_is_mapped_locked(ecu))
+ /* ecu's SA is registered */
+ addr = ecu->addr;
+
+ read_unlock_bh(&priv->lock);
+
+ return addr;
+}
+
+/* TX addr/name accounting
+ * Transport protocol needs to know if a SA is local or not
+ * These functions originate from userspace manipulating sockets,
+ * so locking is straigforward
+ */
+
+int j1939_local_ecu_get(struct j1939_priv *priv, name_t name, u8 sa)
+{
+ struct j1939_ecu *ecu;
+ int err = 0;
+
+ write_lock_bh(&priv->lock);
+
+ if (j1939_address_is_unicast(sa))
+ priv->ents[sa].nusers++;
+
+ if (!name)
+ goto done;
+
+ ecu = j1939_ecu_get_by_name_locked(priv, name);
+ if (!ecu)
+ ecu = j1939_ecu_create_locked(priv, name);
+ err = PTR_ERR_OR_ZERO(ecu);
+ if (err) {
+ if (j1939_address_is_unicast(sa))
+ priv->ents[sa].nusers--;
+ goto done;
+ }
+
+ ecu->nusers++;
+ /* TODO: do we care if ecu->addr != sa? */
+ if (j1939_ecu_is_mapped_locked(ecu))
+ /* ecu's sa is active already */
+ priv->ents[ecu->addr].nusers++;
+
+ done:
+ write_unlock_bh(&priv->lock);
+
+ return err;
+}
+
+void j1939_local_ecu_put(struct j1939_priv *priv, name_t name, u8 sa)
+{
+ struct j1939_ecu *ecu;
+
+ write_lock_bh(&priv->lock);
+
+ if (j1939_address_is_unicast(sa))
+ priv->ents[sa].nusers--;
+
+ if (!name)
+ goto done;
+
+ ecu = j1939_ecu_find_by_name_locked(priv, name);
+ if (WARN_ON_ONCE(!ecu))
+ goto done;
+
+ ecu->nusers--;
+ /* TODO: do we care if ecu->addr != sa? */
+ if (j1939_ecu_is_mapped_locked(ecu))
+ /* ecu's sa is active already */
+ priv->ents[ecu->addr].nusers--;
+ j1939_ecu_put(ecu);
+
+ done:
+ write_unlock_bh(&priv->lock);
+}
diff --git a/net/can/j1939/j1939-priv.h b/net/can/j1939/j1939-priv.h
new file mode 100644
index 000000000000..81f58924b4ac
--- /dev/null
+++ b/net/can/j1939/j1939-priv.h
@@ -0,0 +1,345 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+// Copyright (c) 2010-2011 EIA Electronics,
+// Kurt Van Dijck <kurt.van.dijck@eia.be>
+// Copyright (c) 2017-2019 Pengutronix,
+// Marc Kleine-Budde <kernel@pengutronix.de>
+// Copyright (c) 2017-2019 Pengutronix,
+// Oleksij Rempel <kernel@pengutronix.de>
+
+#ifndef _J1939_PRIV_H_
+#define _J1939_PRIV_H_
+
+#include <linux/can/j1939.h>
+#include <net/sock.h>
+
+/* Timeout to receive the abort signal over loop back. In case CAN
+ * bus is open, the timeout should be triggered.
+ */
+#define J1939_XTP_ABORT_TIMEOUT_MS 500
+#define J1939_SIMPLE_ECHO_TIMEOUT_MS (10 * 1000)
+
+struct j1939_session;
+enum j1939_sk_errqueue_type {
+ J1939_ERRQUEUE_TX_ACK,
+ J1939_ERRQUEUE_TX_SCHED,
+ J1939_ERRQUEUE_TX_ABORT,
+ J1939_ERRQUEUE_RX_RTS,
+ J1939_ERRQUEUE_RX_DPO,
+ J1939_ERRQUEUE_RX_ABORT,
+};
+
+/* j1939 devices */
+struct j1939_ecu {
+ struct list_head list;
+ name_t name;
+ u8 addr;
+
+ /* indicates that this ecu successfully claimed @sa as its address */
+ struct hrtimer ac_timer;
+ struct kref kref;
+ struct j1939_priv *priv;
+
+ /* count users, to help transport protocol decide for interaction */
+ int nusers;
+};
+
+struct j1939_priv {
+ struct list_head ecus;
+ /* local list entry in priv
+ * These allow irq (& softirq) context lookups on j1939 devices
+ * This approach (separate lists) is done as the other 2 alternatives
+ * are not easier or even wrong
+ * 1) using the pure kobject methods involves mutexes, which are not
+ * allowed in irq context.
+ * 2) duplicating data structures would require a lot of synchronization
+ * code
+ * usage:
+ */
+
+ /* segments need a lock to protect the above list */
+ rwlock_t lock;
+
+ struct net_device *ndev;
+
+ /* list of 256 ecu ptrs, that cache the claimed addresses.
+ * also protected by the above lock
+ */
+ struct j1939_addr_ent {
+ struct j1939_ecu *ecu;
+ /* count users, to help transport protocol */
+ int nusers;
+ } ents[256];
+
+ struct kref kref;
+
+ /* List of active sessions to prevent start of conflicting
+ * one.
+ *
+ * Do not start two sessions of same type, addresses and
+ * direction.
+ */
+ struct list_head active_session_list;
+
+ /* protects active_session_list */
+ spinlock_t active_session_list_lock;
+
+ unsigned int tp_max_packet_size;
+
+ /* lock for j1939_socks list */
+ rwlock_t j1939_socks_lock;
+ struct list_head j1939_socks;
+
+ struct kref rx_kref;
+ u32 rx_tskey;
+};
+
+void j1939_ecu_put(struct j1939_ecu *ecu);
+
+/* keep the cache of what is local */
+int j1939_local_ecu_get(struct j1939_priv *priv, name_t name, u8 sa);
+void j1939_local_ecu_put(struct j1939_priv *priv, name_t name, u8 sa);
+
+static inline bool j1939_address_is_unicast(u8 addr)
+{
+ return addr <= J1939_MAX_UNICAST_ADDR;
+}
+
+static inline bool j1939_address_is_idle(u8 addr)
+{
+ return addr == J1939_IDLE_ADDR;
+}
+
+static inline bool j1939_address_is_valid(u8 addr)
+{
+ return addr != J1939_NO_ADDR;
+}
+
+static inline bool j1939_pgn_is_pdu1(pgn_t pgn)
+{
+ /* ignore dp & res bits for this */
+ return (pgn & 0xff00) < 0xf000;
+}
+
+/* utility to correctly unmap an ECU */
+void j1939_ecu_unmap_locked(struct j1939_ecu *ecu);
+void j1939_ecu_unmap(struct j1939_ecu *ecu);
+
+u8 j1939_name_to_addr(struct j1939_priv *priv, name_t name);
+struct j1939_ecu *j1939_ecu_find_by_addr_locked(struct j1939_priv *priv,
+ u8 addr);
+struct j1939_ecu *j1939_ecu_get_by_addr(struct j1939_priv *priv, u8 addr);
+struct j1939_ecu *j1939_ecu_get_by_addr_locked(struct j1939_priv *priv,
+ u8 addr);
+struct j1939_ecu *j1939_ecu_get_by_name(struct j1939_priv *priv, name_t name);
+struct j1939_ecu *j1939_ecu_get_by_name_locked(struct j1939_priv *priv,
+ name_t name);
+
+enum j1939_transfer_type {
+ J1939_TP,
+ J1939_ETP,
+ J1939_SIMPLE,
+};
+
+struct j1939_addr {
+ name_t src_name;
+ name_t dst_name;
+ pgn_t pgn;
+
+ u8 sa;
+ u8 da;
+
+ u8 type;
+};
+
+/* control buffer of the sk_buff */
+struct j1939_sk_buff_cb {
+ /* Offset in bytes within one ETP session */
+ u32 offset;
+
+ /* for tx, MSG_SYN will be used to sync on sockets */
+ u32 msg_flags;
+ u32 tskey;
+
+ struct j1939_addr addr;
+
+ /* Flags for quick lookups during skb processing.
+ * These are set in the receive path only.
+ */
+#define J1939_ECU_LOCAL_SRC BIT(0)
+#define J1939_ECU_LOCAL_DST BIT(1)
+ u8 flags;
+
+ priority_t priority;
+};
+
+static inline
+struct j1939_sk_buff_cb *j1939_skb_to_cb(const struct sk_buff *skb)
+{
+ BUILD_BUG_ON(sizeof(struct j1939_sk_buff_cb) > sizeof(skb->cb));
+
+ return (struct j1939_sk_buff_cb *)skb->cb;
+}
+
+int j1939_send_one(struct j1939_priv *priv, struct sk_buff *skb);
+void j1939_sk_recv(struct j1939_priv *priv, struct sk_buff *skb);
+bool j1939_sk_recv_match(struct j1939_priv *priv,
+ struct j1939_sk_buff_cb *skcb);
+void j1939_sk_send_loop_abort(struct sock *sk, int err);
+void j1939_sk_errqueue(struct j1939_session *session,
+ enum j1939_sk_errqueue_type type);
+void j1939_sk_queue_activate_next(struct j1939_session *session);
+
+/* stack entries */
+struct j1939_session *j1939_tp_send(struct j1939_priv *priv,
+ struct sk_buff *skb, size_t size);
+int j1939_tp_recv(struct j1939_priv *priv, struct sk_buff *skb);
+int j1939_ac_fixup(struct j1939_priv *priv, struct sk_buff *skb);
+void j1939_ac_recv(struct j1939_priv *priv, struct sk_buff *skb);
+void j1939_simple_recv(struct j1939_priv *priv, struct sk_buff *skb);
+
+/* network management */
+struct j1939_ecu *j1939_ecu_create_locked(struct j1939_priv *priv, name_t name);
+
+void j1939_ecu_timer_start(struct j1939_ecu *ecu);
+void j1939_ecu_timer_cancel(struct j1939_ecu *ecu);
+void j1939_ecu_unmap_all(struct j1939_priv *priv);
+
+struct j1939_priv *j1939_netdev_start(struct net_device *ndev);
+void j1939_netdev_stop(struct j1939_priv *priv);
+
+void j1939_priv_put(struct j1939_priv *priv);
+void j1939_priv_get(struct j1939_priv *priv);
+
+/* notify/alert all j1939 sockets bound to ifindex */
+void j1939_sk_netdev_event_netdown(struct j1939_priv *priv);
+void j1939_sk_netdev_event_unregister(struct j1939_priv *priv);
+int j1939_cancel_active_session(struct j1939_priv *priv, struct sock *sk);
+void j1939_tp_init(struct j1939_priv *priv);
+
+/* decrement pending skb for a j1939 socket */
+void j1939_sock_pending_del(struct sock *sk);
+
+enum j1939_session_state {
+ J1939_SESSION_NEW,
+ J1939_SESSION_ACTIVE,
+ /* waiting for abort signal on the bus */
+ J1939_SESSION_WAITING_ABORT,
+ J1939_SESSION_ACTIVE_MAX,
+ J1939_SESSION_DONE,
+};
+
+struct j1939_session {
+ struct j1939_priv *priv;
+ struct list_head active_session_list_entry;
+ struct list_head sk_session_queue_entry;
+ struct kref kref;
+ struct sock *sk;
+
+ /* ifindex, src, dst, pgn define the session block
+ * the are _never_ modified after insertion in the list
+ * this decreases locking problems a _lot_
+ */
+ struct j1939_sk_buff_cb skcb;
+ struct sk_buff_head skb_queue;
+
+ /* all tx related stuff (last_txcmd, pkt.tx)
+ * is protected (modified only) with the txtimer hrtimer
+ * 'total' & 'block' are never changed,
+ * last_cmd, last & block are protected by ->lock
+ * this means that the tx may run after cts is received that should
+ * have stopped tx, but this time discrepancy is never avoided anyhow
+ */
+ u8 last_cmd, last_txcmd;
+ bool transmission;
+ bool extd;
+ /* Total message size, number of bytes */
+ unsigned int total_message_size;
+ /* Total number of bytes queue from socket to the session */
+ unsigned int total_queued_size;
+ unsigned int tx_retry;
+
+ int err;
+ u32 tskey;
+ enum j1939_session_state state;
+
+ /* Packets counters for a (extended) transfer session. The packet is
+ * maximal of 7 bytes.
+ */
+ struct {
+ /* total - total number of packets for this session */
+ unsigned int total;
+ /* last - last packet of a transfer block after which
+ * responder should send ETP.CM_CTS and originator
+ * ETP.CM_DPO
+ */
+ unsigned int last;
+ /* tx - number of packets send by originator node.
+ * this counter can be set back if responder node
+ * didn't received all packets send by originator.
+ */
+ unsigned int tx;
+ unsigned int tx_acked;
+ /* rx - number of packets received */
+ unsigned int rx;
+ /* block - amount of packets expected in one block */
+ unsigned int block;
+ /* dpo - ETP.CM_DPO, Data Packet Offset */
+ unsigned int dpo;
+ } pkt;
+ struct hrtimer txtimer, rxtimer;
+};
+
+struct j1939_sock {
+ struct sock sk; /* must be first to skip with memset */
+ struct j1939_priv *priv;
+ struct list_head list;
+
+#define J1939_SOCK_BOUND BIT(0)
+#define J1939_SOCK_CONNECTED BIT(1)
+#define J1939_SOCK_PROMISC BIT(2)
+#define J1939_SOCK_ERRQUEUE BIT(3)
+ int state;
+
+ int ifindex;
+ struct j1939_addr addr;
+ spinlock_t filters_lock;
+ struct j1939_filter *filters;
+ int nfilters;
+ pgn_t pgn_rx_filter;
+
+ /* j1939 may emit equal PGN (!= equal CAN-id's) out of order
+ * when transport protocol comes in.
+ * To allow emitting in order, keep a 'pending' nr. of packets
+ */
+ atomic_t skb_pending;
+ wait_queue_head_t waitq;
+
+ /* lock for the sk_session_queue list */
+ spinlock_t sk_session_queue_lock;
+ struct list_head sk_session_queue;
+};
+
+static inline struct j1939_sock *j1939_sk(const struct sock *sk)
+{
+ return container_of(sk, struct j1939_sock, sk);
+}
+
+void j1939_session_get(struct j1939_session *session);
+void j1939_session_put(struct j1939_session *session);
+void j1939_session_skb_queue(struct j1939_session *session,
+ struct sk_buff *skb);
+int j1939_session_activate(struct j1939_session *session);
+void j1939_tp_schedule_txtimer(struct j1939_session *session, int msec);
+void j1939_session_timers_cancel(struct j1939_session *session);
+
+#define J1939_MIN_TP_PACKET_SIZE 9
+#define J1939_MAX_TP_PACKET_SIZE (7 * 0xff)
+#define J1939_MAX_ETP_PACKET_SIZE (7 * 0x00ffffff)
+
+#define J1939_REGULAR 0
+#define J1939_EXTENDED 1
+
+/* CAN protocol */
+extern const struct can_proto j1939_can_proto;
+
+#endif /* _J1939_PRIV_H_ */
diff --git a/net/can/j1939/main.c b/net/can/j1939/main.c
new file mode 100644
index 000000000000..a93af55df5fd
--- /dev/null
+++ b/net/can/j1939/main.c
@@ -0,0 +1,430 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2010-2011 EIA Electronics,
+// Pieter Beyens <pieter.beyens@eia.be>
+// Copyright (c) 2010-2011 EIA Electronics,
+// Kurt Van Dijck <kurt.van.dijck@eia.be>
+// Copyright (c) 2018 Protonic,
+// Robin van der Gracht <robin@protonic.nl>
+// Copyright (c) 2017-2019 Pengutronix,
+// Marc Kleine-Budde <kernel@pengutronix.de>
+// Copyright (c) 2017-2019 Pengutronix,
+// Oleksij Rempel <kernel@pengutronix.de>
+
+/* Core of can-j1939 that links j1939 to CAN. */
+
+#include <linux/can/can-ml.h>
+#include <linux/can/core.h>
+#include <linux/can/skb.h>
+#include <linux/if_arp.h>
+#include <linux/module.h>
+
+#include "j1939-priv.h"
+
+MODULE_DESCRIPTION("PF_CAN SAE J1939");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("EIA Electronics (Kurt Van Dijck & Pieter Beyens)");
+MODULE_ALIAS("can-proto-" __stringify(CAN_J1939));
+
+/* LOWLEVEL CAN interface */
+
+/* CAN_HDR: #bytes before can_frame data part */
+#define J1939_CAN_HDR (offsetof(struct can_frame, data))
+
+/* lowest layer */
+static void j1939_can_recv(struct sk_buff *iskb, void *data)
+{
+ struct j1939_priv *priv = data;
+ struct sk_buff *skb;
+ struct j1939_sk_buff_cb *skcb, *iskcb;
+ struct can_frame *cf;
+
+ /* make sure we only get Classical CAN frames */
+ if (!can_is_can_skb(iskb))
+ return;
+
+ /* create a copy of the skb
+ * j1939 only delivers the real data bytes,
+ * the header goes into sockaddr.
+ * j1939 may not touch the incoming skb in such way
+ */
+ skb = skb_clone(iskb, GFP_ATOMIC);
+ if (!skb)
+ return;
+
+ j1939_priv_get(priv);
+ can_skb_set_owner(skb, iskb->sk);
+
+ /* get a pointer to the header of the skb
+ * the skb payload (pointer) is moved, so that the next skb_data
+ * returns the actual payload
+ */
+ cf = (void *)skb->data;
+ skb_pull(skb, J1939_CAN_HDR);
+
+ /* fix length, set to dlc, with 8 maximum */
+ skb_trim(skb, min_t(uint8_t, cf->len, 8));
+
+ /* set addr */
+ skcb = j1939_skb_to_cb(skb);
+ memset(skcb, 0, sizeof(*skcb));
+
+ iskcb = j1939_skb_to_cb(iskb);
+ skcb->tskey = iskcb->tskey;
+ skcb->priority = (cf->can_id >> 26) & 0x7;
+ skcb->addr.sa = cf->can_id;
+ skcb->addr.pgn = (cf->can_id >> 8) & J1939_PGN_MAX;
+ /* set default message type */
+ skcb->addr.type = J1939_TP;
+
+ if (!j1939_address_is_valid(skcb->addr.sa)) {
+ netdev_err_once(priv->ndev, "%s: sa is broadcast address, ignoring!\n",
+ __func__);
+ goto done;
+ }
+
+ if (j1939_pgn_is_pdu1(skcb->addr.pgn)) {
+ /* Type 1: with destination address */
+ skcb->addr.da = skcb->addr.pgn;
+ /* normalize pgn: strip dst address */
+ skcb->addr.pgn &= 0x3ff00;
+ } else {
+ /* set broadcast address */
+ skcb->addr.da = J1939_NO_ADDR;
+ }
+
+ /* update localflags */
+ read_lock_bh(&priv->lock);
+ if (j1939_address_is_unicast(skcb->addr.sa) &&
+ priv->ents[skcb->addr.sa].nusers)
+ skcb->flags |= J1939_ECU_LOCAL_SRC;
+ if (j1939_address_is_unicast(skcb->addr.da) &&
+ priv->ents[skcb->addr.da].nusers)
+ skcb->flags |= J1939_ECU_LOCAL_DST;
+ read_unlock_bh(&priv->lock);
+
+ /* deliver into the j1939 stack ... */
+ j1939_ac_recv(priv, skb);
+
+ if (j1939_tp_recv(priv, skb))
+ /* this means the transport layer processed the message */
+ goto done;
+
+ j1939_simple_recv(priv, skb);
+ j1939_sk_recv(priv, skb);
+ done:
+ j1939_priv_put(priv);
+ kfree_skb(skb);
+}
+
+/* NETDEV MANAGEMENT */
+
+/* values for can_rx_(un)register */
+#define J1939_CAN_ID CAN_EFF_FLAG
+#define J1939_CAN_MASK (CAN_EFF_FLAG | CAN_RTR_FLAG)
+
+static DEFINE_MUTEX(j1939_netdev_lock);
+
+static struct j1939_priv *j1939_priv_create(struct net_device *ndev)
+{
+ struct j1939_priv *priv;
+
+ priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+ if (!priv)
+ return NULL;
+
+ rwlock_init(&priv->lock);
+ INIT_LIST_HEAD(&priv->ecus);
+ priv->ndev = ndev;
+ kref_init(&priv->kref);
+ kref_init(&priv->rx_kref);
+ dev_hold(ndev);
+
+ netdev_dbg(priv->ndev, "%s : 0x%p\n", __func__, priv);
+
+ return priv;
+}
+
+static inline void j1939_priv_set(struct net_device *ndev,
+ struct j1939_priv *priv)
+{
+ struct can_ml_priv *can_ml = can_get_ml_priv(ndev);
+
+ can_ml->j1939_priv = priv;
+}
+
+static void __j1939_priv_release(struct kref *kref)
+{
+ struct j1939_priv *priv = container_of(kref, struct j1939_priv, kref);
+ struct net_device *ndev = priv->ndev;
+
+ netdev_dbg(priv->ndev, "%s: 0x%p\n", __func__, priv);
+
+ WARN_ON_ONCE(!list_empty(&priv->active_session_list));
+ WARN_ON_ONCE(!list_empty(&priv->ecus));
+ WARN_ON_ONCE(!list_empty(&priv->j1939_socks));
+
+ dev_put(ndev);
+ kfree(priv);
+}
+
+void j1939_priv_put(struct j1939_priv *priv)
+{
+ kref_put(&priv->kref, __j1939_priv_release);
+}
+
+void j1939_priv_get(struct j1939_priv *priv)
+{
+ kref_get(&priv->kref);
+}
+
+static int j1939_can_rx_register(struct j1939_priv *priv)
+{
+ struct net_device *ndev = priv->ndev;
+ int ret;
+
+ j1939_priv_get(priv);
+ ret = can_rx_register(dev_net(ndev), ndev, J1939_CAN_ID, J1939_CAN_MASK,
+ j1939_can_recv, priv, "j1939", NULL);
+ if (ret < 0) {
+ j1939_priv_put(priv);
+ return ret;
+ }
+
+ return 0;
+}
+
+static void j1939_can_rx_unregister(struct j1939_priv *priv)
+{
+ struct net_device *ndev = priv->ndev;
+
+ can_rx_unregister(dev_net(ndev), ndev, J1939_CAN_ID, J1939_CAN_MASK,
+ j1939_can_recv, priv);
+
+ /* The last reference of priv is dropped by the RCU deferred
+ * j1939_sk_sock_destruct() of the last socket, so we can
+ * safely drop this reference here.
+ */
+ j1939_priv_put(priv);
+}
+
+static void __j1939_rx_release(struct kref *kref)
+ __releases(&j1939_netdev_lock)
+{
+ struct j1939_priv *priv = container_of(kref, struct j1939_priv,
+ rx_kref);
+
+ j1939_can_rx_unregister(priv);
+ j1939_ecu_unmap_all(priv);
+ j1939_priv_set(priv->ndev, NULL);
+ mutex_unlock(&j1939_netdev_lock);
+}
+
+/* get pointer to priv without increasing ref counter */
+static inline struct j1939_priv *j1939_ndev_to_priv(struct net_device *ndev)
+{
+ struct can_ml_priv *can_ml = can_get_ml_priv(ndev);
+
+ return can_ml->j1939_priv;
+}
+
+static struct j1939_priv *j1939_priv_get_by_ndev_locked(struct net_device *ndev)
+{
+ struct j1939_priv *priv;
+
+ lockdep_assert_held(&j1939_netdev_lock);
+
+ priv = j1939_ndev_to_priv(ndev);
+ if (priv)
+ j1939_priv_get(priv);
+
+ return priv;
+}
+
+static struct j1939_priv *j1939_priv_get_by_ndev(struct net_device *ndev)
+{
+ struct j1939_priv *priv;
+
+ mutex_lock(&j1939_netdev_lock);
+ priv = j1939_priv_get_by_ndev_locked(ndev);
+ mutex_unlock(&j1939_netdev_lock);
+
+ return priv;
+}
+
+struct j1939_priv *j1939_netdev_start(struct net_device *ndev)
+{
+ struct j1939_priv *priv, *priv_new;
+ int ret;
+
+ mutex_lock(&j1939_netdev_lock);
+ priv = j1939_priv_get_by_ndev_locked(ndev);
+ if (priv) {
+ kref_get(&priv->rx_kref);
+ mutex_unlock(&j1939_netdev_lock);
+ return priv;
+ }
+ mutex_unlock(&j1939_netdev_lock);
+
+ priv = j1939_priv_create(ndev);
+ if (!priv)
+ return ERR_PTR(-ENOMEM);
+
+ j1939_tp_init(priv);
+ rwlock_init(&priv->j1939_socks_lock);
+ INIT_LIST_HEAD(&priv->j1939_socks);
+
+ mutex_lock(&j1939_netdev_lock);
+ priv_new = j1939_priv_get_by_ndev_locked(ndev);
+ if (priv_new) {
+ /* Someone was faster than us, use their priv and roll
+ * back our's.
+ */
+ kref_get(&priv_new->rx_kref);
+ mutex_unlock(&j1939_netdev_lock);
+ dev_put(ndev);
+ kfree(priv);
+ return priv_new;
+ }
+ j1939_priv_set(ndev, priv);
+
+ ret = j1939_can_rx_register(priv);
+ if (ret < 0)
+ goto out_priv_put;
+
+ mutex_unlock(&j1939_netdev_lock);
+ return priv;
+
+ out_priv_put:
+ j1939_priv_set(ndev, NULL);
+ mutex_unlock(&j1939_netdev_lock);
+
+ dev_put(ndev);
+ kfree(priv);
+
+ return ERR_PTR(ret);
+}
+
+void j1939_netdev_stop(struct j1939_priv *priv)
+{
+ kref_put_mutex(&priv->rx_kref, __j1939_rx_release, &j1939_netdev_lock);
+ j1939_priv_put(priv);
+}
+
+int j1939_send_one(struct j1939_priv *priv, struct sk_buff *skb)
+{
+ int ret, dlc;
+ canid_t canid;
+ struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb);
+ struct can_frame *cf;
+
+ /* apply sanity checks */
+ if (j1939_pgn_is_pdu1(skcb->addr.pgn))
+ skcb->addr.pgn &= J1939_PGN_PDU1_MAX;
+ else
+ skcb->addr.pgn &= J1939_PGN_MAX;
+
+ if (skcb->priority > 7)
+ skcb->priority = 6;
+
+ ret = j1939_ac_fixup(priv, skb);
+ if (unlikely(ret))
+ goto failed;
+ dlc = skb->len;
+
+ /* re-claim the CAN_HDR from the SKB */
+ cf = skb_push(skb, J1939_CAN_HDR);
+
+ /* initialize header structure */
+ memset(cf, 0, J1939_CAN_HDR);
+
+ /* make it a full can frame again */
+ skb_put_zero(skb, 8 - dlc);
+
+ canid = CAN_EFF_FLAG |
+ (skcb->priority << 26) |
+ (skcb->addr.pgn << 8) |
+ skcb->addr.sa;
+ if (j1939_pgn_is_pdu1(skcb->addr.pgn))
+ canid |= skcb->addr.da << 8;
+
+ cf->can_id = canid;
+ cf->len = dlc;
+
+ return can_send(skb, 1);
+
+ failed:
+ kfree_skb(skb);
+ return ret;
+}
+
+static int j1939_netdev_notify(struct notifier_block *nb,
+ unsigned long msg, void *data)
+{
+ struct net_device *ndev = netdev_notifier_info_to_dev(data);
+ struct can_ml_priv *can_ml = can_get_ml_priv(ndev);
+ struct j1939_priv *priv;
+
+ if (!can_ml)
+ goto notify_done;
+
+ priv = j1939_priv_get_by_ndev(ndev);
+ if (!priv)
+ goto notify_done;
+
+ switch (msg) {
+ case NETDEV_DOWN:
+ j1939_cancel_active_session(priv, NULL);
+ j1939_sk_netdev_event_netdown(priv);
+ j1939_ecu_unmap_all(priv);
+ break;
+ case NETDEV_UNREGISTER:
+ j1939_cancel_active_session(priv, NULL);
+ j1939_sk_netdev_event_netdown(priv);
+ j1939_sk_netdev_event_unregister(priv);
+ break;
+ }
+
+ j1939_priv_put(priv);
+
+notify_done:
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block j1939_netdev_notifier = {
+ .notifier_call = j1939_netdev_notify,
+};
+
+/* MODULE interface */
+static __init int j1939_module_init(void)
+{
+ int ret;
+
+ pr_info("can: SAE J1939\n");
+
+ ret = register_netdevice_notifier(&j1939_netdev_notifier);
+ if (ret)
+ goto fail_notifier;
+
+ ret = can_proto_register(&j1939_can_proto);
+ if (ret < 0) {
+ pr_err("can: registration of j1939 protocol failed\n");
+ goto fail_sk;
+ }
+
+ return 0;
+
+ fail_sk:
+ unregister_netdevice_notifier(&j1939_netdev_notifier);
+ fail_notifier:
+ return ret;
+}
+
+static __exit void j1939_module_exit(void)
+{
+ can_proto_unregister(&j1939_can_proto);
+
+ unregister_netdevice_notifier(&j1939_netdev_notifier);
+}
+
+module_init(j1939_module_init);
+module_exit(j1939_module_exit);
diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c
new file mode 100644
index 000000000000..6272326dd614
--- /dev/null
+++ b/net/can/j1939/socket.c
@@ -0,0 +1,1393 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2010-2011 EIA Electronics,
+// Pieter Beyens <pieter.beyens@eia.be>
+// Copyright (c) 2010-2011 EIA Electronics,
+// Kurt Van Dijck <kurt.van.dijck@eia.be>
+// Copyright (c) 2018 Protonic,
+// Robin van der Gracht <robin@protonic.nl>
+// Copyright (c) 2017-2019 Pengutronix,
+// Marc Kleine-Budde <kernel@pengutronix.de>
+// Copyright (c) 2017-2019 Pengutronix,
+// Oleksij Rempel <kernel@pengutronix.de>
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/can/can-ml.h>
+#include <linux/can/core.h>
+#include <linux/can/skb.h>
+#include <linux/errqueue.h>
+#include <linux/if_arp.h>
+
+#include "j1939-priv.h"
+
+#define J1939_MIN_NAMELEN CAN_REQUIRED_SIZE(struct sockaddr_can, can_addr.j1939)
+
+/* conversion function between struct sock::sk_priority from linux and
+ * j1939 priority field
+ */
+static inline priority_t j1939_prio(u32 sk_priority)
+{
+ sk_priority = min(sk_priority, 7U);
+
+ return 7 - sk_priority;
+}
+
+static inline u32 j1939_to_sk_priority(priority_t prio)
+{
+ return 7 - prio;
+}
+
+/* function to see if pgn is to be evaluated */
+static inline bool j1939_pgn_is_valid(pgn_t pgn)
+{
+ return pgn <= J1939_PGN_MAX;
+}
+
+/* test function to avoid non-zero DA placeholder for pdu1 pgn's */
+static inline bool j1939_pgn_is_clean_pdu(pgn_t pgn)
+{
+ if (j1939_pgn_is_pdu1(pgn))
+ return !(pgn & 0xff);
+ else
+ return true;
+}
+
+static inline void j1939_sock_pending_add(struct sock *sk)
+{
+ struct j1939_sock *jsk = j1939_sk(sk);
+
+ atomic_inc(&jsk->skb_pending);
+}
+
+static int j1939_sock_pending_get(struct sock *sk)
+{
+ struct j1939_sock *jsk = j1939_sk(sk);
+
+ return atomic_read(&jsk->skb_pending);
+}
+
+void j1939_sock_pending_del(struct sock *sk)
+{
+ struct j1939_sock *jsk = j1939_sk(sk);
+
+ /* atomic_dec_return returns the new value */
+ if (!atomic_dec_return(&jsk->skb_pending))
+ wake_up(&jsk->waitq); /* no pending SKB's */
+}
+
+static void j1939_jsk_add(struct j1939_priv *priv, struct j1939_sock *jsk)
+{
+ jsk->state |= J1939_SOCK_BOUND;
+ j1939_priv_get(priv);
+
+ write_lock_bh(&priv->j1939_socks_lock);
+ list_add_tail(&jsk->list, &priv->j1939_socks);
+ write_unlock_bh(&priv->j1939_socks_lock);
+}
+
+static void j1939_jsk_del(struct j1939_priv *priv, struct j1939_sock *jsk)
+{
+ write_lock_bh(&priv->j1939_socks_lock);
+ list_del_init(&jsk->list);
+ write_unlock_bh(&priv->j1939_socks_lock);
+
+ j1939_priv_put(priv);
+ jsk->state &= ~J1939_SOCK_BOUND;
+}
+
+static bool j1939_sk_queue_session(struct j1939_session *session)
+{
+ struct j1939_sock *jsk = j1939_sk(session->sk);
+ bool empty;
+
+ spin_lock_bh(&jsk->sk_session_queue_lock);
+ empty = list_empty(&jsk->sk_session_queue);
+ j1939_session_get(session);
+ list_add_tail(&session->sk_session_queue_entry, &jsk->sk_session_queue);
+ spin_unlock_bh(&jsk->sk_session_queue_lock);
+ j1939_sock_pending_add(&jsk->sk);
+
+ return empty;
+}
+
+static struct
+j1939_session *j1939_sk_get_incomplete_session(struct j1939_sock *jsk)
+{
+ struct j1939_session *session = NULL;
+
+ spin_lock_bh(&jsk->sk_session_queue_lock);
+ if (!list_empty(&jsk->sk_session_queue)) {
+ session = list_last_entry(&jsk->sk_session_queue,
+ struct j1939_session,
+ sk_session_queue_entry);
+ if (session->total_queued_size == session->total_message_size)
+ session = NULL;
+ else
+ j1939_session_get(session);
+ }
+ spin_unlock_bh(&jsk->sk_session_queue_lock);
+
+ return session;
+}
+
+static void j1939_sk_queue_drop_all(struct j1939_priv *priv,
+ struct j1939_sock *jsk, int err)
+{
+ struct j1939_session *session, *tmp;
+
+ netdev_dbg(priv->ndev, "%s: err: %i\n", __func__, err);
+ spin_lock_bh(&jsk->sk_session_queue_lock);
+ list_for_each_entry_safe(session, tmp, &jsk->sk_session_queue,
+ sk_session_queue_entry) {
+ list_del_init(&session->sk_session_queue_entry);
+ session->err = err;
+ j1939_session_put(session);
+ }
+ spin_unlock_bh(&jsk->sk_session_queue_lock);
+}
+
+static void j1939_sk_queue_activate_next_locked(struct j1939_session *session)
+{
+ struct j1939_sock *jsk;
+ struct j1939_session *first;
+ int err;
+
+ /* RX-Session don't have a socket (yet) */
+ if (!session->sk)
+ return;
+
+ jsk = j1939_sk(session->sk);
+ lockdep_assert_held(&jsk->sk_session_queue_lock);
+
+ err = session->err;
+
+ first = list_first_entry_or_null(&jsk->sk_session_queue,
+ struct j1939_session,
+ sk_session_queue_entry);
+
+ /* Some else has already activated the next session */
+ if (first != session)
+ return;
+
+activate_next:
+ list_del_init(&first->sk_session_queue_entry);
+ j1939_session_put(first);
+ first = list_first_entry_or_null(&jsk->sk_session_queue,
+ struct j1939_session,
+ sk_session_queue_entry);
+ if (!first)
+ return;
+
+ if (j1939_session_activate(first)) {
+ netdev_warn_once(first->priv->ndev,
+ "%s: 0x%p: Identical session is already activated.\n",
+ __func__, first);
+ first->err = -EBUSY;
+ goto activate_next;
+ } else {
+ /* Give receiver some time (arbitrary chosen) to recover */
+ int time_ms = 0;
+
+ if (err)
+ time_ms = 10 + get_random_u32_below(16);
+
+ j1939_tp_schedule_txtimer(first, time_ms);
+ }
+}
+
+void j1939_sk_queue_activate_next(struct j1939_session *session)
+{
+ struct j1939_sock *jsk;
+
+ if (!session->sk)
+ return;
+
+ jsk = j1939_sk(session->sk);
+
+ spin_lock_bh(&jsk->sk_session_queue_lock);
+ j1939_sk_queue_activate_next_locked(session);
+ spin_unlock_bh(&jsk->sk_session_queue_lock);
+}
+
+static bool j1939_sk_match_dst(struct j1939_sock *jsk,
+ const struct j1939_sk_buff_cb *skcb)
+{
+ if ((jsk->state & J1939_SOCK_PROMISC))
+ return true;
+
+ /* Destination address filter */
+ if (jsk->addr.src_name && skcb->addr.dst_name) {
+ if (jsk->addr.src_name != skcb->addr.dst_name)
+ return false;
+ } else {
+ /* receive (all sockets) if
+ * - all packages that match our bind() address
+ * - all broadcast on a socket if SO_BROADCAST
+ * is set
+ */
+ if (j1939_address_is_unicast(skcb->addr.da)) {
+ if (jsk->addr.sa != skcb->addr.da)
+ return false;
+ } else if (!sock_flag(&jsk->sk, SOCK_BROADCAST)) {
+ /* receiving broadcast without SO_BROADCAST
+ * flag is not allowed
+ */
+ return false;
+ }
+ }
+
+ /* Source address filter */
+ if (jsk->state & J1939_SOCK_CONNECTED) {
+ /* receive (all sockets) if
+ * - all packages that match our connect() name or address
+ */
+ if (jsk->addr.dst_name && skcb->addr.src_name) {
+ if (jsk->addr.dst_name != skcb->addr.src_name)
+ return false;
+ } else {
+ if (jsk->addr.da != skcb->addr.sa)
+ return false;
+ }
+ }
+
+ /* PGN filter */
+ if (j1939_pgn_is_valid(jsk->pgn_rx_filter) &&
+ jsk->pgn_rx_filter != skcb->addr.pgn)
+ return false;
+
+ return true;
+}
+
+/* matches skb control buffer (addr) with a j1939 filter */
+static bool j1939_sk_match_filter(struct j1939_sock *jsk,
+ const struct j1939_sk_buff_cb *skcb)
+{
+ const struct j1939_filter *f;
+ int nfilter;
+
+ spin_lock_bh(&jsk->filters_lock);
+
+ f = jsk->filters;
+ nfilter = jsk->nfilters;
+
+ if (!nfilter)
+ /* receive all when no filters are assigned */
+ goto filter_match_found;
+
+ for (; nfilter; ++f, --nfilter) {
+ if ((skcb->addr.pgn & f->pgn_mask) != f->pgn)
+ continue;
+ if ((skcb->addr.sa & f->addr_mask) != f->addr)
+ continue;
+ if ((skcb->addr.src_name & f->name_mask) != f->name)
+ continue;
+ goto filter_match_found;
+ }
+
+ spin_unlock_bh(&jsk->filters_lock);
+ return false;
+
+filter_match_found:
+ spin_unlock_bh(&jsk->filters_lock);
+ return true;
+}
+
+static bool j1939_sk_recv_match_one(struct j1939_sock *jsk,
+ const struct j1939_sk_buff_cb *skcb)
+{
+ if (!(jsk->state & J1939_SOCK_BOUND))
+ return false;
+
+ if (!j1939_sk_match_dst(jsk, skcb))
+ return false;
+
+ if (!j1939_sk_match_filter(jsk, skcb))
+ return false;
+
+ return true;
+}
+
+static void j1939_sk_recv_one(struct j1939_sock *jsk, struct sk_buff *oskb)
+{
+ const struct j1939_sk_buff_cb *oskcb = j1939_skb_to_cb(oskb);
+ struct j1939_sk_buff_cb *skcb;
+ enum skb_drop_reason reason;
+ struct sk_buff *skb;
+
+ if (oskb->sk == &jsk->sk)
+ return;
+
+ if (!j1939_sk_recv_match_one(jsk, oskcb))
+ return;
+
+ skb = skb_clone(oskb, GFP_ATOMIC);
+ if (!skb) {
+ pr_warn("skb clone failed\n");
+ return;
+ }
+ can_skb_set_owner(skb, oskb->sk);
+
+ skcb = j1939_skb_to_cb(skb);
+ skcb->msg_flags &= ~(MSG_DONTROUTE);
+ if (skb->sk)
+ skcb->msg_flags |= MSG_DONTROUTE;
+
+ if (sock_queue_rcv_skb_reason(&jsk->sk, skb, &reason) < 0)
+ sk_skb_reason_drop(&jsk->sk, skb, reason);
+}
+
+bool j1939_sk_recv_match(struct j1939_priv *priv, struct j1939_sk_buff_cb *skcb)
+{
+ struct j1939_sock *jsk;
+ bool match = false;
+
+ read_lock_bh(&priv->j1939_socks_lock);
+ list_for_each_entry(jsk, &priv->j1939_socks, list) {
+ match = j1939_sk_recv_match_one(jsk, skcb);
+ if (match)
+ break;
+ }
+ read_unlock_bh(&priv->j1939_socks_lock);
+
+ return match;
+}
+
+void j1939_sk_recv(struct j1939_priv *priv, struct sk_buff *skb)
+{
+ struct j1939_sock *jsk;
+
+ read_lock_bh(&priv->j1939_socks_lock);
+ list_for_each_entry(jsk, &priv->j1939_socks, list) {
+ j1939_sk_recv_one(jsk, skb);
+ }
+ read_unlock_bh(&priv->j1939_socks_lock);
+}
+
+static void j1939_sk_sock_destruct(struct sock *sk)
+{
+ struct j1939_sock *jsk = j1939_sk(sk);
+
+ /* This function will be called by the generic networking code, when
+ * the socket is ultimately closed (sk->sk_destruct).
+ *
+ * The race between
+ * - processing a received CAN frame
+ * (can_receive -> j1939_can_recv)
+ * and accessing j1939_priv
+ * ... and ...
+ * - closing a socket
+ * (j1939_can_rx_unregister -> can_rx_unregister)
+ * and calling the final j1939_priv_put()
+ *
+ * is avoided by calling the final j1939_priv_put() from this
+ * RCU deferred cleanup call.
+ */
+ if (jsk->priv) {
+ j1939_priv_put(jsk->priv);
+ jsk->priv = NULL;
+ }
+
+ /* call generic CAN sock destruct */
+ can_sock_destruct(sk);
+}
+
+static int j1939_sk_init(struct sock *sk)
+{
+ struct j1939_sock *jsk = j1939_sk(sk);
+
+ /* Ensure that "sk" is first member in "struct j1939_sock", so that we
+ * can skip it during memset().
+ */
+ BUILD_BUG_ON(offsetof(struct j1939_sock, sk) != 0);
+ memset((void *)jsk + sizeof(jsk->sk), 0x0,
+ sizeof(*jsk) - sizeof(jsk->sk));
+
+ INIT_LIST_HEAD(&jsk->list);
+ init_waitqueue_head(&jsk->waitq);
+ jsk->sk.sk_priority = j1939_to_sk_priority(6);
+ jsk->sk.sk_reuse = 1; /* per default */
+ jsk->addr.sa = J1939_NO_ADDR;
+ jsk->addr.da = J1939_NO_ADDR;
+ jsk->addr.pgn = J1939_NO_PGN;
+ jsk->pgn_rx_filter = J1939_NO_PGN;
+ atomic_set(&jsk->skb_pending, 0);
+ spin_lock_init(&jsk->sk_session_queue_lock);
+ INIT_LIST_HEAD(&jsk->sk_session_queue);
+ spin_lock_init(&jsk->filters_lock);
+
+ /* j1939_sk_sock_destruct() depends on SOCK_RCU_FREE flag */
+ sock_set_flag(sk, SOCK_RCU_FREE);
+ sk->sk_destruct = j1939_sk_sock_destruct;
+ sk->sk_protocol = CAN_J1939;
+
+ return 0;
+}
+
+static int j1939_sk_sanity_check(struct sockaddr_can *addr, int len)
+{
+ if (!addr)
+ return -EDESTADDRREQ;
+ if (len < J1939_MIN_NAMELEN)
+ return -EINVAL;
+ if (addr->can_family != AF_CAN)
+ return -EINVAL;
+ if (!addr->can_ifindex)
+ return -ENODEV;
+ if (j1939_pgn_is_valid(addr->can_addr.j1939.pgn) &&
+ !j1939_pgn_is_clean_pdu(addr->can_addr.j1939.pgn))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int j1939_sk_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int len)
+{
+ struct sockaddr_can *addr = (struct sockaddr_can *)uaddr;
+ struct j1939_sock *jsk = j1939_sk(sock->sk);
+ struct j1939_priv *priv;
+ struct sock *sk;
+ struct net *net;
+ int ret = 0;
+
+ ret = j1939_sk_sanity_check(addr, len);
+ if (ret)
+ return ret;
+
+ lock_sock(sock->sk);
+
+ priv = jsk->priv;
+ sk = sock->sk;
+ net = sock_net(sk);
+
+ /* Already bound to an interface? */
+ if (jsk->state & J1939_SOCK_BOUND) {
+ /* A re-bind() to a different interface is not
+ * supported.
+ */
+ if (jsk->ifindex != addr->can_ifindex) {
+ ret = -EINVAL;
+ goto out_release_sock;
+ }
+
+ /* drop old references */
+ j1939_jsk_del(priv, jsk);
+ j1939_local_ecu_put(priv, jsk->addr.src_name, jsk->addr.sa);
+ } else {
+ struct can_ml_priv *can_ml;
+ struct net_device *ndev;
+
+ ndev = dev_get_by_index(net, addr->can_ifindex);
+ if (!ndev) {
+ ret = -ENODEV;
+ goto out_release_sock;
+ }
+
+ can_ml = can_get_ml_priv(ndev);
+ if (!can_ml) {
+ dev_put(ndev);
+ ret = -ENODEV;
+ goto out_release_sock;
+ }
+
+ if (!(ndev->flags & IFF_UP)) {
+ dev_put(ndev);
+ ret = -ENETDOWN;
+ goto out_release_sock;
+ }
+
+ priv = j1939_netdev_start(ndev);
+ dev_put(ndev);
+ if (IS_ERR(priv)) {
+ ret = PTR_ERR(priv);
+ goto out_release_sock;
+ }
+
+ jsk->ifindex = addr->can_ifindex;
+
+ /* the corresponding j1939_priv_put() is called via
+ * sk->sk_destruct, which points to j1939_sk_sock_destruct()
+ */
+ j1939_priv_get(priv);
+ jsk->priv = priv;
+ }
+
+ /* set default transmit pgn */
+ if (j1939_pgn_is_valid(addr->can_addr.j1939.pgn))
+ jsk->pgn_rx_filter = addr->can_addr.j1939.pgn;
+ jsk->addr.src_name = addr->can_addr.j1939.name;
+ jsk->addr.sa = addr->can_addr.j1939.addr;
+
+ /* get new references */
+ ret = j1939_local_ecu_get(priv, jsk->addr.src_name, jsk->addr.sa);
+ if (ret) {
+ j1939_netdev_stop(priv);
+ jsk->priv = NULL;
+ synchronize_rcu();
+ j1939_priv_put(priv);
+ goto out_release_sock;
+ }
+
+ j1939_jsk_add(priv, jsk);
+
+ out_release_sock: /* fall through */
+ release_sock(sock->sk);
+
+ return ret;
+}
+
+static int j1939_sk_connect(struct socket *sock, struct sockaddr_unsized *uaddr,
+ int len, int flags)
+{
+ struct sockaddr_can *addr = (struct sockaddr_can *)uaddr;
+ struct j1939_sock *jsk = j1939_sk(sock->sk);
+ int ret = 0;
+
+ ret = j1939_sk_sanity_check(addr, len);
+ if (ret)
+ return ret;
+
+ lock_sock(sock->sk);
+
+ /* bind() before connect() is mandatory */
+ if (!(jsk->state & J1939_SOCK_BOUND)) {
+ ret = -EINVAL;
+ goto out_release_sock;
+ }
+
+ /* A connect() to a different interface is not supported. */
+ if (jsk->ifindex != addr->can_ifindex) {
+ ret = -EINVAL;
+ goto out_release_sock;
+ }
+
+ if (!addr->can_addr.j1939.name &&
+ addr->can_addr.j1939.addr == J1939_NO_ADDR &&
+ !sock_flag(&jsk->sk, SOCK_BROADCAST)) {
+ /* broadcast, but SO_BROADCAST not set */
+ ret = -EACCES;
+ goto out_release_sock;
+ }
+
+ jsk->addr.dst_name = addr->can_addr.j1939.name;
+ jsk->addr.da = addr->can_addr.j1939.addr;
+
+ if (j1939_pgn_is_valid(addr->can_addr.j1939.pgn))
+ jsk->addr.pgn = addr->can_addr.j1939.pgn;
+
+ jsk->state |= J1939_SOCK_CONNECTED;
+
+ out_release_sock: /* fall through */
+ release_sock(sock->sk);
+
+ return ret;
+}
+
+static void j1939_sk_sock2sockaddr_can(struct sockaddr_can *addr,
+ const struct j1939_sock *jsk, int peer)
+{
+ /* There are two holes (2 bytes and 3 bytes) to clear to avoid
+ * leaking kernel information to user space.
+ */
+ memset(addr, 0, J1939_MIN_NAMELEN);
+
+ addr->can_family = AF_CAN;
+ addr->can_ifindex = jsk->ifindex;
+ addr->can_addr.j1939.pgn = jsk->addr.pgn;
+ if (peer) {
+ addr->can_addr.j1939.name = jsk->addr.dst_name;
+ addr->can_addr.j1939.addr = jsk->addr.da;
+ } else {
+ addr->can_addr.j1939.name = jsk->addr.src_name;
+ addr->can_addr.j1939.addr = jsk->addr.sa;
+ }
+}
+
+static int j1939_sk_getname(struct socket *sock, struct sockaddr *uaddr,
+ int peer)
+{
+ struct sockaddr_can *addr = (struct sockaddr_can *)uaddr;
+ struct sock *sk = sock->sk;
+ struct j1939_sock *jsk = j1939_sk(sk);
+ int ret = 0;
+
+ lock_sock(sk);
+
+ if (peer && !(jsk->state & J1939_SOCK_CONNECTED)) {
+ ret = -EADDRNOTAVAIL;
+ goto failure;
+ }
+
+ j1939_sk_sock2sockaddr_can(addr, jsk, peer);
+ ret = J1939_MIN_NAMELEN;
+
+ failure:
+ release_sock(sk);
+
+ return ret;
+}
+
+static int j1939_sk_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+ struct j1939_sock *jsk;
+
+ if (!sk)
+ return 0;
+
+ lock_sock(sk);
+ jsk = j1939_sk(sk);
+
+ if (jsk->state & J1939_SOCK_BOUND) {
+ struct j1939_priv *priv = jsk->priv;
+
+ if (wait_event_interruptible(jsk->waitq,
+ !j1939_sock_pending_get(&jsk->sk))) {
+ j1939_cancel_active_session(priv, sk);
+ j1939_sk_queue_drop_all(priv, jsk, ESHUTDOWN);
+ }
+
+ j1939_jsk_del(priv, jsk);
+
+ j1939_local_ecu_put(priv, jsk->addr.src_name,
+ jsk->addr.sa);
+
+ j1939_netdev_stop(priv);
+ }
+
+ kfree(jsk->filters);
+ sock_orphan(sk);
+ sock->sk = NULL;
+
+ release_sock(sk);
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+ sock_put(sk);
+
+ return 0;
+}
+
+static int j1939_sk_setsockopt_flag(struct j1939_sock *jsk, sockptr_t optval,
+ unsigned int optlen, int flag)
+{
+ int tmp;
+
+ if (optlen != sizeof(tmp))
+ return -EINVAL;
+ if (copy_from_sockptr(&tmp, optval, optlen))
+ return -EFAULT;
+ lock_sock(&jsk->sk);
+ if (tmp)
+ jsk->state |= flag;
+ else
+ jsk->state &= ~flag;
+ release_sock(&jsk->sk);
+ return tmp;
+}
+
+static int j1939_sk_setsockopt(struct socket *sock, int level, int optname,
+ sockptr_t optval, unsigned int optlen)
+{
+ struct sock *sk = sock->sk;
+ struct j1939_sock *jsk = j1939_sk(sk);
+ int tmp, count = 0, ret = 0;
+ struct j1939_filter *filters = NULL, *ofilters;
+
+ if (level != SOL_CAN_J1939)
+ return -EINVAL;
+
+ switch (optname) {
+ case SO_J1939_FILTER:
+ if (!sockptr_is_null(optval) && optlen != 0) {
+ struct j1939_filter *f;
+ int c;
+
+ if (optlen % sizeof(*filters) != 0)
+ return -EINVAL;
+
+ if (optlen > J1939_FILTER_MAX *
+ sizeof(struct j1939_filter))
+ return -EINVAL;
+
+ count = optlen / sizeof(*filters);
+ filters = memdup_sockptr(optval, optlen);
+ if (IS_ERR(filters))
+ return PTR_ERR(filters);
+
+ for (f = filters, c = count; c; f++, c--) {
+ f->name &= f->name_mask;
+ f->pgn &= f->pgn_mask;
+ f->addr &= f->addr_mask;
+ }
+ }
+
+ lock_sock(&jsk->sk);
+ spin_lock_bh(&jsk->filters_lock);
+ ofilters = jsk->filters;
+ jsk->filters = filters;
+ jsk->nfilters = count;
+ spin_unlock_bh(&jsk->filters_lock);
+ release_sock(&jsk->sk);
+ kfree(ofilters);
+ return 0;
+ case SO_J1939_PROMISC:
+ return j1939_sk_setsockopt_flag(jsk, optval, optlen,
+ J1939_SOCK_PROMISC);
+ case SO_J1939_ERRQUEUE:
+ ret = j1939_sk_setsockopt_flag(jsk, optval, optlen,
+ J1939_SOCK_ERRQUEUE);
+ if (ret < 0)
+ return ret;
+
+ if (!(jsk->state & J1939_SOCK_ERRQUEUE))
+ skb_queue_purge(&sk->sk_error_queue);
+ return ret;
+ case SO_J1939_SEND_PRIO:
+ if (optlen != sizeof(tmp))
+ return -EINVAL;
+ if (copy_from_sockptr(&tmp, optval, optlen))
+ return -EFAULT;
+ if (tmp < 0 || tmp > 7)
+ return -EDOM;
+ if (tmp < 2 && !capable(CAP_NET_ADMIN))
+ return -EPERM;
+ lock_sock(&jsk->sk);
+ jsk->sk.sk_priority = j1939_to_sk_priority(tmp);
+ release_sock(&jsk->sk);
+ return 0;
+ default:
+ return -ENOPROTOOPT;
+ }
+}
+
+static int j1939_sk_getsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ struct sock *sk = sock->sk;
+ struct j1939_sock *jsk = j1939_sk(sk);
+ int ret, ulen;
+ /* set defaults for using 'int' properties */
+ int tmp = 0;
+ int len = sizeof(tmp);
+ void *val = &tmp;
+
+ if (level != SOL_CAN_J1939)
+ return -EINVAL;
+ if (get_user(ulen, optlen))
+ return -EFAULT;
+ if (ulen < 0)
+ return -EINVAL;
+
+ lock_sock(&jsk->sk);
+ switch (optname) {
+ case SO_J1939_PROMISC:
+ tmp = (jsk->state & J1939_SOCK_PROMISC) ? 1 : 0;
+ break;
+ case SO_J1939_ERRQUEUE:
+ tmp = (jsk->state & J1939_SOCK_ERRQUEUE) ? 1 : 0;
+ break;
+ case SO_J1939_SEND_PRIO:
+ tmp = j1939_prio(jsk->sk.sk_priority);
+ break;
+ default:
+ ret = -ENOPROTOOPT;
+ goto no_copy;
+ }
+
+ /* copy to user, based on 'len' & 'val'
+ * but most sockopt's are 'int' properties, and have 'len' & 'val'
+ * left unchanged, but instead modified 'tmp'
+ */
+ if (len > ulen)
+ ret = -EFAULT;
+ else if (put_user(len, optlen))
+ ret = -EFAULT;
+ else if (copy_to_user(optval, val, len))
+ ret = -EFAULT;
+ else
+ ret = 0;
+ no_copy:
+ release_sock(&jsk->sk);
+ return ret;
+}
+
+static int j1939_sk_recvmsg(struct socket *sock, struct msghdr *msg,
+ size_t size, int flags)
+{
+ struct sock *sk = sock->sk;
+ struct sk_buff *skb;
+ struct j1939_sk_buff_cb *skcb;
+ int ret = 0;
+
+ if (flags & ~(MSG_DONTWAIT | MSG_ERRQUEUE | MSG_CMSG_COMPAT))
+ return -EINVAL;
+
+ if (flags & MSG_ERRQUEUE)
+ return sock_recv_errqueue(sock->sk, msg, size, SOL_CAN_J1939,
+ SCM_J1939_ERRQUEUE);
+
+ skb = skb_recv_datagram(sk, flags, &ret);
+ if (!skb)
+ return ret;
+
+ if (size < skb->len)
+ msg->msg_flags |= MSG_TRUNC;
+ else
+ size = skb->len;
+
+ ret = memcpy_to_msg(msg, skb->data, size);
+ if (ret < 0) {
+ skb_free_datagram(sk, skb);
+ return ret;
+ }
+
+ skcb = j1939_skb_to_cb(skb);
+ if (j1939_address_is_valid(skcb->addr.da))
+ put_cmsg(msg, SOL_CAN_J1939, SCM_J1939_DEST_ADDR,
+ sizeof(skcb->addr.da), &skcb->addr.da);
+
+ if (skcb->addr.dst_name)
+ put_cmsg(msg, SOL_CAN_J1939, SCM_J1939_DEST_NAME,
+ sizeof(skcb->addr.dst_name), &skcb->addr.dst_name);
+
+ put_cmsg(msg, SOL_CAN_J1939, SCM_J1939_PRIO,
+ sizeof(skcb->priority), &skcb->priority);
+
+ if (msg->msg_name) {
+ struct sockaddr_can *paddr = msg->msg_name;
+
+ msg->msg_namelen = J1939_MIN_NAMELEN;
+ memset(msg->msg_name, 0, msg->msg_namelen);
+ paddr->can_family = AF_CAN;
+ paddr->can_ifindex = skb->skb_iif;
+ paddr->can_addr.j1939.name = skcb->addr.src_name;
+ paddr->can_addr.j1939.addr = skcb->addr.sa;
+ paddr->can_addr.j1939.pgn = skcb->addr.pgn;
+ }
+
+ sock_recv_cmsgs(msg, sk, skb);
+ msg->msg_flags |= skcb->msg_flags;
+ skb_free_datagram(sk, skb);
+
+ return size;
+}
+
+static struct sk_buff *j1939_sk_alloc_skb(struct net_device *ndev,
+ struct sock *sk,
+ struct msghdr *msg, size_t size,
+ int *errcode)
+{
+ struct j1939_sock *jsk = j1939_sk(sk);
+ struct j1939_sk_buff_cb *skcb;
+ struct sk_buff *skb;
+ int ret;
+
+ skb = sock_alloc_send_skb(sk,
+ size +
+ sizeof(struct can_frame) -
+ sizeof(((struct can_frame *)NULL)->data) +
+ sizeof(struct can_skb_priv),
+ msg->msg_flags & MSG_DONTWAIT, &ret);
+ if (!skb)
+ goto failure;
+
+ can_skb_reserve(skb);
+ can_skb_prv(skb)->ifindex = ndev->ifindex;
+ can_skb_prv(skb)->skbcnt = 0;
+ skb_reserve(skb, offsetof(struct can_frame, data));
+
+ ret = memcpy_from_msg(skb_put(skb, size), msg, size);
+ if (ret < 0)
+ goto free_skb;
+
+ skb->dev = ndev;
+
+ skcb = j1939_skb_to_cb(skb);
+ memset(skcb, 0, sizeof(*skcb));
+ skcb->addr = jsk->addr;
+ skcb->priority = j1939_prio(READ_ONCE(sk->sk_priority));
+
+ if (msg->msg_name) {
+ struct sockaddr_can *addr = msg->msg_name;
+
+ if (addr->can_addr.j1939.name ||
+ addr->can_addr.j1939.addr != J1939_NO_ADDR) {
+ skcb->addr.dst_name = addr->can_addr.j1939.name;
+ skcb->addr.da = addr->can_addr.j1939.addr;
+ }
+ if (j1939_pgn_is_valid(addr->can_addr.j1939.pgn))
+ skcb->addr.pgn = addr->can_addr.j1939.pgn;
+ }
+
+ *errcode = ret;
+ return skb;
+
+free_skb:
+ kfree_skb(skb);
+failure:
+ *errcode = ret;
+ return NULL;
+}
+
+static size_t j1939_sk_opt_stats_get_size(enum j1939_sk_errqueue_type type)
+{
+ switch (type) {
+ case J1939_ERRQUEUE_RX_RTS:
+ return
+ nla_total_size(sizeof(u32)) + /* J1939_NLA_TOTAL_SIZE */
+ nla_total_size(sizeof(u32)) + /* J1939_NLA_PGN */
+ nla_total_size(sizeof(u64)) + /* J1939_NLA_SRC_NAME */
+ nla_total_size(sizeof(u64)) + /* J1939_NLA_DEST_NAME */
+ nla_total_size(sizeof(u8)) + /* J1939_NLA_SRC_ADDR */
+ nla_total_size(sizeof(u8)) + /* J1939_NLA_DEST_ADDR */
+ 0;
+ default:
+ return
+ nla_total_size(sizeof(u32)) + /* J1939_NLA_BYTES_ACKED */
+ 0;
+ }
+}
+
+static struct sk_buff *
+j1939_sk_get_timestamping_opt_stats(struct j1939_session *session,
+ enum j1939_sk_errqueue_type type)
+{
+ struct sk_buff *stats;
+ u32 size;
+
+ stats = alloc_skb(j1939_sk_opt_stats_get_size(type), GFP_ATOMIC);
+ if (!stats)
+ return NULL;
+
+ if (session->skcb.addr.type == J1939_SIMPLE)
+ size = session->total_message_size;
+ else
+ size = min(session->pkt.tx_acked * 7,
+ session->total_message_size);
+
+ switch (type) {
+ case J1939_ERRQUEUE_RX_RTS:
+ nla_put_u32(stats, J1939_NLA_TOTAL_SIZE,
+ session->total_message_size);
+ nla_put_u32(stats, J1939_NLA_PGN,
+ session->skcb.addr.pgn);
+ nla_put_u64_64bit(stats, J1939_NLA_SRC_NAME,
+ session->skcb.addr.src_name, J1939_NLA_PAD);
+ nla_put_u64_64bit(stats, J1939_NLA_DEST_NAME,
+ session->skcb.addr.dst_name, J1939_NLA_PAD);
+ nla_put_u8(stats, J1939_NLA_SRC_ADDR,
+ session->skcb.addr.sa);
+ nla_put_u8(stats, J1939_NLA_DEST_ADDR,
+ session->skcb.addr.da);
+ break;
+ default:
+ nla_put_u32(stats, J1939_NLA_BYTES_ACKED, size);
+ }
+
+ return stats;
+}
+
+static void __j1939_sk_errqueue(struct j1939_session *session, struct sock *sk,
+ enum j1939_sk_errqueue_type type)
+{
+ struct j1939_priv *priv = session->priv;
+ struct j1939_sock *jsk;
+ struct sock_exterr_skb *serr;
+ struct sk_buff *skb;
+ char *state = "UNK";
+ u32 tsflags;
+ int err;
+
+ jsk = j1939_sk(sk);
+
+ if (!(jsk->state & J1939_SOCK_ERRQUEUE))
+ return;
+
+ tsflags = READ_ONCE(sk->sk_tsflags);
+ switch (type) {
+ case J1939_ERRQUEUE_TX_ACK:
+ if (!(tsflags & SOF_TIMESTAMPING_TX_ACK))
+ return;
+ break;
+ case J1939_ERRQUEUE_TX_SCHED:
+ if (!(tsflags & SOF_TIMESTAMPING_TX_SCHED))
+ return;
+ break;
+ case J1939_ERRQUEUE_TX_ABORT:
+ break;
+ case J1939_ERRQUEUE_RX_RTS:
+ fallthrough;
+ case J1939_ERRQUEUE_RX_DPO:
+ fallthrough;
+ case J1939_ERRQUEUE_RX_ABORT:
+ if (!(tsflags & SOF_TIMESTAMPING_RX_SOFTWARE))
+ return;
+ break;
+ default:
+ netdev_err(priv->ndev, "Unknown errqueue type %i\n", type);
+ }
+
+ skb = j1939_sk_get_timestamping_opt_stats(session, type);
+ if (!skb)
+ return;
+
+ skb->tstamp = ktime_get_real();
+
+ BUILD_BUG_ON(sizeof(struct sock_exterr_skb) > sizeof(skb->cb));
+
+ serr = SKB_EXT_ERR(skb);
+ memset(serr, 0, sizeof(*serr));
+ switch (type) {
+ case J1939_ERRQUEUE_TX_ACK:
+ serr->ee.ee_errno = ENOMSG;
+ serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING;
+ serr->ee.ee_info = SCM_TSTAMP_ACK;
+ state = "TX ACK";
+ break;
+ case J1939_ERRQUEUE_TX_SCHED:
+ serr->ee.ee_errno = ENOMSG;
+ serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING;
+ serr->ee.ee_info = SCM_TSTAMP_SCHED;
+ state = "TX SCH";
+ break;
+ case J1939_ERRQUEUE_TX_ABORT:
+ serr->ee.ee_errno = session->err;
+ serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL;
+ serr->ee.ee_info = J1939_EE_INFO_TX_ABORT;
+ state = "TX ABT";
+ break;
+ case J1939_ERRQUEUE_RX_RTS:
+ serr->ee.ee_errno = ENOMSG;
+ serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL;
+ serr->ee.ee_info = J1939_EE_INFO_RX_RTS;
+ state = "RX RTS";
+ break;
+ case J1939_ERRQUEUE_RX_DPO:
+ serr->ee.ee_errno = ENOMSG;
+ serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL;
+ serr->ee.ee_info = J1939_EE_INFO_RX_DPO;
+ state = "RX DPO";
+ break;
+ case J1939_ERRQUEUE_RX_ABORT:
+ serr->ee.ee_errno = session->err;
+ serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL;
+ serr->ee.ee_info = J1939_EE_INFO_RX_ABORT;
+ state = "RX ABT";
+ break;
+ }
+
+ serr->opt_stats = true;
+ if (tsflags & SOF_TIMESTAMPING_OPT_ID)
+ serr->ee.ee_data = session->tskey;
+
+ netdev_dbg(session->priv->ndev, "%s: 0x%p tskey: %i, state: %s\n",
+ __func__, session, session->tskey, state);
+ err = sock_queue_err_skb(sk, skb);
+
+ if (err)
+ kfree_skb(skb);
+};
+
+void j1939_sk_errqueue(struct j1939_session *session,
+ enum j1939_sk_errqueue_type type)
+{
+ struct j1939_priv *priv = session->priv;
+ struct j1939_sock *jsk;
+
+ if (session->sk) {
+ /* send TX notifications to the socket of origin */
+ __j1939_sk_errqueue(session, session->sk, type);
+ return;
+ }
+
+ /* spread RX notifications to all sockets subscribed to this session */
+ read_lock_bh(&priv->j1939_socks_lock);
+ list_for_each_entry(jsk, &priv->j1939_socks, list) {
+ if (j1939_sk_recv_match_one(jsk, &session->skcb))
+ __j1939_sk_errqueue(session, &jsk->sk, type);
+ }
+ read_unlock_bh(&priv->j1939_socks_lock);
+};
+
+void j1939_sk_send_loop_abort(struct sock *sk, int err)
+{
+ struct j1939_sock *jsk = j1939_sk(sk);
+
+ if (jsk->state & J1939_SOCK_ERRQUEUE)
+ return;
+
+ sk->sk_err = err;
+
+ sk_error_report(sk);
+}
+
+static int j1939_sk_send_loop(struct j1939_priv *priv, struct sock *sk,
+ struct msghdr *msg, size_t size)
+
+{
+ struct j1939_sock *jsk = j1939_sk(sk);
+ struct j1939_session *session = j1939_sk_get_incomplete_session(jsk);
+ struct sk_buff *skb;
+ size_t segment_size, todo_size;
+ int ret = 0;
+
+ if (session &&
+ session->total_message_size != session->total_queued_size + size) {
+ j1939_session_put(session);
+ return -EIO;
+ }
+
+ todo_size = size;
+
+ do {
+ struct j1939_sk_buff_cb *skcb;
+
+ segment_size = min_t(size_t, J1939_MAX_TP_PACKET_SIZE,
+ todo_size);
+
+ /* Allocate skb for one segment */
+ skb = j1939_sk_alloc_skb(priv->ndev, sk, msg, segment_size,
+ &ret);
+ if (ret)
+ break;
+
+ skcb = j1939_skb_to_cb(skb);
+
+ if (!session) {
+ /* at this point the size should be full size
+ * of the session
+ */
+ skcb->offset = 0;
+ session = j1939_tp_send(priv, skb, size);
+ if (IS_ERR(session)) {
+ ret = PTR_ERR(session);
+ goto kfree_skb;
+ }
+ if (j1939_sk_queue_session(session)) {
+ /* try to activate session if we a
+ * fist in the queue
+ */
+ if (!j1939_session_activate(session)) {
+ j1939_tp_schedule_txtimer(session, 0);
+ } else {
+ ret = -EBUSY;
+ session->err = ret;
+ j1939_sk_queue_drop_all(priv, jsk,
+ EBUSY);
+ break;
+ }
+ }
+ } else {
+ skcb->offset = session->total_queued_size;
+ j1939_session_skb_queue(session, skb);
+ }
+
+ todo_size -= segment_size;
+ session->total_queued_size += segment_size;
+ } while (todo_size);
+
+ switch (ret) {
+ case 0: /* OK */
+ if (todo_size)
+ netdev_warn(priv->ndev,
+ "no error found and not completely queued?! %zu\n",
+ todo_size);
+ ret = size;
+ break;
+ case -ERESTARTSYS:
+ ret = -EINTR;
+ fallthrough;
+ case -EAGAIN: /* OK */
+ if (todo_size != size)
+ ret = size - todo_size;
+ break;
+ default: /* ERROR */
+ break;
+ }
+
+ if (session)
+ j1939_session_put(session);
+
+ return ret;
+
+ kfree_skb:
+ kfree_skb(skb);
+ return ret;
+}
+
+static int j1939_sk_sendmsg(struct socket *sock, struct msghdr *msg,
+ size_t size)
+{
+ struct sock *sk = sock->sk;
+ struct j1939_sock *jsk = j1939_sk(sk);
+ struct j1939_priv *priv;
+ int ifindex;
+ int ret;
+
+ lock_sock(sock->sk);
+ /* various socket state tests */
+ if (!(jsk->state & J1939_SOCK_BOUND)) {
+ ret = -EBADFD;
+ goto sendmsg_done;
+ }
+
+ priv = jsk->priv;
+ ifindex = jsk->ifindex;
+
+ if (!jsk->addr.src_name && jsk->addr.sa == J1939_NO_ADDR) {
+ /* no source address assigned yet */
+ ret = -EBADFD;
+ goto sendmsg_done;
+ }
+
+ /* deal with provided destination address info */
+ if (msg->msg_name) {
+ struct sockaddr_can *addr = msg->msg_name;
+
+ if (msg->msg_namelen < J1939_MIN_NAMELEN) {
+ ret = -EINVAL;
+ goto sendmsg_done;
+ }
+
+ if (addr->can_family != AF_CAN) {
+ ret = -EINVAL;
+ goto sendmsg_done;
+ }
+
+ if (addr->can_ifindex && addr->can_ifindex != ifindex) {
+ ret = -EBADFD;
+ goto sendmsg_done;
+ }
+
+ if (j1939_pgn_is_valid(addr->can_addr.j1939.pgn) &&
+ !j1939_pgn_is_clean_pdu(addr->can_addr.j1939.pgn)) {
+ ret = -EINVAL;
+ goto sendmsg_done;
+ }
+
+ if (!addr->can_addr.j1939.name &&
+ addr->can_addr.j1939.addr == J1939_NO_ADDR &&
+ !sock_flag(sk, SOCK_BROADCAST)) {
+ /* broadcast, but SO_BROADCAST not set */
+ ret = -EACCES;
+ goto sendmsg_done;
+ }
+ } else {
+ if (!jsk->addr.dst_name && jsk->addr.da == J1939_NO_ADDR &&
+ !sock_flag(sk, SOCK_BROADCAST)) {
+ /* broadcast, but SO_BROADCAST not set */
+ ret = -EACCES;
+ goto sendmsg_done;
+ }
+ }
+
+ ret = j1939_sk_send_loop(priv, sk, msg, size);
+
+sendmsg_done:
+ release_sock(sock->sk);
+
+ return ret;
+}
+
+void j1939_sk_netdev_event_netdown(struct j1939_priv *priv)
+{
+ struct j1939_sock *jsk;
+ int error_code = ENETDOWN;
+
+ read_lock_bh(&priv->j1939_socks_lock);
+ list_for_each_entry(jsk, &priv->j1939_socks, list) {
+ jsk->sk.sk_err = error_code;
+ if (!sock_flag(&jsk->sk, SOCK_DEAD))
+ sk_error_report(&jsk->sk);
+
+ j1939_sk_queue_drop_all(priv, jsk, error_code);
+ }
+ read_unlock_bh(&priv->j1939_socks_lock);
+}
+
+void j1939_sk_netdev_event_unregister(struct j1939_priv *priv)
+{
+ struct sock *sk;
+ struct j1939_sock *jsk;
+ bool wait_rcu = false;
+
+rescan: /* The caller is holding a ref on this "priv" via j1939_priv_get_by_ndev(). */
+ read_lock_bh(&priv->j1939_socks_lock);
+ list_for_each_entry(jsk, &priv->j1939_socks, list) {
+ /* Skip if j1939_jsk_add() is not called on this socket. */
+ if (!(jsk->state & J1939_SOCK_BOUND))
+ continue;
+ sk = &jsk->sk;
+ sock_hold(sk);
+ read_unlock_bh(&priv->j1939_socks_lock);
+ /* Check if j1939_jsk_del() is not yet called on this socket after holding
+ * socket's lock, for both j1939_sk_bind() and j1939_sk_release() call
+ * j1939_jsk_del() with socket's lock held.
+ */
+ lock_sock(sk);
+ if (jsk->state & J1939_SOCK_BOUND) {
+ /* Neither j1939_sk_bind() nor j1939_sk_release() called j1939_jsk_del().
+ * Make this socket no longer bound, by pretending as if j1939_sk_bind()
+ * dropped old references but did not get new references.
+ */
+ j1939_jsk_del(priv, jsk);
+ j1939_local_ecu_put(priv, jsk->addr.src_name, jsk->addr.sa);
+ j1939_netdev_stop(priv);
+ /* Call j1939_priv_put() now and prevent j1939_sk_sock_destruct() from
+ * calling the corresponding j1939_priv_put().
+ *
+ * j1939_sk_sock_destruct() is supposed to call j1939_priv_put() after
+ * an RCU grace period. But since the caller is holding a ref on this
+ * "priv", we can defer synchronize_rcu() until immediately before
+ * the caller calls j1939_priv_put().
+ */
+ j1939_priv_put(priv);
+ jsk->priv = NULL;
+ wait_rcu = true;
+ }
+ release_sock(sk);
+ sock_put(sk);
+ goto rescan;
+ }
+ read_unlock_bh(&priv->j1939_socks_lock);
+ if (wait_rcu)
+ synchronize_rcu();
+}
+
+static int j1939_sk_no_ioctlcmd(struct socket *sock, unsigned int cmd,
+ unsigned long arg)
+{
+ /* no ioctls for socket layer -> hand it down to NIC layer */
+ return -ENOIOCTLCMD;
+}
+
+static const struct proto_ops j1939_ops = {
+ .family = PF_CAN,
+ .release = j1939_sk_release,
+ .bind = j1939_sk_bind,
+ .connect = j1939_sk_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .getname = j1939_sk_getname,
+ .poll = datagram_poll,
+ .ioctl = j1939_sk_no_ioctlcmd,
+ .listen = sock_no_listen,
+ .shutdown = sock_no_shutdown,
+ .setsockopt = j1939_sk_setsockopt,
+ .getsockopt = j1939_sk_getsockopt,
+ .sendmsg = j1939_sk_sendmsg,
+ .recvmsg = j1939_sk_recvmsg,
+ .mmap = sock_no_mmap,
+};
+
+static struct proto j1939_proto __read_mostly = {
+ .name = "CAN_J1939",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof(struct j1939_sock),
+ .init = j1939_sk_init,
+};
+
+const struct can_proto j1939_can_proto = {
+ .type = SOCK_DGRAM,
+ .protocol = CAN_J1939,
+ .ops = &j1939_ops,
+ .prot = &j1939_proto,
+};
diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c
new file mode 100644
index 000000000000..fbf5c8001c9d
--- /dev/null
+++ b/net/can/j1939/transport.c
@@ -0,0 +1,2220 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2010-2011 EIA Electronics,
+// Kurt Van Dijck <kurt.van.dijck@eia.be>
+// Copyright (c) 2018 Protonic,
+// Robin van der Gracht <robin@protonic.nl>
+// Copyright (c) 2017-2019 Pengutronix,
+// Marc Kleine-Budde <kernel@pengutronix.de>
+// Copyright (c) 2017-2019 Pengutronix,
+// Oleksij Rempel <kernel@pengutronix.de>
+
+#include <linux/can/skb.h>
+
+#include "j1939-priv.h"
+
+#define J1939_XTP_TX_RETRY_LIMIT 100
+
+#define J1939_ETP_PGN_CTL 0xc800
+#define J1939_ETP_PGN_DAT 0xc700
+#define J1939_TP_PGN_CTL 0xec00
+#define J1939_TP_PGN_DAT 0xeb00
+
+#define J1939_TP_CMD_RTS 0x10
+#define J1939_TP_CMD_CTS 0x11
+#define J1939_TP_CMD_EOMA 0x13
+#define J1939_TP_CMD_BAM 0x20
+#define J1939_TP_CMD_ABORT 0xff
+
+#define J1939_ETP_CMD_RTS 0x14
+#define J1939_ETP_CMD_CTS 0x15
+#define J1939_ETP_CMD_DPO 0x16
+#define J1939_ETP_CMD_EOMA 0x17
+#define J1939_ETP_CMD_ABORT 0xff
+
+enum j1939_xtp_abort {
+ J1939_XTP_NO_ABORT = 0,
+ J1939_XTP_ABORT_BUSY = 1,
+ /* Already in one or more connection managed sessions and
+ * cannot support another.
+ *
+ * EALREADY:
+ * Operation already in progress
+ */
+
+ J1939_XTP_ABORT_RESOURCE = 2,
+ /* System resources were needed for another task so this
+ * connection managed session was terminated.
+ *
+ * EMSGSIZE:
+ * The socket type requires that message be sent atomically,
+ * and the size of the message to be sent made this
+ * impossible.
+ */
+
+ J1939_XTP_ABORT_TIMEOUT = 3,
+ /* A timeout occurred and this is the connection abort to
+ * close the session.
+ *
+ * EHOSTUNREACH:
+ * The destination host cannot be reached (probably because
+ * the host is down or a remote router cannot reach it).
+ */
+
+ J1939_XTP_ABORT_GENERIC = 4,
+ /* CTS messages received when data transfer is in progress
+ *
+ * EBADMSG:
+ * Not a data message
+ */
+
+ J1939_XTP_ABORT_FAULT = 5,
+ /* Maximal retransmit request limit reached
+ *
+ * ENOTRECOVERABLE:
+ * State not recoverable
+ */
+
+ J1939_XTP_ABORT_UNEXPECTED_DATA = 6,
+ /* Unexpected data transfer packet
+ *
+ * ENOTCONN:
+ * Transport endpoint is not connected
+ */
+
+ J1939_XTP_ABORT_BAD_SEQ = 7,
+ /* Bad sequence number (and software is not able to recover)
+ *
+ * EILSEQ:
+ * Illegal byte sequence
+ */
+
+ J1939_XTP_ABORT_DUP_SEQ = 8,
+ /* Duplicate sequence number (and software is not able to
+ * recover)
+ */
+
+ J1939_XTP_ABORT_EDPO_UNEXPECTED = 9,
+ /* Unexpected EDPO packet (ETP) or Message size > 1785 bytes
+ * (TP)
+ */
+
+ J1939_XTP_ABORT_BAD_EDPO_PGN = 10,
+ /* Unexpected EDPO PGN (PGN in EDPO is bad) */
+
+ J1939_XTP_ABORT_EDPO_OUTOF_CTS = 11,
+ /* EDPO number of packets is greater than CTS */
+
+ J1939_XTP_ABORT_BAD_EDPO_OFFSET = 12,
+ /* Bad EDPO offset */
+
+ J1939_XTP_ABORT_OTHER_DEPRECATED = 13,
+ /* Deprecated. Use 250 instead (Any other reason) */
+
+ J1939_XTP_ABORT_ECTS_UNXPECTED_PGN = 14,
+ /* Unexpected ECTS PGN (PGN in ECTS is bad) */
+
+ J1939_XTP_ABORT_ECTS_TOO_BIG = 15,
+ /* ECTS requested packets exceeds message size */
+
+ J1939_XTP_ABORT_OTHER = 250,
+ /* Any other reason (if a Connection Abort reason is
+ * identified that is not listed in the table use code 250)
+ */
+};
+
+static unsigned int j1939_tp_block = 255;
+static unsigned int j1939_tp_packet_delay;
+static unsigned int j1939_tp_padding = 1;
+
+/* helpers */
+static const char *j1939_xtp_abort_to_str(enum j1939_xtp_abort abort)
+{
+ switch (abort) {
+ case J1939_XTP_ABORT_BUSY:
+ return "Already in one or more connection managed sessions and cannot support another.";
+ case J1939_XTP_ABORT_RESOURCE:
+ return "System resources were needed for another task so this connection managed session was terminated.";
+ case J1939_XTP_ABORT_TIMEOUT:
+ return "A timeout occurred and this is the connection abort to close the session.";
+ case J1939_XTP_ABORT_GENERIC:
+ return "CTS messages received when data transfer is in progress";
+ case J1939_XTP_ABORT_FAULT:
+ return "Maximal retransmit request limit reached";
+ case J1939_XTP_ABORT_UNEXPECTED_DATA:
+ return "Unexpected data transfer packet";
+ case J1939_XTP_ABORT_BAD_SEQ:
+ return "Bad sequence number (and software is not able to recover)";
+ case J1939_XTP_ABORT_DUP_SEQ:
+ return "Duplicate sequence number (and software is not able to recover)";
+ case J1939_XTP_ABORT_EDPO_UNEXPECTED:
+ return "Unexpected EDPO packet (ETP) or Message size > 1785 bytes (TP)";
+ case J1939_XTP_ABORT_BAD_EDPO_PGN:
+ return "Unexpected EDPO PGN (PGN in EDPO is bad)";
+ case J1939_XTP_ABORT_EDPO_OUTOF_CTS:
+ return "EDPO number of packets is greater than CTS";
+ case J1939_XTP_ABORT_BAD_EDPO_OFFSET:
+ return "Bad EDPO offset";
+ case J1939_XTP_ABORT_OTHER_DEPRECATED:
+ return "Deprecated. Use 250 instead (Any other reason)";
+ case J1939_XTP_ABORT_ECTS_UNXPECTED_PGN:
+ return "Unexpected ECTS PGN (PGN in ECTS is bad)";
+ case J1939_XTP_ABORT_ECTS_TOO_BIG:
+ return "ECTS requested packets exceeds message size";
+ case J1939_XTP_ABORT_OTHER:
+ return "Any other reason (if a Connection Abort reason is identified that is not listed in the table use code 250)";
+ default:
+ return "<unknown>";
+ }
+}
+
+static int j1939_xtp_abort_to_errno(struct j1939_priv *priv,
+ enum j1939_xtp_abort abort)
+{
+ int err;
+
+ switch (abort) {
+ case J1939_XTP_NO_ABORT:
+ WARN_ON_ONCE(abort == J1939_XTP_NO_ABORT);
+ err = 0;
+ break;
+ case J1939_XTP_ABORT_BUSY:
+ err = EALREADY;
+ break;
+ case J1939_XTP_ABORT_RESOURCE:
+ err = EMSGSIZE;
+ break;
+ case J1939_XTP_ABORT_TIMEOUT:
+ err = EHOSTUNREACH;
+ break;
+ case J1939_XTP_ABORT_GENERIC:
+ err = EBADMSG;
+ break;
+ case J1939_XTP_ABORT_FAULT:
+ err = ENOTRECOVERABLE;
+ break;
+ case J1939_XTP_ABORT_UNEXPECTED_DATA:
+ err = ENOTCONN;
+ break;
+ case J1939_XTP_ABORT_BAD_SEQ:
+ err = EILSEQ;
+ break;
+ case J1939_XTP_ABORT_DUP_SEQ:
+ err = EPROTO;
+ break;
+ case J1939_XTP_ABORT_EDPO_UNEXPECTED:
+ err = EPROTO;
+ break;
+ case J1939_XTP_ABORT_BAD_EDPO_PGN:
+ err = EPROTO;
+ break;
+ case J1939_XTP_ABORT_EDPO_OUTOF_CTS:
+ err = EPROTO;
+ break;
+ case J1939_XTP_ABORT_BAD_EDPO_OFFSET:
+ err = EPROTO;
+ break;
+ case J1939_XTP_ABORT_OTHER_DEPRECATED:
+ err = EPROTO;
+ break;
+ case J1939_XTP_ABORT_ECTS_UNXPECTED_PGN:
+ err = EPROTO;
+ break;
+ case J1939_XTP_ABORT_ECTS_TOO_BIG:
+ err = EPROTO;
+ break;
+ case J1939_XTP_ABORT_OTHER:
+ err = EPROTO;
+ break;
+ default:
+ netdev_warn(priv->ndev, "Unknown abort code %i", abort);
+ err = EPROTO;
+ }
+
+ return err;
+}
+
+static inline void j1939_session_list_lock(struct j1939_priv *priv)
+{
+ spin_lock_bh(&priv->active_session_list_lock);
+}
+
+static inline void j1939_session_list_unlock(struct j1939_priv *priv)
+{
+ spin_unlock_bh(&priv->active_session_list_lock);
+}
+
+void j1939_session_get(struct j1939_session *session)
+{
+ kref_get(&session->kref);
+}
+
+/* session completion functions */
+static void __j1939_session_drop(struct j1939_session *session)
+{
+ if (!session->transmission)
+ return;
+
+ j1939_sock_pending_del(session->sk);
+ sock_put(session->sk);
+}
+
+static void j1939_session_destroy(struct j1939_session *session)
+{
+ struct sk_buff *skb;
+
+ if (session->transmission) {
+ if (session->err)
+ j1939_sk_errqueue(session, J1939_ERRQUEUE_TX_ABORT);
+ else
+ j1939_sk_errqueue(session, J1939_ERRQUEUE_TX_ACK);
+ } else if (session->err) {
+ j1939_sk_errqueue(session, J1939_ERRQUEUE_RX_ABORT);
+ }
+
+ netdev_dbg(session->priv->ndev, "%s: 0x%p\n", __func__, session);
+
+ WARN_ON_ONCE(!list_empty(&session->sk_session_queue_entry));
+ WARN_ON_ONCE(!list_empty(&session->active_session_list_entry));
+
+ while ((skb = skb_dequeue(&session->skb_queue)) != NULL) {
+ /* drop ref taken in j1939_session_skb_queue() */
+ skb_unref(skb);
+ kfree_skb(skb);
+ }
+ __j1939_session_drop(session);
+ j1939_priv_put(session->priv);
+ kfree(session);
+}
+
+static void __j1939_session_release(struct kref *kref)
+{
+ struct j1939_session *session = container_of(kref, struct j1939_session,
+ kref);
+
+ j1939_session_destroy(session);
+}
+
+void j1939_session_put(struct j1939_session *session)
+{
+ kref_put(&session->kref, __j1939_session_release);
+}
+
+static void j1939_session_txtimer_cancel(struct j1939_session *session)
+{
+ if (hrtimer_cancel(&session->txtimer))
+ j1939_session_put(session);
+}
+
+static void j1939_session_rxtimer_cancel(struct j1939_session *session)
+{
+ if (hrtimer_cancel(&session->rxtimer))
+ j1939_session_put(session);
+}
+
+void j1939_session_timers_cancel(struct j1939_session *session)
+{
+ j1939_session_txtimer_cancel(session);
+ j1939_session_rxtimer_cancel(session);
+}
+
+static inline bool j1939_cb_is_broadcast(const struct j1939_sk_buff_cb *skcb)
+{
+ return (!skcb->addr.dst_name && (skcb->addr.da == 0xff));
+}
+
+static void j1939_session_skb_drop_old(struct j1939_session *session)
+{
+ struct sk_buff *do_skb;
+ struct j1939_sk_buff_cb *do_skcb;
+ unsigned int offset_start;
+ unsigned long flags;
+
+ if (skb_queue_len(&session->skb_queue) < 2)
+ return;
+
+ offset_start = session->pkt.tx_acked * 7;
+
+ spin_lock_irqsave(&session->skb_queue.lock, flags);
+ do_skb = skb_peek(&session->skb_queue);
+ do_skcb = j1939_skb_to_cb(do_skb);
+
+ if ((do_skcb->offset + do_skb->len) < offset_start) {
+ __skb_unlink(do_skb, &session->skb_queue);
+ /* drop ref taken in j1939_session_skb_queue() */
+ skb_unref(do_skb);
+ spin_unlock_irqrestore(&session->skb_queue.lock, flags);
+
+ kfree_skb(do_skb);
+ } else {
+ spin_unlock_irqrestore(&session->skb_queue.lock, flags);
+ }
+}
+
+void j1939_session_skb_queue(struct j1939_session *session,
+ struct sk_buff *skb)
+{
+ struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb);
+ struct j1939_priv *priv = session->priv;
+
+ j1939_ac_fixup(priv, skb);
+
+ if (j1939_address_is_unicast(skcb->addr.da) &&
+ priv->ents[skcb->addr.da].nusers)
+ skcb->flags |= J1939_ECU_LOCAL_DST;
+
+ skcb->flags |= J1939_ECU_LOCAL_SRC;
+
+ skb_get(skb);
+ skb_queue_tail(&session->skb_queue, skb);
+}
+
+static struct
+sk_buff *j1939_session_skb_get_by_offset(struct j1939_session *session,
+ unsigned int offset_start)
+{
+ struct j1939_priv *priv = session->priv;
+ struct j1939_sk_buff_cb *do_skcb;
+ struct sk_buff *skb = NULL;
+ struct sk_buff *do_skb;
+ unsigned long flags;
+
+ spin_lock_irqsave(&session->skb_queue.lock, flags);
+ skb_queue_walk(&session->skb_queue, do_skb) {
+ do_skcb = j1939_skb_to_cb(do_skb);
+
+ if ((offset_start >= do_skcb->offset &&
+ offset_start < (do_skcb->offset + do_skb->len)) ||
+ (offset_start == 0 && do_skcb->offset == 0 && do_skb->len == 0)) {
+ skb = do_skb;
+ }
+ }
+
+ if (skb)
+ skb_get(skb);
+
+ spin_unlock_irqrestore(&session->skb_queue.lock, flags);
+
+ if (!skb)
+ netdev_dbg(priv->ndev, "%s: 0x%p: no skb found for start: %i, queue size: %i\n",
+ __func__, session, offset_start,
+ skb_queue_len(&session->skb_queue));
+
+ return skb;
+}
+
+static struct sk_buff *j1939_session_skb_get(struct j1939_session *session)
+{
+ unsigned int offset_start;
+
+ offset_start = session->pkt.dpo * 7;
+ return j1939_session_skb_get_by_offset(session, offset_start);
+}
+
+/* see if we are receiver
+ * returns 0 for broadcasts, although we will receive them
+ */
+static inline int j1939_tp_im_receiver(const struct j1939_sk_buff_cb *skcb)
+{
+ return skcb->flags & J1939_ECU_LOCAL_DST;
+}
+
+/* see if we are sender */
+static inline int j1939_tp_im_transmitter(const struct j1939_sk_buff_cb *skcb)
+{
+ return skcb->flags & J1939_ECU_LOCAL_SRC;
+}
+
+/* see if we are involved as either receiver or transmitter */
+static int j1939_tp_im_involved(const struct j1939_sk_buff_cb *skcb, bool swap)
+{
+ if (swap)
+ return j1939_tp_im_receiver(skcb);
+ else
+ return j1939_tp_im_transmitter(skcb);
+}
+
+static int j1939_tp_im_involved_anydir(struct j1939_sk_buff_cb *skcb)
+{
+ return skcb->flags & (J1939_ECU_LOCAL_SRC | J1939_ECU_LOCAL_DST);
+}
+
+/* extract pgn from flow-ctl message */
+static inline pgn_t j1939_xtp_ctl_to_pgn(const u8 *dat)
+{
+ pgn_t pgn;
+
+ pgn = (dat[7] << 16) | (dat[6] << 8) | (dat[5] << 0);
+ if (j1939_pgn_is_pdu1(pgn))
+ pgn &= 0xffff00;
+ return pgn;
+}
+
+static inline unsigned int j1939_tp_ctl_to_size(const u8 *dat)
+{
+ return (dat[2] << 8) + (dat[1] << 0);
+}
+
+static inline unsigned int j1939_etp_ctl_to_packet(const u8 *dat)
+{
+ return (dat[4] << 16) | (dat[3] << 8) | (dat[2] << 0);
+}
+
+static inline unsigned int j1939_etp_ctl_to_size(const u8 *dat)
+{
+ return (dat[4] << 24) | (dat[3] << 16) |
+ (dat[2] << 8) | (dat[1] << 0);
+}
+
+/* find existing session:
+ * reverse: swap cb's src & dst
+ * there is no problem with matching broadcasts, since
+ * broadcasts (no dst, no da) would never call this
+ * with reverse == true
+ */
+static bool j1939_session_match(struct j1939_addr *se_addr,
+ struct j1939_addr *sk_addr, bool reverse)
+{
+ if (se_addr->type != sk_addr->type)
+ return false;
+
+ if (reverse) {
+ if (se_addr->src_name) {
+ if (se_addr->src_name != sk_addr->dst_name)
+ return false;
+ } else if (se_addr->sa != sk_addr->da) {
+ return false;
+ }
+
+ if (se_addr->dst_name) {
+ if (se_addr->dst_name != sk_addr->src_name)
+ return false;
+ } else if (se_addr->da != sk_addr->sa) {
+ return false;
+ }
+ } else {
+ if (se_addr->src_name) {
+ if (se_addr->src_name != sk_addr->src_name)
+ return false;
+ } else if (se_addr->sa != sk_addr->sa) {
+ return false;
+ }
+
+ if (se_addr->dst_name) {
+ if (se_addr->dst_name != sk_addr->dst_name)
+ return false;
+ } else if (se_addr->da != sk_addr->da) {
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static struct
+j1939_session *j1939_session_get_by_addr_locked(struct j1939_priv *priv,
+ struct list_head *root,
+ struct j1939_addr *addr,
+ bool reverse, bool transmitter)
+{
+ struct j1939_session *session;
+
+ lockdep_assert_held(&priv->active_session_list_lock);
+
+ list_for_each_entry(session, root, active_session_list_entry) {
+ j1939_session_get(session);
+ if (j1939_session_match(&session->skcb.addr, addr, reverse) &&
+ session->transmission == transmitter)
+ return session;
+ j1939_session_put(session);
+ }
+
+ return NULL;
+}
+
+static struct
+j1939_session *j1939_session_get_simple(struct j1939_priv *priv,
+ struct sk_buff *skb)
+{
+ struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb);
+ struct j1939_session *session;
+
+ lockdep_assert_held(&priv->active_session_list_lock);
+
+ list_for_each_entry(session, &priv->active_session_list,
+ active_session_list_entry) {
+ j1939_session_get(session);
+ if (session->skcb.addr.type == J1939_SIMPLE &&
+ session->tskey == skcb->tskey && session->sk == skb->sk)
+ return session;
+ j1939_session_put(session);
+ }
+
+ return NULL;
+}
+
+static struct
+j1939_session *j1939_session_get_by_addr(struct j1939_priv *priv,
+ struct j1939_addr *addr,
+ bool reverse, bool transmitter)
+{
+ struct j1939_session *session;
+
+ j1939_session_list_lock(priv);
+ session = j1939_session_get_by_addr_locked(priv,
+ &priv->active_session_list,
+ addr, reverse, transmitter);
+ j1939_session_list_unlock(priv);
+
+ return session;
+}
+
+static void j1939_skbcb_swap(struct j1939_sk_buff_cb *skcb)
+{
+ u8 tmp = 0;
+
+ swap(skcb->addr.dst_name, skcb->addr.src_name);
+ swap(skcb->addr.da, skcb->addr.sa);
+
+ /* swap SRC and DST flags, leave other untouched */
+ if (skcb->flags & J1939_ECU_LOCAL_SRC)
+ tmp |= J1939_ECU_LOCAL_DST;
+ if (skcb->flags & J1939_ECU_LOCAL_DST)
+ tmp |= J1939_ECU_LOCAL_SRC;
+ skcb->flags &= ~(J1939_ECU_LOCAL_SRC | J1939_ECU_LOCAL_DST);
+ skcb->flags |= tmp;
+}
+
+static struct
+sk_buff *j1939_tp_tx_dat_new(struct j1939_priv *priv,
+ const struct j1939_sk_buff_cb *re_skcb,
+ bool ctl,
+ bool swap_src_dst)
+{
+ struct sk_buff *skb;
+ struct j1939_sk_buff_cb *skcb;
+
+ skb = alloc_skb(sizeof(struct can_frame) + sizeof(struct can_skb_priv),
+ GFP_ATOMIC);
+ if (unlikely(!skb))
+ return ERR_PTR(-ENOMEM);
+
+ skb->dev = priv->ndev;
+ can_skb_reserve(skb);
+ can_skb_prv(skb)->ifindex = priv->ndev->ifindex;
+ can_skb_prv(skb)->skbcnt = 0;
+ /* reserve CAN header */
+ skb_reserve(skb, offsetof(struct can_frame, data));
+
+ /* skb->cb must be large enough to hold a j1939_sk_buff_cb structure */
+ BUILD_BUG_ON(sizeof(skb->cb) < sizeof(*re_skcb));
+
+ memcpy(skb->cb, re_skcb, sizeof(*re_skcb));
+ skcb = j1939_skb_to_cb(skb);
+ if (swap_src_dst)
+ j1939_skbcb_swap(skcb);
+
+ if (ctl) {
+ if (skcb->addr.type == J1939_ETP)
+ skcb->addr.pgn = J1939_ETP_PGN_CTL;
+ else
+ skcb->addr.pgn = J1939_TP_PGN_CTL;
+ } else {
+ if (skcb->addr.type == J1939_ETP)
+ skcb->addr.pgn = J1939_ETP_PGN_DAT;
+ else
+ skcb->addr.pgn = J1939_TP_PGN_DAT;
+ }
+
+ return skb;
+}
+
+/* TP transmit packet functions */
+static int j1939_tp_tx_dat(struct j1939_session *session,
+ const u8 *dat, int len)
+{
+ struct j1939_priv *priv = session->priv;
+ struct sk_buff *skb;
+
+ skb = j1939_tp_tx_dat_new(priv, &session->skcb,
+ false, false);
+ if (IS_ERR(skb))
+ return PTR_ERR(skb);
+
+ skb_put_data(skb, dat, len);
+ if (j1939_tp_padding && len < 8)
+ memset(skb_put(skb, 8 - len), 0xff, 8 - len);
+
+ return j1939_send_one(priv, skb);
+}
+
+static int j1939_xtp_do_tx_ctl(struct j1939_priv *priv,
+ const struct j1939_sk_buff_cb *re_skcb,
+ bool swap_src_dst, pgn_t pgn, const u8 *dat)
+{
+ struct sk_buff *skb;
+ u8 *skdat;
+
+ if (!j1939_tp_im_involved(re_skcb, swap_src_dst))
+ return 0;
+
+ skb = j1939_tp_tx_dat_new(priv, re_skcb, true, swap_src_dst);
+ if (IS_ERR(skb))
+ return PTR_ERR(skb);
+
+ skdat = skb_put(skb, 8);
+ memcpy(skdat, dat, 5);
+ skdat[5] = (pgn >> 0);
+ skdat[6] = (pgn >> 8);
+ skdat[7] = (pgn >> 16);
+
+ return j1939_send_one(priv, skb);
+}
+
+static inline int j1939_tp_tx_ctl(struct j1939_session *session,
+ bool swap_src_dst, const u8 *dat)
+{
+ struct j1939_priv *priv = session->priv;
+
+ return j1939_xtp_do_tx_ctl(priv, &session->skcb,
+ swap_src_dst,
+ session->skcb.addr.pgn, dat);
+}
+
+static int j1939_xtp_tx_abort(struct j1939_priv *priv,
+ const struct j1939_sk_buff_cb *re_skcb,
+ bool swap_src_dst,
+ enum j1939_xtp_abort err,
+ pgn_t pgn)
+{
+ u8 dat[5];
+
+ if (!j1939_tp_im_involved(re_skcb, swap_src_dst))
+ return 0;
+
+ memset(dat, 0xff, sizeof(dat));
+ dat[0] = J1939_TP_CMD_ABORT;
+ dat[1] = err;
+ return j1939_xtp_do_tx_ctl(priv, re_skcb, swap_src_dst, pgn, dat);
+}
+
+void j1939_tp_schedule_txtimer(struct j1939_session *session, int msec)
+{
+ j1939_session_get(session);
+ hrtimer_start(&session->txtimer, ms_to_ktime(msec),
+ HRTIMER_MODE_REL_SOFT);
+}
+
+static inline void j1939_tp_set_rxtimeout(struct j1939_session *session,
+ int msec)
+{
+ j1939_session_rxtimer_cancel(session);
+ j1939_session_get(session);
+ hrtimer_start(&session->rxtimer, ms_to_ktime(msec),
+ HRTIMER_MODE_REL_SOFT);
+}
+
+static int j1939_session_tx_rts(struct j1939_session *session)
+{
+ u8 dat[8];
+ int ret;
+
+ memset(dat, 0xff, sizeof(dat));
+
+ dat[1] = (session->total_message_size >> 0);
+ dat[2] = (session->total_message_size >> 8);
+ dat[3] = session->pkt.total;
+
+ if (session->skcb.addr.type == J1939_ETP) {
+ dat[0] = J1939_ETP_CMD_RTS;
+ dat[1] = (session->total_message_size >> 0);
+ dat[2] = (session->total_message_size >> 8);
+ dat[3] = (session->total_message_size >> 16);
+ dat[4] = (session->total_message_size >> 24);
+ } else if (j1939_cb_is_broadcast(&session->skcb)) {
+ dat[0] = J1939_TP_CMD_BAM;
+ /* fake cts for broadcast */
+ session->pkt.tx = 0;
+ } else {
+ dat[0] = J1939_TP_CMD_RTS;
+ dat[4] = dat[3];
+ }
+
+ if (dat[0] == session->last_txcmd)
+ /* done already */
+ return 0;
+
+ ret = j1939_tp_tx_ctl(session, false, dat);
+ if (ret < 0)
+ return ret;
+
+ session->last_txcmd = dat[0];
+ if (dat[0] == J1939_TP_CMD_BAM) {
+ j1939_tp_schedule_txtimer(session, 50);
+ j1939_tp_set_rxtimeout(session, 250);
+ } else {
+ j1939_tp_set_rxtimeout(session, 1250);
+ }
+
+ netdev_dbg(session->priv->ndev, "%s: 0x%p\n", __func__, session);
+
+ return 0;
+}
+
+static int j1939_session_tx_dpo(struct j1939_session *session)
+{
+ unsigned int pkt;
+ u8 dat[8];
+ int ret;
+
+ memset(dat, 0xff, sizeof(dat));
+
+ dat[0] = J1939_ETP_CMD_DPO;
+ session->pkt.dpo = session->pkt.tx_acked;
+ pkt = session->pkt.dpo;
+ dat[1] = session->pkt.last - session->pkt.tx_acked;
+ dat[2] = (pkt >> 0);
+ dat[3] = (pkt >> 8);
+ dat[4] = (pkt >> 16);
+
+ ret = j1939_tp_tx_ctl(session, false, dat);
+ if (ret < 0)
+ return ret;
+
+ session->last_txcmd = dat[0];
+ j1939_tp_set_rxtimeout(session, 1250);
+ session->pkt.tx = session->pkt.tx_acked;
+
+ netdev_dbg(session->priv->ndev, "%s: 0x%p\n", __func__, session);
+
+ return 0;
+}
+
+static int j1939_session_tx_dat(struct j1939_session *session)
+{
+ struct j1939_priv *priv = session->priv;
+ struct j1939_sk_buff_cb *se_skcb;
+ int offset, pkt_done, pkt_end;
+ unsigned int len, pdelay;
+ struct sk_buff *se_skb;
+ const u8 *tpdat;
+ int ret = 0;
+ u8 dat[8];
+
+ se_skb = j1939_session_skb_get_by_offset(session, session->pkt.tx * 7);
+ if (!se_skb)
+ return -ENOBUFS;
+
+ se_skcb = j1939_skb_to_cb(se_skb);
+ tpdat = se_skb->data;
+ ret = 0;
+ pkt_done = 0;
+ if (session->skcb.addr.type != J1939_ETP &&
+ j1939_cb_is_broadcast(&session->skcb))
+ pkt_end = session->pkt.total;
+ else
+ pkt_end = session->pkt.last;
+
+ while (session->pkt.tx < pkt_end) {
+ dat[0] = session->pkt.tx - session->pkt.dpo + 1;
+ offset = (session->pkt.tx * 7) - se_skcb->offset;
+ len = se_skb->len - offset;
+ if (len > 7)
+ len = 7;
+
+ if (offset + len > se_skb->len) {
+ netdev_err_once(priv->ndev,
+ "%s: 0x%p: requested data outside of queued buffer: offset %i, len %i, pkt.tx: %i\n",
+ __func__, session, se_skcb->offset,
+ se_skb->len , session->pkt.tx);
+ ret = -EOVERFLOW;
+ goto out_free;
+ }
+
+ if (!len) {
+ ret = -ENOBUFS;
+ break;
+ }
+
+ memcpy(&dat[1], &tpdat[offset], len);
+ ret = j1939_tp_tx_dat(session, dat, len + 1);
+ if (ret < 0) {
+ /* ENOBUFS == CAN interface TX queue is full */
+ if (ret != -ENOBUFS)
+ netdev_alert(priv->ndev,
+ "%s: 0x%p: queue data error: %i\n",
+ __func__, session, ret);
+ break;
+ }
+
+ session->last_txcmd = 0xff;
+ pkt_done++;
+ session->pkt.tx++;
+ pdelay = j1939_cb_is_broadcast(&session->skcb) ? 50 :
+ j1939_tp_packet_delay;
+
+ if (session->pkt.tx < session->pkt.total && pdelay) {
+ j1939_tp_schedule_txtimer(session, pdelay);
+ break;
+ }
+ }
+
+ if (pkt_done)
+ j1939_tp_set_rxtimeout(session, 250);
+
+ out_free:
+ if (ret)
+ kfree_skb(se_skb);
+ else
+ consume_skb(se_skb);
+
+ return ret;
+}
+
+static int j1939_xtp_txnext_transmiter(struct j1939_session *session)
+{
+ struct j1939_priv *priv = session->priv;
+ int ret = 0;
+
+ if (!j1939_tp_im_transmitter(&session->skcb)) {
+ netdev_alert(priv->ndev, "%s: 0x%p: called by not transmitter!\n",
+ __func__, session);
+ return -EINVAL;
+ }
+
+ switch (session->last_cmd) {
+ case 0:
+ ret = j1939_session_tx_rts(session);
+ break;
+
+ case J1939_ETP_CMD_CTS:
+ if (session->last_txcmd != J1939_ETP_CMD_DPO) {
+ ret = j1939_session_tx_dpo(session);
+ if (ret)
+ return ret;
+ }
+
+ fallthrough;
+ case J1939_TP_CMD_CTS:
+ case 0xff: /* did some data */
+ case J1939_ETP_CMD_DPO:
+ case J1939_TP_CMD_BAM:
+ ret = j1939_session_tx_dat(session);
+
+ break;
+ default:
+ netdev_alert(priv->ndev, "%s: 0x%p: unexpected last_cmd: %x\n",
+ __func__, session, session->last_cmd);
+ }
+
+ return ret;
+}
+
+static int j1939_session_tx_cts(struct j1939_session *session)
+{
+ struct j1939_priv *priv = session->priv;
+ unsigned int pkt, len;
+ int ret;
+ u8 dat[8];
+
+ if (!j1939_sk_recv_match(priv, &session->skcb))
+ return -ENOENT;
+
+ len = session->pkt.total - session->pkt.rx;
+ len = min3(len, session->pkt.block, j1939_tp_block ?: 255);
+ memset(dat, 0xff, sizeof(dat));
+
+ if (session->skcb.addr.type == J1939_ETP) {
+ pkt = session->pkt.rx + 1;
+ dat[0] = J1939_ETP_CMD_CTS;
+ dat[1] = len;
+ dat[2] = (pkt >> 0);
+ dat[3] = (pkt >> 8);
+ dat[4] = (pkt >> 16);
+ } else {
+ dat[0] = J1939_TP_CMD_CTS;
+ dat[1] = len;
+ dat[2] = session->pkt.rx + 1;
+ }
+
+ if (dat[0] == session->last_txcmd)
+ /* done already */
+ return 0;
+
+ ret = j1939_tp_tx_ctl(session, true, dat);
+ if (ret < 0)
+ return ret;
+
+ if (len)
+ /* only mark cts done when len is set */
+ session->last_txcmd = dat[0];
+ j1939_tp_set_rxtimeout(session, 1250);
+
+ netdev_dbg(session->priv->ndev, "%s: 0x%p\n", __func__, session);
+
+ return 0;
+}
+
+static int j1939_session_tx_eoma(struct j1939_session *session)
+{
+ struct j1939_priv *priv = session->priv;
+ u8 dat[8];
+ int ret;
+
+ if (!j1939_sk_recv_match(priv, &session->skcb))
+ return -ENOENT;
+
+ memset(dat, 0xff, sizeof(dat));
+
+ if (session->skcb.addr.type == J1939_ETP) {
+ dat[0] = J1939_ETP_CMD_EOMA;
+ dat[1] = session->total_message_size >> 0;
+ dat[2] = session->total_message_size >> 8;
+ dat[3] = session->total_message_size >> 16;
+ dat[4] = session->total_message_size >> 24;
+ } else {
+ dat[0] = J1939_TP_CMD_EOMA;
+ dat[1] = session->total_message_size;
+ dat[2] = session->total_message_size >> 8;
+ dat[3] = session->pkt.total;
+ }
+
+ if (dat[0] == session->last_txcmd)
+ /* done already */
+ return 0;
+
+ ret = j1939_tp_tx_ctl(session, true, dat);
+ if (ret < 0)
+ return ret;
+
+ session->last_txcmd = dat[0];
+
+ /* wait for the EOMA packet to come in */
+ j1939_tp_set_rxtimeout(session, 1250);
+
+ netdev_dbg(session->priv->ndev, "%s: 0x%p\n", __func__, session);
+
+ return 0;
+}
+
+static int j1939_xtp_txnext_receiver(struct j1939_session *session)
+{
+ struct j1939_priv *priv = session->priv;
+ int ret = 0;
+
+ if (!j1939_tp_im_receiver(&session->skcb)) {
+ netdev_alert(priv->ndev, "%s: 0x%p: called by not receiver!\n",
+ __func__, session);
+ return -EINVAL;
+ }
+
+ switch (session->last_cmd) {
+ case J1939_TP_CMD_RTS:
+ case J1939_ETP_CMD_RTS:
+ ret = j1939_session_tx_cts(session);
+ break;
+
+ case J1939_ETP_CMD_CTS:
+ case J1939_TP_CMD_CTS:
+ case 0xff: /* did some data */
+ case J1939_ETP_CMD_DPO:
+ if ((session->skcb.addr.type == J1939_TP &&
+ j1939_cb_is_broadcast(&session->skcb)))
+ break;
+
+ if (session->pkt.rx >= session->pkt.total) {
+ ret = j1939_session_tx_eoma(session);
+ } else if (session->pkt.rx >= session->pkt.last) {
+ session->last_txcmd = 0;
+ ret = j1939_session_tx_cts(session);
+ }
+ break;
+ default:
+ netdev_alert(priv->ndev, "%s: 0x%p: unexpected last_cmd: %x\n",
+ __func__, session, session->last_cmd);
+ }
+
+ return ret;
+}
+
+static int j1939_simple_txnext(struct j1939_session *session)
+{
+ struct j1939_priv *priv = session->priv;
+ struct sk_buff *se_skb = j1939_session_skb_get(session);
+ struct sk_buff *skb;
+ int ret;
+
+ if (!se_skb)
+ return 0;
+
+ skb = skb_clone(se_skb, GFP_ATOMIC);
+ if (!skb) {
+ ret = -ENOMEM;
+ goto out_free;
+ }
+
+ can_skb_set_owner(skb, se_skb->sk);
+
+ j1939_tp_set_rxtimeout(session, J1939_SIMPLE_ECHO_TIMEOUT_MS);
+
+ ret = j1939_send_one(priv, skb);
+ if (ret)
+ goto out_free;
+
+ j1939_sk_errqueue(session, J1939_ERRQUEUE_TX_SCHED);
+ j1939_sk_queue_activate_next(session);
+
+ out_free:
+ if (ret)
+ kfree_skb(se_skb);
+ else
+ consume_skb(se_skb);
+
+ return ret;
+}
+
+static bool j1939_session_deactivate_locked(struct j1939_session *session)
+{
+ bool active = false;
+
+ lockdep_assert_held(&session->priv->active_session_list_lock);
+
+ if (session->state >= J1939_SESSION_ACTIVE &&
+ session->state < J1939_SESSION_ACTIVE_MAX) {
+ active = true;
+
+ list_del_init(&session->active_session_list_entry);
+ session->state = J1939_SESSION_DONE;
+ j1939_session_put(session);
+ }
+
+ return active;
+}
+
+static bool j1939_session_deactivate(struct j1939_session *session)
+{
+ struct j1939_priv *priv = session->priv;
+ bool active;
+
+ j1939_session_list_lock(priv);
+ active = j1939_session_deactivate_locked(session);
+ j1939_session_list_unlock(priv);
+
+ return active;
+}
+
+static void
+j1939_session_deactivate_activate_next(struct j1939_session *session)
+{
+ if (j1939_session_deactivate(session))
+ j1939_sk_queue_activate_next(session);
+}
+
+static void __j1939_session_cancel(struct j1939_session *session,
+ enum j1939_xtp_abort err)
+{
+ struct j1939_priv *priv = session->priv;
+
+ WARN_ON_ONCE(!err);
+ lockdep_assert_held(&session->priv->active_session_list_lock);
+
+ session->err = j1939_xtp_abort_to_errno(priv, err);
+ session->state = J1939_SESSION_WAITING_ABORT;
+ /* do not send aborts on incoming broadcasts */
+ if (!j1939_cb_is_broadcast(&session->skcb)) {
+ j1939_xtp_tx_abort(priv, &session->skcb,
+ !session->transmission,
+ err, session->skcb.addr.pgn);
+ }
+
+ if (session->sk)
+ j1939_sk_send_loop_abort(session->sk, session->err);
+}
+
+static void j1939_session_cancel(struct j1939_session *session,
+ enum j1939_xtp_abort err)
+{
+ j1939_session_list_lock(session->priv);
+
+ if (session->state >= J1939_SESSION_ACTIVE &&
+ session->state < J1939_SESSION_WAITING_ABORT) {
+ j1939_tp_set_rxtimeout(session, J1939_XTP_ABORT_TIMEOUT_MS);
+ __j1939_session_cancel(session, err);
+ }
+
+ j1939_session_list_unlock(session->priv);
+
+ if (!session->sk)
+ j1939_sk_errqueue(session, J1939_ERRQUEUE_RX_ABORT);
+}
+
+static enum hrtimer_restart j1939_tp_txtimer(struct hrtimer *hrtimer)
+{
+ struct j1939_session *session =
+ container_of(hrtimer, struct j1939_session, txtimer);
+ struct j1939_priv *priv = session->priv;
+ int ret = 0;
+
+ if (session->skcb.addr.type == J1939_SIMPLE) {
+ ret = j1939_simple_txnext(session);
+ } else {
+ if (session->transmission)
+ ret = j1939_xtp_txnext_transmiter(session);
+ else
+ ret = j1939_xtp_txnext_receiver(session);
+ }
+
+ switch (ret) {
+ case -ENOBUFS:
+ /* Retry limit is currently arbitrary chosen */
+ if (session->tx_retry < J1939_XTP_TX_RETRY_LIMIT) {
+ session->tx_retry++;
+ j1939_tp_schedule_txtimer(session,
+ 10 + get_random_u32_below(16));
+ } else {
+ netdev_alert(priv->ndev, "%s: 0x%p: tx retry count reached\n",
+ __func__, session);
+ session->err = -ENETUNREACH;
+ j1939_session_rxtimer_cancel(session);
+ j1939_session_deactivate_activate_next(session);
+ }
+ break;
+ case -ENETDOWN:
+ /* In this case we should get a netdev_event(), all active
+ * sessions will be cleared by j1939_cancel_active_session().
+ * So handle this as an error, but let
+ * j1939_cancel_active_session() do the cleanup including
+ * propagation of the error to user space.
+ */
+ break;
+ case -EOVERFLOW:
+ j1939_session_cancel(session, J1939_XTP_ABORT_ECTS_TOO_BIG);
+ break;
+ case 0:
+ session->tx_retry = 0;
+ break;
+ default:
+ netdev_alert(priv->ndev, "%s: 0x%p: tx aborted with unknown reason: %i\n",
+ __func__, session, ret);
+ if (session->skcb.addr.type != J1939_SIMPLE) {
+ j1939_session_cancel(session, J1939_XTP_ABORT_OTHER);
+ } else {
+ session->err = ret;
+ j1939_session_rxtimer_cancel(session);
+ j1939_session_deactivate_activate_next(session);
+ }
+ }
+
+ j1939_session_put(session);
+
+ return HRTIMER_NORESTART;
+}
+
+static void j1939_session_completed(struct j1939_session *session)
+{
+ struct sk_buff *se_skb;
+
+ if (!session->transmission) {
+ se_skb = j1939_session_skb_get(session);
+ /* distribute among j1939 receivers */
+ j1939_sk_recv(session->priv, se_skb);
+ consume_skb(se_skb);
+ }
+
+ j1939_session_deactivate_activate_next(session);
+}
+
+static enum hrtimer_restart j1939_tp_rxtimer(struct hrtimer *hrtimer)
+{
+ struct j1939_session *session = container_of(hrtimer,
+ struct j1939_session,
+ rxtimer);
+ struct j1939_priv *priv = session->priv;
+
+ if (session->state == J1939_SESSION_WAITING_ABORT) {
+ netdev_alert(priv->ndev, "%s: 0x%p: abort rx timeout. Force session deactivation\n",
+ __func__, session);
+
+ j1939_session_deactivate_activate_next(session);
+
+ } else if (session->skcb.addr.type == J1939_SIMPLE) {
+ netdev_alert(priv->ndev, "%s: 0x%p: Timeout. Failed to send simple message.\n",
+ __func__, session);
+
+ /* The message is probably stuck in the CAN controller and can
+ * be send as soon as CAN bus is in working state again.
+ */
+ session->err = -ETIME;
+ j1939_session_deactivate(session);
+ } else {
+ j1939_session_list_lock(session->priv);
+ if (session->state >= J1939_SESSION_ACTIVE &&
+ session->state < J1939_SESSION_ACTIVE_MAX) {
+ netdev_alert(priv->ndev, "%s: 0x%p: rx timeout, send abort\n",
+ __func__, session);
+ j1939_session_get(session);
+ hrtimer_start(&session->rxtimer,
+ ms_to_ktime(J1939_XTP_ABORT_TIMEOUT_MS),
+ HRTIMER_MODE_REL_SOFT);
+ __j1939_session_cancel(session, J1939_XTP_ABORT_TIMEOUT);
+ }
+ j1939_session_list_unlock(session->priv);
+
+ if (!session->sk)
+ j1939_sk_errqueue(session, J1939_ERRQUEUE_RX_ABORT);
+ }
+
+ j1939_session_put(session);
+
+ return HRTIMER_NORESTART;
+}
+
+static bool j1939_xtp_rx_cmd_bad_pgn(struct j1939_session *session,
+ const struct sk_buff *skb)
+{
+ const struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb);
+ pgn_t pgn = j1939_xtp_ctl_to_pgn(skb->data);
+ struct j1939_priv *priv = session->priv;
+ enum j1939_xtp_abort abort = J1939_XTP_NO_ABORT;
+ u8 cmd = skb->data[0];
+
+ if (session->skcb.addr.pgn == pgn)
+ return false;
+
+ switch (cmd) {
+ case J1939_TP_CMD_BAM:
+ abort = J1939_XTP_NO_ABORT;
+ break;
+
+ case J1939_ETP_CMD_RTS:
+ fallthrough;
+ case J1939_TP_CMD_RTS:
+ abort = J1939_XTP_ABORT_BUSY;
+ break;
+
+ case J1939_ETP_CMD_CTS:
+ fallthrough;
+ case J1939_TP_CMD_CTS:
+ abort = J1939_XTP_ABORT_ECTS_UNXPECTED_PGN;
+ break;
+
+ case J1939_ETP_CMD_DPO:
+ abort = J1939_XTP_ABORT_BAD_EDPO_PGN;
+ break;
+
+ case J1939_ETP_CMD_EOMA:
+ fallthrough;
+ case J1939_TP_CMD_EOMA:
+ abort = J1939_XTP_ABORT_OTHER;
+ break;
+
+ case J1939_ETP_CMD_ABORT: /* && J1939_TP_CMD_ABORT */
+ abort = J1939_XTP_NO_ABORT;
+ break;
+
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
+
+ netdev_warn(priv->ndev, "%s: 0x%p: CMD 0x%02x with PGN 0x%05x for running session with different PGN 0x%05x.\n",
+ __func__, session, cmd, pgn, session->skcb.addr.pgn);
+ if (abort != J1939_XTP_NO_ABORT)
+ j1939_xtp_tx_abort(priv, skcb, true, abort, pgn);
+
+ return true;
+}
+
+static void j1939_xtp_rx_abort_one(struct j1939_priv *priv, struct sk_buff *skb,
+ bool reverse, bool transmitter)
+{
+ struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb);
+ struct j1939_session *session;
+ u8 abort = skb->data[1];
+
+ session = j1939_session_get_by_addr(priv, &skcb->addr, reverse,
+ transmitter);
+ if (!session)
+ return;
+
+ if (j1939_xtp_rx_cmd_bad_pgn(session, skb))
+ goto abort_put;
+
+ netdev_info(priv->ndev, "%s: 0x%p: 0x%05x: (%u) %s\n", __func__,
+ session, j1939_xtp_ctl_to_pgn(skb->data), abort,
+ j1939_xtp_abort_to_str(abort));
+
+ j1939_session_timers_cancel(session);
+ session->err = j1939_xtp_abort_to_errno(priv, abort);
+ if (session->sk)
+ j1939_sk_send_loop_abort(session->sk, session->err);
+ else
+ j1939_sk_errqueue(session, J1939_ERRQUEUE_RX_ABORT);
+ j1939_session_deactivate_activate_next(session);
+
+abort_put:
+ j1939_session_put(session);
+}
+
+/* abort packets may come in 2 directions */
+static void
+j1939_xtp_rx_abort(struct j1939_priv *priv, struct sk_buff *skb,
+ bool transmitter)
+{
+ j1939_xtp_rx_abort_one(priv, skb, false, transmitter);
+ j1939_xtp_rx_abort_one(priv, skb, true, transmitter);
+}
+
+static void
+j1939_xtp_rx_eoma_one(struct j1939_session *session, struct sk_buff *skb)
+{
+ struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb);
+ const u8 *dat;
+ int len;
+
+ if (j1939_xtp_rx_cmd_bad_pgn(session, skb))
+ return;
+
+ dat = skb->data;
+
+ if (skcb->addr.type == J1939_ETP)
+ len = j1939_etp_ctl_to_size(dat);
+ else
+ len = j1939_tp_ctl_to_size(dat);
+
+ if (session->total_message_size != len) {
+ netdev_warn_once(session->priv->ndev,
+ "%s: 0x%p: Incorrect size. Expected: %i; got: %i.\n",
+ __func__, session, session->total_message_size,
+ len);
+ }
+
+ netdev_dbg(session->priv->ndev, "%s: 0x%p\n", __func__, session);
+
+ session->pkt.tx_acked = session->pkt.total;
+ j1939_session_timers_cancel(session);
+ /* transmitted without problems */
+ j1939_session_completed(session);
+}
+
+static void
+j1939_xtp_rx_eoma(struct j1939_priv *priv, struct sk_buff *skb,
+ bool transmitter)
+{
+ struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb);
+ struct j1939_session *session;
+
+ session = j1939_session_get_by_addr(priv, &skcb->addr, true,
+ transmitter);
+ if (!session)
+ return;
+
+ j1939_xtp_rx_eoma_one(session, skb);
+ j1939_session_put(session);
+}
+
+static void
+j1939_xtp_rx_cts_one(struct j1939_session *session, struct sk_buff *skb)
+{
+ enum j1939_xtp_abort err = J1939_XTP_ABORT_FAULT;
+ unsigned int pkt;
+ const u8 *dat;
+
+ dat = skb->data;
+
+ if (j1939_xtp_rx_cmd_bad_pgn(session, skb))
+ return;
+
+ netdev_dbg(session->priv->ndev, "%s: 0x%p\n", __func__, session);
+
+ if (session->last_cmd == dat[0]) {
+ err = J1939_XTP_ABORT_DUP_SEQ;
+ goto out_session_cancel;
+ }
+
+ if (session->skcb.addr.type == J1939_ETP)
+ pkt = j1939_etp_ctl_to_packet(dat);
+ else
+ pkt = dat[2];
+
+ if (!pkt)
+ goto out_session_cancel;
+ else if (dat[1] > session->pkt.block /* 0xff for etp */)
+ goto out_session_cancel;
+
+ /* set packet counters only when not CTS(0) */
+ session->pkt.tx_acked = pkt - 1;
+ j1939_session_skb_drop_old(session);
+ session->pkt.last = session->pkt.tx_acked + dat[1];
+ if (session->pkt.last > session->pkt.total)
+ /* safety measure */
+ session->pkt.last = session->pkt.total;
+ /* TODO: do not set tx here, do it in txtimer */
+ session->pkt.tx = session->pkt.tx_acked;
+
+ session->last_cmd = dat[0];
+ if (dat[1]) {
+ j1939_tp_set_rxtimeout(session, 1250);
+ if (session->transmission) {
+ if (session->pkt.tx_acked)
+ j1939_sk_errqueue(session,
+ J1939_ERRQUEUE_TX_SCHED);
+ j1939_session_txtimer_cancel(session);
+ j1939_tp_schedule_txtimer(session, 0);
+ }
+ } else {
+ /* CTS(0) */
+ j1939_tp_set_rxtimeout(session, 550);
+ }
+ return;
+
+ out_session_cancel:
+ j1939_session_timers_cancel(session);
+ j1939_session_cancel(session, err);
+}
+
+static void
+j1939_xtp_rx_cts(struct j1939_priv *priv, struct sk_buff *skb, bool transmitter)
+{
+ struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb);
+ struct j1939_session *session;
+
+ session = j1939_session_get_by_addr(priv, &skcb->addr, true,
+ transmitter);
+ if (!session)
+ return;
+ j1939_xtp_rx_cts_one(session, skb);
+ j1939_session_put(session);
+}
+
+static struct j1939_session *j1939_session_new(struct j1939_priv *priv,
+ struct sk_buff *skb, size_t size)
+{
+ struct j1939_session *session;
+ struct j1939_sk_buff_cb *skcb;
+
+ session = kzalloc(sizeof(*session), gfp_any());
+ if (!session)
+ return NULL;
+
+ INIT_LIST_HEAD(&session->active_session_list_entry);
+ INIT_LIST_HEAD(&session->sk_session_queue_entry);
+ kref_init(&session->kref);
+
+ j1939_priv_get(priv);
+ session->priv = priv;
+ session->total_message_size = size;
+ session->state = J1939_SESSION_NEW;
+
+ skb_queue_head_init(&session->skb_queue);
+ skb_queue_tail(&session->skb_queue, skb_get(skb));
+
+ skcb = j1939_skb_to_cb(skb);
+ memcpy(&session->skcb, skcb, sizeof(session->skcb));
+
+ hrtimer_setup(&session->txtimer, j1939_tp_txtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT);
+ hrtimer_setup(&session->rxtimer, j1939_tp_rxtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT);
+
+ netdev_dbg(priv->ndev, "%s: 0x%p: sa: %02x, da: %02x\n",
+ __func__, session, skcb->addr.sa, skcb->addr.da);
+
+ return session;
+}
+
+static struct
+j1939_session *j1939_session_fresh_new(struct j1939_priv *priv,
+ int size,
+ const struct j1939_sk_buff_cb *rel_skcb)
+{
+ struct sk_buff *skb;
+ struct j1939_sk_buff_cb *skcb;
+ struct j1939_session *session;
+
+ skb = alloc_skb(size + sizeof(struct can_skb_priv), GFP_ATOMIC);
+ if (unlikely(!skb))
+ return NULL;
+
+ skb->dev = priv->ndev;
+ can_skb_reserve(skb);
+ can_skb_prv(skb)->ifindex = priv->ndev->ifindex;
+ can_skb_prv(skb)->skbcnt = 0;
+ skcb = j1939_skb_to_cb(skb);
+ memcpy(skcb, rel_skcb, sizeof(*skcb));
+
+ session = j1939_session_new(priv, skb, size);
+ if (!session) {
+ kfree_skb(skb);
+ return NULL;
+ }
+
+ /* alloc data area */
+ skb_put(skb, size);
+ /* skb is recounted in j1939_session_new() */
+ return session;
+}
+
+int j1939_session_activate(struct j1939_session *session)
+{
+ struct j1939_priv *priv = session->priv;
+ struct j1939_session *active = NULL;
+ int ret = 0;
+
+ j1939_session_list_lock(priv);
+ if (session->skcb.addr.type != J1939_SIMPLE)
+ active = j1939_session_get_by_addr_locked(priv,
+ &priv->active_session_list,
+ &session->skcb.addr, false,
+ session->transmission);
+ if (active) {
+ j1939_session_put(active);
+ ret = -EAGAIN;
+ } else {
+ WARN_ON_ONCE(session->state != J1939_SESSION_NEW);
+ list_add_tail(&session->active_session_list_entry,
+ &priv->active_session_list);
+ j1939_session_get(session);
+ session->state = J1939_SESSION_ACTIVE;
+
+ netdev_dbg(session->priv->ndev, "%s: 0x%p\n",
+ __func__, session);
+ }
+ j1939_session_list_unlock(priv);
+
+ return ret;
+}
+
+static struct
+j1939_session *j1939_xtp_rx_rts_session_new(struct j1939_priv *priv,
+ struct sk_buff *skb)
+{
+ enum j1939_xtp_abort abort = J1939_XTP_NO_ABORT;
+ struct j1939_sk_buff_cb skcb = *j1939_skb_to_cb(skb);
+ struct j1939_session *session;
+ const u8 *dat;
+ int len, ret;
+ pgn_t pgn;
+
+ netdev_dbg(priv->ndev, "%s\n", __func__);
+
+ dat = skb->data;
+ pgn = j1939_xtp_ctl_to_pgn(dat);
+ skcb.addr.pgn = pgn;
+
+ if (!j1939_sk_recv_match(priv, &skcb))
+ return NULL;
+
+ if (skcb.addr.type == J1939_ETP) {
+ len = j1939_etp_ctl_to_size(dat);
+ if (len > J1939_MAX_ETP_PACKET_SIZE)
+ abort = J1939_XTP_ABORT_FAULT;
+ else if (len > priv->tp_max_packet_size)
+ abort = J1939_XTP_ABORT_RESOURCE;
+ else if (len <= J1939_MAX_TP_PACKET_SIZE)
+ abort = J1939_XTP_ABORT_FAULT;
+ } else {
+ len = j1939_tp_ctl_to_size(dat);
+ if (len > J1939_MAX_TP_PACKET_SIZE)
+ abort = J1939_XTP_ABORT_FAULT;
+ else if (len > priv->tp_max_packet_size)
+ abort = J1939_XTP_ABORT_RESOURCE;
+ else if (len < J1939_MIN_TP_PACKET_SIZE)
+ abort = J1939_XTP_ABORT_FAULT;
+ }
+
+ if (abort != J1939_XTP_NO_ABORT) {
+ j1939_xtp_tx_abort(priv, &skcb, true, abort, pgn);
+ return NULL;
+ }
+
+ session = j1939_session_fresh_new(priv, len, &skcb);
+ if (!session) {
+ j1939_xtp_tx_abort(priv, &skcb, true,
+ J1939_XTP_ABORT_RESOURCE, pgn);
+ return NULL;
+ }
+
+ /* initialize the control buffer: plain copy */
+ session->pkt.total = (len + 6) / 7;
+ session->pkt.block = 0xff;
+ if (skcb.addr.type != J1939_ETP) {
+ if (dat[3] != session->pkt.total)
+ netdev_alert(priv->ndev, "%s: 0x%p: strange total, %u != %u\n",
+ __func__, session, session->pkt.total,
+ dat[3]);
+ session->pkt.total = dat[3];
+ session->pkt.block = min(dat[3], dat[4]);
+ }
+
+ session->pkt.rx = 0;
+ session->pkt.tx = 0;
+
+ session->tskey = priv->rx_tskey++;
+ j1939_sk_errqueue(session, J1939_ERRQUEUE_RX_RTS);
+
+ ret = j1939_session_activate(session);
+ if (ret) {
+ /* Entering this scope indicates an issue with the J1939 bus.
+ * Possible scenarios include:
+ * - A time lapse occurred, and a new session was initiated
+ * due to another packet being sent correctly. This could
+ * have been caused by too long interrupt, debugger, or being
+ * out-scheduled by another task.
+ * - The bus is receiving numerous erroneous packets, either
+ * from a malfunctioning device or during a test scenario.
+ */
+ netdev_alert(priv->ndev, "%s: 0x%p: concurrent session with same addr (%02x %02x) is already active.\n",
+ __func__, session, skcb.addr.sa, skcb.addr.da);
+ j1939_session_put(session);
+ return NULL;
+ }
+
+ return session;
+}
+
+static int j1939_xtp_rx_rts_session_active(struct j1939_session *session,
+ struct sk_buff *skb)
+{
+ struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb);
+ struct j1939_priv *priv = session->priv;
+
+ if (!session->transmission) {
+ if (j1939_xtp_rx_cmd_bad_pgn(session, skb))
+ return -EBUSY;
+
+ /* RTS on active session */
+ j1939_session_timers_cancel(session);
+ j1939_session_cancel(session, J1939_XTP_ABORT_BUSY);
+ }
+
+ if (session->last_cmd != 0) {
+ /* we received a second rts on the same connection */
+ netdev_alert(priv->ndev, "%s: 0x%p: connection exists (%02x %02x). last cmd: %x\n",
+ __func__, session, skcb->addr.sa, skcb->addr.da,
+ session->last_cmd);
+
+ j1939_session_timers_cancel(session);
+ j1939_session_cancel(session, J1939_XTP_ABORT_BUSY);
+ if (session->transmission)
+ j1939_session_deactivate_activate_next(session);
+
+ return -EBUSY;
+ }
+
+ if (session->skcb.addr.sa != skcb->addr.sa ||
+ session->skcb.addr.da != skcb->addr.da)
+ netdev_warn(priv->ndev, "%s: 0x%p: session->skcb.addr.sa=0x%02x skcb->addr.sa=0x%02x session->skcb.addr.da=0x%02x skcb->addr.da=0x%02x\n",
+ __func__, session,
+ session->skcb.addr.sa, skcb->addr.sa,
+ session->skcb.addr.da, skcb->addr.da);
+ /* make sure 'sa' & 'da' are correct !
+ * They may be 'not filled in yet' for sending
+ * skb's, since they did not pass the Address Claim ever.
+ */
+ session->skcb.addr.sa = skcb->addr.sa;
+ session->skcb.addr.da = skcb->addr.da;
+
+ netdev_dbg(session->priv->ndev, "%s: 0x%p\n", __func__, session);
+
+ return 0;
+}
+
+static void j1939_xtp_rx_rts(struct j1939_priv *priv, struct sk_buff *skb,
+ bool transmitter)
+{
+ struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb);
+ struct j1939_session *session;
+ u8 cmd = skb->data[0];
+
+ session = j1939_session_get_by_addr(priv, &skcb->addr, false,
+ transmitter);
+
+ if (!session) {
+ if (transmitter) {
+ /* If we're the transmitter and this function is called,
+ * we received our own RTS. A session has already been
+ * created.
+ *
+ * For some reasons however it might have been destroyed
+ * already. So don't create a new one here (using
+ * "j1939_xtp_rx_rts_session_new()") as this will be a
+ * receiver session.
+ *
+ * The reasons the session is already destroyed might
+ * be:
+ * - user space closed socket was and the session was
+ * aborted
+ * - session was aborted due to external abort message
+ */
+ return;
+ }
+ session = j1939_xtp_rx_rts_session_new(priv, skb);
+ if (!session) {
+ if (cmd == J1939_TP_CMD_BAM && j1939_sk_recv_match(priv, skcb))
+ netdev_info(priv->ndev, "%s: failed to create TP BAM session\n",
+ __func__);
+ return;
+ }
+ } else {
+ if (j1939_xtp_rx_rts_session_active(session, skb)) {
+ j1939_session_put(session);
+ return;
+ }
+ }
+ session->last_cmd = cmd;
+
+ if (cmd == J1939_TP_CMD_BAM) {
+ if (!session->transmission)
+ j1939_tp_set_rxtimeout(session, 750);
+ } else {
+ if (!session->transmission) {
+ j1939_session_txtimer_cancel(session);
+ j1939_tp_schedule_txtimer(session, 0);
+ }
+ j1939_tp_set_rxtimeout(session, 1250);
+ }
+
+ j1939_session_put(session);
+}
+
+static void j1939_xtp_rx_dpo_one(struct j1939_session *session,
+ struct sk_buff *skb)
+{
+ const u8 *dat = skb->data;
+
+ if (j1939_xtp_rx_cmd_bad_pgn(session, skb))
+ return;
+
+ netdev_dbg(session->priv->ndev, "%s: 0x%p\n", __func__, session);
+
+ /* transmitted without problems */
+ session->pkt.dpo = j1939_etp_ctl_to_packet(skb->data);
+ session->last_cmd = dat[0];
+ j1939_tp_set_rxtimeout(session, 750);
+
+ if (!session->transmission)
+ j1939_sk_errqueue(session, J1939_ERRQUEUE_RX_DPO);
+}
+
+static void j1939_xtp_rx_dpo(struct j1939_priv *priv, struct sk_buff *skb,
+ bool transmitter)
+{
+ struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb);
+ struct j1939_session *session;
+
+ session = j1939_session_get_by_addr(priv, &skcb->addr, false,
+ transmitter);
+ if (!session) {
+ netdev_info(priv->ndev,
+ "%s: no connection found\n", __func__);
+ return;
+ }
+
+ j1939_xtp_rx_dpo_one(session, skb);
+ j1939_session_put(session);
+}
+
+static void j1939_xtp_rx_dat_one(struct j1939_session *session,
+ struct sk_buff *skb)
+{
+ enum j1939_xtp_abort abort = J1939_XTP_ABORT_FAULT;
+ struct j1939_priv *priv = session->priv;
+ struct j1939_sk_buff_cb *skcb, *se_skcb;
+ struct sk_buff *se_skb = NULL;
+ const u8 *dat;
+ u8 *tpdat;
+ int offset;
+ int nbytes;
+ bool final = false;
+ bool remain = false;
+ bool do_cts_eoma = false;
+ int packet;
+
+ skcb = j1939_skb_to_cb(skb);
+ dat = skb->data;
+ if (skb->len != 8) {
+ /* makes no sense */
+ abort = J1939_XTP_ABORT_UNEXPECTED_DATA;
+ goto out_session_cancel;
+ }
+
+ switch (session->last_cmd) {
+ case 0xff:
+ break;
+ case J1939_ETP_CMD_DPO:
+ if (skcb->addr.type == J1939_ETP)
+ break;
+ fallthrough;
+ case J1939_TP_CMD_BAM:
+ fallthrough;
+ case J1939_TP_CMD_CTS:
+ if (skcb->addr.type != J1939_ETP)
+ break;
+ fallthrough;
+ default:
+ netdev_info(priv->ndev, "%s: 0x%p: last %02x\n", __func__,
+ session, session->last_cmd);
+ goto out_session_cancel;
+ }
+
+ packet = (dat[0] - 1 + session->pkt.dpo);
+ if (packet > session->pkt.total ||
+ (session->pkt.rx + 1) > session->pkt.total) {
+ netdev_info(priv->ndev, "%s: 0x%p: should have been completed\n",
+ __func__, session);
+ goto out_session_cancel;
+ }
+
+ se_skb = j1939_session_skb_get_by_offset(session, packet * 7);
+ if (!se_skb) {
+ netdev_warn(priv->ndev, "%s: 0x%p: no skb found\n", __func__,
+ session);
+ goto out_session_cancel;
+ }
+
+ se_skcb = j1939_skb_to_cb(se_skb);
+ offset = packet * 7 - se_skcb->offset;
+ nbytes = se_skb->len - offset;
+ if (nbytes > 7)
+ nbytes = 7;
+ if (nbytes <= 0 || (nbytes + 1) > skb->len) {
+ netdev_info(priv->ndev, "%s: 0x%p: nbytes %i, len %i\n",
+ __func__, session, nbytes, skb->len);
+ goto out_session_cancel;
+ }
+
+ tpdat = se_skb->data;
+ if (!session->transmission) {
+ memcpy(&tpdat[offset], &dat[1], nbytes);
+ } else {
+ int err;
+
+ err = memcmp(&tpdat[offset], &dat[1], nbytes);
+ if (err)
+ netdev_err_once(priv->ndev,
+ "%s: 0x%p: Data of RX-looped back packet (%*ph) doesn't match TX data (%*ph)!\n",
+ __func__, session,
+ nbytes, &dat[1],
+ nbytes, &tpdat[offset]);
+ }
+
+ if (packet == session->pkt.rx)
+ session->pkt.rx++;
+
+ if (se_skcb->addr.type != J1939_ETP &&
+ j1939_cb_is_broadcast(&session->skcb)) {
+ if (session->pkt.rx >= session->pkt.total)
+ final = true;
+ else
+ remain = true;
+ } else {
+ /* never final, an EOMA must follow */
+ if (session->pkt.rx >= session->pkt.last)
+ do_cts_eoma = true;
+ }
+
+ if (final) {
+ j1939_session_timers_cancel(session);
+ j1939_session_completed(session);
+ } else if (remain) {
+ if (!session->transmission)
+ j1939_tp_set_rxtimeout(session, 750);
+ } else if (do_cts_eoma) {
+ j1939_tp_set_rxtimeout(session, 1250);
+ if (!session->transmission)
+ j1939_tp_schedule_txtimer(session, 0);
+ } else {
+ j1939_tp_set_rxtimeout(session, 750);
+ }
+ session->last_cmd = 0xff;
+ consume_skb(se_skb);
+ j1939_session_put(session);
+
+ return;
+
+ out_session_cancel:
+ kfree_skb(se_skb);
+ j1939_session_timers_cancel(session);
+ j1939_session_cancel(session, abort);
+ j1939_session_put(session);
+}
+
+static void j1939_xtp_rx_dat(struct j1939_priv *priv, struct sk_buff *skb)
+{
+ struct j1939_sk_buff_cb *skcb;
+ struct j1939_session *session;
+
+ skcb = j1939_skb_to_cb(skb);
+
+ if (j1939_tp_im_transmitter(skcb)) {
+ session = j1939_session_get_by_addr(priv, &skcb->addr, false,
+ true);
+ if (!session)
+ netdev_info(priv->ndev, "%s: no tx connection found\n",
+ __func__);
+ else
+ j1939_xtp_rx_dat_one(session, skb);
+ }
+
+ if (j1939_tp_im_receiver(skcb)) {
+ session = j1939_session_get_by_addr(priv, &skcb->addr, false,
+ false);
+ if (!session)
+ netdev_info(priv->ndev, "%s: no rx connection found\n",
+ __func__);
+ else
+ j1939_xtp_rx_dat_one(session, skb);
+ }
+
+ if (j1939_cb_is_broadcast(skcb)) {
+ session = j1939_session_get_by_addr(priv, &skcb->addr, false,
+ false);
+ if (session)
+ j1939_xtp_rx_dat_one(session, skb);
+ }
+}
+
+/* j1939 main intf */
+struct j1939_session *j1939_tp_send(struct j1939_priv *priv,
+ struct sk_buff *skb, size_t size)
+{
+ struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb);
+ struct j1939_session *session;
+ int ret;
+
+ if (skcb->addr.pgn == J1939_TP_PGN_DAT ||
+ skcb->addr.pgn == J1939_TP_PGN_CTL ||
+ skcb->addr.pgn == J1939_ETP_PGN_DAT ||
+ skcb->addr.pgn == J1939_ETP_PGN_CTL)
+ /* avoid conflict */
+ return ERR_PTR(-EDOM);
+
+ if (size > priv->tp_max_packet_size)
+ return ERR_PTR(-EMSGSIZE);
+
+ if (size <= 8)
+ skcb->addr.type = J1939_SIMPLE;
+ else if (size > J1939_MAX_TP_PACKET_SIZE)
+ skcb->addr.type = J1939_ETP;
+ else
+ skcb->addr.type = J1939_TP;
+
+ if (skcb->addr.type == J1939_ETP &&
+ j1939_cb_is_broadcast(skcb))
+ return ERR_PTR(-EDESTADDRREQ);
+
+ /* fill in addresses from names */
+ ret = j1939_ac_fixup(priv, skb);
+ if (unlikely(ret))
+ return ERR_PTR(ret);
+
+ /* fix DST flags, it may be used there soon */
+ if (j1939_address_is_unicast(skcb->addr.da) &&
+ priv->ents[skcb->addr.da].nusers)
+ skcb->flags |= J1939_ECU_LOCAL_DST;
+
+ /* src is always local, I'm sending ... */
+ skcb->flags |= J1939_ECU_LOCAL_SRC;
+
+ /* prepare new session */
+ session = j1939_session_new(priv, skb, size);
+ if (!session)
+ return ERR_PTR(-ENOMEM);
+
+ /* skb is recounted in j1939_session_new() */
+ sock_hold(skb->sk);
+ session->sk = skb->sk;
+ session->transmission = true;
+ session->pkt.total = (size + 6) / 7;
+ session->pkt.block = skcb->addr.type == J1939_ETP ? 255 :
+ min(j1939_tp_block ?: 255, session->pkt.total);
+
+ if (j1939_cb_is_broadcast(&session->skcb))
+ /* set the end-packet for broadcast */
+ session->pkt.last = session->pkt.total;
+
+ skcb->tskey = atomic_inc_return(&session->sk->sk_tskey) - 1;
+ session->tskey = skcb->tskey;
+
+ return session;
+}
+
+static void j1939_tp_cmd_recv(struct j1939_priv *priv, struct sk_buff *skb)
+{
+ struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb);
+ int extd = J1939_TP;
+ u8 cmd = skb->data[0];
+
+ switch (cmd) {
+ case J1939_ETP_CMD_RTS:
+ extd = J1939_ETP;
+ fallthrough;
+ case J1939_TP_CMD_BAM:
+ if (cmd == J1939_TP_CMD_BAM && !j1939_cb_is_broadcast(skcb)) {
+ netdev_err_once(priv->ndev, "%s: BAM to unicast (%02x), ignoring!\n",
+ __func__, skcb->addr.sa);
+ return;
+ }
+ fallthrough;
+ case J1939_TP_CMD_RTS:
+ if (skcb->addr.type != extd)
+ return;
+
+ if (cmd == J1939_TP_CMD_RTS && j1939_cb_is_broadcast(skcb)) {
+ netdev_alert(priv->ndev, "%s: rts without destination (%02x)\n",
+ __func__, skcb->addr.sa);
+ return;
+ }
+
+ if (j1939_tp_im_transmitter(skcb))
+ j1939_xtp_rx_rts(priv, skb, true);
+
+ if (j1939_tp_im_receiver(skcb) || j1939_cb_is_broadcast(skcb))
+ j1939_xtp_rx_rts(priv, skb, false);
+
+ break;
+
+ case J1939_ETP_CMD_CTS:
+ extd = J1939_ETP;
+ fallthrough;
+ case J1939_TP_CMD_CTS:
+ if (skcb->addr.type != extd)
+ return;
+
+ if (j1939_tp_im_transmitter(skcb))
+ j1939_xtp_rx_cts(priv, skb, false);
+
+ if (j1939_tp_im_receiver(skcb))
+ j1939_xtp_rx_cts(priv, skb, true);
+
+ break;
+
+ case J1939_ETP_CMD_DPO:
+ if (skcb->addr.type != J1939_ETP)
+ return;
+
+ if (j1939_tp_im_transmitter(skcb))
+ j1939_xtp_rx_dpo(priv, skb, true);
+
+ if (j1939_tp_im_receiver(skcb))
+ j1939_xtp_rx_dpo(priv, skb, false);
+
+ break;
+
+ case J1939_ETP_CMD_EOMA:
+ extd = J1939_ETP;
+ fallthrough;
+ case J1939_TP_CMD_EOMA:
+ if (skcb->addr.type != extd)
+ return;
+
+ if (j1939_tp_im_transmitter(skcb))
+ j1939_xtp_rx_eoma(priv, skb, false);
+
+ if (j1939_tp_im_receiver(skcb))
+ j1939_xtp_rx_eoma(priv, skb, true);
+
+ break;
+
+ case J1939_ETP_CMD_ABORT: /* && J1939_TP_CMD_ABORT */
+ if (j1939_cb_is_broadcast(skcb)) {
+ netdev_err_once(priv->ndev, "%s: abort to broadcast (%02x), ignoring!\n",
+ __func__, skcb->addr.sa);
+ return;
+ }
+
+ if (j1939_tp_im_transmitter(skcb))
+ j1939_xtp_rx_abort(priv, skb, true);
+
+ if (j1939_tp_im_receiver(skcb))
+ j1939_xtp_rx_abort(priv, skb, false);
+
+ break;
+ default:
+ return;
+ }
+}
+
+int j1939_tp_recv(struct j1939_priv *priv, struct sk_buff *skb)
+{
+ struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb);
+
+ if (!j1939_tp_im_involved_anydir(skcb) && !j1939_cb_is_broadcast(skcb))
+ return 0;
+
+ switch (skcb->addr.pgn) {
+ case J1939_ETP_PGN_DAT:
+ skcb->addr.type = J1939_ETP;
+ fallthrough;
+ case J1939_TP_PGN_DAT:
+ j1939_xtp_rx_dat(priv, skb);
+ break;
+
+ case J1939_ETP_PGN_CTL:
+ skcb->addr.type = J1939_ETP;
+ fallthrough;
+ case J1939_TP_PGN_CTL:
+ if (skb->len < 8)
+ return 0; /* Don't care. Nothing to extract here */
+
+ j1939_tp_cmd_recv(priv, skb);
+ break;
+ default:
+ return 0; /* no problem */
+ }
+ return 1; /* "I processed the message" */
+}
+
+void j1939_simple_recv(struct j1939_priv *priv, struct sk_buff *skb)
+{
+ struct j1939_session *session;
+
+ if (!skb->sk)
+ return;
+
+ if (skb->sk->sk_family != AF_CAN ||
+ skb->sk->sk_protocol != CAN_J1939)
+ return;
+
+ j1939_session_list_lock(priv);
+ session = j1939_session_get_simple(priv, skb);
+ j1939_session_list_unlock(priv);
+ if (!session) {
+ netdev_warn(priv->ndev,
+ "%s: Received already invalidated message\n",
+ __func__);
+ return;
+ }
+
+ j1939_session_timers_cancel(session);
+ j1939_session_deactivate(session);
+ j1939_session_put(session);
+}
+
+int j1939_cancel_active_session(struct j1939_priv *priv, struct sock *sk)
+{
+ struct j1939_session *session, *saved;
+
+ netdev_dbg(priv->ndev, "%s, sk: %p\n", __func__, sk);
+ j1939_session_list_lock(priv);
+ list_for_each_entry_safe(session, saved,
+ &priv->active_session_list,
+ active_session_list_entry) {
+ if (!sk || sk == session->sk) {
+ if (hrtimer_try_to_cancel(&session->txtimer) == 1)
+ j1939_session_put(session);
+ if (hrtimer_try_to_cancel(&session->rxtimer) == 1)
+ j1939_session_put(session);
+
+ session->err = ESHUTDOWN;
+ j1939_session_deactivate_locked(session);
+ }
+ }
+ j1939_session_list_unlock(priv);
+ return NOTIFY_DONE;
+}
+
+void j1939_tp_init(struct j1939_priv *priv)
+{
+ spin_lock_init(&priv->active_session_list_lock);
+ INIT_LIST_HEAD(&priv->active_session_list);
+ priv->tp_max_packet_size = J1939_MAX_ETP_PACKET_SIZE;
+}
diff --git a/net/can/proc.c b/net/can/proc.c
new file mode 100644
index 000000000000..0938bf7dd646
--- /dev/null
+++ b/net/can/proc.c
@@ -0,0 +1,506 @@
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
+/*
+ * proc.c - procfs support for Protocol family CAN core module
+ *
+ * Copyright (c) 2002-2007 Volkswagen Group Electronic Research
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of Volkswagen nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * Alternatively, provided that this notice is retained in full, this
+ * software may be distributed under the terms of the GNU General
+ * Public License ("GPL") version 2, in which case the provisions of the
+ * GPL apply INSTEAD OF those given above.
+ *
+ * The provided data structures and external interfaces from this code
+ * are not restricted to be used by modules with a GPL compatible license.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/list.h>
+#include <linux/rcupdate.h>
+#include <linux/if_arp.h>
+#include <linux/can/can-ml.h>
+#include <linux/can/core.h>
+
+#include "af_can.h"
+
+/*
+ * proc filenames for the PF_CAN core
+ */
+
+#define CAN_PROC_STATS "stats"
+#define CAN_PROC_RESET_STATS "reset_stats"
+#define CAN_PROC_RCVLIST_ALL "rcvlist_all"
+#define CAN_PROC_RCVLIST_FIL "rcvlist_fil"
+#define CAN_PROC_RCVLIST_INV "rcvlist_inv"
+#define CAN_PROC_RCVLIST_SFF "rcvlist_sff"
+#define CAN_PROC_RCVLIST_EFF "rcvlist_eff"
+#define CAN_PROC_RCVLIST_ERR "rcvlist_err"
+
+static int user_reset;
+
+static const char rx_list_name[][8] = {
+ [RX_ERR] = "rx_err",
+ [RX_ALL] = "rx_all",
+ [RX_FIL] = "rx_fil",
+ [RX_INV] = "rx_inv",
+};
+
+/*
+ * af_can statistics stuff
+ */
+
+static void can_init_stats(struct net *net)
+{
+ struct can_pkg_stats *pkg_stats = net->can.pkg_stats;
+ struct can_rcv_lists_stats *rcv_lists_stats = net->can.rcv_lists_stats;
+ /*
+ * This memset function is called from a timer context (when
+ * can_stattimer is active which is the default) OR in a process
+ * context (reading the proc_fs when can_stattimer is disabled).
+ */
+ memset(pkg_stats, 0, sizeof(struct can_pkg_stats));
+ pkg_stats->jiffies_init = jiffies;
+
+ rcv_lists_stats->stats_reset++;
+
+ if (user_reset) {
+ user_reset = 0;
+ rcv_lists_stats->user_reset++;
+ }
+}
+
+static unsigned long calc_rate(unsigned long oldjif, unsigned long newjif,
+ unsigned long count)
+{
+ if (oldjif == newjif)
+ return 0;
+
+ /* see can_stat_update() - this should NEVER happen! */
+ if (count > (ULONG_MAX / HZ)) {
+ printk(KERN_ERR "can: calc_rate: count exceeded! %ld\n",
+ count);
+ return 99999999;
+ }
+
+ return (count * HZ) / (newjif - oldjif);
+}
+
+void can_stat_update(struct timer_list *t)
+{
+ struct net *net = timer_container_of(net, t, can.stattimer);
+ struct can_pkg_stats *pkg_stats = net->can.pkg_stats;
+ unsigned long j = jiffies; /* snapshot */
+
+ long rx_frames = atomic_long_read(&pkg_stats->rx_frames);
+ long tx_frames = atomic_long_read(&pkg_stats->tx_frames);
+ long matches = atomic_long_read(&pkg_stats->matches);
+ long rx_frames_delta = atomic_long_read(&pkg_stats->rx_frames_delta);
+ long tx_frames_delta = atomic_long_read(&pkg_stats->tx_frames_delta);
+ long matches_delta = atomic_long_read(&pkg_stats->matches_delta);
+
+ /* restart counting in timer context on user request */
+ if (user_reset)
+ can_init_stats(net);
+
+ /* restart counting on jiffies overflow */
+ if (j < pkg_stats->jiffies_init)
+ can_init_stats(net);
+
+ /* prevent overflow in calc_rate() */
+ if (rx_frames > (LONG_MAX / HZ))
+ can_init_stats(net);
+
+ /* prevent overflow in calc_rate() */
+ if (tx_frames > (LONG_MAX / HZ))
+ can_init_stats(net);
+
+ /* matches overflow - very improbable */
+ if (matches > (LONG_MAX / 100))
+ can_init_stats(net);
+
+ /* calc total values */
+ if (rx_frames)
+ pkg_stats->total_rx_match_ratio = (matches * 100) / rx_frames;
+
+ pkg_stats->total_tx_rate = calc_rate(pkg_stats->jiffies_init, j,
+ tx_frames);
+ pkg_stats->total_rx_rate = calc_rate(pkg_stats->jiffies_init, j,
+ rx_frames);
+
+ /* calc current values */
+ if (rx_frames_delta)
+ pkg_stats->current_rx_match_ratio =
+ (matches_delta * 100) / rx_frames_delta;
+
+ pkg_stats->current_tx_rate = calc_rate(0, HZ, tx_frames_delta);
+ pkg_stats->current_rx_rate = calc_rate(0, HZ, rx_frames_delta);
+
+ /* check / update maximum values */
+ if (pkg_stats->max_tx_rate < pkg_stats->current_tx_rate)
+ pkg_stats->max_tx_rate = pkg_stats->current_tx_rate;
+
+ if (pkg_stats->max_rx_rate < pkg_stats->current_rx_rate)
+ pkg_stats->max_rx_rate = pkg_stats->current_rx_rate;
+
+ if (pkg_stats->max_rx_match_ratio < pkg_stats->current_rx_match_ratio)
+ pkg_stats->max_rx_match_ratio = pkg_stats->current_rx_match_ratio;
+
+ /* clear values for 'current rate' calculation */
+ atomic_long_set(&pkg_stats->tx_frames_delta, 0);
+ atomic_long_set(&pkg_stats->rx_frames_delta, 0);
+ atomic_long_set(&pkg_stats->matches_delta, 0);
+
+ /* restart timer (one second) */
+ mod_timer(&net->can.stattimer, round_jiffies(jiffies + HZ));
+}
+
+/*
+ * proc read functions
+ */
+
+static void can_print_rcvlist(struct seq_file *m, struct hlist_head *rx_list,
+ struct net_device *dev)
+{
+ struct receiver *r;
+
+ hlist_for_each_entry_rcu(r, rx_list, list) {
+ char *fmt = (r->can_id & CAN_EFF_FLAG)?
+ " %-5s %08x %08x %pK %pK %8ld %s\n" :
+ " %-5s %03x %08x %pK %pK %8ld %s\n";
+
+ seq_printf(m, fmt, DNAME(dev), r->can_id, r->mask,
+ r->func, r->data, r->matches, r->ident);
+ }
+}
+
+static void can_print_recv_banner(struct seq_file *m)
+{
+ /*
+ * can1. 00000000 00000000 00000000
+ * ....... 0 tp20
+ */
+ if (IS_ENABLED(CONFIG_64BIT))
+ seq_puts(m, " device can_id can_mask function userdata matches ident\n");
+ else
+ seq_puts(m, " device can_id can_mask function userdata matches ident\n");
+}
+
+static int can_stats_proc_show(struct seq_file *m, void *v)
+{
+ struct net *net = m->private;
+ struct can_pkg_stats *pkg_stats = net->can.pkg_stats;
+ struct can_rcv_lists_stats *rcv_lists_stats = net->can.rcv_lists_stats;
+
+ seq_putc(m, '\n');
+ seq_printf(m, " %8ld transmitted frames (TXF)\n",
+ atomic_long_read(&pkg_stats->tx_frames));
+ seq_printf(m, " %8ld received frames (RXF)\n",
+ atomic_long_read(&pkg_stats->rx_frames));
+ seq_printf(m, " %8ld matched frames (RXMF)\n",
+ atomic_long_read(&pkg_stats->matches));
+
+ seq_putc(m, '\n');
+
+ if (net->can.stattimer.function == can_stat_update) {
+ seq_printf(m, " %8ld %% total match ratio (RXMR)\n",
+ pkg_stats->total_rx_match_ratio);
+
+ seq_printf(m, " %8ld frames/s total tx rate (TXR)\n",
+ pkg_stats->total_tx_rate);
+ seq_printf(m, " %8ld frames/s total rx rate (RXR)\n",
+ pkg_stats->total_rx_rate);
+
+ seq_putc(m, '\n');
+
+ seq_printf(m, " %8ld %% current match ratio (CRXMR)\n",
+ pkg_stats->current_rx_match_ratio);
+
+ seq_printf(m, " %8ld frames/s current tx rate (CTXR)\n",
+ pkg_stats->current_tx_rate);
+ seq_printf(m, " %8ld frames/s current rx rate (CRXR)\n",
+ pkg_stats->current_rx_rate);
+
+ seq_putc(m, '\n');
+
+ seq_printf(m, " %8ld %% max match ratio (MRXMR)\n",
+ pkg_stats->max_rx_match_ratio);
+
+ seq_printf(m, " %8ld frames/s max tx rate (MTXR)\n",
+ pkg_stats->max_tx_rate);
+ seq_printf(m, " %8ld frames/s max rx rate (MRXR)\n",
+ pkg_stats->max_rx_rate);
+
+ seq_putc(m, '\n');
+ }
+
+ seq_printf(m, " %8ld current receive list entries (CRCV)\n",
+ rcv_lists_stats->rcv_entries);
+ seq_printf(m, " %8ld maximum receive list entries (MRCV)\n",
+ rcv_lists_stats->rcv_entries_max);
+
+ if (rcv_lists_stats->stats_reset)
+ seq_printf(m, "\n %8ld statistic resets (STR)\n",
+ rcv_lists_stats->stats_reset);
+
+ if (rcv_lists_stats->user_reset)
+ seq_printf(m, " %8ld user statistic resets (USTR)\n",
+ rcv_lists_stats->user_reset);
+
+ seq_putc(m, '\n');
+ return 0;
+}
+
+static int can_reset_stats_proc_show(struct seq_file *m, void *v)
+{
+ struct net *net = m->private;
+ struct can_rcv_lists_stats *rcv_lists_stats = net->can.rcv_lists_stats;
+ struct can_pkg_stats *pkg_stats = net->can.pkg_stats;
+
+ user_reset = 1;
+
+ if (net->can.stattimer.function == can_stat_update) {
+ seq_printf(m, "Scheduled statistic reset #%ld.\n",
+ rcv_lists_stats->stats_reset + 1);
+ } else {
+ if (pkg_stats->jiffies_init != jiffies)
+ can_init_stats(net);
+
+ seq_printf(m, "Performed statistic reset #%ld.\n",
+ rcv_lists_stats->stats_reset);
+ }
+ return 0;
+}
+
+static inline void can_rcvlist_proc_show_one(struct seq_file *m, int idx,
+ struct net_device *dev,
+ struct can_dev_rcv_lists *dev_rcv_lists)
+{
+ if (!hlist_empty(&dev_rcv_lists->rx[idx])) {
+ can_print_recv_banner(m);
+ can_print_rcvlist(m, &dev_rcv_lists->rx[idx], dev);
+ } else
+ seq_printf(m, " (%s: no entry)\n", DNAME(dev));
+
+}
+
+static int can_rcvlist_proc_show(struct seq_file *m, void *v)
+{
+ /* double cast to prevent GCC warning */
+ int idx = (int)(long)pde_data(m->file->f_inode);
+ struct net_device *dev;
+ struct can_dev_rcv_lists *dev_rcv_lists;
+ struct net *net = m->private;
+
+ seq_printf(m, "\nreceive list '%s':\n", rx_list_name[idx]);
+
+ rcu_read_lock();
+
+ /* receive list for 'all' CAN devices (dev == NULL) */
+ dev_rcv_lists = net->can.rx_alldev_list;
+ can_rcvlist_proc_show_one(m, idx, NULL, dev_rcv_lists);
+
+ /* receive list for registered CAN devices */
+ for_each_netdev_rcu(net, dev) {
+ struct can_ml_priv *can_ml = can_get_ml_priv(dev);
+
+ if (can_ml)
+ can_rcvlist_proc_show_one(m, idx, dev,
+ &can_ml->dev_rcv_lists);
+ }
+
+ rcu_read_unlock();
+
+ seq_putc(m, '\n');
+ return 0;
+}
+
+static inline void can_rcvlist_proc_show_array(struct seq_file *m,
+ struct net_device *dev,
+ struct hlist_head *rcv_array,
+ unsigned int rcv_array_sz)
+{
+ unsigned int i;
+ int all_empty = 1;
+
+ /* check whether at least one list is non-empty */
+ for (i = 0; i < rcv_array_sz; i++)
+ if (!hlist_empty(&rcv_array[i])) {
+ all_empty = 0;
+ break;
+ }
+
+ if (!all_empty) {
+ can_print_recv_banner(m);
+ for (i = 0; i < rcv_array_sz; i++) {
+ if (!hlist_empty(&rcv_array[i]))
+ can_print_rcvlist(m, &rcv_array[i], dev);
+ }
+ } else
+ seq_printf(m, " (%s: no entry)\n", DNAME(dev));
+}
+
+static int can_rcvlist_sff_proc_show(struct seq_file *m, void *v)
+{
+ struct net_device *dev;
+ struct can_dev_rcv_lists *dev_rcv_lists;
+ struct net *net = m->private;
+
+ /* RX_SFF */
+ seq_puts(m, "\nreceive list 'rx_sff':\n");
+
+ rcu_read_lock();
+
+ /* sff receive list for 'all' CAN devices (dev == NULL) */
+ dev_rcv_lists = net->can.rx_alldev_list;
+ can_rcvlist_proc_show_array(m, NULL, dev_rcv_lists->rx_sff,
+ ARRAY_SIZE(dev_rcv_lists->rx_sff));
+
+ /* sff receive list for registered CAN devices */
+ for_each_netdev_rcu(net, dev) {
+ struct can_ml_priv *can_ml = can_get_ml_priv(dev);
+
+ if (can_ml) {
+ dev_rcv_lists = &can_ml->dev_rcv_lists;
+ can_rcvlist_proc_show_array(m, dev, dev_rcv_lists->rx_sff,
+ ARRAY_SIZE(dev_rcv_lists->rx_sff));
+ }
+ }
+
+ rcu_read_unlock();
+
+ seq_putc(m, '\n');
+ return 0;
+}
+
+static int can_rcvlist_eff_proc_show(struct seq_file *m, void *v)
+{
+ struct net_device *dev;
+ struct can_dev_rcv_lists *dev_rcv_lists;
+ struct net *net = m->private;
+
+ /* RX_EFF */
+ seq_puts(m, "\nreceive list 'rx_eff':\n");
+
+ rcu_read_lock();
+
+ /* eff receive list for 'all' CAN devices (dev == NULL) */
+ dev_rcv_lists = net->can.rx_alldev_list;
+ can_rcvlist_proc_show_array(m, NULL, dev_rcv_lists->rx_eff,
+ ARRAY_SIZE(dev_rcv_lists->rx_eff));
+
+ /* eff receive list for registered CAN devices */
+ for_each_netdev_rcu(net, dev) {
+ struct can_ml_priv *can_ml = can_get_ml_priv(dev);
+
+ if (can_ml) {
+ dev_rcv_lists = &can_ml->dev_rcv_lists;
+ can_rcvlist_proc_show_array(m, dev, dev_rcv_lists->rx_eff,
+ ARRAY_SIZE(dev_rcv_lists->rx_eff));
+ }
+ }
+
+ rcu_read_unlock();
+
+ seq_putc(m, '\n');
+ return 0;
+}
+
+/*
+ * can_init_proc - create main CAN proc directory and procfs entries
+ */
+void can_init_proc(struct net *net)
+{
+ /* create /proc/net/can directory */
+ net->can.proc_dir = proc_net_mkdir(net, "can", net->proc_net);
+
+ if (!net->can.proc_dir) {
+ printk(KERN_INFO "can: failed to create /proc/net/can . "
+ "CONFIG_PROC_FS missing?\n");
+ return;
+ }
+
+ /* own procfs entries from the AF_CAN core */
+ net->can.pde_stats = proc_create_net_single(CAN_PROC_STATS, 0644,
+ net->can.proc_dir, can_stats_proc_show, NULL);
+ net->can.pde_reset_stats = proc_create_net_single(CAN_PROC_RESET_STATS,
+ 0644, net->can.proc_dir, can_reset_stats_proc_show,
+ NULL);
+ net->can.pde_rcvlist_err = proc_create_net_single(CAN_PROC_RCVLIST_ERR,
+ 0644, net->can.proc_dir, can_rcvlist_proc_show,
+ (void *)RX_ERR);
+ net->can.pde_rcvlist_all = proc_create_net_single(CAN_PROC_RCVLIST_ALL,
+ 0644, net->can.proc_dir, can_rcvlist_proc_show,
+ (void *)RX_ALL);
+ net->can.pde_rcvlist_fil = proc_create_net_single(CAN_PROC_RCVLIST_FIL,
+ 0644, net->can.proc_dir, can_rcvlist_proc_show,
+ (void *)RX_FIL);
+ net->can.pde_rcvlist_inv = proc_create_net_single(CAN_PROC_RCVLIST_INV,
+ 0644, net->can.proc_dir, can_rcvlist_proc_show,
+ (void *)RX_INV);
+ net->can.pde_rcvlist_eff = proc_create_net_single(CAN_PROC_RCVLIST_EFF,
+ 0644, net->can.proc_dir, can_rcvlist_eff_proc_show, NULL);
+ net->can.pde_rcvlist_sff = proc_create_net_single(CAN_PROC_RCVLIST_SFF,
+ 0644, net->can.proc_dir, can_rcvlist_sff_proc_show, NULL);
+}
+
+/*
+ * can_remove_proc - remove procfs entries and main CAN proc directory
+ */
+void can_remove_proc(struct net *net)
+{
+ if (!net->can.proc_dir)
+ return;
+
+ if (net->can.pde_stats)
+ remove_proc_entry(CAN_PROC_STATS, net->can.proc_dir);
+
+ if (net->can.pde_reset_stats)
+ remove_proc_entry(CAN_PROC_RESET_STATS, net->can.proc_dir);
+
+ if (net->can.pde_rcvlist_err)
+ remove_proc_entry(CAN_PROC_RCVLIST_ERR, net->can.proc_dir);
+
+ if (net->can.pde_rcvlist_all)
+ remove_proc_entry(CAN_PROC_RCVLIST_ALL, net->can.proc_dir);
+
+ if (net->can.pde_rcvlist_fil)
+ remove_proc_entry(CAN_PROC_RCVLIST_FIL, net->can.proc_dir);
+
+ if (net->can.pde_rcvlist_inv)
+ remove_proc_entry(CAN_PROC_RCVLIST_INV, net->can.proc_dir);
+
+ if (net->can.pde_rcvlist_eff)
+ remove_proc_entry(CAN_PROC_RCVLIST_EFF, net->can.proc_dir);
+
+ if (net->can.pde_rcvlist_sff)
+ remove_proc_entry(CAN_PROC_RCVLIST_SFF, net->can.proc_dir);
+
+ remove_proc_entry("can", net->proc_net);
+}
diff --git a/net/can/raw.c b/net/can/raw.c
new file mode 100644
index 000000000000..be1ef7cf4204
--- /dev/null
+++ b/net/can/raw.c
@@ -0,0 +1,1158 @@
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
+/* raw.c - Raw sockets for protocol family CAN
+ *
+ * Copyright (c) 2002-2007 Volkswagen Group Electronic Research
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of Volkswagen nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * Alternatively, provided that this notice is retained in full, this
+ * software may be distributed under the terms of the GNU General
+ * Public License ("GPL") version 2, in which case the provisions of the
+ * GPL apply INSTEAD OF those given above.
+ *
+ * The provided data structures and external interfaces from this code
+ * are not restricted to be used by modules with a GPL compatible license.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/uio.h>
+#include <linux/net.h>
+#include <linux/slab.h>
+#include <linux/netdevice.h>
+#include <linux/socket.h>
+#include <linux/if_arp.h>
+#include <linux/skbuff.h>
+#include <linux/can.h>
+#include <linux/can/core.h>
+#include <linux/can/dev.h> /* for can_is_canxl_dev_mtu() */
+#include <linux/can/skb.h>
+#include <linux/can/raw.h>
+#include <net/sock.h>
+#include <net/net_namespace.h>
+
+MODULE_DESCRIPTION("PF_CAN raw protocol");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_AUTHOR("Urs Thuermann <urs.thuermann@volkswagen.de>");
+MODULE_ALIAS("can-proto-1");
+
+#define RAW_MIN_NAMELEN CAN_REQUIRED_SIZE(struct sockaddr_can, can_ifindex)
+
+#define MASK_ALL 0
+
+/* A raw socket has a list of can_filters attached to it, each receiving
+ * the CAN frames matching that filter. If the filter list is empty,
+ * no CAN frames will be received by the socket. The default after
+ * opening the socket, is to have one filter which receives all frames.
+ * The filter list is allocated dynamically with the exception of the
+ * list containing only one item. This common case is optimized by
+ * storing the single filter in dfilter, to avoid using dynamic memory.
+ */
+
+struct uniqframe {
+ const struct sk_buff *skb;
+ int skbcnt;
+ unsigned int join_rx_count;
+};
+
+struct raw_sock {
+ struct sock sk;
+ struct net_device *dev;
+ netdevice_tracker dev_tracker;
+ struct list_head notifier;
+ int ifindex;
+ unsigned int bound:1;
+ unsigned int loopback:1;
+ unsigned int recv_own_msgs:1;
+ unsigned int fd_frames:1;
+ unsigned int xl_frames:1;
+ unsigned int join_filters:1;
+ struct can_raw_vcid_options raw_vcid_opts;
+ canid_t tx_vcid_shifted;
+ canid_t rx_vcid_shifted;
+ canid_t rx_vcid_mask_shifted;
+ can_err_mask_t err_mask;
+ int count; /* number of active filters */
+ struct can_filter dfilter; /* default/single filter */
+ struct can_filter *filter; /* pointer to filter(s) */
+ struct uniqframe __percpu *uniq;
+};
+
+static LIST_HEAD(raw_notifier_list);
+static DEFINE_SPINLOCK(raw_notifier_lock);
+static struct raw_sock *raw_busy_notifier;
+
+/* Return pointer to store the extra msg flags for raw_recvmsg().
+ * We use the space of one unsigned int beyond the 'struct sockaddr_can'
+ * in skb->cb.
+ */
+static inline unsigned int *raw_flags(struct sk_buff *skb)
+{
+ sock_skb_cb_check_size(sizeof(struct sockaddr_can) +
+ sizeof(unsigned int));
+
+ /* return pointer after struct sockaddr_can */
+ return (unsigned int *)(&((struct sockaddr_can *)skb->cb)[1]);
+}
+
+static inline struct raw_sock *raw_sk(const struct sock *sk)
+{
+ return (struct raw_sock *)sk;
+}
+
+static void raw_rcv(struct sk_buff *oskb, void *data)
+{
+ struct sock *sk = (struct sock *)data;
+ struct raw_sock *ro = raw_sk(sk);
+ enum skb_drop_reason reason;
+ struct sockaddr_can *addr;
+ struct sk_buff *skb;
+ unsigned int *pflags;
+
+ /* check the received tx sock reference */
+ if (!ro->recv_own_msgs && oskb->sk == sk)
+ return;
+
+ /* make sure to not pass oversized frames to the socket */
+ if (!ro->fd_frames && can_is_canfd_skb(oskb))
+ return;
+
+ if (can_is_canxl_skb(oskb)) {
+ struct canxl_frame *cxl = (struct canxl_frame *)oskb->data;
+
+ /* make sure to not pass oversized frames to the socket */
+ if (!ro->xl_frames)
+ return;
+
+ /* filter CAN XL VCID content */
+ if (ro->raw_vcid_opts.flags & CAN_RAW_XL_VCID_RX_FILTER) {
+ /* apply VCID filter if user enabled the filter */
+ if ((cxl->prio & ro->rx_vcid_mask_shifted) !=
+ (ro->rx_vcid_shifted & ro->rx_vcid_mask_shifted))
+ return;
+ } else {
+ /* no filter => do not forward VCID tagged frames */
+ if (cxl->prio & CANXL_VCID_MASK)
+ return;
+ }
+ }
+
+ /* eliminate multiple filter matches for the same skb */
+ if (this_cpu_ptr(ro->uniq)->skb == oskb &&
+ this_cpu_ptr(ro->uniq)->skbcnt == can_skb_prv(oskb)->skbcnt) {
+ if (!ro->join_filters)
+ return;
+
+ this_cpu_inc(ro->uniq->join_rx_count);
+ /* drop frame until all enabled filters matched */
+ if (this_cpu_ptr(ro->uniq)->join_rx_count < ro->count)
+ return;
+ } else {
+ this_cpu_ptr(ro->uniq)->skb = oskb;
+ this_cpu_ptr(ro->uniq)->skbcnt = can_skb_prv(oskb)->skbcnt;
+ this_cpu_ptr(ro->uniq)->join_rx_count = 1;
+ /* drop first frame to check all enabled filters? */
+ if (ro->join_filters && ro->count > 1)
+ return;
+ }
+
+ /* clone the given skb to be able to enqueue it into the rcv queue */
+ skb = skb_clone(oskb, GFP_ATOMIC);
+ if (!skb)
+ return;
+
+ /* Put the datagram to the queue so that raw_recvmsg() can get
+ * it from there. We need to pass the interface index to
+ * raw_recvmsg(). We pass a whole struct sockaddr_can in
+ * skb->cb containing the interface index.
+ */
+
+ sock_skb_cb_check_size(sizeof(struct sockaddr_can));
+ addr = (struct sockaddr_can *)skb->cb;
+ memset(addr, 0, sizeof(*addr));
+ addr->can_family = AF_CAN;
+ addr->can_ifindex = skb->dev->ifindex;
+
+ /* add CAN specific message flags for raw_recvmsg() */
+ pflags = raw_flags(skb);
+ *pflags = 0;
+ if (oskb->sk)
+ *pflags |= MSG_DONTROUTE;
+ if (oskb->sk == sk)
+ *pflags |= MSG_CONFIRM;
+
+ if (sock_queue_rcv_skb_reason(sk, skb, &reason) < 0)
+ sk_skb_reason_drop(sk, skb, reason);
+}
+
+static int raw_enable_filters(struct net *net, struct net_device *dev,
+ struct sock *sk, struct can_filter *filter,
+ int count)
+{
+ int err = 0;
+ int i;
+
+ for (i = 0; i < count; i++) {
+ err = can_rx_register(net, dev, filter[i].can_id,
+ filter[i].can_mask,
+ raw_rcv, sk, "raw", sk);
+ if (err) {
+ /* clean up successfully registered filters */
+ while (--i >= 0)
+ can_rx_unregister(net, dev, filter[i].can_id,
+ filter[i].can_mask,
+ raw_rcv, sk);
+ break;
+ }
+ }
+
+ return err;
+}
+
+static int raw_enable_errfilter(struct net *net, struct net_device *dev,
+ struct sock *sk, can_err_mask_t err_mask)
+{
+ int err = 0;
+
+ if (err_mask)
+ err = can_rx_register(net, dev, 0, err_mask | CAN_ERR_FLAG,
+ raw_rcv, sk, "raw", sk);
+
+ return err;
+}
+
+static void raw_disable_filters(struct net *net, struct net_device *dev,
+ struct sock *sk, struct can_filter *filter,
+ int count)
+{
+ int i;
+
+ for (i = 0; i < count; i++)
+ can_rx_unregister(net, dev, filter[i].can_id,
+ filter[i].can_mask, raw_rcv, sk);
+}
+
+static inline void raw_disable_errfilter(struct net *net,
+ struct net_device *dev,
+ struct sock *sk,
+ can_err_mask_t err_mask)
+
+{
+ if (err_mask)
+ can_rx_unregister(net, dev, 0, err_mask | CAN_ERR_FLAG,
+ raw_rcv, sk);
+}
+
+static inline void raw_disable_allfilters(struct net *net,
+ struct net_device *dev,
+ struct sock *sk)
+{
+ struct raw_sock *ro = raw_sk(sk);
+
+ raw_disable_filters(net, dev, sk, ro->filter, ro->count);
+ raw_disable_errfilter(net, dev, sk, ro->err_mask);
+}
+
+static int raw_enable_allfilters(struct net *net, struct net_device *dev,
+ struct sock *sk)
+{
+ struct raw_sock *ro = raw_sk(sk);
+ int err;
+
+ err = raw_enable_filters(net, dev, sk, ro->filter, ro->count);
+ if (!err) {
+ err = raw_enable_errfilter(net, dev, sk, ro->err_mask);
+ if (err)
+ raw_disable_filters(net, dev, sk, ro->filter,
+ ro->count);
+ }
+
+ return err;
+}
+
+static void raw_notify(struct raw_sock *ro, unsigned long msg,
+ struct net_device *dev)
+{
+ struct sock *sk = &ro->sk;
+
+ if (!net_eq(dev_net(dev), sock_net(sk)))
+ return;
+
+ if (ro->dev != dev)
+ return;
+
+ switch (msg) {
+ case NETDEV_UNREGISTER:
+ lock_sock(sk);
+ /* remove current filters & unregister */
+ if (ro->bound) {
+ raw_disable_allfilters(dev_net(dev), dev, sk);
+ netdev_put(dev, &ro->dev_tracker);
+ }
+
+ if (ro->count > 1)
+ kfree(ro->filter);
+
+ ro->ifindex = 0;
+ ro->bound = 0;
+ ro->dev = NULL;
+ ro->count = 0;
+ release_sock(sk);
+
+ sk->sk_err = ENODEV;
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk_error_report(sk);
+ break;
+
+ case NETDEV_DOWN:
+ sk->sk_err = ENETDOWN;
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk_error_report(sk);
+ break;
+ }
+}
+
+static int raw_notifier(struct notifier_block *nb, unsigned long msg,
+ void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+
+ if (dev->type != ARPHRD_CAN)
+ return NOTIFY_DONE;
+ if (msg != NETDEV_UNREGISTER && msg != NETDEV_DOWN)
+ return NOTIFY_DONE;
+ if (unlikely(raw_busy_notifier)) /* Check for reentrant bug. */
+ return NOTIFY_DONE;
+
+ spin_lock(&raw_notifier_lock);
+ list_for_each_entry(raw_busy_notifier, &raw_notifier_list, notifier) {
+ spin_unlock(&raw_notifier_lock);
+ raw_notify(raw_busy_notifier, msg, dev);
+ spin_lock(&raw_notifier_lock);
+ }
+ raw_busy_notifier = NULL;
+ spin_unlock(&raw_notifier_lock);
+ return NOTIFY_DONE;
+}
+
+static int raw_init(struct sock *sk)
+{
+ struct raw_sock *ro = raw_sk(sk);
+
+ ro->bound = 0;
+ ro->ifindex = 0;
+ ro->dev = NULL;
+
+ /* set default filter to single entry dfilter */
+ ro->dfilter.can_id = 0;
+ ro->dfilter.can_mask = MASK_ALL;
+ ro->filter = &ro->dfilter;
+ ro->count = 1;
+
+ /* set default loopback behaviour */
+ ro->loopback = 1;
+ ro->recv_own_msgs = 0;
+ ro->fd_frames = 0;
+ ro->xl_frames = 0;
+ ro->join_filters = 0;
+
+ /* alloc_percpu provides zero'ed memory */
+ ro->uniq = alloc_percpu(struct uniqframe);
+ if (unlikely(!ro->uniq))
+ return -ENOMEM;
+
+ /* set notifier */
+ spin_lock(&raw_notifier_lock);
+ list_add_tail(&ro->notifier, &raw_notifier_list);
+ spin_unlock(&raw_notifier_lock);
+
+ return 0;
+}
+
+static int raw_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+ struct raw_sock *ro;
+ struct net *net;
+
+ if (!sk)
+ return 0;
+
+ ro = raw_sk(sk);
+ net = sock_net(sk);
+
+ spin_lock(&raw_notifier_lock);
+ while (raw_busy_notifier == ro) {
+ spin_unlock(&raw_notifier_lock);
+ schedule_timeout_uninterruptible(1);
+ spin_lock(&raw_notifier_lock);
+ }
+ list_del(&ro->notifier);
+ spin_unlock(&raw_notifier_lock);
+
+ rtnl_lock();
+ lock_sock(sk);
+
+ /* remove current filters & unregister */
+ if (ro->bound) {
+ if (ro->dev) {
+ raw_disable_allfilters(dev_net(ro->dev), ro->dev, sk);
+ netdev_put(ro->dev, &ro->dev_tracker);
+ } else {
+ raw_disable_allfilters(net, NULL, sk);
+ }
+ }
+
+ if (ro->count > 1)
+ kfree(ro->filter);
+
+ ro->ifindex = 0;
+ ro->bound = 0;
+ ro->dev = NULL;
+ ro->count = 0;
+ free_percpu(ro->uniq);
+
+ sock_orphan(sk);
+ sock->sk = NULL;
+
+ release_sock(sk);
+ rtnl_unlock();
+
+ sock_prot_inuse_add(net, sk->sk_prot, -1);
+ sock_put(sk);
+
+ return 0;
+}
+
+static int raw_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int len)
+{
+ struct sockaddr_can *addr = (struct sockaddr_can *)uaddr;
+ struct sock *sk = sock->sk;
+ struct raw_sock *ro = raw_sk(sk);
+ struct net_device *dev = NULL;
+ int ifindex;
+ int err = 0;
+ int notify_enetdown = 0;
+
+ if (len < RAW_MIN_NAMELEN)
+ return -EINVAL;
+ if (addr->can_family != AF_CAN)
+ return -EINVAL;
+
+ rtnl_lock();
+ lock_sock(sk);
+
+ if (ro->bound && addr->can_ifindex == ro->ifindex)
+ goto out;
+
+ if (addr->can_ifindex) {
+ dev = dev_get_by_index(sock_net(sk), addr->can_ifindex);
+ if (!dev) {
+ err = -ENODEV;
+ goto out;
+ }
+ if (dev->type != ARPHRD_CAN) {
+ err = -ENODEV;
+ goto out_put_dev;
+ }
+
+ if (!(dev->flags & IFF_UP))
+ notify_enetdown = 1;
+
+ ifindex = dev->ifindex;
+
+ /* filters set by default/setsockopt */
+ err = raw_enable_allfilters(sock_net(sk), dev, sk);
+ if (err)
+ goto out_put_dev;
+
+ } else {
+ ifindex = 0;
+
+ /* filters set by default/setsockopt */
+ err = raw_enable_allfilters(sock_net(sk), NULL, sk);
+ }
+
+ if (!err) {
+ if (ro->bound) {
+ /* unregister old filters */
+ if (ro->dev) {
+ raw_disable_allfilters(dev_net(ro->dev),
+ ro->dev, sk);
+ /* drop reference to old ro->dev */
+ netdev_put(ro->dev, &ro->dev_tracker);
+ } else {
+ raw_disable_allfilters(sock_net(sk), NULL, sk);
+ }
+ }
+ ro->ifindex = ifindex;
+ ro->bound = 1;
+ /* bind() ok -> hold a reference for new ro->dev */
+ ro->dev = dev;
+ if (ro->dev)
+ netdev_hold(ro->dev, &ro->dev_tracker, GFP_KERNEL);
+ }
+
+out_put_dev:
+ /* remove potential reference from dev_get_by_index() */
+ dev_put(dev);
+out:
+ release_sock(sk);
+ rtnl_unlock();
+
+ if (notify_enetdown) {
+ sk->sk_err = ENETDOWN;
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk_error_report(sk);
+ }
+
+ return err;
+}
+
+static int raw_getname(struct socket *sock, struct sockaddr *uaddr,
+ int peer)
+{
+ struct sockaddr_can *addr = (struct sockaddr_can *)uaddr;
+ struct sock *sk = sock->sk;
+ struct raw_sock *ro = raw_sk(sk);
+
+ if (peer)
+ return -EOPNOTSUPP;
+
+ memset(addr, 0, RAW_MIN_NAMELEN);
+ addr->can_family = AF_CAN;
+ addr->can_ifindex = ro->ifindex;
+
+ return RAW_MIN_NAMELEN;
+}
+
+static int raw_setsockopt(struct socket *sock, int level, int optname,
+ sockptr_t optval, unsigned int optlen)
+{
+ struct sock *sk = sock->sk;
+ struct raw_sock *ro = raw_sk(sk);
+ struct can_filter *filter = NULL; /* dyn. alloc'ed filters */
+ struct can_filter sfilter; /* single filter */
+ struct net_device *dev = NULL;
+ can_err_mask_t err_mask = 0;
+ int count = 0;
+ int flag;
+ int err = 0;
+
+ if (level != SOL_CAN_RAW)
+ return -EINVAL;
+
+ switch (optname) {
+ case CAN_RAW_FILTER:
+ if (optlen % sizeof(struct can_filter) != 0)
+ return -EINVAL;
+
+ if (optlen > CAN_RAW_FILTER_MAX * sizeof(struct can_filter))
+ return -EINVAL;
+
+ count = optlen / sizeof(struct can_filter);
+
+ if (count > 1) {
+ /* filter does not fit into dfilter => alloc space */
+ filter = memdup_sockptr(optval, optlen);
+ if (IS_ERR(filter))
+ return PTR_ERR(filter);
+ } else if (count == 1) {
+ if (copy_from_sockptr(&sfilter, optval, sizeof(sfilter)))
+ return -EFAULT;
+ }
+
+ rtnl_lock();
+ lock_sock(sk);
+
+ dev = ro->dev;
+ if (ro->bound && dev) {
+ if (dev->reg_state != NETREG_REGISTERED) {
+ if (count > 1)
+ kfree(filter);
+ err = -ENODEV;
+ goto out_fil;
+ }
+ }
+
+ if (ro->bound) {
+ /* (try to) register the new filters */
+ if (count == 1)
+ err = raw_enable_filters(sock_net(sk), dev, sk,
+ &sfilter, 1);
+ else
+ err = raw_enable_filters(sock_net(sk), dev, sk,
+ filter, count);
+ if (err) {
+ if (count > 1)
+ kfree(filter);
+ goto out_fil;
+ }
+
+ /* remove old filter registrations */
+ raw_disable_filters(sock_net(sk), dev, sk, ro->filter,
+ ro->count);
+ }
+
+ /* remove old filter space */
+ if (ro->count > 1)
+ kfree(ro->filter);
+
+ /* link new filters to the socket */
+ if (count == 1) {
+ /* copy filter data for single filter */
+ ro->dfilter = sfilter;
+ filter = &ro->dfilter;
+ }
+ ro->filter = filter;
+ ro->count = count;
+
+ out_fil:
+ release_sock(sk);
+ rtnl_unlock();
+
+ break;
+
+ case CAN_RAW_ERR_FILTER:
+ if (optlen != sizeof(err_mask))
+ return -EINVAL;
+
+ if (copy_from_sockptr(&err_mask, optval, optlen))
+ return -EFAULT;
+
+ err_mask &= CAN_ERR_MASK;
+
+ rtnl_lock();
+ lock_sock(sk);
+
+ dev = ro->dev;
+ if (ro->bound && dev) {
+ if (dev->reg_state != NETREG_REGISTERED) {
+ err = -ENODEV;
+ goto out_err;
+ }
+ }
+
+ /* remove current error mask */
+ if (ro->bound) {
+ /* (try to) register the new err_mask */
+ err = raw_enable_errfilter(sock_net(sk), dev, sk,
+ err_mask);
+
+ if (err)
+ goto out_err;
+
+ /* remove old err_mask registration */
+ raw_disable_errfilter(sock_net(sk), dev, sk,
+ ro->err_mask);
+ }
+
+ /* link new err_mask to the socket */
+ ro->err_mask = err_mask;
+
+ out_err:
+ release_sock(sk);
+ rtnl_unlock();
+
+ break;
+
+ case CAN_RAW_LOOPBACK:
+ if (optlen != sizeof(flag))
+ return -EINVAL;
+
+ if (copy_from_sockptr(&flag, optval, optlen))
+ return -EFAULT;
+
+ ro->loopback = !!flag;
+ break;
+
+ case CAN_RAW_RECV_OWN_MSGS:
+ if (optlen != sizeof(flag))
+ return -EINVAL;
+
+ if (copy_from_sockptr(&flag, optval, optlen))
+ return -EFAULT;
+
+ ro->recv_own_msgs = !!flag;
+ break;
+
+ case CAN_RAW_FD_FRAMES:
+ if (optlen != sizeof(flag))
+ return -EINVAL;
+
+ if (copy_from_sockptr(&flag, optval, optlen))
+ return -EFAULT;
+
+ /* Enabling CAN XL includes CAN FD */
+ if (ro->xl_frames && !flag)
+ return -EINVAL;
+
+ ro->fd_frames = !!flag;
+ break;
+
+ case CAN_RAW_XL_FRAMES:
+ if (optlen != sizeof(flag))
+ return -EINVAL;
+
+ if (copy_from_sockptr(&flag, optval, optlen))
+ return -EFAULT;
+
+ ro->xl_frames = !!flag;
+
+ /* Enabling CAN XL includes CAN FD */
+ if (ro->xl_frames)
+ ro->fd_frames = ro->xl_frames;
+ break;
+
+ case CAN_RAW_XL_VCID_OPTS:
+ if (optlen != sizeof(ro->raw_vcid_opts))
+ return -EINVAL;
+
+ if (copy_from_sockptr(&ro->raw_vcid_opts, optval, optlen))
+ return -EFAULT;
+
+ /* prepare 32 bit values for handling in hot path */
+ ro->tx_vcid_shifted = ro->raw_vcid_opts.tx_vcid << CANXL_VCID_OFFSET;
+ ro->rx_vcid_shifted = ro->raw_vcid_opts.rx_vcid << CANXL_VCID_OFFSET;
+ ro->rx_vcid_mask_shifted = ro->raw_vcid_opts.rx_vcid_mask << CANXL_VCID_OFFSET;
+ break;
+
+ case CAN_RAW_JOIN_FILTERS:
+ if (optlen != sizeof(flag))
+ return -EINVAL;
+
+ if (copy_from_sockptr(&flag, optval, optlen))
+ return -EFAULT;
+
+ ro->join_filters = !!flag;
+ break;
+
+ default:
+ return -ENOPROTOOPT;
+ }
+ return err;
+}
+
+static int raw_getsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ struct sock *sk = sock->sk;
+ struct raw_sock *ro = raw_sk(sk);
+ int flag;
+ int len;
+ void *val;
+
+ if (level != SOL_CAN_RAW)
+ return -EINVAL;
+ if (get_user(len, optlen))
+ return -EFAULT;
+ if (len < 0)
+ return -EINVAL;
+
+ switch (optname) {
+ case CAN_RAW_FILTER: {
+ int err = 0;
+
+ lock_sock(sk);
+ if (ro->count > 0) {
+ int fsize = ro->count * sizeof(struct can_filter);
+
+ /* user space buffer to small for filter list? */
+ if (len < fsize) {
+ /* return -ERANGE and needed space in optlen */
+ err = -ERANGE;
+ if (put_user(fsize, optlen))
+ err = -EFAULT;
+ } else {
+ if (len > fsize)
+ len = fsize;
+ if (copy_to_user(optval, ro->filter, len))
+ err = -EFAULT;
+ }
+ } else {
+ len = 0;
+ }
+ release_sock(sk);
+
+ if (!err)
+ err = put_user(len, optlen);
+ return err;
+ }
+ case CAN_RAW_ERR_FILTER:
+ if (len > sizeof(can_err_mask_t))
+ len = sizeof(can_err_mask_t);
+ val = &ro->err_mask;
+ break;
+
+ case CAN_RAW_LOOPBACK:
+ if (len > sizeof(int))
+ len = sizeof(int);
+ flag = ro->loopback;
+ val = &flag;
+ break;
+
+ case CAN_RAW_RECV_OWN_MSGS:
+ if (len > sizeof(int))
+ len = sizeof(int);
+ flag = ro->recv_own_msgs;
+ val = &flag;
+ break;
+
+ case CAN_RAW_FD_FRAMES:
+ if (len > sizeof(int))
+ len = sizeof(int);
+ flag = ro->fd_frames;
+ val = &flag;
+ break;
+
+ case CAN_RAW_XL_FRAMES:
+ if (len > sizeof(int))
+ len = sizeof(int);
+ flag = ro->xl_frames;
+ val = &flag;
+ break;
+
+ case CAN_RAW_XL_VCID_OPTS: {
+ int err = 0;
+
+ /* user space buffer to small for VCID opts? */
+ if (len < sizeof(ro->raw_vcid_opts)) {
+ /* return -ERANGE and needed space in optlen */
+ err = -ERANGE;
+ if (put_user(sizeof(ro->raw_vcid_opts), optlen))
+ err = -EFAULT;
+ } else {
+ if (len > sizeof(ro->raw_vcid_opts))
+ len = sizeof(ro->raw_vcid_opts);
+ if (copy_to_user(optval, &ro->raw_vcid_opts, len))
+ err = -EFAULT;
+ }
+ if (!err)
+ err = put_user(len, optlen);
+ return err;
+ }
+ case CAN_RAW_JOIN_FILTERS:
+ if (len > sizeof(int))
+ len = sizeof(int);
+ flag = ro->join_filters;
+ val = &flag;
+ break;
+
+ default:
+ return -ENOPROTOOPT;
+ }
+
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, val, len))
+ return -EFAULT;
+ return 0;
+}
+
+static void raw_put_canxl_vcid(struct raw_sock *ro, struct sk_buff *skb)
+{
+ struct canxl_frame *cxl = (struct canxl_frame *)skb->data;
+
+ /* sanitize non CAN XL bits */
+ cxl->prio &= (CANXL_PRIO_MASK | CANXL_VCID_MASK);
+
+ /* clear VCID in CAN XL frame if pass through is disabled */
+ if (!(ro->raw_vcid_opts.flags & CAN_RAW_XL_VCID_TX_PASS))
+ cxl->prio &= CANXL_PRIO_MASK;
+
+ /* set VCID in CAN XL frame if enabled */
+ if (ro->raw_vcid_opts.flags & CAN_RAW_XL_VCID_TX_SET) {
+ cxl->prio &= CANXL_PRIO_MASK;
+ cxl->prio |= ro->tx_vcid_shifted;
+ }
+}
+
+static inline bool raw_dev_cc_enabled(struct net_device *dev,
+ struct can_priv *priv)
+{
+ /* The CANXL-only mode disables error-signalling on the CAN bus
+ * which is needed to send CAN CC/FD frames
+ */
+ if (priv)
+ return !can_dev_in_xl_only_mode(priv);
+
+ /* virtual CAN interfaces always support CAN CC */
+ return true;
+}
+
+static inline bool raw_dev_fd_enabled(struct net_device *dev,
+ struct can_priv *priv)
+{
+ /* check FD ctrlmode on real CAN interfaces */
+ if (priv)
+ return (priv->ctrlmode & CAN_CTRLMODE_FD);
+
+ /* check MTU for virtual CAN FD interfaces */
+ return (READ_ONCE(dev->mtu) >= CANFD_MTU);
+}
+
+static inline bool raw_dev_xl_enabled(struct net_device *dev,
+ struct can_priv *priv)
+{
+ /* check XL ctrlmode on real CAN interfaces */
+ if (priv)
+ return (priv->ctrlmode & CAN_CTRLMODE_XL);
+
+ /* check MTU for virtual CAN XL interfaces */
+ return can_is_canxl_dev_mtu(READ_ONCE(dev->mtu));
+}
+
+static unsigned int raw_check_txframe(struct raw_sock *ro, struct sk_buff *skb,
+ struct net_device *dev)
+{
+ struct can_priv *priv = safe_candev_priv(dev);
+
+ /* Classical CAN */
+ if (can_is_can_skb(skb) && raw_dev_cc_enabled(dev, priv))
+ return CAN_MTU;
+
+ /* CAN FD */
+ if (ro->fd_frames && can_is_canfd_skb(skb) &&
+ raw_dev_fd_enabled(dev, priv))
+ return CANFD_MTU;
+
+ /* CAN XL */
+ if (ro->xl_frames && can_is_canxl_skb(skb) &&
+ raw_dev_xl_enabled(dev, priv))
+ return CANXL_MTU;
+
+ return 0;
+}
+
+static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
+{
+ struct sock *sk = sock->sk;
+ struct raw_sock *ro = raw_sk(sk);
+ struct sockcm_cookie sockc;
+ struct sk_buff *skb;
+ struct net_device *dev;
+ unsigned int txmtu;
+ int ifindex;
+ int err = -EINVAL;
+
+ /* check for valid CAN frame sizes */
+ if (size < CANXL_HDR_SIZE + CANXL_MIN_DLEN || size > CANXL_MTU)
+ return -EINVAL;
+
+ if (msg->msg_name) {
+ DECLARE_SOCKADDR(struct sockaddr_can *, addr, msg->msg_name);
+
+ if (msg->msg_namelen < RAW_MIN_NAMELEN)
+ return -EINVAL;
+
+ if (addr->can_family != AF_CAN)
+ return -EINVAL;
+
+ ifindex = addr->can_ifindex;
+ } else {
+ ifindex = ro->ifindex;
+ }
+
+ dev = dev_get_by_index(sock_net(sk), ifindex);
+ if (!dev)
+ return -ENXIO;
+
+ skb = sock_alloc_send_skb(sk, size + sizeof(struct can_skb_priv),
+ msg->msg_flags & MSG_DONTWAIT, &err);
+ if (!skb)
+ goto put_dev;
+
+ can_skb_reserve(skb);
+ can_skb_prv(skb)->ifindex = dev->ifindex;
+ can_skb_prv(skb)->skbcnt = 0;
+
+ /* fill the skb before testing for valid CAN frames */
+ err = memcpy_from_msg(skb_put(skb, size), msg, size);
+ if (err < 0)
+ goto free_skb;
+
+ err = -EINVAL;
+
+ /* check for valid CAN (CC/FD/XL) frame content */
+ txmtu = raw_check_txframe(ro, skb, dev);
+ if (!txmtu)
+ goto free_skb;
+
+ /* only CANXL: clear/forward/set VCID value */
+ if (txmtu == CANXL_MTU)
+ raw_put_canxl_vcid(ro, skb);
+
+ sockcm_init(&sockc, sk);
+ if (msg->msg_controllen) {
+ err = sock_cmsg_send(sk, msg, &sockc);
+ if (unlikely(err))
+ goto free_skb;
+ }
+
+ skb->dev = dev;
+ skb->priority = sockc.priority;
+ skb->mark = sockc.mark;
+ skb->tstamp = sockc.transmit_time;
+
+ skb_setup_tx_timestamp(skb, &sockc);
+
+ err = can_send(skb, ro->loopback);
+
+ dev_put(dev);
+
+ if (err)
+ goto send_failed;
+
+ return size;
+
+free_skb:
+ kfree_skb(skb);
+put_dev:
+ dev_put(dev);
+send_failed:
+ return err;
+}
+
+static int raw_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
+ int flags)
+{
+ struct sock *sk = sock->sk;
+ struct sk_buff *skb;
+ int err = 0;
+
+ if (flags & MSG_ERRQUEUE)
+ return sock_recv_errqueue(sk, msg, size,
+ SOL_CAN_RAW, SCM_CAN_RAW_ERRQUEUE);
+
+ skb = skb_recv_datagram(sk, flags, &err);
+ if (!skb)
+ return err;
+
+ if (size < skb->len)
+ msg->msg_flags |= MSG_TRUNC;
+ else
+ size = skb->len;
+
+ err = memcpy_to_msg(msg, skb->data, size);
+ if (err < 0) {
+ skb_free_datagram(sk, skb);
+ return err;
+ }
+
+ sock_recv_cmsgs(msg, sk, skb);
+
+ if (msg->msg_name) {
+ __sockaddr_check_size(RAW_MIN_NAMELEN);
+ msg->msg_namelen = RAW_MIN_NAMELEN;
+ memcpy(msg->msg_name, skb->cb, msg->msg_namelen);
+ }
+
+ /* assign the flags that have been recorded in raw_rcv() */
+ msg->msg_flags |= *(raw_flags(skb));
+
+ skb_free_datagram(sk, skb);
+
+ return size;
+}
+
+static int raw_sock_no_ioctlcmd(struct socket *sock, unsigned int cmd,
+ unsigned long arg)
+{
+ /* no ioctls for socket layer -> hand it down to NIC layer */
+ return -ENOIOCTLCMD;
+}
+
+static const struct proto_ops raw_ops = {
+ .family = PF_CAN,
+ .release = raw_release,
+ .bind = raw_bind,
+ .connect = sock_no_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .getname = raw_getname,
+ .poll = datagram_poll,
+ .ioctl = raw_sock_no_ioctlcmd,
+ .gettstamp = sock_gettstamp,
+ .listen = sock_no_listen,
+ .shutdown = sock_no_shutdown,
+ .setsockopt = raw_setsockopt,
+ .getsockopt = raw_getsockopt,
+ .sendmsg = raw_sendmsg,
+ .recvmsg = raw_recvmsg,
+ .mmap = sock_no_mmap,
+};
+
+static struct proto raw_proto __read_mostly = {
+ .name = "CAN_RAW",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof(struct raw_sock),
+ .init = raw_init,
+};
+
+static const struct can_proto raw_can_proto = {
+ .type = SOCK_RAW,
+ .protocol = CAN_RAW,
+ .ops = &raw_ops,
+ .prot = &raw_proto,
+};
+
+static struct notifier_block canraw_notifier = {
+ .notifier_call = raw_notifier
+};
+
+static __init int raw_module_init(void)
+{
+ int err;
+
+ pr_info("can: raw protocol\n");
+
+ err = register_netdevice_notifier(&canraw_notifier);
+ if (err)
+ return err;
+
+ err = can_proto_register(&raw_can_proto);
+ if (err < 0) {
+ pr_err("can: registration of raw protocol failed\n");
+ goto register_proto_failed;
+ }
+
+ return 0;
+
+register_proto_failed:
+ unregister_netdevice_notifier(&canraw_notifier);
+ return err;
+}
+
+static __exit void raw_module_exit(void)
+{
+ can_proto_unregister(&raw_can_proto);
+ unregister_netdevice_notifier(&canraw_notifier);
+}
+
+module_init(raw_module_init);
+module_exit(raw_module_exit);
diff --git a/net/ceph/Kconfig b/net/ceph/Kconfig
new file mode 100644
index 000000000000..ea60e3ef0834
--- /dev/null
+++ b/net/ceph/Kconfig
@@ -0,0 +1,46 @@
+# SPDX-License-Identifier: GPL-2.0-only
+config CEPH_LIB
+ tristate "Ceph core library"
+ depends on INET
+ select CRC32
+ select CRYPTO_AES
+ select CRYPTO_CBC
+ select CRYPTO_GCM
+ select CRYPTO_LIB_SHA256
+ select CRYPTO
+ select KEYS
+ default n
+ help
+ Choose Y or M here to include cephlib, which provides the
+ common functionality to both the Ceph filesystem and
+ to the rados block device (rbd).
+
+ More information at https://ceph.io/.
+
+ If unsure, say N.
+
+config CEPH_LIB_PRETTYDEBUG
+ bool "Include file:line in ceph debug output"
+ depends on CEPH_LIB
+ default n
+ help
+ If you say Y here, debug output will include a filename and
+ line to aid debugging. This increases kernel size and slows
+ execution slightly when debug call sites are enabled (e.g.,
+ via CONFIG_DYNAMIC_DEBUG).
+
+ If unsure, say N.
+
+config CEPH_LIB_USE_DNS_RESOLVER
+ bool "Use in-kernel support for DNS lookup"
+ depends on CEPH_LIB
+ select DNS_RESOLVER
+ default n
+ help
+ If you say Y here, hostnames (e.g. monitor addresses) will
+ be resolved using the CONFIG_DNS_RESOLVER facility.
+
+ For information on how to use CONFIG_DNS_RESOLVER consult
+ Documentation/networking/dns_resolver.rst
+
+ If unsure, say N.
diff --git a/net/ceph/Makefile b/net/ceph/Makefile
new file mode 100644
index 000000000000..8802a0c0155d
--- /dev/null
+++ b/net/ceph/Makefile
@@ -0,0 +1,18 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for CEPH filesystem.
+#
+obj-$(CONFIG_CEPH_LIB) += libceph.o
+
+libceph-y := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \
+ mon_client.o decode.o \
+ cls_lock_client.o \
+ osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \
+ striper.o \
+ debugfs.o \
+ auth.o auth_none.o \
+ crypto.o armor.o \
+ auth_x.o \
+ ceph_strings.o ceph_hash.o \
+ pagevec.o snapshot.o string_table.o \
+ messenger_v1.o messenger_v2.o
diff --git a/net/ceph/armor.c b/net/ceph/armor.c
new file mode 100644
index 000000000000..0db8065928df
--- /dev/null
+++ b/net/ceph/armor.c
@@ -0,0 +1,106 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/errno.h>
+
+int ceph_armor(char *dst, const char *src, const char *end);
+int ceph_unarmor(char *dst, const char *src, const char *end);
+
+/*
+ * base64 encode/decode.
+ */
+
+static const char *pem_key =
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+
+static int encode_bits(int c)
+{
+ return pem_key[c];
+}
+
+static int decode_bits(char c)
+{
+ if (c >= 'A' && c <= 'Z')
+ return c - 'A';
+ if (c >= 'a' && c <= 'z')
+ return c - 'a' + 26;
+ if (c >= '0' && c <= '9')
+ return c - '0' + 52;
+ if (c == '+')
+ return 62;
+ if (c == '/')
+ return 63;
+ if (c == '=')
+ return 0; /* just non-negative, please */
+ return -EINVAL;
+}
+
+int ceph_armor(char *dst, const char *src, const char *end)
+{
+ int olen = 0;
+ int line = 0;
+
+ while (src < end) {
+ unsigned char a, b, c;
+
+ a = *src++;
+ *dst++ = encode_bits(a >> 2);
+ if (src < end) {
+ b = *src++;
+ *dst++ = encode_bits(((a & 3) << 4) | (b >> 4));
+ if (src < end) {
+ c = *src++;
+ *dst++ = encode_bits(((b & 15) << 2) |
+ (c >> 6));
+ *dst++ = encode_bits(c & 63);
+ } else {
+ *dst++ = encode_bits((b & 15) << 2);
+ *dst++ = '=';
+ }
+ } else {
+ *dst++ = encode_bits(((a & 3) << 4));
+ *dst++ = '=';
+ *dst++ = '=';
+ }
+ olen += 4;
+ line += 4;
+ if (line == 64) {
+ line = 0;
+ *(dst++) = '\n';
+ olen++;
+ }
+ }
+ return olen;
+}
+
+int ceph_unarmor(char *dst, const char *src, const char *end)
+{
+ int olen = 0;
+
+ while (src < end) {
+ int a, b, c, d;
+
+ if (src[0] == '\n') {
+ src++;
+ continue;
+ }
+ if (src + 4 > end)
+ return -EINVAL;
+ a = decode_bits(src[0]);
+ b = decode_bits(src[1]);
+ c = decode_bits(src[2]);
+ d = decode_bits(src[3]);
+ if (a < 0 || b < 0 || c < 0 || d < 0)
+ return -EINVAL;
+
+ *dst++ = (a << 2) | (b >> 4);
+ if (src[2] == '=')
+ return olen + 1;
+ *dst++ = ((b & 15) << 4) | (c >> 2);
+ if (src[3] == '=')
+ return olen + 2;
+ *dst++ = ((c & 3) << 6) | d;
+ olen += 3;
+ src += 4;
+ }
+ return olen;
+}
diff --git a/net/ceph/auth.c b/net/ceph/auth.c
new file mode 100644
index 000000000000..d38c9eadbe2f
--- /dev/null
+++ b/net/ceph/auth.c
@@ -0,0 +1,659 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/module.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+
+#include <linux/ceph/types.h>
+#include <linux/ceph/decode.h>
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/messenger.h>
+#include "auth_none.h"
+#include "auth_x.h"
+
+
+/*
+ * get protocol handler
+ */
+static u32 supported_protocols[] = {
+ CEPH_AUTH_NONE,
+ CEPH_AUTH_CEPHX
+};
+
+static int init_protocol(struct ceph_auth_client *ac, int proto)
+{
+ dout("%s proto %d\n", __func__, proto);
+
+ switch (proto) {
+ case CEPH_AUTH_NONE:
+ return ceph_auth_none_init(ac);
+ case CEPH_AUTH_CEPHX:
+ return ceph_x_init(ac);
+ default:
+ pr_err("bad auth protocol %d\n", proto);
+ return -EINVAL;
+ }
+}
+
+void ceph_auth_set_global_id(struct ceph_auth_client *ac, u64 global_id)
+{
+ dout("%s global_id %llu\n", __func__, global_id);
+
+ if (!global_id)
+ pr_err("got zero global_id\n");
+
+ if (ac->global_id && global_id != ac->global_id)
+ pr_err("global_id changed from %llu to %llu\n", ac->global_id,
+ global_id);
+
+ ac->global_id = global_id;
+}
+
+/*
+ * setup, teardown.
+ */
+struct ceph_auth_client *ceph_auth_init(const char *name,
+ const struct ceph_crypto_key *key,
+ const int *con_modes)
+{
+ struct ceph_auth_client *ac;
+
+ ac = kzalloc(sizeof(*ac), GFP_NOFS);
+ if (!ac)
+ return ERR_PTR(-ENOMEM);
+
+ mutex_init(&ac->mutex);
+ ac->negotiating = true;
+ if (name)
+ ac->name = name;
+ else
+ ac->name = CEPH_AUTH_NAME_DEFAULT;
+ ac->key = key;
+ ac->preferred_mode = con_modes[0];
+ ac->fallback_mode = con_modes[1];
+
+ dout("%s name '%s' preferred_mode %d fallback_mode %d\n", __func__,
+ ac->name, ac->preferred_mode, ac->fallback_mode);
+ return ac;
+}
+
+void ceph_auth_destroy(struct ceph_auth_client *ac)
+{
+ dout("auth_destroy %p\n", ac);
+ if (ac->ops)
+ ac->ops->destroy(ac);
+ kfree(ac);
+}
+
+/*
+ * Reset occurs when reconnecting to the monitor.
+ */
+void ceph_auth_reset(struct ceph_auth_client *ac)
+{
+ mutex_lock(&ac->mutex);
+ dout("auth_reset %p\n", ac);
+ if (ac->ops && !ac->negotiating)
+ ac->ops->reset(ac);
+ ac->negotiating = true;
+ mutex_unlock(&ac->mutex);
+}
+
+/*
+ * EntityName, not to be confused with entity_name_t
+ */
+int ceph_auth_entity_name_encode(const char *name, void **p, void *end)
+{
+ int len = strlen(name);
+
+ if (*p + 2*sizeof(u32) + len > end)
+ return -ERANGE;
+ ceph_encode_32(p, CEPH_ENTITY_TYPE_CLIENT);
+ ceph_encode_32(p, len);
+ ceph_encode_copy(p, name, len);
+ return 0;
+}
+
+/*
+ * Initiate protocol negotiation with monitor. Include entity name
+ * and list supported protocols.
+ */
+int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len)
+{
+ struct ceph_mon_request_header *monhdr = buf;
+ void *p = monhdr + 1, *end = buf + len, *lenp;
+ int i, num;
+ int ret;
+
+ mutex_lock(&ac->mutex);
+ dout("auth_build_hello\n");
+ monhdr->have_version = 0;
+ monhdr->session_mon = cpu_to_le16(-1);
+ monhdr->session_mon_tid = 0;
+
+ ceph_encode_32(&p, CEPH_AUTH_UNKNOWN); /* no protocol, yet */
+
+ lenp = p;
+ p += sizeof(u32);
+
+ ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
+ ceph_encode_8(&p, 1);
+ num = ARRAY_SIZE(supported_protocols);
+ ceph_encode_32(&p, num);
+ ceph_decode_need(&p, end, num * sizeof(u32), bad);
+ for (i = 0; i < num; i++)
+ ceph_encode_32(&p, supported_protocols[i]);
+
+ ret = ceph_auth_entity_name_encode(ac->name, &p, end);
+ if (ret < 0)
+ goto out;
+ ceph_decode_need(&p, end, sizeof(u64), bad);
+ ceph_encode_64(&p, ac->global_id);
+
+ ceph_encode_32(&lenp, p - lenp - sizeof(u32));
+ ret = p - buf;
+out:
+ mutex_unlock(&ac->mutex);
+ return ret;
+
+bad:
+ ret = -ERANGE;
+ goto out;
+}
+
+static int build_request(struct ceph_auth_client *ac, bool add_header,
+ void *buf, int buf_len)
+{
+ void *end = buf + buf_len;
+ void *p;
+ int ret;
+
+ p = buf;
+ if (add_header) {
+ /* struct ceph_mon_request_header + protocol */
+ ceph_encode_64_safe(&p, end, 0, e_range);
+ ceph_encode_16_safe(&p, end, -1, e_range);
+ ceph_encode_64_safe(&p, end, 0, e_range);
+ ceph_encode_32_safe(&p, end, ac->protocol, e_range);
+ }
+
+ ceph_encode_need(&p, end, sizeof(u32), e_range);
+ ret = ac->ops->build_request(ac, p + sizeof(u32), end);
+ if (ret < 0) {
+ pr_err("auth protocol '%s' building request failed: %d\n",
+ ceph_auth_proto_name(ac->protocol), ret);
+ return ret;
+ }
+ dout(" built request %d bytes\n", ret);
+ ceph_encode_32(&p, ret);
+ return p + ret - buf;
+
+e_range:
+ return -ERANGE;
+}
+
+/*
+ * Handle auth message from monitor.
+ */
+int ceph_handle_auth_reply(struct ceph_auth_client *ac,
+ void *buf, size_t len,
+ void *reply_buf, size_t reply_len)
+{
+ void *p = buf;
+ void *end = buf + len;
+ int protocol;
+ s32 result;
+ u64 global_id;
+ void *payload, *payload_end;
+ int payload_len;
+ char *result_msg;
+ int result_msg_len;
+ int ret = -EINVAL;
+
+ mutex_lock(&ac->mutex);
+ dout("handle_auth_reply %p %p\n", p, end);
+ ceph_decode_need(&p, end, sizeof(u32) * 3 + sizeof(u64), bad);
+ protocol = ceph_decode_32(&p);
+ result = ceph_decode_32(&p);
+ global_id = ceph_decode_64(&p);
+ payload_len = ceph_decode_32(&p);
+ payload = p;
+ p += payload_len;
+ ceph_decode_need(&p, end, sizeof(u32), bad);
+ result_msg_len = ceph_decode_32(&p);
+ result_msg = p;
+ p += result_msg_len;
+ if (p != end)
+ goto bad;
+
+ dout(" result %d '%.*s' gid %llu len %d\n", result, result_msg_len,
+ result_msg, global_id, payload_len);
+
+ payload_end = payload + payload_len;
+
+ if (ac->negotiating) {
+ /* server does not support our protocols? */
+ if (!protocol && result < 0) {
+ ret = result;
+ goto out;
+ }
+ /* set up (new) protocol handler? */
+ if (ac->protocol && ac->protocol != protocol) {
+ ac->ops->destroy(ac);
+ ac->protocol = 0;
+ ac->ops = NULL;
+ }
+ if (ac->protocol != protocol) {
+ ret = init_protocol(ac, protocol);
+ if (ret) {
+ pr_err("auth protocol '%s' init failed: %d\n",
+ ceph_auth_proto_name(protocol), ret);
+ goto out;
+ }
+ }
+
+ ac->negotiating = false;
+ }
+
+ if (result) {
+ pr_err("auth protocol '%s' mauth authentication failed: %d\n",
+ ceph_auth_proto_name(ac->protocol), result);
+ ret = result;
+ goto out;
+ }
+
+ ret = ac->ops->handle_reply(ac, global_id, payload, payload_end,
+ NULL, NULL, NULL, NULL);
+ if (ret == -EAGAIN) {
+ ret = build_request(ac, true, reply_buf, reply_len);
+ goto out;
+ } else if (ret) {
+ goto out;
+ }
+
+out:
+ mutex_unlock(&ac->mutex);
+ return ret;
+
+bad:
+ pr_err("failed to decode auth msg\n");
+ ret = -EINVAL;
+ goto out;
+}
+
+int ceph_build_auth(struct ceph_auth_client *ac,
+ void *msg_buf, size_t msg_len)
+{
+ int ret = 0;
+
+ mutex_lock(&ac->mutex);
+ if (ac->ops->should_authenticate(ac))
+ ret = build_request(ac, true, msg_buf, msg_len);
+ mutex_unlock(&ac->mutex);
+ return ret;
+}
+
+int ceph_auth_is_authenticated(struct ceph_auth_client *ac)
+{
+ int ret = 0;
+
+ mutex_lock(&ac->mutex);
+ if (ac->ops)
+ ret = ac->ops->is_authenticated(ac);
+ mutex_unlock(&ac->mutex);
+ return ret;
+}
+EXPORT_SYMBOL(ceph_auth_is_authenticated);
+
+int __ceph_auth_get_authorizer(struct ceph_auth_client *ac,
+ struct ceph_auth_handshake *auth,
+ int peer_type, bool force_new,
+ int *proto, int *pref_mode, int *fallb_mode)
+{
+ int ret;
+
+ mutex_lock(&ac->mutex);
+ if (force_new && auth->authorizer) {
+ ceph_auth_destroy_authorizer(auth->authorizer);
+ auth->authorizer = NULL;
+ }
+ if (!auth->authorizer)
+ ret = ac->ops->create_authorizer(ac, peer_type, auth);
+ else if (ac->ops->update_authorizer)
+ ret = ac->ops->update_authorizer(ac, peer_type, auth);
+ else
+ ret = 0;
+ if (ret)
+ goto out;
+
+ *proto = ac->protocol;
+ if (pref_mode && fallb_mode) {
+ *pref_mode = ac->preferred_mode;
+ *fallb_mode = ac->fallback_mode;
+ }
+
+out:
+ mutex_unlock(&ac->mutex);
+ return ret;
+}
+EXPORT_SYMBOL(__ceph_auth_get_authorizer);
+
+void ceph_auth_destroy_authorizer(struct ceph_authorizer *a)
+{
+ a->destroy(a);
+}
+EXPORT_SYMBOL(ceph_auth_destroy_authorizer);
+
+int ceph_auth_add_authorizer_challenge(struct ceph_auth_client *ac,
+ struct ceph_authorizer *a,
+ void *challenge_buf,
+ int challenge_buf_len)
+{
+ int ret = 0;
+
+ mutex_lock(&ac->mutex);
+ if (ac->ops && ac->ops->add_authorizer_challenge)
+ ret = ac->ops->add_authorizer_challenge(ac, a, challenge_buf,
+ challenge_buf_len);
+ mutex_unlock(&ac->mutex);
+ return ret;
+}
+EXPORT_SYMBOL(ceph_auth_add_authorizer_challenge);
+
+int ceph_auth_verify_authorizer_reply(struct ceph_auth_client *ac,
+ struct ceph_authorizer *a,
+ void *reply, int reply_len,
+ u8 *session_key, int *session_key_len,
+ u8 *con_secret, int *con_secret_len)
+{
+ int ret = 0;
+
+ mutex_lock(&ac->mutex);
+ if (ac->ops && ac->ops->verify_authorizer_reply)
+ ret = ac->ops->verify_authorizer_reply(ac, a,
+ reply, reply_len, session_key, session_key_len,
+ con_secret, con_secret_len);
+ mutex_unlock(&ac->mutex);
+ return ret;
+}
+EXPORT_SYMBOL(ceph_auth_verify_authorizer_reply);
+
+void ceph_auth_invalidate_authorizer(struct ceph_auth_client *ac, int peer_type)
+{
+ mutex_lock(&ac->mutex);
+ if (ac->ops && ac->ops->invalidate_authorizer)
+ ac->ops->invalidate_authorizer(ac, peer_type);
+ mutex_unlock(&ac->mutex);
+}
+EXPORT_SYMBOL(ceph_auth_invalidate_authorizer);
+
+/*
+ * msgr2 authentication
+ */
+
+static bool contains(const int *arr, int cnt, int val)
+{
+ int i;
+
+ for (i = 0; i < cnt; i++) {
+ if (arr[i] == val)
+ return true;
+ }
+
+ return false;
+}
+
+static int encode_con_modes(void **p, void *end, int pref_mode, int fallb_mode)
+{
+ WARN_ON(pref_mode == CEPH_CON_MODE_UNKNOWN);
+ if (fallb_mode != CEPH_CON_MODE_UNKNOWN) {
+ ceph_encode_32_safe(p, end, 2, e_range);
+ ceph_encode_32_safe(p, end, pref_mode, e_range);
+ ceph_encode_32_safe(p, end, fallb_mode, e_range);
+ } else {
+ ceph_encode_32_safe(p, end, 1, e_range);
+ ceph_encode_32_safe(p, end, pref_mode, e_range);
+ }
+
+ return 0;
+
+e_range:
+ return -ERANGE;
+}
+
+/*
+ * Similar to ceph_auth_build_hello().
+ */
+int ceph_auth_get_request(struct ceph_auth_client *ac, void *buf, int buf_len)
+{
+ int proto = ac->key ? CEPH_AUTH_CEPHX : CEPH_AUTH_NONE;
+ void *end = buf + buf_len;
+ void *lenp;
+ void *p;
+ int ret;
+
+ mutex_lock(&ac->mutex);
+ if (ac->protocol == CEPH_AUTH_UNKNOWN) {
+ ret = init_protocol(ac, proto);
+ if (ret) {
+ pr_err("auth protocol '%s' init failed: %d\n",
+ ceph_auth_proto_name(proto), ret);
+ goto out;
+ }
+ } else {
+ WARN_ON(ac->protocol != proto);
+ ac->ops->reset(ac);
+ }
+
+ p = buf;
+ ceph_encode_32_safe(&p, end, ac->protocol, e_range);
+ ret = encode_con_modes(&p, end, ac->preferred_mode, ac->fallback_mode);
+ if (ret)
+ goto out;
+
+ lenp = p;
+ p += 4; /* space for len */
+
+ ceph_encode_8_safe(&p, end, CEPH_AUTH_MODE_MON, e_range);
+ ret = ceph_auth_entity_name_encode(ac->name, &p, end);
+ if (ret)
+ goto out;
+
+ ceph_encode_64_safe(&p, end, ac->global_id, e_range);
+ ceph_encode_32(&lenp, p - lenp - 4);
+ ret = p - buf;
+
+out:
+ mutex_unlock(&ac->mutex);
+ return ret;
+
+e_range:
+ ret = -ERANGE;
+ goto out;
+}
+
+int ceph_auth_handle_reply_more(struct ceph_auth_client *ac, void *reply,
+ int reply_len, void *buf, int buf_len)
+{
+ int ret;
+
+ mutex_lock(&ac->mutex);
+ ret = ac->ops->handle_reply(ac, 0, reply, reply + reply_len,
+ NULL, NULL, NULL, NULL);
+ if (ret == -EAGAIN)
+ ret = build_request(ac, false, buf, buf_len);
+ else
+ WARN_ON(ret >= 0);
+ mutex_unlock(&ac->mutex);
+ return ret;
+}
+
+int ceph_auth_handle_reply_done(struct ceph_auth_client *ac,
+ u64 global_id, void *reply, int reply_len,
+ u8 *session_key, int *session_key_len,
+ u8 *con_secret, int *con_secret_len)
+{
+ int ret;
+
+ mutex_lock(&ac->mutex);
+ ret = ac->ops->handle_reply(ac, global_id, reply, reply + reply_len,
+ session_key, session_key_len,
+ con_secret, con_secret_len);
+ WARN_ON(ret == -EAGAIN || ret > 0);
+ mutex_unlock(&ac->mutex);
+ return ret;
+}
+
+bool ceph_auth_handle_bad_method(struct ceph_auth_client *ac,
+ int used_proto, int result,
+ const int *allowed_protos, int proto_cnt,
+ const int *allowed_modes, int mode_cnt)
+{
+ mutex_lock(&ac->mutex);
+ WARN_ON(used_proto != ac->protocol);
+
+ if (result == -EOPNOTSUPP) {
+ if (!contains(allowed_protos, proto_cnt, ac->protocol)) {
+ pr_err("auth protocol '%s' not allowed\n",
+ ceph_auth_proto_name(ac->protocol));
+ goto not_allowed;
+ }
+ if (!contains(allowed_modes, mode_cnt, ac->preferred_mode) &&
+ (ac->fallback_mode == CEPH_CON_MODE_UNKNOWN ||
+ !contains(allowed_modes, mode_cnt, ac->fallback_mode))) {
+ pr_err("preferred mode '%s' not allowed\n",
+ ceph_con_mode_name(ac->preferred_mode));
+ if (ac->fallback_mode == CEPH_CON_MODE_UNKNOWN)
+ pr_err("no fallback mode\n");
+ else
+ pr_err("fallback mode '%s' not allowed\n",
+ ceph_con_mode_name(ac->fallback_mode));
+ goto not_allowed;
+ }
+ }
+
+ WARN_ON(result == -EOPNOTSUPP || result >= 0);
+ pr_err("auth protocol '%s' msgr authentication failed: %d\n",
+ ceph_auth_proto_name(ac->protocol), result);
+
+ mutex_unlock(&ac->mutex);
+ return true;
+
+not_allowed:
+ mutex_unlock(&ac->mutex);
+ return false;
+}
+
+int ceph_auth_get_authorizer(struct ceph_auth_client *ac,
+ struct ceph_auth_handshake *auth,
+ int peer_type, void *buf, int *buf_len)
+{
+ void *end = buf + *buf_len;
+ int pref_mode, fallb_mode;
+ int proto;
+ void *p;
+ int ret;
+
+ ret = __ceph_auth_get_authorizer(ac, auth, peer_type, true, &proto,
+ &pref_mode, &fallb_mode);
+ if (ret)
+ return ret;
+
+ p = buf;
+ ceph_encode_32_safe(&p, end, proto, e_range);
+ ret = encode_con_modes(&p, end, pref_mode, fallb_mode);
+ if (ret)
+ return ret;
+
+ ceph_encode_32_safe(&p, end, auth->authorizer_buf_len, e_range);
+ *buf_len = p - buf;
+ return 0;
+
+e_range:
+ return -ERANGE;
+}
+EXPORT_SYMBOL(ceph_auth_get_authorizer);
+
+int ceph_auth_handle_svc_reply_more(struct ceph_auth_client *ac,
+ struct ceph_auth_handshake *auth,
+ void *reply, int reply_len,
+ void *buf, int *buf_len)
+{
+ void *end = buf + *buf_len;
+ void *p;
+ int ret;
+
+ ret = ceph_auth_add_authorizer_challenge(ac, auth->authorizer,
+ reply, reply_len);
+ if (ret)
+ return ret;
+
+ p = buf;
+ ceph_encode_32_safe(&p, end, auth->authorizer_buf_len, e_range);
+ *buf_len = p - buf;
+ return 0;
+
+e_range:
+ return -ERANGE;
+}
+EXPORT_SYMBOL(ceph_auth_handle_svc_reply_more);
+
+int ceph_auth_handle_svc_reply_done(struct ceph_auth_client *ac,
+ struct ceph_auth_handshake *auth,
+ void *reply, int reply_len,
+ u8 *session_key, int *session_key_len,
+ u8 *con_secret, int *con_secret_len)
+{
+ return ceph_auth_verify_authorizer_reply(ac, auth->authorizer,
+ reply, reply_len, session_key, session_key_len,
+ con_secret, con_secret_len);
+}
+EXPORT_SYMBOL(ceph_auth_handle_svc_reply_done);
+
+bool ceph_auth_handle_bad_authorizer(struct ceph_auth_client *ac,
+ int peer_type, int used_proto, int result,
+ const int *allowed_protos, int proto_cnt,
+ const int *allowed_modes, int mode_cnt)
+{
+ mutex_lock(&ac->mutex);
+ WARN_ON(used_proto != ac->protocol);
+
+ if (result == -EOPNOTSUPP) {
+ if (!contains(allowed_protos, proto_cnt, ac->protocol)) {
+ pr_err("auth protocol '%s' not allowed by %s\n",
+ ceph_auth_proto_name(ac->protocol),
+ ceph_entity_type_name(peer_type));
+ goto not_allowed;
+ }
+ if (!contains(allowed_modes, mode_cnt, ac->preferred_mode) &&
+ (ac->fallback_mode == CEPH_CON_MODE_UNKNOWN ||
+ !contains(allowed_modes, mode_cnt, ac->fallback_mode))) {
+ pr_err("preferred mode '%s' not allowed by %s\n",
+ ceph_con_mode_name(ac->preferred_mode),
+ ceph_entity_type_name(peer_type));
+ if (ac->fallback_mode == CEPH_CON_MODE_UNKNOWN)
+ pr_err("no fallback mode\n");
+ else
+ pr_err("fallback mode '%s' not allowed by %s\n",
+ ceph_con_mode_name(ac->fallback_mode),
+ ceph_entity_type_name(peer_type));
+ goto not_allowed;
+ }
+ }
+
+ WARN_ON(result == -EOPNOTSUPP || result >= 0);
+ pr_err("auth protocol '%s' authorization to %s failed: %d\n",
+ ceph_auth_proto_name(ac->protocol),
+ ceph_entity_type_name(peer_type), result);
+
+ if (ac->ops->invalidate_authorizer)
+ ac->ops->invalidate_authorizer(ac, peer_type);
+
+ mutex_unlock(&ac->mutex);
+ return true;
+
+not_allowed:
+ mutex_unlock(&ac->mutex);
+ return false;
+}
+EXPORT_SYMBOL(ceph_auth_handle_bad_authorizer);
diff --git a/net/ceph/auth_none.c b/net/ceph/auth_none.c
new file mode 100644
index 000000000000..77b5519bc45f
--- /dev/null
+++ b/net/ceph/auth_none.c
@@ -0,0 +1,146 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+
+#include <linux/ceph/decode.h>
+#include <linux/ceph/auth.h>
+
+#include "auth_none.h"
+
+static void reset(struct ceph_auth_client *ac)
+{
+ struct ceph_auth_none_info *xi = ac->private;
+
+ xi->starting = true;
+}
+
+static void destroy(struct ceph_auth_client *ac)
+{
+ kfree(ac->private);
+ ac->private = NULL;
+}
+
+static int is_authenticated(struct ceph_auth_client *ac)
+{
+ struct ceph_auth_none_info *xi = ac->private;
+
+ return !xi->starting;
+}
+
+static int should_authenticate(struct ceph_auth_client *ac)
+{
+ struct ceph_auth_none_info *xi = ac->private;
+
+ return xi->starting;
+}
+
+static int ceph_auth_none_build_authorizer(struct ceph_auth_client *ac,
+ struct ceph_none_authorizer *au)
+{
+ void *p = au->buf;
+ void *const end = p + sizeof(au->buf);
+ int ret;
+
+ ceph_encode_8_safe(&p, end, 1, e_range);
+ ret = ceph_auth_entity_name_encode(ac->name, &p, end);
+ if (ret < 0)
+ return ret;
+
+ ceph_encode_64_safe(&p, end, ac->global_id, e_range);
+ au->buf_len = p - (void *)au->buf;
+ dout("%s built authorizer len %d\n", __func__, au->buf_len);
+ return 0;
+
+e_range:
+ return -ERANGE;
+}
+
+static int build_request(struct ceph_auth_client *ac, void *buf, void *end)
+{
+ return 0;
+}
+
+/*
+ * the generic auth code decode the global_id, and we carry no actual
+ * authenticate state, so nothing happens here.
+ */
+static int handle_reply(struct ceph_auth_client *ac, u64 global_id,
+ void *buf, void *end, u8 *session_key,
+ int *session_key_len, u8 *con_secret,
+ int *con_secret_len)
+{
+ struct ceph_auth_none_info *xi = ac->private;
+
+ xi->starting = false;
+ ceph_auth_set_global_id(ac, global_id);
+ return 0;
+}
+
+static void ceph_auth_none_destroy_authorizer(struct ceph_authorizer *a)
+{
+ kfree(a);
+}
+
+/*
+ * build an 'authorizer' with our entity_name and global_id. it is
+ * identical for all services we connect to.
+ */
+static int ceph_auth_none_create_authorizer(
+ struct ceph_auth_client *ac, int peer_type,
+ struct ceph_auth_handshake *auth)
+{
+ struct ceph_none_authorizer *au;
+ int ret;
+
+ au = kmalloc(sizeof(*au), GFP_NOFS);
+ if (!au)
+ return -ENOMEM;
+
+ au->base.destroy = ceph_auth_none_destroy_authorizer;
+
+ ret = ceph_auth_none_build_authorizer(ac, au);
+ if (ret) {
+ kfree(au);
+ return ret;
+ }
+
+ auth->authorizer = (struct ceph_authorizer *) au;
+ auth->authorizer_buf = au->buf;
+ auth->authorizer_buf_len = au->buf_len;
+ auth->authorizer_reply_buf = NULL;
+ auth->authorizer_reply_buf_len = 0;
+
+ return 0;
+}
+
+static const struct ceph_auth_client_ops ceph_auth_none_ops = {
+ .reset = reset,
+ .destroy = destroy,
+ .is_authenticated = is_authenticated,
+ .should_authenticate = should_authenticate,
+ .build_request = build_request,
+ .handle_reply = handle_reply,
+ .create_authorizer = ceph_auth_none_create_authorizer,
+};
+
+int ceph_auth_none_init(struct ceph_auth_client *ac)
+{
+ struct ceph_auth_none_info *xi;
+
+ dout("ceph_auth_none_init %p\n", ac);
+ xi = kzalloc(sizeof(*xi), GFP_NOFS);
+ if (!xi)
+ return -ENOMEM;
+
+ xi->starting = true;
+
+ ac->protocol = CEPH_AUTH_NONE;
+ ac->private = xi;
+ ac->ops = &ceph_auth_none_ops;
+ return 0;
+}
diff --git a/net/ceph/auth_none.h b/net/ceph/auth_none.h
new file mode 100644
index 000000000000..bb121539e796
--- /dev/null
+++ b/net/ceph/auth_none.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _FS_CEPH_AUTH_NONE_H
+#define _FS_CEPH_AUTH_NONE_H
+
+#include <linux/slab.h>
+#include <linux/ceph/auth.h>
+
+/*
+ * null security mode.
+ *
+ * we use a single static authorizer that simply encodes our entity name
+ * and global id.
+ */
+
+struct ceph_none_authorizer {
+ struct ceph_authorizer base;
+ char buf[128];
+ int buf_len;
+};
+
+struct ceph_auth_none_info {
+ bool starting;
+};
+
+int ceph_auth_none_init(struct ceph_auth_client *ac);
+
+#endif
diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c
new file mode 100644
index 000000000000..a21c157daf7d
--- /dev/null
+++ b/net/ceph/auth_x.c
@@ -0,0 +1,1124 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+
+#include <linux/ceph/decode.h>
+#include <linux/ceph/auth.h>
+#include <linux/ceph/ceph_features.h>
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/messenger.h>
+
+#include "crypto.h"
+#include "auth_x.h"
+#include "auth_x_protocol.h"
+
+static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed);
+
+static int ceph_x_is_authenticated(struct ceph_auth_client *ac)
+{
+ struct ceph_x_info *xi = ac->private;
+ int missing;
+ int need; /* missing + need renewal */
+
+ ceph_x_validate_tickets(ac, &need);
+ missing = ac->want_keys & ~xi->have_keys;
+ WARN_ON((need & missing) != missing);
+ dout("%s want 0x%x have 0x%x missing 0x%x -> %d\n", __func__,
+ ac->want_keys, xi->have_keys, missing, !missing);
+ return !missing;
+}
+
+static int ceph_x_should_authenticate(struct ceph_auth_client *ac)
+{
+ struct ceph_x_info *xi = ac->private;
+ int need;
+
+ ceph_x_validate_tickets(ac, &need);
+ dout("%s want 0x%x have 0x%x need 0x%x -> %d\n", __func__,
+ ac->want_keys, xi->have_keys, need, !!need);
+ return !!need;
+}
+
+static int ceph_x_encrypt_offset(void)
+{
+ return sizeof(u32) + sizeof(struct ceph_x_encrypt_header);
+}
+
+static int ceph_x_encrypt_buflen(int ilen)
+{
+ return ceph_x_encrypt_offset() + ilen + 16;
+}
+
+static int ceph_x_encrypt(struct ceph_crypto_key *secret, void *buf,
+ int buf_len, int plaintext_len)
+{
+ struct ceph_x_encrypt_header *hdr = buf + sizeof(u32);
+ int ciphertext_len;
+ int ret;
+
+ hdr->struct_v = 1;
+ hdr->magic = cpu_to_le64(CEPHX_ENC_MAGIC);
+
+ ret = ceph_crypt(secret, true, buf + sizeof(u32), buf_len - sizeof(u32),
+ plaintext_len + sizeof(struct ceph_x_encrypt_header),
+ &ciphertext_len);
+ if (ret)
+ return ret;
+
+ ceph_encode_32(&buf, ciphertext_len);
+ return sizeof(u32) + ciphertext_len;
+}
+
+static int __ceph_x_decrypt(struct ceph_crypto_key *secret, void *p,
+ int ciphertext_len)
+{
+ struct ceph_x_encrypt_header *hdr = p;
+ int plaintext_len;
+ int ret;
+
+ ret = ceph_crypt(secret, false, p, ciphertext_len, ciphertext_len,
+ &plaintext_len);
+ if (ret)
+ return ret;
+
+ if (le64_to_cpu(hdr->magic) != CEPHX_ENC_MAGIC) {
+ pr_err("%s bad magic\n", __func__);
+ return -EINVAL;
+ }
+
+ return plaintext_len - sizeof(*hdr);
+}
+
+static int ceph_x_decrypt(struct ceph_crypto_key *secret, void **p, void *end)
+{
+ int ciphertext_len;
+ int ret;
+
+ ceph_decode_32_safe(p, end, ciphertext_len, e_inval);
+ ceph_decode_need(p, end, ciphertext_len, e_inval);
+
+ ret = __ceph_x_decrypt(secret, *p, ciphertext_len);
+ if (ret < 0)
+ return ret;
+
+ *p += ciphertext_len;
+ return ret;
+
+e_inval:
+ return -EINVAL;
+}
+
+/*
+ * get existing (or insert new) ticket handler
+ */
+static struct ceph_x_ticket_handler *
+get_ticket_handler(struct ceph_auth_client *ac, int service)
+{
+ struct ceph_x_ticket_handler *th;
+ struct ceph_x_info *xi = ac->private;
+ struct rb_node *parent = NULL, **p = &xi->ticket_handlers.rb_node;
+
+ while (*p) {
+ parent = *p;
+ th = rb_entry(parent, struct ceph_x_ticket_handler, node);
+ if (service < th->service)
+ p = &(*p)->rb_left;
+ else if (service > th->service)
+ p = &(*p)->rb_right;
+ else
+ return th;
+ }
+
+ /* add it */
+ th = kzalloc(sizeof(*th), GFP_NOFS);
+ if (!th)
+ return ERR_PTR(-ENOMEM);
+ th->service = service;
+ rb_link_node(&th->node, parent, p);
+ rb_insert_color(&th->node, &xi->ticket_handlers);
+ return th;
+}
+
+static void remove_ticket_handler(struct ceph_auth_client *ac,
+ struct ceph_x_ticket_handler *th)
+{
+ struct ceph_x_info *xi = ac->private;
+
+ dout("remove_ticket_handler %p %d\n", th, th->service);
+ rb_erase(&th->node, &xi->ticket_handlers);
+ ceph_crypto_key_destroy(&th->session_key);
+ if (th->ticket_blob)
+ ceph_buffer_put(th->ticket_blob);
+ kfree(th);
+}
+
+static int process_one_ticket(struct ceph_auth_client *ac,
+ struct ceph_crypto_key *secret,
+ void **p, void *end)
+{
+ struct ceph_x_info *xi = ac->private;
+ int type;
+ u8 tkt_struct_v, blob_struct_v;
+ struct ceph_x_ticket_handler *th;
+ void *dp, *dend;
+ int dlen;
+ char is_enc;
+ struct timespec64 validity;
+ void *tp, *tpend;
+ void **ptp;
+ struct ceph_crypto_key new_session_key = { 0 };
+ struct ceph_buffer *new_ticket_blob;
+ time64_t new_expires, new_renew_after;
+ u64 new_secret_id;
+ int ret;
+
+ ceph_decode_need(p, end, sizeof(u32) + 1, bad);
+
+ type = ceph_decode_32(p);
+ dout(" ticket type %d %s\n", type, ceph_entity_type_name(type));
+
+ tkt_struct_v = ceph_decode_8(p);
+ if (tkt_struct_v != 1)
+ goto bad;
+
+ th = get_ticket_handler(ac, type);
+ if (IS_ERR(th)) {
+ ret = PTR_ERR(th);
+ goto out;
+ }
+
+ /* blob for me */
+ dp = *p + ceph_x_encrypt_offset();
+ ret = ceph_x_decrypt(secret, p, end);
+ if (ret < 0)
+ goto out;
+ dout(" decrypted %d bytes\n", ret);
+ dend = dp + ret;
+
+ ceph_decode_8_safe(&dp, dend, tkt_struct_v, bad);
+ if (tkt_struct_v != 1)
+ goto bad;
+
+ ret = ceph_crypto_key_decode(&new_session_key, &dp, dend);
+ if (ret)
+ goto out;
+
+ ceph_decode_need(&dp, dend, sizeof(struct ceph_timespec), bad);
+ ceph_decode_timespec64(&validity, dp);
+ dp += sizeof(struct ceph_timespec);
+ new_expires = ktime_get_real_seconds() + validity.tv_sec;
+ new_renew_after = new_expires - (validity.tv_sec / 4);
+ dout(" expires=%llu renew_after=%llu\n", new_expires,
+ new_renew_after);
+
+ /* ticket blob for service */
+ ceph_decode_8_safe(p, end, is_enc, bad);
+ if (is_enc) {
+ /* encrypted */
+ tp = *p + ceph_x_encrypt_offset();
+ ret = ceph_x_decrypt(&th->session_key, p, end);
+ if (ret < 0)
+ goto out;
+ dout(" encrypted ticket, decrypted %d bytes\n", ret);
+ ptp = &tp;
+ tpend = tp + ret;
+ } else {
+ /* unencrypted */
+ ptp = p;
+ tpend = end;
+ }
+ ceph_decode_32_safe(ptp, tpend, dlen, bad);
+ dout(" ticket blob is %d bytes\n", dlen);
+ ceph_decode_need(ptp, tpend, 1 + sizeof(u64), bad);
+ blob_struct_v = ceph_decode_8(ptp);
+ if (blob_struct_v != 1)
+ goto bad;
+
+ new_secret_id = ceph_decode_64(ptp);
+ ret = ceph_decode_buffer(&new_ticket_blob, ptp, tpend);
+ if (ret)
+ goto out;
+
+ /* all is well, update our ticket */
+ ceph_crypto_key_destroy(&th->session_key);
+ if (th->ticket_blob)
+ ceph_buffer_put(th->ticket_blob);
+ th->session_key = new_session_key;
+ th->ticket_blob = new_ticket_blob;
+ th->secret_id = new_secret_id;
+ th->expires = new_expires;
+ th->renew_after = new_renew_after;
+ th->have_key = true;
+ dout(" got ticket service %d (%s) secret_id %lld len %d\n",
+ type, ceph_entity_type_name(type), th->secret_id,
+ (int)th->ticket_blob->vec.iov_len);
+ xi->have_keys |= th->service;
+ return 0;
+
+bad:
+ ret = -EINVAL;
+out:
+ ceph_crypto_key_destroy(&new_session_key);
+ return ret;
+}
+
+static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
+ struct ceph_crypto_key *secret,
+ void **p, void *end)
+{
+ u8 reply_struct_v;
+ u32 num;
+ int ret;
+
+ ceph_decode_8_safe(p, end, reply_struct_v, bad);
+ if (reply_struct_v != 1)
+ return -EINVAL;
+
+ ceph_decode_32_safe(p, end, num, bad);
+ dout("%d tickets\n", num);
+
+ while (num--) {
+ ret = process_one_ticket(ac, secret, p, end);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+
+bad:
+ return -EINVAL;
+}
+
+/*
+ * Encode and encrypt the second part (ceph_x_authorize_b) of the
+ * authorizer. The first part (ceph_x_authorize_a) should already be
+ * encoded.
+ */
+static int encrypt_authorizer(struct ceph_x_authorizer *au,
+ u64 *server_challenge)
+{
+ struct ceph_x_authorize_a *msg_a;
+ struct ceph_x_authorize_b *msg_b;
+ void *p, *end;
+ int ret;
+
+ msg_a = au->buf->vec.iov_base;
+ WARN_ON(msg_a->ticket_blob.secret_id != cpu_to_le64(au->secret_id));
+ p = (void *)(msg_a + 1) + le32_to_cpu(msg_a->ticket_blob.blob_len);
+ end = au->buf->vec.iov_base + au->buf->vec.iov_len;
+
+ msg_b = p + ceph_x_encrypt_offset();
+ msg_b->struct_v = 2;
+ msg_b->nonce = cpu_to_le64(au->nonce);
+ if (server_challenge) {
+ msg_b->have_challenge = 1;
+ msg_b->server_challenge_plus_one =
+ cpu_to_le64(*server_challenge + 1);
+ } else {
+ msg_b->have_challenge = 0;
+ msg_b->server_challenge_plus_one = 0;
+ }
+
+ ret = ceph_x_encrypt(&au->session_key, p, end - p, sizeof(*msg_b));
+ if (ret < 0)
+ return ret;
+
+ p += ret;
+ if (server_challenge) {
+ WARN_ON(p != end);
+ } else {
+ WARN_ON(p > end);
+ au->buf->vec.iov_len = p - au->buf->vec.iov_base;
+ }
+
+ return 0;
+}
+
+static void ceph_x_authorizer_cleanup(struct ceph_x_authorizer *au)
+{
+ ceph_crypto_key_destroy(&au->session_key);
+ if (au->buf) {
+ ceph_buffer_put(au->buf);
+ au->buf = NULL;
+ }
+}
+
+static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
+ struct ceph_x_ticket_handler *th,
+ struct ceph_x_authorizer *au)
+{
+ int maxlen;
+ struct ceph_x_authorize_a *msg_a;
+ struct ceph_x_authorize_b *msg_b;
+ int ret;
+ int ticket_blob_len =
+ (th->ticket_blob ? th->ticket_blob->vec.iov_len : 0);
+
+ dout("build_authorizer for %s %p\n",
+ ceph_entity_type_name(th->service), au);
+
+ ceph_crypto_key_destroy(&au->session_key);
+ ret = ceph_crypto_key_clone(&au->session_key, &th->session_key);
+ if (ret)
+ goto out_au;
+
+ maxlen = sizeof(*msg_a) + ticket_blob_len +
+ ceph_x_encrypt_buflen(sizeof(*msg_b));
+ dout(" need len %d\n", maxlen);
+ if (au->buf && au->buf->alloc_len < maxlen) {
+ ceph_buffer_put(au->buf);
+ au->buf = NULL;
+ }
+ if (!au->buf) {
+ au->buf = ceph_buffer_new(maxlen, GFP_NOFS);
+ if (!au->buf) {
+ ret = -ENOMEM;
+ goto out_au;
+ }
+ }
+ au->service = th->service;
+ WARN_ON(!th->secret_id);
+ au->secret_id = th->secret_id;
+
+ msg_a = au->buf->vec.iov_base;
+ msg_a->struct_v = 1;
+ msg_a->global_id = cpu_to_le64(ac->global_id);
+ msg_a->service_id = cpu_to_le32(th->service);
+ msg_a->ticket_blob.struct_v = 1;
+ msg_a->ticket_blob.secret_id = cpu_to_le64(th->secret_id);
+ msg_a->ticket_blob.blob_len = cpu_to_le32(ticket_blob_len);
+ if (ticket_blob_len) {
+ memcpy(msg_a->ticket_blob.blob, th->ticket_blob->vec.iov_base,
+ th->ticket_blob->vec.iov_len);
+ }
+ dout(" th %p secret_id %lld %lld\n", th, th->secret_id,
+ le64_to_cpu(msg_a->ticket_blob.secret_id));
+
+ get_random_bytes(&au->nonce, sizeof(au->nonce));
+ ret = encrypt_authorizer(au, NULL);
+ if (ret) {
+ pr_err("failed to encrypt authorizer: %d", ret);
+ goto out_au;
+ }
+
+ dout(" built authorizer nonce %llx len %d\n", au->nonce,
+ (int)au->buf->vec.iov_len);
+ return 0;
+
+out_au:
+ ceph_x_authorizer_cleanup(au);
+ return ret;
+}
+
+static int ceph_x_encode_ticket(struct ceph_x_ticket_handler *th,
+ void **p, void *end)
+{
+ ceph_decode_need(p, end, 1 + sizeof(u64), bad);
+ ceph_encode_8(p, 1);
+ ceph_encode_64(p, th->secret_id);
+ if (th->ticket_blob) {
+ const char *buf = th->ticket_blob->vec.iov_base;
+ u32 len = th->ticket_blob->vec.iov_len;
+
+ ceph_encode_32_safe(p, end, len, bad);
+ ceph_encode_copy_safe(p, end, buf, len, bad);
+ } else {
+ ceph_encode_32_safe(p, end, 0, bad);
+ }
+
+ return 0;
+bad:
+ return -ERANGE;
+}
+
+static bool need_key(struct ceph_x_ticket_handler *th)
+{
+ if (!th->have_key)
+ return true;
+
+ return ktime_get_real_seconds() >= th->renew_after;
+}
+
+static bool have_key(struct ceph_x_ticket_handler *th)
+{
+ if (th->have_key && ktime_get_real_seconds() >= th->expires) {
+ dout("ticket %d (%s) secret_id %llu expired\n", th->service,
+ ceph_entity_type_name(th->service), th->secret_id);
+ th->have_key = false;
+ }
+
+ return th->have_key;
+}
+
+static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed)
+{
+ int want = ac->want_keys;
+ struct ceph_x_info *xi = ac->private;
+ int service;
+
+ *pneed = ac->want_keys & ~(xi->have_keys);
+
+ for (service = 1; service <= want; service <<= 1) {
+ struct ceph_x_ticket_handler *th;
+
+ if (!(ac->want_keys & service))
+ continue;
+
+ if (*pneed & service)
+ continue;
+
+ th = get_ticket_handler(ac, service);
+ if (IS_ERR(th)) {
+ *pneed |= service;
+ continue;
+ }
+
+ if (need_key(th))
+ *pneed |= service;
+ if (!have_key(th))
+ xi->have_keys &= ~service;
+ }
+}
+
+static int ceph_x_build_request(struct ceph_auth_client *ac,
+ void *buf, void *end)
+{
+ struct ceph_x_info *xi = ac->private;
+ int need;
+ struct ceph_x_request_header *head = buf;
+ void *p;
+ int ret;
+ struct ceph_x_ticket_handler *th =
+ get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
+
+ if (IS_ERR(th))
+ return PTR_ERR(th);
+
+ ceph_x_validate_tickets(ac, &need);
+ dout("%s want 0x%x have 0x%x need 0x%x\n", __func__, ac->want_keys,
+ xi->have_keys, need);
+
+ if (need & CEPH_ENTITY_TYPE_AUTH) {
+ struct ceph_x_authenticate *auth = (void *)(head + 1);
+ void *enc_buf = xi->auth_authorizer.enc_buf;
+ struct ceph_x_challenge_blob *blob = enc_buf +
+ ceph_x_encrypt_offset();
+ u64 *u;
+
+ p = auth + 1;
+ if (p > end)
+ return -ERANGE;
+
+ dout(" get_auth_session_key\n");
+ head->op = cpu_to_le16(CEPHX_GET_AUTH_SESSION_KEY);
+
+ /* encrypt and hash */
+ get_random_bytes(&auth->client_challenge, sizeof(u64));
+ blob->client_challenge = auth->client_challenge;
+ blob->server_challenge = cpu_to_le64(xi->server_challenge);
+ ret = ceph_x_encrypt(&xi->secret, enc_buf, CEPHX_AU_ENC_BUF_LEN,
+ sizeof(*blob));
+ if (ret < 0)
+ return ret;
+
+ auth->struct_v = 3; /* nautilus+ */
+ auth->key = 0;
+ for (u = (u64 *)enc_buf; u + 1 <= (u64 *)(enc_buf + ret); u++)
+ auth->key ^= *(__le64 *)u;
+ dout(" server_challenge %llx client_challenge %llx key %llx\n",
+ xi->server_challenge, le64_to_cpu(auth->client_challenge),
+ le64_to_cpu(auth->key));
+
+ /* now encode the old ticket if exists */
+ ret = ceph_x_encode_ticket(th, &p, end);
+ if (ret < 0)
+ return ret;
+
+ /* nautilus+: request service tickets at the same time */
+ need = ac->want_keys & ~CEPH_ENTITY_TYPE_AUTH;
+ WARN_ON(!need);
+ ceph_encode_32_safe(&p, end, need, e_range);
+ return p - buf;
+ }
+
+ if (need) {
+ dout(" get_principal_session_key\n");
+ ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer);
+ if (ret)
+ return ret;
+
+ p = buf;
+ ceph_encode_16_safe(&p, end, CEPHX_GET_PRINCIPAL_SESSION_KEY,
+ e_range);
+ ceph_encode_copy_safe(&p, end,
+ xi->auth_authorizer.buf->vec.iov_base,
+ xi->auth_authorizer.buf->vec.iov_len, e_range);
+ ceph_encode_8_safe(&p, end, 1, e_range);
+ ceph_encode_32_safe(&p, end, need, e_range);
+ return p - buf;
+ }
+
+ return 0;
+
+e_range:
+ return -ERANGE;
+}
+
+static int decode_con_secret(void **p, void *end, u8 *con_secret,
+ int *con_secret_len)
+{
+ int len;
+
+ ceph_decode_32_safe(p, end, len, bad);
+ ceph_decode_need(p, end, len, bad);
+
+ dout("%s len %d\n", __func__, len);
+ if (con_secret) {
+ if (len > CEPH_MAX_CON_SECRET_LEN) {
+ pr_err("connection secret too big %d\n", len);
+ goto bad_memzero;
+ }
+ memcpy(con_secret, *p, len);
+ *con_secret_len = len;
+ }
+ memzero_explicit(*p, len);
+ *p += len;
+ return 0;
+
+bad_memzero:
+ memzero_explicit(*p, len);
+bad:
+ pr_err("failed to decode connection secret\n");
+ return -EINVAL;
+}
+
+static int handle_auth_session_key(struct ceph_auth_client *ac, u64 global_id,
+ void **p, void *end,
+ u8 *session_key, int *session_key_len,
+ u8 *con_secret, int *con_secret_len)
+{
+ struct ceph_x_info *xi = ac->private;
+ struct ceph_x_ticket_handler *th;
+ void *dp, *dend;
+ int len;
+ int ret;
+
+ /* AUTH ticket */
+ ret = ceph_x_proc_ticket_reply(ac, &xi->secret, p, end);
+ if (ret)
+ return ret;
+
+ ceph_auth_set_global_id(ac, global_id);
+ if (*p == end) {
+ /* pre-nautilus (or didn't request service tickets!) */
+ WARN_ON(session_key || con_secret);
+ return 0;
+ }
+
+ th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
+ if (IS_ERR(th))
+ return PTR_ERR(th);
+
+ if (session_key) {
+ memcpy(session_key, th->session_key.key, th->session_key.len);
+ *session_key_len = th->session_key.len;
+ }
+
+ /* connection secret */
+ ceph_decode_32_safe(p, end, len, e_inval);
+ ceph_decode_need(p, end, len, e_inval);
+ dout("%s connection secret blob len %d\n", __func__, len);
+ if (len > 0) {
+ dp = *p + ceph_x_encrypt_offset();
+ ret = ceph_x_decrypt(&th->session_key, p, *p + len);
+ if (ret < 0)
+ return ret;
+
+ dout("%s decrypted %d bytes\n", __func__, ret);
+ dend = dp + ret;
+
+ ret = decode_con_secret(&dp, dend, con_secret, con_secret_len);
+ if (ret)
+ return ret;
+ }
+
+ /* service tickets */
+ ceph_decode_32_safe(p, end, len, e_inval);
+ ceph_decode_need(p, end, len, e_inval);
+ dout("%s service tickets blob len %d\n", __func__, len);
+ if (len > 0) {
+ ret = ceph_x_proc_ticket_reply(ac, &th->session_key,
+ p, *p + len);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+
+e_inval:
+ return -EINVAL;
+}
+
+static int ceph_x_handle_reply(struct ceph_auth_client *ac, u64 global_id,
+ void *buf, void *end,
+ u8 *session_key, int *session_key_len,
+ u8 *con_secret, int *con_secret_len)
+{
+ struct ceph_x_info *xi = ac->private;
+ struct ceph_x_ticket_handler *th;
+ int len = end - buf;
+ int result;
+ void *p;
+ int op;
+ int ret;
+
+ if (xi->starting) {
+ /* it's a hello */
+ struct ceph_x_server_challenge *sc = buf;
+
+ if (len != sizeof(*sc))
+ return -EINVAL;
+ xi->server_challenge = le64_to_cpu(sc->server_challenge);
+ dout("handle_reply got server challenge %llx\n",
+ xi->server_challenge);
+ xi->starting = false;
+ xi->have_keys &= ~CEPH_ENTITY_TYPE_AUTH;
+ return -EAGAIN;
+ }
+
+ p = buf;
+ ceph_decode_16_safe(&p, end, op, e_inval);
+ ceph_decode_32_safe(&p, end, result, e_inval);
+ dout("handle_reply op %d result %d\n", op, result);
+ switch (op) {
+ case CEPHX_GET_AUTH_SESSION_KEY:
+ /* AUTH ticket + [connection secret] + service tickets */
+ ret = handle_auth_session_key(ac, global_id, &p, end,
+ session_key, session_key_len,
+ con_secret, con_secret_len);
+ break;
+
+ case CEPHX_GET_PRINCIPAL_SESSION_KEY:
+ th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
+ if (IS_ERR(th))
+ return PTR_ERR(th);
+
+ /* service tickets */
+ ret = ceph_x_proc_ticket_reply(ac, &th->session_key, &p, end);
+ break;
+
+ default:
+ return -EINVAL;
+ }
+ if (ret)
+ return ret;
+ if (ac->want_keys == xi->have_keys)
+ return 0;
+ return -EAGAIN;
+
+e_inval:
+ return -EINVAL;
+}
+
+static void ceph_x_destroy_authorizer(struct ceph_authorizer *a)
+{
+ struct ceph_x_authorizer *au = (void *)a;
+
+ ceph_x_authorizer_cleanup(au);
+ kfree(au);
+}
+
+static int ceph_x_create_authorizer(
+ struct ceph_auth_client *ac, int peer_type,
+ struct ceph_auth_handshake *auth)
+{
+ struct ceph_x_authorizer *au;
+ struct ceph_x_ticket_handler *th;
+ int ret;
+
+ th = get_ticket_handler(ac, peer_type);
+ if (IS_ERR(th))
+ return PTR_ERR(th);
+
+ au = kzalloc(sizeof(*au), GFP_NOFS);
+ if (!au)
+ return -ENOMEM;
+
+ au->base.destroy = ceph_x_destroy_authorizer;
+
+ ret = ceph_x_build_authorizer(ac, th, au);
+ if (ret) {
+ kfree(au);
+ return ret;
+ }
+
+ auth->authorizer = (struct ceph_authorizer *) au;
+ auth->authorizer_buf = au->buf->vec.iov_base;
+ auth->authorizer_buf_len = au->buf->vec.iov_len;
+ auth->authorizer_reply_buf = au->enc_buf;
+ auth->authorizer_reply_buf_len = CEPHX_AU_ENC_BUF_LEN;
+ auth->sign_message = ac->ops->sign_message;
+ auth->check_message_signature = ac->ops->check_message_signature;
+
+ return 0;
+}
+
+static int ceph_x_update_authorizer(
+ struct ceph_auth_client *ac, int peer_type,
+ struct ceph_auth_handshake *auth)
+{
+ struct ceph_x_authorizer *au;
+ struct ceph_x_ticket_handler *th;
+
+ th = get_ticket_handler(ac, peer_type);
+ if (IS_ERR(th))
+ return PTR_ERR(th);
+
+ au = (struct ceph_x_authorizer *)auth->authorizer;
+ if (au->secret_id < th->secret_id) {
+ dout("ceph_x_update_authorizer service %u secret %llu < %llu\n",
+ au->service, au->secret_id, th->secret_id);
+ return ceph_x_build_authorizer(ac, th, au);
+ }
+ return 0;
+}
+
+/*
+ * CephXAuthorizeChallenge
+ */
+static int decrypt_authorizer_challenge(struct ceph_crypto_key *secret,
+ void *challenge, int challenge_len,
+ u64 *server_challenge)
+{
+ void *dp, *dend;
+ int ret;
+
+ /* no leading len */
+ ret = __ceph_x_decrypt(secret, challenge, challenge_len);
+ if (ret < 0)
+ return ret;
+
+ dout("%s decrypted %d bytes\n", __func__, ret);
+ dp = challenge + sizeof(struct ceph_x_encrypt_header);
+ dend = dp + ret;
+
+ ceph_decode_skip_8(&dp, dend, e_inval); /* struct_v */
+ ceph_decode_64_safe(&dp, dend, *server_challenge, e_inval);
+ dout("%s server_challenge %llu\n", __func__, *server_challenge);
+ return 0;
+
+e_inval:
+ return -EINVAL;
+}
+
+static int ceph_x_add_authorizer_challenge(struct ceph_auth_client *ac,
+ struct ceph_authorizer *a,
+ void *challenge, int challenge_len)
+{
+ struct ceph_x_authorizer *au = (void *)a;
+ u64 server_challenge;
+ int ret;
+
+ ret = decrypt_authorizer_challenge(&au->session_key, challenge,
+ challenge_len, &server_challenge);
+ if (ret) {
+ pr_err("failed to decrypt authorize challenge: %d", ret);
+ return ret;
+ }
+
+ ret = encrypt_authorizer(au, &server_challenge);
+ if (ret) {
+ pr_err("failed to encrypt authorizer w/ challenge: %d", ret);
+ return ret;
+ }
+
+ return 0;
+}
+
+/*
+ * CephXAuthorizeReply
+ */
+static int decrypt_authorizer_reply(struct ceph_crypto_key *secret,
+ void **p, void *end, u64 *nonce_plus_one,
+ u8 *con_secret, int *con_secret_len)
+{
+ void *dp, *dend;
+ u8 struct_v;
+ int ret;
+
+ dp = *p + ceph_x_encrypt_offset();
+ ret = ceph_x_decrypt(secret, p, end);
+ if (ret < 0)
+ return ret;
+
+ dout("%s decrypted %d bytes\n", __func__, ret);
+ dend = dp + ret;
+
+ ceph_decode_8_safe(&dp, dend, struct_v, e_inval);
+ ceph_decode_64_safe(&dp, dend, *nonce_plus_one, e_inval);
+ dout("%s nonce_plus_one %llu\n", __func__, *nonce_plus_one);
+ if (struct_v >= 2) {
+ ret = decode_con_secret(&dp, dend, con_secret, con_secret_len);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+
+e_inval:
+ return -EINVAL;
+}
+
+static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac,
+ struct ceph_authorizer *a,
+ void *reply, int reply_len,
+ u8 *session_key, int *session_key_len,
+ u8 *con_secret, int *con_secret_len)
+{
+ struct ceph_x_authorizer *au = (void *)a;
+ u64 nonce_plus_one;
+ int ret;
+
+ if (session_key) {
+ memcpy(session_key, au->session_key.key, au->session_key.len);
+ *session_key_len = au->session_key.len;
+ }
+
+ ret = decrypt_authorizer_reply(&au->session_key, &reply,
+ reply + reply_len, &nonce_plus_one,
+ con_secret, con_secret_len);
+ if (ret)
+ return ret;
+
+ if (nonce_plus_one != au->nonce + 1) {
+ pr_err("failed to authenticate server\n");
+ return -EPERM;
+ }
+
+ return 0;
+}
+
+static void ceph_x_reset(struct ceph_auth_client *ac)
+{
+ struct ceph_x_info *xi = ac->private;
+
+ dout("reset\n");
+ xi->starting = true;
+ xi->server_challenge = 0;
+}
+
+static void ceph_x_destroy(struct ceph_auth_client *ac)
+{
+ struct ceph_x_info *xi = ac->private;
+ struct rb_node *p;
+
+ dout("ceph_x_destroy %p\n", ac);
+ ceph_crypto_key_destroy(&xi->secret);
+
+ while ((p = rb_first(&xi->ticket_handlers)) != NULL) {
+ struct ceph_x_ticket_handler *th =
+ rb_entry(p, struct ceph_x_ticket_handler, node);
+ remove_ticket_handler(ac, th);
+ }
+
+ ceph_x_authorizer_cleanup(&xi->auth_authorizer);
+
+ kfree(ac->private);
+ ac->private = NULL;
+}
+
+static void invalidate_ticket(struct ceph_auth_client *ac, int peer_type)
+{
+ struct ceph_x_ticket_handler *th;
+
+ th = get_ticket_handler(ac, peer_type);
+ if (IS_ERR(th))
+ return;
+
+ if (th->have_key) {
+ dout("ticket %d (%s) secret_id %llu invalidated\n",
+ th->service, ceph_entity_type_name(th->service),
+ th->secret_id);
+ th->have_key = false;
+ }
+}
+
+static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
+ int peer_type)
+{
+ /*
+ * We are to invalidate a service ticket in the hopes of
+ * getting a new, hopefully more valid, one. But, we won't get
+ * it unless our AUTH ticket is good, so invalidate AUTH ticket
+ * as well, just in case.
+ */
+ invalidate_ticket(ac, peer_type);
+ invalidate_ticket(ac, CEPH_ENTITY_TYPE_AUTH);
+}
+
+static int calc_signature(struct ceph_x_authorizer *au, struct ceph_msg *msg,
+ __le64 *psig)
+{
+ void *enc_buf = au->enc_buf;
+ int ret;
+
+ if (!CEPH_HAVE_FEATURE(msg->con->peer_features, CEPHX_V2)) {
+ struct {
+ __le32 len;
+ __le32 header_crc;
+ __le32 front_crc;
+ __le32 middle_crc;
+ __le32 data_crc;
+ } __packed *sigblock = enc_buf + ceph_x_encrypt_offset();
+
+ sigblock->len = cpu_to_le32(4*sizeof(u32));
+ sigblock->header_crc = msg->hdr.crc;
+ sigblock->front_crc = msg->footer.front_crc;
+ sigblock->middle_crc = msg->footer.middle_crc;
+ sigblock->data_crc = msg->footer.data_crc;
+
+ ret = ceph_x_encrypt(&au->session_key, enc_buf,
+ CEPHX_AU_ENC_BUF_LEN, sizeof(*sigblock));
+ if (ret < 0)
+ return ret;
+
+ *psig = *(__le64 *)(enc_buf + sizeof(u32));
+ } else {
+ struct {
+ __le32 header_crc;
+ __le32 front_crc;
+ __le32 front_len;
+ __le32 middle_crc;
+ __le32 middle_len;
+ __le32 data_crc;
+ __le32 data_len;
+ __le32 seq_lower_word;
+ } __packed *sigblock = enc_buf;
+ struct {
+ __le64 a, b, c, d;
+ } __packed *penc = enc_buf;
+ int ciphertext_len;
+
+ sigblock->header_crc = msg->hdr.crc;
+ sigblock->front_crc = msg->footer.front_crc;
+ sigblock->front_len = msg->hdr.front_len;
+ sigblock->middle_crc = msg->footer.middle_crc;
+ sigblock->middle_len = msg->hdr.middle_len;
+ sigblock->data_crc = msg->footer.data_crc;
+ sigblock->data_len = msg->hdr.data_len;
+ sigblock->seq_lower_word = *(__le32 *)&msg->hdr.seq;
+
+ /* no leading len, no ceph_x_encrypt_header */
+ ret = ceph_crypt(&au->session_key, true, enc_buf,
+ CEPHX_AU_ENC_BUF_LEN, sizeof(*sigblock),
+ &ciphertext_len);
+ if (ret)
+ return ret;
+
+ *psig = penc->a ^ penc->b ^ penc->c ^ penc->d;
+ }
+
+ return 0;
+}
+
+static int ceph_x_sign_message(struct ceph_auth_handshake *auth,
+ struct ceph_msg *msg)
+{
+ __le64 sig;
+ int ret;
+
+ if (ceph_test_opt(from_msgr(msg->con->msgr), NOMSGSIGN))
+ return 0;
+
+ ret = calc_signature((struct ceph_x_authorizer *)auth->authorizer,
+ msg, &sig);
+ if (ret)
+ return ret;
+
+ msg->footer.sig = sig;
+ msg->footer.flags |= CEPH_MSG_FOOTER_SIGNED;
+ return 0;
+}
+
+static int ceph_x_check_message_signature(struct ceph_auth_handshake *auth,
+ struct ceph_msg *msg)
+{
+ __le64 sig_check;
+ int ret;
+
+ if (ceph_test_opt(from_msgr(msg->con->msgr), NOMSGSIGN))
+ return 0;
+
+ ret = calc_signature((struct ceph_x_authorizer *)auth->authorizer,
+ msg, &sig_check);
+ if (ret)
+ return ret;
+ if (sig_check == msg->footer.sig)
+ return 0;
+ if (msg->footer.flags & CEPH_MSG_FOOTER_SIGNED)
+ dout("ceph_x_check_message_signature %p has signature %llx "
+ "expect %llx\n", msg, msg->footer.sig, sig_check);
+ else
+ dout("ceph_x_check_message_signature %p sender did not set "
+ "CEPH_MSG_FOOTER_SIGNED\n", msg);
+ return -EBADMSG;
+}
+
+static const struct ceph_auth_client_ops ceph_x_ops = {
+ .is_authenticated = ceph_x_is_authenticated,
+ .should_authenticate = ceph_x_should_authenticate,
+ .build_request = ceph_x_build_request,
+ .handle_reply = ceph_x_handle_reply,
+ .create_authorizer = ceph_x_create_authorizer,
+ .update_authorizer = ceph_x_update_authorizer,
+ .add_authorizer_challenge = ceph_x_add_authorizer_challenge,
+ .verify_authorizer_reply = ceph_x_verify_authorizer_reply,
+ .invalidate_authorizer = ceph_x_invalidate_authorizer,
+ .reset = ceph_x_reset,
+ .destroy = ceph_x_destroy,
+ .sign_message = ceph_x_sign_message,
+ .check_message_signature = ceph_x_check_message_signature,
+};
+
+
+int ceph_x_init(struct ceph_auth_client *ac)
+{
+ struct ceph_x_info *xi;
+ int ret;
+
+ dout("ceph_x_init %p\n", ac);
+ ret = -ENOMEM;
+ xi = kzalloc(sizeof(*xi), GFP_NOFS);
+ if (!xi)
+ goto out;
+
+ ret = -EINVAL;
+ if (!ac->key) {
+ pr_err("no secret set (for auth_x protocol)\n");
+ goto out_nomem;
+ }
+
+ ret = ceph_crypto_key_clone(&xi->secret, ac->key);
+ if (ret < 0) {
+ pr_err("cannot clone key: %d\n", ret);
+ goto out_nomem;
+ }
+
+ xi->starting = true;
+ xi->ticket_handlers = RB_ROOT;
+
+ ac->protocol = CEPH_AUTH_CEPHX;
+ ac->private = xi;
+ ac->ops = &ceph_x_ops;
+ return 0;
+
+out_nomem:
+ kfree(xi);
+out:
+ return ret;
+}
diff --git a/net/ceph/auth_x.h b/net/ceph/auth_x.h
new file mode 100644
index 000000000000..c03735f96df9
--- /dev/null
+++ b/net/ceph/auth_x.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _FS_CEPH_AUTH_X_H
+#define _FS_CEPH_AUTH_X_H
+
+#include <linux/rbtree.h>
+
+#include <linux/ceph/auth.h>
+
+#include "crypto.h"
+#include "auth_x_protocol.h"
+
+/*
+ * Handle ticket for a single service.
+ */
+struct ceph_x_ticket_handler {
+ struct rb_node node;
+ unsigned int service;
+
+ struct ceph_crypto_key session_key;
+ bool have_key;
+
+ u64 secret_id;
+ struct ceph_buffer *ticket_blob;
+
+ time64_t renew_after, expires;
+};
+
+#define CEPHX_AU_ENC_BUF_LEN 128 /* big enough for encrypted blob */
+
+struct ceph_x_authorizer {
+ struct ceph_authorizer base;
+ struct ceph_crypto_key session_key;
+ struct ceph_buffer *buf;
+ unsigned int service;
+ u64 nonce;
+ u64 secret_id;
+ char enc_buf[CEPHX_AU_ENC_BUF_LEN] __aligned(8);
+};
+
+struct ceph_x_info {
+ struct ceph_crypto_key secret;
+
+ bool starting;
+ u64 server_challenge;
+
+ unsigned int have_keys;
+ struct rb_root ticket_handlers;
+
+ struct ceph_x_authorizer auth_authorizer;
+};
+
+int ceph_x_init(struct ceph_auth_client *ac);
+
+#endif
diff --git a/net/ceph/auth_x_protocol.h b/net/ceph/auth_x_protocol.h
new file mode 100644
index 000000000000..9c60feeb1bcb
--- /dev/null
+++ b/net/ceph/auth_x_protocol.h
@@ -0,0 +1,99 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __FS_CEPH_AUTH_X_PROTOCOL
+#define __FS_CEPH_AUTH_X_PROTOCOL
+
+#define CEPHX_GET_AUTH_SESSION_KEY 0x0100
+#define CEPHX_GET_PRINCIPAL_SESSION_KEY 0x0200
+#define CEPHX_GET_ROTATING_KEY 0x0400
+
+/* common bits */
+struct ceph_x_ticket_blob {
+ __u8 struct_v;
+ __le64 secret_id;
+ __le32 blob_len;
+ char blob[];
+} __attribute__ ((packed));
+
+
+/* common request/reply headers */
+struct ceph_x_request_header {
+ __le16 op;
+} __attribute__ ((packed));
+
+struct ceph_x_reply_header {
+ __le16 op;
+ __le32 result;
+} __attribute__ ((packed));
+
+
+/* authenticate handshake */
+
+/* initial hello (no reply header) */
+struct ceph_x_server_challenge {
+ __u8 struct_v;
+ __le64 server_challenge;
+} __attribute__ ((packed));
+
+struct ceph_x_authenticate {
+ __u8 struct_v;
+ __le64 client_challenge;
+ __le64 key;
+ /* old_ticket blob */
+ /* nautilus+: other_keys */
+} __attribute__ ((packed));
+
+struct ceph_x_service_ticket_request {
+ __u8 struct_v;
+ __le32 keys;
+} __attribute__ ((packed));
+
+struct ceph_x_challenge_blob {
+ __le64 server_challenge;
+ __le64 client_challenge;
+} __attribute__ ((packed));
+
+
+
+/* authorize handshake */
+
+/*
+ * The authorizer consists of two pieces:
+ * a - service id, ticket blob
+ * b - encrypted with session key
+ */
+struct ceph_x_authorize_a {
+ __u8 struct_v;
+ __le64 global_id;
+ __le32 service_id;
+ struct ceph_x_ticket_blob ticket_blob;
+} __attribute__ ((packed));
+
+struct ceph_x_authorize_b {
+ __u8 struct_v;
+ __le64 nonce;
+ __u8 have_challenge;
+ __le64 server_challenge_plus_one;
+} __attribute__ ((packed));
+
+struct ceph_x_authorize_challenge {
+ __u8 struct_v;
+ __le64 server_challenge;
+} __attribute__ ((packed));
+
+struct ceph_x_authorize_reply {
+ __u8 struct_v;
+ __le64 nonce_plus_one;
+} __attribute__ ((packed));
+
+
+/*
+ * encryption bundle
+ */
+#define CEPHX_ENC_MAGIC 0xff009cad8826aa55ull
+
+struct ceph_x_encrypt_header {
+ __u8 struct_v;
+ __le64 magic;
+} __attribute__ ((packed));
+
+#endif
diff --git a/net/ceph/buffer.c b/net/ceph/buffer.c
new file mode 100644
index 000000000000..7e51f128045d
--- /dev/null
+++ b/net/ceph/buffer.c
@@ -0,0 +1,59 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/module.h>
+#include <linux/slab.h>
+
+#include <linux/ceph/buffer.h>
+#include <linux/ceph/decode.h>
+#include <linux/ceph/libceph.h> /* for kvmalloc */
+
+struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp)
+{
+ struct ceph_buffer *b;
+
+ b = kmalloc(sizeof(*b), gfp);
+ if (!b)
+ return NULL;
+
+ b->vec.iov_base = kvmalloc(len, gfp);
+ if (!b->vec.iov_base) {
+ kfree(b);
+ return NULL;
+ }
+
+ kref_init(&b->kref);
+ b->alloc_len = len;
+ b->vec.iov_len = len;
+ dout("buffer_new %p\n", b);
+ return b;
+}
+EXPORT_SYMBOL(ceph_buffer_new);
+
+void ceph_buffer_release(struct kref *kref)
+{
+ struct ceph_buffer *b = container_of(kref, struct ceph_buffer, kref);
+
+ dout("buffer_release %p\n", b);
+ kvfree(b->vec.iov_base);
+ kfree(b);
+}
+EXPORT_SYMBOL(ceph_buffer_release);
+
+int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end)
+{
+ size_t len;
+
+ ceph_decode_need(p, end, sizeof(u32), bad);
+ len = ceph_decode_32(p);
+ dout("decode_buffer len %d\n", (int)len);
+ ceph_decode_need(p, end, len, bad);
+ *b = ceph_buffer_new(len, GFP_NOFS);
+ if (!*b)
+ return -ENOMEM;
+ ceph_decode_copy(p, (*b)->vec.iov_base, len);
+ return 0;
+bad:
+ return -EINVAL;
+}
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
new file mode 100644
index 000000000000..e734e57be083
--- /dev/null
+++ b/net/ceph/ceph_common.c
@@ -0,0 +1,926 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/ceph/ceph_debug.h>
+#include <linux/backing-dev.h>
+#include <linux/ctype.h>
+#include <linux/fs.h>
+#include <linux/inet.h>
+#include <linux/in6.h>
+#include <linux/key.h>
+#include <keys/ceph-type.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/nsproxy.h>
+#include <linux/fs_parser.h>
+#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/statfs.h>
+#include <linux/string.h>
+#include <linux/vmalloc.h>
+
+
+#include <linux/ceph/ceph_features.h>
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/debugfs.h>
+#include <linux/ceph/decode.h>
+#include <linux/ceph/mon_client.h>
+#include <linux/ceph/auth.h>
+#include "crypto.h"
+
+
+/*
+ * Module compatibility interface. For now it doesn't do anything,
+ * but its existence signals a certain level of functionality.
+ *
+ * The data buffer is used to pass information both to and from
+ * libceph. The return value indicates whether libceph determines
+ * it is compatible with the caller (from another kernel module),
+ * given the provided data.
+ *
+ * The data pointer can be null.
+ */
+bool libceph_compatible(void *data)
+{
+ return true;
+}
+EXPORT_SYMBOL(libceph_compatible);
+
+static int param_get_supported_features(char *buffer,
+ const struct kernel_param *kp)
+{
+ return sprintf(buffer, "0x%llx", CEPH_FEATURES_SUPPORTED_DEFAULT);
+}
+static const struct kernel_param_ops param_ops_supported_features = {
+ .get = param_get_supported_features,
+};
+module_param_cb(supported_features, &param_ops_supported_features, NULL,
+ 0444);
+
+const char *ceph_msg_type_name(int type)
+{
+ switch (type) {
+ case CEPH_MSG_SHUTDOWN: return "shutdown";
+ case CEPH_MSG_PING: return "ping";
+ case CEPH_MSG_AUTH: return "auth";
+ case CEPH_MSG_AUTH_REPLY: return "auth_reply";
+ case CEPH_MSG_MON_MAP: return "mon_map";
+ case CEPH_MSG_MON_GET_MAP: return "mon_get_map";
+ case CEPH_MSG_MON_SUBSCRIBE: return "mon_subscribe";
+ case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack";
+ case CEPH_MSG_STATFS: return "statfs";
+ case CEPH_MSG_STATFS_REPLY: return "statfs_reply";
+ case CEPH_MSG_MON_GET_VERSION: return "mon_get_version";
+ case CEPH_MSG_MON_GET_VERSION_REPLY: return "mon_get_version_reply";
+ case CEPH_MSG_MDS_MAP: return "mds_map";
+ case CEPH_MSG_FS_MAP_USER: return "fs_map_user";
+ case CEPH_MSG_CLIENT_SESSION: return "client_session";
+ case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect";
+ case CEPH_MSG_CLIENT_REQUEST: return "client_request";
+ case CEPH_MSG_CLIENT_REQUEST_FORWARD: return "client_request_forward";
+ case CEPH_MSG_CLIENT_REPLY: return "client_reply";
+ case CEPH_MSG_CLIENT_CAPS: return "client_caps";
+ case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release";
+ case CEPH_MSG_CLIENT_QUOTA: return "client_quota";
+ case CEPH_MSG_CLIENT_SNAP: return "client_snap";
+ case CEPH_MSG_CLIENT_LEASE: return "client_lease";
+ case CEPH_MSG_POOLOP_REPLY: return "poolop_reply";
+ case CEPH_MSG_POOLOP: return "poolop";
+ case CEPH_MSG_MON_COMMAND: return "mon_command";
+ case CEPH_MSG_MON_COMMAND_ACK: return "mon_command_ack";
+ case CEPH_MSG_OSD_MAP: return "osd_map";
+ case CEPH_MSG_OSD_OP: return "osd_op";
+ case CEPH_MSG_OSD_OPREPLY: return "osd_opreply";
+ case CEPH_MSG_WATCH_NOTIFY: return "watch_notify";
+ case CEPH_MSG_OSD_BACKOFF: return "osd_backoff";
+ default: return "unknown";
+ }
+}
+EXPORT_SYMBOL(ceph_msg_type_name);
+
+/*
+ * Initially learn our fsid, or verify an fsid matches.
+ */
+int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)
+{
+ if (client->have_fsid) {
+ if (ceph_fsid_compare(&client->fsid, fsid)) {
+ pr_err("bad fsid, had %pU got %pU",
+ &client->fsid, fsid);
+ return -1;
+ }
+ } else {
+ memcpy(&client->fsid, fsid, sizeof(*fsid));
+ }
+ return 0;
+}
+EXPORT_SYMBOL(ceph_check_fsid);
+
+static int strcmp_null(const char *s1, const char *s2)
+{
+ if (!s1 && !s2)
+ return 0;
+ if (s1 && !s2)
+ return -1;
+ if (!s1 && s2)
+ return 1;
+ return strcmp(s1, s2);
+}
+
+int ceph_compare_options(struct ceph_options *new_opt,
+ struct ceph_client *client)
+{
+ struct ceph_options *opt1 = new_opt;
+ struct ceph_options *opt2 = client->options;
+ int ofs = offsetof(struct ceph_options, mon_addr);
+ int i;
+ int ret;
+
+ /*
+ * Don't bother comparing options if network namespaces don't
+ * match.
+ */
+ if (!net_eq(current->nsproxy->net_ns, read_pnet(&client->msgr.net)))
+ return -1;
+
+ ret = memcmp(opt1, opt2, ofs);
+ if (ret)
+ return ret;
+
+ ret = strcmp_null(opt1->name, opt2->name);
+ if (ret)
+ return ret;
+
+ if (opt1->key && !opt2->key)
+ return -1;
+ if (!opt1->key && opt2->key)
+ return 1;
+ if (opt1->key && opt2->key) {
+ if (opt1->key->type != opt2->key->type)
+ return -1;
+ if (opt1->key->created.tv_sec != opt2->key->created.tv_sec)
+ return -1;
+ if (opt1->key->created.tv_nsec != opt2->key->created.tv_nsec)
+ return -1;
+ if (opt1->key->len != opt2->key->len)
+ return -1;
+ if (opt1->key->key && !opt2->key->key)
+ return -1;
+ if (!opt1->key->key && opt2->key->key)
+ return 1;
+ if (opt1->key->key && opt2->key->key) {
+ ret = memcmp(opt1->key->key, opt2->key->key, opt1->key->len);
+ if (ret)
+ return ret;
+ }
+ }
+
+ ret = ceph_compare_crush_locs(&opt1->crush_locs, &opt2->crush_locs);
+ if (ret)
+ return ret;
+
+ /* any matching mon ip implies a match */
+ for (i = 0; i < opt1->num_mon; i++) {
+ if (ceph_monmap_contains(client->monc.monmap,
+ &opt1->mon_addr[i]))
+ return 0;
+ }
+ return -1;
+}
+EXPORT_SYMBOL(ceph_compare_options);
+
+int ceph_parse_fsid(const char *str, struct ceph_fsid *fsid)
+{
+ int i = 0;
+ char tmp[3];
+ int err = -EINVAL;
+ int d;
+
+ dout("%s '%s'\n", __func__, str);
+ tmp[2] = 0;
+ while (*str && i < 16) {
+ if (ispunct(*str)) {
+ str++;
+ continue;
+ }
+ if (!isxdigit(str[0]) || !isxdigit(str[1]))
+ break;
+ tmp[0] = str[0];
+ tmp[1] = str[1];
+ if (sscanf(tmp, "%x", &d) < 1)
+ break;
+ fsid->fsid[i] = d & 0xff;
+ i++;
+ str += 2;
+ }
+
+ if (i == 16)
+ err = 0;
+ dout("%s ret %d got fsid %pU\n", __func__, err, fsid);
+ return err;
+}
+EXPORT_SYMBOL(ceph_parse_fsid);
+
+/*
+ * ceph options
+ */
+enum {
+ Opt_osdkeepalivetimeout,
+ Opt_mount_timeout,
+ Opt_osd_idle_ttl,
+ Opt_osd_request_timeout,
+ /* int args above */
+ Opt_fsid,
+ Opt_name,
+ Opt_secret,
+ Opt_key,
+ Opt_ip,
+ Opt_crush_location,
+ Opt_read_from_replica,
+ Opt_ms_mode,
+ /* string args above */
+ Opt_share,
+ Opt_crc,
+ Opt_cephx_require_signatures,
+ Opt_cephx_sign_messages,
+ Opt_tcp_nodelay,
+ Opt_abort_on_full,
+ Opt_rxbounce,
+};
+
+enum {
+ Opt_read_from_replica_no,
+ Opt_read_from_replica_balance,
+ Opt_read_from_replica_localize,
+};
+
+static const struct constant_table ceph_param_read_from_replica[] = {
+ {"no", Opt_read_from_replica_no},
+ {"balance", Opt_read_from_replica_balance},
+ {"localize", Opt_read_from_replica_localize},
+ {}
+};
+
+enum ceph_ms_mode {
+ Opt_ms_mode_legacy,
+ Opt_ms_mode_crc,
+ Opt_ms_mode_secure,
+ Opt_ms_mode_prefer_crc,
+ Opt_ms_mode_prefer_secure
+};
+
+static const struct constant_table ceph_param_ms_mode[] = {
+ {"legacy", Opt_ms_mode_legacy},
+ {"crc", Opt_ms_mode_crc},
+ {"secure", Opt_ms_mode_secure},
+ {"prefer-crc", Opt_ms_mode_prefer_crc},
+ {"prefer-secure", Opt_ms_mode_prefer_secure},
+ {}
+};
+
+static const struct fs_parameter_spec ceph_parameters[] = {
+ fsparam_flag ("abort_on_full", Opt_abort_on_full),
+ __fsparam (NULL, "cephx_require_signatures", Opt_cephx_require_signatures,
+ fs_param_neg_with_no|fs_param_deprecated, NULL),
+ fsparam_flag_no ("cephx_sign_messages", Opt_cephx_sign_messages),
+ fsparam_flag_no ("crc", Opt_crc),
+ fsparam_string ("crush_location", Opt_crush_location),
+ fsparam_string ("fsid", Opt_fsid),
+ fsparam_string ("ip", Opt_ip),
+ fsparam_string ("key", Opt_key),
+ fsparam_u32 ("mount_timeout", Opt_mount_timeout),
+ fsparam_string ("name", Opt_name),
+ fsparam_u32 ("osd_idle_ttl", Opt_osd_idle_ttl),
+ fsparam_u32 ("osd_request_timeout", Opt_osd_request_timeout),
+ fsparam_u32 ("osdkeepalive", Opt_osdkeepalivetimeout),
+ fsparam_enum ("read_from_replica", Opt_read_from_replica,
+ ceph_param_read_from_replica),
+ fsparam_flag ("rxbounce", Opt_rxbounce),
+ fsparam_enum ("ms_mode", Opt_ms_mode,
+ ceph_param_ms_mode),
+ fsparam_string ("secret", Opt_secret),
+ fsparam_flag_no ("share", Opt_share),
+ fsparam_flag_no ("tcp_nodelay", Opt_tcp_nodelay),
+ {}
+};
+
+struct ceph_options *ceph_alloc_options(void)
+{
+ struct ceph_options *opt;
+
+ opt = kzalloc(sizeof(*opt), GFP_KERNEL);
+ if (!opt)
+ return NULL;
+
+ opt->crush_locs = RB_ROOT;
+ opt->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*opt->mon_addr),
+ GFP_KERNEL);
+ if (!opt->mon_addr) {
+ kfree(opt);
+ return NULL;
+ }
+
+ opt->flags = CEPH_OPT_DEFAULT;
+ opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
+ opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT;
+ opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;
+ opt->osd_request_timeout = CEPH_OSD_REQUEST_TIMEOUT_DEFAULT;
+ opt->read_from_replica = CEPH_READ_FROM_REPLICA_DEFAULT;
+ opt->con_modes[0] = CEPH_CON_MODE_UNKNOWN;
+ opt->con_modes[1] = CEPH_CON_MODE_UNKNOWN;
+ return opt;
+}
+EXPORT_SYMBOL(ceph_alloc_options);
+
+void ceph_destroy_options(struct ceph_options *opt)
+{
+ dout("destroy_options %p\n", opt);
+ if (!opt)
+ return;
+
+ ceph_clear_crush_locs(&opt->crush_locs);
+ kfree(opt->name);
+ if (opt->key) {
+ ceph_crypto_key_destroy(opt->key);
+ kfree(opt->key);
+ }
+ kfree(opt->mon_addr);
+ kfree(opt);
+}
+EXPORT_SYMBOL(ceph_destroy_options);
+
+/* get secret from key store */
+static int get_secret(struct ceph_crypto_key *dst, const char *name,
+ struct p_log *log)
+{
+ struct key *ukey;
+ int key_err;
+ int err = 0;
+ struct ceph_crypto_key *ckey;
+
+ ukey = request_key(&key_type_ceph, name, NULL);
+ if (IS_ERR(ukey)) {
+ /* request_key errors don't map nicely to mount(2)
+ errors; don't even try, but still printk */
+ key_err = PTR_ERR(ukey);
+ switch (key_err) {
+ case -ENOKEY:
+ error_plog(log, "Failed due to key not found: %s",
+ name);
+ break;
+ case -EKEYEXPIRED:
+ error_plog(log, "Failed due to expired key: %s",
+ name);
+ break;
+ case -EKEYREVOKED:
+ error_plog(log, "Failed due to revoked key: %s",
+ name);
+ break;
+ default:
+ error_plog(log, "Failed due to key error %d: %s",
+ key_err, name);
+ }
+ err = -EPERM;
+ goto out;
+ }
+
+ ckey = ukey->payload.data[0];
+ err = ceph_crypto_key_clone(dst, ckey);
+ if (err)
+ goto out_key;
+ /* pass through, err is 0 */
+
+out_key:
+ key_put(ukey);
+out:
+ return err;
+}
+
+int ceph_parse_mon_ips(const char *buf, size_t len, struct ceph_options *opt,
+ struct fc_log *l, char delim)
+{
+ struct p_log log = {.prefix = "libceph", .log = l};
+ int ret;
+
+ /* ip1[:port1][<delim>ip2[:port2]...] */
+ ret = ceph_parse_ips(buf, buf + len, opt->mon_addr, CEPH_MAX_MON,
+ &opt->num_mon, delim);
+ if (ret) {
+ error_plog(&log, "Failed to parse monitor IPs: %d", ret);
+ return ret;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(ceph_parse_mon_ips);
+
+int ceph_parse_param(struct fs_parameter *param, struct ceph_options *opt,
+ struct fc_log *l)
+{
+ struct fs_parse_result result;
+ int token, err;
+ struct p_log log = {.prefix = "libceph", .log = l};
+
+ token = __fs_parse(&log, ceph_parameters, param, &result);
+ dout("%s fs_parse '%s' token %d\n", __func__, param->key, token);
+ if (token < 0)
+ return token;
+
+ switch (token) {
+ case Opt_ip:
+ err = ceph_parse_ips(param->string,
+ param->string + param->size,
+ &opt->my_addr, 1, NULL, ',');
+ if (err) {
+ error_plog(&log, "Failed to parse ip: %d", err);
+ return err;
+ }
+ opt->flags |= CEPH_OPT_MYIP;
+ break;
+
+ case Opt_fsid:
+ err = ceph_parse_fsid(param->string, &opt->fsid);
+ if (err) {
+ error_plog(&log, "Failed to parse fsid: %d", err);
+ return err;
+ }
+ opt->flags |= CEPH_OPT_FSID;
+ break;
+ case Opt_name:
+ kfree(opt->name);
+ opt->name = param->string;
+ param->string = NULL;
+ break;
+ case Opt_secret:
+ ceph_crypto_key_destroy(opt->key);
+ kfree(opt->key);
+
+ opt->key = kzalloc(sizeof(*opt->key), GFP_KERNEL);
+ if (!opt->key)
+ return -ENOMEM;
+ err = ceph_crypto_key_unarmor(opt->key, param->string);
+ if (err) {
+ error_plog(&log, "Failed to parse secret: %d", err);
+ return err;
+ }
+ break;
+ case Opt_key:
+ ceph_crypto_key_destroy(opt->key);
+ kfree(opt->key);
+
+ opt->key = kzalloc(sizeof(*opt->key), GFP_KERNEL);
+ if (!opt->key)
+ return -ENOMEM;
+ return get_secret(opt->key, param->string, &log);
+ case Opt_crush_location:
+ ceph_clear_crush_locs(&opt->crush_locs);
+ err = ceph_parse_crush_location(param->string,
+ &opt->crush_locs);
+ if (err) {
+ error_plog(&log, "Failed to parse CRUSH location: %d",
+ err);
+ return err;
+ }
+ break;
+ case Opt_read_from_replica:
+ switch (result.uint_32) {
+ case Opt_read_from_replica_no:
+ opt->read_from_replica = 0;
+ break;
+ case Opt_read_from_replica_balance:
+ opt->read_from_replica = CEPH_OSD_FLAG_BALANCE_READS;
+ break;
+ case Opt_read_from_replica_localize:
+ opt->read_from_replica = CEPH_OSD_FLAG_LOCALIZE_READS;
+ break;
+ default:
+ BUG();
+ }
+ break;
+ case Opt_ms_mode:
+ switch (result.uint_32) {
+ case Opt_ms_mode_legacy:
+ opt->con_modes[0] = CEPH_CON_MODE_UNKNOWN;
+ opt->con_modes[1] = CEPH_CON_MODE_UNKNOWN;
+ break;
+ case Opt_ms_mode_crc:
+ opt->con_modes[0] = CEPH_CON_MODE_CRC;
+ opt->con_modes[1] = CEPH_CON_MODE_UNKNOWN;
+ break;
+ case Opt_ms_mode_secure:
+ opt->con_modes[0] = CEPH_CON_MODE_SECURE;
+ opt->con_modes[1] = CEPH_CON_MODE_UNKNOWN;
+ break;
+ case Opt_ms_mode_prefer_crc:
+ opt->con_modes[0] = CEPH_CON_MODE_CRC;
+ opt->con_modes[1] = CEPH_CON_MODE_SECURE;
+ break;
+ case Opt_ms_mode_prefer_secure:
+ opt->con_modes[0] = CEPH_CON_MODE_SECURE;
+ opt->con_modes[1] = CEPH_CON_MODE_CRC;
+ break;
+ default:
+ BUG();
+ }
+ break;
+
+ case Opt_osdkeepalivetimeout:
+ /* 0 isn't well defined right now, reject it */
+ if (result.uint_32 < 1 || result.uint_32 > INT_MAX / 1000)
+ goto out_of_range;
+ opt->osd_keepalive_timeout =
+ msecs_to_jiffies(result.uint_32 * 1000);
+ break;
+ case Opt_osd_idle_ttl:
+ /* 0 isn't well defined right now, reject it */
+ if (result.uint_32 < 1 || result.uint_32 > INT_MAX / 1000)
+ goto out_of_range;
+ opt->osd_idle_ttl = msecs_to_jiffies(result.uint_32 * 1000);
+ break;
+ case Opt_mount_timeout:
+ /* 0 is "wait forever" (i.e. infinite timeout) */
+ if (result.uint_32 > INT_MAX / 1000)
+ goto out_of_range;
+ opt->mount_timeout = msecs_to_jiffies(result.uint_32 * 1000);
+ break;
+ case Opt_osd_request_timeout:
+ /* 0 is "wait forever" (i.e. infinite timeout) */
+ if (result.uint_32 > INT_MAX / 1000)
+ goto out_of_range;
+ opt->osd_request_timeout =
+ msecs_to_jiffies(result.uint_32 * 1000);
+ break;
+
+ case Opt_share:
+ if (!result.negated)
+ opt->flags &= ~CEPH_OPT_NOSHARE;
+ else
+ opt->flags |= CEPH_OPT_NOSHARE;
+ break;
+ case Opt_crc:
+ if (!result.negated)
+ opt->flags &= ~CEPH_OPT_NOCRC;
+ else
+ opt->flags |= CEPH_OPT_NOCRC;
+ break;
+ case Opt_cephx_require_signatures:
+ if (!result.negated)
+ warn_plog(&log, "Ignoring cephx_require_signatures");
+ else
+ warn_plog(&log, "Ignoring nocephx_require_signatures, use nocephx_sign_messages");
+ break;
+ case Opt_cephx_sign_messages:
+ if (!result.negated)
+ opt->flags &= ~CEPH_OPT_NOMSGSIGN;
+ else
+ opt->flags |= CEPH_OPT_NOMSGSIGN;
+ break;
+ case Opt_tcp_nodelay:
+ if (!result.negated)
+ opt->flags |= CEPH_OPT_TCP_NODELAY;
+ else
+ opt->flags &= ~CEPH_OPT_TCP_NODELAY;
+ break;
+
+ case Opt_abort_on_full:
+ opt->flags |= CEPH_OPT_ABORT_ON_FULL;
+ break;
+ case Opt_rxbounce:
+ opt->flags |= CEPH_OPT_RXBOUNCE;
+ break;
+
+ default:
+ BUG();
+ }
+
+ return 0;
+
+out_of_range:
+ return inval_plog(&log, "%s out of range", param->key);
+}
+EXPORT_SYMBOL(ceph_parse_param);
+
+int ceph_print_client_options(struct seq_file *m, struct ceph_client *client,
+ bool show_all)
+{
+ struct ceph_options *opt = client->options;
+ size_t pos = m->count;
+ struct rb_node *n;
+
+ if (opt->name) {
+ seq_puts(m, "name=");
+ seq_escape(m, opt->name, ", \t\n\\");
+ seq_putc(m, ',');
+ }
+ if (opt->key)
+ seq_puts(m, "secret=<hidden>,");
+
+ if (!RB_EMPTY_ROOT(&opt->crush_locs)) {
+ seq_puts(m, "crush_location=");
+ for (n = rb_first(&opt->crush_locs); ; ) {
+ struct crush_loc_node *loc =
+ rb_entry(n, struct crush_loc_node, cl_node);
+
+ seq_printf(m, "%s:%s", loc->cl_loc.cl_type_name,
+ loc->cl_loc.cl_name);
+ n = rb_next(n);
+ if (!n)
+ break;
+
+ seq_putc(m, '|');
+ }
+ seq_putc(m, ',');
+ }
+ if (opt->read_from_replica == CEPH_OSD_FLAG_BALANCE_READS) {
+ seq_puts(m, "read_from_replica=balance,");
+ } else if (opt->read_from_replica == CEPH_OSD_FLAG_LOCALIZE_READS) {
+ seq_puts(m, "read_from_replica=localize,");
+ }
+ if (opt->con_modes[0] != CEPH_CON_MODE_UNKNOWN) {
+ if (opt->con_modes[0] == CEPH_CON_MODE_CRC &&
+ opt->con_modes[1] == CEPH_CON_MODE_UNKNOWN) {
+ seq_puts(m, "ms_mode=crc,");
+ } else if (opt->con_modes[0] == CEPH_CON_MODE_SECURE &&
+ opt->con_modes[1] == CEPH_CON_MODE_UNKNOWN) {
+ seq_puts(m, "ms_mode=secure,");
+ } else if (opt->con_modes[0] == CEPH_CON_MODE_CRC &&
+ opt->con_modes[1] == CEPH_CON_MODE_SECURE) {
+ seq_puts(m, "ms_mode=prefer-crc,");
+ } else if (opt->con_modes[0] == CEPH_CON_MODE_SECURE &&
+ opt->con_modes[1] == CEPH_CON_MODE_CRC) {
+ seq_puts(m, "ms_mode=prefer-secure,");
+ }
+ }
+
+ if (opt->flags & CEPH_OPT_FSID)
+ seq_printf(m, "fsid=%pU,", &opt->fsid);
+ if (opt->flags & CEPH_OPT_NOSHARE)
+ seq_puts(m, "noshare,");
+ if (opt->flags & CEPH_OPT_NOCRC)
+ seq_puts(m, "nocrc,");
+ if (opt->flags & CEPH_OPT_NOMSGSIGN)
+ seq_puts(m, "nocephx_sign_messages,");
+ if ((opt->flags & CEPH_OPT_TCP_NODELAY) == 0)
+ seq_puts(m, "notcp_nodelay,");
+ if (show_all && (opt->flags & CEPH_OPT_ABORT_ON_FULL))
+ seq_puts(m, "abort_on_full,");
+ if (opt->flags & CEPH_OPT_RXBOUNCE)
+ seq_puts(m, "rxbounce,");
+
+ if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
+ seq_printf(m, "mount_timeout=%d,",
+ jiffies_to_msecs(opt->mount_timeout) / 1000);
+ if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
+ seq_printf(m, "osd_idle_ttl=%d,",
+ jiffies_to_msecs(opt->osd_idle_ttl) / 1000);
+ if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
+ seq_printf(m, "osdkeepalivetimeout=%d,",
+ jiffies_to_msecs(opt->osd_keepalive_timeout) / 1000);
+ if (opt->osd_request_timeout != CEPH_OSD_REQUEST_TIMEOUT_DEFAULT)
+ seq_printf(m, "osd_request_timeout=%d,",
+ jiffies_to_msecs(opt->osd_request_timeout) / 1000);
+
+ /* drop redundant comma */
+ if (m->count != pos)
+ m->count--;
+
+ return 0;
+}
+EXPORT_SYMBOL(ceph_print_client_options);
+
+struct ceph_entity_addr *ceph_client_addr(struct ceph_client *client)
+{
+ return &client->msgr.inst.addr;
+}
+EXPORT_SYMBOL(ceph_client_addr);
+
+u64 ceph_client_gid(struct ceph_client *client)
+{
+ return client->monc.auth->global_id;
+}
+EXPORT_SYMBOL(ceph_client_gid);
+
+/*
+ * create a fresh client instance
+ */
+struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private)
+{
+ struct ceph_client *client;
+ struct ceph_entity_addr *myaddr = NULL;
+ int err;
+
+ err = wait_for_random_bytes();
+ if (err < 0)
+ return ERR_PTR(err);
+
+ client = kzalloc(sizeof(*client), GFP_KERNEL);
+ if (client == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ client->private = private;
+ client->options = opt;
+
+ mutex_init(&client->mount_mutex);
+ init_waitqueue_head(&client->auth_wq);
+ client->auth_err = 0;
+
+ client->extra_mon_dispatch = NULL;
+ client->supported_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
+ client->required_features = CEPH_FEATURES_REQUIRED_DEFAULT;
+
+ if (!ceph_test_opt(client, NOMSGSIGN))
+ client->required_features |= CEPH_FEATURE_MSG_AUTH;
+
+ /* msgr */
+ if (ceph_test_opt(client, MYIP))
+ myaddr = &client->options->my_addr;
+
+ ceph_messenger_init(&client->msgr, myaddr);
+
+ /* subsystems */
+ err = ceph_monc_init(&client->monc, client);
+ if (err < 0)
+ goto fail;
+ err = ceph_osdc_init(&client->osdc, client);
+ if (err < 0)
+ goto fail_monc;
+
+ return client;
+
+fail_monc:
+ ceph_monc_stop(&client->monc);
+fail:
+ ceph_messenger_fini(&client->msgr);
+ kfree(client);
+ return ERR_PTR(err);
+}
+EXPORT_SYMBOL(ceph_create_client);
+
+void ceph_destroy_client(struct ceph_client *client)
+{
+ dout("destroy_client %p\n", client);
+
+ atomic_set(&client->msgr.stopping, 1);
+
+ /* unmount */
+ ceph_osdc_stop(&client->osdc);
+ ceph_monc_stop(&client->monc);
+ ceph_messenger_fini(&client->msgr);
+
+ ceph_debugfs_client_cleanup(client);
+
+ ceph_destroy_options(client->options);
+
+ kfree(client);
+ dout("destroy_client %p done\n", client);
+}
+EXPORT_SYMBOL(ceph_destroy_client);
+
+void ceph_reset_client_addr(struct ceph_client *client)
+{
+ ceph_messenger_reset_nonce(&client->msgr);
+ ceph_monc_reopen_session(&client->monc);
+ ceph_osdc_reopen_osds(&client->osdc);
+}
+EXPORT_SYMBOL(ceph_reset_client_addr);
+
+/*
+ * mount: join the ceph cluster, and open root directory.
+ */
+int __ceph_open_session(struct ceph_client *client)
+{
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+ long timeout = ceph_timeout_jiffies(client->options->mount_timeout);
+ bool have_monmap, have_osdmap;
+ int err;
+
+ /* open session, and wait for mon and osd maps */
+ err = ceph_monc_open_session(&client->monc);
+ if (err < 0)
+ return err;
+
+ add_wait_queue(&client->auth_wq, &wait);
+ for (;;) {
+ mutex_lock(&client->monc.mutex);
+ err = client->auth_err;
+ have_monmap = client->monc.monmap && client->monc.monmap->epoch;
+ mutex_unlock(&client->monc.mutex);
+
+ down_read(&client->osdc.lock);
+ have_osdmap = client->osdc.osdmap && client->osdc.osdmap->epoch;
+ up_read(&client->osdc.lock);
+
+ if (err || (have_monmap && have_osdmap))
+ break;
+
+ if (signal_pending(current)) {
+ err = -ERESTARTSYS;
+ break;
+ }
+
+ if (!timeout) {
+ err = -ETIMEDOUT;
+ break;
+ }
+
+ /* wait */
+ dout("mount waiting for mon_map\n");
+ timeout = wait_woken(&wait, TASK_INTERRUPTIBLE, timeout);
+ }
+ remove_wait_queue(&client->auth_wq, &wait);
+
+ if (err)
+ return err;
+
+ pr_info("client%llu fsid %pU\n", ceph_client_gid(client),
+ &client->fsid);
+ ceph_debugfs_client_init(client);
+
+ return 0;
+}
+EXPORT_SYMBOL(__ceph_open_session);
+
+int ceph_open_session(struct ceph_client *client)
+{
+ int ret;
+
+ dout("open_session start\n");
+ mutex_lock(&client->mount_mutex);
+
+ ret = __ceph_open_session(client);
+
+ mutex_unlock(&client->mount_mutex);
+ return ret;
+}
+EXPORT_SYMBOL(ceph_open_session);
+
+int ceph_wait_for_latest_osdmap(struct ceph_client *client,
+ unsigned long timeout)
+{
+ u64 newest_epoch;
+ int ret;
+
+ ret = ceph_monc_get_version(&client->monc, "osdmap", &newest_epoch);
+ if (ret)
+ return ret;
+
+ if (client->osdc.osdmap->epoch >= newest_epoch)
+ return 0;
+
+ ceph_osdc_maybe_request_map(&client->osdc);
+ return ceph_monc_wait_osdmap(&client->monc, newest_epoch, timeout);
+}
+EXPORT_SYMBOL(ceph_wait_for_latest_osdmap);
+
+static int __init init_ceph_lib(void)
+{
+ int ret = 0;
+
+ ceph_debugfs_init();
+
+ ret = ceph_crypto_init();
+ if (ret < 0)
+ goto out_debugfs;
+
+ ret = ceph_msgr_init();
+ if (ret < 0)
+ goto out_crypto;
+
+ ret = ceph_osdc_setup();
+ if (ret < 0)
+ goto out_msgr;
+
+ pr_info("loaded (mon/osd proto %d/%d)\n",
+ CEPH_MONC_PROTOCOL, CEPH_OSDC_PROTOCOL);
+
+ return 0;
+
+out_msgr:
+ ceph_msgr_exit();
+out_crypto:
+ ceph_crypto_shutdown();
+out_debugfs:
+ ceph_debugfs_cleanup();
+ return ret;
+}
+
+static void __exit exit_ceph_lib(void)
+{
+ dout("exit_ceph_lib\n");
+ WARN_ON(!ceph_strings_empty());
+
+ ceph_osdc_cleanup();
+ ceph_msgr_exit();
+ ceph_crypto_shutdown();
+ ceph_debugfs_cleanup();
+}
+
+module_init(init_ceph_lib);
+module_exit(exit_ceph_lib);
+
+MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
+MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
+MODULE_AUTHOR("Patience Warnick <patience@newdream.net>");
+MODULE_DESCRIPTION("Ceph core library");
+MODULE_LICENSE("GPL");
diff --git a/net/ceph/ceph_hash.c b/net/ceph/ceph_hash.c
new file mode 100644
index 000000000000..16a47c0eef37
--- /dev/null
+++ b/net/ceph/ceph_hash.c
@@ -0,0 +1,131 @@
+
+#include <linux/ceph/types.h>
+#include <linux/module.h>
+
+/*
+ * Robert Jenkin's hash function.
+ * https://burtleburtle.net/bob/hash/evahash.html
+ * This is in the public domain.
+ */
+#define mix(a, b, c) \
+ do { \
+ a = a - b; a = a - c; a = a ^ (c >> 13); \
+ b = b - c; b = b - a; b = b ^ (a << 8); \
+ c = c - a; c = c - b; c = c ^ (b >> 13); \
+ a = a - b; a = a - c; a = a ^ (c >> 12); \
+ b = b - c; b = b - a; b = b ^ (a << 16); \
+ c = c - a; c = c - b; c = c ^ (b >> 5); \
+ a = a - b; a = a - c; a = a ^ (c >> 3); \
+ b = b - c; b = b - a; b = b ^ (a << 10); \
+ c = c - a; c = c - b; c = c ^ (b >> 15); \
+ } while (0)
+
+unsigned int ceph_str_hash_rjenkins(const char *str, unsigned int length)
+{
+ const unsigned char *k = (const unsigned char *)str;
+ __u32 a, b, c; /* the internal state */
+ __u32 len; /* how many key bytes still need mixing */
+
+ /* Set up the internal state */
+ len = length;
+ a = 0x9e3779b9; /* the golden ratio; an arbitrary value */
+ b = a;
+ c = 0; /* variable initialization of internal state */
+
+ /* handle most of the key */
+ while (len >= 12) {
+ a = a + (k[0] + ((__u32)k[1] << 8) + ((__u32)k[2] << 16) +
+ ((__u32)k[3] << 24));
+ b = b + (k[4] + ((__u32)k[5] << 8) + ((__u32)k[6] << 16) +
+ ((__u32)k[7] << 24));
+ c = c + (k[8] + ((__u32)k[9] << 8) + ((__u32)k[10] << 16) +
+ ((__u32)k[11] << 24));
+ mix(a, b, c);
+ k = k + 12;
+ len = len - 12;
+ }
+
+ /* handle the last 11 bytes */
+ c = c + length;
+ switch (len) {
+ case 11:
+ c = c + ((__u32)k[10] << 24);
+ fallthrough;
+ case 10:
+ c = c + ((__u32)k[9] << 16);
+ fallthrough;
+ case 9:
+ c = c + ((__u32)k[8] << 8);
+ /* the first byte of c is reserved for the length */
+ fallthrough;
+ case 8:
+ b = b + ((__u32)k[7] << 24);
+ fallthrough;
+ case 7:
+ b = b + ((__u32)k[6] << 16);
+ fallthrough;
+ case 6:
+ b = b + ((__u32)k[5] << 8);
+ fallthrough;
+ case 5:
+ b = b + k[4];
+ fallthrough;
+ case 4:
+ a = a + ((__u32)k[3] << 24);
+ fallthrough;
+ case 3:
+ a = a + ((__u32)k[2] << 16);
+ fallthrough;
+ case 2:
+ a = a + ((__u32)k[1] << 8);
+ fallthrough;
+ case 1:
+ a = a + k[0];
+ /* case 0: nothing left to add */
+ }
+ mix(a, b, c);
+
+ return c;
+}
+
+/*
+ * linux dcache hash
+ */
+unsigned int ceph_str_hash_linux(const char *str, unsigned int length)
+{
+ unsigned long hash = 0;
+ unsigned char c;
+
+ while (length--) {
+ c = *str++;
+ hash = (hash + (c << 4) + (c >> 4)) * 11;
+ }
+ return hash;
+}
+
+
+unsigned int ceph_str_hash(int type, const char *s, unsigned int len)
+{
+ switch (type) {
+ case CEPH_STR_HASH_LINUX:
+ return ceph_str_hash_linux(s, len);
+ case CEPH_STR_HASH_RJENKINS:
+ return ceph_str_hash_rjenkins(s, len);
+ default:
+ return -1;
+ }
+}
+EXPORT_SYMBOL(ceph_str_hash);
+
+const char *ceph_str_hash_name(int type)
+{
+ switch (type) {
+ case CEPH_STR_HASH_LINUX:
+ return "linux";
+ case CEPH_STR_HASH_RJENKINS:
+ return "rjenkins";
+ default:
+ return "unknown";
+ }
+}
+EXPORT_SYMBOL(ceph_str_hash_name);
diff --git a/net/ceph/ceph_strings.c b/net/ceph/ceph_strings.c
new file mode 100644
index 000000000000..355fea272120
--- /dev/null
+++ b/net/ceph/ceph_strings.c
@@ -0,0 +1,90 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Ceph string constants
+ */
+#include <linux/module.h>
+#include <linux/ceph/types.h>
+
+const char *ceph_entity_type_name(int type)
+{
+ switch (type) {
+ case CEPH_ENTITY_TYPE_MDS: return "mds";
+ case CEPH_ENTITY_TYPE_OSD: return "osd";
+ case CEPH_ENTITY_TYPE_MON: return "mon";
+ case CEPH_ENTITY_TYPE_CLIENT: return "client";
+ case CEPH_ENTITY_TYPE_AUTH: return "auth";
+ default: return "unknown";
+ }
+}
+EXPORT_SYMBOL(ceph_entity_type_name);
+
+const char *ceph_auth_proto_name(int proto)
+{
+ switch (proto) {
+ case CEPH_AUTH_UNKNOWN:
+ return "unknown";
+ case CEPH_AUTH_NONE:
+ return "none";
+ case CEPH_AUTH_CEPHX:
+ return "cephx";
+ default:
+ return "???";
+ }
+}
+
+const char *ceph_con_mode_name(int mode)
+{
+ switch (mode) {
+ case CEPH_CON_MODE_UNKNOWN:
+ return "unknown";
+ case CEPH_CON_MODE_CRC:
+ return "crc";
+ case CEPH_CON_MODE_SECURE:
+ return "secure";
+ default:
+ return "???";
+ }
+}
+
+const char *ceph_osd_op_name(int op)
+{
+ switch (op) {
+#define GENERATE_CASE(op, opcode, str) case CEPH_OSD_OP_##op: return (str);
+__CEPH_FORALL_OSD_OPS(GENERATE_CASE)
+#undef GENERATE_CASE
+ default:
+ return "???";
+ }
+}
+
+const char *ceph_osd_watch_op_name(int o)
+{
+ switch (o) {
+ case CEPH_OSD_WATCH_OP_UNWATCH:
+ return "unwatch";
+ case CEPH_OSD_WATCH_OP_WATCH:
+ return "watch";
+ case CEPH_OSD_WATCH_OP_RECONNECT:
+ return "reconnect";
+ case CEPH_OSD_WATCH_OP_PING:
+ return "ping";
+ default:
+ return "???";
+ }
+}
+
+const char *ceph_osd_state_name(int s)
+{
+ switch (s) {
+ case CEPH_OSD_EXISTS:
+ return "exists";
+ case CEPH_OSD_UP:
+ return "up";
+ case CEPH_OSD_AUTOOUT:
+ return "autoout";
+ case CEPH_OSD_NEW:
+ return "new";
+ default:
+ return "???";
+ }
+}
diff --git a/net/ceph/cls_lock_client.c b/net/ceph/cls_lock_client.c
new file mode 100644
index 000000000000..66136a4c1ce7
--- /dev/null
+++ b/net/ceph/cls_lock_client.c
@@ -0,0 +1,431 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/types.h>
+#include <linux/slab.h>
+
+#include <linux/ceph/cls_lock_client.h>
+#include <linux/ceph/decode.h>
+#include <linux/ceph/libceph.h>
+
+/**
+ * ceph_cls_lock - grab rados lock for object
+ * @osdc: OSD client instance
+ * @oid: object to lock
+ * @oloc: object to lock
+ * @lock_name: the name of the lock
+ * @type: lock type (CEPH_CLS_LOCK_EXCLUSIVE or CEPH_CLS_LOCK_SHARED)
+ * @cookie: user-defined identifier for this instance of the lock
+ * @tag: user-defined tag
+ * @desc: user-defined lock description
+ * @flags: lock flags
+ *
+ * All operations on the same lock should use the same tag.
+ */
+int ceph_cls_lock(struct ceph_osd_client *osdc,
+ struct ceph_object_id *oid,
+ struct ceph_object_locator *oloc,
+ char *lock_name, u8 type, char *cookie,
+ char *tag, char *desc, u8 flags)
+{
+ int lock_op_buf_size;
+ int name_len = strlen(lock_name);
+ int cookie_len = strlen(cookie);
+ int tag_len = strlen(tag);
+ int desc_len = strlen(desc);
+ void *p, *end;
+ struct page *lock_op_page;
+ struct timespec64 mtime;
+ int ret;
+
+ lock_op_buf_size = name_len + sizeof(__le32) +
+ cookie_len + sizeof(__le32) +
+ tag_len + sizeof(__le32) +
+ desc_len + sizeof(__le32) +
+ sizeof(struct ceph_timespec) +
+ /* flag and type */
+ sizeof(u8) + sizeof(u8) +
+ CEPH_ENCODING_START_BLK_LEN;
+ if (lock_op_buf_size > PAGE_SIZE)
+ return -E2BIG;
+
+ lock_op_page = alloc_page(GFP_NOIO);
+ if (!lock_op_page)
+ return -ENOMEM;
+
+ p = page_address(lock_op_page);
+ end = p + lock_op_buf_size;
+
+ /* encode cls_lock_lock_op struct */
+ ceph_start_encoding(&p, 1, 1,
+ lock_op_buf_size - CEPH_ENCODING_START_BLK_LEN);
+ ceph_encode_string(&p, end, lock_name, name_len);
+ ceph_encode_8(&p, type);
+ ceph_encode_string(&p, end, cookie, cookie_len);
+ ceph_encode_string(&p, end, tag, tag_len);
+ ceph_encode_string(&p, end, desc, desc_len);
+ /* only support infinite duration */
+ memset(&mtime, 0, sizeof(mtime));
+ ceph_encode_timespec64(p, &mtime);
+ p += sizeof(struct ceph_timespec);
+ ceph_encode_8(&p, flags);
+
+ dout("%s lock_name %s type %d cookie %s tag %s desc %s flags 0x%x\n",
+ __func__, lock_name, type, cookie, tag, desc, flags);
+ ret = ceph_osdc_call(osdc, oid, oloc, "lock", "lock",
+ CEPH_OSD_FLAG_WRITE, lock_op_page,
+ lock_op_buf_size, NULL, NULL);
+
+ dout("%s: status %d\n", __func__, ret);
+ __free_page(lock_op_page);
+ return ret;
+}
+EXPORT_SYMBOL(ceph_cls_lock);
+
+/**
+ * ceph_cls_unlock - release rados lock for object
+ * @osdc: OSD client instance
+ * @oid: object to lock
+ * @oloc: object to lock
+ * @lock_name: the name of the lock
+ * @cookie: user-defined identifier for this instance of the lock
+ */
+int ceph_cls_unlock(struct ceph_osd_client *osdc,
+ struct ceph_object_id *oid,
+ struct ceph_object_locator *oloc,
+ char *lock_name, char *cookie)
+{
+ int unlock_op_buf_size;
+ int name_len = strlen(lock_name);
+ int cookie_len = strlen(cookie);
+ void *p, *end;
+ struct page *unlock_op_page;
+ int ret;
+
+ unlock_op_buf_size = name_len + sizeof(__le32) +
+ cookie_len + sizeof(__le32) +
+ CEPH_ENCODING_START_BLK_LEN;
+ if (unlock_op_buf_size > PAGE_SIZE)
+ return -E2BIG;
+
+ unlock_op_page = alloc_page(GFP_NOIO);
+ if (!unlock_op_page)
+ return -ENOMEM;
+
+ p = page_address(unlock_op_page);
+ end = p + unlock_op_buf_size;
+
+ /* encode cls_lock_unlock_op struct */
+ ceph_start_encoding(&p, 1, 1,
+ unlock_op_buf_size - CEPH_ENCODING_START_BLK_LEN);
+ ceph_encode_string(&p, end, lock_name, name_len);
+ ceph_encode_string(&p, end, cookie, cookie_len);
+
+ dout("%s lock_name %s cookie %s\n", __func__, lock_name, cookie);
+ ret = ceph_osdc_call(osdc, oid, oloc, "lock", "unlock",
+ CEPH_OSD_FLAG_WRITE, unlock_op_page,
+ unlock_op_buf_size, NULL, NULL);
+
+ dout("%s: status %d\n", __func__, ret);
+ __free_page(unlock_op_page);
+ return ret;
+}
+EXPORT_SYMBOL(ceph_cls_unlock);
+
+/**
+ * ceph_cls_break_lock - release rados lock for object for specified client
+ * @osdc: OSD client instance
+ * @oid: object to lock
+ * @oloc: object to lock
+ * @lock_name: the name of the lock
+ * @cookie: user-defined identifier for this instance of the lock
+ * @locker: current lock owner
+ */
+int ceph_cls_break_lock(struct ceph_osd_client *osdc,
+ struct ceph_object_id *oid,
+ struct ceph_object_locator *oloc,
+ char *lock_name, char *cookie,
+ struct ceph_entity_name *locker)
+{
+ int break_op_buf_size;
+ int name_len = strlen(lock_name);
+ int cookie_len = strlen(cookie);
+ struct page *break_op_page;
+ void *p, *end;
+ int ret;
+
+ break_op_buf_size = name_len + sizeof(__le32) +
+ cookie_len + sizeof(__le32) +
+ sizeof(u8) + sizeof(__le64) +
+ CEPH_ENCODING_START_BLK_LEN;
+ if (break_op_buf_size > PAGE_SIZE)
+ return -E2BIG;
+
+ break_op_page = alloc_page(GFP_NOIO);
+ if (!break_op_page)
+ return -ENOMEM;
+
+ p = page_address(break_op_page);
+ end = p + break_op_buf_size;
+
+ /* encode cls_lock_break_op struct */
+ ceph_start_encoding(&p, 1, 1,
+ break_op_buf_size - CEPH_ENCODING_START_BLK_LEN);
+ ceph_encode_string(&p, end, lock_name, name_len);
+ ceph_encode_copy(&p, locker, sizeof(*locker));
+ ceph_encode_string(&p, end, cookie, cookie_len);
+
+ dout("%s lock_name %s cookie %s locker %s%llu\n", __func__, lock_name,
+ cookie, ENTITY_NAME(*locker));
+ ret = ceph_osdc_call(osdc, oid, oloc, "lock", "break_lock",
+ CEPH_OSD_FLAG_WRITE, break_op_page,
+ break_op_buf_size, NULL, NULL);
+
+ dout("%s: status %d\n", __func__, ret);
+ __free_page(break_op_page);
+ return ret;
+}
+EXPORT_SYMBOL(ceph_cls_break_lock);
+
+int ceph_cls_set_cookie(struct ceph_osd_client *osdc,
+ struct ceph_object_id *oid,
+ struct ceph_object_locator *oloc,
+ char *lock_name, u8 type, char *old_cookie,
+ char *tag, char *new_cookie)
+{
+ int cookie_op_buf_size;
+ int name_len = strlen(lock_name);
+ int old_cookie_len = strlen(old_cookie);
+ int tag_len = strlen(tag);
+ int new_cookie_len = strlen(new_cookie);
+ void *p, *end;
+ struct page *cookie_op_page;
+ int ret;
+
+ cookie_op_buf_size = name_len + sizeof(__le32) +
+ old_cookie_len + sizeof(__le32) +
+ tag_len + sizeof(__le32) +
+ new_cookie_len + sizeof(__le32) +
+ sizeof(u8) + CEPH_ENCODING_START_BLK_LEN;
+ if (cookie_op_buf_size > PAGE_SIZE)
+ return -E2BIG;
+
+ cookie_op_page = alloc_page(GFP_NOIO);
+ if (!cookie_op_page)
+ return -ENOMEM;
+
+ p = page_address(cookie_op_page);
+ end = p + cookie_op_buf_size;
+
+ /* encode cls_lock_set_cookie_op struct */
+ ceph_start_encoding(&p, 1, 1,
+ cookie_op_buf_size - CEPH_ENCODING_START_BLK_LEN);
+ ceph_encode_string(&p, end, lock_name, name_len);
+ ceph_encode_8(&p, type);
+ ceph_encode_string(&p, end, old_cookie, old_cookie_len);
+ ceph_encode_string(&p, end, tag, tag_len);
+ ceph_encode_string(&p, end, new_cookie, new_cookie_len);
+
+ dout("%s lock_name %s type %d old_cookie %s tag %s new_cookie %s\n",
+ __func__, lock_name, type, old_cookie, tag, new_cookie);
+ ret = ceph_osdc_call(osdc, oid, oloc, "lock", "set_cookie",
+ CEPH_OSD_FLAG_WRITE, cookie_op_page,
+ cookie_op_buf_size, NULL, NULL);
+
+ dout("%s: status %d\n", __func__, ret);
+ __free_page(cookie_op_page);
+ return ret;
+}
+EXPORT_SYMBOL(ceph_cls_set_cookie);
+
+void ceph_free_lockers(struct ceph_locker *lockers, u32 num_lockers)
+{
+ int i;
+
+ for (i = 0; i < num_lockers; i++)
+ kfree(lockers[i].id.cookie);
+ kfree(lockers);
+}
+EXPORT_SYMBOL(ceph_free_lockers);
+
+static int decode_locker(void **p, void *end, struct ceph_locker *locker)
+{
+ u8 struct_v;
+ u32 len;
+ char *s;
+ int ret;
+
+ ret = ceph_start_decoding(p, end, 1, "locker_id_t", &struct_v, &len);
+ if (ret)
+ return ret;
+
+ ceph_decode_copy(p, &locker->id.name, sizeof(locker->id.name));
+ s = ceph_extract_encoded_string(p, end, NULL, GFP_NOIO);
+ if (IS_ERR(s))
+ return PTR_ERR(s);
+
+ locker->id.cookie = s;
+
+ ret = ceph_start_decoding(p, end, 1, "locker_info_t", &struct_v, &len);
+ if (ret)
+ return ret;
+
+ *p += sizeof(struct ceph_timespec); /* skip expiration */
+
+ ret = ceph_decode_entity_addr(p, end, &locker->info.addr);
+ if (ret)
+ return ret;
+
+ len = ceph_decode_32(p);
+ *p += len; /* skip description */
+
+ dout("%s %s%llu cookie %s addr %s\n", __func__,
+ ENTITY_NAME(locker->id.name), locker->id.cookie,
+ ceph_pr_addr(&locker->info.addr));
+ return 0;
+}
+
+static int decode_lockers(void **p, void *end, u8 *type, char **tag,
+ struct ceph_locker **lockers, u32 *num_lockers)
+{
+ u8 struct_v;
+ u32 struct_len;
+ char *s;
+ int i;
+ int ret;
+
+ ret = ceph_start_decoding(p, end, 1, "cls_lock_get_info_reply",
+ &struct_v, &struct_len);
+ if (ret)
+ return ret;
+
+ *num_lockers = ceph_decode_32(p);
+ *lockers = kcalloc(*num_lockers, sizeof(**lockers), GFP_NOIO);
+ if (!*lockers)
+ return -ENOMEM;
+
+ for (i = 0; i < *num_lockers; i++) {
+ ret = decode_locker(p, end, *lockers + i);
+ if (ret)
+ goto err_free_lockers;
+ }
+
+ *type = ceph_decode_8(p);
+ s = ceph_extract_encoded_string(p, end, NULL, GFP_NOIO);
+ if (IS_ERR(s)) {
+ ret = PTR_ERR(s);
+ goto err_free_lockers;
+ }
+
+ *tag = s;
+ return 0;
+
+err_free_lockers:
+ ceph_free_lockers(*lockers, *num_lockers);
+ return ret;
+}
+
+/*
+ * On success, the caller is responsible for:
+ *
+ * kfree(tag);
+ * ceph_free_lockers(lockers, num_lockers);
+ */
+int ceph_cls_lock_info(struct ceph_osd_client *osdc,
+ struct ceph_object_id *oid,
+ struct ceph_object_locator *oloc,
+ char *lock_name, u8 *type, char **tag,
+ struct ceph_locker **lockers, u32 *num_lockers)
+{
+ int get_info_op_buf_size;
+ int name_len = strlen(lock_name);
+ struct page *get_info_op_page, *reply_page;
+ size_t reply_len = PAGE_SIZE;
+ void *p, *end;
+ int ret;
+
+ get_info_op_buf_size = name_len + sizeof(__le32) +
+ CEPH_ENCODING_START_BLK_LEN;
+ if (get_info_op_buf_size > PAGE_SIZE)
+ return -E2BIG;
+
+ get_info_op_page = alloc_page(GFP_NOIO);
+ if (!get_info_op_page)
+ return -ENOMEM;
+
+ reply_page = alloc_page(GFP_NOIO);
+ if (!reply_page) {
+ __free_page(get_info_op_page);
+ return -ENOMEM;
+ }
+
+ p = page_address(get_info_op_page);
+ end = p + get_info_op_buf_size;
+
+ /* encode cls_lock_get_info_op struct */
+ ceph_start_encoding(&p, 1, 1,
+ get_info_op_buf_size - CEPH_ENCODING_START_BLK_LEN);
+ ceph_encode_string(&p, end, lock_name, name_len);
+
+ dout("%s lock_name %s\n", __func__, lock_name);
+ ret = ceph_osdc_call(osdc, oid, oloc, "lock", "get_info",
+ CEPH_OSD_FLAG_READ, get_info_op_page,
+ get_info_op_buf_size, &reply_page, &reply_len);
+
+ dout("%s: status %d\n", __func__, ret);
+ if (ret >= 0) {
+ p = page_address(reply_page);
+ end = p + reply_len;
+
+ ret = decode_lockers(&p, end, type, tag, lockers, num_lockers);
+ }
+
+ __free_page(get_info_op_page);
+ __free_page(reply_page);
+ return ret;
+}
+EXPORT_SYMBOL(ceph_cls_lock_info);
+
+int ceph_cls_assert_locked(struct ceph_osd_request *req, int which,
+ char *lock_name, u8 type, char *cookie, char *tag)
+{
+ int assert_op_buf_size;
+ int name_len = strlen(lock_name);
+ int cookie_len = strlen(cookie);
+ int tag_len = strlen(tag);
+ struct page **pages;
+ void *p, *end;
+ int ret;
+
+ assert_op_buf_size = name_len + sizeof(__le32) +
+ cookie_len + sizeof(__le32) +
+ tag_len + sizeof(__le32) +
+ sizeof(u8) + CEPH_ENCODING_START_BLK_LEN;
+ if (assert_op_buf_size > PAGE_SIZE)
+ return -E2BIG;
+
+ ret = osd_req_op_cls_init(req, which, "lock", "assert_locked");
+ if (ret)
+ return ret;
+
+ pages = ceph_alloc_page_vector(1, GFP_NOIO);
+ if (IS_ERR(pages))
+ return PTR_ERR(pages);
+
+ p = page_address(pages[0]);
+ end = p + assert_op_buf_size;
+
+ /* encode cls_lock_assert_op struct */
+ ceph_start_encoding(&p, 1, 1,
+ assert_op_buf_size - CEPH_ENCODING_START_BLK_LEN);
+ ceph_encode_string(&p, end, lock_name, name_len);
+ ceph_encode_8(&p, type);
+ ceph_encode_string(&p, end, cookie, cookie_len);
+ ceph_encode_string(&p, end, tag, tag_len);
+ WARN_ON(p != end);
+
+ osd_req_op_cls_request_data_pages(req, which, pages, assert_op_buf_size,
+ 0, false, true);
+ return 0;
+}
+EXPORT_SYMBOL(ceph_cls_assert_locked);
diff --git a/net/ceph/crush/crush.c b/net/ceph/crush/crush.c
new file mode 100644
index 000000000000..254ded0b05f6
--- /dev/null
+++ b/net/ceph/crush/crush.c
@@ -0,0 +1,142 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifdef __KERNEL__
+# include <linux/slab.h>
+# include <linux/crush/crush.h>
+#else
+# include "crush_compat.h"
+# include "crush.h"
+#endif
+
+const char *crush_bucket_alg_name(int alg)
+{
+ switch (alg) {
+ case CRUSH_BUCKET_UNIFORM: return "uniform";
+ case CRUSH_BUCKET_LIST: return "list";
+ case CRUSH_BUCKET_TREE: return "tree";
+ case CRUSH_BUCKET_STRAW: return "straw";
+ case CRUSH_BUCKET_STRAW2: return "straw2";
+ default: return "unknown";
+ }
+}
+
+/**
+ * crush_get_bucket_item_weight - Get weight of an item in given bucket
+ * @b: bucket pointer
+ * @p: item index in bucket
+ */
+int crush_get_bucket_item_weight(const struct crush_bucket *b, int p)
+{
+ if ((__u32)p >= b->size)
+ return 0;
+
+ switch (b->alg) {
+ case CRUSH_BUCKET_UNIFORM:
+ return ((struct crush_bucket_uniform *)b)->item_weight;
+ case CRUSH_BUCKET_LIST:
+ return ((struct crush_bucket_list *)b)->item_weights[p];
+ case CRUSH_BUCKET_TREE:
+ return ((struct crush_bucket_tree *)b)->node_weights[crush_calc_tree_node(p)];
+ case CRUSH_BUCKET_STRAW:
+ return ((struct crush_bucket_straw *)b)->item_weights[p];
+ case CRUSH_BUCKET_STRAW2:
+ return ((struct crush_bucket_straw2 *)b)->item_weights[p];
+ }
+ return 0;
+}
+
+void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b)
+{
+ kfree(b->h.items);
+ kfree(b);
+}
+
+void crush_destroy_bucket_list(struct crush_bucket_list *b)
+{
+ kfree(b->item_weights);
+ kfree(b->sum_weights);
+ kfree(b->h.items);
+ kfree(b);
+}
+
+void crush_destroy_bucket_tree(struct crush_bucket_tree *b)
+{
+ kfree(b->h.items);
+ kfree(b->node_weights);
+ kfree(b);
+}
+
+void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
+{
+ kfree(b->straws);
+ kfree(b->item_weights);
+ kfree(b->h.items);
+ kfree(b);
+}
+
+void crush_destroy_bucket_straw2(struct crush_bucket_straw2 *b)
+{
+ kfree(b->item_weights);
+ kfree(b->h.items);
+ kfree(b);
+}
+
+void crush_destroy_bucket(struct crush_bucket *b)
+{
+ switch (b->alg) {
+ case CRUSH_BUCKET_UNIFORM:
+ crush_destroy_bucket_uniform((struct crush_bucket_uniform *)b);
+ break;
+ case CRUSH_BUCKET_LIST:
+ crush_destroy_bucket_list((struct crush_bucket_list *)b);
+ break;
+ case CRUSH_BUCKET_TREE:
+ crush_destroy_bucket_tree((struct crush_bucket_tree *)b);
+ break;
+ case CRUSH_BUCKET_STRAW:
+ crush_destroy_bucket_straw((struct crush_bucket_straw *)b);
+ break;
+ case CRUSH_BUCKET_STRAW2:
+ crush_destroy_bucket_straw2((struct crush_bucket_straw2 *)b);
+ break;
+ }
+}
+
+/**
+ * crush_destroy - Destroy a crush_map
+ * @map: crush_map pointer
+ */
+void crush_destroy(struct crush_map *map)
+{
+ /* buckets */
+ if (map->buckets) {
+ __s32 b;
+ for (b = 0; b < map->max_buckets; b++) {
+ if (map->buckets[b] == NULL)
+ continue;
+ crush_destroy_bucket(map->buckets[b]);
+ }
+ kfree(map->buckets);
+ }
+
+ /* rules */
+ if (map->rules) {
+ __u32 b;
+ for (b = 0; b < map->max_rules; b++)
+ crush_destroy_rule(map->rules[b]);
+ kfree(map->rules);
+ }
+
+#ifndef __KERNEL__
+ kfree(map->choose_tries);
+#else
+ clear_crush_names(&map->type_names);
+ clear_crush_names(&map->names);
+ clear_choose_args(map);
+#endif
+ kfree(map);
+}
+
+void crush_destroy_rule(struct crush_rule *rule)
+{
+ kfree(rule);
+}
diff --git a/net/ceph/crush/crush_ln_table.h b/net/ceph/crush/crush_ln_table.h
new file mode 100644
index 000000000000..aae534c901a4
--- /dev/null
+++ b/net/ceph/crush/crush_ln_table.h
@@ -0,0 +1,164 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Intel Corporation All Rights Reserved
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_CRUSH_LN_H
+#define CEPH_CRUSH_LN_H
+
+#ifdef __KERNEL__
+# include <linux/types.h>
+#else
+# include "crush_compat.h"
+#endif
+
+/*
+ * RH_LH_tbl[2*k] = 2^48/(1.0+k/128.0)
+ * RH_LH_tbl[2*k+1] = 2^48*log2(1.0+k/128.0)
+ */
+static __s64 __RH_LH_tbl[128*2+2] = {
+ 0x0001000000000000ll, 0x0000000000000000ll, 0x0000fe03f80fe040ll, 0x000002dfca16dde1ll,
+ 0x0000fc0fc0fc0fc1ll, 0x000005b9e5a170b4ll, 0x0000fa232cf25214ll, 0x0000088e68ea899all,
+ 0x0000f83e0f83e0f9ll, 0x00000b5d69bac77ell, 0x0000f6603d980f67ll, 0x00000e26fd5c8555ll,
+ 0x0000f4898d5f85bcll, 0x000010eb389fa29fll, 0x0000f2b9d6480f2cll, 0x000013aa2fdd27f1ll,
+ 0x0000f0f0f0f0f0f1ll, 0x00001663f6fac913ll, 0x0000ef2eb71fc435ll, 0x00001918a16e4633ll,
+ 0x0000ed7303b5cc0fll, 0x00001bc84240adabll, 0x0000ebbdb2a5c162ll, 0x00001e72ec117fa5ll,
+ 0x0000ea0ea0ea0ea1ll, 0x00002118b119b4f3ll, 0x0000e865ac7b7604ll, 0x000023b9a32eaa56ll,
+ 0x0000e6c2b4481cd9ll, 0x00002655d3c4f15cll, 0x0000e525982af70dll, 0x000028ed53f307eell,
+ 0x0000e38e38e38e39ll, 0x00002b803473f7adll, 0x0000e1fc780e1fc8ll, 0x00002e0e85a9de04ll,
+ 0x0000e070381c0e08ll, 0x0000309857a05e07ll, 0x0000dee95c4ca038ll, 0x0000331dba0efce1ll,
+ 0x0000dd67c8a60dd7ll, 0x0000359ebc5b69d9ll, 0x0000dbeb61eed19dll, 0x0000381b6d9bb29bll,
+ 0x0000da740da740dbll, 0x00003a93dc9864b2ll, 0x0000d901b2036407ll, 0x00003d0817ce9cd4ll,
+ 0x0000d79435e50d7all, 0x00003f782d7204d0ll, 0x0000d62b80d62b81ll, 0x000041e42b6ec0c0ll,
+ 0x0000d4c77b03531ell, 0x0000444c1f6b4c2dll, 0x0000d3680d3680d4ll, 0x000046b016ca47c1ll,
+ 0x0000d20d20d20d21ll, 0x000049101eac381cll, 0x0000d0b69fcbd259ll, 0x00004b6c43f1366all,
+ 0x0000cf6474a8819fll, 0x00004dc4933a9337ll, 0x0000ce168a772509ll, 0x0000501918ec6c11ll,
+ 0x0000cccccccccccdll, 0x00005269e12f346ell, 0x0000cb8727c065c4ll, 0x000054b6f7f1325all,
+ 0x0000ca4587e6b750ll, 0x0000570068e7ef5all, 0x0000c907da4e8712ll, 0x000059463f919deell,
+ 0x0000c7ce0c7ce0c8ll, 0x00005b8887367433ll, 0x0000c6980c6980c7ll, 0x00005dc74ae9fbecll,
+ 0x0000c565c87b5f9ell, 0x00006002958c5871ll, 0x0000c4372f855d83ll, 0x0000623a71cb82c8ll,
+ 0x0000c30c30c30c31ll, 0x0000646eea247c5cll, 0x0000c1e4bbd595f7ll, 0x000066a008e4788cll,
+ 0x0000c0c0c0c0c0c1ll, 0x000068cdd829fd81ll, 0x0000bfa02fe80bfbll, 0x00006af861e5fc7dll,
+ 0x0000be82fa0be830ll, 0x00006d1fafdce20all, 0x0000bd6910470767ll, 0x00006f43cba79e40ll,
+ 0x0000bc52640bc527ll, 0x00007164beb4a56dll, 0x0000bb3ee721a54ell, 0x000073829248e961ll,
+ 0x0000ba2e8ba2e8bbll, 0x0000759d4f80cba8ll, 0x0000b92143fa36f6ll, 0x000077b4ff5108d9ll,
+ 0x0000b81702e05c0cll, 0x000079c9aa879d53ll, 0x0000b70fbb5a19bfll, 0x00007bdb59cca388ll,
+ 0x0000b60b60b60b61ll, 0x00007dea15a32c1bll, 0x0000b509e68a9b95ll, 0x00007ff5e66a0ffell,
+ 0x0000b40b40b40b41ll, 0x000081fed45cbccbll, 0x0000b30f63528918ll, 0x00008404e793fb81ll,
+ 0x0000b21642c8590cll, 0x000086082806b1d5ll, 0x0000b11fd3b80b12ll, 0x000088089d8a9e47ll,
+ 0x0000b02c0b02c0b1ll, 0x00008a064fd50f2all, 0x0000af3addc680b0ll, 0x00008c01467b94bbll,
+ 0x0000ae4c415c9883ll, 0x00008df988f4ae80ll, 0x0000ad602b580ad7ll, 0x00008fef1e987409ll,
+ 0x0000ac7691840ac8ll, 0x000091e20ea1393ell, 0x0000ab8f69e2835all, 0x000093d2602c2e5fll,
+ 0x0000aaaaaaaaaaabll, 0x000095c01a39fbd6ll, 0x0000a9c84a47a080ll, 0x000097ab43af59f9ll,
+ 0x0000a8e83f5717c1ll, 0x00009993e355a4e5ll, 0x0000a80a80a80a81ll, 0x00009b79ffdb6c8bll,
+ 0x0000a72f0539782all, 0x00009d5d9fd5010bll, 0x0000a655c4392d7cll, 0x00009f3ec9bcfb80ll,
+ 0x0000a57eb50295fbll, 0x0000a11d83f4c355ll, 0x0000a4a9cf1d9684ll, 0x0000a2f9d4c51039ll,
+ 0x0000a3d70a3d70a4ll, 0x0000a4d3c25e68dcll, 0x0000a3065e3fae7dll, 0x0000a6ab52d99e76ll,
+ 0x0000a237c32b16d0ll, 0x0000a8808c384547ll, 0x0000a16b312ea8fdll, 0x0000aa5374652a1cll,
+ 0x0000a0a0a0a0a0a1ll, 0x0000ac241134c4e9ll, 0x00009fd809fd80a0ll, 0x0000adf26865a8a1ll,
+ 0x00009f1165e72549ll, 0x0000afbe7fa0f04dll, 0x00009e4cad23dd60ll, 0x0000b1885c7aa982ll,
+ 0x00009d89d89d89d9ll, 0x0000b35004723c46ll, 0x00009cc8e160c3fcll, 0x0000b5157cf2d078ll,
+ 0x00009c09c09c09c1ll, 0x0000b6d8cb53b0call, 0x00009b4c6f9ef03bll, 0x0000b899f4d8ab63ll,
+ 0x00009a90e7d95bc7ll, 0x0000ba58feb2703all, 0x000099d722dabde6ll, 0x0000bc15edfeed32ll,
+ 0x0000991f1a515886ll, 0x0000bdd0c7c9a817ll, 0x00009868c809868dll, 0x0000bf89910c1678ll,
+ 0x000097b425ed097cll, 0x0000c1404eadf383ll, 0x000097012e025c05ll, 0x0000c2f5058593d9ll,
+ 0x0000964fda6c0965ll, 0x0000c4a7ba58377cll, 0x000095a02568095bll, 0x0000c65871da59ddll,
+ 0x000094f2094f2095ll, 0x0000c80730b00016ll, 0x0000944580944581ll, 0x0000c9b3fb6d0559ll,
+ 0x0000939a85c4093all, 0x0000cb5ed69565afll, 0x000092f113840498ll, 0x0000cd07c69d8702ll,
+ 0x0000924924924925ll, 0x0000ceaecfea8085ll, 0x000091a2b3c4d5e7ll, 0x0000d053f6d26089ll,
+ 0x000090fdbc090fdcll, 0x0000d1f73f9c70c0ll, 0x0000905a38633e07ll, 0x0000d398ae817906ll,
+ 0x00008fb823ee08fcll, 0x0000d53847ac00a6ll, 0x00008f1779d9fdc4ll, 0x0000d6d60f388e41ll,
+ 0x00008e78356d1409ll, 0x0000d8720935e643ll, 0x00008dda5202376all, 0x0000da0c39a54804ll,
+ 0x00008d3dcb08d3ddll, 0x0000dba4a47aa996ll, 0x00008ca29c046515ll, 0x0000dd3b4d9cf24bll,
+ 0x00008c08c08c08c1ll, 0x0000ded038e633f3ll, 0x00008b70344a139cll, 0x0000e0636a23e2eell,
+ 0x00008ad8f2fba939ll, 0x0000e1f4e5170d02ll, 0x00008a42f870566all, 0x0000e384ad748f0ell,
+ 0x000089ae4089ae41ll, 0x0000e512c6e54998ll, 0x0000891ac73ae982ll, 0x0000e69f35065448ll,
+ 0x0000888888888889ll, 0x0000e829fb693044ll, 0x000087f78087f781ll, 0x0000e9b31d93f98ell,
+ 0x00008767ab5f34e5ll, 0x0000eb3a9f019750ll, 0x000086d905447a35ll, 0x0000ecc08321eb30ll,
+ 0x0000864b8a7de6d2ll, 0x0000ee44cd59ffabll, 0x000085bf37612cefll, 0x0000efc781043579ll,
+ 0x0000853408534086ll, 0x0000f148a170700all, 0x000084a9f9c8084bll, 0x0000f2c831e44116ll,
+ 0x0000842108421085ll, 0x0000f446359b1353ll, 0x0000839930523fbfll, 0x0000f5c2afc65447ll,
+ 0x000083126e978d50ll, 0x0000f73da38d9d4all, 0x0000828cbfbeb9a1ll, 0x0000f8b7140edbb1ll,
+ 0x0000820820820821ll, 0x0000fa2f045e7832ll, 0x000081848da8faf1ll, 0x0000fba577877d7dll,
+ 0x0000810204081021ll, 0x0000fd1a708bbe11ll, 0x0000808080808081ll, 0x0000fe8df263f957ll,
+ 0x0000800000000000ll, 0x0000ffff00000000ll,
+};
+
+/*
+ * LL_tbl[k] = 2^48*log2(1.0+k/2^15)
+ */
+static __s64 __LL_tbl[256] = {
+ 0x0000000000000000ull, 0x00000002e2a60a00ull, 0x000000070cb64ec5ull, 0x00000009ef50ce67ull,
+ 0x0000000cd1e588fdull, 0x0000000fb4747e9cull, 0x0000001296fdaf5eull, 0x0000001579811b58ull,
+ 0x000000185bfec2a1ull, 0x0000001b3e76a552ull, 0x0000001e20e8c380ull, 0x0000002103551d43ull,
+ 0x00000023e5bbb2b2ull, 0x00000026c81c83e4ull, 0x00000029aa7790f0ull, 0x0000002c8cccd9edull,
+ 0x0000002f6f1c5ef2ull, 0x0000003251662017ull, 0x0000003533aa1d71ull, 0x0000003815e8571aull,
+ 0x0000003af820cd26ull, 0x0000003dda537faeull, 0x00000040bc806ec8ull, 0x000000439ea79a8cull,
+ 0x0000004680c90310ull, 0x0000004962e4a86cull, 0x0000004c44fa8ab6ull, 0x0000004f270aaa06ull,
+ 0x0000005209150672ull, 0x00000054eb19a013ull, 0x00000057cd1876fdull, 0x0000005aaf118b4aull,
+ 0x0000005d9104dd0full, 0x0000006072f26c64ull, 0x0000006354da3960ull, 0x0000006636bc441aull,
+ 0x0000006918988ca8ull, 0x0000006bfa6f1322ull, 0x0000006edc3fd79full, 0x00000071be0ada35ull,
+ 0x000000749fd01afdull, 0x00000077818f9a0cull, 0x0000007a6349577aull, 0x0000007d44fd535eull,
+ 0x0000008026ab8dceull, 0x00000083085406e3ull, 0x00000085e9f6beb2ull, 0x00000088cb93b552ull,
+ 0x0000008bad2aeadcull, 0x0000008e8ebc5f65ull, 0x0000009170481305ull, 0x0000009451ce05d3ull,
+ 0x00000097334e37e5ull, 0x0000009a14c8a953ull, 0x0000009cf63d5a33ull, 0x0000009fd7ac4a9dull,
+ 0x000000a2b07f3458ull, 0x000000a59a78ea6aull, 0x000000a87bd699fbull, 0x000000ab5d2e8970ull,
+ 0x000000ae3e80b8e3ull, 0x000000b11fcd2869ull, 0x000000b40113d818ull, 0x000000b6e254c80aull,
+ 0x000000b9c38ff853ull, 0x000000bca4c5690cull, 0x000000bf85f51a4aull, 0x000000c2671f0c26ull,
+ 0x000000c548433eb6ull, 0x000000c82961b211ull, 0x000000cb0a7a664dull, 0x000000cdeb8d5b82ull,
+ 0x000000d0cc9a91c8ull, 0x000000d3ada20933ull, 0x000000d68ea3c1ddull, 0x000000d96f9fbbdbull,
+ 0x000000dc5095f744ull, 0x000000df31867430ull, 0x000000e2127132b5ull, 0x000000e4f35632eaull,
+ 0x000000e7d43574e6ull, 0x000000eab50ef8c1ull, 0x000000ed95e2be90ull, 0x000000f076b0c66cull,
+ 0x000000f35779106aull, 0x000000f6383b9ca2ull, 0x000000f918f86b2aull, 0x000000fbf9af7c1aull,
+ 0x000000feda60cf88ull, 0x00000101bb0c658cull, 0x000001049bb23e3cull, 0x000001077c5259afull,
+ 0x0000010a5cecb7fcull, 0x0000010d3d81593aull, 0x000001101e103d7full, 0x00000112fe9964e4ull,
+ 0x00000115df1ccf7eull, 0x00000118bf9a7d64ull, 0x0000011ba0126eadull, 0x0000011e8084a371ull,
+ 0x0000012160f11bc6ull, 0x000001244157d7c3ull, 0x0000012721b8d77full, 0x0000012a02141b10ull,
+ 0x0000012ce269a28eull, 0x0000012fc2b96e0full, 0x00000132a3037daaull, 0x000001358347d177ull,
+ 0x000001386386698cull, 0x0000013b43bf45ffull, 0x0000013e23f266e9ull, 0x00000141041fcc5eull,
+ 0x00000143e4477678ull, 0x00000146c469654bull, 0x00000149a48598f0ull, 0x0000014c849c117cull,
+ 0x0000014f64accf08ull, 0x0000015244b7d1a9ull, 0x0000015524bd1976ull, 0x0000015804bca687ull,
+ 0x0000015ae4b678f2ull, 0x0000015dc4aa90ceull, 0x00000160a498ee31ull, 0x0000016384819134ull,
+ 0x00000166646479ecull, 0x000001694441a870ull, 0x0000016c24191cd7ull, 0x0000016df6ca19bdull,
+ 0x00000171e3b6d7aaull, 0x00000174c37d1e44ull, 0x00000177a33dab1cull, 0x0000017a82f87e49ull,
+ 0x0000017d62ad97e2ull, 0x00000180425cf7feull, 0x00000182b07f3458ull, 0x0000018601aa8c19ull,
+ 0x00000188e148c046ull, 0x0000018bc0e13b52ull, 0x0000018ea073fd52ull, 0x000001918001065dull,
+ 0x000001945f88568bull, 0x000001973f09edf2ull, 0x0000019a1e85ccaaull, 0x0000019cfdfbf2c8ull,
+ 0x0000019fdd6c6063ull, 0x000001a2bcd71593ull, 0x000001a59c3c126eull, 0x000001a87b9b570bull,
+ 0x000001ab5af4e380ull, 0x000001ae3a48b7e5ull, 0x000001b11996d450ull, 0x000001b3f8df38d9ull,
+ 0x000001b6d821e595ull, 0x000001b9b75eda9bull, 0x000001bc96961803ull, 0x000001bf75c79de3ull,
+ 0x000001c254f36c51ull, 0x000001c534198365ull, 0x000001c81339e336ull, 0x000001caf2548bd9ull,
+ 0x000001cdd1697d67ull, 0x000001d0b078b7f5ull, 0x000001d38f823b9aull, 0x000001d66e86086dull,
+ 0x000001d94d841e86ull, 0x000001dc2c7c7df9ull, 0x000001df0b6f26dfull, 0x000001e1ea5c194eull,
+ 0x000001e4c943555dull, 0x000001e7a824db23ull, 0x000001ea8700aab5ull, 0x000001ed65d6c42bull,
+ 0x000001f044a7279dull, 0x000001f32371d51full, 0x000001f60236cccaull, 0x000001f8e0f60eb3ull,
+ 0x000001fbbfaf9af3ull, 0x000001fe9e63719eull, 0x000002017d1192ccull, 0x000002045bb9fe94ull,
+ 0x000002073a5cb50dull, 0x00000209c06e6212ull, 0x0000020cf791026aull, 0x0000020fd622997cull,
+ 0x00000212b07f3458ull, 0x000002159334a8d8ull, 0x0000021871b52150ull, 0x0000021b502fe517ull,
+ 0x0000021d6a73a78full, 0x000002210d144eeeull, 0x00000223eb7df52cull, 0x00000226c9e1e713ull,
+ 0x00000229a84024bbull, 0x0000022c23679b4eull, 0x0000022f64eb83a8ull, 0x000002324338a51bull,
+ 0x00000235218012a9ull, 0x00000237ffc1cc69ull, 0x0000023a2c3b0ea4ull, 0x0000023d13ee805bull,
+ 0x0000024035e9221full, 0x00000243788faf25ull, 0x0000024656b4e735ull, 0x00000247ed646bfeull,
+ 0x0000024c12ee3d98ull, 0x0000024ef1025c1aull, 0x00000251cf10c799ull, 0x0000025492644d65ull,
+ 0x000002578b1c85eeull, 0x0000025a6919d8f0ull, 0x0000025d13ee805bull, 0x0000026025036716ull,
+ 0x0000026296453882ull, 0x00000265e0d62b53ull, 0x00000268beb701f3ull, 0x0000026b9c92265eull,
+ 0x0000026d32f798a9ull, 0x00000271583758ebull, 0x000002743601673bull, 0x0000027713c5c3b0ull,
+ 0x00000279f1846e5full, 0x0000027ccf3d6761ull, 0x0000027e6580aecbull, 0x000002828a9e44b3ull,
+ 0x0000028568462932ull, 0x00000287bdbf5255ull, 0x0000028b2384de4aull, 0x0000028d13ee805bull,
+ 0x0000029035e9221full, 0x0000029296453882ull, 0x0000029699bdfb61ull, 0x0000029902a37aabull,
+ 0x0000029c54b864c9ull, 0x0000029deabd1083ull, 0x000002a20f9c0bb5ull, 0x000002a4c7605d61ull,
+ 0x000002a7bdbf5255ull, 0x000002a96056dafcull, 0x000002ac3daf14efull, 0x000002af1b019ecaull,
+ 0x000002b296453882ull, 0x000002b5d022d80full, 0x000002b8fa471cb3ull, 0x000002ba9012e713ull,
+ 0x000002bd6d4901ccull, 0x000002c04a796cf6ull, 0x000002c327a428a6ull, 0x000002c61a5e8f4cull,
+ 0x000002c8e1e891f6ull, 0x000002cbbf023fc2ull, 0x000002ce9c163e6eull, 0x000002d179248e13ull,
+ 0x000002d4562d2ec6ull, 0x000002d73330209dull, 0x000002da102d63b0ull, 0x000002dced24f814ull,
+};
+
+#endif
diff --git a/net/ceph/crush/hash.c b/net/ceph/crush/hash.c
new file mode 100644
index 000000000000..fe79f6d2d0db
--- /dev/null
+++ b/net/ceph/crush/hash.c
@@ -0,0 +1,152 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifdef __KERNEL__
+# include <linux/crush/hash.h>
+#else
+# include "hash.h"
+#endif
+
+/*
+ * Robert Jenkins' function for mixing 32-bit values
+ * https://burtleburtle.net/bob/hash/evahash.html
+ * a, b = random bits, c = input and output
+ */
+#define crush_hashmix(a, b, c) do { \
+ a = a-b; a = a-c; a = a^(c>>13); \
+ b = b-c; b = b-a; b = b^(a<<8); \
+ c = c-a; c = c-b; c = c^(b>>13); \
+ a = a-b; a = a-c; a = a^(c>>12); \
+ b = b-c; b = b-a; b = b^(a<<16); \
+ c = c-a; c = c-b; c = c^(b>>5); \
+ a = a-b; a = a-c; a = a^(c>>3); \
+ b = b-c; b = b-a; b = b^(a<<10); \
+ c = c-a; c = c-b; c = c^(b>>15); \
+ } while (0)
+
+#define crush_hash_seed 1315423911
+
+static __u32 crush_hash32_rjenkins1(__u32 a)
+{
+ __u32 hash = crush_hash_seed ^ a;
+ __u32 b = a;
+ __u32 x = 231232;
+ __u32 y = 1232;
+ crush_hashmix(b, x, hash);
+ crush_hashmix(y, a, hash);
+ return hash;
+}
+
+static __u32 crush_hash32_rjenkins1_2(__u32 a, __u32 b)
+{
+ __u32 hash = crush_hash_seed ^ a ^ b;
+ __u32 x = 231232;
+ __u32 y = 1232;
+ crush_hashmix(a, b, hash);
+ crush_hashmix(x, a, hash);
+ crush_hashmix(b, y, hash);
+ return hash;
+}
+
+static __u32 crush_hash32_rjenkins1_3(__u32 a, __u32 b, __u32 c)
+{
+ __u32 hash = crush_hash_seed ^ a ^ b ^ c;
+ __u32 x = 231232;
+ __u32 y = 1232;
+ crush_hashmix(a, b, hash);
+ crush_hashmix(c, x, hash);
+ crush_hashmix(y, a, hash);
+ crush_hashmix(b, x, hash);
+ crush_hashmix(y, c, hash);
+ return hash;
+}
+
+static __u32 crush_hash32_rjenkins1_4(__u32 a, __u32 b, __u32 c, __u32 d)
+{
+ __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d;
+ __u32 x = 231232;
+ __u32 y = 1232;
+ crush_hashmix(a, b, hash);
+ crush_hashmix(c, d, hash);
+ crush_hashmix(a, x, hash);
+ crush_hashmix(y, b, hash);
+ crush_hashmix(c, x, hash);
+ crush_hashmix(y, d, hash);
+ return hash;
+}
+
+static __u32 crush_hash32_rjenkins1_5(__u32 a, __u32 b, __u32 c, __u32 d,
+ __u32 e)
+{
+ __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d ^ e;
+ __u32 x = 231232;
+ __u32 y = 1232;
+ crush_hashmix(a, b, hash);
+ crush_hashmix(c, d, hash);
+ crush_hashmix(e, x, hash);
+ crush_hashmix(y, a, hash);
+ crush_hashmix(b, x, hash);
+ crush_hashmix(y, c, hash);
+ crush_hashmix(d, x, hash);
+ crush_hashmix(y, e, hash);
+ return hash;
+}
+
+
+__u32 crush_hash32(int type, __u32 a)
+{
+ switch (type) {
+ case CRUSH_HASH_RJENKINS1:
+ return crush_hash32_rjenkins1(a);
+ default:
+ return 0;
+ }
+}
+
+__u32 crush_hash32_2(int type, __u32 a, __u32 b)
+{
+ switch (type) {
+ case CRUSH_HASH_RJENKINS1:
+ return crush_hash32_rjenkins1_2(a, b);
+ default:
+ return 0;
+ }
+}
+
+__u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c)
+{
+ switch (type) {
+ case CRUSH_HASH_RJENKINS1:
+ return crush_hash32_rjenkins1_3(a, b, c);
+ default:
+ return 0;
+ }
+}
+
+__u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d)
+{
+ switch (type) {
+ case CRUSH_HASH_RJENKINS1:
+ return crush_hash32_rjenkins1_4(a, b, c, d);
+ default:
+ return 0;
+ }
+}
+
+__u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d, __u32 e)
+{
+ switch (type) {
+ case CRUSH_HASH_RJENKINS1:
+ return crush_hash32_rjenkins1_5(a, b, c, d, e);
+ default:
+ return 0;
+ }
+}
+
+const char *crush_hash_name(int type)
+{
+ switch (type) {
+ case CRUSH_HASH_RJENKINS1:
+ return "rjenkins1";
+ default:
+ return "unknown";
+ }
+}
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
new file mode 100644
index 000000000000..3a5bd1cd1e99
--- /dev/null
+++ b/net/ceph/crush/mapper.c
@@ -0,0 +1,1099 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Intel Corporation All Rights Reserved
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifdef __KERNEL__
+# include <linux/string.h>
+# include <linux/slab.h>
+# include <linux/bug.h>
+# include <linux/kernel.h>
+# include <linux/crush/crush.h>
+# include <linux/crush/hash.h>
+# include <linux/crush/mapper.h>
+#else
+# include "crush_compat.h"
+# include "crush.h"
+# include "hash.h"
+# include "mapper.h"
+#endif
+#include "crush_ln_table.h"
+
+#define dprintk(args...) /* printf(args) */
+
+/*
+ * Implement the core CRUSH mapping algorithm.
+ */
+
+/**
+ * crush_find_rule - find a crush_rule id for a given ruleset, type, and size.
+ * @map: the crush_map
+ * @ruleset: the storage ruleset id (user defined)
+ * @type: storage ruleset type (user defined)
+ * @size: output set size
+ */
+int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size)
+{
+ __u32 i;
+
+ for (i = 0; i < map->max_rules; i++) {
+ if (map->rules[i] &&
+ map->rules[i]->mask.ruleset == ruleset &&
+ map->rules[i]->mask.type == type &&
+ map->rules[i]->mask.min_size <= size &&
+ map->rules[i]->mask.max_size >= size)
+ return i;
+ }
+ return -1;
+}
+
+/*
+ * bucket choose methods
+ *
+ * For each bucket algorithm, we have a "choose" method that, given a
+ * crush input @x and replica position (usually, position in output set) @r,
+ * will produce an item in the bucket.
+ */
+
+/*
+ * Choose based on a random permutation of the bucket.
+ *
+ * We used to use some prime number arithmetic to do this, but it
+ * wasn't very random, and had some other bad behaviors. Instead, we
+ * calculate an actual random permutation of the bucket members.
+ * Since this is expensive, we optimize for the r=0 case, which
+ * captures the vast majority of calls.
+ */
+static int bucket_perm_choose(const struct crush_bucket *bucket,
+ struct crush_work_bucket *work,
+ int x, int r)
+{
+ unsigned int pr = r % bucket->size;
+ unsigned int i, s;
+
+ /* start a new permutation if @x has changed */
+ if (work->perm_x != (__u32)x || work->perm_n == 0) {
+ dprintk("bucket %d new x=%d\n", bucket->id, x);
+ work->perm_x = x;
+
+ /* optimize common r=0 case */
+ if (pr == 0) {
+ s = crush_hash32_3(bucket->hash, x, bucket->id, 0) %
+ bucket->size;
+ work->perm[0] = s;
+ work->perm_n = 0xffff; /* magic value, see below */
+ goto out;
+ }
+
+ for (i = 0; i < bucket->size; i++)
+ work->perm[i] = i;
+ work->perm_n = 0;
+ } else if (work->perm_n == 0xffff) {
+ /* clean up after the r=0 case above */
+ for (i = 1; i < bucket->size; i++)
+ work->perm[i] = i;
+ work->perm[work->perm[0]] = 0;
+ work->perm_n = 1;
+ }
+
+ /* calculate permutation up to pr */
+ for (i = 0; i < work->perm_n; i++)
+ dprintk(" perm_choose have %d: %d\n", i, work->perm[i]);
+ while (work->perm_n <= pr) {
+ unsigned int p = work->perm_n;
+ /* no point in swapping the final entry */
+ if (p < bucket->size - 1) {
+ i = crush_hash32_3(bucket->hash, x, bucket->id, p) %
+ (bucket->size - p);
+ if (i) {
+ unsigned int t = work->perm[p + i];
+ work->perm[p + i] = work->perm[p];
+ work->perm[p] = t;
+ }
+ dprintk(" perm_choose swap %d with %d\n", p, p+i);
+ }
+ work->perm_n++;
+ }
+ for (i = 0; i < bucket->size; i++)
+ dprintk(" perm_choose %d: %d\n", i, work->perm[i]);
+
+ s = work->perm[pr];
+out:
+ dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id,
+ bucket->size, x, r, pr, s);
+ return bucket->items[s];
+}
+
+/* uniform */
+static int bucket_uniform_choose(const struct crush_bucket_uniform *bucket,
+ struct crush_work_bucket *work, int x, int r)
+{
+ return bucket_perm_choose(&bucket->h, work, x, r);
+}
+
+/* list */
+static int bucket_list_choose(const struct crush_bucket_list *bucket,
+ int x, int r)
+{
+ int i;
+
+ for (i = bucket->h.size-1; i >= 0; i--) {
+ __u64 w = crush_hash32_4(bucket->h.hash, x, bucket->h.items[i],
+ r, bucket->h.id);
+ w &= 0xffff;
+ dprintk("list_choose i=%d x=%d r=%d item %d weight %x "
+ "sw %x rand %llx",
+ i, x, r, bucket->h.items[i], bucket->item_weights[i],
+ bucket->sum_weights[i], w);
+ w *= bucket->sum_weights[i];
+ w = w >> 16;
+ /*dprintk(" scaled %llx\n", w);*/
+ if (w < bucket->item_weights[i]) {
+ return bucket->h.items[i];
+ }
+ }
+
+ dprintk("bad list sums for bucket %d\n", bucket->h.id);
+ return bucket->h.items[0];
+}
+
+
+/* (binary) tree */
+static int height(int n)
+{
+ int h = 0;
+ while ((n & 1) == 0) {
+ h++;
+ n = n >> 1;
+ }
+ return h;
+}
+
+static int left(int x)
+{
+ int h = height(x);
+ return x - (1 << (h-1));
+}
+
+static int right(int x)
+{
+ int h = height(x);
+ return x + (1 << (h-1));
+}
+
+static int terminal(int x)
+{
+ return x & 1;
+}
+
+static int bucket_tree_choose(const struct crush_bucket_tree *bucket,
+ int x, int r)
+{
+ int n;
+ __u32 w;
+ __u64 t;
+
+ /* start at root */
+ n = bucket->num_nodes >> 1;
+
+ while (!terminal(n)) {
+ int l;
+ /* pick point in [0, w) */
+ w = bucket->node_weights[n];
+ t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r,
+ bucket->h.id) * (__u64)w;
+ t = t >> 32;
+
+ /* descend to the left or right? */
+ l = left(n);
+ if (t < bucket->node_weights[l])
+ n = l;
+ else
+ n = right(n);
+ }
+
+ return bucket->h.items[n >> 1];
+}
+
+
+/* straw */
+
+static int bucket_straw_choose(const struct crush_bucket_straw *bucket,
+ int x, int r)
+{
+ __u32 i;
+ int high = 0;
+ __u64 high_draw = 0;
+ __u64 draw;
+
+ for (i = 0; i < bucket->h.size; i++) {
+ draw = crush_hash32_3(bucket->h.hash, x, bucket->h.items[i], r);
+ draw &= 0xffff;
+ draw *= bucket->straws[i];
+ if (i == 0 || draw > high_draw) {
+ high = i;
+ high_draw = draw;
+ }
+ }
+ return bucket->h.items[high];
+}
+
+/* compute 2^44*log2(input+1) */
+static __u64 crush_ln(unsigned int xin)
+{
+ unsigned int x = xin;
+ int iexpon, index1, index2;
+ __u64 RH, LH, LL, xl64, result;
+
+ x++;
+
+ /* normalize input */
+ iexpon = 15;
+
+ /*
+ * figure out number of bits we need to shift and
+ * do it in one step instead of iteratively
+ */
+ if (!(x & 0x18000)) {
+ int bits = __builtin_clz(x & 0x1FFFF) - 16;
+ x <<= bits;
+ iexpon = 15 - bits;
+ }
+
+ index1 = (x >> 8) << 1;
+ /* RH ~ 2^56/index1 */
+ RH = __RH_LH_tbl[index1 - 256];
+ /* LH ~ 2^48 * log2(index1/256) */
+ LH = __RH_LH_tbl[index1 + 1 - 256];
+
+ /* RH*x ~ 2^48 * (2^15 + xf), xf<2^8 */
+ xl64 = (__s64)x * RH;
+ xl64 >>= 48;
+
+ result = iexpon;
+ result <<= (12 + 32);
+
+ index2 = xl64 & 0xff;
+ /* LL ~ 2^48*log2(1.0+index2/2^15) */
+ LL = __LL_tbl[index2];
+
+ LH = LH + LL;
+
+ LH >>= (48 - 12 - 32);
+ result += LH;
+
+ return result;
+}
+
+
+/*
+ * straw2
+ *
+ * for reference, see:
+ *
+ * https://en.wikipedia.org/wiki/Exponential_distribution#Distribution_of_the_minimum_of_exponential_random_variables
+ *
+ */
+
+static __u32 *get_choose_arg_weights(const struct crush_bucket_straw2 *bucket,
+ const struct crush_choose_arg *arg,
+ int position)
+{
+ if (!arg || !arg->weight_set)
+ return bucket->item_weights;
+
+ if (position >= arg->weight_set_size)
+ position = arg->weight_set_size - 1;
+ return arg->weight_set[position].weights;
+}
+
+static __s32 *get_choose_arg_ids(const struct crush_bucket_straw2 *bucket,
+ const struct crush_choose_arg *arg)
+{
+ if (!arg || !arg->ids)
+ return bucket->h.items;
+
+ return arg->ids;
+}
+
+static int bucket_straw2_choose(const struct crush_bucket_straw2 *bucket,
+ int x, int r,
+ const struct crush_choose_arg *arg,
+ int position)
+{
+ unsigned int i, high = 0;
+ unsigned int u;
+ __s64 ln, draw, high_draw = 0;
+ __u32 *weights = get_choose_arg_weights(bucket, arg, position);
+ __s32 *ids = get_choose_arg_ids(bucket, arg);
+
+ for (i = 0; i < bucket->h.size; i++) {
+ dprintk("weight 0x%x item %d\n", weights[i], ids[i]);
+ if (weights[i]) {
+ u = crush_hash32_3(bucket->h.hash, x, ids[i], r);
+ u &= 0xffff;
+
+ /*
+ * for some reason slightly less than 0x10000 produces
+ * a slightly more accurate distribution... probably a
+ * rounding effect.
+ *
+ * the natural log lookup table maps [0,0xffff]
+ * (corresponding to real numbers [1/0x10000, 1] to
+ * [0, 0xffffffffffff] (corresponding to real numbers
+ * [-11.090355,0]).
+ */
+ ln = crush_ln(u) - 0x1000000000000ll;
+
+ /*
+ * divide by 16.16 fixed-point weight. note
+ * that the ln value is negative, so a larger
+ * weight means a larger (less negative) value
+ * for draw.
+ */
+ draw = div64_s64(ln, weights[i]);
+ } else {
+ draw = S64_MIN;
+ }
+
+ if (i == 0 || draw > high_draw) {
+ high = i;
+ high_draw = draw;
+ }
+ }
+
+ return bucket->h.items[high];
+}
+
+
+static int crush_bucket_choose(const struct crush_bucket *in,
+ struct crush_work_bucket *work,
+ int x, int r,
+ const struct crush_choose_arg *arg,
+ int position)
+{
+ dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r);
+ BUG_ON(in->size == 0);
+ switch (in->alg) {
+ case CRUSH_BUCKET_UNIFORM:
+ return bucket_uniform_choose(
+ (const struct crush_bucket_uniform *)in,
+ work, x, r);
+ case CRUSH_BUCKET_LIST:
+ return bucket_list_choose((const struct crush_bucket_list *)in,
+ x, r);
+ case CRUSH_BUCKET_TREE:
+ return bucket_tree_choose((const struct crush_bucket_tree *)in,
+ x, r);
+ case CRUSH_BUCKET_STRAW:
+ return bucket_straw_choose(
+ (const struct crush_bucket_straw *)in,
+ x, r);
+ case CRUSH_BUCKET_STRAW2:
+ return bucket_straw2_choose(
+ (const struct crush_bucket_straw2 *)in,
+ x, r, arg, position);
+ default:
+ dprintk("unknown bucket %d alg %d\n", in->id, in->alg);
+ return in->items[0];
+ }
+}
+
+/*
+ * true if device is marked "out" (failed, fully offloaded)
+ * of the cluster
+ */
+static int is_out(const struct crush_map *map,
+ const __u32 *weight, int weight_max,
+ int item, int x)
+{
+ if (item >= weight_max)
+ return 1;
+ if (weight[item] >= 0x10000)
+ return 0;
+ if (weight[item] == 0)
+ return 1;
+ if ((crush_hash32_2(CRUSH_HASH_RJENKINS1, x, item) & 0xffff)
+ < weight[item])
+ return 0;
+ return 1;
+}
+
+/**
+ * crush_choose_firstn - choose numrep distinct items of given type
+ * @map: the crush_map
+ * @work: working space initialized by crush_init_workspace()
+ * @bucket: the bucket we are choose an item from
+ * @weight: weight vector (for map leaves)
+ * @weight_max: size of weight vector
+ * @x: crush input value
+ * @numrep: the number of items to choose
+ * @type: the type of item to choose
+ * @out: pointer to output vector
+ * @outpos: our position in that vector
+ * @out_size: size of the out vector
+ * @tries: number of attempts to make
+ * @recurse_tries: number of attempts to have recursive chooseleaf make
+ * @local_retries: localized retries
+ * @local_fallback_retries: localized fallback retries
+ * @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose)
+ * @stable: stable mode starts rep=0 in the recursive call for all replicas
+ * @vary_r: pass r to recursive calls
+ * @out2: second output vector for leaf items (if @recurse_to_leaf)
+ * @parent_r: r value passed from the parent
+ * @choose_args: weights and ids for each known bucket
+ */
+static int crush_choose_firstn(const struct crush_map *map,
+ struct crush_work *work,
+ const struct crush_bucket *bucket,
+ const __u32 *weight, int weight_max,
+ int x, int numrep, int type,
+ int *out, int outpos,
+ int out_size,
+ unsigned int tries,
+ unsigned int recurse_tries,
+ unsigned int local_retries,
+ unsigned int local_fallback_retries,
+ int recurse_to_leaf,
+ unsigned int vary_r,
+ unsigned int stable,
+ int *out2,
+ int parent_r,
+ const struct crush_choose_arg *choose_args)
+{
+ int rep;
+ unsigned int ftotal, flocal;
+ int retry_descent, retry_bucket, skip_rep;
+ const struct crush_bucket *in = bucket;
+ int r;
+ int i;
+ int item = 0;
+ int itemtype;
+ int collide, reject;
+ int count = out_size;
+
+ dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d stable %d\n",
+ recurse_to_leaf ? "_LEAF" : "",
+ bucket->id, x, outpos, numrep,
+ tries, recurse_tries, local_retries, local_fallback_retries,
+ parent_r, stable);
+
+ for (rep = stable ? 0 : outpos; rep < numrep && count > 0 ; rep++) {
+ /* keep trying until we get a non-out, non-colliding item */
+ ftotal = 0;
+ skip_rep = 0;
+ do {
+ retry_descent = 0;
+ in = bucket; /* initial bucket */
+
+ /* choose through intervening buckets */
+ flocal = 0;
+ do {
+ collide = 0;
+ retry_bucket = 0;
+ r = rep + parent_r;
+ /* r' = r + f_total */
+ r += ftotal;
+
+ /* bucket choose */
+ if (in->size == 0) {
+ reject = 1;
+ goto reject;
+ }
+ if (local_fallback_retries > 0 &&
+ flocal >= (in->size>>1) &&
+ flocal > local_fallback_retries)
+ item = bucket_perm_choose(
+ in, work->work[-1-in->id],
+ x, r);
+ else
+ item = crush_bucket_choose(
+ in, work->work[-1-in->id],
+ x, r,
+ (choose_args ?
+ &choose_args[-1-in->id] : NULL),
+ outpos);
+ if (item >= map->max_devices) {
+ dprintk(" bad item %d\n", item);
+ skip_rep = 1;
+ break;
+ }
+
+ /* desired type? */
+ if (item < 0)
+ itemtype = map->buckets[-1-item]->type;
+ else
+ itemtype = 0;
+ dprintk(" item %d type %d\n", item, itemtype);
+
+ /* keep going? */
+ if (itemtype != type) {
+ if (item >= 0 ||
+ (-1-item) >= map->max_buckets) {
+ dprintk(" bad item type %d\n", type);
+ skip_rep = 1;
+ break;
+ }
+ in = map->buckets[-1-item];
+ retry_bucket = 1;
+ continue;
+ }
+
+ /* collision? */
+ for (i = 0; i < outpos; i++) {
+ if (out[i] == item) {
+ collide = 1;
+ break;
+ }
+ }
+
+ reject = 0;
+ if (!collide && recurse_to_leaf) {
+ if (item < 0) {
+ int sub_r;
+ if (vary_r)
+ sub_r = r >> (vary_r-1);
+ else
+ sub_r = 0;
+ if (crush_choose_firstn(
+ map,
+ work,
+ map->buckets[-1-item],
+ weight, weight_max,
+ x, stable ? 1 : outpos+1, 0,
+ out2, outpos, count,
+ recurse_tries, 0,
+ local_retries,
+ local_fallback_retries,
+ 0,
+ vary_r,
+ stable,
+ NULL,
+ sub_r,
+ choose_args) <= outpos)
+ /* didn't get leaf */
+ reject = 1;
+ } else {
+ /* we already have a leaf! */
+ out2[outpos] = item;
+ }
+ }
+
+ if (!reject && !collide) {
+ /* out? */
+ if (itemtype == 0)
+ reject = is_out(map, weight,
+ weight_max,
+ item, x);
+ }
+
+reject:
+ if (reject || collide) {
+ ftotal++;
+ flocal++;
+
+ if (collide && flocal <= local_retries)
+ /* retry locally a few times */
+ retry_bucket = 1;
+ else if (local_fallback_retries > 0 &&
+ flocal <= in->size + local_fallback_retries)
+ /* exhaustive bucket search */
+ retry_bucket = 1;
+ else if (ftotal < tries)
+ /* then retry descent */
+ retry_descent = 1;
+ else
+ /* else give up */
+ skip_rep = 1;
+ dprintk(" reject %d collide %d "
+ "ftotal %u flocal %u\n",
+ reject, collide, ftotal,
+ flocal);
+ }
+ } while (retry_bucket);
+ } while (retry_descent);
+
+ if (skip_rep) {
+ dprintk("skip rep\n");
+ continue;
+ }
+
+ dprintk("CHOOSE got %d\n", item);
+ out[outpos] = item;
+ outpos++;
+ count--;
+#ifndef __KERNEL__
+ if (map->choose_tries && ftotal <= map->choose_total_tries)
+ map->choose_tries[ftotal]++;
+#endif
+ }
+
+ dprintk("CHOOSE returns %d\n", outpos);
+ return outpos;
+}
+
+
+/*
+ * crush_choose_indep: alternative breadth-first positionally stable mapping
+ */
+static void crush_choose_indep(const struct crush_map *map,
+ struct crush_work *work,
+ const struct crush_bucket *bucket,
+ const __u32 *weight, int weight_max,
+ int x, int left, int numrep, int type,
+ int *out, int outpos,
+ unsigned int tries,
+ unsigned int recurse_tries,
+ int recurse_to_leaf,
+ int *out2,
+ int parent_r,
+ const struct crush_choose_arg *choose_args)
+{
+ const struct crush_bucket *in = bucket;
+ int endpos = outpos + left;
+ int rep;
+ unsigned int ftotal;
+ int r;
+ int i;
+ int item = 0;
+ int itemtype;
+ int collide;
+
+ dprintk("CHOOSE%s INDEP bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "",
+ bucket->id, x, outpos, numrep);
+
+ /* initially my result is undefined */
+ for (rep = outpos; rep < endpos; rep++) {
+ out[rep] = CRUSH_ITEM_UNDEF;
+ if (out2)
+ out2[rep] = CRUSH_ITEM_UNDEF;
+ }
+
+ for (ftotal = 0; left > 0 && ftotal < tries; ftotal++) {
+#ifdef DEBUG_INDEP
+ if (out2 && ftotal) {
+ dprintk("%u %d a: ", ftotal, left);
+ for (rep = outpos; rep < endpos; rep++) {
+ dprintk(" %d", out[rep]);
+ }
+ dprintk("\n");
+ dprintk("%u %d b: ", ftotal, left);
+ for (rep = outpos; rep < endpos; rep++) {
+ dprintk(" %d", out2[rep]);
+ }
+ dprintk("\n");
+ }
+#endif
+ for (rep = outpos; rep < endpos; rep++) {
+ if (out[rep] != CRUSH_ITEM_UNDEF)
+ continue;
+
+ in = bucket; /* initial bucket */
+
+ /* choose through intervening buckets */
+ for (;;) {
+ /* note: we base the choice on the position
+ * even in the nested call. that means that
+ * if the first layer chooses the same bucket
+ * in a different position, we will tend to
+ * choose a different item in that bucket.
+ * this will involve more devices in data
+ * movement and tend to distribute the load.
+ */
+ r = rep + parent_r;
+
+ /* be careful */
+ if (in->alg == CRUSH_BUCKET_UNIFORM &&
+ in->size % numrep == 0)
+ /* r'=r+(n+1)*f_total */
+ r += (numrep+1) * ftotal;
+ else
+ /* r' = r + n*f_total */
+ r += numrep * ftotal;
+
+ /* bucket choose */
+ if (in->size == 0) {
+ dprintk(" empty bucket\n");
+ break;
+ }
+
+ item = crush_bucket_choose(
+ in, work->work[-1-in->id],
+ x, r,
+ (choose_args ?
+ &choose_args[-1-in->id] : NULL),
+ outpos);
+ if (item >= map->max_devices) {
+ dprintk(" bad item %d\n", item);
+ out[rep] = CRUSH_ITEM_NONE;
+ if (out2)
+ out2[rep] = CRUSH_ITEM_NONE;
+ left--;
+ break;
+ }
+
+ /* desired type? */
+ if (item < 0)
+ itemtype = map->buckets[-1-item]->type;
+ else
+ itemtype = 0;
+ dprintk(" item %d type %d\n", item, itemtype);
+
+ /* keep going? */
+ if (itemtype != type) {
+ if (item >= 0 ||
+ (-1-item) >= map->max_buckets) {
+ dprintk(" bad item type %d\n", type);
+ out[rep] = CRUSH_ITEM_NONE;
+ if (out2)
+ out2[rep] =
+ CRUSH_ITEM_NONE;
+ left--;
+ break;
+ }
+ in = map->buckets[-1-item];
+ continue;
+ }
+
+ /* collision? */
+ collide = 0;
+ for (i = outpos; i < endpos; i++) {
+ if (out[i] == item) {
+ collide = 1;
+ break;
+ }
+ }
+ if (collide)
+ break;
+
+ if (recurse_to_leaf) {
+ if (item < 0) {
+ crush_choose_indep(
+ map,
+ work,
+ map->buckets[-1-item],
+ weight, weight_max,
+ x, 1, numrep, 0,
+ out2, rep,
+ recurse_tries, 0,
+ 0, NULL, r,
+ choose_args);
+ if (out2[rep] == CRUSH_ITEM_NONE) {
+ /* placed nothing; no leaf */
+ break;
+ }
+ } else {
+ /* we already have a leaf! */
+ out2[rep] = item;
+ }
+ }
+
+ /* out? */
+ if (itemtype == 0 &&
+ is_out(map, weight, weight_max, item, x))
+ break;
+
+ /* yay! */
+ out[rep] = item;
+ left--;
+ break;
+ }
+ }
+ }
+ for (rep = outpos; rep < endpos; rep++) {
+ if (out[rep] == CRUSH_ITEM_UNDEF) {
+ out[rep] = CRUSH_ITEM_NONE;
+ }
+ if (out2 && out2[rep] == CRUSH_ITEM_UNDEF) {
+ out2[rep] = CRUSH_ITEM_NONE;
+ }
+ }
+#ifndef __KERNEL__
+ if (map->choose_tries && ftotal <= map->choose_total_tries)
+ map->choose_tries[ftotal]++;
+#endif
+#ifdef DEBUG_INDEP
+ if (out2) {
+ dprintk("%u %d a: ", ftotal, left);
+ for (rep = outpos; rep < endpos; rep++) {
+ dprintk(" %d", out[rep]);
+ }
+ dprintk("\n");
+ dprintk("%u %d b: ", ftotal, left);
+ for (rep = outpos; rep < endpos; rep++) {
+ dprintk(" %d", out2[rep]);
+ }
+ dprintk("\n");
+ }
+#endif
+}
+
+
+/*
+ * This takes a chunk of memory and sets it up to be a shiny new
+ * working area for a CRUSH placement computation. It must be called
+ * on any newly allocated memory before passing it in to
+ * crush_do_rule. It may be used repeatedly after that, so long as the
+ * map has not changed. If the map /has/ changed, you must make sure
+ * the working size is no smaller than what was allocated and re-run
+ * crush_init_workspace.
+ *
+ * If you do retain the working space between calls to crush, make it
+ * thread-local.
+ */
+void crush_init_workspace(const struct crush_map *map, void *v)
+{
+ struct crush_work *w = v;
+ __s32 b;
+
+ /*
+ * We work by moving through the available space and setting
+ * values and pointers as we go.
+ *
+ * It's a bit like Forth's use of the 'allot' word since we
+ * set the pointer first and then reserve the space for it to
+ * point to by incrementing the point.
+ */
+ v += sizeof(struct crush_work);
+ w->work = v;
+ v += map->max_buckets * sizeof(struct crush_work_bucket *);
+ for (b = 0; b < map->max_buckets; ++b) {
+ if (!map->buckets[b])
+ continue;
+
+ w->work[b] = v;
+ switch (map->buckets[b]->alg) {
+ default:
+ v += sizeof(struct crush_work_bucket);
+ break;
+ }
+ w->work[b]->perm_x = 0;
+ w->work[b]->perm_n = 0;
+ w->work[b]->perm = v;
+ v += map->buckets[b]->size * sizeof(__u32);
+ }
+ BUG_ON(v - (void *)w != map->working_size);
+}
+
+/**
+ * crush_do_rule - calculate a mapping with the given input and rule
+ * @map: the crush_map
+ * @ruleno: the rule id
+ * @x: hash input
+ * @result: pointer to result vector
+ * @result_max: maximum result size
+ * @weight: weight vector (for map leaves)
+ * @weight_max: size of weight vector
+ * @cwin: pointer to at least crush_work_size() bytes of memory
+ * @choose_args: weights and ids for each known bucket
+ */
+int crush_do_rule(const struct crush_map *map,
+ int ruleno, int x, int *result, int result_max,
+ const __u32 *weight, int weight_max,
+ void *cwin, const struct crush_choose_arg *choose_args)
+{
+ int result_len;
+ struct crush_work *cw = cwin;
+ int *a = cwin + map->working_size;
+ int *b = a + result_max;
+ int *c = b + result_max;
+ int *w = a;
+ int *o = b;
+ int recurse_to_leaf;
+ int wsize = 0;
+ int osize;
+ const struct crush_rule *rule;
+ __u32 step;
+ int i, j;
+ int numrep;
+ int out_size;
+ /*
+ * the original choose_total_tries value was off by one (it
+ * counted "retries" and not "tries"). add one.
+ */
+ int choose_tries = map->choose_total_tries + 1;
+ int choose_leaf_tries = 0;
+ /*
+ * the local tries values were counted as "retries", though,
+ * and need no adjustment
+ */
+ int choose_local_retries = map->choose_local_tries;
+ int choose_local_fallback_retries = map->choose_local_fallback_tries;
+
+ int vary_r = map->chooseleaf_vary_r;
+ int stable = map->chooseleaf_stable;
+
+ if ((__u32)ruleno >= map->max_rules) {
+ dprintk(" bad ruleno %d\n", ruleno);
+ return 0;
+ }
+
+ rule = map->rules[ruleno];
+ result_len = 0;
+
+ for (step = 0; step < rule->len; step++) {
+ int firstn = 0;
+ const struct crush_rule_step *curstep = &rule->steps[step];
+
+ switch (curstep->op) {
+ case CRUSH_RULE_TAKE:
+ if ((curstep->arg1 >= 0 &&
+ curstep->arg1 < map->max_devices) ||
+ (-1-curstep->arg1 >= 0 &&
+ -1-curstep->arg1 < map->max_buckets &&
+ map->buckets[-1-curstep->arg1])) {
+ w[0] = curstep->arg1;
+ wsize = 1;
+ } else {
+ dprintk(" bad take value %d\n", curstep->arg1);
+ }
+ break;
+
+ case CRUSH_RULE_SET_CHOOSE_TRIES:
+ if (curstep->arg1 > 0)
+ choose_tries = curstep->arg1;
+ break;
+
+ case CRUSH_RULE_SET_CHOOSELEAF_TRIES:
+ if (curstep->arg1 > 0)
+ choose_leaf_tries = curstep->arg1;
+ break;
+
+ case CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES:
+ if (curstep->arg1 >= 0)
+ choose_local_retries = curstep->arg1;
+ break;
+
+ case CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES:
+ if (curstep->arg1 >= 0)
+ choose_local_fallback_retries = curstep->arg1;
+ break;
+
+ case CRUSH_RULE_SET_CHOOSELEAF_VARY_R:
+ if (curstep->arg1 >= 0)
+ vary_r = curstep->arg1;
+ break;
+
+ case CRUSH_RULE_SET_CHOOSELEAF_STABLE:
+ if (curstep->arg1 >= 0)
+ stable = curstep->arg1;
+ break;
+
+ case CRUSH_RULE_CHOOSELEAF_FIRSTN:
+ case CRUSH_RULE_CHOOSE_FIRSTN:
+ firstn = 1;
+ fallthrough;
+ case CRUSH_RULE_CHOOSELEAF_INDEP:
+ case CRUSH_RULE_CHOOSE_INDEP:
+ if (wsize == 0)
+ break;
+
+ recurse_to_leaf =
+ curstep->op ==
+ CRUSH_RULE_CHOOSELEAF_FIRSTN ||
+ curstep->op ==
+ CRUSH_RULE_CHOOSELEAF_INDEP;
+
+ /* reset output */
+ osize = 0;
+
+ for (i = 0; i < wsize; i++) {
+ int bno;
+ numrep = curstep->arg1;
+ if (numrep <= 0) {
+ numrep += result_max;
+ if (numrep <= 0)
+ continue;
+ }
+ j = 0;
+ /* make sure bucket id is valid */
+ bno = -1 - w[i];
+ if (bno < 0 || bno >= map->max_buckets) {
+ /* w[i] is probably CRUSH_ITEM_NONE */
+ dprintk(" bad w[i] %d\n", w[i]);
+ continue;
+ }
+ if (firstn) {
+ int recurse_tries;
+ if (choose_leaf_tries)
+ recurse_tries =
+ choose_leaf_tries;
+ else if (map->chooseleaf_descend_once)
+ recurse_tries = 1;
+ else
+ recurse_tries = choose_tries;
+ osize += crush_choose_firstn(
+ map,
+ cw,
+ map->buckets[bno],
+ weight, weight_max,
+ x, numrep,
+ curstep->arg2,
+ o+osize, j,
+ result_max-osize,
+ choose_tries,
+ recurse_tries,
+ choose_local_retries,
+ choose_local_fallback_retries,
+ recurse_to_leaf,
+ vary_r,
+ stable,
+ c+osize,
+ 0,
+ choose_args);
+ } else {
+ out_size = ((numrep < (result_max-osize)) ?
+ numrep : (result_max-osize));
+ crush_choose_indep(
+ map,
+ cw,
+ map->buckets[bno],
+ weight, weight_max,
+ x, out_size, numrep,
+ curstep->arg2,
+ o+osize, j,
+ choose_tries,
+ choose_leaf_tries ?
+ choose_leaf_tries : 1,
+ recurse_to_leaf,
+ c+osize,
+ 0,
+ choose_args);
+ osize += out_size;
+ }
+ }
+
+ if (recurse_to_leaf)
+ /* copy final _leaf_ values to output set */
+ memcpy(o, c, osize*sizeof(*o));
+
+ /* swap o and w arrays */
+ swap(o, w);
+ wsize = osize;
+ break;
+
+
+ case CRUSH_RULE_EMIT:
+ for (i = 0; i < wsize && result_len < result_max; i++) {
+ result[result_len] = w[i];
+ result_len++;
+ }
+ wsize = 0;
+ break;
+
+ default:
+ dprintk(" unknown op %d at step %d\n",
+ curstep->op, step);
+ break;
+ }
+ }
+
+ return result_len;
+}
diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c
new file mode 100644
index 000000000000..01b2ce1e8fc0
--- /dev/null
+++ b/net/ceph/crypto.c
@@ -0,0 +1,349 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/err.h>
+#include <linux/scatterlist.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <crypto/aes.h>
+#include <crypto/skcipher.h>
+#include <linux/key-type.h>
+#include <linux/sched/mm.h>
+
+#include <keys/ceph-type.h>
+#include <keys/user-type.h>
+#include <linux/ceph/decode.h>
+#include "crypto.h"
+
+/*
+ * Set ->key and ->tfm. The rest of the key should be filled in before
+ * this function is called.
+ */
+static int set_secret(struct ceph_crypto_key *key, void *buf)
+{
+ unsigned int noio_flag;
+ int ret;
+
+ key->key = NULL;
+ key->tfm = NULL;
+
+ switch (key->type) {
+ case CEPH_CRYPTO_NONE:
+ return 0; /* nothing to do */
+ case CEPH_CRYPTO_AES:
+ break;
+ default:
+ return -ENOTSUPP;
+ }
+
+ if (!key->len)
+ return -EINVAL;
+
+ key->key = kmemdup(buf, key->len, GFP_NOIO);
+ if (!key->key) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+
+ /* crypto_alloc_sync_skcipher() allocates with GFP_KERNEL */
+ noio_flag = memalloc_noio_save();
+ key->tfm = crypto_alloc_sync_skcipher("cbc(aes)", 0, 0);
+ memalloc_noio_restore(noio_flag);
+ if (IS_ERR(key->tfm)) {
+ ret = PTR_ERR(key->tfm);
+ key->tfm = NULL;
+ goto fail;
+ }
+
+ ret = crypto_sync_skcipher_setkey(key->tfm, key->key, key->len);
+ if (ret)
+ goto fail;
+
+ return 0;
+
+fail:
+ ceph_crypto_key_destroy(key);
+ return ret;
+}
+
+int ceph_crypto_key_clone(struct ceph_crypto_key *dst,
+ const struct ceph_crypto_key *src)
+{
+ memcpy(dst, src, sizeof(struct ceph_crypto_key));
+ return set_secret(dst, src->key);
+}
+
+int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end)
+{
+ int ret;
+
+ ceph_decode_need(p, end, 2*sizeof(u16) + sizeof(key->created), bad);
+ key->type = ceph_decode_16(p);
+ ceph_decode_copy(p, &key->created, sizeof(key->created));
+ key->len = ceph_decode_16(p);
+ ceph_decode_need(p, end, key->len, bad);
+ ret = set_secret(key, *p);
+ memzero_explicit(*p, key->len);
+ *p += key->len;
+ return ret;
+
+bad:
+ dout("failed to decode crypto key\n");
+ return -EINVAL;
+}
+
+int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *inkey)
+{
+ int inlen = strlen(inkey);
+ int blen = inlen * 3 / 4;
+ void *buf, *p;
+ int ret;
+
+ dout("crypto_key_unarmor %s\n", inkey);
+ buf = kmalloc(blen, GFP_NOFS);
+ if (!buf)
+ return -ENOMEM;
+ blen = ceph_unarmor(buf, inkey, inkey+inlen);
+ if (blen < 0) {
+ kfree(buf);
+ return blen;
+ }
+
+ p = buf;
+ ret = ceph_crypto_key_decode(key, &p, p + blen);
+ kfree(buf);
+ if (ret)
+ return ret;
+ dout("crypto_key_unarmor key %p type %d len %d\n", key,
+ key->type, key->len);
+ return 0;
+}
+
+void ceph_crypto_key_destroy(struct ceph_crypto_key *key)
+{
+ if (key) {
+ kfree_sensitive(key->key);
+ key->key = NULL;
+ if (key->tfm) {
+ crypto_free_sync_skcipher(key->tfm);
+ key->tfm = NULL;
+ }
+ }
+}
+
+static const u8 *aes_iv = (u8 *)CEPH_AES_IV;
+
+/*
+ * Should be used for buffers allocated with kvmalloc().
+ * Currently these are encrypt out-buffer (ceph_buffer) and decrypt
+ * in-buffer (msg front).
+ *
+ * Dispose of @sgt with teardown_sgtable().
+ *
+ * @prealloc_sg is to avoid memory allocation inside sg_alloc_table()
+ * in cases where a single sg is sufficient. No attempt to reduce the
+ * number of sgs by squeezing physically contiguous pages together is
+ * made though, for simplicity.
+ */
+static int setup_sgtable(struct sg_table *sgt, struct scatterlist *prealloc_sg,
+ const void *buf, unsigned int buf_len)
+{
+ struct scatterlist *sg;
+ const bool is_vmalloc = is_vmalloc_addr(buf);
+ unsigned int off = offset_in_page(buf);
+ unsigned int chunk_cnt = 1;
+ unsigned int chunk_len = PAGE_ALIGN(off + buf_len);
+ int i;
+ int ret;
+
+ if (buf_len == 0) {
+ memset(sgt, 0, sizeof(*sgt));
+ return -EINVAL;
+ }
+
+ if (is_vmalloc) {
+ chunk_cnt = chunk_len >> PAGE_SHIFT;
+ chunk_len = PAGE_SIZE;
+ }
+
+ if (chunk_cnt > 1) {
+ ret = sg_alloc_table(sgt, chunk_cnt, GFP_NOFS);
+ if (ret)
+ return ret;
+ } else {
+ WARN_ON(chunk_cnt != 1);
+ sg_init_table(prealloc_sg, 1);
+ sgt->sgl = prealloc_sg;
+ sgt->nents = sgt->orig_nents = 1;
+ }
+
+ for_each_sg(sgt->sgl, sg, sgt->orig_nents, i) {
+ struct page *page;
+ unsigned int len = min(chunk_len - off, buf_len);
+
+ if (is_vmalloc)
+ page = vmalloc_to_page(buf);
+ else
+ page = virt_to_page(buf);
+
+ sg_set_page(sg, page, len, off);
+
+ off = 0;
+ buf += len;
+ buf_len -= len;
+ }
+ WARN_ON(buf_len != 0);
+
+ return 0;
+}
+
+static void teardown_sgtable(struct sg_table *sgt)
+{
+ if (sgt->orig_nents > 1)
+ sg_free_table(sgt);
+}
+
+static int ceph_aes_crypt(const struct ceph_crypto_key *key, bool encrypt,
+ void *buf, int buf_len, int in_len, int *pout_len)
+{
+ SYNC_SKCIPHER_REQUEST_ON_STACK(req, key->tfm);
+ struct sg_table sgt;
+ struct scatterlist prealloc_sg;
+ char iv[AES_BLOCK_SIZE] __aligned(8);
+ int pad_byte = AES_BLOCK_SIZE - (in_len & (AES_BLOCK_SIZE - 1));
+ int crypt_len = encrypt ? in_len + pad_byte : in_len;
+ int ret;
+
+ WARN_ON(crypt_len > buf_len);
+ if (encrypt)
+ memset(buf + in_len, pad_byte, pad_byte);
+ ret = setup_sgtable(&sgt, &prealloc_sg, buf, crypt_len);
+ if (ret)
+ return ret;
+
+ memcpy(iv, aes_iv, AES_BLOCK_SIZE);
+ skcipher_request_set_sync_tfm(req, key->tfm);
+ skcipher_request_set_callback(req, 0, NULL, NULL);
+ skcipher_request_set_crypt(req, sgt.sgl, sgt.sgl, crypt_len, iv);
+
+ /*
+ print_hex_dump(KERN_ERR, "key: ", DUMP_PREFIX_NONE, 16, 1,
+ key->key, key->len, 1);
+ print_hex_dump(KERN_ERR, " in: ", DUMP_PREFIX_NONE, 16, 1,
+ buf, crypt_len, 1);
+ */
+ if (encrypt)
+ ret = crypto_skcipher_encrypt(req);
+ else
+ ret = crypto_skcipher_decrypt(req);
+ skcipher_request_zero(req);
+ if (ret) {
+ pr_err("%s %scrypt failed: %d\n", __func__,
+ encrypt ? "en" : "de", ret);
+ goto out_sgt;
+ }
+ /*
+ print_hex_dump(KERN_ERR, "out: ", DUMP_PREFIX_NONE, 16, 1,
+ buf, crypt_len, 1);
+ */
+
+ if (encrypt) {
+ *pout_len = crypt_len;
+ } else {
+ pad_byte = *(char *)(buf + in_len - 1);
+ if (pad_byte > 0 && pad_byte <= AES_BLOCK_SIZE &&
+ in_len >= pad_byte) {
+ *pout_len = in_len - pad_byte;
+ } else {
+ pr_err("%s got bad padding %d on in_len %d\n",
+ __func__, pad_byte, in_len);
+ ret = -EPERM;
+ goto out_sgt;
+ }
+ }
+
+out_sgt:
+ teardown_sgtable(&sgt);
+ return ret;
+}
+
+int ceph_crypt(const struct ceph_crypto_key *key, bool encrypt,
+ void *buf, int buf_len, int in_len, int *pout_len)
+{
+ switch (key->type) {
+ case CEPH_CRYPTO_NONE:
+ *pout_len = in_len;
+ return 0;
+ case CEPH_CRYPTO_AES:
+ return ceph_aes_crypt(key, encrypt, buf, buf_len, in_len,
+ pout_len);
+ default:
+ return -ENOTSUPP;
+ }
+}
+
+static int ceph_key_preparse(struct key_preparsed_payload *prep)
+{
+ struct ceph_crypto_key *ckey;
+ size_t datalen = prep->datalen;
+ int ret;
+ void *p;
+
+ ret = -EINVAL;
+ if (datalen <= 0 || datalen > 32767 || !prep->data)
+ goto err;
+
+ ret = -ENOMEM;
+ ckey = kmalloc(sizeof(*ckey), GFP_KERNEL);
+ if (!ckey)
+ goto err;
+
+ /* TODO ceph_crypto_key_decode should really take const input */
+ p = (void *)prep->data;
+ ret = ceph_crypto_key_decode(ckey, &p, (char*)prep->data+datalen);
+ if (ret < 0)
+ goto err_ckey;
+
+ prep->payload.data[0] = ckey;
+ prep->quotalen = datalen;
+ return 0;
+
+err_ckey:
+ kfree(ckey);
+err:
+ return ret;
+}
+
+static void ceph_key_free_preparse(struct key_preparsed_payload *prep)
+{
+ struct ceph_crypto_key *ckey = prep->payload.data[0];
+ ceph_crypto_key_destroy(ckey);
+ kfree(ckey);
+}
+
+static void ceph_key_destroy(struct key *key)
+{
+ struct ceph_crypto_key *ckey = key->payload.data[0];
+
+ ceph_crypto_key_destroy(ckey);
+ kfree(ckey);
+}
+
+struct key_type key_type_ceph = {
+ .name = "ceph",
+ .preparse = ceph_key_preparse,
+ .free_preparse = ceph_key_free_preparse,
+ .instantiate = generic_key_instantiate,
+ .destroy = ceph_key_destroy,
+};
+
+int __init ceph_crypto_init(void)
+{
+ return register_key_type(&key_type_ceph);
+}
+
+void ceph_crypto_shutdown(void)
+{
+ unregister_key_type(&key_type_ceph);
+}
diff --git a/net/ceph/crypto.h b/net/ceph/crypto.h
new file mode 100644
index 000000000000..23de29fc613c
--- /dev/null
+++ b/net/ceph/crypto.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _FS_CEPH_CRYPTO_H
+#define _FS_CEPH_CRYPTO_H
+
+#include <linux/ceph/types.h>
+#include <linux/ceph/buffer.h>
+
+#define CEPH_KEY_LEN 16
+#define CEPH_MAX_CON_SECRET_LEN 64
+
+/*
+ * cryptographic secret
+ */
+struct ceph_crypto_key {
+ int type;
+ struct ceph_timespec created;
+ int len;
+ void *key;
+ struct crypto_sync_skcipher *tfm;
+};
+
+int ceph_crypto_key_clone(struct ceph_crypto_key *dst,
+ const struct ceph_crypto_key *src);
+int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end);
+int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *in);
+void ceph_crypto_key_destroy(struct ceph_crypto_key *key);
+
+/* crypto.c */
+int ceph_crypt(const struct ceph_crypto_key *key, bool encrypt,
+ void *buf, int buf_len, int in_len, int *pout_len);
+int ceph_crypto_init(void);
+void ceph_crypto_shutdown(void);
+
+/* armor.c */
+int ceph_armor(char *dst, const char *src, const char *end);
+int ceph_unarmor(char *dst, const char *src, const char *end);
+
+#endif
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
new file mode 100644
index 000000000000..83c270bce63c
--- /dev/null
+++ b/net/ceph/debugfs.c
@@ -0,0 +1,484 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/device.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/ctype.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/mon_client.h>
+#include <linux/ceph/auth.h>
+#include <linux/ceph/debugfs.h>
+
+#ifdef CONFIG_DEBUG_FS
+
+/*
+ * Implement /sys/kernel/debug/ceph fun
+ *
+ * /sys/kernel/debug/ceph/client* - an instance of the ceph client
+ * .../osdmap - current osdmap
+ * .../monmap - current monmap
+ * .../osdc - active osd requests
+ * .../monc - mon client state
+ * .../client_options - libceph-only (i.e. not rbd or cephfs) options
+ * .../dentry_lru - dump contents of dentry lru
+ * .../caps - expose cap (reservation) stats
+ * .../bdi - symlink to ../../bdi/something
+ */
+
+static struct dentry *ceph_debugfs_dir;
+
+static int monmap_show(struct seq_file *s, void *p)
+{
+ int i;
+ struct ceph_client *client = s->private;
+
+ mutex_lock(&client->monc.mutex);
+ if (client->monc.monmap == NULL)
+ goto out_unlock;
+
+ seq_printf(s, "epoch %d\n", client->monc.monmap->epoch);
+ for (i = 0; i < client->monc.monmap->num_mon; i++) {
+ struct ceph_entity_inst *inst =
+ &client->monc.monmap->mon_inst[i];
+
+ seq_printf(s, "\t%s%lld\t%s\n",
+ ENTITY_NAME(inst->name),
+ ceph_pr_addr(&inst->addr));
+ }
+
+out_unlock:
+ mutex_unlock(&client->monc.mutex);
+ return 0;
+}
+
+static int osdmap_show(struct seq_file *s, void *p)
+{
+ int i;
+ struct ceph_client *client = s->private;
+ struct ceph_osd_client *osdc = &client->osdc;
+ struct ceph_osdmap *map;
+ struct rb_node *n;
+
+ down_read(&osdc->lock);
+ map = osdc->osdmap;
+ if (map == NULL)
+ goto out_unlock;
+
+ seq_printf(s, "epoch %u barrier %u flags 0x%x\n", map->epoch,
+ osdc->epoch_barrier, map->flags);
+
+ for (n = rb_first(&map->pg_pools); n; n = rb_next(n)) {
+ struct ceph_pg_pool_info *pi =
+ rb_entry(n, struct ceph_pg_pool_info, node);
+
+ seq_printf(s, "pool %lld '%s' type %d size %d min_size %d pg_num %u pg_num_mask %d flags 0x%llx lfor %u read_tier %lld write_tier %lld\n",
+ pi->id, pi->name, pi->type, pi->size, pi->min_size,
+ pi->pg_num, pi->pg_num_mask, pi->flags,
+ pi->last_force_request_resend, pi->read_tier,
+ pi->write_tier);
+ }
+ for (i = 0; i < map->max_osd; i++) {
+ struct ceph_entity_addr *addr = &map->osd_addr[i];
+ u32 state = map->osd_state[i];
+ char sb[64];
+
+ seq_printf(s, "osd%d\t%s\t%3d%%\t(%s)\t%3d%%\t%2d\n",
+ i, ceph_pr_addr(addr),
+ ((map->osd_weight[i]*100) >> 16),
+ ceph_osdmap_state_str(sb, sizeof(sb), state),
+ ((ceph_get_primary_affinity(map, i)*100) >> 16),
+ ceph_get_crush_locality(map, i,
+ &client->options->crush_locs));
+ }
+ for (n = rb_first(&map->pg_temp); n; n = rb_next(n)) {
+ struct ceph_pg_mapping *pg =
+ rb_entry(n, struct ceph_pg_mapping, node);
+
+ seq_printf(s, "pg_temp %llu.%x [", pg->pgid.pool,
+ pg->pgid.seed);
+ for (i = 0; i < pg->pg_temp.len; i++)
+ seq_printf(s, "%s%d", (i == 0 ? "" : ","),
+ pg->pg_temp.osds[i]);
+ seq_printf(s, "]\n");
+ }
+ for (n = rb_first(&map->primary_temp); n; n = rb_next(n)) {
+ struct ceph_pg_mapping *pg =
+ rb_entry(n, struct ceph_pg_mapping, node);
+
+ seq_printf(s, "primary_temp %llu.%x %d\n", pg->pgid.pool,
+ pg->pgid.seed, pg->primary_temp.osd);
+ }
+ for (n = rb_first(&map->pg_upmap); n; n = rb_next(n)) {
+ struct ceph_pg_mapping *pg =
+ rb_entry(n, struct ceph_pg_mapping, node);
+
+ seq_printf(s, "pg_upmap %llu.%x [", pg->pgid.pool,
+ pg->pgid.seed);
+ for (i = 0; i < pg->pg_upmap.len; i++)
+ seq_printf(s, "%s%d", (i == 0 ? "" : ","),
+ pg->pg_upmap.osds[i]);
+ seq_printf(s, "]\n");
+ }
+ for (n = rb_first(&map->pg_upmap_items); n; n = rb_next(n)) {
+ struct ceph_pg_mapping *pg =
+ rb_entry(n, struct ceph_pg_mapping, node);
+
+ seq_printf(s, "pg_upmap_items %llu.%x [", pg->pgid.pool,
+ pg->pgid.seed);
+ for (i = 0; i < pg->pg_upmap_items.len; i++)
+ seq_printf(s, "%s%d->%d", (i == 0 ? "" : ","),
+ pg->pg_upmap_items.from_to[i][0],
+ pg->pg_upmap_items.from_to[i][1]);
+ seq_printf(s, "]\n");
+ }
+
+out_unlock:
+ up_read(&osdc->lock);
+ return 0;
+}
+
+static int monc_show(struct seq_file *s, void *p)
+{
+ struct ceph_client *client = s->private;
+ struct ceph_mon_generic_request *req;
+ struct ceph_mon_client *monc = &client->monc;
+ struct rb_node *rp;
+ int i;
+
+ mutex_lock(&monc->mutex);
+
+ for (i = 0; i < ARRAY_SIZE(monc->subs); i++) {
+ seq_printf(s, "have %s %u", ceph_sub_str[i],
+ monc->subs[i].have);
+ if (monc->subs[i].want)
+ seq_printf(s, " want %llu%s",
+ le64_to_cpu(monc->subs[i].item.start),
+ (monc->subs[i].item.flags &
+ CEPH_SUBSCRIBE_ONETIME ? "" : "+"));
+ seq_putc(s, '\n');
+ }
+ seq_printf(s, "fs_cluster_id %d\n", monc->fs_cluster_id);
+
+ for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) {
+ __u16 op;
+ req = rb_entry(rp, struct ceph_mon_generic_request, node);
+ op = le16_to_cpu(req->request->hdr.type);
+ if (op == CEPH_MSG_STATFS)
+ seq_printf(s, "%llu statfs\n", req->tid);
+ else if (op == CEPH_MSG_MON_GET_VERSION)
+ seq_printf(s, "%llu mon_get_version", req->tid);
+ else
+ seq_printf(s, "%llu unknown\n", req->tid);
+ }
+
+ mutex_unlock(&monc->mutex);
+ return 0;
+}
+
+static void dump_spgid(struct seq_file *s, const struct ceph_spg *spgid)
+{
+ seq_printf(s, "%llu.%x", spgid->pgid.pool, spgid->pgid.seed);
+ if (spgid->shard != CEPH_SPG_NOSHARD)
+ seq_printf(s, "s%d", spgid->shard);
+}
+
+static void dump_target(struct seq_file *s, struct ceph_osd_request_target *t)
+{
+ int i;
+
+ seq_printf(s, "osd%d\t%llu.%x\t", t->osd, t->pgid.pool, t->pgid.seed);
+ dump_spgid(s, &t->spgid);
+ seq_puts(s, "\t[");
+ for (i = 0; i < t->up.size; i++)
+ seq_printf(s, "%s%d", (!i ? "" : ","), t->up.osds[i]);
+ seq_printf(s, "]/%d\t[", t->up.primary);
+ for (i = 0; i < t->acting.size; i++)
+ seq_printf(s, "%s%d", (!i ? "" : ","), t->acting.osds[i]);
+ seq_printf(s, "]/%d\te%u\t", t->acting.primary, t->epoch);
+ if (t->target_oloc.pool_ns) {
+ seq_printf(s, "%*pE/%*pE\t0x%x",
+ (int)t->target_oloc.pool_ns->len,
+ t->target_oloc.pool_ns->str,
+ t->target_oid.name_len, t->target_oid.name, t->flags);
+ } else {
+ seq_printf(s, "%*pE\t0x%x", t->target_oid.name_len,
+ t->target_oid.name, t->flags);
+ }
+ if (t->paused)
+ seq_puts(s, "\tP");
+}
+
+static void dump_request(struct seq_file *s, struct ceph_osd_request *req)
+{
+ int i;
+
+ seq_printf(s, "%llu\t", req->r_tid);
+ dump_target(s, &req->r_t);
+
+ seq_printf(s, "\t%d", req->r_attempts);
+
+ for (i = 0; i < req->r_num_ops; i++) {
+ struct ceph_osd_req_op *op = &req->r_ops[i];
+
+ seq_printf(s, "%s%s", (i == 0 ? "\t" : ","),
+ ceph_osd_op_name(op->op));
+ if (op->op == CEPH_OSD_OP_WATCH)
+ seq_printf(s, "-%s",
+ ceph_osd_watch_op_name(op->watch.op));
+ else if (op->op == CEPH_OSD_OP_CALL)
+ seq_printf(s, "-%s/%s", op->cls.class_name,
+ op->cls.method_name);
+ }
+
+ seq_putc(s, '\n');
+}
+
+static void dump_requests(struct seq_file *s, struct ceph_osd *osd)
+{
+ struct rb_node *n;
+
+ mutex_lock(&osd->lock);
+ for (n = rb_first(&osd->o_requests); n; n = rb_next(n)) {
+ struct ceph_osd_request *req =
+ rb_entry(n, struct ceph_osd_request, r_node);
+
+ dump_request(s, req);
+ }
+
+ mutex_unlock(&osd->lock);
+}
+
+static void dump_linger_request(struct seq_file *s,
+ struct ceph_osd_linger_request *lreq)
+{
+ seq_printf(s, "%llu\t", lreq->linger_id);
+ dump_target(s, &lreq->t);
+
+ seq_printf(s, "\t%u\t%s%s/%d\n", lreq->register_gen,
+ lreq->is_watch ? "W" : "N", lreq->committed ? "C" : "",
+ lreq->last_error);
+}
+
+static void dump_linger_requests(struct seq_file *s, struct ceph_osd *osd)
+{
+ struct rb_node *n;
+
+ mutex_lock(&osd->lock);
+ for (n = rb_first(&osd->o_linger_requests); n; n = rb_next(n)) {
+ struct ceph_osd_linger_request *lreq =
+ rb_entry(n, struct ceph_osd_linger_request, node);
+
+ dump_linger_request(s, lreq);
+ }
+
+ mutex_unlock(&osd->lock);
+}
+
+static void dump_snapid(struct seq_file *s, u64 snapid)
+{
+ if (snapid == CEPH_NOSNAP)
+ seq_puts(s, "head");
+ else if (snapid == CEPH_SNAPDIR)
+ seq_puts(s, "snapdir");
+ else
+ seq_printf(s, "%llx", snapid);
+}
+
+static void dump_name_escaped(struct seq_file *s, unsigned char *name,
+ size_t len)
+{
+ size_t i;
+
+ for (i = 0; i < len; i++) {
+ if (name[i] == '%' || name[i] == ':' || name[i] == '/' ||
+ name[i] < 32 || name[i] >= 127) {
+ seq_printf(s, "%%%02x", name[i]);
+ } else {
+ seq_putc(s, name[i]);
+ }
+ }
+}
+
+static void dump_hoid(struct seq_file *s, const struct ceph_hobject_id *hoid)
+{
+ if (hoid->snapid == 0 && hoid->hash == 0 && !hoid->is_max &&
+ hoid->pool == S64_MIN) {
+ seq_puts(s, "MIN");
+ return;
+ }
+ if (hoid->is_max) {
+ seq_puts(s, "MAX");
+ return;
+ }
+ seq_printf(s, "%lld:%08x:", hoid->pool, hoid->hash_reverse_bits);
+ dump_name_escaped(s, hoid->nspace, hoid->nspace_len);
+ seq_putc(s, ':');
+ dump_name_escaped(s, hoid->key, hoid->key_len);
+ seq_putc(s, ':');
+ dump_name_escaped(s, hoid->oid, hoid->oid_len);
+ seq_putc(s, ':');
+ dump_snapid(s, hoid->snapid);
+}
+
+static void dump_backoffs(struct seq_file *s, struct ceph_osd *osd)
+{
+ struct rb_node *n;
+
+ mutex_lock(&osd->lock);
+ for (n = rb_first(&osd->o_backoffs_by_id); n; n = rb_next(n)) {
+ struct ceph_osd_backoff *backoff =
+ rb_entry(n, struct ceph_osd_backoff, id_node);
+
+ seq_printf(s, "osd%d\t", osd->o_osd);
+ dump_spgid(s, &backoff->spgid);
+ seq_printf(s, "\t%llu\t", backoff->id);
+ dump_hoid(s, backoff->begin);
+ seq_putc(s, '\t');
+ dump_hoid(s, backoff->end);
+ seq_putc(s, '\n');
+ }
+
+ mutex_unlock(&osd->lock);
+}
+
+static int osdc_show(struct seq_file *s, void *pp)
+{
+ struct ceph_client *client = s->private;
+ struct ceph_osd_client *osdc = &client->osdc;
+ struct rb_node *n;
+
+ down_read(&osdc->lock);
+ seq_printf(s, "REQUESTS %d homeless %d\n",
+ atomic_read(&osdc->num_requests),
+ atomic_read(&osdc->num_homeless));
+ for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
+ struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
+
+ dump_requests(s, osd);
+ }
+ dump_requests(s, &osdc->homeless_osd);
+
+ seq_puts(s, "LINGER REQUESTS\n");
+ for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
+ struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
+
+ dump_linger_requests(s, osd);
+ }
+ dump_linger_requests(s, &osdc->homeless_osd);
+
+ seq_puts(s, "BACKOFFS\n");
+ for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
+ struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
+
+ dump_backoffs(s, osd);
+ }
+
+ up_read(&osdc->lock);
+ return 0;
+}
+
+static int client_options_show(struct seq_file *s, void *p)
+{
+ struct ceph_client *client = s->private;
+ int ret;
+
+ ret = ceph_print_client_options(s, client, true);
+ if (ret)
+ return ret;
+
+ seq_putc(s, '\n');
+ return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(monmap);
+DEFINE_SHOW_ATTRIBUTE(osdmap);
+DEFINE_SHOW_ATTRIBUTE(monc);
+DEFINE_SHOW_ATTRIBUTE(osdc);
+DEFINE_SHOW_ATTRIBUTE(client_options);
+
+void __init ceph_debugfs_init(void)
+{
+ ceph_debugfs_dir = debugfs_create_dir("ceph", NULL);
+}
+
+void ceph_debugfs_cleanup(void)
+{
+ debugfs_remove(ceph_debugfs_dir);
+}
+
+void ceph_debugfs_client_init(struct ceph_client *client)
+{
+ char name[80];
+
+ snprintf(name, sizeof(name), "%pU.client%lld", &client->fsid,
+ client->monc.auth->global_id);
+
+ dout("ceph_debugfs_client_init %p %s\n", client, name);
+
+ client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir);
+
+ client->monc.debugfs_file = debugfs_create_file("monc",
+ 0400,
+ client->debugfs_dir,
+ client,
+ &monc_fops);
+
+ client->osdc.debugfs_file = debugfs_create_file("osdc",
+ 0400,
+ client->debugfs_dir,
+ client,
+ &osdc_fops);
+
+ client->debugfs_monmap = debugfs_create_file("monmap",
+ 0400,
+ client->debugfs_dir,
+ client,
+ &monmap_fops);
+
+ client->debugfs_osdmap = debugfs_create_file("osdmap",
+ 0400,
+ client->debugfs_dir,
+ client,
+ &osdmap_fops);
+
+ client->debugfs_options = debugfs_create_file("client_options",
+ 0400,
+ client->debugfs_dir,
+ client,
+ &client_options_fops);
+}
+
+void ceph_debugfs_client_cleanup(struct ceph_client *client)
+{
+ dout("ceph_debugfs_client_cleanup %p\n", client);
+ debugfs_remove(client->debugfs_options);
+ debugfs_remove(client->debugfs_osdmap);
+ debugfs_remove(client->debugfs_monmap);
+ debugfs_remove(client->osdc.debugfs_file);
+ debugfs_remove(client->monc.debugfs_file);
+ debugfs_remove(client->debugfs_dir);
+}
+
+#else /* CONFIG_DEBUG_FS */
+
+void __init ceph_debugfs_init(void)
+{
+}
+
+void ceph_debugfs_cleanup(void)
+{
+}
+
+void ceph_debugfs_client_init(struct ceph_client *client)
+{
+}
+
+void ceph_debugfs_client_cleanup(struct ceph_client *client)
+{
+}
+
+#endif /* CONFIG_DEBUG_FS */
diff --git a/net/ceph/decode.c b/net/ceph/decode.c
new file mode 100644
index 000000000000..bc109a1a4616
--- /dev/null
+++ b/net/ceph/decode.c
@@ -0,0 +1,193 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/inet.h>
+
+#include <linux/ceph/decode.h>
+#include <linux/ceph/messenger.h> /* for ceph_pr_addr() */
+
+static int
+ceph_decode_entity_addr_versioned(void **p, void *end,
+ struct ceph_entity_addr *addr)
+{
+ int ret;
+ u8 struct_v;
+ u32 struct_len, addr_len;
+ void *struct_end;
+
+ ret = ceph_start_decoding(p, end, 1, "entity_addr_t", &struct_v,
+ &struct_len);
+ if (ret)
+ goto bad;
+
+ ret = -EINVAL;
+ struct_end = *p + struct_len;
+
+ ceph_decode_copy_safe(p, end, &addr->type, sizeof(addr->type), bad);
+
+ ceph_decode_copy_safe(p, end, &addr->nonce, sizeof(addr->nonce), bad);
+
+ ceph_decode_32_safe(p, end, addr_len, bad);
+ if (addr_len > sizeof(addr->in_addr))
+ goto bad;
+
+ memset(&addr->in_addr, 0, sizeof(addr->in_addr));
+ if (addr_len) {
+ ceph_decode_copy_safe(p, end, &addr->in_addr, addr_len, bad);
+
+ addr->in_addr.ss_family =
+ le16_to_cpu((__force __le16)addr->in_addr.ss_family);
+ }
+
+ /* Advance past anything the client doesn't yet understand */
+ *p = struct_end;
+ ret = 0;
+bad:
+ return ret;
+}
+
+static int
+ceph_decode_entity_addr_legacy(void **p, void *end,
+ struct ceph_entity_addr *addr)
+{
+ int ret = -EINVAL;
+
+ /* Skip rest of type field */
+ ceph_decode_skip_n(p, end, 3, bad);
+
+ /*
+ * Clients that don't support ADDR2 always send TYPE_NONE, change it
+ * to TYPE_LEGACY for forward compatibility.
+ */
+ addr->type = CEPH_ENTITY_ADDR_TYPE_LEGACY;
+ ceph_decode_copy_safe(p, end, &addr->nonce, sizeof(addr->nonce), bad);
+ memset(&addr->in_addr, 0, sizeof(addr->in_addr));
+ ceph_decode_copy_safe(p, end, &addr->in_addr,
+ sizeof(addr->in_addr), bad);
+ addr->in_addr.ss_family =
+ be16_to_cpu((__force __be16)addr->in_addr.ss_family);
+ ret = 0;
+bad:
+ return ret;
+}
+
+int
+ceph_decode_entity_addr(void **p, void *end, struct ceph_entity_addr *addr)
+{
+ u8 marker;
+
+ ceph_decode_8_safe(p, end, marker, bad);
+ if (marker == 1)
+ return ceph_decode_entity_addr_versioned(p, end, addr);
+ else if (marker == 0)
+ return ceph_decode_entity_addr_legacy(p, end, addr);
+bad:
+ return -EINVAL;
+}
+EXPORT_SYMBOL(ceph_decode_entity_addr);
+
+/*
+ * Return addr of desired type (MSGR2 or LEGACY) or error.
+ * Make sure there is only one match.
+ *
+ * Assume encoding with MSG_ADDR2.
+ */
+int ceph_decode_entity_addrvec(void **p, void *end, bool msgr2,
+ struct ceph_entity_addr *addr)
+{
+ __le32 my_type = msgr2 ? CEPH_ENTITY_ADDR_TYPE_MSGR2 :
+ CEPH_ENTITY_ADDR_TYPE_LEGACY;
+ struct ceph_entity_addr tmp_addr;
+ int addr_cnt;
+ bool found;
+ u8 marker;
+ int ret;
+ int i;
+
+ ceph_decode_8_safe(p, end, marker, e_inval);
+ if (marker != 2) {
+ pr_err("bad addrvec marker %d\n", marker);
+ return -EINVAL;
+ }
+
+ ceph_decode_32_safe(p, end, addr_cnt, e_inval);
+ dout("%s addr_cnt %d\n", __func__, addr_cnt);
+
+ found = false;
+ for (i = 0; i < addr_cnt; i++) {
+ ret = ceph_decode_entity_addr(p, end, &tmp_addr);
+ if (ret)
+ return ret;
+
+ dout("%s i %d addr %s\n", __func__, i, ceph_pr_addr(&tmp_addr));
+ if (tmp_addr.type == my_type) {
+ if (found) {
+ pr_err("another match of type %d in addrvec\n",
+ le32_to_cpu(my_type));
+ return -EINVAL;
+ }
+
+ memcpy(addr, &tmp_addr, sizeof(*addr));
+ found = true;
+ }
+ }
+
+ if (found)
+ return 0;
+
+ if (!addr_cnt)
+ return 0; /* normal -- e.g. unused OSD id/slot */
+
+ if (addr_cnt == 1 && !memchr_inv(&tmp_addr, 0, sizeof(tmp_addr)))
+ return 0; /* weird but effectively the same as !addr_cnt */
+
+ pr_err("no match of type %d in addrvec\n", le32_to_cpu(my_type));
+ return -ENOENT;
+
+e_inval:
+ return -EINVAL;
+}
+EXPORT_SYMBOL(ceph_decode_entity_addrvec);
+
+static int get_sockaddr_encoding_len(sa_family_t family)
+{
+ union {
+ struct sockaddr sa;
+ struct sockaddr_in sin;
+ struct sockaddr_in6 sin6;
+ } u;
+
+ switch (family) {
+ case AF_INET:
+ return sizeof(u.sin);
+ case AF_INET6:
+ return sizeof(u.sin6);
+ default:
+ return sizeof(u);
+ }
+}
+
+int ceph_entity_addr_encoding_len(const struct ceph_entity_addr *addr)
+{
+ sa_family_t family = get_unaligned(&addr->in_addr.ss_family);
+ int addr_len = get_sockaddr_encoding_len(family);
+
+ return 1 + CEPH_ENCODING_START_BLK_LEN + 4 + 4 + 4 + addr_len;
+}
+
+void ceph_encode_entity_addr(void **p, const struct ceph_entity_addr *addr)
+{
+ sa_family_t family = get_unaligned(&addr->in_addr.ss_family);
+ int addr_len = get_sockaddr_encoding_len(family);
+
+ ceph_encode_8(p, 1); /* marker */
+ ceph_start_encoding(p, 1, 1, sizeof(addr->type) +
+ sizeof(addr->nonce) +
+ sizeof(u32) + addr_len);
+ ceph_encode_copy(p, &addr->type, sizeof(addr->type));
+ ceph_encode_copy(p, &addr->nonce, sizeof(addr->nonce));
+
+ ceph_encode_32(p, addr_len);
+ ceph_encode_16(p, family);
+ ceph_encode_copy(p, addr->in_addr.__data, addr_len - sizeof(family));
+}
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
new file mode 100644
index 000000000000..70b25f4ecba6
--- /dev/null
+++ b/net/ceph/messenger.c
@@ -0,0 +1,2222 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/crc32c.h>
+#include <linux/ctype.h>
+#include <linux/highmem.h>
+#include <linux/inet.h>
+#include <linux/kthread.h>
+#include <linux/net.h>
+#include <linux/nsproxy.h>
+#include <linux/sched/mm.h>
+#include <linux/slab.h>
+#include <linux/socket.h>
+#include <linux/string.h>
+#ifdef CONFIG_BLOCK
+#include <linux/bio.h>
+#endif /* CONFIG_BLOCK */
+#include <linux/dns_resolver.h>
+#include <net/tcp.h>
+#include <trace/events/sock.h>
+
+#include <linux/ceph/ceph_features.h>
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/messenger.h>
+#include <linux/ceph/decode.h>
+#include <linux/ceph/pagelist.h>
+#include <linux/export.h>
+
+/*
+ * Ceph uses the messenger to exchange ceph_msg messages with other
+ * hosts in the system. The messenger provides ordered and reliable
+ * delivery. We tolerate TCP disconnects by reconnecting (with
+ * exponential backoff) in the case of a fault (disconnection, bad
+ * crc, protocol error). Acks allow sent messages to be discarded by
+ * the sender.
+ */
+
+/*
+ * We track the state of the socket on a given connection using
+ * values defined below. The transition to a new socket state is
+ * handled by a function which verifies we aren't coming from an
+ * unexpected state.
+ *
+ * --------
+ * | NEW* | transient initial state
+ * --------
+ * | con_sock_state_init()
+ * v
+ * ----------
+ * | CLOSED | initialized, but no socket (and no
+ * ---------- TCP connection)
+ * ^ \
+ * | \ con_sock_state_connecting()
+ * | ----------------------
+ * | \
+ * + con_sock_state_closed() \
+ * |+--------------------------- \
+ * | \ \ \
+ * | ----------- \ \
+ * | | CLOSING | socket event; \ \
+ * | ----------- await close \ \
+ * | ^ \ |
+ * | | \ |
+ * | + con_sock_state_closing() \ |
+ * | / \ | |
+ * | / --------------- | |
+ * | / \ v v
+ * | / --------------
+ * | / -----------------| CONNECTING | socket created, TCP
+ * | | / -------------- connect initiated
+ * | | | con_sock_state_connected()
+ * | | v
+ * -------------
+ * | CONNECTED | TCP connection established
+ * -------------
+ *
+ * State values for ceph_connection->sock_state; NEW is assumed to be 0.
+ */
+
+#define CON_SOCK_STATE_NEW 0 /* -> CLOSED */
+#define CON_SOCK_STATE_CLOSED 1 /* -> CONNECTING */
+#define CON_SOCK_STATE_CONNECTING 2 /* -> CONNECTED or -> CLOSING */
+#define CON_SOCK_STATE_CONNECTED 3 /* -> CLOSING or -> CLOSED */
+#define CON_SOCK_STATE_CLOSING 4 /* -> CLOSED */
+
+static bool con_flag_valid(unsigned long con_flag)
+{
+ switch (con_flag) {
+ case CEPH_CON_F_LOSSYTX:
+ case CEPH_CON_F_KEEPALIVE_PENDING:
+ case CEPH_CON_F_WRITE_PENDING:
+ case CEPH_CON_F_SOCK_CLOSED:
+ case CEPH_CON_F_BACKOFF:
+ return true;
+ default:
+ return false;
+ }
+}
+
+void ceph_con_flag_clear(struct ceph_connection *con, unsigned long con_flag)
+{
+ BUG_ON(!con_flag_valid(con_flag));
+
+ clear_bit(con_flag, &con->flags);
+}
+
+void ceph_con_flag_set(struct ceph_connection *con, unsigned long con_flag)
+{
+ BUG_ON(!con_flag_valid(con_flag));
+
+ set_bit(con_flag, &con->flags);
+}
+
+bool ceph_con_flag_test(struct ceph_connection *con, unsigned long con_flag)
+{
+ BUG_ON(!con_flag_valid(con_flag));
+
+ return test_bit(con_flag, &con->flags);
+}
+
+bool ceph_con_flag_test_and_clear(struct ceph_connection *con,
+ unsigned long con_flag)
+{
+ BUG_ON(!con_flag_valid(con_flag));
+
+ return test_and_clear_bit(con_flag, &con->flags);
+}
+
+bool ceph_con_flag_test_and_set(struct ceph_connection *con,
+ unsigned long con_flag)
+{
+ BUG_ON(!con_flag_valid(con_flag));
+
+ return test_and_set_bit(con_flag, &con->flags);
+}
+
+/* Slab caches for frequently-allocated structures */
+
+static struct kmem_cache *ceph_msg_cache;
+
+#ifdef CONFIG_LOCKDEP
+static struct lock_class_key socket_class;
+#endif
+
+static void queue_con(struct ceph_connection *con);
+static void cancel_con(struct ceph_connection *con);
+static void ceph_con_workfn(struct work_struct *);
+static void con_fault(struct ceph_connection *con);
+
+/*
+ * Nicely render a sockaddr as a string. An array of formatted
+ * strings is used, to approximate reentrancy.
+ */
+#define ADDR_STR_COUNT_LOG 5 /* log2(# address strings in array) */
+#define ADDR_STR_COUNT (1 << ADDR_STR_COUNT_LOG)
+#define ADDR_STR_COUNT_MASK (ADDR_STR_COUNT - 1)
+#define MAX_ADDR_STR_LEN 64 /* 54 is enough */
+
+static char addr_str[ADDR_STR_COUNT][MAX_ADDR_STR_LEN];
+static atomic_t addr_str_seq = ATOMIC_INIT(0);
+
+struct page *ceph_zero_page; /* used in certain error cases */
+
+const char *ceph_pr_addr(const struct ceph_entity_addr *addr)
+{
+ int i;
+ char *s;
+ struct sockaddr_storage ss = addr->in_addr; /* align */
+ struct sockaddr_in *in4 = (struct sockaddr_in *)&ss;
+ struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)&ss;
+
+ i = atomic_inc_return(&addr_str_seq) & ADDR_STR_COUNT_MASK;
+ s = addr_str[i];
+
+ switch (ss.ss_family) {
+ case AF_INET:
+ snprintf(s, MAX_ADDR_STR_LEN, "(%d)%pI4:%hu",
+ le32_to_cpu(addr->type), &in4->sin_addr,
+ ntohs(in4->sin_port));
+ break;
+
+ case AF_INET6:
+ snprintf(s, MAX_ADDR_STR_LEN, "(%d)[%pI6c]:%hu",
+ le32_to_cpu(addr->type), &in6->sin6_addr,
+ ntohs(in6->sin6_port));
+ break;
+
+ default:
+ snprintf(s, MAX_ADDR_STR_LEN, "(unknown sockaddr family %hu)",
+ ss.ss_family);
+ }
+
+ return s;
+}
+EXPORT_SYMBOL(ceph_pr_addr);
+
+void ceph_encode_my_addr(struct ceph_messenger *msgr)
+{
+ if (!ceph_msgr2(from_msgr(msgr))) {
+ memcpy(&msgr->my_enc_addr, &msgr->inst.addr,
+ sizeof(msgr->my_enc_addr));
+ ceph_encode_banner_addr(&msgr->my_enc_addr);
+ }
+}
+
+/*
+ * work queue for all reading and writing to/from the socket.
+ */
+static struct workqueue_struct *ceph_msgr_wq;
+
+static int ceph_msgr_slab_init(void)
+{
+ BUG_ON(ceph_msg_cache);
+ ceph_msg_cache = KMEM_CACHE(ceph_msg, 0);
+ if (!ceph_msg_cache)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static void ceph_msgr_slab_exit(void)
+{
+ BUG_ON(!ceph_msg_cache);
+ kmem_cache_destroy(ceph_msg_cache);
+ ceph_msg_cache = NULL;
+}
+
+static void _ceph_msgr_exit(void)
+{
+ if (ceph_msgr_wq) {
+ destroy_workqueue(ceph_msgr_wq);
+ ceph_msgr_wq = NULL;
+ }
+
+ BUG_ON(!ceph_zero_page);
+ put_page(ceph_zero_page);
+ ceph_zero_page = NULL;
+
+ ceph_msgr_slab_exit();
+}
+
+int __init ceph_msgr_init(void)
+{
+ if (ceph_msgr_slab_init())
+ return -ENOMEM;
+
+ BUG_ON(ceph_zero_page);
+ ceph_zero_page = ZERO_PAGE(0);
+ get_page(ceph_zero_page);
+
+ /*
+ * The number of active work items is limited by the number of
+ * connections, so leave @max_active at default.
+ */
+ ceph_msgr_wq = alloc_workqueue("ceph-msgr",
+ WQ_MEM_RECLAIM | WQ_PERCPU, 0);
+ if (ceph_msgr_wq)
+ return 0;
+
+ pr_err("msgr_init failed to create workqueue\n");
+ _ceph_msgr_exit();
+
+ return -ENOMEM;
+}
+
+void ceph_msgr_exit(void)
+{
+ BUG_ON(ceph_msgr_wq == NULL);
+
+ _ceph_msgr_exit();
+}
+
+void ceph_msgr_flush(void)
+{
+ flush_workqueue(ceph_msgr_wq);
+}
+EXPORT_SYMBOL(ceph_msgr_flush);
+
+/* Connection socket state transition functions */
+
+static void con_sock_state_init(struct ceph_connection *con)
+{
+ int old_state;
+
+ old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CLOSED);
+ if (WARN_ON(old_state != CON_SOCK_STATE_NEW))
+ printk("%s: unexpected old state %d\n", __func__, old_state);
+ dout("%s con %p sock %d -> %d\n", __func__, con, old_state,
+ CON_SOCK_STATE_CLOSED);
+}
+
+static void con_sock_state_connecting(struct ceph_connection *con)
+{
+ int old_state;
+
+ old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CONNECTING);
+ if (WARN_ON(old_state != CON_SOCK_STATE_CLOSED))
+ printk("%s: unexpected old state %d\n", __func__, old_state);
+ dout("%s con %p sock %d -> %d\n", __func__, con, old_state,
+ CON_SOCK_STATE_CONNECTING);
+}
+
+static void con_sock_state_connected(struct ceph_connection *con)
+{
+ int old_state;
+
+ old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CONNECTED);
+ if (WARN_ON(old_state != CON_SOCK_STATE_CONNECTING))
+ printk("%s: unexpected old state %d\n", __func__, old_state);
+ dout("%s con %p sock %d -> %d\n", __func__, con, old_state,
+ CON_SOCK_STATE_CONNECTED);
+}
+
+static void con_sock_state_closing(struct ceph_connection *con)
+{
+ int old_state;
+
+ old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CLOSING);
+ if (WARN_ON(old_state != CON_SOCK_STATE_CONNECTING &&
+ old_state != CON_SOCK_STATE_CONNECTED &&
+ old_state != CON_SOCK_STATE_CLOSING))
+ printk("%s: unexpected old state %d\n", __func__, old_state);
+ dout("%s con %p sock %d -> %d\n", __func__, con, old_state,
+ CON_SOCK_STATE_CLOSING);
+}
+
+static void con_sock_state_closed(struct ceph_connection *con)
+{
+ int old_state;
+
+ old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CLOSED);
+ if (WARN_ON(old_state != CON_SOCK_STATE_CONNECTED &&
+ old_state != CON_SOCK_STATE_CLOSING &&
+ old_state != CON_SOCK_STATE_CONNECTING &&
+ old_state != CON_SOCK_STATE_CLOSED))
+ printk("%s: unexpected old state %d\n", __func__, old_state);
+ dout("%s con %p sock %d -> %d\n", __func__, con, old_state,
+ CON_SOCK_STATE_CLOSED);
+}
+
+/*
+ * socket callback functions
+ */
+
+/* data available on socket, or listen socket received a connect */
+static void ceph_sock_data_ready(struct sock *sk)
+{
+ struct ceph_connection *con = sk->sk_user_data;
+
+ trace_sk_data_ready(sk);
+
+ if (atomic_read(&con->msgr->stopping)) {
+ return;
+ }
+
+ if (sk->sk_state != TCP_CLOSE_WAIT) {
+ dout("%s %p state = %d, queueing work\n", __func__,
+ con, con->state);
+ queue_con(con);
+ }
+}
+
+/* socket has buffer space for writing */
+static void ceph_sock_write_space(struct sock *sk)
+{
+ struct ceph_connection *con = sk->sk_user_data;
+
+ /* only queue to workqueue if there is data we want to write,
+ * and there is sufficient space in the socket buffer to accept
+ * more data. clear SOCK_NOSPACE so that ceph_sock_write_space()
+ * doesn't get called again until try_write() fills the socket
+ * buffer. See net/ipv4/tcp_input.c:tcp_check_space()
+ * and net/core/stream.c:sk_stream_write_space().
+ */
+ if (ceph_con_flag_test(con, CEPH_CON_F_WRITE_PENDING)) {
+ if (sk_stream_is_writeable(sk)) {
+ dout("%s %p queueing write work\n", __func__, con);
+ clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+ queue_con(con);
+ }
+ } else {
+ dout("%s %p nothing to write\n", __func__, con);
+ }
+}
+
+/* socket's state has changed */
+static void ceph_sock_state_change(struct sock *sk)
+{
+ struct ceph_connection *con = sk->sk_user_data;
+
+ dout("%s %p state = %d sk_state = %u\n", __func__,
+ con, con->state, sk->sk_state);
+
+ switch (sk->sk_state) {
+ case TCP_CLOSE:
+ dout("%s TCP_CLOSE\n", __func__);
+ fallthrough;
+ case TCP_CLOSE_WAIT:
+ dout("%s TCP_CLOSE_WAIT\n", __func__);
+ con_sock_state_closing(con);
+ ceph_con_flag_set(con, CEPH_CON_F_SOCK_CLOSED);
+ queue_con(con);
+ break;
+ case TCP_ESTABLISHED:
+ dout("%s TCP_ESTABLISHED\n", __func__);
+ con_sock_state_connected(con);
+ queue_con(con);
+ break;
+ default: /* Everything else is uninteresting */
+ break;
+ }
+}
+
+/*
+ * set up socket callbacks
+ */
+static void set_sock_callbacks(struct socket *sock,
+ struct ceph_connection *con)
+{
+ struct sock *sk = sock->sk;
+ sk->sk_user_data = con;
+ sk->sk_data_ready = ceph_sock_data_ready;
+ sk->sk_write_space = ceph_sock_write_space;
+ sk->sk_state_change = ceph_sock_state_change;
+}
+
+
+/*
+ * socket helpers
+ */
+
+/*
+ * initiate connection to a remote socket.
+ */
+int ceph_tcp_connect(struct ceph_connection *con)
+{
+ struct sockaddr_storage ss = con->peer_addr.in_addr; /* align */
+ struct socket *sock;
+ unsigned int noio_flag;
+ int ret;
+
+ dout("%s con %p peer_addr %s\n", __func__, con,
+ ceph_pr_addr(&con->peer_addr));
+ BUG_ON(con->sock);
+
+ /* sock_create_kern() allocates with GFP_KERNEL */
+ noio_flag = memalloc_noio_save();
+ ret = sock_create_kern(read_pnet(&con->msgr->net), ss.ss_family,
+ SOCK_STREAM, IPPROTO_TCP, &sock);
+ memalloc_noio_restore(noio_flag);
+ if (ret)
+ return ret;
+ sock->sk->sk_allocation = GFP_NOFS;
+ sock->sk->sk_use_task_frag = false;
+
+#ifdef CONFIG_LOCKDEP
+ lockdep_set_class(&sock->sk->sk_lock, &socket_class);
+#endif
+
+ set_sock_callbacks(sock, con);
+
+ con_sock_state_connecting(con);
+ ret = kernel_connect(sock, (struct sockaddr_unsized *)&ss, sizeof(ss),
+ O_NONBLOCK);
+ if (ret == -EINPROGRESS) {
+ dout("connect %s EINPROGRESS sk_state = %u\n",
+ ceph_pr_addr(&con->peer_addr),
+ sock->sk->sk_state);
+ } else if (ret < 0) {
+ pr_err("connect %s error %d\n",
+ ceph_pr_addr(&con->peer_addr), ret);
+ sock_release(sock);
+ return ret;
+ }
+
+ if (ceph_test_opt(from_msgr(con->msgr), TCP_NODELAY))
+ tcp_sock_set_nodelay(sock->sk);
+
+ con->sock = sock;
+ return 0;
+}
+
+/*
+ * Shutdown/close the socket for the given connection.
+ */
+int ceph_con_close_socket(struct ceph_connection *con)
+{
+ int rc = 0;
+
+ dout("%s con %p sock %p\n", __func__, con, con->sock);
+ if (con->sock) {
+ rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR);
+ sock_release(con->sock);
+ con->sock = NULL;
+ }
+
+ /*
+ * Forcibly clear the SOCK_CLOSED flag. It gets set
+ * independent of the connection mutex, and we could have
+ * received a socket close event before we had the chance to
+ * shut the socket down.
+ */
+ ceph_con_flag_clear(con, CEPH_CON_F_SOCK_CLOSED);
+
+ con_sock_state_closed(con);
+ return rc;
+}
+
+static void ceph_con_reset_protocol(struct ceph_connection *con)
+{
+ dout("%s con %p\n", __func__, con);
+
+ ceph_con_close_socket(con);
+ if (con->in_msg) {
+ WARN_ON(con->in_msg->con != con);
+ ceph_msg_put(con->in_msg);
+ con->in_msg = NULL;
+ }
+ if (con->out_msg) {
+ WARN_ON(con->out_msg->con != con);
+ ceph_msg_put(con->out_msg);
+ con->out_msg = NULL;
+ }
+ if (con->bounce_page) {
+ __free_page(con->bounce_page);
+ con->bounce_page = NULL;
+ }
+
+ if (ceph_msgr2(from_msgr(con->msgr)))
+ ceph_con_v2_reset_protocol(con);
+ else
+ ceph_con_v1_reset_protocol(con);
+}
+
+/*
+ * Reset a connection. Discard all incoming and outgoing messages
+ * and clear *_seq state.
+ */
+static void ceph_msg_remove(struct ceph_msg *msg)
+{
+ list_del_init(&msg->list_head);
+
+ ceph_msg_put(msg);
+}
+
+static void ceph_msg_remove_list(struct list_head *head)
+{
+ while (!list_empty(head)) {
+ struct ceph_msg *msg = list_first_entry(head, struct ceph_msg,
+ list_head);
+ ceph_msg_remove(msg);
+ }
+}
+
+void ceph_con_reset_session(struct ceph_connection *con)
+{
+ dout("%s con %p\n", __func__, con);
+
+ WARN_ON(con->in_msg);
+ WARN_ON(con->out_msg);
+ ceph_msg_remove_list(&con->out_queue);
+ ceph_msg_remove_list(&con->out_sent);
+ con->out_seq = 0;
+ con->in_seq = 0;
+ con->in_seq_acked = 0;
+
+ if (ceph_msgr2(from_msgr(con->msgr)))
+ ceph_con_v2_reset_session(con);
+ else
+ ceph_con_v1_reset_session(con);
+}
+
+/*
+ * mark a peer down. drop any open connections.
+ */
+void ceph_con_close(struct ceph_connection *con)
+{
+ mutex_lock(&con->mutex);
+ dout("con_close %p peer %s\n", con, ceph_pr_addr(&con->peer_addr));
+ con->state = CEPH_CON_S_CLOSED;
+
+ ceph_con_flag_clear(con, CEPH_CON_F_LOSSYTX); /* so we retry next
+ connect */
+ ceph_con_flag_clear(con, CEPH_CON_F_KEEPALIVE_PENDING);
+ ceph_con_flag_clear(con, CEPH_CON_F_WRITE_PENDING);
+ ceph_con_flag_clear(con, CEPH_CON_F_BACKOFF);
+
+ ceph_con_reset_protocol(con);
+ ceph_con_reset_session(con);
+ cancel_con(con);
+ mutex_unlock(&con->mutex);
+}
+EXPORT_SYMBOL(ceph_con_close);
+
+/*
+ * Reopen a closed connection, with a new peer address.
+ */
+void ceph_con_open(struct ceph_connection *con,
+ __u8 entity_type, __u64 entity_num,
+ struct ceph_entity_addr *addr)
+{
+ mutex_lock(&con->mutex);
+ dout("con_open %p %s\n", con, ceph_pr_addr(addr));
+
+ WARN_ON(con->state != CEPH_CON_S_CLOSED);
+ con->state = CEPH_CON_S_PREOPEN;
+
+ con->peer_name.type = (__u8) entity_type;
+ con->peer_name.num = cpu_to_le64(entity_num);
+
+ memcpy(&con->peer_addr, addr, sizeof(*addr));
+ con->delay = 0; /* reset backoff memory */
+ mutex_unlock(&con->mutex);
+ queue_con(con);
+}
+EXPORT_SYMBOL(ceph_con_open);
+
+/*
+ * return true if this connection ever successfully opened
+ */
+bool ceph_con_opened(struct ceph_connection *con)
+{
+ if (ceph_msgr2(from_msgr(con->msgr)))
+ return ceph_con_v2_opened(con);
+
+ return ceph_con_v1_opened(con);
+}
+
+/*
+ * initialize a new connection.
+ */
+void ceph_con_init(struct ceph_connection *con, void *private,
+ const struct ceph_connection_operations *ops,
+ struct ceph_messenger *msgr)
+{
+ dout("con_init %p\n", con);
+ memset(con, 0, sizeof(*con));
+ con->private = private;
+ con->ops = ops;
+ con->msgr = msgr;
+
+ con_sock_state_init(con);
+
+ mutex_init(&con->mutex);
+ INIT_LIST_HEAD(&con->out_queue);
+ INIT_LIST_HEAD(&con->out_sent);
+ INIT_DELAYED_WORK(&con->work, ceph_con_workfn);
+
+ con->state = CEPH_CON_S_CLOSED;
+}
+EXPORT_SYMBOL(ceph_con_init);
+
+/*
+ * We maintain a global counter to order connection attempts. Get
+ * a unique seq greater than @gt.
+ */
+u32 ceph_get_global_seq(struct ceph_messenger *msgr, u32 gt)
+{
+ u32 ret;
+
+ spin_lock(&msgr->global_seq_lock);
+ if (msgr->global_seq < gt)
+ msgr->global_seq = gt;
+ ret = ++msgr->global_seq;
+ spin_unlock(&msgr->global_seq_lock);
+ return ret;
+}
+
+/*
+ * Discard messages that have been acked by the server.
+ */
+void ceph_con_discard_sent(struct ceph_connection *con, u64 ack_seq)
+{
+ struct ceph_msg *msg;
+ u64 seq;
+
+ dout("%s con %p ack_seq %llu\n", __func__, con, ack_seq);
+ while (!list_empty(&con->out_sent)) {
+ msg = list_first_entry(&con->out_sent, struct ceph_msg,
+ list_head);
+ WARN_ON(msg->needs_out_seq);
+ seq = le64_to_cpu(msg->hdr.seq);
+ if (seq > ack_seq)
+ break;
+
+ dout("%s con %p discarding msg %p seq %llu\n", __func__, con,
+ msg, seq);
+ ceph_msg_remove(msg);
+ }
+}
+
+/*
+ * Discard messages that have been requeued in con_fault(), up to
+ * reconnect_seq. This avoids gratuitously resending messages that
+ * the server had received and handled prior to reconnect.
+ */
+void ceph_con_discard_requeued(struct ceph_connection *con, u64 reconnect_seq)
+{
+ struct ceph_msg *msg;
+ u64 seq;
+
+ dout("%s con %p reconnect_seq %llu\n", __func__, con, reconnect_seq);
+ while (!list_empty(&con->out_queue)) {
+ msg = list_first_entry(&con->out_queue, struct ceph_msg,
+ list_head);
+ if (msg->needs_out_seq)
+ break;
+ seq = le64_to_cpu(msg->hdr.seq);
+ if (seq > reconnect_seq)
+ break;
+
+ dout("%s con %p discarding msg %p seq %llu\n", __func__, con,
+ msg, seq);
+ ceph_msg_remove(msg);
+ }
+}
+
+#ifdef CONFIG_BLOCK
+
+/*
+ * For a bio data item, a piece is whatever remains of the next
+ * entry in the current bio iovec, or the first entry in the next
+ * bio in the list.
+ */
+static void ceph_msg_data_bio_cursor_init(struct ceph_msg_data_cursor *cursor,
+ size_t length)
+{
+ struct ceph_msg_data *data = cursor->data;
+ struct ceph_bio_iter *it = &cursor->bio_iter;
+
+ cursor->resid = min_t(size_t, length, data->bio_length);
+ *it = data->bio_pos;
+ if (cursor->resid < it->iter.bi_size)
+ it->iter.bi_size = cursor->resid;
+
+ BUG_ON(cursor->resid < bio_iter_len(it->bio, it->iter));
+}
+
+static struct page *ceph_msg_data_bio_next(struct ceph_msg_data_cursor *cursor,
+ size_t *page_offset,
+ size_t *length)
+{
+ struct bio_vec bv = bio_iter_iovec(cursor->bio_iter.bio,
+ cursor->bio_iter.iter);
+
+ *page_offset = bv.bv_offset;
+ *length = bv.bv_len;
+ return bv.bv_page;
+}
+
+static bool ceph_msg_data_bio_advance(struct ceph_msg_data_cursor *cursor,
+ size_t bytes)
+{
+ struct ceph_bio_iter *it = &cursor->bio_iter;
+ struct page *page = bio_iter_page(it->bio, it->iter);
+
+ BUG_ON(bytes > cursor->resid);
+ BUG_ON(bytes > bio_iter_len(it->bio, it->iter));
+ cursor->resid -= bytes;
+ bio_advance_iter(it->bio, &it->iter, bytes);
+
+ if (!cursor->resid)
+ return false; /* no more data */
+
+ if (!bytes || (it->iter.bi_size && it->iter.bi_bvec_done &&
+ page == bio_iter_page(it->bio, it->iter)))
+ return false; /* more bytes to process in this segment */
+
+ if (!it->iter.bi_size) {
+ it->bio = it->bio->bi_next;
+ it->iter = it->bio->bi_iter;
+ if (cursor->resid < it->iter.bi_size)
+ it->iter.bi_size = cursor->resid;
+ }
+
+ BUG_ON(cursor->resid < bio_iter_len(it->bio, it->iter));
+ return true;
+}
+#endif /* CONFIG_BLOCK */
+
+static void ceph_msg_data_bvecs_cursor_init(struct ceph_msg_data_cursor *cursor,
+ size_t length)
+{
+ struct ceph_msg_data *data = cursor->data;
+ struct bio_vec *bvecs = data->bvec_pos.bvecs;
+
+ cursor->resid = min_t(size_t, length, data->bvec_pos.iter.bi_size);
+ cursor->bvec_iter = data->bvec_pos.iter;
+ cursor->bvec_iter.bi_size = cursor->resid;
+
+ BUG_ON(cursor->resid < bvec_iter_len(bvecs, cursor->bvec_iter));
+}
+
+static struct page *ceph_msg_data_bvecs_next(struct ceph_msg_data_cursor *cursor,
+ size_t *page_offset,
+ size_t *length)
+{
+ struct bio_vec bv = bvec_iter_bvec(cursor->data->bvec_pos.bvecs,
+ cursor->bvec_iter);
+
+ *page_offset = bv.bv_offset;
+ *length = bv.bv_len;
+ return bv.bv_page;
+}
+
+static bool ceph_msg_data_bvecs_advance(struct ceph_msg_data_cursor *cursor,
+ size_t bytes)
+{
+ struct bio_vec *bvecs = cursor->data->bvec_pos.bvecs;
+ struct page *page = bvec_iter_page(bvecs, cursor->bvec_iter);
+
+ BUG_ON(bytes > cursor->resid);
+ BUG_ON(bytes > bvec_iter_len(bvecs, cursor->bvec_iter));
+ cursor->resid -= bytes;
+ bvec_iter_advance(bvecs, &cursor->bvec_iter, bytes);
+
+ if (!cursor->resid)
+ return false; /* no more data */
+
+ if (!bytes || (cursor->bvec_iter.bi_bvec_done &&
+ page == bvec_iter_page(bvecs, cursor->bvec_iter)))
+ return false; /* more bytes to process in this segment */
+
+ BUG_ON(cursor->resid < bvec_iter_len(bvecs, cursor->bvec_iter));
+ return true;
+}
+
+/*
+ * For a page array, a piece comes from the first page in the array
+ * that has not already been fully consumed.
+ */
+static void ceph_msg_data_pages_cursor_init(struct ceph_msg_data_cursor *cursor,
+ size_t length)
+{
+ struct ceph_msg_data *data = cursor->data;
+ int page_count;
+
+ BUG_ON(data->type != CEPH_MSG_DATA_PAGES);
+
+ BUG_ON(!data->pages);
+ BUG_ON(!data->length);
+
+ cursor->resid = min(length, data->length);
+ page_count = calc_pages_for(data->alignment, (u64)data->length);
+ cursor->page_offset = data->alignment & ~PAGE_MASK;
+ cursor->page_index = 0;
+ BUG_ON(page_count > (int)USHRT_MAX);
+ cursor->page_count = (unsigned short)page_count;
+ BUG_ON(length > SIZE_MAX - cursor->page_offset);
+}
+
+static struct page *
+ceph_msg_data_pages_next(struct ceph_msg_data_cursor *cursor,
+ size_t *page_offset, size_t *length)
+{
+ struct ceph_msg_data *data = cursor->data;
+
+ BUG_ON(data->type != CEPH_MSG_DATA_PAGES);
+
+ BUG_ON(cursor->page_index >= cursor->page_count);
+ BUG_ON(cursor->page_offset >= PAGE_SIZE);
+
+ *page_offset = cursor->page_offset;
+ *length = min_t(size_t, cursor->resid, PAGE_SIZE - *page_offset);
+ return data->pages[cursor->page_index];
+}
+
+static bool ceph_msg_data_pages_advance(struct ceph_msg_data_cursor *cursor,
+ size_t bytes)
+{
+ BUG_ON(cursor->data->type != CEPH_MSG_DATA_PAGES);
+
+ BUG_ON(cursor->page_offset + bytes > PAGE_SIZE);
+
+ /* Advance the cursor page offset */
+
+ cursor->resid -= bytes;
+ cursor->page_offset = (cursor->page_offset + bytes) & ~PAGE_MASK;
+ if (!bytes || cursor->page_offset)
+ return false; /* more bytes to process in the current page */
+
+ if (!cursor->resid)
+ return false; /* no more data */
+
+ /* Move on to the next page; offset is already at 0 */
+
+ BUG_ON(cursor->page_index >= cursor->page_count);
+ cursor->page_index++;
+ return true;
+}
+
+/*
+ * For a pagelist, a piece is whatever remains to be consumed in the
+ * first page in the list, or the front of the next page.
+ */
+static void
+ceph_msg_data_pagelist_cursor_init(struct ceph_msg_data_cursor *cursor,
+ size_t length)
+{
+ struct ceph_msg_data *data = cursor->data;
+ struct ceph_pagelist *pagelist;
+ struct page *page;
+
+ BUG_ON(data->type != CEPH_MSG_DATA_PAGELIST);
+
+ pagelist = data->pagelist;
+ BUG_ON(!pagelist);
+
+ if (!length)
+ return; /* pagelist can be assigned but empty */
+
+ BUG_ON(list_empty(&pagelist->head));
+ page = list_first_entry(&pagelist->head, struct page, lru);
+
+ cursor->resid = min(length, pagelist->length);
+ cursor->page = page;
+ cursor->offset = 0;
+}
+
+static struct page *
+ceph_msg_data_pagelist_next(struct ceph_msg_data_cursor *cursor,
+ size_t *page_offset, size_t *length)
+{
+ struct ceph_msg_data *data = cursor->data;
+ struct ceph_pagelist *pagelist;
+
+ BUG_ON(data->type != CEPH_MSG_DATA_PAGELIST);
+
+ pagelist = data->pagelist;
+ BUG_ON(!pagelist);
+
+ BUG_ON(!cursor->page);
+ BUG_ON(cursor->offset + cursor->resid != pagelist->length);
+
+ /* offset of first page in pagelist is always 0 */
+ *page_offset = cursor->offset & ~PAGE_MASK;
+ *length = min_t(size_t, cursor->resid, PAGE_SIZE - *page_offset);
+ return cursor->page;
+}
+
+static bool ceph_msg_data_pagelist_advance(struct ceph_msg_data_cursor *cursor,
+ size_t bytes)
+{
+ struct ceph_msg_data *data = cursor->data;
+ struct ceph_pagelist *pagelist;
+
+ BUG_ON(data->type != CEPH_MSG_DATA_PAGELIST);
+
+ pagelist = data->pagelist;
+ BUG_ON(!pagelist);
+
+ BUG_ON(cursor->offset + cursor->resid != pagelist->length);
+ BUG_ON((cursor->offset & ~PAGE_MASK) + bytes > PAGE_SIZE);
+
+ /* Advance the cursor offset */
+
+ cursor->resid -= bytes;
+ cursor->offset += bytes;
+ /* offset of first page in pagelist is always 0 */
+ if (!bytes || cursor->offset & ~PAGE_MASK)
+ return false; /* more bytes to process in the current page */
+
+ if (!cursor->resid)
+ return false; /* no more data */
+
+ /* Move on to the next page */
+
+ BUG_ON(list_is_last(&cursor->page->lru, &pagelist->head));
+ cursor->page = list_next_entry(cursor->page, lru);
+ return true;
+}
+
+static void ceph_msg_data_iter_cursor_init(struct ceph_msg_data_cursor *cursor,
+ size_t length)
+{
+ struct ceph_msg_data *data = cursor->data;
+
+ cursor->iov_iter = data->iter;
+ cursor->lastlen = 0;
+ iov_iter_truncate(&cursor->iov_iter, length);
+ cursor->resid = iov_iter_count(&cursor->iov_iter);
+}
+
+static struct page *ceph_msg_data_iter_next(struct ceph_msg_data_cursor *cursor,
+ size_t *page_offset, size_t *length)
+{
+ struct page *page;
+ ssize_t len;
+
+ if (cursor->lastlen)
+ iov_iter_revert(&cursor->iov_iter, cursor->lastlen);
+
+ len = iov_iter_get_pages2(&cursor->iov_iter, &page, PAGE_SIZE,
+ 1, page_offset);
+ BUG_ON(len < 0);
+
+ cursor->lastlen = len;
+
+ /*
+ * FIXME: The assumption is that the pages represented by the iov_iter
+ * are pinned, with the references held by the upper-level
+ * callers, or by virtue of being under writeback. Eventually,
+ * we'll get an iov_iter_get_pages2 variant that doesn't take
+ * page refs. Until then, just put the page ref.
+ */
+ VM_BUG_ON_PAGE(!PageWriteback(page) && page_count(page) < 2, page);
+ put_page(page);
+
+ *length = min_t(size_t, len, cursor->resid);
+ return page;
+}
+
+static bool ceph_msg_data_iter_advance(struct ceph_msg_data_cursor *cursor,
+ size_t bytes)
+{
+ BUG_ON(bytes > cursor->resid);
+ cursor->resid -= bytes;
+
+ if (bytes < cursor->lastlen) {
+ cursor->lastlen -= bytes;
+ } else {
+ iov_iter_advance(&cursor->iov_iter, bytes - cursor->lastlen);
+ cursor->lastlen = 0;
+ }
+
+ return cursor->resid;
+}
+
+/*
+ * Message data is handled (sent or received) in pieces, where each
+ * piece resides on a single page. The network layer might not
+ * consume an entire piece at once. A data item's cursor keeps
+ * track of which piece is next to process and how much remains to
+ * be processed in that piece. It also tracks whether the current
+ * piece is the last one in the data item.
+ */
+static void __ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor)
+{
+ size_t length = cursor->total_resid;
+
+ switch (cursor->data->type) {
+ case CEPH_MSG_DATA_PAGELIST:
+ ceph_msg_data_pagelist_cursor_init(cursor, length);
+ break;
+ case CEPH_MSG_DATA_PAGES:
+ ceph_msg_data_pages_cursor_init(cursor, length);
+ break;
+#ifdef CONFIG_BLOCK
+ case CEPH_MSG_DATA_BIO:
+ ceph_msg_data_bio_cursor_init(cursor, length);
+ break;
+#endif /* CONFIG_BLOCK */
+ case CEPH_MSG_DATA_BVECS:
+ ceph_msg_data_bvecs_cursor_init(cursor, length);
+ break;
+ case CEPH_MSG_DATA_ITER:
+ ceph_msg_data_iter_cursor_init(cursor, length);
+ break;
+ case CEPH_MSG_DATA_NONE:
+ default:
+ /* BUG(); */
+ break;
+ }
+ cursor->need_crc = true;
+}
+
+void ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor,
+ struct ceph_msg *msg, size_t length)
+{
+ BUG_ON(!length);
+ BUG_ON(length > msg->data_length);
+ BUG_ON(!msg->num_data_items);
+
+ cursor->total_resid = length;
+ cursor->data = msg->data;
+ cursor->sr_resid = 0;
+
+ __ceph_msg_data_cursor_init(cursor);
+}
+
+/*
+ * Return the page containing the next piece to process for a given
+ * data item, and supply the page offset and length of that piece.
+ * Indicate whether this is the last piece in this data item.
+ */
+struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor,
+ size_t *page_offset, size_t *length)
+{
+ struct page *page;
+
+ switch (cursor->data->type) {
+ case CEPH_MSG_DATA_PAGELIST:
+ page = ceph_msg_data_pagelist_next(cursor, page_offset, length);
+ break;
+ case CEPH_MSG_DATA_PAGES:
+ page = ceph_msg_data_pages_next(cursor, page_offset, length);
+ break;
+#ifdef CONFIG_BLOCK
+ case CEPH_MSG_DATA_BIO:
+ page = ceph_msg_data_bio_next(cursor, page_offset, length);
+ break;
+#endif /* CONFIG_BLOCK */
+ case CEPH_MSG_DATA_BVECS:
+ page = ceph_msg_data_bvecs_next(cursor, page_offset, length);
+ break;
+ case CEPH_MSG_DATA_ITER:
+ page = ceph_msg_data_iter_next(cursor, page_offset, length);
+ break;
+ case CEPH_MSG_DATA_NONE:
+ default:
+ page = NULL;
+ break;
+ }
+
+ BUG_ON(!page);
+ BUG_ON(*page_offset + *length > PAGE_SIZE);
+ BUG_ON(!*length);
+ BUG_ON(*length > cursor->resid);
+
+ return page;
+}
+
+/*
+ * Returns true if the result moves the cursor on to the next piece
+ * of the data item.
+ */
+void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, size_t bytes)
+{
+ bool new_piece;
+
+ BUG_ON(bytes > cursor->resid);
+ switch (cursor->data->type) {
+ case CEPH_MSG_DATA_PAGELIST:
+ new_piece = ceph_msg_data_pagelist_advance(cursor, bytes);
+ break;
+ case CEPH_MSG_DATA_PAGES:
+ new_piece = ceph_msg_data_pages_advance(cursor, bytes);
+ break;
+#ifdef CONFIG_BLOCK
+ case CEPH_MSG_DATA_BIO:
+ new_piece = ceph_msg_data_bio_advance(cursor, bytes);
+ break;
+#endif /* CONFIG_BLOCK */
+ case CEPH_MSG_DATA_BVECS:
+ new_piece = ceph_msg_data_bvecs_advance(cursor, bytes);
+ break;
+ case CEPH_MSG_DATA_ITER:
+ new_piece = ceph_msg_data_iter_advance(cursor, bytes);
+ break;
+ case CEPH_MSG_DATA_NONE:
+ default:
+ BUG();
+ break;
+ }
+ cursor->total_resid -= bytes;
+
+ if (!cursor->resid && cursor->total_resid) {
+ cursor->data++;
+ __ceph_msg_data_cursor_init(cursor);
+ new_piece = true;
+ }
+ cursor->need_crc = new_piece;
+}
+
+u32 ceph_crc32c_page(u32 crc, struct page *page, unsigned int page_offset,
+ unsigned int length)
+{
+ char *kaddr;
+
+ kaddr = kmap(page);
+ BUG_ON(kaddr == NULL);
+ crc = crc32c(crc, kaddr + page_offset, length);
+ kunmap(page);
+
+ return crc;
+}
+
+bool ceph_addr_is_blank(const struct ceph_entity_addr *addr)
+{
+ struct sockaddr_storage ss = addr->in_addr; /* align */
+ struct in_addr *addr4 = &((struct sockaddr_in *)&ss)->sin_addr;
+ struct in6_addr *addr6 = &((struct sockaddr_in6 *)&ss)->sin6_addr;
+
+ switch (ss.ss_family) {
+ case AF_INET:
+ return addr4->s_addr == htonl(INADDR_ANY);
+ case AF_INET6:
+ return ipv6_addr_any(addr6);
+ default:
+ return true;
+ }
+}
+EXPORT_SYMBOL(ceph_addr_is_blank);
+
+int ceph_addr_port(const struct ceph_entity_addr *addr)
+{
+ switch (get_unaligned(&addr->in_addr.ss_family)) {
+ case AF_INET:
+ return ntohs(get_unaligned(&((struct sockaddr_in *)&addr->in_addr)->sin_port));
+ case AF_INET6:
+ return ntohs(get_unaligned(&((struct sockaddr_in6 *)&addr->in_addr)->sin6_port));
+ }
+ return 0;
+}
+
+void ceph_addr_set_port(struct ceph_entity_addr *addr, int p)
+{
+ switch (get_unaligned(&addr->in_addr.ss_family)) {
+ case AF_INET:
+ put_unaligned(htons(p), &((struct sockaddr_in *)&addr->in_addr)->sin_port);
+ break;
+ case AF_INET6:
+ put_unaligned(htons(p), &((struct sockaddr_in6 *)&addr->in_addr)->sin6_port);
+ break;
+ }
+}
+
+/*
+ * Unlike other *_pton function semantics, zero indicates success.
+ */
+static int ceph_pton(const char *str, size_t len, struct ceph_entity_addr *addr,
+ char delim, const char **ipend)
+{
+ memset(&addr->in_addr, 0, sizeof(addr->in_addr));
+
+ if (in4_pton(str, len, (u8 *)&((struct sockaddr_in *)&addr->in_addr)->sin_addr.s_addr, delim, ipend)) {
+ put_unaligned(AF_INET, &addr->in_addr.ss_family);
+ return 0;
+ }
+
+ if (in6_pton(str, len, (u8 *)&((struct sockaddr_in6 *)&addr->in_addr)->sin6_addr.s6_addr, delim, ipend)) {
+ put_unaligned(AF_INET6, &addr->in_addr.ss_family);
+ return 0;
+ }
+
+ return -EINVAL;
+}
+
+/*
+ * Extract hostname string and resolve using kernel DNS facility.
+ */
+#ifdef CONFIG_CEPH_LIB_USE_DNS_RESOLVER
+static int ceph_dns_resolve_name(const char *name, size_t namelen,
+ struct ceph_entity_addr *addr, char delim, const char **ipend)
+{
+ const char *end, *delim_p;
+ char *colon_p, *ip_addr = NULL;
+ int ip_len, ret;
+
+ /*
+ * The end of the hostname occurs immediately preceding the delimiter or
+ * the port marker (':') where the delimiter takes precedence.
+ */
+ delim_p = memchr(name, delim, namelen);
+ colon_p = memchr(name, ':', namelen);
+
+ if (delim_p && colon_p)
+ end = min(delim_p, colon_p);
+ else if (!delim_p && colon_p)
+ end = colon_p;
+ else {
+ end = delim_p;
+ if (!end) /* case: hostname:/ */
+ end = name + namelen;
+ }
+
+ if (end <= name)
+ return -EINVAL;
+
+ /* do dns_resolve upcall */
+ ip_len = dns_query(current->nsproxy->net_ns,
+ NULL, name, end - name, NULL, &ip_addr, NULL, false);
+ if (ip_len > 0)
+ ret = ceph_pton(ip_addr, ip_len, addr, -1, NULL);
+ else
+ ret = -ESRCH;
+
+ kfree(ip_addr);
+
+ *ipend = end;
+
+ pr_info("resolve '%.*s' (ret=%d): %s\n", (int)(end - name), name,
+ ret, ret ? "failed" : ceph_pr_addr(addr));
+
+ return ret;
+}
+#else
+static inline int ceph_dns_resolve_name(const char *name, size_t namelen,
+ struct ceph_entity_addr *addr, char delim, const char **ipend)
+{
+ return -EINVAL;
+}
+#endif
+
+/*
+ * Parse a server name (IP or hostname). If a valid IP address is not found
+ * then try to extract a hostname to resolve using userspace DNS upcall.
+ */
+static int ceph_parse_server_name(const char *name, size_t namelen,
+ struct ceph_entity_addr *addr, char delim, const char **ipend)
+{
+ int ret;
+
+ ret = ceph_pton(name, namelen, addr, delim, ipend);
+ if (ret)
+ ret = ceph_dns_resolve_name(name, namelen, addr, delim, ipend);
+
+ return ret;
+}
+
+/*
+ * Parse an ip[:port] list into an addr array. Use the default
+ * monitor port if a port isn't specified.
+ */
+int ceph_parse_ips(const char *c, const char *end,
+ struct ceph_entity_addr *addr,
+ int max_count, int *count, char delim)
+{
+ int i, ret = -EINVAL;
+ const char *p = c;
+
+ dout("parse_ips on '%.*s'\n", (int)(end-c), c);
+ for (i = 0; i < max_count; i++) {
+ char cur_delim = delim;
+ const char *ipend;
+ int port;
+
+ if (*p == '[') {
+ cur_delim = ']';
+ p++;
+ }
+
+ ret = ceph_parse_server_name(p, end - p, &addr[i], cur_delim,
+ &ipend);
+ if (ret)
+ goto bad;
+ ret = -EINVAL;
+
+ p = ipend;
+
+ if (cur_delim == ']') {
+ if (*p != ']') {
+ dout("missing matching ']'\n");
+ goto bad;
+ }
+ p++;
+ }
+
+ /* port? */
+ if (p < end && *p == ':') {
+ port = 0;
+ p++;
+ while (p < end && *p >= '0' && *p <= '9') {
+ port = (port * 10) + (*p - '0');
+ p++;
+ }
+ if (port == 0)
+ port = CEPH_MON_PORT;
+ else if (port > 65535)
+ goto bad;
+ } else {
+ port = CEPH_MON_PORT;
+ }
+
+ ceph_addr_set_port(&addr[i], port);
+ /*
+ * We want the type to be set according to ms_mode
+ * option, but options are normally parsed after mon
+ * addresses. Rather than complicating parsing, set
+ * to LEGACY and override in build_initial_monmap()
+ * for mon addresses and ceph_messenger_init() for
+ * ip option.
+ */
+ addr[i].type = CEPH_ENTITY_ADDR_TYPE_LEGACY;
+ addr[i].nonce = 0;
+
+ dout("%s got %s\n", __func__, ceph_pr_addr(&addr[i]));
+
+ if (p == end)
+ break;
+ if (*p != delim)
+ goto bad;
+ p++;
+ }
+
+ if (p != end)
+ goto bad;
+
+ if (count)
+ *count = i + 1;
+ return 0;
+
+bad:
+ return ret;
+}
+
+/*
+ * Process message. This happens in the worker thread. The callback should
+ * be careful not to do anything that waits on other incoming messages or it
+ * may deadlock.
+ */
+void ceph_con_process_message(struct ceph_connection *con)
+{
+ struct ceph_msg *msg = con->in_msg;
+
+ BUG_ON(con->in_msg->con != con);
+ con->in_msg = NULL;
+
+ /* if first message, set peer_name */
+ if (con->peer_name.type == 0)
+ con->peer_name = msg->hdr.src;
+
+ con->in_seq++;
+ mutex_unlock(&con->mutex);
+
+ dout("===== %p %llu from %s%lld %d=%s len %d+%d+%d (%u %u %u) =====\n",
+ msg, le64_to_cpu(msg->hdr.seq),
+ ENTITY_NAME(msg->hdr.src),
+ le16_to_cpu(msg->hdr.type),
+ ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
+ le32_to_cpu(msg->hdr.front_len),
+ le32_to_cpu(msg->hdr.middle_len),
+ le32_to_cpu(msg->hdr.data_len),
+ con->in_front_crc, con->in_middle_crc, con->in_data_crc);
+ con->ops->dispatch(con, msg);
+
+ mutex_lock(&con->mutex);
+}
+
+/*
+ * Atomically queue work on a connection after the specified delay.
+ * Bump @con reference to avoid races with connection teardown.
+ * Returns 0 if work was queued, or an error code otherwise.
+ */
+static int queue_con_delay(struct ceph_connection *con, unsigned long delay)
+{
+ if (!con->ops->get(con)) {
+ dout("%s %p ref count 0\n", __func__, con);
+ return -ENOENT;
+ }
+
+ if (delay >= HZ)
+ delay = round_jiffies_relative(delay);
+
+ dout("%s %p %lu\n", __func__, con, delay);
+ if (!queue_delayed_work(ceph_msgr_wq, &con->work, delay)) {
+ dout("%s %p - already queued\n", __func__, con);
+ con->ops->put(con);
+ return -EBUSY;
+ }
+
+ return 0;
+}
+
+static void queue_con(struct ceph_connection *con)
+{
+ (void) queue_con_delay(con, 0);
+}
+
+static void cancel_con(struct ceph_connection *con)
+{
+ if (cancel_delayed_work(&con->work)) {
+ dout("%s %p\n", __func__, con);
+ con->ops->put(con);
+ }
+}
+
+static bool con_sock_closed(struct ceph_connection *con)
+{
+ if (!ceph_con_flag_test_and_clear(con, CEPH_CON_F_SOCK_CLOSED))
+ return false;
+
+#define CASE(x) \
+ case CEPH_CON_S_ ## x: \
+ con->error_msg = "socket closed (con state " #x ")"; \
+ break;
+
+ switch (con->state) {
+ CASE(CLOSED);
+ CASE(PREOPEN);
+ CASE(V1_BANNER);
+ CASE(V1_CONNECT_MSG);
+ CASE(V2_BANNER_PREFIX);
+ CASE(V2_BANNER_PAYLOAD);
+ CASE(V2_HELLO);
+ CASE(V2_AUTH);
+ CASE(V2_AUTH_SIGNATURE);
+ CASE(V2_SESSION_CONNECT);
+ CASE(V2_SESSION_RECONNECT);
+ CASE(OPEN);
+ CASE(STANDBY);
+ default:
+ BUG();
+ }
+#undef CASE
+
+ return true;
+}
+
+static bool con_backoff(struct ceph_connection *con)
+{
+ int ret;
+
+ if (!ceph_con_flag_test_and_clear(con, CEPH_CON_F_BACKOFF))
+ return false;
+
+ ret = queue_con_delay(con, con->delay);
+ if (ret) {
+ dout("%s: con %p FAILED to back off %lu\n", __func__,
+ con, con->delay);
+ BUG_ON(ret == -ENOENT);
+ ceph_con_flag_set(con, CEPH_CON_F_BACKOFF);
+ }
+
+ return true;
+}
+
+/* Finish fault handling; con->mutex must *not* be held here */
+
+static void con_fault_finish(struct ceph_connection *con)
+{
+ dout("%s %p\n", __func__, con);
+
+ /*
+ * in case we faulted due to authentication, invalidate our
+ * current tickets so that we can get new ones.
+ */
+ if (!ceph_msgr2(from_msgr(con->msgr)) && con->v1.auth_retry) {
+ dout("auth_retry %d, invalidating\n", con->v1.auth_retry);
+ if (con->ops->invalidate_authorizer)
+ con->ops->invalidate_authorizer(con);
+ con->v1.auth_retry = 0;
+ }
+
+ if (con->ops->fault)
+ con->ops->fault(con);
+}
+
+/*
+ * Do some work on a connection. Drop a connection ref when we're done.
+ */
+static void ceph_con_workfn(struct work_struct *work)
+{
+ struct ceph_connection *con = container_of(work, struct ceph_connection,
+ work.work);
+ bool fault;
+
+ mutex_lock(&con->mutex);
+ while (true) {
+ int ret;
+
+ if ((fault = con_sock_closed(con))) {
+ dout("%s: con %p SOCK_CLOSED\n", __func__, con);
+ break;
+ }
+ if (con_backoff(con)) {
+ dout("%s: con %p BACKOFF\n", __func__, con);
+ break;
+ }
+ if (con->state == CEPH_CON_S_STANDBY) {
+ dout("%s: con %p STANDBY\n", __func__, con);
+ break;
+ }
+ if (con->state == CEPH_CON_S_CLOSED) {
+ dout("%s: con %p CLOSED\n", __func__, con);
+ BUG_ON(con->sock);
+ break;
+ }
+ if (con->state == CEPH_CON_S_PREOPEN) {
+ dout("%s: con %p PREOPEN\n", __func__, con);
+ BUG_ON(con->sock);
+ }
+
+ if (ceph_msgr2(from_msgr(con->msgr)))
+ ret = ceph_con_v2_try_read(con);
+ else
+ ret = ceph_con_v1_try_read(con);
+ if (ret < 0) {
+ if (ret == -EAGAIN)
+ continue;
+ if (!con->error_msg)
+ con->error_msg = "socket error on read";
+ fault = true;
+ break;
+ }
+
+ if (ceph_msgr2(from_msgr(con->msgr)))
+ ret = ceph_con_v2_try_write(con);
+ else
+ ret = ceph_con_v1_try_write(con);
+ if (ret < 0) {
+ if (ret == -EAGAIN)
+ continue;
+ if (!con->error_msg)
+ con->error_msg = "socket error on write";
+ fault = true;
+ }
+
+ break; /* If we make it to here, we're done */
+ }
+ if (fault)
+ con_fault(con);
+ mutex_unlock(&con->mutex);
+
+ if (fault)
+ con_fault_finish(con);
+
+ con->ops->put(con);
+}
+
+/*
+ * Generic error/fault handler. A retry mechanism is used with
+ * exponential backoff
+ */
+static void con_fault(struct ceph_connection *con)
+{
+ dout("fault %p state %d to peer %s\n",
+ con, con->state, ceph_pr_addr(&con->peer_addr));
+
+ pr_warn("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
+ ceph_pr_addr(&con->peer_addr), con->error_msg);
+ con->error_msg = NULL;
+
+ WARN_ON(con->state == CEPH_CON_S_STANDBY ||
+ con->state == CEPH_CON_S_CLOSED);
+
+ ceph_con_reset_protocol(con);
+
+ if (ceph_con_flag_test(con, CEPH_CON_F_LOSSYTX)) {
+ dout("fault on LOSSYTX channel, marking CLOSED\n");
+ con->state = CEPH_CON_S_CLOSED;
+ return;
+ }
+
+ /* Requeue anything that hasn't been acked */
+ list_splice_init(&con->out_sent, &con->out_queue);
+
+ /* If there are no messages queued or keepalive pending, place
+ * the connection in a STANDBY state */
+ if (list_empty(&con->out_queue) &&
+ !ceph_con_flag_test(con, CEPH_CON_F_KEEPALIVE_PENDING)) {
+ dout("fault %p setting STANDBY clearing WRITE_PENDING\n", con);
+ ceph_con_flag_clear(con, CEPH_CON_F_WRITE_PENDING);
+ con->state = CEPH_CON_S_STANDBY;
+ } else {
+ /* retry after a delay. */
+ con->state = CEPH_CON_S_PREOPEN;
+ if (!con->delay) {
+ con->delay = BASE_DELAY_INTERVAL;
+ } else if (con->delay < MAX_DELAY_INTERVAL) {
+ con->delay *= 2;
+ if (con->delay > MAX_DELAY_INTERVAL)
+ con->delay = MAX_DELAY_INTERVAL;
+ }
+ ceph_con_flag_set(con, CEPH_CON_F_BACKOFF);
+ queue_con(con);
+ }
+}
+
+void ceph_messenger_reset_nonce(struct ceph_messenger *msgr)
+{
+ u32 nonce = le32_to_cpu(msgr->inst.addr.nonce) + 1000000;
+ msgr->inst.addr.nonce = cpu_to_le32(nonce);
+ ceph_encode_my_addr(msgr);
+}
+
+/*
+ * initialize a new messenger instance
+ */
+void ceph_messenger_init(struct ceph_messenger *msgr,
+ struct ceph_entity_addr *myaddr)
+{
+ spin_lock_init(&msgr->global_seq_lock);
+
+ if (myaddr) {
+ memcpy(&msgr->inst.addr.in_addr, &myaddr->in_addr,
+ sizeof(msgr->inst.addr.in_addr));
+ ceph_addr_set_port(&msgr->inst.addr, 0);
+ }
+
+ /*
+ * Since nautilus, clients are identified using type ANY.
+ * For msgr1, ceph_encode_banner_addr() munges it to NONE.
+ */
+ msgr->inst.addr.type = CEPH_ENTITY_ADDR_TYPE_ANY;
+
+ /* generate a random non-zero nonce */
+ do {
+ get_random_bytes(&msgr->inst.addr.nonce,
+ sizeof(msgr->inst.addr.nonce));
+ } while (!msgr->inst.addr.nonce);
+ ceph_encode_my_addr(msgr);
+
+ atomic_set(&msgr->stopping, 0);
+ write_pnet(&msgr->net, get_net(current->nsproxy->net_ns));
+
+ dout("%s %p\n", __func__, msgr);
+}
+
+void ceph_messenger_fini(struct ceph_messenger *msgr)
+{
+ put_net(read_pnet(&msgr->net));
+}
+
+static void msg_con_set(struct ceph_msg *msg, struct ceph_connection *con)
+{
+ if (msg->con)
+ msg->con->ops->put(msg->con);
+
+ msg->con = con ? con->ops->get(con) : NULL;
+ BUG_ON(msg->con != con);
+}
+
+static void clear_standby(struct ceph_connection *con)
+{
+ /* come back from STANDBY? */
+ if (con->state == CEPH_CON_S_STANDBY) {
+ dout("clear_standby %p\n", con);
+ con->state = CEPH_CON_S_PREOPEN;
+ if (!ceph_msgr2(from_msgr(con->msgr)))
+ con->v1.connect_seq++;
+ WARN_ON(ceph_con_flag_test(con, CEPH_CON_F_WRITE_PENDING));
+ WARN_ON(ceph_con_flag_test(con, CEPH_CON_F_KEEPALIVE_PENDING));
+ }
+}
+
+/*
+ * Queue up an outgoing message on the given connection.
+ *
+ * Consumes a ref on @msg.
+ */
+void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
+{
+ /* set src+dst */
+ msg->hdr.src = con->msgr->inst.name;
+ BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len));
+ msg->needs_out_seq = true;
+
+ mutex_lock(&con->mutex);
+
+ if (con->state == CEPH_CON_S_CLOSED) {
+ dout("con_send %p closed, dropping %p\n", con, msg);
+ ceph_msg_put(msg);
+ mutex_unlock(&con->mutex);
+ return;
+ }
+
+ msg_con_set(msg, con);
+
+ BUG_ON(!list_empty(&msg->list_head));
+ list_add_tail(&msg->list_head, &con->out_queue);
+ dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg,
+ ENTITY_NAME(con->peer_name), le16_to_cpu(msg->hdr.type),
+ ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
+ le32_to_cpu(msg->hdr.front_len),
+ le32_to_cpu(msg->hdr.middle_len),
+ le32_to_cpu(msg->hdr.data_len));
+
+ clear_standby(con);
+ mutex_unlock(&con->mutex);
+
+ /* if there wasn't anything waiting to send before, queue
+ * new work */
+ if (!ceph_con_flag_test_and_set(con, CEPH_CON_F_WRITE_PENDING))
+ queue_con(con);
+}
+EXPORT_SYMBOL(ceph_con_send);
+
+/*
+ * Revoke a message that was previously queued for send
+ */
+void ceph_msg_revoke(struct ceph_msg *msg)
+{
+ struct ceph_connection *con = msg->con;
+
+ if (!con) {
+ dout("%s msg %p null con\n", __func__, msg);
+ return; /* Message not in our possession */
+ }
+
+ mutex_lock(&con->mutex);
+ if (list_empty(&msg->list_head)) {
+ WARN_ON(con->out_msg == msg);
+ dout("%s con %p msg %p not linked\n", __func__, con, msg);
+ mutex_unlock(&con->mutex);
+ return;
+ }
+
+ dout("%s con %p msg %p was linked\n", __func__, con, msg);
+ msg->hdr.seq = 0;
+ ceph_msg_remove(msg);
+
+ if (con->out_msg == msg) {
+ WARN_ON(con->state != CEPH_CON_S_OPEN);
+ dout("%s con %p msg %p was sending\n", __func__, con, msg);
+ if (ceph_msgr2(from_msgr(con->msgr)))
+ ceph_con_v2_revoke(con, msg);
+ else
+ ceph_con_v1_revoke(con, msg);
+ ceph_msg_put(con->out_msg);
+ con->out_msg = NULL;
+ } else {
+ dout("%s con %p msg %p not current, out_msg %p\n", __func__,
+ con, msg, con->out_msg);
+ }
+ mutex_unlock(&con->mutex);
+}
+
+/*
+ * Revoke a message that we may be reading data into
+ */
+void ceph_msg_revoke_incoming(struct ceph_msg *msg)
+{
+ struct ceph_connection *con = msg->con;
+
+ if (!con) {
+ dout("%s msg %p null con\n", __func__, msg);
+ return; /* Message not in our possession */
+ }
+
+ mutex_lock(&con->mutex);
+ if (con->in_msg == msg) {
+ WARN_ON(con->state != CEPH_CON_S_OPEN);
+ dout("%s con %p msg %p was recving\n", __func__, con, msg);
+ if (ceph_msgr2(from_msgr(con->msgr)))
+ ceph_con_v2_revoke_incoming(con);
+ else
+ ceph_con_v1_revoke_incoming(con);
+ ceph_msg_put(con->in_msg);
+ con->in_msg = NULL;
+ } else {
+ dout("%s con %p msg %p not current, in_msg %p\n", __func__,
+ con, msg, con->in_msg);
+ }
+ mutex_unlock(&con->mutex);
+}
+
+/*
+ * Queue a keepalive byte to ensure the tcp connection is alive.
+ */
+void ceph_con_keepalive(struct ceph_connection *con)
+{
+ dout("con_keepalive %p\n", con);
+ mutex_lock(&con->mutex);
+ clear_standby(con);
+ ceph_con_flag_set(con, CEPH_CON_F_KEEPALIVE_PENDING);
+ mutex_unlock(&con->mutex);
+
+ if (!ceph_con_flag_test_and_set(con, CEPH_CON_F_WRITE_PENDING))
+ queue_con(con);
+}
+EXPORT_SYMBOL(ceph_con_keepalive);
+
+bool ceph_con_keepalive_expired(struct ceph_connection *con,
+ unsigned long interval)
+{
+ if (interval > 0 &&
+ (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2)) {
+ struct timespec64 now;
+ struct timespec64 ts;
+ ktime_get_real_ts64(&now);
+ jiffies_to_timespec64(interval, &ts);
+ ts = timespec64_add(con->last_keepalive_ack, ts);
+ return timespec64_compare(&now, &ts) >= 0;
+ }
+ return false;
+}
+
+static struct ceph_msg_data *ceph_msg_data_add(struct ceph_msg *msg)
+{
+ BUG_ON(msg->num_data_items >= msg->max_data_items);
+ return &msg->data[msg->num_data_items++];
+}
+
+static void ceph_msg_data_destroy(struct ceph_msg_data *data)
+{
+ if (data->type == CEPH_MSG_DATA_PAGES && data->own_pages) {
+ int num_pages = calc_pages_for(data->alignment, data->length);
+ ceph_release_page_vector(data->pages, num_pages);
+ } else if (data->type == CEPH_MSG_DATA_PAGELIST) {
+ ceph_pagelist_release(data->pagelist);
+ }
+}
+
+void ceph_msg_data_add_pages(struct ceph_msg *msg, struct page **pages,
+ size_t length, size_t alignment, bool own_pages)
+{
+ struct ceph_msg_data *data;
+
+ BUG_ON(!pages);
+ BUG_ON(!length);
+
+ data = ceph_msg_data_add(msg);
+ data->type = CEPH_MSG_DATA_PAGES;
+ data->pages = pages;
+ data->length = length;
+ data->alignment = alignment & ~PAGE_MASK;
+ data->own_pages = own_pages;
+
+ msg->data_length += length;
+}
+EXPORT_SYMBOL(ceph_msg_data_add_pages);
+
+void ceph_msg_data_add_pagelist(struct ceph_msg *msg,
+ struct ceph_pagelist *pagelist)
+{
+ struct ceph_msg_data *data;
+
+ BUG_ON(!pagelist);
+ BUG_ON(!pagelist->length);
+
+ data = ceph_msg_data_add(msg);
+ data->type = CEPH_MSG_DATA_PAGELIST;
+ refcount_inc(&pagelist->refcnt);
+ data->pagelist = pagelist;
+
+ msg->data_length += pagelist->length;
+}
+EXPORT_SYMBOL(ceph_msg_data_add_pagelist);
+
+#ifdef CONFIG_BLOCK
+void ceph_msg_data_add_bio(struct ceph_msg *msg, struct ceph_bio_iter *bio_pos,
+ u32 length)
+{
+ struct ceph_msg_data *data;
+
+ data = ceph_msg_data_add(msg);
+ data->type = CEPH_MSG_DATA_BIO;
+ data->bio_pos = *bio_pos;
+ data->bio_length = length;
+
+ msg->data_length += length;
+}
+EXPORT_SYMBOL(ceph_msg_data_add_bio);
+#endif /* CONFIG_BLOCK */
+
+void ceph_msg_data_add_bvecs(struct ceph_msg *msg,
+ struct ceph_bvec_iter *bvec_pos)
+{
+ struct ceph_msg_data *data;
+
+ data = ceph_msg_data_add(msg);
+ data->type = CEPH_MSG_DATA_BVECS;
+ data->bvec_pos = *bvec_pos;
+
+ msg->data_length += bvec_pos->iter.bi_size;
+}
+EXPORT_SYMBOL(ceph_msg_data_add_bvecs);
+
+void ceph_msg_data_add_iter(struct ceph_msg *msg,
+ struct iov_iter *iter)
+{
+ struct ceph_msg_data *data;
+
+ data = ceph_msg_data_add(msg);
+ data->type = CEPH_MSG_DATA_ITER;
+ data->iter = *iter;
+
+ msg->data_length += iov_iter_count(&data->iter);
+}
+
+/*
+ * construct a new message with given type, size
+ * the new msg has a ref count of 1.
+ */
+struct ceph_msg *ceph_msg_new2(int type, int front_len, int max_data_items,
+ gfp_t flags, bool can_fail)
+{
+ struct ceph_msg *m;
+
+ m = kmem_cache_zalloc(ceph_msg_cache, flags);
+ if (m == NULL)
+ goto out;
+
+ m->hdr.type = cpu_to_le16(type);
+ m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT);
+ m->hdr.front_len = cpu_to_le32(front_len);
+
+ INIT_LIST_HEAD(&m->list_head);
+ kref_init(&m->kref);
+
+ /* front */
+ if (front_len) {
+ m->front.iov_base = kvmalloc(front_len, flags);
+ if (m->front.iov_base == NULL) {
+ dout("ceph_msg_new can't allocate %d bytes\n",
+ front_len);
+ goto out2;
+ }
+ } else {
+ m->front.iov_base = NULL;
+ }
+ m->front_alloc_len = m->front.iov_len = front_len;
+
+ if (max_data_items) {
+ m->data = kmalloc_array(max_data_items, sizeof(*m->data),
+ flags);
+ if (!m->data)
+ goto out2;
+
+ m->max_data_items = max_data_items;
+ }
+
+ dout("ceph_msg_new %p front %d\n", m, front_len);
+ return m;
+
+out2:
+ ceph_msg_put(m);
+out:
+ if (!can_fail) {
+ pr_err("msg_new can't create type %d front %d\n", type,
+ front_len);
+ WARN_ON(1);
+ } else {
+ dout("msg_new can't create type %d front %d\n", type,
+ front_len);
+ }
+ return NULL;
+}
+EXPORT_SYMBOL(ceph_msg_new2);
+
+struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags,
+ bool can_fail)
+{
+ return ceph_msg_new2(type, front_len, 0, flags, can_fail);
+}
+EXPORT_SYMBOL(ceph_msg_new);
+
+/*
+ * Allocate "middle" portion of a message, if it is needed and wasn't
+ * allocated by alloc_msg. This allows us to read a small fixed-size
+ * per-type header in the front and then gracefully fail (i.e.,
+ * propagate the error to the caller based on info in the front) when
+ * the middle is too large.
+ */
+static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg)
+{
+ int type = le16_to_cpu(msg->hdr.type);
+ int middle_len = le32_to_cpu(msg->hdr.middle_len);
+
+ dout("alloc_middle %p type %d %s middle_len %d\n", msg, type,
+ ceph_msg_type_name(type), middle_len);
+ BUG_ON(!middle_len);
+ BUG_ON(msg->middle);
+
+ msg->middle = ceph_buffer_new(middle_len, GFP_NOFS);
+ if (!msg->middle)
+ return -ENOMEM;
+ return 0;
+}
+
+/*
+ * Allocate a message for receiving an incoming message on a
+ * connection, and save the result in con->in_msg. Uses the
+ * connection's private alloc_msg op if available.
+ *
+ * Returns 0 on success, or a negative error code.
+ *
+ * On success, if we set *skip = 1:
+ * - the next message should be skipped and ignored.
+ * - con->in_msg == NULL
+ * or if we set *skip = 0:
+ * - con->in_msg is non-null.
+ * On error (ENOMEM, EAGAIN, ...),
+ * - con->in_msg == NULL
+ */
+int ceph_con_in_msg_alloc(struct ceph_connection *con,
+ struct ceph_msg_header *hdr, int *skip)
+{
+ int middle_len = le32_to_cpu(hdr->middle_len);
+ struct ceph_msg *msg;
+ int ret = 0;
+
+ BUG_ON(con->in_msg != NULL);
+ BUG_ON(!con->ops->alloc_msg);
+
+ mutex_unlock(&con->mutex);
+ msg = con->ops->alloc_msg(con, hdr, skip);
+ mutex_lock(&con->mutex);
+ if (con->state != CEPH_CON_S_OPEN) {
+ if (msg)
+ ceph_msg_put(msg);
+ return -EAGAIN;
+ }
+ if (msg) {
+ BUG_ON(*skip);
+ msg_con_set(msg, con);
+ con->in_msg = msg;
+ } else {
+ /*
+ * Null message pointer means either we should skip
+ * this message or we couldn't allocate memory. The
+ * former is not an error.
+ */
+ if (*skip)
+ return 0;
+
+ con->error_msg = "error allocating memory for incoming message";
+ return -ENOMEM;
+ }
+ memcpy(&con->in_msg->hdr, hdr, sizeof(*hdr));
+
+ if (middle_len && !con->in_msg->middle) {
+ ret = ceph_alloc_middle(con, con->in_msg);
+ if (ret < 0) {
+ ceph_msg_put(con->in_msg);
+ con->in_msg = NULL;
+ }
+ }
+
+ return ret;
+}
+
+struct ceph_msg *ceph_con_get_out_msg(struct ceph_connection *con)
+{
+ struct ceph_msg *msg;
+
+ if (list_empty(&con->out_queue))
+ return NULL;
+
+ msg = list_first_entry(&con->out_queue, struct ceph_msg, list_head);
+ WARN_ON(msg->con != con);
+
+ /*
+ * Put the message on "sent" list using a ref from ceph_con_send().
+ * It is put when the message is acked or revoked.
+ */
+ list_move_tail(&msg->list_head, &con->out_sent);
+
+ /*
+ * Only assign outgoing seq # if we haven't sent this message
+ * yet. If it is requeued, resend with it's original seq.
+ */
+ if (msg->needs_out_seq) {
+ msg->hdr.seq = cpu_to_le64(++con->out_seq);
+ msg->needs_out_seq = false;
+
+ if (con->ops->reencode_message)
+ con->ops->reencode_message(msg);
+ }
+
+ /*
+ * Get a ref for out_msg. It is put when we are done sending the
+ * message or in case of a fault.
+ */
+ WARN_ON(con->out_msg);
+ return con->out_msg = ceph_msg_get(msg);
+}
+
+/*
+ * Free a generically kmalloc'd message.
+ */
+static void ceph_msg_free(struct ceph_msg *m)
+{
+ dout("%s %p\n", __func__, m);
+ kvfree(m->front.iov_base);
+ kfree(m->data);
+ kmem_cache_free(ceph_msg_cache, m);
+}
+
+static void ceph_msg_release(struct kref *kref)
+{
+ struct ceph_msg *m = container_of(kref, struct ceph_msg, kref);
+ int i;
+
+ dout("%s %p\n", __func__, m);
+ WARN_ON(!list_empty(&m->list_head));
+
+ msg_con_set(m, NULL);
+
+ /* drop middle, data, if any */
+ if (m->middle) {
+ ceph_buffer_put(m->middle);
+ m->middle = NULL;
+ }
+
+ for (i = 0; i < m->num_data_items; i++)
+ ceph_msg_data_destroy(&m->data[i]);
+
+ if (m->pool)
+ ceph_msgpool_put(m->pool, m);
+ else
+ ceph_msg_free(m);
+}
+
+struct ceph_msg *ceph_msg_get(struct ceph_msg *msg)
+{
+ dout("%s %p (was %d)\n", __func__, msg,
+ kref_read(&msg->kref));
+ kref_get(&msg->kref);
+ return msg;
+}
+EXPORT_SYMBOL(ceph_msg_get);
+
+void ceph_msg_put(struct ceph_msg *msg)
+{
+ dout("%s %p (was %d)\n", __func__, msg,
+ kref_read(&msg->kref));
+ kref_put(&msg->kref, ceph_msg_release);
+}
+EXPORT_SYMBOL(ceph_msg_put);
+
+void ceph_msg_dump(struct ceph_msg *msg)
+{
+ pr_debug("msg_dump %p (front_alloc_len %d length %zd)\n", msg,
+ msg->front_alloc_len, msg->data_length);
+ print_hex_dump(KERN_DEBUG, "header: ",
+ DUMP_PREFIX_OFFSET, 16, 1,
+ &msg->hdr, sizeof(msg->hdr), true);
+ print_hex_dump(KERN_DEBUG, " front: ",
+ DUMP_PREFIX_OFFSET, 16, 1,
+ msg->front.iov_base, msg->front.iov_len, true);
+ if (msg->middle)
+ print_hex_dump(KERN_DEBUG, "middle: ",
+ DUMP_PREFIX_OFFSET, 16, 1,
+ msg->middle->vec.iov_base,
+ msg->middle->vec.iov_len, true);
+ print_hex_dump(KERN_DEBUG, "footer: ",
+ DUMP_PREFIX_OFFSET, 16, 1,
+ &msg->footer, sizeof(msg->footer), true);
+}
+EXPORT_SYMBOL(ceph_msg_dump);
diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c
new file mode 100644
index 000000000000..c9e002d96319
--- /dev/null
+++ b/net/ceph/messenger_v1.c
@@ -0,0 +1,1620 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/bvec.h>
+#include <linux/crc32c.h>
+#include <linux/net.h>
+#include <linux/socket.h>
+#include <net/sock.h>
+
+#include <linux/ceph/ceph_features.h>
+#include <linux/ceph/decode.h>
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/messenger.h>
+
+/* static tag bytes (protocol control messages) */
+static char tag_msg = CEPH_MSGR_TAG_MSG;
+static char tag_ack = CEPH_MSGR_TAG_ACK;
+static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE;
+static char tag_keepalive2 = CEPH_MSGR_TAG_KEEPALIVE2;
+
+/*
+ * If @buf is NULL, discard up to @len bytes.
+ */
+static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len)
+{
+ struct kvec iov = {buf, len};
+ struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
+ int r;
+
+ if (!buf)
+ msg.msg_flags |= MSG_TRUNC;
+
+ iov_iter_kvec(&msg.msg_iter, ITER_DEST, &iov, 1, len);
+ r = sock_recvmsg(sock, &msg, msg.msg_flags);
+ if (r == -EAGAIN)
+ r = 0;
+ return r;
+}
+
+static int ceph_tcp_recvpage(struct socket *sock, struct page *page,
+ int page_offset, size_t length)
+{
+ struct bio_vec bvec;
+ struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
+ int r;
+
+ BUG_ON(page_offset + length > PAGE_SIZE);
+ bvec_set_page(&bvec, page, length, page_offset);
+ iov_iter_bvec(&msg.msg_iter, ITER_DEST, &bvec, 1, length);
+ r = sock_recvmsg(sock, &msg, msg.msg_flags);
+ if (r == -EAGAIN)
+ r = 0;
+ return r;
+}
+
+/*
+ * write something. @more is true if caller will be sending more data
+ * shortly.
+ */
+static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov,
+ size_t kvlen, size_t len, bool more)
+{
+ struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
+ int r;
+
+ if (more)
+ msg.msg_flags |= MSG_MORE;
+ else
+ msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */
+
+ r = kernel_sendmsg(sock, &msg, iov, kvlen, len);
+ if (r == -EAGAIN)
+ r = 0;
+ return r;
+}
+
+/*
+ * @more: MSG_MORE or 0.
+ */
+static int ceph_tcp_sendpage(struct socket *sock, struct page *page,
+ int offset, size_t size, int more)
+{
+ struct msghdr msg = {
+ .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL | more,
+ };
+ struct bio_vec bvec;
+ int ret;
+
+ /*
+ * MSG_SPLICE_PAGES cannot properly handle pages with page_count == 0,
+ * we need to fall back to sendmsg if that's the case.
+ *
+ * Same goes for slab pages: skb_can_coalesce() allows
+ * coalescing neighboring slab objects into a single frag which
+ * triggers one of hardened usercopy checks.
+ */
+ if (sendpage_ok(page))
+ msg.msg_flags |= MSG_SPLICE_PAGES;
+
+ bvec_set_page(&bvec, page, size, offset);
+ iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size);
+
+ ret = sock_sendmsg(sock, &msg);
+ if (ret == -EAGAIN)
+ ret = 0;
+
+ return ret;
+}
+
+static void con_out_kvec_reset(struct ceph_connection *con)
+{
+ BUG_ON(con->v1.out_skip);
+
+ con->v1.out_kvec_left = 0;
+ con->v1.out_kvec_bytes = 0;
+ con->v1.out_kvec_cur = &con->v1.out_kvec[0];
+}
+
+static void con_out_kvec_add(struct ceph_connection *con,
+ size_t size, void *data)
+{
+ int index = con->v1.out_kvec_left;
+
+ BUG_ON(con->v1.out_skip);
+ BUG_ON(index >= ARRAY_SIZE(con->v1.out_kvec));
+
+ con->v1.out_kvec[index].iov_len = size;
+ con->v1.out_kvec[index].iov_base = data;
+ con->v1.out_kvec_left++;
+ con->v1.out_kvec_bytes += size;
+}
+
+/*
+ * Chop off a kvec from the end. Return residual number of bytes for
+ * that kvec, i.e. how many bytes would have been written if the kvec
+ * hadn't been nuked.
+ */
+static int con_out_kvec_skip(struct ceph_connection *con)
+{
+ int skip = 0;
+
+ if (con->v1.out_kvec_bytes > 0) {
+ skip = con->v1.out_kvec_cur[con->v1.out_kvec_left - 1].iov_len;
+ BUG_ON(con->v1.out_kvec_bytes < skip);
+ BUG_ON(!con->v1.out_kvec_left);
+ con->v1.out_kvec_bytes -= skip;
+ con->v1.out_kvec_left--;
+ }
+
+ return skip;
+}
+
+static size_t sizeof_footer(struct ceph_connection *con)
+{
+ return (con->peer_features & CEPH_FEATURE_MSG_AUTH) ?
+ sizeof(struct ceph_msg_footer) :
+ sizeof(struct ceph_msg_footer_old);
+}
+
+static void prepare_message_data(struct ceph_msg *msg, u32 data_len)
+{
+ /* Initialize data cursor if it's not a sparse read */
+ u64 len = msg->sparse_read_total ? : data_len;
+
+ ceph_msg_data_cursor_init(&msg->cursor, msg, len);
+}
+
+/*
+ * Prepare footer for currently outgoing message, and finish things
+ * off. Assumes out_kvec* are already valid.. we just add on to the end.
+ */
+static void prepare_write_message_footer(struct ceph_connection *con,
+ struct ceph_msg *m)
+{
+ m->footer.flags |= CEPH_MSG_FOOTER_COMPLETE;
+
+ dout("prepare_write_message_footer %p\n", con);
+ con_out_kvec_add(con, sizeof_footer(con), &m->footer);
+ if (con->peer_features & CEPH_FEATURE_MSG_AUTH) {
+ if (con->ops->sign_message)
+ con->ops->sign_message(m);
+ else
+ m->footer.sig = 0;
+ } else {
+ m->old_footer.flags = m->footer.flags;
+ }
+ con->v1.out_more = m->more_to_follow;
+ con->v1.out_msg_done = true;
+}
+
+/*
+ * Prepare headers for the next outgoing message.
+ */
+static void prepare_write_message(struct ceph_connection *con,
+ struct ceph_msg *m)
+{
+ u32 crc;
+
+ con_out_kvec_reset(con);
+ con->v1.out_msg_done = false;
+
+ /* Sneak an ack in there first? If we can get it into the same
+ * TCP packet that's a good thing. */
+ if (con->in_seq > con->in_seq_acked) {
+ con->in_seq_acked = con->in_seq;
+ con_out_kvec_add(con, sizeof (tag_ack), &tag_ack);
+ con->v1.out_temp_ack = cpu_to_le64(con->in_seq_acked);
+ con_out_kvec_add(con, sizeof(con->v1.out_temp_ack),
+ &con->v1.out_temp_ack);
+ }
+
+ dout("prepare_write_message %p seq %lld type %d len %d+%d+%zd\n",
+ m, con->out_seq, le16_to_cpu(m->hdr.type),
+ le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len),
+ m->data_length);
+ WARN_ON(m->front.iov_len != le32_to_cpu(m->hdr.front_len));
+ WARN_ON(m->data_length != le32_to_cpu(m->hdr.data_len));
+
+ /* tag + hdr + front + middle */
+ con_out_kvec_add(con, sizeof (tag_msg), &tag_msg);
+ con_out_kvec_add(con, sizeof(con->v1.out_hdr), &con->v1.out_hdr);
+ con_out_kvec_add(con, m->front.iov_len, m->front.iov_base);
+
+ if (m->middle)
+ con_out_kvec_add(con, m->middle->vec.iov_len,
+ m->middle->vec.iov_base);
+
+ /* fill in hdr crc and finalize hdr */
+ crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc));
+ m->hdr.crc = cpu_to_le32(crc);
+ memcpy(&con->v1.out_hdr, &m->hdr, sizeof(con->v1.out_hdr));
+
+ /* fill in front and middle crc, footer */
+ crc = crc32c(0, m->front.iov_base, m->front.iov_len);
+ m->footer.front_crc = cpu_to_le32(crc);
+ if (m->middle) {
+ crc = crc32c(0, m->middle->vec.iov_base,
+ m->middle->vec.iov_len);
+ m->footer.middle_crc = cpu_to_le32(crc);
+ } else
+ m->footer.middle_crc = 0;
+ dout("%s front_crc %u middle_crc %u\n", __func__,
+ le32_to_cpu(m->footer.front_crc),
+ le32_to_cpu(m->footer.middle_crc));
+ m->footer.flags = 0;
+
+ /* is there a data payload? */
+ m->footer.data_crc = 0;
+ if (m->data_length) {
+ prepare_message_data(m, m->data_length);
+ con->v1.out_more = 1; /* data + footer will follow */
+ } else {
+ /* no, queue up footer too and be done */
+ prepare_write_message_footer(con, m);
+ }
+
+ ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
+}
+
+/*
+ * Prepare an ack.
+ */
+static void prepare_write_ack(struct ceph_connection *con)
+{
+ dout("prepare_write_ack %p %llu -> %llu\n", con,
+ con->in_seq_acked, con->in_seq);
+ con->in_seq_acked = con->in_seq;
+
+ con_out_kvec_reset(con);
+
+ con_out_kvec_add(con, sizeof (tag_ack), &tag_ack);
+
+ con->v1.out_temp_ack = cpu_to_le64(con->in_seq_acked);
+ con_out_kvec_add(con, sizeof(con->v1.out_temp_ack),
+ &con->v1.out_temp_ack);
+
+ con->v1.out_more = 1; /* more will follow.. eventually.. */
+ ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
+}
+
+/*
+ * Prepare to share the seq during handshake
+ */
+static void prepare_write_seq(struct ceph_connection *con)
+{
+ dout("prepare_write_seq %p %llu -> %llu\n", con,
+ con->in_seq_acked, con->in_seq);
+ con->in_seq_acked = con->in_seq;
+
+ con_out_kvec_reset(con);
+
+ con->v1.out_temp_ack = cpu_to_le64(con->in_seq_acked);
+ con_out_kvec_add(con, sizeof(con->v1.out_temp_ack),
+ &con->v1.out_temp_ack);
+
+ ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
+}
+
+/*
+ * Prepare to write keepalive byte.
+ */
+static void prepare_write_keepalive(struct ceph_connection *con)
+{
+ dout("prepare_write_keepalive %p\n", con);
+ con_out_kvec_reset(con);
+ if (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2) {
+ struct timespec64 now;
+
+ ktime_get_real_ts64(&now);
+ con_out_kvec_add(con, sizeof(tag_keepalive2), &tag_keepalive2);
+ ceph_encode_timespec64(&con->v1.out_temp_keepalive2, &now);
+ con_out_kvec_add(con, sizeof(con->v1.out_temp_keepalive2),
+ &con->v1.out_temp_keepalive2);
+ } else {
+ con_out_kvec_add(con, sizeof(tag_keepalive), &tag_keepalive);
+ }
+ ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
+}
+
+/*
+ * Connection negotiation.
+ */
+
+static int get_connect_authorizer(struct ceph_connection *con)
+{
+ struct ceph_auth_handshake *auth;
+ int auth_proto;
+
+ if (!con->ops->get_authorizer) {
+ con->v1.auth = NULL;
+ con->v1.out_connect.authorizer_protocol = CEPH_AUTH_UNKNOWN;
+ con->v1.out_connect.authorizer_len = 0;
+ return 0;
+ }
+
+ auth = con->ops->get_authorizer(con, &auth_proto, con->v1.auth_retry);
+ if (IS_ERR(auth))
+ return PTR_ERR(auth);
+
+ con->v1.auth = auth;
+ con->v1.out_connect.authorizer_protocol = cpu_to_le32(auth_proto);
+ con->v1.out_connect.authorizer_len =
+ cpu_to_le32(auth->authorizer_buf_len);
+ return 0;
+}
+
+/*
+ * We connected to a peer and are saying hello.
+ */
+static void prepare_write_banner(struct ceph_connection *con)
+{
+ con_out_kvec_add(con, strlen(CEPH_BANNER), CEPH_BANNER);
+ con_out_kvec_add(con, sizeof (con->msgr->my_enc_addr),
+ &con->msgr->my_enc_addr);
+
+ con->v1.out_more = 0;
+ ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
+}
+
+static void __prepare_write_connect(struct ceph_connection *con)
+{
+ con_out_kvec_add(con, sizeof(con->v1.out_connect),
+ &con->v1.out_connect);
+ if (con->v1.auth)
+ con_out_kvec_add(con, con->v1.auth->authorizer_buf_len,
+ con->v1.auth->authorizer_buf);
+
+ con->v1.out_more = 0;
+ ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
+}
+
+static int prepare_write_connect(struct ceph_connection *con)
+{
+ unsigned int global_seq = ceph_get_global_seq(con->msgr, 0);
+ int proto;
+ int ret;
+
+ switch (con->peer_name.type) {
+ case CEPH_ENTITY_TYPE_MON:
+ proto = CEPH_MONC_PROTOCOL;
+ break;
+ case CEPH_ENTITY_TYPE_OSD:
+ proto = CEPH_OSDC_PROTOCOL;
+ break;
+ case CEPH_ENTITY_TYPE_MDS:
+ proto = CEPH_MDSC_PROTOCOL;
+ break;
+ default:
+ BUG();
+ }
+
+ dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
+ con->v1.connect_seq, global_seq, proto);
+
+ con->v1.out_connect.features =
+ cpu_to_le64(from_msgr(con->msgr)->supported_features);
+ con->v1.out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
+ con->v1.out_connect.connect_seq = cpu_to_le32(con->v1.connect_seq);
+ con->v1.out_connect.global_seq = cpu_to_le32(global_seq);
+ con->v1.out_connect.protocol_version = cpu_to_le32(proto);
+ con->v1.out_connect.flags = 0;
+
+ ret = get_connect_authorizer(con);
+ if (ret)
+ return ret;
+
+ __prepare_write_connect(con);
+ return 0;
+}
+
+/*
+ * write as much of pending kvecs to the socket as we can.
+ * 1 -> done
+ * 0 -> socket full, but more to do
+ * <0 -> error
+ */
+static int write_partial_kvec(struct ceph_connection *con)
+{
+ int ret;
+
+ dout("write_partial_kvec %p %d left\n", con, con->v1.out_kvec_bytes);
+ while (con->v1.out_kvec_bytes > 0) {
+ ret = ceph_tcp_sendmsg(con->sock, con->v1.out_kvec_cur,
+ con->v1.out_kvec_left,
+ con->v1.out_kvec_bytes,
+ con->v1.out_more);
+ if (ret <= 0)
+ goto out;
+ con->v1.out_kvec_bytes -= ret;
+ if (!con->v1.out_kvec_bytes)
+ break; /* done */
+
+ /* account for full iov entries consumed */
+ while (ret >= con->v1.out_kvec_cur->iov_len) {
+ BUG_ON(!con->v1.out_kvec_left);
+ ret -= con->v1.out_kvec_cur->iov_len;
+ con->v1.out_kvec_cur++;
+ con->v1.out_kvec_left--;
+ }
+ /* and for a partially-consumed entry */
+ if (ret) {
+ con->v1.out_kvec_cur->iov_len -= ret;
+ con->v1.out_kvec_cur->iov_base += ret;
+ }
+ }
+ con->v1.out_kvec_left = 0;
+ ret = 1;
+out:
+ dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con,
+ con->v1.out_kvec_bytes, con->v1.out_kvec_left, ret);
+ return ret; /* done! */
+}
+
+/*
+ * Write as much message data payload as we can. If we finish, queue
+ * up the footer.
+ * 1 -> done, footer is now queued in out_kvec[].
+ * 0 -> socket full, but more to do
+ * <0 -> error
+ */
+static int write_partial_message_data(struct ceph_connection *con,
+ struct ceph_msg *msg)
+{
+ struct ceph_msg_data_cursor *cursor = &msg->cursor;
+ bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
+ u32 crc;
+
+ dout("%s %p msg %p\n", __func__, con, msg);
+
+ if (!msg->num_data_items)
+ return -EINVAL;
+
+ /*
+ * Iterate through each page that contains data to be
+ * written, and send as much as possible for each.
+ *
+ * If we are calculating the data crc (the default), we will
+ * need to map the page. If we have no pages, they have
+ * been revoked, so use the zero page.
+ */
+ crc = do_datacrc ? le32_to_cpu(msg->footer.data_crc) : 0;
+ while (cursor->total_resid) {
+ struct page *page;
+ size_t page_offset;
+ size_t length;
+ int ret;
+
+ if (!cursor->resid) {
+ ceph_msg_data_advance(cursor, 0);
+ continue;
+ }
+
+ page = ceph_msg_data_next(cursor, &page_offset, &length);
+ ret = ceph_tcp_sendpage(con->sock, page, page_offset, length,
+ MSG_MORE);
+ if (ret <= 0) {
+ if (do_datacrc)
+ msg->footer.data_crc = cpu_to_le32(crc);
+
+ return ret;
+ }
+ if (do_datacrc && cursor->need_crc)
+ crc = ceph_crc32c_page(crc, page, page_offset, length);
+ ceph_msg_data_advance(cursor, (size_t)ret);
+ }
+
+ dout("%s %p msg %p done\n", __func__, con, msg);
+
+ /* prepare and queue up footer, too */
+ if (do_datacrc)
+ msg->footer.data_crc = cpu_to_le32(crc);
+ else
+ msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC;
+ con_out_kvec_reset(con);
+ prepare_write_message_footer(con, msg);
+
+ return 1; /* must return > 0 to indicate success */
+}
+
+/*
+ * write some zeros
+ */
+static int write_partial_skip(struct ceph_connection *con)
+{
+ int ret;
+
+ dout("%s %p %d left\n", __func__, con, con->v1.out_skip);
+ while (con->v1.out_skip > 0) {
+ size_t size = min(con->v1.out_skip, (int)PAGE_SIZE);
+
+ ret = ceph_tcp_sendpage(con->sock, ceph_zero_page, 0, size,
+ MSG_MORE);
+ if (ret <= 0)
+ goto out;
+ con->v1.out_skip -= ret;
+ }
+ ret = 1;
+out:
+ return ret;
+}
+
+/*
+ * Prepare to read connection handshake, or an ack.
+ */
+static void prepare_read_banner(struct ceph_connection *con)
+{
+ dout("prepare_read_banner %p\n", con);
+ con->v1.in_base_pos = 0;
+}
+
+static void prepare_read_connect(struct ceph_connection *con)
+{
+ dout("prepare_read_connect %p\n", con);
+ con->v1.in_base_pos = 0;
+}
+
+static void prepare_read_ack(struct ceph_connection *con)
+{
+ dout("prepare_read_ack %p\n", con);
+ con->v1.in_base_pos = 0;
+}
+
+static void prepare_read_seq(struct ceph_connection *con)
+{
+ dout("prepare_read_seq %p\n", con);
+ con->v1.in_base_pos = 0;
+ con->v1.in_tag = CEPH_MSGR_TAG_SEQ;
+}
+
+static void prepare_read_tag(struct ceph_connection *con)
+{
+ dout("prepare_read_tag %p\n", con);
+ con->v1.in_base_pos = 0;
+ con->v1.in_tag = CEPH_MSGR_TAG_READY;
+}
+
+static void prepare_read_keepalive_ack(struct ceph_connection *con)
+{
+ dout("prepare_read_keepalive_ack %p\n", con);
+ con->v1.in_base_pos = 0;
+}
+
+/*
+ * Prepare to read a message.
+ */
+static int prepare_read_message(struct ceph_connection *con)
+{
+ dout("prepare_read_message %p\n", con);
+ BUG_ON(con->in_msg != NULL);
+ con->v1.in_base_pos = 0;
+ con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0;
+ return 0;
+}
+
+static int read_partial(struct ceph_connection *con,
+ int end, int size, void *object)
+{
+ while (con->v1.in_base_pos < end) {
+ int left = end - con->v1.in_base_pos;
+ int have = size - left;
+ int ret = ceph_tcp_recvmsg(con->sock, object + have, left);
+ if (ret <= 0)
+ return ret;
+ con->v1.in_base_pos += ret;
+ }
+ return 1;
+}
+
+/*
+ * Read all or part of the connect-side handshake on a new connection
+ */
+static int read_partial_banner(struct ceph_connection *con)
+{
+ int size;
+ int end;
+ int ret;
+
+ dout("read_partial_banner %p at %d\n", con, con->v1.in_base_pos);
+
+ /* peer's banner */
+ size = strlen(CEPH_BANNER);
+ end = size;
+ ret = read_partial(con, end, size, con->v1.in_banner);
+ if (ret <= 0)
+ goto out;
+
+ size = sizeof(con->v1.actual_peer_addr);
+ end += size;
+ ret = read_partial(con, end, size, &con->v1.actual_peer_addr);
+ if (ret <= 0)
+ goto out;
+ ceph_decode_banner_addr(&con->v1.actual_peer_addr);
+
+ size = sizeof(con->v1.peer_addr_for_me);
+ end += size;
+ ret = read_partial(con, end, size, &con->v1.peer_addr_for_me);
+ if (ret <= 0)
+ goto out;
+ ceph_decode_banner_addr(&con->v1.peer_addr_for_me);
+
+out:
+ return ret;
+}
+
+static int read_partial_connect(struct ceph_connection *con)
+{
+ int size;
+ int end;
+ int ret;
+
+ dout("read_partial_connect %p at %d\n", con, con->v1.in_base_pos);
+
+ size = sizeof(con->v1.in_reply);
+ end = size;
+ ret = read_partial(con, end, size, &con->v1.in_reply);
+ if (ret <= 0)
+ goto out;
+
+ if (con->v1.auth) {
+ size = le32_to_cpu(con->v1.in_reply.authorizer_len);
+ if (size > con->v1.auth->authorizer_reply_buf_len) {
+ pr_err("authorizer reply too big: %d > %zu\n", size,
+ con->v1.auth->authorizer_reply_buf_len);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ end += size;
+ ret = read_partial(con, end, size,
+ con->v1.auth->authorizer_reply_buf);
+ if (ret <= 0)
+ goto out;
+ }
+
+ dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n",
+ con, con->v1.in_reply.tag,
+ le32_to_cpu(con->v1.in_reply.connect_seq),
+ le32_to_cpu(con->v1.in_reply.global_seq));
+out:
+ return ret;
+}
+
+/*
+ * Verify the hello banner looks okay.
+ */
+static int verify_hello(struct ceph_connection *con)
+{
+ if (memcmp(con->v1.in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) {
+ pr_err("connect to %s got bad banner\n",
+ ceph_pr_addr(&con->peer_addr));
+ con->error_msg = "protocol error, bad banner";
+ return -1;
+ }
+ return 0;
+}
+
+static int process_banner(struct ceph_connection *con)
+{
+ struct ceph_entity_addr *my_addr = &con->msgr->inst.addr;
+
+ dout("process_banner on %p\n", con);
+
+ if (verify_hello(con) < 0)
+ return -1;
+
+ /*
+ * Make sure the other end is who we wanted. note that the other
+ * end may not yet know their ip address, so if it's 0.0.0.0, give
+ * them the benefit of the doubt.
+ */
+ if (memcmp(&con->peer_addr, &con->v1.actual_peer_addr,
+ sizeof(con->peer_addr)) != 0 &&
+ !(ceph_addr_is_blank(&con->v1.actual_peer_addr) &&
+ con->v1.actual_peer_addr.nonce == con->peer_addr.nonce)) {
+ pr_warn("wrong peer, want %s/%u, got %s/%u\n",
+ ceph_pr_addr(&con->peer_addr),
+ le32_to_cpu(con->peer_addr.nonce),
+ ceph_pr_addr(&con->v1.actual_peer_addr),
+ le32_to_cpu(con->v1.actual_peer_addr.nonce));
+ con->error_msg = "wrong peer at address";
+ return -1;
+ }
+
+ /*
+ * did we learn our address?
+ */
+ if (ceph_addr_is_blank(my_addr)) {
+ memcpy(&my_addr->in_addr,
+ &con->v1.peer_addr_for_me.in_addr,
+ sizeof(con->v1.peer_addr_for_me.in_addr));
+ ceph_addr_set_port(my_addr, 0);
+ ceph_encode_my_addr(con->msgr);
+ dout("process_banner learned my addr is %s\n",
+ ceph_pr_addr(my_addr));
+ }
+
+ return 0;
+}
+
+static int process_connect(struct ceph_connection *con)
+{
+ u64 sup_feat = from_msgr(con->msgr)->supported_features;
+ u64 req_feat = from_msgr(con->msgr)->required_features;
+ u64 server_feat = le64_to_cpu(con->v1.in_reply.features);
+ int ret;
+
+ dout("process_connect on %p tag %d\n", con, con->v1.in_tag);
+
+ if (con->v1.auth) {
+ int len = le32_to_cpu(con->v1.in_reply.authorizer_len);
+
+ /*
+ * Any connection that defines ->get_authorizer()
+ * should also define ->add_authorizer_challenge() and
+ * ->verify_authorizer_reply().
+ *
+ * See get_connect_authorizer().
+ */
+ if (con->v1.in_reply.tag ==
+ CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER) {
+ ret = con->ops->add_authorizer_challenge(
+ con, con->v1.auth->authorizer_reply_buf, len);
+ if (ret < 0)
+ return ret;
+
+ con_out_kvec_reset(con);
+ __prepare_write_connect(con);
+ prepare_read_connect(con);
+ return 0;
+ }
+
+ if (len) {
+ ret = con->ops->verify_authorizer_reply(con);
+ if (ret < 0) {
+ con->error_msg = "bad authorize reply";
+ return ret;
+ }
+ }
+ }
+
+ switch (con->v1.in_reply.tag) {
+ case CEPH_MSGR_TAG_FEATURES:
+ pr_err("%s%lld %s feature set mismatch,"
+ " my %llx < server's %llx, missing %llx\n",
+ ENTITY_NAME(con->peer_name),
+ ceph_pr_addr(&con->peer_addr),
+ sup_feat, server_feat, server_feat & ~sup_feat);
+ con->error_msg = "missing required protocol features";
+ return -1;
+
+ case CEPH_MSGR_TAG_BADPROTOVER:
+ pr_err("%s%lld %s protocol version mismatch,"
+ " my %d != server's %d\n",
+ ENTITY_NAME(con->peer_name),
+ ceph_pr_addr(&con->peer_addr),
+ le32_to_cpu(con->v1.out_connect.protocol_version),
+ le32_to_cpu(con->v1.in_reply.protocol_version));
+ con->error_msg = "protocol version mismatch";
+ return -1;
+
+ case CEPH_MSGR_TAG_BADAUTHORIZER:
+ con->v1.auth_retry++;
+ dout("process_connect %p got BADAUTHORIZER attempt %d\n", con,
+ con->v1.auth_retry);
+ if (con->v1.auth_retry == 2) {
+ con->error_msg = "connect authorization failure";
+ return -1;
+ }
+ con_out_kvec_reset(con);
+ ret = prepare_write_connect(con);
+ if (ret < 0)
+ return ret;
+ prepare_read_connect(con);
+ break;
+
+ case CEPH_MSGR_TAG_RESETSESSION:
+ /*
+ * If we connected with a large connect_seq but the peer
+ * has no record of a session with us (no connection, or
+ * connect_seq == 0), they will send RESETSESION to indicate
+ * that they must have reset their session, and may have
+ * dropped messages.
+ */
+ dout("process_connect got RESET peer seq %u\n",
+ le32_to_cpu(con->v1.in_reply.connect_seq));
+ pr_info("%s%lld %s session reset\n",
+ ENTITY_NAME(con->peer_name),
+ ceph_pr_addr(&con->peer_addr));
+ ceph_con_reset_session(con);
+ con_out_kvec_reset(con);
+ ret = prepare_write_connect(con);
+ if (ret < 0)
+ return ret;
+ prepare_read_connect(con);
+
+ /* Tell ceph about it. */
+ mutex_unlock(&con->mutex);
+ if (con->ops->peer_reset)
+ con->ops->peer_reset(con);
+ mutex_lock(&con->mutex);
+ if (con->state != CEPH_CON_S_V1_CONNECT_MSG)
+ return -EAGAIN;
+ break;
+
+ case CEPH_MSGR_TAG_RETRY_SESSION:
+ /*
+ * If we sent a smaller connect_seq than the peer has, try
+ * again with a larger value.
+ */
+ dout("process_connect got RETRY_SESSION my seq %u, peer %u\n",
+ le32_to_cpu(con->v1.out_connect.connect_seq),
+ le32_to_cpu(con->v1.in_reply.connect_seq));
+ con->v1.connect_seq = le32_to_cpu(con->v1.in_reply.connect_seq);
+ con_out_kvec_reset(con);
+ ret = prepare_write_connect(con);
+ if (ret < 0)
+ return ret;
+ prepare_read_connect(con);
+ break;
+
+ case CEPH_MSGR_TAG_RETRY_GLOBAL:
+ /*
+ * If we sent a smaller global_seq than the peer has, try
+ * again with a larger value.
+ */
+ dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n",
+ con->v1.peer_global_seq,
+ le32_to_cpu(con->v1.in_reply.global_seq));
+ ceph_get_global_seq(con->msgr,
+ le32_to_cpu(con->v1.in_reply.global_seq));
+ con_out_kvec_reset(con);
+ ret = prepare_write_connect(con);
+ if (ret < 0)
+ return ret;
+ prepare_read_connect(con);
+ break;
+
+ case CEPH_MSGR_TAG_SEQ:
+ case CEPH_MSGR_TAG_READY:
+ if (req_feat & ~server_feat) {
+ pr_err("%s%lld %s protocol feature mismatch,"
+ " my required %llx > server's %llx, need %llx\n",
+ ENTITY_NAME(con->peer_name),
+ ceph_pr_addr(&con->peer_addr),
+ req_feat, server_feat, req_feat & ~server_feat);
+ con->error_msg = "missing required protocol features";
+ return -1;
+ }
+
+ WARN_ON(con->state != CEPH_CON_S_V1_CONNECT_MSG);
+ con->state = CEPH_CON_S_OPEN;
+ con->v1.auth_retry = 0; /* we authenticated; clear flag */
+ con->v1.peer_global_seq =
+ le32_to_cpu(con->v1.in_reply.global_seq);
+ con->v1.connect_seq++;
+ con->peer_features = server_feat;
+ dout("process_connect got READY gseq %d cseq %d (%d)\n",
+ con->v1.peer_global_seq,
+ le32_to_cpu(con->v1.in_reply.connect_seq),
+ con->v1.connect_seq);
+ WARN_ON(con->v1.connect_seq !=
+ le32_to_cpu(con->v1.in_reply.connect_seq));
+
+ if (con->v1.in_reply.flags & CEPH_MSG_CONNECT_LOSSY)
+ ceph_con_flag_set(con, CEPH_CON_F_LOSSYTX);
+
+ con->delay = 0; /* reset backoff memory */
+
+ if (con->v1.in_reply.tag == CEPH_MSGR_TAG_SEQ) {
+ prepare_write_seq(con);
+ prepare_read_seq(con);
+ } else {
+ prepare_read_tag(con);
+ }
+ break;
+
+ case CEPH_MSGR_TAG_WAIT:
+ /*
+ * If there is a connection race (we are opening
+ * connections to each other), one of us may just have
+ * to WAIT. This shouldn't happen if we are the
+ * client.
+ */
+ con->error_msg = "protocol error, got WAIT as client";
+ return -1;
+
+ default:
+ con->error_msg = "protocol error, garbage tag during connect";
+ return -1;
+ }
+ return 0;
+}
+
+/*
+ * read (part of) an ack
+ */
+static int read_partial_ack(struct ceph_connection *con)
+{
+ int size = sizeof(con->v1.in_temp_ack);
+ int end = size;
+
+ return read_partial(con, end, size, &con->v1.in_temp_ack);
+}
+
+/*
+ * We can finally discard anything that's been acked.
+ */
+static void process_ack(struct ceph_connection *con)
+{
+ u64 ack = le64_to_cpu(con->v1.in_temp_ack);
+
+ if (con->v1.in_tag == CEPH_MSGR_TAG_ACK)
+ ceph_con_discard_sent(con, ack);
+ else
+ ceph_con_discard_requeued(con, ack);
+
+ prepare_read_tag(con);
+}
+
+static int read_partial_message_chunk(struct ceph_connection *con,
+ struct kvec *section,
+ unsigned int sec_len, u32 *crc)
+{
+ int ret, left;
+
+ BUG_ON(!section);
+
+ while (section->iov_len < sec_len) {
+ BUG_ON(section->iov_base == NULL);
+ left = sec_len - section->iov_len;
+ ret = ceph_tcp_recvmsg(con->sock, (char *)section->iov_base +
+ section->iov_len, left);
+ if (ret <= 0)
+ return ret;
+ section->iov_len += ret;
+ }
+ if (section->iov_len == sec_len)
+ *crc = crc32c(*crc, section->iov_base, section->iov_len);
+
+ return 1;
+}
+
+static inline int read_partial_message_section(struct ceph_connection *con,
+ struct kvec *section,
+ unsigned int sec_len, u32 *crc)
+{
+ *crc = 0;
+ return read_partial_message_chunk(con, section, sec_len, crc);
+}
+
+static int read_partial_sparse_msg_extent(struct ceph_connection *con, u32 *crc)
+{
+ struct ceph_msg_data_cursor *cursor = &con->in_msg->cursor;
+ bool do_bounce = ceph_test_opt(from_msgr(con->msgr), RXBOUNCE);
+
+ if (do_bounce && unlikely(!con->bounce_page)) {
+ con->bounce_page = alloc_page(GFP_NOIO);
+ if (!con->bounce_page) {
+ pr_err("failed to allocate bounce page\n");
+ return -ENOMEM;
+ }
+ }
+
+ while (cursor->sr_resid > 0) {
+ struct page *page, *rpage;
+ size_t off, len;
+ int ret;
+
+ page = ceph_msg_data_next(cursor, &off, &len);
+ rpage = do_bounce ? con->bounce_page : page;
+
+ /* clamp to what remains in extent */
+ len = min_t(int, len, cursor->sr_resid);
+ ret = ceph_tcp_recvpage(con->sock, rpage, (int)off, len);
+ if (ret <= 0)
+ return ret;
+ *crc = ceph_crc32c_page(*crc, rpage, off, ret);
+ ceph_msg_data_advance(cursor, (size_t)ret);
+ cursor->sr_resid -= ret;
+ if (do_bounce)
+ memcpy_page(page, off, rpage, off, ret);
+ }
+ return 1;
+}
+
+static int read_partial_sparse_msg_data(struct ceph_connection *con)
+{
+ struct ceph_msg_data_cursor *cursor = &con->in_msg->cursor;
+ bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
+ u32 crc = 0;
+ int ret = 1;
+
+ if (do_datacrc)
+ crc = con->in_data_crc;
+
+ while (cursor->total_resid) {
+ if (con->v1.in_sr_kvec.iov_base)
+ ret = read_partial_message_chunk(con,
+ &con->v1.in_sr_kvec,
+ con->v1.in_sr_len,
+ &crc);
+ else if (cursor->sr_resid > 0)
+ ret = read_partial_sparse_msg_extent(con, &crc);
+ if (ret <= 0)
+ break;
+
+ memset(&con->v1.in_sr_kvec, 0, sizeof(con->v1.in_sr_kvec));
+ ret = con->ops->sparse_read(con, cursor,
+ (char **)&con->v1.in_sr_kvec.iov_base);
+ if (ret <= 0) {
+ ret = ret ? ret : 1; /* must return > 0 to indicate success */
+ break;
+ }
+ con->v1.in_sr_len = ret;
+ }
+
+ if (do_datacrc)
+ con->in_data_crc = crc;
+
+ return ret;
+}
+
+static int read_partial_msg_data(struct ceph_connection *con)
+{
+ struct ceph_msg_data_cursor *cursor = &con->in_msg->cursor;
+ bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
+ struct page *page;
+ size_t page_offset;
+ size_t length;
+ u32 crc = 0;
+ int ret;
+
+ if (do_datacrc)
+ crc = con->in_data_crc;
+ while (cursor->total_resid) {
+ if (!cursor->resid) {
+ ceph_msg_data_advance(cursor, 0);
+ continue;
+ }
+
+ page = ceph_msg_data_next(cursor, &page_offset, &length);
+ ret = ceph_tcp_recvpage(con->sock, page, page_offset, length);
+ if (ret <= 0) {
+ if (do_datacrc)
+ con->in_data_crc = crc;
+
+ return ret;
+ }
+
+ if (do_datacrc)
+ crc = ceph_crc32c_page(crc, page, page_offset, ret);
+ ceph_msg_data_advance(cursor, (size_t)ret);
+ }
+ if (do_datacrc)
+ con->in_data_crc = crc;
+
+ return 1; /* must return > 0 to indicate success */
+}
+
+static int read_partial_msg_data_bounce(struct ceph_connection *con)
+{
+ struct ceph_msg_data_cursor *cursor = &con->in_msg->cursor;
+ struct page *page;
+ size_t off, len;
+ u32 crc;
+ int ret;
+
+ if (unlikely(!con->bounce_page)) {
+ con->bounce_page = alloc_page(GFP_NOIO);
+ if (!con->bounce_page) {
+ pr_err("failed to allocate bounce page\n");
+ return -ENOMEM;
+ }
+ }
+
+ crc = con->in_data_crc;
+ while (cursor->total_resid) {
+ if (!cursor->resid) {
+ ceph_msg_data_advance(cursor, 0);
+ continue;
+ }
+
+ page = ceph_msg_data_next(cursor, &off, &len);
+ ret = ceph_tcp_recvpage(con->sock, con->bounce_page, 0, len);
+ if (ret <= 0) {
+ con->in_data_crc = crc;
+ return ret;
+ }
+
+ crc = crc32c(crc, page_address(con->bounce_page), ret);
+ memcpy_to_page(page, off, page_address(con->bounce_page), ret);
+
+ ceph_msg_data_advance(cursor, ret);
+ }
+ con->in_data_crc = crc;
+
+ return 1; /* must return > 0 to indicate success */
+}
+
+/*
+ * read (part of) a message.
+ */
+static int read_partial_message(struct ceph_connection *con)
+{
+ struct ceph_msg *m = con->in_msg;
+ int size;
+ int end;
+ int ret;
+ unsigned int front_len, middle_len, data_len;
+ bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
+ bool need_sign = (con->peer_features & CEPH_FEATURE_MSG_AUTH);
+ u64 seq;
+ u32 crc;
+
+ dout("read_partial_message con %p msg %p\n", con, m);
+
+ /* header */
+ size = sizeof(con->v1.in_hdr);
+ end = size;
+ ret = read_partial(con, end, size, &con->v1.in_hdr);
+ if (ret <= 0)
+ return ret;
+
+ crc = crc32c(0, &con->v1.in_hdr, offsetof(struct ceph_msg_header, crc));
+ if (cpu_to_le32(crc) != con->v1.in_hdr.crc) {
+ pr_err("read_partial_message bad hdr crc %u != expected %u\n",
+ crc, con->v1.in_hdr.crc);
+ return -EBADMSG;
+ }
+
+ front_len = le32_to_cpu(con->v1.in_hdr.front_len);
+ if (front_len > CEPH_MSG_MAX_FRONT_LEN)
+ return -EIO;
+ middle_len = le32_to_cpu(con->v1.in_hdr.middle_len);
+ if (middle_len > CEPH_MSG_MAX_MIDDLE_LEN)
+ return -EIO;
+ data_len = le32_to_cpu(con->v1.in_hdr.data_len);
+ if (data_len > CEPH_MSG_MAX_DATA_LEN)
+ return -EIO;
+
+ /* verify seq# */
+ seq = le64_to_cpu(con->v1.in_hdr.seq);
+ if ((s64)seq - (s64)con->in_seq < 1) {
+ pr_info("skipping %s%lld %s seq %lld expected %lld\n",
+ ENTITY_NAME(con->peer_name),
+ ceph_pr_addr(&con->peer_addr),
+ seq, con->in_seq + 1);
+ con->v1.in_base_pos = -front_len - middle_len - data_len -
+ sizeof_footer(con);
+ con->v1.in_tag = CEPH_MSGR_TAG_READY;
+ return 1;
+ } else if ((s64)seq - (s64)con->in_seq > 1) {
+ pr_err("read_partial_message bad seq %lld expected %lld\n",
+ seq, con->in_seq + 1);
+ con->error_msg = "bad message sequence # for incoming message";
+ return -EBADE;
+ }
+
+ /* allocate message? */
+ if (!con->in_msg) {
+ int skip = 0;
+
+ dout("got hdr type %d front %d data %d\n", con->v1.in_hdr.type,
+ front_len, data_len);
+ ret = ceph_con_in_msg_alloc(con, &con->v1.in_hdr, &skip);
+ if (ret < 0)
+ return ret;
+
+ BUG_ON((!con->in_msg) ^ skip);
+ if (skip) {
+ /* skip this message */
+ dout("alloc_msg said skip message\n");
+ con->v1.in_base_pos = -front_len - middle_len -
+ data_len - sizeof_footer(con);
+ con->v1.in_tag = CEPH_MSGR_TAG_READY;
+ con->in_seq++;
+ return 1;
+ }
+
+ BUG_ON(!con->in_msg);
+ BUG_ON(con->in_msg->con != con);
+ m = con->in_msg;
+ m->front.iov_len = 0; /* haven't read it yet */
+ if (m->middle)
+ m->middle->vec.iov_len = 0;
+
+ /* prepare for data payload, if any */
+
+ if (data_len)
+ prepare_message_data(con->in_msg, data_len);
+ }
+
+ /* front */
+ ret = read_partial_message_section(con, &m->front, front_len,
+ &con->in_front_crc);
+ if (ret <= 0)
+ return ret;
+
+ /* middle */
+ if (m->middle) {
+ ret = read_partial_message_section(con, &m->middle->vec,
+ middle_len,
+ &con->in_middle_crc);
+ if (ret <= 0)
+ return ret;
+ }
+
+ /* (page) data */
+ if (data_len) {
+ if (!m->num_data_items)
+ return -EIO;
+
+ if (m->sparse_read_total)
+ ret = read_partial_sparse_msg_data(con);
+ else if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE))
+ ret = read_partial_msg_data_bounce(con);
+ else
+ ret = read_partial_msg_data(con);
+ if (ret <= 0)
+ return ret;
+ }
+
+ /* footer */
+ size = sizeof_footer(con);
+ end += size;
+ ret = read_partial(con, end, size, &m->footer);
+ if (ret <= 0)
+ return ret;
+
+ if (!need_sign) {
+ m->footer.flags = m->old_footer.flags;
+ m->footer.sig = 0;
+ }
+
+ dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n",
+ m, front_len, m->footer.front_crc, middle_len,
+ m->footer.middle_crc, data_len, m->footer.data_crc);
+
+ /* crc ok? */
+ if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) {
+ pr_err("read_partial_message %p front crc %u != exp. %u\n",
+ m, con->in_front_crc, m->footer.front_crc);
+ return -EBADMSG;
+ }
+ if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) {
+ pr_err("read_partial_message %p middle crc %u != exp %u\n",
+ m, con->in_middle_crc, m->footer.middle_crc);
+ return -EBADMSG;
+ }
+ if (do_datacrc &&
+ (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 &&
+ con->in_data_crc != le32_to_cpu(m->footer.data_crc)) {
+ pr_err("read_partial_message %p data crc %u != exp. %u\n", m,
+ con->in_data_crc, le32_to_cpu(m->footer.data_crc));
+ return -EBADMSG;
+ }
+
+ if (need_sign && con->ops->check_message_signature &&
+ con->ops->check_message_signature(m)) {
+ pr_err("read_partial_message %p signature check failed\n", m);
+ return -EBADMSG;
+ }
+
+ return 1; /* done! */
+}
+
+static int read_keepalive_ack(struct ceph_connection *con)
+{
+ struct ceph_timespec ceph_ts;
+ size_t size = sizeof(ceph_ts);
+ int ret = read_partial(con, size, size, &ceph_ts);
+ if (ret <= 0)
+ return ret;
+ ceph_decode_timespec64(&con->last_keepalive_ack, &ceph_ts);
+ prepare_read_tag(con);
+ return 1;
+}
+
+/*
+ * Read what we can from the socket.
+ */
+int ceph_con_v1_try_read(struct ceph_connection *con)
+{
+ int ret = -1;
+
+more:
+ dout("try_read start %p state %d\n", con, con->state);
+ if (con->state != CEPH_CON_S_V1_BANNER &&
+ con->state != CEPH_CON_S_V1_CONNECT_MSG &&
+ con->state != CEPH_CON_S_OPEN)
+ return 0;
+
+ BUG_ON(!con->sock);
+
+ dout("try_read tag %d in_base_pos %d\n", con->v1.in_tag,
+ con->v1.in_base_pos);
+
+ if (con->state == CEPH_CON_S_V1_BANNER) {
+ ret = read_partial_banner(con);
+ if (ret <= 0)
+ goto out;
+ ret = process_banner(con);
+ if (ret < 0)
+ goto out;
+
+ con->state = CEPH_CON_S_V1_CONNECT_MSG;
+
+ /*
+ * Received banner is good, exchange connection info.
+ * Do not reset out_kvec, as sending our banner raced
+ * with receiving peer banner after connect completed.
+ */
+ ret = prepare_write_connect(con);
+ if (ret < 0)
+ goto out;
+ prepare_read_connect(con);
+
+ /* Send connection info before awaiting response */
+ goto out;
+ }
+
+ if (con->state == CEPH_CON_S_V1_CONNECT_MSG) {
+ ret = read_partial_connect(con);
+ if (ret <= 0)
+ goto out;
+ ret = process_connect(con);
+ if (ret < 0)
+ goto out;
+ goto more;
+ }
+
+ WARN_ON(con->state != CEPH_CON_S_OPEN);
+
+ if (con->v1.in_base_pos < 0) {
+ /*
+ * skipping + discarding content.
+ */
+ ret = ceph_tcp_recvmsg(con->sock, NULL, -con->v1.in_base_pos);
+ if (ret <= 0)
+ goto out;
+ dout("skipped %d / %d bytes\n", ret, -con->v1.in_base_pos);
+ con->v1.in_base_pos += ret;
+ if (con->v1.in_base_pos)
+ goto more;
+ }
+ if (con->v1.in_tag == CEPH_MSGR_TAG_READY) {
+ /*
+ * what's next?
+ */
+ ret = ceph_tcp_recvmsg(con->sock, &con->v1.in_tag, 1);
+ if (ret <= 0)
+ goto out;
+ dout("try_read got tag %d\n", con->v1.in_tag);
+ switch (con->v1.in_tag) {
+ case CEPH_MSGR_TAG_MSG:
+ prepare_read_message(con);
+ break;
+ case CEPH_MSGR_TAG_ACK:
+ prepare_read_ack(con);
+ break;
+ case CEPH_MSGR_TAG_KEEPALIVE2_ACK:
+ prepare_read_keepalive_ack(con);
+ break;
+ case CEPH_MSGR_TAG_CLOSE:
+ ceph_con_close_socket(con);
+ con->state = CEPH_CON_S_CLOSED;
+ goto out;
+ default:
+ goto bad_tag;
+ }
+ }
+ if (con->v1.in_tag == CEPH_MSGR_TAG_MSG) {
+ ret = read_partial_message(con);
+ if (ret <= 0) {
+ switch (ret) {
+ case -EBADMSG:
+ con->error_msg = "bad crc/signature";
+ fallthrough;
+ case -EBADE:
+ ret = -EIO;
+ break;
+ case -EIO:
+ con->error_msg = "io error";
+ break;
+ }
+ goto out;
+ }
+ if (con->v1.in_tag == CEPH_MSGR_TAG_READY)
+ goto more;
+ ceph_con_process_message(con);
+ if (con->state == CEPH_CON_S_OPEN)
+ prepare_read_tag(con);
+ goto more;
+ }
+ if (con->v1.in_tag == CEPH_MSGR_TAG_ACK ||
+ con->v1.in_tag == CEPH_MSGR_TAG_SEQ) {
+ /*
+ * the final handshake seq exchange is semantically
+ * equivalent to an ACK
+ */
+ ret = read_partial_ack(con);
+ if (ret <= 0)
+ goto out;
+ process_ack(con);
+ goto more;
+ }
+ if (con->v1.in_tag == CEPH_MSGR_TAG_KEEPALIVE2_ACK) {
+ ret = read_keepalive_ack(con);
+ if (ret <= 0)
+ goto out;
+ goto more;
+ }
+
+out:
+ dout("try_read done on %p ret %d\n", con, ret);
+ return ret;
+
+bad_tag:
+ pr_err("try_read bad tag %d\n", con->v1.in_tag);
+ con->error_msg = "protocol error, garbage tag";
+ ret = -1;
+ goto out;
+}
+
+/*
+ * Write something to the socket. Called in a worker thread when the
+ * socket appears to be writeable and we have something ready to send.
+ */
+int ceph_con_v1_try_write(struct ceph_connection *con)
+{
+ struct ceph_msg *msg;
+ int ret = 1;
+
+ dout("try_write start %p state %d\n", con, con->state);
+ if (con->state != CEPH_CON_S_PREOPEN &&
+ con->state != CEPH_CON_S_V1_BANNER &&
+ con->state != CEPH_CON_S_V1_CONNECT_MSG &&
+ con->state != CEPH_CON_S_OPEN)
+ return 0;
+
+ /* open the socket first? */
+ if (con->state == CEPH_CON_S_PREOPEN) {
+ BUG_ON(con->sock);
+ con->state = CEPH_CON_S_V1_BANNER;
+
+ con_out_kvec_reset(con);
+ prepare_write_banner(con);
+ prepare_read_banner(con);
+
+ BUG_ON(con->in_msg);
+ con->v1.in_tag = CEPH_MSGR_TAG_READY;
+ dout("try_write initiating connect on %p new state %d\n",
+ con, con->state);
+ ret = ceph_tcp_connect(con);
+ if (ret < 0) {
+ con->error_msg = "connect error";
+ goto out;
+ }
+ }
+
+more:
+ dout("try_write out_kvec_bytes %d\n", con->v1.out_kvec_bytes);
+ BUG_ON(!con->sock);
+
+ /* kvec data queued? */
+ if (con->v1.out_kvec_left) {
+ ret = write_partial_kvec(con);
+ if (ret <= 0)
+ goto out;
+ }
+ if (con->v1.out_skip) {
+ ret = write_partial_skip(con);
+ if (ret <= 0)
+ goto out;
+ }
+
+ /* msg pages? */
+ msg = con->out_msg;
+ if (msg) {
+ if (con->v1.out_msg_done) {
+ ceph_msg_put(msg);
+ con->out_msg = NULL; /* we're done with this one */
+ goto do_next;
+ }
+
+ ret = write_partial_message_data(con, msg);
+ if (ret == 1)
+ goto more; /* we need to send the footer, too! */
+ if (ret == 0)
+ goto out;
+ if (ret < 0) {
+ dout("try_write write_partial_message_data err %d\n",
+ ret);
+ goto out;
+ }
+ }
+
+do_next:
+ if (con->state == CEPH_CON_S_OPEN) {
+ if (ceph_con_flag_test_and_clear(con,
+ CEPH_CON_F_KEEPALIVE_PENDING)) {
+ prepare_write_keepalive(con);
+ goto more;
+ }
+ /* is anything else pending? */
+ if ((msg = ceph_con_get_out_msg(con)) != NULL) {
+ prepare_write_message(con, msg);
+ goto more;
+ }
+ if (con->in_seq > con->in_seq_acked) {
+ prepare_write_ack(con);
+ goto more;
+ }
+ }
+
+ /* Nothing to do! */
+ ceph_con_flag_clear(con, CEPH_CON_F_WRITE_PENDING);
+ dout("try_write nothing else to write.\n");
+ ret = 0;
+out:
+ dout("try_write done on %p ret %d\n", con, ret);
+ return ret;
+}
+
+void ceph_con_v1_revoke(struct ceph_connection *con, struct ceph_msg *msg)
+{
+ WARN_ON(con->v1.out_skip);
+ /* footer */
+ if (con->v1.out_msg_done) {
+ con->v1.out_skip += con_out_kvec_skip(con);
+ } else {
+ WARN_ON(!msg->data_length);
+ con->v1.out_skip += sizeof_footer(con);
+ }
+ /* data, middle, front */
+ if (msg->data_length)
+ con->v1.out_skip += msg->cursor.total_resid;
+ if (msg->middle)
+ con->v1.out_skip += con_out_kvec_skip(con);
+ con->v1.out_skip += con_out_kvec_skip(con);
+
+ dout("%s con %p out_kvec_bytes %d out_skip %d\n", __func__, con,
+ con->v1.out_kvec_bytes, con->v1.out_skip);
+}
+
+void ceph_con_v1_revoke_incoming(struct ceph_connection *con)
+{
+ unsigned int front_len = le32_to_cpu(con->v1.in_hdr.front_len);
+ unsigned int middle_len = le32_to_cpu(con->v1.in_hdr.middle_len);
+ unsigned int data_len = le32_to_cpu(con->v1.in_hdr.data_len);
+
+ /* skip rest of message */
+ con->v1.in_base_pos = con->v1.in_base_pos -
+ sizeof(struct ceph_msg_header) -
+ front_len -
+ middle_len -
+ data_len -
+ sizeof(struct ceph_msg_footer);
+
+ con->v1.in_tag = CEPH_MSGR_TAG_READY;
+ con->in_seq++;
+
+ dout("%s con %p in_base_pos %d\n", __func__, con, con->v1.in_base_pos);
+}
+
+bool ceph_con_v1_opened(struct ceph_connection *con)
+{
+ return con->v1.connect_seq;
+}
+
+void ceph_con_v1_reset_session(struct ceph_connection *con)
+{
+ con->v1.connect_seq = 0;
+ con->v1.peer_global_seq = 0;
+}
+
+void ceph_con_v1_reset_protocol(struct ceph_connection *con)
+{
+ con->v1.out_skip = 0;
+}
diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c
new file mode 100644
index 000000000000..833e57849c1d
--- /dev/null
+++ b/net/ceph/messenger_v2.c
@@ -0,0 +1,3804 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Ceph msgr2 protocol implementation
+ *
+ * Copyright (C) 2020 Ilya Dryomov <idryomov@gmail.com>
+ */
+
+#include <linux/ceph/ceph_debug.h>
+
+#include <crypto/aead.h>
+#include <crypto/hash.h>
+#include <crypto/sha2.h>
+#include <crypto/utils.h>
+#include <linux/bvec.h>
+#include <linux/crc32c.h>
+#include <linux/net.h>
+#include <linux/scatterlist.h>
+#include <linux/socket.h>
+#include <linux/sched/mm.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+
+#include <linux/ceph/ceph_features.h>
+#include <linux/ceph/decode.h>
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/messenger.h>
+
+#include "crypto.h" /* for CEPH_KEY_LEN and CEPH_MAX_CON_SECRET_LEN */
+
+#define FRAME_TAG_HELLO 1
+#define FRAME_TAG_AUTH_REQUEST 2
+#define FRAME_TAG_AUTH_BAD_METHOD 3
+#define FRAME_TAG_AUTH_REPLY_MORE 4
+#define FRAME_TAG_AUTH_REQUEST_MORE 5
+#define FRAME_TAG_AUTH_DONE 6
+#define FRAME_TAG_AUTH_SIGNATURE 7
+#define FRAME_TAG_CLIENT_IDENT 8
+#define FRAME_TAG_SERVER_IDENT 9
+#define FRAME_TAG_IDENT_MISSING_FEATURES 10
+#define FRAME_TAG_SESSION_RECONNECT 11
+#define FRAME_TAG_SESSION_RESET 12
+#define FRAME_TAG_SESSION_RETRY 13
+#define FRAME_TAG_SESSION_RETRY_GLOBAL 14
+#define FRAME_TAG_SESSION_RECONNECT_OK 15
+#define FRAME_TAG_WAIT 16
+#define FRAME_TAG_MESSAGE 17
+#define FRAME_TAG_KEEPALIVE2 18
+#define FRAME_TAG_KEEPALIVE2_ACK 19
+#define FRAME_TAG_ACK 20
+
+#define FRAME_LATE_STATUS_ABORTED 0x1
+#define FRAME_LATE_STATUS_COMPLETE 0xe
+#define FRAME_LATE_STATUS_ABORTED_MASK 0xf
+
+#define IN_S_HANDLE_PREAMBLE 1
+#define IN_S_HANDLE_CONTROL 2
+#define IN_S_HANDLE_CONTROL_REMAINDER 3
+#define IN_S_PREPARE_READ_DATA 4
+#define IN_S_PREPARE_READ_DATA_CONT 5
+#define IN_S_PREPARE_READ_ENC_PAGE 6
+#define IN_S_PREPARE_SPARSE_DATA 7
+#define IN_S_PREPARE_SPARSE_DATA_CONT 8
+#define IN_S_HANDLE_EPILOGUE 9
+#define IN_S_FINISH_SKIP 10
+
+#define OUT_S_QUEUE_DATA 1
+#define OUT_S_QUEUE_DATA_CONT 2
+#define OUT_S_QUEUE_ENC_PAGE 3
+#define OUT_S_QUEUE_ZEROS 4
+#define OUT_S_FINISH_MESSAGE 5
+#define OUT_S_GET_NEXT 6
+
+#define CTRL_BODY(p) ((void *)(p) + CEPH_PREAMBLE_LEN)
+#define FRONT_PAD(p) ((void *)(p) + CEPH_EPILOGUE_SECURE_LEN)
+#define MIDDLE_PAD(p) (FRONT_PAD(p) + CEPH_GCM_BLOCK_LEN)
+#define DATA_PAD(p) (MIDDLE_PAD(p) + CEPH_GCM_BLOCK_LEN)
+
+#define CEPH_MSG_FLAGS (MSG_DONTWAIT | MSG_NOSIGNAL)
+
+static int do_recvmsg(struct socket *sock, struct iov_iter *it)
+{
+ struct msghdr msg = { .msg_flags = CEPH_MSG_FLAGS };
+ int ret;
+
+ msg.msg_iter = *it;
+ while (iov_iter_count(it)) {
+ ret = sock_recvmsg(sock, &msg, msg.msg_flags);
+ if (ret <= 0) {
+ if (ret == -EAGAIN)
+ ret = 0;
+ return ret;
+ }
+
+ iov_iter_advance(it, ret);
+ }
+
+ WARN_ON(msg_data_left(&msg));
+ return 1;
+}
+
+/*
+ * Read as much as possible.
+ *
+ * Return:
+ * 1 - done, nothing (else) to read
+ * 0 - socket is empty, need to wait
+ * <0 - error
+ */
+static int ceph_tcp_recv(struct ceph_connection *con)
+{
+ int ret;
+
+ dout("%s con %p %s %zu\n", __func__, con,
+ iov_iter_is_discard(&con->v2.in_iter) ? "discard" : "need",
+ iov_iter_count(&con->v2.in_iter));
+ ret = do_recvmsg(con->sock, &con->v2.in_iter);
+ dout("%s con %p ret %d left %zu\n", __func__, con, ret,
+ iov_iter_count(&con->v2.in_iter));
+ return ret;
+}
+
+static int do_sendmsg(struct socket *sock, struct iov_iter *it)
+{
+ struct msghdr msg = { .msg_flags = CEPH_MSG_FLAGS };
+ int ret;
+
+ msg.msg_iter = *it;
+ while (iov_iter_count(it)) {
+ ret = sock_sendmsg(sock, &msg);
+ if (ret <= 0) {
+ if (ret == -EAGAIN)
+ ret = 0;
+ return ret;
+ }
+
+ iov_iter_advance(it, ret);
+ }
+
+ WARN_ON(msg_data_left(&msg));
+ return 1;
+}
+
+static int do_try_sendpage(struct socket *sock, struct iov_iter *it)
+{
+ struct msghdr msg = { .msg_flags = CEPH_MSG_FLAGS };
+ struct bio_vec bv;
+ int ret;
+
+ if (WARN_ON(!iov_iter_is_bvec(it)))
+ return -EINVAL;
+
+ while (iov_iter_count(it)) {
+ /* iov_iter_iovec() for ITER_BVEC */
+ bvec_set_page(&bv, it->bvec->bv_page,
+ min(iov_iter_count(it),
+ it->bvec->bv_len - it->iov_offset),
+ it->bvec->bv_offset + it->iov_offset);
+
+ /*
+ * MSG_SPLICE_PAGES cannot properly handle pages with
+ * page_count == 0, we need to fall back to sendmsg if
+ * that's the case.
+ *
+ * Same goes for slab pages: skb_can_coalesce() allows
+ * coalescing neighboring slab objects into a single frag
+ * which triggers one of hardened usercopy checks.
+ */
+ if (sendpage_ok(bv.bv_page))
+ msg.msg_flags |= MSG_SPLICE_PAGES;
+ else
+ msg.msg_flags &= ~MSG_SPLICE_PAGES;
+
+ iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bv, 1, bv.bv_len);
+ ret = sock_sendmsg(sock, &msg);
+ if (ret <= 0) {
+ if (ret == -EAGAIN)
+ ret = 0;
+ return ret;
+ }
+
+ iov_iter_advance(it, ret);
+ }
+
+ return 1;
+}
+
+/*
+ * Write as much as possible. The socket is expected to be corked,
+ * so we don't bother with MSG_MORE here.
+ *
+ * Return:
+ * 1 - done, nothing (else) to write
+ * 0 - socket is full, need to wait
+ * <0 - error
+ */
+static int ceph_tcp_send(struct ceph_connection *con)
+{
+ int ret;
+
+ dout("%s con %p have %zu try_sendpage %d\n", __func__, con,
+ iov_iter_count(&con->v2.out_iter), con->v2.out_iter_sendpage);
+ if (con->v2.out_iter_sendpage)
+ ret = do_try_sendpage(con->sock, &con->v2.out_iter);
+ else
+ ret = do_sendmsg(con->sock, &con->v2.out_iter);
+ dout("%s con %p ret %d left %zu\n", __func__, con, ret,
+ iov_iter_count(&con->v2.out_iter));
+ return ret;
+}
+
+static void add_in_kvec(struct ceph_connection *con, void *buf, int len)
+{
+ BUG_ON(con->v2.in_kvec_cnt >= ARRAY_SIZE(con->v2.in_kvecs));
+ WARN_ON(!iov_iter_is_kvec(&con->v2.in_iter));
+
+ con->v2.in_kvecs[con->v2.in_kvec_cnt].iov_base = buf;
+ con->v2.in_kvecs[con->v2.in_kvec_cnt].iov_len = len;
+ con->v2.in_kvec_cnt++;
+
+ con->v2.in_iter.nr_segs++;
+ con->v2.in_iter.count += len;
+}
+
+static void reset_in_kvecs(struct ceph_connection *con)
+{
+ WARN_ON(iov_iter_count(&con->v2.in_iter));
+
+ con->v2.in_kvec_cnt = 0;
+ iov_iter_kvec(&con->v2.in_iter, ITER_DEST, con->v2.in_kvecs, 0, 0);
+}
+
+static void set_in_bvec(struct ceph_connection *con, const struct bio_vec *bv)
+{
+ WARN_ON(iov_iter_count(&con->v2.in_iter));
+
+ con->v2.in_bvec = *bv;
+ iov_iter_bvec(&con->v2.in_iter, ITER_DEST, &con->v2.in_bvec, 1, bv->bv_len);
+}
+
+static void set_in_skip(struct ceph_connection *con, int len)
+{
+ WARN_ON(iov_iter_count(&con->v2.in_iter));
+
+ dout("%s con %p len %d\n", __func__, con, len);
+ iov_iter_discard(&con->v2.in_iter, ITER_DEST, len);
+}
+
+static void add_out_kvec(struct ceph_connection *con, void *buf, int len)
+{
+ BUG_ON(con->v2.out_kvec_cnt >= ARRAY_SIZE(con->v2.out_kvecs));
+ WARN_ON(!iov_iter_is_kvec(&con->v2.out_iter));
+ WARN_ON(con->v2.out_zero);
+
+ con->v2.out_kvecs[con->v2.out_kvec_cnt].iov_base = buf;
+ con->v2.out_kvecs[con->v2.out_kvec_cnt].iov_len = len;
+ con->v2.out_kvec_cnt++;
+
+ con->v2.out_iter.nr_segs++;
+ con->v2.out_iter.count += len;
+}
+
+static void reset_out_kvecs(struct ceph_connection *con)
+{
+ WARN_ON(iov_iter_count(&con->v2.out_iter));
+ WARN_ON(con->v2.out_zero);
+
+ con->v2.out_kvec_cnt = 0;
+
+ iov_iter_kvec(&con->v2.out_iter, ITER_SOURCE, con->v2.out_kvecs, 0, 0);
+ con->v2.out_iter_sendpage = false;
+}
+
+static void set_out_bvec(struct ceph_connection *con, const struct bio_vec *bv,
+ bool zerocopy)
+{
+ WARN_ON(iov_iter_count(&con->v2.out_iter));
+ WARN_ON(con->v2.out_zero);
+
+ con->v2.out_bvec = *bv;
+ con->v2.out_iter_sendpage = zerocopy;
+ iov_iter_bvec(&con->v2.out_iter, ITER_SOURCE, &con->v2.out_bvec, 1,
+ con->v2.out_bvec.bv_len);
+}
+
+static void set_out_bvec_zero(struct ceph_connection *con)
+{
+ WARN_ON(iov_iter_count(&con->v2.out_iter));
+ WARN_ON(!con->v2.out_zero);
+
+ bvec_set_page(&con->v2.out_bvec, ceph_zero_page,
+ min(con->v2.out_zero, (int)PAGE_SIZE), 0);
+ con->v2.out_iter_sendpage = true;
+ iov_iter_bvec(&con->v2.out_iter, ITER_SOURCE, &con->v2.out_bvec, 1,
+ con->v2.out_bvec.bv_len);
+}
+
+static void out_zero_add(struct ceph_connection *con, int len)
+{
+ dout("%s con %p len %d\n", __func__, con, len);
+ con->v2.out_zero += len;
+}
+
+static void *alloc_conn_buf(struct ceph_connection *con, int len)
+{
+ void *buf;
+
+ dout("%s con %p len %d\n", __func__, con, len);
+
+ if (WARN_ON(con->v2.conn_buf_cnt >= ARRAY_SIZE(con->v2.conn_bufs)))
+ return NULL;
+
+ buf = kvmalloc(len, GFP_NOIO);
+ if (!buf)
+ return NULL;
+
+ con->v2.conn_bufs[con->v2.conn_buf_cnt++] = buf;
+ return buf;
+}
+
+static void free_conn_bufs(struct ceph_connection *con)
+{
+ while (con->v2.conn_buf_cnt)
+ kvfree(con->v2.conn_bufs[--con->v2.conn_buf_cnt]);
+}
+
+static void add_in_sign_kvec(struct ceph_connection *con, void *buf, int len)
+{
+ BUG_ON(con->v2.in_sign_kvec_cnt >= ARRAY_SIZE(con->v2.in_sign_kvecs));
+
+ con->v2.in_sign_kvecs[con->v2.in_sign_kvec_cnt].iov_base = buf;
+ con->v2.in_sign_kvecs[con->v2.in_sign_kvec_cnt].iov_len = len;
+ con->v2.in_sign_kvec_cnt++;
+}
+
+static void clear_in_sign_kvecs(struct ceph_connection *con)
+{
+ con->v2.in_sign_kvec_cnt = 0;
+}
+
+static void add_out_sign_kvec(struct ceph_connection *con, void *buf, int len)
+{
+ BUG_ON(con->v2.out_sign_kvec_cnt >= ARRAY_SIZE(con->v2.out_sign_kvecs));
+
+ con->v2.out_sign_kvecs[con->v2.out_sign_kvec_cnt].iov_base = buf;
+ con->v2.out_sign_kvecs[con->v2.out_sign_kvec_cnt].iov_len = len;
+ con->v2.out_sign_kvec_cnt++;
+}
+
+static void clear_out_sign_kvecs(struct ceph_connection *con)
+{
+ con->v2.out_sign_kvec_cnt = 0;
+}
+
+static bool con_secure(struct ceph_connection *con)
+{
+ return con->v2.con_mode == CEPH_CON_MODE_SECURE;
+}
+
+static int front_len(const struct ceph_msg *msg)
+{
+ return le32_to_cpu(msg->hdr.front_len);
+}
+
+static int middle_len(const struct ceph_msg *msg)
+{
+ return le32_to_cpu(msg->hdr.middle_len);
+}
+
+static int data_len(const struct ceph_msg *msg)
+{
+ return le32_to_cpu(msg->hdr.data_len);
+}
+
+static bool need_padding(int len)
+{
+ return !IS_ALIGNED(len, CEPH_GCM_BLOCK_LEN);
+}
+
+static int padded_len(int len)
+{
+ return ALIGN(len, CEPH_GCM_BLOCK_LEN);
+}
+
+static int padding_len(int len)
+{
+ return padded_len(len) - len;
+}
+
+/* preamble + control segment */
+static int head_onwire_len(int ctrl_len, bool secure)
+{
+ int head_len;
+ int rem_len;
+
+ BUG_ON(ctrl_len < 0 || ctrl_len > CEPH_MSG_MAX_CONTROL_LEN);
+
+ if (secure) {
+ head_len = CEPH_PREAMBLE_SECURE_LEN;
+ if (ctrl_len > CEPH_PREAMBLE_INLINE_LEN) {
+ rem_len = ctrl_len - CEPH_PREAMBLE_INLINE_LEN;
+ head_len += padded_len(rem_len) + CEPH_GCM_TAG_LEN;
+ }
+ } else {
+ head_len = CEPH_PREAMBLE_PLAIN_LEN;
+ if (ctrl_len)
+ head_len += ctrl_len + CEPH_CRC_LEN;
+ }
+ return head_len;
+}
+
+/* front, middle and data segments + epilogue */
+static int __tail_onwire_len(int front_len, int middle_len, int data_len,
+ bool secure)
+{
+ BUG_ON(front_len < 0 || front_len > CEPH_MSG_MAX_FRONT_LEN ||
+ middle_len < 0 || middle_len > CEPH_MSG_MAX_MIDDLE_LEN ||
+ data_len < 0 || data_len > CEPH_MSG_MAX_DATA_LEN);
+
+ if (!front_len && !middle_len && !data_len)
+ return 0;
+
+ if (!secure)
+ return front_len + middle_len + data_len +
+ CEPH_EPILOGUE_PLAIN_LEN;
+
+ return padded_len(front_len) + padded_len(middle_len) +
+ padded_len(data_len) + CEPH_EPILOGUE_SECURE_LEN;
+}
+
+static int tail_onwire_len(const struct ceph_msg *msg, bool secure)
+{
+ return __tail_onwire_len(front_len(msg), middle_len(msg),
+ data_len(msg), secure);
+}
+
+/* head_onwire_len(sizeof(struct ceph_msg_header2), false) */
+#define MESSAGE_HEAD_PLAIN_LEN (CEPH_PREAMBLE_PLAIN_LEN + \
+ sizeof(struct ceph_msg_header2) + \
+ CEPH_CRC_LEN)
+
+static const int frame_aligns[] = {
+ sizeof(void *),
+ sizeof(void *),
+ sizeof(void *),
+ PAGE_SIZE
+};
+
+/*
+ * Discards trailing empty segments, unless there is just one segment.
+ * A frame always has at least one (possibly empty) segment.
+ */
+static int calc_segment_count(const int *lens, int len_cnt)
+{
+ int i;
+
+ for (i = len_cnt - 1; i >= 0; i--) {
+ if (lens[i])
+ return i + 1;
+ }
+
+ return 1;
+}
+
+static void init_frame_desc(struct ceph_frame_desc *desc, int tag,
+ const int *lens, int len_cnt)
+{
+ int i;
+
+ memset(desc, 0, sizeof(*desc));
+
+ desc->fd_tag = tag;
+ desc->fd_seg_cnt = calc_segment_count(lens, len_cnt);
+ BUG_ON(desc->fd_seg_cnt > CEPH_FRAME_MAX_SEGMENT_COUNT);
+ for (i = 0; i < desc->fd_seg_cnt; i++) {
+ desc->fd_lens[i] = lens[i];
+ desc->fd_aligns[i] = frame_aligns[i];
+ }
+}
+
+/*
+ * Preamble crc covers everything up to itself (28 bytes) and
+ * is calculated and verified irrespective of the connection mode
+ * (i.e. even if the frame is encrypted).
+ */
+static void encode_preamble(const struct ceph_frame_desc *desc, void *p)
+{
+ void *crcp = p + CEPH_PREAMBLE_LEN - CEPH_CRC_LEN;
+ void *start = p;
+ int i;
+
+ memset(p, 0, CEPH_PREAMBLE_LEN);
+
+ ceph_encode_8(&p, desc->fd_tag);
+ ceph_encode_8(&p, desc->fd_seg_cnt);
+ for (i = 0; i < desc->fd_seg_cnt; i++) {
+ ceph_encode_32(&p, desc->fd_lens[i]);
+ ceph_encode_16(&p, desc->fd_aligns[i]);
+ }
+
+ put_unaligned_le32(crc32c(0, start, crcp - start), crcp);
+}
+
+static int decode_preamble(void *p, struct ceph_frame_desc *desc)
+{
+ void *crcp = p + CEPH_PREAMBLE_LEN - CEPH_CRC_LEN;
+ u32 crc, expected_crc;
+ int i;
+
+ crc = crc32c(0, p, crcp - p);
+ expected_crc = get_unaligned_le32(crcp);
+ if (crc != expected_crc) {
+ pr_err("bad preamble crc, calculated %u, expected %u\n",
+ crc, expected_crc);
+ return -EBADMSG;
+ }
+
+ memset(desc, 0, sizeof(*desc));
+
+ desc->fd_tag = ceph_decode_8(&p);
+ desc->fd_seg_cnt = ceph_decode_8(&p);
+ if (desc->fd_seg_cnt < 1 ||
+ desc->fd_seg_cnt > CEPH_FRAME_MAX_SEGMENT_COUNT) {
+ pr_err("bad segment count %d\n", desc->fd_seg_cnt);
+ return -EINVAL;
+ }
+ for (i = 0; i < desc->fd_seg_cnt; i++) {
+ desc->fd_lens[i] = ceph_decode_32(&p);
+ desc->fd_aligns[i] = ceph_decode_16(&p);
+ }
+
+ if (desc->fd_lens[0] < 0 ||
+ desc->fd_lens[0] > CEPH_MSG_MAX_CONTROL_LEN) {
+ pr_err("bad control segment length %d\n", desc->fd_lens[0]);
+ return -EINVAL;
+ }
+ if (desc->fd_lens[1] < 0 ||
+ desc->fd_lens[1] > CEPH_MSG_MAX_FRONT_LEN) {
+ pr_err("bad front segment length %d\n", desc->fd_lens[1]);
+ return -EINVAL;
+ }
+ if (desc->fd_lens[2] < 0 ||
+ desc->fd_lens[2] > CEPH_MSG_MAX_MIDDLE_LEN) {
+ pr_err("bad middle segment length %d\n", desc->fd_lens[2]);
+ return -EINVAL;
+ }
+ if (desc->fd_lens[3] < 0 ||
+ desc->fd_lens[3] > CEPH_MSG_MAX_DATA_LEN) {
+ pr_err("bad data segment length %d\n", desc->fd_lens[3]);
+ return -EINVAL;
+ }
+
+ /*
+ * This would fire for FRAME_TAG_WAIT (it has one empty
+ * segment), but we should never get it as client.
+ */
+ if (!desc->fd_lens[desc->fd_seg_cnt - 1]) {
+ pr_err("last segment empty, segment count %d\n",
+ desc->fd_seg_cnt);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static void encode_epilogue_plain(struct ceph_connection *con, bool aborted)
+{
+ con->v2.out_epil.late_status = aborted ? FRAME_LATE_STATUS_ABORTED :
+ FRAME_LATE_STATUS_COMPLETE;
+ cpu_to_le32s(&con->v2.out_epil.front_crc);
+ cpu_to_le32s(&con->v2.out_epil.middle_crc);
+ cpu_to_le32s(&con->v2.out_epil.data_crc);
+}
+
+static void encode_epilogue_secure(struct ceph_connection *con, bool aborted)
+{
+ memset(&con->v2.out_epil, 0, sizeof(con->v2.out_epil));
+ con->v2.out_epil.late_status = aborted ? FRAME_LATE_STATUS_ABORTED :
+ FRAME_LATE_STATUS_COMPLETE;
+}
+
+static int decode_epilogue(void *p, u32 *front_crc, u32 *middle_crc,
+ u32 *data_crc)
+{
+ u8 late_status;
+
+ late_status = ceph_decode_8(&p);
+ if ((late_status & FRAME_LATE_STATUS_ABORTED_MASK) !=
+ FRAME_LATE_STATUS_COMPLETE) {
+ /* we should never get an aborted message as client */
+ pr_err("bad late_status 0x%x\n", late_status);
+ return -EINVAL;
+ }
+
+ if (front_crc && middle_crc && data_crc) {
+ *front_crc = ceph_decode_32(&p);
+ *middle_crc = ceph_decode_32(&p);
+ *data_crc = ceph_decode_32(&p);
+ }
+
+ return 0;
+}
+
+static void fill_header(struct ceph_msg_header *hdr,
+ const struct ceph_msg_header2 *hdr2,
+ int front_len, int middle_len, int data_len,
+ const struct ceph_entity_name *peer_name)
+{
+ hdr->seq = hdr2->seq;
+ hdr->tid = hdr2->tid;
+ hdr->type = hdr2->type;
+ hdr->priority = hdr2->priority;
+ hdr->version = hdr2->version;
+ hdr->front_len = cpu_to_le32(front_len);
+ hdr->middle_len = cpu_to_le32(middle_len);
+ hdr->data_len = cpu_to_le32(data_len);
+ hdr->data_off = hdr2->data_off;
+ hdr->src = *peer_name;
+ hdr->compat_version = hdr2->compat_version;
+ hdr->reserved = 0;
+ hdr->crc = 0;
+}
+
+static void fill_header2(struct ceph_msg_header2 *hdr2,
+ const struct ceph_msg_header *hdr, u64 ack_seq)
+{
+ hdr2->seq = hdr->seq;
+ hdr2->tid = hdr->tid;
+ hdr2->type = hdr->type;
+ hdr2->priority = hdr->priority;
+ hdr2->version = hdr->version;
+ hdr2->data_pre_padding_len = 0;
+ hdr2->data_off = hdr->data_off;
+ hdr2->ack_seq = cpu_to_le64(ack_seq);
+ hdr2->flags = 0;
+ hdr2->compat_version = hdr->compat_version;
+ hdr2->reserved = 0;
+}
+
+static int verify_control_crc(struct ceph_connection *con)
+{
+ int ctrl_len = con->v2.in_desc.fd_lens[0];
+ u32 crc, expected_crc;
+
+ WARN_ON(con->v2.in_kvecs[0].iov_len != ctrl_len);
+ WARN_ON(con->v2.in_kvecs[1].iov_len != CEPH_CRC_LEN);
+
+ crc = crc32c(-1, con->v2.in_kvecs[0].iov_base, ctrl_len);
+ expected_crc = get_unaligned_le32(con->v2.in_kvecs[1].iov_base);
+ if (crc != expected_crc) {
+ pr_err("bad control crc, calculated %u, expected %u\n",
+ crc, expected_crc);
+ return -EBADMSG;
+ }
+
+ return 0;
+}
+
+static int verify_epilogue_crcs(struct ceph_connection *con, u32 front_crc,
+ u32 middle_crc, u32 data_crc)
+{
+ if (front_len(con->in_msg)) {
+ con->in_front_crc = crc32c(-1, con->in_msg->front.iov_base,
+ front_len(con->in_msg));
+ } else {
+ WARN_ON(!middle_len(con->in_msg) && !data_len(con->in_msg));
+ con->in_front_crc = -1;
+ }
+
+ if (middle_len(con->in_msg))
+ con->in_middle_crc = crc32c(-1,
+ con->in_msg->middle->vec.iov_base,
+ middle_len(con->in_msg));
+ else if (data_len(con->in_msg))
+ con->in_middle_crc = -1;
+ else
+ con->in_middle_crc = 0;
+
+ if (!data_len(con->in_msg))
+ con->in_data_crc = 0;
+
+ dout("%s con %p msg %p crcs %u %u %u\n", __func__, con, con->in_msg,
+ con->in_front_crc, con->in_middle_crc, con->in_data_crc);
+
+ if (con->in_front_crc != front_crc) {
+ pr_err("bad front crc, calculated %u, expected %u\n",
+ con->in_front_crc, front_crc);
+ return -EBADMSG;
+ }
+ if (con->in_middle_crc != middle_crc) {
+ pr_err("bad middle crc, calculated %u, expected %u\n",
+ con->in_middle_crc, middle_crc);
+ return -EBADMSG;
+ }
+ if (con->in_data_crc != data_crc) {
+ pr_err("bad data crc, calculated %u, expected %u\n",
+ con->in_data_crc, data_crc);
+ return -EBADMSG;
+ }
+
+ return 0;
+}
+
+static int setup_crypto(struct ceph_connection *con,
+ const u8 *session_key, int session_key_len,
+ const u8 *con_secret, int con_secret_len)
+{
+ unsigned int noio_flag;
+ int ret;
+
+ dout("%s con %p con_mode %d session_key_len %d con_secret_len %d\n",
+ __func__, con, con->v2.con_mode, session_key_len, con_secret_len);
+ WARN_ON(con->v2.hmac_key_set || con->v2.gcm_tfm || con->v2.gcm_req);
+
+ if (con->v2.con_mode != CEPH_CON_MODE_CRC &&
+ con->v2.con_mode != CEPH_CON_MODE_SECURE) {
+ pr_err("bad con_mode %d\n", con->v2.con_mode);
+ return -EINVAL;
+ }
+
+ if (!session_key_len) {
+ WARN_ON(con->v2.con_mode != CEPH_CON_MODE_CRC);
+ WARN_ON(con_secret_len);
+ return 0; /* auth_none */
+ }
+
+ hmac_sha256_preparekey(&con->v2.hmac_key, session_key, session_key_len);
+ con->v2.hmac_key_set = true;
+
+ if (con->v2.con_mode == CEPH_CON_MODE_CRC) {
+ WARN_ON(con_secret_len);
+ return 0; /* auth_x, plain mode */
+ }
+
+ if (con_secret_len < CEPH_GCM_KEY_LEN + 2 * CEPH_GCM_IV_LEN) {
+ pr_err("con_secret too small %d\n", con_secret_len);
+ return -EINVAL;
+ }
+
+ noio_flag = memalloc_noio_save();
+ con->v2.gcm_tfm = crypto_alloc_aead("gcm(aes)", 0, 0);
+ memalloc_noio_restore(noio_flag);
+ if (IS_ERR(con->v2.gcm_tfm)) {
+ ret = PTR_ERR(con->v2.gcm_tfm);
+ con->v2.gcm_tfm = NULL;
+ pr_err("failed to allocate gcm tfm context: %d\n", ret);
+ return ret;
+ }
+
+ WARN_ON((unsigned long)con_secret &
+ crypto_aead_alignmask(con->v2.gcm_tfm));
+ ret = crypto_aead_setkey(con->v2.gcm_tfm, con_secret, CEPH_GCM_KEY_LEN);
+ if (ret) {
+ pr_err("failed to set gcm key: %d\n", ret);
+ return ret;
+ }
+
+ WARN_ON(crypto_aead_ivsize(con->v2.gcm_tfm) != CEPH_GCM_IV_LEN);
+ ret = crypto_aead_setauthsize(con->v2.gcm_tfm, CEPH_GCM_TAG_LEN);
+ if (ret) {
+ pr_err("failed to set gcm tag size: %d\n", ret);
+ return ret;
+ }
+
+ con->v2.gcm_req = aead_request_alloc(con->v2.gcm_tfm, GFP_NOIO);
+ if (!con->v2.gcm_req) {
+ pr_err("failed to allocate gcm request\n");
+ return -ENOMEM;
+ }
+
+ crypto_init_wait(&con->v2.gcm_wait);
+ aead_request_set_callback(con->v2.gcm_req, CRYPTO_TFM_REQ_MAY_BACKLOG,
+ crypto_req_done, &con->v2.gcm_wait);
+
+ memcpy(&con->v2.in_gcm_nonce, con_secret + CEPH_GCM_KEY_LEN,
+ CEPH_GCM_IV_LEN);
+ memcpy(&con->v2.out_gcm_nonce,
+ con_secret + CEPH_GCM_KEY_LEN + CEPH_GCM_IV_LEN,
+ CEPH_GCM_IV_LEN);
+ return 0; /* auth_x, secure mode */
+}
+
+static void ceph_hmac_sha256(struct ceph_connection *con,
+ const struct kvec *kvecs, int kvec_cnt,
+ u8 hmac[SHA256_DIGEST_SIZE])
+{
+ struct hmac_sha256_ctx ctx;
+ int i;
+
+ dout("%s con %p hmac_key_set %d kvec_cnt %d\n", __func__, con,
+ con->v2.hmac_key_set, kvec_cnt);
+
+ if (!con->v2.hmac_key_set) {
+ memset(hmac, 0, SHA256_DIGEST_SIZE);
+ return; /* auth_none */
+ }
+
+ /* auth_x, both plain and secure modes */
+ hmac_sha256_init(&ctx, &con->v2.hmac_key);
+ for (i = 0; i < kvec_cnt; i++)
+ hmac_sha256_update(&ctx, kvecs[i].iov_base, kvecs[i].iov_len);
+ hmac_sha256_final(&ctx, hmac);
+}
+
+static void gcm_inc_nonce(struct ceph_gcm_nonce *nonce)
+{
+ u64 counter;
+
+ counter = le64_to_cpu(nonce->counter);
+ nonce->counter = cpu_to_le64(counter + 1);
+}
+
+static int gcm_crypt(struct ceph_connection *con, bool encrypt,
+ struct scatterlist *src, struct scatterlist *dst,
+ int src_len)
+{
+ struct ceph_gcm_nonce *nonce;
+ int ret;
+
+ nonce = encrypt ? &con->v2.out_gcm_nonce : &con->v2.in_gcm_nonce;
+
+ aead_request_set_ad(con->v2.gcm_req, 0); /* no AAD */
+ aead_request_set_crypt(con->v2.gcm_req, src, dst, src_len, (u8 *)nonce);
+ ret = crypto_wait_req(encrypt ? crypto_aead_encrypt(con->v2.gcm_req) :
+ crypto_aead_decrypt(con->v2.gcm_req),
+ &con->v2.gcm_wait);
+ if (ret)
+ return ret;
+
+ gcm_inc_nonce(nonce);
+ return 0;
+}
+
+static void get_bvec_at(struct ceph_msg_data_cursor *cursor,
+ struct bio_vec *bv)
+{
+ struct page *page;
+ size_t off, len;
+
+ WARN_ON(!cursor->total_resid);
+
+ /* skip zero-length data items */
+ while (!cursor->resid)
+ ceph_msg_data_advance(cursor, 0);
+
+ /* get a piece of data, cursor isn't advanced */
+ page = ceph_msg_data_next(cursor, &off, &len);
+ bvec_set_page(bv, page, len, off);
+}
+
+static int calc_sg_cnt(void *buf, int buf_len)
+{
+ int sg_cnt;
+
+ if (!buf_len)
+ return 0;
+
+ sg_cnt = need_padding(buf_len) ? 1 : 0;
+ if (is_vmalloc_addr(buf)) {
+ WARN_ON(offset_in_page(buf));
+ sg_cnt += PAGE_ALIGN(buf_len) >> PAGE_SHIFT;
+ } else {
+ sg_cnt++;
+ }
+
+ return sg_cnt;
+}
+
+static int calc_sg_cnt_cursor(struct ceph_msg_data_cursor *cursor)
+{
+ int data_len = cursor->total_resid;
+ struct bio_vec bv;
+ int sg_cnt;
+
+ if (!data_len)
+ return 0;
+
+ sg_cnt = need_padding(data_len) ? 1 : 0;
+ do {
+ get_bvec_at(cursor, &bv);
+ sg_cnt++;
+
+ ceph_msg_data_advance(cursor, bv.bv_len);
+ } while (cursor->total_resid);
+
+ return sg_cnt;
+}
+
+static void init_sgs(struct scatterlist **sg, void *buf, int buf_len, u8 *pad)
+{
+ void *end = buf + buf_len;
+ struct page *page;
+ int len;
+ void *p;
+
+ if (!buf_len)
+ return;
+
+ if (is_vmalloc_addr(buf)) {
+ p = buf;
+ do {
+ page = vmalloc_to_page(p);
+ len = min_t(int, end - p, PAGE_SIZE);
+ WARN_ON(!page || !len || offset_in_page(p));
+ sg_set_page(*sg, page, len, 0);
+ *sg = sg_next(*sg);
+ p += len;
+ } while (p != end);
+ } else {
+ sg_set_buf(*sg, buf, buf_len);
+ *sg = sg_next(*sg);
+ }
+
+ if (need_padding(buf_len)) {
+ sg_set_buf(*sg, pad, padding_len(buf_len));
+ *sg = sg_next(*sg);
+ }
+}
+
+static void init_sgs_cursor(struct scatterlist **sg,
+ struct ceph_msg_data_cursor *cursor, u8 *pad)
+{
+ int data_len = cursor->total_resid;
+ struct bio_vec bv;
+
+ if (!data_len)
+ return;
+
+ do {
+ get_bvec_at(cursor, &bv);
+ sg_set_page(*sg, bv.bv_page, bv.bv_len, bv.bv_offset);
+ *sg = sg_next(*sg);
+
+ ceph_msg_data_advance(cursor, bv.bv_len);
+ } while (cursor->total_resid);
+
+ if (need_padding(data_len)) {
+ sg_set_buf(*sg, pad, padding_len(data_len));
+ *sg = sg_next(*sg);
+ }
+}
+
+/**
+ * init_sgs_pages: set up scatterlist on an array of page pointers
+ * @sg: scatterlist to populate
+ * @pages: pointer to page array
+ * @dpos: position in the array to start (bytes)
+ * @dlen: len to add to sg (bytes)
+ * @pad: pointer to pad destination (if any)
+ *
+ * Populate the scatterlist from the page array, starting at an arbitrary
+ * byte in the array and running for a specified length.
+ */
+static void init_sgs_pages(struct scatterlist **sg, struct page **pages,
+ int dpos, int dlen, u8 *pad)
+{
+ int idx = dpos >> PAGE_SHIFT;
+ int off = offset_in_page(dpos);
+ int resid = dlen;
+
+ do {
+ int len = min(resid, (int)PAGE_SIZE - off);
+
+ sg_set_page(*sg, pages[idx], len, off);
+ *sg = sg_next(*sg);
+ off = 0;
+ ++idx;
+ resid -= len;
+ } while (resid);
+
+ if (need_padding(dlen)) {
+ sg_set_buf(*sg, pad, padding_len(dlen));
+ *sg = sg_next(*sg);
+ }
+}
+
+static int setup_message_sgs(struct sg_table *sgt, struct ceph_msg *msg,
+ u8 *front_pad, u8 *middle_pad, u8 *data_pad,
+ void *epilogue, struct page **pages, int dpos,
+ bool add_tag)
+{
+ struct ceph_msg_data_cursor cursor;
+ struct scatterlist *cur_sg;
+ int dlen = data_len(msg);
+ int sg_cnt;
+ int ret;
+
+ if (!front_len(msg) && !middle_len(msg) && !data_len(msg))
+ return 0;
+
+ sg_cnt = 1; /* epilogue + [auth tag] */
+ if (front_len(msg))
+ sg_cnt += calc_sg_cnt(msg->front.iov_base,
+ front_len(msg));
+ if (middle_len(msg))
+ sg_cnt += calc_sg_cnt(msg->middle->vec.iov_base,
+ middle_len(msg));
+ if (dlen) {
+ if (pages) {
+ sg_cnt += calc_pages_for(dpos, dlen);
+ if (need_padding(dlen))
+ sg_cnt++;
+ } else {
+ ceph_msg_data_cursor_init(&cursor, msg, dlen);
+ sg_cnt += calc_sg_cnt_cursor(&cursor);
+ }
+ }
+
+ ret = sg_alloc_table(sgt, sg_cnt, GFP_NOIO);
+ if (ret)
+ return ret;
+
+ cur_sg = sgt->sgl;
+ if (front_len(msg))
+ init_sgs(&cur_sg, msg->front.iov_base, front_len(msg),
+ front_pad);
+ if (middle_len(msg))
+ init_sgs(&cur_sg, msg->middle->vec.iov_base, middle_len(msg),
+ middle_pad);
+ if (dlen) {
+ if (pages) {
+ init_sgs_pages(&cur_sg, pages, dpos, dlen, data_pad);
+ } else {
+ ceph_msg_data_cursor_init(&cursor, msg, dlen);
+ init_sgs_cursor(&cur_sg, &cursor, data_pad);
+ }
+ }
+
+ WARN_ON(!sg_is_last(cur_sg));
+ sg_set_buf(cur_sg, epilogue,
+ CEPH_GCM_BLOCK_LEN + (add_tag ? CEPH_GCM_TAG_LEN : 0));
+ return 0;
+}
+
+static int decrypt_preamble(struct ceph_connection *con)
+{
+ struct scatterlist sg;
+
+ sg_init_one(&sg, con->v2.in_buf, CEPH_PREAMBLE_SECURE_LEN);
+ return gcm_crypt(con, false, &sg, &sg, CEPH_PREAMBLE_SECURE_LEN);
+}
+
+static int decrypt_control_remainder(struct ceph_connection *con)
+{
+ int ctrl_len = con->v2.in_desc.fd_lens[0];
+ int rem_len = ctrl_len - CEPH_PREAMBLE_INLINE_LEN;
+ int pt_len = padding_len(rem_len) + CEPH_GCM_TAG_LEN;
+ struct scatterlist sgs[2];
+
+ WARN_ON(con->v2.in_kvecs[0].iov_len != rem_len);
+ WARN_ON(con->v2.in_kvecs[1].iov_len != pt_len);
+
+ sg_init_table(sgs, 2);
+ sg_set_buf(&sgs[0], con->v2.in_kvecs[0].iov_base, rem_len);
+ sg_set_buf(&sgs[1], con->v2.in_buf, pt_len);
+
+ return gcm_crypt(con, false, sgs, sgs,
+ padded_len(rem_len) + CEPH_GCM_TAG_LEN);
+}
+
+/* Process sparse read data that lives in a buffer */
+static int process_v2_sparse_read(struct ceph_connection *con,
+ struct page **pages, int spos)
+{
+ struct ceph_msg_data_cursor cursor;
+ int ret;
+
+ ceph_msg_data_cursor_init(&cursor, con->in_msg,
+ con->in_msg->sparse_read_total);
+
+ for (;;) {
+ char *buf = NULL;
+
+ ret = con->ops->sparse_read(con, &cursor, &buf);
+ if (ret <= 0)
+ return ret;
+
+ dout("%s: sparse_read return %x buf %p\n", __func__, ret, buf);
+
+ do {
+ int idx = spos >> PAGE_SHIFT;
+ int soff = offset_in_page(spos);
+ struct page *spage = con->v2.in_enc_pages[idx];
+ int len = min_t(int, ret, PAGE_SIZE - soff);
+
+ if (buf) {
+ memcpy_from_page(buf, spage, soff, len);
+ buf += len;
+ } else {
+ struct bio_vec bv;
+
+ get_bvec_at(&cursor, &bv);
+ len = min_t(int, len, bv.bv_len);
+ memcpy_page(bv.bv_page, bv.bv_offset,
+ spage, soff, len);
+ ceph_msg_data_advance(&cursor, len);
+ }
+ spos += len;
+ ret -= len;
+ } while (ret);
+ }
+}
+
+static int decrypt_tail(struct ceph_connection *con)
+{
+ struct sg_table enc_sgt = {};
+ struct sg_table sgt = {};
+ struct page **pages = NULL;
+ bool sparse = !!con->in_msg->sparse_read_total;
+ int dpos = 0;
+ int tail_len;
+ int ret;
+
+ tail_len = tail_onwire_len(con->in_msg, true);
+ ret = sg_alloc_table_from_pages(&enc_sgt, con->v2.in_enc_pages,
+ con->v2.in_enc_page_cnt, 0, tail_len,
+ GFP_NOIO);
+ if (ret)
+ goto out;
+
+ if (sparse) {
+ dpos = padded_len(front_len(con->in_msg) + padded_len(middle_len(con->in_msg)));
+ pages = con->v2.in_enc_pages;
+ }
+
+ ret = setup_message_sgs(&sgt, con->in_msg, FRONT_PAD(con->v2.in_buf),
+ MIDDLE_PAD(con->v2.in_buf), DATA_PAD(con->v2.in_buf),
+ con->v2.in_buf, pages, dpos, true);
+ if (ret)
+ goto out;
+
+ dout("%s con %p msg %p enc_page_cnt %d sg_cnt %d\n", __func__, con,
+ con->in_msg, con->v2.in_enc_page_cnt, sgt.orig_nents);
+ ret = gcm_crypt(con, false, enc_sgt.sgl, sgt.sgl, tail_len);
+ if (ret)
+ goto out;
+
+ if (sparse && data_len(con->in_msg)) {
+ ret = process_v2_sparse_read(con, con->v2.in_enc_pages, dpos);
+ if (ret)
+ goto out;
+ }
+
+ WARN_ON(!con->v2.in_enc_page_cnt);
+ ceph_release_page_vector(con->v2.in_enc_pages,
+ con->v2.in_enc_page_cnt);
+ con->v2.in_enc_pages = NULL;
+ con->v2.in_enc_page_cnt = 0;
+
+out:
+ sg_free_table(&sgt);
+ sg_free_table(&enc_sgt);
+ return ret;
+}
+
+static int prepare_banner(struct ceph_connection *con)
+{
+ int buf_len = CEPH_BANNER_V2_LEN + 2 + 8 + 8;
+ void *buf, *p;
+
+ buf = alloc_conn_buf(con, buf_len);
+ if (!buf)
+ return -ENOMEM;
+
+ p = buf;
+ ceph_encode_copy(&p, CEPH_BANNER_V2, CEPH_BANNER_V2_LEN);
+ ceph_encode_16(&p, sizeof(u64) + sizeof(u64));
+ ceph_encode_64(&p, CEPH_MSGR2_SUPPORTED_FEATURES);
+ ceph_encode_64(&p, CEPH_MSGR2_REQUIRED_FEATURES);
+ WARN_ON(p != buf + buf_len);
+
+ add_out_kvec(con, buf, buf_len);
+ add_out_sign_kvec(con, buf, buf_len);
+ ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
+ return 0;
+}
+
+/*
+ * base:
+ * preamble
+ * control body (ctrl_len bytes)
+ * space for control crc
+ *
+ * extdata (optional):
+ * control body (extdata_len bytes)
+ *
+ * Compute control crc and gather base and extdata into:
+ *
+ * preamble
+ * control body (ctrl_len + extdata_len bytes)
+ * control crc
+ *
+ * Preamble should already be encoded at the start of base.
+ */
+static void prepare_head_plain(struct ceph_connection *con, void *base,
+ int ctrl_len, void *extdata, int extdata_len,
+ bool to_be_signed)
+{
+ int base_len = CEPH_PREAMBLE_LEN + ctrl_len + CEPH_CRC_LEN;
+ void *crcp = base + base_len - CEPH_CRC_LEN;
+ u32 crc;
+
+ crc = crc32c(-1, CTRL_BODY(base), ctrl_len);
+ if (extdata_len)
+ crc = crc32c(crc, extdata, extdata_len);
+ put_unaligned_le32(crc, crcp);
+
+ if (!extdata_len) {
+ add_out_kvec(con, base, base_len);
+ if (to_be_signed)
+ add_out_sign_kvec(con, base, base_len);
+ return;
+ }
+
+ add_out_kvec(con, base, crcp - base);
+ add_out_kvec(con, extdata, extdata_len);
+ add_out_kvec(con, crcp, CEPH_CRC_LEN);
+ if (to_be_signed) {
+ add_out_sign_kvec(con, base, crcp - base);
+ add_out_sign_kvec(con, extdata, extdata_len);
+ add_out_sign_kvec(con, crcp, CEPH_CRC_LEN);
+ }
+}
+
+static int prepare_head_secure_small(struct ceph_connection *con,
+ void *base, int ctrl_len)
+{
+ struct scatterlist sg;
+ int ret;
+
+ /* inline buffer padding? */
+ if (ctrl_len < CEPH_PREAMBLE_INLINE_LEN)
+ memset(CTRL_BODY(base) + ctrl_len, 0,
+ CEPH_PREAMBLE_INLINE_LEN - ctrl_len);
+
+ sg_init_one(&sg, base, CEPH_PREAMBLE_SECURE_LEN);
+ ret = gcm_crypt(con, true, &sg, &sg,
+ CEPH_PREAMBLE_SECURE_LEN - CEPH_GCM_TAG_LEN);
+ if (ret)
+ return ret;
+
+ add_out_kvec(con, base, CEPH_PREAMBLE_SECURE_LEN);
+ return 0;
+}
+
+/*
+ * base:
+ * preamble
+ * control body (ctrl_len bytes)
+ * space for padding, if needed
+ * space for control remainder auth tag
+ * space for preamble auth tag
+ *
+ * Encrypt preamble and the inline portion, then encrypt the remainder
+ * and gather into:
+ *
+ * preamble
+ * control body (48 bytes)
+ * preamble auth tag
+ * control body (ctrl_len - 48 bytes)
+ * zero padding, if needed
+ * control remainder auth tag
+ *
+ * Preamble should already be encoded at the start of base.
+ */
+static int prepare_head_secure_big(struct ceph_connection *con,
+ void *base, int ctrl_len)
+{
+ int rem_len = ctrl_len - CEPH_PREAMBLE_INLINE_LEN;
+ void *rem = CTRL_BODY(base) + CEPH_PREAMBLE_INLINE_LEN;
+ void *rem_tag = rem + padded_len(rem_len);
+ void *pmbl_tag = rem_tag + CEPH_GCM_TAG_LEN;
+ struct scatterlist sgs[2];
+ int ret;
+
+ sg_init_table(sgs, 2);
+ sg_set_buf(&sgs[0], base, rem - base);
+ sg_set_buf(&sgs[1], pmbl_tag, CEPH_GCM_TAG_LEN);
+ ret = gcm_crypt(con, true, sgs, sgs, rem - base);
+ if (ret)
+ return ret;
+
+ /* control remainder padding? */
+ if (need_padding(rem_len))
+ memset(rem + rem_len, 0, padding_len(rem_len));
+
+ sg_init_one(&sgs[0], rem, pmbl_tag - rem);
+ ret = gcm_crypt(con, true, sgs, sgs, rem_tag - rem);
+ if (ret)
+ return ret;
+
+ add_out_kvec(con, base, rem - base);
+ add_out_kvec(con, pmbl_tag, CEPH_GCM_TAG_LEN);
+ add_out_kvec(con, rem, pmbl_tag - rem);
+ return 0;
+}
+
+static int __prepare_control(struct ceph_connection *con, int tag,
+ void *base, int ctrl_len, void *extdata,
+ int extdata_len, bool to_be_signed)
+{
+ int total_len = ctrl_len + extdata_len;
+ struct ceph_frame_desc desc;
+ int ret;
+
+ dout("%s con %p tag %d len %d (%d+%d)\n", __func__, con, tag,
+ total_len, ctrl_len, extdata_len);
+
+ /* extdata may be vmalloc'ed but not base */
+ if (WARN_ON(is_vmalloc_addr(base) || !ctrl_len))
+ return -EINVAL;
+
+ init_frame_desc(&desc, tag, &total_len, 1);
+ encode_preamble(&desc, base);
+
+ if (con_secure(con)) {
+ if (WARN_ON(extdata_len || to_be_signed))
+ return -EINVAL;
+
+ if (ctrl_len <= CEPH_PREAMBLE_INLINE_LEN)
+ /* fully inlined, inline buffer may need padding */
+ ret = prepare_head_secure_small(con, base, ctrl_len);
+ else
+ /* partially inlined, inline buffer is full */
+ ret = prepare_head_secure_big(con, base, ctrl_len);
+ if (ret)
+ return ret;
+ } else {
+ prepare_head_plain(con, base, ctrl_len, extdata, extdata_len,
+ to_be_signed);
+ }
+
+ ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
+ return 0;
+}
+
+static int prepare_control(struct ceph_connection *con, int tag,
+ void *base, int ctrl_len)
+{
+ return __prepare_control(con, tag, base, ctrl_len, NULL, 0, false);
+}
+
+static int prepare_hello(struct ceph_connection *con)
+{
+ void *buf, *p;
+ int ctrl_len;
+
+ ctrl_len = 1 + ceph_entity_addr_encoding_len(&con->peer_addr);
+ buf = alloc_conn_buf(con, head_onwire_len(ctrl_len, false));
+ if (!buf)
+ return -ENOMEM;
+
+ p = CTRL_BODY(buf);
+ ceph_encode_8(&p, CEPH_ENTITY_TYPE_CLIENT);
+ ceph_encode_entity_addr(&p, &con->peer_addr);
+ WARN_ON(p != CTRL_BODY(buf) + ctrl_len);
+
+ return __prepare_control(con, FRAME_TAG_HELLO, buf, ctrl_len,
+ NULL, 0, true);
+}
+
+/* so that head_onwire_len(AUTH_BUF_LEN, false) is 512 */
+#define AUTH_BUF_LEN (512 - CEPH_CRC_LEN - CEPH_PREAMBLE_PLAIN_LEN)
+
+static int prepare_auth_request(struct ceph_connection *con)
+{
+ void *authorizer, *authorizer_copy;
+ int ctrl_len, authorizer_len;
+ void *buf;
+ int ret;
+
+ ctrl_len = AUTH_BUF_LEN;
+ buf = alloc_conn_buf(con, head_onwire_len(ctrl_len, false));
+ if (!buf)
+ return -ENOMEM;
+
+ mutex_unlock(&con->mutex);
+ ret = con->ops->get_auth_request(con, CTRL_BODY(buf), &ctrl_len,
+ &authorizer, &authorizer_len);
+ mutex_lock(&con->mutex);
+ if (con->state != CEPH_CON_S_V2_HELLO) {
+ dout("%s con %p state changed to %d\n", __func__, con,
+ con->state);
+ return -EAGAIN;
+ }
+
+ dout("%s con %p get_auth_request ret %d\n", __func__, con, ret);
+ if (ret)
+ return ret;
+
+ authorizer_copy = alloc_conn_buf(con, authorizer_len);
+ if (!authorizer_copy)
+ return -ENOMEM;
+
+ memcpy(authorizer_copy, authorizer, authorizer_len);
+
+ return __prepare_control(con, FRAME_TAG_AUTH_REQUEST, buf, ctrl_len,
+ authorizer_copy, authorizer_len, true);
+}
+
+static int prepare_auth_request_more(struct ceph_connection *con,
+ void *reply, int reply_len)
+{
+ int ctrl_len, authorizer_len;
+ void *authorizer;
+ void *buf;
+ int ret;
+
+ ctrl_len = AUTH_BUF_LEN;
+ buf = alloc_conn_buf(con, head_onwire_len(ctrl_len, false));
+ if (!buf)
+ return -ENOMEM;
+
+ mutex_unlock(&con->mutex);
+ ret = con->ops->handle_auth_reply_more(con, reply, reply_len,
+ CTRL_BODY(buf), &ctrl_len,
+ &authorizer, &authorizer_len);
+ mutex_lock(&con->mutex);
+ if (con->state != CEPH_CON_S_V2_AUTH) {
+ dout("%s con %p state changed to %d\n", __func__, con,
+ con->state);
+ return -EAGAIN;
+ }
+
+ dout("%s con %p handle_auth_reply_more ret %d\n", __func__, con, ret);
+ if (ret)
+ return ret;
+
+ return __prepare_control(con, FRAME_TAG_AUTH_REQUEST_MORE, buf,
+ ctrl_len, authorizer, authorizer_len, true);
+}
+
+static int prepare_auth_signature(struct ceph_connection *con)
+{
+ void *buf;
+
+ buf = alloc_conn_buf(con, head_onwire_len(SHA256_DIGEST_SIZE,
+ con_secure(con)));
+ if (!buf)
+ return -ENOMEM;
+
+ ceph_hmac_sha256(con, con->v2.in_sign_kvecs, con->v2.in_sign_kvec_cnt,
+ CTRL_BODY(buf));
+
+ return prepare_control(con, FRAME_TAG_AUTH_SIGNATURE, buf,
+ SHA256_DIGEST_SIZE);
+}
+
+static int prepare_client_ident(struct ceph_connection *con)
+{
+ struct ceph_entity_addr *my_addr = &con->msgr->inst.addr;
+ struct ceph_client *client = from_msgr(con->msgr);
+ u64 global_id = ceph_client_gid(client);
+ void *buf, *p;
+ int ctrl_len;
+
+ WARN_ON(con->v2.server_cookie);
+ WARN_ON(con->v2.connect_seq);
+ WARN_ON(con->v2.peer_global_seq);
+
+ if (!con->v2.client_cookie) {
+ do {
+ get_random_bytes(&con->v2.client_cookie,
+ sizeof(con->v2.client_cookie));
+ } while (!con->v2.client_cookie);
+ dout("%s con %p generated cookie 0x%llx\n", __func__, con,
+ con->v2.client_cookie);
+ } else {
+ dout("%s con %p cookie already set 0x%llx\n", __func__, con,
+ con->v2.client_cookie);
+ }
+
+ dout("%s con %p my_addr %s/%u peer_addr %s/%u global_id %llu global_seq %llu features 0x%llx required_features 0x%llx cookie 0x%llx\n",
+ __func__, con, ceph_pr_addr(my_addr), le32_to_cpu(my_addr->nonce),
+ ceph_pr_addr(&con->peer_addr), le32_to_cpu(con->peer_addr.nonce),
+ global_id, con->v2.global_seq, client->supported_features,
+ client->required_features, con->v2.client_cookie);
+
+ ctrl_len = 1 + 4 + ceph_entity_addr_encoding_len(my_addr) +
+ ceph_entity_addr_encoding_len(&con->peer_addr) + 6 * 8;
+ buf = alloc_conn_buf(con, head_onwire_len(ctrl_len, con_secure(con)));
+ if (!buf)
+ return -ENOMEM;
+
+ p = CTRL_BODY(buf);
+ ceph_encode_8(&p, 2); /* addrvec marker */
+ ceph_encode_32(&p, 1); /* addr_cnt */
+ ceph_encode_entity_addr(&p, my_addr);
+ ceph_encode_entity_addr(&p, &con->peer_addr);
+ ceph_encode_64(&p, global_id);
+ ceph_encode_64(&p, con->v2.global_seq);
+ ceph_encode_64(&p, client->supported_features);
+ ceph_encode_64(&p, client->required_features);
+ ceph_encode_64(&p, 0); /* flags */
+ ceph_encode_64(&p, con->v2.client_cookie);
+ WARN_ON(p != CTRL_BODY(buf) + ctrl_len);
+
+ return prepare_control(con, FRAME_TAG_CLIENT_IDENT, buf, ctrl_len);
+}
+
+static int prepare_session_reconnect(struct ceph_connection *con)
+{
+ struct ceph_entity_addr *my_addr = &con->msgr->inst.addr;
+ void *buf, *p;
+ int ctrl_len;
+
+ WARN_ON(!con->v2.client_cookie);
+ WARN_ON(!con->v2.server_cookie);
+ WARN_ON(!con->v2.connect_seq);
+ WARN_ON(!con->v2.peer_global_seq);
+
+ dout("%s con %p my_addr %s/%u client_cookie 0x%llx server_cookie 0x%llx global_seq %llu connect_seq %llu in_seq %llu\n",
+ __func__, con, ceph_pr_addr(my_addr), le32_to_cpu(my_addr->nonce),
+ con->v2.client_cookie, con->v2.server_cookie, con->v2.global_seq,
+ con->v2.connect_seq, con->in_seq);
+
+ ctrl_len = 1 + 4 + ceph_entity_addr_encoding_len(my_addr) + 5 * 8;
+ buf = alloc_conn_buf(con, head_onwire_len(ctrl_len, con_secure(con)));
+ if (!buf)
+ return -ENOMEM;
+
+ p = CTRL_BODY(buf);
+ ceph_encode_8(&p, 2); /* entity_addrvec_t marker */
+ ceph_encode_32(&p, 1); /* my_addrs len */
+ ceph_encode_entity_addr(&p, my_addr);
+ ceph_encode_64(&p, con->v2.client_cookie);
+ ceph_encode_64(&p, con->v2.server_cookie);
+ ceph_encode_64(&p, con->v2.global_seq);
+ ceph_encode_64(&p, con->v2.connect_seq);
+ ceph_encode_64(&p, con->in_seq);
+ WARN_ON(p != CTRL_BODY(buf) + ctrl_len);
+
+ return prepare_control(con, FRAME_TAG_SESSION_RECONNECT, buf, ctrl_len);
+}
+
+static int prepare_keepalive2(struct ceph_connection *con)
+{
+ struct ceph_timespec *ts = CTRL_BODY(con->v2.out_buf);
+ struct timespec64 now;
+
+ ktime_get_real_ts64(&now);
+ dout("%s con %p timestamp %ptSp\n", __func__, con, &now);
+
+ ceph_encode_timespec64(ts, &now);
+
+ reset_out_kvecs(con);
+ return prepare_control(con, FRAME_TAG_KEEPALIVE2, con->v2.out_buf,
+ sizeof(struct ceph_timespec));
+}
+
+static int prepare_ack(struct ceph_connection *con)
+{
+ void *p;
+
+ dout("%s con %p in_seq_acked %llu -> %llu\n", __func__, con,
+ con->in_seq_acked, con->in_seq);
+ con->in_seq_acked = con->in_seq;
+
+ p = CTRL_BODY(con->v2.out_buf);
+ ceph_encode_64(&p, con->in_seq_acked);
+
+ reset_out_kvecs(con);
+ return prepare_control(con, FRAME_TAG_ACK, con->v2.out_buf, 8);
+}
+
+static void prepare_epilogue_plain(struct ceph_connection *con,
+ struct ceph_msg *msg, bool aborted)
+{
+ dout("%s con %p msg %p aborted %d crcs %u %u %u\n", __func__, con,
+ msg, aborted, con->v2.out_epil.front_crc,
+ con->v2.out_epil.middle_crc, con->v2.out_epil.data_crc);
+
+ encode_epilogue_plain(con, aborted);
+ add_out_kvec(con, &con->v2.out_epil, CEPH_EPILOGUE_PLAIN_LEN);
+}
+
+/*
+ * For "used" empty segments, crc is -1. For unused (trailing)
+ * segments, crc is 0.
+ */
+static void prepare_message_plain(struct ceph_connection *con,
+ struct ceph_msg *msg)
+{
+ prepare_head_plain(con, con->v2.out_buf,
+ sizeof(struct ceph_msg_header2), NULL, 0, false);
+
+ if (!front_len(msg) && !middle_len(msg)) {
+ if (!data_len(msg)) {
+ /*
+ * Empty message: once the head is written,
+ * we are done -- there is no epilogue.
+ */
+ con->v2.out_state = OUT_S_FINISH_MESSAGE;
+ return;
+ }
+
+ con->v2.out_epil.front_crc = -1;
+ con->v2.out_epil.middle_crc = -1;
+ con->v2.out_state = OUT_S_QUEUE_DATA;
+ return;
+ }
+
+ if (front_len(msg)) {
+ con->v2.out_epil.front_crc = crc32c(-1, msg->front.iov_base,
+ front_len(msg));
+ add_out_kvec(con, msg->front.iov_base, front_len(msg));
+ } else {
+ /* middle (at least) is there, checked above */
+ con->v2.out_epil.front_crc = -1;
+ }
+
+ if (middle_len(msg)) {
+ con->v2.out_epil.middle_crc =
+ crc32c(-1, msg->middle->vec.iov_base, middle_len(msg));
+ add_out_kvec(con, msg->middle->vec.iov_base, middle_len(msg));
+ } else {
+ con->v2.out_epil.middle_crc = data_len(msg) ? -1 : 0;
+ }
+
+ if (data_len(msg)) {
+ con->v2.out_state = OUT_S_QUEUE_DATA;
+ } else {
+ con->v2.out_epil.data_crc = 0;
+ prepare_epilogue_plain(con, msg, false);
+ con->v2.out_state = OUT_S_FINISH_MESSAGE;
+ }
+}
+
+/*
+ * Unfortunately the kernel crypto API doesn't support streaming
+ * (piecewise) operation for AEAD algorithms, so we can't get away
+ * with a fixed size buffer and a couple sgs. Instead, we have to
+ * allocate pages for the entire tail of the message (currently up
+ * to ~32M) and two sgs arrays (up to ~256K each)...
+ */
+static int prepare_message_secure(struct ceph_connection *con,
+ struct ceph_msg *msg)
+{
+ void *zerop = page_address(ceph_zero_page);
+ struct sg_table enc_sgt = {};
+ struct sg_table sgt = {};
+ struct page **enc_pages;
+ int enc_page_cnt;
+ int tail_len;
+ int ret;
+
+ ret = prepare_head_secure_small(con, con->v2.out_buf,
+ sizeof(struct ceph_msg_header2));
+ if (ret)
+ return ret;
+
+ tail_len = tail_onwire_len(msg, true);
+ if (!tail_len) {
+ /*
+ * Empty message: once the head is written,
+ * we are done -- there is no epilogue.
+ */
+ con->v2.out_state = OUT_S_FINISH_MESSAGE;
+ return 0;
+ }
+
+ encode_epilogue_secure(con, false);
+ ret = setup_message_sgs(&sgt, msg, zerop, zerop, zerop,
+ &con->v2.out_epil, NULL, 0, false);
+ if (ret)
+ goto out;
+
+ enc_page_cnt = calc_pages_for(0, tail_len);
+ enc_pages = ceph_alloc_page_vector(enc_page_cnt, GFP_NOIO);
+ if (IS_ERR(enc_pages)) {
+ ret = PTR_ERR(enc_pages);
+ goto out;
+ }
+
+ WARN_ON(con->v2.out_enc_pages || con->v2.out_enc_page_cnt);
+ con->v2.out_enc_pages = enc_pages;
+ con->v2.out_enc_page_cnt = enc_page_cnt;
+ con->v2.out_enc_resid = tail_len;
+ con->v2.out_enc_i = 0;
+
+ ret = sg_alloc_table_from_pages(&enc_sgt, enc_pages, enc_page_cnt,
+ 0, tail_len, GFP_NOIO);
+ if (ret)
+ goto out;
+
+ ret = gcm_crypt(con, true, sgt.sgl, enc_sgt.sgl,
+ tail_len - CEPH_GCM_TAG_LEN);
+ if (ret)
+ goto out;
+
+ dout("%s con %p msg %p sg_cnt %d enc_page_cnt %d\n", __func__, con,
+ msg, sgt.orig_nents, enc_page_cnt);
+ con->v2.out_state = OUT_S_QUEUE_ENC_PAGE;
+
+out:
+ sg_free_table(&sgt);
+ sg_free_table(&enc_sgt);
+ return ret;
+}
+
+static int prepare_message(struct ceph_connection *con, struct ceph_msg *msg)
+{
+ int lens[] = {
+ sizeof(struct ceph_msg_header2),
+ front_len(msg),
+ middle_len(msg),
+ data_len(msg)
+ };
+ struct ceph_frame_desc desc;
+ int ret;
+
+ dout("%s con %p msg %p logical %d+%d+%d+%d\n", __func__, con,
+ msg, lens[0], lens[1], lens[2], lens[3]);
+
+ if (con->in_seq > con->in_seq_acked) {
+ dout("%s con %p in_seq_acked %llu -> %llu\n", __func__, con,
+ con->in_seq_acked, con->in_seq);
+ con->in_seq_acked = con->in_seq;
+ }
+
+ reset_out_kvecs(con);
+ init_frame_desc(&desc, FRAME_TAG_MESSAGE, lens, 4);
+ encode_preamble(&desc, con->v2.out_buf);
+ fill_header2(CTRL_BODY(con->v2.out_buf), &msg->hdr,
+ con->in_seq_acked);
+
+ if (con_secure(con)) {
+ ret = prepare_message_secure(con, msg);
+ if (ret)
+ return ret;
+ } else {
+ prepare_message_plain(con, msg);
+ }
+
+ ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
+ return 0;
+}
+
+static int prepare_read_banner_prefix(struct ceph_connection *con)
+{
+ void *buf;
+
+ buf = alloc_conn_buf(con, CEPH_BANNER_V2_PREFIX_LEN);
+ if (!buf)
+ return -ENOMEM;
+
+ reset_in_kvecs(con);
+ add_in_kvec(con, buf, CEPH_BANNER_V2_PREFIX_LEN);
+ add_in_sign_kvec(con, buf, CEPH_BANNER_V2_PREFIX_LEN);
+ con->state = CEPH_CON_S_V2_BANNER_PREFIX;
+ return 0;
+}
+
+static int prepare_read_banner_payload(struct ceph_connection *con,
+ int payload_len)
+{
+ void *buf;
+
+ buf = alloc_conn_buf(con, payload_len);
+ if (!buf)
+ return -ENOMEM;
+
+ reset_in_kvecs(con);
+ add_in_kvec(con, buf, payload_len);
+ add_in_sign_kvec(con, buf, payload_len);
+ con->state = CEPH_CON_S_V2_BANNER_PAYLOAD;
+ return 0;
+}
+
+static void prepare_read_preamble(struct ceph_connection *con)
+{
+ reset_in_kvecs(con);
+ add_in_kvec(con, con->v2.in_buf,
+ con_secure(con) ? CEPH_PREAMBLE_SECURE_LEN :
+ CEPH_PREAMBLE_PLAIN_LEN);
+ con->v2.in_state = IN_S_HANDLE_PREAMBLE;
+}
+
+static int prepare_read_control(struct ceph_connection *con)
+{
+ int ctrl_len = con->v2.in_desc.fd_lens[0];
+ int head_len;
+ void *buf;
+
+ reset_in_kvecs(con);
+ if (con->state == CEPH_CON_S_V2_HELLO ||
+ con->state == CEPH_CON_S_V2_AUTH) {
+ head_len = head_onwire_len(ctrl_len, false);
+ buf = alloc_conn_buf(con, head_len);
+ if (!buf)
+ return -ENOMEM;
+
+ /* preserve preamble */
+ memcpy(buf, con->v2.in_buf, CEPH_PREAMBLE_LEN);
+
+ add_in_kvec(con, CTRL_BODY(buf), ctrl_len);
+ add_in_kvec(con, CTRL_BODY(buf) + ctrl_len, CEPH_CRC_LEN);
+ add_in_sign_kvec(con, buf, head_len);
+ } else {
+ if (ctrl_len > CEPH_PREAMBLE_INLINE_LEN) {
+ buf = alloc_conn_buf(con, ctrl_len);
+ if (!buf)
+ return -ENOMEM;
+
+ add_in_kvec(con, buf, ctrl_len);
+ } else {
+ add_in_kvec(con, CTRL_BODY(con->v2.in_buf), ctrl_len);
+ }
+ add_in_kvec(con, con->v2.in_buf, CEPH_CRC_LEN);
+ }
+ con->v2.in_state = IN_S_HANDLE_CONTROL;
+ return 0;
+}
+
+static int prepare_read_control_remainder(struct ceph_connection *con)
+{
+ int ctrl_len = con->v2.in_desc.fd_lens[0];
+ int rem_len = ctrl_len - CEPH_PREAMBLE_INLINE_LEN;
+ void *buf;
+
+ buf = alloc_conn_buf(con, ctrl_len);
+ if (!buf)
+ return -ENOMEM;
+
+ memcpy(buf, CTRL_BODY(con->v2.in_buf), CEPH_PREAMBLE_INLINE_LEN);
+
+ reset_in_kvecs(con);
+ add_in_kvec(con, buf + CEPH_PREAMBLE_INLINE_LEN, rem_len);
+ add_in_kvec(con, con->v2.in_buf,
+ padding_len(rem_len) + CEPH_GCM_TAG_LEN);
+ con->v2.in_state = IN_S_HANDLE_CONTROL_REMAINDER;
+ return 0;
+}
+
+static int prepare_read_data(struct ceph_connection *con)
+{
+ struct bio_vec bv;
+
+ con->in_data_crc = -1;
+ ceph_msg_data_cursor_init(&con->v2.in_cursor, con->in_msg,
+ data_len(con->in_msg));
+
+ get_bvec_at(&con->v2.in_cursor, &bv);
+ if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE)) {
+ if (unlikely(!con->bounce_page)) {
+ con->bounce_page = alloc_page(GFP_NOIO);
+ if (!con->bounce_page) {
+ pr_err("failed to allocate bounce page\n");
+ return -ENOMEM;
+ }
+ }
+
+ bv.bv_page = con->bounce_page;
+ bv.bv_offset = 0;
+ }
+ set_in_bvec(con, &bv);
+ con->v2.in_state = IN_S_PREPARE_READ_DATA_CONT;
+ return 0;
+}
+
+static void prepare_read_data_cont(struct ceph_connection *con)
+{
+ struct bio_vec bv;
+
+ if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE)) {
+ con->in_data_crc = crc32c(con->in_data_crc,
+ page_address(con->bounce_page),
+ con->v2.in_bvec.bv_len);
+
+ get_bvec_at(&con->v2.in_cursor, &bv);
+ memcpy_to_page(bv.bv_page, bv.bv_offset,
+ page_address(con->bounce_page),
+ con->v2.in_bvec.bv_len);
+ } else {
+ con->in_data_crc = ceph_crc32c_page(con->in_data_crc,
+ con->v2.in_bvec.bv_page,
+ con->v2.in_bvec.bv_offset,
+ con->v2.in_bvec.bv_len);
+ }
+
+ ceph_msg_data_advance(&con->v2.in_cursor, con->v2.in_bvec.bv_len);
+ if (con->v2.in_cursor.total_resid) {
+ get_bvec_at(&con->v2.in_cursor, &bv);
+ if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE)) {
+ bv.bv_page = con->bounce_page;
+ bv.bv_offset = 0;
+ }
+ set_in_bvec(con, &bv);
+ WARN_ON(con->v2.in_state != IN_S_PREPARE_READ_DATA_CONT);
+ return;
+ }
+
+ /*
+ * We've read all data. Prepare to read epilogue.
+ */
+ reset_in_kvecs(con);
+ add_in_kvec(con, con->v2.in_buf, CEPH_EPILOGUE_PLAIN_LEN);
+ con->v2.in_state = IN_S_HANDLE_EPILOGUE;
+}
+
+static int prepare_sparse_read_cont(struct ceph_connection *con)
+{
+ int ret;
+ struct bio_vec bv;
+ char *buf = NULL;
+ struct ceph_msg_data_cursor *cursor = &con->v2.in_cursor;
+
+ WARN_ON(con->v2.in_state != IN_S_PREPARE_SPARSE_DATA_CONT);
+
+ if (iov_iter_is_bvec(&con->v2.in_iter)) {
+ if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE)) {
+ con->in_data_crc = crc32c(con->in_data_crc,
+ page_address(con->bounce_page),
+ con->v2.in_bvec.bv_len);
+ get_bvec_at(cursor, &bv);
+ memcpy_to_page(bv.bv_page, bv.bv_offset,
+ page_address(con->bounce_page),
+ con->v2.in_bvec.bv_len);
+ } else {
+ con->in_data_crc = ceph_crc32c_page(con->in_data_crc,
+ con->v2.in_bvec.bv_page,
+ con->v2.in_bvec.bv_offset,
+ con->v2.in_bvec.bv_len);
+ }
+
+ ceph_msg_data_advance(cursor, con->v2.in_bvec.bv_len);
+ cursor->sr_resid -= con->v2.in_bvec.bv_len;
+ dout("%s: advance by 0x%x sr_resid 0x%x\n", __func__,
+ con->v2.in_bvec.bv_len, cursor->sr_resid);
+ WARN_ON_ONCE(cursor->sr_resid > cursor->total_resid);
+ if (cursor->sr_resid) {
+ get_bvec_at(cursor, &bv);
+ if (bv.bv_len > cursor->sr_resid)
+ bv.bv_len = cursor->sr_resid;
+ if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE)) {
+ bv.bv_page = con->bounce_page;
+ bv.bv_offset = 0;
+ }
+ set_in_bvec(con, &bv);
+ con->v2.data_len_remain -= bv.bv_len;
+ return 0;
+ }
+ } else if (iov_iter_is_kvec(&con->v2.in_iter)) {
+ /* On first call, we have no kvec so don't compute crc */
+ if (con->v2.in_kvec_cnt) {
+ WARN_ON_ONCE(con->v2.in_kvec_cnt > 1);
+ con->in_data_crc = crc32c(con->in_data_crc,
+ con->v2.in_kvecs[0].iov_base,
+ con->v2.in_kvecs[0].iov_len);
+ }
+ } else {
+ return -EIO;
+ }
+
+ /* get next extent */
+ ret = con->ops->sparse_read(con, cursor, &buf);
+ if (ret <= 0) {
+ if (ret < 0)
+ return ret;
+
+ reset_in_kvecs(con);
+ add_in_kvec(con, con->v2.in_buf, CEPH_EPILOGUE_PLAIN_LEN);
+ con->v2.in_state = IN_S_HANDLE_EPILOGUE;
+ return 0;
+ }
+
+ if (buf) {
+ /* receive into buffer */
+ reset_in_kvecs(con);
+ add_in_kvec(con, buf, ret);
+ con->v2.data_len_remain -= ret;
+ return 0;
+ }
+
+ if (ret > cursor->total_resid) {
+ pr_warn("%s: ret 0x%x total_resid 0x%zx resid 0x%zx\n",
+ __func__, ret, cursor->total_resid, cursor->resid);
+ return -EIO;
+ }
+ get_bvec_at(cursor, &bv);
+ if (bv.bv_len > cursor->sr_resid)
+ bv.bv_len = cursor->sr_resid;
+ if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE)) {
+ if (unlikely(!con->bounce_page)) {
+ con->bounce_page = alloc_page(GFP_NOIO);
+ if (!con->bounce_page) {
+ pr_err("failed to allocate bounce page\n");
+ return -ENOMEM;
+ }
+ }
+
+ bv.bv_page = con->bounce_page;
+ bv.bv_offset = 0;
+ }
+ set_in_bvec(con, &bv);
+ con->v2.data_len_remain -= ret;
+ return ret;
+}
+
+static int prepare_sparse_read_data(struct ceph_connection *con)
+{
+ struct ceph_msg *msg = con->in_msg;
+
+ dout("%s: starting sparse read\n", __func__);
+
+ if (WARN_ON_ONCE(!con->ops->sparse_read))
+ return -EOPNOTSUPP;
+
+ if (!con_secure(con))
+ con->in_data_crc = -1;
+
+ ceph_msg_data_cursor_init(&con->v2.in_cursor, msg,
+ msg->sparse_read_total);
+
+ reset_in_kvecs(con);
+ con->v2.in_state = IN_S_PREPARE_SPARSE_DATA_CONT;
+ con->v2.data_len_remain = data_len(msg);
+ return prepare_sparse_read_cont(con);
+}
+
+static int prepare_read_tail_plain(struct ceph_connection *con)
+{
+ struct ceph_msg *msg = con->in_msg;
+
+ if (!front_len(msg) && !middle_len(msg)) {
+ WARN_ON(!data_len(msg));
+ return prepare_read_data(con);
+ }
+
+ reset_in_kvecs(con);
+ if (front_len(msg)) {
+ add_in_kvec(con, msg->front.iov_base, front_len(msg));
+ WARN_ON(msg->front.iov_len != front_len(msg));
+ }
+ if (middle_len(msg)) {
+ add_in_kvec(con, msg->middle->vec.iov_base, middle_len(msg));
+ WARN_ON(msg->middle->vec.iov_len != middle_len(msg));
+ }
+
+ if (data_len(msg)) {
+ if (msg->sparse_read_total)
+ con->v2.in_state = IN_S_PREPARE_SPARSE_DATA;
+ else
+ con->v2.in_state = IN_S_PREPARE_READ_DATA;
+ } else {
+ add_in_kvec(con, con->v2.in_buf, CEPH_EPILOGUE_PLAIN_LEN);
+ con->v2.in_state = IN_S_HANDLE_EPILOGUE;
+ }
+ return 0;
+}
+
+static void prepare_read_enc_page(struct ceph_connection *con)
+{
+ struct bio_vec bv;
+
+ dout("%s con %p i %d resid %d\n", __func__, con, con->v2.in_enc_i,
+ con->v2.in_enc_resid);
+ WARN_ON(!con->v2.in_enc_resid);
+
+ bvec_set_page(&bv, con->v2.in_enc_pages[con->v2.in_enc_i],
+ min(con->v2.in_enc_resid, (int)PAGE_SIZE), 0);
+
+ set_in_bvec(con, &bv);
+ con->v2.in_enc_i++;
+ con->v2.in_enc_resid -= bv.bv_len;
+
+ if (con->v2.in_enc_resid) {
+ con->v2.in_state = IN_S_PREPARE_READ_ENC_PAGE;
+ return;
+ }
+
+ /*
+ * We are set to read the last piece of ciphertext (ending
+ * with epilogue) + auth tag.
+ */
+ WARN_ON(con->v2.in_enc_i != con->v2.in_enc_page_cnt);
+ con->v2.in_state = IN_S_HANDLE_EPILOGUE;
+}
+
+static int prepare_read_tail_secure(struct ceph_connection *con)
+{
+ struct page **enc_pages;
+ int enc_page_cnt;
+ int tail_len;
+
+ tail_len = tail_onwire_len(con->in_msg, true);
+ WARN_ON(!tail_len);
+
+ enc_page_cnt = calc_pages_for(0, tail_len);
+ enc_pages = ceph_alloc_page_vector(enc_page_cnt, GFP_NOIO);
+ if (IS_ERR(enc_pages))
+ return PTR_ERR(enc_pages);
+
+ WARN_ON(con->v2.in_enc_pages || con->v2.in_enc_page_cnt);
+ con->v2.in_enc_pages = enc_pages;
+ con->v2.in_enc_page_cnt = enc_page_cnt;
+ con->v2.in_enc_resid = tail_len;
+ con->v2.in_enc_i = 0;
+
+ prepare_read_enc_page(con);
+ return 0;
+}
+
+static void __finish_skip(struct ceph_connection *con)
+{
+ con->in_seq++;
+ prepare_read_preamble(con);
+}
+
+static void prepare_skip_message(struct ceph_connection *con)
+{
+ struct ceph_frame_desc *desc = &con->v2.in_desc;
+ int tail_len;
+
+ dout("%s con %p %d+%d+%d\n", __func__, con, desc->fd_lens[1],
+ desc->fd_lens[2], desc->fd_lens[3]);
+
+ tail_len = __tail_onwire_len(desc->fd_lens[1], desc->fd_lens[2],
+ desc->fd_lens[3], con_secure(con));
+ if (!tail_len) {
+ __finish_skip(con);
+ } else {
+ set_in_skip(con, tail_len);
+ con->v2.in_state = IN_S_FINISH_SKIP;
+ }
+}
+
+static int process_banner_prefix(struct ceph_connection *con)
+{
+ int payload_len;
+ void *p;
+
+ WARN_ON(con->v2.in_kvecs[0].iov_len != CEPH_BANNER_V2_PREFIX_LEN);
+
+ p = con->v2.in_kvecs[0].iov_base;
+ if (memcmp(p, CEPH_BANNER_V2, CEPH_BANNER_V2_LEN)) {
+ if (!memcmp(p, CEPH_BANNER, CEPH_BANNER_LEN))
+ con->error_msg = "server is speaking msgr1 protocol";
+ else
+ con->error_msg = "protocol error, bad banner";
+ return -EINVAL;
+ }
+
+ p += CEPH_BANNER_V2_LEN;
+ payload_len = ceph_decode_16(&p);
+ dout("%s con %p payload_len %d\n", __func__, con, payload_len);
+
+ return prepare_read_banner_payload(con, payload_len);
+}
+
+static int process_banner_payload(struct ceph_connection *con)
+{
+ void *end = con->v2.in_kvecs[0].iov_base + con->v2.in_kvecs[0].iov_len;
+ u64 feat = CEPH_MSGR2_SUPPORTED_FEATURES;
+ u64 req_feat = CEPH_MSGR2_REQUIRED_FEATURES;
+ u64 server_feat, server_req_feat;
+ void *p;
+ int ret;
+
+ p = con->v2.in_kvecs[0].iov_base;
+ ceph_decode_64_safe(&p, end, server_feat, bad);
+ ceph_decode_64_safe(&p, end, server_req_feat, bad);
+
+ dout("%s con %p server_feat 0x%llx server_req_feat 0x%llx\n",
+ __func__, con, server_feat, server_req_feat);
+
+ if (req_feat & ~server_feat) {
+ pr_err("msgr2 feature set mismatch: my required > server's supported 0x%llx, need 0x%llx\n",
+ server_feat, req_feat & ~server_feat);
+ con->error_msg = "missing required protocol features";
+ return -EINVAL;
+ }
+ if (server_req_feat & ~feat) {
+ pr_err("msgr2 feature set mismatch: server's required > my supported 0x%llx, missing 0x%llx\n",
+ feat, server_req_feat & ~feat);
+ con->error_msg = "missing required protocol features";
+ return -EINVAL;
+ }
+
+ /* no reset_out_kvecs() as our banner may still be pending */
+ ret = prepare_hello(con);
+ if (ret) {
+ pr_err("prepare_hello failed: %d\n", ret);
+ return ret;
+ }
+
+ con->state = CEPH_CON_S_V2_HELLO;
+ prepare_read_preamble(con);
+ return 0;
+
+bad:
+ pr_err("failed to decode banner payload\n");
+ return -EINVAL;
+}
+
+static int process_hello(struct ceph_connection *con, void *p, void *end)
+{
+ struct ceph_entity_addr *my_addr = &con->msgr->inst.addr;
+ struct ceph_entity_addr addr_for_me;
+ u8 entity_type;
+ int ret;
+
+ if (con->state != CEPH_CON_S_V2_HELLO) {
+ con->error_msg = "protocol error, unexpected hello";
+ return -EINVAL;
+ }
+
+ ceph_decode_8_safe(&p, end, entity_type, bad);
+ ret = ceph_decode_entity_addr(&p, end, &addr_for_me);
+ if (ret) {
+ pr_err("failed to decode addr_for_me: %d\n", ret);
+ return ret;
+ }
+
+ dout("%s con %p entity_type %d addr_for_me %s\n", __func__, con,
+ entity_type, ceph_pr_addr(&addr_for_me));
+
+ if (entity_type != con->peer_name.type) {
+ pr_err("bad peer type, want %d, got %d\n",
+ con->peer_name.type, entity_type);
+ con->error_msg = "wrong peer at address";
+ return -EINVAL;
+ }
+
+ /*
+ * Set our address to the address our first peer (i.e. monitor)
+ * sees that we are connecting from. If we are behind some sort
+ * of NAT and want to be identified by some private (not NATed)
+ * address, ip option should be used.
+ */
+ if (ceph_addr_is_blank(my_addr)) {
+ memcpy(&my_addr->in_addr, &addr_for_me.in_addr,
+ sizeof(my_addr->in_addr));
+ ceph_addr_set_port(my_addr, 0);
+ dout("%s con %p set my addr %s, as seen by peer %s\n",
+ __func__, con, ceph_pr_addr(my_addr),
+ ceph_pr_addr(&con->peer_addr));
+ } else {
+ dout("%s con %p my addr already set %s\n",
+ __func__, con, ceph_pr_addr(my_addr));
+ }
+
+ WARN_ON(ceph_addr_is_blank(my_addr) || ceph_addr_port(my_addr));
+ WARN_ON(my_addr->type != CEPH_ENTITY_ADDR_TYPE_ANY);
+ WARN_ON(!my_addr->nonce);
+
+ /* no reset_out_kvecs() as our hello may still be pending */
+ ret = prepare_auth_request(con);
+ if (ret) {
+ if (ret != -EAGAIN)
+ pr_err("prepare_auth_request failed: %d\n", ret);
+ return ret;
+ }
+
+ con->state = CEPH_CON_S_V2_AUTH;
+ return 0;
+
+bad:
+ pr_err("failed to decode hello\n");
+ return -EINVAL;
+}
+
+static int process_auth_bad_method(struct ceph_connection *con,
+ void *p, void *end)
+{
+ int allowed_protos[8], allowed_modes[8];
+ int allowed_proto_cnt, allowed_mode_cnt;
+ int used_proto, result;
+ int ret;
+ int i;
+
+ if (con->state != CEPH_CON_S_V2_AUTH) {
+ con->error_msg = "protocol error, unexpected auth_bad_method";
+ return -EINVAL;
+ }
+
+ ceph_decode_32_safe(&p, end, used_proto, bad);
+ ceph_decode_32_safe(&p, end, result, bad);
+ dout("%s con %p used_proto %d result %d\n", __func__, con, used_proto,
+ result);
+
+ ceph_decode_32_safe(&p, end, allowed_proto_cnt, bad);
+ if (allowed_proto_cnt > ARRAY_SIZE(allowed_protos)) {
+ pr_err("allowed_protos too big %d\n", allowed_proto_cnt);
+ return -EINVAL;
+ }
+ for (i = 0; i < allowed_proto_cnt; i++) {
+ ceph_decode_32_safe(&p, end, allowed_protos[i], bad);
+ dout("%s con %p allowed_protos[%d] %d\n", __func__, con,
+ i, allowed_protos[i]);
+ }
+
+ ceph_decode_32_safe(&p, end, allowed_mode_cnt, bad);
+ if (allowed_mode_cnt > ARRAY_SIZE(allowed_modes)) {
+ pr_err("allowed_modes too big %d\n", allowed_mode_cnt);
+ return -EINVAL;
+ }
+ for (i = 0; i < allowed_mode_cnt; i++) {
+ ceph_decode_32_safe(&p, end, allowed_modes[i], bad);
+ dout("%s con %p allowed_modes[%d] %d\n", __func__, con,
+ i, allowed_modes[i]);
+ }
+
+ mutex_unlock(&con->mutex);
+ ret = con->ops->handle_auth_bad_method(con, used_proto, result,
+ allowed_protos,
+ allowed_proto_cnt,
+ allowed_modes,
+ allowed_mode_cnt);
+ mutex_lock(&con->mutex);
+ if (con->state != CEPH_CON_S_V2_AUTH) {
+ dout("%s con %p state changed to %d\n", __func__, con,
+ con->state);
+ return -EAGAIN;
+ }
+
+ dout("%s con %p handle_auth_bad_method ret %d\n", __func__, con, ret);
+ return ret;
+
+bad:
+ pr_err("failed to decode auth_bad_method\n");
+ return -EINVAL;
+}
+
+static int process_auth_reply_more(struct ceph_connection *con,
+ void *p, void *end)
+{
+ int payload_len;
+ int ret;
+
+ if (con->state != CEPH_CON_S_V2_AUTH) {
+ con->error_msg = "protocol error, unexpected auth_reply_more";
+ return -EINVAL;
+ }
+
+ ceph_decode_32_safe(&p, end, payload_len, bad);
+ ceph_decode_need(&p, end, payload_len, bad);
+
+ dout("%s con %p payload_len %d\n", __func__, con, payload_len);
+
+ reset_out_kvecs(con);
+ ret = prepare_auth_request_more(con, p, payload_len);
+ if (ret) {
+ if (ret != -EAGAIN)
+ pr_err("prepare_auth_request_more failed: %d\n", ret);
+ return ret;
+ }
+
+ return 0;
+
+bad:
+ pr_err("failed to decode auth_reply_more\n");
+ return -EINVAL;
+}
+
+/*
+ * Align session_key and con_secret to avoid GFP_ATOMIC allocation
+ * inside crypto_shash_setkey() and crypto_aead_setkey() called from
+ * setup_crypto(). __aligned(16) isn't guaranteed to work for stack
+ * objects, so do it by hand.
+ */
+static int process_auth_done(struct ceph_connection *con, void *p, void *end)
+{
+ u8 session_key_buf[CEPH_KEY_LEN + 16];
+ u8 con_secret_buf[CEPH_MAX_CON_SECRET_LEN + 16];
+ u8 *session_key = PTR_ALIGN(&session_key_buf[0], 16);
+ u8 *con_secret = PTR_ALIGN(&con_secret_buf[0], 16);
+ int session_key_len, con_secret_len;
+ int payload_len;
+ u64 global_id;
+ int ret;
+
+ if (con->state != CEPH_CON_S_V2_AUTH) {
+ con->error_msg = "protocol error, unexpected auth_done";
+ return -EINVAL;
+ }
+
+ ceph_decode_64_safe(&p, end, global_id, bad);
+ ceph_decode_32_safe(&p, end, con->v2.con_mode, bad);
+ ceph_decode_32_safe(&p, end, payload_len, bad);
+
+ dout("%s con %p global_id %llu con_mode %d payload_len %d\n",
+ __func__, con, global_id, con->v2.con_mode, payload_len);
+
+ mutex_unlock(&con->mutex);
+ session_key_len = 0;
+ con_secret_len = 0;
+ ret = con->ops->handle_auth_done(con, global_id, p, payload_len,
+ session_key, &session_key_len,
+ con_secret, &con_secret_len);
+ mutex_lock(&con->mutex);
+ if (con->state != CEPH_CON_S_V2_AUTH) {
+ dout("%s con %p state changed to %d\n", __func__, con,
+ con->state);
+ ret = -EAGAIN;
+ goto out;
+ }
+
+ dout("%s con %p handle_auth_done ret %d\n", __func__, con, ret);
+ if (ret)
+ goto out;
+
+ ret = setup_crypto(con, session_key, session_key_len, con_secret,
+ con_secret_len);
+ if (ret)
+ goto out;
+
+ reset_out_kvecs(con);
+ ret = prepare_auth_signature(con);
+ if (ret) {
+ pr_err("prepare_auth_signature failed: %d\n", ret);
+ goto out;
+ }
+
+ con->state = CEPH_CON_S_V2_AUTH_SIGNATURE;
+
+out:
+ memzero_explicit(session_key_buf, sizeof(session_key_buf));
+ memzero_explicit(con_secret_buf, sizeof(con_secret_buf));
+ return ret;
+
+bad:
+ pr_err("failed to decode auth_done\n");
+ return -EINVAL;
+}
+
+static int process_auth_signature(struct ceph_connection *con,
+ void *p, void *end)
+{
+ u8 hmac[SHA256_DIGEST_SIZE];
+ int ret;
+
+ if (con->state != CEPH_CON_S_V2_AUTH_SIGNATURE) {
+ con->error_msg = "protocol error, unexpected auth_signature";
+ return -EINVAL;
+ }
+
+ ceph_hmac_sha256(con, con->v2.out_sign_kvecs, con->v2.out_sign_kvec_cnt,
+ hmac);
+
+ ceph_decode_need(&p, end, SHA256_DIGEST_SIZE, bad);
+ if (crypto_memneq(p, hmac, SHA256_DIGEST_SIZE)) {
+ con->error_msg = "integrity error, bad auth signature";
+ return -EBADMSG;
+ }
+
+ dout("%s con %p auth signature ok\n", __func__, con);
+
+ /* no reset_out_kvecs() as our auth_signature may still be pending */
+ if (!con->v2.server_cookie) {
+ ret = prepare_client_ident(con);
+ if (ret) {
+ pr_err("prepare_client_ident failed: %d\n", ret);
+ return ret;
+ }
+
+ con->state = CEPH_CON_S_V2_SESSION_CONNECT;
+ } else {
+ ret = prepare_session_reconnect(con);
+ if (ret) {
+ pr_err("prepare_session_reconnect failed: %d\n", ret);
+ return ret;
+ }
+
+ con->state = CEPH_CON_S_V2_SESSION_RECONNECT;
+ }
+
+ return 0;
+
+bad:
+ pr_err("failed to decode auth_signature\n");
+ return -EINVAL;
+}
+
+static int process_server_ident(struct ceph_connection *con,
+ void *p, void *end)
+{
+ struct ceph_client *client = from_msgr(con->msgr);
+ u64 features, required_features;
+ struct ceph_entity_addr addr;
+ u64 global_seq;
+ u64 global_id;
+ u64 cookie;
+ u64 flags;
+ int ret;
+
+ if (con->state != CEPH_CON_S_V2_SESSION_CONNECT) {
+ con->error_msg = "protocol error, unexpected server_ident";
+ return -EINVAL;
+ }
+
+ ret = ceph_decode_entity_addrvec(&p, end, true, &addr);
+ if (ret) {
+ pr_err("failed to decode server addrs: %d\n", ret);
+ return ret;
+ }
+
+ ceph_decode_64_safe(&p, end, global_id, bad);
+ ceph_decode_64_safe(&p, end, global_seq, bad);
+ ceph_decode_64_safe(&p, end, features, bad);
+ ceph_decode_64_safe(&p, end, required_features, bad);
+ ceph_decode_64_safe(&p, end, flags, bad);
+ ceph_decode_64_safe(&p, end, cookie, bad);
+
+ dout("%s con %p addr %s/%u global_id %llu global_seq %llu features 0x%llx required_features 0x%llx flags 0x%llx cookie 0x%llx\n",
+ __func__, con, ceph_pr_addr(&addr), le32_to_cpu(addr.nonce),
+ global_id, global_seq, features, required_features, flags, cookie);
+
+ /* is this who we intended to talk to? */
+ if (memcmp(&addr, &con->peer_addr, sizeof(con->peer_addr))) {
+ pr_err("bad peer addr/nonce, want %s/%u, got %s/%u\n",
+ ceph_pr_addr(&con->peer_addr),
+ le32_to_cpu(con->peer_addr.nonce),
+ ceph_pr_addr(&addr), le32_to_cpu(addr.nonce));
+ con->error_msg = "wrong peer at address";
+ return -EINVAL;
+ }
+
+ if (client->required_features & ~features) {
+ pr_err("RADOS feature set mismatch: my required > server's supported 0x%llx, need 0x%llx\n",
+ features, client->required_features & ~features);
+ con->error_msg = "missing required protocol features";
+ return -EINVAL;
+ }
+
+ /*
+ * Both name->type and name->num are set in ceph_con_open() but
+ * name->num may be bogus in the initial monmap. name->type is
+ * verified in handle_hello().
+ */
+ WARN_ON(!con->peer_name.type);
+ con->peer_name.num = cpu_to_le64(global_id);
+ con->v2.peer_global_seq = global_seq;
+ con->peer_features = features;
+ WARN_ON(required_features & ~client->supported_features);
+ con->v2.server_cookie = cookie;
+
+ if (flags & CEPH_MSG_CONNECT_LOSSY) {
+ ceph_con_flag_set(con, CEPH_CON_F_LOSSYTX);
+ WARN_ON(con->v2.server_cookie);
+ } else {
+ WARN_ON(!con->v2.server_cookie);
+ }
+
+ clear_in_sign_kvecs(con);
+ clear_out_sign_kvecs(con);
+ free_conn_bufs(con);
+ con->delay = 0; /* reset backoff memory */
+
+ con->state = CEPH_CON_S_OPEN;
+ con->v2.out_state = OUT_S_GET_NEXT;
+ return 0;
+
+bad:
+ pr_err("failed to decode server_ident\n");
+ return -EINVAL;
+}
+
+static int process_ident_missing_features(struct ceph_connection *con,
+ void *p, void *end)
+{
+ struct ceph_client *client = from_msgr(con->msgr);
+ u64 missing_features;
+
+ if (con->state != CEPH_CON_S_V2_SESSION_CONNECT) {
+ con->error_msg = "protocol error, unexpected ident_missing_features";
+ return -EINVAL;
+ }
+
+ ceph_decode_64_safe(&p, end, missing_features, bad);
+ pr_err("RADOS feature set mismatch: server's required > my supported 0x%llx, missing 0x%llx\n",
+ client->supported_features, missing_features);
+ con->error_msg = "missing required protocol features";
+ return -EINVAL;
+
+bad:
+ pr_err("failed to decode ident_missing_features\n");
+ return -EINVAL;
+}
+
+static int process_session_reconnect_ok(struct ceph_connection *con,
+ void *p, void *end)
+{
+ u64 seq;
+
+ if (con->state != CEPH_CON_S_V2_SESSION_RECONNECT) {
+ con->error_msg = "protocol error, unexpected session_reconnect_ok";
+ return -EINVAL;
+ }
+
+ ceph_decode_64_safe(&p, end, seq, bad);
+
+ dout("%s con %p seq %llu\n", __func__, con, seq);
+ ceph_con_discard_requeued(con, seq);
+
+ clear_in_sign_kvecs(con);
+ clear_out_sign_kvecs(con);
+ free_conn_bufs(con);
+ con->delay = 0; /* reset backoff memory */
+
+ con->state = CEPH_CON_S_OPEN;
+ con->v2.out_state = OUT_S_GET_NEXT;
+ return 0;
+
+bad:
+ pr_err("failed to decode session_reconnect_ok\n");
+ return -EINVAL;
+}
+
+static int process_session_retry(struct ceph_connection *con,
+ void *p, void *end)
+{
+ u64 connect_seq;
+ int ret;
+
+ if (con->state != CEPH_CON_S_V2_SESSION_RECONNECT) {
+ con->error_msg = "protocol error, unexpected session_retry";
+ return -EINVAL;
+ }
+
+ ceph_decode_64_safe(&p, end, connect_seq, bad);
+
+ dout("%s con %p connect_seq %llu\n", __func__, con, connect_seq);
+ WARN_ON(connect_seq <= con->v2.connect_seq);
+ con->v2.connect_seq = connect_seq + 1;
+
+ free_conn_bufs(con);
+
+ reset_out_kvecs(con);
+ ret = prepare_session_reconnect(con);
+ if (ret) {
+ pr_err("prepare_session_reconnect (cseq) failed: %d\n", ret);
+ return ret;
+ }
+
+ return 0;
+
+bad:
+ pr_err("failed to decode session_retry\n");
+ return -EINVAL;
+}
+
+static int process_session_retry_global(struct ceph_connection *con,
+ void *p, void *end)
+{
+ u64 global_seq;
+ int ret;
+
+ if (con->state != CEPH_CON_S_V2_SESSION_RECONNECT) {
+ con->error_msg = "protocol error, unexpected session_retry_global";
+ return -EINVAL;
+ }
+
+ ceph_decode_64_safe(&p, end, global_seq, bad);
+
+ dout("%s con %p global_seq %llu\n", __func__, con, global_seq);
+ WARN_ON(global_seq <= con->v2.global_seq);
+ con->v2.global_seq = ceph_get_global_seq(con->msgr, global_seq);
+
+ free_conn_bufs(con);
+
+ reset_out_kvecs(con);
+ ret = prepare_session_reconnect(con);
+ if (ret) {
+ pr_err("prepare_session_reconnect (gseq) failed: %d\n", ret);
+ return ret;
+ }
+
+ return 0;
+
+bad:
+ pr_err("failed to decode session_retry_global\n");
+ return -EINVAL;
+}
+
+static int process_session_reset(struct ceph_connection *con,
+ void *p, void *end)
+{
+ bool full;
+ int ret;
+
+ if (con->state != CEPH_CON_S_V2_SESSION_RECONNECT) {
+ con->error_msg = "protocol error, unexpected session_reset";
+ return -EINVAL;
+ }
+
+ ceph_decode_8_safe(&p, end, full, bad);
+ if (!full) {
+ con->error_msg = "protocol error, bad session_reset";
+ return -EINVAL;
+ }
+
+ pr_info("%s%lld %s session reset\n", ENTITY_NAME(con->peer_name),
+ ceph_pr_addr(&con->peer_addr));
+ ceph_con_reset_session(con);
+
+ mutex_unlock(&con->mutex);
+ if (con->ops->peer_reset)
+ con->ops->peer_reset(con);
+ mutex_lock(&con->mutex);
+ if (con->state != CEPH_CON_S_V2_SESSION_RECONNECT) {
+ dout("%s con %p state changed to %d\n", __func__, con,
+ con->state);
+ return -EAGAIN;
+ }
+
+ free_conn_bufs(con);
+
+ reset_out_kvecs(con);
+ ret = prepare_client_ident(con);
+ if (ret) {
+ pr_err("prepare_client_ident (rst) failed: %d\n", ret);
+ return ret;
+ }
+
+ con->state = CEPH_CON_S_V2_SESSION_CONNECT;
+ return 0;
+
+bad:
+ pr_err("failed to decode session_reset\n");
+ return -EINVAL;
+}
+
+static int process_keepalive2_ack(struct ceph_connection *con,
+ void *p, void *end)
+{
+ if (con->state != CEPH_CON_S_OPEN) {
+ con->error_msg = "protocol error, unexpected keepalive2_ack";
+ return -EINVAL;
+ }
+
+ ceph_decode_need(&p, end, sizeof(struct ceph_timespec), bad);
+ ceph_decode_timespec64(&con->last_keepalive_ack, p);
+
+ dout("%s con %p timestamp %ptSp\n", __func__, con, &con->last_keepalive_ack);
+
+ return 0;
+
+bad:
+ pr_err("failed to decode keepalive2_ack\n");
+ return -EINVAL;
+}
+
+static int process_ack(struct ceph_connection *con, void *p, void *end)
+{
+ u64 seq;
+
+ if (con->state != CEPH_CON_S_OPEN) {
+ con->error_msg = "protocol error, unexpected ack";
+ return -EINVAL;
+ }
+
+ ceph_decode_64_safe(&p, end, seq, bad);
+
+ dout("%s con %p seq %llu\n", __func__, con, seq);
+ ceph_con_discard_sent(con, seq);
+ return 0;
+
+bad:
+ pr_err("failed to decode ack\n");
+ return -EINVAL;
+}
+
+static int process_control(struct ceph_connection *con, void *p, void *end)
+{
+ int tag = con->v2.in_desc.fd_tag;
+ int ret;
+
+ dout("%s con %p tag %d len %d\n", __func__, con, tag, (int)(end - p));
+
+ switch (tag) {
+ case FRAME_TAG_HELLO:
+ ret = process_hello(con, p, end);
+ break;
+ case FRAME_TAG_AUTH_BAD_METHOD:
+ ret = process_auth_bad_method(con, p, end);
+ break;
+ case FRAME_TAG_AUTH_REPLY_MORE:
+ ret = process_auth_reply_more(con, p, end);
+ break;
+ case FRAME_TAG_AUTH_DONE:
+ ret = process_auth_done(con, p, end);
+ break;
+ case FRAME_TAG_AUTH_SIGNATURE:
+ ret = process_auth_signature(con, p, end);
+ break;
+ case FRAME_TAG_SERVER_IDENT:
+ ret = process_server_ident(con, p, end);
+ break;
+ case FRAME_TAG_IDENT_MISSING_FEATURES:
+ ret = process_ident_missing_features(con, p, end);
+ break;
+ case FRAME_TAG_SESSION_RECONNECT_OK:
+ ret = process_session_reconnect_ok(con, p, end);
+ break;
+ case FRAME_TAG_SESSION_RETRY:
+ ret = process_session_retry(con, p, end);
+ break;
+ case FRAME_TAG_SESSION_RETRY_GLOBAL:
+ ret = process_session_retry_global(con, p, end);
+ break;
+ case FRAME_TAG_SESSION_RESET:
+ ret = process_session_reset(con, p, end);
+ break;
+ case FRAME_TAG_KEEPALIVE2_ACK:
+ ret = process_keepalive2_ack(con, p, end);
+ break;
+ case FRAME_TAG_ACK:
+ ret = process_ack(con, p, end);
+ break;
+ default:
+ pr_err("bad tag %d\n", tag);
+ con->error_msg = "protocol error, bad tag";
+ return -EINVAL;
+ }
+ if (ret) {
+ dout("%s con %p error %d\n", __func__, con, ret);
+ return ret;
+ }
+
+ prepare_read_preamble(con);
+ return 0;
+}
+
+/*
+ * Return:
+ * 1 - con->in_msg set, read message
+ * 0 - skip message
+ * <0 - error
+ */
+static int process_message_header(struct ceph_connection *con,
+ void *p, void *end)
+{
+ struct ceph_frame_desc *desc = &con->v2.in_desc;
+ struct ceph_msg_header2 *hdr2 = p;
+ struct ceph_msg_header hdr;
+ int skip;
+ int ret;
+ u64 seq;
+
+ /* verify seq# */
+ seq = le64_to_cpu(hdr2->seq);
+ if ((s64)seq - (s64)con->in_seq < 1) {
+ pr_info("%s%lld %s skipping old message: seq %llu, expected %llu\n",
+ ENTITY_NAME(con->peer_name),
+ ceph_pr_addr(&con->peer_addr),
+ seq, con->in_seq + 1);
+ return 0;
+ }
+ if ((s64)seq - (s64)con->in_seq > 1) {
+ pr_err("bad seq %llu, expected %llu\n", seq, con->in_seq + 1);
+ con->error_msg = "bad message sequence # for incoming message";
+ return -EBADE;
+ }
+
+ ceph_con_discard_sent(con, le64_to_cpu(hdr2->ack_seq));
+
+ fill_header(&hdr, hdr2, desc->fd_lens[1], desc->fd_lens[2],
+ desc->fd_lens[3], &con->peer_name);
+ ret = ceph_con_in_msg_alloc(con, &hdr, &skip);
+ if (ret)
+ return ret;
+
+ WARN_ON(!con->in_msg ^ skip);
+ if (skip)
+ return 0;
+
+ WARN_ON(!con->in_msg);
+ WARN_ON(con->in_msg->con != con);
+ return 1;
+}
+
+static int process_message(struct ceph_connection *con)
+{
+ ceph_con_process_message(con);
+
+ /*
+ * We could have been closed by ceph_con_close() because
+ * ceph_con_process_message() temporarily drops con->mutex.
+ */
+ if (con->state != CEPH_CON_S_OPEN) {
+ dout("%s con %p state changed to %d\n", __func__, con,
+ con->state);
+ return -EAGAIN;
+ }
+
+ prepare_read_preamble(con);
+ return 0;
+}
+
+static int __handle_control(struct ceph_connection *con, void *p)
+{
+ void *end = p + con->v2.in_desc.fd_lens[0];
+ struct ceph_msg *msg;
+ int ret;
+
+ if (con->v2.in_desc.fd_tag != FRAME_TAG_MESSAGE)
+ return process_control(con, p, end);
+
+ ret = process_message_header(con, p, end);
+ if (ret < 0)
+ return ret;
+ if (ret == 0) {
+ prepare_skip_message(con);
+ return 0;
+ }
+
+ msg = con->in_msg; /* set in process_message_header() */
+ if (front_len(msg)) {
+ WARN_ON(front_len(msg) > msg->front_alloc_len);
+ msg->front.iov_len = front_len(msg);
+ } else {
+ msg->front.iov_len = 0;
+ }
+ if (middle_len(msg)) {
+ WARN_ON(middle_len(msg) > msg->middle->alloc_len);
+ msg->middle->vec.iov_len = middle_len(msg);
+ } else if (msg->middle) {
+ msg->middle->vec.iov_len = 0;
+ }
+
+ if (!front_len(msg) && !middle_len(msg) && !data_len(msg))
+ return process_message(con);
+
+ if (con_secure(con))
+ return prepare_read_tail_secure(con);
+
+ return prepare_read_tail_plain(con);
+}
+
+static int handle_preamble(struct ceph_connection *con)
+{
+ struct ceph_frame_desc *desc = &con->v2.in_desc;
+ int ret;
+
+ if (con_secure(con)) {
+ ret = decrypt_preamble(con);
+ if (ret) {
+ if (ret == -EBADMSG)
+ con->error_msg = "integrity error, bad preamble auth tag";
+ return ret;
+ }
+ }
+
+ ret = decode_preamble(con->v2.in_buf, desc);
+ if (ret) {
+ if (ret == -EBADMSG)
+ con->error_msg = "integrity error, bad crc";
+ else
+ con->error_msg = "protocol error, bad preamble";
+ return ret;
+ }
+
+ dout("%s con %p tag %d seg_cnt %d %d+%d+%d+%d\n", __func__,
+ con, desc->fd_tag, desc->fd_seg_cnt, desc->fd_lens[0],
+ desc->fd_lens[1], desc->fd_lens[2], desc->fd_lens[3]);
+
+ if (!con_secure(con))
+ return prepare_read_control(con);
+
+ if (desc->fd_lens[0] > CEPH_PREAMBLE_INLINE_LEN)
+ return prepare_read_control_remainder(con);
+
+ return __handle_control(con, CTRL_BODY(con->v2.in_buf));
+}
+
+static int handle_control(struct ceph_connection *con)
+{
+ int ctrl_len = con->v2.in_desc.fd_lens[0];
+ void *buf;
+ int ret;
+
+ WARN_ON(con_secure(con));
+
+ ret = verify_control_crc(con);
+ if (ret) {
+ con->error_msg = "integrity error, bad crc";
+ return ret;
+ }
+
+ if (con->state == CEPH_CON_S_V2_AUTH) {
+ buf = alloc_conn_buf(con, ctrl_len);
+ if (!buf)
+ return -ENOMEM;
+
+ memcpy(buf, con->v2.in_kvecs[0].iov_base, ctrl_len);
+ return __handle_control(con, buf);
+ }
+
+ return __handle_control(con, con->v2.in_kvecs[0].iov_base);
+}
+
+static int handle_control_remainder(struct ceph_connection *con)
+{
+ int ret;
+
+ WARN_ON(!con_secure(con));
+
+ ret = decrypt_control_remainder(con);
+ if (ret) {
+ if (ret == -EBADMSG)
+ con->error_msg = "integrity error, bad control remainder auth tag";
+ return ret;
+ }
+
+ return __handle_control(con, con->v2.in_kvecs[0].iov_base -
+ CEPH_PREAMBLE_INLINE_LEN);
+}
+
+static int handle_epilogue(struct ceph_connection *con)
+{
+ u32 front_crc, middle_crc, data_crc;
+ int ret;
+
+ if (con_secure(con)) {
+ ret = decrypt_tail(con);
+ if (ret) {
+ if (ret == -EBADMSG)
+ con->error_msg = "integrity error, bad epilogue auth tag";
+ return ret;
+ }
+
+ /* just late_status */
+ ret = decode_epilogue(con->v2.in_buf, NULL, NULL, NULL);
+ if (ret) {
+ con->error_msg = "protocol error, bad epilogue";
+ return ret;
+ }
+ } else {
+ ret = decode_epilogue(con->v2.in_buf, &front_crc,
+ &middle_crc, &data_crc);
+ if (ret) {
+ con->error_msg = "protocol error, bad epilogue";
+ return ret;
+ }
+
+ ret = verify_epilogue_crcs(con, front_crc, middle_crc,
+ data_crc);
+ if (ret) {
+ con->error_msg = "integrity error, bad crc";
+ return ret;
+ }
+ }
+
+ return process_message(con);
+}
+
+static void finish_skip(struct ceph_connection *con)
+{
+ dout("%s con %p\n", __func__, con);
+
+ if (con_secure(con))
+ gcm_inc_nonce(&con->v2.in_gcm_nonce);
+
+ __finish_skip(con);
+}
+
+static int populate_in_iter(struct ceph_connection *con)
+{
+ int ret;
+
+ dout("%s con %p state %d in_state %d\n", __func__, con, con->state,
+ con->v2.in_state);
+ WARN_ON(iov_iter_count(&con->v2.in_iter));
+
+ if (con->state == CEPH_CON_S_V2_BANNER_PREFIX) {
+ ret = process_banner_prefix(con);
+ } else if (con->state == CEPH_CON_S_V2_BANNER_PAYLOAD) {
+ ret = process_banner_payload(con);
+ } else if ((con->state >= CEPH_CON_S_V2_HELLO &&
+ con->state <= CEPH_CON_S_V2_SESSION_RECONNECT) ||
+ con->state == CEPH_CON_S_OPEN) {
+ switch (con->v2.in_state) {
+ case IN_S_HANDLE_PREAMBLE:
+ ret = handle_preamble(con);
+ break;
+ case IN_S_HANDLE_CONTROL:
+ ret = handle_control(con);
+ break;
+ case IN_S_HANDLE_CONTROL_REMAINDER:
+ ret = handle_control_remainder(con);
+ break;
+ case IN_S_PREPARE_READ_DATA:
+ ret = prepare_read_data(con);
+ break;
+ case IN_S_PREPARE_READ_DATA_CONT:
+ prepare_read_data_cont(con);
+ ret = 0;
+ break;
+ case IN_S_PREPARE_READ_ENC_PAGE:
+ prepare_read_enc_page(con);
+ ret = 0;
+ break;
+ case IN_S_PREPARE_SPARSE_DATA:
+ ret = prepare_sparse_read_data(con);
+ break;
+ case IN_S_PREPARE_SPARSE_DATA_CONT:
+ ret = prepare_sparse_read_cont(con);
+ break;
+ case IN_S_HANDLE_EPILOGUE:
+ ret = handle_epilogue(con);
+ break;
+ case IN_S_FINISH_SKIP:
+ finish_skip(con);
+ ret = 0;
+ break;
+ default:
+ WARN(1, "bad in_state %d", con->v2.in_state);
+ return -EINVAL;
+ }
+ } else {
+ WARN(1, "bad state %d", con->state);
+ return -EINVAL;
+ }
+ if (ret) {
+ dout("%s con %p error %d\n", __func__, con, ret);
+ return ret;
+ }
+
+ if (WARN_ON(!iov_iter_count(&con->v2.in_iter)))
+ return -ENODATA;
+ dout("%s con %p populated %zu\n", __func__, con,
+ iov_iter_count(&con->v2.in_iter));
+ return 1;
+}
+
+int ceph_con_v2_try_read(struct ceph_connection *con)
+{
+ int ret;
+
+ dout("%s con %p state %d need %zu\n", __func__, con, con->state,
+ iov_iter_count(&con->v2.in_iter));
+
+ if (con->state == CEPH_CON_S_PREOPEN)
+ return 0;
+
+ /*
+ * We should always have something pending here. If not,
+ * avoid calling populate_in_iter() as if we read something
+ * (ceph_tcp_recv() would immediately return 1).
+ */
+ if (WARN_ON(!iov_iter_count(&con->v2.in_iter)))
+ return -ENODATA;
+
+ for (;;) {
+ ret = ceph_tcp_recv(con);
+ if (ret <= 0)
+ return ret;
+
+ ret = populate_in_iter(con);
+ if (ret <= 0) {
+ if (ret && ret != -EAGAIN && !con->error_msg)
+ con->error_msg = "read processing error";
+ return ret;
+ }
+ }
+}
+
+static void queue_data(struct ceph_connection *con, struct ceph_msg *msg)
+{
+ struct bio_vec bv;
+
+ con->v2.out_epil.data_crc = -1;
+ ceph_msg_data_cursor_init(&con->v2.out_cursor, msg,
+ data_len(msg));
+
+ get_bvec_at(&con->v2.out_cursor, &bv);
+ set_out_bvec(con, &bv, true);
+ con->v2.out_state = OUT_S_QUEUE_DATA_CONT;
+}
+
+static void queue_data_cont(struct ceph_connection *con, struct ceph_msg *msg)
+{
+ struct bio_vec bv;
+
+ con->v2.out_epil.data_crc = ceph_crc32c_page(
+ con->v2.out_epil.data_crc, con->v2.out_bvec.bv_page,
+ con->v2.out_bvec.bv_offset, con->v2.out_bvec.bv_len);
+
+ ceph_msg_data_advance(&con->v2.out_cursor, con->v2.out_bvec.bv_len);
+ if (con->v2.out_cursor.total_resid) {
+ get_bvec_at(&con->v2.out_cursor, &bv);
+ set_out_bvec(con, &bv, true);
+ WARN_ON(con->v2.out_state != OUT_S_QUEUE_DATA_CONT);
+ return;
+ }
+
+ /*
+ * We've written all data. Queue epilogue. Once it's written,
+ * we are done.
+ */
+ reset_out_kvecs(con);
+ prepare_epilogue_plain(con, msg, false);
+ con->v2.out_state = OUT_S_FINISH_MESSAGE;
+}
+
+static void queue_enc_page(struct ceph_connection *con)
+{
+ struct bio_vec bv;
+
+ dout("%s con %p i %d resid %d\n", __func__, con, con->v2.out_enc_i,
+ con->v2.out_enc_resid);
+ WARN_ON(!con->v2.out_enc_resid);
+
+ bvec_set_page(&bv, con->v2.out_enc_pages[con->v2.out_enc_i],
+ min(con->v2.out_enc_resid, (int)PAGE_SIZE), 0);
+
+ set_out_bvec(con, &bv, false);
+ con->v2.out_enc_i++;
+ con->v2.out_enc_resid -= bv.bv_len;
+
+ if (con->v2.out_enc_resid) {
+ WARN_ON(con->v2.out_state != OUT_S_QUEUE_ENC_PAGE);
+ return;
+ }
+
+ /*
+ * We've queued the last piece of ciphertext (ending with
+ * epilogue) + auth tag. Once it's written, we are done.
+ */
+ WARN_ON(con->v2.out_enc_i != con->v2.out_enc_page_cnt);
+ con->v2.out_state = OUT_S_FINISH_MESSAGE;
+}
+
+static void queue_zeros(struct ceph_connection *con, struct ceph_msg *msg)
+{
+ dout("%s con %p out_zero %d\n", __func__, con, con->v2.out_zero);
+
+ if (con->v2.out_zero) {
+ set_out_bvec_zero(con);
+ con->v2.out_zero -= con->v2.out_bvec.bv_len;
+ con->v2.out_state = OUT_S_QUEUE_ZEROS;
+ return;
+ }
+
+ /*
+ * We've zero-filled everything up to epilogue. Queue epilogue
+ * with late_status set to ABORTED and crcs adjusted for zeros.
+ * Once it's written, we are done patching up for the revoke.
+ */
+ reset_out_kvecs(con);
+ prepare_epilogue_plain(con, msg, true);
+ con->v2.out_state = OUT_S_FINISH_MESSAGE;
+}
+
+static void finish_message(struct ceph_connection *con)
+{
+ dout("%s con %p msg %p\n", __func__, con, con->out_msg);
+
+ /* we end up here both plain and secure modes */
+ if (con->v2.out_enc_pages) {
+ WARN_ON(!con->v2.out_enc_page_cnt);
+ ceph_release_page_vector(con->v2.out_enc_pages,
+ con->v2.out_enc_page_cnt);
+ con->v2.out_enc_pages = NULL;
+ con->v2.out_enc_page_cnt = 0;
+ }
+ /* message may have been revoked */
+ if (con->out_msg) {
+ ceph_msg_put(con->out_msg);
+ con->out_msg = NULL;
+ }
+
+ con->v2.out_state = OUT_S_GET_NEXT;
+}
+
+static int populate_out_iter(struct ceph_connection *con)
+{
+ struct ceph_msg *msg;
+ int ret;
+
+ dout("%s con %p state %d out_state %d\n", __func__, con, con->state,
+ con->v2.out_state);
+ WARN_ON(iov_iter_count(&con->v2.out_iter));
+
+ if (con->state != CEPH_CON_S_OPEN) {
+ WARN_ON(con->state < CEPH_CON_S_V2_BANNER_PREFIX ||
+ con->state > CEPH_CON_S_V2_SESSION_RECONNECT);
+ goto nothing_pending;
+ }
+
+ switch (con->v2.out_state) {
+ case OUT_S_QUEUE_DATA:
+ WARN_ON(!con->out_msg);
+ queue_data(con, con->out_msg);
+ goto populated;
+ case OUT_S_QUEUE_DATA_CONT:
+ WARN_ON(!con->out_msg);
+ queue_data_cont(con, con->out_msg);
+ goto populated;
+ case OUT_S_QUEUE_ENC_PAGE:
+ queue_enc_page(con);
+ goto populated;
+ case OUT_S_QUEUE_ZEROS:
+ WARN_ON(con->out_msg); /* revoked */
+ queue_zeros(con, con->out_msg);
+ goto populated;
+ case OUT_S_FINISH_MESSAGE:
+ finish_message(con);
+ break;
+ case OUT_S_GET_NEXT:
+ break;
+ default:
+ WARN(1, "bad out_state %d", con->v2.out_state);
+ return -EINVAL;
+ }
+
+ WARN_ON(con->v2.out_state != OUT_S_GET_NEXT);
+ if (ceph_con_flag_test_and_clear(con, CEPH_CON_F_KEEPALIVE_PENDING)) {
+ ret = prepare_keepalive2(con);
+ if (ret) {
+ pr_err("prepare_keepalive2 failed: %d\n", ret);
+ return ret;
+ }
+ } else if ((msg = ceph_con_get_out_msg(con)) != NULL) {
+ ret = prepare_message(con, msg);
+ if (ret) {
+ pr_err("prepare_message failed: %d\n", ret);
+ return ret;
+ }
+ } else if (con->in_seq > con->in_seq_acked) {
+ ret = prepare_ack(con);
+ if (ret) {
+ pr_err("prepare_ack failed: %d\n", ret);
+ return ret;
+ }
+ } else {
+ goto nothing_pending;
+ }
+
+populated:
+ if (WARN_ON(!iov_iter_count(&con->v2.out_iter)))
+ return -ENODATA;
+ dout("%s con %p populated %zu\n", __func__, con,
+ iov_iter_count(&con->v2.out_iter));
+ return 1;
+
+nothing_pending:
+ WARN_ON(iov_iter_count(&con->v2.out_iter));
+ dout("%s con %p nothing pending\n", __func__, con);
+ ceph_con_flag_clear(con, CEPH_CON_F_WRITE_PENDING);
+ return 0;
+}
+
+int ceph_con_v2_try_write(struct ceph_connection *con)
+{
+ int ret;
+
+ dout("%s con %p state %d have %zu\n", __func__, con, con->state,
+ iov_iter_count(&con->v2.out_iter));
+
+ /* open the socket first? */
+ if (con->state == CEPH_CON_S_PREOPEN) {
+ WARN_ON(con->peer_addr.type != CEPH_ENTITY_ADDR_TYPE_MSGR2);
+
+ /*
+ * Always bump global_seq. Bump connect_seq only if
+ * there is a session (i.e. we are reconnecting and will
+ * send session_reconnect instead of client_ident).
+ */
+ con->v2.global_seq = ceph_get_global_seq(con->msgr, 0);
+ if (con->v2.server_cookie)
+ con->v2.connect_seq++;
+
+ ret = prepare_read_banner_prefix(con);
+ if (ret) {
+ pr_err("prepare_read_banner_prefix failed: %d\n", ret);
+ con->error_msg = "connect error";
+ return ret;
+ }
+
+ reset_out_kvecs(con);
+ ret = prepare_banner(con);
+ if (ret) {
+ pr_err("prepare_banner failed: %d\n", ret);
+ con->error_msg = "connect error";
+ return ret;
+ }
+
+ ret = ceph_tcp_connect(con);
+ if (ret) {
+ pr_err("ceph_tcp_connect failed: %d\n", ret);
+ con->error_msg = "connect error";
+ return ret;
+ }
+ }
+
+ if (!iov_iter_count(&con->v2.out_iter)) {
+ ret = populate_out_iter(con);
+ if (ret <= 0) {
+ if (ret && ret != -EAGAIN && !con->error_msg)
+ con->error_msg = "write processing error";
+ return ret;
+ }
+ }
+
+ tcp_sock_set_cork(con->sock->sk, true);
+ for (;;) {
+ ret = ceph_tcp_send(con);
+ if (ret <= 0)
+ break;
+
+ ret = populate_out_iter(con);
+ if (ret <= 0) {
+ if (ret && ret != -EAGAIN && !con->error_msg)
+ con->error_msg = "write processing error";
+ break;
+ }
+ }
+
+ tcp_sock_set_cork(con->sock->sk, false);
+ return ret;
+}
+
+static u32 crc32c_zeros(u32 crc, int zero_len)
+{
+ int len;
+
+ while (zero_len) {
+ len = min(zero_len, (int)PAGE_SIZE);
+ crc = crc32c(crc, page_address(ceph_zero_page), len);
+ zero_len -= len;
+ }
+
+ return crc;
+}
+
+static void prepare_zero_front(struct ceph_connection *con,
+ struct ceph_msg *msg, int resid)
+{
+ int sent;
+
+ WARN_ON(!resid || resid > front_len(msg));
+ sent = front_len(msg) - resid;
+ dout("%s con %p sent %d resid %d\n", __func__, con, sent, resid);
+
+ if (sent) {
+ con->v2.out_epil.front_crc =
+ crc32c(-1, msg->front.iov_base, sent);
+ con->v2.out_epil.front_crc =
+ crc32c_zeros(con->v2.out_epil.front_crc, resid);
+ } else {
+ con->v2.out_epil.front_crc = crc32c_zeros(-1, resid);
+ }
+
+ con->v2.out_iter.count -= resid;
+ out_zero_add(con, resid);
+}
+
+static void prepare_zero_middle(struct ceph_connection *con,
+ struct ceph_msg *msg, int resid)
+{
+ int sent;
+
+ WARN_ON(!resid || resid > middle_len(msg));
+ sent = middle_len(msg) - resid;
+ dout("%s con %p sent %d resid %d\n", __func__, con, sent, resid);
+
+ if (sent) {
+ con->v2.out_epil.middle_crc =
+ crc32c(-1, msg->middle->vec.iov_base, sent);
+ con->v2.out_epil.middle_crc =
+ crc32c_zeros(con->v2.out_epil.middle_crc, resid);
+ } else {
+ con->v2.out_epil.middle_crc = crc32c_zeros(-1, resid);
+ }
+
+ con->v2.out_iter.count -= resid;
+ out_zero_add(con, resid);
+}
+
+static void prepare_zero_data(struct ceph_connection *con,
+ struct ceph_msg *msg)
+{
+ dout("%s con %p\n", __func__, con);
+ con->v2.out_epil.data_crc = crc32c_zeros(-1, data_len(msg));
+ out_zero_add(con, data_len(msg));
+}
+
+static void revoke_at_queue_data(struct ceph_connection *con,
+ struct ceph_msg *msg)
+{
+ int boundary;
+ int resid;
+
+ WARN_ON(!data_len(msg));
+ WARN_ON(!iov_iter_is_kvec(&con->v2.out_iter));
+ resid = iov_iter_count(&con->v2.out_iter);
+
+ boundary = front_len(msg) + middle_len(msg);
+ if (resid > boundary) {
+ resid -= boundary;
+ WARN_ON(resid > MESSAGE_HEAD_PLAIN_LEN);
+ dout("%s con %p was sending head\n", __func__, con);
+ if (front_len(msg))
+ prepare_zero_front(con, msg, front_len(msg));
+ if (middle_len(msg))
+ prepare_zero_middle(con, msg, middle_len(msg));
+ prepare_zero_data(con, msg);
+ WARN_ON(iov_iter_count(&con->v2.out_iter) != resid);
+ con->v2.out_state = OUT_S_QUEUE_ZEROS;
+ return;
+ }
+
+ boundary = middle_len(msg);
+ if (resid > boundary) {
+ resid -= boundary;
+ dout("%s con %p was sending front\n", __func__, con);
+ prepare_zero_front(con, msg, resid);
+ if (middle_len(msg))
+ prepare_zero_middle(con, msg, middle_len(msg));
+ prepare_zero_data(con, msg);
+ queue_zeros(con, msg);
+ return;
+ }
+
+ WARN_ON(!resid);
+ dout("%s con %p was sending middle\n", __func__, con);
+ prepare_zero_middle(con, msg, resid);
+ prepare_zero_data(con, msg);
+ queue_zeros(con, msg);
+}
+
+static void revoke_at_queue_data_cont(struct ceph_connection *con,
+ struct ceph_msg *msg)
+{
+ int sent, resid; /* current piece of data */
+
+ WARN_ON(!data_len(msg));
+ WARN_ON(!iov_iter_is_bvec(&con->v2.out_iter));
+ resid = iov_iter_count(&con->v2.out_iter);
+ WARN_ON(!resid || resid > con->v2.out_bvec.bv_len);
+ sent = con->v2.out_bvec.bv_len - resid;
+ dout("%s con %p sent %d resid %d\n", __func__, con, sent, resid);
+
+ if (sent) {
+ con->v2.out_epil.data_crc = ceph_crc32c_page(
+ con->v2.out_epil.data_crc, con->v2.out_bvec.bv_page,
+ con->v2.out_bvec.bv_offset, sent);
+ ceph_msg_data_advance(&con->v2.out_cursor, sent);
+ }
+ WARN_ON(resid > con->v2.out_cursor.total_resid);
+ con->v2.out_epil.data_crc = crc32c_zeros(con->v2.out_epil.data_crc,
+ con->v2.out_cursor.total_resid);
+
+ con->v2.out_iter.count -= resid;
+ out_zero_add(con, con->v2.out_cursor.total_resid);
+ queue_zeros(con, msg);
+}
+
+static void revoke_at_finish_message(struct ceph_connection *con,
+ struct ceph_msg *msg)
+{
+ int boundary;
+ int resid;
+
+ WARN_ON(!iov_iter_is_kvec(&con->v2.out_iter));
+ resid = iov_iter_count(&con->v2.out_iter);
+
+ if (!front_len(msg) && !middle_len(msg) &&
+ !data_len(msg)) {
+ WARN_ON(!resid || resid > MESSAGE_HEAD_PLAIN_LEN);
+ dout("%s con %p was sending head (empty message) - noop\n",
+ __func__, con);
+ return;
+ }
+
+ boundary = front_len(msg) + middle_len(msg) +
+ CEPH_EPILOGUE_PLAIN_LEN;
+ if (resid > boundary) {
+ resid -= boundary;
+ WARN_ON(resid > MESSAGE_HEAD_PLAIN_LEN);
+ dout("%s con %p was sending head\n", __func__, con);
+ if (front_len(msg))
+ prepare_zero_front(con, msg, front_len(msg));
+ if (middle_len(msg))
+ prepare_zero_middle(con, msg, middle_len(msg));
+ con->v2.out_iter.count -= CEPH_EPILOGUE_PLAIN_LEN;
+ WARN_ON(iov_iter_count(&con->v2.out_iter) != resid);
+ con->v2.out_state = OUT_S_QUEUE_ZEROS;
+ return;
+ }
+
+ boundary = middle_len(msg) + CEPH_EPILOGUE_PLAIN_LEN;
+ if (resid > boundary) {
+ resid -= boundary;
+ dout("%s con %p was sending front\n", __func__, con);
+ prepare_zero_front(con, msg, resid);
+ if (middle_len(msg))
+ prepare_zero_middle(con, msg, middle_len(msg));
+ con->v2.out_iter.count -= CEPH_EPILOGUE_PLAIN_LEN;
+ queue_zeros(con, msg);
+ return;
+ }
+
+ boundary = CEPH_EPILOGUE_PLAIN_LEN;
+ if (resid > boundary) {
+ resid -= boundary;
+ dout("%s con %p was sending middle\n", __func__, con);
+ prepare_zero_middle(con, msg, resid);
+ con->v2.out_iter.count -= CEPH_EPILOGUE_PLAIN_LEN;
+ queue_zeros(con, msg);
+ return;
+ }
+
+ WARN_ON(!resid);
+ dout("%s con %p was sending epilogue - noop\n", __func__, con);
+}
+
+void ceph_con_v2_revoke(struct ceph_connection *con, struct ceph_msg *msg)
+{
+ WARN_ON(con->v2.out_zero);
+
+ if (con_secure(con)) {
+ WARN_ON(con->v2.out_state != OUT_S_QUEUE_ENC_PAGE &&
+ con->v2.out_state != OUT_S_FINISH_MESSAGE);
+ dout("%s con %p secure - noop\n", __func__, con);
+ return;
+ }
+
+ switch (con->v2.out_state) {
+ case OUT_S_QUEUE_DATA:
+ revoke_at_queue_data(con, msg);
+ break;
+ case OUT_S_QUEUE_DATA_CONT:
+ revoke_at_queue_data_cont(con, msg);
+ break;
+ case OUT_S_FINISH_MESSAGE:
+ revoke_at_finish_message(con, msg);
+ break;
+ default:
+ WARN(1, "bad out_state %d", con->v2.out_state);
+ break;
+ }
+}
+
+static void revoke_at_prepare_read_data(struct ceph_connection *con)
+{
+ int remaining;
+ int resid;
+
+ WARN_ON(con_secure(con));
+ WARN_ON(!data_len(con->in_msg));
+ WARN_ON(!iov_iter_is_kvec(&con->v2.in_iter));
+ resid = iov_iter_count(&con->v2.in_iter);
+ WARN_ON(!resid);
+
+ remaining = data_len(con->in_msg) + CEPH_EPILOGUE_PLAIN_LEN;
+ dout("%s con %p resid %d remaining %d\n", __func__, con, resid,
+ remaining);
+ con->v2.in_iter.count -= resid;
+ set_in_skip(con, resid + remaining);
+ con->v2.in_state = IN_S_FINISH_SKIP;
+}
+
+static void revoke_at_prepare_read_data_cont(struct ceph_connection *con)
+{
+ int recved, resid; /* current piece of data */
+ int remaining;
+
+ WARN_ON(con_secure(con));
+ WARN_ON(!data_len(con->in_msg));
+ WARN_ON(!iov_iter_is_bvec(&con->v2.in_iter));
+ resid = iov_iter_count(&con->v2.in_iter);
+ WARN_ON(!resid || resid > con->v2.in_bvec.bv_len);
+ recved = con->v2.in_bvec.bv_len - resid;
+ dout("%s con %p recved %d resid %d\n", __func__, con, recved, resid);
+
+ if (recved)
+ ceph_msg_data_advance(&con->v2.in_cursor, recved);
+ WARN_ON(resid > con->v2.in_cursor.total_resid);
+
+ remaining = CEPH_EPILOGUE_PLAIN_LEN;
+ dout("%s con %p total_resid %zu remaining %d\n", __func__, con,
+ con->v2.in_cursor.total_resid, remaining);
+ con->v2.in_iter.count -= resid;
+ set_in_skip(con, con->v2.in_cursor.total_resid + remaining);
+ con->v2.in_state = IN_S_FINISH_SKIP;
+}
+
+static void revoke_at_prepare_read_enc_page(struct ceph_connection *con)
+{
+ int resid; /* current enc page (not necessarily data) */
+
+ WARN_ON(!con_secure(con));
+ WARN_ON(!iov_iter_is_bvec(&con->v2.in_iter));
+ resid = iov_iter_count(&con->v2.in_iter);
+ WARN_ON(!resid || resid > con->v2.in_bvec.bv_len);
+
+ dout("%s con %p resid %d enc_resid %d\n", __func__, con, resid,
+ con->v2.in_enc_resid);
+ con->v2.in_iter.count -= resid;
+ set_in_skip(con, resid + con->v2.in_enc_resid);
+ con->v2.in_state = IN_S_FINISH_SKIP;
+}
+
+static void revoke_at_prepare_sparse_data(struct ceph_connection *con)
+{
+ int resid; /* current piece of data */
+ int remaining;
+
+ WARN_ON(con_secure(con));
+ WARN_ON(!data_len(con->in_msg));
+ WARN_ON(!iov_iter_is_bvec(&con->v2.in_iter));
+ resid = iov_iter_count(&con->v2.in_iter);
+ dout("%s con %p resid %d\n", __func__, con, resid);
+
+ remaining = CEPH_EPILOGUE_PLAIN_LEN + con->v2.data_len_remain;
+ con->v2.in_iter.count -= resid;
+ set_in_skip(con, resid + remaining);
+ con->v2.in_state = IN_S_FINISH_SKIP;
+}
+
+static void revoke_at_handle_epilogue(struct ceph_connection *con)
+{
+ int resid;
+
+ resid = iov_iter_count(&con->v2.in_iter);
+ WARN_ON(!resid);
+
+ dout("%s con %p resid %d\n", __func__, con, resid);
+ con->v2.in_iter.count -= resid;
+ set_in_skip(con, resid);
+ con->v2.in_state = IN_S_FINISH_SKIP;
+}
+
+void ceph_con_v2_revoke_incoming(struct ceph_connection *con)
+{
+ switch (con->v2.in_state) {
+ case IN_S_PREPARE_SPARSE_DATA:
+ case IN_S_PREPARE_READ_DATA:
+ revoke_at_prepare_read_data(con);
+ break;
+ case IN_S_PREPARE_READ_DATA_CONT:
+ revoke_at_prepare_read_data_cont(con);
+ break;
+ case IN_S_PREPARE_READ_ENC_PAGE:
+ revoke_at_prepare_read_enc_page(con);
+ break;
+ case IN_S_PREPARE_SPARSE_DATA_CONT:
+ revoke_at_prepare_sparse_data(con);
+ break;
+ case IN_S_HANDLE_EPILOGUE:
+ revoke_at_handle_epilogue(con);
+ break;
+ default:
+ WARN(1, "bad in_state %d", con->v2.in_state);
+ break;
+ }
+}
+
+bool ceph_con_v2_opened(struct ceph_connection *con)
+{
+ return con->v2.peer_global_seq;
+}
+
+void ceph_con_v2_reset_session(struct ceph_connection *con)
+{
+ con->v2.client_cookie = 0;
+ con->v2.server_cookie = 0;
+ con->v2.global_seq = 0;
+ con->v2.connect_seq = 0;
+ con->v2.peer_global_seq = 0;
+}
+
+void ceph_con_v2_reset_protocol(struct ceph_connection *con)
+{
+ iov_iter_truncate(&con->v2.in_iter, 0);
+ iov_iter_truncate(&con->v2.out_iter, 0);
+ con->v2.out_zero = 0;
+
+ clear_in_sign_kvecs(con);
+ clear_out_sign_kvecs(con);
+ free_conn_bufs(con);
+
+ if (con->v2.in_enc_pages) {
+ WARN_ON(!con->v2.in_enc_page_cnt);
+ ceph_release_page_vector(con->v2.in_enc_pages,
+ con->v2.in_enc_page_cnt);
+ con->v2.in_enc_pages = NULL;
+ con->v2.in_enc_page_cnt = 0;
+ }
+ if (con->v2.out_enc_pages) {
+ WARN_ON(!con->v2.out_enc_page_cnt);
+ ceph_release_page_vector(con->v2.out_enc_pages,
+ con->v2.out_enc_page_cnt);
+ con->v2.out_enc_pages = NULL;
+ con->v2.out_enc_page_cnt = 0;
+ }
+
+ con->v2.con_mode = CEPH_CON_MODE_UNKNOWN;
+ memzero_explicit(&con->v2.in_gcm_nonce, CEPH_GCM_IV_LEN);
+ memzero_explicit(&con->v2.out_gcm_nonce, CEPH_GCM_IV_LEN);
+
+ memzero_explicit(&con->v2.hmac_key, sizeof(con->v2.hmac_key));
+ con->v2.hmac_key_set = false;
+ if (con->v2.gcm_req) {
+ aead_request_free(con->v2.gcm_req);
+ con->v2.gcm_req = NULL;
+ }
+ if (con->v2.gcm_tfm) {
+ crypto_free_aead(con->v2.gcm_tfm);
+ con->v2.gcm_tfm = NULL;
+ }
+}
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
new file mode 100644
index 000000000000..c227ececa925
--- /dev/null
+++ b/net/ceph/mon_client.c
@@ -0,0 +1,1596 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/sched.h>
+
+#include <linux/ceph/ceph_features.h>
+#include <linux/ceph/mon_client.h>
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/debugfs.h>
+#include <linux/ceph/decode.h>
+#include <linux/ceph/auth.h>
+
+/*
+ * Interact with Ceph monitor cluster. Handle requests for new map
+ * versions, and periodically resend as needed. Also implement
+ * statfs() and umount().
+ *
+ * A small cluster of Ceph "monitors" are responsible for managing critical
+ * cluster configuration and state information. An odd number (e.g., 3, 5)
+ * of cmon daemons use a modified version of the Paxos part-time parliament
+ * algorithm to manage the MDS map (mds cluster membership), OSD map, and
+ * list of clients who have mounted the file system.
+ *
+ * We maintain an open, active session with a monitor at all times in order to
+ * receive timely MDSMap updates. We periodically send a keepalive byte on the
+ * TCP socket to ensure we detect a failure. If the connection does break, we
+ * randomly hunt for a new monitor. Once the connection is reestablished, we
+ * resend any outstanding requests.
+ */
+
+static const struct ceph_connection_operations mon_con_ops;
+
+static int __validate_auth(struct ceph_mon_client *monc);
+
+static int decode_mon_info(void **p, void *end, bool msgr2,
+ struct ceph_entity_addr *addr)
+{
+ void *mon_info_end;
+ u32 struct_len;
+ u8 struct_v;
+ int ret;
+
+ ret = ceph_start_decoding(p, end, 1, "mon_info_t", &struct_v,
+ &struct_len);
+ if (ret)
+ return ret;
+
+ mon_info_end = *p + struct_len;
+ ceph_decode_skip_string(p, end, e_inval); /* skip mon name */
+ ret = ceph_decode_entity_addrvec(p, end, msgr2, addr);
+ if (ret)
+ return ret;
+
+ *p = mon_info_end;
+ return 0;
+
+e_inval:
+ return -EINVAL;
+}
+
+/*
+ * Decode a monmap blob (e.g., during mount).
+ *
+ * Assume MonMap v3 (i.e. encoding with MONNAMES and MONENC).
+ */
+static struct ceph_monmap *ceph_monmap_decode(void **p, void *end, bool msgr2)
+{
+ struct ceph_monmap *monmap = NULL;
+ struct ceph_fsid fsid;
+ u32 struct_len;
+ int blob_len;
+ int num_mon;
+ u8 struct_v;
+ u32 epoch;
+ int ret;
+ int i;
+
+ ceph_decode_32_safe(p, end, blob_len, e_inval);
+ ceph_decode_need(p, end, blob_len, e_inval);
+
+ ret = ceph_start_decoding(p, end, 6, "monmap", &struct_v, &struct_len);
+ if (ret)
+ goto fail;
+
+ dout("%s struct_v %d\n", __func__, struct_v);
+ ceph_decode_copy_safe(p, end, &fsid, sizeof(fsid), e_inval);
+ ceph_decode_32_safe(p, end, epoch, e_inval);
+ if (struct_v >= 6) {
+ u32 feat_struct_len;
+ u8 feat_struct_v;
+
+ *p += sizeof(struct ceph_timespec); /* skip last_changed */
+ *p += sizeof(struct ceph_timespec); /* skip created */
+
+ ret = ceph_start_decoding(p, end, 1, "mon_feature_t",
+ &feat_struct_v, &feat_struct_len);
+ if (ret)
+ goto fail;
+
+ *p += feat_struct_len; /* skip persistent_features */
+
+ ret = ceph_start_decoding(p, end, 1, "mon_feature_t",
+ &feat_struct_v, &feat_struct_len);
+ if (ret)
+ goto fail;
+
+ *p += feat_struct_len; /* skip optional_features */
+ }
+ ceph_decode_32_safe(p, end, num_mon, e_inval);
+
+ dout("%s fsid %pU epoch %u num_mon %d\n", __func__, &fsid, epoch,
+ num_mon);
+ if (num_mon > CEPH_MAX_MON)
+ goto e_inval;
+
+ monmap = kmalloc(struct_size(monmap, mon_inst, num_mon), GFP_NOIO);
+ if (!monmap) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+ monmap->fsid = fsid;
+ monmap->epoch = epoch;
+ monmap->num_mon = num_mon;
+
+ /* legacy_mon_addr map or mon_info map */
+ for (i = 0; i < num_mon; i++) {
+ struct ceph_entity_inst *inst = &monmap->mon_inst[i];
+
+ ceph_decode_skip_string(p, end, e_inval); /* skip mon name */
+ inst->name.type = CEPH_ENTITY_TYPE_MON;
+ inst->name.num = cpu_to_le64(i);
+
+ if (struct_v >= 6)
+ ret = decode_mon_info(p, end, msgr2, &inst->addr);
+ else
+ ret = ceph_decode_entity_addr(p, end, &inst->addr);
+ if (ret)
+ goto fail;
+
+ dout("%s mon%d addr %s\n", __func__, i,
+ ceph_pr_addr(&inst->addr));
+ }
+
+ return monmap;
+
+e_inval:
+ ret = -EINVAL;
+fail:
+ kfree(monmap);
+ return ERR_PTR(ret);
+}
+
+/*
+ * return true if *addr is included in the monmap.
+ */
+int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr)
+{
+ int i;
+
+ for (i = 0; i < m->num_mon; i++) {
+ if (ceph_addr_equal_no_type(addr, &m->mon_inst[i].addr))
+ return 1;
+ }
+
+ return 0;
+}
+
+/*
+ * Send an auth request.
+ */
+static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
+{
+ monc->pending_auth = 1;
+ monc->m_auth->front.iov_len = len;
+ monc->m_auth->hdr.front_len = cpu_to_le32(len);
+ ceph_msg_revoke(monc->m_auth);
+ ceph_msg_get(monc->m_auth); /* keep our ref */
+ ceph_con_send(&monc->con, monc->m_auth);
+}
+
+/*
+ * Close monitor session, if any.
+ */
+static void __close_session(struct ceph_mon_client *monc)
+{
+ dout("__close_session closing mon%d\n", monc->cur_mon);
+ ceph_msg_revoke(monc->m_auth);
+ ceph_msg_revoke_incoming(monc->m_auth_reply);
+ ceph_msg_revoke(monc->m_subscribe);
+ ceph_msg_revoke_incoming(monc->m_subscribe_ack);
+ ceph_con_close(&monc->con);
+
+ monc->pending_auth = 0;
+ ceph_auth_reset(monc->auth);
+}
+
+/*
+ * Pick a new monitor at random and set cur_mon. If we are repicking
+ * (i.e. cur_mon is already set), be sure to pick a different one.
+ */
+static void pick_new_mon(struct ceph_mon_client *monc)
+{
+ int old_mon = monc->cur_mon;
+
+ BUG_ON(monc->monmap->num_mon < 1);
+
+ if (monc->monmap->num_mon == 1) {
+ monc->cur_mon = 0;
+ } else {
+ int max = monc->monmap->num_mon;
+ int o = -1;
+ int n;
+
+ if (monc->cur_mon >= 0) {
+ if (monc->cur_mon < monc->monmap->num_mon)
+ o = monc->cur_mon;
+ if (o >= 0)
+ max--;
+ }
+
+ n = get_random_u32_below(max);
+ if (o >= 0 && n >= o)
+ n++;
+
+ monc->cur_mon = n;
+ }
+
+ dout("%s mon%d -> mon%d out of %d mons\n", __func__, old_mon,
+ monc->cur_mon, monc->monmap->num_mon);
+}
+
+/*
+ * Open a session with a new monitor.
+ */
+static void __open_session(struct ceph_mon_client *monc)
+{
+ int ret;
+
+ pick_new_mon(monc);
+
+ monc->hunting = true;
+ if (monc->had_a_connection) {
+ monc->hunt_mult *= CEPH_MONC_HUNT_BACKOFF;
+ if (monc->hunt_mult > CEPH_MONC_HUNT_MAX_MULT)
+ monc->hunt_mult = CEPH_MONC_HUNT_MAX_MULT;
+ }
+
+ monc->sub_renew_after = jiffies; /* i.e., expired */
+ monc->sub_renew_sent = 0;
+
+ dout("%s opening mon%d\n", __func__, monc->cur_mon);
+ ceph_con_open(&monc->con, CEPH_ENTITY_TYPE_MON, monc->cur_mon,
+ &monc->monmap->mon_inst[monc->cur_mon].addr);
+
+ /*
+ * Queue a keepalive to ensure that in case of an early fault
+ * the messenger doesn't put us into STANDBY state and instead
+ * retries. This also ensures that our timestamp is valid by
+ * the time we finish hunting and delayed_work() checks it.
+ */
+ ceph_con_keepalive(&monc->con);
+ if (ceph_msgr2(monc->client)) {
+ monc->pending_auth = 1;
+ return;
+ }
+
+ /* initiate authentication handshake */
+ ret = ceph_auth_build_hello(monc->auth,
+ monc->m_auth->front.iov_base,
+ monc->m_auth->front_alloc_len);
+ BUG_ON(ret <= 0);
+ __send_prepared_auth_request(monc, ret);
+}
+
+static void reopen_session(struct ceph_mon_client *monc)
+{
+ if (!monc->hunting)
+ pr_info("mon%d %s session lost, hunting for new mon\n",
+ monc->cur_mon, ceph_pr_addr(&monc->con.peer_addr));
+
+ __close_session(monc);
+ __open_session(monc);
+}
+
+void ceph_monc_reopen_session(struct ceph_mon_client *monc)
+{
+ mutex_lock(&monc->mutex);
+ reopen_session(monc);
+ mutex_unlock(&monc->mutex);
+}
+
+static void un_backoff(struct ceph_mon_client *monc)
+{
+ monc->hunt_mult /= 2; /* reduce by 50% */
+ if (monc->hunt_mult < 1)
+ monc->hunt_mult = 1;
+ dout("%s hunt_mult now %d\n", __func__, monc->hunt_mult);
+}
+
+/*
+ * Reschedule delayed work timer.
+ */
+static void __schedule_delayed(struct ceph_mon_client *monc)
+{
+ unsigned long delay;
+
+ if (monc->hunting)
+ delay = CEPH_MONC_HUNT_INTERVAL * monc->hunt_mult;
+ else
+ delay = CEPH_MONC_PING_INTERVAL;
+
+ dout("__schedule_delayed after %lu\n", delay);
+ mod_delayed_work(system_percpu_wq, &monc->delayed_work,
+ round_jiffies_relative(delay));
+}
+
+const char *ceph_sub_str[] = {
+ [CEPH_SUB_MONMAP] = "monmap",
+ [CEPH_SUB_OSDMAP] = "osdmap",
+ [CEPH_SUB_FSMAP] = "fsmap.user",
+ [CEPH_SUB_MDSMAP] = "mdsmap",
+};
+
+/*
+ * Send subscribe request for one or more maps, according to
+ * monc->subs.
+ */
+static void __send_subscribe(struct ceph_mon_client *monc)
+{
+ struct ceph_msg *msg = monc->m_subscribe;
+ void *p = msg->front.iov_base;
+ void *const end = p + msg->front_alloc_len;
+ int num = 0;
+ int i;
+
+ dout("%s sent %lu\n", __func__, monc->sub_renew_sent);
+
+ BUG_ON(monc->cur_mon < 0);
+
+ if (!monc->sub_renew_sent)
+ monc->sub_renew_sent = jiffies | 1; /* never 0 */
+
+ msg->hdr.version = cpu_to_le16(2);
+
+ for (i = 0; i < ARRAY_SIZE(monc->subs); i++) {
+ if (monc->subs[i].want)
+ num++;
+ }
+ BUG_ON(num < 1); /* monmap sub is always there */
+ ceph_encode_32(&p, num);
+ for (i = 0; i < ARRAY_SIZE(monc->subs); i++) {
+ char buf[32];
+ int len;
+
+ if (!monc->subs[i].want)
+ continue;
+
+ len = sprintf(buf, "%s", ceph_sub_str[i]);
+ if (i == CEPH_SUB_MDSMAP &&
+ monc->fs_cluster_id != CEPH_FS_CLUSTER_ID_NONE)
+ len += sprintf(buf + len, ".%d", monc->fs_cluster_id);
+
+ dout("%s %s start %llu flags 0x%x\n", __func__, buf,
+ le64_to_cpu(monc->subs[i].item.start),
+ monc->subs[i].item.flags);
+ ceph_encode_string(&p, end, buf, len);
+ memcpy(p, &monc->subs[i].item, sizeof(monc->subs[i].item));
+ p += sizeof(monc->subs[i].item);
+ }
+
+ BUG_ON(p > end);
+ msg->front.iov_len = p - msg->front.iov_base;
+ msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
+ ceph_msg_revoke(msg);
+ ceph_con_send(&monc->con, ceph_msg_get(msg));
+}
+
+static void handle_subscribe_ack(struct ceph_mon_client *monc,
+ struct ceph_msg *msg)
+{
+ unsigned int seconds;
+ struct ceph_mon_subscribe_ack *h = msg->front.iov_base;
+
+ if (msg->front.iov_len < sizeof(*h))
+ goto bad;
+ seconds = le32_to_cpu(h->duration);
+
+ mutex_lock(&monc->mutex);
+ if (monc->sub_renew_sent) {
+ /*
+ * This is only needed for legacy (infernalis or older)
+ * MONs -- see delayed_work().
+ */
+ monc->sub_renew_after = monc->sub_renew_sent +
+ (seconds >> 1) * HZ - 1;
+ dout("%s sent %lu duration %d renew after %lu\n", __func__,
+ monc->sub_renew_sent, seconds, monc->sub_renew_after);
+ monc->sub_renew_sent = 0;
+ } else {
+ dout("%s sent %lu renew after %lu, ignoring\n", __func__,
+ monc->sub_renew_sent, monc->sub_renew_after);
+ }
+ mutex_unlock(&monc->mutex);
+ return;
+bad:
+ pr_err("got corrupt subscribe-ack msg\n");
+ ceph_msg_dump(msg);
+}
+
+/*
+ * Register interest in a map
+ *
+ * @sub: one of CEPH_SUB_*
+ * @epoch: X for "every map since X", or 0 for "just the latest"
+ */
+static bool __ceph_monc_want_map(struct ceph_mon_client *monc, int sub,
+ u32 epoch, bool continuous)
+{
+ __le64 start = cpu_to_le64(epoch);
+ u8 flags = !continuous ? CEPH_SUBSCRIBE_ONETIME : 0;
+
+ dout("%s %s epoch %u continuous %d\n", __func__, ceph_sub_str[sub],
+ epoch, continuous);
+
+ if (monc->subs[sub].want &&
+ monc->subs[sub].item.start == start &&
+ monc->subs[sub].item.flags == flags)
+ return false;
+
+ monc->subs[sub].item.start = start;
+ monc->subs[sub].item.flags = flags;
+ monc->subs[sub].want = true;
+
+ return true;
+}
+
+bool ceph_monc_want_map(struct ceph_mon_client *monc, int sub, u32 epoch,
+ bool continuous)
+{
+ bool need_request;
+
+ mutex_lock(&monc->mutex);
+ need_request = __ceph_monc_want_map(monc, sub, epoch, continuous);
+ mutex_unlock(&monc->mutex);
+
+ return need_request;
+}
+EXPORT_SYMBOL(ceph_monc_want_map);
+
+/*
+ * Keep track of which maps we have
+ *
+ * @sub: one of CEPH_SUB_*
+ */
+static void __ceph_monc_got_map(struct ceph_mon_client *monc, int sub,
+ u32 epoch)
+{
+ dout("%s %s epoch %u\n", __func__, ceph_sub_str[sub], epoch);
+
+ if (monc->subs[sub].want) {
+ if (monc->subs[sub].item.flags & CEPH_SUBSCRIBE_ONETIME)
+ monc->subs[sub].want = false;
+ else
+ monc->subs[sub].item.start = cpu_to_le64(epoch + 1);
+ }
+
+ monc->subs[sub].have = epoch;
+}
+
+void ceph_monc_got_map(struct ceph_mon_client *monc, int sub, u32 epoch)
+{
+ mutex_lock(&monc->mutex);
+ __ceph_monc_got_map(monc, sub, epoch);
+ mutex_unlock(&monc->mutex);
+}
+EXPORT_SYMBOL(ceph_monc_got_map);
+
+void ceph_monc_renew_subs(struct ceph_mon_client *monc)
+{
+ mutex_lock(&monc->mutex);
+ __send_subscribe(monc);
+ mutex_unlock(&monc->mutex);
+}
+EXPORT_SYMBOL(ceph_monc_renew_subs);
+
+/*
+ * Wait for an osdmap with a given epoch.
+ *
+ * @epoch: epoch to wait for
+ * @timeout: in jiffies, 0 means "wait forever"
+ */
+int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch,
+ unsigned long timeout)
+{
+ unsigned long started = jiffies;
+ long ret;
+
+ mutex_lock(&monc->mutex);
+ while (monc->subs[CEPH_SUB_OSDMAP].have < epoch) {
+ mutex_unlock(&monc->mutex);
+
+ if (timeout && time_after_eq(jiffies, started + timeout))
+ return -ETIMEDOUT;
+
+ ret = wait_event_interruptible_timeout(monc->client->auth_wq,
+ monc->subs[CEPH_SUB_OSDMAP].have >= epoch,
+ ceph_timeout_jiffies(timeout));
+ if (ret < 0)
+ return ret;
+
+ mutex_lock(&monc->mutex);
+ }
+
+ mutex_unlock(&monc->mutex);
+ return 0;
+}
+EXPORT_SYMBOL(ceph_monc_wait_osdmap);
+
+/*
+ * Open a session with a random monitor. Request monmap and osdmap,
+ * which are waited upon in __ceph_open_session().
+ */
+int ceph_monc_open_session(struct ceph_mon_client *monc)
+{
+ mutex_lock(&monc->mutex);
+ __ceph_monc_want_map(monc, CEPH_SUB_MONMAP, 0, true);
+ __ceph_monc_want_map(monc, CEPH_SUB_OSDMAP, 0, false);
+ __open_session(monc);
+ __schedule_delayed(monc);
+ mutex_unlock(&monc->mutex);
+ return 0;
+}
+EXPORT_SYMBOL(ceph_monc_open_session);
+
+static void ceph_monc_handle_map(struct ceph_mon_client *monc,
+ struct ceph_msg *msg)
+{
+ struct ceph_client *client = monc->client;
+ struct ceph_monmap *monmap;
+ void *p, *end;
+
+ mutex_lock(&monc->mutex);
+
+ dout("handle_monmap\n");
+ p = msg->front.iov_base;
+ end = p + msg->front.iov_len;
+
+ monmap = ceph_monmap_decode(&p, end, ceph_msgr2(client));
+ if (IS_ERR(monmap)) {
+ pr_err("problem decoding monmap, %d\n",
+ (int)PTR_ERR(monmap));
+ ceph_msg_dump(msg);
+ goto out;
+ }
+
+ if (ceph_check_fsid(client, &monmap->fsid) < 0) {
+ kfree(monmap);
+ goto out;
+ }
+
+ kfree(monc->monmap);
+ monc->monmap = monmap;
+
+ __ceph_monc_got_map(monc, CEPH_SUB_MONMAP, monc->monmap->epoch);
+ client->have_fsid = true;
+
+out:
+ mutex_unlock(&monc->mutex);
+ wake_up_all(&client->auth_wq);
+}
+
+/*
+ * generic requests (currently statfs, mon_get_version)
+ */
+DEFINE_RB_FUNCS(generic_request, struct ceph_mon_generic_request, tid, node)
+
+static void release_generic_request(struct kref *kref)
+{
+ struct ceph_mon_generic_request *req =
+ container_of(kref, struct ceph_mon_generic_request, kref);
+
+ dout("%s greq %p request %p reply %p\n", __func__, req, req->request,
+ req->reply);
+ WARN_ON(!RB_EMPTY_NODE(&req->node));
+
+ if (req->reply)
+ ceph_msg_put(req->reply);
+ if (req->request)
+ ceph_msg_put(req->request);
+
+ kfree(req);
+}
+
+static void put_generic_request(struct ceph_mon_generic_request *req)
+{
+ if (req)
+ kref_put(&req->kref, release_generic_request);
+}
+
+static void get_generic_request(struct ceph_mon_generic_request *req)
+{
+ kref_get(&req->kref);
+}
+
+static struct ceph_mon_generic_request *
+alloc_generic_request(struct ceph_mon_client *monc, gfp_t gfp)
+{
+ struct ceph_mon_generic_request *req;
+
+ req = kzalloc(sizeof(*req), gfp);
+ if (!req)
+ return NULL;
+
+ req->monc = monc;
+ kref_init(&req->kref);
+ RB_CLEAR_NODE(&req->node);
+ init_completion(&req->completion);
+
+ dout("%s greq %p\n", __func__, req);
+ return req;
+}
+
+static void register_generic_request(struct ceph_mon_generic_request *req)
+{
+ struct ceph_mon_client *monc = req->monc;
+
+ WARN_ON(req->tid);
+
+ get_generic_request(req);
+ req->tid = ++monc->last_tid;
+ insert_generic_request(&monc->generic_request_tree, req);
+}
+
+static void send_generic_request(struct ceph_mon_client *monc,
+ struct ceph_mon_generic_request *req)
+{
+ WARN_ON(!req->tid);
+
+ dout("%s greq %p tid %llu\n", __func__, req, req->tid);
+ req->request->hdr.tid = cpu_to_le64(req->tid);
+ ceph_con_send(&monc->con, ceph_msg_get(req->request));
+}
+
+static void __finish_generic_request(struct ceph_mon_generic_request *req)
+{
+ struct ceph_mon_client *monc = req->monc;
+
+ dout("%s greq %p tid %llu\n", __func__, req, req->tid);
+ erase_generic_request(&monc->generic_request_tree, req);
+
+ ceph_msg_revoke(req->request);
+ ceph_msg_revoke_incoming(req->reply);
+}
+
+static void finish_generic_request(struct ceph_mon_generic_request *req)
+{
+ __finish_generic_request(req);
+ put_generic_request(req);
+}
+
+static void complete_generic_request(struct ceph_mon_generic_request *req)
+{
+ if (req->complete_cb)
+ req->complete_cb(req);
+ else
+ complete_all(&req->completion);
+ put_generic_request(req);
+}
+
+static void cancel_generic_request(struct ceph_mon_generic_request *req)
+{
+ struct ceph_mon_client *monc = req->monc;
+ struct ceph_mon_generic_request *lookup_req;
+
+ dout("%s greq %p tid %llu\n", __func__, req, req->tid);
+
+ mutex_lock(&monc->mutex);
+ lookup_req = lookup_generic_request(&monc->generic_request_tree,
+ req->tid);
+ if (lookup_req) {
+ WARN_ON(lookup_req != req);
+ finish_generic_request(req);
+ }
+
+ mutex_unlock(&monc->mutex);
+}
+
+static int wait_generic_request(struct ceph_mon_generic_request *req)
+{
+ int ret;
+
+ dout("%s greq %p tid %llu\n", __func__, req, req->tid);
+ ret = wait_for_completion_interruptible(&req->completion);
+ if (ret)
+ cancel_generic_request(req);
+ else
+ ret = req->result; /* completed */
+
+ return ret;
+}
+
+static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
+ struct ceph_msg_header *hdr,
+ int *skip)
+{
+ struct ceph_mon_client *monc = con->private;
+ struct ceph_mon_generic_request *req;
+ u64 tid = le64_to_cpu(hdr->tid);
+ struct ceph_msg *m;
+
+ mutex_lock(&monc->mutex);
+ req = lookup_generic_request(&monc->generic_request_tree, tid);
+ if (!req) {
+ dout("get_generic_reply %lld dne\n", tid);
+ *skip = 1;
+ m = NULL;
+ } else {
+ dout("get_generic_reply %lld got %p\n", tid, req->reply);
+ *skip = 0;
+ m = ceph_msg_get(req->reply);
+ /*
+ * we don't need to track the connection reading into
+ * this reply because we only have one open connection
+ * at a time, ever.
+ */
+ }
+ mutex_unlock(&monc->mutex);
+ return m;
+}
+
+/*
+ * statfs
+ */
+static void handle_statfs_reply(struct ceph_mon_client *monc,
+ struct ceph_msg *msg)
+{
+ struct ceph_mon_generic_request *req;
+ struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
+ u64 tid = le64_to_cpu(msg->hdr.tid);
+
+ dout("%s msg %p tid %llu\n", __func__, msg, tid);
+
+ if (msg->front.iov_len != sizeof(*reply))
+ goto bad;
+
+ mutex_lock(&monc->mutex);
+ req = lookup_generic_request(&monc->generic_request_tree, tid);
+ if (!req) {
+ mutex_unlock(&monc->mutex);
+ return;
+ }
+
+ req->result = 0;
+ *req->u.st = reply->st; /* struct */
+ __finish_generic_request(req);
+ mutex_unlock(&monc->mutex);
+
+ complete_generic_request(req);
+ return;
+
+bad:
+ pr_err("corrupt statfs reply, tid %llu\n", tid);
+ ceph_msg_dump(msg);
+}
+
+/*
+ * Do a synchronous statfs().
+ */
+int ceph_monc_do_statfs(struct ceph_mon_client *monc, u64 data_pool,
+ struct ceph_statfs *buf)
+{
+ struct ceph_mon_generic_request *req;
+ struct ceph_mon_statfs *h;
+ int ret = -ENOMEM;
+
+ req = alloc_generic_request(monc, GFP_NOFS);
+ if (!req)
+ goto out;
+
+ req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS,
+ true);
+ if (!req->request)
+ goto out;
+
+ req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 64, GFP_NOFS, true);
+ if (!req->reply)
+ goto out;
+
+ req->u.st = buf;
+ req->request->hdr.version = cpu_to_le16(2);
+
+ mutex_lock(&monc->mutex);
+ register_generic_request(req);
+ /* fill out request */
+ h = req->request->front.iov_base;
+ h->monhdr.have_version = 0;
+ h->monhdr.session_mon = cpu_to_le16(-1);
+ h->monhdr.session_mon_tid = 0;
+ h->fsid = monc->monmap->fsid;
+ h->contains_data_pool = (data_pool != CEPH_NOPOOL);
+ h->data_pool = cpu_to_le64(data_pool);
+ send_generic_request(monc, req);
+ mutex_unlock(&monc->mutex);
+
+ ret = wait_generic_request(req);
+out:
+ put_generic_request(req);
+ return ret;
+}
+EXPORT_SYMBOL(ceph_monc_do_statfs);
+
+static void handle_get_version_reply(struct ceph_mon_client *monc,
+ struct ceph_msg *msg)
+{
+ struct ceph_mon_generic_request *req;
+ u64 tid = le64_to_cpu(msg->hdr.tid);
+ void *p = msg->front.iov_base;
+ void *end = p + msg->front_alloc_len;
+ u64 handle;
+
+ dout("%s msg %p tid %llu\n", __func__, msg, tid);
+
+ ceph_decode_need(&p, end, 2*sizeof(u64), bad);
+ handle = ceph_decode_64(&p);
+ if (tid != 0 && tid != handle)
+ goto bad;
+
+ mutex_lock(&monc->mutex);
+ req = lookup_generic_request(&monc->generic_request_tree, handle);
+ if (!req) {
+ mutex_unlock(&monc->mutex);
+ return;
+ }
+
+ req->result = 0;
+ req->u.newest = ceph_decode_64(&p);
+ __finish_generic_request(req);
+ mutex_unlock(&monc->mutex);
+
+ complete_generic_request(req);
+ return;
+
+bad:
+ pr_err("corrupt mon_get_version reply, tid %llu\n", tid);
+ ceph_msg_dump(msg);
+}
+
+static struct ceph_mon_generic_request *
+__ceph_monc_get_version(struct ceph_mon_client *monc, const char *what,
+ ceph_monc_callback_t cb, u64 private_data)
+{
+ struct ceph_mon_generic_request *req;
+
+ req = alloc_generic_request(monc, GFP_NOIO);
+ if (!req)
+ goto err_put_req;
+
+ req->request = ceph_msg_new(CEPH_MSG_MON_GET_VERSION,
+ sizeof(u64) + sizeof(u32) + strlen(what),
+ GFP_NOIO, true);
+ if (!req->request)
+ goto err_put_req;
+
+ req->reply = ceph_msg_new(CEPH_MSG_MON_GET_VERSION_REPLY, 32, GFP_NOIO,
+ true);
+ if (!req->reply)
+ goto err_put_req;
+
+ req->complete_cb = cb;
+ req->private_data = private_data;
+
+ mutex_lock(&monc->mutex);
+ register_generic_request(req);
+ {
+ void *p = req->request->front.iov_base;
+ void *const end = p + req->request->front_alloc_len;
+
+ ceph_encode_64(&p, req->tid); /* handle */
+ ceph_encode_string(&p, end, what, strlen(what));
+ WARN_ON(p != end);
+ }
+ send_generic_request(monc, req);
+ mutex_unlock(&monc->mutex);
+
+ return req;
+
+err_put_req:
+ put_generic_request(req);
+ return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * Send MMonGetVersion and wait for the reply.
+ *
+ * @what: one of "mdsmap", "osdmap" or "monmap"
+ */
+int ceph_monc_get_version(struct ceph_mon_client *monc, const char *what,
+ u64 *newest)
+{
+ struct ceph_mon_generic_request *req;
+ int ret;
+
+ req = __ceph_monc_get_version(monc, what, NULL, 0);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ ret = wait_generic_request(req);
+ if (!ret)
+ *newest = req->u.newest;
+
+ put_generic_request(req);
+ return ret;
+}
+EXPORT_SYMBOL(ceph_monc_get_version);
+
+/*
+ * Send MMonGetVersion,
+ *
+ * @what: one of "mdsmap", "osdmap" or "monmap"
+ */
+int ceph_monc_get_version_async(struct ceph_mon_client *monc, const char *what,
+ ceph_monc_callback_t cb, u64 private_data)
+{
+ struct ceph_mon_generic_request *req;
+
+ req = __ceph_monc_get_version(monc, what, cb, private_data);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ put_generic_request(req);
+ return 0;
+}
+EXPORT_SYMBOL(ceph_monc_get_version_async);
+
+static void handle_command_ack(struct ceph_mon_client *monc,
+ struct ceph_msg *msg)
+{
+ struct ceph_mon_generic_request *req;
+ void *p = msg->front.iov_base;
+ void *const end = p + msg->front_alloc_len;
+ u64 tid = le64_to_cpu(msg->hdr.tid);
+
+ dout("%s msg %p tid %llu\n", __func__, msg, tid);
+
+ ceph_decode_need(&p, end, sizeof(struct ceph_mon_request_header) +
+ sizeof(u32), bad);
+ p += sizeof(struct ceph_mon_request_header);
+
+ mutex_lock(&monc->mutex);
+ req = lookup_generic_request(&monc->generic_request_tree, tid);
+ if (!req) {
+ mutex_unlock(&monc->mutex);
+ return;
+ }
+
+ req->result = ceph_decode_32(&p);
+ __finish_generic_request(req);
+ mutex_unlock(&monc->mutex);
+
+ complete_generic_request(req);
+ return;
+
+bad:
+ pr_err("corrupt mon_command ack, tid %llu\n", tid);
+ ceph_msg_dump(msg);
+}
+
+static __printf(2, 0)
+int do_mon_command_vargs(struct ceph_mon_client *monc, const char *fmt,
+ va_list ap)
+{
+ struct ceph_mon_generic_request *req;
+ struct ceph_mon_command *h;
+ int ret = -ENOMEM;
+ int len;
+
+ req = alloc_generic_request(monc, GFP_NOIO);
+ if (!req)
+ goto out;
+
+ req->request = ceph_msg_new(CEPH_MSG_MON_COMMAND, 256, GFP_NOIO, true);
+ if (!req->request)
+ goto out;
+
+ req->reply = ceph_msg_new(CEPH_MSG_MON_COMMAND_ACK, 512, GFP_NOIO,
+ true);
+ if (!req->reply)
+ goto out;
+
+ mutex_lock(&monc->mutex);
+ register_generic_request(req);
+ h = req->request->front.iov_base;
+ h->monhdr.have_version = 0;
+ h->monhdr.session_mon = cpu_to_le16(-1);
+ h->monhdr.session_mon_tid = 0;
+ h->fsid = monc->monmap->fsid;
+ h->num_strs = cpu_to_le32(1);
+ len = vsprintf(h->str, fmt, ap);
+ h->str_len = cpu_to_le32(len);
+ send_generic_request(monc, req);
+ mutex_unlock(&monc->mutex);
+
+ ret = wait_generic_request(req);
+out:
+ put_generic_request(req);
+ return ret;
+}
+
+static __printf(2, 3)
+int do_mon_command(struct ceph_mon_client *monc, const char *fmt, ...)
+{
+ va_list ap;
+ int ret;
+
+ va_start(ap, fmt);
+ ret = do_mon_command_vargs(monc, fmt, ap);
+ va_end(ap);
+ return ret;
+}
+
+int ceph_monc_blocklist_add(struct ceph_mon_client *monc,
+ struct ceph_entity_addr *client_addr)
+{
+ int ret;
+
+ ret = do_mon_command(monc,
+ "{ \"prefix\": \"osd blocklist\", \
+ \"blocklistop\": \"add\", \
+ \"addr\": \"%pISpc/%u\" }",
+ &client_addr->in_addr,
+ le32_to_cpu(client_addr->nonce));
+ if (ret == -EINVAL) {
+ /*
+ * The monitor returns EINVAL on an unrecognized command.
+ * Try the legacy command -- it is exactly the same except
+ * for the name.
+ */
+ ret = do_mon_command(monc,
+ "{ \"prefix\": \"osd blacklist\", \
+ \"blacklistop\": \"add\", \
+ \"addr\": \"%pISpc/%u\" }",
+ &client_addr->in_addr,
+ le32_to_cpu(client_addr->nonce));
+ }
+ if (ret)
+ return ret;
+
+ /*
+ * Make sure we have the osdmap that includes the blocklist
+ * entry. This is needed to ensure that the OSDs pick up the
+ * new blocklist before processing any future requests from
+ * this client.
+ */
+ return ceph_wait_for_latest_osdmap(monc->client, 0);
+}
+EXPORT_SYMBOL(ceph_monc_blocklist_add);
+
+/*
+ * Resend pending generic requests.
+ */
+static void __resend_generic_request(struct ceph_mon_client *monc)
+{
+ struct ceph_mon_generic_request *req;
+ struct rb_node *p;
+
+ for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) {
+ req = rb_entry(p, struct ceph_mon_generic_request, node);
+ ceph_msg_revoke(req->request);
+ ceph_msg_revoke_incoming(req->reply);
+ ceph_con_send(&monc->con, ceph_msg_get(req->request));
+ }
+}
+
+/*
+ * Delayed work. If we haven't mounted yet, retry. Otherwise,
+ * renew/retry subscription as needed (in case it is timing out, or we
+ * got an ENOMEM). And keep the monitor connection alive.
+ */
+static void delayed_work(struct work_struct *work)
+{
+ struct ceph_mon_client *monc =
+ container_of(work, struct ceph_mon_client, delayed_work.work);
+
+ mutex_lock(&monc->mutex);
+ dout("%s mon%d\n", __func__, monc->cur_mon);
+ if (monc->cur_mon < 0) {
+ goto out;
+ }
+
+ if (monc->hunting) {
+ dout("%s continuing hunt\n", __func__);
+ reopen_session(monc);
+ } else {
+ int is_auth = ceph_auth_is_authenticated(monc->auth);
+
+ dout("%s is_authed %d\n", __func__, is_auth);
+ if (ceph_con_keepalive_expired(&monc->con,
+ CEPH_MONC_PING_TIMEOUT)) {
+ dout("monc keepalive timeout\n");
+ is_auth = 0;
+ reopen_session(monc);
+ }
+
+ if (!monc->hunting) {
+ ceph_con_keepalive(&monc->con);
+ __validate_auth(monc);
+ un_backoff(monc);
+ }
+
+ if (is_auth &&
+ !(monc->con.peer_features & CEPH_FEATURE_MON_STATEFUL_SUB)) {
+ unsigned long now = jiffies;
+
+ dout("%s renew subs? now %lu renew after %lu\n",
+ __func__, now, monc->sub_renew_after);
+ if (time_after_eq(now, monc->sub_renew_after))
+ __send_subscribe(monc);
+ }
+ }
+ __schedule_delayed(monc);
+
+out:
+ mutex_unlock(&monc->mutex);
+}
+
+/*
+ * On startup, we build a temporary monmap populated with the IPs
+ * provided by mount(2).
+ */
+static int build_initial_monmap(struct ceph_mon_client *monc)
+{
+ __le32 my_type = ceph_msgr2(monc->client) ?
+ CEPH_ENTITY_ADDR_TYPE_MSGR2 : CEPH_ENTITY_ADDR_TYPE_LEGACY;
+ struct ceph_options *opt = monc->client->options;
+ int num_mon = opt->num_mon;
+ int i;
+
+ /* build initial monmap */
+ monc->monmap = kzalloc(struct_size(monc->monmap, mon_inst, num_mon),
+ GFP_KERNEL);
+ if (!monc->monmap)
+ return -ENOMEM;
+ monc->monmap->num_mon = num_mon;
+
+ for (i = 0; i < num_mon; i++) {
+ struct ceph_entity_inst *inst = &monc->monmap->mon_inst[i];
+
+ memcpy(&inst->addr.in_addr, &opt->mon_addr[i].in_addr,
+ sizeof(inst->addr.in_addr));
+ inst->addr.type = my_type;
+ inst->addr.nonce = 0;
+ inst->name.type = CEPH_ENTITY_TYPE_MON;
+ inst->name.num = cpu_to_le64(i);
+ }
+ return 0;
+}
+
+int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
+{
+ int err;
+
+ dout("init\n");
+ memset(monc, 0, sizeof(*monc));
+ monc->client = cl;
+ mutex_init(&monc->mutex);
+
+ err = build_initial_monmap(monc);
+ if (err)
+ goto out;
+
+ /* connection */
+ /* authentication */
+ monc->auth = ceph_auth_init(cl->options->name, cl->options->key,
+ cl->options->con_modes);
+ if (IS_ERR(monc->auth)) {
+ err = PTR_ERR(monc->auth);
+ goto out_monmap;
+ }
+ monc->auth->want_keys =
+ CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON |
+ CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS;
+
+ /* msgs */
+ err = -ENOMEM;
+ monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK,
+ sizeof(struct ceph_mon_subscribe_ack),
+ GFP_KERNEL, true);
+ if (!monc->m_subscribe_ack)
+ goto out_auth;
+
+ monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 128,
+ GFP_KERNEL, true);
+ if (!monc->m_subscribe)
+ goto out_subscribe_ack;
+
+ monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096,
+ GFP_KERNEL, true);
+ if (!monc->m_auth_reply)
+ goto out_subscribe;
+
+ monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_KERNEL, true);
+ monc->pending_auth = 0;
+ if (!monc->m_auth)
+ goto out_auth_reply;
+
+ ceph_con_init(&monc->con, monc, &mon_con_ops,
+ &monc->client->msgr);
+
+ monc->cur_mon = -1;
+ monc->had_a_connection = false;
+ monc->hunt_mult = 1;
+
+ INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
+ monc->generic_request_tree = RB_ROOT;
+ monc->last_tid = 0;
+
+ monc->fs_cluster_id = CEPH_FS_CLUSTER_ID_NONE;
+
+ return 0;
+
+out_auth_reply:
+ ceph_msg_put(monc->m_auth_reply);
+out_subscribe:
+ ceph_msg_put(monc->m_subscribe);
+out_subscribe_ack:
+ ceph_msg_put(monc->m_subscribe_ack);
+out_auth:
+ ceph_auth_destroy(monc->auth);
+out_monmap:
+ kfree(monc->monmap);
+out:
+ return err;
+}
+EXPORT_SYMBOL(ceph_monc_init);
+
+void ceph_monc_stop(struct ceph_mon_client *monc)
+{
+ dout("stop\n");
+
+ mutex_lock(&monc->mutex);
+ __close_session(monc);
+ monc->hunting = false;
+ monc->cur_mon = -1;
+ mutex_unlock(&monc->mutex);
+
+ cancel_delayed_work_sync(&monc->delayed_work);
+
+ /*
+ * flush msgr queue before we destroy ourselves to ensure that:
+ * - any work that references our embedded con is finished.
+ * - any osd_client or other work that may reference an authorizer
+ * finishes before we shut down the auth subsystem.
+ */
+ ceph_msgr_flush();
+
+ ceph_auth_destroy(monc->auth);
+
+ WARN_ON(!RB_EMPTY_ROOT(&monc->generic_request_tree));
+
+ ceph_msg_put(monc->m_auth);
+ ceph_msg_put(monc->m_auth_reply);
+ ceph_msg_put(monc->m_subscribe);
+ ceph_msg_put(monc->m_subscribe_ack);
+
+ kfree(monc->monmap);
+}
+EXPORT_SYMBOL(ceph_monc_stop);
+
+static void finish_hunting(struct ceph_mon_client *monc)
+{
+ if (monc->hunting) {
+ dout("%s found mon%d\n", __func__, monc->cur_mon);
+ monc->hunting = false;
+ monc->had_a_connection = true;
+ un_backoff(monc);
+ __schedule_delayed(monc);
+ }
+}
+
+static void finish_auth(struct ceph_mon_client *monc, int auth_err,
+ bool was_authed)
+{
+ dout("%s auth_err %d was_authed %d\n", __func__, auth_err, was_authed);
+ WARN_ON(auth_err > 0);
+
+ monc->pending_auth = 0;
+ if (auth_err) {
+ monc->client->auth_err = auth_err;
+ wake_up_all(&monc->client->auth_wq);
+ return;
+ }
+
+ if (!was_authed && ceph_auth_is_authenticated(monc->auth)) {
+ dout("%s authenticated, starting session global_id %llu\n",
+ __func__, monc->auth->global_id);
+
+ monc->client->msgr.inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
+ monc->client->msgr.inst.name.num =
+ cpu_to_le64(monc->auth->global_id);
+
+ __send_subscribe(monc);
+ __resend_generic_request(monc);
+
+ pr_info("mon%d %s session established\n", monc->cur_mon,
+ ceph_pr_addr(&monc->con.peer_addr));
+ }
+}
+
+static void handle_auth_reply(struct ceph_mon_client *monc,
+ struct ceph_msg *msg)
+{
+ bool was_authed;
+ int ret;
+
+ mutex_lock(&monc->mutex);
+ was_authed = ceph_auth_is_authenticated(monc->auth);
+ ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
+ msg->front.iov_len,
+ monc->m_auth->front.iov_base,
+ monc->m_auth->front_alloc_len);
+ if (ret > 0) {
+ __send_prepared_auth_request(monc, ret);
+ } else {
+ finish_auth(monc, ret, was_authed);
+ finish_hunting(monc);
+ }
+ mutex_unlock(&monc->mutex);
+}
+
+static int __validate_auth(struct ceph_mon_client *monc)
+{
+ int ret;
+
+ if (monc->pending_auth)
+ return 0;
+
+ ret = ceph_build_auth(monc->auth, monc->m_auth->front.iov_base,
+ monc->m_auth->front_alloc_len);
+ if (ret <= 0)
+ return ret; /* either an error, or no need to authenticate */
+ __send_prepared_auth_request(monc, ret);
+ return 0;
+}
+
+int ceph_monc_validate_auth(struct ceph_mon_client *monc)
+{
+ int ret;
+
+ mutex_lock(&monc->mutex);
+ ret = __validate_auth(monc);
+ mutex_unlock(&monc->mutex);
+ return ret;
+}
+EXPORT_SYMBOL(ceph_monc_validate_auth);
+
+static int mon_get_auth_request(struct ceph_connection *con,
+ void *buf, int *buf_len,
+ void **authorizer, int *authorizer_len)
+{
+ struct ceph_mon_client *monc = con->private;
+ int ret;
+
+ mutex_lock(&monc->mutex);
+ ret = ceph_auth_get_request(monc->auth, buf, *buf_len);
+ mutex_unlock(&monc->mutex);
+ if (ret < 0)
+ return ret;
+
+ *buf_len = ret;
+ *authorizer = NULL;
+ *authorizer_len = 0;
+ return 0;
+}
+
+static int mon_handle_auth_reply_more(struct ceph_connection *con,
+ void *reply, int reply_len,
+ void *buf, int *buf_len,
+ void **authorizer, int *authorizer_len)
+{
+ struct ceph_mon_client *monc = con->private;
+ int ret;
+
+ mutex_lock(&monc->mutex);
+ ret = ceph_auth_handle_reply_more(monc->auth, reply, reply_len,
+ buf, *buf_len);
+ mutex_unlock(&monc->mutex);
+ if (ret < 0)
+ return ret;
+
+ *buf_len = ret;
+ *authorizer = NULL;
+ *authorizer_len = 0;
+ return 0;
+}
+
+static int mon_handle_auth_done(struct ceph_connection *con,
+ u64 global_id, void *reply, int reply_len,
+ u8 *session_key, int *session_key_len,
+ u8 *con_secret, int *con_secret_len)
+{
+ struct ceph_mon_client *monc = con->private;
+ bool was_authed;
+ int ret;
+
+ mutex_lock(&monc->mutex);
+ WARN_ON(!monc->hunting);
+ was_authed = ceph_auth_is_authenticated(monc->auth);
+ ret = ceph_auth_handle_reply_done(monc->auth, global_id,
+ reply, reply_len,
+ session_key, session_key_len,
+ con_secret, con_secret_len);
+ finish_auth(monc, ret, was_authed);
+ if (!ret)
+ finish_hunting(monc);
+ mutex_unlock(&monc->mutex);
+ return 0;
+}
+
+static int mon_handle_auth_bad_method(struct ceph_connection *con,
+ int used_proto, int result,
+ const int *allowed_protos, int proto_cnt,
+ const int *allowed_modes, int mode_cnt)
+{
+ struct ceph_mon_client *monc = con->private;
+ bool was_authed;
+
+ mutex_lock(&monc->mutex);
+ WARN_ON(!monc->hunting);
+ was_authed = ceph_auth_is_authenticated(monc->auth);
+ ceph_auth_handle_bad_method(monc->auth, used_proto, result,
+ allowed_protos, proto_cnt,
+ allowed_modes, mode_cnt);
+ finish_auth(monc, -EACCES, was_authed);
+ mutex_unlock(&monc->mutex);
+ return 0;
+}
+
+/*
+ * handle incoming message
+ */
+static void mon_dispatch(struct ceph_connection *con, struct ceph_msg *msg)
+{
+ struct ceph_mon_client *monc = con->private;
+ int type = le16_to_cpu(msg->hdr.type);
+
+ switch (type) {
+ case CEPH_MSG_AUTH_REPLY:
+ handle_auth_reply(monc, msg);
+ break;
+
+ case CEPH_MSG_MON_SUBSCRIBE_ACK:
+ handle_subscribe_ack(monc, msg);
+ break;
+
+ case CEPH_MSG_STATFS_REPLY:
+ handle_statfs_reply(monc, msg);
+ break;
+
+ case CEPH_MSG_MON_GET_VERSION_REPLY:
+ handle_get_version_reply(monc, msg);
+ break;
+
+ case CEPH_MSG_MON_COMMAND_ACK:
+ handle_command_ack(monc, msg);
+ break;
+
+ case CEPH_MSG_MON_MAP:
+ ceph_monc_handle_map(monc, msg);
+ break;
+
+ case CEPH_MSG_OSD_MAP:
+ ceph_osdc_handle_map(&monc->client->osdc, msg);
+ break;
+
+ default:
+ /* can the chained handler handle it? */
+ if (monc->client->extra_mon_dispatch &&
+ monc->client->extra_mon_dispatch(monc->client, msg) == 0)
+ break;
+
+ pr_err("received unknown message type %d %s\n", type,
+ ceph_msg_type_name(type));
+ }
+ ceph_msg_put(msg);
+}
+
+/*
+ * Allocate memory for incoming message
+ */
+static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
+ struct ceph_msg_header *hdr,
+ int *skip)
+{
+ struct ceph_mon_client *monc = con->private;
+ int type = le16_to_cpu(hdr->type);
+ int front_len = le32_to_cpu(hdr->front_len);
+ struct ceph_msg *m = NULL;
+
+ *skip = 0;
+
+ switch (type) {
+ case CEPH_MSG_MON_SUBSCRIBE_ACK:
+ m = ceph_msg_get(monc->m_subscribe_ack);
+ break;
+ case CEPH_MSG_STATFS_REPLY:
+ case CEPH_MSG_MON_COMMAND_ACK:
+ return get_generic_reply(con, hdr, skip);
+ case CEPH_MSG_AUTH_REPLY:
+ m = ceph_msg_get(monc->m_auth_reply);
+ break;
+ case CEPH_MSG_MON_GET_VERSION_REPLY:
+ if (le64_to_cpu(hdr->tid) != 0)
+ return get_generic_reply(con, hdr, skip);
+
+ /*
+ * Older OSDs don't set reply tid even if the original
+ * request had a non-zero tid. Work around this weirdness
+ * by allocating a new message.
+ */
+ fallthrough;
+ case CEPH_MSG_MON_MAP:
+ case CEPH_MSG_MDS_MAP:
+ case CEPH_MSG_OSD_MAP:
+ case CEPH_MSG_FS_MAP_USER:
+ m = ceph_msg_new(type, front_len, GFP_NOFS, false);
+ if (!m)
+ return NULL; /* ENOMEM--return skip == 0 */
+ break;
+ }
+
+ if (!m) {
+ pr_info("alloc_msg unknown type %d\n", type);
+ *skip = 1;
+ } else if (front_len > m->front_alloc_len) {
+ pr_warn("mon_alloc_msg front %d > prealloc %d (%u#%llu)\n",
+ front_len, m->front_alloc_len,
+ (unsigned int)con->peer_name.type,
+ le64_to_cpu(con->peer_name.num));
+ ceph_msg_put(m);
+ m = ceph_msg_new(type, front_len, GFP_NOFS, false);
+ }
+
+ return m;
+}
+
+/*
+ * If the monitor connection resets, pick a new monitor and resubmit
+ * any pending requests.
+ */
+static void mon_fault(struct ceph_connection *con)
+{
+ struct ceph_mon_client *monc = con->private;
+
+ mutex_lock(&monc->mutex);
+ dout("%s mon%d\n", __func__, monc->cur_mon);
+ if (monc->cur_mon >= 0) {
+ if (!monc->hunting) {
+ dout("%s hunting for new mon\n", __func__);
+ reopen_session(monc);
+ __schedule_delayed(monc);
+ } else {
+ dout("%s already hunting\n", __func__);
+ }
+ }
+ mutex_unlock(&monc->mutex);
+}
+
+/*
+ * We can ignore refcounting on the connection struct, as all references
+ * will come from the messenger workqueue, which is drained prior to
+ * mon_client destruction.
+ */
+static struct ceph_connection *mon_get_con(struct ceph_connection *con)
+{
+ return con;
+}
+
+static void mon_put_con(struct ceph_connection *con)
+{
+}
+
+static const struct ceph_connection_operations mon_con_ops = {
+ .get = mon_get_con,
+ .put = mon_put_con,
+ .alloc_msg = mon_alloc_msg,
+ .dispatch = mon_dispatch,
+ .fault = mon_fault,
+ .get_auth_request = mon_get_auth_request,
+ .handle_auth_reply_more = mon_handle_auth_reply_more,
+ .handle_auth_done = mon_handle_auth_done,
+ .handle_auth_bad_method = mon_handle_auth_bad_method,
+};
diff --git a/net/ceph/msgpool.c b/net/ceph/msgpool.c
new file mode 100644
index 000000000000..e3ecb80cd182
--- /dev/null
+++ b/net/ceph/msgpool.c
@@ -0,0 +1,94 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/err.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+#include <linux/vmalloc.h>
+
+#include <linux/ceph/messenger.h>
+#include <linux/ceph/msgpool.h>
+
+static void *msgpool_alloc(gfp_t gfp_mask, void *arg)
+{
+ struct ceph_msgpool *pool = arg;
+ struct ceph_msg *msg;
+
+ msg = ceph_msg_new2(pool->type, pool->front_len, pool->max_data_items,
+ gfp_mask, true);
+ if (!msg) {
+ dout("msgpool_alloc %s failed\n", pool->name);
+ } else {
+ dout("msgpool_alloc %s %p\n", pool->name, msg);
+ msg->pool = pool;
+ }
+ return msg;
+}
+
+static void msgpool_free(void *element, void *arg)
+{
+ struct ceph_msgpool *pool = arg;
+ struct ceph_msg *msg = element;
+
+ dout("msgpool_release %s %p\n", pool->name, msg);
+ msg->pool = NULL;
+ ceph_msg_put(msg);
+}
+
+int ceph_msgpool_init(struct ceph_msgpool *pool, int type,
+ int front_len, int max_data_items, int size,
+ const char *name)
+{
+ dout("msgpool %s init\n", name);
+ pool->type = type;
+ pool->front_len = front_len;
+ pool->max_data_items = max_data_items;
+ pool->pool = mempool_create(size, msgpool_alloc, msgpool_free, pool);
+ if (!pool->pool)
+ return -ENOMEM;
+ pool->name = name;
+ return 0;
+}
+
+void ceph_msgpool_destroy(struct ceph_msgpool *pool)
+{
+ dout("msgpool %s destroy\n", pool->name);
+ mempool_destroy(pool->pool);
+}
+
+struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, int front_len,
+ int max_data_items)
+{
+ struct ceph_msg *msg;
+
+ if (front_len > pool->front_len ||
+ max_data_items > pool->max_data_items) {
+ pr_warn_ratelimited("%s need %d/%d, pool %s has %d/%d\n",
+ __func__, front_len, max_data_items, pool->name,
+ pool->front_len, pool->max_data_items);
+ WARN_ON_ONCE(1);
+
+ /* try to alloc a fresh message */
+ return ceph_msg_new2(pool->type, front_len, max_data_items,
+ GFP_NOFS, false);
+ }
+
+ msg = mempool_alloc(pool->pool, GFP_NOFS);
+ dout("msgpool_get %s %p\n", pool->name, msg);
+ return msg;
+}
+
+void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg)
+{
+ dout("msgpool_put %s %p\n", pool->name, msg);
+
+ /* reset msg front_len; user may have changed it */
+ msg->front.iov_len = pool->front_len;
+ msg->hdr.front_len = cpu_to_le32(pool->front_len);
+
+ msg->data_length = 0;
+ msg->num_data_items = 0;
+
+ kref_init(&msg->kref); /* retake single ref */
+ mempool_free(msg, pool->pool);
+}
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
new file mode 100644
index 000000000000..6664ea73ccf8
--- /dev/null
+++ b/net/ceph/osd_client.c
@@ -0,0 +1,5917 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/module.h>
+#include <linux/err.h>
+#include <linux/highmem.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#ifdef CONFIG_BLOCK
+#include <linux/bio.h>
+#endif
+
+#include <linux/ceph/ceph_features.h>
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/osd_client.h>
+#include <linux/ceph/messenger.h>
+#include <linux/ceph/decode.h>
+#include <linux/ceph/auth.h>
+#include <linux/ceph/pagelist.h>
+#include <linux/ceph/striper.h>
+
+#define OSD_OPREPLY_FRONT_LEN 512
+
+static struct kmem_cache *ceph_osd_request_cache;
+
+static const struct ceph_connection_operations osd_con_ops;
+
+/*
+ * Implement client access to distributed object storage cluster.
+ *
+ * All data objects are stored within a cluster/cloud of OSDs, or
+ * "object storage devices." (Note that Ceph OSDs have _nothing_ to
+ * do with the T10 OSD extensions to SCSI.) Ceph OSDs are simply
+ * remote daemons serving up and coordinating consistent and safe
+ * access to storage.
+ *
+ * Cluster membership and the mapping of data objects onto storage devices
+ * are described by the osd map.
+ *
+ * We keep track of pending OSD requests (read, write), resubmit
+ * requests to different OSDs when the cluster topology/data layout
+ * change, or retry the affected requests when the communications
+ * channel with an OSD is reset.
+ */
+
+static void link_request(struct ceph_osd *osd, struct ceph_osd_request *req);
+static void unlink_request(struct ceph_osd *osd, struct ceph_osd_request *req);
+static void link_linger(struct ceph_osd *osd,
+ struct ceph_osd_linger_request *lreq);
+static void unlink_linger(struct ceph_osd *osd,
+ struct ceph_osd_linger_request *lreq);
+static void clear_backoffs(struct ceph_osd *osd);
+
+#if 1
+static inline bool rwsem_is_wrlocked(struct rw_semaphore *sem)
+{
+ bool wrlocked = true;
+
+ if (unlikely(down_read_trylock(sem))) {
+ wrlocked = false;
+ up_read(sem);
+ }
+
+ return wrlocked;
+}
+static inline void verify_osdc_locked(struct ceph_osd_client *osdc)
+{
+ WARN_ON(!rwsem_is_locked(&osdc->lock));
+}
+static inline void verify_osdc_wrlocked(struct ceph_osd_client *osdc)
+{
+ WARN_ON(!rwsem_is_wrlocked(&osdc->lock));
+}
+static inline void verify_osd_locked(struct ceph_osd *osd)
+{
+ struct ceph_osd_client *osdc = osd->o_osdc;
+
+ WARN_ON(!(mutex_is_locked(&osd->lock) &&
+ rwsem_is_locked(&osdc->lock)) &&
+ !rwsem_is_wrlocked(&osdc->lock));
+}
+static inline void verify_lreq_locked(struct ceph_osd_linger_request *lreq)
+{
+ WARN_ON(!mutex_is_locked(&lreq->lock));
+}
+#else
+static inline void verify_osdc_locked(struct ceph_osd_client *osdc) { }
+static inline void verify_osdc_wrlocked(struct ceph_osd_client *osdc) { }
+static inline void verify_osd_locked(struct ceph_osd *osd) { }
+static inline void verify_lreq_locked(struct ceph_osd_linger_request *lreq) { }
+#endif
+
+/*
+ * calculate the mapping of a file extent onto an object, and fill out the
+ * request accordingly. shorten extent as necessary if it crosses an
+ * object boundary.
+ *
+ * fill osd op in request message.
+ */
+static int calc_layout(struct ceph_file_layout *layout, u64 off, u64 *plen,
+ u64 *objnum, u64 *objoff, u64 *objlen)
+{
+ u64 orig_len = *plen;
+ u32 xlen;
+
+ /* object extent? */
+ ceph_calc_file_object_mapping(layout, off, orig_len, objnum,
+ objoff, &xlen);
+ *objlen = xlen;
+ if (*objlen < orig_len) {
+ *plen = *objlen;
+ dout(" skipping last %llu, final file extent %llu~%llu\n",
+ orig_len - *plen, off, *plen);
+ }
+
+ dout("calc_layout objnum=%llx %llu~%llu\n", *objnum, *objoff, *objlen);
+ return 0;
+}
+
+static void ceph_osd_data_init(struct ceph_osd_data *osd_data)
+{
+ memset(osd_data, 0, sizeof (*osd_data));
+ osd_data->type = CEPH_OSD_DATA_TYPE_NONE;
+}
+
+/*
+ * Consumes @pages if @own_pages is true.
+ */
+static void ceph_osd_data_pages_init(struct ceph_osd_data *osd_data,
+ struct page **pages, u64 length, u32 alignment,
+ bool pages_from_pool, bool own_pages)
+{
+ osd_data->type = CEPH_OSD_DATA_TYPE_PAGES;
+ osd_data->pages = pages;
+ osd_data->length = length;
+ osd_data->alignment = alignment;
+ osd_data->pages_from_pool = pages_from_pool;
+ osd_data->own_pages = own_pages;
+}
+
+/*
+ * Consumes a ref on @pagelist.
+ */
+static void ceph_osd_data_pagelist_init(struct ceph_osd_data *osd_data,
+ struct ceph_pagelist *pagelist)
+{
+ osd_data->type = CEPH_OSD_DATA_TYPE_PAGELIST;
+ osd_data->pagelist = pagelist;
+}
+
+#ifdef CONFIG_BLOCK
+static void ceph_osd_data_bio_init(struct ceph_osd_data *osd_data,
+ struct ceph_bio_iter *bio_pos,
+ u32 bio_length)
+{
+ osd_data->type = CEPH_OSD_DATA_TYPE_BIO;
+ osd_data->bio_pos = *bio_pos;
+ osd_data->bio_length = bio_length;
+}
+#endif /* CONFIG_BLOCK */
+
+static void ceph_osd_data_bvecs_init(struct ceph_osd_data *osd_data,
+ struct ceph_bvec_iter *bvec_pos,
+ u32 num_bvecs)
+{
+ osd_data->type = CEPH_OSD_DATA_TYPE_BVECS;
+ osd_data->bvec_pos = *bvec_pos;
+ osd_data->num_bvecs = num_bvecs;
+}
+
+static void ceph_osd_iter_init(struct ceph_osd_data *osd_data,
+ struct iov_iter *iter)
+{
+ osd_data->type = CEPH_OSD_DATA_TYPE_ITER;
+ osd_data->iter = *iter;
+}
+
+static struct ceph_osd_data *
+osd_req_op_raw_data_in(struct ceph_osd_request *osd_req, unsigned int which)
+{
+ BUG_ON(which >= osd_req->r_num_ops);
+
+ return &osd_req->r_ops[which].raw_data_in;
+}
+
+struct ceph_osd_data *
+osd_req_op_extent_osd_data(struct ceph_osd_request *osd_req,
+ unsigned int which)
+{
+ return osd_req_op_data(osd_req, which, extent, osd_data);
+}
+EXPORT_SYMBOL(osd_req_op_extent_osd_data);
+
+void osd_req_op_raw_data_in_pages(struct ceph_osd_request *osd_req,
+ unsigned int which, struct page **pages,
+ u64 length, u32 alignment,
+ bool pages_from_pool, bool own_pages)
+{
+ struct ceph_osd_data *osd_data;
+
+ osd_data = osd_req_op_raw_data_in(osd_req, which);
+ ceph_osd_data_pages_init(osd_data, pages, length, alignment,
+ pages_from_pool, own_pages);
+}
+EXPORT_SYMBOL(osd_req_op_raw_data_in_pages);
+
+void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *osd_req,
+ unsigned int which, struct page **pages,
+ u64 length, u32 alignment,
+ bool pages_from_pool, bool own_pages)
+{
+ struct ceph_osd_data *osd_data;
+
+ osd_data = osd_req_op_data(osd_req, which, extent, osd_data);
+ ceph_osd_data_pages_init(osd_data, pages, length, alignment,
+ pages_from_pool, own_pages);
+}
+EXPORT_SYMBOL(osd_req_op_extent_osd_data_pages);
+
+#ifdef CONFIG_BLOCK
+void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *osd_req,
+ unsigned int which,
+ struct ceph_bio_iter *bio_pos,
+ u32 bio_length)
+{
+ struct ceph_osd_data *osd_data;
+
+ osd_data = osd_req_op_data(osd_req, which, extent, osd_data);
+ ceph_osd_data_bio_init(osd_data, bio_pos, bio_length);
+}
+EXPORT_SYMBOL(osd_req_op_extent_osd_data_bio);
+#endif /* CONFIG_BLOCK */
+
+void osd_req_op_extent_osd_data_bvecs(struct ceph_osd_request *osd_req,
+ unsigned int which,
+ struct bio_vec *bvecs, u32 num_bvecs,
+ u32 bytes)
+{
+ struct ceph_osd_data *osd_data;
+ struct ceph_bvec_iter it = {
+ .bvecs = bvecs,
+ .iter = { .bi_size = bytes },
+ };
+
+ osd_data = osd_req_op_data(osd_req, which, extent, osd_data);
+ ceph_osd_data_bvecs_init(osd_data, &it, num_bvecs);
+}
+EXPORT_SYMBOL(osd_req_op_extent_osd_data_bvecs);
+
+void osd_req_op_extent_osd_data_bvec_pos(struct ceph_osd_request *osd_req,
+ unsigned int which,
+ struct ceph_bvec_iter *bvec_pos)
+{
+ struct ceph_osd_data *osd_data;
+
+ osd_data = osd_req_op_data(osd_req, which, extent, osd_data);
+ ceph_osd_data_bvecs_init(osd_data, bvec_pos, 0);
+}
+EXPORT_SYMBOL(osd_req_op_extent_osd_data_bvec_pos);
+
+/**
+ * osd_req_op_extent_osd_iter - Set up an operation with an iterator buffer
+ * @osd_req: The request to set up
+ * @which: Index of the operation in which to set the iter
+ * @iter: The buffer iterator
+ */
+void osd_req_op_extent_osd_iter(struct ceph_osd_request *osd_req,
+ unsigned int which, struct iov_iter *iter)
+{
+ struct ceph_osd_data *osd_data;
+
+ osd_data = osd_req_op_data(osd_req, which, extent, osd_data);
+ ceph_osd_iter_init(osd_data, iter);
+}
+EXPORT_SYMBOL(osd_req_op_extent_osd_iter);
+
+static void osd_req_op_cls_request_info_pagelist(
+ struct ceph_osd_request *osd_req,
+ unsigned int which, struct ceph_pagelist *pagelist)
+{
+ struct ceph_osd_data *osd_data;
+
+ osd_data = osd_req_op_data(osd_req, which, cls, request_info);
+ ceph_osd_data_pagelist_init(osd_data, pagelist);
+}
+
+void osd_req_op_cls_request_data_pages(struct ceph_osd_request *osd_req,
+ unsigned int which, struct page **pages, u64 length,
+ u32 alignment, bool pages_from_pool, bool own_pages)
+{
+ struct ceph_osd_data *osd_data;
+
+ osd_data = osd_req_op_data(osd_req, which, cls, request_data);
+ ceph_osd_data_pages_init(osd_data, pages, length, alignment,
+ pages_from_pool, own_pages);
+ osd_req->r_ops[which].cls.indata_len += length;
+ osd_req->r_ops[which].indata_len += length;
+}
+EXPORT_SYMBOL(osd_req_op_cls_request_data_pages);
+
+void osd_req_op_cls_request_data_bvecs(struct ceph_osd_request *osd_req,
+ unsigned int which,
+ struct bio_vec *bvecs, u32 num_bvecs,
+ u32 bytes)
+{
+ struct ceph_osd_data *osd_data;
+ struct ceph_bvec_iter it = {
+ .bvecs = bvecs,
+ .iter = { .bi_size = bytes },
+ };
+
+ osd_data = osd_req_op_data(osd_req, which, cls, request_data);
+ ceph_osd_data_bvecs_init(osd_data, &it, num_bvecs);
+ osd_req->r_ops[which].cls.indata_len += bytes;
+ osd_req->r_ops[which].indata_len += bytes;
+}
+EXPORT_SYMBOL(osd_req_op_cls_request_data_bvecs);
+
+void osd_req_op_cls_response_data_pages(struct ceph_osd_request *osd_req,
+ unsigned int which, struct page **pages, u64 length,
+ u32 alignment, bool pages_from_pool, bool own_pages)
+{
+ struct ceph_osd_data *osd_data;
+
+ osd_data = osd_req_op_data(osd_req, which, cls, response_data);
+ ceph_osd_data_pages_init(osd_data, pages, length, alignment,
+ pages_from_pool, own_pages);
+}
+EXPORT_SYMBOL(osd_req_op_cls_response_data_pages);
+
+static u64 ceph_osd_data_length(struct ceph_osd_data *osd_data)
+{
+ switch (osd_data->type) {
+ case CEPH_OSD_DATA_TYPE_NONE:
+ return 0;
+ case CEPH_OSD_DATA_TYPE_PAGES:
+ return osd_data->length;
+ case CEPH_OSD_DATA_TYPE_PAGELIST:
+ return (u64)osd_data->pagelist->length;
+#ifdef CONFIG_BLOCK
+ case CEPH_OSD_DATA_TYPE_BIO:
+ return (u64)osd_data->bio_length;
+#endif /* CONFIG_BLOCK */
+ case CEPH_OSD_DATA_TYPE_BVECS:
+ return osd_data->bvec_pos.iter.bi_size;
+ case CEPH_OSD_DATA_TYPE_ITER:
+ return iov_iter_count(&osd_data->iter);
+ default:
+ WARN(true, "unrecognized data type %d\n", (int)osd_data->type);
+ return 0;
+ }
+}
+
+static void ceph_osd_data_release(struct ceph_osd_data *osd_data)
+{
+ if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES && osd_data->own_pages) {
+ int num_pages;
+
+ num_pages = calc_pages_for((u64)osd_data->alignment,
+ (u64)osd_data->length);
+ ceph_release_page_vector(osd_data->pages, num_pages);
+ } else if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGELIST) {
+ ceph_pagelist_release(osd_data->pagelist);
+ }
+ ceph_osd_data_init(osd_data);
+}
+
+static void osd_req_op_data_release(struct ceph_osd_request *osd_req,
+ unsigned int which)
+{
+ struct ceph_osd_req_op *op;
+
+ BUG_ON(which >= osd_req->r_num_ops);
+ op = &osd_req->r_ops[which];
+
+ switch (op->op) {
+ case CEPH_OSD_OP_READ:
+ case CEPH_OSD_OP_SPARSE_READ:
+ case CEPH_OSD_OP_WRITE:
+ case CEPH_OSD_OP_WRITEFULL:
+ kfree(op->extent.sparse_ext);
+ ceph_osd_data_release(&op->extent.osd_data);
+ break;
+ case CEPH_OSD_OP_CALL:
+ ceph_osd_data_release(&op->cls.request_info);
+ ceph_osd_data_release(&op->cls.request_data);
+ ceph_osd_data_release(&op->cls.response_data);
+ break;
+ case CEPH_OSD_OP_SETXATTR:
+ case CEPH_OSD_OP_CMPXATTR:
+ ceph_osd_data_release(&op->xattr.osd_data);
+ break;
+ case CEPH_OSD_OP_STAT:
+ ceph_osd_data_release(&op->raw_data_in);
+ break;
+ case CEPH_OSD_OP_NOTIFY_ACK:
+ ceph_osd_data_release(&op->notify_ack.request_data);
+ break;
+ case CEPH_OSD_OP_NOTIFY:
+ ceph_osd_data_release(&op->notify.request_data);
+ ceph_osd_data_release(&op->notify.response_data);
+ break;
+ case CEPH_OSD_OP_LIST_WATCHERS:
+ ceph_osd_data_release(&op->list_watchers.response_data);
+ break;
+ case CEPH_OSD_OP_COPY_FROM2:
+ ceph_osd_data_release(&op->copy_from.osd_data);
+ break;
+ default:
+ break;
+ }
+}
+
+/*
+ * Assumes @t is zero-initialized.
+ */
+static void target_init(struct ceph_osd_request_target *t)
+{
+ ceph_oid_init(&t->base_oid);
+ ceph_oloc_init(&t->base_oloc);
+ ceph_oid_init(&t->target_oid);
+ ceph_oloc_init(&t->target_oloc);
+
+ ceph_osds_init(&t->acting);
+ ceph_osds_init(&t->up);
+ t->size = -1;
+ t->min_size = -1;
+
+ t->osd = CEPH_HOMELESS_OSD;
+}
+
+static void target_copy(struct ceph_osd_request_target *dest,
+ const struct ceph_osd_request_target *src)
+{
+ ceph_oid_copy(&dest->base_oid, &src->base_oid);
+ ceph_oloc_copy(&dest->base_oloc, &src->base_oloc);
+ ceph_oid_copy(&dest->target_oid, &src->target_oid);
+ ceph_oloc_copy(&dest->target_oloc, &src->target_oloc);
+
+ dest->pgid = src->pgid; /* struct */
+ dest->spgid = src->spgid; /* struct */
+ dest->pg_num = src->pg_num;
+ dest->pg_num_mask = src->pg_num_mask;
+ ceph_osds_copy(&dest->acting, &src->acting);
+ ceph_osds_copy(&dest->up, &src->up);
+ dest->size = src->size;
+ dest->min_size = src->min_size;
+ dest->sort_bitwise = src->sort_bitwise;
+ dest->recovery_deletes = src->recovery_deletes;
+
+ dest->flags = src->flags;
+ dest->used_replica = src->used_replica;
+ dest->paused = src->paused;
+
+ dest->epoch = src->epoch;
+ dest->last_force_resend = src->last_force_resend;
+
+ dest->osd = src->osd;
+}
+
+static void target_destroy(struct ceph_osd_request_target *t)
+{
+ ceph_oid_destroy(&t->base_oid);
+ ceph_oloc_destroy(&t->base_oloc);
+ ceph_oid_destroy(&t->target_oid);
+ ceph_oloc_destroy(&t->target_oloc);
+}
+
+/*
+ * requests
+ */
+static void request_release_checks(struct ceph_osd_request *req)
+{
+ WARN_ON(!RB_EMPTY_NODE(&req->r_node));
+ WARN_ON(!RB_EMPTY_NODE(&req->r_mc_node));
+ WARN_ON(!list_empty(&req->r_private_item));
+ WARN_ON(req->r_osd);
+}
+
+static void ceph_osdc_release_request(struct kref *kref)
+{
+ struct ceph_osd_request *req = container_of(kref,
+ struct ceph_osd_request, r_kref);
+ unsigned int which;
+
+ dout("%s %p (r_request %p r_reply %p)\n", __func__, req,
+ req->r_request, req->r_reply);
+ request_release_checks(req);
+
+ if (req->r_request)
+ ceph_msg_put(req->r_request);
+ if (req->r_reply)
+ ceph_msg_put(req->r_reply);
+
+ for (which = 0; which < req->r_num_ops; which++)
+ osd_req_op_data_release(req, which);
+
+ target_destroy(&req->r_t);
+ ceph_put_snap_context(req->r_snapc);
+
+ if (req->r_mempool)
+ mempool_free(req, req->r_osdc->req_mempool);
+ else if (req->r_num_ops <= CEPH_OSD_SLAB_OPS)
+ kmem_cache_free(ceph_osd_request_cache, req);
+ else
+ kfree(req);
+}
+
+void ceph_osdc_get_request(struct ceph_osd_request *req)
+{
+ dout("%s %p (was %d)\n", __func__, req,
+ kref_read(&req->r_kref));
+ kref_get(&req->r_kref);
+}
+EXPORT_SYMBOL(ceph_osdc_get_request);
+
+void ceph_osdc_put_request(struct ceph_osd_request *req)
+{
+ if (req) {
+ dout("%s %p (was %d)\n", __func__, req,
+ kref_read(&req->r_kref));
+ kref_put(&req->r_kref, ceph_osdc_release_request);
+ }
+}
+EXPORT_SYMBOL(ceph_osdc_put_request);
+
+static void request_init(struct ceph_osd_request *req)
+{
+ /* req only, each op is zeroed in osd_req_op_init() */
+ memset(req, 0, sizeof(*req));
+
+ kref_init(&req->r_kref);
+ init_completion(&req->r_completion);
+ RB_CLEAR_NODE(&req->r_node);
+ RB_CLEAR_NODE(&req->r_mc_node);
+ INIT_LIST_HEAD(&req->r_private_item);
+
+ target_init(&req->r_t);
+}
+
+struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
+ struct ceph_snap_context *snapc,
+ unsigned int num_ops,
+ bool use_mempool,
+ gfp_t gfp_flags)
+{
+ struct ceph_osd_request *req;
+
+ if (use_mempool) {
+ BUG_ON(num_ops > CEPH_OSD_SLAB_OPS);
+ req = mempool_alloc(osdc->req_mempool, gfp_flags);
+ } else if (num_ops <= CEPH_OSD_SLAB_OPS) {
+ req = kmem_cache_alloc(ceph_osd_request_cache, gfp_flags);
+ } else {
+ BUG_ON(num_ops > CEPH_OSD_MAX_OPS);
+ req = kmalloc(struct_size(req, r_ops, num_ops), gfp_flags);
+ }
+ if (unlikely(!req))
+ return NULL;
+
+ request_init(req);
+ req->r_osdc = osdc;
+ req->r_mempool = use_mempool;
+ req->r_num_ops = num_ops;
+ req->r_snapid = CEPH_NOSNAP;
+ req->r_snapc = ceph_get_snap_context(snapc);
+
+ dout("%s req %p\n", __func__, req);
+ return req;
+}
+EXPORT_SYMBOL(ceph_osdc_alloc_request);
+
+static int ceph_oloc_encoding_size(const struct ceph_object_locator *oloc)
+{
+ return 8 + 4 + 4 + 4 + (oloc->pool_ns ? oloc->pool_ns->len : 0);
+}
+
+static int __ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp,
+ int num_request_data_items,
+ int num_reply_data_items)
+{
+ struct ceph_osd_client *osdc = req->r_osdc;
+ struct ceph_msg *msg;
+ int msg_size;
+
+ WARN_ON(req->r_request || req->r_reply);
+ WARN_ON(ceph_oid_empty(&req->r_base_oid));
+ WARN_ON(ceph_oloc_empty(&req->r_base_oloc));
+
+ /* create request message */
+ msg_size = CEPH_ENCODING_START_BLK_LEN +
+ CEPH_PGID_ENCODING_LEN + 1; /* spgid */
+ msg_size += 4 + 4 + 4; /* hash, osdmap_epoch, flags */
+ msg_size += CEPH_ENCODING_START_BLK_LEN +
+ sizeof(struct ceph_osd_reqid); /* reqid */
+ msg_size += sizeof(struct ceph_blkin_trace_info); /* trace */
+ msg_size += 4 + sizeof(struct ceph_timespec); /* client_inc, mtime */
+ msg_size += CEPH_ENCODING_START_BLK_LEN +
+ ceph_oloc_encoding_size(&req->r_base_oloc); /* oloc */
+ msg_size += 4 + req->r_base_oid.name_len; /* oid */
+ msg_size += 2 + req->r_num_ops * sizeof(struct ceph_osd_op);
+ msg_size += 8; /* snapid */
+ msg_size += 8; /* snap_seq */
+ msg_size += 4 + 8 * (req->r_snapc ? req->r_snapc->num_snaps : 0);
+ msg_size += 4 + 8; /* retry_attempt, features */
+
+ if (req->r_mempool)
+ msg = ceph_msgpool_get(&osdc->msgpool_op, msg_size,
+ num_request_data_items);
+ else
+ msg = ceph_msg_new2(CEPH_MSG_OSD_OP, msg_size,
+ num_request_data_items, gfp, true);
+ if (!msg)
+ return -ENOMEM;
+
+ memset(msg->front.iov_base, 0, msg->front.iov_len);
+ req->r_request = msg;
+
+ /* create reply message */
+ msg_size = OSD_OPREPLY_FRONT_LEN;
+ msg_size += req->r_base_oid.name_len;
+ msg_size += req->r_num_ops * sizeof(struct ceph_osd_op);
+
+ if (req->r_mempool)
+ msg = ceph_msgpool_get(&osdc->msgpool_op_reply, msg_size,
+ num_reply_data_items);
+ else
+ msg = ceph_msg_new2(CEPH_MSG_OSD_OPREPLY, msg_size,
+ num_reply_data_items, gfp, true);
+ if (!msg)
+ return -ENOMEM;
+
+ req->r_reply = msg;
+
+ return 0;
+}
+
+static bool osd_req_opcode_valid(u16 opcode)
+{
+ switch (opcode) {
+#define GENERATE_CASE(op, opcode, str) case CEPH_OSD_OP_##op: return true;
+__CEPH_FORALL_OSD_OPS(GENERATE_CASE)
+#undef GENERATE_CASE
+ default:
+ return false;
+ }
+}
+
+static void get_num_data_items(struct ceph_osd_request *req,
+ int *num_request_data_items,
+ int *num_reply_data_items)
+{
+ struct ceph_osd_req_op *op;
+
+ *num_request_data_items = 0;
+ *num_reply_data_items = 0;
+
+ for (op = req->r_ops; op != &req->r_ops[req->r_num_ops]; op++) {
+ switch (op->op) {
+ /* request */
+ case CEPH_OSD_OP_WRITE:
+ case CEPH_OSD_OP_WRITEFULL:
+ case CEPH_OSD_OP_SETXATTR:
+ case CEPH_OSD_OP_CMPXATTR:
+ case CEPH_OSD_OP_NOTIFY_ACK:
+ case CEPH_OSD_OP_COPY_FROM2:
+ *num_request_data_items += 1;
+ break;
+
+ /* reply */
+ case CEPH_OSD_OP_STAT:
+ case CEPH_OSD_OP_READ:
+ case CEPH_OSD_OP_SPARSE_READ:
+ case CEPH_OSD_OP_LIST_WATCHERS:
+ *num_reply_data_items += 1;
+ break;
+
+ /* both */
+ case CEPH_OSD_OP_NOTIFY:
+ *num_request_data_items += 1;
+ *num_reply_data_items += 1;
+ break;
+ case CEPH_OSD_OP_CALL:
+ *num_request_data_items += 2;
+ *num_reply_data_items += 1;
+ break;
+
+ default:
+ WARN_ON(!osd_req_opcode_valid(op->op));
+ break;
+ }
+ }
+}
+
+/*
+ * oid, oloc and OSD op opcode(s) must be filled in before this function
+ * is called.
+ */
+int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp)
+{
+ int num_request_data_items, num_reply_data_items;
+
+ get_num_data_items(req, &num_request_data_items, &num_reply_data_items);
+ return __ceph_osdc_alloc_messages(req, gfp, num_request_data_items,
+ num_reply_data_items);
+}
+EXPORT_SYMBOL(ceph_osdc_alloc_messages);
+
+/*
+ * This is an osd op init function for opcodes that have no data or
+ * other information associated with them. It also serves as a
+ * common init routine for all the other init functions, below.
+ */
+struct ceph_osd_req_op *
+osd_req_op_init(struct ceph_osd_request *osd_req, unsigned int which,
+ u16 opcode, u32 flags)
+{
+ struct ceph_osd_req_op *op;
+
+ BUG_ON(which >= osd_req->r_num_ops);
+ BUG_ON(!osd_req_opcode_valid(opcode));
+
+ op = &osd_req->r_ops[which];
+ memset(op, 0, sizeof (*op));
+ op->op = opcode;
+ op->flags = flags;
+
+ return op;
+}
+EXPORT_SYMBOL(osd_req_op_init);
+
+void osd_req_op_extent_init(struct ceph_osd_request *osd_req,
+ unsigned int which, u16 opcode,
+ u64 offset, u64 length,
+ u64 truncate_size, u32 truncate_seq)
+{
+ struct ceph_osd_req_op *op = osd_req_op_init(osd_req, which,
+ opcode, 0);
+ size_t payload_len = 0;
+
+ BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE &&
+ opcode != CEPH_OSD_OP_WRITEFULL && opcode != CEPH_OSD_OP_ZERO &&
+ opcode != CEPH_OSD_OP_TRUNCATE && opcode != CEPH_OSD_OP_SPARSE_READ);
+
+ op->extent.offset = offset;
+ op->extent.length = length;
+ op->extent.truncate_size = truncate_size;
+ op->extent.truncate_seq = truncate_seq;
+ if (opcode == CEPH_OSD_OP_WRITE || opcode == CEPH_OSD_OP_WRITEFULL)
+ payload_len += length;
+
+ op->indata_len = payload_len;
+}
+EXPORT_SYMBOL(osd_req_op_extent_init);
+
+void osd_req_op_extent_update(struct ceph_osd_request *osd_req,
+ unsigned int which, u64 length)
+{
+ struct ceph_osd_req_op *op;
+ u64 previous;
+
+ BUG_ON(which >= osd_req->r_num_ops);
+ op = &osd_req->r_ops[which];
+ previous = op->extent.length;
+
+ if (length == previous)
+ return; /* Nothing to do */
+ BUG_ON(length > previous);
+
+ op->extent.length = length;
+ if (op->op == CEPH_OSD_OP_WRITE || op->op == CEPH_OSD_OP_WRITEFULL)
+ op->indata_len -= previous - length;
+}
+EXPORT_SYMBOL(osd_req_op_extent_update);
+
+void osd_req_op_extent_dup_last(struct ceph_osd_request *osd_req,
+ unsigned int which, u64 offset_inc)
+{
+ struct ceph_osd_req_op *op, *prev_op;
+
+ BUG_ON(which + 1 >= osd_req->r_num_ops);
+
+ prev_op = &osd_req->r_ops[which];
+ op = osd_req_op_init(osd_req, which + 1, prev_op->op, prev_op->flags);
+ /* dup previous one */
+ op->indata_len = prev_op->indata_len;
+ op->outdata_len = prev_op->outdata_len;
+ op->extent = prev_op->extent;
+ /* adjust offset */
+ op->extent.offset += offset_inc;
+ op->extent.length -= offset_inc;
+
+ if (op->op == CEPH_OSD_OP_WRITE || op->op == CEPH_OSD_OP_WRITEFULL)
+ op->indata_len -= offset_inc;
+}
+EXPORT_SYMBOL(osd_req_op_extent_dup_last);
+
+int osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which,
+ const char *class, const char *method)
+{
+ struct ceph_osd_req_op *op;
+ struct ceph_pagelist *pagelist;
+ size_t payload_len = 0;
+ size_t size;
+ int ret;
+
+ op = osd_req_op_init(osd_req, which, CEPH_OSD_OP_CALL, 0);
+
+ pagelist = ceph_pagelist_alloc(GFP_NOFS);
+ if (!pagelist)
+ return -ENOMEM;
+
+ op->cls.class_name = class;
+ size = strlen(class);
+ BUG_ON(size > (size_t) U8_MAX);
+ op->cls.class_len = size;
+ ret = ceph_pagelist_append(pagelist, class, size);
+ if (ret)
+ goto err_pagelist_free;
+ payload_len += size;
+
+ op->cls.method_name = method;
+ size = strlen(method);
+ BUG_ON(size > (size_t) U8_MAX);
+ op->cls.method_len = size;
+ ret = ceph_pagelist_append(pagelist, method, size);
+ if (ret)
+ goto err_pagelist_free;
+ payload_len += size;
+
+ osd_req_op_cls_request_info_pagelist(osd_req, which, pagelist);
+ op->indata_len = payload_len;
+ return 0;
+
+err_pagelist_free:
+ ceph_pagelist_release(pagelist);
+ return ret;
+}
+EXPORT_SYMBOL(osd_req_op_cls_init);
+
+int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which,
+ u16 opcode, const char *name, const void *value,
+ size_t size, u8 cmp_op, u8 cmp_mode)
+{
+ struct ceph_osd_req_op *op = osd_req_op_init(osd_req, which,
+ opcode, 0);
+ struct ceph_pagelist *pagelist;
+ size_t payload_len;
+ int ret;
+
+ BUG_ON(opcode != CEPH_OSD_OP_SETXATTR && opcode != CEPH_OSD_OP_CMPXATTR);
+
+ pagelist = ceph_pagelist_alloc(GFP_NOFS);
+ if (!pagelist)
+ return -ENOMEM;
+
+ payload_len = strlen(name);
+ op->xattr.name_len = payload_len;
+ ret = ceph_pagelist_append(pagelist, name, payload_len);
+ if (ret)
+ goto err_pagelist_free;
+
+ op->xattr.value_len = size;
+ ret = ceph_pagelist_append(pagelist, value, size);
+ if (ret)
+ goto err_pagelist_free;
+ payload_len += size;
+
+ op->xattr.cmp_op = cmp_op;
+ op->xattr.cmp_mode = cmp_mode;
+
+ ceph_osd_data_pagelist_init(&op->xattr.osd_data, pagelist);
+ op->indata_len = payload_len;
+ return 0;
+
+err_pagelist_free:
+ ceph_pagelist_release(pagelist);
+ return ret;
+}
+EXPORT_SYMBOL(osd_req_op_xattr_init);
+
+/*
+ * @watch_opcode: CEPH_OSD_WATCH_OP_*
+ */
+static void osd_req_op_watch_init(struct ceph_osd_request *req, int which,
+ u8 watch_opcode, u64 cookie, u32 gen)
+{
+ struct ceph_osd_req_op *op;
+
+ op = osd_req_op_init(req, which, CEPH_OSD_OP_WATCH, 0);
+ op->watch.cookie = cookie;
+ op->watch.op = watch_opcode;
+ op->watch.gen = gen;
+}
+
+/*
+ * prot_ver, timeout and notify payload (may be empty) should already be
+ * encoded in @request_pl
+ */
+static void osd_req_op_notify_init(struct ceph_osd_request *req, int which,
+ u64 cookie, struct ceph_pagelist *request_pl)
+{
+ struct ceph_osd_req_op *op;
+
+ op = osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY, 0);
+ op->notify.cookie = cookie;
+
+ ceph_osd_data_pagelist_init(&op->notify.request_data, request_pl);
+ op->indata_len = request_pl->length;
+}
+
+/*
+ * @flags: CEPH_OSD_OP_ALLOC_HINT_FLAG_*
+ */
+void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req,
+ unsigned int which,
+ u64 expected_object_size,
+ u64 expected_write_size,
+ u32 flags)
+{
+ struct ceph_osd_req_op *op;
+
+ op = osd_req_op_init(osd_req, which, CEPH_OSD_OP_SETALLOCHINT, 0);
+ op->alloc_hint.expected_object_size = expected_object_size;
+ op->alloc_hint.expected_write_size = expected_write_size;
+ op->alloc_hint.flags = flags;
+
+ /*
+ * CEPH_OSD_OP_SETALLOCHINT op is advisory and therefore deemed
+ * not worth a feature bit. Set FAILOK per-op flag to make
+ * sure older osds don't trip over an unsupported opcode.
+ */
+ op->flags |= CEPH_OSD_OP_FLAG_FAILOK;
+}
+EXPORT_SYMBOL(osd_req_op_alloc_hint_init);
+
+static void ceph_osdc_msg_data_add(struct ceph_msg *msg,
+ struct ceph_osd_data *osd_data)
+{
+ u64 length = ceph_osd_data_length(osd_data);
+
+ if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) {
+ BUG_ON(length > (u64) SIZE_MAX);
+ if (length)
+ ceph_msg_data_add_pages(msg, osd_data->pages,
+ length, osd_data->alignment, false);
+ } else if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGELIST) {
+ BUG_ON(!length);
+ ceph_msg_data_add_pagelist(msg, osd_data->pagelist);
+#ifdef CONFIG_BLOCK
+ } else if (osd_data->type == CEPH_OSD_DATA_TYPE_BIO) {
+ ceph_msg_data_add_bio(msg, &osd_data->bio_pos, length);
+#endif
+ } else if (osd_data->type == CEPH_OSD_DATA_TYPE_BVECS) {
+ ceph_msg_data_add_bvecs(msg, &osd_data->bvec_pos);
+ } else if (osd_data->type == CEPH_OSD_DATA_TYPE_ITER) {
+ ceph_msg_data_add_iter(msg, &osd_data->iter);
+ } else {
+ BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_NONE);
+ }
+}
+
+static u32 osd_req_encode_op(struct ceph_osd_op *dst,
+ const struct ceph_osd_req_op *src)
+{
+ switch (src->op) {
+ case CEPH_OSD_OP_STAT:
+ break;
+ case CEPH_OSD_OP_READ:
+ case CEPH_OSD_OP_SPARSE_READ:
+ case CEPH_OSD_OP_WRITE:
+ case CEPH_OSD_OP_WRITEFULL:
+ case CEPH_OSD_OP_ZERO:
+ case CEPH_OSD_OP_TRUNCATE:
+ dst->extent.offset = cpu_to_le64(src->extent.offset);
+ dst->extent.length = cpu_to_le64(src->extent.length);
+ dst->extent.truncate_size =
+ cpu_to_le64(src->extent.truncate_size);
+ dst->extent.truncate_seq =
+ cpu_to_le32(src->extent.truncate_seq);
+ break;
+ case CEPH_OSD_OP_CALL:
+ dst->cls.class_len = src->cls.class_len;
+ dst->cls.method_len = src->cls.method_len;
+ dst->cls.indata_len = cpu_to_le32(src->cls.indata_len);
+ break;
+ case CEPH_OSD_OP_WATCH:
+ dst->watch.cookie = cpu_to_le64(src->watch.cookie);
+ dst->watch.ver = cpu_to_le64(0);
+ dst->watch.op = src->watch.op;
+ dst->watch.gen = cpu_to_le32(src->watch.gen);
+ break;
+ case CEPH_OSD_OP_NOTIFY_ACK:
+ break;
+ case CEPH_OSD_OP_NOTIFY:
+ dst->notify.cookie = cpu_to_le64(src->notify.cookie);
+ break;
+ case CEPH_OSD_OP_LIST_WATCHERS:
+ break;
+ case CEPH_OSD_OP_SETALLOCHINT:
+ dst->alloc_hint.expected_object_size =
+ cpu_to_le64(src->alloc_hint.expected_object_size);
+ dst->alloc_hint.expected_write_size =
+ cpu_to_le64(src->alloc_hint.expected_write_size);
+ dst->alloc_hint.flags = cpu_to_le32(src->alloc_hint.flags);
+ break;
+ case CEPH_OSD_OP_SETXATTR:
+ case CEPH_OSD_OP_CMPXATTR:
+ dst->xattr.name_len = cpu_to_le32(src->xattr.name_len);
+ dst->xattr.value_len = cpu_to_le32(src->xattr.value_len);
+ dst->xattr.cmp_op = src->xattr.cmp_op;
+ dst->xattr.cmp_mode = src->xattr.cmp_mode;
+ break;
+ case CEPH_OSD_OP_CREATE:
+ case CEPH_OSD_OP_DELETE:
+ break;
+ case CEPH_OSD_OP_COPY_FROM2:
+ dst->copy_from.snapid = cpu_to_le64(src->copy_from.snapid);
+ dst->copy_from.src_version =
+ cpu_to_le64(src->copy_from.src_version);
+ dst->copy_from.flags = src->copy_from.flags;
+ dst->copy_from.src_fadvise_flags =
+ cpu_to_le32(src->copy_from.src_fadvise_flags);
+ break;
+ case CEPH_OSD_OP_ASSERT_VER:
+ dst->assert_ver.unused = cpu_to_le64(0);
+ dst->assert_ver.ver = cpu_to_le64(src->assert_ver.ver);
+ break;
+ default:
+ pr_err("unsupported osd opcode %s\n",
+ ceph_osd_op_name(src->op));
+ WARN_ON(1);
+
+ return 0;
+ }
+
+ dst->op = cpu_to_le16(src->op);
+ dst->flags = cpu_to_le32(src->flags);
+ dst->payload_len = cpu_to_le32(src->indata_len);
+
+ return src->indata_len;
+}
+
+/*
+ * build new request AND message, calculate layout, and adjust file
+ * extent as needed.
+ *
+ * if the file was recently truncated, we include information about its
+ * old and new size so that the object can be updated appropriately. (we
+ * avoid synchronously deleting truncated objects because it's slow.)
+ */
+struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
+ struct ceph_file_layout *layout,
+ struct ceph_vino vino,
+ u64 off, u64 *plen,
+ unsigned int which, int num_ops,
+ int opcode, int flags,
+ struct ceph_snap_context *snapc,
+ u32 truncate_seq,
+ u64 truncate_size,
+ bool use_mempool)
+{
+ struct ceph_osd_request *req;
+ u64 objnum = 0;
+ u64 objoff = 0;
+ u64 objlen = 0;
+ int r;
+
+ BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE &&
+ opcode != CEPH_OSD_OP_ZERO && opcode != CEPH_OSD_OP_TRUNCATE &&
+ opcode != CEPH_OSD_OP_CREATE && opcode != CEPH_OSD_OP_DELETE &&
+ opcode != CEPH_OSD_OP_SPARSE_READ);
+
+ req = ceph_osdc_alloc_request(osdc, snapc, num_ops, use_mempool,
+ GFP_NOFS);
+ if (!req) {
+ r = -ENOMEM;
+ goto fail;
+ }
+
+ /* calculate max write size */
+ r = calc_layout(layout, off, plen, &objnum, &objoff, &objlen);
+ if (r)
+ goto fail;
+
+ if (opcode == CEPH_OSD_OP_CREATE || opcode == CEPH_OSD_OP_DELETE) {
+ osd_req_op_init(req, which, opcode, 0);
+ } else {
+ u32 object_size = layout->object_size;
+ u32 object_base = off - objoff;
+ if (!(truncate_seq == 1 && truncate_size == -1ULL)) {
+ if (truncate_size <= object_base) {
+ truncate_size = 0;
+ } else {
+ truncate_size -= object_base;
+ if (truncate_size > object_size)
+ truncate_size = object_size;
+ }
+ }
+ osd_req_op_extent_init(req, which, opcode, objoff, objlen,
+ truncate_size, truncate_seq);
+ }
+
+ req->r_base_oloc.pool = layout->pool_id;
+ req->r_base_oloc.pool_ns = ceph_try_get_string(layout->pool_ns);
+ ceph_oid_printf(&req->r_base_oid, "%llx.%08llx", vino.ino, objnum);
+ req->r_flags = flags | osdc->client->options->read_from_replica;
+
+ req->r_snapid = vino.snap;
+ if (flags & CEPH_OSD_FLAG_WRITE)
+ req->r_data_offset = off;
+
+ if (num_ops > 1) {
+ int num_req_ops, num_rep_ops;
+
+ /*
+ * If this is a multi-op write request, assume that we'll need
+ * request ops. If it's a multi-op read then assume we'll need
+ * reply ops. Anything else and call it -EINVAL.
+ */
+ if (flags & CEPH_OSD_FLAG_WRITE) {
+ num_req_ops = num_ops;
+ num_rep_ops = 0;
+ } else if (flags & CEPH_OSD_FLAG_READ) {
+ num_req_ops = 0;
+ num_rep_ops = num_ops;
+ } else {
+ r = -EINVAL;
+ goto fail;
+ }
+
+ r = __ceph_osdc_alloc_messages(req, GFP_NOFS, num_req_ops,
+ num_rep_ops);
+ } else {
+ r = ceph_osdc_alloc_messages(req, GFP_NOFS);
+ }
+ if (r)
+ goto fail;
+
+ return req;
+
+fail:
+ ceph_osdc_put_request(req);
+ return ERR_PTR(r);
+}
+EXPORT_SYMBOL(ceph_osdc_new_request);
+
+int __ceph_alloc_sparse_ext_map(struct ceph_osd_req_op *op, int cnt)
+{
+ WARN_ON(op->op != CEPH_OSD_OP_SPARSE_READ);
+
+ op->extent.sparse_ext_cnt = cnt;
+ op->extent.sparse_ext = kmalloc_array(cnt,
+ sizeof(*op->extent.sparse_ext),
+ GFP_NOFS);
+ if (!op->extent.sparse_ext)
+ return -ENOMEM;
+ return 0;
+}
+EXPORT_SYMBOL(__ceph_alloc_sparse_ext_map);
+
+/*
+ * We keep osd requests in an rbtree, sorted by ->r_tid.
+ */
+DEFINE_RB_FUNCS(request, struct ceph_osd_request, r_tid, r_node)
+DEFINE_RB_FUNCS(request_mc, struct ceph_osd_request, r_tid, r_mc_node)
+
+/*
+ * Call @fn on each OSD request as long as @fn returns 0.
+ */
+static void for_each_request(struct ceph_osd_client *osdc,
+ int (*fn)(struct ceph_osd_request *req, void *arg),
+ void *arg)
+{
+ struct rb_node *n, *p;
+
+ for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
+ struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
+
+ for (p = rb_first(&osd->o_requests); p; ) {
+ struct ceph_osd_request *req =
+ rb_entry(p, struct ceph_osd_request, r_node);
+
+ p = rb_next(p);
+ if (fn(req, arg))
+ return;
+ }
+ }
+
+ for (p = rb_first(&osdc->homeless_osd.o_requests); p; ) {
+ struct ceph_osd_request *req =
+ rb_entry(p, struct ceph_osd_request, r_node);
+
+ p = rb_next(p);
+ if (fn(req, arg))
+ return;
+ }
+}
+
+static bool osd_homeless(struct ceph_osd *osd)
+{
+ return osd->o_osd == CEPH_HOMELESS_OSD;
+}
+
+static bool osd_registered(struct ceph_osd *osd)
+{
+ verify_osdc_locked(osd->o_osdc);
+
+ return !RB_EMPTY_NODE(&osd->o_node);
+}
+
+/*
+ * Assumes @osd is zero-initialized.
+ */
+static void osd_init(struct ceph_osd *osd)
+{
+ refcount_set(&osd->o_ref, 1);
+ RB_CLEAR_NODE(&osd->o_node);
+ spin_lock_init(&osd->o_requests_lock);
+ osd->o_requests = RB_ROOT;
+ osd->o_linger_requests = RB_ROOT;
+ osd->o_backoff_mappings = RB_ROOT;
+ osd->o_backoffs_by_id = RB_ROOT;
+ INIT_LIST_HEAD(&osd->o_osd_lru);
+ INIT_LIST_HEAD(&osd->o_keepalive_item);
+ osd->o_incarnation = 1;
+ mutex_init(&osd->lock);
+}
+
+static void ceph_init_sparse_read(struct ceph_sparse_read *sr)
+{
+ kfree(sr->sr_extent);
+ memset(sr, '\0', sizeof(*sr));
+ sr->sr_state = CEPH_SPARSE_READ_HDR;
+}
+
+static void osd_cleanup(struct ceph_osd *osd)
+{
+ WARN_ON(!RB_EMPTY_NODE(&osd->o_node));
+ WARN_ON(!RB_EMPTY_ROOT(&osd->o_requests));
+ WARN_ON(!RB_EMPTY_ROOT(&osd->o_linger_requests));
+ WARN_ON(!RB_EMPTY_ROOT(&osd->o_backoff_mappings));
+ WARN_ON(!RB_EMPTY_ROOT(&osd->o_backoffs_by_id));
+ WARN_ON(!list_empty(&osd->o_osd_lru));
+ WARN_ON(!list_empty(&osd->o_keepalive_item));
+
+ ceph_init_sparse_read(&osd->o_sparse_read);
+
+ if (osd->o_auth.authorizer) {
+ WARN_ON(osd_homeless(osd));
+ ceph_auth_destroy_authorizer(osd->o_auth.authorizer);
+ }
+}
+
+/*
+ * Track open sessions with osds.
+ */
+static struct ceph_osd *create_osd(struct ceph_osd_client *osdc, int onum)
+{
+ struct ceph_osd *osd;
+
+ WARN_ON(onum == CEPH_HOMELESS_OSD);
+
+ osd = kzalloc(sizeof(*osd), GFP_NOIO | __GFP_NOFAIL);
+ osd_init(osd);
+ osd->o_osdc = osdc;
+ osd->o_osd = onum;
+ osd->o_sparse_op_idx = -1;
+
+ ceph_init_sparse_read(&osd->o_sparse_read);
+
+ ceph_con_init(&osd->o_con, osd, &osd_con_ops, &osdc->client->msgr);
+
+ return osd;
+}
+
+static struct ceph_osd *get_osd(struct ceph_osd *osd)
+{
+ if (refcount_inc_not_zero(&osd->o_ref)) {
+ dout("get_osd %p %d -> %d\n", osd, refcount_read(&osd->o_ref)-1,
+ refcount_read(&osd->o_ref));
+ return osd;
+ } else {
+ dout("get_osd %p FAIL\n", osd);
+ return NULL;
+ }
+}
+
+static void put_osd(struct ceph_osd *osd)
+{
+ dout("put_osd %p %d -> %d\n", osd, refcount_read(&osd->o_ref),
+ refcount_read(&osd->o_ref) - 1);
+ if (refcount_dec_and_test(&osd->o_ref)) {
+ osd_cleanup(osd);
+ kfree(osd);
+ }
+}
+
+DEFINE_RB_FUNCS(osd, struct ceph_osd, o_osd, o_node)
+
+static void __move_osd_to_lru(struct ceph_osd *osd)
+{
+ struct ceph_osd_client *osdc = osd->o_osdc;
+
+ dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
+ BUG_ON(!list_empty(&osd->o_osd_lru));
+
+ spin_lock(&osdc->osd_lru_lock);
+ list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
+ spin_unlock(&osdc->osd_lru_lock);
+
+ osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl;
+}
+
+static void maybe_move_osd_to_lru(struct ceph_osd *osd)
+{
+ if (RB_EMPTY_ROOT(&osd->o_requests) &&
+ RB_EMPTY_ROOT(&osd->o_linger_requests))
+ __move_osd_to_lru(osd);
+}
+
+static void __remove_osd_from_lru(struct ceph_osd *osd)
+{
+ struct ceph_osd_client *osdc = osd->o_osdc;
+
+ dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
+
+ spin_lock(&osdc->osd_lru_lock);
+ if (!list_empty(&osd->o_osd_lru))
+ list_del_init(&osd->o_osd_lru);
+ spin_unlock(&osdc->osd_lru_lock);
+}
+
+/*
+ * Close the connection and assign any leftover requests to the
+ * homeless session.
+ */
+static void close_osd(struct ceph_osd *osd)
+{
+ struct ceph_osd_client *osdc = osd->o_osdc;
+ struct rb_node *n;
+
+ verify_osdc_wrlocked(osdc);
+ dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
+
+ ceph_con_close(&osd->o_con);
+
+ for (n = rb_first(&osd->o_requests); n; ) {
+ struct ceph_osd_request *req =
+ rb_entry(n, struct ceph_osd_request, r_node);
+
+ n = rb_next(n); /* unlink_request() */
+
+ dout(" reassigning req %p tid %llu\n", req, req->r_tid);
+ unlink_request(osd, req);
+ link_request(&osdc->homeless_osd, req);
+ }
+ for (n = rb_first(&osd->o_linger_requests); n; ) {
+ struct ceph_osd_linger_request *lreq =
+ rb_entry(n, struct ceph_osd_linger_request, node);
+
+ n = rb_next(n); /* unlink_linger() */
+
+ dout(" reassigning lreq %p linger_id %llu\n", lreq,
+ lreq->linger_id);
+ unlink_linger(osd, lreq);
+ link_linger(&osdc->homeless_osd, lreq);
+ }
+ clear_backoffs(osd);
+
+ __remove_osd_from_lru(osd);
+ erase_osd(&osdc->osds, osd);
+ put_osd(osd);
+}
+
+/*
+ * reset osd connect
+ */
+static int reopen_osd(struct ceph_osd *osd)
+{
+ struct ceph_entity_addr *peer_addr;
+
+ dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
+
+ if (RB_EMPTY_ROOT(&osd->o_requests) &&
+ RB_EMPTY_ROOT(&osd->o_linger_requests)) {
+ close_osd(osd);
+ return -ENODEV;
+ }
+
+ peer_addr = &osd->o_osdc->osdmap->osd_addr[osd->o_osd];
+ if (!memcmp(peer_addr, &osd->o_con.peer_addr, sizeof (*peer_addr)) &&
+ !ceph_con_opened(&osd->o_con)) {
+ struct rb_node *n;
+
+ dout("osd addr hasn't changed and connection never opened, "
+ "letting msgr retry\n");
+ /* touch each r_stamp for handle_timeout()'s benfit */
+ for (n = rb_first(&osd->o_requests); n; n = rb_next(n)) {
+ struct ceph_osd_request *req =
+ rb_entry(n, struct ceph_osd_request, r_node);
+ req->r_stamp = jiffies;
+ }
+
+ return -EAGAIN;
+ }
+
+ ceph_con_close(&osd->o_con);
+ ceph_con_open(&osd->o_con, CEPH_ENTITY_TYPE_OSD, osd->o_osd, peer_addr);
+ osd->o_incarnation++;
+
+ return 0;
+}
+
+static struct ceph_osd *lookup_create_osd(struct ceph_osd_client *osdc, int o,
+ bool wrlocked)
+{
+ struct ceph_osd *osd;
+
+ if (wrlocked)
+ verify_osdc_wrlocked(osdc);
+ else
+ verify_osdc_locked(osdc);
+
+ if (o != CEPH_HOMELESS_OSD)
+ osd = lookup_osd(&osdc->osds, o);
+ else
+ osd = &osdc->homeless_osd;
+ if (!osd) {
+ if (!wrlocked)
+ return ERR_PTR(-EAGAIN);
+
+ osd = create_osd(osdc, o);
+ insert_osd(&osdc->osds, osd);
+ ceph_con_open(&osd->o_con, CEPH_ENTITY_TYPE_OSD, osd->o_osd,
+ &osdc->osdmap->osd_addr[osd->o_osd]);
+ }
+
+ dout("%s osdc %p osd%d -> osd %p\n", __func__, osdc, o, osd);
+ return osd;
+}
+
+/*
+ * Create request <-> OSD session relation.
+ *
+ * @req has to be assigned a tid, @osd may be homeless.
+ */
+static void link_request(struct ceph_osd *osd, struct ceph_osd_request *req)
+{
+ verify_osd_locked(osd);
+ WARN_ON(!req->r_tid || req->r_osd);
+ dout("%s osd %p osd%d req %p tid %llu\n", __func__, osd, osd->o_osd,
+ req, req->r_tid);
+
+ if (!osd_homeless(osd))
+ __remove_osd_from_lru(osd);
+ else
+ atomic_inc(&osd->o_osdc->num_homeless);
+
+ get_osd(osd);
+ spin_lock(&osd->o_requests_lock);
+ insert_request(&osd->o_requests, req);
+ spin_unlock(&osd->o_requests_lock);
+ req->r_osd = osd;
+}
+
+static void unlink_request(struct ceph_osd *osd, struct ceph_osd_request *req)
+{
+ verify_osd_locked(osd);
+ WARN_ON(req->r_osd != osd);
+ dout("%s osd %p osd%d req %p tid %llu\n", __func__, osd, osd->o_osd,
+ req, req->r_tid);
+
+ req->r_osd = NULL;
+ spin_lock(&osd->o_requests_lock);
+ erase_request(&osd->o_requests, req);
+ spin_unlock(&osd->o_requests_lock);
+ put_osd(osd);
+
+ if (!osd_homeless(osd))
+ maybe_move_osd_to_lru(osd);
+ else
+ atomic_dec(&osd->o_osdc->num_homeless);
+}
+
+static bool __pool_full(struct ceph_pg_pool_info *pi)
+{
+ return pi->flags & CEPH_POOL_FLAG_FULL;
+}
+
+static bool have_pool_full(struct ceph_osd_client *osdc)
+{
+ struct rb_node *n;
+
+ for (n = rb_first(&osdc->osdmap->pg_pools); n; n = rb_next(n)) {
+ struct ceph_pg_pool_info *pi =
+ rb_entry(n, struct ceph_pg_pool_info, node);
+
+ if (__pool_full(pi))
+ return true;
+ }
+
+ return false;
+}
+
+static bool pool_full(struct ceph_osd_client *osdc, s64 pool_id)
+{
+ struct ceph_pg_pool_info *pi;
+
+ pi = ceph_pg_pool_by_id(osdc->osdmap, pool_id);
+ if (!pi)
+ return false;
+
+ return __pool_full(pi);
+}
+
+/*
+ * Returns whether a request should be blocked from being sent
+ * based on the current osdmap and osd_client settings.
+ */
+static bool target_should_be_paused(struct ceph_osd_client *osdc,
+ const struct ceph_osd_request_target *t,
+ struct ceph_pg_pool_info *pi)
+{
+ bool pauserd = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD);
+ bool pausewr = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR) ||
+ ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) ||
+ __pool_full(pi);
+
+ WARN_ON(pi->id != t->target_oloc.pool);
+ return ((t->flags & CEPH_OSD_FLAG_READ) && pauserd) ||
+ ((t->flags & CEPH_OSD_FLAG_WRITE) && pausewr) ||
+ (osdc->osdmap->epoch < osdc->epoch_barrier);
+}
+
+static int pick_random_replica(const struct ceph_osds *acting)
+{
+ int i = get_random_u32_below(acting->size);
+
+ dout("%s picked osd%d, primary osd%d\n", __func__,
+ acting->osds[i], acting->primary);
+ return i;
+}
+
+/*
+ * Picks the closest replica based on client's location given by
+ * crush_location option. Prefers the primary if the locality is
+ * the same.
+ */
+static int pick_closest_replica(struct ceph_osd_client *osdc,
+ const struct ceph_osds *acting)
+{
+ struct ceph_options *opt = osdc->client->options;
+ int best_i, best_locality;
+ int i = 0, locality;
+
+ do {
+ locality = ceph_get_crush_locality(osdc->osdmap,
+ acting->osds[i],
+ &opt->crush_locs);
+ if (i == 0 ||
+ (locality >= 0 && best_locality < 0) ||
+ (locality >= 0 && best_locality >= 0 &&
+ locality < best_locality)) {
+ best_i = i;
+ best_locality = locality;
+ }
+ } while (++i < acting->size);
+
+ dout("%s picked osd%d with locality %d, primary osd%d\n", __func__,
+ acting->osds[best_i], best_locality, acting->primary);
+ return best_i;
+}
+
+enum calc_target_result {
+ CALC_TARGET_NO_ACTION = 0,
+ CALC_TARGET_NEED_RESEND,
+ CALC_TARGET_POOL_DNE,
+};
+
+static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
+ struct ceph_osd_request_target *t,
+ bool any_change)
+{
+ struct ceph_pg_pool_info *pi;
+ struct ceph_pg pgid, last_pgid;
+ struct ceph_osds up, acting;
+ bool is_read = t->flags & CEPH_OSD_FLAG_READ;
+ bool is_write = t->flags & CEPH_OSD_FLAG_WRITE;
+ bool force_resend = false;
+ bool unpaused = false;
+ bool legacy_change = false;
+ bool split = false;
+ bool sort_bitwise = ceph_osdmap_flag(osdc, CEPH_OSDMAP_SORTBITWISE);
+ bool recovery_deletes = ceph_osdmap_flag(osdc,
+ CEPH_OSDMAP_RECOVERY_DELETES);
+ enum calc_target_result ct_res;
+
+ t->epoch = osdc->osdmap->epoch;
+ pi = ceph_pg_pool_by_id(osdc->osdmap, t->base_oloc.pool);
+ if (!pi) {
+ t->osd = CEPH_HOMELESS_OSD;
+ ct_res = CALC_TARGET_POOL_DNE;
+ goto out;
+ }
+
+ if (osdc->osdmap->epoch == pi->last_force_request_resend) {
+ if (t->last_force_resend < pi->last_force_request_resend) {
+ t->last_force_resend = pi->last_force_request_resend;
+ force_resend = true;
+ } else if (t->last_force_resend == 0) {
+ force_resend = true;
+ }
+ }
+
+ /* apply tiering */
+ ceph_oid_copy(&t->target_oid, &t->base_oid);
+ ceph_oloc_copy(&t->target_oloc, &t->base_oloc);
+ if ((t->flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) {
+ if (is_read && pi->read_tier >= 0)
+ t->target_oloc.pool = pi->read_tier;
+ if (is_write && pi->write_tier >= 0)
+ t->target_oloc.pool = pi->write_tier;
+
+ pi = ceph_pg_pool_by_id(osdc->osdmap, t->target_oloc.pool);
+ if (!pi) {
+ t->osd = CEPH_HOMELESS_OSD;
+ ct_res = CALC_TARGET_POOL_DNE;
+ goto out;
+ }
+ }
+
+ __ceph_object_locator_to_pg(pi, &t->target_oid, &t->target_oloc, &pgid);
+ last_pgid.pool = pgid.pool;
+ last_pgid.seed = ceph_stable_mod(pgid.seed, t->pg_num, t->pg_num_mask);
+
+ ceph_pg_to_up_acting_osds(osdc->osdmap, pi, &pgid, &up, &acting);
+ if (any_change &&
+ ceph_is_new_interval(&t->acting,
+ &acting,
+ &t->up,
+ &up,
+ t->size,
+ pi->size,
+ t->min_size,
+ pi->min_size,
+ t->pg_num,
+ pi->pg_num,
+ t->sort_bitwise,
+ sort_bitwise,
+ t->recovery_deletes,
+ recovery_deletes,
+ &last_pgid))
+ force_resend = true;
+
+ if (t->paused && !target_should_be_paused(osdc, t, pi)) {
+ t->paused = false;
+ unpaused = true;
+ }
+ legacy_change = ceph_pg_compare(&t->pgid, &pgid) ||
+ ceph_osds_changed(&t->acting, &acting,
+ t->used_replica || any_change);
+ if (t->pg_num)
+ split = ceph_pg_is_split(&last_pgid, t->pg_num, pi->pg_num);
+
+ if (legacy_change || force_resend || split) {
+ t->pgid = pgid; /* struct */
+ ceph_pg_to_primary_shard(osdc->osdmap, pi, &pgid, &t->spgid);
+ ceph_osds_copy(&t->acting, &acting);
+ ceph_osds_copy(&t->up, &up);
+ t->size = pi->size;
+ t->min_size = pi->min_size;
+ t->pg_num = pi->pg_num;
+ t->pg_num_mask = pi->pg_num_mask;
+ t->sort_bitwise = sort_bitwise;
+ t->recovery_deletes = recovery_deletes;
+
+ if ((t->flags & (CEPH_OSD_FLAG_BALANCE_READS |
+ CEPH_OSD_FLAG_LOCALIZE_READS)) &&
+ !is_write && pi->type == CEPH_POOL_TYPE_REP &&
+ acting.size > 1) {
+ int pos;
+
+ WARN_ON(!is_read || acting.osds[0] != acting.primary);
+ if (t->flags & CEPH_OSD_FLAG_BALANCE_READS) {
+ pos = pick_random_replica(&acting);
+ } else {
+ pos = pick_closest_replica(osdc, &acting);
+ }
+ t->osd = acting.osds[pos];
+ t->used_replica = pos > 0;
+ } else {
+ t->osd = acting.primary;
+ t->used_replica = false;
+ }
+ }
+
+ if (unpaused || legacy_change || force_resend || split)
+ ct_res = CALC_TARGET_NEED_RESEND;
+ else
+ ct_res = CALC_TARGET_NO_ACTION;
+
+out:
+ dout("%s t %p -> %d%d%d%d ct_res %d osd%d\n", __func__, t, unpaused,
+ legacy_change, force_resend, split, ct_res, t->osd);
+ return ct_res;
+}
+
+static struct ceph_spg_mapping *alloc_spg_mapping(void)
+{
+ struct ceph_spg_mapping *spg;
+
+ spg = kmalloc(sizeof(*spg), GFP_NOIO);
+ if (!spg)
+ return NULL;
+
+ RB_CLEAR_NODE(&spg->node);
+ spg->backoffs = RB_ROOT;
+ return spg;
+}
+
+static void free_spg_mapping(struct ceph_spg_mapping *spg)
+{
+ WARN_ON(!RB_EMPTY_NODE(&spg->node));
+ WARN_ON(!RB_EMPTY_ROOT(&spg->backoffs));
+
+ kfree(spg);
+}
+
+/*
+ * rbtree of ceph_spg_mapping for handling map<spg_t, ...>, similar to
+ * ceph_pg_mapping. Used to track OSD backoffs -- a backoff [range] is
+ * defined only within a specific spgid; it does not pass anything to
+ * children on split, or to another primary.
+ */
+DEFINE_RB_FUNCS2(spg_mapping, struct ceph_spg_mapping, spgid, ceph_spg_compare,
+ RB_BYPTR, const struct ceph_spg *, node)
+
+static u64 hoid_get_bitwise_key(const struct ceph_hobject_id *hoid)
+{
+ return hoid->is_max ? 0x100000000ull : hoid->hash_reverse_bits;
+}
+
+static void hoid_get_effective_key(const struct ceph_hobject_id *hoid,
+ void **pkey, size_t *pkey_len)
+{
+ if (hoid->key_len) {
+ *pkey = hoid->key;
+ *pkey_len = hoid->key_len;
+ } else {
+ *pkey = hoid->oid;
+ *pkey_len = hoid->oid_len;
+ }
+}
+
+static int compare_names(const void *name1, size_t name1_len,
+ const void *name2, size_t name2_len)
+{
+ int ret;
+
+ ret = memcmp(name1, name2, min(name1_len, name2_len));
+ if (!ret) {
+ if (name1_len < name2_len)
+ ret = -1;
+ else if (name1_len > name2_len)
+ ret = 1;
+ }
+ return ret;
+}
+
+static int hoid_compare(const struct ceph_hobject_id *lhs,
+ const struct ceph_hobject_id *rhs)
+{
+ void *effective_key1, *effective_key2;
+ size_t effective_key1_len, effective_key2_len;
+ int ret;
+
+ if (lhs->is_max < rhs->is_max)
+ return -1;
+ if (lhs->is_max > rhs->is_max)
+ return 1;
+
+ if (lhs->pool < rhs->pool)
+ return -1;
+ if (lhs->pool > rhs->pool)
+ return 1;
+
+ if (hoid_get_bitwise_key(lhs) < hoid_get_bitwise_key(rhs))
+ return -1;
+ if (hoid_get_bitwise_key(lhs) > hoid_get_bitwise_key(rhs))
+ return 1;
+
+ ret = compare_names(lhs->nspace, lhs->nspace_len,
+ rhs->nspace, rhs->nspace_len);
+ if (ret)
+ return ret;
+
+ hoid_get_effective_key(lhs, &effective_key1, &effective_key1_len);
+ hoid_get_effective_key(rhs, &effective_key2, &effective_key2_len);
+ ret = compare_names(effective_key1, effective_key1_len,
+ effective_key2, effective_key2_len);
+ if (ret)
+ return ret;
+
+ ret = compare_names(lhs->oid, lhs->oid_len, rhs->oid, rhs->oid_len);
+ if (ret)
+ return ret;
+
+ if (lhs->snapid < rhs->snapid)
+ return -1;
+ if (lhs->snapid > rhs->snapid)
+ return 1;
+
+ return 0;
+}
+
+/*
+ * For decoding ->begin and ->end of MOSDBackoff only -- no MIN/MAX
+ * compat stuff here.
+ *
+ * Assumes @hoid is zero-initialized.
+ */
+static int decode_hoid(void **p, void *end, struct ceph_hobject_id *hoid)
+{
+ u8 struct_v;
+ u32 struct_len;
+ int ret;
+
+ ret = ceph_start_decoding(p, end, 4, "hobject_t", &struct_v,
+ &struct_len);
+ if (ret)
+ return ret;
+
+ if (struct_v < 4) {
+ pr_err("got struct_v %d < 4 of hobject_t\n", struct_v);
+ goto e_inval;
+ }
+
+ hoid->key = ceph_extract_encoded_string(p, end, &hoid->key_len,
+ GFP_NOIO);
+ if (IS_ERR(hoid->key)) {
+ ret = PTR_ERR(hoid->key);
+ hoid->key = NULL;
+ return ret;
+ }
+
+ hoid->oid = ceph_extract_encoded_string(p, end, &hoid->oid_len,
+ GFP_NOIO);
+ if (IS_ERR(hoid->oid)) {
+ ret = PTR_ERR(hoid->oid);
+ hoid->oid = NULL;
+ return ret;
+ }
+
+ ceph_decode_64_safe(p, end, hoid->snapid, e_inval);
+ ceph_decode_32_safe(p, end, hoid->hash, e_inval);
+ ceph_decode_8_safe(p, end, hoid->is_max, e_inval);
+
+ hoid->nspace = ceph_extract_encoded_string(p, end, &hoid->nspace_len,
+ GFP_NOIO);
+ if (IS_ERR(hoid->nspace)) {
+ ret = PTR_ERR(hoid->nspace);
+ hoid->nspace = NULL;
+ return ret;
+ }
+
+ ceph_decode_64_safe(p, end, hoid->pool, e_inval);
+
+ ceph_hoid_build_hash_cache(hoid);
+ return 0;
+
+e_inval:
+ return -EINVAL;
+}
+
+static int hoid_encoding_size(const struct ceph_hobject_id *hoid)
+{
+ return 8 + 4 + 1 + 8 + /* snapid, hash, is_max, pool */
+ 4 + hoid->key_len + 4 + hoid->oid_len + 4 + hoid->nspace_len;
+}
+
+static void encode_hoid(void **p, void *end, const struct ceph_hobject_id *hoid)
+{
+ ceph_start_encoding(p, 4, 3, hoid_encoding_size(hoid));
+ ceph_encode_string(p, end, hoid->key, hoid->key_len);
+ ceph_encode_string(p, end, hoid->oid, hoid->oid_len);
+ ceph_encode_64(p, hoid->snapid);
+ ceph_encode_32(p, hoid->hash);
+ ceph_encode_8(p, hoid->is_max);
+ ceph_encode_string(p, end, hoid->nspace, hoid->nspace_len);
+ ceph_encode_64(p, hoid->pool);
+}
+
+static void free_hoid(struct ceph_hobject_id *hoid)
+{
+ if (hoid) {
+ kfree(hoid->key);
+ kfree(hoid->oid);
+ kfree(hoid->nspace);
+ kfree(hoid);
+ }
+}
+
+static struct ceph_osd_backoff *alloc_backoff(void)
+{
+ struct ceph_osd_backoff *backoff;
+
+ backoff = kzalloc(sizeof(*backoff), GFP_NOIO);
+ if (!backoff)
+ return NULL;
+
+ RB_CLEAR_NODE(&backoff->spg_node);
+ RB_CLEAR_NODE(&backoff->id_node);
+ return backoff;
+}
+
+static void free_backoff(struct ceph_osd_backoff *backoff)
+{
+ WARN_ON(!RB_EMPTY_NODE(&backoff->spg_node));
+ WARN_ON(!RB_EMPTY_NODE(&backoff->id_node));
+
+ free_hoid(backoff->begin);
+ free_hoid(backoff->end);
+ kfree(backoff);
+}
+
+/*
+ * Within a specific spgid, backoffs are managed by ->begin hoid.
+ */
+DEFINE_RB_INSDEL_FUNCS2(backoff, struct ceph_osd_backoff, begin, hoid_compare,
+ RB_BYVAL, spg_node);
+
+static struct ceph_osd_backoff *lookup_containing_backoff(struct rb_root *root,
+ const struct ceph_hobject_id *hoid)
+{
+ struct rb_node *n = root->rb_node;
+
+ while (n) {
+ struct ceph_osd_backoff *cur =
+ rb_entry(n, struct ceph_osd_backoff, spg_node);
+ int cmp;
+
+ cmp = hoid_compare(hoid, cur->begin);
+ if (cmp < 0) {
+ n = n->rb_left;
+ } else if (cmp > 0) {
+ if (hoid_compare(hoid, cur->end) < 0)
+ return cur;
+
+ n = n->rb_right;
+ } else {
+ return cur;
+ }
+ }
+
+ return NULL;
+}
+
+/*
+ * Each backoff has a unique id within its OSD session.
+ */
+DEFINE_RB_FUNCS(backoff_by_id, struct ceph_osd_backoff, id, id_node)
+
+static void clear_backoffs(struct ceph_osd *osd)
+{
+ while (!RB_EMPTY_ROOT(&osd->o_backoff_mappings)) {
+ struct ceph_spg_mapping *spg =
+ rb_entry(rb_first(&osd->o_backoff_mappings),
+ struct ceph_spg_mapping, node);
+
+ while (!RB_EMPTY_ROOT(&spg->backoffs)) {
+ struct ceph_osd_backoff *backoff =
+ rb_entry(rb_first(&spg->backoffs),
+ struct ceph_osd_backoff, spg_node);
+
+ erase_backoff(&spg->backoffs, backoff);
+ erase_backoff_by_id(&osd->o_backoffs_by_id, backoff);
+ free_backoff(backoff);
+ }
+ erase_spg_mapping(&osd->o_backoff_mappings, spg);
+ free_spg_mapping(spg);
+ }
+}
+
+/*
+ * Set up a temporary, non-owning view into @t.
+ */
+static void hoid_fill_from_target(struct ceph_hobject_id *hoid,
+ const struct ceph_osd_request_target *t)
+{
+ hoid->key = NULL;
+ hoid->key_len = 0;
+ hoid->oid = t->target_oid.name;
+ hoid->oid_len = t->target_oid.name_len;
+ hoid->snapid = CEPH_NOSNAP;
+ hoid->hash = t->pgid.seed;
+ hoid->is_max = false;
+ if (t->target_oloc.pool_ns) {
+ hoid->nspace = t->target_oloc.pool_ns->str;
+ hoid->nspace_len = t->target_oloc.pool_ns->len;
+ } else {
+ hoid->nspace = NULL;
+ hoid->nspace_len = 0;
+ }
+ hoid->pool = t->target_oloc.pool;
+ ceph_hoid_build_hash_cache(hoid);
+}
+
+static bool should_plug_request(struct ceph_osd_request *req)
+{
+ struct ceph_osd *osd = req->r_osd;
+ struct ceph_spg_mapping *spg;
+ struct ceph_osd_backoff *backoff;
+ struct ceph_hobject_id hoid;
+
+ spg = lookup_spg_mapping(&osd->o_backoff_mappings, &req->r_t.spgid);
+ if (!spg)
+ return false;
+
+ hoid_fill_from_target(&hoid, &req->r_t);
+ backoff = lookup_containing_backoff(&spg->backoffs, &hoid);
+ if (!backoff)
+ return false;
+
+ dout("%s req %p tid %llu backoff osd%d spgid %llu.%xs%d id %llu\n",
+ __func__, req, req->r_tid, osd->o_osd, backoff->spgid.pgid.pool,
+ backoff->spgid.pgid.seed, backoff->spgid.shard, backoff->id);
+ return true;
+}
+
+/*
+ * Keep get_num_data_items() in sync with this function.
+ */
+static void setup_request_data(struct ceph_osd_request *req)
+{
+ struct ceph_msg *request_msg = req->r_request;
+ struct ceph_msg *reply_msg = req->r_reply;
+ struct ceph_osd_req_op *op;
+
+ if (req->r_request->num_data_items || req->r_reply->num_data_items)
+ return;
+
+ WARN_ON(request_msg->data_length || reply_msg->data_length);
+ for (op = req->r_ops; op != &req->r_ops[req->r_num_ops]; op++) {
+ switch (op->op) {
+ /* request */
+ case CEPH_OSD_OP_WRITE:
+ case CEPH_OSD_OP_WRITEFULL:
+ WARN_ON(op->indata_len != op->extent.length);
+ ceph_osdc_msg_data_add(request_msg,
+ &op->extent.osd_data);
+ break;
+ case CEPH_OSD_OP_SETXATTR:
+ case CEPH_OSD_OP_CMPXATTR:
+ WARN_ON(op->indata_len != op->xattr.name_len +
+ op->xattr.value_len);
+ ceph_osdc_msg_data_add(request_msg,
+ &op->xattr.osd_data);
+ break;
+ case CEPH_OSD_OP_NOTIFY_ACK:
+ ceph_osdc_msg_data_add(request_msg,
+ &op->notify_ack.request_data);
+ break;
+ case CEPH_OSD_OP_COPY_FROM2:
+ ceph_osdc_msg_data_add(request_msg,
+ &op->copy_from.osd_data);
+ break;
+
+ /* reply */
+ case CEPH_OSD_OP_STAT:
+ ceph_osdc_msg_data_add(reply_msg,
+ &op->raw_data_in);
+ break;
+ case CEPH_OSD_OP_READ:
+ case CEPH_OSD_OP_SPARSE_READ:
+ ceph_osdc_msg_data_add(reply_msg,
+ &op->extent.osd_data);
+ break;
+ case CEPH_OSD_OP_LIST_WATCHERS:
+ ceph_osdc_msg_data_add(reply_msg,
+ &op->list_watchers.response_data);
+ break;
+
+ /* both */
+ case CEPH_OSD_OP_CALL:
+ WARN_ON(op->indata_len != op->cls.class_len +
+ op->cls.method_len +
+ op->cls.indata_len);
+ ceph_osdc_msg_data_add(request_msg,
+ &op->cls.request_info);
+ /* optional, can be NONE */
+ ceph_osdc_msg_data_add(request_msg,
+ &op->cls.request_data);
+ /* optional, can be NONE */
+ ceph_osdc_msg_data_add(reply_msg,
+ &op->cls.response_data);
+ break;
+ case CEPH_OSD_OP_NOTIFY:
+ ceph_osdc_msg_data_add(request_msg,
+ &op->notify.request_data);
+ ceph_osdc_msg_data_add(reply_msg,
+ &op->notify.response_data);
+ break;
+ }
+ }
+}
+
+static void encode_pgid(void **p, const struct ceph_pg *pgid)
+{
+ ceph_encode_8(p, 1);
+ ceph_encode_64(p, pgid->pool);
+ ceph_encode_32(p, pgid->seed);
+ ceph_encode_32(p, -1); /* preferred */
+}
+
+static void encode_spgid(void **p, const struct ceph_spg *spgid)
+{
+ ceph_start_encoding(p, 1, 1, CEPH_PGID_ENCODING_LEN + 1);
+ encode_pgid(p, &spgid->pgid);
+ ceph_encode_8(p, spgid->shard);
+}
+
+static void encode_oloc(void **p, void *end,
+ const struct ceph_object_locator *oloc)
+{
+ ceph_start_encoding(p, 5, 4, ceph_oloc_encoding_size(oloc));
+ ceph_encode_64(p, oloc->pool);
+ ceph_encode_32(p, -1); /* preferred */
+ ceph_encode_32(p, 0); /* key len */
+ if (oloc->pool_ns)
+ ceph_encode_string(p, end, oloc->pool_ns->str,
+ oloc->pool_ns->len);
+ else
+ ceph_encode_32(p, 0);
+}
+
+static void encode_request_partial(struct ceph_osd_request *req,
+ struct ceph_msg *msg)
+{
+ void *p = msg->front.iov_base;
+ void *const end = p + msg->front_alloc_len;
+ u32 data_len = 0;
+ int i;
+
+ if (req->r_flags & CEPH_OSD_FLAG_WRITE) {
+ /* snapshots aren't writeable */
+ WARN_ON(req->r_snapid != CEPH_NOSNAP);
+ } else {
+ WARN_ON(req->r_mtime.tv_sec || req->r_mtime.tv_nsec ||
+ req->r_data_offset || req->r_snapc);
+ }
+
+ setup_request_data(req);
+
+ encode_spgid(&p, &req->r_t.spgid); /* actual spg */
+ ceph_encode_32(&p, req->r_t.pgid.seed); /* raw hash */
+ ceph_encode_32(&p, req->r_osdc->osdmap->epoch);
+ ceph_encode_32(&p, req->r_flags);
+
+ /* reqid */
+ ceph_start_encoding(&p, 2, 2, sizeof(struct ceph_osd_reqid));
+ memset(p, 0, sizeof(struct ceph_osd_reqid));
+ p += sizeof(struct ceph_osd_reqid);
+
+ /* trace */
+ memset(p, 0, sizeof(struct ceph_blkin_trace_info));
+ p += sizeof(struct ceph_blkin_trace_info);
+
+ ceph_encode_32(&p, 0); /* client_inc, always 0 */
+ ceph_encode_timespec64(p, &req->r_mtime);
+ p += sizeof(struct ceph_timespec);
+
+ encode_oloc(&p, end, &req->r_t.target_oloc);
+ ceph_encode_string(&p, end, req->r_t.target_oid.name,
+ req->r_t.target_oid.name_len);
+
+ /* ops, can imply data */
+ ceph_encode_16(&p, req->r_num_ops);
+ for (i = 0; i < req->r_num_ops; i++) {
+ data_len += osd_req_encode_op(p, &req->r_ops[i]);
+ p += sizeof(struct ceph_osd_op);
+ }
+
+ ceph_encode_64(&p, req->r_snapid); /* snapid */
+ if (req->r_snapc) {
+ ceph_encode_64(&p, req->r_snapc->seq);
+ ceph_encode_32(&p, req->r_snapc->num_snaps);
+ for (i = 0; i < req->r_snapc->num_snaps; i++)
+ ceph_encode_64(&p, req->r_snapc->snaps[i]);
+ } else {
+ ceph_encode_64(&p, 0); /* snap_seq */
+ ceph_encode_32(&p, 0); /* snaps len */
+ }
+
+ ceph_encode_32(&p, req->r_attempts); /* retry_attempt */
+ BUG_ON(p > end - 8); /* space for features */
+
+ msg->hdr.version = cpu_to_le16(8); /* MOSDOp v8 */
+ /* front_len is finalized in encode_request_finish() */
+ msg->front.iov_len = p - msg->front.iov_base;
+ msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
+ msg->hdr.data_len = cpu_to_le32(data_len);
+ /*
+ * The header "data_off" is a hint to the receiver allowing it
+ * to align received data into its buffers such that there's no
+ * need to re-copy it before writing it to disk (direct I/O).
+ */
+ msg->hdr.data_off = cpu_to_le16(req->r_data_offset);
+
+ dout("%s req %p msg %p oid %s oid_len %d\n", __func__, req, msg,
+ req->r_t.target_oid.name, req->r_t.target_oid.name_len);
+}
+
+static void encode_request_finish(struct ceph_msg *msg)
+{
+ void *p = msg->front.iov_base;
+ void *const partial_end = p + msg->front.iov_len;
+ void *const end = p + msg->front_alloc_len;
+
+ if (CEPH_HAVE_FEATURE(msg->con->peer_features, RESEND_ON_SPLIT)) {
+ /* luminous OSD -- encode features and be done */
+ p = partial_end;
+ ceph_encode_64(&p, msg->con->peer_features);
+ } else {
+ struct {
+ char spgid[CEPH_ENCODING_START_BLK_LEN +
+ CEPH_PGID_ENCODING_LEN + 1];
+ __le32 hash;
+ __le32 epoch;
+ __le32 flags;
+ char reqid[CEPH_ENCODING_START_BLK_LEN +
+ sizeof(struct ceph_osd_reqid)];
+ char trace[sizeof(struct ceph_blkin_trace_info)];
+ __le32 client_inc;
+ struct ceph_timespec mtime;
+ } __packed head;
+ struct ceph_pg pgid;
+ void *oloc, *oid, *tail;
+ int oloc_len, oid_len, tail_len;
+ int len;
+
+ /*
+ * Pre-luminous OSD -- reencode v8 into v4 using @head
+ * as a temporary buffer. Encode the raw PG; the rest
+ * is just a matter of moving oloc, oid and tail blobs
+ * around.
+ */
+ memcpy(&head, p, sizeof(head));
+ p += sizeof(head);
+
+ oloc = p;
+ p += CEPH_ENCODING_START_BLK_LEN;
+ pgid.pool = ceph_decode_64(&p);
+ p += 4 + 4; /* preferred, key len */
+ len = ceph_decode_32(&p);
+ p += len; /* nspace */
+ oloc_len = p - oloc;
+
+ oid = p;
+ len = ceph_decode_32(&p);
+ p += len;
+ oid_len = p - oid;
+
+ tail = p;
+ tail_len = partial_end - p;
+
+ p = msg->front.iov_base;
+ ceph_encode_copy(&p, &head.client_inc, sizeof(head.client_inc));
+ ceph_encode_copy(&p, &head.epoch, sizeof(head.epoch));
+ ceph_encode_copy(&p, &head.flags, sizeof(head.flags));
+ ceph_encode_copy(&p, &head.mtime, sizeof(head.mtime));
+
+ /* reassert_version */
+ memset(p, 0, sizeof(struct ceph_eversion));
+ p += sizeof(struct ceph_eversion);
+
+ BUG_ON(p >= oloc);
+ memmove(p, oloc, oloc_len);
+ p += oloc_len;
+
+ pgid.seed = le32_to_cpu(head.hash);
+ encode_pgid(&p, &pgid); /* raw pg */
+
+ BUG_ON(p >= oid);
+ memmove(p, oid, oid_len);
+ p += oid_len;
+
+ /* tail -- ops, snapid, snapc, retry_attempt */
+ BUG_ON(p >= tail);
+ memmove(p, tail, tail_len);
+ p += tail_len;
+
+ msg->hdr.version = cpu_to_le16(4); /* MOSDOp v4 */
+ }
+
+ BUG_ON(p > end);
+ msg->front.iov_len = p - msg->front.iov_base;
+ msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
+
+ dout("%s msg %p tid %llu %u+%u+%u v%d\n", __func__, msg,
+ le64_to_cpu(msg->hdr.tid), le32_to_cpu(msg->hdr.front_len),
+ le32_to_cpu(msg->hdr.middle_len), le32_to_cpu(msg->hdr.data_len),
+ le16_to_cpu(msg->hdr.version));
+}
+
+/*
+ * @req has to be assigned a tid and registered.
+ */
+static void send_request(struct ceph_osd_request *req)
+{
+ struct ceph_osd *osd = req->r_osd;
+
+ verify_osd_locked(osd);
+ WARN_ON(osd->o_osd != req->r_t.osd);
+
+ /* backoff? */
+ if (should_plug_request(req))
+ return;
+
+ /*
+ * We may have a previously queued request message hanging
+ * around. Cancel it to avoid corrupting the msgr.
+ */
+ if (req->r_sent)
+ ceph_msg_revoke(req->r_request);
+
+ req->r_flags |= CEPH_OSD_FLAG_KNOWN_REDIR;
+ if (req->r_attempts)
+ req->r_flags |= CEPH_OSD_FLAG_RETRY;
+ else
+ WARN_ON(req->r_flags & CEPH_OSD_FLAG_RETRY);
+
+ encode_request_partial(req, req->r_request);
+
+ dout("%s req %p tid %llu to pgid %llu.%x spgid %llu.%xs%d osd%d e%u flags 0x%x attempt %d\n",
+ __func__, req, req->r_tid, req->r_t.pgid.pool, req->r_t.pgid.seed,
+ req->r_t.spgid.pgid.pool, req->r_t.spgid.pgid.seed,
+ req->r_t.spgid.shard, osd->o_osd, req->r_t.epoch, req->r_flags,
+ req->r_attempts);
+
+ req->r_t.paused = false;
+ req->r_stamp = jiffies;
+ req->r_attempts++;
+
+ req->r_sent = osd->o_incarnation;
+ req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
+ ceph_con_send(&osd->o_con, ceph_msg_get(req->r_request));
+}
+
+static void maybe_request_map(struct ceph_osd_client *osdc)
+{
+ bool continuous = false;
+
+ verify_osdc_locked(osdc);
+ WARN_ON(!osdc->osdmap->epoch);
+
+ if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) ||
+ ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD) ||
+ ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR)) {
+ dout("%s osdc %p continuous\n", __func__, osdc);
+ continuous = true;
+ } else {
+ dout("%s osdc %p onetime\n", __func__, osdc);
+ }
+
+ if (ceph_monc_want_map(&osdc->client->monc, CEPH_SUB_OSDMAP,
+ osdc->osdmap->epoch + 1, continuous))
+ ceph_monc_renew_subs(&osdc->client->monc);
+}
+
+static void complete_request(struct ceph_osd_request *req, int err);
+static void send_map_check(struct ceph_osd_request *req);
+
+static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
+{
+ struct ceph_osd_client *osdc = req->r_osdc;
+ struct ceph_osd *osd;
+ enum calc_target_result ct_res;
+ int err = 0;
+ bool need_send = false;
+ bool promoted = false;
+
+ WARN_ON(req->r_tid);
+ dout("%s req %p wrlocked %d\n", __func__, req, wrlocked);
+
+again:
+ ct_res = calc_target(osdc, &req->r_t, false);
+ if (ct_res == CALC_TARGET_POOL_DNE && !wrlocked)
+ goto promote;
+
+ osd = lookup_create_osd(osdc, req->r_t.osd, wrlocked);
+ if (IS_ERR(osd)) {
+ WARN_ON(PTR_ERR(osd) != -EAGAIN || wrlocked);
+ goto promote;
+ }
+
+ if (osdc->abort_err) {
+ dout("req %p abort_err %d\n", req, osdc->abort_err);
+ err = osdc->abort_err;
+ } else if (osdc->osdmap->epoch < osdc->epoch_barrier) {
+ dout("req %p epoch %u barrier %u\n", req, osdc->osdmap->epoch,
+ osdc->epoch_barrier);
+ req->r_t.paused = true;
+ maybe_request_map(osdc);
+ } else if ((req->r_flags & CEPH_OSD_FLAG_WRITE) &&
+ ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR)) {
+ dout("req %p pausewr\n", req);
+ req->r_t.paused = true;
+ maybe_request_map(osdc);
+ } else if ((req->r_flags & CEPH_OSD_FLAG_READ) &&
+ ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD)) {
+ dout("req %p pauserd\n", req);
+ req->r_t.paused = true;
+ maybe_request_map(osdc);
+ } else if ((req->r_flags & CEPH_OSD_FLAG_WRITE) &&
+ !(req->r_flags & (CEPH_OSD_FLAG_FULL_TRY |
+ CEPH_OSD_FLAG_FULL_FORCE)) &&
+ (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) ||
+ pool_full(osdc, req->r_t.base_oloc.pool))) {
+ dout("req %p full/pool_full\n", req);
+ if (ceph_test_opt(osdc->client, ABORT_ON_FULL)) {
+ err = -ENOSPC;
+ } else {
+ if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL))
+ pr_warn_ratelimited("cluster is full (osdmap FULL)\n");
+ else
+ pr_warn_ratelimited("pool %lld is full or reached quota\n",
+ req->r_t.base_oloc.pool);
+ req->r_t.paused = true;
+ maybe_request_map(osdc);
+ }
+ } else if (!osd_homeless(osd)) {
+ need_send = true;
+ } else {
+ maybe_request_map(osdc);
+ }
+
+ mutex_lock(&osd->lock);
+ /*
+ * Assign the tid atomically with send_request() to protect
+ * multiple writes to the same object from racing with each
+ * other, resulting in out of order ops on the OSDs.
+ */
+ req->r_tid = atomic64_inc_return(&osdc->last_tid);
+ link_request(osd, req);
+ if (need_send)
+ send_request(req);
+ else if (err)
+ complete_request(req, err);
+ mutex_unlock(&osd->lock);
+
+ if (!err && ct_res == CALC_TARGET_POOL_DNE)
+ send_map_check(req);
+
+ if (promoted)
+ downgrade_write(&osdc->lock);
+ return;
+
+promote:
+ up_read(&osdc->lock);
+ down_write(&osdc->lock);
+ wrlocked = true;
+ promoted = true;
+ goto again;
+}
+
+static void account_request(struct ceph_osd_request *req)
+{
+ WARN_ON(req->r_flags & (CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK));
+ WARN_ON(!(req->r_flags & (CEPH_OSD_FLAG_READ | CEPH_OSD_FLAG_WRITE)));
+
+ req->r_flags |= CEPH_OSD_FLAG_ONDISK;
+ atomic_inc(&req->r_osdc->num_requests);
+
+ req->r_start_stamp = jiffies;
+ req->r_start_latency = ktime_get();
+}
+
+static void submit_request(struct ceph_osd_request *req, bool wrlocked)
+{
+ ceph_osdc_get_request(req);
+ account_request(req);
+ __submit_request(req, wrlocked);
+}
+
+static void finish_request(struct ceph_osd_request *req)
+{
+ struct ceph_osd_client *osdc = req->r_osdc;
+
+ WARN_ON(lookup_request_mc(&osdc->map_checks, req->r_tid));
+ dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
+
+ req->r_end_latency = ktime_get();
+
+ if (req->r_osd) {
+ ceph_init_sparse_read(&req->r_osd->o_sparse_read);
+ unlink_request(req->r_osd, req);
+ }
+ atomic_dec(&osdc->num_requests);
+
+ /*
+ * If an OSD has failed or returned and a request has been sent
+ * twice, it's possible to get a reply and end up here while the
+ * request message is queued for delivery. We will ignore the
+ * reply, so not a big deal, but better to try and catch it.
+ */
+ ceph_msg_revoke(req->r_request);
+ ceph_msg_revoke_incoming(req->r_reply);
+}
+
+static void __complete_request(struct ceph_osd_request *req)
+{
+ dout("%s req %p tid %llu cb %ps result %d\n", __func__, req,
+ req->r_tid, req->r_callback, req->r_result);
+
+ if (req->r_callback)
+ req->r_callback(req);
+ complete_all(&req->r_completion);
+ ceph_osdc_put_request(req);
+}
+
+static void complete_request_workfn(struct work_struct *work)
+{
+ struct ceph_osd_request *req =
+ container_of(work, struct ceph_osd_request, r_complete_work);
+
+ __complete_request(req);
+}
+
+/*
+ * This is open-coded in handle_reply().
+ */
+static void complete_request(struct ceph_osd_request *req, int err)
+{
+ dout("%s req %p tid %llu err %d\n", __func__, req, req->r_tid, err);
+
+ req->r_result = err;
+ finish_request(req);
+
+ INIT_WORK(&req->r_complete_work, complete_request_workfn);
+ queue_work(req->r_osdc->completion_wq, &req->r_complete_work);
+}
+
+static void cancel_map_check(struct ceph_osd_request *req)
+{
+ struct ceph_osd_client *osdc = req->r_osdc;
+ struct ceph_osd_request *lookup_req;
+
+ verify_osdc_wrlocked(osdc);
+
+ lookup_req = lookup_request_mc(&osdc->map_checks, req->r_tid);
+ if (!lookup_req)
+ return;
+
+ WARN_ON(lookup_req != req);
+ erase_request_mc(&osdc->map_checks, req);
+ ceph_osdc_put_request(req);
+}
+
+static void cancel_request(struct ceph_osd_request *req)
+{
+ dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
+
+ cancel_map_check(req);
+ finish_request(req);
+ complete_all(&req->r_completion);
+ ceph_osdc_put_request(req);
+}
+
+static void abort_request(struct ceph_osd_request *req, int err)
+{
+ dout("%s req %p tid %llu err %d\n", __func__, req, req->r_tid, err);
+
+ cancel_map_check(req);
+ complete_request(req, err);
+}
+
+static int abort_fn(struct ceph_osd_request *req, void *arg)
+{
+ int err = *(int *)arg;
+
+ abort_request(req, err);
+ return 0; /* continue iteration */
+}
+
+/*
+ * Abort all in-flight requests with @err and arrange for all future
+ * requests to be failed immediately.
+ */
+void ceph_osdc_abort_requests(struct ceph_osd_client *osdc, int err)
+{
+ dout("%s osdc %p err %d\n", __func__, osdc, err);
+ down_write(&osdc->lock);
+ for_each_request(osdc, abort_fn, &err);
+ osdc->abort_err = err;
+ up_write(&osdc->lock);
+}
+EXPORT_SYMBOL(ceph_osdc_abort_requests);
+
+void ceph_osdc_clear_abort_err(struct ceph_osd_client *osdc)
+{
+ down_write(&osdc->lock);
+ osdc->abort_err = 0;
+ up_write(&osdc->lock);
+}
+EXPORT_SYMBOL(ceph_osdc_clear_abort_err);
+
+static void update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb)
+{
+ if (likely(eb > osdc->epoch_barrier)) {
+ dout("updating epoch_barrier from %u to %u\n",
+ osdc->epoch_barrier, eb);
+ osdc->epoch_barrier = eb;
+ /* Request map if we're not to the barrier yet */
+ if (eb > osdc->osdmap->epoch)
+ maybe_request_map(osdc);
+ }
+}
+
+void ceph_osdc_update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb)
+{
+ down_read(&osdc->lock);
+ if (unlikely(eb > osdc->epoch_barrier)) {
+ up_read(&osdc->lock);
+ down_write(&osdc->lock);
+ update_epoch_barrier(osdc, eb);
+ up_write(&osdc->lock);
+ } else {
+ up_read(&osdc->lock);
+ }
+}
+EXPORT_SYMBOL(ceph_osdc_update_epoch_barrier);
+
+/*
+ * We can end up releasing caps as a result of abort_request().
+ * In that case, we probably want to ensure that the cap release message
+ * has an updated epoch barrier in it, so set the epoch barrier prior to
+ * aborting the first request.
+ */
+static int abort_on_full_fn(struct ceph_osd_request *req, void *arg)
+{
+ struct ceph_osd_client *osdc = req->r_osdc;
+ bool *victims = arg;
+
+ if ((req->r_flags & CEPH_OSD_FLAG_WRITE) &&
+ (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) ||
+ pool_full(osdc, req->r_t.base_oloc.pool))) {
+ if (!*victims) {
+ update_epoch_barrier(osdc, osdc->osdmap->epoch);
+ *victims = true;
+ }
+ abort_request(req, -ENOSPC);
+ }
+
+ return 0; /* continue iteration */
+}
+
+/*
+ * Drop all pending requests that are stalled waiting on a full condition to
+ * clear, and complete them with ENOSPC as the return code. Set the
+ * osdc->epoch_barrier to the latest map epoch that we've seen if any were
+ * cancelled.
+ */
+static void ceph_osdc_abort_on_full(struct ceph_osd_client *osdc)
+{
+ bool victims = false;
+
+ if (ceph_test_opt(osdc->client, ABORT_ON_FULL) &&
+ (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || have_pool_full(osdc)))
+ for_each_request(osdc, abort_on_full_fn, &victims);
+}
+
+static void check_pool_dne(struct ceph_osd_request *req)
+{
+ struct ceph_osd_client *osdc = req->r_osdc;
+ struct ceph_osdmap *map = osdc->osdmap;
+
+ verify_osdc_wrlocked(osdc);
+ WARN_ON(!map->epoch);
+
+ if (req->r_attempts) {
+ /*
+ * We sent a request earlier, which means that
+ * previously the pool existed, and now it does not
+ * (i.e., it was deleted).
+ */
+ req->r_map_dne_bound = map->epoch;
+ dout("%s req %p tid %llu pool disappeared\n", __func__, req,
+ req->r_tid);
+ } else {
+ dout("%s req %p tid %llu map_dne_bound %u have %u\n", __func__,
+ req, req->r_tid, req->r_map_dne_bound, map->epoch);
+ }
+
+ if (req->r_map_dne_bound) {
+ if (map->epoch >= req->r_map_dne_bound) {
+ /* we had a new enough map */
+ pr_info_ratelimited("tid %llu pool does not exist\n",
+ req->r_tid);
+ complete_request(req, -ENOENT);
+ }
+ } else {
+ send_map_check(req);
+ }
+}
+
+static void map_check_cb(struct ceph_mon_generic_request *greq)
+{
+ struct ceph_osd_client *osdc = &greq->monc->client->osdc;
+ struct ceph_osd_request *req;
+ u64 tid = greq->private_data;
+
+ WARN_ON(greq->result || !greq->u.newest);
+
+ down_write(&osdc->lock);
+ req = lookup_request_mc(&osdc->map_checks, tid);
+ if (!req) {
+ dout("%s tid %llu dne\n", __func__, tid);
+ goto out_unlock;
+ }
+
+ dout("%s req %p tid %llu map_dne_bound %u newest %llu\n", __func__,
+ req, req->r_tid, req->r_map_dne_bound, greq->u.newest);
+ if (!req->r_map_dne_bound)
+ req->r_map_dne_bound = greq->u.newest;
+ erase_request_mc(&osdc->map_checks, req);
+ check_pool_dne(req);
+
+ ceph_osdc_put_request(req);
+out_unlock:
+ up_write(&osdc->lock);
+}
+
+static void send_map_check(struct ceph_osd_request *req)
+{
+ struct ceph_osd_client *osdc = req->r_osdc;
+ struct ceph_osd_request *lookup_req;
+ int ret;
+
+ verify_osdc_wrlocked(osdc);
+
+ lookup_req = lookup_request_mc(&osdc->map_checks, req->r_tid);
+ if (lookup_req) {
+ WARN_ON(lookup_req != req);
+ return;
+ }
+
+ ceph_osdc_get_request(req);
+ insert_request_mc(&osdc->map_checks, req);
+ ret = ceph_monc_get_version_async(&osdc->client->monc, "osdmap",
+ map_check_cb, req->r_tid);
+ WARN_ON(ret);
+}
+
+/*
+ * lingering requests, watch/notify v2 infrastructure
+ */
+static void linger_release(struct kref *kref)
+{
+ struct ceph_osd_linger_request *lreq =
+ container_of(kref, struct ceph_osd_linger_request, kref);
+
+ dout("%s lreq %p reg_req %p ping_req %p\n", __func__, lreq,
+ lreq->reg_req, lreq->ping_req);
+ WARN_ON(!RB_EMPTY_NODE(&lreq->node));
+ WARN_ON(!RB_EMPTY_NODE(&lreq->osdc_node));
+ WARN_ON(!RB_EMPTY_NODE(&lreq->mc_node));
+ WARN_ON(!list_empty(&lreq->scan_item));
+ WARN_ON(!list_empty(&lreq->pending_lworks));
+ WARN_ON(lreq->osd);
+
+ if (lreq->request_pl)
+ ceph_pagelist_release(lreq->request_pl);
+ if (lreq->notify_id_pages)
+ ceph_release_page_vector(lreq->notify_id_pages, 1);
+
+ ceph_osdc_put_request(lreq->reg_req);
+ ceph_osdc_put_request(lreq->ping_req);
+ target_destroy(&lreq->t);
+ kfree(lreq);
+}
+
+static void linger_put(struct ceph_osd_linger_request *lreq)
+{
+ if (lreq)
+ kref_put(&lreq->kref, linger_release);
+}
+
+static struct ceph_osd_linger_request *
+linger_get(struct ceph_osd_linger_request *lreq)
+{
+ kref_get(&lreq->kref);
+ return lreq;
+}
+
+static struct ceph_osd_linger_request *
+linger_alloc(struct ceph_osd_client *osdc)
+{
+ struct ceph_osd_linger_request *lreq;
+
+ lreq = kzalloc(sizeof(*lreq), GFP_NOIO);
+ if (!lreq)
+ return NULL;
+
+ kref_init(&lreq->kref);
+ mutex_init(&lreq->lock);
+ RB_CLEAR_NODE(&lreq->node);
+ RB_CLEAR_NODE(&lreq->osdc_node);
+ RB_CLEAR_NODE(&lreq->mc_node);
+ INIT_LIST_HEAD(&lreq->scan_item);
+ INIT_LIST_HEAD(&lreq->pending_lworks);
+ init_completion(&lreq->reg_commit_wait);
+ init_completion(&lreq->notify_finish_wait);
+
+ lreq->osdc = osdc;
+ target_init(&lreq->t);
+
+ dout("%s lreq %p\n", __func__, lreq);
+ return lreq;
+}
+
+DEFINE_RB_INSDEL_FUNCS(linger, struct ceph_osd_linger_request, linger_id, node)
+DEFINE_RB_FUNCS(linger_osdc, struct ceph_osd_linger_request, linger_id, osdc_node)
+DEFINE_RB_FUNCS(linger_mc, struct ceph_osd_linger_request, linger_id, mc_node)
+
+/*
+ * Create linger request <-> OSD session relation.
+ *
+ * @lreq has to be registered, @osd may be homeless.
+ */
+static void link_linger(struct ceph_osd *osd,
+ struct ceph_osd_linger_request *lreq)
+{
+ verify_osd_locked(osd);
+ WARN_ON(!lreq->linger_id || lreq->osd);
+ dout("%s osd %p osd%d lreq %p linger_id %llu\n", __func__, osd,
+ osd->o_osd, lreq, lreq->linger_id);
+
+ if (!osd_homeless(osd))
+ __remove_osd_from_lru(osd);
+ else
+ atomic_inc(&osd->o_osdc->num_homeless);
+
+ get_osd(osd);
+ insert_linger(&osd->o_linger_requests, lreq);
+ lreq->osd = osd;
+}
+
+static void unlink_linger(struct ceph_osd *osd,
+ struct ceph_osd_linger_request *lreq)
+{
+ verify_osd_locked(osd);
+ WARN_ON(lreq->osd != osd);
+ dout("%s osd %p osd%d lreq %p linger_id %llu\n", __func__, osd,
+ osd->o_osd, lreq, lreq->linger_id);
+
+ lreq->osd = NULL;
+ erase_linger(&osd->o_linger_requests, lreq);
+ put_osd(osd);
+
+ if (!osd_homeless(osd))
+ maybe_move_osd_to_lru(osd);
+ else
+ atomic_dec(&osd->o_osdc->num_homeless);
+}
+
+static bool __linger_registered(struct ceph_osd_linger_request *lreq)
+{
+ verify_osdc_locked(lreq->osdc);
+
+ return !RB_EMPTY_NODE(&lreq->osdc_node);
+}
+
+static bool linger_registered(struct ceph_osd_linger_request *lreq)
+{
+ struct ceph_osd_client *osdc = lreq->osdc;
+ bool registered;
+
+ down_read(&osdc->lock);
+ registered = __linger_registered(lreq);
+ up_read(&osdc->lock);
+
+ return registered;
+}
+
+static void linger_register(struct ceph_osd_linger_request *lreq)
+{
+ struct ceph_osd_client *osdc = lreq->osdc;
+
+ verify_osdc_wrlocked(osdc);
+ WARN_ON(lreq->linger_id);
+
+ linger_get(lreq);
+ lreq->linger_id = ++osdc->last_linger_id;
+ insert_linger_osdc(&osdc->linger_requests, lreq);
+}
+
+static void linger_unregister(struct ceph_osd_linger_request *lreq)
+{
+ struct ceph_osd_client *osdc = lreq->osdc;
+
+ verify_osdc_wrlocked(osdc);
+
+ erase_linger_osdc(&osdc->linger_requests, lreq);
+ linger_put(lreq);
+}
+
+static void cancel_linger_request(struct ceph_osd_request *req)
+{
+ struct ceph_osd_linger_request *lreq = req->r_priv;
+
+ WARN_ON(!req->r_linger);
+ cancel_request(req);
+ linger_put(lreq);
+}
+
+struct linger_work {
+ struct work_struct work;
+ struct ceph_osd_linger_request *lreq;
+ struct list_head pending_item;
+ unsigned long queued_stamp;
+
+ union {
+ struct {
+ u64 notify_id;
+ u64 notifier_id;
+ void *payload; /* points into @msg front */
+ size_t payload_len;
+
+ struct ceph_msg *msg; /* for ceph_msg_put() */
+ } notify;
+ struct {
+ int err;
+ } error;
+ };
+};
+
+static struct linger_work *lwork_alloc(struct ceph_osd_linger_request *lreq,
+ work_func_t workfn)
+{
+ struct linger_work *lwork;
+
+ lwork = kzalloc(sizeof(*lwork), GFP_NOIO);
+ if (!lwork)
+ return NULL;
+
+ INIT_WORK(&lwork->work, workfn);
+ INIT_LIST_HEAD(&lwork->pending_item);
+ lwork->lreq = linger_get(lreq);
+
+ return lwork;
+}
+
+static void lwork_free(struct linger_work *lwork)
+{
+ struct ceph_osd_linger_request *lreq = lwork->lreq;
+
+ mutex_lock(&lreq->lock);
+ list_del(&lwork->pending_item);
+ mutex_unlock(&lreq->lock);
+
+ linger_put(lreq);
+ kfree(lwork);
+}
+
+static void lwork_queue(struct linger_work *lwork)
+{
+ struct ceph_osd_linger_request *lreq = lwork->lreq;
+ struct ceph_osd_client *osdc = lreq->osdc;
+
+ verify_lreq_locked(lreq);
+ WARN_ON(!list_empty(&lwork->pending_item));
+
+ lwork->queued_stamp = jiffies;
+ list_add_tail(&lwork->pending_item, &lreq->pending_lworks);
+ queue_work(osdc->notify_wq, &lwork->work);
+}
+
+static void do_watch_notify(struct work_struct *w)
+{
+ struct linger_work *lwork = container_of(w, struct linger_work, work);
+ struct ceph_osd_linger_request *lreq = lwork->lreq;
+
+ if (!linger_registered(lreq)) {
+ dout("%s lreq %p not registered\n", __func__, lreq);
+ goto out;
+ }
+
+ WARN_ON(!lreq->is_watch);
+ dout("%s lreq %p notify_id %llu notifier_id %llu payload_len %zu\n",
+ __func__, lreq, lwork->notify.notify_id, lwork->notify.notifier_id,
+ lwork->notify.payload_len);
+ lreq->wcb(lreq->data, lwork->notify.notify_id, lreq->linger_id,
+ lwork->notify.notifier_id, lwork->notify.payload,
+ lwork->notify.payload_len);
+
+out:
+ ceph_msg_put(lwork->notify.msg);
+ lwork_free(lwork);
+}
+
+static void do_watch_error(struct work_struct *w)
+{
+ struct linger_work *lwork = container_of(w, struct linger_work, work);
+ struct ceph_osd_linger_request *lreq = lwork->lreq;
+
+ if (!linger_registered(lreq)) {
+ dout("%s lreq %p not registered\n", __func__, lreq);
+ goto out;
+ }
+
+ dout("%s lreq %p err %d\n", __func__, lreq, lwork->error.err);
+ lreq->errcb(lreq->data, lreq->linger_id, lwork->error.err);
+
+out:
+ lwork_free(lwork);
+}
+
+static void queue_watch_error(struct ceph_osd_linger_request *lreq)
+{
+ struct linger_work *lwork;
+
+ lwork = lwork_alloc(lreq, do_watch_error);
+ if (!lwork) {
+ pr_err("failed to allocate error-lwork\n");
+ return;
+ }
+
+ lwork->error.err = lreq->last_error;
+ lwork_queue(lwork);
+}
+
+static void linger_reg_commit_complete(struct ceph_osd_linger_request *lreq,
+ int result)
+{
+ if (!completion_done(&lreq->reg_commit_wait)) {
+ lreq->reg_commit_error = (result <= 0 ? result : 0);
+ complete_all(&lreq->reg_commit_wait);
+ }
+}
+
+static void linger_commit_cb(struct ceph_osd_request *req)
+{
+ struct ceph_osd_linger_request *lreq = req->r_priv;
+
+ mutex_lock(&lreq->lock);
+ if (req != lreq->reg_req) {
+ dout("%s lreq %p linger_id %llu unknown req (%p != %p)\n",
+ __func__, lreq, lreq->linger_id, req, lreq->reg_req);
+ goto out;
+ }
+
+ dout("%s lreq %p linger_id %llu result %d\n", __func__, lreq,
+ lreq->linger_id, req->r_result);
+ linger_reg_commit_complete(lreq, req->r_result);
+ lreq->committed = true;
+
+ if (!lreq->is_watch) {
+ struct ceph_osd_data *osd_data =
+ osd_req_op_data(req, 0, notify, response_data);
+ void *p = page_address(osd_data->pages[0]);
+
+ WARN_ON(req->r_ops[0].op != CEPH_OSD_OP_NOTIFY ||
+ osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
+
+ /* make note of the notify_id */
+ if (req->r_ops[0].outdata_len >= sizeof(u64)) {
+ lreq->notify_id = ceph_decode_64(&p);
+ dout("lreq %p notify_id %llu\n", lreq,
+ lreq->notify_id);
+ } else {
+ dout("lreq %p no notify_id\n", lreq);
+ }
+ }
+
+out:
+ mutex_unlock(&lreq->lock);
+ linger_put(lreq);
+}
+
+static int normalize_watch_error(int err)
+{
+ /*
+ * Translate ENOENT -> ENOTCONN so that a delete->disconnection
+ * notification and a failure to reconnect because we raced with
+ * the delete appear the same to the user.
+ */
+ if (err == -ENOENT)
+ err = -ENOTCONN;
+
+ return err;
+}
+
+static void linger_reconnect_cb(struct ceph_osd_request *req)
+{
+ struct ceph_osd_linger_request *lreq = req->r_priv;
+
+ mutex_lock(&lreq->lock);
+ if (req != lreq->reg_req) {
+ dout("%s lreq %p linger_id %llu unknown req (%p != %p)\n",
+ __func__, lreq, lreq->linger_id, req, lreq->reg_req);
+ goto out;
+ }
+
+ dout("%s lreq %p linger_id %llu result %d last_error %d\n", __func__,
+ lreq, lreq->linger_id, req->r_result, lreq->last_error);
+ if (req->r_result < 0) {
+ if (!lreq->last_error) {
+ lreq->last_error = normalize_watch_error(req->r_result);
+ queue_watch_error(lreq);
+ }
+ }
+
+out:
+ mutex_unlock(&lreq->lock);
+ linger_put(lreq);
+}
+
+static void send_linger(struct ceph_osd_linger_request *lreq)
+{
+ struct ceph_osd_client *osdc = lreq->osdc;
+ struct ceph_osd_request *req;
+ int ret;
+
+ verify_osdc_wrlocked(osdc);
+ mutex_lock(&lreq->lock);
+ dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id);
+
+ if (lreq->reg_req) {
+ if (lreq->reg_req->r_osd)
+ cancel_linger_request(lreq->reg_req);
+ ceph_osdc_put_request(lreq->reg_req);
+ }
+
+ req = ceph_osdc_alloc_request(osdc, NULL, 1, true, GFP_NOIO);
+ BUG_ON(!req);
+
+ target_copy(&req->r_t, &lreq->t);
+ req->r_mtime = lreq->mtime;
+
+ if (lreq->is_watch && lreq->committed) {
+ osd_req_op_watch_init(req, 0, CEPH_OSD_WATCH_OP_RECONNECT,
+ lreq->linger_id, ++lreq->register_gen);
+ dout("lreq %p reconnect register_gen %u\n", lreq,
+ req->r_ops[0].watch.gen);
+ req->r_callback = linger_reconnect_cb;
+ } else {
+ if (lreq->is_watch) {
+ osd_req_op_watch_init(req, 0, CEPH_OSD_WATCH_OP_WATCH,
+ lreq->linger_id, 0);
+ } else {
+ lreq->notify_id = 0;
+
+ refcount_inc(&lreq->request_pl->refcnt);
+ osd_req_op_notify_init(req, 0, lreq->linger_id,
+ lreq->request_pl);
+ ceph_osd_data_pages_init(
+ osd_req_op_data(req, 0, notify, response_data),
+ lreq->notify_id_pages, PAGE_SIZE, 0, false, false);
+ }
+ dout("lreq %p register\n", lreq);
+ req->r_callback = linger_commit_cb;
+ }
+
+ ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
+ BUG_ON(ret);
+
+ req->r_priv = linger_get(lreq);
+ req->r_linger = true;
+ lreq->reg_req = req;
+ mutex_unlock(&lreq->lock);
+
+ submit_request(req, true);
+}
+
+static void linger_ping_cb(struct ceph_osd_request *req)
+{
+ struct ceph_osd_linger_request *lreq = req->r_priv;
+
+ mutex_lock(&lreq->lock);
+ if (req != lreq->ping_req) {
+ dout("%s lreq %p linger_id %llu unknown req (%p != %p)\n",
+ __func__, lreq, lreq->linger_id, req, lreq->ping_req);
+ goto out;
+ }
+
+ dout("%s lreq %p linger_id %llu result %d ping_sent %lu last_error %d\n",
+ __func__, lreq, lreq->linger_id, req->r_result, lreq->ping_sent,
+ lreq->last_error);
+ if (lreq->register_gen == req->r_ops[0].watch.gen) {
+ if (!req->r_result) {
+ lreq->watch_valid_thru = lreq->ping_sent;
+ } else if (!lreq->last_error) {
+ lreq->last_error = normalize_watch_error(req->r_result);
+ queue_watch_error(lreq);
+ }
+ } else {
+ dout("lreq %p register_gen %u ignoring old pong %u\n", lreq,
+ lreq->register_gen, req->r_ops[0].watch.gen);
+ }
+
+out:
+ mutex_unlock(&lreq->lock);
+ linger_put(lreq);
+}
+
+static void send_linger_ping(struct ceph_osd_linger_request *lreq)
+{
+ struct ceph_osd_client *osdc = lreq->osdc;
+ struct ceph_osd_request *req;
+ int ret;
+
+ if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD)) {
+ dout("%s PAUSERD\n", __func__);
+ return;
+ }
+
+ lreq->ping_sent = jiffies;
+ dout("%s lreq %p linger_id %llu ping_sent %lu register_gen %u\n",
+ __func__, lreq, lreq->linger_id, lreq->ping_sent,
+ lreq->register_gen);
+
+ if (lreq->ping_req) {
+ if (lreq->ping_req->r_osd)
+ cancel_linger_request(lreq->ping_req);
+ ceph_osdc_put_request(lreq->ping_req);
+ }
+
+ req = ceph_osdc_alloc_request(osdc, NULL, 1, true, GFP_NOIO);
+ BUG_ON(!req);
+
+ target_copy(&req->r_t, &lreq->t);
+ osd_req_op_watch_init(req, 0, CEPH_OSD_WATCH_OP_PING, lreq->linger_id,
+ lreq->register_gen);
+ req->r_callback = linger_ping_cb;
+
+ ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
+ BUG_ON(ret);
+
+ req->r_priv = linger_get(lreq);
+ req->r_linger = true;
+ lreq->ping_req = req;
+
+ ceph_osdc_get_request(req);
+ account_request(req);
+ req->r_tid = atomic64_inc_return(&osdc->last_tid);
+ link_request(lreq->osd, req);
+ send_request(req);
+}
+
+static void linger_submit(struct ceph_osd_linger_request *lreq)
+{
+ struct ceph_osd_client *osdc = lreq->osdc;
+ struct ceph_osd *osd;
+
+ down_write(&osdc->lock);
+ linger_register(lreq);
+
+ calc_target(osdc, &lreq->t, false);
+ osd = lookup_create_osd(osdc, lreq->t.osd, true);
+ link_linger(osd, lreq);
+
+ send_linger(lreq);
+ up_write(&osdc->lock);
+}
+
+static void cancel_linger_map_check(struct ceph_osd_linger_request *lreq)
+{
+ struct ceph_osd_client *osdc = lreq->osdc;
+ struct ceph_osd_linger_request *lookup_lreq;
+
+ verify_osdc_wrlocked(osdc);
+
+ lookup_lreq = lookup_linger_mc(&osdc->linger_map_checks,
+ lreq->linger_id);
+ if (!lookup_lreq)
+ return;
+
+ WARN_ON(lookup_lreq != lreq);
+ erase_linger_mc(&osdc->linger_map_checks, lreq);
+ linger_put(lreq);
+}
+
+/*
+ * @lreq has to be both registered and linked.
+ */
+static void __linger_cancel(struct ceph_osd_linger_request *lreq)
+{
+ if (lreq->ping_req && lreq->ping_req->r_osd)
+ cancel_linger_request(lreq->ping_req);
+ if (lreq->reg_req && lreq->reg_req->r_osd)
+ cancel_linger_request(lreq->reg_req);
+ cancel_linger_map_check(lreq);
+ unlink_linger(lreq->osd, lreq);
+ linger_unregister(lreq);
+}
+
+static void linger_cancel(struct ceph_osd_linger_request *lreq)
+{
+ struct ceph_osd_client *osdc = lreq->osdc;
+
+ down_write(&osdc->lock);
+ if (__linger_registered(lreq))
+ __linger_cancel(lreq);
+ up_write(&osdc->lock);
+}
+
+static void send_linger_map_check(struct ceph_osd_linger_request *lreq);
+
+static void check_linger_pool_dne(struct ceph_osd_linger_request *lreq)
+{
+ struct ceph_osd_client *osdc = lreq->osdc;
+ struct ceph_osdmap *map = osdc->osdmap;
+
+ verify_osdc_wrlocked(osdc);
+ WARN_ON(!map->epoch);
+
+ if (lreq->register_gen) {
+ lreq->map_dne_bound = map->epoch;
+ dout("%s lreq %p linger_id %llu pool disappeared\n", __func__,
+ lreq, lreq->linger_id);
+ } else {
+ dout("%s lreq %p linger_id %llu map_dne_bound %u have %u\n",
+ __func__, lreq, lreq->linger_id, lreq->map_dne_bound,
+ map->epoch);
+ }
+
+ if (lreq->map_dne_bound) {
+ if (map->epoch >= lreq->map_dne_bound) {
+ /* we had a new enough map */
+ pr_info("linger_id %llu pool does not exist\n",
+ lreq->linger_id);
+ linger_reg_commit_complete(lreq, -ENOENT);
+ __linger_cancel(lreq);
+ }
+ } else {
+ send_linger_map_check(lreq);
+ }
+}
+
+static void linger_map_check_cb(struct ceph_mon_generic_request *greq)
+{
+ struct ceph_osd_client *osdc = &greq->monc->client->osdc;
+ struct ceph_osd_linger_request *lreq;
+ u64 linger_id = greq->private_data;
+
+ WARN_ON(greq->result || !greq->u.newest);
+
+ down_write(&osdc->lock);
+ lreq = lookup_linger_mc(&osdc->linger_map_checks, linger_id);
+ if (!lreq) {
+ dout("%s linger_id %llu dne\n", __func__, linger_id);
+ goto out_unlock;
+ }
+
+ dout("%s lreq %p linger_id %llu map_dne_bound %u newest %llu\n",
+ __func__, lreq, lreq->linger_id, lreq->map_dne_bound,
+ greq->u.newest);
+ if (!lreq->map_dne_bound)
+ lreq->map_dne_bound = greq->u.newest;
+ erase_linger_mc(&osdc->linger_map_checks, lreq);
+ check_linger_pool_dne(lreq);
+
+ linger_put(lreq);
+out_unlock:
+ up_write(&osdc->lock);
+}
+
+static void send_linger_map_check(struct ceph_osd_linger_request *lreq)
+{
+ struct ceph_osd_client *osdc = lreq->osdc;
+ struct ceph_osd_linger_request *lookup_lreq;
+ int ret;
+
+ verify_osdc_wrlocked(osdc);
+
+ lookup_lreq = lookup_linger_mc(&osdc->linger_map_checks,
+ lreq->linger_id);
+ if (lookup_lreq) {
+ WARN_ON(lookup_lreq != lreq);
+ return;
+ }
+
+ linger_get(lreq);
+ insert_linger_mc(&osdc->linger_map_checks, lreq);
+ ret = ceph_monc_get_version_async(&osdc->client->monc, "osdmap",
+ linger_map_check_cb, lreq->linger_id);
+ WARN_ON(ret);
+}
+
+static int linger_reg_commit_wait(struct ceph_osd_linger_request *lreq)
+{
+ int ret;
+
+ dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id);
+ ret = wait_for_completion_killable(&lreq->reg_commit_wait);
+ return ret ?: lreq->reg_commit_error;
+}
+
+static int linger_notify_finish_wait(struct ceph_osd_linger_request *lreq,
+ unsigned long timeout)
+{
+ long left;
+
+ dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id);
+ left = wait_for_completion_killable_timeout(&lreq->notify_finish_wait,
+ ceph_timeout_jiffies(timeout));
+ if (left <= 0)
+ left = left ?: -ETIMEDOUT;
+ else
+ left = lreq->notify_finish_error; /* completed */
+
+ return left;
+}
+
+/*
+ * Timeout callback, called every N seconds. When 1 or more OSD
+ * requests has been active for more than N seconds, we send a keepalive
+ * (tag + timestamp) to its OSD to ensure any communications channel
+ * reset is detected.
+ */
+static void handle_timeout(struct work_struct *work)
+{
+ struct ceph_osd_client *osdc =
+ container_of(work, struct ceph_osd_client, timeout_work.work);
+ struct ceph_options *opts = osdc->client->options;
+ unsigned long cutoff = jiffies - opts->osd_keepalive_timeout;
+ unsigned long expiry_cutoff = jiffies - opts->osd_request_timeout;
+ LIST_HEAD(slow_osds);
+ struct rb_node *n, *p;
+
+ dout("%s osdc %p\n", __func__, osdc);
+ down_write(&osdc->lock);
+
+ /*
+ * ping osds that are a bit slow. this ensures that if there
+ * is a break in the TCP connection we will notice, and reopen
+ * a connection with that osd (from the fault callback).
+ */
+ for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
+ struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
+ bool found = false;
+
+ for (p = rb_first(&osd->o_requests); p; ) {
+ struct ceph_osd_request *req =
+ rb_entry(p, struct ceph_osd_request, r_node);
+
+ p = rb_next(p); /* abort_request() */
+
+ if (time_before(req->r_stamp, cutoff)) {
+ dout(" req %p tid %llu on osd%d is laggy\n",
+ req, req->r_tid, osd->o_osd);
+ found = true;
+ }
+ if (opts->osd_request_timeout &&
+ time_before(req->r_start_stamp, expiry_cutoff)) {
+ pr_err_ratelimited("tid %llu on osd%d timeout\n",
+ req->r_tid, osd->o_osd);
+ abort_request(req, -ETIMEDOUT);
+ }
+ }
+ for (p = rb_first(&osd->o_linger_requests); p; p = rb_next(p)) {
+ struct ceph_osd_linger_request *lreq =
+ rb_entry(p, struct ceph_osd_linger_request, node);
+
+ dout(" lreq %p linger_id %llu is served by osd%d\n",
+ lreq, lreq->linger_id, osd->o_osd);
+ found = true;
+
+ mutex_lock(&lreq->lock);
+ if (lreq->is_watch && lreq->committed && !lreq->last_error)
+ send_linger_ping(lreq);
+ mutex_unlock(&lreq->lock);
+ }
+
+ if (found)
+ list_move_tail(&osd->o_keepalive_item, &slow_osds);
+ }
+
+ if (opts->osd_request_timeout) {
+ for (p = rb_first(&osdc->homeless_osd.o_requests); p; ) {
+ struct ceph_osd_request *req =
+ rb_entry(p, struct ceph_osd_request, r_node);
+
+ p = rb_next(p); /* abort_request() */
+
+ if (time_before(req->r_start_stamp, expiry_cutoff)) {
+ pr_err_ratelimited("tid %llu on osd%d timeout\n",
+ req->r_tid, osdc->homeless_osd.o_osd);
+ abort_request(req, -ETIMEDOUT);
+ }
+ }
+ }
+
+ if (atomic_read(&osdc->num_homeless) || !list_empty(&slow_osds))
+ maybe_request_map(osdc);
+
+ while (!list_empty(&slow_osds)) {
+ struct ceph_osd *osd = list_first_entry(&slow_osds,
+ struct ceph_osd,
+ o_keepalive_item);
+ list_del_init(&osd->o_keepalive_item);
+ ceph_con_keepalive(&osd->o_con);
+ }
+
+ up_write(&osdc->lock);
+ schedule_delayed_work(&osdc->timeout_work,
+ osdc->client->options->osd_keepalive_timeout);
+}
+
+static void handle_osds_timeout(struct work_struct *work)
+{
+ struct ceph_osd_client *osdc =
+ container_of(work, struct ceph_osd_client,
+ osds_timeout_work.work);
+ unsigned long delay = osdc->client->options->osd_idle_ttl / 4;
+ struct ceph_osd *osd, *nosd;
+
+ dout("%s osdc %p\n", __func__, osdc);
+ down_write(&osdc->lock);
+ list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) {
+ if (time_before(jiffies, osd->lru_ttl))
+ break;
+
+ WARN_ON(!RB_EMPTY_ROOT(&osd->o_requests));
+ WARN_ON(!RB_EMPTY_ROOT(&osd->o_linger_requests));
+ close_osd(osd);
+ }
+
+ up_write(&osdc->lock);
+ schedule_delayed_work(&osdc->osds_timeout_work,
+ round_jiffies_relative(delay));
+}
+
+static int ceph_oloc_decode(void **p, void *end,
+ struct ceph_object_locator *oloc)
+{
+ u8 struct_v, struct_cv;
+ u32 len;
+ void *struct_end;
+ int ret = 0;
+
+ ceph_decode_need(p, end, 1 + 1 + 4, e_inval);
+ struct_v = ceph_decode_8(p);
+ struct_cv = ceph_decode_8(p);
+ if (struct_v < 3) {
+ pr_warn("got v %d < 3 cv %d of ceph_object_locator\n",
+ struct_v, struct_cv);
+ goto e_inval;
+ }
+ if (struct_cv > 6) {
+ pr_warn("got v %d cv %d > 6 of ceph_object_locator\n",
+ struct_v, struct_cv);
+ goto e_inval;
+ }
+ len = ceph_decode_32(p);
+ ceph_decode_need(p, end, len, e_inval);
+ struct_end = *p + len;
+
+ oloc->pool = ceph_decode_64(p);
+ *p += 4; /* skip preferred */
+
+ len = ceph_decode_32(p);
+ if (len > 0) {
+ pr_warn("ceph_object_locator::key is set\n");
+ goto e_inval;
+ }
+
+ if (struct_v >= 5) {
+ bool changed = false;
+
+ len = ceph_decode_32(p);
+ if (len > 0) {
+ ceph_decode_need(p, end, len, e_inval);
+ if (!oloc->pool_ns ||
+ ceph_compare_string(oloc->pool_ns, *p, len))
+ changed = true;
+ *p += len;
+ } else {
+ if (oloc->pool_ns)
+ changed = true;
+ }
+ if (changed) {
+ /* redirect changes namespace */
+ pr_warn("ceph_object_locator::nspace is changed\n");
+ goto e_inval;
+ }
+ }
+
+ if (struct_v >= 6) {
+ s64 hash = ceph_decode_64(p);
+ if (hash != -1) {
+ pr_warn("ceph_object_locator::hash is set\n");
+ goto e_inval;
+ }
+ }
+
+ /* skip the rest */
+ *p = struct_end;
+out:
+ return ret;
+
+e_inval:
+ ret = -EINVAL;
+ goto out;
+}
+
+static int ceph_redirect_decode(void **p, void *end,
+ struct ceph_request_redirect *redir)
+{
+ u8 struct_v, struct_cv;
+ u32 len;
+ void *struct_end;
+ int ret;
+
+ ceph_decode_need(p, end, 1 + 1 + 4, e_inval);
+ struct_v = ceph_decode_8(p);
+ struct_cv = ceph_decode_8(p);
+ if (struct_cv > 1) {
+ pr_warn("got v %d cv %d > 1 of ceph_request_redirect\n",
+ struct_v, struct_cv);
+ goto e_inval;
+ }
+ len = ceph_decode_32(p);
+ ceph_decode_need(p, end, len, e_inval);
+ struct_end = *p + len;
+
+ ret = ceph_oloc_decode(p, end, &redir->oloc);
+ if (ret)
+ goto out;
+
+ len = ceph_decode_32(p);
+ if (len > 0) {
+ pr_warn("ceph_request_redirect::object_name is set\n");
+ goto e_inval;
+ }
+
+ /* skip the rest */
+ *p = struct_end;
+out:
+ return ret;
+
+e_inval:
+ ret = -EINVAL;
+ goto out;
+}
+
+struct MOSDOpReply {
+ struct ceph_pg pgid;
+ u64 flags;
+ int result;
+ u32 epoch;
+ int num_ops;
+ u32 outdata_len[CEPH_OSD_MAX_OPS];
+ s32 rval[CEPH_OSD_MAX_OPS];
+ int retry_attempt;
+ struct ceph_eversion replay_version;
+ u64 user_version;
+ struct ceph_request_redirect redirect;
+};
+
+static int decode_MOSDOpReply(const struct ceph_msg *msg, struct MOSDOpReply *m)
+{
+ void *p = msg->front.iov_base;
+ void *const end = p + msg->front.iov_len;
+ u16 version = le16_to_cpu(msg->hdr.version);
+ struct ceph_eversion bad_replay_version;
+ u8 decode_redir;
+ u32 len;
+ int ret;
+ int i;
+
+ ceph_decode_32_safe(&p, end, len, e_inval);
+ ceph_decode_need(&p, end, len, e_inval);
+ p += len; /* skip oid */
+
+ ret = ceph_decode_pgid(&p, end, &m->pgid);
+ if (ret)
+ return ret;
+
+ ceph_decode_64_safe(&p, end, m->flags, e_inval);
+ ceph_decode_32_safe(&p, end, m->result, e_inval);
+ ceph_decode_need(&p, end, sizeof(bad_replay_version), e_inval);
+ memcpy(&bad_replay_version, p, sizeof(bad_replay_version));
+ p += sizeof(bad_replay_version);
+ ceph_decode_32_safe(&p, end, m->epoch, e_inval);
+
+ ceph_decode_32_safe(&p, end, m->num_ops, e_inval);
+ if (m->num_ops > ARRAY_SIZE(m->outdata_len))
+ goto e_inval;
+
+ ceph_decode_need(&p, end, m->num_ops * sizeof(struct ceph_osd_op),
+ e_inval);
+ for (i = 0; i < m->num_ops; i++) {
+ struct ceph_osd_op *op = p;
+
+ m->outdata_len[i] = le32_to_cpu(op->payload_len);
+ p += sizeof(*op);
+ }
+
+ ceph_decode_32_safe(&p, end, m->retry_attempt, e_inval);
+ for (i = 0; i < m->num_ops; i++)
+ ceph_decode_32_safe(&p, end, m->rval[i], e_inval);
+
+ if (version >= 5) {
+ ceph_decode_need(&p, end, sizeof(m->replay_version), e_inval);
+ memcpy(&m->replay_version, p, sizeof(m->replay_version));
+ p += sizeof(m->replay_version);
+ ceph_decode_64_safe(&p, end, m->user_version, e_inval);
+ } else {
+ m->replay_version = bad_replay_version; /* struct */
+ m->user_version = le64_to_cpu(m->replay_version.version);
+ }
+
+ if (version >= 6) {
+ if (version >= 7)
+ ceph_decode_8_safe(&p, end, decode_redir, e_inval);
+ else
+ decode_redir = 1;
+ } else {
+ decode_redir = 0;
+ }
+
+ if (decode_redir) {
+ ret = ceph_redirect_decode(&p, end, &m->redirect);
+ if (ret)
+ return ret;
+ } else {
+ ceph_oloc_init(&m->redirect.oloc);
+ }
+
+ return 0;
+
+e_inval:
+ return -EINVAL;
+}
+
+/*
+ * Handle MOSDOpReply. Set ->r_result and call the callback if it is
+ * specified.
+ */
+static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg)
+{
+ struct ceph_osd_client *osdc = osd->o_osdc;
+ struct ceph_osd_request *req;
+ struct MOSDOpReply m;
+ u64 tid = le64_to_cpu(msg->hdr.tid);
+ u32 data_len = 0;
+ int ret;
+ int i;
+
+ dout("%s msg %p tid %llu\n", __func__, msg, tid);
+
+ down_read(&osdc->lock);
+ if (!osd_registered(osd)) {
+ dout("%s osd%d unknown\n", __func__, osd->o_osd);
+ goto out_unlock_osdc;
+ }
+ WARN_ON(osd->o_osd != le64_to_cpu(msg->hdr.src.num));
+
+ mutex_lock(&osd->lock);
+ req = lookup_request(&osd->o_requests, tid);
+ if (!req) {
+ dout("%s osd%d tid %llu unknown\n", __func__, osd->o_osd, tid);
+ goto out_unlock_session;
+ }
+
+ m.redirect.oloc.pool_ns = req->r_t.target_oloc.pool_ns;
+ ret = decode_MOSDOpReply(msg, &m);
+ m.redirect.oloc.pool_ns = NULL;
+ if (ret) {
+ pr_err("failed to decode MOSDOpReply for tid %llu: %d\n",
+ req->r_tid, ret);
+ ceph_msg_dump(msg);
+ goto fail_request;
+ }
+ dout("%s req %p tid %llu flags 0x%llx pgid %llu.%x epoch %u attempt %d v %u'%llu uv %llu\n",
+ __func__, req, req->r_tid, m.flags, m.pgid.pool, m.pgid.seed,
+ m.epoch, m.retry_attempt, le32_to_cpu(m.replay_version.epoch),
+ le64_to_cpu(m.replay_version.version), m.user_version);
+
+ if (m.retry_attempt >= 0) {
+ if (m.retry_attempt != req->r_attempts - 1) {
+ dout("req %p tid %llu retry_attempt %d != %d, ignoring\n",
+ req, req->r_tid, m.retry_attempt,
+ req->r_attempts - 1);
+ goto out_unlock_session;
+ }
+ } else {
+ WARN_ON(1); /* MOSDOpReply v4 is assumed */
+ }
+
+ if (!ceph_oloc_empty(&m.redirect.oloc)) {
+ dout("req %p tid %llu redirect pool %lld\n", req, req->r_tid,
+ m.redirect.oloc.pool);
+ unlink_request(osd, req);
+ mutex_unlock(&osd->lock);
+
+ /*
+ * Not ceph_oloc_copy() - changing pool_ns is not
+ * supported.
+ */
+ req->r_t.target_oloc.pool = m.redirect.oloc.pool;
+ req->r_flags |= CEPH_OSD_FLAG_REDIRECTED |
+ CEPH_OSD_FLAG_IGNORE_OVERLAY |
+ CEPH_OSD_FLAG_IGNORE_CACHE;
+ req->r_tid = 0;
+ __submit_request(req, false);
+ goto out_unlock_osdc;
+ }
+
+ if (m.result == -EAGAIN) {
+ dout("req %p tid %llu EAGAIN\n", req, req->r_tid);
+ unlink_request(osd, req);
+ mutex_unlock(&osd->lock);
+
+ /*
+ * The object is missing on the replica or not (yet)
+ * readable. Clear pgid to force a resend to the primary
+ * via legacy_change.
+ */
+ req->r_t.pgid.pool = 0;
+ req->r_t.pgid.seed = 0;
+ WARN_ON(!req->r_t.used_replica);
+ req->r_flags &= ~(CEPH_OSD_FLAG_BALANCE_READS |
+ CEPH_OSD_FLAG_LOCALIZE_READS);
+ req->r_tid = 0;
+ __submit_request(req, false);
+ goto out_unlock_osdc;
+ }
+
+ if (m.num_ops != req->r_num_ops) {
+ pr_err("num_ops %d != %d for tid %llu\n", m.num_ops,
+ req->r_num_ops, req->r_tid);
+ goto fail_request;
+ }
+ for (i = 0; i < req->r_num_ops; i++) {
+ dout(" req %p tid %llu op %d rval %d len %u\n", req,
+ req->r_tid, i, m.rval[i], m.outdata_len[i]);
+ req->r_ops[i].rval = m.rval[i];
+ req->r_ops[i].outdata_len = m.outdata_len[i];
+ data_len += m.outdata_len[i];
+ }
+ if (data_len != le32_to_cpu(msg->hdr.data_len)) {
+ pr_err("sum of lens %u != %u for tid %llu\n", data_len,
+ le32_to_cpu(msg->hdr.data_len), req->r_tid);
+ goto fail_request;
+ }
+ dout("%s req %p tid %llu result %d data_len %u\n", __func__,
+ req, req->r_tid, m.result, data_len);
+
+ /*
+ * Since we only ever request ONDISK, we should only ever get
+ * one (type of) reply back.
+ */
+ WARN_ON(!(m.flags & CEPH_OSD_FLAG_ONDISK));
+ req->r_version = m.user_version;
+ req->r_result = m.result ?: data_len;
+ finish_request(req);
+ mutex_unlock(&osd->lock);
+ up_read(&osdc->lock);
+
+ __complete_request(req);
+ return;
+
+fail_request:
+ complete_request(req, -EIO);
+out_unlock_session:
+ mutex_unlock(&osd->lock);
+out_unlock_osdc:
+ up_read(&osdc->lock);
+}
+
+static void set_pool_was_full(struct ceph_osd_client *osdc)
+{
+ struct rb_node *n;
+
+ for (n = rb_first(&osdc->osdmap->pg_pools); n; n = rb_next(n)) {
+ struct ceph_pg_pool_info *pi =
+ rb_entry(n, struct ceph_pg_pool_info, node);
+
+ pi->was_full = __pool_full(pi);
+ }
+}
+
+static bool pool_cleared_full(struct ceph_osd_client *osdc, s64 pool_id)
+{
+ struct ceph_pg_pool_info *pi;
+
+ pi = ceph_pg_pool_by_id(osdc->osdmap, pool_id);
+ if (!pi)
+ return false;
+
+ return pi->was_full && !__pool_full(pi);
+}
+
+static enum calc_target_result
+recalc_linger_target(struct ceph_osd_linger_request *lreq)
+{
+ struct ceph_osd_client *osdc = lreq->osdc;
+ enum calc_target_result ct_res;
+
+ ct_res = calc_target(osdc, &lreq->t, true);
+ if (ct_res == CALC_TARGET_NEED_RESEND) {
+ struct ceph_osd *osd;
+
+ osd = lookup_create_osd(osdc, lreq->t.osd, true);
+ if (osd != lreq->osd) {
+ unlink_linger(lreq->osd, lreq);
+ link_linger(osd, lreq);
+ }
+ }
+
+ return ct_res;
+}
+
+/*
+ * Requeue requests whose mapping to an OSD has changed.
+ */
+static void scan_requests(struct ceph_osd *osd,
+ bool force_resend,
+ bool cleared_full,
+ bool check_pool_cleared_full,
+ struct rb_root *need_resend,
+ struct list_head *need_resend_linger)
+{
+ struct ceph_osd_client *osdc = osd->o_osdc;
+ struct rb_node *n;
+ bool force_resend_writes;
+
+ for (n = rb_first(&osd->o_linger_requests); n; ) {
+ struct ceph_osd_linger_request *lreq =
+ rb_entry(n, struct ceph_osd_linger_request, node);
+ enum calc_target_result ct_res;
+
+ n = rb_next(n); /* recalc_linger_target() */
+
+ dout("%s lreq %p linger_id %llu\n", __func__, lreq,
+ lreq->linger_id);
+ ct_res = recalc_linger_target(lreq);
+ switch (ct_res) {
+ case CALC_TARGET_NO_ACTION:
+ force_resend_writes = cleared_full ||
+ (check_pool_cleared_full &&
+ pool_cleared_full(osdc, lreq->t.base_oloc.pool));
+ if (!force_resend && !force_resend_writes)
+ break;
+
+ fallthrough;
+ case CALC_TARGET_NEED_RESEND:
+ cancel_linger_map_check(lreq);
+ /*
+ * scan_requests() for the previous epoch(s)
+ * may have already added it to the list, since
+ * it's not unlinked here.
+ */
+ if (list_empty(&lreq->scan_item))
+ list_add_tail(&lreq->scan_item, need_resend_linger);
+ break;
+ case CALC_TARGET_POOL_DNE:
+ list_del_init(&lreq->scan_item);
+ check_linger_pool_dne(lreq);
+ break;
+ }
+ }
+
+ for (n = rb_first(&osd->o_requests); n; ) {
+ struct ceph_osd_request *req =
+ rb_entry(n, struct ceph_osd_request, r_node);
+ enum calc_target_result ct_res;
+
+ n = rb_next(n); /* unlink_request(), check_pool_dne() */
+
+ dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
+ ct_res = calc_target(osdc, &req->r_t, false);
+ switch (ct_res) {
+ case CALC_TARGET_NO_ACTION:
+ force_resend_writes = cleared_full ||
+ (check_pool_cleared_full &&
+ pool_cleared_full(osdc, req->r_t.base_oloc.pool));
+ if (!force_resend &&
+ (!(req->r_flags & CEPH_OSD_FLAG_WRITE) ||
+ !force_resend_writes))
+ break;
+
+ fallthrough;
+ case CALC_TARGET_NEED_RESEND:
+ cancel_map_check(req);
+ unlink_request(osd, req);
+ insert_request(need_resend, req);
+ break;
+ case CALC_TARGET_POOL_DNE:
+ check_pool_dne(req);
+ break;
+ }
+ }
+}
+
+static int handle_one_map(struct ceph_osd_client *osdc,
+ void *p, void *end, bool incremental,
+ struct rb_root *need_resend,
+ struct list_head *need_resend_linger)
+{
+ struct ceph_osdmap *newmap;
+ struct rb_node *n;
+ bool skipped_map = false;
+ bool was_full;
+
+ was_full = ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL);
+ set_pool_was_full(osdc);
+
+ if (incremental)
+ newmap = osdmap_apply_incremental(&p, end,
+ ceph_msgr2(osdc->client),
+ osdc->osdmap);
+ else
+ newmap = ceph_osdmap_decode(&p, end, ceph_msgr2(osdc->client));
+ if (IS_ERR(newmap))
+ return PTR_ERR(newmap);
+
+ if (newmap != osdc->osdmap) {
+ /*
+ * Preserve ->was_full before destroying the old map.
+ * For pools that weren't in the old map, ->was_full
+ * should be false.
+ */
+ for (n = rb_first(&newmap->pg_pools); n; n = rb_next(n)) {
+ struct ceph_pg_pool_info *pi =
+ rb_entry(n, struct ceph_pg_pool_info, node);
+ struct ceph_pg_pool_info *old_pi;
+
+ old_pi = ceph_pg_pool_by_id(osdc->osdmap, pi->id);
+ if (old_pi)
+ pi->was_full = old_pi->was_full;
+ else
+ WARN_ON(pi->was_full);
+ }
+
+ if (osdc->osdmap->epoch &&
+ osdc->osdmap->epoch + 1 < newmap->epoch) {
+ WARN_ON(incremental);
+ skipped_map = true;
+ }
+
+ ceph_osdmap_destroy(osdc->osdmap);
+ osdc->osdmap = newmap;
+ }
+
+ was_full &= !ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL);
+ scan_requests(&osdc->homeless_osd, skipped_map, was_full, true,
+ need_resend, need_resend_linger);
+
+ for (n = rb_first(&osdc->osds); n; ) {
+ struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
+
+ n = rb_next(n); /* close_osd() */
+
+ scan_requests(osd, skipped_map, was_full, true, need_resend,
+ need_resend_linger);
+ if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
+ memcmp(&osd->o_con.peer_addr,
+ ceph_osd_addr(osdc->osdmap, osd->o_osd),
+ sizeof(struct ceph_entity_addr)))
+ close_osd(osd);
+ }
+
+ return 0;
+}
+
+static void kick_requests(struct ceph_osd_client *osdc,
+ struct rb_root *need_resend,
+ struct list_head *need_resend_linger)
+{
+ struct ceph_osd_linger_request *lreq, *nlreq;
+ enum calc_target_result ct_res;
+ struct rb_node *n;
+
+ /* make sure need_resend targets reflect latest map */
+ for (n = rb_first(need_resend); n; ) {
+ struct ceph_osd_request *req =
+ rb_entry(n, struct ceph_osd_request, r_node);
+
+ n = rb_next(n);
+
+ if (req->r_t.epoch < osdc->osdmap->epoch) {
+ ct_res = calc_target(osdc, &req->r_t, false);
+ if (ct_res == CALC_TARGET_POOL_DNE) {
+ erase_request(need_resend, req);
+ check_pool_dne(req);
+ }
+ }
+ }
+
+ for (n = rb_first(need_resend); n; ) {
+ struct ceph_osd_request *req =
+ rb_entry(n, struct ceph_osd_request, r_node);
+ struct ceph_osd *osd;
+
+ n = rb_next(n);
+ erase_request(need_resend, req); /* before link_request() */
+
+ osd = lookup_create_osd(osdc, req->r_t.osd, true);
+ link_request(osd, req);
+ if (!req->r_linger) {
+ if (!osd_homeless(osd) && !req->r_t.paused)
+ send_request(req);
+ } else {
+ cancel_linger_request(req);
+ }
+ }
+
+ list_for_each_entry_safe(lreq, nlreq, need_resend_linger, scan_item) {
+ if (!osd_homeless(lreq->osd))
+ send_linger(lreq);
+
+ list_del_init(&lreq->scan_item);
+ }
+}
+
+/*
+ * Process updated osd map.
+ *
+ * The message contains any number of incremental and full maps, normally
+ * indicating some sort of topology change in the cluster. Kick requests
+ * off to different OSDs as needed.
+ */
+void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
+{
+ void *p = msg->front.iov_base;
+ void *const end = p + msg->front.iov_len;
+ u32 nr_maps, maplen;
+ u32 epoch;
+ struct ceph_fsid fsid;
+ struct rb_root need_resend = RB_ROOT;
+ LIST_HEAD(need_resend_linger);
+ bool handled_incremental = false;
+ bool was_pauserd, was_pausewr;
+ bool pauserd, pausewr;
+ int err;
+
+ dout("%s have %u\n", __func__, osdc->osdmap->epoch);
+ down_write(&osdc->lock);
+
+ /* verify fsid */
+ ceph_decode_need(&p, end, sizeof(fsid), bad);
+ ceph_decode_copy(&p, &fsid, sizeof(fsid));
+ if (ceph_check_fsid(osdc->client, &fsid) < 0)
+ goto bad;
+
+ was_pauserd = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD);
+ was_pausewr = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR) ||
+ ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) ||
+ have_pool_full(osdc);
+
+ /* incremental maps */
+ ceph_decode_32_safe(&p, end, nr_maps, bad);
+ dout(" %d inc maps\n", nr_maps);
+ while (nr_maps > 0) {
+ ceph_decode_need(&p, end, 2*sizeof(u32), bad);
+ epoch = ceph_decode_32(&p);
+ maplen = ceph_decode_32(&p);
+ ceph_decode_need(&p, end, maplen, bad);
+ if (osdc->osdmap->epoch &&
+ osdc->osdmap->epoch + 1 == epoch) {
+ dout("applying incremental map %u len %d\n",
+ epoch, maplen);
+ err = handle_one_map(osdc, p, p + maplen, true,
+ &need_resend, &need_resend_linger);
+ if (err)
+ goto bad;
+ handled_incremental = true;
+ } else {
+ dout("ignoring incremental map %u len %d\n",
+ epoch, maplen);
+ }
+ p += maplen;
+ nr_maps--;
+ }
+ if (handled_incremental)
+ goto done;
+
+ /* full maps */
+ ceph_decode_32_safe(&p, end, nr_maps, bad);
+ dout(" %d full maps\n", nr_maps);
+ while (nr_maps) {
+ ceph_decode_need(&p, end, 2*sizeof(u32), bad);
+ epoch = ceph_decode_32(&p);
+ maplen = ceph_decode_32(&p);
+ ceph_decode_need(&p, end, maplen, bad);
+ if (nr_maps > 1) {
+ dout("skipping non-latest full map %u len %d\n",
+ epoch, maplen);
+ } else if (osdc->osdmap->epoch >= epoch) {
+ dout("skipping full map %u len %d, "
+ "older than our %u\n", epoch, maplen,
+ osdc->osdmap->epoch);
+ } else {
+ dout("taking full map %u len %d\n", epoch, maplen);
+ err = handle_one_map(osdc, p, p + maplen, false,
+ &need_resend, &need_resend_linger);
+ if (err)
+ goto bad;
+ }
+ p += maplen;
+ nr_maps--;
+ }
+
+done:
+ /*
+ * subscribe to subsequent osdmap updates if full to ensure
+ * we find out when we are no longer full and stop returning
+ * ENOSPC.
+ */
+ pauserd = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD);
+ pausewr = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR) ||
+ ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) ||
+ have_pool_full(osdc);
+ if (was_pauserd || was_pausewr || pauserd || pausewr ||
+ osdc->osdmap->epoch < osdc->epoch_barrier)
+ maybe_request_map(osdc);
+
+ kick_requests(osdc, &need_resend, &need_resend_linger);
+
+ ceph_osdc_abort_on_full(osdc);
+ ceph_monc_got_map(&osdc->client->monc, CEPH_SUB_OSDMAP,
+ osdc->osdmap->epoch);
+ up_write(&osdc->lock);
+ wake_up_all(&osdc->client->auth_wq);
+ return;
+
+bad:
+ pr_err("osdc handle_map corrupt msg\n");
+ ceph_msg_dump(msg);
+ up_write(&osdc->lock);
+}
+
+/*
+ * Resubmit requests pending on the given osd.
+ */
+static void kick_osd_requests(struct ceph_osd *osd)
+{
+ struct rb_node *n;
+
+ clear_backoffs(osd);
+
+ for (n = rb_first(&osd->o_requests); n; ) {
+ struct ceph_osd_request *req =
+ rb_entry(n, struct ceph_osd_request, r_node);
+
+ n = rb_next(n); /* cancel_linger_request() */
+
+ if (!req->r_linger) {
+ if (!req->r_t.paused)
+ send_request(req);
+ } else {
+ cancel_linger_request(req);
+ }
+ }
+ for (n = rb_first(&osd->o_linger_requests); n; n = rb_next(n)) {
+ struct ceph_osd_linger_request *lreq =
+ rb_entry(n, struct ceph_osd_linger_request, node);
+
+ send_linger(lreq);
+ }
+}
+
+/*
+ * If the osd connection drops, we need to resubmit all requests.
+ */
+static void osd_fault(struct ceph_connection *con)
+{
+ struct ceph_osd *osd = con->private;
+ struct ceph_osd_client *osdc = osd->o_osdc;
+
+ dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
+
+ down_write(&osdc->lock);
+ if (!osd_registered(osd)) {
+ dout("%s osd%d unknown\n", __func__, osd->o_osd);
+ goto out_unlock;
+ }
+
+ if (!reopen_osd(osd))
+ kick_osd_requests(osd);
+ maybe_request_map(osdc);
+
+out_unlock:
+ up_write(&osdc->lock);
+}
+
+struct MOSDBackoff {
+ struct ceph_spg spgid;
+ u32 map_epoch;
+ u8 op;
+ u64 id;
+ struct ceph_hobject_id *begin;
+ struct ceph_hobject_id *end;
+};
+
+static int decode_MOSDBackoff(const struct ceph_msg *msg, struct MOSDBackoff *m)
+{
+ void *p = msg->front.iov_base;
+ void *const end = p + msg->front.iov_len;
+ u8 struct_v;
+ u32 struct_len;
+ int ret;
+
+ ret = ceph_start_decoding(&p, end, 1, "spg_t", &struct_v, &struct_len);
+ if (ret)
+ return ret;
+
+ ret = ceph_decode_pgid(&p, end, &m->spgid.pgid);
+ if (ret)
+ return ret;
+
+ ceph_decode_8_safe(&p, end, m->spgid.shard, e_inval);
+ ceph_decode_32_safe(&p, end, m->map_epoch, e_inval);
+ ceph_decode_8_safe(&p, end, m->op, e_inval);
+ ceph_decode_64_safe(&p, end, m->id, e_inval);
+
+ m->begin = kzalloc(sizeof(*m->begin), GFP_NOIO);
+ if (!m->begin)
+ return -ENOMEM;
+
+ ret = decode_hoid(&p, end, m->begin);
+ if (ret) {
+ free_hoid(m->begin);
+ return ret;
+ }
+
+ m->end = kzalloc(sizeof(*m->end), GFP_NOIO);
+ if (!m->end) {
+ free_hoid(m->begin);
+ return -ENOMEM;
+ }
+
+ ret = decode_hoid(&p, end, m->end);
+ if (ret) {
+ free_hoid(m->begin);
+ free_hoid(m->end);
+ return ret;
+ }
+
+ return 0;
+
+e_inval:
+ return -EINVAL;
+}
+
+static struct ceph_msg *create_backoff_message(
+ const struct ceph_osd_backoff *backoff,
+ u32 map_epoch)
+{
+ struct ceph_msg *msg;
+ void *p, *end;
+ int msg_size;
+
+ msg_size = CEPH_ENCODING_START_BLK_LEN +
+ CEPH_PGID_ENCODING_LEN + 1; /* spgid */
+ msg_size += 4 + 1 + 8; /* map_epoch, op, id */
+ msg_size += CEPH_ENCODING_START_BLK_LEN +
+ hoid_encoding_size(backoff->begin);
+ msg_size += CEPH_ENCODING_START_BLK_LEN +
+ hoid_encoding_size(backoff->end);
+
+ msg = ceph_msg_new(CEPH_MSG_OSD_BACKOFF, msg_size, GFP_NOIO, true);
+ if (!msg)
+ return NULL;
+
+ p = msg->front.iov_base;
+ end = p + msg->front_alloc_len;
+
+ encode_spgid(&p, &backoff->spgid);
+ ceph_encode_32(&p, map_epoch);
+ ceph_encode_8(&p, CEPH_OSD_BACKOFF_OP_ACK_BLOCK);
+ ceph_encode_64(&p, backoff->id);
+ encode_hoid(&p, end, backoff->begin);
+ encode_hoid(&p, end, backoff->end);
+ BUG_ON(p != end);
+
+ msg->front.iov_len = p - msg->front.iov_base;
+ msg->hdr.version = cpu_to_le16(1); /* MOSDBackoff v1 */
+ msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
+
+ return msg;
+}
+
+static void handle_backoff_block(struct ceph_osd *osd, struct MOSDBackoff *m)
+{
+ struct ceph_spg_mapping *spg;
+ struct ceph_osd_backoff *backoff;
+ struct ceph_msg *msg;
+
+ dout("%s osd%d spgid %llu.%xs%d id %llu\n", __func__, osd->o_osd,
+ m->spgid.pgid.pool, m->spgid.pgid.seed, m->spgid.shard, m->id);
+
+ spg = lookup_spg_mapping(&osd->o_backoff_mappings, &m->spgid);
+ if (!spg) {
+ spg = alloc_spg_mapping();
+ if (!spg) {
+ pr_err("%s failed to allocate spg\n", __func__);
+ return;
+ }
+ spg->spgid = m->spgid; /* struct */
+ insert_spg_mapping(&osd->o_backoff_mappings, spg);
+ }
+
+ backoff = alloc_backoff();
+ if (!backoff) {
+ pr_err("%s failed to allocate backoff\n", __func__);
+ return;
+ }
+ backoff->spgid = m->spgid; /* struct */
+ backoff->id = m->id;
+ backoff->begin = m->begin;
+ m->begin = NULL; /* backoff now owns this */
+ backoff->end = m->end;
+ m->end = NULL; /* ditto */
+
+ insert_backoff(&spg->backoffs, backoff);
+ insert_backoff_by_id(&osd->o_backoffs_by_id, backoff);
+
+ /*
+ * Ack with original backoff's epoch so that the OSD can
+ * discard this if there was a PG split.
+ */
+ msg = create_backoff_message(backoff, m->map_epoch);
+ if (!msg) {
+ pr_err("%s failed to allocate msg\n", __func__);
+ return;
+ }
+ ceph_con_send(&osd->o_con, msg);
+}
+
+static bool target_contained_by(const struct ceph_osd_request_target *t,
+ const struct ceph_hobject_id *begin,
+ const struct ceph_hobject_id *end)
+{
+ struct ceph_hobject_id hoid;
+ int cmp;
+
+ hoid_fill_from_target(&hoid, t);
+ cmp = hoid_compare(&hoid, begin);
+ return !cmp || (cmp > 0 && hoid_compare(&hoid, end) < 0);
+}
+
+static void handle_backoff_unblock(struct ceph_osd *osd,
+ const struct MOSDBackoff *m)
+{
+ struct ceph_spg_mapping *spg;
+ struct ceph_osd_backoff *backoff;
+ struct rb_node *n;
+
+ dout("%s osd%d spgid %llu.%xs%d id %llu\n", __func__, osd->o_osd,
+ m->spgid.pgid.pool, m->spgid.pgid.seed, m->spgid.shard, m->id);
+
+ backoff = lookup_backoff_by_id(&osd->o_backoffs_by_id, m->id);
+ if (!backoff) {
+ pr_err("%s osd%d spgid %llu.%xs%d id %llu backoff dne\n",
+ __func__, osd->o_osd, m->spgid.pgid.pool,
+ m->spgid.pgid.seed, m->spgid.shard, m->id);
+ return;
+ }
+
+ if (hoid_compare(backoff->begin, m->begin) &&
+ hoid_compare(backoff->end, m->end)) {
+ pr_err("%s osd%d spgid %llu.%xs%d id %llu bad range?\n",
+ __func__, osd->o_osd, m->spgid.pgid.pool,
+ m->spgid.pgid.seed, m->spgid.shard, m->id);
+ /* unblock it anyway... */
+ }
+
+ spg = lookup_spg_mapping(&osd->o_backoff_mappings, &backoff->spgid);
+ BUG_ON(!spg);
+
+ erase_backoff(&spg->backoffs, backoff);
+ erase_backoff_by_id(&osd->o_backoffs_by_id, backoff);
+ free_backoff(backoff);
+
+ if (RB_EMPTY_ROOT(&spg->backoffs)) {
+ erase_spg_mapping(&osd->o_backoff_mappings, spg);
+ free_spg_mapping(spg);
+ }
+
+ for (n = rb_first(&osd->o_requests); n; n = rb_next(n)) {
+ struct ceph_osd_request *req =
+ rb_entry(n, struct ceph_osd_request, r_node);
+
+ if (!ceph_spg_compare(&req->r_t.spgid, &m->spgid)) {
+ /*
+ * Match against @m, not @backoff -- the PG may
+ * have split on the OSD.
+ */
+ if (target_contained_by(&req->r_t, m->begin, m->end)) {
+ /*
+ * If no other installed backoff applies,
+ * resend.
+ */
+ send_request(req);
+ }
+ }
+ }
+}
+
+static void handle_backoff(struct ceph_osd *osd, struct ceph_msg *msg)
+{
+ struct ceph_osd_client *osdc = osd->o_osdc;
+ struct MOSDBackoff m;
+ int ret;
+
+ down_read(&osdc->lock);
+ if (!osd_registered(osd)) {
+ dout("%s osd%d unknown\n", __func__, osd->o_osd);
+ up_read(&osdc->lock);
+ return;
+ }
+ WARN_ON(osd->o_osd != le64_to_cpu(msg->hdr.src.num));
+
+ mutex_lock(&osd->lock);
+ ret = decode_MOSDBackoff(msg, &m);
+ if (ret) {
+ pr_err("failed to decode MOSDBackoff: %d\n", ret);
+ ceph_msg_dump(msg);
+ goto out_unlock;
+ }
+
+ switch (m.op) {
+ case CEPH_OSD_BACKOFF_OP_BLOCK:
+ handle_backoff_block(osd, &m);
+ break;
+ case CEPH_OSD_BACKOFF_OP_UNBLOCK:
+ handle_backoff_unblock(osd, &m);
+ break;
+ default:
+ pr_err("%s osd%d unknown op %d\n", __func__, osd->o_osd, m.op);
+ }
+
+ free_hoid(m.begin);
+ free_hoid(m.end);
+
+out_unlock:
+ mutex_unlock(&osd->lock);
+ up_read(&osdc->lock);
+}
+
+/*
+ * Process osd watch notifications
+ */
+static void handle_watch_notify(struct ceph_osd_client *osdc,
+ struct ceph_msg *msg)
+{
+ void *p = msg->front.iov_base;
+ void *const end = p + msg->front.iov_len;
+ struct ceph_osd_linger_request *lreq;
+ struct linger_work *lwork;
+ u8 proto_ver, opcode;
+ u64 cookie, notify_id;
+ u64 notifier_id = 0;
+ s32 return_code = 0;
+ void *payload = NULL;
+ u32 payload_len = 0;
+
+ ceph_decode_8_safe(&p, end, proto_ver, bad);
+ ceph_decode_8_safe(&p, end, opcode, bad);
+ ceph_decode_64_safe(&p, end, cookie, bad);
+ p += 8; /* skip ver */
+ ceph_decode_64_safe(&p, end, notify_id, bad);
+
+ if (proto_ver >= 1) {
+ ceph_decode_32_safe(&p, end, payload_len, bad);
+ ceph_decode_need(&p, end, payload_len, bad);
+ payload = p;
+ p += payload_len;
+ }
+
+ if (le16_to_cpu(msg->hdr.version) >= 2)
+ ceph_decode_32_safe(&p, end, return_code, bad);
+
+ if (le16_to_cpu(msg->hdr.version) >= 3)
+ ceph_decode_64_safe(&p, end, notifier_id, bad);
+
+ down_read(&osdc->lock);
+ lreq = lookup_linger_osdc(&osdc->linger_requests, cookie);
+ if (!lreq) {
+ dout("%s opcode %d cookie %llu dne\n", __func__, opcode,
+ cookie);
+ goto out_unlock_osdc;
+ }
+
+ mutex_lock(&lreq->lock);
+ dout("%s opcode %d cookie %llu lreq %p is_watch %d\n", __func__,
+ opcode, cookie, lreq, lreq->is_watch);
+ if (opcode == CEPH_WATCH_EVENT_DISCONNECT) {
+ if (!lreq->last_error) {
+ lreq->last_error = -ENOTCONN;
+ queue_watch_error(lreq);
+ }
+ } else if (!lreq->is_watch) {
+ /* CEPH_WATCH_EVENT_NOTIFY_COMPLETE */
+ if (lreq->notify_id && lreq->notify_id != notify_id) {
+ dout("lreq %p notify_id %llu != %llu, ignoring\n", lreq,
+ lreq->notify_id, notify_id);
+ } else if (!completion_done(&lreq->notify_finish_wait)) {
+ struct ceph_msg_data *data =
+ msg->num_data_items ? &msg->data[0] : NULL;
+
+ if (data) {
+ if (lreq->preply_pages) {
+ WARN_ON(data->type !=
+ CEPH_MSG_DATA_PAGES);
+ *lreq->preply_pages = data->pages;
+ *lreq->preply_len = data->length;
+ data->own_pages = false;
+ }
+ }
+ lreq->notify_finish_error = return_code;
+ complete_all(&lreq->notify_finish_wait);
+ }
+ } else {
+ /* CEPH_WATCH_EVENT_NOTIFY */
+ lwork = lwork_alloc(lreq, do_watch_notify);
+ if (!lwork) {
+ pr_err("failed to allocate notify-lwork\n");
+ goto out_unlock_lreq;
+ }
+
+ lwork->notify.notify_id = notify_id;
+ lwork->notify.notifier_id = notifier_id;
+ lwork->notify.payload = payload;
+ lwork->notify.payload_len = payload_len;
+ lwork->notify.msg = ceph_msg_get(msg);
+ lwork_queue(lwork);
+ }
+
+out_unlock_lreq:
+ mutex_unlock(&lreq->lock);
+out_unlock_osdc:
+ up_read(&osdc->lock);
+ return;
+
+bad:
+ pr_err("osdc handle_watch_notify corrupt msg\n");
+}
+
+/*
+ * Register request, send initial attempt.
+ */
+void ceph_osdc_start_request(struct ceph_osd_client *osdc,
+ struct ceph_osd_request *req)
+{
+ down_read(&osdc->lock);
+ submit_request(req, false);
+ up_read(&osdc->lock);
+}
+EXPORT_SYMBOL(ceph_osdc_start_request);
+
+/*
+ * Unregister request. If @req was registered, it isn't completed:
+ * r_result isn't set and __complete_request() isn't invoked.
+ *
+ * If @req wasn't registered, this call may have raced with
+ * handle_reply(), in which case r_result would already be set and
+ * __complete_request() would be getting invoked, possibly even
+ * concurrently with this call.
+ */
+void ceph_osdc_cancel_request(struct ceph_osd_request *req)
+{
+ struct ceph_osd_client *osdc = req->r_osdc;
+
+ down_write(&osdc->lock);
+ if (req->r_osd)
+ cancel_request(req);
+ up_write(&osdc->lock);
+}
+EXPORT_SYMBOL(ceph_osdc_cancel_request);
+
+/*
+ * @timeout: in jiffies, 0 means "wait forever"
+ */
+static int wait_request_timeout(struct ceph_osd_request *req,
+ unsigned long timeout)
+{
+ long left;
+
+ dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
+ left = wait_for_completion_killable_timeout(&req->r_completion,
+ ceph_timeout_jiffies(timeout));
+ if (left <= 0) {
+ left = left ?: -ETIMEDOUT;
+ ceph_osdc_cancel_request(req);
+ } else {
+ left = req->r_result; /* completed */
+ }
+
+ return left;
+}
+
+/*
+ * wait for a request to complete
+ */
+int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
+ struct ceph_osd_request *req)
+{
+ return wait_request_timeout(req, 0);
+}
+EXPORT_SYMBOL(ceph_osdc_wait_request);
+
+/*
+ * sync - wait for all in-flight requests to flush. avoid starvation.
+ */
+void ceph_osdc_sync(struct ceph_osd_client *osdc)
+{
+ struct rb_node *n, *p;
+ u64 last_tid = atomic64_read(&osdc->last_tid);
+
+again:
+ down_read(&osdc->lock);
+ for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
+ struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
+
+ mutex_lock(&osd->lock);
+ for (p = rb_first(&osd->o_requests); p; p = rb_next(p)) {
+ struct ceph_osd_request *req =
+ rb_entry(p, struct ceph_osd_request, r_node);
+
+ if (req->r_tid > last_tid)
+ break;
+
+ if (!(req->r_flags & CEPH_OSD_FLAG_WRITE))
+ continue;
+
+ ceph_osdc_get_request(req);
+ mutex_unlock(&osd->lock);
+ up_read(&osdc->lock);
+ dout("%s waiting on req %p tid %llu last_tid %llu\n",
+ __func__, req, req->r_tid, last_tid);
+ wait_for_completion(&req->r_completion);
+ ceph_osdc_put_request(req);
+ goto again;
+ }
+
+ mutex_unlock(&osd->lock);
+ }
+
+ up_read(&osdc->lock);
+ dout("%s done last_tid %llu\n", __func__, last_tid);
+}
+EXPORT_SYMBOL(ceph_osdc_sync);
+
+/*
+ * Returns a handle, caller owns a ref.
+ */
+struct ceph_osd_linger_request *
+ceph_osdc_watch(struct ceph_osd_client *osdc,
+ struct ceph_object_id *oid,
+ struct ceph_object_locator *oloc,
+ rados_watchcb2_t wcb,
+ rados_watcherrcb_t errcb,
+ void *data)
+{
+ struct ceph_osd_linger_request *lreq;
+ int ret;
+
+ lreq = linger_alloc(osdc);
+ if (!lreq)
+ return ERR_PTR(-ENOMEM);
+
+ lreq->is_watch = true;
+ lreq->wcb = wcb;
+ lreq->errcb = errcb;
+ lreq->data = data;
+ lreq->watch_valid_thru = jiffies;
+
+ ceph_oid_copy(&lreq->t.base_oid, oid);
+ ceph_oloc_copy(&lreq->t.base_oloc, oloc);
+ lreq->t.flags = CEPH_OSD_FLAG_WRITE;
+ ktime_get_real_ts64(&lreq->mtime);
+
+ linger_submit(lreq);
+ ret = linger_reg_commit_wait(lreq);
+ if (ret) {
+ linger_cancel(lreq);
+ goto err_put_lreq;
+ }
+
+ return lreq;
+
+err_put_lreq:
+ linger_put(lreq);
+ return ERR_PTR(ret);
+}
+EXPORT_SYMBOL(ceph_osdc_watch);
+
+/*
+ * Releases a ref.
+ *
+ * Times out after mount_timeout to preserve rbd unmap behaviour
+ * introduced in 2894e1d76974 ("rbd: timeout watch teardown on unmap
+ * with mount_timeout").
+ */
+int ceph_osdc_unwatch(struct ceph_osd_client *osdc,
+ struct ceph_osd_linger_request *lreq)
+{
+ struct ceph_options *opts = osdc->client->options;
+ struct ceph_osd_request *req;
+ int ret;
+
+ req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
+ if (!req)
+ return -ENOMEM;
+
+ ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
+ ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
+ req->r_flags = CEPH_OSD_FLAG_WRITE;
+ ktime_get_real_ts64(&req->r_mtime);
+ osd_req_op_watch_init(req, 0, CEPH_OSD_WATCH_OP_UNWATCH,
+ lreq->linger_id, 0);
+
+ ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
+ if (ret)
+ goto out_put_req;
+
+ ceph_osdc_start_request(osdc, req);
+ linger_cancel(lreq);
+ linger_put(lreq);
+ ret = wait_request_timeout(req, opts->mount_timeout);
+
+out_put_req:
+ ceph_osdc_put_request(req);
+ return ret;
+}
+EXPORT_SYMBOL(ceph_osdc_unwatch);
+
+static int osd_req_op_notify_ack_init(struct ceph_osd_request *req, int which,
+ u64 notify_id, u64 cookie, void *payload,
+ u32 payload_len)
+{
+ struct ceph_osd_req_op *op;
+ struct ceph_pagelist *pl;
+ int ret;
+
+ op = osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY_ACK, 0);
+
+ pl = ceph_pagelist_alloc(GFP_NOIO);
+ if (!pl)
+ return -ENOMEM;
+
+ ret = ceph_pagelist_encode_64(pl, notify_id);
+ ret |= ceph_pagelist_encode_64(pl, cookie);
+ if (payload) {
+ ret |= ceph_pagelist_encode_32(pl, payload_len);
+ ret |= ceph_pagelist_append(pl, payload, payload_len);
+ } else {
+ ret |= ceph_pagelist_encode_32(pl, 0);
+ }
+ if (ret) {
+ ceph_pagelist_release(pl);
+ return -ENOMEM;
+ }
+
+ ceph_osd_data_pagelist_init(&op->notify_ack.request_data, pl);
+ op->indata_len = pl->length;
+ return 0;
+}
+
+int ceph_osdc_notify_ack(struct ceph_osd_client *osdc,
+ struct ceph_object_id *oid,
+ struct ceph_object_locator *oloc,
+ u64 notify_id,
+ u64 cookie,
+ void *payload,
+ u32 payload_len)
+{
+ struct ceph_osd_request *req;
+ int ret;
+
+ req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
+ if (!req)
+ return -ENOMEM;
+
+ ceph_oid_copy(&req->r_base_oid, oid);
+ ceph_oloc_copy(&req->r_base_oloc, oloc);
+ req->r_flags = CEPH_OSD_FLAG_READ;
+
+ ret = osd_req_op_notify_ack_init(req, 0, notify_id, cookie, payload,
+ payload_len);
+ if (ret)
+ goto out_put_req;
+
+ ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
+ if (ret)
+ goto out_put_req;
+
+ ceph_osdc_start_request(osdc, req);
+ ret = ceph_osdc_wait_request(osdc, req);
+
+out_put_req:
+ ceph_osdc_put_request(req);
+ return ret;
+}
+EXPORT_SYMBOL(ceph_osdc_notify_ack);
+
+/*
+ * @timeout: in seconds
+ *
+ * @preply_{pages,len} are initialized both on success and error.
+ * The caller is responsible for:
+ *
+ * ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len))
+ */
+int ceph_osdc_notify(struct ceph_osd_client *osdc,
+ struct ceph_object_id *oid,
+ struct ceph_object_locator *oloc,
+ void *payload,
+ u32 payload_len,
+ u32 timeout,
+ struct page ***preply_pages,
+ size_t *preply_len)
+{
+ struct ceph_osd_linger_request *lreq;
+ int ret;
+
+ WARN_ON(!timeout);
+ if (preply_pages) {
+ *preply_pages = NULL;
+ *preply_len = 0;
+ }
+
+ lreq = linger_alloc(osdc);
+ if (!lreq)
+ return -ENOMEM;
+
+ lreq->request_pl = ceph_pagelist_alloc(GFP_NOIO);
+ if (!lreq->request_pl) {
+ ret = -ENOMEM;
+ goto out_put_lreq;
+ }
+
+ ret = ceph_pagelist_encode_32(lreq->request_pl, 1); /* prot_ver */
+ ret |= ceph_pagelist_encode_32(lreq->request_pl, timeout);
+ ret |= ceph_pagelist_encode_32(lreq->request_pl, payload_len);
+ ret |= ceph_pagelist_append(lreq->request_pl, payload, payload_len);
+ if (ret) {
+ ret = -ENOMEM;
+ goto out_put_lreq;
+ }
+
+ /* for notify_id */
+ lreq->notify_id_pages = ceph_alloc_page_vector(1, GFP_NOIO);
+ if (IS_ERR(lreq->notify_id_pages)) {
+ ret = PTR_ERR(lreq->notify_id_pages);
+ lreq->notify_id_pages = NULL;
+ goto out_put_lreq;
+ }
+
+ lreq->preply_pages = preply_pages;
+ lreq->preply_len = preply_len;
+
+ ceph_oid_copy(&lreq->t.base_oid, oid);
+ ceph_oloc_copy(&lreq->t.base_oloc, oloc);
+ lreq->t.flags = CEPH_OSD_FLAG_READ;
+
+ linger_submit(lreq);
+ ret = linger_reg_commit_wait(lreq);
+ if (!ret)
+ ret = linger_notify_finish_wait(lreq,
+ msecs_to_jiffies(2 * timeout * MSEC_PER_SEC));
+ else
+ dout("lreq %p failed to initiate notify %d\n", lreq, ret);
+
+ linger_cancel(lreq);
+out_put_lreq:
+ linger_put(lreq);
+ return ret;
+}
+EXPORT_SYMBOL(ceph_osdc_notify);
+
+static int decode_watcher(void **p, void *end, struct ceph_watch_item *item)
+{
+ u8 struct_v;
+ u32 struct_len;
+ int ret;
+
+ ret = ceph_start_decoding(p, end, 2, "watch_item_t",
+ &struct_v, &struct_len);
+ if (ret)
+ goto bad;
+
+ ret = -EINVAL;
+ ceph_decode_copy_safe(p, end, &item->name, sizeof(item->name), bad);
+ ceph_decode_64_safe(p, end, item->cookie, bad);
+ ceph_decode_skip_32(p, end, bad); /* skip timeout seconds */
+
+ if (struct_v >= 2) {
+ ret = ceph_decode_entity_addr(p, end, &item->addr);
+ if (ret)
+ goto bad;
+ } else {
+ ret = 0;
+ }
+
+ dout("%s %s%llu cookie %llu addr %s\n", __func__,
+ ENTITY_NAME(item->name), item->cookie,
+ ceph_pr_addr(&item->addr));
+bad:
+ return ret;
+}
+
+static int decode_watchers(void **p, void *end,
+ struct ceph_watch_item **watchers,
+ u32 *num_watchers)
+{
+ u8 struct_v;
+ u32 struct_len;
+ int i;
+ int ret;
+
+ ret = ceph_start_decoding(p, end, 1, "obj_list_watch_response_t",
+ &struct_v, &struct_len);
+ if (ret)
+ return ret;
+
+ *num_watchers = ceph_decode_32(p);
+ *watchers = kcalloc(*num_watchers, sizeof(**watchers), GFP_NOIO);
+ if (!*watchers)
+ return -ENOMEM;
+
+ for (i = 0; i < *num_watchers; i++) {
+ ret = decode_watcher(p, end, *watchers + i);
+ if (ret) {
+ kfree(*watchers);
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * On success, the caller is responsible for:
+ *
+ * kfree(watchers);
+ */
+int ceph_osdc_list_watchers(struct ceph_osd_client *osdc,
+ struct ceph_object_id *oid,
+ struct ceph_object_locator *oloc,
+ struct ceph_watch_item **watchers,
+ u32 *num_watchers)
+{
+ struct ceph_osd_request *req;
+ struct page **pages;
+ int ret;
+
+ req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
+ if (!req)
+ return -ENOMEM;
+
+ ceph_oid_copy(&req->r_base_oid, oid);
+ ceph_oloc_copy(&req->r_base_oloc, oloc);
+ req->r_flags = CEPH_OSD_FLAG_READ;
+
+ pages = ceph_alloc_page_vector(1, GFP_NOIO);
+ if (IS_ERR(pages)) {
+ ret = PTR_ERR(pages);
+ goto out_put_req;
+ }
+
+ osd_req_op_init(req, 0, CEPH_OSD_OP_LIST_WATCHERS, 0);
+ ceph_osd_data_pages_init(osd_req_op_data(req, 0, list_watchers,
+ response_data),
+ pages, PAGE_SIZE, 0, false, true);
+
+ ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
+ if (ret)
+ goto out_put_req;
+
+ ceph_osdc_start_request(osdc, req);
+ ret = ceph_osdc_wait_request(osdc, req);
+ if (ret >= 0) {
+ void *p = page_address(pages[0]);
+ void *const end = p + req->r_ops[0].outdata_len;
+
+ ret = decode_watchers(&p, end, watchers, num_watchers);
+ }
+
+out_put_req:
+ ceph_osdc_put_request(req);
+ return ret;
+}
+EXPORT_SYMBOL(ceph_osdc_list_watchers);
+
+/*
+ * Call all pending notify callbacks - for use after a watch is
+ * unregistered, to make sure no more callbacks for it will be invoked
+ */
+void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc)
+{
+ dout("%s osdc %p\n", __func__, osdc);
+ flush_workqueue(osdc->notify_wq);
+}
+EXPORT_SYMBOL(ceph_osdc_flush_notifies);
+
+void ceph_osdc_maybe_request_map(struct ceph_osd_client *osdc)
+{
+ down_read(&osdc->lock);
+ maybe_request_map(osdc);
+ up_read(&osdc->lock);
+}
+EXPORT_SYMBOL(ceph_osdc_maybe_request_map);
+
+/*
+ * Execute an OSD class method on an object.
+ *
+ * @flags: CEPH_OSD_FLAG_*
+ * @resp_len: in/out param for reply length
+ */
+int ceph_osdc_call(struct ceph_osd_client *osdc,
+ struct ceph_object_id *oid,
+ struct ceph_object_locator *oloc,
+ const char *class, const char *method,
+ unsigned int flags,
+ struct page *req_page, size_t req_len,
+ struct page **resp_pages, size_t *resp_len)
+{
+ struct ceph_osd_request *req;
+ int ret;
+
+ if (req_len > PAGE_SIZE)
+ return -E2BIG;
+
+ req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
+ if (!req)
+ return -ENOMEM;
+
+ ceph_oid_copy(&req->r_base_oid, oid);
+ ceph_oloc_copy(&req->r_base_oloc, oloc);
+ req->r_flags = flags;
+
+ ret = osd_req_op_cls_init(req, 0, class, method);
+ if (ret)
+ goto out_put_req;
+
+ if (req_page)
+ osd_req_op_cls_request_data_pages(req, 0, &req_page, req_len,
+ 0, false, false);
+ if (resp_pages)
+ osd_req_op_cls_response_data_pages(req, 0, resp_pages,
+ *resp_len, 0, false, false);
+
+ ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
+ if (ret)
+ goto out_put_req;
+
+ ceph_osdc_start_request(osdc, req);
+ ret = ceph_osdc_wait_request(osdc, req);
+ if (ret >= 0) {
+ ret = req->r_ops[0].rval;
+ if (resp_pages)
+ *resp_len = req->r_ops[0].outdata_len;
+ }
+
+out_put_req:
+ ceph_osdc_put_request(req);
+ return ret;
+}
+EXPORT_SYMBOL(ceph_osdc_call);
+
+/*
+ * reset all osd connections
+ */
+void ceph_osdc_reopen_osds(struct ceph_osd_client *osdc)
+{
+ struct rb_node *n;
+
+ down_write(&osdc->lock);
+ for (n = rb_first(&osdc->osds); n; ) {
+ struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
+
+ n = rb_next(n);
+ if (!reopen_osd(osd))
+ kick_osd_requests(osd);
+ }
+ up_write(&osdc->lock);
+}
+
+/*
+ * init, shutdown
+ */
+int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
+{
+ int err;
+
+ dout("init\n");
+ osdc->client = client;
+ init_rwsem(&osdc->lock);
+ osdc->osds = RB_ROOT;
+ INIT_LIST_HEAD(&osdc->osd_lru);
+ spin_lock_init(&osdc->osd_lru_lock);
+ osd_init(&osdc->homeless_osd);
+ osdc->homeless_osd.o_osdc = osdc;
+ osdc->homeless_osd.o_osd = CEPH_HOMELESS_OSD;
+ osdc->last_linger_id = CEPH_LINGER_ID_START;
+ osdc->linger_requests = RB_ROOT;
+ osdc->map_checks = RB_ROOT;
+ osdc->linger_map_checks = RB_ROOT;
+ INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
+ INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
+
+ err = -ENOMEM;
+ osdc->osdmap = ceph_osdmap_alloc();
+ if (!osdc->osdmap)
+ goto out;
+
+ osdc->req_mempool = mempool_create_slab_pool(10,
+ ceph_osd_request_cache);
+ if (!osdc->req_mempool)
+ goto out_map;
+
+ err = ceph_msgpool_init(&osdc->msgpool_op, CEPH_MSG_OSD_OP,
+ PAGE_SIZE, CEPH_OSD_SLAB_OPS, 10, "osd_op");
+ if (err < 0)
+ goto out_mempool;
+ err = ceph_msgpool_init(&osdc->msgpool_op_reply, CEPH_MSG_OSD_OPREPLY,
+ PAGE_SIZE, CEPH_OSD_SLAB_OPS, 10,
+ "osd_op_reply");
+ if (err < 0)
+ goto out_msgpool;
+
+ err = -ENOMEM;
+ osdc->notify_wq = create_singlethread_workqueue("ceph-watch-notify");
+ if (!osdc->notify_wq)
+ goto out_msgpool_reply;
+
+ osdc->completion_wq = create_singlethread_workqueue("ceph-completion");
+ if (!osdc->completion_wq)
+ goto out_notify_wq;
+
+ schedule_delayed_work(&osdc->timeout_work,
+ osdc->client->options->osd_keepalive_timeout);
+ schedule_delayed_work(&osdc->osds_timeout_work,
+ round_jiffies_relative(osdc->client->options->osd_idle_ttl));
+
+ return 0;
+
+out_notify_wq:
+ destroy_workqueue(osdc->notify_wq);
+out_msgpool_reply:
+ ceph_msgpool_destroy(&osdc->msgpool_op_reply);
+out_msgpool:
+ ceph_msgpool_destroy(&osdc->msgpool_op);
+out_mempool:
+ mempool_destroy(osdc->req_mempool);
+out_map:
+ ceph_osdmap_destroy(osdc->osdmap);
+out:
+ return err;
+}
+
+void ceph_osdc_stop(struct ceph_osd_client *osdc)
+{
+ destroy_workqueue(osdc->completion_wq);
+ destroy_workqueue(osdc->notify_wq);
+ cancel_delayed_work_sync(&osdc->timeout_work);
+ cancel_delayed_work_sync(&osdc->osds_timeout_work);
+
+ down_write(&osdc->lock);
+ while (!RB_EMPTY_ROOT(&osdc->osds)) {
+ struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds),
+ struct ceph_osd, o_node);
+ close_osd(osd);
+ }
+ up_write(&osdc->lock);
+ WARN_ON(refcount_read(&osdc->homeless_osd.o_ref) != 1);
+ osd_cleanup(&osdc->homeless_osd);
+
+ WARN_ON(!list_empty(&osdc->osd_lru));
+ WARN_ON(!RB_EMPTY_ROOT(&osdc->linger_requests));
+ WARN_ON(!RB_EMPTY_ROOT(&osdc->map_checks));
+ WARN_ON(!RB_EMPTY_ROOT(&osdc->linger_map_checks));
+ WARN_ON(atomic_read(&osdc->num_requests));
+ WARN_ON(atomic_read(&osdc->num_homeless));
+
+ ceph_osdmap_destroy(osdc->osdmap);
+ mempool_destroy(osdc->req_mempool);
+ ceph_msgpool_destroy(&osdc->msgpool_op);
+ ceph_msgpool_destroy(&osdc->msgpool_op_reply);
+}
+
+int osd_req_op_copy_from_init(struct ceph_osd_request *req,
+ u64 src_snapid, u64 src_version,
+ struct ceph_object_id *src_oid,
+ struct ceph_object_locator *src_oloc,
+ u32 src_fadvise_flags,
+ u32 dst_fadvise_flags,
+ u32 truncate_seq, u64 truncate_size,
+ u8 copy_from_flags)
+{
+ struct ceph_osd_req_op *op;
+ struct page **pages;
+ void *p, *end;
+
+ pages = ceph_alloc_page_vector(1, GFP_KERNEL);
+ if (IS_ERR(pages))
+ return PTR_ERR(pages);
+
+ op = osd_req_op_init(req, 0, CEPH_OSD_OP_COPY_FROM2,
+ dst_fadvise_flags);
+ op->copy_from.snapid = src_snapid;
+ op->copy_from.src_version = src_version;
+ op->copy_from.flags = copy_from_flags;
+ op->copy_from.src_fadvise_flags = src_fadvise_flags;
+
+ p = page_address(pages[0]);
+ end = p + PAGE_SIZE;
+ ceph_encode_string(&p, end, src_oid->name, src_oid->name_len);
+ encode_oloc(&p, end, src_oloc);
+ ceph_encode_32(&p, truncate_seq);
+ ceph_encode_64(&p, truncate_size);
+ op->indata_len = PAGE_SIZE - (end - p);
+
+ ceph_osd_data_pages_init(&op->copy_from.osd_data, pages,
+ op->indata_len, 0, false, true);
+ return 0;
+}
+EXPORT_SYMBOL(osd_req_op_copy_from_init);
+
+int __init ceph_osdc_setup(void)
+{
+ size_t size = sizeof(struct ceph_osd_request) +
+ CEPH_OSD_SLAB_OPS * sizeof(struct ceph_osd_req_op);
+
+ BUG_ON(ceph_osd_request_cache);
+ ceph_osd_request_cache = kmem_cache_create("ceph_osd_request", size,
+ 0, 0, NULL);
+
+ return ceph_osd_request_cache ? 0 : -ENOMEM;
+}
+
+void ceph_osdc_cleanup(void)
+{
+ BUG_ON(!ceph_osd_request_cache);
+ kmem_cache_destroy(ceph_osd_request_cache);
+ ceph_osd_request_cache = NULL;
+}
+
+/*
+ * handle incoming message
+ */
+static void osd_dispatch(struct ceph_connection *con, struct ceph_msg *msg)
+{
+ struct ceph_osd *osd = con->private;
+ struct ceph_osd_client *osdc = osd->o_osdc;
+ int type = le16_to_cpu(msg->hdr.type);
+
+ switch (type) {
+ case CEPH_MSG_OSD_MAP:
+ ceph_osdc_handle_map(osdc, msg);
+ break;
+ case CEPH_MSG_OSD_OPREPLY:
+ handle_reply(osd, msg);
+ break;
+ case CEPH_MSG_OSD_BACKOFF:
+ handle_backoff(osd, msg);
+ break;
+ case CEPH_MSG_WATCH_NOTIFY:
+ handle_watch_notify(osdc, msg);
+ break;
+
+ default:
+ pr_err("received unknown message type %d %s\n", type,
+ ceph_msg_type_name(type));
+ }
+
+ ceph_msg_put(msg);
+}
+
+/* How much sparse data was requested? */
+static u64 sparse_data_requested(struct ceph_osd_request *req)
+{
+ u64 len = 0;
+
+ if (req->r_flags & CEPH_OSD_FLAG_READ) {
+ int i;
+
+ for (i = 0; i < req->r_num_ops; ++i) {
+ struct ceph_osd_req_op *op = &req->r_ops[i];
+
+ if (op->op == CEPH_OSD_OP_SPARSE_READ)
+ len += op->extent.length;
+ }
+ }
+ return len;
+}
+
+/*
+ * Lookup and return message for incoming reply. Don't try to do
+ * anything about a larger than preallocated data portion of the
+ * message at the moment - for now, just skip the message.
+ */
+static struct ceph_msg *get_reply(struct ceph_connection *con,
+ struct ceph_msg_header *hdr,
+ int *skip)
+{
+ struct ceph_osd *osd = con->private;
+ struct ceph_osd_client *osdc = osd->o_osdc;
+ struct ceph_msg *m = NULL;
+ struct ceph_osd_request *req;
+ int front_len = le32_to_cpu(hdr->front_len);
+ int data_len = le32_to_cpu(hdr->data_len);
+ u64 tid = le64_to_cpu(hdr->tid);
+ u64 srlen;
+
+ down_read(&osdc->lock);
+ if (!osd_registered(osd)) {
+ dout("%s osd%d unknown, skipping\n", __func__, osd->o_osd);
+ *skip = 1;
+ goto out_unlock_osdc;
+ }
+ WARN_ON(osd->o_osd != le64_to_cpu(hdr->src.num));
+
+ mutex_lock(&osd->lock);
+ req = lookup_request(&osd->o_requests, tid);
+ if (!req) {
+ dout("%s osd%d tid %llu unknown, skipping\n", __func__,
+ osd->o_osd, tid);
+ *skip = 1;
+ goto out_unlock_session;
+ }
+
+ ceph_msg_revoke_incoming(req->r_reply);
+
+ if (front_len > req->r_reply->front_alloc_len) {
+ pr_warn("%s osd%d tid %llu front %d > preallocated %d\n",
+ __func__, osd->o_osd, req->r_tid, front_len,
+ req->r_reply->front_alloc_len);
+ m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front_len, GFP_NOFS,
+ false);
+ if (!m)
+ goto out_unlock_session;
+ ceph_msg_put(req->r_reply);
+ req->r_reply = m;
+ }
+
+ srlen = sparse_data_requested(req);
+ if (!srlen && data_len > req->r_reply->data_length) {
+ pr_warn("%s osd%d tid %llu data %d > preallocated %zu, skipping\n",
+ __func__, osd->o_osd, req->r_tid, data_len,
+ req->r_reply->data_length);
+ m = NULL;
+ *skip = 1;
+ goto out_unlock_session;
+ }
+
+ m = ceph_msg_get(req->r_reply);
+ m->sparse_read_total = srlen;
+
+ dout("get_reply tid %lld %p\n", tid, m);
+
+out_unlock_session:
+ mutex_unlock(&osd->lock);
+out_unlock_osdc:
+ up_read(&osdc->lock);
+ return m;
+}
+
+static struct ceph_msg *alloc_msg_with_page_vector(struct ceph_msg_header *hdr)
+{
+ struct ceph_msg *m;
+ int type = le16_to_cpu(hdr->type);
+ u32 front_len = le32_to_cpu(hdr->front_len);
+ u32 data_len = le32_to_cpu(hdr->data_len);
+
+ m = ceph_msg_new2(type, front_len, 1, GFP_NOIO, false);
+ if (!m)
+ return NULL;
+
+ if (data_len) {
+ struct page **pages;
+
+ pages = ceph_alloc_page_vector(calc_pages_for(0, data_len),
+ GFP_NOIO);
+ if (IS_ERR(pages)) {
+ ceph_msg_put(m);
+ return NULL;
+ }
+
+ ceph_msg_data_add_pages(m, pages, data_len, 0, true);
+ }
+
+ return m;
+}
+
+static struct ceph_msg *osd_alloc_msg(struct ceph_connection *con,
+ struct ceph_msg_header *hdr,
+ int *skip)
+{
+ struct ceph_osd *osd = con->private;
+ int type = le16_to_cpu(hdr->type);
+
+ *skip = 0;
+ switch (type) {
+ case CEPH_MSG_OSD_MAP:
+ case CEPH_MSG_OSD_BACKOFF:
+ case CEPH_MSG_WATCH_NOTIFY:
+ return alloc_msg_with_page_vector(hdr);
+ case CEPH_MSG_OSD_OPREPLY:
+ return get_reply(con, hdr, skip);
+ default:
+ pr_warn("%s osd%d unknown msg type %d, skipping\n", __func__,
+ osd->o_osd, type);
+ *skip = 1;
+ return NULL;
+ }
+}
+
+/*
+ * Wrappers to refcount containing ceph_osd struct
+ */
+static struct ceph_connection *osd_get_con(struct ceph_connection *con)
+{
+ struct ceph_osd *osd = con->private;
+ if (get_osd(osd))
+ return con;
+ return NULL;
+}
+
+static void osd_put_con(struct ceph_connection *con)
+{
+ struct ceph_osd *osd = con->private;
+ put_osd(osd);
+}
+
+/*
+ * authentication
+ */
+
+/*
+ * Note: returned pointer is the address of a structure that's
+ * managed separately. Caller must *not* attempt to free it.
+ */
+static struct ceph_auth_handshake *
+osd_get_authorizer(struct ceph_connection *con, int *proto, int force_new)
+{
+ struct ceph_osd *o = con->private;
+ struct ceph_osd_client *osdc = o->o_osdc;
+ struct ceph_auth_client *ac = osdc->client->monc.auth;
+ struct ceph_auth_handshake *auth = &o->o_auth;
+ int ret;
+
+ ret = __ceph_auth_get_authorizer(ac, auth, CEPH_ENTITY_TYPE_OSD,
+ force_new, proto, NULL, NULL);
+ if (ret)
+ return ERR_PTR(ret);
+
+ return auth;
+}
+
+static int osd_add_authorizer_challenge(struct ceph_connection *con,
+ void *challenge_buf, int challenge_buf_len)
+{
+ struct ceph_osd *o = con->private;
+ struct ceph_osd_client *osdc = o->o_osdc;
+ struct ceph_auth_client *ac = osdc->client->monc.auth;
+
+ return ceph_auth_add_authorizer_challenge(ac, o->o_auth.authorizer,
+ challenge_buf, challenge_buf_len);
+}
+
+static int osd_verify_authorizer_reply(struct ceph_connection *con)
+{
+ struct ceph_osd *o = con->private;
+ struct ceph_osd_client *osdc = o->o_osdc;
+ struct ceph_auth_client *ac = osdc->client->monc.auth;
+ struct ceph_auth_handshake *auth = &o->o_auth;
+
+ return ceph_auth_verify_authorizer_reply(ac, auth->authorizer,
+ auth->authorizer_reply_buf, auth->authorizer_reply_buf_len,
+ NULL, NULL, NULL, NULL);
+}
+
+static int osd_invalidate_authorizer(struct ceph_connection *con)
+{
+ struct ceph_osd *o = con->private;
+ struct ceph_osd_client *osdc = o->o_osdc;
+ struct ceph_auth_client *ac = osdc->client->monc.auth;
+
+ ceph_auth_invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD);
+ return ceph_monc_validate_auth(&osdc->client->monc);
+}
+
+static int osd_get_auth_request(struct ceph_connection *con,
+ void *buf, int *buf_len,
+ void **authorizer, int *authorizer_len)
+{
+ struct ceph_osd *o = con->private;
+ struct ceph_auth_client *ac = o->o_osdc->client->monc.auth;
+ struct ceph_auth_handshake *auth = &o->o_auth;
+ int ret;
+
+ ret = ceph_auth_get_authorizer(ac, auth, CEPH_ENTITY_TYPE_OSD,
+ buf, buf_len);
+ if (ret)
+ return ret;
+
+ *authorizer = auth->authorizer_buf;
+ *authorizer_len = auth->authorizer_buf_len;
+ return 0;
+}
+
+static int osd_handle_auth_reply_more(struct ceph_connection *con,
+ void *reply, int reply_len,
+ void *buf, int *buf_len,
+ void **authorizer, int *authorizer_len)
+{
+ struct ceph_osd *o = con->private;
+ struct ceph_auth_client *ac = o->o_osdc->client->monc.auth;
+ struct ceph_auth_handshake *auth = &o->o_auth;
+ int ret;
+
+ ret = ceph_auth_handle_svc_reply_more(ac, auth, reply, reply_len,
+ buf, buf_len);
+ if (ret)
+ return ret;
+
+ *authorizer = auth->authorizer_buf;
+ *authorizer_len = auth->authorizer_buf_len;
+ return 0;
+}
+
+static int osd_handle_auth_done(struct ceph_connection *con,
+ u64 global_id, void *reply, int reply_len,
+ u8 *session_key, int *session_key_len,
+ u8 *con_secret, int *con_secret_len)
+{
+ struct ceph_osd *o = con->private;
+ struct ceph_auth_client *ac = o->o_osdc->client->monc.auth;
+ struct ceph_auth_handshake *auth = &o->o_auth;
+
+ return ceph_auth_handle_svc_reply_done(ac, auth, reply, reply_len,
+ session_key, session_key_len,
+ con_secret, con_secret_len);
+}
+
+static int osd_handle_auth_bad_method(struct ceph_connection *con,
+ int used_proto, int result,
+ const int *allowed_protos, int proto_cnt,
+ const int *allowed_modes, int mode_cnt)
+{
+ struct ceph_osd *o = con->private;
+ struct ceph_mon_client *monc = &o->o_osdc->client->monc;
+ int ret;
+
+ if (ceph_auth_handle_bad_authorizer(monc->auth, CEPH_ENTITY_TYPE_OSD,
+ used_proto, result,
+ allowed_protos, proto_cnt,
+ allowed_modes, mode_cnt)) {
+ ret = ceph_monc_validate_auth(monc);
+ if (ret)
+ return ret;
+ }
+
+ return -EACCES;
+}
+
+static void osd_reencode_message(struct ceph_msg *msg)
+{
+ int type = le16_to_cpu(msg->hdr.type);
+
+ if (type == CEPH_MSG_OSD_OP)
+ encode_request_finish(msg);
+}
+
+static int osd_sign_message(struct ceph_msg *msg)
+{
+ struct ceph_osd *o = msg->con->private;
+ struct ceph_auth_handshake *auth = &o->o_auth;
+
+ return ceph_auth_sign_message(auth, msg);
+}
+
+static int osd_check_message_signature(struct ceph_msg *msg)
+{
+ struct ceph_osd *o = msg->con->private;
+ struct ceph_auth_handshake *auth = &o->o_auth;
+
+ return ceph_auth_check_message_signature(auth, msg);
+}
+
+static void advance_cursor(struct ceph_msg_data_cursor *cursor, size_t len,
+ bool zero)
+{
+ while (len) {
+ struct page *page;
+ size_t poff, plen;
+
+ page = ceph_msg_data_next(cursor, &poff, &plen);
+ if (plen > len)
+ plen = len;
+ if (zero)
+ zero_user_segment(page, poff, poff + plen);
+ len -= plen;
+ ceph_msg_data_advance(cursor, plen);
+ }
+}
+
+static int prep_next_sparse_read(struct ceph_connection *con,
+ struct ceph_msg_data_cursor *cursor)
+{
+ struct ceph_osd *o = con->private;
+ struct ceph_sparse_read *sr = &o->o_sparse_read;
+ struct ceph_osd_request *req;
+ struct ceph_osd_req_op *op;
+
+ spin_lock(&o->o_requests_lock);
+ req = lookup_request(&o->o_requests, le64_to_cpu(con->in_msg->hdr.tid));
+ if (!req) {
+ spin_unlock(&o->o_requests_lock);
+ return -EBADR;
+ }
+
+ if (o->o_sparse_op_idx < 0) {
+ dout("%s: [%d] starting new sparse read req\n",
+ __func__, o->o_osd);
+ } else {
+ u64 end;
+
+ op = &req->r_ops[o->o_sparse_op_idx];
+
+ WARN_ON_ONCE(op->extent.sparse_ext);
+
+ /* hand back buffer we took earlier */
+ op->extent.sparse_ext = sr->sr_extent;
+ sr->sr_extent = NULL;
+ op->extent.sparse_ext_cnt = sr->sr_count;
+ sr->sr_ext_len = 0;
+ dout("%s: [%d] completed extent array len %d cursor->resid %zd\n",
+ __func__, o->o_osd, op->extent.sparse_ext_cnt, cursor->resid);
+ /* Advance to end of data for this operation */
+ end = ceph_sparse_ext_map_end(op);
+ if (end < sr->sr_req_len)
+ advance_cursor(cursor, sr->sr_req_len - end, false);
+ }
+
+ ceph_init_sparse_read(sr);
+
+ /* find next op in this request (if any) */
+ while (++o->o_sparse_op_idx < req->r_num_ops) {
+ op = &req->r_ops[o->o_sparse_op_idx];
+ if (op->op == CEPH_OSD_OP_SPARSE_READ)
+ goto found;
+ }
+
+ /* reset for next sparse read request */
+ spin_unlock(&o->o_requests_lock);
+ o->o_sparse_op_idx = -1;
+ return 0;
+found:
+ sr->sr_req_off = op->extent.offset;
+ sr->sr_req_len = op->extent.length;
+ sr->sr_pos = sr->sr_req_off;
+ dout("%s: [%d] new sparse read op at idx %d 0x%llx~0x%llx\n", __func__,
+ o->o_osd, o->o_sparse_op_idx, sr->sr_req_off, sr->sr_req_len);
+
+ /* hand off request's sparse extent map buffer */
+ sr->sr_ext_len = op->extent.sparse_ext_cnt;
+ op->extent.sparse_ext_cnt = 0;
+ sr->sr_extent = op->extent.sparse_ext;
+ op->extent.sparse_ext = NULL;
+
+ spin_unlock(&o->o_requests_lock);
+ return 1;
+}
+
+#ifdef __BIG_ENDIAN
+static inline void convert_extent_map(struct ceph_sparse_read *sr)
+{
+ int i;
+
+ for (i = 0; i < sr->sr_count; i++) {
+ struct ceph_sparse_extent *ext = &sr->sr_extent[i];
+
+ ext->off = le64_to_cpu((__force __le64)ext->off);
+ ext->len = le64_to_cpu((__force __le64)ext->len);
+ }
+}
+#else
+static inline void convert_extent_map(struct ceph_sparse_read *sr)
+{
+}
+#endif
+
+static int osd_sparse_read(struct ceph_connection *con,
+ struct ceph_msg_data_cursor *cursor,
+ char **pbuf)
+{
+ struct ceph_osd *o = con->private;
+ struct ceph_sparse_read *sr = &o->o_sparse_read;
+ u32 count = sr->sr_count;
+ u64 eoff, elen, len = 0;
+ int i, ret;
+
+ switch (sr->sr_state) {
+ case CEPH_SPARSE_READ_HDR:
+next_op:
+ ret = prep_next_sparse_read(con, cursor);
+ if (ret <= 0)
+ return ret;
+
+ /* number of extents */
+ ret = sizeof(sr->sr_count);
+ *pbuf = (char *)&sr->sr_count;
+ sr->sr_state = CEPH_SPARSE_READ_EXTENTS;
+ break;
+ case CEPH_SPARSE_READ_EXTENTS:
+ /* Convert sr_count to host-endian */
+ count = le32_to_cpu((__force __le32)sr->sr_count);
+ sr->sr_count = count;
+ dout("[%d] got %u extents\n", o->o_osd, count);
+
+ if (count > 0) {
+ if (!sr->sr_extent || count > sr->sr_ext_len) {
+ /* no extent array provided, or too short */
+ kfree(sr->sr_extent);
+ sr->sr_extent = kmalloc_array(count,
+ sizeof(*sr->sr_extent),
+ GFP_NOIO);
+ if (!sr->sr_extent) {
+ pr_err("%s: failed to allocate %u extents\n",
+ __func__, count);
+ return -ENOMEM;
+ }
+ sr->sr_ext_len = count;
+ }
+ ret = count * sizeof(*sr->sr_extent);
+ *pbuf = (char *)sr->sr_extent;
+ sr->sr_state = CEPH_SPARSE_READ_DATA_LEN;
+ break;
+ }
+ /* No extents? Read data len */
+ fallthrough;
+ case CEPH_SPARSE_READ_DATA_LEN:
+ convert_extent_map(sr);
+ ret = sizeof(sr->sr_datalen);
+ *pbuf = (char *)&sr->sr_datalen;
+ sr->sr_state = CEPH_SPARSE_READ_DATA_PRE;
+ break;
+ case CEPH_SPARSE_READ_DATA_PRE:
+ /* Convert sr_datalen to host-endian */
+ sr->sr_datalen = le32_to_cpu((__force __le32)sr->sr_datalen);
+ for (i = 0; i < count; i++)
+ len += sr->sr_extent[i].len;
+ if (sr->sr_datalen != len) {
+ pr_warn_ratelimited("data len %u != extent len %llu\n",
+ sr->sr_datalen, len);
+ return -EREMOTEIO;
+ }
+ sr->sr_state = CEPH_SPARSE_READ_DATA;
+ fallthrough;
+ case CEPH_SPARSE_READ_DATA:
+ if (sr->sr_index >= count) {
+ sr->sr_state = CEPH_SPARSE_READ_HDR;
+ goto next_op;
+ }
+
+ eoff = sr->sr_extent[sr->sr_index].off;
+ elen = sr->sr_extent[sr->sr_index].len;
+
+ dout("[%d] ext %d off 0x%llx len 0x%llx\n",
+ o->o_osd, sr->sr_index, eoff, elen);
+
+ if (elen > INT_MAX) {
+ dout("Sparse read extent length too long (0x%llx)\n",
+ elen);
+ return -EREMOTEIO;
+ }
+
+ /* zero out anything from sr_pos to start of extent */
+ if (sr->sr_pos < eoff)
+ advance_cursor(cursor, eoff - sr->sr_pos, true);
+
+ /* Set position to end of extent */
+ sr->sr_pos = eoff + elen;
+
+ /* send back the new length and nullify the ptr */
+ cursor->sr_resid = elen;
+ ret = elen;
+ *pbuf = NULL;
+
+ /* Bump the array index */
+ ++sr->sr_index;
+ break;
+ }
+ return ret;
+}
+
+static const struct ceph_connection_operations osd_con_ops = {
+ .get = osd_get_con,
+ .put = osd_put_con,
+ .sparse_read = osd_sparse_read,
+ .alloc_msg = osd_alloc_msg,
+ .dispatch = osd_dispatch,
+ .fault = osd_fault,
+ .reencode_message = osd_reencode_message,
+ .get_authorizer = osd_get_authorizer,
+ .add_authorizer_challenge = osd_add_authorizer_challenge,
+ .verify_authorizer_reply = osd_verify_authorizer_reply,
+ .invalidate_authorizer = osd_invalidate_authorizer,
+ .sign_message = osd_sign_message,
+ .check_message_signature = osd_check_message_signature,
+ .get_auth_request = osd_get_auth_request,
+ .handle_auth_reply_more = osd_handle_auth_reply_more,
+ .handle_auth_done = osd_handle_auth_done,
+ .handle_auth_bad_method = osd_handle_auth_bad_method,
+};
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
new file mode 100644
index 000000000000..d245fa508e1c
--- /dev/null
+++ b/net/ceph/osdmap.c
@@ -0,0 +1,3110 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/module.h>
+#include <linux/slab.h>
+
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/osdmap.h>
+#include <linux/ceph/decode.h>
+#include <linux/crush/hash.h>
+#include <linux/crush/mapper.h>
+
+static __printf(2, 3)
+void osdmap_info(const struct ceph_osdmap *map, const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ printk(KERN_INFO "%s (%pU e%u): %pV", KBUILD_MODNAME, &map->fsid,
+ map->epoch, &vaf);
+
+ va_end(args);
+}
+
+char *ceph_osdmap_state_str(char *str, int len, u32 state)
+{
+ if (!len)
+ return str;
+
+ if ((state & CEPH_OSD_EXISTS) && (state & CEPH_OSD_UP))
+ snprintf(str, len, "exists, up");
+ else if (state & CEPH_OSD_EXISTS)
+ snprintf(str, len, "exists");
+ else if (state & CEPH_OSD_UP)
+ snprintf(str, len, "up");
+ else
+ snprintf(str, len, "doesn't exist");
+
+ return str;
+}
+
+/* maps */
+
+static int calc_bits_of(unsigned int t)
+{
+ int b = 0;
+ while (t) {
+ t = t >> 1;
+ b++;
+ }
+ return b;
+}
+
+/*
+ * the foo_mask is the smallest value 2^n-1 that is >= foo.
+ */
+static void calc_pg_masks(struct ceph_pg_pool_info *pi)
+{
+ pi->pg_num_mask = (1 << calc_bits_of(pi->pg_num-1)) - 1;
+ pi->pgp_num_mask = (1 << calc_bits_of(pi->pgp_num-1)) - 1;
+}
+
+/*
+ * decode crush map
+ */
+static int crush_decode_uniform_bucket(void **p, void *end,
+ struct crush_bucket_uniform *b)
+{
+ dout("crush_decode_uniform_bucket %p to %p\n", *p, end);
+ ceph_decode_need(p, end, (1+b->h.size) * sizeof(u32), bad);
+ b->item_weight = ceph_decode_32(p);
+ return 0;
+bad:
+ return -EINVAL;
+}
+
+static int crush_decode_list_bucket(void **p, void *end,
+ struct crush_bucket_list *b)
+{
+ int j;
+ dout("crush_decode_list_bucket %p to %p\n", *p, end);
+ b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
+ if (b->item_weights == NULL)
+ return -ENOMEM;
+ b->sum_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
+ if (b->sum_weights == NULL)
+ return -ENOMEM;
+ ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
+ for (j = 0; j < b->h.size; j++) {
+ b->item_weights[j] = ceph_decode_32(p);
+ b->sum_weights[j] = ceph_decode_32(p);
+ }
+ return 0;
+bad:
+ return -EINVAL;
+}
+
+static int crush_decode_tree_bucket(void **p, void *end,
+ struct crush_bucket_tree *b)
+{
+ int j;
+ dout("crush_decode_tree_bucket %p to %p\n", *p, end);
+ ceph_decode_8_safe(p, end, b->num_nodes, bad);
+ b->node_weights = kcalloc(b->num_nodes, sizeof(u32), GFP_NOFS);
+ if (b->node_weights == NULL)
+ return -ENOMEM;
+ ceph_decode_need(p, end, b->num_nodes * sizeof(u32), bad);
+ for (j = 0; j < b->num_nodes; j++)
+ b->node_weights[j] = ceph_decode_32(p);
+ return 0;
+bad:
+ return -EINVAL;
+}
+
+static int crush_decode_straw_bucket(void **p, void *end,
+ struct crush_bucket_straw *b)
+{
+ int j;
+ dout("crush_decode_straw_bucket %p to %p\n", *p, end);
+ b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
+ if (b->item_weights == NULL)
+ return -ENOMEM;
+ b->straws = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
+ if (b->straws == NULL)
+ return -ENOMEM;
+ ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
+ for (j = 0; j < b->h.size; j++) {
+ b->item_weights[j] = ceph_decode_32(p);
+ b->straws[j] = ceph_decode_32(p);
+ }
+ return 0;
+bad:
+ return -EINVAL;
+}
+
+static int crush_decode_straw2_bucket(void **p, void *end,
+ struct crush_bucket_straw2 *b)
+{
+ int j;
+ dout("crush_decode_straw2_bucket %p to %p\n", *p, end);
+ b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
+ if (b->item_weights == NULL)
+ return -ENOMEM;
+ ceph_decode_need(p, end, b->h.size * sizeof(u32), bad);
+ for (j = 0; j < b->h.size; j++)
+ b->item_weights[j] = ceph_decode_32(p);
+ return 0;
+bad:
+ return -EINVAL;
+}
+
+struct crush_name_node {
+ struct rb_node cn_node;
+ int cn_id;
+ char cn_name[];
+};
+
+static struct crush_name_node *alloc_crush_name(size_t name_len)
+{
+ struct crush_name_node *cn;
+
+ cn = kmalloc(sizeof(*cn) + name_len + 1, GFP_NOIO);
+ if (!cn)
+ return NULL;
+
+ RB_CLEAR_NODE(&cn->cn_node);
+ return cn;
+}
+
+static void free_crush_name(struct crush_name_node *cn)
+{
+ WARN_ON(!RB_EMPTY_NODE(&cn->cn_node));
+
+ kfree(cn);
+}
+
+DEFINE_RB_FUNCS(crush_name, struct crush_name_node, cn_id, cn_node)
+
+static int decode_crush_names(void **p, void *end, struct rb_root *root)
+{
+ u32 n;
+
+ ceph_decode_32_safe(p, end, n, e_inval);
+ while (n--) {
+ struct crush_name_node *cn;
+ int id;
+ u32 name_len;
+
+ ceph_decode_32_safe(p, end, id, e_inval);
+ ceph_decode_32_safe(p, end, name_len, e_inval);
+ ceph_decode_need(p, end, name_len, e_inval);
+
+ cn = alloc_crush_name(name_len);
+ if (!cn)
+ return -ENOMEM;
+
+ cn->cn_id = id;
+ memcpy(cn->cn_name, *p, name_len);
+ cn->cn_name[name_len] = '\0';
+ *p += name_len;
+
+ if (!__insert_crush_name(root, cn)) {
+ free_crush_name(cn);
+ return -EEXIST;
+ }
+ }
+
+ return 0;
+
+e_inval:
+ return -EINVAL;
+}
+
+void clear_crush_names(struct rb_root *root)
+{
+ while (!RB_EMPTY_ROOT(root)) {
+ struct crush_name_node *cn =
+ rb_entry(rb_first(root), struct crush_name_node, cn_node);
+
+ erase_crush_name(root, cn);
+ free_crush_name(cn);
+ }
+}
+
+static struct crush_choose_arg_map *alloc_choose_arg_map(void)
+{
+ struct crush_choose_arg_map *arg_map;
+
+ arg_map = kzalloc(sizeof(*arg_map), GFP_NOIO);
+ if (!arg_map)
+ return NULL;
+
+ RB_CLEAR_NODE(&arg_map->node);
+ return arg_map;
+}
+
+static void free_choose_arg_map(struct crush_choose_arg_map *arg_map)
+{
+ if (arg_map) {
+ int i, j;
+
+ WARN_ON(!RB_EMPTY_NODE(&arg_map->node));
+
+ for (i = 0; i < arg_map->size; i++) {
+ struct crush_choose_arg *arg = &arg_map->args[i];
+
+ for (j = 0; j < arg->weight_set_size; j++)
+ kfree(arg->weight_set[j].weights);
+ kfree(arg->weight_set);
+ kfree(arg->ids);
+ }
+ kfree(arg_map->args);
+ kfree(arg_map);
+ }
+}
+
+DEFINE_RB_FUNCS(choose_arg_map, struct crush_choose_arg_map, choose_args_index,
+ node);
+
+void clear_choose_args(struct crush_map *c)
+{
+ while (!RB_EMPTY_ROOT(&c->choose_args)) {
+ struct crush_choose_arg_map *arg_map =
+ rb_entry(rb_first(&c->choose_args),
+ struct crush_choose_arg_map, node);
+
+ erase_choose_arg_map(&c->choose_args, arg_map);
+ free_choose_arg_map(arg_map);
+ }
+}
+
+static u32 *decode_array_32_alloc(void **p, void *end, u32 *plen)
+{
+ u32 *a = NULL;
+ u32 len;
+ int ret;
+
+ ceph_decode_32_safe(p, end, len, e_inval);
+ if (len) {
+ u32 i;
+
+ a = kmalloc_array(len, sizeof(u32), GFP_NOIO);
+ if (!a) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+
+ ceph_decode_need(p, end, len * sizeof(u32), e_inval);
+ for (i = 0; i < len; i++)
+ a[i] = ceph_decode_32(p);
+ }
+
+ *plen = len;
+ return a;
+
+e_inval:
+ ret = -EINVAL;
+fail:
+ kfree(a);
+ return ERR_PTR(ret);
+}
+
+/*
+ * Assumes @arg is zero-initialized.
+ */
+static int decode_choose_arg(void **p, void *end, struct crush_choose_arg *arg)
+{
+ int ret;
+
+ ceph_decode_32_safe(p, end, arg->weight_set_size, e_inval);
+ if (arg->weight_set_size) {
+ u32 i;
+
+ arg->weight_set = kmalloc_array(arg->weight_set_size,
+ sizeof(*arg->weight_set),
+ GFP_NOIO);
+ if (!arg->weight_set)
+ return -ENOMEM;
+
+ for (i = 0; i < arg->weight_set_size; i++) {
+ struct crush_weight_set *w = &arg->weight_set[i];
+
+ w->weights = decode_array_32_alloc(p, end, &w->size);
+ if (IS_ERR(w->weights)) {
+ ret = PTR_ERR(w->weights);
+ w->weights = NULL;
+ return ret;
+ }
+ }
+ }
+
+ arg->ids = decode_array_32_alloc(p, end, &arg->ids_size);
+ if (IS_ERR(arg->ids)) {
+ ret = PTR_ERR(arg->ids);
+ arg->ids = NULL;
+ return ret;
+ }
+
+ return 0;
+
+e_inval:
+ return -EINVAL;
+}
+
+static int decode_choose_args(void **p, void *end, struct crush_map *c)
+{
+ struct crush_choose_arg_map *arg_map = NULL;
+ u32 num_choose_arg_maps, num_buckets;
+ int ret;
+
+ ceph_decode_32_safe(p, end, num_choose_arg_maps, e_inval);
+ while (num_choose_arg_maps--) {
+ arg_map = alloc_choose_arg_map();
+ if (!arg_map) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+
+ ceph_decode_64_safe(p, end, arg_map->choose_args_index,
+ e_inval);
+ arg_map->size = c->max_buckets;
+ arg_map->args = kcalloc(arg_map->size, sizeof(*arg_map->args),
+ GFP_NOIO);
+ if (!arg_map->args) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+
+ ceph_decode_32_safe(p, end, num_buckets, e_inval);
+ while (num_buckets--) {
+ struct crush_choose_arg *arg;
+ u32 bucket_index;
+
+ ceph_decode_32_safe(p, end, bucket_index, e_inval);
+ if (bucket_index >= arg_map->size)
+ goto e_inval;
+
+ arg = &arg_map->args[bucket_index];
+ ret = decode_choose_arg(p, end, arg);
+ if (ret)
+ goto fail;
+
+ if (arg->ids_size &&
+ arg->ids_size != c->buckets[bucket_index]->size)
+ goto e_inval;
+ }
+
+ insert_choose_arg_map(&c->choose_args, arg_map);
+ }
+
+ return 0;
+
+e_inval:
+ ret = -EINVAL;
+fail:
+ free_choose_arg_map(arg_map);
+ return ret;
+}
+
+static void crush_finalize(struct crush_map *c)
+{
+ __s32 b;
+
+ /* Space for the array of pointers to per-bucket workspace */
+ c->working_size = sizeof(struct crush_work) +
+ c->max_buckets * sizeof(struct crush_work_bucket *);
+
+ for (b = 0; b < c->max_buckets; b++) {
+ if (!c->buckets[b])
+ continue;
+
+ switch (c->buckets[b]->alg) {
+ default:
+ /*
+ * The base case, permutation variables and
+ * the pointer to the permutation array.
+ */
+ c->working_size += sizeof(struct crush_work_bucket);
+ break;
+ }
+ /* Every bucket has a permutation array. */
+ c->working_size += c->buckets[b]->size * sizeof(__u32);
+ }
+}
+
+static struct crush_map *crush_decode(void *pbyval, void *end)
+{
+ struct crush_map *c;
+ int err;
+ int i, j;
+ void **p = &pbyval;
+ void *start = pbyval;
+ u32 magic;
+
+ dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p));
+
+ c = kzalloc(sizeof(*c), GFP_NOFS);
+ if (c == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ c->type_names = RB_ROOT;
+ c->names = RB_ROOT;
+ c->choose_args = RB_ROOT;
+
+ /* set tunables to default values */
+ c->choose_local_tries = 2;
+ c->choose_local_fallback_tries = 5;
+ c->choose_total_tries = 19;
+ c->chooseleaf_descend_once = 0;
+
+ ceph_decode_need(p, end, 4*sizeof(u32), bad);
+ magic = ceph_decode_32(p);
+ if (magic != CRUSH_MAGIC) {
+ pr_err("crush_decode magic %x != current %x\n",
+ (unsigned int)magic, (unsigned int)CRUSH_MAGIC);
+ goto bad;
+ }
+ c->max_buckets = ceph_decode_32(p);
+ c->max_rules = ceph_decode_32(p);
+ c->max_devices = ceph_decode_32(p);
+
+ c->buckets = kcalloc(c->max_buckets, sizeof(*c->buckets), GFP_NOFS);
+ if (c->buckets == NULL)
+ goto badmem;
+ c->rules = kcalloc(c->max_rules, sizeof(*c->rules), GFP_NOFS);
+ if (c->rules == NULL)
+ goto badmem;
+
+ /* buckets */
+ for (i = 0; i < c->max_buckets; i++) {
+ int size = 0;
+ u32 alg;
+ struct crush_bucket *b;
+
+ ceph_decode_32_safe(p, end, alg, bad);
+ if (alg == 0) {
+ c->buckets[i] = NULL;
+ continue;
+ }
+ dout("crush_decode bucket %d off %x %p to %p\n",
+ i, (int)(*p-start), *p, end);
+
+ switch (alg) {
+ case CRUSH_BUCKET_UNIFORM:
+ size = sizeof(struct crush_bucket_uniform);
+ break;
+ case CRUSH_BUCKET_LIST:
+ size = sizeof(struct crush_bucket_list);
+ break;
+ case CRUSH_BUCKET_TREE:
+ size = sizeof(struct crush_bucket_tree);
+ break;
+ case CRUSH_BUCKET_STRAW:
+ size = sizeof(struct crush_bucket_straw);
+ break;
+ case CRUSH_BUCKET_STRAW2:
+ size = sizeof(struct crush_bucket_straw2);
+ break;
+ default:
+ goto bad;
+ }
+ BUG_ON(size == 0);
+ b = c->buckets[i] = kzalloc(size, GFP_NOFS);
+ if (b == NULL)
+ goto badmem;
+
+ ceph_decode_need(p, end, 4*sizeof(u32), bad);
+ b->id = ceph_decode_32(p);
+ b->type = ceph_decode_16(p);
+ b->alg = ceph_decode_8(p);
+ b->hash = ceph_decode_8(p);
+ b->weight = ceph_decode_32(p);
+ b->size = ceph_decode_32(p);
+
+ dout("crush_decode bucket size %d off %x %p to %p\n",
+ b->size, (int)(*p-start), *p, end);
+
+ b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS);
+ if (b->items == NULL)
+ goto badmem;
+
+ ceph_decode_need(p, end, b->size*sizeof(u32), bad);
+ for (j = 0; j < b->size; j++)
+ b->items[j] = ceph_decode_32(p);
+
+ switch (b->alg) {
+ case CRUSH_BUCKET_UNIFORM:
+ err = crush_decode_uniform_bucket(p, end,
+ (struct crush_bucket_uniform *)b);
+ if (err < 0)
+ goto fail;
+ break;
+ case CRUSH_BUCKET_LIST:
+ err = crush_decode_list_bucket(p, end,
+ (struct crush_bucket_list *)b);
+ if (err < 0)
+ goto fail;
+ break;
+ case CRUSH_BUCKET_TREE:
+ err = crush_decode_tree_bucket(p, end,
+ (struct crush_bucket_tree *)b);
+ if (err < 0)
+ goto fail;
+ break;
+ case CRUSH_BUCKET_STRAW:
+ err = crush_decode_straw_bucket(p, end,
+ (struct crush_bucket_straw *)b);
+ if (err < 0)
+ goto fail;
+ break;
+ case CRUSH_BUCKET_STRAW2:
+ err = crush_decode_straw2_bucket(p, end,
+ (struct crush_bucket_straw2 *)b);
+ if (err < 0)
+ goto fail;
+ break;
+ }
+ }
+
+ /* rules */
+ dout("rule vec is %p\n", c->rules);
+ for (i = 0; i < c->max_rules; i++) {
+ u32 yes;
+ struct crush_rule *r;
+
+ ceph_decode_32_safe(p, end, yes, bad);
+ if (!yes) {
+ dout("crush_decode NO rule %d off %x %p to %p\n",
+ i, (int)(*p-start), *p, end);
+ c->rules[i] = NULL;
+ continue;
+ }
+
+ dout("crush_decode rule %d off %x %p to %p\n",
+ i, (int)(*p-start), *p, end);
+
+ /* len */
+ ceph_decode_32_safe(p, end, yes, bad);
+#if BITS_PER_LONG == 32
+ if (yes > (ULONG_MAX - sizeof(*r))
+ / sizeof(struct crush_rule_step))
+ goto bad;
+#endif
+ r = kmalloc(struct_size(r, steps, yes), GFP_NOFS);
+ if (r == NULL)
+ goto badmem;
+ dout(" rule %d is at %p\n", i, r);
+ c->rules[i] = r;
+ r->len = yes;
+ ceph_decode_copy_safe(p, end, &r->mask, 4, bad); /* 4 u8's */
+ ceph_decode_need(p, end, r->len*3*sizeof(u32), bad);
+ for (j = 0; j < r->len; j++) {
+ r->steps[j].op = ceph_decode_32(p);
+ r->steps[j].arg1 = ceph_decode_32(p);
+ r->steps[j].arg2 = ceph_decode_32(p);
+ }
+ }
+
+ err = decode_crush_names(p, end, &c->type_names);
+ if (err)
+ goto fail;
+
+ err = decode_crush_names(p, end, &c->names);
+ if (err)
+ goto fail;
+
+ ceph_decode_skip_map(p, end, 32, string, bad); /* rule_name_map */
+
+ /* tunables */
+ ceph_decode_need(p, end, 3*sizeof(u32), done);
+ c->choose_local_tries = ceph_decode_32(p);
+ c->choose_local_fallback_tries = ceph_decode_32(p);
+ c->choose_total_tries = ceph_decode_32(p);
+ dout("crush decode tunable choose_local_tries = %d\n",
+ c->choose_local_tries);
+ dout("crush decode tunable choose_local_fallback_tries = %d\n",
+ c->choose_local_fallback_tries);
+ dout("crush decode tunable choose_total_tries = %d\n",
+ c->choose_total_tries);
+
+ ceph_decode_need(p, end, sizeof(u32), done);
+ c->chooseleaf_descend_once = ceph_decode_32(p);
+ dout("crush decode tunable chooseleaf_descend_once = %d\n",
+ c->chooseleaf_descend_once);
+
+ ceph_decode_need(p, end, sizeof(u8), done);
+ c->chooseleaf_vary_r = ceph_decode_8(p);
+ dout("crush decode tunable chooseleaf_vary_r = %d\n",
+ c->chooseleaf_vary_r);
+
+ /* skip straw_calc_version, allowed_bucket_algs */
+ ceph_decode_need(p, end, sizeof(u8) + sizeof(u32), done);
+ *p += sizeof(u8) + sizeof(u32);
+
+ ceph_decode_need(p, end, sizeof(u8), done);
+ c->chooseleaf_stable = ceph_decode_8(p);
+ dout("crush decode tunable chooseleaf_stable = %d\n",
+ c->chooseleaf_stable);
+
+ if (*p != end) {
+ /* class_map */
+ ceph_decode_skip_map(p, end, 32, 32, bad);
+ /* class_name */
+ ceph_decode_skip_map(p, end, 32, string, bad);
+ /* class_bucket */
+ ceph_decode_skip_map_of_map(p, end, 32, 32, 32, bad);
+ }
+
+ if (*p != end) {
+ err = decode_choose_args(p, end, c);
+ if (err)
+ goto fail;
+ }
+
+done:
+ crush_finalize(c);
+ dout("crush_decode success\n");
+ return c;
+
+badmem:
+ err = -ENOMEM;
+fail:
+ dout("crush_decode fail %d\n", err);
+ crush_destroy(c);
+ return ERR_PTR(err);
+
+bad:
+ err = -EINVAL;
+ goto fail;
+}
+
+int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs)
+{
+ if (lhs->pool < rhs->pool)
+ return -1;
+ if (lhs->pool > rhs->pool)
+ return 1;
+ if (lhs->seed < rhs->seed)
+ return -1;
+ if (lhs->seed > rhs->seed)
+ return 1;
+
+ return 0;
+}
+
+int ceph_spg_compare(const struct ceph_spg *lhs, const struct ceph_spg *rhs)
+{
+ int ret;
+
+ ret = ceph_pg_compare(&lhs->pgid, &rhs->pgid);
+ if (ret)
+ return ret;
+
+ if (lhs->shard < rhs->shard)
+ return -1;
+ if (lhs->shard > rhs->shard)
+ return 1;
+
+ return 0;
+}
+
+static struct ceph_pg_mapping *alloc_pg_mapping(size_t payload_len)
+{
+ struct ceph_pg_mapping *pg;
+
+ pg = kmalloc(sizeof(*pg) + payload_len, GFP_NOIO);
+ if (!pg)
+ return NULL;
+
+ RB_CLEAR_NODE(&pg->node);
+ return pg;
+}
+
+static void free_pg_mapping(struct ceph_pg_mapping *pg)
+{
+ WARN_ON(!RB_EMPTY_NODE(&pg->node));
+
+ kfree(pg);
+}
+
+/*
+ * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
+ * to a set of osds) and primary_temp (explicit primary setting)
+ */
+DEFINE_RB_FUNCS2(pg_mapping, struct ceph_pg_mapping, pgid, ceph_pg_compare,
+ RB_BYPTR, const struct ceph_pg *, node)
+
+/*
+ * rbtree of pg pool info
+ */
+DEFINE_RB_FUNCS(pg_pool, struct ceph_pg_pool_info, id, node)
+
+struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map, u64 id)
+{
+ return lookup_pg_pool(&map->pg_pools, id);
+}
+
+const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id)
+{
+ struct ceph_pg_pool_info *pi;
+
+ if (id == CEPH_NOPOOL)
+ return NULL;
+
+ if (WARN_ON_ONCE(id > (u64) INT_MAX))
+ return NULL;
+
+ pi = lookup_pg_pool(&map->pg_pools, id);
+ return pi ? pi->name : NULL;
+}
+EXPORT_SYMBOL(ceph_pg_pool_name_by_id);
+
+int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name)
+{
+ struct rb_node *rbp;
+
+ for (rbp = rb_first(&map->pg_pools); rbp; rbp = rb_next(rbp)) {
+ struct ceph_pg_pool_info *pi =
+ rb_entry(rbp, struct ceph_pg_pool_info, node);
+ if (pi->name && strcmp(pi->name, name) == 0)
+ return pi->id;
+ }
+ return -ENOENT;
+}
+EXPORT_SYMBOL(ceph_pg_poolid_by_name);
+
+u64 ceph_pg_pool_flags(struct ceph_osdmap *map, u64 id)
+{
+ struct ceph_pg_pool_info *pi;
+
+ pi = lookup_pg_pool(&map->pg_pools, id);
+ return pi ? pi->flags : 0;
+}
+EXPORT_SYMBOL(ceph_pg_pool_flags);
+
+static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
+{
+ erase_pg_pool(root, pi);
+ kfree(pi->name);
+ kfree(pi);
+}
+
+static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
+{
+ u8 ev, cv;
+ unsigned len, num;
+ void *pool_end;
+
+ ceph_decode_need(p, end, 2 + 4, bad);
+ ev = ceph_decode_8(p); /* encoding version */
+ cv = ceph_decode_8(p); /* compat version */
+ if (ev < 5) {
+ pr_warn("got v %d < 5 cv %d of ceph_pg_pool\n", ev, cv);
+ return -EINVAL;
+ }
+ if (cv > 9) {
+ pr_warn("got v %d cv %d > 9 of ceph_pg_pool\n", ev, cv);
+ return -EINVAL;
+ }
+ len = ceph_decode_32(p);
+ ceph_decode_need(p, end, len, bad);
+ pool_end = *p + len;
+
+ pi->type = ceph_decode_8(p);
+ pi->size = ceph_decode_8(p);
+ pi->crush_ruleset = ceph_decode_8(p);
+ pi->object_hash = ceph_decode_8(p);
+
+ pi->pg_num = ceph_decode_32(p);
+ pi->pgp_num = ceph_decode_32(p);
+
+ *p += 4 + 4; /* skip lpg* */
+ *p += 4; /* skip last_change */
+ *p += 8 + 4; /* skip snap_seq, snap_epoch */
+
+ /* skip snaps */
+ num = ceph_decode_32(p);
+ while (num--) {
+ *p += 8; /* snapid key */
+ *p += 1 + 1; /* versions */
+ len = ceph_decode_32(p);
+ *p += len;
+ }
+
+ /* skip removed_snaps */
+ num = ceph_decode_32(p);
+ *p += num * (8 + 8);
+
+ *p += 8; /* skip auid */
+ pi->flags = ceph_decode_64(p);
+ *p += 4; /* skip crash_replay_interval */
+
+ if (ev >= 7)
+ pi->min_size = ceph_decode_8(p);
+ else
+ pi->min_size = pi->size - pi->size / 2;
+
+ if (ev >= 8)
+ *p += 8 + 8; /* skip quota_max_* */
+
+ if (ev >= 9) {
+ /* skip tiers */
+ num = ceph_decode_32(p);
+ *p += num * 8;
+
+ *p += 8; /* skip tier_of */
+ *p += 1; /* skip cache_mode */
+
+ pi->read_tier = ceph_decode_64(p);
+ pi->write_tier = ceph_decode_64(p);
+ } else {
+ pi->read_tier = -1;
+ pi->write_tier = -1;
+ }
+
+ if (ev >= 10) {
+ /* skip properties */
+ num = ceph_decode_32(p);
+ while (num--) {
+ len = ceph_decode_32(p);
+ *p += len; /* key */
+ len = ceph_decode_32(p);
+ *p += len; /* val */
+ }
+ }
+
+ if (ev >= 11) {
+ /* skip hit_set_params */
+ *p += 1 + 1; /* versions */
+ len = ceph_decode_32(p);
+ *p += len;
+
+ *p += 4; /* skip hit_set_period */
+ *p += 4; /* skip hit_set_count */
+ }
+
+ if (ev >= 12)
+ *p += 4; /* skip stripe_width */
+
+ if (ev >= 13) {
+ *p += 8; /* skip target_max_bytes */
+ *p += 8; /* skip target_max_objects */
+ *p += 4; /* skip cache_target_dirty_ratio_micro */
+ *p += 4; /* skip cache_target_full_ratio_micro */
+ *p += 4; /* skip cache_min_flush_age */
+ *p += 4; /* skip cache_min_evict_age */
+ }
+
+ if (ev >= 14) {
+ /* skip erasure_code_profile */
+ len = ceph_decode_32(p);
+ *p += len;
+ }
+
+ /*
+ * last_force_op_resend_preluminous, will be overridden if the
+ * map was encoded with RESEND_ON_SPLIT
+ */
+ if (ev >= 15)
+ pi->last_force_request_resend = ceph_decode_32(p);
+ else
+ pi->last_force_request_resend = 0;
+
+ if (ev >= 16)
+ *p += 4; /* skip min_read_recency_for_promote */
+
+ if (ev >= 17)
+ *p += 8; /* skip expected_num_objects */
+
+ if (ev >= 19)
+ *p += 4; /* skip cache_target_dirty_high_ratio_micro */
+
+ if (ev >= 20)
+ *p += 4; /* skip min_write_recency_for_promote */
+
+ if (ev >= 21)
+ *p += 1; /* skip use_gmt_hitset */
+
+ if (ev >= 22)
+ *p += 1; /* skip fast_read */
+
+ if (ev >= 23) {
+ *p += 4; /* skip hit_set_grade_decay_rate */
+ *p += 4; /* skip hit_set_search_last_n */
+ }
+
+ if (ev >= 24) {
+ /* skip opts */
+ *p += 1 + 1; /* versions */
+ len = ceph_decode_32(p);
+ *p += len;
+ }
+
+ if (ev >= 25)
+ pi->last_force_request_resend = ceph_decode_32(p);
+
+ /* ignore the rest */
+
+ *p = pool_end;
+ calc_pg_masks(pi);
+ return 0;
+
+bad:
+ return -EINVAL;
+}
+
+static int decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
+{
+ struct ceph_pg_pool_info *pi;
+ u32 num, len;
+ u64 pool;
+
+ ceph_decode_32_safe(p, end, num, bad);
+ dout(" %d pool names\n", num);
+ while (num--) {
+ ceph_decode_64_safe(p, end, pool, bad);
+ ceph_decode_32_safe(p, end, len, bad);
+ dout(" pool %llu len %d\n", pool, len);
+ ceph_decode_need(p, end, len, bad);
+ pi = lookup_pg_pool(&map->pg_pools, pool);
+ if (pi) {
+ char *name = kstrndup(*p, len, GFP_NOFS);
+
+ if (!name)
+ return -ENOMEM;
+ kfree(pi->name);
+ pi->name = name;
+ dout(" name is %s\n", pi->name);
+ }
+ *p += len;
+ }
+ return 0;
+
+bad:
+ return -EINVAL;
+}
+
+/*
+ * CRUSH workspaces
+ *
+ * workspace_manager framework borrowed from fs/btrfs/compression.c.
+ * Two simplifications: there is only one type of workspace and there
+ * is always at least one workspace.
+ */
+static struct crush_work *alloc_workspace(const struct crush_map *c)
+{
+ struct crush_work *work;
+ size_t work_size;
+
+ WARN_ON(!c->working_size);
+ work_size = crush_work_size(c, CEPH_PG_MAX_SIZE);
+ dout("%s work_size %zu bytes\n", __func__, work_size);
+
+ work = kvmalloc(work_size, GFP_NOIO);
+ if (!work)
+ return NULL;
+
+ INIT_LIST_HEAD(&work->item);
+ crush_init_workspace(c, work);
+ return work;
+}
+
+static void free_workspace(struct crush_work *work)
+{
+ WARN_ON(!list_empty(&work->item));
+ kvfree(work);
+}
+
+static void init_workspace_manager(struct workspace_manager *wsm)
+{
+ INIT_LIST_HEAD(&wsm->idle_ws);
+ spin_lock_init(&wsm->ws_lock);
+ atomic_set(&wsm->total_ws, 0);
+ wsm->free_ws = 0;
+ init_waitqueue_head(&wsm->ws_wait);
+}
+
+static void add_initial_workspace(struct workspace_manager *wsm,
+ struct crush_work *work)
+{
+ WARN_ON(!list_empty(&wsm->idle_ws));
+
+ list_add(&work->item, &wsm->idle_ws);
+ atomic_set(&wsm->total_ws, 1);
+ wsm->free_ws = 1;
+}
+
+static void cleanup_workspace_manager(struct workspace_manager *wsm)
+{
+ struct crush_work *work;
+
+ while (!list_empty(&wsm->idle_ws)) {
+ work = list_first_entry(&wsm->idle_ws, struct crush_work,
+ item);
+ list_del_init(&work->item);
+ free_workspace(work);
+ }
+ atomic_set(&wsm->total_ws, 0);
+ wsm->free_ws = 0;
+}
+
+/*
+ * Finds an available workspace or allocates a new one. If it's not
+ * possible to allocate a new one, waits until there is one.
+ */
+static struct crush_work *get_workspace(struct workspace_manager *wsm,
+ const struct crush_map *c)
+{
+ struct crush_work *work;
+ int cpus = num_online_cpus();
+
+again:
+ spin_lock(&wsm->ws_lock);
+ if (!list_empty(&wsm->idle_ws)) {
+ work = list_first_entry(&wsm->idle_ws, struct crush_work,
+ item);
+ list_del_init(&work->item);
+ wsm->free_ws--;
+ spin_unlock(&wsm->ws_lock);
+ return work;
+
+ }
+ if (atomic_read(&wsm->total_ws) > cpus) {
+ DEFINE_WAIT(wait);
+
+ spin_unlock(&wsm->ws_lock);
+ prepare_to_wait(&wsm->ws_wait, &wait, TASK_UNINTERRUPTIBLE);
+ if (atomic_read(&wsm->total_ws) > cpus && !wsm->free_ws)
+ schedule();
+ finish_wait(&wsm->ws_wait, &wait);
+ goto again;
+ }
+ atomic_inc(&wsm->total_ws);
+ spin_unlock(&wsm->ws_lock);
+
+ work = alloc_workspace(c);
+ if (!work) {
+ atomic_dec(&wsm->total_ws);
+ wake_up(&wsm->ws_wait);
+
+ /*
+ * Do not return the error but go back to waiting. We
+ * have the initial workspace and the CRUSH computation
+ * time is bounded so we will get it eventually.
+ */
+ WARN_ON(atomic_read(&wsm->total_ws) < 1);
+ goto again;
+ }
+ return work;
+}
+
+/*
+ * Puts a workspace back on the list or frees it if we have enough
+ * idle ones sitting around.
+ */
+static void put_workspace(struct workspace_manager *wsm,
+ struct crush_work *work)
+{
+ spin_lock(&wsm->ws_lock);
+ if (wsm->free_ws <= num_online_cpus()) {
+ list_add(&work->item, &wsm->idle_ws);
+ wsm->free_ws++;
+ spin_unlock(&wsm->ws_lock);
+ goto wake;
+ }
+ spin_unlock(&wsm->ws_lock);
+
+ free_workspace(work);
+ atomic_dec(&wsm->total_ws);
+wake:
+ if (wq_has_sleeper(&wsm->ws_wait))
+ wake_up(&wsm->ws_wait);
+}
+
+/*
+ * osd map
+ */
+struct ceph_osdmap *ceph_osdmap_alloc(void)
+{
+ struct ceph_osdmap *map;
+
+ map = kzalloc(sizeof(*map), GFP_NOIO);
+ if (!map)
+ return NULL;
+
+ map->pg_pools = RB_ROOT;
+ map->pool_max = -1;
+ map->pg_temp = RB_ROOT;
+ map->primary_temp = RB_ROOT;
+ map->pg_upmap = RB_ROOT;
+ map->pg_upmap_items = RB_ROOT;
+
+ init_workspace_manager(&map->crush_wsm);
+
+ return map;
+}
+
+void ceph_osdmap_destroy(struct ceph_osdmap *map)
+{
+ dout("osdmap_destroy %p\n", map);
+
+ if (map->crush)
+ crush_destroy(map->crush);
+ cleanup_workspace_manager(&map->crush_wsm);
+
+ while (!RB_EMPTY_ROOT(&map->pg_temp)) {
+ struct ceph_pg_mapping *pg =
+ rb_entry(rb_first(&map->pg_temp),
+ struct ceph_pg_mapping, node);
+ erase_pg_mapping(&map->pg_temp, pg);
+ free_pg_mapping(pg);
+ }
+ while (!RB_EMPTY_ROOT(&map->primary_temp)) {
+ struct ceph_pg_mapping *pg =
+ rb_entry(rb_first(&map->primary_temp),
+ struct ceph_pg_mapping, node);
+ erase_pg_mapping(&map->primary_temp, pg);
+ free_pg_mapping(pg);
+ }
+ while (!RB_EMPTY_ROOT(&map->pg_upmap)) {
+ struct ceph_pg_mapping *pg =
+ rb_entry(rb_first(&map->pg_upmap),
+ struct ceph_pg_mapping, node);
+ rb_erase(&pg->node, &map->pg_upmap);
+ kfree(pg);
+ }
+ while (!RB_EMPTY_ROOT(&map->pg_upmap_items)) {
+ struct ceph_pg_mapping *pg =
+ rb_entry(rb_first(&map->pg_upmap_items),
+ struct ceph_pg_mapping, node);
+ rb_erase(&pg->node, &map->pg_upmap_items);
+ kfree(pg);
+ }
+ while (!RB_EMPTY_ROOT(&map->pg_pools)) {
+ struct ceph_pg_pool_info *pi =
+ rb_entry(rb_first(&map->pg_pools),
+ struct ceph_pg_pool_info, node);
+ __remove_pg_pool(&map->pg_pools, pi);
+ }
+ kvfree(map->osd_state);
+ kvfree(map->osd_weight);
+ kvfree(map->osd_addr);
+ kvfree(map->osd_primary_affinity);
+ kfree(map);
+}
+
+/*
+ * Adjust max_osd value, (re)allocate arrays.
+ *
+ * The new elements are properly initialized.
+ */
+static int osdmap_set_max_osd(struct ceph_osdmap *map, u32 max)
+{
+ u32 *state;
+ u32 *weight;
+ struct ceph_entity_addr *addr;
+ u32 to_copy;
+ int i;
+
+ dout("%s old %u new %u\n", __func__, map->max_osd, max);
+ if (max == map->max_osd)
+ return 0;
+
+ state = kvmalloc(array_size(max, sizeof(*state)), GFP_NOFS);
+ weight = kvmalloc(array_size(max, sizeof(*weight)), GFP_NOFS);
+ addr = kvmalloc(array_size(max, sizeof(*addr)), GFP_NOFS);
+ if (!state || !weight || !addr) {
+ kvfree(state);
+ kvfree(weight);
+ kvfree(addr);
+ return -ENOMEM;
+ }
+
+ to_copy = min(map->max_osd, max);
+ if (map->osd_state) {
+ memcpy(state, map->osd_state, to_copy * sizeof(*state));
+ memcpy(weight, map->osd_weight, to_copy * sizeof(*weight));
+ memcpy(addr, map->osd_addr, to_copy * sizeof(*addr));
+ kvfree(map->osd_state);
+ kvfree(map->osd_weight);
+ kvfree(map->osd_addr);
+ }
+
+ map->osd_state = state;
+ map->osd_weight = weight;
+ map->osd_addr = addr;
+ for (i = map->max_osd; i < max; i++) {
+ map->osd_state[i] = 0;
+ map->osd_weight[i] = CEPH_OSD_OUT;
+ memset(map->osd_addr + i, 0, sizeof(*map->osd_addr));
+ }
+
+ if (map->osd_primary_affinity) {
+ u32 *affinity;
+
+ affinity = kvmalloc(array_size(max, sizeof(*affinity)),
+ GFP_NOFS);
+ if (!affinity)
+ return -ENOMEM;
+
+ memcpy(affinity, map->osd_primary_affinity,
+ to_copy * sizeof(*affinity));
+ kvfree(map->osd_primary_affinity);
+
+ map->osd_primary_affinity = affinity;
+ for (i = map->max_osd; i < max; i++)
+ map->osd_primary_affinity[i] =
+ CEPH_OSD_DEFAULT_PRIMARY_AFFINITY;
+ }
+
+ map->max_osd = max;
+
+ return 0;
+}
+
+static int osdmap_set_crush(struct ceph_osdmap *map, struct crush_map *crush)
+{
+ struct crush_work *work;
+
+ if (IS_ERR(crush))
+ return PTR_ERR(crush);
+
+ work = alloc_workspace(crush);
+ if (!work) {
+ crush_destroy(crush);
+ return -ENOMEM;
+ }
+
+ if (map->crush)
+ crush_destroy(map->crush);
+ cleanup_workspace_manager(&map->crush_wsm);
+ map->crush = crush;
+ add_initial_workspace(&map->crush_wsm, work);
+ return 0;
+}
+
+#define OSDMAP_WRAPPER_COMPAT_VER 7
+#define OSDMAP_CLIENT_DATA_COMPAT_VER 1
+
+/*
+ * Return 0 or error. On success, *v is set to 0 for old (v6) osdmaps,
+ * to struct_v of the client_data section for new (v7 and above)
+ * osdmaps.
+ */
+static int get_osdmap_client_data_v(void **p, void *end,
+ const char *prefix, u8 *v)
+{
+ u8 struct_v;
+
+ ceph_decode_8_safe(p, end, struct_v, e_inval);
+ if (struct_v >= 7) {
+ u8 struct_compat;
+
+ ceph_decode_8_safe(p, end, struct_compat, e_inval);
+ if (struct_compat > OSDMAP_WRAPPER_COMPAT_VER) {
+ pr_warn("got v %d cv %d > %d of %s ceph_osdmap\n",
+ struct_v, struct_compat,
+ OSDMAP_WRAPPER_COMPAT_VER, prefix);
+ return -EINVAL;
+ }
+ *p += 4; /* ignore wrapper struct_len */
+
+ ceph_decode_8_safe(p, end, struct_v, e_inval);
+ ceph_decode_8_safe(p, end, struct_compat, e_inval);
+ if (struct_compat > OSDMAP_CLIENT_DATA_COMPAT_VER) {
+ pr_warn("got v %d cv %d > %d of %s ceph_osdmap client data\n",
+ struct_v, struct_compat,
+ OSDMAP_CLIENT_DATA_COMPAT_VER, prefix);
+ return -EINVAL;
+ }
+ *p += 4; /* ignore client data struct_len */
+ } else {
+ u16 version;
+
+ *p -= 1;
+ ceph_decode_16_safe(p, end, version, e_inval);
+ if (version < 6) {
+ pr_warn("got v %d < 6 of %s ceph_osdmap\n",
+ version, prefix);
+ return -EINVAL;
+ }
+
+ /* old osdmap encoding */
+ struct_v = 0;
+ }
+
+ *v = struct_v;
+ return 0;
+
+e_inval:
+ return -EINVAL;
+}
+
+static int __decode_pools(void **p, void *end, struct ceph_osdmap *map,
+ bool incremental)
+{
+ u32 n;
+
+ ceph_decode_32_safe(p, end, n, e_inval);
+ while (n--) {
+ struct ceph_pg_pool_info *pi;
+ u64 pool;
+ int ret;
+
+ ceph_decode_64_safe(p, end, pool, e_inval);
+
+ pi = lookup_pg_pool(&map->pg_pools, pool);
+ if (!incremental || !pi) {
+ pi = kzalloc(sizeof(*pi), GFP_NOFS);
+ if (!pi)
+ return -ENOMEM;
+
+ RB_CLEAR_NODE(&pi->node);
+ pi->id = pool;
+
+ if (!__insert_pg_pool(&map->pg_pools, pi)) {
+ kfree(pi);
+ return -EEXIST;
+ }
+ }
+
+ ret = decode_pool(p, end, pi);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+
+e_inval:
+ return -EINVAL;
+}
+
+static int decode_pools(void **p, void *end, struct ceph_osdmap *map)
+{
+ return __decode_pools(p, end, map, false);
+}
+
+static int decode_new_pools(void **p, void *end, struct ceph_osdmap *map)
+{
+ return __decode_pools(p, end, map, true);
+}
+
+typedef struct ceph_pg_mapping *(*decode_mapping_fn_t)(void **, void *, bool);
+
+static int decode_pg_mapping(void **p, void *end, struct rb_root *mapping_root,
+ decode_mapping_fn_t fn, bool incremental)
+{
+ u32 n;
+
+ WARN_ON(!incremental && !fn);
+
+ ceph_decode_32_safe(p, end, n, e_inval);
+ while (n--) {
+ struct ceph_pg_mapping *pg;
+ struct ceph_pg pgid;
+ int ret;
+
+ ret = ceph_decode_pgid(p, end, &pgid);
+ if (ret)
+ return ret;
+
+ pg = lookup_pg_mapping(mapping_root, &pgid);
+ if (pg) {
+ WARN_ON(!incremental);
+ erase_pg_mapping(mapping_root, pg);
+ free_pg_mapping(pg);
+ }
+
+ if (fn) {
+ pg = fn(p, end, incremental);
+ if (IS_ERR(pg))
+ return PTR_ERR(pg);
+
+ if (pg) {
+ pg->pgid = pgid; /* struct */
+ insert_pg_mapping(mapping_root, pg);
+ }
+ }
+ }
+
+ return 0;
+
+e_inval:
+ return -EINVAL;
+}
+
+static struct ceph_pg_mapping *__decode_pg_temp(void **p, void *end,
+ bool incremental)
+{
+ struct ceph_pg_mapping *pg;
+ u32 len, i;
+
+ ceph_decode_32_safe(p, end, len, e_inval);
+ if (len == 0 && incremental)
+ return NULL; /* new_pg_temp: [] to remove */
+ if (len > (SIZE_MAX - sizeof(*pg)) / sizeof(u32))
+ return ERR_PTR(-EINVAL);
+
+ ceph_decode_need(p, end, len * sizeof(u32), e_inval);
+ pg = alloc_pg_mapping(len * sizeof(u32));
+ if (!pg)
+ return ERR_PTR(-ENOMEM);
+
+ pg->pg_temp.len = len;
+ for (i = 0; i < len; i++)
+ pg->pg_temp.osds[i] = ceph_decode_32(p);
+
+ return pg;
+
+e_inval:
+ return ERR_PTR(-EINVAL);
+}
+
+static int decode_pg_temp(void **p, void *end, struct ceph_osdmap *map)
+{
+ return decode_pg_mapping(p, end, &map->pg_temp, __decode_pg_temp,
+ false);
+}
+
+static int decode_new_pg_temp(void **p, void *end, struct ceph_osdmap *map)
+{
+ return decode_pg_mapping(p, end, &map->pg_temp, __decode_pg_temp,
+ true);
+}
+
+static struct ceph_pg_mapping *__decode_primary_temp(void **p, void *end,
+ bool incremental)
+{
+ struct ceph_pg_mapping *pg;
+ u32 osd;
+
+ ceph_decode_32_safe(p, end, osd, e_inval);
+ if (osd == (u32)-1 && incremental)
+ return NULL; /* new_primary_temp: -1 to remove */
+
+ pg = alloc_pg_mapping(0);
+ if (!pg)
+ return ERR_PTR(-ENOMEM);
+
+ pg->primary_temp.osd = osd;
+ return pg;
+
+e_inval:
+ return ERR_PTR(-EINVAL);
+}
+
+static int decode_primary_temp(void **p, void *end, struct ceph_osdmap *map)
+{
+ return decode_pg_mapping(p, end, &map->primary_temp,
+ __decode_primary_temp, false);
+}
+
+static int decode_new_primary_temp(void **p, void *end,
+ struct ceph_osdmap *map)
+{
+ return decode_pg_mapping(p, end, &map->primary_temp,
+ __decode_primary_temp, true);
+}
+
+u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd)
+{
+ if (!map->osd_primary_affinity)
+ return CEPH_OSD_DEFAULT_PRIMARY_AFFINITY;
+
+ return map->osd_primary_affinity[osd];
+}
+
+static int set_primary_affinity(struct ceph_osdmap *map, int osd, u32 aff)
+{
+ if (!map->osd_primary_affinity) {
+ int i;
+
+ map->osd_primary_affinity = kvmalloc(
+ array_size(map->max_osd, sizeof(*map->osd_primary_affinity)),
+ GFP_NOFS);
+ if (!map->osd_primary_affinity)
+ return -ENOMEM;
+
+ for (i = 0; i < map->max_osd; i++)
+ map->osd_primary_affinity[i] =
+ CEPH_OSD_DEFAULT_PRIMARY_AFFINITY;
+ }
+
+ map->osd_primary_affinity[osd] = aff;
+
+ return 0;
+}
+
+static int decode_primary_affinity(void **p, void *end,
+ struct ceph_osdmap *map)
+{
+ u32 len, i;
+
+ ceph_decode_32_safe(p, end, len, e_inval);
+ if (len == 0) {
+ kvfree(map->osd_primary_affinity);
+ map->osd_primary_affinity = NULL;
+ return 0;
+ }
+ if (len != map->max_osd)
+ goto e_inval;
+
+ ceph_decode_need(p, end, map->max_osd*sizeof(u32), e_inval);
+
+ for (i = 0; i < map->max_osd; i++) {
+ int ret;
+
+ ret = set_primary_affinity(map, i, ceph_decode_32(p));
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+
+e_inval:
+ return -EINVAL;
+}
+
+static int decode_new_primary_affinity(void **p, void *end,
+ struct ceph_osdmap *map)
+{
+ u32 n;
+
+ ceph_decode_32_safe(p, end, n, e_inval);
+ while (n--) {
+ u32 osd, aff;
+ int ret;
+
+ ceph_decode_32_safe(p, end, osd, e_inval);
+ ceph_decode_32_safe(p, end, aff, e_inval);
+ if (osd >= map->max_osd)
+ goto e_inval;
+
+ ret = set_primary_affinity(map, osd, aff);
+ if (ret)
+ return ret;
+
+ osdmap_info(map, "osd%d primary-affinity 0x%x\n", osd, aff);
+ }
+
+ return 0;
+
+e_inval:
+ return -EINVAL;
+}
+
+static struct ceph_pg_mapping *__decode_pg_upmap(void **p, void *end,
+ bool __unused)
+{
+ return __decode_pg_temp(p, end, false);
+}
+
+static int decode_pg_upmap(void **p, void *end, struct ceph_osdmap *map)
+{
+ return decode_pg_mapping(p, end, &map->pg_upmap, __decode_pg_upmap,
+ false);
+}
+
+static int decode_new_pg_upmap(void **p, void *end, struct ceph_osdmap *map)
+{
+ return decode_pg_mapping(p, end, &map->pg_upmap, __decode_pg_upmap,
+ true);
+}
+
+static int decode_old_pg_upmap(void **p, void *end, struct ceph_osdmap *map)
+{
+ return decode_pg_mapping(p, end, &map->pg_upmap, NULL, true);
+}
+
+static struct ceph_pg_mapping *__decode_pg_upmap_items(void **p, void *end,
+ bool __unused)
+{
+ struct ceph_pg_mapping *pg;
+ u32 len, i;
+
+ ceph_decode_32_safe(p, end, len, e_inval);
+ if (len > (SIZE_MAX - sizeof(*pg)) / (2 * sizeof(u32)))
+ return ERR_PTR(-EINVAL);
+
+ ceph_decode_need(p, end, 2 * len * sizeof(u32), e_inval);
+ pg = alloc_pg_mapping(2 * len * sizeof(u32));
+ if (!pg)
+ return ERR_PTR(-ENOMEM);
+
+ pg->pg_upmap_items.len = len;
+ for (i = 0; i < len; i++) {
+ pg->pg_upmap_items.from_to[i][0] = ceph_decode_32(p);
+ pg->pg_upmap_items.from_to[i][1] = ceph_decode_32(p);
+ }
+
+ return pg;
+
+e_inval:
+ return ERR_PTR(-EINVAL);
+}
+
+static int decode_pg_upmap_items(void **p, void *end, struct ceph_osdmap *map)
+{
+ return decode_pg_mapping(p, end, &map->pg_upmap_items,
+ __decode_pg_upmap_items, false);
+}
+
+static int decode_new_pg_upmap_items(void **p, void *end,
+ struct ceph_osdmap *map)
+{
+ return decode_pg_mapping(p, end, &map->pg_upmap_items,
+ __decode_pg_upmap_items, true);
+}
+
+static int decode_old_pg_upmap_items(void **p, void *end,
+ struct ceph_osdmap *map)
+{
+ return decode_pg_mapping(p, end, &map->pg_upmap_items, NULL, true);
+}
+
+/*
+ * decode a full map.
+ */
+static int osdmap_decode(void **p, void *end, bool msgr2,
+ struct ceph_osdmap *map)
+{
+ u8 struct_v;
+ u32 epoch = 0;
+ void *start = *p;
+ u32 max;
+ u32 len, i;
+ int err;
+
+ dout("%s %p to %p len %d\n", __func__, *p, end, (int)(end - *p));
+
+ err = get_osdmap_client_data_v(p, end, "full", &struct_v);
+ if (err)
+ goto bad;
+
+ /* fsid, epoch, created, modified */
+ ceph_decode_need(p, end, sizeof(map->fsid) + sizeof(u32) +
+ sizeof(map->created) + sizeof(map->modified), e_inval);
+ ceph_decode_copy(p, &map->fsid, sizeof(map->fsid));
+ epoch = map->epoch = ceph_decode_32(p);
+ ceph_decode_copy(p, &map->created, sizeof(map->created));
+ ceph_decode_copy(p, &map->modified, sizeof(map->modified));
+
+ /* pools */
+ err = decode_pools(p, end, map);
+ if (err)
+ goto bad;
+
+ /* pool_name */
+ err = decode_pool_names(p, end, map);
+ if (err)
+ goto bad;
+
+ ceph_decode_32_safe(p, end, map->pool_max, e_inval);
+
+ ceph_decode_32_safe(p, end, map->flags, e_inval);
+
+ /* max_osd */
+ ceph_decode_32_safe(p, end, max, e_inval);
+
+ /* (re)alloc osd arrays */
+ err = osdmap_set_max_osd(map, max);
+ if (err)
+ goto bad;
+
+ /* osd_state, osd_weight, osd_addrs->client_addr */
+ ceph_decode_need(p, end, 3*sizeof(u32) +
+ map->max_osd*(struct_v >= 5 ? sizeof(u32) :
+ sizeof(u8)) +
+ sizeof(*map->osd_weight), e_inval);
+ if (ceph_decode_32(p) != map->max_osd)
+ goto e_inval;
+
+ if (struct_v >= 5) {
+ for (i = 0; i < map->max_osd; i++)
+ map->osd_state[i] = ceph_decode_32(p);
+ } else {
+ for (i = 0; i < map->max_osd; i++)
+ map->osd_state[i] = ceph_decode_8(p);
+ }
+
+ if (ceph_decode_32(p) != map->max_osd)
+ goto e_inval;
+
+ for (i = 0; i < map->max_osd; i++)
+ map->osd_weight[i] = ceph_decode_32(p);
+
+ if (ceph_decode_32(p) != map->max_osd)
+ goto e_inval;
+
+ for (i = 0; i < map->max_osd; i++) {
+ struct ceph_entity_addr *addr = &map->osd_addr[i];
+
+ if (struct_v >= 8)
+ err = ceph_decode_entity_addrvec(p, end, msgr2, addr);
+ else
+ err = ceph_decode_entity_addr(p, end, addr);
+ if (err)
+ goto bad;
+
+ dout("%s osd%d addr %s\n", __func__, i, ceph_pr_addr(addr));
+ }
+
+ /* pg_temp */
+ err = decode_pg_temp(p, end, map);
+ if (err)
+ goto bad;
+
+ /* primary_temp */
+ if (struct_v >= 1) {
+ err = decode_primary_temp(p, end, map);
+ if (err)
+ goto bad;
+ }
+
+ /* primary_affinity */
+ if (struct_v >= 2) {
+ err = decode_primary_affinity(p, end, map);
+ if (err)
+ goto bad;
+ } else {
+ WARN_ON(map->osd_primary_affinity);
+ }
+
+ /* crush */
+ ceph_decode_32_safe(p, end, len, e_inval);
+ err = osdmap_set_crush(map, crush_decode(*p, min(*p + len, end)));
+ if (err)
+ goto bad;
+
+ *p += len;
+ if (struct_v >= 3) {
+ /* erasure_code_profiles */
+ ceph_decode_skip_map_of_map(p, end, string, string, string,
+ e_inval);
+ }
+
+ if (struct_v >= 4) {
+ err = decode_pg_upmap(p, end, map);
+ if (err)
+ goto bad;
+
+ err = decode_pg_upmap_items(p, end, map);
+ if (err)
+ goto bad;
+ } else {
+ WARN_ON(!RB_EMPTY_ROOT(&map->pg_upmap));
+ WARN_ON(!RB_EMPTY_ROOT(&map->pg_upmap_items));
+ }
+
+ /* ignore the rest */
+ *p = end;
+
+ dout("full osdmap epoch %d max_osd %d\n", map->epoch, map->max_osd);
+ return 0;
+
+e_inval:
+ err = -EINVAL;
+bad:
+ pr_err("corrupt full osdmap (%d) epoch %d off %d (%p of %p-%p)\n",
+ err, epoch, (int)(*p - start), *p, start, end);
+ print_hex_dump(KERN_DEBUG, "osdmap: ",
+ DUMP_PREFIX_OFFSET, 16, 1,
+ start, end - start, true);
+ return err;
+}
+
+/*
+ * Allocate and decode a full map.
+ */
+struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end, bool msgr2)
+{
+ struct ceph_osdmap *map;
+ int ret;
+
+ map = ceph_osdmap_alloc();
+ if (!map)
+ return ERR_PTR(-ENOMEM);
+
+ ret = osdmap_decode(p, end, msgr2, map);
+ if (ret) {
+ ceph_osdmap_destroy(map);
+ return ERR_PTR(ret);
+ }
+
+ return map;
+}
+
+/*
+ * Encoding order is (new_up_client, new_state, new_weight). Need to
+ * apply in the (new_weight, new_state, new_up_client) order, because
+ * an incremental map may look like e.g.
+ *
+ * new_up_client: { osd=6, addr=... } # set osd_state and addr
+ * new_state: { osd=6, xorstate=EXISTS } # clear osd_state
+ */
+static int decode_new_up_state_weight(void **p, void *end, u8 struct_v,
+ bool msgr2, struct ceph_osdmap *map)
+{
+ void *new_up_client;
+ void *new_state;
+ void *new_weight_end;
+ u32 len;
+ int ret;
+ int i;
+
+ new_up_client = *p;
+ ceph_decode_32_safe(p, end, len, e_inval);
+ for (i = 0; i < len; ++i) {
+ struct ceph_entity_addr addr;
+
+ ceph_decode_skip_32(p, end, e_inval);
+ if (struct_v >= 7)
+ ret = ceph_decode_entity_addrvec(p, end, msgr2, &addr);
+ else
+ ret = ceph_decode_entity_addr(p, end, &addr);
+ if (ret)
+ return ret;
+ }
+
+ new_state = *p;
+ ceph_decode_32_safe(p, end, len, e_inval);
+ len *= sizeof(u32) + (struct_v >= 5 ? sizeof(u32) : sizeof(u8));
+ ceph_decode_need(p, end, len, e_inval);
+ *p += len;
+
+ /* new_weight */
+ ceph_decode_32_safe(p, end, len, e_inval);
+ while (len--) {
+ s32 osd;
+ u32 w;
+
+ ceph_decode_need(p, end, 2*sizeof(u32), e_inval);
+ osd = ceph_decode_32(p);
+ w = ceph_decode_32(p);
+ if (osd >= map->max_osd)
+ goto e_inval;
+
+ osdmap_info(map, "osd%d weight 0x%x %s\n", osd, w,
+ w == CEPH_OSD_IN ? "(in)" :
+ (w == CEPH_OSD_OUT ? "(out)" : ""));
+ map->osd_weight[osd] = w;
+
+ /*
+ * If we are marking in, set the EXISTS, and clear the
+ * AUTOOUT and NEW bits.
+ */
+ if (w) {
+ map->osd_state[osd] |= CEPH_OSD_EXISTS;
+ map->osd_state[osd] &= ~(CEPH_OSD_AUTOOUT |
+ CEPH_OSD_NEW);
+ }
+ }
+ new_weight_end = *p;
+
+ /* new_state (up/down) */
+ *p = new_state;
+ len = ceph_decode_32(p);
+ while (len--) {
+ s32 osd;
+ u32 xorstate;
+
+ osd = ceph_decode_32(p);
+ if (osd >= map->max_osd)
+ goto e_inval;
+
+ if (struct_v >= 5)
+ xorstate = ceph_decode_32(p);
+ else
+ xorstate = ceph_decode_8(p);
+ if (xorstate == 0)
+ xorstate = CEPH_OSD_UP;
+ if ((map->osd_state[osd] & CEPH_OSD_UP) &&
+ (xorstate & CEPH_OSD_UP))
+ osdmap_info(map, "osd%d down\n", osd);
+ if ((map->osd_state[osd] & CEPH_OSD_EXISTS) &&
+ (xorstate & CEPH_OSD_EXISTS)) {
+ osdmap_info(map, "osd%d does not exist\n", osd);
+ ret = set_primary_affinity(map, osd,
+ CEPH_OSD_DEFAULT_PRIMARY_AFFINITY);
+ if (ret)
+ return ret;
+ memset(map->osd_addr + osd, 0, sizeof(*map->osd_addr));
+ map->osd_state[osd] = 0;
+ } else {
+ map->osd_state[osd] ^= xorstate;
+ }
+ }
+
+ /* new_up_client */
+ *p = new_up_client;
+ len = ceph_decode_32(p);
+ while (len--) {
+ s32 osd;
+ struct ceph_entity_addr addr;
+
+ osd = ceph_decode_32(p);
+ if (osd >= map->max_osd)
+ goto e_inval;
+
+ if (struct_v >= 7)
+ ret = ceph_decode_entity_addrvec(p, end, msgr2, &addr);
+ else
+ ret = ceph_decode_entity_addr(p, end, &addr);
+ if (ret)
+ return ret;
+
+ dout("%s osd%d addr %s\n", __func__, osd, ceph_pr_addr(&addr));
+
+ osdmap_info(map, "osd%d up\n", osd);
+ map->osd_state[osd] |= CEPH_OSD_EXISTS | CEPH_OSD_UP;
+ map->osd_addr[osd] = addr;
+ }
+
+ *p = new_weight_end;
+ return 0;
+
+e_inval:
+ return -EINVAL;
+}
+
+/*
+ * decode and apply an incremental map update.
+ */
+struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, bool msgr2,
+ struct ceph_osdmap *map)
+{
+ struct ceph_fsid fsid;
+ u32 epoch = 0;
+ struct ceph_timespec modified;
+ s32 len;
+ u64 pool;
+ __s64 new_pool_max;
+ __s32 new_flags, max;
+ void *start = *p;
+ int err;
+ u8 struct_v;
+
+ dout("%s %p to %p len %d\n", __func__, *p, end, (int)(end - *p));
+
+ err = get_osdmap_client_data_v(p, end, "inc", &struct_v);
+ if (err)
+ goto bad;
+
+ /* fsid, epoch, modified, new_pool_max, new_flags */
+ ceph_decode_need(p, end, sizeof(fsid) + sizeof(u32) + sizeof(modified) +
+ sizeof(u64) + sizeof(u32), e_inval);
+ ceph_decode_copy(p, &fsid, sizeof(fsid));
+ epoch = ceph_decode_32(p);
+ BUG_ON(epoch != map->epoch+1);
+ ceph_decode_copy(p, &modified, sizeof(modified));
+ new_pool_max = ceph_decode_64(p);
+ new_flags = ceph_decode_32(p);
+
+ /* full map? */
+ ceph_decode_32_safe(p, end, len, e_inval);
+ if (len > 0) {
+ dout("apply_incremental full map len %d, %p to %p\n",
+ len, *p, end);
+ return ceph_osdmap_decode(p, min(*p+len, end), msgr2);
+ }
+
+ /* new crush? */
+ ceph_decode_32_safe(p, end, len, e_inval);
+ if (len > 0) {
+ err = osdmap_set_crush(map,
+ crush_decode(*p, min(*p + len, end)));
+ if (err)
+ goto bad;
+ *p += len;
+ }
+
+ /* new flags? */
+ if (new_flags >= 0)
+ map->flags = new_flags;
+ if (new_pool_max >= 0)
+ map->pool_max = new_pool_max;
+
+ /* new max? */
+ ceph_decode_32_safe(p, end, max, e_inval);
+ if (max >= 0) {
+ err = osdmap_set_max_osd(map, max);
+ if (err)
+ goto bad;
+ }
+
+ map->epoch++;
+ map->modified = modified;
+
+ /* new_pools */
+ err = decode_new_pools(p, end, map);
+ if (err)
+ goto bad;
+
+ /* new_pool_names */
+ err = decode_pool_names(p, end, map);
+ if (err)
+ goto bad;
+
+ /* old_pool */
+ ceph_decode_32_safe(p, end, len, e_inval);
+ while (len--) {
+ struct ceph_pg_pool_info *pi;
+
+ ceph_decode_64_safe(p, end, pool, e_inval);
+ pi = lookup_pg_pool(&map->pg_pools, pool);
+ if (pi)
+ __remove_pg_pool(&map->pg_pools, pi);
+ }
+
+ /* new_up_client, new_state, new_weight */
+ err = decode_new_up_state_weight(p, end, struct_v, msgr2, map);
+ if (err)
+ goto bad;
+
+ /* new_pg_temp */
+ err = decode_new_pg_temp(p, end, map);
+ if (err)
+ goto bad;
+
+ /* new_primary_temp */
+ if (struct_v >= 1) {
+ err = decode_new_primary_temp(p, end, map);
+ if (err)
+ goto bad;
+ }
+
+ /* new_primary_affinity */
+ if (struct_v >= 2) {
+ err = decode_new_primary_affinity(p, end, map);
+ if (err)
+ goto bad;
+ }
+
+ if (struct_v >= 3) {
+ /* new_erasure_code_profiles */
+ ceph_decode_skip_map_of_map(p, end, string, string, string,
+ e_inval);
+ /* old_erasure_code_profiles */
+ ceph_decode_skip_set(p, end, string, e_inval);
+ }
+
+ if (struct_v >= 4) {
+ err = decode_new_pg_upmap(p, end, map);
+ if (err)
+ goto bad;
+
+ err = decode_old_pg_upmap(p, end, map);
+ if (err)
+ goto bad;
+
+ err = decode_new_pg_upmap_items(p, end, map);
+ if (err)
+ goto bad;
+
+ err = decode_old_pg_upmap_items(p, end, map);
+ if (err)
+ goto bad;
+ }
+
+ /* ignore the rest */
+ *p = end;
+
+ dout("inc osdmap epoch %d max_osd %d\n", map->epoch, map->max_osd);
+ return map;
+
+e_inval:
+ err = -EINVAL;
+bad:
+ pr_err("corrupt inc osdmap (%d) epoch %d off %d (%p of %p-%p)\n",
+ err, epoch, (int)(*p - start), *p, start, end);
+ print_hex_dump(KERN_DEBUG, "osdmap: ",
+ DUMP_PREFIX_OFFSET, 16, 1,
+ start, end - start, true);
+ return ERR_PTR(err);
+}
+
+void ceph_oloc_copy(struct ceph_object_locator *dest,
+ const struct ceph_object_locator *src)
+{
+ ceph_oloc_destroy(dest);
+
+ dest->pool = src->pool;
+ if (src->pool_ns)
+ dest->pool_ns = ceph_get_string(src->pool_ns);
+ else
+ dest->pool_ns = NULL;
+}
+EXPORT_SYMBOL(ceph_oloc_copy);
+
+void ceph_oloc_destroy(struct ceph_object_locator *oloc)
+{
+ ceph_put_string(oloc->pool_ns);
+}
+EXPORT_SYMBOL(ceph_oloc_destroy);
+
+void ceph_oid_copy(struct ceph_object_id *dest,
+ const struct ceph_object_id *src)
+{
+ ceph_oid_destroy(dest);
+
+ if (src->name != src->inline_name) {
+ /* very rare, see ceph_object_id definition */
+ dest->name = kmalloc(src->name_len + 1,
+ GFP_NOIO | __GFP_NOFAIL);
+ } else {
+ dest->name = dest->inline_name;
+ }
+ memcpy(dest->name, src->name, src->name_len + 1);
+ dest->name_len = src->name_len;
+}
+EXPORT_SYMBOL(ceph_oid_copy);
+
+static __printf(2, 0)
+int oid_printf_vargs(struct ceph_object_id *oid, const char *fmt, va_list ap)
+{
+ int len;
+
+ WARN_ON(!ceph_oid_empty(oid));
+
+ len = vsnprintf(oid->inline_name, sizeof(oid->inline_name), fmt, ap);
+ if (len >= sizeof(oid->inline_name))
+ return len;
+
+ oid->name_len = len;
+ return 0;
+}
+
+/*
+ * If oid doesn't fit into inline buffer, BUG.
+ */
+void ceph_oid_printf(struct ceph_object_id *oid, const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ BUG_ON(oid_printf_vargs(oid, fmt, ap));
+ va_end(ap);
+}
+EXPORT_SYMBOL(ceph_oid_printf);
+
+static __printf(3, 0)
+int oid_aprintf_vargs(struct ceph_object_id *oid, gfp_t gfp,
+ const char *fmt, va_list ap)
+{
+ va_list aq;
+ int len;
+
+ va_copy(aq, ap);
+ len = oid_printf_vargs(oid, fmt, aq);
+ va_end(aq);
+
+ if (len) {
+ char *external_name;
+
+ external_name = kmalloc(len + 1, gfp);
+ if (!external_name)
+ return -ENOMEM;
+
+ oid->name = external_name;
+ WARN_ON(vsnprintf(oid->name, len + 1, fmt, ap) != len);
+ oid->name_len = len;
+ }
+
+ return 0;
+}
+
+/*
+ * If oid doesn't fit into inline buffer, allocate.
+ */
+int ceph_oid_aprintf(struct ceph_object_id *oid, gfp_t gfp,
+ const char *fmt, ...)
+{
+ va_list ap;
+ int ret;
+
+ va_start(ap, fmt);
+ ret = oid_aprintf_vargs(oid, gfp, fmt, ap);
+ va_end(ap);
+
+ return ret;
+}
+EXPORT_SYMBOL(ceph_oid_aprintf);
+
+void ceph_oid_destroy(struct ceph_object_id *oid)
+{
+ if (oid->name != oid->inline_name)
+ kfree(oid->name);
+}
+EXPORT_SYMBOL(ceph_oid_destroy);
+
+/*
+ * osds only
+ */
+static bool __osds_equal(const struct ceph_osds *lhs,
+ const struct ceph_osds *rhs)
+{
+ if (lhs->size == rhs->size &&
+ !memcmp(lhs->osds, rhs->osds, rhs->size * sizeof(rhs->osds[0])))
+ return true;
+
+ return false;
+}
+
+/*
+ * osds + primary
+ */
+static bool osds_equal(const struct ceph_osds *lhs,
+ const struct ceph_osds *rhs)
+{
+ if (__osds_equal(lhs, rhs) &&
+ lhs->primary == rhs->primary)
+ return true;
+
+ return false;
+}
+
+static bool osds_valid(const struct ceph_osds *set)
+{
+ /* non-empty set */
+ if (set->size > 0 && set->primary >= 0)
+ return true;
+
+ /* empty can_shift_osds set */
+ if (!set->size && set->primary == -1)
+ return true;
+
+ /* empty !can_shift_osds set - all NONE */
+ if (set->size > 0 && set->primary == -1) {
+ int i;
+
+ for (i = 0; i < set->size; i++) {
+ if (set->osds[i] != CRUSH_ITEM_NONE)
+ break;
+ }
+ if (i == set->size)
+ return true;
+ }
+
+ return false;
+}
+
+void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src)
+{
+ memcpy(dest->osds, src->osds, src->size * sizeof(src->osds[0]));
+ dest->size = src->size;
+ dest->primary = src->primary;
+}
+
+bool ceph_pg_is_split(const struct ceph_pg *pgid, u32 old_pg_num,
+ u32 new_pg_num)
+{
+ int old_bits = calc_bits_of(old_pg_num);
+ int old_mask = (1 << old_bits) - 1;
+ int n;
+
+ WARN_ON(pgid->seed >= old_pg_num);
+ if (new_pg_num <= old_pg_num)
+ return false;
+
+ for (n = 1; ; n++) {
+ int next_bit = n << (old_bits - 1);
+ u32 s = next_bit | pgid->seed;
+
+ if (s < old_pg_num || s == pgid->seed)
+ continue;
+ if (s >= new_pg_num)
+ break;
+
+ s = ceph_stable_mod(s, old_pg_num, old_mask);
+ if (s == pgid->seed)
+ return true;
+ }
+
+ return false;
+}
+
+bool ceph_is_new_interval(const struct ceph_osds *old_acting,
+ const struct ceph_osds *new_acting,
+ const struct ceph_osds *old_up,
+ const struct ceph_osds *new_up,
+ int old_size,
+ int new_size,
+ int old_min_size,
+ int new_min_size,
+ u32 old_pg_num,
+ u32 new_pg_num,
+ bool old_sort_bitwise,
+ bool new_sort_bitwise,
+ bool old_recovery_deletes,
+ bool new_recovery_deletes,
+ const struct ceph_pg *pgid)
+{
+ return !osds_equal(old_acting, new_acting) ||
+ !osds_equal(old_up, new_up) ||
+ old_size != new_size ||
+ old_min_size != new_min_size ||
+ ceph_pg_is_split(pgid, old_pg_num, new_pg_num) ||
+ old_sort_bitwise != new_sort_bitwise ||
+ old_recovery_deletes != new_recovery_deletes;
+}
+
+static int calc_pg_rank(int osd, const struct ceph_osds *acting)
+{
+ int i;
+
+ for (i = 0; i < acting->size; i++) {
+ if (acting->osds[i] == osd)
+ return i;
+ }
+
+ return -1;
+}
+
+static bool primary_changed(const struct ceph_osds *old_acting,
+ const struct ceph_osds *new_acting)
+{
+ if (!old_acting->size && !new_acting->size)
+ return false; /* both still empty */
+
+ if (!old_acting->size ^ !new_acting->size)
+ return true; /* was empty, now not, or vice versa */
+
+ if (old_acting->primary != new_acting->primary)
+ return true; /* primary changed */
+
+ if (calc_pg_rank(old_acting->primary, old_acting) !=
+ calc_pg_rank(new_acting->primary, new_acting))
+ return true;
+
+ return false; /* same primary (tho replicas may have changed) */
+}
+
+bool ceph_osds_changed(const struct ceph_osds *old_acting,
+ const struct ceph_osds *new_acting,
+ bool any_change)
+{
+ if (primary_changed(old_acting, new_acting))
+ return true;
+
+ if (any_change && !__osds_equal(old_acting, new_acting))
+ return true;
+
+ return false;
+}
+
+/*
+ * Map an object into a PG.
+ *
+ * Should only be called with target_oid and target_oloc (as opposed to
+ * base_oid and base_oloc), since tiering isn't taken into account.
+ */
+void __ceph_object_locator_to_pg(struct ceph_pg_pool_info *pi,
+ const struct ceph_object_id *oid,
+ const struct ceph_object_locator *oloc,
+ struct ceph_pg *raw_pgid)
+{
+ WARN_ON(pi->id != oloc->pool);
+
+ if (!oloc->pool_ns) {
+ raw_pgid->pool = oloc->pool;
+ raw_pgid->seed = ceph_str_hash(pi->object_hash, oid->name,
+ oid->name_len);
+ dout("%s %s -> raw_pgid %llu.%x\n", __func__, oid->name,
+ raw_pgid->pool, raw_pgid->seed);
+ } else {
+ char stack_buf[256];
+ char *buf = stack_buf;
+ int nsl = oloc->pool_ns->len;
+ size_t total = nsl + 1 + oid->name_len;
+
+ if (total > sizeof(stack_buf))
+ buf = kmalloc(total, GFP_NOIO | __GFP_NOFAIL);
+ memcpy(buf, oloc->pool_ns->str, nsl);
+ buf[nsl] = '\037';
+ memcpy(buf + nsl + 1, oid->name, oid->name_len);
+ raw_pgid->pool = oloc->pool;
+ raw_pgid->seed = ceph_str_hash(pi->object_hash, buf, total);
+ if (buf != stack_buf)
+ kfree(buf);
+ dout("%s %s ns %.*s -> raw_pgid %llu.%x\n", __func__,
+ oid->name, nsl, oloc->pool_ns->str,
+ raw_pgid->pool, raw_pgid->seed);
+ }
+}
+
+int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap,
+ const struct ceph_object_id *oid,
+ const struct ceph_object_locator *oloc,
+ struct ceph_pg *raw_pgid)
+{
+ struct ceph_pg_pool_info *pi;
+
+ pi = ceph_pg_pool_by_id(osdmap, oloc->pool);
+ if (!pi)
+ return -ENOENT;
+
+ __ceph_object_locator_to_pg(pi, oid, oloc, raw_pgid);
+ return 0;
+}
+EXPORT_SYMBOL(ceph_object_locator_to_pg);
+
+/*
+ * Map a raw PG (full precision ps) into an actual PG.
+ */
+static void raw_pg_to_pg(struct ceph_pg_pool_info *pi,
+ const struct ceph_pg *raw_pgid,
+ struct ceph_pg *pgid)
+{
+ pgid->pool = raw_pgid->pool;
+ pgid->seed = ceph_stable_mod(raw_pgid->seed, pi->pg_num,
+ pi->pg_num_mask);
+}
+
+/*
+ * Map a raw PG (full precision ps) into a placement ps (placement
+ * seed). Include pool id in that value so that different pools don't
+ * use the same seeds.
+ */
+static u32 raw_pg_to_pps(struct ceph_pg_pool_info *pi,
+ const struct ceph_pg *raw_pgid)
+{
+ if (pi->flags & CEPH_POOL_FLAG_HASHPSPOOL) {
+ /* hash pool id and seed so that pool PGs do not overlap */
+ return crush_hash32_2(CRUSH_HASH_RJENKINS1,
+ ceph_stable_mod(raw_pgid->seed,
+ pi->pgp_num,
+ pi->pgp_num_mask),
+ raw_pgid->pool);
+ } else {
+ /*
+ * legacy behavior: add ps and pool together. this is
+ * not a great approach because the PGs from each pool
+ * will overlap on top of each other: 0.5 == 1.4 ==
+ * 2.3 == ...
+ */
+ return ceph_stable_mod(raw_pgid->seed, pi->pgp_num,
+ pi->pgp_num_mask) +
+ (unsigned)raw_pgid->pool;
+ }
+}
+
+/*
+ * Magic value used for a "default" fallback choose_args, used if the
+ * crush_choose_arg_map passed to do_crush() does not exist. If this
+ * also doesn't exist, fall back to canonical weights.
+ */
+#define CEPH_DEFAULT_CHOOSE_ARGS -1
+
+static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
+ int *result, int result_max,
+ const __u32 *weight, int weight_max,
+ s64 choose_args_index)
+{
+ struct crush_choose_arg_map *arg_map;
+ struct crush_work *work;
+ int r;
+
+ BUG_ON(result_max > CEPH_PG_MAX_SIZE);
+
+ arg_map = lookup_choose_arg_map(&map->crush->choose_args,
+ choose_args_index);
+ if (!arg_map)
+ arg_map = lookup_choose_arg_map(&map->crush->choose_args,
+ CEPH_DEFAULT_CHOOSE_ARGS);
+
+ work = get_workspace(&map->crush_wsm, map->crush);
+ r = crush_do_rule(map->crush, ruleno, x, result, result_max,
+ weight, weight_max, work,
+ arg_map ? arg_map->args : NULL);
+ put_workspace(&map->crush_wsm, work);
+ return r;
+}
+
+static void remove_nonexistent_osds(struct ceph_osdmap *osdmap,
+ struct ceph_pg_pool_info *pi,
+ struct ceph_osds *set)
+{
+ int i;
+
+ if (ceph_can_shift_osds(pi)) {
+ int removed = 0;
+
+ /* shift left */
+ for (i = 0; i < set->size; i++) {
+ if (!ceph_osd_exists(osdmap, set->osds[i])) {
+ removed++;
+ continue;
+ }
+ if (removed)
+ set->osds[i - removed] = set->osds[i];
+ }
+ set->size -= removed;
+ } else {
+ /* set dne devices to NONE */
+ for (i = 0; i < set->size; i++) {
+ if (!ceph_osd_exists(osdmap, set->osds[i]))
+ set->osds[i] = CRUSH_ITEM_NONE;
+ }
+ }
+}
+
+/*
+ * Calculate raw set (CRUSH output) for given PG and filter out
+ * nonexistent OSDs. ->primary is undefined for a raw set.
+ *
+ * Placement seed (CRUSH input) is returned through @ppps.
+ */
+static void pg_to_raw_osds(struct ceph_osdmap *osdmap,
+ struct ceph_pg_pool_info *pi,
+ const struct ceph_pg *raw_pgid,
+ struct ceph_osds *raw,
+ u32 *ppps)
+{
+ u32 pps = raw_pg_to_pps(pi, raw_pgid);
+ int ruleno;
+ int len;
+
+ ceph_osds_init(raw);
+ if (ppps)
+ *ppps = pps;
+
+ ruleno = crush_find_rule(osdmap->crush, pi->crush_ruleset, pi->type,
+ pi->size);
+ if (ruleno < 0) {
+ pr_err("no crush rule: pool %lld ruleset %d type %d size %d\n",
+ pi->id, pi->crush_ruleset, pi->type, pi->size);
+ return;
+ }
+
+ if (pi->size > ARRAY_SIZE(raw->osds)) {
+ pr_err_ratelimited("pool %lld ruleset %d type %d too wide: size %d > %zu\n",
+ pi->id, pi->crush_ruleset, pi->type, pi->size,
+ ARRAY_SIZE(raw->osds));
+ return;
+ }
+
+ len = do_crush(osdmap, ruleno, pps, raw->osds, pi->size,
+ osdmap->osd_weight, osdmap->max_osd, pi->id);
+ if (len < 0) {
+ pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n",
+ len, ruleno, pi->id, pi->crush_ruleset, pi->type,
+ pi->size);
+ return;
+ }
+
+ raw->size = len;
+ remove_nonexistent_osds(osdmap, pi, raw);
+}
+
+/* apply pg_upmap[_items] mappings */
+static void apply_upmap(struct ceph_osdmap *osdmap,
+ const struct ceph_pg *pgid,
+ struct ceph_osds *raw)
+{
+ struct ceph_pg_mapping *pg;
+ int i, j;
+
+ pg = lookup_pg_mapping(&osdmap->pg_upmap, pgid);
+ if (pg) {
+ /* make sure targets aren't marked out */
+ for (i = 0; i < pg->pg_upmap.len; i++) {
+ int osd = pg->pg_upmap.osds[i];
+
+ if (osd != CRUSH_ITEM_NONE &&
+ osd < osdmap->max_osd &&
+ osdmap->osd_weight[osd] == 0) {
+ /* reject/ignore explicit mapping */
+ return;
+ }
+ }
+ for (i = 0; i < pg->pg_upmap.len; i++)
+ raw->osds[i] = pg->pg_upmap.osds[i];
+ raw->size = pg->pg_upmap.len;
+ /* check and apply pg_upmap_items, if any */
+ }
+
+ pg = lookup_pg_mapping(&osdmap->pg_upmap_items, pgid);
+ if (pg) {
+ /*
+ * Note: this approach does not allow a bidirectional swap,
+ * e.g., [[1,2],[2,1]] applied to [0,1,2] -> [0,2,1].
+ */
+ for (i = 0; i < pg->pg_upmap_items.len; i++) {
+ int from = pg->pg_upmap_items.from_to[i][0];
+ int to = pg->pg_upmap_items.from_to[i][1];
+ int pos = -1;
+ bool exists = false;
+
+ /* make sure replacement doesn't already appear */
+ for (j = 0; j < raw->size; j++) {
+ int osd = raw->osds[j];
+
+ if (osd == to) {
+ exists = true;
+ break;
+ }
+ /* ignore mapping if target is marked out */
+ if (osd == from && pos < 0 &&
+ !(to != CRUSH_ITEM_NONE &&
+ to < osdmap->max_osd &&
+ osdmap->osd_weight[to] == 0)) {
+ pos = j;
+ }
+ }
+ if (!exists && pos >= 0)
+ raw->osds[pos] = to;
+ }
+ }
+}
+
+/*
+ * Given raw set, calculate up set and up primary. By definition of an
+ * up set, the result won't contain nonexistent or down OSDs.
+ *
+ * This is done in-place - on return @set is the up set. If it's
+ * empty, ->primary will remain undefined.
+ */
+static void raw_to_up_osds(struct ceph_osdmap *osdmap,
+ struct ceph_pg_pool_info *pi,
+ struct ceph_osds *set)
+{
+ int i;
+
+ /* ->primary is undefined for a raw set */
+ BUG_ON(set->primary != -1);
+
+ if (ceph_can_shift_osds(pi)) {
+ int removed = 0;
+
+ /* shift left */
+ for (i = 0; i < set->size; i++) {
+ if (ceph_osd_is_down(osdmap, set->osds[i])) {
+ removed++;
+ continue;
+ }
+ if (removed)
+ set->osds[i - removed] = set->osds[i];
+ }
+ set->size -= removed;
+ if (set->size > 0)
+ set->primary = set->osds[0];
+ } else {
+ /* set down/dne devices to NONE */
+ for (i = set->size - 1; i >= 0; i--) {
+ if (ceph_osd_is_down(osdmap, set->osds[i]))
+ set->osds[i] = CRUSH_ITEM_NONE;
+ else
+ set->primary = set->osds[i];
+ }
+ }
+}
+
+static void apply_primary_affinity(struct ceph_osdmap *osdmap,
+ struct ceph_pg_pool_info *pi,
+ u32 pps,
+ struct ceph_osds *up)
+{
+ int i;
+ int pos = -1;
+
+ /*
+ * Do we have any non-default primary_affinity values for these
+ * osds?
+ */
+ if (!osdmap->osd_primary_affinity)
+ return;
+
+ for (i = 0; i < up->size; i++) {
+ int osd = up->osds[i];
+
+ if (osd != CRUSH_ITEM_NONE &&
+ osdmap->osd_primary_affinity[osd] !=
+ CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) {
+ break;
+ }
+ }
+ if (i == up->size)
+ return;
+
+ /*
+ * Pick the primary. Feed both the seed (for the pg) and the
+ * osd into the hash/rng so that a proportional fraction of an
+ * osd's pgs get rejected as primary.
+ */
+ for (i = 0; i < up->size; i++) {
+ int osd = up->osds[i];
+ u32 aff;
+
+ if (osd == CRUSH_ITEM_NONE)
+ continue;
+
+ aff = osdmap->osd_primary_affinity[osd];
+ if (aff < CEPH_OSD_MAX_PRIMARY_AFFINITY &&
+ (crush_hash32_2(CRUSH_HASH_RJENKINS1,
+ pps, osd) >> 16) >= aff) {
+ /*
+ * We chose not to use this primary. Note it
+ * anyway as a fallback in case we don't pick
+ * anyone else, but keep looking.
+ */
+ if (pos < 0)
+ pos = i;
+ } else {
+ pos = i;
+ break;
+ }
+ }
+ if (pos < 0)
+ return;
+
+ up->primary = up->osds[pos];
+
+ if (ceph_can_shift_osds(pi) && pos > 0) {
+ /* move the new primary to the front */
+ for (i = pos; i > 0; i--)
+ up->osds[i] = up->osds[i - 1];
+ up->osds[0] = up->primary;
+ }
+}
+
+/*
+ * Get pg_temp and primary_temp mappings for given PG.
+ *
+ * Note that a PG may have none, only pg_temp, only primary_temp or
+ * both pg_temp and primary_temp mappings. This means @temp isn't
+ * always a valid OSD set on return: in the "only primary_temp" case,
+ * @temp will have its ->primary >= 0 but ->size == 0.
+ */
+static void get_temp_osds(struct ceph_osdmap *osdmap,
+ struct ceph_pg_pool_info *pi,
+ const struct ceph_pg *pgid,
+ struct ceph_osds *temp)
+{
+ struct ceph_pg_mapping *pg;
+ int i;
+
+ ceph_osds_init(temp);
+
+ /* pg_temp? */
+ pg = lookup_pg_mapping(&osdmap->pg_temp, pgid);
+ if (pg) {
+ for (i = 0; i < pg->pg_temp.len; i++) {
+ if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) {
+ if (ceph_can_shift_osds(pi))
+ continue;
+
+ temp->osds[temp->size++] = CRUSH_ITEM_NONE;
+ } else {
+ temp->osds[temp->size++] = pg->pg_temp.osds[i];
+ }
+ }
+
+ /* apply pg_temp's primary */
+ for (i = 0; i < temp->size; i++) {
+ if (temp->osds[i] != CRUSH_ITEM_NONE) {
+ temp->primary = temp->osds[i];
+ break;
+ }
+ }
+ }
+
+ /* primary_temp? */
+ pg = lookup_pg_mapping(&osdmap->primary_temp, pgid);
+ if (pg)
+ temp->primary = pg->primary_temp.osd;
+}
+
+/*
+ * Map a PG to its acting set as well as its up set.
+ *
+ * Acting set is used for data mapping purposes, while up set can be
+ * recorded for detecting interval changes and deciding whether to
+ * resend a request.
+ */
+void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap,
+ struct ceph_pg_pool_info *pi,
+ const struct ceph_pg *raw_pgid,
+ struct ceph_osds *up,
+ struct ceph_osds *acting)
+{
+ struct ceph_pg pgid;
+ u32 pps;
+
+ WARN_ON(pi->id != raw_pgid->pool);
+ raw_pg_to_pg(pi, raw_pgid, &pgid);
+
+ pg_to_raw_osds(osdmap, pi, raw_pgid, up, &pps);
+ apply_upmap(osdmap, &pgid, up);
+ raw_to_up_osds(osdmap, pi, up);
+ apply_primary_affinity(osdmap, pi, pps, up);
+ get_temp_osds(osdmap, pi, &pgid, acting);
+ if (!acting->size) {
+ memcpy(acting->osds, up->osds, up->size * sizeof(up->osds[0]));
+ acting->size = up->size;
+ if (acting->primary == -1)
+ acting->primary = up->primary;
+ }
+ WARN_ON(!osds_valid(up) || !osds_valid(acting));
+}
+
+bool ceph_pg_to_primary_shard(struct ceph_osdmap *osdmap,
+ struct ceph_pg_pool_info *pi,
+ const struct ceph_pg *raw_pgid,
+ struct ceph_spg *spgid)
+{
+ struct ceph_pg pgid;
+ struct ceph_osds up, acting;
+ int i;
+
+ WARN_ON(pi->id != raw_pgid->pool);
+ raw_pg_to_pg(pi, raw_pgid, &pgid);
+
+ if (ceph_can_shift_osds(pi)) {
+ spgid->pgid = pgid; /* struct */
+ spgid->shard = CEPH_SPG_NOSHARD;
+ return true;
+ }
+
+ ceph_pg_to_up_acting_osds(osdmap, pi, &pgid, &up, &acting);
+ for (i = 0; i < acting.size; i++) {
+ if (acting.osds[i] == acting.primary) {
+ spgid->pgid = pgid; /* struct */
+ spgid->shard = i;
+ return true;
+ }
+ }
+
+ return false;
+}
+
+/*
+ * Return acting primary for given PG, or -1 if none.
+ */
+int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap,
+ const struct ceph_pg *raw_pgid)
+{
+ struct ceph_pg_pool_info *pi;
+ struct ceph_osds up, acting;
+
+ pi = ceph_pg_pool_by_id(osdmap, raw_pgid->pool);
+ if (!pi)
+ return -1;
+
+ ceph_pg_to_up_acting_osds(osdmap, pi, raw_pgid, &up, &acting);
+ return acting.primary;
+}
+EXPORT_SYMBOL(ceph_pg_to_acting_primary);
+
+static struct crush_loc_node *alloc_crush_loc(size_t type_name_len,
+ size_t name_len)
+{
+ struct crush_loc_node *loc;
+
+ loc = kmalloc(sizeof(*loc) + type_name_len + name_len + 2, GFP_NOIO);
+ if (!loc)
+ return NULL;
+
+ RB_CLEAR_NODE(&loc->cl_node);
+ return loc;
+}
+
+static void free_crush_loc(struct crush_loc_node *loc)
+{
+ WARN_ON(!RB_EMPTY_NODE(&loc->cl_node));
+
+ kfree(loc);
+}
+
+static int crush_loc_compare(const struct crush_loc *loc1,
+ const struct crush_loc *loc2)
+{
+ return strcmp(loc1->cl_type_name, loc2->cl_type_name) ?:
+ strcmp(loc1->cl_name, loc2->cl_name);
+}
+
+DEFINE_RB_FUNCS2(crush_loc, struct crush_loc_node, cl_loc, crush_loc_compare,
+ RB_BYPTR, const struct crush_loc *, cl_node)
+
+/*
+ * Parses a set of <bucket type name>':'<bucket name> pairs separated
+ * by '|', e.g. "rack:foo1|rack:foo2|datacenter:bar".
+ *
+ * Note that @crush_location is modified by strsep().
+ */
+int ceph_parse_crush_location(char *crush_location, struct rb_root *locs)
+{
+ struct crush_loc_node *loc;
+ const char *type_name, *name, *colon;
+ size_t type_name_len, name_len;
+
+ dout("%s '%s'\n", __func__, crush_location);
+ while ((type_name = strsep(&crush_location, "|"))) {
+ colon = strchr(type_name, ':');
+ if (!colon)
+ return -EINVAL;
+
+ type_name_len = colon - type_name;
+ if (type_name_len == 0)
+ return -EINVAL;
+
+ name = colon + 1;
+ name_len = strlen(name);
+ if (name_len == 0)
+ return -EINVAL;
+
+ loc = alloc_crush_loc(type_name_len, name_len);
+ if (!loc)
+ return -ENOMEM;
+
+ loc->cl_loc.cl_type_name = loc->cl_data;
+ memcpy(loc->cl_loc.cl_type_name, type_name, type_name_len);
+ loc->cl_loc.cl_type_name[type_name_len] = '\0';
+
+ loc->cl_loc.cl_name = loc->cl_data + type_name_len + 1;
+ memcpy(loc->cl_loc.cl_name, name, name_len);
+ loc->cl_loc.cl_name[name_len] = '\0';
+
+ if (!__insert_crush_loc(locs, loc)) {
+ free_crush_loc(loc);
+ return -EEXIST;
+ }
+
+ dout("%s type_name '%s' name '%s'\n", __func__,
+ loc->cl_loc.cl_type_name, loc->cl_loc.cl_name);
+ }
+
+ return 0;
+}
+
+int ceph_compare_crush_locs(struct rb_root *locs1, struct rb_root *locs2)
+{
+ struct rb_node *n1 = rb_first(locs1);
+ struct rb_node *n2 = rb_first(locs2);
+ int ret;
+
+ for ( ; n1 && n2; n1 = rb_next(n1), n2 = rb_next(n2)) {
+ struct crush_loc_node *loc1 =
+ rb_entry(n1, struct crush_loc_node, cl_node);
+ struct crush_loc_node *loc2 =
+ rb_entry(n2, struct crush_loc_node, cl_node);
+
+ ret = crush_loc_compare(&loc1->cl_loc, &loc2->cl_loc);
+ if (ret)
+ return ret;
+ }
+
+ if (!n1 && n2)
+ return -1;
+ if (n1 && !n2)
+ return 1;
+ return 0;
+}
+
+void ceph_clear_crush_locs(struct rb_root *locs)
+{
+ while (!RB_EMPTY_ROOT(locs)) {
+ struct crush_loc_node *loc =
+ rb_entry(rb_first(locs), struct crush_loc_node, cl_node);
+
+ erase_crush_loc(locs, loc);
+ free_crush_loc(loc);
+ }
+}
+
+/*
+ * [a-zA-Z0-9-_.]+
+ */
+static bool is_valid_crush_name(const char *name)
+{
+ do {
+ if (!('a' <= *name && *name <= 'z') &&
+ !('A' <= *name && *name <= 'Z') &&
+ !('0' <= *name && *name <= '9') &&
+ *name != '-' && *name != '_' && *name != '.')
+ return false;
+ } while (*++name != '\0');
+
+ return true;
+}
+
+/*
+ * Gets the parent of an item. Returns its id (<0 because the
+ * parent is always a bucket), type id (>0 for the same reason,
+ * via @parent_type_id) and location (via @parent_loc). If no
+ * parent, returns 0.
+ *
+ * Does a linear search, as there are no parent pointers of any
+ * kind. Note that the result is ambiguous for items that occur
+ * multiple times in the map.
+ */
+static int get_immediate_parent(struct crush_map *c, int id,
+ u16 *parent_type_id,
+ struct crush_loc *parent_loc)
+{
+ struct crush_bucket *b;
+ struct crush_name_node *type_cn, *cn;
+ int i, j;
+
+ for (i = 0; i < c->max_buckets; i++) {
+ b = c->buckets[i];
+ if (!b)
+ continue;
+
+ /* ignore per-class shadow hierarchy */
+ cn = lookup_crush_name(&c->names, b->id);
+ if (!cn || !is_valid_crush_name(cn->cn_name))
+ continue;
+
+ for (j = 0; j < b->size; j++) {
+ if (b->items[j] != id)
+ continue;
+
+ *parent_type_id = b->type;
+ type_cn = lookup_crush_name(&c->type_names, b->type);
+ parent_loc->cl_type_name = type_cn->cn_name;
+ parent_loc->cl_name = cn->cn_name;
+ return b->id;
+ }
+ }
+
+ return 0; /* no parent */
+}
+
+/*
+ * Calculates the locality/distance from an item to a client
+ * location expressed in terms of CRUSH hierarchy as a set of
+ * (bucket type name, bucket name) pairs. Specifically, looks
+ * for the lowest-valued bucket type for which the location of
+ * @id matches one of the locations in @locs, so for standard
+ * bucket types (host = 1, rack = 3, datacenter = 8, zone = 9)
+ * a matching host is closer than a matching rack and a matching
+ * data center is closer than a matching zone.
+ *
+ * Specifying multiple locations (a "multipath" location) such
+ * as "rack=foo1 rack=foo2 datacenter=bar" is allowed -- @locs
+ * is a multimap. The locality will be:
+ *
+ * - 3 for OSDs in racks foo1 and foo2
+ * - 8 for OSDs in data center bar
+ * - -1 for all other OSDs
+ *
+ * The lowest possible bucket type is 1, so the best locality
+ * for an OSD is 1 (i.e. a matching host). Locality 0 would be
+ * the OSD itself.
+ */
+int ceph_get_crush_locality(struct ceph_osdmap *osdmap, int id,
+ struct rb_root *locs)
+{
+ struct crush_loc loc;
+ u16 type_id;
+
+ /*
+ * Instead of repeated get_immediate_parent() calls,
+ * the location of @id could be obtained with a single
+ * depth-first traversal.
+ */
+ for (;;) {
+ id = get_immediate_parent(osdmap->crush, id, &type_id, &loc);
+ if (id >= 0)
+ return -1; /* not local */
+
+ if (lookup_crush_loc(locs, &loc))
+ return type_id;
+ }
+}
diff --git a/net/ceph/pagelist.c b/net/ceph/pagelist.c
new file mode 100644
index 000000000000..5a9c4be5f222
--- /dev/null
+++ b/net/ceph/pagelist.c
@@ -0,0 +1,133 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/module.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/ceph/pagelist.h>
+
+struct ceph_pagelist *ceph_pagelist_alloc(gfp_t gfp_flags)
+{
+ struct ceph_pagelist *pl;
+
+ pl = kmalloc(sizeof(*pl), gfp_flags);
+ if (!pl)
+ return NULL;
+
+ INIT_LIST_HEAD(&pl->head);
+ pl->mapped_tail = NULL;
+ pl->length = 0;
+ pl->room = 0;
+ INIT_LIST_HEAD(&pl->free_list);
+ pl->num_pages_free = 0;
+ refcount_set(&pl->refcnt, 1);
+
+ return pl;
+}
+EXPORT_SYMBOL(ceph_pagelist_alloc);
+
+static void ceph_pagelist_unmap_tail(struct ceph_pagelist *pl)
+{
+ if (pl->mapped_tail) {
+ struct page *page = list_entry(pl->head.prev, struct page, lru);
+ kunmap(page);
+ pl->mapped_tail = NULL;
+ }
+}
+
+void ceph_pagelist_release(struct ceph_pagelist *pl)
+{
+ if (!refcount_dec_and_test(&pl->refcnt))
+ return;
+ ceph_pagelist_unmap_tail(pl);
+ while (!list_empty(&pl->head)) {
+ struct page *page = list_first_entry(&pl->head, struct page,
+ lru);
+ list_del(&page->lru);
+ __free_page(page);
+ }
+ ceph_pagelist_free_reserve(pl);
+ kfree(pl);
+}
+EXPORT_SYMBOL(ceph_pagelist_release);
+
+static int ceph_pagelist_addpage(struct ceph_pagelist *pl)
+{
+ struct page *page;
+
+ if (!pl->num_pages_free) {
+ page = __page_cache_alloc(GFP_NOFS);
+ } else {
+ page = list_first_entry(&pl->free_list, struct page, lru);
+ list_del(&page->lru);
+ --pl->num_pages_free;
+ }
+ if (!page)
+ return -ENOMEM;
+ pl->room += PAGE_SIZE;
+ ceph_pagelist_unmap_tail(pl);
+ list_add_tail(&page->lru, &pl->head);
+ pl->mapped_tail = kmap(page);
+ return 0;
+}
+
+int ceph_pagelist_append(struct ceph_pagelist *pl, const void *buf, size_t len)
+{
+ while (pl->room < len) {
+ size_t bit = pl->room;
+ int ret;
+
+ memcpy(pl->mapped_tail + (pl->length & ~PAGE_MASK),
+ buf, bit);
+ pl->length += bit;
+ pl->room -= bit;
+ buf += bit;
+ len -= bit;
+ ret = ceph_pagelist_addpage(pl);
+ if (ret)
+ return ret;
+ }
+
+ memcpy(pl->mapped_tail + (pl->length & ~PAGE_MASK), buf, len);
+ pl->length += len;
+ pl->room -= len;
+ return 0;
+}
+EXPORT_SYMBOL(ceph_pagelist_append);
+
+/* Allocate enough pages for a pagelist to append the given amount
+ * of data without allocating.
+ * Returns: 0 on success, -ENOMEM on error.
+ */
+int ceph_pagelist_reserve(struct ceph_pagelist *pl, size_t space)
+{
+ if (space <= pl->room)
+ return 0;
+ space -= pl->room;
+ space = (space + PAGE_SIZE - 1) >> PAGE_SHIFT; /* conv to num pages */
+
+ while (space > pl->num_pages_free) {
+ struct page *page = __page_cache_alloc(GFP_NOFS);
+ if (!page)
+ return -ENOMEM;
+ list_add_tail(&page->lru, &pl->free_list);
+ ++pl->num_pages_free;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(ceph_pagelist_reserve);
+
+/* Free any pages that have been preallocated. */
+int ceph_pagelist_free_reserve(struct ceph_pagelist *pl)
+{
+ while (!list_empty(&pl->free_list)) {
+ struct page *page = list_first_entry(&pl->free_list,
+ struct page, lru);
+ list_del(&page->lru);
+ __free_page(page);
+ --pl->num_pages_free;
+ }
+ BUG_ON(pl->num_pages_free);
+ return 0;
+}
+EXPORT_SYMBOL(ceph_pagelist_free_reserve);
diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c
new file mode 100644
index 000000000000..4509757d8b3b
--- /dev/null
+++ b/net/ceph/pagevec.c
@@ -0,0 +1,114 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/namei.h>
+#include <linux/writeback.h>
+
+#include <linux/ceph/libceph.h>
+
+void ceph_put_page_vector(struct page **pages, int num_pages, bool dirty)
+{
+ int i;
+
+ for (i = 0; i < num_pages; i++) {
+ if (dirty)
+ set_page_dirty_lock(pages[i]);
+ put_page(pages[i]);
+ }
+ kvfree(pages);
+}
+EXPORT_SYMBOL(ceph_put_page_vector);
+
+void ceph_release_page_vector(struct page **pages, int num_pages)
+{
+ int i;
+
+ for (i = 0; i < num_pages; i++)
+ __free_pages(pages[i], 0);
+ kfree(pages);
+}
+EXPORT_SYMBOL(ceph_release_page_vector);
+
+/*
+ * allocate a vector new pages
+ */
+struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags)
+{
+ struct page **pages;
+ int i;
+
+ pages = kmalloc_array(num_pages, sizeof(*pages), flags);
+ if (!pages)
+ return ERR_PTR(-ENOMEM);
+ for (i = 0; i < num_pages; i++) {
+ pages[i] = __page_cache_alloc(flags);
+ if (pages[i] == NULL) {
+ ceph_release_page_vector(pages, i);
+ return ERR_PTR(-ENOMEM);
+ }
+ }
+ return pages;
+}
+EXPORT_SYMBOL(ceph_alloc_page_vector);
+
+void ceph_copy_from_page_vector(struct page **pages,
+ void *data,
+ loff_t off, size_t len)
+{
+ int i = 0;
+ size_t po = off & ~PAGE_MASK;
+ size_t left = len;
+
+ while (left > 0) {
+ size_t l = min_t(size_t, PAGE_SIZE-po, left);
+
+ memcpy(data, page_address(pages[i]) + po, l);
+ data += l;
+ left -= l;
+ po += l;
+ if (po == PAGE_SIZE) {
+ po = 0;
+ i++;
+ }
+ }
+}
+EXPORT_SYMBOL(ceph_copy_from_page_vector);
+
+/*
+ * Zero an extent within a page vector. Offset is relative to the
+ * start of the first page.
+ */
+void ceph_zero_page_vector_range(int off, int len, struct page **pages)
+{
+ int i = off >> PAGE_SHIFT;
+
+ off &= ~PAGE_MASK;
+
+ dout("zero_page_vector_page %u~%u\n", off, len);
+
+ /* leading partial page? */
+ if (off) {
+ int end = min((int)PAGE_SIZE, off + len);
+ dout("zeroing %d %p head from %d\n", i, pages[i],
+ (int)off);
+ zero_user_segment(pages[i], off, end);
+ len -= (end - off);
+ i++;
+ }
+ while (len >= PAGE_SIZE) {
+ dout("zeroing %d %p len=%d\n", i, pages[i], len);
+ zero_user_segment(pages[i], 0, PAGE_SIZE);
+ len -= PAGE_SIZE;
+ i++;
+ }
+ /* trailing partial page? */
+ if (len) {
+ dout("zeroing %d %p tail to %d\n", i, pages[i], (int)len);
+ zero_user_segment(pages[i], 0, len);
+ }
+}
+EXPORT_SYMBOL(ceph_zero_page_vector_range);
diff --git a/net/ceph/snapshot.c b/net/ceph/snapshot.c
new file mode 100644
index 000000000000..e24315937c45
--- /dev/null
+++ b/net/ceph/snapshot.c
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * snapshot.c Ceph snapshot context utility routines (part of libceph)
+ *
+ * Copyright (C) 2013 Inktank Storage, Inc.
+ */
+
+#include <linux/types.h>
+#include <linux/export.h>
+#include <linux/ceph/libceph.h>
+
+/*
+ * Ceph snapshot contexts are reference counted objects, and the
+ * returned structure holds a single reference. Acquire additional
+ * references with ceph_get_snap_context(), and release them with
+ * ceph_put_snap_context(). When the reference count reaches zero
+ * the entire structure is freed.
+ */
+
+/*
+ * Create a new ceph snapshot context large enough to hold the
+ * indicated number of snapshot ids (which can be 0). Caller has
+ * to fill in snapc->seq and snapc->snaps[0..snap_count-1].
+ *
+ * Returns a null pointer if an error occurs.
+ */
+struct ceph_snap_context *ceph_create_snap_context(u32 snap_count,
+ gfp_t gfp_flags)
+{
+ struct ceph_snap_context *snapc;
+ size_t size;
+
+ size = sizeof (struct ceph_snap_context);
+ size += snap_count * sizeof (snapc->snaps[0]);
+ snapc = kzalloc(size, gfp_flags);
+ if (!snapc)
+ return NULL;
+
+ refcount_set(&snapc->nref, 1);
+ snapc->num_snaps = snap_count;
+
+ return snapc;
+}
+EXPORT_SYMBOL(ceph_create_snap_context);
+
+struct ceph_snap_context *ceph_get_snap_context(struct ceph_snap_context *sc)
+{
+ if (sc)
+ refcount_inc(&sc->nref);
+ return sc;
+}
+EXPORT_SYMBOL(ceph_get_snap_context);
+
+void ceph_put_snap_context(struct ceph_snap_context *sc)
+{
+ if (!sc)
+ return;
+ if (refcount_dec_and_test(&sc->nref)) {
+ /*printk(" deleting snap_context %p\n", sc);*/
+ kfree(sc);
+ }
+}
+EXPORT_SYMBOL(ceph_put_snap_context);
diff --git a/net/ceph/string_table.c b/net/ceph/string_table.c
new file mode 100644
index 000000000000..3191d9d160a2
--- /dev/null
+++ b/net/ceph/string_table.c
@@ -0,0 +1,106 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/slab.h>
+#include <linux/gfp.h>
+#include <linux/string.h>
+#include <linux/spinlock.h>
+#include <linux/ceph/string_table.h>
+
+static DEFINE_SPINLOCK(string_tree_lock);
+static struct rb_root string_tree = RB_ROOT;
+
+struct ceph_string *ceph_find_or_create_string(const char* str, size_t len)
+{
+ struct ceph_string *cs, *exist;
+ struct rb_node **p, *parent;
+ int ret;
+
+ exist = NULL;
+ spin_lock(&string_tree_lock);
+ p = &string_tree.rb_node;
+ while (*p) {
+ exist = rb_entry(*p, struct ceph_string, node);
+ ret = ceph_compare_string(exist, str, len);
+ if (ret > 0)
+ p = &(*p)->rb_left;
+ else if (ret < 0)
+ p = &(*p)->rb_right;
+ else
+ break;
+ exist = NULL;
+ }
+ if (exist && !kref_get_unless_zero(&exist->kref)) {
+ rb_erase(&exist->node, &string_tree);
+ RB_CLEAR_NODE(&exist->node);
+ exist = NULL;
+ }
+ spin_unlock(&string_tree_lock);
+ if (exist)
+ return exist;
+
+ cs = kmalloc(sizeof(*cs) + len + 1, GFP_NOFS);
+ if (!cs)
+ return NULL;
+
+ kref_init(&cs->kref);
+ cs->len = len;
+ memcpy(cs->str, str, len);
+ cs->str[len] = 0;
+
+retry:
+ exist = NULL;
+ parent = NULL;
+ p = &string_tree.rb_node;
+ spin_lock(&string_tree_lock);
+ while (*p) {
+ parent = *p;
+ exist = rb_entry(*p, struct ceph_string, node);
+ ret = ceph_compare_string(exist, str, len);
+ if (ret > 0)
+ p = &(*p)->rb_left;
+ else if (ret < 0)
+ p = &(*p)->rb_right;
+ else
+ break;
+ exist = NULL;
+ }
+ ret = 0;
+ if (!exist) {
+ rb_link_node(&cs->node, parent, p);
+ rb_insert_color(&cs->node, &string_tree);
+ } else if (!kref_get_unless_zero(&exist->kref)) {
+ rb_erase(&exist->node, &string_tree);
+ RB_CLEAR_NODE(&exist->node);
+ ret = -EAGAIN;
+ }
+ spin_unlock(&string_tree_lock);
+ if (ret == -EAGAIN)
+ goto retry;
+
+ if (exist) {
+ kfree(cs);
+ cs = exist;
+ }
+
+ return cs;
+}
+EXPORT_SYMBOL(ceph_find_or_create_string);
+
+void ceph_release_string(struct kref *ref)
+{
+ struct ceph_string *cs = container_of(ref, struct ceph_string, kref);
+
+ spin_lock(&string_tree_lock);
+ if (!RB_EMPTY_NODE(&cs->node)) {
+ rb_erase(&cs->node, &string_tree);
+ RB_CLEAR_NODE(&cs->node);
+ }
+ spin_unlock(&string_tree_lock);
+
+ kfree_rcu(cs, rcu);
+}
+EXPORT_SYMBOL(ceph_release_string);
+
+bool ceph_strings_empty(void)
+{
+ return RB_EMPTY_ROOT(&string_tree);
+}
diff --git a/net/ceph/striper.c b/net/ceph/striper.c
new file mode 100644
index 000000000000..3b3fa75d1189
--- /dev/null
+++ b/net/ceph/striper.c
@@ -0,0 +1,278 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/math64.h>
+#include <linux/slab.h>
+
+#include <linux/ceph/striper.h>
+#include <linux/ceph/types.h>
+
+/*
+ * Map a file extent to a stripe unit within an object.
+ * Fill in objno, offset into object, and object extent length (i.e. the
+ * number of bytes mapped, less than or equal to @l->stripe_unit).
+ *
+ * Example for stripe_count = 3, stripes_per_object = 4:
+ *
+ * blockno | 0 3 6 9 | 1 4 7 10 | 2 5 8 11 | 12 15 18 21 | 13 16 19
+ * stripeno | 0 1 2 3 | 0 1 2 3 | 0 1 2 3 | 4 5 6 7 | 4 5 6
+ * stripepos | 0 | 1 | 2 | 0 | 1
+ * objno | 0 | 1 | 2 | 3 | 4
+ * objsetno | 0 | 1
+ */
+void ceph_calc_file_object_mapping(struct ceph_file_layout *l,
+ u64 off, u64 len,
+ u64 *objno, u64 *objoff, u32 *xlen)
+{
+ u32 stripes_per_object = l->object_size / l->stripe_unit;
+ u64 blockno; /* which su in the file (i.e. globally) */
+ u32 blockoff; /* offset into su */
+ u64 stripeno; /* which stripe */
+ u32 stripepos; /* which su in the stripe,
+ which object in the object set */
+ u64 objsetno; /* which object set */
+ u32 objsetpos; /* which stripe in the object set */
+
+ blockno = div_u64_rem(off, l->stripe_unit, &blockoff);
+ stripeno = div_u64_rem(blockno, l->stripe_count, &stripepos);
+ objsetno = div_u64_rem(stripeno, stripes_per_object, &objsetpos);
+
+ *objno = objsetno * l->stripe_count + stripepos;
+ *objoff = objsetpos * l->stripe_unit + blockoff;
+ *xlen = min_t(u64, len, l->stripe_unit - blockoff);
+}
+EXPORT_SYMBOL(ceph_calc_file_object_mapping);
+
+/*
+ * Return the last extent with given objno (@object_extents is sorted
+ * by objno). If not found, return NULL and set @add_pos so that the
+ * new extent can be added with list_add(add_pos, new_ex).
+ */
+static struct ceph_object_extent *
+lookup_last(struct list_head *object_extents, u64 objno,
+ struct list_head **add_pos)
+{
+ struct list_head *pos;
+
+ list_for_each_prev(pos, object_extents) {
+ struct ceph_object_extent *ex =
+ list_entry(pos, typeof(*ex), oe_item);
+
+ if (ex->oe_objno == objno)
+ return ex;
+
+ if (ex->oe_objno < objno)
+ break;
+ }
+
+ *add_pos = pos;
+ return NULL;
+}
+
+static struct ceph_object_extent *
+lookup_containing(struct list_head *object_extents, u64 objno,
+ u64 objoff, u32 xlen)
+{
+ struct ceph_object_extent *ex;
+
+ list_for_each_entry(ex, object_extents, oe_item) {
+ if (ex->oe_objno == objno &&
+ ex->oe_off <= objoff &&
+ ex->oe_off + ex->oe_len >= objoff + xlen) /* paranoia */
+ return ex;
+
+ if (ex->oe_objno > objno)
+ break;
+ }
+
+ return NULL;
+}
+
+/*
+ * Map a file extent to a sorted list of object extents.
+ *
+ * We want only one (or as few as possible) object extents per object.
+ * Adjacent object extents will be merged together, each returned object
+ * extent may reverse map to multiple different file extents.
+ *
+ * Call @alloc_fn for each new object extent and @action_fn for each
+ * mapped stripe unit, whether it was merged into an already allocated
+ * object extent or started a new object extent.
+ *
+ * Newly allocated object extents are added to @object_extents.
+ * To keep @object_extents sorted, successive calls to this function
+ * must map successive file extents (i.e. the list of file extents that
+ * are mapped using the same @object_extents must be sorted).
+ *
+ * The caller is responsible for @object_extents.
+ */
+int ceph_file_to_extents(struct ceph_file_layout *l, u64 off, u64 len,
+ struct list_head *object_extents,
+ struct ceph_object_extent *alloc_fn(void *arg),
+ void *alloc_arg,
+ ceph_object_extent_fn_t action_fn,
+ void *action_arg)
+{
+ struct ceph_object_extent *last_ex, *ex;
+
+ while (len) {
+ struct list_head *add_pos = NULL;
+ u64 objno, objoff;
+ u32 xlen;
+
+ ceph_calc_file_object_mapping(l, off, len, &objno, &objoff,
+ &xlen);
+
+ last_ex = lookup_last(object_extents, objno, &add_pos);
+ if (!last_ex || last_ex->oe_off + last_ex->oe_len != objoff) {
+ ex = alloc_fn(alloc_arg);
+ if (!ex)
+ return -ENOMEM;
+
+ ex->oe_objno = objno;
+ ex->oe_off = objoff;
+ ex->oe_len = xlen;
+ if (action_fn)
+ action_fn(ex, xlen, action_arg);
+
+ if (!last_ex)
+ list_add(&ex->oe_item, add_pos);
+ else
+ list_add(&ex->oe_item, &last_ex->oe_item);
+ } else {
+ last_ex->oe_len += xlen;
+ if (action_fn)
+ action_fn(last_ex, xlen, action_arg);
+ }
+
+ off += xlen;
+ len -= xlen;
+ }
+
+ for (last_ex = list_first_entry(object_extents, typeof(*ex), oe_item),
+ ex = list_next_entry(last_ex, oe_item);
+ &ex->oe_item != object_extents;
+ last_ex = ex, ex = list_next_entry(ex, oe_item)) {
+ if (last_ex->oe_objno > ex->oe_objno ||
+ (last_ex->oe_objno == ex->oe_objno &&
+ last_ex->oe_off + last_ex->oe_len >= ex->oe_off)) {
+ WARN(1, "%s: object_extents list not sorted!\n",
+ __func__);
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(ceph_file_to_extents);
+
+/*
+ * A stripped down, non-allocating version of ceph_file_to_extents(),
+ * for when @object_extents is already populated.
+ */
+int ceph_iterate_extents(struct ceph_file_layout *l, u64 off, u64 len,
+ struct list_head *object_extents,
+ ceph_object_extent_fn_t action_fn,
+ void *action_arg)
+{
+ while (len) {
+ struct ceph_object_extent *ex;
+ u64 objno, objoff;
+ u32 xlen;
+
+ ceph_calc_file_object_mapping(l, off, len, &objno, &objoff,
+ &xlen);
+
+ ex = lookup_containing(object_extents, objno, objoff, xlen);
+ if (!ex) {
+ WARN(1, "%s: objno %llu %llu~%u not found!\n",
+ __func__, objno, objoff, xlen);
+ return -EINVAL;
+ }
+
+ action_fn(ex, xlen, action_arg);
+
+ off += xlen;
+ len -= xlen;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(ceph_iterate_extents);
+
+/*
+ * Reverse map an object extent to a sorted list of file extents.
+ *
+ * On success, the caller is responsible for:
+ *
+ * kfree(file_extents)
+ */
+int ceph_extent_to_file(struct ceph_file_layout *l,
+ u64 objno, u64 objoff, u64 objlen,
+ struct ceph_file_extent **file_extents,
+ u32 *num_file_extents)
+{
+ u32 stripes_per_object = l->object_size / l->stripe_unit;
+ u64 blockno; /* which su */
+ u32 blockoff; /* offset into su */
+ u64 stripeno; /* which stripe */
+ u32 stripepos; /* which su in the stripe,
+ which object in the object set */
+ u64 objsetno; /* which object set */
+ u32 i = 0;
+
+ if (!objlen) {
+ *file_extents = NULL;
+ *num_file_extents = 0;
+ return 0;
+ }
+
+ *num_file_extents = DIV_ROUND_UP_ULL(objoff + objlen, l->stripe_unit) -
+ DIV_ROUND_DOWN_ULL(objoff, l->stripe_unit);
+ *file_extents = kmalloc_array(*num_file_extents, sizeof(**file_extents),
+ GFP_NOIO);
+ if (!*file_extents)
+ return -ENOMEM;
+
+ div_u64_rem(objoff, l->stripe_unit, &blockoff);
+ while (objlen) {
+ u64 off, len;
+
+ objsetno = div_u64_rem(objno, l->stripe_count, &stripepos);
+ stripeno = div_u64(objoff, l->stripe_unit) +
+ objsetno * stripes_per_object;
+ blockno = stripeno * l->stripe_count + stripepos;
+ off = blockno * l->stripe_unit + blockoff;
+ len = min_t(u64, objlen, l->stripe_unit - blockoff);
+
+ (*file_extents)[i].fe_off = off;
+ (*file_extents)[i].fe_len = len;
+
+ blockoff = 0;
+ objoff += len;
+ objlen -= len;
+ i++;
+ }
+
+ BUG_ON(i != *num_file_extents);
+ return 0;
+}
+EXPORT_SYMBOL(ceph_extent_to_file);
+
+u64 ceph_get_num_objects(struct ceph_file_layout *l, u64 size)
+{
+ u64 period = (u64)l->stripe_count * l->object_size;
+ u64 num_periods = DIV64_U64_ROUND_UP(size, period);
+ u64 remainder_bytes;
+ u64 remainder_objs = 0;
+
+ div64_u64_rem(size, period, &remainder_bytes);
+ if (remainder_bytes > 0 &&
+ remainder_bytes < (u64)l->stripe_count * l->stripe_unit)
+ remainder_objs = l->stripe_count -
+ DIV_ROUND_UP_ULL(remainder_bytes, l->stripe_unit);
+
+ return num_periods * l->stripe_count - remainder_objs;
+}
+EXPORT_SYMBOL(ceph_get_num_objects);
diff --git a/net/compat.c b/net/compat.c
index 52d32f1bc728..2c9bd0edac99 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -1,4 +1,5 @@
-/*
+// SPDX-License-Identifier: GPL-2.0-only
+/*
* 32bit Socket syscall emulation. Based on arch/sparc64/kernel/sys_sparc32.c.
*
* Copyright (C) 2000 VA Linux Co
@@ -8,12 +9,12 @@
* Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu)
* Copyright (C) 2000 Hewlett-Packard Co.
* Copyright (C) 2000 David Mosberger-Tang <davidm@hpl.hp.com>
- * Copyright (C) 2000,2001 Andi Kleen, SuSE Labs
+ * Copyright (C) 2000,2001 Andi Kleen, SuSE Labs
*/
#include <linux/kernel.h>
+#include <linux/gfp.h>
#include <linux/fs.h>
-#include <linux/sched.h>
#include <linux/types.h>
#include <linux/file.h>
#include <linux/icmpv6.h>
@@ -21,111 +22,111 @@
#include <linux/syscalls.h>
#include <linux/filter.h>
#include <linux/compat.h>
-#include <linux/netfilter_ipv4/ip_tables.h>
#include <linux/security.h>
+#include <linux/audit.h>
+#include <linux/export.h>
#include <net/scm.h>
#include <net/sock.h>
-#include <asm/uaccess.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <linux/uaccess.h>
#include <net/compat.h>
-static inline int iov_from_user_compat_to_kern(struct iovec *kiov,
- struct compat_iovec __user *uiov32,
- int niov)
+int __get_compat_msghdr(struct msghdr *kmsg,
+ struct compat_msghdr *msg,
+ struct sockaddr __user **save_addr)
{
- int tot_len = 0;
+ ssize_t err;
- while(niov > 0) {
- compat_uptr_t buf;
- compat_size_t len;
+ kmsg->msg_flags = msg->msg_flags;
+ kmsg->msg_namelen = msg->msg_namelen;
- if(get_user(len, &uiov32->iov_len) ||
- get_user(buf, &uiov32->iov_base)) {
- tot_len = -EFAULT;
- break;
+ if (!msg->msg_name)
+ kmsg->msg_namelen = 0;
+
+ if (kmsg->msg_namelen < 0)
+ return -EINVAL;
+
+ if (kmsg->msg_namelen > sizeof(struct sockaddr_storage))
+ kmsg->msg_namelen = sizeof(struct sockaddr_storage);
+
+ kmsg->msg_control_is_user = true;
+ kmsg->msg_get_inq = 0;
+ kmsg->msg_control_user = compat_ptr(msg->msg_control);
+ kmsg->msg_controllen = msg->msg_controllen;
+
+ if (save_addr)
+ *save_addr = compat_ptr(msg->msg_name);
+
+ if (msg->msg_name && kmsg->msg_namelen) {
+ if (!save_addr) {
+ err = move_addr_to_kernel(compat_ptr(msg->msg_name),
+ kmsg->msg_namelen,
+ kmsg->msg_name);
+ if (err < 0)
+ return err;
}
- tot_len += len;
- kiov->iov_base = compat_ptr(buf);
- kiov->iov_len = (__kernel_size_t) len;
- uiov32++;
- kiov++;
- niov--;
+ } else {
+ kmsg->msg_name = NULL;
+ kmsg->msg_namelen = 0;
}
- return tot_len;
-}
-int get_compat_msghdr(struct msghdr *kmsg, struct compat_msghdr __user *umsg)
-{
- compat_uptr_t tmp1, tmp2, tmp3;
-
- if (!access_ok(VERIFY_READ, umsg, sizeof(*umsg)) ||
- __get_user(tmp1, &umsg->msg_name) ||
- __get_user(kmsg->msg_namelen, &umsg->msg_namelen) ||
- __get_user(tmp2, &umsg->msg_iov) ||
- __get_user(kmsg->msg_iovlen, &umsg->msg_iovlen) ||
- __get_user(tmp3, &umsg->msg_control) ||
- __get_user(kmsg->msg_controllen, &umsg->msg_controllen) ||
- __get_user(kmsg->msg_flags, &umsg->msg_flags))
- return -EFAULT;
- kmsg->msg_name = compat_ptr(tmp1);
- kmsg->msg_iov = compat_ptr(tmp2);
- kmsg->msg_control = compat_ptr(tmp3);
+ if (msg->msg_iovlen > UIO_MAXIOV)
+ return -EMSGSIZE;
+
+ kmsg->msg_iocb = NULL;
+ kmsg->msg_ubuf = NULL;
return 0;
}
-/* I've named the args so it is easy to tell whose space the pointers are in. */
-int verify_compat_iovec(struct msghdr *kern_msg, struct iovec *kern_iov,
- char *kern_address, int mode)
+int get_compat_msghdr(struct msghdr *kmsg,
+ struct compat_msghdr __user *umsg,
+ struct sockaddr __user **save_addr,
+ struct iovec **iov)
{
- int tot_len;
-
- if(kern_msg->msg_namelen) {
- if(mode==VERIFY_READ) {
- int err = move_addr_to_kernel(kern_msg->msg_name,
- kern_msg->msg_namelen,
- kern_address);
- if(err < 0)
- return err;
- }
- kern_msg->msg_name = kern_address;
- } else
- kern_msg->msg_name = NULL;
+ struct compat_msghdr msg;
+ ssize_t err;
- tot_len = iov_from_user_compat_to_kern(kern_iov,
- (struct compat_iovec __user *)kern_msg->msg_iov,
- kern_msg->msg_iovlen);
- if(tot_len >= 0)
- kern_msg->msg_iov = kern_iov;
+ if (copy_from_user(&msg, umsg, sizeof(*umsg)))
+ return -EFAULT;
- return tot_len;
+ err = __get_compat_msghdr(kmsg, &msg, save_addr);
+ if (err)
+ return err;
+
+ err = import_iovec(save_addr ? ITER_DEST : ITER_SOURCE,
+ compat_ptr(msg.msg_iov), msg.msg_iovlen,
+ UIO_FASTIOV, iov, &kmsg->msg_iter);
+ return err < 0 ? err : 0;
}
/* Bleech... */
#define CMSG_COMPAT_ALIGN(len) ALIGN((len), sizeof(s32))
#define CMSG_COMPAT_DATA(cmsg) \
- ((void __user *)((char __user *)(cmsg) + CMSG_COMPAT_ALIGN(sizeof(struct compat_cmsghdr))))
+ ((void __user *)((char __user *)(cmsg) + sizeof(struct compat_cmsghdr)))
#define CMSG_COMPAT_SPACE(len) \
- (CMSG_COMPAT_ALIGN(sizeof(struct compat_cmsghdr)) + CMSG_COMPAT_ALIGN(len))
+ (sizeof(struct compat_cmsghdr) + CMSG_COMPAT_ALIGN(len))
#define CMSG_COMPAT_LEN(len) \
- (CMSG_COMPAT_ALIGN(sizeof(struct compat_cmsghdr)) + (len))
+ (sizeof(struct compat_cmsghdr) + (len))
#define CMSG_COMPAT_FIRSTHDR(msg) \
(((msg)->msg_controllen) >= sizeof(struct compat_cmsghdr) ? \
- (struct compat_cmsghdr __user *)((msg)->msg_control) : \
+ (struct compat_cmsghdr __user *)((msg)->msg_control_user) : \
(struct compat_cmsghdr __user *)NULL)
#define CMSG_COMPAT_OK(ucmlen, ucmsg, mhdr) \
((ucmlen) >= sizeof(struct compat_cmsghdr) && \
(ucmlen) <= (unsigned long) \
((mhdr)->msg_controllen - \
- ((char *)(ucmsg) - (char *)(mhdr)->msg_control)))
+ ((char __user *)(ucmsg) - (char __user *)(mhdr)->msg_control_user)))
static inline struct compat_cmsghdr __user *cmsg_compat_nxthdr(struct msghdr *msg,
struct compat_cmsghdr __user *cmsg, int cmsg_len)
{
char __user *ptr = (char __user *)cmsg + CMSG_COMPAT_ALIGN(cmsg_len);
- if ((unsigned long)(ptr + 1 - (char __user *)msg->msg_control) >
+ if ((unsigned long)(ptr + 1 - (char __user *)msg->msg_control_user) >
msg->msg_controllen)
return NULL;
return (struct compat_cmsghdr __user *)ptr;
@@ -144,24 +145,26 @@ int cmsghdr_from_user_compat_to_kern(struct msghdr *kmsg, struct sock *sk,
__kernel_size_t kcmlen, tmp;
int err = -EFAULT;
+ BUILD_BUG_ON(sizeof(struct compat_cmsghdr) !=
+ CMSG_COMPAT_ALIGN(sizeof(struct compat_cmsghdr)));
+
kcmlen = 0;
kcmsg_base = kcmsg = (struct cmsghdr *)stackbuf;
ucmsg = CMSG_COMPAT_FIRSTHDR(kmsg);
- while(ucmsg != NULL) {
- if(get_user(ucmlen, &ucmsg->cmsg_len))
+ while (ucmsg != NULL) {
+ if (get_user(ucmlen, &ucmsg->cmsg_len))
return -EFAULT;
/* Catch bogons. */
if (!CMSG_COMPAT_OK(ucmlen, ucmsg, kmsg))
return -EINVAL;
- tmp = ((ucmlen - CMSG_COMPAT_ALIGN(sizeof(*ucmsg))) +
- CMSG_ALIGN(sizeof(struct cmsghdr)));
+ tmp = ((ucmlen - sizeof(*ucmsg)) + sizeof(struct cmsghdr));
tmp = CMSG_ALIGN(tmp);
kcmlen += tmp;
ucmsg = cmsg_compat_nxthdr(kmsg, ucmsg, ucmlen);
}
- if(kcmlen == 0)
+ if (kcmlen == 0)
return -EINVAL;
/* The kcmlen holds the 64-bit version of the control length.
@@ -172,35 +175,43 @@ int cmsghdr_from_user_compat_to_kern(struct msghdr *kmsg, struct sock *sk,
if (kcmlen > stackbuf_size)
kcmsg_base = kcmsg = sock_kmalloc(sk, kcmlen, GFP_KERNEL);
if (kcmsg == NULL)
- return -ENOBUFS;
+ return -ENOMEM;
/* Now copy them over neatly. */
memset(kcmsg, 0, kcmlen);
ucmsg = CMSG_COMPAT_FIRSTHDR(kmsg);
- while(ucmsg != NULL) {
- if (__get_user(ucmlen, &ucmsg->cmsg_len))
+ while (ucmsg != NULL) {
+ struct compat_cmsghdr cmsg;
+ if (copy_from_user(&cmsg, ucmsg, sizeof(cmsg)))
goto Efault;
- if (!CMSG_COMPAT_OK(ucmlen, ucmsg, kmsg))
+ if (!CMSG_COMPAT_OK(cmsg.cmsg_len, ucmsg, kmsg))
goto Einval;
- tmp = ((ucmlen - CMSG_COMPAT_ALIGN(sizeof(*ucmsg))) +
- CMSG_ALIGN(sizeof(struct cmsghdr)));
+ tmp = ((cmsg.cmsg_len - sizeof(*ucmsg)) + sizeof(struct cmsghdr));
if ((char *)kcmsg_base + kcmlen - (char *)kcmsg < CMSG_ALIGN(tmp))
goto Einval;
kcmsg->cmsg_len = tmp;
+ kcmsg->cmsg_level = cmsg.cmsg_level;
+ kcmsg->cmsg_type = cmsg.cmsg_type;
tmp = CMSG_ALIGN(tmp);
- if (__get_user(kcmsg->cmsg_level, &ucmsg->cmsg_level) ||
- __get_user(kcmsg->cmsg_type, &ucmsg->cmsg_type) ||
- copy_from_user(CMSG_DATA(kcmsg),
+ if (copy_from_user(CMSG_DATA(kcmsg),
CMSG_COMPAT_DATA(ucmsg),
- (ucmlen - CMSG_COMPAT_ALIGN(sizeof(*ucmsg)))))
+ (cmsg.cmsg_len - sizeof(*ucmsg))))
goto Efault;
/* Advance. */
kcmsg = (struct cmsghdr *)((char *)kcmsg + tmp);
- ucmsg = cmsg_compat_nxthdr(kmsg, ucmsg, ucmlen);
+ ucmsg = cmsg_compat_nxthdr(kmsg, ucmsg, cmsg.cmsg_len);
}
+ /*
+ * check the length of messages copied in is the same as the
+ * what we get from the first loop
+ */
+ if ((char *)kcmsg - (char *)kcmsg_base != kcmlen)
+ goto Einval;
+
/* Ok, looks like we made it. Hook it up and return success. */
+ kmsg->msg_control_is_user = false;
kmsg->msg_control = kcmsg_base;
kmsg->msg_controllen = kcmlen;
return 0;
@@ -215,26 +226,41 @@ Efault:
int put_cmsg_compat(struct msghdr *kmsg, int level, int type, int len, void *data)
{
- struct compat_timeval ctv;
- struct compat_cmsghdr __user *cm = (struct compat_cmsghdr __user *) kmsg->msg_control;
+ struct compat_cmsghdr __user *cm = (struct compat_cmsghdr __user *) kmsg->msg_control_user;
struct compat_cmsghdr cmhdr;
+ struct old_timeval32 ctv;
+ struct old_timespec32 cts[3];
int cmlen;
- if(cm == NULL || kmsg->msg_controllen < sizeof(*cm)) {
+ if (cm == NULL || kmsg->msg_controllen < sizeof(*cm)) {
kmsg->msg_flags |= MSG_CTRUNC;
return 0; /* XXX: return error? check spec. */
}
- if (level == SOL_SOCKET && type == SO_TIMESTAMP) {
- struct timeval *tv = (struct timeval *)data;
- ctv.tv_sec = tv->tv_sec;
- ctv.tv_usec = tv->tv_usec;
- data = &ctv;
- len = sizeof(struct compat_timeval);
- }
-
+ if (!COMPAT_USE_64BIT_TIME) {
+ if (level == SOL_SOCKET && type == SO_TIMESTAMP_OLD) {
+ struct __kernel_old_timeval *tv = (struct __kernel_old_timeval *)data;
+ ctv.tv_sec = tv->tv_sec;
+ ctv.tv_usec = tv->tv_usec;
+ data = &ctv;
+ len = sizeof(ctv);
+ }
+ if (level == SOL_SOCKET &&
+ (type == SO_TIMESTAMPNS_OLD || type == SO_TIMESTAMPING_OLD)) {
+ int count = type == SO_TIMESTAMPNS_OLD ? 1 : 3;
+ int i;
+ struct __kernel_old_timespec *ts = data;
+ for (i = 0; i < count; i++) {
+ cts[i].tv_sec = ts[i].tv_sec;
+ cts[i].tv_nsec = ts[i].tv_nsec;
+ }
+ data = &cts;
+ len = sizeof(cts[0]) * count;
+ }
+ }
+
cmlen = CMSG_COMPAT_LEN(len);
- if(kmsg->msg_controllen < cmlen) {
+ if (kmsg->msg_controllen < cmlen) {
kmsg->msg_flags |= MSG_CTRUNC;
cmlen = kmsg->msg_controllen;
}
@@ -242,49 +268,43 @@ int put_cmsg_compat(struct msghdr *kmsg, int level, int type, int len, void *dat
cmhdr.cmsg_type = type;
cmhdr.cmsg_len = cmlen;
- if(copy_to_user(cm, &cmhdr, sizeof cmhdr))
+ if (copy_to_user(cm, &cmhdr, sizeof cmhdr))
return -EFAULT;
- if(copy_to_user(CMSG_COMPAT_DATA(cm), data, cmlen - sizeof(struct compat_cmsghdr)))
+ if (copy_to_user(CMSG_COMPAT_DATA(cm), data, cmlen - sizeof(struct compat_cmsghdr)))
return -EFAULT;
cmlen = CMSG_COMPAT_SPACE(len);
- kmsg->msg_control += cmlen;
+ if (kmsg->msg_controllen < cmlen)
+ cmlen = kmsg->msg_controllen;
+ kmsg->msg_control_user += cmlen;
kmsg->msg_controllen -= cmlen;
return 0;
}
-void scm_detach_fds_compat(struct msghdr *kmsg, struct scm_cookie *scm)
+static int scm_max_fds_compat(struct msghdr *msg)
{
- struct compat_cmsghdr __user *cm = (struct compat_cmsghdr __user *) kmsg->msg_control;
- int fdmax = (kmsg->msg_controllen - sizeof(struct compat_cmsghdr)) / sizeof(int);
- int fdnum = scm->fp->count;
- struct file **fp = scm->fp->fp;
- int __user *cmfptr;
- int err = 0, i;
+ if (msg->msg_controllen <= sizeof(struct compat_cmsghdr))
+ return 0;
+ return (msg->msg_controllen - sizeof(struct compat_cmsghdr)) / sizeof(int);
+}
- if (fdnum < fdmax)
- fdmax = fdnum;
+void scm_detach_fds_compat(struct msghdr *msg, struct scm_cookie *scm)
+{
+ struct compat_cmsghdr __user *cm =
+ (struct compat_cmsghdr __user *)msg->msg_control_user;
+ unsigned int o_flags = (msg->msg_flags & MSG_CMSG_CLOEXEC) ? O_CLOEXEC : 0;
+ int fdmax = min_t(int, scm_max_fds_compat(msg), scm->fp->count);
+ int __user *cmsg_data = CMSG_COMPAT_DATA(cm);
+ int err = 0, i;
- for (i = 0, cmfptr = (int __user *) CMSG_COMPAT_DATA(cm); i < fdmax; i++, cmfptr++) {
- int new_fd;
- err = security_file_receive(fp[i]);
- if (err)
- break;
- err = get_unused_fd();
+ for (i = 0; i < fdmax; i++) {
+ err = scm_recv_one_fd(scm->fp->fp[i], cmsg_data + i, o_flags);
if (err < 0)
break;
- new_fd = err;
- err = put_user(new_fd, cmfptr);
- if (err) {
- put_unused_fd(new_fd);
- break;
- }
- /* Bump the usage count and install the file. */
- get_file(fp[i]);
- fd_install(new_fd, fp[i]);
}
if (i > 0) {
int cmlen = CMSG_COMPAT_LEN(i * sizeof(int));
+
err = put_user(SOL_SOCKET, &cm->cmsg_level);
if (!err)
err = put_user(SCM_RIGHTS, &cm->cmsg_type);
@@ -292,385 +312,204 @@ void scm_detach_fds_compat(struct msghdr *kmsg, struct scm_cookie *scm)
err = put_user(cmlen, &cm->cmsg_len);
if (!err) {
cmlen = CMSG_COMPAT_SPACE(i * sizeof(int));
- kmsg->msg_control += cmlen;
- kmsg->msg_controllen -= cmlen;
+ if (msg->msg_controllen < cmlen)
+ cmlen = msg->msg_controllen;
+ msg->msg_control_user += cmlen;
+ msg->msg_controllen -= cmlen;
}
}
- if (i < fdnum)
- kmsg->msg_flags |= MSG_CTRUNC;
+
+ if (i < scm->fp->count || (scm->fp->count && fdmax <= 0))
+ msg->msg_flags |= MSG_CTRUNC;
/*
- * All of the files that fit in the message have had their
- * usage counts incremented, so we just free the list.
+ * All of the files that fit in the message have had their usage counts
+ * incremented, so we just free the list.
*/
__scm_destroy(scm);
}
-/*
- * For now, we assume that the compatibility and native version
- * of struct ipt_entry are the same - sfr. FIXME
- */
-struct compat_ipt_replace {
- char name[IPT_TABLE_MAXNAMELEN];
- u32 valid_hooks;
- u32 num_entries;
- u32 size;
- u32 hook_entry[NF_IP_NUMHOOKS];
- u32 underflow[NF_IP_NUMHOOKS];
- u32 num_counters;
- compat_uptr_t counters; /* struct ipt_counters * */
- struct ipt_entry entries[0];
+/* Argument list sizes for compat_sys_socketcall */
+#define AL(x) ((x) * sizeof(u32))
+static unsigned char nas[21] = {
+ AL(0), AL(3), AL(3), AL(3), AL(2), AL(3),
+ AL(3), AL(3), AL(4), AL(4), AL(4), AL(6),
+ AL(6), AL(2), AL(5), AL(5), AL(3), AL(3),
+ AL(4), AL(5), AL(4)
};
+#undef AL
-static int do_netfilter_replace(int fd, int level, int optname,
- char __user *optval, int optlen)
+static inline long __compat_sys_sendmsg(int fd,
+ struct compat_msghdr __user *msg,
+ unsigned int flags)
{
- struct compat_ipt_replace __user *urepl;
- struct ipt_replace __user *repl_nat;
- char name[IPT_TABLE_MAXNAMELEN];
- u32 origsize, tmp32, num_counters;
- unsigned int repl_nat_size;
- int ret;
- int i;
- compat_uptr_t ucntrs;
-
- urepl = (struct compat_ipt_replace __user *)optval;
- if (get_user(origsize, &urepl->size))
- return -EFAULT;
-
- /* Hack: Causes ipchains to give correct error msg --RR */
- if (optlen != sizeof(*urepl) + origsize)
- return -ENOPROTOOPT;
-
- /* XXX Assumes that size of ipt_entry is the same both in
- * native and compat environments.
- */
- repl_nat_size = sizeof(*repl_nat) + origsize;
- repl_nat = compat_alloc_user_space(repl_nat_size);
-
- ret = -EFAULT;
- if (put_user(origsize, &repl_nat->size))
- goto out;
-
- if (!access_ok(VERIFY_READ, urepl, optlen) ||
- !access_ok(VERIFY_WRITE, repl_nat, optlen))
- goto out;
-
- if (__copy_from_user(name, urepl->name, sizeof(urepl->name)) ||
- __copy_to_user(repl_nat->name, name, sizeof(repl_nat->name)))
- goto out;
-
- if (__get_user(tmp32, &urepl->valid_hooks) ||
- __put_user(tmp32, &repl_nat->valid_hooks))
- goto out;
-
- if (__get_user(tmp32, &urepl->num_entries) ||
- __put_user(tmp32, &repl_nat->num_entries))
- goto out;
-
- if (__get_user(num_counters, &urepl->num_counters) ||
- __put_user(num_counters, &repl_nat->num_counters))
- goto out;
-
- if (__get_user(ucntrs, &urepl->counters) ||
- __put_user(compat_ptr(ucntrs), &repl_nat->counters))
- goto out;
-
- if (__copy_in_user(&repl_nat->entries[0],
- &urepl->entries[0],
- origsize))
- goto out;
-
- for (i = 0; i < NF_IP_NUMHOOKS; i++) {
- if (__get_user(tmp32, &urepl->hook_entry[i]) ||
- __put_user(tmp32, &repl_nat->hook_entry[i]) ||
- __get_user(tmp32, &urepl->underflow[i]) ||
- __put_user(tmp32, &repl_nat->underflow[i]))
- goto out;
- }
-
- /*
- * Since struct ipt_counters just contains two u_int64_t members
- * we can just do the access_ok check here and pass the (converted)
- * pointer into the standard syscall. We hope that the pointer is
- * not misaligned ...
- */
- if (!access_ok(VERIFY_WRITE, compat_ptr(ucntrs),
- num_counters * sizeof(struct ipt_counters)))
- goto out;
-
-
- ret = sys_setsockopt(fd, level, optname,
- (char __user *)repl_nat, repl_nat_size);
-
-out:
- return ret;
+ return __sys_sendmsg(fd, (struct user_msghdr __user *)msg,
+ flags | MSG_CMSG_COMPAT, false);
}
-/*
- * A struct sock_filter is architecture independent.
- */
-struct compat_sock_fprog {
- u16 len;
- compat_uptr_t filter; /* struct sock_filter * */
-};
-
-static int do_set_attach_filter(struct socket *sock, int level, int optname,
- char __user *optval, int optlen)
+COMPAT_SYSCALL_DEFINE3(sendmsg, int, fd, struct compat_msghdr __user *, msg,
+ unsigned int, flags)
{
- struct compat_sock_fprog __user *fprog32 = (struct compat_sock_fprog __user *)optval;
- struct sock_fprog __user *kfprog = compat_alloc_user_space(sizeof(struct sock_fprog));
- compat_uptr_t ptr;
- u16 len;
-
- if (!access_ok(VERIFY_READ, fprog32, sizeof(*fprog32)) ||
- !access_ok(VERIFY_WRITE, kfprog, sizeof(struct sock_fprog)) ||
- __get_user(len, &fprog32->len) ||
- __get_user(ptr, &fprog32->filter) ||
- __put_user(len, &kfprog->len) ||
- __put_user(compat_ptr(ptr), &kfprog->filter))
- return -EFAULT;
-
- return sock_setsockopt(sock, level, optname, (char __user *)kfprog,
- sizeof(struct sock_fprog));
+ return __compat_sys_sendmsg(fd, msg, flags);
}
-static int do_set_sock_timeout(struct socket *sock, int level,
- int optname, char __user *optval, int optlen)
+static inline long __compat_sys_sendmmsg(int fd,
+ struct compat_mmsghdr __user *mmsg,
+ unsigned int vlen, unsigned int flags)
{
- struct compat_timeval __user *up = (struct compat_timeval __user *) optval;
- struct timeval ktime;
- mm_segment_t old_fs;
- int err;
-
- if (optlen < sizeof(*up))
- return -EINVAL;
- if (!access_ok(VERIFY_READ, up, sizeof(*up)) ||
- __get_user(ktime.tv_sec, &up->tv_sec) ||
- __get_user(ktime.tv_usec, &up->tv_usec))
- return -EFAULT;
- old_fs = get_fs();
- set_fs(KERNEL_DS);
- err = sock_setsockopt(sock, level, optname, (char *) &ktime, sizeof(ktime));
- set_fs(old_fs);
-
- return err;
+ return __sys_sendmmsg(fd, (struct mmsghdr __user *)mmsg, vlen,
+ flags | MSG_CMSG_COMPAT, false);
}
-static int compat_sock_setsockopt(struct socket *sock, int level, int optname,
- char __user *optval, int optlen)
+COMPAT_SYSCALL_DEFINE4(sendmmsg, int, fd, struct compat_mmsghdr __user *, mmsg,
+ unsigned int, vlen, unsigned int, flags)
{
- if (optname == SO_ATTACH_FILTER)
- return do_set_attach_filter(sock, level, optname,
- optval, optlen);
- if (optname == SO_RCVTIMEO || optname == SO_SNDTIMEO)
- return do_set_sock_timeout(sock, level, optname, optval, optlen);
-
- return sock_setsockopt(sock, level, optname, optval, optlen);
+ return __compat_sys_sendmmsg(fd, mmsg, vlen, flags);
}
-asmlinkage long compat_sys_setsockopt(int fd, int level, int optname,
- char __user *optval, int optlen)
+static inline long __compat_sys_recvmsg(int fd,
+ struct compat_msghdr __user *msg,
+ unsigned int flags)
{
- int err;
- struct socket *sock;
-
- if (level == SOL_IPV6 && optname == IPT_SO_SET_REPLACE)
- return do_netfilter_replace(fd, level, optname,
- optval, optlen);
-
- if (optlen < 0)
- return -EINVAL;
-
- if ((sock = sockfd_lookup(fd, &err))!=NULL)
- {
- err = security_socket_setsockopt(sock,level,optname);
- if (err) {
- sockfd_put(sock);
- return err;
- }
-
- if (level == SOL_SOCKET)
- err = compat_sock_setsockopt(sock, level,
- optname, optval, optlen);
- else if (sock->ops->compat_setsockopt)
- err = sock->ops->compat_setsockopt(sock, level,
- optname, optval, optlen);
- else
- err = sock->ops->setsockopt(sock, level,
- optname, optval, optlen);
- sockfd_put(sock);
- }
- return err;
+ return __sys_recvmsg(fd, (struct user_msghdr __user *)msg,
+ flags | MSG_CMSG_COMPAT, false);
}
-static int do_get_sock_timeout(struct socket *sock, int level, int optname,
- char __user *optval, int __user *optlen)
+COMPAT_SYSCALL_DEFINE3(recvmsg, int, fd, struct compat_msghdr __user *, msg,
+ unsigned int, flags)
{
- struct compat_timeval __user *up;
- struct timeval ktime;
- mm_segment_t old_fs;
- int len, err;
-
- up = (struct compat_timeval __user *) optval;
- if (get_user(len, optlen))
- return -EFAULT;
- if (len < sizeof(*up))
- return -EINVAL;
- len = sizeof(ktime);
- old_fs = get_fs();
- set_fs(KERNEL_DS);
- err = sock_getsockopt(sock, level, optname, (char *) &ktime, &len);
- set_fs(old_fs);
-
- if (!err) {
- if (put_user(sizeof(*up), optlen) ||
- !access_ok(VERIFY_WRITE, up, sizeof(*up)) ||
- __put_user(ktime.tv_sec, &up->tv_sec) ||
- __put_user(ktime.tv_usec, &up->tv_usec))
- err = -EFAULT;
- }
- return err;
+ return __compat_sys_recvmsg(fd, msg, flags);
}
-static int compat_sock_getsockopt(struct socket *sock, int level, int optname,
- char __user *optval, int __user *optlen)
+static inline long __compat_sys_recvfrom(int fd, void __user *buf,
+ compat_size_t len, unsigned int flags,
+ struct sockaddr __user *addr,
+ int __user *addrlen)
{
- if (optname == SO_RCVTIMEO || optname == SO_SNDTIMEO)
- return do_get_sock_timeout(sock, level, optname, optval, optlen);
- return sock_getsockopt(sock, level, optname, optval, optlen);
+ return __sys_recvfrom(fd, buf, len, flags | MSG_CMSG_COMPAT, addr,
+ addrlen);
}
-int compat_sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
+COMPAT_SYSCALL_DEFINE4(recv, int, fd, void __user *, buf, compat_size_t, len, unsigned int, flags)
{
- struct compat_timeval __user *ctv =
- (struct compat_timeval __user*) userstamp;
- int err = -ENOENT;
-
- if (!sock_flag(sk, SOCK_TIMESTAMP))
- sock_enable_timestamp(sk);
- if (sk->sk_stamp.tv_sec == -1)
- return err;
- if (sk->sk_stamp.tv_sec == 0)
- do_gettimeofday(&sk->sk_stamp);
- if (put_user(sk->sk_stamp.tv_sec, &ctv->tv_sec) ||
- put_user(sk->sk_stamp.tv_usec, &ctv->tv_usec))
- err = -EFAULT;
- return err;
+ return __compat_sys_recvfrom(fd, buf, len, flags, NULL, NULL);
}
-EXPORT_SYMBOL(compat_sock_get_timestamp);
-asmlinkage long compat_sys_getsockopt(int fd, int level, int optname,
- char __user *optval, int __user *optlen)
+COMPAT_SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, buf, compat_size_t, len,
+ unsigned int, flags, struct sockaddr __user *, addr,
+ int __user *, addrlen)
{
- int err;
- struct socket *sock;
-
- if ((sock = sockfd_lookup(fd, &err))!=NULL)
- {
- err = security_socket_getsockopt(sock, level,
- optname);
- if (err) {
- sockfd_put(sock);
- return err;
- }
-
- if (level == SOL_SOCKET)
- err = compat_sock_getsockopt(sock, level,
- optname, optval, optlen);
- else if (sock->ops->compat_getsockopt)
- err = sock->ops->compat_getsockopt(sock, level,
- optname, optval, optlen);
- else
- err = sock->ops->getsockopt(sock, level,
- optname, optval, optlen);
- sockfd_put(sock);
- }
- return err;
+ return __compat_sys_recvfrom(fd, buf, len, flags, addr, addrlen);
}
-/* Argument list sizes for compat_sys_socketcall */
-#define AL(x) ((x) * sizeof(u32))
-static unsigned char nas[18]={AL(0),AL(3),AL(3),AL(3),AL(2),AL(3),
- AL(3),AL(3),AL(4),AL(4),AL(4),AL(6),
- AL(6),AL(2),AL(5),AL(5),AL(3),AL(3)};
-#undef AL
-asmlinkage long compat_sys_sendmsg(int fd, struct compat_msghdr __user *msg, unsigned flags)
+COMPAT_SYSCALL_DEFINE5(recvmmsg_time64, int, fd, struct compat_mmsghdr __user *, mmsg,
+ unsigned int, vlen, unsigned int, flags,
+ struct __kernel_timespec __user *, timeout)
{
- return sys_sendmsg(fd, (struct msghdr __user *)msg, flags | MSG_CMSG_COMPAT);
+ return __sys_recvmmsg(fd, (struct mmsghdr __user *)mmsg, vlen,
+ flags | MSG_CMSG_COMPAT, timeout, NULL);
}
-asmlinkage long compat_sys_recvmsg(int fd, struct compat_msghdr __user *msg, unsigned int flags)
+#ifdef CONFIG_COMPAT_32BIT_TIME
+COMPAT_SYSCALL_DEFINE5(recvmmsg_time32, int, fd, struct compat_mmsghdr __user *, mmsg,
+ unsigned int, vlen, unsigned int, flags,
+ struct old_timespec32 __user *, timeout)
{
- return sys_recvmsg(fd, (struct msghdr __user *)msg, flags | MSG_CMSG_COMPAT);
+ return __sys_recvmmsg(fd, (struct mmsghdr __user *)mmsg, vlen,
+ flags | MSG_CMSG_COMPAT, NULL, timeout);
}
+#endif
-asmlinkage long compat_sys_socketcall(int call, u32 __user *args)
+COMPAT_SYSCALL_DEFINE2(socketcall, int, call, u32 __user *, args)
{
- int ret;
- u32 a[6];
+ u32 a[AUDITSC_ARGS];
+ unsigned int len;
u32 a0, a1;
-
- if (call < SYS_SOCKET || call > SYS_RECVMSG)
+ int ret;
+
+ if (call < SYS_SOCKET || call > SYS_SENDMMSG)
+ return -EINVAL;
+ len = nas[call];
+ if (len > sizeof(a))
return -EINVAL;
- if (copy_from_user(a, args, nas[call]))
+
+ if (copy_from_user(a, args, len))
return -EFAULT;
+
+ ret = audit_socketcall_compat(len / sizeof(a[0]), a);
+ if (ret)
+ return ret;
+
a0 = a[0];
a1 = a[1];
-
- switch(call) {
+
+ switch (call) {
case SYS_SOCKET:
- ret = sys_socket(a0, a1, a[2]);
+ ret = __sys_socket(a0, a1, a[2]);
break;
case SYS_BIND:
- ret = sys_bind(a0, compat_ptr(a1), a[2]);
+ ret = __sys_bind(a0, compat_ptr(a1), a[2]);
break;
case SYS_CONNECT:
- ret = sys_connect(a0, compat_ptr(a1), a[2]);
+ ret = __sys_connect(a0, compat_ptr(a1), a[2]);
break;
case SYS_LISTEN:
- ret = sys_listen(a0, a1);
+ ret = __sys_listen(a0, a1);
break;
case SYS_ACCEPT:
- ret = sys_accept(a0, compat_ptr(a1), compat_ptr(a[2]));
+ ret = __sys_accept4(a0, compat_ptr(a1), compat_ptr(a[2]), 0);
break;
case SYS_GETSOCKNAME:
- ret = sys_getsockname(a0, compat_ptr(a1), compat_ptr(a[2]));
+ ret = __sys_getsockname(a0, compat_ptr(a1), compat_ptr(a[2]), 0);
break;
case SYS_GETPEERNAME:
- ret = sys_getpeername(a0, compat_ptr(a1), compat_ptr(a[2]));
+ ret = __sys_getsockname(a0, compat_ptr(a1), compat_ptr(a[2]), 1);
break;
case SYS_SOCKETPAIR:
- ret = sys_socketpair(a0, a1, a[2], compat_ptr(a[3]));
+ ret = __sys_socketpair(a0, a1, a[2], compat_ptr(a[3]));
break;
case SYS_SEND:
- ret = sys_send(a0, compat_ptr(a1), a[2], a[3]);
+ ret = __sys_sendto(a0, compat_ptr(a1), a[2], a[3], NULL, 0);
break;
case SYS_SENDTO:
- ret = sys_sendto(a0, compat_ptr(a1), a[2], a[3], compat_ptr(a[4]), a[5]);
+ ret = __sys_sendto(a0, compat_ptr(a1), a[2], a[3],
+ compat_ptr(a[4]), a[5]);
break;
case SYS_RECV:
- ret = sys_recv(a0, compat_ptr(a1), a[2], a[3]);
+ ret = __compat_sys_recvfrom(a0, compat_ptr(a1), a[2], a[3],
+ NULL, NULL);
break;
case SYS_RECVFROM:
- ret = sys_recvfrom(a0, compat_ptr(a1), a[2], a[3], compat_ptr(a[4]), compat_ptr(a[5]));
+ ret = __compat_sys_recvfrom(a0, compat_ptr(a1), a[2], a[3],
+ compat_ptr(a[4]),
+ compat_ptr(a[5]));
break;
case SYS_SHUTDOWN:
- ret = sys_shutdown(a0,a1);
+ ret = __sys_shutdown(a0, a1);
break;
case SYS_SETSOCKOPT:
- ret = compat_sys_setsockopt(a0, a1, a[2],
- compat_ptr(a[3]), a[4]);
+ ret = __sys_setsockopt(a0, a1, a[2], compat_ptr(a[3]), a[4]);
break;
case SYS_GETSOCKOPT:
- ret = compat_sys_getsockopt(a0, a1, a[2],
- compat_ptr(a[3]), compat_ptr(a[4]));
+ ret = __sys_getsockopt(a0, a1, a[2], compat_ptr(a[3]),
+ compat_ptr(a[4]));
break;
case SYS_SENDMSG:
- ret = compat_sys_sendmsg(a0, compat_ptr(a1), a[2]);
+ ret = __compat_sys_sendmsg(a0, compat_ptr(a1), a[2]);
+ break;
+ case SYS_SENDMMSG:
+ ret = __compat_sys_sendmmsg(a0, compat_ptr(a1), a[2], a[3]);
break;
case SYS_RECVMSG:
- ret = compat_sys_recvmsg(a0, compat_ptr(a1), a[2]);
+ ret = __compat_sys_recvmsg(a0, compat_ptr(a1), a[2]);
+ break;
+ case SYS_RECVMMSG:
+ ret = __sys_recvmmsg(a0, compat_ptr(a1), a[2],
+ a[3] | MSG_CMSG_COMPAT, NULL,
+ compat_ptr(a[4]));
+ break;
+ case SYS_ACCEPT4:
+ ret = __sys_accept4(a0, compat_ptr(a1), compat_ptr(a[2]), a[3]);
break;
default:
ret = -EINVAL;
diff --git a/net/core/Makefile b/net/core/Makefile
index 73272d506e93..9ef2099c5426 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -1,19 +1,50 @@
+# SPDX-License-Identifier: GPL-2.0
#
# Makefile for the Linux networking core.
#
-obj-y := sock.o request_sock.o skbuff.o iovec.o datagram.o stream.o scm.o \
- gen_stats.o gen_estimator.o
+obj-y := sock.o request_sock.o skbuff.o datagram.o stream.o scm.o \
+ gen_stats.o gen_estimator.o net_namespace.o secure_seq.o \
+ flow_dissector.o
obj-$(CONFIG_SYSCTL) += sysctl_net_core.o
-obj-y += dev.o ethtool.o dev_mcast.o dst.o netevent.o \
- neighbour.o rtnetlink.o utils.o link_watch.o filter.o
+obj-y += dev.o dev_api.o dev_addr_lists.o dst.o netevent.o \
+ neighbour.o rtnetlink.o utils.o link_watch.o filter.o \
+ sock_diag.o dev_ioctl.o tso.o sock_reuseport.o \
+ fib_notifier.o xdp.o flow_offload.o gro.o \
+ netdev-genl.o netdev-genl-gen.o gso.o
-obj-$(CONFIG_XFRM) += flow.o
-obj-$(CONFIG_SYSFS) += net-sysfs.o
+obj-$(CONFIG_NETDEV_ADDR_LIST_TEST) += dev_addr_lists_test.o
+
+obj-y += net-sysfs.o
+obj-y += hotdata.o
+obj-y += netdev_rx_queue.o
+obj-y += netdev_queues.o
+obj-$(CONFIG_PAGE_POOL) += page_pool.o page_pool_user.o
+obj-$(CONFIG_PROC_FS) += net-procfs.o
obj-$(CONFIG_NET_PKTGEN) += pktgen.o
-obj-$(CONFIG_WIRELESS_EXT) += wireless.o
obj-$(CONFIG_NETPOLL) += netpoll.o
-obj-$(CONFIG_NET_DMA) += user_dma.o
obj-$(CONFIG_FIB_RULES) += fib_rules.o
+obj-$(CONFIG_TRACEPOINTS) += net-traces.o
+obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o
+obj-$(CONFIG_NET_IEEE8021Q_HELPERS) += ieee8021q_helpers.o
+obj-$(CONFIG_NET_SELFTESTS) += selftests.o
+obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o
+obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o
+obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o
+obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o
+obj-$(CONFIG_LWTUNNEL) += lwtunnel.o
+obj-$(CONFIG_LWTUNNEL_BPF) += lwt_bpf.o
+obj-$(CONFIG_DST_CACHE) += dst_cache.o
+obj-$(CONFIG_HWBM) += hwbm.o
+obj-$(CONFIG_GRO_CELLS) += gro_cells.o
+obj-$(CONFIG_FAILOVER) += failover.o
+obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o
+obj-$(CONFIG_BPF_SYSCALL) += sock_map.o
+obj-$(CONFIG_BPF_SYSCALL) += bpf_sk_storage.o
+obj-$(CONFIG_OF) += of_net.o
+obj-$(CONFIG_NET_TEST) += net_test.o
+obj-$(CONFIG_NET_DEVMEM) += devmem.o
+obj-$(CONFIG_DEBUG_NET) += lock_debug.o
+obj-$(CONFIG_FAIL_SKB_REALLOC) += skb_fault_injection.o
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
new file mode 100644
index 000000000000..850dd736ccd1
--- /dev/null
+++ b/net/core/bpf_sk_storage.c
@@ -0,0 +1,914 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019 Facebook */
+#include <linux/rculist.h>
+#include <linux/list.h>
+#include <linux/hash.h>
+#include <linux/types.h>
+#include <linux/spinlock.h>
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/btf_ids.h>
+#include <linux/bpf_local_storage.h>
+#include <net/bpf_sk_storage.h>
+#include <net/sock.h>
+#include <uapi/linux/sock_diag.h>
+#include <uapi/linux/btf.h>
+#include <linux/rcupdate_trace.h>
+
+DEFINE_BPF_STORAGE_CACHE(sk_cache);
+
+static struct bpf_local_storage_data *
+bpf_sk_storage_lookup(struct sock *sk, struct bpf_map *map, bool cacheit_lockit)
+{
+ struct bpf_local_storage *sk_storage;
+ struct bpf_local_storage_map *smap;
+
+ sk_storage =
+ rcu_dereference_check(sk->sk_bpf_storage, bpf_rcu_lock_held());
+ if (!sk_storage)
+ return NULL;
+
+ smap = (struct bpf_local_storage_map *)map;
+ return bpf_local_storage_lookup(sk_storage, smap, cacheit_lockit);
+}
+
+static int bpf_sk_storage_del(struct sock *sk, struct bpf_map *map)
+{
+ struct bpf_local_storage_data *sdata;
+
+ sdata = bpf_sk_storage_lookup(sk, map, false);
+ if (!sdata)
+ return -ENOENT;
+
+ bpf_selem_unlink(SELEM(sdata), false);
+
+ return 0;
+}
+
+/* Called by __sk_destruct() & bpf_sk_storage_clone() */
+void bpf_sk_storage_free(struct sock *sk)
+{
+ struct bpf_local_storage *sk_storage;
+
+ rcu_read_lock_dont_migrate();
+ sk_storage = rcu_dereference(sk->sk_bpf_storage);
+ if (!sk_storage)
+ goto out;
+
+ bpf_local_storage_destroy(sk_storage);
+out:
+ rcu_read_unlock_migrate();
+}
+
+static void bpf_sk_storage_map_free(struct bpf_map *map)
+{
+ bpf_local_storage_map_free(map, &sk_cache, NULL);
+}
+
+static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr)
+{
+ return bpf_local_storage_map_alloc(attr, &sk_cache, false);
+}
+
+static int notsupp_get_next_key(struct bpf_map *map, void *key,
+ void *next_key)
+{
+ return -ENOTSUPP;
+}
+
+static void *bpf_fd_sk_storage_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_local_storage_data *sdata;
+ struct socket *sock;
+ int fd, err;
+
+ fd = *(int *)key;
+ sock = sockfd_lookup(fd, &err);
+ if (sock) {
+ sdata = bpf_sk_storage_lookup(sock->sk, map, true);
+ sockfd_put(sock);
+ return sdata ? sdata->data : NULL;
+ }
+
+ return ERR_PTR(err);
+}
+
+static long bpf_fd_sk_storage_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags)
+{
+ struct bpf_local_storage_data *sdata;
+ struct socket *sock;
+ int fd, err;
+
+ fd = *(int *)key;
+ sock = sockfd_lookup(fd, &err);
+ if (sock) {
+ sdata = bpf_local_storage_update(
+ sock->sk, (struct bpf_local_storage_map *)map, value,
+ map_flags, false, GFP_ATOMIC);
+ sockfd_put(sock);
+ return PTR_ERR_OR_ZERO(sdata);
+ }
+
+ return err;
+}
+
+static long bpf_fd_sk_storage_delete_elem(struct bpf_map *map, void *key)
+{
+ struct socket *sock;
+ int fd, err;
+
+ fd = *(int *)key;
+ sock = sockfd_lookup(fd, &err);
+ if (sock) {
+ err = bpf_sk_storage_del(sock->sk, map);
+ sockfd_put(sock);
+ return err;
+ }
+
+ return err;
+}
+
+static struct bpf_local_storage_elem *
+bpf_sk_storage_clone_elem(struct sock *newsk,
+ struct bpf_local_storage_map *smap,
+ struct bpf_local_storage_elem *selem)
+{
+ struct bpf_local_storage_elem *copy_selem;
+
+ copy_selem = bpf_selem_alloc(smap, newsk, NULL, false, GFP_ATOMIC);
+ if (!copy_selem)
+ return NULL;
+
+ if (btf_record_has_field(smap->map.record, BPF_SPIN_LOCK))
+ copy_map_value_locked(&smap->map, SDATA(copy_selem)->data,
+ SDATA(selem)->data, true);
+ else
+ copy_map_value(&smap->map, SDATA(copy_selem)->data,
+ SDATA(selem)->data);
+
+ return copy_selem;
+}
+
+int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
+{
+ struct bpf_local_storage *new_sk_storage = NULL;
+ struct bpf_local_storage *sk_storage;
+ struct bpf_local_storage_elem *selem;
+ int ret = 0;
+
+ RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL);
+
+ rcu_read_lock_dont_migrate();
+ sk_storage = rcu_dereference(sk->sk_bpf_storage);
+
+ if (!sk_storage || hlist_empty(&sk_storage->list))
+ goto out;
+
+ hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) {
+ struct bpf_local_storage_elem *copy_selem;
+ struct bpf_local_storage_map *smap;
+ struct bpf_map *map;
+
+ smap = rcu_dereference(SDATA(selem)->smap);
+ if (!(smap->map.map_flags & BPF_F_CLONE))
+ continue;
+
+ /* Note that for lockless listeners adding new element
+ * here can race with cleanup in bpf_local_storage_map_free.
+ * Try to grab map refcnt to make sure that it's still
+ * alive and prevent concurrent removal.
+ */
+ map = bpf_map_inc_not_zero(&smap->map);
+ if (IS_ERR(map))
+ continue;
+
+ copy_selem = bpf_sk_storage_clone_elem(newsk, smap, selem);
+ if (!copy_selem) {
+ ret = -ENOMEM;
+ bpf_map_put(map);
+ goto out;
+ }
+
+ if (new_sk_storage) {
+ bpf_selem_link_map(smap, copy_selem);
+ bpf_selem_link_storage_nolock(new_sk_storage, copy_selem);
+ } else {
+ ret = bpf_local_storage_alloc(newsk, smap, copy_selem, GFP_ATOMIC);
+ if (ret) {
+ bpf_selem_free(copy_selem, true);
+ atomic_sub(smap->elem_size,
+ &newsk->sk_omem_alloc);
+ bpf_map_put(map);
+ goto out;
+ }
+
+ new_sk_storage =
+ rcu_dereference(copy_selem->local_storage);
+ }
+ bpf_map_put(map);
+ }
+
+out:
+ rcu_read_unlock_migrate();
+
+ /* In case of an error, don't free anything explicitly here, the
+ * caller is responsible to call bpf_sk_storage_free.
+ */
+
+ return ret;
+}
+
+/* *gfp_flags* is a hidden argument provided by the verifier */
+BPF_CALL_5(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
+ void *, value, u64, flags, gfp_t, gfp_flags)
+{
+ struct bpf_local_storage_data *sdata;
+
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
+ if (!sk || !sk_fullsock(sk) || flags > BPF_SK_STORAGE_GET_F_CREATE)
+ return (unsigned long)NULL;
+
+ sdata = bpf_sk_storage_lookup(sk, map, true);
+ if (sdata)
+ return (unsigned long)sdata->data;
+
+ if (flags == BPF_SK_STORAGE_GET_F_CREATE &&
+ /* Cannot add new elem to a going away sk.
+ * Otherwise, the new elem may become a leak
+ * (and also other memory issues during map
+ * destruction).
+ */
+ refcount_inc_not_zero(&sk->sk_refcnt)) {
+ sdata = bpf_local_storage_update(
+ sk, (struct bpf_local_storage_map *)map, value,
+ BPF_NOEXIST, false, gfp_flags);
+ /* sk must be a fullsock (guaranteed by verifier),
+ * so sock_gen_put() is unnecessary.
+ */
+ sock_put(sk);
+ return IS_ERR(sdata) ?
+ (unsigned long)NULL : (unsigned long)sdata->data;
+ }
+
+ return (unsigned long)NULL;
+}
+
+BPF_CALL_2(bpf_sk_storage_delete, struct bpf_map *, map, struct sock *, sk)
+{
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
+ if (!sk || !sk_fullsock(sk))
+ return -EINVAL;
+
+ if (refcount_inc_not_zero(&sk->sk_refcnt)) {
+ int err;
+
+ err = bpf_sk_storage_del(sk, map);
+ sock_put(sk);
+ return err;
+ }
+
+ return -ENOENT;
+}
+
+static int bpf_sk_storage_charge(struct bpf_local_storage_map *smap,
+ void *owner, u32 size)
+{
+ struct sock *sk = (struct sock *)owner;
+ int optmem_max;
+
+ optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);
+ /* same check as in sock_kmalloc() */
+ if (size <= optmem_max &&
+ atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
+ atomic_add(size, &sk->sk_omem_alloc);
+ return 0;
+ }
+
+ return -ENOMEM;
+}
+
+static void bpf_sk_storage_uncharge(struct bpf_local_storage_map *smap,
+ void *owner, u32 size)
+{
+ struct sock *sk = owner;
+
+ atomic_sub(size, &sk->sk_omem_alloc);
+}
+
+static struct bpf_local_storage __rcu **
+bpf_sk_storage_ptr(void *owner)
+{
+ struct sock *sk = owner;
+
+ return &sk->sk_bpf_storage;
+}
+
+const struct bpf_map_ops sk_storage_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
+ .map_alloc_check = bpf_local_storage_map_alloc_check,
+ .map_alloc = bpf_sk_storage_map_alloc,
+ .map_free = bpf_sk_storage_map_free,
+ .map_get_next_key = notsupp_get_next_key,
+ .map_lookup_elem = bpf_fd_sk_storage_lookup_elem,
+ .map_update_elem = bpf_fd_sk_storage_update_elem,
+ .map_delete_elem = bpf_fd_sk_storage_delete_elem,
+ .map_check_btf = bpf_local_storage_map_check_btf,
+ .map_btf_id = &bpf_local_storage_map_btf_id[0],
+ .map_local_storage_charge = bpf_sk_storage_charge,
+ .map_local_storage_uncharge = bpf_sk_storage_uncharge,
+ .map_owner_storage_ptr = bpf_sk_storage_ptr,
+ .map_mem_usage = bpf_local_storage_map_mem_usage,
+};
+
+const struct bpf_func_proto bpf_sk_storage_get_proto = {
+ .func = bpf_sk_storage_get,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+ .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg4_type = ARG_ANYTHING,
+};
+
+const struct bpf_func_proto bpf_sk_storage_get_cg_sock_proto = {
+ .func = bpf_sk_storage_get,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_CTX, /* context is 'struct sock' */
+ .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg4_type = ARG_ANYTHING,
+};
+
+const struct bpf_func_proto bpf_sk_storage_delete_proto = {
+ .func = bpf_sk_storage_delete,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+};
+
+static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog)
+{
+ if (prog->aux->dst_prog)
+ return false;
+
+ /* Ensure the tracing program is not tracing
+ * any bpf_sk_storage*() function and also
+ * use the bpf_sk_storage_(get|delete) helper.
+ */
+ switch (prog->expected_attach_type) {
+ case BPF_TRACE_ITER:
+ case BPF_TRACE_RAW_TP:
+ /* bpf_sk_storage has no trace point */
+ return true;
+ case BPF_TRACE_FENTRY:
+ case BPF_TRACE_FEXIT:
+ return !!strncmp(prog->aux->attach_func_name, "bpf_sk_storage",
+ strlen("bpf_sk_storage"));
+ default:
+ return false;
+ }
+
+ return false;
+}
+
+/* *gfp_flags* is a hidden argument provided by the verifier */
+BPF_CALL_5(bpf_sk_storage_get_tracing, struct bpf_map *, map, struct sock *, sk,
+ void *, value, u64, flags, gfp_t, gfp_flags)
+{
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
+ if (in_hardirq() || in_nmi())
+ return (unsigned long)NULL;
+
+ return (unsigned long)____bpf_sk_storage_get(map, sk, value, flags,
+ gfp_flags);
+}
+
+BPF_CALL_2(bpf_sk_storage_delete_tracing, struct bpf_map *, map,
+ struct sock *, sk)
+{
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
+ if (in_hardirq() || in_nmi())
+ return -EPERM;
+
+ return ____bpf_sk_storage_delete(map, sk);
+}
+
+const struct bpf_func_proto bpf_sk_storage_get_tracing_proto = {
+ .func = bpf_sk_storage_get_tracing,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
+ .arg2_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON],
+ .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg4_type = ARG_ANYTHING,
+ .allowed = bpf_sk_storage_tracing_allowed,
+};
+
+const struct bpf_func_proto bpf_sk_storage_delete_tracing_proto = {
+ .func = bpf_sk_storage_delete_tracing,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
+ .arg2_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON],
+ .allowed = bpf_sk_storage_tracing_allowed,
+};
+
+struct bpf_sk_storage_diag {
+ u32 nr_maps;
+ struct bpf_map *maps[];
+};
+
+/* The reply will be like:
+ * INET_DIAG_BPF_SK_STORAGES (nla_nest)
+ * SK_DIAG_BPF_STORAGE (nla_nest)
+ * SK_DIAG_BPF_STORAGE_MAP_ID (nla_put_u32)
+ * SK_DIAG_BPF_STORAGE_MAP_VALUE (nla_reserve_64bit)
+ * SK_DIAG_BPF_STORAGE (nla_nest)
+ * SK_DIAG_BPF_STORAGE_MAP_ID (nla_put_u32)
+ * SK_DIAG_BPF_STORAGE_MAP_VALUE (nla_reserve_64bit)
+ * ....
+ */
+static int nla_value_size(u32 value_size)
+{
+ /* SK_DIAG_BPF_STORAGE (nla_nest)
+ * SK_DIAG_BPF_STORAGE_MAP_ID (nla_put_u32)
+ * SK_DIAG_BPF_STORAGE_MAP_VALUE (nla_reserve_64bit)
+ */
+ return nla_total_size(0) + nla_total_size(sizeof(u32)) +
+ nla_total_size_64bit(value_size);
+}
+
+void bpf_sk_storage_diag_free(struct bpf_sk_storage_diag *diag)
+{
+ u32 i;
+
+ if (!diag)
+ return;
+
+ for (i = 0; i < diag->nr_maps; i++)
+ bpf_map_put(diag->maps[i]);
+
+ kfree(diag);
+}
+EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_free);
+
+static bool diag_check_dup(const struct bpf_sk_storage_diag *diag,
+ const struct bpf_map *map)
+{
+ u32 i;
+
+ for (i = 0; i < diag->nr_maps; i++) {
+ if (diag->maps[i] == map)
+ return true;
+ }
+
+ return false;
+}
+
+struct bpf_sk_storage_diag *
+bpf_sk_storage_diag_alloc(const struct nlattr *nla_stgs)
+{
+ struct bpf_sk_storage_diag *diag;
+ struct nlattr *nla;
+ u32 nr_maps = 0;
+ int rem, err;
+
+ /* bpf_local_storage_map is currently limited to CAP_SYS_ADMIN as
+ * the map_alloc_check() side also does.
+ */
+ if (!bpf_capable())
+ return ERR_PTR(-EPERM);
+
+ nla_for_each_nested_type(nla, SK_DIAG_BPF_STORAGE_REQ_MAP_FD,
+ nla_stgs, rem) {
+ if (nla_len(nla) != sizeof(u32))
+ return ERR_PTR(-EINVAL);
+ nr_maps++;
+ }
+
+ diag = kzalloc(struct_size(diag, maps, nr_maps), GFP_KERNEL);
+ if (!diag)
+ return ERR_PTR(-ENOMEM);
+
+ nla_for_each_nested_type(nla, SK_DIAG_BPF_STORAGE_REQ_MAP_FD,
+ nla_stgs, rem) {
+ int map_fd = nla_get_u32(nla);
+ struct bpf_map *map = bpf_map_get(map_fd);
+
+ if (IS_ERR(map)) {
+ err = PTR_ERR(map);
+ goto err_free;
+ }
+ if (map->map_type != BPF_MAP_TYPE_SK_STORAGE) {
+ bpf_map_put(map);
+ err = -EINVAL;
+ goto err_free;
+ }
+ if (diag_check_dup(diag, map)) {
+ bpf_map_put(map);
+ err = -EEXIST;
+ goto err_free;
+ }
+ diag->maps[diag->nr_maps++] = map;
+ }
+
+ return diag;
+
+err_free:
+ bpf_sk_storage_diag_free(diag);
+ return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_alloc);
+
+static int diag_get(struct bpf_local_storage_data *sdata, struct sk_buff *skb)
+{
+ struct nlattr *nla_stg, *nla_value;
+ struct bpf_local_storage_map *smap;
+
+ /* It cannot exceed max nlattr's payload */
+ BUILD_BUG_ON(U16_MAX - NLA_HDRLEN < BPF_LOCAL_STORAGE_MAX_VALUE_SIZE);
+
+ nla_stg = nla_nest_start(skb, SK_DIAG_BPF_STORAGE);
+ if (!nla_stg)
+ return -EMSGSIZE;
+
+ smap = rcu_dereference(sdata->smap);
+ if (nla_put_u32(skb, SK_DIAG_BPF_STORAGE_MAP_ID, smap->map.id))
+ goto errout;
+
+ nla_value = nla_reserve_64bit(skb, SK_DIAG_BPF_STORAGE_MAP_VALUE,
+ smap->map.value_size,
+ SK_DIAG_BPF_STORAGE_PAD);
+ if (!nla_value)
+ goto errout;
+
+ if (btf_record_has_field(smap->map.record, BPF_SPIN_LOCK))
+ copy_map_value_locked(&smap->map, nla_data(nla_value),
+ sdata->data, true);
+ else
+ copy_map_value(&smap->map, nla_data(nla_value), sdata->data);
+
+ nla_nest_end(skb, nla_stg);
+ return 0;
+
+errout:
+ nla_nest_cancel(skb, nla_stg);
+ return -EMSGSIZE;
+}
+
+static int bpf_sk_storage_diag_put_all(struct sock *sk, struct sk_buff *skb,
+ int stg_array_type,
+ unsigned int *res_diag_size)
+{
+ /* stg_array_type (e.g. INET_DIAG_BPF_SK_STORAGES) */
+ unsigned int diag_size = nla_total_size(0);
+ struct bpf_local_storage *sk_storage;
+ struct bpf_local_storage_elem *selem;
+ struct bpf_local_storage_map *smap;
+ struct nlattr *nla_stgs;
+ unsigned int saved_len;
+ int err = 0;
+
+ rcu_read_lock();
+
+ sk_storage = rcu_dereference(sk->sk_bpf_storage);
+ if (!sk_storage || hlist_empty(&sk_storage->list)) {
+ rcu_read_unlock();
+ return 0;
+ }
+
+ nla_stgs = nla_nest_start(skb, stg_array_type);
+ if (!nla_stgs)
+ /* Continue to learn diag_size */
+ err = -EMSGSIZE;
+
+ saved_len = skb->len;
+ hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) {
+ smap = rcu_dereference(SDATA(selem)->smap);
+ diag_size += nla_value_size(smap->map.value_size);
+
+ if (nla_stgs && diag_get(SDATA(selem), skb))
+ /* Continue to learn diag_size */
+ err = -EMSGSIZE;
+ }
+
+ rcu_read_unlock();
+
+ if (nla_stgs) {
+ if (saved_len == skb->len)
+ nla_nest_cancel(skb, nla_stgs);
+ else
+ nla_nest_end(skb, nla_stgs);
+ }
+
+ if (diag_size == nla_total_size(0)) {
+ *res_diag_size = 0;
+ return 0;
+ }
+
+ *res_diag_size = diag_size;
+ return err;
+}
+
+int bpf_sk_storage_diag_put(struct bpf_sk_storage_diag *diag,
+ struct sock *sk, struct sk_buff *skb,
+ int stg_array_type,
+ unsigned int *res_diag_size)
+{
+ /* stg_array_type (e.g. INET_DIAG_BPF_SK_STORAGES) */
+ unsigned int diag_size = nla_total_size(0);
+ struct bpf_local_storage *sk_storage;
+ struct bpf_local_storage_data *sdata;
+ struct nlattr *nla_stgs;
+ unsigned int saved_len;
+ int err = 0;
+ u32 i;
+
+ *res_diag_size = 0;
+
+ /* No map has been specified. Dump all. */
+ if (!diag->nr_maps)
+ return bpf_sk_storage_diag_put_all(sk, skb, stg_array_type,
+ res_diag_size);
+
+ rcu_read_lock();
+ sk_storage = rcu_dereference(sk->sk_bpf_storage);
+ if (!sk_storage || hlist_empty(&sk_storage->list)) {
+ rcu_read_unlock();
+ return 0;
+ }
+
+ nla_stgs = nla_nest_start(skb, stg_array_type);
+ if (!nla_stgs)
+ /* Continue to learn diag_size */
+ err = -EMSGSIZE;
+
+ saved_len = skb->len;
+ for (i = 0; i < diag->nr_maps; i++) {
+ sdata = bpf_local_storage_lookup(sk_storage,
+ (struct bpf_local_storage_map *)diag->maps[i],
+ false);
+
+ if (!sdata)
+ continue;
+
+ diag_size += nla_value_size(diag->maps[i]->value_size);
+
+ if (nla_stgs && diag_get(sdata, skb))
+ /* Continue to learn diag_size */
+ err = -EMSGSIZE;
+ }
+ rcu_read_unlock();
+
+ if (nla_stgs) {
+ if (saved_len == skb->len)
+ nla_nest_cancel(skb, nla_stgs);
+ else
+ nla_nest_end(skb, nla_stgs);
+ }
+
+ if (diag_size == nla_total_size(0)) {
+ *res_diag_size = 0;
+ return 0;
+ }
+
+ *res_diag_size = diag_size;
+ return err;
+}
+EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_put);
+
+struct bpf_iter_seq_sk_storage_map_info {
+ struct bpf_map *map;
+ unsigned int bucket_id;
+ unsigned skip_elems;
+};
+
+static struct bpf_local_storage_elem *
+bpf_sk_storage_map_seq_find_next(struct bpf_iter_seq_sk_storage_map_info *info,
+ struct bpf_local_storage_elem *prev_selem)
+ __acquires(RCU) __releases(RCU)
+{
+ struct bpf_local_storage *sk_storage;
+ struct bpf_local_storage_elem *selem;
+ u32 skip_elems = info->skip_elems;
+ struct bpf_local_storage_map *smap;
+ u32 bucket_id = info->bucket_id;
+ u32 i, count, n_buckets;
+ struct bpf_local_storage_map_bucket *b;
+
+ smap = (struct bpf_local_storage_map *)info->map;
+ n_buckets = 1U << smap->bucket_log;
+ if (bucket_id >= n_buckets)
+ return NULL;
+
+ /* try to find next selem in the same bucket */
+ selem = prev_selem;
+ count = 0;
+ while (selem) {
+ selem = hlist_entry_safe(rcu_dereference(hlist_next_rcu(&selem->map_node)),
+ struct bpf_local_storage_elem, map_node);
+ if (!selem) {
+ /* not found, unlock and go to the next bucket */
+ b = &smap->buckets[bucket_id++];
+ rcu_read_unlock();
+ skip_elems = 0;
+ break;
+ }
+ sk_storage = rcu_dereference(selem->local_storage);
+ if (sk_storage) {
+ info->skip_elems = skip_elems + count;
+ return selem;
+ }
+ count++;
+ }
+
+ for (i = bucket_id; i < (1U << smap->bucket_log); i++) {
+ b = &smap->buckets[i];
+ rcu_read_lock();
+ count = 0;
+ hlist_for_each_entry_rcu(selem, &b->list, map_node) {
+ sk_storage = rcu_dereference(selem->local_storage);
+ if (sk_storage && count >= skip_elems) {
+ info->bucket_id = i;
+ info->skip_elems = count;
+ return selem;
+ }
+ count++;
+ }
+ rcu_read_unlock();
+ skip_elems = 0;
+ }
+
+ info->bucket_id = i;
+ info->skip_elems = 0;
+ return NULL;
+}
+
+static void *bpf_sk_storage_map_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct bpf_local_storage_elem *selem;
+
+ selem = bpf_sk_storage_map_seq_find_next(seq->private, NULL);
+ if (!selem)
+ return NULL;
+
+ if (*pos == 0)
+ ++*pos;
+ return selem;
+}
+
+static void *bpf_sk_storage_map_seq_next(struct seq_file *seq, void *v,
+ loff_t *pos)
+{
+ struct bpf_iter_seq_sk_storage_map_info *info = seq->private;
+
+ ++*pos;
+ ++info->skip_elems;
+ return bpf_sk_storage_map_seq_find_next(seq->private, v);
+}
+
+struct bpf_iter__bpf_sk_storage_map {
+ __bpf_md_ptr(struct bpf_iter_meta *, meta);
+ __bpf_md_ptr(struct bpf_map *, map);
+ __bpf_md_ptr(struct sock *, sk);
+ __bpf_md_ptr(void *, value);
+};
+
+DEFINE_BPF_ITER_FUNC(bpf_sk_storage_map, struct bpf_iter_meta *meta,
+ struct bpf_map *map, struct sock *sk,
+ void *value)
+
+static int __bpf_sk_storage_map_seq_show(struct seq_file *seq,
+ struct bpf_local_storage_elem *selem)
+{
+ struct bpf_iter_seq_sk_storage_map_info *info = seq->private;
+ struct bpf_iter__bpf_sk_storage_map ctx = {};
+ struct bpf_local_storage *sk_storage;
+ struct bpf_iter_meta meta;
+ struct bpf_prog *prog;
+ int ret = 0;
+
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, selem == NULL);
+ if (prog) {
+ ctx.meta = &meta;
+ ctx.map = info->map;
+ if (selem) {
+ sk_storage = rcu_dereference(selem->local_storage);
+ ctx.sk = sk_storage->owner;
+ ctx.value = SDATA(selem)->data;
+ }
+ ret = bpf_iter_run_prog(prog, &ctx);
+ }
+
+ return ret;
+}
+
+static int bpf_sk_storage_map_seq_show(struct seq_file *seq, void *v)
+{
+ return __bpf_sk_storage_map_seq_show(seq, v);
+}
+
+static void bpf_sk_storage_map_seq_stop(struct seq_file *seq, void *v)
+ __releases(RCU)
+{
+ if (!v)
+ (void)__bpf_sk_storage_map_seq_show(seq, v);
+ else
+ rcu_read_unlock();
+}
+
+static int bpf_iter_init_sk_storage_map(void *priv_data,
+ struct bpf_iter_aux_info *aux)
+{
+ struct bpf_iter_seq_sk_storage_map_info *seq_info = priv_data;
+
+ bpf_map_inc_with_uref(aux->map);
+ seq_info->map = aux->map;
+ return 0;
+}
+
+static void bpf_iter_fini_sk_storage_map(void *priv_data)
+{
+ struct bpf_iter_seq_sk_storage_map_info *seq_info = priv_data;
+
+ bpf_map_put_with_uref(seq_info->map);
+}
+
+static int bpf_iter_attach_map(struct bpf_prog *prog,
+ union bpf_iter_link_info *linfo,
+ struct bpf_iter_aux_info *aux)
+{
+ struct bpf_map *map;
+ int err = -EINVAL;
+
+ if (!linfo->map.map_fd)
+ return -EBADF;
+
+ map = bpf_map_get_with_uref(linfo->map.map_fd);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ if (map->map_type != BPF_MAP_TYPE_SK_STORAGE)
+ goto put_map;
+
+ if (prog->aux->max_rdwr_access > map->value_size) {
+ err = -EACCES;
+ goto put_map;
+ }
+
+ aux->map = map;
+ return 0;
+
+put_map:
+ bpf_map_put_with_uref(map);
+ return err;
+}
+
+static void bpf_iter_detach_map(struct bpf_iter_aux_info *aux)
+{
+ bpf_map_put_with_uref(aux->map);
+}
+
+static const struct seq_operations bpf_sk_storage_map_seq_ops = {
+ .start = bpf_sk_storage_map_seq_start,
+ .next = bpf_sk_storage_map_seq_next,
+ .stop = bpf_sk_storage_map_seq_stop,
+ .show = bpf_sk_storage_map_seq_show,
+};
+
+static const struct bpf_iter_seq_info iter_seq_info = {
+ .seq_ops = &bpf_sk_storage_map_seq_ops,
+ .init_seq_private = bpf_iter_init_sk_storage_map,
+ .fini_seq_private = bpf_iter_fini_sk_storage_map,
+ .seq_priv_size = sizeof(struct bpf_iter_seq_sk_storage_map_info),
+};
+
+static struct bpf_iter_reg bpf_sk_storage_map_reg_info = {
+ .target = "bpf_sk_storage_map",
+ .attach_target = bpf_iter_attach_map,
+ .detach_target = bpf_iter_detach_map,
+ .show_fdinfo = bpf_iter_map_show_fdinfo,
+ .fill_link_info = bpf_iter_map_fill_link_info,
+ .ctx_arg_info_size = 2,
+ .ctx_arg_info = {
+ { offsetof(struct bpf_iter__bpf_sk_storage_map, sk),
+ PTR_TO_BTF_ID_OR_NULL },
+ { offsetof(struct bpf_iter__bpf_sk_storage_map, value),
+ PTR_TO_BUF | PTR_MAYBE_NULL },
+ },
+ .seq_info = &iter_seq_info,
+};
+
+static int __init bpf_sk_storage_map_iter_init(void)
+{
+ bpf_sk_storage_map_reg_info.ctx_arg_info[0].btf_id =
+ btf_sock_ids[BTF_SOCK_TYPE_SOCK];
+ return bpf_iter_reg_target(&bpf_sk_storage_map_reg_info);
+}
+late_initcall(bpf_sk_storage_map_iter_init);
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 797fdd4352ce..c285c6465923 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* SUCS NET3:
*
@@ -9,7 +10,7 @@
* identical recvmsg() code. So we share it here. The poll was
* shared before but buried in udp.c so I moved it.
*
- * Authors: Alan Cox <alan@redhat.com>. (datagram_poll() from old
+ * Authors: Alan Cox <alan@lxorguk.ukuu.org.uk>. (datagram_poll() from old
* udp.c code)
*
* Fixes:
@@ -36,8 +37,7 @@
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
-#include <asm/uaccess.h>
-#include <asm/system.h>
+#include <linux/uaccess.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/errno.h>
@@ -48,6 +48,11 @@
#include <linux/poll.h>
#include <linux/highmem.h>
#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <linux/iov_iter.h>
+#include <linux/indirect_call_wrapper.h>
+#include <linux/crc32.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
@@ -55,6 +60,10 @@
#include <net/checksum.h>
#include <net/sock.h>
#include <net/tcp_states.h>
+#include <trace/events/skb.h>
+#include <net/busy_poll.h>
+
+#include "devmem.h"
/*
* Is a socket 'connection oriented' ?
@@ -64,22 +73,34 @@ static inline int connection_based(struct sock *sk)
return sk->sk_type == SOCK_SEQPACKET || sk->sk_type == SOCK_STREAM;
}
+static int receiver_wake_function(wait_queue_entry_t *wait, unsigned int mode, int sync,
+ void *key)
+{
+ /*
+ * Avoid a wakeup if event not interesting for us
+ */
+ if (key && !(key_to_poll(key) & (EPOLLIN | EPOLLERR)))
+ return 0;
+ return autoremove_wake_function(wait, mode, sync, key);
+}
/*
- * Wait for a packet..
+ * Wait for the last received packet to be different from skb
*/
-static int wait_for_packet(struct sock *sk, int *err, long *timeo_p)
+int __skb_wait_for_more_packets(struct sock *sk, struct sk_buff_head *queue,
+ int *err, long *timeo_p,
+ const struct sk_buff *skb)
{
int error;
- DEFINE_WAIT(wait);
+ DEFINE_WAIT_FUNC(wait, receiver_wake_function);
- prepare_to_wait_exclusive(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+ prepare_to_wait_exclusive(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
/* Socket errors? */
error = sock_error(sk);
if (error)
goto out_err;
- if (!skb_queue_empty(&sk->sk_receive_queue))
+ if (READ_ONCE(queue->prev) != skb)
goto out;
/* Socket shut down? */
@@ -101,7 +122,7 @@ static int wait_for_packet(struct sock *sk, int *err, long *timeo_p)
error = 0;
*timeo_p = schedule_timeout(*timeo_p);
out:
- finish_wait(sk->sk_sleep, &wait);
+ finish_wait(sk_sleep(sk), &wait);
return error;
interrupted:
error = sock_intr_errno(*timeo_p);
@@ -113,13 +134,86 @@ out_noerr:
error = 1;
goto out;
}
+EXPORT_SYMBOL(__skb_wait_for_more_packets);
+
+static struct sk_buff *skb_set_peeked(struct sk_buff *skb)
+{
+ struct sk_buff *nskb;
+
+ if (skb->peeked)
+ return skb;
+
+ /* We have to unshare an skb before modifying it. */
+ if (!skb_shared(skb))
+ goto done;
+
+ nskb = skb_clone(skb, GFP_ATOMIC);
+ if (!nskb)
+ return ERR_PTR(-ENOMEM);
+
+ skb->prev->next = nskb;
+ skb->next->prev = nskb;
+ nskb->prev = skb->prev;
+ nskb->next = skb->next;
+
+ consume_skb(skb);
+ skb = nskb;
+
+done:
+ skb->peeked = 1;
+
+ return skb;
+}
+
+struct sk_buff *__skb_try_recv_from_queue(struct sk_buff_head *queue,
+ unsigned int flags,
+ int *off, int *err,
+ struct sk_buff **last)
+{
+ bool peek_at_off = false;
+ struct sk_buff *skb;
+ int _off = 0;
+
+ if (unlikely(flags & MSG_PEEK && *off >= 0)) {
+ peek_at_off = true;
+ _off = *off;
+ }
+
+ *last = queue->prev;
+ skb_queue_walk(queue, skb) {
+ if (flags & MSG_PEEK) {
+ if (peek_at_off && _off >= skb->len &&
+ (_off || skb->peeked)) {
+ _off -= skb->len;
+ continue;
+ }
+ if (!skb->len) {
+ skb = skb_set_peeked(skb);
+ if (IS_ERR(skb)) {
+ *err = PTR_ERR(skb);
+ return NULL;
+ }
+ }
+ refcount_inc(&skb->users);
+ } else {
+ __skb_unlink(skb, queue);
+ }
+ *off = _off;
+ return skb;
+ }
+ return NULL;
+}
/**
- * skb_recv_datagram - Receive a datagram skbuff
+ * __skb_try_recv_datagram - Receive a datagram skbuff
* @sk: socket
- * @flags: MSG_ flags
- * @noblock: blocking operation?
+ * @queue: socket queue from which to receive
+ * @flags: MSG\_ flags
+ * @off: an offset in bytes to peek skb from. Returns an offset
+ * within an skb where data actually starts
* @err: error code returned
+ * @last: set to last peeked message to inform the wait function
+ * what to look for when peeking
*
* Get a datagram skbuff, understands the peeking, nonblocking wakeups
* and possible races. This replaces identical code in packet, raw and
@@ -127,9 +221,11 @@ out_noerr:
* the long standing peek and read race for datagram sockets. If you
* alter this routine remember it must be re-entrant.
*
- * This function will lock the socket if a skb is returned, so the caller
- * needs to unlock the socket in that case (usually by calling
- * skb_free_datagram)
+ * This function will lock the socket if a skb is returned, so
+ * the caller needs to unlock the socket in that case (usually by
+ * calling skb_free_datagram). Returns NULL with @err set to
+ * -EAGAIN if no data was available or to some other value if an
+ * error was detected.
*
* * It does not lock socket since today. This function is
* * free of race conditions. This measure should/can improve
@@ -143,11 +239,13 @@ out_noerr:
* quite explicitly by POSIX 1003.1g, don't change them without having
* the standard around please.
*/
-struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags,
- int noblock, int *err)
+struct sk_buff *__skb_try_recv_datagram(struct sock *sk,
+ struct sk_buff_head *queue,
+ unsigned int flags, int *off, int *err,
+ struct sk_buff **last)
{
struct sk_buff *skb;
- long timeo;
+ unsigned long cpu_flags;
/*
* Caller is allowed not to check sk->sk_err before skb_recv_datagram()
*/
@@ -156,55 +254,107 @@ struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags,
if (error)
goto no_packet;
- timeo = sock_rcvtimeo(sk, noblock);
-
do {
/* Again only user level code calls this function, so nothing
* interrupt level will suddenly eat the receive_queue.
*
* Look at current nfs client by the way...
- * However, this function was corrent in any case. 8)
+ * However, this function was correct in any case. 8)
*/
- if (flags & MSG_PEEK) {
- unsigned long cpu_flags;
-
- spin_lock_irqsave(&sk->sk_receive_queue.lock,
- cpu_flags);
- skb = skb_peek(&sk->sk_receive_queue);
- if (skb)
- atomic_inc(&skb->users);
- spin_unlock_irqrestore(&sk->sk_receive_queue.lock,
- cpu_flags);
- } else
- skb = skb_dequeue(&sk->sk_receive_queue);
-
+ spin_lock_irqsave(&queue->lock, cpu_flags);
+ skb = __skb_try_recv_from_queue(queue, flags, off, &error,
+ last);
+ spin_unlock_irqrestore(&queue->lock, cpu_flags);
+ if (error)
+ goto no_packet;
if (skb)
return skb;
- /* User doesn't want to wait */
- error = -EAGAIN;
- if (!timeo)
- goto no_packet;
+ if (!sk_can_busy_loop(sk))
+ break;
- } while (!wait_for_packet(sk, err, &timeo));
+ sk_busy_loop(sk, flags & MSG_DONTWAIT);
+ } while (READ_ONCE(queue->prev) != *last);
- return NULL;
+ error = -EAGAIN;
no_packet:
*err = error;
return NULL;
}
+EXPORT_SYMBOL(__skb_try_recv_datagram);
+
+struct sk_buff *__skb_recv_datagram(struct sock *sk,
+ struct sk_buff_head *sk_queue,
+ unsigned int flags, int *off, int *err)
+{
+ struct sk_buff *skb, *last;
+ long timeo;
+
+ timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+
+ do {
+ skb = __skb_try_recv_datagram(sk, sk_queue, flags, off, err,
+ &last);
+ if (skb)
+ return skb;
+
+ if (*err != -EAGAIN)
+ break;
+ } while (timeo &&
+ !__skb_wait_for_more_packets(sk, sk_queue, err,
+ &timeo, last));
+
+ return NULL;
+}
+EXPORT_SYMBOL(__skb_recv_datagram);
+
+struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned int flags,
+ int *err)
+{
+ int off = 0;
+
+ return __skb_recv_datagram(sk, &sk->sk_receive_queue, flags,
+ &off, err);
+}
+EXPORT_SYMBOL(skb_recv_datagram);
void skb_free_datagram(struct sock *sk, struct sk_buff *skb)
{
- kfree_skb(skb);
+ consume_skb(skb);
+}
+EXPORT_SYMBOL(skb_free_datagram);
+
+int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue,
+ struct sk_buff *skb, unsigned int flags,
+ void (*destructor)(struct sock *sk,
+ struct sk_buff *skb))
+{
+ int err = 0;
+
+ if (flags & MSG_PEEK) {
+ err = -ENOENT;
+ spin_lock_bh(&sk_queue->lock);
+ if (skb->next) {
+ __skb_unlink(skb, sk_queue);
+ refcount_dec(&skb->users);
+ if (destructor)
+ destructor(sk, skb);
+ err = 0;
+ }
+ spin_unlock_bh(&sk_queue->lock);
+ }
+
+ sk_drops_inc(sk);
+ return err;
}
+EXPORT_SYMBOL(__sk_queue_drop_skb);
/**
* skb_kill_datagram - Free a datagram skbuff forcibly
* @sk: socket
* @skb: datagram skbuff
- * @flags: MSG_ flags
+ * @flags: MSG\_ flags
*
* This function frees a datagram skbuff that was received by
* skb_recv_datagram. The flags argument must match the one
@@ -217,192 +367,248 @@ void skb_free_datagram(struct sock *sk, struct sk_buff *skb)
* This function currently only disables BH when acquiring the
* sk_receive_queue lock. Therefore it must not be used in a
* context where that lock is acquired in an IRQ context.
+ *
+ * It returns 0 if the packet was removed by us.
*/
-void skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags)
+int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags)
{
- if (flags & MSG_PEEK) {
- spin_lock_bh(&sk->sk_receive_queue.lock);
- if (skb == skb_peek(&sk->sk_receive_queue)) {
- __skb_unlink(skb, &sk->sk_receive_queue);
- atomic_dec(&skb->users);
- }
- spin_unlock_bh(&sk->sk_receive_queue.lock);
- }
+ int err = __sk_queue_drop_skb(sk, &sk->sk_receive_queue, skb, flags,
+ NULL);
kfree_skb(skb);
+ return err;
}
-
EXPORT_SYMBOL(skb_kill_datagram);
-/**
- * skb_copy_datagram_iovec - Copy a datagram to an iovec.
- * @skb: buffer to copy
- * @offset: offset in the buffer to start copying from
- * @to: io vector to copy to
- * @len: amount of data to copy from buffer to iovec
- *
- * Note: the iovec is modified during the copy.
- */
-int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset,
- struct iovec *to, int len)
+INDIRECT_CALLABLE_DECLARE(static size_t simple_copy_to_iter(const void *addr,
+ size_t bytes,
+ void *data __always_unused,
+ struct iov_iter *i));
+
+static int __skb_datagram_iter(const struct sk_buff *skb, int offset,
+ struct iov_iter *to, int len, bool fault_short,
+ size_t (*cb)(const void *, size_t, void *,
+ struct iov_iter *), void *data)
{
int start = skb_headlen(skb);
- int i, copy = start - offset;
+ int i, copy = start - offset, start_off = offset, n;
+ struct sk_buff *frag_iter;
/* Copy header. */
if (copy > 0) {
if (copy > len)
copy = len;
- if (memcpy_toiovec(to, skb->data + offset, copy))
- goto fault;
+ n = INDIRECT_CALL_1(cb, simple_copy_to_iter,
+ skb->data + offset, copy, data, to);
+ offset += n;
+ if (n != copy)
+ goto short_copy;
if ((len -= copy) == 0)
return 0;
- offset += copy;
}
+ if (!skb_frags_readable(skb))
+ goto short_copy;
+
/* Copy paged appendix. Hmm... why does this look so complicated? */
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
int end;
+ const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
- BUG_TRAP(start <= offset + len);
+ WARN_ON(start > offset + len);
- end = start + skb_shinfo(skb)->frags[i].size;
+ end = start + skb_frag_size(frag);
if ((copy = end - offset) > 0) {
- int err;
- u8 *vaddr;
- skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
- struct page *page = frag->page;
+ u32 p_off, p_len, copied;
+ struct page *p;
+ u8 *vaddr;
if (copy > len)
copy = len;
- vaddr = kmap(page);
- err = memcpy_toiovec(to, vaddr + frag->page_offset +
- offset - start, copy);
- kunmap(page);
- if (err)
- goto fault;
+
+ n = 0;
+ skb_frag_foreach_page(frag,
+ skb_frag_off(frag) + offset - start,
+ copy, p, p_off, p_len, copied) {
+ vaddr = kmap_local_page(p);
+ n += INDIRECT_CALL_1(cb, simple_copy_to_iter,
+ vaddr + p_off, p_len, data, to);
+ kunmap_local(vaddr);
+ }
+
+ offset += n;
+ if (n != copy)
+ goto short_copy;
if (!(len -= copy))
return 0;
- offset += copy;
}
start = end;
}
- if (skb_shinfo(skb)->frag_list) {
- struct sk_buff *list = skb_shinfo(skb)->frag_list;
-
- for (; list; list = list->next) {
- int end;
-
- BUG_TRAP(start <= offset + len);
-
- end = start + list->len;
- if ((copy = end - offset) > 0) {
- if (copy > len)
- copy = len;
- if (skb_copy_datagram_iovec(list,
- offset - start,
- to, copy))
- goto fault;
- if ((len -= copy) == 0)
- return 0;
- offset += copy;
- }
- start = end;
+ skb_walk_frags(skb, frag_iter) {
+ int end;
+
+ WARN_ON(start > offset + len);
+
+ end = start + frag_iter->len;
+ if ((copy = end - offset) > 0) {
+ if (copy > len)
+ copy = len;
+ if (__skb_datagram_iter(frag_iter, offset - start,
+ to, copy, fault_short, cb, data))
+ goto fault;
+ if ((len -= copy) == 0)
+ return 0;
+ offset += copy;
}
+ start = end;
}
if (!len)
return 0;
+ /* This is not really a user copy fault, but rather someone
+ * gave us a bogus length on the skb. We should probably
+ * print a warning here as it may indicate a kernel bug.
+ */
+
fault:
+ iov_iter_revert(to, offset - start_off);
return -EFAULT;
+
+short_copy:
+ if (fault_short || iov_iter_count(to))
+ goto fault;
+
+ return 0;
}
-static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
- u8 __user *to, int len,
- __wsum *csump)
+#ifdef CONFIG_NET_CRC32C
+static size_t crc32c_and_copy_to_iter(const void *addr, size_t bytes,
+ void *_crcp, struct iov_iter *i)
+{
+ u32 *crcp = _crcp;
+ size_t copied;
+
+ copied = copy_to_iter(addr, bytes, i);
+ *crcp = crc32c(*crcp, addr, copied);
+ return copied;
+}
+
+/**
+ * skb_copy_and_crc32c_datagram_iter - Copy datagram to an iovec iterator
+ * and update a CRC32C value.
+ * @skb: buffer to copy
+ * @offset: offset in the buffer to start copying from
+ * @to: iovec iterator to copy to
+ * @len: amount of data to copy from buffer to iovec
+ * @crcp: pointer to CRC32C value to update
+ *
+ * Return: 0 on success, -EFAULT if there was a fault during copy.
+ */
+int skb_copy_and_crc32c_datagram_iter(const struct sk_buff *skb, int offset,
+ struct iov_iter *to, int len, u32 *crcp)
+{
+ return __skb_datagram_iter(skb, offset, to, len, true,
+ crc32c_and_copy_to_iter, crcp);
+}
+EXPORT_SYMBOL(skb_copy_and_crc32c_datagram_iter);
+#endif /* CONFIG_NET_CRC32C */
+
+static size_t simple_copy_to_iter(const void *addr, size_t bytes,
+ void *data __always_unused, struct iov_iter *i)
+{
+ return copy_to_iter(addr, bytes, i);
+}
+
+/**
+ * skb_copy_datagram_iter - Copy a datagram to an iovec iterator.
+ * @skb: buffer to copy
+ * @offset: offset in the buffer to start copying from
+ * @to: iovec iterator to copy to
+ * @len: amount of data to copy from buffer to iovec
+ */
+int skb_copy_datagram_iter(const struct sk_buff *skb, int offset,
+ struct iov_iter *to, int len)
+{
+ trace_skb_copy_datagram_iovec(skb, len);
+ return __skb_datagram_iter(skb, offset, to, len, false,
+ simple_copy_to_iter, NULL);
+}
+EXPORT_SYMBOL(skb_copy_datagram_iter);
+
+/**
+ * skb_copy_datagram_from_iter - Copy a datagram from an iov_iter.
+ * @skb: buffer to copy
+ * @offset: offset in the buffer to start copying to
+ * @from: the copy source
+ * @len: amount of data to copy to buffer from iovec
+ *
+ * Returns 0 or -EFAULT.
+ */
+int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset,
+ struct iov_iter *from,
+ int len)
{
int start = skb_headlen(skb);
- int pos = 0;
int i, copy = start - offset;
+ struct sk_buff *frag_iter;
/* Copy header. */
if (copy > 0) {
- int err = 0;
if (copy > len)
copy = len;
- *csump = csum_and_copy_to_user(skb->data + offset, to, copy,
- *csump, &err);
- if (err)
+ if (copy_from_iter(skb->data + offset, copy, from) != copy)
goto fault;
if ((len -= copy) == 0)
return 0;
offset += copy;
- to += copy;
- pos = copy;
}
+ /* Copy paged appendix. Hmm... why does this look so complicated? */
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
int end;
+ const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
- BUG_TRAP(start <= offset + len);
+ WARN_ON(start > offset + len);
- end = start + skb_shinfo(skb)->frags[i].size;
+ end = start + skb_frag_size(frag);
if ((copy = end - offset) > 0) {
- __wsum csum2;
- int err = 0;
- u8 *vaddr;
- skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
- struct page *page = frag->page;
+ size_t copied;
if (copy > len)
copy = len;
- vaddr = kmap(page);
- csum2 = csum_and_copy_to_user(vaddr +
- frag->page_offset +
- offset - start,
- to, copy, 0, &err);
- kunmap(page);
- if (err)
+ copied = copy_page_from_iter(skb_frag_page(frag),
+ skb_frag_off(frag) + offset - start,
+ copy, from);
+ if (copied != copy)
goto fault;
- *csump = csum_block_add(*csump, csum2, pos);
+
if (!(len -= copy))
return 0;
offset += copy;
- to += copy;
- pos += copy;
}
start = end;
}
- if (skb_shinfo(skb)->frag_list) {
- struct sk_buff *list = skb_shinfo(skb)->frag_list;
-
- for (; list; list=list->next) {
- int end;
-
- BUG_TRAP(start <= offset + len);
-
- end = start + list->len;
- if ((copy = end - offset) > 0) {
- __wsum csum2 = 0;
- if (copy > len)
- copy = len;
- if (skb_copy_and_csum_datagram(list,
- offset - start,
- to, copy,
- &csum2))
- goto fault;
- *csump = csum_block_add(*csump, csum2, pos);
- if ((len -= copy) == 0)
- return 0;
- offset += copy;
- to += copy;
- pos += copy;
- }
- start = end;
+ skb_walk_frags(skb, frag_iter) {
+ int end;
+
+ WARN_ON(start > offset + len);
+
+ end = start + frag_iter->len;
+ if ((copy = end - offset) > 0) {
+ if (copy > len)
+ copy = len;
+ if (skb_copy_datagram_from_iter(frag_iter,
+ offset - start,
+ from, copy))
+ goto fault;
+ if ((len -= copy) == 0)
+ return 0;
+ offset += copy;
}
+ start = end;
}
if (!len)
return 0;
@@ -410,126 +616,390 @@ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
fault:
return -EFAULT;
}
+EXPORT_SYMBOL(skb_copy_datagram_from_iter);
-__sum16 __skb_checksum_complete(struct sk_buff *skb)
+int skb_copy_datagram_from_iter_full(struct sk_buff *skb, int offset,
+ struct iov_iter *from, int len)
{
- __sum16 sum;
+ struct iov_iter_state state;
+ int ret;
+
+ iov_iter_save_state(from, &state);
+ ret = skb_copy_datagram_from_iter(skb, offset, from, len);
+ if (ret)
+ iov_iter_restore(from, &state);
+ return ret;
+}
+EXPORT_SYMBOL(skb_copy_datagram_from_iter_full);
+
+int zerocopy_fill_skb_from_iter(struct sk_buff *skb,
+ struct iov_iter *from, size_t length)
+{
+ int frag = skb_shinfo(skb)->nr_frags;
+
+ if (!skb_frags_readable(skb))
+ return -EFAULT;
- sum = csum_fold(skb_checksum(skb, 0, skb->len, skb->csum));
- if (likely(!sum)) {
- if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE))
- netdev_rx_csum_fault(skb->dev);
- skb->ip_summed = CHECKSUM_UNNECESSARY;
+ while (length && iov_iter_count(from)) {
+ struct page *head, *last_head = NULL;
+ struct page *pages[MAX_SKB_FRAGS];
+ int refs, order, n = 0;
+ size_t start;
+ ssize_t copied;
+
+ if (frag == MAX_SKB_FRAGS)
+ return -EMSGSIZE;
+
+ copied = iov_iter_get_pages2(from, pages, length,
+ MAX_SKB_FRAGS - frag, &start);
+ if (copied < 0)
+ return -EFAULT;
+
+ length -= copied;
+
+ skb->data_len += copied;
+ skb->len += copied;
+ skb->truesize += PAGE_ALIGN(copied + start);
+
+ head = compound_head(pages[n]);
+ order = compound_order(head);
+
+ for (refs = 0; copied != 0; start = 0) {
+ int size = min_t(int, copied, PAGE_SIZE - start);
+
+ if (pages[n] - head > (1UL << order) - 1) {
+ head = compound_head(pages[n]);
+ order = compound_order(head);
+ }
+
+ start += (pages[n] - head) << PAGE_SHIFT;
+ copied -= size;
+ n++;
+ if (frag) {
+ skb_frag_t *last = &skb_shinfo(skb)->frags[frag - 1];
+
+ if (head == skb_frag_page(last) &&
+ start == skb_frag_off(last) + skb_frag_size(last)) {
+ skb_frag_size_add(last, size);
+ /* We combined this page, we need to release
+ * a reference. Since compound pages refcount
+ * is shared among many pages, batch the refcount
+ * adjustments to limit false sharing.
+ */
+ last_head = head;
+ refs++;
+ continue;
+ }
+ }
+ if (refs) {
+ page_ref_sub(last_head, refs);
+ refs = 0;
+ }
+ skb_fill_page_desc_noacc(skb, frag++, head, start, size);
+ }
+ if (refs)
+ page_ref_sub(last_head, refs);
+ }
+ return 0;
+}
+
+static int
+zerocopy_fill_skb_from_devmem(struct sk_buff *skb, struct iov_iter *from,
+ int length,
+ struct net_devmem_dmabuf_binding *binding)
+{
+ int i = skb_shinfo(skb)->nr_frags;
+ size_t virt_addr, size, off;
+ struct net_iov *niov;
+
+ /* Devmem filling works by taking an IOVEC from the user where the
+ * iov_addrs are interpreted as an offset in bytes into the dma-buf to
+ * send from. We do not support other iter types.
+ */
+ if (iov_iter_type(from) != ITER_IOVEC &&
+ iov_iter_type(from) != ITER_UBUF)
+ return -EFAULT;
+
+ while (length && iov_iter_count(from)) {
+ if (i == MAX_SKB_FRAGS)
+ return -EMSGSIZE;
+
+ virt_addr = (size_t)iter_iov_addr(from);
+ niov = net_devmem_get_niov_at(binding, virt_addr, &off, &size);
+ if (!niov)
+ return -EFAULT;
+
+ size = min_t(size_t, size, length);
+ size = min_t(size_t, size, iter_iov_len(from));
+
+ get_netmem(net_iov_to_netmem(niov));
+ skb_add_rx_frag_netmem(skb, i, net_iov_to_netmem(niov), off,
+ size, PAGE_SIZE);
+ iov_iter_advance(from, size);
+ length -= size;
+ i++;
}
- return sum;
+
+ return 0;
}
-EXPORT_SYMBOL(__skb_checksum_complete);
+
+int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,
+ struct sk_buff *skb, struct iov_iter *from,
+ size_t length,
+ struct net_devmem_dmabuf_binding *binding)
+{
+ unsigned long orig_size = skb->truesize;
+ unsigned long truesize;
+ int ret;
+
+ if (msg && msg->msg_ubuf && msg->sg_from_iter)
+ ret = msg->sg_from_iter(skb, from, length);
+ else if (binding)
+ ret = zerocopy_fill_skb_from_devmem(skb, from, length, binding);
+ else
+ ret = zerocopy_fill_skb_from_iter(skb, from, length);
+
+ truesize = skb->truesize - orig_size;
+ if (sk && sk->sk_type == SOCK_STREAM) {
+ sk_wmem_queued_add(sk, truesize);
+ if (!skb_zcopy_pure(skb))
+ sk_mem_charge(sk, truesize);
+ } else {
+ refcount_add(truesize, &skb->sk->sk_wmem_alloc);
+ }
+ return ret;
+}
+EXPORT_SYMBOL(__zerocopy_sg_from_iter);
/**
- * skb_copy_and_csum_datagram_iovec - Copy and checkum skb to user iovec.
+ * zerocopy_sg_from_iter - Build a zerocopy datagram from an iov_iter
+ * @skb: buffer to copy
+ * @from: the source to copy from
+ *
+ * The function will first copy up to headlen, and then pin the userspace
+ * pages and build frags through them.
+ *
+ * Returns 0, -EFAULT or -EMSGSIZE.
+ */
+int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from)
+{
+ int copy = min_t(int, skb_headlen(skb), iov_iter_count(from));
+
+ /* copy up to skb headlen */
+ if (skb_copy_datagram_from_iter(skb, 0, from, copy))
+ return -EFAULT;
+
+ return __zerocopy_sg_from_iter(NULL, NULL, skb, from, ~0U, NULL);
+}
+EXPORT_SYMBOL(zerocopy_sg_from_iter);
+
+static __always_inline
+size_t copy_to_user_iter_csum(void __user *iter_to, size_t progress,
+ size_t len, void *from, void *priv2)
+{
+ __wsum next, *csum = priv2;
+
+ next = csum_and_copy_to_user(from + progress, iter_to, len);
+ *csum = csum_block_add(*csum, next, progress);
+ return next ? 0 : len;
+}
+
+static __always_inline
+size_t memcpy_to_iter_csum(void *iter_to, size_t progress,
+ size_t len, void *from, void *priv2)
+{
+ __wsum *csum = priv2;
+ __wsum next = csum_partial_copy_nocheck(from + progress, iter_to, len);
+
+ *csum = csum_block_add(*csum, next, progress);
+ return 0;
+}
+
+struct csum_state {
+ __wsum csum;
+ size_t off;
+};
+
+static size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *_csstate,
+ struct iov_iter *i)
+{
+ struct csum_state *csstate = _csstate;
+ __wsum sum;
+
+ if (WARN_ON_ONCE(i->data_source))
+ return 0;
+ if (unlikely(iov_iter_is_discard(i))) {
+ // can't use csum_memcpy() for that one - data is not copied
+ csstate->csum = csum_block_add(csstate->csum,
+ csum_partial(addr, bytes, 0),
+ csstate->off);
+ csstate->off += bytes;
+ return bytes;
+ }
+
+ sum = csum_shift(csstate->csum, csstate->off);
+
+ bytes = iterate_and_advance2(i, bytes, (void *)addr, &sum,
+ copy_to_user_iter_csum,
+ memcpy_to_iter_csum);
+ csstate->csum = csum_shift(sum, csstate->off);
+ csstate->off += bytes;
+ return bytes;
+}
+
+/**
+ * skb_copy_and_csum_datagram - Copy datagram to an iovec iterator
+ * and update a checksum.
+ * @skb: buffer to copy
+ * @offset: offset in the buffer to start copying from
+ * @to: iovec iterator to copy to
+ * @len: amount of data to copy from buffer to iovec
+ * @csump: checksum pointer
+ */
+static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
+ struct iov_iter *to, int len,
+ __wsum *csump)
+{
+ struct csum_state csdata = { .csum = *csump };
+ int ret;
+
+ ret = __skb_datagram_iter(skb, offset, to, len, true,
+ csum_and_copy_to_iter, &csdata);
+ if (ret)
+ return ret;
+
+ *csump = csdata.csum;
+ return 0;
+}
+
+/**
+ * skb_copy_and_csum_datagram_msg - Copy and checksum skb to user iovec.
* @skb: skbuff
* @hlen: hardware length
- * @iov: io vector
- *
+ * @msg: destination
+ *
* Caller _must_ check that skb will fit to this iovec.
*
* Returns: 0 - success.
* -EINVAL - checksum failure.
- * -EFAULT - fault during copy. Beware, in this case iovec
- * can be modified!
+ * -EFAULT - fault during copy.
*/
-int skb_copy_and_csum_datagram_iovec(struct sk_buff *skb,
- int hlen, struct iovec *iov)
+int skb_copy_and_csum_datagram_msg(struct sk_buff *skb,
+ int hlen, struct msghdr *msg)
{
__wsum csum;
int chunk = skb->len - hlen;
- /* Skip filled elements.
- * Pretty silly, look at memcpy_toiovec, though 8)
- */
- while (!iov->iov_len)
- iov++;
+ if (!chunk)
+ return 0;
- if (iov->iov_len < chunk) {
+ if (msg_data_left(msg) < chunk) {
if (__skb_checksum_complete(skb))
- goto csum_error;
- if (skb_copy_datagram_iovec(skb, hlen, iov, chunk))
+ return -EINVAL;
+ if (skb_copy_datagram_msg(skb, hlen, msg, chunk))
goto fault;
} else {
csum = csum_partial(skb->data, hlen, skb->csum);
- if (skb_copy_and_csum_datagram(skb, hlen, iov->iov_base,
+ if (skb_copy_and_csum_datagram(skb, hlen, &msg->msg_iter,
chunk, &csum))
goto fault;
- if (csum_fold(csum))
- goto csum_error;
- if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE))
- netdev_rx_csum_fault(skb->dev);
- iov->iov_len -= chunk;
- iov->iov_base += chunk;
+
+ if (csum_fold(csum)) {
+ iov_iter_revert(&msg->msg_iter, chunk);
+ return -EINVAL;
+ }
+
+ if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
+ !skb->csum_complete_sw)
+ netdev_rx_csum_fault(NULL, skb);
}
return 0;
-csum_error:
- return -EINVAL;
fault:
return -EFAULT;
}
+EXPORT_SYMBOL(skb_copy_and_csum_datagram_msg);
/**
- * datagram_poll - generic datagram poll
+ * datagram_poll_queue - same as datagram_poll, but on a specific receive
+ * queue
* @file: file struct
* @sock: socket
* @wait: poll table
+ * @rcv_queue: receive queue to poll
*
- * Datagram poll: Again totally generic. This also handles
- * sequenced packet sockets providing the socket receive queue
- * is only ever holding data ready to receive.
+ * Performs polling on the given receive queue, handling shutdown, error,
+ * and connection state. This is useful for protocols that deliver
+ * userspace-bound packets through a custom queue instead of
+ * sk->sk_receive_queue.
*
- * Note: when you _don't_ use this routine for this protocol,
- * and you use a different write policy from sock_writeable()
- * then please supply your own write_space callback.
+ * Return: poll bitmask indicating the socket's current state
*/
-unsigned int datagram_poll(struct file *file, struct socket *sock,
- poll_table *wait)
+__poll_t datagram_poll_queue(struct file *file, struct socket *sock,
+ poll_table *wait, struct sk_buff_head *rcv_queue)
{
struct sock *sk = sock->sk;
- unsigned int mask;
+ __poll_t mask;
+ u8 shutdown;
- poll_wait(file, sk->sk_sleep, wait);
+ sock_poll_wait(file, sock, wait);
mask = 0;
/* exceptional events? */
- if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
- mask |= POLLERR;
- if (sk->sk_shutdown & RCV_SHUTDOWN)
- mask |= POLLRDHUP;
- if (sk->sk_shutdown == SHUTDOWN_MASK)
- mask |= POLLHUP;
+ if (READ_ONCE(sk->sk_err) ||
+ !skb_queue_empty_lockless(&sk->sk_error_queue))
+ mask |= EPOLLERR |
+ (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
+
+ shutdown = READ_ONCE(sk->sk_shutdown);
+ if (shutdown & RCV_SHUTDOWN)
+ mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
+ if (shutdown == SHUTDOWN_MASK)
+ mask |= EPOLLHUP;
/* readable? */
- if (!skb_queue_empty(&sk->sk_receive_queue) ||
- (sk->sk_shutdown & RCV_SHUTDOWN))
- mask |= POLLIN | POLLRDNORM;
+ if (!skb_queue_empty_lockless(rcv_queue))
+ mask |= EPOLLIN | EPOLLRDNORM;
/* Connection-based need to check for termination and startup */
if (connection_based(sk)) {
- if (sk->sk_state == TCP_CLOSE)
- mask |= POLLHUP;
+ int state = READ_ONCE(sk->sk_state);
+
+ if (state == TCP_CLOSE)
+ mask |= EPOLLHUP;
/* connection hasn't started yet? */
- if (sk->sk_state == TCP_SYN_SENT)
+ if (state == TCP_SYN_SENT)
return mask;
}
/* writable? */
if (sock_writeable(sk))
- mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
+ mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
else
- set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+ sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
return mask;
}
+EXPORT_SYMBOL(datagram_poll_queue);
+/**
+ * datagram_poll - generic datagram poll
+ * @file: file struct
+ * @sock: socket
+ * @wait: poll table
+ *
+ * Datagram poll: Again totally generic. This also handles
+ * sequenced packet sockets providing the socket receive queue
+ * is only ever holding data ready to receive.
+ *
+ * Note: when you *don't* use this routine for this protocol,
+ * and you use a different write policy from sock_writeable()
+ * then please supply your own write_space callback.
+ *
+ * Return: poll bitmask indicating the socket's current state
+ */
+__poll_t datagram_poll(struct file *file, struct socket *sock, poll_table *wait)
+{
+ return datagram_poll_queue(file, sock, wait,
+ &sock->sk->sk_receive_queue);
+}
EXPORT_SYMBOL(datagram_poll);
-EXPORT_SYMBOL(skb_copy_and_csum_datagram_iovec);
-EXPORT_SYMBOL(skb_copy_datagram_iovec);
-EXPORT_SYMBOL(skb_free_datagram);
-EXPORT_SYMBOL(skb_recv_datagram);
diff --git a/net/core/dev.c b/net/core/dev.c
index 59d058a3b504..9094c0fb8c68 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1,13 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * NET3 Protocol independent device support routines.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
+ * NET3 Protocol independent device support routines.
*
* Derived from the non IP parts of dev.c 1.0.19
- * Authors: Ross Biro
+ * Authors: Ross Biro
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
* Mark Evans, <evansmp@uhura.aston.ac.uk>
*
@@ -21,9 +17,9 @@
*
* Changes:
* D.J. Barrow : Fixed bug where dev->refcnt gets set
- * to 2 if register_netdev gets called
- * before net_dev_init & also removed a
- * few lines of code in the process.
+ * to 2 if register_netdev gets called
+ * before net_dev_init & also removed a
+ * few lines of code in the process.
* Alan Cox : device private ioctl copies fields back.
* Alan Cox : Transmit queue code does relevant
* stunts to keep the queue safe.
@@ -36,7 +32,7 @@
* Alan Cox : 100 backlog just doesn't cut it when
* you start doing multicast video 8)
* Alan Cox : Rewrote net_bh and list manager.
- * Alan Cox : Fix ETH_P_ALL echoback lengths.
+ * Alan Cox : Fix ETH_P_ALL echoback lengths.
* Alan Cox : Took out transmit every packet pass
* Saved a few bytes in the ioctl handler
* Alan Cox : Network driver sets packet type before
@@ -46,7 +42,7 @@
* Richard Kooijman: Timestamp fixes.
* Alan Cox : Wrong field in SIOCGIFDSTADDR
* Alan Cox : Device lock protection.
- * Alan Cox : Fixed nasty side effect of device close
+ * Alan Cox : Fixed nasty side effect of device close
* changes.
* Rudi Cilibrasi : Pass the right thing to
* set_mac_address()
@@ -67,20 +63,25 @@
* Paul Rusty Russell : SIOCSIFNAME
* Pekka Riikonen : Netdev boot-time settings code
* Andrew Morton : Make unregister_netdevice wait
- * indefinitely on dev->refcnt
- * J Hadi Salim : - Backlog queue sampling
+ * indefinitely on dev->refcnt
+ * J Hadi Salim : - Backlog queue sampling
* - netif_rx() feedback
*/
-#include <asm/uaccess.h>
-#include <asm/system.h>
-#include <linux/bitops.h>
+#include <linux/uaccess.h>
+#include <linux/bitmap.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/types.h>
#include <linux/kernel.h>
+#include <linux/hash.h>
+#include <linux/slab.h>
#include <linux/sched.h>
+#include <linux/sched/isolation.h>
+#include <linux/sched/mm.h>
+#include <linux/smpboot.h>
#include <linux/mutex.h>
+#include <linux/rwsem.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
@@ -90,110 +91,355 @@
#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
-#include <linux/notifier.h>
+#include <linux/ethtool.h>
+#include <linux/ethtool_netlink.h>
#include <linux/skbuff.h>
+#include <linux/kthread.h>
+#include <linux/bpf.h>
+#include <linux/bpf_trace.h>
+#include <net/net_namespace.h>
#include <net/sock.h>
+#include <net/busy_poll.h>
#include <linux/rtnetlink.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
#include <linux/stat.h>
-#include <linux/if_bridge.h>
+#include <net/dsa.h>
#include <net/dst.h>
+#include <net/dst_metadata.h>
+#include <net/gro.h>
+#include <net/netdev_queues.h>
#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
#include <net/checksum.h>
+#include <net/xfrm.h>
+#include <net/tcx.h>
#include <linux/highmem.h>
#include <linux/init.h>
-#include <linux/kmod.h>
#include <linux/module.h>
-#include <linux/kallsyms.h>
#include <linux/netpoll.h>
#include <linux/rcupdate.h>
#include <linux/delay.h>
-#include <linux/wireless.h>
#include <net/iw_handler.h>
#include <asm/current.h>
#include <linux/audit.h>
#include <linux/dmaengine.h>
#include <linux/err.h>
#include <linux/ctype.h>
-
-/*
- * The list of packet types we will receive (as opposed to discard)
- * and the routines to invoke.
- *
- * Why 16. Because with 16 the only overlap we get on a hash of the
- * low nibble of the protocol value is RARP/SNAP/X.25.
- *
- * NOTE: That is no longer true with the addition of VLAN tags. Not
- * sure which should go first, but I bet it won't make much
- * difference if we are running VLANs. The good news is that
- * this protocol won't be in the list unless compiled in, so
- * the average user (w/out VLANs) will not be adversely affected.
- * --BLG
- *
- * 0800 IP
- * 8100 802.1Q VLAN
- * 0001 802.3
- * 0002 AX.25
- * 0004 802.2
- * 8035 RARP
- * 0005 SNAP
- * 0805 X.25
- * 0806 ARP
- * 8137 IPX
- * 0009 Localtalk
- * 86DD IPv6
- */
+#include <linux/if_arp.h>
+#include <linux/if_vlan.h>
+#include <linux/ip.h>
+#include <net/ip.h>
+#include <net/mpls.h>
+#include <linux/ipv6.h>
+#include <linux/in.h>
+#include <linux/jhash.h>
+#include <linux/random.h>
+#include <trace/events/napi.h>
+#include <trace/events/net.h>
+#include <trace/events/skb.h>
+#include <trace/events/qdisc.h>
+#include <trace/events/xdp.h>
+#include <linux/inetdevice.h>
+#include <linux/cpu_rmap.h>
+#include <linux/static_key.h>
+#include <linux/hashtable.h>
+#include <linux/vmalloc.h>
+#include <linux/if_macvlan.h>
+#include <linux/errqueue.h>
+#include <linux/hrtimer.h>
+#include <linux/netfilter_netdev.h>
+#include <linux/crash_dump.h>
+#include <linux/sctp.h>
+#include <net/udp_tunnel.h>
+#include <linux/net_namespace.h>
+#include <linux/indirect_call_wrapper.h>
+#include <net/devlink.h>
+#include <linux/pm_runtime.h>
+#include <linux/prandom.h>
+#include <linux/once_lite.h>
+#include <net/netdev_lock.h>
+#include <net/netdev_rx_queue.h>
+#include <net/page_pool/types.h>
+#include <net/page_pool/helpers.h>
+#include <net/page_pool/memory_provider.h>
+#include <net/rps.h>
+#include <linux/phy_link_topology.h>
+
+#include "dev.h"
+#include "devmem.h"
+#include "net-sysfs.h"
static DEFINE_SPINLOCK(ptype_lock);
-static struct list_head ptype_base[16]; /* 16 way hashed list */
-static struct list_head ptype_all; /* Taps */
+struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
+
+static int netif_rx_internal(struct sk_buff *skb);
+static int call_netdevice_notifiers_extack(unsigned long val,
+ struct net_device *dev,
+ struct netlink_ext_ack *extack);
+
+static DEFINE_MUTEX(ifalias_mutex);
+
+/* protects napi_hash addition/deletion and napi_gen_id */
+static DEFINE_SPINLOCK(napi_hash_lock);
+
+static unsigned int napi_gen_id = NR_CPUS;
+static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
+
+static inline void dev_base_seq_inc(struct net *net)
+{
+ unsigned int val = net->dev_base_seq + 1;
+
+ WRITE_ONCE(net->dev_base_seq, val ?: 1);
+}
+
+static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
+{
+ unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));
+
+ return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
+}
+
+static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
+{
+ return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
+}
+
+#ifndef CONFIG_PREEMPT_RT
+
+static DEFINE_STATIC_KEY_FALSE(use_backlog_threads_key);
+
+static int __init setup_backlog_napi_threads(char *arg)
+{
+ static_branch_enable(&use_backlog_threads_key);
+ return 0;
+}
+early_param("thread_backlog_napi", setup_backlog_napi_threads);
+
+static bool use_backlog_threads(void)
+{
+ return static_branch_unlikely(&use_backlog_threads_key);
+}
+
+#else
+
+static bool use_backlog_threads(void)
+{
+ return true;
+}
-#ifdef CONFIG_NET_DMA
-static struct dma_client *net_dma_client;
-static unsigned int net_dma_count;
-static spinlock_t net_dma_event_lock;
#endif
-/*
- * The @dev_base list is protected by @dev_base_lock and the rtnl
- * semaphore.
- *
- * Pure readers hold dev_base_lock for reading.
- *
- * Writers must hold the rtnl semaphore while they loop through the
- * dev_base list, and hold dev_base_lock for writing when they do the
- * actual updates. This allows pure readers to access the list even
- * while a writer is preparing to update it.
- *
- * To put it another way, dev_base_lock is held for writing only to
- * protect against pure readers; the rtnl semaphore provides the
- * protection against other writers.
- *
- * See, for example usages, register_netdevice() and
- * unregister_netdevice(), which must be called with the rtnl
- * semaphore held.
- */
-struct net_device *dev_base;
-static struct net_device **dev_tail = &dev_base;
-DEFINE_RWLOCK(dev_base_lock);
+static inline void backlog_lock_irq_save(struct softnet_data *sd,
+ unsigned long *flags)
+{
+ if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
+ spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags);
+ else
+ local_irq_save(*flags);
+}
+
+static inline void backlog_lock_irq_disable(struct softnet_data *sd)
+{
+ if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
+ spin_lock_irq(&sd->input_pkt_queue.lock);
+ else
+ local_irq_disable();
+}
+
+static inline void backlog_unlock_irq_restore(struct softnet_data *sd,
+ unsigned long *flags)
+{
+ if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
+ spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags);
+ else
+ local_irq_restore(*flags);
+}
+
+static inline void backlog_unlock_irq_enable(struct softnet_data *sd)
+{
+ if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
+ spin_unlock_irq(&sd->input_pkt_queue.lock);
+ else
+ local_irq_enable();
+}
+
+static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev,
+ const char *name)
+{
+ struct netdev_name_node *name_node;
+
+ name_node = kmalloc(sizeof(*name_node), GFP_KERNEL);
+ if (!name_node)
+ return NULL;
+ INIT_HLIST_NODE(&name_node->hlist);
+ name_node->dev = dev;
+ name_node->name = name;
+ return name_node;
+}
+
+static struct netdev_name_node *
+netdev_name_node_head_alloc(struct net_device *dev)
+{
+ struct netdev_name_node *name_node;
+
+ name_node = netdev_name_node_alloc(dev, dev->name);
+ if (!name_node)
+ return NULL;
+ INIT_LIST_HEAD(&name_node->list);
+ return name_node;
+}
+
+static void netdev_name_node_free(struct netdev_name_node *name_node)
+{
+ kfree(name_node);
+}
+
+static void netdev_name_node_add(struct net *net,
+ struct netdev_name_node *name_node)
+{
+ hlist_add_head_rcu(&name_node->hlist,
+ dev_name_hash(net, name_node->name));
+}
+
+static void netdev_name_node_del(struct netdev_name_node *name_node)
+{
+ hlist_del_rcu(&name_node->hlist);
+}
+
+static struct netdev_name_node *netdev_name_node_lookup(struct net *net,
+ const char *name)
+{
+ struct hlist_head *head = dev_name_hash(net, name);
+ struct netdev_name_node *name_node;
+
+ hlist_for_each_entry(name_node, head, hlist)
+ if (!strcmp(name_node->name, name))
+ return name_node;
+ return NULL;
+}
+
+static struct netdev_name_node *netdev_name_node_lookup_rcu(struct net *net,
+ const char *name)
+{
+ struct hlist_head *head = dev_name_hash(net, name);
+ struct netdev_name_node *name_node;
+
+ hlist_for_each_entry_rcu(name_node, head, hlist)
+ if (!strcmp(name_node->name, name))
+ return name_node;
+ return NULL;
+}
+
+bool netdev_name_in_use(struct net *net, const char *name)
+{
+ return netdev_name_node_lookup(net, name);
+}
+EXPORT_SYMBOL(netdev_name_in_use);
+
+int netdev_name_node_alt_create(struct net_device *dev, const char *name)
+{
+ struct netdev_name_node *name_node;
+ struct net *net = dev_net(dev);
+
+ name_node = netdev_name_node_lookup(net, name);
+ if (name_node)
+ return -EEXIST;
+ name_node = netdev_name_node_alloc(dev, name);
+ if (!name_node)
+ return -ENOMEM;
+ netdev_name_node_add(net, name_node);
+ /* The node that holds dev->name acts as a head of per-device list. */
+ list_add_tail_rcu(&name_node->list, &dev->name_node->list);
+
+ return 0;
+}
+
+static void netdev_name_node_alt_free(struct rcu_head *head)
+{
+ struct netdev_name_node *name_node =
+ container_of(head, struct netdev_name_node, rcu);
-EXPORT_SYMBOL(dev_base);
-EXPORT_SYMBOL(dev_base_lock);
+ kfree(name_node->name);
+ netdev_name_node_free(name_node);
+}
-#define NETDEV_HASHBITS 8
-static struct hlist_head dev_name_head[1<<NETDEV_HASHBITS];
-static struct hlist_head dev_index_head[1<<NETDEV_HASHBITS];
+static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node)
+{
+ netdev_name_node_del(name_node);
+ list_del(&name_node->list);
+ call_rcu(&name_node->rcu, netdev_name_node_alt_free);
+}
-static inline struct hlist_head *dev_name_hash(const char *name)
+int netdev_name_node_alt_destroy(struct net_device *dev, const char *name)
{
- unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
- return &dev_name_head[hash & ((1<<NETDEV_HASHBITS)-1)];
+ struct netdev_name_node *name_node;
+ struct net *net = dev_net(dev);
+
+ name_node = netdev_name_node_lookup(net, name);
+ if (!name_node)
+ return -ENOENT;
+ /* lookup might have found our primary name or a name belonging
+ * to another device.
+ */
+ if (name_node == dev->name_node || name_node->dev != dev)
+ return -EINVAL;
+
+ __netdev_name_node_alt_destroy(name_node);
+ return 0;
}
-static inline struct hlist_head *dev_index_hash(int ifindex)
+static void netdev_name_node_alt_flush(struct net_device *dev)
{
- return &dev_index_head[ifindex & ((1<<NETDEV_HASHBITS)-1)];
+ struct netdev_name_node *name_node, *tmp;
+
+ list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list) {
+ list_del(&name_node->list);
+ netdev_name_node_alt_free(&name_node->rcu);
+ }
+}
+
+/* Device list insertion */
+static void list_netdevice(struct net_device *dev)
+{
+ struct netdev_name_node *name_node;
+ struct net *net = dev_net(dev);
+
+ ASSERT_RTNL();
+
+ list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
+ netdev_name_node_add(net, dev->name_node);
+ hlist_add_head_rcu(&dev->index_hlist,
+ dev_index_hash(net, dev->ifindex));
+
+ netdev_for_each_altname(dev, name_node)
+ netdev_name_node_add(net, name_node);
+
+ /* We reserved the ifindex, this can't fail */
+ WARN_ON(xa_store(&net->dev_by_index, dev->ifindex, dev, GFP_KERNEL));
+
+ dev_base_seq_inc(net);
+}
+
+/* Device list removal
+ * caller must respect a RCU grace period before freeing/reusing dev
+ */
+static void unlist_netdevice(struct net_device *dev)
+{
+ struct netdev_name_node *name_node;
+ struct net *net = dev_net(dev);
+
+ ASSERT_RTNL();
+
+ xa_erase(&net->dev_by_index, dev->ifindex);
+
+ netdev_for_each_altname(dev, name_node)
+ netdev_name_node_del(name_node);
+
+ /* Unlink dev from the device chain */
+ list_del_rcu(&dev->dev_list);
+ netdev_name_node_del(dev->name_node);
+ hlist_del_rcu(&dev->index_hlist);
+
+ dev_base_seq_inc(dev_net(dev));
}
/*
@@ -206,30 +452,109 @@ static RAW_NOTIFIER_HEAD(netdev_chain);
* Device drivers call our routines to queue packets here. We empty the
* queue in the local softnet handler.
*/
-DEFINE_PER_CPU(struct softnet_data, softnet_data) = { NULL };
-#ifdef CONFIG_SYSFS
-extern int netdev_sysfs_init(void);
-extern int netdev_register_sysfs(struct net_device *);
-extern void netdev_unregister_sysfs(struct net_device *);
-#else
-#define netdev_sysfs_init() (0)
-#define netdev_register_sysfs(dev) (0)
-#define netdev_unregister_sysfs(dev) do { } while(0)
-#endif
+DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data) = {
+ .process_queue_bh_lock = INIT_LOCAL_LOCK(process_queue_bh_lock),
+};
+EXPORT_PER_CPU_SYMBOL(softnet_data);
+/* Page_pool has a lockless array/stack to alloc/recycle pages.
+ * PP consumers must pay attention to run APIs in the appropriate context
+ * (e.g. NAPI context).
+ */
+DEFINE_PER_CPU(struct page_pool_bh, system_page_pool) = {
+ .bh_lock = INIT_LOCAL_LOCK(bh_lock),
+};
-/*******************************************************************************
+#ifdef CONFIG_LOCKDEP
+/*
+ * register_netdevice() inits txq->_xmit_lock and sets lockdep class
+ * according to dev->type
+ */
+static const unsigned short netdev_lock_type[] = {
+ ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
+ ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
+ ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
+ ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
+ ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
+ ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
+ ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
+ ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
+ ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
+ ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
+ ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
+ ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
+ ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
+ ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
+ ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
+
+static const char *const netdev_lock_name[] = {
+ "_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
+ "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
+ "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
+ "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
+ "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
+ "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
+ "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
+ "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
+ "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
+ "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
+ "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
+ "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
+ "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
+ "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
+ "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
+
+static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
+static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
+
+static inline unsigned short netdev_lock_pos(unsigned short dev_type)
+{
+ int i;
- Protocol management and registration routines
+ for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
+ if (netdev_lock_type[i] == dev_type)
+ return i;
+ /* the last key is used by default */
+ return ARRAY_SIZE(netdev_lock_type) - 1;
+}
-*******************************************************************************/
+static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
+ unsigned short dev_type)
+{
+ int i;
-/*
- * For efficiency
- */
+ i = netdev_lock_pos(dev_type);
+ lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
+ netdev_lock_name[i]);
+}
+
+static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
+{
+ int i;
+
+ i = netdev_lock_pos(dev->type);
+ lockdep_set_class_and_name(&dev->addr_list_lock,
+ &netdev_addr_lock_key[i],
+ netdev_lock_name[i]);
+}
+#else
+static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
+ unsigned short dev_type)
+{
+}
+
+static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
+{
+}
+#endif
+
+/*******************************************************************************
+ *
+ * Protocol management and registration routines
+ *
+ *******************************************************************************/
-static int netdev_nit;
/*
* Add a protocol ID to the list. Now that the input handler is
@@ -247,6 +572,23 @@ static int netdev_nit;
* --ANK (980803)
*/
+static inline struct list_head *ptype_head(const struct packet_type *pt)
+{
+ if (pt->type == htons(ETH_P_ALL)) {
+ if (!pt->af_packet_net && !pt->dev)
+ return NULL;
+
+ return pt->dev ? &pt->dev->ptype_all :
+ &pt->af_packet_net->ptype_all;
+ }
+
+ if (pt->dev)
+ return &pt->dev->ptype_specific;
+
+ return pt->af_packet_net ? &pt->af_packet_net->ptype_specific :
+ &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
+}
+
/**
* dev_add_pack - add packet handler
* @pt: packet type declaration
@@ -255,25 +597,23 @@ static int netdev_nit;
* is linked into kernel lists and may not be freed until it has been
* removed from the kernel lists.
*
- * This call does not sleep therefore it can not
+ * This call does not sleep therefore it can not
* guarantee all CPU's that are in middle of receiving packets
* will see the new packet type (until the next received packet).
*/
void dev_add_pack(struct packet_type *pt)
{
- int hash;
+ struct list_head *head = ptype_head(pt);
- spin_lock_bh(&ptype_lock);
- if (pt->type == htons(ETH_P_ALL)) {
- netdev_nit++;
- list_add_rcu(&pt->list, &ptype_all);
- } else {
- hash = ntohs(pt->type) & 15;
- list_add_rcu(&pt->list, &ptype_base[hash]);
- }
- spin_unlock_bh(&ptype_lock);
+ if (WARN_ON_ONCE(!head))
+ return;
+
+ spin_lock(&ptype_lock);
+ list_add_rcu(&pt->list, head);
+ spin_unlock(&ptype_lock);
}
+EXPORT_SYMBOL(dev_add_pack);
/**
* __dev_remove_pack - remove packet handler
@@ -282,7 +622,7 @@ void dev_add_pack(struct packet_type *pt)
* Remove a protocol handler that was previously added to the kernel
* protocol handlers by dev_add_pack(). The passed &packet_type is removed
* from the kernel lists and can be freed or reused once this function
- * returns.
+ * returns.
*
* The packet type might still be in use by receivers
* and must not be freed until after all the CPU's have gone
@@ -290,16 +630,13 @@ void dev_add_pack(struct packet_type *pt)
*/
void __dev_remove_pack(struct packet_type *pt)
{
- struct list_head *head;
+ struct list_head *head = ptype_head(pt);
struct packet_type *pt1;
- spin_lock_bh(&ptype_lock);
+ if (!head)
+ return;
- if (pt->type == htons(ETH_P_ALL)) {
- netdev_nit--;
- head = &ptype_all;
- } else
- head = &ptype_base[ntohs(pt->type) & 15];
+ spin_lock(&ptype_lock);
list_for_each_entry(pt1, head, list) {
if (pt == pt1) {
@@ -308,10 +645,12 @@ void __dev_remove_pack(struct packet_type *pt)
}
}
- printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
+ pr_warn("dev_remove_pack: %p not found\n", pt);
out:
- spin_unlock_bh(&ptype_lock);
+ spin_unlock(&ptype_lock);
}
+EXPORT_SYMBOL(__dev_remove_pack);
+
/**
* dev_remove_pack - remove packet handler
* @pt: packet type declaration
@@ -327,335 +666,666 @@ out:
void dev_remove_pack(struct packet_type *pt)
{
__dev_remove_pack(pt);
-
+
synchronize_net();
}
+EXPORT_SYMBOL(dev_remove_pack);
-/******************************************************************************
-
- Device Boot-time Settings Routines
-
-*******************************************************************************/
-/* Boot time configuration table */
-static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
+/*******************************************************************************
+ *
+ * Device Interface Subroutines
+ *
+ *******************************************************************************/
/**
- * netdev_boot_setup_add - add new setup entry
- * @name: name of the device
- * @map: configured settings for the device
+ * dev_get_iflink - get 'iflink' value of a interface
+ * @dev: targeted interface
*
- * Adds new setup entry to the dev_boot_setup list. The function
- * returns 0 on error and 1 on success. This is a generic routine to
- * all netdevices.
+ * Indicates the ifindex the interface is linked to.
+ * Physical interfaces have the same 'ifindex' and 'iflink' values.
*/
-static int netdev_boot_setup_add(char *name, struct ifmap *map)
-{
- struct netdev_boot_setup *s;
- int i;
- s = dev_boot_setup;
- for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
- if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
- memset(s[i].name, 0, sizeof(s[i].name));
- strcpy(s[i].name, name);
- memcpy(&s[i].map, map, sizeof(s[i].map));
- break;
- }
- }
+int dev_get_iflink(const struct net_device *dev)
+{
+ if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
+ return dev->netdev_ops->ndo_get_iflink(dev);
- return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
+ return READ_ONCE(dev->ifindex);
}
+EXPORT_SYMBOL(dev_get_iflink);
/**
- * netdev_boot_setup_check - check boot time settings
- * @dev: the netdevice
+ * dev_fill_metadata_dst - Retrieve tunnel egress information.
+ * @dev: targeted interface
+ * @skb: The packet.
*
- * Check boot time settings for the device.
- * The found settings are set for the device to be used
- * later in the device probing.
- * Returns 0 if no settings found, 1 if they are.
+ * For better visibility of tunnel traffic OVS needs to retrieve
+ * egress tunnel information for a packet. Following API allows
+ * user to get this info.
*/
-int netdev_boot_setup_check(struct net_device *dev)
+int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
{
- struct netdev_boot_setup *s = dev_boot_setup;
- int i;
+ struct ip_tunnel_info *info;
- for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
- if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
- !strncmp(dev->name, s[i].name, strlen(s[i].name))) {
- dev->irq = s[i].map.irq;
- dev->base_addr = s[i].map.base_addr;
- dev->mem_start = s[i].map.mem_start;
- dev->mem_end = s[i].map.mem_end;
- return 1;
- }
- }
- return 0;
+ if (!dev->netdev_ops || !dev->netdev_ops->ndo_fill_metadata_dst)
+ return -EINVAL;
+
+ info = skb_tunnel_info_unclone(skb);
+ if (!info)
+ return -ENOMEM;
+ if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
+ return -EINVAL;
+
+ return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
}
+EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
+static struct net_device_path *dev_fwd_path(struct net_device_path_stack *stack)
+{
+ int k = stack->num_paths++;
-/**
- * netdev_boot_base - get address from boot time settings
- * @prefix: prefix for network device
- * @unit: id for network device
- *
- * Check boot time settings for the base address of device.
- * The found settings are set for the device to be used
- * later in the device probing.
- * Returns 0 if no settings found.
- */
-unsigned long netdev_boot_base(const char *prefix, int unit)
+ if (WARN_ON_ONCE(k >= NET_DEVICE_PATH_STACK_MAX))
+ return NULL;
+
+ return &stack->path[k];
+}
+
+int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr,
+ struct net_device_path_stack *stack)
{
- const struct netdev_boot_setup *s = dev_boot_setup;
- char name[IFNAMSIZ];
- int i;
+ const struct net_device *last_dev;
+ struct net_device_path_ctx ctx = {
+ .dev = dev,
+ };
+ struct net_device_path *path;
+ int ret = 0;
+
+ memcpy(ctx.daddr, daddr, sizeof(ctx.daddr));
+ stack->num_paths = 0;
+ while (ctx.dev && ctx.dev->netdev_ops->ndo_fill_forward_path) {
+ last_dev = ctx.dev;
+ path = dev_fwd_path(stack);
+ if (!path)
+ return -1;
+
+ memset(path, 0, sizeof(struct net_device_path));
+ ret = ctx.dev->netdev_ops->ndo_fill_forward_path(&ctx, path);
+ if (ret < 0)
+ return -1;
+
+ if (WARN_ON_ONCE(last_dev == ctx.dev))
+ return -1;
+ }
- sprintf(name, "%s%d", prefix, unit);
+ if (!ctx.dev)
+ return ret;
- /*
- * If device already registered then return base of 1
- * to indicate not to probe for this interface
- */
- if (__dev_get_by_name(name))
- return 1;
+ path = dev_fwd_path(stack);
+ if (!path)
+ return -1;
+ path->type = DEV_PATH_ETHERNET;
+ path->dev = ctx.dev;
- for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
- if (!strcmp(name, s[i].name))
- return s[i].map.base_addr;
- return 0;
+ return ret;
}
+EXPORT_SYMBOL_GPL(dev_fill_forward_path);
-/*
- * Saves at boot time configured settings for any netdevice.
- */
-int __init netdev_boot_setup(char *str)
+/* must be called under rcu_read_lock(), as we dont take a reference */
+static struct napi_struct *napi_by_id(unsigned int napi_id)
{
- int ints[5];
- struct ifmap map;
+ unsigned int hash = napi_id % HASH_SIZE(napi_hash);
+ struct napi_struct *napi;
- str = get_options(str, ARRAY_SIZE(ints), ints);
- if (!str || !*str)
- return 0;
+ hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
+ if (napi->napi_id == napi_id)
+ return napi;
- /* Save settings */
- memset(&map, 0, sizeof(map));
- if (ints[0] > 0)
- map.irq = ints[1];
- if (ints[0] > 1)
- map.base_addr = ints[2];
- if (ints[0] > 2)
- map.mem_start = ints[3];
- if (ints[0] > 3)
- map.mem_end = ints[4];
+ return NULL;
+}
+
+/* must be called under rcu_read_lock(), as we dont take a reference */
+static struct napi_struct *
+netdev_napi_by_id(struct net *net, unsigned int napi_id)
+{
+ struct napi_struct *napi;
+
+ napi = napi_by_id(napi_id);
+ if (!napi)
+ return NULL;
+
+ if (WARN_ON_ONCE(!napi->dev))
+ return NULL;
+ if (!net_eq(net, dev_net(napi->dev)))
+ return NULL;
- /* Add new entry to the list */
- return netdev_boot_setup_add(str, &map);
+ return napi;
}
-__setup("netdev=", netdev_boot_setup);
+/**
+ * netdev_napi_by_id_lock() - find a device by NAPI ID and lock it
+ * @net: the applicable net namespace
+ * @napi_id: ID of a NAPI of a target device
+ *
+ * Find a NAPI instance with @napi_id. Lock its device.
+ * The device must be in %NETREG_REGISTERED state for lookup to succeed.
+ * netdev_unlock() must be called to release it.
+ *
+ * Return: pointer to NAPI, its device with lock held, NULL if not found.
+ */
+struct napi_struct *
+netdev_napi_by_id_lock(struct net *net, unsigned int napi_id)
+{
+ struct napi_struct *napi;
+ struct net_device *dev;
-/*******************************************************************************
+ rcu_read_lock();
+ napi = netdev_napi_by_id(net, napi_id);
+ if (!napi || READ_ONCE(napi->dev->reg_state) != NETREG_REGISTERED) {
+ rcu_read_unlock();
+ return NULL;
+ }
- Device Interface Subroutines
+ dev = napi->dev;
+ dev_hold(dev);
+ rcu_read_unlock();
-*******************************************************************************/
+ dev = __netdev_put_lock(dev, net);
+ if (!dev)
+ return NULL;
+
+ rcu_read_lock();
+ napi = netdev_napi_by_id(net, napi_id);
+ if (napi && napi->dev != dev)
+ napi = NULL;
+ rcu_read_unlock();
+
+ if (!napi)
+ netdev_unlock(dev);
+ return napi;
+}
/**
* __dev_get_by_name - find a device by its name
+ * @net: the applicable net namespace
* @name: name to find
*
- * Find an interface by name. Must be called under RTNL semaphore
- * or @dev_base_lock. If the name is found a pointer to the device
- * is returned. If the name is not found then %NULL is returned. The
+ * Find an interface by name. Must be called under RTNL semaphore.
+ * If the name is found a pointer to the device is returned.
+ * If the name is not found then %NULL is returned. The
* reference counters are not incremented so the caller must be
* careful with locks.
*/
-struct net_device *__dev_get_by_name(const char *name)
+struct net_device *__dev_get_by_name(struct net *net, const char *name)
{
- struct hlist_node *p;
+ struct netdev_name_node *node_name;
- hlist_for_each(p, dev_name_hash(name)) {
- struct net_device *dev
- = hlist_entry(p, struct net_device, name_hlist);
- if (!strncmp(dev->name, name, IFNAMSIZ))
- return dev;
- }
- return NULL;
+ node_name = netdev_name_node_lookup(net, name);
+ return node_name ? node_name->dev : NULL;
}
+EXPORT_SYMBOL(__dev_get_by_name);
/**
- * dev_get_by_name - find a device by its name
+ * dev_get_by_name_rcu - find a device by its name
+ * @net: the applicable net namespace
+ * @name: name to find
+ *
+ * Find an interface by name.
+ * If the name is found a pointer to the device is returned.
+ * If the name is not found then %NULL is returned.
+ * The reference counters are not incremented so the caller must be
+ * careful with locks. The caller must hold RCU lock.
+ */
+
+struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
+{
+ struct netdev_name_node *node_name;
+
+ node_name = netdev_name_node_lookup_rcu(net, name);
+ return node_name ? node_name->dev : NULL;
+}
+EXPORT_SYMBOL(dev_get_by_name_rcu);
+
+/* Deprecated for new users, call netdev_get_by_name() instead */
+struct net_device *dev_get_by_name(struct net *net, const char *name)
+{
+ struct net_device *dev;
+
+ rcu_read_lock();
+ dev = dev_get_by_name_rcu(net, name);
+ dev_hold(dev);
+ rcu_read_unlock();
+ return dev;
+}
+EXPORT_SYMBOL(dev_get_by_name);
+
+/**
+ * netdev_get_by_name() - find a device by its name
+ * @net: the applicable net namespace
* @name: name to find
+ * @tracker: tracking object for the acquired reference
+ * @gfp: allocation flags for the tracker
*
* Find an interface by name. This can be called from any
* context and does its own locking. The returned handle has
- * the usage count incremented and the caller must use dev_put() to
+ * the usage count incremented and the caller must use netdev_put() to
* release it when it is no longer needed. %NULL is returned if no
* matching device is found.
*/
-
-struct net_device *dev_get_by_name(const char *name)
+struct net_device *netdev_get_by_name(struct net *net, const char *name,
+ netdevice_tracker *tracker, gfp_t gfp)
{
struct net_device *dev;
- read_lock(&dev_base_lock);
- dev = __dev_get_by_name(name);
+ dev = dev_get_by_name(net, name);
if (dev)
- dev_hold(dev);
- read_unlock(&dev_base_lock);
+ netdev_tracker_alloc(dev, tracker, gfp);
return dev;
}
+EXPORT_SYMBOL(netdev_get_by_name);
/**
* __dev_get_by_index - find a device by its ifindex
+ * @net: the applicable net namespace
* @ifindex: index of device
*
* Search for an interface by index. Returns %NULL if the device
* is not found or a pointer to the device. The device has not
* had its reference counter increased so the caller must be careful
- * about locking. The caller must hold either the RTNL semaphore
- * or @dev_base_lock.
+ * about locking. The caller must hold the RTNL semaphore.
*/
-struct net_device *__dev_get_by_index(int ifindex)
+struct net_device *__dev_get_by_index(struct net *net, int ifindex)
{
- struct hlist_node *p;
+ struct net_device *dev;
+ struct hlist_head *head = dev_index_hash(net, ifindex);
- hlist_for_each(p, dev_index_hash(ifindex)) {
- struct net_device *dev
- = hlist_entry(p, struct net_device, index_hlist);
+ hlist_for_each_entry(dev, head, index_hlist)
if (dev->ifindex == ifindex)
return dev;
- }
+
return NULL;
}
+EXPORT_SYMBOL(__dev_get_by_index);
+/**
+ * dev_get_by_index_rcu - find a device by its ifindex
+ * @net: the applicable net namespace
+ * @ifindex: index of device
+ *
+ * Search for an interface by index. Returns %NULL if the device
+ * is not found or a pointer to the device. The device has not
+ * had its reference counter increased so the caller must be careful
+ * about locking. The caller must hold RCU lock.
+ */
+
+struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
+{
+ struct net_device *dev;
+ struct hlist_head *head = dev_index_hash(net, ifindex);
+
+ hlist_for_each_entry_rcu(dev, head, index_hlist)
+ if (dev->ifindex == ifindex)
+ return dev;
+
+ return NULL;
+}
+EXPORT_SYMBOL(dev_get_by_index_rcu);
+
+/* Deprecated for new users, call netdev_get_by_index() instead */
+struct net_device *dev_get_by_index(struct net *net, int ifindex)
+{
+ struct net_device *dev;
+
+ rcu_read_lock();
+ dev = dev_get_by_index_rcu(net, ifindex);
+ dev_hold(dev);
+ rcu_read_unlock();
+ return dev;
+}
+EXPORT_SYMBOL(dev_get_by_index);
/**
- * dev_get_by_index - find a device by its ifindex
+ * netdev_get_by_index() - find a device by its ifindex
+ * @net: the applicable net namespace
* @ifindex: index of device
+ * @tracker: tracking object for the acquired reference
+ * @gfp: allocation flags for the tracker
*
* Search for an interface by index. Returns NULL if the device
* is not found or a pointer to the device. The device returned has
* had a reference added and the pointer is safe until the user calls
- * dev_put to indicate they have finished with it.
+ * netdev_put() to indicate they have finished with it.
*/
+struct net_device *netdev_get_by_index(struct net *net, int ifindex,
+ netdevice_tracker *tracker, gfp_t gfp)
+{
+ struct net_device *dev;
+
+ dev = dev_get_by_index(net, ifindex);
+ if (dev)
+ netdev_tracker_alloc(dev, tracker, gfp);
+ return dev;
+}
+EXPORT_SYMBOL(netdev_get_by_index);
-struct net_device *dev_get_by_index(int ifindex)
+/**
+ * dev_get_by_napi_id - find a device by napi_id
+ * @napi_id: ID of the NAPI struct
+ *
+ * Search for an interface by NAPI ID. Returns %NULL if the device
+ * is not found or a pointer to the device. The device has not had
+ * its reference counter increased so the caller must be careful
+ * about locking. The caller must hold RCU lock.
+ */
+struct net_device *dev_get_by_napi_id(unsigned int napi_id)
+{
+ struct napi_struct *napi;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ if (!napi_id_valid(napi_id))
+ return NULL;
+
+ napi = napi_by_id(napi_id);
+
+ return napi ? napi->dev : NULL;
+}
+
+/* Release the held reference on the net_device, and if the net_device
+ * is still registered try to lock the instance lock. If device is being
+ * unregistered NULL will be returned (but the reference has been released,
+ * either way!)
+ *
+ * This helper is intended for locking net_device after it has been looked up
+ * using a lockless lookup helper. Lock prevents the instance from going away.
+ */
+struct net_device *__netdev_put_lock(struct net_device *dev, struct net *net)
+{
+ netdev_lock(dev);
+ if (dev->reg_state > NETREG_REGISTERED ||
+ dev->moving_ns || !net_eq(dev_net(dev), net)) {
+ netdev_unlock(dev);
+ dev_put(dev);
+ return NULL;
+ }
+ dev_put(dev);
+ return dev;
+}
+
+static struct net_device *
+__netdev_put_lock_ops_compat(struct net_device *dev, struct net *net)
+{
+ netdev_lock_ops_compat(dev);
+ if (dev->reg_state > NETREG_REGISTERED ||
+ dev->moving_ns || !net_eq(dev_net(dev), net)) {
+ netdev_unlock_ops_compat(dev);
+ dev_put(dev);
+ return NULL;
+ }
+ dev_put(dev);
+ return dev;
+}
+
+/**
+ * netdev_get_by_index_lock() - find a device by its ifindex
+ * @net: the applicable net namespace
+ * @ifindex: index of device
+ *
+ * Search for an interface by index. If a valid device
+ * with @ifindex is found it will be returned with netdev->lock held.
+ * netdev_unlock() must be called to release it.
+ *
+ * Return: pointer to a device with lock held, NULL if not found.
+ */
+struct net_device *netdev_get_by_index_lock(struct net *net, int ifindex)
{
struct net_device *dev;
- read_lock(&dev_base_lock);
- dev = __dev_get_by_index(ifindex);
+ dev = dev_get_by_index(net, ifindex);
+ if (!dev)
+ return NULL;
+
+ return __netdev_put_lock(dev, net);
+}
+
+struct net_device *
+netdev_get_by_index_lock_ops_compat(struct net *net, int ifindex)
+{
+ struct net_device *dev;
+
+ dev = dev_get_by_index(net, ifindex);
+ if (!dev)
+ return NULL;
+
+ return __netdev_put_lock_ops_compat(dev, net);
+}
+
+struct net_device *
+netdev_xa_find_lock(struct net *net, struct net_device *dev,
+ unsigned long *index)
+{
if (dev)
+ netdev_unlock(dev);
+
+ do {
+ rcu_read_lock();
+ dev = xa_find(&net->dev_by_index, index, ULONG_MAX, XA_PRESENT);
+ if (!dev) {
+ rcu_read_unlock();
+ return NULL;
+ }
dev_hold(dev);
- read_unlock(&dev_base_lock);
- return dev;
+ rcu_read_unlock();
+
+ dev = __netdev_put_lock(dev, net);
+ if (dev)
+ return dev;
+
+ (*index)++;
+ } while (true);
+}
+
+struct net_device *
+netdev_xa_find_lock_ops_compat(struct net *net, struct net_device *dev,
+ unsigned long *index)
+{
+ if (dev)
+ netdev_unlock_ops_compat(dev);
+
+ do {
+ rcu_read_lock();
+ dev = xa_find(&net->dev_by_index, index, ULONG_MAX, XA_PRESENT);
+ if (!dev) {
+ rcu_read_unlock();
+ return NULL;
+ }
+ dev_hold(dev);
+ rcu_read_unlock();
+
+ dev = __netdev_put_lock_ops_compat(dev, net);
+ if (dev)
+ return dev;
+
+ (*index)++;
+ } while (true);
}
+static DEFINE_SEQLOCK(netdev_rename_lock);
+
+void netdev_copy_name(struct net_device *dev, char *name)
+{
+ unsigned int seq;
+
+ do {
+ seq = read_seqbegin(&netdev_rename_lock);
+ strscpy(name, dev->name, IFNAMSIZ);
+ } while (read_seqretry(&netdev_rename_lock, seq));
+}
+EXPORT_IPV6_MOD_GPL(netdev_copy_name);
+
/**
- * dev_getbyhwaddr - find a device by its hardware address
+ * netdev_get_name - get a netdevice name, knowing its ifindex.
+ * @net: network namespace
+ * @name: a pointer to the buffer where the name will be stored.
+ * @ifindex: the ifindex of the interface to get the name from.
+ */
+int netdev_get_name(struct net *net, char *name, int ifindex)
+{
+ struct net_device *dev;
+ int ret;
+
+ rcu_read_lock();
+
+ dev = dev_get_by_index_rcu(net, ifindex);
+ if (!dev) {
+ ret = -ENODEV;
+ goto out;
+ }
+
+ netdev_copy_name(dev, name);
+
+ ret = 0;
+out:
+ rcu_read_unlock();
+ return ret;
+}
+
+static bool dev_addr_cmp(struct net_device *dev, unsigned short type,
+ const char *ha)
+{
+ return dev->type == type && !memcmp(dev->dev_addr, ha, dev->addr_len);
+}
+
+/**
+ * dev_getbyhwaddr_rcu - find a device by its hardware address
+ * @net: the applicable net namespace
* @type: media type of device
* @ha: hardware address
*
* Search for an interface by MAC address. Returns NULL if the device
- * is not found or a pointer to the device. The caller must hold the
- * rtnl semaphore. The returned device has not had its ref count increased
+ * is not found or a pointer to the device.
+ * The caller must hold RCU.
+ * The returned device has not had its ref count increased
* and the caller must therefore be careful about locking
*
- * BUGS:
- * If the API was consistent this would be __dev_get_by_hwaddr
*/
-struct net_device *dev_getbyhwaddr(unsigned short type, char *ha)
+struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
+ const char *ha)
{
struct net_device *dev;
- ASSERT_RTNL();
+ for_each_netdev_rcu(net, dev)
+ if (dev_addr_cmp(dev, type, ha))
+ return dev;
- for (dev = dev_base; dev; dev = dev->next)
- if (dev->type == type &&
- !memcmp(dev->dev_addr, ha, dev->addr_len))
- break;
- return dev;
+ return NULL;
}
+EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
+/**
+ * dev_getbyhwaddr() - find a device by its hardware address
+ * @net: the applicable net namespace
+ * @type: media type of device
+ * @ha: hardware address
+ *
+ * Similar to dev_getbyhwaddr_rcu(), but the owner needs to hold
+ * rtnl_lock.
+ *
+ * Context: rtnl_lock() must be held.
+ * Return: pointer to the net_device, or NULL if not found
+ */
+struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type,
+ const char *ha)
+{
+ struct net_device *dev;
+
+ ASSERT_RTNL();
+ for_each_netdev(net, dev)
+ if (dev_addr_cmp(dev, type, ha))
+ return dev;
+
+ return NULL;
+}
EXPORT_SYMBOL(dev_getbyhwaddr);
-struct net_device *dev_getfirstbyhwtype(unsigned short type)
+struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
- struct net_device *dev;
+ struct net_device *dev, *ret = NULL;
- rtnl_lock();
- for (dev = dev_base; dev; dev = dev->next) {
+ rcu_read_lock();
+ for_each_netdev_rcu(net, dev)
if (dev->type == type) {
dev_hold(dev);
+ ret = dev;
break;
}
- }
- rtnl_unlock();
- return dev;
+ rcu_read_unlock();
+ return ret;
}
-
EXPORT_SYMBOL(dev_getfirstbyhwtype);
/**
- * dev_get_by_flags - find any device with given flags
- * @if_flags: IFF_* values
- * @mask: bitmask of bits in if_flags to check
+ * netdev_get_by_flags_rcu - find any device with given flags
+ * @net: the applicable net namespace
+ * @tracker: tracking object for the acquired reference
+ * @if_flags: IFF_* values
+ * @mask: bitmask of bits in if_flags to check
*
- * Search for any interface with the given flags. Returns NULL if a device
- * is not found or a pointer to the device. The device returned has
- * had a reference added and the pointer is safe until the user calls
- * dev_put to indicate they have finished with it.
+ * Search for any interface with the given flags.
+ *
+ * Context: rcu_read_lock() must be held.
+ * Returns: NULL if a device is not found or a pointer to the device.
*/
-
-struct net_device * dev_get_by_flags(unsigned short if_flags, unsigned short mask)
+struct net_device *netdev_get_by_flags_rcu(struct net *net, netdevice_tracker *tracker,
+ unsigned short if_flags, unsigned short mask)
{
struct net_device *dev;
- read_lock(&dev_base_lock);
- for (dev = dev_base; dev != NULL; dev = dev->next) {
- if (((dev->flags ^ if_flags) & mask) == 0) {
- dev_hold(dev);
- break;
+ for_each_netdev_rcu(net, dev) {
+ if (((READ_ONCE(dev->flags) ^ if_flags) & mask) == 0) {
+ netdev_hold(dev, tracker, GFP_ATOMIC);
+ return dev;
}
}
- read_unlock(&dev_base_lock);
- return dev;
+
+ return NULL;
}
+EXPORT_IPV6_MOD(netdev_get_by_flags_rcu);
/**
* dev_valid_name - check if name is okay for network device
* @name: name string
*
* Network device names need to be valid file names to
- * to allow sysfs to work. We also disallow any kind of
+ * allow sysfs to work. We also disallow any kind of
* whitespace.
*/
-int dev_valid_name(const char *name)
+bool dev_valid_name(const char *name)
{
if (*name == '\0')
- return 0;
- if (strlen(name) >= IFNAMSIZ)
- return 0;
+ return false;
+ if (strnlen(name, IFNAMSIZ) == IFNAMSIZ)
+ return false;
if (!strcmp(name, ".") || !strcmp(name, ".."))
- return 0;
+ return false;
while (*name) {
- if (*name == '/' || isspace(*name))
- return 0;
+ if (*name == '/' || *name == ':' || isspace(*name))
+ return false;
name++;
}
- return 1;
+ return true;
}
+EXPORT_SYMBOL(dev_valid_name);
/**
- * dev_alloc_name - allocate a name for a device
- * @dev: device
+ * __dev_alloc_name - allocate a name for a device
+ * @net: network namespace to allocate the device name in
* @name: name format string
+ * @res: result name string
*
* Passed a format string - eg "lt%d" it will try and find a suitable
* id. It scans list of devices to build up a free map, then chooses
@@ -666,102 +1336,230 @@ int dev_valid_name(const char *name)
* Returns the number of the unit assigned or a negative errno code.
*/
-int dev_alloc_name(struct net_device *dev, const char *name)
+static int __dev_alloc_name(struct net *net, const char *name, char *res)
{
int i = 0;
- char buf[IFNAMSIZ];
const char *p;
const int max_netdevices = 8*PAGE_SIZE;
- long *inuse;
+ unsigned long *inuse;
struct net_device *d;
+ char buf[IFNAMSIZ];
- p = strnchr(name, IFNAMSIZ-1, '%');
- if (p) {
- /*
- * Verify the string as this thing may have come from
- * the user. There must be either one "%d" and no other "%"
- * characters.
- */
- if (p[1] != 'd' || strchr(p + 2, '%'))
- return -EINVAL;
+ /* Verify the string as this thing may have come from the user.
+ * There must be one "%d" and no other "%" characters.
+ */
+ p = strchr(name, '%');
+ if (!p || p[1] != 'd' || strchr(p + 2, '%'))
+ return -EINVAL;
- /* Use one page as a bit array of possible slots */
- inuse = (long *) get_zeroed_page(GFP_ATOMIC);
- if (!inuse)
- return -ENOMEM;
+ /* Use one page as a bit array of possible slots */
+ inuse = bitmap_zalloc(max_netdevices, GFP_ATOMIC);
+ if (!inuse)
+ return -ENOMEM;
- for (d = dev_base; d; d = d->next) {
- if (!sscanf(d->name, name, &i))
+ for_each_netdev(net, d) {
+ struct netdev_name_node *name_node;
+
+ netdev_for_each_altname(d, name_node) {
+ if (!sscanf(name_node->name, name, &i))
continue;
if (i < 0 || i >= max_netdevices)
continue;
- /* avoid cases where sscanf is not exact inverse of printf */
- snprintf(buf, sizeof(buf), name, i);
- if (!strncmp(buf, d->name, IFNAMSIZ))
- set_bit(i, inuse);
+ /* avoid cases where sscanf is not exact inverse of printf */
+ snprintf(buf, IFNAMSIZ, name, i);
+ if (!strncmp(buf, name_node->name, IFNAMSIZ))
+ __set_bit(i, inuse);
}
+ if (!sscanf(d->name, name, &i))
+ continue;
+ if (i < 0 || i >= max_netdevices)
+ continue;
- i = find_first_zero_bit(inuse, max_netdevices);
- free_page((unsigned long) inuse);
+ /* avoid cases where sscanf is not exact inverse of printf */
+ snprintf(buf, IFNAMSIZ, name, i);
+ if (!strncmp(buf, d->name, IFNAMSIZ))
+ __set_bit(i, inuse);
}
- snprintf(buf, sizeof(buf), name, i);
- if (!__dev_get_by_name(buf)) {
- strlcpy(dev->name, buf, IFNAMSIZ);
- return i;
- }
+ i = find_first_zero_bit(inuse, max_netdevices);
+ bitmap_free(inuse);
+ if (i == max_netdevices)
+ return -ENFILE;
- /* It is possible to run out of possible slots
- * when the name is long and there isn't enough space left
- * for the digits, or if all bits are used.
- */
- return -ENFILE;
+ /* 'res' and 'name' could overlap, use 'buf' as an intermediate buffer */
+ strscpy(buf, name, IFNAMSIZ);
+ snprintf(res, IFNAMSIZ, buf, i);
+ return i;
}
+/* Returns negative errno or allocated unit id (see __dev_alloc_name()) */
+static int dev_prep_valid_name(struct net *net, struct net_device *dev,
+ const char *want_name, char *out_name,
+ int dup_errno)
+{
+ if (!dev_valid_name(want_name))
+ return -EINVAL;
+
+ if (strchr(want_name, '%'))
+ return __dev_alloc_name(net, want_name, out_name);
+
+ if (netdev_name_in_use(net, want_name))
+ return -dup_errno;
+ if (out_name != want_name)
+ strscpy(out_name, want_name, IFNAMSIZ);
+ return 0;
+}
/**
- * dev_change_name - change name of a device
+ * dev_alloc_name - allocate a name for a device
* @dev: device
- * @newname: name (or format string) must be at least IFNAMSIZ
+ * @name: name format string
*
- * Change name of a device, can pass format strings "eth%d".
- * for wildcarding.
+ * Passed a format string - eg "lt%d" it will try and find a suitable
+ * id. It scans list of devices to build up a free map, then chooses
+ * the first empty slot. The caller must hold the dev_base or rtnl lock
+ * while allocating the name and adding the device in order to avoid
+ * duplicates.
+ * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
+ * Returns the number of the unit assigned or a negative errno code.
*/
-int dev_change_name(struct net_device *dev, char *newname)
+
+int dev_alloc_name(struct net_device *dev, const char *name)
{
+ return dev_prep_valid_name(dev_net(dev), dev, name, dev->name, ENFILE);
+}
+EXPORT_SYMBOL(dev_alloc_name);
+
+static int dev_get_valid_name(struct net *net, struct net_device *dev,
+ const char *name)
+{
+ int ret;
+
+ ret = dev_prep_valid_name(net, dev, name, dev->name, EEXIST);
+ return ret < 0 ? ret : 0;
+}
+
+int netif_change_name(struct net_device *dev, const char *newname)
+{
+ struct net *net = dev_net(dev);
+ unsigned char old_assign_type;
+ char oldname[IFNAMSIZ];
int err = 0;
+ int ret;
- ASSERT_RTNL();
+ ASSERT_RTNL_NET(net);
- if (dev->flags & IFF_UP)
- return -EBUSY;
+ if (!strncmp(newname, dev->name, IFNAMSIZ))
+ return 0;
- if (!dev_valid_name(newname))
- return -EINVAL;
+ memcpy(oldname, dev->name, IFNAMSIZ);
- if (strchr(newname, '%')) {
- err = dev_alloc_name(dev, newname);
- if (err < 0)
- return err;
- strcpy(newname, dev->name);
+ write_seqlock_bh(&netdev_rename_lock);
+ err = dev_get_valid_name(net, dev, newname);
+ write_sequnlock_bh(&netdev_rename_lock);
+
+ if (err < 0)
+ return err;
+
+ if (oldname[0] && !strchr(oldname, '%'))
+ netdev_info(dev, "renamed from %s%s\n", oldname,
+ dev->flags & IFF_UP ? " (while UP)" : "");
+
+ old_assign_type = dev->name_assign_type;
+ WRITE_ONCE(dev->name_assign_type, NET_NAME_RENAMED);
+
+rollback:
+ ret = device_rename(&dev->dev, dev->name);
+ if (ret) {
+ write_seqlock_bh(&netdev_rename_lock);
+ memcpy(dev->name, oldname, IFNAMSIZ);
+ write_sequnlock_bh(&netdev_rename_lock);
+ WRITE_ONCE(dev->name_assign_type, old_assign_type);
+ return ret;
}
- else if (__dev_get_by_name(newname))
- return -EEXIST;
- else
- strlcpy(dev->name, newname, IFNAMSIZ);
- err = class_device_rename(&dev->class_dev, dev->name);
- if (!err) {
- hlist_del(&dev->name_hlist);
- hlist_add_head(&dev->name_hlist, dev_name_hash(dev->name));
- raw_notifier_call_chain(&netdev_chain,
- NETDEV_CHANGENAME, dev);
+ netdev_adjacent_rename_links(dev, oldname);
+
+ netdev_name_node_del(dev->name_node);
+
+ synchronize_net();
+
+ netdev_name_node_add(net, dev->name_node);
+
+ ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
+ ret = notifier_to_errno(ret);
+
+ if (ret) {
+ /* err >= 0 after dev_alloc_name() or stores the first errno */
+ if (err >= 0) {
+ err = ret;
+ write_seqlock_bh(&netdev_rename_lock);
+ memcpy(dev->name, oldname, IFNAMSIZ);
+ write_sequnlock_bh(&netdev_rename_lock);
+ memcpy(oldname, newname, IFNAMSIZ);
+ WRITE_ONCE(dev->name_assign_type, old_assign_type);
+ old_assign_type = NET_NAME_RENAMED;
+ goto rollback;
+ } else {
+ netdev_err(dev, "name change rollback failed: %d\n",
+ ret);
+ }
}
return err;
}
+int netif_set_alias(struct net_device *dev, const char *alias, size_t len)
+{
+ struct dev_ifalias *new_alias = NULL;
+
+ if (len >= IFALIASZ)
+ return -EINVAL;
+
+ if (len) {
+ new_alias = kmalloc(sizeof(*new_alias) + len + 1, GFP_KERNEL);
+ if (!new_alias)
+ return -ENOMEM;
+
+ memcpy(new_alias->ifalias, alias, len);
+ new_alias->ifalias[len] = 0;
+ }
+
+ mutex_lock(&ifalias_mutex);
+ new_alias = rcu_replace_pointer(dev->ifalias, new_alias,
+ mutex_is_locked(&ifalias_mutex));
+ mutex_unlock(&ifalias_mutex);
+
+ if (new_alias)
+ kfree_rcu(new_alias, rcuhead);
+
+ return len;
+}
+
+/**
+ * dev_get_alias - get ifalias of a device
+ * @dev: device
+ * @name: buffer to store name of ifalias
+ * @len: size of buffer
+ *
+ * get ifalias for a device. Caller must make sure dev cannot go
+ * away, e.g. rcu read lock or own a reference count to device.
+ */
+int dev_get_alias(const struct net_device *dev, char *name, size_t len)
+{
+ const struct dev_ifalias *alias;
+ int ret = 0;
+
+ rcu_read_lock();
+ alias = rcu_dereference(dev->ifalias);
+ if (alias)
+ ret = snprintf(name, len, "%s", alias->ifalias);
+ rcu_read_unlock();
+
+ return ret;
+}
+
/**
* netdev_features_change - device changes features
* @dev: device to cause notification
@@ -770,398 +1568,1979 @@ int dev_change_name(struct net_device *dev, char *newname)
*/
void netdev_features_change(struct net_device *dev)
{
- raw_notifier_call_chain(&netdev_chain, NETDEV_FEAT_CHANGE, dev);
+ call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
}
EXPORT_SYMBOL(netdev_features_change);
-/**
- * netdev_state_change - device changes state
- * @dev: device to cause notification
- *
- * Called to indicate a device has changed state. This function calls
- * the notifier chains for netdev_chain and sends a NEWLINK message
- * to the routing socket.
- */
-void netdev_state_change(struct net_device *dev)
+void netif_state_change(struct net_device *dev)
{
+ netdev_ops_assert_locked_or_invisible(dev);
+
if (dev->flags & IFF_UP) {
- raw_notifier_call_chain(&netdev_chain,
- NETDEV_CHANGE, dev);
- rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
+ struct netdev_notifier_change_info change_info = {
+ .info.dev = dev,
+ };
+
+ call_netdevice_notifiers_info(NETDEV_CHANGE,
+ &change_info.info);
+ rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL, 0, NULL);
}
}
/**
- * dev_load - load a network module
- * @name: name of interface
+ * __netdev_notify_peers - notify network peers about existence of @dev,
+ * to be called when rtnl lock is already held.
+ * @dev: network device
*
- * If a network interface is not present and the process has suitable
- * privileges this function loads the module. If module loading is not
- * available in this kernel then it becomes a nop.
+ * Generate traffic such that interested network peers are aware of
+ * @dev, such as by generating a gratuitous ARP. This may be used when
+ * a device wants to inform the rest of the network about some sort of
+ * reconfiguration such as a failover event or virtual machine
+ * migration.
*/
-
-void dev_load(const char *name)
+void __netdev_notify_peers(struct net_device *dev)
{
- struct net_device *dev;
-
- read_lock(&dev_base_lock);
- dev = __dev_get_by_name(name);
- read_unlock(&dev_base_lock);
-
- if (!dev && capable(CAP_SYS_MODULE))
- request_module("%s", name);
+ ASSERT_RTNL();
+ call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
+ call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev);
}
+EXPORT_SYMBOL(__netdev_notify_peers);
-static int default_rebuild_header(struct sk_buff *skb)
+/**
+ * netdev_notify_peers - notify network peers about existence of @dev
+ * @dev: network device
+ *
+ * Generate traffic such that interested network peers are aware of
+ * @dev, such as by generating a gratuitous ARP. This may be used when
+ * a device wants to inform the rest of the network about some sort of
+ * reconfiguration such as a failover event or virtual machine
+ * migration.
+ */
+void netdev_notify_peers(struct net_device *dev)
{
- printk(KERN_DEBUG "%s: default_rebuild_header called -- BUG!\n",
- skb->dev ? skb->dev->name : "NULL!!!");
- kfree_skb(skb);
- return 1;
+ rtnl_lock();
+ __netdev_notify_peers(dev);
+ rtnl_unlock();
}
+EXPORT_SYMBOL(netdev_notify_peers);
+static int napi_threaded_poll(void *data);
-/**
- * dev_open - prepare an interface for use.
- * @dev: device to open
- *
- * Takes a device from down to up state. The device's private open
- * function is invoked and then the multicast lists are loaded. Finally
- * the device is moved into the up state and a %NETDEV_UP message is
- * sent to the netdev notifier chain.
- *
- * Calling this function on an active interface is a nop. On a failure
- * a negative errno code is returned.
- */
-int dev_open(struct net_device *dev)
+static int napi_kthread_create(struct napi_struct *n)
{
- int ret = 0;
+ int err = 0;
- /*
- * Is it already up?
+ /* Create and wake up the kthread once to put it in
+ * TASK_INTERRUPTIBLE mode to avoid the blocked task
+ * warning and work with loadavg.
*/
+ n->thread = kthread_run(napi_threaded_poll, n, "napi/%s-%d",
+ n->dev->name, n->napi_id);
+ if (IS_ERR(n->thread)) {
+ err = PTR_ERR(n->thread);
+ pr_err("kthread_run failed with err %d\n", err);
+ n->thread = NULL;
+ }
- if (dev->flags & IFF_UP)
- return 0;
+ return err;
+}
- /*
- * Is it even present?
- */
- if (!netif_device_present(dev))
- return -ENODEV;
+static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+ int ret;
- /*
- * Call device private open method
+ ASSERT_RTNL();
+ dev_addr_check(dev);
+
+ if (!netif_device_present(dev)) {
+ /* may be detached because parent is runtime-suspended */
+ if (dev->dev.parent)
+ pm_runtime_resume(dev->dev.parent);
+ if (!netif_device_present(dev))
+ return -ENODEV;
+ }
+
+ /* Block netpoll from trying to do any rx path servicing.
+ * If we don't do this there is a chance ndo_poll_controller
+ * or ndo_poll may be running while we open the device
*/
+ netpoll_poll_disable(dev);
+
+ ret = call_netdevice_notifiers_extack(NETDEV_PRE_UP, dev, extack);
+ ret = notifier_to_errno(ret);
+ if (ret)
+ return ret;
+
set_bit(__LINK_STATE_START, &dev->state);
- if (dev->open) {
- ret = dev->open(dev);
- if (ret)
- clear_bit(__LINK_STATE_START, &dev->state);
+
+ netdev_ops_assert_locked(dev);
+
+ if (ops->ndo_validate_addr)
+ ret = ops->ndo_validate_addr(dev);
+
+ if (!ret && ops->ndo_open)
+ ret = ops->ndo_open(dev);
+
+ netpoll_poll_enable(dev);
+
+ if (ret)
+ clear_bit(__LINK_STATE_START, &dev->state);
+ else {
+ netif_set_up(dev, true);
+ dev_set_rx_mode(dev);
+ dev_activate(dev);
+ add_device_randomness(dev->dev_addr, dev->addr_len);
}
- /*
- * If it went open OK then:
- */
+ return ret;
+}
- if (!ret) {
- /*
- * Set the flags.
- */
- dev->flags |= IFF_UP;
+int netif_open(struct net_device *dev, struct netlink_ext_ack *extack)
+{
+ int ret;
- /*
- * Initialize multicasting status
- */
- dev_mc_upload(dev);
+ if (dev->flags & IFF_UP)
+ return 0;
- /*
- * Wakeup transmit queue engine
+ ret = __dev_open(dev, extack);
+ if (ret < 0)
+ return ret;
+
+ rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP | IFF_RUNNING, GFP_KERNEL, 0, NULL);
+ call_netdevice_notifiers(NETDEV_UP, dev);
+
+ return ret;
+}
+
+static void __dev_close_many(struct list_head *head)
+{
+ struct net_device *dev;
+
+ ASSERT_RTNL();
+ might_sleep();
+
+ list_for_each_entry(dev, head, close_list) {
+ /* Temporarily disable netpoll until the interface is down */
+ netpoll_poll_disable(dev);
+
+ call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
+
+ clear_bit(__LINK_STATE_START, &dev->state);
+
+ /* Synchronize to scheduled poll. We cannot touch poll list, it
+ * can be even on different cpu. So just clear netif_running().
+ *
+ * dev->stop() will invoke napi_disable() on all of it's
+ * napi_struct instances on this device.
*/
- dev_activate(dev);
+ smp_mb__after_atomic(); /* Commit netif_running(). */
+ }
+
+ dev_deactivate_many(head);
+
+ list_for_each_entry(dev, head, close_list) {
+ const struct net_device_ops *ops = dev->netdev_ops;
/*
- * ... and announce new interface.
+ * Call the device specific close. This cannot fail.
+ * Only if device is UP
+ *
+ * We allow it to be called even after a DETACH hot-plug
+ * event.
*/
- raw_notifier_call_chain(&netdev_chain, NETDEV_UP, dev);
+
+ netdev_ops_assert_locked(dev);
+
+ if (ops->ndo_stop)
+ ops->ndo_stop(dev);
+
+ netif_set_up(dev, false);
+ netpoll_poll_enable(dev);
+ }
+}
+
+static void __dev_close(struct net_device *dev)
+{
+ LIST_HEAD(single);
+
+ list_add(&dev->close_list, &single);
+ __dev_close_many(&single);
+ list_del(&single);
+}
+
+void netif_close_many(struct list_head *head, bool unlink)
+{
+ struct net_device *dev, *tmp;
+
+ /* Remove the devices that don't need to be closed */
+ list_for_each_entry_safe(dev, tmp, head, close_list)
+ if (!(dev->flags & IFF_UP))
+ list_del_init(&dev->close_list);
+
+ __dev_close_many(head);
+
+ list_for_each_entry_safe(dev, tmp, head, close_list) {
+ rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP | IFF_RUNNING, GFP_KERNEL, 0, NULL);
+ call_netdevice_notifiers(NETDEV_DOWN, dev);
+ if (unlink)
+ list_del_init(&dev->close_list);
+ }
+}
+EXPORT_SYMBOL_NS_GPL(netif_close_many, "NETDEV_INTERNAL");
+
+void netif_close(struct net_device *dev)
+{
+ if (dev->flags & IFF_UP) {
+ LIST_HEAD(single);
+
+ list_add(&dev->close_list, &single);
+ netif_close_many(&single, true);
+ list_del(&single);
+ }
+}
+EXPORT_SYMBOL(netif_close);
+
+void netif_disable_lro(struct net_device *dev)
+{
+ struct net_device *lower_dev;
+ struct list_head *iter;
+
+ dev->wanted_features &= ~NETIF_F_LRO;
+ netdev_update_features(dev);
+
+ if (unlikely(dev->features & NETIF_F_LRO))
+ netdev_WARN(dev, "failed to disable LRO!\n");
+
+ netdev_for_each_lower_dev(dev, lower_dev, iter) {
+ netdev_lock_ops(lower_dev);
+ netif_disable_lro(lower_dev);
+ netdev_unlock_ops(lower_dev);
}
- return ret;
}
+EXPORT_IPV6_MOD(netif_disable_lro);
/**
- * dev_close - shutdown an interface.
- * @dev: device to shutdown
+ * dev_disable_gro_hw - disable HW Generic Receive Offload on a device
+ * @dev: device
*
- * This function moves an active device into down state. A
- * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
- * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
- * chain.
+ * Disable HW Generic Receive Offload (GRO_HW) on a net device. Must be
+ * called under RTNL. This is needed if Generic XDP is installed on
+ * the device.
*/
-int dev_close(struct net_device *dev)
+static void dev_disable_gro_hw(struct net_device *dev)
{
- if (!(dev->flags & IFF_UP))
- return 0;
+ dev->wanted_features &= ~NETIF_F_GRO_HW;
+ netdev_update_features(dev);
- /*
- * Tell people we are going down, so that they can
- * prepare to death, when device is still operating.
- */
- raw_notifier_call_chain(&netdev_chain, NETDEV_GOING_DOWN, dev);
+ if (unlikely(dev->features & NETIF_F_GRO_HW))
+ netdev_WARN(dev, "failed to disable GRO_HW!\n");
+}
- dev_deactivate(dev);
+const char *netdev_cmd_to_name(enum netdev_cmd cmd)
+{
+#define N(val) \
+ case NETDEV_##val: \
+ return "NETDEV_" __stringify(val);
+ switch (cmd) {
+ N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER)
+ N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE)
+ N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE)
+ N(POST_INIT) N(PRE_UNINIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN)
+ N(CHANGEUPPER) N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA)
+ N(BONDING_INFO) N(PRECHANGEUPPER) N(CHANGELOWERSTATE)
+ N(UDP_TUNNEL_PUSH_INFO) N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
+ N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO)
+ N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
+ N(PRE_CHANGEADDR) N(OFFLOAD_XSTATS_ENABLE) N(OFFLOAD_XSTATS_DISABLE)
+ N(OFFLOAD_XSTATS_REPORT_USED) N(OFFLOAD_XSTATS_REPORT_DELTA)
+ N(XDP_FEAT_CHANGE)
+ }
+#undef N
+ return "UNKNOWN_NETDEV_EVENT";
+}
+EXPORT_SYMBOL_GPL(netdev_cmd_to_name);
- clear_bit(__LINK_STATE_START, &dev->state);
+static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
+ struct net_device *dev)
+{
+ struct netdev_notifier_info info = {
+ .dev = dev,
+ };
- /* Synchronize to scheduled poll. We cannot touch poll list,
- * it can be even on different cpu. So just clear netif_running(),
- * and wait when poll really will happen. Actually, the best place
- * for this is inside dev->stop() after device stopped its irq
- * engine, but this requires more changes in devices. */
+ return nb->notifier_call(nb, val, &info);
+}
- smp_mb__after_clear_bit(); /* Commit netif_running(). */
- while (test_bit(__LINK_STATE_RX_SCHED, &dev->state)) {
- /* No hurry. */
- msleep(1);
- }
+static int call_netdevice_register_notifiers(struct notifier_block *nb,
+ struct net_device *dev)
+{
+ int err;
- /*
- * Call the device specific close. This cannot fail.
- * Only if device is UP
- *
- * We allow it to be called even after a DETACH hot-plug
- * event.
- */
- if (dev->stop)
- dev->stop(dev);
+ err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
+ err = notifier_to_errno(err);
+ if (err)
+ return err;
- /*
- * Device is now down.
- */
+ if (!(dev->flags & IFF_UP))
+ return 0;
- dev->flags &= ~IFF_UP;
+ call_netdevice_notifier(nb, NETDEV_UP, dev);
+ return 0;
+}
- /*
- * Tell people we are down
- */
- raw_notifier_call_chain(&netdev_chain, NETDEV_DOWN, dev);
+static void call_netdevice_unregister_notifiers(struct notifier_block *nb,
+ struct net_device *dev)
+{
+ if (dev->flags & IFF_UP) {
+ call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
+ dev);
+ call_netdevice_notifier(nb, NETDEV_DOWN, dev);
+ }
+ call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
+}
+static int call_netdevice_register_net_notifiers(struct notifier_block *nb,
+ struct net *net)
+{
+ struct net_device *dev;
+ int err;
+
+ for_each_netdev(net, dev) {
+ netdev_lock_ops(dev);
+ err = call_netdevice_register_notifiers(nb, dev);
+ netdev_unlock_ops(dev);
+ if (err)
+ goto rollback;
+ }
return 0;
+
+rollback:
+ for_each_netdev_continue_reverse(net, dev)
+ call_netdevice_unregister_notifiers(nb, dev);
+ return err;
}
+static void call_netdevice_unregister_net_notifiers(struct notifier_block *nb,
+ struct net *net)
+{
+ struct net_device *dev;
-/*
- * Device change register/unregister. These are not inline or static
- * as we export them to the world.
- */
+ for_each_netdev(net, dev)
+ call_netdevice_unregister_notifiers(nb, dev);
+}
+
+static int dev_boot_phase = 1;
/**
- * register_netdevice_notifier - register a network notifier block
- * @nb: notifier
+ * register_netdevice_notifier - register a network notifier block
+ * @nb: notifier
*
- * Register a notifier to be called when network device events occur.
- * The notifier passed is linked into the kernel structures and must
- * not be reused until it has been unregistered. A negative errno code
- * is returned on a failure.
+ * Register a notifier to be called when network device events occur.
+ * The notifier passed is linked into the kernel structures and must
+ * not be reused until it has been unregistered. A negative errno code
+ * is returned on a failure.
*
- * When registered all registration and up events are replayed
- * to the new notifier to allow device to have a race free
- * view of the network device list.
+ * When registered all registration and up events are replayed
+ * to the new notifier to allow device to have a race free
+ * view of the network device list.
*/
int register_netdevice_notifier(struct notifier_block *nb)
{
- struct net_device *dev;
+ struct net *net;
int err;
+ /* Close race with setup_net() and cleanup_net() */
+ down_write(&pernet_ops_rwsem);
+
+ /* When RTNL is removed, we need protection for netdev_chain. */
rtnl_lock();
- err = raw_notifier_chain_register(&netdev_chain, nb);
- if (!err) {
- for (dev = dev_base; dev; dev = dev->next) {
- nb->notifier_call(nb, NETDEV_REGISTER, dev);
- if (dev->flags & IFF_UP)
- nb->notifier_call(nb, NETDEV_UP, dev);
- }
+ err = raw_notifier_chain_register(&netdev_chain, nb);
+ if (err)
+ goto unlock;
+ if (dev_boot_phase)
+ goto unlock;
+ for_each_net(net) {
+ __rtnl_net_lock(net);
+ err = call_netdevice_register_net_notifiers(nb, net);
+ __rtnl_net_unlock(net);
+ if (err)
+ goto rollback;
}
+
+unlock:
rtnl_unlock();
+ up_write(&pernet_ops_rwsem);
return err;
+
+rollback:
+ for_each_net_continue_reverse(net) {
+ __rtnl_net_lock(net);
+ call_netdevice_unregister_net_notifiers(nb, net);
+ __rtnl_net_unlock(net);
+ }
+
+ raw_notifier_chain_unregister(&netdev_chain, nb);
+ goto unlock;
}
+EXPORT_SYMBOL(register_netdevice_notifier);
/**
- * unregister_netdevice_notifier - unregister a network notifier block
- * @nb: notifier
+ * unregister_netdevice_notifier - unregister a network notifier block
+ * @nb: notifier
+ *
+ * Unregister a notifier previously registered by
+ * register_netdevice_notifier(). The notifier is unlinked into the
+ * kernel structures and may then be reused. A negative errno code
+ * is returned on a failure.
*
- * Unregister a notifier previously registered by
- * register_netdevice_notifier(). The notifier is unlinked into the
- * kernel structures and may then be reused. A negative errno code
- * is returned on a failure.
+ * After unregistering unregister and down device events are synthesized
+ * for all devices on the device list to the removed notifier to remove
+ * the need for special case cleanup code.
*/
int unregister_netdevice_notifier(struct notifier_block *nb)
{
+ struct net *net;
int err;
+ /* Close race with setup_net() and cleanup_net() */
+ down_write(&pernet_ops_rwsem);
rtnl_lock();
err = raw_notifier_chain_unregister(&netdev_chain, nb);
+ if (err)
+ goto unlock;
+
+ for_each_net(net) {
+ __rtnl_net_lock(net);
+ call_netdevice_unregister_net_notifiers(nb, net);
+ __rtnl_net_unlock(net);
+ }
+
+unlock:
rtnl_unlock();
+ up_write(&pernet_ops_rwsem);
+ return err;
+}
+EXPORT_SYMBOL(unregister_netdevice_notifier);
+
+static int __register_netdevice_notifier_net(struct net *net,
+ struct notifier_block *nb,
+ bool ignore_call_fail)
+{
+ int err;
+
+ err = raw_notifier_chain_register(&net->netdev_chain, nb);
+ if (err)
+ return err;
+ if (dev_boot_phase)
+ return 0;
+
+ err = call_netdevice_register_net_notifiers(nb, net);
+ if (err && !ignore_call_fail)
+ goto chain_unregister;
+
+ return 0;
+
+chain_unregister:
+ raw_notifier_chain_unregister(&net->netdev_chain, nb);
+ return err;
+}
+
+static int __unregister_netdevice_notifier_net(struct net *net,
+ struct notifier_block *nb)
+{
+ int err;
+
+ err = raw_notifier_chain_unregister(&net->netdev_chain, nb);
+ if (err)
+ return err;
+
+ call_netdevice_unregister_net_notifiers(nb, net);
+ return 0;
+}
+
+/**
+ * register_netdevice_notifier_net - register a per-netns network notifier block
+ * @net: network namespace
+ * @nb: notifier
+ *
+ * Register a notifier to be called when network device events occur.
+ * The notifier passed is linked into the kernel structures and must
+ * not be reused until it has been unregistered. A negative errno code
+ * is returned on a failure.
+ *
+ * When registered all registration and up events are replayed
+ * to the new notifier to allow device to have a race free
+ * view of the network device list.
+ */
+
+int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb)
+{
+ int err;
+
+ rtnl_net_lock(net);
+ err = __register_netdevice_notifier_net(net, nb, false);
+ rtnl_net_unlock(net);
+
return err;
}
+EXPORT_SYMBOL(register_netdevice_notifier_net);
+
+/**
+ * unregister_netdevice_notifier_net - unregister a per-netns
+ * network notifier block
+ * @net: network namespace
+ * @nb: notifier
+ *
+ * Unregister a notifier previously registered by
+ * register_netdevice_notifier_net(). The notifier is unlinked from the
+ * kernel structures and may then be reused. A negative errno code
+ * is returned on a failure.
+ *
+ * After unregistering unregister and down device events are synthesized
+ * for all devices on the device list to the removed notifier to remove
+ * the need for special case cleanup code.
+ */
+
+int unregister_netdevice_notifier_net(struct net *net,
+ struct notifier_block *nb)
+{
+ int err;
+
+ rtnl_net_lock(net);
+ err = __unregister_netdevice_notifier_net(net, nb);
+ rtnl_net_unlock(net);
+
+ return err;
+}
+EXPORT_SYMBOL(unregister_netdevice_notifier_net);
+
+static void __move_netdevice_notifier_net(struct net *src_net,
+ struct net *dst_net,
+ struct notifier_block *nb)
+{
+ __unregister_netdevice_notifier_net(src_net, nb);
+ __register_netdevice_notifier_net(dst_net, nb, true);
+}
+
+static void rtnl_net_dev_lock(struct net_device *dev)
+{
+ bool again;
+
+ do {
+ struct net *net;
+
+ again = false;
+
+ /* netns might be being dismantled. */
+ rcu_read_lock();
+ net = dev_net_rcu(dev);
+ net_passive_inc(net);
+ rcu_read_unlock();
+
+ rtnl_net_lock(net);
+
+#ifdef CONFIG_NET_NS
+ /* dev might have been moved to another netns. */
+ if (!net_eq(net, rcu_access_pointer(dev->nd_net.net))) {
+ rtnl_net_unlock(net);
+ net_passive_dec(net);
+ again = true;
+ }
+#endif
+ } while (again);
+}
+
+static void rtnl_net_dev_unlock(struct net_device *dev)
+{
+ struct net *net = dev_net(dev);
+
+ rtnl_net_unlock(net);
+ net_passive_dec(net);
+}
+
+int register_netdevice_notifier_dev_net(struct net_device *dev,
+ struct notifier_block *nb,
+ struct netdev_net_notifier *nn)
+{
+ int err;
+
+ rtnl_net_dev_lock(dev);
+ err = __register_netdevice_notifier_net(dev_net(dev), nb, false);
+ if (!err) {
+ nn->nb = nb;
+ list_add(&nn->list, &dev->net_notifier_list);
+ }
+ rtnl_net_dev_unlock(dev);
+
+ return err;
+}
+EXPORT_SYMBOL(register_netdevice_notifier_dev_net);
+
+int unregister_netdevice_notifier_dev_net(struct net_device *dev,
+ struct notifier_block *nb,
+ struct netdev_net_notifier *nn)
+{
+ int err;
+
+ rtnl_net_dev_lock(dev);
+ list_del(&nn->list);
+ err = __unregister_netdevice_notifier_net(dev_net(dev), nb);
+ rtnl_net_dev_unlock(dev);
+
+ return err;
+}
+EXPORT_SYMBOL(unregister_netdevice_notifier_dev_net);
+
+static void move_netdevice_notifiers_dev_net(struct net_device *dev,
+ struct net *net)
+{
+ struct netdev_net_notifier *nn;
+
+ list_for_each_entry(nn, &dev->net_notifier_list, list)
+ __move_netdevice_notifier_net(dev_net(dev), net, nn->nb);
+}
+
+/**
+ * call_netdevice_notifiers_info - call all network notifier blocks
+ * @val: value passed unmodified to notifier function
+ * @info: notifier information data
+ *
+ * Call all network notifier blocks. Parameters and return value
+ * are as for raw_notifier_call_chain().
+ */
+
+int call_netdevice_notifiers_info(unsigned long val,
+ struct netdev_notifier_info *info)
+{
+ struct net *net = dev_net(info->dev);
+ int ret;
+
+ ASSERT_RTNL();
+
+ /* Run per-netns notifier block chain first, then run the global one.
+ * Hopefully, one day, the global one is going to be removed after
+ * all notifier block registrators get converted to be per-netns.
+ */
+ ret = raw_notifier_call_chain(&net->netdev_chain, val, info);
+ if (ret & NOTIFY_STOP_MASK)
+ return ret;
+ return raw_notifier_call_chain(&netdev_chain, val, info);
+}
+
+/**
+ * call_netdevice_notifiers_info_robust - call per-netns notifier blocks
+ * for and rollback on error
+ * @val_up: value passed unmodified to notifier function
+ * @val_down: value passed unmodified to the notifier function when
+ * recovering from an error on @val_up
+ * @info: notifier information data
+ *
+ * Call all per-netns network notifier blocks, but not notifier blocks on
+ * the global notifier chain. Parameters and return value are as for
+ * raw_notifier_call_chain_robust().
+ */
+
+static int
+call_netdevice_notifiers_info_robust(unsigned long val_up,
+ unsigned long val_down,
+ struct netdev_notifier_info *info)
+{
+ struct net *net = dev_net(info->dev);
+
+ ASSERT_RTNL();
+
+ return raw_notifier_call_chain_robust(&net->netdev_chain,
+ val_up, val_down, info);
+}
+
+static int call_netdevice_notifiers_extack(unsigned long val,
+ struct net_device *dev,
+ struct netlink_ext_ack *extack)
+{
+ struct netdev_notifier_info info = {
+ .dev = dev,
+ .extack = extack,
+ };
+
+ return call_netdevice_notifiers_info(val, &info);
+}
/**
* call_netdevice_notifiers - call all network notifier blocks
* @val: value passed unmodified to notifier function
- * @v: pointer passed unmodified to notifier function
+ * @dev: net_device pointer passed unmodified to notifier function
*
* Call all network notifier blocks. Parameters and return value
* are as for raw_notifier_call_chain().
*/
-int call_netdevice_notifiers(unsigned long val, void *v)
+int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
{
- return raw_notifier_call_chain(&netdev_chain, val, v);
+ return call_netdevice_notifiers_extack(val, dev, NULL);
}
+EXPORT_SYMBOL(call_netdevice_notifiers);
+
+/**
+ * call_netdevice_notifiers_mtu - call all network notifier blocks
+ * @val: value passed unmodified to notifier function
+ * @dev: net_device pointer passed unmodified to notifier function
+ * @arg: additional u32 argument passed to the notifier function
+ *
+ * Call all network notifier blocks. Parameters and return value
+ * are as for raw_notifier_call_chain().
+ */
+static int call_netdevice_notifiers_mtu(unsigned long val,
+ struct net_device *dev, u32 arg)
+{
+ struct netdev_notifier_info_ext info = {
+ .info.dev = dev,
+ .ext.mtu = arg,
+ };
+
+ BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0);
+
+ return call_netdevice_notifiers_info(val, &info.info);
+}
+
+#ifdef CONFIG_NET_INGRESS
+static DEFINE_STATIC_KEY_FALSE(ingress_needed_key);
+
+void net_inc_ingress_queue(void)
+{
+ static_branch_inc(&ingress_needed_key);
+}
+EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
+
+void net_dec_ingress_queue(void)
+{
+ static_branch_dec(&ingress_needed_key);
+}
+EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
+#endif
+
+#ifdef CONFIG_NET_EGRESS
+static DEFINE_STATIC_KEY_FALSE(egress_needed_key);
+
+void net_inc_egress_queue(void)
+{
+ static_branch_inc(&egress_needed_key);
+}
+EXPORT_SYMBOL_GPL(net_inc_egress_queue);
+
+void net_dec_egress_queue(void)
+{
+ static_branch_dec(&egress_needed_key);
+}
+EXPORT_SYMBOL_GPL(net_dec_egress_queue);
+#endif
-/* When > 0 there are consumers of rx skb time stamps */
-static atomic_t netstamp_needed = ATOMIC_INIT(0);
+#ifdef CONFIG_NET_CLS_ACT
+DEFINE_STATIC_KEY_FALSE(tcf_sw_enabled_key);
+EXPORT_SYMBOL(tcf_sw_enabled_key);
+#endif
+
+DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);
+EXPORT_SYMBOL(netstamp_needed_key);
+#ifdef CONFIG_JUMP_LABEL
+static atomic_t netstamp_needed_deferred;
+static atomic_t netstamp_wanted;
+static void netstamp_clear(struct work_struct *work)
+{
+ int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
+ int wanted;
+
+ wanted = atomic_add_return(deferred, &netstamp_wanted);
+ if (wanted > 0)
+ static_branch_enable(&netstamp_needed_key);
+ else
+ static_branch_disable(&netstamp_needed_key);
+}
+static DECLARE_WORK(netstamp_work, netstamp_clear);
+#endif
void net_enable_timestamp(void)
{
- atomic_inc(&netstamp_needed);
+#ifdef CONFIG_JUMP_LABEL
+ int wanted = atomic_read(&netstamp_wanted);
+
+ while (wanted > 0) {
+ if (atomic_try_cmpxchg(&netstamp_wanted, &wanted, wanted + 1))
+ return;
+ }
+ atomic_inc(&netstamp_needed_deferred);
+ schedule_work(&netstamp_work);
+#else
+ static_branch_inc(&netstamp_needed_key);
+#endif
}
+EXPORT_SYMBOL(net_enable_timestamp);
void net_disable_timestamp(void)
{
- atomic_dec(&netstamp_needed);
+#ifdef CONFIG_JUMP_LABEL
+ int wanted = atomic_read(&netstamp_wanted);
+
+ while (wanted > 1) {
+ if (atomic_try_cmpxchg(&netstamp_wanted, &wanted, wanted - 1))
+ return;
+ }
+ atomic_dec(&netstamp_needed_deferred);
+ schedule_work(&netstamp_work);
+#else
+ static_branch_dec(&netstamp_needed_key);
+#endif
}
+EXPORT_SYMBOL(net_disable_timestamp);
-void __net_timestamp(struct sk_buff *skb)
+static inline void net_timestamp_set(struct sk_buff *skb)
{
- struct timeval tv;
+ skb->tstamp = 0;
+ skb->tstamp_type = SKB_CLOCK_REALTIME;
+ if (static_branch_unlikely(&netstamp_needed_key))
+ skb->tstamp = ktime_get_real();
+}
+
+#define net_timestamp_check(COND, SKB) \
+ if (static_branch_unlikely(&netstamp_needed_key)) { \
+ if ((COND) && !(SKB)->tstamp) \
+ (SKB)->tstamp = ktime_get_real(); \
+ } \
- do_gettimeofday(&tv);
- skb_set_timestamp(skb, &tv);
+bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
+{
+ return __is_skb_forwardable(dev, skb, true);
}
-EXPORT_SYMBOL(__net_timestamp);
+EXPORT_SYMBOL_GPL(is_skb_forwardable);
-static inline void net_timestamp(struct sk_buff *skb)
+static int __dev_forward_skb2(struct net_device *dev, struct sk_buff *skb,
+ bool check_mtu)
{
- if (atomic_read(&netstamp_needed))
- __net_timestamp(skb);
- else {
- skb->tstamp.off_sec = 0;
- skb->tstamp.off_usec = 0;
+ int ret = ____dev_forward_skb(dev, skb, check_mtu);
+
+ if (likely(!ret)) {
+ skb->protocol = eth_type_trans(skb, dev);
+ skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
}
+
+ return ret;
+}
+
+int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
+{
+ return __dev_forward_skb2(dev, skb, true);
}
+EXPORT_SYMBOL_GPL(__dev_forward_skb);
+
+/**
+ * dev_forward_skb - loopback an skb to another netif
+ *
+ * @dev: destination network device
+ * @skb: buffer to forward
+ *
+ * return values:
+ * NET_RX_SUCCESS (no congestion)
+ * NET_RX_DROP (packet was dropped, but freed)
+ *
+ * dev_forward_skb can be used for injecting an skb from the
+ * start_xmit function of one device into the receive queue
+ * of another device.
+ *
+ * The receiving device may be in another namespace, so
+ * we have to clear all information in the skb that could
+ * impact namespace isolation.
+ */
+int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
+{
+ return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
+}
+EXPORT_SYMBOL_GPL(dev_forward_skb);
+
+int dev_forward_skb_nomtu(struct net_device *dev, struct sk_buff *skb)
+{
+ return __dev_forward_skb2(dev, skb, false) ?: netif_rx_internal(skb);
+}
+
+static int deliver_skb(struct sk_buff *skb,
+ struct packet_type *pt_prev,
+ struct net_device *orig_dev)
+{
+ if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
+ return -ENOMEM;
+ refcount_inc(&skb->users);
+ return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
+}
+
+static inline void deliver_ptype_list_skb(struct sk_buff *skb,
+ struct packet_type **pt,
+ struct net_device *orig_dev,
+ __be16 type,
+ struct list_head *ptype_list)
+{
+ struct packet_type *ptype, *pt_prev = *pt;
+
+ list_for_each_entry_rcu(ptype, ptype_list, list) {
+ if (ptype->type != type)
+ continue;
+ if (unlikely(pt_prev))
+ deliver_skb(skb, pt_prev, orig_dev);
+ pt_prev = ptype;
+ }
+ *pt = pt_prev;
+}
+
+static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
+{
+ if (!ptype->af_packet_priv || !skb->sk)
+ return false;
+
+ if (ptype->id_match)
+ return ptype->id_match(ptype, skb->sk);
+ else if ((struct sock *)ptype->af_packet_priv == skb->sk)
+ return true;
+
+ return false;
+}
+
+/**
+ * dev_nit_active_rcu - return true if any network interface taps are in use
+ *
+ * The caller must hold the RCU lock
+ *
+ * @dev: network device to check for the presence of taps
+ */
+bool dev_nit_active_rcu(const struct net_device *dev)
+{
+ /* Callers may hold either RCU or RCU BH lock */
+ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
+
+ return !list_empty(&dev_net(dev)->ptype_all) ||
+ !list_empty(&dev->ptype_all);
+}
+EXPORT_SYMBOL_GPL(dev_nit_active_rcu);
/*
* Support routine. Sends outgoing frames to any network
* taps currently in use.
*/
-static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
+void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
{
- struct packet_type *ptype;
-
- net_timestamp(skb);
+ struct packet_type *ptype, *pt_prev = NULL;
+ struct list_head *ptype_list;
+ struct sk_buff *skb2 = NULL;
rcu_read_lock();
- list_for_each_entry_rcu(ptype, &ptype_all, list) {
+ ptype_list = &dev_net_rcu(dev)->ptype_all;
+again:
+ list_for_each_entry_rcu(ptype, ptype_list, list) {
+ if (READ_ONCE(ptype->ignore_outgoing))
+ continue;
+
/* Never send packets back to the socket
* they originated from - MvS (miquels@drinkel.ow.org)
*/
- if ((ptype->dev == dev || !ptype->dev) &&
- (ptype->af_packet_priv == NULL ||
- (struct sock *)ptype->af_packet_priv != skb->sk)) {
- struct sk_buff *skb2= skb_clone(skb, GFP_ATOMIC);
- if (!skb2)
+ if (skb_loop_sk(ptype, skb))
+ continue;
+
+ if (unlikely(pt_prev)) {
+ deliver_skb(skb2, pt_prev, skb->dev);
+ pt_prev = ptype;
+ continue;
+ }
+
+ /* need to clone skb, done only once */
+ skb2 = skb_clone(skb, GFP_ATOMIC);
+ if (!skb2)
+ goto out_unlock;
+
+ net_timestamp_set(skb2);
+
+ /* skb->nh should be correctly
+ * set by sender, so that the second statement is
+ * just protection against buggy protocols.
+ */
+ skb_reset_mac_header(skb2);
+
+ if (skb_network_header(skb2) < skb2->data ||
+ skb_network_header(skb2) > skb_tail_pointer(skb2)) {
+ net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
+ ntohs(skb2->protocol),
+ dev->name);
+ skb_reset_network_header(skb2);
+ }
+
+ skb2->transport_header = skb2->network_header;
+ skb2->pkt_type = PACKET_OUTGOING;
+ pt_prev = ptype;
+ }
+
+ if (ptype_list != &dev->ptype_all) {
+ ptype_list = &dev->ptype_all;
+ goto again;
+ }
+out_unlock:
+ if (pt_prev) {
+ if (!skb_orphan_frags_rx(skb2, GFP_ATOMIC))
+ pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
+ else
+ kfree_skb(skb2);
+ }
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
+
+/**
+ * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
+ * @dev: Network device
+ * @txq: number of queues available
+ *
+ * If real_num_tx_queues is changed the tc mappings may no longer be
+ * valid. To resolve this verify the tc mapping remains valid and if
+ * not NULL the mapping. With no priorities mapping to this
+ * offset/count pair it will no longer be used. In the worst case TC0
+ * is invalid nothing can be done so disable priority mappings. If is
+ * expected that drivers will fix this mapping if they can before
+ * calling netif_set_real_num_tx_queues.
+ */
+static void netif_setup_tc(struct net_device *dev, unsigned int txq)
+{
+ int i;
+ struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
+
+ /* If TC0 is invalidated disable TC mapping */
+ if (tc->offset + tc->count > txq) {
+ netdev_warn(dev, "Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
+ dev->num_tc = 0;
+ return;
+ }
+
+ /* Invalidated prio to tc mappings set to TC0 */
+ for (i = 1; i < TC_BITMASK + 1; i++) {
+ int q = netdev_get_prio_tc_map(dev, i);
+
+ tc = &dev->tc_to_txq[q];
+ if (tc->offset + tc->count > txq) {
+ netdev_warn(dev, "Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
+ i, q);
+ netdev_set_prio_tc_map(dev, i, 0);
+ }
+ }
+}
+
+int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
+{
+ if (dev->num_tc) {
+ struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
+ int i;
+
+ /* walk through the TCs and see if it falls into any of them */
+ for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
+ if ((txq - tc->offset) < tc->count)
+ return i;
+ }
+
+ /* didn't find it, just return -1 to indicate no match */
+ return -1;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(netdev_txq_to_tc);
+
+#ifdef CONFIG_XPS
+static struct static_key xps_needed __read_mostly;
+static struct static_key xps_rxqs_needed __read_mostly;
+static DEFINE_MUTEX(xps_map_mutex);
+#define xmap_dereference(P) \
+ rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
+
+static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
+ struct xps_dev_maps *old_maps, int tci, u16 index)
+{
+ struct xps_map *map = NULL;
+ int pos;
+
+ map = xmap_dereference(dev_maps->attr_map[tci]);
+ if (!map)
+ return false;
+
+ for (pos = map->len; pos--;) {
+ if (map->queues[pos] != index)
+ continue;
+
+ if (map->len > 1) {
+ map->queues[pos] = map->queues[--map->len];
+ break;
+ }
+
+ if (old_maps)
+ RCU_INIT_POINTER(old_maps->attr_map[tci], NULL);
+ RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
+ kfree_rcu(map, rcu);
+ return false;
+ }
+
+ return true;
+}
+
+static bool remove_xps_queue_cpu(struct net_device *dev,
+ struct xps_dev_maps *dev_maps,
+ int cpu, u16 offset, u16 count)
+{
+ int num_tc = dev_maps->num_tc;
+ bool active = false;
+ int tci;
+
+ for (tci = cpu * num_tc; num_tc--; tci++) {
+ int i, j;
+
+ for (i = count, j = offset; i--; j++) {
+ if (!remove_xps_queue(dev_maps, NULL, tci, j))
break;
+ }
- /* skb->nh should be correctly
- set by sender, so that the second statement is
- just protection against buggy protocols.
- */
- skb2->mac.raw = skb2->data;
-
- if (skb2->nh.raw < skb2->data ||
- skb2->nh.raw > skb2->tail) {
- if (net_ratelimit())
- printk(KERN_CRIT "protocol %04x is "
- "buggy, dev %s\n",
- skb2->protocol, dev->name);
- skb2->nh.raw = skb2->data;
+ active |= i < 0;
+ }
+
+ return active;
+}
+
+static void reset_xps_maps(struct net_device *dev,
+ struct xps_dev_maps *dev_maps,
+ enum xps_map_type type)
+{
+ static_key_slow_dec_cpuslocked(&xps_needed);
+ if (type == XPS_RXQS)
+ static_key_slow_dec_cpuslocked(&xps_rxqs_needed);
+
+ RCU_INIT_POINTER(dev->xps_maps[type], NULL);
+
+ kfree_rcu(dev_maps, rcu);
+}
+
+static void clean_xps_maps(struct net_device *dev, enum xps_map_type type,
+ u16 offset, u16 count)
+{
+ struct xps_dev_maps *dev_maps;
+ bool active = false;
+ int i, j;
+
+ dev_maps = xmap_dereference(dev->xps_maps[type]);
+ if (!dev_maps)
+ return;
+
+ for (j = 0; j < dev_maps->nr_ids; j++)
+ active |= remove_xps_queue_cpu(dev, dev_maps, j, offset, count);
+ if (!active)
+ reset_xps_maps(dev, dev_maps, type);
+
+ if (type == XPS_CPUS) {
+ for (i = offset + (count - 1); count--; i--)
+ netdev_queue_numa_node_write(
+ netdev_get_tx_queue(dev, i), NUMA_NO_NODE);
+ }
+}
+
+static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
+ u16 count)
+{
+ if (!static_key_false(&xps_needed))
+ return;
+
+ cpus_read_lock();
+ mutex_lock(&xps_map_mutex);
+
+ if (static_key_false(&xps_rxqs_needed))
+ clean_xps_maps(dev, XPS_RXQS, offset, count);
+
+ clean_xps_maps(dev, XPS_CPUS, offset, count);
+
+ mutex_unlock(&xps_map_mutex);
+ cpus_read_unlock();
+}
+
+static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
+{
+ netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
+}
+
+static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index,
+ u16 index, bool is_rxqs_map)
+{
+ struct xps_map *new_map;
+ int alloc_len = XPS_MIN_MAP_ALLOC;
+ int i, pos;
+
+ for (pos = 0; map && pos < map->len; pos++) {
+ if (map->queues[pos] != index)
+ continue;
+ return map;
+ }
+
+ /* Need to add tx-queue to this CPU's/rx-queue's existing map */
+ if (map) {
+ if (pos < map->alloc_len)
+ return map;
+
+ alloc_len = map->alloc_len * 2;
+ }
+
+ /* Need to allocate new map to store tx-queue on this CPU's/rx-queue's
+ * map
+ */
+ if (is_rxqs_map)
+ new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL);
+ else
+ new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
+ cpu_to_node(attr_index));
+ if (!new_map)
+ return NULL;
+
+ for (i = 0; i < pos; i++)
+ new_map->queues[i] = map->queues[i];
+ new_map->alloc_len = alloc_len;
+ new_map->len = pos;
+
+ return new_map;
+}
+
+/* Copy xps maps at a given index */
+static void xps_copy_dev_maps(struct xps_dev_maps *dev_maps,
+ struct xps_dev_maps *new_dev_maps, int index,
+ int tc, bool skip_tc)
+{
+ int i, tci = index * dev_maps->num_tc;
+ struct xps_map *map;
+
+ /* copy maps belonging to foreign traffic classes */
+ for (i = 0; i < dev_maps->num_tc; i++, tci++) {
+ if (i == tc && skip_tc)
+ continue;
+
+ /* fill in the new device map from the old device map */
+ map = xmap_dereference(dev_maps->attr_map[tci]);
+ RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
+ }
+}
+
+/* Must be called under cpus_read_lock */
+int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
+ u16 index, enum xps_map_type type)
+{
+ struct xps_dev_maps *dev_maps, *new_dev_maps = NULL, *old_dev_maps = NULL;
+ const unsigned long *online_mask = NULL;
+ bool active = false, copy = false;
+ int i, j, tci, numa_node_id = -2;
+ int maps_sz, num_tc = 1, tc = 0;
+ struct xps_map *map, *new_map;
+ unsigned int nr_ids;
+
+ WARN_ON_ONCE(index >= dev->num_tx_queues);
+
+ if (dev->num_tc) {
+ /* Do not allow XPS on subordinate device directly */
+ num_tc = dev->num_tc;
+ if (num_tc < 0)
+ return -EINVAL;
+
+ /* If queue belongs to subordinate dev use its map */
+ dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
+
+ tc = netdev_txq_to_tc(dev, index);
+ if (tc < 0)
+ return -EINVAL;
+ }
+
+ mutex_lock(&xps_map_mutex);
+
+ dev_maps = xmap_dereference(dev->xps_maps[type]);
+ if (type == XPS_RXQS) {
+ maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues);
+ nr_ids = dev->num_rx_queues;
+ } else {
+ maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc);
+ if (num_possible_cpus() > 1)
+ online_mask = cpumask_bits(cpu_online_mask);
+ nr_ids = nr_cpu_ids;
+ }
+
+ if (maps_sz < L1_CACHE_BYTES)
+ maps_sz = L1_CACHE_BYTES;
+
+ /* The old dev_maps could be larger or smaller than the one we're
+ * setting up now, as dev->num_tc or nr_ids could have been updated in
+ * between. We could try to be smart, but let's be safe instead and only
+ * copy foreign traffic classes if the two map sizes match.
+ */
+ if (dev_maps &&
+ dev_maps->num_tc == num_tc && dev_maps->nr_ids == nr_ids)
+ copy = true;
+
+ /* allocate memory for queue storage */
+ for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids),
+ j < nr_ids;) {
+ if (!new_dev_maps) {
+ new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
+ if (!new_dev_maps) {
+ mutex_unlock(&xps_map_mutex);
+ return -ENOMEM;
}
- skb2->h.raw = skb2->nh.raw;
- skb2->pkt_type = PACKET_OUTGOING;
- ptype->func(skb2, skb->dev, ptype, skb->dev);
+ new_dev_maps->nr_ids = nr_ids;
+ new_dev_maps->num_tc = num_tc;
}
+
+ tci = j * num_tc + tc;
+ map = copy ? xmap_dereference(dev_maps->attr_map[tci]) : NULL;
+
+ map = expand_xps_map(map, j, index, type == XPS_RXQS);
+ if (!map)
+ goto error;
+
+ RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
}
- rcu_read_unlock();
+
+ if (!new_dev_maps)
+ goto out_no_new_maps;
+
+ if (!dev_maps) {
+ /* Increment static keys at most once per type */
+ static_key_slow_inc_cpuslocked(&xps_needed);
+ if (type == XPS_RXQS)
+ static_key_slow_inc_cpuslocked(&xps_rxqs_needed);
+ }
+
+ for (j = 0; j < nr_ids; j++) {
+ bool skip_tc = false;
+
+ tci = j * num_tc + tc;
+ if (netif_attr_test_mask(j, mask, nr_ids) &&
+ netif_attr_test_online(j, online_mask, nr_ids)) {
+ /* add tx-queue to CPU/rx-queue maps */
+ int pos = 0;
+
+ skip_tc = true;
+
+ map = xmap_dereference(new_dev_maps->attr_map[tci]);
+ while ((pos < map->len) && (map->queues[pos] != index))
+ pos++;
+
+ if (pos == map->len)
+ map->queues[map->len++] = index;
+#ifdef CONFIG_NUMA
+ if (type == XPS_CPUS) {
+ if (numa_node_id == -2)
+ numa_node_id = cpu_to_node(j);
+ else if (numa_node_id != cpu_to_node(j))
+ numa_node_id = -1;
+ }
+#endif
+ }
+
+ if (copy)
+ xps_copy_dev_maps(dev_maps, new_dev_maps, j, tc,
+ skip_tc);
+ }
+
+ rcu_assign_pointer(dev->xps_maps[type], new_dev_maps);
+
+ /* Cleanup old maps */
+ if (!dev_maps)
+ goto out_no_old_maps;
+
+ for (j = 0; j < dev_maps->nr_ids; j++) {
+ for (i = num_tc, tci = j * dev_maps->num_tc; i--; tci++) {
+ map = xmap_dereference(dev_maps->attr_map[tci]);
+ if (!map)
+ continue;
+
+ if (copy) {
+ new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
+ if (map == new_map)
+ continue;
+ }
+
+ RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
+ kfree_rcu(map, rcu);
+ }
+ }
+
+ old_dev_maps = dev_maps;
+
+out_no_old_maps:
+ dev_maps = new_dev_maps;
+ active = true;
+
+out_no_new_maps:
+ if (type == XPS_CPUS)
+ /* update Tx queue numa node */
+ netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
+ (numa_node_id >= 0) ?
+ numa_node_id : NUMA_NO_NODE);
+
+ if (!dev_maps)
+ goto out_no_maps;
+
+ /* removes tx-queue from unused CPUs/rx-queues */
+ for (j = 0; j < dev_maps->nr_ids; j++) {
+ tci = j * dev_maps->num_tc;
+
+ for (i = 0; i < dev_maps->num_tc; i++, tci++) {
+ if (i == tc &&
+ netif_attr_test_mask(j, mask, dev_maps->nr_ids) &&
+ netif_attr_test_online(j, online_mask, dev_maps->nr_ids))
+ continue;
+
+ active |= remove_xps_queue(dev_maps,
+ copy ? old_dev_maps : NULL,
+ tci, index);
+ }
+ }
+
+ if (old_dev_maps)
+ kfree_rcu(old_dev_maps, rcu);
+
+ /* free map if not active */
+ if (!active)
+ reset_xps_maps(dev, dev_maps, type);
+
+out_no_maps:
+ mutex_unlock(&xps_map_mutex);
+
+ return 0;
+error:
+ /* remove any maps that we added */
+ for (j = 0; j < nr_ids; j++) {
+ for (i = num_tc, tci = j * num_tc; i--; tci++) {
+ new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
+ map = copy ?
+ xmap_dereference(dev_maps->attr_map[tci]) :
+ NULL;
+ if (new_map && new_map != map)
+ kfree(new_map);
+ }
+ }
+
+ mutex_unlock(&xps_map_mutex);
+
+ kfree(new_dev_maps);
+ return -ENOMEM;
}
+EXPORT_SYMBOL_GPL(__netif_set_xps_queue);
+int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
+ u16 index)
+{
+ int ret;
+
+ cpus_read_lock();
+ ret = __netif_set_xps_queue(dev, cpumask_bits(mask), index, XPS_CPUS);
+ cpus_read_unlock();
-void __netif_schedule(struct net_device *dev)
+ return ret;
+}
+EXPORT_SYMBOL(netif_set_xps_queue);
+
+#endif
+static void netdev_unbind_all_sb_channels(struct net_device *dev)
{
- if (!test_and_set_bit(__LINK_STATE_SCHED, &dev->state)) {
- unsigned long flags;
- struct softnet_data *sd;
+ struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
- local_irq_save(flags);
- sd = &__get_cpu_var(softnet_data);
- dev->next_sched = sd->output_queue;
- sd->output_queue = dev;
- raise_softirq_irqoff(NET_TX_SOFTIRQ);
- local_irq_restore(flags);
+ /* Unbind any subordinate channels */
+ while (txq-- != &dev->_tx[0]) {
+ if (txq->sb_dev)
+ netdev_unbind_sb_channel(dev, txq->sb_dev);
+ }
+}
+
+void netdev_reset_tc(struct net_device *dev)
+{
+#ifdef CONFIG_XPS
+ netif_reset_xps_queues_gt(dev, 0);
+#endif
+ netdev_unbind_all_sb_channels(dev);
+
+ /* Reset TC configuration of device */
+ dev->num_tc = 0;
+ memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
+ memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
+}
+EXPORT_SYMBOL(netdev_reset_tc);
+
+int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
+{
+ if (tc >= dev->num_tc)
+ return -EINVAL;
+
+#ifdef CONFIG_XPS
+ netif_reset_xps_queues(dev, offset, count);
+#endif
+ dev->tc_to_txq[tc].count = count;
+ dev->tc_to_txq[tc].offset = offset;
+ return 0;
+}
+EXPORT_SYMBOL(netdev_set_tc_queue);
+
+int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
+{
+ if (num_tc > TC_MAX_QUEUE)
+ return -EINVAL;
+
+#ifdef CONFIG_XPS
+ netif_reset_xps_queues_gt(dev, 0);
+#endif
+ netdev_unbind_all_sb_channels(dev);
+
+ dev->num_tc = num_tc;
+ return 0;
+}
+EXPORT_SYMBOL(netdev_set_num_tc);
+
+void netdev_unbind_sb_channel(struct net_device *dev,
+ struct net_device *sb_dev)
+{
+ struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
+
+#ifdef CONFIG_XPS
+ netif_reset_xps_queues_gt(sb_dev, 0);
+#endif
+ memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq));
+ memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map));
+
+ while (txq-- != &dev->_tx[0]) {
+ if (txq->sb_dev == sb_dev)
+ txq->sb_dev = NULL;
+ }
+}
+EXPORT_SYMBOL(netdev_unbind_sb_channel);
+
+int netdev_bind_sb_channel_queue(struct net_device *dev,
+ struct net_device *sb_dev,
+ u8 tc, u16 count, u16 offset)
+{
+ /* Make certain the sb_dev and dev are already configured */
+ if (sb_dev->num_tc >= 0 || tc >= dev->num_tc)
+ return -EINVAL;
+
+ /* We cannot hand out queues we don't have */
+ if ((offset + count) > dev->real_num_tx_queues)
+ return -EINVAL;
+
+ /* Record the mapping */
+ sb_dev->tc_to_txq[tc].count = count;
+ sb_dev->tc_to_txq[tc].offset = offset;
+
+ /* Provide a way for Tx queue to find the tc_to_txq map or
+ * XPS map for itself.
+ */
+ while (count--)
+ netdev_get_tx_queue(dev, count + offset)->sb_dev = sb_dev;
+
+ return 0;
+}
+EXPORT_SYMBOL(netdev_bind_sb_channel_queue);
+
+int netdev_set_sb_channel(struct net_device *dev, u16 channel)
+{
+ /* Do not use a multiqueue device to represent a subordinate channel */
+ if (netif_is_multiqueue(dev))
+ return -ENODEV;
+
+ /* We allow channels 1 - 32767 to be used for subordinate channels.
+ * Channel 0 is meant to be "native" mode and used only to represent
+ * the main root device. We allow writing 0 to reset the device back
+ * to normal mode after being used as a subordinate channel.
+ */
+ if (channel > S16_MAX)
+ return -EINVAL;
+
+ dev->num_tc = -channel;
+
+ return 0;
+}
+EXPORT_SYMBOL(netdev_set_sb_channel);
+
+/*
+ * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
+ * greater than real_num_tx_queues stale skbs on the qdisc must be flushed.
+ */
+int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
+{
+ bool disabling;
+ int rc;
+
+ disabling = txq < dev->real_num_tx_queues;
+
+ if (txq < 1 || txq > dev->num_tx_queues)
+ return -EINVAL;
+
+ if (dev->reg_state == NETREG_REGISTERED ||
+ dev->reg_state == NETREG_UNREGISTERING) {
+ netdev_ops_assert_locked(dev);
+
+ rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
+ txq);
+ if (rc)
+ return rc;
+
+ if (dev->num_tc)
+ netif_setup_tc(dev, txq);
+
+ net_shaper_set_real_num_tx_queues(dev, txq);
+
+ dev_qdisc_change_real_num_tx(dev, txq);
+
+ dev->real_num_tx_queues = txq;
+
+ if (disabling) {
+ synchronize_net();
+ qdisc_reset_all_tx_gt(dev, txq);
+#ifdef CONFIG_XPS
+ netif_reset_xps_queues_gt(dev, txq);
+#endif
+ }
+ } else {
+ dev->real_num_tx_queues = txq;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(netif_set_real_num_tx_queues);
+
+/**
+ * netif_set_real_num_rx_queues - set actual number of RX queues used
+ * @dev: Network device
+ * @rxq: Actual number of RX queues
+ *
+ * This must be called either with the rtnl_lock held or before
+ * registration of the net device. Returns 0 on success, or a
+ * negative error code. If called before registration, it always
+ * succeeds.
+ */
+int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
+{
+ int rc;
+
+ if (rxq < 1 || rxq > dev->num_rx_queues)
+ return -EINVAL;
+
+ if (dev->reg_state == NETREG_REGISTERED) {
+ netdev_ops_assert_locked(dev);
+
+ rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
+ rxq);
+ if (rc)
+ return rc;
+ }
+
+ dev->real_num_rx_queues = rxq;
+ return 0;
+}
+EXPORT_SYMBOL(netif_set_real_num_rx_queues);
+
+/**
+ * netif_set_real_num_queues - set actual number of RX and TX queues used
+ * @dev: Network device
+ * @txq: Actual number of TX queues
+ * @rxq: Actual number of RX queues
+ *
+ * Set the real number of both TX and RX queues.
+ * Does nothing if the number of queues is already correct.
+ */
+int netif_set_real_num_queues(struct net_device *dev,
+ unsigned int txq, unsigned int rxq)
+{
+ unsigned int old_rxq = dev->real_num_rx_queues;
+ int err;
+
+ if (txq < 1 || txq > dev->num_tx_queues ||
+ rxq < 1 || rxq > dev->num_rx_queues)
+ return -EINVAL;
+
+ /* Start from increases, so the error path only does decreases -
+ * decreases can't fail.
+ */
+ if (rxq > dev->real_num_rx_queues) {
+ err = netif_set_real_num_rx_queues(dev, rxq);
+ if (err)
+ return err;
+ }
+ if (txq > dev->real_num_tx_queues) {
+ err = netif_set_real_num_tx_queues(dev, txq);
+ if (err)
+ goto undo_rx;
+ }
+ if (rxq < dev->real_num_rx_queues)
+ WARN_ON(netif_set_real_num_rx_queues(dev, rxq));
+ if (txq < dev->real_num_tx_queues)
+ WARN_ON(netif_set_real_num_tx_queues(dev, txq));
+
+ return 0;
+undo_rx:
+ WARN_ON(netif_set_real_num_rx_queues(dev, old_rxq));
+ return err;
+}
+EXPORT_SYMBOL(netif_set_real_num_queues);
+
+/**
+ * netif_set_tso_max_size() - set the max size of TSO frames supported
+ * @dev: netdev to update
+ * @size: max skb->len of a TSO frame
+ *
+ * Set the limit on the size of TSO super-frames the device can handle.
+ * Unless explicitly set the stack will assume the value of
+ * %GSO_LEGACY_MAX_SIZE.
+ */
+void netif_set_tso_max_size(struct net_device *dev, unsigned int size)
+{
+ dev->tso_max_size = min(GSO_MAX_SIZE, size);
+ if (size < READ_ONCE(dev->gso_max_size))
+ netif_set_gso_max_size(dev, size);
+ if (size < READ_ONCE(dev->gso_ipv4_max_size))
+ netif_set_gso_ipv4_max_size(dev, size);
+}
+EXPORT_SYMBOL(netif_set_tso_max_size);
+
+/**
+ * netif_set_tso_max_segs() - set the max number of segs supported for TSO
+ * @dev: netdev to update
+ * @segs: max number of TCP segments
+ *
+ * Set the limit on the number of TCP segments the device can generate from
+ * a single TSO super-frame.
+ * Unless explicitly set the stack will assume the value of %GSO_MAX_SEGS.
+ */
+void netif_set_tso_max_segs(struct net_device *dev, unsigned int segs)
+{
+ dev->tso_max_segs = segs;
+ if (segs < READ_ONCE(dev->gso_max_segs))
+ netif_set_gso_max_segs(dev, segs);
+}
+EXPORT_SYMBOL(netif_set_tso_max_segs);
+
+/**
+ * netif_inherit_tso_max() - copy all TSO limits from a lower device to an upper
+ * @to: netdev to update
+ * @from: netdev from which to copy the limits
+ */
+void netif_inherit_tso_max(struct net_device *to, const struct net_device *from)
+{
+ netif_set_tso_max_size(to, from->tso_max_size);
+ netif_set_tso_max_segs(to, from->tso_max_segs);
+}
+EXPORT_SYMBOL(netif_inherit_tso_max);
+
+/**
+ * netif_get_num_default_rss_queues - default number of RSS queues
+ *
+ * Default value is the number of physical cores if there are only 1 or 2, or
+ * divided by 2 if there are more.
+ */
+int netif_get_num_default_rss_queues(void)
+{
+ cpumask_var_t cpus;
+ int cpu, count = 0;
+
+ if (unlikely(is_kdump_kernel() || !zalloc_cpumask_var(&cpus, GFP_KERNEL)))
+ return 1;
+
+ cpumask_copy(cpus, cpu_online_mask);
+ for_each_cpu(cpu, cpus) {
+ ++count;
+ cpumask_andnot(cpus, cpus, topology_sibling_cpumask(cpu));
}
+ free_cpumask_var(cpus);
+
+ return count > 2 ? DIV_ROUND_UP(count, 2) : count;
+}
+EXPORT_SYMBOL(netif_get_num_default_rss_queues);
+
+static void __netif_reschedule(struct Qdisc *q)
+{
+ struct softnet_data *sd;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ sd = this_cpu_ptr(&softnet_data);
+ q->next_sched = NULL;
+ *sd->output_queue_tailp = q;
+ sd->output_queue_tailp = &q->next_sched;
+ raise_softirq_irqoff(NET_TX_SOFTIRQ);
+ local_irq_restore(flags);
+}
+
+void __netif_schedule(struct Qdisc *q)
+{
+ /* If q->defer_list is not empty, at least one thread is
+ * in __dev_xmit_skb() before llist_del_all(&q->defer_list).
+ * This thread will attempt to run the queue.
+ */
+ if (!llist_empty(&q->defer_list))
+ return;
+
+ if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
+ __netif_reschedule(q);
}
EXPORT_SYMBOL(__netif_schedule);
-void __netif_rx_schedule(struct net_device *dev)
+struct dev_kfree_skb_cb {
+ enum skb_drop_reason reason;
+};
+
+static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
+{
+ return (struct dev_kfree_skb_cb *)skb->cb;
+}
+
+void netif_schedule_queue(struct netdev_queue *txq)
+{
+ rcu_read_lock();
+ if (!netif_xmit_stopped(txq)) {
+ struct Qdisc *q = rcu_dereference(txq->qdisc);
+
+ __netif_schedule(q);
+ }
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL(netif_schedule_queue);
+
+void netif_tx_wake_queue(struct netdev_queue *dev_queue)
+{
+ if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
+ struct Qdisc *q;
+
+ rcu_read_lock();
+ q = rcu_dereference(dev_queue->qdisc);
+ __netif_schedule(q);
+ rcu_read_unlock();
+ }
+}
+EXPORT_SYMBOL(netif_tx_wake_queue);
+
+void dev_kfree_skb_irq_reason(struct sk_buff *skb, enum skb_drop_reason reason)
{
unsigned long flags;
+ if (unlikely(!skb))
+ return;
+
+ if (likely(refcount_read(&skb->users) == 1)) {
+ smp_rmb();
+ refcount_set(&skb->users, 0);
+ } else if (likely(!refcount_dec_and_test(&skb->users))) {
+ return;
+ }
+ get_kfree_skb_cb(skb)->reason = reason;
local_irq_save(flags);
- dev_hold(dev);
- list_add_tail(&dev->poll_list, &__get_cpu_var(softnet_data).poll_list);
- if (dev->quota < 0)
- dev->quota += dev->weight;
- else
- dev->quota = dev->weight;
- __raise_softirq_irqoff(NET_RX_SOFTIRQ);
+ skb->next = __this_cpu_read(softnet_data.completion_queue);
+ __this_cpu_write(softnet_data.completion_queue, skb);
+ raise_softirq_irqoff(NET_TX_SOFTIRQ);
local_irq_restore(flags);
}
-EXPORT_SYMBOL(__netif_rx_schedule);
+EXPORT_SYMBOL(dev_kfree_skb_irq_reason);
-void dev_kfree_skb_any(struct sk_buff *skb)
+void dev_kfree_skb_any_reason(struct sk_buff *skb, enum skb_drop_reason reason)
{
- if (in_irq() || irqs_disabled())
- dev_kfree_skb_irq(skb);
+ if (in_hardirq() || irqs_disabled())
+ dev_kfree_skb_irq_reason(skb, reason);
else
- dev_kfree_skb(skb);
+ kfree_skb_reason(skb, reason);
}
-EXPORT_SYMBOL(dev_kfree_skb_any);
+EXPORT_SYMBOL(dev_kfree_skb_any_reason);
-/* Hot-plugging. */
+/**
+ * netif_device_detach - mark device as removed
+ * @dev: network device
+ *
+ * Mark device as removed from system and therefore no longer available.
+ */
void netif_device_detach(struct net_device *dev)
{
if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
netif_running(dev)) {
- netif_stop_queue(dev);
+ netif_tx_stop_all_queues(dev);
}
}
EXPORT_SYMBOL(netif_device_detach);
+/**
+ * netif_device_attach - mark device as attached
+ * @dev: network device
+ *
+ * Mark device as attached from system and restart if needed.
+ */
void netif_device_attach(struct net_device *dev)
{
if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
netif_running(dev)) {
- netif_wake_queue(dev);
- __netdev_watchdog_up(dev);
+ netif_tx_wake_all_queues(dev);
+ netdev_watchdog_up(dev);
}
}
EXPORT_SYMBOL(netif_device_attach);
+/*
+ * Returns a Tx hash based on the given packet descriptor a Tx queues' number
+ * to be used as a distribution range.
+ */
+static u16 skb_tx_hash(const struct net_device *dev,
+ const struct net_device *sb_dev,
+ struct sk_buff *skb)
+{
+ u32 hash;
+ u16 qoffset = 0;
+ u16 qcount = dev->real_num_tx_queues;
+
+ if (dev->num_tc) {
+ u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
+
+ qoffset = sb_dev->tc_to_txq[tc].offset;
+ qcount = sb_dev->tc_to_txq[tc].count;
+ if (unlikely(!qcount)) {
+ net_warn_ratelimited("%s: invalid qcount, qoffset %u for tc %u\n",
+ sb_dev->name, qoffset, tc);
+ qoffset = 0;
+ qcount = dev->real_num_tx_queues;
+ }
+ }
+
+ if (skb_rx_queue_recorded(skb)) {
+ DEBUG_NET_WARN_ON_ONCE(qcount == 0);
+ hash = skb_get_rx_queue(skb);
+ if (hash >= qoffset)
+ hash -= qoffset;
+ while (unlikely(hash >= qcount))
+ hash -= qcount;
+ return hash + qoffset;
+ }
+
+ return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
+}
+
+void skb_warn_bad_offload(const struct sk_buff *skb)
+{
+ static const netdev_features_t null_features;
+ struct net_device *dev = skb->dev;
+ const char *name = "";
+
+ if (!net_ratelimit())
+ return;
+
+ if (dev) {
+ if (dev->dev.parent)
+ name = dev_driver_string(dev->dev.parent);
+ else
+ name = netdev_name(dev);
+ }
+ skb_dump(KERN_WARNING, skb, false);
+ WARN(1, "%s: caps=(%pNF, %pNF)\n",
+ name, dev ? &dev->features : &null_features,
+ skb->sk ? &skb->sk->sk_route_caps : &null_features);
+}
/*
* Invalidate hardware checksum when packet is to be mangled, and
@@ -1170,480 +3549,2165 @@ EXPORT_SYMBOL(netif_device_attach);
int skb_checksum_help(struct sk_buff *skb)
{
__wsum csum;
- int ret = 0, offset = skb->h.raw - skb->data;
+ int ret = 0, offset;
if (skb->ip_summed == CHECKSUM_COMPLETE)
goto out_set_summed;
- if (unlikely(skb_shinfo(skb)->gso_size)) {
- /* Let GSO fix up the checksum. */
- goto out_set_summed;
+ if (unlikely(skb_is_gso(skb))) {
+ skb_warn_bad_offload(skb);
+ return -EINVAL;
}
- if (skb_cloned(skb)) {
- ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
+ if (!skb_frags_readable(skb)) {
+ return -EFAULT;
+ }
+
+ /* Before computing a checksum, we should make sure no frag could
+ * be modified by an external entity : checksum could be wrong.
+ */
+ if (skb_has_shared_frag(skb)) {
+ ret = __skb_linearize(skb);
if (ret)
goto out;
}
- BUG_ON(offset > (int)skb->len);
- csum = skb_checksum(skb, offset, skb->len-offset, 0);
-
- offset = skb->tail - skb->h.raw;
- BUG_ON(offset <= 0);
- BUG_ON(skb->csum_offset + 2 > offset);
+ offset = skb_checksum_start_offset(skb);
+ ret = -EINVAL;
+ if (unlikely(offset >= skb_headlen(skb))) {
+ DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false);
+ WARN_ONCE(true, "offset (%d) >= skb_headlen() (%u)\n",
+ offset, skb_headlen(skb));
+ goto out;
+ }
+ csum = skb_checksum(skb, offset, skb->len - offset, 0);
- *(__sum16*)(skb->h.raw + skb->csum_offset) = csum_fold(csum);
+ offset += skb->csum_offset;
+ if (unlikely(offset + sizeof(__sum16) > skb_headlen(skb))) {
+ DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false);
+ WARN_ONCE(true, "offset+2 (%zu) > skb_headlen() (%u)\n",
+ offset + sizeof(__sum16), skb_headlen(skb));
+ goto out;
+ }
+ ret = skb_ensure_writable(skb, offset + sizeof(__sum16));
+ if (ret)
+ goto out;
+ *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
out_set_summed:
skb->ip_summed = CHECKSUM_NONE;
-out:
+out:
return ret;
}
+EXPORT_SYMBOL(skb_checksum_help);
-/**
- * skb_gso_segment - Perform segmentation on skb.
- * @skb: buffer to segment
- * @features: features for the output path (see dev->features)
- *
- * This function segments the given skb and returns a list of segments.
- *
- * It may return NULL if the skb requires no segmentation. This is
- * only possible when GSO is used for verifying header integrity.
- */
-struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
+#ifdef CONFIG_NET_CRC32C
+int skb_crc32c_csum_help(struct sk_buff *skb)
{
- struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
- struct packet_type *ptype;
- __be16 type = skb->protocol;
- int err;
+ u32 crc;
+ int ret = 0, offset, start;
- BUG_ON(skb_shinfo(skb)->frag_list);
+ if (skb->ip_summed != CHECKSUM_PARTIAL)
+ goto out;
- skb->mac.raw = skb->data;
- skb->mac_len = skb->nh.raw - skb->data;
- __skb_pull(skb, skb->mac_len);
+ if (unlikely(skb_is_gso(skb)))
+ goto out;
- if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
- if (skb_header_cloned(skb) &&
- (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
- return ERR_PTR(err);
+ /* Before computing a checksum, we should make sure no frag could
+ * be modified by an external entity : checksum could be wrong.
+ */
+ if (unlikely(skb_has_shared_frag(skb))) {
+ ret = __skb_linearize(skb);
+ if (ret)
+ goto out;
}
-
- rcu_read_lock();
- list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type) & 15], list) {
- if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
- if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
- err = ptype->gso_send_check(skb);
- segs = ERR_PTR(err);
- if (err || skb_gso_ok(skb, features))
- break;
- __skb_push(skb, skb->data - skb->nh.raw);
- }
- segs = ptype->gso_segment(skb, features);
- break;
- }
+ start = skb_checksum_start_offset(skb);
+ offset = start + offsetof(struct sctphdr, checksum);
+ if (WARN_ON_ONCE(offset >= skb_headlen(skb))) {
+ ret = -EINVAL;
+ goto out;
}
- rcu_read_unlock();
- __skb_push(skb, skb->data - skb->mac.raw);
+ ret = skb_ensure_writable(skb, offset + sizeof(__le32));
+ if (ret)
+ goto out;
- return segs;
+ crc = ~skb_crc32c(skb, start, skb->len - start, ~0);
+ *(__le32 *)(skb->data + offset) = cpu_to_le32(crc);
+ skb_reset_csum_not_inet(skb);
+out:
+ return ret;
+}
+EXPORT_SYMBOL(skb_crc32c_csum_help);
+#endif /* CONFIG_NET_CRC32C */
+
+__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
+{
+ __be16 type = skb->protocol;
+
+ /* Tunnel gso handlers can set protocol to ethernet. */
+ if (type == htons(ETH_P_TEB)) {
+ struct ethhdr *eth;
+
+ if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
+ return 0;
+
+ eth = (struct ethhdr *)skb->data;
+ type = eth->h_proto;
+ }
+
+ return vlan_get_protocol_and_depth(skb, type, depth);
}
-EXPORT_SYMBOL(skb_gso_segment);
/* Take action when hardware reception checksum errors are detected. */
#ifdef CONFIG_BUG
-void netdev_rx_csum_fault(struct net_device *dev)
+static void do_netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
{
- if (net_ratelimit()) {
- printk(KERN_ERR "%s: hw csum failure.\n",
- dev ? dev->name : "<unknown>");
- dump_stack();
- }
+ netdev_err(dev, "hw csum failure\n");
+ skb_dump(KERN_ERR, skb, true);
+ dump_stack();
+}
+
+void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
+{
+ DO_ONCE_LITE(do_netdev_rx_csum_fault, dev, skb);
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
-/* Actually, we should eliminate this check as soon as we know, that:
- * 1. IOMMU is present and allows to map all the memory.
- * 2. No high memory really exists on this machine.
- */
-
-static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
+/* XXX: check that highmem exists at all on the given machine. */
+static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
int i;
- if (dev->features & NETIF_F_HIGHDMA)
- return 0;
-
- for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
- if (PageHighMem(skb_shinfo(skb)->frags[i].page))
- return 1;
+ if (!(dev->features & NETIF_F_HIGHDMA)) {
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+ struct page *page = skb_frag_page(frag);
+ if (page && PageHighMem(page))
+ return 1;
+ }
+ }
#endif
return 0;
}
-struct dev_gso_cb {
- void (*destructor)(struct sk_buff *skb);
-};
+/* If MPLS offload request, verify we are testing hardware MPLS features
+ * instead of standard features for the netdev.
+ */
+#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
+static netdev_features_t net_mpls_features(struct sk_buff *skb,
+ netdev_features_t features,
+ __be16 type)
+{
+ if (eth_p_mpls(type))
+ features &= skb->dev->mpls_features;
+
+ return features;
+}
+#else
+static netdev_features_t net_mpls_features(struct sk_buff *skb,
+ netdev_features_t features,
+ __be16 type)
+{
+ return features;
+}
+#endif
+
+static netdev_features_t harmonize_features(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ __be16 type;
+
+ type = skb_network_protocol(skb, NULL);
+ features = net_mpls_features(skb, features, type);
-#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
+ if (skb->ip_summed != CHECKSUM_NONE &&
+ !can_checksum_protocol(features, type)) {
+ features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
+ }
+ if (illegal_highdma(skb->dev, skb))
+ features &= ~NETIF_F_SG;
+
+ return features;
+}
-static void dev_gso_skb_destructor(struct sk_buff *skb)
+netdev_features_t passthru_features_check(struct sk_buff *skb,
+ struct net_device *dev,
+ netdev_features_t features)
{
- struct dev_gso_cb *cb;
+ return features;
+}
+EXPORT_SYMBOL(passthru_features_check);
- do {
- struct sk_buff *nskb = skb->next;
+static netdev_features_t dflt_features_check(struct sk_buff *skb,
+ struct net_device *dev,
+ netdev_features_t features)
+{
+ return vlan_features_check(skb, features);
+}
+
+static netdev_features_t gso_features_check(const struct sk_buff *skb,
+ struct net_device *dev,
+ netdev_features_t features)
+{
+ u16 gso_segs = skb_shinfo(skb)->gso_segs;
- skb->next = nskb->next;
- nskb->next = NULL;
- kfree_skb(nskb);
- } while (skb->next);
+ if (gso_segs > READ_ONCE(dev->gso_max_segs))
+ return features & ~NETIF_F_GSO_MASK;
- cb = DEV_GSO_CB(skb);
- if (cb->destructor)
- cb->destructor(skb);
+ if (unlikely(skb->len >= netif_get_gso_max_size(dev, skb)))
+ return features & ~NETIF_F_GSO_MASK;
+
+ if (!skb_shinfo(skb)->gso_type) {
+ skb_warn_bad_offload(skb);
+ return features & ~NETIF_F_GSO_MASK;
+ }
+
+ /* Support for GSO partial features requires software
+ * intervention before we can actually process the packets
+ * so we need to strip support for any partial features now
+ * and we can pull them back in after we have partially
+ * segmented the frame.
+ */
+ if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
+ features &= ~dev->gso_partial_features;
+
+ /* Make sure to clear the IPv4 ID mangling feature if the IPv4 header
+ * has the potential to be fragmented so that TSO does not generate
+ * segments with the same ID. For encapsulated packets, the ID mangling
+ * feature is guaranteed not to use the same ID for the outer IPv4
+ * headers of the generated segments if the headers have the potential
+ * to be fragmented, so there is no need to clear the IPv4 ID mangling
+ * feature (see the section about NETIF_F_TSO_MANGLEID in
+ * segmentation-offloads.rst).
+ */
+ if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
+ struct iphdr *iph = skb->encapsulation ?
+ inner_ip_hdr(skb) : ip_hdr(skb);
+
+ if (!(iph->frag_off & htons(IP_DF)))
+ features &= ~NETIF_F_TSO_MANGLEID;
+ }
+
+ /* NETIF_F_IPV6_CSUM does not support IPv6 extension headers,
+ * so neither does TSO that depends on it.
+ */
+ if (features & NETIF_F_IPV6_CSUM &&
+ (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6 ||
+ (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4 &&
+ vlan_get_protocol(skb) == htons(ETH_P_IPV6))) &&
+ skb_transport_header_was_set(skb) &&
+ skb_network_header_len(skb) != sizeof(struct ipv6hdr) &&
+ !ipv6_has_hopopt_jumbo(skb))
+ features &= ~(NETIF_F_IPV6_CSUM | NETIF_F_TSO6 | NETIF_F_GSO_UDP_L4);
+
+ return features;
}
-/**
- * dev_gso_segment - Perform emulated hardware segmentation on skb.
- * @skb: buffer to segment
- *
- * This function segments the given skb and stores the list of segments
- * in skb->next.
- */
-static int dev_gso_segment(struct sk_buff *skb)
+netdev_features_t netif_skb_features(struct sk_buff *skb)
{
struct net_device *dev = skb->dev;
- struct sk_buff *segs;
- int features = dev->features & ~(illegal_highdma(dev, skb) ?
- NETIF_F_SG : 0);
+ netdev_features_t features = dev->features;
- segs = skb_gso_segment(skb, features);
+ if (skb_is_gso(skb))
+ features = gso_features_check(skb, dev, features);
- /* Verifying header integrity only. */
- if (!segs)
+ /* If encapsulation offload request, verify we are testing
+ * hardware encapsulation features instead of standard
+ * features for the netdev
+ */
+ if (skb->encapsulation)
+ features &= dev->hw_enc_features;
+
+ if (skb_vlan_tagged(skb))
+ features = netdev_intersect_features(features,
+ dev->vlan_features |
+ NETIF_F_HW_VLAN_CTAG_TX |
+ NETIF_F_HW_VLAN_STAG_TX);
+
+ if (dev->netdev_ops->ndo_features_check)
+ features &= dev->netdev_ops->ndo_features_check(skb, dev,
+ features);
+ else
+ features &= dflt_features_check(skb, dev, features);
+
+ return harmonize_features(skb, features);
+}
+EXPORT_SYMBOL(netif_skb_features);
+
+static int xmit_one(struct sk_buff *skb, struct net_device *dev,
+ struct netdev_queue *txq, bool more)
+{
+ unsigned int len;
+ int rc;
+
+ if (dev_nit_active_rcu(dev))
+ dev_queue_xmit_nit(skb, dev);
+
+ len = skb->len;
+ trace_net_dev_start_xmit(skb, dev);
+ rc = netdev_start_xmit(skb, dev, txq, more);
+ trace_net_dev_xmit(skb, rc, dev, len);
+
+ return rc;
+}
+
+struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
+ struct netdev_queue *txq, int *ret)
+{
+ struct sk_buff *skb = first;
+ int rc = NETDEV_TX_OK;
+
+ while (skb) {
+ struct sk_buff *next = skb->next;
+
+ skb_mark_not_on_list(skb);
+ rc = xmit_one(skb, dev, txq, next != NULL);
+ if (unlikely(!dev_xmit_complete(rc))) {
+ skb->next = next;
+ goto out;
+ }
+
+ skb = next;
+ if (netif_tx_queue_stopped(txq) && skb) {
+ rc = NETDEV_TX_BUSY;
+ break;
+ }
+ }
+
+out:
+ *ret = rc;
+ return skb;
+}
+
+static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ if (skb_vlan_tag_present(skb) &&
+ !vlan_hw_offload_capable(features, skb->vlan_proto))
+ skb = __vlan_hwaccel_push_inside(skb);
+ return skb;
+}
+
+int skb_csum_hwoffload_help(struct sk_buff *skb,
+ const netdev_features_t features)
+{
+ if (unlikely(skb_csum_is_sctp(skb)))
+ return !!(features & NETIF_F_SCTP_CRC) ? 0 :
+ skb_crc32c_csum_help(skb);
+
+ if (features & NETIF_F_HW_CSUM)
return 0;
- if (unlikely(IS_ERR(segs)))
- return PTR_ERR(segs);
+ if (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) {
+ if (vlan_get_protocol(skb) == htons(ETH_P_IPV6) &&
+ skb_network_header_len(skb) != sizeof(struct ipv6hdr) &&
+ !ipv6_has_hopopt_jumbo(skb))
+ goto sw_checksum;
- skb->next = segs;
- DEV_GSO_CB(skb)->destructor = skb->destructor;
- skb->destructor = dev_gso_skb_destructor;
+ switch (skb->csum_offset) {
+ case offsetof(struct tcphdr, check):
+ case offsetof(struct udphdr, check):
+ return 0;
+ }
+ }
- return 0;
+sw_checksum:
+ return skb_checksum_help(skb);
}
+EXPORT_SYMBOL(skb_csum_hwoffload_help);
-int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
+/* Checks if this SKB belongs to an HW offloaded socket
+ * and whether any SW fallbacks are required based on dev.
+ * Check decrypted mark in case skb_orphan() cleared socket.
+ */
+static struct sk_buff *sk_validate_xmit_skb(struct sk_buff *skb,
+ struct net_device *dev)
{
- if (likely(!skb->next)) {
- if (netdev_nit)
- dev_queue_xmit_nit(skb, dev);
+#ifdef CONFIG_SOCK_VALIDATE_XMIT
+ struct sk_buff *(*sk_validate)(struct sock *sk, struct net_device *dev,
+ struct sk_buff *skb);
+ struct sock *sk = skb->sk;
+
+ sk_validate = NULL;
+ if (sk) {
+ if (sk_fullsock(sk))
+ sk_validate = sk->sk_validate_xmit_skb;
+ else if (sk_is_inet(sk) && sk->sk_state == TCP_TIME_WAIT)
+ sk_validate = inet_twsk(sk)->tw_validate_xmit_skb;
+ }
+
+ if (sk_validate) {
+ skb = sk_validate(sk, dev, skb);
+ } else if (unlikely(skb_is_decrypted(skb))) {
+ pr_warn_ratelimited("unencrypted skb with no associated socket - dropping\n");
+ kfree_skb(skb);
+ skb = NULL;
+ }
+#endif
- if (netif_needs_gso(dev, skb)) {
- if (unlikely(dev_gso_segment(skb)))
+ return skb;
+}
+
+static struct sk_buff *validate_xmit_unreadable_skb(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ struct skb_shared_info *shinfo;
+ struct net_iov *niov;
+
+ if (likely(skb_frags_readable(skb)))
+ goto out;
+
+ if (!dev->netmem_tx)
+ goto out_free;
+
+ shinfo = skb_shinfo(skb);
+
+ if (shinfo->nr_frags > 0) {
+ niov = netmem_to_net_iov(skb_frag_netmem(&shinfo->frags[0]));
+ if (net_is_devmem_iov(niov) &&
+ net_devmem_iov_binding(niov)->dev != dev)
+ goto out_free;
+ }
+
+out:
+ return skb;
+
+out_free:
+ kfree_skb(skb);
+ return NULL;
+}
+
+static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again)
+{
+ netdev_features_t features;
+
+ skb = validate_xmit_unreadable_skb(skb, dev);
+ if (unlikely(!skb))
+ goto out_null;
+
+ features = netif_skb_features(skb);
+ skb = validate_xmit_vlan(skb, features);
+ if (unlikely(!skb))
+ goto out_null;
+
+ skb = sk_validate_xmit_skb(skb, dev);
+ if (unlikely(!skb))
+ goto out_null;
+
+ if (netif_needs_gso(skb, features)) {
+ struct sk_buff *segs;
+
+ segs = skb_gso_segment(skb, features);
+ if (IS_ERR(segs)) {
+ goto out_kfree_skb;
+ } else if (segs) {
+ consume_skb(skb);
+ skb = segs;
+ }
+ } else {
+ if (skb_needs_linearize(skb, features) &&
+ __skb_linearize(skb))
+ goto out_kfree_skb;
+
+ /* If packet is not checksummed and device does not
+ * support checksumming for this protocol, complete
+ * checksumming here.
+ */
+ if (skb->ip_summed == CHECKSUM_PARTIAL) {
+ if (skb->encapsulation)
+ skb_set_inner_transport_header(skb,
+ skb_checksum_start_offset(skb));
+ else
+ skb_set_transport_header(skb,
+ skb_checksum_start_offset(skb));
+ if (skb_csum_hwoffload_help(skb, features))
goto out_kfree_skb;
- if (skb->next)
- goto gso;
}
+ }
+
+ skb = validate_xmit_xfrm(skb, features, again);
+
+ return skb;
+
+out_kfree_skb:
+ kfree_skb(skb);
+out_null:
+ dev_core_stats_tx_dropped_inc(dev);
+ return NULL;
+}
+
+struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again)
+{
+ struct sk_buff *next, *head = NULL, *tail;
+
+ for (; skb != NULL; skb = next) {
+ next = skb->next;
+ skb_mark_not_on_list(skb);
+
+ /* in case skb won't be segmented, point to itself */
+ skb->prev = skb;
+
+ skb = validate_xmit_skb(skb, dev, again);
+ if (!skb)
+ continue;
- return dev->hard_start_xmit(skb, dev);
+ if (!head)
+ head = skb;
+ else
+ tail->next = skb;
+ /* If skb was segmented, skb->prev points to
+ * the last segment. If not, it still contains skb.
+ */
+ tail = skb->prev;
+ }
+ return head;
+}
+EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
+
+static void qdisc_pkt_len_segs_init(struct sk_buff *skb)
+{
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+ u16 gso_segs;
+
+ qdisc_skb_cb(skb)->pkt_len = skb->len;
+ if (!shinfo->gso_size) {
+ qdisc_skb_cb(skb)->pkt_segs = 1;
+ return;
+ }
+
+ qdisc_skb_cb(skb)->pkt_segs = gso_segs = shinfo->gso_segs;
+
+ /* To get more precise estimation of bytes sent on wire,
+ * we add to pkt_len the headers size of all segments
+ */
+ if (skb_transport_header_was_set(skb)) {
+ unsigned int hdr_len;
+
+ /* mac layer + network layer */
+ if (!skb->encapsulation)
+ hdr_len = skb_transport_offset(skb);
+ else
+ hdr_len = skb_inner_transport_offset(skb);
+
+ /* + transport layer */
+ if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
+ const struct tcphdr *th;
+ struct tcphdr _tcphdr;
+
+ th = skb_header_pointer(skb, hdr_len,
+ sizeof(_tcphdr), &_tcphdr);
+ if (likely(th))
+ hdr_len += __tcp_hdrlen(th);
+ } else if (shinfo->gso_type & SKB_GSO_UDP_L4) {
+ struct udphdr _udphdr;
+
+ if (skb_header_pointer(skb, hdr_len,
+ sizeof(_udphdr), &_udphdr))
+ hdr_len += sizeof(struct udphdr);
+ }
+
+ if (unlikely(shinfo->gso_type & SKB_GSO_DODGY)) {
+ int payload = skb->len - hdr_len;
+
+ /* Malicious packet. */
+ if (payload <= 0)
+ return;
+ gso_segs = DIV_ROUND_UP(payload, shinfo->gso_size);
+ shinfo->gso_segs = gso_segs;
+ qdisc_skb_cb(skb)->pkt_segs = gso_segs;
+ }
+ qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
}
+}
+
+static int dev_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *q,
+ struct sk_buff **to_free,
+ struct netdev_queue *txq)
+{
+ int rc;
-gso:
+ rc = q->enqueue(skb, q, to_free) & NET_XMIT_MASK;
+ if (rc == NET_XMIT_SUCCESS)
+ trace_qdisc_enqueue(q, txq, skb);
+ return rc;
+}
+
+static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
+ struct net_device *dev,
+ struct netdev_queue *txq)
+{
+ struct sk_buff *next, *to_free = NULL, *to_free2 = NULL;
+ spinlock_t *root_lock = qdisc_lock(q);
+ struct llist_node *ll_list, *first_n;
+ unsigned long defer_count = 0;
+ int rc;
+
+ qdisc_calculate_pkt_len(skb, q);
+
+ tcf_set_drop_reason(skb, SKB_DROP_REASON_QDISC_DROP);
+
+ if (q->flags & TCQ_F_NOLOCK) {
+ if (q->flags & TCQ_F_CAN_BYPASS && nolock_qdisc_is_empty(q) &&
+ qdisc_run_begin(q)) {
+ /* Retest nolock_qdisc_is_empty() within the protection
+ * of q->seqlock to protect from racing with requeuing.
+ */
+ if (unlikely(!nolock_qdisc_is_empty(q))) {
+ rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
+ __qdisc_run(q);
+ to_free2 = qdisc_run_end(q);
+
+ goto free_skbs;
+ }
+
+ qdisc_bstats_cpu_update(q, skb);
+ if (sch_direct_xmit(skb, q, dev, txq, NULL, true) &&
+ !nolock_qdisc_is_empty(q))
+ __qdisc_run(q);
+
+ to_free2 = qdisc_run_end(q);
+ rc = NET_XMIT_SUCCESS;
+ goto free_skbs;
+ }
+
+ rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
+ to_free2 = qdisc_run(q);
+ goto free_skbs;
+ }
+
+ /* Open code llist_add(&skb->ll_node, &q->defer_list) + queue limit.
+ * In the try_cmpxchg() loop, we want to increment q->defer_count
+ * at most once to limit the number of skbs in defer_list.
+ * We perform the defer_count increment only if the list is not empty,
+ * because some arches have slow atomic_long_inc_return().
+ */
+ first_n = READ_ONCE(q->defer_list.first);
do {
- struct sk_buff *nskb = skb->next;
- int rc;
+ if (first_n && !defer_count) {
+ defer_count = atomic_long_inc_return(&q->defer_count);
+ if (unlikely(defer_count > READ_ONCE(q->limit))) {
+ kfree_skb_reason(skb, SKB_DROP_REASON_QDISC_DROP);
+ return NET_XMIT_DROP;
+ }
+ }
+ skb->ll_node.next = first_n;
+ } while (!try_cmpxchg(&q->defer_list.first, &first_n, &skb->ll_node));
- skb->next = nskb->next;
- nskb->next = NULL;
- rc = dev->hard_start_xmit(nskb, dev);
- if (unlikely(rc)) {
- nskb->next = skb->next;
- skb->next = nskb;
- return rc;
+ /* If defer_list was not empty, we know the cpu which queued
+ * the first skb will process the whole list for us.
+ */
+ if (first_n)
+ return NET_XMIT_SUCCESS;
+
+ spin_lock(root_lock);
+
+ ll_list = llist_del_all(&q->defer_list);
+ /* There is a small race because we clear defer_count not atomically
+ * with the prior llist_del_all(). This means defer_list could grow
+ * over q->limit.
+ */
+ atomic_long_set(&q->defer_count, 0);
+
+ ll_list = llist_reverse_order(ll_list);
+
+ if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
+ llist_for_each_entry_safe(skb, next, ll_list, ll_node)
+ __qdisc_drop(skb, &to_free);
+ rc = NET_XMIT_DROP;
+ goto unlock;
+ }
+ if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
+ !llist_next(ll_list) && qdisc_run_begin(q)) {
+ /*
+ * This is a work-conserving queue; there are no old skbs
+ * waiting to be sent out; and the qdisc is not running -
+ * xmit the skb directly.
+ */
+
+ DEBUG_NET_WARN_ON_ONCE(skb != llist_entry(ll_list,
+ struct sk_buff,
+ ll_node));
+ qdisc_bstats_update(q, skb);
+ if (sch_direct_xmit(skb, q, dev, txq, root_lock, true))
+ __qdisc_run(q);
+ to_free2 = qdisc_run_end(q);
+ rc = NET_XMIT_SUCCESS;
+ } else {
+ int count = 0;
+
+ llist_for_each_entry_safe(skb, next, ll_list, ll_node) {
+ prefetch(next);
+ prefetch(&next->priority);
+ skb_mark_not_on_list(skb);
+ rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
+ count++;
}
- if (unlikely(netif_queue_stopped(dev) && skb->next))
- return NETDEV_TX_BUSY;
- } while (skb->next);
-
- skb->destructor = DEV_GSO_CB(skb)->destructor;
+ to_free2 = qdisc_run(q);
+ if (count != 1)
+ rc = NET_XMIT_SUCCESS;
+ }
+unlock:
+ spin_unlock(root_lock);
-out_kfree_skb:
- kfree_skb(skb);
+free_skbs:
+ tcf_kfree_skb_list(to_free);
+ tcf_kfree_skb_list(to_free2);
+ return rc;
+}
+
+#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
+static void skb_update_prio(struct sk_buff *skb)
+{
+ const struct netprio_map *map;
+ const struct sock *sk;
+ unsigned int prioidx;
+
+ if (skb->priority)
+ return;
+ map = rcu_dereference_bh(skb->dev->priomap);
+ if (!map)
+ return;
+ sk = skb_to_full_sk(skb);
+ if (!sk)
+ return;
+
+ prioidx = sock_cgroup_prioidx(&sk->sk_cgrp_data);
+
+ if (prioidx < map->priomap_len)
+ skb->priority = map->priomap[prioidx];
+}
+#else
+#define skb_update_prio(skb)
+#endif
+
+/**
+ * dev_loopback_xmit - loop back @skb
+ * @net: network namespace this loopback is happening in
+ * @sk: sk needed to be a netfilter okfn
+ * @skb: buffer to transmit
+ */
+int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ skb_reset_mac_header(skb);
+ __skb_pull(skb, skb_network_offset(skb));
+ skb->pkt_type = PACKET_LOOPBACK;
+ if (skb->ip_summed == CHECKSUM_NONE)
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ DEBUG_NET_WARN_ON_ONCE(!skb_dst(skb));
+ skb_dst_force(skb);
+ netif_rx(skb);
return 0;
}
+EXPORT_SYMBOL(dev_loopback_xmit);
-#define HARD_TX_LOCK(dev, cpu) { \
- if ((dev->features & NETIF_F_LLTX) == 0) { \
- netif_tx_lock(dev); \
- } \
+#ifdef CONFIG_NET_EGRESS
+static struct netdev_queue *
+netdev_tx_queue_mapping(struct net_device *dev, struct sk_buff *skb)
+{
+ int qm = skb_get_queue_mapping(skb);
+
+ return netdev_get_tx_queue(dev, netdev_cap_txqueue(dev, qm));
+}
+
+#ifndef CONFIG_PREEMPT_RT
+static bool netdev_xmit_txqueue_skipped(void)
+{
+ return __this_cpu_read(softnet_data.xmit.skip_txqueue);
+}
+
+void netdev_xmit_skip_txqueue(bool skip)
+{
+ __this_cpu_write(softnet_data.xmit.skip_txqueue, skip);
+}
+EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue);
+
+#else
+static bool netdev_xmit_txqueue_skipped(void)
+{
+ return current->net_xmit.skip_txqueue;
}
-#define HARD_TX_UNLOCK(dev) { \
- if ((dev->features & NETIF_F_LLTX) == 0) { \
- netif_tx_unlock(dev); \
- } \
+void netdev_xmit_skip_txqueue(bool skip)
+{
+ current->net_xmit.skip_txqueue = skip;
+}
+EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue);
+#endif
+#endif /* CONFIG_NET_EGRESS */
+
+#ifdef CONFIG_NET_XGRESS
+static int tc_run(struct tcx_entry *entry, struct sk_buff *skb,
+ enum skb_drop_reason *drop_reason)
+{
+ int ret = TC_ACT_UNSPEC;
+#ifdef CONFIG_NET_CLS_ACT
+ struct mini_Qdisc *miniq = rcu_dereference_bh(entry->miniq);
+ struct tcf_result res;
+
+ if (!miniq)
+ return ret;
+
+ /* Global bypass */
+ if (!static_branch_likely(&tcf_sw_enabled_key))
+ return ret;
+
+ /* Block-wise bypass */
+ if (tcf_block_bypass_sw(miniq->block))
+ return ret;
+
+ tc_skb_cb(skb)->mru = 0;
+ qdisc_skb_cb(skb)->post_ct = false;
+ tcf_set_drop_reason(skb, *drop_reason);
+
+ mini_qdisc_bstats_cpu_update(miniq, skb);
+ ret = tcf_classify(skb, miniq->block, miniq->filter_list, &res, false);
+ /* Only tcf related quirks below. */
+ switch (ret) {
+ case TC_ACT_SHOT:
+ *drop_reason = tcf_get_drop_reason(skb);
+ mini_qdisc_qstats_cpu_drop(miniq);
+ break;
+ case TC_ACT_OK:
+ case TC_ACT_RECLASSIFY:
+ skb->tc_index = TC_H_MIN(res.classid);
+ break;
+ }
+#endif /* CONFIG_NET_CLS_ACT */
+ return ret;
+}
+
+static DEFINE_STATIC_KEY_FALSE(tcx_needed_key);
+
+void tcx_inc(void)
+{
+ static_branch_inc(&tcx_needed_key);
+}
+
+void tcx_dec(void)
+{
+ static_branch_dec(&tcx_needed_key);
+}
+
+static __always_inline enum tcx_action_base
+tcx_run(const struct bpf_mprog_entry *entry, struct sk_buff *skb,
+ const bool needs_mac)
+{
+ const struct bpf_mprog_fp *fp;
+ const struct bpf_prog *prog;
+ int ret = TCX_NEXT;
+
+ if (needs_mac)
+ __skb_push(skb, skb->mac_len);
+ bpf_mprog_foreach_prog(entry, fp, prog) {
+ bpf_compute_data_pointers(skb);
+ ret = bpf_prog_run(prog, skb);
+ if (ret != TCX_NEXT)
+ break;
+ }
+ if (needs_mac)
+ __skb_pull(skb, skb->mac_len);
+ return tcx_action_code(skb, ret);
+}
+
+static __always_inline struct sk_buff *
+sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
+ struct net_device *orig_dev, bool *another)
+{
+ struct bpf_mprog_entry *entry = rcu_dereference_bh(skb->dev->tcx_ingress);
+ enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_INGRESS;
+ struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
+ int sch_ret;
+
+ if (!entry)
+ return skb;
+
+ bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
+ if (unlikely(*pt_prev)) {
+ *ret = deliver_skb(skb, *pt_prev, orig_dev);
+ *pt_prev = NULL;
+ }
+
+ qdisc_pkt_len_segs_init(skb);
+ tcx_set_ingress(skb, true);
+
+ if (static_branch_unlikely(&tcx_needed_key)) {
+ sch_ret = tcx_run(entry, skb, true);
+ if (sch_ret != TC_ACT_UNSPEC)
+ goto ingress_verdict;
+ }
+ sch_ret = tc_run(tcx_entry(entry), skb, &drop_reason);
+ingress_verdict:
+ switch (sch_ret) {
+ case TC_ACT_REDIRECT:
+ /* skb_mac_header check was done by BPF, so we can safely
+ * push the L2 header back before redirecting to another
+ * netdev.
+ */
+ __skb_push(skb, skb->mac_len);
+ if (skb_do_redirect(skb) == -EAGAIN) {
+ __skb_pull(skb, skb->mac_len);
+ *another = true;
+ break;
+ }
+ *ret = NET_RX_SUCCESS;
+ bpf_net_ctx_clear(bpf_net_ctx);
+ return NULL;
+ case TC_ACT_SHOT:
+ kfree_skb_reason(skb, drop_reason);
+ *ret = NET_RX_DROP;
+ bpf_net_ctx_clear(bpf_net_ctx);
+ return NULL;
+ /* used by tc_run */
+ case TC_ACT_STOLEN:
+ case TC_ACT_QUEUED:
+ case TC_ACT_TRAP:
+ consume_skb(skb);
+ fallthrough;
+ case TC_ACT_CONSUMED:
+ *ret = NET_RX_SUCCESS;
+ bpf_net_ctx_clear(bpf_net_ctx);
+ return NULL;
+ }
+ bpf_net_ctx_clear(bpf_net_ctx);
+
+ return skb;
+}
+
+static __always_inline struct sk_buff *
+sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
+{
+ struct bpf_mprog_entry *entry = rcu_dereference_bh(dev->tcx_egress);
+ enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_EGRESS;
+ struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
+ int sch_ret;
+
+ if (!entry)
+ return skb;
+
+ bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
+
+ /* qdisc_skb_cb(skb)->pkt_len & tcx_set_ingress() was
+ * already set by the caller.
+ */
+ if (static_branch_unlikely(&tcx_needed_key)) {
+ sch_ret = tcx_run(entry, skb, false);
+ if (sch_ret != TC_ACT_UNSPEC)
+ goto egress_verdict;
+ }
+ sch_ret = tc_run(tcx_entry(entry), skb, &drop_reason);
+egress_verdict:
+ switch (sch_ret) {
+ case TC_ACT_REDIRECT:
+ /* No need to push/pop skb's mac_header here on egress! */
+ skb_do_redirect(skb);
+ *ret = NET_XMIT_SUCCESS;
+ bpf_net_ctx_clear(bpf_net_ctx);
+ return NULL;
+ case TC_ACT_SHOT:
+ kfree_skb_reason(skb, drop_reason);
+ *ret = NET_XMIT_DROP;
+ bpf_net_ctx_clear(bpf_net_ctx);
+ return NULL;
+ /* used by tc_run */
+ case TC_ACT_STOLEN:
+ case TC_ACT_QUEUED:
+ case TC_ACT_TRAP:
+ consume_skb(skb);
+ fallthrough;
+ case TC_ACT_CONSUMED:
+ *ret = NET_XMIT_SUCCESS;
+ bpf_net_ctx_clear(bpf_net_ctx);
+ return NULL;
+ }
+ bpf_net_ctx_clear(bpf_net_ctx);
+
+ return skb;
+}
+#else
+static __always_inline struct sk_buff *
+sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
+ struct net_device *orig_dev, bool *another)
+{
+ return skb;
+}
+
+static __always_inline struct sk_buff *
+sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
+{
+ return skb;
+}
+#endif /* CONFIG_NET_XGRESS */
+
+#ifdef CONFIG_XPS
+static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
+ struct xps_dev_maps *dev_maps, unsigned int tci)
+{
+ int tc = netdev_get_prio_tc_map(dev, skb->priority);
+ struct xps_map *map;
+ int queue_index = -1;
+
+ if (tc >= dev_maps->num_tc || tci >= dev_maps->nr_ids)
+ return queue_index;
+
+ tci *= dev_maps->num_tc;
+ tci += tc;
+
+ map = rcu_dereference(dev_maps->attr_map[tci]);
+ if (map) {
+ if (map->len == 1)
+ queue_index = map->queues[0];
+ else
+ queue_index = map->queues[reciprocal_scale(
+ skb_get_hash(skb), map->len)];
+ if (unlikely(queue_index >= dev->real_num_tx_queues))
+ queue_index = -1;
+ }
+ return queue_index;
+}
+#endif
+
+static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev,
+ struct sk_buff *skb)
+{
+#ifdef CONFIG_XPS
+ struct xps_dev_maps *dev_maps;
+ struct sock *sk = skb->sk;
+ int queue_index = -1;
+
+ if (!static_key_false(&xps_needed))
+ return -1;
+
+ rcu_read_lock();
+ if (!static_key_false(&xps_rxqs_needed))
+ goto get_cpus_map;
+
+ dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_RXQS]);
+ if (dev_maps) {
+ int tci = sk_rx_queue_get(sk);
+
+ if (tci >= 0)
+ queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
+ tci);
+ }
+
+get_cpus_map:
+ if (queue_index < 0) {
+ dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_CPUS]);
+ if (dev_maps) {
+ unsigned int tci = skb->sender_cpu - 1;
+
+ queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
+ tci);
+ }
+ }
+ rcu_read_unlock();
+
+ return queue_index;
+#else
+ return -1;
+#endif
+}
+
+u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
+ struct net_device *sb_dev)
+{
+ return 0;
+}
+EXPORT_SYMBOL(dev_pick_tx_zero);
+
+int sk_tx_queue_get(const struct sock *sk)
+{
+ int resel, val;
+
+ if (!sk)
+ return -1;
+ /* Paired with WRITE_ONCE() in sk_tx_queue_clear()
+ * and sk_tx_queue_set().
+ */
+ val = READ_ONCE(sk->sk_tx_queue_mapping);
+
+ if (val == NO_QUEUE_MAPPING)
+ return -1;
+
+ if (!sk_fullsock(sk))
+ return val;
+
+ resel = READ_ONCE(sock_net(sk)->core.sysctl_txq_reselection);
+ if (resel && time_is_before_jiffies(
+ READ_ONCE(sk->sk_tx_queue_mapping_jiffies) + resel))
+ return -1;
+
+ return val;
+}
+EXPORT_SYMBOL(sk_tx_queue_get);
+
+u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
+ struct net_device *sb_dev)
+{
+ struct sock *sk = skb->sk;
+ int queue_index = sk_tx_queue_get(sk);
+
+ sb_dev = sb_dev ? : dev;
+
+ if (queue_index < 0 || skb->ooo_okay ||
+ queue_index >= dev->real_num_tx_queues) {
+ int new_index = get_xps_queue(dev, sb_dev, skb);
+
+ if (new_index < 0)
+ new_index = skb_tx_hash(dev, sb_dev, skb);
+
+ if (sk && sk_fullsock(sk) &&
+ rcu_access_pointer(sk->sk_dst_cache))
+ sk_tx_queue_set(sk, new_index);
+
+ queue_index = new_index;
+ }
+
+ return queue_index;
+}
+EXPORT_SYMBOL(netdev_pick_tx);
+
+struct netdev_queue *netdev_core_pick_tx(struct net_device *dev,
+ struct sk_buff *skb,
+ struct net_device *sb_dev)
+{
+ int queue_index = 0;
+
+#ifdef CONFIG_XPS
+ u32 sender_cpu = skb->sender_cpu - 1;
+
+ if (sender_cpu >= (u32)NR_CPUS)
+ skb->sender_cpu = raw_smp_processor_id() + 1;
+#endif
+
+ if (dev->real_num_tx_queues != 1) {
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ if (ops->ndo_select_queue)
+ queue_index = ops->ndo_select_queue(dev, skb, sb_dev);
+ else
+ queue_index = netdev_pick_tx(dev, skb, sb_dev);
+
+ queue_index = netdev_cap_txqueue(dev, queue_index);
+ }
+
+ skb_set_queue_mapping(skb, queue_index);
+ return netdev_get_tx_queue(dev, queue_index);
}
/**
- * dev_queue_xmit - transmit a buffer
- * @skb: buffer to transmit
- *
- * Queue a buffer for transmission to a network device. The caller must
- * have set the device and priority and built the buffer before calling
- * this function. The function can be called from an interrupt.
+ * __dev_queue_xmit() - transmit a buffer
+ * @skb: buffer to transmit
+ * @sb_dev: suboordinate device used for L2 forwarding offload
*
- * A negative errno code is returned on a failure. A success does not
- * guarantee the frame will be transmitted as it may be dropped due
- * to congestion or traffic shaping.
+ * Queue a buffer for transmission to a network device. The caller must
+ * have set the device and priority and built the buffer before calling
+ * this function. The function can be called from an interrupt.
*
- * -----------------------------------------------------------------------------------
- * I notice this method can also return errors from the queue disciplines,
- * including NET_XMIT_DROP, which is a positive value. So, errors can also
- * be positive.
+ * When calling this method, interrupts MUST be enabled. This is because
+ * the BH enable code must have IRQs enabled so that it will not deadlock.
*
- * Regardless of the return value, the skb is consumed, so it is currently
- * difficult to retry a send to this method. (You can bump the ref count
- * before sending to hold a reference for retry if you are careful.)
+ * Regardless of the return value, the skb is consumed, so it is currently
+ * difficult to retry a send to this method. (You can bump the ref count
+ * before sending to hold a reference for retry if you are careful.)
*
- * When calling this method, interrupts MUST be enabled. This is because
- * the BH enable code must have IRQs enabled so that it will not deadlock.
- * --BLG
+ * Return:
+ * * 0 - buffer successfully transmitted
+ * * positive qdisc return code - NET_XMIT_DROP etc.
+ * * negative errno - other errors
*/
-
-int dev_queue_xmit(struct sk_buff *skb)
+int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
{
struct net_device *dev = skb->dev;
+ struct netdev_queue *txq = NULL;
struct Qdisc *q;
int rc = -ENOMEM;
+ bool again = false;
- /* GSO will handle the following emulations directly. */
- if (netif_needs_gso(dev, skb))
- goto gso;
+ skb_reset_mac_header(skb);
+ skb_assert_len(skb);
- if (skb_shinfo(skb)->frag_list &&
- !(dev->features & NETIF_F_FRAGLIST) &&
- __skb_linearize(skb))
- goto out_kfree_skb;
+ if (unlikely(skb_shinfo(skb)->tx_flags &
+ (SKBTX_SCHED_TSTAMP | SKBTX_BPF)))
+ __skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED);
- /* Fragmented skb is linearized if device does not support SG,
- * or if at least one of fragments is in highmem and device
- * does not support DMA from it.
+ /* Disable soft irqs for various locks below. Also
+ * stops preemption for RCU.
*/
- if (skb_shinfo(skb)->nr_frags &&
- (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) &&
- __skb_linearize(skb))
- goto out_kfree_skb;
+ rcu_read_lock_bh();
+
+ skb_update_prio(skb);
+
+ qdisc_pkt_len_segs_init(skb);
+ tcx_set_ingress(skb, false);
+#ifdef CONFIG_NET_EGRESS
+ if (static_branch_unlikely(&egress_needed_key)) {
+ if (nf_hook_egress_active()) {
+ skb = nf_hook_egress(skb, &rc, dev);
+ if (!skb)
+ goto out;
+ }
- /* If packet is not checksummed and device does not support
- * checksumming for this protocol, complete checksumming here.
- */
- if (skb->ip_summed == CHECKSUM_PARTIAL &&
- (!(dev->features & NETIF_F_GEN_CSUM) &&
- (!(dev->features & NETIF_F_IP_CSUM) ||
- skb->protocol != htons(ETH_P_IP))))
- if (skb_checksum_help(skb))
- goto out_kfree_skb;
+ netdev_xmit_skip_txqueue(false);
-gso:
- spin_lock_prefetch(&dev->queue_lock);
+ nf_skip_egress(skb, true);
+ skb = sch_handle_egress(skb, &rc, dev);
+ if (!skb)
+ goto out;
+ nf_skip_egress(skb, false);
- /* Disable soft irqs for various locks below. Also
- * stops preemption for RCU.
+ if (netdev_xmit_txqueue_skipped())
+ txq = netdev_tx_queue_mapping(dev, skb);
+ }
+#endif
+ /* If device/qdisc don't need skb->dst, release it right now while
+ * its hot in this cpu cache.
*/
- rcu_read_lock_bh();
+ if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
+ skb_dst_drop(skb);
+ else
+ skb_dst_force(skb);
- /* Updates of qdisc are serialized by queue_lock.
- * The struct Qdisc which is pointed to by qdisc is now a
- * rcu structure - it may be accessed without acquiring
- * a lock (but the structure may be stale.) The freeing of the
- * qdisc will be deferred until it's known that there are no
- * more references to it.
- *
- * If the qdisc has an enqueue function, we still need to
- * hold the queue_lock before calling it, since queue_lock
- * also serializes access to the device queue.
- */
+ if (!txq)
+ txq = netdev_core_pick_tx(dev, skb, sb_dev);
- q = rcu_dereference(dev->qdisc);
-#ifdef CONFIG_NET_CLS_ACT
- skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS);
-#endif
+ q = rcu_dereference_bh(txq->qdisc);
+
+ trace_net_dev_queue(skb);
if (q->enqueue) {
- /* Grab device queue */
- spin_lock(&dev->queue_lock);
- q = dev->qdisc;
- if (q->enqueue) {
- rc = q->enqueue(skb, q);
- qdisc_run(dev);
- spin_unlock(&dev->queue_lock);
-
- rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc;
- goto out;
- }
- spin_unlock(&dev->queue_lock);
+ rc = __dev_xmit_skb(skb, q, dev, txq);
+ goto out;
}
/* The device has no queue. Common case for software devices:
- loopback, all the sorts of tunnels...
+ * loopback, all the sorts of tunnels...
- Really, it is unlikely that netif_tx_lock protection is necessary
- here. (f.e. loopback and IP tunnels are clean ignoring statistics
- counters.)
- However, it is possible, that they rely on protection
- made by us here.
+ * Really, it is unlikely that netif_tx_lock protection is necessary
+ * here. (f.e. loopback and IP tunnels are clean ignoring statistics
+ * counters.)
+ * However, it is possible, that they rely on protection
+ * made by us here.
- Check this and shot the lock. It is not prone from deadlocks.
- Either shot noqueue qdisc, it is even simpler 8)
+ * Check this and shot the lock. It is not prone from deadlocks.
+ *Either shot noqueue qdisc, it is even simpler 8)
*/
if (dev->flags & IFF_UP) {
int cpu = smp_processor_id(); /* ok because BHs are off */
- if (dev->xmit_lock_owner != cpu) {
-
- HARD_TX_LOCK(dev, cpu);
-
- if (!netif_queue_stopped(dev)) {
- rc = 0;
- if (!dev_hard_start_xmit(skb, dev)) {
- HARD_TX_UNLOCK(dev);
+ /* Other cpus might concurrently change txq->xmit_lock_owner
+ * to -1 or to their cpu id, but not to our id.
+ */
+ if (READ_ONCE(txq->xmit_lock_owner) != cpu) {
+ if (dev_xmit_recursion())
+ goto recursion_alert;
+
+ skb = validate_xmit_skb(skb, dev, &again);
+ if (!skb)
+ goto out;
+
+ HARD_TX_LOCK(dev, txq, cpu);
+
+ if (!netif_xmit_stopped(txq)) {
+ dev_xmit_recursion_inc();
+ skb = dev_hard_start_xmit(skb, dev, txq, &rc);
+ dev_xmit_recursion_dec();
+ if (dev_xmit_complete(rc)) {
+ HARD_TX_UNLOCK(dev, txq);
goto out;
}
}
- HARD_TX_UNLOCK(dev);
- if (net_ratelimit())
- printk(KERN_CRIT "Virtual device %s asks to "
- "queue packet!\n", dev->name);
+ HARD_TX_UNLOCK(dev, txq);
+ net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
+ dev->name);
} else {
/* Recursion is detected! It is possible,
- * unfortunately */
- if (net_ratelimit())
- printk(KERN_CRIT "Dead loop on virtual device "
- "%s, fix it urgently!\n", dev->name);
+ * unfortunately
+ */
+recursion_alert:
+ net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
+ dev->name);
}
}
rc = -ENETDOWN;
rcu_read_unlock_bh();
-out_kfree_skb:
- kfree_skb(skb);
+ dev_core_stats_tx_dropped_inc(dev);
+ kfree_skb_list(skb);
return rc;
out:
rcu_read_unlock_bh();
return rc;
}
+EXPORT_SYMBOL(__dev_queue_xmit);
+int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
+{
+ struct net_device *dev = skb->dev;
+ struct sk_buff *orig_skb = skb;
+ struct netdev_queue *txq;
+ int ret = NETDEV_TX_BUSY;
+ bool again = false;
+
+ if (unlikely(!netif_running(dev) ||
+ !netif_carrier_ok(dev)))
+ goto drop;
+
+ skb = validate_xmit_skb_list(skb, dev, &again);
+ if (skb != orig_skb)
+ goto drop;
-/*=======================================================================
- Receiver routines
- =======================================================================*/
+ skb_set_queue_mapping(skb, queue_id);
+ txq = skb_get_tx_queue(dev, skb);
-int netdev_max_backlog = 1000;
-int netdev_budget = 300;
-int weight_p = 64; /* old backlog weight */
+ local_bh_disable();
-DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
+ dev_xmit_recursion_inc();
+ HARD_TX_LOCK(dev, txq, smp_processor_id());
+ if (!netif_xmit_frozen_or_drv_stopped(txq))
+ ret = netdev_start_xmit(skb, dev, txq, false);
+ HARD_TX_UNLOCK(dev, txq);
+ dev_xmit_recursion_dec();
+ local_bh_enable();
+ return ret;
+drop:
+ dev_core_stats_tx_dropped_inc(dev);
+ kfree_skb_list(skb);
+ return NET_XMIT_DROP;
+}
+EXPORT_SYMBOL(__dev_direct_xmit);
+
+/*************************************************************************
+ * Receiver routines
+ *************************************************************************/
+static DEFINE_PER_CPU(struct task_struct *, backlog_napi);
+
+int weight_p __read_mostly = 64; /* old backlog weight */
+int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */
+int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */
+
+/* Called with irq disabled */
+static inline void ____napi_schedule(struct softnet_data *sd,
+ struct napi_struct *napi)
+{
+ struct task_struct *thread;
+
+ lockdep_assert_irqs_disabled();
+
+ if (test_bit(NAPI_STATE_THREADED, &napi->state)) {
+ /* Paired with smp_mb__before_atomic() in
+ * napi_enable()/netif_set_threaded().
+ * Use READ_ONCE() to guarantee a complete
+ * read on napi->thread. Only call
+ * wake_up_process() when it's not NULL.
+ */
+ thread = READ_ONCE(napi->thread);
+ if (thread) {
+ if (use_backlog_threads() && thread == raw_cpu_read(backlog_napi))
+ goto use_local_napi;
+
+ set_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
+ wake_up_process(thread);
+ return;
+ }
+ }
+
+use_local_napi:
+ DEBUG_NET_WARN_ON_ONCE(!list_empty(&napi->poll_list));
+ list_add_tail(&napi->poll_list, &sd->poll_list);
+ WRITE_ONCE(napi->list_owner, smp_processor_id());
+ /* If not called from net_rx_action()
+ * we have to raise NET_RX_SOFTIRQ.
+ */
+ if (!sd->in_net_rx_action)
+ raise_softirq_irqoff(NET_RX_SOFTIRQ);
+}
+#ifdef CONFIG_RPS
+
+struct static_key_false rps_needed __read_mostly;
+EXPORT_SYMBOL(rps_needed);
+struct static_key_false rfs_needed __read_mostly;
+EXPORT_SYMBOL(rfs_needed);
+
+static u32 rfs_slot(u32 hash, const struct rps_dev_flow_table *flow_table)
+{
+ return hash_32(hash, flow_table->log);
+}
+
+#ifdef CONFIG_RFS_ACCEL
/**
- * netif_rx - post buffer to the network code
- * @skb: buffer to post
+ * rps_flow_is_active - check whether the flow is recently active.
+ * @rflow: Specific flow to check activity.
+ * @flow_table: per-queue flowtable that @rflow belongs to.
+ * @cpu: CPU saved in @rflow.
*
- * This function receives a packet from a device driver and queues it for
- * the upper (protocol) levels to process. It always succeeds. The buffer
- * may be dropped during processing for congestion control or by the
- * protocol layers.
+ * If the CPU has processed many packets since the flow's last activity
+ * (beyond 10 times the table size), the flow is considered stale.
*
- * return values:
- * NET_RX_SUCCESS (no congestion)
- * NET_RX_CN_LOW (low congestion)
- * NET_RX_CN_MOD (moderate congestion)
- * NET_RX_CN_HIGH (high congestion)
- * NET_RX_DROP (packet was dropped)
+ * Return: true if flow was recently active.
+ */
+static bool rps_flow_is_active(struct rps_dev_flow *rflow,
+ struct rps_dev_flow_table *flow_table,
+ unsigned int cpu)
+{
+ unsigned int flow_last_active;
+ unsigned int sd_input_head;
+
+ if (cpu >= nr_cpu_ids)
+ return false;
+
+ sd_input_head = READ_ONCE(per_cpu(softnet_data, cpu).input_queue_head);
+ flow_last_active = READ_ONCE(rflow->last_qtail);
+
+ return (int)(sd_input_head - flow_last_active) <
+ (int)(10 << flow_table->log);
+}
+#endif
+
+static struct rps_dev_flow *
+set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
+ struct rps_dev_flow *rflow, u16 next_cpu, u32 hash,
+ u32 flow_id)
+{
+ if (next_cpu < nr_cpu_ids) {
+ u32 head;
+#ifdef CONFIG_RFS_ACCEL
+ struct netdev_rx_queue *rxqueue;
+ struct rps_dev_flow_table *flow_table;
+ struct rps_dev_flow *old_rflow;
+ struct rps_dev_flow *tmp_rflow;
+ unsigned int tmp_cpu;
+ u16 rxq_index;
+ int rc;
+
+ /* Should we steer this flow to a different hardware queue? */
+ if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
+ !(dev->features & NETIF_F_NTUPLE))
+ goto out;
+ rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
+ if (rxq_index == skb_get_rx_queue(skb))
+ goto out;
+
+ rxqueue = dev->_rx + rxq_index;
+ flow_table = rcu_dereference(rxqueue->rps_flow_table);
+ if (!flow_table)
+ goto out;
+
+ tmp_rflow = &flow_table->flows[flow_id];
+ tmp_cpu = READ_ONCE(tmp_rflow->cpu);
+
+ if (READ_ONCE(tmp_rflow->filter) != RPS_NO_FILTER) {
+ if (rps_flow_is_active(tmp_rflow, flow_table,
+ tmp_cpu)) {
+ if (hash != READ_ONCE(tmp_rflow->hash) ||
+ next_cpu == tmp_cpu)
+ goto out;
+ }
+ }
+
+ rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
+ rxq_index, flow_id);
+ if (rc < 0)
+ goto out;
+
+ old_rflow = rflow;
+ rflow = tmp_rflow;
+ WRITE_ONCE(rflow->filter, rc);
+ WRITE_ONCE(rflow->hash, hash);
+
+ if (old_rflow->filter == rc)
+ WRITE_ONCE(old_rflow->filter, RPS_NO_FILTER);
+ out:
+#endif
+ head = READ_ONCE(per_cpu(softnet_data, next_cpu).input_queue_head);
+ rps_input_queue_tail_save(&rflow->last_qtail, head);
+ }
+
+ WRITE_ONCE(rflow->cpu, next_cpu);
+ return rflow;
+}
+
+/*
+ * get_rps_cpu is called from netif_receive_skb and returns the target
+ * CPU from the RPS map of the receiving queue for a given skb.
+ * rcu_read_lock must be held on entry.
+ */
+static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
+ struct rps_dev_flow **rflowp)
+{
+ const struct rps_sock_flow_table *sock_flow_table;
+ struct netdev_rx_queue *rxqueue = dev->_rx;
+ struct rps_dev_flow_table *flow_table;
+ struct rps_map *map;
+ int cpu = -1;
+ u32 flow_id;
+ u32 tcpu;
+ u32 hash;
+
+ if (skb_rx_queue_recorded(skb)) {
+ u16 index = skb_get_rx_queue(skb);
+
+ if (unlikely(index >= dev->real_num_rx_queues)) {
+ WARN_ONCE(dev->real_num_rx_queues > 1,
+ "%s received packet on queue %u, but number "
+ "of RX queues is %u\n",
+ dev->name, index, dev->real_num_rx_queues);
+ goto done;
+ }
+ rxqueue += index;
+ }
+
+ /* Avoid computing hash if RFS/RPS is not active for this rxqueue */
+
+ flow_table = rcu_dereference(rxqueue->rps_flow_table);
+ map = rcu_dereference(rxqueue->rps_map);
+ if (!flow_table && !map)
+ goto done;
+
+ skb_reset_network_header(skb);
+ hash = skb_get_hash(skb);
+ if (!hash)
+ goto done;
+
+ sock_flow_table = rcu_dereference(net_hotdata.rps_sock_flow_table);
+ if (flow_table && sock_flow_table) {
+ struct rps_dev_flow *rflow;
+ u32 next_cpu;
+ u32 ident;
+
+ /* First check into global flow table if there is a match.
+ * This READ_ONCE() pairs with WRITE_ONCE() from rps_record_sock_flow().
+ */
+ ident = READ_ONCE(sock_flow_table->ents[hash & sock_flow_table->mask]);
+ if ((ident ^ hash) & ~net_hotdata.rps_cpu_mask)
+ goto try_rps;
+
+ next_cpu = ident & net_hotdata.rps_cpu_mask;
+
+ /* OK, now we know there is a match,
+ * we can look at the local (per receive queue) flow table
+ */
+ flow_id = rfs_slot(hash, flow_table);
+ rflow = &flow_table->flows[flow_id];
+ tcpu = rflow->cpu;
+
+ /*
+ * If the desired CPU (where last recvmsg was done) is
+ * different from current CPU (one in the rx-queue flow
+ * table entry), switch if one of the following holds:
+ * - Current CPU is unset (>= nr_cpu_ids).
+ * - Current CPU is offline.
+ * - The current CPU's queue tail has advanced beyond the
+ * last packet that was enqueued using this table entry.
+ * This guarantees that all previous packets for the flow
+ * have been dequeued, thus preserving in order delivery.
+ */
+ if (unlikely(tcpu != next_cpu) &&
+ (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
+ ((int)(READ_ONCE(per_cpu(softnet_data, tcpu).input_queue_head) -
+ rflow->last_qtail)) >= 0)) {
+ tcpu = next_cpu;
+ rflow = set_rps_cpu(dev, skb, rflow, next_cpu, hash,
+ flow_id);
+ }
+
+ if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
+ *rflowp = rflow;
+ cpu = tcpu;
+ goto done;
+ }
+ }
+
+try_rps:
+
+ if (map) {
+ tcpu = map->cpus[reciprocal_scale(hash, map->len)];
+ if (cpu_online(tcpu)) {
+ cpu = tcpu;
+ goto done;
+ }
+ }
+
+done:
+ return cpu;
+}
+
+#ifdef CONFIG_RFS_ACCEL
+
+/**
+ * rps_may_expire_flow - check whether an RFS hardware filter may be removed
+ * @dev: Device on which the filter was set
+ * @rxq_index: RX queue index
+ * @flow_id: Flow ID passed to ndo_rx_flow_steer()
+ * @filter_id: Filter ID returned by ndo_rx_flow_steer()
+ *
+ * Drivers that implement ndo_rx_flow_steer() should periodically call
+ * this function for each installed filter and remove the filters for
+ * which it returns %true.
+ */
+bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
+ u32 flow_id, u16 filter_id)
+{
+ struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
+ struct rps_dev_flow_table *flow_table;
+ struct rps_dev_flow *rflow;
+ bool expire = true;
+
+ rcu_read_lock();
+ flow_table = rcu_dereference(rxqueue->rps_flow_table);
+ if (flow_table && flow_id < (1UL << flow_table->log)) {
+ unsigned int cpu;
+
+ rflow = &flow_table->flows[flow_id];
+ cpu = READ_ONCE(rflow->cpu);
+ if (READ_ONCE(rflow->filter) == filter_id &&
+ rps_flow_is_active(rflow, flow_table, cpu))
+ expire = false;
+ }
+ rcu_read_unlock();
+ return expire;
+}
+EXPORT_SYMBOL(rps_may_expire_flow);
+
+#endif /* CONFIG_RFS_ACCEL */
+
+/* Called from hardirq (IPI) context */
+static void rps_trigger_softirq(void *data)
+{
+ struct softnet_data *sd = data;
+
+ ____napi_schedule(sd, &sd->backlog);
+ /* Pairs with READ_ONCE() in softnet_seq_show() */
+ WRITE_ONCE(sd->received_rps, sd->received_rps + 1);
+}
+
+#endif /* CONFIG_RPS */
+
+/* Called from hardirq (IPI) context */
+static void trigger_rx_softirq(void *data)
+{
+ struct softnet_data *sd = data;
+
+ __raise_softirq_irqoff(NET_RX_SOFTIRQ);
+ smp_store_release(&sd->defer_ipi_scheduled, 0);
+}
+
+/*
+ * After we queued a packet into sd->input_pkt_queue,
+ * we need to make sure this queue is serviced soon.
+ *
+ * - If this is another cpu queue, link it to our rps_ipi_list,
+ * and make sure we will process rps_ipi_list from net_rx_action().
*
+ * - If this is our own queue, NAPI schedule our backlog.
+ * Note that this also raises NET_RX_SOFTIRQ.
*/
+static void napi_schedule_rps(struct softnet_data *sd)
+{
+ struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
-int netif_rx(struct sk_buff *skb)
+#ifdef CONFIG_RPS
+ if (sd != mysd) {
+ if (use_backlog_threads()) {
+ __napi_schedule_irqoff(&sd->backlog);
+ return;
+ }
+
+ sd->rps_ipi_next = mysd->rps_ipi_list;
+ mysd->rps_ipi_list = sd;
+
+ /* If not called from net_rx_action() or napi_threaded_poll()
+ * we have to raise NET_RX_SOFTIRQ.
+ */
+ if (!mysd->in_net_rx_action && !mysd->in_napi_threaded_poll)
+ __raise_softirq_irqoff(NET_RX_SOFTIRQ);
+ return;
+ }
+#endif /* CONFIG_RPS */
+ __napi_schedule_irqoff(&mysd->backlog);
+}
+
+void kick_defer_list_purge(unsigned int cpu)
{
- struct softnet_data *queue;
+ struct softnet_data *sd = &per_cpu(softnet_data, cpu);
unsigned long flags;
- /* if netpoll wants it, pretend we never saw it */
- if (netpoll_rx(skb))
- return NET_RX_DROP;
+ if (use_backlog_threads()) {
+ backlog_lock_irq_save(sd, &flags);
- if (!skb->tstamp.off_sec)
- net_timestamp(skb);
+ if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state))
+ __napi_schedule_irqoff(&sd->backlog);
- /*
- * The code is rearranged so that the path is the most
- * short when CPU is congested, but is still operating.
- */
- local_irq_save(flags);
- queue = &__get_cpu_var(softnet_data);
+ backlog_unlock_irq_restore(sd, &flags);
- __get_cpu_var(netdev_rx_stat).total++;
- if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
- if (queue->input_pkt_queue.qlen) {
-enqueue:
- dev_hold(skb->dev);
- __skb_queue_tail(&queue->input_pkt_queue, skb);
- local_irq_restore(flags);
- return NET_RX_SUCCESS;
+ } else if (!cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) {
+ smp_call_function_single_async(cpu, &sd->defer_csd);
+ }
+}
+
+#ifdef CONFIG_NET_FLOW_LIMIT
+int netdev_flow_limit_table_len __read_mostly = (1 << 12);
+#endif
+
+static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen,
+ int max_backlog)
+{
+#ifdef CONFIG_NET_FLOW_LIMIT
+ unsigned int old_flow, new_flow;
+ const struct softnet_data *sd;
+ struct sd_flow_limit *fl;
+
+ if (likely(qlen < (max_backlog >> 1)))
+ return false;
+
+ sd = this_cpu_ptr(&softnet_data);
+
+ rcu_read_lock();
+ fl = rcu_dereference(sd->flow_limit);
+ if (fl) {
+ new_flow = hash_32(skb_get_hash(skb), fl->log_buckets);
+ old_flow = fl->history[fl->history_head];
+ fl->history[fl->history_head] = new_flow;
+
+ fl->history_head++;
+ fl->history_head &= FLOW_LIMIT_HISTORY - 1;
+
+ if (likely(fl->buckets[old_flow]))
+ fl->buckets[old_flow]--;
+
+ if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
+ /* Pairs with READ_ONCE() in softnet_seq_show() */
+ WRITE_ONCE(fl->count, fl->count + 1);
+ rcu_read_unlock();
+ return true;
}
+ }
+ rcu_read_unlock();
+#endif
+ return false;
+}
+
+/*
+ * enqueue_to_backlog is called to queue an skb to a per CPU backlog
+ * queue (may be a remote CPU queue).
+ */
+static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
+ unsigned int *qtail)
+{
+ enum skb_drop_reason reason;
+ struct softnet_data *sd;
+ unsigned long flags;
+ unsigned int qlen;
+ int max_backlog;
+ u32 tail;
+
+ reason = SKB_DROP_REASON_DEV_READY;
+ if (unlikely(!netif_running(skb->dev)))
+ goto bad_dev;
- netif_rx_schedule(&queue->backlog_dev);
- goto enqueue;
+ sd = &per_cpu(softnet_data, cpu);
+
+ qlen = skb_queue_len_lockless(&sd->input_pkt_queue);
+ max_backlog = READ_ONCE(net_hotdata.max_backlog);
+ if (unlikely(qlen > max_backlog) ||
+ skb_flow_limit(skb, qlen, max_backlog))
+ goto cpu_backlog_drop;
+ backlog_lock_irq_save(sd, &flags);
+ qlen = skb_queue_len(&sd->input_pkt_queue);
+ if (likely(qlen <= max_backlog)) {
+ if (!qlen) {
+ /* Schedule NAPI for backlog device. We can use
+ * non atomic operation as we own the queue lock.
+ */
+ if (!__test_and_set_bit(NAPI_STATE_SCHED,
+ &sd->backlog.state))
+ napi_schedule_rps(sd);
+ }
+ __skb_queue_tail(&sd->input_pkt_queue, skb);
+ tail = rps_input_queue_tail_incr(sd);
+ backlog_unlock_irq_restore(sd, &flags);
+
+ /* save the tail outside of the critical section */
+ rps_input_queue_tail_save(qtail, tail);
+ return NET_RX_SUCCESS;
}
- __get_cpu_var(netdev_rx_stat).dropped++;
- local_irq_restore(flags);
+ backlog_unlock_irq_restore(sd, &flags);
- kfree_skb(skb);
+cpu_backlog_drop:
+ reason = SKB_DROP_REASON_CPU_BACKLOG;
+ numa_drop_add(&sd->drop_counters, 1);
+bad_dev:
+ dev_core_stats_rx_dropped_inc(skb->dev);
+ kfree_skb_reason(skb, reason);
return NET_RX_DROP;
}
-int netif_rx_ni(struct sk_buff *skb)
+static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb)
{
- int err;
+ struct net_device *dev = skb->dev;
+ struct netdev_rx_queue *rxqueue;
- preempt_disable();
- err = netif_rx(skb);
- if (local_softirq_pending())
- do_softirq();
- preempt_enable();
+ rxqueue = dev->_rx;
- return err;
+ if (skb_rx_queue_recorded(skb)) {
+ u16 index = skb_get_rx_queue(skb);
+
+ if (unlikely(index >= dev->real_num_rx_queues)) {
+ WARN_ONCE(dev->real_num_rx_queues > 1,
+ "%s received packet on queue %u, but number "
+ "of RX queues is %u\n",
+ dev->name, index, dev->real_num_rx_queues);
+
+ return rxqueue; /* Return first rxqueue */
+ }
+ rxqueue += index;
+ }
+ return rxqueue;
}
-EXPORT_SYMBOL(netif_rx_ni);
+u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
+ const struct bpf_prog *xdp_prog)
+{
+ void *orig_data, *orig_data_end, *hard_start;
+ struct netdev_rx_queue *rxqueue;
+ bool orig_bcast, orig_host;
+ u32 mac_len, frame_sz;
+ __be16 orig_eth_type;
+ struct ethhdr *eth;
+ u32 metalen, act;
+ int off;
+
+ /* The XDP program wants to see the packet starting at the MAC
+ * header.
+ */
+ mac_len = skb->data - skb_mac_header(skb);
+ hard_start = skb->data - skb_headroom(skb);
+
+ /* SKB "head" area always have tailroom for skb_shared_info */
+ frame_sz = (void *)skb_end_pointer(skb) - hard_start;
+ frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+
+ rxqueue = netif_get_rxqueue(skb);
+ xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq);
+ xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len,
+ skb_headlen(skb) + mac_len, true);
+ if (skb_is_nonlinear(skb)) {
+ skb_shinfo(skb)->xdp_frags_size = skb->data_len;
+ xdp_buff_set_frags_flag(xdp);
+ } else {
+ xdp_buff_clear_frags_flag(xdp);
+ }
+
+ orig_data_end = xdp->data_end;
+ orig_data = xdp->data;
+ eth = (struct ethhdr *)xdp->data;
+ orig_host = ether_addr_equal_64bits(eth->h_dest, skb->dev->dev_addr);
+ orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest);
+ orig_eth_type = eth->h_proto;
+
+ act = bpf_prog_run_xdp(xdp_prog, xdp);
+
+ /* check if bpf_xdp_adjust_head was used */
+ off = xdp->data - orig_data;
+ if (off) {
+ if (off > 0)
+ __skb_pull(skb, off);
+ else if (off < 0)
+ __skb_push(skb, -off);
+
+ skb->mac_header += off;
+ skb_reset_network_header(skb);
+ }
+
+ /* check if bpf_xdp_adjust_tail was used */
+ off = xdp->data_end - orig_data_end;
+ if (off != 0) {
+ skb_set_tail_pointer(skb, xdp->data_end - xdp->data);
+ skb->len += off; /* positive on grow, negative on shrink */
+ }
+
+ /* XDP frag metadata (e.g. nr_frags) are updated in eBPF helpers
+ * (e.g. bpf_xdp_adjust_tail), we need to update data_len here.
+ */
+ if (xdp_buff_has_frags(xdp))
+ skb->data_len = skb_shinfo(skb)->xdp_frags_size;
+ else
+ skb->data_len = 0;
+
+ /* check if XDP changed eth hdr such SKB needs update */
+ eth = (struct ethhdr *)xdp->data;
+ if ((orig_eth_type != eth->h_proto) ||
+ (orig_host != ether_addr_equal_64bits(eth->h_dest,
+ skb->dev->dev_addr)) ||
+ (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) {
+ __skb_push(skb, ETH_HLEN);
+ skb->pkt_type = PACKET_HOST;
+ skb->protocol = eth_type_trans(skb, skb->dev);
+ }
+
+ /* Redirect/Tx gives L2 packet, code that will reuse skb must __skb_pull
+ * before calling us again on redirect path. We do not call do_redirect
+ * as we leave that up to the caller.
+ *
+ * Caller is responsible for managing lifetime of skb (i.e. calling
+ * kfree_skb in response to actions it cannot handle/XDP_DROP).
+ */
+ switch (act) {
+ case XDP_REDIRECT:
+ case XDP_TX:
+ __skb_push(skb, mac_len);
+ break;
+ case XDP_PASS:
+ metalen = xdp->data - xdp->data_meta;
+ if (metalen)
+ skb_metadata_set(skb, metalen);
+ break;
+ }
+
+ return act;
+}
+
+static int
+netif_skb_check_for_xdp(struct sk_buff **pskb, const struct bpf_prog *prog)
+{
+ struct sk_buff *skb = *pskb;
+ int err, hroom, troom;
-static inline struct net_device *skb_bond(struct sk_buff *skb)
+ local_lock_nested_bh(&system_page_pool.bh_lock);
+ err = skb_cow_data_for_xdp(this_cpu_read(system_page_pool.pool), pskb, prog);
+ local_unlock_nested_bh(&system_page_pool.bh_lock);
+ if (!err)
+ return 0;
+
+ /* In case we have to go down the path and also linearize,
+ * then lets do the pskb_expand_head() work just once here.
+ */
+ hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
+ troom = skb->tail + skb->data_len - skb->end;
+ err = pskb_expand_head(skb,
+ hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
+ troom > 0 ? troom + 128 : 0, GFP_ATOMIC);
+ if (err)
+ return err;
+
+ return skb_linearize(skb);
+}
+
+static u32 netif_receive_generic_xdp(struct sk_buff **pskb,
+ struct xdp_buff *xdp,
+ const struct bpf_prog *xdp_prog)
+{
+ struct sk_buff *skb = *pskb;
+ u32 mac_len, act = XDP_DROP;
+
+ /* Reinjected packets coming from act_mirred or similar should
+ * not get XDP generic processing.
+ */
+ if (skb_is_redirected(skb))
+ return XDP_PASS;
+
+ /* XDP packets must have sufficient headroom of XDP_PACKET_HEADROOM
+ * bytes. This is the guarantee that also native XDP provides,
+ * thus we need to do it here as well.
+ */
+ mac_len = skb->data - skb_mac_header(skb);
+ __skb_push(skb, mac_len);
+
+ if (skb_cloned(skb) || skb_is_nonlinear(skb) ||
+ skb_headroom(skb) < XDP_PACKET_HEADROOM) {
+ if (netif_skb_check_for_xdp(pskb, xdp_prog))
+ goto do_drop;
+ }
+
+ __skb_pull(*pskb, mac_len);
+
+ act = bpf_prog_run_generic_xdp(*pskb, xdp, xdp_prog);
+ switch (act) {
+ case XDP_REDIRECT:
+ case XDP_TX:
+ case XDP_PASS:
+ break;
+ default:
+ bpf_warn_invalid_xdp_action((*pskb)->dev, xdp_prog, act);
+ fallthrough;
+ case XDP_ABORTED:
+ trace_xdp_exception((*pskb)->dev, xdp_prog, act);
+ fallthrough;
+ case XDP_DROP:
+ do_drop:
+ kfree_skb(*pskb);
+ break;
+ }
+
+ return act;
+}
+
+/* When doing generic XDP we have to bypass the qdisc layer and the
+ * network taps in order to match in-driver-XDP behavior. This also means
+ * that XDP packets are able to starve other packets going through a qdisc,
+ * and DDOS attacks will be more effective. In-driver-XDP use dedicated TX
+ * queues, so they do not have this starvation issue.
+ */
+void generic_xdp_tx(struct sk_buff *skb, const struct bpf_prog *xdp_prog)
{
struct net_device *dev = skb->dev;
+ struct netdev_queue *txq;
+ bool free_skb = true;
+ int cpu, rc;
- if (dev->master) {
- if (skb_bond_should_drop(skb)) {
- kfree_skb(skb);
- return NULL;
+ txq = netdev_core_pick_tx(dev, skb, NULL);
+ cpu = smp_processor_id();
+ HARD_TX_LOCK(dev, txq, cpu);
+ if (!netif_xmit_frozen_or_drv_stopped(txq)) {
+ rc = netdev_start_xmit(skb, dev, txq, 0);
+ if (dev_xmit_complete(rc))
+ free_skb = false;
+ }
+ HARD_TX_UNLOCK(dev, txq);
+ if (free_skb) {
+ trace_xdp_exception(dev, xdp_prog, XDP_TX);
+ dev_core_stats_tx_dropped_inc(dev);
+ kfree_skb(skb);
+ }
+}
+
+static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key);
+
+int do_xdp_generic(const struct bpf_prog *xdp_prog, struct sk_buff **pskb)
+{
+ struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
+
+ if (xdp_prog) {
+ struct xdp_buff xdp;
+ u32 act;
+ int err;
+
+ bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
+ act = netif_receive_generic_xdp(pskb, &xdp, xdp_prog);
+ if (act != XDP_PASS) {
+ switch (act) {
+ case XDP_REDIRECT:
+ err = xdp_do_generic_redirect((*pskb)->dev, *pskb,
+ &xdp, xdp_prog);
+ if (err)
+ goto out_redir;
+ break;
+ case XDP_TX:
+ generic_xdp_tx(*pskb, xdp_prog);
+ break;
+ }
+ bpf_net_ctx_clear(bpf_net_ctx);
+ return XDP_DROP;
}
- skb->dev = dev->master;
+ bpf_net_ctx_clear(bpf_net_ctx);
}
+ return XDP_PASS;
+out_redir:
+ bpf_net_ctx_clear(bpf_net_ctx);
+ kfree_skb_reason(*pskb, SKB_DROP_REASON_XDP);
+ return XDP_DROP;
+}
+EXPORT_SYMBOL_GPL(do_xdp_generic);
- return dev;
+static int netif_rx_internal(struct sk_buff *skb)
+{
+ int ret;
+
+ net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), skb);
+
+ trace_netif_rx(skb);
+
+#ifdef CONFIG_RPS
+ if (static_branch_unlikely(&rps_needed)) {
+ struct rps_dev_flow voidflow, *rflow = &voidflow;
+ int cpu;
+
+ rcu_read_lock();
+
+ cpu = get_rps_cpu(skb->dev, skb, &rflow);
+ if (cpu < 0)
+ cpu = smp_processor_id();
+
+ ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
+
+ rcu_read_unlock();
+ } else
+#endif
+ {
+ unsigned int qtail;
+
+ ret = enqueue_to_backlog(skb, smp_processor_id(), &qtail);
+ }
+ return ret;
}
-static void net_tx_action(struct softirq_action *h)
+/**
+ * __netif_rx - Slightly optimized version of netif_rx
+ * @skb: buffer to post
+ *
+ * This behaves as netif_rx except that it does not disable bottom halves.
+ * As a result this function may only be invoked from the interrupt context
+ * (either hard or soft interrupt).
+ */
+int __netif_rx(struct sk_buff *skb)
{
- struct softnet_data *sd = &__get_cpu_var(softnet_data);
+ int ret;
+
+ lockdep_assert_once(hardirq_count() | softirq_count());
+
+ trace_netif_rx_entry(skb);
+ ret = netif_rx_internal(skb);
+ trace_netif_rx_exit(ret);
+ return ret;
+}
+EXPORT_SYMBOL(__netif_rx);
+
+/**
+ * netif_rx - post buffer to the network code
+ * @skb: buffer to post
+ *
+ * This function receives a packet from a device driver and queues it for
+ * the upper (protocol) levels to process via the backlog NAPI device. It
+ * always succeeds. The buffer may be dropped during processing for
+ * congestion control or by the protocol layers.
+ * The network buffer is passed via the backlog NAPI device. Modern NIC
+ * driver should use NAPI and GRO.
+ * This function can used from interrupt and from process context. The
+ * caller from process context must not disable interrupts before invoking
+ * this function.
+ *
+ * return values:
+ * NET_RX_SUCCESS (no congestion)
+ * NET_RX_DROP (packet was dropped)
+ *
+ */
+int netif_rx(struct sk_buff *skb)
+{
+ bool need_bh_off = !(hardirq_count() | softirq_count());
+ int ret;
+
+ if (need_bh_off)
+ local_bh_disable();
+ trace_netif_rx_entry(skb);
+ ret = netif_rx_internal(skb);
+ trace_netif_rx_exit(ret);
+ if (need_bh_off)
+ local_bh_enable();
+ return ret;
+}
+EXPORT_SYMBOL(netif_rx);
+
+static __latent_entropy void net_tx_action(void)
+{
+ struct softnet_data *sd = this_cpu_ptr(&softnet_data);
if (sd->completion_queue) {
struct sk_buff *clist;
@@ -1655,194 +5719,396 @@ static void net_tx_action(struct softirq_action *h)
while (clist) {
struct sk_buff *skb = clist;
+
clist = clist->next;
- BUG_TRAP(!atomic_read(&skb->users));
- __kfree_skb(skb);
+ WARN_ON(refcount_read(&skb->users));
+ if (likely(get_kfree_skb_cb(skb)->reason == SKB_CONSUMED))
+ trace_consume_skb(skb, net_tx_action);
+ else
+ trace_kfree_skb(skb, net_tx_action,
+ get_kfree_skb_cb(skb)->reason, NULL);
+
+ if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
+ __kfree_skb(skb);
+ else
+ __napi_kfree_skb(skb,
+ get_kfree_skb_cb(skb)->reason);
}
}
if (sd->output_queue) {
- struct net_device *head;
+ struct Qdisc *head;
local_irq_disable();
head = sd->output_queue;
sd->output_queue = NULL;
+ sd->output_queue_tailp = &sd->output_queue;
local_irq_enable();
+ rcu_read_lock();
+
while (head) {
- struct net_device *dev = head;
- head = head->next_sched;
+ spinlock_t *root_lock = NULL;
+ struct sk_buff *to_free;
+ struct Qdisc *q = head;
- smp_mb__before_clear_bit();
- clear_bit(__LINK_STATE_SCHED, &dev->state);
+ head = head->next_sched;
- if (spin_trylock(&dev->queue_lock)) {
- qdisc_run(dev);
- spin_unlock(&dev->queue_lock);
- } else {
- netif_schedule(dev);
+ /* We need to make sure head->next_sched is read
+ * before clearing __QDISC_STATE_SCHED
+ */
+ smp_mb__before_atomic();
+
+ if (!(q->flags & TCQ_F_NOLOCK)) {
+ root_lock = qdisc_lock(q);
+ spin_lock(root_lock);
+ } else if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED,
+ &q->state))) {
+ /* There is a synchronize_net() between
+ * STATE_DEACTIVATED flag being set and
+ * qdisc_reset()/some_qdisc_is_busy() in
+ * dev_deactivate(), so we can safely bail out
+ * early here to avoid data race between
+ * qdisc_deactivate() and some_qdisc_is_busy()
+ * for lockless qdisc.
+ */
+ clear_bit(__QDISC_STATE_SCHED, &q->state);
+ continue;
}
+
+ clear_bit(__QDISC_STATE_SCHED, &q->state);
+ to_free = qdisc_run(q);
+ if (root_lock)
+ spin_unlock(root_lock);
+ tcf_kfree_skb_list(to_free);
}
+
+ rcu_read_unlock();
}
+
+ xfrm_dev_backlog(sd);
}
-static __inline__ int deliver_skb(struct sk_buff *skb,
- struct packet_type *pt_prev,
- struct net_device *orig_dev)
+#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
+/* This hook is defined here for ATM LANE */
+int (*br_fdb_test_addr_hook)(struct net_device *dev,
+ unsigned char *addr) __read_mostly;
+EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
+#endif
+
+/**
+ * netdev_is_rx_handler_busy - check if receive handler is registered
+ * @dev: device to check
+ *
+ * Check if a receive handler is already registered for a given device.
+ * Return true if there one.
+ *
+ * The caller must hold the rtnl_mutex.
+ */
+bool netdev_is_rx_handler_busy(struct net_device *dev)
{
- atomic_inc(&skb->users);
- return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
+ ASSERT_RTNL();
+ return dev && rtnl_dereference(dev->rx_handler);
}
+EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
-#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
-int (*br_handle_frame_hook)(struct net_bridge_port *p, struct sk_buff **pskb);
-struct net_bridge;
-struct net_bridge_fdb_entry *(*br_fdb_get_hook)(struct net_bridge *br,
- unsigned char *addr);
-void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent);
-
-static __inline__ int handle_bridge(struct sk_buff **pskb,
- struct packet_type **pt_prev, int *ret,
- struct net_device *orig_dev)
+/**
+ * netdev_rx_handler_register - register receive handler
+ * @dev: device to register a handler for
+ * @rx_handler: receive handler to register
+ * @rx_handler_data: data pointer that is used by rx handler
+ *
+ * Register a receive handler for a device. This handler will then be
+ * called from __netif_receive_skb. A negative errno code is returned
+ * on a failure.
+ *
+ * The caller must hold the rtnl_mutex.
+ *
+ * For a general description of rx_handler, see enum rx_handler_result.
+ */
+int netdev_rx_handler_register(struct net_device *dev,
+ rx_handler_func_t *rx_handler,
+ void *rx_handler_data)
{
- struct net_bridge_port *port;
+ if (netdev_is_rx_handler_busy(dev))
+ return -EBUSY;
- if ((*pskb)->pkt_type == PACKET_LOOPBACK ||
- (port = rcu_dereference((*pskb)->dev->br_port)) == NULL)
- return 0;
+ if (dev->priv_flags & IFF_NO_RX_HANDLER)
+ return -EINVAL;
- if (*pt_prev) {
- *ret = deliver_skb(*pskb, *pt_prev, orig_dev);
- *pt_prev = NULL;
- }
-
- return br_handle_frame_hook(port, pskb);
+ /* Note: rx_handler_data must be set before rx_handler */
+ rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
+ rcu_assign_pointer(dev->rx_handler, rx_handler);
+
+ return 0;
}
-#else
-#define handle_bridge(skb, pt_prev, ret, orig_dev) (0)
-#endif
+EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
-#ifdef CONFIG_NET_CLS_ACT
-/* TODO: Maybe we should just force sch_ingress to be compiled in
- * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
- * a compare and 2 stores extra right now if we dont have it on
- * but have CONFIG_NET_CLS_ACT
- * NOTE: This doesnt stop any functionality; if you dont have
- * the ingress scheduler, you just cant add policies on ingress.
+/**
+ * netdev_rx_handler_unregister - unregister receive handler
+ * @dev: device to unregister a handler from
+ *
+ * Unregister a receive handler from a device.
*
+ * The caller must hold the rtnl_mutex.
*/
-static int ing_filter(struct sk_buff *skb)
+void netdev_rx_handler_unregister(struct net_device *dev)
{
- struct Qdisc *q;
- struct net_device *dev = skb->dev;
- int result = TC_ACT_OK;
-
- if (dev->qdisc_ingress) {
- __u32 ttl = (__u32) G_TC_RTTL(skb->tc_verd);
- if (MAX_RED_LOOP < ttl++) {
- printk(KERN_WARNING "Redir loop detected Dropping packet (%s->%s)\n",
- skb->input_dev->name, skb->dev->name);
- return TC_ACT_SHOT;
- }
- skb->tc_verd = SET_TC_RTTL(skb->tc_verd,ttl);
+ ASSERT_RTNL();
+ RCU_INIT_POINTER(dev->rx_handler, NULL);
+ /* a reader seeing a non NULL rx_handler in a rcu_read_lock()
+ * section has a guarantee to see a non NULL rx_handler_data
+ * as well.
+ */
+ synchronize_net();
+ RCU_INIT_POINTER(dev->rx_handler_data, NULL);
+}
+EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
- skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_INGRESS);
+/*
+ * Limit the use of PFMEMALLOC reserves to those protocols that implement
+ * the special handling of PFMEMALLOC skbs.
+ */
+static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
+{
+ switch (skb->protocol) {
+ case htons(ETH_P_ARP):
+ case htons(ETH_P_IP):
+ case htons(ETH_P_IPV6):
+ case htons(ETH_P_8021Q):
+ case htons(ETH_P_8021AD):
+ return true;
+ default:
+ return false;
+ }
+}
- spin_lock(&dev->ingress_lock);
- if ((q = dev->qdisc_ingress) != NULL)
- result = q->enqueue(skb, q);
- spin_unlock(&dev->ingress_lock);
+static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
+ int *ret, struct net_device *orig_dev)
+{
+ if (nf_hook_ingress_active(skb)) {
+ int ingress_retval;
- }
+ if (unlikely(*pt_prev)) {
+ *ret = deliver_skb(skb, *pt_prev, orig_dev);
+ *pt_prev = NULL;
+ }
- return result;
+ rcu_read_lock();
+ ingress_retval = nf_hook_ingress(skb);
+ rcu_read_unlock();
+ return ingress_retval;
+ }
+ return 0;
}
-#endif
-int netif_receive_skb(struct sk_buff *skb)
+static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
+ struct packet_type **ppt_prev)
{
+ enum skb_drop_reason drop_reason = SKB_DROP_REASON_UNHANDLED_PROTO;
struct packet_type *ptype, *pt_prev;
+ rx_handler_func_t *rx_handler;
+ struct sk_buff *skb = *pskb;
struct net_device *orig_dev;
+ bool deliver_exact = false;
int ret = NET_RX_DROP;
__be16 type;
- /* if we've gotten here through NAPI, check netpoll */
- if (skb->dev->poll && netpoll_rx(skb))
- return NET_RX_DROP;
-
- if (!skb->tstamp.off_sec)
- net_timestamp(skb);
+ net_timestamp_check(!READ_ONCE(net_hotdata.tstamp_prequeue), skb);
- if (!skb->input_dev)
- skb->input_dev = skb->dev;
+ trace_netif_receive_skb(skb);
- orig_dev = skb_bond(skb);
+ orig_dev = skb->dev;
- if (!orig_dev)
- return NET_RX_DROP;
+ skb_reset_network_header(skb);
+#if !defined(CONFIG_DEBUG_NET)
+ /* We plan to no longer reset the transport header here.
+ * Give some time to fuzzers and dev build to catch bugs
+ * in network stacks.
+ */
+ if (!skb_transport_header_was_set(skb))
+ skb_reset_transport_header(skb);
+#endif
+ skb_reset_mac_len(skb);
- __get_cpu_var(netdev_rx_stat).total++;
+ pt_prev = NULL;
- skb->h.raw = skb->nh.raw = skb->data;
- skb->mac_len = skb->nh.raw - skb->mac.raw;
+another_round:
+ skb->skb_iif = skb->dev->ifindex;
- pt_prev = NULL;
+ __this_cpu_inc(softnet_data.processed);
- rcu_read_lock();
+ if (static_branch_unlikely(&generic_xdp_needed_key)) {
+ int ret2;
-#ifdef CONFIG_NET_CLS_ACT
- if (skb->tc_verd & TC_NCLS) {
- skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
- goto ncls;
- }
-#endif
+ migrate_disable();
+ ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog),
+ &skb);
+ migrate_enable();
- list_for_each_entry_rcu(ptype, &ptype_all, list) {
- if (!ptype->dev || ptype->dev == skb->dev) {
- if (pt_prev)
- ret = deliver_skb(skb, pt_prev, orig_dev);
- pt_prev = ptype;
+ if (ret2 != XDP_PASS) {
+ ret = NET_RX_DROP;
+ goto out;
}
}
-#ifdef CONFIG_NET_CLS_ACT
- if (pt_prev) {
- ret = deliver_skb(skb, pt_prev, orig_dev);
- pt_prev = NULL; /* noone else should process this after*/
- } else {
- skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
+ if (eth_type_vlan(skb->protocol)) {
+ skb = skb_vlan_untag(skb);
+ if (unlikely(!skb))
+ goto out;
}
- ret = ing_filter(skb);
+ if (skb_skip_tc_classify(skb))
+ goto skip_classify;
- if (ret == TC_ACT_SHOT || (ret == TC_ACT_STOLEN)) {
- kfree_skb(skb);
- goto out;
+ if (pfmemalloc)
+ goto skip_taps;
+
+ list_for_each_entry_rcu(ptype, &dev_net_rcu(skb->dev)->ptype_all,
+ list) {
+ if (unlikely(pt_prev))
+ ret = deliver_skb(skb, pt_prev, orig_dev);
+ pt_prev = ptype;
+ }
+
+ list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
+ if (unlikely(pt_prev))
+ ret = deliver_skb(skb, pt_prev, orig_dev);
+ pt_prev = ptype;
}
- skb->tc_verd = 0;
-ncls:
+skip_taps:
+#ifdef CONFIG_NET_INGRESS
+ if (static_branch_unlikely(&ingress_needed_key)) {
+ bool another = false;
+
+ nf_skip_egress(skb, true);
+ skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev,
+ &another);
+ if (another)
+ goto another_round;
+ if (!skb)
+ goto out;
+
+ nf_skip_egress(skb, false);
+ if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
+ goto out;
+ }
#endif
+ skb_reset_redirect(skb);
+skip_classify:
+ if (pfmemalloc && !skb_pfmemalloc_protocol(skb)) {
+ drop_reason = SKB_DROP_REASON_PFMEMALLOC;
+ goto drop;
+ }
- if (handle_bridge(&skb, &pt_prev, &ret, orig_dev))
- goto out;
+ if (skb_vlan_tag_present(skb)) {
+ if (unlikely(pt_prev)) {
+ ret = deliver_skb(skb, pt_prev, orig_dev);
+ pt_prev = NULL;
+ }
+ if (vlan_do_receive(&skb))
+ goto another_round;
+ else if (unlikely(!skb))
+ goto out;
+ }
- type = skb->protocol;
- list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type)&15], list) {
- if (ptype->type == type &&
- (!ptype->dev || ptype->dev == skb->dev)) {
- if (pt_prev)
- ret = deliver_skb(skb, pt_prev, orig_dev);
- pt_prev = ptype;
+ rx_handler = rcu_dereference(skb->dev->rx_handler);
+ if (rx_handler) {
+ if (unlikely(pt_prev)) {
+ ret = deliver_skb(skb, pt_prev, orig_dev);
+ pt_prev = NULL;
}
+ switch (rx_handler(&skb)) {
+ case RX_HANDLER_CONSUMED:
+ ret = NET_RX_SUCCESS;
+ goto out;
+ case RX_HANDLER_ANOTHER:
+ goto another_round;
+ case RX_HANDLER_EXACT:
+ deliver_exact = true;
+ break;
+ case RX_HANDLER_PASS:
+ break;
+ default:
+ BUG();
+ }
+ }
+
+ if (unlikely(skb_vlan_tag_present(skb)) && !netdev_uses_dsa(skb->dev)) {
+check_vlan_id:
+ if (skb_vlan_tag_get_id(skb)) {
+ /* Vlan id is non 0 and vlan_do_receive() above couldn't
+ * find vlan device.
+ */
+ skb->pkt_type = PACKET_OTHERHOST;
+ } else if (eth_type_vlan(skb->protocol)) {
+ /* Outer header is 802.1P with vlan 0, inner header is
+ * 802.1Q or 802.1AD and vlan_do_receive() above could
+ * not find vlan dev for vlan id 0.
+ */
+ __vlan_hwaccel_clear_tag(skb);
+ skb = skb_vlan_untag(skb);
+ if (unlikely(!skb))
+ goto out;
+ if (vlan_do_receive(&skb))
+ /* After stripping off 802.1P header with vlan 0
+ * vlan dev is found for inner header.
+ */
+ goto another_round;
+ else if (unlikely(!skb))
+ goto out;
+ else
+ /* We have stripped outer 802.1P vlan 0 header.
+ * But could not find vlan dev.
+ * check again for vlan id to set OTHERHOST.
+ */
+ goto check_vlan_id;
+ }
+ /* Note: we might in the future use prio bits
+ * and set skb->priority like in vlan_do_receive()
+ * For the time being, just ignore Priority Code Point
+ */
+ __vlan_hwaccel_clear_tag(skb);
+ }
+
+ type = skb->protocol;
+
+ /* deliver only exact match when indicated */
+ if (likely(!deliver_exact)) {
+ deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
+ &ptype_base[ntohs(type) &
+ PTYPE_HASH_MASK]);
+
+ /* orig_dev and skb->dev could belong to different netns;
+ * Even in such case we need to traverse only the list
+ * coming from skb->dev, as the ptype owner (packet socket)
+ * will use dev_net(skb->dev) to do namespace filtering.
+ */
+ deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
+ &dev_net_rcu(skb->dev)->ptype_specific);
+ }
+
+ deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
+ &orig_dev->ptype_specific);
+
+ if (unlikely(skb->dev != orig_dev)) {
+ deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
+ &skb->dev->ptype_specific);
}
if (pt_prev) {
- ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
+ *ppt_prev = pt_prev;
} else {
- kfree_skb(skb);
+drop:
+ if (!deliver_exact)
+ dev_core_stats_rx_dropped_inc(skb->dev);
+ else
+ dev_core_stats_rx_nohandler_inc(skb->dev);
+
+ kfree_skb_reason(skb, drop_reason);
/* Jamal, now you will not able to escape explaining
* me how you were going to use this. :-)
*/
@@ -1850,518 +6116,3568 @@ ncls:
}
out:
+ /* The invariant here is that if *ppt_prev is not NULL
+ * then skb should also be non-NULL.
+ *
+ * Apparently *ppt_prev assignment above holds this invariant due to
+ * skb dereferencing near it.
+ */
+ *pskb = skb;
+ return ret;
+}
+
+static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc)
+{
+ struct net_device *orig_dev = skb->dev;
+ struct packet_type *pt_prev = NULL;
+ int ret;
+
+ ret = __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
+ if (pt_prev)
+ ret = INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb,
+ skb->dev, pt_prev, orig_dev);
+ return ret;
+}
+
+/**
+ * netif_receive_skb_core - special purpose version of netif_receive_skb
+ * @skb: buffer to process
+ *
+ * More direct receive version of netif_receive_skb(). It should
+ * only be used by callers that have a need to skip RPS and Generic XDP.
+ * Caller must also take care of handling if ``(page_is_)pfmemalloc``.
+ *
+ * This function may only be called from softirq context and interrupts
+ * should be enabled.
+ *
+ * Return values (usually ignored):
+ * NET_RX_SUCCESS: no congestion
+ * NET_RX_DROP: packet was dropped
+ */
+int netif_receive_skb_core(struct sk_buff *skb)
+{
+ int ret;
+
+ rcu_read_lock();
+ ret = __netif_receive_skb_one_core(skb, false);
rcu_read_unlock();
+
return ret;
}
+EXPORT_SYMBOL(netif_receive_skb_core);
-static int process_backlog(struct net_device *backlog_dev, int *budget)
+static inline void __netif_receive_skb_list_ptype(struct list_head *head,
+ struct packet_type *pt_prev,
+ struct net_device *orig_dev)
{
- int work = 0;
- int quota = min(backlog_dev->quota, *budget);
- struct softnet_data *queue = &__get_cpu_var(softnet_data);
- unsigned long start_time = jiffies;
+ struct sk_buff *skb, *next;
- backlog_dev->weight = weight_p;
- for (;;) {
- struct sk_buff *skb;
- struct net_device *dev;
+ if (!pt_prev)
+ return;
+ if (list_empty(head))
+ return;
+ if (pt_prev->list_func != NULL)
+ INDIRECT_CALL_INET(pt_prev->list_func, ipv6_list_rcv,
+ ip_list_rcv, head, pt_prev, orig_dev);
+ else
+ list_for_each_entry_safe(skb, next, head, list) {
+ skb_list_del_init(skb);
+ pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
+ }
+}
- local_irq_disable();
- skb = __skb_dequeue(&queue->input_pkt_queue);
- if (!skb)
- goto job_done;
- local_irq_enable();
+static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc)
+{
+ /* Fast-path assumptions:
+ * - There is no RX handler.
+ * - Only one packet_type matches.
+ * If either of these fails, we will end up doing some per-packet
+ * processing in-line, then handling the 'last ptype' for the whole
+ * sublist. This can't cause out-of-order delivery to any single ptype,
+ * because the 'last ptype' must be constant across the sublist, and all
+ * other ptypes are handled per-packet.
+ */
+ /* Current (common) ptype of sublist */
+ struct packet_type *pt_curr = NULL;
+ /* Current (common) orig_dev of sublist */
+ struct net_device *od_curr = NULL;
+ struct sk_buff *skb, *next;
+ LIST_HEAD(sublist);
+
+ list_for_each_entry_safe(skb, next, head, list) {
+ struct net_device *orig_dev = skb->dev;
+ struct packet_type *pt_prev = NULL;
+
+ skb_list_del_init(skb);
+ __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
+ if (!pt_prev)
+ continue;
+ if (pt_curr != pt_prev || od_curr != orig_dev) {
+ /* dispatch old sublist */
+ __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
+ /* start new sublist */
+ INIT_LIST_HEAD(&sublist);
+ pt_curr = pt_prev;
+ od_curr = orig_dev;
+ }
+ list_add_tail(&skb->list, &sublist);
+ }
- dev = skb->dev;
+ /* dispatch final sublist */
+ __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
+}
- netif_receive_skb(skb);
+static int __netif_receive_skb(struct sk_buff *skb)
+{
+ int ret;
- dev_put(dev);
+ if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
+ unsigned int noreclaim_flag;
- work++;
+ /*
+ * PFMEMALLOC skbs are special, they should
+ * - be delivered to SOCK_MEMALLOC sockets only
+ * - stay away from userspace
+ * - have bounded memory usage
+ *
+ * Use PF_MEMALLOC as this saves us from propagating the allocation
+ * context down to all allocation sites.
+ */
+ noreclaim_flag = memalloc_noreclaim_save();
+ ret = __netif_receive_skb_one_core(skb, true);
+ memalloc_noreclaim_restore(noreclaim_flag);
+ } else
+ ret = __netif_receive_skb_one_core(skb, false);
- if (work >= quota || jiffies - start_time > 1)
- break;
+ return ret;
+}
+static void __netif_receive_skb_list(struct list_head *head)
+{
+ unsigned long noreclaim_flag = 0;
+ struct sk_buff *skb, *next;
+ bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? */
+
+ list_for_each_entry_safe(skb, next, head, list) {
+ if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) {
+ struct list_head sublist;
+
+ /* Handle the previous sublist */
+ list_cut_before(&sublist, head, &skb->list);
+ if (!list_empty(&sublist))
+ __netif_receive_skb_list_core(&sublist, pfmemalloc);
+ pfmemalloc = !pfmemalloc;
+ /* See comments in __netif_receive_skb */
+ if (pfmemalloc)
+ noreclaim_flag = memalloc_noreclaim_save();
+ else
+ memalloc_noreclaim_restore(noreclaim_flag);
+ }
}
+ /* Handle the remaining sublist */
+ if (!list_empty(head))
+ __netif_receive_skb_list_core(head, pfmemalloc);
+ /* Restore pflags */
+ if (pfmemalloc)
+ memalloc_noreclaim_restore(noreclaim_flag);
+}
- backlog_dev->quota -= work;
- *budget -= work;
- return -1;
+static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
+{
+ struct bpf_prog *old = rtnl_dereference(dev->xdp_prog);
+ struct bpf_prog *new = xdp->prog;
+ int ret = 0;
-job_done:
- backlog_dev->quota -= work;
- *budget -= work;
+ switch (xdp->command) {
+ case XDP_SETUP_PROG:
+ rcu_assign_pointer(dev->xdp_prog, new);
+ if (old)
+ bpf_prog_put(old);
+
+ if (old && !new) {
+ static_branch_dec(&generic_xdp_needed_key);
+ } else if (new && !old) {
+ static_branch_inc(&generic_xdp_needed_key);
+ netif_disable_lro(dev);
+ dev_disable_gro_hw(dev);
+ }
+ break;
- list_del(&backlog_dev->poll_list);
- smp_mb__before_clear_bit();
- netif_poll_enable(backlog_dev);
+ default:
+ ret = -EINVAL;
+ break;
+ }
- local_irq_enable();
- return 0;
+ return ret;
}
-static void net_rx_action(struct softirq_action *h)
+static int netif_receive_skb_internal(struct sk_buff *skb)
{
- struct softnet_data *queue = &__get_cpu_var(softnet_data);
- unsigned long start_time = jiffies;
- int budget = netdev_budget;
- void *have;
+ int ret;
- local_irq_disable();
+ net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), skb);
- while (!list_empty(&queue->poll_list)) {
- struct net_device *dev;
+ if (skb_defer_rx_timestamp(skb))
+ return NET_RX_SUCCESS;
- if (budget <= 0 || jiffies - start_time > 1)
- goto softnet_break;
+ rcu_read_lock();
+#ifdef CONFIG_RPS
+ if (static_branch_unlikely(&rps_needed)) {
+ struct rps_dev_flow voidflow, *rflow = &voidflow;
+ int cpu = get_rps_cpu(skb->dev, skb, &rflow);
+
+ if (cpu >= 0) {
+ ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
+ rcu_read_unlock();
+ return ret;
+ }
+ }
+#endif
+ ret = __netif_receive_skb(skb);
+ rcu_read_unlock();
+ return ret;
+}
- local_irq_enable();
+void netif_receive_skb_list_internal(struct list_head *head)
+{
+ struct sk_buff *skb, *next;
+ LIST_HEAD(sublist);
+
+ list_for_each_entry_safe(skb, next, head, list) {
+ net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue),
+ skb);
+ skb_list_del_init(skb);
+ if (!skb_defer_rx_timestamp(skb))
+ list_add_tail(&skb->list, &sublist);
+ }
+ list_splice_init(&sublist, head);
- dev = list_entry(queue->poll_list.next,
- struct net_device, poll_list);
- have = netpoll_poll_lock(dev);
+ rcu_read_lock();
+#ifdef CONFIG_RPS
+ if (static_branch_unlikely(&rps_needed)) {
+ list_for_each_entry_safe(skb, next, head, list) {
+ struct rps_dev_flow voidflow, *rflow = &voidflow;
+ int cpu = get_rps_cpu(skb->dev, skb, &rflow);
+
+ if (cpu >= 0) {
+ /* Will be handled, remove from list */
+ skb_list_del_init(skb);
+ enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
+ }
+ }
+ }
+#endif
+ __netif_receive_skb_list(head);
+ rcu_read_unlock();
+}
- if (dev->quota <= 0 || dev->poll(dev, &budget)) {
- netpoll_poll_unlock(have);
- local_irq_disable();
- list_move_tail(&dev->poll_list, &queue->poll_list);
- if (dev->quota < 0)
- dev->quota += dev->weight;
- else
- dev->quota = dev->weight;
- } else {
- netpoll_poll_unlock(have);
- dev_put(dev);
- local_irq_disable();
+/**
+ * netif_receive_skb - process receive buffer from network
+ * @skb: buffer to process
+ *
+ * netif_receive_skb() is the main receive data processing function.
+ * It always succeeds. The buffer may be dropped during processing
+ * for congestion control or by the protocol layers.
+ *
+ * This function may only be called from softirq context and interrupts
+ * should be enabled.
+ *
+ * Return values (usually ignored):
+ * NET_RX_SUCCESS: no congestion
+ * NET_RX_DROP: packet was dropped
+ */
+int netif_receive_skb(struct sk_buff *skb)
+{
+ int ret;
+
+ trace_netif_receive_skb_entry(skb);
+
+ ret = netif_receive_skb_internal(skb);
+ trace_netif_receive_skb_exit(ret);
+
+ return ret;
+}
+EXPORT_SYMBOL(netif_receive_skb);
+
+/**
+ * netif_receive_skb_list - process many receive buffers from network
+ * @head: list of skbs to process.
+ *
+ * Since return value of netif_receive_skb() is normally ignored, and
+ * wouldn't be meaningful for a list, this function returns void.
+ *
+ * This function may only be called from softirq context and interrupts
+ * should be enabled.
+ */
+void netif_receive_skb_list(struct list_head *head)
+{
+ struct sk_buff *skb;
+
+ if (list_empty(head))
+ return;
+ if (trace_netif_receive_skb_list_entry_enabled()) {
+ list_for_each_entry(skb, head, list)
+ trace_netif_receive_skb_list_entry(skb);
+ }
+ netif_receive_skb_list_internal(head);
+ trace_netif_receive_skb_list_exit(0);
+}
+EXPORT_SYMBOL(netif_receive_skb_list);
+
+/* Network device is going away, flush any packets still pending */
+static void flush_backlog(struct work_struct *work)
+{
+ struct sk_buff *skb, *tmp;
+ struct sk_buff_head list;
+ struct softnet_data *sd;
+
+ __skb_queue_head_init(&list);
+ local_bh_disable();
+ sd = this_cpu_ptr(&softnet_data);
+
+ backlog_lock_irq_disable(sd);
+ skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
+ if (READ_ONCE(skb->dev->reg_state) == NETREG_UNREGISTERING) {
+ __skb_unlink(skb, &sd->input_pkt_queue);
+ __skb_queue_tail(&list, skb);
+ rps_input_queue_head_incr(sd);
}
}
-out:
-#ifdef CONFIG_NET_DMA
- /*
- * There may not be any more sk_buffs coming right now, so push
- * any pending DMA copies to hardware
+ backlog_unlock_irq_enable(sd);
+
+ local_lock_nested_bh(&softnet_data.process_queue_bh_lock);
+ skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
+ if (READ_ONCE(skb->dev->reg_state) == NETREG_UNREGISTERING) {
+ __skb_unlink(skb, &sd->process_queue);
+ __skb_queue_tail(&list, skb);
+ rps_input_queue_head_incr(sd);
+ }
+ }
+ local_unlock_nested_bh(&softnet_data.process_queue_bh_lock);
+ local_bh_enable();
+
+ __skb_queue_purge_reason(&list, SKB_DROP_REASON_DEV_READY);
+}
+
+static bool flush_required(int cpu)
+{
+#if IS_ENABLED(CONFIG_RPS)
+ struct softnet_data *sd = &per_cpu(softnet_data, cpu);
+ bool do_flush;
+
+ backlog_lock_irq_disable(sd);
+
+ /* as insertion into process_queue happens with the rps lock held,
+ * process_queue access may race only with dequeue
*/
- if (net_dma_client) {
- struct dma_chan *chan;
- rcu_read_lock();
- list_for_each_entry_rcu(chan, &net_dma_client->channels, client_node)
- dma_async_memcpy_issue_pending(chan);
- rcu_read_unlock();
+ do_flush = !skb_queue_empty(&sd->input_pkt_queue) ||
+ !skb_queue_empty_lockless(&sd->process_queue);
+ backlog_unlock_irq_enable(sd);
+
+ return do_flush;
+#endif
+ /* without RPS we can't safely check input_pkt_queue: during a
+ * concurrent remote skb_queue_splice() we can detect as empty both
+ * input_pkt_queue and process_queue even if the latter could end-up
+ * containing a lot of packets.
+ */
+ return true;
+}
+
+struct flush_backlogs {
+ cpumask_t flush_cpus;
+ struct work_struct w[];
+};
+
+static struct flush_backlogs *flush_backlogs_alloc(void)
+{
+ return kmalloc(struct_size_t(struct flush_backlogs, w, nr_cpu_ids),
+ GFP_KERNEL);
+}
+
+static struct flush_backlogs *flush_backlogs_fallback;
+static DEFINE_MUTEX(flush_backlogs_mutex);
+
+static void flush_all_backlogs(void)
+{
+ struct flush_backlogs *ptr = flush_backlogs_alloc();
+ unsigned int cpu;
+
+ if (!ptr) {
+ mutex_lock(&flush_backlogs_mutex);
+ ptr = flush_backlogs_fallback;
+ }
+ cpumask_clear(&ptr->flush_cpus);
+
+ cpus_read_lock();
+
+ for_each_online_cpu(cpu) {
+ if (flush_required(cpu)) {
+ INIT_WORK(&ptr->w[cpu], flush_backlog);
+ queue_work_on(cpu, system_highpri_wq, &ptr->w[cpu]);
+ __cpumask_set_cpu(cpu, &ptr->flush_cpus);
+ }
+ }
+
+ /* we can have in flight packet[s] on the cpus we are not flushing,
+ * synchronize_net() in unregister_netdevice_many() will take care of
+ * them.
+ */
+ for_each_cpu(cpu, &ptr->flush_cpus)
+ flush_work(&ptr->w[cpu]);
+
+ cpus_read_unlock();
+
+ if (ptr != flush_backlogs_fallback)
+ kfree(ptr);
+ else
+ mutex_unlock(&flush_backlogs_mutex);
+}
+
+static void net_rps_send_ipi(struct softnet_data *remsd)
+{
+#ifdef CONFIG_RPS
+ while (remsd) {
+ struct softnet_data *next = remsd->rps_ipi_next;
+
+ if (cpu_online(remsd->cpu))
+ smp_call_function_single_async(remsd->cpu, &remsd->csd);
+ remsd = next;
}
#endif
- local_irq_enable();
- return;
+}
-softnet_break:
- __get_cpu_var(netdev_rx_stat).time_squeeze++;
- __raise_softirq_irqoff(NET_RX_SOFTIRQ);
- goto out;
+/*
+ * net_rps_action_and_irq_enable sends any pending IPI's for rps.
+ * Note: called with local irq disabled, but exits with local irq enabled.
+ */
+static void net_rps_action_and_irq_enable(struct softnet_data *sd)
+{
+#ifdef CONFIG_RPS
+ struct softnet_data *remsd = sd->rps_ipi_list;
+
+ if (!use_backlog_threads() && remsd) {
+ sd->rps_ipi_list = NULL;
+
+ local_irq_enable();
+
+ /* Send pending IPI's to kick RPS processing on remote cpus. */
+ net_rps_send_ipi(remsd);
+ } else
+#endif
+ local_irq_enable();
+}
+
+static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
+{
+#ifdef CONFIG_RPS
+ return !use_backlog_threads() && sd->rps_ipi_list;
+#else
+ return false;
+#endif
}
-static gifconf_func_t * gifconf_list [NPROTO];
+static int process_backlog(struct napi_struct *napi, int quota)
+{
+ struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
+ bool again = true;
+ int work = 0;
+
+ /* Check if we have pending ipi, its better to send them now,
+ * not waiting net_rx_action() end.
+ */
+ if (sd_has_rps_ipi_waiting(sd)) {
+ local_irq_disable();
+ net_rps_action_and_irq_enable(sd);
+ }
+
+ napi->weight = READ_ONCE(net_hotdata.dev_rx_weight);
+ while (again) {
+ struct sk_buff *skb;
+
+ local_lock_nested_bh(&softnet_data.process_queue_bh_lock);
+ while ((skb = __skb_dequeue(&sd->process_queue))) {
+ local_unlock_nested_bh(&softnet_data.process_queue_bh_lock);
+ rcu_read_lock();
+ __netif_receive_skb(skb);
+ rcu_read_unlock();
+ if (++work >= quota) {
+ rps_input_queue_head_add(sd, work);
+ return work;
+ }
+
+ local_lock_nested_bh(&softnet_data.process_queue_bh_lock);
+ }
+ local_unlock_nested_bh(&softnet_data.process_queue_bh_lock);
+
+ backlog_lock_irq_disable(sd);
+ if (skb_queue_empty(&sd->input_pkt_queue)) {
+ /*
+ * Inline a custom version of __napi_complete().
+ * only current cpu owns and manipulates this napi,
+ * and NAPI_STATE_SCHED is the only possible flag set
+ * on backlog.
+ * We can use a plain write instead of clear_bit(),
+ * and we dont need an smp_mb() memory barrier.
+ */
+ napi->state &= NAPIF_STATE_THREADED;
+ again = false;
+ } else {
+ local_lock_nested_bh(&softnet_data.process_queue_bh_lock);
+ skb_queue_splice_tail_init(&sd->input_pkt_queue,
+ &sd->process_queue);
+ local_unlock_nested_bh(&softnet_data.process_queue_bh_lock);
+ }
+ backlog_unlock_irq_enable(sd);
+ }
+
+ if (work)
+ rps_input_queue_head_add(sd, work);
+ return work;
+}
/**
- * register_gifconf - register a SIOCGIF handler
- * @family: Address family
- * @gifconf: Function handler
+ * __napi_schedule - schedule for receive
+ * @n: entry to schedule
*
- * Register protocol dependent address dumping routines. The handler
- * that is passed must not be freed or reused until it has been replaced
- * by another handler.
+ * The entry's receive function will be scheduled to run.
+ * Consider using __napi_schedule_irqoff() if hard irqs are masked.
*/
-int register_gifconf(unsigned int family, gifconf_func_t * gifconf)
+void __napi_schedule(struct napi_struct *n)
{
- if (family >= NPROTO)
- return -EINVAL;
- gifconf_list[family] = gifconf;
- return 0;
-}
+ unsigned long flags;
+ local_irq_save(flags);
+ ____napi_schedule(this_cpu_ptr(&softnet_data), n);
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL(__napi_schedule);
-/*
- * Map an interface index to its name (SIOCGIFNAME)
+/**
+ * napi_schedule_prep - check if napi can be scheduled
+ * @n: napi context
+ *
+ * Test if NAPI routine is already running, and if not mark
+ * it as running. This is used as a condition variable to
+ * insure only one NAPI poll instance runs. We also make
+ * sure there is no pending NAPI disable.
*/
+bool napi_schedule_prep(struct napi_struct *n)
+{
+ unsigned long new, val = READ_ONCE(n->state);
-/*
- * We need this ioctl for efficient implementation of the
- * if_indextoname() function required by the IPv6 API. Without
- * it, we would have to search all the interfaces to find a
- * match. --pb
+ do {
+ if (unlikely(val & NAPIF_STATE_DISABLE))
+ return false;
+ new = val | NAPIF_STATE_SCHED;
+
+ /* Sets STATE_MISSED bit if STATE_SCHED was already set
+ * This was suggested by Alexander Duyck, as compiler
+ * emits better code than :
+ * if (val & NAPIF_STATE_SCHED)
+ * new |= NAPIF_STATE_MISSED;
+ */
+ new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED *
+ NAPIF_STATE_MISSED;
+ } while (!try_cmpxchg(&n->state, &val, new));
+
+ return !(val & NAPIF_STATE_SCHED);
+}
+EXPORT_SYMBOL(napi_schedule_prep);
+
+/**
+ * __napi_schedule_irqoff - schedule for receive
+ * @n: entry to schedule
+ *
+ * Variant of __napi_schedule() assuming hard irqs are masked.
+ *
+ * On PREEMPT_RT enabled kernels this maps to __napi_schedule()
+ * because the interrupt disabled assumption might not be true
+ * due to force-threaded interrupts and spinlock substitution.
*/
+void __napi_schedule_irqoff(struct napi_struct *n)
+{
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ ____napi_schedule(this_cpu_ptr(&softnet_data), n);
+ else
+ __napi_schedule(n);
+}
+EXPORT_SYMBOL(__napi_schedule_irqoff);
-static int dev_ifname(struct ifreq __user *arg)
+bool napi_complete_done(struct napi_struct *n, int work_done)
{
- struct net_device *dev;
- struct ifreq ifr;
+ unsigned long flags, val, new, timeout = 0;
+ bool ret = true;
/*
- * Fetch the caller's info block.
+ * 1) Don't let napi dequeue from the cpu poll list
+ * just in case its running on a different cpu.
+ * 2) If we are busy polling, do nothing here, we have
+ * the guarantee we will be called later.
*/
+ if (unlikely(n->state & (NAPIF_STATE_NPSVC |
+ NAPIF_STATE_IN_BUSY_POLL)))
+ return false;
+
+ if (work_done) {
+ if (n->gro.bitmask)
+ timeout = napi_get_gro_flush_timeout(n);
+ n->defer_hard_irqs_count = napi_get_defer_hard_irqs(n);
+ }
+ if (n->defer_hard_irqs_count > 0) {
+ n->defer_hard_irqs_count--;
+ timeout = napi_get_gro_flush_timeout(n);
+ if (timeout)
+ ret = false;
+ }
- if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
- return -EFAULT;
+ /*
+ * When the NAPI instance uses a timeout and keeps postponing
+ * it, we need to bound somehow the time packets are kept in
+ * the GRO layer.
+ */
+ gro_flush_normal(&n->gro, !!timeout);
- read_lock(&dev_base_lock);
- dev = __dev_get_by_index(ifr.ifr_ifindex);
- if (!dev) {
- read_unlock(&dev_base_lock);
- return -ENODEV;
+ if (unlikely(!list_empty(&n->poll_list))) {
+ /* If n->poll_list is not empty, we need to mask irqs */
+ local_irq_save(flags);
+ list_del_init(&n->poll_list);
+ local_irq_restore(flags);
}
+ WRITE_ONCE(n->list_owner, -1);
- strcpy(ifr.ifr_name, dev->name);
- read_unlock(&dev_base_lock);
+ val = READ_ONCE(n->state);
+ do {
+ WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
- if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
- return -EFAULT;
- return 0;
+ new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED |
+ NAPIF_STATE_SCHED_THREADED |
+ NAPIF_STATE_PREFER_BUSY_POLL);
+
+ /* If STATE_MISSED was set, leave STATE_SCHED set,
+ * because we will call napi->poll() one more time.
+ * This C code was suggested by Alexander Duyck to help gcc.
+ */
+ new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED *
+ NAPIF_STATE_SCHED;
+ } while (!try_cmpxchg(&n->state, &val, new));
+
+ if (unlikely(val & NAPIF_STATE_MISSED)) {
+ __napi_schedule(n);
+ return false;
+ }
+
+ if (timeout)
+ hrtimer_start(&n->timer, ns_to_ktime(timeout),
+ HRTIMER_MODE_REL_PINNED);
+ return ret;
}
+EXPORT_SYMBOL(napi_complete_done);
-/*
- * Perform a SIOCGIFCONF call. This structure will change
- * size eventually, and there is nothing I can do about it.
- * Thus we will need a 'compatibility mode'.
+static void skb_defer_free_flush(void)
+{
+ struct llist_node *free_list;
+ struct sk_buff *skb, *next;
+ struct skb_defer_node *sdn;
+ int node;
+
+ for_each_node(node) {
+ sdn = this_cpu_ptr(net_hotdata.skb_defer_nodes) + node;
+
+ if (llist_empty(&sdn->defer_list))
+ continue;
+ atomic_long_set(&sdn->defer_count, 0);
+ free_list = llist_del_all(&sdn->defer_list);
+
+ llist_for_each_entry_safe(skb, next, free_list, ll_node) {
+ prefetch(next);
+ napi_consume_skb(skb, 1);
+ }
+ }
+}
+
+#if defined(CONFIG_NET_RX_BUSY_POLL)
+
+static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule)
+{
+ if (!skip_schedule) {
+ gro_normal_list(&napi->gro);
+ __napi_schedule(napi);
+ return;
+ }
+
+ /* Flush too old packets. If HZ < 1000, flush all packets */
+ gro_flush_normal(&napi->gro, HZ >= 1000);
+
+ clear_bit(NAPI_STATE_SCHED, &napi->state);
+}
+
+enum {
+ NAPI_F_PREFER_BUSY_POLL = 1,
+ NAPI_F_END_ON_RESCHED = 2,
+};
+
+static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock,
+ unsigned flags, u16 budget)
+{
+ struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
+ bool skip_schedule = false;
+ unsigned long timeout;
+ int rc;
+
+ /* Busy polling means there is a high chance device driver hard irq
+ * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was
+ * set in napi_schedule_prep().
+ * Since we are about to call napi->poll() once more, we can safely
+ * clear NAPI_STATE_MISSED.
+ *
+ * Note: x86 could use a single "lock and ..." instruction
+ * to perform these two clear_bit()
+ */
+ clear_bit(NAPI_STATE_MISSED, &napi->state);
+ clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
+
+ local_bh_disable();
+ bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
+
+ if (flags & NAPI_F_PREFER_BUSY_POLL) {
+ napi->defer_hard_irqs_count = napi_get_defer_hard_irqs(napi);
+ timeout = napi_get_gro_flush_timeout(napi);
+ if (napi->defer_hard_irqs_count && timeout) {
+ hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED);
+ skip_schedule = true;
+ }
+ }
+
+ /* All we really want here is to re-enable device interrupts.
+ * Ideally, a new ndo_busy_poll_stop() could avoid another round.
+ */
+ rc = napi->poll(napi, budget);
+ /* We can't gro_normal_list() here, because napi->poll() might have
+ * rearmed the napi (napi_complete_done()) in which case it could
+ * already be running on another CPU.
+ */
+ trace_napi_poll(napi, rc, budget);
+ netpoll_poll_unlock(have_poll_lock);
+ if (rc == budget)
+ __busy_poll_stop(napi, skip_schedule);
+ bpf_net_ctx_clear(bpf_net_ctx);
+ local_bh_enable();
+}
+
+static void __napi_busy_loop(unsigned int napi_id,
+ bool (*loop_end)(void *, unsigned long),
+ void *loop_end_arg, unsigned flags, u16 budget)
+{
+ unsigned long start_time = loop_end ? busy_loop_current_time() : 0;
+ int (*napi_poll)(struct napi_struct *napi, int budget);
+ struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
+ void *have_poll_lock = NULL;
+ struct napi_struct *napi;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+restart:
+ napi_poll = NULL;
+
+ napi = napi_by_id(napi_id);
+ if (!napi)
+ return;
+
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ preempt_disable();
+ for (;;) {
+ int work = 0;
+
+ local_bh_disable();
+ bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
+ if (!napi_poll) {
+ unsigned long val = READ_ONCE(napi->state);
+
+ /* If multiple threads are competing for this napi,
+ * we avoid dirtying napi->state as much as we can.
+ */
+ if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
+ NAPIF_STATE_IN_BUSY_POLL)) {
+ if (flags & NAPI_F_PREFER_BUSY_POLL)
+ set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
+ goto count;
+ }
+ if (cmpxchg(&napi->state, val,
+ val | NAPIF_STATE_IN_BUSY_POLL |
+ NAPIF_STATE_SCHED) != val) {
+ if (flags & NAPI_F_PREFER_BUSY_POLL)
+ set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
+ goto count;
+ }
+ have_poll_lock = netpoll_poll_lock(napi);
+ napi_poll = napi->poll;
+ }
+ work = napi_poll(napi, budget);
+ trace_napi_poll(napi, work, budget);
+ gro_normal_list(&napi->gro);
+count:
+ if (work > 0)
+ __NET_ADD_STATS(dev_net(napi->dev),
+ LINUX_MIB_BUSYPOLLRXPACKETS, work);
+ skb_defer_free_flush();
+ bpf_net_ctx_clear(bpf_net_ctx);
+ local_bh_enable();
+
+ if (!loop_end || loop_end(loop_end_arg, start_time))
+ break;
+
+ if (unlikely(need_resched())) {
+ if (flags & NAPI_F_END_ON_RESCHED)
+ break;
+ if (napi_poll)
+ busy_poll_stop(napi, have_poll_lock, flags, budget);
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ preempt_enable();
+ rcu_read_unlock();
+ cond_resched();
+ rcu_read_lock();
+ if (loop_end(loop_end_arg, start_time))
+ return;
+ goto restart;
+ }
+ cpu_relax();
+ }
+ if (napi_poll)
+ busy_poll_stop(napi, have_poll_lock, flags, budget);
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ preempt_enable();
+}
+
+void napi_busy_loop_rcu(unsigned int napi_id,
+ bool (*loop_end)(void *, unsigned long),
+ void *loop_end_arg, bool prefer_busy_poll, u16 budget)
+{
+ unsigned flags = NAPI_F_END_ON_RESCHED;
+
+ if (prefer_busy_poll)
+ flags |= NAPI_F_PREFER_BUSY_POLL;
+
+ __napi_busy_loop(napi_id, loop_end, loop_end_arg, flags, budget);
+}
+
+void napi_busy_loop(unsigned int napi_id,
+ bool (*loop_end)(void *, unsigned long),
+ void *loop_end_arg, bool prefer_busy_poll, u16 budget)
+{
+ unsigned flags = prefer_busy_poll ? NAPI_F_PREFER_BUSY_POLL : 0;
+
+ rcu_read_lock();
+ __napi_busy_loop(napi_id, loop_end, loop_end_arg, flags, budget);
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL(napi_busy_loop);
+
+void napi_suspend_irqs(unsigned int napi_id)
+{
+ struct napi_struct *napi;
+
+ rcu_read_lock();
+ napi = napi_by_id(napi_id);
+ if (napi) {
+ unsigned long timeout = napi_get_irq_suspend_timeout(napi);
+
+ if (timeout)
+ hrtimer_start(&napi->timer, ns_to_ktime(timeout),
+ HRTIMER_MODE_REL_PINNED);
+ }
+ rcu_read_unlock();
+}
+
+void napi_resume_irqs(unsigned int napi_id)
+{
+ struct napi_struct *napi;
+
+ rcu_read_lock();
+ napi = napi_by_id(napi_id);
+ if (napi) {
+ /* If irq_suspend_timeout is set to 0 between the call to
+ * napi_suspend_irqs and now, the original value still
+ * determines the safety timeout as intended and napi_watchdog
+ * will resume irq processing.
+ */
+ if (napi_get_irq_suspend_timeout(napi)) {
+ local_bh_disable();
+ napi_schedule(napi);
+ local_bh_enable();
+ }
+ }
+ rcu_read_unlock();
+}
+
+#endif /* CONFIG_NET_RX_BUSY_POLL */
+
+static void __napi_hash_add_with_id(struct napi_struct *napi,
+ unsigned int napi_id)
+{
+ napi->gro.cached_napi_id = napi_id;
+
+ WRITE_ONCE(napi->napi_id, napi_id);
+ hlist_add_head_rcu(&napi->napi_hash_node,
+ &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
+}
+
+static void napi_hash_add_with_id(struct napi_struct *napi,
+ unsigned int napi_id)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&napi_hash_lock, flags);
+ WARN_ON_ONCE(napi_by_id(napi_id));
+ __napi_hash_add_with_id(napi, napi_id);
+ spin_unlock_irqrestore(&napi_hash_lock, flags);
+}
+
+static void napi_hash_add(struct napi_struct *napi)
+{
+ unsigned long flags;
+
+ if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state))
+ return;
+
+ spin_lock_irqsave(&napi_hash_lock, flags);
+
+ /* 0..NR_CPUS range is reserved for sender_cpu use */
+ do {
+ if (unlikely(!napi_id_valid(++napi_gen_id)))
+ napi_gen_id = MIN_NAPI_ID;
+ } while (napi_by_id(napi_gen_id));
+
+ __napi_hash_add_with_id(napi, napi_gen_id);
+
+ spin_unlock_irqrestore(&napi_hash_lock, flags);
+}
+
+/* Warning : caller is responsible to make sure rcu grace period
+ * is respected before freeing memory containing @napi
*/
+static void napi_hash_del(struct napi_struct *napi)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&napi_hash_lock, flags);
+
+ hlist_del_init_rcu(&napi->napi_hash_node);
+
+ spin_unlock_irqrestore(&napi_hash_lock, flags);
+}
-static int dev_ifconf(char __user *arg)
+static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
{
- struct ifconf ifc;
- struct net_device *dev;
- char __user *pos;
- int len;
- int total;
- int i;
+ struct napi_struct *napi;
- /*
- * Fetch the caller's info block.
+ napi = container_of(timer, struct napi_struct, timer);
+
+ /* Note : we use a relaxed variant of napi_schedule_prep() not setting
+ * NAPI_STATE_MISSED, since we do not react to a device IRQ.
*/
+ if (!napi_disable_pending(napi) &&
+ !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) {
+ clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
+ __napi_schedule_irqoff(napi);
+ }
- if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
- return -EFAULT;
+ return HRTIMER_NORESTART;
+}
- pos = ifc.ifc_buf;
- len = ifc.ifc_len;
+static void napi_stop_kthread(struct napi_struct *napi)
+{
+ unsigned long val, new;
- /*
- * Loop over the interfaces, and write an info block for each.
- */
-
- total = 0;
- for (dev = dev_base; dev; dev = dev->next) {
- for (i = 0; i < NPROTO; i++) {
- if (gifconf_list[i]) {
- int done;
- if (!pos)
- done = gifconf_list[i](dev, NULL, 0);
- else
- done = gifconf_list[i](dev, pos + total,
- len - total);
- if (done < 0)
- return -EFAULT;
- total += done;
+ /* Wait until the napi STATE_THREADED is unset. */
+ while (true) {
+ val = READ_ONCE(napi->state);
+
+ /* If napi kthread own this napi or the napi is idle,
+ * STATE_THREADED can be unset here.
+ */
+ if ((val & NAPIF_STATE_SCHED_THREADED) ||
+ !(val & NAPIF_STATE_SCHED)) {
+ new = val & (~(NAPIF_STATE_THREADED |
+ NAPIF_STATE_THREADED_BUSY_POLL));
+ } else {
+ msleep(20);
+ continue;
+ }
+
+ if (try_cmpxchg(&napi->state, &val, new))
+ break;
+ }
+
+ /* Once STATE_THREADED is unset, wait for SCHED_THREADED to be unset by
+ * the kthread.
+ */
+ while (true) {
+ if (!test_bit(NAPI_STATE_SCHED_THREADED, &napi->state))
+ break;
+
+ msleep(20);
+ }
+
+ kthread_stop(napi->thread);
+ napi->thread = NULL;
+}
+
+static void napi_set_threaded_state(struct napi_struct *napi,
+ enum netdev_napi_threaded threaded_mode)
+{
+ bool threaded = threaded_mode != NETDEV_NAPI_THREADED_DISABLED;
+ bool busy_poll = threaded_mode == NETDEV_NAPI_THREADED_BUSY_POLL;
+
+ assign_bit(NAPI_STATE_THREADED, &napi->state, threaded);
+ assign_bit(NAPI_STATE_THREADED_BUSY_POLL, &napi->state, busy_poll);
+}
+
+int napi_set_threaded(struct napi_struct *napi,
+ enum netdev_napi_threaded threaded)
+{
+ if (threaded) {
+ if (!napi->thread) {
+ int err = napi_kthread_create(napi);
+
+ if (err)
+ return err;
+ }
+ }
+
+ if (napi->config)
+ napi->config->threaded = threaded;
+
+ /* Setting/unsetting threaded mode on a napi might not immediately
+ * take effect, if the current napi instance is actively being
+ * polled. In this case, the switch between threaded mode and
+ * softirq mode will happen in the next round of napi_schedule().
+ * This should not cause hiccups/stalls to the live traffic.
+ */
+ if (!threaded && napi->thread) {
+ napi_stop_kthread(napi);
+ } else {
+ /* Make sure kthread is created before THREADED bit is set. */
+ smp_mb__before_atomic();
+ napi_set_threaded_state(napi, threaded);
+ }
+
+ return 0;
+}
+
+int netif_set_threaded(struct net_device *dev,
+ enum netdev_napi_threaded threaded)
+{
+ struct napi_struct *napi;
+ int i, err = 0;
+
+ netdev_assert_locked_or_invisible(dev);
+
+ if (threaded) {
+ list_for_each_entry(napi, &dev->napi_list, dev_list) {
+ if (!napi->thread) {
+ err = napi_kthread_create(napi);
+ if (err) {
+ threaded = NETDEV_NAPI_THREADED_DISABLED;
+ break;
+ }
}
}
- }
+ }
- /*
- * All done. Write the updated control block back to the caller.
+ WRITE_ONCE(dev->threaded, threaded);
+
+ /* The error should not occur as the kthreads are already created. */
+ list_for_each_entry(napi, &dev->napi_list, dev_list)
+ WARN_ON_ONCE(napi_set_threaded(napi, threaded));
+
+ /* Override the config for all NAPIs even if currently not listed */
+ for (i = 0; i < dev->num_napi_configs; i++)
+ dev->napi_config[i].threaded = threaded;
+
+ return err;
+}
+
+/**
+ * netif_threaded_enable() - enable threaded NAPIs
+ * @dev: net_device instance
+ *
+ * Enable threaded mode for the NAPI instances of the device. This may be useful
+ * for devices where multiple NAPI instances get scheduled by a single
+ * interrupt. Threaded NAPI allows moving the NAPI processing to cores other
+ * than the core where IRQ is mapped.
+ *
+ * This function should be called before @dev is registered.
+ */
+void netif_threaded_enable(struct net_device *dev)
+{
+ WARN_ON_ONCE(netif_set_threaded(dev, NETDEV_NAPI_THREADED_ENABLED));
+}
+EXPORT_SYMBOL(netif_threaded_enable);
+
+/**
+ * netif_queue_set_napi - Associate queue with the napi
+ * @dev: device to which NAPI and queue belong
+ * @queue_index: Index of queue
+ * @type: queue type as RX or TX
+ * @napi: NAPI context, pass NULL to clear previously set NAPI
+ *
+ * Set queue with its corresponding napi context. This should be done after
+ * registering the NAPI handler for the queue-vector and the queues have been
+ * mapped to the corresponding interrupt vector.
+ */
+void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index,
+ enum netdev_queue_type type, struct napi_struct *napi)
+{
+ struct netdev_rx_queue *rxq;
+ struct netdev_queue *txq;
+
+ if (WARN_ON_ONCE(napi && !napi->dev))
+ return;
+ netdev_ops_assert_locked_or_invisible(dev);
+
+ switch (type) {
+ case NETDEV_QUEUE_TYPE_RX:
+ rxq = __netif_get_rx_queue(dev, queue_index);
+ rxq->napi = napi;
+ return;
+ case NETDEV_QUEUE_TYPE_TX:
+ txq = netdev_get_tx_queue(dev, queue_index);
+ txq->napi = napi;
+ return;
+ default:
+ return;
+ }
+}
+EXPORT_SYMBOL(netif_queue_set_napi);
+
+static void
+netif_napi_irq_notify(struct irq_affinity_notify *notify,
+ const cpumask_t *mask)
+{
+ struct napi_struct *napi =
+ container_of(notify, struct napi_struct, notify);
+#ifdef CONFIG_RFS_ACCEL
+ struct cpu_rmap *rmap = napi->dev->rx_cpu_rmap;
+ int err;
+#endif
+
+ if (napi->config && napi->dev->irq_affinity_auto)
+ cpumask_copy(&napi->config->affinity_mask, mask);
+
+#ifdef CONFIG_RFS_ACCEL
+ if (napi->dev->rx_cpu_rmap_auto) {
+ err = cpu_rmap_update(rmap, napi->napi_rmap_idx, mask);
+ if (err)
+ netdev_warn(napi->dev, "RMAP update failed (%d)\n",
+ err);
+ }
+#endif
+}
+
+#ifdef CONFIG_RFS_ACCEL
+static void netif_napi_affinity_release(struct kref *ref)
+{
+ struct napi_struct *napi =
+ container_of(ref, struct napi_struct, notify.kref);
+ struct cpu_rmap *rmap = napi->dev->rx_cpu_rmap;
+
+ netdev_assert_locked(napi->dev);
+ WARN_ON(test_and_clear_bit(NAPI_STATE_HAS_NOTIFIER,
+ &napi->state));
+
+ if (!napi->dev->rx_cpu_rmap_auto)
+ return;
+ rmap->obj[napi->napi_rmap_idx] = NULL;
+ napi->napi_rmap_idx = -1;
+ cpu_rmap_put(rmap);
+}
+
+int netif_enable_cpu_rmap(struct net_device *dev, unsigned int num_irqs)
+{
+ if (dev->rx_cpu_rmap_auto)
+ return 0;
+
+ dev->rx_cpu_rmap = alloc_irq_cpu_rmap(num_irqs);
+ if (!dev->rx_cpu_rmap)
+ return -ENOMEM;
+
+ dev->rx_cpu_rmap_auto = true;
+ return 0;
+}
+EXPORT_SYMBOL(netif_enable_cpu_rmap);
+
+static void netif_del_cpu_rmap(struct net_device *dev)
+{
+ struct cpu_rmap *rmap = dev->rx_cpu_rmap;
+
+ if (!dev->rx_cpu_rmap_auto)
+ return;
+
+ /* Free the rmap */
+ cpu_rmap_put(rmap);
+ dev->rx_cpu_rmap = NULL;
+ dev->rx_cpu_rmap_auto = false;
+}
+
+#else
+static void netif_napi_affinity_release(struct kref *ref)
+{
+}
+
+int netif_enable_cpu_rmap(struct net_device *dev, unsigned int num_irqs)
+{
+ return 0;
+}
+EXPORT_SYMBOL(netif_enable_cpu_rmap);
+
+static void netif_del_cpu_rmap(struct net_device *dev)
+{
+}
+#endif
+
+void netif_set_affinity_auto(struct net_device *dev)
+{
+ unsigned int i, maxqs, numa;
+
+ maxqs = max(dev->num_tx_queues, dev->num_rx_queues);
+ numa = dev_to_node(&dev->dev);
+
+ for (i = 0; i < maxqs; i++)
+ cpumask_set_cpu(cpumask_local_spread(i, numa),
+ &dev->napi_config[i].affinity_mask);
+
+ dev->irq_affinity_auto = true;
+}
+EXPORT_SYMBOL(netif_set_affinity_auto);
+
+void netif_napi_set_irq_locked(struct napi_struct *napi, int irq)
+{
+ int rc;
+
+ netdev_assert_locked_or_invisible(napi->dev);
+
+ if (napi->irq == irq)
+ return;
+
+ /* Remove existing resources */
+ if (test_and_clear_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state))
+ irq_set_affinity_notifier(napi->irq, NULL);
+
+ napi->irq = irq;
+ if (irq < 0 ||
+ (!napi->dev->rx_cpu_rmap_auto && !napi->dev->irq_affinity_auto))
+ return;
+
+ /* Abort for buggy drivers */
+ if (napi->dev->irq_affinity_auto && WARN_ON_ONCE(!napi->config))
+ return;
+
+#ifdef CONFIG_RFS_ACCEL
+ if (napi->dev->rx_cpu_rmap_auto) {
+ rc = cpu_rmap_add(napi->dev->rx_cpu_rmap, napi);
+ if (rc < 0)
+ return;
+
+ cpu_rmap_get(napi->dev->rx_cpu_rmap);
+ napi->napi_rmap_idx = rc;
+ }
+#endif
+
+ /* Use core IRQ notifier */
+ napi->notify.notify = netif_napi_irq_notify;
+ napi->notify.release = netif_napi_affinity_release;
+ rc = irq_set_affinity_notifier(irq, &napi->notify);
+ if (rc) {
+ netdev_warn(napi->dev, "Unable to set IRQ notifier (%d)\n",
+ rc);
+ goto put_rmap;
+ }
+
+ set_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state);
+ return;
+
+put_rmap:
+#ifdef CONFIG_RFS_ACCEL
+ if (napi->dev->rx_cpu_rmap_auto) {
+ napi->dev->rx_cpu_rmap->obj[napi->napi_rmap_idx] = NULL;
+ cpu_rmap_put(napi->dev->rx_cpu_rmap);
+ napi->napi_rmap_idx = -1;
+ }
+#endif
+ napi->notify.notify = NULL;
+ napi->notify.release = NULL;
+}
+EXPORT_SYMBOL(netif_napi_set_irq_locked);
+
+static void napi_restore_config(struct napi_struct *n)
+{
+ n->defer_hard_irqs = n->config->defer_hard_irqs;
+ n->gro_flush_timeout = n->config->gro_flush_timeout;
+ n->irq_suspend_timeout = n->config->irq_suspend_timeout;
+
+ if (n->dev->irq_affinity_auto &&
+ test_bit(NAPI_STATE_HAS_NOTIFIER, &n->state))
+ irq_set_affinity(n->irq, &n->config->affinity_mask);
+
+ /* a NAPI ID might be stored in the config, if so use it. if not, use
+ * napi_hash_add to generate one for us.
*/
- ifc.ifc_len = total;
+ if (n->config->napi_id) {
+ napi_hash_add_with_id(n, n->config->napi_id);
+ } else {
+ napi_hash_add(n);
+ n->config->napi_id = n->napi_id;
+ }
- /*
- * Both BSD and Solaris return 0 here, so we do too.
+ WARN_ON_ONCE(napi_set_threaded(n, n->config->threaded));
+}
+
+static void napi_save_config(struct napi_struct *n)
+{
+ n->config->defer_hard_irqs = n->defer_hard_irqs;
+ n->config->gro_flush_timeout = n->gro_flush_timeout;
+ n->config->irq_suspend_timeout = n->irq_suspend_timeout;
+ napi_hash_del(n);
+}
+
+/* Netlink wants the NAPI list to be sorted by ID, if adding a NAPI which will
+ * inherit an existing ID try to insert it at the right position.
+ */
+static void
+netif_napi_dev_list_add(struct net_device *dev, struct napi_struct *napi)
+{
+ unsigned int new_id, pos_id;
+ struct list_head *higher;
+ struct napi_struct *pos;
+
+ new_id = UINT_MAX;
+ if (napi->config && napi->config->napi_id)
+ new_id = napi->config->napi_id;
+
+ higher = &dev->napi_list;
+ list_for_each_entry(pos, &dev->napi_list, dev_list) {
+ if (napi_id_valid(pos->napi_id))
+ pos_id = pos->napi_id;
+ else if (pos->config)
+ pos_id = pos->config->napi_id;
+ else
+ pos_id = UINT_MAX;
+
+ if (pos_id <= new_id)
+ break;
+ higher = &pos->dev_list;
+ }
+ list_add_rcu(&napi->dev_list, higher); /* adds after higher */
+}
+
+/* Double check that napi_get_frags() allocates skbs with
+ * skb->head being backed by slab, not a page fragment.
+ * This is to make sure bug fixed in 3226b158e67c
+ * ("net: avoid 32 x truesize under-estimation for tiny skbs")
+ * does not accidentally come back.
+ */
+static void napi_get_frags_check(struct napi_struct *napi)
+{
+ struct sk_buff *skb;
+
+ local_bh_disable();
+ skb = napi_get_frags(napi);
+ WARN_ON_ONCE(skb && skb->head_frag);
+ napi_free_frags(napi);
+ local_bh_enable();
+}
+
+void netif_napi_add_weight_locked(struct net_device *dev,
+ struct napi_struct *napi,
+ int (*poll)(struct napi_struct *, int),
+ int weight)
+{
+ netdev_assert_locked(dev);
+ if (WARN_ON(test_and_set_bit(NAPI_STATE_LISTED, &napi->state)))
+ return;
+
+ INIT_LIST_HEAD(&napi->poll_list);
+ INIT_HLIST_NODE(&napi->napi_hash_node);
+ hrtimer_setup(&napi->timer, napi_watchdog, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
+ gro_init(&napi->gro);
+ napi->skb = NULL;
+ napi->poll = poll;
+ if (weight > NAPI_POLL_WEIGHT)
+ netdev_err_once(dev, "%s() called with weight %d\n", __func__,
+ weight);
+ napi->weight = weight;
+ napi->dev = dev;
+#ifdef CONFIG_NETPOLL
+ napi->poll_owner = -1;
+#endif
+ napi->list_owner = -1;
+ set_bit(NAPI_STATE_SCHED, &napi->state);
+ set_bit(NAPI_STATE_NPSVC, &napi->state);
+ netif_napi_dev_list_add(dev, napi);
+
+ /* default settings from sysfs are applied to all NAPIs. any per-NAPI
+ * configuration will be loaded in napi_enable
+ */
+ napi_set_defer_hard_irqs(napi, READ_ONCE(dev->napi_defer_hard_irqs));
+ napi_set_gro_flush_timeout(napi, READ_ONCE(dev->gro_flush_timeout));
+
+ napi_get_frags_check(napi);
+ /* Create kthread for this napi if dev->threaded is set.
+ * Clear dev->threaded if kthread creation failed so that
+ * threaded mode will not be enabled in napi_enable().
*/
- return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
+ if (napi_get_threaded_config(dev, napi))
+ if (napi_kthread_create(napi))
+ dev->threaded = NETDEV_NAPI_THREADED_DISABLED;
+ netif_napi_set_irq_locked(napi, -1);
}
+EXPORT_SYMBOL(netif_napi_add_weight_locked);
-#ifdef CONFIG_PROC_FS
-/*
- * This is invoked by the /proc filesystem handler to display a device
- * in detail.
+void napi_disable_locked(struct napi_struct *n)
+{
+ unsigned long val, new;
+
+ might_sleep();
+ netdev_assert_locked(n->dev);
+
+ set_bit(NAPI_STATE_DISABLE, &n->state);
+
+ val = READ_ONCE(n->state);
+ do {
+ while (val & (NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC)) {
+ usleep_range(20, 200);
+ val = READ_ONCE(n->state);
+ }
+
+ new = val | NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC;
+ new &= ~(NAPIF_STATE_THREADED |
+ NAPIF_STATE_THREADED_BUSY_POLL |
+ NAPIF_STATE_PREFER_BUSY_POLL);
+ } while (!try_cmpxchg(&n->state, &val, new));
+
+ hrtimer_cancel(&n->timer);
+
+ if (n->config)
+ napi_save_config(n);
+ else
+ napi_hash_del(n);
+
+ clear_bit(NAPI_STATE_DISABLE, &n->state);
+}
+EXPORT_SYMBOL(napi_disable_locked);
+
+/**
+ * napi_disable() - prevent NAPI from scheduling
+ * @n: NAPI context
+ *
+ * Stop NAPI from being scheduled on this context.
+ * Waits till any outstanding processing completes.
+ * Takes netdev_lock() for associated net_device.
*/
-static __inline__ struct net_device *dev_get_idx(loff_t pos)
+void napi_disable(struct napi_struct *n)
+{
+ netdev_lock(n->dev);
+ napi_disable_locked(n);
+ netdev_unlock(n->dev);
+}
+EXPORT_SYMBOL(napi_disable);
+
+void napi_enable_locked(struct napi_struct *n)
+{
+ unsigned long new, val = READ_ONCE(n->state);
+
+ if (n->config)
+ napi_restore_config(n);
+ else
+ napi_hash_add(n);
+
+ do {
+ BUG_ON(!test_bit(NAPI_STATE_SCHED, &val));
+
+ new = val & ~(NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC);
+ if (n->dev->threaded && n->thread)
+ new |= NAPIF_STATE_THREADED;
+ } while (!try_cmpxchg(&n->state, &val, new));
+}
+EXPORT_SYMBOL(napi_enable_locked);
+
+/**
+ * napi_enable() - enable NAPI scheduling
+ * @n: NAPI context
+ *
+ * Enable scheduling of a NAPI instance.
+ * Must be paired with napi_disable().
+ * Takes netdev_lock() for associated net_device.
+ */
+void napi_enable(struct napi_struct *n)
+{
+ netdev_lock(n->dev);
+ napi_enable_locked(n);
+ netdev_unlock(n->dev);
+}
+EXPORT_SYMBOL(napi_enable);
+
+/* Must be called in process context */
+void __netif_napi_del_locked(struct napi_struct *napi)
+{
+ netdev_assert_locked(napi->dev);
+
+ if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state))
+ return;
+
+ /* Make sure NAPI is disabled (or was never enabled). */
+ WARN_ON(!test_bit(NAPI_STATE_SCHED, &napi->state));
+
+ if (test_and_clear_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state))
+ irq_set_affinity_notifier(napi->irq, NULL);
+
+ if (napi->config) {
+ napi->index = -1;
+ napi->config = NULL;
+ }
+
+ list_del_rcu(&napi->dev_list);
+ napi_free_frags(napi);
+
+ gro_cleanup(&napi->gro);
+
+ if (napi->thread) {
+ kthread_stop(napi->thread);
+ napi->thread = NULL;
+ }
+}
+EXPORT_SYMBOL(__netif_napi_del_locked);
+
+static int __napi_poll(struct napi_struct *n, bool *repoll)
+{
+ int work, weight;
+
+ weight = n->weight;
+
+ /* This NAPI_STATE_SCHED test is for avoiding a race
+ * with netpoll's poll_napi(). Only the entity which
+ * obtains the lock and sees NAPI_STATE_SCHED set will
+ * actually make the ->poll() call. Therefore we avoid
+ * accidentally calling ->poll() when NAPI is not scheduled.
+ */
+ work = 0;
+ if (napi_is_scheduled(n)) {
+ work = n->poll(n, weight);
+ trace_napi_poll(n, work, weight);
+
+ xdp_do_check_flushed(n);
+ }
+
+ if (unlikely(work > weight))
+ netdev_err_once(n->dev, "NAPI poll function %pS returned %d, exceeding its budget of %d.\n",
+ n->poll, work, weight);
+
+ if (likely(work < weight))
+ return work;
+
+ /* Drivers must not modify the NAPI state if they
+ * consume the entire weight. In such cases this code
+ * still "owns" the NAPI instance and therefore can
+ * move the instance around on the list at-will.
+ */
+ if (unlikely(napi_disable_pending(n))) {
+ napi_complete(n);
+ return work;
+ }
+
+ /* The NAPI context has more processing work, but busy-polling
+ * is preferred. Exit early.
+ */
+ if (napi_prefer_busy_poll(n)) {
+ if (napi_complete_done(n, work)) {
+ /* If timeout is not set, we need to make sure
+ * that the NAPI is re-scheduled.
+ */
+ napi_schedule(n);
+ }
+ return work;
+ }
+
+ /* Flush too old packets. If HZ < 1000, flush all packets */
+ gro_flush_normal(&n->gro, HZ >= 1000);
+
+ /* Some drivers may have called napi_schedule
+ * prior to exhausting their budget.
+ */
+ if (unlikely(!list_empty(&n->poll_list))) {
+ pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
+ n->dev ? n->dev->name : "backlog");
+ return work;
+ }
+
+ *repoll = true;
+
+ return work;
+}
+
+static int napi_poll(struct napi_struct *n, struct list_head *repoll)
{
+ bool do_repoll = false;
+ void *have;
+ int work;
+
+ list_del_init(&n->poll_list);
+
+ have = netpoll_poll_lock(n);
+
+ work = __napi_poll(n, &do_repoll);
+
+ if (do_repoll) {
+#if defined(CONFIG_DEBUG_NET)
+ if (unlikely(!napi_is_scheduled(n)))
+ pr_crit("repoll requested for device %s %ps but napi is not scheduled.\n",
+ n->dev->name, n->poll);
+#endif
+ list_add_tail(&n->poll_list, repoll);
+ }
+ netpoll_poll_unlock(have);
+
+ return work;
+}
+
+static int napi_thread_wait(struct napi_struct *napi)
+{
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ while (!kthread_should_stop()) {
+ /* Testing SCHED_THREADED bit here to make sure the current
+ * kthread owns this napi and could poll on this napi.
+ * Testing SCHED bit is not enough because SCHED bit might be
+ * set by some other busy poll thread or by napi_disable().
+ */
+ if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state)) {
+ WARN_ON(!list_empty(&napi->poll_list));
+ __set_current_state(TASK_RUNNING);
+ return 0;
+ }
+
+ schedule();
+ set_current_state(TASK_INTERRUPTIBLE);
+ }
+ __set_current_state(TASK_RUNNING);
+
+ return -1;
+}
+
+static void napi_threaded_poll_loop(struct napi_struct *napi, bool busy_poll)
+{
+ struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
+ struct softnet_data *sd;
+ unsigned long last_qs = jiffies;
+
+ for (;;) {
+ bool repoll = false;
+ void *have;
+
+ local_bh_disable();
+ bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
+
+ sd = this_cpu_ptr(&softnet_data);
+ sd->in_napi_threaded_poll = true;
+
+ have = netpoll_poll_lock(napi);
+ __napi_poll(napi, &repoll);
+ netpoll_poll_unlock(have);
+
+ sd->in_napi_threaded_poll = false;
+ barrier();
+
+ if (sd_has_rps_ipi_waiting(sd)) {
+ local_irq_disable();
+ net_rps_action_and_irq_enable(sd);
+ }
+ skb_defer_free_flush();
+ bpf_net_ctx_clear(bpf_net_ctx);
+
+ /* When busy poll is enabled, the old packets are not flushed in
+ * napi_complete_done. So flush them here.
+ */
+ if (busy_poll)
+ gro_flush_normal(&napi->gro, HZ >= 1000);
+ local_bh_enable();
+
+ /* Call cond_resched here to avoid watchdog warnings. */
+ if (repoll || busy_poll) {
+ rcu_softirq_qs_periodic(last_qs);
+ cond_resched();
+ }
+
+ if (!repoll)
+ break;
+ }
+}
+
+static int napi_threaded_poll(void *data)
+{
+ struct napi_struct *napi = data;
+ bool want_busy_poll;
+ bool in_busy_poll;
+ unsigned long val;
+
+ while (!napi_thread_wait(napi)) {
+ val = READ_ONCE(napi->state);
+
+ want_busy_poll = val & NAPIF_STATE_THREADED_BUSY_POLL;
+ in_busy_poll = val & NAPIF_STATE_IN_BUSY_POLL;
+
+ if (unlikely(val & NAPIF_STATE_DISABLE))
+ want_busy_poll = false;
+
+ if (want_busy_poll != in_busy_poll)
+ assign_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state,
+ want_busy_poll);
+
+ napi_threaded_poll_loop(napi, want_busy_poll);
+ }
+
+ return 0;
+}
+
+static __latent_entropy void net_rx_action(void)
+{
+ struct softnet_data *sd = this_cpu_ptr(&softnet_data);
+ unsigned long time_limit = jiffies +
+ usecs_to_jiffies(READ_ONCE(net_hotdata.netdev_budget_usecs));
+ struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
+ int budget = READ_ONCE(net_hotdata.netdev_budget);
+ LIST_HEAD(list);
+ LIST_HEAD(repoll);
+
+ bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
+start:
+ sd->in_net_rx_action = true;
+ local_irq_disable();
+ list_splice_init(&sd->poll_list, &list);
+ local_irq_enable();
+
+ for (;;) {
+ struct napi_struct *n;
+
+ skb_defer_free_flush();
+
+ if (list_empty(&list)) {
+ if (list_empty(&repoll)) {
+ sd->in_net_rx_action = false;
+ barrier();
+ /* We need to check if ____napi_schedule()
+ * had refilled poll_list while
+ * sd->in_net_rx_action was true.
+ */
+ if (!list_empty(&sd->poll_list))
+ goto start;
+ if (!sd_has_rps_ipi_waiting(sd))
+ goto end;
+ }
+ break;
+ }
+
+ n = list_first_entry(&list, struct napi_struct, poll_list);
+ budget -= napi_poll(n, &repoll);
+
+ /* If softirq window is exhausted then punt.
+ * Allow this to run for 2 jiffies since which will allow
+ * an average latency of 1.5/HZ.
+ */
+ if (unlikely(budget <= 0 ||
+ time_after_eq(jiffies, time_limit))) {
+ /* Pairs with READ_ONCE() in softnet_seq_show() */
+ WRITE_ONCE(sd->time_squeeze, sd->time_squeeze + 1);
+ break;
+ }
+ }
+
+ local_irq_disable();
+
+ list_splice_tail_init(&sd->poll_list, &list);
+ list_splice_tail(&repoll, &list);
+ list_splice(&list, &sd->poll_list);
+ if (!list_empty(&sd->poll_list))
+ __raise_softirq_irqoff(NET_RX_SOFTIRQ);
+ else
+ sd->in_net_rx_action = false;
+
+ net_rps_action_and_irq_enable(sd);
+end:
+ bpf_net_ctx_clear(bpf_net_ctx);
+}
+
+struct netdev_adjacent {
struct net_device *dev;
- loff_t i;
+ netdevice_tracker dev_tracker;
+
+ /* upper master flag, there can only be one master device per list */
+ bool master;
+
+ /* lookup ignore flag */
+ bool ignore;
+
+ /* counter for the number of times this device was added to us */
+ u16 ref_nr;
- for (i = 0, dev = dev_base; dev && i < pos; ++i, dev = dev->next);
+ /* private field for the users */
+ void *private;
- return i == pos ? dev : NULL;
+ struct list_head list;
+ struct rcu_head rcu;
+};
+
+static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
+ struct list_head *adj_list)
+{
+ struct netdev_adjacent *adj;
+
+ list_for_each_entry(adj, adj_list, list) {
+ if (adj->dev == adj_dev)
+ return adj;
+ }
+ return NULL;
}
-void *dev_seq_start(struct seq_file *seq, loff_t *pos)
+static int ____netdev_has_upper_dev(struct net_device *upper_dev,
+ struct netdev_nested_priv *priv)
{
- read_lock(&dev_base_lock);
- return *pos ? dev_get_idx(*pos - 1) : SEQ_START_TOKEN;
+ struct net_device *dev = (struct net_device *)priv->data;
+
+ return upper_dev == dev;
}
-void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+/**
+ * netdev_has_upper_dev - Check if device is linked to an upper device
+ * @dev: device
+ * @upper_dev: upper device to check
+ *
+ * Find out if a device is linked to specified upper device and return true
+ * in case it is. Note that this checks only immediate upper device,
+ * not through a complete stack of devices. The caller must hold the RTNL lock.
+ */
+bool netdev_has_upper_dev(struct net_device *dev,
+ struct net_device *upper_dev)
{
- ++*pos;
- return v == SEQ_START_TOKEN ? dev_base : ((struct net_device *)v)->next;
+ struct netdev_nested_priv priv = {
+ .data = (void *)upper_dev,
+ };
+
+ ASSERT_RTNL();
+
+ return netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
+ &priv);
}
+EXPORT_SYMBOL(netdev_has_upper_dev);
+
+/**
+ * netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device
+ * @dev: device
+ * @upper_dev: upper device to check
+ *
+ * Find out if a device is linked to specified upper device and return true
+ * in case it is. Note that this checks the entire upper device chain.
+ * The caller must hold rcu lock.
+ */
-void dev_seq_stop(struct seq_file *seq, void *v)
+bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
+ struct net_device *upper_dev)
{
- read_unlock(&dev_base_lock);
+ struct netdev_nested_priv priv = {
+ .data = (void *)upper_dev,
+ };
+
+ return !!netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
+ &priv);
}
+EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
-static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
+/**
+ * netdev_has_any_upper_dev - Check if device is linked to some device
+ * @dev: device
+ *
+ * Find out if a device is linked to an upper device and return true in case
+ * it is. The caller must hold the RTNL lock.
+ */
+bool netdev_has_any_upper_dev(struct net_device *dev)
{
- if (dev->get_stats) {
- struct net_device_stats *stats = dev->get_stats(dev);
+ ASSERT_RTNL();
- seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
- "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
- dev->name, stats->rx_bytes, stats->rx_packets,
- stats->rx_errors,
- stats->rx_dropped + stats->rx_missed_errors,
- stats->rx_fifo_errors,
- stats->rx_length_errors + stats->rx_over_errors +
- stats->rx_crc_errors + stats->rx_frame_errors,
- stats->rx_compressed, stats->multicast,
- stats->tx_bytes, stats->tx_packets,
- stats->tx_errors, stats->tx_dropped,
- stats->tx_fifo_errors, stats->collisions,
- stats->tx_carrier_errors +
- stats->tx_aborted_errors +
- stats->tx_window_errors +
- stats->tx_heartbeat_errors,
- stats->tx_compressed);
- } else
- seq_printf(seq, "%6s: No statistics available.\n", dev->name);
+ return !list_empty(&dev->adj_list.upper);
}
+EXPORT_SYMBOL(netdev_has_any_upper_dev);
-/*
- * Called from the PROCfs module. This now uses the new arbitrary sized
- * /proc/net interface to create /proc/net/dev
+/**
+ * netdev_master_upper_dev_get - Get master upper device
+ * @dev: device
+ *
+ * Find a master upper device and return pointer to it or NULL in case
+ * it's not there. The caller must hold the RTNL lock.
*/
-static int dev_seq_show(struct seq_file *seq, void *v)
+struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
{
- if (v == SEQ_START_TOKEN)
- seq_puts(seq, "Inter-| Receive "
- " | Transmit\n"
- " face |bytes packets errs drop fifo frame "
- "compressed multicast|bytes packets errs "
- "drop fifo colls carrier compressed\n");
- else
- dev_seq_printf_stats(seq, v);
+ struct netdev_adjacent *upper;
+
+ ASSERT_RTNL();
+
+ if (list_empty(&dev->adj_list.upper))
+ return NULL;
+
+ upper = list_first_entry(&dev->adj_list.upper,
+ struct netdev_adjacent, list);
+ if (likely(upper->master))
+ return upper->dev;
+ return NULL;
+}
+EXPORT_SYMBOL(netdev_master_upper_dev_get);
+
+static struct net_device *__netdev_master_upper_dev_get(struct net_device *dev)
+{
+ struct netdev_adjacent *upper;
+
+ ASSERT_RTNL();
+
+ if (list_empty(&dev->adj_list.upper))
+ return NULL;
+
+ upper = list_first_entry(&dev->adj_list.upper,
+ struct netdev_adjacent, list);
+ if (likely(upper->master) && !upper->ignore)
+ return upper->dev;
+ return NULL;
+}
+
+/**
+ * netdev_has_any_lower_dev - Check if device is linked to some device
+ * @dev: device
+ *
+ * Find out if a device is linked to a lower device and return true in case
+ * it is. The caller must hold the RTNL lock.
+ */
+static bool netdev_has_any_lower_dev(struct net_device *dev)
+{
+ ASSERT_RTNL();
+
+ return !list_empty(&dev->adj_list.lower);
+}
+
+void *netdev_adjacent_get_private(struct list_head *adj_list)
+{
+ struct netdev_adjacent *adj;
+
+ adj = list_entry(adj_list, struct netdev_adjacent, list);
+
+ return adj->private;
+}
+EXPORT_SYMBOL(netdev_adjacent_get_private);
+
+/**
+ * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
+ * @dev: device
+ * @iter: list_head ** of the current position
+ *
+ * Gets the next device from the dev's upper list, starting from iter
+ * position. The caller must hold RCU read lock.
+ */
+struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
+ struct list_head **iter)
+{
+ struct netdev_adjacent *upper;
+
+ WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
+
+ upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
+
+ if (&upper->list == &dev->adj_list.upper)
+ return NULL;
+
+ *iter = &upper->list;
+
+ return upper->dev;
+}
+EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
+
+static struct net_device *__netdev_next_upper_dev(struct net_device *dev,
+ struct list_head **iter,
+ bool *ignore)
+{
+ struct netdev_adjacent *upper;
+
+ upper = list_entry((*iter)->next, struct netdev_adjacent, list);
+
+ if (&upper->list == &dev->adj_list.upper)
+ return NULL;
+
+ *iter = &upper->list;
+ *ignore = upper->ignore;
+
+ return upper->dev;
+}
+
+static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
+ struct list_head **iter)
+{
+ struct netdev_adjacent *upper;
+
+ WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
+
+ upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
+
+ if (&upper->list == &dev->adj_list.upper)
+ return NULL;
+
+ *iter = &upper->list;
+
+ return upper->dev;
+}
+
+static int __netdev_walk_all_upper_dev(struct net_device *dev,
+ int (*fn)(struct net_device *dev,
+ struct netdev_nested_priv *priv),
+ struct netdev_nested_priv *priv)
+{
+ struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
+ struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
+ int ret, cur = 0;
+ bool ignore;
+
+ now = dev;
+ iter = &dev->adj_list.upper;
+
+ while (1) {
+ if (now != dev) {
+ ret = fn(now, priv);
+ if (ret)
+ return ret;
+ }
+
+ next = NULL;
+ while (1) {
+ udev = __netdev_next_upper_dev(now, &iter, &ignore);
+ if (!udev)
+ break;
+ if (ignore)
+ continue;
+
+ next = udev;
+ niter = &udev->adj_list.upper;
+ dev_stack[cur] = now;
+ iter_stack[cur++] = iter;
+ break;
+ }
+
+ if (!next) {
+ if (!cur)
+ return 0;
+ next = dev_stack[--cur];
+ niter = iter_stack[cur];
+ }
+
+ now = next;
+ iter = niter;
+ }
+
return 0;
}
-static struct netif_rx_stats *softnet_get_online(loff_t *pos)
+int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
+ int (*fn)(struct net_device *dev,
+ struct netdev_nested_priv *priv),
+ struct netdev_nested_priv *priv)
{
- struct netif_rx_stats *rc = NULL;
+ struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
+ struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
+ int ret, cur = 0;
+
+ now = dev;
+ iter = &dev->adj_list.upper;
+
+ while (1) {
+ if (now != dev) {
+ ret = fn(now, priv);
+ if (ret)
+ return ret;
+ }
- while (*pos < NR_CPUS)
- if (cpu_online(*pos)) {
- rc = &per_cpu(netdev_rx_stat, *pos);
+ next = NULL;
+ while (1) {
+ udev = netdev_next_upper_dev_rcu(now, &iter);
+ if (!udev)
+ break;
+
+ next = udev;
+ niter = &udev->adj_list.upper;
+ dev_stack[cur] = now;
+ iter_stack[cur++] = iter;
break;
- } else
- ++*pos;
- return rc;
+ }
+
+ if (!next) {
+ if (!cur)
+ return 0;
+ next = dev_stack[--cur];
+ niter = iter_stack[cur];
+ }
+
+ now = next;
+ iter = niter;
+ }
+
+ return 0;
}
+EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
-static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
+static bool __netdev_has_upper_dev(struct net_device *dev,
+ struct net_device *upper_dev)
{
- return softnet_get_online(pos);
+ struct netdev_nested_priv priv = {
+ .flags = 0,
+ .data = (void *)upper_dev,
+ };
+
+ ASSERT_RTNL();
+
+ return __netdev_walk_all_upper_dev(dev, ____netdev_has_upper_dev,
+ &priv);
}
-static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+/**
+ * netdev_lower_get_next_private - Get the next ->private from the
+ * lower neighbour list
+ * @dev: device
+ * @iter: list_head ** of the current position
+ *
+ * Gets the next netdev_adjacent->private from the dev's lower neighbour
+ * list, starting from iter position. The caller must hold either hold the
+ * RTNL lock or its own locking that guarantees that the neighbour lower
+ * list will remain unchanged.
+ */
+void *netdev_lower_get_next_private(struct net_device *dev,
+ struct list_head **iter)
{
- ++*pos;
- return softnet_get_online(pos);
+ struct netdev_adjacent *lower;
+
+ lower = list_entry(*iter, struct netdev_adjacent, list);
+
+ if (&lower->list == &dev->adj_list.lower)
+ return NULL;
+
+ *iter = lower->list.next;
+
+ return lower->private;
}
+EXPORT_SYMBOL(netdev_lower_get_next_private);
-static void softnet_seq_stop(struct seq_file *seq, void *v)
+/**
+ * netdev_lower_get_next_private_rcu - Get the next ->private from the
+ * lower neighbour list, RCU
+ * variant
+ * @dev: device
+ * @iter: list_head ** of the current position
+ *
+ * Gets the next netdev_adjacent->private from the dev's lower neighbour
+ * list, starting from iter position. The caller must hold RCU read lock.
+ */
+void *netdev_lower_get_next_private_rcu(struct net_device *dev,
+ struct list_head **iter)
{
+ struct netdev_adjacent *lower;
+
+ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
+
+ lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
+
+ if (&lower->list == &dev->adj_list.lower)
+ return NULL;
+
+ *iter = &lower->list;
+
+ return lower->private;
}
+EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
-static int softnet_seq_show(struct seq_file *seq, void *v)
+/**
+ * netdev_lower_get_next - Get the next device from the lower neighbour
+ * list
+ * @dev: device
+ * @iter: list_head ** of the current position
+ *
+ * Gets the next netdev_adjacent from the dev's lower neighbour
+ * list, starting from iter position. The caller must hold RTNL lock or
+ * its own locking that guarantees that the neighbour lower
+ * list will remain unchanged.
+ */
+void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
{
- struct netif_rx_stats *s = v;
+ struct netdev_adjacent *lower;
+
+ lower = list_entry(*iter, struct netdev_adjacent, list);
+
+ if (&lower->list == &dev->adj_list.lower)
+ return NULL;
+
+ *iter = lower->list.next;
+
+ return lower->dev;
+}
+EXPORT_SYMBOL(netdev_lower_get_next);
+
+static struct net_device *netdev_next_lower_dev(struct net_device *dev,
+ struct list_head **iter)
+{
+ struct netdev_adjacent *lower;
+
+ lower = list_entry((*iter)->next, struct netdev_adjacent, list);
+
+ if (&lower->list == &dev->adj_list.lower)
+ return NULL;
+
+ *iter = &lower->list;
+
+ return lower->dev;
+}
+
+static struct net_device *__netdev_next_lower_dev(struct net_device *dev,
+ struct list_head **iter,
+ bool *ignore)
+{
+ struct netdev_adjacent *lower;
+
+ lower = list_entry((*iter)->next, struct netdev_adjacent, list);
+
+ if (&lower->list == &dev->adj_list.lower)
+ return NULL;
+
+ *iter = &lower->list;
+ *ignore = lower->ignore;
+
+ return lower->dev;
+}
+
+int netdev_walk_all_lower_dev(struct net_device *dev,
+ int (*fn)(struct net_device *dev,
+ struct netdev_nested_priv *priv),
+ struct netdev_nested_priv *priv)
+{
+ struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
+ struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
+ int ret, cur = 0;
+
+ now = dev;
+ iter = &dev->adj_list.lower;
+
+ while (1) {
+ if (now != dev) {
+ ret = fn(now, priv);
+ if (ret)
+ return ret;
+ }
+
+ next = NULL;
+ while (1) {
+ ldev = netdev_next_lower_dev(now, &iter);
+ if (!ldev)
+ break;
+
+ next = ldev;
+ niter = &ldev->adj_list.lower;
+ dev_stack[cur] = now;
+ iter_stack[cur++] = iter;
+ break;
+ }
+
+ if (!next) {
+ if (!cur)
+ return 0;
+ next = dev_stack[--cur];
+ niter = iter_stack[cur];
+ }
+
+ now = next;
+ iter = niter;
+ }
- seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
- s->total, s->dropped, s->time_squeeze, 0,
- 0, 0, 0, 0, /* was fastroute */
- s->cpu_collision );
return 0;
}
+EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
-static struct seq_operations dev_seq_ops = {
- .start = dev_seq_start,
- .next = dev_seq_next,
- .stop = dev_seq_stop,
- .show = dev_seq_show,
-};
+static int __netdev_walk_all_lower_dev(struct net_device *dev,
+ int (*fn)(struct net_device *dev,
+ struct netdev_nested_priv *priv),
+ struct netdev_nested_priv *priv)
+{
+ struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
+ struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
+ int ret, cur = 0;
+ bool ignore;
+
+ now = dev;
+ iter = &dev->adj_list.lower;
+
+ while (1) {
+ if (now != dev) {
+ ret = fn(now, priv);
+ if (ret)
+ return ret;
+ }
+
+ next = NULL;
+ while (1) {
+ ldev = __netdev_next_lower_dev(now, &iter, &ignore);
+ if (!ldev)
+ break;
+ if (ignore)
+ continue;
+
+ next = ldev;
+ niter = &ldev->adj_list.lower;
+ dev_stack[cur] = now;
+ iter_stack[cur++] = iter;
+ break;
+ }
+
+ if (!next) {
+ if (!cur)
+ return 0;
+ next = dev_stack[--cur];
+ niter = iter_stack[cur];
+ }
+
+ now = next;
+ iter = niter;
+ }
+
+ return 0;
+}
+
+struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
+ struct list_head **iter)
+{
+ struct netdev_adjacent *lower;
-static int dev_seq_open(struct inode *inode, struct file *file)
+ lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
+ if (&lower->list == &dev->adj_list.lower)
+ return NULL;
+
+ *iter = &lower->list;
+
+ return lower->dev;
+}
+EXPORT_SYMBOL(netdev_next_lower_dev_rcu);
+
+static u8 __netdev_upper_depth(struct net_device *dev)
{
- return seq_open(file, &dev_seq_ops);
+ struct net_device *udev;
+ struct list_head *iter;
+ u8 max_depth = 0;
+ bool ignore;
+
+ for (iter = &dev->adj_list.upper,
+ udev = __netdev_next_upper_dev(dev, &iter, &ignore);
+ udev;
+ udev = __netdev_next_upper_dev(dev, &iter, &ignore)) {
+ if (ignore)
+ continue;
+ if (max_depth < udev->upper_level)
+ max_depth = udev->upper_level;
+ }
+
+ return max_depth;
}
-static struct file_operations dev_seq_fops = {
- .owner = THIS_MODULE,
- .open = dev_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
+static u8 __netdev_lower_depth(struct net_device *dev)
+{
+ struct net_device *ldev;
+ struct list_head *iter;
+ u8 max_depth = 0;
+ bool ignore;
+
+ for (iter = &dev->adj_list.lower,
+ ldev = __netdev_next_lower_dev(dev, &iter, &ignore);
+ ldev;
+ ldev = __netdev_next_lower_dev(dev, &iter, &ignore)) {
+ if (ignore)
+ continue;
+ if (max_depth < ldev->lower_level)
+ max_depth = ldev->lower_level;
+ }
-static struct seq_operations softnet_seq_ops = {
- .start = softnet_seq_start,
- .next = softnet_seq_next,
- .stop = softnet_seq_stop,
- .show = softnet_seq_show,
-};
+ return max_depth;
+}
-static int softnet_seq_open(struct inode *inode, struct file *file)
+static int __netdev_update_upper_level(struct net_device *dev,
+ struct netdev_nested_priv *__unused)
{
- return seq_open(file, &softnet_seq_ops);
+ dev->upper_level = __netdev_upper_depth(dev) + 1;
+ return 0;
}
-static struct file_operations softnet_seq_fops = {
- .owner = THIS_MODULE,
- .open = softnet_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
+#ifdef CONFIG_LOCKDEP
+static LIST_HEAD(net_unlink_list);
-#ifdef CONFIG_WIRELESS_EXT
-extern int wireless_proc_init(void);
-#else
-#define wireless_proc_init() 0
+static void net_unlink_todo(struct net_device *dev)
+{
+ if (list_empty(&dev->unlink_list))
+ list_add_tail(&dev->unlink_list, &net_unlink_list);
+}
#endif
-static int __init dev_proc_init(void)
+static int __netdev_update_lower_level(struct net_device *dev,
+ struct netdev_nested_priv *priv)
{
- int rc = -ENOMEM;
+ dev->lower_level = __netdev_lower_depth(dev) + 1;
- if (!proc_net_fops_create("dev", S_IRUGO, &dev_seq_fops))
- goto out;
- if (!proc_net_fops_create("softnet_stat", S_IRUGO, &softnet_seq_fops))
- goto out_dev;
- if (wireless_proc_init())
- goto out_softnet;
- rc = 0;
-out:
- return rc;
-out_softnet:
- proc_net_remove("softnet_stat");
-out_dev:
- proc_net_remove("dev");
- goto out;
+#ifdef CONFIG_LOCKDEP
+ if (!priv)
+ return 0;
+
+ if (priv->flags & NESTED_SYNC_IMM)
+ dev->nested_level = dev->lower_level - 1;
+ if (priv->flags & NESTED_SYNC_TODO)
+ net_unlink_todo(dev);
+#endif
+ return 0;
}
-#else
-#define dev_proc_init() 0
-#endif /* CONFIG_PROC_FS */
+int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
+ int (*fn)(struct net_device *dev,
+ struct netdev_nested_priv *priv),
+ struct netdev_nested_priv *priv)
+{
+ struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
+ struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
+ int ret, cur = 0;
+
+ now = dev;
+ iter = &dev->adj_list.lower;
+
+ while (1) {
+ if (now != dev) {
+ ret = fn(now, priv);
+ if (ret)
+ return ret;
+ }
+
+ next = NULL;
+ while (1) {
+ ldev = netdev_next_lower_dev_rcu(now, &iter);
+ if (!ldev)
+ break;
+
+ next = ldev;
+ niter = &ldev->adj_list.lower;
+ dev_stack[cur] = now;
+ iter_stack[cur++] = iter;
+ break;
+ }
+
+ if (!next) {
+ if (!cur)
+ return 0;
+ next = dev_stack[--cur];
+ niter = iter_stack[cur];
+ }
+
+ now = next;
+ iter = niter;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);
/**
- * netdev_set_master - set up master/slave pair
- * @slave: slave device
- * @master: new master device
+ * netdev_lower_get_first_private_rcu - Get the first ->private from the
+ * lower neighbour list, RCU
+ * variant
+ * @dev: device
*
- * Changes the master device of the slave. Pass %NULL to break the
- * bonding. The caller must hold the RTNL semaphore. On a failure
- * a negative errno code is returned. On success the reference counts
- * are adjusted, %RTM_NEWLINK is sent to the routing socket and the
- * function returns zero.
+ * Gets the first netdev_adjacent->private from the dev's lower neighbour
+ * list. The caller must hold RCU read lock.
*/
-int netdev_set_master(struct net_device *slave, struct net_device *master)
+void *netdev_lower_get_first_private_rcu(struct net_device *dev)
{
- struct net_device *old = slave->master;
+ struct netdev_adjacent *lower;
- ASSERT_RTNL();
+ lower = list_first_or_null_rcu(&dev->adj_list.lower,
+ struct netdev_adjacent, list);
+ if (lower)
+ return lower->private;
+ return NULL;
+}
+EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
+/**
+ * netdev_master_upper_dev_get_rcu - Get master upper device
+ * @dev: device
+ *
+ * Find a master upper device and return pointer to it or NULL in case
+ * it's not there. The caller must hold the RCU read lock.
+ */
+struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
+{
+ struct netdev_adjacent *upper;
+
+ upper = list_first_or_null_rcu(&dev->adj_list.upper,
+ struct netdev_adjacent, list);
+ if (upper && likely(upper->master))
+ return upper->dev;
+ return NULL;
+}
+EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
+
+static int netdev_adjacent_sysfs_add(struct net_device *dev,
+ struct net_device *adj_dev,
+ struct list_head *dev_list)
+{
+ char linkname[IFNAMSIZ+7];
+
+ sprintf(linkname, dev_list == &dev->adj_list.upper ?
+ "upper_%s" : "lower_%s", adj_dev->name);
+ return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
+ linkname);
+}
+static void netdev_adjacent_sysfs_del(struct net_device *dev,
+ char *name,
+ struct list_head *dev_list)
+{
+ char linkname[IFNAMSIZ+7];
+
+ sprintf(linkname, dev_list == &dev->adj_list.upper ?
+ "upper_%s" : "lower_%s", name);
+ sysfs_remove_link(&(dev->dev.kobj), linkname);
+}
+
+static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
+ struct net_device *adj_dev,
+ struct list_head *dev_list)
+{
+ return (dev_list == &dev->adj_list.upper ||
+ dev_list == &dev->adj_list.lower) &&
+ net_eq(dev_net(dev), dev_net(adj_dev));
+}
+
+static int __netdev_adjacent_dev_insert(struct net_device *dev,
+ struct net_device *adj_dev,
+ struct list_head *dev_list,
+ void *private, bool master)
+{
+ struct netdev_adjacent *adj;
+ int ret;
+
+ adj = __netdev_find_adj(adj_dev, dev_list);
+
+ if (adj) {
+ adj->ref_nr += 1;
+ pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n",
+ dev->name, adj_dev->name, adj->ref_nr);
+
+ return 0;
+ }
+
+ adj = kmalloc(sizeof(*adj), GFP_KERNEL);
+ if (!adj)
+ return -ENOMEM;
+
+ adj->dev = adj_dev;
+ adj->master = master;
+ adj->ref_nr = 1;
+ adj->private = private;
+ adj->ignore = false;
+ netdev_hold(adj_dev, &adj->dev_tracker, GFP_KERNEL);
+
+ pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
+ dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);
+
+ if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
+ ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
+ if (ret)
+ goto free_adj;
+ }
+
+ /* Ensure that master link is always the first item in list. */
if (master) {
- if (old)
- return -EBUSY;
- dev_hold(master);
+ ret = sysfs_create_link(&(dev->dev.kobj),
+ &(adj_dev->dev.kobj), "master");
+ if (ret)
+ goto remove_symlinks;
+
+ list_add_rcu(&adj->list, dev_list);
+ } else {
+ list_add_tail_rcu(&adj->list, dev_list);
}
- slave->master = master;
-
- synchronize_net();
+ return 0;
- if (old)
- dev_put(old);
+remove_symlinks:
+ if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
+ netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
+free_adj:
+ netdev_put(adj_dev, &adj->dev_tracker);
+ kfree(adj);
- if (master)
- slave->flags |= IFF_SLAVE;
- else
- slave->flags &= ~IFF_SLAVE;
+ return ret;
+}
+
+static void __netdev_adjacent_dev_remove(struct net_device *dev,
+ struct net_device *adj_dev,
+ u16 ref_nr,
+ struct list_head *dev_list)
+{
+ struct netdev_adjacent *adj;
+
+ pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n",
+ dev->name, adj_dev->name, ref_nr);
+
+ adj = __netdev_find_adj(adj_dev, dev_list);
+
+ if (!adj) {
+ pr_err("Adjacency does not exist for device %s from %s\n",
+ dev->name, adj_dev->name);
+ WARN_ON(1);
+ return;
+ }
+
+ if (adj->ref_nr > ref_nr) {
+ pr_debug("adjacency: %s to %s ref_nr - %d = %d\n",
+ dev->name, adj_dev->name, ref_nr,
+ adj->ref_nr - ref_nr);
+ adj->ref_nr -= ref_nr;
+ return;
+ }
+
+ if (adj->master)
+ sysfs_remove_link(&(dev->dev.kobj), "master");
+
+ if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
+ netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
+
+ list_del_rcu(&adj->list);
+ pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
+ adj_dev->name, dev->name, adj_dev->name);
+ netdev_put(adj_dev, &adj->dev_tracker);
+ kfree_rcu(adj, rcu);
+}
+
+static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
+ struct net_device *upper_dev,
+ struct list_head *up_list,
+ struct list_head *down_list,
+ void *private, bool master)
+{
+ int ret;
+
+ ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list,
+ private, master);
+ if (ret)
+ return ret;
+
+ ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list,
+ private, false);
+ if (ret) {
+ __netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list);
+ return ret;
+ }
- rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
return 0;
}
+static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
+ struct net_device *upper_dev,
+ u16 ref_nr,
+ struct list_head *up_list,
+ struct list_head *down_list)
+{
+ __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
+ __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
+}
+
+static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
+ struct net_device *upper_dev,
+ void *private, bool master)
+{
+ return __netdev_adjacent_dev_link_lists(dev, upper_dev,
+ &dev->adj_list.upper,
+ &upper_dev->adj_list.lower,
+ private, master);
+}
+
+static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
+ struct net_device *upper_dev)
+{
+ __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
+ &dev->adj_list.upper,
+ &upper_dev->adj_list.lower);
+}
+
+static int __netdev_upper_dev_link(struct net_device *dev,
+ struct net_device *upper_dev, bool master,
+ void *upper_priv, void *upper_info,
+ struct netdev_nested_priv *priv,
+ struct netlink_ext_ack *extack)
+{
+ struct netdev_notifier_changeupper_info changeupper_info = {
+ .info = {
+ .dev = dev,
+ .extack = extack,
+ },
+ .upper_dev = upper_dev,
+ .master = master,
+ .linking = true,
+ .upper_info = upper_info,
+ };
+ struct net_device *master_dev;
+ int ret = 0;
+
+ ASSERT_RTNL();
+
+ if (dev == upper_dev)
+ return -EBUSY;
+
+ /* To prevent loops, check if dev is not upper device to upper_dev. */
+ if (__netdev_has_upper_dev(upper_dev, dev))
+ return -EBUSY;
+
+ if ((dev->lower_level + upper_dev->upper_level) > MAX_NEST_DEV)
+ return -EMLINK;
+
+ if (!master) {
+ if (__netdev_has_upper_dev(dev, upper_dev))
+ return -EEXIST;
+ } else {
+ master_dev = __netdev_master_upper_dev_get(dev);
+ if (master_dev)
+ return master_dev == upper_dev ? -EEXIST : -EBUSY;
+ }
+
+ ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
+ &changeupper_info.info);
+ ret = notifier_to_errno(ret);
+ if (ret)
+ return ret;
+
+ ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
+ master);
+ if (ret)
+ return ret;
+
+ ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
+ &changeupper_info.info);
+ ret = notifier_to_errno(ret);
+ if (ret)
+ goto rollback;
+
+ __netdev_update_upper_level(dev, NULL);
+ __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
+
+ __netdev_update_lower_level(upper_dev, priv);
+ __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
+ priv);
+
+ return 0;
+
+rollback:
+ __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
+
+ return ret;
+}
+
/**
- * dev_set_promiscuity - update promiscuity count on a device
- * @dev: device
- * @inc: modifier
+ * netdev_upper_dev_link - Add a link to the upper device
+ * @dev: device
+ * @upper_dev: new upper device
+ * @extack: netlink extended ack
+ *
+ * Adds a link to device which is upper to this one. The caller must hold
+ * the RTNL lock. On a failure a negative errno code is returned.
+ * On success the reference counts are adjusted and the function
+ * returns zero.
+ */
+int netdev_upper_dev_link(struct net_device *dev,
+ struct net_device *upper_dev,
+ struct netlink_ext_ack *extack)
+{
+ struct netdev_nested_priv priv = {
+ .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
+ .data = NULL,
+ };
+
+ return __netdev_upper_dev_link(dev, upper_dev, false,
+ NULL, NULL, &priv, extack);
+}
+EXPORT_SYMBOL(netdev_upper_dev_link);
+
+/**
+ * netdev_master_upper_dev_link - Add a master link to the upper device
+ * @dev: device
+ * @upper_dev: new upper device
+ * @upper_priv: upper device private
+ * @upper_info: upper info to be passed down via notifier
+ * @extack: netlink extended ack
+ *
+ * Adds a link to device which is upper to this one. In this case, only
+ * one master upper device can be linked, although other non-master devices
+ * might be linked as well. The caller must hold the RTNL lock.
+ * On a failure a negative errno code is returned. On success the reference
+ * counts are adjusted and the function returns zero.
+ */
+int netdev_master_upper_dev_link(struct net_device *dev,
+ struct net_device *upper_dev,
+ void *upper_priv, void *upper_info,
+ struct netlink_ext_ack *extack)
+{
+ struct netdev_nested_priv priv = {
+ .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
+ .data = NULL,
+ };
+
+ return __netdev_upper_dev_link(dev, upper_dev, true,
+ upper_priv, upper_info, &priv, extack);
+}
+EXPORT_SYMBOL(netdev_master_upper_dev_link);
+
+static void __netdev_upper_dev_unlink(struct net_device *dev,
+ struct net_device *upper_dev,
+ struct netdev_nested_priv *priv)
+{
+ struct netdev_notifier_changeupper_info changeupper_info = {
+ .info = {
+ .dev = dev,
+ },
+ .upper_dev = upper_dev,
+ .linking = false,
+ };
+
+ ASSERT_RTNL();
+
+ changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
+
+ call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
+ &changeupper_info.info);
+
+ __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
+
+ call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
+ &changeupper_info.info);
+
+ __netdev_update_upper_level(dev, NULL);
+ __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
+
+ __netdev_update_lower_level(upper_dev, priv);
+ __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
+ priv);
+}
+
+/**
+ * netdev_upper_dev_unlink - Removes a link to upper device
+ * @dev: device
+ * @upper_dev: new upper device
+ *
+ * Removes a link to device which is upper to this one. The caller must hold
+ * the RTNL lock.
+ */
+void netdev_upper_dev_unlink(struct net_device *dev,
+ struct net_device *upper_dev)
+{
+ struct netdev_nested_priv priv = {
+ .flags = NESTED_SYNC_TODO,
+ .data = NULL,
+ };
+
+ __netdev_upper_dev_unlink(dev, upper_dev, &priv);
+}
+EXPORT_SYMBOL(netdev_upper_dev_unlink);
+
+static void __netdev_adjacent_dev_set(struct net_device *upper_dev,
+ struct net_device *lower_dev,
+ bool val)
+{
+ struct netdev_adjacent *adj;
+
+ adj = __netdev_find_adj(lower_dev, &upper_dev->adj_list.lower);
+ if (adj)
+ adj->ignore = val;
+
+ adj = __netdev_find_adj(upper_dev, &lower_dev->adj_list.upper);
+ if (adj)
+ adj->ignore = val;
+}
+
+static void netdev_adjacent_dev_disable(struct net_device *upper_dev,
+ struct net_device *lower_dev)
+{
+ __netdev_adjacent_dev_set(upper_dev, lower_dev, true);
+}
+
+static void netdev_adjacent_dev_enable(struct net_device *upper_dev,
+ struct net_device *lower_dev)
+{
+ __netdev_adjacent_dev_set(upper_dev, lower_dev, false);
+}
+
+int netdev_adjacent_change_prepare(struct net_device *old_dev,
+ struct net_device *new_dev,
+ struct net_device *dev,
+ struct netlink_ext_ack *extack)
+{
+ struct netdev_nested_priv priv = {
+ .flags = 0,
+ .data = NULL,
+ };
+ int err;
+
+ if (!new_dev)
+ return 0;
+
+ if (old_dev && new_dev != old_dev)
+ netdev_adjacent_dev_disable(dev, old_dev);
+ err = __netdev_upper_dev_link(new_dev, dev, false, NULL, NULL, &priv,
+ extack);
+ if (err) {
+ if (old_dev && new_dev != old_dev)
+ netdev_adjacent_dev_enable(dev, old_dev);
+ return err;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(netdev_adjacent_change_prepare);
+
+void netdev_adjacent_change_commit(struct net_device *old_dev,
+ struct net_device *new_dev,
+ struct net_device *dev)
+{
+ struct netdev_nested_priv priv = {
+ .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
+ .data = NULL,
+ };
+
+ if (!new_dev || !old_dev)
+ return;
+
+ if (new_dev == old_dev)
+ return;
+
+ netdev_adjacent_dev_enable(dev, old_dev);
+ __netdev_upper_dev_unlink(old_dev, dev, &priv);
+}
+EXPORT_SYMBOL(netdev_adjacent_change_commit);
+
+void netdev_adjacent_change_abort(struct net_device *old_dev,
+ struct net_device *new_dev,
+ struct net_device *dev)
+{
+ struct netdev_nested_priv priv = {
+ .flags = 0,
+ .data = NULL,
+ };
+
+ if (!new_dev)
+ return;
+
+ if (old_dev && new_dev != old_dev)
+ netdev_adjacent_dev_enable(dev, old_dev);
+
+ __netdev_upper_dev_unlink(new_dev, dev, &priv);
+}
+EXPORT_SYMBOL(netdev_adjacent_change_abort);
+
+/**
+ * netdev_bonding_info_change - Dispatch event about slave change
+ * @dev: device
+ * @bonding_info: info to dispatch
*
- * Add or remove promiscuity from a device. While the count in the device
- * remains above zero the interface remains promiscuous. Once it hits zero
- * the device reverts back to normal filtering operation. A negative inc
- * value is used to drop promiscuity on the device.
+ * Send NETDEV_BONDING_INFO to netdev notifiers with info.
+ * The caller must hold the RTNL lock.
*/
-void dev_set_promiscuity(struct net_device *dev, int inc)
+void netdev_bonding_info_change(struct net_device *dev,
+ struct netdev_bonding_info *bonding_info)
+{
+ struct netdev_notifier_bonding_info info = {
+ .info.dev = dev,
+ };
+
+ memcpy(&info.bonding_info, bonding_info,
+ sizeof(struct netdev_bonding_info));
+ call_netdevice_notifiers_info(NETDEV_BONDING_INFO,
+ &info.info);
+}
+EXPORT_SYMBOL(netdev_bonding_info_change);
+
+static int netdev_offload_xstats_enable_l3(struct net_device *dev,
+ struct netlink_ext_ack *extack)
+{
+ struct netdev_notifier_offload_xstats_info info = {
+ .info.dev = dev,
+ .info.extack = extack,
+ .type = NETDEV_OFFLOAD_XSTATS_TYPE_L3,
+ };
+ int err;
+ int rc;
+
+ dev->offload_xstats_l3 = kzalloc(sizeof(*dev->offload_xstats_l3),
+ GFP_KERNEL);
+ if (!dev->offload_xstats_l3)
+ return -ENOMEM;
+
+ rc = call_netdevice_notifiers_info_robust(NETDEV_OFFLOAD_XSTATS_ENABLE,
+ NETDEV_OFFLOAD_XSTATS_DISABLE,
+ &info.info);
+ err = notifier_to_errno(rc);
+ if (err)
+ goto free_stats;
+
+ return 0;
+
+free_stats:
+ kfree(dev->offload_xstats_l3);
+ dev->offload_xstats_l3 = NULL;
+ return err;
+}
+
+int netdev_offload_xstats_enable(struct net_device *dev,
+ enum netdev_offload_xstats_type type,
+ struct netlink_ext_ack *extack)
+{
+ ASSERT_RTNL();
+
+ if (netdev_offload_xstats_enabled(dev, type))
+ return -EALREADY;
+
+ switch (type) {
+ case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
+ return netdev_offload_xstats_enable_l3(dev, extack);
+ }
+
+ WARN_ON(1);
+ return -EINVAL;
+}
+EXPORT_SYMBOL(netdev_offload_xstats_enable);
+
+static void netdev_offload_xstats_disable_l3(struct net_device *dev)
+{
+ struct netdev_notifier_offload_xstats_info info = {
+ .info.dev = dev,
+ .type = NETDEV_OFFLOAD_XSTATS_TYPE_L3,
+ };
+
+ call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_DISABLE,
+ &info.info);
+ kfree(dev->offload_xstats_l3);
+ dev->offload_xstats_l3 = NULL;
+}
+
+int netdev_offload_xstats_disable(struct net_device *dev,
+ enum netdev_offload_xstats_type type)
+{
+ ASSERT_RTNL();
+
+ if (!netdev_offload_xstats_enabled(dev, type))
+ return -EALREADY;
+
+ switch (type) {
+ case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
+ netdev_offload_xstats_disable_l3(dev);
+ return 0;
+ }
+
+ WARN_ON(1);
+ return -EINVAL;
+}
+EXPORT_SYMBOL(netdev_offload_xstats_disable);
+
+static void netdev_offload_xstats_disable_all(struct net_device *dev)
+{
+ netdev_offload_xstats_disable(dev, NETDEV_OFFLOAD_XSTATS_TYPE_L3);
+}
+
+static struct rtnl_hw_stats64 *
+netdev_offload_xstats_get_ptr(const struct net_device *dev,
+ enum netdev_offload_xstats_type type)
+{
+ switch (type) {
+ case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
+ return dev->offload_xstats_l3;
+ }
+
+ WARN_ON(1);
+ return NULL;
+}
+
+bool netdev_offload_xstats_enabled(const struct net_device *dev,
+ enum netdev_offload_xstats_type type)
+{
+ ASSERT_RTNL();
+
+ return netdev_offload_xstats_get_ptr(dev, type);
+}
+EXPORT_SYMBOL(netdev_offload_xstats_enabled);
+
+struct netdev_notifier_offload_xstats_ru {
+ bool used;
+};
+
+struct netdev_notifier_offload_xstats_rd {
+ struct rtnl_hw_stats64 stats;
+ bool used;
+};
+
+static void netdev_hw_stats64_add(struct rtnl_hw_stats64 *dest,
+ const struct rtnl_hw_stats64 *src)
+{
+ dest->rx_packets += src->rx_packets;
+ dest->tx_packets += src->tx_packets;
+ dest->rx_bytes += src->rx_bytes;
+ dest->tx_bytes += src->tx_bytes;
+ dest->rx_errors += src->rx_errors;
+ dest->tx_errors += src->tx_errors;
+ dest->rx_dropped += src->rx_dropped;
+ dest->tx_dropped += src->tx_dropped;
+ dest->multicast += src->multicast;
+}
+
+static int netdev_offload_xstats_get_used(struct net_device *dev,
+ enum netdev_offload_xstats_type type,
+ bool *p_used,
+ struct netlink_ext_ack *extack)
+{
+ struct netdev_notifier_offload_xstats_ru report_used = {};
+ struct netdev_notifier_offload_xstats_info info = {
+ .info.dev = dev,
+ .info.extack = extack,
+ .type = type,
+ .report_used = &report_used,
+ };
+ int rc;
+
+ WARN_ON(!netdev_offload_xstats_enabled(dev, type));
+ rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_USED,
+ &info.info);
+ *p_used = report_used.used;
+ return notifier_to_errno(rc);
+}
+
+static int netdev_offload_xstats_get_stats(struct net_device *dev,
+ enum netdev_offload_xstats_type type,
+ struct rtnl_hw_stats64 *p_stats,
+ bool *p_used,
+ struct netlink_ext_ack *extack)
+{
+ struct netdev_notifier_offload_xstats_rd report_delta = {};
+ struct netdev_notifier_offload_xstats_info info = {
+ .info.dev = dev,
+ .info.extack = extack,
+ .type = type,
+ .report_delta = &report_delta,
+ };
+ struct rtnl_hw_stats64 *stats;
+ int rc;
+
+ stats = netdev_offload_xstats_get_ptr(dev, type);
+ if (WARN_ON(!stats))
+ return -EINVAL;
+
+ rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_DELTA,
+ &info.info);
+
+ /* Cache whatever we got, even if there was an error, otherwise the
+ * successful stats retrievals would get lost.
+ */
+ netdev_hw_stats64_add(stats, &report_delta.stats);
+
+ if (p_stats)
+ *p_stats = *stats;
+ *p_used = report_delta.used;
+
+ return notifier_to_errno(rc);
+}
+
+int netdev_offload_xstats_get(struct net_device *dev,
+ enum netdev_offload_xstats_type type,
+ struct rtnl_hw_stats64 *p_stats, bool *p_used,
+ struct netlink_ext_ack *extack)
{
- unsigned short old_flags = dev->flags;
+ ASSERT_RTNL();
- if ((dev->promiscuity += inc) == 0)
- dev->flags &= ~IFF_PROMISC;
+ if (p_stats)
+ return netdev_offload_xstats_get_stats(dev, type, p_stats,
+ p_used, extack);
else
- dev->flags |= IFF_PROMISC;
- if (dev->flags != old_flags) {
- dev_mc_upload(dev);
- printk(KERN_INFO "device %s %s promiscuous mode\n",
- dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
- "left");
- audit_log(current->audit_context, GFP_ATOMIC,
- AUDIT_ANOM_PROMISCUOUS,
- "dev=%s prom=%d old_prom=%d auid=%u",
- dev->name, (dev->flags & IFF_PROMISC),
- (old_flags & IFF_PROMISC),
- audit_get_loginuid(current->audit_context));
+ return netdev_offload_xstats_get_used(dev, type, p_used,
+ extack);
+}
+EXPORT_SYMBOL(netdev_offload_xstats_get);
+
+void
+netdev_offload_xstats_report_delta(struct netdev_notifier_offload_xstats_rd *report_delta,
+ const struct rtnl_hw_stats64 *stats)
+{
+ report_delta->used = true;
+ netdev_hw_stats64_add(&report_delta->stats, stats);
+}
+EXPORT_SYMBOL(netdev_offload_xstats_report_delta);
+
+void
+netdev_offload_xstats_report_used(struct netdev_notifier_offload_xstats_ru *report_used)
+{
+ report_used->used = true;
+}
+EXPORT_SYMBOL(netdev_offload_xstats_report_used);
+
+void netdev_offload_xstats_push_delta(struct net_device *dev,
+ enum netdev_offload_xstats_type type,
+ const struct rtnl_hw_stats64 *p_stats)
+{
+ struct rtnl_hw_stats64 *stats;
+
+ ASSERT_RTNL();
+
+ stats = netdev_offload_xstats_get_ptr(dev, type);
+ if (WARN_ON(!stats))
+ return;
+
+ netdev_hw_stats64_add(stats, p_stats);
+}
+EXPORT_SYMBOL(netdev_offload_xstats_push_delta);
+
+/**
+ * netdev_get_xmit_slave - Get the xmit slave of master device
+ * @dev: device
+ * @skb: The packet
+ * @all_slaves: assume all the slaves are active
+ *
+ * The reference counters are not incremented so the caller must be
+ * careful with locks. The caller must hold RCU lock.
+ * %NULL is returned if no slave is found.
+ */
+
+struct net_device *netdev_get_xmit_slave(struct net_device *dev,
+ struct sk_buff *skb,
+ bool all_slaves)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ if (!ops->ndo_get_xmit_slave)
+ return NULL;
+ return ops->ndo_get_xmit_slave(dev, skb, all_slaves);
+}
+EXPORT_SYMBOL(netdev_get_xmit_slave);
+
+static struct net_device *netdev_sk_get_lower_dev(struct net_device *dev,
+ struct sock *sk)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ if (!ops->ndo_sk_get_lower_dev)
+ return NULL;
+ return ops->ndo_sk_get_lower_dev(dev, sk);
+}
+
+/**
+ * netdev_sk_get_lowest_dev - Get the lowest device in chain given device and socket
+ * @dev: device
+ * @sk: the socket
+ *
+ * %NULL is returned if no lower device is found.
+ */
+
+struct net_device *netdev_sk_get_lowest_dev(struct net_device *dev,
+ struct sock *sk)
+{
+ struct net_device *lower;
+
+ lower = netdev_sk_get_lower_dev(dev, sk);
+ while (lower) {
+ dev = lower;
+ lower = netdev_sk_get_lower_dev(dev, sk);
}
+
+ return dev;
}
+EXPORT_SYMBOL(netdev_sk_get_lowest_dev);
+
+static void netdev_adjacent_add_links(struct net_device *dev)
+{
+ struct netdev_adjacent *iter;
+
+ struct net *net = dev_net(dev);
+
+ list_for_each_entry(iter, &dev->adj_list.upper, list) {
+ if (!net_eq(net, dev_net(iter->dev)))
+ continue;
+ netdev_adjacent_sysfs_add(iter->dev, dev,
+ &iter->dev->adj_list.lower);
+ netdev_adjacent_sysfs_add(dev, iter->dev,
+ &dev->adj_list.upper);
+ }
+
+ list_for_each_entry(iter, &dev->adj_list.lower, list) {
+ if (!net_eq(net, dev_net(iter->dev)))
+ continue;
+ netdev_adjacent_sysfs_add(iter->dev, dev,
+ &iter->dev->adj_list.upper);
+ netdev_adjacent_sysfs_add(dev, iter->dev,
+ &dev->adj_list.lower);
+ }
+}
+
+static void netdev_adjacent_del_links(struct net_device *dev)
+{
+ struct netdev_adjacent *iter;
+
+ struct net *net = dev_net(dev);
+
+ list_for_each_entry(iter, &dev->adj_list.upper, list) {
+ if (!net_eq(net, dev_net(iter->dev)))
+ continue;
+ netdev_adjacent_sysfs_del(iter->dev, dev->name,
+ &iter->dev->adj_list.lower);
+ netdev_adjacent_sysfs_del(dev, iter->dev->name,
+ &dev->adj_list.upper);
+ }
+
+ list_for_each_entry(iter, &dev->adj_list.lower, list) {
+ if (!net_eq(net, dev_net(iter->dev)))
+ continue;
+ netdev_adjacent_sysfs_del(iter->dev, dev->name,
+ &iter->dev->adj_list.upper);
+ netdev_adjacent_sysfs_del(dev, iter->dev->name,
+ &dev->adj_list.lower);
+ }
+}
+
+void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
+{
+ struct netdev_adjacent *iter;
+
+ struct net *net = dev_net(dev);
+
+ list_for_each_entry(iter, &dev->adj_list.upper, list) {
+ if (!net_eq(net, dev_net(iter->dev)))
+ continue;
+ netdev_adjacent_sysfs_del(iter->dev, oldname,
+ &iter->dev->adj_list.lower);
+ netdev_adjacent_sysfs_add(iter->dev, dev,
+ &iter->dev->adj_list.lower);
+ }
+
+ list_for_each_entry(iter, &dev->adj_list.lower, list) {
+ if (!net_eq(net, dev_net(iter->dev)))
+ continue;
+ netdev_adjacent_sysfs_del(iter->dev, oldname,
+ &iter->dev->adj_list.upper);
+ netdev_adjacent_sysfs_add(iter->dev, dev,
+ &iter->dev->adj_list.upper);
+ }
+}
+
+void *netdev_lower_dev_get_private(struct net_device *dev,
+ struct net_device *lower_dev)
+{
+ struct netdev_adjacent *lower;
+
+ if (!lower_dev)
+ return NULL;
+ lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
+ if (!lower)
+ return NULL;
+
+ return lower->private;
+}
+EXPORT_SYMBOL(netdev_lower_dev_get_private);
+
/**
- * dev_set_allmulti - update allmulti count on a device
- * @dev: device
- * @inc: modifier
+ * netdev_lower_state_changed - Dispatch event about lower device state change
+ * @lower_dev: device
+ * @lower_state_info: state to dispatch
*
- * Add or remove reception of all multicast frames to a device. While the
- * count in the device remains above zero the interface remains listening
- * to all interfaces. Once it hits zero the device reverts back to normal
- * filtering operation. A negative @inc value is used to drop the counter
- * when releasing a resource needing all multicasts.
+ * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
+ * The caller must hold the RTNL lock.
*/
+void netdev_lower_state_changed(struct net_device *lower_dev,
+ void *lower_state_info)
+{
+ struct netdev_notifier_changelowerstate_info changelowerstate_info = {
+ .info.dev = lower_dev,
+ };
+
+ ASSERT_RTNL();
+ changelowerstate_info.lower_state_info = lower_state_info;
+ call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE,
+ &changelowerstate_info.info);
+}
+EXPORT_SYMBOL(netdev_lower_state_changed);
-void dev_set_allmulti(struct net_device *dev, int inc)
+static void dev_change_rx_flags(struct net_device *dev, int flags)
{
- unsigned short old_flags = dev->flags;
+ const struct net_device_ops *ops = dev->netdev_ops;
- dev->flags |= IFF_ALLMULTI;
- if ((dev->allmulti += inc) == 0)
- dev->flags &= ~IFF_ALLMULTI;
- if (dev->flags ^ old_flags)
- dev_mc_upload(dev);
+ if (ops->ndo_change_rx_flags)
+ ops->ndo_change_rx_flags(dev, flags);
}
-unsigned dev_get_flags(const struct net_device *dev)
+static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
{
- unsigned flags;
+ unsigned int old_flags = dev->flags;
+ unsigned int promiscuity, flags;
+ kuid_t uid;
+ kgid_t gid;
- flags = (dev->flags & ~(IFF_PROMISC |
+ ASSERT_RTNL();
+
+ promiscuity = dev->promiscuity + inc;
+ if (promiscuity == 0) {
+ /*
+ * Avoid overflow.
+ * If inc causes overflow, untouch promisc and return error.
+ */
+ if (unlikely(inc > 0)) {
+ netdev_warn(dev, "promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n");
+ return -EOVERFLOW;
+ }
+ flags = old_flags & ~IFF_PROMISC;
+ } else {
+ flags = old_flags | IFF_PROMISC;
+ }
+ WRITE_ONCE(dev->promiscuity, promiscuity);
+ if (flags != old_flags) {
+ WRITE_ONCE(dev->flags, flags);
+ netdev_info(dev, "%s promiscuous mode\n",
+ dev->flags & IFF_PROMISC ? "entered" : "left");
+ if (audit_enabled) {
+ current_uid_gid(&uid, &gid);
+ audit_log(audit_context(), GFP_ATOMIC,
+ AUDIT_ANOM_PROMISCUOUS,
+ "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
+ dev->name, (dev->flags & IFF_PROMISC),
+ (old_flags & IFF_PROMISC),
+ from_kuid(&init_user_ns, audit_get_loginuid(current)),
+ from_kuid(&init_user_ns, uid),
+ from_kgid(&init_user_ns, gid),
+ audit_get_sessionid(current));
+ }
+
+ dev_change_rx_flags(dev, IFF_PROMISC);
+ }
+ if (notify) {
+ /* The ops lock is only required to ensure consistent locking
+ * for `NETDEV_CHANGE` notifiers. This function is sometimes
+ * called without the lock, even for devices that are ops
+ * locked, such as in `dev_uc_sync_multiple` when using
+ * bonding or teaming.
+ */
+ netdev_ops_assert_locked(dev);
+ __dev_notify_flags(dev, old_flags, IFF_PROMISC, 0, NULL);
+ }
+ return 0;
+}
+
+int netif_set_promiscuity(struct net_device *dev, int inc)
+{
+ unsigned int old_flags = dev->flags;
+ int err;
+
+ err = __dev_set_promiscuity(dev, inc, true);
+ if (err < 0)
+ return err;
+ if (dev->flags != old_flags)
+ dev_set_rx_mode(dev);
+ return err;
+}
+
+int netif_set_allmulti(struct net_device *dev, int inc, bool notify)
+{
+ unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
+ unsigned int allmulti, flags;
+
+ ASSERT_RTNL();
+
+ allmulti = dev->allmulti + inc;
+ if (allmulti == 0) {
+ /*
+ * Avoid overflow.
+ * If inc causes overflow, untouch allmulti and return error.
+ */
+ if (unlikely(inc > 0)) {
+ netdev_warn(dev, "allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n");
+ return -EOVERFLOW;
+ }
+ flags = old_flags & ~IFF_ALLMULTI;
+ } else {
+ flags = old_flags | IFF_ALLMULTI;
+ }
+ WRITE_ONCE(dev->allmulti, allmulti);
+ if (flags != old_flags) {
+ WRITE_ONCE(dev->flags, flags);
+ netdev_info(dev, "%s allmulticast mode\n",
+ dev->flags & IFF_ALLMULTI ? "entered" : "left");
+ dev_change_rx_flags(dev, IFF_ALLMULTI);
+ dev_set_rx_mode(dev);
+ if (notify)
+ __dev_notify_flags(dev, old_flags,
+ dev->gflags ^ old_gflags, 0, NULL);
+ }
+ return 0;
+}
+
+/*
+ * Upload unicast and multicast address lists to device and
+ * configure RX filtering. When the device doesn't support unicast
+ * filtering it is put in promiscuous mode while unicast addresses
+ * are present.
+ */
+void __dev_set_rx_mode(struct net_device *dev)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ /* dev_open will call this function so the list will stay sane. */
+ if (!(dev->flags&IFF_UP))
+ return;
+
+ if (!netif_device_present(dev))
+ return;
+
+ if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
+ /* Unicast addresses changes may only happen under the rtnl,
+ * therefore calling __dev_set_promiscuity here is safe.
+ */
+ if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
+ __dev_set_promiscuity(dev, 1, false);
+ dev->uc_promisc = true;
+ } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
+ __dev_set_promiscuity(dev, -1, false);
+ dev->uc_promisc = false;
+ }
+ }
+
+ if (ops->ndo_set_rx_mode)
+ ops->ndo_set_rx_mode(dev);
+}
+
+void dev_set_rx_mode(struct net_device *dev)
+{
+ netif_addr_lock_bh(dev);
+ __dev_set_rx_mode(dev);
+ netif_addr_unlock_bh(dev);
+}
+
+/**
+ * netif_get_flags() - get flags reported to userspace
+ * @dev: device
+ *
+ * Get the combination of flag bits exported through APIs to userspace.
+ */
+unsigned int netif_get_flags(const struct net_device *dev)
+{
+ unsigned int flags;
+
+ flags = (READ_ONCE(dev->flags) & ~(IFF_PROMISC |
IFF_ALLMULTI |
IFF_RUNNING |
IFF_LOWER_UP |
IFF_DORMANT)) |
- (dev->gflags & (IFF_PROMISC |
+ (READ_ONCE(dev->gflags) & (IFF_PROMISC |
IFF_ALLMULTI));
if (netif_running(dev)) {
@@ -2375,11 +9691,15 @@ unsigned dev_get_flags(const struct net_device *dev)
return flags;
}
+EXPORT_SYMBOL(netif_get_flags);
-int dev_change_flags(struct net_device *dev, unsigned flags)
+int __dev_change_flags(struct net_device *dev, unsigned int flags,
+ struct netlink_ext_ack *extack)
{
+ unsigned int old_flags = dev->flags;
int ret;
- int old_flags = dev->flags;
+
+ ASSERT_RTNL();
/*
* Set the flags on our device.
@@ -2395,7 +9715,10 @@ int dev_change_flags(struct net_device *dev, unsigned flags)
* Load in the correct multicast list now the flags have changed.
*/
- dev_mc_upload(dev);
+ if ((old_flags ^ flags) & IFF_MULTICAST)
+ dev_change_rx_flags(dev, IFF_MULTICAST);
+
+ dev_set_rx_mode(dev);
/*
* Have we downed the interface. We handle IFF_UP ourselves
@@ -2404,482 +9727,1563 @@ int dev_change_flags(struct net_device *dev, unsigned flags)
*/
ret = 0;
- if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */
- ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev);
-
- if (!ret)
- dev_mc_upload(dev);
+ if ((old_flags ^ flags) & IFF_UP) {
+ if (old_flags & IFF_UP)
+ __dev_close(dev);
+ else
+ ret = __dev_open(dev, extack);
}
- if (dev->flags & IFF_UP &&
- ((old_flags ^ dev->flags) &~ (IFF_UP | IFF_PROMISC | IFF_ALLMULTI |
- IFF_VOLATILE)))
- raw_notifier_call_chain(&netdev_chain,
- NETDEV_CHANGE, dev);
-
if ((flags ^ dev->gflags) & IFF_PROMISC) {
- int inc = (flags & IFF_PROMISC) ? +1 : -1;
+ int inc = (flags & IFF_PROMISC) ? 1 : -1;
+ old_flags = dev->flags;
+
dev->gflags ^= IFF_PROMISC;
- dev_set_promiscuity(dev, inc);
+
+ if (__dev_set_promiscuity(dev, inc, false) >= 0)
+ if (dev->flags != old_flags)
+ dev_set_rx_mode(dev);
}
/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
- is important. Some (broken) drivers set IFF_PROMISC, when
- IFF_ALLMULTI is requested not asking us and not reporting.
+ * is important. Some (broken) drivers set IFF_PROMISC, when
+ * IFF_ALLMULTI is requested not asking us and not reporting.
*/
if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
- int inc = (flags & IFF_ALLMULTI) ? +1 : -1;
+ int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
+
dev->gflags ^= IFF_ALLMULTI;
- dev_set_allmulti(dev, inc);
+ netif_set_allmulti(dev, inc, false);
+ }
+
+ return ret;
+}
+
+void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
+ unsigned int gchanges, u32 portid,
+ const struct nlmsghdr *nlh)
+{
+ unsigned int changes = dev->flags ^ old_flags;
+
+ if (gchanges)
+ rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC, portid, nlh);
+
+ if (changes & IFF_UP) {
+ if (dev->flags & IFF_UP)
+ call_netdevice_notifiers(NETDEV_UP, dev);
+ else
+ call_netdevice_notifiers(NETDEV_DOWN, dev);
}
- if (old_flags ^ dev->flags)
- rtmsg_ifinfo(RTM_NEWLINK, dev, old_flags ^ dev->flags);
+ if (dev->flags & IFF_UP &&
+ (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
+ struct netdev_notifier_change_info change_info = {
+ .info = {
+ .dev = dev,
+ },
+ .flags_changed = changes,
+ };
+
+ call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info);
+ }
+}
+
+int netif_change_flags(struct net_device *dev, unsigned int flags,
+ struct netlink_ext_ack *extack)
+{
+ int ret;
+ unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
+
+ ret = __dev_change_flags(dev, flags, extack);
+ if (ret < 0)
+ return ret;
+ changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
+ __dev_notify_flags(dev, old_flags, changes, 0, NULL);
return ret;
}
-int dev_set_mtu(struct net_device *dev, int new_mtu)
+int __netif_set_mtu(struct net_device *dev, int new_mtu)
{
- int err;
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ if (ops->ndo_change_mtu)
+ return ops->ndo_change_mtu(dev, new_mtu);
+
+ /* Pairs with all the lockless reads of dev->mtu in the stack */
+ WRITE_ONCE(dev->mtu, new_mtu);
+ return 0;
+}
+EXPORT_SYMBOL_NS_GPL(__netif_set_mtu, "NETDEV_INTERNAL");
+
+int dev_validate_mtu(struct net_device *dev, int new_mtu,
+ struct netlink_ext_ack *extack)
+{
+ /* MTU must be positive, and in range */
+ if (new_mtu < 0 || new_mtu < dev->min_mtu) {
+ NL_SET_ERR_MSG(extack, "mtu less than device minimum");
+ return -EINVAL;
+ }
+
+ if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
+ NL_SET_ERR_MSG(extack, "mtu greater than device maximum");
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/**
+ * netif_set_mtu_ext() - Change maximum transfer unit
+ * @dev: device
+ * @new_mtu: new transfer unit
+ * @extack: netlink extended ack
+ *
+ * Change the maximum transfer size of the network device.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int netif_set_mtu_ext(struct net_device *dev, int new_mtu,
+ struct netlink_ext_ack *extack)
+{
+ int err, orig_mtu;
+
+ netdev_ops_assert_locked(dev);
if (new_mtu == dev->mtu)
return 0;
- /* MTU must be positive. */
- if (new_mtu < 0)
- return -EINVAL;
+ err = dev_validate_mtu(dev, new_mtu, extack);
+ if (err)
+ return err;
if (!netif_device_present(dev))
return -ENODEV;
- err = 0;
- if (dev->change_mtu)
- err = dev->change_mtu(dev, new_mtu);
- else
- dev->mtu = new_mtu;
- if (!err && dev->flags & IFF_UP)
- raw_notifier_call_chain(&netdev_chain,
- NETDEV_CHANGEMTU, dev);
+ err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
+ err = notifier_to_errno(err);
+ if (err)
+ return err;
+
+ orig_mtu = dev->mtu;
+ err = __netif_set_mtu(dev, new_mtu);
+
+ if (!err) {
+ err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
+ orig_mtu);
+ err = notifier_to_errno(err);
+ if (err) {
+ /* setting mtu back and notifying everyone again,
+ * so that they have a chance to revert changes.
+ */
+ __netif_set_mtu(dev, orig_mtu);
+ call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
+ new_mtu);
+ }
+ }
+ return err;
+}
+
+int netif_set_mtu(struct net_device *dev, int new_mtu)
+{
+ struct netlink_ext_ack extack;
+ int err;
+
+ memset(&extack, 0, sizeof(extack));
+ err = netif_set_mtu_ext(dev, new_mtu, &extack);
+ if (err && extack._msg)
+ net_err_ratelimited("%s: %s\n", dev->name, extack._msg);
return err;
}
+EXPORT_SYMBOL(netif_set_mtu);
+
+int netif_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
+{
+ unsigned int orig_len = dev->tx_queue_len;
+ int res;
+
+ if (new_len != (unsigned int)new_len)
+ return -ERANGE;
+
+ if (new_len != orig_len) {
+ WRITE_ONCE(dev->tx_queue_len, new_len);
+ res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev);
+ res = notifier_to_errno(res);
+ if (res)
+ goto err_rollback;
+ res = dev_qdisc_change_tx_queue_len(dev);
+ if (res)
+ goto err_rollback;
+ }
+
+ return 0;
+
+err_rollback:
+ netdev_err(dev, "refused to change device tx_queue_len\n");
+ WRITE_ONCE(dev->tx_queue_len, orig_len);
+ return res;
+}
+
+void netif_set_group(struct net_device *dev, int new_group)
+{
+ dev->group = new_group;
+}
+
+/**
+ * netif_pre_changeaddr_notify() - Call NETDEV_PRE_CHANGEADDR.
+ * @dev: device
+ * @addr: new address
+ * @extack: netlink extended ack
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int netif_pre_changeaddr_notify(struct net_device *dev, const char *addr,
+ struct netlink_ext_ack *extack)
+{
+ struct netdev_notifier_pre_changeaddr_info info = {
+ .info.dev = dev,
+ .info.extack = extack,
+ .dev_addr = addr,
+ };
+ int rc;
+
+ rc = call_netdevice_notifiers_info(NETDEV_PRE_CHANGEADDR, &info.info);
+ return notifier_to_errno(rc);
+}
+EXPORT_SYMBOL_NS_GPL(netif_pre_changeaddr_notify, "NETDEV_INTERNAL");
-int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
+int netif_set_mac_address(struct net_device *dev, struct sockaddr_storage *ss,
+ struct netlink_ext_ack *extack)
{
+ const struct net_device_ops *ops = dev->netdev_ops;
int err;
- if (!dev->set_mac_address)
+ if (!ops->ndo_set_mac_address)
return -EOPNOTSUPP;
- if (sa->sa_family != dev->type)
+ if (ss->ss_family != dev->type)
return -EINVAL;
if (!netif_device_present(dev))
return -ENODEV;
- err = dev->set_mac_address(dev, sa);
- if (!err)
- raw_notifier_call_chain(&netdev_chain,
- NETDEV_CHANGEADDR, dev);
- return err;
+ err = netif_pre_changeaddr_notify(dev, ss->__data, extack);
+ if (err)
+ return err;
+ if (memcmp(dev->dev_addr, ss->__data, dev->addr_len)) {
+ err = ops->ndo_set_mac_address(dev, ss);
+ if (err)
+ return err;
+ }
+ dev->addr_assign_type = NET_ADDR_SET;
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+ add_device_randomness(dev->dev_addr, dev->addr_len);
+ return 0;
}
-/*
- * Perform the SIOCxIFxxx calls.
+DECLARE_RWSEM(dev_addr_sem);
+
+/* "sa" is a true struct sockaddr with limited "sa_data" member. */
+int netif_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name)
+{
+ size_t size = sizeof(sa->sa_data);
+ struct net_device *dev;
+ int ret = 0;
+
+ down_read(&dev_addr_sem);
+ rcu_read_lock();
+
+ dev = dev_get_by_name_rcu(net, dev_name);
+ if (!dev) {
+ ret = -ENODEV;
+ goto unlock;
+ }
+ if (!dev->addr_len)
+ memset(sa->sa_data, 0, size);
+ else
+ memcpy(sa->sa_data, dev->dev_addr,
+ min_t(size_t, size, dev->addr_len));
+ sa->sa_family = dev->type;
+
+unlock:
+ rcu_read_unlock();
+ up_read(&dev_addr_sem);
+ return ret;
+}
+EXPORT_SYMBOL_NS_GPL(netif_get_mac_address, "NETDEV_INTERNAL");
+
+int netif_change_carrier(struct net_device *dev, bool new_carrier)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ if (!ops->ndo_change_carrier)
+ return -EOPNOTSUPP;
+ if (!netif_device_present(dev))
+ return -ENODEV;
+ return ops->ndo_change_carrier(dev, new_carrier);
+}
+
+/**
+ * dev_get_phys_port_id - Get device physical port ID
+ * @dev: device
+ * @ppid: port ID
+ *
+ * Get device physical port ID
*/
-static int dev_ifsioc(struct ifreq *ifr, unsigned int cmd)
+int dev_get_phys_port_id(struct net_device *dev,
+ struct netdev_phys_item_id *ppid)
{
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ if (!ops->ndo_get_phys_port_id)
+ return -EOPNOTSUPP;
+ return ops->ndo_get_phys_port_id(dev, ppid);
+}
+
+/**
+ * dev_get_phys_port_name - Get device physical port name
+ * @dev: device
+ * @name: port name
+ * @len: limit of bytes to copy to name
+ *
+ * Get device physical port name
+ */
+int dev_get_phys_port_name(struct net_device *dev,
+ char *name, size_t len)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
int err;
- struct net_device *dev = __dev_get_by_name(ifr->ifr_name);
- if (!dev)
- return -ENODEV;
+ if (ops->ndo_get_phys_port_name) {
+ err = ops->ndo_get_phys_port_name(dev, name, len);
+ if (err != -EOPNOTSUPP)
+ return err;
+ }
+ return devlink_compat_phys_port_name_get(dev, name, len);
+}
- switch (cmd) {
- case SIOCGIFFLAGS: /* Get interface flags */
- ifr->ifr_flags = dev_get_flags(dev);
- return 0;
+/**
+ * netif_get_port_parent_id() - Get the device's port parent identifier
+ * @dev: network device
+ * @ppid: pointer to a storage for the port's parent identifier
+ * @recurse: allow/disallow recursion to lower devices
+ *
+ * Get the devices's port parent identifier.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int netif_get_port_parent_id(struct net_device *dev,
+ struct netdev_phys_item_id *ppid, bool recurse)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+ struct netdev_phys_item_id first = { };
+ struct net_device *lower_dev;
+ struct list_head *iter;
+ int err;
- case SIOCSIFFLAGS: /* Set interface flags */
- return dev_change_flags(dev, ifr->ifr_flags);
+ if (ops->ndo_get_port_parent_id) {
+ err = ops->ndo_get_port_parent_id(dev, ppid);
+ if (err != -EOPNOTSUPP)
+ return err;
+ }
- case SIOCGIFMETRIC: /* Get the metric on the interface
- (currently unused) */
- ifr->ifr_metric = 0;
- return 0;
+ err = devlink_compat_switch_id_get(dev, ppid);
+ if (!recurse || err != -EOPNOTSUPP)
+ return err;
- case SIOCSIFMETRIC: /* Set the metric on the interface
- (currently unused) */
+ netdev_for_each_lower_dev(dev, lower_dev, iter) {
+ err = netif_get_port_parent_id(lower_dev, ppid, true);
+ if (err)
+ break;
+ if (!first.id_len)
+ first = *ppid;
+ else if (memcmp(&first, ppid, sizeof(*ppid)))
return -EOPNOTSUPP;
+ }
- case SIOCGIFMTU: /* Get the MTU of a device */
- ifr->ifr_mtu = dev->mtu;
- return 0;
+ return err;
+}
+EXPORT_SYMBOL(netif_get_port_parent_id);
+
+/**
+ * netdev_port_same_parent_id - Indicate if two network devices have
+ * the same port parent identifier
+ * @a: first network device
+ * @b: second network device
+ */
+bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b)
+{
+ struct netdev_phys_item_id a_id = { };
+ struct netdev_phys_item_id b_id = { };
- case SIOCSIFMTU: /* Set the MTU of a device */
- return dev_set_mtu(dev, ifr->ifr_mtu);
+ if (netif_get_port_parent_id(a, &a_id, true) ||
+ netif_get_port_parent_id(b, &b_id, true))
+ return false;
+
+ return netdev_phys_item_id_same(&a_id, &b_id);
+}
+EXPORT_SYMBOL(netdev_port_same_parent_id);
+
+int netif_change_proto_down(struct net_device *dev, bool proto_down)
+{
+ if (!dev->change_proto_down)
+ return -EOPNOTSUPP;
+ if (!netif_device_present(dev))
+ return -ENODEV;
+ if (proto_down)
+ netif_carrier_off(dev);
+ else
+ netif_carrier_on(dev);
+ WRITE_ONCE(dev->proto_down, proto_down);
+ return 0;
+}
- case SIOCGIFHWADDR:
- if (!dev->addr_len)
- memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
+/**
+ * netdev_change_proto_down_reason_locked - proto down reason
+ *
+ * @dev: device
+ * @mask: proto down mask
+ * @value: proto down value
+ */
+void netdev_change_proto_down_reason_locked(struct net_device *dev,
+ unsigned long mask, u32 value)
+{
+ u32 proto_down_reason;
+ int b;
+
+ if (!mask) {
+ proto_down_reason = value;
+ } else {
+ proto_down_reason = dev->proto_down_reason;
+ for_each_set_bit(b, &mask, 32) {
+ if (value & (1 << b))
+ proto_down_reason |= BIT(b);
else
- memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
- min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
- ifr->ifr_hwaddr.sa_family = dev->type;
- return 0;
+ proto_down_reason &= ~BIT(b);
+ }
+ }
+ WRITE_ONCE(dev->proto_down_reason, proto_down_reason);
+}
- case SIOCSIFHWADDR:
- return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
+struct bpf_xdp_link {
+ struct bpf_link link;
+ struct net_device *dev; /* protected by rtnl_lock, no refcnt held */
+ int flags;
+};
- case SIOCSIFHWBROADCAST:
- if (ifr->ifr_hwaddr.sa_family != dev->type)
- return -EINVAL;
- memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
- min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
- raw_notifier_call_chain(&netdev_chain,
- NETDEV_CHANGEADDR, dev);
- return 0;
+static enum bpf_xdp_mode dev_xdp_mode(struct net_device *dev, u32 flags)
+{
+ if (flags & XDP_FLAGS_HW_MODE)
+ return XDP_MODE_HW;
+ if (flags & XDP_FLAGS_DRV_MODE)
+ return XDP_MODE_DRV;
+ if (flags & XDP_FLAGS_SKB_MODE)
+ return XDP_MODE_SKB;
+ return dev->netdev_ops->ndo_bpf ? XDP_MODE_DRV : XDP_MODE_SKB;
+}
- case SIOCGIFMAP:
- ifr->ifr_map.mem_start = dev->mem_start;
- ifr->ifr_map.mem_end = dev->mem_end;
- ifr->ifr_map.base_addr = dev->base_addr;
- ifr->ifr_map.irq = dev->irq;
- ifr->ifr_map.dma = dev->dma;
- ifr->ifr_map.port = dev->if_port;
- return 0;
+static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode)
+{
+ switch (mode) {
+ case XDP_MODE_SKB:
+ return generic_xdp_install;
+ case XDP_MODE_DRV:
+ case XDP_MODE_HW:
+ return dev->netdev_ops->ndo_bpf;
+ default:
+ return NULL;
+ }
+}
- case SIOCSIFMAP:
- if (dev->set_config) {
- if (!netif_device_present(dev))
- return -ENODEV;
- return dev->set_config(dev, &ifr->ifr_map);
- }
+static struct bpf_xdp_link *dev_xdp_link(struct net_device *dev,
+ enum bpf_xdp_mode mode)
+{
+ return dev->xdp_state[mode].link;
+}
+
+static struct bpf_prog *dev_xdp_prog(struct net_device *dev,
+ enum bpf_xdp_mode mode)
+{
+ struct bpf_xdp_link *link = dev_xdp_link(dev, mode);
+
+ if (link)
+ return link->link.prog;
+ return dev->xdp_state[mode].prog;
+}
+
+u8 dev_xdp_prog_count(struct net_device *dev)
+{
+ u8 count = 0;
+ int i;
+
+ for (i = 0; i < __MAX_XDP_MODE; i++)
+ if (dev->xdp_state[i].prog || dev->xdp_state[i].link)
+ count++;
+ return count;
+}
+EXPORT_SYMBOL_GPL(dev_xdp_prog_count);
+
+u8 dev_xdp_sb_prog_count(struct net_device *dev)
+{
+ u8 count = 0;
+ int i;
+
+ for (i = 0; i < __MAX_XDP_MODE; i++)
+ if (dev->xdp_state[i].prog &&
+ !dev->xdp_state[i].prog->aux->xdp_has_frags)
+ count++;
+ return count;
+}
+
+int netif_xdp_propagate(struct net_device *dev, struct netdev_bpf *bpf)
+{
+ if (!dev->netdev_ops->ndo_bpf)
+ return -EOPNOTSUPP;
+
+ if (dev->cfg->hds_config == ETHTOOL_TCP_DATA_SPLIT_ENABLED &&
+ bpf->command == XDP_SETUP_PROG &&
+ bpf->prog && !bpf->prog->aux->xdp_has_frags) {
+ NL_SET_ERR_MSG(bpf->extack,
+ "unable to propagate XDP to device using tcp-data-split");
+ return -EBUSY;
+ }
+
+ if (dev_get_min_mp_channel_count(dev)) {
+ NL_SET_ERR_MSG(bpf->extack, "unable to propagate XDP to device using memory provider");
+ return -EBUSY;
+ }
+
+ return dev->netdev_ops->ndo_bpf(dev, bpf);
+}
+EXPORT_SYMBOL_GPL(netif_xdp_propagate);
+
+u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode)
+{
+ struct bpf_prog *prog = dev_xdp_prog(dev, mode);
+
+ return prog ? prog->aux->id : 0;
+}
+
+static void dev_xdp_set_link(struct net_device *dev, enum bpf_xdp_mode mode,
+ struct bpf_xdp_link *link)
+{
+ dev->xdp_state[mode].link = link;
+ dev->xdp_state[mode].prog = NULL;
+}
+
+static void dev_xdp_set_prog(struct net_device *dev, enum bpf_xdp_mode mode,
+ struct bpf_prog *prog)
+{
+ dev->xdp_state[mode].link = NULL;
+ dev->xdp_state[mode].prog = prog;
+}
+
+static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode,
+ bpf_op_t bpf_op, struct netlink_ext_ack *extack,
+ u32 flags, struct bpf_prog *prog)
+{
+ struct netdev_bpf xdp;
+ int err;
+
+ netdev_ops_assert_locked(dev);
+
+ if (dev->cfg->hds_config == ETHTOOL_TCP_DATA_SPLIT_ENABLED &&
+ prog && !prog->aux->xdp_has_frags) {
+ NL_SET_ERR_MSG(extack, "unable to install XDP to device using tcp-data-split");
+ return -EBUSY;
+ }
+
+ if (dev_get_min_mp_channel_count(dev)) {
+ NL_SET_ERR_MSG(extack, "unable to install XDP to device using memory provider");
+ return -EBUSY;
+ }
+
+ memset(&xdp, 0, sizeof(xdp));
+ xdp.command = mode == XDP_MODE_HW ? XDP_SETUP_PROG_HW : XDP_SETUP_PROG;
+ xdp.extack = extack;
+ xdp.flags = flags;
+ xdp.prog = prog;
+
+ /* Drivers assume refcnt is already incremented (i.e, prog pointer is
+ * "moved" into driver), so they don't increment it on their own, but
+ * they do decrement refcnt when program is detached or replaced.
+ * Given net_device also owns link/prog, we need to bump refcnt here
+ * to prevent drivers from underflowing it.
+ */
+ if (prog)
+ bpf_prog_inc(prog);
+ err = bpf_op(dev, &xdp);
+ if (err) {
+ if (prog)
+ bpf_prog_put(prog);
+ return err;
+ }
+
+ if (mode != XDP_MODE_HW)
+ bpf_prog_change_xdp(dev_xdp_prog(dev, mode), prog);
+
+ return 0;
+}
+
+static void dev_xdp_uninstall(struct net_device *dev)
+{
+ struct bpf_xdp_link *link;
+ struct bpf_prog *prog;
+ enum bpf_xdp_mode mode;
+ bpf_op_t bpf_op;
+
+ ASSERT_RTNL();
+
+ for (mode = XDP_MODE_SKB; mode < __MAX_XDP_MODE; mode++) {
+ prog = dev_xdp_prog(dev, mode);
+ if (!prog)
+ continue;
+
+ bpf_op = dev_xdp_bpf_op(dev, mode);
+ if (!bpf_op)
+ continue;
+
+ WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL));
+
+ /* auto-detach link from net device */
+ link = dev_xdp_link(dev, mode);
+ if (link)
+ link->dev = NULL;
+ else
+ bpf_prog_put(prog);
+
+ dev_xdp_set_link(dev, mode, NULL);
+ }
+}
+
+static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack,
+ struct bpf_xdp_link *link, struct bpf_prog *new_prog,
+ struct bpf_prog *old_prog, u32 flags)
+{
+ unsigned int num_modes = hweight32(flags & XDP_FLAGS_MODES);
+ struct bpf_prog *cur_prog;
+ struct net_device *upper;
+ struct list_head *iter;
+ enum bpf_xdp_mode mode;
+ bpf_op_t bpf_op;
+ int err;
+
+ ASSERT_RTNL();
+
+ /* either link or prog attachment, never both */
+ if (link && (new_prog || old_prog))
+ return -EINVAL;
+ /* link supports only XDP mode flags */
+ if (link && (flags & ~XDP_FLAGS_MODES)) {
+ NL_SET_ERR_MSG(extack, "Invalid XDP flags for BPF link attachment");
+ return -EINVAL;
+ }
+ /* just one XDP mode bit should be set, zero defaults to drv/skb mode */
+ if (num_modes > 1) {
+ NL_SET_ERR_MSG(extack, "Only one XDP mode flag can be set");
+ return -EINVAL;
+ }
+ /* avoid ambiguity if offload + drv/skb mode progs are both loaded */
+ if (!num_modes && dev_xdp_prog_count(dev) > 1) {
+ NL_SET_ERR_MSG(extack,
+ "More than one program loaded, unset mode is ambiguous");
+ return -EINVAL;
+ }
+ /* old_prog != NULL implies XDP_FLAGS_REPLACE is set */
+ if (old_prog && !(flags & XDP_FLAGS_REPLACE)) {
+ NL_SET_ERR_MSG(extack, "XDP_FLAGS_REPLACE is not specified");
+ return -EINVAL;
+ }
+
+ mode = dev_xdp_mode(dev, flags);
+ /* can't replace attached link */
+ if (dev_xdp_link(dev, mode)) {
+ NL_SET_ERR_MSG(extack, "Can't replace active BPF XDP link");
+ return -EBUSY;
+ }
+
+ /* don't allow if an upper device already has a program */
+ netdev_for_each_upper_dev_rcu(dev, upper, iter) {
+ if (dev_xdp_prog_count(upper) > 0) {
+ NL_SET_ERR_MSG(extack, "Cannot attach when an upper device already has a program");
+ return -EEXIST;
+ }
+ }
+
+ cur_prog = dev_xdp_prog(dev, mode);
+ /* can't replace attached prog with link */
+ if (link && cur_prog) {
+ NL_SET_ERR_MSG(extack, "Can't replace active XDP program with BPF link");
+ return -EBUSY;
+ }
+ if ((flags & XDP_FLAGS_REPLACE) && cur_prog != old_prog) {
+ NL_SET_ERR_MSG(extack, "Active program does not match expected");
+ return -EEXIST;
+ }
+
+ /* put effective new program into new_prog */
+ if (link)
+ new_prog = link->link.prog;
+
+ if (new_prog) {
+ bool offload = mode == XDP_MODE_HW;
+ enum bpf_xdp_mode other_mode = mode == XDP_MODE_SKB
+ ? XDP_MODE_DRV : XDP_MODE_SKB;
+
+ if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && cur_prog) {
+ NL_SET_ERR_MSG(extack, "XDP program already attached");
+ return -EBUSY;
+ }
+ if (!offload && dev_xdp_prog(dev, other_mode)) {
+ NL_SET_ERR_MSG(extack, "Native and generic XDP can't be active at the same time");
+ return -EEXIST;
+ }
+ if (!offload && bpf_prog_is_offloaded(new_prog->aux)) {
+ NL_SET_ERR_MSG(extack, "Using offloaded program without HW_MODE flag is not supported");
+ return -EINVAL;
+ }
+ if (bpf_prog_is_dev_bound(new_prog->aux) && !bpf_offload_dev_match(new_prog, dev)) {
+ NL_SET_ERR_MSG(extack, "Program bound to different device");
+ return -EINVAL;
+ }
+ if (bpf_prog_is_dev_bound(new_prog->aux) && mode == XDP_MODE_SKB) {
+ NL_SET_ERR_MSG(extack, "Can't attach device-bound programs in generic mode");
+ return -EINVAL;
+ }
+ if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) {
+ NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device");
+ return -EINVAL;
+ }
+ if (new_prog->expected_attach_type == BPF_XDP_CPUMAP) {
+ NL_SET_ERR_MSG(extack, "BPF_XDP_CPUMAP programs can not be attached to a device");
+ return -EINVAL;
+ }
+ }
+
+ /* don't call drivers if the effective program didn't change */
+ if (new_prog != cur_prog) {
+ bpf_op = dev_xdp_bpf_op(dev, mode);
+ if (!bpf_op) {
+ NL_SET_ERR_MSG(extack, "Underlying driver does not support XDP in native mode");
return -EOPNOTSUPP;
+ }
- case SIOCADDMULTI:
- if (!dev->set_multicast_list ||
- ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
- return -EINVAL;
- if (!netif_device_present(dev))
- return -ENODEV;
- return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data,
- dev->addr_len, 1);
-
- case SIOCDELMULTI:
- if (!dev->set_multicast_list ||
- ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
- return -EINVAL;
- if (!netif_device_present(dev))
- return -ENODEV;
- return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data,
- dev->addr_len, 1);
-
- case SIOCGIFINDEX:
- ifr->ifr_ifindex = dev->ifindex;
- return 0;
+ err = dev_xdp_install(dev, mode, bpf_op, extack, flags, new_prog);
+ if (err)
+ return err;
+ }
- case SIOCGIFTXQLEN:
- ifr->ifr_qlen = dev->tx_queue_len;
- return 0;
+ if (link)
+ dev_xdp_set_link(dev, mode, link);
+ else
+ dev_xdp_set_prog(dev, mode, new_prog);
+ if (cur_prog)
+ bpf_prog_put(cur_prog);
- case SIOCSIFTXQLEN:
- if (ifr->ifr_qlen < 0)
- return -EINVAL;
- dev->tx_queue_len = ifr->ifr_qlen;
- return 0;
+ return 0;
+}
- case SIOCSIFNAME:
- ifr->ifr_newname[IFNAMSIZ-1] = '\0';
- return dev_change_name(dev, ifr->ifr_newname);
+static int dev_xdp_attach_link(struct net_device *dev,
+ struct netlink_ext_ack *extack,
+ struct bpf_xdp_link *link)
+{
+ return dev_xdp_attach(dev, extack, link, NULL, NULL, link->flags);
+}
- /*
- * Unknown or private ioctl
- */
+static int dev_xdp_detach_link(struct net_device *dev,
+ struct netlink_ext_ack *extack,
+ struct bpf_xdp_link *link)
+{
+ enum bpf_xdp_mode mode;
+ bpf_op_t bpf_op;
- default:
- if ((cmd >= SIOCDEVPRIVATE &&
- cmd <= SIOCDEVPRIVATE + 15) ||
- cmd == SIOCBONDENSLAVE ||
- cmd == SIOCBONDRELEASE ||
- cmd == SIOCBONDSETHWADDR ||
- cmd == SIOCBONDSLAVEINFOQUERY ||
- cmd == SIOCBONDINFOQUERY ||
- cmd == SIOCBONDCHANGEACTIVE ||
- cmd == SIOCGMIIPHY ||
- cmd == SIOCGMIIREG ||
- cmd == SIOCSMIIREG ||
- cmd == SIOCBRADDIF ||
- cmd == SIOCBRDELIF ||
- cmd == SIOCWANDEV) {
- err = -EOPNOTSUPP;
- if (dev->do_ioctl) {
- if (netif_device_present(dev))
- err = dev->do_ioctl(dev, ifr,
- cmd);
- else
- err = -ENODEV;
- }
- } else
- err = -EINVAL;
+ ASSERT_RTNL();
+
+ mode = dev_xdp_mode(dev, link->flags);
+ if (dev_xdp_link(dev, mode) != link)
+ return -EINVAL;
+
+ bpf_op = dev_xdp_bpf_op(dev, mode);
+ WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL));
+ dev_xdp_set_link(dev, mode, NULL);
+ return 0;
+}
+
+static void bpf_xdp_link_release(struct bpf_link *link)
+{
+ struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
+
+ rtnl_lock();
+ /* if racing with net_device's tear down, xdp_link->dev might be
+ * already NULL, in which case link was already auto-detached
+ */
+ if (xdp_link->dev) {
+ netdev_lock_ops(xdp_link->dev);
+ WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link));
+ netdev_unlock_ops(xdp_link->dev);
+ xdp_link->dev = NULL;
}
+
+ rtnl_unlock();
+}
+
+static int bpf_xdp_link_detach(struct bpf_link *link)
+{
+ bpf_xdp_link_release(link);
+ return 0;
+}
+
+static void bpf_xdp_link_dealloc(struct bpf_link *link)
+{
+ struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
+
+ kfree(xdp_link);
+}
+
+static void bpf_xdp_link_show_fdinfo(const struct bpf_link *link,
+ struct seq_file *seq)
+{
+ struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
+ u32 ifindex = 0;
+
+ rtnl_lock();
+ if (xdp_link->dev)
+ ifindex = xdp_link->dev->ifindex;
+ rtnl_unlock();
+
+ seq_printf(seq, "ifindex:\t%u\n", ifindex);
+}
+
+static int bpf_xdp_link_fill_link_info(const struct bpf_link *link,
+ struct bpf_link_info *info)
+{
+ struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
+ u32 ifindex = 0;
+
+ rtnl_lock();
+ if (xdp_link->dev)
+ ifindex = xdp_link->dev->ifindex;
+ rtnl_unlock();
+
+ info->xdp.ifindex = ifindex;
+ return 0;
+}
+
+static int bpf_xdp_link_update(struct bpf_link *link, struct bpf_prog *new_prog,
+ struct bpf_prog *old_prog)
+{
+ struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
+ enum bpf_xdp_mode mode;
+ bpf_op_t bpf_op;
+ int err = 0;
+
+ rtnl_lock();
+
+ /* link might have been auto-released already, so fail */
+ if (!xdp_link->dev) {
+ err = -ENOLINK;
+ goto out_unlock;
+ }
+
+ if (old_prog && link->prog != old_prog) {
+ err = -EPERM;
+ goto out_unlock;
+ }
+ old_prog = link->prog;
+ if (old_prog->type != new_prog->type ||
+ old_prog->expected_attach_type != new_prog->expected_attach_type) {
+ err = -EINVAL;
+ goto out_unlock;
+ }
+
+ if (old_prog == new_prog) {
+ /* no-op, don't disturb drivers */
+ bpf_prog_put(new_prog);
+ goto out_unlock;
+ }
+
+ netdev_lock_ops(xdp_link->dev);
+ mode = dev_xdp_mode(xdp_link->dev, xdp_link->flags);
+ bpf_op = dev_xdp_bpf_op(xdp_link->dev, mode);
+ err = dev_xdp_install(xdp_link->dev, mode, bpf_op, NULL,
+ xdp_link->flags, new_prog);
+ netdev_unlock_ops(xdp_link->dev);
+ if (err)
+ goto out_unlock;
+
+ old_prog = xchg(&link->prog, new_prog);
+ bpf_prog_put(old_prog);
+
+out_unlock:
+ rtnl_unlock();
return err;
}
-/*
- * This function handles all "interface"-type I/O control requests. The actual
- * 'doing' part of this is dev_ifsioc above.
+static const struct bpf_link_ops bpf_xdp_link_lops = {
+ .release = bpf_xdp_link_release,
+ .dealloc = bpf_xdp_link_dealloc,
+ .detach = bpf_xdp_link_detach,
+ .show_fdinfo = bpf_xdp_link_show_fdinfo,
+ .fill_link_info = bpf_xdp_link_fill_link_info,
+ .update_prog = bpf_xdp_link_update,
+};
+
+int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ struct net *net = current->nsproxy->net_ns;
+ struct bpf_link_primer link_primer;
+ struct netlink_ext_ack extack = {};
+ struct bpf_xdp_link *link;
+ struct net_device *dev;
+ int err, fd;
+
+ rtnl_lock();
+ dev = dev_get_by_index(net, attr->link_create.target_ifindex);
+ if (!dev) {
+ rtnl_unlock();
+ return -EINVAL;
+ }
+
+ link = kzalloc(sizeof(*link), GFP_USER);
+ if (!link) {
+ err = -ENOMEM;
+ goto unlock;
+ }
+
+ bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog,
+ attr->link_create.attach_type);
+ link->dev = dev;
+ link->flags = attr->link_create.flags;
+
+ err = bpf_link_prime(&link->link, &link_primer);
+ if (err) {
+ kfree(link);
+ goto unlock;
+ }
+
+ netdev_lock_ops(dev);
+ err = dev_xdp_attach_link(dev, &extack, link);
+ netdev_unlock_ops(dev);
+ rtnl_unlock();
+
+ if (err) {
+ link->dev = NULL;
+ bpf_link_cleanup(&link_primer);
+ trace_bpf_xdp_link_attach_failed(extack._msg);
+ goto out_put_dev;
+ }
+
+ fd = bpf_link_settle(&link_primer);
+ /* link itself doesn't hold dev's refcnt to not complicate shutdown */
+ dev_put(dev);
+ return fd;
+
+unlock:
+ rtnl_unlock();
+
+out_put_dev:
+ dev_put(dev);
+ return err;
+}
+
+/**
+ * dev_change_xdp_fd - set or clear a bpf program for a device rx path
+ * @dev: device
+ * @extack: netlink extended ack
+ * @fd: new program fd or negative value to clear
+ * @expected_fd: old program fd that userspace expects to replace or clear
+ * @flags: xdp-related flags
+ *
+ * Set or clear a bpf program for a device
*/
+int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
+ int fd, int expected_fd, u32 flags)
+{
+ enum bpf_xdp_mode mode = dev_xdp_mode(dev, flags);
+ struct bpf_prog *new_prog = NULL, *old_prog = NULL;
+ int err;
+
+ ASSERT_RTNL();
+
+ if (fd >= 0) {
+ new_prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
+ mode != XDP_MODE_SKB);
+ if (IS_ERR(new_prog))
+ return PTR_ERR(new_prog);
+ }
+
+ if (expected_fd >= 0) {
+ old_prog = bpf_prog_get_type_dev(expected_fd, BPF_PROG_TYPE_XDP,
+ mode != XDP_MODE_SKB);
+ if (IS_ERR(old_prog)) {
+ err = PTR_ERR(old_prog);
+ old_prog = NULL;
+ goto err_out;
+ }
+ }
+
+ err = dev_xdp_attach(dev, extack, NULL, new_prog, old_prog, flags);
+
+err_out:
+ if (err && new_prog)
+ bpf_prog_put(new_prog);
+ if (old_prog)
+ bpf_prog_put(old_prog);
+ return err;
+}
+
+u32 dev_get_min_mp_channel_count(const struct net_device *dev)
+{
+ int i;
+
+ netdev_ops_assert_locked(dev);
+
+ for (i = dev->real_num_rx_queues - 1; i >= 0; i--)
+ if (dev->_rx[i].mp_params.mp_priv)
+ /* The channel count is the idx plus 1. */
+ return i + 1;
+
+ return 0;
+}
/**
- * dev_ioctl - network device ioctl
- * @cmd: command to issue
- * @arg: pointer to a struct ifreq in user space
+ * dev_index_reserve() - allocate an ifindex in a namespace
+ * @net: the applicable net namespace
+ * @ifindex: requested ifindex, pass %0 to get one allocated
+ *
+ * Allocate a ifindex for a new device. Caller must either use the ifindex
+ * to store the device (via list_netdevice()) or call dev_index_release()
+ * to give the index up.
*
- * Issue ioctl functions to devices. This is normally called by the
- * user space syscall interfaces but can sometimes be useful for
- * other purposes. The return value is the return from the syscall if
- * positive or a negative errno code on error.
+ * Return: a suitable unique value for a new device interface number or -errno.
*/
+static int dev_index_reserve(struct net *net, u32 ifindex)
+{
+ int err;
+
+ if (ifindex > INT_MAX) {
+ DEBUG_NET_WARN_ON_ONCE(1);
+ return -EINVAL;
+ }
+
+ if (!ifindex)
+ err = xa_alloc_cyclic(&net->dev_by_index, &ifindex, NULL,
+ xa_limit_31b, &net->ifindex, GFP_KERNEL);
+ else
+ err = xa_insert(&net->dev_by_index, ifindex, NULL, GFP_KERNEL);
+ if (err < 0)
+ return err;
+
+ return ifindex;
+}
-int dev_ioctl(unsigned int cmd, void __user *arg)
+static void dev_index_release(struct net *net, int ifindex)
{
- struct ifreq ifr;
- int ret;
- char *colon;
+ /* Expect only unused indexes, unlist_netdevice() removes the used */
+ WARN_ON(xa_erase(&net->dev_by_index, ifindex));
+}
- /* One special case: SIOCGIFCONF takes ifconf argument
- and requires shared lock, because it sleeps writing
- to user space.
- */
+static bool from_cleanup_net(void)
+{
+#ifdef CONFIG_NET_NS
+ return current == READ_ONCE(cleanup_net_task);
+#else
+ return false;
+#endif
+}
- if (cmd == SIOCGIFCONF) {
- rtnl_lock();
- ret = dev_ifconf((char __user *) arg);
- rtnl_unlock();
- return ret;
+/* Delayed registration/unregisteration */
+LIST_HEAD(net_todo_list);
+DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
+atomic_t dev_unreg_count = ATOMIC_INIT(0);
+
+static void net_set_todo(struct net_device *dev)
+{
+ list_add_tail(&dev->todo_list, &net_todo_list);
+}
+
+static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
+ struct net_device *upper, netdev_features_t features)
+{
+ netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
+ netdev_features_t feature;
+ int feature_bit;
+
+ for_each_netdev_feature(upper_disables, feature_bit) {
+ feature = __NETIF_F_BIT(feature_bit);
+ if (!(upper->wanted_features & feature)
+ && (features & feature)) {
+ netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
+ &feature, upper->name);
+ features &= ~feature;
+ }
}
- if (cmd == SIOCGIFNAME)
- return dev_ifname((struct ifreq __user *)arg);
- if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
- return -EFAULT;
+ return features;
+}
- ifr.ifr_name[IFNAMSIZ-1] = 0;
+static void netdev_sync_lower_features(struct net_device *upper,
+ struct net_device *lower, netdev_features_t features)
+{
+ netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
+ netdev_features_t feature;
+ int feature_bit;
+
+ for_each_netdev_feature(upper_disables, feature_bit) {
+ feature = __NETIF_F_BIT(feature_bit);
+ if (!(features & feature) && (lower->features & feature)) {
+ netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
+ &feature, lower->name);
+ netdev_lock_ops(lower);
+ lower->wanted_features &= ~feature;
+ __netdev_update_features(lower);
+
+ if (unlikely(lower->features & feature))
+ netdev_WARN(upper, "failed to disable %pNF on %s!\n",
+ &feature, lower->name);
+ else
+ netdev_features_change(lower);
+ netdev_unlock_ops(lower);
+ }
+ }
+}
- colon = strchr(ifr.ifr_name, ':');
- if (colon)
- *colon = 0;
+static bool netdev_has_ip_or_hw_csum(netdev_features_t features)
+{
+ netdev_features_t ip_csum_mask = NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;
+ bool ip_csum = (features & ip_csum_mask) == ip_csum_mask;
+ bool hw_csum = features & NETIF_F_HW_CSUM;
- /*
- * See which interface the caller is talking about.
- */
+ return ip_csum || hw_csum;
+}
- switch (cmd) {
- /*
- * These ioctl calls:
- * - can be done by all.
- * - atomic and do not require locking.
- * - return a value
- */
- case SIOCGIFFLAGS:
- case SIOCGIFMETRIC:
- case SIOCGIFMTU:
- case SIOCGIFHWADDR:
- case SIOCGIFSLAVE:
- case SIOCGIFMAP:
- case SIOCGIFINDEX:
- case SIOCGIFTXQLEN:
- dev_load(ifr.ifr_name);
- read_lock(&dev_base_lock);
- ret = dev_ifsioc(&ifr, cmd);
- read_unlock(&dev_base_lock);
- if (!ret) {
- if (colon)
- *colon = ':';
- if (copy_to_user(arg, &ifr,
- sizeof(struct ifreq)))
- ret = -EFAULT;
- }
- return ret;
+static netdev_features_t netdev_fix_features(struct net_device *dev,
+ netdev_features_t features)
+{
+ /* Fix illegal checksum combinations */
+ if ((features & NETIF_F_HW_CSUM) &&
+ (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
+ netdev_warn(dev, "mixed HW and IP checksum settings.\n");
+ features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
+ }
- case SIOCETHTOOL:
- dev_load(ifr.ifr_name);
- rtnl_lock();
- ret = dev_ethtool(&ifr);
- rtnl_unlock();
- if (!ret) {
- if (colon)
- *colon = ':';
- if (copy_to_user(arg, &ifr,
- sizeof(struct ifreq)))
- ret = -EFAULT;
- }
- return ret;
+ /* TSO requires that SG is present as well. */
+ if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
+ netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
+ features &= ~NETIF_F_ALL_TSO;
+ }
- /*
- * These ioctl calls:
- * - require superuser power.
- * - require strict serialization.
- * - return a value
- */
- case SIOCGMIIPHY:
- case SIOCGMIIREG:
- case SIOCSIFNAME:
- if (!capable(CAP_NET_ADMIN))
- return -EPERM;
- dev_load(ifr.ifr_name);
- rtnl_lock();
- ret = dev_ifsioc(&ifr, cmd);
- rtnl_unlock();
- if (!ret) {
- if (colon)
- *colon = ':';
- if (copy_to_user(arg, &ifr,
- sizeof(struct ifreq)))
- ret = -EFAULT;
- }
- return ret;
+ if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
+ !(features & NETIF_F_IP_CSUM)) {
+ netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
+ features &= ~NETIF_F_TSO;
+ features &= ~NETIF_F_TSO_ECN;
+ }
- /*
- * These ioctl calls:
- * - require superuser power.
- * - require strict serialization.
- * - do not return a value
+ if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
+ !(features & NETIF_F_IPV6_CSUM)) {
+ netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
+ features &= ~NETIF_F_TSO6;
+ }
+
+ /* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
+ if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
+ features &= ~NETIF_F_TSO_MANGLEID;
+
+ /* TSO ECN requires that TSO is present as well. */
+ if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
+ features &= ~NETIF_F_TSO_ECN;
+
+ /* Software GSO depends on SG. */
+ if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
+ netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
+ features &= ~NETIF_F_GSO;
+ }
+
+ /* GSO partial features require GSO partial be set */
+ if ((features & dev->gso_partial_features) &&
+ !(features & NETIF_F_GSO_PARTIAL)) {
+ netdev_dbg(dev,
+ "Dropping partially supported GSO features since no GSO partial.\n");
+ features &= ~dev->gso_partial_features;
+ }
+
+ if (!(features & NETIF_F_RXCSUM)) {
+ /* NETIF_F_GRO_HW implies doing RXCSUM since every packet
+ * successfully merged by hardware must also have the
+ * checksum verified by hardware. If the user does not
+ * want to enable RXCSUM, logically, we should disable GRO_HW.
*/
- case SIOCSIFFLAGS:
- case SIOCSIFMETRIC:
- case SIOCSIFMTU:
- case SIOCSIFMAP:
- case SIOCSIFHWADDR:
- case SIOCSIFSLAVE:
- case SIOCADDMULTI:
- case SIOCDELMULTI:
- case SIOCSIFHWBROADCAST:
- case SIOCSIFTXQLEN:
- case SIOCSMIIREG:
- case SIOCBONDENSLAVE:
- case SIOCBONDRELEASE:
- case SIOCBONDSETHWADDR:
- case SIOCBONDCHANGEACTIVE:
- case SIOCBRADDIF:
- case SIOCBRDELIF:
- if (!capable(CAP_NET_ADMIN))
- return -EPERM;
- /* fall through */
- case SIOCBONDSLAVEINFOQUERY:
- case SIOCBONDINFOQUERY:
- dev_load(ifr.ifr_name);
- rtnl_lock();
- ret = dev_ifsioc(&ifr, cmd);
- rtnl_unlock();
- return ret;
+ if (features & NETIF_F_GRO_HW) {
+ netdev_dbg(dev, "Dropping NETIF_F_GRO_HW since no RXCSUM feature.\n");
+ features &= ~NETIF_F_GRO_HW;
+ }
+ }
- case SIOCGIFMEM:
- /* Get the per device memory space. We can add this but
- * currently do not support it */
- case SIOCSIFMEM:
- /* Set the per device memory buffer space.
- * Not applicable in our case */
- case SIOCSIFLINK:
- return -EINVAL;
+ /* LRO/HW-GRO features cannot be combined with RX-FCS */
+ if (features & NETIF_F_RXFCS) {
+ if (features & NETIF_F_LRO) {
+ netdev_dbg(dev, "Dropping LRO feature since RX-FCS is requested.\n");
+ features &= ~NETIF_F_LRO;
+ }
- /*
- * Unknown or private ioctl.
+ if (features & NETIF_F_GRO_HW) {
+ netdev_dbg(dev, "Dropping HW-GRO feature since RX-FCS is requested.\n");
+ features &= ~NETIF_F_GRO_HW;
+ }
+ }
+
+ if ((features & NETIF_F_GRO_HW) && (features & NETIF_F_LRO)) {
+ netdev_dbg(dev, "Dropping LRO feature since HW-GRO is requested.\n");
+ features &= ~NETIF_F_LRO;
+ }
+
+ if ((features & NETIF_F_HW_TLS_TX) && !netdev_has_ip_or_hw_csum(features)) {
+ netdev_dbg(dev, "Dropping TLS TX HW offload feature since no CSUM feature.\n");
+ features &= ~NETIF_F_HW_TLS_TX;
+ }
+
+ if ((features & NETIF_F_HW_TLS_RX) && !(features & NETIF_F_RXCSUM)) {
+ netdev_dbg(dev, "Dropping TLS RX HW offload feature since no RXCSUM feature.\n");
+ features &= ~NETIF_F_HW_TLS_RX;
+ }
+
+ if ((features & NETIF_F_GSO_UDP_L4) && !netdev_has_ip_or_hw_csum(features)) {
+ netdev_dbg(dev, "Dropping USO feature since no CSUM feature.\n");
+ features &= ~NETIF_F_GSO_UDP_L4;
+ }
+
+ return features;
+}
+
+int __netdev_update_features(struct net_device *dev)
+{
+ struct net_device *upper, *lower;
+ netdev_features_t features;
+ struct list_head *iter;
+ int err = -1;
+
+ ASSERT_RTNL();
+ netdev_ops_assert_locked(dev);
+
+ features = netdev_get_wanted_features(dev);
+
+ if (dev->netdev_ops->ndo_fix_features)
+ features = dev->netdev_ops->ndo_fix_features(dev, features);
+
+ /* driver might be less strict about feature dependencies */
+ features = netdev_fix_features(dev, features);
+
+ /* some features can't be enabled if they're off on an upper device */
+ netdev_for_each_upper_dev_rcu(dev, upper, iter)
+ features = netdev_sync_upper_features(dev, upper, features);
+
+ if (dev->features == features)
+ goto sync_lower;
+
+ netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
+ &dev->features, &features);
+
+ if (dev->netdev_ops->ndo_set_features)
+ err = dev->netdev_ops->ndo_set_features(dev, features);
+ else
+ err = 0;
+
+ if (unlikely(err < 0)) {
+ netdev_err(dev,
+ "set_features() failed (%d); wanted %pNF, left %pNF\n",
+ err, &features, &dev->features);
+ /* return non-0 since some features might have changed and
+ * it's better to fire a spurious notification than miss it
*/
- default:
- if (cmd == SIOCWANDEV ||
- (cmd >= SIOCDEVPRIVATE &&
- cmd <= SIOCDEVPRIVATE + 15)) {
- dev_load(ifr.ifr_name);
- rtnl_lock();
- ret = dev_ifsioc(&ifr, cmd);
- rtnl_unlock();
- if (!ret && copy_to_user(arg, &ifr,
- sizeof(struct ifreq)))
- ret = -EFAULT;
- return ret;
+ return -1;
+ }
+
+sync_lower:
+ /* some features must be disabled on lower devices when disabled
+ * on an upper device (think: bonding master or bridge)
+ */
+ netdev_for_each_lower_dev(dev, lower, iter)
+ netdev_sync_lower_features(dev, lower, features);
+
+ if (!err) {
+ netdev_features_t diff = features ^ dev->features;
+
+ if (diff & NETIF_F_RX_UDP_TUNNEL_PORT) {
+ /* udp_tunnel_{get,drop}_rx_info both need
+ * NETIF_F_RX_UDP_TUNNEL_PORT enabled on the
+ * device, or they won't do anything.
+ * Thus we need to update dev->features
+ * *before* calling udp_tunnel_get_rx_info,
+ * but *after* calling udp_tunnel_drop_rx_info.
+ */
+ udp_tunnel_nic_lock(dev);
+ if (features & NETIF_F_RX_UDP_TUNNEL_PORT) {
+ dev->features = features;
+ udp_tunnel_get_rx_info(dev);
+ } else {
+ udp_tunnel_drop_rx_info(dev);
}
-#ifdef CONFIG_WIRELESS_EXT
- /* Take care of Wireless Extensions */
- if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) {
- /* If command is `set a parameter', or
- * `get the encoding parameters', check if
- * the user has the right to do it */
- if (IW_IS_SET(cmd) || cmd == SIOCGIWENCODE
- || cmd == SIOCGIWENCODEEXT) {
- if (!capable(CAP_NET_ADMIN))
- return -EPERM;
- }
- dev_load(ifr.ifr_name);
- rtnl_lock();
- /* Follow me in net/core/wireless.c */
- ret = wireless_process_ioctl(&ifr, cmd);
- rtnl_unlock();
- if (IW_IS_GET(cmd) &&
- copy_to_user(arg, &ifr,
- sizeof(struct ifreq)))
- ret = -EFAULT;
- return ret;
+ udp_tunnel_nic_unlock(dev);
+ }
+
+ if (diff & NETIF_F_HW_VLAN_CTAG_FILTER) {
+ if (features & NETIF_F_HW_VLAN_CTAG_FILTER) {
+ dev->features = features;
+ err |= vlan_get_rx_ctag_filter_info(dev);
+ } else {
+ vlan_drop_rx_ctag_filter_info(dev);
}
-#endif /* CONFIG_WIRELESS_EXT */
- return -EINVAL;
+ }
+
+ if (diff & NETIF_F_HW_VLAN_STAG_FILTER) {
+ if (features & NETIF_F_HW_VLAN_STAG_FILTER) {
+ dev->features = features;
+ err |= vlan_get_rx_stag_filter_info(dev);
+ } else {
+ vlan_drop_rx_stag_filter_info(dev);
+ }
+ }
+
+ dev->features = features;
}
+
+ return err < 0 ? 0 : 1;
}
+/**
+ * netdev_update_features - recalculate device features
+ * @dev: the device to check
+ *
+ * Recalculate dev->features set and send notifications if it
+ * has changed. Should be called after driver or hardware dependent
+ * conditions might have changed that influence the features.
+ */
+void netdev_update_features(struct net_device *dev)
+{
+ if (__netdev_update_features(dev))
+ netdev_features_change(dev);
+}
+EXPORT_SYMBOL(netdev_update_features);
/**
- * dev_new_index - allocate an ifindex
+ * netdev_change_features - recalculate device features
+ * @dev: the device to check
*
- * Returns a suitable unique value for a new device interface
- * number. The caller must hold the rtnl semaphore or the
- * dev_base_lock to be sure it remains unique.
+ * Recalculate dev->features set and send notifications even
+ * if they have not changed. Should be called instead of
+ * netdev_update_features() if also dev->vlan_features might
+ * have changed to allow the changes to be propagated to stacked
+ * VLAN devices.
*/
-static int dev_new_index(void)
+void netdev_change_features(struct net_device *dev)
{
- static int ifindex;
- for (;;) {
- if (++ifindex <= 0)
- ifindex = 1;
- if (!__dev_get_by_index(ifindex))
- return ifindex;
+ __netdev_update_features(dev);
+ netdev_features_change(dev);
+}
+EXPORT_SYMBOL(netdev_change_features);
+
+/**
+ * netif_stacked_transfer_operstate - transfer operstate
+ * @rootdev: the root or lower level device to transfer state from
+ * @dev: the device to transfer operstate to
+ *
+ * Transfer operational state from root to device. This is normally
+ * called when a stacking relationship exists between the root
+ * device and the device(a leaf device).
+ */
+void netif_stacked_transfer_operstate(const struct net_device *rootdev,
+ struct net_device *dev)
+{
+ if (rootdev->operstate == IF_OPER_DORMANT)
+ netif_dormant_on(dev);
+ else
+ netif_dormant_off(dev);
+
+ if (rootdev->operstate == IF_OPER_TESTING)
+ netif_testing_on(dev);
+ else
+ netif_testing_off(dev);
+
+ if (netif_carrier_ok(rootdev))
+ netif_carrier_on(dev);
+ else
+ netif_carrier_off(dev);
+}
+EXPORT_SYMBOL(netif_stacked_transfer_operstate);
+
+static int netif_alloc_rx_queues(struct net_device *dev)
+{
+ unsigned int i, count = dev->num_rx_queues;
+ struct netdev_rx_queue *rx;
+ size_t sz = count * sizeof(*rx);
+ int err = 0;
+
+ BUG_ON(count < 1);
+
+ rx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
+ if (!rx)
+ return -ENOMEM;
+
+ dev->_rx = rx;
+
+ for (i = 0; i < count; i++) {
+ rx[i].dev = dev;
+
+ /* XDP RX-queue setup */
+ err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i, 0);
+ if (err < 0)
+ goto err_rxq_info;
}
+ return 0;
+
+err_rxq_info:
+ /* Rollback successful reg's and free other resources */
+ while (i--)
+ xdp_rxq_info_unreg(&rx[i].xdp_rxq);
+ kvfree(dev->_rx);
+ dev->_rx = NULL;
+ return err;
}
-static int dev_boot_phase = 1;
+static void netif_free_rx_queues(struct net_device *dev)
+{
+ unsigned int i, count = dev->num_rx_queues;
-/* Delayed registration/unregisteration */
-static DEFINE_SPINLOCK(net_todo_list_lock);
-static struct list_head net_todo_list = LIST_HEAD_INIT(net_todo_list);
+ /* netif_alloc_rx_queues alloc failed, resources have been unreg'ed */
+ if (!dev->_rx)
+ return;
+
+ for (i = 0; i < count; i++)
+ xdp_rxq_info_unreg(&dev->_rx[i].xdp_rxq);
+
+ kvfree(dev->_rx);
+}
-static inline void net_set_todo(struct net_device *dev)
+static void netdev_init_one_queue(struct net_device *dev,
+ struct netdev_queue *queue, void *_unused)
{
- spin_lock(&net_todo_list_lock);
- list_add_tail(&dev->todo_list, &net_todo_list);
- spin_unlock(&net_todo_list_lock);
+ /* Initialize queue lock */
+ spin_lock_init(&queue->_xmit_lock);
+ netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
+ queue->xmit_lock_owner = -1;
+ netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
+ queue->dev = dev;
+#ifdef CONFIG_BQL
+ dql_init(&queue->dql, HZ);
+#endif
+}
+
+static void netif_free_tx_queues(struct net_device *dev)
+{
+ kvfree(dev->_tx);
+}
+
+static int netif_alloc_netdev_queues(struct net_device *dev)
+{
+ unsigned int count = dev->num_tx_queues;
+ struct netdev_queue *tx;
+ size_t sz = count * sizeof(*tx);
+
+ if (count < 1 || count > 0xffff)
+ return -EINVAL;
+
+ tx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
+ if (!tx)
+ return -ENOMEM;
+
+ dev->_tx = tx;
+
+ netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
+ spin_lock_init(&dev->tx_global_lock);
+
+ return 0;
+}
+
+void netif_tx_stop_all_queues(struct net_device *dev)
+{
+ unsigned int i;
+
+ for (i = 0; i < dev->num_tx_queues; i++) {
+ struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
+
+ netif_tx_stop_queue(txq);
+ }
+}
+EXPORT_SYMBOL(netif_tx_stop_all_queues);
+
+static int netdev_do_alloc_pcpu_stats(struct net_device *dev)
+{
+ void __percpu *v;
+
+ /* Drivers implementing ndo_get_peer_dev must support tstat
+ * accounting, so that skb_do_redirect() can bump the dev's
+ * RX stats upon network namespace switch.
+ */
+ if (dev->netdev_ops->ndo_get_peer_dev &&
+ dev->pcpu_stat_type != NETDEV_PCPU_STAT_TSTATS)
+ return -EOPNOTSUPP;
+
+ switch (dev->pcpu_stat_type) {
+ case NETDEV_PCPU_STAT_NONE:
+ return 0;
+ case NETDEV_PCPU_STAT_LSTATS:
+ v = dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats);
+ break;
+ case NETDEV_PCPU_STAT_TSTATS:
+ v = dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
+ break;
+ case NETDEV_PCPU_STAT_DSTATS:
+ v = dev->dstats = netdev_alloc_pcpu_stats(struct pcpu_dstats);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return v ? 0 : -ENOMEM;
+}
+
+static void netdev_do_free_pcpu_stats(struct net_device *dev)
+{
+ switch (dev->pcpu_stat_type) {
+ case NETDEV_PCPU_STAT_NONE:
+ return;
+ case NETDEV_PCPU_STAT_LSTATS:
+ free_percpu(dev->lstats);
+ break;
+ case NETDEV_PCPU_STAT_TSTATS:
+ free_percpu(dev->tstats);
+ break;
+ case NETDEV_PCPU_STAT_DSTATS:
+ free_percpu(dev->dstats);
+ break;
+ }
+}
+
+static void netdev_free_phy_link_topology(struct net_device *dev)
+{
+ struct phy_link_topology *topo = dev->link_topo;
+
+ if (IS_ENABLED(CONFIG_PHYLIB) && topo) {
+ xa_destroy(&topo->phys);
+ kfree(topo);
+ dev->link_topo = NULL;
+ }
}
/**
- * register_netdevice - register a network device
- * @dev: device to register
+ * register_netdevice() - register a network device
+ * @dev: device to register
*
- * Take a completed network device structure and add it to the kernel
- * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
- * chain. 0 is returned on success. A negative errno code is returned
- * on a failure to set up the device, or if the name is a duplicate.
- *
- * Callers must hold the rtnl semaphore. You may want
- * register_netdev() instead of this.
- *
- * BUGS:
- * The locking appears insufficient to guarantee two parallel registers
- * will not get the same name.
+ * Take a prepared network device structure and make it externally accessible.
+ * A %NETDEV_REGISTER message is sent to the netdev notifier chain.
+ * Callers must hold the rtnl lock - you may want register_netdev()
+ * instead of this.
*/
-
int register_netdevice(struct net_device *dev)
{
- struct hlist_head *head;
- struct hlist_node *p;
int ret;
+ struct net *net = dev_net(dev);
+ BUILD_BUG_ON(sizeof(netdev_features_t) * BITS_PER_BYTE <
+ NETDEV_FEATURE_COUNT);
BUG_ON(dev_boot_phase);
ASSERT_RTNL();
@@ -2887,88 +11291,115 @@ int register_netdevice(struct net_device *dev)
/* When net_device's are persistent, this will be fatal. */
BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
+ BUG_ON(!net);
- spin_lock_init(&dev->queue_lock);
- spin_lock_init(&dev->_xmit_lock);
- dev->xmit_lock_owner = -1;
-#ifdef CONFIG_NET_CLS_ACT
- spin_lock_init(&dev->ingress_lock);
-#endif
+ ret = ethtool_check_ops(dev->ethtool_ops);
+ if (ret)
+ return ret;
+
+ /* rss ctx ID 0 is reserved for the default context, start from 1 */
+ xa_init_flags(&dev->ethtool->rss_ctx, XA_FLAGS_ALLOC1);
+ mutex_init(&dev->ethtool->rss_lock);
- dev->iflink = -1;
+ spin_lock_init(&dev->addr_list_lock);
+ netdev_set_addr_lockdep_class(dev);
+
+ ret = dev_get_valid_name(net, dev, dev->name);
+ if (ret < 0)
+ goto out;
+
+ ret = -ENOMEM;
+ dev->name_node = netdev_name_node_head_alloc(dev);
+ if (!dev->name_node)
+ goto out;
/* Init, if this function is available */
- if (dev->init) {
- ret = dev->init(dev);
+ if (dev->netdev_ops->ndo_init) {
+ ret = dev->netdev_ops->ndo_init(dev);
if (ret) {
if (ret > 0)
ret = -EIO;
- goto out;
+ goto err_free_name;
}
}
-
- if (!dev_valid_name(dev->name)) {
+
+ if (((dev->hw_features | dev->features) &
+ NETIF_F_HW_VLAN_CTAG_FILTER) &&
+ (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
+ !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
+ netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
ret = -EINVAL;
- goto out;
+ goto err_uninit;
}
- dev->ifindex = dev_new_index();
- if (dev->iflink == -1)
- dev->iflink = dev->ifindex;
+ ret = netdev_do_alloc_pcpu_stats(dev);
+ if (ret)
+ goto err_uninit;
- /* Check for existence of name */
- head = dev_name_hash(dev->name);
- hlist_for_each(p, head) {
- struct net_device *d
- = hlist_entry(p, struct net_device, name_hlist);
- if (!strncmp(d->name, dev->name, IFNAMSIZ)) {
- ret = -EEXIST;
- goto out;
- }
- }
+ ret = dev_index_reserve(net, dev->ifindex);
+ if (ret < 0)
+ goto err_free_pcpu;
+ dev->ifindex = ret;
- /* Fix illegal SG+CSUM combinations. */
- if ((dev->features & NETIF_F_SG) &&
- !(dev->features & NETIF_F_ALL_CSUM)) {
- printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no checksum feature.\n",
- dev->name);
- dev->features &= ~NETIF_F_SG;
- }
+ /* Transfer changeable features to wanted_features and enable
+ * software offloads (GSO and GRO).
+ */
+ dev->hw_features |= (NETIF_F_SOFT_FEATURES | NETIF_F_SOFT_FEATURES_OFF);
+ dev->features |= NETIF_F_SOFT_FEATURES;
- /* TSO requires that SG is present as well. */
- if ((dev->features & NETIF_F_TSO) &&
- !(dev->features & NETIF_F_SG)) {
- printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no SG feature.\n",
- dev->name);
- dev->features &= ~NETIF_F_TSO;
- }
- if (dev->features & NETIF_F_UFO) {
- if (!(dev->features & NETIF_F_HW_CSUM)) {
- printk(KERN_ERR "%s: Dropping NETIF_F_UFO since no "
- "NETIF_F_HW_CSUM feature.\n",
- dev->name);
- dev->features &= ~NETIF_F_UFO;
- }
- if (!(dev->features & NETIF_F_SG)) {
- printk(KERN_ERR "%s: Dropping NETIF_F_UFO since no "
- "NETIF_F_SG feature.\n",
- dev->name);
- dev->features &= ~NETIF_F_UFO;
- }
+ if (dev->udp_tunnel_nic_info) {
+ dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT;
+ dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT;
}
- /*
- * nil rebuild_header routine,
- * that should be never called and used as just bug trap.
+ dev->wanted_features = dev->features & dev->hw_features;
+
+ if (!(dev->flags & IFF_LOOPBACK))
+ dev->hw_features |= NETIF_F_NOCACHE_COPY;
+
+ /* If IPv4 TCP segmentation offload is supported we should also
+ * allow the device to enable segmenting the frame with the option
+ * of ignoring a static IP ID value. This doesn't enable the
+ * feature itself but allows the user to enable it later.
+ */
+ if (dev->hw_features & NETIF_F_TSO)
+ dev->hw_features |= NETIF_F_TSO_MANGLEID;
+ if (dev->vlan_features & NETIF_F_TSO)
+ dev->vlan_features |= NETIF_F_TSO_MANGLEID;
+ if (dev->mpls_features & NETIF_F_TSO)
+ dev->mpls_features |= NETIF_F_TSO_MANGLEID;
+ if (dev->hw_enc_features & NETIF_F_TSO)
+ dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
+
+ /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
*/
+ dev->vlan_features |= NETIF_F_HIGHDMA;
- if (!dev->rebuild_header)
- dev->rebuild_header = default_rebuild_header;
+ /* Make NETIF_F_SG inheritable to tunnel devices.
+ */
+ dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
- ret = netdev_register_sysfs(dev);
+ /* Make NETIF_F_SG inheritable to MPLS.
+ */
+ dev->mpls_features |= NETIF_F_SG;
+
+ ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
+ ret = notifier_to_errno(ret);
if (ret)
- goto out;
- dev->reg_state = NETREG_REGISTERED;
+ goto err_ifindex_release;
+
+ ret = netdev_register_kobject(dev);
+
+ netdev_lock(dev);
+ WRITE_ONCE(dev->reg_state, ret ? NETREG_UNREGISTERED : NETREG_REGISTERED);
+ netdev_unlock(dev);
+
+ if (ret)
+ goto err_uninit_notify;
+
+ netdev_lock_ops(dev);
+ __netdev_update_features(dev);
+ netdev_unlock_ops(dev);
/*
* Default initial state at registry is that the
@@ -2977,23 +11408,79 @@ int register_netdevice(struct net_device *dev)
set_bit(__LINK_STATE_PRESENT, &dev->state);
- dev->next = NULL;
+ linkwatch_init_dev(dev);
+
dev_init_scheduler(dev);
- write_lock_bh(&dev_base_lock);
- *dev_tail = dev;
- dev_tail = &dev->next;
- hlist_add_head(&dev->name_hlist, head);
- hlist_add_head(&dev->index_hlist, dev_index_hash(dev->ifindex));
- dev_hold(dev);
- write_unlock_bh(&dev_base_lock);
- /* Notify protocols, that a new device appeared. */
- raw_notifier_call_chain(&netdev_chain, NETDEV_REGISTER, dev);
+ netdev_hold(dev, &dev->dev_registered_tracker, GFP_KERNEL);
+ list_netdevice(dev);
- ret = 0;
+ add_device_randomness(dev->dev_addr, dev->addr_len);
+
+ /* If the device has permanent device address, driver should
+ * set dev_addr and also addr_assign_type should be set to
+ * NET_ADDR_PERM (default value).
+ */
+ if (dev->addr_assign_type == NET_ADDR_PERM)
+ memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
+
+ /* Notify protocols, that a new device appeared. */
+ netdev_lock_ops(dev);
+ ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
+ netdev_unlock_ops(dev);
+ ret = notifier_to_errno(ret);
+ if (ret) {
+ /* Expect explicit free_netdev() on failure */
+ dev->needs_free_netdev = false;
+ unregister_netdevice_queue(dev, NULL);
+ goto out;
+ }
+ /*
+ * Prevent userspace races by waiting until the network
+ * device is fully setup before sending notifications.
+ */
+ if (!(dev->rtnl_link_ops && dev->rtnl_link_initializing))
+ rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL);
out:
return ret;
+
+err_uninit_notify:
+ call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev);
+err_ifindex_release:
+ dev_index_release(net, dev->ifindex);
+err_free_pcpu:
+ netdev_do_free_pcpu_stats(dev);
+err_uninit:
+ if (dev->netdev_ops->ndo_uninit)
+ dev->netdev_ops->ndo_uninit(dev);
+ if (dev->priv_destructor)
+ dev->priv_destructor(dev);
+err_free_name:
+ netdev_name_node_free(dev->name_node);
+ goto out;
+}
+EXPORT_SYMBOL(register_netdevice);
+
+/* Initialize the core of a dummy net device.
+ * The setup steps dummy netdevs need which normal netdevs get by going
+ * through register_netdevice().
+ */
+static void init_dummy_netdev(struct net_device *dev)
+{
+ /* make sure we BUG if trying to hit standard
+ * register/unregister code path
+ */
+ dev->reg_state = NETREG_DUMMY;
+
+ /* a dummy interface is started by default */
+ set_bit(__LINK_STATE_PRESENT, &dev->state);
+ set_bit(__LINK_STATE_START, &dev->state);
+
+ /* Note : We dont allocate pcpu_refcnt for dummy devices,
+ * because users of this 'device' dont need to change
+ * its refcount.
+ */
}
/**
@@ -3005,35 +11492,47 @@ out:
* chain. 0 is returned on success. A negative errno code is returned
* on a failure to set up the device, or if the name is a duplicate.
*
- * This is a wrapper around register_netdev that takes the rtnl semaphore
+ * This is a wrapper around register_netdevice that takes the rtnl semaphore
* and expands the device name if you passed a format string to
* alloc_netdev.
*/
int register_netdev(struct net_device *dev)
{
+ struct net *net = dev_net(dev);
int err;
- rtnl_lock();
+ if (rtnl_net_lock_killable(net))
+ return -EINTR;
- /*
- * If the name is a format string the caller wants us to do a
- * name allocation.
- */
- if (strchr(dev->name, '%')) {
- err = dev_alloc_name(dev, dev->name);
- if (err < 0)
- goto out;
- }
-
err = register_netdevice(dev);
-out:
- rtnl_unlock();
+
+ rtnl_net_unlock(net);
+
return err;
}
EXPORT_SYMBOL(register_netdev);
-/*
- * netdev_wait_allrefs - wait until all references are gone.
+int netdev_refcnt_read(const struct net_device *dev)
+{
+#ifdef CONFIG_PCPU_DEV_REFCNT
+ int i, refcnt = 0;
+
+ for_each_possible_cpu(i)
+ refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
+ return refcnt;
+#else
+ return refcount_read(&dev->dev_refcnt);
+#endif
+}
+EXPORT_SYMBOL(netdev_refcnt_read);
+
+int netdev_unregister_timeout_secs __read_mostly = 10;
+
+#define WAIT_REFS_MIN_MSECS 1
+#define WAIT_REFS_MAX_MSECS 250
+/**
+ * netdev_wait_allrefs_any - wait until all references are gone.
+ * @list: list of net_devices to wait on
*
* This is called when unregistering network devices.
*
@@ -3041,44 +11540,71 @@ EXPORT_SYMBOL(register_netdev);
* for netdevice notification, and cleanup and put back the
* reference if they receive an UNREGISTER event.
* We can get stuck here if buggy protocols don't correctly
- * call dev_put.
+ * call dev_put.
*/
-static void netdev_wait_allrefs(struct net_device *dev)
+static struct net_device *netdev_wait_allrefs_any(struct list_head *list)
{
unsigned long rebroadcast_time, warning_time;
+ struct net_device *dev;
+ int wait = 0;
rebroadcast_time = warning_time = jiffies;
- while (atomic_read(&dev->refcnt) != 0) {
+
+ list_for_each_entry(dev, list, todo_list)
+ if (netdev_refcnt_read(dev) == 1)
+ return dev;
+
+ while (true) {
if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
rtnl_lock();
/* Rebroadcast unregister notification */
- raw_notifier_call_chain(&netdev_chain,
- NETDEV_UNREGISTER, dev);
-
- if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
- &dev->state)) {
- /* We must not have linkwatch events
- * pending on unregister. If this
- * happens, we simply run the queue
- * unscheduled, resulting in a noop
- * for this device.
- */
- linkwatch_run_queue();
- }
+ list_for_each_entry(dev, list, todo_list)
+ call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
+
+ __rtnl_unlock();
+ rcu_barrier();
+ rtnl_lock();
+
+ list_for_each_entry(dev, list, todo_list)
+ if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
+ &dev->state)) {
+ /* We must not have linkwatch events
+ * pending on unregister. If this
+ * happens, we simply run the queue
+ * unscheduled, resulting in a noop
+ * for this device.
+ */
+ linkwatch_run_queue();
+ break;
+ }
__rtnl_unlock();
rebroadcast_time = jiffies;
}
- msleep(250);
+ rcu_barrier();
+
+ if (!wait) {
+ wait = WAIT_REFS_MIN_MSECS;
+ } else {
+ msleep(wait);
+ wait = min(wait << 1, WAIT_REFS_MAX_MSECS);
+ }
+
+ list_for_each_entry(dev, list, todo_list)
+ if (netdev_refcnt_read(dev) == 1)
+ return dev;
+
+ if (time_after(jiffies, warning_time +
+ READ_ONCE(netdev_unregister_timeout_secs) * HZ)) {
+ list_for_each_entry(dev, list, todo_list) {
+ pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
+ dev->name, netdev_refcnt_read(dev));
+ ref_tracker_dir_print(&dev->refcnt_tracker, 10);
+ }
- if (time_after(jiffies, warning_time + 10 * HZ)) {
- printk(KERN_EMERG "unregister_netdevice: "
- "waiting for %s to become free. Usage "
- "count = %d\n",
- dev->name, atomic_read(&dev->refcnt));
warning_time = jiffies;
}
}
@@ -3098,234 +11624,833 @@ static void netdev_wait_allrefs(struct net_device *dev)
* free_netdev(y1);
* free_netdev(y2);
*
- * We are invoked by rtnl_unlock() after it drops the semaphore.
+ * We are invoked by rtnl_unlock().
* This allows us to deal with problems:
* 1) We can delete sysfs objects which invoke hotplug
* without deadlocking with linkwatch via keventd.
* 2) Since we run with the RTNL semaphore not held, we can sleep
* safely in order to wait for the netdev refcnt to drop to zero.
+ *
+ * We must not return until all unregister events added during
+ * the interval the lock was held have been completed.
*/
-static DEFINE_MUTEX(net_todo_run_mutex);
void netdev_run_todo(void)
{
+ struct net_device *dev, *tmp;
struct list_head list;
+ int cnt;
+#ifdef CONFIG_LOCKDEP
+ struct list_head unlink_list;
- /* Need to guard against multiple cpu's getting out of order. */
- mutex_lock(&net_todo_run_mutex);
+ list_replace_init(&net_unlink_list, &unlink_list);
- /* Not safe to do outside the semaphore. We must not return
- * until all unregister events invoked by the local processor
- * have been completed (either by this todo run, or one on
- * another cpu).
- */
- if (list_empty(&net_todo_list))
- goto out;
+ while (!list_empty(&unlink_list)) {
+ dev = list_first_entry(&unlink_list, struct net_device,
+ unlink_list);
+ list_del_init(&dev->unlink_list);
+ dev->nested_level = dev->lower_level - 1;
+ }
+#endif
/* Snapshot list, allow later requests */
- spin_lock(&net_todo_list_lock);
list_replace_init(&net_todo_list, &list);
- spin_unlock(&net_todo_list_lock);
- while (!list_empty(&list)) {
- struct net_device *dev
- = list_entry(list.next, struct net_device, todo_list);
- list_del(&dev->todo_list);
+ __rtnl_unlock();
+
+ /* Wait for rcu callbacks to finish before next phase */
+ if (!list_empty(&list))
+ rcu_barrier();
+ list_for_each_entry_safe(dev, tmp, &list, todo_list) {
if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
- printk(KERN_ERR "network todo '%s' but state %d\n",
- dev->name, dev->reg_state);
- dump_stack();
+ netdev_WARN(dev, "run_todo but not unregistering\n");
+ list_del(&dev->todo_list);
continue;
}
- netdev_unregister_sysfs(dev);
- dev->reg_state = NETREG_UNREGISTERED;
+ netdev_lock(dev);
+ WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERED);
+ netdev_unlock(dev);
+ linkwatch_sync_dev(dev);
+ }
- netdev_wait_allrefs(dev);
+ cnt = 0;
+ while (!list_empty(&list)) {
+ dev = netdev_wait_allrefs_any(&list);
+ list_del(&dev->todo_list);
/* paranoia */
- BUG_ON(atomic_read(&dev->refcnt));
- BUG_TRAP(!dev->ip_ptr);
- BUG_TRAP(!dev->ip6_ptr);
- BUG_TRAP(!dev->dn_ptr);
+ BUG_ON(netdev_refcnt_read(dev) != 1);
+ BUG_ON(!list_empty(&dev->ptype_all));
+ BUG_ON(!list_empty(&dev->ptype_specific));
+ WARN_ON(rcu_access_pointer(dev->ip_ptr));
+ WARN_ON(rcu_access_pointer(dev->ip6_ptr));
+
+ netdev_do_free_pcpu_stats(dev);
+ if (dev->priv_destructor)
+ dev->priv_destructor(dev);
+ if (dev->needs_free_netdev)
+ free_netdev(dev);
+
+ cnt++;
+
+ /* Free network device */
+ kobject_put(&dev->dev.kobj);
+ }
+ if (cnt && atomic_sub_and_test(cnt, &dev_unreg_count))
+ wake_up(&netdev_unregistering_wq);
+}
- /* It must be the very last action,
- * after this 'dev' may point to freed up memory.
- */
- if (dev->destructor)
- dev->destructor(dev);
+/* Collate per-cpu network dstats statistics
+ *
+ * Read per-cpu network statistics from dev->dstats and populate the related
+ * fields in @s.
+ */
+static void dev_fetch_dstats(struct rtnl_link_stats64 *s,
+ const struct pcpu_dstats __percpu *dstats)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ u64 rx_packets, rx_bytes, rx_drops;
+ u64 tx_packets, tx_bytes, tx_drops;
+ const struct pcpu_dstats *stats;
+ unsigned int start;
+
+ stats = per_cpu_ptr(dstats, cpu);
+ do {
+ start = u64_stats_fetch_begin(&stats->syncp);
+ rx_packets = u64_stats_read(&stats->rx_packets);
+ rx_bytes = u64_stats_read(&stats->rx_bytes);
+ rx_drops = u64_stats_read(&stats->rx_drops);
+ tx_packets = u64_stats_read(&stats->tx_packets);
+ tx_bytes = u64_stats_read(&stats->tx_bytes);
+ tx_drops = u64_stats_read(&stats->tx_drops);
+ } while (u64_stats_fetch_retry(&stats->syncp, start));
+
+ s->rx_packets += rx_packets;
+ s->rx_bytes += rx_bytes;
+ s->rx_dropped += rx_drops;
+ s->tx_packets += tx_packets;
+ s->tx_bytes += tx_bytes;
+ s->tx_dropped += tx_drops;
}
+}
-out:
- mutex_unlock(&net_todo_run_mutex);
+/* ndo_get_stats64 implementation for dtstats-based accounting.
+ *
+ * Populate @s from dev->stats and dev->dstats. This is used internally by the
+ * core for NETDEV_PCPU_STAT_DSTAT-type stats collection.
+ */
+static void dev_get_dstats64(const struct net_device *dev,
+ struct rtnl_link_stats64 *s)
+{
+ netdev_stats_to_stats64(s, &dev->stats);
+ dev_fetch_dstats(s, dev->dstats);
}
+/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
+ * all the same fields in the same order as net_device_stats, with only
+ * the type differing, but rtnl_link_stats64 may have additional fields
+ * at the end for newer counters.
+ */
+void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
+ const struct net_device_stats *netdev_stats)
+{
+ size_t i, n = sizeof(*netdev_stats) / sizeof(atomic_long_t);
+ const atomic_long_t *src = (atomic_long_t *)netdev_stats;
+ u64 *dst = (u64 *)stats64;
+
+ BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
+ for (i = 0; i < n; i++)
+ dst[i] = (unsigned long)atomic_long_read(&src[i]);
+ /* zero out counters that only exist in rtnl_link_stats64 */
+ memset((char *)stats64 + n * sizeof(u64), 0,
+ sizeof(*stats64) - n * sizeof(u64));
+}
+EXPORT_SYMBOL(netdev_stats_to_stats64);
+
+static __cold struct net_device_core_stats __percpu *netdev_core_stats_alloc(
+ struct net_device *dev)
+{
+ struct net_device_core_stats __percpu *p;
+
+ p = alloc_percpu_gfp(struct net_device_core_stats,
+ GFP_ATOMIC | __GFP_NOWARN);
+
+ if (p && cmpxchg(&dev->core_stats, NULL, p))
+ free_percpu(p);
+
+ /* This READ_ONCE() pairs with the cmpxchg() above */
+ return READ_ONCE(dev->core_stats);
+}
+
+noinline void netdev_core_stats_inc(struct net_device *dev, u32 offset)
+{
+ /* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */
+ struct net_device_core_stats __percpu *p = READ_ONCE(dev->core_stats);
+ unsigned long __percpu *field;
+
+ if (unlikely(!p)) {
+ p = netdev_core_stats_alloc(dev);
+ if (!p)
+ return;
+ }
+
+ field = (unsigned long __percpu *)((void __percpu *)p + offset);
+ this_cpu_inc(*field);
+}
+EXPORT_SYMBOL_GPL(netdev_core_stats_inc);
+
/**
- * alloc_netdev - allocate network device
- * @sizeof_priv: size of private data to allocate space for
- * @name: device name format string
- * @setup: callback to initialize device
+ * dev_get_stats - get network device statistics
+ * @dev: device to get statistics from
+ * @storage: place to store stats
*
- * Allocates a struct net_device with private data area for driver use
- * and performs basic initialization.
+ * Get network statistics from device. Return @storage.
+ * The device driver may provide its own method by setting
+ * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
+ * otherwise the internal statistics structure is used.
*/
-struct net_device *alloc_netdev(int sizeof_priv, const char *name,
- void (*setup)(struct net_device *))
+struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
+ struct rtnl_link_stats64 *storage)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+ const struct net_device_core_stats __percpu *p;
+
+ /*
+ * IPv{4,6} and udp tunnels share common stat helpers and use
+ * different stat type (NETDEV_PCPU_STAT_TSTATS vs
+ * NETDEV_PCPU_STAT_DSTATS). Ensure the accounting is consistent.
+ */
+ BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, rx_bytes) !=
+ offsetof(struct pcpu_dstats, rx_bytes));
+ BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, rx_packets) !=
+ offsetof(struct pcpu_dstats, rx_packets));
+ BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, tx_bytes) !=
+ offsetof(struct pcpu_dstats, tx_bytes));
+ BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, tx_packets) !=
+ offsetof(struct pcpu_dstats, tx_packets));
+
+ if (ops->ndo_get_stats64) {
+ memset(storage, 0, sizeof(*storage));
+ ops->ndo_get_stats64(dev, storage);
+ } else if (ops->ndo_get_stats) {
+ netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
+ } else if (dev->pcpu_stat_type == NETDEV_PCPU_STAT_TSTATS) {
+ dev_get_tstats64(dev, storage);
+ } else if (dev->pcpu_stat_type == NETDEV_PCPU_STAT_DSTATS) {
+ dev_get_dstats64(dev, storage);
+ } else {
+ netdev_stats_to_stats64(storage, &dev->stats);
+ }
+
+ /* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */
+ p = READ_ONCE(dev->core_stats);
+ if (p) {
+ const struct net_device_core_stats *core_stats;
+ int i;
+
+ for_each_possible_cpu(i) {
+ core_stats = per_cpu_ptr(p, i);
+ storage->rx_dropped += READ_ONCE(core_stats->rx_dropped);
+ storage->tx_dropped += READ_ONCE(core_stats->tx_dropped);
+ storage->rx_nohandler += READ_ONCE(core_stats->rx_nohandler);
+ storage->rx_otherhost_dropped += READ_ONCE(core_stats->rx_otherhost_dropped);
+ }
+ }
+ return storage;
+}
+EXPORT_SYMBOL(dev_get_stats);
+
+/**
+ * dev_fetch_sw_netstats - get per-cpu network device statistics
+ * @s: place to store stats
+ * @netstats: per-cpu network stats to read from
+ *
+ * Read per-cpu network statistics and populate the related fields in @s.
+ */
+void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s,
+ const struct pcpu_sw_netstats __percpu *netstats)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
+ const struct pcpu_sw_netstats *stats;
+ unsigned int start;
+
+ stats = per_cpu_ptr(netstats, cpu);
+ do {
+ start = u64_stats_fetch_begin(&stats->syncp);
+ rx_packets = u64_stats_read(&stats->rx_packets);
+ rx_bytes = u64_stats_read(&stats->rx_bytes);
+ tx_packets = u64_stats_read(&stats->tx_packets);
+ tx_bytes = u64_stats_read(&stats->tx_bytes);
+ } while (u64_stats_fetch_retry(&stats->syncp, start));
+
+ s->rx_packets += rx_packets;
+ s->rx_bytes += rx_bytes;
+ s->tx_packets += tx_packets;
+ s->tx_bytes += tx_bytes;
+ }
+}
+EXPORT_SYMBOL_GPL(dev_fetch_sw_netstats);
+
+/**
+ * dev_get_tstats64 - ndo_get_stats64 implementation
+ * @dev: device to get statistics from
+ * @s: place to store stats
+ *
+ * Populate @s from dev->stats and dev->tstats. Can be used as
+ * ndo_get_stats64() callback.
+ */
+void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s)
+{
+ netdev_stats_to_stats64(s, &dev->stats);
+ dev_fetch_sw_netstats(s, dev->tstats);
+}
+EXPORT_SYMBOL_GPL(dev_get_tstats64);
+
+struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
+{
+ struct netdev_queue *queue = dev_ingress_queue(dev);
+
+#ifdef CONFIG_NET_CLS_ACT
+ if (queue)
+ return queue;
+ queue = kzalloc(sizeof(*queue), GFP_KERNEL);
+ if (!queue)
+ return NULL;
+ netdev_init_one_queue(dev, queue, NULL);
+ RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
+ RCU_INIT_POINTER(queue->qdisc_sleeping, &noop_qdisc);
+ rcu_assign_pointer(dev->ingress_queue, queue);
+#endif
+ return queue;
+}
+
+static const struct ethtool_ops default_ethtool_ops;
+
+void netdev_set_default_ethtool_ops(struct net_device *dev,
+ const struct ethtool_ops *ops)
+{
+ if (dev->ethtool_ops == &default_ethtool_ops)
+ dev->ethtool_ops = ops;
+}
+EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
+
+/**
+ * netdev_sw_irq_coalesce_default_on() - enable SW IRQ coalescing by default
+ * @dev: netdev to enable the IRQ coalescing on
+ *
+ * Sets a conservative default for SW IRQ coalescing. Users can use
+ * sysfs attributes to override the default values.
+ */
+void netdev_sw_irq_coalesce_default_on(struct net_device *dev)
+{
+ WARN_ON(dev->reg_state == NETREG_REGISTERED);
+
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ netdev_set_gro_flush_timeout(dev, 20000);
+ netdev_set_defer_hard_irqs(dev, 1);
+ }
+}
+EXPORT_SYMBOL_GPL(netdev_sw_irq_coalesce_default_on);
+
+/**
+ * alloc_netdev_mqs - allocate network device
+ * @sizeof_priv: size of private data to allocate space for
+ * @name: device name format string
+ * @name_assign_type: origin of device name
+ * @setup: callback to initialize device
+ * @txqs: the number of TX subqueues to allocate
+ * @rxqs: the number of RX subqueues to allocate
+ *
+ * Allocates a struct net_device with private data area for driver use
+ * and performs basic initialization. Also allocates subqueue structs
+ * for each queue on the device.
+ */
+struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
+ unsigned char name_assign_type,
+ void (*setup)(struct net_device *),
+ unsigned int txqs, unsigned int rxqs)
{
- void *p;
struct net_device *dev;
- int alloc_size;
+ size_t napi_config_sz;
+ unsigned int maxqs;
BUG_ON(strlen(name) >= sizeof(dev->name));
- /* ensure 32-byte alignment of both the device and private area */
- alloc_size = (sizeof(*dev) + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST;
- alloc_size += sizeof_priv + NETDEV_ALIGN_CONST;
+ if (txqs < 1) {
+ pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
+ return NULL;
+ }
- p = kzalloc(alloc_size, GFP_KERNEL);
- if (!p) {
- printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
+ if (rxqs < 1) {
+ pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
return NULL;
}
- dev = (struct net_device *)
- (((long)p + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST);
- dev->padded = (char *)dev - (char *)p;
+ maxqs = max(txqs, rxqs);
- if (sizeof_priv)
- dev->priv = netdev_priv(dev);
+ dev = kvzalloc(struct_size(dev, priv, sizeof_priv),
+ GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
+ if (!dev)
+ return NULL;
+
+ dev->priv_len = sizeof_priv;
+ ref_tracker_dir_init(&dev->refcnt_tracker, 128, "netdev");
+#ifdef CONFIG_PCPU_DEV_REFCNT
+ dev->pcpu_refcnt = alloc_percpu(int);
+ if (!dev->pcpu_refcnt)
+ goto free_dev;
+ __dev_hold(dev);
+#else
+ refcount_set(&dev->dev_refcnt, 1);
+#endif
+
+ if (dev_addr_init(dev))
+ goto free_pcpu;
+
+ dev_mc_init(dev);
+ dev_uc_init(dev);
+
+ dev_net_set(dev, &init_net);
+
+ dev->gso_max_size = GSO_LEGACY_MAX_SIZE;
+ dev->xdp_zc_max_segs = 1;
+ dev->gso_max_segs = GSO_MAX_SEGS;
+ dev->gro_max_size = GRO_LEGACY_MAX_SIZE;
+ dev->gso_ipv4_max_size = GSO_LEGACY_MAX_SIZE;
+ dev->gro_ipv4_max_size = GRO_LEGACY_MAX_SIZE;
+ dev->tso_max_size = TSO_LEGACY_MAX_SIZE;
+ dev->tso_max_segs = TSO_MAX_SEGS;
+ dev->upper_level = 1;
+ dev->lower_level = 1;
+#ifdef CONFIG_LOCKDEP
+ dev->nested_level = 0;
+ INIT_LIST_HEAD(&dev->unlink_list);
+#endif
+
+ INIT_LIST_HEAD(&dev->napi_list);
+ INIT_LIST_HEAD(&dev->unreg_list);
+ INIT_LIST_HEAD(&dev->close_list);
+ INIT_LIST_HEAD(&dev->link_watch_list);
+ INIT_LIST_HEAD(&dev->adj_list.upper);
+ INIT_LIST_HEAD(&dev->adj_list.lower);
+ INIT_LIST_HEAD(&dev->ptype_all);
+ INIT_LIST_HEAD(&dev->ptype_specific);
+ INIT_LIST_HEAD(&dev->net_notifier_list);
+#ifdef CONFIG_NET_SCHED
+ hash_init(dev->qdisc_hash);
+#endif
+
+ mutex_init(&dev->lock);
+
+ dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
setup(dev);
- strcpy(dev->name, name);
+
+ if (!dev->tx_queue_len) {
+ dev->priv_flags |= IFF_NO_QUEUE;
+ dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
+ }
+
+ dev->num_tx_queues = txqs;
+ dev->real_num_tx_queues = txqs;
+ if (netif_alloc_netdev_queues(dev))
+ goto free_all;
+
+ dev->num_rx_queues = rxqs;
+ dev->real_num_rx_queues = rxqs;
+ if (netif_alloc_rx_queues(dev))
+ goto free_all;
+ dev->ethtool = kzalloc(sizeof(*dev->ethtool), GFP_KERNEL_ACCOUNT);
+ if (!dev->ethtool)
+ goto free_all;
+
+ dev->cfg = kzalloc(sizeof(*dev->cfg), GFP_KERNEL_ACCOUNT);
+ if (!dev->cfg)
+ goto free_all;
+ dev->cfg_pending = dev->cfg;
+
+ dev->num_napi_configs = maxqs;
+ napi_config_sz = array_size(maxqs, sizeof(*dev->napi_config));
+ dev->napi_config = kvzalloc(napi_config_sz, GFP_KERNEL_ACCOUNT);
+ if (!dev->napi_config)
+ goto free_all;
+
+ strscpy(dev->name, name);
+ dev->name_assign_type = name_assign_type;
+ dev->group = INIT_NETDEV_GROUP;
+ if (!dev->ethtool_ops)
+ dev->ethtool_ops = &default_ethtool_ops;
+
+ nf_hook_netdev_init(dev);
+
return dev;
+
+free_all:
+ free_netdev(dev);
+ return NULL;
+
+free_pcpu:
+#ifdef CONFIG_PCPU_DEV_REFCNT
+ free_percpu(dev->pcpu_refcnt);
+free_dev:
+#endif
+ kvfree(dev);
+ return NULL;
+}
+EXPORT_SYMBOL(alloc_netdev_mqs);
+
+static void netdev_napi_exit(struct net_device *dev)
+{
+ if (!list_empty(&dev->napi_list)) {
+ struct napi_struct *p, *n;
+
+ netdev_lock(dev);
+ list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
+ __netif_napi_del_locked(p);
+ netdev_unlock(dev);
+
+ synchronize_net();
+ }
+
+ kvfree(dev->napi_config);
}
-EXPORT_SYMBOL(alloc_netdev);
/**
- * free_netdev - free network device
- * @dev: device
+ * free_netdev - free network device
+ * @dev: device
*
- * This function does the last stage of destroying an allocated device
- * interface. The reference to the device object is released.
- * If this is the last reference then it will be freed.
+ * This function does the last stage of destroying an allocated device
+ * interface. The reference to the device object is released. If this
+ * is the last reference then it will be freed.Must be called in process
+ * context.
*/
void free_netdev(struct net_device *dev)
{
-#ifdef CONFIG_SYSFS
+ might_sleep();
+
+ /* When called immediately after register_netdevice() failed the unwind
+ * handling may still be dismantling the device. Handle that case by
+ * deferring the free.
+ */
+ if (dev->reg_state == NETREG_UNREGISTERING) {
+ ASSERT_RTNL();
+ dev->needs_free_netdev = true;
+ return;
+ }
+
+ WARN_ON(dev->cfg != dev->cfg_pending);
+ kfree(dev->cfg);
+ kfree(dev->ethtool);
+ netif_free_tx_queues(dev);
+ netif_free_rx_queues(dev);
+
+ kfree(rcu_dereference_protected(dev->ingress_queue, 1));
+
+ /* Flush device addresses */
+ dev_addr_flush(dev);
+
+ netdev_napi_exit(dev);
+
+ netif_del_cpu_rmap(dev);
+
+ ref_tracker_dir_exit(&dev->refcnt_tracker);
+#ifdef CONFIG_PCPU_DEV_REFCNT
+ free_percpu(dev->pcpu_refcnt);
+ dev->pcpu_refcnt = NULL;
+#endif
+ free_percpu(dev->core_stats);
+ dev->core_stats = NULL;
+ free_percpu(dev->xdp_bulkq);
+ dev->xdp_bulkq = NULL;
+
+ netdev_free_phy_link_topology(dev);
+
+ mutex_destroy(&dev->lock);
+
/* Compatibility with error handling in drivers */
- if (dev->reg_state == NETREG_UNINITIALIZED) {
- kfree((char *)dev - dev->padded);
+ if (dev->reg_state == NETREG_UNINITIALIZED ||
+ dev->reg_state == NETREG_DUMMY) {
+ kvfree(dev);
return;
}
BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
- dev->reg_state = NETREG_RELEASED;
+ WRITE_ONCE(dev->reg_state, NETREG_RELEASED);
- /* will free via class release */
- class_device_put(&dev->class_dev);
-#else
- kfree((char *)dev - dev->padded);
-#endif
+ /* will free via device release */
+ put_device(&dev->dev);
}
-
-/* Synchronize with packet receive processing. */
-void synchronize_net(void)
+EXPORT_SYMBOL(free_netdev);
+
+/**
+ * alloc_netdev_dummy - Allocate and initialize a dummy net device.
+ * @sizeof_priv: size of private data to allocate space for
+ *
+ * Return: the allocated net_device on success, NULL otherwise
+ */
+struct net_device *alloc_netdev_dummy(int sizeof_priv)
+{
+ return alloc_netdev(sizeof_priv, "dummy#", NET_NAME_UNKNOWN,
+ init_dummy_netdev);
+}
+EXPORT_SYMBOL_GPL(alloc_netdev_dummy);
+
+/**
+ * synchronize_net - Synchronize with packet receive processing
+ *
+ * Wait for packets currently being received to be done.
+ * Does not block later packets from starting.
+ */
+void synchronize_net(void)
{
might_sleep();
- synchronize_rcu();
+ if (from_cleanup_net() || rtnl_is_locked())
+ synchronize_rcu_expedited();
+ else
+ synchronize_rcu();
+}
+EXPORT_SYMBOL(synchronize_net);
+
+static void netdev_rss_contexts_free(struct net_device *dev)
+{
+ struct ethtool_rxfh_context *ctx;
+ unsigned long context;
+
+ mutex_lock(&dev->ethtool->rss_lock);
+ xa_for_each(&dev->ethtool->rss_ctx, context, ctx) {
+ xa_erase(&dev->ethtool->rss_ctx, context);
+ dev->ethtool_ops->remove_rxfh_context(dev, ctx, context, NULL);
+ kfree(ctx);
+ }
+ xa_destroy(&dev->ethtool->rss_ctx);
+ mutex_unlock(&dev->ethtool->rss_lock);
}
/**
- * unregister_netdevice - remove device from the kernel
+ * unregister_netdevice_queue - remove device from the kernel
* @dev: device
+ * @head: list
*
* This function shuts down a device interface and removes it
- * from the kernel tables. On success 0 is returned, on a failure
- * a negative errno code is returned.
+ * from the kernel tables.
+ * If head not NULL, device is queued to be unregistered later.
*
* Callers must hold the rtnl semaphore. You may want
* unregister_netdev() instead of this.
*/
-int unregister_netdevice(struct net_device *dev)
+void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
+{
+ ASSERT_RTNL();
+
+ if (head) {
+ list_move_tail(&dev->unreg_list, head);
+ } else {
+ LIST_HEAD(single);
+
+ list_add(&dev->unreg_list, &single);
+ unregister_netdevice_many(&single);
+ }
+}
+EXPORT_SYMBOL(unregister_netdevice_queue);
+
+static void dev_memory_provider_uninstall(struct net_device *dev)
{
- struct net_device *d, **dp;
+ unsigned int i;
+
+ for (i = 0; i < dev->real_num_rx_queues; i++) {
+ struct netdev_rx_queue *rxq = &dev->_rx[i];
+ struct pp_memory_provider_params *p = &rxq->mp_params;
+
+ if (p->mp_ops && p->mp_ops->uninstall)
+ p->mp_ops->uninstall(rxq->mp_params.mp_priv, rxq);
+ }
+}
+
+/* devices must be UP and netdev_lock()'d */
+static void netif_close_many_and_unlock(struct list_head *close_head)
+{
+ struct net_device *dev, *tmp;
+
+ netif_close_many(close_head, false);
+
+ /* ... now unlock them */
+ list_for_each_entry_safe(dev, tmp, close_head, close_list) {
+ netdev_unlock(dev);
+ list_del_init(&dev->close_list);
+ }
+}
+
+static void netif_close_many_and_unlock_cond(struct list_head *close_head)
+{
+#ifdef CONFIG_LOCKDEP
+ /* We can only track up to MAX_LOCK_DEPTH locks per task.
+ *
+ * Reserve half the available slots for additional locks possibly
+ * taken by notifiers and (soft)irqs.
+ */
+ unsigned int limit = MAX_LOCK_DEPTH / 2;
+
+ if (lockdep_depth(current) > limit)
+ netif_close_many_and_unlock(close_head);
+#endif
+}
+
+void unregister_netdevice_many_notify(struct list_head *head,
+ u32 portid, const struct nlmsghdr *nlh)
+{
+ struct net_device *dev, *tmp;
+ LIST_HEAD(close_head);
+ int cnt = 0;
BUG_ON(dev_boot_phase);
ASSERT_RTNL();
- /* Some devices call without registering for initialization unwind. */
- if (dev->reg_state == NETREG_UNINITIALIZED) {
- printk(KERN_DEBUG "unregister_netdevice: device %s/%p never "
- "was registered\n", dev->name, dev);
- return -ENODEV;
- }
+ if (list_empty(head))
+ return;
- BUG_ON(dev->reg_state != NETREG_REGISTERED);
+ list_for_each_entry_safe(dev, tmp, head, unreg_list) {
+ /* Some devices call without registering
+ * for initialization unwind. Remove those
+ * devices and proceed with the remaining.
+ */
+ if (dev->reg_state == NETREG_UNINITIALIZED) {
+ pr_debug("unregister_netdevice: device %s/%p never was registered\n",
+ dev->name, dev);
- /* If device is running, close it first. */
- if (dev->flags & IFF_UP)
- dev_close(dev);
-
- /* And unlink it from device chain. */
- for (dp = &dev_base; (d = *dp) != NULL; dp = &d->next) {
- if (d == dev) {
- write_lock_bh(&dev_base_lock);
- hlist_del(&dev->name_hlist);
- hlist_del(&dev->index_hlist);
- if (dev_tail == &dev->next)
- dev_tail = dp;
- *dp = d->next;
- write_unlock_bh(&dev_base_lock);
- break;
+ WARN_ON(1);
+ list_del(&dev->unreg_list);
+ continue;
}
- }
- if (!d) {
- printk(KERN_ERR "unregister net_device: '%s' not found\n",
- dev->name);
- return -ENODEV;
+ dev->dismantle = true;
+ BUG_ON(dev->reg_state != NETREG_REGISTERED);
}
- dev->reg_state = NETREG_UNREGISTERING;
+ /* If device is running, close it first. Start with ops locked... */
+ list_for_each_entry(dev, head, unreg_list) {
+ if (!(dev->flags & IFF_UP))
+ continue;
+ if (netdev_need_ops_lock(dev)) {
+ list_add_tail(&dev->close_list, &close_head);
+ netdev_lock(dev);
+ }
+ netif_close_many_and_unlock_cond(&close_head);
+ }
+ netif_close_many_and_unlock(&close_head);
+ /* ... now go over the rest. */
+ list_for_each_entry(dev, head, unreg_list) {
+ if (!netdev_need_ops_lock(dev))
+ list_add_tail(&dev->close_list, &close_head);
+ }
+ netif_close_many(&close_head, true);
+
+ list_for_each_entry(dev, head, unreg_list) {
+ /* And unlink it from device chain. */
+ unlist_netdevice(dev);
+ netdev_lock(dev);
+ WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERING);
+ netdev_unlock(dev);
+ }
+ flush_all_backlogs();
synchronize_net();
- /* Shutdown queueing discipline. */
- dev_shutdown(dev);
+ list_for_each_entry(dev, head, unreg_list) {
+ struct sk_buff *skb = NULL;
-
- /* Notify protocols, that we are about to destroy
- this device. They should clean all the things.
- */
- raw_notifier_call_chain(&netdev_chain, NETDEV_UNREGISTER, dev);
-
- /*
- * Flush the multicast chain
- */
- dev_mc_discard(dev);
+ /* Shutdown queueing discipline. */
+ netdev_lock_ops(dev);
+ dev_shutdown(dev);
+ dev_tcx_uninstall(dev);
+ dev_xdp_uninstall(dev);
+ dev_memory_provider_uninstall(dev);
+ netdev_unlock_ops(dev);
+ bpf_dev_bound_netdev_unregister(dev);
+
+ netdev_offload_xstats_disable_all(dev);
+
+ /* Notify protocols, that we are about to destroy
+ * this device. They should clean all the things.
+ */
+ call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
+
+ if (!(dev->rtnl_link_ops && dev->rtnl_link_initializing))
+ skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
+ GFP_KERNEL, NULL, 0,
+ portid, nlh);
+
+ /*
+ * Flush the unicast and multicast chains
+ */
+ dev_uc_flush(dev);
+ dev_mc_flush(dev);
+
+ netdev_name_node_alt_flush(dev);
+ netdev_name_node_free(dev->name_node);
+
+ netdev_rss_contexts_free(dev);
+
+ call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev);
+
+ if (dev->netdev_ops->ndo_uninit)
+ dev->netdev_ops->ndo_uninit(dev);
- if (dev->uninit)
- dev->uninit(dev);
+ mutex_destroy(&dev->ethtool->rss_lock);
- /* Notifier chain MUST detach us from master device. */
- BUG_TRAP(!dev->master);
+ net_shaper_flush_netdev(dev);
- /* Finish processing unregister after unlock */
- net_set_todo(dev);
+ if (skb)
+ rtmsg_ifinfo_send(skb, dev, GFP_KERNEL, portid, nlh);
+
+ /* Notifier chain MUST detach us all upper devices. */
+ WARN_ON(netdev_has_any_upper_dev(dev));
+ WARN_ON(netdev_has_any_lower_dev(dev));
+
+ /* Remove entries from kobject tree */
+ netdev_unregister_kobject(dev);
+#ifdef CONFIG_XPS
+ /* Remove XPS queueing entries */
+ netif_reset_xps_queues_gt(dev, 0);
+#endif
+ }
synchronize_net();
- dev_put(dev);
- return 0;
+ list_for_each_entry(dev, head, unreg_list) {
+ netdev_put(dev, &dev->dev_registered_tracker);
+ net_set_todo(dev);
+ cnt++;
+ }
+ atomic_add(cnt, &dev_unreg_count);
+
+ list_del(head);
}
/**
+ * unregister_netdevice_many - unregister many devices
+ * @head: list of devices
+ *
+ * Note: As most callers use a stack allocated list_head,
+ * we force a list_del() to make sure stack won't be corrupted later.
+ */
+void unregister_netdevice_many(struct list_head *head)
+{
+ unregister_netdevice_many_notify(head, 0, NULL);
+}
+EXPORT_SYMBOL(unregister_netdevice_many);
+
+/**
* unregister_netdev - remove device from the kernel
* @dev: device
*
* This function shuts down a device interface and removes it
- * from the kernel tables. On success 0 is returned, on a failure
- * a negative errno code is returned.
+ * from the kernel tables.
*
* This is just a wrapper for unregister_netdevice that takes
* the rtnl semaphore. In general you want to use this and not
@@ -3333,26 +12458,201 @@ int unregister_netdevice(struct net_device *dev)
*/
void unregister_netdev(struct net_device *dev)
{
- rtnl_lock();
+ rtnl_net_dev_lock(dev);
unregister_netdevice(dev);
- rtnl_unlock();
+ rtnl_net_dev_unlock(dev);
}
-
EXPORT_SYMBOL(unregister_netdev);
-#ifdef CONFIG_HOTPLUG_CPU
-static int dev_cpu_callback(struct notifier_block *nfb,
- unsigned long action,
- void *ocpu)
+int __dev_change_net_namespace(struct net_device *dev, struct net *net,
+ const char *pat, int new_ifindex,
+ struct netlink_ext_ack *extack)
+{
+ struct netdev_name_node *name_node;
+ struct net *net_old = dev_net(dev);
+ char new_name[IFNAMSIZ] = {};
+ int err, new_nsid;
+
+ ASSERT_RTNL();
+
+ /* Don't allow namespace local devices to be moved. */
+ err = -EINVAL;
+ if (dev->netns_immutable) {
+ NL_SET_ERR_MSG(extack, "The interface netns is immutable");
+ goto out;
+ }
+
+ /* Ensure the device has been registered */
+ if (dev->reg_state != NETREG_REGISTERED) {
+ NL_SET_ERR_MSG(extack, "The interface isn't registered");
+ goto out;
+ }
+
+ /* Get out if there is nothing todo */
+ err = 0;
+ if (net_eq(net_old, net))
+ goto out;
+
+ /* Pick the destination device name, and ensure
+ * we can use it in the destination network namespace.
+ */
+ err = -EEXIST;
+ if (netdev_name_in_use(net, dev->name)) {
+ /* We get here if we can't use the current device name */
+ if (!pat) {
+ NL_SET_ERR_MSG(extack,
+ "An interface with the same name exists in the target netns");
+ goto out;
+ }
+ err = dev_prep_valid_name(net, dev, pat, new_name, EEXIST);
+ if (err < 0) {
+ NL_SET_ERR_MSG_FMT(extack,
+ "Unable to use '%s' for the new interface name in the target netns",
+ pat);
+ goto out;
+ }
+ }
+ /* Check that none of the altnames conflicts. */
+ err = -EEXIST;
+ netdev_for_each_altname(dev, name_node) {
+ if (netdev_name_in_use(net, name_node->name)) {
+ NL_SET_ERR_MSG_FMT(extack,
+ "An interface with the altname %s exists in the target netns",
+ name_node->name);
+ goto out;
+ }
+ }
+
+ /* Check that new_ifindex isn't used yet. */
+ if (new_ifindex) {
+ err = dev_index_reserve(net, new_ifindex);
+ if (err < 0) {
+ NL_SET_ERR_MSG_FMT(extack,
+ "The ifindex %d is not available in the target netns",
+ new_ifindex);
+ goto out;
+ }
+ } else {
+ /* If there is an ifindex conflict assign a new one */
+ err = dev_index_reserve(net, dev->ifindex);
+ if (err == -EBUSY)
+ err = dev_index_reserve(net, 0);
+ if (err < 0) {
+ NL_SET_ERR_MSG(extack,
+ "Unable to allocate a new ifindex in the target netns");
+ goto out;
+ }
+ new_ifindex = err;
+ }
+
+ /*
+ * And now a mini version of register_netdevice unregister_netdevice.
+ */
+
+ netdev_lock_ops(dev);
+ /* If device is running close it first. */
+ netif_close(dev);
+ /* And unlink it from device chain */
+ unlist_netdevice(dev);
+
+ if (!netdev_need_ops_lock(dev))
+ netdev_lock(dev);
+ dev->moving_ns = true;
+ netdev_unlock(dev);
+
+ synchronize_net();
+
+ /* Shutdown queueing discipline. */
+ netdev_lock_ops(dev);
+ dev_shutdown(dev);
+ netdev_unlock_ops(dev);
+
+ /* Notify protocols, that we are about to destroy
+ * this device. They should clean all the things.
+ *
+ * Note that dev->reg_state stays at NETREG_REGISTERED.
+ * This is wanted because this way 8021q and macvlan know
+ * the device is just moving and can keep their slaves up.
+ */
+ call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
+ rcu_barrier();
+
+ new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL);
+
+ rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid,
+ new_ifindex);
+
+ /*
+ * Flush the unicast and multicast chains
+ */
+ dev_uc_flush(dev);
+ dev_mc_flush(dev);
+
+ /* Send a netdev-removed uevent to the old namespace */
+ kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
+ netdev_adjacent_del_links(dev);
+
+ /* Move per-net netdevice notifiers that are following the netdevice */
+ move_netdevice_notifiers_dev_net(dev, net);
+
+ /* Actually switch the network namespace */
+ netdev_lock(dev);
+ dev_net_set(dev, net);
+ netdev_unlock(dev);
+ dev->ifindex = new_ifindex;
+
+ if (new_name[0]) {
+ /* Rename the netdev to prepared name */
+ write_seqlock_bh(&netdev_rename_lock);
+ strscpy(dev->name, new_name, IFNAMSIZ);
+ write_sequnlock_bh(&netdev_rename_lock);
+ }
+
+ /* Fixup kobjects */
+ dev_set_uevent_suppress(&dev->dev, 1);
+ err = device_rename(&dev->dev, dev->name);
+ dev_set_uevent_suppress(&dev->dev, 0);
+ WARN_ON(err);
+
+ /* Send a netdev-add uevent to the new namespace */
+ kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
+ netdev_adjacent_add_links(dev);
+
+ /* Adapt owner in case owning user namespace of target network
+ * namespace is different from the original one.
+ */
+ err = netdev_change_owner(dev, net_old, net);
+ WARN_ON(err);
+
+ netdev_lock(dev);
+ dev->moving_ns = false;
+ if (!netdev_need_ops_lock(dev))
+ netdev_unlock(dev);
+
+ /* Add the device back in the hashes */
+ list_netdevice(dev);
+ /* Notify protocols, that a new device appeared. */
+ call_netdevice_notifiers(NETDEV_REGISTER, dev);
+ netdev_unlock_ops(dev);
+
+ /*
+ * Prevent userspace races by waiting until the network
+ * device is fully setup before sending notifications.
+ */
+ rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL);
+
+ synchronize_net();
+ err = 0;
+out:
+ return err;
+}
+
+static int dev_cpu_dead(unsigned int oldcpu)
{
struct sk_buff **list_skb;
- struct net_device **list_net;
struct sk_buff *skb;
- unsigned int cpu, oldcpu = (unsigned long)ocpu;
- struct softnet_data *sd, *oldsd;
-
- if (action != CPU_DEAD)
- return NOTIFY_OK;
+ unsigned int cpu;
+ struct softnet_data *sd, *oldsd, *remsd = NULL;
local_irq_disable();
cpu = smp_processor_id();
@@ -3367,102 +12667,434 @@ static int dev_cpu_callback(struct notifier_block *nfb,
*list_skb = oldsd->completion_queue;
oldsd->completion_queue = NULL;
- /* Find end of our output_queue. */
- list_net = &sd->output_queue;
- while (*list_net)
- list_net = &(*list_net)->next_sched;
/* Append output queue from offline CPU. */
- *list_net = oldsd->output_queue;
- oldsd->output_queue = NULL;
+ if (oldsd->output_queue) {
+ *sd->output_queue_tailp = oldsd->output_queue;
+ sd->output_queue_tailp = oldsd->output_queue_tailp;
+ oldsd->output_queue = NULL;
+ oldsd->output_queue_tailp = &oldsd->output_queue;
+ }
+ /* Append NAPI poll list from offline CPU, with one exception :
+ * process_backlog() must be called by cpu owning percpu backlog.
+ * We properly handle process_queue & input_pkt_queue later.
+ */
+ while (!list_empty(&oldsd->poll_list)) {
+ struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
+ struct napi_struct,
+ poll_list);
+
+ list_del_init(&napi->poll_list);
+ if (napi->poll == process_backlog)
+ napi->state &= NAPIF_STATE_THREADED;
+ else
+ ____napi_schedule(sd, napi);
+ }
raise_softirq_irqoff(NET_TX_SOFTIRQ);
local_irq_enable();
+ if (!use_backlog_threads()) {
+#ifdef CONFIG_RPS
+ remsd = oldsd->rps_ipi_list;
+ oldsd->rps_ipi_list = NULL;
+#endif
+ /* send out pending IPI's on offline CPU */
+ net_rps_send_ipi(remsd);
+ }
+
/* Process offline CPU's input_pkt_queue */
- while ((skb = __skb_dequeue(&oldsd->input_pkt_queue)))
+ while ((skb = __skb_dequeue(&oldsd->process_queue))) {
+ netif_rx(skb);
+ rps_input_queue_head_incr(oldsd);
+ }
+ while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
netif_rx(skb);
+ rps_input_queue_head_incr(oldsd);
+ }
- return NOTIFY_OK;
+ return 0;
}
-#endif /* CONFIG_HOTPLUG_CPU */
-#ifdef CONFIG_NET_DMA
/**
- * net_dma_rebalance -
- * This is called when the number of channels allocated to the net_dma_client
- * changes. The net_dma_client tries to have one DMA channel per CPU.
+ * netdev_increment_features - increment feature set by one
+ * @all: current feature set
+ * @one: new feature set
+ * @mask: mask feature set
+ *
+ * Computes a new feature set after adding a device with feature set
+ * @one to the master device with current feature set @all. Will not
+ * enable anything that is off in @mask. Returns the new feature set.
*/
-static void net_dma_rebalance(void)
+netdev_features_t netdev_increment_features(netdev_features_t all,
+ netdev_features_t one, netdev_features_t mask)
{
- unsigned int cpu, i, n;
- struct dma_chan *chan;
+ if (mask & NETIF_F_HW_CSUM)
+ mask |= NETIF_F_CSUM_MASK;
+ mask |= NETIF_F_VLAN_CHALLENGED;
- if (net_dma_count == 0) {
- for_each_online_cpu(cpu)
- rcu_assign_pointer(per_cpu(softnet_data, cpu).net_dma, NULL);
- return;
- }
+ all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
+ all &= one | ~NETIF_F_ALL_FOR_ALL;
- i = 0;
- cpu = first_cpu(cpu_online_map);
+ /* If one device supports hw checksumming, set for all. */
+ if (all & NETIF_F_HW_CSUM)
+ all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
- rcu_read_lock();
- list_for_each_entry(chan, &net_dma_client->channels, client_node) {
- n = ((num_online_cpus() / net_dma_count)
- + (i < (num_online_cpus() % net_dma_count) ? 1 : 0));
+ return all;
+}
+EXPORT_SYMBOL(netdev_increment_features);
- while(n) {
- per_cpu(softnet_data, cpu).net_dma = chan;
- cpu = next_cpu(cpu, cpu_online_map);
- n--;
+/**
+ * netdev_compute_master_upper_features - compute feature from lowers
+ * @dev: the upper device
+ * @update_header: whether to update upper device's header_len/headroom/tailroom
+ *
+ * Recompute the upper device's feature based on all lower devices.
+ */
+void netdev_compute_master_upper_features(struct net_device *dev, bool update_header)
+{
+ unsigned int dst_release_flag = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
+ netdev_features_t gso_partial_features = MASTER_UPPER_DEV_GSO_PARTIAL_FEATURES;
+ netdev_features_t xfrm_features = MASTER_UPPER_DEV_XFRM_FEATURES;
+ netdev_features_t mpls_features = MASTER_UPPER_DEV_MPLS_FEATURES;
+ netdev_features_t vlan_features = MASTER_UPPER_DEV_VLAN_FEATURES;
+ netdev_features_t enc_features = MASTER_UPPER_DEV_ENC_FEATURES;
+ unsigned short max_header_len = ETH_HLEN;
+ unsigned int tso_max_size = TSO_MAX_SIZE;
+ unsigned short max_headroom = 0;
+ unsigned short max_tailroom = 0;
+ u16 tso_max_segs = TSO_MAX_SEGS;
+ struct net_device *lower_dev;
+ struct list_head *iter;
+
+ mpls_features = netdev_base_features(mpls_features);
+ vlan_features = netdev_base_features(vlan_features);
+ enc_features = netdev_base_features(enc_features);
+
+ netdev_for_each_lower_dev(dev, lower_dev, iter) {
+ gso_partial_features = netdev_increment_features(gso_partial_features,
+ lower_dev->gso_partial_features,
+ MASTER_UPPER_DEV_GSO_PARTIAL_FEATURES);
+
+ vlan_features = netdev_increment_features(vlan_features,
+ lower_dev->vlan_features,
+ MASTER_UPPER_DEV_VLAN_FEATURES);
+
+ enc_features = netdev_increment_features(enc_features,
+ lower_dev->hw_enc_features,
+ MASTER_UPPER_DEV_ENC_FEATURES);
+
+ if (IS_ENABLED(CONFIG_XFRM_OFFLOAD))
+ xfrm_features = netdev_increment_features(xfrm_features,
+ lower_dev->hw_enc_features,
+ MASTER_UPPER_DEV_XFRM_FEATURES);
+
+ mpls_features = netdev_increment_features(mpls_features,
+ lower_dev->mpls_features,
+ MASTER_UPPER_DEV_MPLS_FEATURES);
+
+ dst_release_flag &= lower_dev->priv_flags;
+
+ if (update_header) {
+ max_header_len = max(max_header_len, lower_dev->hard_header_len);
+ max_headroom = max(max_headroom, lower_dev->needed_headroom);
+ max_tailroom = max(max_tailroom, lower_dev->needed_tailroom);
}
- i++;
+
+ tso_max_size = min(tso_max_size, lower_dev->tso_max_size);
+ tso_max_segs = min(tso_max_segs, lower_dev->tso_max_segs);
}
- rcu_read_unlock();
+
+ dev->gso_partial_features = gso_partial_features;
+ dev->vlan_features = vlan_features;
+ dev->hw_enc_features = enc_features | NETIF_F_GSO_ENCAP_ALL |
+ NETIF_F_HW_VLAN_CTAG_TX |
+ NETIF_F_HW_VLAN_STAG_TX;
+ if (IS_ENABLED(CONFIG_XFRM_OFFLOAD))
+ dev->hw_enc_features |= xfrm_features;
+ dev->mpls_features = mpls_features;
+
+ dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
+ if ((dev->priv_flags & IFF_XMIT_DST_RELEASE_PERM) &&
+ dst_release_flag == (IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM))
+ dev->priv_flags |= IFF_XMIT_DST_RELEASE;
+
+ if (update_header) {
+ dev->hard_header_len = max_header_len;
+ dev->needed_headroom = max_headroom;
+ dev->needed_tailroom = max_tailroom;
+ }
+
+ netif_set_tso_max_segs(dev, tso_max_segs);
+ netif_set_tso_max_size(dev, tso_max_size);
+
+ netdev_change_features(dev);
+}
+EXPORT_SYMBOL(netdev_compute_master_upper_features);
+
+static struct hlist_head * __net_init netdev_create_hash(void)
+{
+ int i;
+ struct hlist_head *hash;
+
+ hash = kmalloc_array(NETDEV_HASHENTRIES, sizeof(*hash), GFP_KERNEL);
+ if (hash != NULL)
+ for (i = 0; i < NETDEV_HASHENTRIES; i++)
+ INIT_HLIST_HEAD(&hash[i]);
+
+ return hash;
+}
+
+/* Initialize per network namespace state */
+static int __net_init netdev_init(struct net *net)
+{
+ BUILD_BUG_ON(GRO_HASH_BUCKETS >
+ BITS_PER_BYTE * sizeof_field(struct gro_node, bitmask));
+
+ INIT_LIST_HEAD(&net->dev_base_head);
+
+ net->dev_name_head = netdev_create_hash();
+ if (net->dev_name_head == NULL)
+ goto err_name;
+
+ net->dev_index_head = netdev_create_hash();
+ if (net->dev_index_head == NULL)
+ goto err_idx;
+
+ xa_init_flags(&net->dev_by_index, XA_FLAGS_ALLOC1);
+
+ RAW_INIT_NOTIFIER_HEAD(&net->netdev_chain);
+
+ return 0;
+
+err_idx:
+ kfree(net->dev_name_head);
+err_name:
+ return -ENOMEM;
}
/**
- * netdev_dma_event - event callback for the net_dma_client
- * @client: should always be net_dma_client
- * @chan: DMA channel for the event
- * @event: event type
+ * netdev_drivername - network driver for the device
+ * @dev: network device
+ *
+ * Determine network driver for device.
*/
-static void netdev_dma_event(struct dma_client *client, struct dma_chan *chan,
- enum dma_event event)
+const char *netdev_drivername(const struct net_device *dev)
{
- spin_lock(&net_dma_event_lock);
- switch (event) {
- case DMA_RESOURCE_ADDED:
- net_dma_count++;
- net_dma_rebalance();
- break;
- case DMA_RESOURCE_REMOVED:
- net_dma_count--;
- net_dma_rebalance();
- break;
- default:
- break;
+ const struct device_driver *driver;
+ const struct device *parent;
+ const char *empty = "";
+
+ parent = dev->dev.parent;
+ if (!parent)
+ return empty;
+
+ driver = parent->driver;
+ if (driver && driver->name)
+ return driver->name;
+ return empty;
+}
+
+static void __netdev_printk(const char *level, const struct net_device *dev,
+ struct va_format *vaf)
+{
+ if (dev && dev->dev.parent) {
+ dev_printk_emit(level[1] - '0',
+ dev->dev.parent,
+ "%s %s %s%s: %pV",
+ dev_driver_string(dev->dev.parent),
+ dev_name(dev->dev.parent),
+ netdev_name(dev), netdev_reg_state(dev),
+ vaf);
+ } else if (dev) {
+ printk("%s%s%s: %pV",
+ level, netdev_name(dev), netdev_reg_state(dev), vaf);
+ } else {
+ printk("%s(NULL net_device): %pV", level, vaf);
}
- spin_unlock(&net_dma_event_lock);
}
-/**
- * netdev_dma_regiser - register the networking subsystem as a DMA client
- */
-static int __init netdev_dma_register(void)
+void netdev_printk(const char *level, const struct net_device *dev,
+ const char *format, ...)
{
- spin_lock_init(&net_dma_event_lock);
- net_dma_client = dma_async_client_register(netdev_dma_event);
- if (net_dma_client == NULL)
- return -ENOMEM;
+ struct va_format vaf;
+ va_list args;
- dma_async_client_chan_request(net_dma_client, num_online_cpus());
- return 0;
+ va_start(args, format);
+
+ vaf.fmt = format;
+ vaf.va = &args;
+
+ __netdev_printk(level, dev, &vaf);
+
+ va_end(args);
+}
+EXPORT_SYMBOL(netdev_printk);
+
+#define define_netdev_printk_level(func, level) \
+void func(const struct net_device *dev, const char *fmt, ...) \
+{ \
+ struct va_format vaf; \
+ va_list args; \
+ \
+ va_start(args, fmt); \
+ \
+ vaf.fmt = fmt; \
+ vaf.va = &args; \
+ \
+ __netdev_printk(level, dev, &vaf); \
+ \
+ va_end(args); \
+} \
+EXPORT_SYMBOL(func);
+
+define_netdev_printk_level(netdev_emerg, KERN_EMERG);
+define_netdev_printk_level(netdev_alert, KERN_ALERT);
+define_netdev_printk_level(netdev_crit, KERN_CRIT);
+define_netdev_printk_level(netdev_err, KERN_ERR);
+define_netdev_printk_level(netdev_warn, KERN_WARNING);
+define_netdev_printk_level(netdev_notice, KERN_NOTICE);
+define_netdev_printk_level(netdev_info, KERN_INFO);
+
+static void __net_exit netdev_exit(struct net *net)
+{
+ kfree(net->dev_name_head);
+ kfree(net->dev_index_head);
+ xa_destroy(&net->dev_by_index);
+ if (net != &init_net)
+ WARN_ON_ONCE(!list_empty(&net->dev_base_head));
}
-#else
-static int __init netdev_dma_register(void) { return -ENODEV; }
-#endif /* CONFIG_NET_DMA */
+static struct pernet_operations __net_initdata netdev_net_ops = {
+ .init = netdev_init,
+ .exit = netdev_exit,
+};
+
+static void __net_exit default_device_exit_net(struct net *net)
+{
+ struct netdev_name_node *name_node, *tmp;
+ struct net_device *dev, *aux;
+ /*
+ * Push all migratable network devices back to the
+ * initial network namespace
+ */
+ ASSERT_RTNL();
+ for_each_netdev_safe(net, dev, aux) {
+ int err;
+ char fb_name[IFNAMSIZ];
+
+ /* Ignore unmoveable devices (i.e. loopback) */
+ if (dev->netns_immutable)
+ continue;
+
+ /* Leave virtual devices for the generic cleanup */
+ if (dev->rtnl_link_ops && !dev->rtnl_link_ops->netns_refund)
+ continue;
+
+ /* Push remaining network devices to init_net */
+ snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
+ if (netdev_name_in_use(&init_net, fb_name))
+ snprintf(fb_name, IFNAMSIZ, "dev%%d");
+
+ netdev_for_each_altname_safe(dev, name_node, tmp)
+ if (netdev_name_in_use(&init_net, name_node->name))
+ __netdev_name_node_alt_destroy(name_node);
+
+ err = dev_change_net_namespace(dev, &init_net, fb_name);
+ if (err) {
+ pr_emerg("%s: failed to move %s to init_net: %d\n",
+ __func__, dev->name, err);
+ BUG();
+ }
+ }
+}
+
+static void __net_exit default_device_exit_batch(struct list_head *net_list)
+{
+ /* At exit all network devices most be removed from a network
+ * namespace. Do this in the reverse order of registration.
+ * Do this across as many network namespaces as possible to
+ * improve batching efficiency.
+ */
+ struct net_device *dev;
+ struct net *net;
+ LIST_HEAD(dev_kill_list);
+
+ rtnl_lock();
+ list_for_each_entry(net, net_list, exit_list) {
+ default_device_exit_net(net);
+ cond_resched();
+ }
+
+ list_for_each_entry(net, net_list, exit_list) {
+ for_each_netdev_reverse(net, dev) {
+ if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
+ dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
+ else
+ unregister_netdevice_queue(dev, &dev_kill_list);
+ }
+ }
+ unregister_netdevice_many(&dev_kill_list);
+ rtnl_unlock();
+}
+
+static struct pernet_operations __net_initdata default_device_ops = {
+ .exit_batch = default_device_exit_batch,
+};
+
+static void __init net_dev_struct_check(void)
+{
+ /* TX read-mostly hotpath */
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, priv_flags_fast);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, netdev_ops);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, header_ops);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, _tx);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, real_num_tx_queues);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_max_size);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_ipv4_max_size);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_max_segs);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_partial_features);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, num_tc);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, mtu);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, needed_headroom);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, tc_to_txq);
+#ifdef CONFIG_XPS
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, xps_maps);
+#endif
+#ifdef CONFIG_NETFILTER_EGRESS
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, nf_hooks_egress);
+#endif
+#ifdef CONFIG_NET_XGRESS
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, tcx_egress);
+#endif
+ CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_tx, 160);
+
+ /* TXRX read-mostly hotpath */
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, lstats);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, state);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, flags);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, hard_header_len);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, features);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, ip6_ptr);
+ CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_txrx, 46);
+
+ /* RX read-mostly hotpath */
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ptype_specific);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ifindex);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, real_num_rx_queues);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, _rx);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_max_size);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_ipv4_max_size);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler_data);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, nd_net);
+#ifdef CONFIG_NETPOLL
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, npinfo);
+#endif
+#ifdef CONFIG_NET_XGRESS
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, tcx_ingress);
+#endif
+ CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 92);
+}
/*
* Initialize the DEV module. At boot time this walks the device list and
@@ -3471,6 +13103,67 @@ static int __init netdev_dma_register(void) { return -ENODEV; }
*
*/
+/* We allocate 256 pages for each CPU if PAGE_SHIFT is 12 */
+#define SYSTEM_PERCPU_PAGE_POOL_SIZE ((1 << 20) / PAGE_SIZE)
+
+static int net_page_pool_create(int cpuid)
+{
+#if IS_ENABLED(CONFIG_PAGE_POOL)
+ struct page_pool_params page_pool_params = {
+ .pool_size = SYSTEM_PERCPU_PAGE_POOL_SIZE,
+ .flags = PP_FLAG_SYSTEM_POOL,
+ .nid = cpu_to_mem(cpuid),
+ };
+ struct page_pool *pp_ptr;
+ int err;
+
+ pp_ptr = page_pool_create_percpu(&page_pool_params, cpuid);
+ if (IS_ERR(pp_ptr))
+ return -ENOMEM;
+
+ err = xdp_reg_page_pool(pp_ptr);
+ if (err) {
+ page_pool_destroy(pp_ptr);
+ return err;
+ }
+
+ per_cpu(system_page_pool.pool, cpuid) = pp_ptr;
+#endif
+ return 0;
+}
+
+static int backlog_napi_should_run(unsigned int cpu)
+{
+ struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu);
+ struct napi_struct *napi = &sd->backlog;
+
+ return test_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
+}
+
+static void run_backlog_napi(unsigned int cpu)
+{
+ struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu);
+
+ napi_threaded_poll_loop(&sd->backlog, false);
+}
+
+static void backlog_napi_setup(unsigned int cpu)
+{
+ struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu);
+ struct napi_struct *napi = &sd->backlog;
+
+ napi->thread = this_cpu_read(backlog_napi);
+ set_bit(NAPI_STATE_THREADED, &napi->state);
+}
+
+static struct smp_hotplug_thread backlog_threads = {
+ .store = &backlog_napi,
+ .thread_should_run = backlog_napi_should_run,
+ .thread_fn = run_backlog_napi,
+ .thread_comm = "backlog_napi/%u",
+ .setup = backlog_napi_setup,
+};
+
/*
* This is called single threaded during boot, so no need
* to take the rtnl semaphore.
@@ -3481,99 +13174,104 @@ static int __init net_dev_init(void)
BUG_ON(!dev_boot_phase);
+ net_dev_struct_check();
+
if (dev_proc_init())
goto out;
- if (netdev_sysfs_init())
+ if (netdev_kobject_init())
goto out;
- INIT_LIST_HEAD(&ptype_all);
- for (i = 0; i < 16; i++)
+ for (i = 0; i < PTYPE_HASH_SIZE; i++)
INIT_LIST_HEAD(&ptype_base[i]);
- for (i = 0; i < ARRAY_SIZE(dev_name_head); i++)
- INIT_HLIST_HEAD(&dev_name_head[i]);
-
- for (i = 0; i < ARRAY_SIZE(dev_index_head); i++)
- INIT_HLIST_HEAD(&dev_index_head[i]);
+ if (register_pernet_subsys(&netdev_net_ops))
+ goto out;
/*
* Initialise the packet receive queues.
*/
+ flush_backlogs_fallback = flush_backlogs_alloc();
+ if (!flush_backlogs_fallback)
+ goto out;
+
for_each_possible_cpu(i) {
- struct softnet_data *queue;
+ struct softnet_data *sd = &per_cpu(softnet_data, i);
- queue = &per_cpu(softnet_data, i);
- skb_queue_head_init(&queue->input_pkt_queue);
- queue->completion_queue = NULL;
- INIT_LIST_HEAD(&queue->poll_list);
- set_bit(__LINK_STATE_START, &queue->backlog_dev.state);
- queue->backlog_dev.weight = weight_p;
- queue->backlog_dev.poll = process_backlog;
- atomic_set(&queue->backlog_dev.refcnt, 1);
- }
+ skb_queue_head_init(&sd->input_pkt_queue);
+ skb_queue_head_init(&sd->process_queue);
+#ifdef CONFIG_XFRM_OFFLOAD
+ skb_queue_head_init(&sd->xfrm_backlog);
+#endif
+ INIT_LIST_HEAD(&sd->poll_list);
+ sd->output_queue_tailp = &sd->output_queue;
+#ifdef CONFIG_RPS
+ INIT_CSD(&sd->csd, rps_trigger_softirq, sd);
+ sd->cpu = i;
+#endif
+ INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd);
+
+ gro_init(&sd->backlog.gro);
+ sd->backlog.poll = process_backlog;
+ sd->backlog.weight = weight_p;
+ INIT_LIST_HEAD(&sd->backlog.poll_list);
- netdev_dma_register();
+ if (net_page_pool_create(i))
+ goto out;
+ }
+ net_hotdata.skb_defer_nodes =
+ __alloc_percpu(sizeof(struct skb_defer_node) * nr_node_ids,
+ __alignof__(struct skb_defer_node));
+ if (!net_hotdata.skb_defer_nodes)
+ goto out;
+ if (use_backlog_threads())
+ smpboot_register_percpu_thread(&backlog_threads);
dev_boot_phase = 0;
- open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL);
- open_softirq(NET_RX_SOFTIRQ, net_rx_action, NULL);
+ /* The loopback device is special if any other network devices
+ * is present in a network namespace the loopback device must
+ * be present. Since we now dynamically allocate and free the
+ * loopback device ensure this invariant is maintained by
+ * keeping the loopback device as the first device on the
+ * list of network devices. Ensuring the loopback devices
+ * is the first device that appears and the last network device
+ * that disappears.
+ */
+ if (register_pernet_device(&loopback_net_ops))
+ goto out;
+
+ if (register_pernet_device(&default_device_ops))
+ goto out;
+
+ open_softirq(NET_TX_SOFTIRQ, net_tx_action);
+ open_softirq(NET_RX_SOFTIRQ, net_rx_action);
- hotcpu_notifier(dev_cpu_callback, 0);
- dst_init();
- dev_mcast_init();
+ rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
+ NULL, dev_cpu_dead);
+ WARN_ON(rc < 0);
rc = 0;
-out:
- return rc;
-}
-subsys_initcall(net_dev_init);
+ /* avoid static key IPIs to isolated CPUs */
+ if (housekeeping_enabled(HK_TYPE_MISC))
+ net_enable_timestamp();
+out:
+ if (rc < 0) {
+ for_each_possible_cpu(i) {
+ struct page_pool *pp_ptr;
-EXPORT_SYMBOL(__dev_get_by_index);
-EXPORT_SYMBOL(__dev_get_by_name);
-EXPORT_SYMBOL(__dev_remove_pack);
-EXPORT_SYMBOL(dev_valid_name);
-EXPORT_SYMBOL(dev_add_pack);
-EXPORT_SYMBOL(dev_alloc_name);
-EXPORT_SYMBOL(dev_close);
-EXPORT_SYMBOL(dev_get_by_flags);
-EXPORT_SYMBOL(dev_get_by_index);
-EXPORT_SYMBOL(dev_get_by_name);
-EXPORT_SYMBOL(dev_open);
-EXPORT_SYMBOL(dev_queue_xmit);
-EXPORT_SYMBOL(dev_remove_pack);
-EXPORT_SYMBOL(dev_set_allmulti);
-EXPORT_SYMBOL(dev_set_promiscuity);
-EXPORT_SYMBOL(dev_change_flags);
-EXPORT_SYMBOL(dev_set_mtu);
-EXPORT_SYMBOL(dev_set_mac_address);
-EXPORT_SYMBOL(free_netdev);
-EXPORT_SYMBOL(netdev_boot_setup_check);
-EXPORT_SYMBOL(netdev_set_master);
-EXPORT_SYMBOL(netdev_state_change);
-EXPORT_SYMBOL(netif_receive_skb);
-EXPORT_SYMBOL(netif_rx);
-EXPORT_SYMBOL(register_gifconf);
-EXPORT_SYMBOL(register_netdevice);
-EXPORT_SYMBOL(register_netdevice_notifier);
-EXPORT_SYMBOL(skb_checksum_help);
-EXPORT_SYMBOL(synchronize_net);
-EXPORT_SYMBOL(unregister_netdevice);
-EXPORT_SYMBOL(unregister_netdevice_notifier);
-EXPORT_SYMBOL(net_enable_timestamp);
-EXPORT_SYMBOL(net_disable_timestamp);
-EXPORT_SYMBOL(dev_get_flags);
+ pp_ptr = per_cpu(system_page_pool.pool, i);
+ if (!pp_ptr)
+ continue;
-#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
-EXPORT_SYMBOL(br_handle_frame_hook);
-EXPORT_SYMBOL(br_fdb_get_hook);
-EXPORT_SYMBOL(br_fdb_put_hook);
-#endif
+ xdp_unreg_page_pool(pp_ptr);
+ page_pool_destroy(pp_ptr);
+ per_cpu(system_page_pool.pool, i) = NULL;
+ }
+ }
-#ifdef CONFIG_KMOD
-EXPORT_SYMBOL(dev_load);
-#endif
+ return rc;
+}
-EXPORT_PER_CPU_SYMBOL(softnet_data);
+subsys_initcall(net_dev_init);
diff --git a/net/core/dev.h b/net/core/dev.h
new file mode 100644
index 000000000000..da18536cbd35
--- /dev/null
+++ b/net/core/dev.h
@@ -0,0 +1,406 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _NET_CORE_DEV_H
+#define _NET_CORE_DEV_H
+
+#include <linux/cleanup.h>
+#include <linux/types.h>
+#include <linux/rwsem.h>
+#include <linux/netdevice.h>
+#include <net/netdev_lock.h>
+
+struct net;
+struct netlink_ext_ack;
+struct cpumask;
+
+/* Random bits of netdevice that don't need to be exposed */
+#define FLOW_LIMIT_HISTORY (1 << 7) /* must be ^2 and !overflow buckets */
+struct sd_flow_limit {
+ struct rcu_head rcu;
+ unsigned int count;
+ u8 log_buckets;
+ unsigned int history_head;
+ u16 history[FLOW_LIMIT_HISTORY];
+ u8 buckets[];
+};
+
+extern int netdev_flow_limit_table_len;
+
+struct napi_struct *
+netdev_napi_by_id_lock(struct net *net, unsigned int napi_id);
+struct net_device *dev_get_by_napi_id(unsigned int napi_id);
+
+struct net_device *__netdev_put_lock(struct net_device *dev, struct net *net);
+struct net_device *
+netdev_xa_find_lock(struct net *net, struct net_device *dev,
+ unsigned long *index);
+
+DEFINE_FREE(netdev_unlock, struct net_device *, if (_T) netdev_unlock(_T));
+
+#define for_each_netdev_lock_scoped(net, var_name, ifindex) \
+ for (struct net_device *var_name __free(netdev_unlock) = NULL; \
+ (var_name = netdev_xa_find_lock(net, var_name, &ifindex)); \
+ ifindex++)
+
+struct net_device *
+netdev_get_by_index_lock_ops_compat(struct net *net, int ifindex);
+struct net_device *
+netdev_xa_find_lock_ops_compat(struct net *net, struct net_device *dev,
+ unsigned long *index);
+
+DEFINE_FREE(netdev_unlock_ops_compat, struct net_device *,
+ if (_T) netdev_unlock_ops_compat(_T));
+
+#define for_each_netdev_lock_ops_compat_scoped(net, var_name, ifindex) \
+ for (struct net_device *var_name __free(netdev_unlock_ops_compat) = NULL; \
+ (var_name = netdev_xa_find_lock_ops_compat(net, var_name, \
+ &ifindex)); \
+ ifindex++)
+
+#ifdef CONFIG_PROC_FS
+int __init dev_proc_init(void);
+#else
+#define dev_proc_init() 0
+#endif
+
+void linkwatch_init_dev(struct net_device *dev);
+void linkwatch_run_queue(void);
+
+void dev_addr_flush(struct net_device *dev);
+int dev_addr_init(struct net_device *dev);
+void dev_addr_check(struct net_device *dev);
+
+#if IS_ENABLED(CONFIG_NET_SHAPER)
+void net_shaper_flush_netdev(struct net_device *dev);
+void net_shaper_set_real_num_tx_queues(struct net_device *dev,
+ unsigned int txq);
+#else
+static inline void net_shaper_flush_netdev(struct net_device *dev) {}
+static inline void net_shaper_set_real_num_tx_queues(struct net_device *dev,
+ unsigned int txq) {}
+#endif
+
+/* sysctls not referred to from outside net/core/ */
+extern int netdev_unregister_timeout_secs;
+extern int weight_p;
+extern int dev_weight_rx_bias;
+extern int dev_weight_tx_bias;
+
+extern struct rw_semaphore dev_addr_sem;
+
+/* rtnl helpers */
+extern struct list_head net_todo_list;
+void netdev_run_todo(void);
+
+/* netdev management, shared between various uAPI entry points */
+struct netdev_name_node {
+ struct hlist_node hlist;
+ struct list_head list;
+ struct net_device *dev;
+ const char *name;
+ struct rcu_head rcu;
+};
+
+int netdev_get_name(struct net *net, char *name, int ifindex);
+int netif_change_name(struct net_device *dev, const char *newname);
+int dev_change_name(struct net_device *dev, const char *newname);
+
+#define netdev_for_each_altname(dev, namenode) \
+ list_for_each_entry((namenode), &(dev)->name_node->list, list)
+#define netdev_for_each_altname_safe(dev, namenode, next) \
+ list_for_each_entry_safe((namenode), (next), &(dev)->name_node->list, \
+ list)
+
+int netdev_name_node_alt_create(struct net_device *dev, const char *name);
+int netdev_name_node_alt_destroy(struct net_device *dev, const char *name);
+
+int dev_validate_mtu(struct net_device *dev, int mtu,
+ struct netlink_ext_ack *extack);
+int netif_set_mtu_ext(struct net_device *dev, int new_mtu,
+ struct netlink_ext_ack *extack);
+
+int dev_get_phys_port_id(struct net_device *dev,
+ struct netdev_phys_item_id *ppid);
+int dev_get_phys_port_name(struct net_device *dev,
+ char *name, size_t len);
+
+int netif_change_proto_down(struct net_device *dev, bool proto_down);
+int dev_change_proto_down(struct net_device *dev, bool proto_down);
+void netdev_change_proto_down_reason_locked(struct net_device *dev,
+ unsigned long mask, u32 value);
+
+typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf);
+int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
+ int fd, int expected_fd, u32 flags);
+
+int netif_change_tx_queue_len(struct net_device *dev, unsigned long new_len);
+int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len);
+void netif_set_group(struct net_device *dev, int new_group);
+void dev_set_group(struct net_device *dev, int new_group);
+int netif_change_carrier(struct net_device *dev, bool new_carrier);
+int dev_change_carrier(struct net_device *dev, bool new_carrier);
+
+void __dev_set_rx_mode(struct net_device *dev);
+
+void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
+ unsigned int gchanges, u32 portid,
+ const struct nlmsghdr *nlh);
+
+void unregister_netdevice_many_notify(struct list_head *head,
+ u32 portid, const struct nlmsghdr *nlh);
+
+static inline void netif_set_up(struct net_device *dev, bool value)
+{
+ if (value)
+ dev->flags |= IFF_UP;
+ else
+ dev->flags &= ~IFF_UP;
+
+ if (!netdev_need_ops_lock(dev))
+ netdev_lock(dev);
+ dev->up = value;
+ if (!netdev_need_ops_lock(dev))
+ netdev_unlock(dev);
+}
+
+static inline void netif_set_gso_max_size(struct net_device *dev,
+ unsigned int size)
+{
+ /* dev->gso_max_size is read locklessly from sk_setup_caps() */
+ WRITE_ONCE(dev->gso_max_size, size);
+ if (size <= GSO_LEGACY_MAX_SIZE)
+ WRITE_ONCE(dev->gso_ipv4_max_size, size);
+}
+
+static inline void netif_set_gso_max_segs(struct net_device *dev,
+ unsigned int segs)
+{
+ /* dev->gso_max_segs is read locklessly from sk_setup_caps() */
+ WRITE_ONCE(dev->gso_max_segs, segs);
+}
+
+static inline void netif_set_gro_max_size(struct net_device *dev,
+ unsigned int size)
+{
+ /* This pairs with the READ_ONCE() in skb_gro_receive() */
+ WRITE_ONCE(dev->gro_max_size, size);
+ if (size <= GRO_LEGACY_MAX_SIZE)
+ WRITE_ONCE(dev->gro_ipv4_max_size, size);
+}
+
+static inline void netif_set_gso_ipv4_max_size(struct net_device *dev,
+ unsigned int size)
+{
+ /* dev->gso_ipv4_max_size is read locklessly from sk_setup_caps() */
+ WRITE_ONCE(dev->gso_ipv4_max_size, size);
+}
+
+static inline void netif_set_gro_ipv4_max_size(struct net_device *dev,
+ unsigned int size)
+{
+ /* This pairs with the READ_ONCE() in skb_gro_receive() */
+ WRITE_ONCE(dev->gro_ipv4_max_size, size);
+}
+
+/**
+ * napi_get_defer_hard_irqs - get the NAPI's defer_hard_irqs
+ * @n: napi struct to get the defer_hard_irqs field from
+ *
+ * Return: the per-NAPI value of the defar_hard_irqs field.
+ */
+static inline u32 napi_get_defer_hard_irqs(const struct napi_struct *n)
+{
+ return READ_ONCE(n->defer_hard_irqs);
+}
+
+/**
+ * napi_set_defer_hard_irqs - set the defer_hard_irqs for a napi
+ * @n: napi_struct to set the defer_hard_irqs field
+ * @defer: the value the field should be set to
+ */
+static inline void napi_set_defer_hard_irqs(struct napi_struct *n, u32 defer)
+{
+ WRITE_ONCE(n->defer_hard_irqs, defer);
+}
+
+/**
+ * netdev_set_defer_hard_irqs - set defer_hard_irqs for all NAPIs of a netdev
+ * @netdev: the net_device for which all NAPIs will have defer_hard_irqs set
+ * @defer: the defer_hard_irqs value to set
+ */
+static inline void netdev_set_defer_hard_irqs(struct net_device *netdev,
+ u32 defer)
+{
+ unsigned int count = max(netdev->num_rx_queues,
+ netdev->num_tx_queues);
+ struct napi_struct *napi;
+ int i;
+
+ WRITE_ONCE(netdev->napi_defer_hard_irqs, defer);
+ list_for_each_entry(napi, &netdev->napi_list, dev_list)
+ napi_set_defer_hard_irqs(napi, defer);
+
+ for (i = 0; i < count; i++)
+ netdev->napi_config[i].defer_hard_irqs = defer;
+}
+
+/**
+ * napi_get_gro_flush_timeout - get the gro_flush_timeout
+ * @n: napi struct to get the gro_flush_timeout from
+ *
+ * Return: the per-NAPI value of the gro_flush_timeout field.
+ */
+static inline unsigned long
+napi_get_gro_flush_timeout(const struct napi_struct *n)
+{
+ return READ_ONCE(n->gro_flush_timeout);
+}
+
+/**
+ * napi_set_gro_flush_timeout - set the gro_flush_timeout for a napi
+ * @n: napi struct to set the gro_flush_timeout
+ * @timeout: timeout value to set
+ *
+ * napi_set_gro_flush_timeout sets the per-NAPI gro_flush_timeout
+ */
+static inline void napi_set_gro_flush_timeout(struct napi_struct *n,
+ unsigned long timeout)
+{
+ WRITE_ONCE(n->gro_flush_timeout, timeout);
+}
+
+/**
+ * netdev_set_gro_flush_timeout - set gro_flush_timeout of a netdev's NAPIs
+ * @netdev: the net_device for which all NAPIs will have gro_flush_timeout set
+ * @timeout: the timeout value to set
+ */
+static inline void netdev_set_gro_flush_timeout(struct net_device *netdev,
+ unsigned long timeout)
+{
+ unsigned int count = max(netdev->num_rx_queues,
+ netdev->num_tx_queues);
+ struct napi_struct *napi;
+ int i;
+
+ WRITE_ONCE(netdev->gro_flush_timeout, timeout);
+ list_for_each_entry(napi, &netdev->napi_list, dev_list)
+ napi_set_gro_flush_timeout(napi, timeout);
+
+ for (i = 0; i < count; i++)
+ netdev->napi_config[i].gro_flush_timeout = timeout;
+}
+
+/**
+ * napi_get_irq_suspend_timeout - get the irq_suspend_timeout
+ * @n: napi struct to get the irq_suspend_timeout from
+ *
+ * Return: the per-NAPI value of the irq_suspend_timeout field.
+ */
+static inline unsigned long
+napi_get_irq_suspend_timeout(const struct napi_struct *n)
+{
+ return READ_ONCE(n->irq_suspend_timeout);
+}
+
+/**
+ * napi_set_irq_suspend_timeout - set the irq_suspend_timeout for a napi
+ * @n: napi struct to set the irq_suspend_timeout
+ * @timeout: timeout value to set
+ *
+ * napi_set_irq_suspend_timeout sets the per-NAPI irq_suspend_timeout
+ */
+static inline void napi_set_irq_suspend_timeout(struct napi_struct *n,
+ unsigned long timeout)
+{
+ WRITE_ONCE(n->irq_suspend_timeout, timeout);
+}
+
+static inline enum netdev_napi_threaded napi_get_threaded(struct napi_struct *n)
+{
+ if (test_bit(NAPI_STATE_THREADED_BUSY_POLL, &n->state))
+ return NETDEV_NAPI_THREADED_BUSY_POLL;
+
+ if (test_bit(NAPI_STATE_THREADED, &n->state))
+ return NETDEV_NAPI_THREADED_ENABLED;
+
+ return NETDEV_NAPI_THREADED_DISABLED;
+}
+
+static inline enum netdev_napi_threaded
+napi_get_threaded_config(struct net_device *dev, struct napi_struct *n)
+{
+ if (n->config)
+ return n->config->threaded;
+ return dev->threaded;
+}
+
+int napi_set_threaded(struct napi_struct *n,
+ enum netdev_napi_threaded threaded);
+
+int netif_set_threaded(struct net_device *dev,
+ enum netdev_napi_threaded threaded);
+
+int rps_cpumask_housekeeping(struct cpumask *mask);
+
+#if defined(CONFIG_DEBUG_NET) && defined(CONFIG_BPF_SYSCALL)
+void xdp_do_check_flushed(struct napi_struct *napi);
+#else
+static inline void xdp_do_check_flushed(struct napi_struct *napi) { }
+#endif
+
+/* Best effort check that NAPI is not idle (can't be scheduled to run) */
+static inline void napi_assert_will_not_race(const struct napi_struct *napi)
+{
+ /* uninitialized instance, can't race */
+ if (!napi->poll_list.next)
+ return;
+
+ /* SCHED bit is set on disabled instances */
+ WARN_ON(!test_bit(NAPI_STATE_SCHED, &napi->state));
+ WARN_ON(READ_ONCE(napi->list_owner) != -1);
+}
+
+void kick_defer_list_purge(unsigned int cpu);
+
+#define XMIT_RECURSION_LIMIT 8
+
+#ifndef CONFIG_PREEMPT_RT
+static inline bool dev_xmit_recursion(void)
+{
+ return unlikely(__this_cpu_read(softnet_data.xmit.recursion) >
+ XMIT_RECURSION_LIMIT);
+}
+
+static inline void dev_xmit_recursion_inc(void)
+{
+ __this_cpu_inc(softnet_data.xmit.recursion);
+}
+
+static inline void dev_xmit_recursion_dec(void)
+{
+ __this_cpu_dec(softnet_data.xmit.recursion);
+}
+#else
+static inline bool dev_xmit_recursion(void)
+{
+ return unlikely(current->net_xmit.recursion > XMIT_RECURSION_LIMIT);
+}
+
+static inline void dev_xmit_recursion_inc(void)
+{
+ current->net_xmit.recursion++;
+}
+
+static inline void dev_xmit_recursion_dec(void)
+{
+ current->net_xmit.recursion--;
+}
+#endif
+
+int dev_set_hwtstamp_phylib(struct net_device *dev,
+ struct kernel_hwtstamp_config *cfg,
+ struct netlink_ext_ack *extack);
+int dev_get_hwtstamp_phylib(struct net_device *dev,
+ struct kernel_hwtstamp_config *cfg);
+int net_hwtstamp_validate(const struct kernel_hwtstamp_config *cfg);
+
+#endif
diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c
new file mode 100644
index 000000000000..76c91f224886
--- /dev/null
+++ b/net/core/dev_addr_lists.c
@@ -0,0 +1,1051 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * net/core/dev_addr_lists.c - Functions for handling net device lists
+ * Copyright (c) 2010 Jiri Pirko <jpirko@redhat.com>
+ *
+ * This file contains functions for working with unicast, multicast and device
+ * addresses lists.
+ */
+
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/export.h>
+#include <linux/list.h>
+
+#include "dev.h"
+
+/*
+ * General list handling functions
+ */
+
+static int __hw_addr_insert(struct netdev_hw_addr_list *list,
+ struct netdev_hw_addr *new, int addr_len)
+{
+ struct rb_node **ins_point = &list->tree.rb_node, *parent = NULL;
+ struct netdev_hw_addr *ha;
+
+ while (*ins_point) {
+ int diff;
+
+ ha = rb_entry(*ins_point, struct netdev_hw_addr, node);
+ diff = memcmp(new->addr, ha->addr, addr_len);
+ if (diff == 0)
+ diff = memcmp(&new->type, &ha->type, sizeof(new->type));
+
+ parent = *ins_point;
+ if (diff < 0)
+ ins_point = &parent->rb_left;
+ else if (diff > 0)
+ ins_point = &parent->rb_right;
+ else
+ return -EEXIST;
+ }
+
+ rb_link_node_rcu(&new->node, parent, ins_point);
+ rb_insert_color(&new->node, &list->tree);
+
+ return 0;
+}
+
+static struct netdev_hw_addr*
+__hw_addr_create(const unsigned char *addr, int addr_len,
+ unsigned char addr_type, bool global, bool sync)
+{
+ struct netdev_hw_addr *ha;
+ int alloc_size;
+
+ alloc_size = sizeof(*ha);
+ if (alloc_size < L1_CACHE_BYTES)
+ alloc_size = L1_CACHE_BYTES;
+ ha = kmalloc(alloc_size, GFP_ATOMIC);
+ if (!ha)
+ return NULL;
+ memcpy(ha->addr, addr, addr_len);
+ ha->type = addr_type;
+ ha->refcount = 1;
+ ha->global_use = global;
+ ha->synced = sync ? 1 : 0;
+ ha->sync_cnt = 0;
+
+ return ha;
+}
+
+static int __hw_addr_add_ex(struct netdev_hw_addr_list *list,
+ const unsigned char *addr, int addr_len,
+ unsigned char addr_type, bool global, bool sync,
+ int sync_count, bool exclusive)
+{
+ struct rb_node **ins_point = &list->tree.rb_node, *parent = NULL;
+ struct netdev_hw_addr *ha;
+
+ if (addr_len > MAX_ADDR_LEN)
+ return -EINVAL;
+
+ while (*ins_point) {
+ int diff;
+
+ ha = rb_entry(*ins_point, struct netdev_hw_addr, node);
+ diff = memcmp(addr, ha->addr, addr_len);
+ if (diff == 0)
+ diff = memcmp(&addr_type, &ha->type, sizeof(addr_type));
+
+ parent = *ins_point;
+ if (diff < 0) {
+ ins_point = &parent->rb_left;
+ } else if (diff > 0) {
+ ins_point = &parent->rb_right;
+ } else {
+ if (exclusive)
+ return -EEXIST;
+ if (global) {
+ /* check if addr is already used as global */
+ if (ha->global_use)
+ return 0;
+ else
+ ha->global_use = true;
+ }
+ if (sync) {
+ if (ha->synced && sync_count)
+ return -EEXIST;
+ else
+ ha->synced++;
+ }
+ ha->refcount++;
+ return 0;
+ }
+ }
+
+ ha = __hw_addr_create(addr, addr_len, addr_type, global, sync);
+ if (!ha)
+ return -ENOMEM;
+
+ rb_link_node(&ha->node, parent, ins_point);
+ rb_insert_color(&ha->node, &list->tree);
+
+ list_add_tail_rcu(&ha->list, &list->list);
+ list->count++;
+
+ return 0;
+}
+
+static int __hw_addr_add(struct netdev_hw_addr_list *list,
+ const unsigned char *addr, int addr_len,
+ unsigned char addr_type)
+{
+ return __hw_addr_add_ex(list, addr, addr_len, addr_type, false, false,
+ 0, false);
+}
+
+static int __hw_addr_del_entry(struct netdev_hw_addr_list *list,
+ struct netdev_hw_addr *ha, bool global,
+ bool sync)
+{
+ if (global && !ha->global_use)
+ return -ENOENT;
+
+ if (sync && !ha->synced)
+ return -ENOENT;
+
+ if (global)
+ ha->global_use = false;
+
+ if (sync)
+ ha->synced--;
+
+ if (--ha->refcount)
+ return 0;
+
+ rb_erase(&ha->node, &list->tree);
+
+ list_del_rcu(&ha->list);
+ kfree_rcu(ha, rcu_head);
+ list->count--;
+ return 0;
+}
+
+static struct netdev_hw_addr *__hw_addr_lookup(struct netdev_hw_addr_list *list,
+ const unsigned char *addr, int addr_len,
+ unsigned char addr_type)
+{
+ struct rb_node *node;
+
+ node = list->tree.rb_node;
+
+ while (node) {
+ struct netdev_hw_addr *ha = rb_entry(node, struct netdev_hw_addr, node);
+ int diff = memcmp(addr, ha->addr, addr_len);
+
+ if (diff == 0 && addr_type)
+ diff = memcmp(&addr_type, &ha->type, sizeof(addr_type));
+
+ if (diff < 0)
+ node = node->rb_left;
+ else if (diff > 0)
+ node = node->rb_right;
+ else
+ return ha;
+ }
+
+ return NULL;
+}
+
+static int __hw_addr_del_ex(struct netdev_hw_addr_list *list,
+ const unsigned char *addr, int addr_len,
+ unsigned char addr_type, bool global, bool sync)
+{
+ struct netdev_hw_addr *ha = __hw_addr_lookup(list, addr, addr_len, addr_type);
+
+ if (!ha)
+ return -ENOENT;
+ return __hw_addr_del_entry(list, ha, global, sync);
+}
+
+static int __hw_addr_del(struct netdev_hw_addr_list *list,
+ const unsigned char *addr, int addr_len,
+ unsigned char addr_type)
+{
+ return __hw_addr_del_ex(list, addr, addr_len, addr_type, false, false);
+}
+
+static int __hw_addr_sync_one(struct netdev_hw_addr_list *to_list,
+ struct netdev_hw_addr *ha,
+ int addr_len)
+{
+ int err;
+
+ err = __hw_addr_add_ex(to_list, ha->addr, addr_len, ha->type,
+ false, true, ha->sync_cnt, false);
+ if (err && err != -EEXIST)
+ return err;
+
+ if (!err) {
+ ha->sync_cnt++;
+ ha->refcount++;
+ }
+
+ return 0;
+}
+
+static void __hw_addr_unsync_one(struct netdev_hw_addr_list *to_list,
+ struct netdev_hw_addr_list *from_list,
+ struct netdev_hw_addr *ha,
+ int addr_len)
+{
+ int err;
+
+ err = __hw_addr_del_ex(to_list, ha->addr, addr_len, ha->type,
+ false, true);
+ if (err)
+ return;
+ ha->sync_cnt--;
+ /* address on from list is not marked synced */
+ __hw_addr_del_entry(from_list, ha, false, false);
+}
+
+int __hw_addr_sync_multiple(struct netdev_hw_addr_list *to_list,
+ struct netdev_hw_addr_list *from_list,
+ int addr_len)
+{
+ int err = 0;
+ struct netdev_hw_addr *ha, *tmp;
+
+ list_for_each_entry_safe(ha, tmp, &from_list->list, list) {
+ if (ha->sync_cnt == ha->refcount) {
+ __hw_addr_unsync_one(to_list, from_list, ha, addr_len);
+ } else {
+ err = __hw_addr_sync_one(to_list, ha, addr_len);
+ if (err)
+ break;
+ }
+ }
+ return err;
+}
+EXPORT_SYMBOL(__hw_addr_sync_multiple);
+
+/* This function only works where there is a strict 1-1 relationship
+ * between source and destination of they synch. If you ever need to
+ * sync addresses to more then 1 destination, you need to use
+ * __hw_addr_sync_multiple().
+ */
+int __hw_addr_sync(struct netdev_hw_addr_list *to_list,
+ struct netdev_hw_addr_list *from_list,
+ int addr_len)
+{
+ int err = 0;
+ struct netdev_hw_addr *ha, *tmp;
+
+ list_for_each_entry_safe(ha, tmp, &from_list->list, list) {
+ if (!ha->sync_cnt) {
+ err = __hw_addr_sync_one(to_list, ha, addr_len);
+ if (err)
+ break;
+ } else if (ha->refcount == 1)
+ __hw_addr_unsync_one(to_list, from_list, ha, addr_len);
+ }
+ return err;
+}
+EXPORT_SYMBOL(__hw_addr_sync);
+
+void __hw_addr_unsync(struct netdev_hw_addr_list *to_list,
+ struct netdev_hw_addr_list *from_list,
+ int addr_len)
+{
+ struct netdev_hw_addr *ha, *tmp;
+
+ list_for_each_entry_safe(ha, tmp, &from_list->list, list) {
+ if (ha->sync_cnt)
+ __hw_addr_unsync_one(to_list, from_list, ha, addr_len);
+ }
+}
+EXPORT_SYMBOL(__hw_addr_unsync);
+
+/**
+ * __hw_addr_sync_dev - Synchronize device's multicast list
+ * @list: address list to synchronize
+ * @dev: device to sync
+ * @sync: function to call if address should be added
+ * @unsync: function to call if address should be removed
+ *
+ * This function is intended to be called from the ndo_set_rx_mode
+ * function of devices that require explicit address add/remove
+ * notifications. The unsync function may be NULL in which case
+ * the addresses requiring removal will simply be removed without
+ * any notification to the device.
+ **/
+int __hw_addr_sync_dev(struct netdev_hw_addr_list *list,
+ struct net_device *dev,
+ int (*sync)(struct net_device *, const unsigned char *),
+ int (*unsync)(struct net_device *,
+ const unsigned char *))
+{
+ struct netdev_hw_addr *ha, *tmp;
+ int err;
+
+ /* first go through and flush out any stale entries */
+ list_for_each_entry_safe(ha, tmp, &list->list, list) {
+ if (!ha->sync_cnt || ha->refcount != 1)
+ continue;
+
+ /* if unsync is defined and fails defer unsyncing address */
+ if (unsync && unsync(dev, ha->addr))
+ continue;
+
+ ha->sync_cnt--;
+ __hw_addr_del_entry(list, ha, false, false);
+ }
+
+ /* go through and sync new entries to the list */
+ list_for_each_entry_safe(ha, tmp, &list->list, list) {
+ if (ha->sync_cnt)
+ continue;
+
+ err = sync(dev, ha->addr);
+ if (err)
+ return err;
+
+ ha->sync_cnt++;
+ ha->refcount++;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(__hw_addr_sync_dev);
+
+/**
+ * __hw_addr_ref_sync_dev - Synchronize device's multicast address list taking
+ * into account references
+ * @list: address list to synchronize
+ * @dev: device to sync
+ * @sync: function to call if address or reference on it should be added
+ * @unsync: function to call if address or some reference on it should removed
+ *
+ * This function is intended to be called from the ndo_set_rx_mode
+ * function of devices that require explicit address or references on it
+ * add/remove notifications. The unsync function may be NULL in which case
+ * the addresses or references on it requiring removal will simply be
+ * removed without any notification to the device. That is responsibility of
+ * the driver to identify and distribute address or references on it between
+ * internal address tables.
+ **/
+int __hw_addr_ref_sync_dev(struct netdev_hw_addr_list *list,
+ struct net_device *dev,
+ int (*sync)(struct net_device *,
+ const unsigned char *, int),
+ int (*unsync)(struct net_device *,
+ const unsigned char *, int))
+{
+ struct netdev_hw_addr *ha, *tmp;
+ int err, ref_cnt;
+
+ /* first go through and flush out any unsynced/stale entries */
+ list_for_each_entry_safe(ha, tmp, &list->list, list) {
+ /* sync if address is not used */
+ if ((ha->sync_cnt << 1) <= ha->refcount)
+ continue;
+
+ /* if fails defer unsyncing address */
+ ref_cnt = ha->refcount - ha->sync_cnt;
+ if (unsync && unsync(dev, ha->addr, ref_cnt))
+ continue;
+
+ ha->refcount = (ref_cnt << 1) + 1;
+ ha->sync_cnt = ref_cnt;
+ __hw_addr_del_entry(list, ha, false, false);
+ }
+
+ /* go through and sync updated/new entries to the list */
+ list_for_each_entry_safe(ha, tmp, &list->list, list) {
+ /* sync if address added or reused */
+ if ((ha->sync_cnt << 1) >= ha->refcount)
+ continue;
+
+ ref_cnt = ha->refcount - ha->sync_cnt;
+ err = sync(dev, ha->addr, ref_cnt);
+ if (err)
+ return err;
+
+ ha->refcount = ref_cnt << 1;
+ ha->sync_cnt = ref_cnt;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(__hw_addr_ref_sync_dev);
+
+/**
+ * __hw_addr_ref_unsync_dev - Remove synchronized addresses and references on
+ * it from device
+ * @list: address list to remove synchronized addresses (references on it) from
+ * @dev: device to sync
+ * @unsync: function to call if address and references on it should be removed
+ *
+ * Remove all addresses that were added to the device by
+ * __hw_addr_ref_sync_dev(). This function is intended to be called from the
+ * ndo_stop or ndo_open functions on devices that require explicit address (or
+ * references on it) add/remove notifications. If the unsync function pointer
+ * is NULL then this function can be used to just reset the sync_cnt for the
+ * addresses in the list.
+ **/
+void __hw_addr_ref_unsync_dev(struct netdev_hw_addr_list *list,
+ struct net_device *dev,
+ int (*unsync)(struct net_device *,
+ const unsigned char *, int))
+{
+ struct netdev_hw_addr *ha, *tmp;
+
+ list_for_each_entry_safe(ha, tmp, &list->list, list) {
+ if (!ha->sync_cnt)
+ continue;
+
+ /* if fails defer unsyncing address */
+ if (unsync && unsync(dev, ha->addr, ha->sync_cnt))
+ continue;
+
+ ha->refcount -= ha->sync_cnt - 1;
+ ha->sync_cnt = 0;
+ __hw_addr_del_entry(list, ha, false, false);
+ }
+}
+EXPORT_SYMBOL(__hw_addr_ref_unsync_dev);
+
+/**
+ * __hw_addr_unsync_dev - Remove synchronized addresses from device
+ * @list: address list to remove synchronized addresses from
+ * @dev: device to sync
+ * @unsync: function to call if address should be removed
+ *
+ * Remove all addresses that were added to the device by __hw_addr_sync_dev().
+ * This function is intended to be called from the ndo_stop or ndo_open
+ * functions on devices that require explicit address add/remove
+ * notifications. If the unsync function pointer is NULL then this function
+ * can be used to just reset the sync_cnt for the addresses in the list.
+ **/
+void __hw_addr_unsync_dev(struct netdev_hw_addr_list *list,
+ struct net_device *dev,
+ int (*unsync)(struct net_device *,
+ const unsigned char *))
+{
+ struct netdev_hw_addr *ha, *tmp;
+
+ list_for_each_entry_safe(ha, tmp, &list->list, list) {
+ if (!ha->sync_cnt)
+ continue;
+
+ /* if unsync is defined and fails defer unsyncing address */
+ if (unsync && unsync(dev, ha->addr))
+ continue;
+
+ ha->sync_cnt--;
+ __hw_addr_del_entry(list, ha, false, false);
+ }
+}
+EXPORT_SYMBOL(__hw_addr_unsync_dev);
+
+static void __hw_addr_flush(struct netdev_hw_addr_list *list)
+{
+ struct netdev_hw_addr *ha, *tmp;
+
+ list->tree = RB_ROOT;
+ list_for_each_entry_safe(ha, tmp, &list->list, list) {
+ list_del_rcu(&ha->list);
+ kfree_rcu(ha, rcu_head);
+ }
+ list->count = 0;
+}
+
+void __hw_addr_init(struct netdev_hw_addr_list *list)
+{
+ INIT_LIST_HEAD(&list->list);
+ list->count = 0;
+ list->tree = RB_ROOT;
+}
+EXPORT_SYMBOL(__hw_addr_init);
+
+/*
+ * Device addresses handling functions
+ */
+
+/* Check that netdev->dev_addr is not written to directly as this would
+ * break the rbtree layout. All changes should go thru dev_addr_set() and co.
+ * Remove this check in mid-2024.
+ */
+void dev_addr_check(struct net_device *dev)
+{
+ if (!memcmp(dev->dev_addr, dev->dev_addr_shadow, MAX_ADDR_LEN))
+ return;
+
+ netdev_warn(dev, "Current addr: %*ph\n", MAX_ADDR_LEN, dev->dev_addr);
+ netdev_warn(dev, "Expected addr: %*ph\n",
+ MAX_ADDR_LEN, dev->dev_addr_shadow);
+ netdev_WARN(dev, "Incorrect netdev->dev_addr\n");
+}
+
+/**
+ * dev_addr_flush - Flush device address list
+ * @dev: device
+ *
+ * Flush device address list and reset ->dev_addr.
+ *
+ * The caller must hold the rtnl_mutex.
+ */
+void dev_addr_flush(struct net_device *dev)
+{
+ /* rtnl_mutex must be held here */
+ dev_addr_check(dev);
+
+ __hw_addr_flush(&dev->dev_addrs);
+ dev->dev_addr = NULL;
+}
+
+/**
+ * dev_addr_init - Init device address list
+ * @dev: device
+ *
+ * Init device address list and create the first element,
+ * used by ->dev_addr.
+ *
+ * The caller must hold the rtnl_mutex.
+ */
+int dev_addr_init(struct net_device *dev)
+{
+ unsigned char addr[MAX_ADDR_LEN];
+ struct netdev_hw_addr *ha;
+ int err;
+
+ /* rtnl_mutex must be held here */
+
+ __hw_addr_init(&dev->dev_addrs);
+ memset(addr, 0, sizeof(addr));
+ err = __hw_addr_add(&dev->dev_addrs, addr, sizeof(addr),
+ NETDEV_HW_ADDR_T_LAN);
+ if (!err) {
+ /*
+ * Get the first (previously created) address from the list
+ * and set dev_addr pointer to this location.
+ */
+ ha = list_first_entry(&dev->dev_addrs.list,
+ struct netdev_hw_addr, list);
+ dev->dev_addr = ha->addr;
+ }
+ return err;
+}
+
+void dev_addr_mod(struct net_device *dev, unsigned int offset,
+ const void *addr, size_t len)
+{
+ struct netdev_hw_addr *ha;
+
+ dev_addr_check(dev);
+
+ ha = container_of(dev->dev_addr, struct netdev_hw_addr, addr[0]);
+ rb_erase(&ha->node, &dev->dev_addrs.tree);
+ memcpy(&ha->addr[offset], addr, len);
+ memcpy(&dev->dev_addr_shadow[offset], addr, len);
+ WARN_ON(__hw_addr_insert(&dev->dev_addrs, ha, dev->addr_len));
+}
+EXPORT_SYMBOL(dev_addr_mod);
+
+/**
+ * dev_addr_add - Add a device address
+ * @dev: device
+ * @addr: address to add
+ * @addr_type: address type
+ *
+ * Add a device address to the device or increase the reference count if
+ * it already exists.
+ *
+ * The caller must hold the rtnl_mutex.
+ */
+int dev_addr_add(struct net_device *dev, const unsigned char *addr,
+ unsigned char addr_type)
+{
+ int err;
+
+ ASSERT_RTNL();
+
+ err = netif_pre_changeaddr_notify(dev, addr, NULL);
+ if (err)
+ return err;
+ err = __hw_addr_add(&dev->dev_addrs, addr, dev->addr_len, addr_type);
+ if (!err)
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+ return err;
+}
+EXPORT_SYMBOL(dev_addr_add);
+
+/**
+ * dev_addr_del - Release a device address.
+ * @dev: device
+ * @addr: address to delete
+ * @addr_type: address type
+ *
+ * Release reference to a device address and remove it from the device
+ * if the reference count drops to zero.
+ *
+ * The caller must hold the rtnl_mutex.
+ */
+int dev_addr_del(struct net_device *dev, const unsigned char *addr,
+ unsigned char addr_type)
+{
+ int err;
+ struct netdev_hw_addr *ha;
+
+ ASSERT_RTNL();
+
+ /*
+ * We can not remove the first address from the list because
+ * dev->dev_addr points to that.
+ */
+ ha = list_first_entry(&dev->dev_addrs.list,
+ struct netdev_hw_addr, list);
+ if (!memcmp(ha->addr, addr, dev->addr_len) &&
+ ha->type == addr_type && ha->refcount == 1)
+ return -ENOENT;
+
+ err = __hw_addr_del(&dev->dev_addrs, addr, dev->addr_len,
+ addr_type);
+ if (!err)
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+ return err;
+}
+EXPORT_SYMBOL(dev_addr_del);
+
+/*
+ * Unicast list handling functions
+ */
+
+/**
+ * dev_uc_add_excl - Add a global secondary unicast address
+ * @dev: device
+ * @addr: address to add
+ */
+int dev_uc_add_excl(struct net_device *dev, const unsigned char *addr)
+{
+ int err;
+
+ netif_addr_lock_bh(dev);
+ err = __hw_addr_add_ex(&dev->uc, addr, dev->addr_len,
+ NETDEV_HW_ADDR_T_UNICAST, true, false,
+ 0, true);
+ if (!err)
+ __dev_set_rx_mode(dev);
+ netif_addr_unlock_bh(dev);
+ return err;
+}
+EXPORT_SYMBOL(dev_uc_add_excl);
+
+/**
+ * dev_uc_add - Add a secondary unicast address
+ * @dev: device
+ * @addr: address to add
+ *
+ * Add a secondary unicast address to the device or increase
+ * the reference count if it already exists.
+ */
+int dev_uc_add(struct net_device *dev, const unsigned char *addr)
+{
+ int err;
+
+ netif_addr_lock_bh(dev);
+ err = __hw_addr_add(&dev->uc, addr, dev->addr_len,
+ NETDEV_HW_ADDR_T_UNICAST);
+ if (!err)
+ __dev_set_rx_mode(dev);
+ netif_addr_unlock_bh(dev);
+ return err;
+}
+EXPORT_SYMBOL(dev_uc_add);
+
+/**
+ * dev_uc_del - Release secondary unicast address.
+ * @dev: device
+ * @addr: address to delete
+ *
+ * Release reference to a secondary unicast address and remove it
+ * from the device if the reference count drops to zero.
+ */
+int dev_uc_del(struct net_device *dev, const unsigned char *addr)
+{
+ int err;
+
+ netif_addr_lock_bh(dev);
+ err = __hw_addr_del(&dev->uc, addr, dev->addr_len,
+ NETDEV_HW_ADDR_T_UNICAST);
+ if (!err)
+ __dev_set_rx_mode(dev);
+ netif_addr_unlock_bh(dev);
+ return err;
+}
+EXPORT_SYMBOL(dev_uc_del);
+
+/**
+ * dev_uc_sync - Synchronize device's unicast list to another device
+ * @to: destination device
+ * @from: source device
+ *
+ * Add newly added addresses to the destination device and release
+ * addresses that have no users left. The source device must be
+ * locked by netif_addr_lock_bh.
+ *
+ * This function is intended to be called from the dev->set_rx_mode
+ * function of layered software devices. This function assumes that
+ * addresses will only ever be synced to the @to devices and no other.
+ */
+int dev_uc_sync(struct net_device *to, struct net_device *from)
+{
+ int err = 0;
+
+ if (to->addr_len != from->addr_len)
+ return -EINVAL;
+
+ netif_addr_lock(to);
+ err = __hw_addr_sync(&to->uc, &from->uc, to->addr_len);
+ if (!err)
+ __dev_set_rx_mode(to);
+ netif_addr_unlock(to);
+ return err;
+}
+EXPORT_SYMBOL(dev_uc_sync);
+
+/**
+ * dev_uc_sync_multiple - Synchronize device's unicast list to another
+ * device, but allow for multiple calls to sync to multiple devices.
+ * @to: destination device
+ * @from: source device
+ *
+ * Add newly added addresses to the destination device and release
+ * addresses that have been deleted from the source. The source device
+ * must be locked by netif_addr_lock_bh.
+ *
+ * This function is intended to be called from the dev->set_rx_mode
+ * function of layered software devices. It allows for a single source
+ * device to be synced to multiple destination devices.
+ */
+int dev_uc_sync_multiple(struct net_device *to, struct net_device *from)
+{
+ int err = 0;
+
+ if (to->addr_len != from->addr_len)
+ return -EINVAL;
+
+ netif_addr_lock(to);
+ err = __hw_addr_sync_multiple(&to->uc, &from->uc, to->addr_len);
+ if (!err)
+ __dev_set_rx_mode(to);
+ netif_addr_unlock(to);
+ return err;
+}
+EXPORT_SYMBOL(dev_uc_sync_multiple);
+
+/**
+ * dev_uc_unsync - Remove synchronized addresses from the destination device
+ * @to: destination device
+ * @from: source device
+ *
+ * Remove all addresses that were added to the destination device by
+ * dev_uc_sync(). This function is intended to be called from the
+ * dev->stop function of layered software devices.
+ */
+void dev_uc_unsync(struct net_device *to, struct net_device *from)
+{
+ if (to->addr_len != from->addr_len)
+ return;
+
+ /* netif_addr_lock_bh() uses lockdep subclass 0, this is okay for two
+ * reasons:
+ * 1) This is always called without any addr_list_lock, so as the
+ * outermost one here, it must be 0.
+ * 2) This is called by some callers after unlinking the upper device,
+ * so the dev->lower_level becomes 1 again.
+ * Therefore, the subclass for 'from' is 0, for 'to' is either 1 or
+ * larger.
+ */
+ netif_addr_lock_bh(from);
+ netif_addr_lock(to);
+ __hw_addr_unsync(&to->uc, &from->uc, to->addr_len);
+ __dev_set_rx_mode(to);
+ netif_addr_unlock(to);
+ netif_addr_unlock_bh(from);
+}
+EXPORT_SYMBOL(dev_uc_unsync);
+
+/**
+ * dev_uc_flush - Flush unicast addresses
+ * @dev: device
+ *
+ * Flush unicast addresses.
+ */
+void dev_uc_flush(struct net_device *dev)
+{
+ netif_addr_lock_bh(dev);
+ __hw_addr_flush(&dev->uc);
+ netif_addr_unlock_bh(dev);
+}
+EXPORT_SYMBOL(dev_uc_flush);
+
+/**
+ * dev_uc_init - Init unicast address list
+ * @dev: device
+ *
+ * Init unicast address list.
+ */
+void dev_uc_init(struct net_device *dev)
+{
+ __hw_addr_init(&dev->uc);
+}
+EXPORT_SYMBOL(dev_uc_init);
+
+/*
+ * Multicast list handling functions
+ */
+
+/**
+ * dev_mc_add_excl - Add a global secondary multicast address
+ * @dev: device
+ * @addr: address to add
+ */
+int dev_mc_add_excl(struct net_device *dev, const unsigned char *addr)
+{
+ int err;
+
+ netif_addr_lock_bh(dev);
+ err = __hw_addr_add_ex(&dev->mc, addr, dev->addr_len,
+ NETDEV_HW_ADDR_T_MULTICAST, true, false,
+ 0, true);
+ if (!err)
+ __dev_set_rx_mode(dev);
+ netif_addr_unlock_bh(dev);
+ return err;
+}
+EXPORT_SYMBOL(dev_mc_add_excl);
+
+static int __dev_mc_add(struct net_device *dev, const unsigned char *addr,
+ bool global)
+{
+ int err;
+
+ netif_addr_lock_bh(dev);
+ err = __hw_addr_add_ex(&dev->mc, addr, dev->addr_len,
+ NETDEV_HW_ADDR_T_MULTICAST, global, false,
+ 0, false);
+ if (!err)
+ __dev_set_rx_mode(dev);
+ netif_addr_unlock_bh(dev);
+ return err;
+}
+/**
+ * dev_mc_add - Add a multicast address
+ * @dev: device
+ * @addr: address to add
+ *
+ * Add a multicast address to the device or increase
+ * the reference count if it already exists.
+ */
+int dev_mc_add(struct net_device *dev, const unsigned char *addr)
+{
+ return __dev_mc_add(dev, addr, false);
+}
+EXPORT_SYMBOL(dev_mc_add);
+
+/**
+ * dev_mc_add_global - Add a global multicast address
+ * @dev: device
+ * @addr: address to add
+ *
+ * Add a global multicast address to the device.
+ */
+int dev_mc_add_global(struct net_device *dev, const unsigned char *addr)
+{
+ return __dev_mc_add(dev, addr, true);
+}
+EXPORT_SYMBOL(dev_mc_add_global);
+
+static int __dev_mc_del(struct net_device *dev, const unsigned char *addr,
+ bool global)
+{
+ int err;
+
+ netif_addr_lock_bh(dev);
+ err = __hw_addr_del_ex(&dev->mc, addr, dev->addr_len,
+ NETDEV_HW_ADDR_T_MULTICAST, global, false);
+ if (!err)
+ __dev_set_rx_mode(dev);
+ netif_addr_unlock_bh(dev);
+ return err;
+}
+
+/**
+ * dev_mc_del - Delete a multicast address.
+ * @dev: device
+ * @addr: address to delete
+ *
+ * Release reference to a multicast address and remove it
+ * from the device if the reference count drops to zero.
+ */
+int dev_mc_del(struct net_device *dev, const unsigned char *addr)
+{
+ return __dev_mc_del(dev, addr, false);
+}
+EXPORT_SYMBOL(dev_mc_del);
+
+/**
+ * dev_mc_del_global - Delete a global multicast address.
+ * @dev: device
+ * @addr: address to delete
+ *
+ * Release reference to a multicast address and remove it
+ * from the device if the reference count drops to zero.
+ */
+int dev_mc_del_global(struct net_device *dev, const unsigned char *addr)
+{
+ return __dev_mc_del(dev, addr, true);
+}
+EXPORT_SYMBOL(dev_mc_del_global);
+
+/**
+ * dev_mc_sync - Synchronize device's multicast list to another device
+ * @to: destination device
+ * @from: source device
+ *
+ * Add newly added addresses to the destination device and release
+ * addresses that have no users left. The source device must be
+ * locked by netif_addr_lock_bh.
+ *
+ * This function is intended to be called from the ndo_set_rx_mode
+ * function of layered software devices.
+ */
+int dev_mc_sync(struct net_device *to, struct net_device *from)
+{
+ int err = 0;
+
+ if (to->addr_len != from->addr_len)
+ return -EINVAL;
+
+ netif_addr_lock(to);
+ err = __hw_addr_sync(&to->mc, &from->mc, to->addr_len);
+ if (!err)
+ __dev_set_rx_mode(to);
+ netif_addr_unlock(to);
+ return err;
+}
+EXPORT_SYMBOL(dev_mc_sync);
+
+/**
+ * dev_mc_sync_multiple - Synchronize device's multicast list to another
+ * device, but allow for multiple calls to sync to multiple devices.
+ * @to: destination device
+ * @from: source device
+ *
+ * Add newly added addresses to the destination device and release
+ * addresses that have no users left. The source device must be
+ * locked by netif_addr_lock_bh.
+ *
+ * This function is intended to be called from the ndo_set_rx_mode
+ * function of layered software devices. It allows for a single
+ * source device to be synced to multiple destination devices.
+ */
+int dev_mc_sync_multiple(struct net_device *to, struct net_device *from)
+{
+ int err = 0;
+
+ if (to->addr_len != from->addr_len)
+ return -EINVAL;
+
+ netif_addr_lock(to);
+ err = __hw_addr_sync_multiple(&to->mc, &from->mc, to->addr_len);
+ if (!err)
+ __dev_set_rx_mode(to);
+ netif_addr_unlock(to);
+ return err;
+}
+EXPORT_SYMBOL(dev_mc_sync_multiple);
+
+/**
+ * dev_mc_unsync - Remove synchronized addresses from the destination device
+ * @to: destination device
+ * @from: source device
+ *
+ * Remove all addresses that were added to the destination device by
+ * dev_mc_sync(). This function is intended to be called from the
+ * dev->stop function of layered software devices.
+ */
+void dev_mc_unsync(struct net_device *to, struct net_device *from)
+{
+ if (to->addr_len != from->addr_len)
+ return;
+
+ /* See the above comments inside dev_uc_unsync(). */
+ netif_addr_lock_bh(from);
+ netif_addr_lock(to);
+ __hw_addr_unsync(&to->mc, &from->mc, to->addr_len);
+ __dev_set_rx_mode(to);
+ netif_addr_unlock(to);
+ netif_addr_unlock_bh(from);
+}
+EXPORT_SYMBOL(dev_mc_unsync);
+
+/**
+ * dev_mc_flush - Flush multicast addresses
+ * @dev: device
+ *
+ * Flush multicast addresses.
+ */
+void dev_mc_flush(struct net_device *dev)
+{
+ netif_addr_lock_bh(dev);
+ __hw_addr_flush(&dev->mc);
+ netif_addr_unlock_bh(dev);
+}
+EXPORT_SYMBOL(dev_mc_flush);
+
+/**
+ * dev_mc_init - Init multicast address list
+ * @dev: device
+ *
+ * Init multicast address list.
+ */
+void dev_mc_init(struct net_device *dev)
+{
+ __hw_addr_init(&dev->mc);
+}
+EXPORT_SYMBOL(dev_mc_init);
diff --git a/net/core/dev_addr_lists_test.c b/net/core/dev_addr_lists_test.c
new file mode 100644
index 000000000000..8e1dba825e94
--- /dev/null
+++ b/net/core/dev_addr_lists_test.c
@@ -0,0 +1,247 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <kunit/test.h>
+#include <linux/etherdevice.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+
+static const struct net_device_ops dummy_netdev_ops = {
+};
+
+struct dev_addr_test_priv {
+ u32 addr_seen;
+};
+
+static int dev_addr_test_sync(struct net_device *netdev, const unsigned char *a)
+{
+ struct dev_addr_test_priv *datp = netdev_priv(netdev);
+
+ if (a[0] < 31 && !memchr_inv(a, a[0], ETH_ALEN))
+ datp->addr_seen |= 1 << a[0];
+ return 0;
+}
+
+static int dev_addr_test_unsync(struct net_device *netdev,
+ const unsigned char *a)
+{
+ struct dev_addr_test_priv *datp = netdev_priv(netdev);
+
+ if (a[0] < 31 && !memchr_inv(a, a[0], ETH_ALEN))
+ datp->addr_seen &= ~(1 << a[0]);
+ return 0;
+}
+
+static int dev_addr_test_init(struct kunit *test)
+{
+ struct dev_addr_test_priv *datp;
+ struct net_device *netdev;
+ int err;
+
+ netdev = alloc_etherdev(sizeof(*datp));
+ KUNIT_ASSERT_TRUE(test, !!netdev);
+
+ test->priv = netdev;
+ netdev->netdev_ops = &dummy_netdev_ops;
+
+ err = register_netdev(netdev);
+ if (err) {
+ free_netdev(netdev);
+ KUNIT_FAIL(test, "Can't register netdev %d", err);
+ }
+
+ return 0;
+}
+
+static void dev_addr_test_exit(struct kunit *test)
+{
+ struct net_device *netdev = test->priv;
+
+ unregister_netdev(netdev);
+ free_netdev(netdev);
+}
+
+static void dev_addr_test_basic(struct kunit *test)
+{
+ struct net_device *netdev = test->priv;
+ u8 addr[ETH_ALEN];
+
+ rtnl_lock();
+ KUNIT_EXPECT_TRUE(test, !!netdev->dev_addr);
+
+ memset(addr, 2, sizeof(addr));
+ eth_hw_addr_set(netdev, addr);
+ KUNIT_EXPECT_MEMEQ(test, netdev->dev_addr, addr, sizeof(addr));
+
+ memset(addr, 3, sizeof(addr));
+ dev_addr_set(netdev, addr);
+ KUNIT_EXPECT_MEMEQ(test, netdev->dev_addr, addr, sizeof(addr));
+ rtnl_unlock();
+}
+
+static void dev_addr_test_sync_one(struct kunit *test)
+{
+ struct net_device *netdev = test->priv;
+ struct dev_addr_test_priv *datp;
+ u8 addr[ETH_ALEN];
+
+ datp = netdev_priv(netdev);
+
+ rtnl_lock();
+ memset(addr, 1, sizeof(addr));
+ eth_hw_addr_set(netdev, addr);
+
+ __hw_addr_sync_dev(&netdev->dev_addrs, netdev, dev_addr_test_sync,
+ dev_addr_test_unsync);
+ KUNIT_EXPECT_EQ(test, 2, datp->addr_seen);
+
+ memset(addr, 2, sizeof(addr));
+ eth_hw_addr_set(netdev, addr);
+
+ datp->addr_seen = 0;
+ __hw_addr_sync_dev(&netdev->dev_addrs, netdev, dev_addr_test_sync,
+ dev_addr_test_unsync);
+ /* It's not going to sync anything because the main address is
+ * considered synced and we overwrite in place.
+ */
+ KUNIT_EXPECT_EQ(test, 0, datp->addr_seen);
+ rtnl_unlock();
+}
+
+static void dev_addr_test_add_del(struct kunit *test)
+{
+ struct net_device *netdev = test->priv;
+ struct dev_addr_test_priv *datp;
+ u8 addr[ETH_ALEN];
+ int i;
+
+ datp = netdev_priv(netdev);
+
+ rtnl_lock();
+ for (i = 1; i < 4; i++) {
+ memset(addr, i, sizeof(addr));
+ KUNIT_EXPECT_EQ(test, 0, dev_addr_add(netdev, addr,
+ NETDEV_HW_ADDR_T_LAN));
+ }
+ /* Add 3 again */
+ KUNIT_EXPECT_EQ(test, 0, dev_addr_add(netdev, addr,
+ NETDEV_HW_ADDR_T_LAN));
+
+ __hw_addr_sync_dev(&netdev->dev_addrs, netdev, dev_addr_test_sync,
+ dev_addr_test_unsync);
+ KUNIT_EXPECT_EQ(test, 0xf, datp->addr_seen);
+
+ KUNIT_EXPECT_EQ(test, 0, dev_addr_del(netdev, addr,
+ NETDEV_HW_ADDR_T_LAN));
+
+ __hw_addr_sync_dev(&netdev->dev_addrs, netdev, dev_addr_test_sync,
+ dev_addr_test_unsync);
+ KUNIT_EXPECT_EQ(test, 0xf, datp->addr_seen);
+
+ for (i = 1; i < 4; i++) {
+ memset(addr, i, sizeof(addr));
+ KUNIT_EXPECT_EQ(test, 0, dev_addr_del(netdev, addr,
+ NETDEV_HW_ADDR_T_LAN));
+ }
+
+ __hw_addr_sync_dev(&netdev->dev_addrs, netdev, dev_addr_test_sync,
+ dev_addr_test_unsync);
+ KUNIT_EXPECT_EQ(test, 1, datp->addr_seen);
+ rtnl_unlock();
+}
+
+static void dev_addr_test_del_main(struct kunit *test)
+{
+ struct net_device *netdev = test->priv;
+ u8 addr[ETH_ALEN];
+
+ rtnl_lock();
+ memset(addr, 1, sizeof(addr));
+ eth_hw_addr_set(netdev, addr);
+
+ KUNIT_EXPECT_EQ(test, -ENOENT, dev_addr_del(netdev, addr,
+ NETDEV_HW_ADDR_T_LAN));
+ KUNIT_EXPECT_EQ(test, 0, dev_addr_add(netdev, addr,
+ NETDEV_HW_ADDR_T_LAN));
+ KUNIT_EXPECT_EQ(test, 0, dev_addr_del(netdev, addr,
+ NETDEV_HW_ADDR_T_LAN));
+ KUNIT_EXPECT_EQ(test, -ENOENT, dev_addr_del(netdev, addr,
+ NETDEV_HW_ADDR_T_LAN));
+ rtnl_unlock();
+}
+
+static void dev_addr_test_add_set(struct kunit *test)
+{
+ struct net_device *netdev = test->priv;
+ struct dev_addr_test_priv *datp;
+ u8 addr[ETH_ALEN];
+ int i;
+
+ datp = netdev_priv(netdev);
+
+ rtnl_lock();
+ /* There is no external API like dev_addr_add_excl(),
+ * so shuffle the tree a little bit and exploit aliasing.
+ */
+ for (i = 1; i < 16; i++) {
+ memset(addr, i, sizeof(addr));
+ KUNIT_EXPECT_EQ(test, 0, dev_addr_add(netdev, addr,
+ NETDEV_HW_ADDR_T_LAN));
+ }
+
+ memset(addr, i, sizeof(addr));
+ eth_hw_addr_set(netdev, addr);
+ KUNIT_EXPECT_EQ(test, 0, dev_addr_add(netdev, addr,
+ NETDEV_HW_ADDR_T_LAN));
+ memset(addr, 0, sizeof(addr));
+ eth_hw_addr_set(netdev, addr);
+
+ __hw_addr_sync_dev(&netdev->dev_addrs, netdev, dev_addr_test_sync,
+ dev_addr_test_unsync);
+ KUNIT_EXPECT_EQ(test, 0xffff, datp->addr_seen);
+ rtnl_unlock();
+}
+
+static void dev_addr_test_add_excl(struct kunit *test)
+{
+ struct net_device *netdev = test->priv;
+ u8 addr[ETH_ALEN];
+ int i;
+
+ rtnl_lock();
+ for (i = 0; i < 10; i++) {
+ memset(addr, i, sizeof(addr));
+ KUNIT_EXPECT_EQ(test, 0, dev_uc_add_excl(netdev, addr));
+ }
+ KUNIT_EXPECT_EQ(test, -EEXIST, dev_uc_add_excl(netdev, addr));
+
+ for (i = 0; i < 10; i += 2) {
+ memset(addr, i, sizeof(addr));
+ KUNIT_EXPECT_EQ(test, 0, dev_uc_del(netdev, addr));
+ }
+ for (i = 1; i < 10; i += 2) {
+ memset(addr, i, sizeof(addr));
+ KUNIT_EXPECT_EQ(test, -EEXIST, dev_uc_add_excl(netdev, addr));
+ }
+ rtnl_unlock();
+}
+
+static struct kunit_case dev_addr_test_cases[] = {
+ KUNIT_CASE(dev_addr_test_basic),
+ KUNIT_CASE(dev_addr_test_sync_one),
+ KUNIT_CASE(dev_addr_test_add_del),
+ KUNIT_CASE(dev_addr_test_del_main),
+ KUNIT_CASE(dev_addr_test_add_set),
+ KUNIT_CASE(dev_addr_test_add_excl),
+ {}
+};
+
+static struct kunit_suite dev_addr_test_suite = {
+ .name = "dev-addr-list-test",
+ .test_cases = dev_addr_test_cases,
+ .init = dev_addr_test_init,
+ .exit = dev_addr_test_exit,
+};
+kunit_test_suite(dev_addr_test_suite);
+
+MODULE_DESCRIPTION("KUnit tests for struct netdev_hw_addr_list");
+MODULE_LICENSE("GPL");
diff --git a/net/core/dev_api.c b/net/core/dev_api.c
new file mode 100644
index 000000000000..f28852078aa6
--- /dev/null
+++ b/net/core/dev_api.c
@@ -0,0 +1,382 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/netdevice.h>
+#include <net/netdev_lock.h>
+
+#include "dev.h"
+
+/**
+ * dev_change_name() - change name of a device
+ * @dev: device
+ * @newname: name (or format string) must be at least IFNAMSIZ
+ *
+ * Change name of a device, can pass format strings "eth%d".
+ * for wildcarding.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int dev_change_name(struct net_device *dev, const char *newname)
+{
+ int ret;
+
+ netdev_lock_ops(dev);
+ ret = netif_change_name(dev, newname);
+ netdev_unlock_ops(dev);
+
+ return ret;
+}
+
+/**
+ * dev_set_alias() - change ifalias of a device
+ * @dev: device
+ * @alias: name up to IFALIASZ
+ * @len: limit of bytes to copy from info
+ *
+ * Set ifalias for a device.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
+{
+ int ret;
+
+ netdev_lock_ops(dev);
+ ret = netif_set_alias(dev, alias, len);
+ netdev_unlock_ops(dev);
+
+ return ret;
+}
+EXPORT_SYMBOL(dev_set_alias);
+
+/**
+ * dev_change_flags() - change device settings
+ * @dev: device
+ * @flags: device state flags
+ * @extack: netlink extended ack
+ *
+ * Change settings on device based state flags. The flags are
+ * in the userspace exported format.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int dev_change_flags(struct net_device *dev, unsigned int flags,
+ struct netlink_ext_ack *extack)
+{
+ int ret;
+
+ netdev_lock_ops(dev);
+ ret = netif_change_flags(dev, flags, extack);
+ netdev_unlock_ops(dev);
+
+ return ret;
+}
+EXPORT_SYMBOL(dev_change_flags);
+
+/**
+ * dev_set_group() - change group this device belongs to
+ * @dev: device
+ * @new_group: group this device should belong to
+ */
+void dev_set_group(struct net_device *dev, int new_group)
+{
+ netdev_lock_ops(dev);
+ netif_set_group(dev, new_group);
+ netdev_unlock_ops(dev);
+}
+
+int dev_set_mac_address_user(struct net_device *dev,
+ struct sockaddr_storage *ss,
+ struct netlink_ext_ack *extack)
+{
+ int ret;
+
+ down_write(&dev_addr_sem);
+ netdev_lock_ops(dev);
+ ret = netif_set_mac_address(dev, ss, extack);
+ netdev_unlock_ops(dev);
+ up_write(&dev_addr_sem);
+
+ return ret;
+}
+EXPORT_SYMBOL(dev_set_mac_address_user);
+
+/**
+ * dev_change_net_namespace() - move device to different nethost namespace
+ * @dev: device
+ * @net: network namespace
+ * @pat: If not NULL name pattern to try if the current device name
+ * is already taken in the destination network namespace.
+ *
+ * This function shuts down a device interface and moves it
+ * to a new network namespace. On success 0 is returned, on
+ * a failure a netagive errno code is returned.
+ *
+ * Callers must hold the rtnl semaphore.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int dev_change_net_namespace(struct net_device *dev, struct net *net,
+ const char *pat)
+{
+ return __dev_change_net_namespace(dev, net, pat, 0, NULL);
+}
+EXPORT_SYMBOL_GPL(dev_change_net_namespace);
+
+/**
+ * dev_change_carrier() - change device carrier
+ * @dev: device
+ * @new_carrier: new value
+ *
+ * Change device carrier
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int dev_change_carrier(struct net_device *dev, bool new_carrier)
+{
+ int ret;
+
+ netdev_lock_ops(dev);
+ ret = netif_change_carrier(dev, new_carrier);
+ netdev_unlock_ops(dev);
+
+ return ret;
+}
+
+/**
+ * dev_change_tx_queue_len() - change TX queue length of a netdevice
+ * @dev: device
+ * @new_len: new tx queue length
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
+{
+ int ret;
+
+ netdev_lock_ops(dev);
+ ret = netif_change_tx_queue_len(dev, new_len);
+ netdev_unlock_ops(dev);
+
+ return ret;
+}
+
+/**
+ * dev_change_proto_down() - set carrier according to proto_down
+ * @dev: device
+ * @proto_down: new value
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int dev_change_proto_down(struct net_device *dev, bool proto_down)
+{
+ int ret;
+
+ netdev_lock_ops(dev);
+ ret = netif_change_proto_down(dev, proto_down);
+ netdev_unlock_ops(dev);
+
+ return ret;
+}
+
+/**
+ * dev_open() - prepare an interface for use
+ * @dev: device to open
+ * @extack: netlink extended ack
+ *
+ * Takes a device from down to up state. The device's private open
+ * function is invoked and then the multicast lists are loaded. Finally
+ * the device is moved into the up state and a %NETDEV_UP message is
+ * sent to the netdev notifier chain.
+ *
+ * Calling this function on an active interface is a nop. On a failure
+ * a negative errno code is returned.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
+{
+ int ret;
+
+ netdev_lock_ops(dev);
+ ret = netif_open(dev, extack);
+ netdev_unlock_ops(dev);
+
+ return ret;
+}
+EXPORT_SYMBOL(dev_open);
+
+/**
+ * dev_close() - shutdown an interface
+ * @dev: device to shutdown
+ *
+ * This function moves an active device into down state. A
+ * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
+ * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
+ * chain.
+ */
+void dev_close(struct net_device *dev)
+{
+ netdev_lock_ops(dev);
+ netif_close(dev);
+ netdev_unlock_ops(dev);
+}
+EXPORT_SYMBOL(dev_close);
+
+int dev_eth_ioctl(struct net_device *dev,
+ struct ifreq *ifr, unsigned int cmd)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+ int ret = -ENODEV;
+
+ if (!ops->ndo_eth_ioctl)
+ return -EOPNOTSUPP;
+
+ netdev_lock_ops(dev);
+ if (netif_device_present(dev))
+ ret = ops->ndo_eth_ioctl(dev, ifr, cmd);
+ netdev_unlock_ops(dev);
+
+ return ret;
+}
+EXPORT_SYMBOL(dev_eth_ioctl);
+
+int dev_set_mtu(struct net_device *dev, int new_mtu)
+{
+ int ret;
+
+ netdev_lock_ops(dev);
+ ret = netif_set_mtu(dev, new_mtu);
+ netdev_unlock_ops(dev);
+
+ return ret;
+}
+EXPORT_SYMBOL(dev_set_mtu);
+
+/**
+ * dev_disable_lro() - disable Large Receive Offload on a device
+ * @dev: device
+ *
+ * Disable Large Receive Offload (LRO) on a net device. Must be
+ * called under RTNL. This is needed if received packets may be
+ * forwarded to another interface.
+ */
+void dev_disable_lro(struct net_device *dev)
+{
+ netdev_lock_ops(dev);
+ netif_disable_lro(dev);
+ netdev_unlock_ops(dev);
+}
+EXPORT_SYMBOL(dev_disable_lro);
+
+/**
+ * dev_set_promiscuity() - update promiscuity count on a device
+ * @dev: device
+ * @inc: modifier
+ *
+ * Add or remove promiscuity from a device. While the count in the device
+ * remains above zero the interface remains promiscuous. Once it hits zero
+ * the device reverts back to normal filtering operation. A negative inc
+ * value is used to drop promiscuity on the device.
+ * Return 0 if successful or a negative errno code on error.
+ */
+int dev_set_promiscuity(struct net_device *dev, int inc)
+{
+ int ret;
+
+ netdev_lock_ops(dev);
+ ret = netif_set_promiscuity(dev, inc);
+ netdev_unlock_ops(dev);
+
+ return ret;
+}
+EXPORT_SYMBOL(dev_set_promiscuity);
+
+/**
+ * dev_set_allmulti() - update allmulti count on a device
+ * @dev: device
+ * @inc: modifier
+ *
+ * Add or remove reception of all multicast frames to a device. While the
+ * count in the device remains above zero the interface remains listening
+ * to all interfaces. Once it hits zero the device reverts back to normal
+ * filtering operation. A negative @inc value is used to drop the counter
+ * when releasing a resource needing all multicasts.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+
+int dev_set_allmulti(struct net_device *dev, int inc)
+{
+ int ret;
+
+ netdev_lock_ops(dev);
+ ret = netif_set_allmulti(dev, inc, true);
+ netdev_unlock_ops(dev);
+
+ return ret;
+}
+EXPORT_SYMBOL(dev_set_allmulti);
+
+/**
+ * dev_set_mac_address() - change Media Access Control Address
+ * @dev: device
+ * @ss: new address
+ * @extack: netlink extended ack
+ *
+ * Change the hardware (MAC) address of the device
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int dev_set_mac_address(struct net_device *dev, struct sockaddr_storage *ss,
+ struct netlink_ext_ack *extack)
+{
+ int ret;
+
+ netdev_lock_ops(dev);
+ ret = netif_set_mac_address(dev, ss, extack);
+ netdev_unlock_ops(dev);
+
+ return ret;
+}
+EXPORT_SYMBOL(dev_set_mac_address);
+
+int dev_xdp_propagate(struct net_device *dev, struct netdev_bpf *bpf)
+{
+ int ret;
+
+ netdev_lock_ops(dev);
+ ret = netif_xdp_propagate(dev, bpf);
+ netdev_unlock_ops(dev);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(dev_xdp_propagate);
+
+/**
+ * netdev_state_change() - device changes state
+ * @dev: device to cause notification
+ *
+ * Called to indicate a device has changed state. This function calls
+ * the notifier chains for netdev_chain and sends a NEWLINK message
+ * to the routing socket.
+ */
+void netdev_state_change(struct net_device *dev)
+{
+ netdev_lock_ops(dev);
+ netif_state_change(dev);
+ netdev_unlock_ops(dev);
+}
+EXPORT_SYMBOL(netdev_state_change);
+
+int dev_set_threaded(struct net_device *dev,
+ enum netdev_napi_threaded threaded)
+{
+ int ret;
+
+ netdev_lock(dev);
+ ret = netif_set_threaded(dev, threaded);
+ netdev_unlock(dev);
+
+ return ret;
+}
+EXPORT_SYMBOL(dev_set_threaded);
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
new file mode 100644
index 000000000000..53a53357cfef
--- /dev/null
+++ b/net/core/dev_ioctl.c
@@ -0,0 +1,875 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kmod.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/net_tstamp.h>
+#include <linux/phylib_stubs.h>
+#include <linux/ptp_clock_kernel.h>
+#include <linux/wireless.h>
+#include <linux/if_bridge.h>
+#include <net/dsa_stubs.h>
+#include <net/netdev_lock.h>
+#include <net/wext.h>
+
+#include "dev.h"
+
+/*
+ * Map an interface index to its name (SIOCGIFNAME)
+ */
+
+/*
+ * We need this ioctl for efficient implementation of the
+ * if_indextoname() function required by the IPv6 API. Without
+ * it, we would have to search all the interfaces to find a
+ * match. --pb
+ */
+
+static int dev_ifname(struct net *net, struct ifreq *ifr)
+{
+ ifr->ifr_name[IFNAMSIZ-1] = 0;
+ return netdev_get_name(net, ifr->ifr_name, ifr->ifr_ifindex);
+}
+
+/*
+ * Perform a SIOCGIFCONF call. This structure will change
+ * size eventually, and there is nothing I can do about it.
+ * Thus we will need a 'compatibility mode'.
+ */
+int dev_ifconf(struct net *net, struct ifconf __user *uifc)
+{
+ struct net_device *dev;
+ void __user *pos;
+ size_t size;
+ int len, total = 0, done;
+
+ /* both the ifconf and the ifreq structures are slightly different */
+ if (in_compat_syscall()) {
+ struct compat_ifconf ifc32;
+
+ if (copy_from_user(&ifc32, uifc, sizeof(struct compat_ifconf)))
+ return -EFAULT;
+
+ pos = compat_ptr(ifc32.ifcbuf);
+ len = ifc32.ifc_len;
+ size = sizeof(struct compat_ifreq);
+ } else {
+ struct ifconf ifc;
+
+ if (copy_from_user(&ifc, uifc, sizeof(struct ifconf)))
+ return -EFAULT;
+
+ pos = ifc.ifc_buf;
+ len = ifc.ifc_len;
+ size = sizeof(struct ifreq);
+ }
+
+ /* Loop over the interfaces, and write an info block for each. */
+ rtnl_net_lock(net);
+ for_each_netdev(net, dev) {
+ if (!pos)
+ done = inet_gifconf(dev, NULL, 0, size);
+ else
+ done = inet_gifconf(dev, pos + total,
+ len - total, size);
+ if (done < 0) {
+ rtnl_net_unlock(net);
+ return -EFAULT;
+ }
+ total += done;
+ }
+ rtnl_net_unlock(net);
+
+ return put_user(total, &uifc->ifc_len);
+}
+
+static int dev_getifmap(struct net_device *dev, struct ifreq *ifr)
+{
+ struct ifmap *ifmap = &ifr->ifr_map;
+
+ if (in_compat_syscall()) {
+ struct compat_ifmap *cifmap = (struct compat_ifmap *)ifmap;
+
+ cifmap->mem_start = dev->mem_start;
+ cifmap->mem_end = dev->mem_end;
+ cifmap->base_addr = dev->base_addr;
+ cifmap->irq = dev->irq;
+ cifmap->dma = dev->dma;
+ cifmap->port = dev->if_port;
+
+ return 0;
+ }
+
+ ifmap->mem_start = dev->mem_start;
+ ifmap->mem_end = dev->mem_end;
+ ifmap->base_addr = dev->base_addr;
+ ifmap->irq = dev->irq;
+ ifmap->dma = dev->dma;
+ ifmap->port = dev->if_port;
+
+ return 0;
+}
+
+static int netif_setifmap(struct net_device *dev, struct ifreq *ifr)
+{
+ struct compat_ifmap *cifmap = (struct compat_ifmap *)&ifr->ifr_map;
+
+ if (!dev->netdev_ops->ndo_set_config)
+ return -EOPNOTSUPP;
+
+ if (in_compat_syscall()) {
+ struct ifmap ifmap = {
+ .mem_start = cifmap->mem_start,
+ .mem_end = cifmap->mem_end,
+ .base_addr = cifmap->base_addr,
+ .irq = cifmap->irq,
+ .dma = cifmap->dma,
+ .port = cifmap->port,
+ };
+
+ return dev->netdev_ops->ndo_set_config(dev, &ifmap);
+ }
+
+ return dev->netdev_ops->ndo_set_config(dev, &ifr->ifr_map);
+}
+
+/*
+ * Perform the SIOCxIFxxx calls, inside rcu_read_lock()
+ */
+static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
+{
+ int err;
+ struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
+
+ if (!dev)
+ return -ENODEV;
+
+ switch (cmd) {
+ case SIOCGIFFLAGS: /* Get interface flags */
+ ifr->ifr_flags = (short)netif_get_flags(dev);
+ return 0;
+
+ case SIOCGIFMETRIC: /* Get the metric on the interface
+ (currently unused) */
+ ifr->ifr_metric = 0;
+ return 0;
+
+ case SIOCGIFMTU: /* Get the MTU of a device */
+ ifr->ifr_mtu = dev->mtu;
+ return 0;
+
+ case SIOCGIFSLAVE:
+ err = -EINVAL;
+ break;
+
+ case SIOCGIFMAP:
+ return dev_getifmap(dev, ifr);
+
+ case SIOCGIFINDEX:
+ ifr->ifr_ifindex = dev->ifindex;
+ return 0;
+
+ case SIOCGIFTXQLEN:
+ ifr->ifr_qlen = dev->tx_queue_len;
+ return 0;
+
+ default:
+ /* dev_ioctl() should ensure this case
+ * is never reached
+ */
+ WARN_ON(1);
+ err = -ENOTTY;
+ break;
+
+ }
+ return err;
+}
+
+int net_hwtstamp_validate(const struct kernel_hwtstamp_config *cfg)
+{
+ enum hwtstamp_tx_types tx_type;
+ enum hwtstamp_rx_filters rx_filter;
+ int tx_type_valid = 0;
+ int rx_filter_valid = 0;
+
+ if (cfg->flags & ~HWTSTAMP_FLAG_MASK)
+ return -EINVAL;
+
+ tx_type = cfg->tx_type;
+ rx_filter = cfg->rx_filter;
+
+ switch (tx_type) {
+ case HWTSTAMP_TX_OFF:
+ case HWTSTAMP_TX_ON:
+ case HWTSTAMP_TX_ONESTEP_SYNC:
+ case HWTSTAMP_TX_ONESTEP_P2P:
+ tx_type_valid = 1;
+ break;
+ case __HWTSTAMP_TX_CNT:
+ /* not a real value */
+ break;
+ }
+
+ switch (rx_filter) {
+ case HWTSTAMP_FILTER_NONE:
+ case HWTSTAMP_FILTER_ALL:
+ case HWTSTAMP_FILTER_SOME:
+ case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
+ case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
+ case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
+ case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
+ case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
+ case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
+ case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
+ case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
+ case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
+ case HWTSTAMP_FILTER_PTP_V2_EVENT:
+ case HWTSTAMP_FILTER_PTP_V2_SYNC:
+ case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
+ case HWTSTAMP_FILTER_NTP_ALL:
+ rx_filter_valid = 1;
+ break;
+ case __HWTSTAMP_FILTER_CNT:
+ /* not a real value */
+ break;
+ }
+
+ if (!tx_type_valid || !rx_filter_valid)
+ return -ERANGE;
+
+ return 0;
+}
+
+/**
+ * dev_get_hwtstamp_phylib() - Get hardware timestamping settings of NIC
+ * or of attached phylib PHY
+ * @dev: Network device
+ * @cfg: Timestamping configuration structure
+ *
+ * Helper for calling the default hardware provider timestamping.
+ *
+ * Note: phy_mii_ioctl() only handles SIOCSHWTSTAMP (not SIOCGHWTSTAMP), but
+ * phydev->mii_ts has both hwtstamp_get() and hwtstamp_set() methods. So this
+ * will return -EOPNOTSUPP for phylib only if hwtstamp_get() is not
+ * implemented for now, which is still more accurate than letting the netdev
+ * handle the GET request.
+ */
+int dev_get_hwtstamp_phylib(struct net_device *dev,
+ struct kernel_hwtstamp_config *cfg)
+{
+ struct hwtstamp_provider *hwprov;
+
+ hwprov = rtnl_dereference(dev->hwprov);
+ if (hwprov) {
+ cfg->qualifier = hwprov->desc.qualifier;
+ if (hwprov->source == HWTSTAMP_SOURCE_PHYLIB &&
+ hwprov->phydev)
+ return phy_hwtstamp_get(hwprov->phydev, cfg);
+
+ if (hwprov->source == HWTSTAMP_SOURCE_NETDEV)
+ return dev->netdev_ops->ndo_hwtstamp_get(dev, cfg);
+
+ return -EOPNOTSUPP;
+ }
+
+ if (phy_is_default_hwtstamp(dev->phydev))
+ return phy_hwtstamp_get(dev->phydev, cfg);
+
+ return dev->netdev_ops->ndo_hwtstamp_get(dev, cfg);
+}
+
+static int dev_get_hwtstamp(struct net_device *dev, struct ifreq *ifr)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+ struct kernel_hwtstamp_config kernel_cfg = {};
+ struct hwtstamp_config cfg;
+ int err;
+
+ if (!ops->ndo_hwtstamp_get)
+ return dev_eth_ioctl(dev, ifr, SIOCGHWTSTAMP); /* legacy */
+
+ if (!netif_device_present(dev))
+ return -ENODEV;
+
+ kernel_cfg.ifr = ifr;
+ netdev_lock_ops(dev);
+ err = dev_get_hwtstamp_phylib(dev, &kernel_cfg);
+ netdev_unlock_ops(dev);
+ if (err)
+ return err;
+
+ /* If the request was resolved through an unconverted driver, omit
+ * the copy_to_user(), since the implementation has already done that
+ */
+ if (!kernel_cfg.copied_to_user) {
+ hwtstamp_config_from_kernel(&cfg, &kernel_cfg);
+
+ if (copy_to_user(ifr->ifr_data, &cfg, sizeof(cfg)))
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+/**
+ * dev_set_hwtstamp_phylib() - Change hardware timestamping of NIC
+ * or of attached phylib PHY
+ * @dev: Network device
+ * @cfg: Timestamping configuration structure
+ * @extack: Netlink extended ack message structure, for error reporting
+ *
+ * Helper for enforcing a common policy that phylib timestamping, if available,
+ * should take precedence in front of hardware timestamping provided by the
+ * netdev. If the netdev driver needs to perform specific actions even for PHY
+ * timestamping to work properly (a switch port must trap the timestamped
+ * frames and not forward them), it must set dev->see_all_hwtstamp_requests.
+ */
+int dev_set_hwtstamp_phylib(struct net_device *dev,
+ struct kernel_hwtstamp_config *cfg,
+ struct netlink_ext_ack *extack)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+ struct kernel_hwtstamp_config old_cfg = {};
+ struct hwtstamp_provider *hwprov;
+ struct phy_device *phydev;
+ bool changed = false;
+ bool phy_ts;
+ int err;
+
+ hwprov = rtnl_dereference(dev->hwprov);
+ if (hwprov) {
+ if (hwprov->source == HWTSTAMP_SOURCE_PHYLIB &&
+ hwprov->phydev) {
+ phy_ts = true;
+ phydev = hwprov->phydev;
+ } else if (hwprov->source == HWTSTAMP_SOURCE_NETDEV) {
+ phy_ts = false;
+ } else {
+ return -EOPNOTSUPP;
+ }
+
+ cfg->qualifier = hwprov->desc.qualifier;
+ } else {
+ phy_ts = phy_is_default_hwtstamp(dev->phydev);
+ if (phy_ts)
+ phydev = dev->phydev;
+ }
+
+ cfg->source = phy_ts ? HWTSTAMP_SOURCE_PHYLIB : HWTSTAMP_SOURCE_NETDEV;
+
+ if (phy_ts && dev->see_all_hwtstamp_requests) {
+ err = ops->ndo_hwtstamp_get(dev, &old_cfg);
+ if (err)
+ return err;
+ }
+
+ if (!phy_ts || dev->see_all_hwtstamp_requests) {
+ err = ops->ndo_hwtstamp_set(dev, cfg, extack);
+ if (err) {
+ if (extack->_msg)
+ netdev_err(dev, "%s\n", extack->_msg);
+ return err;
+ }
+ }
+
+ if (phy_ts && dev->see_all_hwtstamp_requests)
+ changed = kernel_hwtstamp_config_changed(&old_cfg, cfg);
+
+ if (phy_ts) {
+ err = phy_hwtstamp_set(phydev, cfg, extack);
+ if (err) {
+ if (changed)
+ ops->ndo_hwtstamp_set(dev, &old_cfg, NULL);
+ return err;
+ }
+ }
+
+ return 0;
+}
+
+static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+ struct kernel_hwtstamp_config kernel_cfg = {};
+ struct netlink_ext_ack extack = {};
+ struct hwtstamp_config cfg;
+ int err;
+
+ if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
+ return -EFAULT;
+
+ hwtstamp_config_to_kernel(&kernel_cfg, &cfg);
+ kernel_cfg.ifr = ifr;
+
+ err = net_hwtstamp_validate(&kernel_cfg);
+ if (err)
+ return err;
+
+ err = dsa_conduit_hwtstamp_validate(dev, &kernel_cfg, &extack);
+ if (err) {
+ if (extack._msg)
+ netdev_err(dev, "%s\n", extack._msg);
+ return err;
+ }
+
+ if (!ops->ndo_hwtstamp_set)
+ return dev_eth_ioctl(dev, ifr, SIOCSHWTSTAMP); /* legacy */
+
+ if (!netif_device_present(dev))
+ return -ENODEV;
+
+ netdev_lock_ops(dev);
+ err = dev_set_hwtstamp_phylib(dev, &kernel_cfg, &extack);
+ netdev_unlock_ops(dev);
+ if (err)
+ return err;
+
+ /* The driver may have modified the configuration, so copy the
+ * updated version of it back to user space
+ */
+ if (!kernel_cfg.copied_to_user) {
+ hwtstamp_config_from_kernel(&cfg, &kernel_cfg);
+
+ if (copy_to_user(ifr->ifr_data, &cfg, sizeof(cfg)))
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+static int generic_hwtstamp_ioctl_lower(struct net_device *dev, int cmd,
+ struct kernel_hwtstamp_config *kernel_cfg)
+{
+ struct ifreq ifrr;
+ int err;
+
+ if (!kernel_cfg->ifr)
+ return -EINVAL;
+
+ strscpy_pad(ifrr.ifr_name, dev->name, IFNAMSIZ);
+ ifrr.ifr_ifru = kernel_cfg->ifr->ifr_ifru;
+
+ err = dev_eth_ioctl(dev, &ifrr, cmd);
+ if (err)
+ return err;
+
+ kernel_cfg->ifr->ifr_ifru = ifrr.ifr_ifru;
+ kernel_cfg->copied_to_user = true;
+
+ return 0;
+}
+
+int generic_hwtstamp_get_lower(struct net_device *dev,
+ struct kernel_hwtstamp_config *kernel_cfg)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ if (!netif_device_present(dev))
+ return -ENODEV;
+
+ if (ops->ndo_hwtstamp_get) {
+ int err;
+
+ netdev_lock_ops(dev);
+ err = dev_get_hwtstamp_phylib(dev, kernel_cfg);
+ netdev_unlock_ops(dev);
+
+ return err;
+ }
+
+ /* Legacy path: unconverted lower driver */
+ return generic_hwtstamp_ioctl_lower(dev, SIOCGHWTSTAMP, kernel_cfg);
+}
+EXPORT_SYMBOL(generic_hwtstamp_get_lower);
+
+int generic_hwtstamp_set_lower(struct net_device *dev,
+ struct kernel_hwtstamp_config *kernel_cfg,
+ struct netlink_ext_ack *extack)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ if (!netif_device_present(dev))
+ return -ENODEV;
+
+ if (ops->ndo_hwtstamp_set) {
+ int err;
+
+ netdev_lock_ops(dev);
+ err = dev_set_hwtstamp_phylib(dev, kernel_cfg, extack);
+ netdev_unlock_ops(dev);
+
+ return err;
+ }
+
+ /* Legacy path: unconverted lower driver */
+ return generic_hwtstamp_ioctl_lower(dev, SIOCSHWTSTAMP, kernel_cfg);
+}
+EXPORT_SYMBOL(generic_hwtstamp_set_lower);
+
+static int dev_siocbond(struct net_device *dev,
+ struct ifreq *ifr, unsigned int cmd)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ if (ops->ndo_siocbond) {
+ int ret = -ENODEV;
+
+ netdev_lock_ops(dev);
+ if (netif_device_present(dev))
+ ret = ops->ndo_siocbond(dev, ifr, cmd);
+ netdev_unlock_ops(dev);
+
+ return ret;
+ }
+
+ return -EOPNOTSUPP;
+}
+
+static int dev_siocdevprivate(struct net_device *dev, struct ifreq *ifr,
+ void __user *data, unsigned int cmd)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ if (ops->ndo_siocdevprivate) {
+ int ret = -ENODEV;
+
+ netdev_lock_ops(dev);
+ if (netif_device_present(dev))
+ ret = ops->ndo_siocdevprivate(dev, ifr, data, cmd);
+ netdev_unlock_ops(dev);
+
+ return ret;
+ }
+
+ return -EOPNOTSUPP;
+}
+
+static int dev_siocwandev(struct net_device *dev, struct if_settings *ifs)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ if (ops->ndo_siocwandev) {
+ int ret = -ENODEV;
+
+ netdev_lock_ops(dev);
+ if (netif_device_present(dev))
+ ret = ops->ndo_siocwandev(dev, ifs);
+ netdev_unlock_ops(dev);
+
+ return ret;
+ }
+
+ return -EOPNOTSUPP;
+}
+
+/*
+ * Perform the SIOCxIFxxx calls, inside rtnl_net_lock()
+ */
+static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data,
+ unsigned int cmd)
+{
+ int err;
+ struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
+ const struct net_device_ops *ops;
+
+ if (!dev)
+ return -ENODEV;
+
+ ops = dev->netdev_ops;
+
+ switch (cmd) {
+ case SIOCSIFFLAGS: /* Set interface flags */
+ return dev_change_flags(dev, ifr->ifr_flags, NULL);
+
+ case SIOCSIFMETRIC: /* Set the metric on the interface
+ (currently unused) */
+ return -EOPNOTSUPP;
+
+ case SIOCSIFMTU: /* Set the MTU of a device */
+ return dev_set_mtu(dev, ifr->ifr_mtu);
+
+ case SIOCSIFHWADDR:
+ if (dev->addr_len > sizeof(ifr->ifr_hwaddr))
+ return -EINVAL;
+ return dev_set_mac_address_user(dev,
+ (struct sockaddr_storage *)&ifr->ifr_hwaddr,
+ NULL);
+
+ case SIOCSIFHWBROADCAST:
+ if (ifr->ifr_hwaddr.sa_family != dev->type)
+ return -EINVAL;
+ memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
+ min(sizeof(ifr->ifr_hwaddr.sa_data),
+ (size_t)dev->addr_len));
+ netdev_lock_ops(dev);
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+ netdev_unlock_ops(dev);
+ return 0;
+
+ case SIOCSIFMAP:
+ netdev_lock_ops(dev);
+ err = netif_setifmap(dev, ifr);
+ netdev_unlock_ops(dev);
+ return err;
+
+ case SIOCADDMULTI:
+ if (!ops->ndo_set_rx_mode ||
+ ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
+ return -EINVAL;
+ if (!netif_device_present(dev))
+ return -ENODEV;
+ netdev_lock_ops(dev);
+ err = dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
+ netdev_unlock_ops(dev);
+ return err;
+
+ case SIOCDELMULTI:
+ if (!ops->ndo_set_rx_mode ||
+ ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
+ return -EINVAL;
+ if (!netif_device_present(dev))
+ return -ENODEV;
+ netdev_lock_ops(dev);
+ err = dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
+ netdev_unlock_ops(dev);
+ return err;
+
+ case SIOCSIFTXQLEN:
+ if (ifr->ifr_qlen < 0)
+ return -EINVAL;
+ return dev_change_tx_queue_len(dev, ifr->ifr_qlen);
+
+ case SIOCSIFNAME:
+ ifr->ifr_newname[IFNAMSIZ-1] = '\0';
+ return dev_change_name(dev, ifr->ifr_newname);
+
+ case SIOCWANDEV:
+ return dev_siocwandev(dev, &ifr->ifr_settings);
+
+ case SIOCDEVPRIVATE ... SIOCDEVPRIVATE + 15:
+ return dev_siocdevprivate(dev, ifr, data, cmd);
+
+ case SIOCSHWTSTAMP:
+ return dev_set_hwtstamp(dev, ifr);
+
+ case SIOCGHWTSTAMP:
+ return dev_get_hwtstamp(dev, ifr);
+
+ case SIOCGMIIPHY:
+ case SIOCGMIIREG:
+ case SIOCSMIIREG:
+ return dev_eth_ioctl(dev, ifr, cmd);
+
+ case SIOCBONDENSLAVE:
+ case SIOCBONDRELEASE:
+ case SIOCBONDSETHWADDR:
+ case SIOCBONDSLAVEINFOQUERY:
+ case SIOCBONDINFOQUERY:
+ case SIOCBONDCHANGEACTIVE:
+ return dev_siocbond(dev, ifr, cmd);
+
+ /* Unknown ioctl */
+ default:
+ err = -EINVAL;
+ }
+ return err;
+}
+
+/**
+ * dev_load - load a network module
+ * @net: the applicable net namespace
+ * @name: name of interface
+ *
+ * If a network interface is not present and the process has suitable
+ * privileges this function loads the module. If module loading is not
+ * available in this kernel then it becomes a nop.
+ */
+
+void dev_load(struct net *net, const char *name)
+{
+ struct net_device *dev;
+ int no_module;
+
+ rcu_read_lock();
+ dev = dev_get_by_name_rcu(net, name);
+ rcu_read_unlock();
+
+ no_module = !dev;
+ if (no_module && capable(CAP_NET_ADMIN))
+ no_module = request_module("netdev-%s", name);
+ if (no_module && capable(CAP_SYS_MODULE))
+ request_module("%s", name);
+}
+EXPORT_SYMBOL(dev_load);
+
+/*
+ * This function handles all "interface"-type I/O control requests. The actual
+ * 'doing' part of this is dev_ifsioc above.
+ */
+
+/**
+ * dev_ioctl - network device ioctl
+ * @net: the applicable net namespace
+ * @cmd: command to issue
+ * @ifr: pointer to a struct ifreq in user space
+ * @data: data exchanged with userspace
+ * @need_copyout: whether or not copy_to_user() should be called
+ *
+ * Issue ioctl functions to devices. This is normally called by the
+ * user space syscall interfaces but can sometimes be useful for
+ * other purposes. The return value is the return from the syscall if
+ * positive or a negative errno code on error.
+ */
+
+int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr,
+ void __user *data, bool *need_copyout)
+{
+ int ret;
+ char *colon;
+
+ if (need_copyout)
+ *need_copyout = true;
+ if (cmd == SIOCGIFNAME)
+ return dev_ifname(net, ifr);
+
+ ifr->ifr_name[IFNAMSIZ-1] = 0;
+
+ colon = strchr(ifr->ifr_name, ':');
+ if (colon)
+ *colon = 0;
+
+ /*
+ * See which interface the caller is talking about.
+ */
+
+ switch (cmd) {
+ case SIOCGIFHWADDR:
+ dev_load(net, ifr->ifr_name);
+ ret = netif_get_mac_address(&ifr->ifr_hwaddr, net,
+ ifr->ifr_name);
+ if (colon)
+ *colon = ':';
+ return ret;
+ /*
+ * These ioctl calls:
+ * - can be done by all.
+ * - atomic and do not require locking.
+ * - return a value
+ */
+ case SIOCGIFFLAGS:
+ case SIOCGIFMETRIC:
+ case SIOCGIFMTU:
+ case SIOCGIFSLAVE:
+ case SIOCGIFMAP:
+ case SIOCGIFINDEX:
+ case SIOCGIFTXQLEN:
+ dev_load(net, ifr->ifr_name);
+ rcu_read_lock();
+ ret = dev_ifsioc_locked(net, ifr, cmd);
+ rcu_read_unlock();
+ if (colon)
+ *colon = ':';
+ return ret;
+
+ case SIOCETHTOOL:
+ dev_load(net, ifr->ifr_name);
+ ret = dev_ethtool(net, ifr, data);
+ if (colon)
+ *colon = ':';
+ return ret;
+
+ /*
+ * These ioctl calls:
+ * - require superuser power.
+ * - require strict serialization.
+ * - return a value
+ */
+ case SIOCGMIIPHY:
+ case SIOCGMIIREG:
+ case SIOCSIFNAME:
+ dev_load(net, ifr->ifr_name);
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ rtnl_net_lock(net);
+ ret = dev_ifsioc(net, ifr, data, cmd);
+ rtnl_net_unlock(net);
+
+ if (colon)
+ *colon = ':';
+ return ret;
+
+ /*
+ * These ioctl calls:
+ * - require superuser power.
+ * - require strict serialization.
+ * - do not return a value
+ */
+ case SIOCSIFMAP:
+ case SIOCSIFTXQLEN:
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+ fallthrough;
+ /*
+ * These ioctl calls:
+ * - require local superuser power.
+ * - require strict serialization.
+ * - do not return a value
+ */
+ case SIOCSIFFLAGS:
+ case SIOCSIFMETRIC:
+ case SIOCSIFMTU:
+ case SIOCSIFHWADDR:
+ case SIOCSIFSLAVE:
+ case SIOCADDMULTI:
+ case SIOCDELMULTI:
+ case SIOCSIFHWBROADCAST:
+ case SIOCSMIIREG:
+ case SIOCBONDENSLAVE:
+ case SIOCBONDRELEASE:
+ case SIOCBONDSETHWADDR:
+ case SIOCBONDCHANGEACTIVE:
+ case SIOCSHWTSTAMP:
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+ fallthrough;
+ case SIOCBONDSLAVEINFOQUERY:
+ case SIOCBONDINFOQUERY:
+ dev_load(net, ifr->ifr_name);
+
+ rtnl_net_lock(net);
+ ret = dev_ifsioc(net, ifr, data, cmd);
+ rtnl_net_unlock(net);
+
+ if (need_copyout)
+ *need_copyout = false;
+ return ret;
+
+ case SIOCGIFMEM:
+ /* Get the per device memory space. We can add this but
+ * currently do not support it */
+ case SIOCSIFMEM:
+ /* Set the per device memory buffer space.
+ * Not applicable in our case */
+ case SIOCSIFLINK:
+ return -ENOTTY;
+
+ /*
+ * Unknown or private ioctl.
+ */
+ default:
+ if (cmd == SIOCWANDEV ||
+ cmd == SIOCGHWTSTAMP ||
+ (cmd >= SIOCDEVPRIVATE &&
+ cmd <= SIOCDEVPRIVATE + 15)) {
+ dev_load(net, ifr->ifr_name);
+
+ rtnl_net_lock(net);
+ ret = dev_ifsioc(net, ifr, data, cmd);
+ rtnl_net_unlock(net);
+ return ret;
+ }
+ return -ENOTTY;
+ }
+}
diff --git a/net/core/dev_mcast.c b/net/core/dev_mcast.c
deleted file mode 100644
index b22648d04d36..000000000000
--- a/net/core/dev_mcast.c
+++ /dev/null
@@ -1,297 +0,0 @@
-/*
- * Linux NET3: Multicast List maintenance.
- *
- * Authors:
- * Tim Kordas <tjk@nostromo.eeap.cwru.edu>
- * Richard Underwood <richard@wuzz.demon.co.uk>
- *
- * Stir fried together from the IP multicast and CAP patches above
- * Alan Cox <Alan.Cox@linux.org>
- *
- * Fixes:
- * Alan Cox : Update the device on a real delete
- * rather than any time but...
- * Alan Cox : IFF_ALLMULTI support.
- * Alan Cox : New format set_multicast_list() calls.
- * Gleb Natapov : Remove dev_mc_lock.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <linux/module.h>
-#include <asm/uaccess.h>
-#include <asm/system.h>
-#include <linux/bitops.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/string.h>
-#include <linux/mm.h>
-#include <linux/socket.h>
-#include <linux/sockios.h>
-#include <linux/in.h>
-#include <linux/errno.h>
-#include <linux/interrupt.h>
-#include <linux/if_ether.h>
-#include <linux/inet.h>
-#include <linux/netdevice.h>
-#include <linux/etherdevice.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-#include <linux/init.h>
-#include <net/ip.h>
-#include <net/route.h>
-#include <linux/skbuff.h>
-#include <net/sock.h>
-#include <net/arp.h>
-
-
-/*
- * Device multicast list maintenance.
- *
- * This is used both by IP and by the user level maintenance functions.
- * Unlike BSD we maintain a usage count on a given multicast address so
- * that a casual user application can add/delete multicasts used by
- * protocols without doing damage to the protocols when it deletes the
- * entries. It also helps IP as it tracks overlapping maps.
- *
- * Device mc lists are changed by bh at least if IPv6 is enabled,
- * so that it must be bh protected.
- *
- * We block accesses to device mc filters with netif_tx_lock.
- */
-
-/*
- * Update the multicast list into the physical NIC controller.
- */
-
-static void __dev_mc_upload(struct net_device *dev)
-{
- /* Don't do anything till we up the interface
- * [dev_open will call this function so the list will
- * stay sane]
- */
-
- if (!(dev->flags&IFF_UP))
- return;
-
- /*
- * Devices with no set multicast or which have been
- * detached don't get set.
- */
-
- if (dev->set_multicast_list == NULL ||
- !netif_device_present(dev))
- return;
-
- dev->set_multicast_list(dev);
-}
-
-void dev_mc_upload(struct net_device *dev)
-{
- netif_tx_lock_bh(dev);
- __dev_mc_upload(dev);
- netif_tx_unlock_bh(dev);
-}
-
-/*
- * Delete a device level multicast
- */
-
-int dev_mc_delete(struct net_device *dev, void *addr, int alen, int glbl)
-{
- int err = 0;
- struct dev_mc_list *dmi, **dmip;
-
- netif_tx_lock_bh(dev);
-
- for (dmip = &dev->mc_list; (dmi = *dmip) != NULL; dmip = &dmi->next) {
- /*
- * Find the entry we want to delete. The device could
- * have variable length entries so check these too.
- */
- if (memcmp(dmi->dmi_addr, addr, dmi->dmi_addrlen) == 0 &&
- alen == dmi->dmi_addrlen) {
- if (glbl) {
- int old_glbl = dmi->dmi_gusers;
- dmi->dmi_gusers = 0;
- if (old_glbl == 0)
- break;
- }
- if (--dmi->dmi_users)
- goto done;
-
- /*
- * Last user. So delete the entry.
- */
- *dmip = dmi->next;
- dev->mc_count--;
-
- kfree(dmi);
-
- /*
- * We have altered the list, so the card
- * loaded filter is now wrong. Fix it
- */
- __dev_mc_upload(dev);
-
- netif_tx_unlock_bh(dev);
- return 0;
- }
- }
- err = -ENOENT;
-done:
- netif_tx_unlock_bh(dev);
- return err;
-}
-
-/*
- * Add a device level multicast
- */
-
-int dev_mc_add(struct net_device *dev, void *addr, int alen, int glbl)
-{
- int err = 0;
- struct dev_mc_list *dmi, *dmi1;
-
- dmi1 = kmalloc(sizeof(*dmi), GFP_ATOMIC);
-
- netif_tx_lock_bh(dev);
- for (dmi = dev->mc_list; dmi != NULL; dmi = dmi->next) {
- if (memcmp(dmi->dmi_addr, addr, dmi->dmi_addrlen) == 0 &&
- dmi->dmi_addrlen == alen) {
- if (glbl) {
- int old_glbl = dmi->dmi_gusers;
- dmi->dmi_gusers = 1;
- if (old_glbl)
- goto done;
- }
- dmi->dmi_users++;
- goto done;
- }
- }
-
- if ((dmi = dmi1) == NULL) {
- netif_tx_unlock_bh(dev);
- return -ENOMEM;
- }
- memcpy(dmi->dmi_addr, addr, alen);
- dmi->dmi_addrlen = alen;
- dmi->next = dev->mc_list;
- dmi->dmi_users = 1;
- dmi->dmi_gusers = glbl ? 1 : 0;
- dev->mc_list = dmi;
- dev->mc_count++;
-
- __dev_mc_upload(dev);
-
- netif_tx_unlock_bh(dev);
- return 0;
-
-done:
- netif_tx_unlock_bh(dev);
- kfree(dmi1);
- return err;
-}
-
-/*
- * Discard multicast list when a device is downed
- */
-
-void dev_mc_discard(struct net_device *dev)
-{
- netif_tx_lock_bh(dev);
-
- while (dev->mc_list != NULL) {
- struct dev_mc_list *tmp = dev->mc_list;
- dev->mc_list = tmp->next;
- if (tmp->dmi_users > tmp->dmi_gusers)
- printk("dev_mc_discard: multicast leakage! dmi_users=%d\n", tmp->dmi_users);
- kfree(tmp);
- }
- dev->mc_count = 0;
-
- netif_tx_unlock_bh(dev);
-}
-
-#ifdef CONFIG_PROC_FS
-static void *dev_mc_seq_start(struct seq_file *seq, loff_t *pos)
-{
- struct net_device *dev;
- loff_t off = 0;
-
- read_lock(&dev_base_lock);
- for (dev = dev_base; dev; dev = dev->next) {
- if (off++ == *pos)
- return dev;
- }
- return NULL;
-}
-
-static void *dev_mc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
-{
- struct net_device *dev = v;
- ++*pos;
- return dev->next;
-}
-
-static void dev_mc_seq_stop(struct seq_file *seq, void *v)
-{
- read_unlock(&dev_base_lock);
-}
-
-
-static int dev_mc_seq_show(struct seq_file *seq, void *v)
-{
- struct dev_mc_list *m;
- struct net_device *dev = v;
-
- netif_tx_lock_bh(dev);
- for (m = dev->mc_list; m; m = m->next) {
- int i;
-
- seq_printf(seq, "%-4d %-15s %-5d %-5d ", dev->ifindex,
- dev->name, m->dmi_users, m->dmi_gusers);
-
- for (i = 0; i < m->dmi_addrlen; i++)
- seq_printf(seq, "%02x", m->dmi_addr[i]);
-
- seq_putc(seq, '\n');
- }
- netif_tx_unlock_bh(dev);
- return 0;
-}
-
-static struct seq_operations dev_mc_seq_ops = {
- .start = dev_mc_seq_start,
- .next = dev_mc_seq_next,
- .stop = dev_mc_seq_stop,
- .show = dev_mc_seq_show,
-};
-
-static int dev_mc_seq_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &dev_mc_seq_ops);
-}
-
-static struct file_operations dev_mc_seq_fops = {
- .owner = THIS_MODULE,
- .open = dev_mc_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
-#endif
-
-void __init dev_mcast_init(void)
-{
- proc_net_fops_create("dev_mcast", 0, &dev_mc_seq_fops);
-}
-
-EXPORT_SYMBOL(dev_mc_add);
-EXPORT_SYMBOL(dev_mc_delete);
-EXPORT_SYMBOL(dev_mc_upload);
diff --git a/net/core/devmem.c b/net/core/devmem.c
new file mode 100644
index 000000000000..ec4217d6c0b4
--- /dev/null
+++ b/net/core/devmem.c
@@ -0,0 +1,522 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Devmem TCP
+ *
+ * Authors: Mina Almasry <almasrymina@google.com>
+ * Willem de Bruijn <willemdebruijn.kernel@gmail.com>
+ * Kaiyuan Zhang <kaiyuanz@google.com
+ */
+
+#include <linux/dma-buf.h>
+#include <linux/genalloc.h>
+#include <linux/mm.h>
+#include <linux/netdevice.h>
+#include <linux/types.h>
+#include <net/netdev_queues.h>
+#include <net/netdev_rx_queue.h>
+#include <net/page_pool/helpers.h>
+#include <net/page_pool/memory_provider.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <trace/events/page_pool.h>
+
+#include "devmem.h"
+#include "mp_dmabuf_devmem.h"
+#include "page_pool_priv.h"
+
+/* Device memory support */
+
+static DEFINE_XARRAY_FLAGS(net_devmem_dmabuf_bindings, XA_FLAGS_ALLOC1);
+
+static const struct memory_provider_ops dmabuf_devmem_ops;
+
+bool net_is_devmem_iov(struct net_iov *niov)
+{
+ return niov->type == NET_IOV_DMABUF;
+}
+
+static void net_devmem_dmabuf_free_chunk_owner(struct gen_pool *genpool,
+ struct gen_pool_chunk *chunk,
+ void *not_used)
+{
+ struct dmabuf_genpool_chunk_owner *owner = chunk->owner;
+
+ kvfree(owner->area.niovs);
+ kfree(owner);
+}
+
+static dma_addr_t net_devmem_get_dma_addr(const struct net_iov *niov)
+{
+ struct dmabuf_genpool_chunk_owner *owner;
+
+ owner = net_devmem_iov_to_chunk_owner(niov);
+ return owner->base_dma_addr +
+ ((dma_addr_t)net_iov_idx(niov) << PAGE_SHIFT);
+}
+
+void __net_devmem_dmabuf_binding_free(struct work_struct *wq)
+{
+ struct net_devmem_dmabuf_binding *binding = container_of(wq, typeof(*binding), unbind_w);
+
+ size_t size, avail;
+
+ gen_pool_for_each_chunk(binding->chunk_pool,
+ net_devmem_dmabuf_free_chunk_owner, NULL);
+
+ size = gen_pool_size(binding->chunk_pool);
+ avail = gen_pool_avail(binding->chunk_pool);
+
+ if (!WARN(size != avail, "can't destroy genpool. size=%zu, avail=%zu",
+ size, avail))
+ gen_pool_destroy(binding->chunk_pool);
+
+ dma_buf_unmap_attachment_unlocked(binding->attachment, binding->sgt,
+ binding->direction);
+ dma_buf_detach(binding->dmabuf, binding->attachment);
+ dma_buf_put(binding->dmabuf);
+ xa_destroy(&binding->bound_rxqs);
+ kvfree(binding->tx_vec);
+ kfree(binding);
+}
+
+struct net_iov *
+net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding)
+{
+ struct dmabuf_genpool_chunk_owner *owner;
+ unsigned long dma_addr;
+ struct net_iov *niov;
+ ssize_t offset;
+ ssize_t index;
+
+ dma_addr = gen_pool_alloc_owner(binding->chunk_pool, PAGE_SIZE,
+ (void **)&owner);
+ if (!dma_addr)
+ return NULL;
+
+ offset = dma_addr - owner->base_dma_addr;
+ index = offset / PAGE_SIZE;
+ niov = &owner->area.niovs[index];
+
+ niov->desc.pp_magic = 0;
+ niov->desc.pp = NULL;
+ atomic_long_set(&niov->desc.pp_ref_count, 0);
+
+ return niov;
+}
+
+void net_devmem_free_dmabuf(struct net_iov *niov)
+{
+ struct net_devmem_dmabuf_binding *binding = net_devmem_iov_binding(niov);
+ unsigned long dma_addr = net_devmem_get_dma_addr(niov);
+
+ if (WARN_ON(!gen_pool_has_addr(binding->chunk_pool, dma_addr,
+ PAGE_SIZE)))
+ return;
+
+ gen_pool_free(binding->chunk_pool, dma_addr, PAGE_SIZE);
+}
+
+void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding)
+{
+ struct netdev_rx_queue *rxq;
+ unsigned long xa_idx;
+ unsigned int rxq_idx;
+
+ xa_erase(&net_devmem_dmabuf_bindings, binding->id);
+
+ /* Ensure no tx net_devmem_lookup_dmabuf() are in flight after the
+ * erase.
+ */
+ synchronize_net();
+
+ if (binding->list.next)
+ list_del(&binding->list);
+
+ xa_for_each(&binding->bound_rxqs, xa_idx, rxq) {
+ const struct pp_memory_provider_params mp_params = {
+ .mp_priv = binding,
+ .mp_ops = &dmabuf_devmem_ops,
+ };
+
+ rxq_idx = get_netdev_rx_queue_index(rxq);
+
+ __net_mp_close_rxq(binding->dev, rxq_idx, &mp_params);
+ }
+
+ net_devmem_dmabuf_binding_put(binding);
+}
+
+int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx,
+ struct net_devmem_dmabuf_binding *binding,
+ struct netlink_ext_ack *extack)
+{
+ struct pp_memory_provider_params mp_params = {
+ .mp_priv = binding,
+ .mp_ops = &dmabuf_devmem_ops,
+ };
+ struct netdev_rx_queue *rxq;
+ u32 xa_idx;
+ int err;
+
+ err = __net_mp_open_rxq(dev, rxq_idx, &mp_params, extack);
+ if (err)
+ return err;
+
+ rxq = __netif_get_rx_queue(dev, rxq_idx);
+ err = xa_alloc(&binding->bound_rxqs, &xa_idx, rxq, xa_limit_32b,
+ GFP_KERNEL);
+ if (err)
+ goto err_close_rxq;
+
+ return 0;
+
+err_close_rxq:
+ __net_mp_close_rxq(dev, rxq_idx, &mp_params);
+ return err;
+}
+
+struct net_devmem_dmabuf_binding *
+net_devmem_bind_dmabuf(struct net_device *dev,
+ struct device *dma_dev,
+ enum dma_data_direction direction,
+ unsigned int dmabuf_fd, struct netdev_nl_sock *priv,
+ struct netlink_ext_ack *extack)
+{
+ struct net_devmem_dmabuf_binding *binding;
+ static u32 id_alloc_next;
+ struct scatterlist *sg;
+ struct dma_buf *dmabuf;
+ unsigned int sg_idx, i;
+ unsigned long virtual;
+ int err;
+
+ if (!dma_dev) {
+ NL_SET_ERR_MSG(extack, "Device doesn't support DMA");
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+
+ dmabuf = dma_buf_get(dmabuf_fd);
+ if (IS_ERR(dmabuf))
+ return ERR_CAST(dmabuf);
+
+ binding = kzalloc_node(sizeof(*binding), GFP_KERNEL,
+ dev_to_node(&dev->dev));
+ if (!binding) {
+ err = -ENOMEM;
+ goto err_put_dmabuf;
+ }
+
+ binding->dev = dev;
+ xa_init_flags(&binding->bound_rxqs, XA_FLAGS_ALLOC);
+
+ refcount_set(&binding->ref, 1);
+
+ mutex_init(&binding->lock);
+
+ binding->dmabuf = dmabuf;
+ binding->direction = direction;
+
+ binding->attachment = dma_buf_attach(binding->dmabuf, dma_dev);
+ if (IS_ERR(binding->attachment)) {
+ err = PTR_ERR(binding->attachment);
+ NL_SET_ERR_MSG(extack, "Failed to bind dmabuf to device");
+ goto err_free_binding;
+ }
+
+ binding->sgt = dma_buf_map_attachment_unlocked(binding->attachment,
+ direction);
+ if (IS_ERR(binding->sgt)) {
+ err = PTR_ERR(binding->sgt);
+ NL_SET_ERR_MSG(extack, "Failed to map dmabuf attachment");
+ goto err_detach;
+ }
+
+ if (direction == DMA_TO_DEVICE) {
+ binding->tx_vec = kvmalloc_array(dmabuf->size / PAGE_SIZE,
+ sizeof(struct net_iov *),
+ GFP_KERNEL);
+ if (!binding->tx_vec) {
+ err = -ENOMEM;
+ goto err_unmap;
+ }
+ }
+
+ /* For simplicity we expect to make PAGE_SIZE allocations, but the
+ * binding can be much more flexible than that. We may be able to
+ * allocate MTU sized chunks here. Leave that for future work...
+ */
+ binding->chunk_pool = gen_pool_create(PAGE_SHIFT,
+ dev_to_node(&dev->dev));
+ if (!binding->chunk_pool) {
+ err = -ENOMEM;
+ goto err_tx_vec;
+ }
+
+ virtual = 0;
+ for_each_sgtable_dma_sg(binding->sgt, sg, sg_idx) {
+ dma_addr_t dma_addr = sg_dma_address(sg);
+ struct dmabuf_genpool_chunk_owner *owner;
+ size_t len = sg_dma_len(sg);
+ struct net_iov *niov;
+
+ owner = kzalloc_node(sizeof(*owner), GFP_KERNEL,
+ dev_to_node(&dev->dev));
+ if (!owner) {
+ err = -ENOMEM;
+ goto err_free_chunks;
+ }
+
+ owner->area.base_virtual = virtual;
+ owner->base_dma_addr = dma_addr;
+ owner->area.num_niovs = len / PAGE_SIZE;
+ owner->binding = binding;
+
+ err = gen_pool_add_owner(binding->chunk_pool, dma_addr,
+ dma_addr, len, dev_to_node(&dev->dev),
+ owner);
+ if (err) {
+ kfree(owner);
+ err = -EINVAL;
+ goto err_free_chunks;
+ }
+
+ owner->area.niovs = kvmalloc_array(owner->area.num_niovs,
+ sizeof(*owner->area.niovs),
+ GFP_KERNEL);
+ if (!owner->area.niovs) {
+ err = -ENOMEM;
+ goto err_free_chunks;
+ }
+
+ for (i = 0; i < owner->area.num_niovs; i++) {
+ niov = &owner->area.niovs[i];
+ niov->type = NET_IOV_DMABUF;
+ niov->owner = &owner->area;
+ page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov),
+ net_devmem_get_dma_addr(niov));
+ if (direction == DMA_TO_DEVICE)
+ binding->tx_vec[owner->area.base_virtual / PAGE_SIZE + i] = niov;
+ }
+
+ virtual += len;
+ }
+
+ err = xa_alloc_cyclic(&net_devmem_dmabuf_bindings, &binding->id,
+ binding, xa_limit_32b, &id_alloc_next,
+ GFP_KERNEL);
+ if (err < 0)
+ goto err_free_chunks;
+
+ list_add(&binding->list, &priv->bindings);
+
+ return binding;
+
+err_free_chunks:
+ gen_pool_for_each_chunk(binding->chunk_pool,
+ net_devmem_dmabuf_free_chunk_owner, NULL);
+ gen_pool_destroy(binding->chunk_pool);
+err_tx_vec:
+ kvfree(binding->tx_vec);
+err_unmap:
+ dma_buf_unmap_attachment_unlocked(binding->attachment, binding->sgt,
+ direction);
+err_detach:
+ dma_buf_detach(dmabuf, binding->attachment);
+err_free_binding:
+ kfree(binding);
+err_put_dmabuf:
+ dma_buf_put(dmabuf);
+ return ERR_PTR(err);
+}
+
+struct net_devmem_dmabuf_binding *net_devmem_lookup_dmabuf(u32 id)
+{
+ struct net_devmem_dmabuf_binding *binding;
+
+ rcu_read_lock();
+ binding = xa_load(&net_devmem_dmabuf_bindings, id);
+ if (binding) {
+ if (!net_devmem_dmabuf_binding_get(binding))
+ binding = NULL;
+ }
+ rcu_read_unlock();
+
+ return binding;
+}
+
+void net_devmem_get_net_iov(struct net_iov *niov)
+{
+ net_devmem_dmabuf_binding_get(net_devmem_iov_binding(niov));
+}
+
+void net_devmem_put_net_iov(struct net_iov *niov)
+{
+ net_devmem_dmabuf_binding_put(net_devmem_iov_binding(niov));
+}
+
+struct net_devmem_dmabuf_binding *net_devmem_get_binding(struct sock *sk,
+ unsigned int dmabuf_id)
+{
+ struct net_devmem_dmabuf_binding *binding;
+ struct net_device *dst_dev;
+ struct dst_entry *dst;
+ int err = 0;
+
+ binding = net_devmem_lookup_dmabuf(dmabuf_id);
+ if (!binding || !binding->tx_vec) {
+ err = -EINVAL;
+ goto out_err;
+ }
+
+ rcu_read_lock();
+ dst = __sk_dst_get(sk);
+ /* If dst is NULL (route expired), attempt to rebuild it. */
+ if (unlikely(!dst)) {
+ if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) {
+ err = -EHOSTUNREACH;
+ goto out_unlock;
+ }
+ dst = __sk_dst_get(sk);
+ if (unlikely(!dst)) {
+ err = -ENODEV;
+ goto out_unlock;
+ }
+ }
+
+ /* The dma-addrs in this binding are only reachable to the corresponding
+ * net_device.
+ */
+ dst_dev = dst_dev_rcu(dst);
+ if (unlikely(!dst_dev) || unlikely(dst_dev != binding->dev)) {
+ err = -ENODEV;
+ goto out_unlock;
+ }
+
+ rcu_read_unlock();
+ return binding;
+
+out_unlock:
+ rcu_read_unlock();
+out_err:
+ if (binding)
+ net_devmem_dmabuf_binding_put(binding);
+
+ return ERR_PTR(err);
+}
+
+struct net_iov *
+net_devmem_get_niov_at(struct net_devmem_dmabuf_binding *binding,
+ size_t virt_addr, size_t *off, size_t *size)
+{
+ if (virt_addr >= binding->dmabuf->size)
+ return NULL;
+
+ *off = virt_addr % PAGE_SIZE;
+ *size = PAGE_SIZE - *off;
+
+ return binding->tx_vec[virt_addr / PAGE_SIZE];
+}
+
+/*** "Dmabuf devmem memory provider" ***/
+
+int mp_dmabuf_devmem_init(struct page_pool *pool)
+{
+ struct net_devmem_dmabuf_binding *binding = pool->mp_priv;
+
+ if (!binding)
+ return -EINVAL;
+
+ /* dma-buf dma addresses do not need and should not be used with
+ * dma_sync_for_cpu/device. Force disable dma_sync.
+ */
+ pool->dma_sync = false;
+ pool->dma_sync_for_cpu = false;
+
+ if (pool->p.order != 0)
+ return -E2BIG;
+
+ net_devmem_dmabuf_binding_get(binding);
+ return 0;
+}
+
+netmem_ref mp_dmabuf_devmem_alloc_netmems(struct page_pool *pool, gfp_t gfp)
+{
+ struct net_devmem_dmabuf_binding *binding = pool->mp_priv;
+ struct net_iov *niov;
+ netmem_ref netmem;
+
+ niov = net_devmem_alloc_dmabuf(binding);
+ if (!niov)
+ return 0;
+
+ netmem = net_iov_to_netmem(niov);
+
+ page_pool_set_pp_info(pool, netmem);
+
+ pool->pages_state_hold_cnt++;
+ trace_page_pool_state_hold(pool, netmem, pool->pages_state_hold_cnt);
+ return netmem;
+}
+
+void mp_dmabuf_devmem_destroy(struct page_pool *pool)
+{
+ struct net_devmem_dmabuf_binding *binding = pool->mp_priv;
+
+ net_devmem_dmabuf_binding_put(binding);
+}
+
+bool mp_dmabuf_devmem_release_page(struct page_pool *pool, netmem_ref netmem)
+{
+ long refcount = atomic_long_read(netmem_get_pp_ref_count_ref(netmem));
+
+ if (WARN_ON_ONCE(!netmem_is_net_iov(netmem)))
+ return false;
+
+ if (WARN_ON_ONCE(refcount != 1))
+ return false;
+
+ page_pool_clear_pp_info(netmem);
+
+ net_devmem_free_dmabuf(netmem_to_net_iov(netmem));
+
+ /* We don't want the page pool put_page()ing our net_iovs. */
+ return false;
+}
+
+static int mp_dmabuf_devmem_nl_fill(void *mp_priv, struct sk_buff *rsp,
+ struct netdev_rx_queue *rxq)
+{
+ const struct net_devmem_dmabuf_binding *binding = mp_priv;
+ int type = rxq ? NETDEV_A_QUEUE_DMABUF : NETDEV_A_PAGE_POOL_DMABUF;
+
+ return nla_put_u32(rsp, type, binding->id);
+}
+
+static void mp_dmabuf_devmem_uninstall(void *mp_priv,
+ struct netdev_rx_queue *rxq)
+{
+ struct net_devmem_dmabuf_binding *binding = mp_priv;
+ struct netdev_rx_queue *bound_rxq;
+ unsigned long xa_idx;
+
+ xa_for_each(&binding->bound_rxqs, xa_idx, bound_rxq) {
+ if (bound_rxq == rxq) {
+ xa_erase(&binding->bound_rxqs, xa_idx);
+ if (xa_empty(&binding->bound_rxqs)) {
+ mutex_lock(&binding->lock);
+ binding->dev = NULL;
+ mutex_unlock(&binding->lock);
+ }
+ break;
+ }
+ }
+}
+
+static const struct memory_provider_ops dmabuf_devmem_ops = {
+ .init = mp_dmabuf_devmem_init,
+ .destroy = mp_dmabuf_devmem_destroy,
+ .alloc_netmems = mp_dmabuf_devmem_alloc_netmems,
+ .release_netmem = mp_dmabuf_devmem_release_page,
+ .nl_fill = mp_dmabuf_devmem_nl_fill,
+ .uninstall = mp_dmabuf_devmem_uninstall,
+};
diff --git a/net/core/devmem.h b/net/core/devmem.h
new file mode 100644
index 000000000000..0b43a648cd2e
--- /dev/null
+++ b/net/core/devmem.h
@@ -0,0 +1,246 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Device memory TCP support
+ *
+ * Authors: Mina Almasry <almasrymina@google.com>
+ * Willem de Bruijn <willemb@google.com>
+ * Kaiyuan Zhang <kaiyuanz@google.com>
+ *
+ */
+#ifndef _NET_DEVMEM_H
+#define _NET_DEVMEM_H
+
+#include <net/netmem.h>
+#include <net/netdev_netlink.h>
+
+struct netlink_ext_ack;
+
+struct net_devmem_dmabuf_binding {
+ struct dma_buf *dmabuf;
+ struct dma_buf_attachment *attachment;
+ struct sg_table *sgt;
+ struct net_device *dev;
+ struct gen_pool *chunk_pool;
+ /* Protect dev */
+ struct mutex lock;
+
+ /* The user holds a ref (via the netlink API) for as long as they want
+ * the binding to remain alive. Each page pool using this binding holds
+ * a ref to keep the binding alive. The page_pool does not release the
+ * ref until all the net_iovs allocated from this binding are released
+ * back to the page_pool.
+ *
+ * The binding undos itself and unmaps the underlying dmabuf once all
+ * those refs are dropped and the binding is no longer desired or in
+ * use.
+ *
+ * net_devmem_get_net_iov() on dmabuf net_iovs will increment this
+ * reference, making sure that the binding remains alive until all the
+ * net_iovs are no longer used. net_iovs allocated from this binding
+ * that are stuck in the TX path for any reason (such as awaiting
+ * retransmits) hold a reference to the binding until the skb holding
+ * them is freed.
+ */
+ refcount_t ref;
+
+ /* The list of bindings currently active. Used for netlink to notify us
+ * of the user dropping the bind.
+ */
+ struct list_head list;
+
+ /* rxq's this binding is active on. */
+ struct xarray bound_rxqs;
+
+ /* ID of this binding. Globally unique to all bindings currently
+ * active.
+ */
+ u32 id;
+
+ /* DMA direction, FROM_DEVICE for Rx binding, TO_DEVICE for Tx. */
+ enum dma_data_direction direction;
+
+ /* Array of net_iov pointers for this binding, sorted by virtual
+ * address. This array is convenient to map the virtual addresses to
+ * net_iovs in the TX path.
+ */
+ struct net_iov **tx_vec;
+
+ struct work_struct unbind_w;
+};
+
+#if defined(CONFIG_NET_DEVMEM)
+/* Owner of the dma-buf chunks inserted into the gen pool. Each scatterlist
+ * entry from the dmabuf is inserted into the genpool as a chunk, and needs
+ * this owner struct to keep track of some metadata necessary to create
+ * allocations from this chunk.
+ */
+struct dmabuf_genpool_chunk_owner {
+ struct net_iov_area area;
+ struct net_devmem_dmabuf_binding *binding;
+
+ /* dma_addr of the start of the chunk. */
+ dma_addr_t base_dma_addr;
+};
+
+void __net_devmem_dmabuf_binding_free(struct work_struct *wq);
+struct net_devmem_dmabuf_binding *
+net_devmem_bind_dmabuf(struct net_device *dev,
+ struct device *dma_dev,
+ enum dma_data_direction direction,
+ unsigned int dmabuf_fd, struct netdev_nl_sock *priv,
+ struct netlink_ext_ack *extack);
+struct net_devmem_dmabuf_binding *net_devmem_lookup_dmabuf(u32 id);
+void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding);
+int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx,
+ struct net_devmem_dmabuf_binding *binding,
+ struct netlink_ext_ack *extack);
+
+static inline struct dmabuf_genpool_chunk_owner *
+net_devmem_iov_to_chunk_owner(const struct net_iov *niov)
+{
+ struct net_iov_area *owner = net_iov_owner(niov);
+
+ return container_of(owner, struct dmabuf_genpool_chunk_owner, area);
+}
+
+static inline struct net_devmem_dmabuf_binding *
+net_devmem_iov_binding(const struct net_iov *niov)
+{
+ return net_devmem_iov_to_chunk_owner(niov)->binding;
+}
+
+static inline u32 net_devmem_iov_binding_id(const struct net_iov *niov)
+{
+ return net_devmem_iov_binding(niov)->id;
+}
+
+static inline unsigned long net_iov_virtual_addr(const struct net_iov *niov)
+{
+ struct net_iov_area *owner = net_iov_owner(niov);
+
+ return owner->base_virtual +
+ ((unsigned long)net_iov_idx(niov) << PAGE_SHIFT);
+}
+
+static inline bool
+net_devmem_dmabuf_binding_get(struct net_devmem_dmabuf_binding *binding)
+{
+ return refcount_inc_not_zero(&binding->ref);
+}
+
+static inline void
+net_devmem_dmabuf_binding_put(struct net_devmem_dmabuf_binding *binding)
+{
+ if (!refcount_dec_and_test(&binding->ref))
+ return;
+
+ INIT_WORK(&binding->unbind_w, __net_devmem_dmabuf_binding_free);
+ schedule_work(&binding->unbind_w);
+}
+
+void net_devmem_get_net_iov(struct net_iov *niov);
+void net_devmem_put_net_iov(struct net_iov *niov);
+
+struct net_iov *
+net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding);
+void net_devmem_free_dmabuf(struct net_iov *ppiov);
+
+bool net_is_devmem_iov(struct net_iov *niov);
+struct net_devmem_dmabuf_binding *
+net_devmem_get_binding(struct sock *sk, unsigned int dmabuf_id);
+struct net_iov *
+net_devmem_get_niov_at(struct net_devmem_dmabuf_binding *binding, size_t addr,
+ size_t *off, size_t *size);
+
+#else
+struct net_devmem_dmabuf_binding;
+
+static inline void
+net_devmem_dmabuf_binding_put(struct net_devmem_dmabuf_binding *binding)
+{
+}
+
+static inline void net_devmem_get_net_iov(struct net_iov *niov)
+{
+}
+
+static inline void net_devmem_put_net_iov(struct net_iov *niov)
+{
+}
+
+static inline struct net_devmem_dmabuf_binding *
+net_devmem_bind_dmabuf(struct net_device *dev,
+ struct device *dma_dev,
+ enum dma_data_direction direction,
+ unsigned int dmabuf_fd,
+ struct netdev_nl_sock *priv,
+ struct netlink_ext_ack *extack)
+{
+ return ERR_PTR(-EOPNOTSUPP);
+}
+
+static inline struct net_devmem_dmabuf_binding *net_devmem_lookup_dmabuf(u32 id)
+{
+ return NULL;
+}
+
+static inline void
+net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding)
+{
+}
+
+static inline int
+net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx,
+ struct net_devmem_dmabuf_binding *binding,
+ struct netlink_ext_ack *extack)
+
+{
+ return -EOPNOTSUPP;
+}
+
+static inline struct net_iov *
+net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding)
+{
+ return NULL;
+}
+
+static inline void net_devmem_free_dmabuf(struct net_iov *ppiov)
+{
+}
+
+static inline unsigned long net_iov_virtual_addr(const struct net_iov *niov)
+{
+ return 0;
+}
+
+static inline u32 net_devmem_iov_binding_id(const struct net_iov *niov)
+{
+ return 0;
+}
+
+static inline bool net_is_devmem_iov(struct net_iov *niov)
+{
+ return false;
+}
+
+static inline struct net_devmem_dmabuf_binding *
+net_devmem_get_binding(struct sock *sk, unsigned int dmabuf_id)
+{
+ return ERR_PTR(-EOPNOTSUPP);
+}
+
+static inline struct net_iov *
+net_devmem_get_niov_at(struct net_devmem_dmabuf_binding *binding, size_t addr,
+ size_t *off, size_t *size)
+{
+ return NULL;
+}
+
+static inline struct net_devmem_dmabuf_binding *
+net_devmem_iov_binding(const struct net_iov *niov)
+{
+ return NULL;
+}
+#endif
+
+#endif /* _NET_DEVMEM_H */
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
new file mode 100644
index 000000000000..60d31c2feed3
--- /dev/null
+++ b/net/core/drop_monitor.c
@@ -0,0 +1,1789 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Monitoring code for network dropped packet alerts
+ *
+ * Copyright (C) 2009 Neil Horman <nhorman@tuxdriver.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/string.h>
+#include <linux/if_arp.h>
+#include <linux/inetdevice.h>
+#include <linux/inet.h>
+#include <linux/interrupt.h>
+#include <linux/netpoll.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+#include <linux/netlink.h>
+#include <linux/net_dropmon.h>
+#include <linux/bitfield.h>
+#include <linux/percpu.h>
+#include <linux/timer.h>
+#include <linux/bitops.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <net/genetlink.h>
+#include <net/netevent.h>
+#include <net/flow_offload.h>
+#include <net/dropreason.h>
+#include <net/devlink.h>
+
+#include <trace/events/skb.h>
+#include <trace/events/napi.h>
+#include <trace/events/devlink.h>
+
+#include <linux/unaligned.h>
+
+#define TRACE_ON 1
+#define TRACE_OFF 0
+
+/*
+ * Globals, our netlink socket pointer
+ * and the work handle that will send up
+ * netlink alerts
+ */
+static int trace_state = TRACE_OFF;
+static bool monitor_hw;
+
+/* net_dm_mutex
+ *
+ * An overall lock guarding every operation coming from userspace.
+ */
+static DEFINE_MUTEX(net_dm_mutex);
+
+struct net_dm_stats {
+ u64_stats_t dropped;
+ struct u64_stats_sync syncp;
+};
+
+#define NET_DM_MAX_HW_TRAP_NAME_LEN 40
+
+struct net_dm_hw_entry {
+ char trap_name[NET_DM_MAX_HW_TRAP_NAME_LEN];
+ u32 count;
+};
+
+struct net_dm_hw_entries {
+ u32 num_entries;
+ struct net_dm_hw_entry entries[];
+};
+
+struct per_cpu_dm_data {
+ raw_spinlock_t lock; /* Protects 'skb', 'hw_entries' and
+ * 'send_timer'
+ */
+ union {
+ struct sk_buff *skb;
+ struct net_dm_hw_entries *hw_entries;
+ };
+ struct sk_buff_head drop_queue;
+ struct work_struct dm_alert_work;
+ struct timer_list send_timer;
+ struct net_dm_stats stats;
+};
+
+struct dm_hw_stat_delta {
+ unsigned long last_rx;
+ unsigned long last_drop_val;
+ struct rcu_head rcu;
+};
+
+static struct genl_family net_drop_monitor_family;
+
+static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data);
+static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_hw_cpu_data);
+
+static int dm_hit_limit = 64;
+static int dm_delay = 1;
+static unsigned long dm_hw_check_delta = 2*HZ;
+
+static enum net_dm_alert_mode net_dm_alert_mode = NET_DM_ALERT_MODE_SUMMARY;
+static u32 net_dm_trunc_len;
+static u32 net_dm_queue_len = 1000;
+
+struct net_dm_alert_ops {
+ void (*kfree_skb_probe)(void *ignore, struct sk_buff *skb,
+ void *location,
+ enum skb_drop_reason reason,
+ struct sock *rx_sk);
+ void (*napi_poll_probe)(void *ignore, struct napi_struct *napi,
+ int work, int budget);
+ void (*work_item_func)(struct work_struct *work);
+ void (*hw_work_item_func)(struct work_struct *work);
+ void (*hw_trap_probe)(void *ignore, const struct devlink *devlink,
+ struct sk_buff *skb,
+ const struct devlink_trap_metadata *metadata);
+};
+
+struct net_dm_skb_cb {
+ union {
+ struct devlink_trap_metadata *hw_metadata;
+ void *pc;
+ };
+ enum skb_drop_reason reason;
+};
+
+#define NET_DM_SKB_CB(__skb) ((struct net_dm_skb_cb *)&((__skb)->cb[0]))
+
+static struct sk_buff *reset_per_cpu_data(struct per_cpu_dm_data *data)
+{
+ size_t al;
+ struct net_dm_alert_msg *msg;
+ struct nlattr *nla;
+ struct sk_buff *skb;
+ unsigned long flags;
+ void *msg_header;
+
+ al = sizeof(struct net_dm_alert_msg);
+ al += dm_hit_limit * sizeof(struct net_dm_drop_point);
+ al += sizeof(struct nlattr);
+
+ skb = genlmsg_new(al, GFP_KERNEL);
+
+ if (!skb)
+ goto err;
+
+ msg_header = genlmsg_put(skb, 0, 0, &net_drop_monitor_family,
+ 0, NET_DM_CMD_ALERT);
+ if (!msg_header) {
+ nlmsg_free(skb);
+ skb = NULL;
+ goto err;
+ }
+ nla = nla_reserve(skb, NLA_UNSPEC,
+ sizeof(struct net_dm_alert_msg));
+ if (!nla) {
+ nlmsg_free(skb);
+ skb = NULL;
+ goto err;
+ }
+ msg = nla_data(nla);
+ memset(msg, 0, al);
+ goto out;
+
+err:
+ mod_timer(&data->send_timer, jiffies + HZ / 10);
+out:
+ raw_spin_lock_irqsave(&data->lock, flags);
+ swap(data->skb, skb);
+ raw_spin_unlock_irqrestore(&data->lock, flags);
+
+ if (skb) {
+ struct nlmsghdr *nlh = (struct nlmsghdr *)skb->data;
+ struct genlmsghdr *gnlh = (struct genlmsghdr *)nlmsg_data(nlh);
+
+ genlmsg_end(skb, genlmsg_data(gnlh));
+ }
+
+ return skb;
+}
+
+static const struct genl_multicast_group dropmon_mcgrps[] = {
+ { .name = "events", .flags = GENL_MCAST_CAP_SYS_ADMIN, },
+};
+
+static void send_dm_alert(struct work_struct *work)
+{
+ struct sk_buff *skb;
+ struct per_cpu_dm_data *data;
+
+ data = container_of(work, struct per_cpu_dm_data, dm_alert_work);
+
+ skb = reset_per_cpu_data(data);
+
+ if (skb)
+ genlmsg_multicast(&net_drop_monitor_family, skb, 0,
+ 0, GFP_KERNEL);
+}
+
+/*
+ * This is the timer function to delay the sending of an alert
+ * in the event that more drops will arrive during the
+ * hysteresis period.
+ */
+static void sched_send_work(struct timer_list *t)
+{
+ struct per_cpu_dm_data *data = timer_container_of(data, t, send_timer);
+
+ schedule_work(&data->dm_alert_work);
+}
+
+static void trace_drop_common(struct sk_buff *skb, void *location)
+{
+ struct net_dm_alert_msg *msg;
+ struct net_dm_drop_point *point;
+ struct nlmsghdr *nlh;
+ struct nlattr *nla;
+ int i;
+ struct sk_buff *dskb;
+ struct per_cpu_dm_data *data;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ data = this_cpu_ptr(&dm_cpu_data);
+ raw_spin_lock(&data->lock);
+ dskb = data->skb;
+
+ if (!dskb)
+ goto out;
+
+ nlh = (struct nlmsghdr *)dskb->data;
+ nla = genlmsg_data(nlmsg_data(nlh));
+ msg = nla_data(nla);
+ point = msg->points;
+ for (i = 0; i < msg->entries; i++) {
+ if (!memcmp(&location, &point->pc, sizeof(void *))) {
+ point->count++;
+ goto out;
+ }
+ point++;
+ }
+ if (msg->entries == dm_hit_limit)
+ goto out;
+ /*
+ * We need to create a new entry
+ */
+ __nla_reserve_nohdr(dskb, sizeof(struct net_dm_drop_point));
+ nla->nla_len += NLA_ALIGN(sizeof(struct net_dm_drop_point));
+ memcpy(point->pc, &location, sizeof(void *));
+ point->count = 1;
+ msg->entries++;
+
+ if (!timer_pending(&data->send_timer)) {
+ data->send_timer.expires = jiffies + dm_delay * HZ;
+ add_timer(&data->send_timer);
+ }
+
+out:
+ raw_spin_unlock_irqrestore(&data->lock, flags);
+}
+
+static void trace_kfree_skb_hit(void *ignore, struct sk_buff *skb,
+ void *location,
+ enum skb_drop_reason reason,
+ struct sock *rx_sk)
+{
+ trace_drop_common(skb, location);
+}
+
+static void trace_napi_poll_hit(void *ignore, struct napi_struct *napi,
+ int work, int budget)
+{
+ struct net_device *dev = napi->dev;
+ struct dm_hw_stat_delta *stat;
+ /*
+ * Don't check napi structures with no associated device
+ */
+ if (!dev)
+ return;
+
+ rcu_read_lock();
+ stat = rcu_dereference(dev->dm_private);
+ if (stat) {
+ /*
+ * only add a note to our monitor buffer if:
+ * 1) its after the last_rx delta
+ * 2) our rx_dropped count has gone up
+ */
+ if (time_after(jiffies, stat->last_rx + dm_hw_check_delta) &&
+ (dev->stats.rx_dropped != stat->last_drop_val)) {
+ trace_drop_common(NULL, NULL);
+ stat->last_drop_val = dev->stats.rx_dropped;
+ stat->last_rx = jiffies;
+ }
+ }
+ rcu_read_unlock();
+}
+
+static struct net_dm_hw_entries *
+net_dm_hw_reset_per_cpu_data(struct per_cpu_dm_data *hw_data)
+{
+ struct net_dm_hw_entries *hw_entries;
+ unsigned long flags;
+
+ hw_entries = kzalloc(struct_size(hw_entries, entries, dm_hit_limit),
+ GFP_KERNEL);
+ if (!hw_entries) {
+ /* If the memory allocation failed, we try to perform another
+ * allocation in 1/10 second. Otherwise, the probe function
+ * will constantly bail out.
+ */
+ mod_timer(&hw_data->send_timer, jiffies + HZ / 10);
+ }
+
+ raw_spin_lock_irqsave(&hw_data->lock, flags);
+ swap(hw_data->hw_entries, hw_entries);
+ raw_spin_unlock_irqrestore(&hw_data->lock, flags);
+
+ return hw_entries;
+}
+
+static int net_dm_hw_entry_put(struct sk_buff *msg,
+ const struct net_dm_hw_entry *hw_entry)
+{
+ struct nlattr *attr;
+
+ attr = nla_nest_start(msg, NET_DM_ATTR_HW_ENTRY);
+ if (!attr)
+ return -EMSGSIZE;
+
+ if (nla_put_string(msg, NET_DM_ATTR_HW_TRAP_NAME, hw_entry->trap_name))
+ goto nla_put_failure;
+
+ if (nla_put_u32(msg, NET_DM_ATTR_HW_TRAP_COUNT, hw_entry->count))
+ goto nla_put_failure;
+
+ nla_nest_end(msg, attr);
+
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(msg, attr);
+ return -EMSGSIZE;
+}
+
+static int net_dm_hw_entries_put(struct sk_buff *msg,
+ const struct net_dm_hw_entries *hw_entries)
+{
+ struct nlattr *attr;
+ int i;
+
+ attr = nla_nest_start(msg, NET_DM_ATTR_HW_ENTRIES);
+ if (!attr)
+ return -EMSGSIZE;
+
+ for (i = 0; i < hw_entries->num_entries; i++) {
+ int rc;
+
+ rc = net_dm_hw_entry_put(msg, &hw_entries->entries[i]);
+ if (rc)
+ goto nla_put_failure;
+ }
+
+ nla_nest_end(msg, attr);
+
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(msg, attr);
+ return -EMSGSIZE;
+}
+
+static int
+net_dm_hw_summary_report_fill(struct sk_buff *msg,
+ const struct net_dm_hw_entries *hw_entries)
+{
+ struct net_dm_alert_msg anc_hdr = { 0 };
+ void *hdr;
+ int rc;
+
+ hdr = genlmsg_put(msg, 0, 0, &net_drop_monitor_family, 0,
+ NET_DM_CMD_ALERT);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ /* We need to put the ancillary header in order not to break user
+ * space.
+ */
+ if (nla_put(msg, NLA_UNSPEC, sizeof(anc_hdr), &anc_hdr))
+ goto nla_put_failure;
+
+ rc = net_dm_hw_entries_put(msg, hw_entries);
+ if (rc)
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static void net_dm_hw_summary_work(struct work_struct *work)
+{
+ struct net_dm_hw_entries *hw_entries;
+ struct per_cpu_dm_data *hw_data;
+ struct sk_buff *msg;
+ int rc;
+
+ hw_data = container_of(work, struct per_cpu_dm_data, dm_alert_work);
+
+ hw_entries = net_dm_hw_reset_per_cpu_data(hw_data);
+ if (!hw_entries)
+ return;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ goto out;
+
+ rc = net_dm_hw_summary_report_fill(msg, hw_entries);
+ if (rc) {
+ nlmsg_free(msg);
+ goto out;
+ }
+
+ genlmsg_multicast(&net_drop_monitor_family, msg, 0, 0, GFP_KERNEL);
+
+out:
+ kfree(hw_entries);
+}
+
+static void
+net_dm_hw_trap_summary_probe(void *ignore, const struct devlink *devlink,
+ struct sk_buff *skb,
+ const struct devlink_trap_metadata *metadata)
+{
+ struct net_dm_hw_entries *hw_entries;
+ struct net_dm_hw_entry *hw_entry;
+ struct per_cpu_dm_data *hw_data;
+ unsigned long flags;
+ int i;
+
+ if (metadata->trap_type == DEVLINK_TRAP_TYPE_CONTROL)
+ return;
+
+ hw_data = this_cpu_ptr(&dm_hw_cpu_data);
+ raw_spin_lock_irqsave(&hw_data->lock, flags);
+ hw_entries = hw_data->hw_entries;
+
+ if (!hw_entries)
+ goto out;
+
+ for (i = 0; i < hw_entries->num_entries; i++) {
+ hw_entry = &hw_entries->entries[i];
+ if (!strncmp(hw_entry->trap_name, metadata->trap_name,
+ NET_DM_MAX_HW_TRAP_NAME_LEN - 1)) {
+ hw_entry->count++;
+ goto out;
+ }
+ }
+ if (WARN_ON_ONCE(hw_entries->num_entries == dm_hit_limit))
+ goto out;
+
+ hw_entry = &hw_entries->entries[hw_entries->num_entries];
+ strscpy(hw_entry->trap_name, metadata->trap_name,
+ NET_DM_MAX_HW_TRAP_NAME_LEN - 1);
+ hw_entry->count = 1;
+ hw_entries->num_entries++;
+
+ if (!timer_pending(&hw_data->send_timer)) {
+ hw_data->send_timer.expires = jiffies + dm_delay * HZ;
+ add_timer(&hw_data->send_timer);
+ }
+
+out:
+ raw_spin_unlock_irqrestore(&hw_data->lock, flags);
+}
+
+static const struct net_dm_alert_ops net_dm_alert_summary_ops = {
+ .kfree_skb_probe = trace_kfree_skb_hit,
+ .napi_poll_probe = trace_napi_poll_hit,
+ .work_item_func = send_dm_alert,
+ .hw_work_item_func = net_dm_hw_summary_work,
+ .hw_trap_probe = net_dm_hw_trap_summary_probe,
+};
+
+static void net_dm_packet_trace_kfree_skb_hit(void *ignore,
+ struct sk_buff *skb,
+ void *location,
+ enum skb_drop_reason reason,
+ struct sock *rx_sk)
+{
+ ktime_t tstamp = ktime_get_real();
+ struct per_cpu_dm_data *data;
+ struct net_dm_skb_cb *cb;
+ struct sk_buff *nskb;
+ unsigned long flags;
+
+ if (!skb_mac_header_was_set(skb))
+ return;
+
+ nskb = skb_clone(skb, GFP_ATOMIC);
+ if (!nskb)
+ return;
+
+ cb = NET_DM_SKB_CB(nskb);
+ cb->reason = reason;
+ cb->pc = location;
+ /* Override the timestamp because we care about the time when the
+ * packet was dropped.
+ */
+ nskb->tstamp = tstamp;
+
+ data = this_cpu_ptr(&dm_cpu_data);
+
+ spin_lock_irqsave(&data->drop_queue.lock, flags);
+ if (skb_queue_len(&data->drop_queue) < net_dm_queue_len)
+ __skb_queue_tail(&data->drop_queue, nskb);
+ else
+ goto unlock_free;
+ spin_unlock_irqrestore(&data->drop_queue.lock, flags);
+
+ schedule_work(&data->dm_alert_work);
+
+ return;
+
+unlock_free:
+ spin_unlock_irqrestore(&data->drop_queue.lock, flags);
+ u64_stats_update_begin(&data->stats.syncp);
+ u64_stats_inc(&data->stats.dropped);
+ u64_stats_update_end(&data->stats.syncp);
+ consume_skb(nskb);
+}
+
+static void net_dm_packet_trace_napi_poll_hit(void *ignore,
+ struct napi_struct *napi,
+ int work, int budget)
+{
+}
+
+static size_t net_dm_in_port_size(void)
+{
+ /* NET_DM_ATTR_IN_PORT nest */
+ return nla_total_size(0) +
+ /* NET_DM_ATTR_PORT_NETDEV_IFINDEX */
+ nla_total_size(sizeof(u32)) +
+ /* NET_DM_ATTR_PORT_NETDEV_NAME */
+ nla_total_size(IFNAMSIZ + 1);
+}
+
+#define NET_DM_MAX_SYMBOL_LEN 40
+#define NET_DM_MAX_REASON_LEN 50
+
+static size_t net_dm_packet_report_size(size_t payload_len)
+{
+ size_t size;
+
+ size = nlmsg_msg_size(GENL_HDRLEN + net_drop_monitor_family.hdrsize);
+
+ return NLMSG_ALIGN(size) +
+ /* NET_DM_ATTR_ORIGIN */
+ nla_total_size(sizeof(u16)) +
+ /* NET_DM_ATTR_PC */
+ nla_total_size(sizeof(u64)) +
+ /* NET_DM_ATTR_SYMBOL */
+ nla_total_size(NET_DM_MAX_SYMBOL_LEN + 1) +
+ /* NET_DM_ATTR_IN_PORT */
+ net_dm_in_port_size() +
+ /* NET_DM_ATTR_TIMESTAMP */
+ nla_total_size(sizeof(u64)) +
+ /* NET_DM_ATTR_ORIG_LEN */
+ nla_total_size(sizeof(u32)) +
+ /* NET_DM_ATTR_PROTO */
+ nla_total_size(sizeof(u16)) +
+ /* NET_DM_ATTR_REASON */
+ nla_total_size(NET_DM_MAX_REASON_LEN + 1) +
+ /* NET_DM_ATTR_PAYLOAD */
+ nla_total_size(payload_len);
+}
+
+static int net_dm_packet_report_in_port_put(struct sk_buff *msg, int ifindex,
+ const char *name)
+{
+ struct nlattr *attr;
+
+ attr = nla_nest_start(msg, NET_DM_ATTR_IN_PORT);
+ if (!attr)
+ return -EMSGSIZE;
+
+ if (ifindex &&
+ nla_put_u32(msg, NET_DM_ATTR_PORT_NETDEV_IFINDEX, ifindex))
+ goto nla_put_failure;
+
+ if (name && nla_put_string(msg, NET_DM_ATTR_PORT_NETDEV_NAME, name))
+ goto nla_put_failure;
+
+ nla_nest_end(msg, attr);
+
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(msg, attr);
+ return -EMSGSIZE;
+}
+
+static int net_dm_packet_report_fill(struct sk_buff *msg, struct sk_buff *skb,
+ size_t payload_len)
+{
+ struct net_dm_skb_cb *cb = NET_DM_SKB_CB(skb);
+ const struct drop_reason_list *list = NULL;
+ unsigned int subsys, subsys_reason;
+ char buf[NET_DM_MAX_SYMBOL_LEN];
+ struct nlattr *attr;
+ void *hdr;
+ int rc;
+
+ hdr = genlmsg_put(msg, 0, 0, &net_drop_monitor_family, 0,
+ NET_DM_CMD_PACKET_ALERT);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (nla_put_u16(msg, NET_DM_ATTR_ORIGIN, NET_DM_ORIGIN_SW))
+ goto nla_put_failure;
+
+ if (nla_put_u64_64bit(msg, NET_DM_ATTR_PC, (u64)(uintptr_t)cb->pc,
+ NET_DM_ATTR_PAD))
+ goto nla_put_failure;
+
+ rcu_read_lock();
+ subsys = u32_get_bits(cb->reason, SKB_DROP_REASON_SUBSYS_MASK);
+ if (subsys < SKB_DROP_REASON_SUBSYS_NUM)
+ list = rcu_dereference(drop_reasons_by_subsys[subsys]);
+ subsys_reason = cb->reason & ~SKB_DROP_REASON_SUBSYS_MASK;
+ if (!list ||
+ subsys_reason >= list->n_reasons ||
+ !list->reasons[subsys_reason] ||
+ strlen(list->reasons[subsys_reason]) > NET_DM_MAX_REASON_LEN) {
+ list = rcu_dereference(drop_reasons_by_subsys[SKB_DROP_REASON_SUBSYS_CORE]);
+ subsys_reason = SKB_DROP_REASON_NOT_SPECIFIED;
+ }
+ if (nla_put_string(msg, NET_DM_ATTR_REASON,
+ list->reasons[subsys_reason])) {
+ rcu_read_unlock();
+ goto nla_put_failure;
+ }
+ rcu_read_unlock();
+
+ snprintf(buf, sizeof(buf), "%pS", cb->pc);
+ if (nla_put_string(msg, NET_DM_ATTR_SYMBOL, buf))
+ goto nla_put_failure;
+
+ rc = net_dm_packet_report_in_port_put(msg, skb->skb_iif, NULL);
+ if (rc)
+ goto nla_put_failure;
+
+ if (nla_put_u64_64bit(msg, NET_DM_ATTR_TIMESTAMP,
+ ktime_to_ns(skb->tstamp), NET_DM_ATTR_PAD))
+ goto nla_put_failure;
+
+ if (nla_put_u32(msg, NET_DM_ATTR_ORIG_LEN, skb->len))
+ goto nla_put_failure;
+
+ if (!payload_len)
+ goto out;
+
+ if (nla_put_u16(msg, NET_DM_ATTR_PROTO, be16_to_cpu(skb->protocol)))
+ goto nla_put_failure;
+
+ attr = skb_put(msg, nla_total_size(payload_len));
+ attr->nla_type = NET_DM_ATTR_PAYLOAD;
+ attr->nla_len = nla_attr_size(payload_len);
+ if (skb_copy_bits(skb, 0, nla_data(attr), payload_len))
+ goto nla_put_failure;
+
+out:
+ genlmsg_end(msg, hdr);
+
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+#define NET_DM_MAX_PACKET_SIZE (0xffff - NLA_HDRLEN - NLA_ALIGNTO)
+
+static void net_dm_packet_report(struct sk_buff *skb)
+{
+ struct sk_buff *msg;
+ size_t payload_len;
+ int rc;
+
+ /* Make sure we start copying the packet from the MAC header */
+ if (skb->data > skb_mac_header(skb))
+ skb_push(skb, skb->data - skb_mac_header(skb));
+ else
+ skb_pull(skb, skb_mac_header(skb) - skb->data);
+
+ /* Ensure packet fits inside a single netlink attribute */
+ payload_len = min_t(size_t, skb->len, NET_DM_MAX_PACKET_SIZE);
+ if (net_dm_trunc_len)
+ payload_len = min_t(size_t, net_dm_trunc_len, payload_len);
+
+ msg = nlmsg_new(net_dm_packet_report_size(payload_len), GFP_KERNEL);
+ if (!msg)
+ goto out;
+
+ rc = net_dm_packet_report_fill(msg, skb, payload_len);
+ if (rc) {
+ nlmsg_free(msg);
+ goto out;
+ }
+
+ genlmsg_multicast(&net_drop_monitor_family, msg, 0, 0, GFP_KERNEL);
+
+out:
+ consume_skb(skb);
+}
+
+static void net_dm_packet_work(struct work_struct *work)
+{
+ struct per_cpu_dm_data *data;
+ struct sk_buff_head list;
+ struct sk_buff *skb;
+ unsigned long flags;
+
+ data = container_of(work, struct per_cpu_dm_data, dm_alert_work);
+
+ __skb_queue_head_init(&list);
+
+ spin_lock_irqsave(&data->drop_queue.lock, flags);
+ skb_queue_splice_tail_init(&data->drop_queue, &list);
+ spin_unlock_irqrestore(&data->drop_queue.lock, flags);
+
+ while ((skb = __skb_dequeue(&list)))
+ net_dm_packet_report(skb);
+}
+
+static size_t
+net_dm_flow_action_cookie_size(const struct devlink_trap_metadata *hw_metadata)
+{
+ return hw_metadata->fa_cookie ?
+ nla_total_size(hw_metadata->fa_cookie->cookie_len) : 0;
+}
+
+static size_t
+net_dm_hw_packet_report_size(size_t payload_len,
+ const struct devlink_trap_metadata *hw_metadata)
+{
+ size_t size;
+
+ size = nlmsg_msg_size(GENL_HDRLEN + net_drop_monitor_family.hdrsize);
+
+ return NLMSG_ALIGN(size) +
+ /* NET_DM_ATTR_ORIGIN */
+ nla_total_size(sizeof(u16)) +
+ /* NET_DM_ATTR_HW_TRAP_GROUP_NAME */
+ nla_total_size(strlen(hw_metadata->trap_group_name) + 1) +
+ /* NET_DM_ATTR_HW_TRAP_NAME */
+ nla_total_size(strlen(hw_metadata->trap_name) + 1) +
+ /* NET_DM_ATTR_IN_PORT */
+ net_dm_in_port_size() +
+ /* NET_DM_ATTR_FLOW_ACTION_COOKIE */
+ net_dm_flow_action_cookie_size(hw_metadata) +
+ /* NET_DM_ATTR_TIMESTAMP */
+ nla_total_size(sizeof(u64)) +
+ /* NET_DM_ATTR_ORIG_LEN */
+ nla_total_size(sizeof(u32)) +
+ /* NET_DM_ATTR_PROTO */
+ nla_total_size(sizeof(u16)) +
+ /* NET_DM_ATTR_PAYLOAD */
+ nla_total_size(payload_len);
+}
+
+static int net_dm_hw_packet_report_fill(struct sk_buff *msg,
+ struct sk_buff *skb, size_t payload_len)
+{
+ struct devlink_trap_metadata *hw_metadata;
+ struct nlattr *attr;
+ void *hdr;
+
+ hw_metadata = NET_DM_SKB_CB(skb)->hw_metadata;
+
+ hdr = genlmsg_put(msg, 0, 0, &net_drop_monitor_family, 0,
+ NET_DM_CMD_PACKET_ALERT);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (nla_put_u16(msg, NET_DM_ATTR_ORIGIN, NET_DM_ORIGIN_HW))
+ goto nla_put_failure;
+
+ if (nla_put_string(msg, NET_DM_ATTR_HW_TRAP_GROUP_NAME,
+ hw_metadata->trap_group_name))
+ goto nla_put_failure;
+
+ if (nla_put_string(msg, NET_DM_ATTR_HW_TRAP_NAME,
+ hw_metadata->trap_name))
+ goto nla_put_failure;
+
+ if (hw_metadata->input_dev) {
+ struct net_device *dev = hw_metadata->input_dev;
+ int rc;
+
+ rc = net_dm_packet_report_in_port_put(msg, dev->ifindex,
+ dev->name);
+ if (rc)
+ goto nla_put_failure;
+ }
+
+ if (hw_metadata->fa_cookie &&
+ nla_put(msg, NET_DM_ATTR_FLOW_ACTION_COOKIE,
+ hw_metadata->fa_cookie->cookie_len,
+ hw_metadata->fa_cookie->cookie))
+ goto nla_put_failure;
+
+ if (nla_put_u64_64bit(msg, NET_DM_ATTR_TIMESTAMP,
+ ktime_to_ns(skb->tstamp), NET_DM_ATTR_PAD))
+ goto nla_put_failure;
+
+ if (nla_put_u32(msg, NET_DM_ATTR_ORIG_LEN, skb->len))
+ goto nla_put_failure;
+
+ if (!payload_len)
+ goto out;
+
+ if (nla_put_u16(msg, NET_DM_ATTR_PROTO, be16_to_cpu(skb->protocol)))
+ goto nla_put_failure;
+
+ attr = skb_put(msg, nla_total_size(payload_len));
+ attr->nla_type = NET_DM_ATTR_PAYLOAD;
+ attr->nla_len = nla_attr_size(payload_len);
+ if (skb_copy_bits(skb, 0, nla_data(attr), payload_len))
+ goto nla_put_failure;
+
+out:
+ genlmsg_end(msg, hdr);
+
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static struct devlink_trap_metadata *
+net_dm_hw_metadata_copy(const struct devlink_trap_metadata *metadata)
+{
+ const struct flow_action_cookie *fa_cookie;
+ struct devlink_trap_metadata *hw_metadata;
+ const char *trap_group_name;
+ const char *trap_name;
+
+ hw_metadata = kzalloc(sizeof(*hw_metadata), GFP_ATOMIC);
+ if (!hw_metadata)
+ return NULL;
+
+ trap_group_name = kstrdup(metadata->trap_group_name, GFP_ATOMIC);
+ if (!trap_group_name)
+ goto free_hw_metadata;
+ hw_metadata->trap_group_name = trap_group_name;
+
+ trap_name = kstrdup(metadata->trap_name, GFP_ATOMIC);
+ if (!trap_name)
+ goto free_trap_group;
+ hw_metadata->trap_name = trap_name;
+
+ if (metadata->fa_cookie) {
+ size_t cookie_size = sizeof(*fa_cookie) +
+ metadata->fa_cookie->cookie_len;
+
+ fa_cookie = kmemdup(metadata->fa_cookie, cookie_size,
+ GFP_ATOMIC);
+ if (!fa_cookie)
+ goto free_trap_name;
+ hw_metadata->fa_cookie = fa_cookie;
+ }
+
+ hw_metadata->input_dev = metadata->input_dev;
+ netdev_hold(hw_metadata->input_dev, &hw_metadata->dev_tracker,
+ GFP_ATOMIC);
+
+ return hw_metadata;
+
+free_trap_name:
+ kfree(trap_name);
+free_trap_group:
+ kfree(trap_group_name);
+free_hw_metadata:
+ kfree(hw_metadata);
+ return NULL;
+}
+
+static void
+net_dm_hw_metadata_free(struct devlink_trap_metadata *hw_metadata)
+{
+ netdev_put(hw_metadata->input_dev, &hw_metadata->dev_tracker);
+ kfree(hw_metadata->fa_cookie);
+ kfree(hw_metadata->trap_name);
+ kfree(hw_metadata->trap_group_name);
+ kfree(hw_metadata);
+}
+
+static void net_dm_hw_packet_report(struct sk_buff *skb)
+{
+ struct devlink_trap_metadata *hw_metadata;
+ struct sk_buff *msg;
+ size_t payload_len;
+ int rc;
+
+ if (skb->data > skb_mac_header(skb))
+ skb_push(skb, skb->data - skb_mac_header(skb));
+ else
+ skb_pull(skb, skb_mac_header(skb) - skb->data);
+
+ payload_len = min_t(size_t, skb->len, NET_DM_MAX_PACKET_SIZE);
+ if (net_dm_trunc_len)
+ payload_len = min_t(size_t, net_dm_trunc_len, payload_len);
+
+ hw_metadata = NET_DM_SKB_CB(skb)->hw_metadata;
+ msg = nlmsg_new(net_dm_hw_packet_report_size(payload_len, hw_metadata),
+ GFP_KERNEL);
+ if (!msg)
+ goto out;
+
+ rc = net_dm_hw_packet_report_fill(msg, skb, payload_len);
+ if (rc) {
+ nlmsg_free(msg);
+ goto out;
+ }
+
+ genlmsg_multicast(&net_drop_monitor_family, msg, 0, 0, GFP_KERNEL);
+
+out:
+ net_dm_hw_metadata_free(NET_DM_SKB_CB(skb)->hw_metadata);
+ consume_skb(skb);
+}
+
+static void net_dm_hw_packet_work(struct work_struct *work)
+{
+ struct per_cpu_dm_data *hw_data;
+ struct sk_buff_head list;
+ struct sk_buff *skb;
+ unsigned long flags;
+
+ hw_data = container_of(work, struct per_cpu_dm_data, dm_alert_work);
+
+ __skb_queue_head_init(&list);
+
+ spin_lock_irqsave(&hw_data->drop_queue.lock, flags);
+ skb_queue_splice_tail_init(&hw_data->drop_queue, &list);
+ spin_unlock_irqrestore(&hw_data->drop_queue.lock, flags);
+
+ while ((skb = __skb_dequeue(&list)))
+ net_dm_hw_packet_report(skb);
+}
+
+static void
+net_dm_hw_trap_packet_probe(void *ignore, const struct devlink *devlink,
+ struct sk_buff *skb,
+ const struct devlink_trap_metadata *metadata)
+{
+ struct devlink_trap_metadata *n_hw_metadata;
+ ktime_t tstamp = ktime_get_real();
+ struct per_cpu_dm_data *hw_data;
+ struct sk_buff *nskb;
+ unsigned long flags;
+
+ if (metadata->trap_type == DEVLINK_TRAP_TYPE_CONTROL)
+ return;
+
+ if (!skb_mac_header_was_set(skb))
+ return;
+
+ nskb = skb_clone(skb, GFP_ATOMIC);
+ if (!nskb)
+ return;
+
+ n_hw_metadata = net_dm_hw_metadata_copy(metadata);
+ if (!n_hw_metadata)
+ goto free;
+
+ NET_DM_SKB_CB(nskb)->hw_metadata = n_hw_metadata;
+ nskb->tstamp = tstamp;
+
+ hw_data = this_cpu_ptr(&dm_hw_cpu_data);
+
+ spin_lock_irqsave(&hw_data->drop_queue.lock, flags);
+ if (skb_queue_len(&hw_data->drop_queue) < net_dm_queue_len)
+ __skb_queue_tail(&hw_data->drop_queue, nskb);
+ else
+ goto unlock_free;
+ spin_unlock_irqrestore(&hw_data->drop_queue.lock, flags);
+
+ schedule_work(&hw_data->dm_alert_work);
+
+ return;
+
+unlock_free:
+ spin_unlock_irqrestore(&hw_data->drop_queue.lock, flags);
+ u64_stats_update_begin(&hw_data->stats.syncp);
+ u64_stats_inc(&hw_data->stats.dropped);
+ u64_stats_update_end(&hw_data->stats.syncp);
+ net_dm_hw_metadata_free(n_hw_metadata);
+free:
+ consume_skb(nskb);
+}
+
+static const struct net_dm_alert_ops net_dm_alert_packet_ops = {
+ .kfree_skb_probe = net_dm_packet_trace_kfree_skb_hit,
+ .napi_poll_probe = net_dm_packet_trace_napi_poll_hit,
+ .work_item_func = net_dm_packet_work,
+ .hw_work_item_func = net_dm_hw_packet_work,
+ .hw_trap_probe = net_dm_hw_trap_packet_probe,
+};
+
+static const struct net_dm_alert_ops *net_dm_alert_ops_arr[] = {
+ [NET_DM_ALERT_MODE_SUMMARY] = &net_dm_alert_summary_ops,
+ [NET_DM_ALERT_MODE_PACKET] = &net_dm_alert_packet_ops,
+};
+
+#if IS_ENABLED(CONFIG_NET_DEVLINK)
+static int net_dm_hw_probe_register(const struct net_dm_alert_ops *ops)
+{
+ return register_trace_devlink_trap_report(ops->hw_trap_probe, NULL);
+}
+
+static void net_dm_hw_probe_unregister(const struct net_dm_alert_ops *ops)
+{
+ unregister_trace_devlink_trap_report(ops->hw_trap_probe, NULL);
+ tracepoint_synchronize_unregister();
+}
+#else
+static int net_dm_hw_probe_register(const struct net_dm_alert_ops *ops)
+{
+ return -EOPNOTSUPP;
+}
+
+static void net_dm_hw_probe_unregister(const struct net_dm_alert_ops *ops)
+{
+}
+#endif
+
+static int net_dm_hw_monitor_start(struct netlink_ext_ack *extack)
+{
+ const struct net_dm_alert_ops *ops;
+ int cpu, rc;
+
+ if (monitor_hw) {
+ NL_SET_ERR_MSG_MOD(extack, "Hardware monitoring already enabled");
+ return -EAGAIN;
+ }
+
+ ops = net_dm_alert_ops_arr[net_dm_alert_mode];
+
+ if (!try_module_get(THIS_MODULE)) {
+ NL_SET_ERR_MSG_MOD(extack, "Failed to take reference on module");
+ return -ENODEV;
+ }
+
+ for_each_possible_cpu(cpu) {
+ struct per_cpu_dm_data *hw_data = &per_cpu(dm_hw_cpu_data, cpu);
+ struct net_dm_hw_entries *hw_entries;
+
+ INIT_WORK(&hw_data->dm_alert_work, ops->hw_work_item_func);
+ timer_setup(&hw_data->send_timer, sched_send_work, 0);
+ hw_entries = net_dm_hw_reset_per_cpu_data(hw_data);
+ kfree(hw_entries);
+ }
+
+ rc = net_dm_hw_probe_register(ops);
+ if (rc) {
+ NL_SET_ERR_MSG_MOD(extack, "Failed to connect probe to devlink_trap_probe() tracepoint");
+ goto err_module_put;
+ }
+
+ monitor_hw = true;
+
+ return 0;
+
+err_module_put:
+ for_each_possible_cpu(cpu) {
+ struct per_cpu_dm_data *hw_data = &per_cpu(dm_hw_cpu_data, cpu);
+ struct sk_buff *skb;
+
+ timer_delete_sync(&hw_data->send_timer);
+ cancel_work_sync(&hw_data->dm_alert_work);
+ while ((skb = __skb_dequeue(&hw_data->drop_queue))) {
+ struct devlink_trap_metadata *hw_metadata;
+
+ hw_metadata = NET_DM_SKB_CB(skb)->hw_metadata;
+ net_dm_hw_metadata_free(hw_metadata);
+ consume_skb(skb);
+ }
+ }
+ module_put(THIS_MODULE);
+ return rc;
+}
+
+static void net_dm_hw_monitor_stop(struct netlink_ext_ack *extack)
+{
+ const struct net_dm_alert_ops *ops;
+ int cpu;
+
+ if (!monitor_hw) {
+ NL_SET_ERR_MSG_MOD(extack, "Hardware monitoring already disabled");
+ return;
+ }
+
+ ops = net_dm_alert_ops_arr[net_dm_alert_mode];
+
+ monitor_hw = false;
+
+ net_dm_hw_probe_unregister(ops);
+
+ for_each_possible_cpu(cpu) {
+ struct per_cpu_dm_data *hw_data = &per_cpu(dm_hw_cpu_data, cpu);
+ struct sk_buff *skb;
+
+ timer_delete_sync(&hw_data->send_timer);
+ cancel_work_sync(&hw_data->dm_alert_work);
+ while ((skb = __skb_dequeue(&hw_data->drop_queue))) {
+ struct devlink_trap_metadata *hw_metadata;
+
+ hw_metadata = NET_DM_SKB_CB(skb)->hw_metadata;
+ net_dm_hw_metadata_free(hw_metadata);
+ consume_skb(skb);
+ }
+ }
+
+ module_put(THIS_MODULE);
+}
+
+static int net_dm_trace_on_set(struct netlink_ext_ack *extack)
+{
+ const struct net_dm_alert_ops *ops;
+ int cpu, rc;
+
+ ops = net_dm_alert_ops_arr[net_dm_alert_mode];
+
+ if (!try_module_get(THIS_MODULE)) {
+ NL_SET_ERR_MSG_MOD(extack, "Failed to take reference on module");
+ return -ENODEV;
+ }
+
+ for_each_possible_cpu(cpu) {
+ struct per_cpu_dm_data *data = &per_cpu(dm_cpu_data, cpu);
+ struct sk_buff *skb;
+
+ INIT_WORK(&data->dm_alert_work, ops->work_item_func);
+ timer_setup(&data->send_timer, sched_send_work, 0);
+ /* Allocate a new per-CPU skb for the summary alert message and
+ * free the old one which might contain stale data from
+ * previous tracing.
+ */
+ skb = reset_per_cpu_data(data);
+ consume_skb(skb);
+ }
+
+ rc = register_trace_kfree_skb(ops->kfree_skb_probe, NULL);
+ if (rc) {
+ NL_SET_ERR_MSG_MOD(extack, "Failed to connect probe to kfree_skb() tracepoint");
+ goto err_module_put;
+ }
+
+ rc = register_trace_napi_poll(ops->napi_poll_probe, NULL);
+ if (rc) {
+ NL_SET_ERR_MSG_MOD(extack, "Failed to connect probe to napi_poll() tracepoint");
+ goto err_unregister_trace;
+ }
+
+ return 0;
+
+err_unregister_trace:
+ unregister_trace_kfree_skb(ops->kfree_skb_probe, NULL);
+err_module_put:
+ for_each_possible_cpu(cpu) {
+ struct per_cpu_dm_data *data = &per_cpu(dm_cpu_data, cpu);
+ struct sk_buff *skb;
+
+ timer_delete_sync(&data->send_timer);
+ cancel_work_sync(&data->dm_alert_work);
+ while ((skb = __skb_dequeue(&data->drop_queue)))
+ consume_skb(skb);
+ }
+ module_put(THIS_MODULE);
+ return rc;
+}
+
+static void net_dm_trace_off_set(void)
+{
+ const struct net_dm_alert_ops *ops;
+ int cpu;
+
+ ops = net_dm_alert_ops_arr[net_dm_alert_mode];
+
+ unregister_trace_napi_poll(ops->napi_poll_probe, NULL);
+ unregister_trace_kfree_skb(ops->kfree_skb_probe, NULL);
+
+ tracepoint_synchronize_unregister();
+
+ /* Make sure we do not send notifications to user space after request
+ * to stop tracing returns.
+ */
+ for_each_possible_cpu(cpu) {
+ struct per_cpu_dm_data *data = &per_cpu(dm_cpu_data, cpu);
+ struct sk_buff *skb;
+
+ timer_delete_sync(&data->send_timer);
+ cancel_work_sync(&data->dm_alert_work);
+ while ((skb = __skb_dequeue(&data->drop_queue)))
+ consume_skb(skb);
+ }
+
+ module_put(THIS_MODULE);
+}
+
+static int set_all_monitor_traces(int state, struct netlink_ext_ack *extack)
+{
+ int rc = 0;
+
+ if (state == trace_state) {
+ NL_SET_ERR_MSG_MOD(extack, "Trace state already set to requested state");
+ return -EAGAIN;
+ }
+
+ switch (state) {
+ case TRACE_ON:
+ rc = net_dm_trace_on_set(extack);
+ break;
+ case TRACE_OFF:
+ net_dm_trace_off_set();
+ break;
+ default:
+ rc = 1;
+ break;
+ }
+
+ if (!rc)
+ trace_state = state;
+ else
+ rc = -EINPROGRESS;
+
+ return rc;
+}
+
+static bool net_dm_is_monitoring(void)
+{
+ return trace_state == TRACE_ON || monitor_hw;
+}
+
+static int net_dm_alert_mode_get_from_info(struct genl_info *info,
+ enum net_dm_alert_mode *p_alert_mode)
+{
+ u8 val;
+
+ val = nla_get_u8(info->attrs[NET_DM_ATTR_ALERT_MODE]);
+
+ switch (val) {
+ case NET_DM_ALERT_MODE_SUMMARY:
+ case NET_DM_ALERT_MODE_PACKET:
+ *p_alert_mode = val;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int net_dm_alert_mode_set(struct genl_info *info)
+{
+ struct netlink_ext_ack *extack = info->extack;
+ enum net_dm_alert_mode alert_mode;
+ int rc;
+
+ if (!info->attrs[NET_DM_ATTR_ALERT_MODE])
+ return 0;
+
+ rc = net_dm_alert_mode_get_from_info(info, &alert_mode);
+ if (rc) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid alert mode");
+ return -EINVAL;
+ }
+
+ net_dm_alert_mode = alert_mode;
+
+ return 0;
+}
+
+static void net_dm_trunc_len_set(struct genl_info *info)
+{
+ if (!info->attrs[NET_DM_ATTR_TRUNC_LEN])
+ return;
+
+ net_dm_trunc_len = nla_get_u32(info->attrs[NET_DM_ATTR_TRUNC_LEN]);
+}
+
+static void net_dm_queue_len_set(struct genl_info *info)
+{
+ if (!info->attrs[NET_DM_ATTR_QUEUE_LEN])
+ return;
+
+ net_dm_queue_len = nla_get_u32(info->attrs[NET_DM_ATTR_QUEUE_LEN]);
+}
+
+static int net_dm_cmd_config(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct netlink_ext_ack *extack = info->extack;
+ int rc;
+
+ if (net_dm_is_monitoring()) {
+ NL_SET_ERR_MSG_MOD(extack, "Cannot configure drop monitor during monitoring");
+ return -EBUSY;
+ }
+
+ rc = net_dm_alert_mode_set(info);
+ if (rc)
+ return rc;
+
+ net_dm_trunc_len_set(info);
+
+ net_dm_queue_len_set(info);
+
+ return 0;
+}
+
+static int net_dm_monitor_start(bool set_sw, bool set_hw,
+ struct netlink_ext_ack *extack)
+{
+ bool sw_set = false;
+ int rc;
+
+ if (set_sw) {
+ rc = set_all_monitor_traces(TRACE_ON, extack);
+ if (rc)
+ return rc;
+ sw_set = true;
+ }
+
+ if (set_hw) {
+ rc = net_dm_hw_monitor_start(extack);
+ if (rc)
+ goto err_monitor_hw;
+ }
+
+ return 0;
+
+err_monitor_hw:
+ if (sw_set)
+ set_all_monitor_traces(TRACE_OFF, extack);
+ return rc;
+}
+
+static void net_dm_monitor_stop(bool set_sw, bool set_hw,
+ struct netlink_ext_ack *extack)
+{
+ if (set_hw)
+ net_dm_hw_monitor_stop(extack);
+ if (set_sw)
+ set_all_monitor_traces(TRACE_OFF, extack);
+}
+
+static int net_dm_cmd_trace(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ bool set_sw = !!info->attrs[NET_DM_ATTR_SW_DROPS];
+ bool set_hw = !!info->attrs[NET_DM_ATTR_HW_DROPS];
+ struct netlink_ext_ack *extack = info->extack;
+
+ /* To maintain backward compatibility, we start / stop monitoring of
+ * software drops if no flag is specified.
+ */
+ if (!set_sw && !set_hw)
+ set_sw = true;
+
+ switch (info->genlhdr->cmd) {
+ case NET_DM_CMD_START:
+ return net_dm_monitor_start(set_sw, set_hw, extack);
+ case NET_DM_CMD_STOP:
+ net_dm_monitor_stop(set_sw, set_hw, extack);
+ return 0;
+ }
+
+ return -EOPNOTSUPP;
+}
+
+static int net_dm_config_fill(struct sk_buff *msg, struct genl_info *info)
+{
+ void *hdr;
+
+ hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq,
+ &net_drop_monitor_family, 0, NET_DM_CMD_CONFIG_NEW);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (nla_put_u8(msg, NET_DM_ATTR_ALERT_MODE, net_dm_alert_mode))
+ goto nla_put_failure;
+
+ if (nla_put_u32(msg, NET_DM_ATTR_TRUNC_LEN, net_dm_trunc_len))
+ goto nla_put_failure;
+
+ if (nla_put_u32(msg, NET_DM_ATTR_QUEUE_LEN, net_dm_queue_len))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static int net_dm_cmd_config_get(struct sk_buff *skb, struct genl_info *info)
+{
+ struct sk_buff *msg;
+ int rc;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ rc = net_dm_config_fill(msg, info);
+ if (rc)
+ goto free_msg;
+
+ return genlmsg_reply(msg, info);
+
+free_msg:
+ nlmsg_free(msg);
+ return rc;
+}
+
+static void net_dm_stats_read(struct net_dm_stats *stats)
+{
+ int cpu;
+
+ memset(stats, 0, sizeof(*stats));
+ for_each_possible_cpu(cpu) {
+ struct per_cpu_dm_data *data = &per_cpu(dm_cpu_data, cpu);
+ struct net_dm_stats *cpu_stats = &data->stats;
+ unsigned int start;
+ u64 dropped;
+
+ do {
+ start = u64_stats_fetch_begin(&cpu_stats->syncp);
+ dropped = u64_stats_read(&cpu_stats->dropped);
+ } while (u64_stats_fetch_retry(&cpu_stats->syncp, start));
+
+ u64_stats_add(&stats->dropped, dropped);
+ }
+}
+
+static int net_dm_stats_put(struct sk_buff *msg)
+{
+ struct net_dm_stats stats;
+ struct nlattr *attr;
+
+ net_dm_stats_read(&stats);
+
+ attr = nla_nest_start(msg, NET_DM_ATTR_STATS);
+ if (!attr)
+ return -EMSGSIZE;
+
+ if (nla_put_u64_64bit(msg, NET_DM_ATTR_STATS_DROPPED,
+ u64_stats_read(&stats.dropped), NET_DM_ATTR_PAD))
+ goto nla_put_failure;
+
+ nla_nest_end(msg, attr);
+
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(msg, attr);
+ return -EMSGSIZE;
+}
+
+static void net_dm_hw_stats_read(struct net_dm_stats *stats)
+{
+ int cpu;
+
+ memset(stats, 0, sizeof(*stats));
+ for_each_possible_cpu(cpu) {
+ struct per_cpu_dm_data *hw_data = &per_cpu(dm_hw_cpu_data, cpu);
+ struct net_dm_stats *cpu_stats = &hw_data->stats;
+ unsigned int start;
+ u64 dropped;
+
+ do {
+ start = u64_stats_fetch_begin(&cpu_stats->syncp);
+ dropped = u64_stats_read(&cpu_stats->dropped);
+ } while (u64_stats_fetch_retry(&cpu_stats->syncp, start));
+
+ u64_stats_add(&stats->dropped, dropped);
+ }
+}
+
+static int net_dm_hw_stats_put(struct sk_buff *msg)
+{
+ struct net_dm_stats stats;
+ struct nlattr *attr;
+
+ net_dm_hw_stats_read(&stats);
+
+ attr = nla_nest_start(msg, NET_DM_ATTR_HW_STATS);
+ if (!attr)
+ return -EMSGSIZE;
+
+ if (nla_put_u64_64bit(msg, NET_DM_ATTR_STATS_DROPPED,
+ u64_stats_read(&stats.dropped), NET_DM_ATTR_PAD))
+ goto nla_put_failure;
+
+ nla_nest_end(msg, attr);
+
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(msg, attr);
+ return -EMSGSIZE;
+}
+
+static int net_dm_stats_fill(struct sk_buff *msg, struct genl_info *info)
+{
+ void *hdr;
+ int rc;
+
+ hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq,
+ &net_drop_monitor_family, 0, NET_DM_CMD_STATS_NEW);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ rc = net_dm_stats_put(msg);
+ if (rc)
+ goto nla_put_failure;
+
+ rc = net_dm_hw_stats_put(msg);
+ if (rc)
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static int net_dm_cmd_stats_get(struct sk_buff *skb, struct genl_info *info)
+{
+ struct sk_buff *msg;
+ int rc;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ rc = net_dm_stats_fill(msg, info);
+ if (rc)
+ goto free_msg;
+
+ return genlmsg_reply(msg, info);
+
+free_msg:
+ nlmsg_free(msg);
+ return rc;
+}
+
+static int dropmon_net_event(struct notifier_block *ev_block,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct dm_hw_stat_delta *stat;
+
+ switch (event) {
+ case NETDEV_REGISTER:
+ if (WARN_ON_ONCE(rtnl_dereference(dev->dm_private)))
+ break;
+ stat = kzalloc(sizeof(*stat), GFP_KERNEL);
+ if (!stat)
+ break;
+
+ stat->last_rx = jiffies;
+ rcu_assign_pointer(dev->dm_private, stat);
+
+ break;
+ case NETDEV_UNREGISTER:
+ stat = rtnl_dereference(dev->dm_private);
+ if (stat) {
+ rcu_assign_pointer(dev->dm_private, NULL);
+ kfree_rcu(stat, rcu);
+ }
+ break;
+ }
+ return NOTIFY_DONE;
+}
+
+static const struct nla_policy net_dm_nl_policy[NET_DM_ATTR_MAX + 1] = {
+ [NET_DM_ATTR_UNSPEC] = { .strict_start_type = NET_DM_ATTR_UNSPEC + 1 },
+ [NET_DM_ATTR_ALERT_MODE] = { .type = NLA_U8 },
+ [NET_DM_ATTR_TRUNC_LEN] = { .type = NLA_U32 },
+ [NET_DM_ATTR_QUEUE_LEN] = { .type = NLA_U32 },
+ [NET_DM_ATTR_SW_DROPS] = {. type = NLA_FLAG },
+ [NET_DM_ATTR_HW_DROPS] = {. type = NLA_FLAG },
+};
+
+static const struct genl_small_ops dropmon_ops[] = {
+ {
+ .cmd = NET_DM_CMD_CONFIG,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = net_dm_cmd_config,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = NET_DM_CMD_START,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = net_dm_cmd_trace,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = NET_DM_CMD_STOP,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = net_dm_cmd_trace,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = NET_DM_CMD_CONFIG_GET,
+ .doit = net_dm_cmd_config_get,
+ },
+ {
+ .cmd = NET_DM_CMD_STATS_GET,
+ .doit = net_dm_cmd_stats_get,
+ },
+};
+
+static int net_dm_nl_pre_doit(const struct genl_split_ops *ops,
+ struct sk_buff *skb, struct genl_info *info)
+{
+ mutex_lock(&net_dm_mutex);
+
+ return 0;
+}
+
+static void net_dm_nl_post_doit(const struct genl_split_ops *ops,
+ struct sk_buff *skb, struct genl_info *info)
+{
+ mutex_unlock(&net_dm_mutex);
+}
+
+static struct genl_family net_drop_monitor_family __ro_after_init = {
+ .hdrsize = 0,
+ .name = "NET_DM",
+ .version = 2,
+ .maxattr = NET_DM_ATTR_MAX,
+ .policy = net_dm_nl_policy,
+ .pre_doit = net_dm_nl_pre_doit,
+ .post_doit = net_dm_nl_post_doit,
+ .module = THIS_MODULE,
+ .small_ops = dropmon_ops,
+ .n_small_ops = ARRAY_SIZE(dropmon_ops),
+ .resv_start_op = NET_DM_CMD_STATS_GET + 1,
+ .mcgrps = dropmon_mcgrps,
+ .n_mcgrps = ARRAY_SIZE(dropmon_mcgrps),
+};
+
+static struct notifier_block dropmon_net_notifier = {
+ .notifier_call = dropmon_net_event
+};
+
+static void __net_dm_cpu_data_init(struct per_cpu_dm_data *data)
+{
+ raw_spin_lock_init(&data->lock);
+ skb_queue_head_init(&data->drop_queue);
+ u64_stats_init(&data->stats.syncp);
+}
+
+static void __net_dm_cpu_data_fini(struct per_cpu_dm_data *data)
+{
+ WARN_ON(!skb_queue_empty(&data->drop_queue));
+}
+
+static void net_dm_cpu_data_init(int cpu)
+{
+ struct per_cpu_dm_data *data;
+
+ data = &per_cpu(dm_cpu_data, cpu);
+ __net_dm_cpu_data_init(data);
+}
+
+static void net_dm_cpu_data_fini(int cpu)
+{
+ struct per_cpu_dm_data *data;
+
+ data = &per_cpu(dm_cpu_data, cpu);
+ /* At this point, we should have exclusive access
+ * to this struct and can free the skb inside it.
+ */
+ consume_skb(data->skb);
+ __net_dm_cpu_data_fini(data);
+}
+
+static void net_dm_hw_cpu_data_init(int cpu)
+{
+ struct per_cpu_dm_data *hw_data;
+
+ hw_data = &per_cpu(dm_hw_cpu_data, cpu);
+ __net_dm_cpu_data_init(hw_data);
+}
+
+static void net_dm_hw_cpu_data_fini(int cpu)
+{
+ struct per_cpu_dm_data *hw_data;
+
+ hw_data = &per_cpu(dm_hw_cpu_data, cpu);
+ kfree(hw_data->hw_entries);
+ __net_dm_cpu_data_fini(hw_data);
+}
+
+static int __init init_net_drop_monitor(void)
+{
+ int cpu, rc;
+
+ pr_info("Initializing network drop monitor service\n");
+
+ if (sizeof(void *) > 8) {
+ pr_err("Unable to store program counters on this arch, Drop monitor failed\n");
+ return -ENOSPC;
+ }
+
+ for_each_possible_cpu(cpu) {
+ net_dm_cpu_data_init(cpu);
+ net_dm_hw_cpu_data_init(cpu);
+ }
+
+ rc = register_netdevice_notifier(&dropmon_net_notifier);
+ if (rc < 0) {
+ pr_crit("Failed to register netdevice notifier\n");
+ return rc;
+ }
+
+ rc = genl_register_family(&net_drop_monitor_family);
+ if (rc) {
+ pr_err("Could not create drop monitor netlink family\n");
+ goto out_unreg;
+ }
+ WARN_ON(net_drop_monitor_family.mcgrp_offset != NET_DM_GRP_ALERT);
+
+ rc = 0;
+
+ goto out;
+
+out_unreg:
+ WARN_ON(unregister_netdevice_notifier(&dropmon_net_notifier));
+out:
+ return rc;
+}
+
+static void exit_net_drop_monitor(void)
+{
+ int cpu;
+
+ /*
+ * Because of the module_get/put we do in the trace state change path
+ * we are guaranteed not to have any current users when we get here
+ */
+ BUG_ON(genl_unregister_family(&net_drop_monitor_family));
+
+ BUG_ON(unregister_netdevice_notifier(&dropmon_net_notifier));
+
+ for_each_possible_cpu(cpu) {
+ net_dm_hw_cpu_data_fini(cpu);
+ net_dm_cpu_data_fini(cpu);
+ }
+}
+
+module_init(init_net_drop_monitor);
+module_exit(exit_net_drop_monitor);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Neil Horman <nhorman@tuxdriver.com>");
+MODULE_ALIAS_GENL_FAMILY("NET_DM");
+MODULE_DESCRIPTION("Monitoring code for network dropped packet alerts");
diff --git a/net/core/dst.c b/net/core/dst.c
index 1a5e49da0e77..e9d35f49c9e7 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* net/core/dst.c Protocol independent destination cache.
*
@@ -9,275 +10,346 @@
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/kernel.h>
+#include <linux/workqueue.h>
#include <linux/mm.h>
#include <linux/module.h>
+#include <linux/slab.h>
#include <linux/netdevice.h>
-#include <linux/sched.h>
#include <linux/skbuff.h>
#include <linux/string.h>
#include <linux/types.h>
+#include <net/net_namespace.h>
+#include <linux/sched.h>
+#include <linux/prefetch.h>
+#include <net/lwtunnel.h>
+#include <net/xfrm.h>
#include <net/dst.h>
+#include <net/dst_metadata.h>
-/* Locking strategy:
- * 1) Garbage collection state of dead destination cache
- * entries is protected by dst_lock.
- * 2) GC is run only from BH context, and is the only remover
- * of entries.
- * 3) Entries are added to the garbage list from both BH
- * and non-BH context, so local BH disabling is needed.
- * 4) All operations modify state, so a spinlock is used.
- */
-static struct dst_entry *dst_garbage_list;
-#if RT_CACHE_DEBUG >= 2
-static atomic_t dst_total = ATOMIC_INIT(0);
+int dst_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ kfree_skb(skb);
+ return 0;
+}
+EXPORT_SYMBOL(dst_discard_out);
+
+const struct dst_metrics dst_default_metrics = {
+ /* This initializer is needed to force linker to place this variable
+ * into const section. Otherwise it might end into bss section.
+ * We really want to avoid false sharing on this variable, and catch
+ * any writes on it.
+ */
+ .refcnt = REFCOUNT_INIT(1),
+};
+EXPORT_SYMBOL(dst_default_metrics);
+
+void dst_init(struct dst_entry *dst, struct dst_ops *ops,
+ struct net_device *dev, int initial_obsolete,
+ unsigned short flags)
+{
+ dst->dev = dev;
+ netdev_hold(dev, &dst->dev_tracker, GFP_ATOMIC);
+ dst->ops = ops;
+ dst_init_metrics(dst, dst_default_metrics.metrics, true);
+ dst->expires = 0UL;
+#ifdef CONFIG_XFRM
+ dst->xfrm = NULL;
+#endif
+ dst->input = dst_discard;
+ dst->output = dst_discard_out;
+ dst->error = 0;
+ dst->obsolete = initial_obsolete;
+ dst->header_len = 0;
+ dst->trailer_len = 0;
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ dst->tclassid = 0;
#endif
-static DEFINE_SPINLOCK(dst_lock);
+ dst->lwtstate = NULL;
+ rcuref_init(&dst->__rcuref, 1);
+ INIT_LIST_HEAD(&dst->rt_uncached);
+ dst->__use = 0;
+ dst->lastuse = jiffies;
+ dst->flags = flags;
+ if (!(flags & DST_NOCOUNT))
+ dst_entries_add(ops, 1);
+}
+EXPORT_SYMBOL(dst_init);
+
+void *dst_alloc(struct dst_ops *ops, struct net_device *dev,
+ int initial_obsolete, unsigned short flags)
+{
+ struct dst_entry *dst;
+
+ if (ops->gc &&
+ !(flags & DST_NOCOUNT) &&
+ dst_entries_get_fast(ops) > ops->gc_thresh)
+ ops->gc(ops);
+
+ dst = kmem_cache_alloc(ops->kmem_cachep, GFP_ATOMIC);
+ if (!dst)
+ return NULL;
-static unsigned long dst_gc_timer_expires;
-static unsigned long dst_gc_timer_inc = DST_GC_MAX;
-static void dst_run_gc(unsigned long);
-static void ___dst_free(struct dst_entry * dst);
+ dst_init(dst, ops, dev, initial_obsolete, flags);
-static DEFINE_TIMER(dst_gc_timer, dst_run_gc, DST_GC_MIN, 0);
+ return dst;
+}
+EXPORT_SYMBOL(dst_alloc);
-static void dst_run_gc(unsigned long dummy)
+static void dst_destroy(struct dst_entry *dst)
{
- int delayed = 0;
- int work_performed;
- struct dst_entry * dst, **dstp;
+ struct dst_entry *child = NULL;
- if (!spin_trylock(&dst_lock)) {
- mod_timer(&dst_gc_timer, jiffies + HZ/10);
- return;
- }
+ smp_rmb();
- del_timer(&dst_gc_timer);
- dstp = &dst_garbage_list;
- work_performed = 0;
- while ((dst = *dstp) != NULL) {
- if (atomic_read(&dst->__refcnt)) {
- dstp = &dst->next;
- delayed++;
- continue;
- }
- *dstp = dst->next;
- work_performed = 1;
-
- dst = dst_destroy(dst);
- if (dst) {
- /* NOHASH and still referenced. Unless it is already
- * on gc list, invalidate it and add to gc list.
- *
- * Note: this is temporary. Actually, NOHASH dst's
- * must be obsoleted when parent is obsoleted.
- * But we do not have state "obsoleted, but
- * referenced by parent", so it is right.
- */
- if (dst->obsolete > 1)
- continue;
-
- ___dst_free(dst);
- dst->next = *dstp;
- *dstp = dst;
- dstp = &dst->next;
- }
- }
- if (!dst_garbage_list) {
- dst_gc_timer_inc = DST_GC_MAX;
- goto out;
- }
- if (!work_performed) {
- if ((dst_gc_timer_expires += dst_gc_timer_inc) > DST_GC_MAX)
- dst_gc_timer_expires = DST_GC_MAX;
- dst_gc_timer_inc += DST_GC_INC;
- } else {
- dst_gc_timer_inc = DST_GC_INC;
- dst_gc_timer_expires = DST_GC_MIN;
+#ifdef CONFIG_XFRM
+ if (dst->xfrm) {
+ struct xfrm_dst *xdst = (struct xfrm_dst *) dst;
+
+ child = xdst->child;
}
-#if RT_CACHE_DEBUG >= 2
- printk("dst_total: %d/%d %ld\n",
- atomic_read(&dst_total), delayed, dst_gc_timer_expires);
#endif
- mod_timer(&dst_gc_timer, jiffies + dst_gc_timer_expires);
+ if (dst->ops->destroy)
+ dst->ops->destroy(dst);
+ netdev_put(dst->dev, &dst->dev_tracker);
+
+ lwtstate_put(dst->lwtstate);
-out:
- spin_unlock(&dst_lock);
+ if (dst->flags & DST_METADATA)
+ metadata_dst_free((struct metadata_dst *)dst);
+ else
+ kmem_cache_free(dst->ops->kmem_cachep, dst);
+
+ dst = child;
+ if (dst)
+ dst_release_immediate(dst);
}
-static int dst_discard_in(struct sk_buff *skb)
+static void dst_destroy_rcu(struct rcu_head *head)
{
- kfree_skb(skb);
- return 0;
+ struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head);
+
+ dst_destroy(dst);
}
-static int dst_discard_out(struct sk_buff *skb)
+/* Operations to mark dst as DEAD and clean up the net device referenced
+ * by dst:
+ * 1. put the dst under blackhole interface and discard all tx/rx packets
+ * on this route.
+ * 2. release the net_device
+ * This function should be called when removing routes from the fib tree
+ * in preparation for a NETDEV_DOWN/NETDEV_UNREGISTER event and also to
+ * make the next dst_ops->check() fail.
+ */
+void dst_dev_put(struct dst_entry *dst)
{
- kfree_skb(skb);
- return 0;
+ struct net_device *dev = dst->dev;
+
+ WRITE_ONCE(dst->obsolete, DST_OBSOLETE_DEAD);
+ if (dst->ops->ifdown)
+ dst->ops->ifdown(dst, dev);
+ WRITE_ONCE(dst->input, dst_discard);
+ WRITE_ONCE(dst->output, dst_discard_out);
+ rcu_assign_pointer(dst->dev_rcu, blackhole_netdev);
+ netdev_ref_replace(dev, blackhole_netdev, &dst->dev_tracker,
+ GFP_ATOMIC);
}
+EXPORT_SYMBOL(dst_dev_put);
-void * dst_alloc(struct dst_ops * ops)
+static void dst_count_dec(struct dst_entry *dst)
{
- struct dst_entry * dst;
+ if (!(dst->flags & DST_NOCOUNT))
+ dst_entries_add(dst->ops, -1);
+}
- if (ops->gc && atomic_read(&ops->entries) > ops->gc_thresh) {
- if (ops->gc())
- return NULL;
- }
- dst = kmem_cache_alloc(ops->kmem_cachep, SLAB_ATOMIC);
- if (!dst)
- return NULL;
- memset(dst, 0, ops->entry_size);
- atomic_set(&dst->__refcnt, 0);
- dst->ops = ops;
- dst->lastuse = jiffies;
- dst->path = dst;
- dst->input = dst_discard_in;
- dst->output = dst_discard_out;
-#if RT_CACHE_DEBUG >= 2
- atomic_inc(&dst_total);
+void dst_release(struct dst_entry *dst)
+{
+ if (dst && rcuref_put(&dst->__rcuref)) {
+#ifdef CONFIG_DST_CACHE
+ if (dst->flags & DST_METADATA) {
+ struct metadata_dst *md_dst = (struct metadata_dst *)dst;
+
+ if (md_dst->type == METADATA_IP_TUNNEL)
+ dst_cache_reset_now(&md_dst->u.tun_info.dst_cache);
+ }
#endif
- atomic_inc(&ops->entries);
- return dst;
+ dst_count_dec(dst);
+ call_rcu_hurry(&dst->rcu_head, dst_destroy_rcu);
+ }
}
+EXPORT_SYMBOL(dst_release);
-static void ___dst_free(struct dst_entry * dst)
+void dst_release_immediate(struct dst_entry *dst)
{
- /* The first case (dev==NULL) is required, when
- protocol module is unloaded.
- */
- if (dst->dev == NULL || !(dst->dev->flags&IFF_UP)) {
- dst->input = dst_discard_in;
- dst->output = dst_discard_out;
+ if (dst && rcuref_put(&dst->__rcuref)) {
+ dst_count_dec(dst);
+ dst_destroy(dst);
}
- dst->obsolete = 2;
}
+EXPORT_SYMBOL(dst_release_immediate);
-void __dst_free(struct dst_entry * dst)
+u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old)
{
- spin_lock_bh(&dst_lock);
- ___dst_free(dst);
- dst->next = dst_garbage_list;
- dst_garbage_list = dst;
- if (dst_gc_timer_inc > DST_GC_INC) {
- dst_gc_timer_inc = DST_GC_INC;
- dst_gc_timer_expires = DST_GC_MIN;
- mod_timer(&dst_gc_timer, jiffies + dst_gc_timer_expires);
+ struct dst_metrics *p = kmalloc(sizeof(*p), GFP_ATOMIC);
+
+ if (p) {
+ struct dst_metrics *old_p = (struct dst_metrics *)__DST_METRICS_PTR(old);
+ unsigned long prev, new;
+
+ refcount_set(&p->refcnt, 1);
+ memcpy(p->metrics, old_p->metrics, sizeof(p->metrics));
+
+ new = (unsigned long) p;
+ prev = cmpxchg(&dst->_metrics, old, new);
+
+ if (prev != old) {
+ kfree(p);
+ p = (struct dst_metrics *)__DST_METRICS_PTR(prev);
+ if (prev & DST_METRICS_READ_ONLY)
+ p = NULL;
+ } else if (prev & DST_METRICS_REFCOUNTED) {
+ if (refcount_dec_and_test(&old_p->refcnt))
+ kfree(old_p);
+ }
}
- spin_unlock_bh(&dst_lock);
+ BUILD_BUG_ON(offsetof(struct dst_metrics, metrics) != 0);
+ return (u32 *)p;
}
+EXPORT_SYMBOL(dst_cow_metrics_generic);
-struct dst_entry *dst_destroy(struct dst_entry * dst)
+/* Caller asserts that dst_metrics_read_only(dst) is false. */
+void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old)
{
- struct dst_entry *child;
- struct neighbour *neigh;
- struct hh_cache *hh;
-
- smp_rmb();
+ unsigned long prev, new;
-again:
- neigh = dst->neighbour;
- hh = dst->hh;
- child = dst->child;
+ new = ((unsigned long) &dst_default_metrics) | DST_METRICS_READ_ONLY;
+ prev = cmpxchg(&dst->_metrics, old, new);
+ if (prev == old)
+ kfree(__DST_METRICS_PTR(old));
+}
+EXPORT_SYMBOL(__dst_destroy_metrics_generic);
- dst->hh = NULL;
- if (hh && atomic_dec_and_test(&hh->hh_refcnt))
- kfree(hh);
+struct dst_entry *dst_blackhole_check(struct dst_entry *dst, u32 cookie)
+{
+ return NULL;
+}
- if (neigh) {
- dst->neighbour = NULL;
- neigh_release(neigh);
- }
+u32 *dst_blackhole_cow_metrics(struct dst_entry *dst, unsigned long old)
+{
+ return NULL;
+}
- atomic_dec(&dst->ops->entries);
+struct neighbour *dst_blackhole_neigh_lookup(const struct dst_entry *dst,
+ struct sk_buff *skb,
+ const void *daddr)
+{
+ return NULL;
+}
- if (dst->ops->destroy)
- dst->ops->destroy(dst);
- if (dst->dev)
- dev_put(dst->dev);
-#if RT_CACHE_DEBUG >= 2
- atomic_dec(&dst_total);
-#endif
- kmem_cache_free(dst->ops->kmem_cachep, dst);
+void dst_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
+ struct sk_buff *skb, u32 mtu,
+ bool confirm_neigh)
+{
+}
+EXPORT_SYMBOL_GPL(dst_blackhole_update_pmtu);
- dst = child;
- if (dst) {
- int nohash = dst->flags & DST_NOHASH;
-
- if (atomic_dec_and_test(&dst->__refcnt)) {
- /* We were real parent of this dst, so kill child. */
- if (nohash)
- goto again;
- } else {
- /* Child is still referenced, return it for freeing. */
- if (nohash)
- return dst;
- /* Child is still in his hash table */
- }
- }
- return NULL;
+void dst_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
+ struct sk_buff *skb)
+{
}
+EXPORT_SYMBOL_GPL(dst_blackhole_redirect);
-/* Dirty hack. We did it in 2.2 (in __dst_free),
- * we have _very_ good reasons not to repeat
- * this mistake in 2.3, but we have no choice
- * now. _It_ _is_ _explicit_ _deliberate_
- * _race_ _condition_.
- *
- * Commented and originally written by Alexey.
- */
-static inline void dst_ifdown(struct dst_entry *dst, struct net_device *dev,
- int unregister)
+unsigned int dst_blackhole_mtu(const struct dst_entry *dst)
{
- if (dst->ops->ifdown)
- dst->ops->ifdown(dst, dev, unregister);
-
- if (dev != dst->dev)
- return;
-
- if (!unregister) {
- dst->input = dst_discard_in;
- dst->output = dst_discard_out;
- } else {
- dst->dev = &loopback_dev;
- dev_hold(&loopback_dev);
- dev_put(dev);
- if (dst->neighbour && dst->neighbour->dev == dev) {
- dst->neighbour->dev = &loopback_dev;
- dev_put(dev);
- dev_hold(&loopback_dev);
- }
- }
+ unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
+
+ return mtu ? : dst_dev(dst)->mtu;
}
+EXPORT_SYMBOL_GPL(dst_blackhole_mtu);
+
+static struct dst_ops dst_blackhole_ops = {
+ .family = AF_UNSPEC,
+ .neigh_lookup = dst_blackhole_neigh_lookup,
+ .check = dst_blackhole_check,
+ .cow_metrics = dst_blackhole_cow_metrics,
+ .update_pmtu = dst_blackhole_update_pmtu,
+ .redirect = dst_blackhole_redirect,
+ .mtu = dst_blackhole_mtu,
+};
-static int dst_dev_event(struct notifier_block *this, unsigned long event, void *ptr)
+static void __metadata_dst_init(struct metadata_dst *md_dst,
+ enum metadata_type type, u8 optslen)
{
- struct net_device *dev = ptr;
struct dst_entry *dst;
- switch (event) {
- case NETDEV_UNREGISTER:
- case NETDEV_DOWN:
- spin_lock_bh(&dst_lock);
- for (dst = dst_garbage_list; dst; dst = dst->next) {
- dst_ifdown(dst, dev, event != NETDEV_DOWN);
- }
- spin_unlock_bh(&dst_lock);
- break;
- }
- return NOTIFY_DONE;
+ dst = &md_dst->dst;
+ dst_init(dst, &dst_blackhole_ops, NULL, DST_OBSOLETE_NONE,
+ DST_METADATA | DST_NOCOUNT);
+ memset(dst + 1, 0, sizeof(*md_dst) + optslen - sizeof(*dst));
+ md_dst->type = type;
}
-static struct notifier_block dst_dev_notifier = {
- .notifier_call = dst_dev_event,
-};
+struct metadata_dst *metadata_dst_alloc(u8 optslen, enum metadata_type type,
+ gfp_t flags)
+{
+ struct metadata_dst *md_dst;
+
+ md_dst = kmalloc(struct_size(md_dst, u.tun_info.options, optslen),
+ flags);
+ if (!md_dst)
+ return NULL;
+
+ __metadata_dst_init(md_dst, type, optslen);
+
+ return md_dst;
+}
+EXPORT_SYMBOL_GPL(metadata_dst_alloc);
+
+void metadata_dst_free(struct metadata_dst *md_dst)
+{
+#ifdef CONFIG_DST_CACHE
+ if (md_dst->type == METADATA_IP_TUNNEL)
+ dst_cache_destroy(&md_dst->u.tun_info.dst_cache);
+#endif
+ if (md_dst->type == METADATA_XFRM)
+ dst_release(md_dst->u.xfrm_info.dst_orig);
+ kfree(md_dst);
+}
+EXPORT_SYMBOL_GPL(metadata_dst_free);
-void __init dst_init(void)
+struct metadata_dst __percpu *
+metadata_dst_alloc_percpu(u8 optslen, enum metadata_type type, gfp_t flags)
{
- register_netdevice_notifier(&dst_dev_notifier);
+ int cpu;
+ struct metadata_dst __percpu *md_dst;
+
+ md_dst = __alloc_percpu_gfp(struct_size(md_dst, u.tun_info.options,
+ optslen),
+ __alignof__(struct metadata_dst), flags);
+ if (!md_dst)
+ return NULL;
+
+ for_each_possible_cpu(cpu)
+ __metadata_dst_init(per_cpu_ptr(md_dst, cpu), type, optslen);
+
+ return md_dst;
}
+EXPORT_SYMBOL_GPL(metadata_dst_alloc_percpu);
-EXPORT_SYMBOL(__dst_free);
-EXPORT_SYMBOL(dst_alloc);
-EXPORT_SYMBOL(dst_destroy);
+void metadata_dst_free_percpu(struct metadata_dst __percpu *md_dst)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct metadata_dst *one_md_dst = per_cpu_ptr(md_dst, cpu);
+
+#ifdef CONFIG_DST_CACHE
+ if (one_md_dst->type == METADATA_IP_TUNNEL)
+ dst_cache_destroy(&one_md_dst->u.tun_info.dst_cache);
+#endif
+ if (one_md_dst->type == METADATA_XFRM)
+ dst_release(one_md_dst->u.xfrm_info.dst_orig);
+ }
+ free_percpu(md_dst);
+}
+EXPORT_SYMBOL_GPL(metadata_dst_free_percpu);
diff --git a/net/core/dst_cache.c b/net/core/dst_cache.c
new file mode 100644
index 000000000000..9ab4902324e1
--- /dev/null
+++ b/net/core/dst_cache.c
@@ -0,0 +1,210 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * net/core/dst_cache.c - dst entry cache
+ *
+ * Copyright (c) 2016 Paolo Abeni <pabeni@redhat.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <net/dst_cache.h>
+#include <net/route.h>
+#if IS_ENABLED(CONFIG_IPV6)
+#include <net/ip6_fib.h>
+#endif
+#include <uapi/linux/in.h>
+
+struct dst_cache_pcpu {
+ unsigned long refresh_ts;
+ struct dst_entry *dst;
+ local_lock_t bh_lock;
+ u32 cookie;
+ union {
+ struct in_addr in_saddr;
+ struct in6_addr in6_saddr;
+ };
+};
+
+static void dst_cache_per_cpu_dst_set(struct dst_cache_pcpu *dst_cache,
+ struct dst_entry *dst, u32 cookie)
+{
+ DEBUG_NET_WARN_ON_ONCE(!in_softirq());
+ dst_release(dst_cache->dst);
+ if (dst)
+ dst_hold(dst);
+
+ dst_cache->cookie = cookie;
+ dst_cache->dst = dst;
+}
+
+static struct dst_entry *dst_cache_per_cpu_get(struct dst_cache *dst_cache,
+ struct dst_cache_pcpu *idst)
+{
+ struct dst_entry *dst;
+
+ DEBUG_NET_WARN_ON_ONCE(!in_softirq());
+ dst = idst->dst;
+ if (!dst)
+ goto fail;
+
+ /* the cache already hold a dst reference; it can't go away */
+ dst_hold(dst);
+
+ if (unlikely(!time_after(idst->refresh_ts,
+ READ_ONCE(dst_cache->reset_ts)) ||
+ (READ_ONCE(dst->obsolete) && !dst->ops->check(dst, idst->cookie)))) {
+ dst_cache_per_cpu_dst_set(idst, NULL, 0);
+ dst_release(dst);
+ goto fail;
+ }
+ return dst;
+
+fail:
+ idst->refresh_ts = jiffies;
+ return NULL;
+}
+
+struct dst_entry *dst_cache_get(struct dst_cache *dst_cache)
+{
+ struct dst_entry *dst;
+
+ if (!dst_cache->cache)
+ return NULL;
+
+ local_lock_nested_bh(&dst_cache->cache->bh_lock);
+ dst = dst_cache_per_cpu_get(dst_cache, this_cpu_ptr(dst_cache->cache));
+ local_unlock_nested_bh(&dst_cache->cache->bh_lock);
+ return dst;
+}
+EXPORT_SYMBOL_GPL(dst_cache_get);
+
+struct rtable *dst_cache_get_ip4(struct dst_cache *dst_cache, __be32 *saddr)
+{
+ struct dst_cache_pcpu *idst;
+ struct dst_entry *dst;
+
+ if (!dst_cache->cache)
+ return NULL;
+
+ local_lock_nested_bh(&dst_cache->cache->bh_lock);
+ idst = this_cpu_ptr(dst_cache->cache);
+ dst = dst_cache_per_cpu_get(dst_cache, idst);
+ if (!dst) {
+ local_unlock_nested_bh(&dst_cache->cache->bh_lock);
+ return NULL;
+ }
+
+ *saddr = idst->in_saddr.s_addr;
+ local_unlock_nested_bh(&dst_cache->cache->bh_lock);
+ return dst_rtable(dst);
+}
+EXPORT_SYMBOL_GPL(dst_cache_get_ip4);
+
+void dst_cache_set_ip4(struct dst_cache *dst_cache, struct dst_entry *dst,
+ __be32 saddr)
+{
+ struct dst_cache_pcpu *idst;
+
+ if (!dst_cache->cache)
+ return;
+
+ local_lock_nested_bh(&dst_cache->cache->bh_lock);
+ idst = this_cpu_ptr(dst_cache->cache);
+ dst_cache_per_cpu_dst_set(idst, dst, 0);
+ idst->in_saddr.s_addr = saddr;
+ local_unlock_nested_bh(&dst_cache->cache->bh_lock);
+}
+EXPORT_SYMBOL_GPL(dst_cache_set_ip4);
+
+#if IS_ENABLED(CONFIG_IPV6)
+void dst_cache_set_ip6(struct dst_cache *dst_cache, struct dst_entry *dst,
+ const struct in6_addr *saddr)
+{
+ struct dst_cache_pcpu *idst;
+
+ if (!dst_cache->cache)
+ return;
+
+ local_lock_nested_bh(&dst_cache->cache->bh_lock);
+
+ idst = this_cpu_ptr(dst_cache->cache);
+ dst_cache_per_cpu_dst_set(idst, dst,
+ rt6_get_cookie(dst_rt6_info(dst)));
+ idst->in6_saddr = *saddr;
+ local_unlock_nested_bh(&dst_cache->cache->bh_lock);
+}
+EXPORT_SYMBOL_GPL(dst_cache_set_ip6);
+
+struct dst_entry *dst_cache_get_ip6(struct dst_cache *dst_cache,
+ struct in6_addr *saddr)
+{
+ struct dst_cache_pcpu *idst;
+ struct dst_entry *dst;
+
+ if (!dst_cache->cache)
+ return NULL;
+
+ local_lock_nested_bh(&dst_cache->cache->bh_lock);
+
+ idst = this_cpu_ptr(dst_cache->cache);
+ dst = dst_cache_per_cpu_get(dst_cache, idst);
+ if (!dst) {
+ local_unlock_nested_bh(&dst_cache->cache->bh_lock);
+ return NULL;
+ }
+
+ *saddr = idst->in6_saddr;
+ local_unlock_nested_bh(&dst_cache->cache->bh_lock);
+ return dst;
+}
+EXPORT_SYMBOL_GPL(dst_cache_get_ip6);
+#endif
+
+int dst_cache_init(struct dst_cache *dst_cache, gfp_t gfp)
+{
+ unsigned int i;
+
+ dst_cache->cache = alloc_percpu_gfp(struct dst_cache_pcpu,
+ gfp | __GFP_ZERO);
+ if (!dst_cache->cache)
+ return -ENOMEM;
+ for_each_possible_cpu(i)
+ local_lock_init(&per_cpu_ptr(dst_cache->cache, i)->bh_lock);
+
+ dst_cache_reset(dst_cache);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(dst_cache_init);
+
+void dst_cache_destroy(struct dst_cache *dst_cache)
+{
+ int i;
+
+ if (!dst_cache->cache)
+ return;
+
+ for_each_possible_cpu(i)
+ dst_release(per_cpu_ptr(dst_cache->cache, i)->dst);
+
+ free_percpu(dst_cache->cache);
+}
+EXPORT_SYMBOL_GPL(dst_cache_destroy);
+
+void dst_cache_reset_now(struct dst_cache *dst_cache)
+{
+ int i;
+
+ if (!dst_cache->cache)
+ return;
+
+ dst_cache_reset(dst_cache);
+ for_each_possible_cpu(i) {
+ struct dst_cache_pcpu *idst = per_cpu_ptr(dst_cache->cache, i);
+ struct dst_entry *dst = idst->dst;
+
+ idst->cookie = 0;
+ idst->dst = NULL;
+ dst_release(dst);
+ }
+}
+EXPORT_SYMBOL_GPL(dst_cache_reset_now);
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
deleted file mode 100644
index 87dc556fd9d6..000000000000
--- a/net/core/ethtool.c
+++ /dev/null
@@ -1,984 +0,0 @@
-/*
- * net/core/ethtool.c - Ethtool ioctl handler
- * Copyright (c) 2003 Matthew Wilcox <matthew@wil.cx>
- *
- * This file is where we call all the ethtool_ops commands to get
- * the information ethtool needs. We fall back to calling do_ioctl()
- * for drivers which haven't been converted to ethtool_ops yet.
- *
- * It's GPL, stupid.
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/capability.h>
-#include <linux/errno.h>
-#include <linux/ethtool.h>
-#include <linux/netdevice.h>
-#include <asm/uaccess.h>
-
-/*
- * Some useful ethtool_ops methods that're device independent.
- * If we find that all drivers want to do the same thing here,
- * we can turn these into dev_() function calls.
- */
-
-u32 ethtool_op_get_link(struct net_device *dev)
-{
- return netif_carrier_ok(dev) ? 1 : 0;
-}
-
-u32 ethtool_op_get_tx_csum(struct net_device *dev)
-{
- return (dev->features & NETIF_F_ALL_CSUM) != 0;
-}
-
-int ethtool_op_set_tx_csum(struct net_device *dev, u32 data)
-{
- if (data)
- dev->features |= NETIF_F_IP_CSUM;
- else
- dev->features &= ~NETIF_F_IP_CSUM;
-
- return 0;
-}
-
-int ethtool_op_set_tx_hw_csum(struct net_device *dev, u32 data)
-{
- if (data)
- dev->features |= NETIF_F_HW_CSUM;
- else
- dev->features &= ~NETIF_F_HW_CSUM;
-
- return 0;
-}
-u32 ethtool_op_get_sg(struct net_device *dev)
-{
- return (dev->features & NETIF_F_SG) != 0;
-}
-
-int ethtool_op_set_sg(struct net_device *dev, u32 data)
-{
- if (data)
- dev->features |= NETIF_F_SG;
- else
- dev->features &= ~NETIF_F_SG;
-
- return 0;
-}
-
-u32 ethtool_op_get_tso(struct net_device *dev)
-{
- return (dev->features & NETIF_F_TSO) != 0;
-}
-
-int ethtool_op_set_tso(struct net_device *dev, u32 data)
-{
- if (data)
- dev->features |= NETIF_F_TSO;
- else
- dev->features &= ~NETIF_F_TSO;
-
- return 0;
-}
-
-int ethtool_op_get_perm_addr(struct net_device *dev, struct ethtool_perm_addr *addr, u8 *data)
-{
- unsigned char len = dev->addr_len;
- if ( addr->size < len )
- return -ETOOSMALL;
-
- addr->size = len;
- memcpy(data, dev->perm_addr, len);
- return 0;
-}
-
-
-u32 ethtool_op_get_ufo(struct net_device *dev)
-{
- return (dev->features & NETIF_F_UFO) != 0;
-}
-
-int ethtool_op_set_ufo(struct net_device *dev, u32 data)
-{
- if (data)
- dev->features |= NETIF_F_UFO;
- else
- dev->features &= ~NETIF_F_UFO;
- return 0;
-}
-
-/* Handlers for each ethtool command */
-
-static int ethtool_get_settings(struct net_device *dev, void __user *useraddr)
-{
- struct ethtool_cmd cmd = { ETHTOOL_GSET };
- int err;
-
- if (!dev->ethtool_ops->get_settings)
- return -EOPNOTSUPP;
-
- err = dev->ethtool_ops->get_settings(dev, &cmd);
- if (err < 0)
- return err;
-
- if (copy_to_user(useraddr, &cmd, sizeof(cmd)))
- return -EFAULT;
- return 0;
-}
-
-static int ethtool_set_settings(struct net_device *dev, void __user *useraddr)
-{
- struct ethtool_cmd cmd;
-
- if (!dev->ethtool_ops->set_settings)
- return -EOPNOTSUPP;
-
- if (copy_from_user(&cmd, useraddr, sizeof(cmd)))
- return -EFAULT;
-
- return dev->ethtool_ops->set_settings(dev, &cmd);
-}
-
-static int ethtool_get_drvinfo(struct net_device *dev, void __user *useraddr)
-{
- struct ethtool_drvinfo info;
- const struct ethtool_ops *ops = dev->ethtool_ops;
-
- if (!ops->get_drvinfo)
- return -EOPNOTSUPP;
-
- memset(&info, 0, sizeof(info));
- info.cmd = ETHTOOL_GDRVINFO;
- ops->get_drvinfo(dev, &info);
-
- if (ops->self_test_count)
- info.testinfo_len = ops->self_test_count(dev);
- if (ops->get_stats_count)
- info.n_stats = ops->get_stats_count(dev);
- if (ops->get_regs_len)
- info.regdump_len = ops->get_regs_len(dev);
- if (ops->get_eeprom_len)
- info.eedump_len = ops->get_eeprom_len(dev);
-
- if (copy_to_user(useraddr, &info, sizeof(info)))
- return -EFAULT;
- return 0;
-}
-
-static int ethtool_get_regs(struct net_device *dev, char __user *useraddr)
-{
- struct ethtool_regs regs;
- const struct ethtool_ops *ops = dev->ethtool_ops;
- void *regbuf;
- int reglen, ret;
-
- if (!ops->get_regs || !ops->get_regs_len)
- return -EOPNOTSUPP;
-
- if (copy_from_user(&regs, useraddr, sizeof(regs)))
- return -EFAULT;
-
- reglen = ops->get_regs_len(dev);
- if (regs.len > reglen)
- regs.len = reglen;
-
- regbuf = kmalloc(reglen, GFP_USER);
- if (!regbuf)
- return -ENOMEM;
-
- ops->get_regs(dev, &regs, regbuf);
-
- ret = -EFAULT;
- if (copy_to_user(useraddr, &regs, sizeof(regs)))
- goto out;
- useraddr += offsetof(struct ethtool_regs, data);
- if (copy_to_user(useraddr, regbuf, regs.len))
- goto out;
- ret = 0;
-
- out:
- kfree(regbuf);
- return ret;
-}
-
-static int ethtool_get_wol(struct net_device *dev, char __user *useraddr)
-{
- struct ethtool_wolinfo wol = { ETHTOOL_GWOL };
-
- if (!dev->ethtool_ops->get_wol)
- return -EOPNOTSUPP;
-
- dev->ethtool_ops->get_wol(dev, &wol);
-
- if (copy_to_user(useraddr, &wol, sizeof(wol)))
- return -EFAULT;
- return 0;
-}
-
-static int ethtool_set_wol(struct net_device *dev, char __user *useraddr)
-{
- struct ethtool_wolinfo wol;
-
- if (!dev->ethtool_ops->set_wol)
- return -EOPNOTSUPP;
-
- if (copy_from_user(&wol, useraddr, sizeof(wol)))
- return -EFAULT;
-
- return dev->ethtool_ops->set_wol(dev, &wol);
-}
-
-static int ethtool_get_msglevel(struct net_device *dev, char __user *useraddr)
-{
- struct ethtool_value edata = { ETHTOOL_GMSGLVL };
-
- if (!dev->ethtool_ops->get_msglevel)
- return -EOPNOTSUPP;
-
- edata.data = dev->ethtool_ops->get_msglevel(dev);
-
- if (copy_to_user(useraddr, &edata, sizeof(edata)))
- return -EFAULT;
- return 0;
-}
-
-static int ethtool_set_msglevel(struct net_device *dev, char __user *useraddr)
-{
- struct ethtool_value edata;
-
- if (!dev->ethtool_ops->set_msglevel)
- return -EOPNOTSUPP;
-
- if (copy_from_user(&edata, useraddr, sizeof(edata)))
- return -EFAULT;
-
- dev->ethtool_ops->set_msglevel(dev, edata.data);
- return 0;
-}
-
-static int ethtool_nway_reset(struct net_device *dev)
-{
- if (!dev->ethtool_ops->nway_reset)
- return -EOPNOTSUPP;
-
- return dev->ethtool_ops->nway_reset(dev);
-}
-
-static int ethtool_get_link(struct net_device *dev, void __user *useraddr)
-{
- struct ethtool_value edata = { ETHTOOL_GLINK };
-
- if (!dev->ethtool_ops->get_link)
- return -EOPNOTSUPP;
-
- edata.data = dev->ethtool_ops->get_link(dev);
-
- if (copy_to_user(useraddr, &edata, sizeof(edata)))
- return -EFAULT;
- return 0;
-}
-
-static int ethtool_get_eeprom(struct net_device *dev, void __user *useraddr)
-{
- struct ethtool_eeprom eeprom;
- const struct ethtool_ops *ops = dev->ethtool_ops;
- u8 *data;
- int ret;
-
- if (!ops->get_eeprom || !ops->get_eeprom_len)
- return -EOPNOTSUPP;
-
- if (copy_from_user(&eeprom, useraddr, sizeof(eeprom)))
- return -EFAULT;
-
- /* Check for wrap and zero */
- if (eeprom.offset + eeprom.len <= eeprom.offset)
- return -EINVAL;
-
- /* Check for exceeding total eeprom len */
- if (eeprom.offset + eeprom.len > ops->get_eeprom_len(dev))
- return -EINVAL;
-
- data = kmalloc(eeprom.len, GFP_USER);
- if (!data)
- return -ENOMEM;
-
- ret = -EFAULT;
- if (copy_from_user(data, useraddr + sizeof(eeprom), eeprom.len))
- goto out;
-
- ret = ops->get_eeprom(dev, &eeprom, data);
- if (ret)
- goto out;
-
- ret = -EFAULT;
- if (copy_to_user(useraddr, &eeprom, sizeof(eeprom)))
- goto out;
- if (copy_to_user(useraddr + sizeof(eeprom), data, eeprom.len))
- goto out;
- ret = 0;
-
- out:
- kfree(data);
- return ret;
-}
-
-static int ethtool_set_eeprom(struct net_device *dev, void __user *useraddr)
-{
- struct ethtool_eeprom eeprom;
- const struct ethtool_ops *ops = dev->ethtool_ops;
- u8 *data;
- int ret;
-
- if (!ops->set_eeprom || !ops->get_eeprom_len)
- return -EOPNOTSUPP;
-
- if (copy_from_user(&eeprom, useraddr, sizeof(eeprom)))
- return -EFAULT;
-
- /* Check for wrap and zero */
- if (eeprom.offset + eeprom.len <= eeprom.offset)
- return -EINVAL;
-
- /* Check for exceeding total eeprom len */
- if (eeprom.offset + eeprom.len > ops->get_eeprom_len(dev))
- return -EINVAL;
-
- data = kmalloc(eeprom.len, GFP_USER);
- if (!data)
- return -ENOMEM;
-
- ret = -EFAULT;
- if (copy_from_user(data, useraddr + sizeof(eeprom), eeprom.len))
- goto out;
-
- ret = ops->set_eeprom(dev, &eeprom, data);
- if (ret)
- goto out;
-
- if (copy_to_user(useraddr + sizeof(eeprom), data, eeprom.len))
- ret = -EFAULT;
-
- out:
- kfree(data);
- return ret;
-}
-
-static int ethtool_get_coalesce(struct net_device *dev, void __user *useraddr)
-{
- struct ethtool_coalesce coalesce = { ETHTOOL_GCOALESCE };
-
- if (!dev->ethtool_ops->get_coalesce)
- return -EOPNOTSUPP;
-
- dev->ethtool_ops->get_coalesce(dev, &coalesce);
-
- if (copy_to_user(useraddr, &coalesce, sizeof(coalesce)))
- return -EFAULT;
- return 0;
-}
-
-static int ethtool_set_coalesce(struct net_device *dev, void __user *useraddr)
-{
- struct ethtool_coalesce coalesce;
-
- if (!dev->ethtool_ops->set_coalesce)
- return -EOPNOTSUPP;
-
- if (copy_from_user(&coalesce, useraddr, sizeof(coalesce)))
- return -EFAULT;
-
- return dev->ethtool_ops->set_coalesce(dev, &coalesce);
-}
-
-static int ethtool_get_ringparam(struct net_device *dev, void __user *useraddr)
-{
- struct ethtool_ringparam ringparam = { ETHTOOL_GRINGPARAM };
-
- if (!dev->ethtool_ops->get_ringparam)
- return -EOPNOTSUPP;
-
- dev->ethtool_ops->get_ringparam(dev, &ringparam);
-
- if (copy_to_user(useraddr, &ringparam, sizeof(ringparam)))
- return -EFAULT;
- return 0;
-}
-
-static int ethtool_set_ringparam(struct net_device *dev, void __user *useraddr)
-{
- struct ethtool_ringparam ringparam;
-
- if (!dev->ethtool_ops->set_ringparam)
- return -EOPNOTSUPP;
-
- if (copy_from_user(&ringparam, useraddr, sizeof(ringparam)))
- return -EFAULT;
-
- return dev->ethtool_ops->set_ringparam(dev, &ringparam);
-}
-
-static int ethtool_get_pauseparam(struct net_device *dev, void __user *useraddr)
-{
- struct ethtool_pauseparam pauseparam = { ETHTOOL_GPAUSEPARAM };
-
- if (!dev->ethtool_ops->get_pauseparam)
- return -EOPNOTSUPP;
-
- dev->ethtool_ops->get_pauseparam(dev, &pauseparam);
-
- if (copy_to_user(useraddr, &pauseparam, sizeof(pauseparam)))
- return -EFAULT;
- return 0;
-}
-
-static int ethtool_set_pauseparam(struct net_device *dev, void __user *useraddr)
-{
- struct ethtool_pauseparam pauseparam;
-
- if (!dev->ethtool_ops->set_pauseparam)
- return -EOPNOTSUPP;
-
- if (copy_from_user(&pauseparam, useraddr, sizeof(pauseparam)))
- return -EFAULT;
-
- return dev->ethtool_ops->set_pauseparam(dev, &pauseparam);
-}
-
-static int ethtool_get_rx_csum(struct net_device *dev, char __user *useraddr)
-{
- struct ethtool_value edata = { ETHTOOL_GRXCSUM };
-
- if (!dev->ethtool_ops->get_rx_csum)
- return -EOPNOTSUPP;
-
- edata.data = dev->ethtool_ops->get_rx_csum(dev);
-
- if (copy_to_user(useraddr, &edata, sizeof(edata)))
- return -EFAULT;
- return 0;
-}
-
-static int ethtool_set_rx_csum(struct net_device *dev, char __user *useraddr)
-{
- struct ethtool_value edata;
-
- if (!dev->ethtool_ops->set_rx_csum)
- return -EOPNOTSUPP;
-
- if (copy_from_user(&edata, useraddr, sizeof(edata)))
- return -EFAULT;
-
- dev->ethtool_ops->set_rx_csum(dev, edata.data);
- return 0;
-}
-
-static int ethtool_get_tx_csum(struct net_device *dev, char __user *useraddr)
-{
- struct ethtool_value edata = { ETHTOOL_GTXCSUM };
-
- if (!dev->ethtool_ops->get_tx_csum)
- return -EOPNOTSUPP;
-
- edata.data = dev->ethtool_ops->get_tx_csum(dev);
-
- if (copy_to_user(useraddr, &edata, sizeof(edata)))
- return -EFAULT;
- return 0;
-}
-
-static int __ethtool_set_sg(struct net_device *dev, u32 data)
-{
- int err;
-
- if (!data && dev->ethtool_ops->set_tso) {
- err = dev->ethtool_ops->set_tso(dev, 0);
- if (err)
- return err;
- }
-
- if (!data && dev->ethtool_ops->set_ufo) {
- err = dev->ethtool_ops->set_ufo(dev, 0);
- if (err)
- return err;
- }
- return dev->ethtool_ops->set_sg(dev, data);
-}
-
-static int ethtool_set_tx_csum(struct net_device *dev, char __user *useraddr)
-{
- struct ethtool_value edata;
- int err;
-
- if (!dev->ethtool_ops->set_tx_csum)
- return -EOPNOTSUPP;
-
- if (copy_from_user(&edata, useraddr, sizeof(edata)))
- return -EFAULT;
-
- if (!edata.data && dev->ethtool_ops->set_sg) {
- err = __ethtool_set_sg(dev, 0);
- if (err)
- return err;
- }
-
- return dev->ethtool_ops->set_tx_csum(dev, edata.data);
-}
-
-static int ethtool_get_sg(struct net_device *dev, char __user *useraddr)
-{
- struct ethtool_value edata = { ETHTOOL_GSG };
-
- if (!dev->ethtool_ops->get_sg)
- return -EOPNOTSUPP;
-
- edata.data = dev->ethtool_ops->get_sg(dev);
-
- if (copy_to_user(useraddr, &edata, sizeof(edata)))
- return -EFAULT;
- return 0;
-}
-
-static int ethtool_set_sg(struct net_device *dev, char __user *useraddr)
-{
- struct ethtool_value edata;
-
- if (!dev->ethtool_ops->set_sg)
- return -EOPNOTSUPP;
-
- if (copy_from_user(&edata, useraddr, sizeof(edata)))
- return -EFAULT;
-
- if (edata.data &&
- !(dev->features & NETIF_F_ALL_CSUM))
- return -EINVAL;
-
- return __ethtool_set_sg(dev, edata.data);
-}
-
-static int ethtool_get_tso(struct net_device *dev, char __user *useraddr)
-{
- struct ethtool_value edata = { ETHTOOL_GTSO };
-
- if (!dev->ethtool_ops->get_tso)
- return -EOPNOTSUPP;
-
- edata.data = dev->ethtool_ops->get_tso(dev);
-
- if (copy_to_user(useraddr, &edata, sizeof(edata)))
- return -EFAULT;
- return 0;
-}
-
-static int ethtool_set_tso(struct net_device *dev, char __user *useraddr)
-{
- struct ethtool_value edata;
-
- if (!dev->ethtool_ops->set_tso)
- return -EOPNOTSUPP;
-
- if (copy_from_user(&edata, useraddr, sizeof(edata)))
- return -EFAULT;
-
- if (edata.data && !(dev->features & NETIF_F_SG))
- return -EINVAL;
-
- return dev->ethtool_ops->set_tso(dev, edata.data);
-}
-
-static int ethtool_get_ufo(struct net_device *dev, char __user *useraddr)
-{
- struct ethtool_value edata = { ETHTOOL_GUFO };
-
- if (!dev->ethtool_ops->get_ufo)
- return -EOPNOTSUPP;
- edata.data = dev->ethtool_ops->get_ufo(dev);
- if (copy_to_user(useraddr, &edata, sizeof(edata)))
- return -EFAULT;
- return 0;
-}
-
-static int ethtool_set_ufo(struct net_device *dev, char __user *useraddr)
-{
- struct ethtool_value edata;
-
- if (!dev->ethtool_ops->set_ufo)
- return -EOPNOTSUPP;
- if (copy_from_user(&edata, useraddr, sizeof(edata)))
- return -EFAULT;
- if (edata.data && !(dev->features & NETIF_F_SG))
- return -EINVAL;
- if (edata.data && !(dev->features & NETIF_F_HW_CSUM))
- return -EINVAL;
- return dev->ethtool_ops->set_ufo(dev, edata.data);
-}
-
-static int ethtool_get_gso(struct net_device *dev, char __user *useraddr)
-{
- struct ethtool_value edata = { ETHTOOL_GGSO };
-
- edata.data = dev->features & NETIF_F_GSO;
- if (copy_to_user(useraddr, &edata, sizeof(edata)))
- return -EFAULT;
- return 0;
-}
-
-static int ethtool_set_gso(struct net_device *dev, char __user *useraddr)
-{
- struct ethtool_value edata;
-
- if (copy_from_user(&edata, useraddr, sizeof(edata)))
- return -EFAULT;
- if (edata.data)
- dev->features |= NETIF_F_GSO;
- else
- dev->features &= ~NETIF_F_GSO;
- return 0;
-}
-
-static int ethtool_self_test(struct net_device *dev, char __user *useraddr)
-{
- struct ethtool_test test;
- const struct ethtool_ops *ops = dev->ethtool_ops;
- u64 *data;
- int ret;
-
- if (!ops->self_test || !ops->self_test_count)
- return -EOPNOTSUPP;
-
- if (copy_from_user(&test, useraddr, sizeof(test)))
- return -EFAULT;
-
- test.len = ops->self_test_count(dev);
- data = kmalloc(test.len * sizeof(u64), GFP_USER);
- if (!data)
- return -ENOMEM;
-
- ops->self_test(dev, &test, data);
-
- ret = -EFAULT;
- if (copy_to_user(useraddr, &test, sizeof(test)))
- goto out;
- useraddr += sizeof(test);
- if (copy_to_user(useraddr, data, test.len * sizeof(u64)))
- goto out;
- ret = 0;
-
- out:
- kfree(data);
- return ret;
-}
-
-static int ethtool_get_strings(struct net_device *dev, void __user *useraddr)
-{
- struct ethtool_gstrings gstrings;
- const struct ethtool_ops *ops = dev->ethtool_ops;
- u8 *data;
- int ret;
-
- if (!ops->get_strings)
- return -EOPNOTSUPP;
-
- if (copy_from_user(&gstrings, useraddr, sizeof(gstrings)))
- return -EFAULT;
-
- switch (gstrings.string_set) {
- case ETH_SS_TEST:
- if (!ops->self_test_count)
- return -EOPNOTSUPP;
- gstrings.len = ops->self_test_count(dev);
- break;
- case ETH_SS_STATS:
- if (!ops->get_stats_count)
- return -EOPNOTSUPP;
- gstrings.len = ops->get_stats_count(dev);
- break;
- default:
- return -EINVAL;
- }
-
- data = kmalloc(gstrings.len * ETH_GSTRING_LEN, GFP_USER);
- if (!data)
- return -ENOMEM;
-
- ops->get_strings(dev, gstrings.string_set, data);
-
- ret = -EFAULT;
- if (copy_to_user(useraddr, &gstrings, sizeof(gstrings)))
- goto out;
- useraddr += sizeof(gstrings);
- if (copy_to_user(useraddr, data, gstrings.len * ETH_GSTRING_LEN))
- goto out;
- ret = 0;
-
- out:
- kfree(data);
- return ret;
-}
-
-static int ethtool_phys_id(struct net_device *dev, void __user *useraddr)
-{
- struct ethtool_value id;
-
- if (!dev->ethtool_ops->phys_id)
- return -EOPNOTSUPP;
-
- if (copy_from_user(&id, useraddr, sizeof(id)))
- return -EFAULT;
-
- return dev->ethtool_ops->phys_id(dev, id.data);
-}
-
-static int ethtool_get_stats(struct net_device *dev, void __user *useraddr)
-{
- struct ethtool_stats stats;
- const struct ethtool_ops *ops = dev->ethtool_ops;
- u64 *data;
- int ret;
-
- if (!ops->get_ethtool_stats || !ops->get_stats_count)
- return -EOPNOTSUPP;
-
- if (copy_from_user(&stats, useraddr, sizeof(stats)))
- return -EFAULT;
-
- stats.n_stats = ops->get_stats_count(dev);
- data = kmalloc(stats.n_stats * sizeof(u64), GFP_USER);
- if (!data)
- return -ENOMEM;
-
- ops->get_ethtool_stats(dev, &stats, data);
-
- ret = -EFAULT;
- if (copy_to_user(useraddr, &stats, sizeof(stats)))
- goto out;
- useraddr += sizeof(stats);
- if (copy_to_user(useraddr, data, stats.n_stats * sizeof(u64)))
- goto out;
- ret = 0;
-
- out:
- kfree(data);
- return ret;
-}
-
-static int ethtool_get_perm_addr(struct net_device *dev, void __user *useraddr)
-{
- struct ethtool_perm_addr epaddr;
- u8 *data;
- int ret;
-
- if (!dev->ethtool_ops->get_perm_addr)
- return -EOPNOTSUPP;
-
- if (copy_from_user(&epaddr,useraddr,sizeof(epaddr)))
- return -EFAULT;
-
- data = kmalloc(epaddr.size, GFP_USER);
- if (!data)
- return -ENOMEM;
-
- ret = dev->ethtool_ops->get_perm_addr(dev,&epaddr,data);
- if (ret)
- return ret;
-
- ret = -EFAULT;
- if (copy_to_user(useraddr, &epaddr, sizeof(epaddr)))
- goto out;
- useraddr += sizeof(epaddr);
- if (copy_to_user(useraddr, data, epaddr.size))
- goto out;
- ret = 0;
-
- out:
- kfree(data);
- return ret;
-}
-
-/* The main entry point in this file. Called from net/core/dev.c */
-
-int dev_ethtool(struct ifreq *ifr)
-{
- struct net_device *dev = __dev_get_by_name(ifr->ifr_name);
- void __user *useraddr = ifr->ifr_data;
- u32 ethcmd;
- int rc;
- unsigned long old_features;
-
- if (!dev || !netif_device_present(dev))
- return -ENODEV;
-
- if (!dev->ethtool_ops)
- goto ioctl;
-
- if (copy_from_user(&ethcmd, useraddr, sizeof (ethcmd)))
- return -EFAULT;
-
- /* Allow some commands to be done by anyone */
- switch(ethcmd) {
- case ETHTOOL_GDRVINFO:
- case ETHTOOL_GMSGLVL:
- case ETHTOOL_GCOALESCE:
- case ETHTOOL_GRINGPARAM:
- case ETHTOOL_GPAUSEPARAM:
- case ETHTOOL_GRXCSUM:
- case ETHTOOL_GTXCSUM:
- case ETHTOOL_GSG:
- case ETHTOOL_GSTRINGS:
- case ETHTOOL_GTSO:
- case ETHTOOL_GPERMADDR:
- case ETHTOOL_GUFO:
- case ETHTOOL_GGSO:
- break;
- default:
- if (!capable(CAP_NET_ADMIN))
- return -EPERM;
- }
-
- if(dev->ethtool_ops->begin)
- if ((rc = dev->ethtool_ops->begin(dev)) < 0)
- return rc;
-
- old_features = dev->features;
-
- switch (ethcmd) {
- case ETHTOOL_GSET:
- rc = ethtool_get_settings(dev, useraddr);
- break;
- case ETHTOOL_SSET:
- rc = ethtool_set_settings(dev, useraddr);
- break;
- case ETHTOOL_GDRVINFO:
- rc = ethtool_get_drvinfo(dev, useraddr);
- break;
- case ETHTOOL_GREGS:
- rc = ethtool_get_regs(dev, useraddr);
- break;
- case ETHTOOL_GWOL:
- rc = ethtool_get_wol(dev, useraddr);
- break;
- case ETHTOOL_SWOL:
- rc = ethtool_set_wol(dev, useraddr);
- break;
- case ETHTOOL_GMSGLVL:
- rc = ethtool_get_msglevel(dev, useraddr);
- break;
- case ETHTOOL_SMSGLVL:
- rc = ethtool_set_msglevel(dev, useraddr);
- break;
- case ETHTOOL_NWAY_RST:
- rc = ethtool_nway_reset(dev);
- break;
- case ETHTOOL_GLINK:
- rc = ethtool_get_link(dev, useraddr);
- break;
- case ETHTOOL_GEEPROM:
- rc = ethtool_get_eeprom(dev, useraddr);
- break;
- case ETHTOOL_SEEPROM:
- rc = ethtool_set_eeprom(dev, useraddr);
- break;
- case ETHTOOL_GCOALESCE:
- rc = ethtool_get_coalesce(dev, useraddr);
- break;
- case ETHTOOL_SCOALESCE:
- rc = ethtool_set_coalesce(dev, useraddr);
- break;
- case ETHTOOL_GRINGPARAM:
- rc = ethtool_get_ringparam(dev, useraddr);
- break;
- case ETHTOOL_SRINGPARAM:
- rc = ethtool_set_ringparam(dev, useraddr);
- break;
- case ETHTOOL_GPAUSEPARAM:
- rc = ethtool_get_pauseparam(dev, useraddr);
- break;
- case ETHTOOL_SPAUSEPARAM:
- rc = ethtool_set_pauseparam(dev, useraddr);
- break;
- case ETHTOOL_GRXCSUM:
- rc = ethtool_get_rx_csum(dev, useraddr);
- break;
- case ETHTOOL_SRXCSUM:
- rc = ethtool_set_rx_csum(dev, useraddr);
- break;
- case ETHTOOL_GTXCSUM:
- rc = ethtool_get_tx_csum(dev, useraddr);
- break;
- case ETHTOOL_STXCSUM:
- rc = ethtool_set_tx_csum(dev, useraddr);
- break;
- case ETHTOOL_GSG:
- rc = ethtool_get_sg(dev, useraddr);
- break;
- case ETHTOOL_SSG:
- rc = ethtool_set_sg(dev, useraddr);
- break;
- case ETHTOOL_GTSO:
- rc = ethtool_get_tso(dev, useraddr);
- break;
- case ETHTOOL_STSO:
- rc = ethtool_set_tso(dev, useraddr);
- break;
- case ETHTOOL_TEST:
- rc = ethtool_self_test(dev, useraddr);
- break;
- case ETHTOOL_GSTRINGS:
- rc = ethtool_get_strings(dev, useraddr);
- break;
- case ETHTOOL_PHYS_ID:
- rc = ethtool_phys_id(dev, useraddr);
- break;
- case ETHTOOL_GSTATS:
- rc = ethtool_get_stats(dev, useraddr);
- break;
- case ETHTOOL_GPERMADDR:
- rc = ethtool_get_perm_addr(dev, useraddr);
- break;
- case ETHTOOL_GUFO:
- rc = ethtool_get_ufo(dev, useraddr);
- break;
- case ETHTOOL_SUFO:
- rc = ethtool_set_ufo(dev, useraddr);
- break;
- case ETHTOOL_GGSO:
- rc = ethtool_get_gso(dev, useraddr);
- break;
- case ETHTOOL_SGSO:
- rc = ethtool_set_gso(dev, useraddr);
- break;
- default:
- rc = -EOPNOTSUPP;
- }
-
- if(dev->ethtool_ops->complete)
- dev->ethtool_ops->complete(dev);
-
- if (old_features != dev->features)
- netdev_features_change(dev);
-
- return rc;
-
- ioctl:
- /* Keep existing behaviour for the moment. */
- if (!capable(CAP_NET_ADMIN))
- return -EPERM;
-
- if (dev->do_ioctl)
- return dev->do_ioctl(dev, ifr, SIOCETHTOOL);
- return -EOPNOTSUPP;
-}
-
-EXPORT_SYMBOL(dev_ethtool);
-EXPORT_SYMBOL(ethtool_op_get_link);
-EXPORT_SYMBOL_GPL(ethtool_op_get_perm_addr);
-EXPORT_SYMBOL(ethtool_op_get_sg);
-EXPORT_SYMBOL(ethtool_op_get_tso);
-EXPORT_SYMBOL(ethtool_op_get_tx_csum);
-EXPORT_SYMBOL(ethtool_op_set_sg);
-EXPORT_SYMBOL(ethtool_op_set_tso);
-EXPORT_SYMBOL(ethtool_op_set_tx_csum);
-EXPORT_SYMBOL(ethtool_op_set_tx_hw_csum);
-EXPORT_SYMBOL(ethtool_op_set_ufo);
-EXPORT_SYMBOL(ethtool_op_get_ufo);
diff --git a/net/core/failover.c b/net/core/failover.c
new file mode 100644
index 000000000000..2a140b3ea669
--- /dev/null
+++ b/net/core/failover.c
@@ -0,0 +1,315 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2018, Intel Corporation. */
+
+/* A common module to handle registrations and notifications for paravirtual
+ * drivers to enable accelerated datapath and support VF live migration.
+ *
+ * The notifier and event handling code is based on netvsc driver.
+ */
+
+#include <linux/module.h>
+#include <linux/etherdevice.h>
+#include <uapi/linux/if_arp.h>
+#include <linux/rtnetlink.h>
+#include <linux/if_vlan.h>
+#include <net/failover.h>
+
+static LIST_HEAD(failover_list);
+static DEFINE_SPINLOCK(failover_lock);
+
+static struct net_device *failover_get_bymac(u8 *mac, struct failover_ops **ops)
+{
+ struct net_device *failover_dev;
+ struct failover *failover;
+
+ spin_lock(&failover_lock);
+ list_for_each_entry(failover, &failover_list, list) {
+ failover_dev = rtnl_dereference(failover->failover_dev);
+ if (ether_addr_equal(failover_dev->perm_addr, mac)) {
+ *ops = rtnl_dereference(failover->ops);
+ spin_unlock(&failover_lock);
+ return failover_dev;
+ }
+ }
+ spin_unlock(&failover_lock);
+ return NULL;
+}
+
+/**
+ * failover_slave_register - Register a slave netdev
+ *
+ * @slave_dev: slave netdev that is being registered
+ *
+ * Registers a slave device to a failover instance. Only ethernet devices
+ * are supported.
+ */
+static int failover_slave_register(struct net_device *slave_dev)
+{
+ struct netdev_lag_upper_info lag_upper_info;
+ struct net_device *failover_dev;
+ struct failover_ops *fops;
+ int err;
+
+ if (slave_dev->type != ARPHRD_ETHER)
+ goto done;
+
+ ASSERT_RTNL();
+
+ failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops);
+ if (!failover_dev)
+ goto done;
+
+ if (fops && fops->slave_pre_register &&
+ fops->slave_pre_register(slave_dev, failover_dev))
+ goto done;
+
+ err = netdev_rx_handler_register(slave_dev, fops->slave_handle_frame,
+ failover_dev);
+ if (err) {
+ netdev_err(slave_dev, "can not register failover rx handler (err = %d)\n",
+ err);
+ goto done;
+ }
+
+ lag_upper_info.tx_type = NETDEV_LAG_TX_TYPE_ACTIVEBACKUP;
+ err = netdev_master_upper_dev_link(slave_dev, failover_dev, NULL,
+ &lag_upper_info, NULL);
+ if (err) {
+ netdev_err(slave_dev, "can not set failover device %s (err = %d)\n",
+ failover_dev->name, err);
+ goto err_upper_link;
+ }
+
+ slave_dev->priv_flags |= (IFF_FAILOVER_SLAVE | IFF_NO_ADDRCONF);
+
+ if (fops && fops->slave_register &&
+ !fops->slave_register(slave_dev, failover_dev))
+ return NOTIFY_OK;
+
+ netdev_upper_dev_unlink(slave_dev, failover_dev);
+ slave_dev->priv_flags &= ~(IFF_FAILOVER_SLAVE | IFF_NO_ADDRCONF);
+err_upper_link:
+ netdev_rx_handler_unregister(slave_dev);
+done:
+ return NOTIFY_DONE;
+}
+
+/**
+ * failover_slave_unregister - Unregister a slave netdev
+ *
+ * @slave_dev: slave netdev that is being unregistered
+ *
+ * Unregisters a slave device from a failover instance.
+ */
+int failover_slave_unregister(struct net_device *slave_dev)
+{
+ struct net_device *failover_dev;
+ struct failover_ops *fops;
+
+ if (!netif_is_failover_slave(slave_dev))
+ goto done;
+
+ ASSERT_RTNL();
+
+ failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops);
+ if (!failover_dev)
+ goto done;
+
+ if (fops && fops->slave_pre_unregister &&
+ fops->slave_pre_unregister(slave_dev, failover_dev))
+ goto done;
+
+ netdev_rx_handler_unregister(slave_dev);
+ netdev_upper_dev_unlink(slave_dev, failover_dev);
+ slave_dev->priv_flags &= ~(IFF_FAILOVER_SLAVE | IFF_NO_ADDRCONF);
+
+ if (fops && fops->slave_unregister &&
+ !fops->slave_unregister(slave_dev, failover_dev))
+ return NOTIFY_OK;
+
+done:
+ return NOTIFY_DONE;
+}
+EXPORT_SYMBOL_GPL(failover_slave_unregister);
+
+static int failover_slave_link_change(struct net_device *slave_dev)
+{
+ struct net_device *failover_dev;
+ struct failover_ops *fops;
+
+ if (!netif_is_failover_slave(slave_dev))
+ goto done;
+
+ ASSERT_RTNL();
+
+ failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops);
+ if (!failover_dev)
+ goto done;
+
+ if (!netif_running(failover_dev))
+ goto done;
+
+ if (fops && fops->slave_link_change &&
+ !fops->slave_link_change(slave_dev, failover_dev))
+ return NOTIFY_OK;
+
+done:
+ return NOTIFY_DONE;
+}
+
+static int failover_slave_name_change(struct net_device *slave_dev)
+{
+ struct net_device *failover_dev;
+ struct failover_ops *fops;
+
+ if (!netif_is_failover_slave(slave_dev))
+ goto done;
+
+ ASSERT_RTNL();
+
+ failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops);
+ if (!failover_dev)
+ goto done;
+
+ if (!netif_running(failover_dev))
+ goto done;
+
+ if (fops && fops->slave_name_change &&
+ !fops->slave_name_change(slave_dev, failover_dev))
+ return NOTIFY_OK;
+
+done:
+ return NOTIFY_DONE;
+}
+
+static int
+failover_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+ struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
+
+ /* Skip parent events */
+ if (netif_is_failover(event_dev))
+ return NOTIFY_DONE;
+
+ switch (event) {
+ case NETDEV_REGISTER:
+ return failover_slave_register(event_dev);
+ case NETDEV_UNREGISTER:
+ return failover_slave_unregister(event_dev);
+ case NETDEV_UP:
+ case NETDEV_DOWN:
+ case NETDEV_CHANGE:
+ return failover_slave_link_change(event_dev);
+ case NETDEV_CHANGENAME:
+ return failover_slave_name_change(event_dev);
+ default:
+ return NOTIFY_DONE;
+ }
+}
+
+static struct notifier_block failover_notifier = {
+ .notifier_call = failover_event,
+};
+
+static void
+failover_existing_slave_register(struct net_device *failover_dev)
+{
+ struct net *net = dev_net(failover_dev);
+ struct net_device *dev;
+
+ rtnl_lock();
+ for_each_netdev(net, dev) {
+ if (netif_is_failover(dev))
+ continue;
+ if (ether_addr_equal(failover_dev->perm_addr, dev->perm_addr))
+ failover_slave_register(dev);
+ }
+ rtnl_unlock();
+}
+
+/**
+ * failover_register - Register a failover instance
+ *
+ * @dev: failover netdev
+ * @ops: failover ops
+ *
+ * Allocate and register a failover instance for a failover netdev. ops
+ * provides handlers for slave device register/unregister/link change/
+ * name change events.
+ *
+ * Return: pointer to failover instance
+ */
+struct failover *failover_register(struct net_device *dev,
+ struct failover_ops *ops)
+{
+ struct failover *failover;
+
+ if (dev->type != ARPHRD_ETHER)
+ return ERR_PTR(-EINVAL);
+
+ failover = kzalloc(sizeof(*failover), GFP_KERNEL);
+ if (!failover)
+ return ERR_PTR(-ENOMEM);
+
+ rcu_assign_pointer(failover->ops, ops);
+ netdev_hold(dev, &failover->dev_tracker, GFP_KERNEL);
+ dev->priv_flags |= IFF_FAILOVER;
+ rcu_assign_pointer(failover->failover_dev, dev);
+
+ spin_lock(&failover_lock);
+ list_add_tail(&failover->list, &failover_list);
+ spin_unlock(&failover_lock);
+
+ netdev_info(dev, "failover master:%s registered\n", dev->name);
+
+ failover_existing_slave_register(dev);
+
+ return failover;
+}
+EXPORT_SYMBOL_GPL(failover_register);
+
+/**
+ * failover_unregister - Unregister a failover instance
+ *
+ * @failover: pointer to failover instance
+ *
+ * Unregisters and frees a failover instance.
+ */
+void failover_unregister(struct failover *failover)
+{
+ struct net_device *failover_dev;
+
+ failover_dev = rcu_dereference(failover->failover_dev);
+
+ netdev_info(failover_dev, "failover master:%s unregistered\n",
+ failover_dev->name);
+
+ failover_dev->priv_flags &= ~IFF_FAILOVER;
+ netdev_put(failover_dev, &failover->dev_tracker);
+
+ spin_lock(&failover_lock);
+ list_del(&failover->list);
+ spin_unlock(&failover_lock);
+
+ kfree(failover);
+}
+EXPORT_SYMBOL_GPL(failover_unregister);
+
+static __init int
+failover_init(void)
+{
+ register_netdevice_notifier(&failover_notifier);
+
+ return 0;
+}
+module_init(failover_init);
+
+static __exit
+void failover_exit(void)
+{
+ unregister_netdevice_notifier(&failover_notifier);
+}
+module_exit(failover_exit);
+
+MODULE_DESCRIPTION("Generic failover infrastructure/interface");
+MODULE_LICENSE("GPL v2");
diff --git a/net/core/fib_notifier.c b/net/core/fib_notifier.c
new file mode 100644
index 000000000000..5cdca49b1d7c
--- /dev/null
+++ b/net/core/fib_notifier.c
@@ -0,0 +1,197 @@
+#include <linux/rtnetlink.h>
+#include <linux/notifier.h>
+#include <linux/rcupdate.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/fib_notifier.h>
+
+static unsigned int fib_notifier_net_id;
+
+struct fib_notifier_net {
+ struct list_head fib_notifier_ops;
+ struct atomic_notifier_head fib_chain;
+};
+
+int call_fib_notifier(struct notifier_block *nb,
+ enum fib_event_type event_type,
+ struct fib_notifier_info *info)
+{
+ int err;
+
+ err = nb->notifier_call(nb, event_type, info);
+ return notifier_to_errno(err);
+}
+EXPORT_SYMBOL(call_fib_notifier);
+
+int call_fib_notifiers(struct net *net, enum fib_event_type event_type,
+ struct fib_notifier_info *info)
+{
+ struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id);
+ int err;
+
+ err = atomic_notifier_call_chain(&fn_net->fib_chain, event_type, info);
+ return notifier_to_errno(err);
+}
+EXPORT_SYMBOL(call_fib_notifiers);
+
+static unsigned int fib_seq_sum(struct net *net)
+{
+ struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id);
+ struct fib_notifier_ops *ops;
+ unsigned int fib_seq = 0;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(ops, &fn_net->fib_notifier_ops, list) {
+ if (!try_module_get(ops->owner))
+ continue;
+ fib_seq += ops->fib_seq_read(net);
+ module_put(ops->owner);
+ }
+ rcu_read_unlock();
+
+ return fib_seq;
+}
+
+static int fib_net_dump(struct net *net, struct notifier_block *nb,
+ struct netlink_ext_ack *extack)
+{
+ struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id);
+ struct fib_notifier_ops *ops;
+ int err = 0;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(ops, &fn_net->fib_notifier_ops, list) {
+ if (!try_module_get(ops->owner))
+ continue;
+ err = ops->fib_dump(net, nb, extack);
+ module_put(ops->owner);
+ if (err)
+ goto unlock;
+ }
+
+unlock:
+ rcu_read_unlock();
+
+ return err;
+}
+
+static bool fib_dump_is_consistent(struct net *net, struct notifier_block *nb,
+ void (*cb)(struct notifier_block *nb),
+ unsigned int fib_seq)
+{
+ struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id);
+
+ atomic_notifier_chain_register(&fn_net->fib_chain, nb);
+ if (fib_seq == fib_seq_sum(net))
+ return true;
+ atomic_notifier_chain_unregister(&fn_net->fib_chain, nb);
+ if (cb)
+ cb(nb);
+ return false;
+}
+
+#define FIB_DUMP_MAX_RETRIES 5
+int register_fib_notifier(struct net *net, struct notifier_block *nb,
+ void (*cb)(struct notifier_block *nb),
+ struct netlink_ext_ack *extack)
+{
+ int retries = 0;
+ int err;
+
+ do {
+ unsigned int fib_seq = fib_seq_sum(net);
+
+ err = fib_net_dump(net, nb, extack);
+ if (err)
+ return err;
+
+ if (fib_dump_is_consistent(net, nb, cb, fib_seq))
+ return 0;
+ } while (++retries < FIB_DUMP_MAX_RETRIES);
+
+ return -EBUSY;
+}
+EXPORT_SYMBOL(register_fib_notifier);
+
+int unregister_fib_notifier(struct net *net, struct notifier_block *nb)
+{
+ struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id);
+
+ return atomic_notifier_chain_unregister(&fn_net->fib_chain, nb);
+}
+EXPORT_SYMBOL(unregister_fib_notifier);
+
+static int __fib_notifier_ops_register(struct fib_notifier_ops *ops,
+ struct net *net)
+{
+ struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id);
+ struct fib_notifier_ops *o;
+
+ list_for_each_entry(o, &fn_net->fib_notifier_ops, list)
+ if (ops->family == o->family)
+ return -EEXIST;
+ list_add_tail_rcu(&ops->list, &fn_net->fib_notifier_ops);
+ return 0;
+}
+
+struct fib_notifier_ops *
+fib_notifier_ops_register(const struct fib_notifier_ops *tmpl, struct net *net)
+{
+ struct fib_notifier_ops *ops;
+ int err;
+
+ ops = kmemdup(tmpl, sizeof(*ops), GFP_KERNEL);
+ if (!ops)
+ return ERR_PTR(-ENOMEM);
+
+ err = __fib_notifier_ops_register(ops, net);
+ if (err)
+ goto err_register;
+
+ return ops;
+
+err_register:
+ kfree(ops);
+ return ERR_PTR(err);
+}
+EXPORT_SYMBOL(fib_notifier_ops_register);
+
+void fib_notifier_ops_unregister(struct fib_notifier_ops *ops)
+{
+ list_del_rcu(&ops->list);
+ kfree_rcu(ops, rcu);
+}
+EXPORT_SYMBOL(fib_notifier_ops_unregister);
+
+static int __net_init fib_notifier_net_init(struct net *net)
+{
+ struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id);
+
+ INIT_LIST_HEAD(&fn_net->fib_notifier_ops);
+ ATOMIC_INIT_NOTIFIER_HEAD(&fn_net->fib_chain);
+ return 0;
+}
+
+static void __net_exit fib_notifier_net_exit(struct net *net)
+{
+ struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id);
+
+ WARN_ON_ONCE(!list_empty(&fn_net->fib_notifier_ops));
+}
+
+static struct pernet_operations fib_notifier_net_ops = {
+ .init = fib_notifier_net_init,
+ .exit = fib_notifier_net_exit,
+ .id = &fib_notifier_net_id,
+ .size = sizeof(struct fib_notifier_net),
+};
+
+static int __init fib_notifier_init(void)
+{
+ return register_pernet_subsys(&fib_notifier_net_ops);
+}
+
+subsys_initcall(fib_notifier_init);
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 1df6cd4568d3..8ca634964e36 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -1,31 +1,113 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* net/core/fib_rules.c Generic Routing Rules
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation, version 2.
- *
* Authors: Thomas Graf <tgraf@suug.ch>
*/
#include <linux/types.h>
#include <linux/kernel.h>
+#include <linux/slab.h>
#include <linux/list.h>
+#include <linux/module.h>
+#include <net/net_namespace.h>
+#include <net/inet_dscp.h>
+#include <net/sock.h>
#include <net/fib_rules.h>
+#include <net/ip_tunnels.h>
+#include <linux/indirect_call_wrapper.h>
+
+#if defined(CONFIG_IPV6) && defined(CONFIG_IPV6_MULTIPLE_TABLES)
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+#define INDIRECT_CALL_MT(f, f2, f1, ...) \
+ INDIRECT_CALL_INET(f, f2, f1, __VA_ARGS__)
+#else
+#define INDIRECT_CALL_MT(f, f2, f1, ...) INDIRECT_CALL_1(f, f2, __VA_ARGS__)
+#endif
+#elif defined(CONFIG_IP_MULTIPLE_TABLES)
+#define INDIRECT_CALL_MT(f, f2, f1, ...) INDIRECT_CALL_1(f, f1, __VA_ARGS__)
+#else
+#define INDIRECT_CALL_MT(f, f2, f1, ...) f(__VA_ARGS__)
+#endif
+
+static const struct fib_kuid_range fib_kuid_range_unset = {
+ KUIDT_INIT(0),
+ KUIDT_INIT(~0),
+};
+
+bool fib_rule_matchall(const struct fib_rule *rule)
+{
+ if (READ_ONCE(rule->iifindex) || READ_ONCE(rule->oifindex) ||
+ rule->mark || rule->tun_id || rule->flags)
+ return false;
+ if (rule->suppress_ifgroup != -1 || rule->suppress_prefixlen != -1)
+ return false;
+ if (!uid_eq(rule->uid_range.start, fib_kuid_range_unset.start) ||
+ !uid_eq(rule->uid_range.end, fib_kuid_range_unset.end))
+ return false;
+ if (fib_rule_port_range_set(&rule->sport_range))
+ return false;
+ if (fib_rule_port_range_set(&rule->dport_range))
+ return false;
+ return true;
+}
+EXPORT_SYMBOL_GPL(fib_rule_matchall);
+
+int fib_default_rule_add(struct fib_rules_ops *ops,
+ u32 pref, u32 table)
+{
+ struct fib_rule *r;
+
+ r = kzalloc(ops->rule_size, GFP_KERNEL_ACCOUNT);
+ if (r == NULL)
+ return -ENOMEM;
+
+ refcount_set(&r->refcnt, 1);
+ r->action = FR_ACT_TO_TBL;
+ r->pref = pref;
+ r->table = table;
+ r->proto = RTPROT_KERNEL;
+ r->fr_net = ops->fro_net;
+ r->uid_range = fib_kuid_range_unset;
+
+ r->suppress_prefixlen = -1;
+ r->suppress_ifgroup = -1;
+
+ /* The lock is not required here, the list in unreachable
+ * at the moment this function is called */
+ list_add_tail(&r->list, &ops->rules_list);
+ return 0;
+}
+EXPORT_SYMBOL(fib_default_rule_add);
+
+static u32 fib_default_rule_pref(struct fib_rules_ops *ops)
+{
+ struct list_head *pos;
+ struct fib_rule *rule;
+
+ if (!list_empty(&ops->rules_list)) {
+ pos = ops->rules_list.next;
+ if (pos->next != &ops->rules_list) {
+ rule = list_entry(pos->next, struct fib_rule, list);
+ if (rule->pref)
+ return rule->pref - 1;
+ }
+ }
-static LIST_HEAD(rules_ops);
-static DEFINE_SPINLOCK(rules_mod_lock);
+ return 0;
+}
static void notify_rule_change(int event, struct fib_rule *rule,
struct fib_rules_ops *ops, struct nlmsghdr *nlh,
u32 pid);
-static struct fib_rules_ops *lookup_rules_ops(int family)
+static struct fib_rules_ops *lookup_rules_ops(const struct net *net,
+ int family)
{
struct fib_rules_ops *ops;
rcu_read_lock();
- list_for_each_entry_rcu(ops, &rules_ops, list) {
+ list_for_each_entry_rcu(ops, &net->rules_ops, list) {
if (ops->family == family) {
if (!try_module_get(ops->owner))
ops = NULL;
@@ -44,10 +126,19 @@ static void rules_ops_put(struct fib_rules_ops *ops)
module_put(ops->owner);
}
-int fib_rules_register(struct fib_rules_ops *ops)
+static void flush_route_cache(struct fib_rules_ops *ops)
+{
+ if (ops->flush_cache)
+ ops->flush_cache(ops);
+}
+
+static int __fib_rules_register(struct fib_rules_ops *ops)
{
int err = -EEXIST;
struct fib_rules_ops *o;
+ struct net *net;
+
+ net = ops->fro_net;
if (ops->rule_size < sizeof(struct fib_rule))
return -EINVAL;
@@ -57,68 +148,164 @@ int fib_rules_register(struct fib_rules_ops *ops)
ops->action == NULL)
return -EINVAL;
- spin_lock(&rules_mod_lock);
- list_for_each_entry(o, &rules_ops, list)
+ spin_lock(&net->rules_mod_lock);
+ list_for_each_entry(o, &net->rules_ops, list)
if (ops->family == o->family)
goto errout;
- list_add_tail_rcu(&ops->list, &rules_ops);
+ list_add_tail_rcu(&ops->list, &net->rules_ops);
err = 0;
errout:
- spin_unlock(&rules_mod_lock);
+ spin_unlock(&net->rules_mod_lock);
return err;
}
+struct fib_rules_ops *
+fib_rules_register(const struct fib_rules_ops *tmpl, struct net *net)
+{
+ struct fib_rules_ops *ops;
+ int err;
+
+ ops = kmemdup(tmpl, sizeof(*ops), GFP_KERNEL);
+ if (ops == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ INIT_LIST_HEAD(&ops->rules_list);
+ ops->fro_net = net;
+
+ err = __fib_rules_register(ops);
+ if (err) {
+ kfree(ops);
+ ops = ERR_PTR(err);
+ }
+
+ return ops;
+}
EXPORT_SYMBOL_GPL(fib_rules_register);
-static void cleanup_ops(struct fib_rules_ops *ops)
+static void fib_rules_cleanup_ops(struct fib_rules_ops *ops)
{
struct fib_rule *rule, *tmp;
- list_for_each_entry_safe(rule, tmp, ops->rules_list, list) {
+ list_for_each_entry_safe(rule, tmp, &ops->rules_list, list) {
list_del_rcu(&rule->list);
+ if (ops->delete)
+ ops->delete(rule);
fib_rule_put(rule);
}
}
-int fib_rules_unregister(struct fib_rules_ops *ops)
+void fib_rules_unregister(struct fib_rules_ops *ops)
{
- int err = 0;
- struct fib_rules_ops *o;
+ struct net *net = ops->fro_net;
- spin_lock(&rules_mod_lock);
- list_for_each_entry(o, &rules_ops, list) {
- if (o == ops) {
- list_del_rcu(&o->list);
- cleanup_ops(ops);
- goto out;
- }
- }
+ spin_lock(&net->rules_mod_lock);
+ list_del_rcu(&ops->list);
+ spin_unlock(&net->rules_mod_lock);
- err = -ENOENT;
-out:
- spin_unlock(&rules_mod_lock);
+ fib_rules_cleanup_ops(ops);
+ kfree_rcu(ops, rcu);
+}
+EXPORT_SYMBOL_GPL(fib_rules_unregister);
- synchronize_rcu();
+static int uid_range_set(struct fib_kuid_range *range)
+{
+ return uid_valid(range->start) && uid_valid(range->end);
+}
- return err;
+static struct fib_kuid_range nla_get_kuid_range(struct nlattr **tb)
+{
+ struct fib_rule_uid_range *in;
+ struct fib_kuid_range out;
+
+ in = (struct fib_rule_uid_range *)nla_data(tb[FRA_UID_RANGE]);
+
+ out.start = make_kuid(current_user_ns(), in->start);
+ out.end = make_kuid(current_user_ns(), in->end);
+
+ return out;
}
-EXPORT_SYMBOL_GPL(fib_rules_unregister);
+static int nla_put_uid_range(struct sk_buff *skb, struct fib_kuid_range *range)
+{
+ struct fib_rule_uid_range out = {
+ from_kuid_munged(current_user_ns(), range->start),
+ from_kuid_munged(current_user_ns(), range->end)
+ };
+
+ return nla_put(skb, FRA_UID_RANGE, sizeof(out), &out);
+}
+
+static int nla_get_port_range(struct nlattr *pattr,
+ struct fib_rule_port_range *port_range)
+{
+ const struct fib_rule_port_range *pr = nla_data(pattr);
+
+ if (!fib_rule_port_range_valid(pr))
+ return -EINVAL;
+
+ port_range->start = pr->start;
+ port_range->end = pr->end;
+
+ return 0;
+}
+
+static int nla_put_port_range(struct sk_buff *skb, int attrtype,
+ struct fib_rule_port_range *range)
+{
+ return nla_put(skb, attrtype, sizeof(*range), range);
+}
+
+static bool fib_rule_iif_match(const struct fib_rule *rule, int iifindex,
+ const struct flowi *fl)
+{
+ u8 iif_is_l3_master = READ_ONCE(rule->iif_is_l3_master);
+
+ return iif_is_l3_master ? l3mdev_fib_rule_iif_match(fl, iifindex) :
+ fl->flowi_iif == iifindex;
+}
+
+static bool fib_rule_oif_match(const struct fib_rule *rule, int oifindex,
+ const struct flowi *fl)
+{
+ u8 oif_is_l3_master = READ_ONCE(rule->oif_is_l3_master);
+
+ return oif_is_l3_master ? l3mdev_fib_rule_oif_match(fl, oifindex) :
+ fl->flowi_oif == oifindex;
+}
static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops,
- struct flowi *fl, int flags)
+ struct flowi *fl, int flags,
+ struct fib_lookup_arg *arg)
{
- int ret = 0;
+ int iifindex, oifindex, ret = 0;
+
+ iifindex = READ_ONCE(rule->iifindex);
+ if (iifindex && !fib_rule_iif_match(rule, iifindex, fl))
+ goto out;
+
+ oifindex = READ_ONCE(rule->oifindex);
+ if (oifindex && !fib_rule_oif_match(rule, oifindex, fl))
+ goto out;
+
+ if ((rule->mark ^ fl->flowi_mark) & rule->mark_mask)
+ goto out;
- if (rule->ifindex && (rule->ifindex != fl->iif))
+ if (rule->tun_id && (rule->tun_id != fl->flowi_tun_key.tun_id))
goto out;
- if ((rule->mark ^ fl->mark) & rule->mark_mask)
+ if (rule->l3mdev && !l3mdev_fib_rule_match(rule->fr_net, fl, arg))
goto out;
- ret = ops->match(rule, fl, flags);
+ if (uid_lt(fl->flowi_uid, rule->uid_range.start) ||
+ uid_gt(fl->flowi_uid, rule->uid_range.end))
+ goto out;
+
+ ret = INDIRECT_CALL_MT(ops->match,
+ fib6_rule_match,
+ fib4_rule_match,
+ rule, fl, flags);
out:
return (rule->flags & FIB_RULE_INVERT) ? !ret : ret;
}
@@ -131,190 +318,826 @@ int fib_rules_lookup(struct fib_rules_ops *ops, struct flowi *fl,
rcu_read_lock();
- list_for_each_entry_rcu(rule, ops->rules_list, list) {
- if (!fib_rule_match(rule, ops, fl, flags))
+ list_for_each_entry_rcu(rule, &ops->rules_list, list) {
+jumped:
+ if (!fib_rule_match(rule, ops, fl, flags, arg))
+ continue;
+
+ if (rule->action == FR_ACT_GOTO) {
+ struct fib_rule *target;
+
+ target = rcu_dereference(rule->ctarget);
+ if (target == NULL) {
+ continue;
+ } else {
+ rule = target;
+ goto jumped;
+ }
+ } else if (rule->action == FR_ACT_NOP)
+ continue;
+ else
+ err = INDIRECT_CALL_MT(ops->action,
+ fib6_rule_action,
+ fib4_rule_action,
+ rule, fl, flags, arg);
+
+ if (!err && ops->suppress && INDIRECT_CALL_MT(ops->suppress,
+ fib6_rule_suppress,
+ fib4_rule_suppress,
+ rule, flags, arg))
continue;
- err = ops->action(rule, fl, flags, arg);
if (err != -EAGAIN) {
- fib_rule_get(rule);
- arg->rule = rule;
- goto out;
+ if ((arg->flags & FIB_LOOKUP_NOREF) ||
+ likely(refcount_inc_not_zero(&rule->refcnt))) {
+ arg->rule = rule;
+ goto out;
+ }
+ break;
}
}
- err = -ENETUNREACH;
+ err = -ESRCH;
out:
rcu_read_unlock();
return err;
}
-
EXPORT_SYMBOL_GPL(fib_rules_lookup);
-int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
+static int call_fib_rule_notifier(struct notifier_block *nb,
+ enum fib_event_type event_type,
+ struct fib_rule *rule, int family,
+ struct netlink_ext_ack *extack)
+{
+ struct fib_rule_notifier_info info = {
+ .info.family = family,
+ .info.extack = extack,
+ .rule = rule,
+ };
+
+ return call_fib_notifier(nb, event_type, &info.info);
+}
+
+static int call_fib_rule_notifiers(struct net *net,
+ enum fib_event_type event_type,
+ struct fib_rule *rule,
+ struct fib_rules_ops *ops,
+ struct netlink_ext_ack *extack)
+{
+ struct fib_rule_notifier_info info = {
+ .info.family = ops->family,
+ .info.extack = extack,
+ .rule = rule,
+ };
+
+ ASSERT_RTNL_NET(net);
+
+ /* Paired with READ_ONCE() in fib_rules_seq() */
+ WRITE_ONCE(ops->fib_rules_seq, ops->fib_rules_seq + 1);
+ return call_fib_notifiers(net, event_type, &info.info);
+}
+
+/* Called with rcu_read_lock() */
+int fib_rules_dump(struct net *net, struct notifier_block *nb, int family,
+ struct netlink_ext_ack *extack)
+{
+ struct fib_rules_ops *ops;
+ struct fib_rule *rule;
+ int err = 0;
+
+ ops = lookup_rules_ops(net, family);
+ if (!ops)
+ return -EAFNOSUPPORT;
+ list_for_each_entry_rcu(rule, &ops->rules_list, list) {
+ err = call_fib_rule_notifier(nb, FIB_EVENT_RULE_ADD,
+ rule, family, extack);
+ if (err)
+ break;
+ }
+ rules_ops_put(ops);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(fib_rules_dump);
+
+unsigned int fib_rules_seq_read(const struct net *net, int family)
+{
+ unsigned int fib_rules_seq;
+ struct fib_rules_ops *ops;
+
+ ops = lookup_rules_ops(net, family);
+ if (!ops)
+ return 0;
+ /* Paired with WRITE_ONCE() in call_fib_rule_notifiers() */
+ fib_rules_seq = READ_ONCE(ops->fib_rules_seq);
+ rules_ops_put(ops);
+
+ return fib_rules_seq;
+}
+EXPORT_SYMBOL_GPL(fib_rules_seq_read);
+
+static struct fib_rule *rule_find(struct fib_rules_ops *ops,
+ struct fib_rule_hdr *frh,
+ struct nlattr **tb,
+ struct fib_rule *rule,
+ bool user_priority)
+{
+ struct fib_rule *r;
+
+ list_for_each_entry(r, &ops->rules_list, list) {
+ if (rule->action && r->action != rule->action)
+ continue;
+
+ if (rule->table && r->table != rule->table)
+ continue;
+
+ if (user_priority && r->pref != rule->pref)
+ continue;
+
+ if (rule->iifname[0] &&
+ memcmp(r->iifname, rule->iifname, IFNAMSIZ))
+ continue;
+
+ if (rule->oifname[0] &&
+ memcmp(r->oifname, rule->oifname, IFNAMSIZ))
+ continue;
+
+ if (rule->mark && r->mark != rule->mark)
+ continue;
+
+ if (rule->suppress_ifgroup != -1 &&
+ r->suppress_ifgroup != rule->suppress_ifgroup)
+ continue;
+
+ if (rule->suppress_prefixlen != -1 &&
+ r->suppress_prefixlen != rule->suppress_prefixlen)
+ continue;
+
+ if (rule->mark_mask && r->mark_mask != rule->mark_mask)
+ continue;
+
+ if (rule->tun_id && r->tun_id != rule->tun_id)
+ continue;
+
+ if (rule->l3mdev && r->l3mdev != rule->l3mdev)
+ continue;
+
+ if (uid_range_set(&rule->uid_range) &&
+ (!uid_eq(r->uid_range.start, rule->uid_range.start) ||
+ !uid_eq(r->uid_range.end, rule->uid_range.end)))
+ continue;
+
+ if (rule->ip_proto && r->ip_proto != rule->ip_proto)
+ continue;
+
+ if (rule->proto && r->proto != rule->proto)
+ continue;
+
+ if (fib_rule_port_range_set(&rule->sport_range) &&
+ !fib_rule_port_range_compare(&r->sport_range,
+ &rule->sport_range))
+ continue;
+
+ if (rule->sport_mask && r->sport_mask != rule->sport_mask)
+ continue;
+
+ if (fib_rule_port_range_set(&rule->dport_range) &&
+ !fib_rule_port_range_compare(&r->dport_range,
+ &rule->dport_range))
+ continue;
+
+ if (rule->dport_mask && r->dport_mask != rule->dport_mask)
+ continue;
+
+ if (!ops->compare(r, frh, tb))
+ continue;
+ return r;
+ }
+
+ return NULL;
+}
+
+#ifdef CONFIG_NET_L3_MASTER_DEV
+static int fib_nl2rule_l3mdev(struct nlattr *nla, struct fib_rule *nlrule,
+ struct netlink_ext_ack *extack)
+{
+ nlrule->l3mdev = nla_get_u8(nla);
+ if (nlrule->l3mdev != 1) {
+ NL_SET_ERR_MSG(extack, "Invalid l3mdev attribute");
+ return -1;
+ }
+
+ return 0;
+}
+#else
+static int fib_nl2rule_l3mdev(struct nlattr *nla, struct fib_rule *nlrule,
+ struct netlink_ext_ack *extack)
+{
+ NL_SET_ERR_MSG(extack, "l3mdev support is not enabled in kernel");
+ return -1;
+}
+#endif
+
+static int fib_nl2rule_port_mask(const struct nlattr *mask_attr,
+ const struct fib_rule_port_range *range,
+ u16 *port_mask,
+ struct netlink_ext_ack *extack)
+{
+ if (!fib_rule_port_range_valid(range)) {
+ NL_SET_ERR_MSG_ATTR(extack, mask_attr,
+ "Cannot specify port mask without port value");
+ return -EINVAL;
+ }
+
+ if (fib_rule_port_is_range(range)) {
+ NL_SET_ERR_MSG_ATTR(extack, mask_attr,
+ "Cannot specify port mask for port range");
+ return -EINVAL;
+ }
+
+ if (range->start & ~nla_get_u16(mask_attr)) {
+ NL_SET_ERR_MSG_ATTR(extack, mask_attr, "Invalid port mask");
+ return -EINVAL;
+ }
+
+ *port_mask = nla_get_u16(mask_attr);
+
+ return 0;
+}
+
+static int fib_nl2rule(struct net *net, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack,
+ struct fib_rules_ops *ops,
+ struct nlattr *tb[],
+ struct fib_rule **rule,
+ bool *user_priority)
{
struct fib_rule_hdr *frh = nlmsg_data(nlh);
- struct fib_rules_ops *ops = NULL;
- struct fib_rule *rule, *r, *last = NULL;
- struct nlattr *tb[FRA_MAX+1];
+ struct fib_rule *nlrule = NULL;
int err = -EINVAL;
- if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh)))
- goto errout;
-
- ops = lookup_rules_ops(frh->family);
- if (ops == NULL) {
- err = EAFNOSUPPORT;
- goto errout;
+ if (frh->src_len)
+ if (!tb[FRA_SRC] ||
+ frh->src_len > (ops->addr_size * 8) ||
+ nla_len(tb[FRA_SRC]) != ops->addr_size) {
+ NL_SET_ERR_MSG(extack, "Invalid source address");
+ goto errout;
}
- err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy);
- if (err < 0)
- goto errout;
+ if (frh->dst_len)
+ if (!tb[FRA_DST] ||
+ frh->dst_len > (ops->addr_size * 8) ||
+ nla_len(tb[FRA_DST]) != ops->addr_size) {
+ NL_SET_ERR_MSG(extack, "Invalid dst address");
+ goto errout;
+ }
- rule = kzalloc(ops->rule_size, GFP_KERNEL);
- if (rule == NULL) {
+ nlrule = kzalloc(ops->rule_size, GFP_KERNEL_ACCOUNT);
+ if (!nlrule) {
err = -ENOMEM;
goto errout;
}
+ refcount_set(&nlrule->refcnt, 1);
+ nlrule->fr_net = net;
- if (tb[FRA_PRIORITY])
- rule->pref = nla_get_u32(tb[FRA_PRIORITY]);
+ if (tb[FRA_PRIORITY]) {
+ nlrule->pref = nla_get_u32(tb[FRA_PRIORITY]);
+ *user_priority = true;
+ }
- if (tb[FRA_IFNAME]) {
- struct net_device *dev;
+ nlrule->proto = nla_get_u8_default(tb[FRA_PROTOCOL], RTPROT_UNSPEC);
+
+ if (tb[FRA_IIFNAME]) {
+ nlrule->iifindex = -1;
+ nla_strscpy(nlrule->iifname, tb[FRA_IIFNAME], IFNAMSIZ);
+ }
- rule->ifindex = -1;
- nla_strlcpy(rule->ifname, tb[FRA_IFNAME], IFNAMSIZ);
- dev = __dev_get_by_name(rule->ifname);
- if (dev)
- rule->ifindex = dev->ifindex;
+ if (tb[FRA_OIFNAME]) {
+ nlrule->oifindex = -1;
+ nla_strscpy(nlrule->oifname, tb[FRA_OIFNAME], IFNAMSIZ);
}
if (tb[FRA_FWMARK]) {
- rule->mark = nla_get_u32(tb[FRA_FWMARK]);
- if (rule->mark)
+ nlrule->mark = nla_get_u32(tb[FRA_FWMARK]);
+ if (nlrule->mark)
/* compatibility: if the mark value is non-zero all bits
* are compared unless a mask is explicitly specified.
*/
- rule->mark_mask = 0xFFFFFFFF;
+ nlrule->mark_mask = 0xFFFFFFFF;
}
if (tb[FRA_FWMASK])
- rule->mark_mask = nla_get_u32(tb[FRA_FWMASK]);
+ nlrule->mark_mask = nla_get_u32(tb[FRA_FWMASK]);
+
+ if (tb[FRA_TUN_ID])
+ nlrule->tun_id = nla_get_be64(tb[FRA_TUN_ID]);
+
+ if (tb[FRA_L3MDEV] &&
+ fib_nl2rule_l3mdev(tb[FRA_L3MDEV], nlrule, extack) < 0)
+ goto errout_free;
+
+ nlrule->action = frh->action;
+ nlrule->flags = frh->flags;
+ nlrule->table = frh_get_table(frh, tb);
+ if (tb[FRA_SUPPRESS_PREFIXLEN])
+ nlrule->suppress_prefixlen = nla_get_u32(tb[FRA_SUPPRESS_PREFIXLEN]);
+ else
+ nlrule->suppress_prefixlen = -1;
+
+ if (tb[FRA_SUPPRESS_IFGROUP])
+ nlrule->suppress_ifgroup = nla_get_u32(tb[FRA_SUPPRESS_IFGROUP]);
+ else
+ nlrule->suppress_ifgroup = -1;
+
+ if (tb[FRA_GOTO]) {
+ if (nlrule->action != FR_ACT_GOTO) {
+ NL_SET_ERR_MSG(extack, "Unexpected goto");
+ goto errout_free;
+ }
+
+ nlrule->target = nla_get_u32(tb[FRA_GOTO]);
+ } else if (nlrule->action == FR_ACT_GOTO) {
+ NL_SET_ERR_MSG(extack, "Missing goto target for action goto");
+ goto errout_free;
+ }
+
+ if (nlrule->l3mdev && nlrule->table) {
+ NL_SET_ERR_MSG(extack, "l3mdev and table are mutually exclusive");
+ goto errout_free;
+ }
- rule->action = frh->action;
- rule->flags = frh->flags;
- rule->table = frh_get_table(frh, tb);
+ if (tb[FRA_UID_RANGE]) {
+ if (current_user_ns() != net->user_ns) {
+ err = -EPERM;
+ NL_SET_ERR_MSG(extack, "No permission to set uid");
+ goto errout_free;
+ }
- if (!rule->pref && ops->default_pref)
- rule->pref = ops->default_pref();
+ nlrule->uid_range = nla_get_kuid_range(tb);
+
+ if (!uid_range_set(&nlrule->uid_range) ||
+ !uid_lte(nlrule->uid_range.start, nlrule->uid_range.end)) {
+ NL_SET_ERR_MSG(extack, "Invalid uid range");
+ goto errout_free;
+ }
+ } else {
+ nlrule->uid_range = fib_kuid_range_unset;
+ }
+
+ if (tb[FRA_IP_PROTO])
+ nlrule->ip_proto = nla_get_u8(tb[FRA_IP_PROTO]);
+
+ if (tb[FRA_SPORT_RANGE]) {
+ err = nla_get_port_range(tb[FRA_SPORT_RANGE],
+ &nlrule->sport_range);
+ if (err) {
+ NL_SET_ERR_MSG(extack, "Invalid sport range");
+ goto errout_free;
+ }
+ if (!fib_rule_port_is_range(&nlrule->sport_range))
+ nlrule->sport_mask = U16_MAX;
+ }
+
+ if (tb[FRA_SPORT_MASK]) {
+ err = fib_nl2rule_port_mask(tb[FRA_SPORT_MASK],
+ &nlrule->sport_range,
+ &nlrule->sport_mask, extack);
+ if (err)
+ goto errout_free;
+ }
- err = ops->configure(rule, skb, nlh, frh, tb);
+ if (tb[FRA_DPORT_RANGE]) {
+ err = nla_get_port_range(tb[FRA_DPORT_RANGE],
+ &nlrule->dport_range);
+ if (err) {
+ NL_SET_ERR_MSG(extack, "Invalid dport range");
+ goto errout_free;
+ }
+ if (!fib_rule_port_is_range(&nlrule->dport_range))
+ nlrule->dport_mask = U16_MAX;
+ }
+
+ if (tb[FRA_DPORT_MASK]) {
+ err = fib_nl2rule_port_mask(tb[FRA_DPORT_MASK],
+ &nlrule->dport_range,
+ &nlrule->dport_mask, extack);
+ if (err)
+ goto errout_free;
+ }
+
+ *rule = nlrule;
+
+ return 0;
+
+errout_free:
+ kfree(nlrule);
+errout:
+ return err;
+}
+
+static int fib_nl2rule_rtnl(struct fib_rule *nlrule,
+ struct fib_rules_ops *ops,
+ struct nlattr *tb[],
+ struct netlink_ext_ack *extack)
+{
+ if (!tb[FRA_PRIORITY])
+ nlrule->pref = fib_default_rule_pref(ops);
+
+ /* Backward jumps are prohibited to avoid endless loops */
+ if (tb[FRA_GOTO] && nlrule->target <= nlrule->pref) {
+ NL_SET_ERR_MSG(extack, "Backward goto not supported");
+ return -EINVAL;
+ }
+
+ if (tb[FRA_IIFNAME]) {
+ struct net_device *dev;
+
+ dev = __dev_get_by_name(nlrule->fr_net, nlrule->iifname);
+ if (dev) {
+ nlrule->iifindex = dev->ifindex;
+ nlrule->iif_is_l3_master = netif_is_l3_master(dev);
+ }
+ }
+
+ if (tb[FRA_OIFNAME]) {
+ struct net_device *dev;
+
+ dev = __dev_get_by_name(nlrule->fr_net, nlrule->oifname);
+ if (dev) {
+ nlrule->oifindex = dev->ifindex;
+ nlrule->oif_is_l3_master = netif_is_l3_master(dev);
+ }
+ }
+
+ return 0;
+}
+
+static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh,
+ struct nlattr **tb, struct fib_rule *rule)
+{
+ struct fib_rule *r;
+
+ list_for_each_entry(r, &ops->rules_list, list) {
+ if (r->action != rule->action)
+ continue;
+
+ if (r->table != rule->table)
+ continue;
+
+ if (r->pref != rule->pref)
+ continue;
+
+ if (memcmp(r->iifname, rule->iifname, IFNAMSIZ))
+ continue;
+
+ if (memcmp(r->oifname, rule->oifname, IFNAMSIZ))
+ continue;
+
+ if (r->mark != rule->mark)
+ continue;
+
+ if (r->suppress_ifgroup != rule->suppress_ifgroup)
+ continue;
+
+ if (r->suppress_prefixlen != rule->suppress_prefixlen)
+ continue;
+
+ if (r->mark_mask != rule->mark_mask)
+ continue;
+
+ if (r->tun_id != rule->tun_id)
+ continue;
+
+ if (r->l3mdev != rule->l3mdev)
+ continue;
+
+ if (!uid_eq(r->uid_range.start, rule->uid_range.start) ||
+ !uid_eq(r->uid_range.end, rule->uid_range.end))
+ continue;
+
+ if (r->ip_proto != rule->ip_proto)
+ continue;
+
+ if (r->proto != rule->proto)
+ continue;
+
+ if (!fib_rule_port_range_compare(&r->sport_range,
+ &rule->sport_range))
+ continue;
+
+ if (r->sport_mask != rule->sport_mask)
+ continue;
+
+ if (!fib_rule_port_range_compare(&r->dport_range,
+ &rule->dport_range))
+ continue;
+
+ if (r->dport_mask != rule->dport_mask)
+ continue;
+
+ if (!ops->compare(r, frh, tb))
+ continue;
+ return 1;
+ }
+ return 0;
+}
+
+static const struct nla_policy fib_rule_policy[FRA_MAX + 1] = {
+ [FRA_UNSPEC] = { .strict_start_type = FRA_DPORT_RANGE + 1 },
+ [FRA_IIFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 },
+ [FRA_OIFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 },
+ [FRA_PRIORITY] = { .type = NLA_U32 },
+ [FRA_FWMARK] = { .type = NLA_U32 },
+ [FRA_FLOW] = { .type = NLA_U32 },
+ [FRA_TUN_ID] = { .type = NLA_U64 },
+ [FRA_FWMASK] = { .type = NLA_U32 },
+ [FRA_TABLE] = { .type = NLA_U32 },
+ [FRA_SUPPRESS_PREFIXLEN] = { .type = NLA_U32 },
+ [FRA_SUPPRESS_IFGROUP] = { .type = NLA_U32 },
+ [FRA_GOTO] = { .type = NLA_U32 },
+ [FRA_L3MDEV] = { .type = NLA_U8 },
+ [FRA_UID_RANGE] = { .len = sizeof(struct fib_rule_uid_range) },
+ [FRA_PROTOCOL] = { .type = NLA_U8 },
+ [FRA_IP_PROTO] = { .type = NLA_U8 },
+ [FRA_SPORT_RANGE] = { .len = sizeof(struct fib_rule_port_range) },
+ [FRA_DPORT_RANGE] = { .len = sizeof(struct fib_rule_port_range) },
+ [FRA_DSCP] = NLA_POLICY_MAX(NLA_U8, INET_DSCP_MASK >> 2),
+ [FRA_FLOWLABEL] = { .type = NLA_BE32 },
+ [FRA_FLOWLABEL_MASK] = { .type = NLA_BE32 },
+ [FRA_SPORT_MASK] = { .type = NLA_U16 },
+ [FRA_DPORT_MASK] = { .type = NLA_U16 },
+ [FRA_DSCP_MASK] = NLA_POLICY_MASK(NLA_U8, INET_DSCP_MASK >> 2),
+};
+
+int fib_newrule(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack, bool rtnl_held)
+{
+ struct fib_rule *rule = NULL, *r, *last = NULL;
+ int err = -EINVAL, unresolved = 0;
+ struct fib_rules_ops *ops = NULL;
+ struct nlattr *tb[FRA_MAX + 1];
+ bool user_priority = false;
+ struct fib_rule_hdr *frh;
+
+ frh = nlmsg_payload(nlh, sizeof(*frh));
+ if (!frh) {
+ NL_SET_ERR_MSG(extack, "Invalid msg length");
+ goto errout;
+ }
+
+ ops = lookup_rules_ops(net, frh->family);
+ if (!ops) {
+ err = -EAFNOSUPPORT;
+ NL_SET_ERR_MSG(extack, "Rule family not supported");
+ goto errout;
+ }
+
+ err = nlmsg_parse_deprecated(nlh, sizeof(*frh), tb, FRA_MAX,
+ fib_rule_policy, extack);
+ if (err < 0) {
+ NL_SET_ERR_MSG(extack, "Error parsing msg");
+ goto errout;
+ }
+
+ err = fib_nl2rule(net, nlh, extack, ops, tb, &rule, &user_priority);
+ if (err)
+ goto errout;
+
+ if (!rtnl_held)
+ rtnl_net_lock(net);
+
+ err = fib_nl2rule_rtnl(rule, ops, tb, extack);
+ if (err)
+ goto errout_free;
+
+ if ((nlh->nlmsg_flags & NLM_F_EXCL) &&
+ rule_exists(ops, frh, tb, rule)) {
+ err = -EEXIST;
+ goto errout_free;
+ }
+
+ err = ops->configure(rule, skb, frh, tb, extack);
if (err < 0)
goto errout_free;
- list_for_each_entry(r, ops->rules_list, list) {
+ err = call_fib_rule_notifiers(net, FIB_EVENT_RULE_ADD, rule, ops,
+ extack);
+ if (err < 0)
+ goto errout_free;
+
+ list_for_each_entry(r, &ops->rules_list, list) {
+ if (r->pref == rule->target) {
+ RCU_INIT_POINTER(rule->ctarget, r);
+ break;
+ }
+ }
+
+ if (rcu_dereference_protected(rule->ctarget, 1) == NULL)
+ unresolved = 1;
+
+ list_for_each_entry(r, &ops->rules_list, list) {
if (r->pref > rule->pref)
break;
last = r;
}
- fib_rule_get(rule);
-
if (last)
list_add_rcu(&rule->list, &last->list);
else
- list_add_rcu(&rule->list, ops->rules_list);
+ list_add_rcu(&rule->list, &ops->rules_list);
+
+ if (ops->unresolved_rules) {
+ /*
+ * There are unresolved goto rules in the list, check if
+ * any of them are pointing to this new rule.
+ */
+ list_for_each_entry(r, &ops->rules_list, list) {
+ if (r->action == FR_ACT_GOTO &&
+ r->target == rule->pref &&
+ rtnl_dereference(r->ctarget) == NULL) {
+ rcu_assign_pointer(r->ctarget, rule);
+ if (--ops->unresolved_rules == 0)
+ break;
+ }
+ }
+ }
+
+ if (rule->action == FR_ACT_GOTO)
+ ops->nr_goto_rules++;
- notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).pid);
+ if (unresolved)
+ ops->unresolved_rules++;
+
+ if (rule->tun_id)
+ ip_tunnel_need_metadata();
+
+ fib_rule_get(rule);
+
+ if (!rtnl_held)
+ rtnl_net_unlock(net);
+
+ notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).portid);
+ fib_rule_put(rule);
+ flush_route_cache(ops);
rules_ops_put(ops);
return 0;
errout_free:
+ if (!rtnl_held)
+ rtnl_net_unlock(net);
kfree(rule);
errout:
rules_ops_put(ops);
return err;
}
+EXPORT_SYMBOL_GPL(fib_newrule);
-int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
+static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
{
- struct fib_rule_hdr *frh = nlmsg_data(nlh);
+ return fib_newrule(sock_net(skb->sk), skb, nlh, extack, false);
+}
+
+int fib_delrule(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack, bool rtnl_held)
+{
+ struct fib_rule *rule = NULL, *nlrule = NULL;
struct fib_rules_ops *ops = NULL;
- struct fib_rule *rule;
struct nlattr *tb[FRA_MAX+1];
+ bool user_priority = false;
+ struct fib_rule_hdr *frh;
int err = -EINVAL;
- if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh)))
+ frh = nlmsg_payload(nlh, sizeof(*frh));
+ if (!frh) {
+ NL_SET_ERR_MSG(extack, "Invalid msg length");
goto errout;
+ }
- ops = lookup_rules_ops(frh->family);
+ ops = lookup_rules_ops(net, frh->family);
if (ops == NULL) {
- err = EAFNOSUPPORT;
+ err = -EAFNOSUPPORT;
+ NL_SET_ERR_MSG(extack, "Rule family not supported");
goto errout;
}
- err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy);
- if (err < 0)
+ err = nlmsg_parse_deprecated(nlh, sizeof(*frh), tb, FRA_MAX,
+ fib_rule_policy, extack);
+ if (err < 0) {
+ NL_SET_ERR_MSG(extack, "Error parsing msg");
goto errout;
+ }
- list_for_each_entry(rule, ops->rules_list, list) {
- if (frh->action && (frh->action != rule->action))
- continue;
+ err = fib_nl2rule(net, nlh, extack, ops, tb, &nlrule, &user_priority);
+ if (err)
+ goto errout;
- if (frh->table && (frh_get_table(frh, tb) != rule->table))
- continue;
+ if (!rtnl_held)
+ rtnl_net_lock(net);
- if (tb[FRA_PRIORITY] &&
- (rule->pref != nla_get_u32(tb[FRA_PRIORITY])))
- continue;
+ err = fib_nl2rule_rtnl(nlrule, ops, tb, extack);
+ if (err)
+ goto errout_free;
- if (tb[FRA_IFNAME] &&
- nla_strcmp(tb[FRA_IFNAME], rule->ifname))
- continue;
+ rule = rule_find(ops, frh, tb, nlrule, user_priority);
+ if (!rule) {
+ err = -ENOENT;
+ goto errout_free;
+ }
- if (tb[FRA_FWMARK] &&
- (rule->mark != nla_get_u32(tb[FRA_FWMARK])))
- continue;
+ if (rule->flags & FIB_RULE_PERMANENT) {
+ err = -EPERM;
+ goto errout_free;
+ }
- if (tb[FRA_FWMASK] &&
- (rule->mark_mask != nla_get_u32(tb[FRA_FWMASK])))
- continue;
+ if (ops->delete) {
+ err = ops->delete(rule);
+ if (err)
+ goto errout_free;
+ }
- if (!ops->compare(rule, frh, tb))
- continue;
+ if (rule->tun_id)
+ ip_tunnel_unneed_metadata();
- if (rule->flags & FIB_RULE_PERMANENT) {
- err = -EPERM;
- goto errout;
- }
+ list_del_rcu(&rule->list);
- list_del_rcu(&rule->list);
- synchronize_rcu();
- notify_rule_change(RTM_DELRULE, rule, ops, nlh,
- NETLINK_CB(skb).pid);
- fib_rule_put(rule);
- rules_ops_put(ops);
- return 0;
+ if (rule->action == FR_ACT_GOTO) {
+ ops->nr_goto_rules--;
+ if (rtnl_dereference(rule->ctarget) == NULL)
+ ops->unresolved_rules--;
}
- err = -ENOENT;
+ /*
+ * Check if this rule is a target to any of them. If so,
+ * adjust to the next one with the same preference or
+ * disable them. As this operation is eventually very
+ * expensive, it is only performed if goto rules, except
+ * current if it is goto rule, have actually been added.
+ */
+ if (ops->nr_goto_rules > 0) {
+ struct fib_rule *n, *r;
+
+ n = list_next_entry(rule, list);
+ if (&n->list == &ops->rules_list || n->pref != rule->pref)
+ n = NULL;
+ list_for_each_entry(r, &ops->rules_list, list) {
+ if (rtnl_dereference(r->ctarget) != rule)
+ continue;
+ rcu_assign_pointer(r->ctarget, n);
+ if (!n)
+ ops->unresolved_rules++;
+ }
+ }
+
+ call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL, rule, ops, NULL);
+
+ if (!rtnl_held)
+ rtnl_net_unlock(net);
+
+ notify_rule_change(RTM_DELRULE, rule, ops, nlh, NETLINK_CB(skb).portid);
+ fib_rule_put(rule);
+ flush_route_cache(ops);
+ rules_ops_put(ops);
+ kfree(nlrule);
+ return 0;
+
+errout_free:
+ if (!rtnl_held)
+ rtnl_net_unlock(net);
+ kfree(nlrule);
errout:
rules_ops_put(ops);
return err;
}
+EXPORT_SYMBOL_GPL(fib_delrule);
+
+static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ return fib_delrule(sock_net(skb->sk), skb, nlh, extack, false);
+}
static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops,
struct fib_rule *rule)
{
size_t payload = NLMSG_ALIGN(sizeof(struct fib_rule_hdr))
- + nla_total_size(IFNAMSIZ) /* FRA_IFNAME */
+ + nla_total_size(IFNAMSIZ) /* FRA_IIFNAME */
+ + nla_total_size(IFNAMSIZ) /* FRA_OIFNAME */
+ nla_total_size(4) /* FRA_PRIORITY */
+ nla_total_size(4) /* FRA_TABLE */
+ + nla_total_size(4) /* FRA_SUPPRESS_PREFIXLEN */
+ + nla_total_size(4) /* FRA_SUPPRESS_IFGROUP */
+ nla_total_size(4) /* FRA_FWMARK */
- + nla_total_size(4); /* FRA_FWMASK */
+ + nla_total_size(4) /* FRA_FWMASK */
+ + nla_total_size_64bit(8) /* FRA_TUN_ID */
+ + nla_total_size(sizeof(struct fib_kuid_range))
+ + nla_total_size(1) /* FRA_PROTOCOL */
+ + nla_total_size(1) /* FRA_IP_PROTO */
+ + nla_total_size(sizeof(struct fib_rule_port_range)) /* FRA_SPORT_RANGE */
+ + nla_total_size(sizeof(struct fib_rule_port_range)) /* FRA_DPORT_RANGE */
+ + nla_total_size(2) /* FRA_SPORT_MASK */
+ + nla_total_size(2); /* FRA_DPORT_MASK */
if (ops->nlmsg_payload)
payload += ops->nlmsg_payload(rule);
@@ -331,87 +1154,204 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule,
nlh = nlmsg_put(skb, pid, seq, type, sizeof(*frh), flags);
if (nlh == NULL)
- return -1;
+ return -EMSGSIZE;
frh = nlmsg_data(nlh);
- frh->table = rule->table;
- NLA_PUT_U32(skb, FRA_TABLE, rule->table);
+ frh->family = ops->family;
+ frh->table = rule->table < 256 ? rule->table : RT_TABLE_COMPAT;
+ if (nla_put_u32(skb, FRA_TABLE, rule->table))
+ goto nla_put_failure;
+ if (nla_put_u32(skb, FRA_SUPPRESS_PREFIXLEN, rule->suppress_prefixlen))
+ goto nla_put_failure;
frh->res1 = 0;
frh->res2 = 0;
frh->action = rule->action;
frh->flags = rule->flags;
- if (rule->ifname[0])
- NLA_PUT_STRING(skb, FRA_IFNAME, rule->ifname);
+ if (nla_put_u8(skb, FRA_PROTOCOL, rule->proto))
+ goto nla_put_failure;
+
+ if (rule->action == FR_ACT_GOTO &&
+ rcu_access_pointer(rule->ctarget) == NULL)
+ frh->flags |= FIB_RULE_UNRESOLVED;
+
+ if (rule->iifname[0]) {
+ if (nla_put_string(skb, FRA_IIFNAME, rule->iifname))
+ goto nla_put_failure;
+ if (READ_ONCE(rule->iifindex) == -1)
+ frh->flags |= FIB_RULE_IIF_DETACHED;
+ }
- if (rule->pref)
- NLA_PUT_U32(skb, FRA_PRIORITY, rule->pref);
+ if (rule->oifname[0]) {
+ if (nla_put_string(skb, FRA_OIFNAME, rule->oifname))
+ goto nla_put_failure;
+ if (READ_ONCE(rule->oifindex) == -1)
+ frh->flags |= FIB_RULE_OIF_DETACHED;
+ }
- if (rule->mark)
- NLA_PUT_U32(skb, FRA_FWMARK, rule->mark);
+ if ((rule->pref &&
+ nla_put_u32(skb, FRA_PRIORITY, rule->pref)) ||
+ (rule->mark &&
+ nla_put_u32(skb, FRA_FWMARK, rule->mark)) ||
+ ((rule->mark_mask || rule->mark) &&
+ nla_put_u32(skb, FRA_FWMASK, rule->mark_mask)) ||
+ (rule->target &&
+ nla_put_u32(skb, FRA_GOTO, rule->target)) ||
+ (rule->tun_id &&
+ nla_put_be64(skb, FRA_TUN_ID, rule->tun_id, FRA_PAD)) ||
+ (rule->l3mdev &&
+ nla_put_u8(skb, FRA_L3MDEV, rule->l3mdev)) ||
+ (uid_range_set(&rule->uid_range) &&
+ nla_put_uid_range(skb, &rule->uid_range)) ||
+ (fib_rule_port_range_set(&rule->sport_range) &&
+ nla_put_port_range(skb, FRA_SPORT_RANGE, &rule->sport_range)) ||
+ (rule->sport_mask && nla_put_u16(skb, FRA_SPORT_MASK,
+ rule->sport_mask)) ||
+ (fib_rule_port_range_set(&rule->dport_range) &&
+ nla_put_port_range(skb, FRA_DPORT_RANGE, &rule->dport_range)) ||
+ (rule->dport_mask && nla_put_u16(skb, FRA_DPORT_MASK,
+ rule->dport_mask)) ||
+ (rule->ip_proto && nla_put_u8(skb, FRA_IP_PROTO, rule->ip_proto)))
+ goto nla_put_failure;
- if (rule->mark_mask || rule->mark)
- NLA_PUT_U32(skb, FRA_FWMASK, rule->mark_mask);
+ if (rule->suppress_ifgroup != -1) {
+ if (nla_put_u32(skb, FRA_SUPPRESS_IFGROUP, rule->suppress_ifgroup))
+ goto nla_put_failure;
+ }
- if (ops->fill(rule, skb, nlh, frh) < 0)
+ if (ops->fill(rule, skb, frh) < 0)
goto nla_put_failure;
- return nlmsg_end(skb, nlh);
+ nlmsg_end(skb, nlh);
+ return 0;
nla_put_failure:
- return nlmsg_cancel(skb, nlh);
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
}
-int fib_rules_dump(struct sk_buff *skb, struct netlink_callback *cb, int family)
+static int dump_rules(struct sk_buff *skb, struct netlink_callback *cb,
+ struct fib_rules_ops *ops)
{
int idx = 0;
struct fib_rule *rule;
+ int err = 0;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(rule, &ops->rules_list, list) {
+ if (idx < cb->args[1])
+ goto skip;
+
+ err = fib_nl_fill_rule(skb, rule, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, RTM_NEWRULE,
+ NLM_F_MULTI, ops);
+ if (err)
+ break;
+skip:
+ idx++;
+ }
+ rcu_read_unlock();
+ cb->args[1] = idx;
+ rules_ops_put(ops);
+
+ return err;
+}
+
+static int fib_valid_dumprule_req(const struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct fib_rule_hdr *frh;
+
+ frh = nlmsg_payload(nlh, sizeof(*frh));
+ if (!frh) {
+ NL_SET_ERR_MSG(extack, "Invalid header for fib rule dump request");
+ return -EINVAL;
+ }
+
+ if (frh->dst_len || frh->src_len || frh->tos || frh->table ||
+ frh->res1 || frh->res2 || frh->action || frh->flags) {
+ NL_SET_ERR_MSG(extack,
+ "Invalid values in header for fib rule dump request");
+ return -EINVAL;
+ }
+
+ if (nlmsg_attrlen(nlh, sizeof(*frh))) {
+ NL_SET_ERR_MSG(extack, "Invalid data after header in fib rule dump request");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ const struct nlmsghdr *nlh = cb->nlh;
+ struct net *net = sock_net(skb->sk);
struct fib_rules_ops *ops;
+ int err, idx = 0, family;
- ops = lookup_rules_ops(family);
- if (ops == NULL)
- return -EAFNOSUPPORT;
+ if (cb->strict_check) {
+ err = fib_valid_dumprule_req(nlh, cb->extack);
+ if (err < 0)
+ return err;
+ }
+
+ family = rtnl_msg_family(nlh);
+ if (family != AF_UNSPEC) {
+ /* Protocol specific dump request */
+ ops = lookup_rules_ops(net, family);
+ if (ops == NULL)
+ return -EAFNOSUPPORT;
+
+ return dump_rules(skb, cb, ops);
+ }
+
+ err = 0;
rcu_read_lock();
- list_for_each_entry(rule, ops->rules_list, list) {
- if (idx < cb->args[0])
+ list_for_each_entry_rcu(ops, &net->rules_ops, list) {
+ if (idx < cb->args[0] || !try_module_get(ops->owner))
goto skip;
- if (fib_nl_fill_rule(skb, rule, NETLINK_CB(cb->skb).pid,
- cb->nlh->nlmsg_seq, RTM_NEWRULE,
- NLM_F_MULTI, ops) < 0)
+ err = dump_rules(skb, cb, ops);
+ if (err < 0)
break;
+
+ cb->args[1] = 0;
skip:
idx++;
}
rcu_read_unlock();
cb->args[0] = idx;
- rules_ops_put(ops);
- return skb->len;
+ return err;
}
-EXPORT_SYMBOL_GPL(fib_rules_dump);
-
static void notify_rule_change(int event, struct fib_rule *rule,
struct fib_rules_ops *ops, struct nlmsghdr *nlh,
u32 pid)
{
+ struct net *net;
struct sk_buff *skb;
- int err = -ENOBUFS;
+ int err = -ENOMEM;
+ net = ops->fro_net;
skb = nlmsg_new(fib_rule_nlmsg_size(ops, rule), GFP_KERNEL);
if (skb == NULL)
goto errout;
err = fib_nl_fill_rule(skb, rule, pid, nlh->nlmsg_seq, event, 0, ops);
- /* failure implies BUG in fib_rule_nlmsg_size() */
- BUG_ON(err < 0);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in fib_rule_nlmsg_size() */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto errout;
+ }
- err = rtnl_notify(skb, pid, ops->nlgroup, nlh, GFP_KERNEL);
+ rtnl_notify(skb, net, pid, ops->nlgroup, nlh, GFP_KERNEL);
+ return;
errout:
- if (err < 0)
- rtnl_set_sk_err(ops->nlgroup, err);
+ rtnl_set_sk_err(net, ops->nlgroup, err);
}
static void attach_rules(struct list_head *rules, struct net_device *dev)
@@ -419,9 +1359,18 @@ static void attach_rules(struct list_head *rules, struct net_device *dev)
struct fib_rule *rule;
list_for_each_entry(rule, rules, list) {
- if (rule->ifindex == -1 &&
- strcmp(dev->name, rule->ifname) == 0)
- rule->ifindex = dev->ifindex;
+ if (rule->iifindex == -1 &&
+ strcmp(dev->name, rule->iifname) == 0) {
+ WRITE_ONCE(rule->iifindex, dev->ifindex);
+ WRITE_ONCE(rule->iif_is_l3_master,
+ netif_is_l3_master(dev));
+ }
+ if (rule->oifindex == -1 &&
+ strcmp(dev->name, rule->oifname) == 0) {
+ WRITE_ONCE(rule->oifindex, dev->ifindex);
+ WRITE_ONCE(rule->oif_is_l3_master,
+ netif_is_l3_master(dev));
+ }
}
}
@@ -429,35 +1378,47 @@ static void detach_rules(struct list_head *rules, struct net_device *dev)
{
struct fib_rule *rule;
- list_for_each_entry(rule, rules, list)
- if (rule->ifindex == dev->ifindex)
- rule->ifindex = -1;
+ list_for_each_entry(rule, rules, list) {
+ if (rule->iifindex == dev->ifindex) {
+ WRITE_ONCE(rule->iifindex, -1);
+ WRITE_ONCE(rule->iif_is_l3_master, false);
+ }
+ if (rule->oifindex == dev->ifindex) {
+ WRITE_ONCE(rule->oifindex, -1);
+ WRITE_ONCE(rule->oif_is_l3_master, false);
+ }
+ }
}
static int fib_rules_event(struct notifier_block *this, unsigned long event,
- void *ptr)
+ void *ptr)
{
- struct net_device *dev = ptr;
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct net *net = dev_net(dev);
struct fib_rules_ops *ops;
ASSERT_RTNL();
- rcu_read_lock();
switch (event) {
case NETDEV_REGISTER:
- list_for_each_entry(ops, &rules_ops, list)
- attach_rules(ops->rules_list, dev);
+ list_for_each_entry(ops, &net->rules_ops, list)
+ attach_rules(&ops->rules_list, dev);
+ break;
+
+ case NETDEV_CHANGENAME:
+ list_for_each_entry(ops, &net->rules_ops, list) {
+ detach_rules(&ops->rules_list, dev);
+ attach_rules(&ops->rules_list, dev);
+ }
break;
case NETDEV_UNREGISTER:
- list_for_each_entry(ops, &rules_ops, list)
- detach_rules(ops->rules_list, dev);
+ list_for_each_entry(ops, &net->rules_ops, list)
+ detach_rules(&ops->rules_list, dev);
break;
}
- rcu_read_unlock();
-
return NOTIFY_DONE;
}
@@ -465,9 +1426,53 @@ static struct notifier_block fib_rules_notifier = {
.notifier_call = fib_rules_event,
};
+static int __net_init fib_rules_net_init(struct net *net)
+{
+ INIT_LIST_HEAD(&net->rules_ops);
+ spin_lock_init(&net->rules_mod_lock);
+ return 0;
+}
+
+static void __net_exit fib_rules_net_exit(struct net *net)
+{
+ WARN_ON_ONCE(!list_empty(&net->rules_ops));
+}
+
+static struct pernet_operations fib_rules_net_ops = {
+ .init = fib_rules_net_init,
+ .exit = fib_rules_net_exit,
+};
+
+static const struct rtnl_msg_handler fib_rules_rtnl_msg_handlers[] __initconst = {
+ {.msgtype = RTM_NEWRULE, .doit = fib_nl_newrule,
+ .flags = RTNL_FLAG_DOIT_PERNET},
+ {.msgtype = RTM_DELRULE, .doit = fib_nl_delrule,
+ .flags = RTNL_FLAG_DOIT_PERNET},
+ {.msgtype = RTM_GETRULE, .dumpit = fib_nl_dumprule,
+ .flags = RTNL_FLAG_DUMP_UNLOCKED},
+};
+
static int __init fib_rules_init(void)
{
- return register_netdevice_notifier(&fib_rules_notifier);
+ int err;
+
+ rtnl_register_many(fib_rules_rtnl_msg_handlers);
+
+ err = register_pernet_subsys(&fib_rules_net_ops);
+ if (err < 0)
+ goto fail;
+
+ err = register_netdevice_notifier(&fib_rules_notifier);
+ if (err < 0)
+ goto fail_unregister;
+
+ return 0;
+
+fail_unregister:
+ unregister_pernet_subsys(&fib_rules_net_ops);
+fail:
+ rtnl_unregister_many(fib_rules_rtnl_msg_handlers);
+ return err;
}
subsys_initcall(fib_rules_init);
diff --git a/net/core/filter.c b/net/core/filter.c
index 0df843b667f4..616e0520a0bb 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1,284 +1,1071 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Linux Socket Filter - Kernel level socket filtering
*
- * Author:
- * Jay Schulist <jschlst@samba.org>
+ * Based on the design of the Berkeley Packet Filter. The new
+ * internal format has been designed by PLUMgrid:
*
- * Based on the design of:
- * - The Berkeley Packet Filter
+ * Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
+ * Authors:
+ *
+ * Jay Schulist <jschlst@samba.org>
+ * Alexei Starovoitov <ast@plumgrid.com>
+ * Daniel Borkmann <dborkman@redhat.com>
*
* Andi Kleen - Fix a few bad bugs and races.
- * Kris Katterjohn - Added many additional checks in sk_chk_filter()
+ * Kris Katterjohn - Added many additional checks in bpf_check_classic()
*/
+#include <linux/atomic.h>
+#include <linux/bpf_verifier.h>
#include <linux/module.h>
#include <linux/types.h>
-#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
+#include <linux/sock_diag.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
+#include <linux/if_arp.h>
+#include <linux/gfp.h>
+#include <net/inet_common.h>
#include <net/ip.h>
#include <net/protocol.h>
+#include <net/netlink.h>
#include <linux/skbuff.h>
+#include <linux/skmsg.h>
#include <net/sock.h>
+#include <net/flow_dissector.h>
#include <linux/errno.h>
#include <linux/timer.h>
-#include <asm/system.h>
-#include <asm/uaccess.h>
-#include <asm/unaligned.h>
+#include <linux/uaccess.h>
+#include <linux/unaligned.h>
#include <linux/filter.h>
+#include <linux/ratelimit.h>
+#include <linux/seccomp.h>
+#include <linux/if_vlan.h>
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <net/sch_generic.h>
+#include <net/cls_cgroup.h>
+#include <net/dst_metadata.h>
+#include <net/dst.h>
+#include <net/sock_reuseport.h>
+#include <net/busy_poll.h>
+#include <net/tcp.h>
+#include <net/xfrm.h>
+#include <net/udp.h>
+#include <linux/bpf_trace.h>
+#include <net/xdp_sock.h>
+#include <linux/inetdevice.h>
+#include <net/inet_hashtables.h>
+#include <net/inet6_hashtables.h>
+#include <net/ip_fib.h>
+#include <net/nexthop.h>
+#include <net/flow.h>
+#include <net/arp.h>
+#include <net/ipv6.h>
+#include <net/net_namespace.h>
+#include <linux/seg6_local.h>
+#include <net/seg6.h>
+#include <net/seg6_local.h>
+#include <net/lwtunnel.h>
+#include <net/ipv6_stubs.h>
+#include <net/bpf_sk_storage.h>
+#include <net/transp_v6.h>
+#include <linux/btf_ids.h>
+#include <net/tls.h>
+#include <net/xdp.h>
+#include <net/mptcp.h>
+#include <net/netfilter/nf_conntrack_bpf.h>
+#include <net/netkit.h>
+#include <linux/un.h>
+#include <net/xdp_sock_drv.h>
+#include <net/inet_dscp.h>
-/* No hurry in this branch */
-static void *__load_pointer(struct sk_buff *skb, int k)
-{
- u8 *ptr = NULL;
+#include "dev.h"
- if (k >= SKF_NET_OFF)
- ptr = skb->nh.raw + k - SKF_NET_OFF;
- else if (k >= SKF_LL_OFF)
- ptr = skb->mac.raw + k - SKF_LL_OFF;
+/* Keep the struct bpf_fib_lookup small so that it fits into a cacheline */
+static_assert(sizeof(struct bpf_fib_lookup) == 64, "struct bpf_fib_lookup size check");
- if (ptr >= skb->head && ptr < skb->tail)
- return ptr;
- return NULL;
-}
+static const struct bpf_func_proto *
+bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog);
-static inline void *load_pointer(struct sk_buff *skb, int k,
- unsigned int size, void *buffer)
+int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len)
{
- if (k >= 0)
- return skb_header_pointer(skb, k, size, buffer);
- else {
- if (k >= SKF_AD_OFF)
- return NULL;
- return __load_pointer(skb, k);
+ if (in_compat_syscall()) {
+ struct compat_sock_fprog f32;
+
+ if (len != sizeof(f32))
+ return -EINVAL;
+ if (copy_from_sockptr(&f32, src, sizeof(f32)))
+ return -EFAULT;
+ memset(dst, 0, sizeof(*dst));
+ dst->len = f32.len;
+ dst->filter = compat_ptr(f32.filter);
+ } else {
+ if (len != sizeof(*dst))
+ return -EINVAL;
+ if (copy_from_sockptr(dst, src, sizeof(*dst)))
+ return -EFAULT;
}
+
+ return 0;
}
+EXPORT_SYMBOL_GPL(copy_bpf_fprog_from_user);
/**
- * sk_run_filter - run a filter on a socket
- * @skb: buffer to run the filter on
- * @filter: filter to apply
- * @flen: length of filter
+ * sk_filter_trim_cap - run a packet through a socket filter
+ * @sk: sock associated with &sk_buff
+ * @skb: buffer to filter
+ * @cap: limit on how short the eBPF program may trim the packet
+ * @reason: record drop reason on errors (negative return value)
+ *
+ * Run the eBPF program and then cut skb->data to correct size returned by
+ * the program. If pkt_len is 0 we toss packet. If skb->len is smaller
+ * than pkt_len we keep whole skb->data. This is the socket level
+ * wrapper to bpf_prog_run. It returns 0 if the packet should
+ * be accepted or -EPERM if the packet should be tossed.
*
- * Decode and apply filter instructions to the skb->data.
- * Return length to keep, 0 for none. skb is the data we are
- * filtering, filter is the array of filter instructions, and
- * len is the number of filter blocks in the array.
*/
-unsigned int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen)
+int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb,
+ unsigned int cap, enum skb_drop_reason *reason)
{
- struct sock_filter *fentry; /* We walk down these */
- void *ptr;
- u32 A = 0; /* Accumulator */
- u32 X = 0; /* Index Register */
- u32 mem[BPF_MEMWORDS]; /* Scratch Memory Store */
- u32 tmp;
- int k;
- int pc;
+ int err;
+ struct sk_filter *filter;
/*
- * Process array of filter instructions.
+ * If the skb was allocated from pfmemalloc reserves, only
+ * allow SOCK_MEMALLOC sockets to use it as this socket is
+ * helping free memory
*/
- for (pc = 0; pc < flen; pc++) {
- fentry = &filter[pc];
-
- switch (fentry->code) {
- case BPF_ALU|BPF_ADD|BPF_X:
- A += X;
- continue;
- case BPF_ALU|BPF_ADD|BPF_K:
- A += fentry->k;
- continue;
- case BPF_ALU|BPF_SUB|BPF_X:
- A -= X;
- continue;
- case BPF_ALU|BPF_SUB|BPF_K:
- A -= fentry->k;
- continue;
- case BPF_ALU|BPF_MUL|BPF_X:
- A *= X;
- continue;
- case BPF_ALU|BPF_MUL|BPF_K:
- A *= fentry->k;
- continue;
- case BPF_ALU|BPF_DIV|BPF_X:
- if (X == 0)
- return 0;
- A /= X;
- continue;
- case BPF_ALU|BPF_DIV|BPF_K:
- A /= fentry->k;
- continue;
- case BPF_ALU|BPF_AND|BPF_X:
- A &= X;
- continue;
- case BPF_ALU|BPF_AND|BPF_K:
- A &= fentry->k;
- continue;
- case BPF_ALU|BPF_OR|BPF_X:
- A |= X;
- continue;
- case BPF_ALU|BPF_OR|BPF_K:
- A |= fentry->k;
- continue;
- case BPF_ALU|BPF_LSH|BPF_X:
- A <<= X;
- continue;
- case BPF_ALU|BPF_LSH|BPF_K:
- A <<= fentry->k;
- continue;
- case BPF_ALU|BPF_RSH|BPF_X:
- A >>= X;
- continue;
- case BPF_ALU|BPF_RSH|BPF_K:
- A >>= fentry->k;
- continue;
- case BPF_ALU|BPF_NEG:
- A = -A;
- continue;
- case BPF_JMP|BPF_JA:
- pc += fentry->k;
- continue;
- case BPF_JMP|BPF_JGT|BPF_K:
- pc += (A > fentry->k) ? fentry->jt : fentry->jf;
- continue;
- case BPF_JMP|BPF_JGE|BPF_K:
- pc += (A >= fentry->k) ? fentry->jt : fentry->jf;
- continue;
- case BPF_JMP|BPF_JEQ|BPF_K:
- pc += (A == fentry->k) ? fentry->jt : fentry->jf;
- continue;
- case BPF_JMP|BPF_JSET|BPF_K:
- pc += (A & fentry->k) ? fentry->jt : fentry->jf;
- continue;
- case BPF_JMP|BPF_JGT|BPF_X:
- pc += (A > X) ? fentry->jt : fentry->jf;
- continue;
- case BPF_JMP|BPF_JGE|BPF_X:
- pc += (A >= X) ? fentry->jt : fentry->jf;
- continue;
- case BPF_JMP|BPF_JEQ|BPF_X:
- pc += (A == X) ? fentry->jt : fentry->jf;
- continue;
- case BPF_JMP|BPF_JSET|BPF_X:
- pc += (A & X) ? fentry->jt : fentry->jf;
- continue;
- case BPF_LD|BPF_W|BPF_ABS:
- k = fentry->k;
-load_w:
- ptr = load_pointer(skb, k, 4, &tmp);
- if (ptr != NULL) {
- A = ntohl(get_unaligned((__be32 *)ptr));
- continue;
- }
+ if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP);
+ *reason = SKB_DROP_REASON_PFMEMALLOC;
+ return -ENOMEM;
+ }
+ err = BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb);
+ if (err) {
+ *reason = SKB_DROP_REASON_SOCKET_FILTER;
+ return err;
+ }
+
+ err = security_sock_rcv_skb(sk, skb);
+ if (err) {
+ *reason = SKB_DROP_REASON_SECURITY_HOOK;
+ return err;
+ }
+
+ rcu_read_lock();
+ filter = rcu_dereference(sk->sk_filter);
+ if (filter) {
+ struct sock *save_sk = skb->sk;
+ unsigned int pkt_len;
+
+ skb->sk = sk;
+ pkt_len = bpf_prog_run_save_cb(filter->prog, skb);
+ skb->sk = save_sk;
+ err = pkt_len ? pskb_trim(skb, max(cap, pkt_len)) : -EPERM;
+ if (err)
+ *reason = SKB_DROP_REASON_SOCKET_FILTER;
+ }
+ rcu_read_unlock();
+
+ return err;
+}
+EXPORT_SYMBOL(sk_filter_trim_cap);
+
+BPF_CALL_1(bpf_skb_get_pay_offset, struct sk_buff *, skb)
+{
+ return skb_get_poff(skb);
+}
+
+BPF_CALL_3(bpf_skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x)
+{
+ struct nlattr *nla;
+
+ if (skb_is_nonlinear(skb))
+ return 0;
+
+ if (skb->len < sizeof(struct nlattr))
+ return 0;
+
+ if (a > skb->len - sizeof(struct nlattr))
+ return 0;
+
+ nla = nla_find((struct nlattr *) &skb->data[a], skb->len - a, x);
+ if (nla)
+ return (void *) nla - (void *) skb->data;
+
+ return 0;
+}
+
+BPF_CALL_3(bpf_skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x)
+{
+ struct nlattr *nla;
+
+ if (skb_is_nonlinear(skb))
+ return 0;
+
+ if (skb->len < sizeof(struct nlattr))
+ return 0;
+
+ if (a > skb->len - sizeof(struct nlattr))
+ return 0;
+
+ nla = (struct nlattr *) &skb->data[a];
+ if (!nla_ok(nla, skb->len - a))
+ return 0;
+
+ nla = nla_find_nested(nla, x);
+ if (nla)
+ return (void *) nla - (void *) skb->data;
+
+ return 0;
+}
+
+static int bpf_skb_load_helper_convert_offset(const struct sk_buff *skb, int offset)
+{
+ if (likely(offset >= 0))
+ return offset;
+
+ if (offset >= SKF_NET_OFF)
+ return offset - SKF_NET_OFF + skb_network_offset(skb);
+
+ if (offset >= SKF_LL_OFF && skb_mac_header_was_set(skb))
+ return offset - SKF_LL_OFF + skb_mac_offset(skb);
+
+ return INT_MIN;
+}
+
+BPF_CALL_4(bpf_skb_load_helper_8, const struct sk_buff *, skb, const void *,
+ data, int, headlen, int, offset)
+{
+ u8 tmp;
+ const int len = sizeof(tmp);
+
+ offset = bpf_skb_load_helper_convert_offset(skb, offset);
+ if (offset == INT_MIN)
+ return -EFAULT;
+
+ if (headlen - offset >= len)
+ return *(u8 *)(data + offset);
+ if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp)))
+ return tmp;
+ else
+ return -EFAULT;
+}
+
+BPF_CALL_2(bpf_skb_load_helper_8_no_cache, const struct sk_buff *, skb,
+ int, offset)
+{
+ return ____bpf_skb_load_helper_8(skb, skb->data, skb->len - skb->data_len,
+ offset);
+}
+
+BPF_CALL_4(bpf_skb_load_helper_16, const struct sk_buff *, skb, const void *,
+ data, int, headlen, int, offset)
+{
+ __be16 tmp;
+ const int len = sizeof(tmp);
+
+ offset = bpf_skb_load_helper_convert_offset(skb, offset);
+ if (offset == INT_MIN)
+ return -EFAULT;
+
+ if (headlen - offset >= len)
+ return get_unaligned_be16(data + offset);
+ if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp)))
+ return be16_to_cpu(tmp);
+ else
+ return -EFAULT;
+}
+
+BPF_CALL_2(bpf_skb_load_helper_16_no_cache, const struct sk_buff *, skb,
+ int, offset)
+{
+ return ____bpf_skb_load_helper_16(skb, skb->data, skb->len - skb->data_len,
+ offset);
+}
+
+BPF_CALL_4(bpf_skb_load_helper_32, const struct sk_buff *, skb, const void *,
+ data, int, headlen, int, offset)
+{
+ __be32 tmp;
+ const int len = sizeof(tmp);
+
+ offset = bpf_skb_load_helper_convert_offset(skb, offset);
+ if (offset == INT_MIN)
+ return -EFAULT;
+
+ if (headlen - offset >= len)
+ return get_unaligned_be32(data + offset);
+ if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp)))
+ return be32_to_cpu(tmp);
+ else
+ return -EFAULT;
+}
+
+BPF_CALL_2(bpf_skb_load_helper_32_no_cache, const struct sk_buff *, skb,
+ int, offset)
+{
+ return ____bpf_skb_load_helper_32(skb, skb->data, skb->len - skb->data_len,
+ offset);
+}
+
+static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg,
+ struct bpf_insn *insn_buf)
+{
+ struct bpf_insn *insn = insn_buf;
+
+ switch (skb_field) {
+ case SKF_AD_MARK:
+ BUILD_BUG_ON(sizeof_field(struct sk_buff, mark) != 4);
+
+ *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+ offsetof(struct sk_buff, mark));
+ break;
+
+ case SKF_AD_PKTTYPE:
+ *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_TYPE_OFFSET);
+ *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, PKT_TYPE_MAX);
+#ifdef __BIG_ENDIAN_BITFIELD
+ *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 5);
+#endif
+ break;
+
+ case SKF_AD_QUEUE:
+ BUILD_BUG_ON(sizeof_field(struct sk_buff, queue_mapping) != 2);
+
+ *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
+ offsetof(struct sk_buff, queue_mapping));
+ break;
+
+ case SKF_AD_VLAN_TAG:
+ BUILD_BUG_ON(sizeof_field(struct sk_buff, vlan_tci) != 2);
+
+ /* dst_reg = *(u16 *) (src_reg + offsetof(vlan_tci)) */
+ *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
+ offsetof(struct sk_buff, vlan_tci));
+ break;
+ case SKF_AD_VLAN_TAG_PRESENT:
+ BUILD_BUG_ON(sizeof_field(struct sk_buff, vlan_all) != 4);
+ *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+ offsetof(struct sk_buff, vlan_all));
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, dst_reg, 0, 1);
+ *insn++ = BPF_ALU32_IMM(BPF_MOV, dst_reg, 1);
+ break;
+ }
+
+ return insn - insn_buf;
+}
+
+static bool convert_bpf_extensions(struct sock_filter *fp,
+ struct bpf_insn **insnp)
+{
+ struct bpf_insn *insn = *insnp;
+ u32 cnt;
+
+ switch (fp->k) {
+ case SKF_AD_OFF + SKF_AD_PROTOCOL:
+ BUILD_BUG_ON(sizeof_field(struct sk_buff, protocol) != 2);
+
+ /* A = *(u16 *) (CTX + offsetof(protocol)) */
+ *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX,
+ offsetof(struct sk_buff, protocol));
+ /* A = ntohs(A) [emitting a nop or swap16] */
+ *insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16);
+ break;
+
+ case SKF_AD_OFF + SKF_AD_PKTTYPE:
+ cnt = convert_skb_access(SKF_AD_PKTTYPE, BPF_REG_A, BPF_REG_CTX, insn);
+ insn += cnt - 1;
+ break;
+
+ case SKF_AD_OFF + SKF_AD_IFINDEX:
+ case SKF_AD_OFF + SKF_AD_HATYPE:
+ BUILD_BUG_ON(sizeof_field(struct net_device, ifindex) != 4);
+ BUILD_BUG_ON(sizeof_field(struct net_device, type) != 2);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
+ BPF_REG_TMP, BPF_REG_CTX,
+ offsetof(struct sk_buff, dev));
+ /* if (tmp != 0) goto pc + 1 */
+ *insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_TMP, 0, 1);
+ *insn++ = BPF_EXIT_INSN();
+ if (fp->k == SKF_AD_OFF + SKF_AD_IFINDEX)
+ *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_TMP,
+ offsetof(struct net_device, ifindex));
+ else
+ *insn = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_TMP,
+ offsetof(struct net_device, type));
+ break;
+
+ case SKF_AD_OFF + SKF_AD_MARK:
+ cnt = convert_skb_access(SKF_AD_MARK, BPF_REG_A, BPF_REG_CTX, insn);
+ insn += cnt - 1;
+ break;
+
+ case SKF_AD_OFF + SKF_AD_RXHASH:
+ BUILD_BUG_ON(sizeof_field(struct sk_buff, hash) != 4);
+
+ *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX,
+ offsetof(struct sk_buff, hash));
+ break;
+
+ case SKF_AD_OFF + SKF_AD_QUEUE:
+ cnt = convert_skb_access(SKF_AD_QUEUE, BPF_REG_A, BPF_REG_CTX, insn);
+ insn += cnt - 1;
+ break;
+
+ case SKF_AD_OFF + SKF_AD_VLAN_TAG:
+ cnt = convert_skb_access(SKF_AD_VLAN_TAG,
+ BPF_REG_A, BPF_REG_CTX, insn);
+ insn += cnt - 1;
+ break;
+
+ case SKF_AD_OFF + SKF_AD_VLAN_TAG_PRESENT:
+ cnt = convert_skb_access(SKF_AD_VLAN_TAG_PRESENT,
+ BPF_REG_A, BPF_REG_CTX, insn);
+ insn += cnt - 1;
+ break;
+
+ case SKF_AD_OFF + SKF_AD_VLAN_TPID:
+ BUILD_BUG_ON(sizeof_field(struct sk_buff, vlan_proto) != 2);
+
+ /* A = *(u16 *) (CTX + offsetof(vlan_proto)) */
+ *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX,
+ offsetof(struct sk_buff, vlan_proto));
+ /* A = ntohs(A) [emitting a nop or swap16] */
+ *insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16);
+ break;
+
+ case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
+ case SKF_AD_OFF + SKF_AD_NLATTR:
+ case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
+ case SKF_AD_OFF + SKF_AD_CPU:
+ case SKF_AD_OFF + SKF_AD_RANDOM:
+ /* arg1 = CTX */
+ *insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX);
+ /* arg2 = A */
+ *insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_A);
+ /* arg3 = X */
+ *insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_X);
+ /* Emit call(arg1=CTX, arg2=A, arg3=X) */
+ switch (fp->k) {
+ case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
+ *insn = BPF_EMIT_CALL(bpf_skb_get_pay_offset);
break;
- case BPF_LD|BPF_H|BPF_ABS:
- k = fentry->k;
-load_h:
- ptr = load_pointer(skb, k, 2, &tmp);
- if (ptr != NULL) {
- A = ntohs(get_unaligned((__be16 *)ptr));
- continue;
- }
+ case SKF_AD_OFF + SKF_AD_NLATTR:
+ *insn = BPF_EMIT_CALL(bpf_skb_get_nlattr);
+ break;
+ case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
+ *insn = BPF_EMIT_CALL(bpf_skb_get_nlattr_nest);
break;
- case BPF_LD|BPF_B|BPF_ABS:
- k = fentry->k;
-load_b:
- ptr = load_pointer(skb, k, 1, &tmp);
- if (ptr != NULL) {
- A = *(u8 *)ptr;
- continue;
+ case SKF_AD_OFF + SKF_AD_CPU:
+ *insn = BPF_EMIT_CALL(bpf_get_raw_cpu_id);
+ break;
+ case SKF_AD_OFF + SKF_AD_RANDOM:
+ *insn = BPF_EMIT_CALL(bpf_user_rnd_u32);
+ bpf_user_rnd_init_once();
+ break;
+ }
+ break;
+
+ case SKF_AD_OFF + SKF_AD_ALU_XOR_X:
+ /* A ^= X */
+ *insn = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_X);
+ break;
+
+ default:
+ /* This is just a dummy call to avoid letting the compiler
+ * evict __bpf_call_base() as an optimization. Placed here
+ * where no-one bothers.
+ */
+ BUG_ON(__bpf_call_base(0, 0, 0, 0, 0) != 0);
+ return false;
+ }
+
+ *insnp = insn;
+ return true;
+}
+
+static bool convert_bpf_ld_abs(struct sock_filter *fp, struct bpf_insn **insnp)
+{
+ const bool unaligned_ok = IS_BUILTIN(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS);
+ int size = bpf_size_to_bytes(BPF_SIZE(fp->code));
+ bool endian = BPF_SIZE(fp->code) == BPF_H ||
+ BPF_SIZE(fp->code) == BPF_W;
+ bool indirect = BPF_MODE(fp->code) == BPF_IND;
+ const int ip_align = NET_IP_ALIGN;
+ struct bpf_insn *insn = *insnp;
+ int offset = fp->k;
+
+ if (!indirect &&
+ ((unaligned_ok && offset >= 0) ||
+ (!unaligned_ok && offset >= 0 &&
+ offset + ip_align >= 0 &&
+ offset + ip_align % size == 0))) {
+ bool ldx_off_ok = offset <= S16_MAX;
+
+ *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_H);
+ if (offset)
+ *insn++ = BPF_ALU64_IMM(BPF_SUB, BPF_REG_TMP, offset);
+ *insn++ = BPF_JMP_IMM(BPF_JSLT, BPF_REG_TMP,
+ size, 2 + endian + (!ldx_off_ok * 2));
+ if (ldx_off_ok) {
+ *insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A,
+ BPF_REG_D, offset);
+ } else {
+ *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_D);
+ *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_TMP, offset);
+ *insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A,
+ BPF_REG_TMP, 0);
+ }
+ if (endian)
+ *insn++ = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, size * 8);
+ *insn++ = BPF_JMP_A(8);
+ }
+
+ *insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX);
+ *insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_D);
+ *insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_H);
+ if (!indirect) {
+ *insn++ = BPF_MOV64_IMM(BPF_REG_ARG4, offset);
+ } else {
+ *insn++ = BPF_MOV64_REG(BPF_REG_ARG4, BPF_REG_X);
+ if (fp->k)
+ *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_ARG4, offset);
+ }
+
+ switch (BPF_SIZE(fp->code)) {
+ case BPF_B:
+ *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_8);
+ break;
+ case BPF_H:
+ *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_16);
+ break;
+ case BPF_W:
+ *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_32);
+ break;
+ default:
+ return false;
+ }
+
+ *insn++ = BPF_JMP_IMM(BPF_JSGE, BPF_REG_A, 0, 2);
+ *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A);
+ *insn = BPF_EXIT_INSN();
+
+ *insnp = insn;
+ return true;
+}
+
+/**
+ * bpf_convert_filter - convert filter program
+ * @prog: the user passed filter program
+ * @len: the length of the user passed filter program
+ * @new_prog: allocated 'struct bpf_prog' or NULL
+ * @new_len: pointer to store length of converted program
+ * @seen_ld_abs: bool whether we've seen ld_abs/ind
+ *
+ * Remap 'sock_filter' style classic BPF (cBPF) instruction set to 'bpf_insn'
+ * style extended BPF (eBPF).
+ * Conversion workflow:
+ *
+ * 1) First pass for calculating the new program length:
+ * bpf_convert_filter(old_prog, old_len, NULL, &new_len, &seen_ld_abs)
+ *
+ * 2) 2nd pass to remap in two passes: 1st pass finds new
+ * jump offsets, 2nd pass remapping:
+ * bpf_convert_filter(old_prog, old_len, new_prog, &new_len, &seen_ld_abs)
+ */
+static int bpf_convert_filter(struct sock_filter *prog, int len,
+ struct bpf_prog *new_prog, int *new_len,
+ bool *seen_ld_abs)
+{
+ int new_flen = 0, pass = 0, target, i, stack_off;
+ struct bpf_insn *new_insn, *first_insn = NULL;
+ struct sock_filter *fp;
+ int *addrs = NULL;
+ u8 bpf_src;
+
+ BUILD_BUG_ON(BPF_MEMWORDS * sizeof(u32) > MAX_BPF_STACK);
+ BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG);
+
+ if (len <= 0 || len > BPF_MAXINSNS)
+ return -EINVAL;
+
+ if (new_prog) {
+ first_insn = new_prog->insnsi;
+ addrs = kcalloc(len, sizeof(*addrs),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!addrs)
+ return -ENOMEM;
+ }
+
+do_pass:
+ new_insn = first_insn;
+ fp = prog;
+
+ /* Classic BPF related prologue emission. */
+ if (new_prog) {
+ /* Classic BPF expects A and X to be reset first. These need
+ * to be guaranteed to be the first two instructions.
+ */
+ *new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A);
+ *new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_X, BPF_REG_X);
+
+ /* All programs must keep CTX in callee saved BPF_REG_CTX.
+ * In eBPF case it's done by the compiler, here we need to
+ * do this ourself. Initial CTX is present in BPF_REG_ARG1.
+ */
+ *new_insn++ = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1);
+ if (*seen_ld_abs) {
+ /* For packet access in classic BPF, cache skb->data
+ * in callee-saved BPF R8 and skb->len - skb->data_len
+ * (headlen) in BPF R9. Since classic BPF is read-only
+ * on CTX, we only need to cache it once.
+ */
+ *new_insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
+ BPF_REG_D, BPF_REG_CTX,
+ offsetof(struct sk_buff, data));
+ *new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_H, BPF_REG_CTX,
+ offsetof(struct sk_buff, len));
+ *new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_TMP, BPF_REG_CTX,
+ offsetof(struct sk_buff, data_len));
+ *new_insn++ = BPF_ALU32_REG(BPF_SUB, BPF_REG_H, BPF_REG_TMP);
+ }
+ } else {
+ new_insn += 3;
+ }
+
+ for (i = 0; i < len; fp++, i++) {
+ struct bpf_insn tmp_insns[32] = { };
+ struct bpf_insn *insn = tmp_insns;
+
+ if (addrs)
+ addrs[i] = new_insn - first_insn;
+
+ switch (fp->code) {
+ /* All arithmetic insns and skb loads map as-is. */
+ case BPF_ALU | BPF_ADD | BPF_X:
+ case BPF_ALU | BPF_ADD | BPF_K:
+ case BPF_ALU | BPF_SUB | BPF_X:
+ case BPF_ALU | BPF_SUB | BPF_K:
+ case BPF_ALU | BPF_AND | BPF_X:
+ case BPF_ALU | BPF_AND | BPF_K:
+ case BPF_ALU | BPF_OR | BPF_X:
+ case BPF_ALU | BPF_OR | BPF_K:
+ case BPF_ALU | BPF_LSH | BPF_X:
+ case BPF_ALU | BPF_LSH | BPF_K:
+ case BPF_ALU | BPF_RSH | BPF_X:
+ case BPF_ALU | BPF_RSH | BPF_K:
+ case BPF_ALU | BPF_XOR | BPF_X:
+ case BPF_ALU | BPF_XOR | BPF_K:
+ case BPF_ALU | BPF_MUL | BPF_X:
+ case BPF_ALU | BPF_MUL | BPF_K:
+ case BPF_ALU | BPF_DIV | BPF_X:
+ case BPF_ALU | BPF_DIV | BPF_K:
+ case BPF_ALU | BPF_MOD | BPF_X:
+ case BPF_ALU | BPF_MOD | BPF_K:
+ case BPF_ALU | BPF_NEG:
+ case BPF_LD | BPF_ABS | BPF_W:
+ case BPF_LD | BPF_ABS | BPF_H:
+ case BPF_LD | BPF_ABS | BPF_B:
+ case BPF_LD | BPF_IND | BPF_W:
+ case BPF_LD | BPF_IND | BPF_H:
+ case BPF_LD | BPF_IND | BPF_B:
+ /* Check for overloaded BPF extension and
+ * directly convert it if found, otherwise
+ * just move on with mapping.
+ */
+ if (BPF_CLASS(fp->code) == BPF_LD &&
+ BPF_MODE(fp->code) == BPF_ABS &&
+ convert_bpf_extensions(fp, &insn))
+ break;
+ if (BPF_CLASS(fp->code) == BPF_LD &&
+ convert_bpf_ld_abs(fp, &insn)) {
+ *seen_ld_abs = true;
+ break;
}
+
+ if (fp->code == (BPF_ALU | BPF_DIV | BPF_X) ||
+ fp->code == (BPF_ALU | BPF_MOD | BPF_X)) {
+ *insn++ = BPF_MOV32_REG(BPF_REG_X, BPF_REG_X);
+ /* Error with exception code on div/mod by 0.
+ * For cBPF programs, this was always return 0.
+ */
+ *insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_X, 0, 2);
+ *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A);
+ *insn++ = BPF_EXIT_INSN();
+ }
+
+ *insn = BPF_RAW_INSN(fp->code, BPF_REG_A, BPF_REG_X, 0, fp->k);
break;
- case BPF_LD|BPF_W|BPF_LEN:
- A = skb->len;
- continue;
- case BPF_LDX|BPF_W|BPF_LEN:
- X = skb->len;
- continue;
- case BPF_LD|BPF_W|BPF_IND:
- k = X + fentry->k;
- goto load_w;
- case BPF_LD|BPF_H|BPF_IND:
- k = X + fentry->k;
- goto load_h;
- case BPF_LD|BPF_B|BPF_IND:
- k = X + fentry->k;
- goto load_b;
- case BPF_LDX|BPF_B|BPF_MSH:
- ptr = load_pointer(skb, fentry->k, 1, &tmp);
- if (ptr != NULL) {
- X = (*(u8 *)ptr & 0xf) << 2;
- continue;
+
+ /* Jump transformation cannot use BPF block macros
+ * everywhere as offset calculation and target updates
+ * require a bit more work than the rest, i.e. jump
+ * opcodes map as-is, but offsets need adjustment.
+ */
+
+#define BPF_EMIT_JMP \
+ do { \
+ const s32 off_min = S16_MIN, off_max = S16_MAX; \
+ s32 off; \
+ \
+ if (target >= len || target < 0) \
+ goto err; \
+ off = addrs ? addrs[target] - addrs[i] - 1 : 0; \
+ /* Adjust pc relative offset for 2nd or 3rd insn. */ \
+ off -= insn - tmp_insns; \
+ /* Reject anything not fitting into insn->off. */ \
+ if (off < off_min || off > off_max) \
+ goto err; \
+ insn->off = off; \
+ } while (0)
+
+ case BPF_JMP | BPF_JA:
+ target = i + fp->k + 1;
+ insn->code = fp->code;
+ BPF_EMIT_JMP;
+ break;
+
+ case BPF_JMP | BPF_JEQ | BPF_K:
+ case BPF_JMP | BPF_JEQ | BPF_X:
+ case BPF_JMP | BPF_JSET | BPF_K:
+ case BPF_JMP | BPF_JSET | BPF_X:
+ case BPF_JMP | BPF_JGT | BPF_K:
+ case BPF_JMP | BPF_JGT | BPF_X:
+ case BPF_JMP | BPF_JGE | BPF_K:
+ case BPF_JMP | BPF_JGE | BPF_X:
+ if (BPF_SRC(fp->code) == BPF_K && (int) fp->k < 0) {
+ /* BPF immediates are signed, zero extend
+ * immediate into tmp register and use it
+ * in compare insn.
+ */
+ *insn++ = BPF_MOV32_IMM(BPF_REG_TMP, fp->k);
+
+ insn->dst_reg = BPF_REG_A;
+ insn->src_reg = BPF_REG_TMP;
+ bpf_src = BPF_X;
+ } else {
+ insn->dst_reg = BPF_REG_A;
+ insn->imm = fp->k;
+ bpf_src = BPF_SRC(fp->code);
+ insn->src_reg = bpf_src == BPF_X ? BPF_REG_X : 0;
}
- return 0;
- case BPF_LD|BPF_IMM:
- A = fentry->k;
- continue;
- case BPF_LDX|BPF_IMM:
- X = fentry->k;
- continue;
- case BPF_LD|BPF_MEM:
- A = mem[fentry->k];
- continue;
- case BPF_LDX|BPF_MEM:
- X = mem[fentry->k];
- continue;
- case BPF_MISC|BPF_TAX:
- X = A;
- continue;
- case BPF_MISC|BPF_TXA:
- A = X;
- continue;
- case BPF_RET|BPF_K:
- return fentry->k;
- case BPF_RET|BPF_A:
- return A;
+
+ /* Common case where 'jump_false' is next insn. */
+ if (fp->jf == 0) {
+ insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src;
+ target = i + fp->jt + 1;
+ BPF_EMIT_JMP;
+ break;
+ }
+
+ /* Convert some jumps when 'jump_true' is next insn. */
+ if (fp->jt == 0) {
+ switch (BPF_OP(fp->code)) {
+ case BPF_JEQ:
+ insn->code = BPF_JMP | BPF_JNE | bpf_src;
+ break;
+ case BPF_JGT:
+ insn->code = BPF_JMP | BPF_JLE | bpf_src;
+ break;
+ case BPF_JGE:
+ insn->code = BPF_JMP | BPF_JLT | bpf_src;
+ break;
+ default:
+ goto jmp_rest;
+ }
+
+ target = i + fp->jf + 1;
+ BPF_EMIT_JMP;
+ break;
+ }
+jmp_rest:
+ /* Other jumps are mapped into two insns: Jxx and JA. */
+ target = i + fp->jt + 1;
+ insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src;
+ BPF_EMIT_JMP;
+ insn++;
+
+ insn->code = BPF_JMP | BPF_JA;
+ target = i + fp->jf + 1;
+ BPF_EMIT_JMP;
+ break;
+
+ /* ldxb 4 * ([14] & 0xf) is remapped into 6 insns. */
+ case BPF_LDX | BPF_MSH | BPF_B: {
+ struct sock_filter tmp = {
+ .code = BPF_LD | BPF_ABS | BPF_B,
+ .k = fp->k,
+ };
+
+ *seen_ld_abs = true;
+
+ /* X = A */
+ *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
+ /* A = BPF_R0 = *(u8 *) (skb->data + K) */
+ convert_bpf_ld_abs(&tmp, &insn);
+ insn++;
+ /* A &= 0xf */
+ *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 0xf);
+ /* A <<= 2 */
+ *insn++ = BPF_ALU32_IMM(BPF_LSH, BPF_REG_A, 2);
+ /* tmp = X */
+ *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_X);
+ /* X = A */
+ *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
+ /* A = tmp */
+ *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP);
+ break;
+ }
+ /* RET_K is remapped into 2 insns. RET_A case doesn't need an
+ * extra mov as BPF_REG_0 is already mapped into BPF_REG_A.
+ */
+ case BPF_RET | BPF_A:
+ case BPF_RET | BPF_K:
+ if (BPF_RVAL(fp->code) == BPF_K)
+ *insn++ = BPF_MOV32_RAW(BPF_K, BPF_REG_0,
+ 0, fp->k);
+ *insn = BPF_EXIT_INSN();
+ break;
+
+ /* Store to stack. */
case BPF_ST:
- mem[fentry->k] = A;
- continue;
case BPF_STX:
- mem[fentry->k] = X;
- continue;
- default:
- WARN_ON(1);
- return 0;
- }
+ stack_off = fp->k * 4 + 4;
+ *insn = BPF_STX_MEM(BPF_W, BPF_REG_FP, BPF_CLASS(fp->code) ==
+ BPF_ST ? BPF_REG_A : BPF_REG_X,
+ -stack_off);
+ /* check_load_and_stores() verifies that classic BPF can
+ * load from stack only after write, so tracking
+ * stack_depth for ST|STX insns is enough
+ */
+ if (new_prog && new_prog->aux->stack_depth < stack_off)
+ new_prog->aux->stack_depth = stack_off;
+ break;
- /*
- * Handle ancillary data, which are impossible
- * (or very difficult) to get parsing packet contents.
- */
- switch (k-SKF_AD_OFF) {
- case SKF_AD_PROTOCOL:
- A = ntohs(skb->protocol);
- continue;
- case SKF_AD_PKTTYPE:
- A = skb->pkt_type;
- continue;
- case SKF_AD_IFINDEX:
- A = skb->dev->ifindex;
- continue;
+ /* Load from stack. */
+ case BPF_LD | BPF_MEM:
+ case BPF_LDX | BPF_MEM:
+ stack_off = fp->k * 4 + 4;
+ *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ?
+ BPF_REG_A : BPF_REG_X, BPF_REG_FP,
+ -stack_off);
+ break;
+
+ /* A = K or X = K */
+ case BPF_LD | BPF_IMM:
+ case BPF_LDX | BPF_IMM:
+ *insn = BPF_MOV32_IMM(BPF_CLASS(fp->code) == BPF_LD ?
+ BPF_REG_A : BPF_REG_X, fp->k);
+ break;
+
+ /* X = A */
+ case BPF_MISC | BPF_TAX:
+ *insn = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
+ break;
+
+ /* A = X */
+ case BPF_MISC | BPF_TXA:
+ *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_X);
+ break;
+
+ /* A = skb->len or X = skb->len */
+ case BPF_LD | BPF_W | BPF_LEN:
+ case BPF_LDX | BPF_W | BPF_LEN:
+ *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ?
+ BPF_REG_A : BPF_REG_X, BPF_REG_CTX,
+ offsetof(struct sk_buff, len));
+ break;
+
+ /* Access seccomp_data fields. */
+ case BPF_LDX | BPF_ABS | BPF_W:
+ /* A = *(u32 *) (ctx + K) */
+ *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, fp->k);
+ break;
+
+ /* Unknown instruction. */
default:
- return 0;
+ goto err;
}
+
+ insn++;
+ if (new_prog)
+ memcpy(new_insn, tmp_insns,
+ sizeof(*insn) * (insn - tmp_insns));
+ new_insn += insn - tmp_insns;
+ }
+
+ if (!new_prog) {
+ /* Only calculating new length. */
+ *new_len = new_insn - first_insn;
+ if (*seen_ld_abs)
+ *new_len += 4; /* Prologue bits. */
+ return 0;
}
+ pass++;
+ if (new_flen != new_insn - first_insn) {
+ new_flen = new_insn - first_insn;
+ if (pass > 2)
+ goto err;
+ goto do_pass;
+ }
+
+ kfree(addrs);
+ BUG_ON(*new_len != new_flen);
return 0;
+err:
+ kfree(addrs);
+ return -EINVAL;
+}
+
+/* Security:
+ *
+ * As we dont want to clear mem[] array for each packet going through
+ * __bpf_prog_run(), we check that filter loaded by user never try to read
+ * a cell if not previously written, and we check all branches to be sure
+ * a malicious user doesn't try to abuse us.
+ */
+static int check_load_and_stores(const struct sock_filter *filter, int flen)
+{
+ u16 *masks, memvalid = 0; /* One bit per cell, 16 cells */
+ int pc, ret = 0;
+
+ BUILD_BUG_ON(BPF_MEMWORDS > 16);
+
+ masks = kmalloc_array(flen, sizeof(*masks), GFP_KERNEL);
+ if (!masks)
+ return -ENOMEM;
+
+ memset(masks, 0xff, flen * sizeof(*masks));
+
+ for (pc = 0; pc < flen; pc++) {
+ memvalid &= masks[pc];
+
+ switch (filter[pc].code) {
+ case BPF_ST:
+ case BPF_STX:
+ memvalid |= (1 << filter[pc].k);
+ break;
+ case BPF_LD | BPF_MEM:
+ case BPF_LDX | BPF_MEM:
+ if (!(memvalid & (1 << filter[pc].k))) {
+ ret = -EINVAL;
+ goto error;
+ }
+ break;
+ case BPF_JMP | BPF_JA:
+ /* A jump must set masks on target */
+ masks[pc + 1 + filter[pc].k] &= memvalid;
+ memvalid = ~0;
+ break;
+ case BPF_JMP | BPF_JEQ | BPF_K:
+ case BPF_JMP | BPF_JEQ | BPF_X:
+ case BPF_JMP | BPF_JGE | BPF_K:
+ case BPF_JMP | BPF_JGE | BPF_X:
+ case BPF_JMP | BPF_JGT | BPF_K:
+ case BPF_JMP | BPF_JGT | BPF_X:
+ case BPF_JMP | BPF_JSET | BPF_K:
+ case BPF_JMP | BPF_JSET | BPF_X:
+ /* A jump must set masks on targets */
+ masks[pc + 1 + filter[pc].jt] &= memvalid;
+ masks[pc + 1 + filter[pc].jf] &= memvalid;
+ memvalid = ~0;
+ break;
+ }
+ }
+error:
+ kfree(masks);
+ return ret;
+}
+
+static bool chk_code_allowed(u16 code_to_probe)
+{
+ static const bool codes[] = {
+ /* 32 bit ALU operations */
+ [BPF_ALU | BPF_ADD | BPF_K] = true,
+ [BPF_ALU | BPF_ADD | BPF_X] = true,
+ [BPF_ALU | BPF_SUB | BPF_K] = true,
+ [BPF_ALU | BPF_SUB | BPF_X] = true,
+ [BPF_ALU | BPF_MUL | BPF_K] = true,
+ [BPF_ALU | BPF_MUL | BPF_X] = true,
+ [BPF_ALU | BPF_DIV | BPF_K] = true,
+ [BPF_ALU | BPF_DIV | BPF_X] = true,
+ [BPF_ALU | BPF_MOD | BPF_K] = true,
+ [BPF_ALU | BPF_MOD | BPF_X] = true,
+ [BPF_ALU | BPF_AND | BPF_K] = true,
+ [BPF_ALU | BPF_AND | BPF_X] = true,
+ [BPF_ALU | BPF_OR | BPF_K] = true,
+ [BPF_ALU | BPF_OR | BPF_X] = true,
+ [BPF_ALU | BPF_XOR | BPF_K] = true,
+ [BPF_ALU | BPF_XOR | BPF_X] = true,
+ [BPF_ALU | BPF_LSH | BPF_K] = true,
+ [BPF_ALU | BPF_LSH | BPF_X] = true,
+ [BPF_ALU | BPF_RSH | BPF_K] = true,
+ [BPF_ALU | BPF_RSH | BPF_X] = true,
+ [BPF_ALU | BPF_NEG] = true,
+ /* Load instructions */
+ [BPF_LD | BPF_W | BPF_ABS] = true,
+ [BPF_LD | BPF_H | BPF_ABS] = true,
+ [BPF_LD | BPF_B | BPF_ABS] = true,
+ [BPF_LD | BPF_W | BPF_LEN] = true,
+ [BPF_LD | BPF_W | BPF_IND] = true,
+ [BPF_LD | BPF_H | BPF_IND] = true,
+ [BPF_LD | BPF_B | BPF_IND] = true,
+ [BPF_LD | BPF_IMM] = true,
+ [BPF_LD | BPF_MEM] = true,
+ [BPF_LDX | BPF_W | BPF_LEN] = true,
+ [BPF_LDX | BPF_B | BPF_MSH] = true,
+ [BPF_LDX | BPF_IMM] = true,
+ [BPF_LDX | BPF_MEM] = true,
+ /* Store instructions */
+ [BPF_ST] = true,
+ [BPF_STX] = true,
+ /* Misc instructions */
+ [BPF_MISC | BPF_TAX] = true,
+ [BPF_MISC | BPF_TXA] = true,
+ /* Return instructions */
+ [BPF_RET | BPF_K] = true,
+ [BPF_RET | BPF_A] = true,
+ /* Jump instructions */
+ [BPF_JMP | BPF_JA] = true,
+ [BPF_JMP | BPF_JEQ | BPF_K] = true,
+ [BPF_JMP | BPF_JEQ | BPF_X] = true,
+ [BPF_JMP | BPF_JGE | BPF_K] = true,
+ [BPF_JMP | BPF_JGE | BPF_X] = true,
+ [BPF_JMP | BPF_JGT | BPF_K] = true,
+ [BPF_JMP | BPF_JGT | BPF_X] = true,
+ [BPF_JMP | BPF_JSET | BPF_K] = true,
+ [BPF_JMP | BPF_JSET | BPF_X] = true,
+ };
+
+ if (code_to_probe >= ARRAY_SIZE(codes))
+ return false;
+
+ return codes[code_to_probe];
+}
+
+static bool bpf_check_basics_ok(const struct sock_filter *filter,
+ unsigned int flen)
+{
+ if (filter == NULL)
+ return false;
+ if (flen == 0 || flen > BPF_MAXINSNS)
+ return false;
+
+ return true;
}
/**
- * sk_chk_filter - verify socket filter code
+ * bpf_check_classic - verify socket filter code
* @filter: filter to verify
* @flen: length of filter
*
@@ -291,100 +1078,467 @@ load_b:
*
* Returns 0 if the rule set is legal or -EINVAL if not.
*/
-int sk_chk_filter(struct sock_filter *filter, int flen)
+static int bpf_check_classic(const struct sock_filter *filter,
+ unsigned int flen)
{
- struct sock_filter *ftest;
+ bool anc_found;
int pc;
- if (flen == 0 || flen > BPF_MAXINSNS)
- return -EINVAL;
-
- /* check the filter code now */
+ /* Check the filter code now */
for (pc = 0; pc < flen; pc++) {
- ftest = &filter[pc];
+ const struct sock_filter *ftest = &filter[pc];
- /* Only allow valid instructions */
- switch (ftest->code) {
- case BPF_ALU|BPF_ADD|BPF_K:
- case BPF_ALU|BPF_ADD|BPF_X:
- case BPF_ALU|BPF_SUB|BPF_K:
- case BPF_ALU|BPF_SUB|BPF_X:
- case BPF_ALU|BPF_MUL|BPF_K:
- case BPF_ALU|BPF_MUL|BPF_X:
- case BPF_ALU|BPF_DIV|BPF_X:
- case BPF_ALU|BPF_AND|BPF_K:
- case BPF_ALU|BPF_AND|BPF_X:
- case BPF_ALU|BPF_OR|BPF_K:
- case BPF_ALU|BPF_OR|BPF_X:
- case BPF_ALU|BPF_LSH|BPF_K:
- case BPF_ALU|BPF_LSH|BPF_X:
- case BPF_ALU|BPF_RSH|BPF_K:
- case BPF_ALU|BPF_RSH|BPF_X:
- case BPF_ALU|BPF_NEG:
- case BPF_LD|BPF_W|BPF_ABS:
- case BPF_LD|BPF_H|BPF_ABS:
- case BPF_LD|BPF_B|BPF_ABS:
- case BPF_LD|BPF_W|BPF_LEN:
- case BPF_LD|BPF_W|BPF_IND:
- case BPF_LD|BPF_H|BPF_IND:
- case BPF_LD|BPF_B|BPF_IND:
- case BPF_LD|BPF_IMM:
- case BPF_LDX|BPF_W|BPF_LEN:
- case BPF_LDX|BPF_B|BPF_MSH:
- case BPF_LDX|BPF_IMM:
- case BPF_MISC|BPF_TAX:
- case BPF_MISC|BPF_TXA:
- case BPF_RET|BPF_K:
- case BPF_RET|BPF_A:
- break;
+ /* May we actually operate on this code? */
+ if (!chk_code_allowed(ftest->code))
+ return -EINVAL;
/* Some instructions need special checks */
-
- case BPF_ALU|BPF_DIV|BPF_K:
- /* check for division by zero */
+ switch (ftest->code) {
+ case BPF_ALU | BPF_DIV | BPF_K:
+ case BPF_ALU | BPF_MOD | BPF_K:
+ /* Check for division by zero */
if (ftest->k == 0)
return -EINVAL;
break;
-
- case BPF_LD|BPF_MEM:
- case BPF_LDX|BPF_MEM:
+ case BPF_ALU | BPF_LSH | BPF_K:
+ case BPF_ALU | BPF_RSH | BPF_K:
+ if (ftest->k >= 32)
+ return -EINVAL;
+ break;
+ case BPF_LD | BPF_MEM:
+ case BPF_LDX | BPF_MEM:
case BPF_ST:
case BPF_STX:
- /* check for invalid memory addresses */
+ /* Check for invalid memory addresses */
if (ftest->k >= BPF_MEMWORDS)
return -EINVAL;
break;
-
- case BPF_JMP|BPF_JA:
- /*
- * Note, the large ftest->k might cause loops.
+ case BPF_JMP | BPF_JA:
+ /* Note, the large ftest->k might cause loops.
* Compare this with conditional jumps below,
* where offsets are limited. --ANK (981016)
*/
- if (ftest->k >= (unsigned)(flen-pc-1))
+ if (ftest->k >= (unsigned int)(flen - pc - 1))
return -EINVAL;
break;
-
- case BPF_JMP|BPF_JEQ|BPF_K:
- case BPF_JMP|BPF_JEQ|BPF_X:
- case BPF_JMP|BPF_JGE|BPF_K:
- case BPF_JMP|BPF_JGE|BPF_X:
- case BPF_JMP|BPF_JGT|BPF_K:
- case BPF_JMP|BPF_JGT|BPF_X:
- case BPF_JMP|BPF_JSET|BPF_K:
- case BPF_JMP|BPF_JSET|BPF_X:
- /* for conditionals both must be safe */
+ case BPF_JMP | BPF_JEQ | BPF_K:
+ case BPF_JMP | BPF_JEQ | BPF_X:
+ case BPF_JMP | BPF_JGE | BPF_K:
+ case BPF_JMP | BPF_JGE | BPF_X:
+ case BPF_JMP | BPF_JGT | BPF_K:
+ case BPF_JMP | BPF_JGT | BPF_X:
+ case BPF_JMP | BPF_JSET | BPF_K:
+ case BPF_JMP | BPF_JSET | BPF_X:
+ /* Both conditionals must be safe */
if (pc + ftest->jt + 1 >= flen ||
pc + ftest->jf + 1 >= flen)
return -EINVAL;
break;
+ case BPF_LD | BPF_W | BPF_ABS:
+ case BPF_LD | BPF_H | BPF_ABS:
+ case BPF_LD | BPF_B | BPF_ABS:
+ anc_found = false;
+ if (bpf_anc_helper(ftest) & BPF_ANC)
+ anc_found = true;
+ /* Ancillary operation unknown or unsupported */
+ if (anc_found == false && ftest->k >= SKF_AD_OFF)
+ return -EINVAL;
+ }
+ }
- default:
- return -EINVAL;
+ /* Last instruction must be a RET code */
+ switch (filter[flen - 1].code) {
+ case BPF_RET | BPF_K:
+ case BPF_RET | BPF_A:
+ return check_load_and_stores(filter, flen);
+ }
+
+ return -EINVAL;
+}
+
+static int bpf_prog_store_orig_filter(struct bpf_prog *fp,
+ const struct sock_fprog *fprog)
+{
+ unsigned int fsize = bpf_classic_proglen(fprog);
+ struct sock_fprog_kern *fkprog;
+
+ fp->orig_prog = kmalloc(sizeof(*fkprog), GFP_KERNEL);
+ if (!fp->orig_prog)
+ return -ENOMEM;
+
+ fkprog = fp->orig_prog;
+ fkprog->len = fprog->len;
+
+ fkprog->filter = kmemdup(fp->insns, fsize,
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!fkprog->filter) {
+ kfree(fp->orig_prog);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void bpf_release_orig_filter(struct bpf_prog *fp)
+{
+ struct sock_fprog_kern *fprog = fp->orig_prog;
+
+ if (fprog) {
+ kfree(fprog->filter);
+ kfree(fprog);
+ }
+}
+
+static void __bpf_prog_release(struct bpf_prog *prog)
+{
+ if (prog->type == BPF_PROG_TYPE_SOCKET_FILTER) {
+ bpf_prog_put(prog);
+ } else {
+ bpf_release_orig_filter(prog);
+ bpf_prog_free(prog);
+ }
+}
+
+static void __sk_filter_release(struct sk_filter *fp)
+{
+ __bpf_prog_release(fp->prog);
+ kfree(fp);
+}
+
+/**
+ * sk_filter_release_rcu - Release a socket filter by rcu_head
+ * @rcu: rcu_head that contains the sk_filter to free
+ */
+static void sk_filter_release_rcu(struct rcu_head *rcu)
+{
+ struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu);
+
+ __sk_filter_release(fp);
+}
+
+/**
+ * sk_filter_release - release a socket filter
+ * @fp: filter to remove
+ *
+ * Remove a filter from a socket and release its resources.
+ */
+static void sk_filter_release(struct sk_filter *fp)
+{
+ if (refcount_dec_and_test(&fp->refcnt))
+ call_rcu(&fp->rcu, sk_filter_release_rcu);
+}
+
+void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp)
+{
+ u32 filter_size = bpf_prog_size(fp->prog->len);
+
+ atomic_sub(filter_size, &sk->sk_omem_alloc);
+ sk_filter_release(fp);
+}
+
+/* try to charge the socket memory if there is space available
+ * return true on success
+ */
+static bool __sk_filter_charge(struct sock *sk, struct sk_filter *fp)
+{
+ int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);
+ u32 filter_size = bpf_prog_size(fp->prog->len);
+
+ /* same check as in sock_kmalloc() */
+ if (filter_size <= optmem_max &&
+ atomic_read(&sk->sk_omem_alloc) + filter_size < optmem_max) {
+ atomic_add(filter_size, &sk->sk_omem_alloc);
+ return true;
+ }
+ return false;
+}
+
+bool sk_filter_charge(struct sock *sk, struct sk_filter *fp)
+{
+ if (!refcount_inc_not_zero(&fp->refcnt))
+ return false;
+
+ if (!__sk_filter_charge(sk, fp)) {
+ sk_filter_release(fp);
+ return false;
+ }
+ return true;
+}
+
+static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp)
+{
+ struct sock_filter *old_prog;
+ struct bpf_prog *old_fp;
+ int err, new_len, old_len = fp->len;
+ bool seen_ld_abs = false;
+
+ /* We are free to overwrite insns et al right here as it won't be used at
+ * this point in time anymore internally after the migration to the eBPF
+ * instruction representation.
+ */
+ BUILD_BUG_ON(sizeof(struct sock_filter) !=
+ sizeof(struct bpf_insn));
+
+ /* Conversion cannot happen on overlapping memory areas,
+ * so we need to keep the user BPF around until the 2nd
+ * pass. At this time, the user BPF is stored in fp->insns.
+ */
+ old_prog = kmemdup_array(fp->insns, old_len, sizeof(struct sock_filter),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!old_prog) {
+ err = -ENOMEM;
+ goto out_err;
+ }
+
+ /* 1st pass: calculate the new program length. */
+ err = bpf_convert_filter(old_prog, old_len, NULL, &new_len,
+ &seen_ld_abs);
+ if (err)
+ goto out_err_free;
+
+ /* Expand fp for appending the new filter representation. */
+ old_fp = fp;
+ fp = bpf_prog_realloc(old_fp, bpf_prog_size(new_len), 0);
+ if (!fp) {
+ /* The old_fp is still around in case we couldn't
+ * allocate new memory, so uncharge on that one.
+ */
+ fp = old_fp;
+ err = -ENOMEM;
+ goto out_err_free;
+ }
+
+ fp->len = new_len;
+
+ /* 2nd pass: remap sock_filter insns into bpf_insn insns. */
+ err = bpf_convert_filter(old_prog, old_len, fp, &new_len,
+ &seen_ld_abs);
+ if (err)
+ /* 2nd bpf_convert_filter() can fail only if it fails
+ * to allocate memory, remapping must succeed. Note,
+ * that at this time old_fp has already been released
+ * by krealloc().
+ */
+ goto out_err_free;
+
+ fp = bpf_prog_select_runtime(fp, &err);
+ if (err)
+ goto out_err_free;
+
+ kfree(old_prog);
+ return fp;
+
+out_err_free:
+ kfree(old_prog);
+out_err:
+ __bpf_prog_release(fp);
+ return ERR_PTR(err);
+}
+
+static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp,
+ bpf_aux_classic_check_t trans)
+{
+ int err;
+
+ fp->bpf_func = NULL;
+ fp->jited = 0;
+
+ err = bpf_check_classic(fp->insns, fp->len);
+ if (err) {
+ __bpf_prog_release(fp);
+ return ERR_PTR(err);
+ }
+
+ /* There might be additional checks and transformations
+ * needed on classic filters, f.e. in case of seccomp.
+ */
+ if (trans) {
+ err = trans(fp->insns, fp->len);
+ if (err) {
+ __bpf_prog_release(fp);
+ return ERR_PTR(err);
+ }
+ }
+
+ /* Probe if we can JIT compile the filter and if so, do
+ * the compilation of the filter.
+ */
+ bpf_jit_compile(fp);
+
+ /* JIT compiler couldn't process this filter, so do the eBPF translation
+ * for the optimized interpreter.
+ */
+ if (!fp->jited)
+ fp = bpf_migrate_filter(fp);
+
+ return fp;
+}
+
+/**
+ * bpf_prog_create - create an unattached filter
+ * @pfp: the unattached filter that is created
+ * @fprog: the filter program
+ *
+ * Create a filter independent of any socket. We first run some
+ * sanity checks on it to make sure it does not explode on us later.
+ * If an error occurs or there is insufficient memory for the filter
+ * a negative errno code is returned. On success the return is zero.
+ */
+int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog)
+{
+ unsigned int fsize = bpf_classic_proglen(fprog);
+ struct bpf_prog *fp;
+
+ /* Make sure new filter is there and in the right amounts. */
+ if (!bpf_check_basics_ok(fprog->filter, fprog->len))
+ return -EINVAL;
+
+ fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
+ if (!fp)
+ return -ENOMEM;
+
+ memcpy(fp->insns, fprog->filter, fsize);
+
+ fp->len = fprog->len;
+ /* Since unattached filters are not copied back to user
+ * space through sk_get_filter(), we do not need to hold
+ * a copy here, and can spare us the work.
+ */
+ fp->orig_prog = NULL;
+
+ /* bpf_prepare_filter() already takes care of freeing
+ * memory in case something goes wrong.
+ */
+ fp = bpf_prepare_filter(fp, NULL);
+ if (IS_ERR(fp))
+ return PTR_ERR(fp);
+
+ *pfp = fp;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(bpf_prog_create);
+
+/**
+ * bpf_prog_create_from_user - create an unattached filter from user buffer
+ * @pfp: the unattached filter that is created
+ * @fprog: the filter program
+ * @trans: post-classic verifier transformation handler
+ * @save_orig: save classic BPF program
+ *
+ * This function effectively does the same as bpf_prog_create(), only
+ * that it builds up its insns buffer from user space provided buffer.
+ * It also allows for passing a bpf_aux_classic_check_t handler.
+ */
+int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog,
+ bpf_aux_classic_check_t trans, bool save_orig)
+{
+ unsigned int fsize = bpf_classic_proglen(fprog);
+ struct bpf_prog *fp;
+ int err;
+
+ /* Make sure new filter is there and in the right amounts. */
+ if (!bpf_check_basics_ok(fprog->filter, fprog->len))
+ return -EINVAL;
+
+ fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
+ if (!fp)
+ return -ENOMEM;
+
+ if (copy_from_user(fp->insns, fprog->filter, fsize)) {
+ __bpf_prog_free(fp);
+ return -EFAULT;
+ }
+
+ fp->len = fprog->len;
+ fp->orig_prog = NULL;
+
+ if (save_orig) {
+ err = bpf_prog_store_orig_filter(fp, fprog);
+ if (err) {
+ __bpf_prog_free(fp);
+ return -ENOMEM;
}
}
- return (BPF_CLASS(filter[flen - 1].code) == BPF_RET) ? 0 : -EINVAL;
+ /* bpf_prepare_filter() already takes care of freeing
+ * memory in case something goes wrong.
+ */
+ fp = bpf_prepare_filter(fp, trans);
+ if (IS_ERR(fp))
+ return PTR_ERR(fp);
+
+ *pfp = fp;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(bpf_prog_create_from_user);
+
+void bpf_prog_destroy(struct bpf_prog *fp)
+{
+ __bpf_prog_release(fp);
+}
+EXPORT_SYMBOL_GPL(bpf_prog_destroy);
+
+static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk)
+{
+ struct sk_filter *fp, *old_fp;
+
+ fp = kmalloc(sizeof(*fp), GFP_KERNEL);
+ if (!fp)
+ return -ENOMEM;
+
+ fp->prog = prog;
+
+ if (!__sk_filter_charge(sk, fp)) {
+ kfree(fp);
+ return -ENOMEM;
+ }
+ refcount_set(&fp->refcnt, 1);
+
+ old_fp = rcu_dereference_protected(sk->sk_filter,
+ lockdep_sock_is_held(sk));
+ rcu_assign_pointer(sk->sk_filter, fp);
+
+ if (old_fp)
+ sk_filter_uncharge(sk, old_fp);
+
+ return 0;
+}
+
+static
+struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk)
+{
+ unsigned int fsize = bpf_classic_proglen(fprog);
+ struct bpf_prog *prog;
+ int err;
+
+ if (sock_flag(sk, SOCK_FILTER_LOCKED))
+ return ERR_PTR(-EPERM);
+
+ /* Make sure new filter is there and in the right amounts. */
+ if (!bpf_check_basics_ok(fprog->filter, fprog->len))
+ return ERR_PTR(-EINVAL);
+
+ prog = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
+ if (!prog)
+ return ERR_PTR(-ENOMEM);
+
+ if (copy_from_user(prog->insns, fprog->filter, fsize)) {
+ __bpf_prog_free(prog);
+ return ERR_PTR(-EFAULT);
+ }
+
+ prog->len = fprog->len;
+
+ err = bpf_prog_store_orig_filter(prog, fprog);
+ if (err) {
+ __bpf_prog_free(prog);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ /* bpf_prepare_filter() already takes care of freeing
+ * memory in case something goes wrong.
+ */
+ return bpf_prepare_filter(prog, NULL);
}
/**
@@ -399,40 +1553,11026 @@ int sk_chk_filter(struct sock_filter *filter, int flen)
*/
int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
{
- struct sk_filter *fp;
- unsigned int fsize = sizeof(struct sock_filter) * fprog->len;
+ struct bpf_prog *prog = __get_filter(fprog, sk);
int err;
- /* Make sure new filter is there and in the right amounts. */
- if (fprog->filter == NULL)
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ err = __sk_attach_prog(prog, sk);
+ if (err < 0) {
+ __bpf_prog_release(prog);
+ return err;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(sk_attach_filter);
+
+int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk)
+{
+ struct bpf_prog *prog = __get_filter(fprog, sk);
+ int err, optmem_max;
+
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);
+ if (bpf_prog_size(prog->len) > optmem_max)
+ err = -ENOMEM;
+ else
+ err = reuseport_attach_prog(sk, prog);
+
+ if (err)
+ __bpf_prog_release(prog);
+
+ return err;
+}
+
+static struct bpf_prog *__get_bpf(u32 ufd, struct sock *sk)
+{
+ if (sock_flag(sk, SOCK_FILTER_LOCKED))
+ return ERR_PTR(-EPERM);
+
+ return bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER);
+}
+
+int sk_attach_bpf(u32 ufd, struct sock *sk)
+{
+ struct bpf_prog *prog = __get_bpf(ufd, sk);
+ int err;
+
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ err = __sk_attach_prog(prog, sk);
+ if (err < 0) {
+ bpf_prog_put(prog);
+ return err;
+ }
+
+ return 0;
+}
+
+int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk)
+{
+ struct bpf_prog *prog;
+ int err, optmem_max;
+
+ if (sock_flag(sk, SOCK_FILTER_LOCKED))
+ return -EPERM;
+
+ prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER);
+ if (PTR_ERR(prog) == -EINVAL)
+ prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SK_REUSEPORT);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT) {
+ /* Like other non BPF_PROG_TYPE_SOCKET_FILTER
+ * bpf prog (e.g. sockmap). It depends on the
+ * limitation imposed by bpf_prog_load().
+ * Hence, sysctl_optmem_max is not checked.
+ */
+ if ((sk->sk_type != SOCK_STREAM &&
+ sk->sk_type != SOCK_DGRAM) ||
+ (sk->sk_protocol != IPPROTO_UDP &&
+ sk->sk_protocol != IPPROTO_TCP) ||
+ (sk->sk_family != AF_INET &&
+ sk->sk_family != AF_INET6)) {
+ err = -ENOTSUPP;
+ goto err_prog_put;
+ }
+ } else {
+ /* BPF_PROG_TYPE_SOCKET_FILTER */
+ optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);
+ if (bpf_prog_size(prog->len) > optmem_max) {
+ err = -ENOMEM;
+ goto err_prog_put;
+ }
+ }
+
+ err = reuseport_attach_prog(sk, prog);
+err_prog_put:
+ if (err)
+ bpf_prog_put(prog);
+
+ return err;
+}
+
+void sk_reuseport_prog_free(struct bpf_prog *prog)
+{
+ if (!prog)
+ return;
+
+ if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT)
+ bpf_prog_put(prog);
+ else
+ bpf_prog_destroy(prog);
+}
+
+static inline int __bpf_try_make_writable(struct sk_buff *skb,
+ unsigned int write_len)
+{
+#ifdef CONFIG_DEBUG_NET
+ /* Avoid a splat in pskb_may_pull_reason() */
+ if (write_len > INT_MAX)
return -EINVAL;
+#endif
+ return skb_ensure_writable(skb, write_len);
+}
- fp = sock_kmalloc(sk, fsize+sizeof(*fp), GFP_KERNEL);
- if (!fp)
+static inline int bpf_try_make_writable(struct sk_buff *skb,
+ unsigned int write_len)
+{
+ int err = __bpf_try_make_writable(skb, write_len);
+
+ bpf_compute_data_pointers(skb);
+ return err;
+}
+
+static int bpf_try_make_head_writable(struct sk_buff *skb)
+{
+ return bpf_try_make_writable(skb, skb_headlen(skb));
+}
+
+static inline void bpf_push_mac_rcsum(struct sk_buff *skb)
+{
+ if (skb_at_tc_ingress(skb))
+ skb_postpush_rcsum(skb, skb_mac_header(skb), skb->mac_len);
+}
+
+static inline void bpf_pull_mac_rcsum(struct sk_buff *skb)
+{
+ if (skb_at_tc_ingress(skb))
+ skb_postpull_rcsum(skb, skb_mac_header(skb), skb->mac_len);
+}
+
+BPF_CALL_5(bpf_skb_store_bytes, struct sk_buff *, skb, u32, offset,
+ const void *, from, u32, len, u64, flags)
+{
+ void *ptr;
+
+ if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH)))
+ return -EINVAL;
+ if (unlikely(offset > INT_MAX))
+ return -EFAULT;
+ if (unlikely(bpf_try_make_writable(skb, offset + len)))
+ return -EFAULT;
+
+ ptr = skb->data + offset;
+ if (flags & BPF_F_RECOMPUTE_CSUM)
+ __skb_postpull_rcsum(skb, ptr, len, offset);
+
+ memcpy(ptr, from, len);
+
+ if (flags & BPF_F_RECOMPUTE_CSUM)
+ __skb_postpush_rcsum(skb, ptr, len, offset);
+ if (flags & BPF_F_INVALIDATE_HASH)
+ skb_clear_hash(skb);
+
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_skb_store_bytes_proto = {
+ .func = bpf_skb_store_bytes,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg4_type = ARG_CONST_SIZE,
+ .arg5_type = ARG_ANYTHING,
+};
+
+int __bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from,
+ u32 len, u64 flags)
+{
+ return ____bpf_skb_store_bytes(skb, offset, from, len, flags);
+}
+
+BPF_CALL_4(bpf_skb_load_bytes, const struct sk_buff *, skb, u32, offset,
+ void *, to, u32, len)
+{
+ void *ptr;
+
+ if (unlikely(offset > INT_MAX))
+ goto err_clear;
+
+ ptr = skb_header_pointer(skb, offset, len, to);
+ if (unlikely(!ptr))
+ goto err_clear;
+ if (ptr != to)
+ memcpy(to, ptr, len);
+
+ return 0;
+err_clear:
+ memset(to, 0, len);
+ return -EFAULT;
+}
+
+static const struct bpf_func_proto bpf_skb_load_bytes_proto = {
+ .func = bpf_skb_load_bytes,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg4_type = ARG_CONST_SIZE,
+};
+
+int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len)
+{
+ return ____bpf_skb_load_bytes(skb, offset, to, len);
+}
+
+BPF_CALL_4(bpf_flow_dissector_load_bytes,
+ const struct bpf_flow_dissector *, ctx, u32, offset,
+ void *, to, u32, len)
+{
+ void *ptr;
+
+ if (unlikely(offset > 0xffff))
+ goto err_clear;
+
+ if (unlikely(!ctx->skb))
+ goto err_clear;
+
+ ptr = skb_header_pointer(ctx->skb, offset, len, to);
+ if (unlikely(!ptr))
+ goto err_clear;
+ if (ptr != to)
+ memcpy(to, ptr, len);
+
+ return 0;
+err_clear:
+ memset(to, 0, len);
+ return -EFAULT;
+}
+
+static const struct bpf_func_proto bpf_flow_dissector_load_bytes_proto = {
+ .func = bpf_flow_dissector_load_bytes,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg4_type = ARG_CONST_SIZE,
+};
+
+BPF_CALL_5(bpf_skb_load_bytes_relative, const struct sk_buff *, skb,
+ u32, offset, void *, to, u32, len, u32, start_header)
+{
+ u8 *end = skb_tail_pointer(skb);
+ u8 *start, *ptr;
+
+ if (unlikely(offset > 0xffff))
+ goto err_clear;
+
+ switch (start_header) {
+ case BPF_HDR_START_MAC:
+ if (unlikely(!skb_mac_header_was_set(skb)))
+ goto err_clear;
+ start = skb_mac_header(skb);
+ break;
+ case BPF_HDR_START_NET:
+ start = skb_network_header(skb);
+ break;
+ default:
+ goto err_clear;
+ }
+
+ ptr = start + offset;
+
+ if (likely(ptr + len <= end)) {
+ memcpy(to, ptr, len);
+ return 0;
+ }
+
+err_clear:
+ memset(to, 0, len);
+ return -EFAULT;
+}
+
+static const struct bpf_func_proto bpf_skb_load_bytes_relative_proto = {
+ .func = bpf_skb_load_bytes_relative,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg4_type = ARG_CONST_SIZE,
+ .arg5_type = ARG_ANYTHING,
+};
+
+BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len)
+{
+ /* Idea is the following: should the needed direct read/write
+ * test fail during runtime, we can pull in more data and redo
+ * again, since implicitly, we invalidate previous checks here.
+ *
+ * Or, since we know how much we need to make read/writeable,
+ * this can be done once at the program beginning for direct
+ * access case. By this we overcome limitations of only current
+ * headroom being accessible.
+ */
+ return bpf_try_make_writable(skb, len ? : skb_headlen(skb));
+}
+
+static const struct bpf_func_proto bpf_skb_pull_data_proto = {
+ .func = bpf_skb_pull_data,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+};
+
+BPF_CALL_1(bpf_sk_fullsock, struct sock *, sk)
+{
+ return sk_fullsock(sk) ? (unsigned long)sk : (unsigned long)NULL;
+}
+
+static const struct bpf_func_proto bpf_sk_fullsock_proto = {
+ .func = bpf_sk_fullsock,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_SOCKET_OR_NULL,
+ .arg1_type = ARG_PTR_TO_SOCK_COMMON,
+};
+
+static inline int sk_skb_try_make_writable(struct sk_buff *skb,
+ unsigned int write_len)
+{
+ return __bpf_try_make_writable(skb, write_len);
+}
+
+BPF_CALL_2(sk_skb_pull_data, struct sk_buff *, skb, u32, len)
+{
+ /* Idea is the following: should the needed direct read/write
+ * test fail during runtime, we can pull in more data and redo
+ * again, since implicitly, we invalidate previous checks here.
+ *
+ * Or, since we know how much we need to make read/writeable,
+ * this can be done once at the program beginning for direct
+ * access case. By this we overcome limitations of only current
+ * headroom being accessible.
+ */
+ return sk_skb_try_make_writable(skb, len ? : skb_headlen(skb));
+}
+
+static const struct bpf_func_proto sk_skb_pull_data_proto = {
+ .func = sk_skb_pull_data,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+};
+
+BPF_CALL_5(bpf_l3_csum_replace, struct sk_buff *, skb, u32, offset,
+ u64, from, u64, to, u64, flags)
+{
+ __sum16 *ptr;
+
+ if (unlikely(flags & ~(BPF_F_HDR_FIELD_MASK)))
+ return -EINVAL;
+ if (unlikely(offset > 0xffff || offset & 1))
+ return -EFAULT;
+ if (unlikely(bpf_try_make_writable(skb, offset + sizeof(*ptr))))
+ return -EFAULT;
+
+ ptr = (__sum16 *)(skb->data + offset);
+ switch (flags & BPF_F_HDR_FIELD_MASK) {
+ case 0:
+ if (unlikely(from != 0))
+ return -EINVAL;
+
+ csum_replace_by_diff(ptr, to);
+ break;
+ case 2:
+ csum_replace2(ptr, from, to);
+ break;
+ case 4:
+ csum_replace4(ptr, from, to);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_l3_csum_replace_proto = {
+ .func = bpf_l3_csum_replace,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_ANYTHING,
+ .arg5_type = ARG_ANYTHING,
+};
+
+BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset,
+ u64, from, u64, to, u64, flags)
+{
+ bool is_pseudo = flags & BPF_F_PSEUDO_HDR;
+ bool is_mmzero = flags & BPF_F_MARK_MANGLED_0;
+ bool do_mforce = flags & BPF_F_MARK_ENFORCE;
+ bool is_ipv6 = flags & BPF_F_IPV6;
+ __sum16 *ptr;
+
+ if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_MARK_ENFORCE |
+ BPF_F_PSEUDO_HDR | BPF_F_HDR_FIELD_MASK | BPF_F_IPV6)))
+ return -EINVAL;
+ if (unlikely(offset > 0xffff || offset & 1))
+ return -EFAULT;
+ if (unlikely(bpf_try_make_writable(skb, offset + sizeof(*ptr))))
+ return -EFAULT;
+
+ ptr = (__sum16 *)(skb->data + offset);
+ if (is_mmzero && !do_mforce && !*ptr)
+ return 0;
+
+ switch (flags & BPF_F_HDR_FIELD_MASK) {
+ case 0:
+ if (unlikely(from != 0))
+ return -EINVAL;
+
+ inet_proto_csum_replace_by_diff(ptr, skb, to, is_pseudo, is_ipv6);
+ break;
+ case 2:
+ inet_proto_csum_replace2(ptr, skb, from, to, is_pseudo);
+ break;
+ case 4:
+ inet_proto_csum_replace4(ptr, skb, from, to, is_pseudo);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (is_mmzero && !*ptr)
+ *ptr = CSUM_MANGLED_0;
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_l4_csum_replace_proto = {
+ .func = bpf_l4_csum_replace,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_ANYTHING,
+ .arg5_type = ARG_ANYTHING,
+};
+
+BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size,
+ __be32 *, to, u32, to_size, __wsum, seed)
+{
+ /* This is quite flexible, some examples:
+ *
+ * from_size == 0, to_size > 0, seed := csum --> pushing data
+ * from_size > 0, to_size == 0, seed := csum --> pulling data
+ * from_size > 0, to_size > 0, seed := 0 --> diffing data
+ *
+ * Even for diffing, from_size and to_size don't need to be equal.
+ */
+
+ __wsum ret = seed;
+
+ if (from_size && to_size)
+ ret = csum_sub(csum_partial(to, to_size, ret),
+ csum_partial(from, from_size, 0));
+ else if (to_size)
+ ret = csum_partial(to, to_size, ret);
+
+ else if (from_size)
+ ret = ~csum_partial(from, from_size, ~ret);
+
+ return csum_from32to16((__force unsigned int)ret);
+}
+
+static const struct bpf_func_proto bpf_csum_diff_proto = {
+ .func = bpf_csum_diff,
+ .gpl_only = false,
+ .pkt_access = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
+ .arg2_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg3_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
+ .arg4_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg5_type = ARG_ANYTHING,
+};
+
+BPF_CALL_2(bpf_csum_update, struct sk_buff *, skb, __wsum, csum)
+{
+ /* The interface is to be used in combination with bpf_csum_diff()
+ * for direct packet writes. csum rotation for alignment as well
+ * as emulating csum_sub() can be done from the eBPF program.
+ */
+ if (skb->ip_summed == CHECKSUM_COMPLETE)
+ return (skb->csum = csum_add(skb->csum, csum));
+
+ return -ENOTSUPP;
+}
+
+static const struct bpf_func_proto bpf_csum_update_proto = {
+ .func = bpf_csum_update,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+};
+
+BPF_CALL_2(bpf_csum_level, struct sk_buff *, skb, u64, level)
+{
+ /* The interface is to be used in combination with bpf_skb_adjust_room()
+ * for encap/decap of packet headers when BPF_F_ADJ_ROOM_NO_CSUM_RESET
+ * is passed as flags, for example.
+ */
+ switch (level) {
+ case BPF_CSUM_LEVEL_INC:
+ __skb_incr_checksum_unnecessary(skb);
+ break;
+ case BPF_CSUM_LEVEL_DEC:
+ __skb_decr_checksum_unnecessary(skb);
+ break;
+ case BPF_CSUM_LEVEL_RESET:
+ __skb_reset_checksum_unnecessary(skb);
+ break;
+ case BPF_CSUM_LEVEL_QUERY:
+ return skb->ip_summed == CHECKSUM_UNNECESSARY ?
+ skb->csum_level : -EACCES;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_csum_level_proto = {
+ .func = bpf_csum_level,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+};
+
+static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb)
+{
+ return dev_forward_skb_nomtu(dev, skb);
+}
+
+static inline int __bpf_rx_skb_no_mac(struct net_device *dev,
+ struct sk_buff *skb)
+{
+ int ret = ____dev_forward_skb(dev, skb, false);
+
+ if (likely(!ret)) {
+ skb->dev = dev;
+ ret = netif_rx(skb);
+ }
+
+ return ret;
+}
+
+static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
+{
+ int ret;
+
+ if (dev_xmit_recursion()) {
+ net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
+ kfree_skb(skb);
+ return -ENETDOWN;
+ }
+
+ skb->dev = dev;
+ skb_set_redirected_noclear(skb, skb_at_tc_ingress(skb));
+ skb_clear_tstamp(skb);
+
+ dev_xmit_recursion_inc();
+ ret = dev_queue_xmit(skb);
+ dev_xmit_recursion_dec();
+
+ return ret;
+}
+
+static int __bpf_redirect_no_mac(struct sk_buff *skb, struct net_device *dev,
+ u32 flags)
+{
+ unsigned int mlen = skb_network_offset(skb);
+
+ if (unlikely(skb->len <= mlen)) {
+ kfree_skb(skb);
+ return -ERANGE;
+ }
+
+ if (mlen) {
+ __skb_pull(skb, mlen);
+
+ /* At ingress, the mac header has already been pulled once.
+ * At egress, skb_pospull_rcsum has to be done in case that
+ * the skb is originated from ingress (i.e. a forwarded skb)
+ * to ensure that rcsum starts at net header.
+ */
+ if (!skb_at_tc_ingress(skb))
+ skb_postpull_rcsum(skb, skb_mac_header(skb), mlen);
+ }
+ skb_pop_mac_header(skb);
+ skb_reset_mac_len(skb);
+ return flags & BPF_F_INGRESS ?
+ __bpf_rx_skb_no_mac(dev, skb) : __bpf_tx_skb(dev, skb);
+}
+
+static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev,
+ u32 flags)
+{
+ /* Verify that a link layer header is carried */
+ if (unlikely(skb->mac_header >= skb->network_header || skb->len == 0)) {
+ kfree_skb(skb);
+ return -ERANGE;
+ }
+
+ bpf_push_mac_rcsum(skb);
+ return flags & BPF_F_INGRESS ?
+ __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb);
+}
+
+static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev,
+ u32 flags)
+{
+ if (dev_is_mac_header_xmit(dev))
+ return __bpf_redirect_common(skb, dev, flags);
+ else
+ return __bpf_redirect_no_mac(skb, dev, flags);
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb,
+ struct net_device *dev, struct bpf_nh_params *nh)
+{
+ u32 hh_len = LL_RESERVED_SPACE(dev);
+ const struct in6_addr *nexthop;
+ struct dst_entry *dst = NULL;
+ struct neighbour *neigh;
+
+ if (dev_xmit_recursion()) {
+ net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
+ goto out_drop;
+ }
+
+ skb->dev = dev;
+ skb_clear_tstamp(skb);
+
+ if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
+ skb = skb_expand_head(skb, hh_len);
+ if (!skb)
+ return -ENOMEM;
+ }
+
+ rcu_read_lock();
+ if (!nh) {
+ dst = skb_dst(skb);
+ nexthop = rt6_nexthop(dst_rt6_info(dst),
+ &ipv6_hdr(skb)->daddr);
+ } else {
+ nexthop = &nh->ipv6_nh;
+ }
+ neigh = ip_neigh_gw6(dev, nexthop);
+ if (likely(!IS_ERR(neigh))) {
+ int ret;
+
+ sock_confirm_neigh(skb, neigh);
+ local_bh_disable();
+ dev_xmit_recursion_inc();
+ ret = neigh_output(neigh, skb, false);
+ dev_xmit_recursion_dec();
+ local_bh_enable();
+ rcu_read_unlock();
+ return ret;
+ }
+ rcu_read_unlock();
+ if (dst)
+ IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
+out_drop:
+ kfree_skb(skb);
+ return -ENETDOWN;
+}
+
+static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev,
+ struct bpf_nh_params *nh)
+{
+ const struct ipv6hdr *ip6h = ipv6_hdr(skb);
+ struct net *net = dev_net(dev);
+ int err, ret = NET_XMIT_DROP;
+
+ if (!nh) {
+ struct dst_entry *dst;
+ struct flowi6 fl6 = {
+ .flowi6_flags = FLOWI_FLAG_ANYSRC,
+ .flowi6_mark = skb->mark,
+ .flowlabel = ip6_flowinfo(ip6h),
+ .flowi6_oif = dev->ifindex,
+ .flowi6_proto = ip6h->nexthdr,
+ .daddr = ip6h->daddr,
+ .saddr = ip6h->saddr,
+ };
+
+ dst = ipv6_stub->ipv6_dst_lookup_flow(net, NULL, &fl6, NULL);
+ if (IS_ERR(dst))
+ goto out_drop;
+
+ skb_dst_drop(skb);
+ skb_dst_set(skb, dst);
+ } else if (nh->nh_family != AF_INET6) {
+ goto out_drop;
+ }
+
+ err = bpf_out_neigh_v6(net, skb, dev, nh);
+ if (unlikely(net_xmit_eval(err)))
+ DEV_STATS_INC(dev, tx_errors);
+ else
+ ret = NET_XMIT_SUCCESS;
+ goto out_xmit;
+out_drop:
+ DEV_STATS_INC(dev, tx_errors);
+ kfree_skb(skb);
+out_xmit:
+ return ret;
+}
+#else
+static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev,
+ struct bpf_nh_params *nh)
+{
+ kfree_skb(skb);
+ return NET_XMIT_DROP;
+}
+#endif /* CONFIG_IPV6 */
+
+#if IS_ENABLED(CONFIG_INET)
+static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb,
+ struct net_device *dev, struct bpf_nh_params *nh)
+{
+ u32 hh_len = LL_RESERVED_SPACE(dev);
+ struct neighbour *neigh;
+ bool is_v6gw = false;
+
+ if (dev_xmit_recursion()) {
+ net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
+ goto out_drop;
+ }
+
+ skb->dev = dev;
+ skb_clear_tstamp(skb);
+
+ if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
+ skb = skb_expand_head(skb, hh_len);
+ if (!skb)
+ return -ENOMEM;
+ }
+
+ rcu_read_lock();
+ if (!nh) {
+ struct rtable *rt = skb_rtable(skb);
+
+ neigh = ip_neigh_for_gw(rt, skb, &is_v6gw);
+ } else if (nh->nh_family == AF_INET6) {
+ neigh = ip_neigh_gw6(dev, &nh->ipv6_nh);
+ is_v6gw = true;
+ } else if (nh->nh_family == AF_INET) {
+ neigh = ip_neigh_gw4(dev, nh->ipv4_nh);
+ } else {
+ rcu_read_unlock();
+ goto out_drop;
+ }
+
+ if (likely(!IS_ERR(neigh))) {
+ int ret;
+
+ sock_confirm_neigh(skb, neigh);
+ local_bh_disable();
+ dev_xmit_recursion_inc();
+ ret = neigh_output(neigh, skb, is_v6gw);
+ dev_xmit_recursion_dec();
+ local_bh_enable();
+ rcu_read_unlock();
+ return ret;
+ }
+ rcu_read_unlock();
+out_drop:
+ kfree_skb(skb);
+ return -ENETDOWN;
+}
+
+static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev,
+ struct bpf_nh_params *nh)
+{
+ const struct iphdr *ip4h = ip_hdr(skb);
+ struct net *net = dev_net(dev);
+ int err, ret = NET_XMIT_DROP;
+
+ if (!nh) {
+ struct flowi4 fl4 = {
+ .flowi4_flags = FLOWI_FLAG_ANYSRC,
+ .flowi4_mark = skb->mark,
+ .flowi4_dscp = ip4h_dscp(ip4h),
+ .flowi4_oif = dev->ifindex,
+ .flowi4_proto = ip4h->protocol,
+ .daddr = ip4h->daddr,
+ .saddr = ip4h->saddr,
+ };
+ struct rtable *rt;
+
+ rt = ip_route_output_flow(net, &fl4, NULL);
+ if (IS_ERR(rt))
+ goto out_drop;
+ if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) {
+ ip_rt_put(rt);
+ goto out_drop;
+ }
+
+ skb_dst_drop(skb);
+ skb_dst_set(skb, &rt->dst);
+ }
+
+ err = bpf_out_neigh_v4(net, skb, dev, nh);
+ if (unlikely(net_xmit_eval(err)))
+ DEV_STATS_INC(dev, tx_errors);
+ else
+ ret = NET_XMIT_SUCCESS;
+ goto out_xmit;
+out_drop:
+ DEV_STATS_INC(dev, tx_errors);
+ kfree_skb(skb);
+out_xmit:
+ return ret;
+}
+#else
+static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev,
+ struct bpf_nh_params *nh)
+{
+ kfree_skb(skb);
+ return NET_XMIT_DROP;
+}
+#endif /* CONFIG_INET */
+
+static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev,
+ struct bpf_nh_params *nh)
+{
+ struct ethhdr *ethh = eth_hdr(skb);
+
+ if (unlikely(skb->mac_header >= skb->network_header))
+ goto out;
+ bpf_push_mac_rcsum(skb);
+ if (is_multicast_ether_addr(ethh->h_dest))
+ goto out;
+
+ skb_pull(skb, sizeof(*ethh));
+ skb_unset_mac_header(skb);
+ skb_reset_network_header(skb);
+
+ if (skb->protocol == htons(ETH_P_IP))
+ return __bpf_redirect_neigh_v4(skb, dev, nh);
+ else if (skb->protocol == htons(ETH_P_IPV6))
+ return __bpf_redirect_neigh_v6(skb, dev, nh);
+out:
+ kfree_skb(skb);
+ return -ENOTSUPP;
+}
+
+/* Internal, non-exposed redirect flags. */
+enum {
+ BPF_F_NEIGH = (1ULL << 16),
+ BPF_F_PEER = (1ULL << 17),
+ BPF_F_NEXTHOP = (1ULL << 18),
+#define BPF_F_REDIRECT_INTERNAL (BPF_F_NEIGH | BPF_F_PEER | BPF_F_NEXTHOP)
+};
+
+BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags)
+{
+ struct net_device *dev;
+ struct sk_buff *clone;
+ int ret;
+
+ BUILD_BUG_ON(BPF_F_REDIRECT_INTERNAL & BPF_F_REDIRECT_FLAGS);
+
+ if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL)))
+ return -EINVAL;
+
+ /* BPF test infra's convert___skb_to_skb() can create type-less
+ * GSO packets. gso_features_check() will detect this as a bad
+ * offload. However, lets not leak them out in the first place.
+ */
+ if (unlikely(skb_is_gso(skb) && !skb_shinfo(skb)->gso_type))
+ return -EBADMSG;
+
+ dev = dev_get_by_index_rcu(dev_net(skb->dev), ifindex);
+ if (unlikely(!dev))
+ return -EINVAL;
+
+ clone = skb_clone(skb, GFP_ATOMIC);
+ if (unlikely(!clone))
return -ENOMEM;
- if (copy_from_user(fp->insns, fprog->filter, fsize)) {
- sock_kfree_s(sk, fp, fsize+sizeof(*fp));
+
+ /* For direct write, we need to keep the invariant that the skbs
+ * we're dealing with need to be uncloned. Should uncloning fail
+ * here, we need to free the just generated clone to unclone once
+ * again.
+ */
+ ret = bpf_try_make_head_writable(skb);
+ if (unlikely(ret)) {
+ kfree_skb(clone);
+ return -ENOMEM;
+ }
+
+ return __bpf_redirect(clone, dev, flags);
+}
+
+static const struct bpf_func_proto bpf_clone_redirect_proto = {
+ .func = bpf_clone_redirect,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+};
+
+static struct net_device *skb_get_peer_dev(struct net_device *dev)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ if (likely(ops->ndo_get_peer_dev))
+ return INDIRECT_CALL_1(ops->ndo_get_peer_dev,
+ netkit_peer_dev, dev);
+ return NULL;
+}
+
+int skb_do_redirect(struct sk_buff *skb)
+{
+ struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
+ struct net *net = dev_net(skb->dev);
+ struct net_device *dev;
+ u32 flags = ri->flags;
+
+ dev = dev_get_by_index_rcu(net, ri->tgt_index);
+ ri->tgt_index = 0;
+ ri->flags = 0;
+ if (unlikely(!dev))
+ goto out_drop;
+ if (flags & BPF_F_PEER) {
+ if (unlikely(!skb_at_tc_ingress(skb)))
+ goto out_drop;
+ dev = skb_get_peer_dev(dev);
+ if (unlikely(!dev ||
+ !(dev->flags & IFF_UP) ||
+ net_eq(net, dev_net(dev))))
+ goto out_drop;
+ skb->dev = dev;
+ dev_sw_netstats_rx_add(dev, skb->len);
+ skb_scrub_packet(skb, false);
+ return -EAGAIN;
+ }
+ return flags & BPF_F_NEIGH ?
+ __bpf_redirect_neigh(skb, dev, flags & BPF_F_NEXTHOP ?
+ &ri->nh : NULL) :
+ __bpf_redirect(skb, dev, flags);
+out_drop:
+ kfree_skb(skb);
+ return -EINVAL;
+}
+
+BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags)
+{
+ struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
+
+ if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL)))
+ return TC_ACT_SHOT;
+
+ ri->flags = flags;
+ ri->tgt_index = ifindex;
+
+ return TC_ACT_REDIRECT;
+}
+
+static const struct bpf_func_proto bpf_redirect_proto = {
+ .func = bpf_redirect,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_ANYTHING,
+ .arg2_type = ARG_ANYTHING,
+};
+
+BPF_CALL_2(bpf_redirect_peer, u32, ifindex, u64, flags)
+{
+ struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
+
+ if (unlikely(flags))
+ return TC_ACT_SHOT;
+
+ ri->flags = BPF_F_PEER;
+ ri->tgt_index = ifindex;
+
+ return TC_ACT_REDIRECT;
+}
+
+static const struct bpf_func_proto bpf_redirect_peer_proto = {
+ .func = bpf_redirect_peer,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_ANYTHING,
+ .arg2_type = ARG_ANYTHING,
+};
+
+BPF_CALL_4(bpf_redirect_neigh, u32, ifindex, struct bpf_redir_neigh *, params,
+ int, plen, u64, flags)
+{
+ struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
+
+ if (unlikely((plen && plen < sizeof(*params)) || flags))
+ return TC_ACT_SHOT;
+
+ ri->flags = BPF_F_NEIGH | (plen ? BPF_F_NEXTHOP : 0);
+ ri->tgt_index = ifindex;
+
+ BUILD_BUG_ON(sizeof(struct bpf_redir_neigh) != sizeof(struct bpf_nh_params));
+ if (plen)
+ memcpy(&ri->nh, params, sizeof(ri->nh));
+
+ return TC_ACT_REDIRECT;
+}
+
+static const struct bpf_func_proto bpf_redirect_neigh_proto = {
+ .func = bpf_redirect_neigh,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_ANYTHING,
+ .arg2_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg4_type = ARG_ANYTHING,
+};
+
+BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg *, msg, u32, bytes)
+{
+ msg->apply_bytes = bytes;
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_msg_apply_bytes_proto = {
+ .func = bpf_msg_apply_bytes,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+};
+
+BPF_CALL_2(bpf_msg_cork_bytes, struct sk_msg *, msg, u32, bytes)
+{
+ msg->cork_bytes = bytes;
+ return 0;
+}
+
+static void sk_msg_reset_curr(struct sk_msg *msg)
+{
+ if (!msg->sg.size) {
+ msg->sg.curr = msg->sg.start;
+ msg->sg.copybreak = 0;
+ } else {
+ u32 i = msg->sg.end;
+
+ sk_msg_iter_var_prev(i);
+ msg->sg.curr = i;
+ msg->sg.copybreak = msg->sg.data[i].length;
+ }
+}
+
+static const struct bpf_func_proto bpf_msg_cork_bytes_proto = {
+ .func = bpf_msg_cork_bytes,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+};
+
+BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start,
+ u32, end, u64, flags)
+{
+ u32 len = 0, offset = 0, copy = 0, poffset = 0, bytes = end - start;
+ u32 first_sge, last_sge, i, shift, bytes_sg_total;
+ struct scatterlist *sge;
+ u8 *raw, *to, *from;
+ struct page *page;
+
+ if (unlikely(flags || end <= start))
+ return -EINVAL;
+
+ /* First find the starting scatterlist element */
+ i = msg->sg.start;
+ do {
+ offset += len;
+ len = sk_msg_elem(msg, i)->length;
+ if (start < offset + len)
+ break;
+ sk_msg_iter_var_next(i);
+ } while (i != msg->sg.end);
+
+ if (unlikely(start >= offset + len))
+ return -EINVAL;
+
+ first_sge = i;
+ /* The start may point into the sg element so we need to also
+ * account for the headroom.
+ */
+ bytes_sg_total = start - offset + bytes;
+ if (!test_bit(i, msg->sg.copy) && bytes_sg_total <= len)
+ goto out;
+
+ /* At this point we need to linearize multiple scatterlist
+ * elements or a single shared page. Either way we need to
+ * copy into a linear buffer exclusively owned by BPF. Then
+ * place the buffer in the scatterlist and fixup the original
+ * entries by removing the entries now in the linear buffer
+ * and shifting the remaining entries. For now we do not try
+ * to copy partial entries to avoid complexity of running out
+ * of sg_entry slots. The downside is reading a single byte
+ * will copy the entire sg entry.
+ */
+ do {
+ copy += sk_msg_elem(msg, i)->length;
+ sk_msg_iter_var_next(i);
+ if (bytes_sg_total <= copy)
+ break;
+ } while (i != msg->sg.end);
+ last_sge = i;
+
+ if (unlikely(bytes_sg_total > copy))
+ return -EINVAL;
+
+ page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC | __GFP_COMP,
+ get_order(copy));
+ if (unlikely(!page))
+ return -ENOMEM;
+
+ raw = page_address(page);
+ i = first_sge;
+ do {
+ sge = sk_msg_elem(msg, i);
+ from = sg_virt(sge);
+ len = sge->length;
+ to = raw + poffset;
+
+ memcpy(to, from, len);
+ poffset += len;
+ sge->length = 0;
+ put_page(sg_page(sge));
+
+ sk_msg_iter_var_next(i);
+ } while (i != last_sge);
+
+ sg_set_page(&msg->sg.data[first_sge], page, copy, 0);
+
+ /* To repair sg ring we need to shift entries. If we only
+ * had a single entry though we can just replace it and
+ * be done. Otherwise walk the ring and shift the entries.
+ */
+ WARN_ON_ONCE(last_sge == first_sge);
+ shift = last_sge > first_sge ?
+ last_sge - first_sge - 1 :
+ NR_MSG_FRAG_IDS - first_sge + last_sge - 1;
+ if (!shift)
+ goto out;
+
+ i = first_sge;
+ sk_msg_iter_var_next(i);
+ do {
+ u32 move_from;
+
+ if (i + shift >= NR_MSG_FRAG_IDS)
+ move_from = i + shift - NR_MSG_FRAG_IDS;
+ else
+ move_from = i + shift;
+ if (move_from == msg->sg.end)
+ break;
+
+ msg->sg.data[i] = msg->sg.data[move_from];
+ msg->sg.data[move_from].length = 0;
+ msg->sg.data[move_from].page_link = 0;
+ msg->sg.data[move_from].offset = 0;
+ sk_msg_iter_var_next(i);
+ } while (1);
+
+ msg->sg.end = msg->sg.end - shift > msg->sg.end ?
+ msg->sg.end - shift + NR_MSG_FRAG_IDS :
+ msg->sg.end - shift;
+out:
+ sk_msg_reset_curr(msg);
+ msg->data = sg_virt(&msg->sg.data[first_sge]) + start - offset;
+ msg->data_end = msg->data + bytes;
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_msg_pull_data_proto = {
+ .func = bpf_msg_pull_data,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_ANYTHING,
+};
+
+BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
+ u32, len, u64, flags)
+{
+ struct scatterlist sge, nsge, nnsge, rsge = {0}, *psge;
+ u32 new, i = 0, l = 0, space, copy = 0, offset = 0;
+ u8 *raw, *to, *from;
+ struct page *page;
+
+ if (unlikely(flags))
+ return -EINVAL;
+
+ if (unlikely(len == 0))
+ return 0;
+
+ /* First find the starting scatterlist element */
+ i = msg->sg.start;
+ do {
+ offset += l;
+ l = sk_msg_elem(msg, i)->length;
+
+ if (start < offset + l)
+ break;
+ sk_msg_iter_var_next(i);
+ } while (i != msg->sg.end);
+
+ if (start > offset + l)
+ return -EINVAL;
+
+ space = MAX_MSG_FRAGS - sk_msg_elem_used(msg);
+
+ /* If no space available will fallback to copy, we need at
+ * least one scatterlist elem available to push data into
+ * when start aligns to the beginning of an element or two
+ * when it falls inside an element. We handle the start equals
+ * offset case because its the common case for inserting a
+ * header.
+ */
+ if (!space || (space == 1 && start != offset))
+ copy = msg->sg.data[i].length;
+
+ page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC | __GFP_COMP,
+ get_order(copy + len));
+ if (unlikely(!page))
+ return -ENOMEM;
+
+ if (copy) {
+ int front, back;
+
+ raw = page_address(page);
+
+ if (i == msg->sg.end)
+ sk_msg_iter_var_prev(i);
+ psge = sk_msg_elem(msg, i);
+ front = start - offset;
+ back = psge->length - front;
+ from = sg_virt(psge);
+
+ if (front)
+ memcpy(raw, from, front);
+
+ if (back) {
+ from += front;
+ to = raw + front + len;
+
+ memcpy(to, from, back);
+ }
+
+ put_page(sg_page(psge));
+ new = i;
+ goto place_new;
+ }
+
+ if (start - offset) {
+ if (i == msg->sg.end)
+ sk_msg_iter_var_prev(i);
+ psge = sk_msg_elem(msg, i);
+ rsge = sk_msg_elem_cpy(msg, i);
+
+ psge->length = start - offset;
+ rsge.length -= psge->length;
+ rsge.offset += start;
+
+ sk_msg_iter_var_next(i);
+ sg_unmark_end(psge);
+ sg_unmark_end(&rsge);
+ }
+
+ /* Slot(s) to place newly allocated data */
+ sk_msg_iter_next(msg, end);
+ new = i;
+ sk_msg_iter_var_next(i);
+
+ if (i == msg->sg.end) {
+ if (!rsge.length)
+ goto place_new;
+ sk_msg_iter_next(msg, end);
+ goto place_new;
+ }
+
+ /* Shift one or two slots as needed */
+ sge = sk_msg_elem_cpy(msg, new);
+ sg_unmark_end(&sge);
+
+ nsge = sk_msg_elem_cpy(msg, i);
+ if (rsge.length) {
+ sk_msg_iter_var_next(i);
+ nnsge = sk_msg_elem_cpy(msg, i);
+ sk_msg_iter_next(msg, end);
+ }
+
+ while (i != msg->sg.end) {
+ msg->sg.data[i] = sge;
+ sge = nsge;
+ sk_msg_iter_var_next(i);
+ if (rsge.length) {
+ nsge = nnsge;
+ nnsge = sk_msg_elem_cpy(msg, i);
+ } else {
+ nsge = sk_msg_elem_cpy(msg, i);
+ }
+ }
+
+place_new:
+ /* Place newly allocated data buffer */
+ sk_mem_charge(msg->sk, len);
+ msg->sg.size += len;
+ __clear_bit(new, msg->sg.copy);
+ sg_set_page(&msg->sg.data[new], page, len + copy, 0);
+ if (rsge.length) {
+ get_page(sg_page(&rsge));
+ sk_msg_iter_var_next(new);
+ msg->sg.data[new] = rsge;
+ }
+
+ sk_msg_reset_curr(msg);
+ sk_msg_compute_data_pointers(msg);
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_msg_push_data_proto = {
+ .func = bpf_msg_push_data,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_ANYTHING,
+};
+
+static void sk_msg_shift_left(struct sk_msg *msg, int i)
+{
+ struct scatterlist *sge = sk_msg_elem(msg, i);
+ int prev;
+
+ put_page(sg_page(sge));
+ do {
+ prev = i;
+ sk_msg_iter_var_next(i);
+ msg->sg.data[prev] = msg->sg.data[i];
+ } while (i != msg->sg.end);
+
+ sk_msg_iter_prev(msg, end);
+}
+
+static void sk_msg_shift_right(struct sk_msg *msg, int i)
+{
+ struct scatterlist tmp, sge;
+
+ sk_msg_iter_next(msg, end);
+ sge = sk_msg_elem_cpy(msg, i);
+ sk_msg_iter_var_next(i);
+ tmp = sk_msg_elem_cpy(msg, i);
+
+ while (i != msg->sg.end) {
+ msg->sg.data[i] = sge;
+ sk_msg_iter_var_next(i);
+ sge = tmp;
+ tmp = sk_msg_elem_cpy(msg, i);
+ }
+}
+
+BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start,
+ u32, len, u64, flags)
+{
+ u32 i = 0, l = 0, space, offset = 0;
+ u64 last = start + len;
+ int pop;
+
+ if (unlikely(flags))
+ return -EINVAL;
+
+ if (unlikely(len == 0))
+ return 0;
+
+ /* First find the starting scatterlist element */
+ i = msg->sg.start;
+ do {
+ offset += l;
+ l = sk_msg_elem(msg, i)->length;
+
+ if (start < offset + l)
+ break;
+ sk_msg_iter_var_next(i);
+ } while (i != msg->sg.end);
+
+ /* Bounds checks: start and pop must be inside message */
+ if (start >= offset + l || last > msg->sg.size)
+ return -EINVAL;
+
+ space = MAX_MSG_FRAGS - sk_msg_elem_used(msg);
+
+ pop = len;
+ /* --------------| offset
+ * -| start |-------- len -------|
+ *
+ * |----- a ----|-------- pop -------|----- b ----|
+ * |______________________________________________| length
+ *
+ *
+ * a: region at front of scatter element to save
+ * b: region at back of scatter element to save when length > A + pop
+ * pop: region to pop from element, same as input 'pop' here will be
+ * decremented below per iteration.
+ *
+ * Two top-level cases to handle when start != offset, first B is non
+ * zero and second B is zero corresponding to when a pop includes more
+ * than one element.
+ *
+ * Then if B is non-zero AND there is no space allocate space and
+ * compact A, B regions into page. If there is space shift ring to
+ * the right free'ing the next element in ring to place B, leaving
+ * A untouched except to reduce length.
+ */
+ if (start != offset) {
+ struct scatterlist *nsge, *sge = sk_msg_elem(msg, i);
+ int a = start - offset;
+ int b = sge->length - pop - a;
+
+ sk_msg_iter_var_next(i);
+
+ if (b > 0) {
+ if (space) {
+ sge->length = a;
+ sk_msg_shift_right(msg, i);
+ nsge = sk_msg_elem(msg, i);
+ get_page(sg_page(sge));
+ sg_set_page(nsge,
+ sg_page(sge),
+ b, sge->offset + pop + a);
+ } else {
+ struct page *page, *orig;
+ u8 *to, *from;
+
+ page = alloc_pages(__GFP_NOWARN |
+ __GFP_COMP | GFP_ATOMIC,
+ get_order(a + b));
+ if (unlikely(!page))
+ return -ENOMEM;
+
+ orig = sg_page(sge);
+ from = sg_virt(sge);
+ to = page_address(page);
+ memcpy(to, from, a);
+ memcpy(to + a, from + a + pop, b);
+ sg_set_page(sge, page, a + b, 0);
+ put_page(orig);
+ }
+ pop = 0;
+ } else {
+ pop -= (sge->length - a);
+ sge->length = a;
+ }
+ }
+
+ /* From above the current layout _must_ be as follows,
+ *
+ * -| offset
+ * -| start
+ *
+ * |---- pop ---|---------------- b ------------|
+ * |____________________________________________| length
+ *
+ * Offset and start of the current msg elem are equal because in the
+ * previous case we handled offset != start and either consumed the
+ * entire element and advanced to the next element OR pop == 0.
+ *
+ * Two cases to handle here are first pop is less than the length
+ * leaving some remainder b above. Simply adjust the element's layout
+ * in this case. Or pop >= length of the element so that b = 0. In this
+ * case advance to next element decrementing pop.
+ */
+ while (pop) {
+ struct scatterlist *sge = sk_msg_elem(msg, i);
+
+ if (pop < sge->length) {
+ sge->length -= pop;
+ sge->offset += pop;
+ pop = 0;
+ } else {
+ pop -= sge->length;
+ sk_msg_shift_left(msg, i);
+ }
+ }
+
+ sk_mem_uncharge(msg->sk, len - pop);
+ msg->sg.size -= (len - pop);
+ sk_msg_reset_curr(msg);
+ sk_msg_compute_data_pointers(msg);
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_msg_pop_data_proto = {
+ .func = bpf_msg_pop_data,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_ANYTHING,
+};
+
+#ifdef CONFIG_CGROUP_NET_CLASSID
+BPF_CALL_0(bpf_get_cgroup_classid_curr)
+{
+ return __task_get_classid(current);
+}
+
+const struct bpf_func_proto bpf_get_cgroup_classid_curr_proto = {
+ .func = bpf_get_cgroup_classid_curr,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+};
+
+BPF_CALL_1(bpf_skb_cgroup_classid, const struct sk_buff *, skb)
+{
+ struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk || !sk_fullsock(sk))
+ return 0;
+
+ return sock_cgroup_classid(&sk->sk_cgrp_data);
+}
+
+static const struct bpf_func_proto bpf_skb_cgroup_classid_proto = {
+ .func = bpf_skb_cgroup_classid,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+#endif
+
+BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb)
+{
+ return task_get_classid(skb);
+}
+
+static const struct bpf_func_proto bpf_get_cgroup_classid_proto = {
+ .func = bpf_get_cgroup_classid,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
+BPF_CALL_1(bpf_get_route_realm, const struct sk_buff *, skb)
+{
+ return dst_tclassid(skb);
+}
+
+static const struct bpf_func_proto bpf_get_route_realm_proto = {
+ .func = bpf_get_route_realm,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
+BPF_CALL_1(bpf_get_hash_recalc, struct sk_buff *, skb)
+{
+ /* If skb_clear_hash() was called due to mangling, we can
+ * trigger SW recalculation here. Later access to hash
+ * can then use the inline skb->hash via context directly
+ * instead of calling this helper again.
+ */
+ return skb_get_hash(skb);
+}
+
+static const struct bpf_func_proto bpf_get_hash_recalc_proto = {
+ .func = bpf_get_hash_recalc,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
+BPF_CALL_1(bpf_set_hash_invalid, struct sk_buff *, skb)
+{
+ /* After all direct packet write, this can be used once for
+ * triggering a lazy recalc on next skb_get_hash() invocation.
+ */
+ skb_clear_hash(skb);
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_set_hash_invalid_proto = {
+ .func = bpf_set_hash_invalid,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
+BPF_CALL_2(bpf_set_hash, struct sk_buff *, skb, u32, hash)
+{
+ /* Set user specified hash as L4(+), so that it gets returned
+ * on skb_get_hash() call unless BPF prog later on triggers a
+ * skb_clear_hash().
+ */
+ __skb_set_sw_hash(skb, hash, true);
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_set_hash_proto = {
+ .func = bpf_set_hash,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+};
+
+BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto,
+ u16, vlan_tci)
+{
+ int ret;
+
+ if (unlikely(vlan_proto != htons(ETH_P_8021Q) &&
+ vlan_proto != htons(ETH_P_8021AD)))
+ vlan_proto = htons(ETH_P_8021Q);
+
+ bpf_push_mac_rcsum(skb);
+ ret = skb_vlan_push(skb, vlan_proto, vlan_tci);
+ bpf_pull_mac_rcsum(skb);
+ skb_reset_mac_len(skb);
+
+ bpf_compute_data_pointers(skb);
+ return ret;
+}
+
+static const struct bpf_func_proto bpf_skb_vlan_push_proto = {
+ .func = bpf_skb_vlan_push,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+};
+
+BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb)
+{
+ int ret;
+
+ bpf_push_mac_rcsum(skb);
+ ret = skb_vlan_pop(skb);
+ bpf_pull_mac_rcsum(skb);
+
+ bpf_compute_data_pointers(skb);
+ return ret;
+}
+
+static const struct bpf_func_proto bpf_skb_vlan_pop_proto = {
+ .func = bpf_skb_vlan_pop,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
+static void bpf_skb_change_protocol(struct sk_buff *skb, u16 proto)
+{
+ skb->protocol = htons(proto);
+ if (skb_valid_dst(skb))
+ skb_dst_drop(skb);
+}
+
+static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len)
+{
+ /* Caller already did skb_cow() with meta_len+len as headroom,
+ * so no need to do it here.
+ */
+ skb_push(skb, len);
+ skb_postpush_data_move(skb, len, off);
+ memset(skb->data + off, 0, len);
+
+ /* No skb_postpush_rcsum(skb, skb->data + off, len)
+ * needed here as it does not change the skb->csum
+ * result for checksum complete when summing over
+ * zeroed blocks.
+ */
+ return 0;
+}
+
+static int bpf_skb_generic_pop(struct sk_buff *skb, u32 off, u32 len)
+{
+ void *old_data;
+
+ /* skb_ensure_writable() is not needed here, as we're
+ * already working on an uncloned skb.
+ */
+ if (unlikely(!pskb_may_pull(skb, off + len)))
+ return -ENOMEM;
+
+ old_data = skb->data;
+ __skb_pull(skb, len);
+ skb_postpull_rcsum(skb, old_data + off, len);
+ skb_postpull_data_move(skb, len, off);
+
+ return 0;
+}
+
+static int bpf_skb_net_hdr_push(struct sk_buff *skb, u32 off, u32 len)
+{
+ bool trans_same = skb->transport_header == skb->network_header;
+ int ret;
+
+ /* There's no need for __skb_push()/__skb_pull() pair to
+ * get to the start of the mac header as we're guaranteed
+ * to always start from here under eBPF.
+ */
+ ret = bpf_skb_generic_push(skb, off, len);
+ if (likely(!ret)) {
+ skb->mac_header -= len;
+ skb->network_header -= len;
+ if (trans_same)
+ skb->transport_header = skb->network_header;
+ }
+
+ return ret;
+}
+
+static int bpf_skb_net_hdr_pop(struct sk_buff *skb, u32 off, u32 len)
+{
+ bool trans_same = skb->transport_header == skb->network_header;
+ int ret;
+
+ /* Same here, __skb_push()/__skb_pull() pair not needed. */
+ ret = bpf_skb_generic_pop(skb, off, len);
+ if (likely(!ret)) {
+ skb->mac_header += len;
+ skb->network_header += len;
+ if (trans_same)
+ skb->transport_header = skb->network_header;
+ }
+
+ return ret;
+}
+
+static int bpf_skb_proto_4_to_6(struct sk_buff *skb)
+{
+ const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr);
+ const u8 meta_len = skb_metadata_len(skb);
+ u32 off = skb_mac_header_len(skb);
+ int ret;
+
+ ret = skb_cow(skb, meta_len + len_diff);
+ if (unlikely(ret < 0))
+ return ret;
+
+ ret = bpf_skb_net_hdr_push(skb, off, len_diff);
+ if (unlikely(ret < 0))
+ return ret;
+
+ if (skb_is_gso(skb)) {
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+
+ /* SKB_GSO_TCPV4 needs to be changed into SKB_GSO_TCPV6. */
+ if (shinfo->gso_type & SKB_GSO_TCPV4) {
+ shinfo->gso_type &= ~SKB_GSO_TCPV4;
+ shinfo->gso_type |= SKB_GSO_TCPV6;
+ }
+ }
+
+ bpf_skb_change_protocol(skb, ETH_P_IPV6);
+ skb_clear_hash(skb);
+
+ return 0;
+}
+
+static int bpf_skb_proto_6_to_4(struct sk_buff *skb)
+{
+ const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr);
+ u32 off = skb_mac_header_len(skb);
+ int ret;
+
+ ret = skb_unclone(skb, GFP_ATOMIC);
+ if (unlikely(ret < 0))
+ return ret;
+
+ ret = bpf_skb_net_hdr_pop(skb, off, len_diff);
+ if (unlikely(ret < 0))
+ return ret;
+
+ if (skb_is_gso(skb)) {
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+
+ /* SKB_GSO_TCPV6 needs to be changed into SKB_GSO_TCPV4. */
+ if (shinfo->gso_type & SKB_GSO_TCPV6) {
+ shinfo->gso_type &= ~SKB_GSO_TCPV6;
+ shinfo->gso_type |= SKB_GSO_TCPV4;
+ }
+ }
+
+ bpf_skb_change_protocol(skb, ETH_P_IP);
+ skb_clear_hash(skb);
+
+ return 0;
+}
+
+static int bpf_skb_proto_xlat(struct sk_buff *skb, __be16 to_proto)
+{
+ __be16 from_proto = skb->protocol;
+
+ if (from_proto == htons(ETH_P_IP) &&
+ to_proto == htons(ETH_P_IPV6))
+ return bpf_skb_proto_4_to_6(skb);
+
+ if (from_proto == htons(ETH_P_IPV6) &&
+ to_proto == htons(ETH_P_IP))
+ return bpf_skb_proto_6_to_4(skb);
+
+ return -ENOTSUPP;
+}
+
+BPF_CALL_3(bpf_skb_change_proto, struct sk_buff *, skb, __be16, proto,
+ u64, flags)
+{
+ int ret;
+
+ if (unlikely(flags))
+ return -EINVAL;
+
+ /* General idea is that this helper does the basic groundwork
+ * needed for changing the protocol, and eBPF program fills the
+ * rest through bpf_skb_store_bytes(), bpf_lX_csum_replace()
+ * and other helpers, rather than passing a raw buffer here.
+ *
+ * The rationale is to keep this minimal and without a need to
+ * deal with raw packet data. F.e. even if we would pass buffers
+ * here, the program still needs to call the bpf_lX_csum_replace()
+ * helpers anyway. Plus, this way we keep also separation of
+ * concerns, since f.e. bpf_skb_store_bytes() should only take
+ * care of stores.
+ *
+ * Currently, additional options and extension header space are
+ * not supported, but flags register is reserved so we can adapt
+ * that. For offloads, we mark packet as dodgy, so that headers
+ * need to be verified first.
+ */
+ ret = bpf_skb_proto_xlat(skb, proto);
+ bpf_compute_data_pointers(skb);
+ return ret;
+}
+
+static const struct bpf_func_proto bpf_skb_change_proto_proto = {
+ .func = bpf_skb_change_proto,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+};
+
+BPF_CALL_2(bpf_skb_change_type, struct sk_buff *, skb, u32, pkt_type)
+{
+ /* We only allow a restricted subset to be changed for now. */
+ if (unlikely(!skb_pkt_type_ok(skb->pkt_type) ||
+ !skb_pkt_type_ok(pkt_type)))
+ return -EINVAL;
+
+ skb->pkt_type = pkt_type;
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_skb_change_type_proto = {
+ .func = bpf_skb_change_type,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+};
+
+static u32 bpf_skb_net_base_len(const struct sk_buff *skb)
+{
+ switch (skb->protocol) {
+ case htons(ETH_P_IP):
+ return sizeof(struct iphdr);
+ case htons(ETH_P_IPV6):
+ return sizeof(struct ipv6hdr);
+ default:
+ return ~0U;
+ }
+}
+
+#define BPF_F_ADJ_ROOM_ENCAP_L3_MASK (BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 | \
+ BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
+
+#define BPF_F_ADJ_ROOM_DECAP_L3_MASK (BPF_F_ADJ_ROOM_DECAP_L3_IPV4 | \
+ BPF_F_ADJ_ROOM_DECAP_L3_IPV6)
+
+#define BPF_F_ADJ_ROOM_MASK (BPF_F_ADJ_ROOM_FIXED_GSO | \
+ BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \
+ BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \
+ BPF_F_ADJ_ROOM_ENCAP_L4_UDP | \
+ BPF_F_ADJ_ROOM_ENCAP_L2_ETH | \
+ BPF_F_ADJ_ROOM_ENCAP_L2( \
+ BPF_ADJ_ROOM_ENCAP_L2_MASK) | \
+ BPF_F_ADJ_ROOM_DECAP_L3_MASK)
+
+static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff,
+ u64 flags)
+{
+ u8 inner_mac_len = flags >> BPF_ADJ_ROOM_ENCAP_L2_SHIFT;
+ bool encap = flags & BPF_F_ADJ_ROOM_ENCAP_L3_MASK;
+ u16 mac_len = 0, inner_net = 0, inner_trans = 0;
+ const u8 meta_len = skb_metadata_len(skb);
+ unsigned int gso_type = SKB_GSO_DODGY;
+ int ret;
+
+ if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) {
+ /* udp gso_size delineates datagrams, only allow if fixed */
+ if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) ||
+ !(flags & BPF_F_ADJ_ROOM_FIXED_GSO))
+ return -ENOTSUPP;
+ }
+
+ ret = skb_cow_head(skb, meta_len + len_diff);
+ if (unlikely(ret < 0))
+ return ret;
+
+ if (encap) {
+ if (skb->protocol != htons(ETH_P_IP) &&
+ skb->protocol != htons(ETH_P_IPV6))
+ return -ENOTSUPP;
+
+ if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 &&
+ flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
+ return -EINVAL;
+
+ if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE &&
+ flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP)
+ return -EINVAL;
+
+ if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH &&
+ inner_mac_len < ETH_HLEN)
+ return -EINVAL;
+
+ if (skb->encapsulation)
+ return -EALREADY;
+
+ mac_len = skb->network_header - skb->mac_header;
+ inner_net = skb->network_header;
+ if (inner_mac_len > len_diff)
+ return -EINVAL;
+ inner_trans = skb->transport_header;
+ }
+
+ ret = bpf_skb_net_hdr_push(skb, off, len_diff);
+ if (unlikely(ret < 0))
+ return ret;
+
+ if (encap) {
+ skb->inner_mac_header = inner_net - inner_mac_len;
+ skb->inner_network_header = inner_net;
+ skb->inner_transport_header = inner_trans;
+
+ if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH)
+ skb_set_inner_protocol(skb, htons(ETH_P_TEB));
+ else
+ skb_set_inner_protocol(skb, skb->protocol);
+
+ skb->encapsulation = 1;
+ skb_set_network_header(skb, mac_len);
+
+ if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP)
+ gso_type |= SKB_GSO_UDP_TUNNEL;
+ else if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE)
+ gso_type |= SKB_GSO_GRE;
+ else if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
+ gso_type |= SKB_GSO_IPXIP6;
+ else if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4)
+ gso_type |= SKB_GSO_IPXIP4;
+
+ if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE ||
+ flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP) {
+ int nh_len = flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 ?
+ sizeof(struct ipv6hdr) :
+ sizeof(struct iphdr);
+
+ skb_set_transport_header(skb, mac_len + nh_len);
+ }
+
+ /* Match skb->protocol to new outer l3 protocol */
+ if (skb->protocol == htons(ETH_P_IP) &&
+ flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
+ bpf_skb_change_protocol(skb, ETH_P_IPV6);
+ else if (skb->protocol == htons(ETH_P_IPV6) &&
+ flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4)
+ bpf_skb_change_protocol(skb, ETH_P_IP);
+ }
+
+ if (skb_is_gso(skb)) {
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+
+ /* Header must be checked, and gso_segs recomputed. */
+ shinfo->gso_type |= gso_type;
+ shinfo->gso_segs = 0;
+
+ /* Due to header growth, MSS needs to be downgraded.
+ * There is a BUG_ON() when segmenting the frag_list with
+ * head_frag true, so linearize the skb after downgrading
+ * the MSS.
+ */
+ if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) {
+ skb_decrease_gso_size(shinfo, len_diff);
+ if (shinfo->frag_list)
+ return skb_linearize(skb);
+ }
+ }
+
+ return 0;
+}
+
+static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff,
+ u64 flags)
+{
+ int ret;
+
+ if (unlikely(flags & ~(BPF_F_ADJ_ROOM_FIXED_GSO |
+ BPF_F_ADJ_ROOM_DECAP_L3_MASK |
+ BPF_F_ADJ_ROOM_NO_CSUM_RESET)))
+ return -EINVAL;
+
+ if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) {
+ /* udp gso_size delineates datagrams, only allow if fixed */
+ if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) ||
+ !(flags & BPF_F_ADJ_ROOM_FIXED_GSO))
+ return -ENOTSUPP;
+ }
+
+ ret = skb_unclone(skb, GFP_ATOMIC);
+ if (unlikely(ret < 0))
+ return ret;
+
+ ret = bpf_skb_net_hdr_pop(skb, off, len_diff);
+ if (unlikely(ret < 0))
+ return ret;
+
+ /* Match skb->protocol to new outer l3 protocol */
+ if (skb->protocol == htons(ETH_P_IP) &&
+ flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV6)
+ bpf_skb_change_protocol(skb, ETH_P_IPV6);
+ else if (skb->protocol == htons(ETH_P_IPV6) &&
+ flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV4)
+ bpf_skb_change_protocol(skb, ETH_P_IP);
+
+ if (skb_is_gso(skb)) {
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+
+ /* Due to header shrink, MSS can be upgraded. */
+ if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO))
+ skb_increase_gso_size(shinfo, len_diff);
+
+ /* Header must be checked, and gso_segs recomputed. */
+ shinfo->gso_type |= SKB_GSO_DODGY;
+ shinfo->gso_segs = 0;
+ }
+
+ return 0;
+}
+
+#define BPF_SKB_MAX_LEN SKB_MAX_ALLOC
+
+BPF_CALL_4(sk_skb_adjust_room, struct sk_buff *, skb, s32, len_diff,
+ u32, mode, u64, flags)
+{
+ u32 len_diff_abs = abs(len_diff);
+ bool shrink = len_diff < 0;
+ int ret = 0;
+
+ if (unlikely(flags || mode))
+ return -EINVAL;
+ if (unlikely(len_diff_abs > 0xfffU))
return -EFAULT;
+
+ if (!shrink) {
+ ret = skb_cow(skb, len_diff);
+ if (unlikely(ret < 0))
+ return ret;
+ __skb_push(skb, len_diff_abs);
+ memset(skb->data, 0, len_diff_abs);
+ } else {
+ if (unlikely(!pskb_may_pull(skb, len_diff_abs)))
+ return -ENOMEM;
+ __skb_pull(skb, len_diff_abs);
}
+ if (tls_sw_has_ctx_rx(skb->sk)) {
+ struct strp_msg *rxm = strp_msg(skb);
- atomic_set(&fp->refcnt, 1);
- fp->len = fprog->len;
+ rxm->full_len += len_diff;
+ }
+ return ret;
+}
- err = sk_chk_filter(fp->insns, fp->len);
- if (!err) {
- struct sk_filter *old_fp;
+static const struct bpf_func_proto sk_skb_adjust_room_proto = {
+ .func = sk_skb_adjust_room,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_ANYTHING,
+};
- rcu_read_lock_bh();
- old_fp = rcu_dereference(sk->sk_filter);
- rcu_assign_pointer(sk->sk_filter, fp);
- rcu_read_unlock_bh();
- fp = old_fp;
+BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff,
+ u32, mode, u64, flags)
+{
+ u32 len_cur, len_diff_abs = abs(len_diff);
+ u32 len_min = bpf_skb_net_base_len(skb);
+ u32 len_max = BPF_SKB_MAX_LEN;
+ __be16 proto = skb->protocol;
+ bool shrink = len_diff < 0;
+ u32 off;
+ int ret;
+
+ if (unlikely(flags & ~(BPF_F_ADJ_ROOM_MASK |
+ BPF_F_ADJ_ROOM_NO_CSUM_RESET)))
+ return -EINVAL;
+ if (unlikely(len_diff_abs > 0xfffU))
+ return -EFAULT;
+ if (unlikely(proto != htons(ETH_P_IP) &&
+ proto != htons(ETH_P_IPV6)))
+ return -ENOTSUPP;
+
+ off = skb_mac_header_len(skb);
+ switch (mode) {
+ case BPF_ADJ_ROOM_NET:
+ off += bpf_skb_net_base_len(skb);
+ break;
+ case BPF_ADJ_ROOM_MAC:
+ break;
+ default:
+ return -ENOTSUPP;
+ }
+
+ if (flags & BPF_F_ADJ_ROOM_DECAP_L3_MASK) {
+ if (!shrink)
+ return -EINVAL;
+
+ switch (flags & BPF_F_ADJ_ROOM_DECAP_L3_MASK) {
+ case BPF_F_ADJ_ROOM_DECAP_L3_IPV4:
+ len_min = sizeof(struct iphdr);
+ break;
+ case BPF_F_ADJ_ROOM_DECAP_L3_IPV6:
+ len_min = sizeof(struct ipv6hdr);
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
+
+ len_cur = skb->len - skb_network_offset(skb);
+ if ((shrink && (len_diff_abs >= len_cur ||
+ len_cur - len_diff_abs < len_min)) ||
+ (!shrink && (skb->len + len_diff_abs > len_max &&
+ !skb_is_gso(skb))))
+ return -ENOTSUPP;
+
+ ret = shrink ? bpf_skb_net_shrink(skb, off, len_diff_abs, flags) :
+ bpf_skb_net_grow(skb, off, len_diff_abs, flags);
+ if (!ret && !(flags & BPF_F_ADJ_ROOM_NO_CSUM_RESET))
+ __skb_reset_checksum_unnecessary(skb);
+
+ bpf_compute_data_pointers(skb);
+ return ret;
+}
+
+static const struct bpf_func_proto bpf_skb_adjust_room_proto = {
+ .func = bpf_skb_adjust_room,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_ANYTHING,
+};
+
+static u32 __bpf_skb_min_len(const struct sk_buff *skb)
+{
+ int offset = skb_network_offset(skb);
+ u32 min_len = 0;
+
+ if (offset > 0)
+ min_len = offset;
+ if (skb_transport_header_was_set(skb)) {
+ offset = skb_transport_offset(skb);
+ if (offset > 0)
+ min_len = offset;
+ }
+ if (skb->ip_summed == CHECKSUM_PARTIAL) {
+ offset = skb_checksum_start_offset(skb) +
+ skb->csum_offset + sizeof(__sum16);
+ if (offset > 0)
+ min_len = offset;
+ }
+ return min_len;
+}
+
+static int bpf_skb_grow_rcsum(struct sk_buff *skb, unsigned int new_len)
+{
+ unsigned int old_len = skb->len;
+ int ret;
+
+ ret = __skb_grow_rcsum(skb, new_len);
+ if (!ret)
+ memset(skb->data + old_len, 0, new_len - old_len);
+ return ret;
+}
+
+static int bpf_skb_trim_rcsum(struct sk_buff *skb, unsigned int new_len)
+{
+ return __skb_trim_rcsum(skb, new_len);
+}
+
+static inline int __bpf_skb_change_tail(struct sk_buff *skb, u32 new_len,
+ u64 flags)
+{
+ u32 max_len = BPF_SKB_MAX_LEN;
+ u32 min_len = __bpf_skb_min_len(skb);
+ int ret;
+
+ if (unlikely(flags || new_len > max_len || new_len < min_len))
+ return -EINVAL;
+ if (skb->encapsulation)
+ return -ENOTSUPP;
+
+ /* The basic idea of this helper is that it's performing the
+ * needed work to either grow or trim an skb, and eBPF program
+ * rewrites the rest via helpers like bpf_skb_store_bytes(),
+ * bpf_lX_csum_replace() and others rather than passing a raw
+ * buffer here. This one is a slow path helper and intended
+ * for replies with control messages.
+ *
+ * Like in bpf_skb_change_proto(), we want to keep this rather
+ * minimal and without protocol specifics so that we are able
+ * to separate concerns as in bpf_skb_store_bytes() should only
+ * be the one responsible for writing buffers.
+ *
+ * It's really expected to be a slow path operation here for
+ * control message replies, so we're implicitly linearizing,
+ * uncloning and drop offloads from the skb by this.
+ */
+ ret = __bpf_try_make_writable(skb, skb->len);
+ if (!ret) {
+ if (new_len > skb->len)
+ ret = bpf_skb_grow_rcsum(skb, new_len);
+ else if (new_len < skb->len)
+ ret = bpf_skb_trim_rcsum(skb, new_len);
+ if (!ret && skb_is_gso(skb))
+ skb_gso_reset(skb);
+ }
+ return ret;
+}
+
+BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len,
+ u64, flags)
+{
+ int ret = __bpf_skb_change_tail(skb, new_len, flags);
+
+ bpf_compute_data_pointers(skb);
+ return ret;
+}
+
+static const struct bpf_func_proto bpf_skb_change_tail_proto = {
+ .func = bpf_skb_change_tail,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+};
+
+BPF_CALL_3(sk_skb_change_tail, struct sk_buff *, skb, u32, new_len,
+ u64, flags)
+{
+ return __bpf_skb_change_tail(skb, new_len, flags);
+}
+
+static const struct bpf_func_proto sk_skb_change_tail_proto = {
+ .func = sk_skb_change_tail,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+};
+
+static inline int __bpf_skb_change_head(struct sk_buff *skb, u32 head_room,
+ u64 flags)
+{
+ const u8 meta_len = skb_metadata_len(skb);
+ u32 max_len = BPF_SKB_MAX_LEN;
+ u32 new_len = skb->len + head_room;
+ int ret;
+
+ if (unlikely(flags || (int)head_room < 0 ||
+ (!skb_is_gso(skb) && new_len > max_len) ||
+ new_len < skb->len))
+ return -EINVAL;
+
+ ret = skb_cow(skb, meta_len + head_room);
+ if (likely(!ret)) {
+ /* Idea for this helper is that we currently only
+ * allow to expand on mac header. This means that
+ * skb->protocol network header, etc, stay as is.
+ * Compared to bpf_skb_change_tail(), we're more
+ * flexible due to not needing to linearize or
+ * reset GSO. Intention for this helper is to be
+ * used by an L3 skb that needs to push mac header
+ * for redirection into L2 device.
+ */
+ __skb_push(skb, head_room);
+ skb_postpush_data_move(skb, head_room, 0);
+ memset(skb->data, 0, head_room);
+ skb_reset_mac_header(skb);
+ skb_reset_mac_len(skb);
+ }
+
+ return ret;
+}
+
+BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room,
+ u64, flags)
+{
+ int ret = __bpf_skb_change_head(skb, head_room, flags);
+
+ bpf_compute_data_pointers(skb);
+ return ret;
+}
+
+static const struct bpf_func_proto bpf_skb_change_head_proto = {
+ .func = bpf_skb_change_head,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+};
+
+BPF_CALL_3(sk_skb_change_head, struct sk_buff *, skb, u32, head_room,
+ u64, flags)
+{
+ return __bpf_skb_change_head(skb, head_room, flags);
+}
+
+static const struct bpf_func_proto sk_skb_change_head_proto = {
+ .func = sk_skb_change_head,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+};
+
+BPF_CALL_1(bpf_xdp_get_buff_len, struct xdp_buff*, xdp)
+{
+ return xdp_get_buff_len(xdp);
+}
+
+static const struct bpf_func_proto bpf_xdp_get_buff_len_proto = {
+ .func = bpf_xdp_get_buff_len,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
+BTF_ID_LIST_SINGLE(bpf_xdp_get_buff_len_bpf_ids, struct, xdp_buff)
+
+const struct bpf_func_proto bpf_xdp_get_buff_len_trace_proto = {
+ .func = bpf_xdp_get_buff_len,
+ .gpl_only = false,
+ .arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg1_btf_id = &bpf_xdp_get_buff_len_bpf_ids[0],
+};
+
+static unsigned long xdp_get_metalen(const struct xdp_buff *xdp)
+{
+ return xdp_data_meta_unsupported(xdp) ? 0 :
+ xdp->data - xdp->data_meta;
+}
+
+BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset)
+{
+ void *xdp_frame_end = xdp->data_hard_start + sizeof(struct xdp_frame);
+ unsigned long metalen = xdp_get_metalen(xdp);
+ void *data_start = xdp_frame_end + metalen;
+ void *data = xdp->data + offset;
+
+ if (unlikely(data < data_start ||
+ data > xdp->data_end - ETH_HLEN))
+ return -EINVAL;
+
+ if (metalen)
+ memmove(xdp->data_meta + offset,
+ xdp->data_meta, metalen);
+ xdp->data_meta += offset;
+ xdp->data = data;
+
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_xdp_adjust_head_proto = {
+ .func = bpf_xdp_adjust_head,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+};
+
+void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off,
+ void *buf, unsigned long len, bool flush)
+{
+ unsigned long ptr_len, ptr_off = 0;
+ skb_frag_t *next_frag, *end_frag;
+ struct skb_shared_info *sinfo;
+ void *src, *dst;
+ u8 *ptr_buf;
+
+ if (likely(xdp->data_end - xdp->data >= off + len)) {
+ src = flush ? buf : xdp->data + off;
+ dst = flush ? xdp->data + off : buf;
+ memcpy(dst, src, len);
+ return;
+ }
+
+ sinfo = xdp_get_shared_info_from_buff(xdp);
+ end_frag = &sinfo->frags[sinfo->nr_frags];
+ next_frag = &sinfo->frags[0];
+
+ ptr_len = xdp->data_end - xdp->data;
+ ptr_buf = xdp->data;
+
+ while (true) {
+ if (off < ptr_off + ptr_len) {
+ unsigned long copy_off = off - ptr_off;
+ unsigned long copy_len = min(len, ptr_len - copy_off);
+
+ src = flush ? buf : ptr_buf + copy_off;
+ dst = flush ? ptr_buf + copy_off : buf;
+ memcpy(dst, src, copy_len);
+
+ off += copy_len;
+ len -= copy_len;
+ buf += copy_len;
+ }
+
+ if (!len || next_frag == end_frag)
+ break;
+
+ ptr_off += ptr_len;
+ ptr_buf = skb_frag_address(next_frag);
+ ptr_len = skb_frag_size(next_frag);
+ next_frag++;
+ }
+}
+
+void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len)
+{
+ u32 size = xdp->data_end - xdp->data;
+ struct skb_shared_info *sinfo;
+ void *addr = xdp->data;
+ int i;
+
+ if (unlikely(offset > 0xffff || len > 0xffff))
+ return ERR_PTR(-EFAULT);
+
+ if (unlikely(offset + len > xdp_get_buff_len(xdp)))
+ return ERR_PTR(-EINVAL);
+
+ if (likely(offset < size)) /* linear area */
+ goto out;
+
+ sinfo = xdp_get_shared_info_from_buff(xdp);
+ offset -= size;
+ for (i = 0; i < sinfo->nr_frags; i++) { /* paged area */
+ u32 frag_size = skb_frag_size(&sinfo->frags[i]);
+
+ if (offset < frag_size) {
+ addr = skb_frag_address(&sinfo->frags[i]);
+ size = frag_size;
+ break;
+ }
+ offset -= frag_size;
+ }
+out:
+ return offset + len <= size ? addr + offset : NULL;
+}
+
+BPF_CALL_4(bpf_xdp_load_bytes, struct xdp_buff *, xdp, u32, offset,
+ void *, buf, u32, len)
+{
+ void *ptr;
+
+ ptr = bpf_xdp_pointer(xdp, offset, len);
+ if (IS_ERR(ptr))
+ return PTR_ERR(ptr);
+
+ if (!ptr)
+ bpf_xdp_copy_buf(xdp, offset, buf, len, false);
+ else
+ memcpy(buf, ptr, len);
+
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_xdp_load_bytes_proto = {
+ .func = bpf_xdp_load_bytes,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg4_type = ARG_CONST_SIZE,
+};
+
+int __bpf_xdp_load_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len)
+{
+ return ____bpf_xdp_load_bytes(xdp, offset, buf, len);
+}
+
+BPF_CALL_4(bpf_xdp_store_bytes, struct xdp_buff *, xdp, u32, offset,
+ void *, buf, u32, len)
+{
+ void *ptr;
+
+ ptr = bpf_xdp_pointer(xdp, offset, len);
+ if (IS_ERR(ptr))
+ return PTR_ERR(ptr);
+
+ if (!ptr)
+ bpf_xdp_copy_buf(xdp, offset, buf, len, true);
+ else
+ memcpy(ptr, buf, len);
+
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_xdp_store_bytes_proto = {
+ .func = bpf_xdp_store_bytes,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg4_type = ARG_CONST_SIZE,
+};
+
+int __bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len)
+{
+ return ____bpf_xdp_store_bytes(xdp, offset, buf, len);
+}
+
+static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset)
+{
+ struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
+ skb_frag_t *frag = &sinfo->frags[sinfo->nr_frags - 1];
+ struct xdp_rxq_info *rxq = xdp->rxq;
+ unsigned int tailroom;
+
+ if (!rxq->frag_size || rxq->frag_size > xdp->frame_sz)
+ return -EOPNOTSUPP;
+
+ tailroom = rxq->frag_size - skb_frag_size(frag) - skb_frag_off(frag);
+ if (unlikely(offset > tailroom))
+ return -EINVAL;
+
+ memset(skb_frag_address(frag) + skb_frag_size(frag), 0, offset);
+ skb_frag_size_add(frag, offset);
+ sinfo->xdp_frags_size += offset;
+ if (rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL)
+ xsk_buff_get_tail(xdp)->data_end += offset;
+
+ return 0;
+}
+
+static struct xdp_buff *bpf_xdp_shrink_data_zc(struct xdp_buff *xdp, int shrink,
+ bool tail, bool release)
+{
+ struct xdp_buff *zc_frag = tail ? xsk_buff_get_tail(xdp) :
+ xsk_buff_get_head(xdp);
+
+ if (release) {
+ xsk_buff_del_frag(zc_frag);
+ } else {
+ if (tail)
+ zc_frag->data_end -= shrink;
+ else
+ zc_frag->data += shrink;
+ }
+
+ return zc_frag;
+}
+
+static bool bpf_xdp_shrink_data(struct xdp_buff *xdp, skb_frag_t *frag,
+ int shrink, bool tail)
+{
+ enum xdp_mem_type mem_type = xdp->rxq->mem.type;
+ bool release = skb_frag_size(frag) == shrink;
+ netmem_ref netmem = skb_frag_netmem(frag);
+ struct xdp_buff *zc_frag = NULL;
+
+ if (mem_type == MEM_TYPE_XSK_BUFF_POOL) {
+ netmem = 0;
+ zc_frag = bpf_xdp_shrink_data_zc(xdp, shrink, tail, release);
+ }
+
+ if (release) {
+ __xdp_return(netmem, mem_type, false, zc_frag);
+ } else {
+ if (!tail)
+ skb_frag_off_add(frag, shrink);
+ skb_frag_size_sub(frag, shrink);
+ }
+
+ return release;
+}
+
+static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset)
+{
+ struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
+ int i, n_frags_free = 0, len_free = 0;
+
+ if (unlikely(offset > (int)xdp_get_buff_len(xdp) - ETH_HLEN))
+ return -EINVAL;
+
+ for (i = sinfo->nr_frags - 1; i >= 0 && offset > 0; i--) {
+ skb_frag_t *frag = &sinfo->frags[i];
+ int shrink = min_t(int, offset, skb_frag_size(frag));
+
+ len_free += shrink;
+ offset -= shrink;
+ if (bpf_xdp_shrink_data(xdp, frag, shrink, true))
+ n_frags_free++;
+ }
+ sinfo->nr_frags -= n_frags_free;
+ sinfo->xdp_frags_size -= len_free;
+
+ if (unlikely(!sinfo->nr_frags)) {
+ xdp_buff_clear_frags_flag(xdp);
+ xdp_buff_clear_frag_pfmemalloc(xdp);
+ xdp->data_end -= offset;
+ }
+
+ return 0;
+}
+
+BPF_CALL_2(bpf_xdp_adjust_tail, struct xdp_buff *, xdp, int, offset)
+{
+ void *data_hard_end = xdp_data_hard_end(xdp); /* use xdp->frame_sz */
+ void *data_end = xdp->data_end + offset;
+
+ if (unlikely(xdp_buff_has_frags(xdp))) { /* non-linear xdp buff */
+ if (offset < 0)
+ return bpf_xdp_frags_shrink_tail(xdp, -offset);
+
+ return bpf_xdp_frags_increase_tail(xdp, offset);
+ }
+
+ /* Notice that xdp_data_hard_end have reserved some tailroom */
+ if (unlikely(data_end > data_hard_end))
+ return -EINVAL;
+
+ if (unlikely(data_end < xdp->data + ETH_HLEN))
+ return -EINVAL;
+
+ /* Clear memory area on grow, can contain uninit kernel memory */
+ if (offset > 0)
+ memset(xdp->data_end, 0, offset);
+
+ xdp->data_end = data_end;
+
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_xdp_adjust_tail_proto = {
+ .func = bpf_xdp_adjust_tail,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+};
+
+BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset)
+{
+ void *xdp_frame_end = xdp->data_hard_start + sizeof(struct xdp_frame);
+ void *meta = xdp->data_meta + offset;
+ unsigned long metalen = xdp->data - meta;
+
+ if (xdp_data_meta_unsupported(xdp))
+ return -ENOTSUPP;
+ if (unlikely(meta < xdp_frame_end ||
+ meta > xdp->data))
+ return -EINVAL;
+ if (unlikely(xdp_metalen_invalid(metalen)))
+ return -EACCES;
+
+ xdp->data_meta = meta;
+
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = {
+ .func = bpf_xdp_adjust_meta,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+};
+
+/**
+ * DOC: xdp redirect
+ *
+ * XDP_REDIRECT works by a three-step process, implemented in the functions
+ * below:
+ *
+ * 1. The bpf_redirect() and bpf_redirect_map() helpers will lookup the target
+ * of the redirect and store it (along with some other metadata) in a per-CPU
+ * struct bpf_redirect_info.
+ *
+ * 2. When the program returns the XDP_REDIRECT return code, the driver will
+ * call xdp_do_redirect() which will use the information in struct
+ * bpf_redirect_info to actually enqueue the frame into a map type-specific
+ * bulk queue structure.
+ *
+ * 3. Before exiting its NAPI poll loop, the driver will call
+ * xdp_do_flush(), which will flush all the different bulk queues,
+ * thus completing the redirect. Note that xdp_do_flush() must be
+ * called before napi_complete_done() in the driver, as the
+ * XDP_REDIRECT logic relies on being inside a single NAPI instance
+ * through to the xdp_do_flush() call for RCU protection of all
+ * in-kernel data structures.
+ */
+/*
+ * Pointers to the map entries will be kept around for this whole sequence of
+ * steps, protected by RCU. However, there is no top-level rcu_read_lock() in
+ * the core code; instead, the RCU protection relies on everything happening
+ * inside a single NAPI poll sequence, which means it's between a pair of calls
+ * to local_bh_disable()/local_bh_enable().
+ *
+ * The map entries are marked as __rcu and the map code makes sure to
+ * dereference those pointers with rcu_dereference_check() in a way that works
+ * for both sections that to hold an rcu_read_lock() and sections that are
+ * called from NAPI without a separate rcu_read_lock(). The code below does not
+ * use RCU annotations, but relies on those in the map code.
+ */
+void xdp_do_flush(void)
+{
+ struct list_head *lh_map, *lh_dev, *lh_xsk;
+
+ bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, &lh_xsk);
+ if (lh_dev)
+ __dev_flush(lh_dev);
+ if (lh_map)
+ __cpu_map_flush(lh_map);
+ if (lh_xsk)
+ __xsk_map_flush(lh_xsk);
+}
+EXPORT_SYMBOL_GPL(xdp_do_flush);
+
+#if defined(CONFIG_DEBUG_NET) && defined(CONFIG_BPF_SYSCALL)
+void xdp_do_check_flushed(struct napi_struct *napi)
+{
+ struct list_head *lh_map, *lh_dev, *lh_xsk;
+ bool missed = false;
+
+ bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, &lh_xsk);
+ if (lh_dev) {
+ __dev_flush(lh_dev);
+ missed = true;
+ }
+ if (lh_map) {
+ __cpu_map_flush(lh_map);
+ missed = true;
+ }
+ if (lh_xsk) {
+ __xsk_map_flush(lh_xsk);
+ missed = true;
+ }
+
+ WARN_ONCE(missed, "Missing xdp_do_flush() invocation after NAPI by %ps\n",
+ napi->poll);
+}
+#endif
+
+DEFINE_STATIC_KEY_FALSE(bpf_master_redirect_enabled_key);
+EXPORT_SYMBOL_GPL(bpf_master_redirect_enabled_key);
+
+u32 xdp_master_redirect(struct xdp_buff *xdp)
+{
+ struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
+ struct net_device *master, *slave;
+
+ master = netdev_master_upper_dev_get_rcu(xdp->rxq->dev);
+ slave = master->netdev_ops->ndo_xdp_get_xmit_slave(master, xdp);
+ if (slave && slave != xdp->rxq->dev) {
+ /* The target device is different from the receiving device, so
+ * redirect it to the new device.
+ * Using XDP_REDIRECT gets the correct behaviour from XDP enabled
+ * drivers to unmap the packet from their rx ring.
+ */
+ ri->tgt_index = slave->ifindex;
+ ri->map_id = INT_MAX;
+ ri->map_type = BPF_MAP_TYPE_UNSPEC;
+ return XDP_REDIRECT;
+ }
+ return XDP_TX;
+}
+EXPORT_SYMBOL_GPL(xdp_master_redirect);
+
+static inline int __xdp_do_redirect_xsk(struct bpf_redirect_info *ri,
+ const struct net_device *dev,
+ struct xdp_buff *xdp,
+ const struct bpf_prog *xdp_prog)
+{
+ enum bpf_map_type map_type = ri->map_type;
+ void *fwd = ri->tgt_value;
+ u32 map_id = ri->map_id;
+ int err;
+
+ ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */
+ ri->map_type = BPF_MAP_TYPE_UNSPEC;
+
+ err = __xsk_map_redirect(fwd, xdp);
+ if (unlikely(err))
+ goto err;
+
+ _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index);
+ return 0;
+err:
+ _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err);
+ return err;
+}
+
+static __always_inline int
+__xdp_do_redirect_frame(struct bpf_redirect_info *ri, struct net_device *dev,
+ struct xdp_frame *xdpf,
+ const struct bpf_prog *xdp_prog)
+{
+ enum bpf_map_type map_type = ri->map_type;
+ void *fwd = ri->tgt_value;
+ u32 map_id = ri->map_id;
+ u32 flags = ri->flags;
+ struct bpf_map *map;
+ int err;
+
+ ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */
+ ri->flags = 0;
+ ri->map_type = BPF_MAP_TYPE_UNSPEC;
+
+ if (unlikely(!xdpf)) {
+ err = -EOVERFLOW;
+ goto err;
+ }
+
+ switch (map_type) {
+ case BPF_MAP_TYPE_DEVMAP:
+ fallthrough;
+ case BPF_MAP_TYPE_DEVMAP_HASH:
+ if (unlikely(flags & BPF_F_BROADCAST)) {
+ map = READ_ONCE(ri->map);
+
+ /* The map pointer is cleared when the map is being torn
+ * down by dev_map_free()
+ */
+ if (unlikely(!map)) {
+ err = -ENOENT;
+ break;
+ }
+
+ WRITE_ONCE(ri->map, NULL);
+ err = dev_map_enqueue_multi(xdpf, dev, map,
+ flags & BPF_F_EXCLUDE_INGRESS);
+ } else {
+ err = dev_map_enqueue(fwd, xdpf, dev);
+ }
+ break;
+ case BPF_MAP_TYPE_CPUMAP:
+ err = cpu_map_enqueue(fwd, xdpf, dev);
+ break;
+ case BPF_MAP_TYPE_UNSPEC:
+ if (map_id == INT_MAX) {
+ fwd = dev_get_by_index_rcu(dev_net(dev), ri->tgt_index);
+ if (unlikely(!fwd)) {
+ err = -EINVAL;
+ break;
+ }
+ err = dev_xdp_enqueue(fwd, xdpf, dev);
+ break;
+ }
+ fallthrough;
+ default:
+ err = -EBADRQC;
+ }
+
+ if (unlikely(err))
+ goto err;
+
+ _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index);
+ return 0;
+err:
+ _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err);
+ return err;
+}
+
+int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
+ const struct bpf_prog *xdp_prog)
+{
+ struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
+ enum bpf_map_type map_type = ri->map_type;
+
+ if (map_type == BPF_MAP_TYPE_XSKMAP)
+ return __xdp_do_redirect_xsk(ri, dev, xdp, xdp_prog);
+
+ return __xdp_do_redirect_frame(ri, dev, xdp_convert_buff_to_frame(xdp),
+ xdp_prog);
+}
+EXPORT_SYMBOL_GPL(xdp_do_redirect);
+
+int xdp_do_redirect_frame(struct net_device *dev, struct xdp_buff *xdp,
+ struct xdp_frame *xdpf,
+ const struct bpf_prog *xdp_prog)
+{
+ struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
+ enum bpf_map_type map_type = ri->map_type;
+
+ if (map_type == BPF_MAP_TYPE_XSKMAP)
+ return __xdp_do_redirect_xsk(ri, dev, xdp, xdp_prog);
+
+ return __xdp_do_redirect_frame(ri, dev, xdpf, xdp_prog);
+}
+EXPORT_SYMBOL_GPL(xdp_do_redirect_frame);
+
+static int xdp_do_generic_redirect_map(struct net_device *dev,
+ struct sk_buff *skb,
+ struct xdp_buff *xdp,
+ const struct bpf_prog *xdp_prog,
+ void *fwd, enum bpf_map_type map_type,
+ u32 map_id, u32 flags)
+{
+ struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
+ struct bpf_map *map;
+ int err;
+
+ switch (map_type) {
+ case BPF_MAP_TYPE_DEVMAP:
+ fallthrough;
+ case BPF_MAP_TYPE_DEVMAP_HASH:
+ if (unlikely(flags & BPF_F_BROADCAST)) {
+ map = READ_ONCE(ri->map);
+
+ /* The map pointer is cleared when the map is being torn
+ * down by dev_map_free()
+ */
+ if (unlikely(!map)) {
+ err = -ENOENT;
+ break;
+ }
+
+ WRITE_ONCE(ri->map, NULL);
+ err = dev_map_redirect_multi(dev, skb, xdp_prog, map,
+ flags & BPF_F_EXCLUDE_INGRESS);
+ } else {
+ err = dev_map_generic_redirect(fwd, skb, xdp_prog);
+ }
+ if (unlikely(err))
+ goto err;
+ break;
+ case BPF_MAP_TYPE_XSKMAP:
+ err = xsk_generic_rcv(fwd, xdp);
+ if (err)
+ goto err;
+ consume_skb(skb);
+ break;
+ case BPF_MAP_TYPE_CPUMAP:
+ err = cpu_map_generic_redirect(fwd, skb);
+ if (unlikely(err))
+ goto err;
+ break;
+ default:
+ err = -EBADRQC;
+ goto err;
+ }
+
+ _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index);
+ return 0;
+err:
+ _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err);
+ return err;
+}
+
+int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
+ struct xdp_buff *xdp,
+ const struct bpf_prog *xdp_prog)
+{
+ struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
+ enum bpf_map_type map_type = ri->map_type;
+ void *fwd = ri->tgt_value;
+ u32 map_id = ri->map_id;
+ u32 flags = ri->flags;
+ int err;
+
+ ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */
+ ri->flags = 0;
+ ri->map_type = BPF_MAP_TYPE_UNSPEC;
+
+ if (map_type == BPF_MAP_TYPE_UNSPEC && map_id == INT_MAX) {
+ fwd = dev_get_by_index_rcu(dev_net(dev), ri->tgt_index);
+ if (unlikely(!fwd)) {
+ err = -EINVAL;
+ goto err;
+ }
+
+ err = xdp_ok_fwd_dev(fwd, skb->len);
+ if (unlikely(err))
+ goto err;
+
+ skb->dev = fwd;
+ _trace_xdp_redirect(dev, xdp_prog, ri->tgt_index);
+ generic_xdp_tx(skb, xdp_prog);
+ return 0;
+ }
+
+ return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog, fwd, map_type, map_id, flags);
+err:
+ _trace_xdp_redirect_err(dev, xdp_prog, ri->tgt_index, err);
+ return err;
+}
+
+BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags)
+{
+ struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
+
+ if (unlikely(flags))
+ return XDP_ABORTED;
+
+ /* NB! Map type UNSPEC and map_id == INT_MAX (never generated
+ * by map_idr) is used for ifindex based XDP redirect.
+ */
+ ri->tgt_index = ifindex;
+ ri->map_id = INT_MAX;
+ ri->map_type = BPF_MAP_TYPE_UNSPEC;
+
+ return XDP_REDIRECT;
+}
+
+static const struct bpf_func_proto bpf_xdp_redirect_proto = {
+ .func = bpf_xdp_redirect,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_ANYTHING,
+ .arg2_type = ARG_ANYTHING,
+};
+
+BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u64, key,
+ u64, flags)
+{
+ return map->ops->map_redirect(map, key, flags);
+}
+
+static const struct bpf_func_proto bpf_xdp_redirect_map_proto = {
+ .func = bpf_xdp_redirect_map,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+};
+
+static unsigned long bpf_skb_copy(void *dst_buff, const void *skb,
+ unsigned long off, unsigned long len)
+{
+ void *ptr = skb_header_pointer(skb, off, len, dst_buff);
+
+ if (unlikely(!ptr))
+ return len;
+ if (ptr != dst_buff)
+ memcpy(dst_buff, ptr, len);
+
+ return 0;
+}
+
+BPF_CALL_5(bpf_skb_event_output, struct sk_buff *, skb, struct bpf_map *, map,
+ u64, flags, void *, meta, u64, meta_size)
+{
+ u64 skb_size = (flags & BPF_F_CTXLEN_MASK) >> 32;
+
+ if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK)))
+ return -EINVAL;
+ if (unlikely(!skb || skb_size > skb->len))
+ return -EFAULT;
+
+ return bpf_event_output(map, flags, meta, meta_size, skb, skb_size,
+ bpf_skb_copy);
+}
+
+static const struct bpf_func_proto bpf_skb_event_output_proto = {
+ .func = bpf_skb_event_output,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg5_type = ARG_CONST_SIZE_OR_ZERO,
+};
+
+BTF_ID_LIST_SINGLE(bpf_skb_output_btf_ids, struct, sk_buff)
+
+const struct bpf_func_proto bpf_skb_output_proto = {
+ .func = bpf_skb_event_output,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg1_btf_id = &bpf_skb_output_btf_ids[0],
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg5_type = ARG_CONST_SIZE_OR_ZERO,
+};
+
+static unsigned short bpf_tunnel_key_af(u64 flags)
+{
+ return flags & BPF_F_TUNINFO_IPV6 ? AF_INET6 : AF_INET;
+}
+
+BPF_CALL_4(bpf_skb_get_tunnel_key, struct sk_buff *, skb, struct bpf_tunnel_key *, to,
+ u32, size, u64, flags)
+{
+ const struct ip_tunnel_info *info = skb_tunnel_info(skb);
+ u8 compat[sizeof(struct bpf_tunnel_key)];
+ void *to_orig = to;
+ int err;
+
+ if (unlikely(!info || (flags & ~(BPF_F_TUNINFO_IPV6 |
+ BPF_F_TUNINFO_FLAGS)))) {
+ err = -EINVAL;
+ goto err_clear;
+ }
+ if (ip_tunnel_info_af(info) != bpf_tunnel_key_af(flags)) {
+ err = -EPROTO;
+ goto err_clear;
+ }
+ if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
+ err = -EINVAL;
+ switch (size) {
+ case offsetof(struct bpf_tunnel_key, local_ipv6[0]):
+ case offsetof(struct bpf_tunnel_key, tunnel_label):
+ case offsetof(struct bpf_tunnel_key, tunnel_ext):
+ goto set_compat;
+ case offsetof(struct bpf_tunnel_key, remote_ipv6[1]):
+ /* Fixup deprecated structure layouts here, so we have
+ * a common path later on.
+ */
+ if (ip_tunnel_info_af(info) != AF_INET)
+ goto err_clear;
+set_compat:
+ to = (struct bpf_tunnel_key *)compat;
+ break;
+ default:
+ goto err_clear;
+ }
+ }
+
+ to->tunnel_id = be64_to_cpu(info->key.tun_id);
+ to->tunnel_tos = info->key.tos;
+ to->tunnel_ttl = info->key.ttl;
+ if (flags & BPF_F_TUNINFO_FLAGS)
+ to->tunnel_flags = ip_tunnel_flags_to_be16(info->key.tun_flags);
+ else
+ to->tunnel_ext = 0;
+
+ if (flags & BPF_F_TUNINFO_IPV6) {
+ memcpy(to->remote_ipv6, &info->key.u.ipv6.src,
+ sizeof(to->remote_ipv6));
+ memcpy(to->local_ipv6, &info->key.u.ipv6.dst,
+ sizeof(to->local_ipv6));
+ to->tunnel_label = be32_to_cpu(info->key.label);
+ } else {
+ to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src);
+ memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3);
+ to->local_ipv4 = be32_to_cpu(info->key.u.ipv4.dst);
+ memset(&to->local_ipv6[1], 0, sizeof(__u32) * 3);
+ to->tunnel_label = 0;
+ }
+
+ if (unlikely(size != sizeof(struct bpf_tunnel_key)))
+ memcpy(to_orig, to, size);
+
+ return 0;
+err_clear:
+ memset(to_orig, 0, size);
+ return err;
+}
+
+static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = {
+ .func = bpf_skb_get_tunnel_key,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg3_type = ARG_CONST_SIZE,
+ .arg4_type = ARG_ANYTHING,
+};
+
+BPF_CALL_3(bpf_skb_get_tunnel_opt, struct sk_buff *, skb, u8 *, to, u32, size)
+{
+ const struct ip_tunnel_info *info = skb_tunnel_info(skb);
+ int err;
+
+ if (unlikely(!info ||
+ !ip_tunnel_is_options_present(info->key.tun_flags))) {
+ err = -ENOENT;
+ goto err_clear;
+ }
+ if (unlikely(size < info->options_len)) {
+ err = -ENOMEM;
+ goto err_clear;
+ }
+
+ ip_tunnel_info_opts_get(to, info);
+ if (size > info->options_len)
+ memset(to + info->options_len, 0, size - info->options_len);
+
+ return info->options_len;
+err_clear:
+ memset(to, 0, size);
+ return err;
+}
+
+static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = {
+ .func = bpf_skb_get_tunnel_opt,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg3_type = ARG_CONST_SIZE,
+};
+
+static struct metadata_dst __percpu *md_dst;
+
+BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb,
+ const struct bpf_tunnel_key *, from, u32, size, u64, flags)
+{
+ struct metadata_dst *md = this_cpu_ptr(md_dst);
+ u8 compat[sizeof(struct bpf_tunnel_key)];
+ struct ip_tunnel_info *info;
+
+ if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX |
+ BPF_F_DONT_FRAGMENT | BPF_F_SEQ_NUMBER |
+ BPF_F_NO_TUNNEL_KEY)))
+ return -EINVAL;
+ if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
+ switch (size) {
+ case offsetof(struct bpf_tunnel_key, local_ipv6[0]):
+ case offsetof(struct bpf_tunnel_key, tunnel_label):
+ case offsetof(struct bpf_tunnel_key, tunnel_ext):
+ case offsetof(struct bpf_tunnel_key, remote_ipv6[1]):
+ /* Fixup deprecated structure layouts here, so we have
+ * a common path later on.
+ */
+ memcpy(compat, from, size);
+ memset(compat + size, 0, sizeof(compat) - size);
+ from = (const struct bpf_tunnel_key *) compat;
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
+ if (unlikely((!(flags & BPF_F_TUNINFO_IPV6) && from->tunnel_label) ||
+ from->tunnel_ext))
+ return -EINVAL;
+
+ skb_dst_drop(skb);
+ dst_hold((struct dst_entry *) md);
+ skb_dst_set(skb, (struct dst_entry *) md);
+
+ info = &md->u.tun_info;
+ memset(info, 0, sizeof(*info));
+ info->mode = IP_TUNNEL_INFO_TX;
+
+ __set_bit(IP_TUNNEL_NOCACHE_BIT, info->key.tun_flags);
+ __assign_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, info->key.tun_flags,
+ flags & BPF_F_DONT_FRAGMENT);
+ __assign_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags,
+ !(flags & BPF_F_ZERO_CSUM_TX));
+ __assign_bit(IP_TUNNEL_SEQ_BIT, info->key.tun_flags,
+ flags & BPF_F_SEQ_NUMBER);
+ __assign_bit(IP_TUNNEL_KEY_BIT, info->key.tun_flags,
+ !(flags & BPF_F_NO_TUNNEL_KEY));
+
+ info->key.tun_id = cpu_to_be64(from->tunnel_id);
+ info->key.tos = from->tunnel_tos;
+ info->key.ttl = from->tunnel_ttl;
+
+ if (flags & BPF_F_TUNINFO_IPV6) {
+ info->mode |= IP_TUNNEL_INFO_IPV6;
+ memcpy(&info->key.u.ipv6.dst, from->remote_ipv6,
+ sizeof(from->remote_ipv6));
+ memcpy(&info->key.u.ipv6.src, from->local_ipv6,
+ sizeof(from->local_ipv6));
+ info->key.label = cpu_to_be32(from->tunnel_label) &
+ IPV6_FLOWLABEL_MASK;
+ } else {
+ info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4);
+ info->key.u.ipv4.src = cpu_to_be32(from->local_ipv4);
+ info->key.flow_flags = FLOWI_FLAG_ANYSRC;
+ }
+
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = {
+ .func = bpf_skb_set_tunnel_key,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg3_type = ARG_CONST_SIZE,
+ .arg4_type = ARG_ANYTHING,
+};
+
+BPF_CALL_3(bpf_skb_set_tunnel_opt, struct sk_buff *, skb,
+ const u8 *, from, u32, size)
+{
+ struct ip_tunnel_info *info = skb_tunnel_info(skb);
+ const struct metadata_dst *md = this_cpu_ptr(md_dst);
+ IP_TUNNEL_DECLARE_FLAGS(present) = { };
+
+ if (unlikely(info != &md->u.tun_info || (size & (sizeof(u32) - 1))))
+ return -EINVAL;
+ if (unlikely(size > IP_TUNNEL_OPTS_MAX))
+ return -ENOMEM;
+
+ ip_tunnel_set_options_present(present);
+ ip_tunnel_info_opts_set(info, from, size, present);
+
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = {
+ .func = bpf_skb_set_tunnel_opt,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg3_type = ARG_CONST_SIZE,
+};
+
+static const struct bpf_func_proto *
+bpf_get_skb_set_tunnel_proto(enum bpf_func_id which)
+{
+ if (!md_dst) {
+ struct metadata_dst __percpu *tmp;
+
+ tmp = metadata_dst_alloc_percpu(IP_TUNNEL_OPTS_MAX,
+ METADATA_IP_TUNNEL,
+ GFP_KERNEL);
+ if (!tmp)
+ return NULL;
+ if (cmpxchg(&md_dst, NULL, tmp))
+ metadata_dst_free_percpu(tmp);
+ }
+
+ switch (which) {
+ case BPF_FUNC_skb_set_tunnel_key:
+ return &bpf_skb_set_tunnel_key_proto;
+ case BPF_FUNC_skb_set_tunnel_opt:
+ return &bpf_skb_set_tunnel_opt_proto;
+ default:
+ return NULL;
+ }
+}
+
+BPF_CALL_3(bpf_skb_under_cgroup, struct sk_buff *, skb, struct bpf_map *, map,
+ u32, idx)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ struct cgroup *cgrp;
+ struct sock *sk;
+
+ sk = skb_to_full_sk(skb);
+ if (!sk || !sk_fullsock(sk))
+ return -ENOENT;
+ if (unlikely(idx >= array->map.max_entries))
+ return -E2BIG;
+
+ cgrp = READ_ONCE(array->ptrs[idx]);
+ if (unlikely(!cgrp))
+ return -EAGAIN;
+
+ return sk_under_cgroup_hierarchy(sk, cgrp);
+}
+
+static const struct bpf_func_proto bpf_skb_under_cgroup_proto = {
+ .func = bpf_skb_under_cgroup,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+};
+
+#ifdef CONFIG_SOCK_CGROUP_DATA
+static inline u64 __bpf_sk_cgroup_id(struct sock *sk)
+{
+ struct cgroup *cgrp;
+
+ sk = sk_to_full_sk(sk);
+ if (!sk || !sk_fullsock(sk))
+ return 0;
+
+ cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+ return cgroup_id(cgrp);
+}
+
+BPF_CALL_1(bpf_skb_cgroup_id, const struct sk_buff *, skb)
+{
+ return __bpf_sk_cgroup_id(skb->sk);
+}
+
+static const struct bpf_func_proto bpf_skb_cgroup_id_proto = {
+ .func = bpf_skb_cgroup_id,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
+static inline u64 __bpf_sk_ancestor_cgroup_id(struct sock *sk,
+ int ancestor_level)
+{
+ struct cgroup *ancestor;
+ struct cgroup *cgrp;
+
+ sk = sk_to_full_sk(sk);
+ if (!sk || !sk_fullsock(sk))
+ return 0;
+
+ cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+ ancestor = cgroup_ancestor(cgrp, ancestor_level);
+ if (!ancestor)
+ return 0;
+
+ return cgroup_id(ancestor);
+}
+
+BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int,
+ ancestor_level)
+{
+ return __bpf_sk_ancestor_cgroup_id(skb->sk, ancestor_level);
+}
+
+static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = {
+ .func = bpf_skb_ancestor_cgroup_id,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+};
+
+BPF_CALL_1(bpf_sk_cgroup_id, struct sock *, sk)
+{
+ return __bpf_sk_cgroup_id(sk);
+}
+
+static const struct bpf_func_proto bpf_sk_cgroup_id_proto = {
+ .func = bpf_sk_cgroup_id,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+};
+
+BPF_CALL_2(bpf_sk_ancestor_cgroup_id, struct sock *, sk, int, ancestor_level)
+{
+ return __bpf_sk_ancestor_cgroup_id(sk, ancestor_level);
+}
+
+static const struct bpf_func_proto bpf_sk_ancestor_cgroup_id_proto = {
+ .func = bpf_sk_ancestor_cgroup_id,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+ .arg2_type = ARG_ANYTHING,
+};
+#endif
+
+static unsigned long bpf_xdp_copy(void *dst, const void *ctx,
+ unsigned long off, unsigned long len)
+{
+ struct xdp_buff *xdp = (struct xdp_buff *)ctx;
+
+ bpf_xdp_copy_buf(xdp, off, dst, len, false);
+ return 0;
+}
+
+BPF_CALL_5(bpf_xdp_event_output, struct xdp_buff *, xdp, struct bpf_map *, map,
+ u64, flags, void *, meta, u64, meta_size)
+{
+ u64 xdp_size = (flags & BPF_F_CTXLEN_MASK) >> 32;
+
+ if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK)))
+ return -EINVAL;
+
+ if (unlikely(!xdp || xdp_size > xdp_get_buff_len(xdp)))
+ return -EFAULT;
+
+ return bpf_event_output(map, flags, meta, meta_size, xdp,
+ xdp_size, bpf_xdp_copy);
+}
+
+static const struct bpf_func_proto bpf_xdp_event_output_proto = {
+ .func = bpf_xdp_event_output,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg5_type = ARG_CONST_SIZE_OR_ZERO,
+};
+
+BTF_ID_LIST_SINGLE(bpf_xdp_output_btf_ids, struct, xdp_buff)
+
+const struct bpf_func_proto bpf_xdp_output_proto = {
+ .func = bpf_xdp_event_output,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg1_btf_id = &bpf_xdp_output_btf_ids[0],
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg5_type = ARG_CONST_SIZE_OR_ZERO,
+};
+
+BPF_CALL_1(bpf_get_socket_cookie, struct sk_buff *, skb)
+{
+ return skb->sk ? __sock_gen_cookie(skb->sk) : 0;
+}
+
+static const struct bpf_func_proto bpf_get_socket_cookie_proto = {
+ .func = bpf_get_socket_cookie,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
+BPF_CALL_1(bpf_get_socket_cookie_sock_addr, struct bpf_sock_addr_kern *, ctx)
+{
+ return __sock_gen_cookie(ctx->sk);
+}
+
+static const struct bpf_func_proto bpf_get_socket_cookie_sock_addr_proto = {
+ .func = bpf_get_socket_cookie_sock_addr,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
+BPF_CALL_1(bpf_get_socket_cookie_sock, struct sock *, ctx)
+{
+ return __sock_gen_cookie(ctx);
+}
+
+static const struct bpf_func_proto bpf_get_socket_cookie_sock_proto = {
+ .func = bpf_get_socket_cookie_sock,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
+BPF_CALL_1(bpf_get_socket_ptr_cookie, struct sock *, sk)
+{
+ return sk ? sock_gen_cookie(sk) : 0;
+}
+
+const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto = {
+ .func = bpf_get_socket_ptr_cookie,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON | PTR_MAYBE_NULL,
+};
+
+BPF_CALL_1(bpf_get_socket_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx)
+{
+ return __sock_gen_cookie(ctx->sk);
+}
+
+static const struct bpf_func_proto bpf_get_socket_cookie_sock_ops_proto = {
+ .func = bpf_get_socket_cookie_sock_ops,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
+static u64 __bpf_get_netns_cookie(struct sock *sk)
+{
+ const struct net *net = sk ? sock_net(sk) : &init_net;
+
+ return net->net_cookie;
+}
+
+BPF_CALL_1(bpf_get_netns_cookie, struct sk_buff *, skb)
+{
+ return __bpf_get_netns_cookie(skb && skb->sk ? skb->sk : NULL);
+}
+
+static const struct bpf_func_proto bpf_get_netns_cookie_proto = {
+ .func = bpf_get_netns_cookie,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX_OR_NULL,
+};
+
+BPF_CALL_1(bpf_get_netns_cookie_sock, struct sock *, ctx)
+{
+ return __bpf_get_netns_cookie(ctx);
+}
+
+static const struct bpf_func_proto bpf_get_netns_cookie_sock_proto = {
+ .func = bpf_get_netns_cookie_sock,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX_OR_NULL,
+};
+
+BPF_CALL_1(bpf_get_netns_cookie_sock_addr, struct bpf_sock_addr_kern *, ctx)
+{
+ return __bpf_get_netns_cookie(ctx ? ctx->sk : NULL);
+}
+
+static const struct bpf_func_proto bpf_get_netns_cookie_sock_addr_proto = {
+ .func = bpf_get_netns_cookie_sock_addr,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX_OR_NULL,
+};
+
+BPF_CALL_1(bpf_get_netns_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx)
+{
+ return __bpf_get_netns_cookie(ctx ? ctx->sk : NULL);
+}
+
+static const struct bpf_func_proto bpf_get_netns_cookie_sock_ops_proto = {
+ .func = bpf_get_netns_cookie_sock_ops,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX_OR_NULL,
+};
+
+BPF_CALL_1(bpf_get_netns_cookie_sk_msg, struct sk_msg *, ctx)
+{
+ return __bpf_get_netns_cookie(ctx ? ctx->sk : NULL);
+}
+
+static const struct bpf_func_proto bpf_get_netns_cookie_sk_msg_proto = {
+ .func = bpf_get_netns_cookie_sk_msg,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX_OR_NULL,
+};
+
+BPF_CALL_1(bpf_get_socket_uid, struct sk_buff *, skb)
+{
+ struct sock *sk = sk_to_full_sk(skb->sk);
+ kuid_t kuid;
+
+ if (!sk || !sk_fullsock(sk))
+ return overflowuid;
+ kuid = sock_net_uid(sock_net(sk), sk);
+ return from_kuid_munged(sock_net(sk)->user_ns, kuid);
+}
+
+static const struct bpf_func_proto bpf_get_socket_uid_proto = {
+ .func = bpf_get_socket_uid,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
+static int sk_bpf_set_get_cb_flags(struct sock *sk, char *optval, bool getopt)
+{
+ u32 sk_bpf_cb_flags;
+
+ if (getopt) {
+ *(u32 *)optval = sk->sk_bpf_cb_flags;
+ return 0;
+ }
+
+ sk_bpf_cb_flags = *(u32 *)optval;
+
+ if (sk_bpf_cb_flags & ~SK_BPF_CB_MASK)
+ return -EINVAL;
+
+ sk->sk_bpf_cb_flags = sk_bpf_cb_flags;
+
+ return 0;
+}
+
+static int sol_socket_sockopt(struct sock *sk, int optname,
+ char *optval, int *optlen,
+ bool getopt)
+{
+ switch (optname) {
+ case SO_REUSEADDR:
+ case SO_SNDBUF:
+ case SO_RCVBUF:
+ case SO_KEEPALIVE:
+ case SO_PRIORITY:
+ case SO_REUSEPORT:
+ case SO_RCVLOWAT:
+ case SO_MARK:
+ case SO_MAX_PACING_RATE:
+ case SO_BINDTOIFINDEX:
+ case SO_TXREHASH:
+ case SK_BPF_CB_FLAGS:
+ if (*optlen != sizeof(int))
+ return -EINVAL;
+ break;
+ case SO_BINDTODEVICE:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (optname == SK_BPF_CB_FLAGS)
+ return sk_bpf_set_get_cb_flags(sk, optval, getopt);
+
+ if (getopt) {
+ if (optname == SO_BINDTODEVICE)
+ return -EINVAL;
+ return sk_getsockopt(sk, SOL_SOCKET, optname,
+ KERNEL_SOCKPTR(optval),
+ KERNEL_SOCKPTR(optlen));
+ }
+
+ return sk_setsockopt(sk, SOL_SOCKET, optname,
+ KERNEL_SOCKPTR(optval), *optlen);
+}
+
+static int bpf_sol_tcp_getsockopt(struct sock *sk, int optname,
+ char *optval, int optlen)
+{
+ if (optlen != sizeof(int))
+ return -EINVAL;
+
+ switch (optname) {
+ case TCP_BPF_SOCK_OPS_CB_FLAGS: {
+ int cb_flags = tcp_sk(sk)->bpf_sock_ops_cb_flags;
+
+ memcpy(optval, &cb_flags, optlen);
+ break;
+ }
+ case TCP_BPF_RTO_MIN: {
+ int rto_min_us = jiffies_to_usecs(inet_csk(sk)->icsk_rto_min);
+
+ memcpy(optval, &rto_min_us, optlen);
+ break;
+ }
+ case TCP_BPF_DELACK_MAX: {
+ int delack_max_us = jiffies_to_usecs(inet_csk(sk)->icsk_delack_max);
+
+ memcpy(optval, &delack_max_us, optlen);
+ break;
+ }
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int bpf_sol_tcp_setsockopt(struct sock *sk, int optname,
+ char *optval, int optlen)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ unsigned long timeout;
+ int val;
+
+ if (optlen != sizeof(int))
+ return -EINVAL;
+
+ val = *(int *)optval;
+
+ /* Only some options are supported */
+ switch (optname) {
+ case TCP_BPF_IW:
+ if (val <= 0 || tp->data_segs_out > tp->syn_data)
+ return -EINVAL;
+ tcp_snd_cwnd_set(tp, val);
+ break;
+ case TCP_BPF_SNDCWND_CLAMP:
+ if (val <= 0)
+ return -EINVAL;
+ tp->snd_cwnd_clamp = val;
+ tp->snd_ssthresh = val;
+ break;
+ case TCP_BPF_DELACK_MAX:
+ timeout = usecs_to_jiffies(val);
+ if (timeout > TCP_DELACK_MAX ||
+ timeout < TCP_TIMEOUT_MIN)
+ return -EINVAL;
+ inet_csk(sk)->icsk_delack_max = timeout;
+ break;
+ case TCP_BPF_RTO_MIN:
+ timeout = usecs_to_jiffies(val);
+ if (timeout > TCP_RTO_MIN ||
+ timeout < TCP_TIMEOUT_MIN)
+ return -EINVAL;
+ inet_csk(sk)->icsk_rto_min = timeout;
+ break;
+ case TCP_BPF_SOCK_OPS_CB_FLAGS:
+ if (val & ~(BPF_SOCK_OPS_ALL_CB_FLAGS))
+ return -EINVAL;
+ tp->bpf_sock_ops_cb_flags = val;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int sol_tcp_sockopt_congestion(struct sock *sk, char *optval,
+ int *optlen, bool getopt)
+{
+ struct tcp_sock *tp;
+ int ret;
+
+ if (*optlen < 2)
+ return -EINVAL;
+
+ if (getopt) {
+ if (!inet_csk(sk)->icsk_ca_ops)
+ return -EINVAL;
+ /* BPF expects NULL-terminated tcp-cc string */
+ optval[--(*optlen)] = '\0';
+ return do_tcp_getsockopt(sk, SOL_TCP, TCP_CONGESTION,
+ KERNEL_SOCKPTR(optval),
+ KERNEL_SOCKPTR(optlen));
+ }
+
+ /* "cdg" is the only cc that alloc a ptr
+ * in inet_csk_ca area. The bpf-tcp-cc may
+ * overwrite this ptr after switching to cdg.
+ */
+ if (*optlen >= sizeof("cdg") - 1 && !strncmp("cdg", optval, *optlen))
+ return -ENOTSUPP;
+
+ /* It stops this looping
+ *
+ * .init => bpf_setsockopt(tcp_cc) => .init =>
+ * bpf_setsockopt(tcp_cc)" => .init => ....
+ *
+ * The second bpf_setsockopt(tcp_cc) is not allowed
+ * in order to break the loop when both .init
+ * are the same bpf prog.
+ *
+ * This applies even the second bpf_setsockopt(tcp_cc)
+ * does not cause a loop. This limits only the first
+ * '.init' can call bpf_setsockopt(TCP_CONGESTION) to
+ * pick a fallback cc (eg. peer does not support ECN)
+ * and the second '.init' cannot fallback to
+ * another.
+ */
+ tp = tcp_sk(sk);
+ if (tp->bpf_chg_cc_inprogress)
+ return -EBUSY;
+
+ tp->bpf_chg_cc_inprogress = 1;
+ ret = do_tcp_setsockopt(sk, SOL_TCP, TCP_CONGESTION,
+ KERNEL_SOCKPTR(optval), *optlen);
+ tp->bpf_chg_cc_inprogress = 0;
+ return ret;
+}
+
+static int sol_tcp_sockopt(struct sock *sk, int optname,
+ char *optval, int *optlen,
+ bool getopt)
+{
+ if (sk->sk_protocol != IPPROTO_TCP)
+ return -EINVAL;
+
+ switch (optname) {
+ case TCP_NODELAY:
+ case TCP_MAXSEG:
+ case TCP_KEEPIDLE:
+ case TCP_KEEPINTVL:
+ case TCP_KEEPCNT:
+ case TCP_SYNCNT:
+ case TCP_WINDOW_CLAMP:
+ case TCP_THIN_LINEAR_TIMEOUTS:
+ case TCP_USER_TIMEOUT:
+ case TCP_NOTSENT_LOWAT:
+ case TCP_SAVE_SYN:
+ case TCP_RTO_MAX_MS:
+ if (*optlen != sizeof(int))
+ return -EINVAL;
+ break;
+ case TCP_CONGESTION:
+ return sol_tcp_sockopt_congestion(sk, optval, optlen, getopt);
+ case TCP_SAVED_SYN:
+ if (*optlen < 1)
+ return -EINVAL;
+ break;
+ default:
+ if (getopt)
+ return bpf_sol_tcp_getsockopt(sk, optname, optval, *optlen);
+ return bpf_sol_tcp_setsockopt(sk, optname, optval, *optlen);
+ }
+
+ if (getopt) {
+ if (optname == TCP_SAVED_SYN) {
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (!tp->saved_syn ||
+ *optlen > tcp_saved_syn_len(tp->saved_syn))
+ return -EINVAL;
+ memcpy(optval, tp->saved_syn->data, *optlen);
+ /* It cannot free tp->saved_syn here because it
+ * does not know if the user space still needs it.
+ */
+ return 0;
+ }
+
+ return do_tcp_getsockopt(sk, SOL_TCP, optname,
+ KERNEL_SOCKPTR(optval),
+ KERNEL_SOCKPTR(optlen));
+ }
+
+ return do_tcp_setsockopt(sk, SOL_TCP, optname,
+ KERNEL_SOCKPTR(optval), *optlen);
+}
+
+static int sol_ip_sockopt(struct sock *sk, int optname,
+ char *optval, int *optlen,
+ bool getopt)
+{
+ if (sk->sk_family != AF_INET)
+ return -EINVAL;
+
+ switch (optname) {
+ case IP_TOS:
+ if (*optlen != sizeof(int))
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
}
- if (fp)
- sk_filter_release(sk, fp);
+ if (getopt)
+ return do_ip_getsockopt(sk, SOL_IP, optname,
+ KERNEL_SOCKPTR(optval),
+ KERNEL_SOCKPTR(optlen));
+
+ return do_ip_setsockopt(sk, SOL_IP, optname,
+ KERNEL_SOCKPTR(optval), *optlen);
+}
+
+static int sol_ipv6_sockopt(struct sock *sk, int optname,
+ char *optval, int *optlen,
+ bool getopt)
+{
+ if (sk->sk_family != AF_INET6)
+ return -EINVAL;
+
+ switch (optname) {
+ case IPV6_TCLASS:
+ case IPV6_AUTOFLOWLABEL:
+ if (*optlen != sizeof(int))
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (getopt)
+ return ipv6_bpf_stub->ipv6_getsockopt(sk, SOL_IPV6, optname,
+ KERNEL_SOCKPTR(optval),
+ KERNEL_SOCKPTR(optlen));
+
+ return ipv6_bpf_stub->ipv6_setsockopt(sk, SOL_IPV6, optname,
+ KERNEL_SOCKPTR(optval), *optlen);
+}
+
+static int __bpf_setsockopt(struct sock *sk, int level, int optname,
+ char *optval, int optlen)
+{
+ if (!sk_fullsock(sk))
+ return -EINVAL;
+
+ if (level == SOL_SOCKET)
+ return sol_socket_sockopt(sk, optname, optval, &optlen, false);
+ else if (IS_ENABLED(CONFIG_INET) && level == SOL_IP)
+ return sol_ip_sockopt(sk, optname, optval, &optlen, false);
+ else if (IS_ENABLED(CONFIG_IPV6) && level == SOL_IPV6)
+ return sol_ipv6_sockopt(sk, optname, optval, &optlen, false);
+ else if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP)
+ return sol_tcp_sockopt(sk, optname, optval, &optlen, false);
+
+ return -EINVAL;
+}
+
+static bool is_locked_tcp_sock_ops(struct bpf_sock_ops_kern *bpf_sock)
+{
+ return bpf_sock->op <= BPF_SOCK_OPS_WRITE_HDR_OPT_CB;
+}
+
+static int _bpf_setsockopt(struct sock *sk, int level, int optname,
+ char *optval, int optlen)
+{
+ if (sk_fullsock(sk))
+ sock_owned_by_me(sk);
+ return __bpf_setsockopt(sk, level, optname, optval, optlen);
+}
+
+static int __bpf_getsockopt(struct sock *sk, int level, int optname,
+ char *optval, int optlen)
+{
+ int err, saved_optlen = optlen;
+
+ if (!sk_fullsock(sk)) {
+ err = -EINVAL;
+ goto done;
+ }
+
+ if (level == SOL_SOCKET)
+ err = sol_socket_sockopt(sk, optname, optval, &optlen, true);
+ else if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP)
+ err = sol_tcp_sockopt(sk, optname, optval, &optlen, true);
+ else if (IS_ENABLED(CONFIG_INET) && level == SOL_IP)
+ err = sol_ip_sockopt(sk, optname, optval, &optlen, true);
+ else if (IS_ENABLED(CONFIG_IPV6) && level == SOL_IPV6)
+ err = sol_ipv6_sockopt(sk, optname, optval, &optlen, true);
+ else
+ err = -EINVAL;
+
+done:
+ if (err)
+ optlen = 0;
+ if (optlen < saved_optlen)
+ memset(optval + optlen, 0, saved_optlen - optlen);
return err;
}
-EXPORT_SYMBOL(sk_chk_filter);
-EXPORT_SYMBOL(sk_run_filter);
+static int _bpf_getsockopt(struct sock *sk, int level, int optname,
+ char *optval, int optlen)
+{
+ if (sk_fullsock(sk))
+ sock_owned_by_me(sk);
+ return __bpf_getsockopt(sk, level, optname, optval, optlen);
+}
+
+BPF_CALL_5(bpf_sk_setsockopt, struct sock *, sk, int, level,
+ int, optname, char *, optval, int, optlen)
+{
+ return _bpf_setsockopt(sk, level, optname, optval, optlen);
+}
+
+const struct bpf_func_proto bpf_sk_setsockopt_proto = {
+ .func = bpf_sk_setsockopt,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg5_type = ARG_CONST_SIZE,
+};
+
+BPF_CALL_5(bpf_sk_getsockopt, struct sock *, sk, int, level,
+ int, optname, char *, optval, int, optlen)
+{
+ return _bpf_getsockopt(sk, level, optname, optval, optlen);
+}
+
+const struct bpf_func_proto bpf_sk_getsockopt_proto = {
+ .func = bpf_sk_getsockopt,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg5_type = ARG_CONST_SIZE,
+};
+
+BPF_CALL_5(bpf_unlocked_sk_setsockopt, struct sock *, sk, int, level,
+ int, optname, char *, optval, int, optlen)
+{
+ return __bpf_setsockopt(sk, level, optname, optval, optlen);
+}
+
+const struct bpf_func_proto bpf_unlocked_sk_setsockopt_proto = {
+ .func = bpf_unlocked_sk_setsockopt,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg5_type = ARG_CONST_SIZE,
+};
+
+BPF_CALL_5(bpf_unlocked_sk_getsockopt, struct sock *, sk, int, level,
+ int, optname, char *, optval, int, optlen)
+{
+ return __bpf_getsockopt(sk, level, optname, optval, optlen);
+}
+
+const struct bpf_func_proto bpf_unlocked_sk_getsockopt_proto = {
+ .func = bpf_unlocked_sk_getsockopt,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg5_type = ARG_CONST_SIZE,
+};
+
+BPF_CALL_5(bpf_sock_addr_setsockopt, struct bpf_sock_addr_kern *, ctx,
+ int, level, int, optname, char *, optval, int, optlen)
+{
+ return _bpf_setsockopt(ctx->sk, level, optname, optval, optlen);
+}
+
+static const struct bpf_func_proto bpf_sock_addr_setsockopt_proto = {
+ .func = bpf_sock_addr_setsockopt,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg5_type = ARG_CONST_SIZE,
+};
+
+BPF_CALL_5(bpf_sock_addr_getsockopt, struct bpf_sock_addr_kern *, ctx,
+ int, level, int, optname, char *, optval, int, optlen)
+{
+ return _bpf_getsockopt(ctx->sk, level, optname, optval, optlen);
+}
+
+static const struct bpf_func_proto bpf_sock_addr_getsockopt_proto = {
+ .func = bpf_sock_addr_getsockopt,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg5_type = ARG_CONST_SIZE,
+};
+
+static int sk_bpf_set_get_bypass_prot_mem(struct sock *sk,
+ char *optval, int optlen,
+ bool getopt)
+{
+ int val;
+
+ if (optlen != sizeof(int))
+ return -EINVAL;
+
+ if (!sk_has_account(sk))
+ return -EOPNOTSUPP;
+
+ if (getopt) {
+ *(int *)optval = sk->sk_bypass_prot_mem;
+ return 0;
+ }
+
+ val = *(int *)optval;
+ if (val < 0 || val > 1)
+ return -EINVAL;
+
+ sk->sk_bypass_prot_mem = val;
+ return 0;
+}
+
+BPF_CALL_5(bpf_sock_create_setsockopt, struct sock *, sk, int, level,
+ int, optname, char *, optval, int, optlen)
+{
+ if (level == SOL_SOCKET && optname == SK_BPF_BYPASS_PROT_MEM)
+ return sk_bpf_set_get_bypass_prot_mem(sk, optval, optlen, false);
+
+ return __bpf_setsockopt(sk, level, optname, optval, optlen);
+}
+
+static const struct bpf_func_proto bpf_sock_create_setsockopt_proto = {
+ .func = bpf_sock_create_setsockopt,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg5_type = ARG_CONST_SIZE,
+};
+
+BPF_CALL_5(bpf_sock_create_getsockopt, struct sock *, sk, int, level,
+ int, optname, char *, optval, int, optlen)
+{
+ if (level == SOL_SOCKET && optname == SK_BPF_BYPASS_PROT_MEM) {
+ int err = sk_bpf_set_get_bypass_prot_mem(sk, optval, optlen, true);
+
+ if (err)
+ memset(optval, 0, optlen);
+
+ return err;
+ }
+
+ return __bpf_getsockopt(sk, level, optname, optval, optlen);
+}
+
+static const struct bpf_func_proto bpf_sock_create_getsockopt_proto = {
+ .func = bpf_sock_create_getsockopt,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg5_type = ARG_CONST_SIZE,
+};
+
+BPF_CALL_5(bpf_sock_ops_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
+ int, level, int, optname, char *, optval, int, optlen)
+{
+ if (!is_locked_tcp_sock_ops(bpf_sock))
+ return -EOPNOTSUPP;
+
+ return _bpf_setsockopt(bpf_sock->sk, level, optname, optval, optlen);
+}
+
+static const struct bpf_func_proto bpf_sock_ops_setsockopt_proto = {
+ .func = bpf_sock_ops_setsockopt,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg5_type = ARG_CONST_SIZE,
+};
+
+static int bpf_sock_ops_get_syn(struct bpf_sock_ops_kern *bpf_sock,
+ int optname, const u8 **start)
+{
+ struct sk_buff *syn_skb = bpf_sock->syn_skb;
+ const u8 *hdr_start;
+ int ret;
+
+ if (syn_skb) {
+ /* sk is a request_sock here */
+
+ if (optname == TCP_BPF_SYN) {
+ hdr_start = syn_skb->data;
+ ret = tcp_hdrlen(syn_skb);
+ } else if (optname == TCP_BPF_SYN_IP) {
+ hdr_start = skb_network_header(syn_skb);
+ ret = skb_network_header_len(syn_skb) +
+ tcp_hdrlen(syn_skb);
+ } else {
+ /* optname == TCP_BPF_SYN_MAC */
+ hdr_start = skb_mac_header(syn_skb);
+ ret = skb_mac_header_len(syn_skb) +
+ skb_network_header_len(syn_skb) +
+ tcp_hdrlen(syn_skb);
+ }
+ } else {
+ struct sock *sk = bpf_sock->sk;
+ struct saved_syn *saved_syn;
+
+ if (sk->sk_state == TCP_NEW_SYN_RECV)
+ /* synack retransmit. bpf_sock->syn_skb will
+ * not be available. It has to resort to
+ * saved_syn (if it is saved).
+ */
+ saved_syn = inet_reqsk(sk)->saved_syn;
+ else
+ saved_syn = tcp_sk(sk)->saved_syn;
+
+ if (!saved_syn)
+ return -ENOENT;
+
+ if (optname == TCP_BPF_SYN) {
+ hdr_start = saved_syn->data +
+ saved_syn->mac_hdrlen +
+ saved_syn->network_hdrlen;
+ ret = saved_syn->tcp_hdrlen;
+ } else if (optname == TCP_BPF_SYN_IP) {
+ hdr_start = saved_syn->data +
+ saved_syn->mac_hdrlen;
+ ret = saved_syn->network_hdrlen +
+ saved_syn->tcp_hdrlen;
+ } else {
+ /* optname == TCP_BPF_SYN_MAC */
+
+ /* TCP_SAVE_SYN may not have saved the mac hdr */
+ if (!saved_syn->mac_hdrlen)
+ return -ENOENT;
+
+ hdr_start = saved_syn->data;
+ ret = saved_syn->mac_hdrlen +
+ saved_syn->network_hdrlen +
+ saved_syn->tcp_hdrlen;
+ }
+ }
+
+ *start = hdr_start;
+ return ret;
+}
+
+BPF_CALL_5(bpf_sock_ops_getsockopt, struct bpf_sock_ops_kern *, bpf_sock,
+ int, level, int, optname, char *, optval, int, optlen)
+{
+ if (!is_locked_tcp_sock_ops(bpf_sock))
+ return -EOPNOTSUPP;
+
+ if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP &&
+ optname >= TCP_BPF_SYN && optname <= TCP_BPF_SYN_MAC) {
+ int ret, copy_len = 0;
+ const u8 *start;
+
+ ret = bpf_sock_ops_get_syn(bpf_sock, optname, &start);
+ if (ret > 0) {
+ copy_len = ret;
+ if (optlen < copy_len) {
+ copy_len = optlen;
+ ret = -ENOSPC;
+ }
+
+ memcpy(optval, start, copy_len);
+ }
+
+ /* Zero out unused buffer at the end */
+ memset(optval + copy_len, 0, optlen - copy_len);
+
+ return ret;
+ }
+
+ return _bpf_getsockopt(bpf_sock->sk, level, optname, optval, optlen);
+}
+
+static const struct bpf_func_proto bpf_sock_ops_getsockopt_proto = {
+ .func = bpf_sock_ops_getsockopt,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg5_type = ARG_CONST_SIZE,
+};
+
+BPF_CALL_2(bpf_sock_ops_cb_flags_set, struct bpf_sock_ops_kern *, bpf_sock,
+ int, argval)
+{
+ struct sock *sk = bpf_sock->sk;
+ int val = argval & BPF_SOCK_OPS_ALL_CB_FLAGS;
+
+ if (!is_locked_tcp_sock_ops(bpf_sock))
+ return -EOPNOTSUPP;
+
+ if (!IS_ENABLED(CONFIG_INET) || !sk_fullsock(sk))
+ return -EINVAL;
+
+ tcp_sk(sk)->bpf_sock_ops_cb_flags = val;
+
+ return argval & (~BPF_SOCK_OPS_ALL_CB_FLAGS);
+}
+
+static const struct bpf_func_proto bpf_sock_ops_cb_flags_set_proto = {
+ .func = bpf_sock_ops_cb_flags_set,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+};
+
+const struct ipv6_bpf_stub *ipv6_bpf_stub __read_mostly;
+EXPORT_SYMBOL_GPL(ipv6_bpf_stub);
+
+BPF_CALL_3(bpf_bind, struct bpf_sock_addr_kern *, ctx, struct sockaddr *, addr,
+ int, addr_len)
+{
+#ifdef CONFIG_INET
+ struct sock *sk = ctx->sk;
+ u32 flags = BIND_FROM_BPF;
+ int err;
+
+ err = -EINVAL;
+ if (addr_len < offsetofend(struct sockaddr, sa_family))
+ return err;
+ if (addr->sa_family == AF_INET) {
+ if (addr_len < sizeof(struct sockaddr_in))
+ return err;
+ if (((struct sockaddr_in *)addr)->sin_port == htons(0))
+ flags |= BIND_FORCE_ADDRESS_NO_PORT;
+ return __inet_bind(sk, (struct sockaddr_unsized *)addr, addr_len, flags);
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (addr->sa_family == AF_INET6) {
+ if (addr_len < SIN6_LEN_RFC2133)
+ return err;
+ if (((struct sockaddr_in6 *)addr)->sin6_port == htons(0))
+ flags |= BIND_FORCE_ADDRESS_NO_PORT;
+ /* ipv6_bpf_stub cannot be NULL, since it's called from
+ * bpf_cgroup_inet6_connect hook and ipv6 is already loaded
+ */
+ return ipv6_bpf_stub->inet6_bind(sk, (struct sockaddr_unsized *)addr,
+ addr_len, flags);
+#endif /* CONFIG_IPV6 */
+ }
+#endif /* CONFIG_INET */
+
+ return -EAFNOSUPPORT;
+}
+
+static const struct bpf_func_proto bpf_bind_proto = {
+ .func = bpf_bind,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg3_type = ARG_CONST_SIZE,
+};
+
+#ifdef CONFIG_XFRM
+
+#if (IS_BUILTIN(CONFIG_XFRM_INTERFACE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) || \
+ (IS_MODULE(CONFIG_XFRM_INTERFACE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES))
+
+struct metadata_dst __percpu *xfrm_bpf_md_dst;
+EXPORT_SYMBOL_GPL(xfrm_bpf_md_dst);
+
+#endif
+
+BPF_CALL_5(bpf_skb_get_xfrm_state, struct sk_buff *, skb, u32, index,
+ struct bpf_xfrm_state *, to, u32, size, u64, flags)
+{
+ const struct sec_path *sp = skb_sec_path(skb);
+ const struct xfrm_state *x;
+
+ if (!sp || unlikely(index >= sp->len || flags))
+ goto err_clear;
+
+ x = sp->xvec[index];
+
+ if (unlikely(size != sizeof(struct bpf_xfrm_state)))
+ goto err_clear;
+
+ to->reqid = x->props.reqid;
+ to->spi = x->id.spi;
+ to->family = x->props.family;
+ to->ext = 0;
+
+ if (to->family == AF_INET6) {
+ memcpy(to->remote_ipv6, x->props.saddr.a6,
+ sizeof(to->remote_ipv6));
+ } else {
+ to->remote_ipv4 = x->props.saddr.a4;
+ memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3);
+ }
+
+ return 0;
+err_clear:
+ memset(to, 0, size);
+ return -EINVAL;
+}
+
+static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = {
+ .func = bpf_skb_get_xfrm_state,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg4_type = ARG_CONST_SIZE,
+ .arg5_type = ARG_ANYTHING,
+};
+#endif
+
+#if IS_ENABLED(CONFIG_INET) || IS_ENABLED(CONFIG_IPV6)
+static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params, u32 mtu)
+{
+ params->h_vlan_TCI = 0;
+ params->h_vlan_proto = 0;
+ if (mtu)
+ params->mtu_result = mtu; /* union with tot_len */
+
+ return 0;
+}
+#endif
+
+#if IS_ENABLED(CONFIG_INET)
+static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
+ u32 flags, bool check_mtu)
+{
+ struct fib_nh_common *nhc;
+ struct in_device *in_dev;
+ struct neighbour *neigh;
+ struct net_device *dev;
+ struct fib_result res;
+ struct flowi4 fl4;
+ u32 mtu = 0;
+ int err;
+
+ dev = dev_get_by_index_rcu(net, params->ifindex);
+ if (unlikely(!dev))
+ return -ENODEV;
+
+ /* verify forwarding is enabled on this interface */
+ in_dev = __in_dev_get_rcu(dev);
+ if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev)))
+ return BPF_FIB_LKUP_RET_FWD_DISABLED;
+
+ if (flags & BPF_FIB_LOOKUP_OUTPUT) {
+ fl4.flowi4_iif = 1;
+ fl4.flowi4_oif = params->ifindex;
+ } else {
+ fl4.flowi4_iif = params->ifindex;
+ fl4.flowi4_oif = 0;
+ }
+ fl4.flowi4_dscp = inet_dsfield_to_dscp(params->tos);
+ fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
+ fl4.flowi4_flags = 0;
+
+ fl4.flowi4_proto = params->l4_protocol;
+ fl4.daddr = params->ipv4_dst;
+ fl4.saddr = params->ipv4_src;
+ fl4.fl4_sport = params->sport;
+ fl4.fl4_dport = params->dport;
+ fl4.flowi4_multipath_hash = 0;
+
+ if (flags & BPF_FIB_LOOKUP_DIRECT) {
+ u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
+ struct fib_table *tb;
+
+ if (flags & BPF_FIB_LOOKUP_TBID) {
+ tbid = params->tbid;
+ /* zero out for vlan output */
+ params->tbid = 0;
+ }
+
+ tb = fib_get_table(net, tbid);
+ if (unlikely(!tb))
+ return BPF_FIB_LKUP_RET_NOT_FWDED;
+
+ err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
+ } else {
+ if (flags & BPF_FIB_LOOKUP_MARK)
+ fl4.flowi4_mark = params->mark;
+ else
+ fl4.flowi4_mark = 0;
+ fl4.flowi4_secid = 0;
+ fl4.flowi4_tun_key.tun_id = 0;
+ fl4.flowi4_uid = sock_net_uid(net, NULL);
+
+ err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_NOREF);
+ }
+
+ if (err) {
+ /* map fib lookup errors to RTN_ type */
+ if (err == -EINVAL)
+ return BPF_FIB_LKUP_RET_BLACKHOLE;
+ if (err == -EHOSTUNREACH)
+ return BPF_FIB_LKUP_RET_UNREACHABLE;
+ if (err == -EACCES)
+ return BPF_FIB_LKUP_RET_PROHIBIT;
+
+ return BPF_FIB_LKUP_RET_NOT_FWDED;
+ }
+
+ if (res.type != RTN_UNICAST)
+ return BPF_FIB_LKUP_RET_NOT_FWDED;
+
+ if (fib_info_num_path(res.fi) > 1)
+ fib_select_path(net, &res, &fl4, NULL);
+
+ if (check_mtu) {
+ mtu = ip_mtu_from_fib_result(&res, params->ipv4_dst);
+ if (params->tot_len > mtu) {
+ params->mtu_result = mtu; /* union with tot_len */
+ return BPF_FIB_LKUP_RET_FRAG_NEEDED;
+ }
+ }
+
+ nhc = res.nhc;
+
+ /* do not handle lwt encaps right now */
+ if (nhc->nhc_lwtstate)
+ return BPF_FIB_LKUP_RET_UNSUPP_LWT;
+
+ dev = nhc->nhc_dev;
+
+ params->rt_metric = res.fi->fib_priority;
+ params->ifindex = dev->ifindex;
+
+ if (flags & BPF_FIB_LOOKUP_SRC)
+ params->ipv4_src = fib_result_prefsrc(net, &res);
+
+ /* xdp and cls_bpf programs are run in RCU-bh so
+ * rcu_read_lock_bh is not needed here
+ */
+ if (likely(nhc->nhc_gw_family != AF_INET6)) {
+ if (nhc->nhc_gw_family)
+ params->ipv4_dst = nhc->nhc_gw.ipv4;
+ } else {
+ struct in6_addr *dst = (struct in6_addr *)params->ipv6_dst;
+
+ params->family = AF_INET6;
+ *dst = nhc->nhc_gw.ipv6;
+ }
+
+ if (flags & BPF_FIB_LOOKUP_SKIP_NEIGH)
+ goto set_fwd_params;
+
+ if (likely(nhc->nhc_gw_family != AF_INET6))
+ neigh = __ipv4_neigh_lookup_noref(dev,
+ (__force u32)params->ipv4_dst);
+ else
+ neigh = __ipv6_neigh_lookup_noref_stub(dev, params->ipv6_dst);
+
+ if (!neigh || !(READ_ONCE(neigh->nud_state) & NUD_VALID))
+ return BPF_FIB_LKUP_RET_NO_NEIGH;
+ memcpy(params->dmac, neigh->ha, ETH_ALEN);
+ memcpy(params->smac, dev->dev_addr, ETH_ALEN);
+
+set_fwd_params:
+ return bpf_fib_set_fwd_params(params, mtu);
+}
+#endif
+
+#if IS_ENABLED(CONFIG_IPV6)
+static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
+ u32 flags, bool check_mtu)
+{
+ struct in6_addr *src = (struct in6_addr *) params->ipv6_src;
+ struct in6_addr *dst = (struct in6_addr *) params->ipv6_dst;
+ struct fib6_result res = {};
+ struct neighbour *neigh;
+ struct net_device *dev;
+ struct inet6_dev *idev;
+ struct flowi6 fl6;
+ int strict = 0;
+ int oif, err;
+ u32 mtu = 0;
+
+ /* link local addresses are never forwarded */
+ if (rt6_need_strict(dst) || rt6_need_strict(src))
+ return BPF_FIB_LKUP_RET_NOT_FWDED;
+
+ dev = dev_get_by_index_rcu(net, params->ifindex);
+ if (unlikely(!dev))
+ return -ENODEV;
+
+ idev = __in6_dev_get_safely(dev);
+ if (unlikely(!idev || !READ_ONCE(idev->cnf.forwarding)))
+ return BPF_FIB_LKUP_RET_FWD_DISABLED;
+
+ if (flags & BPF_FIB_LOOKUP_OUTPUT) {
+ fl6.flowi6_iif = 1;
+ oif = fl6.flowi6_oif = params->ifindex;
+ } else {
+ oif = fl6.flowi6_iif = params->ifindex;
+ fl6.flowi6_oif = 0;
+ strict = RT6_LOOKUP_F_HAS_SADDR;
+ }
+ fl6.flowlabel = params->flowinfo;
+ fl6.flowi6_scope = 0;
+ fl6.flowi6_flags = 0;
+ fl6.mp_hash = 0;
+
+ fl6.flowi6_proto = params->l4_protocol;
+ fl6.daddr = *dst;
+ fl6.saddr = *src;
+ fl6.fl6_sport = params->sport;
+ fl6.fl6_dport = params->dport;
+
+ if (flags & BPF_FIB_LOOKUP_DIRECT) {
+ u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
+ struct fib6_table *tb;
+
+ if (flags & BPF_FIB_LOOKUP_TBID) {
+ tbid = params->tbid;
+ /* zero out for vlan output */
+ params->tbid = 0;
+ }
+
+ tb = ipv6_stub->fib6_get_table(net, tbid);
+ if (unlikely(!tb))
+ return BPF_FIB_LKUP_RET_NOT_FWDED;
+
+ err = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, &res,
+ strict);
+ } else {
+ if (flags & BPF_FIB_LOOKUP_MARK)
+ fl6.flowi6_mark = params->mark;
+ else
+ fl6.flowi6_mark = 0;
+ fl6.flowi6_secid = 0;
+ fl6.flowi6_tun_key.tun_id = 0;
+ fl6.flowi6_uid = sock_net_uid(net, NULL);
+
+ err = ipv6_stub->fib6_lookup(net, oif, &fl6, &res, strict);
+ }
+
+ if (unlikely(err || IS_ERR_OR_NULL(res.f6i) ||
+ res.f6i == net->ipv6.fib6_null_entry))
+ return BPF_FIB_LKUP_RET_NOT_FWDED;
+
+ switch (res.fib6_type) {
+ /* only unicast is forwarded */
+ case RTN_UNICAST:
+ break;
+ case RTN_BLACKHOLE:
+ return BPF_FIB_LKUP_RET_BLACKHOLE;
+ case RTN_UNREACHABLE:
+ return BPF_FIB_LKUP_RET_UNREACHABLE;
+ case RTN_PROHIBIT:
+ return BPF_FIB_LKUP_RET_PROHIBIT;
+ default:
+ return BPF_FIB_LKUP_RET_NOT_FWDED;
+ }
+
+ ipv6_stub->fib6_select_path(net, &res, &fl6, fl6.flowi6_oif,
+ fl6.flowi6_oif != 0, NULL, strict);
+
+ if (check_mtu) {
+ mtu = ipv6_stub->ip6_mtu_from_fib6(&res, dst, src);
+ if (params->tot_len > mtu) {
+ params->mtu_result = mtu; /* union with tot_len */
+ return BPF_FIB_LKUP_RET_FRAG_NEEDED;
+ }
+ }
+
+ if (res.nh->fib_nh_lws)
+ return BPF_FIB_LKUP_RET_UNSUPP_LWT;
+
+ if (res.nh->fib_nh_gw_family)
+ *dst = res.nh->fib_nh_gw6;
+
+ dev = res.nh->fib_nh_dev;
+ params->rt_metric = res.f6i->fib6_metric;
+ params->ifindex = dev->ifindex;
+
+ if (flags & BPF_FIB_LOOKUP_SRC) {
+ if (res.f6i->fib6_prefsrc.plen) {
+ *src = res.f6i->fib6_prefsrc.addr;
+ } else {
+ err = ipv6_bpf_stub->ipv6_dev_get_saddr(net, dev,
+ &fl6.daddr, 0,
+ src);
+ if (err)
+ return BPF_FIB_LKUP_RET_NO_SRC_ADDR;
+ }
+ }
+
+ if (flags & BPF_FIB_LOOKUP_SKIP_NEIGH)
+ goto set_fwd_params;
+
+ /* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is
+ * not needed here.
+ */
+ neigh = __ipv6_neigh_lookup_noref_stub(dev, dst);
+ if (!neigh || !(READ_ONCE(neigh->nud_state) & NUD_VALID))
+ return BPF_FIB_LKUP_RET_NO_NEIGH;
+ memcpy(params->dmac, neigh->ha, ETH_ALEN);
+ memcpy(params->smac, dev->dev_addr, ETH_ALEN);
+
+set_fwd_params:
+ return bpf_fib_set_fwd_params(params, mtu);
+}
+#endif
+
+#define BPF_FIB_LOOKUP_MASK (BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT | \
+ BPF_FIB_LOOKUP_SKIP_NEIGH | BPF_FIB_LOOKUP_TBID | \
+ BPF_FIB_LOOKUP_SRC | BPF_FIB_LOOKUP_MARK)
+
+BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx,
+ struct bpf_fib_lookup *, params, int, plen, u32, flags)
+{
+ if (plen < sizeof(*params))
+ return -EINVAL;
+
+ if (flags & ~BPF_FIB_LOOKUP_MASK)
+ return -EINVAL;
+
+ switch (params->family) {
+#if IS_ENABLED(CONFIG_INET)
+ case AF_INET:
+ return bpf_ipv4_fib_lookup(dev_net(ctx->rxq->dev), params,
+ flags, true);
+#endif
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6:
+ return bpf_ipv6_fib_lookup(dev_net(ctx->rxq->dev), params,
+ flags, true);
+#endif
+ }
+ return -EAFNOSUPPORT;
+}
+
+static const struct bpf_func_proto bpf_xdp_fib_lookup_proto = {
+ .func = bpf_xdp_fib_lookup,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_CONST_SIZE,
+ .arg4_type = ARG_ANYTHING,
+};
+
+BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb,
+ struct bpf_fib_lookup *, params, int, plen, u32, flags)
+{
+ struct net *net = dev_net(skb->dev);
+ int rc = -EAFNOSUPPORT;
+ bool check_mtu = false;
+
+ if (plen < sizeof(*params))
+ return -EINVAL;
+
+ if (flags & ~BPF_FIB_LOOKUP_MASK)
+ return -EINVAL;
+
+ if (params->tot_len)
+ check_mtu = true;
+
+ switch (params->family) {
+#if IS_ENABLED(CONFIG_INET)
+ case AF_INET:
+ rc = bpf_ipv4_fib_lookup(net, params, flags, check_mtu);
+ break;
+#endif
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6:
+ rc = bpf_ipv6_fib_lookup(net, params, flags, check_mtu);
+ break;
+#endif
+ }
+
+ if (rc == BPF_FIB_LKUP_RET_SUCCESS && !check_mtu) {
+ struct net_device *dev;
+
+ /* When tot_len isn't provided by user, check skb
+ * against MTU of FIB lookup resulting net_device
+ */
+ dev = dev_get_by_index_rcu(net, params->ifindex);
+ if (!is_skb_forwardable(dev, skb))
+ rc = BPF_FIB_LKUP_RET_FRAG_NEEDED;
+
+ params->mtu_result = dev->mtu; /* union with tot_len */
+ }
+
+ return rc;
+}
+
+static const struct bpf_func_proto bpf_skb_fib_lookup_proto = {
+ .func = bpf_skb_fib_lookup,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_CONST_SIZE,
+ .arg4_type = ARG_ANYTHING,
+};
+
+static struct net_device *__dev_via_ifindex(struct net_device *dev_curr,
+ u32 ifindex)
+{
+ struct net *netns = dev_net(dev_curr);
+
+ /* Non-redirect use-cases can use ifindex=0 and save ifindex lookup */
+ if (ifindex == 0)
+ return dev_curr;
+
+ return dev_get_by_index_rcu(netns, ifindex);
+}
+
+BPF_CALL_5(bpf_skb_check_mtu, struct sk_buff *, skb,
+ u32, ifindex, u32 *, mtu_len, s32, len_diff, u64, flags)
+{
+ int ret = BPF_MTU_CHK_RET_FRAG_NEEDED;
+ struct net_device *dev = skb->dev;
+ int mtu, dev_len, skb_len;
+
+ if (unlikely(flags & ~(BPF_MTU_CHK_SEGS)))
+ return -EINVAL;
+ if (unlikely(flags & BPF_MTU_CHK_SEGS && (len_diff || *mtu_len)))
+ return -EINVAL;
+
+ dev = __dev_via_ifindex(dev, ifindex);
+ if (unlikely(!dev))
+ return -ENODEV;
+
+ mtu = READ_ONCE(dev->mtu);
+ dev_len = mtu + dev->hard_header_len;
+
+ /* If set use *mtu_len as input, L3 as iph->tot_len (like fib_lookup) */
+ skb_len = *mtu_len ? *mtu_len + dev->hard_header_len : skb->len;
+
+ skb_len += len_diff; /* minus result pass check */
+ if (skb_len <= dev_len) {
+ ret = BPF_MTU_CHK_RET_SUCCESS;
+ goto out;
+ }
+ /* At this point, skb->len exceed MTU, but as it include length of all
+ * segments, it can still be below MTU. The SKB can possibly get
+ * re-segmented in transmit path (see validate_xmit_skb). Thus, user
+ * must choose if segs are to be MTU checked.
+ */
+ if (skb_is_gso(skb)) {
+ ret = BPF_MTU_CHK_RET_SUCCESS;
+ if (flags & BPF_MTU_CHK_SEGS) {
+ if (!skb_transport_header_was_set(skb))
+ return -EINVAL;
+ if (!skb_gso_validate_network_len(skb, mtu))
+ ret = BPF_MTU_CHK_RET_SEGS_TOOBIG;
+ }
+ }
+out:
+ *mtu_len = mtu;
+ return ret;
+}
+
+BPF_CALL_5(bpf_xdp_check_mtu, struct xdp_buff *, xdp,
+ u32, ifindex, u32 *, mtu_len, s32, len_diff, u64, flags)
+{
+ struct net_device *dev = xdp->rxq->dev;
+ int xdp_len = xdp->data_end - xdp->data;
+ int ret = BPF_MTU_CHK_RET_SUCCESS;
+ int mtu, dev_len;
+
+ /* XDP variant doesn't support multi-buffer segment check (yet) */
+ if (unlikely(flags))
+ return -EINVAL;
+
+ dev = __dev_via_ifindex(dev, ifindex);
+ if (unlikely(!dev))
+ return -ENODEV;
+
+ mtu = READ_ONCE(dev->mtu);
+ dev_len = mtu + dev->hard_header_len;
+
+ /* Use *mtu_len as input, L3 as iph->tot_len (like fib_lookup) */
+ if (*mtu_len)
+ xdp_len = *mtu_len + dev->hard_header_len;
+
+ xdp_len += len_diff; /* minus result pass check */
+ if (xdp_len > dev_len)
+ ret = BPF_MTU_CHK_RET_FRAG_NEEDED;
+
+ *mtu_len = mtu;
+ return ret;
+}
+
+static const struct bpf_func_proto bpf_skb_check_mtu_proto = {
+ .func = bpf_skb_check_mtu,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_WRITE | MEM_ALIGNED,
+ .arg3_size = sizeof(u32),
+ .arg4_type = ARG_ANYTHING,
+ .arg5_type = ARG_ANYTHING,
+};
+
+static const struct bpf_func_proto bpf_xdp_check_mtu_proto = {
+ .func = bpf_xdp_check_mtu,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_WRITE | MEM_ALIGNED,
+ .arg3_size = sizeof(u32),
+ .arg4_type = ARG_ANYTHING,
+ .arg5_type = ARG_ANYTHING,
+};
+
+#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
+static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len)
+{
+ int err;
+ struct ipv6_sr_hdr *srh = (struct ipv6_sr_hdr *)hdr;
+
+ if (!seg6_validate_srh(srh, len, false))
+ return -EINVAL;
+
+ switch (type) {
+ case BPF_LWT_ENCAP_SEG6_INLINE:
+ if (skb->protocol != htons(ETH_P_IPV6))
+ return -EBADMSG;
+
+ err = seg6_do_srh_inline(skb, srh);
+ break;
+ case BPF_LWT_ENCAP_SEG6:
+ skb_reset_inner_headers(skb);
+ skb->encapsulation = 1;
+ err = seg6_do_srh_encap(skb, srh, IPPROTO_IPV6);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ bpf_compute_data_pointers(skb);
+ if (err)
+ return err;
+
+ skb_set_transport_header(skb, sizeof(struct ipv6hdr));
+
+ return seg6_lookup_nexthop(skb, NULL, 0);
+}
+#endif /* CONFIG_IPV6_SEG6_BPF */
+
+#if IS_ENABLED(CONFIG_LWTUNNEL_BPF)
+static int bpf_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len,
+ bool ingress)
+{
+ return bpf_lwt_push_ip_encap(skb, hdr, len, ingress);
+}
+#endif
+
+BPF_CALL_4(bpf_lwt_in_push_encap, struct sk_buff *, skb, u32, type, void *, hdr,
+ u32, len)
+{
+ switch (type) {
+#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
+ case BPF_LWT_ENCAP_SEG6:
+ case BPF_LWT_ENCAP_SEG6_INLINE:
+ return bpf_push_seg6_encap(skb, type, hdr, len);
+#endif
+#if IS_ENABLED(CONFIG_LWTUNNEL_BPF)
+ case BPF_LWT_ENCAP_IP:
+ return bpf_push_ip_encap(skb, hdr, len, true /* ingress */);
+#endif
+ default:
+ return -EINVAL;
+ }
+}
+
+BPF_CALL_4(bpf_lwt_xmit_push_encap, struct sk_buff *, skb, u32, type,
+ void *, hdr, u32, len)
+{
+ switch (type) {
+#if IS_ENABLED(CONFIG_LWTUNNEL_BPF)
+ case BPF_LWT_ENCAP_IP:
+ return bpf_push_ip_encap(skb, hdr, len, false /* egress */);
+#endif
+ default:
+ return -EINVAL;
+ }
+}
+
+static const struct bpf_func_proto bpf_lwt_in_push_encap_proto = {
+ .func = bpf_lwt_in_push_encap,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg4_type = ARG_CONST_SIZE
+};
+
+static const struct bpf_func_proto bpf_lwt_xmit_push_encap_proto = {
+ .func = bpf_lwt_xmit_push_encap,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg4_type = ARG_CONST_SIZE
+};
+
+#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
+BPF_CALL_4(bpf_lwt_seg6_store_bytes, struct sk_buff *, skb, u32, offset,
+ const void *, from, u32, len)
+{
+ struct seg6_bpf_srh_state *srh_state =
+ this_cpu_ptr(&seg6_bpf_srh_states);
+ struct ipv6_sr_hdr *srh = srh_state->srh;
+ void *srh_tlvs, *srh_end, *ptr;
+ int srhoff = 0;
+
+ lockdep_assert_held(&srh_state->bh_lock);
+ if (srh == NULL)
+ return -EINVAL;
+
+ srh_tlvs = (void *)((char *)srh + ((srh->first_segment + 1) << 4));
+ srh_end = (void *)((char *)srh + sizeof(*srh) + srh_state->hdrlen);
+
+ ptr = skb->data + offset;
+ if (ptr >= srh_tlvs && ptr + len <= srh_end)
+ srh_state->valid = false;
+ else if (ptr < (void *)&srh->flags ||
+ ptr + len > (void *)&srh->segments)
+ return -EFAULT;
+
+ if (unlikely(bpf_try_make_writable(skb, offset + len)))
+ return -EFAULT;
+ if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
+ return -EINVAL;
+ srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
+
+ memcpy(skb->data + offset, from, len);
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_lwt_seg6_store_bytes_proto = {
+ .func = bpf_lwt_seg6_store_bytes,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg4_type = ARG_CONST_SIZE
+};
+
+static void bpf_update_srh_state(struct sk_buff *skb)
+{
+ struct seg6_bpf_srh_state *srh_state =
+ this_cpu_ptr(&seg6_bpf_srh_states);
+ int srhoff = 0;
+
+ if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) {
+ srh_state->srh = NULL;
+ } else {
+ srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
+ srh_state->hdrlen = srh_state->srh->hdrlen << 3;
+ srh_state->valid = true;
+ }
+}
+
+BPF_CALL_4(bpf_lwt_seg6_action, struct sk_buff *, skb,
+ u32, action, void *, param, u32, param_len)
+{
+ struct seg6_bpf_srh_state *srh_state =
+ this_cpu_ptr(&seg6_bpf_srh_states);
+ int hdroff = 0;
+ int err;
+
+ lockdep_assert_held(&srh_state->bh_lock);
+ switch (action) {
+ case SEG6_LOCAL_ACTION_END_X:
+ if (!seg6_bpf_has_valid_srh(skb))
+ return -EBADMSG;
+ if (param_len != sizeof(struct in6_addr))
+ return -EINVAL;
+ return seg6_lookup_nexthop(skb, (struct in6_addr *)param, 0);
+ case SEG6_LOCAL_ACTION_END_T:
+ if (!seg6_bpf_has_valid_srh(skb))
+ return -EBADMSG;
+ if (param_len != sizeof(int))
+ return -EINVAL;
+ return seg6_lookup_nexthop(skb, NULL, *(int *)param);
+ case SEG6_LOCAL_ACTION_END_DT6:
+ if (!seg6_bpf_has_valid_srh(skb))
+ return -EBADMSG;
+ if (param_len != sizeof(int))
+ return -EINVAL;
+
+ if (ipv6_find_hdr(skb, &hdroff, IPPROTO_IPV6, NULL, NULL) < 0)
+ return -EBADMSG;
+ if (!pskb_pull(skb, hdroff))
+ return -EBADMSG;
+
+ skb_postpull_rcsum(skb, skb_network_header(skb), hdroff);
+ skb_reset_network_header(skb);
+ skb_reset_transport_header(skb);
+ skb->encapsulation = 0;
+
+ bpf_compute_data_pointers(skb);
+ bpf_update_srh_state(skb);
+ return seg6_lookup_nexthop(skb, NULL, *(int *)param);
+ case SEG6_LOCAL_ACTION_END_B6:
+ if (srh_state->srh && !seg6_bpf_has_valid_srh(skb))
+ return -EBADMSG;
+ err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6_INLINE,
+ param, param_len);
+ if (!err)
+ bpf_update_srh_state(skb);
+
+ return err;
+ case SEG6_LOCAL_ACTION_END_B6_ENCAP:
+ if (srh_state->srh && !seg6_bpf_has_valid_srh(skb))
+ return -EBADMSG;
+ err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6,
+ param, param_len);
+ if (!err)
+ bpf_update_srh_state(skb);
+
+ return err;
+ default:
+ return -EINVAL;
+ }
+}
+
+static const struct bpf_func_proto bpf_lwt_seg6_action_proto = {
+ .func = bpf_lwt_seg6_action,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg4_type = ARG_CONST_SIZE
+};
+
+BPF_CALL_3(bpf_lwt_seg6_adjust_srh, struct sk_buff *, skb, u32, offset,
+ s32, len)
+{
+ struct seg6_bpf_srh_state *srh_state =
+ this_cpu_ptr(&seg6_bpf_srh_states);
+ struct ipv6_sr_hdr *srh = srh_state->srh;
+ void *srh_end, *srh_tlvs, *ptr;
+ struct ipv6hdr *hdr;
+ int srhoff = 0;
+ int ret;
+
+ lockdep_assert_held(&srh_state->bh_lock);
+ if (unlikely(srh == NULL))
+ return -EINVAL;
+
+ srh_tlvs = (void *)((unsigned char *)srh + sizeof(*srh) +
+ ((srh->first_segment + 1) << 4));
+ srh_end = (void *)((unsigned char *)srh + sizeof(*srh) +
+ srh_state->hdrlen);
+ ptr = skb->data + offset;
+
+ if (unlikely(ptr < srh_tlvs || ptr > srh_end))
+ return -EFAULT;
+ if (unlikely(len < 0 && (void *)((char *)ptr - len) > srh_end))
+ return -EFAULT;
+
+ if (len > 0) {
+ ret = skb_cow_head(skb, len);
+ if (unlikely(ret < 0))
+ return ret;
+
+ ret = bpf_skb_net_hdr_push(skb, offset, len);
+ } else {
+ ret = bpf_skb_net_hdr_pop(skb, offset, -1 * len);
+ }
+
+ bpf_compute_data_pointers(skb);
+ if (unlikely(ret < 0))
+ return ret;
+
+ hdr = (struct ipv6hdr *)skb->data;
+ hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
+
+ if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
+ return -EINVAL;
+ srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
+ srh_state->hdrlen += len;
+ srh_state->valid = false;
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = {
+ .func = bpf_lwt_seg6_adjust_srh,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+};
+#endif /* CONFIG_IPV6_SEG6_BPF */
+
+#ifdef CONFIG_INET
+static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
+ int dif, int sdif, u8 family, u8 proto)
+{
+ bool refcounted = false;
+ struct sock *sk = NULL;
+
+ if (family == AF_INET) {
+ __be32 src4 = tuple->ipv4.saddr;
+ __be32 dst4 = tuple->ipv4.daddr;
+
+ if (proto == IPPROTO_TCP)
+ sk = __inet_lookup(net, NULL, 0,
+ src4, tuple->ipv4.sport,
+ dst4, tuple->ipv4.dport,
+ dif, sdif, &refcounted);
+ else
+ sk = __udp4_lib_lookup(net, src4, tuple->ipv4.sport,
+ dst4, tuple->ipv4.dport,
+ dif, sdif, net->ipv4.udp_table, NULL);
+#if IS_ENABLED(CONFIG_IPV6)
+ } else {
+ struct in6_addr *src6 = (struct in6_addr *)&tuple->ipv6.saddr;
+ struct in6_addr *dst6 = (struct in6_addr *)&tuple->ipv6.daddr;
+
+ if (proto == IPPROTO_TCP)
+ sk = __inet6_lookup(net, NULL, 0,
+ src6, tuple->ipv6.sport,
+ dst6, ntohs(tuple->ipv6.dport),
+ dif, sdif, &refcounted);
+ else if (likely(ipv6_bpf_stub))
+ sk = ipv6_bpf_stub->udp6_lib_lookup(net,
+ src6, tuple->ipv6.sport,
+ dst6, tuple->ipv6.dport,
+ dif, sdif,
+ net->ipv4.udp_table, NULL);
+#endif
+ }
+
+ if (unlikely(sk && !refcounted && !sock_flag(sk, SOCK_RCU_FREE))) {
+ WARN_ONCE(1, "Found non-RCU, unreferenced socket!");
+ sk = NULL;
+ }
+ return sk;
+}
+
+/* bpf_skc_lookup performs the core lookup for different types of sockets,
+ * taking a reference on the socket if it doesn't have the flag SOCK_RCU_FREE.
+ */
+static struct sock *
+__bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
+ struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id,
+ u64 flags, int sdif)
+{
+ struct sock *sk = NULL;
+ struct net *net;
+ u8 family;
+
+ if (len == sizeof(tuple->ipv4))
+ family = AF_INET;
+ else if (len == sizeof(tuple->ipv6))
+ family = AF_INET6;
+ else
+ return NULL;
+
+ if (unlikely(flags || !((s32)netns_id < 0 || netns_id <= S32_MAX)))
+ goto out;
+
+ if (sdif < 0) {
+ if (family == AF_INET)
+ sdif = inet_sdif(skb);
+ else
+ sdif = inet6_sdif(skb);
+ }
+
+ if ((s32)netns_id < 0) {
+ net = caller_net;
+ sk = sk_lookup(net, tuple, ifindex, sdif, family, proto);
+ } else {
+ net = get_net_ns_by_id(caller_net, netns_id);
+ if (unlikely(!net))
+ goto out;
+ sk = sk_lookup(net, tuple, ifindex, sdif, family, proto);
+ put_net(net);
+ }
+
+out:
+ return sk;
+}
+
+static struct sock *
+__bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
+ struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id,
+ u64 flags, int sdif)
+{
+ struct sock *sk = __bpf_skc_lookup(skb, tuple, len, caller_net,
+ ifindex, proto, netns_id, flags,
+ sdif);
+
+ if (sk) {
+ struct sock *sk2 = sk_to_full_sk(sk);
+
+ /* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk
+ * sock refcnt is decremented to prevent a request_sock leak.
+ */
+ if (sk2 != sk) {
+ sock_gen_put(sk);
+ /* Ensure there is no need to bump sk2 refcnt */
+ if (unlikely(sk2 && !sock_flag(sk2, SOCK_RCU_FREE))) {
+ WARN_ONCE(1, "Found non-RCU, unreferenced socket!");
+ return NULL;
+ }
+ sk = sk2;
+ }
+ }
+
+ return sk;
+}
+
+static struct sock *
+bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
+ u8 proto, u64 netns_id, u64 flags)
+{
+ struct net *caller_net;
+ int ifindex;
+
+ if (skb->dev) {
+ caller_net = dev_net(skb->dev);
+ ifindex = skb->dev->ifindex;
+ } else {
+ caller_net = sock_net(skb->sk);
+ ifindex = 0;
+ }
+
+ return __bpf_skc_lookup(skb, tuple, len, caller_net, ifindex, proto,
+ netns_id, flags, -1);
+}
+
+static struct sock *
+bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
+ u8 proto, u64 netns_id, u64 flags)
+{
+ struct sock *sk = bpf_skc_lookup(skb, tuple, len, proto, netns_id,
+ flags);
+
+ if (sk) {
+ struct sock *sk2 = sk_to_full_sk(sk);
+
+ /* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk
+ * sock refcnt is decremented to prevent a request_sock leak.
+ */
+ if (sk2 != sk) {
+ sock_gen_put(sk);
+ /* Ensure there is no need to bump sk2 refcnt */
+ if (unlikely(sk2 && !sock_flag(sk2, SOCK_RCU_FREE))) {
+ WARN_ONCE(1, "Found non-RCU, unreferenced socket!");
+ return NULL;
+ }
+ sk = sk2;
+ }
+ }
+
+ return sk;
+}
+
+BPF_CALL_5(bpf_skc_lookup_tcp, struct sk_buff *, skb,
+ struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
+{
+ return (unsigned long)bpf_skc_lookup(skb, tuple, len, IPPROTO_TCP,
+ netns_id, flags);
+}
+
+static const struct bpf_func_proto bpf_skc_lookup_tcp_proto = {
+ .func = bpf_skc_lookup_tcp,
+ .gpl_only = false,
+ .pkt_access = true,
+ .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg4_type = ARG_ANYTHING,
+ .arg5_type = ARG_ANYTHING,
+};
+
+BPF_CALL_5(bpf_sk_lookup_tcp, struct sk_buff *, skb,
+ struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
+{
+ return (unsigned long)bpf_sk_lookup(skb, tuple, len, IPPROTO_TCP,
+ netns_id, flags);
+}
+
+static const struct bpf_func_proto bpf_sk_lookup_tcp_proto = {
+ .func = bpf_sk_lookup_tcp,
+ .gpl_only = false,
+ .pkt_access = true,
+ .ret_type = RET_PTR_TO_SOCKET_OR_NULL,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg4_type = ARG_ANYTHING,
+ .arg5_type = ARG_ANYTHING,
+};
+
+BPF_CALL_5(bpf_sk_lookup_udp, struct sk_buff *, skb,
+ struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
+{
+ return (unsigned long)bpf_sk_lookup(skb, tuple, len, IPPROTO_UDP,
+ netns_id, flags);
+}
+
+static const struct bpf_func_proto bpf_sk_lookup_udp_proto = {
+ .func = bpf_sk_lookup_udp,
+ .gpl_only = false,
+ .pkt_access = true,
+ .ret_type = RET_PTR_TO_SOCKET_OR_NULL,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg4_type = ARG_ANYTHING,
+ .arg5_type = ARG_ANYTHING,
+};
+
+BPF_CALL_5(bpf_tc_skc_lookup_tcp, struct sk_buff *, skb,
+ struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
+{
+ struct net_device *dev = skb->dev;
+ int ifindex = dev->ifindex, sdif = dev_sdif(dev);
+ struct net *caller_net = dev_net(dev);
+
+ return (unsigned long)__bpf_skc_lookup(skb, tuple, len, caller_net,
+ ifindex, IPPROTO_TCP, netns_id,
+ flags, sdif);
+}
+
+static const struct bpf_func_proto bpf_tc_skc_lookup_tcp_proto = {
+ .func = bpf_tc_skc_lookup_tcp,
+ .gpl_only = false,
+ .pkt_access = true,
+ .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg4_type = ARG_ANYTHING,
+ .arg5_type = ARG_ANYTHING,
+};
+
+BPF_CALL_5(bpf_tc_sk_lookup_tcp, struct sk_buff *, skb,
+ struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
+{
+ struct net_device *dev = skb->dev;
+ int ifindex = dev->ifindex, sdif = dev_sdif(dev);
+ struct net *caller_net = dev_net(dev);
+
+ return (unsigned long)__bpf_sk_lookup(skb, tuple, len, caller_net,
+ ifindex, IPPROTO_TCP, netns_id,
+ flags, sdif);
+}
+
+static const struct bpf_func_proto bpf_tc_sk_lookup_tcp_proto = {
+ .func = bpf_tc_sk_lookup_tcp,
+ .gpl_only = false,
+ .pkt_access = true,
+ .ret_type = RET_PTR_TO_SOCKET_OR_NULL,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg4_type = ARG_ANYTHING,
+ .arg5_type = ARG_ANYTHING,
+};
+
+BPF_CALL_5(bpf_tc_sk_lookup_udp, struct sk_buff *, skb,
+ struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
+{
+ struct net_device *dev = skb->dev;
+ int ifindex = dev->ifindex, sdif = dev_sdif(dev);
+ struct net *caller_net = dev_net(dev);
+
+ return (unsigned long)__bpf_sk_lookup(skb, tuple, len, caller_net,
+ ifindex, IPPROTO_UDP, netns_id,
+ flags, sdif);
+}
+
+static const struct bpf_func_proto bpf_tc_sk_lookup_udp_proto = {
+ .func = bpf_tc_sk_lookup_udp,
+ .gpl_only = false,
+ .pkt_access = true,
+ .ret_type = RET_PTR_TO_SOCKET_OR_NULL,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg4_type = ARG_ANYTHING,
+ .arg5_type = ARG_ANYTHING,
+};
+
+BPF_CALL_1(bpf_sk_release, struct sock *, sk)
+{
+ if (sk && sk_is_refcounted(sk))
+ sock_gen_put(sk);
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_sk_release_proto = {
+ .func = bpf_sk_release,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON | OBJ_RELEASE,
+};
+
+BPF_CALL_5(bpf_xdp_sk_lookup_udp, struct xdp_buff *, ctx,
+ struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags)
+{
+ struct net_device *dev = ctx->rxq->dev;
+ int ifindex = dev->ifindex, sdif = dev_sdif(dev);
+ struct net *caller_net = dev_net(dev);
+
+ return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, caller_net,
+ ifindex, IPPROTO_UDP, netns_id,
+ flags, sdif);
+}
+
+static const struct bpf_func_proto bpf_xdp_sk_lookup_udp_proto = {
+ .func = bpf_xdp_sk_lookup_udp,
+ .gpl_only = false,
+ .pkt_access = true,
+ .ret_type = RET_PTR_TO_SOCKET_OR_NULL,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg4_type = ARG_ANYTHING,
+ .arg5_type = ARG_ANYTHING,
+};
+
+BPF_CALL_5(bpf_xdp_skc_lookup_tcp, struct xdp_buff *, ctx,
+ struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags)
+{
+ struct net_device *dev = ctx->rxq->dev;
+ int ifindex = dev->ifindex, sdif = dev_sdif(dev);
+ struct net *caller_net = dev_net(dev);
+
+ return (unsigned long)__bpf_skc_lookup(NULL, tuple, len, caller_net,
+ ifindex, IPPROTO_TCP, netns_id,
+ flags, sdif);
+}
+
+static const struct bpf_func_proto bpf_xdp_skc_lookup_tcp_proto = {
+ .func = bpf_xdp_skc_lookup_tcp,
+ .gpl_only = false,
+ .pkt_access = true,
+ .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg4_type = ARG_ANYTHING,
+ .arg5_type = ARG_ANYTHING,
+};
+
+BPF_CALL_5(bpf_xdp_sk_lookup_tcp, struct xdp_buff *, ctx,
+ struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags)
+{
+ struct net_device *dev = ctx->rxq->dev;
+ int ifindex = dev->ifindex, sdif = dev_sdif(dev);
+ struct net *caller_net = dev_net(dev);
+
+ return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, caller_net,
+ ifindex, IPPROTO_TCP, netns_id,
+ flags, sdif);
+}
+
+static const struct bpf_func_proto bpf_xdp_sk_lookup_tcp_proto = {
+ .func = bpf_xdp_sk_lookup_tcp,
+ .gpl_only = false,
+ .pkt_access = true,
+ .ret_type = RET_PTR_TO_SOCKET_OR_NULL,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg4_type = ARG_ANYTHING,
+ .arg5_type = ARG_ANYTHING,
+};
+
+BPF_CALL_5(bpf_sock_addr_skc_lookup_tcp, struct bpf_sock_addr_kern *, ctx,
+ struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
+{
+ return (unsigned long)__bpf_skc_lookup(NULL, tuple, len,
+ sock_net(ctx->sk), 0,
+ IPPROTO_TCP, netns_id, flags,
+ -1);
+}
+
+static const struct bpf_func_proto bpf_sock_addr_skc_lookup_tcp_proto = {
+ .func = bpf_sock_addr_skc_lookup_tcp,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg4_type = ARG_ANYTHING,
+ .arg5_type = ARG_ANYTHING,
+};
+
+BPF_CALL_5(bpf_sock_addr_sk_lookup_tcp, struct bpf_sock_addr_kern *, ctx,
+ struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
+{
+ return (unsigned long)__bpf_sk_lookup(NULL, tuple, len,
+ sock_net(ctx->sk), 0, IPPROTO_TCP,
+ netns_id, flags, -1);
+}
+
+static const struct bpf_func_proto bpf_sock_addr_sk_lookup_tcp_proto = {
+ .func = bpf_sock_addr_sk_lookup_tcp,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_SOCKET_OR_NULL,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg4_type = ARG_ANYTHING,
+ .arg5_type = ARG_ANYTHING,
+};
+
+BPF_CALL_5(bpf_sock_addr_sk_lookup_udp, struct bpf_sock_addr_kern *, ctx,
+ struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
+{
+ return (unsigned long)__bpf_sk_lookup(NULL, tuple, len,
+ sock_net(ctx->sk), 0, IPPROTO_UDP,
+ netns_id, flags, -1);
+}
+
+static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = {
+ .func = bpf_sock_addr_sk_lookup_udp,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_SOCKET_OR_NULL,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg4_type = ARG_ANYTHING,
+ .arg5_type = ARG_ANYTHING,
+};
+
+bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type,
+ struct bpf_insn_access_aux *info)
+{
+ if (off < 0 || off >= offsetofend(struct bpf_tcp_sock,
+ icsk_retransmits))
+ return false;
+
+ if (off % size != 0)
+ return false;
+
+ switch (off) {
+ case offsetof(struct bpf_tcp_sock, bytes_received):
+ case offsetof(struct bpf_tcp_sock, bytes_acked):
+ return size == sizeof(__u64);
+ default:
+ return size == sizeof(__u32);
+ }
+}
+
+u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type,
+ const struct bpf_insn *si,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog, u32 *target_size)
+{
+ struct bpf_insn *insn = insn_buf;
+
+#define BPF_TCP_SOCK_GET_COMMON(FIELD) \
+ do { \
+ BUILD_BUG_ON(sizeof_field(struct tcp_sock, FIELD) > \
+ sizeof_field(struct bpf_tcp_sock, FIELD)); \
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct tcp_sock, FIELD),\
+ si->dst_reg, si->src_reg, \
+ offsetof(struct tcp_sock, FIELD)); \
+ } while (0)
+
+#define BPF_INET_SOCK_GET_COMMON(FIELD) \
+ do { \
+ BUILD_BUG_ON(sizeof_field(struct inet_connection_sock, \
+ FIELD) > \
+ sizeof_field(struct bpf_tcp_sock, FIELD)); \
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
+ struct inet_connection_sock, \
+ FIELD), \
+ si->dst_reg, si->src_reg, \
+ offsetof( \
+ struct inet_connection_sock, \
+ FIELD)); \
+ } while (0)
+
+ BTF_TYPE_EMIT(struct bpf_tcp_sock);
+
+ switch (si->off) {
+ case offsetof(struct bpf_tcp_sock, rtt_min):
+ BUILD_BUG_ON(sizeof_field(struct tcp_sock, rtt_min) !=
+ sizeof(struct minmax));
+ BUILD_BUG_ON(sizeof(struct minmax) <
+ sizeof(struct minmax_sample));
+
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+ offsetof(struct tcp_sock, rtt_min) +
+ offsetof(struct minmax_sample, v));
+ break;
+ case offsetof(struct bpf_tcp_sock, snd_cwnd):
+ BPF_TCP_SOCK_GET_COMMON(snd_cwnd);
+ break;
+ case offsetof(struct bpf_tcp_sock, srtt_us):
+ BPF_TCP_SOCK_GET_COMMON(srtt_us);
+ break;
+ case offsetof(struct bpf_tcp_sock, snd_ssthresh):
+ BPF_TCP_SOCK_GET_COMMON(snd_ssthresh);
+ break;
+ case offsetof(struct bpf_tcp_sock, rcv_nxt):
+ BPF_TCP_SOCK_GET_COMMON(rcv_nxt);
+ break;
+ case offsetof(struct bpf_tcp_sock, snd_nxt):
+ BPF_TCP_SOCK_GET_COMMON(snd_nxt);
+ break;
+ case offsetof(struct bpf_tcp_sock, snd_una):
+ BPF_TCP_SOCK_GET_COMMON(snd_una);
+ break;
+ case offsetof(struct bpf_tcp_sock, mss_cache):
+ BPF_TCP_SOCK_GET_COMMON(mss_cache);
+ break;
+ case offsetof(struct bpf_tcp_sock, ecn_flags):
+ BPF_TCP_SOCK_GET_COMMON(ecn_flags);
+ break;
+ case offsetof(struct bpf_tcp_sock, rate_delivered):
+ BPF_TCP_SOCK_GET_COMMON(rate_delivered);
+ break;
+ case offsetof(struct bpf_tcp_sock, rate_interval_us):
+ BPF_TCP_SOCK_GET_COMMON(rate_interval_us);
+ break;
+ case offsetof(struct bpf_tcp_sock, packets_out):
+ BPF_TCP_SOCK_GET_COMMON(packets_out);
+ break;
+ case offsetof(struct bpf_tcp_sock, retrans_out):
+ BPF_TCP_SOCK_GET_COMMON(retrans_out);
+ break;
+ case offsetof(struct bpf_tcp_sock, total_retrans):
+ BPF_TCP_SOCK_GET_COMMON(total_retrans);
+ break;
+ case offsetof(struct bpf_tcp_sock, segs_in):
+ BPF_TCP_SOCK_GET_COMMON(segs_in);
+ break;
+ case offsetof(struct bpf_tcp_sock, data_segs_in):
+ BPF_TCP_SOCK_GET_COMMON(data_segs_in);
+ break;
+ case offsetof(struct bpf_tcp_sock, segs_out):
+ BPF_TCP_SOCK_GET_COMMON(segs_out);
+ break;
+ case offsetof(struct bpf_tcp_sock, data_segs_out):
+ BPF_TCP_SOCK_GET_COMMON(data_segs_out);
+ break;
+ case offsetof(struct bpf_tcp_sock, lost_out):
+ BPF_TCP_SOCK_GET_COMMON(lost_out);
+ break;
+ case offsetof(struct bpf_tcp_sock, sacked_out):
+ BPF_TCP_SOCK_GET_COMMON(sacked_out);
+ break;
+ case offsetof(struct bpf_tcp_sock, bytes_received):
+ BPF_TCP_SOCK_GET_COMMON(bytes_received);
+ break;
+ case offsetof(struct bpf_tcp_sock, bytes_acked):
+ BPF_TCP_SOCK_GET_COMMON(bytes_acked);
+ break;
+ case offsetof(struct bpf_tcp_sock, dsack_dups):
+ BPF_TCP_SOCK_GET_COMMON(dsack_dups);
+ break;
+ case offsetof(struct bpf_tcp_sock, delivered):
+ BPF_TCP_SOCK_GET_COMMON(delivered);
+ break;
+ case offsetof(struct bpf_tcp_sock, delivered_ce):
+ BPF_TCP_SOCK_GET_COMMON(delivered_ce);
+ break;
+ case offsetof(struct bpf_tcp_sock, icsk_retransmits):
+ BPF_INET_SOCK_GET_COMMON(icsk_retransmits);
+ break;
+ }
+
+ return insn - insn_buf;
+}
+
+BPF_CALL_1(bpf_tcp_sock, struct sock *, sk)
+{
+ if (sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP)
+ return (unsigned long)sk;
+
+ return (unsigned long)NULL;
+}
+
+const struct bpf_func_proto bpf_tcp_sock_proto = {
+ .func = bpf_tcp_sock,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_TCP_SOCK_OR_NULL,
+ .arg1_type = ARG_PTR_TO_SOCK_COMMON,
+};
+
+BPF_CALL_1(bpf_get_listener_sock, struct sock *, sk)
+{
+ sk = sk_to_full_sk(sk);
+
+ if (sk && sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_RCU_FREE))
+ return (unsigned long)sk;
+
+ return (unsigned long)NULL;
+}
+
+static const struct bpf_func_proto bpf_get_listener_sock_proto = {
+ .func = bpf_get_listener_sock,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_SOCKET_OR_NULL,
+ .arg1_type = ARG_PTR_TO_SOCK_COMMON,
+};
+
+BPF_CALL_1(bpf_skb_ecn_set_ce, struct sk_buff *, skb)
+{
+ unsigned int iphdr_len;
+
+ switch (skb_protocol(skb, true)) {
+ case cpu_to_be16(ETH_P_IP):
+ iphdr_len = sizeof(struct iphdr);
+ break;
+ case cpu_to_be16(ETH_P_IPV6):
+ iphdr_len = sizeof(struct ipv6hdr);
+ break;
+ default:
+ return 0;
+ }
+
+ if (skb_headlen(skb) < iphdr_len)
+ return 0;
+
+ if (skb_cloned(skb) && !skb_clone_writable(skb, iphdr_len))
+ return 0;
+
+ return INET_ECN_set_ce(skb);
+}
+
+bool bpf_xdp_sock_is_valid_access(int off, int size, enum bpf_access_type type,
+ struct bpf_insn_access_aux *info)
+{
+ if (off < 0 || off >= offsetofend(struct bpf_xdp_sock, queue_id))
+ return false;
+
+ if (off % size != 0)
+ return false;
+
+ switch (off) {
+ default:
+ return size == sizeof(__u32);
+ }
+}
+
+u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type,
+ const struct bpf_insn *si,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog, u32 *target_size)
+{
+ struct bpf_insn *insn = insn_buf;
+
+#define BPF_XDP_SOCK_GET(FIELD) \
+ do { \
+ BUILD_BUG_ON(sizeof_field(struct xdp_sock, FIELD) > \
+ sizeof_field(struct bpf_xdp_sock, FIELD)); \
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_sock, FIELD),\
+ si->dst_reg, si->src_reg, \
+ offsetof(struct xdp_sock, FIELD)); \
+ } while (0)
+
+ BTF_TYPE_EMIT(struct bpf_xdp_sock);
+
+ switch (si->off) {
+ case offsetof(struct bpf_xdp_sock, queue_id):
+ BPF_XDP_SOCK_GET(queue_id);
+ break;
+ }
+
+ return insn - insn_buf;
+}
+
+static const struct bpf_func_proto bpf_skb_ecn_set_ce_proto = {
+ .func = bpf_skb_ecn_set_ce,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
+BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph, u32, iph_len,
+ struct tcphdr *, th, u32, th_len)
+{
+#ifdef CONFIG_SYN_COOKIES
+ int ret;
+
+ if (unlikely(!sk || th_len < sizeof(*th)))
+ return -EINVAL;
+
+ /* sk_listener() allows TCP_NEW_SYN_RECV, which makes no sense here. */
+ if (sk->sk_protocol != IPPROTO_TCP || sk->sk_state != TCP_LISTEN)
+ return -EINVAL;
+
+ if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies))
+ return -EINVAL;
+
+ if (!th->ack || th->rst || th->syn)
+ return -ENOENT;
+
+ if (unlikely(iph_len < sizeof(struct iphdr)))
+ return -EINVAL;
+
+ if (tcp_synq_no_recent_overflow(sk))
+ return -ENOENT;
+
+ /* Both struct iphdr and struct ipv6hdr have the version field at the
+ * same offset so we can cast to the shorter header (struct iphdr).
+ */
+ switch (((struct iphdr *)iph)->version) {
+ case 4:
+ if (sk->sk_family == AF_INET6 && ipv6_only_sock(sk))
+ return -EINVAL;
+
+ ret = __cookie_v4_check((struct iphdr *)iph, th);
+ break;
+
+#if IS_BUILTIN(CONFIG_IPV6)
+ case 6:
+ if (unlikely(iph_len < sizeof(struct ipv6hdr)))
+ return -EINVAL;
+
+ if (sk->sk_family != AF_INET6)
+ return -EINVAL;
+
+ ret = __cookie_v6_check((struct ipv6hdr *)iph, th);
+ break;
+#endif /* CONFIG_IPV6 */
+
+ default:
+ return -EPROTONOSUPPORT;
+ }
+
+ if (ret > 0)
+ return 0;
+
+ return -ENOENT;
+#else
+ return -ENOTSUPP;
+#endif
+}
+
+static const struct bpf_func_proto bpf_tcp_check_syncookie_proto = {
+ .func = bpf_tcp_check_syncookie,
+ .gpl_only = true,
+ .pkt_access = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg3_type = ARG_CONST_SIZE,
+ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg5_type = ARG_CONST_SIZE,
+};
+
+BPF_CALL_5(bpf_tcp_gen_syncookie, struct sock *, sk, void *, iph, u32, iph_len,
+ struct tcphdr *, th, u32, th_len)
+{
+#ifdef CONFIG_SYN_COOKIES
+ u32 cookie;
+ u16 mss;
+
+ if (unlikely(!sk || th_len < sizeof(*th) || th_len != th->doff * 4))
+ return -EINVAL;
+
+ if (sk->sk_protocol != IPPROTO_TCP || sk->sk_state != TCP_LISTEN)
+ return -EINVAL;
+
+ if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies))
+ return -ENOENT;
+
+ if (!th->syn || th->ack || th->fin || th->rst)
+ return -EINVAL;
+
+ if (unlikely(iph_len < sizeof(struct iphdr)))
+ return -EINVAL;
+
+ /* Both struct iphdr and struct ipv6hdr have the version field at the
+ * same offset so we can cast to the shorter header (struct iphdr).
+ */
+ switch (((struct iphdr *)iph)->version) {
+ case 4:
+ if (sk->sk_family == AF_INET6 && ipv6_only_sock(sk))
+ return -EINVAL;
+
+ mss = tcp_v4_get_syncookie(sk, iph, th, &cookie);
+ break;
+
+#if IS_BUILTIN(CONFIG_IPV6)
+ case 6:
+ if (unlikely(iph_len < sizeof(struct ipv6hdr)))
+ return -EINVAL;
+
+ if (sk->sk_family != AF_INET6)
+ return -EINVAL;
+
+ mss = tcp_v6_get_syncookie(sk, iph, th, &cookie);
+ break;
+#endif /* CONFIG_IPV6 */
+
+ default:
+ return -EPROTONOSUPPORT;
+ }
+ if (mss == 0)
+ return -ENOENT;
+
+ return cookie | ((u64)mss << 32);
+#else
+ return -EOPNOTSUPP;
+#endif /* CONFIG_SYN_COOKIES */
+}
+
+static const struct bpf_func_proto bpf_tcp_gen_syncookie_proto = {
+ .func = bpf_tcp_gen_syncookie,
+ .gpl_only = true, /* __cookie_v*_init_sequence() is GPL */
+ .pkt_access = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg3_type = ARG_CONST_SIZE,
+ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg5_type = ARG_CONST_SIZE,
+};
+
+BPF_CALL_3(bpf_sk_assign, struct sk_buff *, skb, struct sock *, sk, u64, flags)
+{
+ if (!sk || flags != 0)
+ return -EINVAL;
+ if (!skb_at_tc_ingress(skb))
+ return -EOPNOTSUPP;
+ if (unlikely(dev_net(skb->dev) != sock_net(sk)))
+ return -ENETUNREACH;
+ if (sk_unhashed(sk))
+ return -EOPNOTSUPP;
+ if (sk_is_refcounted(sk) &&
+ unlikely(!refcount_inc_not_zero(&sk->sk_refcnt)))
+ return -ENOENT;
+
+ skb_orphan(skb);
+ skb->sk = sk;
+ skb->destructor = sock_pfree;
+
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_sk_assign_proto = {
+ .func = bpf_sk_assign,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+ .arg3_type = ARG_ANYTHING,
+};
+
+static const u8 *bpf_search_tcp_opt(const u8 *op, const u8 *opend,
+ u8 search_kind, const u8 *magic,
+ u8 magic_len, bool *eol)
+{
+ u8 kind, kind_len;
+
+ *eol = false;
+
+ while (op < opend) {
+ kind = op[0];
+
+ if (kind == TCPOPT_EOL) {
+ *eol = true;
+ return ERR_PTR(-ENOMSG);
+ } else if (kind == TCPOPT_NOP) {
+ op++;
+ continue;
+ }
+
+ if (opend - op < 2 || opend - op < op[1] || op[1] < 2)
+ /* Something is wrong in the received header.
+ * Follow the TCP stack's tcp_parse_options()
+ * and just bail here.
+ */
+ return ERR_PTR(-EFAULT);
+
+ kind_len = op[1];
+ if (search_kind == kind) {
+ if (!magic_len)
+ return op;
+
+ if (magic_len > kind_len - 2)
+ return ERR_PTR(-ENOMSG);
+
+ if (!memcmp(&op[2], magic, magic_len))
+ return op;
+ }
+
+ op += kind_len;
+ }
+
+ return ERR_PTR(-ENOMSG);
+}
+
+BPF_CALL_4(bpf_sock_ops_load_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock,
+ void *, search_res, u32, len, u64, flags)
+{
+ bool eol, load_syn = flags & BPF_LOAD_HDR_OPT_TCP_SYN;
+ const u8 *op, *opend, *magic, *search = search_res;
+ u8 search_kind, search_len, copy_len, magic_len;
+ int ret;
+
+ if (!is_locked_tcp_sock_ops(bpf_sock))
+ return -EOPNOTSUPP;
+
+ /* 2 byte is the minimal option len except TCPOPT_NOP and
+ * TCPOPT_EOL which are useless for the bpf prog to learn
+ * and this helper disallow loading them also.
+ */
+ if (len < 2 || flags & ~BPF_LOAD_HDR_OPT_TCP_SYN)
+ return -EINVAL;
+
+ search_kind = search[0];
+ search_len = search[1];
+
+ if (search_len > len || search_kind == TCPOPT_NOP ||
+ search_kind == TCPOPT_EOL)
+ return -EINVAL;
+
+ if (search_kind == TCPOPT_EXP || search_kind == 253) {
+ /* 16 or 32 bit magic. +2 for kind and kind length */
+ if (search_len != 4 && search_len != 6)
+ return -EINVAL;
+ magic = &search[2];
+ magic_len = search_len - 2;
+ } else {
+ if (search_len)
+ return -EINVAL;
+ magic = NULL;
+ magic_len = 0;
+ }
+
+ if (load_syn) {
+ ret = bpf_sock_ops_get_syn(bpf_sock, TCP_BPF_SYN, &op);
+ if (ret < 0)
+ return ret;
+
+ opend = op + ret;
+ op += sizeof(struct tcphdr);
+ } else {
+ if (!bpf_sock->skb ||
+ bpf_sock->op == BPF_SOCK_OPS_HDR_OPT_LEN_CB)
+ /* This bpf_sock->op cannot call this helper */
+ return -EPERM;
+
+ opend = bpf_sock->skb_data_end;
+ op = bpf_sock->skb->data + sizeof(struct tcphdr);
+ }
+
+ op = bpf_search_tcp_opt(op, opend, search_kind, magic, magic_len,
+ &eol);
+ if (IS_ERR(op))
+ return PTR_ERR(op);
+
+ copy_len = op[1];
+ ret = copy_len;
+ if (copy_len > len) {
+ ret = -ENOSPC;
+ copy_len = len;
+ }
+
+ memcpy(search_res, op, copy_len);
+ return ret;
+}
+
+static const struct bpf_func_proto bpf_sock_ops_load_hdr_opt_proto = {
+ .func = bpf_sock_ops_load_hdr_opt,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_WRITE,
+ .arg3_type = ARG_CONST_SIZE,
+ .arg4_type = ARG_ANYTHING,
+};
+
+BPF_CALL_4(bpf_sock_ops_store_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock,
+ const void *, from, u32, len, u64, flags)
+{
+ u8 new_kind, new_kind_len, magic_len = 0, *opend;
+ const u8 *op, *new_op, *magic = NULL;
+ struct sk_buff *skb;
+ bool eol;
+
+ if (bpf_sock->op != BPF_SOCK_OPS_WRITE_HDR_OPT_CB)
+ return -EPERM;
+
+ if (len < 2 || flags)
+ return -EINVAL;
+
+ new_op = from;
+ new_kind = new_op[0];
+ new_kind_len = new_op[1];
+
+ if (new_kind_len > len || new_kind == TCPOPT_NOP ||
+ new_kind == TCPOPT_EOL)
+ return -EINVAL;
+
+ if (new_kind_len > bpf_sock->remaining_opt_len)
+ return -ENOSPC;
+
+ /* 253 is another experimental kind */
+ if (new_kind == TCPOPT_EXP || new_kind == 253) {
+ if (new_kind_len < 4)
+ return -EINVAL;
+ /* Match for the 2 byte magic also.
+ * RFC 6994: the magic could be 2 or 4 bytes.
+ * Hence, matching by 2 byte only is on the
+ * conservative side but it is the right
+ * thing to do for the 'search-for-duplication'
+ * purpose.
+ */
+ magic = &new_op[2];
+ magic_len = 2;
+ }
+
+ /* Check for duplication */
+ skb = bpf_sock->skb;
+ op = skb->data + sizeof(struct tcphdr);
+ opend = bpf_sock->skb_data_end;
+
+ op = bpf_search_tcp_opt(op, opend, new_kind, magic, magic_len,
+ &eol);
+ if (!IS_ERR(op))
+ return -EEXIST;
+
+ if (PTR_ERR(op) != -ENOMSG)
+ return PTR_ERR(op);
+
+ if (eol)
+ /* The option has been ended. Treat it as no more
+ * header option can be written.
+ */
+ return -ENOSPC;
+
+ /* No duplication found. Store the header option. */
+ memcpy(opend, from, new_kind_len);
+
+ bpf_sock->remaining_opt_len -= new_kind_len;
+ bpf_sock->skb_data_end += new_kind_len;
+
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_sock_ops_store_hdr_opt_proto = {
+ .func = bpf_sock_ops_store_hdr_opt,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg3_type = ARG_CONST_SIZE,
+ .arg4_type = ARG_ANYTHING,
+};
+
+BPF_CALL_3(bpf_sock_ops_reserve_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock,
+ u32, len, u64, flags)
+{
+ if (bpf_sock->op != BPF_SOCK_OPS_HDR_OPT_LEN_CB)
+ return -EPERM;
+
+ if (flags || len < 2)
+ return -EINVAL;
+
+ if (len > bpf_sock->remaining_opt_len)
+ return -ENOSPC;
+
+ bpf_sock->remaining_opt_len -= len;
+
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_sock_ops_reserve_hdr_opt_proto = {
+ .func = bpf_sock_ops_reserve_hdr_opt,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+};
+
+BPF_CALL_3(bpf_skb_set_tstamp, struct sk_buff *, skb,
+ u64, tstamp, u32, tstamp_type)
+{
+ /* skb_clear_delivery_time() is done for inet protocol */
+ if (skb->protocol != htons(ETH_P_IP) &&
+ skb->protocol != htons(ETH_P_IPV6))
+ return -EOPNOTSUPP;
+
+ switch (tstamp_type) {
+ case BPF_SKB_CLOCK_REALTIME:
+ skb->tstamp = tstamp;
+ skb->tstamp_type = SKB_CLOCK_REALTIME;
+ break;
+ case BPF_SKB_CLOCK_MONOTONIC:
+ if (!tstamp)
+ return -EINVAL;
+ skb->tstamp = tstamp;
+ skb->tstamp_type = SKB_CLOCK_MONOTONIC;
+ break;
+ case BPF_SKB_CLOCK_TAI:
+ if (!tstamp)
+ return -EINVAL;
+ skb->tstamp = tstamp;
+ skb->tstamp_type = SKB_CLOCK_TAI;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_skb_set_tstamp_proto = {
+ .func = bpf_skb_set_tstamp,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+};
+
+#ifdef CONFIG_SYN_COOKIES
+BPF_CALL_3(bpf_tcp_raw_gen_syncookie_ipv4, struct iphdr *, iph,
+ struct tcphdr *, th, u32, th_len)
+{
+ u32 cookie;
+ u16 mss;
+
+ if (unlikely(th_len < sizeof(*th) || th_len != th->doff * 4))
+ return -EINVAL;
+
+ mss = tcp_parse_mss_option(th, 0) ?: TCP_MSS_DEFAULT;
+ cookie = __cookie_v4_init_sequence(iph, th, &mss);
+
+ return cookie | ((u64)mss << 32);
+}
+
+static const struct bpf_func_proto bpf_tcp_raw_gen_syncookie_ipv4_proto = {
+ .func = bpf_tcp_raw_gen_syncookie_ipv4,
+ .gpl_only = true, /* __cookie_v4_init_sequence() is GPL */
+ .pkt_access = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM,
+ .arg1_size = sizeof(struct iphdr),
+ .arg2_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+};
+
+BPF_CALL_3(bpf_tcp_raw_gen_syncookie_ipv6, struct ipv6hdr *, iph,
+ struct tcphdr *, th, u32, th_len)
+{
+#if IS_BUILTIN(CONFIG_IPV6)
+ const u16 mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) -
+ sizeof(struct ipv6hdr);
+ u32 cookie;
+ u16 mss;
+
+ if (unlikely(th_len < sizeof(*th) || th_len != th->doff * 4))
+ return -EINVAL;
+
+ mss = tcp_parse_mss_option(th, 0) ?: mss_clamp;
+ cookie = __cookie_v6_init_sequence(iph, th, &mss);
+
+ return cookie | ((u64)mss << 32);
+#else
+ return -EPROTONOSUPPORT;
+#endif
+}
+
+static const struct bpf_func_proto bpf_tcp_raw_gen_syncookie_ipv6_proto = {
+ .func = bpf_tcp_raw_gen_syncookie_ipv6,
+ .gpl_only = true, /* __cookie_v6_init_sequence() is GPL */
+ .pkt_access = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM,
+ .arg1_size = sizeof(struct ipv6hdr),
+ .arg2_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+};
+
+BPF_CALL_2(bpf_tcp_raw_check_syncookie_ipv4, struct iphdr *, iph,
+ struct tcphdr *, th)
+{
+ if (__cookie_v4_check(iph, th) > 0)
+ return 0;
+
+ return -EACCES;
+}
+
+static const struct bpf_func_proto bpf_tcp_raw_check_syncookie_ipv4_proto = {
+ .func = bpf_tcp_raw_check_syncookie_ipv4,
+ .gpl_only = true, /* __cookie_v4_check is GPL */
+ .pkt_access = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM,
+ .arg1_size = sizeof(struct iphdr),
+ .arg2_type = ARG_PTR_TO_FIXED_SIZE_MEM,
+ .arg2_size = sizeof(struct tcphdr),
+};
+
+BPF_CALL_2(bpf_tcp_raw_check_syncookie_ipv6, struct ipv6hdr *, iph,
+ struct tcphdr *, th)
+{
+#if IS_BUILTIN(CONFIG_IPV6)
+ if (__cookie_v6_check(iph, th) > 0)
+ return 0;
+
+ return -EACCES;
+#else
+ return -EPROTONOSUPPORT;
+#endif
+}
+
+static const struct bpf_func_proto bpf_tcp_raw_check_syncookie_ipv6_proto = {
+ .func = bpf_tcp_raw_check_syncookie_ipv6,
+ .gpl_only = true, /* __cookie_v6_check is GPL */
+ .pkt_access = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM,
+ .arg1_size = sizeof(struct ipv6hdr),
+ .arg2_type = ARG_PTR_TO_FIXED_SIZE_MEM,
+ .arg2_size = sizeof(struct tcphdr),
+};
+#endif /* CONFIG_SYN_COOKIES */
+
+#endif /* CONFIG_INET */
+
+bool bpf_helper_changes_pkt_data(enum bpf_func_id func_id)
+{
+ switch (func_id) {
+ case BPF_FUNC_clone_redirect:
+ case BPF_FUNC_l3_csum_replace:
+ case BPF_FUNC_l4_csum_replace:
+ case BPF_FUNC_lwt_push_encap:
+ case BPF_FUNC_lwt_seg6_action:
+ case BPF_FUNC_lwt_seg6_adjust_srh:
+ case BPF_FUNC_lwt_seg6_store_bytes:
+ case BPF_FUNC_msg_pop_data:
+ case BPF_FUNC_msg_pull_data:
+ case BPF_FUNC_msg_push_data:
+ case BPF_FUNC_skb_adjust_room:
+ case BPF_FUNC_skb_change_head:
+ case BPF_FUNC_skb_change_proto:
+ case BPF_FUNC_skb_change_tail:
+ case BPF_FUNC_skb_pull_data:
+ case BPF_FUNC_skb_store_bytes:
+ case BPF_FUNC_skb_vlan_pop:
+ case BPF_FUNC_skb_vlan_push:
+ case BPF_FUNC_store_hdr_opt:
+ case BPF_FUNC_xdp_adjust_head:
+ case BPF_FUNC_xdp_adjust_meta:
+ case BPF_FUNC_xdp_adjust_tail:
+ /* tail-called program could call any of the above */
+ case BPF_FUNC_tail_call:
+ return true;
+ default:
+ return false;
+ }
+}
+
+const struct bpf_func_proto bpf_event_output_data_proto __weak;
+const struct bpf_func_proto bpf_sk_storage_get_cg_sock_proto __weak;
+
+static const struct bpf_func_proto *
+sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ const struct bpf_func_proto *func_proto;
+
+ func_proto = cgroup_common_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
+ switch (func_id) {
+ case BPF_FUNC_get_socket_cookie:
+ return &bpf_get_socket_cookie_sock_proto;
+ case BPF_FUNC_get_netns_cookie:
+ return &bpf_get_netns_cookie_sock_proto;
+ case BPF_FUNC_perf_event_output:
+ return &bpf_event_output_data_proto;
+ case BPF_FUNC_sk_storage_get:
+ return &bpf_sk_storage_get_cg_sock_proto;
+ case BPF_FUNC_ktime_get_coarse_ns:
+ return &bpf_ktime_get_coarse_ns_proto;
+ case BPF_FUNC_setsockopt:
+ switch (prog->expected_attach_type) {
+ case BPF_CGROUP_INET_SOCK_CREATE:
+ return &bpf_sock_create_setsockopt_proto;
+ default:
+ return NULL;
+ }
+ case BPF_FUNC_getsockopt:
+ switch (prog->expected_attach_type) {
+ case BPF_CGROUP_INET_SOCK_CREATE:
+ return &bpf_sock_create_getsockopt_proto;
+ default:
+ return NULL;
+ }
+ default:
+ return bpf_base_func_proto(func_id, prog);
+ }
+}
+
+static const struct bpf_func_proto *
+sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ const struct bpf_func_proto *func_proto;
+
+ func_proto = cgroup_common_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
+ switch (func_id) {
+ case BPF_FUNC_bind:
+ switch (prog->expected_attach_type) {
+ case BPF_CGROUP_INET4_CONNECT:
+ case BPF_CGROUP_INET6_CONNECT:
+ return &bpf_bind_proto;
+ default:
+ return NULL;
+ }
+ case BPF_FUNC_get_socket_cookie:
+ return &bpf_get_socket_cookie_sock_addr_proto;
+ case BPF_FUNC_get_netns_cookie:
+ return &bpf_get_netns_cookie_sock_addr_proto;
+ case BPF_FUNC_perf_event_output:
+ return &bpf_event_output_data_proto;
+#ifdef CONFIG_INET
+ case BPF_FUNC_sk_lookup_tcp:
+ return &bpf_sock_addr_sk_lookup_tcp_proto;
+ case BPF_FUNC_sk_lookup_udp:
+ return &bpf_sock_addr_sk_lookup_udp_proto;
+ case BPF_FUNC_sk_release:
+ return &bpf_sk_release_proto;
+ case BPF_FUNC_skc_lookup_tcp:
+ return &bpf_sock_addr_skc_lookup_tcp_proto;
+#endif /* CONFIG_INET */
+ case BPF_FUNC_sk_storage_get:
+ return &bpf_sk_storage_get_proto;
+ case BPF_FUNC_sk_storage_delete:
+ return &bpf_sk_storage_delete_proto;
+ case BPF_FUNC_setsockopt:
+ switch (prog->expected_attach_type) {
+ case BPF_CGROUP_INET4_BIND:
+ case BPF_CGROUP_INET6_BIND:
+ case BPF_CGROUP_INET4_CONNECT:
+ case BPF_CGROUP_INET6_CONNECT:
+ case BPF_CGROUP_UNIX_CONNECT:
+ case BPF_CGROUP_UDP4_RECVMSG:
+ case BPF_CGROUP_UDP6_RECVMSG:
+ case BPF_CGROUP_UNIX_RECVMSG:
+ case BPF_CGROUP_UDP4_SENDMSG:
+ case BPF_CGROUP_UDP6_SENDMSG:
+ case BPF_CGROUP_UNIX_SENDMSG:
+ case BPF_CGROUP_INET4_GETPEERNAME:
+ case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_UNIX_GETPEERNAME:
+ case BPF_CGROUP_INET4_GETSOCKNAME:
+ case BPF_CGROUP_INET6_GETSOCKNAME:
+ case BPF_CGROUP_UNIX_GETSOCKNAME:
+ return &bpf_sock_addr_setsockopt_proto;
+ default:
+ return NULL;
+ }
+ case BPF_FUNC_getsockopt:
+ switch (prog->expected_attach_type) {
+ case BPF_CGROUP_INET4_BIND:
+ case BPF_CGROUP_INET6_BIND:
+ case BPF_CGROUP_INET4_CONNECT:
+ case BPF_CGROUP_INET6_CONNECT:
+ case BPF_CGROUP_UNIX_CONNECT:
+ case BPF_CGROUP_UDP4_RECVMSG:
+ case BPF_CGROUP_UDP6_RECVMSG:
+ case BPF_CGROUP_UNIX_RECVMSG:
+ case BPF_CGROUP_UDP4_SENDMSG:
+ case BPF_CGROUP_UDP6_SENDMSG:
+ case BPF_CGROUP_UNIX_SENDMSG:
+ case BPF_CGROUP_INET4_GETPEERNAME:
+ case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_UNIX_GETPEERNAME:
+ case BPF_CGROUP_INET4_GETSOCKNAME:
+ case BPF_CGROUP_INET6_GETSOCKNAME:
+ case BPF_CGROUP_UNIX_GETSOCKNAME:
+ return &bpf_sock_addr_getsockopt_proto;
+ default:
+ return NULL;
+ }
+ default:
+ return bpf_sk_base_func_proto(func_id, prog);
+ }
+}
+
+static const struct bpf_func_proto *
+sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ case BPF_FUNC_skb_load_bytes:
+ return &bpf_skb_load_bytes_proto;
+ case BPF_FUNC_skb_load_bytes_relative:
+ return &bpf_skb_load_bytes_relative_proto;
+ case BPF_FUNC_get_socket_cookie:
+ return &bpf_get_socket_cookie_proto;
+ case BPF_FUNC_get_netns_cookie:
+ return &bpf_get_netns_cookie_proto;
+ case BPF_FUNC_get_socket_uid:
+ return &bpf_get_socket_uid_proto;
+ case BPF_FUNC_perf_event_output:
+ return &bpf_skb_event_output_proto;
+ default:
+ return bpf_sk_base_func_proto(func_id, prog);
+ }
+}
+
+const struct bpf_func_proto bpf_sk_storage_get_proto __weak;
+const struct bpf_func_proto bpf_sk_storage_delete_proto __weak;
+
+static const struct bpf_func_proto *
+cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ const struct bpf_func_proto *func_proto;
+
+ func_proto = cgroup_common_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
+ switch (func_id) {
+ case BPF_FUNC_sk_fullsock:
+ return &bpf_sk_fullsock_proto;
+ case BPF_FUNC_sk_storage_get:
+ return &bpf_sk_storage_get_proto;
+ case BPF_FUNC_sk_storage_delete:
+ return &bpf_sk_storage_delete_proto;
+ case BPF_FUNC_perf_event_output:
+ return &bpf_skb_event_output_proto;
+#ifdef CONFIG_SOCK_CGROUP_DATA
+ case BPF_FUNC_skb_cgroup_id:
+ return &bpf_skb_cgroup_id_proto;
+ case BPF_FUNC_skb_ancestor_cgroup_id:
+ return &bpf_skb_ancestor_cgroup_id_proto;
+ case BPF_FUNC_sk_cgroup_id:
+ return &bpf_sk_cgroup_id_proto;
+ case BPF_FUNC_sk_ancestor_cgroup_id:
+ return &bpf_sk_ancestor_cgroup_id_proto;
+#endif
+#ifdef CONFIG_INET
+ case BPF_FUNC_sk_lookup_tcp:
+ return &bpf_sk_lookup_tcp_proto;
+ case BPF_FUNC_sk_lookup_udp:
+ return &bpf_sk_lookup_udp_proto;
+ case BPF_FUNC_sk_release:
+ return &bpf_sk_release_proto;
+ case BPF_FUNC_skc_lookup_tcp:
+ return &bpf_skc_lookup_tcp_proto;
+ case BPF_FUNC_tcp_sock:
+ return &bpf_tcp_sock_proto;
+ case BPF_FUNC_get_listener_sock:
+ return &bpf_get_listener_sock_proto;
+ case BPF_FUNC_skb_ecn_set_ce:
+ return &bpf_skb_ecn_set_ce_proto;
+#endif
+ default:
+ return sk_filter_func_proto(func_id, prog);
+ }
+}
+
+static const struct bpf_func_proto *
+tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ case BPF_FUNC_skb_store_bytes:
+ return &bpf_skb_store_bytes_proto;
+ case BPF_FUNC_skb_load_bytes:
+ return &bpf_skb_load_bytes_proto;
+ case BPF_FUNC_skb_load_bytes_relative:
+ return &bpf_skb_load_bytes_relative_proto;
+ case BPF_FUNC_skb_pull_data:
+ return &bpf_skb_pull_data_proto;
+ case BPF_FUNC_csum_diff:
+ return &bpf_csum_diff_proto;
+ case BPF_FUNC_csum_update:
+ return &bpf_csum_update_proto;
+ case BPF_FUNC_csum_level:
+ return &bpf_csum_level_proto;
+ case BPF_FUNC_l3_csum_replace:
+ return &bpf_l3_csum_replace_proto;
+ case BPF_FUNC_l4_csum_replace:
+ return &bpf_l4_csum_replace_proto;
+ case BPF_FUNC_clone_redirect:
+ return &bpf_clone_redirect_proto;
+ case BPF_FUNC_get_cgroup_classid:
+ return &bpf_get_cgroup_classid_proto;
+ case BPF_FUNC_skb_vlan_push:
+ return &bpf_skb_vlan_push_proto;
+ case BPF_FUNC_skb_vlan_pop:
+ return &bpf_skb_vlan_pop_proto;
+ case BPF_FUNC_skb_change_proto:
+ return &bpf_skb_change_proto_proto;
+ case BPF_FUNC_skb_change_type:
+ return &bpf_skb_change_type_proto;
+ case BPF_FUNC_skb_adjust_room:
+ return &bpf_skb_adjust_room_proto;
+ case BPF_FUNC_skb_change_tail:
+ return &bpf_skb_change_tail_proto;
+ case BPF_FUNC_skb_change_head:
+ return &bpf_skb_change_head_proto;
+ case BPF_FUNC_skb_get_tunnel_key:
+ return &bpf_skb_get_tunnel_key_proto;
+ case BPF_FUNC_skb_set_tunnel_key:
+ return bpf_get_skb_set_tunnel_proto(func_id);
+ case BPF_FUNC_skb_get_tunnel_opt:
+ return &bpf_skb_get_tunnel_opt_proto;
+ case BPF_FUNC_skb_set_tunnel_opt:
+ return bpf_get_skb_set_tunnel_proto(func_id);
+ case BPF_FUNC_redirect:
+ return &bpf_redirect_proto;
+ case BPF_FUNC_redirect_neigh:
+ return &bpf_redirect_neigh_proto;
+ case BPF_FUNC_redirect_peer:
+ return &bpf_redirect_peer_proto;
+ case BPF_FUNC_get_route_realm:
+ return &bpf_get_route_realm_proto;
+ case BPF_FUNC_get_hash_recalc:
+ return &bpf_get_hash_recalc_proto;
+ case BPF_FUNC_set_hash_invalid:
+ return &bpf_set_hash_invalid_proto;
+ case BPF_FUNC_set_hash:
+ return &bpf_set_hash_proto;
+ case BPF_FUNC_perf_event_output:
+ return &bpf_skb_event_output_proto;
+ case BPF_FUNC_get_smp_processor_id:
+ return &bpf_get_smp_processor_id_proto;
+ case BPF_FUNC_skb_under_cgroup:
+ return &bpf_skb_under_cgroup_proto;
+ case BPF_FUNC_get_socket_cookie:
+ return &bpf_get_socket_cookie_proto;
+ case BPF_FUNC_get_netns_cookie:
+ return &bpf_get_netns_cookie_proto;
+ case BPF_FUNC_get_socket_uid:
+ return &bpf_get_socket_uid_proto;
+ case BPF_FUNC_fib_lookup:
+ return &bpf_skb_fib_lookup_proto;
+ case BPF_FUNC_check_mtu:
+ return &bpf_skb_check_mtu_proto;
+ case BPF_FUNC_sk_fullsock:
+ return &bpf_sk_fullsock_proto;
+ case BPF_FUNC_sk_storage_get:
+ return &bpf_sk_storage_get_proto;
+ case BPF_FUNC_sk_storage_delete:
+ return &bpf_sk_storage_delete_proto;
+#ifdef CONFIG_XFRM
+ case BPF_FUNC_skb_get_xfrm_state:
+ return &bpf_skb_get_xfrm_state_proto;
+#endif
+#ifdef CONFIG_CGROUP_NET_CLASSID
+ case BPF_FUNC_skb_cgroup_classid:
+ return &bpf_skb_cgroup_classid_proto;
+#endif
+#ifdef CONFIG_SOCK_CGROUP_DATA
+ case BPF_FUNC_skb_cgroup_id:
+ return &bpf_skb_cgroup_id_proto;
+ case BPF_FUNC_skb_ancestor_cgroup_id:
+ return &bpf_skb_ancestor_cgroup_id_proto;
+#endif
+#ifdef CONFIG_INET
+ case BPF_FUNC_sk_lookup_tcp:
+ return &bpf_tc_sk_lookup_tcp_proto;
+ case BPF_FUNC_sk_lookup_udp:
+ return &bpf_tc_sk_lookup_udp_proto;
+ case BPF_FUNC_sk_release:
+ return &bpf_sk_release_proto;
+ case BPF_FUNC_tcp_sock:
+ return &bpf_tcp_sock_proto;
+ case BPF_FUNC_get_listener_sock:
+ return &bpf_get_listener_sock_proto;
+ case BPF_FUNC_skc_lookup_tcp:
+ return &bpf_tc_skc_lookup_tcp_proto;
+ case BPF_FUNC_tcp_check_syncookie:
+ return &bpf_tcp_check_syncookie_proto;
+ case BPF_FUNC_skb_ecn_set_ce:
+ return &bpf_skb_ecn_set_ce_proto;
+ case BPF_FUNC_tcp_gen_syncookie:
+ return &bpf_tcp_gen_syncookie_proto;
+ case BPF_FUNC_sk_assign:
+ return &bpf_sk_assign_proto;
+ case BPF_FUNC_skb_set_tstamp:
+ return &bpf_skb_set_tstamp_proto;
+#ifdef CONFIG_SYN_COOKIES
+ case BPF_FUNC_tcp_raw_gen_syncookie_ipv4:
+ return &bpf_tcp_raw_gen_syncookie_ipv4_proto;
+ case BPF_FUNC_tcp_raw_gen_syncookie_ipv6:
+ return &bpf_tcp_raw_gen_syncookie_ipv6_proto;
+ case BPF_FUNC_tcp_raw_check_syncookie_ipv4:
+ return &bpf_tcp_raw_check_syncookie_ipv4_proto;
+ case BPF_FUNC_tcp_raw_check_syncookie_ipv6:
+ return &bpf_tcp_raw_check_syncookie_ipv6_proto;
+#endif
+#endif
+ default:
+ return bpf_sk_base_func_proto(func_id, prog);
+ }
+}
+
+static const struct bpf_func_proto *
+xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ case BPF_FUNC_perf_event_output:
+ return &bpf_xdp_event_output_proto;
+ case BPF_FUNC_get_smp_processor_id:
+ return &bpf_get_smp_processor_id_proto;
+ case BPF_FUNC_csum_diff:
+ return &bpf_csum_diff_proto;
+ case BPF_FUNC_xdp_adjust_head:
+ return &bpf_xdp_adjust_head_proto;
+ case BPF_FUNC_xdp_adjust_meta:
+ return &bpf_xdp_adjust_meta_proto;
+ case BPF_FUNC_redirect:
+ return &bpf_xdp_redirect_proto;
+ case BPF_FUNC_redirect_map:
+ return &bpf_xdp_redirect_map_proto;
+ case BPF_FUNC_xdp_adjust_tail:
+ return &bpf_xdp_adjust_tail_proto;
+ case BPF_FUNC_xdp_get_buff_len:
+ return &bpf_xdp_get_buff_len_proto;
+ case BPF_FUNC_xdp_load_bytes:
+ return &bpf_xdp_load_bytes_proto;
+ case BPF_FUNC_xdp_store_bytes:
+ return &bpf_xdp_store_bytes_proto;
+ case BPF_FUNC_fib_lookup:
+ return &bpf_xdp_fib_lookup_proto;
+ case BPF_FUNC_check_mtu:
+ return &bpf_xdp_check_mtu_proto;
+#ifdef CONFIG_INET
+ case BPF_FUNC_sk_lookup_udp:
+ return &bpf_xdp_sk_lookup_udp_proto;
+ case BPF_FUNC_sk_lookup_tcp:
+ return &bpf_xdp_sk_lookup_tcp_proto;
+ case BPF_FUNC_sk_release:
+ return &bpf_sk_release_proto;
+ case BPF_FUNC_skc_lookup_tcp:
+ return &bpf_xdp_skc_lookup_tcp_proto;
+ case BPF_FUNC_tcp_check_syncookie:
+ return &bpf_tcp_check_syncookie_proto;
+ case BPF_FUNC_tcp_gen_syncookie:
+ return &bpf_tcp_gen_syncookie_proto;
+#ifdef CONFIG_SYN_COOKIES
+ case BPF_FUNC_tcp_raw_gen_syncookie_ipv4:
+ return &bpf_tcp_raw_gen_syncookie_ipv4_proto;
+ case BPF_FUNC_tcp_raw_gen_syncookie_ipv6:
+ return &bpf_tcp_raw_gen_syncookie_ipv6_proto;
+ case BPF_FUNC_tcp_raw_check_syncookie_ipv4:
+ return &bpf_tcp_raw_check_syncookie_ipv4_proto;
+ case BPF_FUNC_tcp_raw_check_syncookie_ipv6:
+ return &bpf_tcp_raw_check_syncookie_ipv6_proto;
+#endif
+#endif
+ default:
+ return bpf_sk_base_func_proto(func_id, prog);
+ }
+
+#if IS_MODULE(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)
+ /* The nf_conn___init type is used in the NF_CONNTRACK kfuncs. The
+ * kfuncs are defined in two different modules, and we want to be able
+ * to use them interchangeably with the same BTF type ID. Because modules
+ * can't de-duplicate BTF IDs between each other, we need the type to be
+ * referenced in the vmlinux BTF or the verifier will get confused about
+ * the different types. So we add this dummy type reference which will
+ * be included in vmlinux BTF, allowing both modules to refer to the
+ * same type ID.
+ */
+ BTF_TYPE_EMIT(struct nf_conn___init);
+#endif
+}
+
+const struct bpf_func_proto bpf_sock_map_update_proto __weak;
+const struct bpf_func_proto bpf_sock_hash_update_proto __weak;
+
+static const struct bpf_func_proto *
+sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ const struct bpf_func_proto *func_proto;
+
+ func_proto = cgroup_common_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
+ switch (func_id) {
+ case BPF_FUNC_setsockopt:
+ return &bpf_sock_ops_setsockopt_proto;
+ case BPF_FUNC_getsockopt:
+ return &bpf_sock_ops_getsockopt_proto;
+ case BPF_FUNC_sock_ops_cb_flags_set:
+ return &bpf_sock_ops_cb_flags_set_proto;
+ case BPF_FUNC_sock_map_update:
+ return &bpf_sock_map_update_proto;
+ case BPF_FUNC_sock_hash_update:
+ return &bpf_sock_hash_update_proto;
+ case BPF_FUNC_get_socket_cookie:
+ return &bpf_get_socket_cookie_sock_ops_proto;
+ case BPF_FUNC_perf_event_output:
+ return &bpf_event_output_data_proto;
+ case BPF_FUNC_sk_storage_get:
+ return &bpf_sk_storage_get_proto;
+ case BPF_FUNC_sk_storage_delete:
+ return &bpf_sk_storage_delete_proto;
+ case BPF_FUNC_get_netns_cookie:
+ return &bpf_get_netns_cookie_sock_ops_proto;
+#ifdef CONFIG_INET
+ case BPF_FUNC_load_hdr_opt:
+ return &bpf_sock_ops_load_hdr_opt_proto;
+ case BPF_FUNC_store_hdr_opt:
+ return &bpf_sock_ops_store_hdr_opt_proto;
+ case BPF_FUNC_reserve_hdr_opt:
+ return &bpf_sock_ops_reserve_hdr_opt_proto;
+ case BPF_FUNC_tcp_sock:
+ return &bpf_tcp_sock_proto;
+#endif /* CONFIG_INET */
+ default:
+ return bpf_sk_base_func_proto(func_id, prog);
+ }
+}
+
+const struct bpf_func_proto bpf_msg_redirect_map_proto __weak;
+const struct bpf_func_proto bpf_msg_redirect_hash_proto __weak;
+
+static const struct bpf_func_proto *
+sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ case BPF_FUNC_msg_redirect_map:
+ return &bpf_msg_redirect_map_proto;
+ case BPF_FUNC_msg_redirect_hash:
+ return &bpf_msg_redirect_hash_proto;
+ case BPF_FUNC_msg_apply_bytes:
+ return &bpf_msg_apply_bytes_proto;
+ case BPF_FUNC_msg_cork_bytes:
+ return &bpf_msg_cork_bytes_proto;
+ case BPF_FUNC_msg_pull_data:
+ return &bpf_msg_pull_data_proto;
+ case BPF_FUNC_msg_push_data:
+ return &bpf_msg_push_data_proto;
+ case BPF_FUNC_msg_pop_data:
+ return &bpf_msg_pop_data_proto;
+ case BPF_FUNC_perf_event_output:
+ return &bpf_event_output_data_proto;
+ case BPF_FUNC_sk_storage_get:
+ return &bpf_sk_storage_get_proto;
+ case BPF_FUNC_sk_storage_delete:
+ return &bpf_sk_storage_delete_proto;
+ case BPF_FUNC_get_netns_cookie:
+ return &bpf_get_netns_cookie_sk_msg_proto;
+ default:
+ return bpf_sk_base_func_proto(func_id, prog);
+ }
+}
+
+const struct bpf_func_proto bpf_sk_redirect_map_proto __weak;
+const struct bpf_func_proto bpf_sk_redirect_hash_proto __weak;
+
+static const struct bpf_func_proto *
+sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ case BPF_FUNC_skb_store_bytes:
+ return &bpf_skb_store_bytes_proto;
+ case BPF_FUNC_skb_load_bytes:
+ return &bpf_skb_load_bytes_proto;
+ case BPF_FUNC_skb_pull_data:
+ return &sk_skb_pull_data_proto;
+ case BPF_FUNC_skb_change_tail:
+ return &sk_skb_change_tail_proto;
+ case BPF_FUNC_skb_change_head:
+ return &sk_skb_change_head_proto;
+ case BPF_FUNC_skb_adjust_room:
+ return &sk_skb_adjust_room_proto;
+ case BPF_FUNC_get_socket_cookie:
+ return &bpf_get_socket_cookie_proto;
+ case BPF_FUNC_get_socket_uid:
+ return &bpf_get_socket_uid_proto;
+ case BPF_FUNC_sk_redirect_map:
+ return &bpf_sk_redirect_map_proto;
+ case BPF_FUNC_sk_redirect_hash:
+ return &bpf_sk_redirect_hash_proto;
+ case BPF_FUNC_perf_event_output:
+ return &bpf_skb_event_output_proto;
+#ifdef CONFIG_INET
+ case BPF_FUNC_sk_lookup_tcp:
+ return &bpf_sk_lookup_tcp_proto;
+ case BPF_FUNC_sk_lookup_udp:
+ return &bpf_sk_lookup_udp_proto;
+ case BPF_FUNC_sk_release:
+ return &bpf_sk_release_proto;
+ case BPF_FUNC_skc_lookup_tcp:
+ return &bpf_skc_lookup_tcp_proto;
+#endif
+ default:
+ return bpf_sk_base_func_proto(func_id, prog);
+ }
+}
+
+static const struct bpf_func_proto *
+flow_dissector_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ case BPF_FUNC_skb_load_bytes:
+ return &bpf_flow_dissector_load_bytes_proto;
+ default:
+ return bpf_sk_base_func_proto(func_id, prog);
+ }
+}
+
+static const struct bpf_func_proto *
+lwt_out_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ case BPF_FUNC_skb_load_bytes:
+ return &bpf_skb_load_bytes_proto;
+ case BPF_FUNC_skb_pull_data:
+ return &bpf_skb_pull_data_proto;
+ case BPF_FUNC_csum_diff:
+ return &bpf_csum_diff_proto;
+ case BPF_FUNC_get_cgroup_classid:
+ return &bpf_get_cgroup_classid_proto;
+ case BPF_FUNC_get_route_realm:
+ return &bpf_get_route_realm_proto;
+ case BPF_FUNC_get_hash_recalc:
+ return &bpf_get_hash_recalc_proto;
+ case BPF_FUNC_perf_event_output:
+ return &bpf_skb_event_output_proto;
+ case BPF_FUNC_get_smp_processor_id:
+ return &bpf_get_smp_processor_id_proto;
+ case BPF_FUNC_skb_under_cgroup:
+ return &bpf_skb_under_cgroup_proto;
+ default:
+ return bpf_sk_base_func_proto(func_id, prog);
+ }
+}
+
+static const struct bpf_func_proto *
+lwt_in_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ case BPF_FUNC_lwt_push_encap:
+ return &bpf_lwt_in_push_encap_proto;
+ default:
+ return lwt_out_func_proto(func_id, prog);
+ }
+}
+
+static const struct bpf_func_proto *
+lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ case BPF_FUNC_skb_get_tunnel_key:
+ return &bpf_skb_get_tunnel_key_proto;
+ case BPF_FUNC_skb_set_tunnel_key:
+ return bpf_get_skb_set_tunnel_proto(func_id);
+ case BPF_FUNC_skb_get_tunnel_opt:
+ return &bpf_skb_get_tunnel_opt_proto;
+ case BPF_FUNC_skb_set_tunnel_opt:
+ return bpf_get_skb_set_tunnel_proto(func_id);
+ case BPF_FUNC_redirect:
+ return &bpf_redirect_proto;
+ case BPF_FUNC_clone_redirect:
+ return &bpf_clone_redirect_proto;
+ case BPF_FUNC_skb_change_tail:
+ return &bpf_skb_change_tail_proto;
+ case BPF_FUNC_skb_change_head:
+ return &bpf_skb_change_head_proto;
+ case BPF_FUNC_skb_store_bytes:
+ return &bpf_skb_store_bytes_proto;
+ case BPF_FUNC_csum_update:
+ return &bpf_csum_update_proto;
+ case BPF_FUNC_csum_level:
+ return &bpf_csum_level_proto;
+ case BPF_FUNC_l3_csum_replace:
+ return &bpf_l3_csum_replace_proto;
+ case BPF_FUNC_l4_csum_replace:
+ return &bpf_l4_csum_replace_proto;
+ case BPF_FUNC_set_hash_invalid:
+ return &bpf_set_hash_invalid_proto;
+ case BPF_FUNC_lwt_push_encap:
+ return &bpf_lwt_xmit_push_encap_proto;
+ default:
+ return lwt_out_func_proto(func_id, prog);
+ }
+}
+
+static const struct bpf_func_proto *
+lwt_seg6local_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ switch (func_id) {
+#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
+ case BPF_FUNC_lwt_seg6_store_bytes:
+ return &bpf_lwt_seg6_store_bytes_proto;
+ case BPF_FUNC_lwt_seg6_action:
+ return &bpf_lwt_seg6_action_proto;
+ case BPF_FUNC_lwt_seg6_adjust_srh:
+ return &bpf_lwt_seg6_adjust_srh_proto;
+#endif
+ default:
+ return lwt_out_func_proto(func_id, prog);
+ }
+}
+
+static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ const int size_default = sizeof(__u32);
+
+ if (off < 0 || off >= sizeof(struct __sk_buff))
+ return false;
+
+ /* The verifier guarantees that size > 0. */
+ if (off % size != 0)
+ return false;
+
+ switch (off) {
+ case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
+ if (off + size > offsetofend(struct __sk_buff, cb[4]))
+ return false;
+ break;
+ case bpf_ctx_range(struct __sk_buff, data):
+ case bpf_ctx_range(struct __sk_buff, data_meta):
+ case bpf_ctx_range(struct __sk_buff, data_end):
+ if (info->is_ldsx || size != size_default)
+ return false;
+ break;
+ case bpf_ctx_range_till(struct __sk_buff, remote_ip6[0], remote_ip6[3]):
+ case bpf_ctx_range_till(struct __sk_buff, local_ip6[0], local_ip6[3]):
+ case bpf_ctx_range_till(struct __sk_buff, remote_ip4, remote_ip4):
+ case bpf_ctx_range_till(struct __sk_buff, local_ip4, local_ip4):
+ if (size != size_default)
+ return false;
+ break;
+ case bpf_ctx_range_ptr(struct __sk_buff, flow_keys):
+ return false;
+ case bpf_ctx_range(struct __sk_buff, hwtstamp):
+ if (type == BPF_WRITE || size != sizeof(__u64))
+ return false;
+ break;
+ case bpf_ctx_range(struct __sk_buff, tstamp):
+ if (size != sizeof(__u64))
+ return false;
+ break;
+ case bpf_ctx_range_ptr(struct __sk_buff, sk):
+ if (type == BPF_WRITE || size != sizeof(__u64))
+ return false;
+ info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL;
+ break;
+ case offsetof(struct __sk_buff, tstamp_type):
+ return false;
+ case offsetofend(struct __sk_buff, tstamp_type) ... offsetof(struct __sk_buff, hwtstamp) - 1:
+ /* Explicitly prohibit access to padding in __sk_buff. */
+ return false;
+ default:
+ /* Only narrow read access allowed for now. */
+ if (type == BPF_WRITE) {
+ if (size != size_default)
+ return false;
+ } else {
+ bpf_ctx_record_field_size(info, size_default);
+ if (!bpf_ctx_narrow_access_ok(off, size, size_default))
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static bool sk_filter_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ switch (off) {
+ case bpf_ctx_range(struct __sk_buff, tc_classid):
+ case bpf_ctx_range(struct __sk_buff, data):
+ case bpf_ctx_range(struct __sk_buff, data_meta):
+ case bpf_ctx_range(struct __sk_buff, data_end):
+ case bpf_ctx_range_till(struct __sk_buff, family, local_port):
+ case bpf_ctx_range(struct __sk_buff, tstamp):
+ case bpf_ctx_range(struct __sk_buff, wire_len):
+ case bpf_ctx_range(struct __sk_buff, hwtstamp):
+ return false;
+ }
+
+ if (type == BPF_WRITE) {
+ switch (off) {
+ case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
+ break;
+ default:
+ return false;
+ }
+ }
+
+ return bpf_skb_is_valid_access(off, size, type, prog, info);
+}
+
+static bool cg_skb_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ switch (off) {
+ case bpf_ctx_range(struct __sk_buff, tc_classid):
+ case bpf_ctx_range(struct __sk_buff, data_meta):
+ case bpf_ctx_range(struct __sk_buff, wire_len):
+ return false;
+ case bpf_ctx_range(struct __sk_buff, data):
+ case bpf_ctx_range(struct __sk_buff, data_end):
+ if (!bpf_token_capable(prog->aux->token, CAP_BPF))
+ return false;
+ break;
+ }
+
+ if (type == BPF_WRITE) {
+ switch (off) {
+ case bpf_ctx_range(struct __sk_buff, mark):
+ case bpf_ctx_range(struct __sk_buff, priority):
+ case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
+ break;
+ case bpf_ctx_range(struct __sk_buff, tstamp):
+ if (!bpf_token_capable(prog->aux->token, CAP_BPF))
+ return false;
+ break;
+ default:
+ return false;
+ }
+ }
+
+ switch (off) {
+ case bpf_ctx_range(struct __sk_buff, data):
+ info->reg_type = PTR_TO_PACKET;
+ break;
+ case bpf_ctx_range(struct __sk_buff, data_end):
+ info->reg_type = PTR_TO_PACKET_END;
+ break;
+ }
+
+ return bpf_skb_is_valid_access(off, size, type, prog, info);
+}
+
+static bool lwt_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ switch (off) {
+ case bpf_ctx_range(struct __sk_buff, tc_classid):
+ case bpf_ctx_range_till(struct __sk_buff, family, local_port):
+ case bpf_ctx_range(struct __sk_buff, data_meta):
+ case bpf_ctx_range(struct __sk_buff, tstamp):
+ case bpf_ctx_range(struct __sk_buff, wire_len):
+ case bpf_ctx_range(struct __sk_buff, hwtstamp):
+ return false;
+ }
+
+ if (type == BPF_WRITE) {
+ switch (off) {
+ case bpf_ctx_range(struct __sk_buff, mark):
+ case bpf_ctx_range(struct __sk_buff, priority):
+ case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
+ break;
+ default:
+ return false;
+ }
+ }
+
+ switch (off) {
+ case bpf_ctx_range(struct __sk_buff, data):
+ info->reg_type = PTR_TO_PACKET;
+ break;
+ case bpf_ctx_range(struct __sk_buff, data_end):
+ info->reg_type = PTR_TO_PACKET_END;
+ break;
+ }
+
+ return bpf_skb_is_valid_access(off, size, type, prog, info);
+}
+
+/* Attach type specific accesses */
+static bool __sock_filter_check_attach_type(int off,
+ enum bpf_access_type access_type,
+ enum bpf_attach_type attach_type)
+{
+ switch (off) {
+ case offsetof(struct bpf_sock, bound_dev_if):
+ case offsetof(struct bpf_sock, mark):
+ case offsetof(struct bpf_sock, priority):
+ switch (attach_type) {
+ case BPF_CGROUP_INET_SOCK_CREATE:
+ case BPF_CGROUP_INET_SOCK_RELEASE:
+ goto full_access;
+ default:
+ return false;
+ }
+ case bpf_ctx_range(struct bpf_sock, src_ip4):
+ switch (attach_type) {
+ case BPF_CGROUP_INET4_POST_BIND:
+ goto read_only;
+ default:
+ return false;
+ }
+ case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]):
+ switch (attach_type) {
+ case BPF_CGROUP_INET6_POST_BIND:
+ goto read_only;
+ default:
+ return false;
+ }
+ case bpf_ctx_range(struct bpf_sock, src_port):
+ switch (attach_type) {
+ case BPF_CGROUP_INET4_POST_BIND:
+ case BPF_CGROUP_INET6_POST_BIND:
+ goto read_only;
+ default:
+ return false;
+ }
+ }
+read_only:
+ return access_type == BPF_READ;
+full_access:
+ return true;
+}
+
+bool bpf_sock_common_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ struct bpf_insn_access_aux *info)
+{
+ switch (off) {
+ case bpf_ctx_range_till(struct bpf_sock, type, priority):
+ return false;
+ default:
+ return bpf_sock_is_valid_access(off, size, type, info);
+ }
+}
+
+bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type,
+ struct bpf_insn_access_aux *info)
+{
+ const int size_default = sizeof(__u32);
+ int field_size;
+
+ if (off < 0 || off >= sizeof(struct bpf_sock))
+ return false;
+ if (off % size != 0)
+ return false;
+
+ switch (off) {
+ case offsetof(struct bpf_sock, state):
+ case offsetof(struct bpf_sock, family):
+ case offsetof(struct bpf_sock, type):
+ case offsetof(struct bpf_sock, protocol):
+ case offsetof(struct bpf_sock, src_port):
+ case offsetof(struct bpf_sock, rx_queue_mapping):
+ case bpf_ctx_range(struct bpf_sock, src_ip4):
+ case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]):
+ case bpf_ctx_range(struct bpf_sock, dst_ip4):
+ case bpf_ctx_range_till(struct bpf_sock, dst_ip6[0], dst_ip6[3]):
+ bpf_ctx_record_field_size(info, size_default);
+ return bpf_ctx_narrow_access_ok(off, size, size_default);
+ case bpf_ctx_range(struct bpf_sock, dst_port):
+ field_size = size == size_default ?
+ size_default : sizeof_field(struct bpf_sock, dst_port);
+ bpf_ctx_record_field_size(info, field_size);
+ return bpf_ctx_narrow_access_ok(off, size, field_size);
+ case offsetofend(struct bpf_sock, dst_port) ...
+ offsetof(struct bpf_sock, dst_ip4) - 1:
+ return false;
+ }
+
+ return size == size_default;
+}
+
+static bool sock_filter_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ if (!bpf_sock_is_valid_access(off, size, type, info))
+ return false;
+ return __sock_filter_check_attach_type(off, type,
+ prog->expected_attach_type);
+}
+
+static int bpf_noop_prologue(struct bpf_insn *insn_buf, bool direct_write,
+ const struct bpf_prog *prog)
+{
+ /* Neither direct read nor direct write requires any preliminary
+ * action.
+ */
+ return 0;
+}
+
+static int bpf_unclone_prologue(struct bpf_insn *insn_buf, bool direct_write,
+ const struct bpf_prog *prog, int drop_verdict)
+{
+ struct bpf_insn *insn = insn_buf;
+
+ if (!direct_write)
+ return 0;
+
+ /* if (!skb->cloned)
+ * goto start;
+ *
+ * (Fast-path, otherwise approximation that we might be
+ * a clone, do the rest in helper.)
+ */
+ *insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_6, BPF_REG_1, CLONED_OFFSET);
+ *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_6, CLONED_MASK);
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 7);
+
+ /* ret = bpf_skb_pull_data(skb, 0); */
+ *insn++ = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1);
+ *insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_2, BPF_REG_2);
+ *insn++ = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+ BPF_FUNC_skb_pull_data);
+ /* if (!ret)
+ * goto restore;
+ * return TC_ACT_SHOT;
+ */
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2);
+ *insn++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, drop_verdict);
+ *insn++ = BPF_EXIT_INSN();
+
+ /* restore: */
+ *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6);
+ /* start: */
+ *insn++ = prog->insnsi[0];
+
+ return insn - insn_buf;
+}
+
+static int bpf_gen_ld_abs(const struct bpf_insn *orig,
+ struct bpf_insn *insn_buf)
+{
+ bool indirect = BPF_MODE(orig->code) == BPF_IND;
+ struct bpf_insn *insn = insn_buf;
+
+ if (!indirect) {
+ *insn++ = BPF_MOV64_IMM(BPF_REG_2, orig->imm);
+ } else {
+ *insn++ = BPF_MOV64_REG(BPF_REG_2, orig->src_reg);
+ if (orig->imm)
+ *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, orig->imm);
+ }
+ /* We're guaranteed here that CTX is in R6. */
+ *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_CTX);
+
+ switch (BPF_SIZE(orig->code)) {
+ case BPF_B:
+ *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_8_no_cache);
+ break;
+ case BPF_H:
+ *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_16_no_cache);
+ break;
+ case BPF_W:
+ *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_32_no_cache);
+ break;
+ }
+
+ *insn++ = BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 0, 2);
+ *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_0, BPF_REG_0);
+ *insn++ = BPF_EXIT_INSN();
+
+ return insn - insn_buf;
+}
+
+static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write,
+ const struct bpf_prog *prog)
+{
+ return bpf_unclone_prologue(insn_buf, direct_write, prog, TC_ACT_SHOT);
+}
+
+static bool tc_cls_act_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ if (type == BPF_WRITE) {
+ switch (off) {
+ case bpf_ctx_range(struct __sk_buff, mark):
+ case bpf_ctx_range(struct __sk_buff, tc_index):
+ case bpf_ctx_range(struct __sk_buff, priority):
+ case bpf_ctx_range(struct __sk_buff, tc_classid):
+ case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
+ case bpf_ctx_range(struct __sk_buff, tstamp):
+ case bpf_ctx_range(struct __sk_buff, queue_mapping):
+ break;
+ default:
+ return false;
+ }
+ }
+
+ switch (off) {
+ case bpf_ctx_range(struct __sk_buff, data):
+ info->reg_type = PTR_TO_PACKET;
+ break;
+ case bpf_ctx_range(struct __sk_buff, data_meta):
+ info->reg_type = PTR_TO_PACKET_META;
+ break;
+ case bpf_ctx_range(struct __sk_buff, data_end):
+ info->reg_type = PTR_TO_PACKET_END;
+ break;
+ case bpf_ctx_range_till(struct __sk_buff, family, local_port):
+ return false;
+ case offsetof(struct __sk_buff, tstamp_type):
+ /* The convert_ctx_access() on reading and writing
+ * __sk_buff->tstamp depends on whether the bpf prog
+ * has used __sk_buff->tstamp_type or not.
+ * Thus, we need to set prog->tstamp_type_access
+ * earlier during is_valid_access() here.
+ */
+ ((struct bpf_prog *)prog)->tstamp_type_access = 1;
+ return size == sizeof(__u8);
+ }
+
+ return bpf_skb_is_valid_access(off, size, type, prog, info);
+}
+
+DEFINE_MUTEX(nf_conn_btf_access_lock);
+EXPORT_SYMBOL_GPL(nf_conn_btf_access_lock);
+
+int (*nfct_btf_struct_access)(struct bpf_verifier_log *log,
+ const struct bpf_reg_state *reg,
+ int off, int size);
+EXPORT_SYMBOL_GPL(nfct_btf_struct_access);
+
+static int tc_cls_act_btf_struct_access(struct bpf_verifier_log *log,
+ const struct bpf_reg_state *reg,
+ int off, int size)
+{
+ int ret = -EACCES;
+
+ mutex_lock(&nf_conn_btf_access_lock);
+ if (nfct_btf_struct_access)
+ ret = nfct_btf_struct_access(log, reg, off, size);
+ mutex_unlock(&nf_conn_btf_access_lock);
+
+ return ret;
+}
+
+static bool __is_valid_xdp_access(int off, int size)
+{
+ if (off < 0 || off >= sizeof(struct xdp_md))
+ return false;
+ if (off % size != 0)
+ return false;
+ if (size != sizeof(__u32))
+ return false;
+
+ return true;
+}
+
+static bool xdp_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ if (prog->expected_attach_type != BPF_XDP_DEVMAP) {
+ switch (off) {
+ case offsetof(struct xdp_md, egress_ifindex):
+ return false;
+ }
+ }
+
+ if (type == BPF_WRITE) {
+ if (bpf_prog_is_offloaded(prog->aux)) {
+ switch (off) {
+ case offsetof(struct xdp_md, rx_queue_index):
+ return __is_valid_xdp_access(off, size);
+ }
+ }
+ return false;
+ } else {
+ switch (off) {
+ case offsetof(struct xdp_md, data_meta):
+ case offsetof(struct xdp_md, data):
+ case offsetof(struct xdp_md, data_end):
+ if (info->is_ldsx)
+ return false;
+ }
+ }
+
+ switch (off) {
+ case offsetof(struct xdp_md, data):
+ info->reg_type = PTR_TO_PACKET;
+ break;
+ case offsetof(struct xdp_md, data_meta):
+ info->reg_type = PTR_TO_PACKET_META;
+ break;
+ case offsetof(struct xdp_md, data_end):
+ info->reg_type = PTR_TO_PACKET_END;
+ break;
+ }
+
+ return __is_valid_xdp_access(off, size);
+}
+
+void bpf_warn_invalid_xdp_action(const struct net_device *dev,
+ const struct bpf_prog *prog, u32 act)
+{
+ const u32 act_max = XDP_REDIRECT;
+
+ pr_warn_once("%s XDP return value %u on prog %s (id %d) dev %s, expect packet loss!\n",
+ act > act_max ? "Illegal" : "Driver unsupported",
+ act, prog->aux->name, prog->aux->id, dev ? dev->name : "N/A");
+}
+EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action);
+
+static int xdp_btf_struct_access(struct bpf_verifier_log *log,
+ const struct bpf_reg_state *reg,
+ int off, int size)
+{
+ int ret = -EACCES;
+
+ mutex_lock(&nf_conn_btf_access_lock);
+ if (nfct_btf_struct_access)
+ ret = nfct_btf_struct_access(log, reg, off, size);
+ mutex_unlock(&nf_conn_btf_access_lock);
+
+ return ret;
+}
+
+static bool sock_addr_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ const int size_default = sizeof(__u32);
+
+ if (off < 0 || off >= sizeof(struct bpf_sock_addr))
+ return false;
+ if (off % size != 0)
+ return false;
+
+ /* Disallow access to fields not belonging to the attach type's address
+ * family.
+ */
+ switch (off) {
+ case bpf_ctx_range(struct bpf_sock_addr, user_ip4):
+ switch (prog->expected_attach_type) {
+ case BPF_CGROUP_INET4_BIND:
+ case BPF_CGROUP_INET4_CONNECT:
+ case BPF_CGROUP_INET4_GETPEERNAME:
+ case BPF_CGROUP_INET4_GETSOCKNAME:
+ case BPF_CGROUP_UDP4_SENDMSG:
+ case BPF_CGROUP_UDP4_RECVMSG:
+ break;
+ default:
+ return false;
+ }
+ break;
+ case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]):
+ switch (prog->expected_attach_type) {
+ case BPF_CGROUP_INET6_BIND:
+ case BPF_CGROUP_INET6_CONNECT:
+ case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_INET6_GETSOCKNAME:
+ case BPF_CGROUP_UDP6_SENDMSG:
+ case BPF_CGROUP_UDP6_RECVMSG:
+ break;
+ default:
+ return false;
+ }
+ break;
+ case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4):
+ switch (prog->expected_attach_type) {
+ case BPF_CGROUP_UDP4_SENDMSG:
+ break;
+ default:
+ return false;
+ }
+ break;
+ case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0],
+ msg_src_ip6[3]):
+ switch (prog->expected_attach_type) {
+ case BPF_CGROUP_UDP6_SENDMSG:
+ break;
+ default:
+ return false;
+ }
+ break;
+ }
+
+ switch (off) {
+ case bpf_ctx_range(struct bpf_sock_addr, user_ip4):
+ case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]):
+ case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4):
+ case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0],
+ msg_src_ip6[3]):
+ case bpf_ctx_range(struct bpf_sock_addr, user_port):
+ if (type == BPF_READ) {
+ bpf_ctx_record_field_size(info, size_default);
+
+ if (bpf_ctx_wide_access_ok(off, size,
+ struct bpf_sock_addr,
+ user_ip6))
+ return true;
+
+ if (bpf_ctx_wide_access_ok(off, size,
+ struct bpf_sock_addr,
+ msg_src_ip6))
+ return true;
+
+ if (!bpf_ctx_narrow_access_ok(off, size, size_default))
+ return false;
+ } else {
+ if (bpf_ctx_wide_access_ok(off, size,
+ struct bpf_sock_addr,
+ user_ip6))
+ return true;
+
+ if (bpf_ctx_wide_access_ok(off, size,
+ struct bpf_sock_addr,
+ msg_src_ip6))
+ return true;
+
+ if (size != size_default)
+ return false;
+ }
+ break;
+ case bpf_ctx_range_ptr(struct bpf_sock_addr, sk):
+ if (type != BPF_READ)
+ return false;
+ if (size != sizeof(__u64))
+ return false;
+ info->reg_type = PTR_TO_SOCKET;
+ break;
+ case bpf_ctx_range(struct bpf_sock_addr, user_family):
+ case bpf_ctx_range(struct bpf_sock_addr, family):
+ case bpf_ctx_range(struct bpf_sock_addr, type):
+ case bpf_ctx_range(struct bpf_sock_addr, protocol):
+ if (type != BPF_READ)
+ return false;
+ if (size != size_default)
+ return false;
+ break;
+ default:
+ return false;
+ }
+
+ return true;
+}
+
+static bool sock_ops_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ const int size_default = sizeof(__u32);
+
+ if (off < 0 || off >= sizeof(struct bpf_sock_ops))
+ return false;
+
+ /* The verifier guarantees that size > 0. */
+ if (off % size != 0)
+ return false;
+
+ if (type == BPF_WRITE) {
+ switch (off) {
+ case offsetof(struct bpf_sock_ops, reply):
+ case offsetof(struct bpf_sock_ops, sk_txhash):
+ if (size != size_default)
+ return false;
+ break;
+ default:
+ return false;
+ }
+ } else {
+ switch (off) {
+ case bpf_ctx_range_till(struct bpf_sock_ops, bytes_received,
+ bytes_acked):
+ if (size != sizeof(__u64))
+ return false;
+ break;
+ case bpf_ctx_range_ptr(struct bpf_sock_ops, sk):
+ if (size != sizeof(__u64))
+ return false;
+ info->reg_type = PTR_TO_SOCKET_OR_NULL;
+ break;
+ case bpf_ctx_range_ptr(struct bpf_sock_ops, skb_data):
+ if (size != sizeof(__u64))
+ return false;
+ info->reg_type = PTR_TO_PACKET;
+ break;
+ case bpf_ctx_range_ptr(struct bpf_sock_ops, skb_data_end):
+ if (size != sizeof(__u64))
+ return false;
+ info->reg_type = PTR_TO_PACKET_END;
+ break;
+ case offsetof(struct bpf_sock_ops, skb_tcp_flags):
+ bpf_ctx_record_field_size(info, size_default);
+ return bpf_ctx_narrow_access_ok(off, size,
+ size_default);
+ case bpf_ctx_range(struct bpf_sock_ops, skb_hwtstamp):
+ if (size != sizeof(__u64))
+ return false;
+ break;
+ default:
+ if (size != size_default)
+ return false;
+ break;
+ }
+ }
+
+ return true;
+}
+
+static int sk_skb_prologue(struct bpf_insn *insn_buf, bool direct_write,
+ const struct bpf_prog *prog)
+{
+ return bpf_unclone_prologue(insn_buf, direct_write, prog, SK_DROP);
+}
+
+static bool sk_skb_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ switch (off) {
+ case bpf_ctx_range(struct __sk_buff, tc_classid):
+ case bpf_ctx_range(struct __sk_buff, data_meta):
+ case bpf_ctx_range(struct __sk_buff, tstamp):
+ case bpf_ctx_range(struct __sk_buff, wire_len):
+ case bpf_ctx_range(struct __sk_buff, hwtstamp):
+ return false;
+ }
+
+ if (type == BPF_WRITE) {
+ switch (off) {
+ case bpf_ctx_range(struct __sk_buff, tc_index):
+ case bpf_ctx_range(struct __sk_buff, priority):
+ break;
+ default:
+ return false;
+ }
+ }
+
+ switch (off) {
+ case bpf_ctx_range(struct __sk_buff, mark):
+ return false;
+ case bpf_ctx_range(struct __sk_buff, data):
+ info->reg_type = PTR_TO_PACKET;
+ break;
+ case bpf_ctx_range(struct __sk_buff, data_end):
+ info->reg_type = PTR_TO_PACKET_END;
+ break;
+ }
+
+ return bpf_skb_is_valid_access(off, size, type, prog, info);
+}
+
+static bool sk_msg_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ if (type == BPF_WRITE)
+ return false;
+
+ if (off % size != 0)
+ return false;
+
+ switch (off) {
+ case bpf_ctx_range_ptr(struct sk_msg_md, data):
+ info->reg_type = PTR_TO_PACKET;
+ if (size != sizeof(__u64))
+ return false;
+ break;
+ case bpf_ctx_range_ptr(struct sk_msg_md, data_end):
+ info->reg_type = PTR_TO_PACKET_END;
+ if (size != sizeof(__u64))
+ return false;
+ break;
+ case bpf_ctx_range_ptr(struct sk_msg_md, sk):
+ if (size != sizeof(__u64))
+ return false;
+ info->reg_type = PTR_TO_SOCKET;
+ break;
+ case bpf_ctx_range(struct sk_msg_md, family):
+ case bpf_ctx_range(struct sk_msg_md, remote_ip4):
+ case bpf_ctx_range(struct sk_msg_md, local_ip4):
+ case bpf_ctx_range_till(struct sk_msg_md, remote_ip6[0], remote_ip6[3]):
+ case bpf_ctx_range_till(struct sk_msg_md, local_ip6[0], local_ip6[3]):
+ case bpf_ctx_range(struct sk_msg_md, remote_port):
+ case bpf_ctx_range(struct sk_msg_md, local_port):
+ case bpf_ctx_range(struct sk_msg_md, size):
+ if (size != sizeof(__u32))
+ return false;
+ break;
+ default:
+ return false;
+ }
+ return true;
+}
+
+static bool flow_dissector_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ const int size_default = sizeof(__u32);
+
+ if (off < 0 || off >= sizeof(struct __sk_buff))
+ return false;
+
+ if (off % size != 0)
+ return false;
+
+ if (type == BPF_WRITE)
+ return false;
+
+ switch (off) {
+ case bpf_ctx_range(struct __sk_buff, data):
+ if (info->is_ldsx || size != size_default)
+ return false;
+ info->reg_type = PTR_TO_PACKET;
+ return true;
+ case bpf_ctx_range(struct __sk_buff, data_end):
+ if (info->is_ldsx || size != size_default)
+ return false;
+ info->reg_type = PTR_TO_PACKET_END;
+ return true;
+ case bpf_ctx_range_ptr(struct __sk_buff, flow_keys):
+ if (size != sizeof(__u64))
+ return false;
+ info->reg_type = PTR_TO_FLOW_KEYS;
+ return true;
+ default:
+ return false;
+ }
+}
+
+static u32 flow_dissector_convert_ctx_access(enum bpf_access_type type,
+ const struct bpf_insn *si,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog,
+ u32 *target_size)
+
+{
+ struct bpf_insn *insn = insn_buf;
+
+ switch (si->off) {
+ case offsetof(struct __sk_buff, data):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_flow_dissector, data),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_flow_dissector, data));
+ break;
+
+ case offsetof(struct __sk_buff, data_end):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_flow_dissector, data_end),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_flow_dissector, data_end));
+ break;
+
+ case offsetof(struct __sk_buff, flow_keys):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_flow_dissector, flow_keys),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_flow_dissector, flow_keys));
+ break;
+ }
+
+ return insn - insn_buf;
+}
+
+static struct bpf_insn *bpf_convert_tstamp_type_read(const struct bpf_insn *si,
+ struct bpf_insn *insn)
+{
+ __u8 value_reg = si->dst_reg;
+ __u8 skb_reg = si->src_reg;
+ BUILD_BUG_ON(__SKB_CLOCK_MAX != (int)BPF_SKB_CLOCK_TAI);
+ BUILD_BUG_ON(SKB_CLOCK_REALTIME != (int)BPF_SKB_CLOCK_REALTIME);
+ BUILD_BUG_ON(SKB_CLOCK_MONOTONIC != (int)BPF_SKB_CLOCK_MONOTONIC);
+ BUILD_BUG_ON(SKB_CLOCK_TAI != (int)BPF_SKB_CLOCK_TAI);
+ *insn++ = BPF_LDX_MEM(BPF_B, value_reg, skb_reg, SKB_BF_MONO_TC_OFFSET);
+ *insn++ = BPF_ALU32_IMM(BPF_AND, value_reg, SKB_TSTAMP_TYPE_MASK);
+#ifdef __BIG_ENDIAN_BITFIELD
+ *insn++ = BPF_ALU32_IMM(BPF_RSH, value_reg, SKB_TSTAMP_TYPE_RSHIFT);
+#else
+ BUILD_BUG_ON(!(SKB_TSTAMP_TYPE_MASK & 0x1));
+#endif
+
+ return insn;
+}
+
+static struct bpf_insn *bpf_convert_shinfo_access(__u8 dst_reg, __u8 skb_reg,
+ struct bpf_insn *insn)
+{
+ /* si->dst_reg = skb_shinfo(SKB); */
+#ifdef NET_SKBUFF_DATA_USES_OFFSET
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end),
+ BPF_REG_AX, skb_reg,
+ offsetof(struct sk_buff, end));
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, head),
+ dst_reg, skb_reg,
+ offsetof(struct sk_buff, head));
+ *insn++ = BPF_ALU64_REG(BPF_ADD, dst_reg, BPF_REG_AX);
+#else
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end),
+ dst_reg, skb_reg,
+ offsetof(struct sk_buff, end));
+#endif
+
+ return insn;
+}
+
+static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_prog *prog,
+ const struct bpf_insn *si,
+ struct bpf_insn *insn)
+{
+ __u8 value_reg = si->dst_reg;
+ __u8 skb_reg = si->src_reg;
+
+#ifdef CONFIG_NET_XGRESS
+ /* If the tstamp_type is read,
+ * the bpf prog is aware the tstamp could have delivery time.
+ * Thus, read skb->tstamp as is if tstamp_type_access is true.
+ */
+ if (!prog->tstamp_type_access) {
+ /* AX is needed because src_reg and dst_reg could be the same */
+ __u8 tmp_reg = BPF_REG_AX;
+
+ *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, SKB_BF_MONO_TC_OFFSET);
+ /* check if ingress mask bits is set */
+ *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, TC_AT_INGRESS_MASK, 1);
+ *insn++ = BPF_JMP_A(4);
+ *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, SKB_TSTAMP_TYPE_MASK, 1);
+ *insn++ = BPF_JMP_A(2);
+ /* skb->tc_at_ingress && skb->tstamp_type,
+ * read 0 as the (rcv) timestamp.
+ */
+ *insn++ = BPF_MOV64_IMM(value_reg, 0);
+ *insn++ = BPF_JMP_A(1);
+ }
+#endif
+
+ *insn++ = BPF_LDX_MEM(BPF_DW, value_reg, skb_reg,
+ offsetof(struct sk_buff, tstamp));
+ return insn;
+}
+
+static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog,
+ const struct bpf_insn *si,
+ struct bpf_insn *insn)
+{
+ __u8 value_reg = si->src_reg;
+ __u8 skb_reg = si->dst_reg;
+
+#ifdef CONFIG_NET_XGRESS
+ /* If the tstamp_type is read,
+ * the bpf prog is aware the tstamp could have delivery time.
+ * Thus, write skb->tstamp as is if tstamp_type_access is true.
+ * Otherwise, writing at ingress will have to clear the
+ * skb->tstamp_type bit also.
+ */
+ if (!prog->tstamp_type_access) {
+ __u8 tmp_reg = BPF_REG_AX;
+
+ *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, SKB_BF_MONO_TC_OFFSET);
+ /* Writing __sk_buff->tstamp as ingress, goto <clear> */
+ *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, TC_AT_INGRESS_MASK, 1);
+ /* goto <store> */
+ *insn++ = BPF_JMP_A(2);
+ /* <clear>: skb->tstamp_type */
+ *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, ~SKB_TSTAMP_TYPE_MASK);
+ *insn++ = BPF_STX_MEM(BPF_B, skb_reg, tmp_reg, SKB_BF_MONO_TC_OFFSET);
+ }
+#endif
+
+ /* <store>: skb->tstamp = tstamp */
+ *insn++ = BPF_RAW_INSN(BPF_CLASS(si->code) | BPF_DW | BPF_MEM,
+ skb_reg, value_reg, offsetof(struct sk_buff, tstamp), si->imm);
+ return insn;
+}
+
+#define BPF_EMIT_STORE(size, si, off) \
+ BPF_RAW_INSN(BPF_CLASS((si)->code) | (size) | BPF_MEM, \
+ (si)->dst_reg, (si)->src_reg, (off), (si)->imm)
+
+static u32 bpf_convert_ctx_access(enum bpf_access_type type,
+ const struct bpf_insn *si,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog, u32 *target_size)
+{
+ struct bpf_insn *insn = insn_buf;
+ int off;
+
+ switch (si->off) {
+ case offsetof(struct __sk_buff, len):
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+ bpf_target_off(struct sk_buff, len, 4,
+ target_size));
+ break;
+
+ case offsetof(struct __sk_buff, protocol):
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
+ bpf_target_off(struct sk_buff, protocol, 2,
+ target_size));
+ break;
+
+ case offsetof(struct __sk_buff, vlan_proto):
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
+ bpf_target_off(struct sk_buff, vlan_proto, 2,
+ target_size));
+ break;
+
+ case offsetof(struct __sk_buff, priority):
+ if (type == BPF_WRITE)
+ *insn++ = BPF_EMIT_STORE(BPF_W, si,
+ bpf_target_off(struct sk_buff, priority, 4,
+ target_size));
+ else
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+ bpf_target_off(struct sk_buff, priority, 4,
+ target_size));
+ break;
+
+ case offsetof(struct __sk_buff, ingress_ifindex):
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+ bpf_target_off(struct sk_buff, skb_iif, 4,
+ target_size));
+ break;
+
+ case offsetof(struct __sk_buff, ifindex):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_buff, dev));
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+ bpf_target_off(struct net_device, ifindex, 4,
+ target_size));
+ break;
+
+ case offsetof(struct __sk_buff, hash):
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+ bpf_target_off(struct sk_buff, hash, 4,
+ target_size));
+ break;
+
+ case offsetof(struct __sk_buff, mark):
+ if (type == BPF_WRITE)
+ *insn++ = BPF_EMIT_STORE(BPF_W, si,
+ bpf_target_off(struct sk_buff, mark, 4,
+ target_size));
+ else
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+ bpf_target_off(struct sk_buff, mark, 4,
+ target_size));
+ break;
+
+ case offsetof(struct __sk_buff, pkt_type):
+ *target_size = 1;
+ *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->src_reg,
+ PKT_TYPE_OFFSET);
+ *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, PKT_TYPE_MAX);
+#ifdef __BIG_ENDIAN_BITFIELD
+ *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, 5);
+#endif
+ break;
+
+ case offsetof(struct __sk_buff, queue_mapping):
+ if (type == BPF_WRITE) {
+ u32 offset = bpf_target_off(struct sk_buff, queue_mapping, 2, target_size);
+
+ if (BPF_CLASS(si->code) == BPF_ST && si->imm >= NO_QUEUE_MAPPING) {
+ *insn++ = BPF_JMP_A(0); /* noop */
+ break;
+ }
+
+ if (BPF_CLASS(si->code) == BPF_STX)
+ *insn++ = BPF_JMP_IMM(BPF_JGE, si->src_reg, NO_QUEUE_MAPPING, 1);
+ *insn++ = BPF_EMIT_STORE(BPF_H, si, offset);
+ } else {
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
+ bpf_target_off(struct sk_buff,
+ queue_mapping,
+ 2, target_size));
+ }
+ break;
+
+ case offsetof(struct __sk_buff, vlan_present):
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+ bpf_target_off(struct sk_buff,
+ vlan_all, 4, target_size));
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
+ *insn++ = BPF_ALU32_IMM(BPF_MOV, si->dst_reg, 1);
+ break;
+
+ case offsetof(struct __sk_buff, vlan_tci):
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
+ bpf_target_off(struct sk_buff, vlan_tci, 2,
+ target_size));
+ break;
+
+ case offsetof(struct __sk_buff, cb[0]) ...
+ offsetofend(struct __sk_buff, cb[4]) - 1:
+ BUILD_BUG_ON(sizeof_field(struct qdisc_skb_cb, data) < 20);
+ BUILD_BUG_ON((offsetof(struct sk_buff, cb) +
+ offsetof(struct qdisc_skb_cb, data)) %
+ sizeof(__u64));
+
+ prog->cb_access = 1;
+ off = si->off;
+ off -= offsetof(struct __sk_buff, cb[0]);
+ off += offsetof(struct sk_buff, cb);
+ off += offsetof(struct qdisc_skb_cb, data);
+ if (type == BPF_WRITE)
+ *insn++ = BPF_EMIT_STORE(BPF_SIZE(si->code), si, off);
+ else
+ *insn++ = BPF_LDX_MEM(BPF_SIZE(si->code), si->dst_reg,
+ si->src_reg, off);
+ break;
+
+ case offsetof(struct __sk_buff, tc_classid):
+ BUILD_BUG_ON(sizeof_field(struct qdisc_skb_cb, tc_classid) != 2);
+
+ off = si->off;
+ off -= offsetof(struct __sk_buff, tc_classid);
+ off += offsetof(struct sk_buff, cb);
+ off += offsetof(struct qdisc_skb_cb, tc_classid);
+ *target_size = 2;
+ if (type == BPF_WRITE)
+ *insn++ = BPF_EMIT_STORE(BPF_H, si, off);
+ else
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg,
+ si->src_reg, off);
+ break;
+
+ case offsetof(struct __sk_buff, data):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_buff, data));
+ break;
+
+ case offsetof(struct __sk_buff, data_meta):
+ off = si->off;
+ off -= offsetof(struct __sk_buff, data_meta);
+ off += offsetof(struct sk_buff, cb);
+ off += offsetof(struct bpf_skb_data_end, data_meta);
+ *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg,
+ si->src_reg, off);
+ break;
+
+ case offsetof(struct __sk_buff, data_end):
+ off = si->off;
+ off -= offsetof(struct __sk_buff, data_end);
+ off += offsetof(struct sk_buff, cb);
+ off += offsetof(struct bpf_skb_data_end, data_end);
+ *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg,
+ si->src_reg, off);
+ break;
+
+ case offsetof(struct __sk_buff, tc_index):
+#ifdef CONFIG_NET_SCHED
+ if (type == BPF_WRITE)
+ *insn++ = BPF_EMIT_STORE(BPF_H, si,
+ bpf_target_off(struct sk_buff, tc_index, 2,
+ target_size));
+ else
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
+ bpf_target_off(struct sk_buff, tc_index, 2,
+ target_size));
+#else
+ *target_size = 2;
+ if (type == BPF_WRITE)
+ *insn++ = BPF_MOV64_REG(si->dst_reg, si->dst_reg);
+ else
+ *insn++ = BPF_MOV64_IMM(si->dst_reg, 0);
+#endif
+ break;
+
+ case offsetof(struct __sk_buff, napi_id):
+#if defined(CONFIG_NET_RX_BUSY_POLL)
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+ bpf_target_off(struct sk_buff, napi_id, 4,
+ target_size));
+ *insn++ = BPF_JMP_IMM(BPF_JGE, si->dst_reg, MIN_NAPI_ID, 1);
+ *insn++ = BPF_MOV64_IMM(si->dst_reg, 0);
+#else
+ *target_size = 4;
+ *insn++ = BPF_MOV64_IMM(si->dst_reg, 0);
+#endif
+ break;
+ case offsetof(struct __sk_buff, family):
+ BUILD_BUG_ON(sizeof_field(struct sock_common, skc_family) != 2);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_buff, sk));
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
+ bpf_target_off(struct sock_common,
+ skc_family,
+ 2, target_size));
+ break;
+ case offsetof(struct __sk_buff, remote_ip4):
+ BUILD_BUG_ON(sizeof_field(struct sock_common, skc_daddr) != 4);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_buff, sk));
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+ bpf_target_off(struct sock_common,
+ skc_daddr,
+ 4, target_size));
+ break;
+ case offsetof(struct __sk_buff, local_ip4):
+ BUILD_BUG_ON(sizeof_field(struct sock_common,
+ skc_rcv_saddr) != 4);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_buff, sk));
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+ bpf_target_off(struct sock_common,
+ skc_rcv_saddr,
+ 4, target_size));
+ break;
+ case offsetof(struct __sk_buff, remote_ip6[0]) ...
+ offsetof(struct __sk_buff, remote_ip6[3]):
+#if IS_ENABLED(CONFIG_IPV6)
+ BUILD_BUG_ON(sizeof_field(struct sock_common,
+ skc_v6_daddr.s6_addr32[0]) != 4);
+
+ off = si->off;
+ off -= offsetof(struct __sk_buff, remote_ip6[0]);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_buff, sk));
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common,
+ skc_v6_daddr.s6_addr32[0]) +
+ off);
+#else
+ *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
+#endif
+ break;
+ case offsetof(struct __sk_buff, local_ip6[0]) ...
+ offsetof(struct __sk_buff, local_ip6[3]):
+#if IS_ENABLED(CONFIG_IPV6)
+ BUILD_BUG_ON(sizeof_field(struct sock_common,
+ skc_v6_rcv_saddr.s6_addr32[0]) != 4);
+
+ off = si->off;
+ off -= offsetof(struct __sk_buff, local_ip6[0]);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_buff, sk));
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common,
+ skc_v6_rcv_saddr.s6_addr32[0]) +
+ off);
+#else
+ *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
+#endif
+ break;
+
+ case offsetof(struct __sk_buff, remote_port):
+ BUILD_BUG_ON(sizeof_field(struct sock_common, skc_dport) != 2);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_buff, sk));
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
+ bpf_target_off(struct sock_common,
+ skc_dport,
+ 2, target_size));
+#ifndef __BIG_ENDIAN_BITFIELD
+ *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16);
+#endif
+ break;
+
+ case offsetof(struct __sk_buff, local_port):
+ BUILD_BUG_ON(sizeof_field(struct sock_common, skc_num) != 2);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_buff, sk));
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
+ bpf_target_off(struct sock_common,
+ skc_num, 2, target_size));
+ break;
+
+ case offsetof(struct __sk_buff, tstamp):
+ BUILD_BUG_ON(sizeof_field(struct sk_buff, tstamp) != 8);
+
+ if (type == BPF_WRITE)
+ insn = bpf_convert_tstamp_write(prog, si, insn);
+ else
+ insn = bpf_convert_tstamp_read(prog, si, insn);
+ break;
+
+ case offsetof(struct __sk_buff, tstamp_type):
+ insn = bpf_convert_tstamp_type_read(si, insn);
+ break;
+
+ case offsetof(struct __sk_buff, gso_segs):
+ insn = bpf_convert_shinfo_access(si->dst_reg, si->src_reg, insn);
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct skb_shared_info, gso_segs),
+ si->dst_reg, si->dst_reg,
+ bpf_target_off(struct skb_shared_info,
+ gso_segs, 2,
+ target_size));
+ break;
+ case offsetof(struct __sk_buff, gso_size):
+ insn = bpf_convert_shinfo_access(si->dst_reg, si->src_reg, insn);
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct skb_shared_info, gso_size),
+ si->dst_reg, si->dst_reg,
+ bpf_target_off(struct skb_shared_info,
+ gso_size, 2,
+ target_size));
+ break;
+ case offsetof(struct __sk_buff, wire_len):
+ BUILD_BUG_ON(sizeof_field(struct qdisc_skb_cb, pkt_len) != 4);
+
+ off = si->off;
+ off -= offsetof(struct __sk_buff, wire_len);
+ off += offsetof(struct sk_buff, cb);
+ off += offsetof(struct qdisc_skb_cb, pkt_len);
+ *target_size = 4;
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, off);
+ break;
+
+ case offsetof(struct __sk_buff, sk):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_buff, sk));
+ break;
+ case offsetof(struct __sk_buff, hwtstamp):
+ BUILD_BUG_ON(sizeof_field(struct skb_shared_hwtstamps, hwtstamp) != 8);
+ BUILD_BUG_ON(offsetof(struct skb_shared_hwtstamps, hwtstamp) != 0);
+
+ insn = bpf_convert_shinfo_access(si->dst_reg, si->src_reg, insn);
+ *insn++ = BPF_LDX_MEM(BPF_DW,
+ si->dst_reg, si->dst_reg,
+ bpf_target_off(struct skb_shared_info,
+ hwtstamps, 8,
+ target_size));
+ break;
+ }
+
+ return insn - insn_buf;
+}
+
+u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,
+ const struct bpf_insn *si,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog, u32 *target_size)
+{
+ struct bpf_insn *insn = insn_buf;
+ int off;
+
+ switch (si->off) {
+ case offsetof(struct bpf_sock, bound_dev_if):
+ BUILD_BUG_ON(sizeof_field(struct sock, sk_bound_dev_if) != 4);
+
+ if (type == BPF_WRITE)
+ *insn++ = BPF_EMIT_STORE(BPF_W, si,
+ offsetof(struct sock, sk_bound_dev_if));
+ else
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+ offsetof(struct sock, sk_bound_dev_if));
+ break;
+
+ case offsetof(struct bpf_sock, mark):
+ BUILD_BUG_ON(sizeof_field(struct sock, sk_mark) != 4);
+
+ if (type == BPF_WRITE)
+ *insn++ = BPF_EMIT_STORE(BPF_W, si,
+ offsetof(struct sock, sk_mark));
+ else
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+ offsetof(struct sock, sk_mark));
+ break;
+
+ case offsetof(struct bpf_sock, priority):
+ BUILD_BUG_ON(sizeof_field(struct sock, sk_priority) != 4);
+
+ if (type == BPF_WRITE)
+ *insn++ = BPF_EMIT_STORE(BPF_W, si,
+ offsetof(struct sock, sk_priority));
+ else
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+ offsetof(struct sock, sk_priority));
+ break;
+
+ case offsetof(struct bpf_sock, family):
+ *insn++ = BPF_LDX_MEM(
+ BPF_FIELD_SIZEOF(struct sock_common, skc_family),
+ si->dst_reg, si->src_reg,
+ bpf_target_off(struct sock_common,
+ skc_family,
+ sizeof_field(struct sock_common,
+ skc_family),
+ target_size));
+ break;
+
+ case offsetof(struct bpf_sock, type):
+ *insn++ = BPF_LDX_MEM(
+ BPF_FIELD_SIZEOF(struct sock, sk_type),
+ si->dst_reg, si->src_reg,
+ bpf_target_off(struct sock, sk_type,
+ sizeof_field(struct sock, sk_type),
+ target_size));
+ break;
+
+ case offsetof(struct bpf_sock, protocol):
+ *insn++ = BPF_LDX_MEM(
+ BPF_FIELD_SIZEOF(struct sock, sk_protocol),
+ si->dst_reg, si->src_reg,
+ bpf_target_off(struct sock, sk_protocol,
+ sizeof_field(struct sock, sk_protocol),
+ target_size));
+ break;
+
+ case offsetof(struct bpf_sock, src_ip4):
+ *insn++ = BPF_LDX_MEM(
+ BPF_SIZE(si->code), si->dst_reg, si->src_reg,
+ bpf_target_off(struct sock_common, skc_rcv_saddr,
+ sizeof_field(struct sock_common,
+ skc_rcv_saddr),
+ target_size));
+ break;
+
+ case offsetof(struct bpf_sock, dst_ip4):
+ *insn++ = BPF_LDX_MEM(
+ BPF_SIZE(si->code), si->dst_reg, si->src_reg,
+ bpf_target_off(struct sock_common, skc_daddr,
+ sizeof_field(struct sock_common,
+ skc_daddr),
+ target_size));
+ break;
+
+ case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]):
+#if IS_ENABLED(CONFIG_IPV6)
+ off = si->off;
+ off -= offsetof(struct bpf_sock, src_ip6[0]);
+ *insn++ = BPF_LDX_MEM(
+ BPF_SIZE(si->code), si->dst_reg, si->src_reg,
+ bpf_target_off(
+ struct sock_common,
+ skc_v6_rcv_saddr.s6_addr32[0],
+ sizeof_field(struct sock_common,
+ skc_v6_rcv_saddr.s6_addr32[0]),
+ target_size) + off);
+#else
+ (void)off;
+ *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
+#endif
+ break;
+
+ case bpf_ctx_range_till(struct bpf_sock, dst_ip6[0], dst_ip6[3]):
+#if IS_ENABLED(CONFIG_IPV6)
+ off = si->off;
+ off -= offsetof(struct bpf_sock, dst_ip6[0]);
+ *insn++ = BPF_LDX_MEM(
+ BPF_SIZE(si->code), si->dst_reg, si->src_reg,
+ bpf_target_off(struct sock_common,
+ skc_v6_daddr.s6_addr32[0],
+ sizeof_field(struct sock_common,
+ skc_v6_daddr.s6_addr32[0]),
+ target_size) + off);
+#else
+ *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
+ *target_size = 4;
+#endif
+ break;
+
+ case offsetof(struct bpf_sock, src_port):
+ *insn++ = BPF_LDX_MEM(
+ BPF_FIELD_SIZEOF(struct sock_common, skc_num),
+ si->dst_reg, si->src_reg,
+ bpf_target_off(struct sock_common, skc_num,
+ sizeof_field(struct sock_common,
+ skc_num),
+ target_size));
+ break;
+
+ case offsetof(struct bpf_sock, dst_port):
+ *insn++ = BPF_LDX_MEM(
+ BPF_FIELD_SIZEOF(struct sock_common, skc_dport),
+ si->dst_reg, si->src_reg,
+ bpf_target_off(struct sock_common, skc_dport,
+ sizeof_field(struct sock_common,
+ skc_dport),
+ target_size));
+ break;
+
+ case offsetof(struct bpf_sock, state):
+ *insn++ = BPF_LDX_MEM(
+ BPF_FIELD_SIZEOF(struct sock_common, skc_state),
+ si->dst_reg, si->src_reg,
+ bpf_target_off(struct sock_common, skc_state,
+ sizeof_field(struct sock_common,
+ skc_state),
+ target_size));
+ break;
+ case offsetof(struct bpf_sock, rx_queue_mapping):
+#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
+ *insn++ = BPF_LDX_MEM(
+ BPF_FIELD_SIZEOF(struct sock, sk_rx_queue_mapping),
+ si->dst_reg, si->src_reg,
+ bpf_target_off(struct sock, sk_rx_queue_mapping,
+ sizeof_field(struct sock,
+ sk_rx_queue_mapping),
+ target_size));
+ *insn++ = BPF_JMP_IMM(BPF_JNE, si->dst_reg, NO_QUEUE_MAPPING,
+ 1);
+ *insn++ = BPF_MOV64_IMM(si->dst_reg, -1);
+#else
+ *insn++ = BPF_MOV64_IMM(si->dst_reg, -1);
+ *target_size = 2;
+#endif
+ break;
+ }
+
+ return insn - insn_buf;
+}
+
+static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type,
+ const struct bpf_insn *si,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog, u32 *target_size)
+{
+ struct bpf_insn *insn = insn_buf;
+
+ switch (si->off) {
+ case offsetof(struct __sk_buff, ifindex):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_buff, dev));
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+ bpf_target_off(struct net_device, ifindex, 4,
+ target_size));
+ break;
+ default:
+ return bpf_convert_ctx_access(type, si, insn_buf, prog,
+ target_size);
+ }
+
+ return insn - insn_buf;
+}
+
+static u32 xdp_convert_ctx_access(enum bpf_access_type type,
+ const struct bpf_insn *si,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog, u32 *target_size)
+{
+ struct bpf_insn *insn = insn_buf;
+
+ switch (si->off) {
+ case offsetof(struct xdp_md, data):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data),
+ si->dst_reg, si->src_reg,
+ offsetof(struct xdp_buff, data));
+ break;
+ case offsetof(struct xdp_md, data_meta):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_meta),
+ si->dst_reg, si->src_reg,
+ offsetof(struct xdp_buff, data_meta));
+ break;
+ case offsetof(struct xdp_md, data_end):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end),
+ si->dst_reg, si->src_reg,
+ offsetof(struct xdp_buff, data_end));
+ break;
+ case offsetof(struct xdp_md, ingress_ifindex):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq),
+ si->dst_reg, si->src_reg,
+ offsetof(struct xdp_buff, rxq));
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_rxq_info, dev),
+ si->dst_reg, si->dst_reg,
+ offsetof(struct xdp_rxq_info, dev));
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+ offsetof(struct net_device, ifindex));
+ break;
+ case offsetof(struct xdp_md, rx_queue_index):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq),
+ si->dst_reg, si->src_reg,
+ offsetof(struct xdp_buff, rxq));
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+ offsetof(struct xdp_rxq_info,
+ queue_index));
+ break;
+ case offsetof(struct xdp_md, egress_ifindex):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, txq),
+ si->dst_reg, si->src_reg,
+ offsetof(struct xdp_buff, txq));
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_txq_info, dev),
+ si->dst_reg, si->dst_reg,
+ offsetof(struct xdp_txq_info, dev));
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+ offsetof(struct net_device, ifindex));
+ break;
+ }
+
+ return insn - insn_buf;
+}
+
+/* SOCK_ADDR_LOAD_NESTED_FIELD() loads Nested Field S.F.NF where S is type of
+ * context Structure, F is Field in context structure that contains a pointer
+ * to Nested Structure of type NS that has the field NF.
+ *
+ * SIZE encodes the load size (BPF_B, BPF_H, etc). It's up to caller to make
+ * sure that SIZE is not greater than actual size of S.F.NF.
+ *
+ * If offset OFF is provided, the load happens from that offset relative to
+ * offset of NF.
+ */
+#define SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, SIZE, OFF) \
+ do { \
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), si->dst_reg, \
+ si->src_reg, offsetof(S, F)); \
+ *insn++ = BPF_LDX_MEM( \
+ SIZE, si->dst_reg, si->dst_reg, \
+ bpf_target_off(NS, NF, sizeof_field(NS, NF), \
+ target_size) \
+ + OFF); \
+ } while (0)
+
+#define SOCK_ADDR_LOAD_NESTED_FIELD(S, NS, F, NF) \
+ SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, \
+ BPF_FIELD_SIZEOF(NS, NF), 0)
+
+/* SOCK_ADDR_STORE_NESTED_FIELD_OFF() has semantic similar to
+ * SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF() but for store operation.
+ *
+ * In addition it uses Temporary Field TF (member of struct S) as the 3rd
+ * "register" since two registers available in convert_ctx_access are not
+ * enough: we can't override neither SRC, since it contains value to store, nor
+ * DST since it contains pointer to context that may be used by later
+ * instructions. But we need a temporary place to save pointer to nested
+ * structure whose field we want to store to.
+ */
+#define SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, SIZE, OFF, TF) \
+ do { \
+ int tmp_reg = BPF_REG_9; \
+ if (si->src_reg == tmp_reg || si->dst_reg == tmp_reg) \
+ --tmp_reg; \
+ if (si->src_reg == tmp_reg || si->dst_reg == tmp_reg) \
+ --tmp_reg; \
+ *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, tmp_reg, \
+ offsetof(S, TF)); \
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), tmp_reg, \
+ si->dst_reg, offsetof(S, F)); \
+ *insn++ = BPF_RAW_INSN(SIZE | BPF_MEM | BPF_CLASS(si->code), \
+ tmp_reg, si->src_reg, \
+ bpf_target_off(NS, NF, sizeof_field(NS, NF), \
+ target_size) \
+ + OFF, \
+ si->imm); \
+ *insn++ = BPF_LDX_MEM(BPF_DW, tmp_reg, si->dst_reg, \
+ offsetof(S, TF)); \
+ } while (0)
+
+#define SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, SIZE, OFF, \
+ TF) \
+ do { \
+ if (type == BPF_WRITE) { \
+ SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, SIZE, \
+ OFF, TF); \
+ } else { \
+ SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF( \
+ S, NS, F, NF, SIZE, OFF); \
+ } \
+ } while (0)
+
+static u32 sock_addr_convert_ctx_access(enum bpf_access_type type,
+ const struct bpf_insn *si,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog, u32 *target_size)
+{
+ int off, port_size = sizeof_field(struct sockaddr_in6, sin6_port);
+ struct bpf_insn *insn = insn_buf;
+
+ switch (si->off) {
+ case offsetof(struct bpf_sock_addr, user_family):
+ SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern,
+ struct sockaddr, uaddr, sa_family);
+ break;
+
+ case offsetof(struct bpf_sock_addr, user_ip4):
+ SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
+ struct bpf_sock_addr_kern, struct sockaddr_in, uaddr,
+ sin_addr, BPF_SIZE(si->code), 0, tmp_reg);
+ break;
+
+ case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]):
+ off = si->off;
+ off -= offsetof(struct bpf_sock_addr, user_ip6[0]);
+ SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
+ struct bpf_sock_addr_kern, struct sockaddr_in6, uaddr,
+ sin6_addr.s6_addr32[0], BPF_SIZE(si->code), off,
+ tmp_reg);
+ break;
+
+ case offsetof(struct bpf_sock_addr, user_port):
+ /* To get port we need to know sa_family first and then treat
+ * sockaddr as either sockaddr_in or sockaddr_in6.
+ * Though we can simplify since port field has same offset and
+ * size in both structures.
+ * Here we check this invariant and use just one of the
+ * structures if it's true.
+ */
+ BUILD_BUG_ON(offsetof(struct sockaddr_in, sin_port) !=
+ offsetof(struct sockaddr_in6, sin6_port));
+ BUILD_BUG_ON(sizeof_field(struct sockaddr_in, sin_port) !=
+ sizeof_field(struct sockaddr_in6, sin6_port));
+ /* Account for sin6_port being smaller than user_port. */
+ port_size = min(port_size, BPF_LDST_BYTES(si));
+ SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
+ struct bpf_sock_addr_kern, struct sockaddr_in6, uaddr,
+ sin6_port, bytes_to_bpf_size(port_size), 0, tmp_reg);
+ break;
+
+ case offsetof(struct bpf_sock_addr, family):
+ SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern,
+ struct sock, sk, sk_family);
+ break;
+
+ case offsetof(struct bpf_sock_addr, type):
+ SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern,
+ struct sock, sk, sk_type);
+ break;
+
+ case offsetof(struct bpf_sock_addr, protocol):
+ SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern,
+ struct sock, sk, sk_protocol);
+ break;
+
+ case offsetof(struct bpf_sock_addr, msg_src_ip4):
+ /* Treat t_ctx as struct in_addr for msg_src_ip4. */
+ SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
+ struct bpf_sock_addr_kern, struct in_addr, t_ctx,
+ s_addr, BPF_SIZE(si->code), 0, tmp_reg);
+ break;
+
+ case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0],
+ msg_src_ip6[3]):
+ off = si->off;
+ off -= offsetof(struct bpf_sock_addr, msg_src_ip6[0]);
+ /* Treat t_ctx as struct in6_addr for msg_src_ip6. */
+ SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
+ struct bpf_sock_addr_kern, struct in6_addr, t_ctx,
+ s6_addr32[0], BPF_SIZE(si->code), off, tmp_reg);
+ break;
+ case offsetof(struct bpf_sock_addr, sk):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_addr_kern, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_addr_kern, sk));
+ break;
+ }
+
+ return insn - insn_buf;
+}
+
+static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
+ const struct bpf_insn *si,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog,
+ u32 *target_size)
+{
+ struct bpf_insn *insn = insn_buf;
+ int off;
+
+/* Helper macro for adding read access to tcp_sock or sock fields. */
+#define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \
+ do { \
+ int fullsock_reg = si->dst_reg, reg = BPF_REG_9, jmp = 2; \
+ BUILD_BUG_ON(sizeof_field(OBJ, OBJ_FIELD) > \
+ sizeof_field(struct bpf_sock_ops, BPF_FIELD)); \
+ if (si->dst_reg == reg || si->src_reg == reg) \
+ reg--; \
+ if (si->dst_reg == reg || si->src_reg == reg) \
+ reg--; \
+ if (si->dst_reg == si->src_reg) { \
+ *insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg, \
+ offsetof(struct bpf_sock_ops_kern, \
+ temp)); \
+ fullsock_reg = reg; \
+ jmp += 2; \
+ } \
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
+ struct bpf_sock_ops_kern, \
+ is_locked_tcp_sock), \
+ fullsock_reg, si->src_reg, \
+ offsetof(struct bpf_sock_ops_kern, \
+ is_locked_tcp_sock)); \
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, fullsock_reg, 0, jmp); \
+ if (si->dst_reg == si->src_reg) \
+ *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \
+ offsetof(struct bpf_sock_ops_kern, \
+ temp)); \
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
+ struct bpf_sock_ops_kern, sk),\
+ si->dst_reg, si->src_reg, \
+ offsetof(struct bpf_sock_ops_kern, sk));\
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(OBJ, \
+ OBJ_FIELD), \
+ si->dst_reg, si->dst_reg, \
+ offsetof(OBJ, OBJ_FIELD)); \
+ if (si->dst_reg == si->src_reg) { \
+ *insn++ = BPF_JMP_A(1); \
+ *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \
+ offsetof(struct bpf_sock_ops_kern, \
+ temp)); \
+ } \
+ } while (0)
+
+#define SOCK_OPS_GET_SK() \
+ do { \
+ int fullsock_reg = si->dst_reg, reg = BPF_REG_9, jmp = 1; \
+ if (si->dst_reg == reg || si->src_reg == reg) \
+ reg--; \
+ if (si->dst_reg == reg || si->src_reg == reg) \
+ reg--; \
+ if (si->dst_reg == si->src_reg) { \
+ *insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg, \
+ offsetof(struct bpf_sock_ops_kern, \
+ temp)); \
+ fullsock_reg = reg; \
+ jmp += 2; \
+ } \
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
+ struct bpf_sock_ops_kern, \
+ is_fullsock), \
+ fullsock_reg, si->src_reg, \
+ offsetof(struct bpf_sock_ops_kern, \
+ is_fullsock)); \
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, fullsock_reg, 0, jmp); \
+ if (si->dst_reg == si->src_reg) \
+ *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \
+ offsetof(struct bpf_sock_ops_kern, \
+ temp)); \
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
+ struct bpf_sock_ops_kern, sk),\
+ si->dst_reg, si->src_reg, \
+ offsetof(struct bpf_sock_ops_kern, sk));\
+ if (si->dst_reg == si->src_reg) { \
+ *insn++ = BPF_JMP_A(1); \
+ *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \
+ offsetof(struct bpf_sock_ops_kern, \
+ temp)); \
+ } \
+ } while (0)
+
+#define SOCK_OPS_GET_TCP_SOCK_FIELD(FIELD) \
+ SOCK_OPS_GET_FIELD(FIELD, FIELD, struct tcp_sock)
+
+/* Helper macro for adding write access to tcp_sock or sock fields.
+ * The macro is called with two registers, dst_reg which contains a pointer
+ * to ctx (context) and src_reg which contains the value that should be
+ * stored. However, we need an additional register since we cannot overwrite
+ * dst_reg because it may be used later in the program.
+ * Instead we "borrow" one of the other register. We first save its value
+ * into a new (temp) field in bpf_sock_ops_kern, use it, and then restore
+ * it at the end of the macro.
+ */
+#define SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \
+ do { \
+ int reg = BPF_REG_9; \
+ BUILD_BUG_ON(sizeof_field(OBJ, OBJ_FIELD) > \
+ sizeof_field(struct bpf_sock_ops, BPF_FIELD)); \
+ if (si->dst_reg == reg || si->src_reg == reg) \
+ reg--; \
+ if (si->dst_reg == reg || si->src_reg == reg) \
+ reg--; \
+ *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, reg, \
+ offsetof(struct bpf_sock_ops_kern, \
+ temp)); \
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
+ struct bpf_sock_ops_kern, \
+ is_locked_tcp_sock), \
+ reg, si->dst_reg, \
+ offsetof(struct bpf_sock_ops_kern, \
+ is_locked_tcp_sock)); \
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, reg, 0, 2); \
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
+ struct bpf_sock_ops_kern, sk),\
+ reg, si->dst_reg, \
+ offsetof(struct bpf_sock_ops_kern, sk));\
+ *insn++ = BPF_RAW_INSN(BPF_FIELD_SIZEOF(OBJ, OBJ_FIELD) | \
+ BPF_MEM | BPF_CLASS(si->code), \
+ reg, si->src_reg, \
+ offsetof(OBJ, OBJ_FIELD), \
+ si->imm); \
+ *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->dst_reg, \
+ offsetof(struct bpf_sock_ops_kern, \
+ temp)); \
+ } while (0)
+
+#define SOCK_OPS_GET_OR_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ, TYPE) \
+ do { \
+ if (TYPE == BPF_WRITE) \
+ SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \
+ else \
+ SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \
+ } while (0)
+
+ switch (si->off) {
+ case offsetof(struct bpf_sock_ops, op):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern,
+ op),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern, op));
+ break;
+
+ case offsetof(struct bpf_sock_ops, replylong[0]) ...
+ offsetof(struct bpf_sock_ops, replylong[3]):
+ BUILD_BUG_ON(sizeof_field(struct bpf_sock_ops, reply) !=
+ sizeof_field(struct bpf_sock_ops_kern, reply));
+ BUILD_BUG_ON(sizeof_field(struct bpf_sock_ops, replylong) !=
+ sizeof_field(struct bpf_sock_ops_kern, replylong));
+ off = si->off;
+ off -= offsetof(struct bpf_sock_ops, replylong[0]);
+ off += offsetof(struct bpf_sock_ops_kern, replylong[0]);
+ if (type == BPF_WRITE)
+ *insn++ = BPF_EMIT_STORE(BPF_W, si, off);
+ else
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+ off);
+ break;
+
+ case offsetof(struct bpf_sock_ops, family):
+ BUILD_BUG_ON(sizeof_field(struct sock_common, skc_family) != 2);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct bpf_sock_ops_kern, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern, sk));
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common, skc_family));
+ break;
+
+ case offsetof(struct bpf_sock_ops, remote_ip4):
+ BUILD_BUG_ON(sizeof_field(struct sock_common, skc_daddr) != 4);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct bpf_sock_ops_kern, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern, sk));
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common, skc_daddr));
+ break;
+
+ case offsetof(struct bpf_sock_ops, local_ip4):
+ BUILD_BUG_ON(sizeof_field(struct sock_common,
+ skc_rcv_saddr) != 4);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct bpf_sock_ops_kern, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern, sk));
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common,
+ skc_rcv_saddr));
+ break;
+
+ case offsetof(struct bpf_sock_ops, remote_ip6[0]) ...
+ offsetof(struct bpf_sock_ops, remote_ip6[3]):
+#if IS_ENABLED(CONFIG_IPV6)
+ BUILD_BUG_ON(sizeof_field(struct sock_common,
+ skc_v6_daddr.s6_addr32[0]) != 4);
+
+ off = si->off;
+ off -= offsetof(struct bpf_sock_ops, remote_ip6[0]);
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct bpf_sock_ops_kern, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern, sk));
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common,
+ skc_v6_daddr.s6_addr32[0]) +
+ off);
+#else
+ *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
+#endif
+ break;
+
+ case offsetof(struct bpf_sock_ops, local_ip6[0]) ...
+ offsetof(struct bpf_sock_ops, local_ip6[3]):
+#if IS_ENABLED(CONFIG_IPV6)
+ BUILD_BUG_ON(sizeof_field(struct sock_common,
+ skc_v6_rcv_saddr.s6_addr32[0]) != 4);
+
+ off = si->off;
+ off -= offsetof(struct bpf_sock_ops, local_ip6[0]);
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct bpf_sock_ops_kern, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern, sk));
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common,
+ skc_v6_rcv_saddr.s6_addr32[0]) +
+ off);
+#else
+ *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
+#endif
+ break;
+
+ case offsetof(struct bpf_sock_ops, remote_port):
+ BUILD_BUG_ON(sizeof_field(struct sock_common, skc_dport) != 2);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct bpf_sock_ops_kern, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern, sk));
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common, skc_dport));
+#ifndef __BIG_ENDIAN_BITFIELD
+ *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16);
+#endif
+ break;
+
+ case offsetof(struct bpf_sock_ops, local_port):
+ BUILD_BUG_ON(sizeof_field(struct sock_common, skc_num) != 2);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct bpf_sock_ops_kern, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern, sk));
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common, skc_num));
+ break;
+
+ case offsetof(struct bpf_sock_ops, is_fullsock):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct bpf_sock_ops_kern,
+ is_fullsock),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern,
+ is_fullsock));
+ break;
+
+ case offsetof(struct bpf_sock_ops, state):
+ BUILD_BUG_ON(sizeof_field(struct sock_common, skc_state) != 1);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct bpf_sock_ops_kern, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern, sk));
+ *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common, skc_state));
+ break;
+
+ case offsetof(struct bpf_sock_ops, rtt_min):
+ BUILD_BUG_ON(sizeof_field(struct tcp_sock, rtt_min) !=
+ sizeof(struct minmax));
+ BUILD_BUG_ON(sizeof(struct minmax) <
+ sizeof(struct minmax_sample));
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct bpf_sock_ops_kern, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern, sk));
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+ offsetof(struct tcp_sock, rtt_min) +
+ sizeof_field(struct minmax_sample, t));
+ break;
+
+ case offsetof(struct bpf_sock_ops, bpf_sock_ops_cb_flags):
+ SOCK_OPS_GET_FIELD(bpf_sock_ops_cb_flags, bpf_sock_ops_cb_flags,
+ struct tcp_sock);
+ break;
+
+ case offsetof(struct bpf_sock_ops, sk_txhash):
+ SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash,
+ struct sock, type);
+ break;
+ case offsetof(struct bpf_sock_ops, snd_cwnd):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(snd_cwnd);
+ break;
+ case offsetof(struct bpf_sock_ops, srtt_us):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(srtt_us);
+ break;
+ case offsetof(struct bpf_sock_ops, snd_ssthresh):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(snd_ssthresh);
+ break;
+ case offsetof(struct bpf_sock_ops, rcv_nxt):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(rcv_nxt);
+ break;
+ case offsetof(struct bpf_sock_ops, snd_nxt):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(snd_nxt);
+ break;
+ case offsetof(struct bpf_sock_ops, snd_una):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(snd_una);
+ break;
+ case offsetof(struct bpf_sock_ops, mss_cache):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(mss_cache);
+ break;
+ case offsetof(struct bpf_sock_ops, ecn_flags):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(ecn_flags);
+ break;
+ case offsetof(struct bpf_sock_ops, rate_delivered):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(rate_delivered);
+ break;
+ case offsetof(struct bpf_sock_ops, rate_interval_us):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(rate_interval_us);
+ break;
+ case offsetof(struct bpf_sock_ops, packets_out):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(packets_out);
+ break;
+ case offsetof(struct bpf_sock_ops, retrans_out):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(retrans_out);
+ break;
+ case offsetof(struct bpf_sock_ops, total_retrans):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(total_retrans);
+ break;
+ case offsetof(struct bpf_sock_ops, segs_in):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(segs_in);
+ break;
+ case offsetof(struct bpf_sock_ops, data_segs_in):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(data_segs_in);
+ break;
+ case offsetof(struct bpf_sock_ops, segs_out):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(segs_out);
+ break;
+ case offsetof(struct bpf_sock_ops, data_segs_out):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(data_segs_out);
+ break;
+ case offsetof(struct bpf_sock_ops, lost_out):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(lost_out);
+ break;
+ case offsetof(struct bpf_sock_ops, sacked_out):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(sacked_out);
+ break;
+ case offsetof(struct bpf_sock_ops, bytes_received):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(bytes_received);
+ break;
+ case offsetof(struct bpf_sock_ops, bytes_acked):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(bytes_acked);
+ break;
+ case offsetof(struct bpf_sock_ops, sk):
+ SOCK_OPS_GET_SK();
+ break;
+ case offsetof(struct bpf_sock_ops, skb_data_end):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern,
+ skb_data_end),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern,
+ skb_data_end));
+ break;
+ case offsetof(struct bpf_sock_ops, skb_data):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern,
+ skb),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern,
+ skb));
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
+ si->dst_reg, si->dst_reg,
+ offsetof(struct sk_buff, data));
+ break;
+ case offsetof(struct bpf_sock_ops, skb_len):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern,
+ skb),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern,
+ skb));
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, len),
+ si->dst_reg, si->dst_reg,
+ offsetof(struct sk_buff, len));
+ break;
+ case offsetof(struct bpf_sock_ops, skb_tcp_flags):
+ off = offsetof(struct sk_buff, cb);
+ off += offsetof(struct tcp_skb_cb, tcp_flags);
+ *target_size = sizeof_field(struct tcp_skb_cb, tcp_flags);
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern,
+ skb),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern,
+ skb));
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct tcp_skb_cb,
+ tcp_flags),
+ si->dst_reg, si->dst_reg, off);
+ break;
+ case offsetof(struct bpf_sock_ops, skb_hwtstamp): {
+ struct bpf_insn *jmp_on_null_skb;
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern,
+ skb),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern,
+ skb));
+ /* Reserve one insn to test skb == NULL */
+ jmp_on_null_skb = insn++;
+ insn = bpf_convert_shinfo_access(si->dst_reg, si->dst_reg, insn);
+ *insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg,
+ bpf_target_off(struct skb_shared_info,
+ hwtstamps, 8,
+ target_size));
+ *jmp_on_null_skb = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0,
+ insn - jmp_on_null_skb - 1);
+ break;
+ }
+ }
+ return insn - insn_buf;
+}
+
+/* data_end = skb->data + skb_headlen() */
+static struct bpf_insn *bpf_convert_data_end_access(const struct bpf_insn *si,
+ struct bpf_insn *insn)
+{
+ int reg;
+ int temp_reg_off = offsetof(struct sk_buff, cb) +
+ offsetof(struct sk_skb_cb, temp_reg);
+
+ if (si->src_reg == si->dst_reg) {
+ /* We need an extra register, choose and save a register. */
+ reg = BPF_REG_9;
+ if (si->src_reg == reg || si->dst_reg == reg)
+ reg--;
+ if (si->src_reg == reg || si->dst_reg == reg)
+ reg--;
+ *insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg, temp_reg_off);
+ } else {
+ reg = si->dst_reg;
+ }
+
+ /* reg = skb->data */
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
+ reg, si->src_reg,
+ offsetof(struct sk_buff, data));
+ /* AX = skb->len */
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, len),
+ BPF_REG_AX, si->src_reg,
+ offsetof(struct sk_buff, len));
+ /* reg = skb->data + skb->len */
+ *insn++ = BPF_ALU64_REG(BPF_ADD, reg, BPF_REG_AX);
+ /* AX = skb->data_len */
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data_len),
+ BPF_REG_AX, si->src_reg,
+ offsetof(struct sk_buff, data_len));
+
+ /* reg = skb->data + skb->len - skb->data_len */
+ *insn++ = BPF_ALU64_REG(BPF_SUB, reg, BPF_REG_AX);
+
+ if (si->src_reg == si->dst_reg) {
+ /* Restore the saved register */
+ *insn++ = BPF_MOV64_REG(BPF_REG_AX, si->src_reg);
+ *insn++ = BPF_MOV64_REG(si->dst_reg, reg);
+ *insn++ = BPF_LDX_MEM(BPF_DW, reg, BPF_REG_AX, temp_reg_off);
+ }
+
+ return insn;
+}
+
+static u32 sk_skb_convert_ctx_access(enum bpf_access_type type,
+ const struct bpf_insn *si,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog, u32 *target_size)
+{
+ struct bpf_insn *insn = insn_buf;
+ int off;
+
+ switch (si->off) {
+ case offsetof(struct __sk_buff, data_end):
+ insn = bpf_convert_data_end_access(si, insn);
+ break;
+ case offsetof(struct __sk_buff, cb[0]) ...
+ offsetofend(struct __sk_buff, cb[4]) - 1:
+ BUILD_BUG_ON(sizeof_field(struct sk_skb_cb, data) < 20);
+ BUILD_BUG_ON((offsetof(struct sk_buff, cb) +
+ offsetof(struct sk_skb_cb, data)) %
+ sizeof(__u64));
+
+ prog->cb_access = 1;
+ off = si->off;
+ off -= offsetof(struct __sk_buff, cb[0]);
+ off += offsetof(struct sk_buff, cb);
+ off += offsetof(struct sk_skb_cb, data);
+ if (type == BPF_WRITE)
+ *insn++ = BPF_EMIT_STORE(BPF_SIZE(si->code), si, off);
+ else
+ *insn++ = BPF_LDX_MEM(BPF_SIZE(si->code), si->dst_reg,
+ si->src_reg, off);
+ break;
+
+
+ default:
+ return bpf_convert_ctx_access(type, si, insn_buf, prog,
+ target_size);
+ }
+
+ return insn - insn_buf;
+}
+
+static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
+ const struct bpf_insn *si,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog, u32 *target_size)
+{
+ struct bpf_insn *insn = insn_buf;
+#if IS_ENABLED(CONFIG_IPV6)
+ int off;
+#endif
+
+ /* convert ctx uses the fact sg element is first in struct */
+ BUILD_BUG_ON(offsetof(struct sk_msg, sg) != 0);
+
+ switch (si->off) {
+ case offsetof(struct sk_msg_md, data):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_msg, data));
+ break;
+ case offsetof(struct sk_msg_md, data_end):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data_end),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_msg, data_end));
+ break;
+ case offsetof(struct sk_msg_md, family):
+ BUILD_BUG_ON(sizeof_field(struct sock_common, skc_family) != 2);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct sk_msg, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_msg, sk));
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common, skc_family));
+ break;
+
+ case offsetof(struct sk_msg_md, remote_ip4):
+ BUILD_BUG_ON(sizeof_field(struct sock_common, skc_daddr) != 4);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct sk_msg, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_msg, sk));
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common, skc_daddr));
+ break;
+
+ case offsetof(struct sk_msg_md, local_ip4):
+ BUILD_BUG_ON(sizeof_field(struct sock_common,
+ skc_rcv_saddr) != 4);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct sk_msg, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_msg, sk));
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common,
+ skc_rcv_saddr));
+ break;
+
+ case offsetof(struct sk_msg_md, remote_ip6[0]) ...
+ offsetof(struct sk_msg_md, remote_ip6[3]):
+#if IS_ENABLED(CONFIG_IPV6)
+ BUILD_BUG_ON(sizeof_field(struct sock_common,
+ skc_v6_daddr.s6_addr32[0]) != 4);
+
+ off = si->off;
+ off -= offsetof(struct sk_msg_md, remote_ip6[0]);
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct sk_msg, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_msg, sk));
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common,
+ skc_v6_daddr.s6_addr32[0]) +
+ off);
+#else
+ *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
+#endif
+ break;
+
+ case offsetof(struct sk_msg_md, local_ip6[0]) ...
+ offsetof(struct sk_msg_md, local_ip6[3]):
+#if IS_ENABLED(CONFIG_IPV6)
+ BUILD_BUG_ON(sizeof_field(struct sock_common,
+ skc_v6_rcv_saddr.s6_addr32[0]) != 4);
+
+ off = si->off;
+ off -= offsetof(struct sk_msg_md, local_ip6[0]);
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct sk_msg, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_msg, sk));
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common,
+ skc_v6_rcv_saddr.s6_addr32[0]) +
+ off);
+#else
+ *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
+#endif
+ break;
+
+ case offsetof(struct sk_msg_md, remote_port):
+ BUILD_BUG_ON(sizeof_field(struct sock_common, skc_dport) != 2);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct sk_msg, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_msg, sk));
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common, skc_dport));
+#ifndef __BIG_ENDIAN_BITFIELD
+ *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16);
+#endif
+ break;
+
+ case offsetof(struct sk_msg_md, local_port):
+ BUILD_BUG_ON(sizeof_field(struct sock_common, skc_num) != 2);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct sk_msg, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_msg, sk));
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common, skc_num));
+ break;
+
+ case offsetof(struct sk_msg_md, size):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_sg, size),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_msg_sg, size));
+ break;
+
+ case offsetof(struct sk_msg_md, sk):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_msg, sk));
+ break;
+ }
+
+ return insn - insn_buf;
+}
+
+const struct bpf_verifier_ops sk_filter_verifier_ops = {
+ .get_func_proto = sk_filter_func_proto,
+ .is_valid_access = sk_filter_is_valid_access,
+ .convert_ctx_access = bpf_convert_ctx_access,
+ .gen_ld_abs = bpf_gen_ld_abs,
+};
+
+const struct bpf_prog_ops sk_filter_prog_ops = {
+ .test_run = bpf_prog_test_run_skb,
+};
+
+const struct bpf_verifier_ops tc_cls_act_verifier_ops = {
+ .get_func_proto = tc_cls_act_func_proto,
+ .is_valid_access = tc_cls_act_is_valid_access,
+ .convert_ctx_access = tc_cls_act_convert_ctx_access,
+ .gen_prologue = tc_cls_act_prologue,
+ .gen_ld_abs = bpf_gen_ld_abs,
+ .btf_struct_access = tc_cls_act_btf_struct_access,
+};
+
+const struct bpf_prog_ops tc_cls_act_prog_ops = {
+ .test_run = bpf_prog_test_run_skb,
+};
+
+const struct bpf_verifier_ops xdp_verifier_ops = {
+ .get_func_proto = xdp_func_proto,
+ .is_valid_access = xdp_is_valid_access,
+ .convert_ctx_access = xdp_convert_ctx_access,
+ .gen_prologue = bpf_noop_prologue,
+ .btf_struct_access = xdp_btf_struct_access,
+};
+
+const struct bpf_prog_ops xdp_prog_ops = {
+ .test_run = bpf_prog_test_run_xdp,
+};
+
+const struct bpf_verifier_ops cg_skb_verifier_ops = {
+ .get_func_proto = cg_skb_func_proto,
+ .is_valid_access = cg_skb_is_valid_access,
+ .convert_ctx_access = bpf_convert_ctx_access,
+};
+
+const struct bpf_prog_ops cg_skb_prog_ops = {
+ .test_run = bpf_prog_test_run_skb,
+};
+
+const struct bpf_verifier_ops lwt_in_verifier_ops = {
+ .get_func_proto = lwt_in_func_proto,
+ .is_valid_access = lwt_is_valid_access,
+ .convert_ctx_access = bpf_convert_ctx_access,
+};
+
+const struct bpf_prog_ops lwt_in_prog_ops = {
+ .test_run = bpf_prog_test_run_skb,
+};
+
+const struct bpf_verifier_ops lwt_out_verifier_ops = {
+ .get_func_proto = lwt_out_func_proto,
+ .is_valid_access = lwt_is_valid_access,
+ .convert_ctx_access = bpf_convert_ctx_access,
+};
+
+const struct bpf_prog_ops lwt_out_prog_ops = {
+ .test_run = bpf_prog_test_run_skb,
+};
+
+const struct bpf_verifier_ops lwt_xmit_verifier_ops = {
+ .get_func_proto = lwt_xmit_func_proto,
+ .is_valid_access = lwt_is_valid_access,
+ .convert_ctx_access = bpf_convert_ctx_access,
+ .gen_prologue = tc_cls_act_prologue,
+};
+
+const struct bpf_prog_ops lwt_xmit_prog_ops = {
+ .test_run = bpf_prog_test_run_skb,
+};
+
+const struct bpf_verifier_ops lwt_seg6local_verifier_ops = {
+ .get_func_proto = lwt_seg6local_func_proto,
+ .is_valid_access = lwt_is_valid_access,
+ .convert_ctx_access = bpf_convert_ctx_access,
+};
+
+const struct bpf_prog_ops lwt_seg6local_prog_ops = {
+};
+
+const struct bpf_verifier_ops cg_sock_verifier_ops = {
+ .get_func_proto = sock_filter_func_proto,
+ .is_valid_access = sock_filter_is_valid_access,
+ .convert_ctx_access = bpf_sock_convert_ctx_access,
+};
+
+const struct bpf_prog_ops cg_sock_prog_ops = {
+};
+
+const struct bpf_verifier_ops cg_sock_addr_verifier_ops = {
+ .get_func_proto = sock_addr_func_proto,
+ .is_valid_access = sock_addr_is_valid_access,
+ .convert_ctx_access = sock_addr_convert_ctx_access,
+};
+
+const struct bpf_prog_ops cg_sock_addr_prog_ops = {
+};
+
+const struct bpf_verifier_ops sock_ops_verifier_ops = {
+ .get_func_proto = sock_ops_func_proto,
+ .is_valid_access = sock_ops_is_valid_access,
+ .convert_ctx_access = sock_ops_convert_ctx_access,
+};
+
+const struct bpf_prog_ops sock_ops_prog_ops = {
+};
+
+const struct bpf_verifier_ops sk_skb_verifier_ops = {
+ .get_func_proto = sk_skb_func_proto,
+ .is_valid_access = sk_skb_is_valid_access,
+ .convert_ctx_access = sk_skb_convert_ctx_access,
+ .gen_prologue = sk_skb_prologue,
+};
+
+const struct bpf_prog_ops sk_skb_prog_ops = {
+};
+
+const struct bpf_verifier_ops sk_msg_verifier_ops = {
+ .get_func_proto = sk_msg_func_proto,
+ .is_valid_access = sk_msg_is_valid_access,
+ .convert_ctx_access = sk_msg_convert_ctx_access,
+ .gen_prologue = bpf_noop_prologue,
+};
+
+const struct bpf_prog_ops sk_msg_prog_ops = {
+};
+
+const struct bpf_verifier_ops flow_dissector_verifier_ops = {
+ .get_func_proto = flow_dissector_func_proto,
+ .is_valid_access = flow_dissector_is_valid_access,
+ .convert_ctx_access = flow_dissector_convert_ctx_access,
+};
+
+const struct bpf_prog_ops flow_dissector_prog_ops = {
+ .test_run = bpf_prog_test_run_flow_dissector,
+};
+
+int sk_detach_filter(struct sock *sk)
+{
+ int ret = -ENOENT;
+ struct sk_filter *filter;
+
+ if (sock_flag(sk, SOCK_FILTER_LOCKED))
+ return -EPERM;
+
+ filter = rcu_dereference_protected(sk->sk_filter,
+ lockdep_sock_is_held(sk));
+ if (filter) {
+ RCU_INIT_POINTER(sk->sk_filter, NULL);
+ sk_filter_uncharge(sk, filter);
+ ret = 0;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(sk_detach_filter);
+
+int sk_get_filter(struct sock *sk, sockptr_t optval, unsigned int len)
+{
+ struct sock_fprog_kern *fprog;
+ struct sk_filter *filter;
+ int ret = 0;
+
+ sockopt_lock_sock(sk);
+ filter = rcu_dereference_protected(sk->sk_filter,
+ lockdep_sock_is_held(sk));
+ if (!filter)
+ goto out;
+
+ /* We're copying the filter that has been originally attached,
+ * so no conversion/decode needed anymore. eBPF programs that
+ * have no original program cannot be dumped through this.
+ */
+ ret = -EACCES;
+ fprog = filter->prog->orig_prog;
+ if (!fprog)
+ goto out;
+
+ ret = fprog->len;
+ if (!len)
+ /* User space only enquires number of filter blocks. */
+ goto out;
+
+ ret = -EINVAL;
+ if (len < fprog->len)
+ goto out;
+
+ ret = -EFAULT;
+ if (copy_to_sockptr(optval, fprog->filter, bpf_classic_proglen(fprog)))
+ goto out;
+
+ /* Instead of bytes, the API requests to return the number
+ * of filter blocks.
+ */
+ ret = fprog->len;
+out:
+ sockopt_release_sock(sk);
+ return ret;
+}
+
+#ifdef CONFIG_INET
+static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern,
+ struct sock_reuseport *reuse,
+ struct sock *sk, struct sk_buff *skb,
+ struct sock *migrating_sk,
+ u32 hash)
+{
+ reuse_kern->skb = skb;
+ reuse_kern->sk = sk;
+ reuse_kern->selected_sk = NULL;
+ reuse_kern->migrating_sk = migrating_sk;
+ reuse_kern->data_end = skb->data + skb_headlen(skb);
+ reuse_kern->hash = hash;
+ reuse_kern->reuseport_id = reuse->reuseport_id;
+ reuse_kern->bind_inany = reuse->bind_inany;
+}
+
+struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
+ struct bpf_prog *prog, struct sk_buff *skb,
+ struct sock *migrating_sk,
+ u32 hash)
+{
+ struct sk_reuseport_kern reuse_kern;
+ enum sk_action action;
+
+ bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, migrating_sk, hash);
+ action = bpf_prog_run(prog, &reuse_kern);
+
+ if (action == SK_PASS)
+ return reuse_kern.selected_sk;
+ else
+ return ERR_PTR(-ECONNREFUSED);
+}
+
+BPF_CALL_4(sk_select_reuseport, struct sk_reuseport_kern *, reuse_kern,
+ struct bpf_map *, map, void *, key, u32, flags)
+{
+ bool is_sockarray = map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY;
+ struct sock_reuseport *reuse;
+ struct sock *selected_sk;
+ int err;
+
+ selected_sk = map->ops->map_lookup_elem(map, key);
+ if (!selected_sk)
+ return -ENOENT;
+
+ reuse = rcu_dereference(selected_sk->sk_reuseport_cb);
+ if (!reuse) {
+ /* reuseport_array has only sk with non NULL sk_reuseport_cb.
+ * The only (!reuse) case here is - the sk has already been
+ * unhashed (e.g. by close()), so treat it as -ENOENT.
+ *
+ * Other maps (e.g. sock_map) do not provide this guarantee and
+ * the sk may never be in the reuseport group to begin with.
+ */
+ err = is_sockarray ? -ENOENT : -EINVAL;
+ goto error;
+ }
+
+ if (unlikely(reuse->reuseport_id != reuse_kern->reuseport_id)) {
+ struct sock *sk = reuse_kern->sk;
+
+ if (sk->sk_protocol != selected_sk->sk_protocol) {
+ err = -EPROTOTYPE;
+ } else if (sk->sk_family != selected_sk->sk_family) {
+ err = -EAFNOSUPPORT;
+ } else {
+ /* Catch all. Likely bound to a different sockaddr. */
+ err = -EBADFD;
+ }
+ goto error;
+ }
+
+ reuse_kern->selected_sk = selected_sk;
+
+ return 0;
+error:
+ /* Lookup in sock_map can return TCP ESTABLISHED sockets. */
+ if (sk_is_refcounted(selected_sk))
+ sock_put(selected_sk);
+
+ return err;
+}
+
+static const struct bpf_func_proto sk_select_reuseport_proto = {
+ .func = sk_select_reuseport,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_PTR_TO_MAP_KEY,
+ .arg4_type = ARG_ANYTHING,
+};
+
+BPF_CALL_4(sk_reuseport_load_bytes,
+ const struct sk_reuseport_kern *, reuse_kern, u32, offset,
+ void *, to, u32, len)
+{
+ return ____bpf_skb_load_bytes(reuse_kern->skb, offset, to, len);
+}
+
+static const struct bpf_func_proto sk_reuseport_load_bytes_proto = {
+ .func = sk_reuseport_load_bytes,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg4_type = ARG_CONST_SIZE,
+};
+
+BPF_CALL_5(sk_reuseport_load_bytes_relative,
+ const struct sk_reuseport_kern *, reuse_kern, u32, offset,
+ void *, to, u32, len, u32, start_header)
+{
+ return ____bpf_skb_load_bytes_relative(reuse_kern->skb, offset, to,
+ len, start_header);
+}
+
+static const struct bpf_func_proto sk_reuseport_load_bytes_relative_proto = {
+ .func = sk_reuseport_load_bytes_relative,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg4_type = ARG_CONST_SIZE,
+ .arg5_type = ARG_ANYTHING,
+};
+
+static const struct bpf_func_proto *
+sk_reuseport_func_proto(enum bpf_func_id func_id,
+ const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ case BPF_FUNC_sk_select_reuseport:
+ return &sk_select_reuseport_proto;
+ case BPF_FUNC_skb_load_bytes:
+ return &sk_reuseport_load_bytes_proto;
+ case BPF_FUNC_skb_load_bytes_relative:
+ return &sk_reuseport_load_bytes_relative_proto;
+ case BPF_FUNC_get_socket_cookie:
+ return &bpf_get_socket_ptr_cookie_proto;
+ case BPF_FUNC_ktime_get_coarse_ns:
+ return &bpf_ktime_get_coarse_ns_proto;
+ default:
+ return bpf_base_func_proto(func_id, prog);
+ }
+}
+
+static bool
+sk_reuseport_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ const u32 size_default = sizeof(__u32);
+
+ if (off < 0 || off >= sizeof(struct sk_reuseport_md) ||
+ off % size || type != BPF_READ)
+ return false;
+
+ switch (off) {
+ case offsetof(struct sk_reuseport_md, data):
+ info->reg_type = PTR_TO_PACKET;
+ return size == sizeof(__u64);
+
+ case offsetof(struct sk_reuseport_md, data_end):
+ info->reg_type = PTR_TO_PACKET_END;
+ return size == sizeof(__u64);
+
+ case offsetof(struct sk_reuseport_md, hash):
+ return size == size_default;
+
+ case offsetof(struct sk_reuseport_md, sk):
+ info->reg_type = PTR_TO_SOCKET;
+ return size == sizeof(__u64);
+
+ case offsetof(struct sk_reuseport_md, migrating_sk):
+ info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL;
+ return size == sizeof(__u64);
+
+ /* Fields that allow narrowing */
+ case bpf_ctx_range(struct sk_reuseport_md, eth_protocol):
+ if (size < sizeof_field(struct sk_buff, protocol))
+ return false;
+ fallthrough;
+ case bpf_ctx_range(struct sk_reuseport_md, ip_protocol):
+ case bpf_ctx_range(struct sk_reuseport_md, bind_inany):
+ case bpf_ctx_range(struct sk_reuseport_md, len):
+ bpf_ctx_record_field_size(info, size_default);
+ return bpf_ctx_narrow_access_ok(off, size, size_default);
+
+ default:
+ return false;
+ }
+}
+
+#define SK_REUSEPORT_LOAD_FIELD(F) ({ \
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_reuseport_kern, F), \
+ si->dst_reg, si->src_reg, \
+ bpf_target_off(struct sk_reuseport_kern, F, \
+ sizeof_field(struct sk_reuseport_kern, F), \
+ target_size)); \
+ })
+
+#define SK_REUSEPORT_LOAD_SKB_FIELD(SKB_FIELD) \
+ SOCK_ADDR_LOAD_NESTED_FIELD(struct sk_reuseport_kern, \
+ struct sk_buff, \
+ skb, \
+ SKB_FIELD)
+
+#define SK_REUSEPORT_LOAD_SK_FIELD(SK_FIELD) \
+ SOCK_ADDR_LOAD_NESTED_FIELD(struct sk_reuseport_kern, \
+ struct sock, \
+ sk, \
+ SK_FIELD)
+
+static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type,
+ const struct bpf_insn *si,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog,
+ u32 *target_size)
+{
+ struct bpf_insn *insn = insn_buf;
+
+ switch (si->off) {
+ case offsetof(struct sk_reuseport_md, data):
+ SK_REUSEPORT_LOAD_SKB_FIELD(data);
+ break;
+
+ case offsetof(struct sk_reuseport_md, len):
+ SK_REUSEPORT_LOAD_SKB_FIELD(len);
+ break;
+
+ case offsetof(struct sk_reuseport_md, eth_protocol):
+ SK_REUSEPORT_LOAD_SKB_FIELD(protocol);
+ break;
+
+ case offsetof(struct sk_reuseport_md, ip_protocol):
+ SK_REUSEPORT_LOAD_SK_FIELD(sk_protocol);
+ break;
+
+ case offsetof(struct sk_reuseport_md, data_end):
+ SK_REUSEPORT_LOAD_FIELD(data_end);
+ break;
+
+ case offsetof(struct sk_reuseport_md, hash):
+ SK_REUSEPORT_LOAD_FIELD(hash);
+ break;
+
+ case offsetof(struct sk_reuseport_md, bind_inany):
+ SK_REUSEPORT_LOAD_FIELD(bind_inany);
+ break;
+
+ case offsetof(struct sk_reuseport_md, sk):
+ SK_REUSEPORT_LOAD_FIELD(sk);
+ break;
+
+ case offsetof(struct sk_reuseport_md, migrating_sk):
+ SK_REUSEPORT_LOAD_FIELD(migrating_sk);
+ break;
+ }
+
+ return insn - insn_buf;
+}
+
+const struct bpf_verifier_ops sk_reuseport_verifier_ops = {
+ .get_func_proto = sk_reuseport_func_proto,
+ .is_valid_access = sk_reuseport_is_valid_access,
+ .convert_ctx_access = sk_reuseport_convert_ctx_access,
+};
+
+const struct bpf_prog_ops sk_reuseport_prog_ops = {
+};
+
+DEFINE_STATIC_KEY_FALSE(bpf_sk_lookup_enabled);
+EXPORT_SYMBOL(bpf_sk_lookup_enabled);
+
+BPF_CALL_3(bpf_sk_lookup_assign, struct bpf_sk_lookup_kern *, ctx,
+ struct sock *, sk, u64, flags)
+{
+ if (unlikely(flags & ~(BPF_SK_LOOKUP_F_REPLACE |
+ BPF_SK_LOOKUP_F_NO_REUSEPORT)))
+ return -EINVAL;
+ if (unlikely(sk && sk_is_refcounted(sk)))
+ return -ESOCKTNOSUPPORT; /* reject non-RCU freed sockets */
+ if (unlikely(sk && sk_is_tcp(sk) && sk->sk_state != TCP_LISTEN))
+ return -ESOCKTNOSUPPORT; /* only accept TCP socket in LISTEN */
+ if (unlikely(sk && sk_is_udp(sk) && sk->sk_state != TCP_CLOSE))
+ return -ESOCKTNOSUPPORT; /* only accept UDP socket in CLOSE */
+
+ /* Check if socket is suitable for packet L3/L4 protocol */
+ if (sk && sk->sk_protocol != ctx->protocol)
+ return -EPROTOTYPE;
+ if (sk && sk->sk_family != ctx->family &&
+ (sk->sk_family == AF_INET || ipv6_only_sock(sk)))
+ return -EAFNOSUPPORT;
+
+ if (ctx->selected_sk && !(flags & BPF_SK_LOOKUP_F_REPLACE))
+ return -EEXIST;
+
+ /* Select socket as lookup result */
+ ctx->selected_sk = sk;
+ ctx->no_reuseport = flags & BPF_SK_LOOKUP_F_NO_REUSEPORT;
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_sk_lookup_assign_proto = {
+ .func = bpf_sk_lookup_assign,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_SOCKET_OR_NULL,
+ .arg3_type = ARG_ANYTHING,
+};
+
+static const struct bpf_func_proto *
+sk_lookup_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ case BPF_FUNC_perf_event_output:
+ return &bpf_event_output_data_proto;
+ case BPF_FUNC_sk_assign:
+ return &bpf_sk_lookup_assign_proto;
+ case BPF_FUNC_sk_release:
+ return &bpf_sk_release_proto;
+ default:
+ return bpf_sk_base_func_proto(func_id, prog);
+ }
+}
+
+static bool sk_lookup_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ if (off < 0 || off >= sizeof(struct bpf_sk_lookup))
+ return false;
+ if (off % size != 0)
+ return false;
+ if (type != BPF_READ)
+ return false;
+
+ switch (off) {
+ case bpf_ctx_range_ptr(struct bpf_sk_lookup, sk):
+ info->reg_type = PTR_TO_SOCKET_OR_NULL;
+ return size == sizeof(__u64);
+
+ case bpf_ctx_range(struct bpf_sk_lookup, family):
+ case bpf_ctx_range(struct bpf_sk_lookup, protocol):
+ case bpf_ctx_range(struct bpf_sk_lookup, remote_ip4):
+ case bpf_ctx_range(struct bpf_sk_lookup, local_ip4):
+ case bpf_ctx_range_till(struct bpf_sk_lookup, remote_ip6[0], remote_ip6[3]):
+ case bpf_ctx_range_till(struct bpf_sk_lookup, local_ip6[0], local_ip6[3]):
+ case bpf_ctx_range(struct bpf_sk_lookup, local_port):
+ case bpf_ctx_range(struct bpf_sk_lookup, ingress_ifindex):
+ bpf_ctx_record_field_size(info, sizeof(__u32));
+ return bpf_ctx_narrow_access_ok(off, size, sizeof(__u32));
+
+ case bpf_ctx_range(struct bpf_sk_lookup, remote_port):
+ /* Allow 4-byte access to 2-byte field for backward compatibility */
+ if (size == sizeof(__u32))
+ return true;
+ bpf_ctx_record_field_size(info, sizeof(__be16));
+ return bpf_ctx_narrow_access_ok(off, size, sizeof(__be16));
+
+ case offsetofend(struct bpf_sk_lookup, remote_port) ...
+ offsetof(struct bpf_sk_lookup, local_ip4) - 1:
+ /* Allow access to zero padding for backward compatibility */
+ bpf_ctx_record_field_size(info, sizeof(__u16));
+ return bpf_ctx_narrow_access_ok(off, size, sizeof(__u16));
+
+ default:
+ return false;
+ }
+}
+
+static u32 sk_lookup_convert_ctx_access(enum bpf_access_type type,
+ const struct bpf_insn *si,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog,
+ u32 *target_size)
+{
+ struct bpf_insn *insn = insn_buf;
+
+ switch (si->off) {
+ case offsetof(struct bpf_sk_lookup, sk):
+ *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sk_lookup_kern, selected_sk));
+ break;
+
+ case offsetof(struct bpf_sk_lookup, family):
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
+ bpf_target_off(struct bpf_sk_lookup_kern,
+ family, 2, target_size));
+ break;
+
+ case offsetof(struct bpf_sk_lookup, protocol):
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
+ bpf_target_off(struct bpf_sk_lookup_kern,
+ protocol, 2, target_size));
+ break;
+
+ case offsetof(struct bpf_sk_lookup, remote_ip4):
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+ bpf_target_off(struct bpf_sk_lookup_kern,
+ v4.saddr, 4, target_size));
+ break;
+
+ case offsetof(struct bpf_sk_lookup, local_ip4):
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+ bpf_target_off(struct bpf_sk_lookup_kern,
+ v4.daddr, 4, target_size));
+ break;
+
+ case bpf_ctx_range_till(struct bpf_sk_lookup,
+ remote_ip6[0], remote_ip6[3]): {
+#if IS_ENABLED(CONFIG_IPV6)
+ int off = si->off;
+
+ off -= offsetof(struct bpf_sk_lookup, remote_ip6[0]);
+ off += bpf_target_off(struct in6_addr, s6_addr32[0], 4, target_size);
+ *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sk_lookup_kern, v6.saddr));
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, off);
+#else
+ *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
+#endif
+ break;
+ }
+ case bpf_ctx_range_till(struct bpf_sk_lookup,
+ local_ip6[0], local_ip6[3]): {
+#if IS_ENABLED(CONFIG_IPV6)
+ int off = si->off;
+
+ off -= offsetof(struct bpf_sk_lookup, local_ip6[0]);
+ off += bpf_target_off(struct in6_addr, s6_addr32[0], 4, target_size);
+ *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sk_lookup_kern, v6.daddr));
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, off);
+#else
+ *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
+#endif
+ break;
+ }
+ case offsetof(struct bpf_sk_lookup, remote_port):
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
+ bpf_target_off(struct bpf_sk_lookup_kern,
+ sport, 2, target_size));
+ break;
+
+ case offsetofend(struct bpf_sk_lookup, remote_port):
+ *target_size = 2;
+ *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
+ break;
+
+ case offsetof(struct bpf_sk_lookup, local_port):
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
+ bpf_target_off(struct bpf_sk_lookup_kern,
+ dport, 2, target_size));
+ break;
+
+ case offsetof(struct bpf_sk_lookup, ingress_ifindex):
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+ bpf_target_off(struct bpf_sk_lookup_kern,
+ ingress_ifindex, 4, target_size));
+ break;
+ }
+
+ return insn - insn_buf;
+}
+
+const struct bpf_prog_ops sk_lookup_prog_ops = {
+ .test_run = bpf_prog_test_run_sk_lookup,
+};
+
+const struct bpf_verifier_ops sk_lookup_verifier_ops = {
+ .get_func_proto = sk_lookup_func_proto,
+ .is_valid_access = sk_lookup_is_valid_access,
+ .convert_ctx_access = sk_lookup_convert_ctx_access,
+};
+
+#endif /* CONFIG_INET */
+
+DEFINE_BPF_DISPATCHER(xdp)
+
+void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog)
+{
+ bpf_dispatcher_change_prog(BPF_DISPATCHER_PTR(xdp), prev_prog, prog);
+}
+
+BTF_ID_LIST_GLOBAL(btf_sock_ids, MAX_BTF_SOCK_TYPE)
+#define BTF_SOCK_TYPE(name, type) BTF_ID(struct, type)
+BTF_SOCK_TYPE_xxx
+#undef BTF_SOCK_TYPE
+
+BPF_CALL_1(bpf_skc_to_tcp6_sock, struct sock *, sk)
+{
+ /* tcp6_sock type is not generated in dwarf and hence btf,
+ * trigger an explicit type generation here.
+ */
+ BTF_TYPE_EMIT(struct tcp6_sock);
+ if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP &&
+ sk->sk_family == AF_INET6)
+ return (unsigned long)sk;
+
+ return (unsigned long)NULL;
+}
+
+const struct bpf_func_proto bpf_skc_to_tcp6_sock_proto = {
+ .func = bpf_skc_to_tcp6_sock,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
+ .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+ .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP6],
+};
+
+BPF_CALL_1(bpf_skc_to_tcp_sock, struct sock *, sk)
+{
+ if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP)
+ return (unsigned long)sk;
+
+ return (unsigned long)NULL;
+}
+
+const struct bpf_func_proto bpf_skc_to_tcp_sock_proto = {
+ .func = bpf_skc_to_tcp_sock,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
+ .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+ .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP],
+};
+
+BPF_CALL_1(bpf_skc_to_tcp_timewait_sock, struct sock *, sk)
+{
+ /* BTF types for tcp_timewait_sock and inet_timewait_sock are not
+ * generated if CONFIG_INET=n. Trigger an explicit generation here.
+ */
+ BTF_TYPE_EMIT(struct inet_timewait_sock);
+ BTF_TYPE_EMIT(struct tcp_timewait_sock);
+
+#ifdef CONFIG_INET
+ if (sk && sk->sk_prot == &tcp_prot && sk->sk_state == TCP_TIME_WAIT)
+ return (unsigned long)sk;
+#endif
+
+#if IS_BUILTIN(CONFIG_IPV6)
+ if (sk && sk->sk_prot == &tcpv6_prot && sk->sk_state == TCP_TIME_WAIT)
+ return (unsigned long)sk;
+#endif
+
+ return (unsigned long)NULL;
+}
+
+const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto = {
+ .func = bpf_skc_to_tcp_timewait_sock,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
+ .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+ .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP_TW],
+};
+
+BPF_CALL_1(bpf_skc_to_tcp_request_sock, struct sock *, sk)
+{
+#ifdef CONFIG_INET
+ if (sk && sk->sk_prot == &tcp_prot && sk->sk_state == TCP_NEW_SYN_RECV)
+ return (unsigned long)sk;
+#endif
+
+#if IS_BUILTIN(CONFIG_IPV6)
+ if (sk && sk->sk_prot == &tcpv6_prot && sk->sk_state == TCP_NEW_SYN_RECV)
+ return (unsigned long)sk;
+#endif
+
+ return (unsigned long)NULL;
+}
+
+const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto = {
+ .func = bpf_skc_to_tcp_request_sock,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
+ .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+ .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP_REQ],
+};
+
+BPF_CALL_1(bpf_skc_to_udp6_sock, struct sock *, sk)
+{
+ /* udp6_sock type is not generated in dwarf and hence btf,
+ * trigger an explicit type generation here.
+ */
+ BTF_TYPE_EMIT(struct udp6_sock);
+ if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_UDP &&
+ sk->sk_type == SOCK_DGRAM && sk->sk_family == AF_INET6)
+ return (unsigned long)sk;
+
+ return (unsigned long)NULL;
+}
+
+const struct bpf_func_proto bpf_skc_to_udp6_sock_proto = {
+ .func = bpf_skc_to_udp6_sock,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
+ .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+ .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_UDP6],
+};
+
+BPF_CALL_1(bpf_skc_to_unix_sock, struct sock *, sk)
+{
+ /* unix_sock type is not generated in dwarf and hence btf,
+ * trigger an explicit type generation here.
+ */
+ BTF_TYPE_EMIT(struct unix_sock);
+ if (sk && sk_fullsock(sk) && sk->sk_family == AF_UNIX)
+ return (unsigned long)sk;
+
+ return (unsigned long)NULL;
+}
+
+const struct bpf_func_proto bpf_skc_to_unix_sock_proto = {
+ .func = bpf_skc_to_unix_sock,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
+ .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+ .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_UNIX],
+};
+
+BPF_CALL_1(bpf_skc_to_mptcp_sock, struct sock *, sk)
+{
+ BTF_TYPE_EMIT(struct mptcp_sock);
+ return (unsigned long)bpf_mptcp_sock_from_subflow(sk);
+}
+
+const struct bpf_func_proto bpf_skc_to_mptcp_sock_proto = {
+ .func = bpf_skc_to_mptcp_sock,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
+ .arg1_type = ARG_PTR_TO_SOCK_COMMON,
+ .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_MPTCP],
+};
+
+BPF_CALL_1(bpf_sock_from_file, struct file *, file)
+{
+ return (unsigned long)sock_from_file(file);
+}
+
+BTF_ID_LIST(bpf_sock_from_file_btf_ids)
+BTF_ID(struct, socket)
+BTF_ID(struct, file)
+
+const struct bpf_func_proto bpf_sock_from_file_proto = {
+ .func = bpf_sock_from_file,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
+ .ret_btf_id = &bpf_sock_from_file_btf_ids[0],
+ .arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg1_btf_id = &bpf_sock_from_file_btf_ids[1],
+};
+
+static const struct bpf_func_proto *
+bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ const struct bpf_func_proto *func;
+
+ switch (func_id) {
+ case BPF_FUNC_skc_to_tcp6_sock:
+ func = &bpf_skc_to_tcp6_sock_proto;
+ break;
+ case BPF_FUNC_skc_to_tcp_sock:
+ func = &bpf_skc_to_tcp_sock_proto;
+ break;
+ case BPF_FUNC_skc_to_tcp_timewait_sock:
+ func = &bpf_skc_to_tcp_timewait_sock_proto;
+ break;
+ case BPF_FUNC_skc_to_tcp_request_sock:
+ func = &bpf_skc_to_tcp_request_sock_proto;
+ break;
+ case BPF_FUNC_skc_to_udp6_sock:
+ func = &bpf_skc_to_udp6_sock_proto;
+ break;
+ case BPF_FUNC_skc_to_unix_sock:
+ func = &bpf_skc_to_unix_sock_proto;
+ break;
+ case BPF_FUNC_skc_to_mptcp_sock:
+ func = &bpf_skc_to_mptcp_sock_proto;
+ break;
+ case BPF_FUNC_ktime_get_coarse_ns:
+ return &bpf_ktime_get_coarse_ns_proto;
+ default:
+ return bpf_base_func_proto(func_id, prog);
+ }
+
+ if (!bpf_token_capable(prog->aux->token, CAP_PERFMON))
+ return NULL;
+
+ return func;
+}
+
+/**
+ * bpf_skb_meta_pointer() - Gets a mutable pointer within the skb metadata area.
+ * @skb: socket buffer carrying the metadata
+ * @offset: offset into the metadata area, must be <= skb_metadata_len()
+ */
+void *bpf_skb_meta_pointer(struct sk_buff *skb, u32 offset)
+{
+ return skb_metadata_end(skb) - skb_metadata_len(skb) + offset;
+}
+
+int __bpf_skb_meta_store_bytes(struct sk_buff *skb, u32 offset,
+ const void *from, u32 len, u64 flags)
+{
+ if (unlikely(flags))
+ return -EINVAL;
+ if (unlikely(bpf_try_make_writable(skb, 0)))
+ return -EFAULT;
+
+ memmove(bpf_skb_meta_pointer(skb, offset), from, len);
+ return 0;
+}
+
+__bpf_kfunc_start_defs();
+__bpf_kfunc int bpf_dynptr_from_skb(struct __sk_buff *s, u64 flags,
+ struct bpf_dynptr *ptr__uninit)
+{
+ struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)ptr__uninit;
+ struct sk_buff *skb = (struct sk_buff *)s;
+
+ if (flags) {
+ bpf_dynptr_set_null(ptr);
+ return -EINVAL;
+ }
+
+ bpf_dynptr_init(ptr, skb, BPF_DYNPTR_TYPE_SKB, 0, skb->len);
+
+ return 0;
+}
+
+/**
+ * bpf_dynptr_from_skb_meta() - Initialize a dynptr to the skb metadata area.
+ * @skb_: socket buffer carrying the metadata
+ * @flags: future use, must be zero
+ * @ptr__uninit: dynptr to initialize
+ *
+ * Set up a dynptr for access to the metadata area earlier allocated from the
+ * XDP context with bpf_xdp_adjust_meta(). Serves as an alternative to
+ * &__sk_buff->data_meta.
+ *
+ * Return:
+ * * %0 - dynptr ready to use
+ * * %-EINVAL - invalid flags, dynptr set to null
+ */
+__bpf_kfunc int bpf_dynptr_from_skb_meta(struct __sk_buff *skb_, u64 flags,
+ struct bpf_dynptr *ptr__uninit)
+{
+ struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)ptr__uninit;
+ struct sk_buff *skb = (struct sk_buff *)skb_;
+
+ if (flags) {
+ bpf_dynptr_set_null(ptr);
+ return -EINVAL;
+ }
+
+ bpf_dynptr_init(ptr, skb, BPF_DYNPTR_TYPE_SKB_META, 0, skb_metadata_len(skb));
+
+ return 0;
+}
+
+__bpf_kfunc int bpf_dynptr_from_xdp(struct xdp_md *x, u64 flags,
+ struct bpf_dynptr *ptr__uninit)
+{
+ struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)ptr__uninit;
+ struct xdp_buff *xdp = (struct xdp_buff *)x;
+
+ if (flags) {
+ bpf_dynptr_set_null(ptr);
+ return -EINVAL;
+ }
+
+ bpf_dynptr_init(ptr, xdp, BPF_DYNPTR_TYPE_XDP, 0, xdp_get_buff_len(xdp));
+
+ return 0;
+}
+
+__bpf_kfunc int bpf_sock_addr_set_sun_path(struct bpf_sock_addr_kern *sa_kern,
+ const u8 *sun_path, u32 sun_path__sz)
+{
+ struct sockaddr_un *un;
+
+ if (sa_kern->sk->sk_family != AF_UNIX)
+ return -EINVAL;
+
+ /* We do not allow changing the address to unnamed or larger than the
+ * maximum allowed address size for a unix sockaddr.
+ */
+ if (sun_path__sz == 0 || sun_path__sz > UNIX_PATH_MAX)
+ return -EINVAL;
+
+ un = (struct sockaddr_un *)sa_kern->uaddr;
+ memcpy(un->sun_path, sun_path, sun_path__sz);
+ sa_kern->uaddrlen = offsetof(struct sockaddr_un, sun_path) + sun_path__sz;
+
+ return 0;
+}
+
+__bpf_kfunc int bpf_sk_assign_tcp_reqsk(struct __sk_buff *s, struct sock *sk,
+ struct bpf_tcp_req_attrs *attrs, int attrs__sz)
+{
+#if IS_ENABLED(CONFIG_SYN_COOKIES)
+ struct sk_buff *skb = (struct sk_buff *)s;
+ const struct request_sock_ops *ops;
+ struct inet_request_sock *ireq;
+ struct tcp_request_sock *treq;
+ struct request_sock *req;
+ struct net *net;
+ __u16 min_mss;
+ u32 tsoff = 0;
+
+ if (attrs__sz != sizeof(*attrs) ||
+ attrs->reserved[0] || attrs->reserved[1] || attrs->reserved[2])
+ return -EINVAL;
+
+ if (!skb_at_tc_ingress(skb))
+ return -EINVAL;
+
+ net = dev_net(skb->dev);
+ if (net != sock_net(sk))
+ return -ENETUNREACH;
+
+ switch (skb->protocol) {
+ case htons(ETH_P_IP):
+ ops = &tcp_request_sock_ops;
+ min_mss = 536;
+ break;
+#if IS_BUILTIN(CONFIG_IPV6)
+ case htons(ETH_P_IPV6):
+ ops = &tcp6_request_sock_ops;
+ min_mss = IPV6_MIN_MTU - 60;
+ break;
+#endif
+ default:
+ return -EINVAL;
+ }
+
+ if (sk->sk_type != SOCK_STREAM || sk->sk_state != TCP_LISTEN ||
+ sk_is_mptcp(sk))
+ return -EINVAL;
+
+ if (attrs->mss < min_mss)
+ return -EINVAL;
+
+ if (attrs->wscale_ok) {
+ if (!READ_ONCE(net->ipv4.sysctl_tcp_window_scaling))
+ return -EINVAL;
+
+ if (attrs->snd_wscale > TCP_MAX_WSCALE ||
+ attrs->rcv_wscale > TCP_MAX_WSCALE)
+ return -EINVAL;
+ }
+
+ if (attrs->sack_ok && !READ_ONCE(net->ipv4.sysctl_tcp_sack))
+ return -EINVAL;
+
+ if (attrs->tstamp_ok) {
+ if (!READ_ONCE(net->ipv4.sysctl_tcp_timestamps))
+ return -EINVAL;
+
+ tsoff = attrs->rcv_tsecr - tcp_ns_to_ts(attrs->usec_ts_ok, tcp_clock_ns());
+ }
+
+ req = inet_reqsk_alloc(ops, sk, false);
+ if (!req)
+ return -ENOMEM;
+
+ ireq = inet_rsk(req);
+ treq = tcp_rsk(req);
+
+ req->rsk_listener = sk;
+ req->syncookie = 1;
+ req->mss = attrs->mss;
+ req->ts_recent = attrs->rcv_tsval;
+
+ ireq->snd_wscale = attrs->snd_wscale;
+ ireq->rcv_wscale = attrs->rcv_wscale;
+ ireq->tstamp_ok = !!attrs->tstamp_ok;
+ ireq->sack_ok = !!attrs->sack_ok;
+ ireq->wscale_ok = !!attrs->wscale_ok;
+ ireq->ecn_ok = !!attrs->ecn_ok;
+
+ treq->req_usec_ts = !!attrs->usec_ts_ok;
+ treq->ts_off = tsoff;
+
+ skb_orphan(skb);
+ skb->sk = req_to_sk(req);
+ skb->destructor = sock_pfree;
+
+ return 0;
+#else
+ return -EOPNOTSUPP;
+#endif
+}
+
+__bpf_kfunc int bpf_sock_ops_enable_tx_tstamp(struct bpf_sock_ops_kern *skops,
+ u64 flags)
+{
+ struct sk_buff *skb;
+
+ if (skops->op != BPF_SOCK_OPS_TSTAMP_SENDMSG_CB)
+ return -EOPNOTSUPP;
+
+ if (flags)
+ return -EINVAL;
+
+ skb = skops->skb;
+ skb_shinfo(skb)->tx_flags |= SKBTX_BPF;
+ TCP_SKB_CB(skb)->txstamp_ack |= TSTAMP_ACK_BPF;
+ skb_shinfo(skb)->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1;
+
+ return 0;
+}
+
+/**
+ * bpf_xdp_pull_data() - Pull in non-linear xdp data.
+ * @x: &xdp_md associated with the XDP buffer
+ * @len: length of data to be made directly accessible in the linear part
+ *
+ * Pull in data in case the XDP buffer associated with @x is non-linear and
+ * not all @len are in the linear data area.
+ *
+ * Direct packet access allows reading and writing linear XDP data through
+ * packet pointers (i.e., &xdp_md->data + offsets). The amount of data which
+ * ends up in the linear part of the xdp_buff depends on the NIC and its
+ * configuration. When a frag-capable XDP program wants to directly access
+ * headers that may be in the non-linear area, call this kfunc to make sure
+ * the data is available in the linear area. Alternatively, use dynptr or
+ * bpf_xdp_{load,store}_bytes() to access data without pulling.
+ *
+ * This kfunc can also be used with bpf_xdp_adjust_head() to decapsulate
+ * headers in the non-linear data area.
+ *
+ * A call to this kfunc may reduce headroom. If there is not enough tailroom
+ * in the linear data area, metadata and data will be shifted down.
+ *
+ * A call to this kfunc is susceptible to change the buffer geometry.
+ * Therefore, at load time, all checks on pointers previously done by the
+ * verifier are invalidated and must be performed again, if the kfunc is used
+ * in combination with direct packet access.
+ *
+ * Return:
+ * * %0 - success
+ * * %-EINVAL - invalid len
+ */
+__bpf_kfunc int bpf_xdp_pull_data(struct xdp_md *x, u32 len)
+{
+ struct xdp_buff *xdp = (struct xdp_buff *)x;
+ struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
+ int i, delta, shift, headroom, tailroom, n_frags_free = 0;
+ void *data_hard_end = xdp_data_hard_end(xdp);
+ int data_len = xdp->data_end - xdp->data;
+ void *start;
+
+ if (len <= data_len)
+ return 0;
+
+ if (unlikely(len > xdp_get_buff_len(xdp)))
+ return -EINVAL;
+
+ start = xdp_data_meta_unsupported(xdp) ? xdp->data : xdp->data_meta;
+
+ headroom = start - xdp->data_hard_start - sizeof(struct xdp_frame);
+ tailroom = data_hard_end - xdp->data_end;
+
+ delta = len - data_len;
+ if (unlikely(delta > tailroom + headroom))
+ return -EINVAL;
+
+ shift = delta - tailroom;
+ if (shift > 0) {
+ memmove(start - shift, start, xdp->data_end - start);
+
+ xdp->data_meta -= shift;
+ xdp->data -= shift;
+ xdp->data_end -= shift;
+ }
+
+ for (i = 0; i < sinfo->nr_frags && delta; i++) {
+ skb_frag_t *frag = &sinfo->frags[i];
+ u32 shrink = min_t(u32, delta, skb_frag_size(frag));
+
+ memcpy(xdp->data_end, skb_frag_address(frag), shrink);
+
+ xdp->data_end += shrink;
+ sinfo->xdp_frags_size -= shrink;
+ delta -= shrink;
+ if (bpf_xdp_shrink_data(xdp, frag, shrink, false))
+ n_frags_free++;
+ }
+
+ if (unlikely(n_frags_free)) {
+ memmove(sinfo->frags, sinfo->frags + n_frags_free,
+ (sinfo->nr_frags - n_frags_free) * sizeof(skb_frag_t));
+
+ sinfo->nr_frags -= n_frags_free;
+
+ if (!sinfo->nr_frags) {
+ xdp_buff_clear_frags_flag(xdp);
+ xdp_buff_clear_frag_pfmemalloc(xdp);
+ }
+ }
+
+ return 0;
+}
+
+__bpf_kfunc_end_defs();
+
+int bpf_dynptr_from_skb_rdonly(struct __sk_buff *skb, u64 flags,
+ struct bpf_dynptr *ptr__uninit)
+{
+ struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)ptr__uninit;
+ int err;
+
+ err = bpf_dynptr_from_skb(skb, flags, ptr__uninit);
+ if (err)
+ return err;
+
+ bpf_dynptr_set_rdonly(ptr);
+
+ return 0;
+}
+
+BTF_KFUNCS_START(bpf_kfunc_check_set_skb)
+BTF_ID_FLAGS(func, bpf_dynptr_from_skb, KF_TRUSTED_ARGS)
+BTF_KFUNCS_END(bpf_kfunc_check_set_skb)
+
+BTF_KFUNCS_START(bpf_kfunc_check_set_skb_meta)
+BTF_ID_FLAGS(func, bpf_dynptr_from_skb_meta, KF_TRUSTED_ARGS)
+BTF_KFUNCS_END(bpf_kfunc_check_set_skb_meta)
+
+BTF_KFUNCS_START(bpf_kfunc_check_set_xdp)
+BTF_ID_FLAGS(func, bpf_dynptr_from_xdp)
+BTF_ID_FLAGS(func, bpf_xdp_pull_data)
+BTF_KFUNCS_END(bpf_kfunc_check_set_xdp)
+
+BTF_KFUNCS_START(bpf_kfunc_check_set_sock_addr)
+BTF_ID_FLAGS(func, bpf_sock_addr_set_sun_path)
+BTF_KFUNCS_END(bpf_kfunc_check_set_sock_addr)
+
+BTF_KFUNCS_START(bpf_kfunc_check_set_tcp_reqsk)
+BTF_ID_FLAGS(func, bpf_sk_assign_tcp_reqsk, KF_TRUSTED_ARGS)
+BTF_KFUNCS_END(bpf_kfunc_check_set_tcp_reqsk)
+
+BTF_KFUNCS_START(bpf_kfunc_check_set_sock_ops)
+BTF_ID_FLAGS(func, bpf_sock_ops_enable_tx_tstamp, KF_TRUSTED_ARGS)
+BTF_KFUNCS_END(bpf_kfunc_check_set_sock_ops)
+
+static const struct btf_kfunc_id_set bpf_kfunc_set_skb = {
+ .owner = THIS_MODULE,
+ .set = &bpf_kfunc_check_set_skb,
+};
+
+static const struct btf_kfunc_id_set bpf_kfunc_set_skb_meta = {
+ .owner = THIS_MODULE,
+ .set = &bpf_kfunc_check_set_skb_meta,
+};
+
+static const struct btf_kfunc_id_set bpf_kfunc_set_xdp = {
+ .owner = THIS_MODULE,
+ .set = &bpf_kfunc_check_set_xdp,
+};
+
+static const struct btf_kfunc_id_set bpf_kfunc_set_sock_addr = {
+ .owner = THIS_MODULE,
+ .set = &bpf_kfunc_check_set_sock_addr,
+};
+
+static const struct btf_kfunc_id_set bpf_kfunc_set_tcp_reqsk = {
+ .owner = THIS_MODULE,
+ .set = &bpf_kfunc_check_set_tcp_reqsk,
+};
+
+static const struct btf_kfunc_id_set bpf_kfunc_set_sock_ops = {
+ .owner = THIS_MODULE,
+ .set = &bpf_kfunc_check_set_sock_ops,
+};
+
+static int __init bpf_kfunc_init(void)
+{
+ int ret;
+
+ ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_skb);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_ACT, &bpf_kfunc_set_skb);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SK_SKB, &bpf_kfunc_set_skb);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SOCKET_FILTER, &bpf_kfunc_set_skb);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SKB, &bpf_kfunc_set_skb);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_OUT, &bpf_kfunc_set_skb);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_IN, &bpf_kfunc_set_skb);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_XMIT, &bpf_kfunc_set_skb);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_SEG6LOCAL, &bpf_kfunc_set_skb);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_NETFILTER, &bpf_kfunc_set_skb);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_kfunc_set_skb);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_skb_meta);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_ACT, &bpf_kfunc_set_skb_meta);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &bpf_kfunc_set_xdp);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
+ &bpf_kfunc_set_sock_addr);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_tcp_reqsk);
+ return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SOCK_OPS, &bpf_kfunc_set_sock_ops);
+}
+late_initcall(bpf_kfunc_init);
+
+__bpf_kfunc_start_defs();
+
+/* bpf_sock_destroy: Destroy the given socket with ECONNABORTED error code.
+ *
+ * The function expects a non-NULL pointer to a socket, and invokes the
+ * protocol specific socket destroy handlers.
+ *
+ * The helper can only be called from BPF contexts that have acquired the socket
+ * locks.
+ *
+ * Parameters:
+ * @sock: Pointer to socket to be destroyed
+ *
+ * Return:
+ * On error, may return EPROTONOSUPPORT, EINVAL.
+ * EPROTONOSUPPORT if protocol specific destroy handler is not supported.
+ * 0 otherwise
+ */
+__bpf_kfunc int bpf_sock_destroy(struct sock_common *sock)
+{
+ struct sock *sk = (struct sock *)sock;
+
+ /* The locking semantics that allow for synchronous execution of the
+ * destroy handlers are only supported for TCP and UDP.
+ * Supporting protocols will need to acquire sock lock in the BPF context
+ * prior to invoking this kfunc.
+ */
+ if (!sk->sk_prot->diag_destroy || (sk->sk_protocol != IPPROTO_TCP &&
+ sk->sk_protocol != IPPROTO_UDP))
+ return -EOPNOTSUPP;
+
+ return sk->sk_prot->diag_destroy(sk, ECONNABORTED);
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(bpf_sk_iter_kfunc_ids)
+BTF_ID_FLAGS(func, bpf_sock_destroy, KF_TRUSTED_ARGS)
+BTF_KFUNCS_END(bpf_sk_iter_kfunc_ids)
+
+static int tracing_iter_filter(const struct bpf_prog *prog, u32 kfunc_id)
+{
+ if (btf_id_set8_contains(&bpf_sk_iter_kfunc_ids, kfunc_id) &&
+ prog->expected_attach_type != BPF_TRACE_ITER)
+ return -EACCES;
+ return 0;
+}
+
+static const struct btf_kfunc_id_set bpf_sk_iter_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &bpf_sk_iter_kfunc_ids,
+ .filter = tracing_iter_filter,
+};
+
+static int init_subsystem(void)
+{
+ return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_sk_iter_kfunc_set);
+}
+late_initcall(init_subsystem);
diff --git a/net/core/flow.c b/net/core/flow.c
deleted file mode 100644
index b16d31ae5e54..000000000000
--- a/net/core/flow.c
+++ /dev/null
@@ -1,381 +0,0 @@
-/* flow.c: Generic flow cache.
- *
- * Copyright (C) 2003 Alexey N. Kuznetsov (kuznet@ms2.inr.ac.ru)
- * Copyright (C) 2003 David S. Miller (davem@redhat.com)
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/list.h>
-#include <linux/jhash.h>
-#include <linux/interrupt.h>
-#include <linux/mm.h>
-#include <linux/random.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/smp.h>
-#include <linux/completion.h>
-#include <linux/percpu.h>
-#include <linux/bitops.h>
-#include <linux/notifier.h>
-#include <linux/cpu.h>
-#include <linux/cpumask.h>
-#include <linux/mutex.h>
-#include <net/flow.h>
-#include <asm/atomic.h>
-#include <asm/semaphore.h>
-#include <linux/security.h>
-
-struct flow_cache_entry {
- struct flow_cache_entry *next;
- u16 family;
- u8 dir;
- struct flowi key;
- u32 genid;
- void *object;
- atomic_t *object_ref;
-};
-
-atomic_t flow_cache_genid = ATOMIC_INIT(0);
-
-static u32 flow_hash_shift;
-#define flow_hash_size (1 << flow_hash_shift)
-static DEFINE_PER_CPU(struct flow_cache_entry **, flow_tables) = { NULL };
-
-#define flow_table(cpu) (per_cpu(flow_tables, cpu))
-
-static kmem_cache_t *flow_cachep __read_mostly;
-
-static int flow_lwm, flow_hwm;
-
-struct flow_percpu_info {
- int hash_rnd_recalc;
- u32 hash_rnd;
- int count;
-} ____cacheline_aligned;
-static DEFINE_PER_CPU(struct flow_percpu_info, flow_hash_info) = { 0 };
-
-#define flow_hash_rnd_recalc(cpu) \
- (per_cpu(flow_hash_info, cpu).hash_rnd_recalc)
-#define flow_hash_rnd(cpu) \
- (per_cpu(flow_hash_info, cpu).hash_rnd)
-#define flow_count(cpu) \
- (per_cpu(flow_hash_info, cpu).count)
-
-static struct timer_list flow_hash_rnd_timer;
-
-#define FLOW_HASH_RND_PERIOD (10 * 60 * HZ)
-
-struct flow_flush_info {
- atomic_t cpuleft;
- struct completion completion;
-};
-static DEFINE_PER_CPU(struct tasklet_struct, flow_flush_tasklets) = { NULL };
-
-#define flow_flush_tasklet(cpu) (&per_cpu(flow_flush_tasklets, cpu))
-
-static void flow_cache_new_hashrnd(unsigned long arg)
-{
- int i;
-
- for_each_possible_cpu(i)
- flow_hash_rnd_recalc(i) = 1;
-
- flow_hash_rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
- add_timer(&flow_hash_rnd_timer);
-}
-
-static void flow_entry_kill(int cpu, struct flow_cache_entry *fle)
-{
- if (fle->object)
- atomic_dec(fle->object_ref);
- kmem_cache_free(flow_cachep, fle);
- flow_count(cpu)--;
-}
-
-static void __flow_cache_shrink(int cpu, int shrink_to)
-{
- struct flow_cache_entry *fle, **flp;
- int i;
-
- for (i = 0; i < flow_hash_size; i++) {
- int k = 0;
-
- flp = &flow_table(cpu)[i];
- while ((fle = *flp) != NULL && k < shrink_to) {
- k++;
- flp = &fle->next;
- }
- while ((fle = *flp) != NULL) {
- *flp = fle->next;
- flow_entry_kill(cpu, fle);
- }
- }
-}
-
-static void flow_cache_shrink(int cpu)
-{
- int shrink_to = flow_lwm / flow_hash_size;
-
- __flow_cache_shrink(cpu, shrink_to);
-}
-
-static void flow_new_hash_rnd(int cpu)
-{
- get_random_bytes(&flow_hash_rnd(cpu), sizeof(u32));
- flow_hash_rnd_recalc(cpu) = 0;
-
- __flow_cache_shrink(cpu, 0);
-}
-
-static u32 flow_hash_code(struct flowi *key, int cpu)
-{
- u32 *k = (u32 *) key;
-
- return (jhash2(k, (sizeof(*key) / sizeof(u32)), flow_hash_rnd(cpu)) &
- (flow_hash_size - 1));
-}
-
-#if (BITS_PER_LONG == 64)
-typedef u64 flow_compare_t;
-#else
-typedef u32 flow_compare_t;
-#endif
-
-extern void flowi_is_missized(void);
-
-/* I hear what you're saying, use memcmp. But memcmp cannot make
- * important assumptions that we can here, such as alignment and
- * constant size.
- */
-static int flow_key_compare(struct flowi *key1, struct flowi *key2)
-{
- flow_compare_t *k1, *k1_lim, *k2;
- const int n_elem = sizeof(struct flowi) / sizeof(flow_compare_t);
-
- if (sizeof(struct flowi) % sizeof(flow_compare_t))
- flowi_is_missized();
-
- k1 = (flow_compare_t *) key1;
- k1_lim = k1 + n_elem;
-
- k2 = (flow_compare_t *) key2;
-
- do {
- if (*k1++ != *k2++)
- return 1;
- } while (k1 < k1_lim);
-
- return 0;
-}
-
-void *flow_cache_lookup(struct flowi *key, u16 family, u8 dir,
- flow_resolve_t resolver)
-{
- struct flow_cache_entry *fle, **head;
- unsigned int hash;
- int cpu;
-
- local_bh_disable();
- cpu = smp_processor_id();
-
- fle = NULL;
- /* Packet really early in init? Making flow_cache_init a
- * pre-smp initcall would solve this. --RR */
- if (!flow_table(cpu))
- goto nocache;
-
- if (flow_hash_rnd_recalc(cpu))
- flow_new_hash_rnd(cpu);
- hash = flow_hash_code(key, cpu);
-
- head = &flow_table(cpu)[hash];
- for (fle = *head; fle; fle = fle->next) {
- if (fle->family == family &&
- fle->dir == dir &&
- flow_key_compare(key, &fle->key) == 0) {
- if (fle->genid == atomic_read(&flow_cache_genid)) {
- void *ret = fle->object;
-
- if (ret)
- atomic_inc(fle->object_ref);
- local_bh_enable();
-
- return ret;
- }
- break;
- }
- }
-
- if (!fle) {
- if (flow_count(cpu) > flow_hwm)
- flow_cache_shrink(cpu);
-
- fle = kmem_cache_alloc(flow_cachep, SLAB_ATOMIC);
- if (fle) {
- fle->next = *head;
- *head = fle;
- fle->family = family;
- fle->dir = dir;
- memcpy(&fle->key, key, sizeof(*key));
- fle->object = NULL;
- flow_count(cpu)++;
- }
- }
-
-nocache:
- {
- int err;
- void *obj;
- atomic_t *obj_ref;
-
- err = resolver(key, family, dir, &obj, &obj_ref);
-
- if (fle) {
- if (err) {
- /* Force security policy check on next lookup */
- *head = fle->next;
- flow_entry_kill(cpu, fle);
- } else {
- fle->genid = atomic_read(&flow_cache_genid);
-
- if (fle->object)
- atomic_dec(fle->object_ref);
-
- fle->object = obj;
- fle->object_ref = obj_ref;
- if (obj)
- atomic_inc(fle->object_ref);
- }
- }
- local_bh_enable();
-
- if (err)
- obj = ERR_PTR(err);
- return obj;
- }
-}
-
-static void flow_cache_flush_tasklet(unsigned long data)
-{
- struct flow_flush_info *info = (void *)data;
- int i;
- int cpu;
-
- cpu = smp_processor_id();
- for (i = 0; i < flow_hash_size; i++) {
- struct flow_cache_entry *fle;
-
- fle = flow_table(cpu)[i];
- for (; fle; fle = fle->next) {
- unsigned genid = atomic_read(&flow_cache_genid);
-
- if (!fle->object || fle->genid == genid)
- continue;
-
- fle->object = NULL;
- atomic_dec(fle->object_ref);
- }
- }
-
- if (atomic_dec_and_test(&info->cpuleft))
- complete(&info->completion);
-}
-
-static void flow_cache_flush_per_cpu(void *) __attribute__((__unused__));
-static void flow_cache_flush_per_cpu(void *data)
-{
- struct flow_flush_info *info = data;
- int cpu;
- struct tasklet_struct *tasklet;
-
- cpu = smp_processor_id();
-
- tasklet = flow_flush_tasklet(cpu);
- tasklet->data = (unsigned long)info;
- tasklet_schedule(tasklet);
-}
-
-void flow_cache_flush(void)
-{
- struct flow_flush_info info;
- static DEFINE_MUTEX(flow_flush_sem);
-
- /* Don't want cpus going down or up during this. */
- lock_cpu_hotplug();
- mutex_lock(&flow_flush_sem);
- atomic_set(&info.cpuleft, num_online_cpus());
- init_completion(&info.completion);
-
- local_bh_disable();
- smp_call_function(flow_cache_flush_per_cpu, &info, 1, 0);
- flow_cache_flush_tasklet((unsigned long)&info);
- local_bh_enable();
-
- wait_for_completion(&info.completion);
- mutex_unlock(&flow_flush_sem);
- unlock_cpu_hotplug();
-}
-
-static void __devinit flow_cache_cpu_prepare(int cpu)
-{
- struct tasklet_struct *tasklet;
- unsigned long order;
-
- for (order = 0;
- (PAGE_SIZE << order) <
- (sizeof(struct flow_cache_entry *)*flow_hash_size);
- order++)
- /* NOTHING */;
-
- flow_table(cpu) = (struct flow_cache_entry **)
- __get_free_pages(GFP_KERNEL|__GFP_ZERO, order);
- if (!flow_table(cpu))
- panic("NET: failed to allocate flow cache order %lu\n", order);
-
- flow_hash_rnd_recalc(cpu) = 1;
- flow_count(cpu) = 0;
-
- tasklet = flow_flush_tasklet(cpu);
- tasklet_init(tasklet, flow_cache_flush_tasklet, 0);
-}
-
-#ifdef CONFIG_HOTPLUG_CPU
-static int flow_cache_cpu(struct notifier_block *nfb,
- unsigned long action,
- void *hcpu)
-{
- if (action == CPU_DEAD)
- __flow_cache_shrink((unsigned long)hcpu, 0);
- return NOTIFY_OK;
-}
-#endif /* CONFIG_HOTPLUG_CPU */
-
-static int __init flow_cache_init(void)
-{
- int i;
-
- flow_cachep = kmem_cache_create("flow_cache",
- sizeof(struct flow_cache_entry),
- 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
- NULL, NULL);
- flow_hash_shift = 10;
- flow_lwm = 2 * flow_hash_size;
- flow_hwm = 4 * flow_hash_size;
-
- init_timer(&flow_hash_rnd_timer);
- flow_hash_rnd_timer.function = flow_cache_new_hashrnd;
- flow_hash_rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
- add_timer(&flow_hash_rnd_timer);
-
- for_each_possible_cpu(i)
- flow_cache_cpu_prepare(i);
-
- hotcpu_notifier(flow_cache_cpu, 0);
- return 0;
-}
-
-module_init(flow_cache_init);
-
-EXPORT_SYMBOL(flow_cache_genid);
-EXPORT_SYMBOL(flow_cache_lookup);
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
new file mode 100644
index 000000000000..1b61bb25ba0e
--- /dev/null
+++ b/net/core/flow_dissector.c
@@ -0,0 +1,2101 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/export.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/if_vlan.h>
+#include <linux/filter.h>
+#include <net/dsa.h>
+#include <net/dst_metadata.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/gre.h>
+#include <net/pptp.h>
+#include <net/tipc.h>
+#include <linux/igmp.h>
+#include <linux/icmp.h>
+#include <linux/sctp.h>
+#include <linux/dccp.h>
+#include <linux/if_tunnel.h>
+#include <linux/if_pppox.h>
+#include <linux/ppp_defs.h>
+#include <linux/stddef.h>
+#include <linux/if_ether.h>
+#include <linux/if_hsr.h>
+#include <linux/mpls.h>
+#include <linux/tcp.h>
+#include <linux/ptp_classify.h>
+#include <net/flow_dissector.h>
+#include <net/pkt_cls.h>
+#include <scsi/fc/fc_fcoe.h>
+#include <uapi/linux/batadv_packet.h>
+#include <linux/bpf.h>
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_labels.h>
+#endif
+#include <linux/bpf-netns.h>
+
+static void dissector_set_key(struct flow_dissector *flow_dissector,
+ enum flow_dissector_key_id key_id)
+{
+ flow_dissector->used_keys |= (1ULL << key_id);
+}
+
+void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
+ const struct flow_dissector_key *key,
+ unsigned int key_count)
+{
+ unsigned int i;
+
+ memset(flow_dissector, 0, sizeof(*flow_dissector));
+
+ for (i = 0; i < key_count; i++, key++) {
+ /* User should make sure that every key target offset is within
+ * boundaries of unsigned short.
+ */
+ BUG_ON(key->offset > USHRT_MAX);
+ BUG_ON(dissector_uses_key(flow_dissector,
+ key->key_id));
+
+ dissector_set_key(flow_dissector, key->key_id);
+ flow_dissector->offset[key->key_id] = key->offset;
+ }
+
+ /* Ensure that the dissector always includes control and basic key.
+ * That way we are able to avoid handling lack of these in fast path.
+ */
+ BUG_ON(!dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_CONTROL));
+ BUG_ON(!dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_BASIC));
+}
+EXPORT_SYMBOL(skb_flow_dissector_init);
+
+#ifdef CONFIG_BPF_SYSCALL
+int flow_dissector_bpf_prog_attach_check(struct net *net,
+ struct bpf_prog *prog)
+{
+ enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR;
+
+ if (net == &init_net) {
+ /* BPF flow dissector in the root namespace overrides
+ * any per-net-namespace one. When attaching to root,
+ * make sure we don't have any BPF program attached
+ * to the non-root namespaces.
+ */
+ struct net *ns;
+
+ for_each_net(ns) {
+ if (ns == &init_net)
+ continue;
+ if (rcu_access_pointer(ns->bpf.run_array[type]))
+ return -EEXIST;
+ }
+ } else {
+ /* Make sure root flow dissector is not attached
+ * when attaching to the non-root namespace.
+ */
+ if (rcu_access_pointer(init_net.bpf.run_array[type]))
+ return -EEXIST;
+ }
+
+ return 0;
+}
+#endif /* CONFIG_BPF_SYSCALL */
+
+/**
+ * skb_flow_get_ports - extract the upper layer ports and return them
+ * @skb: sk_buff to extract the ports from
+ * @thoff: transport header offset
+ * @ip_proto: protocol for which to get port offset
+ * @data: raw buffer pointer to the packet, if NULL use skb->data
+ * @hlen: packet header length, if @data is NULL use skb_headlen(skb)
+ *
+ * The function will try to retrieve the ports at offset thoff + poff where poff
+ * is the protocol port offset returned from proto_ports_offset
+ */
+__be32 skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
+ const void *data, int hlen)
+{
+ int poff = proto_ports_offset(ip_proto);
+
+ if (!data) {
+ data = skb->data;
+ hlen = skb_headlen(skb);
+ }
+
+ if (poff >= 0) {
+ __be32 *ports, _ports;
+
+ ports = __skb_header_pointer(skb, thoff + poff,
+ sizeof(_ports), data, hlen, &_ports);
+ if (ports)
+ return *ports;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(skb_flow_get_ports);
+
+static bool icmp_has_id(u8 type)
+{
+ switch (type) {
+ case ICMP_ECHO:
+ case ICMP_ECHOREPLY:
+ case ICMP_TIMESTAMP:
+ case ICMP_TIMESTAMPREPLY:
+ case ICMPV6_ECHO_REQUEST:
+ case ICMPV6_ECHO_REPLY:
+ return true;
+ }
+
+ return false;
+}
+
+/**
+ * skb_flow_get_icmp_tci - extract ICMP(6) Type, Code and Identifier fields
+ * @skb: sk_buff to extract from
+ * @key_icmp: struct flow_dissector_key_icmp to fill
+ * @data: raw buffer pointer to the packet
+ * @thoff: offset to extract at
+ * @hlen: packet header length
+ */
+void skb_flow_get_icmp_tci(const struct sk_buff *skb,
+ struct flow_dissector_key_icmp *key_icmp,
+ const void *data, int thoff, int hlen)
+{
+ struct icmphdr *ih, _ih;
+
+ ih = __skb_header_pointer(skb, thoff, sizeof(_ih), data, hlen, &_ih);
+ if (!ih)
+ return;
+
+ key_icmp->type = ih->type;
+ key_icmp->code = ih->code;
+
+ /* As we use 0 to signal that the Id field is not present,
+ * avoid confusion with packets without such field
+ */
+ if (icmp_has_id(ih->type))
+ key_icmp->id = ih->un.echo.id ? ntohs(ih->un.echo.id) : 1;
+ else
+ key_icmp->id = 0;
+}
+EXPORT_SYMBOL(skb_flow_get_icmp_tci);
+
+/* If FLOW_DISSECTOR_KEY_ICMP is set, dissect an ICMP packet
+ * using skb_flow_get_icmp_tci().
+ */
+static void __skb_flow_dissect_icmp(const struct sk_buff *skb,
+ struct flow_dissector *flow_dissector,
+ void *target_container, const void *data,
+ int thoff, int hlen)
+{
+ struct flow_dissector_key_icmp *key_icmp;
+
+ if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ICMP))
+ return;
+
+ key_icmp = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_ICMP,
+ target_container);
+
+ skb_flow_get_icmp_tci(skb, key_icmp, data, thoff, hlen);
+}
+
+static void __skb_flow_dissect_ah(const struct sk_buff *skb,
+ struct flow_dissector *flow_dissector,
+ void *target_container, const void *data,
+ int nhoff, int hlen)
+{
+ struct flow_dissector_key_ipsec *key_ah;
+ struct ip_auth_hdr _hdr, *hdr;
+
+ if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPSEC))
+ return;
+
+ hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr);
+ if (!hdr)
+ return;
+
+ key_ah = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_IPSEC,
+ target_container);
+
+ key_ah->spi = hdr->spi;
+}
+
+static void __skb_flow_dissect_esp(const struct sk_buff *skb,
+ struct flow_dissector *flow_dissector,
+ void *target_container, const void *data,
+ int nhoff, int hlen)
+{
+ struct flow_dissector_key_ipsec *key_esp;
+ struct ip_esp_hdr _hdr, *hdr;
+
+ if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPSEC))
+ return;
+
+ hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr);
+ if (!hdr)
+ return;
+
+ key_esp = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_IPSEC,
+ target_container);
+
+ key_esp->spi = hdr->spi;
+}
+
+static void __skb_flow_dissect_l2tpv3(const struct sk_buff *skb,
+ struct flow_dissector *flow_dissector,
+ void *target_container, const void *data,
+ int nhoff, int hlen)
+{
+ struct flow_dissector_key_l2tpv3 *key_l2tpv3;
+ struct {
+ __be32 session_id;
+ } *hdr, _hdr;
+
+ if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_L2TPV3))
+ return;
+
+ hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr);
+ if (!hdr)
+ return;
+
+ key_l2tpv3 = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_L2TPV3,
+ target_container);
+
+ key_l2tpv3->session_id = hdr->session_id;
+}
+
+void skb_flow_dissect_meta(const struct sk_buff *skb,
+ struct flow_dissector *flow_dissector,
+ void *target_container)
+{
+ struct flow_dissector_key_meta *meta;
+
+ if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_META))
+ return;
+
+ meta = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_META,
+ target_container);
+ meta->ingress_ifindex = skb->skb_iif;
+#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
+ if (tc_skb_ext_tc_enabled()) {
+ struct tc_skb_ext *ext;
+
+ ext = skb_ext_find(skb, TC_SKB_EXT);
+ if (ext)
+ meta->l2_miss = ext->l2_miss;
+ }
+#endif
+}
+EXPORT_SYMBOL(skb_flow_dissect_meta);
+
+static void
+skb_flow_dissect_set_enc_control(enum flow_dissector_key_id type,
+ u32 ctrl_flags,
+ struct flow_dissector *flow_dissector,
+ void *target_container)
+{
+ struct flow_dissector_key_control *ctrl;
+
+ if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_CONTROL))
+ return;
+
+ ctrl = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_ENC_CONTROL,
+ target_container);
+ ctrl->addr_type = type;
+ ctrl->flags = ctrl_flags;
+}
+
+void
+skb_flow_dissect_ct(const struct sk_buff *skb,
+ struct flow_dissector *flow_dissector,
+ void *target_container, u16 *ctinfo_map,
+ size_t mapsize, bool post_ct, u16 zone)
+{
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+ struct flow_dissector_key_ct *key;
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn_labels *cl;
+ struct nf_conn *ct;
+
+ if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_CT))
+ return;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct && !post_ct)
+ return;
+
+ key = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_CT,
+ target_container);
+
+ if (!ct) {
+ key->ct_state = TCA_FLOWER_KEY_CT_FLAGS_TRACKED |
+ TCA_FLOWER_KEY_CT_FLAGS_INVALID;
+ key->ct_zone = zone;
+ return;
+ }
+
+ if (ctinfo < mapsize)
+ key->ct_state = ctinfo_map[ctinfo];
+#if IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES)
+ key->ct_zone = ct->zone.id;
+#endif
+#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)
+ key->ct_mark = READ_ONCE(ct->mark);
+#endif
+
+ cl = nf_ct_labels_find(ct);
+ if (cl)
+ memcpy(key->ct_labels, cl->bits, sizeof(key->ct_labels));
+#endif /* CONFIG_NF_CONNTRACK */
+}
+EXPORT_SYMBOL(skb_flow_dissect_ct);
+
+void
+skb_flow_dissect_tunnel_info(const struct sk_buff *skb,
+ struct flow_dissector *flow_dissector,
+ void *target_container)
+{
+ struct ip_tunnel_info *info;
+ struct ip_tunnel_key *key;
+ u32 ctrl_flags = 0;
+
+ /* A quick check to see if there might be something to do. */
+ if (!dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_ENC_KEYID) &&
+ !dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS) &&
+ !dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS) &&
+ !dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_ENC_CONTROL) &&
+ !dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_ENC_PORTS) &&
+ !dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_ENC_IP) &&
+ !dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_ENC_OPTS))
+ return;
+
+ info = skb_tunnel_info(skb);
+ if (!info)
+ return;
+
+ key = &info->key;
+
+ if (test_bit(IP_TUNNEL_CSUM_BIT, key->tun_flags))
+ ctrl_flags |= FLOW_DIS_F_TUNNEL_CSUM;
+ if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, key->tun_flags))
+ ctrl_flags |= FLOW_DIS_F_TUNNEL_DONT_FRAGMENT;
+ if (test_bit(IP_TUNNEL_OAM_BIT, key->tun_flags))
+ ctrl_flags |= FLOW_DIS_F_TUNNEL_OAM;
+ if (test_bit(IP_TUNNEL_CRIT_OPT_BIT, key->tun_flags))
+ ctrl_flags |= FLOW_DIS_F_TUNNEL_CRIT_OPT;
+
+ switch (ip_tunnel_info_af(info)) {
+ case AF_INET:
+ skb_flow_dissect_set_enc_control(FLOW_DISSECTOR_KEY_IPV4_ADDRS,
+ ctrl_flags, flow_dissector,
+ target_container);
+ if (dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS)) {
+ struct flow_dissector_key_ipv4_addrs *ipv4;
+
+ ipv4 = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS,
+ target_container);
+ ipv4->src = key->u.ipv4.src;
+ ipv4->dst = key->u.ipv4.dst;
+ }
+ break;
+ case AF_INET6:
+ skb_flow_dissect_set_enc_control(FLOW_DISSECTOR_KEY_IPV6_ADDRS,
+ ctrl_flags, flow_dissector,
+ target_container);
+ if (dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS)) {
+ struct flow_dissector_key_ipv6_addrs *ipv6;
+
+ ipv6 = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS,
+ target_container);
+ ipv6->src = key->u.ipv6.src;
+ ipv6->dst = key->u.ipv6.dst;
+ }
+ break;
+ default:
+ skb_flow_dissect_set_enc_control(0, ctrl_flags, flow_dissector,
+ target_container);
+ break;
+ }
+
+ if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_KEYID)) {
+ struct flow_dissector_key_keyid *keyid;
+
+ keyid = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_ENC_KEYID,
+ target_container);
+ keyid->keyid = tunnel_id_to_key32(key->tun_id);
+ }
+
+ if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_PORTS)) {
+ struct flow_dissector_key_ports *tp;
+
+ tp = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_ENC_PORTS,
+ target_container);
+ tp->src = key->tp_src;
+ tp->dst = key->tp_dst;
+ }
+
+ if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IP)) {
+ struct flow_dissector_key_ip *ip;
+
+ ip = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_ENC_IP,
+ target_container);
+ ip->tos = key->tos;
+ ip->ttl = key->ttl;
+ }
+
+ if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_OPTS)) {
+ struct flow_dissector_key_enc_opts *enc_opt;
+ IP_TUNNEL_DECLARE_FLAGS(flags) = { };
+ u32 val;
+
+ enc_opt = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_ENC_OPTS,
+ target_container);
+
+ if (!info->options_len)
+ return;
+
+ enc_opt->len = info->options_len;
+ ip_tunnel_info_opts_get(enc_opt->data, info);
+
+ ip_tunnel_set_options_present(flags);
+ ip_tunnel_flags_and(flags, info->key.tun_flags, flags);
+
+ val = find_next_bit(flags, __IP_TUNNEL_FLAG_NUM,
+ IP_TUNNEL_GENEVE_OPT_BIT);
+ enc_opt->dst_opt_type = val < __IP_TUNNEL_FLAG_NUM ? val : 0;
+ }
+}
+EXPORT_SYMBOL(skb_flow_dissect_tunnel_info);
+
+void skb_flow_dissect_hash(const struct sk_buff *skb,
+ struct flow_dissector *flow_dissector,
+ void *target_container)
+{
+ struct flow_dissector_key_hash *key;
+
+ if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_HASH))
+ return;
+
+ key = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_HASH,
+ target_container);
+
+ key->hash = skb_get_hash_raw(skb);
+}
+EXPORT_SYMBOL(skb_flow_dissect_hash);
+
+static enum flow_dissect_ret
+__skb_flow_dissect_mpls(const struct sk_buff *skb,
+ struct flow_dissector *flow_dissector,
+ void *target_container, const void *data, int nhoff,
+ int hlen, int lse_index, bool *entropy_label)
+{
+ struct mpls_label *hdr, _hdr;
+ u32 entry, label, bos;
+
+ if (!dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_MPLS_ENTROPY) &&
+ !dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_MPLS))
+ return FLOW_DISSECT_RET_OUT_GOOD;
+
+ if (lse_index >= FLOW_DIS_MPLS_MAX)
+ return FLOW_DISSECT_RET_OUT_GOOD;
+
+ hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data,
+ hlen, &_hdr);
+ if (!hdr)
+ return FLOW_DISSECT_RET_OUT_BAD;
+
+ entry = ntohl(hdr->entry);
+ label = (entry & MPLS_LS_LABEL_MASK) >> MPLS_LS_LABEL_SHIFT;
+ bos = (entry & MPLS_LS_S_MASK) >> MPLS_LS_S_SHIFT;
+
+ if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_MPLS)) {
+ struct flow_dissector_key_mpls *key_mpls;
+ struct flow_dissector_mpls_lse *lse;
+
+ key_mpls = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_MPLS,
+ target_container);
+ lse = &key_mpls->ls[lse_index];
+
+ lse->mpls_ttl = (entry & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT;
+ lse->mpls_bos = bos;
+ lse->mpls_tc = (entry & MPLS_LS_TC_MASK) >> MPLS_LS_TC_SHIFT;
+ lse->mpls_label = label;
+ dissector_set_mpls_lse(key_mpls, lse_index);
+ }
+
+ if (*entropy_label &&
+ dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_MPLS_ENTROPY)) {
+ struct flow_dissector_key_keyid *key_keyid;
+
+ key_keyid = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_MPLS_ENTROPY,
+ target_container);
+ key_keyid->keyid = cpu_to_be32(label);
+ }
+
+ *entropy_label = label == MPLS_LABEL_ENTROPY;
+
+ return bos ? FLOW_DISSECT_RET_OUT_GOOD : FLOW_DISSECT_RET_PROTO_AGAIN;
+}
+
+static enum flow_dissect_ret
+__skb_flow_dissect_arp(const struct sk_buff *skb,
+ struct flow_dissector *flow_dissector,
+ void *target_container, const void *data,
+ int nhoff, int hlen)
+{
+ struct flow_dissector_key_arp *key_arp;
+ struct {
+ unsigned char ar_sha[ETH_ALEN];
+ unsigned char ar_sip[4];
+ unsigned char ar_tha[ETH_ALEN];
+ unsigned char ar_tip[4];
+ } *arp_eth, _arp_eth;
+ const struct arphdr *arp;
+ struct arphdr _arp;
+
+ if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ARP))
+ return FLOW_DISSECT_RET_OUT_GOOD;
+
+ arp = __skb_header_pointer(skb, nhoff, sizeof(_arp), data,
+ hlen, &_arp);
+ if (!arp)
+ return FLOW_DISSECT_RET_OUT_BAD;
+
+ if (arp->ar_hrd != htons(ARPHRD_ETHER) ||
+ arp->ar_pro != htons(ETH_P_IP) ||
+ arp->ar_hln != ETH_ALEN ||
+ arp->ar_pln != 4 ||
+ (arp->ar_op != htons(ARPOP_REPLY) &&
+ arp->ar_op != htons(ARPOP_REQUEST)))
+ return FLOW_DISSECT_RET_OUT_BAD;
+
+ arp_eth = __skb_header_pointer(skb, nhoff + sizeof(_arp),
+ sizeof(_arp_eth), data,
+ hlen, &_arp_eth);
+ if (!arp_eth)
+ return FLOW_DISSECT_RET_OUT_BAD;
+
+ key_arp = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_ARP,
+ target_container);
+
+ memcpy(&key_arp->sip, arp_eth->ar_sip, sizeof(key_arp->sip));
+ memcpy(&key_arp->tip, arp_eth->ar_tip, sizeof(key_arp->tip));
+
+ /* Only store the lower byte of the opcode;
+ * this covers ARPOP_REPLY and ARPOP_REQUEST.
+ */
+ key_arp->op = ntohs(arp->ar_op) & 0xff;
+
+ ether_addr_copy(key_arp->sha, arp_eth->ar_sha);
+ ether_addr_copy(key_arp->tha, arp_eth->ar_tha);
+
+ return FLOW_DISSECT_RET_OUT_GOOD;
+}
+
+static enum flow_dissect_ret
+__skb_flow_dissect_cfm(const struct sk_buff *skb,
+ struct flow_dissector *flow_dissector,
+ void *target_container, const void *data,
+ int nhoff, int hlen)
+{
+ struct flow_dissector_key_cfm *key, *hdr, _hdr;
+
+ if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_CFM))
+ return FLOW_DISSECT_RET_OUT_GOOD;
+
+ hdr = __skb_header_pointer(skb, nhoff, sizeof(*key), data, hlen, &_hdr);
+ if (!hdr)
+ return FLOW_DISSECT_RET_OUT_BAD;
+
+ key = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_CFM,
+ target_container);
+
+ key->mdl_ver = hdr->mdl_ver;
+ key->opcode = hdr->opcode;
+
+ return FLOW_DISSECT_RET_OUT_GOOD;
+}
+
+static enum flow_dissect_ret
+__skb_flow_dissect_gre(const struct sk_buff *skb,
+ struct flow_dissector_key_control *key_control,
+ struct flow_dissector *flow_dissector,
+ void *target_container, const void *data,
+ __be16 *p_proto, int *p_nhoff, int *p_hlen,
+ unsigned int flags)
+{
+ struct flow_dissector_key_keyid *key_keyid;
+ struct gre_base_hdr *hdr, _hdr;
+ int offset = 0;
+ u16 gre_ver;
+
+ hdr = __skb_header_pointer(skb, *p_nhoff, sizeof(_hdr),
+ data, *p_hlen, &_hdr);
+ if (!hdr)
+ return FLOW_DISSECT_RET_OUT_BAD;
+
+ /* Only look inside GRE without routing */
+ if (hdr->flags & GRE_ROUTING)
+ return FLOW_DISSECT_RET_OUT_GOOD;
+
+ /* Only look inside GRE for version 0 and 1 */
+ gre_ver = ntohs(hdr->flags & GRE_VERSION);
+ if (gre_ver > 1)
+ return FLOW_DISSECT_RET_OUT_GOOD;
+
+ *p_proto = hdr->protocol;
+ if (gre_ver) {
+ /* Version1 must be PPTP, and check the flags */
+ if (!(*p_proto == GRE_PROTO_PPP && (hdr->flags & GRE_KEY)))
+ return FLOW_DISSECT_RET_OUT_GOOD;
+ }
+
+ offset += sizeof(struct gre_base_hdr);
+
+ if (hdr->flags & GRE_CSUM)
+ offset += sizeof_field(struct gre_full_hdr, csum) +
+ sizeof_field(struct gre_full_hdr, reserved1);
+
+ if (hdr->flags & GRE_KEY) {
+ const __be32 *keyid;
+ __be32 _keyid;
+
+ keyid = __skb_header_pointer(skb, *p_nhoff + offset,
+ sizeof(_keyid),
+ data, *p_hlen, &_keyid);
+ if (!keyid)
+ return FLOW_DISSECT_RET_OUT_BAD;
+
+ if (dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_GRE_KEYID)) {
+ key_keyid = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_GRE_KEYID,
+ target_container);
+ if (gre_ver == 0)
+ key_keyid->keyid = *keyid;
+ else
+ key_keyid->keyid = *keyid & GRE_PPTP_KEY_MASK;
+ }
+ offset += sizeof_field(struct gre_full_hdr, key);
+ }
+
+ if (hdr->flags & GRE_SEQ)
+ offset += sizeof_field(struct pptp_gre_header, seq);
+
+ if (gre_ver == 0) {
+ if (*p_proto == htons(ETH_P_TEB)) {
+ const struct ethhdr *eth;
+ struct ethhdr _eth;
+
+ eth = __skb_header_pointer(skb, *p_nhoff + offset,
+ sizeof(_eth),
+ data, *p_hlen, &_eth);
+ if (!eth)
+ return FLOW_DISSECT_RET_OUT_BAD;
+ *p_proto = eth->h_proto;
+ offset += sizeof(*eth);
+
+ /* Cap headers that we access via pointers at the
+ * end of the Ethernet header as our maximum alignment
+ * at that point is only 2 bytes.
+ */
+ if (NET_IP_ALIGN)
+ *p_hlen = *p_nhoff + offset;
+ }
+ } else { /* version 1, must be PPTP */
+ u8 _ppp_hdr[PPP_HDRLEN];
+ u8 *ppp_hdr;
+
+ if (hdr->flags & GRE_ACK)
+ offset += sizeof_field(struct pptp_gre_header, ack);
+
+ ppp_hdr = __skb_header_pointer(skb, *p_nhoff + offset,
+ sizeof(_ppp_hdr),
+ data, *p_hlen, _ppp_hdr);
+ if (!ppp_hdr)
+ return FLOW_DISSECT_RET_OUT_BAD;
+
+ switch (PPP_PROTOCOL(ppp_hdr)) {
+ case PPP_IP:
+ *p_proto = htons(ETH_P_IP);
+ break;
+ case PPP_IPV6:
+ *p_proto = htons(ETH_P_IPV6);
+ break;
+ default:
+ /* Could probably catch some more like MPLS */
+ break;
+ }
+
+ offset += PPP_HDRLEN;
+ }
+
+ *p_nhoff += offset;
+ key_control->flags |= FLOW_DIS_ENCAPSULATION;
+ if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP)
+ return FLOW_DISSECT_RET_OUT_GOOD;
+
+ return FLOW_DISSECT_RET_PROTO_AGAIN;
+}
+
+/**
+ * __skb_flow_dissect_batadv() - dissect batman-adv header
+ * @skb: sk_buff to with the batman-adv header
+ * @key_control: flow dissectors control key
+ * @data: raw buffer pointer to the packet, if NULL use skb->data
+ * @p_proto: pointer used to update the protocol to process next
+ * @p_nhoff: pointer used to update inner network header offset
+ * @hlen: packet header length
+ * @flags: any combination of FLOW_DISSECTOR_F_*
+ *
+ * ETH_P_BATMAN packets are tried to be dissected. Only
+ * &struct batadv_unicast packets are actually processed because they contain an
+ * inner ethernet header and are usually followed by actual network header. This
+ * allows the flow dissector to continue processing the packet.
+ *
+ * Return: FLOW_DISSECT_RET_PROTO_AGAIN when &struct batadv_unicast was found,
+ * FLOW_DISSECT_RET_OUT_GOOD when dissector should stop after encapsulation,
+ * otherwise FLOW_DISSECT_RET_OUT_BAD
+ */
+static enum flow_dissect_ret
+__skb_flow_dissect_batadv(const struct sk_buff *skb,
+ struct flow_dissector_key_control *key_control,
+ const void *data, __be16 *p_proto, int *p_nhoff,
+ int hlen, unsigned int flags)
+{
+ struct {
+ struct batadv_unicast_packet batadv_unicast;
+ struct ethhdr eth;
+ } *hdr, _hdr;
+
+ hdr = __skb_header_pointer(skb, *p_nhoff, sizeof(_hdr), data, hlen,
+ &_hdr);
+ if (!hdr)
+ return FLOW_DISSECT_RET_OUT_BAD;
+
+ if (hdr->batadv_unicast.version != BATADV_COMPAT_VERSION)
+ return FLOW_DISSECT_RET_OUT_BAD;
+
+ if (hdr->batadv_unicast.packet_type != BATADV_UNICAST)
+ return FLOW_DISSECT_RET_OUT_BAD;
+
+ *p_proto = hdr->eth.h_proto;
+ *p_nhoff += sizeof(*hdr);
+
+ key_control->flags |= FLOW_DIS_ENCAPSULATION;
+ if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP)
+ return FLOW_DISSECT_RET_OUT_GOOD;
+
+ return FLOW_DISSECT_RET_PROTO_AGAIN;
+}
+
+static void
+__skb_flow_dissect_tcp(const struct sk_buff *skb,
+ struct flow_dissector *flow_dissector,
+ void *target_container, const void *data,
+ int thoff, int hlen)
+{
+ struct flow_dissector_key_tcp *key_tcp;
+ struct tcphdr *th, _th;
+
+ if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_TCP))
+ return;
+
+ th = __skb_header_pointer(skb, thoff, sizeof(_th), data, hlen, &_th);
+ if (!th)
+ return;
+
+ if (unlikely(__tcp_hdrlen(th) < sizeof(_th)))
+ return;
+
+ key_tcp = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_TCP,
+ target_container);
+ key_tcp->flags = (*(__be16 *) &tcp_flag_word(th) & htons(0x0FFF));
+}
+
+static void
+__skb_flow_dissect_ports(const struct sk_buff *skb,
+ struct flow_dissector *flow_dissector,
+ void *target_container, const void *data,
+ int nhoff, u8 ip_proto, int hlen)
+{
+ struct flow_dissector_key_ports_range *key_ports_range = NULL;
+ struct flow_dissector_key_ports *key_ports = NULL;
+ __be32 ports;
+
+ if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS))
+ key_ports = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_PORTS,
+ target_container);
+
+ if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS_RANGE))
+ key_ports_range = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_PORTS_RANGE,
+ target_container);
+
+ if (!key_ports && !key_ports_range)
+ return;
+
+ ports = skb_flow_get_ports(skb, nhoff, ip_proto, data, hlen);
+
+ if (key_ports)
+ key_ports->ports = ports;
+
+ if (key_ports_range)
+ key_ports_range->tp.ports = ports;
+}
+
+static void
+__skb_flow_dissect_ipv4(const struct sk_buff *skb,
+ struct flow_dissector *flow_dissector,
+ void *target_container, const void *data,
+ const struct iphdr *iph)
+{
+ struct flow_dissector_key_ip *key_ip;
+
+ if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IP))
+ return;
+
+ key_ip = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_IP,
+ target_container);
+ key_ip->tos = iph->tos;
+ key_ip->ttl = iph->ttl;
+}
+
+static void
+__skb_flow_dissect_ipv6(const struct sk_buff *skb,
+ struct flow_dissector *flow_dissector,
+ void *target_container, const void *data,
+ const struct ipv6hdr *iph)
+{
+ struct flow_dissector_key_ip *key_ip;
+
+ if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IP))
+ return;
+
+ key_ip = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_IP,
+ target_container);
+ key_ip->tos = ipv6_get_dsfield(iph);
+ key_ip->ttl = iph->hop_limit;
+}
+
+/* Maximum number of protocol headers that can be parsed in
+ * __skb_flow_dissect
+ */
+#define MAX_FLOW_DISSECT_HDRS 15
+
+static bool skb_flow_dissect_allowed(int *num_hdrs)
+{
+ ++*num_hdrs;
+
+ return (*num_hdrs <= MAX_FLOW_DISSECT_HDRS);
+}
+
+static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys,
+ struct flow_dissector *flow_dissector,
+ void *target_container)
+{
+ struct flow_dissector_key_ports_range *key_ports_range = NULL;
+ struct flow_dissector_key_ports *key_ports = NULL;
+ struct flow_dissector_key_control *key_control;
+ struct flow_dissector_key_basic *key_basic;
+ struct flow_dissector_key_addrs *key_addrs;
+ struct flow_dissector_key_tags *key_tags;
+
+ key_control = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_CONTROL,
+ target_container);
+ key_control->thoff = flow_keys->thoff;
+ if (flow_keys->is_frag)
+ key_control->flags |= FLOW_DIS_IS_FRAGMENT;
+ if (flow_keys->is_first_frag)
+ key_control->flags |= FLOW_DIS_FIRST_FRAG;
+ if (flow_keys->is_encap)
+ key_control->flags |= FLOW_DIS_ENCAPSULATION;
+
+ key_basic = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_BASIC,
+ target_container);
+ key_basic->n_proto = flow_keys->n_proto;
+ key_basic->ip_proto = flow_keys->ip_proto;
+
+ if (flow_keys->addr_proto == ETH_P_IP &&
+ dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPV4_ADDRS)) {
+ key_addrs = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_IPV4_ADDRS,
+ target_container);
+ key_addrs->v4addrs.src = flow_keys->ipv4_src;
+ key_addrs->v4addrs.dst = flow_keys->ipv4_dst;
+ key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+ } else if (flow_keys->addr_proto == ETH_P_IPV6 &&
+ dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_IPV6_ADDRS)) {
+ key_addrs = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_IPV6_ADDRS,
+ target_container);
+ memcpy(&key_addrs->v6addrs.src, &flow_keys->ipv6_src,
+ sizeof(key_addrs->v6addrs.src));
+ memcpy(&key_addrs->v6addrs.dst, &flow_keys->ipv6_dst,
+ sizeof(key_addrs->v6addrs.dst));
+ key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+ }
+
+ if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) {
+ key_ports = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_PORTS,
+ target_container);
+ key_ports->src = flow_keys->sport;
+ key_ports->dst = flow_keys->dport;
+ }
+ if (dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_PORTS_RANGE)) {
+ key_ports_range = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_PORTS_RANGE,
+ target_container);
+ key_ports_range->tp.src = flow_keys->sport;
+ key_ports_range->tp.dst = flow_keys->dport;
+ }
+
+ if (dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_FLOW_LABEL)) {
+ key_tags = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_FLOW_LABEL,
+ target_container);
+ key_tags->flow_label = ntohl(flow_keys->flow_label);
+ }
+}
+
+u32 bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx,
+ __be16 proto, int nhoff, int hlen, unsigned int flags)
+{
+ struct bpf_flow_keys *flow_keys = ctx->flow_keys;
+ u32 result;
+
+ /* Pass parameters to the BPF program */
+ memset(flow_keys, 0, sizeof(*flow_keys));
+ flow_keys->n_proto = proto;
+ flow_keys->nhoff = nhoff;
+ flow_keys->thoff = flow_keys->nhoff;
+
+ BUILD_BUG_ON((int)BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG !=
+ (int)FLOW_DISSECTOR_F_PARSE_1ST_FRAG);
+ BUILD_BUG_ON((int)BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL !=
+ (int)FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
+ BUILD_BUG_ON((int)BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP !=
+ (int)FLOW_DISSECTOR_F_STOP_AT_ENCAP);
+ flow_keys->flags = flags;
+
+ result = bpf_prog_run_pin_on_cpu(prog, ctx);
+
+ flow_keys->nhoff = clamp_t(u16, flow_keys->nhoff, nhoff, hlen);
+ flow_keys->thoff = clamp_t(u16, flow_keys->thoff,
+ flow_keys->nhoff, hlen);
+
+ return result;
+}
+
+static bool is_pppoe_ses_hdr_valid(const struct pppoe_hdr *hdr)
+{
+ return hdr->ver == 1 && hdr->type == 1 && hdr->code == 0;
+}
+
+/**
+ * __skb_flow_dissect - extract the flow_keys struct and return it
+ * @net: associated network namespace, derived from @skb if NULL
+ * @skb: sk_buff to extract the flow from, can be NULL if the rest are specified
+ * @flow_dissector: list of keys to dissect
+ * @target_container: target structure to put dissected values into
+ * @data: raw buffer pointer to the packet, if NULL use skb->data
+ * @proto: protocol for which to get the flow, if @data is NULL use skb->protocol
+ * @nhoff: network header offset, if @data is NULL use skb_network_offset(skb)
+ * @hlen: packet header length, if @data is NULL use skb_headlen(skb)
+ * @flags: flags that control the dissection process, e.g.
+ * FLOW_DISSECTOR_F_STOP_AT_ENCAP.
+ *
+ * The function will try to retrieve individual keys into target specified
+ * by flow_dissector from either the skbuff or a raw buffer specified by the
+ * rest parameters.
+ *
+ * Caller must take care of zeroing target container memory.
+ */
+bool __skb_flow_dissect(const struct net *net,
+ const struct sk_buff *skb,
+ struct flow_dissector *flow_dissector,
+ void *target_container, const void *data,
+ __be16 proto, int nhoff, int hlen, unsigned int flags)
+{
+ struct flow_dissector_key_control *key_control;
+ struct flow_dissector_key_basic *key_basic;
+ struct flow_dissector_key_addrs *key_addrs;
+ struct flow_dissector_key_tags *key_tags;
+ struct flow_dissector_key_vlan *key_vlan;
+ enum flow_dissect_ret fdret;
+ enum flow_dissector_key_id dissector_vlan = FLOW_DISSECTOR_KEY_MAX;
+ bool mpls_el = false;
+ int mpls_lse = 0;
+ int num_hdrs = 0;
+ u8 ip_proto = 0;
+ bool ret;
+
+ if (!data) {
+ data = skb->data;
+ proto = skb_vlan_tag_present(skb) ?
+ skb->vlan_proto : skb->protocol;
+ nhoff = skb_network_offset(skb);
+ hlen = skb_headlen(skb);
+#if IS_ENABLED(CONFIG_NET_DSA)
+ if (unlikely(skb->dev && netdev_uses_dsa(skb->dev) &&
+ proto == htons(ETH_P_XDSA))) {
+ struct metadata_dst *md_dst = skb_metadata_dst(skb);
+ const struct dsa_device_ops *ops;
+ int offset = 0;
+
+ ops = skb->dev->dsa_ptr->tag_ops;
+ /* Only DSA header taggers break flow dissection */
+ if (ops->needed_headroom &&
+ (!md_dst || md_dst->type != METADATA_HW_PORT_MUX)) {
+ if (ops->flow_dissect)
+ ops->flow_dissect(skb, &proto, &offset);
+ else
+ dsa_tag_generic_flow_dissect(skb,
+ &proto,
+ &offset);
+ hlen -= offset;
+ nhoff += offset;
+ }
+ }
+#endif
+ }
+
+ /* It is ensured by skb_flow_dissector_init() that control key will
+ * be always present.
+ */
+ key_control = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_CONTROL,
+ target_container);
+
+ /* It is ensured by skb_flow_dissector_init() that basic key will
+ * be always present.
+ */
+ key_basic = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_BASIC,
+ target_container);
+
+ rcu_read_lock();
+
+ if (skb) {
+ if (!net) {
+ if (skb->dev)
+ net = dev_net_rcu(skb->dev);
+ else if (skb->sk)
+ net = sock_net(skb->sk);
+ }
+ }
+
+ DEBUG_NET_WARN_ON_ONCE(!net);
+ if (net) {
+ enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR;
+ struct bpf_prog_array *run_array;
+
+ run_array = rcu_dereference(init_net.bpf.run_array[type]);
+ if (!run_array)
+ run_array = rcu_dereference(net->bpf.run_array[type]);
+
+ if (run_array) {
+ struct bpf_flow_keys flow_keys;
+ struct bpf_flow_dissector ctx = {
+ .flow_keys = &flow_keys,
+ .data = data,
+ .data_end = data + hlen,
+ };
+ __be16 n_proto = proto;
+ struct bpf_prog *prog;
+ u32 result;
+
+ if (skb) {
+ ctx.skb = skb;
+ /* we can't use 'proto' in the skb case
+ * because it might be set to skb->vlan_proto
+ * which has been pulled from the data
+ */
+ n_proto = skb->protocol;
+ }
+
+ prog = READ_ONCE(run_array->items[0].prog);
+ result = bpf_flow_dissect(prog, &ctx, n_proto, nhoff,
+ hlen, flags);
+ if (result != BPF_FLOW_DISSECTOR_CONTINUE) {
+ __skb_flow_bpf_to_target(&flow_keys, flow_dissector,
+ target_container);
+ rcu_read_unlock();
+ return result == BPF_OK;
+ }
+ }
+ }
+
+ rcu_read_unlock();
+
+ if (dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_ETH_ADDRS)) {
+ struct ethhdr *eth = eth_hdr(skb);
+ struct flow_dissector_key_eth_addrs *key_eth_addrs;
+
+ key_eth_addrs = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_ETH_ADDRS,
+ target_container);
+ memcpy(key_eth_addrs, eth, sizeof(*key_eth_addrs));
+ }
+
+ if (dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_NUM_OF_VLANS)) {
+ struct flow_dissector_key_num_of_vlans *key_num_of_vlans;
+
+ key_num_of_vlans = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_NUM_OF_VLANS,
+ target_container);
+ key_num_of_vlans->num_of_vlans = 0;
+ }
+
+proto_again:
+ fdret = FLOW_DISSECT_RET_CONTINUE;
+
+ switch (proto) {
+ case htons(ETH_P_IP): {
+ const struct iphdr *iph;
+ struct iphdr _iph;
+
+ iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph);
+ if (!iph || iph->ihl < 5) {
+ fdret = FLOW_DISSECT_RET_OUT_BAD;
+ break;
+ }
+
+ nhoff += iph->ihl * 4;
+
+ ip_proto = iph->protocol;
+
+ if (dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_IPV4_ADDRS)) {
+ key_addrs = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_IPV4_ADDRS,
+ target_container);
+
+ memcpy(&key_addrs->v4addrs.src, &iph->saddr,
+ sizeof(key_addrs->v4addrs.src));
+ memcpy(&key_addrs->v4addrs.dst, &iph->daddr,
+ sizeof(key_addrs->v4addrs.dst));
+ key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+ }
+
+ __skb_flow_dissect_ipv4(skb, flow_dissector,
+ target_container, data, iph);
+
+ if (ip_is_fragment(iph)) {
+ key_control->flags |= FLOW_DIS_IS_FRAGMENT;
+
+ if (iph->frag_off & htons(IP_OFFSET)) {
+ fdret = FLOW_DISSECT_RET_OUT_GOOD;
+ break;
+ } else {
+ key_control->flags |= FLOW_DIS_FIRST_FRAG;
+ if (!(flags &
+ FLOW_DISSECTOR_F_PARSE_1ST_FRAG)) {
+ fdret = FLOW_DISSECT_RET_OUT_GOOD;
+ break;
+ }
+ }
+ }
+
+ break;
+ }
+ case htons(ETH_P_IPV6): {
+ const struct ipv6hdr *iph;
+ struct ipv6hdr _iph;
+
+ iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph);
+ if (!iph) {
+ fdret = FLOW_DISSECT_RET_OUT_BAD;
+ break;
+ }
+
+ ip_proto = iph->nexthdr;
+ nhoff += sizeof(struct ipv6hdr);
+
+ if (dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_IPV6_ADDRS)) {
+ key_addrs = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_IPV6_ADDRS,
+ target_container);
+
+ memcpy(&key_addrs->v6addrs.src, &iph->saddr,
+ sizeof(key_addrs->v6addrs.src));
+ memcpy(&key_addrs->v6addrs.dst, &iph->daddr,
+ sizeof(key_addrs->v6addrs.dst));
+ key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+ }
+
+ if ((dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_FLOW_LABEL) ||
+ (flags & FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL)) &&
+ ip6_flowlabel(iph)) {
+ __be32 flow_label = ip6_flowlabel(iph);
+
+ if (dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_FLOW_LABEL)) {
+ key_tags = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_FLOW_LABEL,
+ target_container);
+ key_tags->flow_label = ntohl(flow_label);
+ }
+ if (flags & FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL) {
+ fdret = FLOW_DISSECT_RET_OUT_GOOD;
+ break;
+ }
+ }
+
+ __skb_flow_dissect_ipv6(skb, flow_dissector,
+ target_container, data, iph);
+
+ break;
+ }
+ case htons(ETH_P_8021AD):
+ case htons(ETH_P_8021Q): {
+ const struct vlan_hdr *vlan = NULL;
+ struct vlan_hdr _vlan;
+ __be16 saved_vlan_tpid = proto;
+
+ if (dissector_vlan == FLOW_DISSECTOR_KEY_MAX &&
+ skb && skb_vlan_tag_present(skb)) {
+ proto = skb->protocol;
+ } else {
+ vlan = __skb_header_pointer(skb, nhoff, sizeof(_vlan),
+ data, hlen, &_vlan);
+ if (!vlan) {
+ fdret = FLOW_DISSECT_RET_OUT_BAD;
+ break;
+ }
+
+ proto = vlan->h_vlan_encapsulated_proto;
+ nhoff += sizeof(*vlan);
+ }
+
+ if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_NUM_OF_VLANS) &&
+ !(key_control->flags & FLOW_DIS_ENCAPSULATION)) {
+ struct flow_dissector_key_num_of_vlans *key_nvs;
+
+ key_nvs = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_NUM_OF_VLANS,
+ target_container);
+ key_nvs->num_of_vlans++;
+ }
+
+ if (dissector_vlan == FLOW_DISSECTOR_KEY_MAX) {
+ dissector_vlan = FLOW_DISSECTOR_KEY_VLAN;
+ } else if (dissector_vlan == FLOW_DISSECTOR_KEY_VLAN) {
+ dissector_vlan = FLOW_DISSECTOR_KEY_CVLAN;
+ } else {
+ fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
+ break;
+ }
+
+ if (dissector_uses_key(flow_dissector, dissector_vlan)) {
+ key_vlan = skb_flow_dissector_target(flow_dissector,
+ dissector_vlan,
+ target_container);
+
+ if (!vlan) {
+ key_vlan->vlan_id = skb_vlan_tag_get_id(skb);
+ key_vlan->vlan_priority = skb_vlan_tag_get_prio(skb);
+ } else {
+ key_vlan->vlan_id = ntohs(vlan->h_vlan_TCI) &
+ VLAN_VID_MASK;
+ key_vlan->vlan_priority =
+ (ntohs(vlan->h_vlan_TCI) &
+ VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT;
+ }
+ key_vlan->vlan_tpid = saved_vlan_tpid;
+ key_vlan->vlan_eth_type = proto;
+ }
+
+ fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
+ break;
+ }
+ case htons(ETH_P_PPP_SES): {
+ struct {
+ struct pppoe_hdr hdr;
+ __be16 proto;
+ } *hdr, _hdr;
+ u16 ppp_proto;
+
+ hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr);
+ if (!hdr) {
+ fdret = FLOW_DISSECT_RET_OUT_BAD;
+ break;
+ }
+
+ if (!is_pppoe_ses_hdr_valid(&hdr->hdr)) {
+ fdret = FLOW_DISSECT_RET_OUT_BAD;
+ break;
+ }
+
+ /* least significant bit of the most significant octet
+ * indicates if protocol field was compressed
+ */
+ ppp_proto = ntohs(hdr->proto);
+ if (ppp_proto & 0x0100) {
+ ppp_proto = ppp_proto >> 8;
+ nhoff += PPPOE_SES_HLEN - 1;
+ } else {
+ nhoff += PPPOE_SES_HLEN;
+ }
+
+ if (ppp_proto == PPP_IP) {
+ proto = htons(ETH_P_IP);
+ fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
+ } else if (ppp_proto == PPP_IPV6) {
+ proto = htons(ETH_P_IPV6);
+ fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
+ } else if (ppp_proto == PPP_MPLS_UC) {
+ proto = htons(ETH_P_MPLS_UC);
+ fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
+ } else if (ppp_proto == PPP_MPLS_MC) {
+ proto = htons(ETH_P_MPLS_MC);
+ fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
+ } else if (ppp_proto_is_valid(ppp_proto)) {
+ fdret = FLOW_DISSECT_RET_OUT_GOOD;
+ } else {
+ fdret = FLOW_DISSECT_RET_OUT_BAD;
+ break;
+ }
+
+ if (dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_PPPOE)) {
+ struct flow_dissector_key_pppoe *key_pppoe;
+
+ key_pppoe = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_PPPOE,
+ target_container);
+ key_pppoe->session_id = hdr->hdr.sid;
+ key_pppoe->ppp_proto = htons(ppp_proto);
+ key_pppoe->type = htons(ETH_P_PPP_SES);
+ }
+ break;
+ }
+ case htons(ETH_P_TIPC): {
+ struct tipc_basic_hdr *hdr, _hdr;
+
+ hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr),
+ data, hlen, &_hdr);
+ if (!hdr) {
+ fdret = FLOW_DISSECT_RET_OUT_BAD;
+ break;
+ }
+
+ if (dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_TIPC)) {
+ key_addrs = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_TIPC,
+ target_container);
+ key_addrs->tipckey.key = tipc_hdr_rps_key(hdr);
+ key_control->addr_type = FLOW_DISSECTOR_KEY_TIPC;
+ }
+ fdret = FLOW_DISSECT_RET_OUT_GOOD;
+ break;
+ }
+
+ case htons(ETH_P_MPLS_UC):
+ case htons(ETH_P_MPLS_MC):
+ fdret = __skb_flow_dissect_mpls(skb, flow_dissector,
+ target_container, data,
+ nhoff, hlen, mpls_lse,
+ &mpls_el);
+ nhoff += sizeof(struct mpls_label);
+ mpls_lse++;
+ break;
+ case htons(ETH_P_FCOE):
+ if ((hlen - nhoff) < FCOE_HEADER_LEN) {
+ fdret = FLOW_DISSECT_RET_OUT_BAD;
+ break;
+ }
+
+ nhoff += FCOE_HEADER_LEN;
+ fdret = FLOW_DISSECT_RET_OUT_GOOD;
+ break;
+
+ case htons(ETH_P_ARP):
+ case htons(ETH_P_RARP):
+ fdret = __skb_flow_dissect_arp(skb, flow_dissector,
+ target_container, data,
+ nhoff, hlen);
+ break;
+
+ case htons(ETH_P_BATMAN):
+ fdret = __skb_flow_dissect_batadv(skb, key_control, data,
+ &proto, &nhoff, hlen, flags);
+ break;
+
+ case htons(ETH_P_1588): {
+ struct ptp_header *hdr, _hdr;
+
+ hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data,
+ hlen, &_hdr);
+ if (!hdr) {
+ fdret = FLOW_DISSECT_RET_OUT_BAD;
+ break;
+ }
+
+ nhoff += sizeof(struct ptp_header);
+ fdret = FLOW_DISSECT_RET_OUT_GOOD;
+ break;
+ }
+
+ case htons(ETH_P_PRP):
+ case htons(ETH_P_HSR): {
+ struct hsr_tag *hdr, _hdr;
+
+ hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen,
+ &_hdr);
+ if (!hdr) {
+ fdret = FLOW_DISSECT_RET_OUT_BAD;
+ break;
+ }
+
+ proto = hdr->encap_proto;
+ nhoff += HSR_HLEN;
+ fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
+ break;
+ }
+
+ case htons(ETH_P_CFM):
+ fdret = __skb_flow_dissect_cfm(skb, flow_dissector,
+ target_container, data,
+ nhoff, hlen);
+ break;
+
+ default:
+ fdret = FLOW_DISSECT_RET_OUT_BAD;
+ break;
+ }
+
+ /* Process result of proto processing */
+ switch (fdret) {
+ case FLOW_DISSECT_RET_OUT_GOOD:
+ goto out_good;
+ case FLOW_DISSECT_RET_PROTO_AGAIN:
+ if (skb_flow_dissect_allowed(&num_hdrs))
+ goto proto_again;
+ goto out_good;
+ case FLOW_DISSECT_RET_CONTINUE:
+ case FLOW_DISSECT_RET_IPPROTO_AGAIN:
+ break;
+ case FLOW_DISSECT_RET_OUT_BAD:
+ default:
+ goto out_bad;
+ }
+
+ip_proto_again:
+ fdret = FLOW_DISSECT_RET_CONTINUE;
+
+ switch (ip_proto) {
+ case IPPROTO_GRE:
+ if (flags & FLOW_DISSECTOR_F_STOP_BEFORE_ENCAP) {
+ fdret = FLOW_DISSECT_RET_OUT_GOOD;
+ break;
+ }
+
+ fdret = __skb_flow_dissect_gre(skb, key_control, flow_dissector,
+ target_container, data,
+ &proto, &nhoff, &hlen, flags);
+ break;
+
+ case NEXTHDR_HOP:
+ case NEXTHDR_ROUTING:
+ case NEXTHDR_DEST: {
+ u8 _opthdr[2], *opthdr;
+
+ if (proto != htons(ETH_P_IPV6))
+ break;
+
+ opthdr = __skb_header_pointer(skb, nhoff, sizeof(_opthdr),
+ data, hlen, &_opthdr);
+ if (!opthdr) {
+ fdret = FLOW_DISSECT_RET_OUT_BAD;
+ break;
+ }
+
+ ip_proto = opthdr[0];
+ nhoff += (opthdr[1] + 1) << 3;
+
+ fdret = FLOW_DISSECT_RET_IPPROTO_AGAIN;
+ break;
+ }
+ case NEXTHDR_FRAGMENT: {
+ struct frag_hdr _fh, *fh;
+
+ if (proto != htons(ETH_P_IPV6))
+ break;
+
+ fh = __skb_header_pointer(skb, nhoff, sizeof(_fh),
+ data, hlen, &_fh);
+
+ if (!fh) {
+ fdret = FLOW_DISSECT_RET_OUT_BAD;
+ break;
+ }
+
+ key_control->flags |= FLOW_DIS_IS_FRAGMENT;
+
+ nhoff += sizeof(_fh);
+ ip_proto = fh->nexthdr;
+
+ if (!(fh->frag_off & htons(IP6_OFFSET))) {
+ key_control->flags |= FLOW_DIS_FIRST_FRAG;
+ if (flags & FLOW_DISSECTOR_F_PARSE_1ST_FRAG) {
+ fdret = FLOW_DISSECT_RET_IPPROTO_AGAIN;
+ break;
+ }
+ }
+
+ fdret = FLOW_DISSECT_RET_OUT_GOOD;
+ break;
+ }
+ case IPPROTO_IPIP:
+ if (flags & FLOW_DISSECTOR_F_STOP_BEFORE_ENCAP) {
+ fdret = FLOW_DISSECT_RET_OUT_GOOD;
+ break;
+ }
+
+ proto = htons(ETH_P_IP);
+
+ key_control->flags |= FLOW_DIS_ENCAPSULATION;
+ if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) {
+ fdret = FLOW_DISSECT_RET_OUT_GOOD;
+ break;
+ }
+
+ fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
+ break;
+
+ case IPPROTO_IPV6:
+ if (flags & FLOW_DISSECTOR_F_STOP_BEFORE_ENCAP) {
+ fdret = FLOW_DISSECT_RET_OUT_GOOD;
+ break;
+ }
+
+ proto = htons(ETH_P_IPV6);
+
+ key_control->flags |= FLOW_DIS_ENCAPSULATION;
+ if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) {
+ fdret = FLOW_DISSECT_RET_OUT_GOOD;
+ break;
+ }
+
+ fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
+ break;
+
+
+ case IPPROTO_MPLS:
+ proto = htons(ETH_P_MPLS_UC);
+ fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
+ break;
+
+ case IPPROTO_TCP:
+ __skb_flow_dissect_tcp(skb, flow_dissector, target_container,
+ data, nhoff, hlen);
+ break;
+
+ case IPPROTO_ICMP:
+ case IPPROTO_ICMPV6:
+ __skb_flow_dissect_icmp(skb, flow_dissector, target_container,
+ data, nhoff, hlen);
+ break;
+ case IPPROTO_L2TP:
+ __skb_flow_dissect_l2tpv3(skb, flow_dissector, target_container,
+ data, nhoff, hlen);
+ break;
+ case IPPROTO_ESP:
+ __skb_flow_dissect_esp(skb, flow_dissector, target_container,
+ data, nhoff, hlen);
+ break;
+ case IPPROTO_AH:
+ __skb_flow_dissect_ah(skb, flow_dissector, target_container,
+ data, nhoff, hlen);
+ break;
+ default:
+ break;
+ }
+
+ if (!(key_control->flags & FLOW_DIS_IS_FRAGMENT))
+ __skb_flow_dissect_ports(skb, flow_dissector, target_container,
+ data, nhoff, ip_proto, hlen);
+
+ /* Process result of IP proto processing */
+ switch (fdret) {
+ case FLOW_DISSECT_RET_PROTO_AGAIN:
+ if (skb_flow_dissect_allowed(&num_hdrs))
+ goto proto_again;
+ break;
+ case FLOW_DISSECT_RET_IPPROTO_AGAIN:
+ if (skb_flow_dissect_allowed(&num_hdrs))
+ goto ip_proto_again;
+ break;
+ case FLOW_DISSECT_RET_OUT_GOOD:
+ case FLOW_DISSECT_RET_CONTINUE:
+ break;
+ case FLOW_DISSECT_RET_OUT_BAD:
+ default:
+ goto out_bad;
+ }
+
+out_good:
+ ret = true;
+
+out:
+ key_control->thoff = min_t(u16, nhoff, skb ? skb->len : hlen);
+ key_basic->n_proto = proto;
+ key_basic->ip_proto = ip_proto;
+
+ return ret;
+
+out_bad:
+ ret = false;
+ goto out;
+}
+EXPORT_SYMBOL(__skb_flow_dissect);
+
+static siphash_aligned_key_t hashrnd;
+static __always_inline void __flow_hash_secret_init(void)
+{
+ net_get_random_once(&hashrnd, sizeof(hashrnd));
+}
+
+static const void *flow_keys_hash_start(const struct flow_keys *flow)
+{
+ BUILD_BUG_ON(FLOW_KEYS_HASH_OFFSET % SIPHASH_ALIGNMENT);
+ return &flow->FLOW_KEYS_HASH_START_FIELD;
+}
+
+static inline size_t flow_keys_hash_length(const struct flow_keys *flow)
+{
+ size_t diff = FLOW_KEYS_HASH_OFFSET + sizeof(flow->addrs);
+
+ BUILD_BUG_ON((sizeof(*flow) - FLOW_KEYS_HASH_OFFSET) % sizeof(u32));
+
+ switch (flow->control.addr_type) {
+ case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
+ diff -= sizeof(flow->addrs.v4addrs);
+ break;
+ case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
+ diff -= sizeof(flow->addrs.v6addrs);
+ break;
+ case FLOW_DISSECTOR_KEY_TIPC:
+ diff -= sizeof(flow->addrs.tipckey);
+ break;
+ }
+ return sizeof(*flow) - diff;
+}
+
+__be32 flow_get_u32_src(const struct flow_keys *flow)
+{
+ switch (flow->control.addr_type) {
+ case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
+ return flow->addrs.v4addrs.src;
+ case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
+ return (__force __be32)ipv6_addr_hash(
+ &flow->addrs.v6addrs.src);
+ case FLOW_DISSECTOR_KEY_TIPC:
+ return flow->addrs.tipckey.key;
+ default:
+ return 0;
+ }
+}
+EXPORT_SYMBOL(flow_get_u32_src);
+
+__be32 flow_get_u32_dst(const struct flow_keys *flow)
+{
+ switch (flow->control.addr_type) {
+ case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
+ return flow->addrs.v4addrs.dst;
+ case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
+ return (__force __be32)ipv6_addr_hash(
+ &flow->addrs.v6addrs.dst);
+ default:
+ return 0;
+ }
+}
+EXPORT_SYMBOL(flow_get_u32_dst);
+
+/* Sort the source and destination IP and the ports,
+ * to have consistent hash within the two directions
+ */
+static inline void __flow_hash_consistentify(struct flow_keys *keys)
+{
+ int addr_diff, i;
+
+ switch (keys->control.addr_type) {
+ case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
+ if ((__force u32)keys->addrs.v4addrs.dst <
+ (__force u32)keys->addrs.v4addrs.src)
+ swap(keys->addrs.v4addrs.src, keys->addrs.v4addrs.dst);
+
+ if ((__force u16)keys->ports.dst <
+ (__force u16)keys->ports.src) {
+ swap(keys->ports.src, keys->ports.dst);
+ }
+ break;
+ case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
+ addr_diff = memcmp(&keys->addrs.v6addrs.dst,
+ &keys->addrs.v6addrs.src,
+ sizeof(keys->addrs.v6addrs.dst));
+ if (addr_diff < 0) {
+ for (i = 0; i < 4; i++)
+ swap(keys->addrs.v6addrs.src.s6_addr32[i],
+ keys->addrs.v6addrs.dst.s6_addr32[i]);
+ }
+ if ((__force u16)keys->ports.dst <
+ (__force u16)keys->ports.src) {
+ swap(keys->ports.src, keys->ports.dst);
+ }
+ break;
+ }
+}
+
+static inline u32 __flow_hash_from_keys(struct flow_keys *keys,
+ const siphash_key_t *keyval)
+{
+ u32 hash;
+
+ __flow_hash_consistentify(keys);
+
+ hash = siphash(flow_keys_hash_start(keys),
+ flow_keys_hash_length(keys), keyval);
+ if (!hash)
+ hash = 1;
+
+ return hash;
+}
+
+u32 flow_hash_from_keys(struct flow_keys *keys)
+{
+ __flow_hash_secret_init();
+ return __flow_hash_from_keys(keys, &hashrnd);
+}
+EXPORT_SYMBOL(flow_hash_from_keys);
+
+u32 flow_hash_from_keys_seed(struct flow_keys *keys,
+ const siphash_key_t *keyval)
+{
+ return __flow_hash_from_keys(keys, keyval);
+}
+EXPORT_SYMBOL(flow_hash_from_keys_seed);
+
+static inline u32 ___skb_get_hash(const struct sk_buff *skb,
+ struct flow_keys *keys,
+ const siphash_key_t *keyval)
+{
+ skb_flow_dissect_flow_keys(skb, keys,
+ FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
+
+ return __flow_hash_from_keys(keys, keyval);
+}
+
+struct _flow_keys_digest_data {
+ __be16 n_proto;
+ u8 ip_proto;
+ u8 padding;
+ __be32 ports;
+ __be32 src;
+ __be32 dst;
+};
+
+void make_flow_keys_digest(struct flow_keys_digest *digest,
+ const struct flow_keys *flow)
+{
+ struct _flow_keys_digest_data *data =
+ (struct _flow_keys_digest_data *)digest;
+
+ BUILD_BUG_ON(sizeof(*data) > sizeof(*digest));
+
+ memset(digest, 0, sizeof(*digest));
+
+ data->n_proto = flow->basic.n_proto;
+ data->ip_proto = flow->basic.ip_proto;
+ data->ports = flow->ports.ports;
+ data->src = flow->addrs.v4addrs.src;
+ data->dst = flow->addrs.v4addrs.dst;
+}
+EXPORT_SYMBOL(make_flow_keys_digest);
+
+static struct flow_dissector flow_keys_dissector_symmetric __read_mostly;
+
+u32 __skb_get_hash_symmetric_net(const struct net *net, const struct sk_buff *skb)
+{
+ struct flow_keys keys;
+
+ __flow_hash_secret_init();
+
+ memset(&keys, 0, sizeof(keys));
+ __skb_flow_dissect(net, skb, &flow_keys_dissector_symmetric,
+ &keys, NULL, 0, 0, 0, 0);
+
+ return __flow_hash_from_keys(&keys, &hashrnd);
+}
+EXPORT_SYMBOL_GPL(__skb_get_hash_symmetric_net);
+
+/**
+ * __skb_get_hash_net: calculate a flow hash
+ * @net: associated network namespace, derived from @skb if NULL
+ * @skb: sk_buff to calculate flow hash from
+ *
+ * This function calculates a flow hash based on src/dst addresses
+ * and src/dst port numbers. Sets hash in skb to non-zero hash value
+ * on success, zero indicates no valid hash. Also, sets l4_hash in skb
+ * if hash is a canonical 4-tuple hash over transport ports.
+ */
+void __skb_get_hash_net(const struct net *net, struct sk_buff *skb)
+{
+ struct flow_keys keys;
+ u32 hash;
+
+ memset(&keys, 0, sizeof(keys));
+
+ __skb_flow_dissect(net, skb, &flow_keys_dissector,
+ &keys, NULL, 0, 0, 0,
+ FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
+
+ __flow_hash_secret_init();
+
+ hash = __flow_hash_from_keys(&keys, &hashrnd);
+
+ __skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys));
+}
+EXPORT_SYMBOL(__skb_get_hash_net);
+
+__u32 skb_get_hash_perturb(const struct sk_buff *skb,
+ const siphash_key_t *perturb)
+{
+ struct flow_keys keys;
+
+ return ___skb_get_hash(skb, &keys, perturb);
+}
+EXPORT_SYMBOL(skb_get_hash_perturb);
+
+u32 __skb_get_poff(const struct sk_buff *skb, const void *data,
+ const struct flow_keys_basic *keys, int hlen)
+{
+ u32 poff = keys->control.thoff;
+
+ /* skip L4 headers for fragments after the first */
+ if ((keys->control.flags & FLOW_DIS_IS_FRAGMENT) &&
+ !(keys->control.flags & FLOW_DIS_FIRST_FRAG))
+ return poff;
+
+ switch (keys->basic.ip_proto) {
+ case IPPROTO_TCP: {
+ /* access doff as u8 to avoid unaligned access */
+ const u8 *doff;
+ u8 _doff;
+
+ doff = __skb_header_pointer(skb, poff + 12, sizeof(_doff),
+ data, hlen, &_doff);
+ if (!doff)
+ return poff;
+
+ poff += max_t(u32, sizeof(struct tcphdr), (*doff & 0xF0) >> 2);
+ break;
+ }
+ case IPPROTO_UDP:
+ case IPPROTO_UDPLITE:
+ poff += sizeof(struct udphdr);
+ break;
+ /* For the rest, we do not really care about header
+ * extensions at this point for now.
+ */
+ case IPPROTO_ICMP:
+ poff += sizeof(struct icmphdr);
+ break;
+ case IPPROTO_ICMPV6:
+ poff += sizeof(struct icmp6hdr);
+ break;
+ case IPPROTO_IGMP:
+ poff += sizeof(struct igmphdr);
+ break;
+ case IPPROTO_DCCP:
+ poff += sizeof(struct dccp_hdr);
+ break;
+ case IPPROTO_SCTP:
+ poff += sizeof(struct sctphdr);
+ break;
+ }
+
+ return poff;
+}
+
+/**
+ * skb_get_poff - get the offset to the payload
+ * @skb: sk_buff to get the payload offset from
+ *
+ * The function will get the offset to the payload as far as it could
+ * be dissected. The main user is currently BPF, so that we can dynamically
+ * truncate packets without needing to push actual payload to the user
+ * space and can analyze headers only, instead.
+ */
+u32 skb_get_poff(const struct sk_buff *skb)
+{
+ struct flow_keys_basic keys;
+
+ if (!skb_flow_dissect_flow_keys_basic(NULL, skb, &keys,
+ NULL, 0, 0, 0, 0))
+ return 0;
+
+ return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb));
+}
+
+__u32 __get_hash_from_flowi6(const struct flowi6 *fl6, struct flow_keys *keys)
+{
+ memset(keys, 0, sizeof(*keys));
+
+ memcpy(&keys->addrs.v6addrs.src, &fl6->saddr,
+ sizeof(keys->addrs.v6addrs.src));
+ memcpy(&keys->addrs.v6addrs.dst, &fl6->daddr,
+ sizeof(keys->addrs.v6addrs.dst));
+ keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+ keys->ports.src = fl6->fl6_sport;
+ keys->ports.dst = fl6->fl6_dport;
+ keys->keyid.keyid = fl6->fl6_gre_key;
+ keys->tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
+ keys->basic.ip_proto = fl6->flowi6_proto;
+
+ return flow_hash_from_keys(keys);
+}
+EXPORT_SYMBOL(__get_hash_from_flowi6);
+
+static const struct flow_dissector_key flow_keys_dissector_keys[] = {
+ {
+ .key_id = FLOW_DISSECTOR_KEY_CONTROL,
+ .offset = offsetof(struct flow_keys, control),
+ },
+ {
+ .key_id = FLOW_DISSECTOR_KEY_BASIC,
+ .offset = offsetof(struct flow_keys, basic),
+ },
+ {
+ .key_id = FLOW_DISSECTOR_KEY_IPV4_ADDRS,
+ .offset = offsetof(struct flow_keys, addrs.v4addrs),
+ },
+ {
+ .key_id = FLOW_DISSECTOR_KEY_IPV6_ADDRS,
+ .offset = offsetof(struct flow_keys, addrs.v6addrs),
+ },
+ {
+ .key_id = FLOW_DISSECTOR_KEY_TIPC,
+ .offset = offsetof(struct flow_keys, addrs.tipckey),
+ },
+ {
+ .key_id = FLOW_DISSECTOR_KEY_PORTS,
+ .offset = offsetof(struct flow_keys, ports),
+ },
+ {
+ .key_id = FLOW_DISSECTOR_KEY_VLAN,
+ .offset = offsetof(struct flow_keys, vlan),
+ },
+ {
+ .key_id = FLOW_DISSECTOR_KEY_FLOW_LABEL,
+ .offset = offsetof(struct flow_keys, tags),
+ },
+ {
+ .key_id = FLOW_DISSECTOR_KEY_GRE_KEYID,
+ .offset = offsetof(struct flow_keys, keyid),
+ },
+};
+
+static const struct flow_dissector_key flow_keys_dissector_symmetric_keys[] = {
+ {
+ .key_id = FLOW_DISSECTOR_KEY_CONTROL,
+ .offset = offsetof(struct flow_keys, control),
+ },
+ {
+ .key_id = FLOW_DISSECTOR_KEY_BASIC,
+ .offset = offsetof(struct flow_keys, basic),
+ },
+ {
+ .key_id = FLOW_DISSECTOR_KEY_IPV4_ADDRS,
+ .offset = offsetof(struct flow_keys, addrs.v4addrs),
+ },
+ {
+ .key_id = FLOW_DISSECTOR_KEY_IPV6_ADDRS,
+ .offset = offsetof(struct flow_keys, addrs.v6addrs),
+ },
+ {
+ .key_id = FLOW_DISSECTOR_KEY_PORTS,
+ .offset = offsetof(struct flow_keys, ports),
+ },
+};
+
+static const struct flow_dissector_key flow_keys_basic_dissector_keys[] = {
+ {
+ .key_id = FLOW_DISSECTOR_KEY_CONTROL,
+ .offset = offsetof(struct flow_keys, control),
+ },
+ {
+ .key_id = FLOW_DISSECTOR_KEY_BASIC,
+ .offset = offsetof(struct flow_keys, basic),
+ },
+};
+
+struct flow_dissector flow_keys_dissector __read_mostly;
+EXPORT_SYMBOL(flow_keys_dissector);
+
+struct flow_dissector flow_keys_basic_dissector __read_mostly;
+EXPORT_SYMBOL(flow_keys_basic_dissector);
+
+static int __init init_default_flow_dissectors(void)
+{
+ skb_flow_dissector_init(&flow_keys_dissector,
+ flow_keys_dissector_keys,
+ ARRAY_SIZE(flow_keys_dissector_keys));
+ skb_flow_dissector_init(&flow_keys_dissector_symmetric,
+ flow_keys_dissector_symmetric_keys,
+ ARRAY_SIZE(flow_keys_dissector_symmetric_keys));
+ skb_flow_dissector_init(&flow_keys_basic_dissector,
+ flow_keys_basic_dissector_keys,
+ ARRAY_SIZE(flow_keys_basic_dissector_keys));
+ return 0;
+}
+core_initcall(init_default_flow_dissectors);
diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c
new file mode 100644
index 000000000000..bc5169482710
--- /dev/null
+++ b/net/core/flow_offload.c
@@ -0,0 +1,638 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <net/act_api.h>
+#include <net/flow_offload.h>
+#include <linux/rtnetlink.h>
+#include <linux/mutex.h>
+#include <linux/rhashtable.h>
+
+struct flow_rule *flow_rule_alloc(unsigned int num_actions)
+{
+ struct flow_rule *rule;
+ int i;
+
+ rule = kzalloc(struct_size(rule, action.entries, num_actions),
+ GFP_KERNEL);
+ if (!rule)
+ return NULL;
+
+ rule->action.num_entries = num_actions;
+ /* Pre-fill each action hw_stats with DONT_CARE.
+ * Caller can override this if it wants stats for a given action.
+ */
+ for (i = 0; i < num_actions; i++)
+ rule->action.entries[i].hw_stats = FLOW_ACTION_HW_STATS_DONT_CARE;
+
+ return rule;
+}
+EXPORT_SYMBOL(flow_rule_alloc);
+
+struct flow_offload_action *offload_action_alloc(unsigned int num_actions)
+{
+ struct flow_offload_action *fl_action;
+ int i;
+
+ fl_action = kzalloc(struct_size(fl_action, action.entries, num_actions),
+ GFP_KERNEL);
+ if (!fl_action)
+ return NULL;
+
+ fl_action->action.num_entries = num_actions;
+ /* Pre-fill each action hw_stats with DONT_CARE.
+ * Caller can override this if it wants stats for a given action.
+ */
+ for (i = 0; i < num_actions; i++)
+ fl_action->action.entries[i].hw_stats = FLOW_ACTION_HW_STATS_DONT_CARE;
+
+ return fl_action;
+}
+
+#define FLOW_DISSECTOR_MATCH(__rule, __type, __out) \
+ const struct flow_match *__m = &(__rule)->match; \
+ struct flow_dissector *__d = (__m)->dissector; \
+ \
+ (__out)->key = skb_flow_dissector_target(__d, __type, (__m)->key); \
+ (__out)->mask = skb_flow_dissector_target(__d, __type, (__m)->mask); \
+
+void flow_rule_match_meta(const struct flow_rule *rule,
+ struct flow_match_meta *out)
+{
+ FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_META, out);
+}
+EXPORT_SYMBOL(flow_rule_match_meta);
+
+void flow_rule_match_basic(const struct flow_rule *rule,
+ struct flow_match_basic *out)
+{
+ FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_BASIC, out);
+}
+EXPORT_SYMBOL(flow_rule_match_basic);
+
+void flow_rule_match_control(const struct flow_rule *rule,
+ struct flow_match_control *out)
+{
+ FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_CONTROL, out);
+}
+EXPORT_SYMBOL(flow_rule_match_control);
+
+void flow_rule_match_eth_addrs(const struct flow_rule *rule,
+ struct flow_match_eth_addrs *out)
+{
+ FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ETH_ADDRS, out);
+}
+EXPORT_SYMBOL(flow_rule_match_eth_addrs);
+
+void flow_rule_match_vlan(const struct flow_rule *rule,
+ struct flow_match_vlan *out)
+{
+ FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_VLAN, out);
+}
+EXPORT_SYMBOL(flow_rule_match_vlan);
+
+void flow_rule_match_cvlan(const struct flow_rule *rule,
+ struct flow_match_vlan *out)
+{
+ FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_CVLAN, out);
+}
+EXPORT_SYMBOL(flow_rule_match_cvlan);
+
+void flow_rule_match_arp(const struct flow_rule *rule,
+ struct flow_match_arp *out)
+{
+ FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ARP, out);
+}
+EXPORT_SYMBOL(flow_rule_match_arp);
+
+void flow_rule_match_ipv4_addrs(const struct flow_rule *rule,
+ struct flow_match_ipv4_addrs *out)
+{
+ FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_IPV4_ADDRS, out);
+}
+EXPORT_SYMBOL(flow_rule_match_ipv4_addrs);
+
+void flow_rule_match_ipv6_addrs(const struct flow_rule *rule,
+ struct flow_match_ipv6_addrs *out)
+{
+ FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_IPV6_ADDRS, out);
+}
+EXPORT_SYMBOL(flow_rule_match_ipv6_addrs);
+
+void flow_rule_match_ip(const struct flow_rule *rule,
+ struct flow_match_ip *out)
+{
+ FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_IP, out);
+}
+EXPORT_SYMBOL(flow_rule_match_ip);
+
+void flow_rule_match_ports(const struct flow_rule *rule,
+ struct flow_match_ports *out)
+{
+ FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_PORTS, out);
+}
+EXPORT_SYMBOL(flow_rule_match_ports);
+
+void flow_rule_match_ports_range(const struct flow_rule *rule,
+ struct flow_match_ports_range *out)
+{
+ FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_PORTS_RANGE, out);
+}
+EXPORT_SYMBOL(flow_rule_match_ports_range);
+
+void flow_rule_match_tcp(const struct flow_rule *rule,
+ struct flow_match_tcp *out)
+{
+ FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_TCP, out);
+}
+EXPORT_SYMBOL(flow_rule_match_tcp);
+
+void flow_rule_match_ipsec(const struct flow_rule *rule,
+ struct flow_match_ipsec *out)
+{
+ FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_IPSEC, out);
+}
+EXPORT_SYMBOL(flow_rule_match_ipsec);
+
+void flow_rule_match_icmp(const struct flow_rule *rule,
+ struct flow_match_icmp *out)
+{
+ FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ICMP, out);
+}
+EXPORT_SYMBOL(flow_rule_match_icmp);
+
+void flow_rule_match_mpls(const struct flow_rule *rule,
+ struct flow_match_mpls *out)
+{
+ FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_MPLS, out);
+}
+EXPORT_SYMBOL(flow_rule_match_mpls);
+
+void flow_rule_match_enc_control(const struct flow_rule *rule,
+ struct flow_match_control *out)
+{
+ FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ENC_CONTROL, out);
+}
+EXPORT_SYMBOL(flow_rule_match_enc_control);
+
+void flow_rule_match_enc_ipv4_addrs(const struct flow_rule *rule,
+ struct flow_match_ipv4_addrs *out)
+{
+ FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, out);
+}
+EXPORT_SYMBOL(flow_rule_match_enc_ipv4_addrs);
+
+void flow_rule_match_enc_ipv6_addrs(const struct flow_rule *rule,
+ struct flow_match_ipv6_addrs *out)
+{
+ FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS, out);
+}
+EXPORT_SYMBOL(flow_rule_match_enc_ipv6_addrs);
+
+void flow_rule_match_enc_ip(const struct flow_rule *rule,
+ struct flow_match_ip *out)
+{
+ FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ENC_IP, out);
+}
+EXPORT_SYMBOL(flow_rule_match_enc_ip);
+
+void flow_rule_match_enc_ports(const struct flow_rule *rule,
+ struct flow_match_ports *out)
+{
+ FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ENC_PORTS, out);
+}
+EXPORT_SYMBOL(flow_rule_match_enc_ports);
+
+void flow_rule_match_enc_keyid(const struct flow_rule *rule,
+ struct flow_match_enc_keyid *out)
+{
+ FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ENC_KEYID, out);
+}
+EXPORT_SYMBOL(flow_rule_match_enc_keyid);
+
+void flow_rule_match_enc_opts(const struct flow_rule *rule,
+ struct flow_match_enc_opts *out)
+{
+ FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ENC_OPTS, out);
+}
+EXPORT_SYMBOL(flow_rule_match_enc_opts);
+
+struct flow_action_cookie *flow_action_cookie_create(void *data,
+ unsigned int len,
+ gfp_t gfp)
+{
+ struct flow_action_cookie *cookie;
+
+ cookie = kmalloc(sizeof(*cookie) + len, gfp);
+ if (!cookie)
+ return NULL;
+ cookie->cookie_len = len;
+ memcpy(cookie->cookie, data, len);
+ return cookie;
+}
+EXPORT_SYMBOL(flow_action_cookie_create);
+
+void flow_action_cookie_destroy(struct flow_action_cookie *cookie)
+{
+ kfree(cookie);
+}
+EXPORT_SYMBOL(flow_action_cookie_destroy);
+
+void flow_rule_match_ct(const struct flow_rule *rule,
+ struct flow_match_ct *out)
+{
+ FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_CT, out);
+}
+EXPORT_SYMBOL(flow_rule_match_ct);
+
+void flow_rule_match_pppoe(const struct flow_rule *rule,
+ struct flow_match_pppoe *out)
+{
+ FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_PPPOE, out);
+}
+EXPORT_SYMBOL(flow_rule_match_pppoe);
+
+void flow_rule_match_l2tpv3(const struct flow_rule *rule,
+ struct flow_match_l2tpv3 *out)
+{
+ FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_L2TPV3, out);
+}
+EXPORT_SYMBOL(flow_rule_match_l2tpv3);
+
+struct flow_block_cb *flow_block_cb_alloc(flow_setup_cb_t *cb,
+ void *cb_ident, void *cb_priv,
+ void (*release)(void *cb_priv))
+{
+ struct flow_block_cb *block_cb;
+
+ block_cb = kzalloc(sizeof(*block_cb), GFP_KERNEL);
+ if (!block_cb)
+ return ERR_PTR(-ENOMEM);
+
+ block_cb->cb = cb;
+ block_cb->cb_ident = cb_ident;
+ block_cb->cb_priv = cb_priv;
+ block_cb->release = release;
+
+ return block_cb;
+}
+EXPORT_SYMBOL(flow_block_cb_alloc);
+
+void flow_block_cb_free(struct flow_block_cb *block_cb)
+{
+ if (block_cb->release)
+ block_cb->release(block_cb->cb_priv);
+
+ kfree(block_cb);
+}
+EXPORT_SYMBOL(flow_block_cb_free);
+
+struct flow_block_cb *flow_block_cb_lookup(struct flow_block *block,
+ flow_setup_cb_t *cb, void *cb_ident)
+{
+ struct flow_block_cb *block_cb;
+
+ list_for_each_entry(block_cb, &block->cb_list, list) {
+ if (block_cb->cb == cb &&
+ block_cb->cb_ident == cb_ident)
+ return block_cb;
+ }
+
+ return NULL;
+}
+EXPORT_SYMBOL(flow_block_cb_lookup);
+
+void *flow_block_cb_priv(struct flow_block_cb *block_cb)
+{
+ return block_cb->cb_priv;
+}
+EXPORT_SYMBOL(flow_block_cb_priv);
+
+void flow_block_cb_incref(struct flow_block_cb *block_cb)
+{
+ block_cb->refcnt++;
+}
+EXPORT_SYMBOL(flow_block_cb_incref);
+
+unsigned int flow_block_cb_decref(struct flow_block_cb *block_cb)
+{
+ return --block_cb->refcnt;
+}
+EXPORT_SYMBOL(flow_block_cb_decref);
+
+bool flow_block_cb_is_busy(flow_setup_cb_t *cb, void *cb_ident,
+ struct list_head *driver_block_list)
+{
+ struct flow_block_cb *block_cb;
+
+ list_for_each_entry(block_cb, driver_block_list, driver_list) {
+ if (block_cb->cb == cb &&
+ block_cb->cb_ident == cb_ident)
+ return true;
+ }
+
+ return false;
+}
+EXPORT_SYMBOL(flow_block_cb_is_busy);
+
+int flow_block_cb_setup_simple(struct flow_block_offload *f,
+ struct list_head *driver_block_list,
+ flow_setup_cb_t *cb,
+ void *cb_ident, void *cb_priv,
+ bool ingress_only)
+{
+ struct flow_block_cb *block_cb;
+
+ if (ingress_only &&
+ f->binder_type != FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS)
+ return -EOPNOTSUPP;
+
+ f->driver_block_list = driver_block_list;
+
+ switch (f->command) {
+ case FLOW_BLOCK_BIND:
+ if (flow_block_cb_is_busy(cb, cb_ident, driver_block_list))
+ return -EBUSY;
+
+ block_cb = flow_block_cb_alloc(cb, cb_ident, cb_priv, NULL);
+ if (IS_ERR(block_cb))
+ return PTR_ERR(block_cb);
+
+ flow_block_cb_add(block_cb, f);
+ list_add_tail(&block_cb->driver_list, driver_block_list);
+ return 0;
+ case FLOW_BLOCK_UNBIND:
+ block_cb = flow_block_cb_lookup(f->block, cb, cb_ident);
+ if (!block_cb)
+ return -ENOENT;
+
+ flow_block_cb_remove(block_cb, f);
+ list_del(&block_cb->driver_list);
+ return 0;
+ default:
+ return -EOPNOTSUPP;
+ }
+}
+EXPORT_SYMBOL(flow_block_cb_setup_simple);
+
+static DEFINE_MUTEX(flow_indr_block_lock);
+static LIST_HEAD(flow_block_indr_list);
+static LIST_HEAD(flow_block_indr_dev_list);
+static LIST_HEAD(flow_indir_dev_list);
+
+struct flow_indr_dev {
+ struct list_head list;
+ flow_indr_block_bind_cb_t *cb;
+ void *cb_priv;
+ refcount_t refcnt;
+};
+
+static struct flow_indr_dev *flow_indr_dev_alloc(flow_indr_block_bind_cb_t *cb,
+ void *cb_priv)
+{
+ struct flow_indr_dev *indr_dev;
+
+ indr_dev = kmalloc(sizeof(*indr_dev), GFP_KERNEL);
+ if (!indr_dev)
+ return NULL;
+
+ indr_dev->cb = cb;
+ indr_dev->cb_priv = cb_priv;
+ refcount_set(&indr_dev->refcnt, 1);
+
+ return indr_dev;
+}
+
+struct flow_indir_dev_info {
+ void *data;
+ struct net_device *dev;
+ struct Qdisc *sch;
+ enum tc_setup_type type;
+ void (*cleanup)(struct flow_block_cb *block_cb);
+ struct list_head list;
+ enum flow_block_command command;
+ enum flow_block_binder_type binder_type;
+ struct list_head *cb_list;
+};
+
+static void existing_qdiscs_register(flow_indr_block_bind_cb_t *cb, void *cb_priv)
+{
+ struct flow_block_offload bo;
+ struct flow_indir_dev_info *cur;
+
+ list_for_each_entry(cur, &flow_indir_dev_list, list) {
+ memset(&bo, 0, sizeof(bo));
+ bo.command = cur->command;
+ bo.binder_type = cur->binder_type;
+ INIT_LIST_HEAD(&bo.cb_list);
+ cb(cur->dev, cur->sch, cb_priv, cur->type, &bo, cur->data, cur->cleanup);
+ list_splice(&bo.cb_list, cur->cb_list);
+ }
+}
+
+int flow_indr_dev_register(flow_indr_block_bind_cb_t *cb, void *cb_priv)
+{
+ struct flow_indr_dev *indr_dev;
+
+ mutex_lock(&flow_indr_block_lock);
+ list_for_each_entry(indr_dev, &flow_block_indr_dev_list, list) {
+ if (indr_dev->cb == cb &&
+ indr_dev->cb_priv == cb_priv) {
+ refcount_inc(&indr_dev->refcnt);
+ mutex_unlock(&flow_indr_block_lock);
+ return 0;
+ }
+ }
+
+ indr_dev = flow_indr_dev_alloc(cb, cb_priv);
+ if (!indr_dev) {
+ mutex_unlock(&flow_indr_block_lock);
+ return -ENOMEM;
+ }
+
+ list_add(&indr_dev->list, &flow_block_indr_dev_list);
+ existing_qdiscs_register(cb, cb_priv);
+ mutex_unlock(&flow_indr_block_lock);
+
+ tcf_action_reoffload_cb(cb, cb_priv, true);
+
+ return 0;
+}
+EXPORT_SYMBOL(flow_indr_dev_register);
+
+static void __flow_block_indr_cleanup(void (*release)(void *cb_priv),
+ void *cb_priv,
+ struct list_head *cleanup_list)
+{
+ struct flow_block_cb *this, *next;
+
+ list_for_each_entry_safe(this, next, &flow_block_indr_list, indr.list) {
+ if (this->release == release &&
+ this->indr.cb_priv == cb_priv)
+ list_move(&this->indr.list, cleanup_list);
+ }
+}
+
+static void flow_block_indr_notify(struct list_head *cleanup_list)
+{
+ struct flow_block_cb *this, *next;
+
+ list_for_each_entry_safe(this, next, cleanup_list, indr.list) {
+ list_del(&this->indr.list);
+ this->indr.cleanup(this);
+ }
+}
+
+void flow_indr_dev_unregister(flow_indr_block_bind_cb_t *cb, void *cb_priv,
+ void (*release)(void *cb_priv))
+{
+ struct flow_indr_dev *this, *next, *indr_dev = NULL;
+ LIST_HEAD(cleanup_list);
+
+ mutex_lock(&flow_indr_block_lock);
+ list_for_each_entry_safe(this, next, &flow_block_indr_dev_list, list) {
+ if (this->cb == cb &&
+ this->cb_priv == cb_priv &&
+ refcount_dec_and_test(&this->refcnt)) {
+ indr_dev = this;
+ list_del(&indr_dev->list);
+ break;
+ }
+ }
+
+ if (!indr_dev) {
+ mutex_unlock(&flow_indr_block_lock);
+ return;
+ }
+
+ __flow_block_indr_cleanup(release, cb_priv, &cleanup_list);
+ mutex_unlock(&flow_indr_block_lock);
+
+ tcf_action_reoffload_cb(cb, cb_priv, false);
+ flow_block_indr_notify(&cleanup_list);
+ kfree(indr_dev);
+}
+EXPORT_SYMBOL(flow_indr_dev_unregister);
+
+static void flow_block_indr_init(struct flow_block_cb *flow_block,
+ struct flow_block_offload *bo,
+ struct net_device *dev, struct Qdisc *sch, void *data,
+ void *cb_priv,
+ void (*cleanup)(struct flow_block_cb *block_cb))
+{
+ flow_block->indr.binder_type = bo->binder_type;
+ flow_block->indr.data = data;
+ flow_block->indr.cb_priv = cb_priv;
+ flow_block->indr.dev = dev;
+ flow_block->indr.sch = sch;
+ flow_block->indr.cleanup = cleanup;
+}
+
+struct flow_block_cb *flow_indr_block_cb_alloc(flow_setup_cb_t *cb,
+ void *cb_ident, void *cb_priv,
+ void (*release)(void *cb_priv),
+ struct flow_block_offload *bo,
+ struct net_device *dev,
+ struct Qdisc *sch, void *data,
+ void *indr_cb_priv,
+ void (*cleanup)(struct flow_block_cb *block_cb))
+{
+ struct flow_block_cb *block_cb;
+
+ block_cb = flow_block_cb_alloc(cb, cb_ident, cb_priv, release);
+ if (IS_ERR(block_cb))
+ goto out;
+
+ flow_block_indr_init(block_cb, bo, dev, sch, data, indr_cb_priv, cleanup);
+ list_add(&block_cb->indr.list, &flow_block_indr_list);
+
+out:
+ return block_cb;
+}
+EXPORT_SYMBOL(flow_indr_block_cb_alloc);
+
+static struct flow_indir_dev_info *find_indir_dev(void *data)
+{
+ struct flow_indir_dev_info *cur;
+
+ list_for_each_entry(cur, &flow_indir_dev_list, list) {
+ if (cur->data == data)
+ return cur;
+ }
+ return NULL;
+}
+
+static int indir_dev_add(void *data, struct net_device *dev, struct Qdisc *sch,
+ enum tc_setup_type type, void (*cleanup)(struct flow_block_cb *block_cb),
+ struct flow_block_offload *bo)
+{
+ struct flow_indir_dev_info *info;
+
+ info = find_indir_dev(data);
+ if (info)
+ return -EEXIST;
+
+ info = kzalloc(sizeof(*info), GFP_KERNEL);
+ if (!info)
+ return -ENOMEM;
+
+ info->data = data;
+ info->dev = dev;
+ info->sch = sch;
+ info->type = type;
+ info->cleanup = cleanup;
+ info->command = bo->command;
+ info->binder_type = bo->binder_type;
+ info->cb_list = bo->cb_list_head;
+
+ list_add(&info->list, &flow_indir_dev_list);
+ return 0;
+}
+
+static int indir_dev_remove(void *data)
+{
+ struct flow_indir_dev_info *info;
+
+ info = find_indir_dev(data);
+ if (!info)
+ return -ENOENT;
+
+ list_del(&info->list);
+
+ kfree(info);
+ return 0;
+}
+
+int flow_indr_dev_setup_offload(struct net_device *dev, struct Qdisc *sch,
+ enum tc_setup_type type, void *data,
+ struct flow_block_offload *bo,
+ void (*cleanup)(struct flow_block_cb *block_cb))
+{
+ struct flow_indr_dev *this;
+ u32 count = 0;
+ int err;
+
+ mutex_lock(&flow_indr_block_lock);
+ if (bo) {
+ if (bo->command == FLOW_BLOCK_BIND)
+ indir_dev_add(data, dev, sch, type, cleanup, bo);
+ else if (bo->command == FLOW_BLOCK_UNBIND)
+ indir_dev_remove(data);
+ }
+
+ list_for_each_entry(this, &flow_block_indr_dev_list, list) {
+ err = this->cb(dev, sch, this->cb_priv, type, bo, data, cleanup);
+ if (!err)
+ count++;
+ }
+
+ mutex_unlock(&flow_indr_block_lock);
+
+ return (bo && list_empty(&bo->cb_list)) ? -EOPNOTSUPP : count;
+}
+EXPORT_SYMBOL(flow_indr_dev_setup_offload);
+
+bool flow_indr_dev_exists(void)
+{
+ return !list_empty(&flow_block_indr_dev_list);
+}
+EXPORT_SYMBOL(flow_indr_dev_exists);
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index 3cad026764f0..f112156db587 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -1,21 +1,17 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/gen_estimator.c Simple rate estimator.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ * Eric Dumazet <edumazet@google.com>
*
* Changes:
* Jamal Hadi Salim - moved it to net/core and reshulfed
* names to make it usable in general net subsystem.
*/
-#include <asm/uaccess.h>
-#include <asm/system.h>
-#include <asm/bitops.h>
+#include <linux/uaccess.h>
+#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
@@ -31,219 +27,254 @@
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/seqlock.h>
#include <net/sock.h>
#include <net/gen_stats.h>
-/*
- This code is NOT intended to be used for statistics collection,
- its purpose is to provide a base for statistical multiplexing
- for controlled load service.
- If you need only statistics, run a user level daemon which
- periodically reads byte counters.
-
- Unfortunately, rate estimation is not a very easy task.
- F.e. I did not find a simple way to estimate the current peak rate
- and even failed to formulate the problem 8)8)
-
- So I preferred not to built an estimator into the scheduler,
- but run this task separately.
- Ideally, it should be kernel thread(s), but for now it runs
- from timers, which puts apparent top bounds on the number of rated
- flows, has minimal overhead on small, but is enough
- to handle controlled load service, sets of aggregates.
-
- We measure rate over A=(1<<interval) seconds and evaluate EWMA:
-
- avrate = avrate*(1-W) + rate*W
+/* This code is NOT intended to be used for statistics collection,
+ * its purpose is to provide a base for statistical multiplexing
+ * for controlled load service.
+ * If you need only statistics, run a user level daemon which
+ * periodically reads byte counters.
+ */
- where W is chosen as negative power of 2: W = 2^(-ewma_log)
+struct net_rate_estimator {
+ struct gnet_stats_basic_sync *bstats;
+ spinlock_t *stats_lock;
+ bool running;
+ struct gnet_stats_basic_sync __percpu *cpu_bstats;
+ u8 ewma_log;
+ u8 intvl_log; /* period : (250ms << intvl_log) */
- The resulting time constant is:
+ seqcount_t seq;
+ u64 last_packets;
+ u64 last_bytes;
- T = A/(-ln(1-W))
+ u64 avpps;
+ u64 avbps;
+ unsigned long next_jiffies;
+ struct timer_list timer;
+ struct rcu_head rcu;
+};
- NOTES.
+static void est_fetch_counters(struct net_rate_estimator *e,
+ struct gnet_stats_basic_sync *b)
+{
+ gnet_stats_basic_sync_init(b);
+ if (e->stats_lock)
+ spin_lock(e->stats_lock);
- * The stored value for avbps is scaled by 2^5, so that maximal
- rate is ~1Gbit, avpps is scaled by 2^10.
+ gnet_stats_add_basic(b, e->cpu_bstats, e->bstats, e->running);
- * Minimal interval is HZ/4=250msec (it is the greatest common divisor
- for HZ=100 and HZ=1024 8)), maximal interval
- is (HZ*2^EST_MAX_INTERVAL)/4 = 8sec. Shorter intervals
- are too expensive, longer ones can be implemented
- at user level painlessly.
- */
+ if (e->stats_lock)
+ spin_unlock(e->stats_lock);
-#define EST_MAX_INTERVAL 5
+}
-struct gen_estimator
+static void est_timer(struct timer_list *t)
{
- struct gen_estimator *next;
- struct gnet_stats_basic *bstats;
- struct gnet_stats_rate_est *rate_est;
- spinlock_t *stats_lock;
- unsigned interval;
- int ewma_log;
- u64 last_bytes;
- u32 last_packets;
- u32 avpps;
- u32 avbps;
-};
+ struct net_rate_estimator *est = timer_container_of(est, t, timer);
+ struct gnet_stats_basic_sync b;
+ u64 b_bytes, b_packets;
+ u64 rate, brate;
-struct gen_estimator_head
-{
- struct timer_list timer;
- struct gen_estimator *list;
-};
+ est_fetch_counters(est, &b);
+ b_bytes = u64_stats_read(&b.bytes);
+ b_packets = u64_stats_read(&b.packets);
-static struct gen_estimator_head elist[EST_MAX_INTERVAL+1];
+ brate = (b_bytes - est->last_bytes) << (10 - est->intvl_log);
+ brate = (brate >> est->ewma_log) - (est->avbps >> est->ewma_log);
-/* Estimator array lock */
-static DEFINE_RWLOCK(est_lock);
+ rate = (b_packets - est->last_packets) << (10 - est->intvl_log);
+ rate = (rate >> est->ewma_log) - (est->avpps >> est->ewma_log);
-static void est_timer(unsigned long arg)
-{
- int idx = (int)arg;
- struct gen_estimator *e;
+ preempt_disable_nested();
+ write_seqcount_begin(&est->seq);
+ est->avbps += brate;
+ est->avpps += rate;
+ write_seqcount_end(&est->seq);
+ preempt_enable_nested();
- read_lock(&est_lock);
- for (e = elist[idx].list; e; e = e->next) {
- u64 nbytes;
- u32 npackets;
- u32 rate;
+ est->last_bytes = b_bytes;
+ est->last_packets = b_packets;
- spin_lock(e->stats_lock);
- nbytes = e->bstats->bytes;
- npackets = e->bstats->packets;
- rate = (nbytes - e->last_bytes)<<(7 - idx);
- e->last_bytes = nbytes;
- e->avbps += ((long)rate - (long)e->avbps) >> e->ewma_log;
- e->rate_est->bps = (e->avbps+0xF)>>5;
-
- rate = (npackets - e->last_packets)<<(12 - idx);
- e->last_packets = npackets;
- e->avpps += ((long)rate - (long)e->avpps) >> e->ewma_log;
- e->rate_est->pps = (e->avpps+0x1FF)>>10;
- spin_unlock(e->stats_lock);
- }
+ est->next_jiffies += ((HZ/4) << est->intvl_log);
- mod_timer(&elist[idx].timer, jiffies + ((HZ<<idx)/4));
- read_unlock(&est_lock);
+ if (unlikely(time_after_eq(jiffies, est->next_jiffies))) {
+ /* Ouch... timer was delayed. */
+ est->next_jiffies = jiffies + 1;
+ }
+ mod_timer(&est->timer, est->next_jiffies);
}
/**
* gen_new_estimator - create a new rate estimator
* @bstats: basic statistics
+ * @cpu_bstats: bstats per cpu
* @rate_est: rate estimator statistics
- * @stats_lock: statistics lock
+ * @lock: lock for statistics and control path
+ * @running: true if @bstats represents a running qdisc, thus @bstats'
+ * internal values might change during basic reads. Only used
+ * if @bstats_cpu is NULL
* @opt: rate estimator configuration TLV
*
* Creates a new rate estimator with &bstats as source and &rate_est
* as destination. A new timer with the interval specified in the
* configuration TLV is created. Upon each interval, the latest statistics
* will be read from &bstats and the estimated rate will be stored in
- * &rate_est with the statistics lock grabed during this period.
- *
+ * &rate_est with the statistics lock grabbed during this period.
+ *
* Returns 0 on success or a negative error code.
+ *
*/
-int gen_new_estimator(struct gnet_stats_basic *bstats,
- struct gnet_stats_rate_est *rate_est, spinlock_t *stats_lock, struct rtattr *opt)
+int gen_new_estimator(struct gnet_stats_basic_sync *bstats,
+ struct gnet_stats_basic_sync __percpu *cpu_bstats,
+ struct net_rate_estimator __rcu **rate_est,
+ spinlock_t *lock,
+ bool running,
+ struct nlattr *opt)
{
- struct gen_estimator *est;
- struct gnet_estimator *parm = RTA_DATA(opt);
+ struct gnet_estimator *parm = nla_data(opt);
+ struct net_rate_estimator *old, *est;
+ struct gnet_stats_basic_sync b;
+ int intvl_log;
- if (RTA_PAYLOAD(opt) < sizeof(*parm))
+ if (nla_len(opt) < sizeof(*parm))
return -EINVAL;
+ /* allowed timer periods are :
+ * -2 : 250ms, -1 : 500ms, 0 : 1 sec
+ * 1 : 2 sec, 2 : 4 sec, 3 : 8 sec
+ */
if (parm->interval < -2 || parm->interval > 3)
return -EINVAL;
+ if (parm->ewma_log == 0 || parm->ewma_log >= 31)
+ return -EINVAL;
+
est = kzalloc(sizeof(*est), GFP_KERNEL);
- if (est == NULL)
+ if (!est)
return -ENOBUFS;
- est->interval = parm->interval + 2;
+ seqcount_init(&est->seq);
+ intvl_log = parm->interval + 2;
est->bstats = bstats;
- est->rate_est = rate_est;
- est->stats_lock = stats_lock;
+ est->stats_lock = lock;
+ est->running = running;
est->ewma_log = parm->ewma_log;
- est->last_bytes = bstats->bytes;
- est->avbps = rate_est->bps<<5;
- est->last_packets = bstats->packets;
- est->avpps = rate_est->pps<<10;
-
- est->next = elist[est->interval].list;
- if (est->next == NULL) {
- init_timer(&elist[est->interval].timer);
- elist[est->interval].timer.data = est->interval;
- elist[est->interval].timer.expires = jiffies + ((HZ<<est->interval)/4);
- elist[est->interval].timer.function = est_timer;
- add_timer(&elist[est->interval].timer);
+ est->intvl_log = intvl_log;
+ est->cpu_bstats = cpu_bstats;
+
+ if (lock)
+ local_bh_disable();
+ est_fetch_counters(est, &b);
+ if (lock)
+ local_bh_enable();
+ est->last_bytes = u64_stats_read(&b.bytes);
+ est->last_packets = u64_stats_read(&b.packets);
+
+ if (lock)
+ spin_lock_bh(lock);
+ old = rcu_dereference_protected(*rate_est, 1);
+ if (old) {
+ timer_delete_sync(&old->timer);
+ est->avbps = old->avbps;
+ est->avpps = old->avpps;
}
- write_lock_bh(&est_lock);
- elist[est->interval].list = est;
- write_unlock_bh(&est_lock);
+
+ est->next_jiffies = jiffies + ((HZ/4) << intvl_log);
+ timer_setup(&est->timer, est_timer, 0);
+ mod_timer(&est->timer, est->next_jiffies);
+
+ rcu_assign_pointer(*rate_est, est);
+ if (lock)
+ spin_unlock_bh(lock);
+ if (old)
+ kfree_rcu(old, rcu);
return 0;
}
+EXPORT_SYMBOL(gen_new_estimator);
/**
* gen_kill_estimator - remove a rate estimator
- * @bstats: basic statistics
- * @rate_est: rate estimator statistics
+ * @rate_est: rate estimator
+ *
+ * Removes the rate estimator.
*
- * Removes the rate estimator specified by &bstats and &rate_est
- * and deletes the timer.
*/
-void gen_kill_estimator(struct gnet_stats_basic *bstats,
- struct gnet_stats_rate_est *rate_est)
+void gen_kill_estimator(struct net_rate_estimator __rcu **rate_est)
{
- int idx;
- struct gen_estimator *est, **pest;
-
- for (idx=0; idx <= EST_MAX_INTERVAL; idx++) {
- int killed = 0;
- pest = &elist[idx].list;
- while ((est=*pest) != NULL) {
- if (est->rate_est != rate_est || est->bstats != bstats) {
- pest = &est->next;
- continue;
- }
-
- write_lock_bh(&est_lock);
- *pest = est->next;
- write_unlock_bh(&est_lock);
-
- kfree(est);
- killed++;
- }
- if (killed && elist[idx].list == NULL)
- del_timer(&elist[idx].timer);
+ struct net_rate_estimator *est;
+
+ est = unrcu_pointer(xchg(rate_est, NULL));
+ if (est) {
+ timer_shutdown_sync(&est->timer);
+ kfree_rcu(est, rcu);
}
}
+EXPORT_SYMBOL(gen_kill_estimator);
/**
- * gen_replace_estimator - replace rate estimator configruation
+ * gen_replace_estimator - replace rate estimator configuration
* @bstats: basic statistics
+ * @cpu_bstats: bstats per cpu
* @rate_est: rate estimator statistics
- * @stats_lock: statistics lock
+ * @lock: lock for statistics and control path
+ * @running: true if @bstats represents a running qdisc, thus @bstats'
+ * internal values might change during basic reads. Only used
+ * if @cpu_bstats is NULL
* @opt: rate estimator configuration TLV
*
* Replaces the configuration of a rate estimator by calling
* gen_kill_estimator() and gen_new_estimator().
- *
+ *
* Returns 0 on success or a negative error code.
*/
-int
-gen_replace_estimator(struct gnet_stats_basic *bstats,
- struct gnet_stats_rate_est *rate_est, spinlock_t *stats_lock,
- struct rtattr *opt)
+int gen_replace_estimator(struct gnet_stats_basic_sync *bstats,
+ struct gnet_stats_basic_sync __percpu *cpu_bstats,
+ struct net_rate_estimator __rcu **rate_est,
+ spinlock_t *lock,
+ bool running, struct nlattr *opt)
{
- gen_kill_estimator(bstats, rate_est);
- return gen_new_estimator(bstats, rate_est, stats_lock, opt);
+ return gen_new_estimator(bstats, cpu_bstats, rate_est,
+ lock, running, opt);
}
-
-
-EXPORT_SYMBOL(gen_kill_estimator);
-EXPORT_SYMBOL(gen_new_estimator);
EXPORT_SYMBOL(gen_replace_estimator);
+
+/**
+ * gen_estimator_active - test if estimator is currently in use
+ * @rate_est: rate estimator
+ *
+ * Returns true if estimator is active, and false if not.
+ */
+bool gen_estimator_active(struct net_rate_estimator __rcu **rate_est)
+{
+ return !!rcu_access_pointer(*rate_est);
+}
+EXPORT_SYMBOL(gen_estimator_active);
+
+bool gen_estimator_read(struct net_rate_estimator __rcu **rate_est,
+ struct gnet_stats_rate_est64 *sample)
+{
+ struct net_rate_estimator *est;
+ unsigned seq;
+
+ rcu_read_lock();
+ est = rcu_dereference(*rate_est);
+ if (!est) {
+ rcu_read_unlock();
+ return false;
+ }
+
+ do {
+ seq = read_seqcount_begin(&est->seq);
+ sample->bps = est->avbps >> 8;
+ sample->pps = est->avpps >> 8;
+ } while (read_seqcount_retry(&est->seq, seq));
+
+ rcu_read_unlock();
+ return true;
+}
+EXPORT_SYMBOL(gen_estimator_read);
diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
index 8f21490355fa..b71ccaec0991 100644
--- a/net/core/gen_stats.c
+++ b/net/core/gen_stats.c
@@ -1,16 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/core/gen_stats.c
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Thomas Graf <tgraf@suug.ch>
* Jamal Hadi Salim
* Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
- * See Documentation/networking/gen_stats.txt
+ * See Documentation/networking/gen_stats.rst
*/
#include <linux/types.h>
@@ -20,17 +16,23 @@
#include <linux/socket.h>
#include <linux/rtnetlink.h>
#include <linux/gen_stats.h>
+#include <net/netlink.h>
#include <net/gen_stats.h>
-
+#include <net/sch_generic.h>
static inline int
-gnet_stats_copy(struct gnet_dump *d, int type, void *buf, int size)
+gnet_stats_copy(struct gnet_dump *d, int type, void *buf, int size, int padattr)
{
- RTA_PUT(d->skb, type, size, buf);
+ if (nla_put_64bit(d->skb, type, size, buf, padattr))
+ goto nla_put_failure;
return 0;
-rtattr_failure:
- spin_unlock_bh(d->lock);
+nla_put_failure:
+ if (d->lock)
+ spin_unlock_bh(d->lock);
+ kfree(d->xstats);
+ d->xstats = NULL;
+ d->xstats_len = 0;
return -1;
}
@@ -42,6 +44,7 @@ rtattr_failure:
* @xstats_type: TLV type for backward compatibility xstats TLV
* @lock: statistics lock
* @d: dumping handle
+ * @padattr: padding attribute
*
* Initializes the dumping handle, grabs the statistic lock and appends
* an empty TLV header to the socket buffer for use a container for all
@@ -54,30 +57,48 @@ rtattr_failure:
*/
int
gnet_stats_start_copy_compat(struct sk_buff *skb, int type, int tc_stats_type,
- int xstats_type, spinlock_t *lock, struct gnet_dump *d)
+ int xstats_type, spinlock_t *lock,
+ struct gnet_dump *d, int padattr)
+ __acquires(lock)
{
memset(d, 0, sizeof(*d));
-
- spin_lock_bh(lock);
- d->lock = lock;
+
if (type)
- d->tail = (struct rtattr *) skb->tail;
+ d->tail = (struct nlattr *)skb_tail_pointer(skb);
d->skb = skb;
d->compat_tc_stats = tc_stats_type;
d->compat_xstats = xstats_type;
+ d->padattr = padattr;
+ if (lock) {
+ d->lock = lock;
+ spin_lock_bh(lock);
+ }
+ if (d->tail) {
+ int ret = gnet_stats_copy(d, type, NULL, 0, padattr);
- if (d->tail)
- return gnet_stats_copy(d, type, NULL, 0);
+ /* The initial attribute added in gnet_stats_copy() may be
+ * preceded by a padding attribute, in which case d->tail will
+ * end up pointing at the padding instead of the real attribute.
+ * Fix this so gnet_stats_finish_copy() adjusts the length of
+ * the right attribute.
+ */
+ if (ret == 0 && d->tail->nla_type == padattr)
+ d->tail = (struct nlattr *)((char *)d->tail +
+ NLA_ALIGN(d->tail->nla_len));
+ return ret;
+ }
return 0;
}
+EXPORT_SYMBOL(gnet_stats_start_copy_compat);
/**
- * gnet_stats_start_copy_compat - start dumping procedure in compatibility mode
+ * gnet_stats_start_copy - start dumping procedure in compatibility mode
* @skb: socket buffer to put statistics TLVs into
* @type: TLV type for top level statistic TLV
* @lock: statistics lock
* @d: dumping handle
+ * @padattr: padding attribute
*
* Initializes the dumping handle, grabs the statistic lock and appends
* an empty TLV header to the socket buffer for use a container for all
@@ -87,15 +108,147 @@ gnet_stats_start_copy_compat(struct sk_buff *skb, int type, int tc_stats_type,
*/
int
gnet_stats_start_copy(struct sk_buff *skb, int type, spinlock_t *lock,
- struct gnet_dump *d)
+ struct gnet_dump *d, int padattr)
+{
+ return gnet_stats_start_copy_compat(skb, type, 0, 0, lock, d, padattr);
+}
+EXPORT_SYMBOL(gnet_stats_start_copy);
+
+/* Must not be inlined, due to u64_stats seqcount_t lockdep key */
+void gnet_stats_basic_sync_init(struct gnet_stats_basic_sync *b)
+{
+ u64_stats_set(&b->bytes, 0);
+ u64_stats_set(&b->packets, 0);
+ u64_stats_init(&b->syncp);
+}
+EXPORT_SYMBOL(gnet_stats_basic_sync_init);
+
+static void gnet_stats_add_basic_cpu(struct gnet_stats_basic_sync *bstats,
+ struct gnet_stats_basic_sync __percpu *cpu)
+{
+ u64 t_bytes = 0, t_packets = 0;
+ int i;
+
+ for_each_possible_cpu(i) {
+ struct gnet_stats_basic_sync *bcpu = per_cpu_ptr(cpu, i);
+ unsigned int start;
+ u64 bytes, packets;
+
+ do {
+ start = u64_stats_fetch_begin(&bcpu->syncp);
+ bytes = u64_stats_read(&bcpu->bytes);
+ packets = u64_stats_read(&bcpu->packets);
+ } while (u64_stats_fetch_retry(&bcpu->syncp, start));
+
+ t_bytes += bytes;
+ t_packets += packets;
+ }
+ _bstats_update(bstats, t_bytes, t_packets);
+}
+
+void gnet_stats_add_basic(struct gnet_stats_basic_sync *bstats,
+ struct gnet_stats_basic_sync __percpu *cpu,
+ struct gnet_stats_basic_sync *b, bool running)
+{
+ unsigned int start;
+ u64 bytes = 0;
+ u64 packets = 0;
+
+ WARN_ON_ONCE((cpu || running) && in_hardirq());
+
+ if (cpu) {
+ gnet_stats_add_basic_cpu(bstats, cpu);
+ return;
+ }
+ do {
+ if (running)
+ start = u64_stats_fetch_begin(&b->syncp);
+ bytes = u64_stats_read(&b->bytes);
+ packets = u64_stats_read(&b->packets);
+ } while (running && u64_stats_fetch_retry(&b->syncp, start));
+
+ _bstats_update(bstats, bytes, packets);
+}
+EXPORT_SYMBOL(gnet_stats_add_basic);
+
+static void gnet_stats_read_basic(u64 *ret_bytes, u64 *ret_packets,
+ struct gnet_stats_basic_sync __percpu *cpu,
+ struct gnet_stats_basic_sync *b, bool running)
+{
+ unsigned int start;
+
+ if (cpu) {
+ u64 t_bytes = 0, t_packets = 0;
+ int i;
+
+ for_each_possible_cpu(i) {
+ struct gnet_stats_basic_sync *bcpu = per_cpu_ptr(cpu, i);
+ unsigned int start;
+ u64 bytes, packets;
+
+ do {
+ start = u64_stats_fetch_begin(&bcpu->syncp);
+ bytes = u64_stats_read(&bcpu->bytes);
+ packets = u64_stats_read(&bcpu->packets);
+ } while (u64_stats_fetch_retry(&bcpu->syncp, start));
+
+ t_bytes += bytes;
+ t_packets += packets;
+ }
+ *ret_bytes = t_bytes;
+ *ret_packets = t_packets;
+ return;
+ }
+ do {
+ if (running)
+ start = u64_stats_fetch_begin(&b->syncp);
+ *ret_bytes = u64_stats_read(&b->bytes);
+ *ret_packets = u64_stats_read(&b->packets);
+ } while (running && u64_stats_fetch_retry(&b->syncp, start));
+}
+
+static int
+___gnet_stats_copy_basic(struct gnet_dump *d,
+ struct gnet_stats_basic_sync __percpu *cpu,
+ struct gnet_stats_basic_sync *b,
+ int type, bool running)
{
- return gnet_stats_start_copy_compat(skb, type, 0, 0, lock, d);
+ u64 bstats_bytes, bstats_packets;
+
+ gnet_stats_read_basic(&bstats_bytes, &bstats_packets, cpu, b, running);
+
+ if (d->compat_tc_stats && type == TCA_STATS_BASIC) {
+ d->tc_stats.bytes = bstats_bytes;
+ d->tc_stats.packets = bstats_packets;
+ }
+
+ if (d->tail) {
+ struct gnet_stats_basic sb;
+ int res;
+
+ memset(&sb, 0, sizeof(sb));
+ sb.bytes = bstats_bytes;
+ sb.packets = bstats_packets;
+ res = gnet_stats_copy(d, type, &sb, sizeof(sb), TCA_STATS_PAD);
+ if (res < 0 || sb.packets == bstats_packets)
+ return res;
+ /* emit 64bit stats only if needed */
+ return gnet_stats_copy(d, TCA_STATS_PKT64, &bstats_packets,
+ sizeof(bstats_packets), TCA_STATS_PAD);
+ }
+ return 0;
}
/**
* gnet_stats_copy_basic - copy basic statistics into statistic TLV
* @d: dumping handle
+ * @cpu: copy statistic per cpu
* @b: basic statistics
+ * @running: true if @b represents a running qdisc, thus @b's
+ * internal values might change during basic reads.
+ * Only used if @cpu is NULL
+ *
+ * Context: task; must not be run from IRQ or BH contexts
*
* Appends the basic statistics to the top level TLV created by
* gnet_stats_start_copy().
@@ -104,23 +257,46 @@ gnet_stats_start_copy(struct sk_buff *skb, int type, spinlock_t *lock,
* if the room in the socket buffer was not sufficient.
*/
int
-gnet_stats_copy_basic(struct gnet_dump *d, struct gnet_stats_basic *b)
+gnet_stats_copy_basic(struct gnet_dump *d,
+ struct gnet_stats_basic_sync __percpu *cpu,
+ struct gnet_stats_basic_sync *b,
+ bool running)
{
- if (d->compat_tc_stats) {
- d->tc_stats.bytes = b->bytes;
- d->tc_stats.packets = b->packets;
- }
-
- if (d->tail)
- return gnet_stats_copy(d, TCA_STATS_BASIC, b, sizeof(*b));
+ return ___gnet_stats_copy_basic(d, cpu, b, TCA_STATS_BASIC, running);
+}
+EXPORT_SYMBOL(gnet_stats_copy_basic);
- return 0;
+/**
+ * gnet_stats_copy_basic_hw - copy basic hw statistics into statistic TLV
+ * @d: dumping handle
+ * @cpu: copy statistic per cpu
+ * @b: basic statistics
+ * @running: true if @b represents a running qdisc, thus @b's
+ * internal values might change during basic reads.
+ * Only used if @cpu is NULL
+ *
+ * Context: task; must not be run from IRQ or BH contexts
+ *
+ * Appends the basic statistics to the top level TLV created by
+ * gnet_stats_start_copy().
+ *
+ * Returns 0 on success or -1 with the statistic lock released
+ * if the room in the socket buffer was not sufficient.
+ */
+int
+gnet_stats_copy_basic_hw(struct gnet_dump *d,
+ struct gnet_stats_basic_sync __percpu *cpu,
+ struct gnet_stats_basic_sync *b,
+ bool running)
+{
+ return ___gnet_stats_copy_basic(d, cpu, b, TCA_STATS_BASIC_HW, running);
}
+EXPORT_SYMBOL(gnet_stats_copy_basic_hw);
/**
* gnet_stats_copy_rate_est - copy rate estimator statistics into statistics TLV
* @d: dumping handle
- * @r: rate estimator statistics
+ * @rate_est: rate estimator
*
* Appends the rate estimator statistics to the top level TLV created by
* gnet_stats_start_copy().
@@ -129,45 +305,109 @@ gnet_stats_copy_basic(struct gnet_dump *d, struct gnet_stats_basic *b)
* if the room in the socket buffer was not sufficient.
*/
int
-gnet_stats_copy_rate_est(struct gnet_dump *d, struct gnet_stats_rate_est *r)
+gnet_stats_copy_rate_est(struct gnet_dump *d,
+ struct net_rate_estimator __rcu **rate_est)
{
+ struct gnet_stats_rate_est64 sample;
+ struct gnet_stats_rate_est est;
+ int res;
+
+ if (!gen_estimator_read(rate_est, &sample))
+ return 0;
+ est.bps = min_t(u64, UINT_MAX, sample.bps);
+ /* we have some time before reaching 2^32 packets per second */
+ est.pps = sample.pps;
+
if (d->compat_tc_stats) {
- d->tc_stats.bps = r->bps;
- d->tc_stats.pps = r->pps;
+ d->tc_stats.bps = est.bps;
+ d->tc_stats.pps = est.pps;
}
- if (d->tail)
- return gnet_stats_copy(d, TCA_STATS_RATE_EST, r, sizeof(*r));
+ if (d->tail) {
+ res = gnet_stats_copy(d, TCA_STATS_RATE_EST, &est, sizeof(est),
+ TCA_STATS_PAD);
+ if (res < 0 || est.bps == sample.bps)
+ return res;
+ /* emit 64bit stats only if needed */
+ return gnet_stats_copy(d, TCA_STATS_RATE_EST64, &sample,
+ sizeof(sample), TCA_STATS_PAD);
+ }
return 0;
}
+EXPORT_SYMBOL(gnet_stats_copy_rate_est);
+
+static void gnet_stats_add_queue_cpu(struct gnet_stats_queue *qstats,
+ const struct gnet_stats_queue __percpu *q)
+{
+ int i;
+
+ for_each_possible_cpu(i) {
+ const struct gnet_stats_queue *qcpu = per_cpu_ptr(q, i);
+
+ qstats->qlen += qcpu->qlen;
+ qstats->backlog += qcpu->backlog;
+ qstats->drops += qcpu->drops;
+ qstats->requeues += qcpu->requeues;
+ qstats->overlimits += qcpu->overlimits;
+ }
+}
+
+void gnet_stats_add_queue(struct gnet_stats_queue *qstats,
+ const struct gnet_stats_queue __percpu *cpu,
+ const struct gnet_stats_queue *q)
+{
+ if (cpu) {
+ gnet_stats_add_queue_cpu(qstats, cpu);
+ } else {
+ qstats->qlen += q->qlen;
+ qstats->backlog += q->backlog;
+ qstats->drops += q->drops;
+ qstats->requeues += q->requeues;
+ qstats->overlimits += q->overlimits;
+ }
+}
+EXPORT_SYMBOL(gnet_stats_add_queue);
/**
* gnet_stats_copy_queue - copy queue statistics into statistics TLV
* @d: dumping handle
+ * @cpu_q: per cpu queue statistics
* @q: queue statistics
+ * @qlen: queue length statistics
*
* Appends the queue statistics to the top level TLV created by
- * gnet_stats_start_copy().
+ * gnet_stats_start_copy(). Using per cpu queue statistics if
+ * they are available.
*
* Returns 0 on success or -1 with the statistic lock released
* if the room in the socket buffer was not sufficient.
*/
int
-gnet_stats_copy_queue(struct gnet_dump *d, struct gnet_stats_queue *q)
+gnet_stats_copy_queue(struct gnet_dump *d,
+ struct gnet_stats_queue __percpu *cpu_q,
+ struct gnet_stats_queue *q, __u32 qlen)
{
+ struct gnet_stats_queue qstats = {0};
+
+ gnet_stats_add_queue(&qstats, cpu_q, q);
+ qstats.qlen = qlen;
+
if (d->compat_tc_stats) {
- d->tc_stats.drops = q->drops;
- d->tc_stats.qlen = q->qlen;
- d->tc_stats.backlog = q->backlog;
- d->tc_stats.overlimits = q->overlimits;
+ d->tc_stats.drops = qstats.drops;
+ d->tc_stats.qlen = qstats.qlen;
+ d->tc_stats.backlog = qstats.backlog;
+ d->tc_stats.overlimits = qstats.overlimits;
}
if (d->tail)
- return gnet_stats_copy(d, TCA_STATS_QUEUE, q, sizeof(*q));
+ return gnet_stats_copy(d, TCA_STATS_QUEUE,
+ &qstats, sizeof(qstats),
+ TCA_STATS_PAD);
return 0;
}
+EXPORT_SYMBOL(gnet_stats_copy_queue);
/**
* gnet_stats_copy_app - copy application specific statistics into statistics TLV
@@ -175,7 +415,7 @@ gnet_stats_copy_queue(struct gnet_dump *d, struct gnet_stats_queue *q)
* @st: application specific statistics data
* @len: length of data
*
- * Appends the application sepecific statistics to the top level TLV created by
+ * Appends the application specific statistics to the top level TLV created by
* gnet_stats_start_copy() and remembers the data for XSTATS if the dumping
* handle is in backward compatibility mode.
*
@@ -186,15 +426,25 @@ int
gnet_stats_copy_app(struct gnet_dump *d, void *st, int len)
{
if (d->compat_xstats) {
- d->xstats = st;
+ d->xstats = kmemdup(st, len, GFP_ATOMIC);
+ if (!d->xstats)
+ goto err_out;
d->xstats_len = len;
}
if (d->tail)
- return gnet_stats_copy(d, TCA_STATS_APP, st, len);
+ return gnet_stats_copy(d, TCA_STATS_APP, st, len,
+ TCA_STATS_PAD);
return 0;
+
+err_out:
+ if (d->lock)
+ spin_unlock_bh(d->lock);
+ d->xstats_len = 0;
+ return -1;
}
+EXPORT_SYMBOL(gnet_stats_copy_app);
/**
* gnet_stats_finish_copy - finish dumping procedure
@@ -212,28 +462,24 @@ int
gnet_stats_finish_copy(struct gnet_dump *d)
{
if (d->tail)
- d->tail->rta_len = d->skb->tail - (u8 *) d->tail;
+ d->tail->nla_len = skb_tail_pointer(d->skb) - (u8 *)d->tail;
if (d->compat_tc_stats)
if (gnet_stats_copy(d, d->compat_tc_stats, &d->tc_stats,
- sizeof(d->tc_stats)) < 0)
+ sizeof(d->tc_stats), d->padattr) < 0)
return -1;
if (d->compat_xstats && d->xstats) {
if (gnet_stats_copy(d, d->compat_xstats, d->xstats,
- d->xstats_len) < 0)
+ d->xstats_len, d->padattr) < 0)
return -1;
}
- spin_unlock_bh(d->lock);
+ if (d->lock)
+ spin_unlock_bh(d->lock);
+ kfree(d->xstats);
+ d->xstats = NULL;
+ d->xstats_len = 0;
return 0;
}
-
-
-EXPORT_SYMBOL(gnet_stats_start_copy);
-EXPORT_SYMBOL(gnet_stats_start_copy_compat);
-EXPORT_SYMBOL(gnet_stats_copy_basic);
-EXPORT_SYMBOL(gnet_stats_copy_rate_est);
-EXPORT_SYMBOL(gnet_stats_copy_queue);
-EXPORT_SYMBOL(gnet_stats_copy_app);
EXPORT_SYMBOL(gnet_stats_finish_copy);
diff --git a/net/core/gro.c b/net/core/gro.c
new file mode 100644
index 000000000000..76f9c3712422
--- /dev/null
+++ b/net/core/gro.c
@@ -0,0 +1,835 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+#include <net/psp.h>
+#include <net/gro.h>
+#include <net/dst_metadata.h>
+#include <net/busy_poll.h>
+#include <trace/events/net.h>
+#include <linux/skbuff_ref.h>
+
+#define MAX_GRO_SKBS 8
+
+static DEFINE_SPINLOCK(offload_lock);
+
+/**
+ * dev_add_offload - register offload handlers
+ * @po: protocol offload declaration
+ *
+ * Add protocol offload handlers to the networking stack. The passed
+ * &proto_offload is linked into kernel lists and may not be freed until
+ * it has been removed from the kernel lists.
+ *
+ * This call does not sleep therefore it can not
+ * guarantee all CPU's that are in middle of receiving packets
+ * will see the new offload handlers (until the next received packet).
+ */
+void dev_add_offload(struct packet_offload *po)
+{
+ struct packet_offload *elem;
+
+ spin_lock(&offload_lock);
+ list_for_each_entry(elem, &net_hotdata.offload_base, list) {
+ if (po->priority < elem->priority)
+ break;
+ }
+ list_add_rcu(&po->list, elem->list.prev);
+ spin_unlock(&offload_lock);
+}
+EXPORT_SYMBOL(dev_add_offload);
+
+/**
+ * __dev_remove_offload - remove offload handler
+ * @po: packet offload declaration
+ *
+ * Remove a protocol offload handler that was previously added to the
+ * kernel offload handlers by dev_add_offload(). The passed &offload_type
+ * is removed from the kernel lists and can be freed or reused once this
+ * function returns.
+ *
+ * The packet type might still be in use by receivers
+ * and must not be freed until after all the CPU's have gone
+ * through a quiescent state.
+ */
+static void __dev_remove_offload(struct packet_offload *po)
+{
+ struct list_head *head = &net_hotdata.offload_base;
+ struct packet_offload *po1;
+
+ spin_lock(&offload_lock);
+
+ list_for_each_entry(po1, head, list) {
+ if (po == po1) {
+ list_del_rcu(&po->list);
+ goto out;
+ }
+ }
+
+ pr_warn("dev_remove_offload: %p not found\n", po);
+out:
+ spin_unlock(&offload_lock);
+}
+
+/**
+ * dev_remove_offload - remove packet offload handler
+ * @po: packet offload declaration
+ *
+ * Remove a packet offload handler that was previously added to the kernel
+ * offload handlers by dev_add_offload(). The passed &offload_type is
+ * removed from the kernel lists and can be freed or reused once this
+ * function returns.
+ *
+ * This call sleeps to guarantee that no CPU is looking at the packet
+ * type after return.
+ */
+void dev_remove_offload(struct packet_offload *po)
+{
+ __dev_remove_offload(po);
+
+ synchronize_net();
+}
+EXPORT_SYMBOL(dev_remove_offload);
+
+
+int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
+{
+ struct skb_shared_info *pinfo, *skbinfo = skb_shinfo(skb);
+ unsigned int offset = skb_gro_offset(skb);
+ unsigned int headlen = skb_headlen(skb);
+ unsigned int len = skb_gro_len(skb);
+ unsigned int delta_truesize;
+ unsigned int new_truesize;
+ struct sk_buff *lp;
+ int segs;
+
+ /* Do not splice page pool based packets w/ non-page pool
+ * packets. This can result in reference count issues as page
+ * pool pages will not decrement the reference count and will
+ * instead be immediately returned to the pool or have frag
+ * count decremented.
+ */
+ if (p->pp_recycle != skb->pp_recycle)
+ return -ETOOMANYREFS;
+
+ if (unlikely(p->len + len >= netif_get_gro_max_size(p->dev, p) ||
+ NAPI_GRO_CB(skb)->flush))
+ return -E2BIG;
+
+ if (unlikely(p->len + len >= GRO_LEGACY_MAX_SIZE)) {
+ if (NAPI_GRO_CB(skb)->proto != IPPROTO_TCP ||
+ (p->protocol == htons(ETH_P_IPV6) &&
+ skb_headroom(p) < sizeof(struct hop_jumbo_hdr)) ||
+ p->encapsulation)
+ return -E2BIG;
+ }
+
+ segs = NAPI_GRO_CB(skb)->count;
+ lp = NAPI_GRO_CB(p)->last;
+ pinfo = skb_shinfo(lp);
+
+ if (headlen <= offset) {
+ skb_frag_t *frag;
+ skb_frag_t *frag2;
+ int i = skbinfo->nr_frags;
+ int nr_frags = pinfo->nr_frags + i;
+
+ if (nr_frags > MAX_SKB_FRAGS)
+ goto merge;
+
+ offset -= headlen;
+ pinfo->nr_frags = nr_frags;
+ skbinfo->nr_frags = 0;
+
+ frag = pinfo->frags + nr_frags;
+ frag2 = skbinfo->frags + i;
+ do {
+ *--frag = *--frag2;
+ } while (--i);
+
+ skb_frag_off_add(frag, offset);
+ skb_frag_size_sub(frag, offset);
+
+ /* all fragments truesize : remove (head size + sk_buff) */
+ new_truesize = SKB_TRUESIZE(skb_end_offset(skb));
+ delta_truesize = skb->truesize - new_truesize;
+
+ skb->truesize = new_truesize;
+ skb->len -= skb->data_len;
+ skb->data_len = 0;
+
+ NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE;
+ goto done;
+ } else if (skb->head_frag) {
+ int nr_frags = pinfo->nr_frags;
+ skb_frag_t *frag = pinfo->frags + nr_frags;
+ struct page *page = virt_to_head_page(skb->head);
+ unsigned int first_size = headlen - offset;
+ unsigned int first_offset;
+
+ if (nr_frags + 1 + skbinfo->nr_frags > MAX_SKB_FRAGS)
+ goto merge;
+
+ first_offset = skb->data -
+ (unsigned char *)page_address(page) +
+ offset;
+
+ pinfo->nr_frags = nr_frags + 1 + skbinfo->nr_frags;
+
+ skb_frag_fill_page_desc(frag, page, first_offset, first_size);
+
+ memcpy(frag + 1, skbinfo->frags, sizeof(*frag) * skbinfo->nr_frags);
+ /* We dont need to clear skbinfo->nr_frags here */
+
+ new_truesize = SKB_DATA_ALIGN(sizeof(struct sk_buff));
+ delta_truesize = skb->truesize - new_truesize;
+ skb->truesize = new_truesize;
+ NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD;
+ goto done;
+ }
+
+merge:
+ /* sk ownership - if any - completely transferred to the aggregated packet */
+ skb->destructor = NULL;
+ skb->sk = NULL;
+ delta_truesize = skb->truesize;
+ if (offset > headlen) {
+ unsigned int eat = offset - headlen;
+
+ skb_frag_off_add(&skbinfo->frags[0], eat);
+ skb_frag_size_sub(&skbinfo->frags[0], eat);
+ skb->data_len -= eat;
+ skb->len -= eat;
+ offset = headlen;
+ }
+
+ __skb_pull(skb, offset);
+
+ if (NAPI_GRO_CB(p)->last == p)
+ skb_shinfo(p)->frag_list = skb;
+ else
+ NAPI_GRO_CB(p)->last->next = skb;
+ NAPI_GRO_CB(p)->last = skb;
+ __skb_header_release(skb);
+ lp = p;
+
+done:
+ NAPI_GRO_CB(p)->count += segs;
+ p->data_len += len;
+ p->truesize += delta_truesize;
+ p->len += len;
+ if (lp != p) {
+ lp->data_len += len;
+ lp->truesize += delta_truesize;
+ lp->len += len;
+ }
+ NAPI_GRO_CB(skb)->same_flow = 1;
+ return 0;
+}
+
+int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb)
+{
+ if (unlikely(p->len + skb->len >= 65536))
+ return -E2BIG;
+
+ if (NAPI_GRO_CB(p)->last == p)
+ skb_shinfo(p)->frag_list = skb;
+ else
+ NAPI_GRO_CB(p)->last->next = skb;
+
+ skb_pull(skb, skb_gro_offset(skb));
+
+ NAPI_GRO_CB(p)->last = skb;
+ NAPI_GRO_CB(p)->count++;
+ p->data_len += skb->len;
+
+ /* sk ownership - if any - completely transferred to the aggregated packet */
+ skb->destructor = NULL;
+ skb->sk = NULL;
+ p->truesize += skb->truesize;
+ p->len += skb->len;
+
+ NAPI_GRO_CB(skb)->same_flow = 1;
+
+ return 0;
+}
+
+static void gro_complete(struct gro_node *gro, struct sk_buff *skb)
+{
+ struct list_head *head = &net_hotdata.offload_base;
+ struct packet_offload *ptype;
+ __be16 type = skb->protocol;
+ int err = -ENOENT;
+
+ BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
+
+ if (NAPI_GRO_CB(skb)->count == 1) {
+ skb_shinfo(skb)->gso_size = 0;
+ goto out;
+ }
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(ptype, head, list) {
+ if (ptype->type != type || !ptype->callbacks.gro_complete)
+ continue;
+
+ err = INDIRECT_CALL_INET(ptype->callbacks.gro_complete,
+ ipv6_gro_complete, inet_gro_complete,
+ skb, 0);
+ break;
+ }
+ rcu_read_unlock();
+
+ if (err) {
+ WARN_ON(&ptype->list == head);
+ kfree_skb(skb);
+ return;
+ }
+
+out:
+ gro_normal_one(gro, skb, NAPI_GRO_CB(skb)->count);
+}
+
+static void __gro_flush_chain(struct gro_node *gro, u32 index, bool flush_old)
+{
+ struct list_head *head = &gro->hash[index].list;
+ struct sk_buff *skb, *p;
+
+ list_for_each_entry_safe_reverse(skb, p, head, list) {
+ if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
+ return;
+ skb_list_del_init(skb);
+ gro_complete(gro, skb);
+ gro->hash[index].count--;
+ }
+
+ if (!gro->hash[index].count)
+ __clear_bit(index, &gro->bitmask);
+}
+
+/*
+ * gro->hash[].list contains packets ordered by age.
+ * youngest packets at the head of it.
+ * Complete skbs in reverse order to reduce latencies.
+ */
+void __gro_flush(struct gro_node *gro, bool flush_old)
+{
+ unsigned long bitmask = gro->bitmask;
+ unsigned int i, base = ~0U;
+
+ while ((i = ffs(bitmask)) != 0) {
+ bitmask >>= i;
+ base += i;
+ __gro_flush_chain(gro, base, flush_old);
+ }
+}
+EXPORT_SYMBOL(__gro_flush);
+
+static unsigned long gro_list_prepare_tc_ext(const struct sk_buff *skb,
+ const struct sk_buff *p,
+ unsigned long diffs)
+{
+#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
+ struct tc_skb_ext *skb_ext;
+ struct tc_skb_ext *p_ext;
+
+ skb_ext = skb_ext_find(skb, TC_SKB_EXT);
+ p_ext = skb_ext_find(p, TC_SKB_EXT);
+
+ diffs |= (!!p_ext) ^ (!!skb_ext);
+ if (!diffs && unlikely(skb_ext))
+ diffs |= p_ext->chain ^ skb_ext->chain;
+#endif
+ return diffs;
+}
+
+static void gro_list_prepare(const struct list_head *head,
+ const struct sk_buff *skb)
+{
+ unsigned int maclen = skb->dev->hard_header_len;
+ u32 hash = skb_get_hash_raw(skb);
+ struct sk_buff *p;
+
+ list_for_each_entry(p, head, list) {
+ unsigned long diffs;
+
+ if (hash != skb_get_hash_raw(p)) {
+ NAPI_GRO_CB(p)->same_flow = 0;
+ continue;
+ }
+
+ diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
+ diffs |= p->vlan_all ^ skb->vlan_all;
+ diffs |= skb_metadata_differs(p, skb);
+ if (maclen == ETH_HLEN)
+ diffs |= compare_ether_header(skb_mac_header(p),
+ skb_mac_header(skb));
+ else if (!diffs)
+ diffs = memcmp(skb_mac_header(p),
+ skb_mac_header(skb),
+ maclen);
+
+ /* in most common scenarios 'slow_gro' is 0
+ * otherwise we are already on some slower paths
+ * either skip all the infrequent tests altogether or
+ * avoid trying too hard to skip each of them individually
+ */
+ if (!diffs && unlikely(skb->slow_gro | p->slow_gro)) {
+ diffs |= p->sk != skb->sk;
+ diffs |= skb_metadata_dst_cmp(p, skb);
+ diffs |= skb_get_nfct(p) ^ skb_get_nfct(skb);
+
+ diffs |= gro_list_prepare_tc_ext(skb, p, diffs);
+ diffs |= __psp_skb_coalesce_diff(skb, p, diffs);
+ }
+
+ NAPI_GRO_CB(p)->same_flow = !diffs;
+ }
+}
+
+static inline void skb_gro_reset_offset(struct sk_buff *skb, u32 nhoff)
+{
+ const struct skb_shared_info *pinfo;
+ const skb_frag_t *frag0;
+ unsigned int headlen;
+
+ NAPI_GRO_CB(skb)->network_offset = 0;
+ NAPI_GRO_CB(skb)->data_offset = 0;
+ headlen = skb_headlen(skb);
+ NAPI_GRO_CB(skb)->frag0 = skb->data;
+ NAPI_GRO_CB(skb)->frag0_len = headlen;
+ if (headlen)
+ return;
+
+ pinfo = skb_shinfo(skb);
+ frag0 = &pinfo->frags[0];
+
+ if (pinfo->nr_frags && skb_frag_page(frag0) &&
+ !PageHighMem(skb_frag_page(frag0)) &&
+ (!NET_IP_ALIGN || !((skb_frag_off(frag0) + nhoff) & 3))) {
+ NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
+ NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
+ skb_frag_size(frag0),
+ skb->end - skb->tail);
+ }
+}
+
+static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
+{
+ struct skb_shared_info *pinfo = skb_shinfo(skb);
+
+ BUG_ON(skb->end - skb->tail < grow);
+
+ memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
+
+ skb->data_len -= grow;
+ skb->tail += grow;
+
+ skb_frag_off_add(&pinfo->frags[0], grow);
+ skb_frag_size_sub(&pinfo->frags[0], grow);
+
+ if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
+ skb_frag_unref(skb, 0);
+ memmove(pinfo->frags, pinfo->frags + 1,
+ --pinfo->nr_frags * sizeof(pinfo->frags[0]));
+ }
+}
+
+static void gro_try_pull_from_frag0(struct sk_buff *skb)
+{
+ int grow = skb_gro_offset(skb) - skb_headlen(skb);
+
+ if (grow > 0)
+ gro_pull_from_frag0(skb, grow);
+}
+
+static void gro_flush_oldest(struct gro_node *gro, struct list_head *head)
+{
+ struct sk_buff *oldest;
+
+ oldest = list_last_entry(head, struct sk_buff, list);
+
+ /* We are called with head length >= MAX_GRO_SKBS, so this is
+ * impossible.
+ */
+ if (WARN_ON_ONCE(!oldest))
+ return;
+
+ /* Do not adjust napi->gro_hash[].count, caller is adding a new
+ * SKB to the chain.
+ */
+ skb_list_del_init(oldest);
+ gro_complete(gro, oldest);
+}
+
+static enum gro_result dev_gro_receive(struct gro_node *gro,
+ struct sk_buff *skb)
+{
+ u32 bucket = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
+ struct list_head *head = &net_hotdata.offload_base;
+ struct gro_list *gro_list = &gro->hash[bucket];
+ struct packet_offload *ptype;
+ __be16 type = skb->protocol;
+ struct sk_buff *pp = NULL;
+ enum gro_result ret;
+ int same_flow;
+
+ if (netif_elide_gro(skb->dev))
+ goto normal;
+
+ gro_list_prepare(&gro_list->list, skb);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(ptype, head, list) {
+ if (ptype->type == type && ptype->callbacks.gro_receive)
+ goto found_ptype;
+ }
+ rcu_read_unlock();
+ goto normal;
+
+found_ptype:
+ skb_set_network_header(skb, skb_gro_offset(skb));
+ skb_reset_mac_len(skb);
+ BUILD_BUG_ON(sizeof_field(struct napi_gro_cb, zeroed) != sizeof(u32));
+ BUILD_BUG_ON(!IS_ALIGNED(offsetof(struct napi_gro_cb, zeroed),
+ sizeof(u32))); /* Avoid slow unaligned acc */
+ *(u32 *)&NAPI_GRO_CB(skb)->zeroed = 0;
+ NAPI_GRO_CB(skb)->flush = skb_has_frag_list(skb);
+ NAPI_GRO_CB(skb)->count = 1;
+ if (unlikely(skb_is_gso(skb))) {
+ NAPI_GRO_CB(skb)->count = skb_shinfo(skb)->gso_segs;
+ /* Only support TCP and non DODGY users. */
+ if (!skb_is_gso_tcp(skb) ||
+ (skb_shinfo(skb)->gso_type & SKB_GSO_DODGY))
+ NAPI_GRO_CB(skb)->flush = 1;
+ }
+
+ /* Setup for GRO checksum validation */
+ switch (skb->ip_summed) {
+ case CHECKSUM_COMPLETE:
+ NAPI_GRO_CB(skb)->csum = skb->csum;
+ NAPI_GRO_CB(skb)->csum_valid = 1;
+ break;
+ case CHECKSUM_UNNECESSARY:
+ NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
+ break;
+ }
+
+ pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive,
+ ipv6_gro_receive, inet_gro_receive,
+ &gro_list->list, skb);
+
+ rcu_read_unlock();
+
+ if (PTR_ERR(pp) == -EINPROGRESS) {
+ ret = GRO_CONSUMED;
+ goto ok;
+ }
+
+ same_flow = NAPI_GRO_CB(skb)->same_flow;
+ ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
+
+ if (pp) {
+ skb_list_del_init(pp);
+ gro_complete(gro, pp);
+ gro_list->count--;
+ }
+
+ if (same_flow)
+ goto ok;
+
+ if (NAPI_GRO_CB(skb)->flush)
+ goto normal;
+
+ if (unlikely(gro_list->count >= MAX_GRO_SKBS))
+ gro_flush_oldest(gro, &gro_list->list);
+ else
+ gro_list->count++;
+
+ /* Must be called before setting NAPI_GRO_CB(skb)->{age|last} */
+ gro_try_pull_from_frag0(skb);
+ NAPI_GRO_CB(skb)->age = jiffies;
+ NAPI_GRO_CB(skb)->last = skb;
+ if (!skb_is_gso(skb))
+ skb_shinfo(skb)->gso_size = skb_gro_len(skb);
+ list_add(&skb->list, &gro_list->list);
+ ret = GRO_HELD;
+ok:
+ if (gro_list->count) {
+ if (!test_bit(bucket, &gro->bitmask))
+ __set_bit(bucket, &gro->bitmask);
+ } else if (test_bit(bucket, &gro->bitmask)) {
+ __clear_bit(bucket, &gro->bitmask);
+ }
+
+ return ret;
+
+normal:
+ ret = GRO_NORMAL;
+ gro_try_pull_from_frag0(skb);
+ goto ok;
+}
+
+struct packet_offload *gro_find_receive_by_type(__be16 type)
+{
+ struct list_head *offload_head = &net_hotdata.offload_base;
+ struct packet_offload *ptype;
+
+ list_for_each_entry_rcu(ptype, offload_head, list) {
+ if (ptype->type != type || !ptype->callbacks.gro_receive)
+ continue;
+ return ptype;
+ }
+ return NULL;
+}
+EXPORT_SYMBOL(gro_find_receive_by_type);
+
+struct packet_offload *gro_find_complete_by_type(__be16 type)
+{
+ struct list_head *offload_head = &net_hotdata.offload_base;
+ struct packet_offload *ptype;
+
+ list_for_each_entry_rcu(ptype, offload_head, list) {
+ if (ptype->type != type || !ptype->callbacks.gro_complete)
+ continue;
+ return ptype;
+ }
+ return NULL;
+}
+EXPORT_SYMBOL(gro_find_complete_by_type);
+
+static gro_result_t gro_skb_finish(struct gro_node *gro, struct sk_buff *skb,
+ gro_result_t ret)
+{
+ switch (ret) {
+ case GRO_NORMAL:
+ gro_normal_one(gro, skb, 1);
+ break;
+
+ case GRO_MERGED_FREE:
+ if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
+ napi_skb_free_stolen_head(skb);
+ else if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
+ __kfree_skb(skb);
+ else
+ __napi_kfree_skb(skb, SKB_CONSUMED);
+ break;
+
+ case GRO_HELD:
+ case GRO_MERGED:
+ case GRO_CONSUMED:
+ break;
+ }
+
+ return ret;
+}
+
+gro_result_t gro_receive_skb(struct gro_node *gro, struct sk_buff *skb)
+{
+ gro_result_t ret;
+
+ __skb_mark_napi_id(skb, gro);
+ trace_napi_gro_receive_entry(skb);
+
+ skb_gro_reset_offset(skb, 0);
+
+ ret = gro_skb_finish(gro, skb, dev_gro_receive(gro, skb));
+ trace_napi_gro_receive_exit(ret);
+
+ return ret;
+}
+EXPORT_SYMBOL(gro_receive_skb);
+
+static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
+{
+ struct skb_shared_info *shinfo;
+
+ if (unlikely(skb->pfmemalloc)) {
+ consume_skb(skb);
+ return;
+ }
+ __skb_pull(skb, skb_headlen(skb));
+ /* restore the reserve we had after netdev_alloc_skb_ip_align() */
+ skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
+ __vlan_hwaccel_clear_tag(skb);
+ skb->dev = napi->dev;
+ skb->skb_iif = 0;
+
+ /* eth_type_trans() assumes pkt_type is PACKET_HOST */
+ skb->pkt_type = PACKET_HOST;
+
+ skb->encapsulation = 0;
+ skb->ip_summed = CHECKSUM_NONE;
+
+ shinfo = skb_shinfo(skb);
+ shinfo->gso_type = 0;
+ shinfo->gso_size = 0;
+ shinfo->hwtstamps.hwtstamp = 0;
+
+ if (unlikely(skb->slow_gro)) {
+ skb_orphan(skb);
+ skb_ext_reset(skb);
+ nf_reset_ct(skb);
+ skb->slow_gro = 0;
+ }
+
+ napi->skb = skb;
+}
+
+struct sk_buff *napi_get_frags(struct napi_struct *napi)
+{
+ struct sk_buff *skb = napi->skb;
+
+ if (!skb) {
+ skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
+ if (skb) {
+ napi->skb = skb;
+ skb_mark_napi_id(skb, napi);
+ }
+ }
+ return skb;
+}
+EXPORT_SYMBOL(napi_get_frags);
+
+static gro_result_t napi_frags_finish(struct napi_struct *napi,
+ struct sk_buff *skb,
+ gro_result_t ret)
+{
+ switch (ret) {
+ case GRO_NORMAL:
+ case GRO_HELD:
+ __skb_push(skb, ETH_HLEN);
+ skb->protocol = eth_type_trans(skb, skb->dev);
+ if (ret == GRO_NORMAL)
+ gro_normal_one(&napi->gro, skb, 1);
+ break;
+
+ case GRO_MERGED_FREE:
+ if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
+ napi_skb_free_stolen_head(skb);
+ else
+ napi_reuse_skb(napi, skb);
+ break;
+
+ case GRO_MERGED:
+ case GRO_CONSUMED:
+ break;
+ }
+
+ return ret;
+}
+
+/* Upper GRO stack assumes network header starts at gro_offset=0
+ * Drivers could call both napi_gro_frags() and napi_gro_receive()
+ * We copy ethernet header into skb->data to have a common layout.
+ */
+static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
+{
+ struct sk_buff *skb = napi->skb;
+ const struct ethhdr *eth;
+ unsigned int hlen = sizeof(*eth);
+
+ napi->skb = NULL;
+
+ skb_reset_mac_header(skb);
+ skb_gro_reset_offset(skb, hlen);
+
+ if (unlikely(!skb_gro_may_pull(skb, hlen))) {
+ eth = skb_gro_header_slow(skb, hlen, 0);
+ if (unlikely(!eth)) {
+ net_warn_ratelimited("%s: dropping impossible skb from %s\n",
+ __func__, napi->dev->name);
+ napi_reuse_skb(napi, skb);
+ return NULL;
+ }
+ } else {
+ eth = (const struct ethhdr *)skb->data;
+
+ if (NAPI_GRO_CB(skb)->frag0 != skb->data)
+ gro_pull_from_frag0(skb, hlen);
+
+ NAPI_GRO_CB(skb)->frag0 += hlen;
+ NAPI_GRO_CB(skb)->frag0_len -= hlen;
+ }
+ __skb_pull(skb, hlen);
+
+ /*
+ * This works because the only protocols we care about don't require
+ * special handling.
+ * We'll fix it up properly in napi_frags_finish()
+ */
+ skb->protocol = eth->h_proto;
+
+ return skb;
+}
+
+gro_result_t napi_gro_frags(struct napi_struct *napi)
+{
+ gro_result_t ret;
+ struct sk_buff *skb = napi_frags_skb(napi);
+
+ trace_napi_gro_frags_entry(skb);
+
+ ret = napi_frags_finish(napi, skb, dev_gro_receive(&napi->gro, skb));
+ trace_napi_gro_frags_exit(ret);
+
+ return ret;
+}
+EXPORT_SYMBOL(napi_gro_frags);
+
+/* Compute the checksum from gro_offset and return the folded value
+ * after adding in any pseudo checksum.
+ */
+__sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
+{
+ __wsum wsum;
+ __sum16 sum;
+
+ wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
+
+ /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
+ sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
+ /* See comments in __skb_checksum_complete(). */
+ if (likely(!sum)) {
+ if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
+ !skb->csum_complete_sw)
+ netdev_rx_csum_fault(skb->dev, skb);
+ }
+
+ NAPI_GRO_CB(skb)->csum = wsum;
+ NAPI_GRO_CB(skb)->csum_valid = 1;
+
+ return sum;
+}
+EXPORT_SYMBOL(__skb_gro_checksum_complete);
+
+void gro_init(struct gro_node *gro)
+{
+ for (u32 i = 0; i < GRO_HASH_BUCKETS; i++) {
+ INIT_LIST_HEAD(&gro->hash[i].list);
+ gro->hash[i].count = 0;
+ }
+
+ gro->bitmask = 0;
+ gro->cached_napi_id = 0;
+
+ INIT_LIST_HEAD(&gro->rx_list);
+ gro->rx_count = 0;
+}
+
+void gro_cleanup(struct gro_node *gro)
+{
+ struct sk_buff *skb, *n;
+
+ for (u32 i = 0; i < GRO_HASH_BUCKETS; i++) {
+ list_for_each_entry_safe(skb, n, &gro->hash[i].list, list)
+ kfree_skb(skb);
+
+ gro->hash[i].count = 0;
+ }
+
+ gro->bitmask = 0;
+ gro->cached_napi_id = 0;
+
+ list_for_each_entry_safe(skb, n, &gro->rx_list, list)
+ kfree_skb(skb);
+
+ gro->rx_count = 0;
+}
diff --git a/net/core/gro_cells.c b/net/core/gro_cells.c
new file mode 100644
index 000000000000..a725d21159a6
--- /dev/null
+++ b/net/core/gro_cells.c
@@ -0,0 +1,148 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/netdevice.h>
+#include <net/gro_cells.h>
+#include <net/hotdata.h>
+
+struct gro_cell {
+ struct sk_buff_head napi_skbs;
+ struct napi_struct napi;
+ local_lock_t bh_lock;
+};
+
+int gro_cells_receive(struct gro_cells *gcells, struct sk_buff *skb)
+{
+ struct net_device *dev = skb->dev;
+ bool have_bh_lock = false;
+ struct gro_cell *cell;
+ int res;
+
+ rcu_read_lock();
+ if (unlikely(!(dev->flags & IFF_UP)))
+ goto drop;
+
+ if (!gcells->cells || skb_cloned(skb) || netif_elide_gro(dev)) {
+ res = netif_rx(skb);
+ goto unlock;
+ }
+
+ local_lock_nested_bh(&gcells->cells->bh_lock);
+ have_bh_lock = true;
+ cell = this_cpu_ptr(gcells->cells);
+
+ if (skb_queue_len(&cell->napi_skbs) > READ_ONCE(net_hotdata.max_backlog)) {
+drop:
+ dev_core_stats_rx_dropped_inc(dev);
+ kfree_skb(skb);
+ res = NET_RX_DROP;
+ goto unlock;
+ }
+
+ __skb_queue_tail(&cell->napi_skbs, skb);
+ if (skb_queue_len(&cell->napi_skbs) == 1)
+ napi_schedule(&cell->napi);
+
+ res = NET_RX_SUCCESS;
+
+unlock:
+ if (have_bh_lock)
+ local_unlock_nested_bh(&gcells->cells->bh_lock);
+ rcu_read_unlock();
+ return res;
+}
+EXPORT_SYMBOL(gro_cells_receive);
+
+/* called under BH context */
+static int gro_cell_poll(struct napi_struct *napi, int budget)
+{
+ struct gro_cell *cell = container_of(napi, struct gro_cell, napi);
+ struct sk_buff *skb;
+ int work_done = 0;
+
+ while (work_done < budget) {
+ __local_lock_nested_bh(&cell->bh_lock);
+ skb = __skb_dequeue(&cell->napi_skbs);
+ __local_unlock_nested_bh(&cell->bh_lock);
+ if (!skb)
+ break;
+ napi_gro_receive(napi, skb);
+ work_done++;
+ }
+
+ if (work_done < budget)
+ napi_complete_done(napi, work_done);
+ return work_done;
+}
+
+int gro_cells_init(struct gro_cells *gcells, struct net_device *dev)
+{
+ int i;
+
+ gcells->cells = alloc_percpu(struct gro_cell);
+ if (!gcells->cells)
+ return -ENOMEM;
+
+ for_each_possible_cpu(i) {
+ struct gro_cell *cell = per_cpu_ptr(gcells->cells, i);
+
+ __skb_queue_head_init(&cell->napi_skbs);
+ local_lock_init(&cell->bh_lock);
+
+ set_bit(NAPI_STATE_NO_BUSY_POLL, &cell->napi.state);
+
+ netif_napi_add(dev, &cell->napi, gro_cell_poll);
+ napi_enable(&cell->napi);
+ }
+ return 0;
+}
+EXPORT_SYMBOL(gro_cells_init);
+
+struct percpu_free_defer {
+ struct rcu_head rcu;
+ void __percpu *ptr;
+};
+
+static void percpu_free_defer_callback(struct rcu_head *head)
+{
+ struct percpu_free_defer *defer;
+
+ defer = container_of(head, struct percpu_free_defer, rcu);
+ free_percpu(defer->ptr);
+ kfree(defer);
+}
+
+void gro_cells_destroy(struct gro_cells *gcells)
+{
+ struct percpu_free_defer *defer;
+ int i;
+
+ if (!gcells->cells)
+ return;
+ for_each_possible_cpu(i) {
+ struct gro_cell *cell = per_cpu_ptr(gcells->cells, i);
+
+ napi_disable(&cell->napi);
+ __netif_napi_del(&cell->napi);
+ __skb_queue_purge(&cell->napi_skbs);
+ }
+ /* We need to observe an rcu grace period before freeing ->cells,
+ * because netpoll could access dev->napi_list under rcu protection.
+ * Try hard using call_rcu() instead of synchronize_rcu(),
+ * because we might be called from cleanup_net(), and we
+ * definitely do not want to block this critical task.
+ */
+ defer = kmalloc(sizeof(*defer), GFP_KERNEL | __GFP_NOWARN);
+ if (likely(defer)) {
+ defer->ptr = gcells->cells;
+ call_rcu(&defer->rcu, percpu_free_defer_callback);
+ } else {
+ /* We do not hold RTNL at this point, synchronize_net()
+ * would not be able to expedite this sync.
+ */
+ synchronize_rcu_expedited();
+ free_percpu(gcells->cells);
+ }
+ gcells->cells = NULL;
+}
+EXPORT_SYMBOL(gro_cells_destroy);
diff --git a/net/core/gso.c b/net/core/gso.c
new file mode 100644
index 000000000000..bcd156372f4d
--- /dev/null
+++ b/net/core/gso.c
@@ -0,0 +1,273 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+#include <linux/skbuff.h>
+#include <linux/sctp.h>
+#include <net/gso.h>
+#include <net/gro.h>
+
+/**
+ * skb_eth_gso_segment - segmentation handler for ethernet protocols.
+ * @skb: buffer to segment
+ * @features: features for the output path (see dev->features)
+ * @type: Ethernet Protocol ID
+ */
+struct sk_buff *skb_eth_gso_segment(struct sk_buff *skb,
+ netdev_features_t features, __be16 type)
+{
+ struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
+ struct packet_offload *ptype;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(ptype, &net_hotdata.offload_base, list) {
+ if (ptype->type == type && ptype->callbacks.gso_segment) {
+ segs = ptype->callbacks.gso_segment(skb, features);
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return segs;
+}
+EXPORT_SYMBOL(skb_eth_gso_segment);
+
+/**
+ * skb_mac_gso_segment - mac layer segmentation handler.
+ * @skb: buffer to segment
+ * @features: features for the output path (see dev->features)
+ */
+struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
+ struct packet_offload *ptype;
+ int vlan_depth = skb->mac_len;
+ __be16 type = skb_network_protocol(skb, &vlan_depth);
+
+ if (unlikely(!type))
+ return ERR_PTR(-EINVAL);
+
+ __skb_pull(skb, vlan_depth);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(ptype, &net_hotdata.offload_base, list) {
+ if (ptype->type == type && ptype->callbacks.gso_segment) {
+ segs = ptype->callbacks.gso_segment(skb, features);
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ __skb_push(skb, skb->data - skb_mac_header(skb));
+
+ return segs;
+}
+EXPORT_SYMBOL(skb_mac_gso_segment);
+/* openvswitch calls this on rx path, so we need a different check.
+ */
+static bool skb_needs_check(const struct sk_buff *skb, bool tx_path)
+{
+ if (tx_path)
+ return skb->ip_summed != CHECKSUM_PARTIAL &&
+ skb->ip_summed != CHECKSUM_UNNECESSARY;
+
+ return skb->ip_summed == CHECKSUM_NONE;
+}
+
+/**
+ * __skb_gso_segment - Perform segmentation on skb.
+ * @skb: buffer to segment
+ * @features: features for the output path (see dev->features)
+ * @tx_path: whether it is called in TX path
+ *
+ * This function segments the given skb and returns a list of segments.
+ *
+ * It may return NULL if the skb requires no segmentation. This is
+ * only possible when GSO is used for verifying header integrity.
+ *
+ * Segmentation preserves SKB_GSO_CB_OFFSET bytes of previous skb cb.
+ */
+struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
+ netdev_features_t features, bool tx_path)
+{
+ struct sk_buff *segs;
+
+ if (unlikely(skb_needs_check(skb, tx_path))) {
+ int err;
+
+ /* We're going to init ->check field in TCP or UDP header */
+ err = skb_cow_head(skb, 0);
+ if (err < 0)
+ return ERR_PTR(err);
+ }
+
+ /* Only report GSO partial support if it will enable us to
+ * support segmentation on this frame without needing additional
+ * work.
+ */
+ if (features & NETIF_F_GSO_PARTIAL) {
+ netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
+ struct net_device *dev = skb->dev;
+
+ partial_features |= dev->features & dev->gso_partial_features;
+ if (!skb_gso_ok(skb, features | partial_features))
+ features &= ~NETIF_F_GSO_PARTIAL;
+ }
+
+ BUILD_BUG_ON(SKB_GSO_CB_OFFSET +
+ sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
+
+ SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
+ SKB_GSO_CB(skb)->encap_level = 0;
+
+ skb_reset_mac_header(skb);
+ skb_reset_mac_len(skb);
+
+ segs = skb_mac_gso_segment(skb, features);
+
+ if (segs != skb && unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs)))
+ skb_warn_bad_offload(skb);
+
+ return segs;
+}
+EXPORT_SYMBOL(__skb_gso_segment);
+
+/**
+ * skb_gso_transport_seglen - Return length of individual segments of a gso packet
+ *
+ * @skb: GSO skb
+ *
+ * skb_gso_transport_seglen is used to determine the real size of the
+ * individual segments, including Layer4 headers (TCP/UDP).
+ *
+ * The MAC/L2 or network (IP, IPv6) headers are not accounted for.
+ */
+static unsigned int skb_gso_transport_seglen(const struct sk_buff *skb)
+{
+ const struct skb_shared_info *shinfo = skb_shinfo(skb);
+ unsigned int thlen = 0;
+
+ if (skb->encapsulation) {
+ thlen = skb_inner_transport_header(skb) -
+ skb_transport_header(skb);
+
+ if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
+ thlen += inner_tcp_hdrlen(skb);
+ } else if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
+ thlen = tcp_hdrlen(skb);
+ } else if (unlikely(skb_is_gso_sctp(skb))) {
+ thlen = sizeof(struct sctphdr);
+ } else if (shinfo->gso_type & SKB_GSO_UDP_L4) {
+ thlen = sizeof(struct udphdr);
+ }
+ /* UFO sets gso_size to the size of the fragmentation
+ * payload, i.e. the size of the L4 (UDP) header is already
+ * accounted for.
+ */
+ return thlen + shinfo->gso_size;
+}
+
+/**
+ * skb_gso_network_seglen - Return length of individual segments of a gso packet
+ *
+ * @skb: GSO skb
+ *
+ * skb_gso_network_seglen is used to determine the real size of the
+ * individual segments, including Layer3 (IP, IPv6) and L4 headers (TCP/UDP).
+ *
+ * The MAC/L2 header is not accounted for.
+ */
+static unsigned int skb_gso_network_seglen(const struct sk_buff *skb)
+{
+ unsigned int hdr_len = skb_transport_header(skb) -
+ skb_network_header(skb);
+
+ return hdr_len + skb_gso_transport_seglen(skb);
+}
+
+/**
+ * skb_gso_mac_seglen - Return length of individual segments of a gso packet
+ *
+ * @skb: GSO skb
+ *
+ * skb_gso_mac_seglen is used to determine the real size of the
+ * individual segments, including MAC/L2, Layer3 (IP, IPv6) and L4
+ * headers (TCP/UDP).
+ */
+static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb)
+{
+ unsigned int hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
+
+ return hdr_len + skb_gso_transport_seglen(skb);
+}
+
+/**
+ * skb_gso_size_check - check the skb size, considering GSO_BY_FRAGS
+ *
+ * There are a couple of instances where we have a GSO skb, and we
+ * want to determine what size it would be after it is segmented.
+ *
+ * We might want to check:
+ * - L3+L4+payload size (e.g. IP forwarding)
+ * - L2+L3+L4+payload size (e.g. sanity check before passing to driver)
+ *
+ * This is a helper to do that correctly considering GSO_BY_FRAGS.
+ *
+ * @skb: GSO skb
+ *
+ * @seg_len: The segmented length (from skb_gso_*_seglen). In the
+ * GSO_BY_FRAGS case this will be [header sizes + GSO_BY_FRAGS].
+ *
+ * @max_len: The maximum permissible length.
+ *
+ * Returns true if the segmented length <= max length.
+ */
+static inline bool skb_gso_size_check(const struct sk_buff *skb,
+ unsigned int seg_len,
+ unsigned int max_len) {
+ const struct skb_shared_info *shinfo = skb_shinfo(skb);
+ const struct sk_buff *iter;
+
+ if (shinfo->gso_size != GSO_BY_FRAGS)
+ return seg_len <= max_len;
+
+ /* Undo this so we can re-use header sizes */
+ seg_len -= GSO_BY_FRAGS;
+
+ skb_walk_frags(skb, iter) {
+ if (seg_len + skb_headlen(iter) > max_len)
+ return false;
+ }
+
+ return true;
+}
+
+/**
+ * skb_gso_validate_network_len - Will a split GSO skb fit into a given MTU?
+ *
+ * @skb: GSO skb
+ * @mtu: MTU to validate against
+ *
+ * skb_gso_validate_network_len validates if a given skb will fit a
+ * wanted MTU once split. It considers L3 headers, L4 headers, and the
+ * payload.
+ */
+bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int mtu)
+{
+ return skb_gso_size_check(skb, skb_gso_network_seglen(skb), mtu);
+}
+EXPORT_SYMBOL_GPL(skb_gso_validate_network_len);
+
+/**
+ * skb_gso_validate_mac_len - Will a split GSO skb fit in a given length?
+ *
+ * @skb: GSO skb
+ * @len: length to validate against
+ *
+ * skb_gso_validate_mac_len validates if a given skb will fit a wanted
+ * length once split, including L2, L3 and L4 headers and the payload.
+ */
+bool skb_gso_validate_mac_len(const struct sk_buff *skb, unsigned int len)
+{
+ return skb_gso_size_check(skb, skb_gso_mac_seglen(skb), len);
+}
+EXPORT_SYMBOL_GPL(skb_gso_validate_mac_len);
+
diff --git a/net/core/hotdata.c b/net/core/hotdata.c
new file mode 100644
index 000000000000..dddd5c287cf0
--- /dev/null
+++ b/net/core/hotdata.c
@@ -0,0 +1,29 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+#include <linux/cache.h>
+#include <linux/jiffies.h>
+#include <linux/list.h>
+#include <net/aligned_data.h>
+#include <net/hotdata.h>
+#include <net/ip.h>
+#include <net/proto_memory.h>
+
+struct net_hotdata net_hotdata __cacheline_aligned = {
+ .offload_base = LIST_HEAD_INIT(net_hotdata.offload_base),
+ .gro_normal_batch = 8,
+
+ .netdev_budget = 300,
+ /* Must be at least 2 jiffes to guarantee 1 jiffy timeout */
+ .netdev_budget_usecs = 2 * USEC_PER_SEC / HZ,
+
+ .tstamp_prequeue = 1,
+ .max_backlog = 1000,
+ .dev_tx_weight = 64,
+ .dev_rx_weight = 64,
+ .sysctl_max_skb_frags = MAX_SKB_FRAGS,
+ .sysctl_skb_defer_max = 128,
+ .sysctl_mem_pcpu_rsv = SK_MEMORY_PCPU_RESERVE
+};
+EXPORT_SYMBOL(net_hotdata);
+
+struct net_aligned_data net_aligned_data;
+EXPORT_IPV6_MOD(net_aligned_data);
diff --git a/net/core/hwbm.c b/net/core/hwbm.c
new file mode 100644
index 000000000000..ac1a66df9adc
--- /dev/null
+++ b/net/core/hwbm.c
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Support for hardware buffer manager.
+ *
+ * Copyright (C) 2016 Marvell
+ *
+ * Gregory CLEMENT <gregory.clement@free-electrons.com>
+ */
+#include <linux/kernel.h>
+#include <linux/printk.h>
+#include <linux/skbuff.h>
+#include <net/hwbm.h>
+
+void hwbm_buf_free(struct hwbm_pool *bm_pool, void *buf)
+{
+ if (likely(bm_pool->frag_size <= PAGE_SIZE))
+ skb_free_frag(buf);
+ else
+ kfree(buf);
+}
+EXPORT_SYMBOL_GPL(hwbm_buf_free);
+
+/* Refill processing for HW buffer management */
+int hwbm_pool_refill(struct hwbm_pool *bm_pool, gfp_t gfp)
+{
+ int frag_size = bm_pool->frag_size;
+ void *buf;
+
+ if (likely(frag_size <= PAGE_SIZE))
+ buf = netdev_alloc_frag(frag_size);
+ else
+ buf = kmalloc(frag_size, gfp);
+
+ if (!buf)
+ return -ENOMEM;
+
+ if (bm_pool->construct)
+ if (bm_pool->construct(bm_pool, buf)) {
+ hwbm_buf_free(bm_pool, buf);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(hwbm_pool_refill);
+
+int hwbm_pool_add(struct hwbm_pool *bm_pool, unsigned int buf_num)
+{
+ int err, i;
+
+ mutex_lock(&bm_pool->buf_lock);
+ if (bm_pool->buf_num == bm_pool->size) {
+ pr_warn("pool already filled\n");
+ mutex_unlock(&bm_pool->buf_lock);
+ return bm_pool->buf_num;
+ }
+
+ if (buf_num + bm_pool->buf_num > bm_pool->size) {
+ pr_warn("cannot allocate %d buffers for pool\n",
+ buf_num);
+ mutex_unlock(&bm_pool->buf_lock);
+ return 0;
+ }
+
+ if ((buf_num + bm_pool->buf_num) < bm_pool->buf_num) {
+ pr_warn("Adding %d buffers to the %d current buffers will overflow\n",
+ buf_num, bm_pool->buf_num);
+ mutex_unlock(&bm_pool->buf_lock);
+ return 0;
+ }
+
+ for (i = 0; i < buf_num; i++) {
+ err = hwbm_pool_refill(bm_pool, GFP_KERNEL);
+ if (err < 0)
+ break;
+ }
+
+ /* Update BM driver with number of buffers added to pool */
+ bm_pool->buf_num += i;
+
+ pr_debug("hwpm pool: %d of %d buffers added\n", i, buf_num);
+ mutex_unlock(&bm_pool->buf_lock);
+
+ return i;
+}
+EXPORT_SYMBOL_GPL(hwbm_pool_add);
diff --git a/net/core/ieee8021q_helpers.c b/net/core/ieee8021q_helpers.c
new file mode 100644
index 000000000000..669b357b73b2
--- /dev/null
+++ b/net/core/ieee8021q_helpers.c
@@ -0,0 +1,224 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2024 Pengutronix, Oleksij Rempel <kernel@pengutronix.de>
+
+#include <linux/array_size.h>
+#include <linux/printk.h>
+#include <linux/types.h>
+#include <net/dscp.h>
+#include <net/ieee8021q.h>
+
+/* verify that table covers all 8 traffic types */
+#define TT_MAP_SIZE_OK(tbl) \
+ compiletime_assert(ARRAY_SIZE(tbl) == IEEE8021Q_TT_MAX, \
+ #tbl " size mismatch")
+
+/* The following arrays map Traffic Types (TT) to traffic classes (TC) for
+ * different number of queues as shown in the example provided by
+ * IEEE 802.1Q-2022 in Annex I "I.3 Traffic type to traffic class mapping" and
+ * Table I-1 "Traffic type to traffic class mapping".
+ */
+static const u8 ieee8021q_8queue_tt_tc_map[] = {
+ [IEEE8021Q_TT_BK] = 0,
+ [IEEE8021Q_TT_BE] = 1,
+ [IEEE8021Q_TT_EE] = 2,
+ [IEEE8021Q_TT_CA] = 3,
+ [IEEE8021Q_TT_VI] = 4,
+ [IEEE8021Q_TT_VO] = 5,
+ [IEEE8021Q_TT_IC] = 6,
+ [IEEE8021Q_TT_NC] = 7,
+};
+
+static const u8 ieee8021q_7queue_tt_tc_map[] = {
+ [IEEE8021Q_TT_BK] = 0,
+ [IEEE8021Q_TT_BE] = 1,
+ [IEEE8021Q_TT_EE] = 2,
+ [IEEE8021Q_TT_CA] = 3,
+ [IEEE8021Q_TT_VI] = 4, [IEEE8021Q_TT_VO] = 4,
+ [IEEE8021Q_TT_IC] = 5,
+ [IEEE8021Q_TT_NC] = 6,
+};
+
+static const u8 ieee8021q_6queue_tt_tc_map[] = {
+ [IEEE8021Q_TT_BK] = 0,
+ [IEEE8021Q_TT_BE] = 1,
+ [IEEE8021Q_TT_EE] = 2, [IEEE8021Q_TT_CA] = 2,
+ [IEEE8021Q_TT_VI] = 3, [IEEE8021Q_TT_VO] = 3,
+ [IEEE8021Q_TT_IC] = 4,
+ [IEEE8021Q_TT_NC] = 5,
+};
+
+static const u8 ieee8021q_5queue_tt_tc_map[] = {
+ [IEEE8021Q_TT_BK] = 0, [IEEE8021Q_TT_BE] = 0,
+ [IEEE8021Q_TT_EE] = 1, [IEEE8021Q_TT_CA] = 1,
+ [IEEE8021Q_TT_VI] = 2, [IEEE8021Q_TT_VO] = 2,
+ [IEEE8021Q_TT_IC] = 3,
+ [IEEE8021Q_TT_NC] = 4,
+};
+
+static const u8 ieee8021q_4queue_tt_tc_map[] = {
+ [IEEE8021Q_TT_BK] = 0, [IEEE8021Q_TT_BE] = 0,
+ [IEEE8021Q_TT_EE] = 1, [IEEE8021Q_TT_CA] = 1,
+ [IEEE8021Q_TT_VI] = 2, [IEEE8021Q_TT_VO] = 2,
+ [IEEE8021Q_TT_IC] = 3, [IEEE8021Q_TT_NC] = 3,
+};
+
+static const u8 ieee8021q_3queue_tt_tc_map[] = {
+ [IEEE8021Q_TT_BK] = 0, [IEEE8021Q_TT_BE] = 0,
+ [IEEE8021Q_TT_EE] = 0, [IEEE8021Q_TT_CA] = 0,
+ [IEEE8021Q_TT_VI] = 1, [IEEE8021Q_TT_VO] = 1,
+ [IEEE8021Q_TT_IC] = 2, [IEEE8021Q_TT_NC] = 2,
+};
+
+static const u8 ieee8021q_2queue_tt_tc_map[] = {
+ [IEEE8021Q_TT_BK] = 0, [IEEE8021Q_TT_BE] = 0,
+ [IEEE8021Q_TT_EE] = 0, [IEEE8021Q_TT_CA] = 0,
+ [IEEE8021Q_TT_VI] = 1, [IEEE8021Q_TT_VO] = 1,
+ [IEEE8021Q_TT_IC] = 1, [IEEE8021Q_TT_NC] = 1,
+};
+
+static const u8 ieee8021q_1queue_tt_tc_map[] = {
+ [IEEE8021Q_TT_BK] = 0, [IEEE8021Q_TT_BE] = 0,
+ [IEEE8021Q_TT_EE] = 0, [IEEE8021Q_TT_CA] = 0,
+ [IEEE8021Q_TT_VI] = 0, [IEEE8021Q_TT_VO] = 0,
+ [IEEE8021Q_TT_IC] = 0, [IEEE8021Q_TT_NC] = 0,
+};
+
+/**
+ * ieee8021q_tt_to_tc - Map IEEE 802.1Q Traffic Type to Traffic Class
+ * @tt: IEEE 802.1Q Traffic Type
+ * @num_queues: Number of queues
+ *
+ * This function maps an IEEE 802.1Q Traffic Type to a Traffic Class (TC) based
+ * on the number of queues configured on the NIC. The mapping is based on the
+ * example provided by IEEE 802.1Q-2022 in Annex I "I.3 Traffic type to traffic
+ * class mapping" and Table I-1 "Traffic type to traffic class mapping".
+ *
+ * Return: Traffic Class corresponding to the given Traffic Type or negative
+ * value in case of error.
+ */
+int ieee8021q_tt_to_tc(enum ieee8021q_traffic_type tt, unsigned int num_queues)
+{
+ if (tt < 0 || tt >= IEEE8021Q_TT_MAX) {
+ pr_err("Requested Traffic Type (%d) is out of range (%d)\n", tt,
+ IEEE8021Q_TT_MAX);
+ return -EINVAL;
+ }
+
+ switch (num_queues) {
+ case 8:
+ TT_MAP_SIZE_OK(ieee8021q_8queue_tt_tc_map);
+ return ieee8021q_8queue_tt_tc_map[tt];
+ case 7:
+ TT_MAP_SIZE_OK(ieee8021q_7queue_tt_tc_map);
+ return ieee8021q_7queue_tt_tc_map[tt];
+ case 6:
+ TT_MAP_SIZE_OK(ieee8021q_6queue_tt_tc_map);
+ return ieee8021q_6queue_tt_tc_map[tt];
+ case 5:
+ TT_MAP_SIZE_OK(ieee8021q_5queue_tt_tc_map);
+ return ieee8021q_5queue_tt_tc_map[tt];
+ case 4:
+ TT_MAP_SIZE_OK(ieee8021q_4queue_tt_tc_map);
+ return ieee8021q_4queue_tt_tc_map[tt];
+ case 3:
+ TT_MAP_SIZE_OK(ieee8021q_3queue_tt_tc_map);
+ return ieee8021q_3queue_tt_tc_map[tt];
+ case 2:
+ TT_MAP_SIZE_OK(ieee8021q_2queue_tt_tc_map);
+ return ieee8021q_2queue_tt_tc_map[tt];
+ case 1:
+ TT_MAP_SIZE_OK(ieee8021q_1queue_tt_tc_map);
+ return ieee8021q_1queue_tt_tc_map[tt];
+ }
+
+ pr_err("Invalid number of queues %d\n", num_queues);
+
+ return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(ieee8021q_tt_to_tc);
+
+/**
+ * ietf_dscp_to_ieee8021q_tt - Map IETF DSCP to IEEE 802.1Q Traffic Type
+ * @dscp: IETF DSCP value
+ *
+ * This function maps an IETF DSCP value to an IEEE 802.1Q Traffic Type (TT).
+ * Since there is no corresponding mapping between DSCP and IEEE 802.1Q Traffic
+ * Type, this function is inspired by the RFC8325 documentation which describe
+ * the mapping between DSCP and 802.11 User Priority (UP) values.
+ *
+ * Return: IEEE 802.1Q Traffic Type corresponding to the given DSCP value
+ */
+int ietf_dscp_to_ieee8021q_tt(u8 dscp)
+{
+ switch (dscp) {
+ case DSCP_CS0:
+ /* Comment from RFC8325:
+ * [RFC4594], Section 4.8, recommends High-Throughput Data be marked
+ * AF1x (that is, AF11, AF12, and AF13, according to the rules defined
+ * in [RFC2475]).
+ *
+ * By default (as described in Section 2.3), High-Throughput Data will
+ * map to UP 1 and, thus, to the Background Access Category (AC_BK),
+ * which is contrary to the intent expressed in [RFC4594].
+
+ * Unfortunately, there really is no corresponding fit for the High-
+ * Throughput Data service class within the constrained 4 Access
+ * Category [IEEE.802.11-2016] model. If the High-Throughput Data
+ * service class is assigned to the Best Effort Access Category (AC_BE),
+ * then it would contend with Low-Latency Data (while [RFC4594]
+ * recommends a distinction in servicing between these service classes)
+ * as well as with the default service class; alternatively, if it is
+ * assigned to the Background Access Category (AC_BK), then it would
+ * receive a less-then-best-effort service and contend with Low-Priority
+ * Data (as discussed in Section 4.2.10).
+ *
+ * As such, since there is no directly corresponding fit for the High-
+ * Throughout Data service class within the [IEEE.802.11-2016] model, it
+ * is generally RECOMMENDED to map High-Throughput Data to UP 0, thereby
+ * admitting it to the Best Effort Access Category (AC_BE).
+ *
+ * Note: The above text is from RFC8325 which is describing the mapping
+ * between DSCP and 802.11 User Priority (UP) values. The mapping
+ * between UP and IEEE 802.1Q Traffic Type is not defined in the RFC but
+ * the 802.11 AC_BK and AC_BE are closely related to the IEEE 802.1Q
+ * Traffic Types BE and BK.
+ */
+ case DSCP_AF11:
+ case DSCP_AF12:
+ case DSCP_AF13:
+ return IEEE8021Q_TT_BE;
+ /* Comment from RFC8325:
+ * RFC3662 and RFC4594 both recommend Low-Priority Data be marked
+ * with DSCP CS1. The Low-Priority Data service class loosely
+ * corresponds to the [IEEE.802.11-2016] Background Access Category
+ */
+ case DSCP_CS1:
+ return IEEE8021Q_TT_BK;
+ case DSCP_CS2:
+ case DSCP_AF21:
+ case DSCP_AF22:
+ case DSCP_AF23:
+ return IEEE8021Q_TT_EE;
+ case DSCP_CS3:
+ case DSCP_AF31:
+ case DSCP_AF32:
+ case DSCP_AF33:
+ return IEEE8021Q_TT_CA;
+ case DSCP_CS4:
+ case DSCP_AF41:
+ case DSCP_AF42:
+ case DSCP_AF43:
+ return IEEE8021Q_TT_VI;
+ case DSCP_CS5:
+ case DSCP_EF:
+ case DSCP_VOICE_ADMIT:
+ return IEEE8021Q_TT_VO;
+ case DSCP_CS6:
+ return IEEE8021Q_TT_IC;
+ case DSCP_CS7:
+ return IEEE8021Q_TT_NC;
+ }
+
+ return SIMPLE_IETF_DSCP_TO_IEEE8021Q_TT(dscp);
+}
+EXPORT_SYMBOL_GPL(ietf_dscp_to_ieee8021q_tt);
diff --git a/net/core/iovec.c b/net/core/iovec.c
deleted file mode 100644
index 04b249c40b5b..000000000000
--- a/net/core/iovec.c
+++ /dev/null
@@ -1,239 +0,0 @@
-/*
- * iovec manipulation routines.
- *
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * Fixes:
- * Andrew Lunn : Errors in iovec copying.
- * Pedro Roque : Added memcpy_fromiovecend and
- * csum_..._fromiovecend.
- * Andi Kleen : fixed error handling for 2.1
- * Alexey Kuznetsov: 2.1 optimisations
- * Andi Kleen : Fix csum*fromiovecend for IPv6.
- */
-
-#include <linux/errno.h>
-#include <linux/module.h>
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/slab.h>
-#include <linux/net.h>
-#include <linux/in6.h>
-#include <asm/uaccess.h>
-#include <asm/byteorder.h>
-#include <net/checksum.h>
-#include <net/sock.h>
-
-/*
- * Verify iovec. The caller must ensure that the iovec is big enough
- * to hold the message iovec.
- *
- * Save time not doing access_ok. copy_*_user will make this work
- * in any case.
- */
-
-int verify_iovec(struct msghdr *m, struct iovec *iov, char *address, int mode)
-{
- int size, err, ct;
-
- if (m->msg_namelen) {
- if (mode == VERIFY_READ) {
- err = move_addr_to_kernel(m->msg_name, m->msg_namelen,
- address);
- if (err < 0)
- return err;
- }
- m->msg_name = address;
- } else {
- m->msg_name = NULL;
- }
-
- size = m->msg_iovlen * sizeof(struct iovec);
- if (copy_from_user(iov, m->msg_iov, size))
- return -EFAULT;
-
- m->msg_iov = iov;
- err = 0;
-
- for (ct = 0; ct < m->msg_iovlen; ct++) {
- err += iov[ct].iov_len;
- /*
- * Goal is not to verify user data, but to prevent returning
- * negative value, which is interpreted as errno.
- * Overflow is still possible, but it is harmless.
- */
- if (err < 0)
- return -EMSGSIZE;
- }
-
- return err;
-}
-
-/*
- * Copy kernel to iovec. Returns -EFAULT on error.
- *
- * Note: this modifies the original iovec.
- */
-
-int memcpy_toiovec(struct iovec *iov, unsigned char *kdata, int len)
-{
- while (len > 0) {
- if (iov->iov_len) {
- int copy = min_t(unsigned int, iov->iov_len, len);
- if (copy_to_user(iov->iov_base, kdata, copy))
- return -EFAULT;
- kdata += copy;
- len -= copy;
- iov->iov_len -= copy;
- iov->iov_base += copy;
- }
- iov++;
- }
-
- return 0;
-}
-
-/*
- * Copy iovec to kernel. Returns -EFAULT on error.
- *
- * Note: this modifies the original iovec.
- */
-
-int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len)
-{
- while (len > 0) {
- if (iov->iov_len) {
- int copy = min_t(unsigned int, len, iov->iov_len);
- if (copy_from_user(kdata, iov->iov_base, copy))
- return -EFAULT;
- len -= copy;
- kdata += copy;
- iov->iov_base += copy;
- iov->iov_len -= copy;
- }
- iov++;
- }
-
- return 0;
-}
-
-/*
- * For use with ip_build_xmit
- */
-int memcpy_fromiovecend(unsigned char *kdata, struct iovec *iov, int offset,
- int len)
-{
- /* Skip over the finished iovecs */
- while (offset >= iov->iov_len) {
- offset -= iov->iov_len;
- iov++;
- }
-
- while (len > 0) {
- u8 __user *base = iov->iov_base + offset;
- int copy = min_t(unsigned int, len, iov->iov_len - offset);
-
- offset = 0;
- if (copy_from_user(kdata, base, copy))
- return -EFAULT;
- len -= copy;
- kdata += copy;
- iov++;
- }
-
- return 0;
-}
-
-/*
- * And now for the all-in-one: copy and checksum from a user iovec
- * directly to a datagram
- * Calls to csum_partial but the last must be in 32 bit chunks
- *
- * ip_build_xmit must ensure that when fragmenting only the last
- * call to this function will be unaligned also.
- */
-int csum_partial_copy_fromiovecend(unsigned char *kdata, struct iovec *iov,
- int offset, unsigned int len, __wsum *csump)
-{
- __wsum csum = *csump;
- int partial_cnt = 0, err = 0;
-
- /* Skip over the finished iovecs */
- while (offset >= iov->iov_len) {
- offset -= iov->iov_len;
- iov++;
- }
-
- while (len > 0) {
- u8 __user *base = iov->iov_base + offset;
- int copy = min_t(unsigned int, len, iov->iov_len - offset);
-
- offset = 0;
-
- /* There is a remnant from previous iov. */
- if (partial_cnt) {
- int par_len = 4 - partial_cnt;
-
- /* iov component is too short ... */
- if (par_len > copy) {
- if (copy_from_user(kdata, base, copy))
- goto out_fault;
- kdata += copy;
- base += copy;
- partial_cnt += copy;
- len -= copy;
- iov++;
- if (len)
- continue;
- *csump = csum_partial(kdata - partial_cnt,
- partial_cnt, csum);
- goto out;
- }
- if (copy_from_user(kdata, base, par_len))
- goto out_fault;
- csum = csum_partial(kdata - partial_cnt, 4, csum);
- kdata += par_len;
- base += par_len;
- copy -= par_len;
- len -= par_len;
- partial_cnt = 0;
- }
-
- if (len > copy) {
- partial_cnt = copy % 4;
- if (partial_cnt) {
- copy -= partial_cnt;
- if (copy_from_user(kdata + copy, base + copy,
- partial_cnt))
- goto out_fault;
- }
- }
-
- if (copy) {
- csum = csum_and_copy_from_user(base, kdata, copy,
- csum, &err);
- if (err)
- goto out;
- }
- len -= copy + partial_cnt;
- kdata += copy + partial_cnt;
- iov++;
- }
- *csump = csum;
-out:
- return err;
-
-out_fault:
- err = -EFAULT;
- goto out;
-}
-
-EXPORT_SYMBOL(csum_partial_copy_fromiovecend);
-EXPORT_SYMBOL(memcpy_fromiovec);
-EXPORT_SYMBOL(memcpy_fromiovecend);
-EXPORT_SYMBOL(memcpy_toiovec);
diff --git a/net/core/link_watch.c b/net/core/link_watch.c
index 4b36114744c5..212cde35affa 100644
--- a/net/core/link_watch.c
+++ b/net/core/link_watch.c
@@ -1,14 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Linux network device link state notification
*
* Author:
* Stefan Rompf <sux@loplof.de>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
*/
#include <linux/module.h>
@@ -19,40 +14,56 @@
#include <linux/rtnetlink.h>
#include <linux/jiffies.h>
#include <linux/spinlock.h>
-#include <linux/list.h>
-#include <linux/slab.h>
#include <linux/workqueue.h>
#include <linux/bitops.h>
-#include <asm/types.h>
+#include <linux/types.h>
+#include "dev.h"
enum lw_bits {
- LW_RUNNING = 0,
- LW_SE_USED
+ LW_URGENT = 0,
};
static unsigned long linkwatch_flags;
static unsigned long linkwatch_nextevent;
-static void linkwatch_event(void *dummy);
-static DECLARE_WORK(linkwatch_work, linkwatch_event, NULL);
+static void linkwatch_event(struct work_struct *dummy);
+static DECLARE_DELAYED_WORK(linkwatch_work, linkwatch_event);
static LIST_HEAD(lweventlist);
static DEFINE_SPINLOCK(lweventlist_lock);
-struct lw_event {
- struct list_head list;
- struct net_device *dev;
-};
+static unsigned int default_operstate(const struct net_device *dev)
+{
+ if (netif_testing(dev))
+ return IF_OPER_TESTING;
+
+ /* Some uppers (DSA) have additional sources for being down, so
+ * first check whether lower is indeed the source of its down state.
+ */
+ if (!netif_carrier_ok(dev)) {
+ struct net_device *peer;
+ int iflink;
+
+ /* If called from netdev_run_todo()/linkwatch_sync_dev(),
+ * dev_net(dev) can be already freed, and RTNL is not held.
+ */
+ if (dev->reg_state <= NETREG_REGISTERED)
+ iflink = dev_get_iflink(dev);
+ else
+ iflink = dev->ifindex;
-/* Avoid kmalloc() for most systems */
-static struct lw_event singleevent;
+ if (iflink == dev->ifindex)
+ return IF_OPER_DOWN;
-static unsigned char default_operstate(const struct net_device *dev)
-{
- if (!netif_carrier_ok(dev))
- return (dev->ifindex != dev->iflink ?
- IF_OPER_LOWERLAYERDOWN : IF_OPER_DOWN);
+ ASSERT_RTNL();
+ peer = __dev_get_by_index(dev_net(dev), iflink);
+ if (!peer)
+ return IF_OPER_DOWN;
+
+ return netif_carrier_ok(peer) ? IF_OPER_DOWN :
+ IF_OPER_LOWERLAYERDOWN;
+ }
if (netif_dormant(dev))
return IF_OPER_DORMANT;
@@ -60,123 +71,250 @@ static unsigned char default_operstate(const struct net_device *dev)
return IF_OPER_UP;
}
-
static void rfc2863_policy(struct net_device *dev)
{
- unsigned char operstate = default_operstate(dev);
+ unsigned int operstate = default_operstate(dev);
- if (operstate == dev->operstate)
+ if (operstate == READ_ONCE(dev->operstate))
return;
- write_lock_bh(&dev_base_lock);
-
switch(dev->link_mode) {
+ case IF_LINK_MODE_TESTING:
+ if (operstate == IF_OPER_UP)
+ operstate = IF_OPER_TESTING;
+ break;
+
case IF_LINK_MODE_DORMANT:
if (operstate == IF_OPER_UP)
operstate = IF_OPER_DORMANT;
break;
-
case IF_LINK_MODE_DEFAULT:
default:
break;
- };
+ }
+
+ WRITE_ONCE(dev->operstate, operstate);
+}
- dev->operstate = operstate;
- write_unlock_bh(&dev_base_lock);
+void linkwatch_init_dev(struct net_device *dev)
+{
+ /* Handle pre-registration link state changes */
+ if (!netif_carrier_ok(dev) || netif_dormant(dev) ||
+ netif_testing(dev))
+ rfc2863_policy(dev);
}
-/* Must be called with the rtnl semaphore held */
-void linkwatch_run_queue(void)
+static bool linkwatch_urgent_event(struct net_device *dev)
{
- struct list_head head, *n, *next;
+ if (!netif_running(dev))
+ return false;
- spin_lock_irq(&lweventlist_lock);
- list_replace_init(&lweventlist, &head);
- spin_unlock_irq(&lweventlist_lock);
+ if (dev->ifindex != dev_get_iflink(dev))
+ return true;
- list_for_each_safe(n, next, &head) {
- struct lw_event *event = list_entry(n, struct lw_event, list);
- struct net_device *dev = event->dev;
+ if (netif_is_lag_port(dev) || netif_is_lag_master(dev))
+ return true;
- if (event == &singleevent) {
- clear_bit(LW_SE_USED, &linkwatch_flags);
- } else {
- kfree(event);
- }
+ return netif_carrier_ok(dev) && qdisc_tx_changing(dev);
+}
- /* We are about to handle this device,
- * so new events can be accepted
- */
- clear_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state);
- rfc2863_policy(dev);
- if (dev->flags & IFF_UP) {
- if (netif_carrier_ok(dev)) {
- WARN_ON(dev->qdisc_sleeping == &noop_qdisc);
- dev_activate(dev);
- } else
- dev_deactivate(dev);
-
- netdev_state_change(dev);
- }
+static void linkwatch_add_event(struct net_device *dev)
+{
+ unsigned long flags;
- dev_put(dev);
+ spin_lock_irqsave(&lweventlist_lock, flags);
+ if (list_empty(&dev->link_watch_list)) {
+ list_add_tail(&dev->link_watch_list, &lweventlist);
+ netdev_hold(dev, &dev->linkwatch_dev_tracker, GFP_ATOMIC);
}
-}
+ spin_unlock_irqrestore(&lweventlist_lock, flags);
+}
-static void linkwatch_event(void *dummy)
+static void linkwatch_schedule_work(int urgent)
{
- /* Limit the number of linkwatch events to one
- * per second so that a runaway driver does not
- * cause a storm of messages on the netlink
- * socket
- */
- linkwatch_nextevent = jiffies + HZ;
- clear_bit(LW_RUNNING, &linkwatch_flags);
+ unsigned long delay = linkwatch_nextevent - jiffies;
- rtnl_lock();
- linkwatch_run_queue();
- rtnl_unlock();
+ if (test_bit(LW_URGENT, &linkwatch_flags))
+ return;
+
+ /* Minimise down-time: drop delay for up event. */
+ if (urgent) {
+ if (test_and_set_bit(LW_URGENT, &linkwatch_flags))
+ return;
+ delay = 0;
+ }
+
+ /* If we wrap around we'll delay it by at most HZ. */
+ if (delay > HZ)
+ delay = 0;
+
+ /*
+ * If urgent, schedule immediate execution; otherwise, don't
+ * override the existing timer.
+ */
+ if (test_bit(LW_URGENT, &linkwatch_flags))
+ mod_delayed_work(system_dfl_wq, &linkwatch_work, 0);
+ else
+ queue_delayed_work(system_dfl_wq, &linkwatch_work, delay);
}
-void linkwatch_fire_event(struct net_device *dev)
+static void linkwatch_do_dev(struct net_device *dev)
{
- if (!test_and_set_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state)) {
- unsigned long flags;
- struct lw_event *event;
-
- if (test_and_set_bit(LW_SE_USED, &linkwatch_flags)) {
- event = kmalloc(sizeof(struct lw_event), GFP_ATOMIC);
-
- if (unlikely(event == NULL)) {
- clear_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state);
- return;
- }
- } else {
- event = &singleevent;
- }
+ /*
+ * Make sure the above read is complete since it can be
+ * rewritten as soon as we clear the bit below.
+ */
+ smp_mb__before_atomic();
+
+ /* We are about to handle this device,
+ * so new events can be accepted
+ */
+ clear_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state);
+
+ rfc2863_policy(dev);
+ if (dev->flags & IFF_UP) {
+ if (netif_carrier_ok(dev))
+ dev_activate(dev);
+ else
+ dev_deactivate(dev);
+
+ netif_state_change(dev);
+ }
+ /* Note: our callers are responsible for calling netdev_tracker_free().
+ * This is the reason we use __dev_put() instead of dev_put().
+ */
+ __dev_put(dev);
+}
- dev_hold(dev);
- event->dev = dev;
+static void __linkwatch_run_queue(int urgent_only)
+{
+#define MAX_DO_DEV_PER_LOOP 100
- spin_lock_irqsave(&lweventlist_lock, flags);
- list_add_tail(&event->list, &lweventlist);
- spin_unlock_irqrestore(&lweventlist_lock, flags);
+ int do_dev = MAX_DO_DEV_PER_LOOP;
+ /* Use a local list here since we add non-urgent
+ * events back to the global one when called with
+ * urgent_only=1.
+ */
+ LIST_HEAD(wrk);
- if (!test_and_set_bit(LW_RUNNING, &linkwatch_flags)) {
- unsigned long delay = linkwatch_nextevent - jiffies;
+ /* Give urgent case more budget */
+ if (urgent_only)
+ do_dev += MAX_DO_DEV_PER_LOOP;
- /* If we wrap around we'll delay it by at most HZ. */
- if (!delay || delay > HZ)
- schedule_work(&linkwatch_work);
- else
- schedule_delayed_work(&linkwatch_work, delay);
+ /*
+ * Limit the number of linkwatch events to one
+ * per second so that a runaway driver does not
+ * cause a storm of messages on the netlink
+ * socket. This limit does not apply to up events
+ * while the device qdisc is down.
+ */
+ if (!urgent_only)
+ linkwatch_nextevent = jiffies + HZ;
+ /* Limit wrap-around effect on delay. */
+ else if (time_after(linkwatch_nextevent, jiffies + HZ))
+ linkwatch_nextevent = jiffies;
+
+ clear_bit(LW_URGENT, &linkwatch_flags);
+
+ spin_lock_irq(&lweventlist_lock);
+ list_splice_init(&lweventlist, &wrk);
+
+ while (!list_empty(&wrk) && do_dev > 0) {
+ struct net_device *dev;
+
+ dev = list_first_entry(&wrk, struct net_device, link_watch_list);
+ list_del_init(&dev->link_watch_list);
+
+ if (!netif_device_present(dev) ||
+ (urgent_only && !linkwatch_urgent_event(dev))) {
+ list_add_tail(&dev->link_watch_list, &lweventlist);
+ continue;
}
+ /* We must free netdev tracker under
+ * the spinlock protection.
+ */
+ netdev_tracker_free(dev, &dev->linkwatch_dev_tracker);
+ spin_unlock_irq(&lweventlist_lock);
+ netdev_lock_ops(dev);
+ linkwatch_do_dev(dev);
+ netdev_unlock_ops(dev);
+ do_dev--;
+ spin_lock_irq(&lweventlist_lock);
+ }
+
+ /* Add the remaining work back to lweventlist */
+ list_splice_init(&wrk, &lweventlist);
+
+ if (!list_empty(&lweventlist))
+ linkwatch_schedule_work(0);
+ spin_unlock_irq(&lweventlist_lock);
+}
+
+static bool linkwatch_clean_dev(struct net_device *dev)
+{
+ unsigned long flags;
+ bool clean = false;
+
+ spin_lock_irqsave(&lweventlist_lock, flags);
+ if (!list_empty(&dev->link_watch_list)) {
+ list_del_init(&dev->link_watch_list);
+ clean = true;
+ /* We must release netdev tracker under
+ * the spinlock protection.
+ */
+ netdev_tracker_free(dev, &dev->linkwatch_dev_tracker);
+ }
+ spin_unlock_irqrestore(&lweventlist_lock, flags);
+
+ return clean;
+}
+
+void __linkwatch_sync_dev(struct net_device *dev)
+{
+ netdev_ops_assert_locked(dev);
+
+ if (linkwatch_clean_dev(dev))
+ linkwatch_do_dev(dev);
+}
+
+void linkwatch_sync_dev(struct net_device *dev)
+{
+ if (linkwatch_clean_dev(dev)) {
+ netdev_lock_ops(dev);
+ linkwatch_do_dev(dev);
+ netdev_unlock_ops(dev);
}
}
+/* Must be called with the rtnl semaphore held */
+void linkwatch_run_queue(void)
+{
+ __linkwatch_run_queue(0);
+}
+
+
+static void linkwatch_event(struct work_struct *dummy)
+{
+ rtnl_lock();
+ __linkwatch_run_queue(time_after(linkwatch_nextevent, jiffies));
+ rtnl_unlock();
+}
+
+
+void linkwatch_fire_event(struct net_device *dev)
+{
+ bool urgent = linkwatch_urgent_event(dev);
+
+ if (!test_and_set_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state)) {
+ linkwatch_add_event(dev);
+ } else if (!urgent)
+ return;
+
+ linkwatch_schedule_work(urgent);
+}
EXPORT_SYMBOL(linkwatch_fire_event);
diff --git a/net/core/lock_debug.c b/net/core/lock_debug.c
new file mode 100644
index 000000000000..9e9fb25314b9
--- /dev/null
+++ b/net/core/lock_debug.c
@@ -0,0 +1,122 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Copyright Amazon.com Inc. or its affiliates. */
+
+#include <linux/init.h>
+#include <linux/netdevice.h>
+#include <linux/notifier.h>
+#include <linux/rtnetlink.h>
+#include <net/net_namespace.h>
+#include <net/netdev_lock.h>
+#include <net/netns/generic.h>
+
+int netdev_debug_event(struct notifier_block *nb, unsigned long event,
+ void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct net *net = dev_net(dev);
+ enum netdev_cmd cmd = event;
+
+ /* Keep enum and don't add default to trigger -Werror=switch */
+ switch (cmd) {
+ case NETDEV_XDP_FEAT_CHANGE:
+ netdev_assert_locked(dev);
+ fallthrough;
+ case NETDEV_CHANGE:
+ case NETDEV_REGISTER:
+ case NETDEV_UP:
+ netdev_ops_assert_locked(dev);
+ fallthrough;
+ case NETDEV_DOWN:
+ case NETDEV_REBOOT:
+ case NETDEV_UNREGISTER:
+ case NETDEV_CHANGEMTU:
+ case NETDEV_CHANGEADDR:
+ case NETDEV_PRE_CHANGEADDR:
+ case NETDEV_GOING_DOWN:
+ case NETDEV_FEAT_CHANGE:
+ case NETDEV_BONDING_FAILOVER:
+ case NETDEV_PRE_UP:
+ case NETDEV_PRE_TYPE_CHANGE:
+ case NETDEV_POST_TYPE_CHANGE:
+ case NETDEV_POST_INIT:
+ case NETDEV_PRE_UNINIT:
+ case NETDEV_RELEASE:
+ case NETDEV_NOTIFY_PEERS:
+ case NETDEV_JOIN:
+ case NETDEV_CHANGEUPPER:
+ case NETDEV_RESEND_IGMP:
+ case NETDEV_PRECHANGEMTU:
+ case NETDEV_CHANGEINFODATA:
+ case NETDEV_BONDING_INFO:
+ case NETDEV_PRECHANGEUPPER:
+ case NETDEV_CHANGELOWERSTATE:
+ case NETDEV_UDP_TUNNEL_PUSH_INFO:
+ case NETDEV_UDP_TUNNEL_DROP_INFO:
+ case NETDEV_CHANGE_TX_QUEUE_LEN:
+ case NETDEV_CVLAN_FILTER_PUSH_INFO:
+ case NETDEV_CVLAN_FILTER_DROP_INFO:
+ case NETDEV_SVLAN_FILTER_PUSH_INFO:
+ case NETDEV_SVLAN_FILTER_DROP_INFO:
+ case NETDEV_OFFLOAD_XSTATS_ENABLE:
+ case NETDEV_OFFLOAD_XSTATS_DISABLE:
+ case NETDEV_OFFLOAD_XSTATS_REPORT_USED:
+ case NETDEV_OFFLOAD_XSTATS_REPORT_DELTA:
+ ASSERT_RTNL();
+ break;
+
+ case NETDEV_CHANGENAME:
+ ASSERT_RTNL_NET(net);
+ break;
+ }
+
+ return NOTIFY_DONE;
+}
+EXPORT_SYMBOL_NS_GPL(netdev_debug_event, "NETDEV_INTERNAL");
+
+static int rtnl_net_debug_net_id;
+
+static int __net_init rtnl_net_debug_net_init(struct net *net)
+{
+ struct notifier_block *nb;
+
+ nb = net_generic(net, rtnl_net_debug_net_id);
+ nb->notifier_call = netdev_debug_event;
+
+ return register_netdevice_notifier_net(net, nb);
+}
+
+static void __net_exit rtnl_net_debug_net_exit(struct net *net)
+{
+ struct notifier_block *nb;
+
+ nb = net_generic(net, rtnl_net_debug_net_id);
+ unregister_netdevice_notifier_net(net, nb);
+}
+
+static struct pernet_operations rtnl_net_debug_net_ops __net_initdata = {
+ .init = rtnl_net_debug_net_init,
+ .exit = rtnl_net_debug_net_exit,
+ .id = &rtnl_net_debug_net_id,
+ .size = sizeof(struct notifier_block),
+};
+
+static struct notifier_block rtnl_net_debug_block = {
+ .notifier_call = netdev_debug_event,
+};
+
+static int __init rtnl_net_debug_init(void)
+{
+ int ret;
+
+ ret = register_pernet_subsys(&rtnl_net_debug_net_ops);
+ if (ret)
+ return ret;
+
+ ret = register_netdevice_notifier(&rtnl_net_debug_block);
+ if (ret)
+ unregister_pernet_subsys(&rtnl_net_debug_net_ops);
+
+ return ret;
+}
+
+subsys_initcall(rtnl_net_debug_init);
diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
new file mode 100644
index 000000000000..9f40be0c3e71
--- /dev/null
+++ b/net/core/lwt_bpf.c
@@ -0,0 +1,662 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2016 Thomas Graf <tgraf@tgraf.ch>
+ */
+
+#include <linux/filter.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/types.h>
+#include <linux/bpf.h>
+#include <net/flow.h>
+#include <net/lwtunnel.h>
+#include <net/gre.h>
+#include <net/ip.h>
+#include <net/ip6_route.h>
+#include <net/ipv6_stubs.h>
+
+struct bpf_lwt_prog {
+ struct bpf_prog *prog;
+ char *name;
+};
+
+struct bpf_lwt {
+ struct bpf_lwt_prog in;
+ struct bpf_lwt_prog out;
+ struct bpf_lwt_prog xmit;
+ int family;
+};
+
+#define MAX_PROG_NAME 256
+
+static inline struct bpf_lwt *bpf_lwt_lwtunnel(struct lwtunnel_state *lwt)
+{
+ return (struct bpf_lwt *)lwt->data;
+}
+
+#define NO_REDIRECT false
+#define CAN_REDIRECT true
+
+static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
+ struct dst_entry *dst, bool can_redirect)
+{
+ struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
+ int ret;
+
+ /* Disabling BH is needed to protect per-CPU bpf_redirect_info between
+ * BPF prog and skb_do_redirect().
+ */
+ local_bh_disable();
+ bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
+ bpf_compute_data_pointers(skb);
+ ret = bpf_prog_run_save_cb(lwt->prog, skb);
+
+ switch (ret) {
+ case BPF_OK:
+ case BPF_LWT_REROUTE:
+ break;
+
+ case BPF_REDIRECT:
+ if (unlikely(!can_redirect)) {
+ pr_warn_once("Illegal redirect return code in prog %s\n",
+ lwt->name ? : "<unknown>");
+ ret = BPF_OK;
+ } else {
+ skb_reset_mac_header(skb);
+ skb_do_redirect(skb);
+ ret = BPF_REDIRECT;
+ }
+ break;
+
+ case BPF_DROP:
+ kfree_skb(skb);
+ ret = -EPERM;
+ break;
+
+ default:
+ pr_warn_once("bpf-lwt: Illegal return value %u, expect packet loss\n", ret);
+ kfree_skb(skb);
+ ret = -EINVAL;
+ break;
+ }
+
+ bpf_net_ctx_clear(bpf_net_ctx);
+ local_bh_enable();
+
+ return ret;
+}
+
+static int bpf_lwt_input_reroute(struct sk_buff *skb)
+{
+ enum skb_drop_reason reason;
+ int err = -EINVAL;
+
+ if (skb->protocol == htons(ETH_P_IP)) {
+ struct net_device *dev = skb_dst(skb)->dev;
+ const struct iphdr *iph = ip_hdr(skb);
+
+ dev_hold(dev);
+ skb_dst_drop(skb);
+ reason = ip_route_input_noref(skb, iph->daddr, iph->saddr,
+ ip4h_dscp(iph), dev);
+ err = reason ? -EINVAL : 0;
+ dev_put(dev);
+ } else if (skb->protocol == htons(ETH_P_IPV6)) {
+ skb_dst_drop(skb);
+ err = ipv6_stub->ipv6_route_input(skb);
+ } else {
+ err = -EAFNOSUPPORT;
+ }
+
+ if (err)
+ goto err;
+ return dst_input(skb);
+
+err:
+ kfree_skb(skb);
+ return err;
+}
+
+static int bpf_input(struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ struct bpf_lwt *bpf;
+ int ret;
+
+ bpf = bpf_lwt_lwtunnel(dst->lwtstate);
+ if (bpf->in.prog) {
+ ret = run_lwt_bpf(skb, &bpf->in, dst, NO_REDIRECT);
+ if (ret < 0)
+ return ret;
+ if (ret == BPF_LWT_REROUTE)
+ return bpf_lwt_input_reroute(skb);
+ }
+
+ if (unlikely(!dst->lwtstate->orig_input)) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
+ return dst->lwtstate->orig_input(skb);
+}
+
+static int bpf_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ struct bpf_lwt *bpf;
+ int ret;
+
+ bpf = bpf_lwt_lwtunnel(dst->lwtstate);
+ if (bpf->out.prog) {
+ ret = run_lwt_bpf(skb, &bpf->out, dst, NO_REDIRECT);
+ if (ret < 0)
+ return ret;
+ }
+
+ if (unlikely(!dst->lwtstate->orig_output)) {
+ pr_warn_once("orig_output not set on dst for prog %s\n",
+ bpf->out.name);
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
+ return dst->lwtstate->orig_output(net, sk, skb);
+}
+
+static int xmit_check_hhlen(struct sk_buff *skb, int hh_len)
+{
+ if (skb_headroom(skb) < hh_len) {
+ int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb));
+
+ if (pskb_expand_head(skb, nhead, 0, GFP_ATOMIC))
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static int bpf_lwt_xmit_reroute(struct sk_buff *skb)
+{
+ struct net_device *l3mdev = l3mdev_master_dev_rcu(skb_dst(skb)->dev);
+ int oif = l3mdev ? l3mdev->ifindex : 0;
+ struct dst_entry *dst = NULL;
+ int err = -EAFNOSUPPORT;
+ struct sock *sk;
+ struct net *net;
+ bool ipv4;
+
+ if (skb->protocol == htons(ETH_P_IP))
+ ipv4 = true;
+ else if (skb->protocol == htons(ETH_P_IPV6))
+ ipv4 = false;
+ else
+ goto err;
+
+ sk = sk_to_full_sk(skb->sk);
+ if (sk) {
+ if (sk->sk_bound_dev_if)
+ oif = sk->sk_bound_dev_if;
+ net = sock_net(sk);
+ } else {
+ net = dev_net(skb_dst(skb)->dev);
+ }
+
+ if (ipv4) {
+ struct iphdr *iph = ip_hdr(skb);
+ struct flowi4 fl4 = {};
+ struct rtable *rt;
+
+ fl4.flowi4_oif = oif;
+ fl4.flowi4_mark = skb->mark;
+ fl4.flowi4_uid = sock_net_uid(net, sk);
+ fl4.flowi4_dscp = ip4h_dscp(iph);
+ fl4.flowi4_flags = FLOWI_FLAG_ANYSRC;
+ fl4.flowi4_proto = iph->protocol;
+ fl4.daddr = iph->daddr;
+ fl4.saddr = iph->saddr;
+
+ rt = ip_route_output_key(net, &fl4);
+ if (IS_ERR(rt)) {
+ err = PTR_ERR(rt);
+ goto err;
+ }
+ dst = &rt->dst;
+ } else {
+ struct ipv6hdr *iph6 = ipv6_hdr(skb);
+ struct flowi6 fl6 = {};
+
+ fl6.flowi6_oif = oif;
+ fl6.flowi6_mark = skb->mark;
+ fl6.flowi6_uid = sock_net_uid(net, sk);
+ fl6.flowlabel = ip6_flowinfo(iph6);
+ fl6.flowi6_proto = iph6->nexthdr;
+ fl6.daddr = iph6->daddr;
+ fl6.saddr = iph6->saddr;
+
+ dst = ipv6_stub->ipv6_dst_lookup_flow(net, skb->sk, &fl6, NULL);
+ if (IS_ERR(dst)) {
+ err = PTR_ERR(dst);
+ goto err;
+ }
+ }
+ if (unlikely(dst->error)) {
+ err = dst->error;
+ dst_release(dst);
+ goto err;
+ }
+
+ /* Although skb header was reserved in bpf_lwt_push_ip_encap(), it
+ * was done for the previous dst, so we are doing it here again, in
+ * case the new dst needs much more space. The call below is a noop
+ * if there is enough header space in skb.
+ */
+ err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev));
+ if (unlikely(err))
+ goto err;
+
+ skb_dst_drop(skb);
+ skb_dst_set(skb, dst);
+
+ err = dst_output(dev_net(skb_dst(skb)->dev), skb->sk, skb);
+ if (unlikely(err))
+ return net_xmit_errno(err);
+
+ /* ip[6]_finish_output2 understand LWTUNNEL_XMIT_DONE */
+ return LWTUNNEL_XMIT_DONE;
+
+err:
+ kfree_skb(skb);
+ return err;
+}
+
+static int bpf_xmit(struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ struct bpf_lwt *bpf;
+
+ bpf = bpf_lwt_lwtunnel(dst->lwtstate);
+ if (bpf->xmit.prog) {
+ int hh_len = dst->dev->hard_header_len;
+ __be16 proto = skb->protocol;
+ int ret;
+
+ ret = run_lwt_bpf(skb, &bpf->xmit, dst, CAN_REDIRECT);
+ switch (ret) {
+ case BPF_OK:
+ /* If the header changed, e.g. via bpf_lwt_push_encap,
+ * BPF_LWT_REROUTE below should have been used if the
+ * protocol was also changed.
+ */
+ if (skb->protocol != proto) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+ /* If the header was expanded, headroom might be too
+ * small for L2 header to come, expand as needed.
+ */
+ ret = xmit_check_hhlen(skb, hh_len);
+ if (unlikely(ret))
+ return ret;
+
+ return LWTUNNEL_XMIT_CONTINUE;
+ case BPF_REDIRECT:
+ return LWTUNNEL_XMIT_DONE;
+ case BPF_LWT_REROUTE:
+ return bpf_lwt_xmit_reroute(skb);
+ default:
+ return ret;
+ }
+ }
+
+ return LWTUNNEL_XMIT_CONTINUE;
+}
+
+static void bpf_lwt_prog_destroy(struct bpf_lwt_prog *prog)
+{
+ if (prog->prog)
+ bpf_prog_put(prog->prog);
+
+ kfree(prog->name);
+}
+
+static void bpf_destroy_state(struct lwtunnel_state *lwt)
+{
+ struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt);
+
+ bpf_lwt_prog_destroy(&bpf->in);
+ bpf_lwt_prog_destroy(&bpf->out);
+ bpf_lwt_prog_destroy(&bpf->xmit);
+}
+
+static const struct nla_policy bpf_prog_policy[LWT_BPF_PROG_MAX + 1] = {
+ [LWT_BPF_PROG_FD] = { .type = NLA_U32, },
+ [LWT_BPF_PROG_NAME] = { .type = NLA_NUL_STRING,
+ .len = MAX_PROG_NAME },
+};
+
+static int bpf_parse_prog(struct nlattr *attr, struct bpf_lwt_prog *prog,
+ enum bpf_prog_type type)
+{
+ struct nlattr *tb[LWT_BPF_PROG_MAX + 1];
+ struct bpf_prog *p;
+ int ret;
+ u32 fd;
+
+ ret = nla_parse_nested_deprecated(tb, LWT_BPF_PROG_MAX, attr,
+ bpf_prog_policy, NULL);
+ if (ret < 0)
+ return ret;
+
+ if (!tb[LWT_BPF_PROG_FD] || !tb[LWT_BPF_PROG_NAME])
+ return -EINVAL;
+
+ prog->name = nla_memdup(tb[LWT_BPF_PROG_NAME], GFP_ATOMIC);
+ if (!prog->name)
+ return -ENOMEM;
+
+ fd = nla_get_u32(tb[LWT_BPF_PROG_FD]);
+ p = bpf_prog_get_type(fd, type);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
+
+ prog->prog = p;
+
+ return 0;
+}
+
+static const struct nla_policy bpf_nl_policy[LWT_BPF_MAX + 1] = {
+ [LWT_BPF_IN] = { .type = NLA_NESTED, },
+ [LWT_BPF_OUT] = { .type = NLA_NESTED, },
+ [LWT_BPF_XMIT] = { .type = NLA_NESTED, },
+ [LWT_BPF_XMIT_HEADROOM] = { .type = NLA_U32 },
+};
+
+static int bpf_build_state(struct net *net, struct nlattr *nla,
+ unsigned int family, const void *cfg,
+ struct lwtunnel_state **ts,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[LWT_BPF_MAX + 1];
+ struct lwtunnel_state *newts;
+ struct bpf_lwt *bpf;
+ int ret;
+
+ if (family != AF_INET && family != AF_INET6)
+ return -EAFNOSUPPORT;
+
+ ret = nla_parse_nested_deprecated(tb, LWT_BPF_MAX, nla, bpf_nl_policy,
+ extack);
+ if (ret < 0)
+ return ret;
+
+ if (!tb[LWT_BPF_IN] && !tb[LWT_BPF_OUT] && !tb[LWT_BPF_XMIT])
+ return -EINVAL;
+
+ newts = lwtunnel_state_alloc(sizeof(*bpf));
+ if (!newts)
+ return -ENOMEM;
+
+ newts->type = LWTUNNEL_ENCAP_BPF;
+ bpf = bpf_lwt_lwtunnel(newts);
+
+ if (tb[LWT_BPF_IN]) {
+ newts->flags |= LWTUNNEL_STATE_INPUT_REDIRECT;
+ ret = bpf_parse_prog(tb[LWT_BPF_IN], &bpf->in,
+ BPF_PROG_TYPE_LWT_IN);
+ if (ret < 0)
+ goto errout;
+ }
+
+ if (tb[LWT_BPF_OUT]) {
+ newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT;
+ ret = bpf_parse_prog(tb[LWT_BPF_OUT], &bpf->out,
+ BPF_PROG_TYPE_LWT_OUT);
+ if (ret < 0)
+ goto errout;
+ }
+
+ if (tb[LWT_BPF_XMIT]) {
+ newts->flags |= LWTUNNEL_STATE_XMIT_REDIRECT;
+ ret = bpf_parse_prog(tb[LWT_BPF_XMIT], &bpf->xmit,
+ BPF_PROG_TYPE_LWT_XMIT);
+ if (ret < 0)
+ goto errout;
+ }
+
+ if (tb[LWT_BPF_XMIT_HEADROOM]) {
+ u32 headroom = nla_get_u32(tb[LWT_BPF_XMIT_HEADROOM]);
+
+ if (headroom > LWT_BPF_MAX_HEADROOM) {
+ ret = -ERANGE;
+ goto errout;
+ }
+
+ newts->headroom = headroom;
+ }
+
+ bpf->family = family;
+ *ts = newts;
+
+ return 0;
+
+errout:
+ bpf_destroy_state(newts);
+ kfree(newts);
+ return ret;
+}
+
+static int bpf_fill_lwt_prog(struct sk_buff *skb, int attr,
+ struct bpf_lwt_prog *prog)
+{
+ struct nlattr *nest;
+
+ if (!prog->prog)
+ return 0;
+
+ nest = nla_nest_start_noflag(skb, attr);
+ if (!nest)
+ return -EMSGSIZE;
+
+ if (prog->name &&
+ nla_put_string(skb, LWT_BPF_PROG_NAME, prog->name))
+ return -EMSGSIZE;
+
+ return nla_nest_end(skb, nest);
+}
+
+static int bpf_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwt)
+{
+ struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt);
+
+ if (bpf_fill_lwt_prog(skb, LWT_BPF_IN, &bpf->in) < 0 ||
+ bpf_fill_lwt_prog(skb, LWT_BPF_OUT, &bpf->out) < 0 ||
+ bpf_fill_lwt_prog(skb, LWT_BPF_XMIT, &bpf->xmit) < 0)
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static int bpf_encap_nlsize(struct lwtunnel_state *lwtstate)
+{
+ int nest_len = nla_total_size(sizeof(struct nlattr)) +
+ nla_total_size(MAX_PROG_NAME) + /* LWT_BPF_PROG_NAME */
+ 0;
+
+ return nest_len + /* LWT_BPF_IN */
+ nest_len + /* LWT_BPF_OUT */
+ nest_len + /* LWT_BPF_XMIT */
+ 0;
+}
+
+static int bpf_lwt_prog_cmp(struct bpf_lwt_prog *a, struct bpf_lwt_prog *b)
+{
+ /* FIXME:
+ * The LWT state is currently rebuilt for delete requests which
+ * results in a new bpf_prog instance. Comparing names for now.
+ */
+ if (!a->name && !b->name)
+ return 0;
+
+ if (!a->name || !b->name)
+ return 1;
+
+ return strcmp(a->name, b->name);
+}
+
+static int bpf_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
+{
+ struct bpf_lwt *a_bpf = bpf_lwt_lwtunnel(a);
+ struct bpf_lwt *b_bpf = bpf_lwt_lwtunnel(b);
+
+ return bpf_lwt_prog_cmp(&a_bpf->in, &b_bpf->in) ||
+ bpf_lwt_prog_cmp(&a_bpf->out, &b_bpf->out) ||
+ bpf_lwt_prog_cmp(&a_bpf->xmit, &b_bpf->xmit);
+}
+
+static const struct lwtunnel_encap_ops bpf_encap_ops = {
+ .build_state = bpf_build_state,
+ .destroy_state = bpf_destroy_state,
+ .input = bpf_input,
+ .output = bpf_output,
+ .xmit = bpf_xmit,
+ .fill_encap = bpf_fill_encap_info,
+ .get_encap_size = bpf_encap_nlsize,
+ .cmp_encap = bpf_encap_cmp,
+ .owner = THIS_MODULE,
+};
+
+static int handle_gso_type(struct sk_buff *skb, unsigned int gso_type,
+ int encap_len)
+{
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+
+ gso_type |= SKB_GSO_DODGY;
+ shinfo->gso_type |= gso_type;
+ skb_decrease_gso_size(shinfo, encap_len);
+ shinfo->gso_segs = 0;
+ return 0;
+}
+
+static int handle_gso_encap(struct sk_buff *skb, bool ipv4, int encap_len)
+{
+ int next_hdr_offset;
+ void *next_hdr;
+ __u8 protocol;
+
+ /* SCTP and UDP_L4 gso need more nuanced handling than what
+ * handle_gso_type() does above: skb_decrease_gso_size() is not enough.
+ * So at the moment only TCP GSO packets are let through.
+ */
+ if (!(skb_shinfo(skb)->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
+ return -ENOTSUPP;
+
+ if (ipv4) {
+ protocol = ip_hdr(skb)->protocol;
+ next_hdr_offset = sizeof(struct iphdr);
+ next_hdr = skb_network_header(skb) + next_hdr_offset;
+ } else {
+ protocol = ipv6_hdr(skb)->nexthdr;
+ next_hdr_offset = sizeof(struct ipv6hdr);
+ next_hdr = skb_network_header(skb) + next_hdr_offset;
+ }
+
+ switch (protocol) {
+ case IPPROTO_GRE:
+ next_hdr_offset += sizeof(struct gre_base_hdr);
+ if (next_hdr_offset > encap_len)
+ return -EINVAL;
+
+ if (((struct gre_base_hdr *)next_hdr)->flags & GRE_CSUM)
+ return handle_gso_type(skb, SKB_GSO_GRE_CSUM,
+ encap_len);
+ return handle_gso_type(skb, SKB_GSO_GRE, encap_len);
+
+ case IPPROTO_UDP:
+ next_hdr_offset += sizeof(struct udphdr);
+ if (next_hdr_offset > encap_len)
+ return -EINVAL;
+
+ if (((struct udphdr *)next_hdr)->check)
+ return handle_gso_type(skb, SKB_GSO_UDP_TUNNEL_CSUM,
+ encap_len);
+ return handle_gso_type(skb, SKB_GSO_UDP_TUNNEL, encap_len);
+
+ case IPPROTO_IP:
+ case IPPROTO_IPV6:
+ if (ipv4)
+ return handle_gso_type(skb, SKB_GSO_IPXIP4, encap_len);
+ else
+ return handle_gso_type(skb, SKB_GSO_IPXIP6, encap_len);
+
+ default:
+ return -EPROTONOSUPPORT;
+ }
+}
+
+int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, bool ingress)
+{
+ struct iphdr *iph;
+ bool ipv4;
+ int err;
+
+ if (unlikely(len < sizeof(struct iphdr) || len > LWT_BPF_MAX_HEADROOM))
+ return -EINVAL;
+
+ /* validate protocol and length */
+ iph = (struct iphdr *)hdr;
+ if (iph->version == 4) {
+ ipv4 = true;
+ if (unlikely(len < iph->ihl * 4))
+ return -EINVAL;
+ } else if (iph->version == 6) {
+ ipv4 = false;
+ if (unlikely(len < sizeof(struct ipv6hdr)))
+ return -EINVAL;
+ } else {
+ return -EINVAL;
+ }
+
+ if (ingress)
+ err = skb_cow_head(skb, len + skb->mac_len);
+ else
+ err = skb_cow_head(skb,
+ len + LL_RESERVED_SPACE(skb_dst(skb)->dev));
+ if (unlikely(err))
+ return err;
+
+ /* push the encap headers and fix pointers */
+ skb_reset_inner_headers(skb);
+ skb_reset_inner_mac_header(skb); /* mac header is not yet set */
+ skb_set_inner_protocol(skb, skb->protocol);
+ skb->encapsulation = 1;
+ skb_push(skb, len);
+ if (ingress)
+ skb_postpush_rcsum(skb, iph, len);
+ skb_reset_network_header(skb);
+ memcpy(skb_network_header(skb), hdr, len);
+ bpf_compute_data_pointers(skb);
+ skb_clear_hash(skb);
+
+ if (ipv4) {
+ skb->protocol = htons(ETH_P_IP);
+ iph = ip_hdr(skb);
+
+ if (!iph->check)
+ iph->check = ip_fast_csum((unsigned char *)iph,
+ iph->ihl);
+ } else {
+ skb->protocol = htons(ETH_P_IPV6);
+ }
+
+ if (skb_is_gso(skb))
+ return handle_gso_encap(skb, ipv4, len);
+
+ return 0;
+}
+
+static int __init bpf_lwt_init(void)
+{
+ return lwtunnel_encap_add_ops(&bpf_encap_ops, LWTUNNEL_ENCAP_BPF);
+}
+
+subsys_initcall(bpf_lwt_init)
diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c
new file mode 100644
index 000000000000..f9d76d85d04f
--- /dev/null
+++ b/net/core/lwtunnel.c
@@ -0,0 +1,474 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * lwtunnel Infrastructure for light weight tunnels like mpls
+ *
+ * Authors: Roopa Prabhu, <roopa@cumulusnetworks.com>
+ */
+
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/lwtunnel.h>
+#include <linux/in.h>
+#include <linux/init.h>
+#include <linux/err.h>
+
+#include <net/lwtunnel.h>
+#include <net/rtnetlink.h>
+#include <net/ip6_fib.h>
+#include <net/rtnh.h>
+
+#include "dev.h"
+
+DEFINE_STATIC_KEY_FALSE(nf_hooks_lwtunnel_enabled);
+EXPORT_SYMBOL_GPL(nf_hooks_lwtunnel_enabled);
+
+#ifdef CONFIG_MODULES
+
+static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type)
+{
+ /* Only lwt encaps implemented without using an interface for
+ * the encap need to return a string here.
+ */
+ switch (encap_type) {
+ case LWTUNNEL_ENCAP_MPLS:
+ return "MPLS";
+ case LWTUNNEL_ENCAP_ILA:
+ return "ILA";
+ case LWTUNNEL_ENCAP_SEG6:
+ return "SEG6";
+ case LWTUNNEL_ENCAP_BPF:
+ return "BPF";
+ case LWTUNNEL_ENCAP_SEG6_LOCAL:
+ return "SEG6LOCAL";
+ case LWTUNNEL_ENCAP_RPL:
+ return "RPL";
+ case LWTUNNEL_ENCAP_IOAM6:
+ return "IOAM6";
+ case LWTUNNEL_ENCAP_XFRM:
+ /* module autoload not supported for encap type */
+ return NULL;
+ case LWTUNNEL_ENCAP_IP6:
+ case LWTUNNEL_ENCAP_IP:
+ case LWTUNNEL_ENCAP_NONE:
+ case __LWTUNNEL_ENCAP_MAX:
+ /* should not have got here */
+ WARN_ON(1);
+ break;
+ }
+ return NULL;
+}
+
+#endif /* CONFIG_MODULES */
+
+struct lwtunnel_state *lwtunnel_state_alloc(int encap_len)
+{
+ struct lwtunnel_state *lws;
+
+ lws = kzalloc(sizeof(*lws) + encap_len, GFP_ATOMIC);
+
+ return lws;
+}
+EXPORT_SYMBOL_GPL(lwtunnel_state_alloc);
+
+static const struct lwtunnel_encap_ops __rcu *
+ lwtun_encaps[LWTUNNEL_ENCAP_MAX + 1] __read_mostly;
+
+int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *ops,
+ unsigned int num)
+{
+ if (num > LWTUNNEL_ENCAP_MAX)
+ return -ERANGE;
+
+ return !cmpxchg((const struct lwtunnel_encap_ops **)
+ &lwtun_encaps[num],
+ NULL, ops) ? 0 : -1;
+}
+EXPORT_SYMBOL_GPL(lwtunnel_encap_add_ops);
+
+int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *ops,
+ unsigned int encap_type)
+{
+ int ret;
+
+ if (encap_type == LWTUNNEL_ENCAP_NONE ||
+ encap_type > LWTUNNEL_ENCAP_MAX)
+ return -ERANGE;
+
+ ret = (cmpxchg((const struct lwtunnel_encap_ops **)
+ &lwtun_encaps[encap_type],
+ ops, NULL) == ops) ? 0 : -1;
+
+ synchronize_net();
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(lwtunnel_encap_del_ops);
+
+int lwtunnel_build_state(struct net *net, u16 encap_type,
+ struct nlattr *encap, unsigned int family,
+ const void *cfg, struct lwtunnel_state **lws,
+ struct netlink_ext_ack *extack)
+{
+ const struct lwtunnel_encap_ops *ops;
+ bool found = false;
+ int ret = -EINVAL;
+
+ if (encap_type == LWTUNNEL_ENCAP_NONE ||
+ encap_type > LWTUNNEL_ENCAP_MAX) {
+ NL_SET_ERR_MSG_ATTR(extack, encap,
+ "Unknown LWT encapsulation type");
+ return ret;
+ }
+
+ ret = -EOPNOTSUPP;
+ rcu_read_lock();
+ ops = rcu_dereference(lwtun_encaps[encap_type]);
+ if (likely(ops && ops->build_state && try_module_get(ops->owner)))
+ found = true;
+ rcu_read_unlock();
+
+ if (found) {
+ ret = ops->build_state(net, encap, family, cfg, lws, extack);
+ if (ret)
+ module_put(ops->owner);
+ } else {
+ /* don't rely on -EOPNOTSUPP to detect match as build_state
+ * handlers could return it
+ */
+ NL_SET_ERR_MSG_ATTR(extack, encap,
+ "LWT encapsulation type not supported");
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(lwtunnel_build_state);
+
+int lwtunnel_valid_encap_type(u16 encap_type, struct netlink_ext_ack *extack)
+{
+ const struct lwtunnel_encap_ops *ops;
+ int ret = -EINVAL;
+
+ if (encap_type == LWTUNNEL_ENCAP_NONE ||
+ encap_type > LWTUNNEL_ENCAP_MAX) {
+ NL_SET_ERR_MSG(extack, "Unknown lwt encapsulation type");
+ return ret;
+ }
+
+ ops = rcu_access_pointer(lwtun_encaps[encap_type]);
+#ifdef CONFIG_MODULES
+ if (!ops) {
+ const char *encap_type_str = lwtunnel_encap_str(encap_type);
+
+ if (encap_type_str) {
+ request_module("rtnl-lwt-%s", encap_type_str);
+ ops = rcu_access_pointer(lwtun_encaps[encap_type]);
+ }
+ }
+#endif
+ ret = ops ? 0 : -EOPNOTSUPP;
+ if (ret < 0)
+ NL_SET_ERR_MSG(extack, "lwt encapsulation type not supported");
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(lwtunnel_valid_encap_type);
+
+int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int remaining,
+ struct netlink_ext_ack *extack)
+{
+ struct rtnexthop *rtnh = (struct rtnexthop *)attr;
+ struct nlattr *nla_entype;
+ struct nlattr *attrs;
+ u16 encap_type;
+ int attrlen;
+
+ while (rtnh_ok(rtnh, remaining)) {
+ attrlen = rtnh_attrlen(rtnh);
+ if (attrlen > 0) {
+ attrs = rtnh_attrs(rtnh);
+ nla_entype = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
+
+ if (nla_entype) {
+ if (nla_len(nla_entype) < sizeof(u16)) {
+ NL_SET_ERR_MSG(extack, "Invalid RTA_ENCAP_TYPE");
+ return -EINVAL;
+ }
+ encap_type = nla_get_u16(nla_entype);
+
+ if (lwtunnel_valid_encap_type(encap_type, extack))
+ return -EOPNOTSUPP;
+ }
+ }
+ rtnh = rtnh_next(rtnh, &remaining);
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(lwtunnel_valid_encap_type_attr);
+
+void lwtstate_free(struct lwtunnel_state *lws)
+{
+ const struct lwtunnel_encap_ops *ops = lwtun_encaps[lws->type];
+
+ if (ops->destroy_state) {
+ ops->destroy_state(lws);
+ kfree_rcu(lws, rcu);
+ } else {
+ kfree(lws);
+ }
+ module_put(ops->owner);
+}
+EXPORT_SYMBOL_GPL(lwtstate_free);
+
+int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate,
+ int encap_attr, int encap_type_attr)
+{
+ const struct lwtunnel_encap_ops *ops;
+ struct nlattr *nest;
+ int ret;
+
+ if (!lwtstate)
+ return 0;
+
+ if (lwtstate->type == LWTUNNEL_ENCAP_NONE ||
+ lwtstate->type > LWTUNNEL_ENCAP_MAX)
+ return 0;
+
+ nest = nla_nest_start_noflag(skb, encap_attr);
+ if (!nest)
+ return -EMSGSIZE;
+
+ ret = -EOPNOTSUPP;
+ rcu_read_lock();
+ ops = rcu_dereference(lwtun_encaps[lwtstate->type]);
+ if (likely(ops && ops->fill_encap))
+ ret = ops->fill_encap(skb, lwtstate);
+ rcu_read_unlock();
+
+ if (ret)
+ goto nla_put_failure;
+ nla_nest_end(skb, nest);
+ ret = nla_put_u16(skb, encap_type_attr, lwtstate->type);
+ if (ret)
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+
+ return (ret == -EOPNOTSUPP ? 0 : ret);
+}
+EXPORT_SYMBOL_GPL(lwtunnel_fill_encap);
+
+int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate)
+{
+ const struct lwtunnel_encap_ops *ops;
+ int ret = 0;
+
+ if (!lwtstate)
+ return 0;
+
+ if (lwtstate->type == LWTUNNEL_ENCAP_NONE ||
+ lwtstate->type > LWTUNNEL_ENCAP_MAX)
+ return 0;
+
+ rcu_read_lock();
+ ops = rcu_dereference(lwtun_encaps[lwtstate->type]);
+ if (likely(ops && ops->get_encap_size))
+ ret = nla_total_size(ops->get_encap_size(lwtstate));
+ rcu_read_unlock();
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(lwtunnel_get_encap_size);
+
+int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b)
+{
+ const struct lwtunnel_encap_ops *ops;
+ int ret = 0;
+
+ if (!a && !b)
+ return 0;
+
+ if (!a || !b)
+ return 1;
+
+ if (a->type != b->type)
+ return 1;
+
+ if (a->type == LWTUNNEL_ENCAP_NONE ||
+ a->type > LWTUNNEL_ENCAP_MAX)
+ return 0;
+
+ rcu_read_lock();
+ ops = rcu_dereference(lwtun_encaps[a->type]);
+ if (likely(ops && ops->cmp_encap))
+ ret = ops->cmp_encap(a, b);
+ rcu_read_unlock();
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(lwtunnel_cmp_encap);
+
+int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ const struct lwtunnel_encap_ops *ops;
+ struct lwtunnel_state *lwtstate;
+ struct dst_entry *dst;
+ int ret;
+
+ local_bh_disable();
+
+ if (dev_xmit_recursion()) {
+ net_crit_ratelimited("%s(): recursion limit reached on datapath\n",
+ __func__);
+ ret = -ENETDOWN;
+ goto drop;
+ }
+
+ dst = skb_dst(skb);
+ if (!dst) {
+ ret = -EINVAL;
+ goto drop;
+ }
+ lwtstate = dst->lwtstate;
+
+ if (lwtstate->type == LWTUNNEL_ENCAP_NONE ||
+ lwtstate->type > LWTUNNEL_ENCAP_MAX) {
+ ret = 0;
+ goto out;
+ }
+
+ ret = -EOPNOTSUPP;
+ rcu_read_lock();
+ ops = rcu_dereference(lwtun_encaps[lwtstate->type]);
+ if (likely(ops && ops->output)) {
+ dev_xmit_recursion_inc();
+ ret = ops->output(net, sk, skb);
+ dev_xmit_recursion_dec();
+ }
+ rcu_read_unlock();
+
+ if (ret == -EOPNOTSUPP)
+ goto drop;
+
+ goto out;
+
+drop:
+ kfree_skb(skb);
+
+out:
+ local_bh_enable();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(lwtunnel_output);
+
+int lwtunnel_xmit(struct sk_buff *skb)
+{
+ const struct lwtunnel_encap_ops *ops;
+ struct lwtunnel_state *lwtstate;
+ struct dst_entry *dst;
+ int ret;
+
+ local_bh_disable();
+
+ if (dev_xmit_recursion()) {
+ net_crit_ratelimited("%s(): recursion limit reached on datapath\n",
+ __func__);
+ ret = -ENETDOWN;
+ goto drop;
+ }
+
+ dst = skb_dst(skb);
+ if (!dst) {
+ ret = -EINVAL;
+ goto drop;
+ }
+
+ lwtstate = dst->lwtstate;
+
+ if (lwtstate->type == LWTUNNEL_ENCAP_NONE ||
+ lwtstate->type > LWTUNNEL_ENCAP_MAX) {
+ ret = 0;
+ goto out;
+ }
+
+ ret = -EOPNOTSUPP;
+ rcu_read_lock();
+ ops = rcu_dereference(lwtun_encaps[lwtstate->type]);
+ if (likely(ops && ops->xmit)) {
+ dev_xmit_recursion_inc();
+ ret = ops->xmit(skb);
+ dev_xmit_recursion_dec();
+ }
+ rcu_read_unlock();
+
+ if (ret == -EOPNOTSUPP)
+ goto drop;
+
+ goto out;
+
+drop:
+ kfree_skb(skb);
+
+out:
+ local_bh_enable();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(lwtunnel_xmit);
+
+int lwtunnel_input(struct sk_buff *skb)
+{
+ const struct lwtunnel_encap_ops *ops;
+ struct lwtunnel_state *lwtstate;
+ struct dst_entry *dst;
+ int ret;
+
+ DEBUG_NET_WARN_ON_ONCE(!in_softirq());
+
+ if (dev_xmit_recursion()) {
+ net_crit_ratelimited("%s(): recursion limit reached on datapath\n",
+ __func__);
+ ret = -ENETDOWN;
+ goto drop;
+ }
+
+ dst = skb_dst(skb);
+ if (!dst) {
+ ret = -EINVAL;
+ goto drop;
+ }
+ lwtstate = dst->lwtstate;
+
+ if (lwtstate->type == LWTUNNEL_ENCAP_NONE ||
+ lwtstate->type > LWTUNNEL_ENCAP_MAX)
+ return 0;
+
+ ret = -EOPNOTSUPP;
+ rcu_read_lock();
+ ops = rcu_dereference(lwtun_encaps[lwtstate->type]);
+ if (likely(ops && ops->input)) {
+ dev_xmit_recursion_inc();
+ ret = ops->input(skb);
+ dev_xmit_recursion_dec();
+ }
+ rcu_read_unlock();
+
+ if (ret == -EOPNOTSUPP)
+ goto drop;
+
+ return ret;
+
+drop:
+ kfree_skb(skb);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(lwtunnel_input);
diff --git a/net/core/mp_dmabuf_devmem.h b/net/core/mp_dmabuf_devmem.h
new file mode 100644
index 000000000000..67cd0dd7319c
--- /dev/null
+++ b/net/core/mp_dmabuf_devmem.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Dmabuf device memory provider.
+ *
+ * Authors: Mina Almasry <almasrymina@google.com>
+ *
+ */
+#ifndef _NET_MP_DMABUF_DEVMEM_H
+#define _NET_MP_DMABUF_DEVMEM_H
+
+#include <net/netmem.h>
+
+#if defined(CONFIG_NET_DEVMEM)
+int mp_dmabuf_devmem_init(struct page_pool *pool);
+
+netmem_ref mp_dmabuf_devmem_alloc_netmems(struct page_pool *pool, gfp_t gfp);
+
+void mp_dmabuf_devmem_destroy(struct page_pool *pool);
+
+bool mp_dmabuf_devmem_release_page(struct page_pool *pool, netmem_ref netmem);
+#else
+static inline int mp_dmabuf_devmem_init(struct page_pool *pool)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline netmem_ref
+mp_dmabuf_devmem_alloc_netmems(struct page_pool *pool, gfp_t gfp)
+{
+ return 0;
+}
+
+static inline void mp_dmabuf_devmem_destroy(struct page_pool *pool)
+{
+}
+
+static inline bool
+mp_dmabuf_devmem_release_page(struct page_pool *pool, netmem_ref netmem)
+{
+ return false;
+}
+#endif
+
+#endif /* _NET_MP_DMABUF_DEVMEM_H */
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index ba509a4a8e92..96a3b1a93252 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Generic address resolution entity
*
@@ -5,69 +6,82 @@
* Pedro Roque <roque@di.fc.ul.pt>
* Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Fixes:
* Vitaly E. Lavrov releasing NULL neighbor in neigh_add.
* Harald Welte Add neighbour cache statistics like rtstat
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/slab.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/socket.h>
-#include <linux/sched.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
#include <linux/times.h>
+#include <net/net_namespace.h>
#include <net/neighbour.h>
+#include <net/arp.h>
#include <net/dst.h>
+#include <net/ip.h>
#include <net/sock.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <linux/rtnetlink.h>
#include <linux/random.h>
#include <linux/string.h>
+#include <linux/log2.h>
+#include <linux/inetdevice.h>
+#include <net/addrconf.h>
-#define NEIGH_DEBUG 1
+#include <trace/events/neigh.h>
-#define NEIGH_PRINTK(x...) printk(x)
-#define NEIGH_NOPRINTK(x...) do { ; } while(0)
-#define NEIGH_PRINTK0 NEIGH_PRINTK
-#define NEIGH_PRINTK1 NEIGH_NOPRINTK
-#define NEIGH_PRINTK2 NEIGH_NOPRINTK
-
-#if NEIGH_DEBUG >= 1
-#undef NEIGH_PRINTK1
-#define NEIGH_PRINTK1 NEIGH_PRINTK
-#endif
-#if NEIGH_DEBUG >= 2
-#undef NEIGH_PRINTK2
-#define NEIGH_PRINTK2 NEIGH_PRINTK
-#endif
+#define NEIGH_DEBUG 1
+#define neigh_dbg(level, fmt, ...) \
+do { \
+ if (level <= NEIGH_DEBUG) \
+ pr_debug(fmt, ##__VA_ARGS__); \
+} while (0)
#define PNEIGH_HASHMASK 0xF
-static void neigh_timer_handler(unsigned long arg);
-#ifdef CONFIG_ARPD
-static void neigh_app_notify(struct neighbour *n);
-#endif
-static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev);
-void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev);
+static void neigh_timer_handler(struct timer_list *t);
+static void __neigh_notify(struct neighbour *n, int type, int flags,
+ u32 pid);
+static void neigh_update_notify(struct neighbour *neigh, u32 nlmsg_pid);
+static void pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev,
+ bool skip_perm);
-static struct neigh_table *neigh_tables;
#ifdef CONFIG_PROC_FS
-static struct file_operations neigh_stat_seq_fops;
+static const struct seq_operations neigh_stat_seq_ops;
#endif
+static struct hlist_head *neigh_get_dev_table(struct net_device *dev, int family)
+{
+ int i;
+
+ switch (family) {
+ default:
+ DEBUG_NET_WARN_ON_ONCE(1);
+ fallthrough; /* to avoid panic by null-ptr-deref */
+ case AF_INET:
+ i = NEIGH_ARP_TABLE;
+ break;
+ case AF_INET6:
+ i = NEIGH_ND_TABLE;
+ break;
+ }
+
+ return &dev->neighbours[i];
+}
+
/*
- Neighbour hash table buckets are protected with rwlock tbl->lock.
+ Neighbour hash table buckets are protected with tbl->lock.
- All the scans/updates to hash buckets MUST be made under this lock.
- NOTHING clever should be made under this lock: no callbacks
@@ -92,19 +106,22 @@ static struct file_operations neigh_stat_seq_fops;
the most complicated procedure, which we allow is dev->hard_header.
It is supposed, that dev->hard_header is simplistic and does
not make callbacks to neighbour tables.
-
- The last lock is neigh_tbl_lock. It is pure SMP lock, protecting
- list of neighbour tables. This list is used only in process context,
*/
-static DEFINE_RWLOCK(neigh_tbl_lock);
-
-static int neigh_blackhole(struct sk_buff *skb)
+static int neigh_blackhole(struct neighbour *neigh, struct sk_buff *skb)
{
kfree_skb(skb);
return -ENETDOWN;
}
+static void neigh_cleanup_and_release(struct neighbour *neigh)
+{
+ trace_neigh_cleanup_and_release(neigh, 0);
+ __neigh_notify(neigh, RTM_DELNEIGH, 0, 0);
+ call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh);
+ neigh_release(neigh);
+}
+
/*
* It is random distribution in the interval (1/2)*base...(3/2)*base.
* It corresponds to default IPv6 settings and is not overridable,
@@ -113,281 +130,531 @@ static int neigh_blackhole(struct sk_buff *skb)
unsigned long neigh_rand_reach_time(unsigned long base)
{
- return (base ? (net_random() % base) + (base >> 1) : 0);
+ return base ? get_random_u32_below(base) + (base >> 1) : 0;
+}
+EXPORT_SYMBOL(neigh_rand_reach_time);
+
+static void neigh_mark_dead(struct neighbour *n)
+{
+ n->dead = 1;
+ if (!list_empty(&n->gc_list)) {
+ list_del_init(&n->gc_list);
+ atomic_dec(&n->tbl->gc_entries);
+ }
+ if (!list_empty(&n->managed_list))
+ list_del_init(&n->managed_list);
+}
+
+static void neigh_update_gc_list(struct neighbour *n)
+{
+ bool on_gc_list, exempt_from_gc;
+
+ spin_lock_bh(&n->tbl->lock);
+ write_lock(&n->lock);
+ if (n->dead)
+ goto out;
+
+ /* remove from the gc list if new state is permanent or if neighbor is
+ * externally learned / validated; otherwise entry should be on the gc
+ * list
+ */
+ exempt_from_gc = n->nud_state & NUD_PERMANENT ||
+ n->flags & (NTF_EXT_LEARNED | NTF_EXT_VALIDATED);
+ on_gc_list = !list_empty(&n->gc_list);
+
+ if (exempt_from_gc && on_gc_list) {
+ list_del_init(&n->gc_list);
+ atomic_dec(&n->tbl->gc_entries);
+ } else if (!exempt_from_gc && !on_gc_list) {
+ /* add entries to the tail; cleaning removes from the front */
+ list_add_tail(&n->gc_list, &n->tbl->gc_list);
+ atomic_inc(&n->tbl->gc_entries);
+ }
+out:
+ write_unlock(&n->lock);
+ spin_unlock_bh(&n->tbl->lock);
+}
+
+static void neigh_update_managed_list(struct neighbour *n)
+{
+ bool on_managed_list, add_to_managed;
+
+ spin_lock_bh(&n->tbl->lock);
+ write_lock(&n->lock);
+ if (n->dead)
+ goto out;
+
+ add_to_managed = n->flags & NTF_MANAGED;
+ on_managed_list = !list_empty(&n->managed_list);
+
+ if (!add_to_managed && on_managed_list)
+ list_del_init(&n->managed_list);
+ else if (add_to_managed && !on_managed_list)
+ list_add_tail(&n->managed_list, &n->tbl->managed_list);
+out:
+ write_unlock(&n->lock);
+ spin_unlock_bh(&n->tbl->lock);
}
+static void neigh_update_flags(struct neighbour *neigh, u32 flags, int *notify,
+ bool *gc_update, bool *managed_update)
+{
+ u32 ndm_flags, old_flags = neigh->flags;
+
+ if (!(flags & NEIGH_UPDATE_F_ADMIN))
+ return;
+
+ ndm_flags = (flags & NEIGH_UPDATE_F_EXT_LEARNED) ? NTF_EXT_LEARNED : 0;
+ ndm_flags |= (flags & NEIGH_UPDATE_F_MANAGED) ? NTF_MANAGED : 0;
+ ndm_flags |= (flags & NEIGH_UPDATE_F_EXT_VALIDATED) ? NTF_EXT_VALIDATED : 0;
+
+ if ((old_flags ^ ndm_flags) & NTF_EXT_LEARNED) {
+ if (ndm_flags & NTF_EXT_LEARNED)
+ neigh->flags |= NTF_EXT_LEARNED;
+ else
+ neigh->flags &= ~NTF_EXT_LEARNED;
+ *notify = 1;
+ *gc_update = true;
+ }
+ if ((old_flags ^ ndm_flags) & NTF_MANAGED) {
+ if (ndm_flags & NTF_MANAGED)
+ neigh->flags |= NTF_MANAGED;
+ else
+ neigh->flags &= ~NTF_MANAGED;
+ *notify = 1;
+ *managed_update = true;
+ }
+ if ((old_flags ^ ndm_flags) & NTF_EXT_VALIDATED) {
+ if (ndm_flags & NTF_EXT_VALIDATED)
+ neigh->flags |= NTF_EXT_VALIDATED;
+ else
+ neigh->flags &= ~NTF_EXT_VALIDATED;
+ *notify = 1;
+ *gc_update = true;
+ }
+}
+
+bool neigh_remove_one(struct neighbour *n)
+{
+ bool retval = false;
+
+ write_lock(&n->lock);
+ if (refcount_read(&n->refcnt) == 1) {
+ hlist_del_rcu(&n->hash);
+ hlist_del_rcu(&n->dev_list);
+ neigh_mark_dead(n);
+ retval = true;
+ }
+ write_unlock(&n->lock);
+ if (retval)
+ neigh_cleanup_and_release(n);
+ return retval;
+}
static int neigh_forced_gc(struct neigh_table *tbl)
{
+ int max_clean = atomic_read(&tbl->gc_entries) -
+ READ_ONCE(tbl->gc_thresh2);
+ u64 tmax = ktime_get_ns() + NSEC_PER_MSEC;
+ unsigned long tref = jiffies - 5 * HZ;
+ struct neighbour *n, *tmp;
int shrunk = 0;
- int i;
+ int loop = 0;
NEIGH_CACHE_STAT_INC(tbl, forced_gc_runs);
- write_lock_bh(&tbl->lock);
- for (i = 0; i <= tbl->hash_mask; i++) {
- struct neighbour *n, **np;
+ spin_lock_bh(&tbl->lock);
+
+ list_for_each_entry_safe(n, tmp, &tbl->gc_list, gc_list) {
+ if (refcount_read(&n->refcnt) == 1) {
+ bool remove = false;
- np = &tbl->hash_buckets[i];
- while ((n = *np) != NULL) {
- /* Neighbour record may be discarded if:
- * - nobody refers to it.
- * - it is not permanent
- */
write_lock(&n->lock);
- if (atomic_read(&n->refcnt) == 1 &&
- !(n->nud_state & NUD_PERMANENT)) {
- *np = n->next;
- n->dead = 1;
- shrunk = 1;
- write_unlock(&n->lock);
- neigh_release(n);
- continue;
- }
+ if ((n->nud_state == NUD_FAILED) ||
+ (n->nud_state == NUD_NOARP) ||
+ (tbl->is_multicast &&
+ tbl->is_multicast(n->primary_key)) ||
+ !time_in_range(n->updated, tref, jiffies))
+ remove = true;
write_unlock(&n->lock);
- np = &n->next;
+
+ if (remove && neigh_remove_one(n))
+ shrunk++;
+ if (shrunk >= max_clean)
+ break;
+ if (++loop == 16) {
+ if (ktime_get_ns() > tmax)
+ goto unlock;
+ loop = 0;
+ }
}
}
- tbl->last_flush = jiffies;
-
- write_unlock_bh(&tbl->lock);
+ WRITE_ONCE(tbl->last_flush, jiffies);
+unlock:
+ spin_unlock_bh(&tbl->lock);
return shrunk;
}
+static void neigh_add_timer(struct neighbour *n, unsigned long when)
+{
+ /* Use safe distance from the jiffies - LONG_MAX point while timer
+ * is running in DELAY/PROBE state but still show to user space
+ * large times in the past.
+ */
+ unsigned long mint = jiffies - (LONG_MAX - 86400 * HZ);
+
+ neigh_hold(n);
+ if (!time_in_range(n->confirmed, mint, jiffies))
+ n->confirmed = mint;
+ if (time_before(n->used, n->confirmed))
+ n->used = n->confirmed;
+ if (unlikely(mod_timer(&n->timer, when))) {
+ printk("NEIGH: BUG, double timer add, state is %x\n",
+ n->nud_state);
+ dump_stack();
+ }
+}
+
static int neigh_del_timer(struct neighbour *n)
{
if ((n->nud_state & NUD_IN_TIMER) &&
- del_timer(&n->timer)) {
+ timer_delete(&n->timer)) {
neigh_release(n);
return 1;
}
return 0;
}
-static void pneigh_queue_purge(struct sk_buff_head *list)
+static struct neigh_parms *neigh_get_dev_parms_rcu(struct net_device *dev,
+ int family)
{
+ switch (family) {
+ case AF_INET:
+ return __in_dev_arp_parms_get_rcu(dev);
+ case AF_INET6:
+ return __in6_dev_nd_parms_get_rcu(dev);
+ }
+ return NULL;
+}
+
+static void neigh_parms_qlen_dec(struct net_device *dev, int family)
+{
+ struct neigh_parms *p;
+
+ rcu_read_lock();
+ p = neigh_get_dev_parms_rcu(dev, family);
+ if (p)
+ p->qlen--;
+ rcu_read_unlock();
+}
+
+static void pneigh_queue_purge(struct sk_buff_head *list, struct net *net,
+ int family)
+{
+ struct sk_buff_head tmp;
+ unsigned long flags;
struct sk_buff *skb;
- while ((skb = skb_dequeue(list)) != NULL) {
+ skb_queue_head_init(&tmp);
+ spin_lock_irqsave(&list->lock, flags);
+ skb = skb_peek(list);
+ while (skb != NULL) {
+ struct sk_buff *skb_next = skb_peek_next(skb, list);
+ struct net_device *dev = skb->dev;
+
+ if (net == NULL || net_eq(dev_net(dev), net)) {
+ neigh_parms_qlen_dec(dev, family);
+ __skb_unlink(skb, list);
+ __skb_queue_tail(&tmp, skb);
+ }
+ skb = skb_next;
+ }
+ spin_unlock_irqrestore(&list->lock, flags);
+
+ while ((skb = __skb_dequeue(&tmp))) {
dev_put(skb->dev);
kfree_skb(skb);
}
}
-static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev)
+static void neigh_flush_one(struct neighbour *n)
{
+ hlist_del_rcu(&n->hash);
+ hlist_del_rcu(&n->dev_list);
+
+ write_lock(&n->lock);
+
+ neigh_del_timer(n);
+ neigh_mark_dead(n);
+
+ if (refcount_read(&n->refcnt) != 1) {
+ /* The most unpleasant situation.
+ * We must destroy neighbour entry,
+ * but someone still uses it.
+ *
+ * The destroy will be delayed until
+ * the last user releases us, but
+ * we must kill timers etc. and move
+ * it to safe state.
+ */
+ __skb_queue_purge(&n->arp_queue);
+ n->arp_queue_len_bytes = 0;
+ WRITE_ONCE(n->output, neigh_blackhole);
+
+ if (n->nud_state & NUD_VALID)
+ n->nud_state = NUD_NOARP;
+ else
+ n->nud_state = NUD_NONE;
+
+ neigh_dbg(2, "neigh %p is stray\n", n);
+ }
+
+ write_unlock(&n->lock);
+
+ neigh_cleanup_and_release(n);
+}
+
+static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev,
+ bool skip_perm)
+{
+ struct hlist_head *dev_head;
+ struct hlist_node *tmp;
+ struct neighbour *n;
+
+ dev_head = neigh_get_dev_table(dev, tbl->family);
+
+ hlist_for_each_entry_safe(n, tmp, dev_head, dev_list) {
+ if (skip_perm &&
+ (n->nud_state & NUD_PERMANENT ||
+ n->flags & NTF_EXT_VALIDATED))
+ continue;
+
+ neigh_flush_one(n);
+ }
+}
+
+static void neigh_flush_table(struct neigh_table *tbl)
+{
+ struct neigh_hash_table *nht;
int i;
- for (i = 0; i <= tbl->hash_mask; i++) {
- struct neighbour *n, **np = &tbl->hash_buckets[i];
+ nht = rcu_dereference_protected(tbl->nht,
+ lockdep_is_held(&tbl->lock));
- while ((n = *np) != NULL) {
- if (dev && n->dev != dev) {
- np = &n->next;
- continue;
- }
- *np = n->next;
- write_lock(&n->lock);
- neigh_del_timer(n);
- n->dead = 1;
-
- if (atomic_read(&n->refcnt) != 1) {
- /* The most unpleasant situation.
- We must destroy neighbour entry,
- but someone still uses it.
-
- The destroy will be delayed until
- the last user releases us, but
- we must kill timers etc. and move
- it to safe state.
- */
- skb_queue_purge(&n->arp_queue);
- n->output = neigh_blackhole;
- if (n->nud_state & NUD_VALID)
- n->nud_state = NUD_NOARP;
- else
- n->nud_state = NUD_NONE;
- NEIGH_PRINTK2("neigh %p is stray.\n", n);
- }
- write_unlock(&n->lock);
- neigh_release(n);
- }
+ for (i = 0; i < (1 << nht->hash_shift); i++) {
+ struct hlist_node *tmp;
+ struct neighbour *n;
+
+ neigh_for_each_in_bucket_safe(n, tmp, &nht->hash_heads[i])
+ neigh_flush_one(n);
}
}
void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev)
{
- write_lock_bh(&tbl->lock);
- neigh_flush_dev(tbl, dev);
- write_unlock_bh(&tbl->lock);
+ spin_lock_bh(&tbl->lock);
+ neigh_flush_dev(tbl, dev, false);
+ spin_unlock_bh(&tbl->lock);
}
+EXPORT_SYMBOL(neigh_changeaddr);
-int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev)
+static int __neigh_ifdown(struct neigh_table *tbl, struct net_device *dev,
+ bool skip_perm)
{
- write_lock_bh(&tbl->lock);
- neigh_flush_dev(tbl, dev);
- pneigh_ifdown(tbl, dev);
- write_unlock_bh(&tbl->lock);
+ spin_lock_bh(&tbl->lock);
+ if (likely(dev)) {
+ neigh_flush_dev(tbl, dev, skip_perm);
+ } else {
+ DEBUG_NET_WARN_ON_ONCE(skip_perm);
+ neigh_flush_table(tbl);
+ }
+ spin_unlock_bh(&tbl->lock);
- del_timer_sync(&tbl->proxy_timer);
- pneigh_queue_purge(&tbl->proxy_queue);
+ pneigh_ifdown(tbl, dev, skip_perm);
+ pneigh_queue_purge(&tbl->proxy_queue, dev ? dev_net(dev) : NULL,
+ tbl->family);
+ if (skb_queue_empty_lockless(&tbl->proxy_queue))
+ timer_delete_sync(&tbl->proxy_timer);
return 0;
}
-static struct neighbour *neigh_alloc(struct neigh_table *tbl)
+int neigh_carrier_down(struct neigh_table *tbl, struct net_device *dev)
+{
+ __neigh_ifdown(tbl, dev, true);
+ return 0;
+}
+EXPORT_SYMBOL(neigh_carrier_down);
+
+int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev)
+{
+ __neigh_ifdown(tbl, dev, false);
+ return 0;
+}
+EXPORT_SYMBOL(neigh_ifdown);
+
+static struct neighbour *neigh_alloc(struct neigh_table *tbl,
+ struct net_device *dev,
+ u32 flags, bool exempt_from_gc)
{
struct neighbour *n = NULL;
unsigned long now = jiffies;
- int entries;
-
- entries = atomic_inc_return(&tbl->entries) - 1;
- if (entries >= tbl->gc_thresh3 ||
- (entries >= tbl->gc_thresh2 &&
- time_after(now, tbl->last_flush + 5 * HZ))) {
- if (!neigh_forced_gc(tbl) &&
- entries >= tbl->gc_thresh3)
+ int entries, gc_thresh3;
+
+ if (exempt_from_gc)
+ goto do_alloc;
+
+ entries = atomic_inc_return(&tbl->gc_entries) - 1;
+ gc_thresh3 = READ_ONCE(tbl->gc_thresh3);
+ if (entries >= gc_thresh3 ||
+ (entries >= READ_ONCE(tbl->gc_thresh2) &&
+ time_after(now, READ_ONCE(tbl->last_flush) + 5 * HZ))) {
+ if (!neigh_forced_gc(tbl) && entries >= gc_thresh3) {
+ net_info_ratelimited("%s: neighbor table overflow!\n",
+ tbl->id);
+ NEIGH_CACHE_STAT_INC(tbl, table_fulls);
goto out_entries;
+ }
}
- n = kmem_cache_alloc(tbl->kmem_cachep, SLAB_ATOMIC);
+do_alloc:
+ n = kzalloc(tbl->entry_size + dev->neigh_priv_len, GFP_ATOMIC);
if (!n)
goto out_entries;
- memset(n, 0, tbl->entry_size);
-
- skb_queue_head_init(&n->arp_queue);
+ __skb_queue_head_init(&n->arp_queue);
rwlock_init(&n->lock);
+ seqlock_init(&n->ha_lock);
n->updated = n->used = now;
n->nud_state = NUD_NONE;
n->output = neigh_blackhole;
+ n->flags = flags;
+ seqlock_init(&n->hh.hh_lock);
n->parms = neigh_parms_clone(&tbl->parms);
- init_timer(&n->timer);
- n->timer.function = neigh_timer_handler;
- n->timer.data = (unsigned long)n;
+ timer_setup(&n->timer, neigh_timer_handler, 0);
NEIGH_CACHE_STAT_INC(tbl, allocs);
n->tbl = tbl;
- atomic_set(&n->refcnt, 1);
+ refcount_set(&n->refcnt, 1);
n->dead = 1;
+ INIT_LIST_HEAD(&n->gc_list);
+ INIT_LIST_HEAD(&n->managed_list);
+
+ atomic_inc(&tbl->entries);
out:
return n;
out_entries:
- atomic_dec(&tbl->entries);
+ if (!exempt_from_gc)
+ atomic_dec(&tbl->gc_entries);
goto out;
}
-static struct neighbour **neigh_hash_alloc(unsigned int entries)
+static void neigh_get_hash_rnd(u32 *x)
+{
+ *x = get_random_u32() | 1;
+}
+
+static struct neigh_hash_table *neigh_hash_alloc(unsigned int shift)
{
- unsigned long size = entries * sizeof(struct neighbour *);
- struct neighbour **ret;
+ size_t size = (1 << shift) * sizeof(struct hlist_head);
+ struct hlist_head *hash_heads;
+ struct neigh_hash_table *ret;
+ int i;
- if (size <= PAGE_SIZE) {
- ret = kzalloc(size, GFP_ATOMIC);
- } else {
- ret = (struct neighbour **)
- __get_free_pages(GFP_ATOMIC|__GFP_ZERO, get_order(size));
+ ret = kmalloc(sizeof(*ret), GFP_ATOMIC);
+ if (!ret)
+ return NULL;
+
+ hash_heads = kzalloc(size, GFP_ATOMIC);
+ if (!hash_heads) {
+ kfree(ret);
+ return NULL;
}
+ ret->hash_heads = hash_heads;
+ ret->hash_shift = shift;
+ for (i = 0; i < NEIGH_NUM_HASH_RND; i++)
+ neigh_get_hash_rnd(&ret->hash_rnd[i]);
return ret;
}
-static void neigh_hash_free(struct neighbour **hash, unsigned int entries)
+static void neigh_hash_free_rcu(struct rcu_head *head)
{
- unsigned long size = entries * sizeof(struct neighbour *);
+ struct neigh_hash_table *nht = container_of(head,
+ struct neigh_hash_table,
+ rcu);
- if (size <= PAGE_SIZE)
- kfree(hash);
- else
- free_pages((unsigned long)hash, get_order(size));
+ kfree(nht->hash_heads);
+ kfree(nht);
}
-static void neigh_hash_grow(struct neigh_table *tbl, unsigned long new_entries)
+static struct neigh_hash_table *neigh_hash_grow(struct neigh_table *tbl,
+ unsigned long new_shift)
{
- struct neighbour **new_hash, **old_hash;
- unsigned int i, new_hash_mask, old_entries;
+ unsigned int i, hash;
+ struct neigh_hash_table *new_nht, *old_nht;
NEIGH_CACHE_STAT_INC(tbl, hash_grows);
- BUG_ON(new_entries & (new_entries - 1));
- new_hash = neigh_hash_alloc(new_entries);
- if (!new_hash)
- return;
-
- old_entries = tbl->hash_mask + 1;
- new_hash_mask = new_entries - 1;
- old_hash = tbl->hash_buckets;
+ old_nht = rcu_dereference_protected(tbl->nht,
+ lockdep_is_held(&tbl->lock));
+ new_nht = neigh_hash_alloc(new_shift);
+ if (!new_nht)
+ return old_nht;
- get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd));
- for (i = 0; i < old_entries; i++) {
- struct neighbour *n, *next;
+ for (i = 0; i < (1 << old_nht->hash_shift); i++) {
+ struct hlist_node *tmp;
+ struct neighbour *n;
- for (n = old_hash[i]; n; n = next) {
- unsigned int hash_val = tbl->hash(n->primary_key, n->dev);
+ neigh_for_each_in_bucket_safe(n, tmp, &old_nht->hash_heads[i]) {
+ hash = tbl->hash(n->primary_key, n->dev,
+ new_nht->hash_rnd);
- hash_val &= new_hash_mask;
- next = n->next;
+ hash >>= (32 - new_nht->hash_shift);
- n->next = new_hash[hash_val];
- new_hash[hash_val] = n;
+ hlist_del_rcu(&n->hash);
+ hlist_add_head_rcu(&n->hash, &new_nht->hash_heads[hash]);
}
}
- tbl->hash_buckets = new_hash;
- tbl->hash_mask = new_hash_mask;
- neigh_hash_free(old_hash, old_entries);
+ rcu_assign_pointer(tbl->nht, new_nht);
+ call_rcu(&old_nht->rcu, neigh_hash_free_rcu);
+ return new_nht;
}
struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey,
struct net_device *dev)
{
struct neighbour *n;
- int key_len = tbl->key_len;
- u32 hash_val = tbl->hash(pkey, dev);
-
- NEIGH_CACHE_STAT_INC(tbl, lookups);
-
- read_lock_bh(&tbl->lock);
- for (n = tbl->hash_buckets[hash_val & tbl->hash_mask]; n; n = n->next) {
- if (dev == n->dev && !memcmp(n->primary_key, pkey, key_len)) {
- neigh_hold(n);
- NEIGH_CACHE_STAT_INC(tbl, hits);
- break;
- }
- }
- read_unlock_bh(&tbl->lock);
- return n;
-}
-
-struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, const void *pkey)
-{
- struct neighbour *n;
- int key_len = tbl->key_len;
- u32 hash_val = tbl->hash(pkey, NULL);
NEIGH_CACHE_STAT_INC(tbl, lookups);
- read_lock_bh(&tbl->lock);
- for (n = tbl->hash_buckets[hash_val & tbl->hash_mask]; n; n = n->next) {
- if (!memcmp(n->primary_key, pkey, key_len)) {
- neigh_hold(n);
- NEIGH_CACHE_STAT_INC(tbl, hits);
- break;
- }
+ rcu_read_lock();
+ n = __neigh_lookup_noref(tbl, pkey, dev);
+ if (n) {
+ if (!refcount_inc_not_zero(&n->refcnt))
+ n = NULL;
+ NEIGH_CACHE_STAT_INC(tbl, hits);
}
- read_unlock_bh(&tbl->lock);
+
+ rcu_read_unlock();
return n;
}
+EXPORT_SYMBOL(neigh_lookup);
-struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey,
- struct net_device *dev)
+static struct neighbour *
+___neigh_create(struct neigh_table *tbl, const void *pkey,
+ struct net_device *dev, u32 flags,
+ bool exempt_from_gc, bool want_ref)
{
- u32 hash_val;
- int key_len = tbl->key_len;
+ u32 hash_val, key_len = tbl->key_len;
+ struct neighbour *n1, *rc, *n;
+ struct neigh_hash_table *nht;
int error;
- struct neighbour *n1, *rc, *n = neigh_alloc(tbl);
+ n = neigh_alloc(tbl, dev, flags, exempt_from_gc);
+ trace_neigh_create(tbl, dev, pkey, n, exempt_from_gc);
if (!n) {
rc = ERR_PTR(-ENOBUFS);
goto out;
@@ -395,7 +662,7 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey,
memcpy(n->primary_key, pkey, key_len);
n->dev = dev;
- dev_hold(dev);
+ netdev_hold(dev, &n->dev_tracker, GFP_ATOMIC);
/* Protocol specific setup. */
if (tbl->constructor && (error = tbl->constructor(n)) < 0) {
@@ -403,6 +670,14 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey,
goto out_neigh_release;
}
+ if (dev->netdev_ops->ndo_neigh_construct) {
+ error = dev->netdev_ops->ndo_neigh_construct(dev, n);
+ if (error < 0) {
+ rc = ERR_PTR(error);
+ goto out_neigh_release;
+ }
+ }
+
/* Device specific setup. */
if (n->parms->neigh_setup &&
(error = n->parms->neigh_setup(n)) < 0) {
@@ -410,149 +685,231 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey,
goto out_neigh_release;
}
- n->confirmed = jiffies - (n->parms->base_reachable_time << 1);
+ n->confirmed = jiffies - (NEIGH_VAR(n->parms, BASE_REACHABLE_TIME) << 1);
- write_lock_bh(&tbl->lock);
+ spin_lock_bh(&tbl->lock);
+ nht = rcu_dereference_protected(tbl->nht,
+ lockdep_is_held(&tbl->lock));
- if (atomic_read(&tbl->entries) > (tbl->hash_mask + 1))
- neigh_hash_grow(tbl, (tbl->hash_mask + 1) << 1);
+ if (atomic_read(&tbl->entries) > (1 << nht->hash_shift))
+ nht = neigh_hash_grow(tbl, nht->hash_shift + 1);
- hash_val = tbl->hash(pkey, dev) & tbl->hash_mask;
+ hash_val = tbl->hash(n->primary_key, dev, nht->hash_rnd) >> (32 - nht->hash_shift);
if (n->parms->dead) {
rc = ERR_PTR(-EINVAL);
goto out_tbl_unlock;
}
- for (n1 = tbl->hash_buckets[hash_val]; n1; n1 = n1->next) {
- if (dev == n1->dev && !memcmp(n1->primary_key, pkey, key_len)) {
- neigh_hold(n1);
+ neigh_for_each_in_bucket(n1, &nht->hash_heads[hash_val]) {
+ if (dev == n1->dev && !memcmp(n1->primary_key, n->primary_key, key_len)) {
+ if (want_ref)
+ neigh_hold(n1);
rc = n1;
goto out_tbl_unlock;
}
}
- n->next = tbl->hash_buckets[hash_val];
- tbl->hash_buckets[hash_val] = n;
n->dead = 0;
- neigh_hold(n);
- write_unlock_bh(&tbl->lock);
- NEIGH_PRINTK2("neigh %p is created.\n", n);
+ if (!exempt_from_gc)
+ list_add_tail(&n->gc_list, &n->tbl->gc_list);
+ if (n->flags & NTF_MANAGED)
+ list_add_tail(&n->managed_list, &n->tbl->managed_list);
+ if (want_ref)
+ neigh_hold(n);
+ hlist_add_head_rcu(&n->hash, &nht->hash_heads[hash_val]);
+
+ hlist_add_head_rcu(&n->dev_list,
+ neigh_get_dev_table(dev, tbl->family));
+
+ spin_unlock_bh(&tbl->lock);
+ neigh_dbg(2, "neigh %p is created\n", n);
rc = n;
out:
return rc;
out_tbl_unlock:
- write_unlock_bh(&tbl->lock);
+ spin_unlock_bh(&tbl->lock);
out_neigh_release:
+ if (!exempt_from_gc)
+ atomic_dec(&tbl->gc_entries);
neigh_release(n);
goto out;
}
-struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl, const void *pkey,
- struct net_device *dev, int creat)
+struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey,
+ struct net_device *dev, bool want_ref)
{
- struct pneigh_entry *n;
- int key_len = tbl->key_len;
- u32 hash_val = *(u32 *)(pkey + key_len - 4);
+ bool exempt_from_gc = !!(dev->flags & IFF_LOOPBACK);
+ return ___neigh_create(tbl, pkey, dev, 0, exempt_from_gc, want_ref);
+}
+EXPORT_SYMBOL(__neigh_create);
+
+static u32 pneigh_hash(const void *pkey, unsigned int key_len)
+{
+ u32 hash_val = *(u32 *)(pkey + key_len - 4);
hash_val ^= (hash_val >> 16);
hash_val ^= hash_val >> 8;
hash_val ^= hash_val >> 4;
hash_val &= PNEIGH_HASHMASK;
+ return hash_val;
+}
+
+struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl,
+ struct net *net, const void *pkey,
+ struct net_device *dev)
+{
+ struct pneigh_entry *n;
+ unsigned int key_len;
+ u32 hash_val;
- read_lock_bh(&tbl->lock);
+ key_len = tbl->key_len;
+ hash_val = pneigh_hash(pkey, key_len);
+ n = rcu_dereference_check(tbl->phash_buckets[hash_val],
+ lockdep_is_held(&tbl->phash_lock));
- for (n = tbl->phash_buckets[hash_val]; n; n = n->next) {
+ while (n) {
if (!memcmp(n->key, pkey, key_len) &&
- (n->dev == dev || !n->dev)) {
- read_unlock_bh(&tbl->lock);
- goto out;
- }
+ net_eq(pneigh_net(n), net) &&
+ (n->dev == dev || !n->dev))
+ return n;
+
+ n = rcu_dereference_check(n->next, lockdep_is_held(&tbl->phash_lock));
}
- read_unlock_bh(&tbl->lock);
- n = NULL;
- if (!creat)
- goto out;
- n = kmalloc(sizeof(*n) + key_len, GFP_KERNEL);
- if (!n)
+ return NULL;
+}
+EXPORT_IPV6_MOD(pneigh_lookup);
+
+int pneigh_create(struct neigh_table *tbl, struct net *net,
+ const void *pkey, struct net_device *dev,
+ u32 flags, u8 protocol, bool permanent)
+{
+ struct pneigh_entry *n;
+ unsigned int key_len;
+ u32 hash_val;
+ int err = 0;
+
+ mutex_lock(&tbl->phash_lock);
+
+ n = pneigh_lookup(tbl, net, pkey, dev);
+ if (n)
+ goto update;
+
+ key_len = tbl->key_len;
+ n = kzalloc(sizeof(*n) + key_len, GFP_KERNEL);
+ if (!n) {
+ err = -ENOBUFS;
goto out;
+ }
+ write_pnet(&n->net, net);
memcpy(n->key, pkey, key_len);
n->dev = dev;
- if (dev)
- dev_hold(dev);
+ netdev_hold(dev, &n->dev_tracker, GFP_KERNEL);
if (tbl->pconstructor && tbl->pconstructor(n)) {
- if (dev)
- dev_put(dev);
+ netdev_put(dev, &n->dev_tracker);
kfree(n);
- n = NULL;
+ err = -ENOBUFS;
goto out;
}
- write_lock_bh(&tbl->lock);
+ hash_val = pneigh_hash(pkey, key_len);
n->next = tbl->phash_buckets[hash_val];
- tbl->phash_buckets[hash_val] = n;
- write_unlock_bh(&tbl->lock);
+ rcu_assign_pointer(tbl->phash_buckets[hash_val], n);
+update:
+ WRITE_ONCE(n->flags, flags);
+ n->permanent = permanent;
+ WRITE_ONCE(n->protocol, protocol);
out:
- return n;
+ mutex_unlock(&tbl->phash_lock);
+ return err;
}
+static void pneigh_destroy(struct rcu_head *rcu)
+{
+ struct pneigh_entry *n = container_of(rcu, struct pneigh_entry, rcu);
+
+ netdev_put(n->dev, &n->dev_tracker);
+ kfree(n);
+}
-int pneigh_delete(struct neigh_table *tbl, const void *pkey,
+int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey,
struct net_device *dev)
{
- struct pneigh_entry *n, **np;
- int key_len = tbl->key_len;
- u32 hash_val = *(u32 *)(pkey + key_len - 4);
+ struct pneigh_entry *n, __rcu **np;
+ unsigned int key_len;
+ u32 hash_val;
- hash_val ^= (hash_val >> 16);
- hash_val ^= hash_val >> 8;
- hash_val ^= hash_val >> 4;
- hash_val &= PNEIGH_HASHMASK;
+ key_len = tbl->key_len;
+ hash_val = pneigh_hash(pkey, key_len);
- write_lock_bh(&tbl->lock);
- for (np = &tbl->phash_buckets[hash_val]; (n = *np) != NULL;
+ mutex_lock(&tbl->phash_lock);
+
+ for (np = &tbl->phash_buckets[hash_val];
+ (n = rcu_dereference_protected(*np, 1)) != NULL;
np = &n->next) {
- if (!memcmp(n->key, pkey, key_len) && n->dev == dev) {
- *np = n->next;
- write_unlock_bh(&tbl->lock);
+ if (!memcmp(n->key, pkey, key_len) && n->dev == dev &&
+ net_eq(pneigh_net(n), net)) {
+ rcu_assign_pointer(*np, n->next);
+
+ mutex_unlock(&tbl->phash_lock);
+
if (tbl->pdestructor)
tbl->pdestructor(n);
- if (n->dev)
- dev_put(n->dev);
- kfree(n);
+
+ call_rcu(&n->rcu, pneigh_destroy);
return 0;
}
}
- write_unlock_bh(&tbl->lock);
+
+ mutex_unlock(&tbl->phash_lock);
return -ENOENT;
}
-static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev)
+static void pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev,
+ bool skip_perm)
{
- struct pneigh_entry *n, **np;
+ struct pneigh_entry *n, __rcu **np;
+ LIST_HEAD(head);
u32 h;
+ mutex_lock(&tbl->phash_lock);
+
for (h = 0; h <= PNEIGH_HASHMASK; h++) {
np = &tbl->phash_buckets[h];
- while ((n = *np) != NULL) {
+ while ((n = rcu_dereference_protected(*np, 1)) != NULL) {
+ if (skip_perm && n->permanent)
+ goto skip;
if (!dev || n->dev == dev) {
- *np = n->next;
- if (tbl->pdestructor)
- tbl->pdestructor(n);
- if (n->dev)
- dev_put(n->dev);
- kfree(n);
+ rcu_assign_pointer(*np, n->next);
+ list_add(&n->free_node, &head);
continue;
}
+skip:
np = &n->next;
}
}
- return -ENOENT;
+
+ mutex_unlock(&tbl->phash_lock);
+
+ while (!list_empty(&head)) {
+ n = list_first_entry(&head, typeof(*n), free_node);
+ list_del(&n->free_node);
+
+ if (tbl->pdestructor)
+ tbl->pdestructor(n);
+
+ call_rcu(&n->rcu, pneigh_destroy);
+ }
}
+static inline void neigh_parms_put(struct neigh_parms *parms)
+{
+ if (refcount_dec_and_test(&parms->refcnt))
+ kfree(parms);
+}
/*
* neighbour must already be out of the table;
@@ -560,43 +917,36 @@ static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev)
*/
void neigh_destroy(struct neighbour *neigh)
{
- struct hh_cache *hh;
+ struct net_device *dev = neigh->dev;
NEIGH_CACHE_STAT_INC(neigh->tbl, destroys);
if (!neigh->dead) {
- printk(KERN_WARNING
- "Destroying alive neighbour %p\n", neigh);
+ pr_warn("Destroying alive neighbour %p\n", neigh);
dump_stack();
return;
}
if (neigh_del_timer(neigh))
- printk(KERN_WARNING "Impossible event.\n");
+ pr_warn("Impossible event\n");
- while ((hh = neigh->hh) != NULL) {
- neigh->hh = hh->hh_next;
- hh->hh_next = NULL;
- write_lock_bh(&hh->hh_lock);
- hh->hh_output = neigh_blackhole;
- write_unlock_bh(&hh->hh_lock);
- if (atomic_dec_and_test(&hh->hh_refcnt))
- kfree(hh);
- }
-
- if (neigh->parms->neigh_destructor)
- (neigh->parms->neigh_destructor)(neigh);
+ write_lock_bh(&neigh->lock);
+ __skb_queue_purge(&neigh->arp_queue);
+ write_unlock_bh(&neigh->lock);
+ neigh->arp_queue_len_bytes = 0;
- skb_queue_purge(&neigh->arp_queue);
+ if (dev->netdev_ops->ndo_neigh_destroy)
+ dev->netdev_ops->ndo_neigh_destroy(dev, neigh);
- dev_put(neigh->dev);
+ netdev_put(dev, &neigh->dev_tracker);
neigh_parms_put(neigh->parms);
- NEIGH_PRINTK2("neigh %p is destroyed.\n", neigh);
+ neigh_dbg(2, "neigh %p is destroyed\n", neigh);
atomic_dec(&neigh->tbl->entries);
- kmem_cache_free(neigh->tbl->kmem_cachep, neigh);
+ kfree_rcu(neigh, rcu);
}
+EXPORT_SYMBOL(neigh_destroy);
/* Neighbour state is suspicious;
disable fast path.
@@ -605,14 +955,9 @@ void neigh_destroy(struct neighbour *neigh)
*/
static void neigh_suspect(struct neighbour *neigh)
{
- struct hh_cache *hh;
-
- NEIGH_PRINTK2("neigh %p is suspected.\n", neigh);
-
- neigh->output = neigh->ops->output;
+ neigh_dbg(2, "neigh %p is suspected\n", neigh);
- for (hh = neigh->hh; hh; hh = hh->hh_next)
- hh->hh_output = neigh->ops->output;
+ WRITE_ONCE(neigh->output, neigh->ops->output);
}
/* Neighbour state is OK;
@@ -622,108 +967,145 @@ static void neigh_suspect(struct neighbour *neigh)
*/
static void neigh_connect(struct neighbour *neigh)
{
- struct hh_cache *hh;
-
- NEIGH_PRINTK2("neigh %p is connected.\n", neigh);
+ neigh_dbg(2, "neigh %p is connected\n", neigh);
- neigh->output = neigh->ops->connected_output;
-
- for (hh = neigh->hh; hh; hh = hh->hh_next)
- hh->hh_output = neigh->ops->hh_output;
+ WRITE_ONCE(neigh->output, neigh->ops->connected_output);
}
-static void neigh_periodic_timer(unsigned long arg)
+static void neigh_periodic_work(struct work_struct *work)
{
- struct neigh_table *tbl = (struct neigh_table *)arg;
- struct neighbour *n, **np;
- unsigned long expire, now = jiffies;
+ struct neigh_table *tbl = container_of(work, struct neigh_table, gc_work.work);
+ struct neigh_hash_table *nht;
+ struct hlist_node *tmp;
+ struct neighbour *n;
+ unsigned int i;
NEIGH_CACHE_STAT_INC(tbl, periodic_gc_runs);
- write_lock(&tbl->lock);
+ spin_lock_bh(&tbl->lock);
+ nht = rcu_dereference_protected(tbl->nht,
+ lockdep_is_held(&tbl->lock));
/*
* periodically recompute ReachableTime from random function
*/
- if (time_after(now, tbl->last_rand + 300 * HZ)) {
+ if (time_after(jiffies, tbl->last_rand + 300 * HZ)) {
struct neigh_parms *p;
- tbl->last_rand = now;
- for (p = &tbl->parms; p; p = p->next)
- p->reachable_time =
- neigh_rand_reach_time(p->base_reachable_time);
- }
- np = &tbl->hash_buckets[tbl->hash_chain_gc];
- tbl->hash_chain_gc = ((tbl->hash_chain_gc + 1) & tbl->hash_mask);
+ WRITE_ONCE(tbl->last_rand, jiffies);
+ list_for_each_entry(p, &tbl->parms_list, list)
+ neigh_set_reach_time(p);
+ }
- while ((n = *np) != NULL) {
- unsigned int state;
+ if (atomic_read(&tbl->entries) < READ_ONCE(tbl->gc_thresh1))
+ goto out;
- write_lock(&n->lock);
+ for (i = 0 ; i < (1 << nht->hash_shift); i++) {
+ neigh_for_each_in_bucket_safe(n, tmp, &nht->hash_heads[i]) {
+ unsigned int state;
- state = n->nud_state;
- if (state & (NUD_PERMANENT | NUD_IN_TIMER)) {
- write_unlock(&n->lock);
- goto next_elt;
- }
+ write_lock(&n->lock);
- if (time_before(n->used, n->confirmed))
- n->used = n->confirmed;
+ state = n->nud_state;
+ if ((state & (NUD_PERMANENT | NUD_IN_TIMER)) ||
+ (n->flags &
+ (NTF_EXT_LEARNED | NTF_EXT_VALIDATED))) {
+ write_unlock(&n->lock);
+ continue;
+ }
- if (atomic_read(&n->refcnt) == 1 &&
- (state == NUD_FAILED ||
- time_after(now, n->used + n->parms->gc_staletime))) {
- *np = n->next;
- n->dead = 1;
+ if (time_before(n->used, n->confirmed) &&
+ time_is_before_eq_jiffies(n->confirmed))
+ n->used = n->confirmed;
+
+ if (refcount_read(&n->refcnt) == 1 &&
+ (state == NUD_FAILED ||
+ !time_in_range_open(jiffies, n->used,
+ n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) {
+ hlist_del_rcu(&n->hash);
+ hlist_del_rcu(&n->dev_list);
+ neigh_mark_dead(n);
+ write_unlock(&n->lock);
+ neigh_cleanup_and_release(n);
+ continue;
+ }
write_unlock(&n->lock);
- neigh_release(n);
- continue;
}
- write_unlock(&n->lock);
-
-next_elt:
- np = &n->next;
+ /*
+ * It's fine to release lock here, even if hash table
+ * grows while we are preempted.
+ */
+ spin_unlock_bh(&tbl->lock);
+ cond_resched();
+ spin_lock_bh(&tbl->lock);
+ nht = rcu_dereference_protected(tbl->nht,
+ lockdep_is_held(&tbl->lock));
}
-
- /* Cycle through all hash buckets every base_reachable_time/2 ticks.
- * ARP entry timeouts range from 1/2 base_reachable_time to 3/2
- * base_reachable_time.
+out:
+ /* Cycle through all hash buckets every BASE_REACHABLE_TIME/2 ticks.
+ * ARP entry timeouts range from 1/2 BASE_REACHABLE_TIME to 3/2
+ * BASE_REACHABLE_TIME.
*/
- expire = tbl->parms.base_reachable_time >> 1;
- expire /= (tbl->hash_mask + 1);
- if (!expire)
- expire = 1;
-
- mod_timer(&tbl->gc_timer, now + expire);
-
- write_unlock(&tbl->lock);
+ queue_delayed_work(system_power_efficient_wq, &tbl->gc_work,
+ NEIGH_VAR(&tbl->parms, BASE_REACHABLE_TIME) >> 1);
+ spin_unlock_bh(&tbl->lock);
}
static __inline__ int neigh_max_probes(struct neighbour *n)
{
struct neigh_parms *p = n->parms;
- return (n->nud_state & NUD_PROBE ?
- p->ucast_probes :
- p->ucast_probes + p->app_probes + p->mcast_probes);
+ return NEIGH_VAR(p, UCAST_PROBES) + NEIGH_VAR(p, APP_PROBES) +
+ (n->nud_state & NUD_PROBE ? NEIGH_VAR(p, MCAST_REPROBES) :
+ NEIGH_VAR(p, MCAST_PROBES));
}
-static inline void neigh_add_timer(struct neighbour *n, unsigned long when)
+static void neigh_invalidate(struct neighbour *neigh)
+ __releases(neigh->lock)
+ __acquires(neigh->lock)
{
- if (unlikely(mod_timer(&n->timer, when))) {
- printk("NEIGH: BUG, double timer add, state is %x\n",
- n->nud_state);
- dump_stack();
+ struct sk_buff *skb;
+
+ NEIGH_CACHE_STAT_INC(neigh->tbl, res_failed);
+ neigh_dbg(2, "neigh %p is failed\n", neigh);
+ neigh->updated = jiffies;
+
+ /* It is very thin place. report_unreachable is very complicated
+ routine. Particularly, it can hit the same neighbour entry!
+
+ So that, we try to be accurate and avoid dead loop. --ANK
+ */
+ while (neigh->nud_state == NUD_FAILED &&
+ (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) {
+ write_unlock(&neigh->lock);
+ neigh->ops->error_report(neigh, skb);
+ write_lock(&neigh->lock);
}
+ __skb_queue_purge(&neigh->arp_queue);
+ neigh->arp_queue_len_bytes = 0;
+}
+
+static void neigh_probe(struct neighbour *neigh)
+ __releases(neigh->lock)
+{
+ struct sk_buff *skb = skb_peek_tail(&neigh->arp_queue);
+ /* keep skb alive even if arp_queue overflows */
+ if (skb)
+ skb = skb_clone(skb, GFP_ATOMIC);
+ write_unlock(&neigh->lock);
+ if (neigh->ops->solicit)
+ neigh->ops->solicit(neigh, skb);
+ atomic_inc(&neigh->probes);
+ consume_skb(skb);
}
/* Called when a timer expires for a neighbour entry. */
-static void neigh_timer_handler(unsigned long arg)
+static void neigh_timer_handler(struct timer_list *t)
{
unsigned long now, next;
- struct neighbour *neigh = (struct neighbour *)arg;
- unsigned state;
+ struct neighbour *neigh = timer_container_of(neigh, t, timer);
+ unsigned int state;
int notify = 0;
write_lock(&neigh->lock);
@@ -732,180 +1114,195 @@ static void neigh_timer_handler(unsigned long arg)
now = jiffies;
next = now + HZ;
- if (!(state & NUD_IN_TIMER)) {
-#ifndef CONFIG_SMP
- printk(KERN_WARNING "neigh: timer & !nud_in_timer\n");
-#endif
+ if (!(state & NUD_IN_TIMER))
goto out;
- }
if (state & NUD_REACHABLE) {
- if (time_before_eq(now,
+ if (time_before_eq(now,
neigh->confirmed + neigh->parms->reachable_time)) {
- NEIGH_PRINTK2("neigh %p is still alive.\n", neigh);
+ neigh_dbg(2, "neigh %p is still alive\n", neigh);
next = neigh->confirmed + neigh->parms->reachable_time;
} else if (time_before_eq(now,
- neigh->used + neigh->parms->delay_probe_time)) {
- NEIGH_PRINTK2("neigh %p is delayed.\n", neigh);
- neigh->nud_state = NUD_DELAY;
+ neigh->used +
+ NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME))) {
+ neigh_dbg(2, "neigh %p is delayed\n", neigh);
+ WRITE_ONCE(neigh->nud_state, NUD_DELAY);
neigh->updated = jiffies;
neigh_suspect(neigh);
- next = now + neigh->parms->delay_probe_time;
+ next = now + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME);
} else {
- NEIGH_PRINTK2("neigh %p is suspected.\n", neigh);
- neigh->nud_state = NUD_STALE;
+ neigh_dbg(2, "neigh %p is suspected\n", neigh);
+ WRITE_ONCE(neigh->nud_state, NUD_STALE);
neigh->updated = jiffies;
neigh_suspect(neigh);
notify = 1;
}
} else if (state & NUD_DELAY) {
- if (time_before_eq(now,
- neigh->confirmed + neigh->parms->delay_probe_time)) {
- NEIGH_PRINTK2("neigh %p is now reachable.\n", neigh);
- neigh->nud_state = NUD_REACHABLE;
+ if (time_before_eq(now,
+ neigh->confirmed +
+ NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME))) {
+ neigh_dbg(2, "neigh %p is now reachable\n", neigh);
+ WRITE_ONCE(neigh->nud_state, NUD_REACHABLE);
neigh->updated = jiffies;
neigh_connect(neigh);
notify = 1;
next = neigh->confirmed + neigh->parms->reachable_time;
} else {
- NEIGH_PRINTK2("neigh %p is probed.\n", neigh);
- neigh->nud_state = NUD_PROBE;
+ neigh_dbg(2, "neigh %p is probed\n", neigh);
+ WRITE_ONCE(neigh->nud_state, NUD_PROBE);
neigh->updated = jiffies;
atomic_set(&neigh->probes, 0);
- next = now + neigh->parms->retrans_time;
+ notify = 1;
+ next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME),
+ HZ/100);
}
} else {
/* NUD_PROBE|NUD_INCOMPLETE */
- next = now + neigh->parms->retrans_time;
+ next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), HZ/100);
}
if ((neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) &&
atomic_read(&neigh->probes) >= neigh_max_probes(neigh)) {
- struct sk_buff *skb;
-
- neigh->nud_state = NUD_FAILED;
- neigh->updated = jiffies;
- notify = 1;
- NEIGH_CACHE_STAT_INC(neigh->tbl, res_failed);
- NEIGH_PRINTK2("neigh %p is failed.\n", neigh);
-
- /* It is very thin place. report_unreachable is very complicated
- routine. Particularly, it can hit the same neighbour entry!
-
- So that, we try to be accurate and avoid dead loop. --ANK
- */
- while (neigh->nud_state == NUD_FAILED &&
- (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) {
- write_unlock(&neigh->lock);
- neigh->ops->error_report(neigh, skb);
- write_lock(&neigh->lock);
+ if (neigh->nud_state == NUD_PROBE &&
+ neigh->flags & NTF_EXT_VALIDATED) {
+ WRITE_ONCE(neigh->nud_state, NUD_STALE);
+ neigh->updated = jiffies;
+ } else {
+ WRITE_ONCE(neigh->nud_state, NUD_FAILED);
+ neigh_invalidate(neigh);
}
- skb_queue_purge(&neigh->arp_queue);
+ notify = 1;
+ goto out;
}
if (neigh->nud_state & NUD_IN_TIMER) {
- if (time_before(next, jiffies + HZ/2))
- next = jiffies + HZ/2;
+ if (time_before(next, jiffies + HZ/100))
+ next = jiffies + HZ/100;
if (!mod_timer(&neigh->timer, next))
neigh_hold(neigh);
}
if (neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) {
- struct sk_buff *skb = skb_peek(&neigh->arp_queue);
- /* keep skb alive even if arp_queue overflows */
- if (skb)
- skb_get(skb);
- write_unlock(&neigh->lock);
- neigh->ops->solicit(neigh, skb);
- atomic_inc(&neigh->probes);
- if (skb)
- kfree_skb(skb);
+ neigh_probe(neigh);
} else {
out:
write_unlock(&neigh->lock);
}
+
if (notify)
- call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh);
+ neigh_update_notify(neigh, 0);
+
+ trace_neigh_timer_handler(neigh, 0);
-#ifdef CONFIG_ARPD
- if (notify && neigh->parms->app_probes)
- neigh_app_notify(neigh);
-#endif
neigh_release(neigh);
}
-int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
+int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb,
+ const bool immediate_ok)
{
int rc;
- unsigned long now;
+ bool immediate_probe = false;
write_lock_bh(&neigh->lock);
rc = 0;
if (neigh->nud_state & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE))
goto out_unlock_bh;
+ if (neigh->dead)
+ goto out_dead;
- now = jiffies;
-
if (!(neigh->nud_state & (NUD_STALE | NUD_INCOMPLETE))) {
- if (neigh->parms->mcast_probes + neigh->parms->app_probes) {
- atomic_set(&neigh->probes, neigh->parms->ucast_probes);
- neigh->nud_state = NUD_INCOMPLETE;
- neigh->updated = jiffies;
- neigh_hold(neigh);
- neigh_add_timer(neigh, now + 1);
+ if (NEIGH_VAR(neigh->parms, MCAST_PROBES) +
+ NEIGH_VAR(neigh->parms, APP_PROBES)) {
+ unsigned long next, now = jiffies;
+
+ atomic_set(&neigh->probes,
+ NEIGH_VAR(neigh->parms, UCAST_PROBES));
+ neigh_del_timer(neigh);
+ WRITE_ONCE(neigh->nud_state, NUD_INCOMPLETE);
+ neigh->updated = now;
+ if (!immediate_ok) {
+ next = now + 1;
+ } else {
+ immediate_probe = true;
+ next = now + max(NEIGH_VAR(neigh->parms,
+ RETRANS_TIME),
+ HZ / 100);
+ }
+ neigh_add_timer(neigh, next);
} else {
- neigh->nud_state = NUD_FAILED;
+ WRITE_ONCE(neigh->nud_state, NUD_FAILED);
neigh->updated = jiffies;
write_unlock_bh(&neigh->lock);
- if (skb)
- kfree_skb(skb);
+ kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_FAILED);
return 1;
}
} else if (neigh->nud_state & NUD_STALE) {
- NEIGH_PRINTK2("neigh %p is delayed.\n", neigh);
- neigh_hold(neigh);
- neigh->nud_state = NUD_DELAY;
+ neigh_dbg(2, "neigh %p is delayed\n", neigh);
+ neigh_del_timer(neigh);
+ WRITE_ONCE(neigh->nud_state, NUD_DELAY);
neigh->updated = jiffies;
- neigh_add_timer(neigh,
- jiffies + neigh->parms->delay_probe_time);
+ neigh_add_timer(neigh, jiffies +
+ NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME));
}
if (neigh->nud_state == NUD_INCOMPLETE) {
if (skb) {
- if (skb_queue_len(&neigh->arp_queue) >=
- neigh->parms->queue_len) {
+ while (neigh->arp_queue_len_bytes + skb->truesize >
+ NEIGH_VAR(neigh->parms, QUEUE_LEN_BYTES)) {
struct sk_buff *buff;
- buff = neigh->arp_queue.next;
- __skb_unlink(buff, &neigh->arp_queue);
- kfree_skb(buff);
+
+ buff = __skb_dequeue(&neigh->arp_queue);
+ if (!buff)
+ break;
+ neigh->arp_queue_len_bytes -= buff->truesize;
+ kfree_skb_reason(buff, SKB_DROP_REASON_NEIGH_QUEUEFULL);
+ NEIGH_CACHE_STAT_INC(neigh->tbl, unres_discards);
}
+ skb_dst_force(skb);
__skb_queue_tail(&neigh->arp_queue, skb);
+ neigh->arp_queue_len_bytes += skb->truesize;
}
rc = 1;
}
out_unlock_bh:
- write_unlock_bh(&neigh->lock);
+ if (immediate_probe)
+ neigh_probe(neigh);
+ else
+ write_unlock(&neigh->lock);
+ local_bh_enable();
+ trace_neigh_event_send_done(neigh, rc);
return rc;
+
+out_dead:
+ if (neigh->nud_state & NUD_STALE)
+ goto out_unlock_bh;
+ write_unlock_bh(&neigh->lock);
+ kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_DEAD);
+ trace_neigh_event_send_dead(neigh, 1);
+ return 1;
}
+EXPORT_SYMBOL(__neigh_event_send);
static void neigh_update_hhs(struct neighbour *neigh)
{
struct hh_cache *hh;
- void (*update)(struct hh_cache*, struct net_device*, unsigned char *) =
- neigh->dev->header_cache_update;
+ void (*update)(struct hh_cache*, const struct net_device*, const unsigned char *)
+ = NULL;
+
+ if (neigh->dev->header_ops)
+ update = neigh->dev->header_ops->cache_update;
if (update) {
- for (hh = neigh->hh; hh; hh = hh->hh_next) {
- write_lock_bh(&hh->hh_lock);
+ hh = &neigh->hh;
+ if (READ_ONCE(hh->hh_len)) {
+ write_seqlock_bh(&hh->hh_lock);
update(hh, neigh->dev, neigh->ha);
- write_unlock_bh(&hh->hh_lock);
+ write_sequnlock_bh(&hh->hh_lock);
}
}
}
-
-
/* Generic update routine.
-- lladdr is new lladdr or NULL, if it is not supplied.
-- new is new state.
@@ -913,28 +1310,31 @@ static void neigh_update_hhs(struct neighbour *neigh)
NEIGH_UPDATE_F_OVERRIDE allows to override existing lladdr,
if it is different.
NEIGH_UPDATE_F_WEAK_OVERRIDE will suspect existing "connected"
- lladdr instead of overriding it
+ lladdr instead of overriding it
if it is different.
- It also allows to retain current state
- if lladdr is unchanged.
NEIGH_UPDATE_F_ADMIN means that the change is administrative.
-
- NEIGH_UPDATE_F_OVERRIDE_ISROUTER allows to override existing
+ NEIGH_UPDATE_F_USE means that the entry is user triggered.
+ NEIGH_UPDATE_F_MANAGED means that the entry will be auto-refreshed.
+ NEIGH_UPDATE_F_OVERRIDE_ISROUTER allows to override existing
NTF_ROUTER flag.
NEIGH_UPDATE_F_ISROUTER indicates if the neighbour is known as
a router.
+ NEIGH_UPDATE_F_EXT_VALIDATED means that the entry will not be removed
+ or invalidated.
Caller MUST hold reference count on the entry.
*/
-
-int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
- u32 flags)
+static int __neigh_update(struct neighbour *neigh, const u8 *lladdr,
+ u8 new, u32 flags, u32 nlmsg_pid,
+ struct netlink_ext_ack *extack)
{
- u8 old;
- int err;
- int notify = 0;
- struct net_device *dev;
+ bool gc_update = false, managed_update = false;
int update_isrouter = 0;
+ struct net_device *dev;
+ int err, notify = 0;
+ u8 old;
+
+ trace_neigh_update(neigh, lladdr, new, flags, nlmsg_pid);
write_lock_bh(&neigh->lock);
@@ -942,17 +1342,35 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
old = neigh->nud_state;
err = -EPERM;
- if (!(flags & NEIGH_UPDATE_F_ADMIN) &&
+ if (neigh->dead) {
+ NL_SET_ERR_MSG(extack, "Neighbor entry is now dead");
+ new = old;
+ goto out;
+ }
+ if (!(flags & NEIGH_UPDATE_F_ADMIN) &&
(old & (NUD_NOARP | NUD_PERMANENT)))
goto out;
+ neigh_update_flags(neigh, flags, &notify, &gc_update, &managed_update);
+ if (flags & (NEIGH_UPDATE_F_USE | NEIGH_UPDATE_F_MANAGED)) {
+ new = old & ~NUD_PERMANENT;
+ WRITE_ONCE(neigh->nud_state, new);
+ err = 0;
+ goto out;
+ }
+
if (!(new & NUD_VALID)) {
neigh_del_timer(neigh);
if (old & NUD_CONNECTED)
neigh_suspect(neigh);
- neigh->nud_state = new;
+ WRITE_ONCE(neigh->nud_state, new);
err = 0;
notify = old & NUD_VALID;
+ if ((old & (NUD_INCOMPLETE | NUD_PROBE)) &&
+ (new & NUD_FAILED)) {
+ neigh_invalidate(neigh);
+ notify = 1;
+ }
goto out;
}
@@ -966,7 +1384,7 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
- compare new & old
- if they are different, check override flag
*/
- if ((old & NUD_VALID) &&
+ if ((old & NUD_VALID) &&
!memcmp(lladdr, neigh->ha, dev->addr_len))
lladdr = neigh->ha;
} else {
@@ -974,14 +1392,18 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
use it, otherwise discard the request.
*/
err = -EINVAL;
- if (!(old & NUD_VALID))
+ if (!(old & NUD_VALID)) {
+ NL_SET_ERR_MSG(extack, "No link layer address given");
goto out;
+ }
lladdr = neigh->ha;
}
+ /* Update confirmed timestamp for neighbour entry after we
+ * received ARP packet even if it doesn't change IP to MAC binding.
+ */
if (new & NUD_CONNECTED)
neigh->confirmed = jiffies;
- neigh->updated = jiffies;
/* If entry was valid and address is not changed,
do not change entry state, if new one is STALE.
@@ -999,31 +1421,39 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
goto out;
} else {
if (lladdr == neigh->ha && new == NUD_STALE &&
- ((flags & NEIGH_UPDATE_F_WEAK_OVERRIDE) ||
- (old & NUD_CONNECTED))
- )
+ !(flags & NEIGH_UPDATE_F_ADMIN))
new = old;
}
}
+ /* Update timestamp only once we know we will make a change to the
+ * neighbour entry. Otherwise we risk to move the locktime window with
+ * noop updates and ignore relevant ARP updates.
+ */
+ if (new != old || lladdr != neigh->ha)
+ neigh->updated = jiffies;
+
if (new != old) {
neigh_del_timer(neigh);
- if (new & NUD_IN_TIMER) {
- neigh_hold(neigh);
- neigh_add_timer(neigh, (jiffies +
- ((new & NUD_REACHABLE) ?
+ if (new & NUD_PROBE)
+ atomic_set(&neigh->probes, 0);
+ if (new & NUD_IN_TIMER)
+ neigh_add_timer(neigh, (jiffies +
+ ((new & NUD_REACHABLE) ?
neigh->parms->reachable_time :
0)));
- }
- neigh->nud_state = new;
+ WRITE_ONCE(neigh->nud_state, new);
+ notify = 1;
}
if (lladdr != neigh->ha) {
+ write_seqlock(&neigh->ha_lock);
memcpy(&neigh->ha, lladdr, dev->addr_len);
+ write_sequnlock(&neigh->ha_lock);
neigh_update_hhs(neigh);
if (!(new & NUD_CONNECTED))
neigh->confirmed = jiffies -
- (neigh->parms->base_reachable_time << 1);
+ (NEIGH_VAR(neigh->parms, BASE_REACHABLE_TIME) << 1);
notify = 1;
}
if (new == old)
@@ -1039,33 +1469,75 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
while (neigh->nud_state & NUD_VALID &&
(skb = __skb_dequeue(&neigh->arp_queue)) != NULL) {
- struct neighbour *n1 = neigh;
+ struct dst_entry *dst = skb_dst(skb);
+ struct neighbour *n2, *n1 = neigh;
write_unlock_bh(&neigh->lock);
- /* On shaper/eql skb->dst->neighbour != neigh :( */
- if (skb->dst && skb->dst->neighbour)
- n1 = skb->dst->neighbour;
- n1->output(skb);
+
+ rcu_read_lock();
+
+ /* Why not just use 'neigh' as-is? The problem is that
+ * things such as shaper, eql, and sch_teql can end up
+ * using alternative, different, neigh objects to output
+ * the packet in the output path. So what we need to do
+ * here is re-lookup the top-level neigh in the path so
+ * we can reinject the packet there.
+ */
+ n2 = NULL;
+ if (dst &&
+ READ_ONCE(dst->obsolete) != DST_OBSOLETE_DEAD) {
+ n2 = dst_neigh_lookup_skb(dst, skb);
+ if (n2)
+ n1 = n2;
+ }
+ READ_ONCE(n1->output)(n1, skb);
+ if (n2)
+ neigh_release(n2);
+ rcu_read_unlock();
+
write_lock_bh(&neigh->lock);
}
- skb_queue_purge(&neigh->arp_queue);
+ __skb_queue_purge(&neigh->arp_queue);
+ neigh->arp_queue_len_bytes = 0;
}
out:
- if (update_isrouter) {
- neigh->flags = (flags & NEIGH_UPDATE_F_ISROUTER) ?
- (neigh->flags | NTF_ROUTER) :
- (neigh->flags & ~NTF_ROUTER);
- }
+ if (update_isrouter)
+ neigh_update_is_router(neigh, flags, &notify);
write_unlock_bh(&neigh->lock);
-
+ if (((new ^ old) & NUD_PERMANENT) || gc_update)
+ neigh_update_gc_list(neigh);
+ if (managed_update)
+ neigh_update_managed_list(neigh);
if (notify)
- call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh);
-#ifdef CONFIG_ARPD
- if (notify && neigh->parms->app_probes)
- neigh_app_notify(neigh);
-#endif
+ neigh_update_notify(neigh, nlmsg_pid);
+ trace_neigh_update_done(neigh, err);
return err;
}
+int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
+ u32 flags, u32 nlmsg_pid)
+{
+ return __neigh_update(neigh, lladdr, new, flags, nlmsg_pid, NULL);
+}
+EXPORT_SYMBOL(neigh_update);
+
+/* Update the neigh to listen temporarily for probe responses, even if it is
+ * in a NUD_FAILED state. The caller has to hold neigh->lock for writing.
+ */
+void __neigh_set_probe_once(struct neighbour *neigh)
+{
+ if (neigh->dead)
+ return;
+ neigh->updated = jiffies;
+ if (!(neigh->nud_state & NUD_FAILED))
+ return;
+ WRITE_ONCE(neigh->nud_state, NUD_INCOMPLETE);
+ atomic_set(&neigh->probes, neigh_max_probes(neigh));
+ neigh_add_timer(neigh,
+ jiffies + max(NEIGH_VAR(neigh->parms, RETRANS_TIME),
+ HZ/100));
+}
+EXPORT_SYMBOL(__neigh_set_probe_once);
+
struct neighbour *neigh_event_ns(struct neigh_table *tbl,
u8 *lladdr, void *saddr,
struct net_device *dev)
@@ -1073,175 +1545,163 @@ struct neighbour *neigh_event_ns(struct neigh_table *tbl,
struct neighbour *neigh = __neigh_lookup(tbl, saddr, dev,
lladdr || !dev->addr_len);
if (neigh)
- neigh_update(neigh, lladdr, NUD_STALE,
- NEIGH_UPDATE_F_OVERRIDE);
+ neigh_update(neigh, lladdr, NUD_STALE,
+ NEIGH_UPDATE_F_OVERRIDE, 0);
return neigh;
}
+EXPORT_SYMBOL(neigh_event_ns);
-static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst,
- __be16 protocol)
-{
- struct hh_cache *hh;
- struct net_device *dev = dst->dev;
-
- for (hh = n->hh; hh; hh = hh->hh_next)
- if (hh->hh_type == protocol)
- break;
-
- if (!hh && (hh = kzalloc(sizeof(*hh), GFP_ATOMIC)) != NULL) {
- rwlock_init(&hh->hh_lock);
- hh->hh_type = protocol;
- atomic_set(&hh->hh_refcnt, 0);
- hh->hh_next = NULL;
- if (dev->hard_header_cache(n, hh)) {
- kfree(hh);
- hh = NULL;
- } else {
- atomic_inc(&hh->hh_refcnt);
- hh->hh_next = n->hh;
- n->hh = hh;
- if (n->nud_state & NUD_CONNECTED)
- hh->hh_output = n->ops->hh_output;
- else
- hh->hh_output = n->ops->output;
- }
- }
- if (hh) {
- atomic_inc(&hh->hh_refcnt);
- dst->hh = hh;
- }
-}
-
-/* This function can be used in contexts, where only old dev_queue_xmit
- worked, f.e. if you want to override normal output path (eql, shaper),
- but resolution is not made yet.
- */
-
-int neigh_compat_output(struct sk_buff *skb)
+/* called with read_lock_bh(&n->lock); */
+static void neigh_hh_init(struct neighbour *n)
{
- struct net_device *dev = skb->dev;
+ struct net_device *dev = n->dev;
+ __be16 prot = n->tbl->protocol;
+ struct hh_cache *hh = &n->hh;
- __skb_pull(skb, skb->nh.raw - skb->data);
+ write_lock_bh(&n->lock);
- if (dev->hard_header &&
- dev->hard_header(skb, dev, ntohs(skb->protocol), NULL, NULL,
- skb->len) < 0 &&
- dev->rebuild_header(skb))
- return 0;
+ /* Only one thread can come in here and initialize the
+ * hh_cache entry.
+ */
+ if (!hh->hh_len)
+ dev->header_ops->cache(n, hh, prot);
- return dev_queue_xmit(skb);
+ write_unlock_bh(&n->lock);
}
/* Slow and careful. */
-int neigh_resolve_output(struct sk_buff *skb)
+int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb)
{
- struct dst_entry *dst = skb->dst;
- struct neighbour *neigh;
int rc = 0;
- if (!dst || !(neigh = dst->neighbour))
- goto discard;
-
- __skb_pull(skb, skb->nh.raw - skb->data);
-
if (!neigh_event_send(neigh, skb)) {
int err;
struct net_device *dev = neigh->dev;
- if (dev->hard_header_cache && !dst->hh) {
- write_lock_bh(&neigh->lock);
- if (!dst->hh)
- neigh_hh_init(neigh, dst, dst->ops->protocol);
- err = dev->hard_header(skb, dev, ntohs(skb->protocol),
- neigh->ha, NULL, skb->len);
- write_unlock_bh(&neigh->lock);
- } else {
- read_lock_bh(&neigh->lock);
- err = dev->hard_header(skb, dev, ntohs(skb->protocol),
- neigh->ha, NULL, skb->len);
- read_unlock_bh(&neigh->lock);
- }
+ unsigned int seq;
+
+ if (dev->header_ops->cache && !READ_ONCE(neigh->hh.hh_len))
+ neigh_hh_init(neigh);
+
+ do {
+ __skb_pull(skb, skb_network_offset(skb));
+ seq = read_seqbegin(&neigh->ha_lock);
+ err = dev_hard_header(skb, dev, ntohs(skb->protocol),
+ neigh->ha, NULL, skb->len);
+ } while (read_seqretry(&neigh->ha_lock, seq));
+
if (err >= 0)
- rc = neigh->ops->queue_xmit(skb);
+ rc = dev_queue_xmit(skb);
else
goto out_kfree_skb;
}
out:
return rc;
-discard:
- NEIGH_PRINTK1("neigh_resolve_output: dst=%p neigh=%p\n",
- dst, dst ? dst->neighbour : NULL);
out_kfree_skb:
rc = -EINVAL;
- kfree_skb(skb);
+ kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_HH_FILLFAIL);
goto out;
}
+EXPORT_SYMBOL(neigh_resolve_output);
/* As fast as possible without hh cache */
-int neigh_connected_output(struct sk_buff *skb)
+int neigh_connected_output(struct neighbour *neigh, struct sk_buff *skb)
{
- int err;
- struct dst_entry *dst = skb->dst;
- struct neighbour *neigh = dst->neighbour;
struct net_device *dev = neigh->dev;
+ unsigned int seq;
+ int err;
- __skb_pull(skb, skb->nh.raw - skb->data);
+ do {
+ __skb_pull(skb, skb_network_offset(skb));
+ seq = read_seqbegin(&neigh->ha_lock);
+ err = dev_hard_header(skb, dev, ntohs(skb->protocol),
+ neigh->ha, NULL, skb->len);
+ } while (read_seqretry(&neigh->ha_lock, seq));
- read_lock_bh(&neigh->lock);
- err = dev->hard_header(skb, dev, ntohs(skb->protocol),
- neigh->ha, NULL, skb->len);
- read_unlock_bh(&neigh->lock);
if (err >= 0)
- err = neigh->ops->queue_xmit(skb);
+ err = dev_queue_xmit(skb);
else {
err = -EINVAL;
- kfree_skb(skb);
+ kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_HH_FILLFAIL);
}
return err;
}
+EXPORT_SYMBOL(neigh_connected_output);
+
+int neigh_direct_output(struct neighbour *neigh, struct sk_buff *skb)
+{
+ return dev_queue_xmit(skb);
+}
+EXPORT_SYMBOL(neigh_direct_output);
+
+static void neigh_managed_work(struct work_struct *work)
+{
+ struct neigh_table *tbl = container_of(work, struct neigh_table,
+ managed_work.work);
+ struct neighbour *neigh;
+
+ spin_lock_bh(&tbl->lock);
+ list_for_each_entry(neigh, &tbl->managed_list, managed_list)
+ neigh_event_send_probe(neigh, NULL, false);
+ queue_delayed_work(system_power_efficient_wq, &tbl->managed_work,
+ NEIGH_VAR(&tbl->parms, INTERVAL_PROBE_TIME_MS));
+ spin_unlock_bh(&tbl->lock);
+}
-static void neigh_proxy_process(unsigned long arg)
+static void neigh_proxy_process(struct timer_list *t)
{
- struct neigh_table *tbl = (struct neigh_table *)arg;
+ struct neigh_table *tbl = timer_container_of(tbl, t, proxy_timer);
long sched_next = 0;
unsigned long now = jiffies;
- struct sk_buff *skb;
+ struct sk_buff *skb, *n;
spin_lock(&tbl->proxy_queue.lock);
- skb = tbl->proxy_queue.next;
-
- while (skb != (struct sk_buff *)&tbl->proxy_queue) {
- struct sk_buff *back = skb;
- long tdif = NEIGH_CB(back)->sched_next - now;
+ skb_queue_walk_safe(&tbl->proxy_queue, skb, n) {
+ long tdif = NEIGH_CB(skb)->sched_next - now;
- skb = skb->next;
if (tdif <= 0) {
- struct net_device *dev = back->dev;
- __skb_unlink(back, &tbl->proxy_queue);
- if (tbl->proxy_redo && netif_running(dev))
- tbl->proxy_redo(back);
- else
- kfree_skb(back);
+ struct net_device *dev = skb->dev;
+
+ neigh_parms_qlen_dec(dev, tbl->family);
+ __skb_unlink(skb, &tbl->proxy_queue);
+
+ if (tbl->proxy_redo && netif_running(dev)) {
+ rcu_read_lock();
+ tbl->proxy_redo(skb);
+ rcu_read_unlock();
+ } else {
+ kfree_skb(skb);
+ }
dev_put(dev);
} else if (!sched_next || tdif < sched_next)
sched_next = tdif;
}
- del_timer(&tbl->proxy_timer);
+ timer_delete(&tbl->proxy_timer);
if (sched_next)
mod_timer(&tbl->proxy_timer, jiffies + sched_next);
spin_unlock(&tbl->proxy_queue.lock);
}
+static unsigned long neigh_proxy_delay(struct neigh_parms *p)
+{
+ /* If proxy_delay is zero, do not call get_random_u32_below()
+ * as it is undefined behavior.
+ */
+ unsigned long proxy_delay = NEIGH_VAR(p, PROXY_DELAY);
+
+ return proxy_delay ?
+ jiffies + get_random_u32_below(proxy_delay) : jiffies;
+}
+
void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p,
struct sk_buff *skb)
{
- unsigned long now = jiffies;
- unsigned long sched_next = now + (net_random() % p->proxy_delay);
+ unsigned long sched_next = neigh_proxy_delay(p);
- if (tbl->proxy_queue.qlen > p->proxy_qlen) {
+ if (p->qlen > NEIGH_VAR(p, PROXY_QLEN)) {
kfree_skb(skb);
return;
}
@@ -1250,47 +1710,66 @@ void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p,
NEIGH_CB(skb)->flags |= LOCALLY_ENQUEUED;
spin_lock(&tbl->proxy_queue.lock);
- if (del_timer(&tbl->proxy_timer)) {
+ if (timer_delete(&tbl->proxy_timer)) {
if (time_before(tbl->proxy_timer.expires, sched_next))
sched_next = tbl->proxy_timer.expires;
}
- dst_release(skb->dst);
- skb->dst = NULL;
+ skb_dst_drop(skb);
dev_hold(skb->dev);
__skb_queue_tail(&tbl->proxy_queue, skb);
+ p->qlen++;
mod_timer(&tbl->proxy_timer, sched_next);
spin_unlock(&tbl->proxy_queue.lock);
}
+EXPORT_SYMBOL(pneigh_enqueue);
+static inline struct neigh_parms *lookup_neigh_parms(struct neigh_table *tbl,
+ struct net *net, int ifindex)
+{
+ struct neigh_parms *p;
+
+ list_for_each_entry(p, &tbl->parms_list, list) {
+ if ((p->dev && p->dev->ifindex == ifindex && net_eq(neigh_parms_net(p), net)) ||
+ (!p->dev && !ifindex && net_eq(net, &init_net)))
+ return p;
+ }
+
+ return NULL;
+}
struct neigh_parms *neigh_parms_alloc(struct net_device *dev,
struct neigh_table *tbl)
{
- struct neigh_parms *p = kmemdup(&tbl->parms, sizeof(*p), GFP_KERNEL);
+ struct neigh_parms *p;
+ struct net *net = dev_net(dev);
+ const struct net_device_ops *ops = dev->netdev_ops;
+ p = kmemdup(&tbl->parms, sizeof(*p), GFP_KERNEL);
if (p) {
p->tbl = tbl;
- atomic_set(&p->refcnt, 1);
- INIT_RCU_HEAD(&p->rcu_head);
- p->reachable_time =
- neigh_rand_reach_time(p->base_reachable_time);
- if (dev) {
- if (dev->neigh_setup && dev->neigh_setup(dev, p)) {
- kfree(p);
- return NULL;
- }
+ refcount_set(&p->refcnt, 1);
+ neigh_set_reach_time(p);
+ p->qlen = 0;
+ netdev_hold(dev, &p->dev_tracker, GFP_KERNEL);
+ p->dev = dev;
+ write_pnet(&p->net, net);
+ p->sysctl_table = NULL;
- dev_hold(dev);
- p->dev = dev;
+ if (ops->ndo_neigh_setup && ops->ndo_neigh_setup(dev, p)) {
+ netdev_put(dev, &p->dev_tracker);
+ kfree(p);
+ return NULL;
}
- p->sysctl_table = NULL;
- write_lock_bh(&tbl->lock);
- p->next = tbl->parms.next;
- tbl->parms.next = p;
- write_unlock_bh(&tbl->lock);
+
+ spin_lock_bh(&tbl->lock);
+ list_add_rcu(&p->list, &tbl->parms.list);
+ spin_unlock_bh(&tbl->lock);
+
+ neigh_parms_data_state_cleanall(p);
}
return p;
}
+EXPORT_SYMBOL(neigh_parms_alloc);
static void neigh_rcu_free_parms(struct rcu_head *head)
{
@@ -1302,295 +1781,390 @@ static void neigh_rcu_free_parms(struct rcu_head *head)
void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms)
{
- struct neigh_parms **p;
-
if (!parms || parms == &tbl->parms)
return;
- write_lock_bh(&tbl->lock);
- for (p = &tbl->parms.next; *p; p = &(*p)->next) {
- if (*p == parms) {
- *p = parms->next;
- parms->dead = 1;
- write_unlock_bh(&tbl->lock);
- if (parms->dev)
- dev_put(parms->dev);
- call_rcu(&parms->rcu_head, neigh_rcu_free_parms);
- return;
- }
- }
- write_unlock_bh(&tbl->lock);
- NEIGH_PRINTK1("neigh_parms_release: not found\n");
-}
-void neigh_parms_destroy(struct neigh_parms *parms)
-{
- kfree(parms);
+ spin_lock_bh(&tbl->lock);
+ list_del_rcu(&parms->list);
+ parms->dead = 1;
+ spin_unlock_bh(&tbl->lock);
+
+ netdev_put(parms->dev, &parms->dev_tracker);
+ call_rcu(&parms->rcu_head, neigh_rcu_free_parms);
}
+EXPORT_SYMBOL(neigh_parms_release);
+
+static struct lock_class_key neigh_table_proxy_queue_class;
-void neigh_table_init_no_netlink(struct neigh_table *tbl)
+static struct neigh_table __rcu *neigh_tables[NEIGH_NR_TABLES] __read_mostly;
+
+void neigh_table_init(int index, struct neigh_table *tbl)
{
unsigned long now = jiffies;
unsigned long phsize;
- atomic_set(&tbl->parms.refcnt, 1);
- INIT_RCU_HEAD(&tbl->parms.rcu_head);
- tbl->parms.reachable_time =
- neigh_rand_reach_time(tbl->parms.base_reachable_time);
+ INIT_LIST_HEAD(&tbl->parms_list);
+ INIT_LIST_HEAD(&tbl->gc_list);
+ INIT_LIST_HEAD(&tbl->managed_list);
+
+ list_add(&tbl->parms.list, &tbl->parms_list);
+ write_pnet(&tbl->parms.net, &init_net);
+ refcount_set(&tbl->parms.refcnt, 1);
+ neigh_set_reach_time(&tbl->parms);
+ tbl->parms.qlen = 0;
- if (!tbl->kmem_cachep)
- tbl->kmem_cachep =
- kmem_cache_create(tbl->id, tbl->entry_size, 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC,
- NULL, NULL);
tbl->stats = alloc_percpu(struct neigh_statistics);
if (!tbl->stats)
panic("cannot create neighbour cache statistics");
-
+
#ifdef CONFIG_PROC_FS
- tbl->pde = create_proc_entry(tbl->id, 0, proc_net_stat);
- if (!tbl->pde)
+ if (!proc_create_seq_data(tbl->id, 0, init_net.proc_net_stat,
+ &neigh_stat_seq_ops, tbl))
panic("cannot create neighbour proc dir entry");
- tbl->pde->proc_fops = &neigh_stat_seq_fops;
- tbl->pde->data = tbl;
#endif
- tbl->hash_mask = 1;
- tbl->hash_buckets = neigh_hash_alloc(tbl->hash_mask + 1);
+ RCU_INIT_POINTER(tbl->nht, neigh_hash_alloc(3));
phsize = (PNEIGH_HASHMASK + 1) * sizeof(struct pneigh_entry *);
tbl->phash_buckets = kzalloc(phsize, GFP_KERNEL);
- if (!tbl->hash_buckets || !tbl->phash_buckets)
+ if (!tbl->nht || !tbl->phash_buckets)
panic("cannot allocate neighbour cache hashes");
- get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd));
+ if (!tbl->entry_size)
+ tbl->entry_size = ALIGN(offsetof(struct neighbour, primary_key) +
+ tbl->key_len, NEIGH_PRIV_ALIGN);
+ else
+ WARN_ON(tbl->entry_size % NEIGH_PRIV_ALIGN);
+
+ spin_lock_init(&tbl->lock);
+ mutex_init(&tbl->phash_lock);
- rwlock_init(&tbl->lock);
- init_timer(&tbl->gc_timer);
- tbl->gc_timer.data = (unsigned long)tbl;
- tbl->gc_timer.function = neigh_periodic_timer;
- tbl->gc_timer.expires = now + 1;
- add_timer(&tbl->gc_timer);
+ INIT_DEFERRABLE_WORK(&tbl->gc_work, neigh_periodic_work);
+ queue_delayed_work(system_power_efficient_wq, &tbl->gc_work,
+ tbl->parms.reachable_time);
+ INIT_DEFERRABLE_WORK(&tbl->managed_work, neigh_managed_work);
+ queue_delayed_work(system_power_efficient_wq, &tbl->managed_work, 0);
- init_timer(&tbl->proxy_timer);
- tbl->proxy_timer.data = (unsigned long)tbl;
- tbl->proxy_timer.function = neigh_proxy_process;
- skb_queue_head_init(&tbl->proxy_queue);
+ timer_setup(&tbl->proxy_timer, neigh_proxy_process, 0);
+ skb_queue_head_init_class(&tbl->proxy_queue,
+ &neigh_table_proxy_queue_class);
tbl->last_flush = now;
tbl->last_rand = now + tbl->parms.reachable_time * 20;
-}
-
-void neigh_table_init(struct neigh_table *tbl)
-{
- struct neigh_table *tmp;
- neigh_table_init_no_netlink(tbl);
- write_lock(&neigh_tbl_lock);
- for (tmp = neigh_tables; tmp; tmp = tmp->next) {
- if (tmp->family == tbl->family)
- break;
- }
- tbl->next = neigh_tables;
- neigh_tables = tbl;
- write_unlock(&neigh_tbl_lock);
-
- if (unlikely(tmp)) {
- printk(KERN_ERR "NEIGH: Registering multiple tables for "
- "family %d\n", tbl->family);
- dump_stack();
- }
+ rcu_assign_pointer(neigh_tables[index], tbl);
}
+EXPORT_SYMBOL(neigh_table_init);
-int neigh_table_clear(struct neigh_table *tbl)
+/*
+ * Only called from ndisc_cleanup(), which means this is dead code
+ * because we no longer can unload IPv6 module.
+ */
+int neigh_table_clear(int index, struct neigh_table *tbl)
{
- struct neigh_table **tp;
+ RCU_INIT_POINTER(neigh_tables[index], NULL);
+ synchronize_rcu();
/* It is not clean... Fix it to unload IPv6 module safely */
- del_timer_sync(&tbl->gc_timer);
- del_timer_sync(&tbl->proxy_timer);
- pneigh_queue_purge(&tbl->proxy_queue);
+ cancel_delayed_work_sync(&tbl->managed_work);
+ cancel_delayed_work_sync(&tbl->gc_work);
+ timer_delete_sync(&tbl->proxy_timer);
+ pneigh_queue_purge(&tbl->proxy_queue, NULL, tbl->family);
neigh_ifdown(tbl, NULL);
if (atomic_read(&tbl->entries))
- printk(KERN_CRIT "neighbour leakage\n");
- write_lock(&neigh_tbl_lock);
- for (tp = &neigh_tables; *tp; tp = &(*tp)->next) {
- if (*tp == tbl) {
- *tp = tbl->next;
- break;
- }
- }
- write_unlock(&neigh_tbl_lock);
+ pr_crit("neighbour leakage\n");
- neigh_hash_free(tbl->hash_buckets, tbl->hash_mask + 1);
- tbl->hash_buckets = NULL;
+ call_rcu(&rcu_dereference_protected(tbl->nht, 1)->rcu,
+ neigh_hash_free_rcu);
+ tbl->nht = NULL;
kfree(tbl->phash_buckets);
tbl->phash_buckets = NULL;
+ remove_proc_entry(tbl->id, init_net.proc_net_stat);
+
free_percpu(tbl->stats);
tbl->stats = NULL;
return 0;
}
+EXPORT_SYMBOL(neigh_table_clear);
-int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+static struct neigh_table *neigh_find_table(int family)
+{
+ struct neigh_table *tbl = NULL;
+
+ switch (family) {
+ case AF_INET:
+ tbl = rcu_dereference_rtnl(neigh_tables[NEIGH_ARP_TABLE]);
+ break;
+ case AF_INET6:
+ tbl = rcu_dereference_rtnl(neigh_tables[NEIGH_ND_TABLE]);
+ break;
+ }
+
+ return tbl;
+}
+
+const struct nla_policy nda_policy[NDA_MAX+1] = {
+ [NDA_UNSPEC] = { .strict_start_type = NDA_NH_ID },
+ [NDA_DST] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN },
+ [NDA_LLADDR] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN },
+ [NDA_CACHEINFO] = { .len = sizeof(struct nda_cacheinfo) },
+ [NDA_PROBES] = { .type = NLA_U32 },
+ [NDA_VLAN] = { .type = NLA_U16 },
+ [NDA_PORT] = { .type = NLA_U16 },
+ [NDA_VNI] = { .type = NLA_U32 },
+ [NDA_IFINDEX] = { .type = NLA_U32 },
+ [NDA_MASTER] = { .type = NLA_U32 },
+ [NDA_PROTOCOL] = { .type = NLA_U8 },
+ [NDA_NH_ID] = { .type = NLA_U32 },
+ [NDA_FLAGS_EXT] = NLA_POLICY_MASK(NLA_U32, NTF_EXT_MASK),
+ [NDA_FDB_EXT_ATTRS] = { .type = NLA_NESTED },
+};
+
+static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
{
+ struct net *net = sock_net(skb->sk);
struct ndmsg *ndm;
struct nlattr *dst_attr;
struct neigh_table *tbl;
+ struct neighbour *neigh;
struct net_device *dev = NULL;
int err = -EINVAL;
+ ASSERT_RTNL();
if (nlmsg_len(nlh) < sizeof(*ndm))
goto out;
dst_attr = nlmsg_find_attr(nlh, sizeof(*ndm), NDA_DST);
- if (dst_attr == NULL)
+ if (!dst_attr) {
+ NL_SET_ERR_MSG(extack, "Network address not specified");
goto out;
+ }
ndm = nlmsg_data(nlh);
if (ndm->ndm_ifindex) {
- dev = dev_get_by_index(ndm->ndm_ifindex);
+ dev = __dev_get_by_index(net, ndm->ndm_ifindex);
if (dev == NULL) {
err = -ENODEV;
goto out;
}
}
- read_lock(&neigh_tbl_lock);
- for (tbl = neigh_tables; tbl; tbl = tbl->next) {
- struct neighbour *neigh;
-
- if (tbl->family != ndm->ndm_family)
- continue;
- read_unlock(&neigh_tbl_lock);
-
- if (nla_len(dst_attr) < tbl->key_len)
- goto out_dev_put;
+ tbl = neigh_find_table(ndm->ndm_family);
+ if (tbl == NULL)
+ return -EAFNOSUPPORT;
- if (ndm->ndm_flags & NTF_PROXY) {
- err = pneigh_delete(tbl, nla_data(dst_attr), dev);
- goto out_dev_put;
- }
+ if (nla_len(dst_attr) < (int)tbl->key_len) {
+ NL_SET_ERR_MSG(extack, "Invalid network address");
+ goto out;
+ }
- if (dev == NULL)
- goto out_dev_put;
+ if (ndm->ndm_flags & NTF_PROXY) {
+ err = pneigh_delete(tbl, net, nla_data(dst_attr), dev);
+ goto out;
+ }
- neigh = neigh_lookup(tbl, nla_data(dst_attr), dev);
- if (neigh == NULL) {
- err = -ENOENT;
- goto out_dev_put;
- }
+ if (dev == NULL)
+ goto out;
- err = neigh_update(neigh, NULL, NUD_FAILED,
- NEIGH_UPDATE_F_OVERRIDE |
- NEIGH_UPDATE_F_ADMIN);
- neigh_release(neigh);
- goto out_dev_put;
+ neigh = neigh_lookup(tbl, nla_data(dst_attr), dev);
+ if (neigh == NULL) {
+ err = -ENOENT;
+ goto out;
}
- read_unlock(&neigh_tbl_lock);
- err = -EAFNOSUPPORT;
-out_dev_put:
- if (dev)
- dev_put(dev);
+ err = __neigh_update(neigh, NULL, NUD_FAILED,
+ NEIGH_UPDATE_F_OVERRIDE | NEIGH_UPDATE_F_ADMIN,
+ NETLINK_CB(skb).portid, extack);
+ spin_lock_bh(&tbl->lock);
+ neigh_release(neigh);
+ neigh_remove_one(neigh);
+ spin_unlock_bh(&tbl->lock);
+
out:
return err;
}
-int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
{
+ int flags = NEIGH_UPDATE_F_ADMIN | NEIGH_UPDATE_F_OVERRIDE |
+ NEIGH_UPDATE_F_OVERRIDE_ISROUTER;
+ struct net *net = sock_net(skb->sk);
struct ndmsg *ndm;
struct nlattr *tb[NDA_MAX+1];
struct neigh_table *tbl;
struct net_device *dev = NULL;
+ struct neighbour *neigh;
+ void *dst, *lladdr;
+ u8 protocol = 0;
+ u32 ndm_flags;
int err;
- err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL);
+ ASSERT_RTNL();
+ err = nlmsg_parse_deprecated(nlh, sizeof(*ndm), tb, NDA_MAX,
+ nda_policy, extack);
if (err < 0)
goto out;
err = -EINVAL;
- if (tb[NDA_DST] == NULL)
+ if (!tb[NDA_DST]) {
+ NL_SET_ERR_MSG(extack, "Network address not specified");
goto out;
+ }
ndm = nlmsg_data(nlh);
+ ndm_flags = ndm->ndm_flags;
+ if (tb[NDA_FLAGS_EXT]) {
+ u32 ext = nla_get_u32(tb[NDA_FLAGS_EXT]);
+
+ BUILD_BUG_ON(sizeof(neigh->flags) * BITS_PER_BYTE <
+ (sizeof(ndm->ndm_flags) * BITS_PER_BYTE +
+ hweight32(NTF_EXT_MASK)));
+ ndm_flags |= (ext << NTF_EXT_SHIFT);
+ }
if (ndm->ndm_ifindex) {
- dev = dev_get_by_index(ndm->ndm_ifindex);
+ dev = __dev_get_by_index(net, ndm->ndm_ifindex);
if (dev == NULL) {
err = -ENODEV;
goto out;
}
- if (tb[NDA_LLADDR] && nla_len(tb[NDA_LLADDR]) < dev->addr_len)
- goto out_dev_put;
+ if (tb[NDA_LLADDR] && nla_len(tb[NDA_LLADDR]) < dev->addr_len) {
+ NL_SET_ERR_MSG(extack, "Invalid link address");
+ goto out;
+ }
}
- read_lock(&neigh_tbl_lock);
- for (tbl = neigh_tables; tbl; tbl = tbl->next) {
- int flags = NEIGH_UPDATE_F_ADMIN | NEIGH_UPDATE_F_OVERRIDE;
- struct neighbour *neigh;
- void *dst, *lladdr;
+ tbl = neigh_find_table(ndm->ndm_family);
+ if (tbl == NULL)
+ return -EAFNOSUPPORT;
- if (tbl->family != ndm->ndm_family)
- continue;
- read_unlock(&neigh_tbl_lock);
+ if (nla_len(tb[NDA_DST]) < (int)tbl->key_len) {
+ NL_SET_ERR_MSG(extack, "Invalid network address");
+ goto out;
+ }
+
+ dst = nla_data(tb[NDA_DST]);
+ lladdr = tb[NDA_LLADDR] ? nla_data(tb[NDA_LLADDR]) : NULL;
- if (nla_len(tb[NDA_DST]) < tbl->key_len)
- goto out_dev_put;
- dst = nla_data(tb[NDA_DST]);
- lladdr = tb[NDA_LLADDR] ? nla_data(tb[NDA_LLADDR]) : NULL;
+ if (tb[NDA_PROTOCOL])
+ protocol = nla_get_u8(tb[NDA_PROTOCOL]);
+ if (ndm_flags & NTF_PROXY) {
+ if (ndm_flags & (NTF_MANAGED | NTF_EXT_VALIDATED)) {
+ NL_SET_ERR_MSG(extack, "Invalid NTF_* flag combination");
+ goto out;
+ }
+
+ err = pneigh_create(tbl, net, dst, dev, ndm_flags, protocol,
+ !!(ndm->ndm_state & NUD_PERMANENT));
+ goto out;
+ }
- if (ndm->ndm_flags & NTF_PROXY) {
- struct pneigh_entry *pn;
+ if (!dev) {
+ NL_SET_ERR_MSG(extack, "Device not specified");
+ goto out;
+ }
- err = -ENOBUFS;
- pn = pneigh_lookup(tbl, dst, dev, 1);
- if (pn) {
- pn->flags = ndm->ndm_flags;
- err = 0;
- }
- goto out_dev_put;
+ if (tbl->allow_add && !tbl->allow_add(dev, extack)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ neigh = neigh_lookup(tbl, dst, dev);
+ if (neigh == NULL) {
+ bool ndm_permanent = ndm->ndm_state & NUD_PERMANENT;
+ bool exempt_from_gc = ndm_permanent ||
+ ndm_flags & (NTF_EXT_LEARNED |
+ NTF_EXT_VALIDATED);
+
+ if (!(nlh->nlmsg_flags & NLM_F_CREATE)) {
+ err = -ENOENT;
+ goto out;
+ }
+ if (ndm_permanent && (ndm_flags & NTF_MANAGED)) {
+ NL_SET_ERR_MSG(extack, "Invalid NTF_* flag for permanent entry");
+ err = -EINVAL;
+ goto out;
}
+ if (ndm_flags & NTF_EXT_VALIDATED) {
+ u8 state = ndm->ndm_state;
- if (dev == NULL)
- goto out_dev_put;
+ /* NTF_USE and NTF_MANAGED will result in the neighbor
+ * being created with an invalid state (NUD_NONE).
+ */
+ if (ndm_flags & (NTF_USE | NTF_MANAGED))
+ state = NUD_NONE;
- neigh = neigh_lookup(tbl, dst, dev);
- if (neigh == NULL) {
- if (!(nlh->nlmsg_flags & NLM_F_CREATE)) {
- err = -ENOENT;
- goto out_dev_put;
- }
-
- neigh = __neigh_lookup_errno(tbl, dst, dev);
- if (IS_ERR(neigh)) {
- err = PTR_ERR(neigh);
- goto out_dev_put;
- }
- } else {
- if (nlh->nlmsg_flags & NLM_F_EXCL) {
- err = -EEXIST;
- neigh_release(neigh);
- goto out_dev_put;
+ if (!(state & NUD_VALID)) {
+ NL_SET_ERR_MSG(extack,
+ "Cannot create externally validated neighbor with an invalid state");
+ err = -EINVAL;
+ goto out;
}
+ }
- if (!(nlh->nlmsg_flags & NLM_F_REPLACE))
- flags &= ~NEIGH_UPDATE_F_OVERRIDE;
+ neigh = ___neigh_create(tbl, dst, dev,
+ ndm_flags &
+ (NTF_EXT_LEARNED | NTF_MANAGED |
+ NTF_EXT_VALIDATED),
+ exempt_from_gc, true);
+ if (IS_ERR(neigh)) {
+ err = PTR_ERR(neigh);
+ goto out;
+ }
+ } else {
+ if (nlh->nlmsg_flags & NLM_F_EXCL) {
+ err = -EEXIST;
+ neigh_release(neigh);
+ goto out;
}
+ if (ndm_flags & NTF_EXT_VALIDATED) {
+ u8 state = ndm->ndm_state;
- err = neigh_update(neigh, lladdr, ndm->ndm_state, flags);
- neigh_release(neigh);
- goto out_dev_put;
- }
+ /* NTF_USE and NTF_MANAGED do not update the existing
+ * state other than clearing it if it was
+ * NUD_PERMANENT.
+ */
+ if (ndm_flags & (NTF_USE | NTF_MANAGED))
+ state = READ_ONCE(neigh->nud_state) & ~NUD_PERMANENT;
- read_unlock(&neigh_tbl_lock);
- err = -EAFNOSUPPORT;
+ if (!(state & NUD_VALID)) {
+ NL_SET_ERR_MSG(extack,
+ "Cannot mark neighbor as externally validated with an invalid state");
+ err = -EINVAL;
+ neigh_release(neigh);
+ goto out;
+ }
+ }
-out_dev_put:
- if (dev)
- dev_put(dev);
+ if (!(nlh->nlmsg_flags & NLM_F_REPLACE))
+ flags &= ~(NEIGH_UPDATE_F_OVERRIDE |
+ NEIGH_UPDATE_F_OVERRIDE_ISROUTER);
+ }
+
+ if (protocol)
+ neigh->protocol = protocol;
+ if (ndm_flags & NTF_EXT_LEARNED)
+ flags |= NEIGH_UPDATE_F_EXT_LEARNED;
+ if (ndm_flags & NTF_ROUTER)
+ flags |= NEIGH_UPDATE_F_ISROUTER;
+ if (ndm_flags & NTF_MANAGED)
+ flags |= NEIGH_UPDATE_F_MANAGED;
+ if (ndm_flags & NTF_USE)
+ flags |= NEIGH_UPDATE_F_USE;
+ if (ndm_flags & NTF_EXT_VALIDATED)
+ flags |= NEIGH_UPDATE_F_EXT_VALIDATED;
+
+ err = __neigh_update(neigh, lladdr, ndm->ndm_state, flags,
+ NETLINK_CB(skb).portid, extack);
+ if (!err && ndm_flags & (NTF_USE | NTF_MANAGED))
+ neigh_event_send(neigh, NULL);
+ neigh_release(neigh);
out:
return err;
}
@@ -1599,33 +2173,50 @@ static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms)
{
struct nlattr *nest;
- nest = nla_nest_start(skb, NDTA_PARMS);
+ nest = nla_nest_start_noflag(skb, NDTA_PARMS);
if (nest == NULL)
return -ENOBUFS;
- if (parms->dev)
- NLA_PUT_U32(skb, NDTPA_IFINDEX, parms->dev->ifindex);
-
- NLA_PUT_U32(skb, NDTPA_REFCNT, atomic_read(&parms->refcnt));
- NLA_PUT_U32(skb, NDTPA_QUEUE_LEN, parms->queue_len);
- NLA_PUT_U32(skb, NDTPA_PROXY_QLEN, parms->proxy_qlen);
- NLA_PUT_U32(skb, NDTPA_APP_PROBES, parms->app_probes);
- NLA_PUT_U32(skb, NDTPA_UCAST_PROBES, parms->ucast_probes);
- NLA_PUT_U32(skb, NDTPA_MCAST_PROBES, parms->mcast_probes);
- NLA_PUT_MSECS(skb, NDTPA_REACHABLE_TIME, parms->reachable_time);
- NLA_PUT_MSECS(skb, NDTPA_BASE_REACHABLE_TIME,
- parms->base_reachable_time);
- NLA_PUT_MSECS(skb, NDTPA_GC_STALETIME, parms->gc_staletime);
- NLA_PUT_MSECS(skb, NDTPA_DELAY_PROBE_TIME, parms->delay_probe_time);
- NLA_PUT_MSECS(skb, NDTPA_RETRANS_TIME, parms->retrans_time);
- NLA_PUT_MSECS(skb, NDTPA_ANYCAST_DELAY, parms->anycast_delay);
- NLA_PUT_MSECS(skb, NDTPA_PROXY_DELAY, parms->proxy_delay);
- NLA_PUT_MSECS(skb, NDTPA_LOCKTIME, parms->locktime);
-
+ if ((parms->dev &&
+ nla_put_u32(skb, NDTPA_IFINDEX, READ_ONCE(parms->dev->ifindex))) ||
+ nla_put_u32(skb, NDTPA_REFCNT, refcount_read(&parms->refcnt)) ||
+ nla_put_u32(skb, NDTPA_QUEUE_LENBYTES,
+ NEIGH_VAR(parms, QUEUE_LEN_BYTES)) ||
+ /* approximative value for deprecated QUEUE_LEN (in packets) */
+ nla_put_u32(skb, NDTPA_QUEUE_LEN,
+ NEIGH_VAR(parms, QUEUE_LEN_BYTES) / SKB_TRUESIZE(ETH_FRAME_LEN)) ||
+ nla_put_u32(skb, NDTPA_PROXY_QLEN, NEIGH_VAR(parms, PROXY_QLEN)) ||
+ nla_put_u32(skb, NDTPA_APP_PROBES, NEIGH_VAR(parms, APP_PROBES)) ||
+ nla_put_u32(skb, NDTPA_UCAST_PROBES,
+ NEIGH_VAR(parms, UCAST_PROBES)) ||
+ nla_put_u32(skb, NDTPA_MCAST_PROBES,
+ NEIGH_VAR(parms, MCAST_PROBES)) ||
+ nla_put_u32(skb, NDTPA_MCAST_REPROBES,
+ NEIGH_VAR(parms, MCAST_REPROBES)) ||
+ nla_put_msecs(skb, NDTPA_REACHABLE_TIME, READ_ONCE(parms->reachable_time),
+ NDTPA_PAD) ||
+ nla_put_msecs(skb, NDTPA_BASE_REACHABLE_TIME,
+ NEIGH_VAR(parms, BASE_REACHABLE_TIME), NDTPA_PAD) ||
+ nla_put_msecs(skb, NDTPA_GC_STALETIME,
+ NEIGH_VAR(parms, GC_STALETIME), NDTPA_PAD) ||
+ nla_put_msecs(skb, NDTPA_DELAY_PROBE_TIME,
+ NEIGH_VAR(parms, DELAY_PROBE_TIME), NDTPA_PAD) ||
+ nla_put_msecs(skb, NDTPA_RETRANS_TIME,
+ NEIGH_VAR(parms, RETRANS_TIME), NDTPA_PAD) ||
+ nla_put_msecs(skb, NDTPA_ANYCAST_DELAY,
+ NEIGH_VAR(parms, ANYCAST_DELAY), NDTPA_PAD) ||
+ nla_put_msecs(skb, NDTPA_PROXY_DELAY,
+ NEIGH_VAR(parms, PROXY_DELAY), NDTPA_PAD) ||
+ nla_put_msecs(skb, NDTPA_LOCKTIME,
+ NEIGH_VAR(parms, LOCKTIME), NDTPA_PAD) ||
+ nla_put_msecs(skb, NDTPA_INTERVAL_PROBE_TIME_MS,
+ NEIGH_VAR(parms, INTERVAL_PROBE_TIME_MS), NDTPA_PAD))
+ goto nla_put_failure;
return nla_nest_end(skb, nest);
nla_put_failure:
- return nla_nest_cancel(skb, nest);
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
}
static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl,
@@ -1636,39 +2227,40 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl,
nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndtmsg), flags);
if (nlh == NULL)
- return -ENOBUFS;
+ return -EMSGSIZE;
ndtmsg = nlmsg_data(nlh);
-
- read_lock_bh(&tbl->lock);
ndtmsg->ndtm_family = tbl->family;
ndtmsg->ndtm_pad1 = 0;
ndtmsg->ndtm_pad2 = 0;
- NLA_PUT_STRING(skb, NDTA_NAME, tbl->id);
- NLA_PUT_MSECS(skb, NDTA_GC_INTERVAL, tbl->gc_interval);
- NLA_PUT_U32(skb, NDTA_THRESH1, tbl->gc_thresh1);
- NLA_PUT_U32(skb, NDTA_THRESH2, tbl->gc_thresh2);
- NLA_PUT_U32(skb, NDTA_THRESH3, tbl->gc_thresh3);
-
+ if (nla_put_string(skb, NDTA_NAME, tbl->id) ||
+ nla_put_msecs(skb, NDTA_GC_INTERVAL, READ_ONCE(tbl->gc_interval),
+ NDTA_PAD) ||
+ nla_put_u32(skb, NDTA_THRESH1, READ_ONCE(tbl->gc_thresh1)) ||
+ nla_put_u32(skb, NDTA_THRESH2, READ_ONCE(tbl->gc_thresh2)) ||
+ nla_put_u32(skb, NDTA_THRESH3, READ_ONCE(tbl->gc_thresh3)))
+ goto nla_put_failure;
{
unsigned long now = jiffies;
- unsigned int flush_delta = now - tbl->last_flush;
- unsigned int rand_delta = now - tbl->last_rand;
-
+ long flush_delta = now - READ_ONCE(tbl->last_flush);
+ long rand_delta = now - READ_ONCE(tbl->last_rand);
+ struct neigh_hash_table *nht;
struct ndt_config ndc = {
.ndtc_key_len = tbl->key_len,
.ndtc_entry_size = tbl->entry_size,
.ndtc_entries = atomic_read(&tbl->entries),
.ndtc_last_flush = jiffies_to_msecs(flush_delta),
.ndtc_last_rand = jiffies_to_msecs(rand_delta),
- .ndtc_hash_rnd = tbl->hash_rnd,
- .ndtc_hash_mask = tbl->hash_mask,
- .ndtc_hash_chain_gc = tbl->hash_chain_gc,
- .ndtc_proxy_qlen = tbl->proxy_queue.qlen,
+ .ndtc_proxy_qlen = READ_ONCE(tbl->proxy_queue.qlen),
};
- NLA_PUT(skb, NDTA_CONFIG, sizeof(ndc), &ndc);
+ nht = rcu_dereference(tbl->nht);
+ ndc.ndtc_hash_rnd = nht->hash_rnd[0];
+ ndc.ndtc_hash_mask = ((1 << nht->hash_shift) - 1);
+
+ if (nla_put(skb, NDTA_CONFIG, sizeof(ndc), &ndc))
+ goto nla_put_failure;
}
{
@@ -1681,31 +2273,34 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl,
struct neigh_statistics *st;
st = per_cpu_ptr(tbl->stats, cpu);
- ndst.ndts_allocs += st->allocs;
- ndst.ndts_destroys += st->destroys;
- ndst.ndts_hash_grows += st->hash_grows;
- ndst.ndts_res_failed += st->res_failed;
- ndst.ndts_lookups += st->lookups;
- ndst.ndts_hits += st->hits;
- ndst.ndts_rcv_probes_mcast += st->rcv_probes_mcast;
- ndst.ndts_rcv_probes_ucast += st->rcv_probes_ucast;
- ndst.ndts_periodic_gc_runs += st->periodic_gc_runs;
- ndst.ndts_forced_gc_runs += st->forced_gc_runs;
+ ndst.ndts_allocs += READ_ONCE(st->allocs);
+ ndst.ndts_destroys += READ_ONCE(st->destroys);
+ ndst.ndts_hash_grows += READ_ONCE(st->hash_grows);
+ ndst.ndts_res_failed += READ_ONCE(st->res_failed);
+ ndst.ndts_lookups += READ_ONCE(st->lookups);
+ ndst.ndts_hits += READ_ONCE(st->hits);
+ ndst.ndts_rcv_probes_mcast += READ_ONCE(st->rcv_probes_mcast);
+ ndst.ndts_rcv_probes_ucast += READ_ONCE(st->rcv_probes_ucast);
+ ndst.ndts_periodic_gc_runs += READ_ONCE(st->periodic_gc_runs);
+ ndst.ndts_forced_gc_runs += READ_ONCE(st->forced_gc_runs);
+ ndst.ndts_table_fulls += READ_ONCE(st->table_fulls);
}
- NLA_PUT(skb, NDTA_STATS, sizeof(ndst), &ndst);
+ if (nla_put_64bit(skb, NDTA_STATS, sizeof(ndst), &ndst,
+ NDTA_PAD))
+ goto nla_put_failure;
}
BUG_ON(tbl->parms.dev);
if (neightbl_fill_parms(skb, &tbl->parms) < 0)
goto nla_put_failure;
- read_unlock_bh(&tbl->lock);
- return nlmsg_end(skb, nlh);
+ nlmsg_end(skb, nlh);
+ return 0;
nla_put_failure:
- read_unlock_bh(&tbl->lock);
- return nlmsg_cancel(skb, nlh);
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
}
static int neightbl_fill_param_info(struct sk_buff *skb,
@@ -1719,11 +2314,9 @@ static int neightbl_fill_param_info(struct sk_buff *skb,
nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndtmsg), flags);
if (nlh == NULL)
- return -ENOBUFS;
+ return -EMSGSIZE;
ndtmsg = nlmsg_data(nlh);
-
- read_lock_bh(&tbl->lock);
ndtmsg->ndtm_family = tbl->family;
ndtmsg->ndtm_pad1 = 0;
ndtmsg->ndtm_pad2 = 0;
@@ -1732,27 +2325,14 @@ static int neightbl_fill_param_info(struct sk_buff *skb,
neightbl_fill_parms(skb, parms) < 0)
goto errout;
- read_unlock_bh(&tbl->lock);
- return nlmsg_end(skb, nlh);
+ nlmsg_end(skb, nlh);
+ return 0;
errout:
- read_unlock_bh(&tbl->lock);
- return nlmsg_cancel(skb, nlh);
-}
-
-static inline struct neigh_parms *lookup_neigh_params(struct neigh_table *tbl,
- int ifindex)
-{
- struct neigh_parms *p;
-
- for (p = &tbl->parms; p; p = p->next)
- if ((p->dev && p->dev->ifindex == ifindex) ||
- (!p->dev && !ifindex))
- return p;
-
- return NULL;
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
}
-static struct nla_policy nl_neightbl_policy[NDTA_MAX+1] __read_mostly = {
+static const struct nla_policy nl_neightbl_policy[NDTA_MAX+1] = {
[NDTA_NAME] = { .type = NLA_STRING },
[NDTA_THRESH1] = { .type = NLA_U32 },
[NDTA_THRESH2] = { .type = NLA_U32 },
@@ -1761,13 +2341,15 @@ static struct nla_policy nl_neightbl_policy[NDTA_MAX+1] __read_mostly = {
[NDTA_PARMS] = { .type = NLA_NESTED },
};
-static struct nla_policy nl_ntbl_parm_policy[NDTPA_MAX+1] __read_mostly = {
+static const struct nla_policy nl_ntbl_parm_policy[NDTPA_MAX+1] = {
[NDTPA_IFINDEX] = { .type = NLA_U32 },
[NDTPA_QUEUE_LEN] = { .type = NLA_U32 },
+ [NDTPA_QUEUE_LENBYTES] = { .type = NLA_U32 },
[NDTPA_PROXY_QLEN] = { .type = NLA_U32 },
[NDTPA_APP_PROBES] = { .type = NLA_U32 },
[NDTPA_UCAST_PROBES] = { .type = NLA_U32 },
[NDTPA_MCAST_PROBES] = { .type = NLA_U32 },
+ [NDTPA_MCAST_REPROBES] = { .type = NLA_U32 },
[NDTPA_BASE_REACHABLE_TIME] = { .type = NLA_U64 },
[NDTPA_GC_STALETIME] = { .type = NLA_U64 },
[NDTPA_DELAY_PROBE_TIME] = { .type = NLA_U64 },
@@ -1775,17 +2357,21 @@ static struct nla_policy nl_ntbl_parm_policy[NDTPA_MAX+1] __read_mostly = {
[NDTPA_ANYCAST_DELAY] = { .type = NLA_U64 },
[NDTPA_PROXY_DELAY] = { .type = NLA_U64 },
[NDTPA_LOCKTIME] = { .type = NLA_U64 },
+ [NDTPA_INTERVAL_PROBE_TIME_MS] = { .type = NLA_U64, .min = 1 },
};
-int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
{
+ struct net *net = sock_net(skb->sk);
+ struct nlattr *tb[NDTA_MAX + 1];
struct neigh_table *tbl;
struct ndtmsg *ndtmsg;
- struct nlattr *tb[NDTA_MAX+1];
- int err;
+ bool found = false;
+ int err, tidx;
- err = nlmsg_parse(nlh, sizeof(*ndtmsg), tb, NDTA_MAX,
- nl_neightbl_policy);
+ err = nlmsg_parse_deprecated(nlh, sizeof(*ndtmsg), tb, NDTA_MAX,
+ nl_neightbl_policy, extack);
if (err < 0)
goto errout;
@@ -1795,40 +2381,50 @@ int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
}
ndtmsg = nlmsg_data(nlh);
- read_lock(&neigh_tbl_lock);
- for (tbl = neigh_tables; tbl; tbl = tbl->next) {
+
+ rcu_read_lock();
+
+ for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) {
+ tbl = rcu_dereference(neigh_tables[tidx]);
+ if (!tbl)
+ continue;
+
if (ndtmsg->ndtm_family && tbl->family != ndtmsg->ndtm_family)
continue;
- if (nla_strcmp(tb[NDTA_NAME], tbl->id) == 0)
+ if (nla_strcmp(tb[NDTA_NAME], tbl->id) == 0) {
+ found = true;
break;
+ }
}
- if (tbl == NULL) {
+ if (!found) {
+ rcu_read_unlock();
err = -ENOENT;
- goto errout_locked;
+ goto errout;
}
- /*
+ /*
* We acquire tbl->lock to be nice to the periodic timers and
* make sure they always see a consistent set of values.
*/
- write_lock_bh(&tbl->lock);
+ spin_lock_bh(&tbl->lock);
if (tb[NDTA_PARMS]) {
struct nlattr *tbp[NDTPA_MAX+1];
struct neigh_parms *p;
int i, ifindex = 0;
- err = nla_parse_nested(tbp, NDTPA_MAX, tb[NDTA_PARMS],
- nl_ntbl_parm_policy);
+ err = nla_parse_nested_deprecated(tbp, NDTPA_MAX,
+ tb[NDTA_PARMS],
+ nl_ntbl_parm_policy, extack);
if (err < 0)
goto errout_tbl_lock;
if (tbp[NDTPA_IFINDEX])
ifindex = nla_get_u32(tbp[NDTPA_IFINDEX]);
- p = lookup_neigh_params(tbl, ifindex);
+ p = lookup_neigh_parms(tbl, net, ifindex);
if (p == NULL) {
err = -ENOENT;
goto errout_tbl_lock;
@@ -1840,104 +2436,186 @@ int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
switch (i) {
case NDTPA_QUEUE_LEN:
- p->queue_len = nla_get_u32(tbp[i]);
+ NEIGH_VAR_SET(p, QUEUE_LEN_BYTES,
+ nla_get_u32(tbp[i]) *
+ SKB_TRUESIZE(ETH_FRAME_LEN));
+ break;
+ case NDTPA_QUEUE_LENBYTES:
+ NEIGH_VAR_SET(p, QUEUE_LEN_BYTES,
+ nla_get_u32(tbp[i]));
break;
case NDTPA_PROXY_QLEN:
- p->proxy_qlen = nla_get_u32(tbp[i]);
+ NEIGH_VAR_SET(p, PROXY_QLEN,
+ nla_get_u32(tbp[i]));
break;
case NDTPA_APP_PROBES:
- p->app_probes = nla_get_u32(tbp[i]);
+ NEIGH_VAR_SET(p, APP_PROBES,
+ nla_get_u32(tbp[i]));
break;
case NDTPA_UCAST_PROBES:
- p->ucast_probes = nla_get_u32(tbp[i]);
+ NEIGH_VAR_SET(p, UCAST_PROBES,
+ nla_get_u32(tbp[i]));
break;
case NDTPA_MCAST_PROBES:
- p->mcast_probes = nla_get_u32(tbp[i]);
+ NEIGH_VAR_SET(p, MCAST_PROBES,
+ nla_get_u32(tbp[i]));
+ break;
+ case NDTPA_MCAST_REPROBES:
+ NEIGH_VAR_SET(p, MCAST_REPROBES,
+ nla_get_u32(tbp[i]));
break;
case NDTPA_BASE_REACHABLE_TIME:
- p->base_reachable_time = nla_get_msecs(tbp[i]);
+ NEIGH_VAR_SET(p, BASE_REACHABLE_TIME,
+ nla_get_msecs(tbp[i]));
+ /* update reachable_time as well, otherwise, the change will
+ * only be effective after the next time neigh_periodic_work
+ * decides to recompute it (can be multiple minutes)
+ */
+ neigh_set_reach_time(p);
break;
case NDTPA_GC_STALETIME:
- p->gc_staletime = nla_get_msecs(tbp[i]);
+ NEIGH_VAR_SET(p, GC_STALETIME,
+ nla_get_msecs(tbp[i]));
break;
case NDTPA_DELAY_PROBE_TIME:
- p->delay_probe_time = nla_get_msecs(tbp[i]);
+ NEIGH_VAR_SET(p, DELAY_PROBE_TIME,
+ nla_get_msecs(tbp[i]));
+ call_netevent_notifiers(NETEVENT_DELAY_PROBE_TIME_UPDATE, p);
+ break;
+ case NDTPA_INTERVAL_PROBE_TIME_MS:
+ NEIGH_VAR_SET(p, INTERVAL_PROBE_TIME_MS,
+ nla_get_msecs(tbp[i]));
break;
case NDTPA_RETRANS_TIME:
- p->retrans_time = nla_get_msecs(tbp[i]);
+ NEIGH_VAR_SET(p, RETRANS_TIME,
+ nla_get_msecs(tbp[i]));
break;
case NDTPA_ANYCAST_DELAY:
- p->anycast_delay = nla_get_msecs(tbp[i]);
+ NEIGH_VAR_SET(p, ANYCAST_DELAY,
+ nla_get_msecs(tbp[i]));
break;
case NDTPA_PROXY_DELAY:
- p->proxy_delay = nla_get_msecs(tbp[i]);
+ NEIGH_VAR_SET(p, PROXY_DELAY,
+ nla_get_msecs(tbp[i]));
break;
case NDTPA_LOCKTIME:
- p->locktime = nla_get_msecs(tbp[i]);
+ NEIGH_VAR_SET(p, LOCKTIME,
+ nla_get_msecs(tbp[i]));
break;
}
}
}
+ err = -ENOENT;
+ if ((tb[NDTA_THRESH1] || tb[NDTA_THRESH2] ||
+ tb[NDTA_THRESH3] || tb[NDTA_GC_INTERVAL]) &&
+ !net_eq(net, &init_net))
+ goto errout_tbl_lock;
+
if (tb[NDTA_THRESH1])
- tbl->gc_thresh1 = nla_get_u32(tb[NDTA_THRESH1]);
+ WRITE_ONCE(tbl->gc_thresh1, nla_get_u32(tb[NDTA_THRESH1]));
if (tb[NDTA_THRESH2])
- tbl->gc_thresh2 = nla_get_u32(tb[NDTA_THRESH2]);
+ WRITE_ONCE(tbl->gc_thresh2, nla_get_u32(tb[NDTA_THRESH2]));
if (tb[NDTA_THRESH3])
- tbl->gc_thresh3 = nla_get_u32(tb[NDTA_THRESH3]);
+ WRITE_ONCE(tbl->gc_thresh3, nla_get_u32(tb[NDTA_THRESH3]));
if (tb[NDTA_GC_INTERVAL])
- tbl->gc_interval = nla_get_msecs(tb[NDTA_GC_INTERVAL]);
+ WRITE_ONCE(tbl->gc_interval, nla_get_msecs(tb[NDTA_GC_INTERVAL]));
err = 0;
errout_tbl_lock:
- write_unlock_bh(&tbl->lock);
-errout_locked:
- read_unlock(&neigh_tbl_lock);
+ spin_unlock_bh(&tbl->lock);
+ rcu_read_unlock();
errout:
return err;
}
-int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
+static int neightbl_valid_dump_info(const struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
{
+ struct ndtmsg *ndtm;
+
+ ndtm = nlmsg_payload(nlh, sizeof(*ndtm));
+ if (!ndtm) {
+ NL_SET_ERR_MSG(extack, "Invalid header for neighbor table dump request");
+ return -EINVAL;
+ }
+
+ if (ndtm->ndtm_pad1 || ndtm->ndtm_pad2) {
+ NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor table dump request");
+ return -EINVAL;
+ }
+
+ if (nlmsg_attrlen(nlh, sizeof(*ndtm))) {
+ NL_SET_ERR_MSG(extack, "Invalid data after header in neighbor table dump request");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ const struct nlmsghdr *nlh = cb->nlh;
+ struct net *net = sock_net(skb->sk);
int family, tidx, nidx = 0;
int tbl_skip = cb->args[0];
int neigh_skip = cb->args[1];
struct neigh_table *tbl;
- family = ((struct rtgenmsg *) nlmsg_data(cb->nlh))->rtgen_family;
+ if (cb->strict_check) {
+ int err = neightbl_valid_dump_info(nlh, cb->extack);
+
+ if (err < 0)
+ return err;
+ }
+
+ family = ((struct rtgenmsg *)nlmsg_data(nlh))->rtgen_family;
+
+ rcu_read_lock();
- read_lock(&neigh_tbl_lock);
- for (tbl = neigh_tables, tidx = 0; tbl; tbl = tbl->next, tidx++) {
+ for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) {
struct neigh_parms *p;
+ tbl = rcu_dereference(neigh_tables[tidx]);
+ if (!tbl)
+ continue;
+
if (tidx < tbl_skip || (family && tbl->family != family))
continue;
- if (neightbl_fill_info(skb, tbl, NETLINK_CB(cb->skb).pid,
- cb->nlh->nlmsg_seq, RTM_NEWNEIGHTBL,
- NLM_F_MULTI) <= 0)
+ if (neightbl_fill_info(skb, tbl, NETLINK_CB(cb->skb).portid,
+ nlh->nlmsg_seq, RTM_NEWNEIGHTBL,
+ NLM_F_MULTI) < 0)
break;
- for (nidx = 0, p = tbl->parms.next; p; p = p->next, nidx++) {
- if (nidx < neigh_skip)
+ nidx = 0;
+ p = list_next_entry(&tbl->parms, list);
+ list_for_each_entry_from_rcu(p, &tbl->parms_list, list) {
+ if (!net_eq(neigh_parms_net(p), net))
continue;
+ if (nidx < neigh_skip)
+ goto next;
+
if (neightbl_fill_param_info(skb, tbl, p,
- NETLINK_CB(cb->skb).pid,
- cb->nlh->nlmsg_seq,
+ NETLINK_CB(cb->skb).portid,
+ nlh->nlmsg_seq,
RTM_NEWNEIGHTBL,
- NLM_F_MULTI) <= 0)
+ NLM_F_MULTI) < 0)
goto out;
+ next:
+ nidx++;
}
neigh_skip = 0;
}
out:
- read_unlock(&neigh_tbl_lock);
+ rcu_read_unlock();
+
cb->args[0] = tidx;
cb->args[1] = nidx;
@@ -1947,6 +2625,7 @@ out:
static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh,
u32 pid, u32 seq, int type, unsigned int flags)
{
+ u32 neigh_flags, neigh_flags_ext;
unsigned long now = jiffies;
struct nda_cacheinfo ci;
struct nlmsghdr *nlh;
@@ -1954,112 +2633,509 @@ static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh,
nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), flags);
if (nlh == NULL)
- return -ENOBUFS;
+ return -EMSGSIZE;
+
+ neigh_flags_ext = neigh->flags >> NTF_EXT_SHIFT;
+ neigh_flags = neigh->flags & NTF_OLD_MASK;
ndm = nlmsg_data(nlh);
ndm->ndm_family = neigh->ops->family;
ndm->ndm_pad1 = 0;
ndm->ndm_pad2 = 0;
- ndm->ndm_flags = neigh->flags;
+ ndm->ndm_flags = neigh_flags;
ndm->ndm_type = neigh->type;
ndm->ndm_ifindex = neigh->dev->ifindex;
- NLA_PUT(skb, NDA_DST, neigh->tbl->key_len, neigh->primary_key);
+ if (nla_put(skb, NDA_DST, neigh->tbl->key_len, neigh->primary_key))
+ goto nla_put_failure;
read_lock_bh(&neigh->lock);
ndm->ndm_state = neigh->nud_state;
- if ((neigh->nud_state & NUD_VALID) &&
- nla_put(skb, NDA_LLADDR, neigh->dev->addr_len, neigh->ha) < 0) {
- read_unlock_bh(&neigh->lock);
- goto nla_put_failure;
+ if (neigh->nud_state & NUD_VALID) {
+ char haddr[MAX_ADDR_LEN];
+
+ neigh_ha_snapshot(haddr, neigh, neigh->dev);
+ if (nla_put(skb, NDA_LLADDR, neigh->dev->addr_len, haddr) < 0) {
+ read_unlock_bh(&neigh->lock);
+ goto nla_put_failure;
+ }
}
- ci.ndm_used = now - neigh->used;
- ci.ndm_confirmed = now - neigh->confirmed;
- ci.ndm_updated = now - neigh->updated;
- ci.ndm_refcnt = atomic_read(&neigh->refcnt) - 1;
+ ci.ndm_used = jiffies_to_clock_t(now - neigh->used);
+ ci.ndm_confirmed = jiffies_to_clock_t(now - neigh->confirmed);
+ ci.ndm_updated = jiffies_to_clock_t(now - neigh->updated);
+ ci.ndm_refcnt = refcount_read(&neigh->refcnt) - 1;
read_unlock_bh(&neigh->lock);
- NLA_PUT_U32(skb, NDA_PROBES, atomic_read(&neigh->probes));
- NLA_PUT(skb, NDA_CACHEINFO, sizeof(ci), &ci);
+ if (nla_put_u32(skb, NDA_PROBES, atomic_read(&neigh->probes)) ||
+ nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci))
+ goto nla_put_failure;
+
+ if (neigh->protocol && nla_put_u8(skb, NDA_PROTOCOL, neigh->protocol))
+ goto nla_put_failure;
+ if (neigh_flags_ext && nla_put_u32(skb, NDA_FLAGS_EXT, neigh_flags_ext))
+ goto nla_put_failure;
- return nlmsg_end(skb, nlh);
+ nlmsg_end(skb, nlh);
+ return 0;
nla_put_failure:
- return nlmsg_cancel(skb, nlh);
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
}
+static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn,
+ u32 pid, u32 seq, int type, unsigned int flags,
+ struct neigh_table *tbl)
+{
+ u32 neigh_flags, neigh_flags_ext;
+ struct nlmsghdr *nlh;
+ struct ndmsg *ndm;
+ u8 protocol;
+
+ nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), flags);
+ if (nlh == NULL)
+ return -EMSGSIZE;
+
+ neigh_flags = READ_ONCE(pn->flags);
+ neigh_flags_ext = neigh_flags >> NTF_EXT_SHIFT;
+ neigh_flags &= NTF_OLD_MASK;
+
+ ndm = nlmsg_data(nlh);
+ ndm->ndm_family = tbl->family;
+ ndm->ndm_pad1 = 0;
+ ndm->ndm_pad2 = 0;
+ ndm->ndm_flags = neigh_flags | NTF_PROXY;
+ ndm->ndm_type = RTN_UNICAST;
+ ndm->ndm_ifindex = pn->dev ? pn->dev->ifindex : 0;
+ ndm->ndm_state = NUD_NONE;
+
+ if (nla_put(skb, NDA_DST, tbl->key_len, pn->key))
+ goto nla_put_failure;
+
+ protocol = READ_ONCE(pn->protocol);
+ if (protocol && nla_put_u8(skb, NDA_PROTOCOL, protocol))
+ goto nla_put_failure;
+ if (neigh_flags_ext && nla_put_u32(skb, NDA_FLAGS_EXT, neigh_flags_ext))
+ goto nla_put_failure;
+
+ nlmsg_end(skb, nlh);
+ return 0;
+
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+static void neigh_update_notify(struct neighbour *neigh, u32 nlmsg_pid)
+{
+ call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh);
+ __neigh_notify(neigh, RTM_NEWNEIGH, 0, nlmsg_pid);
+}
+
+static bool neigh_master_filtered(struct net_device *dev, int master_idx)
+{
+ struct net_device *master;
+
+ if (!master_idx)
+ return false;
+
+ master = dev ? netdev_master_upper_dev_get_rcu(dev) : NULL;
+
+ /* 0 is already used to denote NDA_MASTER wasn't passed, therefore need another
+ * invalid value for ifindex to denote "no master".
+ */
+ if (master_idx == -1)
+ return !!master;
+
+ if (!master || master->ifindex != master_idx)
+ return true;
+
+ return false;
+}
+
+static bool neigh_ifindex_filtered(struct net_device *dev, int filter_idx)
+{
+ if (filter_idx && (!dev || dev->ifindex != filter_idx))
+ return true;
+
+ return false;
+}
+
+struct neigh_dump_filter {
+ int master_idx;
+ int dev_idx;
+};
static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
- struct netlink_callback *cb)
+ struct netlink_callback *cb,
+ struct neigh_dump_filter *filter)
{
+ struct net *net = sock_net(skb->sk);
struct neighbour *n;
- int rc, h, s_h = cb->args[1];
+ int err = 0, h, s_h = cb->args[1];
int idx, s_idx = idx = cb->args[2];
+ struct neigh_hash_table *nht;
+ unsigned int flags = NLM_F_MULTI;
- read_lock_bh(&tbl->lock);
- for (h = 0; h <= tbl->hash_mask; h++) {
- if (h < s_h)
- continue;
+ if (filter->dev_idx || filter->master_idx)
+ flags |= NLM_F_DUMP_FILTERED;
+
+ nht = rcu_dereference(tbl->nht);
+
+ for (h = s_h; h < (1 << nht->hash_shift); h++) {
if (h > s_h)
s_idx = 0;
- for (n = tbl->hash_buckets[h], idx = 0; n; n = n->next, idx++) {
- if (idx < s_idx)
- continue;
- if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).pid,
- cb->nlh->nlmsg_seq,
- RTM_NEWNEIGH,
- NLM_F_MULTI) <= 0) {
- read_unlock_bh(&tbl->lock);
- rc = -1;
+ idx = 0;
+ neigh_for_each_in_bucket_rcu(n, &nht->hash_heads[h]) {
+ if (idx < s_idx || !net_eq(dev_net(n->dev), net))
+ goto next;
+ if (neigh_ifindex_filtered(n->dev, filter->dev_idx) ||
+ neigh_master_filtered(n->dev, filter->master_idx))
+ goto next;
+ err = neigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWNEIGH, flags);
+ if (err < 0)
goto out;
- }
+next:
+ idx++;
}
}
- read_unlock_bh(&tbl->lock);
- rc = skb->len;
out:
cb->args[1] = h;
cb->args[2] = idx;
- return rc;
+ return err;
}
-int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
+static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
+ struct netlink_callback *cb,
+ struct neigh_dump_filter *filter)
{
+ struct pneigh_entry *n;
+ struct net *net = sock_net(skb->sk);
+ int err = 0, h, s_h = cb->args[3];
+ int idx, s_idx = idx = cb->args[4];
+ unsigned int flags = NLM_F_MULTI;
+
+ if (filter->dev_idx || filter->master_idx)
+ flags |= NLM_F_DUMP_FILTERED;
+
+ for (h = s_h; h <= PNEIGH_HASHMASK; h++) {
+ if (h > s_h)
+ s_idx = 0;
+ for (n = rcu_dereference(tbl->phash_buckets[h]), idx = 0;
+ n;
+ n = rcu_dereference(n->next)) {
+ if (idx < s_idx || pneigh_net(n) != net)
+ goto next;
+ if (neigh_ifindex_filtered(n->dev, filter->dev_idx) ||
+ neigh_master_filtered(n->dev, filter->master_idx))
+ goto next;
+ err = pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWNEIGH, flags, tbl);
+ if (err < 0)
+ goto out;
+ next:
+ idx++;
+ }
+ }
+
+out:
+ cb->args[3] = h;
+ cb->args[4] = idx;
+ return err;
+}
+
+static int neigh_valid_dump_req(const struct nlmsghdr *nlh,
+ bool strict_check,
+ struct neigh_dump_filter *filter,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[NDA_MAX + 1];
+ int err, i;
+
+ if (strict_check) {
+ struct ndmsg *ndm;
+
+ ndm = nlmsg_payload(nlh, sizeof(*ndm));
+ if (!ndm) {
+ NL_SET_ERR_MSG(extack, "Invalid header for neighbor dump request");
+ return -EINVAL;
+ }
+
+ if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_ifindex ||
+ ndm->ndm_state || ndm->ndm_type) {
+ NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor dump request");
+ return -EINVAL;
+ }
+
+ if (ndm->ndm_flags & ~NTF_PROXY) {
+ NL_SET_ERR_MSG(extack, "Invalid flags in header for neighbor dump request");
+ return -EINVAL;
+ }
+
+ err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct ndmsg),
+ tb, NDA_MAX, nda_policy,
+ extack);
+ } else {
+ err = nlmsg_parse_deprecated(nlh, sizeof(struct ndmsg), tb,
+ NDA_MAX, nda_policy, extack);
+ }
+ if (err < 0)
+ return err;
+
+ for (i = 0; i <= NDA_MAX; ++i) {
+ if (!tb[i])
+ continue;
+
+ /* all new attributes should require strict_check */
+ switch (i) {
+ case NDA_IFINDEX:
+ filter->dev_idx = nla_get_u32(tb[i]);
+ break;
+ case NDA_MASTER:
+ filter->master_idx = nla_get_u32(tb[i]);
+ break;
+ default:
+ if (strict_check) {
+ NL_SET_ERR_MSG(extack, "Unsupported attribute in neighbor dump request");
+ return -EINVAL;
+ }
+ }
+ }
+
+ return 0;
+}
+
+static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ const struct nlmsghdr *nlh = cb->nlh;
+ struct neigh_dump_filter filter = {};
struct neigh_table *tbl;
int t, family, s_t;
+ int proxy = 0;
+ int err;
+
+ family = ((struct rtgenmsg *)nlmsg_data(nlh))->rtgen_family;
+
+ /* check for full ndmsg structure presence, family member is
+ * the same for both structures
+ */
+ if (nlmsg_len(nlh) >= sizeof(struct ndmsg) &&
+ ((struct ndmsg *)nlmsg_data(nlh))->ndm_flags == NTF_PROXY)
+ proxy = 1;
+
+ err = neigh_valid_dump_req(nlh, cb->strict_check, &filter, cb->extack);
+ if (err < 0 && cb->strict_check)
+ return err;
+ err = 0;
- read_lock(&neigh_tbl_lock);
- family = ((struct rtgenmsg *) nlmsg_data(cb->nlh))->rtgen_family;
s_t = cb->args[0];
- for (tbl = neigh_tables, t = 0; tbl; tbl = tbl->next, t++) {
+ rcu_read_lock();
+ for (t = 0; t < NEIGH_NR_TABLES; t++) {
+ tbl = rcu_dereference(neigh_tables[t]);
+
+ if (!tbl)
+ continue;
if (t < s_t || (family && tbl->family != family))
continue;
if (t > s_t)
memset(&cb->args[1], 0, sizeof(cb->args) -
sizeof(cb->args[0]));
- if (neigh_dump_table(tbl, skb, cb) < 0)
+ if (proxy)
+ err = pneigh_dump_table(tbl, skb, cb, &filter);
+ else
+ err = neigh_dump_table(tbl, skb, cb, &filter);
+ if (err < 0)
break;
}
- read_unlock(&neigh_tbl_lock);
+ rcu_read_unlock();
cb->args[0] = t;
- return skb->len;
+ return err;
+}
+
+static struct ndmsg *neigh_valid_get_req(const struct nlmsghdr *nlh,
+ struct nlattr **tb,
+ struct netlink_ext_ack *extack)
+{
+ struct ndmsg *ndm;
+ int err, i;
+
+ ndm = nlmsg_payload(nlh, sizeof(*ndm));
+ if (!ndm) {
+ NL_SET_ERR_MSG(extack, "Invalid header for neighbor get request");
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state ||
+ ndm->ndm_type) {
+ NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor get request");
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (ndm->ndm_flags & ~NTF_PROXY) {
+ NL_SET_ERR_MSG(extack, "Invalid flags in header for neighbor get request");
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (!(ndm->ndm_flags & NTF_PROXY) && !ndm->ndm_ifindex) {
+ NL_SET_ERR_MSG(extack, "No device specified");
+ return ERR_PTR(-EINVAL);
+ }
+
+ err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct ndmsg), tb,
+ NDA_MAX, nda_policy, extack);
+ if (err < 0)
+ return ERR_PTR(err);
+
+ for (i = 0; i <= NDA_MAX; ++i) {
+ switch (i) {
+ case NDA_DST:
+ if (!tb[i]) {
+ NL_SET_ERR_ATTR_MISS(extack, NULL, NDA_DST);
+ return ERR_PTR(-EINVAL);
+ }
+ break;
+ default:
+ if (!tb[i])
+ continue;
+
+ NL_SET_ERR_MSG(extack, "Unsupported attribute in neighbor get request");
+ return ERR_PTR(-EINVAL);
+ }
+ }
+
+ return ndm;
+}
+
+static inline size_t neigh_nlmsg_size(void)
+{
+ return NLMSG_ALIGN(sizeof(struct ndmsg))
+ + nla_total_size(MAX_ADDR_LEN) /* NDA_DST */
+ + nla_total_size(MAX_ADDR_LEN) /* NDA_LLADDR */
+ + nla_total_size(sizeof(struct nda_cacheinfo))
+ + nla_total_size(4) /* NDA_PROBES */
+ + nla_total_size(4) /* NDA_FLAGS_EXT */
+ + nla_total_size(1); /* NDA_PROTOCOL */
+}
+
+static inline size_t pneigh_nlmsg_size(void)
+{
+ return NLMSG_ALIGN(sizeof(struct ndmsg))
+ + nla_total_size(MAX_ADDR_LEN) /* NDA_DST */
+ + nla_total_size(4) /* NDA_FLAGS_EXT */
+ + nla_total_size(1); /* NDA_PROTOCOL */
+}
+
+static int neigh_get(struct sk_buff *in_skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(in_skb->sk);
+ u32 pid = NETLINK_CB(in_skb).portid;
+ struct nlattr *tb[NDA_MAX + 1];
+ struct net_device *dev = NULL;
+ u32 seq = nlh->nlmsg_seq;
+ struct neigh_table *tbl;
+ struct neighbour *neigh;
+ struct sk_buff *skb;
+ struct ndmsg *ndm;
+ void *dst;
+ int err;
+
+ ndm = neigh_valid_get_req(nlh, tb, extack);
+ if (IS_ERR(ndm))
+ return PTR_ERR(ndm);
+
+ if (ndm->ndm_flags & NTF_PROXY)
+ skb = nlmsg_new(neigh_nlmsg_size(), GFP_KERNEL);
+ else
+ skb = nlmsg_new(pneigh_nlmsg_size(), GFP_KERNEL);
+ if (!skb)
+ return -ENOBUFS;
+
+ rcu_read_lock();
+
+ tbl = neigh_find_table(ndm->ndm_family);
+ if (!tbl) {
+ NL_SET_ERR_MSG(extack, "Unsupported family in header for neighbor get request");
+ err = -EAFNOSUPPORT;
+ goto err_unlock;
+ }
+
+ if (nla_len(tb[NDA_DST]) != (int)tbl->key_len) {
+ NL_SET_ERR_MSG(extack, "Invalid network address in neighbor get request");
+ err = -EINVAL;
+ goto err_unlock;
+ }
+
+ dst = nla_data(tb[NDA_DST]);
+
+ if (ndm->ndm_ifindex) {
+ dev = dev_get_by_index_rcu(net, ndm->ndm_ifindex);
+ if (!dev) {
+ NL_SET_ERR_MSG(extack, "Unknown device ifindex");
+ err = -ENODEV;
+ goto err_unlock;
+ }
+ }
+
+ if (ndm->ndm_flags & NTF_PROXY) {
+ struct pneigh_entry *pn;
+
+ pn = pneigh_lookup(tbl, net, dst, dev);
+ if (!pn) {
+ NL_SET_ERR_MSG(extack, "Proxy neighbour entry not found");
+ err = -ENOENT;
+ goto err_unlock;
+ }
+
+ err = pneigh_fill_info(skb, pn, pid, seq, RTM_NEWNEIGH, 0, tbl);
+ if (err)
+ goto err_unlock;
+ } else {
+ neigh = neigh_lookup(tbl, dst, dev);
+ if (!neigh) {
+ NL_SET_ERR_MSG(extack, "Neighbour entry not found");
+ err = -ENOENT;
+ goto err_unlock;
+ }
+
+ err = neigh_fill_info(skb, neigh, pid, seq, RTM_NEWNEIGH, 0);
+ neigh_release(neigh);
+ if (err)
+ goto err_unlock;
+ }
+
+ rcu_read_unlock();
+
+ return rtnl_unicast(skb, net, pid);
+err_unlock:
+ rcu_read_unlock();
+ kfree_skb(skb);
+ return err;
}
void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void *), void *cookie)
{
int chain;
+ struct neigh_hash_table *nht;
+
+ rcu_read_lock();
+ nht = rcu_dereference(tbl->nht);
- read_lock_bh(&tbl->lock);
- for (chain = 0; chain <= tbl->hash_mask; chain++) {
+ spin_lock_bh(&tbl->lock); /* avoid resizes */
+ for (chain = 0; chain < (1 << nht->hash_shift); chain++) {
struct neighbour *n;
- for (n = tbl->hash_buckets[chain]; n; n = n->next)
+ neigh_for_each_in_bucket(n, &nht->hash_heads[chain])
cb(n, cookie);
}
- read_unlock_bh(&tbl->lock);
+ spin_unlock_bh(&tbl->lock);
+ rcu_read_unlock();
}
EXPORT_SYMBOL(neigh_for_each);
@@ -2067,66 +3143,128 @@ EXPORT_SYMBOL(neigh_for_each);
void __neigh_for_each_release(struct neigh_table *tbl,
int (*cb)(struct neighbour *))
{
+ struct neigh_hash_table *nht;
int chain;
- for (chain = 0; chain <= tbl->hash_mask; chain++) {
- struct neighbour *n, **np;
+ nht = rcu_dereference_protected(tbl->nht,
+ lockdep_is_held(&tbl->lock));
+ for (chain = 0; chain < (1 << nht->hash_shift); chain++) {
+ struct hlist_node *tmp;
+ struct neighbour *n;
- np = &tbl->hash_buckets[chain];
- while ((n = *np) != NULL) {
+ neigh_for_each_in_bucket_safe(n, tmp, &nht->hash_heads[chain]) {
int release;
write_lock(&n->lock);
release = cb(n);
if (release) {
- *np = n->next;
- n->dead = 1;
- } else
- np = &n->next;
+ hlist_del_rcu(&n->hash);
+ hlist_del_rcu(&n->dev_list);
+ neigh_mark_dead(n);
+ }
write_unlock(&n->lock);
if (release)
- neigh_release(n);
+ neigh_cleanup_and_release(n);
}
}
}
EXPORT_SYMBOL(__neigh_for_each_release);
+int neigh_xmit(int index, struct net_device *dev,
+ const void *addr, struct sk_buff *skb)
+{
+ int err = -EAFNOSUPPORT;
+
+ if (likely(index < NEIGH_NR_TABLES)) {
+ struct neigh_table *tbl;
+ struct neighbour *neigh;
+
+ rcu_read_lock();
+ tbl = rcu_dereference(neigh_tables[index]);
+ if (!tbl)
+ goto out_unlock;
+ if (index == NEIGH_ARP_TABLE) {
+ u32 key = *((u32 *)addr);
+
+ neigh = __ipv4_neigh_lookup_noref(dev, key);
+ } else {
+ neigh = __neigh_lookup_noref(tbl, addr, dev);
+ }
+ if (!neigh)
+ neigh = __neigh_create(tbl, addr, dev, false);
+ err = PTR_ERR(neigh);
+ if (IS_ERR(neigh)) {
+ rcu_read_unlock();
+ goto out_kfree_skb;
+ }
+ err = READ_ONCE(neigh->output)(neigh, skb);
+out_unlock:
+ rcu_read_unlock();
+ }
+ else if (index == NEIGH_LINK_TABLE) {
+ err = dev_hard_header(skb, dev, ntohs(skb->protocol),
+ addr, NULL, skb->len);
+ if (err < 0)
+ goto out_kfree_skb;
+ err = dev_queue_xmit(skb);
+ }
+out:
+ return err;
+out_kfree_skb:
+ kfree_skb(skb);
+ goto out;
+}
+EXPORT_SYMBOL(neigh_xmit);
+
#ifdef CONFIG_PROC_FS
+static struct neighbour *neigh_get_valid(struct seq_file *seq,
+ struct neighbour *n,
+ loff_t *pos)
+{
+ struct neigh_seq_state *state = seq->private;
+ struct net *net = seq_file_net(seq);
+
+ if (!net_eq(dev_net(n->dev), net))
+ return NULL;
+
+ if (state->neigh_sub_iter) {
+ loff_t fakep = 0;
+ void *v;
+
+ v = state->neigh_sub_iter(state, n, pos ? pos : &fakep);
+ if (!v)
+ return NULL;
+ if (pos)
+ return v;
+ }
+
+ if (!(state->flags & NEIGH_SEQ_SKIP_NOARP))
+ return n;
+
+ if (READ_ONCE(n->nud_state) & ~NUD_NOARP)
+ return n;
+
+ return NULL;
+}
+
static struct neighbour *neigh_get_first(struct seq_file *seq)
{
struct neigh_seq_state *state = seq->private;
- struct neigh_table *tbl = state->tbl;
- struct neighbour *n = NULL;
- int bucket = state->bucket;
+ struct neigh_hash_table *nht = state->nht;
+ struct neighbour *n, *tmp;
state->flags &= ~NEIGH_SEQ_IS_PNEIGH;
- for (bucket = 0; bucket <= tbl->hash_mask; bucket++) {
- n = tbl->hash_buckets[bucket];
-
- while (n) {
- if (state->neigh_sub_iter) {
- loff_t fakep = 0;
- void *v;
- v = state->neigh_sub_iter(state, n, &fakep);
- if (!v)
- goto next;
- }
- if (!(state->flags & NEIGH_SEQ_SKIP_NOARP))
- break;
- if (n->nud_state & ~NUD_NOARP)
- break;
- next:
- n = n->next;
+ while (++state->bucket < (1 << nht->hash_shift)) {
+ neigh_for_each_in_bucket(n, &nht->hash_heads[state->bucket]) {
+ tmp = neigh_get_valid(seq, n, NULL);
+ if (tmp)
+ return tmp;
}
-
- if (n)
- break;
}
- state->bucket = bucket;
- return n;
+ return NULL;
}
static struct neighbour *neigh_get_next(struct seq_file *seq,
@@ -2134,43 +3272,28 @@ static struct neighbour *neigh_get_next(struct seq_file *seq,
loff_t *pos)
{
struct neigh_seq_state *state = seq->private;
- struct neigh_table *tbl = state->tbl;
+ struct neighbour *tmp;
if (state->neigh_sub_iter) {
void *v = state->neigh_sub_iter(state, n, pos);
+
if (v)
return n;
}
- n = n->next;
-
- while (1) {
- while (n) {
- if (state->neigh_sub_iter) {
- void *v = state->neigh_sub_iter(state, n, pos);
- if (v)
- return n;
- goto next;
- }
- if (!(state->flags & NEIGH_SEQ_SKIP_NOARP))
- break;
- if (n->nud_state & ~NUD_NOARP)
- break;
- next:
- n = n->next;
+ hlist_for_each_entry_continue(n, hash) {
+ tmp = neigh_get_valid(seq, n, pos);
+ if (tmp) {
+ n = tmp;
+ goto out;
}
-
- if (n)
- break;
-
- if (++state->bucket > tbl->hash_mask)
- break;
-
- n = tbl->hash_buckets[state->bucket];
}
+ n = neigh_get_first(seq);
+out:
if (n && pos)
--(*pos);
+
return n;
}
@@ -2179,6 +3302,7 @@ static struct neighbour *neigh_get_idx(struct seq_file *seq, loff_t *pos)
struct neighbour *n = neigh_get_first(seq);
if (n) {
+ --(*pos);
while (*pos) {
n = neigh_get_next(seq, n, pos);
if (!n)
@@ -2191,13 +3315,17 @@ static struct neighbour *neigh_get_idx(struct seq_file *seq, loff_t *pos)
static struct pneigh_entry *pneigh_get_first(struct seq_file *seq)
{
struct neigh_seq_state *state = seq->private;
+ struct net *net = seq_file_net(seq);
struct neigh_table *tbl = state->tbl;
struct pneigh_entry *pn = NULL;
- int bucket = state->bucket;
+ int bucket;
state->flags |= NEIGH_SEQ_IS_PNEIGH;
for (bucket = 0; bucket <= PNEIGH_HASHMASK; bucket++) {
- pn = tbl->phash_buckets[bucket];
+ pn = rcu_dereference(tbl->phash_buckets[bucket]);
+
+ while (pn && !net_eq(pneigh_net(pn), net))
+ pn = rcu_dereference(pn->next);
if (pn)
break;
}
@@ -2211,13 +3339,21 @@ static struct pneigh_entry *pneigh_get_next(struct seq_file *seq,
loff_t *pos)
{
struct neigh_seq_state *state = seq->private;
+ struct net *net = seq_file_net(seq);
struct neigh_table *tbl = state->tbl;
- pn = pn->next;
+ do {
+ pn = rcu_dereference(pn->next);
+ } while (pn && !net_eq(pneigh_net(pn), net));
+
while (!pn) {
if (++state->bucket > PNEIGH_HASHMASK)
break;
- pn = tbl->phash_buckets[state->bucket];
+
+ pn = rcu_dereference(tbl->phash_buckets[state->bucket]);
+
+ while (pn && !net_eq(pneigh_net(pn), net))
+ pn = rcu_dereference(pn->next);
if (pn)
break;
}
@@ -2233,6 +3369,7 @@ static struct pneigh_entry *pneigh_get_idx(struct seq_file *seq, loff_t *pos)
struct pneigh_entry *pn = pneigh_get_first(seq);
if (pn) {
+ --(*pos);
while (*pos) {
pn = pneigh_get_next(seq, pn, pos);
if (!pn)
@@ -2246,27 +3383,30 @@ static void *neigh_get_idx_any(struct seq_file *seq, loff_t *pos)
{
struct neigh_seq_state *state = seq->private;
void *rc;
+ loff_t idxpos = *pos;
- rc = neigh_get_idx(seq, pos);
+ rc = neigh_get_idx(seq, &idxpos);
if (!rc && !(state->flags & NEIGH_SEQ_NEIGH_ONLY))
- rc = pneigh_get_idx(seq, pos);
+ rc = pneigh_get_idx(seq, &idxpos);
return rc;
}
void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl, unsigned int neigh_seq_flags)
+ __acquires(tbl->lock)
+ __acquires(rcu)
{
struct neigh_seq_state *state = seq->private;
- loff_t pos_minus_one;
state->tbl = tbl;
- state->bucket = 0;
+ state->bucket = -1;
state->flags = (neigh_seq_flags & ~NEIGH_SEQ_IS_PNEIGH);
- read_lock_bh(&tbl->lock);
+ rcu_read_lock();
+ state->nht = rcu_dereference(tbl->nht);
+ spin_lock_bh(&tbl->lock);
- pos_minus_one = *pos - 1;
- return *pos ? neigh_get_idx_any(seq, &pos_minus_one) : SEQ_START_TOKEN;
+ return *pos ? neigh_get_idx_any(seq, pos) : SEQ_START_TOKEN;
}
EXPORT_SYMBOL(neigh_seq_start);
@@ -2276,7 +3416,7 @@ void *neigh_seq_next(struct seq_file *seq, void *v, loff_t *pos)
void *rc;
if (v == SEQ_START_TOKEN) {
- rc = neigh_get_idx(seq, pos);
+ rc = neigh_get_first(seq);
goto out;
}
@@ -2298,11 +3438,14 @@ out:
EXPORT_SYMBOL(neigh_seq_next);
void neigh_seq_stop(struct seq_file *seq, void *v)
+ __releases(tbl->lock)
+ __releases(rcu)
{
struct neigh_seq_state *state = seq->private;
struct neigh_table *tbl = state->tbl;
- read_unlock_bh(&tbl->lock);
+ spin_unlock_bh(&tbl->lock);
+ rcu_read_unlock();
}
EXPORT_SYMBOL(neigh_seq_stop);
@@ -2310,14 +3453,13 @@ EXPORT_SYMBOL(neigh_seq_stop);
static void *neigh_stat_seq_start(struct seq_file *seq, loff_t *pos)
{
- struct proc_dir_entry *pde = seq->private;
- struct neigh_table *tbl = pde->data;
+ struct neigh_table *tbl = pde_data(file_inode(seq->file));
int cpu;
if (*pos == 0)
return SEQ_START_TOKEN;
-
- for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
+
+ for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
if (!cpu_possible(cpu))
continue;
*pos = cpu+1;
@@ -2328,16 +3470,16 @@ static void *neigh_stat_seq_start(struct seq_file *seq, loff_t *pos)
static void *neigh_stat_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
- struct proc_dir_entry *pde = seq->private;
- struct neigh_table *tbl = pde->data;
+ struct neigh_table *tbl = pde_data(file_inode(seq->file));
int cpu;
- for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
+ for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
if (!cpu_possible(cpu))
continue;
*pos = cpu+1;
return per_cpu_ptr(tbl->stats, cpu);
}
+ (*pos)++;
return NULL;
}
@@ -2348,17 +3490,17 @@ static void neigh_stat_seq_stop(struct seq_file *seq, void *v)
static int neigh_stat_seq_show(struct seq_file *seq, void *v)
{
- struct proc_dir_entry *pde = seq->private;
- struct neigh_table *tbl = pde->data;
+ struct neigh_table *tbl = pde_data(file_inode(seq->file));
struct neigh_statistics *st = v;
if (v == SEQ_START_TOKEN) {
- seq_printf(seq, "entries allocs destroys hash_grows lookups hits res_failed rcv_probes_mcast rcv_probes_ucast periodic_gc_runs forced_gc_runs\n");
+ seq_puts(seq, "entries allocs destroys hash_grows lookups hits res_failed rcv_probes_mcast rcv_probes_ucast periodic_gc_runs forced_gc_runs unresolved_discards table_fulls\n");
return 0;
}
- seq_printf(seq, "%08x %08lx %08lx %08lx %08lx %08lx %08lx "
- "%08lx %08lx %08lx %08lx\n",
+ seq_printf(seq, "%08x %08lx %08lx %08lx %08lx %08lx %08lx "
+ "%08lx %08lx %08lx "
+ "%08lx %08lx %08lx\n",
atomic_read(&tbl->entries),
st->allocs,
@@ -2374,389 +3516,417 @@ static int neigh_stat_seq_show(struct seq_file *seq, void *v)
st->rcv_probes_ucast,
st->periodic_gc_runs,
- st->forced_gc_runs
+ st->forced_gc_runs,
+ st->unres_discards,
+ st->table_fulls
);
return 0;
}
-static struct seq_operations neigh_stat_seq_ops = {
+static const struct seq_operations neigh_stat_seq_ops = {
.start = neigh_stat_seq_start,
.next = neigh_stat_seq_next,
.stop = neigh_stat_seq_stop,
.show = neigh_stat_seq_show,
};
+#endif /* CONFIG_PROC_FS */
-static int neigh_stat_seq_open(struct inode *inode, struct file *file)
+static void __neigh_notify(struct neighbour *n, int type, int flags,
+ u32 pid)
{
- int ret = seq_open(file, &neigh_stat_seq_ops);
+ struct sk_buff *skb;
+ int err = -ENOBUFS;
+ struct net *net;
+
+ rcu_read_lock();
+ net = dev_net_rcu(n->dev);
+ skb = nlmsg_new(neigh_nlmsg_size(), GFP_ATOMIC);
+ if (skb == NULL)
+ goto errout;
- if (!ret) {
- struct seq_file *sf = file->private_data;
- sf->private = PDE(inode);
+ err = neigh_fill_info(skb, n, pid, 0, type, flags);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in neigh_nlmsg_size() */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto errout;
}
+ rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
+ goto out;
+errout:
+ rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
+out:
+ rcu_read_unlock();
+}
+
+void neigh_app_ns(struct neighbour *n)
+{
+ __neigh_notify(n, RTM_GETNEIGH, NLM_F_REQUEST, 0);
+}
+EXPORT_SYMBOL(neigh_app_ns);
+
+#ifdef CONFIG_SYSCTL
+static int unres_qlen_max = INT_MAX / SKB_TRUESIZE(ETH_FRAME_LEN);
+
+static int proc_unres_qlen(const struct ctl_table *ctl, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ int size, ret;
+ struct ctl_table tmp = *ctl;
+
+ tmp.extra1 = SYSCTL_ZERO;
+ tmp.extra2 = &unres_qlen_max;
+ tmp.data = &size;
+
+ size = *(int *)ctl->data / SKB_TRUESIZE(ETH_FRAME_LEN);
+ ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+
+ if (write && !ret)
+ *(int *)ctl->data = size * SKB_TRUESIZE(ETH_FRAME_LEN);
return ret;
-};
+}
-static struct file_operations neigh_stat_seq_fops = {
- .owner = THIS_MODULE,
- .open = neigh_stat_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
+static void neigh_copy_dflt_parms(struct net *net, struct neigh_parms *p,
+ int index)
+{
+ struct net_device *dev;
+ int family = neigh_parms_family(p);
-#endif /* CONFIG_PROC_FS */
+ rcu_read_lock();
+ for_each_netdev_rcu(net, dev) {
+ struct neigh_parms *dst_p =
+ neigh_get_dev_parms_rcu(dev, family);
-#ifdef CONFIG_ARPD
-static inline size_t neigh_nlmsg_size(void)
+ if (dst_p && !test_bit(index, dst_p->data_state))
+ dst_p->data[index] = p->data[index];
+ }
+ rcu_read_unlock();
+}
+
+static void neigh_proc_update(const struct ctl_table *ctl, int write)
{
- return NLMSG_ALIGN(sizeof(struct ndmsg))
- + nla_total_size(MAX_ADDR_LEN) /* NDA_DST */
- + nla_total_size(MAX_ADDR_LEN) /* NDA_LLADDR */
- + nla_total_size(sizeof(struct nda_cacheinfo))
- + nla_total_size(4); /* NDA_PROBES */
+ struct net_device *dev = ctl->extra1;
+ struct neigh_parms *p = ctl->extra2;
+ struct net *net = neigh_parms_net(p);
+ int index = (int *) ctl->data - p->data;
+
+ if (!write)
+ return;
+
+ set_bit(index, p->data_state);
+ if (index == NEIGH_VAR_DELAY_PROBE_TIME)
+ call_netevent_notifiers(NETEVENT_DELAY_PROBE_TIME_UPDATE, p);
+ if (!dev) /* NULL dev means this is default value */
+ neigh_copy_dflt_parms(net, p, index);
}
-static void __neigh_notify(struct neighbour *n, int type, int flags)
+static int neigh_proc_dointvec_zero_intmax(const struct ctl_table *ctl, int write,
+ void *buffer, size_t *lenp,
+ loff_t *ppos)
{
- struct sk_buff *skb;
- int err = -ENOBUFS;
+ struct ctl_table tmp = *ctl;
+ int ret;
- skb = nlmsg_new(neigh_nlmsg_size(), GFP_ATOMIC);
- if (skb == NULL)
- goto errout;
+ tmp.extra1 = SYSCTL_ZERO;
+ tmp.extra2 = SYSCTL_INT_MAX;
- err = neigh_fill_info(skb, n, 0, 0, type, flags);
- /* failure implies BUG in neigh_nlmsg_size() */
- BUG_ON(err < 0);
+ ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+ neigh_proc_update(ctl, write);
+ return ret;
+}
- err = rtnl_notify(skb, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
-errout:
- if (err < 0)
- rtnl_set_sk_err(RTNLGRP_NEIGH, err);
+static int neigh_proc_dointvec_ms_jiffies_positive(const struct ctl_table *ctl, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table tmp = *ctl;
+ int ret;
+
+ int min = msecs_to_jiffies(1);
+
+ tmp.extra1 = &min;
+ tmp.extra2 = NULL;
+
+ ret = proc_dointvec_ms_jiffies_minmax(&tmp, write, buffer, lenp, ppos);
+ neigh_proc_update(ctl, write);
+ return ret;
}
-void neigh_app_ns(struct neighbour *n)
+int neigh_proc_dointvec(const struct ctl_table *ctl, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
{
- __neigh_notify(n, RTM_GETNEIGH, NLM_F_REQUEST);
+ int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
+
+ neigh_proc_update(ctl, write);
+ return ret;
}
+EXPORT_SYMBOL(neigh_proc_dointvec);
-static void neigh_app_notify(struct neighbour *n)
+int neigh_proc_dointvec_jiffies(const struct ctl_table *ctl, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
{
- __neigh_notify(n, RTM_NEWNEIGH, 0);
+ int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos);
+
+ neigh_proc_update(ctl, write);
+ return ret;
}
+EXPORT_SYMBOL(neigh_proc_dointvec_jiffies);
-#endif /* CONFIG_ARPD */
+static int neigh_proc_dointvec_userhz_jiffies(const struct ctl_table *ctl, int write,
+ void *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int ret = proc_dointvec_userhz_jiffies(ctl, write, buffer, lenp, ppos);
-#ifdef CONFIG_SYSCTL
+ neigh_proc_update(ctl, write);
+ return ret;
+}
+
+int neigh_proc_dointvec_ms_jiffies(const struct ctl_table *ctl, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ int ret = proc_dointvec_ms_jiffies(ctl, write, buffer, lenp, ppos);
+
+ neigh_proc_update(ctl, write);
+ return ret;
+}
+EXPORT_SYMBOL(neigh_proc_dointvec_ms_jiffies);
+
+static int neigh_proc_dointvec_unres_qlen(const struct ctl_table *ctl, int write,
+ void *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int ret = proc_unres_qlen(ctl, write, buffer, lenp, ppos);
+
+ neigh_proc_update(ctl, write);
+ return ret;
+}
+
+static int neigh_proc_base_reachable_time(const struct ctl_table *ctl, int write,
+ void *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ struct neigh_parms *p = ctl->extra2;
+ int ret;
+
+ if (strcmp(ctl->procname, "base_reachable_time") == 0)
+ ret = neigh_proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos);
+ else if (strcmp(ctl->procname, "base_reachable_time_ms") == 0)
+ ret = neigh_proc_dointvec_ms_jiffies(ctl, write, buffer, lenp, ppos);
+ else
+ ret = -1;
+
+ if (write && ret == 0) {
+ /* update reachable_time as well, otherwise, the change will
+ * only be effective after the next time neigh_periodic_work
+ * decides to recompute it
+ */
+ neigh_set_reach_time(p);
+ }
+ return ret;
+}
+
+#define NEIGH_PARMS_DATA_OFFSET(index) \
+ (&((struct neigh_parms *) 0)->data[index])
+
+#define NEIGH_SYSCTL_ENTRY(attr, data_attr, name, mval, proc) \
+ [NEIGH_VAR_ ## attr] = { \
+ .procname = name, \
+ .data = NEIGH_PARMS_DATA_OFFSET(NEIGH_VAR_ ## data_attr), \
+ .maxlen = sizeof(int), \
+ .mode = mval, \
+ .proc_handler = proc, \
+ }
+
+#define NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(attr, name) \
+ NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_zero_intmax)
+
+#define NEIGH_SYSCTL_JIFFIES_ENTRY(attr, name) \
+ NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_jiffies)
+
+#define NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(attr, name) \
+ NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_userhz_jiffies)
+
+#define NEIGH_SYSCTL_MS_JIFFIES_POSITIVE_ENTRY(attr, name) \
+ NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_ms_jiffies_positive)
+
+#define NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(attr, data_attr, name) \
+ NEIGH_SYSCTL_ENTRY(attr, data_attr, name, 0644, neigh_proc_dointvec_ms_jiffies)
+
+#define NEIGH_SYSCTL_UNRES_QLEN_REUSED_ENTRY(attr, data_attr, name) \
+ NEIGH_SYSCTL_ENTRY(attr, data_attr, name, 0644, neigh_proc_dointvec_unres_qlen)
static struct neigh_sysctl_table {
struct ctl_table_header *sysctl_header;
- ctl_table neigh_vars[__NET_NEIGH_MAX];
- ctl_table neigh_dev[2];
- ctl_table neigh_neigh_dir[2];
- ctl_table neigh_proto_dir[2];
- ctl_table neigh_root_dir[2];
+ struct ctl_table neigh_vars[NEIGH_VAR_MAX];
} neigh_sysctl_template __read_mostly = {
.neigh_vars = {
- {
- .ctl_name = NET_NEIGH_MCAST_SOLICIT,
- .procname = "mcast_solicit",
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_NEIGH_UCAST_SOLICIT,
- .procname = "ucast_solicit",
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_NEIGH_APP_SOLICIT,
- .procname = "app_solicit",
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_NEIGH_RETRANS_TIME,
- .procname = "retrans_time",
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_userhz_jiffies,
- },
- {
- .ctl_name = NET_NEIGH_REACHABLE_TIME,
- .procname = "base_reachable_time",
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- .strategy = &sysctl_jiffies,
- },
- {
- .ctl_name = NET_NEIGH_DELAY_PROBE_TIME,
- .procname = "delay_first_probe_time",
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- .strategy = &sysctl_jiffies,
- },
- {
- .ctl_name = NET_NEIGH_GC_STALE_TIME,
- .procname = "gc_stale_time",
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- .strategy = &sysctl_jiffies,
- },
- {
- .ctl_name = NET_NEIGH_UNRES_QLEN,
- .procname = "unres_qlen",
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_NEIGH_PROXY_QLEN,
- .procname = "proxy_qlen",
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_NEIGH_ANYCAST_DELAY,
- .procname = "anycast_delay",
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_userhz_jiffies,
- },
- {
- .ctl_name = NET_NEIGH_PROXY_DELAY,
- .procname = "proxy_delay",
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_userhz_jiffies,
- },
- {
- .ctl_name = NET_NEIGH_LOCKTIME,
- .procname = "locktime",
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_userhz_jiffies,
- },
- {
- .ctl_name = NET_NEIGH_GC_INTERVAL,
+ NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(MCAST_PROBES, "mcast_solicit"),
+ NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(UCAST_PROBES, "ucast_solicit"),
+ NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(APP_PROBES, "app_solicit"),
+ NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(MCAST_REPROBES, "mcast_resolicit"),
+ NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(RETRANS_TIME, "retrans_time"),
+ NEIGH_SYSCTL_JIFFIES_ENTRY(BASE_REACHABLE_TIME, "base_reachable_time"),
+ NEIGH_SYSCTL_JIFFIES_ENTRY(DELAY_PROBE_TIME, "delay_first_probe_time"),
+ NEIGH_SYSCTL_MS_JIFFIES_POSITIVE_ENTRY(INTERVAL_PROBE_TIME_MS,
+ "interval_probe_time_ms"),
+ NEIGH_SYSCTL_JIFFIES_ENTRY(GC_STALETIME, "gc_stale_time"),
+ NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(QUEUE_LEN_BYTES, "unres_qlen_bytes"),
+ NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(PROXY_QLEN, "proxy_qlen"),
+ NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(ANYCAST_DELAY, "anycast_delay"),
+ NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(PROXY_DELAY, "proxy_delay"),
+ NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(LOCKTIME, "locktime"),
+ NEIGH_SYSCTL_UNRES_QLEN_REUSED_ENTRY(QUEUE_LEN, QUEUE_LEN_BYTES, "unres_qlen"),
+ NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(RETRANS_TIME_MS, RETRANS_TIME, "retrans_time_ms"),
+ NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(BASE_REACHABLE_TIME_MS, BASE_REACHABLE_TIME, "base_reachable_time_ms"),
+ [NEIGH_VAR_GC_INTERVAL] = {
.procname = "gc_interval",
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- .strategy = &sysctl_jiffies,
+ .proc_handler = proc_dointvec_jiffies,
},
- {
- .ctl_name = NET_NEIGH_GC_THRESH1,
+ [NEIGH_VAR_GC_THRESH1] = {
.procname = "gc_thresh1",
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_INT_MAX,
+ .proc_handler = proc_dointvec_minmax,
},
- {
- .ctl_name = NET_NEIGH_GC_THRESH2,
+ [NEIGH_VAR_GC_THRESH2] = {
.procname = "gc_thresh2",
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_INT_MAX,
+ .proc_handler = proc_dointvec_minmax,
},
- {
- .ctl_name = NET_NEIGH_GC_THRESH3,
+ [NEIGH_VAR_GC_THRESH3] = {
.procname = "gc_thresh3",
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_NEIGH_RETRANS_TIME_MS,
- .procname = "retrans_time_ms",
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_ms_jiffies,
- .strategy = &sysctl_ms_jiffies,
- },
- {
- .ctl_name = NET_NEIGH_REACHABLE_TIME_MS,
- .procname = "base_reachable_time_ms",
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_ms_jiffies,
- .strategy = &sysctl_ms_jiffies,
- },
- },
- .neigh_dev = {
- {
- .ctl_name = NET_PROTO_CONF_DEFAULT,
- .procname = "default",
- .mode = 0555,
- },
- },
- .neigh_neigh_dir = {
- {
- .procname = "neigh",
- .mode = 0555,
- },
- },
- .neigh_proto_dir = {
- {
- .mode = 0555,
- },
- },
- .neigh_root_dir = {
- {
- .ctl_name = CTL_NET,
- .procname = "net",
- .mode = 0555,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_INT_MAX,
+ .proc_handler = proc_dointvec_minmax,
},
},
};
int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p,
- int p_id, int pdev_id, char *p_name,
- proc_handler *handler, ctl_handler *strategy)
+ proc_handler *handler)
{
- struct neigh_sysctl_table *t = kmemdup(&neigh_sysctl_template,
- sizeof(*t), GFP_KERNEL);
- const char *dev_name_source = NULL;
- char *dev_name = NULL;
- int err = 0;
+ int i;
+ struct neigh_sysctl_table *t;
+ const char *dev_name_source;
+ char neigh_path[ sizeof("net//neigh/") + IFNAMSIZ + IFNAMSIZ ];
+ char *p_name;
+ size_t neigh_vars_size;
+ t = kmemdup(&neigh_sysctl_template, sizeof(*t), GFP_KERNEL_ACCOUNT);
if (!t)
- return -ENOBUFS;
- t->neigh_vars[0].data = &p->mcast_probes;
- t->neigh_vars[1].data = &p->ucast_probes;
- t->neigh_vars[2].data = &p->app_probes;
- t->neigh_vars[3].data = &p->retrans_time;
- t->neigh_vars[4].data = &p->base_reachable_time;
- t->neigh_vars[5].data = &p->delay_probe_time;
- t->neigh_vars[6].data = &p->gc_staletime;
- t->neigh_vars[7].data = &p->queue_len;
- t->neigh_vars[8].data = &p->proxy_qlen;
- t->neigh_vars[9].data = &p->anycast_delay;
- t->neigh_vars[10].data = &p->proxy_delay;
- t->neigh_vars[11].data = &p->locktime;
+ goto err;
+
+ for (i = 0; i < NEIGH_VAR_GC_INTERVAL; i++) {
+ t->neigh_vars[i].data += (long) p;
+ t->neigh_vars[i].extra1 = dev;
+ t->neigh_vars[i].extra2 = p;
+ }
+ neigh_vars_size = ARRAY_SIZE(t->neigh_vars);
if (dev) {
dev_name_source = dev->name;
- t->neigh_dev[0].ctl_name = dev->ifindex;
- t->neigh_vars[12].procname = NULL;
- t->neigh_vars[13].procname = NULL;
- t->neigh_vars[14].procname = NULL;
- t->neigh_vars[15].procname = NULL;
+ /* Terminate the table early */
+ neigh_vars_size = NEIGH_VAR_BASE_REACHABLE_TIME_MS + 1;
} else {
- dev_name_source = t->neigh_dev[0].procname;
- t->neigh_vars[12].data = (int *)(p + 1);
- t->neigh_vars[13].data = (int *)(p + 1) + 1;
- t->neigh_vars[14].data = (int *)(p + 1) + 2;
- t->neigh_vars[15].data = (int *)(p + 1) + 3;
+ struct neigh_table *tbl = p->tbl;
+ dev_name_source = "default";
+ t->neigh_vars[NEIGH_VAR_GC_INTERVAL].data = &tbl->gc_interval;
+ t->neigh_vars[NEIGH_VAR_GC_THRESH1].data = &tbl->gc_thresh1;
+ t->neigh_vars[NEIGH_VAR_GC_THRESH2].data = &tbl->gc_thresh2;
+ t->neigh_vars[NEIGH_VAR_GC_THRESH3].data = &tbl->gc_thresh3;
}
- t->neigh_vars[16].data = &p->retrans_time;
- t->neigh_vars[17].data = &p->base_reachable_time;
-
- if (handler || strategy) {
+ if (handler) {
/* RetransTime */
- t->neigh_vars[3].proc_handler = handler;
- t->neigh_vars[3].strategy = strategy;
- t->neigh_vars[3].extra1 = dev;
+ t->neigh_vars[NEIGH_VAR_RETRANS_TIME].proc_handler = handler;
/* ReachableTime */
- t->neigh_vars[4].proc_handler = handler;
- t->neigh_vars[4].strategy = strategy;
- t->neigh_vars[4].extra1 = dev;
+ t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].proc_handler = handler;
/* RetransTime (in milliseconds)*/
- t->neigh_vars[16].proc_handler = handler;
- t->neigh_vars[16].strategy = strategy;
- t->neigh_vars[16].extra1 = dev;
+ t->neigh_vars[NEIGH_VAR_RETRANS_TIME_MS].proc_handler = handler;
/* ReachableTime (in milliseconds) */
- t->neigh_vars[17].proc_handler = handler;
- t->neigh_vars[17].strategy = strategy;
- t->neigh_vars[17].extra1 = dev;
- }
-
- dev_name = kstrdup(dev_name_source, GFP_KERNEL);
- if (!dev_name) {
- err = -ENOBUFS;
+ t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].proc_handler = handler;
+ } else {
+ /* Those handlers will update p->reachable_time after
+ * base_reachable_time(_ms) is set to ensure the new timer starts being
+ * applied after the next neighbour update instead of waiting for
+ * neigh_periodic_work to update its value (can be multiple minutes)
+ * So any handler that replaces them should do this as well
+ */
+ /* ReachableTime */
+ t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].proc_handler =
+ neigh_proc_base_reachable_time;
+ /* ReachableTime (in milliseconds) */
+ t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].proc_handler =
+ neigh_proc_base_reachable_time;
+ }
+
+ switch (neigh_parms_family(p)) {
+ case AF_INET:
+ p_name = "ipv4";
+ break;
+ case AF_INET6:
+ p_name = "ipv6";
+ break;
+ default:
+ BUG();
+ }
+
+ snprintf(neigh_path, sizeof(neigh_path), "net/%s/neigh/%s",
+ p_name, dev_name_source);
+ t->sysctl_header = register_net_sysctl_sz(neigh_parms_net(p),
+ neigh_path, t->neigh_vars,
+ neigh_vars_size);
+ if (!t->sysctl_header)
goto free;
- }
-
- t->neigh_dev[0].procname = dev_name;
-
- t->neigh_neigh_dir[0].ctl_name = pdev_id;
-
- t->neigh_proto_dir[0].procname = p_name;
- t->neigh_proto_dir[0].ctl_name = p_id;
-
- t->neigh_dev[0].child = t->neigh_vars;
- t->neigh_neigh_dir[0].child = t->neigh_dev;
- t->neigh_proto_dir[0].child = t->neigh_neigh_dir;
- t->neigh_root_dir[0].child = t->neigh_proto_dir;
- t->sysctl_header = register_sysctl_table(t->neigh_root_dir, 0);
- if (!t->sysctl_header) {
- err = -ENOBUFS;
- goto free_procname;
- }
p->sysctl_table = t;
return 0;
- /* error path */
- free_procname:
- kfree(dev_name);
- free:
+free:
kfree(t);
-
- return err;
+err:
+ return -ENOBUFS;
}
+EXPORT_SYMBOL(neigh_sysctl_register);
void neigh_sysctl_unregister(struct neigh_parms *p)
{
if (p->sysctl_table) {
struct neigh_sysctl_table *t = p->sysctl_table;
p->sysctl_table = NULL;
- unregister_sysctl_table(t->sysctl_header);
- kfree(t->neigh_dev[0].procname);
+ unregister_net_sysctl_table(t->sysctl_header);
kfree(t);
}
}
+EXPORT_SYMBOL(neigh_sysctl_unregister);
#endif /* CONFIG_SYSCTL */
-EXPORT_SYMBOL(__neigh_event_send);
-EXPORT_SYMBOL(neigh_changeaddr);
-EXPORT_SYMBOL(neigh_compat_output);
-EXPORT_SYMBOL(neigh_connected_output);
-EXPORT_SYMBOL(neigh_create);
-EXPORT_SYMBOL(neigh_delete);
-EXPORT_SYMBOL(neigh_destroy);
-EXPORT_SYMBOL(neigh_dump_info);
-EXPORT_SYMBOL(neigh_event_ns);
-EXPORT_SYMBOL(neigh_ifdown);
-EXPORT_SYMBOL(neigh_lookup);
-EXPORT_SYMBOL(neigh_lookup_nodev);
-EXPORT_SYMBOL(neigh_parms_alloc);
-EXPORT_SYMBOL(neigh_parms_release);
-EXPORT_SYMBOL(neigh_rand_reach_time);
-EXPORT_SYMBOL(neigh_resolve_output);
-EXPORT_SYMBOL(neigh_table_clear);
-EXPORT_SYMBOL(neigh_table_init);
-EXPORT_SYMBOL(neigh_table_init_no_netlink);
-EXPORT_SYMBOL(neigh_update);
-EXPORT_SYMBOL(pneigh_enqueue);
-EXPORT_SYMBOL(pneigh_lookup);
+static const struct rtnl_msg_handler neigh_rtnl_msg_handlers[] __initconst = {
+ {.msgtype = RTM_NEWNEIGH, .doit = neigh_add},
+ {.msgtype = RTM_DELNEIGH, .doit = neigh_delete},
+ {.msgtype = RTM_GETNEIGH, .doit = neigh_get, .dumpit = neigh_dump_info,
+ .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED},
+ {.msgtype = RTM_GETNEIGHTBL, .dumpit = neightbl_dump_info,
+ .flags = RTNL_FLAG_DUMP_UNLOCKED},
+ {.msgtype = RTM_SETNEIGHTBL, .doit = neightbl_set,
+ .flags = RTNL_FLAG_DOIT_UNLOCKED},
+};
-#ifdef CONFIG_ARPD
-EXPORT_SYMBOL(neigh_app_ns);
-#endif
-#ifdef CONFIG_SYSCTL
-EXPORT_SYMBOL(neigh_sysctl_register);
-EXPORT_SYMBOL(neigh_sysctl_unregister);
-#endif
+static int __init neigh_init(void)
+{
+ rtnl_register_many(neigh_rtnl_msg_handlers);
+ return 0;
+}
+
+subsys_initcall(neigh_init);
diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c
new file mode 100644
index 000000000000..70e0e9a3b650
--- /dev/null
+++ b/net/core/net-procfs.c
@@ -0,0 +1,399 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/netdevice.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <net/wext.h>
+#include <net/hotdata.h>
+
+#include "dev.h"
+
+static void *dev_seq_from_index(struct seq_file *seq, loff_t *pos)
+{
+ unsigned long ifindex = *pos;
+ struct net_device *dev;
+
+ for_each_netdev_dump(seq_file_net(seq), dev, ifindex) {
+ *pos = dev->ifindex;
+ return dev;
+ }
+ return NULL;
+}
+
+static void *dev_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(RCU)
+{
+ rcu_read_lock();
+ if (!*pos)
+ return SEQ_START_TOKEN;
+
+ return dev_seq_from_index(seq, pos);
+}
+
+static void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ ++*pos;
+ return dev_seq_from_index(seq, pos);
+}
+
+static void dev_seq_stop(struct seq_file *seq, void *v)
+ __releases(RCU)
+{
+ rcu_read_unlock();
+}
+
+static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
+{
+ struct rtnl_link_stats64 temp;
+ const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
+
+ seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
+ "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
+ dev->name, stats->rx_bytes, stats->rx_packets,
+ stats->rx_errors,
+ stats->rx_dropped + stats->rx_missed_errors,
+ stats->rx_fifo_errors,
+ stats->rx_length_errors + stats->rx_over_errors +
+ stats->rx_crc_errors + stats->rx_frame_errors,
+ stats->rx_compressed, stats->multicast,
+ stats->tx_bytes, stats->tx_packets,
+ stats->tx_errors, stats->tx_dropped,
+ stats->tx_fifo_errors, stats->collisions,
+ stats->tx_carrier_errors +
+ stats->tx_aborted_errors +
+ stats->tx_window_errors +
+ stats->tx_heartbeat_errors,
+ stats->tx_compressed);
+}
+
+/*
+ * Called from the PROCfs module. This now uses the new arbitrary sized
+ * /proc/net interface to create /proc/net/dev
+ */
+static int dev_seq_show(struct seq_file *seq, void *v)
+{
+ if (v == SEQ_START_TOKEN)
+ seq_puts(seq, "Inter-| Receive "
+ " | Transmit\n"
+ " face |bytes packets errs drop fifo frame "
+ "compressed multicast|bytes packets errs "
+ "drop fifo colls carrier compressed\n");
+ else
+ dev_seq_printf_stats(seq, v);
+ return 0;
+}
+
+static u32 softnet_input_pkt_queue_len(struct softnet_data *sd)
+{
+ return skb_queue_len_lockless(&sd->input_pkt_queue);
+}
+
+static u32 softnet_process_queue_len(struct softnet_data *sd)
+{
+ return skb_queue_len_lockless(&sd->process_queue);
+}
+
+static struct softnet_data *softnet_get_online(loff_t *pos)
+{
+ struct softnet_data *sd = NULL;
+
+ while (*pos < nr_cpu_ids)
+ if (cpu_online(*pos)) {
+ sd = &per_cpu(softnet_data, *pos);
+ break;
+ } else
+ ++*pos;
+ return sd;
+}
+
+static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ return softnet_get_online(pos);
+}
+
+static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ ++*pos;
+ return softnet_get_online(pos);
+}
+
+static void softnet_seq_stop(struct seq_file *seq, void *v)
+{
+}
+
+static int softnet_seq_show(struct seq_file *seq, void *v)
+{
+ struct softnet_data *sd = v;
+ u32 input_qlen = softnet_input_pkt_queue_len(sd);
+ u32 process_qlen = softnet_process_queue_len(sd);
+ unsigned int flow_limit_count = 0;
+
+#ifdef CONFIG_NET_FLOW_LIMIT
+ struct sd_flow_limit *fl;
+
+ rcu_read_lock();
+ fl = rcu_dereference(sd->flow_limit);
+ /* Pairs with WRITE_ONCE() in skb_flow_limit() */
+ if (fl)
+ flow_limit_count = READ_ONCE(fl->count);
+ rcu_read_unlock();
+#endif
+
+ /* the index is the CPU id owing this sd. Since offline CPUs are not
+ * displayed, it would be othrwise not trivial for the user-space
+ * mapping the data a specific CPU
+ */
+ seq_printf(seq,
+ "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x "
+ "%08x %08x\n",
+ READ_ONCE(sd->processed),
+ numa_drop_read(&sd->drop_counters),
+ READ_ONCE(sd->time_squeeze), 0,
+ 0, 0, 0, 0, /* was fastroute */
+ 0, /* was cpu_collision */
+ READ_ONCE(sd->received_rps), flow_limit_count,
+ input_qlen + process_qlen, (int)seq->index,
+ input_qlen, process_qlen);
+ return 0;
+}
+
+static const struct seq_operations dev_seq_ops = {
+ .start = dev_seq_start,
+ .next = dev_seq_next,
+ .stop = dev_seq_stop,
+ .show = dev_seq_show,
+};
+
+static const struct seq_operations softnet_seq_ops = {
+ .start = softnet_seq_start,
+ .next = softnet_seq_next,
+ .stop = softnet_seq_stop,
+ .show = softnet_seq_show,
+};
+
+static void *ptype_get_idx(struct seq_file *seq, loff_t pos)
+{
+ struct list_head *ptype_list = NULL;
+ struct packet_type *pt = NULL;
+ struct net_device *dev;
+ loff_t i = 0;
+ int t;
+
+ for_each_netdev_rcu(seq_file_net(seq), dev) {
+ ptype_list = &dev->ptype_all;
+ list_for_each_entry_rcu(pt, ptype_list, list) {
+ if (i == pos)
+ return pt;
+ ++i;
+ }
+ }
+
+ list_for_each_entry_rcu(pt, &seq_file_net(seq)->ptype_all, list) {
+ if (i == pos)
+ return pt;
+ ++i;
+ }
+
+ list_for_each_entry_rcu(pt, &seq_file_net(seq)->ptype_specific, list) {
+ if (i == pos)
+ return pt;
+ ++i;
+ }
+
+ for (t = 0; t < PTYPE_HASH_SIZE; t++) {
+ list_for_each_entry_rcu(pt, &ptype_base[t], list) {
+ if (i == pos)
+ return pt;
+ ++i;
+ }
+ }
+ return NULL;
+}
+
+static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(RCU)
+{
+ rcu_read_lock();
+ return *pos ? ptype_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
+}
+
+static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct net *net = seq_file_net(seq);
+ struct net_device *dev;
+ struct packet_type *pt;
+ struct list_head *nxt;
+ int hash;
+
+ ++*pos;
+ if (v == SEQ_START_TOKEN)
+ return ptype_get_idx(seq, 0);
+
+ pt = v;
+ nxt = pt->list.next;
+ if (pt->dev) {
+ if (nxt != &pt->dev->ptype_all)
+ goto found;
+
+ dev = pt->dev;
+ for_each_netdev_continue_rcu(seq_file_net(seq), dev) {
+ if (!list_empty(&dev->ptype_all)) {
+ nxt = dev->ptype_all.next;
+ goto found;
+ }
+ }
+ nxt = net->ptype_all.next;
+ goto net_ptype_all;
+ }
+
+ if (pt->af_packet_net) {
+net_ptype_all:
+ if (nxt != &net->ptype_all && nxt != &net->ptype_specific)
+ goto found;
+
+ if (nxt == &net->ptype_all) {
+ /* continue with ->ptype_specific if it's not empty */
+ nxt = net->ptype_specific.next;
+ if (nxt != &net->ptype_specific)
+ goto found;
+ }
+
+ hash = 0;
+ nxt = ptype_base[0].next;
+ } else
+ hash = ntohs(pt->type) & PTYPE_HASH_MASK;
+
+ while (nxt == &ptype_base[hash]) {
+ if (++hash >= PTYPE_HASH_SIZE)
+ return NULL;
+ nxt = ptype_base[hash].next;
+ }
+found:
+ return list_entry(nxt, struct packet_type, list);
+}
+
+static void ptype_seq_stop(struct seq_file *seq, void *v)
+ __releases(RCU)
+{
+ rcu_read_unlock();
+}
+
+static int ptype_seq_show(struct seq_file *seq, void *v)
+{
+ struct packet_type *pt = v;
+
+ if (v == SEQ_START_TOKEN)
+ seq_puts(seq, "Type Device Function\n");
+ else if ((!pt->af_packet_net || net_eq(pt->af_packet_net, seq_file_net(seq))) &&
+ (!pt->dev || net_eq(dev_net(pt->dev), seq_file_net(seq)))) {
+ if (pt->type == htons(ETH_P_ALL))
+ seq_puts(seq, "ALL ");
+ else
+ seq_printf(seq, "%04x", ntohs(pt->type));
+
+ seq_printf(seq, " %-8s %ps\n",
+ pt->dev ? pt->dev->name : "", pt->func);
+ }
+
+ return 0;
+}
+
+static const struct seq_operations ptype_seq_ops = {
+ .start = ptype_seq_start,
+ .next = ptype_seq_next,
+ .stop = ptype_seq_stop,
+ .show = ptype_seq_show,
+};
+
+static int __net_init dev_proc_net_init(struct net *net)
+{
+ int rc = -ENOMEM;
+
+ if (!proc_create_net("dev", 0444, net->proc_net, &dev_seq_ops,
+ sizeof(struct seq_net_private)))
+ goto out;
+ if (!proc_create_seq("softnet_stat", 0444, net->proc_net,
+ &softnet_seq_ops))
+ goto out_dev;
+ if (!proc_create_net("ptype", 0444, net->proc_net, &ptype_seq_ops,
+ sizeof(struct seq_net_private)))
+ goto out_softnet;
+
+ if (wext_proc_init(net))
+ goto out_ptype;
+ rc = 0;
+out:
+ return rc;
+out_ptype:
+ remove_proc_entry("ptype", net->proc_net);
+out_softnet:
+ remove_proc_entry("softnet_stat", net->proc_net);
+out_dev:
+ remove_proc_entry("dev", net->proc_net);
+ goto out;
+}
+
+static void __net_exit dev_proc_net_exit(struct net *net)
+{
+ wext_proc_exit(net);
+
+ remove_proc_entry("ptype", net->proc_net);
+ remove_proc_entry("softnet_stat", net->proc_net);
+ remove_proc_entry("dev", net->proc_net);
+}
+
+static struct pernet_operations __net_initdata dev_proc_ops = {
+ .init = dev_proc_net_init,
+ .exit = dev_proc_net_exit,
+};
+
+static int dev_mc_seq_show(struct seq_file *seq, void *v)
+{
+ struct netdev_hw_addr *ha;
+ struct net_device *dev = v;
+
+ if (v == SEQ_START_TOKEN)
+ return 0;
+
+ netif_addr_lock_bh(dev);
+ netdev_for_each_mc_addr(ha, dev) {
+ seq_printf(seq, "%-4d %-15s %-5d %-5d %*phN\n",
+ dev->ifindex, dev->name,
+ ha->refcount, ha->global_use,
+ (int)dev->addr_len, ha->addr);
+ }
+ netif_addr_unlock_bh(dev);
+ return 0;
+}
+
+static const struct seq_operations dev_mc_seq_ops = {
+ .start = dev_seq_start,
+ .next = dev_seq_next,
+ .stop = dev_seq_stop,
+ .show = dev_mc_seq_show,
+};
+
+static int __net_init dev_mc_net_init(struct net *net)
+{
+ if (!proc_create_net("dev_mcast", 0, net->proc_net, &dev_mc_seq_ops,
+ sizeof(struct seq_net_private)))
+ return -ENOMEM;
+ return 0;
+}
+
+static void __net_exit dev_mc_net_exit(struct net *net)
+{
+ remove_proc_entry("dev_mcast", net->proc_net);
+}
+
+static struct pernet_operations __net_initdata dev_mc_net_ops = {
+ .init = dev_mc_net_init,
+ .exit = dev_mc_net_exit,
+};
+
+int __init dev_proc_init(void)
+{
+ int ret = register_pernet_subsys(&dev_proc_ops);
+ if (!ret)
+ return register_pernet_subsys(&dev_mc_net_ops);
+ return ret;
+}
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index f47f319bb7dc..ca878525ad7c 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -1,276 +1,840 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net-sysfs.c - network device class and attributes
*
* Copyright (c) 2003 Stephen Hemminger <shemminger@osdl.org>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/capability.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
+#include <linux/slab.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/isolation.h>
+#include <linux/nsproxy.h>
#include <net/sock.h>
+#include <net/net_namespace.h>
#include <linux/rtnetlink.h>
-#include <linux/wireless.h>
-#include <net/iw_handler.h>
-
-#define to_class_dev(obj) container_of(obj,struct class_device,kobj)
-#define to_net_dev(class) container_of(class, struct net_device, class_dev)
-
+#include <linux/vmalloc.h>
+#include <linux/export.h>
+#include <linux/jiffies.h>
+#include <linux/pm_runtime.h>
+#include <linux/of.h>
+#include <linux/of_net.h>
+#include <linux/cpu.h>
+#include <net/netdev_lock.h>
+#include <net/netdev_rx_queue.h>
+#include <net/rps.h>
+
+#include "dev.h"
+#include "net-sysfs.h"
+
+#ifdef CONFIG_SYSFS
static const char fmt_hex[] = "%#x\n";
-static const char fmt_long_hex[] = "%#lx\n";
static const char fmt_dec[] = "%d\n";
+static const char fmt_uint[] = "%u\n";
static const char fmt_ulong[] = "%lu\n";
+static const char fmt_u64[] = "%llu\n";
-static inline int dev_isalive(const struct net_device *dev)
+/* Caller holds RTNL, netdev->lock or RCU */
+static inline int dev_isalive(const struct net_device *dev)
{
- return dev->reg_state <= NETREG_REGISTERED;
+ return READ_ONCE(dev->reg_state) <= NETREG_REGISTERED;
+}
+
+/* There is a possible ABBA deadlock between rtnl_lock and kernfs_node->active,
+ * when unregistering a net device and accessing associated sysfs files. The
+ * potential deadlock is as follow:
+ *
+ * CPU 0 CPU 1
+ *
+ * rtnl_lock vfs_read
+ * unregister_netdevice_many kernfs_seq_start
+ * device_del / kobject_put kernfs_get_active (kn->active++)
+ * kernfs_drain sysfs_kf_seq_show
+ * wait_event( rtnl_lock
+ * kn->active == KN_DEACTIVATED_BIAS) -> waits on CPU 0 to release
+ * -> waits on CPU 1 to decrease kn->active the rtnl lock.
+ *
+ * The historical fix was to use rtnl_trylock with restart_syscall to bail out
+ * of sysfs operations when the lock couldn't be taken. This fixed the above
+ * issue as it allowed CPU 1 to bail out of the ABBA situation.
+ *
+ * But it came with performances issues, as syscalls are being restarted in
+ * loops when there was contention on the rtnl lock, with huge slow downs in
+ * specific scenarios (e.g. lots of virtual interfaces created and userspace
+ * daemons querying their attributes).
+ *
+ * The idea below is to bail out of the active kernfs_node protection
+ * (kn->active) while trying to take the rtnl lock.
+ *
+ * This replaces rtnl_lock() and still has to be used with rtnl_unlock(). The
+ * net device is guaranteed to be alive if this returns successfully.
+ */
+static int sysfs_rtnl_lock(struct kobject *kobj, struct attribute *attr,
+ struct net_device *ndev)
+{
+ struct kernfs_node *kn;
+ int ret = 0;
+
+ /* First, we hold a reference to the net device as the unregistration
+ * path might run in parallel. This will ensure the net device and the
+ * associated sysfs objects won't be freed while we try to take the rtnl
+ * lock.
+ */
+ dev_hold(ndev);
+ /* sysfs_break_active_protection was introduced to allow self-removal of
+ * devices and their associated sysfs files by bailing out of the
+ * sysfs/kernfs protection. We do this here to allow the unregistration
+ * path to complete in parallel. The following takes a reference on the
+ * kobject and the kernfs_node being accessed.
+ *
+ * This works because we hold a reference onto the net device and the
+ * unregistration path will wait for us eventually in netdev_run_todo
+ * (outside an rtnl lock section).
+ */
+ kn = sysfs_break_active_protection(kobj, attr);
+ /* We can now try to take the rtnl lock. This can't deadlock us as the
+ * unregistration path is able to drain sysfs files (kernfs_node) thanks
+ * to the above dance.
+ */
+ if (rtnl_lock_interruptible()) {
+ ret = -ERESTARTSYS;
+ goto unbreak;
+ }
+ /* Check dismantle on the device hasn't started, otherwise deny the
+ * operation.
+ */
+ if (!dev_isalive(ndev)) {
+ rtnl_unlock();
+ ret = -ENODEV;
+ goto unbreak;
+ }
+ /* We are now sure the device dismantle hasn't started nor that it can
+ * start before we exit the locking section as we hold the rtnl lock.
+ * There's no need to keep unbreaking the sysfs protection nor to hold
+ * a net device reference from that point; that was only needed to take
+ * the rtnl lock.
+ */
+unbreak:
+ sysfs_unbreak_active_protection(kn);
+ dev_put(ndev);
+
+ return ret;
}
/* use same locking rules as GIF* ioctl's */
-static ssize_t netdev_show(const struct class_device *cd, char *buf,
+static ssize_t netdev_show(const struct device *dev,
+ struct device_attribute *attr, char *buf,
ssize_t (*format)(const struct net_device *, char *))
{
- struct net_device *net = to_net_dev(cd);
+ struct net_device *ndev = to_net_dev(dev);
ssize_t ret = -EINVAL;
- read_lock(&dev_base_lock);
- if (dev_isalive(net))
- ret = (*format)(net, buf);
- read_unlock(&dev_base_lock);
+ rcu_read_lock();
+ if (dev_isalive(ndev))
+ ret = (*format)(ndev, buf);
+ rcu_read_unlock();
return ret;
}
/* generate a show function for simple field */
#define NETDEVICE_SHOW(field, format_string) \
-static ssize_t format_##field(const struct net_device *net, char *buf) \
+static ssize_t format_##field(const struct net_device *dev, char *buf) \
{ \
- return sprintf(buf, format_string, net->field); \
+ return sysfs_emit(buf, format_string, READ_ONCE(dev->field)); \
} \
-static ssize_t show_##field(struct class_device *cd, char *buf) \
+static ssize_t field##_show(struct device *dev, \
+ struct device_attribute *attr, char *buf) \
{ \
- return netdev_show(cd, buf, format_##field); \
-}
+ return netdev_show(dev, attr, buf, format_##field); \
+} \
+#define NETDEVICE_SHOW_RO(field, format_string) \
+NETDEVICE_SHOW(field, format_string); \
+static DEVICE_ATTR_RO(field)
+
+#define NETDEVICE_SHOW_RW(field, format_string) \
+NETDEVICE_SHOW(field, format_string); \
+static DEVICE_ATTR_RW(field)
/* use same locking and permission rules as SIF* ioctl's */
-static ssize_t netdev_store(struct class_device *dev,
+static ssize_t netdev_store(struct device *dev, struct device_attribute *attr,
const char *buf, size_t len,
int (*set)(struct net_device *, unsigned long))
{
- struct net_device *net = to_net_dev(dev);
- char *endp;
+ struct net_device *netdev = to_net_dev(dev);
+ struct net *net = dev_net(netdev);
unsigned long new;
- int ret = -EINVAL;
+ int ret;
- if (!capable(CAP_NET_ADMIN))
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
return -EPERM;
- new = simple_strtoul(buf, &endp, 0);
- if (endp == buf)
+ ret = kstrtoul(buf, 0, &new);
+ if (ret)
goto err;
- rtnl_lock();
- if (dev_isalive(net)) {
- if ((ret = (*set)(net, new)) == 0)
- ret = len;
- }
+ ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev);
+ if (ret)
+ goto err;
+
+ ret = (*set)(netdev, new);
+ if (ret == 0)
+ ret = len;
+
rtnl_unlock();
err:
return ret;
}
-NETDEVICE_SHOW(addr_len, fmt_dec);
-NETDEVICE_SHOW(iflink, fmt_dec);
-NETDEVICE_SHOW(ifindex, fmt_dec);
-NETDEVICE_SHOW(features, fmt_long_hex);
-NETDEVICE_SHOW(type, fmt_dec);
-NETDEVICE_SHOW(link_mode, fmt_dec);
+/* Same as netdev_store() but takes netdev_lock() instead of rtnl_lock() */
+static ssize_t
+netdev_lock_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t len,
+ int (*set)(struct net_device *, unsigned long))
+{
+ struct net_device *netdev = to_net_dev(dev);
+ struct net *net = dev_net(netdev);
+ unsigned long new;
+ int ret;
+
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ ret = kstrtoul(buf, 0, &new);
+ if (ret)
+ return ret;
+
+ netdev_lock(netdev);
-/* use same locking rules as GIFHWADDR ioctl's */
-static ssize_t format_addr(char *buf, const unsigned char *addr, int len)
+ if (dev_isalive(netdev)) {
+ ret = (*set)(netdev, new);
+ if (ret == 0)
+ ret = len;
+ }
+ netdev_unlock(netdev);
+
+ return ret;
+}
+
+NETDEVICE_SHOW_RO(dev_id, fmt_hex);
+NETDEVICE_SHOW_RO(dev_port, fmt_dec);
+NETDEVICE_SHOW_RO(addr_assign_type, fmt_dec);
+NETDEVICE_SHOW_RO(addr_len, fmt_dec);
+NETDEVICE_SHOW_RO(ifindex, fmt_dec);
+NETDEVICE_SHOW_RO(type, fmt_dec);
+NETDEVICE_SHOW_RO(link_mode, fmt_dec);
+
+static ssize_t iflink_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
{
- int i;
- char *cp = buf;
+ struct net_device *ndev = to_net_dev(dev);
- for (i = 0; i < len; i++)
- cp += sprintf(cp, "%02x%c", addr[i],
- i == (len - 1) ? '\n' : ':');
- return cp - buf;
+ return sysfs_emit(buf, fmt_dec, dev_get_iflink(ndev));
}
+static DEVICE_ATTR_RO(iflink);
-static ssize_t show_address(struct class_device *dev, char *buf)
+static ssize_t format_name_assign_type(const struct net_device *dev, char *buf)
{
- struct net_device *net = to_net_dev(dev);
+ return sysfs_emit(buf, fmt_dec, READ_ONCE(dev->name_assign_type));
+}
+
+static ssize_t name_assign_type_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct net_device *ndev = to_net_dev(dev);
ssize_t ret = -EINVAL;
- read_lock(&dev_base_lock);
- if (dev_isalive(net))
- ret = format_addr(buf, net->dev_addr, net->addr_len);
- read_unlock(&dev_base_lock);
+ if (READ_ONCE(ndev->name_assign_type) != NET_NAME_UNKNOWN)
+ ret = netdev_show(dev, attr, buf, format_name_assign_type);
+
return ret;
}
+static DEVICE_ATTR_RO(name_assign_type);
-static ssize_t show_broadcast(struct class_device *dev, char *buf)
+/* use same locking rules as GIFHWADDR ioctl's (netif_get_mac_address()) */
+static ssize_t address_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
{
- struct net_device *net = to_net_dev(dev);
- if (dev_isalive(net))
- return format_addr(buf, net->broadcast, net->addr_len);
- return -EINVAL;
+ struct net_device *ndev = to_net_dev(dev);
+ ssize_t ret = -EINVAL;
+
+ down_read(&dev_addr_sem);
+
+ rcu_read_lock();
+ if (dev_isalive(ndev))
+ ret = sysfs_format_mac(buf, ndev->dev_addr, ndev->addr_len);
+ rcu_read_unlock();
+
+ up_read(&dev_addr_sem);
+ return ret;
+}
+static DEVICE_ATTR_RO(address);
+
+static ssize_t broadcast_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct net_device *ndev = to_net_dev(dev);
+ int ret = -EINVAL;
+
+ rcu_read_lock();
+ if (dev_isalive(ndev))
+ ret = sysfs_format_mac(buf, ndev->broadcast, ndev->addr_len);
+ rcu_read_unlock();
+ return ret;
+}
+static DEVICE_ATTR_RO(broadcast);
+
+static int change_carrier(struct net_device *dev, unsigned long new_carrier)
+{
+ if (!netif_running(dev))
+ return -EINVAL;
+ return dev_change_carrier(dev, (bool)new_carrier);
+}
+
+static ssize_t carrier_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ struct net_device *netdev = to_net_dev(dev);
+
+ /* The check is also done in change_carrier; this helps returning early
+ * without hitting the locking section in netdev_store.
+ */
+ if (!netdev->netdev_ops->ndo_change_carrier)
+ return -EOPNOTSUPP;
+
+ return netdev_store(dev, attr, buf, len, change_carrier);
+}
+
+static ssize_t carrier_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct net_device *netdev = to_net_dev(dev);
+ int ret;
+
+ ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev);
+ if (ret)
+ return ret;
+
+ ret = -EINVAL;
+ if (netif_running(netdev)) {
+ /* Synchronize carrier state with link watch,
+ * see also rtnl_getlink().
+ */
+ linkwatch_sync_dev(netdev);
+
+ ret = sysfs_emit(buf, fmt_dec, !!netif_carrier_ok(netdev));
+ }
+
+ rtnl_unlock();
+ return ret;
+}
+static DEVICE_ATTR_RW(carrier);
+
+static ssize_t speed_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct net_device *netdev = to_net_dev(dev);
+ int ret = -EINVAL;
+
+ /* The check is also done in __ethtool_get_link_ksettings; this helps
+ * returning early without hitting the locking section below.
+ */
+ if (!netdev->ethtool_ops->get_link_ksettings)
+ return ret;
+
+ ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev);
+ if (ret)
+ return ret;
+
+ ret = -EINVAL;
+ if (netif_running(netdev)) {
+ struct ethtool_link_ksettings cmd;
+
+ if (!__ethtool_get_link_ksettings(netdev, &cmd))
+ ret = sysfs_emit(buf, fmt_dec, cmd.base.speed);
+ }
+ rtnl_unlock();
+ return ret;
}
+static DEVICE_ATTR_RO(speed);
-static ssize_t show_carrier(struct class_device *dev, char *buf)
+static ssize_t duplex_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
struct net_device *netdev = to_net_dev(dev);
+ int ret = -EINVAL;
+
+ /* The check is also done in __ethtool_get_link_ksettings; this helps
+ * returning early without hitting the locking section below.
+ */
+ if (!netdev->ethtool_ops->get_link_ksettings)
+ return ret;
+
+ ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev);
+ if (ret)
+ return ret;
+
+ ret = -EINVAL;
if (netif_running(netdev)) {
- return sprintf(buf, fmt_dec, !!netif_carrier_ok(netdev));
+ struct ethtool_link_ksettings cmd;
+
+ if (!__ethtool_get_link_ksettings(netdev, &cmd)) {
+ const char *duplex;
+
+ switch (cmd.base.duplex) {
+ case DUPLEX_HALF:
+ duplex = "half";
+ break;
+ case DUPLEX_FULL:
+ duplex = "full";
+ break;
+ default:
+ duplex = "unknown";
+ break;
+ }
+ ret = sysfs_emit(buf, "%s\n", duplex);
+ }
}
+ rtnl_unlock();
+ return ret;
+}
+static DEVICE_ATTR_RO(duplex);
+
+static ssize_t testing_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct net_device *netdev = to_net_dev(dev);
+
+ if (netif_running(netdev))
+ return sysfs_emit(buf, fmt_dec, !!netif_testing(netdev));
+
return -EINVAL;
}
+static DEVICE_ATTR_RO(testing);
-static ssize_t show_dormant(struct class_device *dev, char *buf)
+static ssize_t dormant_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
struct net_device *netdev = to_net_dev(dev);
if (netif_running(netdev))
- return sprintf(buf, fmt_dec, !!netif_dormant(netdev));
+ return sysfs_emit(buf, fmt_dec, !!netif_dormant(netdev));
return -EINVAL;
}
+static DEVICE_ATTR_RO(dormant);
-static const char *operstates[] = {
+static const char *const operstates[] = {
"unknown",
"notpresent", /* currently unused */
"down",
"lowerlayerdown",
- "testing", /* currently unused */
+ "testing",
"dormant",
"up"
};
-static ssize_t show_operstate(struct class_device *dev, char *buf)
+static ssize_t operstate_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
const struct net_device *netdev = to_net_dev(dev);
unsigned char operstate;
- read_lock(&dev_base_lock);
- operstate = netdev->operstate;
+ operstate = READ_ONCE(netdev->operstate);
if (!netif_running(netdev))
operstate = IF_OPER_DOWN;
- read_unlock(&dev_base_lock);
if (operstate >= ARRAY_SIZE(operstates))
return -EINVAL; /* should not happen */
- return sprintf(buf, "%s\n", operstates[operstate]);
+ return sysfs_emit(buf, "%s\n", operstates[operstate]);
+}
+static DEVICE_ATTR_RO(operstate);
+
+static ssize_t carrier_changes_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct net_device *netdev = to_net_dev(dev);
+
+ return sysfs_emit(buf, fmt_dec,
+ atomic_read(&netdev->carrier_up_count) +
+ atomic_read(&netdev->carrier_down_count));
+}
+static DEVICE_ATTR_RO(carrier_changes);
+
+static ssize_t carrier_up_count_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct net_device *netdev = to_net_dev(dev);
+
+ return sysfs_emit(buf, fmt_dec, atomic_read(&netdev->carrier_up_count));
+}
+static DEVICE_ATTR_RO(carrier_up_count);
+
+static ssize_t carrier_down_count_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct net_device *netdev = to_net_dev(dev);
+
+ return sysfs_emit(buf, fmt_dec, atomic_read(&netdev->carrier_down_count));
}
+static DEVICE_ATTR_RO(carrier_down_count);
/* read-write attributes */
-NETDEVICE_SHOW(mtu, fmt_dec);
-static int change_mtu(struct net_device *net, unsigned long new_mtu)
+static int change_mtu(struct net_device *dev, unsigned long new_mtu)
{
- return dev_set_mtu(net, (int) new_mtu);
+ return dev_set_mtu(dev, (int)new_mtu);
}
-static ssize_t store_mtu(struct class_device *dev, const char *buf, size_t len)
+static ssize_t mtu_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t len)
{
- return netdev_store(dev, buf, len, change_mtu);
+ return netdev_store(dev, attr, buf, len, change_mtu);
}
+NETDEVICE_SHOW_RW(mtu, fmt_dec);
-NETDEVICE_SHOW(flags, fmt_hex);
+static int change_flags(struct net_device *dev, unsigned long new_flags)
+{
+ return dev_change_flags(dev, (unsigned int)new_flags, NULL);
+}
-static int change_flags(struct net_device *net, unsigned long new_flags)
+static ssize_t flags_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t len)
{
- return dev_change_flags(net, (unsigned) new_flags);
+ return netdev_store(dev, attr, buf, len, change_flags);
}
+NETDEVICE_SHOW_RW(flags, fmt_hex);
-static ssize_t store_flags(struct class_device *dev, const char *buf, size_t len)
+static ssize_t tx_queue_len_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
{
- return netdev_store(dev, buf, len, change_flags);
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ return netdev_store(dev, attr, buf, len, dev_change_tx_queue_len);
}
+NETDEVICE_SHOW_RW(tx_queue_len, fmt_dec);
-NETDEVICE_SHOW(tx_queue_len, fmt_ulong);
+static int change_gro_flush_timeout(struct net_device *dev, unsigned long val)
+{
+ netdev_set_gro_flush_timeout(dev, val);
+ return 0;
+}
-static int change_tx_queue_len(struct net_device *net, unsigned long new_len)
+static ssize_t gro_flush_timeout_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
{
- net->tx_queue_len = new_len;
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ return netdev_lock_store(dev, attr, buf, len, change_gro_flush_timeout);
+}
+NETDEVICE_SHOW_RW(gro_flush_timeout, fmt_ulong);
+
+static int change_napi_defer_hard_irqs(struct net_device *dev, unsigned long val)
+{
+ if (val > S32_MAX)
+ return -ERANGE;
+
+ netdev_set_defer_hard_irqs(dev, (u32)val);
return 0;
}
-static ssize_t store_tx_queue_len(struct class_device *dev, const char *buf, size_t len)
+static ssize_t napi_defer_hard_irqs_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
{
- return netdev_store(dev, buf, len, change_tx_queue_len);
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ return netdev_lock_store(dev, attr, buf, len,
+ change_napi_defer_hard_irqs);
}
+NETDEVICE_SHOW_RW(napi_defer_hard_irqs, fmt_uint);
+
+static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ struct net_device *netdev = to_net_dev(dev);
+ struct net *net = dev_net(netdev);
+ size_t count = len;
+ ssize_t ret;
+
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
-NETDEVICE_SHOW(weight, fmt_dec);
+ /* ignore trailing newline */
+ if (len > 0 && buf[len - 1] == '\n')
+ --count;
-static int change_weight(struct net_device *net, unsigned long new_weight)
+ ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev);
+ if (ret)
+ return ret;
+
+ ret = dev_set_alias(netdev, buf, count);
+ if (ret < 0)
+ goto err;
+ ret = len;
+ netdev_state_change(netdev);
+err:
+ rtnl_unlock();
+
+ return ret;
+}
+
+static ssize_t ifalias_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
- net->weight = new_weight;
+ const struct net_device *netdev = to_net_dev(dev);
+ char tmp[IFALIASZ];
+ ssize_t ret;
+
+ ret = dev_get_alias(netdev, tmp, sizeof(tmp));
+ if (ret > 0)
+ ret = sysfs_emit(buf, "%s\n", tmp);
+ return ret;
+}
+static DEVICE_ATTR_RW(ifalias);
+
+static int change_group(struct net_device *dev, unsigned long new_group)
+{
+ dev_set_group(dev, (int)new_group);
return 0;
}
-static ssize_t store_weight(struct class_device *dev, const char *buf, size_t len)
-{
- return netdev_store(dev, buf, len, change_weight);
-}
-
-static struct class_device_attribute net_class_attributes[] = {
- __ATTR(addr_len, S_IRUGO, show_addr_len, NULL),
- __ATTR(iflink, S_IRUGO, show_iflink, NULL),
- __ATTR(ifindex, S_IRUGO, show_ifindex, NULL),
- __ATTR(features, S_IRUGO, show_features, NULL),
- __ATTR(type, S_IRUGO, show_type, NULL),
- __ATTR(link_mode, S_IRUGO, show_link_mode, NULL),
- __ATTR(address, S_IRUGO, show_address, NULL),
- __ATTR(broadcast, S_IRUGO, show_broadcast, NULL),
- __ATTR(carrier, S_IRUGO, show_carrier, NULL),
- __ATTR(dormant, S_IRUGO, show_dormant, NULL),
- __ATTR(operstate, S_IRUGO, show_operstate, NULL),
- __ATTR(mtu, S_IRUGO | S_IWUSR, show_mtu, store_mtu),
- __ATTR(flags, S_IRUGO | S_IWUSR, show_flags, store_flags),
- __ATTR(tx_queue_len, S_IRUGO | S_IWUSR, show_tx_queue_len,
- store_tx_queue_len),
- __ATTR(weight, S_IRUGO | S_IWUSR, show_weight, store_weight),
- {}
+static ssize_t group_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return netdev_store(dev, attr, buf, len, change_group);
+}
+NETDEVICE_SHOW(group, fmt_dec);
+static DEVICE_ATTR(netdev_group, 0644, group_show, group_store);
+
+static int change_proto_down(struct net_device *dev, unsigned long proto_down)
+{
+ return dev_change_proto_down(dev, (bool)proto_down);
+}
+
+static ssize_t proto_down_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return netdev_store(dev, attr, buf, len, change_proto_down);
+}
+NETDEVICE_SHOW_RW(proto_down, fmt_dec);
+
+static ssize_t phys_port_id_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct net_device *netdev = to_net_dev(dev);
+ struct netdev_phys_item_id ppid;
+ ssize_t ret;
+
+ ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev);
+ if (ret)
+ return ret;
+
+ ret = dev_get_phys_port_id(netdev, &ppid);
+ if (!ret)
+ ret = sysfs_emit(buf, "%*phN\n", ppid.id_len, ppid.id);
+
+ rtnl_unlock();
+
+ return ret;
+}
+static DEVICE_ATTR_RO(phys_port_id);
+
+static ssize_t phys_port_name_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct net_device *netdev = to_net_dev(dev);
+ char name[IFNAMSIZ];
+ ssize_t ret;
+
+ ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev);
+ if (ret)
+ return ret;
+
+ ret = dev_get_phys_port_name(netdev, name, sizeof(name));
+ if (!ret)
+ ret = sysfs_emit(buf, "%s\n", name);
+
+ rtnl_unlock();
+
+ return ret;
+}
+static DEVICE_ATTR_RO(phys_port_name);
+
+static ssize_t phys_switch_id_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct net_device *netdev = to_net_dev(dev);
+ struct netdev_phys_item_id ppid = { };
+ ssize_t ret;
+
+ ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev);
+ if (ret)
+ return ret;
+
+ ret = netif_get_port_parent_id(netdev, &ppid, false);
+ if (!ret)
+ ret = sysfs_emit(buf, "%*phN\n", ppid.id_len, ppid.id);
+
+ rtnl_unlock();
+
+ return ret;
+}
+static DEVICE_ATTR_RO(phys_switch_id);
+
+static struct attribute *netdev_phys_attrs[] __ro_after_init = {
+ &dev_attr_phys_port_id.attr,
+ &dev_attr_phys_port_name.attr,
+ &dev_attr_phys_switch_id.attr,
+ NULL,
+};
+
+static umode_t netdev_phys_is_visible(struct kobject *kobj,
+ struct attribute *attr, int index)
+{
+ struct device *dev = kobj_to_dev(kobj);
+ struct net_device *netdev = to_net_dev(dev);
+
+ if (attr == &dev_attr_phys_port_id.attr) {
+ if (!netdev->netdev_ops->ndo_get_phys_port_id)
+ return 0;
+ } else if (attr == &dev_attr_phys_port_name.attr) {
+ if (!netdev->netdev_ops->ndo_get_phys_port_name &&
+ !netdev->devlink_port)
+ return 0;
+ } else if (attr == &dev_attr_phys_switch_id.attr) {
+ if (!netdev->netdev_ops->ndo_get_port_parent_id &&
+ !netdev->devlink_port)
+ return 0;
+ }
+
+ return attr->mode;
+}
+
+static const struct attribute_group netdev_phys_group = {
+ .attrs = netdev_phys_attrs,
+ .is_visible = netdev_phys_is_visible,
+};
+
+static ssize_t threaded_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct net_device *netdev = to_net_dev(dev);
+ ssize_t ret = -EINVAL;
+
+ rcu_read_lock();
+
+ if (dev_isalive(netdev))
+ ret = sysfs_emit(buf, fmt_dec, READ_ONCE(netdev->threaded));
+
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static int modify_napi_threaded(struct net_device *dev, unsigned long val)
+{
+ int ret;
+
+ if (list_empty(&dev->napi_list))
+ return -EOPNOTSUPP;
+
+ if (val != 0 && val != 1)
+ return -EOPNOTSUPP;
+
+ ret = netif_set_threaded(dev, val);
+
+ return ret;
+}
+
+static ssize_t threaded_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return netdev_lock_store(dev, attr, buf, len, modify_napi_threaded);
+}
+static DEVICE_ATTR_RW(threaded);
+
+static struct attribute *net_class_attrs[] __ro_after_init = {
+ &dev_attr_netdev_group.attr,
+ &dev_attr_type.attr,
+ &dev_attr_dev_id.attr,
+ &dev_attr_dev_port.attr,
+ &dev_attr_iflink.attr,
+ &dev_attr_ifindex.attr,
+ &dev_attr_name_assign_type.attr,
+ &dev_attr_addr_assign_type.attr,
+ &dev_attr_addr_len.attr,
+ &dev_attr_link_mode.attr,
+ &dev_attr_address.attr,
+ &dev_attr_broadcast.attr,
+ &dev_attr_speed.attr,
+ &dev_attr_duplex.attr,
+ &dev_attr_dormant.attr,
+ &dev_attr_testing.attr,
+ &dev_attr_operstate.attr,
+ &dev_attr_carrier_changes.attr,
+ &dev_attr_ifalias.attr,
+ &dev_attr_carrier.attr,
+ &dev_attr_mtu.attr,
+ &dev_attr_flags.attr,
+ &dev_attr_tx_queue_len.attr,
+ &dev_attr_gro_flush_timeout.attr,
+ &dev_attr_napi_defer_hard_irqs.attr,
+ &dev_attr_proto_down.attr,
+ &dev_attr_carrier_up_count.attr,
+ &dev_attr_carrier_down_count.attr,
+ &dev_attr_threaded.attr,
+ NULL,
};
+ATTRIBUTE_GROUPS(net_class);
/* Show a given an attribute in the statistics group */
-static ssize_t netstat_show(const struct class_device *cd, char *buf,
+static ssize_t netstat_show(const struct device *d,
+ struct device_attribute *attr, char *buf,
unsigned long offset)
{
- struct net_device *dev = to_net_dev(cd);
- struct net_device_stats *stats;
+ struct net_device *dev = to_net_dev(d);
ssize_t ret = -EINVAL;
- if (offset > sizeof(struct net_device_stats) ||
- offset % sizeof(unsigned long) != 0)
- WARN_ON(1);
+ WARN_ON(offset > sizeof(struct rtnl_link_stats64) ||
+ offset % sizeof(u64) != 0);
- read_lock(&dev_base_lock);
- if (dev_isalive(dev) && dev->get_stats &&
- (stats = (*dev->get_stats)(dev)))
- ret = sprintf(buf, fmt_ulong,
- *(unsigned long *)(((u8 *) stats) + offset));
+ rcu_read_lock();
+ if (dev_isalive(dev)) {
+ struct rtnl_link_stats64 temp;
+ const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
- read_unlock(&dev_base_lock);
+ ret = sysfs_emit(buf, fmt_u64, *(u64 *)(((u8 *)stats) + offset));
+ }
+ rcu_read_unlock();
return ret;
}
/* generate a read-only statistics attribute */
#define NETSTAT_ENTRY(name) \
-static ssize_t show_##name(struct class_device *cd, char *buf) \
+static ssize_t name##_show(struct device *d, \
+ struct device_attribute *attr, char *buf) \
{ \
- return netstat_show(cd, buf, \
- offsetof(struct net_device_stats, name)); \
+ return netstat_show(d, attr, buf, \
+ offsetof(struct rtnl_link_stats64, name)); \
} \
-static CLASS_DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
+static DEVICE_ATTR_RO(name)
NETSTAT_ENTRY(rx_packets);
NETSTAT_ENTRY(tx_packets);
@@ -295,182 +859,1564 @@ NETSTAT_ENTRY(tx_heartbeat_errors);
NETSTAT_ENTRY(tx_window_errors);
NETSTAT_ENTRY(rx_compressed);
NETSTAT_ENTRY(tx_compressed);
-
-static struct attribute *netstat_attrs[] = {
- &class_device_attr_rx_packets.attr,
- &class_device_attr_tx_packets.attr,
- &class_device_attr_rx_bytes.attr,
- &class_device_attr_tx_bytes.attr,
- &class_device_attr_rx_errors.attr,
- &class_device_attr_tx_errors.attr,
- &class_device_attr_rx_dropped.attr,
- &class_device_attr_tx_dropped.attr,
- &class_device_attr_multicast.attr,
- &class_device_attr_collisions.attr,
- &class_device_attr_rx_length_errors.attr,
- &class_device_attr_rx_over_errors.attr,
- &class_device_attr_rx_crc_errors.attr,
- &class_device_attr_rx_frame_errors.attr,
- &class_device_attr_rx_fifo_errors.attr,
- &class_device_attr_rx_missed_errors.attr,
- &class_device_attr_tx_aborted_errors.attr,
- &class_device_attr_tx_carrier_errors.attr,
- &class_device_attr_tx_fifo_errors.attr,
- &class_device_attr_tx_heartbeat_errors.attr,
- &class_device_attr_tx_window_errors.attr,
- &class_device_attr_rx_compressed.attr,
- &class_device_attr_tx_compressed.attr,
+NETSTAT_ENTRY(rx_nohandler);
+
+static struct attribute *netstat_attrs[] __ro_after_init = {
+ &dev_attr_rx_packets.attr,
+ &dev_attr_tx_packets.attr,
+ &dev_attr_rx_bytes.attr,
+ &dev_attr_tx_bytes.attr,
+ &dev_attr_rx_errors.attr,
+ &dev_attr_tx_errors.attr,
+ &dev_attr_rx_dropped.attr,
+ &dev_attr_tx_dropped.attr,
+ &dev_attr_multicast.attr,
+ &dev_attr_collisions.attr,
+ &dev_attr_rx_length_errors.attr,
+ &dev_attr_rx_over_errors.attr,
+ &dev_attr_rx_crc_errors.attr,
+ &dev_attr_rx_frame_errors.attr,
+ &dev_attr_rx_fifo_errors.attr,
+ &dev_attr_rx_missed_errors.attr,
+ &dev_attr_tx_aborted_errors.attr,
+ &dev_attr_tx_carrier_errors.attr,
+ &dev_attr_tx_fifo_errors.attr,
+ &dev_attr_tx_heartbeat_errors.attr,
+ &dev_attr_tx_window_errors.attr,
+ &dev_attr_rx_compressed.attr,
+ &dev_attr_tx_compressed.attr,
+ &dev_attr_rx_nohandler.attr,
NULL
};
-
-static struct attribute_group netstat_group = {
+static const struct attribute_group netstat_group = {
.name = "statistics",
.attrs = netstat_attrs,
};
-#ifdef WIRELESS_EXT
-/* helper function that does all the locking etc for wireless stats */
-static ssize_t wireless_show(struct class_device *cd, char *buf,
- ssize_t (*format)(const struct iw_statistics *,
- char *))
+static struct attribute *wireless_attrs[] = {
+ NULL
+};
+
+static const struct attribute_group wireless_group = {
+ .name = "wireless",
+ .attrs = wireless_attrs,
+};
+
+static bool wireless_group_needed(struct net_device *ndev)
{
- struct net_device *dev = to_net_dev(cd);
- const struct iw_statistics *iw = NULL;
- ssize_t ret = -EINVAL;
-
- read_lock(&dev_base_lock);
- if (dev_isalive(dev)) {
- if(dev->wireless_handlers &&
- dev->wireless_handlers->get_wireless_stats)
- iw = dev->wireless_handlers->get_wireless_stats(dev);
- if (iw != NULL)
- ret = (*format)(iw, buf);
+#if IS_ENABLED(CONFIG_CFG80211)
+ if (ndev->ieee80211_ptr)
+ return true;
+#endif
+#if IS_ENABLED(CONFIG_WIRELESS_EXT)
+ if (ndev->wireless_handlers)
+ return true;
+#endif
+ return false;
+}
+
+#else /* CONFIG_SYSFS */
+#define net_class_groups NULL
+#endif /* CONFIG_SYSFS */
+
+#ifdef CONFIG_SYSFS
+#define to_rx_queue_attr(_attr) \
+ container_of(_attr, struct rx_queue_attribute, attr)
+
+#define to_rx_queue(obj) container_of(obj, struct netdev_rx_queue, kobj)
+
+static ssize_t rx_queue_attr_show(struct kobject *kobj, struct attribute *attr,
+ char *buf)
+{
+ const struct rx_queue_attribute *attribute = to_rx_queue_attr(attr);
+ struct netdev_rx_queue *queue = to_rx_queue(kobj);
+
+ if (!attribute->show)
+ return -EIO;
+
+ return attribute->show(queue, buf);
+}
+
+static ssize_t rx_queue_attr_store(struct kobject *kobj, struct attribute *attr,
+ const char *buf, size_t count)
+{
+ const struct rx_queue_attribute *attribute = to_rx_queue_attr(attr);
+ struct netdev_rx_queue *queue = to_rx_queue(kobj);
+
+ if (!attribute->store)
+ return -EIO;
+
+ return attribute->store(queue, buf, count);
+}
+
+static const struct sysfs_ops rx_queue_sysfs_ops = {
+ .show = rx_queue_attr_show,
+ .store = rx_queue_attr_store,
+};
+
+#ifdef CONFIG_RPS
+static ssize_t show_rps_map(struct netdev_rx_queue *queue, char *buf)
+{
+ struct rps_map *map;
+ cpumask_var_t mask;
+ int i, len;
+
+ if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ rcu_read_lock();
+ map = rcu_dereference(queue->rps_map);
+ if (map)
+ for (i = 0; i < map->len; i++)
+ cpumask_set_cpu(map->cpus[i], mask);
+
+ len = sysfs_emit(buf, "%*pb\n", cpumask_pr_args(mask));
+ rcu_read_unlock();
+ free_cpumask_var(mask);
+
+ return len < PAGE_SIZE ? len : -EINVAL;
+}
+
+static int netdev_rx_queue_set_rps_mask(struct netdev_rx_queue *queue,
+ cpumask_var_t mask)
+{
+ static DEFINE_MUTEX(rps_map_mutex);
+ struct rps_map *old_map, *map;
+ int cpu, i;
+
+ map = kzalloc(max_t(unsigned int,
+ RPS_MAP_SIZE(cpumask_weight(mask)), L1_CACHE_BYTES),
+ GFP_KERNEL);
+ if (!map)
+ return -ENOMEM;
+
+ i = 0;
+ for_each_cpu_and(cpu, mask, cpu_online_mask)
+ map->cpus[i++] = cpu;
+
+ if (i) {
+ map->len = i;
+ } else {
+ kfree(map);
+ map = NULL;
}
- read_unlock(&dev_base_lock);
- return ret;
+ mutex_lock(&rps_map_mutex);
+ old_map = rcu_dereference_protected(queue->rps_map,
+ mutex_is_locked(&rps_map_mutex));
+ rcu_assign_pointer(queue->rps_map, map);
+
+ if (map)
+ static_branch_inc(&rps_needed);
+ if (old_map)
+ static_branch_dec(&rps_needed);
+
+ mutex_unlock(&rps_map_mutex);
+
+ if (old_map)
+ kfree_rcu(old_map, rcu);
+ return 0;
+}
+
+int rps_cpumask_housekeeping(struct cpumask *mask)
+{
+ if (!cpumask_empty(mask)) {
+ cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_DOMAIN));
+ cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_WQ));
+ if (cpumask_empty(mask))
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static ssize_t store_rps_map(struct netdev_rx_queue *queue,
+ const char *buf, size_t len)
+{
+ cpumask_var_t mask;
+ int err;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits);
+ if (err)
+ goto out;
+
+ err = rps_cpumask_housekeeping(mask);
+ if (err)
+ goto out;
+
+ err = netdev_rx_queue_set_rps_mask(queue, mask);
+
+out:
+ free_cpumask_var(mask);
+ return err ? : len;
+}
+
+static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
+ char *buf)
+{
+ struct rps_dev_flow_table *flow_table;
+ unsigned long val = 0;
+
+ rcu_read_lock();
+ flow_table = rcu_dereference(queue->rps_flow_table);
+ if (flow_table)
+ val = 1UL << flow_table->log;
+ rcu_read_unlock();
+
+ return sysfs_emit(buf, "%lu\n", val);
+}
+
+static void rps_dev_flow_table_release(struct rcu_head *rcu)
+{
+ struct rps_dev_flow_table *table = container_of(rcu,
+ struct rps_dev_flow_table, rcu);
+ vfree(table);
+}
+
+static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
+ const char *buf, size_t len)
+{
+ unsigned long mask, count;
+ struct rps_dev_flow_table *table, *old_table;
+ static DEFINE_SPINLOCK(rps_dev_flow_lock);
+ int rc;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ rc = kstrtoul(buf, 0, &count);
+ if (rc < 0)
+ return rc;
+
+ if (count) {
+ mask = count - 1;
+ /* mask = roundup_pow_of_two(count) - 1;
+ * without overflows...
+ */
+ while ((mask | (mask >> 1)) != mask)
+ mask |= (mask >> 1);
+ /* On 64 bit arches, must check mask fits in table->mask (u32),
+ * and on 32bit arches, must check
+ * RPS_DEV_FLOW_TABLE_SIZE(mask + 1) doesn't overflow.
+ */
+#if BITS_PER_LONG > 32
+ if (mask > (unsigned long)(u32)mask)
+ return -EINVAL;
+#else
+ if (mask > (ULONG_MAX - RPS_DEV_FLOW_TABLE_SIZE(1))
+ / sizeof(struct rps_dev_flow)) {
+ /* Enforce a limit to prevent overflow */
+ return -EINVAL;
+ }
+#endif
+ table = vmalloc(RPS_DEV_FLOW_TABLE_SIZE(mask + 1));
+ if (!table)
+ return -ENOMEM;
+
+ table->log = ilog2(mask) + 1;
+ for (count = 0; count <= mask; count++) {
+ table->flows[count].cpu = RPS_NO_CPU;
+ table->flows[count].filter = RPS_NO_FILTER;
+ }
+ } else {
+ table = NULL;
+ }
+
+ spin_lock(&rps_dev_flow_lock);
+ old_table = rcu_dereference_protected(queue->rps_flow_table,
+ lockdep_is_held(&rps_dev_flow_lock));
+ rcu_assign_pointer(queue->rps_flow_table, table);
+ spin_unlock(&rps_dev_flow_lock);
+
+ if (old_table)
+ call_rcu(&old_table->rcu, rps_dev_flow_table_release);
+
+ return len;
+}
+
+static struct rx_queue_attribute rps_cpus_attribute __ro_after_init
+ = __ATTR(rps_cpus, 0644, show_rps_map, store_rps_map);
+
+static struct rx_queue_attribute rps_dev_flow_table_cnt_attribute __ro_after_init
+ = __ATTR(rps_flow_cnt, 0644,
+ show_rps_dev_flow_table_cnt, store_rps_dev_flow_table_cnt);
+#endif /* CONFIG_RPS */
+
+static struct attribute *rx_queue_default_attrs[] __ro_after_init = {
+#ifdef CONFIG_RPS
+ &rps_cpus_attribute.attr,
+ &rps_dev_flow_table_cnt_attribute.attr,
+#endif
+ NULL
+};
+ATTRIBUTE_GROUPS(rx_queue_default);
+
+static void rx_queue_release(struct kobject *kobj)
+{
+ struct netdev_rx_queue *queue = to_rx_queue(kobj);
+#ifdef CONFIG_RPS
+ struct rps_map *map;
+ struct rps_dev_flow_table *flow_table;
+
+ map = rcu_dereference_protected(queue->rps_map, 1);
+ if (map) {
+ RCU_INIT_POINTER(queue->rps_map, NULL);
+ kfree_rcu(map, rcu);
+ }
+
+ flow_table = rcu_dereference_protected(queue->rps_flow_table, 1);
+ if (flow_table) {
+ RCU_INIT_POINTER(queue->rps_flow_table, NULL);
+ call_rcu(&flow_table->rcu, rps_dev_flow_table_release);
+ }
+#endif
+
+ memset(kobj, 0, sizeof(*kobj));
+ netdev_put(queue->dev, &queue->dev_tracker);
+}
+
+static const void *rx_queue_namespace(const struct kobject *kobj)
+{
+ struct netdev_rx_queue *queue = to_rx_queue(kobj);
+ struct device *dev = &queue->dev->dev;
+ const void *ns = NULL;
+
+ if (dev->class && dev->class->namespace)
+ ns = dev->class->namespace(dev);
+
+ return ns;
+}
+
+static void rx_queue_get_ownership(const struct kobject *kobj,
+ kuid_t *uid, kgid_t *gid)
+{
+ const struct net *net = rx_queue_namespace(kobj);
+
+ net_ns_get_ownership(net, uid, gid);
+}
+
+static const struct kobj_type rx_queue_ktype = {
+ .sysfs_ops = &rx_queue_sysfs_ops,
+ .release = rx_queue_release,
+ .namespace = rx_queue_namespace,
+ .get_ownership = rx_queue_get_ownership,
+};
+
+static int rx_queue_default_mask(struct net_device *dev,
+ struct netdev_rx_queue *queue)
+{
+#if IS_ENABLED(CONFIG_RPS) && IS_ENABLED(CONFIG_SYSCTL)
+ struct cpumask *rps_default_mask;
+ int res = 0;
+
+ mutex_lock(&rps_default_mask_mutex);
+
+ rps_default_mask = dev_net(dev)->core.rps_default_mask;
+ if (rps_default_mask && !cpumask_empty(rps_default_mask))
+ res = netdev_rx_queue_set_rps_mask(queue, rps_default_mask);
+
+ mutex_unlock(&rps_default_mask_mutex);
+
+ return res;
+#else
+ return 0;
+#endif
+}
+
+static int rx_queue_add_kobject(struct net_device *dev, int index)
+{
+ struct netdev_rx_queue *queue = dev->_rx + index;
+ struct kobject *kobj = &queue->kobj;
+ int error = 0;
+
+ /* Rx queues are cleared in rx_queue_release to allow later
+ * re-registration. This is triggered when their kobj refcount is
+ * dropped.
+ *
+ * If a queue is removed while both a read (or write) operation and a
+ * the re-addition of the same queue are pending (waiting on rntl_lock)
+ * it might happen that the re-addition will execute before the read,
+ * making the initial removal to never happen (queue's kobj refcount
+ * won't drop enough because of the pending read). In such rare case,
+ * return to allow the removal operation to complete.
+ */
+ if (unlikely(kobj->state_initialized)) {
+ netdev_warn_once(dev, "Cannot re-add rx queues before their removal completed");
+ return -EAGAIN;
+ }
+
+ /* Kobject_put later will trigger rx_queue_release call which
+ * decreases dev refcount: Take that reference here
+ */
+ netdev_hold(queue->dev, &queue->dev_tracker, GFP_KERNEL);
+
+ kobj->kset = dev->queues_kset;
+ error = kobject_init_and_add(kobj, &rx_queue_ktype, NULL,
+ "rx-%u", index);
+ if (error)
+ goto err;
+
+ queue->groups = rx_queue_default_groups;
+ error = sysfs_create_groups(kobj, queue->groups);
+ if (error)
+ goto err;
+
+ if (dev->sysfs_rx_queue_group) {
+ error = sysfs_create_group(kobj, dev->sysfs_rx_queue_group);
+ if (error)
+ goto err_default_groups;
+ }
+
+ error = rx_queue_default_mask(dev, queue);
+ if (error)
+ goto err_default_groups;
+
+ kobject_uevent(kobj, KOBJ_ADD);
+
+ return error;
+
+err_default_groups:
+ sysfs_remove_groups(kobj, queue->groups);
+err:
+ kobject_put(kobj);
+ return error;
+}
+
+static int rx_queue_change_owner(struct net_device *dev, int index, kuid_t kuid,
+ kgid_t kgid)
+{
+ struct netdev_rx_queue *queue = dev->_rx + index;
+ struct kobject *kobj = &queue->kobj;
+ int error;
+
+ error = sysfs_change_owner(kobj, kuid, kgid);
+ if (error)
+ return error;
+
+ if (dev->sysfs_rx_queue_group)
+ error = sysfs_group_change_owner(
+ kobj, dev->sysfs_rx_queue_group, kuid, kgid);
+
+ return error;
+}
+#endif /* CONFIG_SYSFS */
+
+int
+net_rx_queue_update_kobjects(struct net_device *dev, int old_num, int new_num)
+{
+#ifdef CONFIG_SYSFS
+ int i;
+ int error = 0;
+
+#ifndef CONFIG_RPS
+ if (!dev->sysfs_rx_queue_group)
+ return 0;
+#endif
+ for (i = old_num; i < new_num; i++) {
+ error = rx_queue_add_kobject(dev, i);
+ if (error) {
+ new_num = old_num;
+ break;
+ }
+ }
+
+ while (--i >= new_num) {
+ struct netdev_rx_queue *queue = &dev->_rx[i];
+ struct kobject *kobj = &queue->kobj;
+
+ if (!check_net(dev_net(dev)))
+ kobj->uevent_suppress = 1;
+ if (dev->sysfs_rx_queue_group)
+ sysfs_remove_group(kobj, dev->sysfs_rx_queue_group);
+ sysfs_remove_groups(kobj, queue->groups);
+ kobject_put(kobj);
+ }
+
+ return error;
+#else
+ return 0;
+#endif
+}
+
+static int net_rx_queue_change_owner(struct net_device *dev, int num,
+ kuid_t kuid, kgid_t kgid)
+{
+#ifdef CONFIG_SYSFS
+ int error = 0;
+ int i;
+
+#ifndef CONFIG_RPS
+ if (!dev->sysfs_rx_queue_group)
+ return 0;
+#endif
+ for (i = 0; i < num; i++) {
+ error = rx_queue_change_owner(dev, i, kuid, kgid);
+ if (error)
+ break;
+ }
+
+ return error;
+#else
+ return 0;
+#endif
+}
+
+#ifdef CONFIG_SYSFS
+/*
+ * netdev_queue sysfs structures and functions.
+ */
+struct netdev_queue_attribute {
+ struct attribute attr;
+ ssize_t (*show)(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, char *buf);
+ ssize_t (*store)(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, const char *buf,
+ size_t len);
+};
+#define to_netdev_queue_attr(_attr) \
+ container_of(_attr, struct netdev_queue_attribute, attr)
+
+#define to_netdev_queue(obj) container_of(obj, struct netdev_queue, kobj)
+
+static ssize_t netdev_queue_attr_show(struct kobject *kobj,
+ struct attribute *attr, char *buf)
+{
+ const struct netdev_queue_attribute *attribute
+ = to_netdev_queue_attr(attr);
+ struct netdev_queue *queue = to_netdev_queue(kobj);
+
+ if (!attribute->show)
+ return -EIO;
+
+ return attribute->show(kobj, attr, queue, buf);
+}
+
+static ssize_t netdev_queue_attr_store(struct kobject *kobj,
+ struct attribute *attr,
+ const char *buf, size_t count)
+{
+ const struct netdev_queue_attribute *attribute
+ = to_netdev_queue_attr(attr);
+ struct netdev_queue *queue = to_netdev_queue(kobj);
+
+ if (!attribute->store)
+ return -EIO;
+
+ return attribute->store(kobj, attr, queue, buf, count);
+}
+
+static const struct sysfs_ops netdev_queue_sysfs_ops = {
+ .show = netdev_queue_attr_show,
+ .store = netdev_queue_attr_store,
+};
+
+static ssize_t tx_timeout_show(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, char *buf)
+{
+ unsigned long trans_timeout = atomic_long_read(&queue->trans_timeout);
+
+ return sysfs_emit(buf, fmt_ulong, trans_timeout);
+}
+
+static unsigned int get_netdev_queue_index(struct netdev_queue *queue)
+{
+ struct net_device *dev = queue->dev;
+ unsigned int i;
+
+ i = queue - dev->_tx;
+ BUG_ON(i >= dev->num_tx_queues);
+
+ return i;
+}
+
+static ssize_t traffic_class_show(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, char *buf)
+{
+ struct net_device *dev = queue->dev;
+ int num_tc, tc, index, ret;
+
+ if (!netif_is_multiqueue(dev))
+ return -ENOENT;
+
+ ret = sysfs_rtnl_lock(kobj, attr, queue->dev);
+ if (ret)
+ return ret;
+
+ index = get_netdev_queue_index(queue);
+
+ /* If queue belongs to subordinate dev use its TC mapping */
+ dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
+
+ num_tc = dev->num_tc;
+ tc = netdev_txq_to_tc(dev, index);
+
+ rtnl_unlock();
+
+ if (tc < 0)
+ return -EINVAL;
+
+ /* We can report the traffic class one of two ways:
+ * Subordinate device traffic classes are reported with the traffic
+ * class first, and then the subordinate class so for example TC0 on
+ * subordinate device 2 will be reported as "0-2". If the queue
+ * belongs to the root device it will be reported with just the
+ * traffic class, so just "0" for TC 0 for example.
+ */
+ return num_tc < 0 ? sysfs_emit(buf, "%d%d\n", tc, num_tc) :
+ sysfs_emit(buf, "%d\n", tc);
+}
+
+#ifdef CONFIG_XPS
+static ssize_t tx_maxrate_show(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, char *buf)
+{
+ return sysfs_emit(buf, "%lu\n", queue->tx_maxrate);
+}
+
+static ssize_t tx_maxrate_store(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, const char *buf,
+ size_t len)
+{
+ int err, index = get_netdev_queue_index(queue);
+ struct net_device *dev = queue->dev;
+ u32 rate = 0;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ /* The check is also done later; this helps returning early without
+ * hitting the locking section below.
+ */
+ if (!dev->netdev_ops->ndo_set_tx_maxrate)
+ return -EOPNOTSUPP;
+
+ err = kstrtou32(buf, 10, &rate);
+ if (err < 0)
+ return err;
+
+ err = sysfs_rtnl_lock(kobj, attr, dev);
+ if (err)
+ return err;
+
+ err = -EOPNOTSUPP;
+ netdev_lock_ops(dev);
+ if (dev->netdev_ops->ndo_set_tx_maxrate)
+ err = dev->netdev_ops->ndo_set_tx_maxrate(dev, index, rate);
+ netdev_unlock_ops(dev);
+
+ if (!err) {
+ queue->tx_maxrate = rate;
+ rtnl_unlock();
+ return len;
+ }
+
+ rtnl_unlock();
+ return err;
+}
+
+static struct netdev_queue_attribute queue_tx_maxrate __ro_after_init
+ = __ATTR_RW(tx_maxrate);
+#endif
+
+static struct netdev_queue_attribute queue_trans_timeout __ro_after_init
+ = __ATTR_RO(tx_timeout);
+
+static struct netdev_queue_attribute queue_traffic_class __ro_after_init
+ = __ATTR_RO(traffic_class);
+
+#ifdef CONFIG_BQL
+/*
+ * Byte queue limits sysfs structures and functions.
+ */
+static ssize_t bql_show(char *buf, unsigned int value)
+{
+ return sysfs_emit(buf, "%u\n", value);
+}
+
+static ssize_t bql_set(const char *buf, const size_t count,
+ unsigned int *pvalue)
+{
+ unsigned int value;
+ int err;
+
+ if (!strcmp(buf, "max") || !strcmp(buf, "max\n")) {
+ value = DQL_MAX_LIMIT;
+ } else {
+ err = kstrtouint(buf, 10, &value);
+ if (err < 0)
+ return err;
+ if (value > DQL_MAX_LIMIT)
+ return -EINVAL;
+ }
+
+ *pvalue = value;
+
+ return count;
+}
+
+static ssize_t bql_show_hold_time(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, char *buf)
+{
+ struct dql *dql = &queue->dql;
+
+ return sysfs_emit(buf, "%u\n", jiffies_to_msecs(dql->slack_hold_time));
+}
+
+static ssize_t bql_set_hold_time(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, const char *buf,
+ size_t len)
+{
+ struct dql *dql = &queue->dql;
+ unsigned int value;
+ int err;
+
+ err = kstrtouint(buf, 10, &value);
+ if (err < 0)
+ return err;
+
+ dql->slack_hold_time = msecs_to_jiffies(value);
+
+ return len;
+}
+
+static struct netdev_queue_attribute bql_hold_time_attribute __ro_after_init
+ = __ATTR(hold_time, 0644,
+ bql_show_hold_time, bql_set_hold_time);
+
+static ssize_t bql_show_stall_thrs(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, char *buf)
+{
+ struct dql *dql = &queue->dql;
+
+ return sysfs_emit(buf, "%u\n", jiffies_to_msecs(dql->stall_thrs));
+}
+
+static ssize_t bql_set_stall_thrs(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, const char *buf,
+ size_t len)
+{
+ struct dql *dql = &queue->dql;
+ unsigned int value;
+ int err;
+
+ err = kstrtouint(buf, 10, &value);
+ if (err < 0)
+ return err;
+
+ value = msecs_to_jiffies(value);
+ if (value && (value < 4 || value > 4 / 2 * BITS_PER_LONG))
+ return -ERANGE;
+
+ if (!dql->stall_thrs && value)
+ dql->last_reap = jiffies;
+ /* Force last_reap to be live */
+ smp_wmb();
+ dql->stall_thrs = value;
+
+ return len;
+}
+
+static struct netdev_queue_attribute bql_stall_thrs_attribute __ro_after_init =
+ __ATTR(stall_thrs, 0644, bql_show_stall_thrs, bql_set_stall_thrs);
+
+static ssize_t bql_show_stall_max(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, char *buf)
+{
+ return sysfs_emit(buf, "%u\n", READ_ONCE(queue->dql.stall_max));
+}
+
+static ssize_t bql_set_stall_max(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, const char *buf,
+ size_t len)
+{
+ WRITE_ONCE(queue->dql.stall_max, 0);
+ return len;
}
-/* show function template for wireless fields */
-#define WIRELESS_SHOW(name, field, format_string) \
-static ssize_t format_iw_##name(const struct iw_statistics *iw, char *buf) \
+static struct netdev_queue_attribute bql_stall_max_attribute __ro_after_init =
+ __ATTR(stall_max, 0644, bql_show_stall_max, bql_set_stall_max);
+
+static ssize_t bql_show_stall_cnt(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, char *buf)
+{
+ struct dql *dql = &queue->dql;
+
+ return sysfs_emit(buf, "%lu\n", dql->stall_cnt);
+}
+
+static struct netdev_queue_attribute bql_stall_cnt_attribute __ro_after_init =
+ __ATTR(stall_cnt, 0444, bql_show_stall_cnt, NULL);
+
+static ssize_t bql_show_inflight(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, char *buf)
+{
+ struct dql *dql = &queue->dql;
+
+ return sysfs_emit(buf, "%u\n", dql->num_queued - dql->num_completed);
+}
+
+static struct netdev_queue_attribute bql_inflight_attribute __ro_after_init =
+ __ATTR(inflight, 0444, bql_show_inflight, NULL);
+
+#define BQL_ATTR(NAME, FIELD) \
+static ssize_t bql_show_ ## NAME(struct kobject *kobj, \
+ struct attribute *attr, \
+ struct netdev_queue *queue, char *buf) \
{ \
- return sprintf(buf, format_string, iw->field); \
+ return bql_show(buf, queue->dql.FIELD); \
} \
-static ssize_t show_iw_##name(struct class_device *cd, char *buf) \
+ \
+static ssize_t bql_set_ ## NAME(struct kobject *kobj, \
+ struct attribute *attr, \
+ struct netdev_queue *queue, \
+ const char *buf, size_t len) \
{ \
- return wireless_show(cd, buf, format_iw_##name); \
+ return bql_set(buf, len, &queue->dql.FIELD); \
} \
-static CLASS_DEVICE_ATTR(name, S_IRUGO, show_iw_##name, NULL)
-
-WIRELESS_SHOW(status, status, fmt_hex);
-WIRELESS_SHOW(link, qual.qual, fmt_dec);
-WIRELESS_SHOW(level, qual.level, fmt_dec);
-WIRELESS_SHOW(noise, qual.noise, fmt_dec);
-WIRELESS_SHOW(nwid, discard.nwid, fmt_dec);
-WIRELESS_SHOW(crypt, discard.code, fmt_dec);
-WIRELESS_SHOW(fragment, discard.fragment, fmt_dec);
-WIRELESS_SHOW(misc, discard.misc, fmt_dec);
-WIRELESS_SHOW(retries, discard.retries, fmt_dec);
-WIRELESS_SHOW(beacon, miss.beacon, fmt_dec);
-
-static struct attribute *wireless_attrs[] = {
- &class_device_attr_status.attr,
- &class_device_attr_link.attr,
- &class_device_attr_level.attr,
- &class_device_attr_noise.attr,
- &class_device_attr_nwid.attr,
- &class_device_attr_crypt.attr,
- &class_device_attr_fragment.attr,
- &class_device_attr_retries.attr,
- &class_device_attr_misc.attr,
- &class_device_attr_beacon.attr,
+ \
+static struct netdev_queue_attribute bql_ ## NAME ## _attribute __ro_after_init \
+ = __ATTR(NAME, 0644, \
+ bql_show_ ## NAME, bql_set_ ## NAME)
+
+BQL_ATTR(limit, limit);
+BQL_ATTR(limit_max, max_limit);
+BQL_ATTR(limit_min, min_limit);
+
+static struct attribute *dql_attrs[] __ro_after_init = {
+ &bql_limit_attribute.attr,
+ &bql_limit_max_attribute.attr,
+ &bql_limit_min_attribute.attr,
+ &bql_hold_time_attribute.attr,
+ &bql_inflight_attribute.attr,
+ &bql_stall_thrs_attribute.attr,
+ &bql_stall_cnt_attribute.attr,
+ &bql_stall_max_attribute.attr,
NULL
};
-static struct attribute_group wireless_group = {
- .name = "wireless",
- .attrs = wireless_attrs,
+static const struct attribute_group dql_group = {
+ .name = "byte_queue_limits",
+ .attrs = dql_attrs,
};
+#else
+/* Fake declaration, all the code using it should be dead */
+static const struct attribute_group dql_group = {};
+#endif /* CONFIG_BQL */
+
+#ifdef CONFIG_XPS
+static ssize_t xps_queue_show(struct net_device *dev, unsigned int index,
+ int tc, char *buf, enum xps_map_type type)
+{
+ struct xps_dev_maps *dev_maps;
+ unsigned long *mask;
+ unsigned int nr_ids;
+ int j, len;
+
+ rcu_read_lock();
+ dev_maps = rcu_dereference(dev->xps_maps[type]);
+
+ /* Default to nr_cpu_ids/dev->num_rx_queues and do not just return 0
+ * when dev_maps hasn't been allocated yet, to be backward compatible.
+ */
+ nr_ids = dev_maps ? dev_maps->nr_ids :
+ (type == XPS_CPUS ? nr_cpu_ids : dev->num_rx_queues);
+
+ mask = bitmap_zalloc(nr_ids, GFP_NOWAIT);
+ if (!mask) {
+ rcu_read_unlock();
+ return -ENOMEM;
+ }
+
+ if (!dev_maps || tc >= dev_maps->num_tc)
+ goto out_no_maps;
+
+ for (j = 0; j < nr_ids; j++) {
+ int i, tci = j * dev_maps->num_tc + tc;
+ struct xps_map *map;
+
+ map = rcu_dereference(dev_maps->attr_map[tci]);
+ if (!map)
+ continue;
+
+ for (i = map->len; i--;) {
+ if (map->queues[i] == index) {
+ __set_bit(j, mask);
+ break;
+ }
+ }
+ }
+out_no_maps:
+ rcu_read_unlock();
+
+ len = bitmap_print_to_pagebuf(false, buf, mask, nr_ids);
+ bitmap_free(mask);
+
+ return len < PAGE_SIZE ? len : -EINVAL;
+}
+
+static ssize_t xps_cpus_show(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, char *buf)
+{
+ struct net_device *dev = queue->dev;
+ unsigned int index;
+ int len, tc, ret;
+
+ if (!netif_is_multiqueue(dev))
+ return -ENOENT;
+
+ index = get_netdev_queue_index(queue);
+
+ ret = sysfs_rtnl_lock(kobj, attr, queue->dev);
+ if (ret)
+ return ret;
+
+ /* If queue belongs to subordinate dev use its map */
+ dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
+
+ tc = netdev_txq_to_tc(dev, index);
+ if (tc < 0) {
+ rtnl_unlock();
+ return -EINVAL;
+ }
+
+ /* Increase the net device refcnt to make sure it won't be freed while
+ * xps_queue_show is running.
+ */
+ dev_hold(dev);
+ rtnl_unlock();
+
+ len = xps_queue_show(dev, index, tc, buf, XPS_CPUS);
+
+ dev_put(dev);
+ return len;
+}
+
+static ssize_t xps_cpus_store(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, const char *buf,
+ size_t len)
+{
+ struct net_device *dev = queue->dev;
+ unsigned int index;
+ cpumask_var_t mask;
+ int err;
+
+ if (!netif_is_multiqueue(dev))
+ return -ENOENT;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ index = get_netdev_queue_index(queue);
+
+ err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits);
+ if (err) {
+ free_cpumask_var(mask);
+ return err;
+ }
+
+ err = sysfs_rtnl_lock(kobj, attr, dev);
+ if (err) {
+ free_cpumask_var(mask);
+ return err;
+ }
+
+ err = netif_set_xps_queue(dev, mask, index);
+ rtnl_unlock();
+
+ free_cpumask_var(mask);
+
+ return err ? : len;
+}
+
+static struct netdev_queue_attribute xps_cpus_attribute __ro_after_init
+ = __ATTR_RW(xps_cpus);
+
+static ssize_t xps_rxqs_show(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, char *buf)
+{
+ struct net_device *dev = queue->dev;
+ unsigned int index;
+ int tc, ret;
+
+ index = get_netdev_queue_index(queue);
+
+ ret = sysfs_rtnl_lock(kobj, attr, dev);
+ if (ret)
+ return ret;
+
+ tc = netdev_txq_to_tc(dev, index);
+
+ /* Increase the net device refcnt to make sure it won't be freed while
+ * xps_queue_show is running.
+ */
+ dev_hold(dev);
+ rtnl_unlock();
+
+ ret = tc >= 0 ? xps_queue_show(dev, index, tc, buf, XPS_RXQS) : -EINVAL;
+ dev_put(dev);
+ return ret;
+}
+
+static ssize_t xps_rxqs_store(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, const char *buf,
+ size_t len)
+{
+ struct net_device *dev = queue->dev;
+ struct net *net = dev_net(dev);
+ unsigned long *mask;
+ unsigned int index;
+ int err;
+
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ mask = bitmap_zalloc(dev->num_rx_queues, GFP_KERNEL);
+ if (!mask)
+ return -ENOMEM;
+
+ index = get_netdev_queue_index(queue);
+
+ err = bitmap_parse(buf, len, mask, dev->num_rx_queues);
+ if (err) {
+ bitmap_free(mask);
+ return err;
+ }
+
+ err = sysfs_rtnl_lock(kobj, attr, dev);
+ if (err) {
+ bitmap_free(mask);
+ return err;
+ }
+
+ cpus_read_lock();
+ err = __netif_set_xps_queue(dev, mask, index, XPS_RXQS);
+ cpus_read_unlock();
+
+ rtnl_unlock();
+
+ bitmap_free(mask);
+ return err ? : len;
+}
+
+static struct netdev_queue_attribute xps_rxqs_attribute __ro_after_init
+ = __ATTR_RW(xps_rxqs);
+#endif /* CONFIG_XPS */
+
+static struct attribute *netdev_queue_default_attrs[] __ro_after_init = {
+ &queue_trans_timeout.attr,
+ &queue_traffic_class.attr,
+#ifdef CONFIG_XPS
+ &xps_cpus_attribute.attr,
+ &xps_rxqs_attribute.attr,
+ &queue_tx_maxrate.attr,
#endif
+ NULL
+};
+ATTRIBUTE_GROUPS(netdev_queue_default);
-#ifdef CONFIG_HOTPLUG
-static int netdev_uevent(struct class_device *cd, char **envp,
- int num_envp, char *buf, int size)
+static void netdev_queue_release(struct kobject *kobj)
{
- struct net_device *dev = to_net_dev(cd);
- int i = 0;
- int n;
+ struct netdev_queue *queue = to_netdev_queue(kobj);
- /* pass interface to uevent. */
- envp[i++] = buf;
- n = snprintf(buf, size, "INTERFACE=%s", dev->name) + 1;
- buf += n;
- size -= n;
+ memset(kobj, 0, sizeof(*kobj));
+ netdev_put(queue->dev, &queue->dev_tracker);
+}
- if ((size <= 0) || (i >= num_envp))
+static const void *netdev_queue_namespace(const struct kobject *kobj)
+{
+ struct netdev_queue *queue = to_netdev_queue(kobj);
+ struct device *dev = &queue->dev->dev;
+ const void *ns = NULL;
+
+ if (dev->class && dev->class->namespace)
+ ns = dev->class->namespace(dev);
+
+ return ns;
+}
+
+static void netdev_queue_get_ownership(const struct kobject *kobj,
+ kuid_t *uid, kgid_t *gid)
+{
+ const struct net *net = netdev_queue_namespace(kobj);
+
+ net_ns_get_ownership(net, uid, gid);
+}
+
+static const struct kobj_type netdev_queue_ktype = {
+ .sysfs_ops = &netdev_queue_sysfs_ops,
+ .release = netdev_queue_release,
+ .namespace = netdev_queue_namespace,
+ .get_ownership = netdev_queue_get_ownership,
+};
+
+static bool netdev_uses_bql(const struct net_device *dev)
+{
+ if (dev->lltx || (dev->priv_flags & IFF_NO_QUEUE))
+ return false;
+
+ return IS_ENABLED(CONFIG_BQL);
+}
+
+static int netdev_queue_add_kobject(struct net_device *dev, int index)
+{
+ struct netdev_queue *queue = dev->_tx + index;
+ struct kobject *kobj = &queue->kobj;
+ int error = 0;
+
+ /* Tx queues are cleared in netdev_queue_release to allow later
+ * re-registration. This is triggered when their kobj refcount is
+ * dropped.
+ *
+ * If a queue is removed while both a read (or write) operation and a
+ * the re-addition of the same queue are pending (waiting on rntl_lock)
+ * it might happen that the re-addition will execute before the read,
+ * making the initial removal to never happen (queue's kobj refcount
+ * won't drop enough because of the pending read). In such rare case,
+ * return to allow the removal operation to complete.
+ */
+ if (unlikely(kobj->state_initialized)) {
+ netdev_warn_once(dev, "Cannot re-add tx queues before their removal completed");
+ return -EAGAIN;
+ }
+
+ /* Kobject_put later will trigger netdev_queue_release call
+ * which decreases dev refcount: Take that reference here
+ */
+ netdev_hold(queue->dev, &queue->dev_tracker, GFP_KERNEL);
+
+ kobj->kset = dev->queues_kset;
+ error = kobject_init_and_add(kobj, &netdev_queue_ktype, NULL,
+ "tx-%u", index);
+ if (error)
+ goto err;
+
+ queue->groups = netdev_queue_default_groups;
+ error = sysfs_create_groups(kobj, queue->groups);
+ if (error)
+ goto err;
+
+ if (netdev_uses_bql(dev)) {
+ error = sysfs_create_group(kobj, &dql_group);
+ if (error)
+ goto err_default_groups;
+ }
+
+ kobject_uevent(kobj, KOBJ_ADD);
+ return 0;
+
+err_default_groups:
+ sysfs_remove_groups(kobj, queue->groups);
+err:
+ kobject_put(kobj);
+ return error;
+}
+
+static int tx_queue_change_owner(struct net_device *ndev, int index,
+ kuid_t kuid, kgid_t kgid)
+{
+ struct netdev_queue *queue = ndev->_tx + index;
+ struct kobject *kobj = &queue->kobj;
+ int error;
+
+ error = sysfs_change_owner(kobj, kuid, kgid);
+ if (error)
+ return error;
+
+ if (netdev_uses_bql(ndev))
+ error = sysfs_group_change_owner(kobj, &dql_group, kuid, kgid);
+
+ return error;
+}
+#endif /* CONFIG_SYSFS */
+
+int
+netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num)
+{
+#ifdef CONFIG_SYSFS
+ int i;
+ int error = 0;
+
+ /* Tx queue kobjects are allowed to be updated when a device is being
+ * unregistered, but solely to remove queues from qdiscs. Any path
+ * adding queues should be fixed.
+ */
+ WARN(dev->reg_state == NETREG_UNREGISTERING && new_num > old_num,
+ "New queues can't be registered after device unregistration.");
+
+ for (i = old_num; i < new_num; i++) {
+ error = netdev_queue_add_kobject(dev, i);
+ if (error) {
+ new_num = old_num;
+ break;
+ }
+ }
+
+ while (--i >= new_num) {
+ struct netdev_queue *queue = dev->_tx + i;
+
+ if (!check_net(dev_net(dev)))
+ queue->kobj.uevent_suppress = 1;
+
+ if (netdev_uses_bql(dev))
+ sysfs_remove_group(&queue->kobj, &dql_group);
+
+ sysfs_remove_groups(&queue->kobj, queue->groups);
+ kobject_put(&queue->kobj);
+ }
+
+ return error;
+#else
+ return 0;
+#endif /* CONFIG_SYSFS */
+}
+
+static int net_tx_queue_change_owner(struct net_device *dev, int num,
+ kuid_t kuid, kgid_t kgid)
+{
+#ifdef CONFIG_SYSFS
+ int error = 0;
+ int i;
+
+ for (i = 0; i < num; i++) {
+ error = tx_queue_change_owner(dev, i, kuid, kgid);
+ if (error)
+ break;
+ }
+
+ return error;
+#else
+ return 0;
+#endif /* CONFIG_SYSFS */
+}
+
+static int register_queue_kobjects(struct net_device *dev)
+{
+ int error = 0, txq = 0, rxq = 0, real_rx = 0, real_tx = 0;
+
+#ifdef CONFIG_SYSFS
+ dev->queues_kset = kset_create_and_add("queues",
+ NULL, &dev->dev.kobj);
+ if (!dev->queues_kset)
return -ENOMEM;
+ real_rx = dev->real_num_rx_queues;
+#endif
+ real_tx = dev->real_num_tx_queues;
+
+ error = net_rx_queue_update_kobjects(dev, 0, real_rx);
+ if (error)
+ goto error;
+ rxq = real_rx;
+
+ error = netdev_queue_update_kobjects(dev, 0, real_tx);
+ if (error)
+ goto error;
+ txq = real_tx;
- envp[i] = NULL;
return 0;
+
+error:
+ netdev_queue_update_kobjects(dev, txq, 0);
+ net_rx_queue_update_kobjects(dev, rxq, 0);
+#ifdef CONFIG_SYSFS
+ kset_unregister(dev->queues_kset);
+#endif
+ return error;
}
+
+static int queue_change_owner(struct net_device *ndev, kuid_t kuid, kgid_t kgid)
+{
+ int error = 0, real_rx = 0, real_tx = 0;
+
+#ifdef CONFIG_SYSFS
+ if (ndev->queues_kset) {
+ error = sysfs_change_owner(&ndev->queues_kset->kobj, kuid, kgid);
+ if (error)
+ return error;
+ }
+ real_rx = ndev->real_num_rx_queues;
#endif
+ real_tx = ndev->real_num_tx_queues;
+
+ error = net_rx_queue_change_owner(ndev, real_rx, kuid, kgid);
+ if (error)
+ return error;
+
+ error = net_tx_queue_change_owner(ndev, real_tx, kuid, kgid);
+ if (error)
+ return error;
+
+ return 0;
+}
+
+static void remove_queue_kobjects(struct net_device *dev)
+{
+ int real_rx = 0, real_tx = 0;
+
+#ifdef CONFIG_SYSFS
+ real_rx = dev->real_num_rx_queues;
+#endif
+ real_tx = dev->real_num_tx_queues;
+
+ net_rx_queue_update_kobjects(dev, real_rx, 0);
+ netdev_queue_update_kobjects(dev, real_tx, 0);
+
+ netdev_lock_ops(dev);
+ dev->real_num_rx_queues = 0;
+ dev->real_num_tx_queues = 0;
+ netdev_unlock_ops(dev);
+#ifdef CONFIG_SYSFS
+ kset_unregister(dev->queues_kset);
+#endif
+}
+
+static bool net_current_may_mount(void)
+{
+ struct net *net = current->nsproxy->net_ns;
+
+ return ns_capable(net->user_ns, CAP_SYS_ADMIN);
+}
+
+static void *net_grab_current_ns(void)
+{
+ struct net *ns = current->nsproxy->net_ns;
+#ifdef CONFIG_NET_NS
+ if (ns)
+ refcount_inc(&ns->passive);
+#endif
+ return ns;
+}
+
+static const void *net_initial_ns(void)
+{
+ return &init_net;
+}
+
+static const void *net_netlink_ns(struct sock *sk)
+{
+ return sock_net(sk);
+}
+
+const struct kobj_ns_type_operations net_ns_type_operations = {
+ .type = KOBJ_NS_TYPE_NET,
+ .current_may_mount = net_current_may_mount,
+ .grab_current_ns = net_grab_current_ns,
+ .netlink_ns = net_netlink_ns,
+ .initial_ns = net_initial_ns,
+ .drop_ns = net_drop_ns,
+};
+EXPORT_SYMBOL_GPL(net_ns_type_operations);
+
+static int netdev_uevent(const struct device *d, struct kobj_uevent_env *env)
+{
+ const struct net_device *dev = to_net_dev(d);
+ int retval;
+
+ /* pass interface to uevent. */
+ retval = add_uevent_var(env, "INTERFACE=%s", dev->name);
+ if (retval)
+ goto exit;
+
+ /* pass ifindex to uevent.
+ * ifindex is useful as it won't change (interface name may change)
+ * and is what RtNetlink uses natively.
+ */
+ retval = add_uevent_var(env, "IFINDEX=%d", dev->ifindex);
+
+exit:
+ return retval;
+}
/*
- * netdev_release -- destroy and free a dead device.
- * Called when last reference to class_device kobject is gone.
+ * netdev_release -- destroy and free a dead device.
+ * Called when last reference to device kobject is gone.
*/
-static void netdev_release(struct class_device *cd)
+static void netdev_release(struct device *d)
{
- struct net_device *dev
- = container_of(cd, struct net_device, class_dev);
+ struct net_device *dev = to_net_dev(d);
BUG_ON(dev->reg_state != NETREG_RELEASED);
- kfree((char *)dev - dev->padded);
+ /* no need to wait for rcu grace period:
+ * device is dead and about to be freed.
+ */
+ kfree(rcu_access_pointer(dev->ifalias));
+ kvfree(dev);
+}
+
+static const void *net_namespace(const struct device *d)
+{
+ const struct net_device *dev = to_net_dev(d);
+
+ return dev_net(dev);
+}
+
+static void net_get_ownership(const struct device *d, kuid_t *uid, kgid_t *gid)
+{
+ const struct net_device *dev = to_net_dev(d);
+ const struct net *net = dev_net(dev);
+
+ net_ns_get_ownership(net, uid, gid);
}
-static struct class net_class = {
+static const struct class net_class = {
.name = "net",
- .release = netdev_release,
- .class_dev_attrs = net_class_attributes,
-#ifdef CONFIG_HOTPLUG
- .uevent = netdev_uevent,
-#endif
+ .dev_release = netdev_release,
+ .dev_groups = net_class_groups,
+ .dev_uevent = netdev_uevent,
+ .ns_type = &net_ns_type_operations,
+ .namespace = net_namespace,
+ .get_ownership = net_get_ownership,
};
-void netdev_unregister_sysfs(struct net_device * net)
+#ifdef CONFIG_OF
+static int of_dev_node_match(struct device *dev, const void *data)
+{
+ for (; dev; dev = dev->parent) {
+ if (dev->of_node == data)
+ return 1;
+ }
+
+ return 0;
+}
+
+/*
+ * of_find_net_device_by_node - lookup the net device for the device node
+ * @np: OF device node
+ *
+ * Looks up the net_device structure corresponding with the device node.
+ * If successful, returns a pointer to the net_device with the embedded
+ * struct device refcount incremented by one, or NULL on failure. The
+ * refcount must be dropped when done with the net_device.
+ */
+struct net_device *of_find_net_device_by_node(struct device_node *np)
{
- class_device_del(&(net->class_dev));
+ struct device *dev;
+
+ dev = class_find_device(&net_class, NULL, np, of_dev_node_match);
+ if (!dev)
+ return NULL;
+
+ return to_net_dev(dev);
+}
+EXPORT_SYMBOL(of_find_net_device_by_node);
+#endif
+
+/* Delete sysfs entries but hold kobject reference until after all
+ * netdev references are gone.
+ */
+void netdev_unregister_kobject(struct net_device *ndev)
+{
+ struct device *dev = &ndev->dev;
+
+ if (!check_net(dev_net(ndev)))
+ dev_set_uevent_suppress(dev, 1);
+
+ kobject_get(&dev->kobj);
+
+ remove_queue_kobjects(ndev);
+
+ pm_runtime_set_memalloc_noio(dev, false);
+
+ device_del(dev);
}
/* Create sysfs entries for network device. */
-int netdev_register_sysfs(struct net_device *net)
+int netdev_register_kobject(struct net_device *ndev)
{
- struct class_device *class_dev = &(net->class_dev);
- struct attribute_group **groups = net->sysfs_groups;
+ struct device *dev = &ndev->dev;
+ const struct attribute_group **groups = ndev->sysfs_groups;
+ int error = 0;
- class_device_initialize(class_dev);
- class_dev->class = &net_class;
- class_dev->class_data = net;
- class_dev->groups = groups;
+ device_initialize(dev);
+ dev->class = &net_class;
+ dev->platform_data = ndev;
+ dev->groups = groups;
- BUILD_BUG_ON(BUS_ID_SIZE < IFNAMSIZ);
- strlcpy(class_dev->class_id, net->name, BUS_ID_SIZE);
+ dev_set_name(dev, "%s", ndev->name);
- if (net->get_stats)
- *groups++ = &netstat_group;
+#ifdef CONFIG_SYSFS
+ /* Allow for a device specific group */
+ if (*groups)
+ groups++;
-#ifdef WIRELESS_EXT
- if (net->wireless_handlers && net->wireless_handlers->get_wireless_stats)
+ *groups++ = &netstat_group;
+ *groups++ = &netdev_phys_group;
+
+ if (wireless_group_needed(ndev))
*groups++ = &wireless_group;
-#endif
+#endif /* CONFIG_SYSFS */
+
+ error = device_add(dev);
+ if (error)
+ return error;
+
+ error = register_queue_kobjects(ndev);
+ if (error) {
+ device_del(dev);
+ return error;
+ }
+
+ pm_runtime_set_memalloc_noio(dev, true);
+
+ return error;
+}
+
+/* Change owner for sysfs entries when moving network devices across network
+ * namespaces owned by different user namespaces.
+ */
+int netdev_change_owner(struct net_device *ndev, const struct net *net_old,
+ const struct net *net_new)
+{
+ kuid_t old_uid = GLOBAL_ROOT_UID, new_uid = GLOBAL_ROOT_UID;
+ kgid_t old_gid = GLOBAL_ROOT_GID, new_gid = GLOBAL_ROOT_GID;
+ struct device *dev = &ndev->dev;
+ int error;
- return class_device_add(class_dev);
+ net_ns_get_ownership(net_old, &old_uid, &old_gid);
+ net_ns_get_ownership(net_new, &new_uid, &new_gid);
+
+ /* The network namespace was changed but the owning user namespace is
+ * identical so there's no need to change the owner of sysfs entries.
+ */
+ if (uid_eq(old_uid, new_uid) && gid_eq(old_gid, new_gid))
+ return 0;
+
+ error = device_change_owner(dev, new_uid, new_gid);
+ if (error)
+ return error;
+
+ error = queue_change_owner(ndev, new_uid, new_gid);
+ if (error)
+ return error;
+
+ return 0;
+}
+
+int netdev_class_create_file_ns(const struct class_attribute *class_attr,
+ const void *ns)
+{
+ return class_create_file_ns(&net_class, class_attr, ns);
+}
+EXPORT_SYMBOL(netdev_class_create_file_ns);
+
+void netdev_class_remove_file_ns(const struct class_attribute *class_attr,
+ const void *ns)
+{
+ class_remove_file_ns(&net_class, class_attr, ns);
}
+EXPORT_SYMBOL(netdev_class_remove_file_ns);
-int netdev_sysfs_init(void)
+int __init netdev_kobject_init(void)
{
+ kobj_ns_type_register(&net_ns_type_operations);
return class_register(&net_class);
}
diff --git a/net/core/net-sysfs.h b/net/core/net-sysfs.h
new file mode 100644
index 000000000000..e938f25e8e86
--- /dev/null
+++ b/net/core/net-sysfs.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __NET_SYSFS_H__
+#define __NET_SYSFS_H__
+
+int __init netdev_kobject_init(void);
+int netdev_register_kobject(struct net_device *);
+void netdev_unregister_kobject(struct net_device *);
+int net_rx_queue_update_kobjects(struct net_device *, int old_num, int new_num);
+int netdev_queue_update_kobjects(struct net_device *net,
+ int old_num, int new_num);
+int netdev_change_owner(struct net_device *, const struct net *net_old,
+ const struct net *net_new);
+
+extern struct mutex rps_default_mask_mutex;
+
+#endif
diff --git a/net/core/net-traces.c b/net/core/net-traces.c
new file mode 100644
index 000000000000..f2fa34b1d78d
--- /dev/null
+++ b/net/core/net-traces.c
@@ -0,0 +1,68 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * consolidates trace point definitions
+ *
+ * Copyright (C) 2009 Neil Horman <nhorman@tuxdriver.com>
+ */
+
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/string.h>
+#include <linux/if_arp.h>
+#include <linux/inetdevice.h>
+#include <linux/inet.h>
+#include <linux/interrupt.h>
+#include <linux/export.h>
+#include <linux/netpoll.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/rcupdate.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+#include <linux/netlink.h>
+#include <linux/net_dropmon.h>
+#include <linux/slab.h>
+
+#include <linux/unaligned.h>
+#include <asm/bitops.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/skb.h>
+#include <trace/events/net.h>
+#include <trace/events/napi.h>
+#include <trace/events/sock.h>
+#include <trace/events/udp.h>
+#include <trace/events/tcp.h>
+#include <trace/events/fib.h>
+#include <trace/events/qdisc.h>
+#if IS_ENABLED(CONFIG_BRIDGE)
+#include <trace/events/bridge.h>
+EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_add);
+EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_external_learn_add);
+EXPORT_TRACEPOINT_SYMBOL_GPL(fdb_delete);
+EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_update);
+EXPORT_TRACEPOINT_SYMBOL_GPL(br_mdb_full);
+#endif
+
+#if IS_ENABLED(CONFIG_PAGE_POOL)
+#include <trace/events/page_pool.h>
+#endif
+
+#include <trace/events/neigh.h>
+EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_update);
+EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_update_done);
+EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_timer_handler);
+EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_event_send_done);
+EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_event_send_dead);
+EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_cleanup_and_release);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(tcp_send_reset);
+EXPORT_TRACEPOINT_SYMBOL_GPL(tcp_bad_csum);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(udp_fail_queue_rcv_skb);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(sk_data_ready);
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
new file mode 100644
index 000000000000..a6e6a964a287
--- /dev/null
+++ b/net/core/net_namespace.c
@@ -0,0 +1,1549 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/workqueue.h>
+#include <linux/rtnetlink.h>
+#include <linux/cache.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/delay.h>
+#include <linux/sched.h>
+#include <linux/idr.h>
+#include <linux/rculist.h>
+#include <linux/nsproxy.h>
+#include <linux/fs.h>
+#include <linux/proc_ns.h>
+#include <linux/file.h>
+#include <linux/export.h>
+#include <linux/user_namespace.h>
+#include <linux/net_namespace.h>
+#include <linux/sched/task.h>
+#include <linux/uidgid.h>
+#include <linux/proc_fs.h>
+#include <linux/nstree.h>
+
+#include <net/aligned_data.h>
+#include <net/sock.h>
+#include <net/netlink.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+
+/*
+ * Our network namespace constructor/destructor lists
+ */
+
+static LIST_HEAD(pernet_list);
+static struct list_head *first_device = &pernet_list;
+
+LIST_HEAD(net_namespace_list);
+EXPORT_SYMBOL_GPL(net_namespace_list);
+
+/* Protects net_namespace_list. Nests iside rtnl_lock() */
+DECLARE_RWSEM(net_rwsem);
+EXPORT_SYMBOL_GPL(net_rwsem);
+
+#ifdef CONFIG_KEYS
+static struct key_tag init_net_key_domain = { .usage = REFCOUNT_INIT(1) };
+#endif
+
+struct net init_net;
+EXPORT_SYMBOL(init_net);
+
+static bool init_net_initialized;
+/*
+ * pernet_ops_rwsem: protects: pernet_list, net_generic_ids,
+ * init_net_initialized and first_device pointer.
+ * This is internal net namespace object. Please, don't use it
+ * outside.
+ */
+DECLARE_RWSEM(pernet_ops_rwsem);
+
+#define MIN_PERNET_OPS_ID \
+ ((sizeof(struct net_generic) + sizeof(void *) - 1) / sizeof(void *))
+
+#define INITIAL_NET_GEN_PTRS 13 /* +1 for len +2 for rcu_head */
+
+static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS;
+
+static struct net_generic *net_alloc_generic(void)
+{
+ unsigned int gen_ptrs = READ_ONCE(max_gen_ptrs);
+ unsigned int generic_size;
+ struct net_generic *ng;
+
+ generic_size = offsetof(struct net_generic, ptr[gen_ptrs]);
+
+ ng = kzalloc(generic_size, GFP_KERNEL);
+ if (ng)
+ ng->s.len = gen_ptrs;
+
+ return ng;
+}
+
+static int net_assign_generic(struct net *net, unsigned int id, void *data)
+{
+ struct net_generic *ng, *old_ng;
+
+ BUG_ON(id < MIN_PERNET_OPS_ID);
+
+ old_ng = rcu_dereference_protected(net->gen,
+ lockdep_is_held(&pernet_ops_rwsem));
+ if (old_ng->s.len > id) {
+ old_ng->ptr[id] = data;
+ return 0;
+ }
+
+ ng = net_alloc_generic();
+ if (!ng)
+ return -ENOMEM;
+
+ /*
+ * Some synchronisation notes:
+ *
+ * The net_generic explores the net->gen array inside rcu
+ * read section. Besides once set the net->gen->ptr[x]
+ * pointer never changes (see rules in netns/generic.h).
+ *
+ * That said, we simply duplicate this array and schedule
+ * the old copy for kfree after a grace period.
+ */
+
+ memcpy(&ng->ptr[MIN_PERNET_OPS_ID], &old_ng->ptr[MIN_PERNET_OPS_ID],
+ (old_ng->s.len - MIN_PERNET_OPS_ID) * sizeof(void *));
+ ng->ptr[id] = data;
+
+ rcu_assign_pointer(net->gen, ng);
+ kfree_rcu(old_ng, s.rcu);
+ return 0;
+}
+
+static int ops_init(const struct pernet_operations *ops, struct net *net)
+{
+ struct net_generic *ng;
+ int err = -ENOMEM;
+ void *data = NULL;
+
+ if (ops->id) {
+ data = kzalloc(ops->size, GFP_KERNEL);
+ if (!data)
+ goto out;
+
+ err = net_assign_generic(net, *ops->id, data);
+ if (err)
+ goto cleanup;
+ }
+ err = 0;
+ if (ops->init)
+ err = ops->init(net);
+ if (!err)
+ return 0;
+
+ if (ops->id) {
+ ng = rcu_dereference_protected(net->gen,
+ lockdep_is_held(&pernet_ops_rwsem));
+ ng->ptr[*ops->id] = NULL;
+ }
+
+cleanup:
+ kfree(data);
+
+out:
+ return err;
+}
+
+static void ops_pre_exit_list(const struct pernet_operations *ops,
+ struct list_head *net_exit_list)
+{
+ struct net *net;
+
+ if (ops->pre_exit) {
+ list_for_each_entry(net, net_exit_list, exit_list)
+ ops->pre_exit(net);
+ }
+}
+
+static void ops_exit_rtnl_list(const struct list_head *ops_list,
+ const struct pernet_operations *ops,
+ struct list_head *net_exit_list)
+{
+ const struct pernet_operations *saved_ops = ops;
+ LIST_HEAD(dev_kill_list);
+ struct net *net;
+
+ rtnl_lock();
+
+ list_for_each_entry(net, net_exit_list, exit_list) {
+ __rtnl_net_lock(net);
+
+ ops = saved_ops;
+ list_for_each_entry_continue_reverse(ops, ops_list, list) {
+ if (ops->exit_rtnl)
+ ops->exit_rtnl(net, &dev_kill_list);
+ }
+
+ __rtnl_net_unlock(net);
+ }
+
+ unregister_netdevice_many(&dev_kill_list);
+
+ rtnl_unlock();
+}
+
+static void ops_exit_list(const struct pernet_operations *ops,
+ struct list_head *net_exit_list)
+{
+ if (ops->exit) {
+ struct net *net;
+
+ list_for_each_entry(net, net_exit_list, exit_list) {
+ ops->exit(net);
+ cond_resched();
+ }
+ }
+
+ if (ops->exit_batch)
+ ops->exit_batch(net_exit_list);
+}
+
+static void ops_free_list(const struct pernet_operations *ops,
+ struct list_head *net_exit_list)
+{
+ struct net *net;
+
+ if (ops->id) {
+ list_for_each_entry(net, net_exit_list, exit_list)
+ kfree(net_generic(net, *ops->id));
+ }
+}
+
+static void ops_undo_list(const struct list_head *ops_list,
+ const struct pernet_operations *ops,
+ struct list_head *net_exit_list,
+ bool expedite_rcu)
+{
+ const struct pernet_operations *saved_ops;
+ bool hold_rtnl = false;
+
+ if (!ops)
+ ops = list_entry(ops_list, typeof(*ops), list);
+
+ saved_ops = ops;
+
+ list_for_each_entry_continue_reverse(ops, ops_list, list) {
+ hold_rtnl |= !!ops->exit_rtnl;
+ ops_pre_exit_list(ops, net_exit_list);
+ }
+
+ /* Another CPU might be rcu-iterating the list, wait for it.
+ * This needs to be before calling the exit() notifiers, so the
+ * rcu_barrier() after ops_undo_list() isn't sufficient alone.
+ * Also the pre_exit() and exit() methods need this barrier.
+ */
+ if (expedite_rcu)
+ synchronize_rcu_expedited();
+ else
+ synchronize_rcu();
+
+ if (hold_rtnl)
+ ops_exit_rtnl_list(ops_list, saved_ops, net_exit_list);
+
+ ops = saved_ops;
+ list_for_each_entry_continue_reverse(ops, ops_list, list)
+ ops_exit_list(ops, net_exit_list);
+
+ ops = saved_ops;
+ list_for_each_entry_continue_reverse(ops, ops_list, list)
+ ops_free_list(ops, net_exit_list);
+}
+
+static void ops_undo_single(struct pernet_operations *ops,
+ struct list_head *net_exit_list)
+{
+ LIST_HEAD(ops_list);
+
+ list_add(&ops->list, &ops_list);
+ ops_undo_list(&ops_list, NULL, net_exit_list, false);
+ list_del(&ops->list);
+}
+
+/* should be called with nsid_lock held */
+static int alloc_netid(struct net *net, struct net *peer, int reqid)
+{
+ int min = 0, max = 0;
+
+ if (reqid >= 0) {
+ min = reqid;
+ max = reqid + 1;
+ }
+
+ return idr_alloc(&net->netns_ids, peer, min, max, GFP_ATOMIC);
+}
+
+/* This function is used by idr_for_each(). If net is equal to peer, the
+ * function returns the id so that idr_for_each() stops. Because we cannot
+ * returns the id 0 (idr_for_each() will not stop), we return the magic value
+ * NET_ID_ZERO (-1) for it.
+ */
+#define NET_ID_ZERO -1
+static int net_eq_idr(int id, void *net, void *peer)
+{
+ if (net_eq(net, peer))
+ return id ? : NET_ID_ZERO;
+ return 0;
+}
+
+/* Must be called from RCU-critical section or with nsid_lock held */
+static int __peernet2id(const struct net *net, struct net *peer)
+{
+ int id = idr_for_each(&net->netns_ids, net_eq_idr, peer);
+
+ /* Magic value for id 0. */
+ if (id == NET_ID_ZERO)
+ return 0;
+ if (id > 0)
+ return id;
+
+ return NETNSA_NSID_NOT_ASSIGNED;
+}
+
+static void rtnl_net_notifyid(struct net *net, int cmd, int id, u32 portid,
+ struct nlmsghdr *nlh, gfp_t gfp);
+/* This function returns the id of a peer netns. If no id is assigned, one will
+ * be allocated and returned.
+ */
+int peernet2id_alloc(struct net *net, struct net *peer, gfp_t gfp)
+{
+ int id;
+
+ if (!check_net(net))
+ return NETNSA_NSID_NOT_ASSIGNED;
+
+ spin_lock(&net->nsid_lock);
+ id = __peernet2id(net, peer);
+ if (id >= 0) {
+ spin_unlock(&net->nsid_lock);
+ return id;
+ }
+
+ /* When peer is obtained from RCU lists, we may race with
+ * its cleanup. Check whether it's alive, and this guarantees
+ * we never hash a peer back to net->netns_ids, after it has
+ * just been idr_remove()'d from there in cleanup_net().
+ */
+ if (!maybe_get_net(peer)) {
+ spin_unlock(&net->nsid_lock);
+ return NETNSA_NSID_NOT_ASSIGNED;
+ }
+
+ id = alloc_netid(net, peer, -1);
+ spin_unlock(&net->nsid_lock);
+
+ put_net(peer);
+ if (id < 0)
+ return NETNSA_NSID_NOT_ASSIGNED;
+
+ rtnl_net_notifyid(net, RTM_NEWNSID, id, 0, NULL, gfp);
+
+ return id;
+}
+EXPORT_SYMBOL_GPL(peernet2id_alloc);
+
+/* This function returns, if assigned, the id of a peer netns. */
+int peernet2id(const struct net *net, struct net *peer)
+{
+ int id;
+
+ rcu_read_lock();
+ id = __peernet2id(net, peer);
+ rcu_read_unlock();
+
+ return id;
+}
+EXPORT_SYMBOL(peernet2id);
+
+/* This function returns true is the peer netns has an id assigned into the
+ * current netns.
+ */
+bool peernet_has_id(const struct net *net, struct net *peer)
+{
+ return peernet2id(net, peer) >= 0;
+}
+
+struct net *get_net_ns_by_id(const struct net *net, int id)
+{
+ struct net *peer;
+
+ if (id < 0)
+ return NULL;
+
+ rcu_read_lock();
+ peer = idr_find(&net->netns_ids, id);
+ if (peer)
+ peer = maybe_get_net(peer);
+ rcu_read_unlock();
+
+ return peer;
+}
+EXPORT_SYMBOL_GPL(get_net_ns_by_id);
+
+static __net_init void preinit_net_sysctl(struct net *net)
+{
+ net->core.sysctl_somaxconn = SOMAXCONN;
+ /* Limits per socket sk_omem_alloc usage.
+ * TCP zerocopy regular usage needs 128 KB.
+ */
+ net->core.sysctl_optmem_max = 128 * 1024;
+ net->core.sysctl_txrehash = SOCK_TXREHASH_ENABLED;
+ net->core.sysctl_tstamp_allow_data = 1;
+ net->core.sysctl_txq_reselection = msecs_to_jiffies(1000);
+}
+
+/* init code that must occur even if setup_net() is not called. */
+static __net_init int preinit_net(struct net *net, struct user_namespace *user_ns)
+{
+ int ret;
+
+ ret = ns_common_init(net);
+ if (ret)
+ return ret;
+
+ refcount_set(&net->passive, 1);
+ ref_tracker_dir_init(&net->refcnt_tracker, 128, "net_refcnt");
+ ref_tracker_dir_init(&net->notrefcnt_tracker, 128, "net_notrefcnt");
+
+ get_random_bytes(&net->hash_mix, sizeof(u32));
+ net->dev_base_seq = 1;
+ net->user_ns = user_ns;
+
+ idr_init(&net->netns_ids);
+ spin_lock_init(&net->nsid_lock);
+ mutex_init(&net->ipv4.ra_mutex);
+
+#ifdef CONFIG_DEBUG_NET_SMALL_RTNL
+ mutex_init(&net->rtnl_mutex);
+ lock_set_cmp_fn(&net->rtnl_mutex, rtnl_net_lock_cmp_fn, NULL);
+#endif
+
+ INIT_LIST_HEAD(&net->ptype_all);
+ INIT_LIST_HEAD(&net->ptype_specific);
+ preinit_net_sysctl(net);
+ return 0;
+}
+
+/*
+ * setup_net runs the initializers for the network namespace object.
+ */
+static __net_init int setup_net(struct net *net)
+{
+ /* Must be called with pernet_ops_rwsem held */
+ const struct pernet_operations *ops;
+ LIST_HEAD(net_exit_list);
+ int error = 0;
+
+ net->net_cookie = ns_tree_gen_id(net);
+
+ list_for_each_entry(ops, &pernet_list, list) {
+ error = ops_init(ops, net);
+ if (error < 0)
+ goto out_undo;
+ }
+ down_write(&net_rwsem);
+ list_add_tail_rcu(&net->list, &net_namespace_list);
+ up_write(&net_rwsem);
+ ns_tree_add_raw(net);
+out:
+ return error;
+
+out_undo:
+ /* Walk through the list backwards calling the exit functions
+ * for the pernet modules whose init functions did not fail.
+ */
+ list_add(&net->exit_list, &net_exit_list);
+ ops_undo_list(&pernet_list, ops, &net_exit_list, false);
+ rcu_barrier();
+ goto out;
+}
+
+#ifdef CONFIG_NET_NS
+static struct ucounts *inc_net_namespaces(struct user_namespace *ns)
+{
+ return inc_ucount(ns, current_euid(), UCOUNT_NET_NAMESPACES);
+}
+
+static void dec_net_namespaces(struct ucounts *ucounts)
+{
+ dec_ucount(ucounts, UCOUNT_NET_NAMESPACES);
+}
+
+static struct kmem_cache *net_cachep __ro_after_init;
+static struct workqueue_struct *netns_wq;
+
+static struct net *net_alloc(void)
+{
+ struct net *net = NULL;
+ struct net_generic *ng;
+
+ ng = net_alloc_generic();
+ if (!ng)
+ goto out;
+
+ net = kmem_cache_zalloc(net_cachep, GFP_KERNEL);
+ if (!net)
+ goto out_free;
+
+#ifdef CONFIG_KEYS
+ net->key_domain = kzalloc(sizeof(struct key_tag), GFP_KERNEL);
+ if (!net->key_domain)
+ goto out_free_2;
+ refcount_set(&net->key_domain->usage, 1);
+#endif
+
+ rcu_assign_pointer(net->gen, ng);
+out:
+ return net;
+
+#ifdef CONFIG_KEYS
+out_free_2:
+ kmem_cache_free(net_cachep, net);
+ net = NULL;
+#endif
+out_free:
+ kfree(ng);
+ goto out;
+}
+
+static LLIST_HEAD(defer_free_list);
+
+static void net_complete_free(void)
+{
+ struct llist_node *kill_list;
+ struct net *net, *next;
+
+ /* Get the list of namespaces to free from last round. */
+ kill_list = llist_del_all(&defer_free_list);
+
+ llist_for_each_entry_safe(net, next, kill_list, defer_free_list)
+ kmem_cache_free(net_cachep, net);
+
+}
+
+void net_passive_dec(struct net *net)
+{
+ if (refcount_dec_and_test(&net->passive)) {
+ kfree(rcu_access_pointer(net->gen));
+
+ /* There should not be any trackers left there. */
+ ref_tracker_dir_exit(&net->notrefcnt_tracker);
+
+ /* Wait for an extra rcu_barrier() before final free. */
+ llist_add(&net->defer_free_list, &defer_free_list);
+ }
+}
+
+void net_drop_ns(void *p)
+{
+ struct net *net = (struct net *)p;
+
+ if (net)
+ net_passive_dec(net);
+}
+
+struct net *copy_net_ns(u64 flags,
+ struct user_namespace *user_ns, struct net *old_net)
+{
+ struct ucounts *ucounts;
+ struct net *net;
+ int rv;
+
+ if (!(flags & CLONE_NEWNET))
+ return get_net(old_net);
+
+ ucounts = inc_net_namespaces(user_ns);
+ if (!ucounts)
+ return ERR_PTR(-ENOSPC);
+
+ net = net_alloc();
+ if (!net) {
+ rv = -ENOMEM;
+ goto dec_ucounts;
+ }
+
+ rv = preinit_net(net, user_ns);
+ if (rv < 0)
+ goto dec_ucounts;
+ net->ucounts = ucounts;
+ get_user_ns(user_ns);
+
+ rv = down_read_killable(&pernet_ops_rwsem);
+ if (rv < 0)
+ goto put_userns;
+
+ rv = setup_net(net);
+
+ up_read(&pernet_ops_rwsem);
+
+ if (rv < 0) {
+put_userns:
+ ns_common_free(net);
+#ifdef CONFIG_KEYS
+ key_remove_domain(net->key_domain);
+#endif
+ put_user_ns(user_ns);
+ net_passive_dec(net);
+dec_ucounts:
+ dec_net_namespaces(ucounts);
+ return ERR_PTR(rv);
+ }
+ return net;
+}
+
+/**
+ * net_ns_get_ownership - get sysfs ownership data for @net
+ * @net: network namespace in question (can be NULL)
+ * @uid: kernel user ID for sysfs objects
+ * @gid: kernel group ID for sysfs objects
+ *
+ * Returns the uid/gid pair of root in the user namespace associated with the
+ * given network namespace.
+ */
+void net_ns_get_ownership(const struct net *net, kuid_t *uid, kgid_t *gid)
+{
+ if (net) {
+ kuid_t ns_root_uid = make_kuid(net->user_ns, 0);
+ kgid_t ns_root_gid = make_kgid(net->user_ns, 0);
+
+ if (uid_valid(ns_root_uid))
+ *uid = ns_root_uid;
+
+ if (gid_valid(ns_root_gid))
+ *gid = ns_root_gid;
+ } else {
+ *uid = GLOBAL_ROOT_UID;
+ *gid = GLOBAL_ROOT_GID;
+ }
+}
+EXPORT_SYMBOL_GPL(net_ns_get_ownership);
+
+static void unhash_nsid(struct net *net, struct net *last)
+{
+ struct net *tmp;
+ /* This function is only called from cleanup_net() work,
+ * and this work is the only process, that may delete
+ * a net from net_namespace_list. So, when the below
+ * is executing, the list may only grow. Thus, we do not
+ * use for_each_net_rcu() or net_rwsem.
+ */
+ for_each_net(tmp) {
+ int id;
+
+ spin_lock(&tmp->nsid_lock);
+ id = __peernet2id(tmp, net);
+ if (id >= 0)
+ idr_remove(&tmp->netns_ids, id);
+ spin_unlock(&tmp->nsid_lock);
+ if (id >= 0)
+ rtnl_net_notifyid(tmp, RTM_DELNSID, id, 0, NULL,
+ GFP_KERNEL);
+ if (tmp == last)
+ break;
+ }
+ spin_lock(&net->nsid_lock);
+ idr_destroy(&net->netns_ids);
+ spin_unlock(&net->nsid_lock);
+}
+
+static LLIST_HEAD(cleanup_list);
+
+struct task_struct *cleanup_net_task;
+
+static void cleanup_net(struct work_struct *work)
+{
+ struct llist_node *net_kill_list;
+ struct net *net, *tmp, *last;
+ LIST_HEAD(net_exit_list);
+
+ WRITE_ONCE(cleanup_net_task, current);
+
+ /* Atomically snapshot the list of namespaces to cleanup */
+ net_kill_list = llist_del_all(&cleanup_list);
+
+ down_read(&pernet_ops_rwsem);
+
+ /* Don't let anyone else find us. */
+ down_write(&net_rwsem);
+ llist_for_each_entry(net, net_kill_list, cleanup_list) {
+ ns_tree_remove(net);
+ list_del_rcu(&net->list);
+ }
+ /* Cache last net. After we unlock rtnl, no one new net
+ * added to net_namespace_list can assign nsid pointer
+ * to a net from net_kill_list (see peernet2id_alloc()).
+ * So, we skip them in unhash_nsid().
+ *
+ * Note, that unhash_nsid() does not delete nsid links
+ * between net_kill_list's nets, as they've already
+ * deleted from net_namespace_list. But, this would be
+ * useless anyway, as netns_ids are destroyed there.
+ */
+ last = list_last_entry(&net_namespace_list, struct net, list);
+ up_write(&net_rwsem);
+
+ llist_for_each_entry(net, net_kill_list, cleanup_list) {
+ unhash_nsid(net, last);
+ list_add_tail(&net->exit_list, &net_exit_list);
+ }
+
+ ops_undo_list(&pernet_list, NULL, &net_exit_list, true);
+
+ up_read(&pernet_ops_rwsem);
+
+ /* Ensure there are no outstanding rcu callbacks using this
+ * network namespace.
+ */
+ rcu_barrier();
+
+ net_complete_free();
+
+ /* Finally it is safe to free my network namespace structure */
+ list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) {
+ list_del_init(&net->exit_list);
+ ns_common_free(net);
+ dec_net_namespaces(net->ucounts);
+#ifdef CONFIG_KEYS
+ key_remove_domain(net->key_domain);
+#endif
+ put_user_ns(net->user_ns);
+ net_passive_dec(net);
+ }
+ WRITE_ONCE(cleanup_net_task, NULL);
+}
+
+/**
+ * net_ns_barrier - wait until concurrent net_cleanup_work is done
+ *
+ * cleanup_net runs from work queue and will first remove namespaces
+ * from the global list, then run net exit functions.
+ *
+ * Call this in module exit path to make sure that all netns
+ * ->exit ops have been invoked before the function is removed.
+ */
+void net_ns_barrier(void)
+{
+ down_write(&pernet_ops_rwsem);
+ up_write(&pernet_ops_rwsem);
+}
+EXPORT_SYMBOL(net_ns_barrier);
+
+static DECLARE_WORK(net_cleanup_work, cleanup_net);
+
+void __put_net(struct net *net)
+{
+ ref_tracker_dir_exit(&net->refcnt_tracker);
+ /* Cleanup the network namespace in process context */
+ if (llist_add(&net->cleanup_list, &cleanup_list))
+ queue_work(netns_wq, &net_cleanup_work);
+}
+EXPORT_SYMBOL_GPL(__put_net);
+
+/**
+ * get_net_ns - increment the refcount of the network namespace
+ * @ns: common namespace (net)
+ *
+ * Returns the net's common namespace or ERR_PTR() if ref is zero.
+ */
+struct ns_common *get_net_ns(struct ns_common *ns)
+{
+ struct net *net;
+
+ net = maybe_get_net(container_of(ns, struct net, ns));
+ if (net)
+ return &net->ns;
+ return ERR_PTR(-EINVAL);
+}
+EXPORT_SYMBOL_GPL(get_net_ns);
+
+struct net *get_net_ns_by_fd(int fd)
+{
+ CLASS(fd, f)(fd);
+
+ if (fd_empty(f))
+ return ERR_PTR(-EBADF);
+
+ if (proc_ns_file(fd_file(f))) {
+ struct ns_common *ns = get_proc_ns(file_inode(fd_file(f)));
+ if (ns->ops == &netns_operations)
+ return get_net(container_of(ns, struct net, ns));
+ }
+
+ return ERR_PTR(-EINVAL);
+}
+EXPORT_SYMBOL_GPL(get_net_ns_by_fd);
+#endif
+
+struct net *get_net_ns_by_pid(pid_t pid)
+{
+ struct task_struct *tsk;
+ struct net *net;
+
+ /* Lookup the network namespace */
+ net = ERR_PTR(-ESRCH);
+ rcu_read_lock();
+ tsk = find_task_by_vpid(pid);
+ if (tsk) {
+ struct nsproxy *nsproxy;
+ task_lock(tsk);
+ nsproxy = tsk->nsproxy;
+ if (nsproxy)
+ net = get_net(nsproxy->net_ns);
+ task_unlock(tsk);
+ }
+ rcu_read_unlock();
+ return net;
+}
+EXPORT_SYMBOL_GPL(get_net_ns_by_pid);
+
+#ifdef CONFIG_NET_NS_REFCNT_TRACKER
+static void net_ns_net_debugfs(struct net *net)
+{
+ ref_tracker_dir_symlink(&net->refcnt_tracker, "netns-%llx-%u-refcnt",
+ net->net_cookie, net->ns.inum);
+ ref_tracker_dir_symlink(&net->notrefcnt_tracker, "netns-%llx-%u-notrefcnt",
+ net->net_cookie, net->ns.inum);
+}
+
+static int __init init_net_debugfs(void)
+{
+ ref_tracker_dir_debugfs(&init_net.refcnt_tracker);
+ ref_tracker_dir_debugfs(&init_net.notrefcnt_tracker);
+ net_ns_net_debugfs(&init_net);
+ return 0;
+}
+late_initcall(init_net_debugfs);
+#else
+static void net_ns_net_debugfs(struct net *net)
+{
+}
+#endif
+
+static __net_init int net_ns_net_init(struct net *net)
+{
+ net_ns_net_debugfs(net);
+ return 0;
+}
+
+static struct pernet_operations __net_initdata net_ns_ops = {
+ .init = net_ns_net_init,
+};
+
+static const struct nla_policy rtnl_net_policy[NETNSA_MAX + 1] = {
+ [NETNSA_NONE] = { .type = NLA_UNSPEC },
+ [NETNSA_NSID] = { .type = NLA_S32 },
+ [NETNSA_PID] = { .type = NLA_U32 },
+ [NETNSA_FD] = { .type = NLA_U32 },
+ [NETNSA_TARGET_NSID] = { .type = NLA_S32 },
+};
+
+static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(skb->sk);
+ struct nlattr *tb[NETNSA_MAX + 1];
+ struct nlattr *nla;
+ struct net *peer;
+ int nsid, err;
+
+ err = nlmsg_parse_deprecated(nlh, sizeof(struct rtgenmsg), tb,
+ NETNSA_MAX, rtnl_net_policy, extack);
+ if (err < 0)
+ return err;
+ if (!tb[NETNSA_NSID]) {
+ NL_SET_ERR_MSG(extack, "nsid is missing");
+ return -EINVAL;
+ }
+ nsid = nla_get_s32(tb[NETNSA_NSID]);
+
+ if (tb[NETNSA_PID]) {
+ peer = get_net_ns_by_pid(nla_get_u32(tb[NETNSA_PID]));
+ nla = tb[NETNSA_PID];
+ } else if (tb[NETNSA_FD]) {
+ peer = get_net_ns_by_fd(nla_get_u32(tb[NETNSA_FD]));
+ nla = tb[NETNSA_FD];
+ } else {
+ NL_SET_ERR_MSG(extack, "Peer netns reference is missing");
+ return -EINVAL;
+ }
+ if (IS_ERR(peer)) {
+ NL_SET_BAD_ATTR(extack, nla);
+ NL_SET_ERR_MSG(extack, "Peer netns reference is invalid");
+ return PTR_ERR(peer);
+ }
+
+ spin_lock(&net->nsid_lock);
+ if (__peernet2id(net, peer) >= 0) {
+ spin_unlock(&net->nsid_lock);
+ err = -EEXIST;
+ NL_SET_BAD_ATTR(extack, nla);
+ NL_SET_ERR_MSG(extack,
+ "Peer netns already has a nsid assigned");
+ goto out;
+ }
+
+ err = alloc_netid(net, peer, nsid);
+ spin_unlock(&net->nsid_lock);
+ if (err >= 0) {
+ rtnl_net_notifyid(net, RTM_NEWNSID, err, NETLINK_CB(skb).portid,
+ nlh, GFP_KERNEL);
+ err = 0;
+ } else if (err == -ENOSPC && nsid >= 0) {
+ err = -EEXIST;
+ NL_SET_BAD_ATTR(extack, tb[NETNSA_NSID]);
+ NL_SET_ERR_MSG(extack, "The specified nsid is already used");
+ }
+out:
+ put_net(peer);
+ return err;
+}
+
+static int rtnl_net_get_size(void)
+{
+ return NLMSG_ALIGN(sizeof(struct rtgenmsg))
+ + nla_total_size(sizeof(s32)) /* NETNSA_NSID */
+ + nla_total_size(sizeof(s32)) /* NETNSA_CURRENT_NSID */
+ ;
+}
+
+struct net_fill_args {
+ u32 portid;
+ u32 seq;
+ int flags;
+ int cmd;
+ int nsid;
+ bool add_ref;
+ int ref_nsid;
+};
+
+static int rtnl_net_fill(struct sk_buff *skb, struct net_fill_args *args)
+{
+ struct nlmsghdr *nlh;
+ struct rtgenmsg *rth;
+
+ nlh = nlmsg_put(skb, args->portid, args->seq, args->cmd, sizeof(*rth),
+ args->flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ rth = nlmsg_data(nlh);
+ rth->rtgen_family = AF_UNSPEC;
+
+ if (nla_put_s32(skb, NETNSA_NSID, args->nsid))
+ goto nla_put_failure;
+
+ if (args->add_ref &&
+ nla_put_s32(skb, NETNSA_CURRENT_NSID, args->ref_nsid))
+ goto nla_put_failure;
+
+ nlmsg_end(skb, nlh);
+ return 0;
+
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+static int rtnl_net_valid_getid_req(struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ struct nlattr **tb,
+ struct netlink_ext_ack *extack)
+{
+ int i, err;
+
+ if (!netlink_strict_get_check(skb))
+ return nlmsg_parse_deprecated(nlh, sizeof(struct rtgenmsg),
+ tb, NETNSA_MAX, rtnl_net_policy,
+ extack);
+
+ err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct rtgenmsg), tb,
+ NETNSA_MAX, rtnl_net_policy,
+ extack);
+ if (err)
+ return err;
+
+ for (i = 0; i <= NETNSA_MAX; i++) {
+ if (!tb[i])
+ continue;
+
+ switch (i) {
+ case NETNSA_PID:
+ case NETNSA_FD:
+ case NETNSA_NSID:
+ case NETNSA_TARGET_NSID:
+ break;
+ default:
+ NL_SET_ERR_MSG(extack, "Unsupported attribute in peer netns getid request");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(skb->sk);
+ struct nlattr *tb[NETNSA_MAX + 1];
+ struct net_fill_args fillargs = {
+ .portid = NETLINK_CB(skb).portid,
+ .seq = nlh->nlmsg_seq,
+ .cmd = RTM_NEWNSID,
+ };
+ struct net *peer, *target = net;
+ struct nlattr *nla;
+ struct sk_buff *msg;
+ int err;
+
+ err = rtnl_net_valid_getid_req(skb, nlh, tb, extack);
+ if (err < 0)
+ return err;
+ if (tb[NETNSA_PID]) {
+ peer = get_net_ns_by_pid(nla_get_u32(tb[NETNSA_PID]));
+ nla = tb[NETNSA_PID];
+ } else if (tb[NETNSA_FD]) {
+ peer = get_net_ns_by_fd(nla_get_u32(tb[NETNSA_FD]));
+ nla = tb[NETNSA_FD];
+ } else if (tb[NETNSA_NSID]) {
+ peer = get_net_ns_by_id(net, nla_get_s32(tb[NETNSA_NSID]));
+ if (!peer)
+ peer = ERR_PTR(-ENOENT);
+ nla = tb[NETNSA_NSID];
+ } else {
+ NL_SET_ERR_MSG(extack, "Peer netns reference is missing");
+ return -EINVAL;
+ }
+
+ if (IS_ERR(peer)) {
+ NL_SET_BAD_ATTR(extack, nla);
+ NL_SET_ERR_MSG(extack, "Peer netns reference is invalid");
+ return PTR_ERR(peer);
+ }
+
+ if (tb[NETNSA_TARGET_NSID]) {
+ int id = nla_get_s32(tb[NETNSA_TARGET_NSID]);
+
+ target = rtnl_get_net_ns_capable(NETLINK_CB(skb).sk, id);
+ if (IS_ERR(target)) {
+ NL_SET_BAD_ATTR(extack, tb[NETNSA_TARGET_NSID]);
+ NL_SET_ERR_MSG(extack,
+ "Target netns reference is invalid");
+ err = PTR_ERR(target);
+ goto out;
+ }
+ fillargs.add_ref = true;
+ fillargs.ref_nsid = peernet2id(net, peer);
+ }
+
+ msg = nlmsg_new(rtnl_net_get_size(), GFP_KERNEL);
+ if (!msg) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ fillargs.nsid = peernet2id(target, peer);
+ err = rtnl_net_fill(msg, &fillargs);
+ if (err < 0)
+ goto err_out;
+
+ err = rtnl_unicast(msg, net, NETLINK_CB(skb).portid);
+ goto out;
+
+err_out:
+ nlmsg_free(msg);
+out:
+ if (fillargs.add_ref)
+ put_net(target);
+ put_net(peer);
+ return err;
+}
+
+struct rtnl_net_dump_cb {
+ struct net *tgt_net;
+ struct net *ref_net;
+ struct sk_buff *skb;
+ struct net_fill_args fillargs;
+ int idx;
+ int s_idx;
+};
+
+/* Runs in RCU-critical section. */
+static int rtnl_net_dumpid_one(int id, void *peer, void *data)
+{
+ struct rtnl_net_dump_cb *net_cb = (struct rtnl_net_dump_cb *)data;
+ int ret;
+
+ if (net_cb->idx < net_cb->s_idx)
+ goto cont;
+
+ net_cb->fillargs.nsid = id;
+ if (net_cb->fillargs.add_ref)
+ net_cb->fillargs.ref_nsid = __peernet2id(net_cb->ref_net, peer);
+ ret = rtnl_net_fill(net_cb->skb, &net_cb->fillargs);
+ if (ret < 0)
+ return ret;
+
+cont:
+ net_cb->idx++;
+ return 0;
+}
+
+static int rtnl_valid_dump_net_req(const struct nlmsghdr *nlh, struct sock *sk,
+ struct rtnl_net_dump_cb *net_cb,
+ struct netlink_callback *cb)
+{
+ struct netlink_ext_ack *extack = cb->extack;
+ struct nlattr *tb[NETNSA_MAX + 1];
+ int err, i;
+
+ err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct rtgenmsg), tb,
+ NETNSA_MAX, rtnl_net_policy,
+ extack);
+ if (err < 0)
+ return err;
+
+ for (i = 0; i <= NETNSA_MAX; i++) {
+ if (!tb[i])
+ continue;
+
+ if (i == NETNSA_TARGET_NSID) {
+ struct net *net;
+
+ net = rtnl_get_net_ns_capable(sk, nla_get_s32(tb[i]));
+ if (IS_ERR(net)) {
+ NL_SET_BAD_ATTR(extack, tb[i]);
+ NL_SET_ERR_MSG(extack,
+ "Invalid target network namespace id");
+ return PTR_ERR(net);
+ }
+ net_cb->fillargs.add_ref = true;
+ net_cb->ref_net = net_cb->tgt_net;
+ net_cb->tgt_net = net;
+ } else {
+ NL_SET_BAD_ATTR(extack, tb[i]);
+ NL_SET_ERR_MSG(extack,
+ "Unsupported attribute in dump request");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct rtnl_net_dump_cb net_cb = {
+ .tgt_net = sock_net(skb->sk),
+ .skb = skb,
+ .fillargs = {
+ .portid = NETLINK_CB(cb->skb).portid,
+ .seq = cb->nlh->nlmsg_seq,
+ .flags = NLM_F_MULTI,
+ .cmd = RTM_NEWNSID,
+ },
+ .idx = 0,
+ .s_idx = cb->args[0],
+ };
+ int err = 0;
+
+ if (cb->strict_check) {
+ err = rtnl_valid_dump_net_req(cb->nlh, skb->sk, &net_cb, cb);
+ if (err < 0)
+ goto end;
+ }
+
+ rcu_read_lock();
+ idr_for_each(&net_cb.tgt_net->netns_ids, rtnl_net_dumpid_one, &net_cb);
+ rcu_read_unlock();
+
+ cb->args[0] = net_cb.idx;
+end:
+ if (net_cb.fillargs.add_ref)
+ put_net(net_cb.tgt_net);
+ return err;
+}
+
+static void rtnl_net_notifyid(struct net *net, int cmd, int id, u32 portid,
+ struct nlmsghdr *nlh, gfp_t gfp)
+{
+ struct net_fill_args fillargs = {
+ .portid = portid,
+ .seq = nlh ? nlh->nlmsg_seq : 0,
+ .cmd = cmd,
+ .nsid = id,
+ };
+ struct sk_buff *msg;
+ int err = -ENOMEM;
+
+ msg = nlmsg_new(rtnl_net_get_size(), gfp);
+ if (!msg)
+ goto out;
+
+ err = rtnl_net_fill(msg, &fillargs);
+ if (err < 0)
+ goto err_out;
+
+ rtnl_notify(msg, net, portid, RTNLGRP_NSID, nlh, gfp);
+ return;
+
+err_out:
+ nlmsg_free(msg);
+out:
+ rtnl_set_sk_err(net, RTNLGRP_NSID, err);
+}
+
+#ifdef CONFIG_NET_NS
+static void __init netns_ipv4_struct_check(void)
+{
+ /* TX readonly hotpath cache lines */
+ CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
+ sysctl_tcp_early_retrans);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
+ sysctl_tcp_tso_win_divisor);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
+ sysctl_tcp_tso_rtt_log);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
+ sysctl_tcp_autocorking);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
+ sysctl_tcp_min_snd_mss);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
+ sysctl_tcp_notsent_lowat);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
+ sysctl_tcp_limit_output_bytes);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
+ sysctl_tcp_min_rtt_wlen);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
+ sysctl_tcp_wmem);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
+ sysctl_ip_fwd_use_pmtu);
+
+ /* RX readonly hotpath cache line */
+ CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
+ sysctl_tcp_moderate_rcvbuf);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
+ sysctl_tcp_rcvbuf_low_rtt);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
+ sysctl_ip_early_demux);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
+ sysctl_tcp_early_demux);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
+ sysctl_tcp_l3mdev_accept);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
+ sysctl_tcp_reordering);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
+ sysctl_tcp_rmem);
+}
+#endif
+
+static const struct rtnl_msg_handler net_ns_rtnl_msg_handlers[] __initconst = {
+ {.msgtype = RTM_NEWNSID, .doit = rtnl_net_newid,
+ .flags = RTNL_FLAG_DOIT_UNLOCKED},
+ {.msgtype = RTM_GETNSID, .doit = rtnl_net_getid,
+ .dumpit = rtnl_net_dumpid,
+ .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED},
+};
+
+void __init net_ns_init(void)
+{
+ struct net_generic *ng;
+
+#ifdef CONFIG_NET_NS
+ netns_ipv4_struct_check();
+ net_cachep = kmem_cache_create("net_namespace", sizeof(struct net),
+ SMP_CACHE_BYTES,
+ SLAB_PANIC|SLAB_ACCOUNT, NULL);
+
+ /* Create workqueue for cleanup */
+ netns_wq = create_singlethread_workqueue("netns");
+ if (!netns_wq)
+ panic("Could not create netns workq");
+#endif
+
+ ng = net_alloc_generic();
+ if (!ng)
+ panic("Could not allocate generic netns");
+
+ rcu_assign_pointer(init_net.gen, ng);
+
+#ifdef CONFIG_KEYS
+ init_net.key_domain = &init_net_key_domain;
+#endif
+ /*
+ * This currently cannot fail as the initial network namespace
+ * has a static inode number.
+ */
+ if (preinit_net(&init_net, &init_user_ns))
+ panic("Could not preinitialize the initial network namespace");
+
+ down_write(&pernet_ops_rwsem);
+ if (setup_net(&init_net))
+ panic("Could not setup the initial network namespace");
+
+ init_net_initialized = true;
+ up_write(&pernet_ops_rwsem);
+
+ if (register_pernet_subsys(&net_ns_ops))
+ panic("Could not register network namespace subsystems");
+
+ rtnl_register_many(net_ns_rtnl_msg_handlers);
+}
+
+#ifdef CONFIG_NET_NS
+static int __register_pernet_operations(struct list_head *list,
+ struct pernet_operations *ops)
+{
+ LIST_HEAD(net_exit_list);
+ struct net *net;
+ int error;
+
+ list_add_tail(&ops->list, list);
+ if (ops->init || ops->id) {
+ /* We held write locked pernet_ops_rwsem, and parallel
+ * setup_net() and cleanup_net() are not possible.
+ */
+ for_each_net(net) {
+ error = ops_init(ops, net);
+ if (error)
+ goto out_undo;
+ list_add_tail(&net->exit_list, &net_exit_list);
+ }
+ }
+ return 0;
+
+out_undo:
+ /* If I have an error cleanup all namespaces I initialized */
+ list_del(&ops->list);
+ ops_undo_single(ops, &net_exit_list);
+ return error;
+}
+
+static void __unregister_pernet_operations(struct pernet_operations *ops)
+{
+ LIST_HEAD(net_exit_list);
+ struct net *net;
+
+ /* See comment in __register_pernet_operations() */
+ for_each_net(net)
+ list_add_tail(&net->exit_list, &net_exit_list);
+
+ list_del(&ops->list);
+ ops_undo_single(ops, &net_exit_list);
+}
+
+#else
+
+static int __register_pernet_operations(struct list_head *list,
+ struct pernet_operations *ops)
+{
+ if (!init_net_initialized) {
+ list_add_tail(&ops->list, list);
+ return 0;
+ }
+
+ return ops_init(ops, &init_net);
+}
+
+static void __unregister_pernet_operations(struct pernet_operations *ops)
+{
+ if (!init_net_initialized) {
+ list_del(&ops->list);
+ } else {
+ LIST_HEAD(net_exit_list);
+
+ list_add(&init_net.exit_list, &net_exit_list);
+ ops_undo_single(ops, &net_exit_list);
+ }
+}
+
+#endif /* CONFIG_NET_NS */
+
+static DEFINE_IDA(net_generic_ids);
+
+static int register_pernet_operations(struct list_head *list,
+ struct pernet_operations *ops)
+{
+ int error;
+
+ if (WARN_ON(!!ops->id ^ !!ops->size))
+ return -EINVAL;
+
+ if (ops->id) {
+ error = ida_alloc_min(&net_generic_ids, MIN_PERNET_OPS_ID,
+ GFP_KERNEL);
+ if (error < 0)
+ return error;
+ *ops->id = error;
+ /* This does not require READ_ONCE as writers already hold
+ * pernet_ops_rwsem. But WRITE_ONCE is needed to protect
+ * net_alloc_generic.
+ */
+ WRITE_ONCE(max_gen_ptrs, max(max_gen_ptrs, *ops->id + 1));
+ }
+ error = __register_pernet_operations(list, ops);
+ if (error) {
+ rcu_barrier();
+ if (ops->id)
+ ida_free(&net_generic_ids, *ops->id);
+ }
+
+ return error;
+}
+
+static void unregister_pernet_operations(struct pernet_operations *ops)
+{
+ __unregister_pernet_operations(ops);
+ rcu_barrier();
+ if (ops->id)
+ ida_free(&net_generic_ids, *ops->id);
+}
+
+/**
+ * register_pernet_subsys - register a network namespace subsystem
+ * @ops: pernet operations structure for the subsystem
+ *
+ * Register a subsystem which has init and exit functions
+ * that are called when network namespaces are created and
+ * destroyed respectively.
+ *
+ * When registered all network namespace init functions are
+ * called for every existing network namespace. Allowing kernel
+ * modules to have a race free view of the set of network namespaces.
+ *
+ * When a new network namespace is created all of the init
+ * methods are called in the order in which they were registered.
+ *
+ * When a network namespace is destroyed all of the exit methods
+ * are called in the reverse of the order with which they were
+ * registered.
+ */
+int register_pernet_subsys(struct pernet_operations *ops)
+{
+ int error;
+ down_write(&pernet_ops_rwsem);
+ error = register_pernet_operations(first_device, ops);
+ up_write(&pernet_ops_rwsem);
+ return error;
+}
+EXPORT_SYMBOL_GPL(register_pernet_subsys);
+
+/**
+ * unregister_pernet_subsys - unregister a network namespace subsystem
+ * @ops: pernet operations structure to manipulate
+ *
+ * Remove the pernet operations structure from the list to be
+ * used when network namespaces are created or destroyed. In
+ * addition run the exit method for all existing network
+ * namespaces.
+ */
+void unregister_pernet_subsys(struct pernet_operations *ops)
+{
+ down_write(&pernet_ops_rwsem);
+ unregister_pernet_operations(ops);
+ up_write(&pernet_ops_rwsem);
+}
+EXPORT_SYMBOL_GPL(unregister_pernet_subsys);
+
+/**
+ * register_pernet_device - register a network namespace device
+ * @ops: pernet operations structure for the subsystem
+ *
+ * Register a device which has init and exit functions
+ * that are called when network namespaces are created and
+ * destroyed respectively.
+ *
+ * When registered all network namespace init functions are
+ * called for every existing network namespace. Allowing kernel
+ * modules to have a race free view of the set of network namespaces.
+ *
+ * When a new network namespace is created all of the init
+ * methods are called in the order in which they were registered.
+ *
+ * When a network namespace is destroyed all of the exit methods
+ * are called in the reverse of the order with which they were
+ * registered.
+ */
+int register_pernet_device(struct pernet_operations *ops)
+{
+ int error;
+ down_write(&pernet_ops_rwsem);
+ error = register_pernet_operations(&pernet_list, ops);
+ if (!error && (first_device == &pernet_list))
+ first_device = &ops->list;
+ up_write(&pernet_ops_rwsem);
+ return error;
+}
+EXPORT_SYMBOL_GPL(register_pernet_device);
+
+/**
+ * unregister_pernet_device - unregister a network namespace netdevice
+ * @ops: pernet operations structure to manipulate
+ *
+ * Remove the pernet operations structure from the list to be
+ * used when network namespaces are created or destroyed. In
+ * addition run the exit method for all existing network
+ * namespaces.
+ */
+void unregister_pernet_device(struct pernet_operations *ops)
+{
+ down_write(&pernet_ops_rwsem);
+ if (&ops->list == first_device)
+ first_device = first_device->next;
+ unregister_pernet_operations(ops);
+ up_write(&pernet_ops_rwsem);
+}
+EXPORT_SYMBOL_GPL(unregister_pernet_device);
+
+#ifdef CONFIG_NET_NS
+static struct ns_common *netns_get(struct task_struct *task)
+{
+ struct net *net = NULL;
+ struct nsproxy *nsproxy;
+
+ task_lock(task);
+ nsproxy = task->nsproxy;
+ if (nsproxy)
+ net = get_net(nsproxy->net_ns);
+ task_unlock(task);
+
+ return net ? &net->ns : NULL;
+}
+
+static void netns_put(struct ns_common *ns)
+{
+ put_net(to_net_ns(ns));
+}
+
+static int netns_install(struct nsset *nsset, struct ns_common *ns)
+{
+ struct nsproxy *nsproxy = nsset->nsproxy;
+ struct net *net = to_net_ns(ns);
+
+ if (!ns_capable(net->user_ns, CAP_SYS_ADMIN) ||
+ !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN))
+ return -EPERM;
+
+ put_net(nsproxy->net_ns);
+ nsproxy->net_ns = get_net(net);
+ return 0;
+}
+
+static struct user_namespace *netns_owner(struct ns_common *ns)
+{
+ return to_net_ns(ns)->user_ns;
+}
+
+const struct proc_ns_operations netns_operations = {
+ .name = "net",
+ .get = netns_get,
+ .put = netns_put,
+ .install = netns_install,
+ .owner = netns_owner,
+};
+#endif
diff --git a/net/core/net_test.c b/net/core/net_test.c
new file mode 100644
index 000000000000..9c3a590865d2
--- /dev/null
+++ b/net/core/net_test.c
@@ -0,0 +1,387 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <kunit/test.h>
+
+/* GSO */
+
+#include <linux/skbuff.h>
+
+static const char hdr[] = "abcdefgh";
+#define GSO_TEST_SIZE 1000
+
+static void __init_skb(struct sk_buff *skb)
+{
+ skb_reset_mac_header(skb);
+ memcpy(skb_mac_header(skb), hdr, sizeof(hdr));
+
+ /* skb_segment expects skb->data at start of payload */
+ skb_pull(skb, sizeof(hdr));
+ skb_reset_network_header(skb);
+ skb_reset_transport_header(skb);
+
+ /* proto is arbitrary, as long as not ETH_P_TEB or vlan */
+ skb->protocol = htons(ETH_P_ATALK);
+ skb_shinfo(skb)->gso_size = GSO_TEST_SIZE;
+}
+
+enum gso_test_nr {
+ GSO_TEST_LINEAR,
+ GSO_TEST_NO_GSO,
+ GSO_TEST_FRAGS,
+ GSO_TEST_FRAGS_PURE,
+ GSO_TEST_GSO_PARTIAL,
+ GSO_TEST_FRAG_LIST,
+ GSO_TEST_FRAG_LIST_PURE,
+ GSO_TEST_FRAG_LIST_NON_UNIFORM,
+ GSO_TEST_GSO_BY_FRAGS,
+};
+
+struct gso_test_case {
+ enum gso_test_nr id;
+ const char *name;
+
+ /* input */
+ unsigned int linear_len;
+ unsigned int nr_frags;
+ const unsigned int *frags;
+ unsigned int nr_frag_skbs;
+ const unsigned int *frag_skbs;
+
+ /* output as expected */
+ unsigned int nr_segs;
+ const unsigned int *segs;
+};
+
+static struct gso_test_case cases[] = {
+ {
+ .id = GSO_TEST_NO_GSO,
+ .name = "no_gso",
+ .linear_len = GSO_TEST_SIZE,
+ .nr_segs = 1,
+ .segs = (const unsigned int[]) { GSO_TEST_SIZE },
+ },
+ {
+ .id = GSO_TEST_LINEAR,
+ .name = "linear",
+ .linear_len = GSO_TEST_SIZE + GSO_TEST_SIZE + 1,
+ .nr_segs = 3,
+ .segs = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE, 1 },
+ },
+ {
+ .id = GSO_TEST_FRAGS,
+ .name = "frags",
+ .linear_len = GSO_TEST_SIZE,
+ .nr_frags = 2,
+ .frags = (const unsigned int[]) { GSO_TEST_SIZE, 1 },
+ .nr_segs = 3,
+ .segs = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE, 1 },
+ },
+ {
+ .id = GSO_TEST_FRAGS_PURE,
+ .name = "frags_pure",
+ .nr_frags = 3,
+ .frags = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE, 2 },
+ .nr_segs = 3,
+ .segs = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE, 2 },
+ },
+ {
+ .id = GSO_TEST_GSO_PARTIAL,
+ .name = "gso_partial",
+ .linear_len = GSO_TEST_SIZE,
+ .nr_frags = 2,
+ .frags = (const unsigned int[]) { GSO_TEST_SIZE, 3 },
+ .nr_segs = 2,
+ .segs = (const unsigned int[]) { 2 * GSO_TEST_SIZE, 3 },
+ },
+ {
+ /* commit 89319d3801d1: frag_list on mss boundaries */
+ .id = GSO_TEST_FRAG_LIST,
+ .name = "frag_list",
+ .linear_len = GSO_TEST_SIZE,
+ .nr_frag_skbs = 2,
+ .frag_skbs = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE },
+ .nr_segs = 3,
+ .segs = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE, GSO_TEST_SIZE },
+ },
+ {
+ .id = GSO_TEST_FRAG_LIST_PURE,
+ .name = "frag_list_pure",
+ .nr_frag_skbs = 2,
+ .frag_skbs = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE },
+ .nr_segs = 2,
+ .segs = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE },
+ },
+ {
+ /* commit 43170c4e0ba7: GRO of frag_list trains */
+ .id = GSO_TEST_FRAG_LIST_NON_UNIFORM,
+ .name = "frag_list_non_uniform",
+ .linear_len = GSO_TEST_SIZE,
+ .nr_frag_skbs = 4,
+ .frag_skbs = (const unsigned int[]) { GSO_TEST_SIZE, 1, GSO_TEST_SIZE, 2 },
+ .nr_segs = 4,
+ .segs = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE, GSO_TEST_SIZE, 3 },
+ },
+ {
+ /* commit 3953c46c3ac7 ("sk_buff: allow segmenting based on frag sizes") and
+ * commit 90017accff61 ("sctp: Add GSO support")
+ *
+ * "there will be a cover skb with protocol headers and
+ * children ones containing the actual segments"
+ */
+ .id = GSO_TEST_GSO_BY_FRAGS,
+ .name = "gso_by_frags",
+ .nr_frag_skbs = 4,
+ .frag_skbs = (const unsigned int[]) { 100, 200, 300, 400 },
+ .nr_segs = 4,
+ .segs = (const unsigned int[]) { 100, 200, 300, 400 },
+ },
+};
+
+static void gso_test_case_to_desc(struct gso_test_case *t, char *desc)
+{
+ sprintf(desc, "%s", t->name);
+}
+
+KUNIT_ARRAY_PARAM(gso_test, cases, gso_test_case_to_desc);
+
+static void gso_test_func(struct kunit *test)
+{
+ const int shinfo_size = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+ struct sk_buff *skb, *segs, *cur, *next, *last;
+ const struct gso_test_case *tcase;
+ netdev_features_t features;
+ struct page *page;
+ int i;
+
+ tcase = test->param_value;
+
+ page = alloc_page(GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, page);
+ skb = build_skb(page_address(page), sizeof(hdr) + tcase->linear_len + shinfo_size);
+ KUNIT_ASSERT_NOT_NULL(test, skb);
+ __skb_put(skb, sizeof(hdr) + tcase->linear_len);
+
+ __init_skb(skb);
+
+ if (tcase->nr_frags) {
+ unsigned int pg_off = 0;
+
+ page = alloc_page(GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, page);
+ page_ref_add(page, tcase->nr_frags - 1);
+
+ for (i = 0; i < tcase->nr_frags; i++) {
+ skb_fill_page_desc(skb, i, page, pg_off, tcase->frags[i]);
+ pg_off += tcase->frags[i];
+ }
+
+ KUNIT_ASSERT_LE(test, pg_off, PAGE_SIZE);
+
+ skb->data_len = pg_off;
+ skb->len += skb->data_len;
+ skb->truesize += skb->data_len;
+ }
+
+ if (tcase->frag_skbs) {
+ unsigned int total_size = 0, total_true_size = 0;
+ struct sk_buff *frag_skb, *prev = NULL;
+
+ for (i = 0; i < tcase->nr_frag_skbs; i++) {
+ unsigned int frag_size;
+
+ page = alloc_page(GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, page);
+
+ frag_size = tcase->frag_skbs[i];
+ frag_skb = build_skb(page_address(page),
+ frag_size + shinfo_size);
+ KUNIT_ASSERT_NOT_NULL(test, frag_skb);
+ __skb_put(frag_skb, frag_size);
+
+ if (prev)
+ prev->next = frag_skb;
+ else
+ skb_shinfo(skb)->frag_list = frag_skb;
+ prev = frag_skb;
+
+ total_size += frag_size;
+ total_true_size += frag_skb->truesize;
+ }
+
+ skb->len += total_size;
+ skb->data_len += total_size;
+ skb->truesize += total_true_size;
+
+ if (tcase->id == GSO_TEST_GSO_BY_FRAGS)
+ skb_shinfo(skb)->gso_size = GSO_BY_FRAGS;
+ }
+
+ features = NETIF_F_SG | NETIF_F_HW_CSUM;
+ if (tcase->id == GSO_TEST_GSO_PARTIAL)
+ features |= NETIF_F_GSO_PARTIAL;
+
+ /* TODO: this should also work with SG,
+ * rather than hit BUG_ON(i >= nfrags)
+ */
+ if (tcase->id == GSO_TEST_FRAG_LIST_NON_UNIFORM)
+ features &= ~NETIF_F_SG;
+
+ segs = skb_segment(skb, features);
+ if (IS_ERR(segs)) {
+ KUNIT_FAIL(test, "segs error %pe", segs);
+ goto free_gso_skb;
+ } else if (!segs) {
+ KUNIT_FAIL(test, "no segments");
+ goto free_gso_skb;
+ }
+
+ last = segs->prev;
+ for (cur = segs, i = 0; cur; cur = next, i++) {
+ next = cur->next;
+
+ KUNIT_ASSERT_EQ(test, cur->len, sizeof(hdr) + tcase->segs[i]);
+
+ /* segs have skb->data pointing to the mac header */
+ KUNIT_ASSERT_PTR_EQ(test, skb_mac_header(cur), cur->data);
+ KUNIT_ASSERT_PTR_EQ(test, skb_network_header(cur), cur->data + sizeof(hdr));
+
+ /* header was copied to all segs */
+ KUNIT_ASSERT_EQ(test, memcmp(skb_mac_header(cur), hdr, sizeof(hdr)), 0);
+
+ /* last seg can be found through segs->prev pointer */
+ if (!next)
+ KUNIT_ASSERT_PTR_EQ(test, cur, last);
+
+ consume_skb(cur);
+ }
+
+ KUNIT_ASSERT_EQ(test, i, tcase->nr_segs);
+
+free_gso_skb:
+ consume_skb(skb);
+}
+
+/* IP tunnel flags */
+
+#include <net/ip_tunnels.h>
+
+struct ip_tunnel_flags_test {
+ const char *name;
+
+ const u16 *src_bits;
+ const u16 *exp_bits;
+ u8 src_num;
+ u8 exp_num;
+
+ __be16 exp_val;
+ bool exp_comp;
+};
+
+#define IP_TUNNEL_FLAGS_TEST(n, src, comp, eval, exp) { \
+ .name = (n), \
+ .src_bits = (src), \
+ .src_num = ARRAY_SIZE(src), \
+ .exp_comp = (comp), \
+ .exp_val = (eval), \
+ .exp_bits = (exp), \
+ .exp_num = ARRAY_SIZE(exp), \
+}
+
+/* These are __be16-compatible and can be compared as is */
+static const u16 ip_tunnel_flags_1[] = {
+ IP_TUNNEL_KEY_BIT,
+ IP_TUNNEL_STRICT_BIT,
+ IP_TUNNEL_ERSPAN_OPT_BIT,
+};
+
+/* Due to the previous flags design limitation, setting either
+ * ``IP_TUNNEL_CSUM_BIT`` (on Big Endian) or ``IP_TUNNEL_DONT_FRAGMENT_BIT``
+ * (on Little) also sets VTI/ISATAP bit. In the bitmap implementation, they
+ * correspond to ``BIT(16)``, which is bigger than ``U16_MAX``, but still is
+ * backward-compatible.
+ */
+#ifdef __LITTLE_ENDIAN
+#define IP_TUNNEL_CONFLICT_BIT IP_TUNNEL_DONT_FRAGMENT_BIT
+#else
+#define IP_TUNNEL_CONFLICT_BIT IP_TUNNEL_CSUM_BIT
+#endif
+
+static const u16 ip_tunnel_flags_2_src[] = {
+ IP_TUNNEL_CONFLICT_BIT,
+};
+
+static const u16 ip_tunnel_flags_2_exp[] = {
+ IP_TUNNEL_CONFLICT_BIT,
+ IP_TUNNEL_SIT_ISATAP_BIT,
+};
+
+/* Bits 17 and higher are not compatible with __be16 flags */
+static const u16 ip_tunnel_flags_3_src[] = {
+ IP_TUNNEL_VXLAN_OPT_BIT,
+ 17,
+ 18,
+ 20,
+};
+
+static const u16 ip_tunnel_flags_3_exp[] = {
+ IP_TUNNEL_VXLAN_OPT_BIT,
+};
+
+static const struct ip_tunnel_flags_test ip_tunnel_flags_test[] = {
+ IP_TUNNEL_FLAGS_TEST("compat", ip_tunnel_flags_1, true,
+ cpu_to_be16(BIT(IP_TUNNEL_KEY_BIT) |
+ BIT(IP_TUNNEL_STRICT_BIT) |
+ BIT(IP_TUNNEL_ERSPAN_OPT_BIT)),
+ ip_tunnel_flags_1),
+ IP_TUNNEL_FLAGS_TEST("conflict", ip_tunnel_flags_2_src, true,
+ VTI_ISVTI, ip_tunnel_flags_2_exp),
+ IP_TUNNEL_FLAGS_TEST("new", ip_tunnel_flags_3_src, false,
+ cpu_to_be16(BIT(IP_TUNNEL_VXLAN_OPT_BIT)),
+ ip_tunnel_flags_3_exp),
+};
+
+static void
+ip_tunnel_flags_test_case_to_desc(const struct ip_tunnel_flags_test *t,
+ char *desc)
+{
+ strscpy(desc, t->name, KUNIT_PARAM_DESC_SIZE);
+}
+KUNIT_ARRAY_PARAM(ip_tunnel_flags_test, ip_tunnel_flags_test,
+ ip_tunnel_flags_test_case_to_desc);
+
+static void ip_tunnel_flags_test_run(struct kunit *test)
+{
+ const struct ip_tunnel_flags_test *t = test->param_value;
+ IP_TUNNEL_DECLARE_FLAGS(src) = { };
+ IP_TUNNEL_DECLARE_FLAGS(exp) = { };
+ IP_TUNNEL_DECLARE_FLAGS(out);
+
+ for (u32 j = 0; j < t->src_num; j++)
+ __set_bit(t->src_bits[j], src);
+ for (u32 j = 0; j < t->exp_num; j++)
+ __set_bit(t->exp_bits[j], exp);
+
+ KUNIT_ASSERT_EQ(test, t->exp_comp,
+ ip_tunnel_flags_is_be16_compat(src));
+ KUNIT_ASSERT_EQ(test, (__force u16)t->exp_val,
+ (__force u16)ip_tunnel_flags_to_be16(src));
+
+ ip_tunnel_flags_from_be16(out, t->exp_val);
+ KUNIT_ASSERT_TRUE(test, __ipt_flag_op(bitmap_equal, exp, out));
+}
+
+static struct kunit_case net_test_cases[] = {
+ KUNIT_CASE_PARAM(gso_test_func, gso_test_gen_params),
+ KUNIT_CASE_PARAM(ip_tunnel_flags_test_run,
+ ip_tunnel_flags_test_gen_params),
+ { },
+};
+
+static struct kunit_suite net_test_suite = {
+ .name = "net_core",
+ .test_cases = net_test_cases,
+};
+kunit_test_suite(net_test_suite);
+
+MODULE_DESCRIPTION("KUnit tests for networking core");
+MODULE_LICENSE("GPL");
diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c
new file mode 100644
index 000000000000..dff66d8fb325
--- /dev/null
+++ b/net/core/netclassid_cgroup.c
@@ -0,0 +1,154 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * net/core/netclassid_cgroup.c Classid Cgroupfs Handling
+ *
+ * Authors: Thomas Graf <tgraf@suug.ch>
+ */
+
+#include <linux/slab.h>
+#include <linux/cgroup.h>
+#include <linux/fdtable.h>
+#include <linux/sched/task.h>
+
+#include <net/cls_cgroup.h>
+#include <net/sock.h>
+
+static inline struct cgroup_cls_state *css_cls_state(struct cgroup_subsys_state *css)
+{
+ return css ? container_of(css, struct cgroup_cls_state, css) : NULL;
+}
+
+struct cgroup_cls_state *task_cls_state(struct task_struct *p)
+{
+ return css_cls_state(task_css_check(p, net_cls_cgrp_id,
+ rcu_read_lock_held() ||
+ rcu_read_lock_bh_held() ||
+ rcu_read_lock_trace_held()));
+}
+EXPORT_SYMBOL_GPL(task_cls_state);
+
+static struct cgroup_subsys_state *
+cgrp_css_alloc(struct cgroup_subsys_state *parent_css)
+{
+ struct cgroup_cls_state *cs;
+
+ cs = kzalloc(sizeof(*cs), GFP_KERNEL);
+ if (!cs)
+ return ERR_PTR(-ENOMEM);
+
+ return &cs->css;
+}
+
+static int cgrp_css_online(struct cgroup_subsys_state *css)
+{
+ struct cgroup_cls_state *cs = css_cls_state(css);
+ struct cgroup_cls_state *parent = css_cls_state(css->parent);
+
+ if (parent)
+ cs->classid = parent->classid;
+
+ return 0;
+}
+
+static void cgrp_css_free(struct cgroup_subsys_state *css)
+{
+ kfree(css_cls_state(css));
+}
+
+/*
+ * To avoid freezing of sockets creation for tasks with big number of threads
+ * and opened sockets lets release file_lock every 1000 iterated descriptors.
+ * New sockets will already have been created with new classid.
+ */
+
+struct update_classid_context {
+ u32 classid;
+ unsigned int batch;
+};
+
+#define UPDATE_CLASSID_BATCH 1000
+
+static int update_classid_sock(const void *v, struct file *file, unsigned int n)
+{
+ struct update_classid_context *ctx = (void *)v;
+ struct socket *sock = sock_from_file(file);
+
+ if (sock)
+ sock_cgroup_set_classid(&sock->sk->sk_cgrp_data, ctx->classid);
+ if (--ctx->batch == 0) {
+ ctx->batch = UPDATE_CLASSID_BATCH;
+ return n + 1;
+ }
+ return 0;
+}
+
+static void update_classid_task(struct task_struct *p, u32 classid)
+{
+ struct update_classid_context ctx = {
+ .classid = classid,
+ .batch = UPDATE_CLASSID_BATCH
+ };
+ unsigned int fd = 0;
+
+ /* Only update the leader task, when many threads in this task,
+ * so it can avoid the useless traversal.
+ */
+ if (p != p->group_leader)
+ return;
+
+ do {
+ task_lock(p);
+ fd = iterate_fd(p->files, fd, update_classid_sock, &ctx);
+ task_unlock(p);
+ cond_resched();
+ } while (fd);
+}
+
+static void cgrp_attach(struct cgroup_taskset *tset)
+{
+ struct cgroup_subsys_state *css;
+ struct task_struct *p;
+
+ cgroup_taskset_for_each(p, css, tset) {
+ update_classid_task(p, css_cls_state(css)->classid);
+ }
+}
+
+static u64 read_classid(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+ return css_cls_state(css)->classid;
+}
+
+static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft,
+ u64 value)
+{
+ struct cgroup_cls_state *cs = css_cls_state(css);
+ struct css_task_iter it;
+ struct task_struct *p;
+
+ cs->classid = (u32)value;
+
+ css_task_iter_start(css, 0, &it);
+ while ((p = css_task_iter_next(&it)))
+ update_classid_task(p, cs->classid);
+ css_task_iter_end(&it);
+
+ return 0;
+}
+
+static struct cftype ss_files[] = {
+ {
+ .name = "classid",
+ .read_u64 = read_classid,
+ .write_u64 = write_classid,
+ },
+ { } /* terminate */
+};
+
+struct cgroup_subsys net_cls_cgrp_subsys = {
+ .css_alloc = cgrp_css_alloc,
+ .css_online = cgrp_css_online,
+ .css_free = cgrp_css_free,
+ .attach = cgrp_attach,
+ .legacy_cftypes = ss_files,
+};
diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c
new file mode 100644
index 000000000000..ba673e81716f
--- /dev/null
+++ b/net/core/netdev-genl-gen.c
@@ -0,0 +1,238 @@
+// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause)
+/* Do not edit directly, auto-generated from: */
+/* Documentation/netlink/specs/netdev.yaml */
+/* YNL-GEN kernel source */
+/* To regenerate run: tools/net/ynl/ynl-regen.sh */
+
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+#include "netdev-genl-gen.h"
+
+#include <uapi/linux/netdev.h>
+#include <net/netdev_netlink.h>
+
+/* Integer value ranges */
+static const struct netlink_range_validation netdev_a_page_pool_id_range = {
+ .min = 1ULL,
+ .max = U32_MAX,
+};
+
+static const struct netlink_range_validation netdev_a_page_pool_ifindex_range = {
+ .min = 1ULL,
+ .max = S32_MAX,
+};
+
+static const struct netlink_range_validation netdev_a_napi_defer_hard_irqs_range = {
+ .max = S32_MAX,
+};
+
+/* Common nested types */
+const struct nla_policy netdev_page_pool_info_nl_policy[NETDEV_A_PAGE_POOL_IFINDEX + 1] = {
+ [NETDEV_A_PAGE_POOL_ID] = NLA_POLICY_FULL_RANGE(NLA_UINT, &netdev_a_page_pool_id_range),
+ [NETDEV_A_PAGE_POOL_IFINDEX] = NLA_POLICY_FULL_RANGE(NLA_U32, &netdev_a_page_pool_ifindex_range),
+};
+
+const struct nla_policy netdev_queue_id_nl_policy[NETDEV_A_QUEUE_TYPE + 1] = {
+ [NETDEV_A_QUEUE_ID] = { .type = NLA_U32, },
+ [NETDEV_A_QUEUE_TYPE] = NLA_POLICY_MAX(NLA_U32, 1),
+};
+
+/* NETDEV_CMD_DEV_GET - do */
+static const struct nla_policy netdev_dev_get_nl_policy[NETDEV_A_DEV_IFINDEX + 1] = {
+ [NETDEV_A_DEV_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1),
+};
+
+/* NETDEV_CMD_PAGE_POOL_GET - do */
+#ifdef CONFIG_PAGE_POOL
+static const struct nla_policy netdev_page_pool_get_nl_policy[NETDEV_A_PAGE_POOL_ID + 1] = {
+ [NETDEV_A_PAGE_POOL_ID] = NLA_POLICY_FULL_RANGE(NLA_UINT, &netdev_a_page_pool_id_range),
+};
+#endif /* CONFIG_PAGE_POOL */
+
+/* NETDEV_CMD_PAGE_POOL_STATS_GET - do */
+#ifdef CONFIG_PAGE_POOL_STATS
+static const struct nla_policy netdev_page_pool_stats_get_nl_policy[NETDEV_A_PAGE_POOL_STATS_INFO + 1] = {
+ [NETDEV_A_PAGE_POOL_STATS_INFO] = NLA_POLICY_NESTED(netdev_page_pool_info_nl_policy),
+};
+#endif /* CONFIG_PAGE_POOL_STATS */
+
+/* NETDEV_CMD_QUEUE_GET - do */
+static const struct nla_policy netdev_queue_get_do_nl_policy[NETDEV_A_QUEUE_TYPE + 1] = {
+ [NETDEV_A_QUEUE_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1),
+ [NETDEV_A_QUEUE_TYPE] = NLA_POLICY_MAX(NLA_U32, 1),
+ [NETDEV_A_QUEUE_ID] = { .type = NLA_U32, },
+};
+
+/* NETDEV_CMD_QUEUE_GET - dump */
+static const struct nla_policy netdev_queue_get_dump_nl_policy[NETDEV_A_QUEUE_IFINDEX + 1] = {
+ [NETDEV_A_QUEUE_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1),
+};
+
+/* NETDEV_CMD_NAPI_GET - do */
+static const struct nla_policy netdev_napi_get_do_nl_policy[NETDEV_A_NAPI_ID + 1] = {
+ [NETDEV_A_NAPI_ID] = { .type = NLA_U32, },
+};
+
+/* NETDEV_CMD_NAPI_GET - dump */
+static const struct nla_policy netdev_napi_get_dump_nl_policy[NETDEV_A_NAPI_IFINDEX + 1] = {
+ [NETDEV_A_NAPI_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1),
+};
+
+/* NETDEV_CMD_QSTATS_GET - dump */
+static const struct nla_policy netdev_qstats_get_nl_policy[NETDEV_A_QSTATS_SCOPE + 1] = {
+ [NETDEV_A_QSTATS_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1),
+ [NETDEV_A_QSTATS_SCOPE] = NLA_POLICY_MASK(NLA_UINT, 0x1),
+};
+
+/* NETDEV_CMD_BIND_RX - do */
+static const struct nla_policy netdev_bind_rx_nl_policy[NETDEV_A_DMABUF_FD + 1] = {
+ [NETDEV_A_DMABUF_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1),
+ [NETDEV_A_DMABUF_FD] = { .type = NLA_U32, },
+ [NETDEV_A_DMABUF_QUEUES] = NLA_POLICY_NESTED(netdev_queue_id_nl_policy),
+};
+
+/* NETDEV_CMD_NAPI_SET - do */
+static const struct nla_policy netdev_napi_set_nl_policy[NETDEV_A_NAPI_THREADED + 1] = {
+ [NETDEV_A_NAPI_ID] = { .type = NLA_U32, },
+ [NETDEV_A_NAPI_DEFER_HARD_IRQS] = NLA_POLICY_FULL_RANGE(NLA_U32, &netdev_a_napi_defer_hard_irqs_range),
+ [NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT] = { .type = NLA_UINT, },
+ [NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT] = { .type = NLA_UINT, },
+ [NETDEV_A_NAPI_THREADED] = NLA_POLICY_MAX(NLA_U32, 2),
+};
+
+/* NETDEV_CMD_BIND_TX - do */
+static const struct nla_policy netdev_bind_tx_nl_policy[NETDEV_A_DMABUF_FD + 1] = {
+ [NETDEV_A_DMABUF_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1),
+ [NETDEV_A_DMABUF_FD] = { .type = NLA_U32, },
+};
+
+/* Ops table for netdev */
+static const struct genl_split_ops netdev_nl_ops[] = {
+ {
+ .cmd = NETDEV_CMD_DEV_GET,
+ .doit = netdev_nl_dev_get_doit,
+ .policy = netdev_dev_get_nl_policy,
+ .maxattr = NETDEV_A_DEV_IFINDEX,
+ .flags = GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = NETDEV_CMD_DEV_GET,
+ .dumpit = netdev_nl_dev_get_dumpit,
+ .flags = GENL_CMD_CAP_DUMP,
+ },
+#ifdef CONFIG_PAGE_POOL
+ {
+ .cmd = NETDEV_CMD_PAGE_POOL_GET,
+ .doit = netdev_nl_page_pool_get_doit,
+ .policy = netdev_page_pool_get_nl_policy,
+ .maxattr = NETDEV_A_PAGE_POOL_ID,
+ .flags = GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = NETDEV_CMD_PAGE_POOL_GET,
+ .dumpit = netdev_nl_page_pool_get_dumpit,
+ .flags = GENL_CMD_CAP_DUMP,
+ },
+#endif /* CONFIG_PAGE_POOL */
+#ifdef CONFIG_PAGE_POOL_STATS
+ {
+ .cmd = NETDEV_CMD_PAGE_POOL_STATS_GET,
+ .doit = netdev_nl_page_pool_stats_get_doit,
+ .policy = netdev_page_pool_stats_get_nl_policy,
+ .maxattr = NETDEV_A_PAGE_POOL_STATS_INFO,
+ .flags = GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = NETDEV_CMD_PAGE_POOL_STATS_GET,
+ .dumpit = netdev_nl_page_pool_stats_get_dumpit,
+ .flags = GENL_CMD_CAP_DUMP,
+ },
+#endif /* CONFIG_PAGE_POOL_STATS */
+ {
+ .cmd = NETDEV_CMD_QUEUE_GET,
+ .doit = netdev_nl_queue_get_doit,
+ .policy = netdev_queue_get_do_nl_policy,
+ .maxattr = NETDEV_A_QUEUE_TYPE,
+ .flags = GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = NETDEV_CMD_QUEUE_GET,
+ .dumpit = netdev_nl_queue_get_dumpit,
+ .policy = netdev_queue_get_dump_nl_policy,
+ .maxattr = NETDEV_A_QUEUE_IFINDEX,
+ .flags = GENL_CMD_CAP_DUMP,
+ },
+ {
+ .cmd = NETDEV_CMD_NAPI_GET,
+ .doit = netdev_nl_napi_get_doit,
+ .policy = netdev_napi_get_do_nl_policy,
+ .maxattr = NETDEV_A_NAPI_ID,
+ .flags = GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = NETDEV_CMD_NAPI_GET,
+ .dumpit = netdev_nl_napi_get_dumpit,
+ .policy = netdev_napi_get_dump_nl_policy,
+ .maxattr = NETDEV_A_NAPI_IFINDEX,
+ .flags = GENL_CMD_CAP_DUMP,
+ },
+ {
+ .cmd = NETDEV_CMD_QSTATS_GET,
+ .dumpit = netdev_nl_qstats_get_dumpit,
+ .policy = netdev_qstats_get_nl_policy,
+ .maxattr = NETDEV_A_QSTATS_SCOPE,
+ .flags = GENL_CMD_CAP_DUMP,
+ },
+ {
+ .cmd = NETDEV_CMD_BIND_RX,
+ .doit = netdev_nl_bind_rx_doit,
+ .policy = netdev_bind_rx_nl_policy,
+ .maxattr = NETDEV_A_DMABUF_FD,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = NETDEV_CMD_NAPI_SET,
+ .doit = netdev_nl_napi_set_doit,
+ .policy = netdev_napi_set_nl_policy,
+ .maxattr = NETDEV_A_NAPI_THREADED,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = NETDEV_CMD_BIND_TX,
+ .doit = netdev_nl_bind_tx_doit,
+ .policy = netdev_bind_tx_nl_policy,
+ .maxattr = NETDEV_A_DMABUF_FD,
+ .flags = GENL_CMD_CAP_DO,
+ },
+};
+
+static const struct genl_multicast_group netdev_nl_mcgrps[] = {
+ [NETDEV_NLGRP_MGMT] = { "mgmt", },
+ [NETDEV_NLGRP_PAGE_POOL] = { "page-pool", },
+};
+
+static void __netdev_nl_sock_priv_init(void *priv)
+{
+ netdev_nl_sock_priv_init(priv);
+}
+
+static void __netdev_nl_sock_priv_destroy(void *priv)
+{
+ netdev_nl_sock_priv_destroy(priv);
+}
+
+struct genl_family netdev_nl_family __ro_after_init = {
+ .name = NETDEV_FAMILY_NAME,
+ .version = NETDEV_FAMILY_VERSION,
+ .netnsok = true,
+ .parallel_ops = true,
+ .module = THIS_MODULE,
+ .split_ops = netdev_nl_ops,
+ .n_split_ops = ARRAY_SIZE(netdev_nl_ops),
+ .mcgrps = netdev_nl_mcgrps,
+ .n_mcgrps = ARRAY_SIZE(netdev_nl_mcgrps),
+ .sock_priv_size = sizeof(struct netdev_nl_sock),
+ .sock_priv_init = __netdev_nl_sock_priv_init,
+ .sock_priv_destroy = __netdev_nl_sock_priv_destroy,
+};
diff --git a/net/core/netdev-genl-gen.h b/net/core/netdev-genl-gen.h
new file mode 100644
index 000000000000..cffc08517a41
--- /dev/null
+++ b/net/core/netdev-genl-gen.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */
+/* Do not edit directly, auto-generated from: */
+/* Documentation/netlink/specs/netdev.yaml */
+/* YNL-GEN kernel header */
+/* To regenerate run: tools/net/ynl/ynl-regen.sh */
+
+#ifndef _LINUX_NETDEV_GEN_H
+#define _LINUX_NETDEV_GEN_H
+
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+#include <uapi/linux/netdev.h>
+#include <net/netdev_netlink.h>
+
+/* Common nested types */
+extern const struct nla_policy netdev_page_pool_info_nl_policy[NETDEV_A_PAGE_POOL_IFINDEX + 1];
+extern const struct nla_policy netdev_queue_id_nl_policy[NETDEV_A_QUEUE_TYPE + 1];
+
+int netdev_nl_dev_get_doit(struct sk_buff *skb, struct genl_info *info);
+int netdev_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb);
+int netdev_nl_page_pool_get_doit(struct sk_buff *skb, struct genl_info *info);
+int netdev_nl_page_pool_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb);
+int netdev_nl_page_pool_stats_get_doit(struct sk_buff *skb,
+ struct genl_info *info);
+int netdev_nl_page_pool_stats_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb);
+int netdev_nl_queue_get_doit(struct sk_buff *skb, struct genl_info *info);
+int netdev_nl_queue_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb);
+int netdev_nl_napi_get_doit(struct sk_buff *skb, struct genl_info *info);
+int netdev_nl_napi_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb);
+int netdev_nl_qstats_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb);
+int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info);
+int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info);
+int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info);
+
+enum {
+ NETDEV_NLGRP_MGMT,
+ NETDEV_NLGRP_PAGE_POOL,
+};
+
+extern struct genl_family netdev_nl_family;
+
+void netdev_nl_sock_priv_init(struct netdev_nl_sock *priv);
+void netdev_nl_sock_priv_destroy(struct netdev_nl_sock *priv);
+
+#endif /* _LINUX_NETDEV_GEN_H */
diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c
new file mode 100644
index 000000000000..470fabbeacd9
--- /dev/null
+++ b/net/core/netdev-genl.c
@@ -0,0 +1,1203 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/netdevice.h>
+#include <linux/notifier.h>
+#include <linux/rtnetlink.h>
+#include <net/busy_poll.h>
+#include <net/net_namespace.h>
+#include <net/netdev_queues.h>
+#include <net/netdev_rx_queue.h>
+#include <net/sock.h>
+#include <net/xdp.h>
+#include <net/xdp_sock.h>
+#include <net/page_pool/memory_provider.h>
+
+#include "dev.h"
+#include "devmem.h"
+#include "netdev-genl-gen.h"
+
+struct netdev_nl_dump_ctx {
+ unsigned long ifindex;
+ unsigned int rxq_idx;
+ unsigned int txq_idx;
+ unsigned int napi_id;
+};
+
+static struct netdev_nl_dump_ctx *netdev_dump_ctx(struct netlink_callback *cb)
+{
+ NL_ASSERT_CTX_FITS(struct netdev_nl_dump_ctx);
+
+ return (struct netdev_nl_dump_ctx *)cb->ctx;
+}
+
+static int
+netdev_nl_dev_fill(struct net_device *netdev, struct sk_buff *rsp,
+ const struct genl_info *info)
+{
+ u64 xsk_features = 0;
+ u64 xdp_rx_meta = 0;
+ void *hdr;
+
+ netdev_assert_locked(netdev); /* note: rtnl_lock may not be held! */
+
+ hdr = genlmsg_iput(rsp, info);
+ if (!hdr)
+ return -EMSGSIZE;
+
+#define XDP_METADATA_KFUNC(_, flag, __, xmo) \
+ if (netdev->xdp_metadata_ops && netdev->xdp_metadata_ops->xmo) \
+ xdp_rx_meta |= flag;
+XDP_METADATA_KFUNC_xxx
+#undef XDP_METADATA_KFUNC
+
+ if (netdev->xsk_tx_metadata_ops) {
+ if (netdev->xsk_tx_metadata_ops->tmo_fill_timestamp)
+ xsk_features |= NETDEV_XSK_FLAGS_TX_TIMESTAMP;
+ if (netdev->xsk_tx_metadata_ops->tmo_request_checksum)
+ xsk_features |= NETDEV_XSK_FLAGS_TX_CHECKSUM;
+ if (netdev->xsk_tx_metadata_ops->tmo_request_launch_time)
+ xsk_features |= NETDEV_XSK_FLAGS_TX_LAUNCH_TIME_FIFO;
+ }
+
+ if (nla_put_u32(rsp, NETDEV_A_DEV_IFINDEX, netdev->ifindex) ||
+ nla_put_u64_64bit(rsp, NETDEV_A_DEV_XDP_FEATURES,
+ netdev->xdp_features, NETDEV_A_DEV_PAD) ||
+ nla_put_u64_64bit(rsp, NETDEV_A_DEV_XDP_RX_METADATA_FEATURES,
+ xdp_rx_meta, NETDEV_A_DEV_PAD) ||
+ nla_put_u64_64bit(rsp, NETDEV_A_DEV_XSK_FEATURES,
+ xsk_features, NETDEV_A_DEV_PAD))
+ goto err_cancel_msg;
+
+ if (netdev->xdp_features & NETDEV_XDP_ACT_XSK_ZEROCOPY) {
+ if (nla_put_u32(rsp, NETDEV_A_DEV_XDP_ZC_MAX_SEGS,
+ netdev->xdp_zc_max_segs))
+ goto err_cancel_msg;
+ }
+
+ genlmsg_end(rsp, hdr);
+
+ return 0;
+
+err_cancel_msg:
+ genlmsg_cancel(rsp, hdr);
+ return -EMSGSIZE;
+}
+
+static void
+netdev_genl_dev_notify(struct net_device *netdev, int cmd)
+{
+ struct genl_info info;
+ struct sk_buff *ntf;
+
+ if (!genl_has_listeners(&netdev_nl_family, dev_net(netdev),
+ NETDEV_NLGRP_MGMT))
+ return;
+
+ genl_info_init_ntf(&info, &netdev_nl_family, cmd);
+
+ ntf = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!ntf)
+ return;
+
+ if (netdev_nl_dev_fill(netdev, ntf, &info)) {
+ nlmsg_free(ntf);
+ return;
+ }
+
+ genlmsg_multicast_netns(&netdev_nl_family, dev_net(netdev), ntf,
+ 0, NETDEV_NLGRP_MGMT, GFP_KERNEL);
+}
+
+int netdev_nl_dev_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net_device *netdev;
+ struct sk_buff *rsp;
+ u32 ifindex;
+ int err;
+
+ if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_DEV_IFINDEX))
+ return -EINVAL;
+
+ ifindex = nla_get_u32(info->attrs[NETDEV_A_DEV_IFINDEX]);
+
+ rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!rsp)
+ return -ENOMEM;
+
+ netdev = netdev_get_by_index_lock(genl_info_net(info), ifindex);
+ if (!netdev) {
+ err = -ENODEV;
+ goto err_free_msg;
+ }
+
+ err = netdev_nl_dev_fill(netdev, rsp, info);
+ netdev_unlock(netdev);
+
+ if (err)
+ goto err_free_msg;
+
+ return genlmsg_reply(rsp, info);
+
+err_free_msg:
+ nlmsg_free(rsp);
+ return err;
+}
+
+int netdev_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb);
+ struct net *net = sock_net(skb->sk);
+ int err;
+
+ for_each_netdev_lock_scoped(net, netdev, ctx->ifindex) {
+ err = netdev_nl_dev_fill(netdev, skb, genl_info_dump(cb));
+ if (err < 0)
+ return err;
+ }
+
+ return 0;
+}
+
+static int
+netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi,
+ const struct genl_info *info)
+{
+ unsigned long irq_suspend_timeout;
+ unsigned long gro_flush_timeout;
+ u32 napi_defer_hard_irqs;
+ void *hdr;
+ pid_t pid;
+
+ if (!napi->dev->up)
+ return 0;
+
+ hdr = genlmsg_iput(rsp, info);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (nla_put_u32(rsp, NETDEV_A_NAPI_ID, napi->napi_id))
+ goto nla_put_failure;
+
+ if (nla_put_u32(rsp, NETDEV_A_NAPI_IFINDEX, napi->dev->ifindex))
+ goto nla_put_failure;
+
+ if (napi->irq >= 0 && nla_put_u32(rsp, NETDEV_A_NAPI_IRQ, napi->irq))
+ goto nla_put_failure;
+
+ if (nla_put_uint(rsp, NETDEV_A_NAPI_THREADED,
+ napi_get_threaded(napi)))
+ goto nla_put_failure;
+
+ if (napi->thread) {
+ pid = task_pid_nr(napi->thread);
+ if (nla_put_u32(rsp, NETDEV_A_NAPI_PID, pid))
+ goto nla_put_failure;
+ }
+
+ napi_defer_hard_irqs = napi_get_defer_hard_irqs(napi);
+ if (nla_put_s32(rsp, NETDEV_A_NAPI_DEFER_HARD_IRQS,
+ napi_defer_hard_irqs))
+ goto nla_put_failure;
+
+ irq_suspend_timeout = napi_get_irq_suspend_timeout(napi);
+ if (nla_put_uint(rsp, NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT,
+ irq_suspend_timeout))
+ goto nla_put_failure;
+
+ gro_flush_timeout = napi_get_gro_flush_timeout(napi);
+ if (nla_put_uint(rsp, NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT,
+ gro_flush_timeout))
+ goto nla_put_failure;
+
+ genlmsg_end(rsp, hdr);
+
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(rsp, hdr);
+ return -EMSGSIZE;
+}
+
+int netdev_nl_napi_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct napi_struct *napi;
+ struct sk_buff *rsp;
+ u32 napi_id;
+ int err;
+
+ if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_NAPI_ID))
+ return -EINVAL;
+
+ napi_id = nla_get_u32(info->attrs[NETDEV_A_NAPI_ID]);
+
+ rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!rsp)
+ return -ENOMEM;
+
+ napi = netdev_napi_by_id_lock(genl_info_net(info), napi_id);
+ if (napi) {
+ err = netdev_nl_napi_fill_one(rsp, napi, info);
+ netdev_unlock(napi->dev);
+ } else {
+ NL_SET_BAD_ATTR(info->extack, info->attrs[NETDEV_A_NAPI_ID]);
+ err = -ENOENT;
+ }
+
+ if (err) {
+ goto err_free_msg;
+ } else if (!rsp->len) {
+ err = -ENOENT;
+ goto err_free_msg;
+ }
+
+ return genlmsg_reply(rsp, info);
+
+err_free_msg:
+ nlmsg_free(rsp);
+ return err;
+}
+
+static int
+netdev_nl_napi_dump_one(struct net_device *netdev, struct sk_buff *rsp,
+ const struct genl_info *info,
+ struct netdev_nl_dump_ctx *ctx)
+{
+ struct napi_struct *napi;
+ unsigned int prev_id;
+ int err = 0;
+
+ if (!netdev->up)
+ return err;
+
+ prev_id = UINT_MAX;
+ list_for_each_entry(napi, &netdev->napi_list, dev_list) {
+ if (!napi_id_valid(napi->napi_id))
+ continue;
+
+ /* Dump continuation below depends on the list being sorted */
+ WARN_ON_ONCE(napi->napi_id >= prev_id);
+ prev_id = napi->napi_id;
+
+ if (ctx->napi_id && napi->napi_id >= ctx->napi_id)
+ continue;
+
+ err = netdev_nl_napi_fill_one(rsp, napi, info);
+ if (err)
+ return err;
+ ctx->napi_id = napi->napi_id;
+ }
+ return err;
+}
+
+int netdev_nl_napi_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb);
+ const struct genl_info *info = genl_info_dump(cb);
+ struct net *net = sock_net(skb->sk);
+ struct net_device *netdev;
+ u32 ifindex = 0;
+ int err = 0;
+
+ if (info->attrs[NETDEV_A_NAPI_IFINDEX])
+ ifindex = nla_get_u32(info->attrs[NETDEV_A_NAPI_IFINDEX]);
+
+ if (ifindex) {
+ netdev = netdev_get_by_index_lock(net, ifindex);
+ if (netdev) {
+ err = netdev_nl_napi_dump_one(netdev, skb, info, ctx);
+ netdev_unlock(netdev);
+ } else {
+ err = -ENODEV;
+ }
+ } else {
+ for_each_netdev_lock_scoped(net, netdev, ctx->ifindex) {
+ err = netdev_nl_napi_dump_one(netdev, skb, info, ctx);
+ if (err < 0)
+ break;
+ ctx->napi_id = 0;
+ }
+ }
+
+ return err;
+}
+
+static int
+netdev_nl_napi_set_config(struct napi_struct *napi, struct genl_info *info)
+{
+ u64 irq_suspend_timeout = 0;
+ u64 gro_flush_timeout = 0;
+ u8 threaded = 0;
+ u32 defer = 0;
+
+ if (info->attrs[NETDEV_A_NAPI_THREADED]) {
+ int ret;
+
+ threaded = nla_get_uint(info->attrs[NETDEV_A_NAPI_THREADED]);
+ ret = napi_set_threaded(napi, threaded);
+ if (ret)
+ return ret;
+ }
+
+ if (info->attrs[NETDEV_A_NAPI_DEFER_HARD_IRQS]) {
+ defer = nla_get_u32(info->attrs[NETDEV_A_NAPI_DEFER_HARD_IRQS]);
+ napi_set_defer_hard_irqs(napi, defer);
+ }
+
+ if (info->attrs[NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT]) {
+ irq_suspend_timeout = nla_get_uint(info->attrs[NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT]);
+ napi_set_irq_suspend_timeout(napi, irq_suspend_timeout);
+ }
+
+ if (info->attrs[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT]) {
+ gro_flush_timeout = nla_get_uint(info->attrs[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT]);
+ napi_set_gro_flush_timeout(napi, gro_flush_timeout);
+ }
+
+ return 0;
+}
+
+int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct napi_struct *napi;
+ unsigned int napi_id;
+ int err;
+
+ if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_NAPI_ID))
+ return -EINVAL;
+
+ napi_id = nla_get_u32(info->attrs[NETDEV_A_NAPI_ID]);
+
+ napi = netdev_napi_by_id_lock(genl_info_net(info), napi_id);
+ if (napi) {
+ err = netdev_nl_napi_set_config(napi, info);
+ netdev_unlock(napi->dev);
+ } else {
+ NL_SET_BAD_ATTR(info->extack, info->attrs[NETDEV_A_NAPI_ID]);
+ err = -ENOENT;
+ }
+
+ return err;
+}
+
+static int nla_put_napi_id(struct sk_buff *skb, const struct napi_struct *napi)
+{
+ if (napi && napi_id_valid(napi->napi_id))
+ return nla_put_u32(skb, NETDEV_A_QUEUE_NAPI_ID, napi->napi_id);
+ return 0;
+}
+
+static int
+netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev,
+ u32 q_idx, u32 q_type, const struct genl_info *info)
+{
+ struct pp_memory_provider_params *params;
+ struct netdev_rx_queue *rxq;
+ struct netdev_queue *txq;
+ void *hdr;
+
+ hdr = genlmsg_iput(rsp, info);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (nla_put_u32(rsp, NETDEV_A_QUEUE_ID, q_idx) ||
+ nla_put_u32(rsp, NETDEV_A_QUEUE_TYPE, q_type) ||
+ nla_put_u32(rsp, NETDEV_A_QUEUE_IFINDEX, netdev->ifindex))
+ goto nla_put_failure;
+
+ switch (q_type) {
+ case NETDEV_QUEUE_TYPE_RX:
+ rxq = __netif_get_rx_queue(netdev, q_idx);
+ if (nla_put_napi_id(rsp, rxq->napi))
+ goto nla_put_failure;
+
+ params = &rxq->mp_params;
+ if (params->mp_ops &&
+ params->mp_ops->nl_fill(params->mp_priv, rsp, rxq))
+ goto nla_put_failure;
+#ifdef CONFIG_XDP_SOCKETS
+ if (rxq->pool)
+ if (nla_put_empty_nest(rsp, NETDEV_A_QUEUE_XSK))
+ goto nla_put_failure;
+#endif
+
+ break;
+ case NETDEV_QUEUE_TYPE_TX:
+ txq = netdev_get_tx_queue(netdev, q_idx);
+ if (nla_put_napi_id(rsp, txq->napi))
+ goto nla_put_failure;
+#ifdef CONFIG_XDP_SOCKETS
+ if (txq->pool)
+ if (nla_put_empty_nest(rsp, NETDEV_A_QUEUE_XSK))
+ goto nla_put_failure;
+#endif
+ break;
+ }
+
+ genlmsg_end(rsp, hdr);
+
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(rsp, hdr);
+ return -EMSGSIZE;
+}
+
+static int netdev_nl_queue_validate(struct net_device *netdev, u32 q_id,
+ u32 q_type)
+{
+ switch (q_type) {
+ case NETDEV_QUEUE_TYPE_RX:
+ if (q_id >= netdev->real_num_rx_queues)
+ return -EINVAL;
+ return 0;
+ case NETDEV_QUEUE_TYPE_TX:
+ if (q_id >= netdev->real_num_tx_queues)
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static int
+netdev_nl_queue_fill(struct sk_buff *rsp, struct net_device *netdev, u32 q_idx,
+ u32 q_type, const struct genl_info *info)
+{
+ int err;
+
+ if (!netdev->up)
+ return -ENOENT;
+
+ err = netdev_nl_queue_validate(netdev, q_idx, q_type);
+ if (err)
+ return err;
+
+ return netdev_nl_queue_fill_one(rsp, netdev, q_idx, q_type, info);
+}
+
+int netdev_nl_queue_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ u32 q_id, q_type, ifindex;
+ struct net_device *netdev;
+ struct sk_buff *rsp;
+ int err;
+
+ if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_ID) ||
+ GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_TYPE) ||
+ GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_IFINDEX))
+ return -EINVAL;
+
+ q_id = nla_get_u32(info->attrs[NETDEV_A_QUEUE_ID]);
+ q_type = nla_get_u32(info->attrs[NETDEV_A_QUEUE_TYPE]);
+ ifindex = nla_get_u32(info->attrs[NETDEV_A_QUEUE_IFINDEX]);
+
+ rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!rsp)
+ return -ENOMEM;
+
+ netdev = netdev_get_by_index_lock_ops_compat(genl_info_net(info),
+ ifindex);
+ if (netdev) {
+ err = netdev_nl_queue_fill(rsp, netdev, q_id, q_type, info);
+ netdev_unlock_ops_compat(netdev);
+ } else {
+ err = -ENODEV;
+ }
+
+ if (err)
+ goto err_free_msg;
+
+ return genlmsg_reply(rsp, info);
+
+err_free_msg:
+ nlmsg_free(rsp);
+ return err;
+}
+
+static int
+netdev_nl_queue_dump_one(struct net_device *netdev, struct sk_buff *rsp,
+ const struct genl_info *info,
+ struct netdev_nl_dump_ctx *ctx)
+{
+ int err = 0;
+
+ if (!netdev->up)
+ return err;
+
+ for (; ctx->rxq_idx < netdev->real_num_rx_queues; ctx->rxq_idx++) {
+ err = netdev_nl_queue_fill_one(rsp, netdev, ctx->rxq_idx,
+ NETDEV_QUEUE_TYPE_RX, info);
+ if (err)
+ return err;
+ }
+ for (; ctx->txq_idx < netdev->real_num_tx_queues; ctx->txq_idx++) {
+ err = netdev_nl_queue_fill_one(rsp, netdev, ctx->txq_idx,
+ NETDEV_QUEUE_TYPE_TX, info);
+ if (err)
+ return err;
+ }
+
+ return err;
+}
+
+int netdev_nl_queue_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb);
+ const struct genl_info *info = genl_info_dump(cb);
+ struct net *net = sock_net(skb->sk);
+ struct net_device *netdev;
+ u32 ifindex = 0;
+ int err = 0;
+
+ if (info->attrs[NETDEV_A_QUEUE_IFINDEX])
+ ifindex = nla_get_u32(info->attrs[NETDEV_A_QUEUE_IFINDEX]);
+
+ if (ifindex) {
+ netdev = netdev_get_by_index_lock_ops_compat(net, ifindex);
+ if (netdev) {
+ err = netdev_nl_queue_dump_one(netdev, skb, info, ctx);
+ netdev_unlock_ops_compat(netdev);
+ } else {
+ err = -ENODEV;
+ }
+ } else {
+ for_each_netdev_lock_ops_compat_scoped(net, netdev,
+ ctx->ifindex) {
+ err = netdev_nl_queue_dump_one(netdev, skb, info, ctx);
+ if (err < 0)
+ break;
+ ctx->rxq_idx = 0;
+ ctx->txq_idx = 0;
+ }
+ }
+
+ return err;
+}
+
+#define NETDEV_STAT_NOT_SET (~0ULL)
+
+static void netdev_nl_stats_add(void *_sum, const void *_add, size_t size)
+{
+ const u64 *add = _add;
+ u64 *sum = _sum;
+
+ while (size) {
+ if (*add != NETDEV_STAT_NOT_SET && *sum != NETDEV_STAT_NOT_SET)
+ *sum += *add;
+ sum++;
+ add++;
+ size -= 8;
+ }
+}
+
+static int netdev_stat_put(struct sk_buff *rsp, unsigned int attr_id, u64 value)
+{
+ if (value == NETDEV_STAT_NOT_SET)
+ return 0;
+ return nla_put_uint(rsp, attr_id, value);
+}
+
+static int
+netdev_nl_stats_write_rx(struct sk_buff *rsp, struct netdev_queue_stats_rx *rx)
+{
+ if (netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_PACKETS, rx->packets) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_BYTES, rx->bytes) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_ALLOC_FAIL, rx->alloc_fail) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_DROPS, rx->hw_drops) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_DROP_OVERRUNS, rx->hw_drop_overruns) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_COMPLETE, rx->csum_complete) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_UNNECESSARY, rx->csum_unnecessary) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_NONE, rx->csum_none) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_BAD, rx->csum_bad) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_PACKETS, rx->hw_gro_packets) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_BYTES, rx->hw_gro_bytes) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_WIRE_PACKETS, rx->hw_gro_wire_packets) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_WIRE_BYTES, rx->hw_gro_wire_bytes) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_DROP_RATELIMITS, rx->hw_drop_ratelimits))
+ return -EMSGSIZE;
+ return 0;
+}
+
+static int
+netdev_nl_stats_write_tx(struct sk_buff *rsp, struct netdev_queue_stats_tx *tx)
+{
+ if (netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_PACKETS, tx->packets) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_BYTES, tx->bytes) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_DROPS, tx->hw_drops) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_DROP_ERRORS, tx->hw_drop_errors) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_CSUM_NONE, tx->csum_none) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_NEEDS_CSUM, tx->needs_csum) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_PACKETS, tx->hw_gso_packets) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_BYTES, tx->hw_gso_bytes) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_WIRE_PACKETS, tx->hw_gso_wire_packets) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_WIRE_BYTES, tx->hw_gso_wire_bytes) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_DROP_RATELIMITS, tx->hw_drop_ratelimits) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_STOP, tx->stop) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_WAKE, tx->wake))
+ return -EMSGSIZE;
+ return 0;
+}
+
+static int
+netdev_nl_stats_queue(struct net_device *netdev, struct sk_buff *rsp,
+ u32 q_type, int i, const struct genl_info *info)
+{
+ const struct netdev_stat_ops *ops = netdev->stat_ops;
+ struct netdev_queue_stats_rx rx;
+ struct netdev_queue_stats_tx tx;
+ void *hdr;
+
+ hdr = genlmsg_iput(rsp, info);
+ if (!hdr)
+ return -EMSGSIZE;
+ if (nla_put_u32(rsp, NETDEV_A_QSTATS_IFINDEX, netdev->ifindex) ||
+ nla_put_u32(rsp, NETDEV_A_QSTATS_QUEUE_TYPE, q_type) ||
+ nla_put_u32(rsp, NETDEV_A_QSTATS_QUEUE_ID, i))
+ goto nla_put_failure;
+
+ switch (q_type) {
+ case NETDEV_QUEUE_TYPE_RX:
+ memset(&rx, 0xff, sizeof(rx));
+ ops->get_queue_stats_rx(netdev, i, &rx);
+ if (!memchr_inv(&rx, 0xff, sizeof(rx)))
+ goto nla_cancel;
+ if (netdev_nl_stats_write_rx(rsp, &rx))
+ goto nla_put_failure;
+ break;
+ case NETDEV_QUEUE_TYPE_TX:
+ memset(&tx, 0xff, sizeof(tx));
+ ops->get_queue_stats_tx(netdev, i, &tx);
+ if (!memchr_inv(&tx, 0xff, sizeof(tx)))
+ goto nla_cancel;
+ if (netdev_nl_stats_write_tx(rsp, &tx))
+ goto nla_put_failure;
+ break;
+ }
+
+ genlmsg_end(rsp, hdr);
+ return 0;
+
+nla_cancel:
+ genlmsg_cancel(rsp, hdr);
+ return 0;
+nla_put_failure:
+ genlmsg_cancel(rsp, hdr);
+ return -EMSGSIZE;
+}
+
+static int
+netdev_nl_stats_by_queue(struct net_device *netdev, struct sk_buff *rsp,
+ const struct genl_info *info,
+ struct netdev_nl_dump_ctx *ctx)
+{
+ const struct netdev_stat_ops *ops = netdev->stat_ops;
+ int i, err;
+
+ if (!(netdev->flags & IFF_UP))
+ return 0;
+
+ i = ctx->rxq_idx;
+ while (ops->get_queue_stats_rx && i < netdev->real_num_rx_queues) {
+ err = netdev_nl_stats_queue(netdev, rsp, NETDEV_QUEUE_TYPE_RX,
+ i, info);
+ if (err)
+ return err;
+ ctx->rxq_idx = ++i;
+ }
+ i = ctx->txq_idx;
+ while (ops->get_queue_stats_tx && i < netdev->real_num_tx_queues) {
+ err = netdev_nl_stats_queue(netdev, rsp, NETDEV_QUEUE_TYPE_TX,
+ i, info);
+ if (err)
+ return err;
+ ctx->txq_idx = ++i;
+ }
+
+ ctx->rxq_idx = 0;
+ ctx->txq_idx = 0;
+ return 0;
+}
+
+/**
+ * netdev_stat_queue_sum() - add up queue stats from range of queues
+ * @netdev: net_device
+ * @rx_start: index of the first Rx queue to query
+ * @rx_end: index after the last Rx queue (first *not* to query)
+ * @rx_sum: output Rx stats, should be already initialized
+ * @tx_start: index of the first Tx queue to query
+ * @tx_end: index after the last Tx queue (first *not* to query)
+ * @tx_sum: output Tx stats, should be already initialized
+ *
+ * Add stats from [start, end) range of queue IDs to *x_sum structs.
+ * The sum structs must be already initialized. Usually this
+ * helper is invoked from the .get_base_stats callbacks of drivers
+ * to account for stats of disabled queues. In that case the ranges
+ * are usually [netdev->real_num_*x_queues, netdev->num_*x_queues).
+ */
+void netdev_stat_queue_sum(struct net_device *netdev,
+ int rx_start, int rx_end,
+ struct netdev_queue_stats_rx *rx_sum,
+ int tx_start, int tx_end,
+ struct netdev_queue_stats_tx *tx_sum)
+{
+ const struct netdev_stat_ops *ops;
+ struct netdev_queue_stats_rx rx;
+ struct netdev_queue_stats_tx tx;
+ int i;
+
+ ops = netdev->stat_ops;
+
+ for (i = rx_start; i < rx_end; i++) {
+ memset(&rx, 0xff, sizeof(rx));
+ if (ops->get_queue_stats_rx)
+ ops->get_queue_stats_rx(netdev, i, &rx);
+ netdev_nl_stats_add(rx_sum, &rx, sizeof(rx));
+ }
+ for (i = tx_start; i < tx_end; i++) {
+ memset(&tx, 0xff, sizeof(tx));
+ if (ops->get_queue_stats_tx)
+ ops->get_queue_stats_tx(netdev, i, &tx);
+ netdev_nl_stats_add(tx_sum, &tx, sizeof(tx));
+ }
+}
+EXPORT_SYMBOL(netdev_stat_queue_sum);
+
+static int
+netdev_nl_stats_by_netdev(struct net_device *netdev, struct sk_buff *rsp,
+ const struct genl_info *info)
+{
+ struct netdev_queue_stats_rx rx_sum;
+ struct netdev_queue_stats_tx tx_sum;
+ void *hdr;
+
+ /* Netdev can't guarantee any complete counters */
+ if (!netdev->stat_ops->get_base_stats)
+ return 0;
+
+ memset(&rx_sum, 0xff, sizeof(rx_sum));
+ memset(&tx_sum, 0xff, sizeof(tx_sum));
+
+ netdev->stat_ops->get_base_stats(netdev, &rx_sum, &tx_sum);
+
+ /* The op was there, but nothing reported, don't bother */
+ if (!memchr_inv(&rx_sum, 0xff, sizeof(rx_sum)) &&
+ !memchr_inv(&tx_sum, 0xff, sizeof(tx_sum)))
+ return 0;
+
+ hdr = genlmsg_iput(rsp, info);
+ if (!hdr)
+ return -EMSGSIZE;
+ if (nla_put_u32(rsp, NETDEV_A_QSTATS_IFINDEX, netdev->ifindex))
+ goto nla_put_failure;
+
+ netdev_stat_queue_sum(netdev, 0, netdev->real_num_rx_queues, &rx_sum,
+ 0, netdev->real_num_tx_queues, &tx_sum);
+
+ if (netdev_nl_stats_write_rx(rsp, &rx_sum) ||
+ netdev_nl_stats_write_tx(rsp, &tx_sum))
+ goto nla_put_failure;
+
+ genlmsg_end(rsp, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(rsp, hdr);
+ return -EMSGSIZE;
+}
+
+static int
+netdev_nl_qstats_get_dump_one(struct net_device *netdev, unsigned int scope,
+ struct sk_buff *skb, const struct genl_info *info,
+ struct netdev_nl_dump_ctx *ctx)
+{
+ if (!netdev->stat_ops)
+ return 0;
+
+ switch (scope) {
+ case 0:
+ return netdev_nl_stats_by_netdev(netdev, skb, info);
+ case NETDEV_QSTATS_SCOPE_QUEUE:
+ return netdev_nl_stats_by_queue(netdev, skb, info, ctx);
+ }
+
+ return -EINVAL; /* Should not happen, per netlink policy */
+}
+
+int netdev_nl_qstats_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb);
+ const struct genl_info *info = genl_info_dump(cb);
+ struct net *net = sock_net(skb->sk);
+ struct net_device *netdev;
+ unsigned int ifindex;
+ unsigned int scope;
+ int err = 0;
+
+ scope = 0;
+ if (info->attrs[NETDEV_A_QSTATS_SCOPE])
+ scope = nla_get_uint(info->attrs[NETDEV_A_QSTATS_SCOPE]);
+
+ ifindex = 0;
+ if (info->attrs[NETDEV_A_QSTATS_IFINDEX])
+ ifindex = nla_get_u32(info->attrs[NETDEV_A_QSTATS_IFINDEX]);
+
+ if (ifindex) {
+ netdev = netdev_get_by_index_lock_ops_compat(net, ifindex);
+ if (!netdev) {
+ NL_SET_BAD_ATTR(info->extack,
+ info->attrs[NETDEV_A_QSTATS_IFINDEX]);
+ return -ENODEV;
+ }
+ if (netdev->stat_ops) {
+ err = netdev_nl_qstats_get_dump_one(netdev, scope, skb,
+ info, ctx);
+ } else {
+ NL_SET_BAD_ATTR(info->extack,
+ info->attrs[NETDEV_A_QSTATS_IFINDEX]);
+ err = -EOPNOTSUPP;
+ }
+ netdev_unlock_ops_compat(netdev);
+ return err;
+ }
+
+ for_each_netdev_lock_ops_compat_scoped(net, netdev, ctx->ifindex) {
+ err = netdev_nl_qstats_get_dump_one(netdev, scope, skb,
+ info, ctx);
+ if (err < 0)
+ break;
+ }
+
+ return err;
+}
+
+static int netdev_nl_read_rxq_bitmap(struct genl_info *info,
+ u32 rxq_bitmap_len,
+ unsigned long *rxq_bitmap)
+{
+ const int maxtype = ARRAY_SIZE(netdev_queue_id_nl_policy) - 1;
+ struct nlattr *tb[ARRAY_SIZE(netdev_queue_id_nl_policy)];
+ struct nlattr *attr;
+ int rem, err = 0;
+ u32 rxq_idx;
+
+ nla_for_each_attr_type(attr, NETDEV_A_DMABUF_QUEUES,
+ genlmsg_data(info->genlhdr),
+ genlmsg_len(info->genlhdr), rem) {
+ err = nla_parse_nested(tb, maxtype, attr,
+ netdev_queue_id_nl_policy, info->extack);
+ if (err < 0)
+ return err;
+
+ if (NL_REQ_ATTR_CHECK(info->extack, attr, tb, NETDEV_A_QUEUE_ID) ||
+ NL_REQ_ATTR_CHECK(info->extack, attr, tb, NETDEV_A_QUEUE_TYPE))
+ return -EINVAL;
+
+ if (nla_get_u32(tb[NETDEV_A_QUEUE_TYPE]) != NETDEV_QUEUE_TYPE_RX) {
+ NL_SET_BAD_ATTR(info->extack, tb[NETDEV_A_QUEUE_TYPE]);
+ return -EINVAL;
+ }
+
+ rxq_idx = nla_get_u32(tb[NETDEV_A_QUEUE_ID]);
+ if (rxq_idx >= rxq_bitmap_len) {
+ NL_SET_BAD_ATTR(info->extack, tb[NETDEV_A_QUEUE_ID]);
+ return -EINVAL;
+ }
+
+ bitmap_set(rxq_bitmap, rxq_idx, 1);
+ }
+
+ return 0;
+}
+
+static struct device *
+netdev_nl_get_dma_dev(struct net_device *netdev, unsigned long *rxq_bitmap,
+ struct netlink_ext_ack *extack)
+{
+ struct device *dma_dev = NULL;
+ u32 rxq_idx, prev_rxq_idx;
+
+ for_each_set_bit(rxq_idx, rxq_bitmap, netdev->real_num_rx_queues) {
+ struct device *rxq_dma_dev;
+
+ rxq_dma_dev = netdev_queue_get_dma_dev(netdev, rxq_idx);
+ if (dma_dev && rxq_dma_dev != dma_dev) {
+ NL_SET_ERR_MSG_FMT(extack, "DMA device mismatch between queue %u and %u (multi-PF device?)",
+ rxq_idx, prev_rxq_idx);
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+
+ dma_dev = rxq_dma_dev;
+ prev_rxq_idx = rxq_idx;
+ }
+
+ return dma_dev;
+}
+
+int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net_devmem_dmabuf_binding *binding;
+ u32 ifindex, dmabuf_fd, rxq_idx;
+ struct netdev_nl_sock *priv;
+ struct net_device *netdev;
+ unsigned long *rxq_bitmap;
+ struct device *dma_dev;
+ struct sk_buff *rsp;
+ int err = 0;
+ void *hdr;
+
+ if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_DEV_IFINDEX) ||
+ GENL_REQ_ATTR_CHECK(info, NETDEV_A_DMABUF_FD) ||
+ GENL_REQ_ATTR_CHECK(info, NETDEV_A_DMABUF_QUEUES))
+ return -EINVAL;
+
+ ifindex = nla_get_u32(info->attrs[NETDEV_A_DEV_IFINDEX]);
+ dmabuf_fd = nla_get_u32(info->attrs[NETDEV_A_DMABUF_FD]);
+
+ priv = genl_sk_priv_get(&netdev_nl_family, NETLINK_CB(skb).sk);
+ if (IS_ERR(priv))
+ return PTR_ERR(priv);
+
+ rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!rsp)
+ return -ENOMEM;
+
+ hdr = genlmsg_iput(rsp, info);
+ if (!hdr) {
+ err = -EMSGSIZE;
+ goto err_genlmsg_free;
+ }
+
+ mutex_lock(&priv->lock);
+
+ err = 0;
+ netdev = netdev_get_by_index_lock(genl_info_net(info), ifindex);
+ if (!netdev) {
+ err = -ENODEV;
+ goto err_unlock_sock;
+ }
+ if (!netif_device_present(netdev))
+ err = -ENODEV;
+ else if (!netdev_need_ops_lock(netdev))
+ err = -EOPNOTSUPP;
+ if (err) {
+ NL_SET_BAD_ATTR(info->extack,
+ info->attrs[NETDEV_A_DEV_IFINDEX]);
+ goto err_unlock;
+ }
+
+ rxq_bitmap = bitmap_zalloc(netdev->real_num_rx_queues, GFP_KERNEL);
+ if (!rxq_bitmap) {
+ err = -ENOMEM;
+ goto err_unlock;
+ }
+
+ err = netdev_nl_read_rxq_bitmap(info, netdev->real_num_rx_queues,
+ rxq_bitmap);
+ if (err)
+ goto err_rxq_bitmap;
+
+ dma_dev = netdev_nl_get_dma_dev(netdev, rxq_bitmap, info->extack);
+ if (IS_ERR(dma_dev)) {
+ err = PTR_ERR(dma_dev);
+ goto err_rxq_bitmap;
+ }
+
+ binding = net_devmem_bind_dmabuf(netdev, dma_dev, DMA_FROM_DEVICE,
+ dmabuf_fd, priv, info->extack);
+ if (IS_ERR(binding)) {
+ err = PTR_ERR(binding);
+ goto err_rxq_bitmap;
+ }
+
+ for_each_set_bit(rxq_idx, rxq_bitmap, netdev->real_num_rx_queues) {
+ err = net_devmem_bind_dmabuf_to_queue(netdev, rxq_idx, binding,
+ info->extack);
+ if (err)
+ goto err_unbind;
+ }
+
+ nla_put_u32(rsp, NETDEV_A_DMABUF_ID, binding->id);
+ genlmsg_end(rsp, hdr);
+
+ err = genlmsg_reply(rsp, info);
+ if (err)
+ goto err_unbind;
+
+ bitmap_free(rxq_bitmap);
+
+ netdev_unlock(netdev);
+
+ mutex_unlock(&priv->lock);
+
+ return 0;
+
+err_unbind:
+ net_devmem_unbind_dmabuf(binding);
+err_rxq_bitmap:
+ bitmap_free(rxq_bitmap);
+err_unlock:
+ netdev_unlock(netdev);
+err_unlock_sock:
+ mutex_unlock(&priv->lock);
+err_genlmsg_free:
+ nlmsg_free(rsp);
+ return err;
+}
+
+int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net_devmem_dmabuf_binding *binding;
+ struct netdev_nl_sock *priv;
+ struct net_device *netdev;
+ struct device *dma_dev;
+ u32 ifindex, dmabuf_fd;
+ struct sk_buff *rsp;
+ int err = 0;
+ void *hdr;
+
+ if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_DEV_IFINDEX) ||
+ GENL_REQ_ATTR_CHECK(info, NETDEV_A_DMABUF_FD))
+ return -EINVAL;
+
+ ifindex = nla_get_u32(info->attrs[NETDEV_A_DEV_IFINDEX]);
+ dmabuf_fd = nla_get_u32(info->attrs[NETDEV_A_DMABUF_FD]);
+
+ priv = genl_sk_priv_get(&netdev_nl_family, NETLINK_CB(skb).sk);
+ if (IS_ERR(priv))
+ return PTR_ERR(priv);
+
+ rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!rsp)
+ return -ENOMEM;
+
+ hdr = genlmsg_iput(rsp, info);
+ if (!hdr) {
+ err = -EMSGSIZE;
+ goto err_genlmsg_free;
+ }
+
+ mutex_lock(&priv->lock);
+
+ netdev = netdev_get_by_index_lock(genl_info_net(info), ifindex);
+ if (!netdev) {
+ err = -ENODEV;
+ goto err_unlock_sock;
+ }
+
+ if (!netif_device_present(netdev)) {
+ err = -ENODEV;
+ goto err_unlock_netdev;
+ }
+
+ if (!netdev->netmem_tx) {
+ err = -EOPNOTSUPP;
+ NL_SET_ERR_MSG(info->extack,
+ "Driver does not support netmem TX");
+ goto err_unlock_netdev;
+ }
+
+ dma_dev = netdev_queue_get_dma_dev(netdev, 0);
+ binding = net_devmem_bind_dmabuf(netdev, dma_dev, DMA_TO_DEVICE,
+ dmabuf_fd, priv, info->extack);
+ if (IS_ERR(binding)) {
+ err = PTR_ERR(binding);
+ goto err_unlock_netdev;
+ }
+
+ nla_put_u32(rsp, NETDEV_A_DMABUF_ID, binding->id);
+ genlmsg_end(rsp, hdr);
+
+ netdev_unlock(netdev);
+ mutex_unlock(&priv->lock);
+
+ return genlmsg_reply(rsp, info);
+
+err_unlock_netdev:
+ netdev_unlock(netdev);
+err_unlock_sock:
+ mutex_unlock(&priv->lock);
+err_genlmsg_free:
+ nlmsg_free(rsp);
+ return err;
+}
+
+void netdev_nl_sock_priv_init(struct netdev_nl_sock *priv)
+{
+ INIT_LIST_HEAD(&priv->bindings);
+ mutex_init(&priv->lock);
+}
+
+void netdev_nl_sock_priv_destroy(struct netdev_nl_sock *priv)
+{
+ struct net_devmem_dmabuf_binding *binding;
+ struct net_devmem_dmabuf_binding *temp;
+ netdevice_tracker dev_tracker;
+ struct net_device *dev;
+
+ mutex_lock(&priv->lock);
+ list_for_each_entry_safe(binding, temp, &priv->bindings, list) {
+ mutex_lock(&binding->lock);
+ dev = binding->dev;
+ if (!dev) {
+ mutex_unlock(&binding->lock);
+ net_devmem_unbind_dmabuf(binding);
+ continue;
+ }
+ netdev_hold(dev, &dev_tracker, GFP_KERNEL);
+ mutex_unlock(&binding->lock);
+
+ netdev_lock(dev);
+ net_devmem_unbind_dmabuf(binding);
+ netdev_unlock(dev);
+ netdev_put(dev, &dev_tracker);
+ }
+ mutex_unlock(&priv->lock);
+}
+
+static int netdev_genl_netdevice_event(struct notifier_block *nb,
+ unsigned long event, void *ptr)
+{
+ struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
+
+ switch (event) {
+ case NETDEV_REGISTER:
+ netdev_lock_ops_to_full(netdev);
+ netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_ADD_NTF);
+ netdev_unlock_full_to_ops(netdev);
+ break;
+ case NETDEV_UNREGISTER:
+ netdev_lock(netdev);
+ netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_DEL_NTF);
+ netdev_unlock(netdev);
+ break;
+ case NETDEV_XDP_FEAT_CHANGE:
+ netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_CHANGE_NTF);
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block netdev_genl_nb = {
+ .notifier_call = netdev_genl_netdevice_event,
+};
+
+static int __init netdev_genl_init(void)
+{
+ int err;
+
+ err = register_netdevice_notifier(&netdev_genl_nb);
+ if (err)
+ return err;
+
+ err = genl_register_family(&netdev_nl_family);
+ if (err)
+ goto err_unreg_ntf;
+
+ return 0;
+
+err_unreg_ntf:
+ unregister_netdevice_notifier(&netdev_genl_nb);
+ return err;
+}
+
+subsys_initcall(netdev_genl_init);
diff --git a/net/core/netdev_queues.c b/net/core/netdev_queues.c
new file mode 100644
index 000000000000..251f27a8307f
--- /dev/null
+++ b/net/core/netdev_queues.c
@@ -0,0 +1,27 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <net/netdev_queues.h>
+
+/**
+ * netdev_queue_get_dma_dev() - get dma device for zero-copy operations
+ * @dev: net_device
+ * @idx: queue index
+ *
+ * Get dma device for zero-copy operations to be used for this queue.
+ * When such device is not available or valid, the function will return NULL.
+ *
+ * Return: Device or NULL on error
+ */
+struct device *netdev_queue_get_dma_dev(struct net_device *dev, int idx)
+{
+ const struct netdev_queue_mgmt_ops *queue_ops = dev->queue_mgmt_ops;
+ struct device *dma_dev;
+
+ if (queue_ops && queue_ops->ndo_queue_get_dma_dev)
+ dma_dev = queue_ops->ndo_queue_get_dma_dev(dev, idx);
+ else
+ dma_dev = dev->dev.parent;
+
+ return dma_dev && dma_dev->dma_mask ? dma_dev : NULL;
+}
+
diff --git a/net/core/netdev_rx_queue.c b/net/core/netdev_rx_queue.c
new file mode 100644
index 000000000000..c7d9341b7630
--- /dev/null
+++ b/net/core/netdev_rx_queue.c
@@ -0,0 +1,194 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/ethtool_netlink.h>
+#include <linux/netdevice.h>
+#include <net/netdev_lock.h>
+#include <net/netdev_queues.h>
+#include <net/netdev_rx_queue.h>
+#include <net/page_pool/memory_provider.h>
+
+#include "page_pool_priv.h"
+
+/* See also page_pool_is_unreadable() */
+bool netif_rxq_has_unreadable_mp(struct net_device *dev, int idx)
+{
+ struct netdev_rx_queue *rxq = __netif_get_rx_queue(dev, idx);
+
+ return !!rxq->mp_params.mp_ops;
+}
+EXPORT_SYMBOL(netif_rxq_has_unreadable_mp);
+
+int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq_idx)
+{
+ struct netdev_rx_queue *rxq = __netif_get_rx_queue(dev, rxq_idx);
+ const struct netdev_queue_mgmt_ops *qops = dev->queue_mgmt_ops;
+ void *new_mem, *old_mem;
+ int err;
+
+ if (!qops || !qops->ndo_queue_stop || !qops->ndo_queue_mem_free ||
+ !qops->ndo_queue_mem_alloc || !qops->ndo_queue_start)
+ return -EOPNOTSUPP;
+
+ netdev_assert_locked(dev);
+
+ new_mem = kvzalloc(qops->ndo_queue_mem_size, GFP_KERNEL);
+ if (!new_mem)
+ return -ENOMEM;
+
+ old_mem = kvzalloc(qops->ndo_queue_mem_size, GFP_KERNEL);
+ if (!old_mem) {
+ err = -ENOMEM;
+ goto err_free_new_mem;
+ }
+
+ err = qops->ndo_queue_mem_alloc(dev, new_mem, rxq_idx);
+ if (err)
+ goto err_free_old_mem;
+
+ err = page_pool_check_memory_provider(dev, rxq);
+ if (err)
+ goto err_free_new_queue_mem;
+
+ if (netif_running(dev)) {
+ err = qops->ndo_queue_stop(dev, old_mem, rxq_idx);
+ if (err)
+ goto err_free_new_queue_mem;
+
+ err = qops->ndo_queue_start(dev, new_mem, rxq_idx);
+ if (err)
+ goto err_start_queue;
+ } else {
+ swap(new_mem, old_mem);
+ }
+
+ qops->ndo_queue_mem_free(dev, old_mem);
+
+ kvfree(old_mem);
+ kvfree(new_mem);
+
+ return 0;
+
+err_start_queue:
+ /* Restarting the queue with old_mem should be successful as we haven't
+ * changed any of the queue configuration, and there is not much we can
+ * do to recover from a failure here.
+ *
+ * WARN if we fail to recover the old rx queue, and at least free
+ * old_mem so we don't also leak that.
+ */
+ if (qops->ndo_queue_start(dev, old_mem, rxq_idx)) {
+ WARN(1,
+ "Failed to restart old queue in error path. RX queue %d may be unhealthy.",
+ rxq_idx);
+ qops->ndo_queue_mem_free(dev, old_mem);
+ }
+
+err_free_new_queue_mem:
+ qops->ndo_queue_mem_free(dev, new_mem);
+
+err_free_old_mem:
+ kvfree(old_mem);
+
+err_free_new_mem:
+ kvfree(new_mem);
+
+ return err;
+}
+EXPORT_SYMBOL_NS_GPL(netdev_rx_queue_restart, "NETDEV_INTERNAL");
+
+int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx,
+ const struct pp_memory_provider_params *p,
+ struct netlink_ext_ack *extack)
+{
+ struct netdev_rx_queue *rxq;
+ int ret;
+
+ if (!netdev_need_ops_lock(dev))
+ return -EOPNOTSUPP;
+
+ if (rxq_idx >= dev->real_num_rx_queues) {
+ NL_SET_ERR_MSG(extack, "rx queue index out of range");
+ return -ERANGE;
+ }
+ rxq_idx = array_index_nospec(rxq_idx, dev->real_num_rx_queues);
+
+ if (dev->cfg->hds_config != ETHTOOL_TCP_DATA_SPLIT_ENABLED) {
+ NL_SET_ERR_MSG(extack, "tcp-data-split is disabled");
+ return -EINVAL;
+ }
+ if (dev->cfg->hds_thresh) {
+ NL_SET_ERR_MSG(extack, "hds-thresh is not zero");
+ return -EINVAL;
+ }
+ if (dev_xdp_prog_count(dev)) {
+ NL_SET_ERR_MSG(extack, "unable to custom memory provider to device with XDP program attached");
+ return -EEXIST;
+ }
+
+ rxq = __netif_get_rx_queue(dev, rxq_idx);
+ if (rxq->mp_params.mp_ops) {
+ NL_SET_ERR_MSG(extack, "designated queue already memory provider bound");
+ return -EEXIST;
+ }
+#ifdef CONFIG_XDP_SOCKETS
+ if (rxq->pool) {
+ NL_SET_ERR_MSG(extack, "designated queue already in use by AF_XDP");
+ return -EBUSY;
+ }
+#endif
+
+ rxq->mp_params = *p;
+ ret = netdev_rx_queue_restart(dev, rxq_idx);
+ if (ret) {
+ rxq->mp_params.mp_ops = NULL;
+ rxq->mp_params.mp_priv = NULL;
+ }
+ return ret;
+}
+
+int net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx,
+ struct pp_memory_provider_params *p)
+{
+ int ret;
+
+ netdev_lock(dev);
+ ret = __net_mp_open_rxq(dev, rxq_idx, p, NULL);
+ netdev_unlock(dev);
+ return ret;
+}
+
+void __net_mp_close_rxq(struct net_device *dev, unsigned int ifq_idx,
+ const struct pp_memory_provider_params *old_p)
+{
+ struct netdev_rx_queue *rxq;
+ int err;
+
+ if (WARN_ON_ONCE(ifq_idx >= dev->real_num_rx_queues))
+ return;
+
+ rxq = __netif_get_rx_queue(dev, ifq_idx);
+
+ /* Callers holding a netdev ref may get here after we already
+ * went thru shutdown via dev_memory_provider_uninstall().
+ */
+ if (dev->reg_state > NETREG_REGISTERED &&
+ !rxq->mp_params.mp_ops)
+ return;
+
+ if (WARN_ON_ONCE(rxq->mp_params.mp_ops != old_p->mp_ops ||
+ rxq->mp_params.mp_priv != old_p->mp_priv))
+ return;
+
+ rxq->mp_params.mp_ops = NULL;
+ rxq->mp_params.mp_priv = NULL;
+ err = netdev_rx_queue_restart(dev, ifq_idx);
+ WARN_ON(err && err != -ENETDOWN);
+}
+
+void net_mp_close_rxq(struct net_device *dev, unsigned ifq_idx,
+ struct pp_memory_provider_params *old_p)
+{
+ netdev_lock(dev);
+ __net_mp_close_rxq(dev, ifq_idx, old_p);
+ netdev_unlock(dev);
+}
diff --git a/net/core/netevent.c b/net/core/netevent.c
index 35d02c38554e..5bb615e963cc 100644
--- a/net/core/netevent.c
+++ b/net/core/netevent.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Network event notifiers
*
@@ -5,16 +6,13 @@
* Tom Tucker <tom@opengridcomputing.com>
* Steve Wise <swise@opengridcomputing.com>
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Fixes:
*/
#include <linux/rtnetlink.h>
#include <linux/notifier.h>
+#include <linux/export.h>
+#include <net/netevent.h>
static ATOMIC_NOTIFIER_HEAD(netevent_notif_chain);
@@ -29,14 +27,12 @@ static ATOMIC_NOTIFIER_HEAD(netevent_notif_chain);
*/
int register_netevent_notifier(struct notifier_block *nb)
{
- int err;
-
- err = atomic_notifier_chain_register(&netevent_notif_chain, nb);
- return err;
+ return atomic_notifier_chain_register(&netevent_notif_chain, nb);
}
+EXPORT_SYMBOL_GPL(register_netevent_notifier);
/**
- * netevent_unregister_notifier - unregister a netevent notifier block
+ * unregister_netevent_notifier - unregister a netevent notifier block
* @nb: notifier
*
* Unregister a notifier previously registered by
@@ -49,6 +45,7 @@ int unregister_netevent_notifier(struct notifier_block *nb)
{
return atomic_notifier_chain_unregister(&netevent_notif_chain, nb);
}
+EXPORT_SYMBOL_GPL(unregister_netevent_notifier);
/**
* call_netevent_notifiers - call all netevent notifier blocks
@@ -63,7 +60,4 @@ int call_netevent_notifiers(unsigned long val, void *v)
{
return atomic_notifier_call_chain(&netevent_notif_chain, val, v);
}
-
-EXPORT_SYMBOL_GPL(register_netevent_notifier);
-EXPORT_SYMBOL_GPL(unregister_netevent_notifier);
EXPORT_SYMBOL_GPL(call_netevent_notifiers);
diff --git a/net/core/netmem_priv.h b/net/core/netmem_priv.h
new file mode 100644
index 000000000000..23175cb2bd86
--- /dev/null
+++ b/net/core/netmem_priv.h
@@ -0,0 +1,62 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __NETMEM_PRIV_H
+#define __NETMEM_PRIV_H
+
+static inline unsigned long netmem_get_pp_magic(netmem_ref netmem)
+{
+ return netmem_to_nmdesc(netmem)->pp_magic & ~PP_DMA_INDEX_MASK;
+}
+
+static inline void netmem_or_pp_magic(netmem_ref netmem, unsigned long pp_magic)
+{
+ netmem_to_nmdesc(netmem)->pp_magic |= pp_magic;
+}
+
+static inline void netmem_clear_pp_magic(netmem_ref netmem)
+{
+ WARN_ON_ONCE(netmem_to_nmdesc(netmem)->pp_magic & PP_DMA_INDEX_MASK);
+
+ netmem_to_nmdesc(netmem)->pp_magic = 0;
+}
+
+static inline bool netmem_is_pp(netmem_ref netmem)
+{
+ return (netmem_get_pp_magic(netmem) & PP_MAGIC_MASK) == PP_SIGNATURE;
+}
+
+static inline void netmem_set_pp(netmem_ref netmem, struct page_pool *pool)
+{
+ netmem_to_nmdesc(netmem)->pp = pool;
+}
+
+static inline void netmem_set_dma_addr(netmem_ref netmem,
+ unsigned long dma_addr)
+{
+ netmem_to_nmdesc(netmem)->dma_addr = dma_addr;
+}
+
+static inline unsigned long netmem_get_dma_index(netmem_ref netmem)
+{
+ unsigned long magic;
+
+ if (WARN_ON_ONCE(netmem_is_net_iov(netmem)))
+ return 0;
+
+ magic = netmem_to_nmdesc(netmem)->pp_magic;
+
+ return (magic & PP_DMA_INDEX_MASK) >> PP_DMA_INDEX_SHIFT;
+}
+
+static inline void netmem_set_dma_index(netmem_ref netmem,
+ unsigned long id)
+{
+ unsigned long magic;
+
+ if (WARN_ON_ONCE(netmem_is_net_iov(netmem)))
+ return;
+
+ magic = netmem_get_pp_magic(netmem) | (id << PP_DMA_INDEX_SHIFT);
+ netmem_to_nmdesc(netmem)->pp_magic = magic;
+}
+#endif
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 3c58846fcaa5..09f72f10813c 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Common framework for low-level network console, dump, and debugger code
*
@@ -9,7 +10,10 @@
* Copyright (C) 2002 Red Hat, Inc.
*/
-#include <linux/smp_lock.h>
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/moduleparam.h>
+#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/string.h>
@@ -22,9 +26,17 @@
#include <linux/delay.h>
#include <linux/rcupdate.h>
#include <linux/workqueue.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/if_vlan.h>
#include <net/tcp.h>
#include <net/udp.h>
-#include <asm/unaligned.h>
+#include <net/addrconf.h>
+#include <net/ndisc.h>
+#include <net/ip6_checksum.h>
+#include <linux/unaligned.h>
+#include <trace/events/napi.h>
+#include <linux/kconfig.h>
/*
* We maintain a small pool of fully-sized skbs, to make sure the
@@ -33,148 +45,199 @@
#define MAX_UDP_CHUNK 1460
#define MAX_SKBS 32
-#define MAX_QUEUE_DEPTH (MAX_SKBS / 2)
+#define USEC_PER_POLL 50
-static struct sk_buff_head skb_pool;
+#define MAX_SKB_SIZE \
+ (sizeof(struct ethhdr) + \
+ sizeof(struct iphdr) + \
+ sizeof(struct udphdr) + \
+ MAX_UDP_CHUNK)
-static atomic_t trapped;
+static void zap_completion_queue(void);
-#define USEC_PER_POLL 50
-#define NETPOLL_RX_ENABLED 1
-#define NETPOLL_RX_DROP 2
+static unsigned int carrier_timeout = 4;
+module_param(carrier_timeout, uint, 0644);
-#define MAX_SKB_SIZE \
- (MAX_UDP_CHUNK + sizeof(struct udphdr) + \
- sizeof(struct iphdr) + sizeof(struct ethhdr))
+static netdev_tx_t netpoll_start_xmit(struct sk_buff *skb,
+ struct net_device *dev,
+ struct netdev_queue *txq)
+{
+ netdev_tx_t status = NETDEV_TX_OK;
+ netdev_features_t features;
+
+ features = netif_skb_features(skb);
+
+ if (skb_vlan_tag_present(skb) &&
+ !vlan_hw_offload_capable(features, skb->vlan_proto)) {
+ skb = __vlan_hwaccel_push_inside(skb);
+ if (unlikely(!skb)) {
+ /* This is actually a packet drop, but we
+ * don't want the code that calls this
+ * function to try and operate on a NULL skb.
+ */
+ goto out;
+ }
+ }
-static void zap_completion_queue(void);
-static void arp_reply(struct sk_buff *skb);
+ status = netdev_start_xmit(skb, dev, txq, false);
-static void queue_process(void *p)
+out:
+ return status;
+}
+
+static void queue_process(struct work_struct *work)
{
- struct netpoll_info *npinfo = p;
+ struct netpoll_info *npinfo =
+ container_of(work, struct netpoll_info, tx_work.work);
struct sk_buff *skb;
+ unsigned long flags;
while ((skb = skb_dequeue(&npinfo->txq))) {
struct net_device *dev = skb->dev;
+ struct netdev_queue *txq;
+ unsigned int q_index;
if (!netif_device_present(dev) || !netif_running(dev)) {
- __kfree_skb(skb);
+ kfree_skb(skb);
continue;
}
- netif_tx_lock_bh(dev);
- if (netif_queue_stopped(dev) ||
- dev->hard_start_xmit(skb, dev) != NETDEV_TX_OK) {
+ local_irq_save(flags);
+ /* check if skb->queue_mapping is still valid */
+ q_index = skb_get_queue_mapping(skb);
+ if (unlikely(q_index >= dev->real_num_tx_queues)) {
+ q_index = q_index % dev->real_num_tx_queues;
+ skb_set_queue_mapping(skb, q_index);
+ }
+ txq = netdev_get_tx_queue(dev, q_index);
+ HARD_TX_LOCK(dev, txq, smp_processor_id());
+ if (netif_xmit_frozen_or_stopped(txq) ||
+ !dev_xmit_complete(netpoll_start_xmit(skb, dev, txq))) {
skb_queue_head(&npinfo->txq, skb);
- netif_tx_unlock_bh(dev);
+ HARD_TX_UNLOCK(dev, txq);
+ local_irq_restore(flags);
schedule_delayed_work(&npinfo->tx_work, HZ/10);
return;
}
-
- netif_tx_unlock_bh(dev);
+ HARD_TX_UNLOCK(dev, txq);
+ local_irq_restore(flags);
}
}
-static __sum16 checksum_udp(struct sk_buff *skb, struct udphdr *uh,
- unsigned short ulen, __be32 saddr, __be32 daddr)
+static int netif_local_xmit_active(struct net_device *dev)
{
- __wsum psum;
-
- if (uh->check == 0 || skb->ip_summed == CHECKSUM_UNNECESSARY)
- return 0;
+ int i;
- psum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0);
+ for (i = 0; i < dev->num_tx_queues; i++) {
+ struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
- if (skb->ip_summed == CHECKSUM_COMPLETE &&
- !csum_fold(csum_add(psum, skb->csum)))
- return 0;
-
- skb->csum = psum;
+ if (READ_ONCE(txq->xmit_lock_owner) == smp_processor_id())
+ return 1;
+ }
- return __skb_checksum_complete(skb);
+ return 0;
}
-/*
- * Check whether delayed processing was scheduled for our NIC. If so,
- * we attempt to grab the poll lock and use ->poll() to pump the card.
- * If this fails, either we've recursed in ->poll() or it's already
- * running on another CPU.
- *
- * Note: we don't mask interrupts with this lock because we're using
- * trylock here and interrupts are already disabled in the softirq
- * case. Further, we test the poll_owner to avoid recursion on UP
- * systems where the lock doesn't exist.
- *
- * In cases where there is bi-directional communications, reading only
- * one message at a time can lead to packets being dropped by the
- * network adapter, forcing superfluous retries and possibly timeouts.
- * Thus, we set our budget to greater than 1.
- */
-static void poll_napi(struct netpoll *np)
+static void poll_one_napi(struct napi_struct *napi)
{
- struct netpoll_info *npinfo = np->dev->npinfo;
- int budget = 16;
+ int work;
+
+ /* If we set this bit but see that it has already been set,
+ * that indicates that napi has been disabled and we need
+ * to abort this operation
+ */
+ if (test_and_set_bit(NAPI_STATE_NPSVC, &napi->state))
+ return;
+
+ /* We explicitly pass the polling call a budget of 0 to
+ * indicate that we are clearing the Tx path only.
+ */
+ work = napi->poll(napi, 0);
+ WARN_ONCE(work, "%pS exceeded budget in poll\n", napi->poll);
+ trace_napi_poll(napi, work, 0);
- if (test_bit(__LINK_STATE_RX_SCHED, &np->dev->state) &&
- npinfo->poll_owner != smp_processor_id() &&
- spin_trylock(&npinfo->poll_lock)) {
- npinfo->rx_flags |= NETPOLL_RX_DROP;
- atomic_inc(&trapped);
+ clear_bit(NAPI_STATE_NPSVC, &napi->state);
+}
- np->dev->poll(np->dev, &budget);
+static void poll_napi(struct net_device *dev)
+{
+ struct napi_struct *napi;
+ int cpu = smp_processor_id();
- atomic_dec(&trapped);
- npinfo->rx_flags &= ~NETPOLL_RX_DROP;
- spin_unlock(&npinfo->poll_lock);
+ list_for_each_entry_rcu(napi, &dev->napi_list, dev_list) {
+ if (cmpxchg(&napi->poll_owner, -1, cpu) == -1) {
+ poll_one_napi(napi);
+ smp_store_release(&napi->poll_owner, -1);
+ }
}
}
-static void service_arp_queue(struct netpoll_info *npi)
+void netpoll_poll_dev(struct net_device *dev)
{
- struct sk_buff *skb;
+ struct netpoll_info *ni = rcu_dereference_bh(dev->npinfo);
+ const struct net_device_ops *ops;
- if (unlikely(!npi))
+ /* Don't do any rx activity if the dev_lock mutex is held
+ * the dev_open/close paths use this to block netpoll activity
+ * while changing device state
+ */
+ if (!ni || down_trylock(&ni->dev_lock))
return;
- skb = skb_dequeue(&npi->arp_tx);
-
- while (skb != NULL) {
- arp_reply(skb);
- skb = skb_dequeue(&npi->arp_tx);
+ /* Some drivers will take the same locks in poll and xmit,
+ * we can't poll if local CPU is already in xmit.
+ */
+ if (!netif_running(dev) || netif_local_xmit_active(dev)) {
+ up(&ni->dev_lock);
+ return;
}
+
+ ops = dev->netdev_ops;
+ if (ops->ndo_poll_controller)
+ ops->ndo_poll_controller(dev);
+
+ poll_napi(dev);
+
+ up(&ni->dev_lock);
+
+ zap_completion_queue();
}
+EXPORT_SYMBOL(netpoll_poll_dev);
-void netpoll_poll(struct netpoll *np)
+void netpoll_poll_disable(struct net_device *dev)
{
- if (!np->dev || !netif_running(np->dev) || !np->dev->poll_controller)
- return;
+ struct netpoll_info *ni;
- /* Process pending work on NIC */
- np->dev->poll_controller(np->dev);
- if (np->dev->poll)
- poll_napi(np);
+ might_sleep();
+ ni = rtnl_dereference(dev->npinfo);
+ if (ni)
+ down(&ni->dev_lock);
+}
- service_arp_queue(np->dev->npinfo);
+void netpoll_poll_enable(struct net_device *dev)
+{
+ struct netpoll_info *ni;
- zap_completion_queue();
+ ni = rtnl_dereference(dev->npinfo);
+ if (ni)
+ up(&ni->dev_lock);
}
-static void refill_skbs(void)
+static void refill_skbs(struct netpoll *np)
{
+ struct sk_buff_head *skb_pool;
struct sk_buff *skb;
- unsigned long flags;
- spin_lock_irqsave(&skb_pool.lock, flags);
- while (skb_pool.qlen < MAX_SKBS) {
+ skb_pool = &np->skb_pool;
+
+ while (READ_ONCE(skb_pool->qlen) < MAX_SKBS) {
skb = alloc_skb(MAX_SKB_SIZE, GFP_ATOMIC);
if (!skb)
break;
- __skb_queue_tail(&skb_pool, skb);
+ skb_queue_tail(skb_pool, skb);
}
- spin_unlock_irqrestore(&skb_pool.lock, flags);
}
static void zap_completion_queue(void)
@@ -193,10 +256,12 @@ static void zap_completion_queue(void)
while (clist != NULL) {
struct sk_buff *skb = clist;
clist = clist->next;
- if (skb->destructor)
+ if (!skb_irq_freeable(skb)) {
+ refcount_set(&skb->users, 1);
dev_kfree_skb_any(skb); /* put this one back */
- else
+ } else {
__kfree_skb(skb);
+ }
}
}
@@ -209,584 +274,587 @@ static struct sk_buff *find_skb(struct netpoll *np, int len, int reserve)
struct sk_buff *skb;
zap_completion_queue();
- refill_skbs();
repeat:
skb = alloc_skb(len, GFP_ATOMIC);
- if (!skb)
- skb = skb_dequeue(&skb_pool);
+ if (!skb) {
+ skb = skb_dequeue(&np->skb_pool);
+ schedule_work(&np->refill_wq);
+ }
if (!skb) {
if (++count < 10) {
- netpoll_poll(np);
+ netpoll_poll_dev(np->dev);
goto repeat;
}
return NULL;
}
- atomic_set(&skb->users, 1);
+ refcount_set(&skb->users, 1);
skb_reserve(skb, reserve);
return skb;
}
-static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
+static int netpoll_owner_active(struct net_device *dev)
+{
+ struct napi_struct *napi;
+
+ list_for_each_entry_rcu(napi, &dev->napi_list, dev_list) {
+ if (READ_ONCE(napi->poll_owner) == smp_processor_id())
+ return 1;
+ }
+ return 0;
+}
+
+/* call with IRQ disabled */
+static netdev_tx_t __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
{
- int status = NETDEV_TX_BUSY;
+ netdev_tx_t status = NETDEV_TX_BUSY;
+ netdev_tx_t ret = NET_XMIT_DROP;
+ struct net_device *dev;
unsigned long tries;
- struct net_device *dev = np->dev;
- struct netpoll_info *npinfo = np->dev->npinfo;
+ /* It is up to the caller to keep npinfo alive. */
+ struct netpoll_info *npinfo;
+
+ lockdep_assert_irqs_disabled();
+
+ dev = np->dev;
+ rcu_read_lock();
+ npinfo = rcu_dereference_bh(dev->npinfo);
- if (!npinfo || !netif_running(dev) || !netif_device_present(dev)) {
- __kfree_skb(skb);
- return;
- }
+ if (!npinfo || !netif_running(dev) || !netif_device_present(dev)) {
+ dev_kfree_skb_irq(skb);
+ goto out;
+ }
/* don't get messages out of order, and no recursion */
- if (skb_queue_len(&npinfo->txq) == 0 &&
- npinfo->poll_owner != smp_processor_id() &&
- netif_tx_trylock(dev)) {
+ if (skb_queue_len(&npinfo->txq) == 0 && !netpoll_owner_active(dev)) {
+ struct netdev_queue *txq;
+
+ txq = netdev_core_pick_tx(dev, skb, NULL);
+
/* try until next clock tick */
- for (tries = jiffies_to_usecs(1)/USEC_PER_POLL; tries > 0; --tries) {
- if (!netif_queue_stopped(dev))
- status = dev->hard_start_xmit(skb, dev);
+ for (tries = jiffies_to_usecs(1)/USEC_PER_POLL;
+ tries > 0; --tries) {
+ if (HARD_TX_TRYLOCK(dev, txq)) {
+ if (!netif_xmit_stopped(txq))
+ status = netpoll_start_xmit(skb, dev, txq);
- if (status == NETDEV_TX_OK)
- break;
+ HARD_TX_UNLOCK(dev, txq);
+
+ if (dev_xmit_complete(status))
+ break;
+
+ }
/* tickle device maybe there is some cleanup */
- netpoll_poll(np);
+ netpoll_poll_dev(np->dev);
udelay(USEC_PER_POLL);
}
- netif_tx_unlock(dev);
+
+ WARN_ONCE(!irqs_disabled(),
+ "netpoll_send_skb_on_dev(): %s enabled interrupts in poll (%pS)\n",
+ dev->name, dev->netdev_ops->ndo_start_xmit);
+
}
- if (status != NETDEV_TX_OK) {
+ if (!dev_xmit_complete(status)) {
skb_queue_tail(&npinfo->txq, skb);
- schedule_work(&npinfo->tx_work);
+ schedule_delayed_work(&npinfo->tx_work,0);
}
+ ret = NETDEV_TX_OK;
+out:
+ rcu_read_unlock();
+ return ret;
}
-void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
+static void netpoll_udp_checksum(struct netpoll *np, struct sk_buff *skb,
+ int len)
{
- int total_len, eth_len, ip_len, udp_len;
- struct sk_buff *skb;
struct udphdr *udph;
- struct iphdr *iph;
- struct ethhdr *eth;
+ int udp_len;
- udp_len = len + sizeof(*udph);
- ip_len = eth_len = udp_len + sizeof(*iph);
- total_len = eth_len + ETH_HLEN + NET_IP_ALIGN;
+ udp_len = len + sizeof(struct udphdr);
+ udph = udp_hdr(skb);
- skb = find_skb(np, total_len, total_len - len);
- if (!skb)
- return;
-
- memcpy(skb->data, msg, len);
- skb->len += len;
-
- skb->h.uh = udph = (struct udphdr *) skb_push(skb, sizeof(*udph));
- udph->source = htons(np->local_port);
- udph->dest = htons(np->remote_port);
- udph->len = htons(udp_len);
+ /* check needs to be set, since it will be consumed in csum_partial */
udph->check = 0;
- udph->check = csum_tcpudp_magic(htonl(np->local_ip),
- htonl(np->remote_ip),
- udp_len, IPPROTO_UDP,
- csum_partial((unsigned char *)udph, udp_len, 0));
+ if (np->ipv6)
+ udph->check = csum_ipv6_magic(&np->local_ip.in6,
+ &np->remote_ip.in6,
+ udp_len, IPPROTO_UDP,
+ csum_partial(udph, udp_len, 0));
+ else
+ udph->check = csum_tcpudp_magic(np->local_ip.ip,
+ np->remote_ip.ip,
+ udp_len, IPPROTO_UDP,
+ csum_partial(udph, udp_len, 0));
if (udph->check == 0)
udph->check = CSUM_MANGLED_0;
+}
+
+netdev_tx_t netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
+{
+ unsigned long flags;
+ netdev_tx_t ret;
+
+ if (unlikely(!np)) {
+ dev_kfree_skb_irq(skb);
+ ret = NET_XMIT_DROP;
+ } else {
+ local_irq_save(flags);
+ ret = __netpoll_send_skb(np, skb);
+ local_irq_restore(flags);
+ }
+ return ret;
+}
+EXPORT_SYMBOL(netpoll_send_skb);
+
+static void push_ipv6(struct netpoll *np, struct sk_buff *skb, int len)
+{
+ struct ipv6hdr *ip6h;
- skb->nh.iph = iph = (struct iphdr *)skb_push(skb, sizeof(*iph));
+ skb_push(skb, sizeof(struct ipv6hdr));
+ skb_reset_network_header(skb);
+ ip6h = ipv6_hdr(skb);
+
+ /* ip6h->version = 6; ip6h->priority = 0; */
+ *(unsigned char *)ip6h = 0x60;
+ ip6h->flow_lbl[0] = 0;
+ ip6h->flow_lbl[1] = 0;
+ ip6h->flow_lbl[2] = 0;
+
+ ip6h->payload_len = htons(sizeof(struct udphdr) + len);
+ ip6h->nexthdr = IPPROTO_UDP;
+ ip6h->hop_limit = 32;
+ ip6h->saddr = np->local_ip.in6;
+ ip6h->daddr = np->remote_ip.in6;
+
+ skb->protocol = htons(ETH_P_IPV6);
+}
+
+static void push_ipv4(struct netpoll *np, struct sk_buff *skb, int len)
+{
+ static atomic_t ip_ident;
+ struct iphdr *iph;
+ int ip_len;
+
+ ip_len = len + sizeof(struct udphdr) + sizeof(struct iphdr);
+
+ skb_push(skb, sizeof(struct iphdr));
+ skb_reset_network_header(skb);
+ iph = ip_hdr(skb);
/* iph->version = 4; iph->ihl = 5; */
- put_unaligned(0x45, (unsigned char *)iph);
- iph->tos = 0;
- put_unaligned(htons(ip_len), &(iph->tot_len));
- iph->id = 0;
+ *(unsigned char *)iph = 0x45;
+ iph->tos = 0;
+ put_unaligned(htons(ip_len), &iph->tot_len);
+ iph->id = htons(atomic_inc_return(&ip_ident));
iph->frag_off = 0;
- iph->ttl = 64;
+ iph->ttl = 64;
iph->protocol = IPPROTO_UDP;
- iph->check = 0;
- put_unaligned(htonl(np->local_ip), &(iph->saddr));
- put_unaligned(htonl(np->remote_ip), &(iph->daddr));
- iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
+ iph->check = 0;
+ put_unaligned(np->local_ip.ip, &iph->saddr);
+ put_unaligned(np->remote_ip.ip, &iph->daddr);
+ iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
+ skb->protocol = htons(ETH_P_IP);
+}
- eth = (struct ethhdr *) skb_push(skb, ETH_HLEN);
- skb->mac.raw = skb->data;
- skb->protocol = eth->h_proto = htons(ETH_P_IP);
- memcpy(eth->h_source, np->local_mac, 6);
- memcpy(eth->h_dest, np->remote_mac, 6);
+static void push_udp(struct netpoll *np, struct sk_buff *skb, int len)
+{
+ struct udphdr *udph;
+ int udp_len;
- skb->dev = np->dev;
+ udp_len = len + sizeof(struct udphdr);
- netpoll_send_skb(np, skb);
-}
+ skb_push(skb, sizeof(struct udphdr));
+ skb_reset_transport_header(skb);
-static void arp_reply(struct sk_buff *skb)
-{
- struct netpoll_info *npinfo = skb->dev->npinfo;
- struct arphdr *arp;
- unsigned char *arp_ptr;
- int size, type = ARPOP_REPLY, ptype = ETH_P_ARP;
- __be32 sip, tip;
- struct sk_buff *send_skb;
- struct netpoll *np = NULL;
+ udph = udp_hdr(skb);
+ udph->source = htons(np->local_port);
+ udph->dest = htons(np->remote_port);
+ udph->len = htons(udp_len);
- if (npinfo->rx_np && npinfo->rx_np->dev == skb->dev)
- np = npinfo->rx_np;
- if (!np)
- return;
+ netpoll_udp_checksum(np, skb, len);
+}
- /* No arp on this interface */
- if (skb->dev->flags & IFF_NOARP)
- return;
+static void push_eth(struct netpoll *np, struct sk_buff *skb)
+{
+ struct ethhdr *eth;
- if (!pskb_may_pull(skb, (sizeof(struct arphdr) +
- (2 * skb->dev->addr_len) +
- (2 * sizeof(u32)))))
- return;
+ eth = skb_push(skb, ETH_HLEN);
+ skb_reset_mac_header(skb);
+ ether_addr_copy(eth->h_source, np->dev->dev_addr);
+ ether_addr_copy(eth->h_dest, np->remote_mac);
+ if (np->ipv6)
+ eth->h_proto = htons(ETH_P_IPV6);
+ else
+ eth->h_proto = htons(ETH_P_IP);
+}
- skb->h.raw = skb->nh.raw = skb->data;
- arp = skb->nh.arph;
+int netpoll_send_udp(struct netpoll *np, const char *msg, int len)
+{
+ int total_len, ip_len, udp_len;
+ struct sk_buff *skb;
- if ((arp->ar_hrd != htons(ARPHRD_ETHER) &&
- arp->ar_hrd != htons(ARPHRD_IEEE802)) ||
- arp->ar_pro != htons(ETH_P_IP) ||
- arp->ar_op != htons(ARPOP_REQUEST))
- return;
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ WARN_ON_ONCE(!irqs_disabled());
- arp_ptr = (unsigned char *)(arp+1) + skb->dev->addr_len;
- memcpy(&sip, arp_ptr, 4);
- arp_ptr += 4 + skb->dev->addr_len;
- memcpy(&tip, arp_ptr, 4);
+ udp_len = len + sizeof(struct udphdr);
+ if (np->ipv6)
+ ip_len = udp_len + sizeof(struct ipv6hdr);
+ else
+ ip_len = udp_len + sizeof(struct iphdr);
- /* Should we ignore arp? */
- if (tip != htonl(np->local_ip) || LOOPBACK(tip) || MULTICAST(tip))
- return;
+ total_len = ip_len + LL_RESERVED_SPACE(np->dev);
- size = sizeof(struct arphdr) + 2 * (skb->dev->addr_len + 4);
- send_skb = find_skb(np, size + LL_RESERVED_SPACE(np->dev),
- LL_RESERVED_SPACE(np->dev));
+ skb = find_skb(np, total_len + np->dev->needed_tailroom,
+ total_len - len);
+ if (!skb)
+ return -ENOMEM;
- if (!send_skb)
- return;
+ skb_copy_to_linear_data(skb, msg, len);
+ skb_put(skb, len);
- send_skb->nh.raw = send_skb->data;
- arp = (struct arphdr *) skb_put(send_skb, size);
- send_skb->dev = skb->dev;
- send_skb->protocol = htons(ETH_P_ARP);
+ push_udp(np, skb, len);
+ if (np->ipv6)
+ push_ipv6(np, skb, len);
+ else
+ push_ipv4(np, skb, len);
+ push_eth(np, skb);
+ skb->dev = np->dev;
- /* Fill the device header for the ARP frame */
+ return (int)netpoll_send_skb(np, skb);
+}
+EXPORT_SYMBOL(netpoll_send_udp);
- if (np->dev->hard_header &&
- np->dev->hard_header(send_skb, skb->dev, ptype,
- np->remote_mac, np->local_mac,
- send_skb->len) < 0) {
- kfree_skb(send_skb);
- return;
- }
- /*
- * Fill out the arp protocol part.
- *
- * we only support ethernet device type,
- * which (according to RFC 1390) should always equal 1 (Ethernet).
- */
+static void skb_pool_flush(struct netpoll *np)
+{
+ struct sk_buff_head *skb_pool;
- arp->ar_hrd = htons(np->dev->type);
- arp->ar_pro = htons(ETH_P_IP);
- arp->ar_hln = np->dev->addr_len;
- arp->ar_pln = 4;
- arp->ar_op = htons(type);
+ cancel_work_sync(&np->refill_wq);
+ skb_pool = &np->skb_pool;
+ skb_queue_purge_reason(skb_pool, SKB_CONSUMED);
+}
- arp_ptr=(unsigned char *)(arp + 1);
- memcpy(arp_ptr, np->dev->dev_addr, np->dev->addr_len);
- arp_ptr += np->dev->addr_len;
- memcpy(arp_ptr, &tip, 4);
- arp_ptr += 4;
- memcpy(arp_ptr, np->remote_mac, np->dev->addr_len);
- arp_ptr += np->dev->addr_len;
- memcpy(arp_ptr, &sip, 4);
+static void refill_skbs_work_handler(struct work_struct *work)
+{
+ struct netpoll *np =
+ container_of(work, struct netpoll, refill_wq);
- netpoll_send_skb(np, send_skb);
+ refill_skbs(np);
}
-int __netpoll_rx(struct sk_buff *skb)
+int __netpoll_setup(struct netpoll *np, struct net_device *ndev)
{
- int proto, len, ulen;
- struct iphdr *iph;
- struct udphdr *uh;
- struct netpoll_info *npi = skb->dev->npinfo;
- struct netpoll *np = npi->rx_np;
+ struct netpoll_info *npinfo;
+ const struct net_device_ops *ops;
+ int err;
- if (!np)
- goto out;
- if (skb->dev->type != ARPHRD_ETHER)
- goto out;
+ skb_queue_head_init(&np->skb_pool);
+ INIT_WORK(&np->refill_wq, refill_skbs_work_handler);
- /* check if netpoll clients need ARP */
- if (skb->protocol == __constant_htons(ETH_P_ARP) &&
- atomic_read(&trapped)) {
- skb_queue_tail(&npi->arp_tx, skb);
- return 1;
+ if (ndev->priv_flags & IFF_DISABLE_NETPOLL) {
+ np_err(np, "%s doesn't support polling, aborting\n",
+ ndev->name);
+ err = -ENOTSUPP;
+ goto out;
}
- proto = ntohs(eth_hdr(skb)->h_proto);
- if (proto != ETH_P_IP)
- goto out;
- if (skb->pkt_type == PACKET_OTHERHOST)
- goto out;
- if (skb_shared(skb))
- goto out;
+ npinfo = rtnl_dereference(ndev->npinfo);
+ if (!npinfo) {
+ npinfo = kmalloc(sizeof(*npinfo), GFP_KERNEL);
+ if (!npinfo) {
+ err = -ENOMEM;
+ goto out;
+ }
- iph = (struct iphdr *)skb->data;
- if (!pskb_may_pull(skb, sizeof(struct iphdr)))
- goto out;
- if (iph->ihl < 5 || iph->version != 4)
- goto out;
- if (!pskb_may_pull(skb, iph->ihl*4))
- goto out;
- if (ip_fast_csum((u8 *)iph, iph->ihl) != 0)
- goto out;
+ sema_init(&npinfo->dev_lock, 1);
+ skb_queue_head_init(&npinfo->txq);
+ INIT_DELAYED_WORK(&npinfo->tx_work, queue_process);
- len = ntohs(iph->tot_len);
- if (skb->len < len || len < iph->ihl*4)
- goto out;
+ refcount_set(&npinfo->refcnt, 1);
- if (iph->protocol != IPPROTO_UDP)
- goto out;
+ ops = ndev->netdev_ops;
+ if (ops->ndo_netpoll_setup) {
+ err = ops->ndo_netpoll_setup(ndev);
+ if (err)
+ goto free_npinfo;
+ }
+ } else {
+ refcount_inc(&npinfo->refcnt);
+ }
- len -= iph->ihl*4;
- uh = (struct udphdr *)(((char *)iph) + iph->ihl*4);
- ulen = ntohs(uh->len);
+ np->dev = ndev;
+ strscpy(np->dev_name, ndev->name, IFNAMSIZ);
- if (ulen != len)
- goto out;
- if (checksum_udp(skb, uh, ulen, iph->saddr, iph->daddr))
- goto out;
- if (np->local_ip && np->local_ip != ntohl(iph->daddr))
- goto out;
- if (np->remote_ip && np->remote_ip != ntohl(iph->saddr))
- goto out;
- if (np->local_port && np->local_port != ntohs(uh->dest))
- goto out;
+ /* fill up the skb queue */
+ refill_skbs(np);
- np->rx_hook(np, ntohs(uh->source),
- (char *)(uh+1),
- ulen - sizeof(struct udphdr));
+ /* last thing to do is link it to the net device structure */
+ rcu_assign_pointer(ndev->npinfo, npinfo);
- kfree_skb(skb);
- return 1;
+ return 0;
+free_npinfo:
+ kfree(npinfo);
out:
- if (atomic_read(&trapped)) {
- kfree_skb(skb);
- return 1;
- }
+ return err;
+}
+EXPORT_SYMBOL_GPL(__netpoll_setup);
- return 0;
+/*
+ * Returns a pointer to a string representation of the identifier used
+ * to select the egress interface for the given netpoll instance. buf
+ * must be a buffer of length at least MAC_ADDR_STR_LEN + 1.
+ */
+static char *egress_dev(struct netpoll *np, char *buf)
+{
+ if (np->dev_name[0])
+ return np->dev_name;
+
+ snprintf(buf, MAC_ADDR_STR_LEN, "%pM", np->dev_mac);
+ return buf;
}
-int netpoll_parse_options(struct netpoll *np, char *opt)
+static void netpoll_wait_carrier(struct netpoll *np, struct net_device *ndev,
+ unsigned int timeout)
{
- char *cur=opt, *delim;
+ unsigned long atmost;
+
+ atmost = jiffies + timeout * HZ;
+ while (!netif_carrier_ok(ndev)) {
+ if (time_after(jiffies, atmost)) {
+ np_notice(np, "timeout waiting for carrier\n");
+ break;
+ }
+ msleep(1);
+ }
+}
- if (*cur != '@') {
- if ((delim = strchr(cur, '@')) == NULL)
- goto parse_failed;
- *delim = 0;
- np->local_port = simple_strtol(cur, NULL, 10);
- cur = delim;
+/*
+ * Take the IPv6 from ndev and populate local_ip structure in netpoll
+ */
+static int netpoll_take_ipv6(struct netpoll *np, struct net_device *ndev)
+{
+ char buf[MAC_ADDR_STR_LEN + 1];
+ int err = -EDESTADDRREQ;
+ struct inet6_dev *idev;
+
+ if (!IS_ENABLED(CONFIG_IPV6)) {
+ np_err(np, "IPv6 is not supported %s, aborting\n",
+ egress_dev(np, buf));
+ return -EINVAL;
}
- cur++;
- printk(KERN_INFO "%s: local port %d\n", np->name, np->local_port);
-
- if (*cur != '/') {
- if ((delim = strchr(cur, '/')) == NULL)
- goto parse_failed;
- *delim = 0;
- np->local_ip = ntohl(in_aton(cur));
- cur = delim;
-
- printk(KERN_INFO "%s: local IP %d.%d.%d.%d\n",
- np->name, HIPQUAD(np->local_ip));
+
+ idev = __in6_dev_get(ndev);
+ if (idev) {
+ struct inet6_ifaddr *ifp;
+
+ read_lock_bh(&idev->lock);
+ list_for_each_entry(ifp, &idev->addr_list, if_list) {
+ if (!!(ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL) !=
+ !!(ipv6_addr_type(&np->remote_ip.in6) & IPV6_ADDR_LINKLOCAL))
+ continue;
+ /* Got the IP, let's return */
+ np->local_ip.in6 = ifp->addr;
+ err = 0;
+ break;
+ }
+ read_unlock_bh(&idev->lock);
}
- cur++;
-
- if (*cur != ',') {
- /* parse out dev name */
- if ((delim = strchr(cur, ',')) == NULL)
- goto parse_failed;
- *delim = 0;
- strlcpy(np->dev_name, cur, sizeof(np->dev_name));
- cur = delim;
+ if (err) {
+ np_err(np, "no IPv6 address for %s, aborting\n",
+ egress_dev(np, buf));
+ return err;
}
- cur++;
- printk(KERN_INFO "%s: interface %s\n", np->name, np->dev_name);
+ np_info(np, "local IPv6 %pI6c\n", &np->local_ip.in6);
+ return 0;
+}
- if (*cur != '@') {
- /* dst port */
- if ((delim = strchr(cur, '@')) == NULL)
- goto parse_failed;
- *delim = 0;
- np->remote_port = simple_strtol(cur, NULL, 10);
- cur = delim;
+/*
+ * Take the IPv4 from ndev and populate local_ip structure in netpoll
+ */
+static int netpoll_take_ipv4(struct netpoll *np, struct net_device *ndev)
+{
+ char buf[MAC_ADDR_STR_LEN + 1];
+ const struct in_ifaddr *ifa;
+ struct in_device *in_dev;
+
+ in_dev = __in_dev_get_rtnl(ndev);
+ if (!in_dev) {
+ np_err(np, "no IP address for %s, aborting\n",
+ egress_dev(np, buf));
+ return -EDESTADDRREQ;
}
- cur++;
- printk(KERN_INFO "%s: remote port %d\n", np->name, np->remote_port);
-
- /* dst ip */
- if ((delim = strchr(cur, '/')) == NULL)
- goto parse_failed;
- *delim = 0;
- np->remote_ip = ntohl(in_aton(cur));
- cur = delim + 1;
-
- printk(KERN_INFO "%s: remote IP %d.%d.%d.%d\n",
- np->name, HIPQUAD(np->remote_ip));
-
- if (*cur != 0) {
- /* MAC address */
- if ((delim = strchr(cur, ':')) == NULL)
- goto parse_failed;
- *delim = 0;
- np->remote_mac[0] = simple_strtol(cur, NULL, 16);
- cur = delim + 1;
- if ((delim = strchr(cur, ':')) == NULL)
- goto parse_failed;
- *delim = 0;
- np->remote_mac[1] = simple_strtol(cur, NULL, 16);
- cur = delim + 1;
- if ((delim = strchr(cur, ':')) == NULL)
- goto parse_failed;
- *delim = 0;
- np->remote_mac[2] = simple_strtol(cur, NULL, 16);
- cur = delim + 1;
- if ((delim = strchr(cur, ':')) == NULL)
- goto parse_failed;
- *delim = 0;
- np->remote_mac[3] = simple_strtol(cur, NULL, 16);
- cur = delim + 1;
- if ((delim = strchr(cur, ':')) == NULL)
- goto parse_failed;
- *delim = 0;
- np->remote_mac[4] = simple_strtol(cur, NULL, 16);
- cur = delim + 1;
- np->remote_mac[5] = simple_strtol(cur, NULL, 16);
+
+ ifa = rtnl_dereference(in_dev->ifa_list);
+ if (!ifa) {
+ np_err(np, "no IP address for %s, aborting\n",
+ egress_dev(np, buf));
+ return -EDESTADDRREQ;
}
- printk(KERN_INFO "%s: remote ethernet address "
- "%02x:%02x:%02x:%02x:%02x:%02x\n",
- np->name,
- np->remote_mac[0],
- np->remote_mac[1],
- np->remote_mac[2],
- np->remote_mac[3],
- np->remote_mac[4],
- np->remote_mac[5]);
+ np->local_ip.ip = ifa->ifa_local;
+ np_info(np, "local IP %pI4\n", &np->local_ip.ip);
return 0;
-
- parse_failed:
- printk(KERN_INFO "%s: couldn't parse config at %s!\n",
- np->name, cur);
- return -1;
}
int netpoll_setup(struct netpoll *np)
{
+ struct net *net = current->nsproxy->net_ns;
+ char buf[MAC_ADDR_STR_LEN + 1];
struct net_device *ndev = NULL;
- struct in_device *in_dev;
- struct netpoll_info *npinfo;
- unsigned long flags;
+ bool ip_overwritten = false;
int err;
- if (np->dev_name)
- ndev = dev_get_by_name(np->dev_name);
- if (!ndev) {
- printk(KERN_ERR "%s: %s doesn't exist, aborting.\n",
- np->name, np->dev_name);
- return -ENODEV;
- }
-
- np->dev = ndev;
- if (!ndev->npinfo) {
- npinfo = kmalloc(sizeof(*npinfo), GFP_KERNEL);
- if (!npinfo) {
- err = -ENOMEM;
- goto release;
- }
-
- npinfo->rx_flags = 0;
- npinfo->rx_np = NULL;
- spin_lock_init(&npinfo->poll_lock);
- npinfo->poll_owner = -1;
-
- spin_lock_init(&npinfo->rx_lock);
- skb_queue_head_init(&npinfo->arp_tx);
- skb_queue_head_init(&npinfo->txq);
- INIT_WORK(&npinfo->tx_work, queue_process, npinfo);
+ rtnl_lock();
+ if (np->dev_name[0])
+ ndev = __dev_get_by_name(net, np->dev_name);
+ else if (is_valid_ether_addr(np->dev_mac))
+ ndev = dev_getbyhwaddr(net, ARPHRD_ETHER, np->dev_mac);
- atomic_set(&npinfo->refcnt, 1);
- } else {
- npinfo = ndev->npinfo;
- atomic_inc(&npinfo->refcnt);
+ if (!ndev) {
+ np_err(np, "%s doesn't exist, aborting\n", egress_dev(np, buf));
+ err = -ENODEV;
+ goto unlock;
}
+ netdev_hold(ndev, &np->dev_tracker, GFP_KERNEL);
- if (!ndev->poll_controller) {
- printk(KERN_ERR "%s: %s doesn't support polling, aborting.\n",
- np->name, np->dev_name);
- err = -ENOTSUPP;
- goto release;
+ if (netdev_master_upper_dev_get(ndev)) {
+ np_err(np, "%s is a slave device, aborting\n",
+ egress_dev(np, buf));
+ err = -EBUSY;
+ goto put;
}
if (!netif_running(ndev)) {
- unsigned long atmost, atleast;
-
- printk(KERN_INFO "%s: device %s not up yet, forcing it\n",
- np->name, np->dev_name);
-
- rtnl_lock();
- err = dev_open(ndev);
- rtnl_unlock();
+ np_info(np, "device %s not up yet, forcing it\n",
+ egress_dev(np, buf));
+ err = dev_open(ndev, NULL);
if (err) {
- printk(KERN_ERR "%s: failed to open %s\n",
- np->name, ndev->name);
- goto release;
+ np_err(np, "failed to open %s\n", ndev->name);
+ goto put;
}
- atleast = jiffies + HZ/10;
- atmost = jiffies + 4*HZ;
- while (!netif_carrier_ok(ndev)) {
- if (time_after(jiffies, atmost)) {
- printk(KERN_NOTICE
- "%s: timeout waiting for carrier\n",
- np->name);
- break;
- }
- cond_resched();
- }
-
- /* If carrier appears to come up instantly, we don't
- * trust it and pause so that we don't pump all our
- * queued console messages into the bitbucket.
- */
-
- if (time_before(jiffies, atleast)) {
- printk(KERN_NOTICE "%s: carrier detect appears"
- " untrustworthy, waiting 4 seconds\n",
- np->name);
- msleep(4000);
- }
+ rtnl_unlock();
+ netpoll_wait_carrier(np, ndev, carrier_timeout);
+ rtnl_lock();
}
- if (is_zero_ether_addr(np->local_mac) && ndev->dev_addr)
- memcpy(np->local_mac, ndev->dev_addr, 6);
-
- if (!np->local_ip) {
- rcu_read_lock();
- in_dev = __in_dev_get_rcu(ndev);
-
- if (!in_dev || !in_dev->ifa_list) {
- rcu_read_unlock();
- printk(KERN_ERR "%s: no IP address for %s, aborting\n",
- np->name, np->dev_name);
- err = -EDESTADDRREQ;
- goto release;
+ if (!np->local_ip.ip) {
+ if (!np->ipv6) {
+ err = netpoll_take_ipv4(np, ndev);
+ if (err)
+ goto put;
+ } else {
+ err = netpoll_take_ipv6(np, ndev);
+ if (err)
+ goto put;
}
-
- np->local_ip = ntohl(in_dev->ifa_list->ifa_local);
- rcu_read_unlock();
- printk(KERN_INFO "%s: local IP %d.%d.%d.%d\n",
- np->name, HIPQUAD(np->local_ip));
- }
-
- if (np->rx_hook) {
- spin_lock_irqsave(&npinfo->rx_lock, flags);
- npinfo->rx_flags |= NETPOLL_RX_ENABLED;
- npinfo->rx_np = np;
- spin_unlock_irqrestore(&npinfo->rx_lock, flags);
+ ip_overwritten = true;
}
- /* fill up the skb queue */
- refill_skbs();
-
- /* last thing to do is link it to the net device structure */
- ndev->npinfo = npinfo;
+ err = __netpoll_setup(np, ndev);
+ if (err)
+ goto flush;
+ rtnl_unlock();
- /* avoid racing with NAPI reading npinfo */
+ /* Make sure all NAPI polls which started before dev->npinfo
+ * was visible have exited before we start calling NAPI poll.
+ * NAPI skips locking if dev->npinfo is NULL.
+ */
synchronize_rcu();
return 0;
- release:
- if (!ndev->npinfo)
- kfree(npinfo);
- np->dev = NULL;
- dev_put(ndev);
+flush:
+ skb_pool_flush(np);
+put:
+ DEBUG_NET_WARN_ON_ONCE(np->dev);
+ if (ip_overwritten)
+ memset(&np->local_ip, 0, sizeof(np->local_ip));
+ netdev_put(ndev, &np->dev_tracker);
+unlock:
+ rtnl_unlock();
return err;
}
+EXPORT_SYMBOL(netpoll_setup);
-static int __init netpoll_init(void)
+static void rcu_cleanup_netpoll_info(struct rcu_head *rcu_head)
{
- skb_queue_head_init(&skb_pool);
- return 0;
+ struct netpoll_info *npinfo =
+ container_of(rcu_head, struct netpoll_info, rcu);
+
+ skb_queue_purge(&npinfo->txq);
+
+ /* we can't call cancel_delayed_work_sync here, as we are in softirq */
+ cancel_delayed_work(&npinfo->tx_work);
+
+ /* clean after last, unfinished work */
+ __skb_queue_purge(&npinfo->txq);
+ /* now cancel it again */
+ cancel_delayed_work(&npinfo->tx_work);
+ kfree(npinfo);
}
-core_initcall(netpoll_init);
-void netpoll_cleanup(struct netpoll *np)
+static void __netpoll_cleanup(struct netpoll *np)
{
struct netpoll_info *npinfo;
- unsigned long flags;
- if (np->dev) {
- npinfo = np->dev->npinfo;
- if (npinfo) {
- if (npinfo->rx_np == np) {
- spin_lock_irqsave(&npinfo->rx_lock, flags);
- npinfo->rx_np = NULL;
- npinfo->rx_flags &= ~NETPOLL_RX_ENABLED;
- spin_unlock_irqrestore(&npinfo->rx_lock, flags);
- }
+ npinfo = rtnl_dereference(np->dev->npinfo);
+ if (!npinfo)
+ return;
- np->dev->npinfo = NULL;
- if (atomic_dec_and_test(&npinfo->refcnt)) {
- skb_queue_purge(&npinfo->arp_tx);
- skb_queue_purge(&npinfo->txq);
- cancel_rearming_delayed_work(&npinfo->tx_work);
- flush_scheduled_work();
+ /* At this point, there is a single npinfo instance per netdevice, and
+ * its refcnt tracks how many netpoll structures are linked to it. We
+ * only perform npinfo cleanup when the refcnt decrements to zero.
+ */
+ if (refcount_dec_and_test(&npinfo->refcnt)) {
+ const struct net_device_ops *ops;
- kfree(npinfo);
- }
- }
+ ops = np->dev->netdev_ops;
+ if (ops->ndo_netpoll_cleanup)
+ ops->ndo_netpoll_cleanup(np->dev);
- dev_put(np->dev);
+ RCU_INIT_POINTER(np->dev->npinfo, NULL);
+ call_rcu(&npinfo->rcu, rcu_cleanup_netpoll_info);
}
- np->dev = NULL;
+ skb_pool_flush(np);
}
-int netpoll_trap(void)
+void __netpoll_free(struct netpoll *np)
{
- return atomic_read(&trapped);
+ ASSERT_RTNL();
+
+ /* Wait for transmitting packets to finish before freeing. */
+ synchronize_net();
+ __netpoll_cleanup(np);
+ kfree(np);
}
+EXPORT_SYMBOL_GPL(__netpoll_free);
-void netpoll_set_trap(int trap)
+void do_netpoll_cleanup(struct netpoll *np)
{
- if (trap)
- atomic_inc(&trapped);
- else
- atomic_dec(&trapped);
+ __netpoll_cleanup(np);
+ netdev_put(np->dev, &np->dev_tracker);
+ np->dev = NULL;
}
+EXPORT_SYMBOL(do_netpoll_cleanup);
-EXPORT_SYMBOL(netpoll_set_trap);
-EXPORT_SYMBOL(netpoll_trap);
-EXPORT_SYMBOL(netpoll_parse_options);
-EXPORT_SYMBOL(netpoll_setup);
+void netpoll_cleanup(struct netpoll *np)
+{
+ rtnl_lock();
+ if (!np->dev)
+ goto out;
+ do_netpoll_cleanup(np);
+out:
+ rtnl_unlock();
+}
EXPORT_SYMBOL(netpoll_cleanup);
-EXPORT_SYMBOL(netpoll_send_udp);
-EXPORT_SYMBOL(netpoll_poll);
diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
new file mode 100644
index 000000000000..8456dfbe2eb4
--- /dev/null
+++ b/net/core/netprio_cgroup.c
@@ -0,0 +1,295 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * net/core/netprio_cgroup.c Priority Control Group
+ *
+ * Authors: Neil Horman <nhorman@tuxdriver.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/cgroup.h>
+#include <linux/rcupdate.h>
+#include <linux/atomic.h>
+#include <linux/sched/task.h>
+
+#include <net/rtnetlink.h>
+#include <net/pkt_cls.h>
+#include <net/sock.h>
+#include <net/netprio_cgroup.h>
+
+#include <linux/fdtable.h>
+
+/*
+ * netprio allocates per-net_device priomap array which is indexed by
+ * css->id. Limiting css ID to 16bits doesn't lose anything.
+ */
+#define NETPRIO_ID_MAX USHRT_MAX
+
+#define PRIOMAP_MIN_SZ 128
+
+/*
+ * Extend @dev->priomap so that it's large enough to accommodate
+ * @target_idx. @dev->priomap.priomap_len > @target_idx after successful
+ * return. Must be called under rtnl lock.
+ */
+static int extend_netdev_table(struct net_device *dev, u32 target_idx)
+{
+ struct netprio_map *old, *new;
+ size_t new_sz, new_len;
+
+ /* is the existing priomap large enough? */
+ old = rtnl_dereference(dev->priomap);
+ if (old && old->priomap_len > target_idx)
+ return 0;
+
+ /*
+ * Determine the new size. Let's keep it power-of-two. We start
+ * from PRIOMAP_MIN_SZ and double it until it's large enough to
+ * accommodate @target_idx.
+ */
+ new_sz = PRIOMAP_MIN_SZ;
+ while (true) {
+ new_len = (new_sz - offsetof(struct netprio_map, priomap)) /
+ sizeof(new->priomap[0]);
+ if (new_len > target_idx)
+ break;
+ new_sz *= 2;
+ /* overflowed? */
+ if (WARN_ON(new_sz < PRIOMAP_MIN_SZ))
+ return -ENOSPC;
+ }
+
+ /* allocate & copy */
+ new = kzalloc(new_sz, GFP_KERNEL);
+ if (!new)
+ return -ENOMEM;
+
+ if (old)
+ memcpy(new->priomap, old->priomap,
+ old->priomap_len * sizeof(old->priomap[0]));
+
+ new->priomap_len = new_len;
+
+ /* install the new priomap */
+ rcu_assign_pointer(dev->priomap, new);
+ if (old)
+ kfree_rcu(old, rcu);
+ return 0;
+}
+
+/**
+ * netprio_prio - return the effective netprio of a cgroup-net_device pair
+ * @css: css part of the target pair
+ * @dev: net_device part of the target pair
+ *
+ * Should be called under RCU read or rtnl lock.
+ */
+static u32 netprio_prio(struct cgroup_subsys_state *css, struct net_device *dev)
+{
+ struct netprio_map *map = rcu_dereference_rtnl(dev->priomap);
+ int id = css->id;
+
+ if (map && id < map->priomap_len)
+ return map->priomap[id];
+ return 0;
+}
+
+/**
+ * netprio_set_prio - set netprio on a cgroup-net_device pair
+ * @css: css part of the target pair
+ * @dev: net_device part of the target pair
+ * @prio: prio to set
+ *
+ * Set netprio to @prio on @css-@dev pair. Should be called under rtnl
+ * lock and may fail under memory pressure for non-zero @prio.
+ */
+static int netprio_set_prio(struct cgroup_subsys_state *css,
+ struct net_device *dev, u32 prio)
+{
+ struct netprio_map *map;
+ int id = css->id;
+ int ret;
+
+ /* avoid extending priomap for zero writes */
+ map = rtnl_dereference(dev->priomap);
+ if (!prio && (!map || map->priomap_len <= id))
+ return 0;
+
+ ret = extend_netdev_table(dev, id);
+ if (ret)
+ return ret;
+
+ map = rtnl_dereference(dev->priomap);
+ map->priomap[id] = prio;
+ return 0;
+}
+
+static struct cgroup_subsys_state *
+cgrp_css_alloc(struct cgroup_subsys_state *parent_css)
+{
+ struct cgroup_subsys_state *css;
+
+ css = kzalloc(sizeof(*css), GFP_KERNEL);
+ if (!css)
+ return ERR_PTR(-ENOMEM);
+
+ return css;
+}
+
+static int cgrp_css_online(struct cgroup_subsys_state *css)
+{
+ struct cgroup_subsys_state *parent_css = css->parent;
+ struct net_device *dev;
+ int ret = 0;
+
+ if (css->id > NETPRIO_ID_MAX)
+ return -ENOSPC;
+
+ if (!parent_css)
+ return 0;
+
+ rtnl_lock();
+ /*
+ * Inherit prios from the parent. As all prios are set during
+ * onlining, there is no need to clear them on offline.
+ */
+ for_each_netdev(&init_net, dev) {
+ u32 prio = netprio_prio(parent_css, dev);
+
+ ret = netprio_set_prio(css, dev, prio);
+ if (ret)
+ break;
+ }
+ rtnl_unlock();
+ return ret;
+}
+
+static void cgrp_css_free(struct cgroup_subsys_state *css)
+{
+ kfree(css);
+}
+
+static u64 read_prioidx(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+ return css->id;
+}
+
+static int read_priomap(struct seq_file *sf, void *v)
+{
+ struct net_device *dev;
+
+ rcu_read_lock();
+ for_each_netdev_rcu(&init_net, dev)
+ seq_printf(sf, "%s %u\n", dev->name,
+ netprio_prio(seq_css(sf), dev));
+ rcu_read_unlock();
+ return 0;
+}
+
+static ssize_t write_priomap(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ char devname[IFNAMSIZ + 1];
+ struct net_device *dev;
+ u32 prio;
+ int ret;
+
+ if (sscanf(buf, "%"__stringify(IFNAMSIZ)"s %u", devname, &prio) != 2)
+ return -EINVAL;
+
+ dev = dev_get_by_name(&init_net, devname);
+ if (!dev)
+ return -ENODEV;
+
+ rtnl_lock();
+
+ ret = netprio_set_prio(of_css(of), dev, prio);
+
+ rtnl_unlock();
+ dev_put(dev);
+ return ret ?: nbytes;
+}
+
+static int update_netprio(const void *v, struct file *file, unsigned n)
+{
+ struct socket *sock = sock_from_file(file);
+
+ if (sock)
+ sock_cgroup_set_prioidx(&sock->sk->sk_cgrp_data,
+ (unsigned long)v);
+ return 0;
+}
+
+static void net_prio_attach(struct cgroup_taskset *tset)
+{
+ struct task_struct *p;
+ struct cgroup_subsys_state *css;
+
+ cgroup_taskset_for_each(p, css, tset) {
+ void *v = (void *)(unsigned long)css->id;
+
+ task_lock(p);
+ iterate_fd(p->files, 0, update_netprio, v);
+ task_unlock(p);
+ }
+}
+
+static struct cftype ss_files[] = {
+ {
+ .name = "prioidx",
+ .read_u64 = read_prioidx,
+ },
+ {
+ .name = "ifpriomap",
+ .seq_show = read_priomap,
+ .write = write_priomap,
+ },
+ { } /* terminate */
+};
+
+struct cgroup_subsys net_prio_cgrp_subsys = {
+ .css_alloc = cgrp_css_alloc,
+ .css_online = cgrp_css_online,
+ .css_free = cgrp_css_free,
+ .attach = net_prio_attach,
+ .legacy_cftypes = ss_files,
+};
+
+static int netprio_device_event(struct notifier_block *unused,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct netprio_map *old;
+
+ /*
+ * Note this is called with rtnl_lock held so we have update side
+ * protection on our rcu assignments
+ */
+
+ switch (event) {
+ case NETDEV_UNREGISTER:
+ old = rtnl_dereference(dev->priomap);
+ RCU_INIT_POINTER(dev->priomap, NULL);
+ if (old)
+ kfree_rcu(old, rcu);
+ break;
+ }
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block netprio_device_notifier = {
+ .notifier_call = netprio_device_event
+};
+
+static int __init init_cgroup_netprio(void)
+{
+ register_netdevice_notifier(&netprio_device_notifier);
+ return 0;
+}
+subsys_initcall(init_cgroup_netprio);
diff --git a/net/core/of_net.c b/net/core/of_net.c
new file mode 100644
index 000000000000..93ea425b9248
--- /dev/null
+++ b/net/core/of_net.c
@@ -0,0 +1,172 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * OF helpers for network devices.
+ *
+ * Initially copied out of arch/powerpc/kernel/prom_parse.c
+ */
+#include <linux/etherdevice.h>
+#include <linux/kernel.h>
+#include <linux/of_net.h>
+#include <linux/of_platform.h>
+#include <linux/platform_device.h>
+#include <linux/phy.h>
+#include <linux/export.h>
+#include <linux/device.h>
+#include <linux/nvmem-consumer.h>
+
+/**
+ * of_get_phy_mode - Get phy mode for given device_node
+ * @np: Pointer to the given device_node
+ * @interface: Pointer to the result
+ *
+ * The function gets phy interface string from property 'phy-mode' or
+ * 'phy-connection-type'. The index in phy_modes table is set in
+ * interface and 0 returned. In case of error interface is set to
+ * PHY_INTERFACE_MODE_NA and an errno is returned, e.g. -ENODEV.
+ */
+int of_get_phy_mode(struct device_node *np, phy_interface_t *interface)
+{
+ const char *pm;
+ int err, i;
+
+ *interface = PHY_INTERFACE_MODE_NA;
+
+ err = of_property_read_string(np, "phy-mode", &pm);
+ if (err < 0)
+ err = of_property_read_string(np, "phy-connection-type", &pm);
+ if (err < 0)
+ return err;
+
+ for (i = 0; i < PHY_INTERFACE_MODE_MAX; i++)
+ if (!strcasecmp(pm, phy_modes(i))) {
+ *interface = i;
+ return 0;
+ }
+
+ return -ENODEV;
+}
+EXPORT_SYMBOL_GPL(of_get_phy_mode);
+
+static int of_get_mac_addr(struct device_node *np, const char *name, u8 *addr)
+{
+ struct property *pp = of_find_property(np, name, NULL);
+
+ if (pp && pp->length == ETH_ALEN && is_valid_ether_addr(pp->value)) {
+ memcpy(addr, pp->value, ETH_ALEN);
+ return 0;
+ }
+ return -ENODEV;
+}
+
+int of_get_mac_address_nvmem(struct device_node *np, u8 *addr)
+{
+ struct platform_device *pdev = of_find_device_by_node(np);
+ struct nvmem_cell *cell;
+ const void *mac;
+ size_t len;
+ int ret;
+
+ /* Try lookup by device first, there might be a nvmem_cell_lookup
+ * associated with a given device.
+ */
+ if (pdev) {
+ ret = nvmem_get_mac_address(&pdev->dev, addr);
+ put_device(&pdev->dev);
+ return ret;
+ }
+
+ cell = of_nvmem_cell_get(np, "mac-address");
+ if (IS_ERR(cell))
+ return PTR_ERR(cell);
+
+ mac = nvmem_cell_read(cell, &len);
+ nvmem_cell_put(cell);
+
+ if (IS_ERR(mac))
+ return PTR_ERR(mac);
+
+ if (len != ETH_ALEN || !is_valid_ether_addr(mac)) {
+ kfree(mac);
+ return -EINVAL;
+ }
+
+ memcpy(addr, mac, ETH_ALEN);
+ kfree(mac);
+
+ return 0;
+}
+EXPORT_SYMBOL(of_get_mac_address_nvmem);
+
+/**
+ * of_get_mac_address()
+ * @np: Caller's Device Node
+ * @addr: Pointer to a six-byte array for the result
+ *
+ * Search the device tree for the best MAC address to use. 'mac-address' is
+ * checked first, because that is supposed to contain to "most recent" MAC
+ * address. If that isn't set, then 'local-mac-address' is checked next,
+ * because that is the default address. If that isn't set, then the obsolete
+ * 'address' is checked, just in case we're using an old device tree. If any
+ * of the above isn't set, then try to get MAC address from nvmem cell named
+ * 'mac-address'.
+ *
+ * Note that the 'address' property is supposed to contain a virtual address of
+ * the register set, but some DTS files have redefined that property to be the
+ * MAC address.
+ *
+ * All-zero MAC addresses are rejected, because those could be properties that
+ * exist in the device tree, but were not set by U-Boot. For example, the
+ * DTS could define 'mac-address' and 'local-mac-address', with zero MAC
+ * addresses. Some older U-Boots only initialized 'local-mac-address'. In
+ * this case, the real MAC is in 'local-mac-address', and 'mac-address' exists
+ * but is all zeros.
+ *
+ * Return: 0 on success and errno in case of error.
+*/
+int of_get_mac_address(struct device_node *np, u8 *addr)
+{
+ int ret;
+
+ if (!np)
+ return -ENODEV;
+
+ ret = of_get_mac_addr(np, "mac-address", addr);
+ if (!ret)
+ return 0;
+
+ ret = of_get_mac_addr(np, "local-mac-address", addr);
+ if (!ret)
+ return 0;
+
+ ret = of_get_mac_addr(np, "address", addr);
+ if (!ret)
+ return 0;
+
+ return of_get_mac_address_nvmem(np, addr);
+}
+EXPORT_SYMBOL(of_get_mac_address);
+
+/**
+ * of_get_ethdev_address()
+ * @np: Caller's Device Node
+ * @dev: Pointer to netdevice which address will be updated
+ *
+ * Search the device tree for the best MAC address to use.
+ * If found set @dev->dev_addr to that address.
+ *
+ * See documentation of of_get_mac_address() for more information on how
+ * the best address is determined.
+ *
+ * Return: 0 on success and errno in case of error.
+ */
+int of_get_ethdev_address(struct device_node *np, struct net_device *dev)
+{
+ u8 addr[ETH_ALEN];
+ int ret;
+
+ ret = of_get_mac_address(np, addr);
+ if (!ret)
+ eth_hw_addr_set(dev, addr);
+ return ret;
+}
+EXPORT_SYMBOL(of_get_ethdev_address);
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
new file mode 100644
index 000000000000..265a729431bb
--- /dev/null
+++ b/net/core/page_pool.c
@@ -0,0 +1,1354 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * page_pool.c
+ * Author: Jesper Dangaard Brouer <netoptimizer@brouer.com>
+ * Copyright (C) 2016 Red Hat, Inc.
+ */
+
+#include <linux/error-injection.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/device.h>
+
+#include <net/netdev_lock.h>
+#include <net/netdev_rx_queue.h>
+#include <net/page_pool/helpers.h>
+#include <net/page_pool/memory_provider.h>
+#include <net/xdp.h>
+
+#include <linux/dma-direction.h>
+#include <linux/dma-mapping.h>
+#include <linux/page-flags.h>
+#include <linux/mm.h> /* for put_page() */
+#include <linux/poison.h>
+#include <linux/ethtool.h>
+#include <linux/netdevice.h>
+
+#include <trace/events/page_pool.h>
+
+#include "dev.h"
+#include "mp_dmabuf_devmem.h"
+#include "netmem_priv.h"
+#include "page_pool_priv.h"
+
+DEFINE_STATIC_KEY_FALSE(page_pool_mem_providers);
+
+#define DEFER_TIME (msecs_to_jiffies(1000))
+#define DEFER_WARN_INTERVAL (60 * HZ)
+
+#define BIAS_MAX (LONG_MAX >> 1)
+
+#ifdef CONFIG_PAGE_POOL_STATS
+static DEFINE_PER_CPU(struct page_pool_recycle_stats, pp_system_recycle_stats);
+
+/* alloc_stat_inc is intended to be used in softirq context */
+#define alloc_stat_inc(pool, __stat) (pool->alloc_stats.__stat++)
+/* recycle_stat_inc is safe to use when preemption is possible. */
+#define recycle_stat_inc(pool, __stat) \
+ do { \
+ struct page_pool_recycle_stats __percpu *s = pool->recycle_stats; \
+ this_cpu_inc(s->__stat); \
+ } while (0)
+
+#define recycle_stat_add(pool, __stat, val) \
+ do { \
+ struct page_pool_recycle_stats __percpu *s = pool->recycle_stats; \
+ this_cpu_add(s->__stat, val); \
+ } while (0)
+
+static const char pp_stats[][ETH_GSTRING_LEN] = {
+ "rx_pp_alloc_fast",
+ "rx_pp_alloc_slow",
+ "rx_pp_alloc_slow_ho",
+ "rx_pp_alloc_empty",
+ "rx_pp_alloc_refill",
+ "rx_pp_alloc_waive",
+ "rx_pp_recycle_cached",
+ "rx_pp_recycle_cache_full",
+ "rx_pp_recycle_ring",
+ "rx_pp_recycle_ring_full",
+ "rx_pp_recycle_released_ref",
+};
+
+/**
+ * page_pool_get_stats() - fetch page pool stats
+ * @pool: pool from which page was allocated
+ * @stats: struct page_pool_stats to fill in
+ *
+ * Retrieve statistics about the page_pool. This API is only available
+ * if the kernel has been configured with ``CONFIG_PAGE_POOL_STATS=y``.
+ * A pointer to a caller allocated struct page_pool_stats structure
+ * is passed to this API which is filled in. The caller can then report
+ * those stats to the user (perhaps via ethtool, debugfs, etc.).
+ */
+bool page_pool_get_stats(const struct page_pool *pool,
+ struct page_pool_stats *stats)
+{
+ int cpu = 0;
+
+ if (!stats)
+ return false;
+
+ /* The caller is responsible to initialize stats. */
+ stats->alloc_stats.fast += pool->alloc_stats.fast;
+ stats->alloc_stats.slow += pool->alloc_stats.slow;
+ stats->alloc_stats.slow_high_order += pool->alloc_stats.slow_high_order;
+ stats->alloc_stats.empty += pool->alloc_stats.empty;
+ stats->alloc_stats.refill += pool->alloc_stats.refill;
+ stats->alloc_stats.waive += pool->alloc_stats.waive;
+
+ for_each_possible_cpu(cpu) {
+ const struct page_pool_recycle_stats *pcpu =
+ per_cpu_ptr(pool->recycle_stats, cpu);
+
+ stats->recycle_stats.cached += pcpu->cached;
+ stats->recycle_stats.cache_full += pcpu->cache_full;
+ stats->recycle_stats.ring += pcpu->ring;
+ stats->recycle_stats.ring_full += pcpu->ring_full;
+ stats->recycle_stats.released_refcnt += pcpu->released_refcnt;
+ }
+
+ return true;
+}
+EXPORT_SYMBOL(page_pool_get_stats);
+
+u8 *page_pool_ethtool_stats_get_strings(u8 *data)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(pp_stats); i++) {
+ memcpy(data, pp_stats[i], ETH_GSTRING_LEN);
+ data += ETH_GSTRING_LEN;
+ }
+
+ return data;
+}
+EXPORT_SYMBOL(page_pool_ethtool_stats_get_strings);
+
+int page_pool_ethtool_stats_get_count(void)
+{
+ return ARRAY_SIZE(pp_stats);
+}
+EXPORT_SYMBOL(page_pool_ethtool_stats_get_count);
+
+u64 *page_pool_ethtool_stats_get(u64 *data, const void *stats)
+{
+ const struct page_pool_stats *pool_stats = stats;
+
+ *data++ = pool_stats->alloc_stats.fast;
+ *data++ = pool_stats->alloc_stats.slow;
+ *data++ = pool_stats->alloc_stats.slow_high_order;
+ *data++ = pool_stats->alloc_stats.empty;
+ *data++ = pool_stats->alloc_stats.refill;
+ *data++ = pool_stats->alloc_stats.waive;
+ *data++ = pool_stats->recycle_stats.cached;
+ *data++ = pool_stats->recycle_stats.cache_full;
+ *data++ = pool_stats->recycle_stats.ring;
+ *data++ = pool_stats->recycle_stats.ring_full;
+ *data++ = pool_stats->recycle_stats.released_refcnt;
+
+ return data;
+}
+EXPORT_SYMBOL(page_pool_ethtool_stats_get);
+
+#else
+#define alloc_stat_inc(...) do { } while (0)
+#define recycle_stat_inc(...) do { } while (0)
+#define recycle_stat_add(...) do { } while (0)
+#endif
+
+static bool page_pool_producer_lock(struct page_pool *pool)
+ __acquires(&pool->ring.producer_lock)
+{
+ bool in_softirq = in_softirq();
+
+ if (in_softirq)
+ spin_lock(&pool->ring.producer_lock);
+ else
+ spin_lock_bh(&pool->ring.producer_lock);
+
+ return in_softirq;
+}
+
+static void page_pool_producer_unlock(struct page_pool *pool,
+ bool in_softirq)
+ __releases(&pool->ring.producer_lock)
+{
+ if (in_softirq)
+ spin_unlock(&pool->ring.producer_lock);
+ else
+ spin_unlock_bh(&pool->ring.producer_lock);
+}
+
+static void page_pool_struct_check(void)
+{
+ CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_users);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_page);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_offset);
+ CACHELINE_ASSERT_GROUP_SIZE(struct page_pool, frag,
+ PAGE_POOL_FRAG_GROUP_ALIGN);
+}
+
+static int page_pool_init(struct page_pool *pool,
+ const struct page_pool_params *params,
+ int cpuid)
+{
+ unsigned int ring_qsize = 1024; /* Default */
+ struct netdev_rx_queue *rxq;
+ int err;
+
+ page_pool_struct_check();
+
+ memcpy(&pool->p, &params->fast, sizeof(pool->p));
+ memcpy(&pool->slow, &params->slow, sizeof(pool->slow));
+
+ pool->cpuid = cpuid;
+ pool->dma_sync_for_cpu = true;
+
+ /* Validate only known flags were used */
+ if (pool->slow.flags & ~PP_FLAG_ALL)
+ return -EINVAL;
+
+ if (pool->p.pool_size)
+ ring_qsize = min(pool->p.pool_size, 16384);
+
+ /* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL.
+ * DMA_BIDIRECTIONAL is for allowing page used for DMA sending,
+ * which is the XDP_TX use-case.
+ */
+ if (pool->slow.flags & PP_FLAG_DMA_MAP) {
+ if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
+ (pool->p.dma_dir != DMA_BIDIRECTIONAL))
+ return -EINVAL;
+
+ pool->dma_map = true;
+ }
+
+ if (pool->slow.flags & PP_FLAG_DMA_SYNC_DEV) {
+ /* In order to request DMA-sync-for-device the page
+ * needs to be mapped
+ */
+ if (!(pool->slow.flags & PP_FLAG_DMA_MAP))
+ return -EINVAL;
+
+ if (!pool->p.max_len)
+ return -EINVAL;
+
+ pool->dma_sync = true;
+
+ /* pool->p.offset has to be set according to the address
+ * offset used by the DMA engine to start copying rx data
+ */
+ }
+
+ pool->has_init_callback = !!pool->slow.init_callback;
+
+#ifdef CONFIG_PAGE_POOL_STATS
+ if (!(pool->slow.flags & PP_FLAG_SYSTEM_POOL)) {
+ pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats);
+ if (!pool->recycle_stats)
+ return -ENOMEM;
+ } else {
+ /* For system page pool instance we use a singular stats object
+ * instead of allocating a separate percpu variable for each
+ * (also percpu) page pool instance.
+ */
+ pool->recycle_stats = &pp_system_recycle_stats;
+ pool->system = true;
+ }
+#endif
+
+ if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) {
+#ifdef CONFIG_PAGE_POOL_STATS
+ if (!pool->system)
+ free_percpu(pool->recycle_stats);
+#endif
+ return -ENOMEM;
+ }
+
+ atomic_set(&pool->pages_state_release_cnt, 0);
+
+ /* Driver calling page_pool_create() also call page_pool_destroy() */
+ refcount_set(&pool->user_cnt, 1);
+
+ xa_init_flags(&pool->dma_mapped, XA_FLAGS_ALLOC1);
+
+ if (pool->slow.flags & PP_FLAG_ALLOW_UNREADABLE_NETMEM) {
+ netdev_assert_locked(pool->slow.netdev);
+ rxq = __netif_get_rx_queue(pool->slow.netdev,
+ pool->slow.queue_idx);
+ pool->mp_priv = rxq->mp_params.mp_priv;
+ pool->mp_ops = rxq->mp_params.mp_ops;
+ }
+
+ if (pool->mp_ops) {
+ if (!pool->dma_map || !pool->dma_sync) {
+ err = -EOPNOTSUPP;
+ goto free_ptr_ring;
+ }
+
+ if (WARN_ON(!is_kernel_rodata((unsigned long)pool->mp_ops))) {
+ err = -EFAULT;
+ goto free_ptr_ring;
+ }
+
+ err = pool->mp_ops->init(pool);
+ if (err) {
+ pr_warn("%s() mem-provider init failed %d\n", __func__,
+ err);
+ goto free_ptr_ring;
+ }
+
+ static_branch_inc(&page_pool_mem_providers);
+ } else if (pool->p.order > MAX_PAGE_ORDER) {
+ err = -EINVAL;
+ goto free_ptr_ring;
+ }
+
+ return 0;
+
+free_ptr_ring:
+ ptr_ring_cleanup(&pool->ring, NULL);
+ xa_destroy(&pool->dma_mapped);
+#ifdef CONFIG_PAGE_POOL_STATS
+ if (!pool->system)
+ free_percpu(pool->recycle_stats);
+#endif
+ return err;
+}
+
+static void page_pool_uninit(struct page_pool *pool)
+{
+ ptr_ring_cleanup(&pool->ring, NULL);
+ xa_destroy(&pool->dma_mapped);
+
+#ifdef CONFIG_PAGE_POOL_STATS
+ if (!pool->system)
+ free_percpu(pool->recycle_stats);
+#endif
+}
+
+/**
+ * page_pool_create_percpu() - create a page pool for a given cpu.
+ * @params: parameters, see struct page_pool_params
+ * @cpuid: cpu identifier
+ */
+struct page_pool *
+page_pool_create_percpu(const struct page_pool_params *params, int cpuid)
+{
+ struct page_pool *pool;
+ int err;
+
+ pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid);
+ if (!pool)
+ return ERR_PTR(-ENOMEM);
+
+ err = page_pool_init(pool, params, cpuid);
+ if (err < 0)
+ goto err_free;
+
+ err = page_pool_list(pool);
+ if (err)
+ goto err_uninit;
+
+ return pool;
+
+err_uninit:
+ page_pool_uninit(pool);
+err_free:
+ pr_warn("%s() gave up with errno %d\n", __func__, err);
+ kfree(pool);
+ return ERR_PTR(err);
+}
+EXPORT_SYMBOL(page_pool_create_percpu);
+
+/**
+ * page_pool_create() - create a page pool
+ * @params: parameters, see struct page_pool_params
+ */
+struct page_pool *page_pool_create(const struct page_pool_params *params)
+{
+ return page_pool_create_percpu(params, -1);
+}
+EXPORT_SYMBOL(page_pool_create);
+
+static void page_pool_return_netmem(struct page_pool *pool, netmem_ref netmem);
+
+static noinline netmem_ref page_pool_refill_alloc_cache(struct page_pool *pool)
+{
+ struct ptr_ring *r = &pool->ring;
+ netmem_ref netmem;
+ int pref_nid; /* preferred NUMA node */
+
+ /* Quicker fallback, avoid locks when ring is empty */
+ if (__ptr_ring_empty(r)) {
+ alloc_stat_inc(pool, empty);
+ return 0;
+ }
+
+ /* Softirq guarantee CPU and thus NUMA node is stable. This,
+ * assumes CPU refilling driver RX-ring will also run RX-NAPI.
+ */
+#ifdef CONFIG_NUMA
+ pref_nid = (pool->p.nid == NUMA_NO_NODE) ? numa_mem_id() : pool->p.nid;
+#else
+ /* Ignore pool->p.nid setting if !CONFIG_NUMA, helps compiler */
+ pref_nid = numa_mem_id(); /* will be zero like page_to_nid() */
+#endif
+
+ /* Refill alloc array, but only if NUMA match */
+ do {
+ netmem = (__force netmem_ref)__ptr_ring_consume(r);
+ if (unlikely(!netmem))
+ break;
+
+ if (likely(netmem_is_pref_nid(netmem, pref_nid))) {
+ pool->alloc.cache[pool->alloc.count++] = netmem;
+ } else {
+ /* NUMA mismatch;
+ * (1) release 1 page to page-allocator and
+ * (2) break out to fallthrough to alloc_pages_node.
+ * This limit stress on page buddy alloactor.
+ */
+ page_pool_return_netmem(pool, netmem);
+ alloc_stat_inc(pool, waive);
+ netmem = 0;
+ break;
+ }
+ } while (pool->alloc.count < PP_ALLOC_CACHE_REFILL);
+
+ /* Return last page */
+ if (likely(pool->alloc.count > 0)) {
+ netmem = pool->alloc.cache[--pool->alloc.count];
+ alloc_stat_inc(pool, refill);
+ }
+
+ return netmem;
+}
+
+/* fast path */
+static netmem_ref __page_pool_get_cached(struct page_pool *pool)
+{
+ netmem_ref netmem;
+
+ /* Caller MUST guarantee safe non-concurrent access, e.g. softirq */
+ if (likely(pool->alloc.count)) {
+ /* Fast-path */
+ netmem = pool->alloc.cache[--pool->alloc.count];
+ alloc_stat_inc(pool, fast);
+ } else {
+ netmem = page_pool_refill_alloc_cache(pool);
+ }
+
+ return netmem;
+}
+
+static void __page_pool_dma_sync_for_device(const struct page_pool *pool,
+ netmem_ref netmem,
+ u32 dma_sync_size)
+{
+#if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC)
+ dma_addr_t dma_addr = page_pool_get_dma_addr_netmem(netmem);
+
+ dma_sync_size = min(dma_sync_size, pool->p.max_len);
+ __dma_sync_single_for_device(pool->p.dev, dma_addr + pool->p.offset,
+ dma_sync_size, pool->p.dma_dir);
+#endif
+}
+
+static __always_inline void
+page_pool_dma_sync_for_device(const struct page_pool *pool,
+ netmem_ref netmem,
+ u32 dma_sync_size)
+{
+ if (pool->dma_sync && dma_dev_need_sync(pool->p.dev)) {
+ rcu_read_lock();
+ /* re-check under rcu_read_lock() to sync with page_pool_scrub() */
+ if (pool->dma_sync)
+ __page_pool_dma_sync_for_device(pool, netmem,
+ dma_sync_size);
+ rcu_read_unlock();
+ }
+}
+
+static int page_pool_register_dma_index(struct page_pool *pool,
+ netmem_ref netmem, gfp_t gfp)
+{
+ int err = 0;
+ u32 id;
+
+ if (unlikely(!PP_DMA_INDEX_BITS))
+ goto out;
+
+ if (in_softirq())
+ err = xa_alloc(&pool->dma_mapped, &id, netmem_to_page(netmem),
+ PP_DMA_INDEX_LIMIT, gfp);
+ else
+ err = xa_alloc_bh(&pool->dma_mapped, &id, netmem_to_page(netmem),
+ PP_DMA_INDEX_LIMIT, gfp);
+ if (err) {
+ WARN_ONCE(err != -ENOMEM, "couldn't track DMA mapping, please report to netdev@");
+ goto out;
+ }
+
+ netmem_set_dma_index(netmem, id);
+out:
+ return err;
+}
+
+static int page_pool_release_dma_index(struct page_pool *pool,
+ netmem_ref netmem)
+{
+ struct page *old, *page = netmem_to_page(netmem);
+ unsigned long id;
+
+ if (unlikely(!PP_DMA_INDEX_BITS))
+ return 0;
+
+ id = netmem_get_dma_index(netmem);
+ if (!id)
+ return -1;
+
+ if (in_softirq())
+ old = xa_cmpxchg(&pool->dma_mapped, id, page, NULL, 0);
+ else
+ old = xa_cmpxchg_bh(&pool->dma_mapped, id, page, NULL, 0);
+ if (old != page)
+ return -1;
+
+ netmem_set_dma_index(netmem, 0);
+
+ return 0;
+}
+
+static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem, gfp_t gfp)
+{
+ dma_addr_t dma;
+ int err;
+
+ /* Setup DMA mapping: use 'struct page' area for storing DMA-addr
+ * since dma_addr_t can be either 32 or 64 bits and does not always fit
+ * into page private data (i.e 32bit cpu with 64bit DMA caps)
+ * This mapping is kept for lifetime of page, until leaving pool.
+ */
+ dma = dma_map_page_attrs(pool->p.dev, netmem_to_page(netmem), 0,
+ (PAGE_SIZE << pool->p.order), pool->p.dma_dir,
+ DMA_ATTR_SKIP_CPU_SYNC |
+ DMA_ATTR_WEAK_ORDERING);
+ if (dma_mapping_error(pool->p.dev, dma))
+ return false;
+
+ if (page_pool_set_dma_addr_netmem(netmem, dma)) {
+ WARN_ONCE(1, "unexpected DMA address, please report to netdev@");
+ goto unmap_failed;
+ }
+
+ err = page_pool_register_dma_index(pool, netmem, gfp);
+ if (err)
+ goto unset_failed;
+
+ page_pool_dma_sync_for_device(pool, netmem, pool->p.max_len);
+
+ return true;
+
+unset_failed:
+ page_pool_set_dma_addr_netmem(netmem, 0);
+unmap_failed:
+ dma_unmap_page_attrs(pool->p.dev, dma,
+ PAGE_SIZE << pool->p.order, pool->p.dma_dir,
+ DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
+ return false;
+}
+
+static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
+ gfp_t gfp)
+{
+ struct page *page;
+
+ gfp |= __GFP_COMP;
+ page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
+ if (unlikely(!page))
+ return NULL;
+
+ if (pool->dma_map && unlikely(!page_pool_dma_map(pool, page_to_netmem(page), gfp))) {
+ put_page(page);
+ return NULL;
+ }
+
+ alloc_stat_inc(pool, slow_high_order);
+ page_pool_set_pp_info(pool, page_to_netmem(page));
+
+ /* Track how many pages are held 'in-flight' */
+ pool->pages_state_hold_cnt++;
+ trace_page_pool_state_hold(pool, page_to_netmem(page),
+ pool->pages_state_hold_cnt);
+ return page;
+}
+
+/* slow path */
+static noinline netmem_ref __page_pool_alloc_netmems_slow(struct page_pool *pool,
+ gfp_t gfp)
+{
+ const int bulk = PP_ALLOC_CACHE_REFILL;
+ unsigned int pp_order = pool->p.order;
+ bool dma_map = pool->dma_map;
+ netmem_ref netmem;
+ int i, nr_pages;
+
+ /* Unconditionally set NOWARN if allocating from NAPI.
+ * Drivers forget to set it, and OOM reports on packet Rx are useless.
+ */
+ if ((gfp & GFP_ATOMIC) == GFP_ATOMIC)
+ gfp |= __GFP_NOWARN;
+
+ /* Don't support bulk alloc for high-order pages */
+ if (unlikely(pp_order))
+ return page_to_netmem(__page_pool_alloc_page_order(pool, gfp));
+
+ /* Unnecessary as alloc cache is empty, but guarantees zero count */
+ if (unlikely(pool->alloc.count > 0))
+ return pool->alloc.cache[--pool->alloc.count];
+
+ /* Mark empty alloc.cache slots "empty" for alloc_pages_bulk */
+ memset(&pool->alloc.cache, 0, sizeof(void *) * bulk);
+
+ nr_pages = alloc_pages_bulk_node(gfp, pool->p.nid, bulk,
+ (struct page **)pool->alloc.cache);
+ if (unlikely(!nr_pages))
+ return 0;
+
+ /* Pages have been filled into alloc.cache array, but count is zero and
+ * page element have not been (possibly) DMA mapped.
+ */
+ for (i = 0; i < nr_pages; i++) {
+ netmem = pool->alloc.cache[i];
+ if (dma_map && unlikely(!page_pool_dma_map(pool, netmem, gfp))) {
+ put_page(netmem_to_page(netmem));
+ continue;
+ }
+
+ page_pool_set_pp_info(pool, netmem);
+ pool->alloc.cache[pool->alloc.count++] = netmem;
+ /* Track how many pages are held 'in-flight' */
+ pool->pages_state_hold_cnt++;
+ trace_page_pool_state_hold(pool, netmem,
+ pool->pages_state_hold_cnt);
+ }
+
+ /* Return last page */
+ if (likely(pool->alloc.count > 0)) {
+ netmem = pool->alloc.cache[--pool->alloc.count];
+ alloc_stat_inc(pool, slow);
+ } else {
+ netmem = 0;
+ }
+
+ /* When page just alloc'ed is should/must have refcnt 1. */
+ return netmem;
+}
+
+/* For using page_pool replace: alloc_pages() API calls, but provide
+ * synchronization guarantee for allocation side.
+ */
+netmem_ref page_pool_alloc_netmems(struct page_pool *pool, gfp_t gfp)
+{
+ netmem_ref netmem;
+
+ /* Fast-path: Get a page from cache */
+ netmem = __page_pool_get_cached(pool);
+ if (netmem)
+ return netmem;
+
+ /* Slow-path: cache empty, do real allocation */
+ if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_ops)
+ netmem = pool->mp_ops->alloc_netmems(pool, gfp);
+ else
+ netmem = __page_pool_alloc_netmems_slow(pool, gfp);
+ return netmem;
+}
+EXPORT_SYMBOL(page_pool_alloc_netmems);
+ALLOW_ERROR_INJECTION(page_pool_alloc_netmems, NULL);
+
+struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
+{
+ return netmem_to_page(page_pool_alloc_netmems(pool, gfp));
+}
+EXPORT_SYMBOL(page_pool_alloc_pages);
+
+/* Calculate distance between two u32 values, valid if distance is below 2^(31)
+ * https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution
+ */
+#define _distance(a, b) (s32)((a) - (b))
+
+s32 page_pool_inflight(const struct page_pool *pool, bool strict)
+{
+ u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
+ u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
+ s32 inflight;
+
+ inflight = _distance(hold_cnt, release_cnt);
+
+ if (strict) {
+ trace_page_pool_release(pool, inflight, hold_cnt, release_cnt);
+ WARN(inflight < 0, "Negative(%d) inflight packet-pages",
+ inflight);
+ } else {
+ inflight = max(0, inflight);
+ }
+
+ return inflight;
+}
+
+void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem)
+{
+ netmem_set_pp(netmem, pool);
+ netmem_or_pp_magic(netmem, PP_SIGNATURE);
+
+ /* Ensuring all pages have been split into one fragment initially:
+ * page_pool_set_pp_info() is only called once for every page when it
+ * is allocated from the page allocator and page_pool_fragment_page()
+ * is dirtying the same cache line as the page->pp_magic above, so
+ * the overhead is negligible.
+ */
+ page_pool_fragment_netmem(netmem, 1);
+ if (pool->has_init_callback)
+ pool->slow.init_callback(netmem, pool->slow.init_arg);
+}
+
+void page_pool_clear_pp_info(netmem_ref netmem)
+{
+ netmem_clear_pp_magic(netmem);
+ netmem_set_pp(netmem, NULL);
+}
+
+static __always_inline void __page_pool_release_netmem_dma(struct page_pool *pool,
+ netmem_ref netmem)
+{
+ dma_addr_t dma;
+
+ if (!pool->dma_map)
+ /* Always account for inflight pages, even if we didn't
+ * map them
+ */
+ return;
+
+ if (page_pool_release_dma_index(pool, netmem))
+ return;
+
+ dma = page_pool_get_dma_addr_netmem(netmem);
+
+ /* When page is unmapped, it cannot be returned to our pool */
+ dma_unmap_page_attrs(pool->p.dev, dma,
+ PAGE_SIZE << pool->p.order, pool->p.dma_dir,
+ DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
+ page_pool_set_dma_addr_netmem(netmem, 0);
+}
+
+/* Disconnects a page (from a page_pool). API users can have a need
+ * to disconnect a page (from a page_pool), to allow it to be used as
+ * a regular page (that will eventually be returned to the normal
+ * page-allocator via put_page).
+ */
+static void page_pool_return_netmem(struct page_pool *pool, netmem_ref netmem)
+{
+ int count;
+ bool put;
+
+ put = true;
+ if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_ops)
+ put = pool->mp_ops->release_netmem(pool, netmem);
+ else
+ __page_pool_release_netmem_dma(pool, netmem);
+
+ /* This may be the last page returned, releasing the pool, so
+ * it is not safe to reference pool afterwards.
+ */
+ count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt);
+ trace_page_pool_state_release(pool, netmem, count);
+
+ if (put) {
+ page_pool_clear_pp_info(netmem);
+ put_page(netmem_to_page(netmem));
+ }
+ /* An optimization would be to call __free_pages(page, pool->p.order)
+ * knowing page is not part of page-cache (thus avoiding a
+ * __page_cache_release() call).
+ */
+}
+
+static bool page_pool_recycle_in_ring(struct page_pool *pool, netmem_ref netmem)
+{
+ bool in_softirq, ret;
+
+ /* BH protection not needed if current is softirq */
+ in_softirq = page_pool_producer_lock(pool);
+ ret = !__ptr_ring_produce(&pool->ring, (__force void *)netmem);
+ if (ret)
+ recycle_stat_inc(pool, ring);
+ page_pool_producer_unlock(pool, in_softirq);
+
+ return ret;
+}
+
+/* Only allow direct recycling in special circumstances, into the
+ * alloc side cache. E.g. during RX-NAPI processing for XDP_DROP use-case.
+ *
+ * Caller must provide appropriate safe context.
+ */
+static bool page_pool_recycle_in_cache(netmem_ref netmem,
+ struct page_pool *pool)
+{
+ if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE)) {
+ recycle_stat_inc(pool, cache_full);
+ return false;
+ }
+
+ /* Caller MUST have verified/know (page_ref_count(page) == 1) */
+ pool->alloc.cache[pool->alloc.count++] = netmem;
+ recycle_stat_inc(pool, cached);
+ return true;
+}
+
+static bool __page_pool_page_can_be_recycled(netmem_ref netmem)
+{
+ return netmem_is_net_iov(netmem) ||
+ (page_ref_count(netmem_to_page(netmem)) == 1 &&
+ !page_is_pfmemalloc(netmem_to_page(netmem)));
+}
+
+/* If the page refcnt == 1, this will try to recycle the page.
+ * If pool->dma_sync is set, we'll try to sync the DMA area for
+ * the configured size min(dma_sync_size, pool->max_len).
+ * If the page refcnt != 1, then the page will be returned to memory
+ * subsystem.
+ */
+static __always_inline netmem_ref
+__page_pool_put_page(struct page_pool *pool, netmem_ref netmem,
+ unsigned int dma_sync_size, bool allow_direct)
+{
+ lockdep_assert_no_hardirq();
+
+ /* This allocator is optimized for the XDP mode that uses
+ * one-frame-per-page, but have fallbacks that act like the
+ * regular page allocator APIs.
+ *
+ * refcnt == 1 means page_pool owns page, and can recycle it.
+ *
+ * page is NOT reusable when allocated when system is under
+ * some pressure. (page_is_pfmemalloc)
+ */
+ if (likely(__page_pool_page_can_be_recycled(netmem))) {
+ /* Read barrier done in page_ref_count / READ_ONCE */
+
+ page_pool_dma_sync_for_device(pool, netmem, dma_sync_size);
+
+ if (allow_direct && page_pool_recycle_in_cache(netmem, pool))
+ return 0;
+
+ /* Page found as candidate for recycling */
+ return netmem;
+ }
+
+ /* Fallback/non-XDP mode: API user have elevated refcnt.
+ *
+ * Many drivers split up the page into fragments, and some
+ * want to keep doing this to save memory and do refcnt based
+ * recycling. Support this use case too, to ease drivers
+ * switching between XDP/non-XDP.
+ *
+ * In-case page_pool maintains the DMA mapping, API user must
+ * call page_pool_put_page once. In this elevated refcnt
+ * case, the DMA is unmapped/released, as driver is likely
+ * doing refcnt based recycle tricks, meaning another process
+ * will be invoking put_page.
+ */
+ recycle_stat_inc(pool, released_refcnt);
+ page_pool_return_netmem(pool, netmem);
+
+ return 0;
+}
+
+static bool page_pool_napi_local(const struct page_pool *pool)
+{
+ const struct napi_struct *napi;
+ u32 cpuid;
+
+ /* On PREEMPT_RT the softirq can be preempted by the consumer */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ return false;
+
+ if (unlikely(!in_softirq()))
+ return false;
+
+ /* Allow direct recycle if we have reasons to believe that we are
+ * in the same context as the consumer would run, so there's
+ * no possible race.
+ * __page_pool_put_page() makes sure we're not in hardirq context
+ * and interrupts are enabled prior to accessing the cache.
+ */
+ cpuid = smp_processor_id();
+ if (READ_ONCE(pool->cpuid) == cpuid)
+ return true;
+
+ napi = READ_ONCE(pool->p.napi);
+
+ return napi && READ_ONCE(napi->list_owner) == cpuid;
+}
+
+void page_pool_put_unrefed_netmem(struct page_pool *pool, netmem_ref netmem,
+ unsigned int dma_sync_size, bool allow_direct)
+{
+ if (!allow_direct)
+ allow_direct = page_pool_napi_local(pool);
+
+ netmem = __page_pool_put_page(pool, netmem, dma_sync_size,
+ allow_direct);
+ if (netmem && !page_pool_recycle_in_ring(pool, netmem)) {
+ /* Cache full, fallback to free pages */
+ recycle_stat_inc(pool, ring_full);
+ page_pool_return_netmem(pool, netmem);
+ }
+}
+EXPORT_SYMBOL(page_pool_put_unrefed_netmem);
+
+void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page,
+ unsigned int dma_sync_size, bool allow_direct)
+{
+ page_pool_put_unrefed_netmem(pool, page_to_netmem(page), dma_sync_size,
+ allow_direct);
+}
+EXPORT_SYMBOL(page_pool_put_unrefed_page);
+
+static void page_pool_recycle_ring_bulk(struct page_pool *pool,
+ netmem_ref *bulk,
+ u32 bulk_len)
+{
+ bool in_softirq;
+ u32 i;
+
+ /* Bulk produce into ptr_ring page_pool cache */
+ in_softirq = page_pool_producer_lock(pool);
+
+ for (i = 0; i < bulk_len; i++) {
+ if (__ptr_ring_produce(&pool->ring, (__force void *)bulk[i])) {
+ /* ring full */
+ recycle_stat_inc(pool, ring_full);
+ break;
+ }
+ }
+
+ page_pool_producer_unlock(pool, in_softirq);
+ recycle_stat_add(pool, ring, i);
+
+ /* Hopefully all pages were returned into ptr_ring */
+ if (likely(i == bulk_len))
+ return;
+
+ /*
+ * ptr_ring cache is full, free remaining pages outside producer lock
+ * since put_page() with refcnt == 1 can be an expensive operation.
+ */
+ for (; i < bulk_len; i++)
+ page_pool_return_netmem(pool, bulk[i]);
+}
+
+/**
+ * page_pool_put_netmem_bulk() - release references on multiple netmems
+ * @data: array holding netmem references
+ * @count: number of entries in @data
+ *
+ * Tries to refill a number of netmems into the ptr_ring cache holding ptr_ring
+ * producer lock. If the ptr_ring is full, page_pool_put_netmem_bulk()
+ * will release leftover netmems to the memory provider.
+ * page_pool_put_netmem_bulk() is suitable to be run inside the driver NAPI tx
+ * completion loop for the XDP_REDIRECT use case.
+ *
+ * Please note the caller must not use data area after running
+ * page_pool_put_netmem_bulk(), as this function overwrites it.
+ */
+void page_pool_put_netmem_bulk(netmem_ref *data, u32 count)
+{
+ u32 bulk_len = 0;
+
+ for (u32 i = 0; i < count; i++) {
+ netmem_ref netmem = netmem_compound_head(data[i]);
+
+ if (page_pool_unref_and_test(netmem))
+ data[bulk_len++] = netmem;
+ }
+
+ count = bulk_len;
+ while (count) {
+ netmem_ref bulk[XDP_BULK_QUEUE_SIZE];
+ struct page_pool *pool = NULL;
+ bool allow_direct;
+ u32 foreign = 0;
+
+ bulk_len = 0;
+
+ for (u32 i = 0; i < count; i++) {
+ struct page_pool *netmem_pp;
+ netmem_ref netmem = data[i];
+
+ netmem_pp = netmem_get_pp(netmem);
+ if (unlikely(!pool)) {
+ pool = netmem_pp;
+ allow_direct = page_pool_napi_local(pool);
+ } else if (netmem_pp != pool) {
+ /*
+ * If the netmem belongs to a different
+ * page_pool, save it for another round.
+ */
+ data[foreign++] = netmem;
+ continue;
+ }
+
+ netmem = __page_pool_put_page(pool, netmem, -1,
+ allow_direct);
+ /* Approved for bulk recycling in ptr_ring cache */
+ if (netmem)
+ bulk[bulk_len++] = netmem;
+ }
+
+ if (bulk_len)
+ page_pool_recycle_ring_bulk(pool, bulk, bulk_len);
+
+ count = foreign;
+ }
+}
+EXPORT_SYMBOL(page_pool_put_netmem_bulk);
+
+static netmem_ref page_pool_drain_frag(struct page_pool *pool,
+ netmem_ref netmem)
+{
+ long drain_count = BIAS_MAX - pool->frag_users;
+
+ /* Some user is still using the page frag */
+ if (likely(page_pool_unref_netmem(netmem, drain_count)))
+ return 0;
+
+ if (__page_pool_page_can_be_recycled(netmem)) {
+ page_pool_dma_sync_for_device(pool, netmem, -1);
+ return netmem;
+ }
+
+ page_pool_return_netmem(pool, netmem);
+ return 0;
+}
+
+static void page_pool_free_frag(struct page_pool *pool)
+{
+ long drain_count = BIAS_MAX - pool->frag_users;
+ netmem_ref netmem = pool->frag_page;
+
+ pool->frag_page = 0;
+
+ if (!netmem || page_pool_unref_netmem(netmem, drain_count))
+ return;
+
+ page_pool_return_netmem(pool, netmem);
+}
+
+netmem_ref page_pool_alloc_frag_netmem(struct page_pool *pool,
+ unsigned int *offset, unsigned int size,
+ gfp_t gfp)
+{
+ unsigned int max_size = PAGE_SIZE << pool->p.order;
+ netmem_ref netmem = pool->frag_page;
+
+ if (WARN_ON(size > max_size))
+ return 0;
+
+ size = ALIGN(size, dma_get_cache_alignment());
+ *offset = pool->frag_offset;
+
+ if (netmem && *offset + size > max_size) {
+ netmem = page_pool_drain_frag(pool, netmem);
+ if (netmem) {
+ recycle_stat_inc(pool, cached);
+ alloc_stat_inc(pool, fast);
+ goto frag_reset;
+ }
+ }
+
+ if (!netmem) {
+ netmem = page_pool_alloc_netmems(pool, gfp);
+ if (unlikely(!netmem)) {
+ pool->frag_page = 0;
+ return 0;
+ }
+
+ pool->frag_page = netmem;
+
+frag_reset:
+ pool->frag_users = 1;
+ *offset = 0;
+ pool->frag_offset = size;
+ page_pool_fragment_netmem(netmem, BIAS_MAX);
+ return netmem;
+ }
+
+ pool->frag_users++;
+ pool->frag_offset = *offset + size;
+ return netmem;
+}
+EXPORT_SYMBOL(page_pool_alloc_frag_netmem);
+
+struct page *page_pool_alloc_frag(struct page_pool *pool, unsigned int *offset,
+ unsigned int size, gfp_t gfp)
+{
+ return netmem_to_page(page_pool_alloc_frag_netmem(pool, offset, size,
+ gfp));
+}
+EXPORT_SYMBOL(page_pool_alloc_frag);
+
+static void page_pool_empty_ring(struct page_pool *pool)
+{
+ netmem_ref netmem;
+
+ /* Empty recycle ring */
+ while ((netmem = (__force netmem_ref)ptr_ring_consume_bh(&pool->ring))) {
+ /* Verify the refcnt invariant of cached pages */
+ if (!(netmem_ref_count(netmem) == 1))
+ pr_crit("%s() page_pool refcnt %d violation\n",
+ __func__, netmem_ref_count(netmem));
+
+ page_pool_return_netmem(pool, netmem);
+ }
+}
+
+static void __page_pool_destroy(struct page_pool *pool)
+{
+ if (pool->disconnect)
+ pool->disconnect(pool);
+
+ page_pool_unlist(pool);
+ page_pool_uninit(pool);
+
+ if (pool->mp_ops) {
+ pool->mp_ops->destroy(pool);
+ static_branch_dec(&page_pool_mem_providers);
+ }
+
+ kfree(pool);
+}
+
+static void page_pool_empty_alloc_cache_once(struct page_pool *pool)
+{
+ netmem_ref netmem;
+
+ if (pool->destroy_cnt)
+ return;
+
+ /* Empty alloc cache, assume caller made sure this is
+ * no-longer in use, and page_pool_alloc_pages() cannot be
+ * call concurrently.
+ */
+ while (pool->alloc.count) {
+ netmem = pool->alloc.cache[--pool->alloc.count];
+ page_pool_return_netmem(pool, netmem);
+ }
+}
+
+static void page_pool_scrub(struct page_pool *pool)
+{
+ unsigned long id;
+ void *ptr;
+
+ page_pool_empty_alloc_cache_once(pool);
+ if (!pool->destroy_cnt++ && pool->dma_map) {
+ if (pool->dma_sync) {
+ /* Disable page_pool_dma_sync_for_device() */
+ pool->dma_sync = false;
+
+ /* Make sure all concurrent returns that may see the old
+ * value of dma_sync (and thus perform a sync) have
+ * finished before doing the unmapping below. Skip the
+ * wait if the device doesn't actually need syncing, or
+ * if there are no outstanding mapped pages.
+ */
+ if (dma_dev_need_sync(pool->p.dev) &&
+ !xa_empty(&pool->dma_mapped))
+ synchronize_net();
+ }
+
+ xa_for_each(&pool->dma_mapped, id, ptr)
+ __page_pool_release_netmem_dma(pool, page_to_netmem((struct page *)ptr));
+ }
+
+ /* No more consumers should exist, but producers could still
+ * be in-flight.
+ */
+ page_pool_empty_ring(pool);
+}
+
+static int page_pool_release(struct page_pool *pool)
+{
+ bool in_softirq;
+ int inflight;
+
+ page_pool_scrub(pool);
+ inflight = page_pool_inflight(pool, true);
+ /* Acquire producer lock to make sure producers have exited. */
+ in_softirq = page_pool_producer_lock(pool);
+ page_pool_producer_unlock(pool, in_softirq);
+ if (!inflight)
+ __page_pool_destroy(pool);
+
+ return inflight;
+}
+
+static void page_pool_release_retry(struct work_struct *wq)
+{
+ struct delayed_work *dwq = to_delayed_work(wq);
+ struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw);
+ void *netdev;
+ int inflight;
+
+ inflight = page_pool_release(pool);
+ /* In rare cases, a driver bug may cause inflight to go negative.
+ * Don't reschedule release if inflight is 0 or negative.
+ * - If 0, the page_pool has been destroyed
+ * - if negative, we will never recover
+ * in both cases no reschedule is necessary.
+ */
+ if (inflight <= 0)
+ return;
+
+ /* Periodic warning for page pools the user can't see */
+ netdev = READ_ONCE(pool->slow.netdev);
+ if (time_after_eq(jiffies, pool->defer_warn) &&
+ (!netdev || netdev == NET_PTR_POISON)) {
+ int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ;
+
+ pr_warn("%s() stalled pool shutdown: id %u, %d inflight %d sec\n",
+ __func__, pool->user.id, inflight, sec);
+ pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
+ }
+
+ /* Still not ready to be disconnected, retry later */
+ schedule_delayed_work(&pool->release_dw, DEFER_TIME);
+}
+
+void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
+ const struct xdp_mem_info *mem)
+{
+ refcount_inc(&pool->user_cnt);
+ pool->disconnect = disconnect;
+ pool->xdp_mem_id = mem->id;
+}
+
+/**
+ * page_pool_enable_direct_recycling() - mark page pool as owned by NAPI
+ * @pool: page pool to modify
+ * @napi: NAPI instance to associate the page pool with
+ *
+ * Associate a page pool with a NAPI instance for lockless page recycling.
+ * This is useful when a new page pool has to be added to a NAPI instance
+ * without disabling that NAPI instance, to mark the point at which control
+ * path "hands over" the page pool to the NAPI instance. In most cases driver
+ * can simply set the @napi field in struct page_pool_params, and does not
+ * have to call this helper.
+ *
+ * The function is idempotent, but does not implement any refcounting.
+ * Single page_pool_disable_direct_recycling() will disable recycling,
+ * no matter how many times enable was called.
+ */
+void page_pool_enable_direct_recycling(struct page_pool *pool,
+ struct napi_struct *napi)
+{
+ if (READ_ONCE(pool->p.napi) == napi)
+ return;
+ WARN_ON(!napi || pool->p.napi);
+
+ mutex_lock(&page_pools_lock);
+ WRITE_ONCE(pool->p.napi, napi);
+ mutex_unlock(&page_pools_lock);
+}
+EXPORT_SYMBOL(page_pool_enable_direct_recycling);
+
+void page_pool_disable_direct_recycling(struct page_pool *pool)
+{
+ /* Disable direct recycling based on pool->cpuid.
+ * Paired with READ_ONCE() in page_pool_napi_local().
+ */
+ WRITE_ONCE(pool->cpuid, -1);
+
+ if (!pool->p.napi)
+ return;
+
+ napi_assert_will_not_race(pool->p.napi);
+
+ mutex_lock(&page_pools_lock);
+ WRITE_ONCE(pool->p.napi, NULL);
+ mutex_unlock(&page_pools_lock);
+}
+EXPORT_SYMBOL(page_pool_disable_direct_recycling);
+
+void page_pool_destroy(struct page_pool *pool)
+{
+ if (!pool)
+ return;
+
+ if (!page_pool_put(pool))
+ return;
+
+ page_pool_disable_direct_recycling(pool);
+ page_pool_free_frag(pool);
+
+ if (!page_pool_release(pool))
+ return;
+
+ page_pool_detached(pool);
+ pool->defer_start = jiffies;
+ pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
+
+ INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry);
+ schedule_delayed_work(&pool->release_dw, DEFER_TIME);
+}
+EXPORT_SYMBOL(page_pool_destroy);
+
+/* Caller must provide appropriate safe context, e.g. NAPI. */
+void page_pool_update_nid(struct page_pool *pool, int new_nid)
+{
+ netmem_ref netmem;
+
+ trace_page_pool_update_nid(pool, new_nid);
+ pool->p.nid = new_nid;
+
+ /* Flush pool alloc cache, as refill will check NUMA node */
+ while (pool->alloc.count) {
+ netmem = pool->alloc.cache[--pool->alloc.count];
+ page_pool_return_netmem(pool, netmem);
+ }
+}
+EXPORT_SYMBOL(page_pool_update_nid);
+
+bool net_mp_niov_set_dma_addr(struct net_iov *niov, dma_addr_t addr)
+{
+ return page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov), addr);
+}
+
+/* Associate a niov with a page pool. Should follow with a matching
+ * net_mp_niov_clear_page_pool()
+ */
+void net_mp_niov_set_page_pool(struct page_pool *pool, struct net_iov *niov)
+{
+ netmem_ref netmem = net_iov_to_netmem(niov);
+
+ page_pool_set_pp_info(pool, netmem);
+
+ pool->pages_state_hold_cnt++;
+ trace_page_pool_state_hold(pool, netmem, pool->pages_state_hold_cnt);
+}
+
+/* Disassociate a niov from a page pool. Should only be used in the
+ * ->release_netmem() path.
+ */
+void net_mp_niov_clear_page_pool(struct net_iov *niov)
+{
+ netmem_ref netmem = net_iov_to_netmem(niov);
+
+ page_pool_clear_pp_info(netmem);
+}
diff --git a/net/core/page_pool_priv.h b/net/core/page_pool_priv.h
new file mode 100644
index 000000000000..2fb06d5f6d55
--- /dev/null
+++ b/net/core/page_pool_priv.h
@@ -0,0 +1,60 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __PAGE_POOL_PRIV_H
+#define __PAGE_POOL_PRIV_H
+
+#include <net/page_pool/helpers.h>
+
+#include "netmem_priv.h"
+
+extern struct mutex page_pools_lock;
+
+s32 page_pool_inflight(const struct page_pool *pool, bool strict);
+
+int page_pool_list(struct page_pool *pool);
+void page_pool_detached(struct page_pool *pool);
+void page_pool_unlist(struct page_pool *pool);
+
+static inline bool
+page_pool_set_dma_addr_netmem(netmem_ref netmem, dma_addr_t addr)
+{
+ if (PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA) {
+ netmem_set_dma_addr(netmem, addr >> PAGE_SHIFT);
+
+ /* We assume page alignment to shave off bottom bits,
+ * if this "compression" doesn't work we need to drop.
+ */
+ return addr != (dma_addr_t)netmem_get_dma_addr(netmem)
+ << PAGE_SHIFT;
+ }
+
+ netmem_set_dma_addr(netmem, addr);
+ return false;
+}
+
+static inline bool page_pool_set_dma_addr(struct page *page, dma_addr_t addr)
+{
+ return page_pool_set_dma_addr_netmem(page_to_netmem(page), addr);
+}
+
+#if defined(CONFIG_PAGE_POOL)
+void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem);
+void page_pool_clear_pp_info(netmem_ref netmem);
+int page_pool_check_memory_provider(struct net_device *dev,
+ struct netdev_rx_queue *rxq);
+#else
+static inline void page_pool_set_pp_info(struct page_pool *pool,
+ netmem_ref netmem)
+{
+}
+static inline void page_pool_clear_pp_info(netmem_ref netmem)
+{
+}
+static inline int page_pool_check_memory_provider(struct net_device *dev,
+ struct netdev_rx_queue *rxq)
+{
+ return 0;
+}
+#endif
+
+#endif
diff --git a/net/core/page_pool_user.c b/net/core/page_pool_user.c
new file mode 100644
index 000000000000..c82a95beceff
--- /dev/null
+++ b/net/core/page_pool_user.c
@@ -0,0 +1,441 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/mutex.h>
+#include <linux/netdevice.h>
+#include <linux/xarray.h>
+#include <net/busy_poll.h>
+#include <net/net_debug.h>
+#include <net/netdev_rx_queue.h>
+#include <net/page_pool/helpers.h>
+#include <net/page_pool/types.h>
+#include <net/page_pool/memory_provider.h>
+#include <net/sock.h>
+
+#include "page_pool_priv.h"
+#include "netdev-genl-gen.h"
+
+static DEFINE_XARRAY_FLAGS(page_pools, XA_FLAGS_ALLOC1);
+/* Protects: page_pools, netdevice->page_pools, pool->p.napi, pool->slow.netdev,
+ * pool->user.
+ * Ordering: inside rtnl_lock
+ */
+DEFINE_MUTEX(page_pools_lock);
+
+/* Page pools are only reachable from user space (via netlink) if they are
+ * linked to a netdev at creation time. Following page pool "visibility"
+ * states are possible:
+ * - normal
+ * - user.list: linked to real netdev, netdev: real netdev
+ * - orphaned - real netdev has disappeared
+ * - user.list: linked to lo, netdev: lo
+ * - invisible - either (a) created without netdev linking, (b) unlisted due
+ * to error, or (c) the entire namespace which owned this pool disappeared
+ * - user.list: unhashed, netdev: unknown
+ */
+
+typedef int (*pp_nl_fill_cb)(struct sk_buff *rsp, const struct page_pool *pool,
+ const struct genl_info *info);
+
+static int
+netdev_nl_page_pool_get_do(struct genl_info *info, u32 id, pp_nl_fill_cb fill)
+{
+ struct page_pool *pool;
+ struct sk_buff *rsp;
+ int err;
+
+ mutex_lock(&page_pools_lock);
+ pool = xa_load(&page_pools, id);
+ if (!pool || hlist_unhashed(&pool->user.list) ||
+ !net_eq(dev_net(pool->slow.netdev), genl_info_net(info))) {
+ err = -ENOENT;
+ goto err_unlock;
+ }
+
+ rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!rsp) {
+ err = -ENOMEM;
+ goto err_unlock;
+ }
+
+ err = fill(rsp, pool, info);
+ if (err)
+ goto err_free_msg;
+
+ mutex_unlock(&page_pools_lock);
+
+ return genlmsg_reply(rsp, info);
+
+err_free_msg:
+ nlmsg_free(rsp);
+err_unlock:
+ mutex_unlock(&page_pools_lock);
+ return err;
+}
+
+struct page_pool_dump_cb {
+ unsigned long ifindex;
+ u32 pp_id;
+};
+
+static int
+netdev_nl_page_pool_get_dump(struct sk_buff *skb, struct netlink_callback *cb,
+ pp_nl_fill_cb fill)
+{
+ struct page_pool_dump_cb *state = (void *)cb->ctx;
+ const struct genl_info *info = genl_info_dump(cb);
+ struct net *net = sock_net(skb->sk);
+ struct net_device *netdev;
+ struct page_pool *pool;
+ int err = 0;
+
+ rtnl_lock();
+ mutex_lock(&page_pools_lock);
+ for_each_netdev_dump(net, netdev, state->ifindex) {
+ hlist_for_each_entry(pool, &netdev->page_pools, user.list) {
+ if (state->pp_id && state->pp_id < pool->user.id)
+ continue;
+
+ state->pp_id = pool->user.id;
+ err = fill(skb, pool, info);
+ if (err)
+ goto out;
+ }
+
+ state->pp_id = 0;
+ }
+out:
+ mutex_unlock(&page_pools_lock);
+ rtnl_unlock();
+
+ return err;
+}
+
+static int
+page_pool_nl_stats_fill(struct sk_buff *rsp, const struct page_pool *pool,
+ const struct genl_info *info)
+{
+#ifdef CONFIG_PAGE_POOL_STATS
+ struct page_pool_stats stats = {};
+ struct nlattr *nest;
+ void *hdr;
+
+ if (!page_pool_get_stats(pool, &stats))
+ return 0;
+
+ hdr = genlmsg_iput(rsp, info);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ nest = nla_nest_start(rsp, NETDEV_A_PAGE_POOL_STATS_INFO);
+
+ if (nla_put_uint(rsp, NETDEV_A_PAGE_POOL_ID, pool->user.id) ||
+ (pool->slow.netdev->ifindex != LOOPBACK_IFINDEX &&
+ nla_put_u32(rsp, NETDEV_A_PAGE_POOL_IFINDEX,
+ pool->slow.netdev->ifindex)))
+ goto err_cancel_nest;
+
+ nla_nest_end(rsp, nest);
+
+ if (nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_FAST,
+ stats.alloc_stats.fast) ||
+ nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_SLOW,
+ stats.alloc_stats.slow) ||
+ nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_SLOW_HIGH_ORDER,
+ stats.alloc_stats.slow_high_order) ||
+ nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_EMPTY,
+ stats.alloc_stats.empty) ||
+ nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_REFILL,
+ stats.alloc_stats.refill) ||
+ nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_WAIVE,
+ stats.alloc_stats.waive) ||
+ nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_RECYCLE_CACHED,
+ stats.recycle_stats.cached) ||
+ nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_RECYCLE_CACHE_FULL,
+ stats.recycle_stats.cache_full) ||
+ nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_RECYCLE_RING,
+ stats.recycle_stats.ring) ||
+ nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_RECYCLE_RING_FULL,
+ stats.recycle_stats.ring_full) ||
+ nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_RECYCLE_RELEASED_REFCNT,
+ stats.recycle_stats.released_refcnt))
+ goto err_cancel_msg;
+
+ genlmsg_end(rsp, hdr);
+
+ return 0;
+err_cancel_nest:
+ nla_nest_cancel(rsp, nest);
+err_cancel_msg:
+ genlmsg_cancel(rsp, hdr);
+ return -EMSGSIZE;
+#else
+ GENL_SET_ERR_MSG(info, "kernel built without CONFIG_PAGE_POOL_STATS");
+ return -EOPNOTSUPP;
+#endif
+}
+
+int netdev_nl_page_pool_stats_get_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct nlattr *tb[ARRAY_SIZE(netdev_page_pool_info_nl_policy)];
+ struct nlattr *nest;
+ int err;
+ u32 id;
+
+ if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_PAGE_POOL_STATS_INFO))
+ return -EINVAL;
+
+ nest = info->attrs[NETDEV_A_PAGE_POOL_STATS_INFO];
+ err = nla_parse_nested(tb, ARRAY_SIZE(tb) - 1, nest,
+ netdev_page_pool_info_nl_policy,
+ info->extack);
+ if (err)
+ return err;
+
+ if (NL_REQ_ATTR_CHECK(info->extack, nest, tb, NETDEV_A_PAGE_POOL_ID))
+ return -EINVAL;
+ if (tb[NETDEV_A_PAGE_POOL_IFINDEX]) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ tb[NETDEV_A_PAGE_POOL_IFINDEX],
+ "selecting by ifindex not supported");
+ return -EINVAL;
+ }
+
+ id = nla_get_uint(tb[NETDEV_A_PAGE_POOL_ID]);
+
+ return netdev_nl_page_pool_get_do(info, id, page_pool_nl_stats_fill);
+}
+
+int netdev_nl_page_pool_stats_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ return netdev_nl_page_pool_get_dump(skb, cb, page_pool_nl_stats_fill);
+}
+
+static int
+page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool,
+ const struct genl_info *info)
+{
+ size_t inflight, refsz;
+ unsigned int napi_id;
+ void *hdr;
+
+ hdr = genlmsg_iput(rsp, info);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (nla_put_uint(rsp, NETDEV_A_PAGE_POOL_ID, pool->user.id))
+ goto err_cancel;
+
+ if (pool->slow.netdev->ifindex != LOOPBACK_IFINDEX &&
+ nla_put_u32(rsp, NETDEV_A_PAGE_POOL_IFINDEX,
+ pool->slow.netdev->ifindex))
+ goto err_cancel;
+
+ napi_id = pool->p.napi ? READ_ONCE(pool->p.napi->napi_id) : 0;
+ if (napi_id_valid(napi_id) &&
+ nla_put_uint(rsp, NETDEV_A_PAGE_POOL_NAPI_ID, napi_id))
+ goto err_cancel;
+
+ inflight = page_pool_inflight(pool, false);
+ refsz = PAGE_SIZE << pool->p.order;
+ if (nla_put_uint(rsp, NETDEV_A_PAGE_POOL_INFLIGHT, inflight) ||
+ nla_put_uint(rsp, NETDEV_A_PAGE_POOL_INFLIGHT_MEM,
+ inflight * refsz))
+ goto err_cancel;
+ if (pool->user.detach_time &&
+ nla_put_uint(rsp, NETDEV_A_PAGE_POOL_DETACH_TIME,
+ pool->user.detach_time))
+ goto err_cancel;
+
+ if (pool->mp_ops && pool->mp_ops->nl_fill(pool->mp_priv, rsp, NULL))
+ goto err_cancel;
+
+ genlmsg_end(rsp, hdr);
+
+ return 0;
+err_cancel:
+ genlmsg_cancel(rsp, hdr);
+ return -EMSGSIZE;
+}
+
+static void netdev_nl_page_pool_event(const struct page_pool *pool, u32 cmd)
+{
+ struct genl_info info;
+ struct sk_buff *ntf;
+ struct net *net;
+
+ lockdep_assert_held(&page_pools_lock);
+
+ /* 'invisible' page pools don't matter */
+ if (hlist_unhashed(&pool->user.list))
+ return;
+ net = dev_net(pool->slow.netdev);
+
+ if (!genl_has_listeners(&netdev_nl_family, net, NETDEV_NLGRP_PAGE_POOL))
+ return;
+
+ genl_info_init_ntf(&info, &netdev_nl_family, cmd);
+
+ ntf = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!ntf)
+ return;
+
+ if (page_pool_nl_fill(ntf, pool, &info)) {
+ nlmsg_free(ntf);
+ return;
+ }
+
+ genlmsg_multicast_netns(&netdev_nl_family, net, ntf,
+ 0, NETDEV_NLGRP_PAGE_POOL, GFP_KERNEL);
+}
+
+int netdev_nl_page_pool_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ u32 id;
+
+ if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_PAGE_POOL_ID))
+ return -EINVAL;
+
+ id = nla_get_uint(info->attrs[NETDEV_A_PAGE_POOL_ID]);
+
+ return netdev_nl_page_pool_get_do(info, id, page_pool_nl_fill);
+}
+
+int netdev_nl_page_pool_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ return netdev_nl_page_pool_get_dump(skb, cb, page_pool_nl_fill);
+}
+
+int page_pool_list(struct page_pool *pool)
+{
+ static u32 id_alloc_next;
+ int err;
+
+ mutex_lock(&page_pools_lock);
+ err = xa_alloc_cyclic(&page_pools, &pool->user.id, pool, xa_limit_32b,
+ &id_alloc_next, GFP_KERNEL);
+ if (err < 0)
+ goto err_unlock;
+
+ INIT_HLIST_NODE(&pool->user.list);
+ if (pool->slow.netdev) {
+ hlist_add_head(&pool->user.list,
+ &pool->slow.netdev->page_pools);
+ netdev_nl_page_pool_event(pool, NETDEV_CMD_PAGE_POOL_ADD_NTF);
+ }
+
+ mutex_unlock(&page_pools_lock);
+ return 0;
+
+err_unlock:
+ mutex_unlock(&page_pools_lock);
+ return err;
+}
+
+void page_pool_detached(struct page_pool *pool)
+{
+ mutex_lock(&page_pools_lock);
+ pool->user.detach_time = ktime_get_boottime_seconds();
+ netdev_nl_page_pool_event(pool, NETDEV_CMD_PAGE_POOL_CHANGE_NTF);
+ mutex_unlock(&page_pools_lock);
+}
+
+void page_pool_unlist(struct page_pool *pool)
+{
+ mutex_lock(&page_pools_lock);
+ netdev_nl_page_pool_event(pool, NETDEV_CMD_PAGE_POOL_DEL_NTF);
+ xa_erase(&page_pools, pool->user.id);
+ if (!hlist_unhashed(&pool->user.list))
+ hlist_del(&pool->user.list);
+ mutex_unlock(&page_pools_lock);
+}
+
+int page_pool_check_memory_provider(struct net_device *dev,
+ struct netdev_rx_queue *rxq)
+{
+ void *binding = rxq->mp_params.mp_priv;
+ struct page_pool *pool;
+ struct hlist_node *n;
+
+ if (!binding)
+ return 0;
+
+ mutex_lock(&page_pools_lock);
+ hlist_for_each_entry_safe(pool, n, &dev->page_pools, user.list) {
+ if (pool->mp_priv != binding)
+ continue;
+
+ if (pool->slow.queue_idx == get_netdev_rx_queue_index(rxq)) {
+ mutex_unlock(&page_pools_lock);
+ return 0;
+ }
+ }
+ mutex_unlock(&page_pools_lock);
+ return -ENODATA;
+}
+
+static void page_pool_unreg_netdev_wipe(struct net_device *netdev)
+{
+ struct page_pool *pool;
+ struct hlist_node *n;
+
+ mutex_lock(&page_pools_lock);
+ hlist_for_each_entry_safe(pool, n, &netdev->page_pools, user.list) {
+ hlist_del_init(&pool->user.list);
+ pool->slow.netdev = NET_PTR_POISON;
+ }
+ mutex_unlock(&page_pools_lock);
+}
+
+static void page_pool_unreg_netdev(struct net_device *netdev)
+{
+ struct page_pool *pool, *last;
+ struct net_device *lo;
+
+ lo = dev_net(netdev)->loopback_dev;
+
+ mutex_lock(&page_pools_lock);
+ last = NULL;
+ hlist_for_each_entry(pool, &netdev->page_pools, user.list) {
+ pool->slow.netdev = lo;
+ netdev_nl_page_pool_event(pool,
+ NETDEV_CMD_PAGE_POOL_CHANGE_NTF);
+ last = pool;
+ }
+ if (last)
+ hlist_splice_init(&netdev->page_pools, &last->user.list,
+ &lo->page_pools);
+ mutex_unlock(&page_pools_lock);
+}
+
+static int
+page_pool_netdevice_event(struct notifier_block *nb,
+ unsigned long event, void *ptr)
+{
+ struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
+
+ if (event != NETDEV_UNREGISTER)
+ return NOTIFY_DONE;
+
+ if (hlist_empty(&netdev->page_pools))
+ return NOTIFY_OK;
+
+ if (netdev->ifindex != LOOPBACK_IFINDEX)
+ page_pool_unreg_netdev(netdev);
+ else
+ page_pool_unreg_netdev_wipe(netdev);
+ return NOTIFY_OK;
+}
+
+static struct notifier_block page_pool_netdevice_nb = {
+ .notifier_call = page_pool_netdevice_event,
+};
+
+static int __init page_pool_user_init(void)
+{
+ return register_netdevice_notifier(&page_pool_netdevice_nb);
+}
+
+subsys_initcall(page_pool_user_init);
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 1897a3a385d8..d41b03fd1f63 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Authors:
* Copyright 2001, 2002 by Robert Olsson <robert.olsson@its.uu.se>
@@ -6,16 +7,10 @@
*
* Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
* Ben Greear <greearb@candelatech.com>
- * Jens Låås <jens.laas@data.slu.se>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
+ * Jens Låås <jens.laas@data.slu.se>
*
* A tool for loading the network with preconfigurated packets.
- * The tool is implemented as a linux module. Parameters are output
+ * The tool is implemented as a linux module. Parameters are output
* device, delay (to hard_xmit), number of packets, and whether
* to use multiple SKBs or just the same one.
* pktgen uses the installed interface's output routine.
@@ -44,14 +39,14 @@
* * Add IOCTL interface to easily get counters & configuration.
* --Ben Greear <greearb@candelatech.com>
*
- * Renamed multiskb to clone_skb and cleaned up sending core for two distinct
- * skb modes. A clone_skb=0 mode for Ben "ranges" work and a clone_skb != 0
+ * Renamed multiskb to clone_skb and cleaned up sending core for two distinct
+ * skb modes. A clone_skb=0 mode for Ben "ranges" work and a clone_skb != 0
* as a "fastpath" with a configurable number of clones after alloc's.
- * clone_skb=0 means all packets are allocated this also means ranges time
- * stamps etc can be used. clone_skb=100 means 1 malloc is followed by 100
+ * clone_skb=0 means all packets are allocated this also means ranges time
+ * stamps etc can be used. clone_skb=100 means 1 malloc is followed by 100
* clones.
*
- * Also moved to /proc/net/pktgen/
+ * Also moved to /proc/net/pktgen/
* --ro
*
* Sept 10: Fixed threading/locking. Lots of bone-headed and more clever
@@ -60,28 +55,28 @@
*
* Integrated to 2.5.x 021029 --Lucio Maciel (luciomaciel@zipmail.com.br)
*
- *
* 021124 Finished major redesign and rewrite for new functionality.
- * See Documentation/networking/pktgen.txt for how to use this.
+ * See Documentation/networking/pktgen.rst for how to use this.
*
* The new operation:
- * For each CPU one thread/process is created at start. This process checks
- * for running devices in the if_list and sends packets until count is 0 it
- * also the thread checks the thread->control which is used for inter-process
- * communication. controlling process "posts" operations to the threads this
- * way. The if_lock should be possible to remove when add/rem_device is merged
- * into this too.
+ * For each CPU one thread/process is created at start. This process checks
+ * for running devices in the if_list and sends packets until count is 0 it
+ * also the thread checks the thread->control which is used for inter-process
+ * communication. controlling process "posts" operations to the threads this
+ * way.
+ * The if_list is RCU protected, and the if_lock remains to protect updating
+ * of if_list, from "add_device" as it invoked from userspace (via proc write).
*
- * By design there should only be *one* "controlling" process. In practice
- * multiple write accesses gives unpredictable result. Understood by "write"
- * to /proc gives result code thats should be read be the "writer".
+ * By design there should only be *one* "controlling" process. In practice
+ * multiple write accesses gives unpredictable result. Understood by "write"
+ * to /proc gives result code that should be read be the "writer".
* For practical use this should be no problem.
*
- * Note when adding devices to a specific CPU there good idea to also assign
- * /proc/irq/XX/smp_affinity so TX-interrupts gets bound to the same CPU.
+ * Note when adding devices to a specific CPU there good idea to also assign
+ * /proc/irq/XX/smp_affinity so TX-interrupts gets bound to the same CPU.
* --ro
*
- * Fix refcount off by one if first packet fails, potential null deref,
+ * Fix refcount off by one if first packet fails, potential null deref,
* memleak 030710- KJP
*
* First "ranges" functionality for ipv6 030726 --ro
@@ -89,35 +84,40 @@
* Included flow support. 030802 ANK.
*
* Fixed unaligned access on IA-64 Grant Grundler <grundler@parisc-linux.org>
- *
+ *
* Remove if fix from added Harald Welte <laforge@netfilter.org> 040419
* ia64 compilation fix from Aron Griffis <aron@hp.com> 040604
*
- * New xmit() return, do_div and misc clean up by Stephen Hemminger
+ * New xmit() return, do_div and misc clean up by Stephen Hemminger
* <shemminger@osdl.org> 040923
*
- * Randy Dunlap fixed u64 printk compiler waring
+ * Randy Dunlap fixed u64 printk compiler warning
*
* Remove FCS from BW calculation. Lennert Buytenhek <buytenh@wantstofly.org>
* New time handling. Lennert Buytenhek <buytenh@wantstofly.org> 041213
*
- * Corrections from Nikolai Malykh (nmalykh@bilim.com)
+ * Corrections from Nikolai Malykh (nmalykh@bilim.com)
* Removed unused flags F_SET_SRCMAC & F_SET_SRCIP 041230
*
- * interruptible_sleep_on_timeout() replaced Nishanth Aravamudan <nacc@us.ibm.com>
+ * interruptible_sleep_on_timeout() replaced Nishanth Aravamudan <nacc@us.ibm.com>
* 050103
*
* MPLS support by Steven Whitehouse <steve@chygwyn.com>
*
* 802.1Q/Q-in-Q support by Francesco Fondelli (FF) <francesco.fondelli@gmail.com>
*
+ * Fixed src_mac command to set source mac of packet to value specified in
+ * command by Adit Ranadive <adit.262@gmail.com>
*/
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/sys.h>
#include <linux/types.h>
+#include <linux/minmax.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/kernel.h>
-#include <linux/smp_lock.h>
#include <linux/mutex.h>
#include <linux/sched.h>
#include <linux/slab.h>
@@ -129,6 +129,8 @@
#include <linux/ioport.h>
#include <linux/interrupt.h>
#include <linux/capability.h>
+#include <linux/hrtimer.h>
+#include <linux/freezer.h>
#include <linux/delay.h>
#include <linux/timer.h>
#include <linux/list.h>
@@ -148,120 +150,173 @@
#include <linux/seq_file.h>
#include <linux/wait.h>
#include <linux/etherdevice.h>
+#include <linux/kthread.h>
+#include <linux/prefetch.h>
+#include <linux/mmzone.h>
+#include <net/net_namespace.h>
#include <net/checksum.h>
#include <net/ipv6.h>
+#include <net/udp.h>
+#include <net/ip6_checksum.h>
#include <net/addrconf.h>
+#include <net/xfrm.h>
+#include <net/netns/generic.h>
#include <asm/byteorder.h>
#include <linux/rcupdate.h>
-#include <asm/bitops.h>
-#include <asm/io.h>
+#include <linux/bitops.h>
+#include <linux/io.h>
+#include <linux/timex.h>
+#include <linux/uaccess.h>
#include <asm/dma.h>
-#include <asm/uaccess.h>
#include <asm/div64.h> /* do_div */
-#include <asm/timex.h>
-#define VERSION "pktgen v2.68: Packet Generator for packet performance testing.\n"
-
-/* #define PG_DEBUG(a) a */
-#define PG_DEBUG(a)
-
-/* The buckets are exponential in 'width' */
-#define LAT_BUCKETS_MAX 32
+#define VERSION "2.75"
#define IP_NAME_SZ 32
#define MAX_MPLS_LABELS 16 /* This is the max label stack depth */
-#define MPLS_STACK_BOTTOM __constant_htonl(0x00000100)
+#define MPLS_STACK_BOTTOM htonl(0x00000100)
+/* Max number of internet mix entries that can be specified in imix_weights. */
+#define MAX_IMIX_ENTRIES 20
+#define IMIX_PRECISION 100 /* Precision of IMIX distribution */
+
+#define func_enter() pr_debug("entering %s\n", __func__)
+
+#define PKT_FLAGS \
+ pf(IPV6) /* Interface in IPV6 Mode */ \
+ pf(IPSRC_RND) /* IP-Src Random */ \
+ pf(IPDST_RND) /* IP-Dst Random */ \
+ pf(TXSIZE_RND) /* Transmit size is random */ \
+ pf(UDPSRC_RND) /* UDP-Src Random */ \
+ pf(UDPDST_RND) /* UDP-Dst Random */ \
+ pf(UDPCSUM) /* Include UDP checksum */ \
+ pf(NO_TIMESTAMP) /* Don't timestamp packets (default TS) */ \
+ pf(MPLS_RND) /* Random MPLS labels */ \
+ pf(QUEUE_MAP_RND) /* queue map Random */ \
+ pf(QUEUE_MAP_CPU) /* queue map mirrors smp_processor_id() */ \
+ pf(FLOW_SEQ) /* Sequential flows */ \
+ pf(IPSEC) /* ipsec on for flows */ \
+ pf(MACSRC_RND) /* MAC-Src Random */ \
+ pf(MACDST_RND) /* MAC-Dst Random */ \
+ pf(VID_RND) /* Random VLAN ID */ \
+ pf(SVID_RND) /* Random SVLAN ID */ \
+ pf(NODE) /* Node memory alloc*/ \
+ pf(SHARED) /* Shared SKB */ \
+
+#define pf(flag) flag##_SHIFT,
+enum pkt_flags {
+ PKT_FLAGS
+};
+#undef pf
/* Device flag bits */
-#define F_IPSRC_RND (1<<0) /* IP-Src Random */
-#define F_IPDST_RND (1<<1) /* IP-Dst Random */
-#define F_UDPSRC_RND (1<<2) /* UDP-Src Random */
-#define F_UDPDST_RND (1<<3) /* UDP-Dst Random */
-#define F_MACSRC_RND (1<<4) /* MAC-Src Random */
-#define F_MACDST_RND (1<<5) /* MAC-Dst Random */
-#define F_TXSIZE_RND (1<<6) /* Transmit size is random */
-#define F_IPV6 (1<<7) /* Interface in IPV6 Mode */
-#define F_MPLS_RND (1<<8) /* Random MPLS labels */
-#define F_VID_RND (1<<9) /* Random VLAN ID */
-#define F_SVID_RND (1<<10) /* Random SVLAN ID */
+#define pf(flag) static const __u32 F_##flag = (1<<flag##_SHIFT);
+PKT_FLAGS
+#undef pf
+
+#define pf(flag) __stringify(flag),
+static char *pkt_flag_names[] = {
+ PKT_FLAGS
+};
+#undef pf
+
+#define NR_PKT_FLAGS ARRAY_SIZE(pkt_flag_names)
/* Thread control flag bits */
-#define T_TERMINATE (1<<0)
-#define T_STOP (1<<1) /* Stop run */
-#define T_RUN (1<<2) /* Start run */
-#define T_REMDEVALL (1<<3) /* Remove all devs */
-#define T_REMDEV (1<<4) /* Remove one dev */
+#define T_STOP (1<<0) /* Stop run */
+#define T_RUN (1<<1) /* Start run */
+#define T_REMDEVALL (1<<2) /* Remove all devs */
+#define T_REMDEV (1<<3) /* Remove one dev */
+
+/* Xmit modes */
+#define M_START_XMIT 0 /* Default normal TX */
+#define M_NETIF_RECEIVE 1 /* Inject packets into stack */
+#define M_QUEUE_XMIT 2 /* Inject packet into qdisc */
-/* If lock -- can be removed after some work */
-#define if_lock(t) spin_lock(&(t->if_lock));
-#define if_unlock(t) spin_unlock(&(t->if_lock));
+/* If lock -- protects updating of if_list */
+#define if_lock(t) mutex_lock(&(t->if_lock))
+#define if_unlock(t) mutex_unlock(&(t->if_lock))
/* Used to help with determining the pkts on receive */
#define PKTGEN_MAGIC 0xbe9be955
#define PG_PROC_DIR "pktgen"
#define PGCTRL "pgctrl"
-static struct proc_dir_entry *pg_proc_dir = NULL;
#define MAX_CFLOWS 65536
#define VLAN_TAG_SIZE(x) ((x)->vlan_id == 0xffff ? 0 : 4)
#define SVLAN_TAG_SIZE(x) ((x)->svlan_id == 0xffff ? 0 : 4)
+struct imix_pkt {
+ u64 size;
+ u64 weight;
+ u64 count_so_far;
+};
+
struct flow_state {
__be32 cur_daddr;
int count;
+#ifdef CONFIG_XFRM
+ struct xfrm_state *x;
+#endif
+ __u32 flags;
};
-struct pktgen_dev {
+/* flow flag bits */
+#define F_INIT (1<<0) /* flow has been initialized */
+struct pktgen_dev {
/*
* Try to keep frequent/infrequent used vars. separated.
*/
+ struct proc_dir_entry *entry; /* proc file */
+ struct pktgen_thread *pg_thread;/* the owner */
+ struct list_head list; /* chaining in the thread's run-queue */
+ struct rcu_head rcu; /* freed by RCU */
- char ifname[IFNAMSIZ];
- char result[512];
-
- struct pktgen_thread *pg_thread; /* the owner */
- struct list_head list; /* Used for chaining in the thread's run-queue */
-
- int running; /* if this changes to false, the test will stop */
+ int running; /* if false, the test will stop */
/* If min != max, then we will either do a linear iteration, or
* we will do a random selection from within the range.
*/
__u32 flags;
+ int xmit_mode;
+ int min_pkt_size;
+ int max_pkt_size;
+ int pkt_overhead; /* overhead for MPLS, VLANs, IPSEC etc */
+ int nfrags;
int removal_mark; /* non-zero => the device is marked for
- * removal by worker thread */
+ * removal by worker thread
+ */
+
+ struct page *page;
+ u64 delay; /* nano-seconds */
- int min_pkt_size; /* = ETH_ZLEN; */
- int max_pkt_size; /* = ETH_ZLEN; */
- int nfrags;
- __u32 delay_us; /* Default delay */
- __u32 delay_ns;
__u64 count; /* Default No packets to send */
__u64 sofar; /* How many pkts we've sent so far */
__u64 tx_bytes; /* How many bytes we've transmitted */
- __u64 errors; /* Errors when trying to transmit, pkts will be re-sent */
+ __u64 errors; /* Errors when trying to transmit, */
/* runtime counters relating to clone_skb */
- __u64 next_tx_us; /* timestamp of when to tx next */
- __u32 next_tx_ns;
- __u64 allocated_skbs;
__u32 clone_count;
int last_ok; /* Was last skb sent?
- * Or a failed transmit of some sort? This will keep
- * sequence numbers in order, for example.
+ * Or a failed transmit of some sort?
+ * This will keep sequence numbers in order
*/
- __u64 started_at; /* micro-seconds */
- __u64 stopped_at; /* micro-seconds */
- __u64 idle_acc; /* micro-seconds */
+ ktime_t next_tx;
+ ktime_t started_at;
+ ktime_t stopped_at;
+ u64 idle_acc; /* nano-seconds */
+
__u32 seq_num;
- int clone_skb; /* Use multiple SKBs during packet gen. If this number
- * is greater than 1, then that many copies of the same
- * packet will be sent before a new packet is allocated.
- * For instance, if you want to send 1024 identical packets
- * before creating a new packet, set clone_skb to 1024.
+ int clone_skb; /*
+ * Use multiple SKBs during packet gen.
+ * If this number is greater than 1, then
+ * that many copies of the same packet will be
+ * sent before a new packet is allocated.
+ * If you want to send 1024 identical packets
+ * before creating a new packet,
+ * set clone_skb to 1024.
*/
char dst_min[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */
@@ -293,11 +348,21 @@ struct pktgen_dev {
__u16 udp_dst_max; /* exclusive, dest UDP port */
/* DSCP + ECN */
- __u8 tos; /* six most significant bits of (former) IPv4 TOS are for dscp codepoint */
- __u8 traffic_class; /* ditto for the (former) Traffic Class in IPv6 (see RFC 3260, sec. 4) */
+ __u8 tos; /* six MSB of (former) IPv4 TOS
+ * are for dscp codepoint
+ */
+ __u8 traffic_class; /* ditto for the (former) Traffic Class in IPv6
+ * (see RFC 3260, sec. 4)
+ */
+
+ /* IMIX */
+ unsigned int n_imix_entries;
+ struct imix_pkt imix_entries[MAX_IMIX_ENTRIES];
+ /* Maps 0-IMIX_PRECISION range to imix_entry based on probability*/
+ __u8 imix_distribution[IMIX_PRECISION];
/* MPLS */
- unsigned nr_labels; /* Depth of stack, 0 = no MPLS */
+ unsigned int nr_labels; /* Depth of stack, 0 = no MPLS */
__be32 labels[MAX_MPLS_LABELS];
/* VLAN/SVLAN (802.1Q/Q-in-Q) */
@@ -319,34 +384,57 @@ struct pktgen_dev {
__u32 cur_src_mac_offset;
__be32 cur_saddr;
__be32 cur_daddr;
+ __u16 ip_id;
__u16 cur_udp_dst;
__u16 cur_udp_src;
+ __u16 cur_queue_map;
__u32 cur_pkt_size;
+ __u32 last_pkt_size;
__u8 hh[14];
/* = {
- 0x00, 0x80, 0xC8, 0x79, 0xB3, 0xCB,
-
- We fill in SRC address later
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x08, 0x00
- };
+ * 0x00, 0x80, 0xC8, 0x79, 0xB3, 0xCB,
+ *
+ * We fill in SRC address later
+ * 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ * 0x08, 0x00
+ * };
*/
__u16 pad; /* pad out the hh struct to an even 16 bytes */
- struct sk_buff *skb; /* skb we are to transmit next, mainly used for when we
+ struct sk_buff *skb; /* skb we are to transmit next, used for when we
* are transmitting the same one multiple times
*/
- struct net_device *odev; /* The out-going device. Note that the device should
- * have it's pg_info pointer pointing back to this
- * device. This will be set when the user specifies
- * the out-going device name (not when the inject is
- * started as it used to do.)
- */
+ struct net_device *odev; /* The out-going device.
+ * Note that the device should have it's
+ * pg_info pointer pointing back to this
+ * device.
+ * Set when the user specifies the out-going
+ * device name (not when the inject is
+ * started as it used to do.)
+ */
+ netdevice_tracker dev_tracker;
+ char odevname[32];
struct flow_state *flows;
- unsigned cflows; /* Concurrent flows (config) */
- unsigned lflow; /* Flow length (config) */
- unsigned nflows; /* accumulated flows (stats) */
+ unsigned int cflows; /* Concurrent flows (config) */
+ unsigned int lflow; /* Flow length (config) */
+ unsigned int nflows; /* accumulated flows (stats) */
+ unsigned int curfl; /* current sequenced flow (state)*/
+
+ u16 queue_map_min;
+ u16 queue_map_max;
+ __u32 skb_priority; /* skb priority field */
+ unsigned int burst; /* number of duplicated packets to burst */
+ int node; /* Memory node */
+
+#ifdef CONFIG_XFRM
+ __u8 ipsmode; /* IPSEC mode (config) */
+ __u8 ipsproto; /* IPSEC type (config) */
+ __u32 spi;
+ struct xfrm_dst xdst;
+ struct dst_ops dstops;
+#endif
+ char result[512];
};
struct pktgen_hdr {
@@ -356,297 +444,188 @@ struct pktgen_hdr {
__be32 tv_usec;
};
+
+static unsigned int pg_net_id __read_mostly;
+
+struct pktgen_net {
+ struct net *net;
+ struct proc_dir_entry *proc_dir;
+ struct list_head pktgen_threads;
+ bool pktgen_exiting;
+};
+
struct pktgen_thread {
- spinlock_t if_lock;
+ struct mutex if_lock; /* for list of devices */
struct list_head if_list; /* All device here */
struct list_head th_list;
- int removed;
- char name[32];
+ struct task_struct *tsk;
char result[512];
- u32 max_before_softirq; /* We'll call do_softirq to prevent starvation. */
- /* Field for thread to receive "posted" events terminate, stop ifs etc. */
+ /* Field for thread to receive "posted" events terminate,
+ * stop ifs etc.
+ */
u32 control;
- int pid;
int cpu;
wait_queue_head_t queue;
+ struct completion start_done;
+ struct pktgen_net *net;
};
#define REMOVE 1
#define FIND 0
-/* This code works around the fact that do_div cannot handle two 64-bit
- numbers, and regular 64-bit division doesn't work on x86 kernels.
- --Ben
-*/
-
-#define PG_DIV 0
-
-/* This was emailed to LMKL by: Chris Caputo <ccaputo@alt.net>
- * Function copied/adapted/optimized from:
- *
- * nemesis.sourceforge.net/browse/lib/static/intmath/ix86/intmath.c.html
- *
- * Copyright 1994, University of Cambridge Computer Laboratory
- * All Rights Reserved.
- *
- */
-static inline s64 divremdi3(s64 x, s64 y, int type)
-{
- u64 a = (x < 0) ? -x : x;
- u64 b = (y < 0) ? -y : y;
- u64 res = 0, d = 1;
-
- if (b > 0) {
- while (b < a) {
- b <<= 1;
- d <<= 1;
- }
- }
-
- do {
- if (a >= b) {
- a -= b;
- res += d;
- }
- b >>= 1;
- d >>= 1;
- }
- while (d);
-
- if (PG_DIV == type) {
- return (((x ^ y) & (1ll << 63)) == 0) ? res : -(s64) res;
- } else {
- return ((x & (1ll << 63)) == 0) ? a : -(s64) a;
- }
-}
-
-/* End of hacks to deal with 64-bit math on x86 */
-
-/** Convert to milliseconds */
-static inline __u64 tv_to_ms(const struct timeval *tv)
-{
- __u64 ms = tv->tv_usec / 1000;
- ms += (__u64) tv->tv_sec * (__u64) 1000;
- return ms;
-}
-
-/** Convert to micro-seconds */
-static inline __u64 tv_to_us(const struct timeval *tv)
-{
- __u64 us = tv->tv_usec;
- us += (__u64) tv->tv_sec * (__u64) 1000000;
- return us;
-}
-
-static inline __u64 pg_div(__u64 n, __u32 base)
-{
- __u64 tmp = n;
- do_div(tmp, base);
- /* printk("pktgen: pg_div, n: %llu base: %d rv: %llu\n",
- n, base, tmp); */
- return tmp;
-}
-
-static inline __u64 pg_div64(__u64 n, __u64 base)
-{
- __u64 tmp = n;
-/*
- * How do we know if the architecture we are running on
- * supports division with 64 bit base?
- *
- */
-#if defined(__sparc_v9__) || defined(__powerpc64__) || defined(__alpha__) || defined(__x86_64__) || defined(__ia64__)
-
- do_div(tmp, base);
-#else
- tmp = divremdi3(n, base, PG_DIV);
-#endif
- return tmp;
-}
-
-static inline u32 pktgen_random(void)
-{
-#if 0
- __u32 n;
- get_random_bytes(&n, 4);
- return n;
-#else
- return net_random();
-#endif
-}
-
-static inline __u64 getCurMs(void)
-{
- struct timeval tv;
- do_gettimeofday(&tv);
- return tv_to_ms(&tv);
-}
-
-static inline __u64 getCurUs(void)
-{
- struct timeval tv;
- do_gettimeofday(&tv);
- return tv_to_us(&tv);
-}
-
-static inline __u64 tv_diff(const struct timeval *a, const struct timeval *b)
-{
- return tv_to_us(a) - tv_to_us(b);
-}
-
-/* old include end */
-
-static char version[] __initdata = VERSION;
+static const char version[] =
+ "Packet Generator for packet performance testing. Version: " VERSION "\n";
static int pktgen_remove_device(struct pktgen_thread *t, struct pktgen_dev *i);
static int pktgen_add_device(struct pktgen_thread *t, const char *ifname);
static struct pktgen_dev *pktgen_find_dev(struct pktgen_thread *t,
- const char *ifname);
+ const char *ifname, bool exact);
static int pktgen_device_event(struct notifier_block *, unsigned long, void *);
-static void pktgen_run_all_threads(void);
-static void pktgen_stop_all_threads_ifs(void);
-static int pktgen_stop_device(struct pktgen_dev *pkt_dev);
+static void pktgen_run_all_threads(struct pktgen_net *pn);
+static void pktgen_reset_all_threads(struct pktgen_net *pn);
+static void pktgen_stop_all_threads(struct pktgen_net *pn);
+
static void pktgen_stop(struct pktgen_thread *t);
static void pktgen_clear_counters(struct pktgen_dev *pkt_dev);
-static int pktgen_mark_device(const char *ifname);
-static unsigned int scan_ip6(const char *s, char ip[16]);
-static unsigned int fmt_ip6(char *s, const char ip[16]);
+static void fill_imix_distribution(struct pktgen_dev *pkt_dev);
/* Module parameters, defaults. */
-static int pg_count_d = 1000; /* 1000 pkts by default */
-static int pg_delay_d;
-static int pg_clone_skb_d;
-static int debug;
+static int pg_count_d __read_mostly = 1000;
+static int pg_delay_d __read_mostly;
+static int pg_clone_skb_d __read_mostly;
+static int debug __read_mostly;
static DEFINE_MUTEX(pktgen_thread_lock);
-static LIST_HEAD(pktgen_threads);
static struct notifier_block pktgen_notifier_block = {
.notifier_call = pktgen_device_event,
};
/*
- * /proc handling functions
+ * /proc handling functions
*
*/
static int pgctrl_show(struct seq_file *seq, void *v)
{
- seq_puts(seq, VERSION);
+ seq_puts(seq, version);
return 0;
}
-static ssize_t pgctrl_write(struct file *file, const char __user * buf,
- size_t count, loff_t * ppos)
+static ssize_t pgctrl_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
{
- int err = 0;
char data[128];
+ size_t max;
+ struct pktgen_net *pn = net_generic(current->nsproxy->net_ns, pg_net_id);
- if (!capable(CAP_NET_ADMIN)) {
- err = -EPERM;
- goto out;
- }
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
- if (count > sizeof(data))
- count = sizeof(data);
+ if (count < 1)
+ return -EINVAL;
- if (copy_from_user(data, buf, count)) {
- err = -EFAULT;
- goto out;
- }
- data[count - 1] = 0; /* Make string */
+ max = min(count, sizeof(data) - 1);
+ if (copy_from_user(data, buf, max))
+ return -EFAULT;
- if (!strcmp(data, "stop"))
- pktgen_stop_all_threads_ifs();
+ if (data[max - 1] == '\n')
+ data[max - 1] = 0; /* strip trailing '\n', terminate string */
+ else
+ data[max] = 0; /* terminate string */
+ if (!strcmp(data, "stop"))
+ pktgen_stop_all_threads(pn);
else if (!strcmp(data, "start"))
- pktgen_run_all_threads();
-
+ pktgen_run_all_threads(pn);
+ else if (!strcmp(data, "reset"))
+ pktgen_reset_all_threads(pn);
else
- printk("pktgen: Unknown command: %s\n", data);
-
- err = count;
+ return -EINVAL;
-out:
- return err;
+ return count;
}
static int pgctrl_open(struct inode *inode, struct file *file)
{
- return single_open(file, pgctrl_show, PDE(inode)->data);
+ return single_open(file, pgctrl_show, pde_data(inode));
}
-static struct file_operations pktgen_fops = {
- .owner = THIS_MODULE,
- .open = pgctrl_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .write = pgctrl_write,
- .release = single_release,
+static const struct proc_ops pktgen_proc_ops = {
+ .proc_open = pgctrl_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_write = pgctrl_write,
+ .proc_release = single_release,
};
static int pktgen_if_show(struct seq_file *seq, void *v)
{
- int i;
- struct pktgen_dev *pkt_dev = seq->private;
- __u64 sa;
- __u64 stopped;
- __u64 now = getCurUs();
+ const struct pktgen_dev *pkt_dev = seq->private;
+ ktime_t stopped;
+ unsigned int i;
+ u64 idle;
seq_printf(seq,
"Params: count %llu min_pkt_size: %u max_pkt_size: %u\n",
(unsigned long long)pkt_dev->count, pkt_dev->min_pkt_size,
pkt_dev->max_pkt_size);
+ if (pkt_dev->n_imix_entries > 0) {
+ seq_puts(seq, " imix_weights: ");
+ for (i = 0; i < pkt_dev->n_imix_entries; i++) {
+ seq_printf(seq, "%llu,%llu ",
+ pkt_dev->imix_entries[i].size,
+ pkt_dev->imix_entries[i].weight);
+ }
+ seq_puts(seq, "\n");
+ }
+
seq_printf(seq,
- " frags: %d delay: %u clone_skb: %d ifname: %s\n",
- pkt_dev->nfrags,
- 1000 * pkt_dev->delay_us + pkt_dev->delay_ns,
- pkt_dev->clone_skb, pkt_dev->ifname);
+ " frags: %d delay: %llu clone_skb: %d ifname: %s\n",
+ pkt_dev->nfrags, (unsigned long long) pkt_dev->delay,
+ pkt_dev->clone_skb, pkt_dev->odevname);
seq_printf(seq, " flows: %u flowlen: %u\n", pkt_dev->cflows,
pkt_dev->lflow);
+ seq_printf(seq,
+ " queue_map_min: %u queue_map_max: %u\n",
+ pkt_dev->queue_map_min,
+ pkt_dev->queue_map_max);
+
+ if (pkt_dev->skb_priority)
+ seq_printf(seq, " skb_priority: %u\n",
+ pkt_dev->skb_priority);
+
if (pkt_dev->flags & F_IPV6) {
- char b1[128], b2[128], b3[128];
- fmt_ip6(b1, pkt_dev->in6_saddr.s6_addr);
- fmt_ip6(b2, pkt_dev->min_in6_saddr.s6_addr);
- fmt_ip6(b3, pkt_dev->max_in6_saddr.s6_addr);
seq_printf(seq,
- " saddr: %s min_saddr: %s max_saddr: %s\n", b1,
- b2, b3);
-
- fmt_ip6(b1, pkt_dev->in6_daddr.s6_addr);
- fmt_ip6(b2, pkt_dev->min_in6_daddr.s6_addr);
- fmt_ip6(b3, pkt_dev->max_in6_daddr.s6_addr);
+ " saddr: %pI6c min_saddr: %pI6c max_saddr: %pI6c\n"
+ " daddr: %pI6c min_daddr: %pI6c max_daddr: %pI6c\n",
+ &pkt_dev->in6_saddr,
+ &pkt_dev->min_in6_saddr, &pkt_dev->max_in6_saddr,
+ &pkt_dev->in6_daddr,
+ &pkt_dev->min_in6_daddr, &pkt_dev->max_in6_daddr);
+ } else {
seq_printf(seq,
- " daddr: %s min_daddr: %s max_daddr: %s\n", b1,
- b2, b3);
-
- } else
+ " dst_min: %s dst_max: %s\n",
+ pkt_dev->dst_min, pkt_dev->dst_max);
seq_printf(seq,
- " dst_min: %s dst_max: %s\n src_min: %s src_max: %s\n",
- pkt_dev->dst_min, pkt_dev->dst_max, pkt_dev->src_min,
- pkt_dev->src_max);
+ " src_min: %s src_max: %s\n",
+ pkt_dev->src_min, pkt_dev->src_max);
+ }
seq_puts(seq, " src_mac: ");
- if (is_zero_ether_addr(pkt_dev->src_mac))
- for (i = 0; i < 6; i++)
- seq_printf(seq, "%02X%s", pkt_dev->odev->dev_addr[i],
- i == 5 ? " " : ":");
- else
- for (i = 0; i < 6; i++)
- seq_printf(seq, "%02X%s", pkt_dev->src_mac[i],
- i == 5 ? " " : ":");
+ seq_printf(seq, "%pM ",
+ is_zero_ether_addr(pkt_dev->src_mac) ?
+ pkt_dev->odev->dev_addr : pkt_dev->src_mac);
- seq_printf(seq, "dst_mac: ");
- for (i = 0; i < 6; i++)
- seq_printf(seq, "%02X%s", pkt_dev->dst_mac[i],
- i == 5 ? "\n" : ":");
+ seq_puts(seq, "dst_mac: ");
+ seq_printf(seq, "%pM\n", pkt_dev->dst_mac);
seq_printf(seq,
" udp_src_min: %d udp_src_max: %d udp_dst_min: %d udp_dst_max: %d\n",
@@ -658,79 +637,86 @@ static int pktgen_if_show(struct seq_file *seq, void *v)
pkt_dev->src_mac_count, pkt_dev->dst_mac_count);
if (pkt_dev->nr_labels) {
- unsigned i;
- seq_printf(seq, " mpls: ");
- for(i = 0; i < pkt_dev->nr_labels; i++)
+ seq_puts(seq, " mpls: ");
+ for (i = 0; i < pkt_dev->nr_labels; i++)
seq_printf(seq, "%08x%s", ntohl(pkt_dev->labels[i]),
i == pkt_dev->nr_labels-1 ? "\n" : ", ");
}
- if (pkt_dev->vlan_id != 0xffff) {
+ if (pkt_dev->vlan_id != 0xffff)
seq_printf(seq, " vlan_id: %u vlan_p: %u vlan_cfi: %u\n",
- pkt_dev->vlan_id, pkt_dev->vlan_p, pkt_dev->vlan_cfi);
- }
+ pkt_dev->vlan_id, pkt_dev->vlan_p,
+ pkt_dev->vlan_cfi);
- if (pkt_dev->svlan_id != 0xffff) {
+ if (pkt_dev->svlan_id != 0xffff)
seq_printf(seq, " svlan_id: %u vlan_p: %u vlan_cfi: %u\n",
- pkt_dev->svlan_id, pkt_dev->svlan_p, pkt_dev->svlan_cfi);
- }
+ pkt_dev->svlan_id, pkt_dev->svlan_p,
+ pkt_dev->svlan_cfi);
- if (pkt_dev->tos) {
+ if (pkt_dev->tos)
seq_printf(seq, " tos: 0x%02x\n", pkt_dev->tos);
- }
- if (pkt_dev->traffic_class) {
+ if (pkt_dev->traffic_class)
seq_printf(seq, " traffic_class: 0x%02x\n", pkt_dev->traffic_class);
- }
-
- seq_printf(seq, " Flags: ");
-
- if (pkt_dev->flags & F_IPV6)
- seq_printf(seq, "IPV6 ");
- if (pkt_dev->flags & F_IPSRC_RND)
- seq_printf(seq, "IPSRC_RND ");
+ if (pkt_dev->burst > 1)
+ seq_printf(seq, " burst: %d\n", pkt_dev->burst);
- if (pkt_dev->flags & F_IPDST_RND)
- seq_printf(seq, "IPDST_RND ");
+ if (pkt_dev->node >= 0)
+ seq_printf(seq, " node: %d\n", pkt_dev->node);
- if (pkt_dev->flags & F_TXSIZE_RND)
- seq_printf(seq, "TXSIZE_RND ");
+ if (pkt_dev->xmit_mode == M_NETIF_RECEIVE)
+ seq_puts(seq, " xmit_mode: netif_receive\n");
+ else if (pkt_dev->xmit_mode == M_QUEUE_XMIT)
+ seq_puts(seq, " xmit_mode: xmit_queue\n");
- if (pkt_dev->flags & F_UDPSRC_RND)
- seq_printf(seq, "UDPSRC_RND ");
+ seq_puts(seq, " Flags: ");
- if (pkt_dev->flags & F_UDPDST_RND)
- seq_printf(seq, "UDPDST_RND ");
+ for (i = 0; i < NR_PKT_FLAGS; i++) {
+ if (i == FLOW_SEQ_SHIFT)
+ if (!pkt_dev->cflows)
+ continue;
- if (pkt_dev->flags & F_MPLS_RND)
- seq_printf(seq, "MPLS_RND ");
-
- if (pkt_dev->flags & F_MACSRC_RND)
- seq_printf(seq, "MACSRC_RND ");
+ if (pkt_dev->flags & (1 << i)) {
+ seq_printf(seq, "%s ", pkt_flag_names[i]);
+#ifdef CONFIG_XFRM
+ if (i == IPSEC_SHIFT && pkt_dev->spi)
+ seq_printf(seq, "spi:%u ", pkt_dev->spi);
+#endif
+ } else if (i == FLOW_SEQ_SHIFT) {
+ seq_puts(seq, "FLOW_RND ");
+ }
+ }
- if (pkt_dev->flags & F_MACDST_RND)
- seq_printf(seq, "MACDST_RND ");
+ seq_puts(seq, "\n");
- if (pkt_dev->flags & F_VID_RND)
- seq_printf(seq, "VID_RND ");
+ /* not really stopped, more like last-running-at */
+ stopped = pkt_dev->running ? ktime_get() : pkt_dev->stopped_at;
+ idle = pkt_dev->idle_acc;
+ do_div(idle, NSEC_PER_USEC);
- if (pkt_dev->flags & F_SVID_RND)
- seq_printf(seq, "SVID_RND ");
+ seq_printf(seq,
+ "Current:\n pkts-sofar: %llu errors: %llu\n",
+ (unsigned long long)pkt_dev->sofar,
+ (unsigned long long)pkt_dev->errors);
- seq_puts(seq, "\n");
+ if (pkt_dev->n_imix_entries > 0) {
+ int i;
- sa = pkt_dev->started_at;
- stopped = pkt_dev->stopped_at;
- if (pkt_dev->running)
- stopped = now; /* not really stopped, more like last-running-at */
+ seq_puts(seq, " imix_size_counts: ");
+ for (i = 0; i < pkt_dev->n_imix_entries; i++) {
+ seq_printf(seq, "%llu,%llu ",
+ pkt_dev->imix_entries[i].size,
+ pkt_dev->imix_entries[i].count_so_far);
+ }
+ seq_puts(seq, "\n");
+ }
seq_printf(seq,
- "Current:\n pkts-sofar: %llu errors: %llu\n started: %lluus stopped: %lluus idle: %lluus\n",
- (unsigned long long)pkt_dev->sofar,
- (unsigned long long)pkt_dev->errors, (unsigned long long)sa,
- (unsigned long long)stopped,
- (unsigned long long)pkt_dev->idle_acc);
+ " started: %lluus stopped: %lluus idle: %lluus\n",
+ (unsigned long long) ktime_to_us(pkt_dev->started_at),
+ (unsigned long long) ktime_to_us(stopped),
+ (unsigned long long) idle);
seq_printf(seq,
" seq_num: %d cur_dst_mac_offset: %d cur_src_mac_offset: %d\n",
@@ -738,57 +724,60 @@ static int pktgen_if_show(struct seq_file *seq, void *v)
pkt_dev->cur_src_mac_offset);
if (pkt_dev->flags & F_IPV6) {
- char b1[128], b2[128];
- fmt_ip6(b1, pkt_dev->cur_in6_daddr.s6_addr);
- fmt_ip6(b2, pkt_dev->cur_in6_saddr.s6_addr);
- seq_printf(seq, " cur_saddr: %s cur_daddr: %s\n", b2, b1);
+ seq_printf(seq, " cur_saddr: %pI6c cur_daddr: %pI6c\n",
+ &pkt_dev->cur_in6_saddr,
+ &pkt_dev->cur_in6_daddr);
} else
- seq_printf(seq, " cur_saddr: 0x%x cur_daddr: 0x%x\n",
- pkt_dev->cur_saddr, pkt_dev->cur_daddr);
+ seq_printf(seq, " cur_saddr: %pI4 cur_daddr: %pI4\n",
+ &pkt_dev->cur_saddr, &pkt_dev->cur_daddr);
seq_printf(seq, " cur_udp_dst: %d cur_udp_src: %d\n",
pkt_dev->cur_udp_dst, pkt_dev->cur_udp_src);
+ seq_printf(seq, " cur_queue_map: %u\n", pkt_dev->cur_queue_map);
+
seq_printf(seq, " flows: %u\n", pkt_dev->nflows);
if (pkt_dev->result[0])
seq_printf(seq, "Result: %s\n", pkt_dev->result);
else
- seq_printf(seq, "Result: Idle\n");
+ seq_puts(seq, "Result: Idle\n");
return 0;
}
-static int hex32_arg(const char __user *user_buffer, unsigned long maxlen, __u32 *num)
+static ssize_t hex32_arg(const char __user *user_buffer, size_t maxlen,
+ __u32 *num)
{
- int i = 0;
+ size_t i = 0;
+
*num = 0;
- for(; i < maxlen; i++) {
+ for (; i < maxlen; i++) {
+ int value;
char c;
- *num <<= 4;
+
if (get_user(c, &user_buffer[i]))
return -EFAULT;
- if ((c >= '0') && (c <= '9'))
- *num |= c - '0';
- else if ((c >= 'a') && (c <= 'f'))
- *num |= c - 'a' + 10;
- else if ((c >= 'A') && (c <= 'F'))
- *num |= c - 'A' + 10;
- else
+ value = hex_to_bin(c);
+ if (value >= 0) {
+ *num <<= 4;
+ *num |= value;
+ } else {
break;
+ }
}
return i;
}
-static int count_trail_chars(const char __user * user_buffer,
- unsigned int maxlen)
+static ssize_t count_trail_chars(const char __user *user_buffer, size_t maxlen)
{
- int i;
+ size_t i;
for (i = 0; i < maxlen; i++) {
char c;
+
if (get_user(c, &user_buffer[i]))
return -EFAULT;
switch (c) {
@@ -801,20 +790,21 @@ static int count_trail_chars(const char __user * user_buffer,
break;
default:
goto done;
- };
+ }
}
done:
return i;
}
-static unsigned long num_arg(const char __user * user_buffer,
- unsigned long maxlen, unsigned long *num)
+static ssize_t num_arg(const char __user *user_buffer, size_t maxlen,
+ unsigned long *num)
{
- int i = 0;
+ size_t i;
*num = 0;
- for (; i < maxlen; i++) {
+ for (i = 0; i < maxlen; i++) {
char c;
+
if (get_user(c, &user_buffer[i]))
return -EFAULT;
if ((c >= '0') && (c <= '9')) {
@@ -826,12 +816,13 @@ static unsigned long num_arg(const char __user * user_buffer,
return i;
}
-static int strn_len(const char __user * user_buffer, unsigned int maxlen)
+static ssize_t strn_len(const char __user *user_buffer, size_t maxlen)
{
- int i = 0;
+ size_t i;
- for (; i < maxlen; i++) {
+ for (i = 0; i < maxlen; i++) {
char c;
+
if (get_user(c, &user_buffer[i]))
return -EFAULT;
switch (c) {
@@ -840,79 +831,189 @@ static int strn_len(const char __user * user_buffer, unsigned int maxlen)
case '\r':
case '\t':
case ' ':
+ case '=':
goto done_str;
- break;
default:
break;
- };
+ }
}
done_str:
return i;
}
-static ssize_t get_labels(const char __user *buffer, struct pktgen_dev *pkt_dev)
+/* Parses imix entries from user buffer.
+ * The user buffer should consist of imix entries separated by spaces
+ * where each entry consists of size and weight delimited by commas.
+ * "size1,weight_1 size2,weight_2 ... size_n,weight_n" for example.
+ */
+static ssize_t get_imix_entries(const char __user *buffer,
+ size_t maxlen,
+ struct pktgen_dev *pkt_dev)
+{
+ size_t i = 0, max;
+ ssize_t len;
+ char c;
+
+ pkt_dev->n_imix_entries = 0;
+
+ do {
+ unsigned long weight;
+ unsigned long size;
+
+ if (pkt_dev->n_imix_entries >= MAX_IMIX_ENTRIES)
+ return -E2BIG;
+
+ if (i >= maxlen)
+ return -EINVAL;
+
+ max = min(10, maxlen - i);
+ len = num_arg(&buffer[i], max, &size);
+ if (len < 0)
+ return len;
+ i += len;
+ if (i >= maxlen)
+ return -EINVAL;
+ if (get_user(c, &buffer[i]))
+ return -EFAULT;
+ /* Check for comma between size_i and weight_i */
+ if (c != ',')
+ return -EINVAL;
+ i++;
+ if (i >= maxlen)
+ return -EINVAL;
+
+ if (size < 14 + 20 + 8)
+ size = 14 + 20 + 8;
+
+ max = min(10, maxlen - i);
+ len = num_arg(&buffer[i], max, &weight);
+ if (len < 0)
+ return len;
+ if (weight <= 0)
+ return -EINVAL;
+
+ pkt_dev->imix_entries[pkt_dev->n_imix_entries].size = size;
+ pkt_dev->imix_entries[pkt_dev->n_imix_entries].weight = weight;
+
+ i += len;
+ pkt_dev->n_imix_entries++;
+
+ if (i >= maxlen)
+ break;
+ if (get_user(c, &buffer[i]))
+ return -EFAULT;
+ i++;
+ } while (c == ' ');
+
+ return i;
+}
+
+static ssize_t get_labels(const char __user *buffer,
+ size_t maxlen, struct pktgen_dev *pkt_dev)
{
- unsigned n = 0;
+ unsigned int n = 0;
+ size_t i = 0, max;
+ ssize_t len;
char c;
- ssize_t i = 0;
- int len;
pkt_dev->nr_labels = 0;
do {
__u32 tmp;
- len = hex32_arg(&buffer[i], 8, &tmp);
- if (len <= 0)
+
+ if (n >= MAX_MPLS_LABELS)
+ return -E2BIG;
+
+ if (i >= maxlen)
+ return -EINVAL;
+
+ max = min(8, maxlen - i);
+ len = hex32_arg(&buffer[i], max, &tmp);
+ if (len < 0)
return len;
+
+ /* return empty list in case of invalid input or zero value */
+ if (len == 0 || tmp == 0)
+ return maxlen;
+
pkt_dev->labels[n] = htonl(tmp);
if (pkt_dev->labels[n] & MPLS_STACK_BOTTOM)
pkt_dev->flags |= F_MPLS_RND;
i += len;
+ n++;
+ if (i >= maxlen)
+ break;
if (get_user(c, &buffer[i]))
return -EFAULT;
i++;
- n++;
- if (n >= MAX_MPLS_LABELS)
- return -E2BIG;
- } while(c == ',');
+ } while (c == ',');
pkt_dev->nr_labels = n;
return i;
}
+static __u32 pktgen_read_flag(const char *f, bool *disable)
+{
+ __u32 i;
+
+ if (f[0] == '!') {
+ *disable = true;
+ f++;
+ }
+
+ for (i = 0; i < NR_PKT_FLAGS; i++) {
+ if (!IS_ENABLED(CONFIG_XFRM) && i == IPSEC_SHIFT)
+ continue;
+
+ /* allow only disabling ipv6 flag */
+ if (!*disable && i == IPV6_SHIFT)
+ continue;
+
+ if (strcmp(f, pkt_flag_names[i]) == 0)
+ return 1 << i;
+ }
+
+ if (strcmp(f, "FLOW_RND") == 0) {
+ *disable = !*disable;
+ return F_FLOW_SEQ;
+ }
+
+ return 0;
+}
+
static ssize_t pktgen_if_write(struct file *file,
- const char __user * user_buffer, size_t count,
- loff_t * offset)
+ const char __user *user_buffer, size_t count,
+ loff_t *offset)
{
- struct seq_file *seq = (struct seq_file *)file->private_data;
+ struct seq_file *seq = file->private_data;
struct pktgen_dev *pkt_dev = seq->private;
- int i = 0, max, len;
+ size_t i, max;
+ ssize_t len;
char name[16], valstr[32];
unsigned long value = 0;
char *pg_result = NULL;
- int tmp = 0;
char buf[128];
pg_result = &(pkt_dev->result[0]);
if (count < 1) {
- printk("pktgen: wrong command format\n");
+ pr_warn("wrong command format\n");
return -EINVAL;
}
- max = count - i;
- tmp = count_trail_chars(&user_buffer[i], max);
- if (tmp < 0) {
- printk("pktgen: illegal format\n");
- return tmp;
+ max = count;
+ len = count_trail_chars(user_buffer, max);
+ if (len < 0) {
+ pr_warn("illegal format\n");
+ return len;
}
- i += tmp;
+ i = len;
/* Read variable name */
-
- len = strn_len(&user_buffer[i], sizeof(name) - 1);
- if (len < 0) {
+ max = min(sizeof(name) - 1, count - i);
+ len = strn_len(&user_buffer[i], max);
+ if (len < 0)
return len;
- }
+
memset(name, 0, sizeof(name));
if (copy_from_user(name, &user_buffer[i], len))
return -EFAULT;
@@ -926,44 +1027,46 @@ static ssize_t pktgen_if_write(struct file *file,
i += len;
if (debug) {
- char tb[count + 1];
- if (copy_from_user(tb, user_buffer, count))
- return -EFAULT;
- tb[count] = 0;
- printk("pktgen: %s,%lu buffer -:%s:-\n", name,
- (unsigned long)count, tb);
+ size_t copy = min_t(size_t, count + 1, 1024);
+ char *tp = strndup_user(user_buffer, copy);
+
+ if (IS_ERR(tp))
+ return PTR_ERR(tp);
+
+ pr_debug("%s,%zu buffer -:%s:-\n", name, count, tp);
+ kfree(tp);
}
if (!strcmp(name, "min_pkt_size")) {
- len = num_arg(&user_buffer[i], 10, &value);
- if (len < 0) {
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
+ if (len < 0)
return len;
- }
- i += len;
+
if (value < 14 + 20 + 8)
value = 14 + 20 + 8;
if (value != pkt_dev->min_pkt_size) {
pkt_dev->min_pkt_size = value;
pkt_dev->cur_pkt_size = value;
}
- sprintf(pg_result, "OK: min_pkt_size=%u",
+ sprintf(pg_result, "OK: min_pkt_size=%d",
pkt_dev->min_pkt_size);
return count;
}
if (!strcmp(name, "max_pkt_size")) {
- len = num_arg(&user_buffer[i], 10, &value);
- if (len < 0) {
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
+ if (len < 0)
return len;
- }
- i += len;
+
if (value < 14 + 20 + 8)
value = 14 + 20 + 8;
if (value != pkt_dev->max_pkt_size) {
pkt_dev->max_pkt_size = value;
pkt_dev->cur_pkt_size = value;
}
- sprintf(pg_result, "OK: max_pkt_size=%u",
+ sprintf(pg_result, "OK: max_pkt_size=%d",
pkt_dev->max_pkt_size);
return count;
}
@@ -971,11 +1074,11 @@ static ssize_t pktgen_if_write(struct file *file,
/* Shortcut for min = max */
if (!strcmp(name, "pkt_size")) {
- len = num_arg(&user_buffer[i], 10, &value);
- if (len < 0) {
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
+ if (len < 0)
return len;
- }
- i += len;
+
if (value < 14 + 20 + 8)
value = 14 + 20 + 8;
if (value != pkt_dev->min_pkt_size) {
@@ -983,54 +1086,96 @@ static ssize_t pktgen_if_write(struct file *file,
pkt_dev->max_pkt_size = value;
pkt_dev->cur_pkt_size = value;
}
- sprintf(pg_result, "OK: pkt_size=%u", pkt_dev->min_pkt_size);
+ sprintf(pg_result, "OK: pkt_size=%d", pkt_dev->min_pkt_size);
+ return count;
+ }
+
+ if (!strcmp(name, "imix_weights")) {
+ if (pkt_dev->clone_skb > 0)
+ return -EINVAL;
+
+ max = count - i;
+ len = get_imix_entries(&user_buffer[i], max, pkt_dev);
+ if (len < 0)
+ return len;
+
+ fill_imix_distribution(pkt_dev);
+
return count;
}
if (!strcmp(name, "debug")) {
- len = num_arg(&user_buffer[i], 10, &value);
- if (len < 0) {
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
+ if (len < 0)
return len;
- }
- i += len;
+
debug = value;
sprintf(pg_result, "OK: debug=%u", debug);
return count;
}
if (!strcmp(name, "frags")) {
- len = num_arg(&user_buffer[i], 10, &value);
- if (len < 0) {
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
+ if (len < 0)
return len;
- }
- i += len;
+
pkt_dev->nfrags = value;
- sprintf(pg_result, "OK: frags=%u", pkt_dev->nfrags);
+ sprintf(pg_result, "OK: frags=%d", pkt_dev->nfrags);
return count;
}
if (!strcmp(name, "delay")) {
- len = num_arg(&user_buffer[i], 10, &value);
- if (len < 0) {
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
+ if (len < 0)
return len;
- }
- i += len;
- if (value == 0x7FFFFFFF) {
- pkt_dev->delay_us = 0x7FFFFFFF;
- pkt_dev->delay_ns = 0;
- } else {
- pkt_dev->delay_us = value / 1000;
- pkt_dev->delay_ns = value % 1000;
- }
- sprintf(pg_result, "OK: delay=%u",
- 1000 * pkt_dev->delay_us + pkt_dev->delay_ns);
+
+ if (value == 0x7FFFFFFF)
+ pkt_dev->delay = ULLONG_MAX;
+ else
+ pkt_dev->delay = (u64)value;
+
+ sprintf(pg_result, "OK: delay=%llu",
+ (unsigned long long) pkt_dev->delay);
+ return count;
+ }
+ if (!strcmp(name, "rate")) {
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
+ if (len < 0)
+ return len;
+
+ if (!value)
+ return -EINVAL;
+ pkt_dev->delay = pkt_dev->min_pkt_size*8*NSEC_PER_USEC/value;
+ if (debug)
+ pr_info("Delay set at: %llu ns\n", pkt_dev->delay);
+
+ sprintf(pg_result, "OK: rate=%lu", value);
+ return count;
+ }
+ if (!strcmp(name, "ratep")) {
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
+ if (len < 0)
+ return len;
+
+ if (!value)
+ return -EINVAL;
+ pkt_dev->delay = NSEC_PER_SEC/value;
+ if (debug)
+ pr_info("Delay set at: %llu ns\n", pkt_dev->delay);
+
+ sprintf(pg_result, "OK: rate=%lu", value);
return count;
}
if (!strcmp(name, "udp_src_min")) {
- len = num_arg(&user_buffer[i], 10, &value);
- if (len < 0) {
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
+ if (len < 0)
return len;
- }
- i += len;
+
if (value != pkt_dev->udp_src_min) {
pkt_dev->udp_src_min = value;
pkt_dev->cur_udp_src = value;
@@ -1039,11 +1184,11 @@ static ssize_t pktgen_if_write(struct file *file,
return count;
}
if (!strcmp(name, "udp_dst_min")) {
- len = num_arg(&user_buffer[i], 10, &value);
- if (len < 0) {
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
+ if (len < 0)
return len;
- }
- i += len;
+
if (value != pkt_dev->udp_dst_min) {
pkt_dev->udp_dst_min = value;
pkt_dev->cur_udp_dst = value;
@@ -1052,11 +1197,11 @@ static ssize_t pktgen_if_write(struct file *file,
return count;
}
if (!strcmp(name, "udp_src_max")) {
- len = num_arg(&user_buffer[i], 10, &value);
- if (len < 0) {
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
+ if (len < 0)
return len;
- }
- i += len;
+
if (value != pkt_dev->udp_src_max) {
pkt_dev->udp_src_max = value;
pkt_dev->cur_udp_src = value;
@@ -1065,11 +1210,11 @@ static ssize_t pktgen_if_write(struct file *file,
return count;
}
if (!strcmp(name, "udp_dst_max")) {
- len = num_arg(&user_buffer[i], 10, &value);
- if (len < 0) {
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
+ if (len < 0)
return len;
- }
- i += len;
+
if (value != pkt_dev->udp_dst_max) {
pkt_dev->udp_dst_max = value;
pkt_dev->cur_udp_dst = value;
@@ -1078,33 +1223,43 @@ static ssize_t pktgen_if_write(struct file *file,
return count;
}
if (!strcmp(name, "clone_skb")) {
- len = num_arg(&user_buffer[i], 10, &value);
- if (len < 0) {
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
+ if (len < 0)
return len;
- }
- i += len;
+ /* clone_skb is not supported for netif_receive xmit_mode and
+ * IMIX mode.
+ */
+ if ((value > 0) &&
+ ((pkt_dev->xmit_mode == M_NETIF_RECEIVE) ||
+ !(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING)))
+ return -EOPNOTSUPP;
+ if (value > 0 && (pkt_dev->n_imix_entries > 0 ||
+ !(pkt_dev->flags & F_SHARED)))
+ return -EINVAL;
+
pkt_dev->clone_skb = value;
sprintf(pg_result, "OK: clone_skb=%d", pkt_dev->clone_skb);
return count;
}
if (!strcmp(name, "count")) {
- len = num_arg(&user_buffer[i], 10, &value);
- if (len < 0) {
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
+ if (len < 0)
return len;
- }
- i += len;
+
pkt_dev->count = value;
sprintf(pg_result, "OK: count=%llu",
(unsigned long long)pkt_dev->count);
return count;
}
if (!strcmp(name, "src_mac_count")) {
- len = num_arg(&user_buffer[i], 10, &value);
- if (len < 0) {
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
+ if (len < 0)
return len;
- }
- i += len;
+
if (pkt_dev->src_mac_count != value) {
pkt_dev->src_mac_count = value;
pkt_dev->cur_src_mac_offset = 0;
@@ -1114,11 +1269,11 @@ static ssize_t pktgen_if_write(struct file *file,
return count;
}
if (!strcmp(name, "dst_mac_count")) {
- len = num_arg(&user_buffer[i], 10, &value);
- if (len < 0) {
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
+ if (len < 0)
return len;
- }
- i += len;
+
if (pkt_dev->dst_mac_count != value) {
pkt_dev->dst_mac_count = value;
pkt_dev->cur_dst_mac_offset = 0;
@@ -1127,137 +1282,177 @@ static ssize_t pktgen_if_write(struct file *file,
pkt_dev->dst_mac_count);
return count;
}
- if (!strcmp(name, "flag")) {
- char f[32];
- memset(f, 0, 32);
- len = strn_len(&user_buffer[i], sizeof(f) - 1);
- if (len < 0) {
+ if (!strcmp(name, "burst")) {
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
+ if (len < 0)
return len;
- }
- if (copy_from_user(f, &user_buffer[i], len))
- return -EFAULT;
- i += len;
- if (strcmp(f, "IPSRC_RND") == 0)
- pkt_dev->flags |= F_IPSRC_RND;
-
- else if (strcmp(f, "!IPSRC_RND") == 0)
- pkt_dev->flags &= ~F_IPSRC_RND;
- else if (strcmp(f, "TXSIZE_RND") == 0)
- pkt_dev->flags |= F_TXSIZE_RND;
+ if ((value > 1) &&
+ ((pkt_dev->xmit_mode == M_QUEUE_XMIT) ||
+ ((pkt_dev->xmit_mode == M_START_XMIT) &&
+ (!(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING)))))
+ return -EOPNOTSUPP;
- else if (strcmp(f, "!TXSIZE_RND") == 0)
- pkt_dev->flags &= ~F_TXSIZE_RND;
+ if (value > 1 && !(pkt_dev->flags & F_SHARED))
+ return -EINVAL;
- else if (strcmp(f, "IPDST_RND") == 0)
- pkt_dev->flags |= F_IPDST_RND;
-
- else if (strcmp(f, "!IPDST_RND") == 0)
- pkt_dev->flags &= ~F_IPDST_RND;
-
- else if (strcmp(f, "UDPSRC_RND") == 0)
- pkt_dev->flags |= F_UDPSRC_RND;
-
- else if (strcmp(f, "!UDPSRC_RND") == 0)
- pkt_dev->flags &= ~F_UDPSRC_RND;
-
- else if (strcmp(f, "UDPDST_RND") == 0)
- pkt_dev->flags |= F_UDPDST_RND;
-
- else if (strcmp(f, "!UDPDST_RND") == 0)
- pkt_dev->flags &= ~F_UDPDST_RND;
-
- else if (strcmp(f, "MACSRC_RND") == 0)
- pkt_dev->flags |= F_MACSRC_RND;
-
- else if (strcmp(f, "!MACSRC_RND") == 0)
- pkt_dev->flags &= ~F_MACSRC_RND;
+ pkt_dev->burst = value < 1 ? 1 : value;
+ sprintf(pg_result, "OK: burst=%u", pkt_dev->burst);
+ return count;
+ }
+ if (!strcmp(name, "node")) {
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
+ if (len < 0)
+ return len;
- else if (strcmp(f, "MACDST_RND") == 0)
- pkt_dev->flags |= F_MACDST_RND;
+ if (node_possible(value)) {
+ pkt_dev->node = value;
+ sprintf(pg_result, "OK: node=%d", pkt_dev->node);
+ if (pkt_dev->page) {
+ put_page(pkt_dev->page);
+ pkt_dev->page = NULL;
+ }
+ } else {
+ sprintf(pg_result, "ERROR: node not possible");
+ }
+ return count;
+ }
+ if (!strcmp(name, "xmit_mode")) {
+ char f[32];
- else if (strcmp(f, "!MACDST_RND") == 0)
- pkt_dev->flags &= ~F_MACDST_RND;
+ max = min(sizeof(f) - 1, count - i);
+ len = strn_len(&user_buffer[i], max);
+ if (len < 0)
+ return len;
- else if (strcmp(f, "MPLS_RND") == 0)
- pkt_dev->flags |= F_MPLS_RND;
+ memset(f, 0, sizeof(f));
+ if (copy_from_user(f, &user_buffer[i], len))
+ return -EFAULT;
- else if (strcmp(f, "!MPLS_RND") == 0)
- pkt_dev->flags &= ~F_MPLS_RND;
+ if (strcmp(f, "start_xmit") == 0) {
+ pkt_dev->xmit_mode = M_START_XMIT;
+ } else if (strcmp(f, "netif_receive") == 0) {
+ /* clone_skb set earlier, not supported in this mode */
+ if (pkt_dev->clone_skb > 0)
+ return -EOPNOTSUPP;
- else if (strcmp(f, "VID_RND") == 0)
- pkt_dev->flags |= F_VID_RND;
+ pkt_dev->xmit_mode = M_NETIF_RECEIVE;
- else if (strcmp(f, "!VID_RND") == 0)
- pkt_dev->flags &= ~F_VID_RND;
+ /* make sure new packet is allocated every time
+ * pktgen_xmit() is called
+ */
+ pkt_dev->last_ok = 1;
+ } else if (strcmp(f, "queue_xmit") == 0) {
+ pkt_dev->xmit_mode = M_QUEUE_XMIT;
+ pkt_dev->last_ok = 1;
+ } else {
+ sprintf(pg_result,
+ "xmit_mode -:%s:- unknown\nAvailable modes: %s",
+ f, "start_xmit, netif_receive\n");
+ return count;
+ }
+ sprintf(pg_result, "OK: xmit_mode=%s", f);
+ return count;
+ }
+ if (!strcmp(name, "flag")) {
+ bool disable = false;
+ __u32 flag;
+ char f[32];
+ char *end;
- else if (strcmp(f, "SVID_RND") == 0)
- pkt_dev->flags |= F_SVID_RND;
+ max = min(sizeof(f) - 1, count - i);
+ len = strn_len(&user_buffer[i], max);
+ if (len < 0)
+ return len;
- else if (strcmp(f, "!SVID_RND") == 0)
- pkt_dev->flags &= ~F_SVID_RND;
+ memset(f, 0, 32);
+ if (copy_from_user(f, &user_buffer[i], len))
+ return -EFAULT;
- else if (strcmp(f, "!IPV6") == 0)
- pkt_dev->flags &= ~F_IPV6;
+ flag = pktgen_read_flag(f, &disable);
+ if (flag) {
+ if (disable) {
+ /* If "clone_skb", or "burst" parameters are
+ * configured, it means that the skb still
+ * needs to be referenced by the pktgen, so
+ * the skb must be shared.
+ */
+ if (flag == F_SHARED && (pkt_dev->clone_skb ||
+ pkt_dev->burst > 1))
+ return -EINVAL;
+ pkt_dev->flags &= ~flag;
+ } else {
+ pkt_dev->flags |= flag;
+ }
- else {
- sprintf(pg_result,
- "Flag -:%s:- unknown\nAvailable flags, (prepend ! to un-set flag):\n%s",
- f,
- "IPSRC_RND, IPDST_RND, UDPSRC_RND, UDPDST_RND, "
- "MACSRC_RND, MACDST_RND, TXSIZE_RND, IPV6, MPLS_RND, VID_RND, SVID_RND\n");
+ sprintf(pg_result, "OK: flags=0x%x", pkt_dev->flags);
return count;
}
- sprintf(pg_result, "OK: flags=0x%x", pkt_dev->flags);
+
+ /* Unknown flag */
+ end = pkt_dev->result + sizeof(pkt_dev->result);
+ pg_result += sprintf(pg_result,
+ "Flag -:%s:- unknown\n"
+ "Available flags, (prepend ! to un-set flag):\n", f);
+
+ for (int n = 0; n < NR_PKT_FLAGS && pg_result < end; n++) {
+ if (!IS_ENABLED(CONFIG_XFRM) && n == IPSEC_SHIFT)
+ continue;
+ pg_result += snprintf(pg_result, end - pg_result,
+ "%s, ", pkt_flag_names[n]);
+ }
+ if (!WARN_ON_ONCE(pg_result >= end)) {
+ /* Remove the comma and whitespace at the end */
+ *(pg_result - 2) = '\0';
+ }
+
return count;
}
if (!strcmp(name, "dst_min") || !strcmp(name, "dst")) {
- len = strn_len(&user_buffer[i], sizeof(pkt_dev->dst_min) - 1);
- if (len < 0) {
+ max = min(sizeof(pkt_dev->dst_min) - 1, count - i);
+ len = strn_len(&user_buffer[i], max);
+ if (len < 0)
return len;
- }
if (copy_from_user(buf, &user_buffer[i], len))
return -EFAULT;
buf[len] = 0;
if (strcmp(buf, pkt_dev->dst_min) != 0) {
- memset(pkt_dev->dst_min, 0, sizeof(pkt_dev->dst_min));
- strncpy(pkt_dev->dst_min, buf, len);
+ strscpy_pad(pkt_dev->dst_min, buf);
pkt_dev->daddr_min = in_aton(pkt_dev->dst_min);
pkt_dev->cur_daddr = pkt_dev->daddr_min;
}
if (debug)
- printk("pktgen: dst_min set to: %s\n",
- pkt_dev->dst_min);
- i += len;
+ pr_debug("dst_min set to: %s\n", pkt_dev->dst_min);
+
sprintf(pg_result, "OK: dst_min=%s", pkt_dev->dst_min);
return count;
}
if (!strcmp(name, "dst_max")) {
- len = strn_len(&user_buffer[i], sizeof(pkt_dev->dst_max) - 1);
- if (len < 0) {
+ max = min(sizeof(pkt_dev->dst_max) - 1, count - i);
+ len = strn_len(&user_buffer[i], max);
+ if (len < 0)
return len;
- }
if (copy_from_user(buf, &user_buffer[i], len))
return -EFAULT;
-
buf[len] = 0;
if (strcmp(buf, pkt_dev->dst_max) != 0) {
- memset(pkt_dev->dst_max, 0, sizeof(pkt_dev->dst_max));
- strncpy(pkt_dev->dst_max, buf, len);
+ strscpy_pad(pkt_dev->dst_max, buf);
pkt_dev->daddr_max = in_aton(pkt_dev->dst_max);
pkt_dev->cur_daddr = pkt_dev->daddr_max;
}
if (debug)
- printk("pktgen: dst_max set to: %s\n",
- pkt_dev->dst_max);
- i += len;
+ pr_debug("dst_max set to: %s\n", pkt_dev->dst_max);
+
sprintf(pg_result, "OK: dst_max=%s", pkt_dev->dst_max);
return count;
}
if (!strcmp(name, "dst6")) {
- len = strn_len(&user_buffer[i], sizeof(buf) - 1);
+ max = min(sizeof(buf) - 1, count - i);
+ len = strn_len(&user_buffer[i], max);
if (len < 0)
return len;
@@ -1267,20 +1462,20 @@ static ssize_t pktgen_if_write(struct file *file,
return -EFAULT;
buf[len] = 0;
- scan_ip6(buf, pkt_dev->in6_daddr.s6_addr);
- fmt_ip6(buf, pkt_dev->in6_daddr.s6_addr);
+ in6_pton(buf, -1, pkt_dev->in6_daddr.s6_addr, -1, NULL);
+ snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->in6_daddr);
- ipv6_addr_copy(&pkt_dev->cur_in6_daddr, &pkt_dev->in6_daddr);
+ pkt_dev->cur_in6_daddr = pkt_dev->in6_daddr;
if (debug)
- printk("pktgen: dst6 set to: %s\n", buf);
+ pr_debug("dst6 set to: %s\n", buf);
- i += len;
sprintf(pg_result, "OK: dst6=%s", buf);
return count;
}
if (!strcmp(name, "dst6_min")) {
- len = strn_len(&user_buffer[i], sizeof(buf) - 1);
+ max = min(sizeof(buf) - 1, count - i);
+ len = strn_len(&user_buffer[i], max);
if (len < 0)
return len;
@@ -1290,20 +1485,19 @@ static ssize_t pktgen_if_write(struct file *file,
return -EFAULT;
buf[len] = 0;
- scan_ip6(buf, pkt_dev->min_in6_daddr.s6_addr);
- fmt_ip6(buf, pkt_dev->min_in6_daddr.s6_addr);
+ in6_pton(buf, -1, pkt_dev->min_in6_daddr.s6_addr, -1, NULL);
+ snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->min_in6_daddr);
- ipv6_addr_copy(&pkt_dev->cur_in6_daddr,
- &pkt_dev->min_in6_daddr);
+ pkt_dev->cur_in6_daddr = pkt_dev->min_in6_daddr;
if (debug)
- printk("pktgen: dst6_min set to: %s\n", buf);
+ pr_debug("dst6_min set to: %s\n", buf);
- i += len;
sprintf(pg_result, "OK: dst6_min=%s", buf);
return count;
}
if (!strcmp(name, "dst6_max")) {
- len = strn_len(&user_buffer[i], sizeof(buf) - 1);
+ max = min(sizeof(buf) - 1, count - i);
+ len = strn_len(&user_buffer[i], max);
if (len < 0)
return len;
@@ -1313,18 +1507,18 @@ static ssize_t pktgen_if_write(struct file *file,
return -EFAULT;
buf[len] = 0;
- scan_ip6(buf, pkt_dev->max_in6_daddr.s6_addr);
- fmt_ip6(buf, pkt_dev->max_in6_daddr.s6_addr);
+ in6_pton(buf, -1, pkt_dev->max_in6_daddr.s6_addr, -1, NULL);
+ snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->max_in6_daddr);
if (debug)
- printk("pktgen: dst6_max set to: %s\n", buf);
+ pr_debug("dst6_max set to: %s\n", buf);
- i += len;
sprintf(pg_result, "OK: dst6_max=%s", buf);
return count;
}
if (!strcmp(name, "src6")) {
- len = strn_len(&user_buffer[i], sizeof(buf) - 1);
+ max = min(sizeof(buf) - 1, count - i);
+ len = strn_len(&user_buffer[i], max);
if (len < 0)
return len;
@@ -1334,134 +1528,91 @@ static ssize_t pktgen_if_write(struct file *file,
return -EFAULT;
buf[len] = 0;
- scan_ip6(buf, pkt_dev->in6_saddr.s6_addr);
- fmt_ip6(buf, pkt_dev->in6_saddr.s6_addr);
+ in6_pton(buf, -1, pkt_dev->in6_saddr.s6_addr, -1, NULL);
+ snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->in6_saddr);
- ipv6_addr_copy(&pkt_dev->cur_in6_saddr, &pkt_dev->in6_saddr);
+ pkt_dev->cur_in6_saddr = pkt_dev->in6_saddr;
if (debug)
- printk("pktgen: src6 set to: %s\n", buf);
+ pr_debug("src6 set to: %s\n", buf);
- i += len;
sprintf(pg_result, "OK: src6=%s", buf);
return count;
}
if (!strcmp(name, "src_min")) {
- len = strn_len(&user_buffer[i], sizeof(pkt_dev->src_min) - 1);
- if (len < 0) {
+ max = min(sizeof(pkt_dev->src_min) - 1, count - i);
+ len = strn_len(&user_buffer[i], max);
+ if (len < 0)
return len;
- }
+
if (copy_from_user(buf, &user_buffer[i], len))
return -EFAULT;
buf[len] = 0;
if (strcmp(buf, pkt_dev->src_min) != 0) {
- memset(pkt_dev->src_min, 0, sizeof(pkt_dev->src_min));
- strncpy(pkt_dev->src_min, buf, len);
+ strscpy_pad(pkt_dev->src_min, buf);
pkt_dev->saddr_min = in_aton(pkt_dev->src_min);
pkt_dev->cur_saddr = pkt_dev->saddr_min;
}
if (debug)
- printk("pktgen: src_min set to: %s\n",
- pkt_dev->src_min);
- i += len;
+ pr_debug("src_min set to: %s\n", pkt_dev->src_min);
+
sprintf(pg_result, "OK: src_min=%s", pkt_dev->src_min);
return count;
}
if (!strcmp(name, "src_max")) {
- len = strn_len(&user_buffer[i], sizeof(pkt_dev->src_max) - 1);
- if (len < 0) {
+ max = min(sizeof(pkt_dev->src_max) - 1, count - i);
+ len = strn_len(&user_buffer[i], max);
+ if (len < 0)
return len;
- }
+
if (copy_from_user(buf, &user_buffer[i], len))
return -EFAULT;
buf[len] = 0;
if (strcmp(buf, pkt_dev->src_max) != 0) {
- memset(pkt_dev->src_max, 0, sizeof(pkt_dev->src_max));
- strncpy(pkt_dev->src_max, buf, len);
+ strscpy_pad(pkt_dev->src_max, buf);
pkt_dev->saddr_max = in_aton(pkt_dev->src_max);
pkt_dev->cur_saddr = pkt_dev->saddr_max;
}
if (debug)
- printk("pktgen: src_max set to: %s\n",
- pkt_dev->src_max);
- i += len;
+ pr_debug("src_max set to: %s\n", pkt_dev->src_max);
+
sprintf(pg_result, "OK: src_max=%s", pkt_dev->src_max);
return count;
}
if (!strcmp(name, "dst_mac")) {
- char *v = valstr;
- unsigned char old_dmac[ETH_ALEN];
- unsigned char *m = pkt_dev->dst_mac;
- memcpy(old_dmac, pkt_dev->dst_mac, ETH_ALEN);
-
- len = strn_len(&user_buffer[i], sizeof(valstr) - 1);
- if (len < 0) {
+ max = min(sizeof(valstr) - 1, count - i);
+ len = strn_len(&user_buffer[i], max);
+ if (len < 0)
return len;
- }
+
memset(valstr, 0, sizeof(valstr));
if (copy_from_user(valstr, &user_buffer[i], len))
return -EFAULT;
- i += len;
-
- for (*m = 0; *v && m < pkt_dev->dst_mac + 6; v++) {
- if (*v >= '0' && *v <= '9') {
- *m *= 16;
- *m += *v - '0';
- }
- if (*v >= 'A' && *v <= 'F') {
- *m *= 16;
- *m += *v - 'A' + 10;
- }
- if (*v >= 'a' && *v <= 'f') {
- *m *= 16;
- *m += *v - 'a' + 10;
- }
- if (*v == ':') {
- m++;
- *m = 0;
- }
- }
+ if (!mac_pton(valstr, pkt_dev->dst_mac))
+ return -EINVAL;
/* Set up Dest MAC */
- if (compare_ether_addr(old_dmac, pkt_dev->dst_mac))
- memcpy(&(pkt_dev->hh[0]), pkt_dev->dst_mac, ETH_ALEN);
+ ether_addr_copy(&pkt_dev->hh[0], pkt_dev->dst_mac);
- sprintf(pg_result, "OK: dstmac");
+ sprintf(pg_result, "OK: dstmac %pM", pkt_dev->dst_mac);
return count;
}
if (!strcmp(name, "src_mac")) {
- char *v = valstr;
- unsigned char *m = pkt_dev->src_mac;
-
- len = strn_len(&user_buffer[i], sizeof(valstr) - 1);
- if (len < 0) {
+ max = min(sizeof(valstr) - 1, count - i);
+ len = strn_len(&user_buffer[i], max);
+ if (len < 0)
return len;
- }
+
memset(valstr, 0, sizeof(valstr));
if (copy_from_user(valstr, &user_buffer[i], len))
return -EFAULT;
- i += len;
- for (*m = 0; *v && m < pkt_dev->src_mac + 6; v++) {
- if (*v >= '0' && *v <= '9') {
- *m *= 16;
- *m += *v - '0';
- }
- if (*v >= 'A' && *v <= 'F') {
- *m *= 16;
- *m += *v - 'A' + 10;
- }
- if (*v >= 'a' && *v <= 'f') {
- *m *= 16;
- *m += *v - 'a' + 10;
- }
- if (*v == ':') {
- m++;
- *m = 0;
- }
- }
+ if (!mac_pton(valstr, pkt_dev->src_mac))
+ return -EINVAL;
+ /* Set up Src MAC */
+ ether_addr_copy(&pkt_dev->hh[6], pkt_dev->src_mac);
- sprintf(pg_result, "OK: srcmac");
+ sprintf(pg_result, "OK: srcmac %pM", pkt_dev->src_mac);
return count;
}
@@ -1472,11 +1623,11 @@ static ssize_t pktgen_if_write(struct file *file,
}
if (!strcmp(name, "flows")) {
- len = num_arg(&user_buffer[i], 10, &value);
- if (len < 0) {
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
+ if (len < 0)
return len;
- }
- i += len;
+
if (value > MAX_CFLOWS)
value = MAX_CFLOWS;
@@ -1484,53 +1635,89 @@ static ssize_t pktgen_if_write(struct file *file,
sprintf(pg_result, "OK: flows=%u", pkt_dev->cflows);
return count;
}
+#ifdef CONFIG_XFRM
+ if (!strcmp(name, "spi")) {
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
+ if (len < 0)
+ return len;
+ pkt_dev->spi = value;
+ sprintf(pg_result, "OK: spi=%u", pkt_dev->spi);
+ return count;
+ }
+#endif
if (!strcmp(name, "flowlen")) {
- len = num_arg(&user_buffer[i], 10, &value);
- if (len < 0) {
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
+ if (len < 0)
return len;
- }
- i += len;
+
pkt_dev->lflow = value;
sprintf(pg_result, "OK: flowlen=%u", pkt_dev->lflow);
return count;
}
+ if (!strcmp(name, "queue_map_min")) {
+ max = min(5, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
+ if (len < 0)
+ return len;
+
+ pkt_dev->queue_map_min = value;
+ sprintf(pg_result, "OK: queue_map_min=%u", pkt_dev->queue_map_min);
+ return count;
+ }
+
+ if (!strcmp(name, "queue_map_max")) {
+ max = min(5, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
+ if (len < 0)
+ return len;
+
+ pkt_dev->queue_map_max = value;
+ sprintf(pg_result, "OK: queue_map_max=%u", pkt_dev->queue_map_max);
+ return count;
+ }
+
if (!strcmp(name, "mpls")) {
- unsigned n, offset;
- len = get_labels(&user_buffer[i], pkt_dev);
- if (len < 0) { return len; }
- i += len;
- offset = sprintf(pg_result, "OK: mpls=");
- for(n = 0; n < pkt_dev->nr_labels; n++)
- offset += sprintf(pg_result + offset,
- "%08x%s", ntohl(pkt_dev->labels[n]),
- n == pkt_dev->nr_labels-1 ? "" : ",");
+ unsigned int n, cnt;
+
+ max = count - i;
+ len = get_labels(&user_buffer[i], max, pkt_dev);
+ if (len < 0)
+ return len;
+
+ cnt = sprintf(pg_result, "OK: mpls=");
+ for (n = 0; n < pkt_dev->nr_labels; n++)
+ cnt += sprintf(pg_result + cnt,
+ "%08x%s", ntohl(pkt_dev->labels[n]),
+ n == pkt_dev->nr_labels-1 ? "" : ",");
if (pkt_dev->nr_labels && pkt_dev->vlan_id != 0xffff) {
pkt_dev->vlan_id = 0xffff; /* turn off VLAN/SVLAN */
pkt_dev->svlan_id = 0xffff;
if (debug)
- printk("pktgen: VLAN/SVLAN auto turned off\n");
+ pr_debug("VLAN/SVLAN auto turned off\n");
}
return count;
}
if (!strcmp(name, "vlan_id")) {
- len = num_arg(&user_buffer[i], 4, &value);
- if (len < 0) {
+ max = min(4, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
+ if (len < 0)
return len;
- }
- i += len;
+
if (value <= 4095) {
pkt_dev->vlan_id = value; /* turn on VLAN */
if (debug)
- printk("pktgen: VLAN turned on\n");
+ pr_debug("VLAN turned on\n");
if (debug && pkt_dev->nr_labels)
- printk("pktgen: MPLS auto turned off\n");
+ pr_debug("MPLS auto turned off\n");
pkt_dev->nr_labels = 0; /* turn off MPLS */
sprintf(pg_result, "OK: vlan_id=%u", pkt_dev->vlan_id);
@@ -1539,17 +1726,17 @@ static ssize_t pktgen_if_write(struct file *file,
pkt_dev->svlan_id = 0xffff;
if (debug)
- printk("pktgen: VLAN/SVLAN turned off\n");
+ pr_debug("VLAN/SVLAN turned off\n");
}
return count;
}
if (!strcmp(name, "vlan_p")) {
- len = num_arg(&user_buffer[i], 1, &value);
- if (len < 0) {
+ max = min(1, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
+ if (len < 0)
return len;
- }
- i += len;
+
if ((value <= 7) && (pkt_dev->vlan_id != 0xffff)) {
pkt_dev->vlan_p = value;
sprintf(pg_result, "OK: vlan_p=%u", pkt_dev->vlan_p);
@@ -1560,11 +1747,11 @@ static ssize_t pktgen_if_write(struct file *file,
}
if (!strcmp(name, "vlan_cfi")) {
- len = num_arg(&user_buffer[i], 1, &value);
- if (len < 0) {
+ max = min(1, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
+ if (len < 0)
return len;
- }
- i += len;
+
if ((value <= 1) && (pkt_dev->vlan_id != 0xffff)) {
pkt_dev->vlan_cfi = value;
sprintf(pg_result, "OK: vlan_cfi=%u", pkt_dev->vlan_cfi);
@@ -1575,19 +1762,19 @@ static ssize_t pktgen_if_write(struct file *file,
}
if (!strcmp(name, "svlan_id")) {
- len = num_arg(&user_buffer[i], 4, &value);
- if (len < 0) {
+ max = min(4, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
+ if (len < 0)
return len;
- }
- i += len;
+
if ((value <= 4095) && ((pkt_dev->vlan_id != 0xffff))) {
pkt_dev->svlan_id = value; /* turn on SVLAN */
if (debug)
- printk("pktgen: SVLAN turned on\n");
+ pr_debug("SVLAN turned on\n");
if (debug && pkt_dev->nr_labels)
- printk("pktgen: MPLS auto turned off\n");
+ pr_debug("MPLS auto turned off\n");
pkt_dev->nr_labels = 0; /* turn off MPLS */
sprintf(pg_result, "OK: svlan_id=%u", pkt_dev->svlan_id);
@@ -1596,17 +1783,17 @@ static ssize_t pktgen_if_write(struct file *file,
pkt_dev->svlan_id = 0xffff;
if (debug)
- printk("pktgen: VLAN/SVLAN turned off\n");
+ pr_debug("VLAN/SVLAN turned off\n");
}
return count;
}
if (!strcmp(name, "svlan_p")) {
- len = num_arg(&user_buffer[i], 1, &value);
- if (len < 0) {
+ max = min(1, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
+ if (len < 0)
return len;
- }
- i += len;
+
if ((value <= 7) && (pkt_dev->svlan_id != 0xffff)) {
pkt_dev->svlan_p = value;
sprintf(pg_result, "OK: svlan_p=%u", pkt_dev->svlan_p);
@@ -1617,11 +1804,11 @@ static ssize_t pktgen_if_write(struct file *file,
}
if (!strcmp(name, "svlan_cfi")) {
- len = num_arg(&user_buffer[i], 1, &value);
- if (len < 0) {
+ max = min(1, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
+ if (len < 0)
return len;
- }
- i += len;
+
if ((value <= 1) && (pkt_dev->svlan_id != 0xffff)) {
pkt_dev->svlan_cfi = value;
sprintf(pg_result, "OK: svlan_cfi=%u", pkt_dev->svlan_cfi);
@@ -1632,12 +1819,13 @@ static ssize_t pktgen_if_write(struct file *file,
}
if (!strcmp(name, "tos")) {
- __u32 tmp_value = 0;
- len = hex32_arg(&user_buffer[i], 2, &tmp_value);
- if (len < 0) {
+ __u32 tmp_value;
+
+ max = min(2, count - i);
+ len = hex32_arg(&user_buffer[i], max, &tmp_value);
+ if (len < 0)
return len;
- }
- i += len;
+
if (len == 2) {
pkt_dev->tos = tmp_value;
sprintf(pg_result, "OK: tos=0x%02x", pkt_dev->tos);
@@ -1648,12 +1836,13 @@ static ssize_t pktgen_if_write(struct file *file,
}
if (!strcmp(name, "traffic_class")) {
- __u32 tmp_value = 0;
- len = hex32_arg(&user_buffer[i], 2, &tmp_value);
- if (len < 0) {
+ __u32 tmp_value;
+
+ max = min(2, count - i);
+ len = hex32_arg(&user_buffer[i], max, &tmp_value);
+ if (len < 0)
return len;
- }
- i += len;
+
if (len == 2) {
pkt_dev->traffic_class = tmp_value;
sprintf(pg_result, "OK: traffic_class=0x%02x", pkt_dev->traffic_class);
@@ -1663,83 +1852,91 @@ static ssize_t pktgen_if_write(struct file *file,
return count;
}
+ if (!strcmp(name, "skb_priority")) {
+ max = min(9, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
+ if (len < 0)
+ return len;
+
+ pkt_dev->skb_priority = value;
+ sprintf(pg_result, "OK: skb_priority=%i",
+ pkt_dev->skb_priority);
+ return count;
+ }
+
sprintf(pkt_dev->result, "No such parameter \"%s\"", name);
return -EINVAL;
}
static int pktgen_if_open(struct inode *inode, struct file *file)
{
- return single_open(file, pktgen_if_show, PDE(inode)->data);
+ return single_open(file, pktgen_if_show, pde_data(inode));
}
-static struct file_operations pktgen_if_fops = {
- .owner = THIS_MODULE,
- .open = pktgen_if_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .write = pktgen_if_write,
- .release = single_release,
+static const struct proc_ops pktgen_if_proc_ops = {
+ .proc_open = pktgen_if_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_write = pktgen_if_write,
+ .proc_release = single_release,
};
static int pktgen_thread_show(struct seq_file *seq, void *v)
{
struct pktgen_thread *t = seq->private;
- struct pktgen_dev *pkt_dev;
+ const struct pktgen_dev *pkt_dev;
BUG_ON(!t);
- seq_printf(seq, "Name: %s max_before_softirq: %d\n",
- t->name, t->max_before_softirq);
+ seq_puts(seq, "Running: ");
- seq_printf(seq, "Running: ");
-
- if_lock(t);
- list_for_each_entry(pkt_dev, &t->if_list, list)
+ rcu_read_lock();
+ list_for_each_entry_rcu(pkt_dev, &t->if_list, list)
if (pkt_dev->running)
- seq_printf(seq, "%s ", pkt_dev->ifname);
+ seq_printf(seq, "%s ", pkt_dev->odevname);
- seq_printf(seq, "\nStopped: ");
+ seq_puts(seq, "\nStopped: ");
- list_for_each_entry(pkt_dev, &t->if_list, list)
+ list_for_each_entry_rcu(pkt_dev, &t->if_list, list)
if (!pkt_dev->running)
- seq_printf(seq, "%s ", pkt_dev->ifname);
+ seq_printf(seq, "%s ", pkt_dev->odevname);
if (t->result[0])
seq_printf(seq, "\nResult: %s\n", t->result);
else
- seq_printf(seq, "\nResult: NA\n");
+ seq_puts(seq, "\nResult: NA\n");
- if_unlock(t);
+ rcu_read_unlock();
return 0;
}
static ssize_t pktgen_thread_write(struct file *file,
- const char __user * user_buffer,
- size_t count, loff_t * offset)
+ const char __user *user_buffer,
+ size_t count, loff_t *offset)
{
- struct seq_file *seq = (struct seq_file *)file->private_data;
+ struct seq_file *seq = file->private_data;
struct pktgen_thread *t = seq->private;
- int i = 0, max, len, ret;
+ size_t i, max;
+ ssize_t len, ret;
char name[40];
char *pg_result;
- unsigned long value = 0;
if (count < 1) {
// sprintf(pg_result, "Wrong command format");
return -EINVAL;
}
- max = count - i;
- len = count_trail_chars(&user_buffer[i], max);
+ max = count;
+ len = count_trail_chars(user_buffer, max);
if (len < 0)
return len;
- i += len;
+ i = len;
/* Read variable name */
-
- len = strn_len(&user_buffer[i], sizeof(name) - 1);
+ max = min(sizeof(name) - 1, count - i);
+ len = strn_len(&user_buffer[i], max);
if (len < 0)
return len;
@@ -1756,10 +1953,10 @@ static ssize_t pktgen_thread_write(struct file *file,
i += len;
if (debug)
- printk("pktgen: t=%s, count=%lu\n", name, (unsigned long)count);
+ pr_debug("t=%s, count=%lu\n", name, (unsigned long)count);
if (!t) {
- printk("pktgen: ERROR: No thread\n");
+ pr_err("ERROR: No thread\n");
ret = -EINVAL;
goto out;
}
@@ -1768,20 +1965,25 @@ static ssize_t pktgen_thread_write(struct file *file,
if (!strcmp(name, "add_device")) {
char f[32];
+
memset(f, 0, 32);
- len = strn_len(&user_buffer[i], sizeof(f) - 1);
+ max = min(sizeof(f) - 1, count - i);
+ len = strn_len(&user_buffer[i], max);
if (len < 0) {
ret = len;
goto out;
}
if (copy_from_user(f, &user_buffer[i], len))
return -EFAULT;
- i += len;
+
mutex_lock(&pktgen_thread_lock);
- pktgen_add_device(t, f);
+ ret = pktgen_add_device(t, f);
mutex_unlock(&pktgen_thread_lock);
- ret = count;
- sprintf(pg_result, "OK: add_device=%s", f);
+ if (!ret) {
+ ret = count;
+ sprintf(pg_result, "OK: add_device=%s", f);
+ } else
+ sprintf(pg_result, "ERROR: can not add device %s", f);
goto out;
}
@@ -1796,12 +1998,8 @@ static ssize_t pktgen_thread_write(struct file *file,
}
if (!strcmp(name, "max_before_softirq")) {
- len = num_arg(&user_buffer[i], 10, &value);
- mutex_lock(&pktgen_thread_lock);
- t->max_before_softirq = value;
- mutex_unlock(&pktgen_thread_lock);
+ sprintf(pg_result, "OK: Note! max_before_softirq is obsoleted -- Do not use");
ret = count;
- sprintf(pg_result, "OK: max_before_softirq=%lu", value);
goto out;
}
@@ -1812,32 +2010,31 @@ out:
static int pktgen_thread_open(struct inode *inode, struct file *file)
{
- return single_open(file, pktgen_thread_show, PDE(inode)->data);
+ return single_open(file, pktgen_thread_show, pde_data(inode));
}
-static struct file_operations pktgen_thread_fops = {
- .owner = THIS_MODULE,
- .open = pktgen_thread_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .write = pktgen_thread_write,
- .release = single_release,
+static const struct proc_ops pktgen_thread_proc_ops = {
+ .proc_open = pktgen_thread_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_write = pktgen_thread_write,
+ .proc_release = single_release,
};
/* Think find or remove for NN */
-static struct pktgen_dev *__pktgen_NN_threads(const char *ifname, int remove)
+static struct pktgen_dev *__pktgen_NN_threads(const struct pktgen_net *pn,
+ const char *ifname, int remove)
{
struct pktgen_thread *t;
struct pktgen_dev *pkt_dev = NULL;
+ bool exact = (remove == FIND);
- list_for_each_entry(t, &pktgen_threads, th_list) {
- pkt_dev = pktgen_find_dev(t, ifname);
+ list_for_each_entry(t, &pn->pktgen_threads, th_list) {
+ pkt_dev = pktgen_find_dev(t, ifname, exact);
if (pkt_dev) {
if (remove) {
- if_lock(t);
pkt_dev->removal_mark = 1;
t->control |= T_REMDEV;
- if_unlock(t);
}
break;
}
@@ -1848,106 +2045,147 @@ static struct pktgen_dev *__pktgen_NN_threads(const char *ifname, int remove)
/*
* mark a device for removal
*/
-static int pktgen_mark_device(const char *ifname)
+static void pktgen_mark_device(const struct pktgen_net *pn, const char *ifname)
{
struct pktgen_dev *pkt_dev = NULL;
const int max_tries = 10, msec_per_try = 125;
int i = 0;
- int ret = 0;
mutex_lock(&pktgen_thread_lock);
- PG_DEBUG(printk("pktgen: pktgen_mark_device marking %s for removal\n",
- ifname));
+ pr_debug("%s: marking %s for removal\n", __func__, ifname);
while (1) {
- pkt_dev = __pktgen_NN_threads(ifname, REMOVE);
+ pkt_dev = __pktgen_NN_threads(pn, ifname, REMOVE);
if (pkt_dev == NULL)
break; /* success */
mutex_unlock(&pktgen_thread_lock);
- PG_DEBUG(printk("pktgen: pktgen_mark_device waiting for %s "
- "to disappear....\n", ifname));
+ pr_debug("%s: waiting for %s to disappear....\n",
+ __func__, ifname);
schedule_timeout_interruptible(msecs_to_jiffies(msec_per_try));
mutex_lock(&pktgen_thread_lock);
if (++i >= max_tries) {
- printk("pktgen_mark_device: timed out after waiting "
- "%d msec for device %s to be removed\n",
- msec_per_try * i, ifname);
- ret = 1;
+ pr_err("%s: timed out after waiting %d msec for device %s to be removed\n",
+ __func__, msec_per_try * i, ifname);
break;
}
}
mutex_unlock(&pktgen_thread_lock);
+}
- return ret;
+static void pktgen_change_name(const struct pktgen_net *pn, struct net_device *dev)
+{
+ struct pktgen_thread *t;
+
+ mutex_lock(&pktgen_thread_lock);
+
+ list_for_each_entry(t, &pn->pktgen_threads, th_list) {
+ struct pktgen_dev *pkt_dev;
+
+ if_lock(t);
+ list_for_each_entry(pkt_dev, &t->if_list, list) {
+ if (pkt_dev->odev != dev)
+ continue;
+
+ proc_remove(pkt_dev->entry);
+
+ pkt_dev->entry = proc_create_data(dev->name, 0600,
+ pn->proc_dir,
+ &pktgen_if_proc_ops,
+ pkt_dev);
+ if (!pkt_dev->entry)
+ pr_err("can't move proc entry for '%s'\n",
+ dev->name);
+ break;
+ }
+ if_unlock(t);
+ }
+ mutex_unlock(&pktgen_thread_lock);
}
static int pktgen_device_event(struct notifier_block *unused,
unsigned long event, void *ptr)
{
- struct net_device *dev = (struct net_device *)(ptr);
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct pktgen_net *pn = net_generic(dev_net(dev), pg_net_id);
+
+ if (pn->pktgen_exiting)
+ return NOTIFY_DONE;
/* It is OK that we do not hold the group lock right now,
* as we run under the RTNL lock.
*/
switch (event) {
- case NETDEV_CHANGEADDR:
- case NETDEV_GOING_DOWN:
- case NETDEV_DOWN:
- case NETDEV_UP:
- /* Ignore for now */
+ case NETDEV_CHANGENAME:
+ pktgen_change_name(pn, dev);
break;
case NETDEV_UNREGISTER:
- pktgen_mark_device(dev->name);
+ pktgen_mark_device(pn, dev->name);
break;
- };
+ }
return NOTIFY_DONE;
}
+static struct net_device *pktgen_dev_get_by_name(const struct pktgen_net *pn,
+ struct pktgen_dev *pkt_dev,
+ const char *ifname)
+{
+ char b[IFNAMSIZ+5];
+ int i;
+
+ for (i = 0; ifname[i] != '@'; i++) {
+ if (i == IFNAMSIZ)
+ break;
+
+ b[i] = ifname[i];
+ }
+ b[i] = 0;
+
+ return dev_get_by_name(pn->net, b);
+}
+
+
/* Associate pktgen_dev with a device. */
-static struct net_device *pktgen_setup_dev(struct pktgen_dev *pkt_dev)
+static int pktgen_setup_dev(const struct pktgen_net *pn,
+ struct pktgen_dev *pkt_dev, const char *ifname)
{
struct net_device *odev;
+ int err;
/* Clean old setups */
-
if (pkt_dev->odev) {
- dev_put(pkt_dev->odev);
+ netdev_put(pkt_dev->odev, &pkt_dev->dev_tracker);
pkt_dev->odev = NULL;
}
- odev = dev_get_by_name(pkt_dev->ifname);
-
+ odev = pktgen_dev_get_by_name(pn, pkt_dev, ifname);
if (!odev) {
- printk("pktgen: no such netdevice: \"%s\"\n", pkt_dev->ifname);
- goto out;
- }
- if (odev->type != ARPHRD_ETHER) {
- printk("pktgen: not an ethernet device: \"%s\"\n",
- pkt_dev->ifname);
- goto out_put;
- }
- if (!netif_running(odev)) {
- printk("pktgen: device is down: \"%s\"\n", pkt_dev->ifname);
- goto out_put;
+ pr_err("no such netdevice: \"%s\"\n", ifname);
+ return -ENODEV;
}
- pkt_dev->odev = odev;
- return pkt_dev->odev;
+ if (odev->type != ARPHRD_ETHER && odev->type != ARPHRD_LOOPBACK) {
+ pr_err("not an ethernet or loopback device: \"%s\"\n", ifname);
+ err = -EINVAL;
+ } else if (!netif_running(odev)) {
+ pr_err("device is down: \"%s\"\n", ifname);
+ err = -ENETDOWN;
+ } else {
+ pkt_dev->odev = odev;
+ netdev_tracker_alloc(odev, &pkt_dev->dev_tracker, GFP_KERNEL);
+ return 0;
+ }
-out_put:
dev_put(odev);
-out:
- return NULL;
-
+ return err;
}
/* Read pkt_dev from the interface and set up internal pktgen_dev
@@ -1955,39 +2193,51 @@ out:
*/
static void pktgen_setup_inject(struct pktgen_dev *pkt_dev)
{
- /* Try once more, just in case it works now. */
- if (!pkt_dev->odev)
- pktgen_setup_dev(pkt_dev);
+ int ntxq;
if (!pkt_dev->odev) {
- printk("pktgen: ERROR: pkt_dev->odev == NULL in setup_inject.\n");
+ pr_err("ERROR: pkt_dev->odev == NULL in setup_inject\n");
sprintf(pkt_dev->result,
"ERROR: pkt_dev->odev == NULL in setup_inject.\n");
return;
}
+ /* make sure that we don't pick a non-existing transmit queue */
+ ntxq = pkt_dev->odev->real_num_tx_queues;
+
+ if (ntxq <= pkt_dev->queue_map_min) {
+ pr_warn("WARNING: Requested queue_map_min (zero-based) (%d) exceeds valid range [0 - %d] for (%d) queues on %s, resetting\n",
+ pkt_dev->queue_map_min, (ntxq ?: 1) - 1, ntxq,
+ pkt_dev->odevname);
+ pkt_dev->queue_map_min = (ntxq ?: 1) - 1;
+ }
+ if (pkt_dev->queue_map_max >= ntxq) {
+ pr_warn("WARNING: Requested queue_map_max (zero-based) (%d) exceeds valid range [0 - %d] for (%d) queues on %s, resetting\n",
+ pkt_dev->queue_map_max, (ntxq ?: 1) - 1, ntxq,
+ pkt_dev->odevname);
+ pkt_dev->queue_map_max = (ntxq ?: 1) - 1;
+ }
+
/* Default to the interface's mac if not explicitly set. */
if (is_zero_ether_addr(pkt_dev->src_mac))
- memcpy(&(pkt_dev->hh[6]), pkt_dev->odev->dev_addr, ETH_ALEN);
+ ether_addr_copy(&(pkt_dev->hh[6]), pkt_dev->odev->dev_addr);
/* Set up Dest MAC */
- memcpy(&(pkt_dev->hh[0]), pkt_dev->dst_mac, ETH_ALEN);
-
- /* Set up pkt size */
- pkt_dev->cur_pkt_size = pkt_dev->min_pkt_size;
+ ether_addr_copy(&(pkt_dev->hh[0]), pkt_dev->dst_mac);
if (pkt_dev->flags & F_IPV6) {
- /*
- * Skip this automatic address setting until locks or functions
- * gets exported
- */
-
-#ifdef NOTNOW
int i, set = 0, err = 1;
struct inet6_dev *idev;
- for (i = 0; i < IN6_ADDR_HSIZE; i++)
+ if (pkt_dev->min_pkt_size == 0) {
+ pkt_dev->min_pkt_size = 14 + sizeof(struct ipv6hdr)
+ + sizeof(struct udphdr)
+ + sizeof(struct pktgen_hdr)
+ + pkt_dev->pkt_overhead;
+ }
+
+ for (i = 0; i < sizeof(struct in6_addr); i++)
if (pkt_dev->cur_in6_saddr.s6_addr[i]) {
set = 1;
break;
@@ -2002,18 +2252,15 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev)
*/
rcu_read_lock();
- if ((idev = __in6_dev_get(pkt_dev->odev)) != NULL) {
+ idev = __in6_dev_get(pkt_dev->odev);
+ if (idev) {
struct inet6_ifaddr *ifp;
read_lock_bh(&idev->lock);
- for (ifp = idev->addr_list; ifp;
- ifp = ifp->if_next) {
- if (ifp->scope == IFA_LINK
- && !(ifp->
- flags & IFA_F_TENTATIVE)) {
- ipv6_addr_copy(&pkt_dev->
- cur_in6_saddr,
- &ifp->addr);
+ list_for_each_entry(ifp, &idev->addr_list, if_list) {
+ if ((ifp->scope & IFA_LINK) &&
+ !(ifp->flags & IFA_F_TENTATIVE)) {
+ pkt_dev->cur_in6_saddr = ifp->addr;
err = 0;
break;
}
@@ -2022,10 +2269,16 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev)
}
rcu_read_unlock();
if (err)
- printk("pktgen: ERROR: IPv6 link address not availble.\n");
+ pr_err("ERROR: IPv6 link address not available\n");
}
-#endif
} else {
+ if (pkt_dev->min_pkt_size == 0) {
+ pkt_dev->min_pkt_size = 14 + sizeof(struct iphdr)
+ + sizeof(struct udphdr)
+ + sizeof(struct pktgen_hdr)
+ + pkt_dev->pkt_overhead;
+ }
+
pkt_dev->saddr_min = 0;
pkt_dev->saddr_max = 0;
if (strlen(pkt_dev->src_min) == 0) {
@@ -2035,9 +2288,11 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev)
rcu_read_lock();
in_dev = __in_dev_get_rcu(pkt_dev->odev);
if (in_dev) {
- if (in_dev->ifa_list) {
- pkt_dev->saddr_min =
- in_dev->ifa_list->ifa_address;
+ const struct in_ifaddr *ifa;
+
+ ifa = rcu_dereference(in_dev->ifa_list);
+ if (ifa) {
+ pkt_dev->saddr_min = ifa->ifa_address;
pkt_dev->saddr_max = pkt_dev->saddr_min;
}
}
@@ -2051,6 +2306,10 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev)
pkt_dev->daddr_max = in_aton(pkt_dev->dst_max);
}
/* Initialize current values. */
+ pkt_dev->cur_pkt_size = pkt_dev->min_pkt_size;
+ if (pkt_dev->min_pkt_size > pkt_dev->max_pkt_size)
+ pkt_dev->max_pkt_size = pkt_dev->min_pkt_size;
+
pkt_dev->cur_dst_mac_offset = 0;
pkt_dev->cur_src_mac_offset = 0;
pkt_dev->cur_saddr = pkt_dev->saddr_min;
@@ -2060,29 +2319,140 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev)
pkt_dev->nflows = 0;
}
-static void spin(struct pktgen_dev *pkt_dev, __u64 spin_until_us)
+
+static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until)
{
- __u64 start;
- __u64 now;
+ ktime_t start_time, end_time;
+ s64 remaining;
+ struct hrtimer_sleeper t;
- start = now = getCurUs();
- printk(KERN_INFO "sleeping for %d\n", (int)(spin_until_us - now));
- while (now < spin_until_us) {
- /* TODO: optimize sleeping behavior */
- if (spin_until_us - now > jiffies_to_usecs(1) + 1)
- schedule_timeout_interruptible(1);
- else if (spin_until_us - now > 100) {
- do_softirq();
- if (!pkt_dev->running)
- return;
- if (need_resched())
+ hrtimer_setup_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ hrtimer_set_expires(&t.timer, spin_until);
+
+ remaining = ktime_to_ns(hrtimer_expires_remaining(&t.timer));
+ if (remaining <= 0)
+ goto out;
+
+ start_time = ktime_get();
+ if (remaining < 100000) {
+ /* for small delays (<100us), just loop until limit is reached */
+ do {
+ end_time = ktime_get();
+ } while (ktime_compare(end_time, spin_until) < 0);
+ } else {
+ do {
+ set_current_state(TASK_INTERRUPTIBLE);
+ hrtimer_sleeper_start_expires(&t, HRTIMER_MODE_ABS);
+
+ if (likely(t.task))
schedule();
+
+ hrtimer_cancel(&t.timer);
+ } while (t.task && pkt_dev->running && !signal_pending(current));
+ __set_current_state(TASK_RUNNING);
+ end_time = ktime_get();
+ }
+
+ pkt_dev->idle_acc += ktime_to_ns(ktime_sub(end_time, start_time));
+out:
+ pkt_dev->next_tx = ktime_add_ns(spin_until, pkt_dev->delay);
+ destroy_hrtimer_on_stack(&t.timer);
+}
+
+static inline void set_pkt_overhead(struct pktgen_dev *pkt_dev)
+{
+ pkt_dev->pkt_overhead = 0;
+ pkt_dev->pkt_overhead += pkt_dev->nr_labels*sizeof(u32);
+ pkt_dev->pkt_overhead += VLAN_TAG_SIZE(pkt_dev);
+ pkt_dev->pkt_overhead += SVLAN_TAG_SIZE(pkt_dev);
+}
+
+static inline int f_seen(const struct pktgen_dev *pkt_dev, int flow)
+{
+ return !!(pkt_dev->flows[flow].flags & F_INIT);
+}
+
+static inline int f_pick(struct pktgen_dev *pkt_dev)
+{
+ int flow = pkt_dev->curfl;
+
+ if (pkt_dev->flags & F_FLOW_SEQ) {
+ if (pkt_dev->flows[flow].count >= pkt_dev->lflow) {
+ /* reset time */
+ pkt_dev->flows[flow].count = 0;
+ pkt_dev->flows[flow].flags = 0;
+ pkt_dev->curfl += 1;
+ if (pkt_dev->curfl >= pkt_dev->cflows)
+ pkt_dev->curfl = 0; /*reset */
}
+ } else {
+ flow = get_random_u32_below(pkt_dev->cflows);
+ pkt_dev->curfl = flow;
- now = getCurUs();
+ if (pkt_dev->flows[flow].count > pkt_dev->lflow) {
+ pkt_dev->flows[flow].count = 0;
+ pkt_dev->flows[flow].flags = 0;
+ }
}
- pkt_dev->idle_acc += now - start;
+ return pkt_dev->curfl;
+}
+
+
+/* If there was already an IPSEC SA, we keep it as is, else
+ * we go look for it ...
+ */
+#define DUMMY_MARK 0
+static void get_ipsec_sa(struct pktgen_dev *pkt_dev, int flow)
+{
+#ifdef CONFIG_XFRM
+ struct xfrm_state *x = pkt_dev->flows[flow].x;
+ struct pktgen_net *pn = net_generic(dev_net(pkt_dev->odev), pg_net_id);
+
+ if (!x) {
+
+ if (pkt_dev->spi) {
+ /* We need as quick as possible to find the right SA
+ * Searching with minimum criteria to achieve, this.
+ */
+ x = xfrm_state_lookup_byspi(pn->net, htonl(pkt_dev->spi), AF_INET);
+ } else {
+ /* slow path: we don't already have xfrm_state */
+ x = xfrm_stateonly_find(pn->net, DUMMY_MARK, 0,
+ (xfrm_address_t *)&pkt_dev->cur_daddr,
+ (xfrm_address_t *)&pkt_dev->cur_saddr,
+ AF_INET,
+ pkt_dev->ipsmode,
+ pkt_dev->ipsproto, 0);
+ }
+ if (x) {
+ pkt_dev->flows[flow].x = x;
+ set_pkt_overhead(pkt_dev);
+ pkt_dev->pkt_overhead += x->props.header_len;
+ }
+
+ }
+#endif
+}
+static void set_cur_queue_map(struct pktgen_dev *pkt_dev)
+{
+ if (pkt_dev->flags & F_QUEUE_MAP_CPU)
+ pkt_dev->cur_queue_map = smp_processor_id();
+
+ else if (pkt_dev->queue_map_min <= pkt_dev->queue_map_max) {
+ __u16 t;
+
+ if (pkt_dev->flags & F_QUEUE_MAP_RND) {
+ t = get_random_u32_inclusive(pkt_dev->queue_map_min,
+ pkt_dev->queue_map_max);
+ } else {
+ t = pkt_dev->cur_queue_map + 1;
+ if (t > pkt_dev->queue_map_max)
+ t = pkt_dev->queue_map_min;
+ }
+ pkt_dev->cur_queue_map = t;
+ }
+ pkt_dev->cur_queue_map = pkt_dev->cur_queue_map % pkt_dev->odev->real_num_tx_queues;
}
/* Increment/randomize headers according to flags and current values
@@ -2094,12 +2464,8 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
__u32 imx;
int flow = 0;
- if (pkt_dev->cflows) {
- flow = pktgen_random() % pkt_dev->cflows;
-
- if (pkt_dev->flows[flow].count > pkt_dev->lflow)
- pkt_dev->flows[flow].count = 0;
- }
+ if (pkt_dev->cflows)
+ flow = f_pick(pkt_dev);
/* Deal with source MAC */
if (pkt_dev->src_mac_count > 1) {
@@ -2107,10 +2473,10 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
__u32 tmp;
if (pkt_dev->flags & F_MACSRC_RND)
- mc = pktgen_random() % (pkt_dev->src_mac_count);
+ mc = get_random_u32_below(pkt_dev->src_mac_count);
else {
mc = pkt_dev->cur_src_mac_offset++;
- if (pkt_dev->cur_src_mac_offset >
+ if (pkt_dev->cur_src_mac_offset >=
pkt_dev->src_mac_count)
pkt_dev->cur_src_mac_offset = 0;
}
@@ -2133,11 +2499,11 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
__u32 tmp;
if (pkt_dev->flags & F_MACDST_RND)
- mc = pktgen_random() % (pkt_dev->dst_mac_count);
+ mc = get_random_u32_below(pkt_dev->dst_mac_count);
else {
mc = pkt_dev->cur_dst_mac_offset++;
- if (pkt_dev->cur_dst_mac_offset >
+ if (pkt_dev->cur_dst_mac_offset >=
pkt_dev->dst_mac_count) {
pkt_dev->cur_dst_mac_offset = 0;
}
@@ -2156,28 +2522,27 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
}
if (pkt_dev->flags & F_MPLS_RND) {
- unsigned i;
- for(i = 0; i < pkt_dev->nr_labels; i++)
+ unsigned int i;
+
+ for (i = 0; i < pkt_dev->nr_labels; i++)
if (pkt_dev->labels[i] & MPLS_STACK_BOTTOM)
pkt_dev->labels[i] = MPLS_STACK_BOTTOM |
- ((__force __be32)pktgen_random() &
+ ((__force __be32)get_random_u32() &
htonl(0x000fffff));
}
if ((pkt_dev->flags & F_VID_RND) && (pkt_dev->vlan_id != 0xffff)) {
- pkt_dev->vlan_id = pktgen_random() % 4096;
+ pkt_dev->vlan_id = get_random_u32_below(4096);
}
if ((pkt_dev->flags & F_SVID_RND) && (pkt_dev->svlan_id != 0xffff)) {
- pkt_dev->svlan_id = pktgen_random() % 4096;
+ pkt_dev->svlan_id = get_random_u32_below(4096);
}
if (pkt_dev->udp_src_min < pkt_dev->udp_src_max) {
if (pkt_dev->flags & F_UDPSRC_RND)
- pkt_dev->cur_udp_src =
- ((pktgen_random() %
- (pkt_dev->udp_src_max - pkt_dev->udp_src_min)) +
- pkt_dev->udp_src_min);
+ pkt_dev->cur_udp_src = get_random_u32_inclusive(pkt_dev->udp_src_min,
+ pkt_dev->udp_src_max - 1);
else {
pkt_dev->cur_udp_src++;
@@ -2188,10 +2553,8 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
if (pkt_dev->udp_dst_min < pkt_dev->udp_dst_max) {
if (pkt_dev->flags & F_UDPDST_RND) {
- pkt_dev->cur_udp_dst =
- ((pktgen_random() %
- (pkt_dev->udp_dst_max - pkt_dev->udp_dst_min)) +
- pkt_dev->udp_dst_min);
+ pkt_dev->cur_udp_dst = get_random_u32_inclusive(pkt_dev->udp_dst_min,
+ pkt_dev->udp_dst_max - 1);
} else {
pkt_dev->cur_udp_dst++;
if (pkt_dev->cur_udp_dst >= pkt_dev->udp_dst_max)
@@ -2201,23 +2564,24 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
if (!(pkt_dev->flags & F_IPV6)) {
- if ((imn = ntohl(pkt_dev->saddr_min)) < (imx =
- ntohl(pkt_dev->
- saddr_max))) {
+ imn = ntohl(pkt_dev->saddr_min);
+ imx = ntohl(pkt_dev->saddr_max);
+ if (imn < imx) {
__u32 t;
+
if (pkt_dev->flags & F_IPSRC_RND)
- t = ((pktgen_random() % (imx - imn)) + imn);
+ t = get_random_u32_inclusive(imn, imx - 1);
else {
t = ntohl(pkt_dev->cur_saddr);
t++;
- if (t > imx) {
+ if (t > imx)
t = imn;
- }
+
}
pkt_dev->cur_saddr = htonl(t);
}
- if (pkt_dev->cflows && pkt_dev->flows[flow].count != 0) {
+ if (pkt_dev->cflows && f_seen(pkt_dev, flow)) {
pkt_dev->cur_daddr = pkt_dev->flows[flow].cur_daddr;
} else {
imn = ntohl(pkt_dev->daddr_min);
@@ -2225,18 +2589,17 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
if (imn < imx) {
__u32 t;
__be32 s;
- if (pkt_dev->flags & F_IPDST_RND) {
- t = pktgen_random() % (imx - imn) + imn;
- s = htonl(t);
+ if (pkt_dev->flags & F_IPDST_RND) {
- while (LOOPBACK(s) || MULTICAST(s)
- || BADCLASS(s) || ZERONET(s)
- || LOCAL_MCAST(s)) {
- t = (pktgen_random() %
- (imx - imn)) + imn;
+ do {
+ t = get_random_u32_inclusive(imn, imx - 1);
s = htonl(t);
- }
+ } while (ipv4_is_loopback(s) ||
+ ipv4_is_multicast(s) ||
+ ipv4_is_lbcast(s) ||
+ ipv4_is_zeronet(s) ||
+ ipv4_is_local_multicast(s));
pkt_dev->cur_daddr = s;
} else {
t = ntohl(pkt_dev->cur_daddr);
@@ -2248,25 +2611,24 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
}
}
if (pkt_dev->cflows) {
+ pkt_dev->flows[flow].flags |= F_INIT;
pkt_dev->flows[flow].cur_daddr =
pkt_dev->cur_daddr;
+ if (pkt_dev->flags & F_IPSEC)
+ get_ipsec_sa(pkt_dev, flow);
pkt_dev->nflows++;
}
}
} else { /* IPV6 * */
- if (pkt_dev->min_in6_daddr.s6_addr32[0] == 0 &&
- pkt_dev->min_in6_daddr.s6_addr32[1] == 0 &&
- pkt_dev->min_in6_daddr.s6_addr32[2] == 0 &&
- pkt_dev->min_in6_daddr.s6_addr32[3] == 0) ;
- else {
+ if (!ipv6_addr_any(&pkt_dev->min_in6_daddr)) {
int i;
/* Only random destinations yet */
for (i = 0; i < 4; i++) {
pkt_dev->cur_in6_daddr.s6_addr32[i] =
- (((__force __be32)pktgen_random() |
+ (((__force __be32)get_random_u32() |
pkt_dev->min_in6_daddr.s6_addr32[i]) &
pkt_dev->max_in6_daddr.s6_addr32[i]);
}
@@ -2275,27 +2637,175 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
if (pkt_dev->min_pkt_size < pkt_dev->max_pkt_size) {
__u32 t;
+
if (pkt_dev->flags & F_TXSIZE_RND) {
- t = ((pktgen_random() %
- (pkt_dev->max_pkt_size - pkt_dev->min_pkt_size))
- + pkt_dev->min_pkt_size);
+ t = get_random_u32_inclusive(pkt_dev->min_pkt_size,
+ pkt_dev->max_pkt_size - 1);
} else {
t = pkt_dev->cur_pkt_size + 1;
if (t > pkt_dev->max_pkt_size)
t = pkt_dev->min_pkt_size;
}
pkt_dev->cur_pkt_size = t;
+ } else if (pkt_dev->n_imix_entries > 0) {
+ struct imix_pkt *entry;
+ __u32 t = get_random_u32_below(IMIX_PRECISION);
+ __u8 entry_index = pkt_dev->imix_distribution[t];
+
+ entry = &pkt_dev->imix_entries[entry_index];
+ entry->count_so_far++;
+ pkt_dev->cur_pkt_size = entry->size;
}
+ set_cur_queue_map(pkt_dev);
+
pkt_dev->flows[flow].count++;
}
+static void fill_imix_distribution(struct pktgen_dev *pkt_dev)
+{
+ int cumulative_probabilites[MAX_IMIX_ENTRIES];
+ int j = 0;
+ __u64 cumulative_prob = 0;
+ __u64 total_weight = 0;
+ int i = 0;
+
+ for (i = 0; i < pkt_dev->n_imix_entries; i++)
+ total_weight += pkt_dev->imix_entries[i].weight;
+
+ /* Fill cumulative_probabilites with sum of normalized probabilities */
+ for (i = 0; i < pkt_dev->n_imix_entries - 1; i++) {
+ cumulative_prob += div64_u64(pkt_dev->imix_entries[i].weight *
+ IMIX_PRECISION,
+ total_weight);
+ cumulative_probabilites[i] = cumulative_prob;
+ }
+ cumulative_probabilites[pkt_dev->n_imix_entries - 1] = 100;
+
+ for (i = 0; i < IMIX_PRECISION; i++) {
+ if (i == cumulative_probabilites[j])
+ j++;
+ pkt_dev->imix_distribution[i] = j;
+ }
+}
+
+#ifdef CONFIG_XFRM
+static u32 pktgen_dst_metrics[RTAX_MAX + 1] = {
+
+ [RTAX_HOPLIMIT] = 0x5, /* Set a static hoplimit */
+};
+
+static int pktgen_output_ipsec(struct sk_buff *skb, struct pktgen_dev *pkt_dev)
+{
+ struct xfrm_state *x = pkt_dev->flows[pkt_dev->curfl].x;
+ int err = 0;
+ struct net *net = dev_net(pkt_dev->odev);
+
+ if (!x)
+ return 0;
+ /* XXX: we dont support tunnel mode for now until
+ * we resolve the dst issue
+ */
+ if ((x->props.mode != XFRM_MODE_TRANSPORT) && (pkt_dev->spi == 0))
+ return 0;
+
+ /* But when user specify an valid SPI, transformation
+ * supports both transport/tunnel mode + ESP/AH type.
+ */
+ if ((x->props.mode == XFRM_MODE_TUNNEL) && (pkt_dev->spi != 0))
+ skb->_skb_refdst = (unsigned long)&pkt_dev->xdst.u.dst | SKB_DST_NOREF;
+
+ rcu_read_lock_bh();
+ err = pktgen_xfrm_outer_mode_output(x, skb);
+ rcu_read_unlock_bh();
+ if (err) {
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATEMODEERROR);
+ goto error;
+ }
+ err = x->type->output(x, skb);
+ if (err) {
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATEPROTOERROR);
+ goto error;
+ }
+ spin_lock_bh(&x->lock);
+ x->curlft.bytes += skb->len;
+ x->curlft.packets++;
+ spin_unlock_bh(&x->lock);
+error:
+ return err;
+}
+
+static void free_SAs(struct pktgen_dev *pkt_dev)
+{
+ if (pkt_dev->cflows) {
+ /* let go of the SAs if we have them */
+ int i;
+
+ for (i = 0; i < pkt_dev->cflows; i++) {
+ struct xfrm_state *x = pkt_dev->flows[i].x;
+
+ if (x) {
+ xfrm_state_put(x);
+ pkt_dev->flows[i].x = NULL;
+ }
+ }
+ }
+}
+
+static int process_ipsec(struct pktgen_dev *pkt_dev,
+ struct sk_buff *skb, __be16 protocol)
+{
+ if (pkt_dev->flags & F_IPSEC) {
+ struct xfrm_state *x = pkt_dev->flows[pkt_dev->curfl].x;
+ int nhead = 0;
+
+ if (x) {
+ struct ethhdr *eth;
+ struct iphdr *iph;
+ int ret;
+
+ nhead = x->props.header_len - skb_headroom(skb);
+ if (nhead > 0) {
+ ret = pskb_expand_head(skb, nhead, 0, GFP_ATOMIC);
+ if (ret < 0) {
+ pr_err("Error expanding ipsec packet %d\n",
+ ret);
+ goto err;
+ }
+ }
+
+ /* ipsec is not expecting ll header */
+ skb_pull(skb, ETH_HLEN);
+ ret = pktgen_output_ipsec(skb, pkt_dev);
+ if (ret) {
+ pr_err("Error creating ipsec packet %d\n", ret);
+ goto err;
+ }
+ /* restore ll */
+ eth = skb_push(skb, ETH_HLEN);
+ memcpy(eth, pkt_dev->hh, 2 * ETH_ALEN);
+ eth->h_proto = protocol;
+
+ /* Update IPv4 header len as well as checksum value */
+ iph = ip_hdr(skb);
+ iph->tot_len = htons(skb->len - ETH_HLEN);
+ ip_send_check(iph);
+ }
+ }
+ return 1;
+err:
+ kfree_skb(skb);
+ return 0;
+}
+#endif
+
static void mpls_push(__be32 *mpls, struct pktgen_dev *pkt_dev)
{
- unsigned i;
- for(i = 0; i < pkt_dev->nr_labels; i++) {
+ unsigned int i;
+
+ for (i = 0; i < pkt_dev->nr_labels; i++)
*mpls++ = pkt_dev->labels[i] & ~MPLS_STACK_BOTTOM;
- }
+
mpls--;
*mpls |= MPLS_STACK_BOTTOM;
}
@@ -2306,6 +2816,113 @@ static inline __be16 build_tci(unsigned int id, unsigned int cfi,
return htons(id | (cfi << 12) | (prio << 13));
}
+static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb,
+ int datalen)
+{
+ struct timespec64 timestamp;
+ struct pktgen_hdr *pgh;
+
+ pgh = skb_put(skb, sizeof(*pgh));
+ datalen -= sizeof(*pgh);
+
+ if (pkt_dev->nfrags <= 0) {
+ skb_put_zero(skb, datalen);
+ } else {
+ int frags = pkt_dev->nfrags;
+ int i, len;
+ int frag_len;
+
+
+ if (frags > MAX_SKB_FRAGS)
+ frags = MAX_SKB_FRAGS;
+ len = datalen - frags * PAGE_SIZE;
+ if (len > 0) {
+ skb_put_zero(skb, len);
+ datalen = frags * PAGE_SIZE;
+ }
+
+ i = 0;
+ frag_len = min_t(int, datalen / frags, PAGE_SIZE);
+ while (datalen > 0) {
+ if (unlikely(!pkt_dev->page)) {
+ int node = numa_node_id();
+
+ if (pkt_dev->node >= 0 && (pkt_dev->flags & F_NODE))
+ node = pkt_dev->node;
+ pkt_dev->page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
+ if (!pkt_dev->page)
+ break;
+ }
+ get_page(pkt_dev->page);
+
+ /*last fragment, fill rest of data*/
+ if (i == (frags - 1))
+ skb_frag_fill_page_desc(&skb_shinfo(skb)->frags[i],
+ pkt_dev->page, 0,
+ min(datalen, PAGE_SIZE));
+ else
+ skb_frag_fill_page_desc(&skb_shinfo(skb)->frags[i],
+ pkt_dev->page, 0, frag_len);
+
+ datalen -= skb_frag_size(&skb_shinfo(skb)->frags[i]);
+ skb->len += skb_frag_size(&skb_shinfo(skb)->frags[i]);
+ skb->data_len += skb_frag_size(&skb_shinfo(skb)->frags[i]);
+ i++;
+ skb_shinfo(skb)->nr_frags = i;
+ }
+ }
+
+ /* Stamp the time, and sequence number,
+ * convert them to network byte order
+ */
+ pgh->pgh_magic = htonl(PKTGEN_MAGIC);
+ pgh->seq_num = htonl(pkt_dev->seq_num);
+
+ if (pkt_dev->flags & F_NO_TIMESTAMP) {
+ pgh->tv_sec = 0;
+ pgh->tv_usec = 0;
+ } else {
+ /*
+ * pgh->tv_sec wraps in y2106 when interpreted as unsigned
+ * as done by wireshark, or y2038 when interpreted as signed.
+ * This is probably harmless, but if anyone wants to improve
+ * it, we could introduce a variant that puts 64-bit nanoseconds
+ * into the respective header bytes.
+ * This would also be slightly faster to read.
+ */
+ ktime_get_real_ts64(&timestamp);
+ pgh->tv_sec = htonl(timestamp.tv_sec);
+ pgh->tv_usec = htonl(timestamp.tv_nsec / NSEC_PER_USEC);
+ }
+}
+
+static struct sk_buff *pktgen_alloc_skb(struct net_device *dev,
+ struct pktgen_dev *pkt_dev)
+{
+ unsigned int extralen = LL_RESERVED_SPACE(dev);
+ struct sk_buff *skb = NULL;
+ unsigned int size;
+
+ size = pkt_dev->cur_pkt_size + 64 + extralen + pkt_dev->pkt_overhead;
+ if (pkt_dev->flags & F_NODE) {
+ int node = pkt_dev->node >= 0 ? pkt_dev->node : numa_node_id();
+
+ skb = __alloc_skb(NET_SKB_PAD + size, GFP_NOWAIT, 0, node);
+ if (likely(skb)) {
+ skb_reserve(skb, NET_SKB_PAD);
+ skb->dev = dev;
+ }
+ } else {
+ skb = __netdev_alloc_skb(dev, size, GFP_NOWAIT);
+ }
+
+ /* the caller pre-fetches from skb->data and reserves for the mac hdr */
+ if (likely(skb))
+ skb_reserve(skb, extralen - 16);
+
+ return skb;
+}
+
static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
struct pktgen_dev *pkt_dev)
{
@@ -2314,77 +2931,81 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
struct udphdr *udph;
int datalen, iplen;
struct iphdr *iph;
- struct pktgen_hdr *pgh = NULL;
- __be16 protocol = __constant_htons(ETH_P_IP);
+ __be16 protocol = htons(ETH_P_IP);
__be32 *mpls;
__be16 *vlan_tci = NULL; /* Encapsulates priority and VLAN ID */
__be16 *vlan_encapsulated_proto = NULL; /* packet type ID field (or len) for VLAN tag */
__be16 *svlan_tci = NULL; /* Encapsulates priority and SVLAN ID */
__be16 *svlan_encapsulated_proto = NULL; /* packet type ID field (or len) for SVLAN tag */
-
+ u16 queue_map;
if (pkt_dev->nr_labels)
- protocol = __constant_htons(ETH_P_MPLS_UC);
+ protocol = htons(ETH_P_MPLS_UC);
if (pkt_dev->vlan_id != 0xffff)
- protocol = __constant_htons(ETH_P_8021Q);
+ protocol = htons(ETH_P_8021Q);
/* Update any of the values, used when we're incrementing various
* fields.
*/
mod_cur_headers(pkt_dev);
+ queue_map = pkt_dev->cur_queue_map;
- datalen = (odev->hard_header_len + 16) & ~0xf;
- skb = alloc_skb(pkt_dev->cur_pkt_size + 64 + datalen +
- pkt_dev->nr_labels*sizeof(u32) +
- VLAN_TAG_SIZE(pkt_dev) + SVLAN_TAG_SIZE(pkt_dev),
- GFP_ATOMIC);
+ skb = pktgen_alloc_skb(odev, pkt_dev);
if (!skb) {
sprintf(pkt_dev->result, "No memory");
return NULL;
}
- skb_reserve(skb, datalen);
+ prefetchw(skb->data);
+ skb_reserve(skb, 16);
/* Reserve for ethernet and IP header */
- eth = (__u8 *) skb_push(skb, 14);
- mpls = (__be32 *)skb_put(skb, pkt_dev->nr_labels*sizeof(__u32));
+ eth = skb_push(skb, 14);
+ mpls = skb_put(skb, pkt_dev->nr_labels * sizeof(__u32));
if (pkt_dev->nr_labels)
mpls_push(mpls, pkt_dev);
if (pkt_dev->vlan_id != 0xffff) {
- if(pkt_dev->svlan_id != 0xffff) {
- svlan_tci = (__be16 *)skb_put(skb, sizeof(__be16));
+ if (pkt_dev->svlan_id != 0xffff) {
+ svlan_tci = skb_put(skb, sizeof(__be16));
*svlan_tci = build_tci(pkt_dev->svlan_id,
pkt_dev->svlan_cfi,
pkt_dev->svlan_p);
- svlan_encapsulated_proto = (__be16 *)skb_put(skb, sizeof(__be16));
- *svlan_encapsulated_proto = __constant_htons(ETH_P_8021Q);
+ svlan_encapsulated_proto = skb_put(skb,
+ sizeof(__be16));
+ *svlan_encapsulated_proto = htons(ETH_P_8021Q);
}
- vlan_tci = (__be16 *)skb_put(skb, sizeof(__be16));
+ vlan_tci = skb_put(skb, sizeof(__be16));
*vlan_tci = build_tci(pkt_dev->vlan_id,
pkt_dev->vlan_cfi,
pkt_dev->vlan_p);
- vlan_encapsulated_proto = (__be16 *)skb_put(skb, sizeof(__be16));
- *vlan_encapsulated_proto = __constant_htons(ETH_P_IP);
+ vlan_encapsulated_proto = skb_put(skb, sizeof(__be16));
+ *vlan_encapsulated_proto = htons(ETH_P_IP);
}
- iph = (struct iphdr *)skb_put(skb, sizeof(struct iphdr));
- udph = (struct udphdr *)skb_put(skb, sizeof(struct udphdr));
+ skb_reset_mac_header(skb);
+ skb_set_network_header(skb, skb->len);
+ iph = skb_put(skb, sizeof(struct iphdr));
+
+ skb_set_transport_header(skb, skb->len);
+ udph = skb_put(skb, sizeof(struct udphdr));
+ skb_set_queue_mapping(skb, queue_map);
+ skb->priority = pkt_dev->skb_priority;
memcpy(eth, pkt_dev->hh, 12);
- *(__be16 *) & eth[12] = protocol;
+ *(__be16 *)&eth[12] = protocol;
/* Eth + IPh + UDPh + mpls */
datalen = pkt_dev->cur_pkt_size - 14 - 20 - 8 -
- pkt_dev->nr_labels*sizeof(u32) - VLAN_TAG_SIZE(pkt_dev) - SVLAN_TAG_SIZE(pkt_dev);
- if (datalen < sizeof(struct pktgen_hdr))
+ pkt_dev->pkt_overhead;
+ if (datalen < 0 || datalen < sizeof(struct pktgen_hdr))
datalen = sizeof(struct pktgen_hdr);
udph->source = htons(pkt_dev->cur_udp_src);
udph->dest = htons(pkt_dev->cur_udp_dst);
udph->len = htons(datalen + 8); /* DATA + udphdr */
- udph->check = 0; /* No checksum */
+ udph->check = 0;
iph->ihl = 5;
iph->version = 4;
@@ -2393,342 +3014,133 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
iph->protocol = IPPROTO_UDP; /* UDP */
iph->saddr = pkt_dev->cur_saddr;
iph->daddr = pkt_dev->cur_daddr;
+ iph->id = htons(pkt_dev->ip_id);
+ pkt_dev->ip_id++;
iph->frag_off = 0;
iplen = 20 + 8 + datalen;
iph->tot_len = htons(iplen);
- iph->check = 0;
- iph->check = ip_fast_csum((void *)iph, iph->ihl);
+ ip_send_check(iph);
skb->protocol = protocol;
- skb->mac.raw = ((u8 *) iph) - 14 - pkt_dev->nr_labels*sizeof(u32) -
- VLAN_TAG_SIZE(pkt_dev) - SVLAN_TAG_SIZE(pkt_dev);
skb->dev = odev;
skb->pkt_type = PACKET_HOST;
- skb->nh.iph = iph;
- skb->h.uh = udph;
-
- if (pkt_dev->nfrags <= 0)
- pgh = (struct pktgen_hdr *)skb_put(skb, datalen);
- else {
- int frags = pkt_dev->nfrags;
- int i;
- pgh = (struct pktgen_hdr *)(((char *)(udph)) + 8);
+ pktgen_finalize_skb(pkt_dev, skb, datalen);
- if (frags > MAX_SKB_FRAGS)
- frags = MAX_SKB_FRAGS;
- if (datalen > frags * PAGE_SIZE) {
- skb_put(skb, datalen - frags * PAGE_SIZE);
- datalen = frags * PAGE_SIZE;
- }
-
- i = 0;
- while (datalen > 0) {
- struct page *page = alloc_pages(GFP_KERNEL, 0);
- skb_shinfo(skb)->frags[i].page = page;
- skb_shinfo(skb)->frags[i].page_offset = 0;
- skb_shinfo(skb)->frags[i].size =
- (datalen < PAGE_SIZE ? datalen : PAGE_SIZE);
- datalen -= skb_shinfo(skb)->frags[i].size;
- skb->len += skb_shinfo(skb)->frags[i].size;
- skb->data_len += skb_shinfo(skb)->frags[i].size;
- i++;
- skb_shinfo(skb)->nr_frags = i;
- }
-
- while (i < frags) {
- int rem;
-
- if (i == 0)
- break;
-
- rem = skb_shinfo(skb)->frags[i - 1].size / 2;
- if (rem == 0)
- break;
+ if (!(pkt_dev->flags & F_UDPCSUM)) {
+ skb->ip_summed = CHECKSUM_NONE;
+ } else if (odev->features & (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM)) {
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ skb->csum = 0;
+ udp4_hwcsum(skb, iph->saddr, iph->daddr);
+ } else {
+ __wsum csum = skb_checksum(skb, skb_transport_offset(skb), datalen + 8, 0);
- skb_shinfo(skb)->frags[i - 1].size -= rem;
+ /* add protocol-dependent pseudo-header */
+ udph->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
+ datalen + 8, IPPROTO_UDP, csum);
- skb_shinfo(skb)->frags[i] =
- skb_shinfo(skb)->frags[i - 1];
- get_page(skb_shinfo(skb)->frags[i].page);
- skb_shinfo(skb)->frags[i].page =
- skb_shinfo(skb)->frags[i - 1].page;
- skb_shinfo(skb)->frags[i].page_offset +=
- skb_shinfo(skb)->frags[i - 1].size;
- skb_shinfo(skb)->frags[i].size = rem;
- i++;
- skb_shinfo(skb)->nr_frags = i;
- }
+ if (udph->check == 0)
+ udph->check = CSUM_MANGLED_0;
}
- /* Stamp the time, and sequence number, convert them to network byte order */
-
- if (pgh) {
- struct timeval timestamp;
-
- pgh->pgh_magic = htonl(PKTGEN_MAGIC);
- pgh->seq_num = htonl(pkt_dev->seq_num);
-
- do_gettimeofday(&timestamp);
- pgh->tv_sec = htonl(timestamp.tv_sec);
- pgh->tv_usec = htonl(timestamp.tv_usec);
- }
+#ifdef CONFIG_XFRM
+ if (!process_ipsec(pkt_dev, skb, protocol))
+ return NULL;
+#endif
return skb;
}
-/*
- * scan_ip6, fmt_ip taken from dietlibc-0.21
- * Author Felix von Leitner <felix-dietlibc@fefe.de>
- *
- * Slightly modified for kernel.
- * Should be candidate for net/ipv4/utils.c
- * --ro
- */
-
-static unsigned int scan_ip6(const char *s, char ip[16])
-{
- unsigned int i;
- unsigned int len = 0;
- unsigned long u;
- char suffix[16];
- unsigned int prefixlen = 0;
- unsigned int suffixlen = 0;
- __be32 tmp;
-
- for (i = 0; i < 16; i++)
- ip[i] = 0;
-
- for (;;) {
- if (*s == ':') {
- len++;
- if (s[1] == ':') { /* Found "::", skip to part 2 */
- s += 2;
- len++;
- break;
- }
- s++;
- }
- {
- char *tmp;
- u = simple_strtoul(s, &tmp, 16);
- i = tmp - s;
- }
-
- if (!i)
- return 0;
- if (prefixlen == 12 && s[i] == '.') {
-
- /* the last 4 bytes may be written as IPv4 address */
-
- tmp = in_aton(s);
- memcpy((struct in_addr *)(ip + 12), &tmp, sizeof(tmp));
- return i + len;
- }
- ip[prefixlen++] = (u >> 8);
- ip[prefixlen++] = (u & 255);
- s += i;
- len += i;
- if (prefixlen == 16)
- return len;
- }
-
-/* part 2, after "::" */
- for (;;) {
- if (*s == ':') {
- if (suffixlen == 0)
- break;
- s++;
- len++;
- } else if (suffixlen != 0)
- break;
- {
- char *tmp;
- u = simple_strtol(s, &tmp, 16);
- i = tmp - s;
- }
- if (!i) {
- if (*s)
- len--;
- break;
- }
- if (suffixlen + prefixlen <= 12 && s[i] == '.') {
- tmp = in_aton(s);
- memcpy((struct in_addr *)(suffix + suffixlen), &tmp,
- sizeof(tmp));
- suffixlen += 4;
- len += strlen(s);
- break;
- }
- suffix[suffixlen++] = (u >> 8);
- suffix[suffixlen++] = (u & 255);
- s += i;
- len += i;
- if (prefixlen + suffixlen == 16)
- break;
- }
- for (i = 0; i < suffixlen; i++)
- ip[16 - suffixlen + i] = suffix[i];
- return len;
-}
-
-static char tohex(char hexdigit)
-{
- return hexdigit > 9 ? hexdigit + 'a' - 10 : hexdigit + '0';
-}
-
-static int fmt_xlong(char *s, unsigned int i)
-{
- char *bak = s;
- *s = tohex((i >> 12) & 0xf);
- if (s != bak || *s != '0')
- ++s;
- *s = tohex((i >> 8) & 0xf);
- if (s != bak || *s != '0')
- ++s;
- *s = tohex((i >> 4) & 0xf);
- if (s != bak || *s != '0')
- ++s;
- *s = tohex(i & 0xf);
- return s - bak + 1;
-}
-
-static unsigned int fmt_ip6(char *s, const char ip[16])
-{
- unsigned int len;
- unsigned int i;
- unsigned int temp;
- unsigned int compressing;
- int j;
-
- len = 0;
- compressing = 0;
- for (j = 0; j < 16; j += 2) {
-
-#ifdef V4MAPPEDPREFIX
- if (j == 12 && !memcmp(ip, V4mappedprefix, 12)) {
- inet_ntoa_r(*(struct in_addr *)(ip + 12), s);
- temp = strlen(s);
- return len + temp;
- }
-#endif
- temp = ((unsigned long)(unsigned char)ip[j] << 8) +
- (unsigned long)(unsigned char)ip[j + 1];
- if (temp == 0) {
- if (!compressing) {
- compressing = 1;
- if (j == 0) {
- *s++ = ':';
- ++len;
- }
- }
- } else {
- if (compressing) {
- compressing = 0;
- *s++ = ':';
- ++len;
- }
- i = fmt_xlong(s, temp);
- len += i;
- s += i;
- if (j < 14) {
- *s++ = ':';
- ++len;
- }
- }
- }
- if (compressing) {
- *s++ = ':';
- ++len;
- }
- *s = 0;
- return len;
-}
-
static struct sk_buff *fill_packet_ipv6(struct net_device *odev,
struct pktgen_dev *pkt_dev)
{
struct sk_buff *skb = NULL;
__u8 *eth;
struct udphdr *udph;
- int datalen;
+ int datalen, udplen;
struct ipv6hdr *iph;
- struct pktgen_hdr *pgh = NULL;
- __be16 protocol = __constant_htons(ETH_P_IPV6);
+ __be16 protocol = htons(ETH_P_IPV6);
__be32 *mpls;
__be16 *vlan_tci = NULL; /* Encapsulates priority and VLAN ID */
__be16 *vlan_encapsulated_proto = NULL; /* packet type ID field (or len) for VLAN tag */
__be16 *svlan_tci = NULL; /* Encapsulates priority and SVLAN ID */
__be16 *svlan_encapsulated_proto = NULL; /* packet type ID field (or len) for SVLAN tag */
+ u16 queue_map;
if (pkt_dev->nr_labels)
- protocol = __constant_htons(ETH_P_MPLS_UC);
+ protocol = htons(ETH_P_MPLS_UC);
if (pkt_dev->vlan_id != 0xffff)
- protocol = __constant_htons(ETH_P_8021Q);
+ protocol = htons(ETH_P_8021Q);
/* Update any of the values, used when we're incrementing various
* fields.
*/
mod_cur_headers(pkt_dev);
+ queue_map = pkt_dev->cur_queue_map;
- skb = alloc_skb(pkt_dev->cur_pkt_size + 64 + 16 +
- pkt_dev->nr_labels*sizeof(u32) +
- VLAN_TAG_SIZE(pkt_dev) + SVLAN_TAG_SIZE(pkt_dev),
- GFP_ATOMIC);
+ skb = pktgen_alloc_skb(odev, pkt_dev);
if (!skb) {
sprintf(pkt_dev->result, "No memory");
return NULL;
}
+ prefetchw(skb->data);
skb_reserve(skb, 16);
/* Reserve for ethernet and IP header */
- eth = (__u8 *) skb_push(skb, 14);
- mpls = (__be32 *)skb_put(skb, pkt_dev->nr_labels*sizeof(__u32));
+ eth = skb_push(skb, 14);
+ mpls = skb_put(skb, pkt_dev->nr_labels * sizeof(__u32));
if (pkt_dev->nr_labels)
mpls_push(mpls, pkt_dev);
if (pkt_dev->vlan_id != 0xffff) {
- if(pkt_dev->svlan_id != 0xffff) {
- svlan_tci = (__be16 *)skb_put(skb, sizeof(__be16));
+ if (pkt_dev->svlan_id != 0xffff) {
+ svlan_tci = skb_put(skb, sizeof(__be16));
*svlan_tci = build_tci(pkt_dev->svlan_id,
pkt_dev->svlan_cfi,
pkt_dev->svlan_p);
- svlan_encapsulated_proto = (__be16 *)skb_put(skb, sizeof(__be16));
- *svlan_encapsulated_proto = __constant_htons(ETH_P_8021Q);
+ svlan_encapsulated_proto = skb_put(skb,
+ sizeof(__be16));
+ *svlan_encapsulated_proto = htons(ETH_P_8021Q);
}
- vlan_tci = (__be16 *)skb_put(skb, sizeof(__be16));
+ vlan_tci = skb_put(skb, sizeof(__be16));
*vlan_tci = build_tci(pkt_dev->vlan_id,
pkt_dev->vlan_cfi,
pkt_dev->vlan_p);
- vlan_encapsulated_proto = (__be16 *)skb_put(skb, sizeof(__be16));
- *vlan_encapsulated_proto = __constant_htons(ETH_P_IPV6);
+ vlan_encapsulated_proto = skb_put(skb, sizeof(__be16));
+ *vlan_encapsulated_proto = htons(ETH_P_IPV6);
}
- iph = (struct ipv6hdr *)skb_put(skb, sizeof(struct ipv6hdr));
- udph = (struct udphdr *)skb_put(skb, sizeof(struct udphdr));
+ skb_reset_mac_header(skb);
+ skb_set_network_header(skb, skb->len);
+ iph = skb_put(skb, sizeof(struct ipv6hdr));
+
+ skb_set_transport_header(skb, skb->len);
+ udph = skb_put(skb, sizeof(struct udphdr));
+ skb_set_queue_mapping(skb, queue_map);
+ skb->priority = pkt_dev->skb_priority;
memcpy(eth, pkt_dev->hh, 12);
- *(__be16 *) & eth[12] = protocol;
+ *(__be16 *) &eth[12] = protocol;
/* Eth + IPh + UDPh + mpls */
datalen = pkt_dev->cur_pkt_size - 14 -
sizeof(struct ipv6hdr) - sizeof(struct udphdr) -
- pkt_dev->nr_labels*sizeof(u32) - VLAN_TAG_SIZE(pkt_dev) - SVLAN_TAG_SIZE(pkt_dev);
+ pkt_dev->pkt_overhead;
- if (datalen < sizeof(struct pktgen_hdr)) {
+ if (datalen < 0 || datalen < sizeof(struct pktgen_hdr)) {
datalen = sizeof(struct pktgen_hdr);
- if (net_ratelimit())
- printk(KERN_INFO "pktgen: increased datalen to %d\n",
- datalen);
+ net_info_ratelimited("increased datalen to %d\n", datalen);
}
+ udplen = datalen + sizeof(struct udphdr);
udph->source = htons(pkt_dev->cur_udp_src);
udph->dest = htons(pkt_dev->cur_udp_dst);
- udph->len = htons(datalen + sizeof(struct udphdr));
- udph->check = 0; /* No checksum */
+ udph->len = htons(udplen);
+ udph->check = 0;
- *(__be32 *) iph = __constant_htonl(0x60000000); /* Version + flow */
+ *(__be32 *) iph = htonl(0x60000000); /* Version + flow */
if (pkt_dev->traffic_class) {
/* Version + traffic class + flow (0) */
@@ -2737,93 +3149,40 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev,
iph->hop_limit = 32;
- iph->payload_len = htons(sizeof(struct udphdr) + datalen);
+ iph->payload_len = htons(udplen);
iph->nexthdr = IPPROTO_UDP;
- ipv6_addr_copy(&iph->daddr, &pkt_dev->cur_in6_daddr);
- ipv6_addr_copy(&iph->saddr, &pkt_dev->cur_in6_saddr);
+ iph->daddr = pkt_dev->cur_in6_daddr;
+ iph->saddr = pkt_dev->cur_in6_saddr;
- skb->mac.raw = ((u8 *) iph) - 14 - pkt_dev->nr_labels*sizeof(u32) -
- VLAN_TAG_SIZE(pkt_dev) - SVLAN_TAG_SIZE(pkt_dev);
skb->protocol = protocol;
skb->dev = odev;
skb->pkt_type = PACKET_HOST;
- skb->nh.ipv6h = iph;
- skb->h.uh = udph;
- if (pkt_dev->nfrags <= 0)
- pgh = (struct pktgen_hdr *)skb_put(skb, datalen);
- else {
- int frags = pkt_dev->nfrags;
- int i;
-
- pgh = (struct pktgen_hdr *)(((char *)(udph)) + 8);
-
- if (frags > MAX_SKB_FRAGS)
- frags = MAX_SKB_FRAGS;
- if (datalen > frags * PAGE_SIZE) {
- skb_put(skb, datalen - frags * PAGE_SIZE);
- datalen = frags * PAGE_SIZE;
- }
-
- i = 0;
- while (datalen > 0) {
- struct page *page = alloc_pages(GFP_KERNEL, 0);
- skb_shinfo(skb)->frags[i].page = page;
- skb_shinfo(skb)->frags[i].page_offset = 0;
- skb_shinfo(skb)->frags[i].size =
- (datalen < PAGE_SIZE ? datalen : PAGE_SIZE);
- datalen -= skb_shinfo(skb)->frags[i].size;
- skb->len += skb_shinfo(skb)->frags[i].size;
- skb->data_len += skb_shinfo(skb)->frags[i].size;
- i++;
- skb_shinfo(skb)->nr_frags = i;
- }
+ pktgen_finalize_skb(pkt_dev, skb, datalen);
- while (i < frags) {
- int rem;
-
- if (i == 0)
- break;
-
- rem = skb_shinfo(skb)->frags[i - 1].size / 2;
- if (rem == 0)
- break;
-
- skb_shinfo(skb)->frags[i - 1].size -= rem;
-
- skb_shinfo(skb)->frags[i] =
- skb_shinfo(skb)->frags[i - 1];
- get_page(skb_shinfo(skb)->frags[i].page);
- skb_shinfo(skb)->frags[i].page =
- skb_shinfo(skb)->frags[i - 1].page;
- skb_shinfo(skb)->frags[i].page_offset +=
- skb_shinfo(skb)->frags[i - 1].size;
- skb_shinfo(skb)->frags[i].size = rem;
- i++;
- skb_shinfo(skb)->nr_frags = i;
- }
- }
-
- /* Stamp the time, and sequence number, convert them to network byte order */
- /* should we update cloned packets too ? */
- if (pgh) {
- struct timeval timestamp;
+ if (!(pkt_dev->flags & F_UDPCSUM)) {
+ skb->ip_summed = CHECKSUM_NONE;
+ } else if (odev->features & (NETIF_F_HW_CSUM | NETIF_F_IPV6_CSUM)) {
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ skb->csum_start = skb_transport_header(skb) - skb->head;
+ skb->csum_offset = offsetof(struct udphdr, check);
+ udph->check = ~csum_ipv6_magic(&iph->saddr, &iph->daddr, udplen, IPPROTO_UDP, 0);
+ } else {
+ __wsum csum = skb_checksum(skb, skb_transport_offset(skb), udplen, 0);
- pgh->pgh_magic = htonl(PKTGEN_MAGIC);
- pgh->seq_num = htonl(pkt_dev->seq_num);
+ /* add protocol-dependent pseudo-header */
+ udph->check = csum_ipv6_magic(&iph->saddr, &iph->daddr, udplen, IPPROTO_UDP, csum);
- do_gettimeofday(&timestamp);
- pgh->tv_sec = htonl(timestamp.tv_sec);
- pgh->tv_usec = htonl(timestamp.tv_usec);
+ if (udph->check == 0)
+ udph->check = CSUM_MANGLED_0;
}
- /* pkt_dev->seq_num++; FF: you really mean this? */
return skb;
}
-static inline struct sk_buff *fill_packet(struct net_device *odev,
- struct pktgen_dev *pkt_dev)
+static struct sk_buff *fill_packet(struct net_device *odev,
+ struct pktgen_dev *pkt_dev)
{
if (pkt_dev->flags & F_IPV6)
return fill_packet_ipv6(odev, pkt_dev);
@@ -2847,10 +3206,10 @@ static void pktgen_run(struct pktgen_thread *t)
struct pktgen_dev *pkt_dev;
int started = 0;
- PG_DEBUG(printk("pktgen: entering pktgen_run. %p\n", t));
+ func_enter();
- if_lock(t);
- list_for_each_entry(pkt_dev, &t->if_list, list) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(pkt_dev, &t->if_list, list) {
/*
* setup odev and create initial packet.
@@ -2859,134 +3218,156 @@ static void pktgen_run(struct pktgen_thread *t)
if (pkt_dev->odev) {
pktgen_clear_counters(pkt_dev);
- pkt_dev->running = 1; /* Cranke yeself! */
pkt_dev->skb = NULL;
- pkt_dev->started_at = getCurUs();
- pkt_dev->next_tx_us = getCurUs(); /* Transmit immediately */
- pkt_dev->next_tx_ns = 0;
+ pkt_dev->started_at = pkt_dev->next_tx = ktime_get();
+
+ set_pkt_overhead(pkt_dev);
- strcpy(pkt_dev->result, "Starting");
+ strscpy(pkt_dev->result, "Starting");
+ pkt_dev->running = 1; /* Cranke yeself! */
started++;
} else
- strcpy(pkt_dev->result, "Error starting");
+ strscpy(pkt_dev->result, "Error starting");
}
- if_unlock(t);
+ rcu_read_unlock();
if (started)
t->control &= ~(T_STOP);
}
-static void pktgen_stop_all_threads_ifs(void)
+static void pktgen_handle_all_threads(struct pktgen_net *pn, u32 flags)
{
struct pktgen_thread *t;
- PG_DEBUG(printk("pktgen: entering pktgen_stop_all_threads_ifs.\n"));
-
mutex_lock(&pktgen_thread_lock);
- list_for_each_entry(t, &pktgen_threads, th_list)
- t->control |= T_STOP;
+ list_for_each_entry(t, &pn->pktgen_threads, th_list)
+ t->control |= (flags);
mutex_unlock(&pktgen_thread_lock);
}
-static int thread_is_running(struct pktgen_thread *t)
+static void pktgen_stop_all_threads(struct pktgen_net *pn)
{
- struct pktgen_dev *pkt_dev;
- int res = 0;
+ func_enter();
- list_for_each_entry(pkt_dev, &t->if_list, list)
+ pktgen_handle_all_threads(pn, T_STOP);
+}
+
+static int thread_is_running(const struct pktgen_thread *t)
+{
+ const struct pktgen_dev *pkt_dev;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(pkt_dev, &t->if_list, list)
if (pkt_dev->running) {
- res = 1;
- break;
+ rcu_read_unlock();
+ return 1;
}
- return res;
+ rcu_read_unlock();
+ return 0;
}
static int pktgen_wait_thread_run(struct pktgen_thread *t)
{
- if_lock(t);
-
while (thread_is_running(t)) {
- if_unlock(t);
-
+ /* note: 't' will still be around even after the unlock/lock
+ * cycle because pktgen_thread threads are only cleared at
+ * net exit
+ */
+ mutex_unlock(&pktgen_thread_lock);
msleep_interruptible(100);
+ mutex_lock(&pktgen_thread_lock);
if (signal_pending(current))
goto signal;
- if_lock(t);
}
- if_unlock(t);
return 1;
signal:
return 0;
}
-static int pktgen_wait_all_threads_run(void)
+static int pktgen_wait_all_threads_run(struct pktgen_net *pn)
{
struct pktgen_thread *t;
int sig = 1;
+ /* prevent from racing with rmmod */
+ if (!try_module_get(THIS_MODULE))
+ return sig;
+
mutex_lock(&pktgen_thread_lock);
- list_for_each_entry(t, &pktgen_threads, th_list) {
+ list_for_each_entry(t, &pn->pktgen_threads, th_list) {
sig = pktgen_wait_thread_run(t);
if (sig == 0)
break;
}
if (sig == 0)
- list_for_each_entry(t, &pktgen_threads, th_list)
+ list_for_each_entry(t, &pn->pktgen_threads, th_list)
t->control |= (T_STOP);
mutex_unlock(&pktgen_thread_lock);
+ module_put(THIS_MODULE);
return sig;
}
-static void pktgen_run_all_threads(void)
+static void pktgen_run_all_threads(struct pktgen_net *pn)
{
- struct pktgen_thread *t;
+ func_enter();
- PG_DEBUG(printk("pktgen: entering pktgen_run_all_threads.\n"));
+ pktgen_handle_all_threads(pn, T_RUN);
- mutex_lock(&pktgen_thread_lock);
+ /* Propagate thread->control */
+ schedule_timeout_interruptible(msecs_to_jiffies(125));
- list_for_each_entry(t, &pktgen_threads, th_list)
- t->control |= (T_RUN);
+ pktgen_wait_all_threads_run(pn);
+}
- mutex_unlock(&pktgen_thread_lock);
+static void pktgen_reset_all_threads(struct pktgen_net *pn)
+{
+ func_enter();
- schedule_timeout_interruptible(msecs_to_jiffies(125)); /* Propagate thread->control */
+ pktgen_handle_all_threads(pn, T_REMDEVALL);
- pktgen_wait_all_threads_run();
+ /* Propagate thread->control */
+ schedule_timeout_interruptible(msecs_to_jiffies(125));
+
+ pktgen_wait_all_threads_run(pn);
}
static void show_results(struct pktgen_dev *pkt_dev, int nr_frags)
{
- __u64 total_us, bps, mbps, pps, idle;
+ __u64 bps, mbps, pps;
char *p = pkt_dev->result;
-
- total_us = pkt_dev->stopped_at - pkt_dev->started_at;
-
- idle = pkt_dev->idle_acc;
+ ktime_t elapsed = ktime_sub(pkt_dev->stopped_at,
+ pkt_dev->started_at);
+ ktime_t idle = ns_to_ktime(pkt_dev->idle_acc);
p += sprintf(p, "OK: %llu(c%llu+d%llu) usec, %llu (%dbyte,%dfrags)\n",
- (unsigned long long)total_us,
- (unsigned long long)(total_us - idle),
- (unsigned long long)idle,
+ (unsigned long long)ktime_to_us(elapsed),
+ (unsigned long long)ktime_to_us(ktime_sub(elapsed, idle)),
+ (unsigned long long)ktime_to_us(idle),
(unsigned long long)pkt_dev->sofar,
pkt_dev->cur_pkt_size, nr_frags);
- pps = pkt_dev->sofar * USEC_PER_SEC;
+ pps = div64_u64(pkt_dev->sofar * NSEC_PER_SEC,
+ ktime_to_ns(elapsed));
- while ((total_us >> 32) != 0) {
- pps >>= 1;
- total_us >>= 1;
- }
-
- do_div(pps, total_us);
+ if (pkt_dev->n_imix_entries > 0) {
+ int i;
+ struct imix_pkt *entry;
- bps = pps * 8 * pkt_dev->cur_pkt_size;
+ bps = 0;
+ for (i = 0; i < pkt_dev->n_imix_entries; i++) {
+ entry = &pkt_dev->imix_entries[i];
+ bps += entry->size * entry->count_so_far;
+ }
+ bps = div64_u64(bps * 8 * NSEC_PER_SEC, ktime_to_ns(elapsed));
+ } else {
+ bps = pps * 8 * pkt_dev->cur_pkt_size;
+ }
mbps = bps;
do_div(mbps, 1000000);
@@ -2998,19 +3379,20 @@ static void show_results(struct pktgen_dev *pkt_dev, int nr_frags)
}
/* Set stopped-at timer, remove from running list, do counters & statistics */
-
static int pktgen_stop_device(struct pktgen_dev *pkt_dev)
{
int nr_frags = pkt_dev->skb ? skb_shinfo(pkt_dev->skb)->nr_frags : -1;
if (!pkt_dev->running) {
- printk("pktgen: interface: %s is already stopped\n",
- pkt_dev->ifname);
+ pr_warn("interface: %s is already stopped\n",
+ pkt_dev->odevname);
return -EINVAL;
}
- pkt_dev->stopped_at = getCurUs();
pkt_dev->running = 0;
+ kfree_skb(pkt_dev->skb);
+ pkt_dev->skb = NULL;
+ pkt_dev->stopped_at = ktime_get();
show_results(pkt_dev, nr_frags);
@@ -3021,17 +3403,17 @@ static struct pktgen_dev *next_to_run(struct pktgen_thread *t)
{
struct pktgen_dev *pkt_dev, *best = NULL;
- if_lock(t);
-
- list_for_each_entry(pkt_dev, &t->if_list, list) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(pkt_dev, &t->if_list, list) {
if (!pkt_dev->running)
continue;
if (best == NULL)
best = pkt_dev;
- else if (pkt_dev->next_tx_us < best->next_tx_us)
+ else if (ktime_compare(pkt_dev->next_tx, best->next_tx) < 0)
best = pkt_dev;
}
- if_unlock(t);
+ rcu_read_unlock();
+
return best;
}
@@ -3039,19 +3421,15 @@ static void pktgen_stop(struct pktgen_thread *t)
{
struct pktgen_dev *pkt_dev;
- PG_DEBUG(printk("pktgen: entering pktgen_stop\n"));
+ func_enter();
- if_lock(t);
+ rcu_read_lock();
- list_for_each_entry(pkt_dev, &t->if_list, list) {
+ list_for_each_entry_rcu(pkt_dev, &t->if_list, list) {
pktgen_stop_device(pkt_dev);
- if (pkt_dev->skb)
- kfree_skb(pkt_dev->skb);
-
- pkt_dev->skb = NULL;
}
- if_unlock(t);
+ rcu_read_unlock();
}
/*
@@ -3063,9 +3441,7 @@ static void pktgen_rem_one_if(struct pktgen_thread *t)
struct list_head *q, *n;
struct pktgen_dev *cur;
- PG_DEBUG(printk("pktgen: entering pktgen_rem_one_if\n"));
-
- if_lock(t);
+ func_enter();
list_for_each_safe(q, n, &t->if_list) {
cur = list_entry(q, struct pktgen_dev, list);
@@ -3073,16 +3449,13 @@ static void pktgen_rem_one_if(struct pktgen_thread *t)
if (!cur->removal_mark)
continue;
- if (cur->skb)
- kfree_skb(cur->skb);
+ kfree_skb(cur->skb);
cur->skb = NULL;
pktgen_remove_device(t, cur);
break;
}
-
- if_unlock(t);
}
static void pktgen_rem_all_ifs(struct pktgen_thread *t)
@@ -3090,260 +3463,274 @@ static void pktgen_rem_all_ifs(struct pktgen_thread *t)
struct list_head *q, *n;
struct pktgen_dev *cur;
- /* Remove all devices, free mem */
+ func_enter();
- PG_DEBUG(printk("pktgen: entering pktgen_rem_all_ifs\n"));
- if_lock(t);
+ /* Remove all devices, free mem */
list_for_each_safe(q, n, &t->if_list) {
cur = list_entry(q, struct pktgen_dev, list);
- if (cur->skb)
- kfree_skb(cur->skb);
+ kfree_skb(cur->skb);
cur->skb = NULL;
pktgen_remove_device(t, cur);
}
-
- if_unlock(t);
}
static void pktgen_rem_thread(struct pktgen_thread *t)
{
/* Remove from the thread list */
-
- remove_proc_entry(t->name, pg_proc_dir);
-
- mutex_lock(&pktgen_thread_lock);
-
- list_del(&t->th_list);
-
- mutex_unlock(&pktgen_thread_lock);
+ remove_proc_entry(t->tsk->comm, t->net->proc_dir);
}
-static __inline__ void pktgen_xmit(struct pktgen_dev *pkt_dev)
+static void pktgen_resched(struct pktgen_dev *pkt_dev)
{
- struct net_device *odev = NULL;
- __u64 idle_start = 0;
- int ret;
+ ktime_t idle_start = ktime_get();
- odev = pkt_dev->odev;
+ schedule();
+ pkt_dev->idle_acc += ktime_to_ns(ktime_sub(ktime_get(), idle_start));
+}
- if (pkt_dev->delay_us || pkt_dev->delay_ns) {
- u64 now;
+static void pktgen_wait_for_skb(struct pktgen_dev *pkt_dev)
+{
+ ktime_t idle_start = ktime_get();
- now = getCurUs();
- if (now < pkt_dev->next_tx_us)
- spin(pkt_dev, pkt_dev->next_tx_us);
+ while (refcount_read(&(pkt_dev->skb->users)) != 1) {
+ if (signal_pending(current))
+ break;
- /* This is max DELAY, this has special meaning of
- * "never transmit"
- */
- if (pkt_dev->delay_us == 0x7FFFFFFF) {
- pkt_dev->next_tx_us = getCurUs() + pkt_dev->delay_us;
- pkt_dev->next_tx_ns = pkt_dev->delay_ns;
- goto out;
- }
+ if (need_resched())
+ pktgen_resched(pkt_dev);
+ else
+ cpu_relax();
}
+ pkt_dev->idle_acc += ktime_to_ns(ktime_sub(ktime_get(), idle_start));
+}
- if (netif_queue_stopped(odev) || need_resched()) {
- idle_start = getCurUs();
+static void pktgen_xmit(struct pktgen_dev *pkt_dev)
+{
+ bool skb_shared = !!(READ_ONCE(pkt_dev->flags) & F_SHARED);
+ struct net_device *odev = pkt_dev->odev;
+ struct netdev_queue *txq;
+ unsigned int burst = 1;
+ struct sk_buff *skb;
+ int clone_skb = 0;
+ int ret;
- if (!netif_running(odev)) {
- pktgen_stop_device(pkt_dev);
- if (pkt_dev->skb)
- kfree_skb(pkt_dev->skb);
- pkt_dev->skb = NULL;
- goto out;
- }
- if (need_resched())
- schedule();
+ /* If 'skb_shared' is false, the read of possible
+ * new values (if any) for 'burst' and 'clone_skb' will be skipped to
+ * prevent some concurrent changes from slipping in. And the stabilized
+ * config will be read in during the next run of pktgen_xmit.
+ */
+ if (skb_shared) {
+ burst = READ_ONCE(pkt_dev->burst);
+ clone_skb = READ_ONCE(pkt_dev->clone_skb);
+ }
- pkt_dev->idle_acc += getCurUs() - idle_start;
+ /* If device is offline, then don't send */
+ if (unlikely(!netif_running(odev) || !netif_carrier_ok(odev))) {
+ pktgen_stop_device(pkt_dev);
+ return;
+ }
- if (netif_queue_stopped(odev)) {
- pkt_dev->next_tx_us = getCurUs(); /* TODO */
- pkt_dev->next_tx_ns = 0;
- goto out; /* Try the next interface */
- }
+ /* This is max DELAY, this has special meaning of
+ * "never transmit"
+ */
+ if (unlikely(pkt_dev->delay == ULLONG_MAX)) {
+ pkt_dev->next_tx = ktime_add_ns(ktime_get(), ULONG_MAX);
+ return;
}
- if (pkt_dev->last_ok || !pkt_dev->skb) {
- if ((++pkt_dev->clone_count >= pkt_dev->clone_skb)
- || (!pkt_dev->skb)) {
- /* build a new pkt */
- if (pkt_dev->skb)
- kfree_skb(pkt_dev->skb);
+ /* If no skb or clone count exhausted then get new one */
+ if (!pkt_dev->skb || (pkt_dev->last_ok &&
+ ++pkt_dev->clone_count >= clone_skb)) {
+ /* build a new pkt */
+ kfree_skb(pkt_dev->skb);
- pkt_dev->skb = fill_packet(odev, pkt_dev);
- if (pkt_dev->skb == NULL) {
- printk("pktgen: ERROR: couldn't allocate skb in fill_packet.\n");
- schedule();
- pkt_dev->clone_count--; /* back out increment, OOM */
- goto out;
- }
- pkt_dev->allocated_skbs++;
- pkt_dev->clone_count = 0; /* reset counter */
+ pkt_dev->skb = fill_packet(odev, pkt_dev);
+ if (pkt_dev->skb == NULL) {
+ pr_err("ERROR: couldn't allocate skb in fill_packet\n");
+ schedule();
+ pkt_dev->clone_count--; /* back out increment, OOM */
+ return;
}
+ pkt_dev->last_pkt_size = pkt_dev->skb->len;
+ pkt_dev->clone_count = 0; /* reset counter */
}
- netif_tx_lock_bh(odev);
- if (!netif_queue_stopped(odev)) {
-
- atomic_inc(&(pkt_dev->skb->users));
- retry_now:
- ret = odev->hard_start_xmit(pkt_dev->skb, odev);
- if (likely(ret == NETDEV_TX_OK)) {
- pkt_dev->last_ok = 1;
+ if (pkt_dev->delay && pkt_dev->last_ok)
+ spin(pkt_dev, pkt_dev->next_tx);
+
+ if (pkt_dev->xmit_mode == M_NETIF_RECEIVE) {
+ skb = pkt_dev->skb;
+ skb->protocol = eth_type_trans(skb, skb->dev);
+ if (skb_shared)
+ refcount_add(burst, &skb->users);
+ local_bh_disable();
+ do {
+ ret = netif_receive_skb(skb);
+ if (ret == NET_RX_DROP)
+ pkt_dev->errors++;
pkt_dev->sofar++;
pkt_dev->seq_num++;
- pkt_dev->tx_bytes += pkt_dev->cur_pkt_size;
-
- } else if (ret == NETDEV_TX_LOCKED
- && (odev->features & NETIF_F_LLTX)) {
- cpu_relax();
- goto retry_now;
- } else { /* Retry it next time */
+ if (unlikely(!skb_shared)) {
+ pkt_dev->skb = NULL;
+ break;
+ }
+ if (refcount_read(&skb->users) != burst) {
+ /* skb was queued by rps/rfs or taps,
+ * so cannot reuse this skb
+ */
+ WARN_ON(refcount_sub_and_test(burst - 1, &skb->users));
+ /* get out of the loop and wait
+ * until skb is consumed
+ */
+ break;
+ }
+ /* skb was 'freed' by stack, so clean few
+ * bits and reuse it
+ */
+ skb_reset_redirect(skb);
+ } while (--burst > 0);
+ goto out; /* Skips xmit_mode M_START_XMIT */
+ } else if (pkt_dev->xmit_mode == M_QUEUE_XMIT) {
+ local_bh_disable();
+ if (skb_shared)
+ refcount_inc(&pkt_dev->skb->users);
- atomic_dec(&(pkt_dev->skb->users));
+ ret = dev_queue_xmit(pkt_dev->skb);
- if (debug && net_ratelimit())
- printk(KERN_INFO "pktgen: Hard xmit error\n");
+ if (!skb_shared && dev_xmit_complete(ret))
+ pkt_dev->skb = NULL;
+ switch (ret) {
+ case NET_XMIT_SUCCESS:
+ pkt_dev->sofar++;
+ pkt_dev->seq_num++;
+ pkt_dev->tx_bytes += pkt_dev->last_pkt_size;
+ break;
+ case NET_XMIT_DROP:
+ case NET_XMIT_CN:
+ /* These are all valid return codes for a qdisc but
+ * indicate packets are being dropped or will likely
+ * be dropped soon.
+ */
+ case NETDEV_TX_BUSY:
+ /* qdisc may call dev_hard_start_xmit directly in cases
+ * where no queues exist e.g. loopback device, virtual
+ * devices, etc. In this case we need to handle
+ * NETDEV_TX_ codes.
+ */
+ default:
pkt_dev->errors++;
- pkt_dev->last_ok = 0;
+ net_info_ratelimited("%s xmit error: %d\n",
+ pkt_dev->odevname, ret);
+ break;
}
+ goto out;
+ }
- pkt_dev->next_tx_us = getCurUs();
- pkt_dev->next_tx_ns = 0;
+ txq = skb_get_tx_queue(odev, pkt_dev->skb);
- pkt_dev->next_tx_us += pkt_dev->delay_us;
- pkt_dev->next_tx_ns += pkt_dev->delay_ns;
+ local_bh_disable();
- if (pkt_dev->next_tx_ns > 1000) {
- pkt_dev->next_tx_us++;
- pkt_dev->next_tx_ns -= 1000;
- }
+ HARD_TX_LOCK(odev, txq, smp_processor_id());
+
+ if (unlikely(netif_xmit_frozen_or_drv_stopped(txq))) {
+ pkt_dev->last_ok = 0;
+ goto unlock;
}
+ if (skb_shared)
+ refcount_add(burst, &pkt_dev->skb->users);
+
+xmit_more:
+ ret = netdev_start_xmit(pkt_dev->skb, odev, txq, --burst > 0);
- else { /* Retry it next time */
+ if (!skb_shared && dev_xmit_complete(ret))
+ pkt_dev->skb = NULL;
+
+ switch (ret) {
+ case NETDEV_TX_OK:
+ pkt_dev->last_ok = 1;
+ pkt_dev->sofar++;
+ pkt_dev->seq_num++;
+ pkt_dev->tx_bytes += pkt_dev->last_pkt_size;
+ if (burst > 0 && !netif_xmit_frozen_or_drv_stopped(txq))
+ goto xmit_more;
+ break;
+ case NET_XMIT_DROP:
+ case NET_XMIT_CN:
+ /* skb has been consumed */
+ pkt_dev->errors++;
+ break;
+ default: /* Drivers are not supposed to return other values! */
+ net_info_ratelimited("%s xmit error: %d\n",
+ pkt_dev->odevname, ret);
+ pkt_dev->errors++;
+ fallthrough;
+ case NETDEV_TX_BUSY:
+ /* Retry it next time */
+ if (skb_shared)
+ refcount_dec(&pkt_dev->skb->users);
pkt_dev->last_ok = 0;
- pkt_dev->next_tx_us = getCurUs(); /* TODO */
- pkt_dev->next_tx_ns = 0;
}
+ if (unlikely(burst))
+ WARN_ON(refcount_sub_and_test(burst, &pkt_dev->skb->users));
+unlock:
+ HARD_TX_UNLOCK(odev, txq);
- netif_tx_unlock_bh(odev);
+out:
+ local_bh_enable();
/* If pkt_dev->count is zero, then run forever */
if ((pkt_dev->count != 0) && (pkt_dev->sofar >= pkt_dev->count)) {
- if (atomic_read(&(pkt_dev->skb->users)) != 1) {
- idle_start = getCurUs();
- while (atomic_read(&(pkt_dev->skb->users)) != 1) {
- if (signal_pending(current)) {
- break;
- }
- schedule();
- }
- pkt_dev->idle_acc += getCurUs() - idle_start;
- }
+ if (pkt_dev->skb)
+ pktgen_wait_for_skb(pkt_dev);
/* Done with this */
pktgen_stop_device(pkt_dev);
- if (pkt_dev->skb)
- kfree_skb(pkt_dev->skb);
- pkt_dev->skb = NULL;
}
-out:;
}
-/*
+/*
* Main loop of the thread goes here
*/
-static void pktgen_thread_worker(struct pktgen_thread *t)
+static int pktgen_thread_worker(void *arg)
{
- DEFINE_WAIT(wait);
+ struct pktgen_thread *t = arg;
struct pktgen_dev *pkt_dev = NULL;
int cpu = t->cpu;
- sigset_t tmpsig;
- u32 max_before_softirq;
- u32 tx_since_softirq = 0;
-
- daemonize("pktgen/%d", cpu);
-
- /* Block all signals except SIGKILL, SIGSTOP and SIGTERM */
-
- spin_lock_irq(&current->sighand->siglock);
- tmpsig = current->blocked;
- siginitsetinv(&current->blocked,
- sigmask(SIGKILL) | sigmask(SIGSTOP) | sigmask(SIGTERM));
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
-
- /* Migrate to the right CPU */
- set_cpus_allowed(current, cpumask_of_cpu(cpu));
- if (smp_processor_id() != cpu)
- BUG();
+ WARN_ON_ONCE(smp_processor_id() != cpu);
init_waitqueue_head(&t->queue);
+ complete(&t->start_done);
- t->control &= ~(T_TERMINATE);
- t->control &= ~(T_RUN);
- t->control &= ~(T_STOP);
- t->control &= ~(T_REMDEVALL);
- t->control &= ~(T_REMDEV);
-
- t->pid = current->pid;
-
- PG_DEBUG(printk("pktgen: starting pktgen/%d: pid=%d\n", cpu, current->pid));
+ pr_debug("starting pktgen/%d: pid=%d\n", cpu, task_pid_nr(current));
- max_before_softirq = t->max_before_softirq;
-
- __set_current_state(TASK_INTERRUPTIBLE);
- mb();
-
- while (1) {
-
- __set_current_state(TASK_RUNNING);
-
- /*
- * Get next dev to xmit -- if any.
- */
+ set_freezable();
+ while (!kthread_should_stop()) {
pkt_dev = next_to_run(t);
- if (pkt_dev) {
+ if (unlikely(!pkt_dev && t->control == 0)) {
+ if (t->net->pktgen_exiting)
+ break;
+ wait_event_freezable_timeout(t->queue,
+ t->control != 0, HZ / 10);
+ continue;
+ }
+ if (likely(pkt_dev)) {
pktgen_xmit(pkt_dev);
- /*
- * We like to stay RUNNING but must also give
- * others fair share.
- */
-
- tx_since_softirq += pkt_dev->last_ok;
-
- if (tx_since_softirq > max_before_softirq) {
- if (local_softirq_pending())
- do_softirq();
- tx_since_softirq = 0;
- }
- } else {
- prepare_to_wait(&(t->queue), &wait, TASK_INTERRUPTIBLE);
- schedule_timeout(HZ / 10);
- finish_wait(&(t->queue), &wait);
+ if (need_resched())
+ pktgen_resched(pkt_dev);
+ else
+ cpu_relax();
}
- /*
- * Back from sleep, either due to the timeout or signal.
- * We check if we have any "posted" work for us.
- */
-
- if (t->control & T_TERMINATE || signal_pending(current))
- /* we received a request to terminate ourself */
- break;
-
if (t->control & T_STOP) {
pktgen_stop(t);
t->control &= ~(T_STOP);
@@ -3364,41 +3751,45 @@ static void pktgen_thread_worker(struct pktgen_thread *t)
t->control &= ~(T_REMDEV);
}
- if (need_resched())
- schedule();
+ try_to_freeze();
}
- PG_DEBUG(printk("pktgen: %s stopping all device\n", t->name));
+ pr_debug("%s stopping all device\n", t->tsk->comm);
pktgen_stop(t);
- PG_DEBUG(printk("pktgen: %s removing all device\n", t->name));
+ pr_debug("%s removing all device\n", t->tsk->comm);
pktgen_rem_all_ifs(t);
- PG_DEBUG(printk("pktgen: %s removing thread.\n", t->name));
+ pr_debug("%s removing thread\n", t->tsk->comm);
pktgen_rem_thread(t);
- t->removed = 1;
+ return 0;
}
static struct pktgen_dev *pktgen_find_dev(struct pktgen_thread *t,
- const char *ifname)
+ const char *ifname, bool exact)
{
struct pktgen_dev *p, *pkt_dev = NULL;
- if_lock(t);
-
- list_for_each_entry(p, &t->if_list, list)
- if (strncmp(p->ifname, ifname, IFNAMSIZ) == 0) {
+ size_t len = strlen(ifname);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(p, &t->if_list, list)
+ if (strncmp(p->odevname, ifname, len) == 0) {
+ if (p->odevname[len]) {
+ if (exact || p->odevname[len] != '@')
+ continue;
+ }
pkt_dev = p;
break;
}
- if_unlock(t);
- PG_DEBUG(printk("pktgen: find_dev(%s) returning %p\n", ifname, pkt_dev));
+ rcu_read_unlock();
+ pr_debug("find_dev(%s) returning %p\n", ifname, pkt_dev);
return pkt_dev;
}
-/*
- * Adds a dev at front of if_list.
+/*
+ * Adds a dev at front of if_list.
*/
static int add_dev_to_thread(struct pktgen_thread *t,
@@ -3406,17 +3797,24 @@ static int add_dev_to_thread(struct pktgen_thread *t,
{
int rv = 0;
+ /* This function cannot be called concurrently, as its called
+ * under pktgen_thread_lock mutex, but it can run from
+ * userspace on another CPU than the kthread. The if_lock()
+ * is used here to sync with concurrent instances of
+ * _rem_dev_from_if_list() invoked via kthread, which is also
+ * updating the if_list
+ */
if_lock(t);
if (pkt_dev->pg_thread) {
- printk("pktgen: ERROR: already assigned to a thread.\n");
+ pr_err("ERROR: already assigned to a thread\n");
rv = -EBUSY;
goto out;
}
- list_add(&pkt_dev->list, &t->if_list);
- pkt_dev->pg_thread = t;
pkt_dev->running = 0;
+ pkt_dev->pg_thread = t;
+ list_add_rcu(&pkt_dev->list, &t->if_list);
out:
if_unlock(t);
@@ -3428,147 +3826,143 @@ out:
static int pktgen_add_device(struct pktgen_thread *t, const char *ifname)
{
struct pktgen_dev *pkt_dev;
- struct proc_dir_entry *pe;
+ int err;
+ int node = cpu_to_node(t->cpu);
/* We don't allow a device to be on several threads */
- pkt_dev = __pktgen_NN_threads(ifname, FIND);
+ pkt_dev = __pktgen_NN_threads(t->net, ifname, FIND);
if (pkt_dev) {
- printk("pktgen: ERROR: interface already used.\n");
+ pr_err("ERROR: interface already used\n");
return -EBUSY;
}
- pkt_dev = kzalloc(sizeof(struct pktgen_dev), GFP_KERNEL);
+ pkt_dev = kzalloc_node(sizeof(struct pktgen_dev), GFP_KERNEL, node);
if (!pkt_dev)
return -ENOMEM;
- pkt_dev->flows = vmalloc(MAX_CFLOWS * sizeof(struct flow_state));
+ strscpy(pkt_dev->odevname, ifname);
+ pkt_dev->flows = vzalloc_node(array_size(MAX_CFLOWS,
+ sizeof(struct flow_state)),
+ node);
if (pkt_dev->flows == NULL) {
kfree(pkt_dev);
return -ENOMEM;
}
- memset(pkt_dev->flows, 0, MAX_CFLOWS * sizeof(struct flow_state));
pkt_dev->removal_mark = 0;
- pkt_dev->min_pkt_size = ETH_ZLEN;
- pkt_dev->max_pkt_size = ETH_ZLEN;
pkt_dev->nfrags = 0;
- pkt_dev->clone_skb = pg_clone_skb_d;
- pkt_dev->delay_us = pg_delay_d / 1000;
- pkt_dev->delay_ns = pg_delay_d % 1000;
+ pkt_dev->delay = pg_delay_d;
pkt_dev->count = pg_count_d;
pkt_dev->sofar = 0;
pkt_dev->udp_src_min = 9; /* sink port */
pkt_dev->udp_src_max = 9;
pkt_dev->udp_dst_min = 9;
pkt_dev->udp_dst_max = 9;
-
pkt_dev->vlan_p = 0;
pkt_dev->vlan_cfi = 0;
pkt_dev->vlan_id = 0xffff;
pkt_dev->svlan_p = 0;
pkt_dev->svlan_cfi = 0;
pkt_dev->svlan_id = 0xffff;
-
- strncpy(pkt_dev->ifname, ifname, IFNAMSIZ);
-
- if (!pktgen_setup_dev(pkt_dev)) {
- printk("pktgen: ERROR: pktgen_setup_dev failed.\n");
- if (pkt_dev->flows)
- vfree(pkt_dev->flows);
- kfree(pkt_dev);
- return -ENODEV;
- }
-
- pe = create_proc_entry(ifname, 0600, pg_proc_dir);
- if (!pe) {
- printk("pktgen: cannot create %s/%s procfs entry.\n",
+ pkt_dev->burst = 1;
+ pkt_dev->node = NUMA_NO_NODE;
+ pkt_dev->flags = F_SHARED; /* SKB shared by default */
+
+ err = pktgen_setup_dev(t->net, pkt_dev, ifname);
+ if (err)
+ goto out1;
+ if (pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING)
+ pkt_dev->clone_skb = pg_clone_skb_d;
+
+ pkt_dev->entry = proc_create_data(ifname, 0600, t->net->proc_dir,
+ &pktgen_if_proc_ops, pkt_dev);
+ if (!pkt_dev->entry) {
+ pr_err("cannot create %s/%s procfs entry\n",
PG_PROC_DIR, ifname);
- if (pkt_dev->flows)
- vfree(pkt_dev->flows);
- kfree(pkt_dev);
- return -EINVAL;
+ err = -EINVAL;
+ goto out2;
}
- pe->proc_fops = &pktgen_if_fops;
- pe->data = pkt_dev;
+#ifdef CONFIG_XFRM
+ pkt_dev->ipsmode = XFRM_MODE_TRANSPORT;
+ pkt_dev->ipsproto = IPPROTO_ESP;
+
+ /* xfrm tunnel mode needs additional dst to extract outer
+ * ip header protocol/ttl/id field, here create a phony one.
+ * instead of looking for a valid rt, which definitely hurting
+ * performance under such circumstance.
+ */
+ pkt_dev->dstops.family = AF_INET;
+ pkt_dev->xdst.u.dst.dev = pkt_dev->odev;
+ dst_init_metrics(&pkt_dev->xdst.u.dst, pktgen_dst_metrics, false);
+ pkt_dev->xdst.child = &pkt_dev->xdst.u.dst;
+ pkt_dev->xdst.u.dst.ops = &pkt_dev->dstops;
+#endif
return add_dev_to_thread(t, pkt_dev);
+out2:
+ netdev_put(pkt_dev->odev, &pkt_dev->dev_tracker);
+out1:
+#ifdef CONFIG_XFRM
+ free_SAs(pkt_dev);
+#endif
+ vfree(pkt_dev->flows);
+ kfree(pkt_dev);
+ return err;
}
-static struct pktgen_thread *__init pktgen_find_thread(const char *name)
+static int __net_init pktgen_create_thread(int cpu, struct pktgen_net *pn)
{
struct pktgen_thread *t;
-
- mutex_lock(&pktgen_thread_lock);
-
- list_for_each_entry(t, &pktgen_threads, th_list)
- if (strcmp(t->name, name) == 0) {
- mutex_unlock(&pktgen_thread_lock);
- return t;
- }
-
- mutex_unlock(&pktgen_thread_lock);
- return NULL;
-}
-
-static int __init pktgen_create_thread(const char *name, int cpu)
-{
- int err;
- struct pktgen_thread *t = NULL;
struct proc_dir_entry *pe;
+ struct task_struct *p;
- if (strlen(name) > 31) {
- printk("pktgen: ERROR: Thread name cannot be more than 31 characters.\n");
- return -EINVAL;
- }
-
- if (pktgen_find_thread(name)) {
- printk("pktgen: ERROR: thread: %s already exists\n", name);
- return -EINVAL;
- }
-
- t = kzalloc(sizeof(struct pktgen_thread), GFP_KERNEL);
+ t = kzalloc_node(sizeof(struct pktgen_thread), GFP_KERNEL,
+ cpu_to_node(cpu));
if (!t) {
- printk("pktgen: ERROR: out of memory, can't create new thread.\n");
+ pr_err("ERROR: out of memory, can't create new thread\n");
return -ENOMEM;
}
- strcpy(t->name, name);
- spin_lock_init(&t->if_lock);
+ mutex_init(&t->if_lock);
t->cpu = cpu;
- pe = create_proc_entry(t->name, 0600, pg_proc_dir);
- if (!pe) {
- printk("pktgen: cannot create %s/%s procfs entry.\n",
- PG_PROC_DIR, t->name);
- kfree(t);
- return -EINVAL;
- }
-
- pe->proc_fops = &pktgen_thread_fops;
- pe->data = t;
-
INIT_LIST_HEAD(&t->if_list);
- list_add_tail(&t->th_list, &pktgen_threads);
+ list_add_tail(&t->th_list, &pn->pktgen_threads);
+ init_completion(&t->start_done);
- t->removed = 0;
+ p = kthread_create_on_cpu(pktgen_thread_worker, t, cpu, "kpktgend_%d");
+ if (IS_ERR(p)) {
+ pr_err("kthread_create_on_node() failed for cpu %d\n", t->cpu);
+ list_del(&t->th_list);
+ kfree(t);
+ return PTR_ERR(p);
+ }
+
+ t->tsk = p;
- err = kernel_thread((void *)pktgen_thread_worker, (void *)t,
- CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
- if (err < 0) {
- printk("pktgen: kernel_thread() failed for cpu %d\n", t->cpu);
- remove_proc_entry(t->name, pg_proc_dir);
+ pe = proc_create_data(t->tsk->comm, 0600, pn->proc_dir,
+ &pktgen_thread_proc_ops, t);
+ if (!pe) {
+ pr_err("cannot create %s/%s procfs entry\n",
+ PG_PROC_DIR, t->tsk->comm);
+ kthread_stop(p);
list_del(&t->th_list);
kfree(t);
- return err;
+ return -EINVAL;
}
+ t->net = pn;
+ get_task_struct(p);
+ wake_up_process(p);
+ wait_for_completion(&t->start_done);
+
return 0;
}
-/*
- * Removes a device from the thread if_list.
+/*
+ * Removes a device from the thread if_list.
*/
static void _rem_dev_from_if_list(struct pktgen_thread *t,
struct pktgen_dev *pkt_dev)
@@ -3576,124 +3970,164 @@ static void _rem_dev_from_if_list(struct pktgen_thread *t,
struct list_head *q, *n;
struct pktgen_dev *p;
+ if_lock(t);
list_for_each_safe(q, n, &t->if_list) {
p = list_entry(q, struct pktgen_dev, list);
if (p == pkt_dev)
- list_del(&p->list);
+ list_del_rcu(&p->list);
}
+ if_unlock(t);
}
static int pktgen_remove_device(struct pktgen_thread *t,
struct pktgen_dev *pkt_dev)
{
-
- PG_DEBUG(printk("pktgen: remove_device pkt_dev=%p\n", pkt_dev));
+ pr_debug("remove_device pkt_dev=%p\n", pkt_dev);
if (pkt_dev->running) {
- printk("pktgen:WARNING: trying to remove a running interface, stopping it now.\n");
+ pr_warn("WARNING: trying to remove a running interface, stopping it now\n");
pktgen_stop_device(pkt_dev);
}
/* Dis-associate from the interface */
if (pkt_dev->odev) {
- dev_put(pkt_dev->odev);
+ netdev_put(pkt_dev->odev, &pkt_dev->dev_tracker);
pkt_dev->odev = NULL;
}
- /* And update the thread if_list */
+ /* Remove proc before if_list entry, because add_device uses
+ * list to determine if interface already exist, avoid race
+ * with proc_create_data()
+ */
+ proc_remove(pkt_dev->entry);
+ /* And update the thread if_list */
_rem_dev_from_if_list(t, pkt_dev);
- /* Clean up proc file system */
-
- remove_proc_entry(pkt_dev->ifname, pg_proc_dir);
-
- if (pkt_dev->flows)
- vfree(pkt_dev->flows);
- kfree(pkt_dev);
+#ifdef CONFIG_XFRM
+ free_SAs(pkt_dev);
+#endif
+ vfree(pkt_dev->flows);
+ if (pkt_dev->page)
+ put_page(pkt_dev->page);
+ kfree_rcu(pkt_dev, rcu);
return 0;
}
-static int __init pg_init(void)
+static int __net_init pg_net_init(struct net *net)
{
- int cpu;
+ struct pktgen_net *pn = net_generic(net, pg_net_id);
struct proc_dir_entry *pe;
-
- printk(version);
-
- pg_proc_dir = proc_mkdir(PG_PROC_DIR, proc_net);
- if (!pg_proc_dir)
+ int cpu, ret = 0;
+
+ pn->net = net;
+ INIT_LIST_HEAD(&pn->pktgen_threads);
+ pn->pktgen_exiting = false;
+ pn->proc_dir = proc_mkdir(PG_PROC_DIR, pn->net->proc_net);
+ if (!pn->proc_dir) {
+ pr_warn("cannot create /proc/net/%s\n", PG_PROC_DIR);
return -ENODEV;
- pg_proc_dir->owner = THIS_MODULE;
-
- pe = create_proc_entry(PGCTRL, 0600, pg_proc_dir);
+ }
+ pe = proc_create(PGCTRL, 0600, pn->proc_dir, &pktgen_proc_ops);
if (pe == NULL) {
- printk("pktgen: ERROR: cannot create %s procfs entry.\n",
- PGCTRL);
- proc_net_remove(PG_PROC_DIR);
- return -EINVAL;
+ pr_err("cannot create %s procfs entry\n", PGCTRL);
+ ret = -EINVAL;
+ goto remove;
}
- pe->proc_fops = &pktgen_fops;
- pe->data = NULL;
-
- /* Register us to receive netdevice events */
- register_netdevice_notifier(&pktgen_notifier_block);
-
+ cpus_read_lock();
for_each_online_cpu(cpu) {
int err;
- char buf[30];
- sprintf(buf, "kpktgend_%i", cpu);
- err = pktgen_create_thread(buf, cpu);
+ err = pktgen_create_thread(cpu, pn);
if (err)
- printk("pktgen: WARNING: Cannot create thread for cpu %d (%d)\n",
- cpu, err);
+ pr_warn("Cannot create thread for cpu %d (%d)\n",
+ cpu, err);
}
+ cpus_read_unlock();
- if (list_empty(&pktgen_threads)) {
- printk("pktgen: ERROR: Initialization failed for all threads\n");
- unregister_netdevice_notifier(&pktgen_notifier_block);
- remove_proc_entry(PGCTRL, pg_proc_dir);
- proc_net_remove(PG_PROC_DIR);
- return -ENODEV;
+ if (list_empty(&pn->pktgen_threads)) {
+ pr_err("Initialization failed for all threads\n");
+ ret = -ENODEV;
+ goto remove_entry;
}
return 0;
+
+remove_entry:
+ remove_proc_entry(PGCTRL, pn->proc_dir);
+remove:
+ remove_proc_entry(PG_PROC_DIR, pn->net->proc_net);
+ return ret;
}
-static void __exit pg_cleanup(void)
+static void __net_exit pg_net_exit(struct net *net)
{
+ struct pktgen_net *pn = net_generic(net, pg_net_id);
struct pktgen_thread *t;
struct list_head *q, *n;
- wait_queue_head_t queue;
- init_waitqueue_head(&queue);
+ LIST_HEAD(list);
/* Stop all interfaces & threads */
+ pn->pktgen_exiting = true;
- list_for_each_safe(q, n, &pktgen_threads) {
- t = list_entry(q, struct pktgen_thread, th_list);
- t->control |= (T_TERMINATE);
+ mutex_lock(&pktgen_thread_lock);
+ list_splice_init(&pn->pktgen_threads, &list);
+ mutex_unlock(&pktgen_thread_lock);
- wait_event_interruptible_timeout(queue, (t->removed == 1), HZ);
+ list_for_each_safe(q, n, &list) {
+ t = list_entry(q, struct pktgen_thread, th_list);
+ list_del(&t->th_list);
+ kthread_stop_put(t->tsk);
+ kfree(t);
}
- /* Un-register us from receiving netdevice events */
- unregister_netdevice_notifier(&pktgen_notifier_block);
+ remove_proc_entry(PGCTRL, pn->proc_dir);
+ remove_proc_entry(PG_PROC_DIR, pn->net->proc_net);
+}
- /* Clean up proc file system */
- remove_proc_entry(PGCTRL, pg_proc_dir);
- proc_net_remove(PG_PROC_DIR);
+static struct pernet_operations pg_net_ops = {
+ .init = pg_net_init,
+ .exit = pg_net_exit,
+ .id = &pg_net_id,
+ .size = sizeof(struct pktgen_net),
+};
+
+static int __init pg_init(void)
+{
+ int ret = 0;
+
+ pr_info("%s", version);
+ ret = register_pernet_subsys(&pg_net_ops);
+ if (ret)
+ return ret;
+ ret = register_netdevice_notifier(&pktgen_notifier_block);
+ if (ret)
+ unregister_pernet_subsys(&pg_net_ops);
+
+ return ret;
+}
+
+static void __exit pg_cleanup(void)
+{
+ unregister_netdevice_notifier(&pktgen_notifier_block);
+ unregister_pernet_subsys(&pg_net_ops);
+ /* Don't need rcu_barrier() due to use of kfree_rcu() */
}
module_init(pg_init);
module_exit(pg_cleanup);
-MODULE_AUTHOR("Robert Olsson <robert.olsson@its.uu.se");
+MODULE_AUTHOR("Robert Olsson <robert.olsson@its.uu.se>");
MODULE_DESCRIPTION("Packet Generator tool");
MODULE_LICENSE("GPL");
+MODULE_VERSION(VERSION);
module_param(pg_count_d, int, 0);
+MODULE_PARM_DESC(pg_count_d, "Default number of packets to inject");
module_param(pg_delay_d, int, 0);
+MODULE_PARM_DESC(pg_delay_d, "Default delay between packets (nanoseconds)");
module_param(pg_clone_skb_d, int, 0);
+MODULE_PARM_DESC(pg_clone_skb_d, "Default number of copies of the same packet");
module_param(debug, int, 0);
+MODULE_PARM_DESC(debug, "Enable debugging of pktgen module");
diff --git a/net/core/ptp_classifier.c b/net/core/ptp_classifier.c
new file mode 100644
index 000000000000..598041b0499e
--- /dev/null
+++ b/net/core/ptp_classifier.c
@@ -0,0 +1,228 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* PTP classifier
+ */
+
+/* The below program is the bpf_asm (tools/net/) representation of
+ * the opcode array in the ptp_filter structure.
+ *
+ * For convenience, this can easily be altered and reviewed with
+ * bpf_asm and bpf_dbg, e.g. `./bpf_asm -c prog` where prog is a
+ * simple file containing the below program:
+ *
+ * ldh [12] ; load ethertype
+ *
+ * ; PTP over UDP over IPv4 over Ethernet
+ * test_ipv4:
+ * jneq #0x800, test_ipv6 ; ETH_P_IP ?
+ * ldb [23] ; load proto
+ * jneq #17, drop_ipv4 ; IPPROTO_UDP ?
+ * ldh [20] ; load frag offset field
+ * jset #0x1fff, drop_ipv4 ; don't allow fragments
+ * ldxb 4*([14]&0xf) ; load IP header len
+ * ldh [x + 16] ; load UDP dst port
+ * jneq #319, drop_ipv4 ; is port PTP_EV_PORT ?
+ * ldh [x + 22] ; load payload
+ * and #0xf ; mask PTP_CLASS_VMASK
+ * or #0x10 ; PTP_CLASS_IPV4
+ * ret a ; return PTP class
+ * drop_ipv4: ret #0x0 ; PTP_CLASS_NONE
+ *
+ * ; PTP over UDP over IPv6 over Ethernet
+ * test_ipv6:
+ * jneq #0x86dd, test_8021q ; ETH_P_IPV6 ?
+ * ldb [20] ; load proto
+ * jneq #17, drop_ipv6 ; IPPROTO_UDP ?
+ * ldh [56] ; load UDP dst port
+ * jneq #319, drop_ipv6 ; is port PTP_EV_PORT ?
+ * ldh [62] ; load payload
+ * and #0xf ; mask PTP_CLASS_VMASK
+ * or #0x20 ; PTP_CLASS_IPV6
+ * ret a ; return PTP class
+ * drop_ipv6: ret #0x0 ; PTP_CLASS_NONE
+ *
+ * ; PTP over 802.1Q over Ethernet
+ * test_8021q:
+ * jneq #0x8100, test_ieee1588 ; ETH_P_8021Q ?
+ * ldh [16] ; load inner type
+ * jneq #0x88f7, test_8021q_ipv4 ; ETH_P_1588 ?
+ * ldb [18] ; load payload
+ * and #0x8 ; as we don't have ports here, test
+ * jneq #0x0, drop_ieee1588 ; for PTP_GEN_BIT and drop these
+ * ldh [18] ; reload payload
+ * and #0xf ; mask PTP_CLASS_VMASK
+ * or #0xc0 ; PTP_CLASS_VLAN|PTP_CLASS_L2
+ * ret a ; return PTP class
+ *
+ * ; PTP over UDP over IPv4 over 802.1Q over Ethernet
+ * test_8021q_ipv4:
+ * jneq #0x800, test_8021q_ipv6 ; ETH_P_IP ?
+ * ldb [27] ; load proto
+ * jneq #17, drop_8021q_ipv4 ; IPPROTO_UDP ?
+ * ldh [24] ; load frag offset field
+ * jset #0x1fff, drop_8021q_ipv4; don't allow fragments
+ * ldxb 4*([18]&0xf) ; load IP header len
+ * ldh [x + 20] ; load UDP dst port
+ * jneq #319, drop_8021q_ipv4 ; is port PTP_EV_PORT ?
+ * ldh [x + 26] ; load payload
+ * and #0xf ; mask PTP_CLASS_VMASK
+ * or #0x90 ; PTP_CLASS_VLAN|PTP_CLASS_IPV4
+ * ret a ; return PTP class
+ * drop_8021q_ipv4: ret #0x0 ; PTP_CLASS_NONE
+ *
+ * ; PTP over UDP over IPv6 over 802.1Q over Ethernet
+ * test_8021q_ipv6:
+ * jneq #0x86dd, drop_8021q_ipv6 ; ETH_P_IPV6 ?
+ * ldb [24] ; load proto
+ * jneq #17, drop_8021q_ipv6 ; IPPROTO_UDP ?
+ * ldh [60] ; load UDP dst port
+ * jneq #319, drop_8021q_ipv6 ; is port PTP_EV_PORT ?
+ * ldh [66] ; load payload
+ * and #0xf ; mask PTP_CLASS_VMASK
+ * or #0xa0 ; PTP_CLASS_VLAN|PTP_CLASS_IPV6
+ * ret a ; return PTP class
+ * drop_8021q_ipv6: ret #0x0 ; PTP_CLASS_NONE
+ *
+ * ; PTP over Ethernet
+ * test_ieee1588:
+ * jneq #0x88f7, drop_ieee1588 ; ETH_P_1588 ?
+ * ldb [14] ; load payload
+ * and #0x8 ; as we don't have ports here, test
+ * jneq #0x0, drop_ieee1588 ; for PTP_GEN_BIT and drop these
+ * ldh [14] ; reload payload
+ * and #0xf ; mask PTP_CLASS_VMASK
+ * or #0x40 ; PTP_CLASS_L2
+ * ret a ; return PTP class
+ * drop_ieee1588: ret #0x0 ; PTP_CLASS_NONE
+ */
+
+#include <linux/skbuff.h>
+#include <linux/filter.h>
+#include <linux/ptp_classify.h>
+
+static struct bpf_prog *ptp_insns __read_mostly;
+
+unsigned int ptp_classify_raw(const struct sk_buff *skb)
+{
+ return bpf_prog_run(ptp_insns, skb);
+}
+EXPORT_SYMBOL_GPL(ptp_classify_raw);
+
+struct ptp_header *ptp_parse_header(struct sk_buff *skb, unsigned int type)
+{
+ u8 *ptr = skb_mac_header(skb);
+
+ if (type & PTP_CLASS_VLAN)
+ ptr += VLAN_HLEN;
+
+ switch (type & PTP_CLASS_PMASK) {
+ case PTP_CLASS_IPV4:
+ ptr += IPV4_HLEN(ptr) + UDP_HLEN;
+ break;
+ case PTP_CLASS_IPV6:
+ ptr += IP6_HLEN + UDP_HLEN;
+ break;
+ case PTP_CLASS_L2:
+ break;
+ default:
+ return NULL;
+ }
+
+ ptr += ETH_HLEN;
+
+ /* Ensure that the entire header is present in this packet. */
+ if (ptr + sizeof(struct ptp_header) > skb->data + skb->len)
+ return NULL;
+
+ return (struct ptp_header *)ptr;
+}
+EXPORT_SYMBOL_GPL(ptp_parse_header);
+
+bool ptp_msg_is_sync(struct sk_buff *skb, unsigned int type)
+{
+ struct ptp_header *hdr;
+
+ hdr = ptp_parse_header(skb, type);
+ if (!hdr)
+ return false;
+
+ return ptp_get_msgtype(hdr, type) == PTP_MSGTYPE_SYNC;
+}
+EXPORT_SYMBOL_GPL(ptp_msg_is_sync);
+
+void __init ptp_classifier_init(void)
+{
+ static struct sock_filter ptp_filter[] __initdata = {
+ { 0x28, 0, 0, 0x0000000c },
+ { 0x15, 0, 12, 0x00000800 },
+ { 0x30, 0, 0, 0x00000017 },
+ { 0x15, 0, 9, 0x00000011 },
+ { 0x28, 0, 0, 0x00000014 },
+ { 0x45, 7, 0, 0x00001fff },
+ { 0xb1, 0, 0, 0x0000000e },
+ { 0x48, 0, 0, 0x00000010 },
+ { 0x15, 0, 4, 0x0000013f },
+ { 0x48, 0, 0, 0x00000016 },
+ { 0x54, 0, 0, 0x0000000f },
+ { 0x44, 0, 0, 0x00000010 },
+ { 0x16, 0, 0, 0x00000000 },
+ { 0x06, 0, 0, 0x00000000 },
+ { 0x15, 0, 9, 0x000086dd },
+ { 0x30, 0, 0, 0x00000014 },
+ { 0x15, 0, 6, 0x00000011 },
+ { 0x28, 0, 0, 0x00000038 },
+ { 0x15, 0, 4, 0x0000013f },
+ { 0x28, 0, 0, 0x0000003e },
+ { 0x54, 0, 0, 0x0000000f },
+ { 0x44, 0, 0, 0x00000020 },
+ { 0x16, 0, 0, 0x00000000 },
+ { 0x06, 0, 0, 0x00000000 },
+ { 0x15, 0, 32, 0x00008100 },
+ { 0x28, 0, 0, 0x00000010 },
+ { 0x15, 0, 7, 0x000088f7 },
+ { 0x30, 0, 0, 0x00000012 },
+ { 0x54, 0, 0, 0x00000008 },
+ { 0x15, 0, 35, 0x00000000 },
+ { 0x28, 0, 0, 0x00000012 },
+ { 0x54, 0, 0, 0x0000000f },
+ { 0x44, 0, 0, 0x000000c0 },
+ { 0x16, 0, 0, 0x00000000 },
+ { 0x15, 0, 12, 0x00000800 },
+ { 0x30, 0, 0, 0x0000001b },
+ { 0x15, 0, 9, 0x00000011 },
+ { 0x28, 0, 0, 0x00000018 },
+ { 0x45, 7, 0, 0x00001fff },
+ { 0xb1, 0, 0, 0x00000012 },
+ { 0x48, 0, 0, 0x00000014 },
+ { 0x15, 0, 4, 0x0000013f },
+ { 0x48, 0, 0, 0x0000001a },
+ { 0x54, 0, 0, 0x0000000f },
+ { 0x44, 0, 0, 0x00000090 },
+ { 0x16, 0, 0, 0x00000000 },
+ { 0x06, 0, 0, 0x00000000 },
+ { 0x15, 0, 8, 0x000086dd },
+ { 0x30, 0, 0, 0x00000018 },
+ { 0x15, 0, 6, 0x00000011 },
+ { 0x28, 0, 0, 0x0000003c },
+ { 0x15, 0, 4, 0x0000013f },
+ { 0x28, 0, 0, 0x00000042 },
+ { 0x54, 0, 0, 0x0000000f },
+ { 0x44, 0, 0, 0x000000a0 },
+ { 0x16, 0, 0, 0x00000000 },
+ { 0x06, 0, 0, 0x00000000 },
+ { 0x15, 0, 7, 0x000088f7 },
+ { 0x30, 0, 0, 0x0000000e },
+ { 0x54, 0, 0, 0x00000008 },
+ { 0x15, 0, 4, 0x00000000 },
+ { 0x28, 0, 0, 0x0000000e },
+ { 0x54, 0, 0, 0x0000000f },
+ { 0x44, 0, 0, 0x00000040 },
+ { 0x16, 0, 0, 0x00000000 },
+ { 0x06, 0, 0, 0x00000000 },
+ };
+ struct sock_fprog_kern ptp_prog;
+
+ ptp_prog.len = ARRAY_SIZE(ptp_filter);
+ ptp_prog.filter = ptp_filter;
+
+ BUG_ON(bpf_prog_create(&ptp_insns, &ptp_prog));
+}
diff --git a/net/core/request_sock.c b/net/core/request_sock.c
index 5f0818d815e6..897a8f01a67b 100644
--- a/net/core/request_sock.c
+++ b/net/core/request_sock.c
@@ -1,20 +1,17 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* NET Generic infrastructure for Network protocols.
*
* Authors: Arnaldo Carvalho de Melo <acme@conectiva.com.br>
*
* From code originally in include/net/tcp.h
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/module.h>
#include <linux/random.h>
#include <linux/slab.h>
#include <linux/string.h>
+#include <linux/tcp.h>
#include <linux/vmalloc.h>
#include <net/request_sock.h>
@@ -26,77 +23,105 @@
* but then some measure against one socket starving all other sockets
* would be needed.
*
- * It was 128 by default. Experiments with real servers show, that
+ * The minimum value of it is 128. Experiments with real servers show that
* it is absolutely not enough even at 100conn/sec. 256 cures most
- * of problems. This value is adjusted to 128 for very small machines
- * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb).
+ * of problems.
+ * This value is adjusted to 128 for low memory machines,
+ * and it will increase in proportion to the memory of machine.
* Note : Dont forget somaxconn that may limit backlog too.
*/
-int sysctl_max_syn_backlog = 256;
-int reqsk_queue_alloc(struct request_sock_queue *queue,
- unsigned int nr_table_entries)
+void reqsk_queue_alloc(struct request_sock_queue *queue)
{
- size_t lopt_size = sizeof(struct listen_sock);
- struct listen_sock *lopt;
-
- nr_table_entries = min_t(u32, nr_table_entries, sysctl_max_syn_backlog);
- nr_table_entries = max_t(u32, nr_table_entries, 8);
- nr_table_entries = roundup_pow_of_two(nr_table_entries + 1);
- lopt_size += nr_table_entries * sizeof(struct request_sock *);
- if (lopt_size > PAGE_SIZE)
- lopt = __vmalloc(lopt_size,
- GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
- PAGE_KERNEL);
- else
- lopt = kzalloc(lopt_size, GFP_KERNEL);
- if (lopt == NULL)
- return -ENOMEM;
+ queue->fastopenq.rskq_rst_head = NULL;
+ queue->fastopenq.rskq_rst_tail = NULL;
+ queue->fastopenq.qlen = 0;
- for (lopt->max_qlen_log = 3;
- (1 << lopt->max_qlen_log) < nr_table_entries;
- lopt->max_qlen_log++);
-
- get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd));
- rwlock_init(&queue->syn_wait_lock);
queue->rskq_accept_head = NULL;
- lopt->nr_table_entries = nr_table_entries;
-
- write_lock_bh(&queue->syn_wait_lock);
- queue->listen_opt = lopt;
- write_unlock_bh(&queue->syn_wait_lock);
-
- return 0;
}
-EXPORT_SYMBOL(reqsk_queue_alloc);
-
-void reqsk_queue_destroy(struct request_sock_queue *queue)
+/*
+ * This function is called to set a Fast Open socket's "fastopen_rsk" field
+ * to NULL when a TFO socket no longer needs to access the request_sock.
+ * This happens only after 3WHS has been either completed or aborted (e.g.,
+ * RST is received).
+ *
+ * Before TFO, a child socket is created only after 3WHS is completed,
+ * hence it never needs to access the request_sock. things get a lot more
+ * complex with TFO. A child socket, accepted or not, has to access its
+ * request_sock for 3WHS processing, e.g., to retransmit SYN-ACK pkts,
+ * until 3WHS is either completed or aborted. Afterwards the req will stay
+ * until either the child socket is accepted, or in the rare case when the
+ * listener is closed before the child is accepted.
+ *
+ * In short, a request socket is only freed after BOTH 3WHS has completed
+ * (or aborted) and the child socket has been accepted (or listener closed).
+ * When a child socket is accepted, its corresponding req->sk is set to
+ * NULL since it's no longer needed. More importantly, "req->sk == NULL"
+ * will be used by the code below to determine if a child socket has been
+ * accepted or not, and the check is protected by the fastopenq->lock
+ * described below.
+ *
+ * Note that fastopen_rsk is only accessed from the child socket's context
+ * with its socket lock held. But a request_sock (req) can be accessed by
+ * both its child socket through fastopen_rsk, and a listener socket through
+ * icsk_accept_queue.rskq_accept_head. To protect the access a simple spin
+ * lock per listener "icsk->icsk_accept_queue.fastopenq->lock" is created.
+ * only in the rare case when both the listener and the child locks are held,
+ * e.g., in inet_csk_listen_stop() do we not need to acquire the lock.
+ * The lock also protects other fields such as fastopenq->qlen, which is
+ * decremented by this function when fastopen_rsk is no longer needed.
+ *
+ * Note that another solution was to simply use the existing socket lock
+ * from the listener. But first socket lock is difficult to use. It is not
+ * a simple spin lock - one must consider sock_owned_by_user() and arrange
+ * to use sk_add_backlog() stuff. But what really makes it infeasible is the
+ * locking hierarchy violation. E.g., inet_csk_listen_stop() may try to
+ * acquire a child's lock while holding listener's socket lock.
+ *
+ * This function also sets "treq->tfo_listener" to false.
+ * treq->tfo_listener is used by the listener so it is protected by the
+ * fastopenq->lock in this function.
+ */
+void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req,
+ bool reset)
{
- /* make all the listen_opt local to us */
- struct listen_sock *lopt = reqsk_queue_yank_listen_sk(queue);
- size_t lopt_size = sizeof(struct listen_sock) +
- lopt->nr_table_entries * sizeof(struct request_sock *);
-
- if (lopt->qlen != 0) {
- unsigned int i;
-
- for (i = 0; i < lopt->nr_table_entries; i++) {
- struct request_sock *req;
-
- while ((req = lopt->syn_table[i]) != NULL) {
- lopt->syn_table[i] = req->dl_next;
- lopt->qlen--;
- reqsk_free(req);
- }
- }
+ struct sock *lsk = req->rsk_listener;
+ struct fastopen_queue *fastopenq;
+
+ fastopenq = &inet_csk(lsk)->icsk_accept_queue.fastopenq;
+
+ RCU_INIT_POINTER(tcp_sk(sk)->fastopen_rsk, NULL);
+ spin_lock_bh(&fastopenq->lock);
+ fastopenq->qlen--;
+ tcp_rsk(req)->tfo_listener = false;
+ if (req->sk) /* the child socket hasn't been accepted yet */
+ goto out;
+
+ if (!reset || lsk->sk_state != TCP_LISTEN) {
+ /* If the listener has been closed don't bother with the
+ * special RST handling below.
+ */
+ spin_unlock_bh(&fastopenq->lock);
+ reqsk_put(req);
+ return;
}
-
- BUG_TRAP(lopt->qlen == 0);
- if (lopt_size > PAGE_SIZE)
- vfree(lopt);
+ /* Wait for 60secs before removing a req that has triggered RST.
+ * This is a simple defense against TFO spoofing attack - by
+ * counting the req against fastopen.max_qlen, and disabling
+ * TFO when the qlen exceeds max_qlen.
+ *
+ * For more details see CoNext'11 "TCP Fast Open" paper.
+ */
+ req->rsk_timer.expires = jiffies + 60*HZ;
+ if (fastopenq->rskq_rst_head == NULL)
+ fastopenq->rskq_rst_head = req;
else
- kfree(lopt);
-}
+ fastopenq->rskq_rst_tail->dl_next = req;
-EXPORT_SYMBOL(reqsk_queue_destroy);
+ req->dl_next = NULL;
+ fastopenq->rskq_rst_tail = req;
+ fastopenq->qlen++;
+out:
+ spin_unlock_bh(&fastopenq->lock);
+}
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index e76539a5eb5e..b1ed55141d8a 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -7,21 +8,16 @@
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Fixes:
- * Vitaly E. Lavrov RTA_OK arithmetics was wrong.
+ * Vitaly E. Lavrov RTA_OK arithmetic was wrong.
*/
+#include <linux/bitops.h>
#include <linux/errno.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/kernel.h>
-#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
@@ -36,10 +32,13 @@
#include <linux/security.h>
#include <linux/mutex.h>
#include <linux/if_addr.h>
+#include <linux/if_bridge.h>
+#include <linux/if_vlan.h>
+#include <linux/pci.h>
+#include <linux/etherdevice.h>
+#include <linux/bpf.h>
-#include <asm/uaccess.h>
-#include <asm/system.h>
-#include <asm/string.h>
+#include <linux/uaccess.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
@@ -48,156 +47,962 @@
#include <net/arp.h>
#include <net/route.h>
#include <net/udp.h>
+#include <net/tcp.h>
#include <net/sock.h>
#include <net/pkt_sched.h>
#include <net/fib_rules.h>
-#include <net/netlink.h>
-#ifdef CONFIG_NET_WIRELESS_RTNETLINK
-#include <linux/wireless.h>
-#include <net/iw_handler.h>
-#endif /* CONFIG_NET_WIRELESS_RTNETLINK */
+#include <net/rtnetlink.h>
+#include <net/net_namespace.h>
+#include <net/netdev_lock.h>
+#include <net/devlink.h>
+#if IS_ENABLED(CONFIG_IPV6)
+#include <net/addrconf.h>
+#endif
+#include <linux/dpll.h>
+
+#include "dev.h"
+
+#define RTNL_MAX_TYPE 50
+#define RTNL_SLAVE_MAX_TYPE 44
+
+struct rtnl_link {
+ rtnl_doit_func doit;
+ rtnl_dumpit_func dumpit;
+ struct module *owner;
+ unsigned int flags;
+ struct rcu_head rcu;
+};
static DEFINE_MUTEX(rtnl_mutex);
-static struct sock *rtnl;
void rtnl_lock(void)
{
mutex_lock(&rtnl_mutex);
}
+EXPORT_SYMBOL(rtnl_lock);
+
+int rtnl_lock_interruptible(void)
+{
+ return mutex_lock_interruptible(&rtnl_mutex);
+}
+
+int rtnl_lock_killable(void)
+{
+ return mutex_lock_killable(&rtnl_mutex);
+}
+
+static struct sk_buff *defer_kfree_skb_list;
+void rtnl_kfree_skbs(struct sk_buff *head, struct sk_buff *tail)
+{
+ if (head && tail) {
+ tail->next = defer_kfree_skb_list;
+ defer_kfree_skb_list = head;
+ }
+}
+EXPORT_SYMBOL(rtnl_kfree_skbs);
void __rtnl_unlock(void)
{
+ struct sk_buff *head = defer_kfree_skb_list;
+
+ defer_kfree_skb_list = NULL;
+
+ /* Ensure that we didn't actually add any TODO item when __rtnl_unlock()
+ * is used. In some places, e.g. in cfg80211, we have code that will do
+ * something like
+ * rtnl_lock()
+ * wiphy_lock()
+ * ...
+ * rtnl_unlock()
+ *
+ * and because netdev_run_todo() acquires the RTNL for items on the list
+ * we could cause a situation such as this:
+ * Thread 1 Thread 2
+ * rtnl_lock()
+ * unregister_netdevice()
+ * __rtnl_unlock()
+ * rtnl_lock()
+ * wiphy_lock()
+ * rtnl_unlock()
+ * netdev_run_todo()
+ * __rtnl_unlock()
+ *
+ * // list not empty now
+ * // because of thread 2
+ * rtnl_lock()
+ * while (!list_empty(...))
+ * rtnl_lock()
+ * wiphy_lock()
+ * **** DEADLOCK ****
+ *
+ * However, usage of __rtnl_unlock() is rare, and so we can ensure that
+ * it's not used in cases where something is added to do the list.
+ */
+ WARN_ON(!list_empty(&net_todo_list));
+
mutex_unlock(&rtnl_mutex);
+
+ while (head) {
+ struct sk_buff *next = head->next;
+
+ kfree_skb(head);
+ cond_resched();
+ head = next;
+ }
}
void rtnl_unlock(void)
{
- mutex_unlock(&rtnl_mutex);
- if (rtnl && rtnl->sk_receive_queue.qlen)
- rtnl->sk_data_ready(rtnl, 0);
+ /* This fellow will unlock it for us. */
netdev_run_todo();
}
+EXPORT_SYMBOL(rtnl_unlock);
int rtnl_trylock(void)
{
return mutex_trylock(&rtnl_mutex);
}
+EXPORT_SYMBOL(rtnl_trylock);
+
+int rtnl_is_locked(void)
+{
+ return mutex_is_locked(&rtnl_mutex);
+}
+EXPORT_SYMBOL(rtnl_is_locked);
+
+bool refcount_dec_and_rtnl_lock(refcount_t *r)
+{
+ return refcount_dec_and_mutex_lock(r, &rtnl_mutex);
+}
+EXPORT_SYMBOL(refcount_dec_and_rtnl_lock);
+
+#ifdef CONFIG_PROVE_LOCKING
+bool lockdep_rtnl_is_held(void)
+{
+ return lockdep_is_held(&rtnl_mutex);
+}
+EXPORT_SYMBOL(lockdep_rtnl_is_held);
+#endif /* #ifdef CONFIG_PROVE_LOCKING */
+
+#ifdef CONFIG_DEBUG_NET_SMALL_RTNL
+void __rtnl_net_lock(struct net *net)
+{
+ ASSERT_RTNL();
+
+ mutex_lock(&net->rtnl_mutex);
+}
+EXPORT_SYMBOL(__rtnl_net_lock);
+
+void __rtnl_net_unlock(struct net *net)
+{
+ ASSERT_RTNL();
+
+ mutex_unlock(&net->rtnl_mutex);
+}
+EXPORT_SYMBOL(__rtnl_net_unlock);
+
+void rtnl_net_lock(struct net *net)
+{
+ rtnl_lock();
+ __rtnl_net_lock(net);
+}
+EXPORT_SYMBOL(rtnl_net_lock);
+
+void rtnl_net_unlock(struct net *net)
+{
+ __rtnl_net_unlock(net);
+ rtnl_unlock();
+}
+EXPORT_SYMBOL(rtnl_net_unlock);
+
+int rtnl_net_trylock(struct net *net)
+{
+ int ret = rtnl_trylock();
+
+ if (ret)
+ __rtnl_net_lock(net);
+
+ return ret;
+}
+EXPORT_SYMBOL(rtnl_net_trylock);
+
+int rtnl_net_lock_killable(struct net *net)
+{
+ int ret = rtnl_lock_killable();
+
+ if (!ret)
+ __rtnl_net_lock(net);
+
+ return ret;
+}
+
+static int rtnl_net_cmp_locks(const struct net *net_a, const struct net *net_b)
+{
+ if (net_eq(net_a, net_b))
+ return 0;
+
+ /* always init_net first */
+ if (net_eq(net_a, &init_net))
+ return -1;
+
+ if (net_eq(net_b, &init_net))
+ return 1;
+
+ /* otherwise lock in ascending order */
+ return net_a < net_b ? -1 : 1;
+}
+
+int rtnl_net_lock_cmp_fn(const struct lockdep_map *a, const struct lockdep_map *b)
+{
+ const struct net *net_a, *net_b;
+
+ net_a = container_of(a, struct net, rtnl_mutex.dep_map);
+ net_b = container_of(b, struct net, rtnl_mutex.dep_map);
+
+ return rtnl_net_cmp_locks(net_a, net_b);
+}
+
+bool rtnl_net_is_locked(struct net *net)
+{
+ return rtnl_is_locked() && mutex_is_locked(&net->rtnl_mutex);
+}
+EXPORT_SYMBOL(rtnl_net_is_locked);
+
+bool lockdep_rtnl_net_is_held(struct net *net)
+{
+ return lockdep_rtnl_is_held() && lockdep_is_held(&net->rtnl_mutex);
+}
+EXPORT_SYMBOL(lockdep_rtnl_net_is_held);
+#else
+static int rtnl_net_cmp_locks(const struct net *net_a, const struct net *net_b)
+{
+ /* No need to swap */
+ return -1;
+}
+#endif
+
+struct rtnl_nets {
+ /* ->newlink() needs to freeze 3 netns at most;
+ * 2 for the new device, 1 for its peer.
+ */
+ struct net *net[3];
+ unsigned char len;
+};
+
+static void rtnl_nets_init(struct rtnl_nets *rtnl_nets)
+{
+ memset(rtnl_nets, 0, sizeof(*rtnl_nets));
+}
+
+static void rtnl_nets_destroy(struct rtnl_nets *rtnl_nets)
+{
+ int i;
+
+ for (i = 0; i < rtnl_nets->len; i++) {
+ put_net(rtnl_nets->net[i]);
+ rtnl_nets->net[i] = NULL;
+ }
+
+ rtnl_nets->len = 0;
+}
+
+/**
+ * rtnl_nets_add - Add netns to be locked before ->newlink().
+ *
+ * @rtnl_nets: rtnl_nets pointer passed to ->get_peer_net().
+ * @net: netns pointer with an extra refcnt held.
+ *
+ * The extra refcnt is released in rtnl_nets_destroy().
+ */
+static void rtnl_nets_add(struct rtnl_nets *rtnl_nets, struct net *net)
+{
+ int i;
+
+ DEBUG_NET_WARN_ON_ONCE(rtnl_nets->len == ARRAY_SIZE(rtnl_nets->net));
+
+ for (i = 0; i < rtnl_nets->len; i++) {
+ switch (rtnl_net_cmp_locks(rtnl_nets->net[i], net)) {
+ case 0:
+ put_net(net);
+ return;
+ case 1:
+ swap(rtnl_nets->net[i], net);
+ }
+ }
+
+ rtnl_nets->net[i] = net;
+ rtnl_nets->len++;
+}
+
+static void rtnl_nets_lock(struct rtnl_nets *rtnl_nets)
+{
+ int i;
+
+ rtnl_lock();
+
+ for (i = 0; i < rtnl_nets->len; i++)
+ __rtnl_net_lock(rtnl_nets->net[i]);
+}
+
+static void rtnl_nets_unlock(struct rtnl_nets *rtnl_nets)
+{
+ int i;
+
+ for (i = 0; i < rtnl_nets->len; i++)
+ __rtnl_net_unlock(rtnl_nets->net[i]);
+
+ rtnl_unlock();
+}
+
+static struct rtnl_link __rcu *__rcu *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1];
-int rtattr_parse(struct rtattr *tb[], int maxattr, struct rtattr *rta, int len)
+static inline int rtm_msgindex(int msgtype)
{
- memset(tb, 0, sizeof(struct rtattr*)*maxattr);
+ int msgindex = msgtype - RTM_BASE;
+
+ /*
+ * msgindex < 0 implies someone tried to register a netlink
+ * control code. msgindex >= RTM_NR_MSGTYPES may indicate that
+ * the message type has not been added to linux/rtnetlink.h
+ */
+ BUG_ON(msgindex < 0 || msgindex >= RTM_NR_MSGTYPES);
+
+ return msgindex;
+}
+
+static struct rtnl_link *rtnl_get_link(int protocol, int msgtype)
+{
+ struct rtnl_link __rcu **tab;
+
+ if (protocol >= ARRAY_SIZE(rtnl_msg_handlers))
+ protocol = PF_UNSPEC;
+
+ tab = rcu_dereference_rtnl(rtnl_msg_handlers[protocol]);
+ if (!tab)
+ tab = rcu_dereference_rtnl(rtnl_msg_handlers[PF_UNSPEC]);
- while (RTA_OK(rta, len)) {
- unsigned flavor = rta->rta_type;
- if (flavor && flavor <= maxattr)
- tb[flavor-1] = rta;
- rta = RTA_NEXT(rta, len);
+ return rcu_dereference_rtnl(tab[msgtype]);
+}
+
+static int rtnl_register_internal(struct module *owner,
+ int protocol, int msgtype,
+ rtnl_doit_func doit, rtnl_dumpit_func dumpit,
+ unsigned int flags)
+{
+ struct rtnl_link *link, *old;
+ struct rtnl_link __rcu **tab;
+ int msgindex;
+ int ret = -ENOBUFS;
+
+ BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX);
+ msgindex = rtm_msgindex(msgtype);
+
+ rtnl_lock();
+ tab = rtnl_dereference(rtnl_msg_handlers[protocol]);
+ if (tab == NULL) {
+ tab = kcalloc(RTM_NR_MSGTYPES, sizeof(void *), GFP_KERNEL);
+ if (!tab)
+ goto unlock;
+
+ /* ensures we see the 0 stores */
+ rcu_assign_pointer(rtnl_msg_handlers[protocol], tab);
+ }
+
+ old = rtnl_dereference(tab[msgindex]);
+ if (old) {
+ link = kmemdup(old, sizeof(*old), GFP_KERNEL);
+ if (!link)
+ goto unlock;
+ } else {
+ link = kzalloc(sizeof(*link), GFP_KERNEL);
+ if (!link)
+ goto unlock;
+ }
+
+ WARN_ON(link->owner && link->owner != owner);
+ link->owner = owner;
+
+ WARN_ON(doit && link->doit && link->doit != doit);
+ if (doit)
+ link->doit = doit;
+ WARN_ON(dumpit && link->dumpit && link->dumpit != dumpit);
+ if (dumpit)
+ link->dumpit = dumpit;
+
+ WARN_ON(rtnl_msgtype_kind(msgtype) != RTNL_KIND_DEL &&
+ (flags & RTNL_FLAG_BULK_DEL_SUPPORTED));
+ link->flags |= flags;
+
+ /* publish protocol:msgtype */
+ rcu_assign_pointer(tab[msgindex], link);
+ ret = 0;
+ if (old)
+ kfree_rcu(old, rcu);
+unlock:
+ rtnl_unlock();
+ return ret;
+}
+
+/**
+ * rtnl_unregister - Unregister a rtnetlink message type
+ * @protocol: Protocol family or PF_UNSPEC
+ * @msgtype: rtnetlink message type
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static int rtnl_unregister(int protocol, int msgtype)
+{
+ struct rtnl_link __rcu **tab;
+ struct rtnl_link *link;
+ int msgindex;
+
+ BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX);
+ msgindex = rtm_msgindex(msgtype);
+
+ rtnl_lock();
+ tab = rtnl_dereference(rtnl_msg_handlers[protocol]);
+ if (!tab) {
+ rtnl_unlock();
+ return -ENOENT;
}
+
+ link = rcu_replace_pointer_rtnl(tab[msgindex], NULL);
+ rtnl_unlock();
+
+ kfree_rcu(link, rcu);
+
return 0;
}
-struct rtnetlink_link * rtnetlink_links[NPROTO];
+/**
+ * rtnl_unregister_all - Unregister all rtnetlink message type of a protocol
+ * @protocol : Protocol family or PF_UNSPEC
+ *
+ * Identical to calling rtnl_unregister() for all registered message types
+ * of a certain protocol family.
+ */
+void rtnl_unregister_all(int protocol)
+{
+ struct rtnl_link __rcu **tab;
+ struct rtnl_link *link;
+ int msgindex;
+
+ BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX);
+
+ rtnl_lock();
+ tab = rcu_replace_pointer_rtnl(rtnl_msg_handlers[protocol], NULL);
+ if (!tab) {
+ rtnl_unlock();
+ return;
+ }
+ for (msgindex = 0; msgindex < RTM_NR_MSGTYPES; msgindex++) {
+ link = rcu_replace_pointer_rtnl(tab[msgindex], NULL);
+ kfree_rcu(link, rcu);
+ }
+ rtnl_unlock();
+
+ synchronize_net();
+
+ kfree(tab);
+}
+EXPORT_SYMBOL_GPL(rtnl_unregister_all);
+
+/**
+ * __rtnl_register_many - Register rtnetlink message types
+ * @handlers: Array of struct rtnl_msg_handlers
+ * @n: The length of @handlers
+ *
+ * Registers the specified function pointers (at least one of them has
+ * to be non-NULL) to be called whenever a request message for the
+ * specified protocol family and message type is received.
+ *
+ * The special protocol family PF_UNSPEC may be used to define fallback
+ * function pointers for the case when no entry for the specific protocol
+ * family exists.
+ *
+ * When one element of @handlers fails to register,
+ * 1) built-in: panics.
+ * 2) modules : the previous successful registrations are unwinded
+ * and an error is returned.
+ *
+ * Use rtnl_register_many().
+ */
+int __rtnl_register_many(const struct rtnl_msg_handler *handlers, int n)
+{
+ const struct rtnl_msg_handler *handler;
+ int i, err;
+
+ for (i = 0, handler = handlers; i < n; i++, handler++) {
+ err = rtnl_register_internal(handler->owner, handler->protocol,
+ handler->msgtype, handler->doit,
+ handler->dumpit, handler->flags);
+ if (err) {
+ if (!handler->owner)
+ panic("Unable to register rtnetlink message "
+ "handlers, %pS\n", handlers);
+
+ __rtnl_unregister_many(handlers, i);
+ break;
+ }
+ }
-static const int rtm_min[RTM_NR_FAMILIES] =
+ return err;
+}
+EXPORT_SYMBOL_GPL(__rtnl_register_many);
+
+void __rtnl_unregister_many(const struct rtnl_msg_handler *handlers, int n)
{
- [RTM_FAM(RTM_NEWLINK)] = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
- [RTM_FAM(RTM_NEWADDR)] = NLMSG_LENGTH(sizeof(struct ifaddrmsg)),
- [RTM_FAM(RTM_NEWROUTE)] = NLMSG_LENGTH(sizeof(struct rtmsg)),
- [RTM_FAM(RTM_NEWRULE)] = NLMSG_LENGTH(sizeof(struct fib_rule_hdr)),
- [RTM_FAM(RTM_NEWQDISC)] = NLMSG_LENGTH(sizeof(struct tcmsg)),
- [RTM_FAM(RTM_NEWTCLASS)] = NLMSG_LENGTH(sizeof(struct tcmsg)),
- [RTM_FAM(RTM_NEWTFILTER)] = NLMSG_LENGTH(sizeof(struct tcmsg)),
- [RTM_FAM(RTM_NEWACTION)] = NLMSG_LENGTH(sizeof(struct tcamsg)),
- [RTM_FAM(RTM_GETMULTICAST)] = NLMSG_LENGTH(sizeof(struct rtgenmsg)),
- [RTM_FAM(RTM_GETANYCAST)] = NLMSG_LENGTH(sizeof(struct rtgenmsg)),
-};
+ const struct rtnl_msg_handler *handler;
+ int i;
-static const int rta_max[RTM_NR_FAMILIES] =
+ for (i = n - 1, handler = handlers + n - 1; i >= 0; i--, handler--)
+ rtnl_unregister(handler->protocol, handler->msgtype);
+}
+EXPORT_SYMBOL_GPL(__rtnl_unregister_many);
+
+static DEFINE_MUTEX(link_ops_mutex);
+static LIST_HEAD(link_ops);
+
+static struct rtnl_link_ops *rtnl_link_ops_get(const char *kind, int *srcu_index)
{
- [RTM_FAM(RTM_NEWLINK)] = IFLA_MAX,
- [RTM_FAM(RTM_NEWADDR)] = IFA_MAX,
- [RTM_FAM(RTM_NEWROUTE)] = RTA_MAX,
- [RTM_FAM(RTM_NEWRULE)] = FRA_MAX,
- [RTM_FAM(RTM_NEWQDISC)] = TCA_MAX,
- [RTM_FAM(RTM_NEWTCLASS)] = TCA_MAX,
- [RTM_FAM(RTM_NEWTFILTER)] = TCA_MAX,
- [RTM_FAM(RTM_NEWACTION)] = TCAA_MAX,
-};
+ struct rtnl_link_ops *ops;
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(ops, &link_ops, list) {
+ if (!strcmp(ops->kind, kind)) {
+ *srcu_index = srcu_read_lock(&ops->srcu);
+ goto unlock;
+ }
+ }
+
+ ops = NULL;
+unlock:
+ rcu_read_unlock();
+
+ return ops;
+}
+
+static void rtnl_link_ops_put(struct rtnl_link_ops *ops, int srcu_index)
+{
+ srcu_read_unlock(&ops->srcu, srcu_index);
+}
+
+/**
+ * rtnl_link_register - Register rtnl_link_ops with rtnetlink.
+ * @ops: struct rtnl_link_ops * to register
+ *
+ * Returns 0 on success or a negative error code.
+ */
+int rtnl_link_register(struct rtnl_link_ops *ops)
+{
+ struct rtnl_link_ops *tmp;
+ int err;
+
+ /* Sanity-check max sizes to avoid stack buffer overflow. */
+ if (WARN_ON(ops->maxtype > RTNL_MAX_TYPE ||
+ ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE))
+ return -EINVAL;
+
+ /* The check for alloc/setup is here because if ops
+ * does not have that filled up, it is not possible
+ * to use the ops for creating device. So do not
+ * fill up dellink as well. That disables rtnl_dellink.
+ */
+ if ((ops->alloc || ops->setup) && !ops->dellink)
+ ops->dellink = unregister_netdevice_queue;
+
+ err = init_srcu_struct(&ops->srcu);
+ if (err)
+ return err;
+
+ mutex_lock(&link_ops_mutex);
+
+ list_for_each_entry(tmp, &link_ops, list) {
+ if (!strcmp(ops->kind, tmp->kind)) {
+ err = -EEXIST;
+ goto unlock;
+ }
+ }
+
+ list_add_tail_rcu(&ops->list, &link_ops);
+unlock:
+ mutex_unlock(&link_ops_mutex);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(rtnl_link_register);
+
+static void __rtnl_kill_links(struct net *net, struct rtnl_link_ops *ops)
+{
+ struct net_device *dev;
+ LIST_HEAD(list_kill);
+
+ for_each_netdev(net, dev) {
+ if (dev->rtnl_link_ops == ops)
+ ops->dellink(dev, &list_kill);
+ }
+ unregister_netdevice_many(&list_kill);
+}
+
+/* Return with the rtnl_lock held when there are no network
+ * devices unregistering in any network namespace.
+ */
+static void rtnl_lock_unregistering_all(void)
+{
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+
+ add_wait_queue(&netdev_unregistering_wq, &wait);
+ for (;;) {
+ rtnl_lock();
+ /* We held write locked pernet_ops_rwsem, and parallel
+ * setup_net() and cleanup_net() are not possible.
+ */
+ if (!atomic_read(&dev_unreg_count))
+ break;
+ __rtnl_unlock();
+
+ wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
+ }
+ remove_wait_queue(&netdev_unregistering_wq, &wait);
+}
+
+/**
+ * rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink.
+ * @ops: struct rtnl_link_ops * to unregister
+ */
+void rtnl_link_unregister(struct rtnl_link_ops *ops)
+{
+ struct net *net;
+
+ mutex_lock(&link_ops_mutex);
+ list_del_rcu(&ops->list);
+ mutex_unlock(&link_ops_mutex);
+
+ synchronize_srcu(&ops->srcu);
+ cleanup_srcu_struct(&ops->srcu);
-void __rta_fill(struct sk_buff *skb, int attrtype, int attrlen, const void *data)
+ /* Close the race with setup_net() and cleanup_net() */
+ down_write(&pernet_ops_rwsem);
+ rtnl_lock_unregistering_all();
+
+ for_each_net(net)
+ __rtnl_kill_links(net, ops);
+
+ rtnl_unlock();
+ up_write(&pernet_ops_rwsem);
+}
+EXPORT_SYMBOL_GPL(rtnl_link_unregister);
+
+static size_t rtnl_link_get_slave_info_data_size(const struct net_device *dev)
+{
+ struct net_device *master_dev;
+ const struct rtnl_link_ops *ops;
+ size_t size = 0;
+
+ rcu_read_lock();
+
+ master_dev = netdev_master_upper_dev_get_rcu((struct net_device *)dev);
+ if (!master_dev)
+ goto out;
+
+ ops = master_dev->rtnl_link_ops;
+ if (!ops || !ops->get_slave_size)
+ goto out;
+ /* IFLA_INFO_SLAVE_DATA + nested data */
+ size = nla_total_size(sizeof(struct nlattr)) +
+ ops->get_slave_size(master_dev, dev);
+
+out:
+ rcu_read_unlock();
+ return size;
+}
+
+static size_t rtnl_link_get_size(const struct net_device *dev)
{
- struct rtattr *rta;
- int size = RTA_LENGTH(attrlen);
+ const struct rtnl_link_ops *ops = dev->rtnl_link_ops;
+ size_t size;
+
+ if (!ops)
+ return 0;
+
+ size = nla_total_size(sizeof(struct nlattr)) + /* IFLA_LINKINFO */
+ nla_total_size(strlen(ops->kind) + 1); /* IFLA_INFO_KIND */
+
+ if (ops->get_size)
+ /* IFLA_INFO_DATA + nested data */
+ size += nla_total_size(sizeof(struct nlattr)) +
+ ops->get_size(dev);
+
+ if (ops->get_xstats_size)
+ /* IFLA_INFO_XSTATS */
+ size += nla_total_size(ops->get_xstats_size(dev));
+
+ size += rtnl_link_get_slave_info_data_size(dev);
+
+ return size;
+}
+
+static LIST_HEAD(rtnl_af_ops);
+
+static struct rtnl_af_ops *rtnl_af_lookup(const int family, int *srcu_index)
+{
+ struct rtnl_af_ops *ops;
+
+ ASSERT_RTNL();
- rta = (struct rtattr*)skb_put(skb, RTA_ALIGN(size));
- rta->rta_type = attrtype;
- rta->rta_len = size;
- memcpy(RTA_DATA(rta), data, attrlen);
- memset(RTA_DATA(rta) + attrlen, 0, RTA_ALIGN(size) - size);
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(ops, &rtnl_af_ops, list) {
+ if (ops->family == family) {
+ *srcu_index = srcu_read_lock(&ops->srcu);
+ goto unlock;
+ }
+ }
+
+ ops = NULL;
+unlock:
+ rcu_read_unlock();
+
+ return ops;
+}
+
+static void rtnl_af_put(struct rtnl_af_ops *ops, int srcu_index)
+{
+ srcu_read_unlock(&ops->srcu, srcu_index);
+}
+
+/**
+ * rtnl_af_register - Register rtnl_af_ops with rtnetlink.
+ * @ops: struct rtnl_af_ops * to register
+ *
+ * Return: 0 on success or a negative error code.
+ */
+int rtnl_af_register(struct rtnl_af_ops *ops)
+{
+ int err = init_srcu_struct(&ops->srcu);
+
+ if (err)
+ return err;
+
+ rtnl_lock();
+ list_add_tail_rcu(&ops->list, &rtnl_af_ops);
+ rtnl_unlock();
+
+ return 0;
}
+EXPORT_SYMBOL_GPL(rtnl_af_register);
-size_t rtattr_strlcpy(char *dest, const struct rtattr *rta, size_t size)
+/**
+ * rtnl_af_unregister - Unregister rtnl_af_ops from rtnetlink.
+ * @ops: struct rtnl_af_ops * to unregister
+ */
+void rtnl_af_unregister(struct rtnl_af_ops *ops)
{
- size_t ret = RTA_PAYLOAD(rta);
- char *src = RTA_DATA(rta);
+ rtnl_lock();
+ list_del_rcu(&ops->list);
+ rtnl_unlock();
+
+ synchronize_rcu();
+ synchronize_srcu(&ops->srcu);
+ cleanup_srcu_struct(&ops->srcu);
+}
+EXPORT_SYMBOL_GPL(rtnl_af_unregister);
- if (ret > 0 && src[ret - 1] == '\0')
- ret--;
- if (size > 0) {
- size_t len = (ret >= size) ? size - 1 : ret;
- memset(dest, 0, size);
- memcpy(dest, src, len);
+static size_t rtnl_link_get_af_size(const struct net_device *dev,
+ u32 ext_filter_mask)
+{
+ struct rtnl_af_ops *af_ops;
+ size_t size;
+
+ /* IFLA_AF_SPEC */
+ size = nla_total_size(sizeof(struct nlattr));
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(af_ops, &rtnl_af_ops, list) {
+ if (af_ops->get_link_af_size) {
+ /* AF_* + nested data */
+ size += nla_total_size(sizeof(struct nlattr)) +
+ af_ops->get_link_af_size(dev, ext_filter_mask);
+ }
}
+ rcu_read_unlock();
+
+ return size;
+}
+
+static bool rtnl_have_link_slave_info(const struct net_device *dev)
+{
+ struct net_device *master_dev;
+ bool ret = false;
+
+ rcu_read_lock();
+
+ master_dev = netdev_master_upper_dev_get_rcu((struct net_device *)dev);
+ if (master_dev && master_dev->rtnl_link_ops)
+ ret = true;
+ rcu_read_unlock();
return ret;
}
-int rtnetlink_send(struct sk_buff *skb, u32 pid, unsigned group, int echo)
+static int rtnl_link_slave_info_fill(struct sk_buff *skb,
+ const struct net_device *dev)
{
- int err = 0;
+ struct net_device *master_dev;
+ const struct rtnl_link_ops *ops;
+ struct nlattr *slave_data;
+ int err;
+
+ master_dev = netdev_master_upper_dev_get((struct net_device *) dev);
+ if (!master_dev)
+ return 0;
+ ops = master_dev->rtnl_link_ops;
+ if (!ops)
+ return 0;
+ if (nla_put_string(skb, IFLA_INFO_SLAVE_KIND, ops->kind) < 0)
+ return -EMSGSIZE;
+ if (ops->fill_slave_info) {
+ slave_data = nla_nest_start_noflag(skb, IFLA_INFO_SLAVE_DATA);
+ if (!slave_data)
+ return -EMSGSIZE;
+ err = ops->fill_slave_info(skb, master_dev, dev);
+ if (err < 0)
+ goto err_cancel_slave_data;
+ nla_nest_end(skb, slave_data);
+ }
+ return 0;
- NETLINK_CB(skb).dst_group = group;
- if (echo)
- atomic_inc(&skb->users);
- netlink_broadcast(rtnl, skb, pid, group, GFP_KERNEL);
- if (echo)
- err = netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT);
+err_cancel_slave_data:
+ nla_nest_cancel(skb, slave_data);
return err;
}
-int rtnl_unicast(struct sk_buff *skb, u32 pid)
+static int rtnl_link_info_fill(struct sk_buff *skb,
+ const struct net_device *dev)
{
- return nlmsg_unicast(rtnl, skb, pid);
+ const struct rtnl_link_ops *ops = dev->rtnl_link_ops;
+ struct nlattr *data;
+ int err;
+
+ if (!ops)
+ return 0;
+ if (nla_put_string(skb, IFLA_INFO_KIND, ops->kind) < 0)
+ return -EMSGSIZE;
+ if (ops->fill_xstats) {
+ err = ops->fill_xstats(skb, dev);
+ if (err < 0)
+ return err;
+ }
+ if (ops->fill_info) {
+ data = nla_nest_start_noflag(skb, IFLA_INFO_DATA);
+ if (data == NULL)
+ return -EMSGSIZE;
+ err = ops->fill_info(skb, dev);
+ if (err < 0)
+ goto err_cancel_data;
+ nla_nest_end(skb, data);
+ }
+ return 0;
+
+err_cancel_data:
+ nla_nest_cancel(skb, data);
+ return err;
+}
+
+static int rtnl_link_fill(struct sk_buff *skb, const struct net_device *dev)
+{
+ struct nlattr *linkinfo;
+ int err = -EMSGSIZE;
+
+ linkinfo = nla_nest_start_noflag(skb, IFLA_LINKINFO);
+ if (linkinfo == NULL)
+ goto out;
+
+ err = rtnl_link_info_fill(skb, dev);
+ if (err < 0)
+ goto err_cancel_link;
+
+ err = rtnl_link_slave_info_fill(skb, dev);
+ if (err < 0)
+ goto err_cancel_link;
+
+ nla_nest_end(skb, linkinfo);
+ return 0;
+
+err_cancel_link:
+ nla_nest_cancel(skb, linkinfo);
+out:
+ return err;
}
-int rtnl_notify(struct sk_buff *skb, u32 pid, u32 group,
- struct nlmsghdr *nlh, gfp_t flags)
+int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, unsigned int group, int echo)
{
- int report = 0;
+ struct sock *rtnl = net->rtnl;
- if (nlh)
- report = nlmsg_report(nlh);
+ return nlmsg_notify(rtnl, skb, pid, group, echo, GFP_KERNEL);
+}
+
+int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid)
+{
+ struct sock *rtnl = net->rtnl;
- return nlmsg_notify(rtnl, skb, pid, group, report, flags);
+ return nlmsg_unicast(rtnl, skb, pid);
}
+EXPORT_SYMBOL(rtnl_unicast);
-void rtnl_set_sk_err(u32 group, int error)
+void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group,
+ const struct nlmsghdr *nlh, gfp_t flags)
{
+ struct sock *rtnl = net->rtnl;
+
+ nlmsg_notify(rtnl, skb, pid, group, nlmsg_report(nlh), flags);
+}
+EXPORT_SYMBOL(rtnl_notify);
+
+void rtnl_set_sk_err(struct net *net, u32 group, int error)
+{
+ struct sock *rtnl = net->rtnl;
+
netlink_set_err(rtnl, 0, group, error);
}
+EXPORT_SYMBOL(rtnl_set_sk_err);
int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics)
{
struct nlattr *mx;
int i, valid = 0;
- mx = nla_nest_start(skb, RTA_METRICS);
+ /* nothing is dumped for dst_default_metrics, so just skip the loop */
+ if (metrics == dst_default_metrics.metrics)
+ return 0;
+
+ mx = nla_nest_start_noflag(skb, RTA_METRICS);
if (mx == NULL)
return -ENOBUFS;
for (i = 0; i < RTAX_MAX; i++) {
if (metrics[i]) {
+ if (i == RTAX_CC_ALGO - 1) {
+ char tmp[TCP_CA_NAME_MAX], *name;
+
+ name = tcp_ca_get_name_by_key(metrics[i], tmp);
+ if (!name)
+ continue;
+ if (nla_put_string(skb, i + 1, name))
+ goto nla_put_failure;
+ } else if (i == RTAX_FEATURES - 1) {
+ u32 user_features = metrics[i] & RTAX_FEATURE_MASK;
+
+ if (!user_features)
+ continue;
+ BUILD_BUG_ON(RTAX_FEATURE_MASK & DST_FEATURE_MASK);
+ if (nla_put_u32(skb, i + 1, user_features))
+ goto nla_put_failure;
+ } else {
+ if (nla_put_u32(skb, i + 1, metrics[i]))
+ goto nla_put_failure;
+ }
valid++;
- NLA_PUT_U32(skb, i+1, metrics[i]);
}
}
@@ -209,59 +1014,98 @@ int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics)
return nla_nest_end(skb, mx);
nla_put_failure:
- return nla_nest_cancel(skb, mx);
+ nla_nest_cancel(skb, mx);
+ return -EMSGSIZE;
}
+EXPORT_SYMBOL(rtnetlink_put_metrics);
int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id,
- u32 ts, u32 tsage, long expires, u32 error)
+ long expires, u32 error)
{
struct rta_cacheinfo ci = {
- .rta_lastuse = jiffies_to_clock_t(jiffies - dst->lastuse),
- .rta_used = dst->__use,
- .rta_clntref = atomic_read(&(dst->__refcnt)),
.rta_error = error,
.rta_id = id,
- .rta_ts = ts,
- .rta_tsage = tsage,
};
+ unsigned long delta;
- if (expires)
- ci.rta_expires = jiffies_to_clock_t(expires);
+ if (dst) {
+ delta = jiffies - READ_ONCE(dst->lastuse);
+ ci.rta_lastuse = jiffies_delta_to_clock_t(delta);
+ ci.rta_used = dst->__use;
+ ci.rta_clntref = rcuref_read(&dst->__rcuref);
+ }
+ if (expires) {
+ unsigned long clock;
+ clock = jiffies_to_clock_t(abs(expires));
+ clock = min_t(unsigned long, clock, INT_MAX);
+ ci.rta_expires = (expires > 0) ? clock : -clock;
+ }
return nla_put(skb, RTA_CACHEINFO, sizeof(ci), &ci);
}
-
EXPORT_SYMBOL_GPL(rtnl_put_cacheinfo);
+void netif_set_operstate(struct net_device *dev, int newstate)
+{
+ unsigned int old = READ_ONCE(dev->operstate);
+
+ do {
+ if (old == newstate)
+ return;
+ } while (!try_cmpxchg(&dev->operstate, &old, newstate));
+
+ netif_state_change(dev);
+}
+EXPORT_SYMBOL(netif_set_operstate);
+
static void set_operstate(struct net_device *dev, unsigned char transition)
{
- unsigned char operstate = dev->operstate;
+ unsigned char operstate = READ_ONCE(dev->operstate);
- switch(transition) {
+ switch (transition) {
case IF_OPER_UP:
if ((operstate == IF_OPER_DORMANT ||
+ operstate == IF_OPER_TESTING ||
operstate == IF_OPER_UNKNOWN) &&
- !netif_dormant(dev))
+ !netif_dormant(dev) && !netif_testing(dev))
operstate = IF_OPER_UP;
break;
+ case IF_OPER_TESTING:
+ if (netif_oper_up(dev))
+ operstate = IF_OPER_TESTING;
+ break;
+
case IF_OPER_DORMANT:
- if (operstate == IF_OPER_UP ||
- operstate == IF_OPER_UNKNOWN)
+ if (netif_oper_up(dev))
operstate = IF_OPER_DORMANT;
break;
- };
-
- if (dev->operstate != operstate) {
- write_lock_bh(&dev_base_lock);
- dev->operstate = operstate;
- write_unlock_bh(&dev_base_lock);
- netdev_state_change(dev);
}
+
+ netif_set_operstate(dev, operstate);
+}
+
+static unsigned int rtnl_dev_get_flags(const struct net_device *dev)
+{
+ return (dev->flags & ~(IFF_PROMISC | IFF_ALLMULTI)) |
+ (dev->gflags & (IFF_PROMISC | IFF_ALLMULTI));
+}
+
+static unsigned int rtnl_dev_combine_flags(const struct net_device *dev,
+ const struct ifinfomsg *ifm)
+{
+ unsigned int flags = ifm->ifi_flags;
+
+ /* bugwards compatibility: ifi_change == 0 is treated as ~0 */
+ if (ifm->ifi_change)
+ flags = (flags & ifm->ifi_change) |
+ (rtnl_dev_get_flags(dev) & ~ifm->ifi_change);
+
+ return flags;
}
static void copy_rtnl_link_stats(struct rtnl_link_stats *a,
- struct net_device_stats *b)
+ const struct rtnl_link_stats64 *b)
{
a->rx_packets = b->rx_packets;
a->tx_packets = b->tx_packets;
@@ -290,15 +1134,149 @@ static void copy_rtnl_link_stats(struct rtnl_link_stats *a,
a->rx_compressed = b->rx_compressed;
a->tx_compressed = b->tx_compressed;
-};
-static inline size_t if_nlmsg_size(int iwbuflen)
+ a->rx_nohandler = b->rx_nohandler;
+}
+
+/* All VF info */
+static inline int rtnl_vfinfo_size(const struct net_device *dev,
+ u32 ext_filter_mask)
{
- return NLMSG_ALIGN(sizeof(struct ifinfomsg))
+ if (dev->dev.parent && (ext_filter_mask & RTEXT_FILTER_VF)) {
+ int num_vfs = dev_num_vf(dev->dev.parent);
+ size_t size = nla_total_size(0);
+ size += num_vfs *
+ (nla_total_size(0) +
+ nla_total_size(sizeof(struct ifla_vf_mac)) +
+ nla_total_size(sizeof(struct ifla_vf_broadcast)) +
+ nla_total_size(sizeof(struct ifla_vf_vlan)) +
+ nla_total_size(0) + /* nest IFLA_VF_VLAN_LIST */
+ nla_total_size(MAX_VLAN_LIST_LEN *
+ sizeof(struct ifla_vf_vlan_info)) +
+ nla_total_size(sizeof(struct ifla_vf_spoofchk)) +
+ nla_total_size(sizeof(struct ifla_vf_tx_rate)) +
+ nla_total_size(sizeof(struct ifla_vf_rate)) +
+ nla_total_size(sizeof(struct ifla_vf_link_state)) +
+ nla_total_size(sizeof(struct ifla_vf_rss_query_en)) +
+ nla_total_size(sizeof(struct ifla_vf_trust)));
+ if (~ext_filter_mask & RTEXT_FILTER_SKIP_STATS) {
+ size += num_vfs *
+ (nla_total_size(0) + /* nest IFLA_VF_STATS */
+ /* IFLA_VF_STATS_RX_PACKETS */
+ nla_total_size_64bit(sizeof(__u64)) +
+ /* IFLA_VF_STATS_TX_PACKETS */
+ nla_total_size_64bit(sizeof(__u64)) +
+ /* IFLA_VF_STATS_RX_BYTES */
+ nla_total_size_64bit(sizeof(__u64)) +
+ /* IFLA_VF_STATS_TX_BYTES */
+ nla_total_size_64bit(sizeof(__u64)) +
+ /* IFLA_VF_STATS_BROADCAST */
+ nla_total_size_64bit(sizeof(__u64)) +
+ /* IFLA_VF_STATS_MULTICAST */
+ nla_total_size_64bit(sizeof(__u64)) +
+ /* IFLA_VF_STATS_RX_DROPPED */
+ nla_total_size_64bit(sizeof(__u64)) +
+ /* IFLA_VF_STATS_TX_DROPPED */
+ nla_total_size_64bit(sizeof(__u64)));
+ }
+ if (dev->netdev_ops->ndo_get_vf_guid)
+ size += num_vfs * 2 *
+ nla_total_size(sizeof(struct ifla_vf_guid));
+ return size;
+ } else
+ return 0;
+}
+
+static size_t rtnl_port_size(const struct net_device *dev,
+ u32 ext_filter_mask)
+{
+ size_t port_size = nla_total_size(4) /* PORT_VF */
+ + nla_total_size(PORT_PROFILE_MAX) /* PORT_PROFILE */
+ + nla_total_size(PORT_UUID_MAX) /* PORT_INSTANCE_UUID */
+ + nla_total_size(PORT_UUID_MAX) /* PORT_HOST_UUID */
+ + nla_total_size(1) /* PROT_VDP_REQUEST */
+ + nla_total_size(2); /* PORT_VDP_RESPONSE */
+ size_t vf_ports_size = nla_total_size(sizeof(struct nlattr));
+ size_t vf_port_size = nla_total_size(sizeof(struct nlattr))
+ + port_size;
+ size_t port_self_size = nla_total_size(sizeof(struct nlattr))
+ + port_size;
+
+ if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent ||
+ !(ext_filter_mask & RTEXT_FILTER_VF))
+ return 0;
+ if (dev_num_vf(dev->dev.parent))
+ return port_self_size + vf_ports_size +
+ vf_port_size * dev_num_vf(dev->dev.parent);
+ else
+ return port_self_size;
+}
+
+static size_t rtnl_xdp_size(void)
+{
+ size_t xdp_size = nla_total_size(0) + /* nest IFLA_XDP */
+ nla_total_size(1) + /* XDP_ATTACHED */
+ nla_total_size(4) + /* XDP_PROG_ID (or 1st mode) */
+ nla_total_size(4); /* XDP_<mode>_PROG_ID */
+
+ return xdp_size;
+}
+
+static size_t rtnl_prop_list_size(const struct net_device *dev)
+{
+ struct netdev_name_node *name_node;
+ unsigned int cnt = 0;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(name_node, &dev->name_node->list, list)
+ cnt++;
+ rcu_read_unlock();
+
+ if (!cnt)
+ return 0;
+
+ return nla_total_size(0) + cnt * nla_total_size(ALTIFNAMSIZ);
+}
+
+static size_t rtnl_proto_down_size(const struct net_device *dev)
+{
+ size_t size = nla_total_size(1);
+
+ /* Assume dev->proto_down_reason is not zero. */
+ size += nla_total_size(0) + nla_total_size(4);
+
+ return size;
+}
+
+static size_t rtnl_devlink_port_size(const struct net_device *dev)
+{
+ size_t size = nla_total_size(0); /* nest IFLA_DEVLINK_PORT */
+
+ if (dev->devlink_port)
+ size += devlink_nl_port_handle_size(dev->devlink_port);
+
+ return size;
+}
+
+static size_t rtnl_dpll_pin_size(const struct net_device *dev)
+{
+ size_t size = nla_total_size(0); /* nest IFLA_DPLL_PIN */
+
+ size += dpll_netdev_pin_handle_size(dev);
+
+ return size;
+}
+
+static noinline size_t if_nlmsg_size(const struct net_device *dev,
+ u32 ext_filter_mask)
+{
+ size_t size;
+
+ size = NLMSG_ALIGN(sizeof(struct ifinfomsg))
+ nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */
+ + nla_total_size(IFALIASZ) /* IFLA_IFALIAS */
+ nla_total_size(IFNAMSIZ) /* IFLA_QDISC */
- + nla_total_size(sizeof(struct rtnl_link_ifmap))
- + nla_total_size(sizeof(struct rtnl_link_stats))
+ + nla_total_size_64bit(sizeof(struct rtnl_link_ifmap))
+ nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */
+ nla_total_size(MAX_ADDR_LEN) /* IFLA_BROADCAST */
+ nla_total_size(4) /* IFLA_TXQLEN */
@@ -306,169 +1284,1801 @@ static inline size_t if_nlmsg_size(int iwbuflen)
+ nla_total_size(4) /* IFLA_MTU */
+ nla_total_size(4) /* IFLA_LINK */
+ nla_total_size(4) /* IFLA_MASTER */
+ + nla_total_size(1) /* IFLA_CARRIER */
+ + nla_total_size(4) /* IFLA_PROMISCUITY */
+ + nla_total_size(4) /* IFLA_ALLMULTI */
+ + nla_total_size(4) /* IFLA_NUM_TX_QUEUES */
+ + nla_total_size(4) /* IFLA_NUM_RX_QUEUES */
+ + nla_total_size(4) /* IFLA_GSO_MAX_SEGS */
+ + nla_total_size(4) /* IFLA_GSO_MAX_SIZE */
+ + nla_total_size(4) /* IFLA_GRO_MAX_SIZE */
+ + nla_total_size(4) /* IFLA_GSO_IPV4_MAX_SIZE */
+ + nla_total_size(4) /* IFLA_GRO_IPV4_MAX_SIZE */
+ + nla_total_size(4) /* IFLA_TSO_MAX_SIZE */
+ + nla_total_size(4) /* IFLA_TSO_MAX_SEGS */
+ nla_total_size(1) /* IFLA_OPERSTATE */
+ nla_total_size(1) /* IFLA_LINKMODE */
- + nla_total_size(iwbuflen);
+ + nla_total_size(1) /* IFLA_NETNS_IMMUTABLE */
+ + nla_total_size(4) /* IFLA_CARRIER_CHANGES */
+ + nla_total_size(4) /* IFLA_LINK_NETNSID */
+ + nla_total_size(4) /* IFLA_GROUP */
+ + nla_total_size(ext_filter_mask
+ & RTEXT_FILTER_VF ? 4 : 0) /* IFLA_NUM_VF */
+ + rtnl_vfinfo_size(dev, ext_filter_mask) /* IFLA_VFINFO_LIST */
+ + rtnl_port_size(dev, ext_filter_mask) /* IFLA_VF_PORTS + IFLA_PORT_SELF */
+ + rtnl_link_get_size(dev) /* IFLA_LINKINFO */
+ + rtnl_link_get_af_size(dev, ext_filter_mask) /* IFLA_AF_SPEC */
+ + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_PORT_ID */
+ + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_SWITCH_ID */
+ + nla_total_size(IFNAMSIZ) /* IFLA_PHYS_PORT_NAME */
+ + rtnl_xdp_size() /* IFLA_XDP */
+ + nla_total_size(4) /* IFLA_EVENT */
+ + nla_total_size(4) /* IFLA_NEW_NETNSID */
+ + nla_total_size(4) /* IFLA_NEW_IFINDEX */
+ + rtnl_proto_down_size(dev) /* proto down */
+ + nla_total_size(4) /* IFLA_TARGET_NETNSID */
+ + nla_total_size(4) /* IFLA_CARRIER_UP_COUNT */
+ + nla_total_size(4) /* IFLA_CARRIER_DOWN_COUNT */
+ + nla_total_size(4) /* IFLA_MIN_MTU */
+ + nla_total_size(4) /* IFLA_MAX_MTU */
+ + rtnl_prop_list_size(dev)
+ + nla_total_size(MAX_ADDR_LEN) /* IFLA_PERM_ADDRESS */
+ + rtnl_devlink_port_size(dev)
+ + rtnl_dpll_pin_size(dev)
+ + nla_total_size(8) /* IFLA_MAX_PACING_OFFLOAD_HORIZON */
+ + nla_total_size(2) /* IFLA_HEADROOM */
+ + nla_total_size(2) /* IFLA_TAILROOM */
+ + 0;
+
+ if (!(ext_filter_mask & RTEXT_FILTER_SKIP_STATS))
+ size += nla_total_size(sizeof(struct rtnl_link_stats)) +
+ nla_total_size_64bit(sizeof(struct rtnl_link_stats64));
+
+ return size;
+}
+
+static int rtnl_vf_ports_fill(struct sk_buff *skb, struct net_device *dev)
+{
+ struct nlattr *vf_ports;
+ struct nlattr *vf_port;
+ int vf;
+ int err;
+
+ vf_ports = nla_nest_start_noflag(skb, IFLA_VF_PORTS);
+ if (!vf_ports)
+ return -EMSGSIZE;
+
+ for (vf = 0; vf < dev_num_vf(dev->dev.parent); vf++) {
+ vf_port = nla_nest_start_noflag(skb, IFLA_VF_PORT);
+ if (!vf_port)
+ goto nla_put_failure;
+ if (nla_put_u32(skb, IFLA_PORT_VF, vf))
+ goto nla_put_failure;
+ err = dev->netdev_ops->ndo_get_vf_port(dev, vf, skb);
+ if (err == -EMSGSIZE)
+ goto nla_put_failure;
+ if (err) {
+ nla_nest_cancel(skb, vf_port);
+ continue;
+ }
+ nla_nest_end(skb, vf_port);
+ }
+
+ nla_nest_end(skb, vf_ports);
+
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, vf_ports);
+ return -EMSGSIZE;
+}
+
+static int rtnl_port_self_fill(struct sk_buff *skb, struct net_device *dev)
+{
+ struct nlattr *port_self;
+ int err;
+
+ port_self = nla_nest_start_noflag(skb, IFLA_PORT_SELF);
+ if (!port_self)
+ return -EMSGSIZE;
+
+ err = dev->netdev_ops->ndo_get_vf_port(dev, PORT_SELF_VF, skb);
+ if (err) {
+ nla_nest_cancel(skb, port_self);
+ return (err == -EMSGSIZE) ? err : 0;
+ }
+
+ nla_nest_end(skb, port_self);
+
+ return 0;
+}
+
+static int rtnl_port_fill(struct sk_buff *skb, struct net_device *dev,
+ u32 ext_filter_mask)
+{
+ int err;
+
+ if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent ||
+ !(ext_filter_mask & RTEXT_FILTER_VF))
+ return 0;
+
+ err = rtnl_port_self_fill(skb, dev);
+ if (err)
+ return err;
+
+ if (dev_num_vf(dev->dev.parent)) {
+ err = rtnl_vf_ports_fill(skb, dev);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+static int rtnl_phys_port_id_fill(struct sk_buff *skb, struct net_device *dev)
+{
+ int err;
+ struct netdev_phys_item_id ppid;
+
+ err = dev_get_phys_port_id(dev, &ppid);
+ if (err) {
+ if (err == -EOPNOTSUPP)
+ return 0;
+ return err;
+ }
+
+ if (nla_put(skb, IFLA_PHYS_PORT_ID, ppid.id_len, ppid.id))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static int rtnl_phys_port_name_fill(struct sk_buff *skb, struct net_device *dev)
+{
+ char name[IFNAMSIZ];
+ int err;
+
+ err = dev_get_phys_port_name(dev, name, sizeof(name));
+ if (err) {
+ if (err == -EOPNOTSUPP)
+ return 0;
+ return err;
+ }
+
+ if (nla_put_string(skb, IFLA_PHYS_PORT_NAME, name))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static int rtnl_phys_switch_id_fill(struct sk_buff *skb, struct net_device *dev)
+{
+ struct netdev_phys_item_id ppid = { };
+ int err;
+
+ err = netif_get_port_parent_id(dev, &ppid, false);
+ if (err) {
+ if (err == -EOPNOTSUPP)
+ return 0;
+ return err;
+ }
+
+ if (nla_put(skb, IFLA_PHYS_SWITCH_ID, ppid.id_len, ppid.id))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static noinline_for_stack int rtnl_fill_stats(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ struct rtnl_link_stats64 *sp;
+ struct nlattr *attr;
+
+ attr = nla_reserve_64bit(skb, IFLA_STATS64,
+ sizeof(struct rtnl_link_stats64), IFLA_PAD);
+ if (!attr)
+ return -EMSGSIZE;
+
+ sp = nla_data(attr);
+ dev_get_stats(dev, sp);
+
+ attr = nla_reserve(skb, IFLA_STATS,
+ sizeof(struct rtnl_link_stats));
+ if (!attr)
+ return -EMSGSIZE;
+
+ copy_rtnl_link_stats(nla_data(attr), sp);
+
+ return 0;
+}
+
+static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
+ struct net_device *dev,
+ int vfs_num,
+ u32 ext_filter_mask)
+{
+ struct ifla_vf_rss_query_en vf_rss_query_en;
+ struct nlattr *vf, *vfstats, *vfvlanlist;
+ struct ifla_vf_link_state vf_linkstate;
+ struct ifla_vf_vlan_info vf_vlan_info;
+ struct ifla_vf_spoofchk vf_spoofchk;
+ struct ifla_vf_tx_rate vf_tx_rate;
+ struct ifla_vf_stats vf_stats;
+ struct ifla_vf_trust vf_trust;
+ struct ifla_vf_vlan vf_vlan;
+ struct ifla_vf_rate vf_rate;
+ struct ifla_vf_mac vf_mac;
+ struct ifla_vf_broadcast vf_broadcast;
+ struct ifla_vf_info ivi;
+ struct ifla_vf_guid node_guid;
+ struct ifla_vf_guid port_guid;
+
+ memset(&ivi, 0, sizeof(ivi));
+
+ /* Not all SR-IOV capable drivers support the
+ * spoofcheck and "RSS query enable" query. Preset to
+ * -1 so the user space tool can detect that the driver
+ * didn't report anything.
+ */
+ ivi.spoofchk = -1;
+ ivi.rss_query_en = -1;
+ ivi.trusted = -1;
+ /* The default value for VF link state is "auto"
+ * IFLA_VF_LINK_STATE_AUTO which equals zero
+ */
+ ivi.linkstate = 0;
+ /* VLAN Protocol by default is 802.1Q */
+ ivi.vlan_proto = htons(ETH_P_8021Q);
+ if (dev->netdev_ops->ndo_get_vf_config(dev, vfs_num, &ivi))
+ return 0;
+
+ memset(&vf_vlan_info, 0, sizeof(vf_vlan_info));
+ memset(&node_guid, 0, sizeof(node_guid));
+ memset(&port_guid, 0, sizeof(port_guid));
+
+ vf_mac.vf =
+ vf_vlan.vf =
+ vf_vlan_info.vf =
+ vf_rate.vf =
+ vf_tx_rate.vf =
+ vf_spoofchk.vf =
+ vf_linkstate.vf =
+ vf_rss_query_en.vf =
+ vf_trust.vf =
+ node_guid.vf =
+ port_guid.vf = ivi.vf;
+
+ memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac));
+ memcpy(vf_broadcast.broadcast, dev->broadcast, dev->addr_len);
+ vf_vlan.vlan = ivi.vlan;
+ vf_vlan.qos = ivi.qos;
+ vf_vlan_info.vlan = ivi.vlan;
+ vf_vlan_info.qos = ivi.qos;
+ vf_vlan_info.vlan_proto = ivi.vlan_proto;
+ vf_tx_rate.rate = ivi.max_tx_rate;
+ vf_rate.min_tx_rate = ivi.min_tx_rate;
+ vf_rate.max_tx_rate = ivi.max_tx_rate;
+ vf_spoofchk.setting = ivi.spoofchk;
+ vf_linkstate.link_state = ivi.linkstate;
+ vf_rss_query_en.setting = ivi.rss_query_en;
+ vf_trust.setting = ivi.trusted;
+ vf = nla_nest_start_noflag(skb, IFLA_VF_INFO);
+ if (!vf)
+ return -EMSGSIZE;
+ if (nla_put(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac) ||
+ nla_put(skb, IFLA_VF_BROADCAST, sizeof(vf_broadcast), &vf_broadcast) ||
+ nla_put(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan) ||
+ nla_put(skb, IFLA_VF_RATE, sizeof(vf_rate),
+ &vf_rate) ||
+ nla_put(skb, IFLA_VF_TX_RATE, sizeof(vf_tx_rate),
+ &vf_tx_rate) ||
+ nla_put(skb, IFLA_VF_SPOOFCHK, sizeof(vf_spoofchk),
+ &vf_spoofchk) ||
+ nla_put(skb, IFLA_VF_LINK_STATE, sizeof(vf_linkstate),
+ &vf_linkstate) ||
+ nla_put(skb, IFLA_VF_RSS_QUERY_EN,
+ sizeof(vf_rss_query_en),
+ &vf_rss_query_en) ||
+ nla_put(skb, IFLA_VF_TRUST,
+ sizeof(vf_trust), &vf_trust))
+ goto nla_put_vf_failure;
+
+ if (dev->netdev_ops->ndo_get_vf_guid &&
+ !dev->netdev_ops->ndo_get_vf_guid(dev, vfs_num, &node_guid,
+ &port_guid)) {
+ if (nla_put(skb, IFLA_VF_IB_NODE_GUID, sizeof(node_guid),
+ &node_guid) ||
+ nla_put(skb, IFLA_VF_IB_PORT_GUID, sizeof(port_guid),
+ &port_guid))
+ goto nla_put_vf_failure;
+ }
+ vfvlanlist = nla_nest_start_noflag(skb, IFLA_VF_VLAN_LIST);
+ if (!vfvlanlist)
+ goto nla_put_vf_failure;
+ if (nla_put(skb, IFLA_VF_VLAN_INFO, sizeof(vf_vlan_info),
+ &vf_vlan_info)) {
+ nla_nest_cancel(skb, vfvlanlist);
+ goto nla_put_vf_failure;
+ }
+ nla_nest_end(skb, vfvlanlist);
+ if (~ext_filter_mask & RTEXT_FILTER_SKIP_STATS) {
+ memset(&vf_stats, 0, sizeof(vf_stats));
+ if (dev->netdev_ops->ndo_get_vf_stats)
+ dev->netdev_ops->ndo_get_vf_stats(dev, vfs_num,
+ &vf_stats);
+ vfstats = nla_nest_start_noflag(skb, IFLA_VF_STATS);
+ if (!vfstats)
+ goto nla_put_vf_failure;
+ if (nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_PACKETS,
+ vf_stats.rx_packets, IFLA_VF_STATS_PAD) ||
+ nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_PACKETS,
+ vf_stats.tx_packets, IFLA_VF_STATS_PAD) ||
+ nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_BYTES,
+ vf_stats.rx_bytes, IFLA_VF_STATS_PAD) ||
+ nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_BYTES,
+ vf_stats.tx_bytes, IFLA_VF_STATS_PAD) ||
+ nla_put_u64_64bit(skb, IFLA_VF_STATS_BROADCAST,
+ vf_stats.broadcast, IFLA_VF_STATS_PAD) ||
+ nla_put_u64_64bit(skb, IFLA_VF_STATS_MULTICAST,
+ vf_stats.multicast, IFLA_VF_STATS_PAD) ||
+ nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_DROPPED,
+ vf_stats.rx_dropped, IFLA_VF_STATS_PAD) ||
+ nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_DROPPED,
+ vf_stats.tx_dropped, IFLA_VF_STATS_PAD)) {
+ nla_nest_cancel(skb, vfstats);
+ goto nla_put_vf_failure;
+ }
+ nla_nest_end(skb, vfstats);
+ }
+ nla_nest_end(skb, vf);
+ return 0;
+
+nla_put_vf_failure:
+ nla_nest_cancel(skb, vf);
+ return -EMSGSIZE;
+}
+
+static noinline_for_stack int rtnl_fill_vf(struct sk_buff *skb,
+ struct net_device *dev,
+ u32 ext_filter_mask)
+{
+ struct nlattr *vfinfo;
+ int i, num_vfs;
+
+ if (!dev->dev.parent || ((ext_filter_mask & RTEXT_FILTER_VF) == 0))
+ return 0;
+
+ num_vfs = dev_num_vf(dev->dev.parent);
+ if (nla_put_u32(skb, IFLA_NUM_VF, num_vfs))
+ return -EMSGSIZE;
+
+ if (!dev->netdev_ops->ndo_get_vf_config)
+ return 0;
+
+ vfinfo = nla_nest_start_noflag(skb, IFLA_VFINFO_LIST);
+ if (!vfinfo)
+ return -EMSGSIZE;
+
+ for (i = 0; i < num_vfs; i++) {
+ if (rtnl_fill_vfinfo(skb, dev, i, ext_filter_mask)) {
+ nla_nest_cancel(skb, vfinfo);
+ return -EMSGSIZE;
+ }
+ }
+
+ nla_nest_end(skb, vfinfo);
+ return 0;
+}
+
+static int rtnl_fill_link_ifmap(struct sk_buff *skb,
+ const struct net_device *dev)
+{
+ struct rtnl_link_ifmap map;
+
+ memset(&map, 0, sizeof(map));
+ map.mem_start = READ_ONCE(dev->mem_start);
+ map.mem_end = READ_ONCE(dev->mem_end);
+ map.base_addr = READ_ONCE(dev->base_addr);
+ map.irq = READ_ONCE(dev->irq);
+ map.dma = READ_ONCE(dev->dma);
+ map.port = READ_ONCE(dev->if_port);
+
+ if (nla_put_64bit(skb, IFLA_MAP, sizeof(map), &map, IFLA_PAD))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static u32 rtnl_xdp_prog_skb(struct net_device *dev)
+{
+ const struct bpf_prog *generic_xdp_prog;
+ u32 res = 0;
+
+ rcu_read_lock();
+ generic_xdp_prog = rcu_dereference(dev->xdp_prog);
+ if (generic_xdp_prog)
+ res = generic_xdp_prog->aux->id;
+ rcu_read_unlock();
+
+ return res;
+}
+
+static u32 rtnl_xdp_prog_drv(struct net_device *dev)
+{
+ return dev_xdp_prog_id(dev, XDP_MODE_DRV);
+}
+
+static u32 rtnl_xdp_prog_hw(struct net_device *dev)
+{
+ return dev_xdp_prog_id(dev, XDP_MODE_HW);
+}
+
+static int rtnl_xdp_report_one(struct sk_buff *skb, struct net_device *dev,
+ u32 *prog_id, u8 *mode, u8 tgt_mode, u32 attr,
+ u32 (*get_prog_id)(struct net_device *dev))
+{
+ u32 curr_id;
+ int err;
+
+ curr_id = get_prog_id(dev);
+ if (!curr_id)
+ return 0;
+
+ *prog_id = curr_id;
+ err = nla_put_u32(skb, attr, curr_id);
+ if (err)
+ return err;
+
+ if (*mode != XDP_ATTACHED_NONE)
+ *mode = XDP_ATTACHED_MULTI;
+ else
+ *mode = tgt_mode;
+
+ return 0;
+}
+
+static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev)
+{
+ struct nlattr *xdp;
+ u32 prog_id;
+ int err;
+ u8 mode;
+
+ xdp = nla_nest_start_noflag(skb, IFLA_XDP);
+ if (!xdp)
+ return -EMSGSIZE;
+
+ prog_id = 0;
+ mode = XDP_ATTACHED_NONE;
+ err = rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_SKB,
+ IFLA_XDP_SKB_PROG_ID, rtnl_xdp_prog_skb);
+ if (err)
+ goto err_cancel;
+ err = rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_DRV,
+ IFLA_XDP_DRV_PROG_ID, rtnl_xdp_prog_drv);
+ if (err)
+ goto err_cancel;
+ err = rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_HW,
+ IFLA_XDP_HW_PROG_ID, rtnl_xdp_prog_hw);
+ if (err)
+ goto err_cancel;
+
+ err = nla_put_u8(skb, IFLA_XDP_ATTACHED, mode);
+ if (err)
+ goto err_cancel;
+
+ if (prog_id && mode != XDP_ATTACHED_MULTI) {
+ err = nla_put_u32(skb, IFLA_XDP_PROG_ID, prog_id);
+ if (err)
+ goto err_cancel;
+ }
+
+ nla_nest_end(skb, xdp);
+ return 0;
+
+err_cancel:
+ nla_nest_cancel(skb, xdp);
+ return err;
+}
+
+static u32 rtnl_get_event(unsigned long event)
+{
+ u32 rtnl_event_type = IFLA_EVENT_NONE;
+
+ switch (event) {
+ case NETDEV_REBOOT:
+ rtnl_event_type = IFLA_EVENT_REBOOT;
+ break;
+ case NETDEV_FEAT_CHANGE:
+ rtnl_event_type = IFLA_EVENT_FEATURES;
+ break;
+ case NETDEV_BONDING_FAILOVER:
+ rtnl_event_type = IFLA_EVENT_BONDING_FAILOVER;
+ break;
+ case NETDEV_NOTIFY_PEERS:
+ rtnl_event_type = IFLA_EVENT_NOTIFY_PEERS;
+ break;
+ case NETDEV_RESEND_IGMP:
+ rtnl_event_type = IFLA_EVENT_IGMP_RESEND;
+ break;
+ case NETDEV_CHANGEINFODATA:
+ rtnl_event_type = IFLA_EVENT_BONDING_OPTIONS;
+ break;
+ default:
+ break;
+ }
+
+ return rtnl_event_type;
+}
+
+static int put_master_ifindex(struct sk_buff *skb, struct net_device *dev)
+{
+ const struct net_device *upper_dev;
+ int ret = 0;
+
+ rcu_read_lock();
+
+ upper_dev = netdev_master_upper_dev_get_rcu(dev);
+ if (upper_dev)
+ ret = nla_put_u32(skb, IFLA_MASTER,
+ READ_ONCE(upper_dev->ifindex));
+
+ rcu_read_unlock();
+ return ret;
+}
+
+static int nla_put_iflink(struct sk_buff *skb, const struct net_device *dev,
+ bool force)
+{
+ int iflink = dev_get_iflink(dev);
+
+ if (force || READ_ONCE(dev->ifindex) != iflink)
+ return nla_put_u32(skb, IFLA_LINK, iflink);
+
+ return 0;
+}
+
+static noinline_for_stack int nla_put_ifalias(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ char buf[IFALIASZ];
+ int ret;
+
+ ret = dev_get_alias(dev, buf, sizeof(buf));
+ return ret > 0 ? nla_put_string(skb, IFLA_IFALIAS, buf) : 0;
+}
+
+static int rtnl_fill_link_netnsid(struct sk_buff *skb,
+ const struct net_device *dev,
+ struct net *src_net, gfp_t gfp)
+{
+ bool put_iflink = false;
+
+ if (dev->rtnl_link_ops && dev->rtnl_link_ops->get_link_net) {
+ struct net *link_net = dev->rtnl_link_ops->get_link_net(dev);
+
+ if (!net_eq(dev_net(dev), link_net)) {
+ int id = peernet2id_alloc(src_net, link_net, gfp);
+
+ if (nla_put_s32(skb, IFLA_LINK_NETNSID, id))
+ return -EMSGSIZE;
+
+ put_iflink = true;
+ }
+ }
+
+ return nla_put_iflink(skb, dev, put_iflink);
+}
+
+static int rtnl_fill_link_af(struct sk_buff *skb,
+ const struct net_device *dev,
+ u32 ext_filter_mask)
+{
+ const struct rtnl_af_ops *af_ops;
+ struct nlattr *af_spec;
+
+ af_spec = nla_nest_start_noflag(skb, IFLA_AF_SPEC);
+ if (!af_spec)
+ return -EMSGSIZE;
+
+ list_for_each_entry_rcu(af_ops, &rtnl_af_ops, list) {
+ struct nlattr *af;
+ int err;
+
+ if (!af_ops->fill_link_af)
+ continue;
+
+ af = nla_nest_start_noflag(skb, af_ops->family);
+ if (!af)
+ return -EMSGSIZE;
+
+ err = af_ops->fill_link_af(skb, dev, ext_filter_mask);
+ /*
+ * Caller may return ENODATA to indicate that there
+ * was no data to be dumped. This is not an error, it
+ * means we should trim the attribute header and
+ * continue.
+ */
+ if (err == -ENODATA)
+ nla_nest_cancel(skb, af);
+ else if (err < 0)
+ return -EMSGSIZE;
+
+ nla_nest_end(skb, af);
+ }
+
+ nla_nest_end(skb, af_spec);
+ return 0;
+}
+
+static int rtnl_fill_alt_ifnames(struct sk_buff *skb,
+ const struct net_device *dev)
+{
+ struct netdev_name_node *name_node;
+ int count = 0;
+
+ list_for_each_entry_rcu(name_node, &dev->name_node->list, list) {
+ if (nla_put_string(skb, IFLA_ALT_IFNAME, name_node->name))
+ return -EMSGSIZE;
+ count++;
+ }
+ return count;
+}
+
+/* RCU protected. */
+static int rtnl_fill_prop_list(struct sk_buff *skb,
+ const struct net_device *dev)
+{
+ struct nlattr *prop_list;
+ int ret;
+
+ prop_list = nla_nest_start(skb, IFLA_PROP_LIST);
+ if (!prop_list)
+ return -EMSGSIZE;
+
+ ret = rtnl_fill_alt_ifnames(skb, dev);
+ if (ret <= 0)
+ goto nest_cancel;
+
+ nla_nest_end(skb, prop_list);
+ return 0;
+
+nest_cancel:
+ nla_nest_cancel(skb, prop_list);
+ return ret;
+}
+
+static int rtnl_fill_proto_down(struct sk_buff *skb,
+ const struct net_device *dev)
+{
+ struct nlattr *pr;
+ u32 preason;
+
+ if (nla_put_u8(skb, IFLA_PROTO_DOWN, READ_ONCE(dev->proto_down)))
+ goto nla_put_failure;
+
+ preason = READ_ONCE(dev->proto_down_reason);
+ if (!preason)
+ return 0;
+
+ pr = nla_nest_start(skb, IFLA_PROTO_DOWN_REASON);
+ if (!pr)
+ return -EMSGSIZE;
+
+ if (nla_put_u32(skb, IFLA_PROTO_DOWN_REASON_VALUE, preason)) {
+ nla_nest_cancel(skb, pr);
+ goto nla_put_failure;
+ }
+
+ nla_nest_end(skb, pr);
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
}
-static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
- void *iwbuf, int iwbuflen, int type, u32 pid,
- u32 seq, u32 change, unsigned int flags)
+static int rtnl_fill_devlink_port(struct sk_buff *skb,
+ const struct net_device *dev)
{
+ struct nlattr *devlink_port_nest;
+ int ret;
+
+ devlink_port_nest = nla_nest_start(skb, IFLA_DEVLINK_PORT);
+ if (!devlink_port_nest)
+ return -EMSGSIZE;
+
+ if (dev->devlink_port) {
+ ret = devlink_nl_port_handle_fill(skb, dev->devlink_port);
+ if (ret < 0)
+ goto nest_cancel;
+ }
+
+ nla_nest_end(skb, devlink_port_nest);
+ return 0;
+
+nest_cancel:
+ nla_nest_cancel(skb, devlink_port_nest);
+ return ret;
+}
+
+static int rtnl_fill_dpll_pin(struct sk_buff *skb,
+ const struct net_device *dev)
+{
+ struct nlattr *dpll_pin_nest;
+ int ret;
+
+ dpll_pin_nest = nla_nest_start(skb, IFLA_DPLL_PIN);
+ if (!dpll_pin_nest)
+ return -EMSGSIZE;
+
+ ret = dpll_netdev_add_pin_handle(skb, dev);
+ if (ret < 0)
+ goto nest_cancel;
+
+ nla_nest_end(skb, dpll_pin_nest);
+ return 0;
+
+nest_cancel:
+ nla_nest_cancel(skb, dpll_pin_nest);
+ return ret;
+}
+
+static int rtnl_fill_ifinfo(struct sk_buff *skb,
+ struct net_device *dev, struct net *src_net,
+ int type, u32 pid, u32 seq, u32 change,
+ unsigned int flags, u32 ext_filter_mask,
+ u32 event, int *new_nsid, int new_ifindex,
+ int tgt_netnsid, gfp_t gfp)
+{
+ char devname[IFNAMSIZ];
struct ifinfomsg *ifm;
struct nlmsghdr *nlh;
+ struct Qdisc *qdisc;
+ ASSERT_RTNL();
nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ifm), flags);
if (nlh == NULL)
- return -ENOBUFS;
+ return -EMSGSIZE;
ifm = nlmsg_data(nlh);
ifm->ifi_family = AF_UNSPEC;
ifm->__ifi_pad = 0;
- ifm->ifi_type = dev->type;
- ifm->ifi_index = dev->ifindex;
- ifm->ifi_flags = dev_get_flags(dev);
+ ifm->ifi_type = READ_ONCE(dev->type);
+ ifm->ifi_index = READ_ONCE(dev->ifindex);
+ ifm->ifi_flags = netif_get_flags(dev);
ifm->ifi_change = change;
- NLA_PUT_STRING(skb, IFLA_IFNAME, dev->name);
- NLA_PUT_U32(skb, IFLA_TXQLEN, dev->tx_queue_len);
- NLA_PUT_U32(skb, IFLA_WEIGHT, dev->weight);
- NLA_PUT_U8(skb, IFLA_OPERSTATE,
- netif_running(dev) ? dev->operstate : IF_OPER_DOWN);
- NLA_PUT_U8(skb, IFLA_LINKMODE, dev->link_mode);
- NLA_PUT_U32(skb, IFLA_MTU, dev->mtu);
+ if (tgt_netnsid >= 0 && nla_put_s32(skb, IFLA_TARGET_NETNSID, tgt_netnsid))
+ goto nla_put_failure;
+
+ netdev_copy_name(dev, devname);
+ if (nla_put_string(skb, IFLA_IFNAME, devname))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, IFLA_TXQLEN, READ_ONCE(dev->tx_queue_len)) ||
+ nla_put_u8(skb, IFLA_OPERSTATE,
+ netif_running(dev) ? READ_ONCE(dev->operstate) :
+ IF_OPER_DOWN) ||
+ nla_put_u8(skb, IFLA_LINKMODE, READ_ONCE(dev->link_mode)) ||
+ nla_put_u8(skb, IFLA_NETNS_IMMUTABLE, dev->netns_immutable) ||
+ nla_put_u32(skb, IFLA_MTU, READ_ONCE(dev->mtu)) ||
+ nla_put_u32(skb, IFLA_MIN_MTU, READ_ONCE(dev->min_mtu)) ||
+ nla_put_u32(skb, IFLA_MAX_MTU, READ_ONCE(dev->max_mtu)) ||
+ nla_put_u32(skb, IFLA_GROUP, READ_ONCE(dev->group)) ||
+ nla_put_u32(skb, IFLA_PROMISCUITY, READ_ONCE(dev->promiscuity)) ||
+ nla_put_u32(skb, IFLA_ALLMULTI, READ_ONCE(dev->allmulti)) ||
+ nla_put_u32(skb, IFLA_NUM_TX_QUEUES,
+ READ_ONCE(dev->num_tx_queues)) ||
+ nla_put_u32(skb, IFLA_GSO_MAX_SEGS,
+ READ_ONCE(dev->gso_max_segs)) ||
+ nla_put_u32(skb, IFLA_GSO_MAX_SIZE,
+ READ_ONCE(dev->gso_max_size)) ||
+ nla_put_u32(skb, IFLA_GRO_MAX_SIZE,
+ READ_ONCE(dev->gro_max_size)) ||
+ nla_put_u32(skb, IFLA_GSO_IPV4_MAX_SIZE,
+ READ_ONCE(dev->gso_ipv4_max_size)) ||
+ nla_put_u32(skb, IFLA_GRO_IPV4_MAX_SIZE,
+ READ_ONCE(dev->gro_ipv4_max_size)) ||
+ nla_put_u32(skb, IFLA_TSO_MAX_SIZE,
+ READ_ONCE(dev->tso_max_size)) ||
+ nla_put_u32(skb, IFLA_TSO_MAX_SEGS,
+ READ_ONCE(dev->tso_max_segs)) ||
+ nla_put_uint(skb, IFLA_MAX_PACING_OFFLOAD_HORIZON,
+ READ_ONCE(dev->max_pacing_offload_horizon)) ||
+#ifdef CONFIG_RPS
+ nla_put_u32(skb, IFLA_NUM_RX_QUEUES,
+ READ_ONCE(dev->num_rx_queues)) ||
+#endif
+ put_master_ifindex(skb, dev) ||
+ nla_put_u8(skb, IFLA_CARRIER, netif_carrier_ok(dev)) ||
+ nla_put_ifalias(skb, dev) ||
+ nla_put_u32(skb, IFLA_CARRIER_CHANGES,
+ atomic_read(&dev->carrier_up_count) +
+ atomic_read(&dev->carrier_down_count)) ||
+ nla_put_u32(skb, IFLA_CARRIER_UP_COUNT,
+ atomic_read(&dev->carrier_up_count)) ||
+ nla_put_u32(skb, IFLA_CARRIER_DOWN_COUNT,
+ atomic_read(&dev->carrier_down_count)) ||
+ nla_put_u16(skb, IFLA_HEADROOM,
+ READ_ONCE(dev->needed_headroom)) ||
+ nla_put_u16(skb, IFLA_TAILROOM,
+ READ_ONCE(dev->needed_tailroom)))
+ goto nla_put_failure;
+
+ if (rtnl_fill_proto_down(skb, dev))
+ goto nla_put_failure;
+
+ if (event != IFLA_EVENT_NONE) {
+ if (nla_put_u32(skb, IFLA_EVENT, event))
+ goto nla_put_failure;
+ }
+
+ if (dev->addr_len) {
+ if (nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr) ||
+ nla_put(skb, IFLA_BROADCAST, dev->addr_len, dev->broadcast))
+ goto nla_put_failure;
+ }
+
+ if (rtnl_phys_port_id_fill(skb, dev))
+ goto nla_put_failure;
+
+ if (rtnl_phys_port_name_fill(skb, dev))
+ goto nla_put_failure;
+
+ if (rtnl_phys_switch_id_fill(skb, dev))
+ goto nla_put_failure;
- if (dev->ifindex != dev->iflink)
- NLA_PUT_U32(skb, IFLA_LINK, dev->iflink);
+ if (!(ext_filter_mask & RTEXT_FILTER_SKIP_STATS) &&
+ rtnl_fill_stats(skb, dev))
+ goto nla_put_failure;
- if (dev->master)
- NLA_PUT_U32(skb, IFLA_MASTER, dev->master->ifindex);
+ if (rtnl_fill_vf(skb, dev, ext_filter_mask))
+ goto nla_put_failure;
- if (dev->qdisc_sleeping)
- NLA_PUT_STRING(skb, IFLA_QDISC, dev->qdisc_sleeping->ops->id);
+ if (rtnl_port_fill(skb, dev, ext_filter_mask))
+ goto nla_put_failure;
- if (1) {
- struct rtnl_link_ifmap map = {
- .mem_start = dev->mem_start,
- .mem_end = dev->mem_end,
- .base_addr = dev->base_addr,
- .irq = dev->irq,
- .dma = dev->dma,
- .port = dev->if_port,
- };
- NLA_PUT(skb, IFLA_MAP, sizeof(map), &map);
+ if (rtnl_xdp_fill(skb, dev))
+ goto nla_put_failure;
+
+ if (dev->rtnl_link_ops || rtnl_have_link_slave_info(dev)) {
+ if (rtnl_link_fill(skb, dev) < 0)
+ goto nla_put_failure;
}
- if (dev->addr_len) {
- NLA_PUT(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr);
- NLA_PUT(skb, IFLA_BROADCAST, dev->addr_len, dev->broadcast);
+ if (new_nsid &&
+ nla_put_s32(skb, IFLA_NEW_NETNSID, *new_nsid) < 0)
+ goto nla_put_failure;
+ if (new_ifindex &&
+ nla_put_s32(skb, IFLA_NEW_IFINDEX, new_ifindex) < 0)
+ goto nla_put_failure;
+
+ if (memchr_inv(dev->perm_addr, '\0', dev->addr_len) &&
+ nla_put(skb, IFLA_PERM_ADDRESS, dev->addr_len, dev->perm_addr))
+ goto nla_put_failure;
+
+ rcu_read_lock();
+ if (rtnl_fill_link_netnsid(skb, dev, src_net, GFP_ATOMIC))
+ goto nla_put_failure_rcu;
+ qdisc = rcu_dereference(dev->qdisc);
+ if (qdisc && nla_put_string(skb, IFLA_QDISC, qdisc->ops->id))
+ goto nla_put_failure_rcu;
+ if (rtnl_fill_link_af(skb, dev, ext_filter_mask))
+ goto nla_put_failure_rcu;
+ if (rtnl_fill_link_ifmap(skb, dev))
+ goto nla_put_failure_rcu;
+ if (rtnl_fill_prop_list(skb, dev))
+ goto nla_put_failure_rcu;
+ rcu_read_unlock();
+
+ if (dev->dev.parent &&
+ nla_put_string(skb, IFLA_PARENT_DEV_NAME,
+ dev_name(dev->dev.parent)))
+ goto nla_put_failure;
+
+ if (dev->dev.parent && dev->dev.parent->bus &&
+ nla_put_string(skb, IFLA_PARENT_DEV_BUS_NAME,
+ dev->dev.parent->bus->name))
+ goto nla_put_failure;
+
+ if (rtnl_fill_devlink_port(skb, dev))
+ goto nla_put_failure;
+
+ if (rtnl_fill_dpll_pin(skb, dev))
+ goto nla_put_failure;
+
+ nlmsg_end(skb, nlh);
+ return 0;
+
+nla_put_failure_rcu:
+ rcu_read_unlock();
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
+ [IFLA_UNSPEC] = { .strict_start_type = IFLA_DPLL_PIN },
+ [IFLA_IFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ-1 },
+ [IFLA_ADDRESS] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN },
+ [IFLA_BROADCAST] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN },
+ [IFLA_MAP] = { .len = sizeof(struct rtnl_link_ifmap) },
+ [IFLA_MTU] = { .type = NLA_U32 },
+ [IFLA_LINK] = { .type = NLA_U32 },
+ [IFLA_MASTER] = { .type = NLA_U32 },
+ [IFLA_CARRIER] = { .type = NLA_U8 },
+ [IFLA_TXQLEN] = { .type = NLA_U32 },
+ [IFLA_WEIGHT] = { .type = NLA_U32 },
+ [IFLA_OPERSTATE] = { .type = NLA_U8 },
+ [IFLA_LINKMODE] = { .type = NLA_U8 },
+ [IFLA_LINKINFO] = { .type = NLA_NESTED },
+ [IFLA_NET_NS_PID] = { .type = NLA_U32 },
+ [IFLA_NET_NS_FD] = { .type = NLA_U32 },
+ /* IFLA_IFALIAS is a string, but policy is set to NLA_BINARY to
+ * allow 0-length string (needed to remove an alias).
+ */
+ [IFLA_IFALIAS] = { .type = NLA_BINARY, .len = IFALIASZ - 1 },
+ [IFLA_VFINFO_LIST] = {. type = NLA_NESTED },
+ [IFLA_VF_PORTS] = { .type = NLA_NESTED },
+ [IFLA_PORT_SELF] = { .type = NLA_NESTED },
+ [IFLA_AF_SPEC] = { .type = NLA_NESTED },
+ [IFLA_EXT_MASK] = { .type = NLA_U32 },
+ [IFLA_PROMISCUITY] = { .type = NLA_U32 },
+ [IFLA_NUM_TX_QUEUES] = { .type = NLA_U32 },
+ [IFLA_NUM_RX_QUEUES] = { .type = NLA_U32 },
+ [IFLA_GSO_MAX_SEGS] = { .type = NLA_U32 },
+ [IFLA_GSO_MAX_SIZE] = NLA_POLICY_MIN(NLA_U32, MAX_TCP_HEADER + 1),
+ [IFLA_PHYS_PORT_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN },
+ [IFLA_CARRIER_CHANGES] = { .type = NLA_U32 }, /* ignored */
+ [IFLA_PHYS_SWITCH_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN },
+ [IFLA_LINK_NETNSID] = { .type = NLA_S32 },
+ [IFLA_PROTO_DOWN] = { .type = NLA_U8 },
+ [IFLA_XDP] = { .type = NLA_NESTED },
+ [IFLA_EVENT] = { .type = NLA_U32 },
+ [IFLA_GROUP] = { .type = NLA_U32 },
+ [IFLA_TARGET_NETNSID] = { .type = NLA_S32 },
+ [IFLA_CARRIER_UP_COUNT] = { .type = NLA_U32 },
+ [IFLA_CARRIER_DOWN_COUNT] = { .type = NLA_U32 },
+ [IFLA_MIN_MTU] = { .type = NLA_U32 },
+ [IFLA_MAX_MTU] = { .type = NLA_U32 },
+ [IFLA_PROP_LIST] = { .type = NLA_NESTED },
+ [IFLA_ALT_IFNAME] = { .type = NLA_STRING,
+ .len = ALTIFNAMSIZ - 1 },
+ [IFLA_PERM_ADDRESS] = { .type = NLA_REJECT },
+ [IFLA_PROTO_DOWN_REASON] = { .type = NLA_NESTED },
+ [IFLA_NEW_IFINDEX] = NLA_POLICY_MIN(NLA_S32, 1),
+ [IFLA_PARENT_DEV_NAME] = { .type = NLA_NUL_STRING },
+ [IFLA_GRO_MAX_SIZE] = { .type = NLA_U32 },
+ [IFLA_TSO_MAX_SIZE] = { .type = NLA_REJECT },
+ [IFLA_TSO_MAX_SEGS] = { .type = NLA_REJECT },
+ [IFLA_ALLMULTI] = { .type = NLA_REJECT },
+ [IFLA_GSO_IPV4_MAX_SIZE] = NLA_POLICY_MIN(NLA_U32, MAX_TCP_HEADER + 1),
+ [IFLA_GRO_IPV4_MAX_SIZE] = { .type = NLA_U32 },
+ [IFLA_NETNS_IMMUTABLE] = { .type = NLA_REJECT },
+ [IFLA_HEADROOM] = { .type = NLA_REJECT },
+ [IFLA_TAILROOM] = { .type = NLA_REJECT },
+};
+
+static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
+ [IFLA_INFO_KIND] = { .type = NLA_STRING },
+ [IFLA_INFO_DATA] = { .type = NLA_NESTED },
+ [IFLA_INFO_SLAVE_KIND] = { .type = NLA_STRING },
+ [IFLA_INFO_SLAVE_DATA] = { .type = NLA_NESTED },
+};
+
+static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = {
+ [IFLA_VF_MAC] = { .len = sizeof(struct ifla_vf_mac) },
+ [IFLA_VF_BROADCAST] = { .type = NLA_REJECT },
+ [IFLA_VF_VLAN] = { .len = sizeof(struct ifla_vf_vlan) },
+ [IFLA_VF_VLAN_LIST] = { .type = NLA_NESTED },
+ [IFLA_VF_TX_RATE] = { .len = sizeof(struct ifla_vf_tx_rate) },
+ [IFLA_VF_SPOOFCHK] = { .len = sizeof(struct ifla_vf_spoofchk) },
+ [IFLA_VF_RATE] = { .len = sizeof(struct ifla_vf_rate) },
+ [IFLA_VF_LINK_STATE] = { .len = sizeof(struct ifla_vf_link_state) },
+ [IFLA_VF_RSS_QUERY_EN] = { .len = sizeof(struct ifla_vf_rss_query_en) },
+ [IFLA_VF_STATS] = { .type = NLA_NESTED },
+ [IFLA_VF_TRUST] = { .len = sizeof(struct ifla_vf_trust) },
+ [IFLA_VF_IB_NODE_GUID] = { .len = sizeof(struct ifla_vf_guid) },
+ [IFLA_VF_IB_PORT_GUID] = { .len = sizeof(struct ifla_vf_guid) },
+};
+
+static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = {
+ [IFLA_PORT_VF] = { .type = NLA_U32 },
+ [IFLA_PORT_PROFILE] = { .type = NLA_STRING,
+ .len = PORT_PROFILE_MAX },
+ [IFLA_PORT_INSTANCE_UUID] = { .type = NLA_BINARY,
+ .len = PORT_UUID_MAX },
+ [IFLA_PORT_HOST_UUID] = { .type = NLA_STRING,
+ .len = PORT_UUID_MAX },
+ [IFLA_PORT_REQUEST] = { .type = NLA_U8, },
+ [IFLA_PORT_RESPONSE] = { .type = NLA_U16, },
+
+ /* Unused, but we need to keep it here since user space could
+ * fill it. It's also broken with regard to NLA_BINARY use in
+ * combination with structs.
+ */
+ [IFLA_PORT_VSI_TYPE] = { .type = NLA_BINARY,
+ .len = sizeof(struct ifla_port_vsi) },
+};
+
+static const struct nla_policy ifla_xdp_policy[IFLA_XDP_MAX + 1] = {
+ [IFLA_XDP_UNSPEC] = { .strict_start_type = IFLA_XDP_EXPECTED_FD },
+ [IFLA_XDP_FD] = { .type = NLA_S32 },
+ [IFLA_XDP_EXPECTED_FD] = { .type = NLA_S32 },
+ [IFLA_XDP_ATTACHED] = { .type = NLA_U8 },
+ [IFLA_XDP_FLAGS] = { .type = NLA_U32 },
+ [IFLA_XDP_PROG_ID] = { .type = NLA_U32 },
+};
+
+static struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla,
+ int *ops_srcu_index)
+{
+ struct nlattr *linfo[IFLA_INFO_MAX + 1];
+ struct rtnl_link_ops *ops = NULL;
+
+ if (nla_parse_nested_deprecated(linfo, IFLA_INFO_MAX, nla, ifla_info_policy, NULL) < 0)
+ return NULL;
+
+ if (linfo[IFLA_INFO_KIND]) {
+ char kind[MODULE_NAME_LEN];
+
+ nla_strscpy(kind, linfo[IFLA_INFO_KIND], sizeof(kind));
+ ops = rtnl_link_ops_get(kind, ops_srcu_index);
}
- if (dev->get_stats) {
- struct net_device_stats *stats = dev->get_stats(dev);
- if (stats) {
- struct nlattr *attr;
+ return ops;
+}
- attr = nla_reserve(skb, IFLA_STATS,
- sizeof(struct rtnl_link_stats));
- if (attr == NULL)
- goto nla_put_failure;
+static bool link_master_filtered(struct net_device *dev, int master_idx)
+{
+ struct net_device *master;
- copy_rtnl_link_stats(nla_data(attr), stats);
- }
+ if (!master_idx)
+ return false;
+
+ master = netdev_master_upper_dev_get(dev);
+
+ /* 0 is already used to denote IFLA_MASTER wasn't passed, therefore need
+ * another invalid value for ifindex to denote "no master".
+ */
+ if (master_idx == -1)
+ return !!master;
+
+ if (!master || master->ifindex != master_idx)
+ return true;
+
+ return false;
+}
+
+static bool link_kind_filtered(const struct net_device *dev,
+ const struct rtnl_link_ops *kind_ops)
+{
+ if (kind_ops && dev->rtnl_link_ops != kind_ops)
+ return true;
+
+ return false;
+}
+
+static bool link_dump_filtered(struct net_device *dev,
+ int master_idx,
+ const struct rtnl_link_ops *kind_ops)
+{
+ if (link_master_filtered(dev, master_idx) ||
+ link_kind_filtered(dev, kind_ops))
+ return true;
+
+ return false;
+}
+
+/**
+ * rtnl_get_net_ns_capable - Get netns if sufficiently privileged.
+ * @sk: netlink socket
+ * @netnsid: network namespace identifier
+ *
+ * Returns the network namespace identified by netnsid on success or an error
+ * pointer on failure.
+ */
+struct net *rtnl_get_net_ns_capable(struct sock *sk, int netnsid)
+{
+ struct net *net;
+
+ net = get_net_ns_by_id(sock_net(sk), netnsid);
+ if (!net)
+ return ERR_PTR(-EINVAL);
+
+ /* For now, the caller is required to have CAP_NET_ADMIN in
+ * the user namespace owning the target net ns.
+ */
+ if (!sk_ns_capable(sk, net->user_ns, CAP_NET_ADMIN)) {
+ put_net(net);
+ return ERR_PTR(-EACCES);
}
+ return net;
+}
+EXPORT_SYMBOL_GPL(rtnl_get_net_ns_capable);
- if (iwbuf)
- NLA_PUT(skb, IFLA_WIRELESS, iwbuflen, iwbuf);
+static int rtnl_valid_dump_ifinfo_req(const struct nlmsghdr *nlh,
+ bool strict_check, struct nlattr **tb,
+ struct netlink_ext_ack *extack)
+{
+ int hdrlen;
- return nlmsg_end(skb, nlh);
+ if (strict_check) {
+ struct ifinfomsg *ifm;
-nla_put_failure:
- return nlmsg_cancel(skb, nlh);
+ ifm = nlmsg_payload(nlh, sizeof(*ifm));
+ if (!ifm) {
+ NL_SET_ERR_MSG(extack, "Invalid header for link dump");
+ return -EINVAL;
+ }
+
+ if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags ||
+ ifm->ifi_change) {
+ NL_SET_ERR_MSG(extack, "Invalid values in header for link dump request");
+ return -EINVAL;
+ }
+ if (ifm->ifi_index) {
+ NL_SET_ERR_MSG(extack, "Filter by device index not supported for link dumps");
+ return -EINVAL;
+ }
+
+ return nlmsg_parse_deprecated_strict(nlh, sizeof(*ifm), tb,
+ IFLA_MAX, ifla_policy,
+ extack);
+ }
+
+ /* A hack to preserve kernel<->userspace interface.
+ * The correct header is ifinfomsg. It is consistent with rtnl_getlink.
+ * However, before Linux v3.9 the code here assumed rtgenmsg and that's
+ * what iproute2 < v3.9.0 used.
+ * We can detect the old iproute2. Even including the IFLA_EXT_MASK
+ * attribute, its netlink message is shorter than struct ifinfomsg.
+ */
+ hdrlen = nlmsg_len(nlh) < sizeof(struct ifinfomsg) ?
+ sizeof(struct rtgenmsg) : sizeof(struct ifinfomsg);
+
+ return nlmsg_parse_deprecated(nlh, hdrlen, tb, IFLA_MAX, ifla_policy,
+ extack);
}
static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
{
- int idx;
- int s_idx = cb->args[0];
+ struct netlink_ext_ack *extack = cb->extack;
+ struct rtnl_link_ops *kind_ops = NULL;
+ const struct nlmsghdr *nlh = cb->nlh;
+ struct net *net = sock_net(skb->sk);
+ unsigned int flags = NLM_F_MULTI;
+ struct nlattr *tb[IFLA_MAX+1];
+ struct {
+ unsigned long ifindex;
+ } *ctx = (void *)cb->ctx;
+ struct net *tgt_net = net;
+ u32 ext_filter_mask = 0;
struct net_device *dev;
+ int ops_srcu_index;
+ int master_idx = 0;
+ int netnsid = -1;
+ int err, i;
- read_lock(&dev_base_lock);
- for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) {
- if (idx < s_idx)
+ err = rtnl_valid_dump_ifinfo_req(nlh, cb->strict_check, tb, extack);
+ if (err < 0) {
+ if (cb->strict_check)
+ return err;
+
+ goto walk_entries;
+ }
+
+ for (i = 0; i <= IFLA_MAX; ++i) {
+ if (!tb[i])
continue;
- if (rtnl_fill_ifinfo(skb, dev, NULL, 0, RTM_NEWLINK,
- NETLINK_CB(cb->skb).pid,
- cb->nlh->nlmsg_seq, 0, NLM_F_MULTI) <= 0)
+
+ /* new attributes should only be added with strict checking */
+ switch (i) {
+ case IFLA_TARGET_NETNSID:
+ netnsid = nla_get_s32(tb[i]);
+ tgt_net = rtnl_get_net_ns_capable(skb->sk, netnsid);
+ if (IS_ERR(tgt_net)) {
+ NL_SET_ERR_MSG(extack, "Invalid target network namespace id");
+ err = PTR_ERR(tgt_net);
+ netnsid = -1;
+ goto out;
+ }
+ break;
+ case IFLA_EXT_MASK:
+ ext_filter_mask = nla_get_u32(tb[i]);
+ break;
+ case IFLA_MASTER:
+ master_idx = nla_get_u32(tb[i]);
break;
+ case IFLA_LINKINFO:
+ kind_ops = linkinfo_to_kind_ops(tb[i], &ops_srcu_index);
+ break;
+ default:
+ if (cb->strict_check) {
+ NL_SET_ERR_MSG(extack, "Unsupported attribute in link dump request");
+ err = -EINVAL;
+ goto out;
+ }
+ }
}
- read_unlock(&dev_base_lock);
- cb->args[0] = idx;
- return skb->len;
+ if (master_idx || kind_ops)
+ flags |= NLM_F_DUMP_FILTERED;
+
+walk_entries:
+ err = 0;
+ for_each_netdev_dump(tgt_net, dev, ctx->ifindex) {
+ if (link_dump_filtered(dev, master_idx, kind_ops))
+ continue;
+ err = rtnl_fill_ifinfo(skb, dev, net, RTM_NEWLINK,
+ NETLINK_CB(cb->skb).portid,
+ nlh->nlmsg_seq, 0, flags,
+ ext_filter_mask, 0, NULL, 0,
+ netnsid, GFP_KERNEL);
+ if (err < 0)
+ break;
+ }
+
+
+ cb->seq = tgt_net->dev_base_seq;
+ nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+
+out:
+
+ if (kind_ops)
+ rtnl_link_ops_put(kind_ops, ops_srcu_index);
+ if (netnsid >= 0)
+ put_net(tgt_net);
+
+ return err;
}
-static struct nla_policy ifla_policy[IFLA_MAX+1] __read_mostly = {
- [IFLA_IFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ-1 },
- [IFLA_MAP] = { .len = sizeof(struct rtnl_link_ifmap) },
- [IFLA_MTU] = { .type = NLA_U32 },
- [IFLA_TXQLEN] = { .type = NLA_U32 },
- [IFLA_WEIGHT] = { .type = NLA_U32 },
- [IFLA_OPERSTATE] = { .type = NLA_U8 },
- [IFLA_LINKMODE] = { .type = NLA_U8 },
+int rtnl_nla_parse_ifinfomsg(struct nlattr **tb, const struct nlattr *nla_peer,
+ struct netlink_ext_ack *exterr)
+{
+ const struct ifinfomsg *ifmp;
+ const struct nlattr *attrs;
+ size_t len;
+
+ ifmp = nla_data(nla_peer);
+ attrs = nla_data(nla_peer) + sizeof(struct ifinfomsg);
+ len = nla_len(nla_peer) - sizeof(struct ifinfomsg);
+
+ if (ifmp->ifi_index < 0) {
+ NL_SET_ERR_MSG_ATTR(exterr, nla_peer,
+ "ifindex can't be negative");
+ return -EINVAL;
+ }
+
+ return nla_parse_deprecated(tb, IFLA_MAX, attrs, len, ifla_policy,
+ exterr);
+}
+EXPORT_SYMBOL(rtnl_nla_parse_ifinfomsg);
+
+static struct net *rtnl_link_get_net_ifla(struct nlattr *tb[])
+{
+ struct net *net = NULL;
+
+ /* Examine the link attributes and figure out which
+ * network namespace we are talking about.
+ */
+ if (tb[IFLA_NET_NS_PID])
+ net = get_net_ns_by_pid(nla_get_u32(tb[IFLA_NET_NS_PID]));
+ else if (tb[IFLA_NET_NS_FD])
+ net = get_net_ns_by_fd(nla_get_u32(tb[IFLA_NET_NS_FD]));
+
+ return net;
+}
+
+struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[])
+{
+ struct net *net = rtnl_link_get_net_ifla(tb);
+
+ if (!net)
+ net = get_net(src_net);
+
+ return net;
+}
+EXPORT_SYMBOL(rtnl_link_get_net);
+
+/* Figure out which network namespace we are talking about by
+ * examining the link attributes in the following order:
+ *
+ * 1. IFLA_NET_NS_PID
+ * 2. IFLA_NET_NS_FD
+ * 3. IFLA_TARGET_NETNSID
+ */
+static struct net *rtnl_link_get_net_by_nlattr(struct net *src_net,
+ struct nlattr *tb[])
+{
+ struct net *net;
+
+ if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD])
+ return rtnl_link_get_net(src_net, tb);
+
+ if (!tb[IFLA_TARGET_NETNSID])
+ return get_net(src_net);
+
+ net = get_net_ns_by_id(src_net, nla_get_u32(tb[IFLA_TARGET_NETNSID]));
+ if (!net)
+ return ERR_PTR(-EINVAL);
+
+ return net;
+}
+
+static struct net *rtnl_link_get_net_capable(const struct sk_buff *skb,
+ struct net *src_net,
+ struct nlattr *tb[], int cap)
+{
+ struct net *net;
+
+ net = rtnl_link_get_net_by_nlattr(src_net, tb);
+ if (IS_ERR(net))
+ return net;
+
+ if (!netlink_ns_capable(skb, net->user_ns, cap)) {
+ put_net(net);
+ return ERR_PTR(-EPERM);
+ }
+
+ return net;
+}
+
+/* Verify that rtnetlink requests do not pass additional properties
+ * potentially referring to different network namespaces.
+ */
+static int rtnl_ensure_unique_netns(struct nlattr *tb[],
+ struct netlink_ext_ack *extack,
+ bool netns_id_only)
+{
+
+ if (netns_id_only) {
+ if (!tb[IFLA_NET_NS_PID] && !tb[IFLA_NET_NS_FD])
+ return 0;
+
+ NL_SET_ERR_MSG(extack, "specified netns attribute not supported");
+ return -EOPNOTSUPP;
+ }
+
+ if (tb[IFLA_TARGET_NETNSID] && (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD]))
+ goto invalid_attr;
+
+ if (tb[IFLA_NET_NS_PID] && (tb[IFLA_TARGET_NETNSID] || tb[IFLA_NET_NS_FD]))
+ goto invalid_attr;
+
+ if (tb[IFLA_NET_NS_FD] && (tb[IFLA_TARGET_NETNSID] || tb[IFLA_NET_NS_PID]))
+ goto invalid_attr;
+
+ return 0;
+
+invalid_attr:
+ NL_SET_ERR_MSG(extack, "multiple netns identifying attributes specified");
+ return -EINVAL;
+}
+
+static int rtnl_set_vf_rate(struct net_device *dev, int vf, int min_tx_rate,
+ int max_tx_rate)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ if (!ops->ndo_set_vf_rate)
+ return -EOPNOTSUPP;
+ if (max_tx_rate && max_tx_rate < min_tx_rate)
+ return -EINVAL;
+
+ return ops->ndo_set_vf_rate(dev, vf, min_tx_rate, max_tx_rate);
+}
+
+static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[],
+ struct netlink_ext_ack *extack)
+{
+ if (tb[IFLA_ADDRESS] &&
+ nla_len(tb[IFLA_ADDRESS]) < dev->addr_len)
+ return -EINVAL;
+
+ if (tb[IFLA_BROADCAST] &&
+ nla_len(tb[IFLA_BROADCAST]) < dev->addr_len)
+ return -EINVAL;
+
+ if (tb[IFLA_GSO_MAX_SIZE] &&
+ nla_get_u32(tb[IFLA_GSO_MAX_SIZE]) > dev->tso_max_size) {
+ NL_SET_ERR_MSG(extack, "too big gso_max_size");
+ return -EINVAL;
+ }
+
+ if (tb[IFLA_GSO_MAX_SEGS] &&
+ (nla_get_u32(tb[IFLA_GSO_MAX_SEGS]) > GSO_MAX_SEGS ||
+ nla_get_u32(tb[IFLA_GSO_MAX_SEGS]) > dev->tso_max_segs)) {
+ NL_SET_ERR_MSG(extack, "too big gso_max_segs");
+ return -EINVAL;
+ }
+
+ if (tb[IFLA_GRO_MAX_SIZE] &&
+ nla_get_u32(tb[IFLA_GRO_MAX_SIZE]) > GRO_MAX_SIZE) {
+ NL_SET_ERR_MSG(extack, "too big gro_max_size");
+ return -EINVAL;
+ }
+
+ if (tb[IFLA_GSO_IPV4_MAX_SIZE] &&
+ nla_get_u32(tb[IFLA_GSO_IPV4_MAX_SIZE]) > dev->tso_max_size) {
+ NL_SET_ERR_MSG(extack, "too big gso_ipv4_max_size");
+ return -EINVAL;
+ }
+
+ if (tb[IFLA_GRO_IPV4_MAX_SIZE] &&
+ nla_get_u32(tb[IFLA_GRO_IPV4_MAX_SIZE]) > GRO_MAX_SIZE) {
+ NL_SET_ERR_MSG(extack, "too big gro_ipv4_max_size");
+ return -EINVAL;
+ }
+
+ if (tb[IFLA_AF_SPEC]) {
+ struct nlattr *af;
+ int rem, err;
+
+ nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) {
+ struct rtnl_af_ops *af_ops;
+ int af_ops_srcu_index;
+
+ af_ops = rtnl_af_lookup(nla_type(af), &af_ops_srcu_index);
+ if (!af_ops)
+ return -EAFNOSUPPORT;
+
+ if (!af_ops->set_link_af)
+ err = -EOPNOTSUPP;
+ else if (af_ops->validate_link_af)
+ err = af_ops->validate_link_af(dev, af, extack);
+ else
+ err = 0;
+
+ rtnl_af_put(af_ops, af_ops_srcu_index);
+
+ if (err < 0)
+ return err;
+ }
+ }
+
+ return 0;
+}
+
+static int handle_infiniband_guid(struct net_device *dev, struct ifla_vf_guid *ivt,
+ int guid_type)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ return ops->ndo_set_vf_guid(dev, ivt->vf, ivt->guid, guid_type);
+}
+
+static int handle_vf_guid(struct net_device *dev, struct ifla_vf_guid *ivt, int guid_type)
+{
+ if (dev->type != ARPHRD_INFINIBAND)
+ return -EOPNOTSUPP;
+
+ return handle_infiniband_guid(dev, ivt, guid_type);
+}
+
+static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+ int err = -EINVAL;
+
+ if (tb[IFLA_VF_MAC]) {
+ struct ifla_vf_mac *ivm = nla_data(tb[IFLA_VF_MAC]);
+
+ if (ivm->vf >= INT_MAX)
+ return -EINVAL;
+ err = -EOPNOTSUPP;
+ if (ops->ndo_set_vf_mac)
+ err = ops->ndo_set_vf_mac(dev, ivm->vf,
+ ivm->mac);
+ if (err < 0)
+ return err;
+ }
+
+ if (tb[IFLA_VF_VLAN]) {
+ struct ifla_vf_vlan *ivv = nla_data(tb[IFLA_VF_VLAN]);
+
+ if (ivv->vf >= INT_MAX)
+ return -EINVAL;
+ err = -EOPNOTSUPP;
+ if (ops->ndo_set_vf_vlan)
+ err = ops->ndo_set_vf_vlan(dev, ivv->vf, ivv->vlan,
+ ivv->qos,
+ htons(ETH_P_8021Q));
+ if (err < 0)
+ return err;
+ }
+
+ if (tb[IFLA_VF_VLAN_LIST]) {
+ struct ifla_vf_vlan_info *ivvl[MAX_VLAN_LIST_LEN];
+ struct nlattr *attr;
+ int rem, len = 0;
+
+ err = -EOPNOTSUPP;
+ if (!ops->ndo_set_vf_vlan)
+ return err;
+
+ nla_for_each_nested(attr, tb[IFLA_VF_VLAN_LIST], rem) {
+ if (nla_type(attr) != IFLA_VF_VLAN_INFO ||
+ nla_len(attr) < sizeof(struct ifla_vf_vlan_info)) {
+ return -EINVAL;
+ }
+ if (len >= MAX_VLAN_LIST_LEN)
+ return -EOPNOTSUPP;
+ ivvl[len] = nla_data(attr);
+
+ len++;
+ }
+ if (len == 0)
+ return -EINVAL;
+
+ if (ivvl[0]->vf >= INT_MAX)
+ return -EINVAL;
+ err = ops->ndo_set_vf_vlan(dev, ivvl[0]->vf, ivvl[0]->vlan,
+ ivvl[0]->qos, ivvl[0]->vlan_proto);
+ if (err < 0)
+ return err;
+ }
+
+ if (tb[IFLA_VF_TX_RATE]) {
+ struct ifla_vf_tx_rate *ivt = nla_data(tb[IFLA_VF_TX_RATE]);
+ struct ifla_vf_info ivf;
+
+ if (ivt->vf >= INT_MAX)
+ return -EINVAL;
+ err = -EOPNOTSUPP;
+ if (ops->ndo_get_vf_config)
+ err = ops->ndo_get_vf_config(dev, ivt->vf, &ivf);
+ if (err < 0)
+ return err;
+
+ err = rtnl_set_vf_rate(dev, ivt->vf,
+ ivf.min_tx_rate, ivt->rate);
+ if (err < 0)
+ return err;
+ }
+
+ if (tb[IFLA_VF_RATE]) {
+ struct ifla_vf_rate *ivt = nla_data(tb[IFLA_VF_RATE]);
+
+ if (ivt->vf >= INT_MAX)
+ return -EINVAL;
+
+ err = rtnl_set_vf_rate(dev, ivt->vf,
+ ivt->min_tx_rate, ivt->max_tx_rate);
+ if (err < 0)
+ return err;
+ }
+
+ if (tb[IFLA_VF_SPOOFCHK]) {
+ struct ifla_vf_spoofchk *ivs = nla_data(tb[IFLA_VF_SPOOFCHK]);
+
+ if (ivs->vf >= INT_MAX)
+ return -EINVAL;
+ err = -EOPNOTSUPP;
+ if (ops->ndo_set_vf_spoofchk)
+ err = ops->ndo_set_vf_spoofchk(dev, ivs->vf,
+ ivs->setting);
+ if (err < 0)
+ return err;
+ }
+
+ if (tb[IFLA_VF_LINK_STATE]) {
+ struct ifla_vf_link_state *ivl = nla_data(tb[IFLA_VF_LINK_STATE]);
+
+ if (ivl->vf >= INT_MAX)
+ return -EINVAL;
+ err = -EOPNOTSUPP;
+ if (ops->ndo_set_vf_link_state)
+ err = ops->ndo_set_vf_link_state(dev, ivl->vf,
+ ivl->link_state);
+ if (err < 0)
+ return err;
+ }
+
+ if (tb[IFLA_VF_RSS_QUERY_EN]) {
+ struct ifla_vf_rss_query_en *ivrssq_en;
+
+ err = -EOPNOTSUPP;
+ ivrssq_en = nla_data(tb[IFLA_VF_RSS_QUERY_EN]);
+ if (ivrssq_en->vf >= INT_MAX)
+ return -EINVAL;
+ if (ops->ndo_set_vf_rss_query_en)
+ err = ops->ndo_set_vf_rss_query_en(dev, ivrssq_en->vf,
+ ivrssq_en->setting);
+ if (err < 0)
+ return err;
+ }
+
+ if (tb[IFLA_VF_TRUST]) {
+ struct ifla_vf_trust *ivt = nla_data(tb[IFLA_VF_TRUST]);
+
+ if (ivt->vf >= INT_MAX)
+ return -EINVAL;
+ err = -EOPNOTSUPP;
+ if (ops->ndo_set_vf_trust)
+ err = ops->ndo_set_vf_trust(dev, ivt->vf, ivt->setting);
+ if (err < 0)
+ return err;
+ }
+
+ if (tb[IFLA_VF_IB_NODE_GUID]) {
+ struct ifla_vf_guid *ivt = nla_data(tb[IFLA_VF_IB_NODE_GUID]);
+
+ if (ivt->vf >= INT_MAX)
+ return -EINVAL;
+ if (!ops->ndo_set_vf_guid)
+ return -EOPNOTSUPP;
+ return handle_vf_guid(dev, ivt, IFLA_VF_IB_NODE_GUID);
+ }
+
+ if (tb[IFLA_VF_IB_PORT_GUID]) {
+ struct ifla_vf_guid *ivt = nla_data(tb[IFLA_VF_IB_PORT_GUID]);
+
+ if (ivt->vf >= INT_MAX)
+ return -EINVAL;
+ if (!ops->ndo_set_vf_guid)
+ return -EOPNOTSUPP;
+
+ return handle_vf_guid(dev, ivt, IFLA_VF_IB_PORT_GUID);
+ }
+
+ return err;
+}
+
+static int do_set_master(struct net_device *dev, int ifindex,
+ struct netlink_ext_ack *extack)
+{
+ struct net_device *upper_dev = netdev_master_upper_dev_get(dev);
+ const struct net_device_ops *ops;
+ int err;
+
+ /* Release the lower lock, the upper is responsible for locking
+ * the lower if needed. None of the existing upper devices
+ * use netdev instance lock, so don't grab it.
+ */
+
+ if (upper_dev) {
+ if (upper_dev->ifindex == ifindex)
+ return 0;
+ ops = upper_dev->netdev_ops;
+ if (ops->ndo_del_slave) {
+ netdev_unlock_ops(dev);
+ err = ops->ndo_del_slave(upper_dev, dev);
+ netdev_lock_ops(dev);
+ if (err)
+ return err;
+ } else {
+ return -EOPNOTSUPP;
+ }
+ }
+
+ if (ifindex) {
+ upper_dev = __dev_get_by_index(dev_net(dev), ifindex);
+ if (!upper_dev)
+ return -EINVAL;
+ ops = upper_dev->netdev_ops;
+ if (ops->ndo_add_slave) {
+ netdev_unlock_ops(dev);
+ err = ops->ndo_add_slave(upper_dev, dev, extack);
+ netdev_lock_ops(dev);
+ if (err)
+ return err;
+ } else {
+ return -EOPNOTSUPP;
+ }
+ }
+ return 0;
+}
+
+static const struct nla_policy ifla_proto_down_reason_policy[IFLA_PROTO_DOWN_REASON_VALUE + 1] = {
+ [IFLA_PROTO_DOWN_REASON_MASK] = { .type = NLA_U32 },
+ [IFLA_PROTO_DOWN_REASON_VALUE] = { .type = NLA_U32 },
};
-static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+static int do_set_proto_down(struct net_device *dev,
+ struct nlattr *nl_proto_down,
+ struct nlattr *nl_proto_down_reason,
+ struct netlink_ext_ack *extack)
{
- struct ifinfomsg *ifm;
- struct net_device *dev;
- int err, send_addr_notify = 0, modified = 0;
- struct nlattr *tb[IFLA_MAX+1];
+ struct nlattr *pdreason[IFLA_PROTO_DOWN_REASON_MAX + 1];
+ unsigned long mask = 0;
+ u32 value;
+ bool proto_down;
+ int err;
+
+ if (!dev->change_proto_down) {
+ NL_SET_ERR_MSG(extack, "Protodown not supported by device");
+ return -EOPNOTSUPP;
+ }
+
+ if (nl_proto_down_reason) {
+ err = nla_parse_nested_deprecated(pdreason,
+ IFLA_PROTO_DOWN_REASON_MAX,
+ nl_proto_down_reason,
+ ifla_proto_down_reason_policy,
+ NULL);
+ if (err < 0)
+ return err;
+
+ if (!pdreason[IFLA_PROTO_DOWN_REASON_VALUE]) {
+ NL_SET_ERR_MSG(extack, "Invalid protodown reason value");
+ return -EINVAL;
+ }
+
+ value = nla_get_u32(pdreason[IFLA_PROTO_DOWN_REASON_VALUE]);
+
+ if (pdreason[IFLA_PROTO_DOWN_REASON_MASK])
+ mask = nla_get_u32(pdreason[IFLA_PROTO_DOWN_REASON_MASK]);
+
+ netdev_change_proto_down_reason_locked(dev, mask, value);
+ }
+
+ if (nl_proto_down) {
+ proto_down = nla_get_u8(nl_proto_down);
+
+ /* Don't turn off protodown if there are active reasons */
+ if (!proto_down && dev->proto_down_reason) {
+ NL_SET_ERR_MSG(extack, "Cannot clear protodown, active reasons");
+ return -EBUSY;
+ }
+ err = netif_change_proto_down(dev, proto_down);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+#define DO_SETLINK_MODIFIED 0x01
+/* notify flag means notify + modified. */
+#define DO_SETLINK_NOTIFY 0x03
+static int do_setlink(const struct sk_buff *skb, struct net_device *dev,
+ struct net *tgt_net, struct ifinfomsg *ifm,
+ struct netlink_ext_ack *extack,
+ struct nlattr **tb, int status)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
char ifname[IFNAMSIZ];
+ int err;
- err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy);
+ err = validate_linkmsg(dev, tb, extack);
if (err < 0)
- goto errout;
+ return err;
if (tb[IFLA_IFNAME])
- nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
+ nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
else
ifname[0] = '\0';
- err = -EINVAL;
- ifm = nlmsg_data(nlh);
- if (ifm->ifi_index >= 0)
- dev = dev_get_by_index(ifm->ifi_index);
- else if (tb[IFLA_IFNAME])
- dev = dev_get_by_name(ifname);
- else
- goto errout;
+ if (!net_eq(tgt_net, dev_net(dev))) {
+ const char *pat = ifname[0] ? ifname : NULL;
+ int new_ifindex;
- if (dev == NULL) {
- err = -ENODEV;
- goto errout;
- }
+ new_ifindex = nla_get_s32_default(tb[IFLA_NEW_IFINDEX], 0);
- if (tb[IFLA_ADDRESS] &&
- nla_len(tb[IFLA_ADDRESS]) < dev->addr_len)
- goto errout_dev;
+ err = __dev_change_net_namespace(dev, tgt_net, pat,
+ new_ifindex, extack);
+ if (err)
+ return err;
- if (tb[IFLA_BROADCAST] &&
- nla_len(tb[IFLA_BROADCAST]) < dev->addr_len)
- goto errout_dev;
+ status |= DO_SETLINK_MODIFIED;
+ }
+
+ netdev_lock_ops(dev);
if (tb[IFLA_MAP]) {
struct rtnl_link_ifmap *u_map;
struct ifmap k_map;
- if (!dev->set_config) {
+ if (!ops->ndo_set_config) {
err = -EOPNOTSUPP;
- goto errout_dev;
+ goto errout;
}
if (!netif_device_present(dev)) {
err = -ENODEV;
- goto errout_dev;
+ goto errout;
}
u_map = nla_data(tb[IFLA_MAP]);
@@ -479,49 +3089,44 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
k_map.dma = (unsigned char) u_map->dma;
k_map.port = (unsigned char) u_map->port;
- err = dev->set_config(dev, &k_map);
+ err = ops->ndo_set_config(dev, &k_map);
if (err < 0)
- goto errout_dev;
+ goto errout;
- modified = 1;
+ status |= DO_SETLINK_NOTIFY;
}
if (tb[IFLA_ADDRESS]) {
- struct sockaddr *sa;
- int len;
+ struct sockaddr_storage ss = { };
- if (!dev->set_mac_address) {
- err = -EOPNOTSUPP;
- goto errout_dev;
- }
+ netdev_unlock_ops(dev);
- if (!netif_device_present(dev)) {
- err = -ENODEV;
- goto errout_dev;
- }
+ /* dev_addr_sem is an outer lock, enforce proper ordering */
+ down_write(&dev_addr_sem);
+ netdev_lock_ops(dev);
- len = sizeof(sa_family_t) + dev->addr_len;
- sa = kmalloc(len, GFP_KERNEL);
- if (!sa) {
- err = -ENOMEM;
- goto errout_dev;
+ ss.ss_family = dev->type;
+ memcpy(ss.__data, nla_data(tb[IFLA_ADDRESS]), dev->addr_len);
+ err = netif_set_mac_address(dev, &ss, extack);
+ if (err) {
+ up_write(&dev_addr_sem);
+ goto errout;
}
- sa->sa_family = dev->type;
- memcpy(sa->sa_data, nla_data(tb[IFLA_ADDRESS]),
- dev->addr_len);
- err = dev->set_mac_address(dev, sa);
- kfree(sa);
- if (err)
- goto errout_dev;
- send_addr_notify = 1;
- modified = 1;
+ status |= DO_SETLINK_MODIFIED;
+
+ up_write(&dev_addr_sem);
}
if (tb[IFLA_MTU]) {
- err = dev_set_mtu(dev, nla_get_u32(tb[IFLA_MTU]));
+ err = netif_set_mtu_ext(dev, nla_get_u32(tb[IFLA_MTU]), extack);
if (err < 0)
- goto errout_dev;
- modified = 1;
+ goto errout;
+ status |= DO_SETLINK_MODIFIED;
+ }
+
+ if (tb[IFLA_GROUP]) {
+ netif_set_group(dev, nla_get_u32(tb[IFLA_GROUP]));
+ status |= DO_SETLINK_NOTIFY;
}
/*
@@ -529,322 +3134,3891 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
* name provided implies that a name change has been
* requested.
*/
- if (ifm->ifi_index >= 0 && ifname[0]) {
- err = dev_change_name(dev, ifname);
+ if (ifm->ifi_index > 0 && ifname[0]) {
+ err = netif_change_name(dev, ifname);
if (err < 0)
- goto errout_dev;
- modified = 1;
+ goto errout;
+ status |= DO_SETLINK_MODIFIED;
}
-#ifdef CONFIG_NET_WIRELESS_RTNETLINK
- if (tb[IFLA_WIRELESS]) {
- /* Call Wireless Extensions.
- * Various stuff checked in there... */
- err = wireless_rtnetlink_set(dev, nla_data(tb[IFLA_WIRELESS]),
- nla_len(tb[IFLA_WIRELESS]));
+ if (tb[IFLA_IFALIAS]) {
+ err = netif_set_alias(dev, nla_data(tb[IFLA_IFALIAS]),
+ nla_len(tb[IFLA_IFALIAS]));
if (err < 0)
- goto errout_dev;
+ goto errout;
+ status |= DO_SETLINK_NOTIFY;
}
-#endif /* CONFIG_NET_WIRELESS_RTNETLINK */
if (tb[IFLA_BROADCAST]) {
nla_memcpy(dev->broadcast, tb[IFLA_BROADCAST], dev->addr_len);
- send_addr_notify = 1;
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+ }
+
+ if (ifm->ifi_flags || ifm->ifi_change) {
+ err = netif_change_flags(dev, rtnl_dev_combine_flags(dev, ifm),
+ extack);
+ if (err < 0)
+ goto errout;
}
+ if (tb[IFLA_MASTER]) {
+ err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), extack);
+ if (err)
+ goto errout;
+ status |= DO_SETLINK_MODIFIED;
+ }
- if (ifm->ifi_flags)
- dev_change_flags(dev, ifm->ifi_flags);
+ if (tb[IFLA_CARRIER]) {
+ err = netif_change_carrier(dev, nla_get_u8(tb[IFLA_CARRIER]));
+ if (err)
+ goto errout;
+ status |= DO_SETLINK_MODIFIED;
+ }
- if (tb[IFLA_TXQLEN])
- dev->tx_queue_len = nla_get_u32(tb[IFLA_TXQLEN]);
+ if (tb[IFLA_TXQLEN]) {
+ unsigned int value = nla_get_u32(tb[IFLA_TXQLEN]);
- if (tb[IFLA_WEIGHT])
- dev->weight = nla_get_u32(tb[IFLA_WEIGHT]);
+ err = netif_change_tx_queue_len(dev, value);
+ if (err)
+ goto errout;
+ status |= DO_SETLINK_MODIFIED;
+ }
+
+ if (tb[IFLA_GSO_MAX_SIZE]) {
+ u32 max_size = nla_get_u32(tb[IFLA_GSO_MAX_SIZE]);
+
+ if (dev->gso_max_size ^ max_size) {
+ netif_set_gso_max_size(dev, max_size);
+ status |= DO_SETLINK_MODIFIED;
+ }
+ }
+
+ if (tb[IFLA_GSO_MAX_SEGS]) {
+ u32 max_segs = nla_get_u32(tb[IFLA_GSO_MAX_SEGS]);
+
+ if (dev->gso_max_segs ^ max_segs) {
+ netif_set_gso_max_segs(dev, max_segs);
+ status |= DO_SETLINK_MODIFIED;
+ }
+ }
+
+ if (tb[IFLA_GRO_MAX_SIZE]) {
+ u32 gro_max_size = nla_get_u32(tb[IFLA_GRO_MAX_SIZE]);
+
+ if (dev->gro_max_size ^ gro_max_size) {
+ netif_set_gro_max_size(dev, gro_max_size);
+ status |= DO_SETLINK_MODIFIED;
+ }
+ }
+
+ if (tb[IFLA_GSO_IPV4_MAX_SIZE]) {
+ u32 max_size = nla_get_u32(tb[IFLA_GSO_IPV4_MAX_SIZE]);
+
+ if (dev->gso_ipv4_max_size ^ max_size) {
+ netif_set_gso_ipv4_max_size(dev, max_size);
+ status |= DO_SETLINK_MODIFIED;
+ }
+ }
+
+ if (tb[IFLA_GRO_IPV4_MAX_SIZE]) {
+ u32 gro_max_size = nla_get_u32(tb[IFLA_GRO_IPV4_MAX_SIZE]);
+
+ if (dev->gro_ipv4_max_size ^ gro_max_size) {
+ netif_set_gro_ipv4_max_size(dev, gro_max_size);
+ status |= DO_SETLINK_MODIFIED;
+ }
+ }
if (tb[IFLA_OPERSTATE])
set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE]));
if (tb[IFLA_LINKMODE]) {
- write_lock_bh(&dev_base_lock);
- dev->link_mode = nla_get_u8(tb[IFLA_LINKMODE]);
- write_unlock_bh(&dev_base_lock);
+ unsigned char value = nla_get_u8(tb[IFLA_LINKMODE]);
+
+ if (dev->link_mode ^ value)
+ status |= DO_SETLINK_NOTIFY;
+ WRITE_ONCE(dev->link_mode, value);
+ }
+
+ if (tb[IFLA_VFINFO_LIST]) {
+ struct nlattr *vfinfo[IFLA_VF_MAX + 1];
+ struct nlattr *attr;
+ int rem;
+
+ nla_for_each_nested(attr, tb[IFLA_VFINFO_LIST], rem) {
+ if (nla_type(attr) != IFLA_VF_INFO ||
+ nla_len(attr) < NLA_HDRLEN) {
+ err = -EINVAL;
+ goto errout;
+ }
+ err = nla_parse_nested_deprecated(vfinfo, IFLA_VF_MAX,
+ attr,
+ ifla_vf_policy,
+ NULL);
+ if (err < 0)
+ goto errout;
+ err = do_setvfinfo(dev, vfinfo);
+ if (err < 0)
+ goto errout;
+ status |= DO_SETLINK_NOTIFY;
+ }
}
+ err = 0;
+
+ if (tb[IFLA_VF_PORTS]) {
+ struct nlattr *port[IFLA_PORT_MAX+1];
+ struct nlattr *attr;
+ int vf;
+ int rem;
+
+ err = -EOPNOTSUPP;
+ if (!ops->ndo_set_vf_port)
+ goto errout;
+ nla_for_each_nested(attr, tb[IFLA_VF_PORTS], rem) {
+ if (nla_type(attr) != IFLA_VF_PORT ||
+ nla_len(attr) < NLA_HDRLEN) {
+ err = -EINVAL;
+ goto errout;
+ }
+ err = nla_parse_nested_deprecated(port, IFLA_PORT_MAX,
+ attr,
+ ifla_port_policy,
+ NULL);
+ if (err < 0)
+ goto errout;
+ if (!port[IFLA_PORT_VF]) {
+ err = -EOPNOTSUPP;
+ goto errout;
+ }
+ vf = nla_get_u32(port[IFLA_PORT_VF]);
+ err = ops->ndo_set_vf_port(dev, vf, port);
+ if (err < 0)
+ goto errout;
+ status |= DO_SETLINK_NOTIFY;
+ }
+ }
err = 0;
-errout_dev:
- if (err < 0 && modified && net_ratelimit())
- printk(KERN_WARNING "A link change request failed with "
- "some changes comitted already. Interface %s may "
- "have been left with an inconsistent configuration, "
- "please check.\n", dev->name);
+ if (tb[IFLA_PORT_SELF]) {
+ struct nlattr *port[IFLA_PORT_MAX+1];
- if (send_addr_notify)
- call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+ err = nla_parse_nested_deprecated(port, IFLA_PORT_MAX,
+ tb[IFLA_PORT_SELF],
+ ifla_port_policy, NULL);
+ if (err < 0)
+ goto errout;
+
+ err = -EOPNOTSUPP;
+ if (ops->ndo_set_vf_port)
+ err = ops->ndo_set_vf_port(dev, PORT_SELF_VF, port);
+ if (err < 0)
+ goto errout;
+ status |= DO_SETLINK_NOTIFY;
+ }
+
+ if (tb[IFLA_AF_SPEC]) {
+ struct nlattr *af;
+ int rem;
+
+ nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) {
+ struct rtnl_af_ops *af_ops;
+ int af_ops_srcu_index;
+
+ af_ops = rtnl_af_lookup(nla_type(af), &af_ops_srcu_index);
+ if (!af_ops) {
+ err = -EAFNOSUPPORT;
+ goto errout;
+ }
+
+ err = af_ops->set_link_af(dev, af, extack);
+ rtnl_af_put(af_ops, af_ops_srcu_index);
+
+ if (err < 0)
+ goto errout;
+
+ status |= DO_SETLINK_NOTIFY;
+ }
+ }
+ err = 0;
+
+ if (tb[IFLA_PROTO_DOWN] || tb[IFLA_PROTO_DOWN_REASON]) {
+ err = do_set_proto_down(dev, tb[IFLA_PROTO_DOWN],
+ tb[IFLA_PROTO_DOWN_REASON], extack);
+ if (err)
+ goto errout;
+ status |= DO_SETLINK_NOTIFY;
+ }
+
+ if (tb[IFLA_XDP]) {
+ struct nlattr *xdp[IFLA_XDP_MAX + 1];
+ u32 xdp_flags = 0;
+
+ err = nla_parse_nested_deprecated(xdp, IFLA_XDP_MAX,
+ tb[IFLA_XDP],
+ ifla_xdp_policy, NULL);
+ if (err < 0)
+ goto errout;
+
+ if (xdp[IFLA_XDP_ATTACHED] || xdp[IFLA_XDP_PROG_ID]) {
+ err = -EINVAL;
+ goto errout;
+ }
+
+ if (xdp[IFLA_XDP_FLAGS]) {
+ xdp_flags = nla_get_u32(xdp[IFLA_XDP_FLAGS]);
+ if (xdp_flags & ~XDP_FLAGS_MASK) {
+ err = -EINVAL;
+ goto errout;
+ }
+ if (hweight32(xdp_flags & XDP_FLAGS_MODES) > 1) {
+ err = -EINVAL;
+ goto errout;
+ }
+ }
+
+ if (xdp[IFLA_XDP_FD]) {
+ int expected_fd = -1;
+
+ if (xdp_flags & XDP_FLAGS_REPLACE) {
+ if (!xdp[IFLA_XDP_EXPECTED_FD]) {
+ err = -EINVAL;
+ goto errout;
+ }
+ expected_fd =
+ nla_get_s32(xdp[IFLA_XDP_EXPECTED_FD]);
+ }
+
+ err = dev_change_xdp_fd(dev, extack,
+ nla_get_s32(xdp[IFLA_XDP_FD]),
+ expected_fd,
+ xdp_flags);
+ if (err)
+ goto errout;
+ status |= DO_SETLINK_NOTIFY;
+ }
+ }
- dev_put(dev);
errout:
+ if (status & DO_SETLINK_MODIFIED) {
+ if ((status & DO_SETLINK_NOTIFY) == DO_SETLINK_NOTIFY)
+ netif_state_change(dev);
+
+ if (err < 0)
+ net_warn_ratelimited("A link change request failed with some changes committed already. Interface %s may have been left with an inconsistent configuration, please check.\n",
+ dev->name);
+ }
+
+ netdev_unlock_ops(dev);
+
return err;
}
-static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
+static struct net_device *rtnl_dev_get(struct net *net,
+ struct nlattr *tb[])
{
- struct ifinfomsg *ifm;
+ char ifname[ALTIFNAMSIZ];
+
+ if (tb[IFLA_IFNAME])
+ nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
+ else if (tb[IFLA_ALT_IFNAME])
+ nla_strscpy(ifname, tb[IFLA_ALT_IFNAME], ALTIFNAMSIZ);
+ else
+ return NULL;
+
+ return __dev_get_by_name(net, ifname);
+}
+
+static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct ifinfomsg *ifm = nlmsg_data(nlh);
+ struct net *net = sock_net(skb->sk);
struct nlattr *tb[IFLA_MAX+1];
struct net_device *dev = NULL;
- struct sk_buff *nskb;
- char *iw_buf = NULL, *iw = NULL;
- int iw_buf_len = 0;
+ struct rtnl_nets rtnl_nets;
+ struct net *tgt_net;
+ int err;
+
+ err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX,
+ ifla_policy, extack);
+ if (err < 0)
+ goto errout;
+
+ err = rtnl_ensure_unique_netns(tb, extack, false);
+ if (err < 0)
+ goto errout;
+
+ tgt_net = rtnl_link_get_net_capable(skb, net, tb, CAP_NET_ADMIN);
+ if (IS_ERR(tgt_net)) {
+ err = PTR_ERR(tgt_net);
+ goto errout;
+ }
+
+ rtnl_nets_init(&rtnl_nets);
+ rtnl_nets_add(&rtnl_nets, get_net(net));
+ rtnl_nets_add(&rtnl_nets, tgt_net);
+
+ rtnl_nets_lock(&rtnl_nets);
+
+ if (ifm->ifi_index > 0)
+ dev = __dev_get_by_index(net, ifm->ifi_index);
+ else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME])
+ dev = rtnl_dev_get(net, tb);
+ else
+ err = -EINVAL;
+
+ if (dev)
+ err = do_setlink(skb, dev, tgt_net, ifm, extack, tb, 0);
+ else if (!err)
+ err = -ENODEV;
+
+ rtnl_nets_unlock(&rtnl_nets);
+ rtnl_nets_destroy(&rtnl_nets);
+errout:
+ return err;
+}
+
+static int rtnl_group_dellink(const struct net *net, int group)
+{
+ struct net_device *dev, *aux;
+ LIST_HEAD(list_kill);
+ bool found = false;
+
+ if (!group)
+ return -EPERM;
+
+ for_each_netdev(net, dev) {
+ if (dev->group == group) {
+ const struct rtnl_link_ops *ops;
+
+ found = true;
+ ops = dev->rtnl_link_ops;
+ if (!ops || !ops->dellink)
+ return -EOPNOTSUPP;
+ }
+ }
+
+ if (!found)
+ return -ENODEV;
+
+ for_each_netdev_safe(net, dev, aux) {
+ if (dev->group == group) {
+ const struct rtnl_link_ops *ops;
+
+ ops = dev->rtnl_link_ops;
+ ops->dellink(dev, &list_kill);
+ }
+ }
+ unregister_netdevice_many(&list_kill);
+
+ return 0;
+}
+
+int rtnl_delete_link(struct net_device *dev, u32 portid, const struct nlmsghdr *nlh)
+{
+ const struct rtnl_link_ops *ops;
+ LIST_HEAD(list_kill);
+
+ ops = dev->rtnl_link_ops;
+ if (!ops || !ops->dellink)
+ return -EOPNOTSUPP;
+
+ ops->dellink(dev, &list_kill);
+ unregister_netdevice_many_notify(&list_kill, portid, nlh);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(rtnl_delete_link);
+
+static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct ifinfomsg *ifm = nlmsg_data(nlh);
+ struct net *net = sock_net(skb->sk);
+ u32 portid = NETLINK_CB(skb).portid;
+ struct nlattr *tb[IFLA_MAX+1];
+ struct net_device *dev = NULL;
+ struct net *tgt_net = net;
+ int netnsid = -1;
int err;
- err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy);
+ err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX,
+ ifla_policy, extack);
if (err < 0)
return err;
+ err = rtnl_ensure_unique_netns(tb, extack, true);
+ if (err < 0)
+ return err;
+
+ if (tb[IFLA_TARGET_NETNSID]) {
+ netnsid = nla_get_s32(tb[IFLA_TARGET_NETNSID]);
+ tgt_net = rtnl_get_net_ns_capable(NETLINK_CB(skb).sk, netnsid);
+ if (IS_ERR(tgt_net))
+ return PTR_ERR(tgt_net);
+ }
+
+ rtnl_net_lock(tgt_net);
+
+ if (ifm->ifi_index > 0)
+ dev = __dev_get_by_index(tgt_net, ifm->ifi_index);
+ else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME])
+ dev = rtnl_dev_get(tgt_net, tb);
+
+ if (dev)
+ err = rtnl_delete_link(dev, portid, nlh);
+ else if (ifm->ifi_index > 0 || tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME])
+ err = -ENODEV;
+ else if (tb[IFLA_GROUP])
+ err = rtnl_group_dellink(tgt_net, nla_get_u32(tb[IFLA_GROUP]));
+ else
+ err = -EINVAL;
+
+ rtnl_net_unlock(tgt_net);
+
+ if (netnsid >= 0)
+ put_net(tgt_net);
+
+ return err;
+}
+
+int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm,
+ u32 portid, const struct nlmsghdr *nlh)
+{
+ unsigned int old_flags, changed;
+ int err;
+
+ old_flags = dev->flags;
+ if (ifm && (ifm->ifi_flags || ifm->ifi_change)) {
+ err = __dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm),
+ NULL);
+ if (err < 0)
+ return err;
+ }
+
+ changed = old_flags ^ dev->flags;
+ if (dev->rtnl_link_initializing) {
+ dev->rtnl_link_initializing = false;
+ changed = ~0U;
+ }
+
+ __dev_notify_flags(dev, old_flags, changed, portid, nlh);
+ return 0;
+}
+EXPORT_SYMBOL(rtnl_configure_link);
+
+struct net_device *rtnl_create_link(struct net *net, const char *ifname,
+ unsigned char name_assign_type,
+ const struct rtnl_link_ops *ops,
+ struct nlattr *tb[],
+ struct netlink_ext_ack *extack)
+{
+ struct net_device *dev;
+ unsigned int num_tx_queues = 1;
+ unsigned int num_rx_queues = 1;
+ int err;
+
+ if (tb[IFLA_NUM_TX_QUEUES])
+ num_tx_queues = nla_get_u32(tb[IFLA_NUM_TX_QUEUES]);
+ else if (ops->get_num_tx_queues)
+ num_tx_queues = ops->get_num_tx_queues();
+
+ if (tb[IFLA_NUM_RX_QUEUES])
+ num_rx_queues = nla_get_u32(tb[IFLA_NUM_RX_QUEUES]);
+ else if (ops->get_num_rx_queues)
+ num_rx_queues = ops->get_num_rx_queues();
+
+ if (num_tx_queues < 1 || num_tx_queues > 4096) {
+ NL_SET_ERR_MSG(extack, "Invalid number of transmit queues");
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (num_rx_queues < 1 || num_rx_queues > 4096) {
+ NL_SET_ERR_MSG(extack, "Invalid number of receive queues");
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (ops->alloc) {
+ dev = ops->alloc(tb, ifname, name_assign_type,
+ num_tx_queues, num_rx_queues);
+ if (IS_ERR(dev))
+ return dev;
+ } else {
+ dev = alloc_netdev_mqs(ops->priv_size, ifname,
+ name_assign_type, ops->setup,
+ num_tx_queues, num_rx_queues);
+ }
+
+ if (!dev)
+ return ERR_PTR(-ENOMEM);
+
+ err = validate_linkmsg(dev, tb, extack);
+ if (err < 0) {
+ free_netdev(dev);
+ return ERR_PTR(err);
+ }
+
+ dev_net_set(dev, net);
+ dev->rtnl_link_ops = ops;
+ dev->rtnl_link_initializing = true;
+
+ if (tb[IFLA_MTU]) {
+ u32 mtu = nla_get_u32(tb[IFLA_MTU]);
+
+ err = dev_validate_mtu(dev, mtu, extack);
+ if (err) {
+ free_netdev(dev);
+ return ERR_PTR(err);
+ }
+ dev->mtu = mtu;
+ }
+ if (tb[IFLA_ADDRESS]) {
+ __dev_addr_set(dev, nla_data(tb[IFLA_ADDRESS]),
+ nla_len(tb[IFLA_ADDRESS]));
+ dev->addr_assign_type = NET_ADDR_SET;
+ }
+ if (tb[IFLA_BROADCAST])
+ memcpy(dev->broadcast, nla_data(tb[IFLA_BROADCAST]),
+ nla_len(tb[IFLA_BROADCAST]));
+ if (tb[IFLA_TXQLEN])
+ dev->tx_queue_len = nla_get_u32(tb[IFLA_TXQLEN]);
+ if (tb[IFLA_OPERSTATE])
+ set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE]));
+ if (tb[IFLA_LINKMODE])
+ dev->link_mode = nla_get_u8(tb[IFLA_LINKMODE]);
+ if (tb[IFLA_GROUP])
+ netif_set_group(dev, nla_get_u32(tb[IFLA_GROUP]));
+ if (tb[IFLA_GSO_MAX_SIZE])
+ netif_set_gso_max_size(dev, nla_get_u32(tb[IFLA_GSO_MAX_SIZE]));
+ if (tb[IFLA_GSO_MAX_SEGS])
+ netif_set_gso_max_segs(dev, nla_get_u32(tb[IFLA_GSO_MAX_SEGS]));
+ if (tb[IFLA_GRO_MAX_SIZE])
+ netif_set_gro_max_size(dev, nla_get_u32(tb[IFLA_GRO_MAX_SIZE]));
+ if (tb[IFLA_GSO_IPV4_MAX_SIZE])
+ netif_set_gso_ipv4_max_size(dev, nla_get_u32(tb[IFLA_GSO_IPV4_MAX_SIZE]));
+ if (tb[IFLA_GRO_IPV4_MAX_SIZE])
+ netif_set_gro_ipv4_max_size(dev, nla_get_u32(tb[IFLA_GRO_IPV4_MAX_SIZE]));
+
+ return dev;
+}
+EXPORT_SYMBOL(rtnl_create_link);
+
+struct rtnl_newlink_tbs {
+ struct nlattr *tb[IFLA_MAX + 1];
+ struct nlattr *linkinfo[IFLA_INFO_MAX + 1];
+ struct nlattr *attr[RTNL_MAX_TYPE + 1];
+ struct nlattr *slave_attr[RTNL_SLAVE_MAX_TYPE + 1];
+};
+
+static int rtnl_changelink(const struct sk_buff *skb, struct nlmsghdr *nlh,
+ const struct rtnl_link_ops *ops,
+ struct net_device *dev, struct net *tgt_net,
+ struct rtnl_newlink_tbs *tbs,
+ struct nlattr **data,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr ** const linkinfo = tbs->linkinfo;
+ struct nlattr ** const tb = tbs->tb;
+ int status = 0;
+ int err;
+
+ if (nlh->nlmsg_flags & NLM_F_EXCL)
+ return -EEXIST;
+
+ if (nlh->nlmsg_flags & NLM_F_REPLACE)
+ return -EOPNOTSUPP;
+
+ if (linkinfo[IFLA_INFO_DATA]) {
+ if (!ops || ops != dev->rtnl_link_ops || !ops->changelink)
+ return -EOPNOTSUPP;
+
+ err = ops->changelink(dev, tb, data, extack);
+ if (err < 0)
+ return err;
+
+ status |= DO_SETLINK_NOTIFY;
+ }
+
+ if (linkinfo[IFLA_INFO_SLAVE_DATA]) {
+ const struct rtnl_link_ops *m_ops = NULL;
+ struct nlattr **slave_data = NULL;
+ struct net_device *master_dev;
+
+ master_dev = netdev_master_upper_dev_get(dev);
+ if (master_dev)
+ m_ops = master_dev->rtnl_link_ops;
+
+ if (!m_ops || !m_ops->slave_changelink)
+ return -EOPNOTSUPP;
+
+ if (m_ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE)
+ return -EINVAL;
+
+ if (m_ops->slave_maxtype) {
+ err = nla_parse_nested_deprecated(tbs->slave_attr,
+ m_ops->slave_maxtype,
+ linkinfo[IFLA_INFO_SLAVE_DATA],
+ m_ops->slave_policy, extack);
+ if (err < 0)
+ return err;
+
+ slave_data = tbs->slave_attr;
+ }
+
+ err = m_ops->slave_changelink(master_dev, dev, tb, slave_data, extack);
+ if (err < 0)
+ return err;
+
+ status |= DO_SETLINK_NOTIFY;
+ }
+
+ return do_setlink(skb, dev, tgt_net, nlmsg_data(nlh), extack, tb, status);
+}
+
+static int rtnl_group_changelink(const struct sk_buff *skb,
+ struct net *net, struct net *tgt_net,
+ int group, struct ifinfomsg *ifm,
+ struct netlink_ext_ack *extack,
+ struct nlattr **tb)
+{
+ struct net_device *dev, *aux;
+ int err;
+
+ for_each_netdev_safe(net, dev, aux) {
+ if (dev->group == group) {
+ err = do_setlink(skb, dev, tgt_net, ifm, extack, tb, 0);
+ if (err < 0)
+ return err;
+ }
+ }
+
+ return 0;
+}
+
+static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm,
+ const struct rtnl_link_ops *ops,
+ struct net *tgt_net, struct net *link_net,
+ struct net *peer_net,
+ const struct nlmsghdr *nlh,
+ struct nlattr **tb, struct nlattr **data,
+ struct netlink_ext_ack *extack)
+{
+ unsigned char name_assign_type = NET_NAME_USER;
+ struct rtnl_newlink_params params = {
+ .src_net = sock_net(skb->sk),
+ .link_net = link_net,
+ .peer_net = peer_net,
+ .tb = tb,
+ .data = data,
+ };
+ u32 portid = NETLINK_CB(skb).portid;
+ struct net_device *dev;
+ char ifname[IFNAMSIZ];
+ int err;
+
+ if (!ops->alloc && !ops->setup)
+ return -EOPNOTSUPP;
+
+ if (tb[IFLA_IFNAME]) {
+ nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
+ } else {
+ snprintf(ifname, IFNAMSIZ, "%s%%d", ops->kind);
+ name_assign_type = NET_NAME_ENUM;
+ }
+
+ dev = rtnl_create_link(tgt_net, ifname, name_assign_type, ops, tb,
+ extack);
+ if (IS_ERR(dev)) {
+ err = PTR_ERR(dev);
+ goto out;
+ }
+
+ dev->ifindex = ifm->ifi_index;
+
+ if (ops->newlink)
+ err = ops->newlink(dev, &params, extack);
+ else
+ err = register_netdevice(dev);
+ if (err < 0) {
+ free_netdev(dev);
+ goto out;
+ }
+
+ netdev_lock_ops(dev);
+
+ err = rtnl_configure_link(dev, ifm, portid, nlh);
+ if (err < 0)
+ goto out_unregister;
+ if (tb[IFLA_MASTER]) {
+ err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), extack);
+ if (err)
+ goto out_unregister;
+ }
+
+ netdev_unlock_ops(dev);
+out:
+ return err;
+out_unregister:
+ netdev_unlock_ops(dev);
+ if (ops->newlink) {
+ LIST_HEAD(list_kill);
+
+ ops->dellink(dev, &list_kill);
+ unregister_netdevice_many(&list_kill);
+ } else {
+ unregister_netdevice(dev);
+ }
+ goto out;
+}
+
+static struct net *rtnl_get_peer_net(const struct rtnl_link_ops *ops,
+ struct nlattr *tbp[],
+ struct nlattr *data[],
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[IFLA_MAX + 1];
+ int err;
+
+ if (!data || !data[ops->peer_type])
+ return rtnl_link_get_net_ifla(tbp);
+
+ err = rtnl_nla_parse_ifinfomsg(tb, data[ops->peer_type], extack);
+ if (err < 0)
+ return ERR_PTR(err);
+
+ if (ops->validate) {
+ err = ops->validate(tb, NULL, extack);
+ if (err < 0)
+ return ERR_PTR(err);
+ }
+
+ return rtnl_link_get_net_ifla(tb);
+}
+
+static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh,
+ const struct rtnl_link_ops *ops,
+ struct net *tgt_net, struct net *link_net,
+ struct net *peer_net,
+ struct rtnl_newlink_tbs *tbs,
+ struct nlattr **data,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr ** const tb = tbs->tb;
+ struct net *net = sock_net(skb->sk);
+ struct net *device_net;
+ struct net_device *dev;
+ struct ifinfomsg *ifm;
+ bool link_specified;
+
+ /* When creating, lookup for existing device in target net namespace */
+ device_net = (nlh->nlmsg_flags & NLM_F_CREATE) &&
+ (nlh->nlmsg_flags & NLM_F_EXCL) ?
+ tgt_net : net;
+
ifm = nlmsg_data(nlh);
- if (ifm->ifi_index >= 0) {
- dev = dev_get_by_index(ifm->ifi_index);
- if (dev == NULL)
+ if (ifm->ifi_index > 0) {
+ link_specified = true;
+ dev = __dev_get_by_index(device_net, ifm->ifi_index);
+ } else if (ifm->ifi_index < 0) {
+ NL_SET_ERR_MSG(extack, "ifindex can't be negative");
+ return -EINVAL;
+ } else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) {
+ link_specified = true;
+ dev = rtnl_dev_get(device_net, tb);
+ } else {
+ link_specified = false;
+ dev = NULL;
+ }
+
+ if (dev)
+ return rtnl_changelink(skb, nlh, ops, dev, tgt_net, tbs, data, extack);
+
+ if (!(nlh->nlmsg_flags & NLM_F_CREATE)) {
+ /* No dev found and NLM_F_CREATE not set. Requested dev does not exist,
+ * or it's for a group
+ */
+ if (link_specified || !tb[IFLA_GROUP])
return -ENODEV;
- } else
+
+ return rtnl_group_changelink(skb, net, tgt_net,
+ nla_get_u32(tb[IFLA_GROUP]),
+ ifm, extack, tb);
+ }
+
+ if (tb[IFLA_MAP] || tb[IFLA_PROTINFO])
+ return -EOPNOTSUPP;
+
+ if (!ops) {
+ NL_SET_ERR_MSG(extack, "Unknown device type");
+ return -EOPNOTSUPP;
+ }
+
+ return rtnl_newlink_create(skb, ifm, ops, tgt_net, link_net, peer_net, nlh,
+ tb, data, extack);
+}
+
+static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct net *tgt_net, *link_net = NULL, *peer_net = NULL;
+ struct nlattr **tb, **linkinfo, **data = NULL;
+ struct rtnl_link_ops *ops = NULL;
+ struct rtnl_newlink_tbs *tbs;
+ struct rtnl_nets rtnl_nets;
+ int ops_srcu_index;
+ int ret;
+
+ tbs = kmalloc(sizeof(*tbs), GFP_KERNEL);
+ if (!tbs)
+ return -ENOMEM;
+
+ tb = tbs->tb;
+ ret = nlmsg_parse_deprecated(nlh, sizeof(struct ifinfomsg), tb,
+ IFLA_MAX, ifla_policy, extack);
+ if (ret < 0)
+ goto free;
+
+ ret = rtnl_ensure_unique_netns(tb, extack, false);
+ if (ret < 0)
+ goto free;
+
+ linkinfo = tbs->linkinfo;
+ if (tb[IFLA_LINKINFO]) {
+ ret = nla_parse_nested_deprecated(linkinfo, IFLA_INFO_MAX,
+ tb[IFLA_LINKINFO],
+ ifla_info_policy, NULL);
+ if (ret < 0)
+ goto free;
+ } else {
+ memset(linkinfo, 0, sizeof(tbs->linkinfo));
+ }
+
+ if (linkinfo[IFLA_INFO_KIND]) {
+ char kind[MODULE_NAME_LEN];
+
+ nla_strscpy(kind, linkinfo[IFLA_INFO_KIND], sizeof(kind));
+ ops = rtnl_link_ops_get(kind, &ops_srcu_index);
+#ifdef CONFIG_MODULES
+ if (!ops) {
+ request_module("rtnl-link-%s", kind);
+ ops = rtnl_link_ops_get(kind, &ops_srcu_index);
+ }
+#endif
+ }
+
+ rtnl_nets_init(&rtnl_nets);
+
+ if (ops) {
+ if (ops->maxtype > RTNL_MAX_TYPE) {
+ ret = -EINVAL;
+ goto put_ops;
+ }
+
+ if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) {
+ ret = nla_parse_nested_deprecated(tbs->attr, ops->maxtype,
+ linkinfo[IFLA_INFO_DATA],
+ ops->policy, extack);
+ if (ret < 0)
+ goto put_ops;
+
+ data = tbs->attr;
+ }
+
+ if (ops->validate) {
+ ret = ops->validate(tb, data, extack);
+ if (ret < 0)
+ goto put_ops;
+ }
+
+ if (ops->peer_type) {
+ peer_net = rtnl_get_peer_net(ops, tb, data, extack);
+ if (IS_ERR(peer_net)) {
+ ret = PTR_ERR(peer_net);
+ goto put_ops;
+ }
+ if (peer_net)
+ rtnl_nets_add(&rtnl_nets, peer_net);
+ }
+ }
+
+ tgt_net = rtnl_link_get_net_capable(skb, sock_net(skb->sk), tb, CAP_NET_ADMIN);
+ if (IS_ERR(tgt_net)) {
+ ret = PTR_ERR(tgt_net);
+ goto put_net;
+ }
+
+ rtnl_nets_add(&rtnl_nets, tgt_net);
+
+ if (tb[IFLA_LINK_NETNSID]) {
+ int id = nla_get_s32(tb[IFLA_LINK_NETNSID]);
+
+ link_net = get_net_ns_by_id(tgt_net, id);
+ if (!link_net) {
+ NL_SET_ERR_MSG(extack, "Unknown network namespace id");
+ ret = -EINVAL;
+ goto put_net;
+ }
+
+ rtnl_nets_add(&rtnl_nets, link_net);
+
+ if (!netlink_ns_capable(skb, link_net->user_ns, CAP_NET_ADMIN)) {
+ ret = -EPERM;
+ goto put_net;
+ }
+ }
+
+ rtnl_nets_lock(&rtnl_nets);
+ ret = __rtnl_newlink(skb, nlh, ops, tgt_net, link_net, peer_net, tbs, data, extack);
+ rtnl_nets_unlock(&rtnl_nets);
+
+put_net:
+ rtnl_nets_destroy(&rtnl_nets);
+put_ops:
+ if (ops)
+ rtnl_link_ops_put(ops, ops_srcu_index);
+free:
+ kfree(tbs);
+ return ret;
+}
+
+static int rtnl_valid_getlink_req(struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ struct nlattr **tb,
+ struct netlink_ext_ack *extack)
+{
+ struct ifinfomsg *ifm;
+ int i, err;
+
+ ifm = nlmsg_payload(nlh, sizeof(*ifm));
+ if (!ifm) {
+ NL_SET_ERR_MSG(extack, "Invalid header for get link");
return -EINVAL;
+ }
+ if (!netlink_strict_get_check(skb))
+ return nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX,
+ ifla_policy, extack);
-#ifdef CONFIG_NET_WIRELESS_RTNETLINK
- if (tb[IFLA_WIRELESS]) {
- /* Call Wireless Extensions. We need to know the size before
- * we can alloc. Various stuff checked in there... */
- err = wireless_rtnetlink_get(dev, nla_data(tb[IFLA_WIRELESS]),
- nla_len(tb[IFLA_WIRELESS]),
- &iw_buf, &iw_buf_len);
- if (err < 0)
- goto errout;
+ if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags ||
+ ifm->ifi_change) {
+ NL_SET_ERR_MSG(extack, "Invalid values in header for get link request");
+ return -EINVAL;
+ }
+
+ err = nlmsg_parse_deprecated_strict(nlh, sizeof(*ifm), tb, IFLA_MAX,
+ ifla_policy, extack);
+ if (err)
+ return err;
- iw += IW_EV_POINT_OFF;
+ for (i = 0; i <= IFLA_MAX; i++) {
+ if (!tb[i])
+ continue;
+
+ switch (i) {
+ case IFLA_IFNAME:
+ case IFLA_ALT_IFNAME:
+ case IFLA_EXT_MASK:
+ case IFLA_TARGET_NETNSID:
+ break;
+ default:
+ NL_SET_ERR_MSG(extack, "Unsupported attribute in get link request");
+ return -EINVAL;
+ }
}
-#endif /* CONFIG_NET_WIRELESS_RTNETLINK */
- nskb = nlmsg_new(if_nlmsg_size(iw_buf_len), GFP_KERNEL);
- if (nskb == NULL) {
- err = -ENOBUFS;
- goto errout;
+ return 0;
+}
+
+static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(skb->sk);
+ struct net *tgt_net = net;
+ struct ifinfomsg *ifm;
+ struct nlattr *tb[IFLA_MAX+1];
+ struct net_device *dev = NULL;
+ struct sk_buff *nskb;
+ int netnsid = -1;
+ int err;
+ u32 ext_filter_mask = 0;
+
+ err = rtnl_valid_getlink_req(skb, nlh, tb, extack);
+ if (err < 0)
+ return err;
+
+ err = rtnl_ensure_unique_netns(tb, extack, true);
+ if (err < 0)
+ return err;
+
+ if (tb[IFLA_TARGET_NETNSID]) {
+ netnsid = nla_get_s32(tb[IFLA_TARGET_NETNSID]);
+ tgt_net = rtnl_get_net_ns_capable(NETLINK_CB(skb).sk, netnsid);
+ if (IS_ERR(tgt_net))
+ return PTR_ERR(tgt_net);
}
- err = rtnl_fill_ifinfo(nskb, dev, iw, iw_buf_len, RTM_NEWLINK,
- NETLINK_CB(skb).pid, nlh->nlmsg_seq, 0, 0);
- /* failure impilies BUG in if_nlmsg_size or wireless_rtnetlink_get */
- BUG_ON(err < 0);
+ if (tb[IFLA_EXT_MASK])
+ ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]);
- err = rtnl_unicast(nskb, NETLINK_CB(skb).pid);
-errout:
- kfree(iw_buf);
- dev_put(dev);
+ err = -EINVAL;
+ ifm = nlmsg_data(nlh);
+ if (ifm->ifi_index > 0)
+ dev = __dev_get_by_index(tgt_net, ifm->ifi_index);
+ else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME])
+ dev = rtnl_dev_get(tgt_net, tb);
+ else
+ goto out;
+
+ err = -ENODEV;
+ if (dev == NULL)
+ goto out;
+
+ err = -ENOBUFS;
+ nskb = nlmsg_new_large(if_nlmsg_size(dev, ext_filter_mask));
+ if (nskb == NULL)
+ goto out;
+
+ /* Synchronize the carrier state so we don't report a state
+ * that we're not actually going to honour immediately; if
+ * the driver just did a carrier off->on transition, we can
+ * only TX if link watch work has run, but without this we'd
+ * already report carrier on, even if it doesn't work yet.
+ */
+ linkwatch_sync_dev(dev);
+
+ err = rtnl_fill_ifinfo(nskb, dev, net,
+ RTM_NEWLINK, NETLINK_CB(skb).portid,
+ nlh->nlmsg_seq, 0, 0, ext_filter_mask,
+ 0, NULL, 0, netnsid, GFP_KERNEL);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in if_nlmsg_size */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(nskb);
+ } else
+ err = rtnl_unicast(nskb, net, NETLINK_CB(skb).portid);
+out:
+ if (netnsid >= 0)
+ put_net(tgt_net);
+
+ return err;
+}
+
+static int rtnl_alt_ifname(int cmd, struct net_device *dev, struct nlattr *attr,
+ bool *changed, struct netlink_ext_ack *extack)
+{
+ char *alt_ifname;
+ size_t size;
+ int err;
+
+ err = nla_validate(attr, attr->nla_len, IFLA_MAX, ifla_policy, extack);
+ if (err)
+ return err;
+ if (cmd == RTM_NEWLINKPROP) {
+ size = rtnl_prop_list_size(dev);
+ size += nla_total_size(ALTIFNAMSIZ);
+ if (size >= U16_MAX) {
+ NL_SET_ERR_MSG(extack,
+ "effective property list too long");
+ return -EINVAL;
+ }
+ }
+
+ alt_ifname = nla_strdup(attr, GFP_KERNEL_ACCOUNT);
+ if (!alt_ifname)
+ return -ENOMEM;
+
+ if (cmd == RTM_NEWLINKPROP) {
+ err = netdev_name_node_alt_create(dev, alt_ifname);
+ if (!err)
+ alt_ifname = NULL;
+ } else if (cmd == RTM_DELLINKPROP) {
+ err = netdev_name_node_alt_destroy(dev, alt_ifname);
+ } else {
+ WARN_ON_ONCE(1);
+ err = -EINVAL;
+ }
+
+ kfree(alt_ifname);
+ if (!err)
+ *changed = true;
return err;
}
+static int rtnl_linkprop(int cmd, struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(skb->sk);
+ struct nlattr *tb[IFLA_MAX + 1];
+ struct net_device *dev;
+ struct ifinfomsg *ifm;
+ bool changed = false;
+ struct nlattr *attr;
+ int err, rem;
+
+ err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack);
+ if (err)
+ return err;
+
+ err = rtnl_ensure_unique_netns(tb, extack, true);
+ if (err)
+ return err;
+
+ ifm = nlmsg_data(nlh);
+ if (ifm->ifi_index > 0)
+ dev = __dev_get_by_index(net, ifm->ifi_index);
+ else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME])
+ dev = rtnl_dev_get(net, tb);
+ else
+ return -EINVAL;
+
+ if (!dev)
+ return -ENODEV;
+
+ if (!tb[IFLA_PROP_LIST])
+ return 0;
+
+ nla_for_each_nested(attr, tb[IFLA_PROP_LIST], rem) {
+ switch (nla_type(attr)) {
+ case IFLA_ALT_IFNAME:
+ err = rtnl_alt_ifname(cmd, dev, attr, &changed, extack);
+ if (err)
+ return err;
+ break;
+ }
+ }
+
+ if (changed)
+ netdev_state_change(dev);
+ return 0;
+}
+
+static int rtnl_newlinkprop(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ return rtnl_linkprop(RTM_NEWLINKPROP, skb, nlh, extack);
+}
+
+static int rtnl_dellinkprop(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ return rtnl_linkprop(RTM_DELLINKPROP, skb, nlh, extack);
+}
+
+static noinline_for_stack u32 rtnl_calcit(struct sk_buff *skb,
+ struct nlmsghdr *nlh)
+{
+ struct net *net = sock_net(skb->sk);
+ size_t min_ifinfo_dump_size = 0;
+ u32 ext_filter_mask = 0;
+ struct net_device *dev;
+ struct nlattr *nla;
+ int hdrlen, rem;
+
+ /* Same kernel<->userspace interface hack as in rtnl_dump_ifinfo. */
+ hdrlen = nlmsg_len(nlh) < sizeof(struct ifinfomsg) ?
+ sizeof(struct rtgenmsg) : sizeof(struct ifinfomsg);
+
+ if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen))
+ return NLMSG_GOODSIZE;
+
+ nla_for_each_attr_type(nla, IFLA_EXT_MASK,
+ nlmsg_attrdata(nlh, hdrlen),
+ nlmsg_attrlen(nlh, hdrlen), rem) {
+ if (nla_len(nla) == sizeof(u32))
+ ext_filter_mask = nla_get_u32(nla);
+ }
+
+ if (!ext_filter_mask)
+ return NLMSG_GOODSIZE;
+ /*
+ * traverse the list of net devices and compute the minimum
+ * buffer size based upon the filter mask.
+ */
+ rcu_read_lock();
+ for_each_netdev_rcu(net, dev) {
+ min_ifinfo_dump_size = max(min_ifinfo_dump_size,
+ if_nlmsg_size(dev, ext_filter_mask));
+ }
+ rcu_read_unlock();
+
+ return nlmsg_total_size(min_ifinfo_dump_size);
+}
+
static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
{
int idx;
int s_idx = cb->family;
+ int type = cb->nlh->nlmsg_type - RTM_BASE;
+ int ret = 0;
if (s_idx == 0)
s_idx = 1;
- for (idx=1; idx<NPROTO; idx++) {
- int type = cb->nlh->nlmsg_type-RTM_BASE;
+
+ for (idx = 1; idx <= RTNL_FAMILY_MAX; idx++) {
+ struct rtnl_link __rcu **tab;
+ struct rtnl_link *link;
+ rtnl_dumpit_func dumpit;
+
if (idx < s_idx || idx == PF_PACKET)
continue;
- if (rtnetlink_links[idx] == NULL ||
- rtnetlink_links[idx][type].dumpit == NULL)
+
+ if (type < 0 || type >= RTM_NR_MSGTYPES)
continue;
- if (idx > s_idx)
+
+ tab = rcu_dereference_rtnl(rtnl_msg_handlers[idx]);
+ if (!tab)
+ continue;
+
+ link = rcu_dereference_rtnl(tab[type]);
+ if (!link)
+ continue;
+
+ dumpit = link->dumpit;
+ if (!dumpit)
+ continue;
+
+ if (idx > s_idx) {
memset(&cb->args[0], 0, sizeof(cb->args));
- if (rtnetlink_links[idx][type].dumpit(skb, cb))
+ cb->prev_seq = 0;
+ cb->seq = 0;
+ }
+ ret = dumpit(skb, cb);
+ if (ret)
break;
}
cb->family = idx;
- return skb->len;
+ return skb->len ? : ret;
}
-void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change)
+struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev,
+ unsigned int change,
+ u32 event, gfp_t flags, int *new_nsid,
+ int new_ifindex, u32 portid,
+ const struct nlmsghdr *nlh)
{
+ struct net *net = dev_net(dev);
struct sk_buff *skb;
int err = -ENOBUFS;
+ u32 seq = 0;
- skb = nlmsg_new(if_nlmsg_size(0), GFP_KERNEL);
+ skb = nlmsg_new(if_nlmsg_size(dev, 0), flags);
if (skb == NULL)
goto errout;
- err = rtnl_fill_ifinfo(skb, dev, NULL, 0, type, 0, 0, change, 0);
- /* failure implies BUG in if_nlmsg_size() */
- BUG_ON(err < 0);
+ if (nlmsg_report(nlh))
+ seq = nlmsg_seq(nlh);
+ else
+ portid = 0;
+
+ err = rtnl_fill_ifinfo(skb, dev, dev_net(dev),
+ type, portid, seq, change, 0, 0, event,
+ new_nsid, new_ifindex, -1, flags);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in if_nlmsg_size() */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto errout;
+ }
+ return skb;
+errout:
+ rtnl_set_sk_err(net, RTNLGRP_LINK, err);
+ return NULL;
+}
+
+void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags,
+ u32 portid, const struct nlmsghdr *nlh)
+{
+ struct net *net = dev_net(dev);
+
+ rtnl_notify(skb, net, portid, RTNLGRP_LINK, nlh, flags);
+}
+
+static void rtmsg_ifinfo_event(int type, struct net_device *dev,
+ unsigned int change, u32 event,
+ gfp_t flags, int *new_nsid, int new_ifindex,
+ u32 portid, const struct nlmsghdr *nlh)
+{
+ struct sk_buff *skb;
+
+ if (dev->reg_state != NETREG_REGISTERED)
+ return;
+
+ skb = rtmsg_ifinfo_build_skb(type, dev, change, event, flags, new_nsid,
+ new_ifindex, portid, nlh);
+ if (skb)
+ rtmsg_ifinfo_send(skb, dev, flags, portid, nlh);
+}
+
+void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change,
+ gfp_t flags, u32 portid, const struct nlmsghdr *nlh)
+{
+ rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags,
+ NULL, 0, portid, nlh);
+}
+
+void rtmsg_ifinfo_newnet(int type, struct net_device *dev, unsigned int change,
+ gfp_t flags, int *new_nsid, int new_ifindex)
+{
+ rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags,
+ new_nsid, new_ifindex, 0, NULL);
+}
+
+static int nlmsg_populate_fdb_fill(struct sk_buff *skb,
+ struct net_device *dev,
+ u8 *addr, u16 vid, u32 pid, u32 seq,
+ int type, unsigned int flags,
+ int nlflags, u16 ndm_state)
+{
+ struct nlmsghdr *nlh;
+ struct ndmsg *ndm;
+
+ nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), nlflags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ ndm = nlmsg_data(nlh);
+ ndm->ndm_family = AF_BRIDGE;
+ ndm->ndm_pad1 = 0;
+ ndm->ndm_pad2 = 0;
+ ndm->ndm_flags = flags;
+ ndm->ndm_type = 0;
+ ndm->ndm_ifindex = dev->ifindex;
+ ndm->ndm_state = ndm_state;
+
+ if (nla_put(skb, NDA_LLADDR, dev->addr_len, addr))
+ goto nla_put_failure;
+ if (vid)
+ if (nla_put(skb, NDA_VLAN, sizeof(u16), &vid))
+ goto nla_put_failure;
+
+ nlmsg_end(skb, nlh);
+ return 0;
+
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+static inline size_t rtnl_fdb_nlmsg_size(const struct net_device *dev)
+{
+ return NLMSG_ALIGN(sizeof(struct ndmsg)) +
+ nla_total_size(dev->addr_len) + /* NDA_LLADDR */
+ nla_total_size(sizeof(u16)) + /* NDA_VLAN */
+ 0;
+}
+
+static void rtnl_fdb_notify(struct net_device *dev, u8 *addr, u16 vid, int type,
+ u16 ndm_state)
+{
+ struct net *net = dev_net(dev);
+ struct sk_buff *skb;
+ int err = -ENOBUFS;
+
+ skb = nlmsg_new(rtnl_fdb_nlmsg_size(dev), GFP_ATOMIC);
+ if (!skb)
+ goto errout;
+
+ err = nlmsg_populate_fdb_fill(skb, dev, addr, vid,
+ 0, 0, type, NTF_SELF, 0, ndm_state);
+ if (err < 0) {
+ kfree_skb(skb);
+ goto errout;
+ }
- err = rtnl_notify(skb, 0, RTNLGRP_LINK, NULL, GFP_KERNEL);
+ rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
+ return;
errout:
+ rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
+}
+
+/*
+ * ndo_dflt_fdb_add - default netdevice operation to add an FDB entry
+ */
+int ndo_dflt_fdb_add(struct ndmsg *ndm,
+ struct nlattr *tb[],
+ struct net_device *dev,
+ const unsigned char *addr, u16 vid,
+ u16 flags)
+{
+ int err = -EINVAL;
+
+ /* If aging addresses are supported device will need to
+ * implement its own handler for this.
+ */
+ if (ndm->ndm_state && !(ndm->ndm_state & NUD_PERMANENT)) {
+ netdev_info(dev, "default FDB implementation only supports local addresses\n");
+ return err;
+ }
+
+ if (tb[NDA_FLAGS_EXT]) {
+ netdev_info(dev, "invalid flags given to default FDB implementation\n");
+ return err;
+ }
+
+ if (vid) {
+ netdev_info(dev, "vlans aren't supported yet for dev_uc|mc_add()\n");
+ return err;
+ }
+
+ if (is_unicast_ether_addr(addr) || is_link_local_ether_addr(addr))
+ err = dev_uc_add_excl(dev, addr);
+ else if (is_multicast_ether_addr(addr))
+ err = dev_mc_add_excl(dev, addr);
+
+ /* Only return duplicate errors if NLM_F_EXCL is set */
+ if (err == -EEXIST && !(flags & NLM_F_EXCL))
+ err = 0;
+
+ return err;
+}
+EXPORT_SYMBOL(ndo_dflt_fdb_add);
+
+static int fdb_vid_parse(struct nlattr *vlan_attr, u16 *p_vid,
+ struct netlink_ext_ack *extack)
+{
+ u16 vid = 0;
+
+ if (vlan_attr) {
+ if (nla_len(vlan_attr) != sizeof(u16)) {
+ NL_SET_ERR_MSG(extack, "invalid vlan attribute size");
+ return -EINVAL;
+ }
+
+ vid = nla_get_u16(vlan_attr);
+
+ if (!vid || vid >= VLAN_VID_MASK) {
+ NL_SET_ERR_MSG(extack, "invalid vlan id");
+ return -EINVAL;
+ }
+ }
+ *p_vid = vid;
+ return 0;
+}
+
+static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(skb->sk);
+ struct ndmsg *ndm;
+ struct nlattr *tb[NDA_MAX+1];
+ struct net_device *dev;
+ u8 *addr;
+ u16 vid;
+ int err;
+
+ err = nlmsg_parse_deprecated(nlh, sizeof(*ndm), tb, NDA_MAX, NULL,
+ extack);
+ if (err < 0)
+ return err;
+
+ ndm = nlmsg_data(nlh);
+ if (ndm->ndm_ifindex == 0) {
+ NL_SET_ERR_MSG(extack, "invalid ifindex");
+ return -EINVAL;
+ }
+
+ dev = __dev_get_by_index(net, ndm->ndm_ifindex);
+ if (dev == NULL) {
+ NL_SET_ERR_MSG(extack, "unknown ifindex");
+ return -ENODEV;
+ }
+
+ if (!tb[NDA_LLADDR] || nla_len(tb[NDA_LLADDR]) != ETH_ALEN) {
+ NL_SET_ERR_MSG(extack, "invalid address");
+ return -EINVAL;
+ }
+
+ if (dev->type != ARPHRD_ETHER) {
+ NL_SET_ERR_MSG(extack, "FDB add only supported for Ethernet devices");
+ return -EINVAL;
+ }
+
+ addr = nla_data(tb[NDA_LLADDR]);
+
+ err = fdb_vid_parse(tb[NDA_VLAN], &vid, extack);
+ if (err)
+ return err;
+
+ err = -EOPNOTSUPP;
+
+ /* Support fdb on master device the net/bridge default case */
+ if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) &&
+ netif_is_bridge_port(dev)) {
+ struct net_device *br_dev = netdev_master_upper_dev_get(dev);
+ const struct net_device_ops *ops = br_dev->netdev_ops;
+ bool notified = false;
+
+ err = ops->ndo_fdb_add(ndm, tb, dev, addr, vid,
+ nlh->nlmsg_flags, &notified, extack);
+ if (err)
+ goto out;
+ else
+ ndm->ndm_flags &= ~NTF_MASTER;
+ }
+
+ /* Embedded bridge, macvlan, and any other device support */
+ if ((ndm->ndm_flags & NTF_SELF)) {
+ bool notified = false;
+
+ if (dev->netdev_ops->ndo_fdb_add)
+ err = dev->netdev_ops->ndo_fdb_add(ndm, tb, dev, addr,
+ vid,
+ nlh->nlmsg_flags,
+ &notified, extack);
+ else
+ err = ndo_dflt_fdb_add(ndm, tb, dev, addr, vid,
+ nlh->nlmsg_flags);
+
+ if (!err && !notified) {
+ rtnl_fdb_notify(dev, addr, vid, RTM_NEWNEIGH,
+ ndm->ndm_state);
+ ndm->ndm_flags &= ~NTF_SELF;
+ }
+ }
+out:
+ return err;
+}
+
+/*
+ * ndo_dflt_fdb_del - default netdevice operation to delete an FDB entry
+ */
+int ndo_dflt_fdb_del(struct ndmsg *ndm,
+ struct nlattr *tb[],
+ struct net_device *dev,
+ const unsigned char *addr, u16 vid)
+{
+ int err = -EINVAL;
+
+ /* If aging addresses are supported device will need to
+ * implement its own handler for this.
+ */
+ if (!(ndm->ndm_state & NUD_PERMANENT)) {
+ netdev_info(dev, "default FDB implementation only supports local addresses\n");
+ return err;
+ }
+
+ if (is_unicast_ether_addr(addr) || is_link_local_ether_addr(addr))
+ err = dev_uc_del(dev, addr);
+ else if (is_multicast_ether_addr(addr))
+ err = dev_mc_del(dev, addr);
+
+ return err;
+}
+EXPORT_SYMBOL(ndo_dflt_fdb_del);
+
+static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ bool del_bulk = !!(nlh->nlmsg_flags & NLM_F_BULK);
+ struct net *net = sock_net(skb->sk);
+ const struct net_device_ops *ops;
+ struct ndmsg *ndm;
+ struct nlattr *tb[NDA_MAX+1];
+ struct net_device *dev;
+ __u8 *addr = NULL;
+ int err;
+ u16 vid;
+
+ if (!del_bulk) {
+ err = nlmsg_parse_deprecated(nlh, sizeof(*ndm), tb, NDA_MAX,
+ NULL, extack);
+ } else {
+ /* For bulk delete, the drivers will parse the message with
+ * policy.
+ */
+ err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL, extack);
+ }
if (err < 0)
- rtnl_set_sk_err(RTNLGRP_LINK, err);
+ return err;
+
+ ndm = nlmsg_data(nlh);
+ if (ndm->ndm_ifindex == 0) {
+ NL_SET_ERR_MSG(extack, "invalid ifindex");
+ return -EINVAL;
+ }
+
+ dev = __dev_get_by_index(net, ndm->ndm_ifindex);
+ if (dev == NULL) {
+ NL_SET_ERR_MSG(extack, "unknown ifindex");
+ return -ENODEV;
+ }
+
+ if (!del_bulk) {
+ if (!tb[NDA_LLADDR] || nla_len(tb[NDA_LLADDR]) != ETH_ALEN) {
+ NL_SET_ERR_MSG(extack, "invalid address");
+ return -EINVAL;
+ }
+ addr = nla_data(tb[NDA_LLADDR]);
+
+ err = fdb_vid_parse(tb[NDA_VLAN], &vid, extack);
+ if (err)
+ return err;
+ }
+
+ if (dev->type != ARPHRD_ETHER) {
+ NL_SET_ERR_MSG(extack, "FDB delete only supported for Ethernet devices");
+ return -EINVAL;
+ }
+
+ err = -EOPNOTSUPP;
+
+ /* Support fdb on master device the net/bridge default case */
+ if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) &&
+ netif_is_bridge_port(dev)) {
+ struct net_device *br_dev = netdev_master_upper_dev_get(dev);
+ bool notified = false;
+
+ ops = br_dev->netdev_ops;
+ if (!del_bulk) {
+ if (ops->ndo_fdb_del)
+ err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid,
+ &notified, extack);
+ } else {
+ if (ops->ndo_fdb_del_bulk)
+ err = ops->ndo_fdb_del_bulk(nlh, dev, extack);
+ }
+
+ if (err)
+ goto out;
+ else
+ ndm->ndm_flags &= ~NTF_MASTER;
+ }
+
+ /* Embedded bridge, macvlan, and any other device support */
+ if (ndm->ndm_flags & NTF_SELF) {
+ bool notified = false;
+
+ ops = dev->netdev_ops;
+ if (!del_bulk) {
+ if (ops->ndo_fdb_del)
+ err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid,
+ &notified, extack);
+ else
+ err = ndo_dflt_fdb_del(ndm, tb, dev, addr, vid);
+ } else {
+ /* in case err was cleared by NTF_MASTER call */
+ err = -EOPNOTSUPP;
+ if (ops->ndo_fdb_del_bulk)
+ err = ops->ndo_fdb_del_bulk(nlh, dev, extack);
+ }
+
+ if (!err) {
+ if (!del_bulk && !notified)
+ rtnl_fdb_notify(dev, addr, vid, RTM_DELNEIGH,
+ ndm->ndm_state);
+ ndm->ndm_flags &= ~NTF_SELF;
+ }
+ }
+out:
+ return err;
}
-/* Protected by RTNL sempahore. */
-static struct rtattr **rta_buf;
-static int rtattr_max;
+static int nlmsg_populate_fdb(struct sk_buff *skb,
+ struct netlink_callback *cb,
+ struct net_device *dev,
+ int *idx,
+ struct netdev_hw_addr_list *list)
+{
+ struct ndo_fdb_dump_context *ctx = (void *)cb->ctx;
+ struct netdev_hw_addr *ha;
+ u32 portid, seq;
+ int err;
-/* Process one rtnetlink message. */
+ portid = NETLINK_CB(cb->skb).portid;
+ seq = cb->nlh->nlmsg_seq;
-static __inline__ int
-rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, int *errp)
+ list_for_each_entry(ha, &list->list, list) {
+ if (*idx < ctx->fdb_idx)
+ goto skip;
+
+ err = nlmsg_populate_fdb_fill(skb, dev, ha->addr, 0,
+ portid, seq,
+ RTM_NEWNEIGH, NTF_SELF,
+ NLM_F_MULTI, NUD_PERMANENT);
+ if (err < 0)
+ return err;
+skip:
+ *idx += 1;
+ }
+ return 0;
+}
+
+/**
+ * ndo_dflt_fdb_dump - default netdevice operation to dump an FDB table.
+ * @skb: socket buffer to store message in
+ * @cb: netlink callback
+ * @dev: netdevice
+ * @filter_dev: ignored
+ * @idx: the number of FDB table entries dumped is added to *@idx
+ *
+ * Default netdevice operation to dump the existing unicast address list.
+ * Returns number of addresses from list put in skb.
+ */
+int ndo_dflt_fdb_dump(struct sk_buff *skb,
+ struct netlink_callback *cb,
+ struct net_device *dev,
+ struct net_device *filter_dev,
+ int *idx)
{
- struct rtnetlink_link *link;
- struct rtnetlink_link *link_tab;
- int sz_idx, kind;
- int min_len;
- int family;
- int type;
int err;
- /* Only requests are handled by kernel now */
- if (!(nlh->nlmsg_flags&NLM_F_REQUEST))
- return 0;
+ if (dev->type != ARPHRD_ETHER)
+ return -EINVAL;
- type = nlh->nlmsg_type;
+ netif_addr_lock_bh(dev);
+ err = nlmsg_populate_fdb(skb, cb, dev, idx, &dev->uc);
+ if (err)
+ goto out;
+ err = nlmsg_populate_fdb(skb, cb, dev, idx, &dev->mc);
+out:
+ netif_addr_unlock_bh(dev);
+ return err;
+}
+EXPORT_SYMBOL(ndo_dflt_fdb_dump);
- /* A control message: ignore them */
- if (type < RTM_BASE)
- return 0;
+static int valid_fdb_dump_strict(const struct nlmsghdr *nlh,
+ int *br_idx, int *brport_idx,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[NDA_MAX + 1];
+ struct ndmsg *ndm;
+ int err, i;
- /* Unknown message: reply with EINVAL */
- if (type > RTM_MAX)
- goto err_inval;
+ ndm = nlmsg_payload(nlh, sizeof(*ndm));
+ if (!ndm) {
+ NL_SET_ERR_MSG(extack, "Invalid header for fdb dump request");
+ return -EINVAL;
+ }
- type -= RTM_BASE;
+ if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state ||
+ ndm->ndm_flags || ndm->ndm_type) {
+ NL_SET_ERR_MSG(extack, "Invalid values in header for fdb dump request");
+ return -EINVAL;
+ }
- /* All the messages must have at least 1 byte length */
- if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(struct rtgenmsg)))
+ err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct ndmsg), tb,
+ NDA_MAX, NULL, extack);
+ if (err < 0)
+ return err;
+
+ *brport_idx = ndm->ndm_ifindex;
+ for (i = 0; i <= NDA_MAX; ++i) {
+ if (!tb[i])
+ continue;
+
+ switch (i) {
+ case NDA_IFINDEX:
+ if (nla_len(tb[i]) != sizeof(u32)) {
+ NL_SET_ERR_MSG(extack, "Invalid IFINDEX attribute in fdb dump request");
+ return -EINVAL;
+ }
+ *brport_idx = nla_get_u32(tb[NDA_IFINDEX]);
+ break;
+ case NDA_MASTER:
+ if (nla_len(tb[i]) != sizeof(u32)) {
+ NL_SET_ERR_MSG(extack, "Invalid MASTER attribute in fdb dump request");
+ return -EINVAL;
+ }
+ *br_idx = nla_get_u32(tb[NDA_MASTER]);
+ break;
+ default:
+ NL_SET_ERR_MSG(extack, "Unsupported attribute in fdb dump request");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static int valid_fdb_dump_legacy(const struct nlmsghdr *nlh,
+ int *br_idx, int *brport_idx,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[IFLA_MAX+1];
+ int err;
+
+ /* A hack to preserve kernel<->userspace interface.
+ * Before Linux v4.12 this code accepted ndmsg since iproute2 v3.3.0.
+ * However, ndmsg is shorter than ifinfomsg thus nlmsg_parse() bails.
+ * So, check for ndmsg with an optional u32 attribute (not used here).
+ * Fortunately these sizes don't conflict with the size of ifinfomsg
+ * with an optional attribute.
+ */
+ if (nlmsg_len(nlh) != sizeof(struct ndmsg) &&
+ (nlmsg_len(nlh) != sizeof(struct ndmsg) +
+ nla_attr_size(sizeof(u32)))) {
+ struct ifinfomsg *ifm;
+
+ err = nlmsg_parse_deprecated(nlh, sizeof(struct ifinfomsg),
+ tb, IFLA_MAX, ifla_policy,
+ extack);
+ if (err < 0) {
+ return -EINVAL;
+ } else if (err == 0) {
+ if (tb[IFLA_MASTER])
+ *br_idx = nla_get_u32(tb[IFLA_MASTER]);
+ }
+
+ ifm = nlmsg_data(nlh);
+ *brport_idx = ifm->ifi_index;
+ }
+ return 0;
+}
+
+static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ const struct net_device_ops *ops = NULL, *cops = NULL;
+ struct ndo_fdb_dump_context *ctx = (void *)cb->ctx;
+ struct net_device *dev, *br_dev = NULL;
+ struct net *net = sock_net(skb->sk);
+ int brport_idx = 0;
+ int br_idx = 0;
+ int fidx = 0;
+ int err;
+
+ NL_ASSERT_CTX_FITS(struct ndo_fdb_dump_context);
+
+ if (cb->strict_check)
+ err = valid_fdb_dump_strict(cb->nlh, &br_idx, &brport_idx,
+ cb->extack);
+ else
+ err = valid_fdb_dump_legacy(cb->nlh, &br_idx, &brport_idx,
+ cb->extack);
+ if (err < 0)
+ return err;
+
+ if (br_idx) {
+ br_dev = __dev_get_by_index(net, br_idx);
+ if (!br_dev)
+ return -ENODEV;
+
+ ops = br_dev->netdev_ops;
+ }
+
+ for_each_netdev_dump(net, dev, ctx->ifindex) {
+ if (brport_idx && (dev->ifindex != brport_idx))
+ continue;
+
+ if (!br_idx) { /* user did not specify a specific bridge */
+ if (netif_is_bridge_port(dev)) {
+ br_dev = netdev_master_upper_dev_get(dev);
+ cops = br_dev->netdev_ops;
+ }
+ } else {
+ if (dev != br_dev &&
+ !netif_is_bridge_port(dev))
+ continue;
+
+ if (br_dev != netdev_master_upper_dev_get(dev) &&
+ !netif_is_bridge_master(dev))
+ continue;
+ cops = ops;
+ }
+
+ if (netif_is_bridge_port(dev)) {
+ if (cops && cops->ndo_fdb_dump) {
+ err = cops->ndo_fdb_dump(skb, cb, br_dev, dev,
+ &fidx);
+ if (err == -EMSGSIZE)
+ break;
+ }
+ }
+
+ if (dev->netdev_ops->ndo_fdb_dump)
+ err = dev->netdev_ops->ndo_fdb_dump(skb, cb, dev, NULL,
+ &fidx);
+ else
+ err = ndo_dflt_fdb_dump(skb, cb, dev, NULL, &fidx);
+ if (err == -EMSGSIZE)
+ break;
+
+ cops = NULL;
+
+ /* reset fdb offset to 0 for rest of the interfaces */
+ ctx->fdb_idx = 0;
+ fidx = 0;
+ }
+
+ ctx->fdb_idx = fidx;
+
+ return skb->len;
+}
+
+static int valid_fdb_get_strict(const struct nlmsghdr *nlh,
+ struct nlattr **tb, u8 *ndm_flags,
+ int *br_idx, int *brport_idx, u8 **addr,
+ u16 *vid, struct netlink_ext_ack *extack)
+{
+ struct ndmsg *ndm;
+ int err, i;
+
+ ndm = nlmsg_payload(nlh, sizeof(*ndm));
+ if (!ndm) {
+ NL_SET_ERR_MSG(extack, "Invalid header for fdb get request");
+ return -EINVAL;
+ }
+
+ if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state ||
+ ndm->ndm_type) {
+ NL_SET_ERR_MSG(extack, "Invalid values in header for fdb get request");
+ return -EINVAL;
+ }
+
+ if (ndm->ndm_flags & ~(NTF_MASTER | NTF_SELF)) {
+ NL_SET_ERR_MSG(extack, "Invalid flags in header for fdb get request");
+ return -EINVAL;
+ }
+
+ err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct ndmsg), tb,
+ NDA_MAX, nda_policy, extack);
+ if (err < 0)
+ return err;
+
+ *ndm_flags = ndm->ndm_flags;
+ *brport_idx = ndm->ndm_ifindex;
+ for (i = 0; i <= NDA_MAX; ++i) {
+ if (!tb[i])
+ continue;
+
+ switch (i) {
+ case NDA_MASTER:
+ *br_idx = nla_get_u32(tb[i]);
+ break;
+ case NDA_LLADDR:
+ if (nla_len(tb[i]) != ETH_ALEN) {
+ NL_SET_ERR_MSG(extack, "Invalid address in fdb get request");
+ return -EINVAL;
+ }
+ *addr = nla_data(tb[i]);
+ break;
+ case NDA_VLAN:
+ err = fdb_vid_parse(tb[i], vid, extack);
+ if (err)
+ return err;
+ break;
+ case NDA_VNI:
+ break;
+ default:
+ NL_SET_ERR_MSG(extack, "Unsupported attribute in fdb get request");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static int rtnl_fdb_get(struct sk_buff *in_skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct net_device *dev = NULL, *br_dev = NULL;
+ const struct net_device_ops *ops = NULL;
+ struct net *net = sock_net(in_skb->sk);
+ struct nlattr *tb[NDA_MAX + 1];
+ struct sk_buff *skb;
+ int brport_idx = 0;
+ u8 ndm_flags = 0;
+ int br_idx = 0;
+ u8 *addr = NULL;
+ u16 vid = 0;
+ int err;
+
+ err = valid_fdb_get_strict(nlh, tb, &ndm_flags, &br_idx,
+ &brport_idx, &addr, &vid, extack);
+ if (err < 0)
+ return err;
+
+ if (!addr) {
+ NL_SET_ERR_MSG(extack, "Missing lookup address for fdb get request");
+ return -EINVAL;
+ }
+
+ if (brport_idx) {
+ dev = __dev_get_by_index(net, brport_idx);
+ if (!dev) {
+ NL_SET_ERR_MSG(extack, "Unknown device ifindex");
+ return -ENODEV;
+ }
+ }
+
+ if (br_idx) {
+ if (dev) {
+ NL_SET_ERR_MSG(extack, "Master and device are mutually exclusive");
+ return -EINVAL;
+ }
+
+ br_dev = __dev_get_by_index(net, br_idx);
+ if (!br_dev) {
+ NL_SET_ERR_MSG(extack, "Invalid master ifindex");
+ return -EINVAL;
+ }
+ ops = br_dev->netdev_ops;
+ }
+
+ if (dev) {
+ if (!ndm_flags || (ndm_flags & NTF_MASTER)) {
+ if (!netif_is_bridge_port(dev)) {
+ NL_SET_ERR_MSG(extack, "Device is not a bridge port");
+ return -EINVAL;
+ }
+ br_dev = netdev_master_upper_dev_get(dev);
+ if (!br_dev) {
+ NL_SET_ERR_MSG(extack, "Master of device not found");
+ return -EINVAL;
+ }
+ ops = br_dev->netdev_ops;
+ } else {
+ if (!(ndm_flags & NTF_SELF)) {
+ NL_SET_ERR_MSG(extack, "Missing NTF_SELF");
+ return -EINVAL;
+ }
+ ops = dev->netdev_ops;
+ }
+ }
+
+ if (!br_dev && !dev) {
+ NL_SET_ERR_MSG(extack, "No device specified");
+ return -ENODEV;
+ }
+
+ if (!ops || !ops->ndo_fdb_get) {
+ NL_SET_ERR_MSG(extack, "Fdb get operation not supported by device");
+ return -EOPNOTSUPP;
+ }
+
+ skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb)
+ return -ENOBUFS;
+
+ if (br_dev)
+ dev = br_dev;
+ err = ops->ndo_fdb_get(skb, tb, dev, addr, vid,
+ NETLINK_CB(in_skb).portid,
+ nlh->nlmsg_seq, extack);
+ if (err)
+ goto out;
+
+ return rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
+out:
+ kfree_skb(skb);
+ return err;
+}
+
+static int brport_nla_put_flag(struct sk_buff *skb, u32 flags, u32 mask,
+ unsigned int attrnum, unsigned int flag)
+{
+ if (mask & flag)
+ return nla_put_u8(skb, attrnum, !!(flags & flag));
+ return 0;
+}
+
+int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
+ struct net_device *dev, u16 mode,
+ u32 flags, u32 mask, int nlflags,
+ u32 filter_mask,
+ int (*vlan_fill)(struct sk_buff *skb,
+ struct net_device *dev,
+ u32 filter_mask))
+{
+ struct nlmsghdr *nlh;
+ struct ifinfomsg *ifm;
+ struct nlattr *br_afspec;
+ struct nlattr *protinfo;
+ u8 operstate = netif_running(dev) ? dev->operstate : IF_OPER_DOWN;
+ struct net_device *br_dev = netdev_master_upper_dev_get(dev);
+ int err = 0;
+
+ nlh = nlmsg_put(skb, pid, seq, RTM_NEWLINK, sizeof(*ifm), nlflags);
+ if (nlh == NULL)
+ return -EMSGSIZE;
+
+ ifm = nlmsg_data(nlh);
+ ifm->ifi_family = AF_BRIDGE;
+ ifm->__ifi_pad = 0;
+ ifm->ifi_type = dev->type;
+ ifm->ifi_index = dev->ifindex;
+ ifm->ifi_flags = netif_get_flags(dev);
+ ifm->ifi_change = 0;
+
+
+ if (nla_put_string(skb, IFLA_IFNAME, dev->name) ||
+ nla_put_u32(skb, IFLA_MTU, dev->mtu) ||
+ nla_put_u8(skb, IFLA_OPERSTATE, operstate) ||
+ (br_dev &&
+ nla_put_u32(skb, IFLA_MASTER, br_dev->ifindex)) ||
+ (dev->addr_len &&
+ nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr)) ||
+ (dev->ifindex != dev_get_iflink(dev) &&
+ nla_put_u32(skb, IFLA_LINK, dev_get_iflink(dev))))
+ goto nla_put_failure;
+
+ br_afspec = nla_nest_start_noflag(skb, IFLA_AF_SPEC);
+ if (!br_afspec)
+ goto nla_put_failure;
+
+ if (nla_put_u16(skb, IFLA_BRIDGE_FLAGS, BRIDGE_FLAGS_SELF)) {
+ nla_nest_cancel(skb, br_afspec);
+ goto nla_put_failure;
+ }
+
+ if (mode != BRIDGE_MODE_UNDEF) {
+ if (nla_put_u16(skb, IFLA_BRIDGE_MODE, mode)) {
+ nla_nest_cancel(skb, br_afspec);
+ goto nla_put_failure;
+ }
+ }
+ if (vlan_fill) {
+ err = vlan_fill(skb, dev, filter_mask);
+ if (err) {
+ nla_nest_cancel(skb, br_afspec);
+ goto nla_put_failure;
+ }
+ }
+ nla_nest_end(skb, br_afspec);
+
+ protinfo = nla_nest_start(skb, IFLA_PROTINFO);
+ if (!protinfo)
+ goto nla_put_failure;
+
+ if (brport_nla_put_flag(skb, flags, mask,
+ IFLA_BRPORT_MODE, BR_HAIRPIN_MODE) ||
+ brport_nla_put_flag(skb, flags, mask,
+ IFLA_BRPORT_GUARD, BR_BPDU_GUARD) ||
+ brport_nla_put_flag(skb, flags, mask,
+ IFLA_BRPORT_FAST_LEAVE,
+ BR_MULTICAST_FAST_LEAVE) ||
+ brport_nla_put_flag(skb, flags, mask,
+ IFLA_BRPORT_PROTECT, BR_ROOT_BLOCK) ||
+ brport_nla_put_flag(skb, flags, mask,
+ IFLA_BRPORT_LEARNING, BR_LEARNING) ||
+ brport_nla_put_flag(skb, flags, mask,
+ IFLA_BRPORT_LEARNING_SYNC, BR_LEARNING_SYNC) ||
+ brport_nla_put_flag(skb, flags, mask,
+ IFLA_BRPORT_UNICAST_FLOOD, BR_FLOOD) ||
+ brport_nla_put_flag(skb, flags, mask,
+ IFLA_BRPORT_PROXYARP, BR_PROXYARP) ||
+ brport_nla_put_flag(skb, flags, mask,
+ IFLA_BRPORT_MCAST_FLOOD, BR_MCAST_FLOOD) ||
+ brport_nla_put_flag(skb, flags, mask,
+ IFLA_BRPORT_BCAST_FLOOD, BR_BCAST_FLOOD)) {
+ nla_nest_cancel(skb, protinfo);
+ goto nla_put_failure;
+ }
+
+ nla_nest_end(skb, protinfo);
+
+ nlmsg_end(skb, nlh);
+ return 0;
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return err ? err : -EMSGSIZE;
+}
+EXPORT_SYMBOL_GPL(ndo_dflt_bridge_getlink);
+
+static int valid_bridge_getlink_req(const struct nlmsghdr *nlh,
+ bool strict_check, u32 *filter_mask,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[IFLA_MAX+1];
+ int err, i;
+
+ if (strict_check) {
+ struct ifinfomsg *ifm;
+
+ ifm = nlmsg_payload(nlh, sizeof(*ifm));
+ if (!ifm) {
+ NL_SET_ERR_MSG(extack, "Invalid header for bridge link dump");
+ return -EINVAL;
+ }
+
+ if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags ||
+ ifm->ifi_change || ifm->ifi_index) {
+ NL_SET_ERR_MSG(extack, "Invalid values in header for bridge link dump request");
+ return -EINVAL;
+ }
+
+ err = nlmsg_parse_deprecated_strict(nlh,
+ sizeof(struct ifinfomsg),
+ tb, IFLA_MAX, ifla_policy,
+ extack);
+ } else {
+ err = nlmsg_parse_deprecated(nlh, sizeof(struct ifinfomsg),
+ tb, IFLA_MAX, ifla_policy,
+ extack);
+ }
+ if (err < 0)
+ return err;
+
+ /* new attributes should only be added with strict checking */
+ for (i = 0; i <= IFLA_MAX; ++i) {
+ if (!tb[i])
+ continue;
+
+ switch (i) {
+ case IFLA_EXT_MASK:
+ *filter_mask = nla_get_u32(tb[i]);
+ break;
+ default:
+ if (strict_check) {
+ NL_SET_ERR_MSG(extack, "Unsupported attribute in bridge link dump request");
+ return -EINVAL;
+ }
+ }
+ }
+
+ return 0;
+}
+
+static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ const struct nlmsghdr *nlh = cb->nlh;
+ struct net *net = sock_net(skb->sk);
+ struct net_device *dev;
+ int idx = 0;
+ u32 portid = NETLINK_CB(cb->skb).portid;
+ u32 seq = nlh->nlmsg_seq;
+ u32 filter_mask = 0;
+ int err;
+
+ err = valid_bridge_getlink_req(nlh, cb->strict_check, &filter_mask,
+ cb->extack);
+ if (err < 0 && cb->strict_check)
+ return err;
+
+ rcu_read_lock();
+ for_each_netdev_rcu(net, dev) {
+ const struct net_device_ops *ops = dev->netdev_ops;
+ struct net_device *br_dev = netdev_master_upper_dev_get(dev);
+
+ if (br_dev && br_dev->netdev_ops->ndo_bridge_getlink) {
+ if (idx >= cb->args[0]) {
+ err = br_dev->netdev_ops->ndo_bridge_getlink(
+ skb, portid, seq, dev,
+ filter_mask, NLM_F_MULTI);
+ if (err < 0 && err != -EOPNOTSUPP) {
+ if (likely(skb->len))
+ break;
+
+ goto out_err;
+ }
+ }
+ idx++;
+ }
+
+ if (ops->ndo_bridge_getlink) {
+ if (idx >= cb->args[0]) {
+ err = ops->ndo_bridge_getlink(skb, portid,
+ seq, dev,
+ filter_mask,
+ NLM_F_MULTI);
+ if (err < 0 && err != -EOPNOTSUPP) {
+ if (likely(skb->len))
+ break;
+
+ goto out_err;
+ }
+ }
+ idx++;
+ }
+ }
+ err = skb->len;
+out_err:
+ rcu_read_unlock();
+ cb->args[0] = idx;
+
+ return err;
+}
+
+static inline size_t bridge_nlmsg_size(void)
+{
+ return NLMSG_ALIGN(sizeof(struct ifinfomsg))
+ + nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */
+ + nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */
+ + nla_total_size(sizeof(u32)) /* IFLA_MASTER */
+ + nla_total_size(sizeof(u32)) /* IFLA_MTU */
+ + nla_total_size(sizeof(u32)) /* IFLA_LINK */
+ + nla_total_size(sizeof(u32)) /* IFLA_OPERSTATE */
+ + nla_total_size(sizeof(u8)) /* IFLA_PROTINFO */
+ + nla_total_size(sizeof(struct nlattr)) /* IFLA_AF_SPEC */
+ + nla_total_size(sizeof(u16)) /* IFLA_BRIDGE_FLAGS */
+ + nla_total_size(sizeof(u16)); /* IFLA_BRIDGE_MODE */
+}
+
+static int rtnl_bridge_notify(struct net_device *dev)
+{
+ struct net *net = dev_net(dev);
+ struct sk_buff *skb;
+ int err = -EOPNOTSUPP;
+
+ if (!dev->netdev_ops->ndo_bridge_getlink)
return 0;
- family = ((struct rtgenmsg*)NLMSG_DATA(nlh))->rtgen_family;
- if (family >= NPROTO) {
- *errp = -EAFNOSUPPORT;
- return -1;
+ skb = nlmsg_new(bridge_nlmsg_size(), GFP_ATOMIC);
+ if (!skb) {
+ err = -ENOMEM;
+ goto errout;
}
- link_tab = rtnetlink_links[family];
- if (link_tab == NULL)
- link_tab = rtnetlink_links[PF_UNSPEC];
- link = &link_tab[type];
+ err = dev->netdev_ops->ndo_bridge_getlink(skb, 0, 0, dev, 0, 0);
+ if (err < 0)
+ goto errout;
- sz_idx = type>>2;
- kind = type&3;
+ /* Notification info is only filled for bridge ports, not the bridge
+ * device itself. Therefore, a zero notification length is valid and
+ * should not result in an error.
+ */
+ if (!skb->len)
+ goto errout;
- if (kind != 2 && security_netlink_recv(skb, CAP_NET_ADMIN)) {
- *errp = -EPERM;
- return -1;
+ rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_ATOMIC);
+ return 0;
+errout:
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ if (err)
+ rtnl_set_sk_err(net, RTNLGRP_LINK, err);
+ return err;
+}
+
+static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(skb->sk);
+ struct ifinfomsg *ifm;
+ struct net_device *dev;
+ struct nlattr *br_spec, *attr, *br_flags_attr = NULL;
+ int rem, err = -EOPNOTSUPP;
+ u16 flags = 0;
+
+ if (nlmsg_len(nlh) < sizeof(*ifm))
+ return -EINVAL;
+
+ ifm = nlmsg_data(nlh);
+ if (ifm->ifi_family != AF_BRIDGE)
+ return -EPFNOSUPPORT;
+
+ dev = __dev_get_by_index(net, ifm->ifi_index);
+ if (!dev) {
+ NL_SET_ERR_MSG(extack, "unknown ifindex");
+ return -ENODEV;
}
- if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) {
- if (link->dumpit == NULL)
- link = &(rtnetlink_links[PF_UNSPEC][type]);
+ br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC);
+ if (br_spec) {
+ nla_for_each_nested(attr, br_spec, rem) {
+ if (nla_type(attr) == IFLA_BRIDGE_FLAGS && !br_flags_attr) {
+ if (nla_len(attr) < sizeof(flags))
+ return -EINVAL;
- if (link->dumpit == NULL)
- goto err_inval;
+ br_flags_attr = attr;
+ flags = nla_get_u16(attr);
+ }
- if ((*errp = netlink_dump_start(rtnl, skb, nlh,
- link->dumpit, NULL)) != 0) {
- return -1;
+ if (nla_type(attr) == IFLA_BRIDGE_MODE) {
+ if (nla_len(attr) < sizeof(u16))
+ return -EINVAL;
+ }
}
+ }
- netlink_queue_skip(nlh, skb);
- return -1;
+ if (!flags || (flags & BRIDGE_FLAGS_MASTER)) {
+ struct net_device *br_dev = netdev_master_upper_dev_get(dev);
+
+ if (!br_dev || !br_dev->netdev_ops->ndo_bridge_setlink) {
+ err = -EOPNOTSUPP;
+ goto out;
+ }
+
+ err = br_dev->netdev_ops->ndo_bridge_setlink(dev, nlh, flags,
+ extack);
+ if (err)
+ goto out;
+
+ flags &= ~BRIDGE_FLAGS_MASTER;
+ }
+
+ if ((flags & BRIDGE_FLAGS_SELF)) {
+ if (!dev->netdev_ops->ndo_bridge_setlink)
+ err = -EOPNOTSUPP;
+ else
+ err = dev->netdev_ops->ndo_bridge_setlink(dev, nlh,
+ flags,
+ extack);
+ if (!err) {
+ flags &= ~BRIDGE_FLAGS_SELF;
+
+ /* Generate event to notify upper layer of bridge
+ * change
+ */
+ err = rtnl_bridge_notify(dev);
+ }
+ }
+
+ if (br_flags_attr)
+ memcpy(nla_data(br_flags_attr), &flags, sizeof(flags));
+out:
+ return err;
+}
+
+static int rtnl_bridge_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(skb->sk);
+ struct ifinfomsg *ifm;
+ struct net_device *dev;
+ struct nlattr *br_spec, *attr = NULL;
+ int rem, err = -EOPNOTSUPP;
+ u16 flags = 0;
+ bool have_flags = false;
+
+ if (nlmsg_len(nlh) < sizeof(*ifm))
+ return -EINVAL;
+
+ ifm = nlmsg_data(nlh);
+ if (ifm->ifi_family != AF_BRIDGE)
+ return -EPFNOSUPPORT;
+
+ dev = __dev_get_by_index(net, ifm->ifi_index);
+ if (!dev) {
+ NL_SET_ERR_MSG(extack, "unknown ifindex");
+ return -ENODEV;
+ }
+
+ br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC);
+ if (br_spec) {
+ nla_for_each_nested_type(attr, IFLA_BRIDGE_FLAGS, br_spec,
+ rem) {
+ if (nla_len(attr) < sizeof(flags))
+ return -EINVAL;
+
+ have_flags = true;
+ flags = nla_get_u16(attr);
+ break;
+ }
+ }
+
+ if (!flags || (flags & BRIDGE_FLAGS_MASTER)) {
+ struct net_device *br_dev = netdev_master_upper_dev_get(dev);
+
+ if (!br_dev || !br_dev->netdev_ops->ndo_bridge_dellink) {
+ err = -EOPNOTSUPP;
+ goto out;
+ }
+
+ err = br_dev->netdev_ops->ndo_bridge_dellink(dev, nlh, flags);
+ if (err)
+ goto out;
+
+ flags &= ~BRIDGE_FLAGS_MASTER;
+ }
+
+ if ((flags & BRIDGE_FLAGS_SELF)) {
+ if (!dev->netdev_ops->ndo_bridge_dellink)
+ err = -EOPNOTSUPP;
+ else
+ err = dev->netdev_ops->ndo_bridge_dellink(dev, nlh,
+ flags);
+
+ if (!err) {
+ flags &= ~BRIDGE_FLAGS_SELF;
+
+ /* Generate event to notify upper layer of bridge
+ * change
+ */
+ err = rtnl_bridge_notify(dev);
+ }
+ }
+
+ if (have_flags)
+ memcpy(nla_data(attr), &flags, sizeof(flags));
+out:
+ return err;
+}
+
+static bool stats_attr_valid(unsigned int mask, int attrid, int idxattr)
+{
+ return (mask & IFLA_STATS_FILTER_BIT(attrid)) &&
+ (!idxattr || idxattr == attrid);
+}
+
+static bool
+rtnl_offload_xstats_have_ndo(const struct net_device *dev, int attr_id)
+{
+ return dev->netdev_ops &&
+ dev->netdev_ops->ndo_has_offload_stats &&
+ dev->netdev_ops->ndo_get_offload_stats &&
+ dev->netdev_ops->ndo_has_offload_stats(dev, attr_id);
+}
+
+static unsigned int
+rtnl_offload_xstats_get_size_ndo(const struct net_device *dev, int attr_id)
+{
+ return rtnl_offload_xstats_have_ndo(dev, attr_id) ?
+ sizeof(struct rtnl_link_stats64) : 0;
+}
+
+static int
+rtnl_offload_xstats_fill_ndo(struct net_device *dev, int attr_id,
+ struct sk_buff *skb)
+{
+ unsigned int size = rtnl_offload_xstats_get_size_ndo(dev, attr_id);
+ struct nlattr *attr = NULL;
+ void *attr_data;
+ int err;
+
+ if (!size)
+ return -ENODATA;
+
+ attr = nla_reserve_64bit(skb, attr_id, size,
+ IFLA_OFFLOAD_XSTATS_UNSPEC);
+ if (!attr)
+ return -EMSGSIZE;
+
+ attr_data = nla_data(attr);
+ memset(attr_data, 0, size);
+
+ err = dev->netdev_ops->ndo_get_offload_stats(attr_id, dev, attr_data);
+ if (err)
+ return err;
+
+ return 0;
+}
+
+static unsigned int
+rtnl_offload_xstats_get_size_stats(const struct net_device *dev,
+ enum netdev_offload_xstats_type type)
+{
+ bool enabled = netdev_offload_xstats_enabled(dev, type);
+
+ return enabled ? sizeof(struct rtnl_hw_stats64) : 0;
+}
+
+struct rtnl_offload_xstats_request_used {
+ bool request;
+ bool used;
+};
+
+static int
+rtnl_offload_xstats_get_stats(struct net_device *dev,
+ enum netdev_offload_xstats_type type,
+ struct rtnl_offload_xstats_request_used *ru,
+ struct rtnl_hw_stats64 *stats,
+ struct netlink_ext_ack *extack)
+{
+ bool request;
+ bool used;
+ int err;
+
+ request = netdev_offload_xstats_enabled(dev, type);
+ if (!request) {
+ used = false;
+ goto out;
+ }
+
+ err = netdev_offload_xstats_get(dev, type, stats, &used, extack);
+ if (err)
+ return err;
+
+out:
+ if (ru) {
+ ru->request = request;
+ ru->used = used;
+ }
+ return 0;
+}
+
+static int
+rtnl_offload_xstats_fill_hw_s_info_one(struct sk_buff *skb, int attr_id,
+ struct rtnl_offload_xstats_request_used *ru)
+{
+ struct nlattr *nest;
+
+ nest = nla_nest_start(skb, attr_id);
+ if (!nest)
+ return -EMSGSIZE;
+
+ if (nla_put_u8(skb, IFLA_OFFLOAD_XSTATS_HW_S_INFO_REQUEST, ru->request))
+ goto nla_put_failure;
+
+ if (nla_put_u8(skb, IFLA_OFFLOAD_XSTATS_HW_S_INFO_USED, ru->used))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+}
+
+static int
+rtnl_offload_xstats_fill_hw_s_info(struct sk_buff *skb, struct net_device *dev,
+ struct netlink_ext_ack *extack)
+{
+ enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3;
+ struct rtnl_offload_xstats_request_used ru_l3;
+ struct nlattr *nest;
+ int err;
+
+ err = rtnl_offload_xstats_get_stats(dev, t_l3, &ru_l3, NULL, extack);
+ if (err)
+ return err;
+
+ nest = nla_nest_start(skb, IFLA_OFFLOAD_XSTATS_HW_S_INFO);
+ if (!nest)
+ return -EMSGSIZE;
+
+ if (rtnl_offload_xstats_fill_hw_s_info_one(skb,
+ IFLA_OFFLOAD_XSTATS_L3_STATS,
+ &ru_l3))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+}
+
+static int rtnl_offload_xstats_fill(struct sk_buff *skb, struct net_device *dev,
+ int *prividx, u32 off_filter_mask,
+ struct netlink_ext_ack *extack)
+{
+ enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3;
+ int attr_id_hw_s_info = IFLA_OFFLOAD_XSTATS_HW_S_INFO;
+ int attr_id_l3_stats = IFLA_OFFLOAD_XSTATS_L3_STATS;
+ int attr_id_cpu_hit = IFLA_OFFLOAD_XSTATS_CPU_HIT;
+ bool have_data = false;
+ int err;
+
+ if (*prividx <= attr_id_cpu_hit &&
+ (off_filter_mask &
+ IFLA_STATS_FILTER_BIT(attr_id_cpu_hit))) {
+ err = rtnl_offload_xstats_fill_ndo(dev, attr_id_cpu_hit, skb);
+ if (!err) {
+ have_data = true;
+ } else if (err != -ENODATA) {
+ *prividx = attr_id_cpu_hit;
+ return err;
+ }
+ }
+
+ if (*prividx <= attr_id_hw_s_info &&
+ (off_filter_mask & IFLA_STATS_FILTER_BIT(attr_id_hw_s_info))) {
+ *prividx = attr_id_hw_s_info;
+
+ err = rtnl_offload_xstats_fill_hw_s_info(skb, dev, extack);
+ if (err)
+ return err;
+
+ have_data = true;
+ *prividx = 0;
+ }
+
+ if (*prividx <= attr_id_l3_stats &&
+ (off_filter_mask & IFLA_STATS_FILTER_BIT(attr_id_l3_stats))) {
+ unsigned int size_l3;
+ struct nlattr *attr;
+
+ *prividx = attr_id_l3_stats;
+
+ size_l3 = rtnl_offload_xstats_get_size_stats(dev, t_l3);
+ if (!size_l3)
+ goto skip_l3_stats;
+ attr = nla_reserve_64bit(skb, attr_id_l3_stats, size_l3,
+ IFLA_OFFLOAD_XSTATS_UNSPEC);
+ if (!attr)
+ return -EMSGSIZE;
+
+ err = rtnl_offload_xstats_get_stats(dev, t_l3, NULL,
+ nla_data(attr), extack);
+ if (err)
+ return err;
+
+ have_data = true;
+skip_l3_stats:
+ *prividx = 0;
+ }
+
+ if (!have_data)
+ return -ENODATA;
+
+ *prividx = 0;
+ return 0;
+}
+
+static unsigned int
+rtnl_offload_xstats_get_size_hw_s_info_one(const struct net_device *dev,
+ enum netdev_offload_xstats_type type)
+{
+ return nla_total_size(0) +
+ /* IFLA_OFFLOAD_XSTATS_HW_S_INFO_REQUEST */
+ nla_total_size(sizeof(u8)) +
+ /* IFLA_OFFLOAD_XSTATS_HW_S_INFO_USED */
+ nla_total_size(sizeof(u8)) +
+ 0;
+}
+
+static unsigned int
+rtnl_offload_xstats_get_size_hw_s_info(const struct net_device *dev)
+{
+ enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3;
+
+ return nla_total_size(0) +
+ /* IFLA_OFFLOAD_XSTATS_L3_STATS */
+ rtnl_offload_xstats_get_size_hw_s_info_one(dev, t_l3) +
+ 0;
+}
+
+static int rtnl_offload_xstats_get_size(const struct net_device *dev,
+ u32 off_filter_mask)
+{
+ enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3;
+ int attr_id_cpu_hit = IFLA_OFFLOAD_XSTATS_CPU_HIT;
+ int nla_size = 0;
+ int size;
+
+ if (off_filter_mask &
+ IFLA_STATS_FILTER_BIT(attr_id_cpu_hit)) {
+ size = rtnl_offload_xstats_get_size_ndo(dev, attr_id_cpu_hit);
+ nla_size += nla_total_size_64bit(size);
+ }
+
+ if (off_filter_mask &
+ IFLA_STATS_FILTER_BIT(IFLA_OFFLOAD_XSTATS_HW_S_INFO))
+ nla_size += rtnl_offload_xstats_get_size_hw_s_info(dev);
+
+ if (off_filter_mask &
+ IFLA_STATS_FILTER_BIT(IFLA_OFFLOAD_XSTATS_L3_STATS)) {
+ size = rtnl_offload_xstats_get_size_stats(dev, t_l3);
+ nla_size += nla_total_size_64bit(size);
+ }
+
+ if (nla_size != 0)
+ nla_size += nla_total_size(0);
+
+ return nla_size;
+}
+
+struct rtnl_stats_dump_filters {
+ /* mask[0] filters outer attributes. Then individual nests have their
+ * filtering mask at the index of the nested attribute.
+ */
+ u32 mask[IFLA_STATS_MAX + 1];
+};
+
+static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
+ int type, u32 pid, u32 seq, u32 change,
+ unsigned int flags,
+ const struct rtnl_stats_dump_filters *filters,
+ int *idxattr, int *prividx,
+ struct netlink_ext_ack *extack)
+{
+ unsigned int filter_mask = filters->mask[0];
+ struct if_stats_msg *ifsm;
+ struct nlmsghdr *nlh;
+ struct nlattr *attr;
+ int s_prividx = *prividx;
+ int err;
+
+ ASSERT_RTNL();
+
+ nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ifsm), flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ ifsm = nlmsg_data(nlh);
+ ifsm->family = PF_UNSPEC;
+ ifsm->pad1 = 0;
+ ifsm->pad2 = 0;
+ ifsm->ifindex = dev->ifindex;
+ ifsm->filter_mask = filter_mask;
+
+ if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_64, *idxattr)) {
+ struct rtnl_link_stats64 *sp;
+
+ attr = nla_reserve_64bit(skb, IFLA_STATS_LINK_64,
+ sizeof(struct rtnl_link_stats64),
+ IFLA_STATS_UNSPEC);
+ if (!attr) {
+ err = -EMSGSIZE;
+ goto nla_put_failure;
+ }
+
+ sp = nla_data(attr);
+ dev_get_stats(dev, sp);
}
- memset(rta_buf, 0, (rtattr_max * sizeof(struct rtattr *)));
+ if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS, *idxattr)) {
+ const struct rtnl_link_ops *ops = dev->rtnl_link_ops;
- min_len = rtm_min[sz_idx];
- if (nlh->nlmsg_len < min_len)
- goto err_inval;
+ if (ops && ops->fill_linkxstats) {
+ *idxattr = IFLA_STATS_LINK_XSTATS;
+ attr = nla_nest_start_noflag(skb,
+ IFLA_STATS_LINK_XSTATS);
+ if (!attr) {
+ err = -EMSGSIZE;
+ goto nla_put_failure;
+ }
- if (nlh->nlmsg_len > min_len) {
- int attrlen = nlh->nlmsg_len - NLMSG_ALIGN(min_len);
- struct rtattr *attr = (void*)nlh + NLMSG_ALIGN(min_len);
+ err = ops->fill_linkxstats(skb, dev, prividx, *idxattr);
+ nla_nest_end(skb, attr);
+ if (err)
+ goto nla_put_failure;
+ *idxattr = 0;
+ }
+ }
- while (RTA_OK(attr, attrlen)) {
- unsigned flavor = attr->rta_type;
- if (flavor) {
- if (flavor > rta_max[sz_idx])
- goto err_inval;
- rta_buf[flavor-1] = attr;
+ if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS_SLAVE,
+ *idxattr)) {
+ const struct rtnl_link_ops *ops = NULL;
+ const struct net_device *master;
+
+ master = netdev_master_upper_dev_get(dev);
+ if (master)
+ ops = master->rtnl_link_ops;
+ if (ops && ops->fill_linkxstats) {
+ *idxattr = IFLA_STATS_LINK_XSTATS_SLAVE;
+ attr = nla_nest_start_noflag(skb,
+ IFLA_STATS_LINK_XSTATS_SLAVE);
+ if (!attr) {
+ err = -EMSGSIZE;
+ goto nla_put_failure;
}
- attr = RTA_NEXT(attr, attrlen);
+
+ err = ops->fill_linkxstats(skb, dev, prividx, *idxattr);
+ nla_nest_end(skb, attr);
+ if (err)
+ goto nla_put_failure;
+ *idxattr = 0;
}
}
- if (link->doit == NULL)
- link = &(rtnetlink_links[PF_UNSPEC][type]);
- if (link->doit == NULL)
- goto err_inval;
- err = link->doit(skb, nlh, (void *)&rta_buf[0]);
+ if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS,
+ *idxattr)) {
+ u32 off_filter_mask;
+
+ off_filter_mask = filters->mask[IFLA_STATS_LINK_OFFLOAD_XSTATS];
+ *idxattr = IFLA_STATS_LINK_OFFLOAD_XSTATS;
+ attr = nla_nest_start_noflag(skb,
+ IFLA_STATS_LINK_OFFLOAD_XSTATS);
+ if (!attr) {
+ err = -EMSGSIZE;
+ goto nla_put_failure;
+ }
+
+ err = rtnl_offload_xstats_fill(skb, dev, prividx,
+ off_filter_mask, extack);
+ if (err == -ENODATA)
+ nla_nest_cancel(skb, attr);
+ else
+ nla_nest_end(skb, attr);
+
+ if (err && err != -ENODATA)
+ goto nla_put_failure;
+ *idxattr = 0;
+ }
+
+ if (stats_attr_valid(filter_mask, IFLA_STATS_AF_SPEC, *idxattr)) {
+ struct rtnl_af_ops *af_ops;
+
+ *idxattr = IFLA_STATS_AF_SPEC;
+ attr = nla_nest_start_noflag(skb, IFLA_STATS_AF_SPEC);
+ if (!attr) {
+ err = -EMSGSIZE;
+ goto nla_put_failure;
+ }
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(af_ops, &rtnl_af_ops, list) {
+ if (af_ops->fill_stats_af) {
+ struct nlattr *af;
+
+ af = nla_nest_start_noflag(skb,
+ af_ops->family);
+ if (!af) {
+ rcu_read_unlock();
+ err = -EMSGSIZE;
+ goto nla_put_failure;
+ }
+ err = af_ops->fill_stats_af(skb, dev);
+
+ if (err == -ENODATA) {
+ nla_nest_cancel(skb, af);
+ } else if (err < 0) {
+ rcu_read_unlock();
+ goto nla_put_failure;
+ }
+
+ nla_nest_end(skb, af);
+ }
+ }
+ rcu_read_unlock();
+
+ nla_nest_end(skb, attr);
+
+ *idxattr = 0;
+ }
+
+ nlmsg_end(skb, nlh);
+
+ return 0;
+
+nla_put_failure:
+ /* not a multi message or no progress mean a real error */
+ if (!(flags & NLM_F_MULTI) || s_prividx == *prividx)
+ nlmsg_cancel(skb, nlh);
+ else
+ nlmsg_end(skb, nlh);
- *errp = err;
return err;
+}
-err_inval:
- *errp = -EINVAL;
- return -1;
+static size_t if_nlmsg_stats_size(const struct net_device *dev,
+ const struct rtnl_stats_dump_filters *filters)
+{
+ size_t size = NLMSG_ALIGN(sizeof(struct if_stats_msg));
+ unsigned int filter_mask = filters->mask[0];
+
+ if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_64, 0))
+ size += nla_total_size_64bit(sizeof(struct rtnl_link_stats64));
+
+ if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS, 0)) {
+ const struct rtnl_link_ops *ops = dev->rtnl_link_ops;
+ int attr = IFLA_STATS_LINK_XSTATS;
+
+ if (ops && ops->get_linkxstats_size) {
+ size += nla_total_size(ops->get_linkxstats_size(dev,
+ attr));
+ /* for IFLA_STATS_LINK_XSTATS */
+ size += nla_total_size(0);
+ }
+ }
+
+ if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS_SLAVE, 0)) {
+ struct net_device *_dev = (struct net_device *)dev;
+ const struct rtnl_link_ops *ops = NULL;
+ const struct net_device *master;
+
+ /* netdev_master_upper_dev_get can't take const */
+ master = netdev_master_upper_dev_get(_dev);
+ if (master)
+ ops = master->rtnl_link_ops;
+ if (ops && ops->get_linkxstats_size) {
+ int attr = IFLA_STATS_LINK_XSTATS_SLAVE;
+
+ size += nla_total_size(ops->get_linkxstats_size(dev,
+ attr));
+ /* for IFLA_STATS_LINK_XSTATS_SLAVE */
+ size += nla_total_size(0);
+ }
+ }
+
+ if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, 0)) {
+ u32 off_filter_mask;
+
+ off_filter_mask = filters->mask[IFLA_STATS_LINK_OFFLOAD_XSTATS];
+ size += rtnl_offload_xstats_get_size(dev, off_filter_mask);
+ }
+
+ if (stats_attr_valid(filter_mask, IFLA_STATS_AF_SPEC, 0)) {
+ struct rtnl_af_ops *af_ops;
+
+ /* for IFLA_STATS_AF_SPEC */
+ size += nla_total_size(0);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(af_ops, &rtnl_af_ops, list) {
+ if (af_ops->get_stats_af_size) {
+ size += nla_total_size(
+ af_ops->get_stats_af_size(dev));
+
+ /* for AF_* */
+ size += nla_total_size(0);
+ }
+ }
+ rcu_read_unlock();
+ }
+
+ return size;
}
-static void rtnetlink_rcv(struct sock *sk, int len)
+#define RTNL_STATS_OFFLOAD_XSTATS_VALID ((1 << __IFLA_OFFLOAD_XSTATS_MAX) - 1)
+
+static const struct nla_policy
+rtnl_stats_get_policy_filters[IFLA_STATS_MAX + 1] = {
+ [IFLA_STATS_LINK_OFFLOAD_XSTATS] =
+ NLA_POLICY_MASK(NLA_U32, RTNL_STATS_OFFLOAD_XSTATS_VALID),
+};
+
+static const struct nla_policy
+rtnl_stats_get_policy[IFLA_STATS_GETSET_MAX + 1] = {
+ [IFLA_STATS_GET_FILTERS] =
+ NLA_POLICY_NESTED(rtnl_stats_get_policy_filters),
+};
+
+static const struct nla_policy
+ifla_stats_set_policy[IFLA_STATS_GETSET_MAX + 1] = {
+ [IFLA_STATS_SET_OFFLOAD_XSTATS_L3_STATS] = NLA_POLICY_MAX(NLA_U8, 1),
+};
+
+static int rtnl_stats_get_parse_filters(struct nlattr *ifla_filters,
+ struct rtnl_stats_dump_filters *filters,
+ struct netlink_ext_ack *extack)
{
- unsigned int qlen = 0;
+ struct nlattr *tb[IFLA_STATS_MAX + 1];
+ int err;
+ int at;
- do {
- mutex_lock(&rtnl_mutex);
- netlink_run_queue(sk, &qlen, &rtnetlink_rcv_msg);
- mutex_unlock(&rtnl_mutex);
-
- netdev_run_todo();
- } while (qlen);
-}
-
-static struct rtnetlink_link link_rtnetlink_table[RTM_NR_MSGTYPES] =
-{
- [RTM_GETLINK - RTM_BASE] = { .doit = rtnl_getlink,
- .dumpit = rtnl_dump_ifinfo },
- [RTM_SETLINK - RTM_BASE] = { .doit = rtnl_setlink },
- [RTM_GETADDR - RTM_BASE] = { .dumpit = rtnl_dump_all },
- [RTM_GETROUTE - RTM_BASE] = { .dumpit = rtnl_dump_all },
- [RTM_NEWNEIGH - RTM_BASE] = { .doit = neigh_add },
- [RTM_DELNEIGH - RTM_BASE] = { .doit = neigh_delete },
- [RTM_GETNEIGH - RTM_BASE] = { .dumpit = neigh_dump_info },
-#ifdef CONFIG_FIB_RULES
- [RTM_NEWRULE - RTM_BASE] = { .doit = fib_nl_newrule },
- [RTM_DELRULE - RTM_BASE] = { .doit = fib_nl_delrule },
+ err = nla_parse_nested(tb, IFLA_STATS_MAX, ifla_filters,
+ rtnl_stats_get_policy_filters, extack);
+ if (err < 0)
+ return err;
+
+ for (at = 1; at <= IFLA_STATS_MAX; at++) {
+ if (tb[at]) {
+ if (!(filters->mask[0] & IFLA_STATS_FILTER_BIT(at))) {
+ NL_SET_ERR_MSG(extack, "Filtered attribute not enabled in filter_mask");
+ return -EINVAL;
+ }
+ filters->mask[at] = nla_get_u32(tb[at]);
+ }
+ }
+
+ return 0;
+}
+
+static int rtnl_stats_get_parse(const struct nlmsghdr *nlh,
+ u32 filter_mask,
+ struct rtnl_stats_dump_filters *filters,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[IFLA_STATS_GETSET_MAX + 1];
+ int err;
+ int i;
+
+ filters->mask[0] = filter_mask;
+ for (i = 1; i < ARRAY_SIZE(filters->mask); i++)
+ filters->mask[i] = -1U;
+
+ err = nlmsg_parse(nlh, sizeof(struct if_stats_msg), tb,
+ IFLA_STATS_GETSET_MAX, rtnl_stats_get_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (tb[IFLA_STATS_GET_FILTERS]) {
+ err = rtnl_stats_get_parse_filters(tb[IFLA_STATS_GET_FILTERS],
+ filters, extack);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+static int rtnl_valid_stats_req(const struct nlmsghdr *nlh, bool strict_check,
+ bool is_dump, struct netlink_ext_ack *extack)
+{
+ struct if_stats_msg *ifsm;
+
+ ifsm = nlmsg_payload(nlh, sizeof(*ifsm));
+ if (!ifsm) {
+ NL_SET_ERR_MSG(extack, "Invalid header for stats dump");
+ return -EINVAL;
+ }
+
+ if (!strict_check)
+ return 0;
+
+ /* only requests using strict checks can pass data to influence
+ * the dump. The legacy exception is filter_mask.
+ */
+ if (ifsm->pad1 || ifsm->pad2 || (is_dump && ifsm->ifindex)) {
+ NL_SET_ERR_MSG(extack, "Invalid values in header for stats dump request");
+ return -EINVAL;
+ }
+ if (ifsm->filter_mask >= IFLA_STATS_FILTER_BIT(IFLA_STATS_MAX + 1)) {
+ NL_SET_ERR_MSG(extack, "Invalid stats requested through filter mask");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int rtnl_stats_get(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct rtnl_stats_dump_filters filters;
+ struct net *net = sock_net(skb->sk);
+ struct net_device *dev = NULL;
+ int idxattr = 0, prividx = 0;
+ struct if_stats_msg *ifsm;
+ struct sk_buff *nskb;
+ int err;
+
+ err = rtnl_valid_stats_req(nlh, netlink_strict_get_check(skb),
+ false, extack);
+ if (err)
+ return err;
+
+ ifsm = nlmsg_data(nlh);
+ if (ifsm->ifindex > 0)
+ dev = __dev_get_by_index(net, ifsm->ifindex);
+ else
+ return -EINVAL;
+
+ if (!dev)
+ return -ENODEV;
+
+ if (!ifsm->filter_mask) {
+ NL_SET_ERR_MSG(extack, "Filter mask must be set for stats get");
+ return -EINVAL;
+ }
+
+ err = rtnl_stats_get_parse(nlh, ifsm->filter_mask, &filters, extack);
+ if (err)
+ return err;
+
+ nskb = nlmsg_new(if_nlmsg_stats_size(dev, &filters), GFP_KERNEL);
+ if (!nskb)
+ return -ENOBUFS;
+
+ err = rtnl_fill_statsinfo(nskb, dev, RTM_NEWSTATS,
+ NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
+ 0, &filters, &idxattr, &prividx, extack);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in if_nlmsg_stats_size */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(nskb);
+ } else {
+ err = rtnl_unicast(nskb, net, NETLINK_CB(skb).portid);
+ }
+
+ return err;
+}
+
+static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct netlink_ext_ack *extack = cb->extack;
+ struct rtnl_stats_dump_filters filters;
+ struct net *net = sock_net(skb->sk);
+ unsigned int flags = NLM_F_MULTI;
+ struct if_stats_msg *ifsm;
+ struct {
+ unsigned long ifindex;
+ int idxattr;
+ int prividx;
+ } *ctx = (void *)cb->ctx;
+ struct net_device *dev;
+ int err;
+
+ cb->seq = net->dev_base_seq;
+
+ err = rtnl_valid_stats_req(cb->nlh, cb->strict_check, true, extack);
+ if (err)
+ return err;
+
+ ifsm = nlmsg_data(cb->nlh);
+ if (!ifsm->filter_mask) {
+ NL_SET_ERR_MSG(extack, "Filter mask must be set for stats dump");
+ return -EINVAL;
+ }
+
+ err = rtnl_stats_get_parse(cb->nlh, ifsm->filter_mask, &filters,
+ extack);
+ if (err)
+ return err;
+
+ for_each_netdev_dump(net, dev, ctx->ifindex) {
+ err = rtnl_fill_statsinfo(skb, dev, RTM_NEWSTATS,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, 0,
+ flags, &filters,
+ &ctx->idxattr, &ctx->prividx,
+ extack);
+ /* If we ran out of room on the first message,
+ * we're in trouble.
+ */
+ WARN_ON((err == -EMSGSIZE) && (skb->len == 0));
+
+ if (err < 0)
+ break;
+ ctx->prividx = 0;
+ ctx->idxattr = 0;
+ nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+ }
+
+ return err;
+}
+
+void rtnl_offload_xstats_notify(struct net_device *dev)
+{
+ struct rtnl_stats_dump_filters response_filters = {};
+ struct net *net = dev_net(dev);
+ int idxattr = 0, prividx = 0;
+ struct sk_buff *skb;
+ int err = -ENOBUFS;
+
+ ASSERT_RTNL();
+
+ response_filters.mask[0] |=
+ IFLA_STATS_FILTER_BIT(IFLA_STATS_LINK_OFFLOAD_XSTATS);
+ response_filters.mask[IFLA_STATS_LINK_OFFLOAD_XSTATS] |=
+ IFLA_STATS_FILTER_BIT(IFLA_OFFLOAD_XSTATS_HW_S_INFO);
+
+ skb = nlmsg_new(if_nlmsg_stats_size(dev, &response_filters),
+ GFP_KERNEL);
+ if (!skb)
+ goto errout;
+
+ err = rtnl_fill_statsinfo(skb, dev, RTM_NEWSTATS, 0, 0, 0, 0,
+ &response_filters, &idxattr, &prividx, NULL);
+ if (err < 0) {
+ kfree_skb(skb);
+ goto errout;
+ }
+
+ rtnl_notify(skb, net, 0, RTNLGRP_STATS, NULL, GFP_KERNEL);
+ return;
+
+errout:
+ rtnl_set_sk_err(net, RTNLGRP_STATS, err);
+}
+EXPORT_SYMBOL(rtnl_offload_xstats_notify);
+
+static int rtnl_stats_set(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3;
+ struct rtnl_stats_dump_filters response_filters = {};
+ struct nlattr *tb[IFLA_STATS_GETSET_MAX + 1];
+ struct net *net = sock_net(skb->sk);
+ struct net_device *dev = NULL;
+ struct if_stats_msg *ifsm;
+ bool notify = false;
+ int err;
+
+ err = rtnl_valid_stats_req(nlh, netlink_strict_get_check(skb),
+ false, extack);
+ if (err)
+ return err;
+
+ ifsm = nlmsg_data(nlh);
+ if (ifsm->family != AF_UNSPEC) {
+ NL_SET_ERR_MSG(extack, "Address family should be AF_UNSPEC");
+ return -EINVAL;
+ }
+
+ if (ifsm->ifindex > 0)
+ dev = __dev_get_by_index(net, ifsm->ifindex);
+ else
+ return -EINVAL;
+
+ if (!dev)
+ return -ENODEV;
+
+ if (ifsm->filter_mask) {
+ NL_SET_ERR_MSG(extack, "Filter mask must be 0 for stats set");
+ return -EINVAL;
+ }
+
+ err = nlmsg_parse(nlh, sizeof(*ifsm), tb, IFLA_STATS_GETSET_MAX,
+ ifla_stats_set_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (tb[IFLA_STATS_SET_OFFLOAD_XSTATS_L3_STATS]) {
+ u8 req = nla_get_u8(tb[IFLA_STATS_SET_OFFLOAD_XSTATS_L3_STATS]);
+
+ if (req)
+ err = netdev_offload_xstats_enable(dev, t_l3, extack);
+ else
+ err = netdev_offload_xstats_disable(dev, t_l3);
+
+ if (!err)
+ notify = true;
+ else if (err != -EALREADY)
+ return err;
+
+ response_filters.mask[0] |=
+ IFLA_STATS_FILTER_BIT(IFLA_STATS_LINK_OFFLOAD_XSTATS);
+ response_filters.mask[IFLA_STATS_LINK_OFFLOAD_XSTATS] |=
+ IFLA_STATS_FILTER_BIT(IFLA_OFFLOAD_XSTATS_HW_S_INFO);
+ }
+
+ if (notify)
+ rtnl_offload_xstats_notify(dev);
+
+ return 0;
+}
+
+static int rtnl_mdb_valid_dump_req(const struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct br_port_msg *bpm;
+
+ bpm = nlmsg_payload(nlh, sizeof(*bpm));
+ if (!bpm) {
+ NL_SET_ERR_MSG(extack, "Invalid header for mdb dump request");
+ return -EINVAL;
+ }
+
+ if (bpm->ifindex) {
+ NL_SET_ERR_MSG(extack, "Filtering by device index is not supported for mdb dump request");
+ return -EINVAL;
+ }
+ if (nlmsg_attrlen(nlh, sizeof(*bpm))) {
+ NL_SET_ERR_MSG(extack, "Invalid data after header in mdb dump request");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+struct rtnl_mdb_dump_ctx {
+ long idx;
+};
+
+static int rtnl_mdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct rtnl_mdb_dump_ctx *ctx = (void *)cb->ctx;
+ struct net *net = sock_net(skb->sk);
+ struct net_device *dev;
+ int idx, s_idx;
+ int err;
+
+ NL_ASSERT_CTX_FITS(struct rtnl_mdb_dump_ctx);
+
+ if (cb->strict_check) {
+ err = rtnl_mdb_valid_dump_req(cb->nlh, cb->extack);
+ if (err)
+ return err;
+ }
+
+ s_idx = ctx->idx;
+ idx = 0;
+
+ for_each_netdev(net, dev) {
+ if (idx < s_idx)
+ goto skip;
+ if (!dev->netdev_ops->ndo_mdb_dump)
+ goto skip;
+
+ err = dev->netdev_ops->ndo_mdb_dump(dev, skb, cb);
+ if (err == -EMSGSIZE)
+ goto out;
+ /* Moving on to next device, reset markers and sequence
+ * counters since they are all maintained per-device.
+ */
+ memset(cb->ctx, 0, sizeof(cb->ctx));
+ cb->prev_seq = 0;
+ cb->seq = 0;
+skip:
+ idx++;
+ }
+
+out:
+ ctx->idx = idx;
+ return skb->len;
+}
+
+static int rtnl_validate_mdb_entry_get(const struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ struct br_mdb_entry *entry = nla_data(attr);
+
+ if (nla_len(attr) != sizeof(struct br_mdb_entry)) {
+ NL_SET_ERR_MSG_ATTR(extack, attr, "Invalid attribute length");
+ return -EINVAL;
+ }
+
+ if (entry->ifindex) {
+ NL_SET_ERR_MSG(extack, "Entry ifindex cannot be specified");
+ return -EINVAL;
+ }
+
+ if (entry->state) {
+ NL_SET_ERR_MSG(extack, "Entry state cannot be specified");
+ return -EINVAL;
+ }
+
+ if (entry->flags) {
+ NL_SET_ERR_MSG(extack, "Entry flags cannot be specified");
+ return -EINVAL;
+ }
+
+ if (entry->vid >= VLAN_VID_MASK) {
+ NL_SET_ERR_MSG(extack, "Invalid entry VLAN id");
+ return -EINVAL;
+ }
+
+ if (entry->addr.proto != htons(ETH_P_IP) &&
+ entry->addr.proto != htons(ETH_P_IPV6) &&
+ entry->addr.proto != 0) {
+ NL_SET_ERR_MSG(extack, "Unknown entry protocol");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static const struct nla_policy mdba_get_policy[MDBA_GET_ENTRY_MAX + 1] = {
+ [MDBA_GET_ENTRY] = NLA_POLICY_VALIDATE_FN(NLA_BINARY,
+ rtnl_validate_mdb_entry_get,
+ sizeof(struct br_mdb_entry)),
+ [MDBA_GET_ENTRY_ATTRS] = { .type = NLA_NESTED },
+};
+
+static int rtnl_mdb_get(struct sk_buff *in_skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[MDBA_GET_ENTRY_MAX + 1];
+ struct net *net = sock_net(in_skb->sk);
+ struct br_port_msg *bpm;
+ struct net_device *dev;
+ int err;
+
+ err = nlmsg_parse(nlh, sizeof(struct br_port_msg), tb,
+ MDBA_GET_ENTRY_MAX, mdba_get_policy, extack);
+ if (err)
+ return err;
+
+ bpm = nlmsg_data(nlh);
+ if (!bpm->ifindex) {
+ NL_SET_ERR_MSG(extack, "Invalid ifindex");
+ return -EINVAL;
+ }
+
+ dev = __dev_get_by_index(net, bpm->ifindex);
+ if (!dev) {
+ NL_SET_ERR_MSG(extack, "Device doesn't exist");
+ return -ENODEV;
+ }
+
+ if (NL_REQ_ATTR_CHECK(extack, NULL, tb, MDBA_GET_ENTRY)) {
+ NL_SET_ERR_MSG(extack, "Missing MDBA_GET_ENTRY attribute");
+ return -EINVAL;
+ }
+
+ if (!dev->netdev_ops->ndo_mdb_get) {
+ NL_SET_ERR_MSG(extack, "Device does not support MDB operations");
+ return -EOPNOTSUPP;
+ }
+
+ return dev->netdev_ops->ndo_mdb_get(dev, tb, NETLINK_CB(in_skb).portid,
+ nlh->nlmsg_seq, extack);
+}
+
+static int rtnl_validate_mdb_entry(const struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ struct br_mdb_entry *entry = nla_data(attr);
+
+ if (nla_len(attr) != sizeof(struct br_mdb_entry)) {
+ NL_SET_ERR_MSG_ATTR(extack, attr, "Invalid attribute length");
+ return -EINVAL;
+ }
+
+ if (entry->ifindex == 0) {
+ NL_SET_ERR_MSG(extack, "Zero entry ifindex is not allowed");
+ return -EINVAL;
+ }
+
+ if (entry->addr.proto == htons(ETH_P_IP)) {
+ if (!ipv4_is_multicast(entry->addr.u.ip4) &&
+ !ipv4_is_zeronet(entry->addr.u.ip4)) {
+ NL_SET_ERR_MSG(extack, "IPv4 entry group address is not multicast or 0.0.0.0");
+ return -EINVAL;
+ }
+ if (ipv4_is_local_multicast(entry->addr.u.ip4)) {
+ NL_SET_ERR_MSG(extack, "IPv4 entry group address is local multicast");
+ return -EINVAL;
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (entry->addr.proto == htons(ETH_P_IPV6)) {
+ if (ipv6_addr_is_ll_all_nodes(&entry->addr.u.ip6)) {
+ NL_SET_ERR_MSG(extack, "IPv6 entry group address is link-local all nodes");
+ return -EINVAL;
+ }
#endif
- [RTM_GETRULE - RTM_BASE] = { .dumpit = rtnl_dump_all },
- [RTM_GETNEIGHTBL - RTM_BASE] = { .dumpit = neightbl_dump_info },
- [RTM_SETNEIGHTBL - RTM_BASE] = { .doit = neightbl_set },
+ } else if (entry->addr.proto == 0) {
+ /* L2 mdb */
+ if (!is_multicast_ether_addr(entry->addr.u.mac_addr)) {
+ NL_SET_ERR_MSG(extack, "L2 entry group is not multicast");
+ return -EINVAL;
+ }
+ } else {
+ NL_SET_ERR_MSG(extack, "Unknown entry protocol");
+ return -EINVAL;
+ }
+
+ if (entry->state != MDB_PERMANENT && entry->state != MDB_TEMPORARY) {
+ NL_SET_ERR_MSG(extack, "Unknown entry state");
+ return -EINVAL;
+ }
+ if (entry->vid >= VLAN_VID_MASK) {
+ NL_SET_ERR_MSG(extack, "Invalid entry VLAN id");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static const struct nla_policy mdba_policy[MDBA_SET_ENTRY_MAX + 1] = {
+ [MDBA_SET_ENTRY_UNSPEC] = { .strict_start_type = MDBA_SET_ENTRY_ATTRS + 1 },
+ [MDBA_SET_ENTRY] = NLA_POLICY_VALIDATE_FN(NLA_BINARY,
+ rtnl_validate_mdb_entry,
+ sizeof(struct br_mdb_entry)),
+ [MDBA_SET_ENTRY_ATTRS] = { .type = NLA_NESTED },
};
+static int rtnl_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[MDBA_SET_ENTRY_MAX + 1];
+ struct net *net = sock_net(skb->sk);
+ struct br_port_msg *bpm;
+ struct net_device *dev;
+ int err;
+
+ err = nlmsg_parse_deprecated(nlh, sizeof(*bpm), tb,
+ MDBA_SET_ENTRY_MAX, mdba_policy, extack);
+ if (err)
+ return err;
+
+ bpm = nlmsg_data(nlh);
+ if (!bpm->ifindex) {
+ NL_SET_ERR_MSG(extack, "Invalid ifindex");
+ return -EINVAL;
+ }
+
+ dev = __dev_get_by_index(net, bpm->ifindex);
+ if (!dev) {
+ NL_SET_ERR_MSG(extack, "Device doesn't exist");
+ return -ENODEV;
+ }
+
+ if (NL_REQ_ATTR_CHECK(extack, NULL, tb, MDBA_SET_ENTRY)) {
+ NL_SET_ERR_MSG(extack, "Missing MDBA_SET_ENTRY attribute");
+ return -EINVAL;
+ }
+
+ if (!dev->netdev_ops->ndo_mdb_add) {
+ NL_SET_ERR_MSG(extack, "Device does not support MDB operations");
+ return -EOPNOTSUPP;
+ }
+
+ return dev->netdev_ops->ndo_mdb_add(dev, tb, nlh->nlmsg_flags, extack);
+}
+
+static int rtnl_validate_mdb_entry_del_bulk(const struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ struct br_mdb_entry *entry = nla_data(attr);
+ struct br_mdb_entry zero_entry = {};
+
+ if (nla_len(attr) != sizeof(struct br_mdb_entry)) {
+ NL_SET_ERR_MSG_ATTR(extack, attr, "Invalid attribute length");
+ return -EINVAL;
+ }
+
+ if (entry->state != MDB_PERMANENT && entry->state != MDB_TEMPORARY) {
+ NL_SET_ERR_MSG(extack, "Unknown entry state");
+ return -EINVAL;
+ }
+
+ if (entry->flags) {
+ NL_SET_ERR_MSG(extack, "Entry flags cannot be set");
+ return -EINVAL;
+ }
+
+ if (entry->vid >= VLAN_N_VID - 1) {
+ NL_SET_ERR_MSG(extack, "Invalid entry VLAN id");
+ return -EINVAL;
+ }
+
+ if (memcmp(&entry->addr, &zero_entry.addr, sizeof(entry->addr))) {
+ NL_SET_ERR_MSG(extack, "Entry address cannot be set");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static const struct nla_policy mdba_del_bulk_policy[MDBA_SET_ENTRY_MAX + 1] = {
+ [MDBA_SET_ENTRY] = NLA_POLICY_VALIDATE_FN(NLA_BINARY,
+ rtnl_validate_mdb_entry_del_bulk,
+ sizeof(struct br_mdb_entry)),
+ [MDBA_SET_ENTRY_ATTRS] = { .type = NLA_NESTED },
+};
+
+static int rtnl_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ bool del_bulk = !!(nlh->nlmsg_flags & NLM_F_BULK);
+ struct nlattr *tb[MDBA_SET_ENTRY_MAX + 1];
+ struct net *net = sock_net(skb->sk);
+ struct br_port_msg *bpm;
+ struct net_device *dev;
+ int err;
+
+ if (!del_bulk)
+ err = nlmsg_parse_deprecated(nlh, sizeof(*bpm), tb,
+ MDBA_SET_ENTRY_MAX, mdba_policy,
+ extack);
+ else
+ err = nlmsg_parse(nlh, sizeof(*bpm), tb, MDBA_SET_ENTRY_MAX,
+ mdba_del_bulk_policy, extack);
+ if (err)
+ return err;
+
+ bpm = nlmsg_data(nlh);
+ if (!bpm->ifindex) {
+ NL_SET_ERR_MSG(extack, "Invalid ifindex");
+ return -EINVAL;
+ }
+
+ dev = __dev_get_by_index(net, bpm->ifindex);
+ if (!dev) {
+ NL_SET_ERR_MSG(extack, "Device doesn't exist");
+ return -ENODEV;
+ }
+
+ if (NL_REQ_ATTR_CHECK(extack, NULL, tb, MDBA_SET_ENTRY)) {
+ NL_SET_ERR_MSG(extack, "Missing MDBA_SET_ENTRY attribute");
+ return -EINVAL;
+ }
+
+ if (del_bulk) {
+ if (!dev->netdev_ops->ndo_mdb_del_bulk) {
+ NL_SET_ERR_MSG(extack, "Device does not support MDB bulk deletion");
+ return -EOPNOTSUPP;
+ }
+ return dev->netdev_ops->ndo_mdb_del_bulk(dev, tb, extack);
+ }
+
+ if (!dev->netdev_ops->ndo_mdb_del) {
+ NL_SET_ERR_MSG(extack, "Device does not support MDB operations");
+ return -EOPNOTSUPP;
+ }
+
+ return dev->netdev_ops->ndo_mdb_del(dev, tb, extack);
+}
+
+/* Process one rtnetlink message. */
+
+static int rtnl_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ const bool needs_lock = !(cb->flags & RTNL_FLAG_DUMP_UNLOCKED);
+ rtnl_dumpit_func dumpit = cb->data;
+ int err;
+
+ /* Previous iteration have already finished, avoid calling->dumpit()
+ * again, it may not expect to be called after it reached the end.
+ */
+ if (!dumpit)
+ return 0;
+
+ if (needs_lock)
+ rtnl_lock();
+ err = dumpit(skb, cb);
+ if (needs_lock)
+ rtnl_unlock();
+
+ /* Old dump handlers used to send NLM_DONE as in a separate recvmsg().
+ * Some applications which parse netlink manually depend on this.
+ */
+ if (cb->flags & RTNL_FLAG_DUMP_SPLIT_NLM_DONE) {
+ if (err < 0 && err != -EMSGSIZE)
+ return err;
+ if (!err)
+ cb->data = NULL;
+
+ return skb->len;
+ }
+ return err;
+}
+
+static int rtnetlink_dump_start(struct sock *ssk, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ struct netlink_dump_control *control)
+{
+ if (control->flags & RTNL_FLAG_DUMP_SPLIT_NLM_DONE ||
+ !(control->flags & RTNL_FLAG_DUMP_UNLOCKED)) {
+ WARN_ON(control->data);
+ control->data = control->dump;
+ control->dump = rtnl_dumpit;
+ }
+
+ return netlink_dump_start(ssk, skb, nlh, control);
+}
+
+static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(skb->sk);
+ struct rtnl_link *link;
+ enum rtnl_kinds kind;
+ struct module *owner;
+ int err = -EOPNOTSUPP;
+ rtnl_doit_func doit;
+ unsigned int flags;
+ int family;
+ int type;
+
+ type = nlh->nlmsg_type;
+ if (type > RTM_MAX)
+ return -EOPNOTSUPP;
+
+ type -= RTM_BASE;
+
+ /* All the messages must have at least 1 byte length */
+ if (nlmsg_len(nlh) < sizeof(struct rtgenmsg))
+ return 0;
+
+ family = ((struct rtgenmsg *)nlmsg_data(nlh))->rtgen_family;
+ kind = rtnl_msgtype_kind(type);
+
+ if (kind != RTNL_KIND_GET && !netlink_net_capable(skb, CAP_NET_ADMIN))
+ return -EPERM;
+
+ rcu_read_lock();
+ if (kind == RTNL_KIND_GET && (nlh->nlmsg_flags & NLM_F_DUMP)) {
+ struct sock *rtnl;
+ rtnl_dumpit_func dumpit;
+ u32 min_dump_alloc = 0;
+
+ link = rtnl_get_link(family, type);
+ if (!link || !link->dumpit) {
+ family = PF_UNSPEC;
+ link = rtnl_get_link(family, type);
+ if (!link || !link->dumpit)
+ goto err_unlock;
+ }
+ owner = link->owner;
+ dumpit = link->dumpit;
+ flags = link->flags;
+
+ if (type == RTM_GETLINK - RTM_BASE)
+ min_dump_alloc = rtnl_calcit(skb, nlh);
+
+ err = 0;
+ /* need to do this before rcu_read_unlock() */
+ if (!try_module_get(owner))
+ err = -EPROTONOSUPPORT;
+
+ rcu_read_unlock();
+
+ rtnl = net->rtnl;
+ if (err == 0) {
+ struct netlink_dump_control c = {
+ .dump = dumpit,
+ .min_dump_alloc = min_dump_alloc,
+ .module = owner,
+ .flags = flags,
+ };
+ err = rtnetlink_dump_start(rtnl, skb, nlh, &c);
+ /* netlink_dump_start() will keep a reference on
+ * module if dump is still in progress.
+ */
+ module_put(owner);
+ }
+ return err;
+ }
+
+ link = rtnl_get_link(family, type);
+ if (!link || !link->doit) {
+ family = PF_UNSPEC;
+ link = rtnl_get_link(PF_UNSPEC, type);
+ if (!link || !link->doit)
+ goto out_unlock;
+ }
+
+ owner = link->owner;
+ if (!try_module_get(owner)) {
+ err = -EPROTONOSUPPORT;
+ goto out_unlock;
+ }
+
+ flags = link->flags;
+ if (kind == RTNL_KIND_DEL && (nlh->nlmsg_flags & NLM_F_BULK) &&
+ !(flags & RTNL_FLAG_BULK_DEL_SUPPORTED)) {
+ NL_SET_ERR_MSG(extack, "Bulk delete is not supported");
+ module_put(owner);
+ goto err_unlock;
+ }
+
+ if (flags & RTNL_FLAG_DOIT_UNLOCKED) {
+ doit = link->doit;
+ rcu_read_unlock();
+ if (doit)
+ err = doit(skb, nlh, extack);
+ module_put(owner);
+ return err;
+ }
+ rcu_read_unlock();
+
+ rtnl_lock();
+ link = rtnl_get_link(family, type);
+ if (link && link->doit)
+ err = link->doit(skb, nlh, extack);
+ rtnl_unlock();
+
+ module_put(owner);
+
+ return err;
+
+out_unlock:
+ rcu_read_unlock();
+ return err;
+
+err_unlock:
+ rcu_read_unlock();
+ return -EOPNOTSUPP;
+}
+
+static void rtnetlink_rcv(struct sk_buff *skb)
+{
+ netlink_rcv_skb(skb, &rtnetlink_rcv_msg);
+}
+
+static int rtnetlink_bind(struct net *net, int group)
+{
+ switch (group) {
+ case RTNLGRP_IPV4_MROUTE_R:
+ case RTNLGRP_IPV6_MROUTE_R:
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+ break;
+ }
+ return 0;
+}
+
static int rtnetlink_event(struct notifier_block *this, unsigned long event, void *ptr)
{
- struct net_device *dev = ptr;
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+
switch (event) {
- case NETDEV_UNREGISTER:
- rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
- break;
- case NETDEV_REGISTER:
- rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
- break;
- case NETDEV_UP:
- case NETDEV_DOWN:
- rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
- break;
- case NETDEV_CHANGE:
- case NETDEV_GOING_DOWN:
+ case NETDEV_REBOOT:
+ case NETDEV_CHANGEMTU:
+ case NETDEV_CHANGEADDR:
+ case NETDEV_CHANGENAME:
+ case NETDEV_FEAT_CHANGE:
+ case NETDEV_BONDING_FAILOVER:
+ case NETDEV_POST_TYPE_CHANGE:
+ case NETDEV_NOTIFY_PEERS:
+ case NETDEV_CHANGEUPPER:
+ case NETDEV_RESEND_IGMP:
+ case NETDEV_CHANGEINFODATA:
+ case NETDEV_CHANGELOWERSTATE:
+ case NETDEV_CHANGE_TX_QUEUE_LEN:
+ rtmsg_ifinfo_event(RTM_NEWLINK, dev, 0, rtnl_get_event(event),
+ GFP_KERNEL, NULL, 0, 0, NULL);
break;
default:
- rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
break;
}
return NOTIFY_DONE;
@@ -854,36 +7028,76 @@ static struct notifier_block rtnetlink_dev_notifier = {
.notifier_call = rtnetlink_event,
};
-void __init rtnetlink_init(void)
+
+static int __net_init rtnetlink_net_init(struct net *net)
{
- int i;
+ struct sock *sk;
+ struct netlink_kernel_cfg cfg = {
+ .groups = RTNLGRP_MAX,
+ .input = rtnetlink_rcv,
+ .flags = NL_CFG_F_NONROOT_RECV,
+ .bind = rtnetlink_bind,
+ };
+
+ sk = netlink_kernel_create(net, NETLINK_ROUTE, &cfg);
+ if (!sk)
+ return -ENOMEM;
+ net->rtnl = sk;
+ return 0;
+}
+
+static void __net_exit rtnetlink_net_exit(struct net *net)
+{
+ netlink_kernel_release(net->rtnl);
+ net->rtnl = NULL;
+}
- rtattr_max = 0;
- for (i = 0; i < ARRAY_SIZE(rta_max); i++)
- if (rta_max[i] > rtattr_max)
- rtattr_max = rta_max[i];
- rta_buf = kmalloc(rtattr_max * sizeof(struct rtattr *), GFP_KERNEL);
- if (!rta_buf)
- panic("rtnetlink_init: cannot allocate rta_buf\n");
-
- rtnl = netlink_kernel_create(NETLINK_ROUTE, RTNLGRP_MAX, rtnetlink_rcv,
- THIS_MODULE);
- if (rtnl == NULL)
+static struct pernet_operations rtnetlink_net_ops = {
+ .init = rtnetlink_net_init,
+ .exit = rtnetlink_net_exit,
+};
+
+static const struct rtnl_msg_handler rtnetlink_rtnl_msg_handlers[] __initconst = {
+ {.msgtype = RTM_NEWLINK, .doit = rtnl_newlink,
+ .flags = RTNL_FLAG_DOIT_PERNET},
+ {.msgtype = RTM_DELLINK, .doit = rtnl_dellink,
+ .flags = RTNL_FLAG_DOIT_PERNET_WIP},
+ {.msgtype = RTM_GETLINK, .doit = rtnl_getlink,
+ .dumpit = rtnl_dump_ifinfo, .flags = RTNL_FLAG_DUMP_SPLIT_NLM_DONE},
+ {.msgtype = RTM_SETLINK, .doit = rtnl_setlink,
+ .flags = RTNL_FLAG_DOIT_PERNET_WIP},
+ {.msgtype = RTM_GETADDR, .dumpit = rtnl_dump_all},
+ {.msgtype = RTM_GETROUTE, .dumpit = rtnl_dump_all},
+ {.msgtype = RTM_GETNETCONF, .dumpit = rtnl_dump_all},
+ {.msgtype = RTM_GETSTATS, .doit = rtnl_stats_get,
+ .dumpit = rtnl_stats_dump},
+ {.msgtype = RTM_SETSTATS, .doit = rtnl_stats_set},
+ {.msgtype = RTM_NEWLINKPROP, .doit = rtnl_newlinkprop},
+ {.msgtype = RTM_DELLINKPROP, .doit = rtnl_dellinkprop},
+ {.protocol = PF_BRIDGE, .msgtype = RTM_GETLINK,
+ .dumpit = rtnl_bridge_getlink},
+ {.protocol = PF_BRIDGE, .msgtype = RTM_DELLINK,
+ .doit = rtnl_bridge_dellink},
+ {.protocol = PF_BRIDGE, .msgtype = RTM_SETLINK,
+ .doit = rtnl_bridge_setlink},
+ {.protocol = PF_BRIDGE, .msgtype = RTM_NEWNEIGH, .doit = rtnl_fdb_add},
+ {.protocol = PF_BRIDGE, .msgtype = RTM_DELNEIGH, .doit = rtnl_fdb_del,
+ .flags = RTNL_FLAG_BULK_DEL_SUPPORTED},
+ {.protocol = PF_BRIDGE, .msgtype = RTM_GETNEIGH, .doit = rtnl_fdb_get,
+ .dumpit = rtnl_fdb_dump},
+ {.protocol = PF_BRIDGE, .msgtype = RTM_NEWMDB, .doit = rtnl_mdb_add},
+ {.protocol = PF_BRIDGE, .msgtype = RTM_DELMDB, .doit = rtnl_mdb_del,
+ .flags = RTNL_FLAG_BULK_DEL_SUPPORTED},
+ {.protocol = PF_BRIDGE, .msgtype = RTM_GETMDB, .doit = rtnl_mdb_get,
+ .dumpit = rtnl_mdb_dump},
+};
+
+void __init rtnetlink_init(void)
+{
+ if (register_pernet_subsys(&rtnetlink_net_ops))
panic("rtnetlink_init: cannot initialize rtnetlink\n");
- netlink_set_nonroot(NETLINK_ROUTE, NL_NONROOT_RECV);
+
register_netdevice_notifier(&rtnetlink_dev_notifier);
- rtnetlink_links[PF_UNSPEC] = link_rtnetlink_table;
- rtnetlink_links[PF_PACKET] = link_rtnetlink_table;
-}
-EXPORT_SYMBOL(__rta_fill);
-EXPORT_SYMBOL(rtattr_strlcpy);
-EXPORT_SYMBOL(rtattr_parse);
-EXPORT_SYMBOL(rtnetlink_links);
-EXPORT_SYMBOL(rtnetlink_put_metrics);
-EXPORT_SYMBOL(rtnl_lock);
-EXPORT_SYMBOL(rtnl_trylock);
-EXPORT_SYMBOL(rtnl_unlock);
-EXPORT_SYMBOL(rtnl_unicast);
-EXPORT_SYMBOL(rtnl_notify);
-EXPORT_SYMBOL(rtnl_set_sk_err);
+ rtnl_register_many(rtnetlink_rtnl_msg_handlers);
+}
diff --git a/net/core/scm.c b/net/core/scm.c
index 271cf060ef8c..cd87f66671aa 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* scm.c - Socket level control messages processing.
*
* Author: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
* Alignment and value checking mods by Craig Metz
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/module.h>
@@ -14,6 +10,7 @@
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/sched.h>
+#include <linux/sched/user.h>
#include <linux/mm.h>
#include <linux/kernel.h>
#include <linux/stat.h>
@@ -24,29 +21,46 @@
#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <linux/security.h>
+#include <linux/pid_namespace.h>
+#include <linux/pid.h>
+#include <uapi/linux/pidfd.h>
+#include <linux/pidfs.h>
+#include <linux/nsproxy.h>
+#include <linux/slab.h>
+#include <linux/errqueue.h>
+#include <linux/io_uring.h>
-#include <asm/system.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/compat.h>
#include <net/scm.h>
+#include <net/cls_cgroup.h>
+#include <net/af_unix.h>
/*
- * Only allow a user to send credentials, that they could set with
+ * Only allow a user to send credentials, that they could set with
* setu(g)id.
*/
static __inline__ int scm_check_creds(struct ucred *creds)
{
- if ((creds->pid == current->tgid || capable(CAP_SYS_ADMIN)) &&
- ((creds->uid == current->uid || creds->uid == current->euid ||
- creds->uid == current->suid) || capable(CAP_SETUID)) &&
- ((creds->gid == current->gid || creds->gid == current->egid ||
- creds->gid == current->sgid) || capable(CAP_SETGID))) {
+ const struct cred *cred = current_cred();
+ kuid_t uid = make_kuid(cred->user_ns, creds->uid);
+ kgid_t gid = make_kgid(cred->user_ns, creds->gid);
+
+ if (!uid_valid(uid) || !gid_valid(gid))
+ return -EINVAL;
+
+ if ((creds->pid == task_tgid_vnr(current) ||
+ ns_capable(task_active_pid_ns(current)->user_ns, CAP_SYS_ADMIN)) &&
+ ((uid_eq(uid, cred->uid) || uid_eq(uid, cred->euid) ||
+ uid_eq(uid, cred->suid)) || ns_capable(cred->user_ns, CAP_SETUID)) &&
+ ((gid_eq(gid, cred->gid) || gid_eq(gid, cred->egid) ||
+ gid_eq(gid, cred->sgid)) || ns_capable(cred->user_ns, CAP_SETGID))) {
return 0;
}
return -EPERM;
@@ -59,7 +73,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
struct file **fpp;
int i, num;
- num = (cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr)))/sizeof(int);
+ num = (cmsg->cmsg_len - sizeof(struct cmsghdr))/sizeof(int);
if (num <= 0)
return 0;
@@ -69,31 +83,52 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
if (!fpl)
{
- fpl = kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL);
+ fpl = kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL_ACCOUNT);
if (!fpl)
return -ENOMEM;
*fplp = fpl;
fpl->count = 0;
+ fpl->count_unix = 0;
+ fpl->max = SCM_MAX_FD;
+ fpl->user = NULL;
+#if IS_ENABLED(CONFIG_UNIX)
+ fpl->inflight = false;
+ fpl->dead = false;
+ fpl->edges = NULL;
+ INIT_LIST_HEAD(&fpl->vertices);
+#endif
}
fpp = &fpl->fp[fpl->count];
- if (fpl->count + num > SCM_MAX_FD)
+ if (fpl->count + num > fpl->max)
return -EINVAL;
-
+
/*
* Verify the descriptors and increment the usage count.
*/
-
+
for (i=0; i< num; i++)
{
int fd = fdp[i];
struct file *file;
- if (fd < 0 || !(file = fget(fd)))
+ if (fd < 0 || !(file = fget_raw(fd)))
return -EBADF;
+ /* don't allow io_uring files */
+ if (io_is_uring_fops(file)) {
+ fput(file);
+ return -EINVAL;
+ }
+ if (unix_get_socket(file))
+ fpl->count_unix++;
+
*fpp++ = file;
fpl->count++;
}
+
+ if (!fpl->user)
+ fpl->user = get_uid(current_user());
+
return num;
}
@@ -106,24 +141,42 @@ void __scm_destroy(struct scm_cookie *scm)
scm->fp = NULL;
for (i=fpl->count-1; i>=0; i--)
fput(fpl->fp[i]);
+ free_uid(fpl->user);
kfree(fpl);
}
}
+EXPORT_SYMBOL(__scm_destroy);
+
+static inline int scm_replace_pid(struct scm_cookie *scm, struct pid *pid)
+{
+ int err;
+
+ /* drop all previous references */
+ scm_destroy_cred(scm);
+
+ err = pidfs_register_pid(pid);
+ if (unlikely(err))
+ return err;
+
+ scm->pid = pid;
+ scm->creds.pid = pid_vnr(pid);
+ return 0;
+}
int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p)
{
+ const struct proto_ops *ops = READ_ONCE(sock->ops);
struct cmsghdr *cmsg;
int err;
- for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg))
- {
+ for_each_cmsghdr(cmsg, msg) {
err = -EINVAL;
/* Verify that cmsg_len is at least sizeof(struct cmsghdr) */
/* The first check was omitted in <= 2.2.5. The reasoning was
that parser checks cmsg_len in any case, so that
additional check would be work duplication.
- But if cmsg_level is not SOL_SOCKET, we do not check
+ But if cmsg_level is not SOL_SOCKET, we do not check
for too short ancillary data object at all! Oops.
OK, let's add it...
*/
@@ -136,18 +189,51 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p)
switch (cmsg->cmsg_type)
{
case SCM_RIGHTS:
+ if (!ops || ops->family != PF_UNIX)
+ goto error;
err=scm_fp_copy(cmsg, &p->fp);
if (err<0)
goto error;
break;
case SCM_CREDENTIALS:
+ {
+ struct ucred creds;
+ kuid_t uid;
+ kgid_t gid;
if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct ucred)))
goto error;
- memcpy(&p->creds, CMSG_DATA(cmsg), sizeof(struct ucred));
- err = scm_check_creds(&p->creds);
+ memcpy(&creds, CMSG_DATA(cmsg), sizeof(struct ucred));
+ err = scm_check_creds(&creds);
if (err)
goto error;
+
+ if (!p->pid || pid_vnr(p->pid) != creds.pid) {
+ struct pid *pid;
+ err = -ESRCH;
+ pid = find_get_pid(creds.pid);
+ if (!pid)
+ goto error;
+
+ /* pass a struct pid reference from
+ * find_get_pid() to scm_replace_pid().
+ */
+ err = scm_replace_pid(p, pid);
+ if (err) {
+ put_pid(pid);
+ goto error;
+ }
+ }
+
+ err = -EINVAL;
+ uid = make_kuid(current_user_ns(), creds.uid);
+ gid = make_kgid(current_user_ns(), creds.gid);
+ if (!uid_valid(uid) || !gid_valid(gid))
+ goto error;
+
+ p->creds.uid = uid;
+ p->creds.gid = gid;
break;
+ }
default:
goto error;
}
@@ -159,23 +245,21 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p)
p->fp = NULL;
}
return 0;
-
+
error:
scm_destroy(p);
return err;
}
+EXPORT_SYMBOL(__scm_send);
int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data)
{
- struct cmsghdr __user *cm = (struct cmsghdr __user *)msg->msg_control;
- struct cmsghdr cmhdr;
int cmlen = CMSG_LEN(len);
- int err;
- if (MSG_CMSG_COMPAT & msg->msg_flags)
+ if (msg->msg_flags & MSG_CMSG_COMPAT)
return put_cmsg_compat(msg, level, type, len, data);
- if (cm==NULL || msg->msg_controllen < sizeof(*cm)) {
+ if (!msg->msg_control || msg->msg_controllen < sizeof(struct cmsghdr)) {
msg->msg_flags |= MSG_CTRUNC;
return 0; /* XXX: return error? check spec. */
}
@@ -183,88 +267,137 @@ int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data)
msg->msg_flags |= MSG_CTRUNC;
cmlen = msg->msg_controllen;
}
- cmhdr.cmsg_level = level;
- cmhdr.cmsg_type = type;
- cmhdr.cmsg_len = cmlen;
-
- err = -EFAULT;
- if (copy_to_user(cm, &cmhdr, sizeof cmhdr))
- goto out;
- if (copy_to_user(CMSG_DATA(cm), data, cmlen - sizeof(struct cmsghdr)))
- goto out;
- cmlen = CMSG_SPACE(len);
- msg->msg_control += cmlen;
+
+ if (msg->msg_control_is_user) {
+ struct cmsghdr __user *cm = msg->msg_control_user;
+
+ check_object_size(data, cmlen - sizeof(*cm), true);
+
+ scoped_user_write_access_size(cm, cmlen, efault) {
+ unsafe_put_user(cmlen, &cm->cmsg_len, efault);
+ unsafe_put_user(level, &cm->cmsg_level, efault);
+ unsafe_put_user(type, &cm->cmsg_type, efault);
+ unsafe_copy_to_user(CMSG_USER_DATA(cm), data,
+ cmlen - sizeof(*cm), efault);
+ }
+ } else {
+ struct cmsghdr *cm = msg->msg_control;
+
+ cm->cmsg_level = level;
+ cm->cmsg_type = type;
+ cm->cmsg_len = cmlen;
+ memcpy(CMSG_DATA(cm), data, cmlen - sizeof(*cm));
+ }
+
+ cmlen = min(CMSG_SPACE(len), msg->msg_controllen);
+ if (msg->msg_control_is_user)
+ msg->msg_control_user += cmlen;
+ else
+ msg->msg_control += cmlen;
msg->msg_controllen -= cmlen;
- err = 0;
-out:
- return err;
+ return 0;
+
+efault:
+ return -EFAULT;
}
+EXPORT_SYMBOL(put_cmsg);
-void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm)
+int put_cmsg_notrunc(struct msghdr *msg, int level, int type, int len,
+ void *data)
+{
+ /* Don't produce truncated CMSGs */
+ if (!msg->msg_control || msg->msg_controllen < CMSG_LEN(len))
+ return -ETOOSMALL;
+
+ return put_cmsg(msg, level, type, len, data);
+}
+
+void put_cmsg_scm_timestamping64(struct msghdr *msg, struct scm_timestamping_internal *tss_internal)
{
- struct cmsghdr __user *cm = (struct cmsghdr __user*)msg->msg_control;
+ struct scm_timestamping64 tss;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(tss.ts); i++) {
+ tss.ts[i].tv_sec = tss_internal->ts[i].tv_sec;
+ tss.ts[i].tv_nsec = tss_internal->ts[i].tv_nsec;
+ }
+
+ put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPING_NEW, sizeof(tss), &tss);
+}
+EXPORT_SYMBOL(put_cmsg_scm_timestamping64);
+
+void put_cmsg_scm_timestamping(struct msghdr *msg, struct scm_timestamping_internal *tss_internal)
+{
+ struct scm_timestamping tss;
+ int i;
- int fdmax = 0;
- int fdnum = scm->fp->count;
- struct file **fp = scm->fp->fp;
- int __user *cmfptr;
+ for (i = 0; i < ARRAY_SIZE(tss.ts); i++) {
+ tss.ts[i].tv_sec = tss_internal->ts[i].tv_sec;
+ tss.ts[i].tv_nsec = tss_internal->ts[i].tv_nsec;
+ }
+
+ put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPING_OLD, sizeof(tss), &tss);
+}
+EXPORT_SYMBOL(put_cmsg_scm_timestamping);
+
+static int scm_max_fds(struct msghdr *msg)
+{
+ if (msg->msg_controllen <= sizeof(struct cmsghdr))
+ return 0;
+ return (msg->msg_controllen - sizeof(struct cmsghdr)) / sizeof(int);
+}
+
+void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm)
+{
+ struct cmsghdr __user *cm =
+ (__force struct cmsghdr __user *)msg->msg_control_user;
+ unsigned int o_flags = (msg->msg_flags & MSG_CMSG_CLOEXEC) ? O_CLOEXEC : 0;
+ int fdmax = min_t(int, scm_max_fds(msg), scm->fp->count);
+ int __user *cmsg_data = CMSG_USER_DATA(cm);
int err = 0, i;
- if (MSG_CMSG_COMPAT & msg->msg_flags) {
+ /* no use for FD passing from kernel space callers */
+ if (WARN_ON_ONCE(!msg->msg_control_is_user))
+ return;
+
+ if (msg->msg_flags & MSG_CMSG_COMPAT) {
scm_detach_fds_compat(msg, scm);
return;
}
- if (msg->msg_controllen > sizeof(struct cmsghdr))
- fdmax = ((msg->msg_controllen - sizeof(struct cmsghdr))
- / sizeof(int));
-
- if (fdnum < fdmax)
- fdmax = fdnum;
-
- for (i=0, cmfptr=(int __user *)CMSG_DATA(cm); i<fdmax; i++, cmfptr++)
- {
- int new_fd;
- err = security_file_receive(fp[i]);
- if (err)
- break;
- err = get_unused_fd();
+ for (i = 0; i < fdmax; i++) {
+ err = scm_recv_one_fd(scm->fp->fp[i], cmsg_data + i, o_flags);
if (err < 0)
break;
- new_fd = err;
- err = put_user(new_fd, cmfptr);
- if (err) {
- put_unused_fd(new_fd);
- break;
- }
- /* Bump the usage count and install the file. */
- get_file(fp[i]);
- fd_install(new_fd, fp[i]);
}
- if (i > 0)
- {
- int cmlen = CMSG_LEN(i*sizeof(int));
+ if (i > 0) {
+ int cmlen = CMSG_LEN(i * sizeof(int));
+
err = put_user(SOL_SOCKET, &cm->cmsg_level);
if (!err)
err = put_user(SCM_RIGHTS, &cm->cmsg_type);
if (!err)
err = put_user(cmlen, &cm->cmsg_len);
if (!err) {
- cmlen = CMSG_SPACE(i*sizeof(int));
- msg->msg_control += cmlen;
+ cmlen = CMSG_SPACE(i * sizeof(int));
+ if (msg->msg_controllen < cmlen)
+ cmlen = msg->msg_controllen;
+ msg->msg_control_user += cmlen;
msg->msg_controllen -= cmlen;
}
}
- if (i < fdnum || (fdnum && fdmax <= 0))
+
+ if (i < scm->fp->count || (scm->fp->count && fdmax <= 0))
msg->msg_flags |= MSG_CTRUNC;
/*
- * All of the files that fit in the message have had their
- * usage counts incremented, so we just free the list.
+ * All of the files that fit in the message have had their usage counts
+ * incremented, so we just free the list.
*/
__scm_destroy(scm);
}
+EXPORT_SYMBOL(scm_detach_fds);
struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl)
{
@@ -274,17 +407,142 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl)
if (!fpl)
return NULL;
- new_fpl = kmalloc(sizeof(*fpl), GFP_KERNEL);
+ new_fpl = kmemdup(fpl, offsetof(struct scm_fp_list, fp[fpl->count]),
+ GFP_KERNEL_ACCOUNT);
if (new_fpl) {
- for (i=fpl->count-1; i>=0; i--)
+ for (i = 0; i < fpl->count; i++)
get_file(fpl->fp[i]);
- memcpy(new_fpl, fpl, sizeof(*fpl));
+
+ new_fpl->max = new_fpl->count;
+ new_fpl->user = get_uid(fpl->user);
+#if IS_ENABLED(CONFIG_UNIX)
+ new_fpl->inflight = false;
+ new_fpl->edges = NULL;
+ INIT_LIST_HEAD(&new_fpl->vertices);
+#endif
}
return new_fpl;
}
-
-EXPORT_SYMBOL(__scm_destroy);
-EXPORT_SYMBOL(__scm_send);
-EXPORT_SYMBOL(put_cmsg);
-EXPORT_SYMBOL(scm_detach_fds);
EXPORT_SYMBOL(scm_fp_dup);
+
+#ifdef CONFIG_SECURITY_NETWORK
+static void scm_passec(struct sock *sk, struct msghdr *msg, struct scm_cookie *scm)
+{
+ struct lsm_context ctx;
+ int err;
+
+ if (sk->sk_scm_security) {
+ err = security_secid_to_secctx(scm->secid, &ctx);
+
+ if (err >= 0) {
+ put_cmsg(msg, SOL_SOCKET, SCM_SECURITY, ctx.len,
+ ctx.context);
+
+ security_release_secctx(&ctx);
+ }
+ }
+}
+
+static bool scm_has_secdata(struct sock *sk)
+{
+ return sk->sk_scm_security;
+}
+#else
+static void scm_passec(struct sock *sk, struct msghdr *msg, struct scm_cookie *scm)
+{
+}
+
+static bool scm_has_secdata(struct sock *sk)
+{
+ return false;
+}
+#endif
+
+static void scm_pidfd_recv(struct msghdr *msg, struct scm_cookie *scm)
+{
+ struct file *pidfd_file = NULL;
+ int len, pidfd;
+
+ /* put_cmsg() doesn't return an error if CMSG is truncated,
+ * that's why we need to opencode these checks here.
+ */
+ if (msg->msg_flags & MSG_CMSG_COMPAT)
+ len = sizeof(struct compat_cmsghdr) + sizeof(int);
+ else
+ len = sizeof(struct cmsghdr) + sizeof(int);
+
+ if (msg->msg_controllen < len) {
+ msg->msg_flags |= MSG_CTRUNC;
+ return;
+ }
+
+ if (!scm->pid)
+ return;
+
+ pidfd = pidfd_prepare(scm->pid, PIDFD_STALE, &pidfd_file);
+
+ if (put_cmsg(msg, SOL_SOCKET, SCM_PIDFD, sizeof(int), &pidfd)) {
+ if (pidfd_file) {
+ put_unused_fd(pidfd);
+ fput(pidfd_file);
+ }
+
+ return;
+ }
+
+ if (pidfd_file)
+ fd_install(pidfd, pidfd_file);
+}
+
+static bool __scm_recv_common(struct sock *sk, struct msghdr *msg,
+ struct scm_cookie *scm, int flags)
+{
+ if (!msg->msg_control) {
+ if (sk->sk_scm_credentials || sk->sk_scm_pidfd ||
+ scm->fp || scm_has_secdata(sk))
+ msg->msg_flags |= MSG_CTRUNC;
+
+ scm_destroy(scm);
+ return false;
+ }
+
+ if (sk->sk_scm_credentials) {
+ struct user_namespace *current_ns = current_user_ns();
+ struct ucred ucreds = {
+ .pid = scm->creds.pid,
+ .uid = from_kuid_munged(current_ns, scm->creds.uid),
+ .gid = from_kgid_munged(current_ns, scm->creds.gid),
+ };
+
+ put_cmsg(msg, SOL_SOCKET, SCM_CREDENTIALS, sizeof(ucreds), &ucreds);
+ }
+
+ scm_passec(sk, msg, scm);
+
+ if (scm->fp)
+ scm_detach_fds(msg, scm);
+
+ return true;
+}
+
+void scm_recv(struct socket *sock, struct msghdr *msg,
+ struct scm_cookie *scm, int flags)
+{
+ if (!__scm_recv_common(sock->sk, msg, scm, flags))
+ return;
+
+ scm_destroy_cred(scm);
+}
+EXPORT_SYMBOL(scm_recv);
+
+void scm_recv_unix(struct socket *sock, struct msghdr *msg,
+ struct scm_cookie *scm, int flags)
+{
+ if (!__scm_recv_common(sock->sk, msg, scm, flags))
+ return;
+
+ if (sock->sk->sk_scm_pidfd)
+ scm_pidfd_recv(msg, scm);
+
+ scm_destroy_cred(scm);
+}
diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c
new file mode 100644
index 000000000000..9a3965680451
--- /dev/null
+++ b/net/core/secure_seq.c
@@ -0,0 +1,158 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2016 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/cache.h>
+#include <linux/random.h>
+#include <linux/hrtimer.h>
+#include <linux/ktime.h>
+#include <linux/string.h>
+#include <linux/net.h>
+#include <linux/siphash.h>
+#include <net/secure_seq.h>
+
+#if IS_ENABLED(CONFIG_IPV6) || IS_ENABLED(CONFIG_INET)
+#include <linux/in6.h>
+#include <net/tcp.h>
+
+static siphash_aligned_key_t net_secret;
+static siphash_aligned_key_t ts_secret;
+
+#define EPHEMERAL_PORT_SHUFFLE_PERIOD (10 * HZ)
+
+static __always_inline void net_secret_init(void)
+{
+ net_get_random_once(&net_secret, sizeof(net_secret));
+}
+
+static __always_inline void ts_secret_init(void)
+{
+ net_get_random_once(&ts_secret, sizeof(ts_secret));
+}
+#endif
+
+#ifdef CONFIG_INET
+static u32 seq_scale(u32 seq)
+{
+ /*
+ * As close as possible to RFC 793, which
+ * suggests using a 250 kHz clock.
+ * Further reading shows this assumes 2 Mb/s networks.
+ * For 10 Mb/s Ethernet, a 1 MHz clock is appropriate.
+ * For 10 Gb/s Ethernet, a 1 GHz clock should be ok, but
+ * we also need to limit the resolution so that the u32 seq
+ * overlaps less than one time per MSL (2 minutes).
+ * Choosing a clock of 64 ns period is OK. (period of 274 s)
+ */
+ return seq + (ktime_get_real_ns() >> 6);
+}
+#endif
+
+#if IS_ENABLED(CONFIG_IPV6)
+u32 secure_tcpv6_ts_off(const struct net *net,
+ const __be32 *saddr, const __be32 *daddr)
+{
+ const struct {
+ struct in6_addr saddr;
+ struct in6_addr daddr;
+ } __aligned(SIPHASH_ALIGNMENT) combined = {
+ .saddr = *(struct in6_addr *)saddr,
+ .daddr = *(struct in6_addr *)daddr,
+ };
+
+ if (READ_ONCE(net->ipv4.sysctl_tcp_timestamps) != 1)
+ return 0;
+
+ ts_secret_init();
+ return siphash(&combined, offsetofend(typeof(combined), daddr),
+ &ts_secret);
+}
+EXPORT_IPV6_MOD(secure_tcpv6_ts_off);
+
+u32 secure_tcpv6_seq(const __be32 *saddr, const __be32 *daddr,
+ __be16 sport, __be16 dport)
+{
+ const struct {
+ struct in6_addr saddr;
+ struct in6_addr daddr;
+ __be16 sport;
+ __be16 dport;
+ } __aligned(SIPHASH_ALIGNMENT) combined = {
+ .saddr = *(struct in6_addr *)saddr,
+ .daddr = *(struct in6_addr *)daddr,
+ .sport = sport,
+ .dport = dport
+ };
+ u32 hash;
+
+ net_secret_init();
+ hash = siphash(&combined, offsetofend(typeof(combined), dport),
+ &net_secret);
+ return seq_scale(hash);
+}
+EXPORT_SYMBOL(secure_tcpv6_seq);
+
+u64 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr,
+ __be16 dport)
+{
+ const struct {
+ struct in6_addr saddr;
+ struct in6_addr daddr;
+ unsigned int timeseed;
+ __be16 dport;
+ } __aligned(SIPHASH_ALIGNMENT) combined = {
+ .saddr = *(struct in6_addr *)saddr,
+ .daddr = *(struct in6_addr *)daddr,
+ .timeseed = jiffies / EPHEMERAL_PORT_SHUFFLE_PERIOD,
+ .dport = dport,
+ };
+ net_secret_init();
+ return siphash(&combined, offsetofend(typeof(combined), dport),
+ &net_secret);
+}
+EXPORT_SYMBOL(secure_ipv6_port_ephemeral);
+#endif
+
+#ifdef CONFIG_INET
+u32 secure_tcp_ts_off(const struct net *net, __be32 saddr, __be32 daddr)
+{
+ if (READ_ONCE(net->ipv4.sysctl_tcp_timestamps) != 1)
+ return 0;
+
+ ts_secret_init();
+ return siphash_2u32((__force u32)saddr, (__force u32)daddr,
+ &ts_secret);
+}
+
+/* secure_tcp_seq_and_tsoff(a, b, 0, d) == secure_ipv4_port_ephemeral(a, b, d),
+ * but fortunately, `sport' cannot be 0 in any circumstances. If this changes,
+ * it would be easy enough to have the former function use siphash_4u32, passing
+ * the arguments as separate u32.
+ */
+u32 secure_tcp_seq(__be32 saddr, __be32 daddr,
+ __be16 sport, __be16 dport)
+{
+ u32 hash;
+
+ net_secret_init();
+ hash = siphash_3u32((__force u32)saddr, (__force u32)daddr,
+ (__force u32)sport << 16 | (__force u32)dport,
+ &net_secret);
+ return seq_scale(hash);
+}
+EXPORT_SYMBOL_GPL(secure_tcp_seq);
+
+u64 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport)
+{
+ net_secret_init();
+ return siphash_4u32((__force u32)saddr, (__force u32)daddr,
+ (__force u16)dport,
+ jiffies / EPHEMERAL_PORT_SHUFFLE_PERIOD,
+ &net_secret);
+}
+EXPORT_SYMBOL_GPL(secure_ipv4_port_ephemeral);
+#endif
diff --git a/net/core/selftests.c b/net/core/selftests.c
new file mode 100644
index 000000000000..8b81feb82c4a
--- /dev/null
+++ b/net/core/selftests.c
@@ -0,0 +1,448 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2019 Synopsys, Inc. and/or its affiliates.
+ * stmmac Selftests Support
+ *
+ * Author: Jose Abreu <joabreu@synopsys.com>
+ *
+ * Ported from stmmac by:
+ * Copyright (C) 2021 Oleksij Rempel <o.rempel@pengutronix.de>
+ */
+
+#include <linux/phy.h>
+#include <net/selftests.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+
+static u8 net_test_next_id;
+
+struct sk_buff *net_test_get_skb(struct net_device *ndev, u8 id,
+ struct net_packet_attrs *attr)
+{
+ struct sk_buff *skb = NULL;
+ struct udphdr *uhdr = NULL;
+ struct tcphdr *thdr = NULL;
+ struct netsfhdr *shdr;
+ struct ethhdr *ehdr;
+ struct iphdr *ihdr;
+ int iplen, size;
+
+ size = attr->size + NET_TEST_PKT_SIZE;
+
+ if (attr->tcp)
+ size += sizeof(struct tcphdr);
+ else
+ size += sizeof(struct udphdr);
+
+ if (attr->max_size && attr->max_size > size)
+ size = attr->max_size;
+
+ skb = netdev_alloc_skb(ndev, size);
+ if (!skb)
+ return NULL;
+
+ prefetchw(skb->data);
+
+ ehdr = skb_push(skb, ETH_HLEN);
+ skb_reset_mac_header(skb);
+
+ skb_set_network_header(skb, skb->len);
+ ihdr = skb_put(skb, sizeof(*ihdr));
+
+ skb_set_transport_header(skb, skb->len);
+ if (attr->tcp)
+ thdr = skb_put(skb, sizeof(*thdr));
+ else
+ uhdr = skb_put(skb, sizeof(*uhdr));
+
+ eth_zero_addr(ehdr->h_dest);
+
+ if (attr->src)
+ ether_addr_copy(ehdr->h_source, attr->src);
+ if (attr->dst)
+ ether_addr_copy(ehdr->h_dest, attr->dst);
+
+ ehdr->h_proto = htons(ETH_P_IP);
+
+ if (attr->tcp) {
+ memset(thdr, 0, sizeof(*thdr));
+ thdr->source = htons(attr->sport);
+ thdr->dest = htons(attr->dport);
+ thdr->doff = sizeof(struct tcphdr) / 4;
+ } else {
+ uhdr->source = htons(attr->sport);
+ uhdr->dest = htons(attr->dport);
+ uhdr->len = htons(sizeof(*shdr) + sizeof(*uhdr) + attr->size);
+ if (attr->max_size)
+ uhdr->len = htons(attr->max_size -
+ (sizeof(*ihdr) + sizeof(*ehdr)));
+ uhdr->check = 0;
+ }
+
+ ihdr->ihl = 5;
+ ihdr->ttl = 32;
+ ihdr->version = 4;
+ if (attr->tcp)
+ ihdr->protocol = IPPROTO_TCP;
+ else
+ ihdr->protocol = IPPROTO_UDP;
+ iplen = sizeof(*ihdr) + sizeof(*shdr) + attr->size;
+ if (attr->tcp)
+ iplen += sizeof(*thdr);
+ else
+ iplen += sizeof(*uhdr);
+
+ if (attr->max_size)
+ iplen = attr->max_size - sizeof(*ehdr);
+
+ ihdr->tot_len = htons(iplen);
+ ihdr->frag_off = 0;
+ ihdr->saddr = htonl(attr->ip_src);
+ ihdr->daddr = htonl(attr->ip_dst);
+ ihdr->tos = 0;
+ ihdr->id = 0;
+ ip_send_check(ihdr);
+
+ shdr = skb_put(skb, sizeof(*shdr));
+ shdr->version = 0;
+ shdr->magic = cpu_to_be64(NET_TEST_PKT_MAGIC);
+ attr->id = id;
+ shdr->id = id;
+
+ if (attr->size) {
+ void *payload = skb_put(skb, attr->size);
+
+ memset(payload, 0, attr->size);
+ }
+
+ if (attr->max_size && attr->max_size > skb->len) {
+ size_t pad_len = attr->max_size - skb->len;
+ void *pad = skb_put(skb, pad_len);
+
+ memset(pad, 0, pad_len);
+ }
+
+ skb->csum = 0;
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ if (attr->tcp) {
+ int l4len = skb->len - skb_transport_offset(skb);
+
+ thdr->check = ~tcp_v4_check(l4len, ihdr->saddr, ihdr->daddr, 0);
+ skb->csum_start = skb_transport_header(skb) - skb->head;
+ skb->csum_offset = offsetof(struct tcphdr, check);
+
+ if (attr->bad_csum) {
+ /* Force mangled checksum */
+ if (skb_checksum_help(skb)) {
+ kfree_skb(skb);
+ return NULL;
+ }
+
+ if (thdr->check != CSUM_MANGLED_0)
+ thdr->check = CSUM_MANGLED_0;
+ else
+ thdr->check = csum16_sub(thdr->check,
+ cpu_to_be16(1));
+ }
+ } else {
+ udp4_hwcsum(skb, ihdr->saddr, ihdr->daddr);
+ }
+
+ skb->protocol = htons(ETH_P_IP);
+ skb->pkt_type = PACKET_HOST;
+ skb->dev = ndev;
+
+ return skb;
+}
+EXPORT_SYMBOL_GPL(net_test_get_skb);
+
+static int net_test_loopback_validate(struct sk_buff *skb,
+ struct net_device *ndev,
+ struct packet_type *pt,
+ struct net_device *orig_ndev)
+{
+ struct net_test_priv *tpriv = pt->af_packet_priv;
+ const unsigned char *src = tpriv->packet->src;
+ const unsigned char *dst = tpriv->packet->dst;
+ struct netsfhdr *shdr;
+ struct ethhdr *ehdr;
+ struct udphdr *uhdr;
+ struct tcphdr *thdr;
+ struct iphdr *ihdr;
+
+ skb = skb_unshare(skb, GFP_ATOMIC);
+ if (!skb)
+ goto out;
+
+ if (skb_linearize(skb))
+ goto out;
+ if (skb_headlen(skb) < (NET_TEST_PKT_SIZE - ETH_HLEN))
+ goto out;
+
+ ehdr = (struct ethhdr *)skb_mac_header(skb);
+ if (dst) {
+ if (!ether_addr_equal_unaligned(ehdr->h_dest, dst))
+ goto out;
+ }
+
+ if (src) {
+ if (!ether_addr_equal_unaligned(ehdr->h_source, src))
+ goto out;
+ }
+
+ ihdr = ip_hdr(skb);
+ if (tpriv->double_vlan)
+ ihdr = (struct iphdr *)(skb_network_header(skb) + 4);
+
+ if (tpriv->packet->tcp) {
+ if (ihdr->protocol != IPPROTO_TCP)
+ goto out;
+
+ thdr = (struct tcphdr *)((u8 *)ihdr + 4 * ihdr->ihl);
+ if (thdr->dest != htons(tpriv->packet->dport))
+ goto out;
+
+ shdr = (struct netsfhdr *)((u8 *)thdr + sizeof(*thdr));
+ } else {
+ if (ihdr->protocol != IPPROTO_UDP)
+ goto out;
+
+ uhdr = (struct udphdr *)((u8 *)ihdr + 4 * ihdr->ihl);
+ if (uhdr->dest != htons(tpriv->packet->dport))
+ goto out;
+
+ shdr = (struct netsfhdr *)((u8 *)uhdr + sizeof(*uhdr));
+ }
+
+ if (shdr->magic != cpu_to_be64(NET_TEST_PKT_MAGIC))
+ goto out;
+ if (tpriv->packet->id != shdr->id)
+ goto out;
+
+ if (tpriv->packet->bad_csum && skb->ip_summed == CHECKSUM_UNNECESSARY)
+ tpriv->ok = -EIO;
+ else
+ tpriv->ok = true;
+
+ complete(&tpriv->comp);
+out:
+ kfree_skb(skb);
+ return 0;
+}
+
+static int __net_test_loopback(struct net_device *ndev,
+ struct net_packet_attrs *attr)
+{
+ struct net_test_priv *tpriv;
+ struct sk_buff *skb = NULL;
+ int ret = 0;
+
+ tpriv = kzalloc(sizeof(*tpriv), GFP_KERNEL);
+ if (!tpriv)
+ return -ENOMEM;
+
+ tpriv->ok = false;
+ init_completion(&tpriv->comp);
+
+ tpriv->pt.type = htons(ETH_P_IP);
+ tpriv->pt.func = net_test_loopback_validate;
+ tpriv->pt.dev = ndev;
+ tpriv->pt.af_packet_priv = tpriv;
+ tpriv->packet = attr;
+ dev_add_pack(&tpriv->pt);
+
+ skb = net_test_get_skb(ndev, net_test_next_id, attr);
+ if (!skb) {
+ ret = -ENOMEM;
+ goto cleanup;
+ }
+
+ net_test_next_id++;
+ ret = dev_direct_xmit(skb, attr->queue_mapping);
+ if (ret < 0) {
+ goto cleanup;
+ } else if (ret > 0) {
+ ret = -ENETUNREACH;
+ goto cleanup;
+ }
+
+ if (!attr->timeout)
+ attr->timeout = NET_LB_TIMEOUT;
+
+ wait_for_completion_timeout(&tpriv->comp, attr->timeout);
+ if (tpriv->ok < 0)
+ ret = tpriv->ok;
+ else if (!tpriv->ok)
+ ret = -ETIMEDOUT;
+ else
+ ret = 0;
+
+cleanup:
+ dev_remove_pack(&tpriv->pt);
+ kfree(tpriv);
+ return ret;
+}
+
+static int net_test_netif_carrier(struct net_device *ndev)
+{
+ return netif_carrier_ok(ndev) ? 0 : -ENOLINK;
+}
+
+static int net_test_phy_phydev(struct net_device *ndev)
+{
+ return ndev->phydev ? 0 : -EOPNOTSUPP;
+}
+
+static int net_test_phy_loopback_enable(struct net_device *ndev)
+{
+ if (!ndev->phydev)
+ return -EOPNOTSUPP;
+
+ return phy_loopback(ndev->phydev, true, 0);
+}
+
+static int net_test_phy_loopback_disable(struct net_device *ndev)
+{
+ if (!ndev->phydev)
+ return -EOPNOTSUPP;
+
+ return phy_loopback(ndev->phydev, false, 0);
+}
+
+static int net_test_phy_loopback_udp(struct net_device *ndev)
+{
+ struct net_packet_attrs attr = { };
+
+ attr.dst = ndev->dev_addr;
+ return __net_test_loopback(ndev, &attr);
+}
+
+static int net_test_phy_loopback_udp_mtu(struct net_device *ndev)
+{
+ struct net_packet_attrs attr = { };
+
+ attr.dst = ndev->dev_addr;
+ attr.max_size = ndev->mtu;
+ return __net_test_loopback(ndev, &attr);
+}
+
+static int net_test_phy_loopback_tcp(struct net_device *ndev)
+{
+ struct net_packet_attrs attr = { };
+
+ attr.dst = ndev->dev_addr;
+ attr.tcp = true;
+ return __net_test_loopback(ndev, &attr);
+}
+
+/**
+ * net_test_phy_loopback_tcp_bad_csum - PHY loopback test with a deliberately
+ * corrupted TCP checksum
+ * @ndev: the network device to test
+ *
+ * Builds the same minimal Ethernet/IPv4/TCP frame as
+ * net_test_phy_loopback_tcp(), then flips the least-significant bit of the TCP
+ * checksum so the resulting value is provably invalid (neither 0 nor 0xFFFF).
+ * The frame is transmitted through the device’s internal PHY loopback path:
+ *
+ * test code -> MAC driver -> MAC HW -> xMII -> PHY ->
+ * internal PHY loopback -> xMII -> MAC HW -> MAC driver -> test code
+ *
+ * Result interpretation
+ * ---------------------
+ * 0 The frame is delivered to the stack and the driver reports
+ * ip_summed as CHECKSUM_NONE or CHECKSUM_COMPLETE - both are
+ * valid ways to indicate “bad checksum, let the stack verify.â€
+ * -ETIMEDOUT The MAC/PHY silently dropped the frame; hardware checksum
+ * verification filtered it out before the driver saw it.
+ * -EIO The driver returned the frame with ip_summed ==
+ * CHECKSUM_UNNECESSARY, falsely claiming a valid checksum and
+ * indicating a serious RX-path defect.
+ *
+ * Return: 0 on success or a negative error code on failure.
+ */
+static int net_test_phy_loopback_tcp_bad_csum(struct net_device *ndev)
+{
+ struct net_packet_attrs attr = { };
+
+ attr.dst = ndev->dev_addr;
+ attr.tcp = true;
+ attr.bad_csum = true;
+ return __net_test_loopback(ndev, &attr);
+}
+
+static const struct net_test {
+ char name[ETH_GSTRING_LEN];
+ int (*fn)(struct net_device *ndev);
+} net_selftests[] = {
+ {
+ .name = "Carrier ",
+ .fn = net_test_netif_carrier,
+ }, {
+ .name = "PHY dev is present ",
+ .fn = net_test_phy_phydev,
+ }, {
+ /* This test should be done before all PHY loopback test */
+ .name = "PHY internal loopback, enable ",
+ .fn = net_test_phy_loopback_enable,
+ }, {
+ .name = "PHY internal loopback, UDP ",
+ .fn = net_test_phy_loopback_udp,
+ }, {
+ .name = "PHY internal loopback, MTU ",
+ .fn = net_test_phy_loopback_udp_mtu,
+ }, {
+ .name = "PHY internal loopback, TCP ",
+ .fn = net_test_phy_loopback_tcp,
+ }, {
+ .name = "PHY loopback, bad TCP csum ",
+ .fn = net_test_phy_loopback_tcp_bad_csum,
+ }, {
+ /* This test should be done after all PHY loopback test */
+ .name = "PHY internal loopback, disable",
+ .fn = net_test_phy_loopback_disable,
+ },
+};
+
+void net_selftest(struct net_device *ndev, struct ethtool_test *etest, u64 *buf)
+{
+ int count = net_selftest_get_count();
+ int i;
+
+ memset(buf, 0, sizeof(*buf) * count);
+ net_test_next_id = 0;
+
+ if (etest->flags != ETH_TEST_FL_OFFLINE) {
+ netdev_err(ndev, "Only offline tests are supported\n");
+ etest->flags |= ETH_TEST_FL_FAILED;
+ return;
+ }
+
+
+ for (i = 0; i < count; i++) {
+ buf[i] = net_selftests[i].fn(ndev);
+ if (buf[i] && (buf[i] != -EOPNOTSUPP))
+ etest->flags |= ETH_TEST_FL_FAILED;
+ }
+}
+EXPORT_SYMBOL_GPL(net_selftest);
+
+int net_selftest_get_count(void)
+{
+ return ARRAY_SIZE(net_selftests);
+}
+EXPORT_SYMBOL_GPL(net_selftest_get_count);
+
+void net_selftest_get_strings(u8 *data)
+{
+ int i;
+
+ for (i = 0; i < net_selftest_get_count(); i++)
+ ethtool_sprintf(&data, "%2d. %s", i + 1,
+ net_selftests[i].name);
+}
+EXPORT_SYMBOL_GPL(net_selftest_get_strings);
+
+MODULE_DESCRIPTION("Common library for generic PHY ethtool selftests");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Oleksij Rempel <o.rempel@pengutronix.de>");
diff --git a/net/core/skb_fault_injection.c b/net/core/skb_fault_injection.c
new file mode 100644
index 000000000000..4235db6bdfad
--- /dev/null
+++ b/net/core/skb_fault_injection.c
@@ -0,0 +1,106 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/debugfs.h>
+#include <linux/fault-inject.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+
+static struct {
+ struct fault_attr attr;
+ char devname[IFNAMSIZ];
+ bool filtered;
+} skb_realloc = {
+ .attr = FAULT_ATTR_INITIALIZER,
+ .filtered = false,
+};
+
+static bool should_fail_net_realloc_skb(struct sk_buff *skb)
+{
+ struct net_device *net = skb->dev;
+
+ if (skb_realloc.filtered &&
+ strncmp(net->name, skb_realloc.devname, IFNAMSIZ))
+ /* device name filter set, but names do not match */
+ return false;
+
+ if (!should_fail(&skb_realloc.attr, 1))
+ return false;
+
+ return true;
+}
+ALLOW_ERROR_INJECTION(should_fail_net_realloc_skb, TRUE);
+
+void skb_might_realloc(struct sk_buff *skb)
+{
+ if (!should_fail_net_realloc_skb(skb))
+ return;
+
+ pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
+}
+EXPORT_SYMBOL(skb_might_realloc);
+
+static int __init fail_skb_realloc_setup(char *str)
+{
+ return setup_fault_attr(&skb_realloc.attr, str);
+}
+__setup("fail_skb_realloc=", fail_skb_realloc_setup);
+
+static void reset_settings(void)
+{
+ skb_realloc.filtered = false;
+ memset(&skb_realloc.devname, 0, IFNAMSIZ);
+}
+
+static ssize_t devname_write(struct file *file, const char __user *buffer,
+ size_t count, loff_t *ppos)
+{
+ ssize_t ret;
+
+ reset_settings();
+ ret = simple_write_to_buffer(&skb_realloc.devname, IFNAMSIZ,
+ ppos, buffer, count);
+ if (ret < 0)
+ return ret;
+
+ skb_realloc.devname[IFNAMSIZ - 1] = '\0';
+ /* Remove a possible \n at the end of devname */
+ strim(skb_realloc.devname);
+
+ if (strnlen(skb_realloc.devname, IFNAMSIZ))
+ skb_realloc.filtered = true;
+
+ return count;
+}
+
+static ssize_t devname_read(struct file *file,
+ char __user *buffer,
+ size_t size, loff_t *ppos)
+{
+ if (!skb_realloc.filtered)
+ return 0;
+
+ return simple_read_from_buffer(buffer, size, ppos, &skb_realloc.devname,
+ strlen(skb_realloc.devname));
+}
+
+static const struct file_operations devname_ops = {
+ .write = devname_write,
+ .read = devname_read,
+};
+
+static int __init fail_skb_realloc_debugfs(void)
+{
+ umode_t mode = S_IFREG | 0600;
+ struct dentry *dir;
+
+ dir = fault_create_debugfs_attr("fail_skb_realloc", NULL,
+ &skb_realloc.attr);
+ if (IS_ERR(dir))
+ return PTR_ERR(dir);
+
+ debugfs_create_file("devname", mode, dir, NULL, &devname_ops);
+
+ return 0;
+}
+
+late_initcall(fail_skb_realloc_debugfs);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index a90bc439488e..a00808f7be6a 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1,11 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Routines having to do with the 'struct sk_buff' memory handlers.
*
- * Authors: Alan Cox <iiitac@pyr.swan.ac.uk>
+ * Authors: Alan Cox <alan@lxorguk.ukuu.org.uk>
* Florian La Roche <rzsfl@rz.uni-sb.de>
*
- * Version: $Id: skbuff.c,v 1.90 2001/11/07 05:56:19 davem Exp $
- *
* Fixes:
* Alan Cox : Fixed the worst of the load
* balancer bugs.
@@ -27,266 +26,880 @@
* disabled, or you better be *real* sure that the operation is atomic
* with respect to whatever list is being frobbed (e.g. via lock_sock()
* or via disabling bottom half handlers, etc).
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
/*
* The functions in this file will not compile correctly with gcc 2.4.x
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
-#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/slab.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/sctp.h>
#include <linux/netdevice.h>
#ifdef CONFIG_NET_CLS_ACT
#include <net/pkt_sched.h>
#endif
#include <linux/string.h>
#include <linux/skbuff.h>
+#include <linux/skbuff_ref.h>
+#include <linux/splice.h>
#include <linux/cache.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
-#include <linux/highmem.h>
+#include <linux/scatterlist.h>
+#include <linux/errqueue.h>
+#include <linux/prefetch.h>
+#include <linux/bitfield.h>
+#include <linux/if_vlan.h>
+#include <linux/mpls.h>
+#include <linux/kcov.h>
+#include <linux/iov_iter.h>
+#include <linux/crc32.h>
#include <net/protocol.h>
#include <net/dst.h>
#include <net/sock.h>
#include <net/checksum.h>
+#include <net/gro.h>
+#include <net/gso.h>
+#include <net/hotdata.h>
+#include <net/ip6_checksum.h>
#include <net/xfrm.h>
+#include <net/mpls.h>
+#include <net/mptcp.h>
+#include <net/mctp.h>
+#include <net/page_pool/helpers.h>
+#include <net/psp/types.h>
+#include <net/dropreason.h>
+#include <net/xdp_sock.h>
+
+#include <linux/uaccess.h>
+#include <trace/events/skb.h>
+#include <linux/highmem.h>
+#include <linux/capability.h>
+#include <linux/user_namespace.h>
+#include <linux/indirect_call_wrapper.h>
+#include <linux/textsearch.h>
+
+#include "dev.h"
+#include "devmem.h"
+#include "netmem_priv.h"
+#include "sock_destructor.h"
+
+#ifdef CONFIG_SKB_EXTENSIONS
+static struct kmem_cache *skbuff_ext_cache __ro_after_init;
+#endif
-#include <asm/uaccess.h>
-#include <asm/system.h>
+#define GRO_MAX_HEAD_PAD (GRO_MAX_HEAD + NET_SKB_PAD + NET_IP_ALIGN)
+#define SKB_SMALL_HEAD_SIZE SKB_HEAD_ALIGN(max(MAX_TCP_HEADER, \
+ GRO_MAX_HEAD_PAD))
-static kmem_cache_t *skbuff_head_cache __read_mostly;
-static kmem_cache_t *skbuff_fclone_cache __read_mostly;
+/* We want SKB_SMALL_HEAD_CACHE_SIZE to not be a power of two.
+ * This should ensure that SKB_SMALL_HEAD_HEADROOM is a unique
+ * size, and we can differentiate heads from skb_small_head_cache
+ * vs system slabs by looking at their size (skb_end_offset()).
+ */
+#define SKB_SMALL_HEAD_CACHE_SIZE \
+ (is_power_of_2(SKB_SMALL_HEAD_SIZE) ? \
+ (SKB_SMALL_HEAD_SIZE + L1_CACHE_BYTES) : \
+ SKB_SMALL_HEAD_SIZE)
-/*
- * Keep out-of-line to prevent kernel bloat.
- * __builtin_return_address is not used because it is not always
- * reliable.
+#define SKB_SMALL_HEAD_HEADROOM \
+ SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE)
+
+/* kcm_write_msgs() relies on casting paged frags to bio_vec to use
+ * iov_iter_bvec(). These static asserts ensure the cast is valid is long as the
+ * netmem is a page.
*/
+static_assert(offsetof(struct bio_vec, bv_page) ==
+ offsetof(skb_frag_t, netmem));
+static_assert(sizeof_field(struct bio_vec, bv_page) ==
+ sizeof_field(skb_frag_t, netmem));
+
+static_assert(offsetof(struct bio_vec, bv_len) == offsetof(skb_frag_t, len));
+static_assert(sizeof_field(struct bio_vec, bv_len) ==
+ sizeof_field(skb_frag_t, len));
+
+static_assert(offsetof(struct bio_vec, bv_offset) ==
+ offsetof(skb_frag_t, offset));
+static_assert(sizeof_field(struct bio_vec, bv_offset) ==
+ sizeof_field(skb_frag_t, offset));
+
+#undef FN
+#define FN(reason) [SKB_DROP_REASON_##reason] = #reason,
+static const char * const drop_reasons[] = {
+ [SKB_CONSUMED] = "CONSUMED",
+ DEFINE_DROP_REASON(FN, FN)
+};
+
+static const struct drop_reason_list drop_reasons_core = {
+ .reasons = drop_reasons,
+ .n_reasons = ARRAY_SIZE(drop_reasons),
+};
+
+const struct drop_reason_list __rcu *
+drop_reasons_by_subsys[SKB_DROP_REASON_SUBSYS_NUM] = {
+ [SKB_DROP_REASON_SUBSYS_CORE] = RCU_INITIALIZER(&drop_reasons_core),
+};
+EXPORT_SYMBOL(drop_reasons_by_subsys);
/**
- * skb_over_panic - private function
- * @skb: buffer
- * @sz: size
- * @here: address
- *
- * Out of line support code for skb_put(). Not user callable.
+ * drop_reasons_register_subsys - register another drop reason subsystem
+ * @subsys: the subsystem to register, must not be the core
+ * @list: the list of drop reasons within the subsystem, must point to
+ * a statically initialized list
*/
-void skb_over_panic(struct sk_buff *skb, int sz, void *here)
+void drop_reasons_register_subsys(enum skb_drop_reason_subsys subsys,
+ const struct drop_reason_list *list)
{
- printk(KERN_EMERG "skb_over_panic: text:%p len:%d put:%d head:%p "
- "data:%p tail:%p end:%p dev:%s\n",
- here, skb->len, sz, skb->head, skb->data, skb->tail, skb->end,
- skb->dev ? skb->dev->name : "<NULL>");
- BUG();
+ if (WARN(subsys <= SKB_DROP_REASON_SUBSYS_CORE ||
+ subsys >= ARRAY_SIZE(drop_reasons_by_subsys),
+ "invalid subsystem %d\n", subsys))
+ return;
+
+ /* must point to statically allocated memory, so INIT is OK */
+ RCU_INIT_POINTER(drop_reasons_by_subsys[subsys], list);
}
+EXPORT_SYMBOL_GPL(drop_reasons_register_subsys);
/**
- * skb_under_panic - private function
- * @skb: buffer
- * @sz: size
- * @here: address
+ * drop_reasons_unregister_subsys - unregister a drop reason subsystem
+ * @subsys: the subsystem to remove, must not be the core
*
- * Out of line support code for skb_push(). Not user callable.
+ * Note: This will synchronize_rcu() to ensure no users when it returns.
*/
+void drop_reasons_unregister_subsys(enum skb_drop_reason_subsys subsys)
+{
+ if (WARN(subsys <= SKB_DROP_REASON_SUBSYS_CORE ||
+ subsys >= ARRAY_SIZE(drop_reasons_by_subsys),
+ "invalid subsystem %d\n", subsys))
+ return;
-void skb_under_panic(struct sk_buff *skb, int sz, void *here)
+ RCU_INIT_POINTER(drop_reasons_by_subsys[subsys], NULL);
+
+ synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(drop_reasons_unregister_subsys);
+
+/**
+ * skb_panic - private function for out-of-line support
+ * @skb: buffer
+ * @sz: size
+ * @addr: address
+ * @msg: skb_over_panic or skb_under_panic
+ *
+ * Out-of-line support for skb_put() and skb_push().
+ * Called via the wrapper skb_over_panic() or skb_under_panic().
+ * Keep out of line to prevent kernel bloat.
+ * __builtin_return_address is not used because it is not always reliable.
+ */
+static void skb_panic(struct sk_buff *skb, unsigned int sz, void *addr,
+ const char msg[])
{
- printk(KERN_EMERG "skb_under_panic: text:%p len:%d put:%d head:%p "
- "data:%p tail:%p end:%p dev:%s\n",
- here, skb->len, sz, skb->head, skb->data, skb->tail, skb->end,
- skb->dev ? skb->dev->name : "<NULL>");
+ pr_emerg("%s: text:%px len:%d put:%d head:%px data:%px tail:%#lx end:%#lx dev:%s\n",
+ msg, addr, skb->len, sz, skb->head, skb->data,
+ (unsigned long)skb->tail, (unsigned long)skb->end,
+ skb->dev ? skb->dev->name : "<NULL>");
BUG();
}
-void skb_truesize_bug(struct sk_buff *skb)
+static void skb_over_panic(struct sk_buff *skb, unsigned int sz, void *addr)
{
- printk(KERN_ERR "SKB BUG: Invalid truesize (%u) "
- "len=%u, sizeof(sk_buff)=%Zd\n",
- skb->truesize, skb->len, sizeof(struct sk_buff));
+ skb_panic(skb, sz, addr, __func__);
}
-EXPORT_SYMBOL(skb_truesize_bug);
-/* Allocate a new skbuff. We do this ourselves so we can fill in a few
- * 'private' fields and also do memory statistics to find all the
- * [BEEP] leaks.
- *
+static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
+{
+ skb_panic(skb, sz, addr, __func__);
+}
+
+#define NAPI_SKB_CACHE_SIZE 128
+#define NAPI_SKB_CACHE_BULK 32
+#define NAPI_SKB_CACHE_FREE 32
+
+struct napi_alloc_cache {
+ local_lock_t bh_lock;
+ struct page_frag_cache page;
+ unsigned int skb_count;
+ void *skb_cache[NAPI_SKB_CACHE_SIZE];
+};
+
+static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
+static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache) = {
+ .bh_lock = INIT_LOCAL_LOCK(bh_lock),
+};
+
+void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
+{
+ struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+ void *data;
+
+ fragsz = SKB_DATA_ALIGN(fragsz);
+
+ local_lock_nested_bh(&napi_alloc_cache.bh_lock);
+ data = __page_frag_alloc_align(&nc->page, fragsz,
+ GFP_ATOMIC | __GFP_NOWARN, align_mask);
+ local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
+ return data;
+
+}
+EXPORT_SYMBOL(__napi_alloc_frag_align);
+
+void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
+{
+ void *data;
+
+ if (in_hardirq() || irqs_disabled()) {
+ struct page_frag_cache *nc = this_cpu_ptr(&netdev_alloc_cache);
+
+ fragsz = SKB_DATA_ALIGN(fragsz);
+ data = __page_frag_alloc_align(nc, fragsz,
+ GFP_ATOMIC | __GFP_NOWARN,
+ align_mask);
+ } else {
+ local_bh_disable();
+ data = __napi_alloc_frag_align(fragsz, align_mask);
+ local_bh_enable();
+ }
+ return data;
+}
+EXPORT_SYMBOL(__netdev_alloc_frag_align);
+
+/* Cache kmem_cache_size(net_hotdata.skbuff_cache) to help the compiler
+ * remove dead code (and skbuff_cache_size) when CONFIG_KASAN is unset.
*/
+static u32 skbuff_cache_size __read_mostly;
+
+static struct sk_buff *napi_skb_cache_get(bool alloc)
+{
+ struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+ struct sk_buff *skb;
+
+ local_lock_nested_bh(&napi_alloc_cache.bh_lock);
+ if (unlikely(!nc->skb_count)) {
+ if (alloc)
+ nc->skb_count = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache,
+ GFP_ATOMIC | __GFP_NOWARN,
+ NAPI_SKB_CACHE_BULK,
+ nc->skb_cache);
+ if (unlikely(!nc->skb_count)) {
+ local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
+ return NULL;
+ }
+ }
+
+ skb = nc->skb_cache[--nc->skb_count];
+ if (nc->skb_count)
+ prefetch(nc->skb_cache[nc->skb_count - 1]);
+ local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
+ kasan_mempool_unpoison_object(skb, skbuff_cache_size);
+
+ return skb;
+}
/**
- * __alloc_skb - allocate a network buffer
- * @size: size to allocate
- * @gfp_mask: allocation mask
- * @fclone: allocate from fclone cache instead of head cache
- * and allocate a cloned (child) skb
+ * napi_skb_cache_get_bulk - obtain a number of zeroed skb heads from the cache
+ * @skbs: pointer to an at least @n-sized array to fill with skb pointers
+ * @n: number of entries to provide
*
- * Allocate a new &sk_buff. The returned buffer has no headroom and a
- * tail room of size bytes. The object has a reference count of one.
- * The return is the buffer. On a failure the return is %NULL.
+ * Tries to obtain @n &sk_buff entries from the NAPI percpu cache and writes
+ * the pointers into the provided array @skbs. If there are less entries
+ * available, tries to replenish the cache and bulk-allocates the diff from
+ * the MM layer if needed.
+ * The heads are being zeroed with either memset() or %__GFP_ZERO, so they are
+ * ready for {,__}build_skb_around() and don't have any data buffers attached.
+ * Must be called *only* from the BH context.
*
- * Buffers may only be allocated from interrupts using a @gfp_mask of
- * %GFP_ATOMIC.
+ * Return: number of successfully allocated skbs (@n if no actual allocation
+ * needed or kmem_cache_alloc_bulk() didn't fail).
*/
-struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
- int fclone)
+u32 napi_skb_cache_get_bulk(void **skbs, u32 n)
{
- kmem_cache_t *cache;
- struct skb_shared_info *shinfo;
- struct sk_buff *skb;
- u8 *data;
+ struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+ u32 bulk, total = n;
+
+ local_lock_nested_bh(&napi_alloc_cache.bh_lock);
+
+ if (nc->skb_count >= n)
+ goto get;
+
+ /* No enough cached skbs. Try refilling the cache first */
+ bulk = min(NAPI_SKB_CACHE_SIZE - nc->skb_count, NAPI_SKB_CACHE_BULK);
+ nc->skb_count += kmem_cache_alloc_bulk(net_hotdata.skbuff_cache,
+ GFP_ATOMIC | __GFP_NOWARN, bulk,
+ &nc->skb_cache[nc->skb_count]);
+ if (likely(nc->skb_count >= n))
+ goto get;
+
+ /* Still not enough. Bulk-allocate the missing part directly, zeroed */
+ n -= kmem_cache_alloc_bulk(net_hotdata.skbuff_cache,
+ GFP_ATOMIC | __GFP_ZERO | __GFP_NOWARN,
+ n - nc->skb_count, &skbs[nc->skb_count]);
+ if (likely(nc->skb_count >= n))
+ goto get;
+
+ /* kmem_cache didn't allocate the number we need, limit the output */
+ total -= n - nc->skb_count;
+ n = nc->skb_count;
+
+get:
+ for (u32 base = nc->skb_count - n, i = 0; i < n; i++) {
+ skbs[i] = nc->skb_cache[base + i];
+
+ kasan_mempool_unpoison_object(skbs[i], skbuff_cache_size);
+ memset(skbs[i], 0, offsetof(struct sk_buff, tail));
+ }
- cache = fclone ? skbuff_fclone_cache : skbuff_head_cache;
+ nc->skb_count -= n;
+ local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
- /* Get the HEAD */
- skb = kmem_cache_alloc(cache, gfp_mask & ~__GFP_DMA);
- if (!skb)
- goto out;
+ return total;
+}
+EXPORT_SYMBOL_GPL(napi_skb_cache_get_bulk);
- /* Get the DATA. Size must match skb_add_mtu(). */
- size = SKB_DATA_ALIGN(size);
- data = kmalloc_track_caller(size + sizeof(struct skb_shared_info),
- gfp_mask);
- if (!data)
- goto nodata;
+static inline void __finalize_skb_around(struct sk_buff *skb, void *data,
+ unsigned int size)
+{
+ struct skb_shared_info *shinfo;
+
+ size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
- memset(skb, 0, offsetof(struct sk_buff, truesize));
- skb->truesize = size + sizeof(struct sk_buff);
- atomic_set(&skb->users, 1);
+ /* Assumes caller memset cleared SKB */
+ skb->truesize = SKB_TRUESIZE(size);
+ refcount_set(&skb->users, 1);
skb->head = data;
skb->data = data;
- skb->tail = data;
- skb->end = data + size;
+ skb_reset_tail_pointer(skb);
+ skb_set_end_offset(skb, size);
+ skb->mac_header = (typeof(skb->mac_header))~0U;
+ skb->transport_header = (typeof(skb->transport_header))~0U;
+ skb->alloc_cpu = raw_smp_processor_id();
/* make sure we initialize shinfo sequentially */
shinfo = skb_shinfo(skb);
+ memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
atomic_set(&shinfo->dataref, 1);
- shinfo->nr_frags = 0;
- shinfo->gso_size = 0;
- shinfo->gso_segs = 0;
- shinfo->gso_type = 0;
- shinfo->ip6_frag_id = 0;
- shinfo->frag_list = NULL;
- if (fclone) {
- struct sk_buff *child = skb + 1;
- atomic_t *fclone_ref = (atomic_t *) (child + 1);
+ skb_set_kcov_handle(skb, kcov_common_handle());
+}
- skb->fclone = SKB_FCLONE_ORIG;
- atomic_set(fclone_ref, 1);
+static inline void *__slab_build_skb(void *data, unsigned int *size)
+{
+ void *resized;
+
+ /* Must find the allocation size (and grow it to match). */
+ *size = ksize(data);
+ /* krealloc() will immediately return "data" when
+ * "ksize(data)" is requested: it is the existing upper
+ * bounds. As a result, GFP_ATOMIC will be ignored. Note
+ * that this "new" pointer needs to be passed back to the
+ * caller for use so the __alloc_size hinting will be
+ * tracked correctly.
+ */
+ resized = krealloc(data, *size, GFP_ATOMIC);
+ WARN_ON_ONCE(resized != data);
+ return resized;
+}
+
+/* build_skb() variant which can operate on slab buffers.
+ * Note that this should be used sparingly as slab buffers
+ * cannot be combined efficiently by GRO!
+ */
+struct sk_buff *slab_build_skb(void *data)
+{
+ struct sk_buff *skb;
+ unsigned int size;
+
+ skb = kmem_cache_alloc(net_hotdata.skbuff_cache,
+ GFP_ATOMIC | __GFP_NOWARN);
+ if (unlikely(!skb))
+ return NULL;
+
+ memset(skb, 0, offsetof(struct sk_buff, tail));
+ data = __slab_build_skb(data, &size);
+ __finalize_skb_around(skb, data, size);
+
+ return skb;
+}
+EXPORT_SYMBOL(slab_build_skb);
- child->fclone = SKB_FCLONE_UNAVAILABLE;
+/* Caller must provide SKB that is memset cleared */
+static void __build_skb_around(struct sk_buff *skb, void *data,
+ unsigned int frag_size)
+{
+ unsigned int size = frag_size;
+
+ /* frag_size == 0 is considered deprecated now. Callers
+ * using slab buffer should use slab_build_skb() instead.
+ */
+ if (WARN_ONCE(size == 0, "Use slab_build_skb() instead"))
+ data = __slab_build_skb(data, &size);
+
+ __finalize_skb_around(skb, data, size);
+}
+
+/**
+ * __build_skb - build a network buffer
+ * @data: data buffer provided by caller
+ * @frag_size: size of data (must not be 0)
+ *
+ * Allocate a new &sk_buff. Caller provides space holding head and
+ * skb_shared_info. @data must have been allocated from the page
+ * allocator or vmalloc(). (A @frag_size of 0 to indicate a kmalloc()
+ * allocation is deprecated, and callers should use slab_build_skb()
+ * instead.)
+ * The return is the new skb buffer.
+ * On a failure the return is %NULL, and @data is not freed.
+ * Notes :
+ * Before IO, driver allocates only data buffer where NIC put incoming frame
+ * Driver should add room at head (NET_SKB_PAD) and
+ * MUST add room at tail (SKB_DATA_ALIGN(skb_shared_info))
+ * After IO, driver calls build_skb(), to allocate sk_buff and populate it
+ * before giving packet to stack.
+ * RX rings only contains data buffers, not full skbs.
+ */
+struct sk_buff *__build_skb(void *data, unsigned int frag_size)
+{
+ struct sk_buff *skb;
+
+ skb = kmem_cache_alloc(net_hotdata.skbuff_cache,
+ GFP_ATOMIC | __GFP_NOWARN);
+ if (unlikely(!skb))
+ return NULL;
+
+ memset(skb, 0, offsetof(struct sk_buff, tail));
+ __build_skb_around(skb, data, frag_size);
+
+ return skb;
+}
+
+/* build_skb() is wrapper over __build_skb(), that specifically
+ * takes care of skb->head and skb->pfmemalloc
+ */
+struct sk_buff *build_skb(void *data, unsigned int frag_size)
+{
+ struct sk_buff *skb = __build_skb(data, frag_size);
+
+ if (likely(skb && frag_size)) {
+ skb->head_frag = 1;
+ skb_propagate_pfmemalloc(virt_to_head_page(data), skb);
}
-out:
return skb;
-nodata:
- kmem_cache_free(cache, skb);
- skb = NULL;
- goto out;
}
+EXPORT_SYMBOL(build_skb);
+
+/**
+ * build_skb_around - build a network buffer around provided skb
+ * @skb: sk_buff provide by caller, must be memset cleared
+ * @data: data buffer provided by caller
+ * @frag_size: size of data
+ */
+struct sk_buff *build_skb_around(struct sk_buff *skb,
+ void *data, unsigned int frag_size)
+{
+ if (unlikely(!skb))
+ return NULL;
+
+ __build_skb_around(skb, data, frag_size);
+
+ if (frag_size) {
+ skb->head_frag = 1;
+ skb_propagate_pfmemalloc(virt_to_head_page(data), skb);
+ }
+ return skb;
+}
+EXPORT_SYMBOL(build_skb_around);
+
+/**
+ * __napi_build_skb - build a network buffer
+ * @data: data buffer provided by caller
+ * @frag_size: size of data
+ *
+ * Version of __build_skb() that uses NAPI percpu caches to obtain
+ * skbuff_head instead of inplace allocation.
+ *
+ * Returns a new &sk_buff on success, %NULL on allocation failure.
+ */
+static struct sk_buff *__napi_build_skb(void *data, unsigned int frag_size)
+{
+ struct sk_buff *skb;
+
+ skb = napi_skb_cache_get(true);
+ if (unlikely(!skb))
+ return NULL;
+
+ memset(skb, 0, offsetof(struct sk_buff, tail));
+ __build_skb_around(skb, data, frag_size);
+
+ return skb;
+}
+
+/**
+ * napi_build_skb - build a network buffer
+ * @data: data buffer provided by caller
+ * @frag_size: size of data
+ *
+ * Version of __napi_build_skb() that takes care of skb->head_frag
+ * and skb->pfmemalloc when the data is a page or page fragment.
+ *
+ * Returns a new &sk_buff on success, %NULL on allocation failure.
+ */
+struct sk_buff *napi_build_skb(void *data, unsigned int frag_size)
+{
+ struct sk_buff *skb = __napi_build_skb(data, frag_size);
+
+ if (likely(skb) && frag_size) {
+ skb->head_frag = 1;
+ skb_propagate_pfmemalloc(virt_to_head_page(data), skb);
+ }
+
+ return skb;
+}
+EXPORT_SYMBOL(napi_build_skb);
+
+/*
+ * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells
+ * the caller if emergency pfmemalloc reserves are being used. If it is and
+ * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves
+ * may be used. Otherwise, the packet data may be discarded until enough
+ * memory is free
+ */
+static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node,
+ bool *pfmemalloc)
+{
+ bool ret_pfmemalloc = false;
+ size_t obj_size;
+ void *obj;
+
+ obj_size = SKB_HEAD_ALIGN(*size);
+ if (obj_size <= SKB_SMALL_HEAD_CACHE_SIZE &&
+ !(flags & KMALLOC_NOT_NORMAL_BITS)) {
+ obj = kmem_cache_alloc_node(net_hotdata.skb_small_head_cache,
+ flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
+ node);
+ *size = SKB_SMALL_HEAD_CACHE_SIZE;
+ if (obj || !(gfp_pfmemalloc_allowed(flags)))
+ goto out;
+ /* Try again but now we are using pfmemalloc reserves */
+ ret_pfmemalloc = true;
+ obj = kmem_cache_alloc_node(net_hotdata.skb_small_head_cache, flags, node);
+ goto out;
+ }
+
+ obj_size = kmalloc_size_roundup(obj_size);
+ /* The following cast might truncate high-order bits of obj_size, this
+ * is harmless because kmalloc(obj_size >= 2^32) will fail anyway.
+ */
+ *size = (unsigned int)obj_size;
+
+ /*
+ * Try a regular allocation, when that fails and we're not entitled
+ * to the reserves, fail.
+ */
+ obj = kmalloc_node_track_caller(obj_size,
+ flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
+ node);
+ if (obj || !(gfp_pfmemalloc_allowed(flags)))
+ goto out;
+
+ /* Try again but now we are using pfmemalloc reserves */
+ ret_pfmemalloc = true;
+ obj = kmalloc_node_track_caller(obj_size, flags, node);
+
+out:
+ if (pfmemalloc)
+ *pfmemalloc = ret_pfmemalloc;
+
+ return obj;
+}
+
+/* Allocate a new skbuff. We do this ourselves so we can fill in a few
+ * 'private' fields and also do memory statistics to find all the
+ * [BEEP] leaks.
+ *
+ */
/**
- * alloc_skb_from_cache - allocate a network buffer
- * @cp: kmem_cache from which to allocate the data area
- * (object size must be big enough for @size bytes + skb overheads)
+ * __alloc_skb - allocate a network buffer
* @size: size to allocate
* @gfp_mask: allocation mask
+ * @flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache
+ * instead of head cache and allocate a cloned (child) skb.
+ * If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for
+ * allocations in case the data is required for writeback
+ * @node: numa node to allocate memory on
*
- * Allocate a new &sk_buff. The returned buffer has no headroom and
- * tail room of size bytes. The object has a reference count of one.
- * The return is the buffer. On a failure the return is %NULL.
+ * Allocate a new &sk_buff. The returned buffer has no headroom and a
+ * tail room of at least size bytes. The object has a reference count
+ * of one. The return is the buffer. On a failure the return is %NULL.
*
* Buffers may only be allocated from interrupts using a @gfp_mask of
* %GFP_ATOMIC.
*/
-struct sk_buff *alloc_skb_from_cache(kmem_cache_t *cp,
- unsigned int size,
- gfp_t gfp_mask)
+struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
+ int flags, int node)
{
- struct sk_buff *skb;
+ struct sk_buff *skb = NULL;
+ struct kmem_cache *cache;
+ bool pfmemalloc;
u8 *data;
- /* Get the HEAD */
- skb = kmem_cache_alloc(skbuff_head_cache,
- gfp_mask & ~__GFP_DMA);
- if (!skb)
- goto out;
+ if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
+ gfp_mask |= __GFP_MEMALLOC;
- /* Get the DATA. */
- size = SKB_DATA_ALIGN(size);
- data = kmem_cache_alloc(cp, gfp_mask);
- if (!data)
+ if (flags & SKB_ALLOC_FCLONE) {
+ cache = net_hotdata.skbuff_fclone_cache;
+ goto fallback;
+ }
+ cache = net_hotdata.skbuff_cache;
+ if (unlikely(node != NUMA_NO_NODE && node != numa_mem_id()))
+ goto fallback;
+
+ if (flags & SKB_ALLOC_NAPI) {
+ skb = napi_skb_cache_get(true);
+ if (unlikely(!skb))
+ return NULL;
+ } else if (!in_hardirq() && !irqs_disabled()) {
+ local_bh_disable();
+ skb = napi_skb_cache_get(false);
+ local_bh_enable();
+ }
+
+ if (!skb) {
+fallback:
+ skb = kmem_cache_alloc_node(cache, gfp_mask & ~GFP_DMA, node);
+ if (unlikely(!skb))
+ return NULL;
+ }
+ prefetchw(skb);
+
+ /* We do our best to align skb_shared_info on a separate cache
+ * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives
+ * aligned memory blocks, unless SLUB/SLAB debug is enabled.
+ * Both skb->head and skb_shared_info are cache line aligned.
+ */
+ data = kmalloc_reserve(&size, gfp_mask, node, &pfmemalloc);
+ if (unlikely(!data))
goto nodata;
+ /* kmalloc_size_roundup() might give us more room than requested.
+ * Put skb_shared_info exactly at the end of allocated zone,
+ * to allow max possible filling before reallocation.
+ */
+ prefetchw(data + SKB_WITH_OVERHEAD(size));
- memset(skb, 0, offsetof(struct sk_buff, truesize));
- skb->truesize = size + sizeof(struct sk_buff);
- atomic_set(&skb->users, 1);
- skb->head = data;
- skb->data = data;
- skb->tail = data;
- skb->end = data + size;
+ /*
+ * Only clear those fields we need to clear, not those that we will
+ * actually initialise below. Hence, don't put any more fields after
+ * the tail pointer in struct sk_buff!
+ */
+ memset(skb, 0, offsetof(struct sk_buff, tail));
+ __build_skb_around(skb, data, size);
+ skb->pfmemalloc = pfmemalloc;
+
+ if (flags & SKB_ALLOC_FCLONE) {
+ struct sk_buff_fclones *fclones;
+
+ fclones = container_of(skb, struct sk_buff_fclones, skb1);
+
+ skb->fclone = SKB_FCLONE_ORIG;
+ refcount_set(&fclones->fclone_ref, 1);
+ }
- atomic_set(&(skb_shinfo(skb)->dataref), 1);
- skb_shinfo(skb)->nr_frags = 0;
- skb_shinfo(skb)->gso_size = 0;
- skb_shinfo(skb)->gso_segs = 0;
- skb_shinfo(skb)->gso_type = 0;
- skb_shinfo(skb)->frag_list = NULL;
-out:
return skb;
+
nodata:
- kmem_cache_free(skbuff_head_cache, skb);
- skb = NULL;
- goto out;
+ kmem_cache_free(cache, skb);
+ return NULL;
}
+EXPORT_SYMBOL(__alloc_skb);
/**
* __netdev_alloc_skb - allocate an skbuff for rx on a specific device
* @dev: network device to receive on
- * @length: length to allocate
+ * @len: length to allocate
* @gfp_mask: get_free_pages mask, passed to alloc_skb
*
* Allocate a new &sk_buff and assign it a usage count of one. The
- * buffer has unspecified headroom built in. Users should allocate
+ * buffer has NET_SKB_PAD headroom built in. Users should allocate
* the headroom they think they need without accounting for the
* built in space. The built in space is used for optimisations.
*
* %NULL is returned if there is no free memory.
*/
-struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
- unsigned int length, gfp_t gfp_mask)
+struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
+ gfp_t gfp_mask)
+{
+ struct page_frag_cache *nc;
+ struct sk_buff *skb;
+ bool pfmemalloc;
+ void *data;
+
+ len += NET_SKB_PAD;
+
+ /* If requested length is either too small or too big,
+ * we use kmalloc() for skb->head allocation.
+ */
+ if (len <= SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) ||
+ len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
+ (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
+ skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
+ if (!skb)
+ goto skb_fail;
+ goto skb_success;
+ }
+
+ len = SKB_HEAD_ALIGN(len);
+
+ if (sk_memalloc_socks())
+ gfp_mask |= __GFP_MEMALLOC;
+
+ if (in_hardirq() || irqs_disabled()) {
+ nc = this_cpu_ptr(&netdev_alloc_cache);
+ data = page_frag_alloc(nc, len, gfp_mask);
+ pfmemalloc = page_frag_cache_is_pfmemalloc(nc);
+ } else {
+ local_bh_disable();
+ local_lock_nested_bh(&napi_alloc_cache.bh_lock);
+
+ nc = this_cpu_ptr(&napi_alloc_cache.page);
+ data = page_frag_alloc(nc, len, gfp_mask);
+ pfmemalloc = page_frag_cache_is_pfmemalloc(nc);
+
+ local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
+ local_bh_enable();
+ }
+
+ if (unlikely(!data))
+ return NULL;
+
+ skb = __build_skb(data, len);
+ if (unlikely(!skb)) {
+ skb_free_frag(data);
+ return NULL;
+ }
+
+ if (pfmemalloc)
+ skb->pfmemalloc = 1;
+ skb->head_frag = 1;
+
+skb_success:
+ skb_reserve(skb, NET_SKB_PAD);
+ skb->dev = dev;
+
+skb_fail:
+ return skb;
+}
+EXPORT_SYMBOL(__netdev_alloc_skb);
+
+/**
+ * napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance
+ * @napi: napi instance this buffer was allocated for
+ * @len: length to allocate
+ *
+ * Allocate a new sk_buff for use in NAPI receive. This buffer will
+ * attempt to allocate the head from a special reserved region used
+ * only for NAPI Rx allocation. By doing this we can save several
+ * CPU cycles by avoiding having to disable and re-enable IRQs.
+ *
+ * %NULL is returned if there is no free memory.
+ */
+struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len)
{
+ gfp_t gfp_mask = GFP_ATOMIC | __GFP_NOWARN;
+ struct napi_alloc_cache *nc;
struct sk_buff *skb;
+ bool pfmemalloc;
+ void *data;
+
+ DEBUG_NET_WARN_ON_ONCE(!in_softirq());
+ len += NET_SKB_PAD + NET_IP_ALIGN;
+
+ /* If requested length is either too small or too big,
+ * we use kmalloc() for skb->head allocation.
+ */
+ if (len <= SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) ||
+ len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
+ (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
+ skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI,
+ NUMA_NO_NODE);
+ if (!skb)
+ goto skb_fail;
+ goto skb_success;
+ }
+
+ len = SKB_HEAD_ALIGN(len);
+
+ if (sk_memalloc_socks())
+ gfp_mask |= __GFP_MEMALLOC;
+
+ local_lock_nested_bh(&napi_alloc_cache.bh_lock);
+ nc = this_cpu_ptr(&napi_alloc_cache);
+
+ data = page_frag_alloc(&nc->page, len, gfp_mask);
+ pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page);
+ local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
- skb = alloc_skb(length + NET_SKB_PAD, gfp_mask);
- if (likely(skb)) {
- skb_reserve(skb, NET_SKB_PAD);
- skb->dev = dev;
+ if (unlikely(!data))
+ return NULL;
+
+ skb = __napi_build_skb(data, len);
+ if (unlikely(!skb)) {
+ skb_free_frag(data);
+ return NULL;
}
+
+ if (pfmemalloc)
+ skb->pfmemalloc = 1;
+ skb->head_frag = 1;
+
+skb_success:
+ skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
+ skb->dev = napi->dev;
+
+skb_fail:
return skb;
}
+EXPORT_SYMBOL(napi_alloc_skb);
-static void skb_drop_list(struct sk_buff **listp)
+void skb_add_rx_frag_netmem(struct sk_buff *skb, int i, netmem_ref netmem,
+ int off, int size, unsigned int truesize)
{
- struct sk_buff *list = *listp;
+ DEBUG_NET_WARN_ON_ONCE(size > truesize);
- *listp = NULL;
+ skb_fill_netmem_desc(skb, i, netmem, off, size);
+ skb->len += size;
+ skb->data_len += size;
+ skb->truesize += truesize;
+}
+EXPORT_SYMBOL(skb_add_rx_frag_netmem);
- do {
- struct sk_buff *this = list;
- list = list->next;
- kfree_skb(this);
- } while (list);
+void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size,
+ unsigned int truesize)
+{
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+
+ DEBUG_NET_WARN_ON_ONCE(size > truesize);
+
+ skb_frag_size_add(frag, size);
+ skb->len += size;
+ skb->data_len += size;
+ skb->truesize += truesize;
+}
+EXPORT_SYMBOL(skb_coalesce_rx_frag);
+
+static void skb_drop_list(struct sk_buff **listp)
+{
+ kfree_skb_list(*listp);
+ *listp = NULL;
}
static inline void skb_drop_fraglist(struct sk_buff *skb)
@@ -298,61 +911,275 @@ static void skb_clone_fraglist(struct sk_buff *skb)
{
struct sk_buff *list;
- for (list = skb_shinfo(skb)->frag_list; list; list = list->next)
+ skb_walk_frags(skb, list)
skb_get(list);
}
-static void skb_release_data(struct sk_buff *skb)
+int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb,
+ unsigned int headroom)
{
- if (!skb->cloned ||
- !atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
- &skb_shinfo(skb)->dataref)) {
- if (skb_shinfo(skb)->nr_frags) {
- int i;
- for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
- put_page(skb_shinfo(skb)->frags[i].page);
+#if IS_ENABLED(CONFIG_PAGE_POOL)
+ u32 size, truesize, len, max_head_size, off;
+ struct sk_buff *skb = *pskb, *nskb;
+ int err, i, head_off;
+ void *data;
+
+ /* XDP does not support fraglist so we need to linearize
+ * the skb.
+ */
+ if (skb_has_frag_list(skb))
+ return -EOPNOTSUPP;
+
+ max_head_size = SKB_WITH_OVERHEAD(PAGE_SIZE - headroom);
+ if (skb->len > max_head_size + MAX_SKB_FRAGS * PAGE_SIZE)
+ return -ENOMEM;
+
+ size = min_t(u32, skb->len, max_head_size);
+ truesize = SKB_HEAD_ALIGN(size) + headroom;
+ data = page_pool_dev_alloc_va(pool, &truesize);
+ if (!data)
+ return -ENOMEM;
+
+ nskb = napi_build_skb(data, truesize);
+ if (!nskb) {
+ page_pool_free_va(pool, data, true);
+ return -ENOMEM;
+ }
+
+ skb_reserve(nskb, headroom);
+ skb_copy_header(nskb, skb);
+ skb_mark_for_recycle(nskb);
+
+ err = skb_copy_bits(skb, 0, nskb->data, size);
+ if (err) {
+ consume_skb(nskb);
+ return err;
+ }
+ skb_put(nskb, size);
+
+ head_off = skb_headroom(nskb) - skb_headroom(skb);
+ skb_headers_offset_update(nskb, head_off);
+
+ off = size;
+ len = skb->len - off;
+ for (i = 0; i < MAX_SKB_FRAGS && off < skb->len; i++) {
+ struct page *page;
+ u32 page_off;
+
+ size = min_t(u32, len, PAGE_SIZE);
+ truesize = size;
+
+ page = page_pool_dev_alloc(pool, &page_off, &truesize);
+ if (!page) {
+ consume_skb(nskb);
+ return -ENOMEM;
}
- if (skb_shinfo(skb)->frag_list)
- skb_drop_fraglist(skb);
+ skb_add_rx_frag(nskb, i, page, page_off, size, truesize);
+ err = skb_copy_bits(skb, off, page_address(page) + page_off,
+ size);
+ if (err) {
+ consume_skb(nskb);
+ return err;
+ }
+
+ len -= size;
+ off += size;
+ }
+
+ consume_skb(skb);
+ *pskb = nskb;
+
+ return 0;
+#else
+ return -EOPNOTSUPP;
+#endif
+}
+EXPORT_SYMBOL(skb_pp_cow_data);
+
+int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb,
+ const struct bpf_prog *prog)
+{
+ if (!prog->aux->xdp_has_frags)
+ return -EINVAL;
- kfree(skb->head);
+ return skb_pp_cow_data(pool, pskb, XDP_PACKET_HEADROOM);
+}
+EXPORT_SYMBOL(skb_cow_data_for_xdp);
+
+#if IS_ENABLED(CONFIG_PAGE_POOL)
+bool napi_pp_put_page(netmem_ref netmem)
+{
+ netmem = netmem_compound_head(netmem);
+
+ if (unlikely(!netmem_is_pp(netmem)))
+ return false;
+
+ page_pool_put_full_netmem(netmem_get_pp(netmem), netmem, false);
+
+ return true;
+}
+EXPORT_SYMBOL(napi_pp_put_page);
+#endif
+
+static bool skb_pp_recycle(struct sk_buff *skb, void *data)
+{
+ if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle)
+ return false;
+ return napi_pp_put_page(page_to_netmem(virt_to_page(data)));
+}
+
+/**
+ * skb_pp_frag_ref() - Increase fragment references of a page pool aware skb
+ * @skb: page pool aware skb
+ *
+ * Increase the fragment reference count (pp_ref_count) of a skb. This is
+ * intended to gain fragment references only for page pool aware skbs,
+ * i.e. when skb->pp_recycle is true, and not for fragments in a
+ * non-pp-recycling skb. It has a fallback to increase references on normal
+ * pages, as page pool aware skbs may also have normal page fragments.
+ */
+static int skb_pp_frag_ref(struct sk_buff *skb)
+{
+ struct skb_shared_info *shinfo;
+ netmem_ref head_netmem;
+ int i;
+
+ if (!skb->pp_recycle)
+ return -EINVAL;
+
+ shinfo = skb_shinfo(skb);
+
+ for (i = 0; i < shinfo->nr_frags; i++) {
+ head_netmem = netmem_compound_head(shinfo->frags[i].netmem);
+ if (likely(netmem_is_pp(head_netmem)))
+ page_pool_ref_netmem(head_netmem);
+ else
+ page_ref_inc(netmem_to_page(head_netmem));
+ }
+ return 0;
+}
+
+static void skb_kfree_head(void *head, unsigned int end_offset)
+{
+ if (end_offset == SKB_SMALL_HEAD_HEADROOM)
+ kmem_cache_free(net_hotdata.skb_small_head_cache, head);
+ else
+ kfree(head);
+}
+
+static void skb_free_head(struct sk_buff *skb)
+{
+ unsigned char *head = skb->head;
+
+ if (skb->head_frag) {
+ if (skb_pp_recycle(skb, head))
+ return;
+ skb_free_frag(head);
+ } else {
+ skb_kfree_head(head, skb_end_offset(skb));
}
}
+static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason)
+{
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+ int i;
+
+ if (!skb_data_unref(skb, shinfo))
+ goto exit;
+
+ if (skb_zcopy(skb)) {
+ bool skip_unref = shinfo->flags & SKBFL_MANAGED_FRAG_REFS;
+
+ skb_zcopy_clear(skb, true);
+ if (skip_unref)
+ goto free_head;
+ }
+
+ for (i = 0; i < shinfo->nr_frags; i++)
+ __skb_frag_unref(&shinfo->frags[i], skb->pp_recycle);
+
+free_head:
+ if (shinfo->frag_list)
+ kfree_skb_list_reason(shinfo->frag_list, reason);
+
+ skb_free_head(skb);
+exit:
+ /* When we clone an SKB we copy the reycling bit. The pp_recycle
+ * bit is only set on the head though, so in order to avoid races
+ * while trying to recycle fragments on __skb_frag_unref() we need
+ * to make one SKB responsible for triggering the recycle path.
+ * So disable the recycling bit if an SKB is cloned and we have
+ * additional references to the fragmented part of the SKB.
+ * Eventually the last SKB will have the recycling bit set and it's
+ * dataref set to 0, which will trigger the recycling
+ */
+ skb->pp_recycle = 0;
+}
+
/*
* Free an skbuff by memory without cleaning the state.
*/
-void kfree_skbmem(struct sk_buff *skb)
+static void kfree_skbmem(struct sk_buff *skb)
{
- struct sk_buff *other;
- atomic_t *fclone_ref;
+ struct sk_buff_fclones *fclones;
- skb_release_data(skb);
switch (skb->fclone) {
case SKB_FCLONE_UNAVAILABLE:
- kmem_cache_free(skbuff_head_cache, skb);
- break;
+ kmem_cache_free(net_hotdata.skbuff_cache, skb);
+ return;
case SKB_FCLONE_ORIG:
- fclone_ref = (atomic_t *) (skb + 2);
- if (atomic_dec_and_test(fclone_ref))
- kmem_cache_free(skbuff_fclone_cache, skb);
- break;
-
- case SKB_FCLONE_CLONE:
- fclone_ref = (atomic_t *) (skb + 1);
- other = skb - 1;
+ fclones = container_of(skb, struct sk_buff_fclones, skb1);
- /* The clone portion is available for
- * fast-cloning again.
+ /* We usually free the clone (TX completion) before original skb
+ * This test would have no chance to be true for the clone,
+ * while here, branch prediction will be good.
*/
- skb->fclone = SKB_FCLONE_UNAVAILABLE;
+ if (refcount_read(&fclones->fclone_ref) == 1)
+ goto fastpath;
+ break;
- if (atomic_dec_and_test(fclone_ref))
- kmem_cache_free(skbuff_fclone_cache, other);
+ default: /* SKB_FCLONE_CLONE */
+ fclones = container_of(skb, struct sk_buff_fclones, skb2);
break;
- };
+ }
+ if (!refcount_dec_and_test(&fclones->fclone_ref))
+ return;
+fastpath:
+ kmem_cache_free(net_hotdata.skbuff_fclone_cache, fclones);
+}
+
+void skb_release_head_state(struct sk_buff *skb)
+{
+ skb_dst_drop(skb);
+ if (skb->destructor) {
+ DEBUG_NET_WARN_ON_ONCE(in_hardirq());
+#ifdef CONFIG_INET
+ INDIRECT_CALL_4(skb->destructor,
+ tcp_wfree, __sock_wfree, sock_wfree,
+ xsk_destruct_skb,
+ skb);
+#else
+ INDIRECT_CALL_2(skb->destructor,
+ sock_wfree, xsk_destruct_skb,
+ skb);
+
+#endif
+ skb->destructor = NULL;
+ skb->sk = NULL;
+ }
+ nf_reset_ct(skb);
+ skb_ext_reset(skb);
+}
+
+/* Free everything but the sk_buff shell. */
+static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason)
+{
+ skb_release_head_state(skb);
+ if (likely(skb->head))
+ skb_release_data(skb, reason);
}
/**
@@ -366,52 +1193,864 @@ void kfree_skbmem(struct sk_buff *skb)
void __kfree_skb(struct sk_buff *skb)
{
- dst_release(skb->dst);
-#ifdef CONFIG_XFRM
- secpath_put(skb->sp);
+ skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED);
+ kfree_skbmem(skb);
+}
+EXPORT_SYMBOL(__kfree_skb);
+
+static __always_inline
+bool __sk_skb_reason_drop(struct sock *sk, struct sk_buff *skb,
+ enum skb_drop_reason reason)
+{
+ if (unlikely(!skb_unref(skb)))
+ return false;
+
+ DEBUG_NET_WARN_ON_ONCE(reason == SKB_NOT_DROPPED_YET ||
+ u32_get_bits(reason,
+ SKB_DROP_REASON_SUBSYS_MASK) >=
+ SKB_DROP_REASON_SUBSYS_NUM);
+
+ if (reason == SKB_CONSUMED)
+ trace_consume_skb(skb, __builtin_return_address(0));
+ else
+ trace_kfree_skb(skb, __builtin_return_address(0), reason, sk);
+ return true;
+}
+
+/**
+ * sk_skb_reason_drop - free an sk_buff with special reason
+ * @sk: the socket to receive @skb, or NULL if not applicable
+ * @skb: buffer to free
+ * @reason: reason why this skb is dropped
+ *
+ * Drop a reference to the buffer and free it if the usage count has hit
+ * zero. Meanwhile, pass the receiving socket and drop reason to
+ * 'kfree_skb' tracepoint.
+ */
+void __fix_address
+sk_skb_reason_drop(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason reason)
+{
+ if (__sk_skb_reason_drop(sk, skb, reason))
+ __kfree_skb(skb);
+}
+EXPORT_SYMBOL(sk_skb_reason_drop);
+
+#define KFREE_SKB_BULK_SIZE 16
+
+struct skb_free_array {
+ unsigned int skb_count;
+ void *skb_array[KFREE_SKB_BULK_SIZE];
+};
+
+static void kfree_skb_add_bulk(struct sk_buff *skb,
+ struct skb_free_array *sa,
+ enum skb_drop_reason reason)
+{
+ /* if SKB is a clone, don't handle this case */
+ if (unlikely(skb->fclone != SKB_FCLONE_UNAVAILABLE)) {
+ __kfree_skb(skb);
+ return;
+ }
+
+ skb_release_all(skb, reason);
+ sa->skb_array[sa->skb_count++] = skb;
+
+ if (unlikely(sa->skb_count == KFREE_SKB_BULK_SIZE)) {
+ kmem_cache_free_bulk(net_hotdata.skbuff_cache, KFREE_SKB_BULK_SIZE,
+ sa->skb_array);
+ sa->skb_count = 0;
+ }
+}
+
+void __fix_address
+kfree_skb_list_reason(struct sk_buff *segs, enum skb_drop_reason reason)
+{
+ struct skb_free_array sa;
+
+ sa.skb_count = 0;
+
+ while (segs) {
+ struct sk_buff *next = segs->next;
+
+ if (__sk_skb_reason_drop(NULL, segs, reason)) {
+ skb_poison_list(segs);
+ kfree_skb_add_bulk(segs, &sa, reason);
+ }
+
+ segs = next;
+ }
+
+ if (sa.skb_count)
+ kmem_cache_free_bulk(net_hotdata.skbuff_cache, sa.skb_count, sa.skb_array);
+}
+EXPORT_SYMBOL(kfree_skb_list_reason);
+
+/* Dump skb information and contents.
+ *
+ * Must only be called from net_ratelimit()-ed paths.
+ *
+ * Dumps whole packets if full_pkt, only headers otherwise.
+ */
+void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt)
+{
+ struct skb_shared_info *sh = skb_shinfo(skb);
+ struct net_device *dev = skb->dev;
+ struct sock *sk = skb->sk;
+ struct sk_buff *list_skb;
+ bool has_mac, has_trans;
+ int headroom, tailroom;
+ int i, len, seg_len;
+
+ if (full_pkt)
+ len = skb->len;
+ else
+ len = min_t(int, skb->len, MAX_HEADER + 128);
+
+ headroom = skb_headroom(skb);
+ tailroom = skb_tailroom(skb);
+
+ has_mac = skb_mac_header_was_set(skb);
+ has_trans = skb_transport_header_was_set(skb);
+
+ printk("%sskb len=%u headroom=%u headlen=%u tailroom=%u\n"
+ "mac=(%d,%d) mac_len=%u net=(%d,%d) trans=%d\n"
+ "shinfo(txflags=%u nr_frags=%u gso(size=%hu type=%u segs=%hu))\n"
+ "csum(0x%x start=%u offset=%u ip_summed=%u complete_sw=%u valid=%u level=%u)\n"
+ "hash(0x%x sw=%u l4=%u) proto=0x%04x pkttype=%u iif=%d\n"
+ "priority=0x%x mark=0x%x alloc_cpu=%u vlan_all=0x%x\n"
+ "encapsulation=%d inner(proto=0x%04x, mac=%u, net=%u, trans=%u)\n",
+ level, skb->len, headroom, skb_headlen(skb), tailroom,
+ has_mac ? skb->mac_header : -1,
+ has_mac ? skb_mac_header_len(skb) : -1,
+ skb->mac_len,
+ skb->network_header,
+ has_trans ? skb_network_header_len(skb) : -1,
+ has_trans ? skb->transport_header : -1,
+ sh->tx_flags, sh->nr_frags,
+ sh->gso_size, sh->gso_type, sh->gso_segs,
+ skb->csum, skb->csum_start, skb->csum_offset, skb->ip_summed,
+ skb->csum_complete_sw, skb->csum_valid, skb->csum_level,
+ skb->hash, skb->sw_hash, skb->l4_hash,
+ ntohs(skb->protocol), skb->pkt_type, skb->skb_iif,
+ skb->priority, skb->mark, skb->alloc_cpu, skb->vlan_all,
+ skb->encapsulation, skb->inner_protocol, skb->inner_mac_header,
+ skb->inner_network_header, skb->inner_transport_header);
+
+ if (dev)
+ printk("%sdev name=%s feat=%pNF\n",
+ level, dev->name, &dev->features);
+ if (sk)
+ printk("%ssk family=%hu type=%u proto=%u\n",
+ level, sk->sk_family, sk->sk_type, sk->sk_protocol);
+
+ if (full_pkt && headroom)
+ print_hex_dump(level, "skb headroom: ", DUMP_PREFIX_OFFSET,
+ 16, 1, skb->head, headroom, false);
+
+ seg_len = min_t(int, skb_headlen(skb), len);
+ if (seg_len)
+ print_hex_dump(level, "skb linear: ", DUMP_PREFIX_OFFSET,
+ 16, 1, skb->data, seg_len, false);
+ len -= seg_len;
+
+ if (full_pkt && tailroom)
+ print_hex_dump(level, "skb tailroom: ", DUMP_PREFIX_OFFSET,
+ 16, 1, skb_tail_pointer(skb), tailroom, false);
+
+ for (i = 0; len && i < skb_shinfo(skb)->nr_frags; i++) {
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+ u32 p_off, p_len, copied;
+ struct page *p;
+ u8 *vaddr;
+
+ if (skb_frag_is_net_iov(frag)) {
+ printk("%sskb frag %d: not readable\n", level, i);
+ len -= skb_frag_size(frag);
+ if (!len)
+ break;
+ continue;
+ }
+
+ skb_frag_foreach_page(frag, skb_frag_off(frag),
+ skb_frag_size(frag), p, p_off, p_len,
+ copied) {
+ seg_len = min_t(int, p_len, len);
+ vaddr = kmap_atomic(p);
+ print_hex_dump(level, "skb frag: ",
+ DUMP_PREFIX_OFFSET,
+ 16, 1, vaddr + p_off, seg_len, false);
+ kunmap_atomic(vaddr);
+ len -= seg_len;
+ if (!len)
+ break;
+ }
+ }
+
+ if (full_pkt && skb_has_frag_list(skb)) {
+ printk("skb fraglist:\n");
+ skb_walk_frags(skb, list_skb)
+ skb_dump(level, list_skb, true);
+ }
+}
+EXPORT_SYMBOL(skb_dump);
+
+/**
+ * skb_tx_error - report an sk_buff xmit error
+ * @skb: buffer that triggered an error
+ *
+ * Report xmit error if a device callback is tracking this skb.
+ * skb must be freed afterwards.
+ */
+void skb_tx_error(struct sk_buff *skb)
+{
+ if (skb) {
+ skb_zcopy_downgrade_managed(skb);
+ skb_zcopy_clear(skb, true);
+ }
+}
+EXPORT_SYMBOL(skb_tx_error);
+
+#ifdef CONFIG_TRACEPOINTS
+/**
+ * consume_skb - free an skbuff
+ * @skb: buffer to free
+ *
+ * Drop a ref to the buffer and free it if the usage count has hit zero
+ * Functions identically to kfree_skb, but kfree_skb assumes that the frame
+ * is being dropped after a failure and notes that
+ */
+void consume_skb(struct sk_buff *skb)
+{
+ if (!skb_unref(skb))
+ return;
+
+ trace_consume_skb(skb, __builtin_return_address(0));
+ __kfree_skb(skb);
+}
+EXPORT_SYMBOL(consume_skb);
#endif
- if (skb->destructor) {
- WARN_ON(in_irq());
- skb->destructor(skb);
+
+/**
+ * __consume_stateless_skb - free an skbuff, assuming it is stateless
+ * @skb: buffer to free
+ *
+ * Alike consume_skb(), but this variant assumes that this is the last
+ * skb reference and all the head states have been already dropped
+ */
+void __consume_stateless_skb(struct sk_buff *skb)
+{
+ trace_consume_skb(skb, __builtin_return_address(0));
+ skb_release_data(skb, SKB_CONSUMED);
+ kfree_skbmem(skb);
+}
+
+static void napi_skb_cache_put(struct sk_buff *skb)
+{
+ struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+
+ if (!kasan_mempool_poison_object(skb))
+ return;
+
+ local_lock_nested_bh(&napi_alloc_cache.bh_lock);
+ nc->skb_cache[nc->skb_count++] = skb;
+
+ if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) {
+ u32 i, remaining = NAPI_SKB_CACHE_SIZE - NAPI_SKB_CACHE_FREE;
+
+ for (i = remaining; i < NAPI_SKB_CACHE_SIZE; i++)
+ kasan_mempool_unpoison_object(nc->skb_cache[i],
+ skbuff_cache_size);
+
+ kmem_cache_free_bulk(net_hotdata.skbuff_cache,
+ NAPI_SKB_CACHE_FREE,
+ nc->skb_cache + remaining);
+ nc->skb_count = remaining;
+ }
+ local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
+}
+
+void __napi_kfree_skb(struct sk_buff *skb, enum skb_drop_reason reason)
+{
+ skb_release_all(skb, reason);
+ napi_skb_cache_put(skb);
+}
+
+void napi_skb_free_stolen_head(struct sk_buff *skb)
+{
+ if (unlikely(skb->slow_gro)) {
+ nf_reset_ct(skb);
+ skb_dst_drop(skb);
+ skb_ext_put(skb);
+ skb_orphan(skb);
+ skb->slow_gro = 0;
+ }
+ napi_skb_cache_put(skb);
+}
+
+void napi_consume_skb(struct sk_buff *skb, int budget)
+{
+ /* Zero budget indicate non-NAPI context called us, like netpoll */
+ if (unlikely(!budget || !skb)) {
+ dev_consume_skb_any(skb);
+ return;
}
-#ifdef CONFIG_NETFILTER
- nf_conntrack_put(skb->nfct);
-#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
- nf_conntrack_put_reasm(skb->nfct_reasm);
+
+ DEBUG_NET_WARN_ON_ONCE(!in_softirq());
+
+ if (skb->alloc_cpu != smp_processor_id() && !skb_shared(skb)) {
+ skb_release_head_state(skb);
+ return skb_attempt_defer_free(skb);
+ }
+
+ if (!skb_unref(skb))
+ return;
+
+ /* if reaching here SKB is ready to free */
+ trace_consume_skb(skb, __builtin_return_address(0));
+
+ /* if SKB is a clone, don't handle this case */
+ if (skb->fclone != SKB_FCLONE_UNAVAILABLE) {
+ __kfree_skb(skb);
+ return;
+ }
+
+ skb_release_all(skb, SKB_CONSUMED);
+ napi_skb_cache_put(skb);
+}
+EXPORT_SYMBOL(napi_consume_skb);
+
+/* Make sure a field is contained by headers group */
+#define CHECK_SKB_FIELD(field) \
+ BUILD_BUG_ON(offsetof(struct sk_buff, field) != \
+ offsetof(struct sk_buff, headers.field)); \
+
+static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
+{
+ new->tstamp = old->tstamp;
+ /* We do not copy old->sk */
+ new->dev = old->dev;
+ memcpy(new->cb, old->cb, sizeof(old->cb));
+ skb_dst_copy(new, old);
+ __skb_ext_copy(new, old);
+ __nf_copy(new, old, false);
+
+ /* Note : this field could be in the headers group.
+ * It is not yet because we do not want to have a 16 bit hole
+ */
+ new->queue_mapping = old->queue_mapping;
+
+ memcpy(&new->headers, &old->headers, sizeof(new->headers));
+ CHECK_SKB_FIELD(protocol);
+ CHECK_SKB_FIELD(csum);
+ CHECK_SKB_FIELD(hash);
+ CHECK_SKB_FIELD(priority);
+ CHECK_SKB_FIELD(skb_iif);
+ CHECK_SKB_FIELD(vlan_proto);
+ CHECK_SKB_FIELD(vlan_tci);
+ CHECK_SKB_FIELD(transport_header);
+ CHECK_SKB_FIELD(network_header);
+ CHECK_SKB_FIELD(mac_header);
+ CHECK_SKB_FIELD(inner_protocol);
+ CHECK_SKB_FIELD(inner_transport_header);
+ CHECK_SKB_FIELD(inner_network_header);
+ CHECK_SKB_FIELD(inner_mac_header);
+ CHECK_SKB_FIELD(mark);
+#ifdef CONFIG_NETWORK_SECMARK
+ CHECK_SKB_FIELD(secmark);
#endif
-#ifdef CONFIG_BRIDGE_NETFILTER
- nf_bridge_put(skb->nf_bridge);
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ CHECK_SKB_FIELD(napi_id);
#endif
+ CHECK_SKB_FIELD(alloc_cpu);
+#ifdef CONFIG_XPS
+ CHECK_SKB_FIELD(sender_cpu);
#endif
-/* XXX: IS this still necessary? - JHS */
#ifdef CONFIG_NET_SCHED
- skb->tc_index = 0;
-#ifdef CONFIG_NET_CLS_ACT
- skb->tc_verd = 0;
-#endif
+ CHECK_SKB_FIELD(tc_index);
#endif
- kfree_skbmem(skb);
+}
+
+/*
+ * You should not add any new code to this function. Add it to
+ * __copy_skb_header above instead.
+ */
+static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
+{
+#define C(x) n->x = skb->x
+
+ n->next = n->prev = NULL;
+ n->sk = NULL;
+ __copy_skb_header(n, skb);
+
+ C(len);
+ C(data_len);
+ C(mac_len);
+ n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len;
+ n->cloned = 1;
+ n->nohdr = 0;
+ n->peeked = 0;
+ C(pfmemalloc);
+ C(pp_recycle);
+ n->destructor = NULL;
+ C(tail);
+ C(end);
+ C(head);
+ C(head_frag);
+ C(data);
+ C(truesize);
+ refcount_set(&n->users, 1);
+
+ atomic_inc(&(skb_shinfo(skb)->dataref));
+ skb->cloned = 1;
+
+ return n;
+#undef C
}
/**
- * kfree_skb - free an sk_buff
- * @skb: buffer to free
+ * alloc_skb_for_msg() - allocate sk_buff to wrap frag list forming a msg
+ * @first: first sk_buff of the msg
+ */
+struct sk_buff *alloc_skb_for_msg(struct sk_buff *first)
+{
+ struct sk_buff *n;
+
+ n = alloc_skb(0, GFP_ATOMIC);
+ if (!n)
+ return NULL;
+
+ n->len = first->len;
+ n->data_len = first->len;
+ n->truesize = first->truesize;
+
+ skb_shinfo(n)->frag_list = first;
+
+ __copy_skb_header(n, first);
+ n->destructor = NULL;
+
+ return n;
+}
+EXPORT_SYMBOL_GPL(alloc_skb_for_msg);
+
+/**
+ * skb_morph - morph one skb into another
+ * @dst: the skb to receive the contents
+ * @src: the skb to supply the contents
+ *
+ * This is identical to skb_clone except that the target skb is
+ * supplied by the user.
*
- * Drop a reference to the buffer and free it if the usage count has
- * hit zero.
+ * The target skb is returned upon exit.
*/
-void kfree_skb(struct sk_buff *skb)
+struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
{
- if (unlikely(!skb))
- return;
- if (likely(atomic_read(&skb->users) == 1))
- smp_rmb();
- else if (likely(!atomic_dec_and_test(&skb->users)))
- return;
- __kfree_skb(skb);
+ skb_release_all(dst, SKB_CONSUMED);
+ return __skb_clone(dst, src);
+}
+EXPORT_SYMBOL_GPL(skb_morph);
+
+int mm_account_pinned_pages(struct mmpin *mmp, size_t size)
+{
+ unsigned long max_pg, num_pg, new_pg, old_pg, rlim;
+ struct user_struct *user;
+
+ if (capable(CAP_IPC_LOCK) || !size)
+ return 0;
+
+ rlim = rlimit(RLIMIT_MEMLOCK);
+ if (rlim == RLIM_INFINITY)
+ return 0;
+
+ num_pg = (size >> PAGE_SHIFT) + 2; /* worst case */
+ max_pg = rlim >> PAGE_SHIFT;
+ user = mmp->user ? : current_user();
+
+ old_pg = atomic_long_read(&user->locked_vm);
+ do {
+ new_pg = old_pg + num_pg;
+ if (new_pg > max_pg)
+ return -ENOBUFS;
+ } while (!atomic_long_try_cmpxchg(&user->locked_vm, &old_pg, new_pg));
+
+ if (!mmp->user) {
+ mmp->user = get_uid(user);
+ mmp->num_pg = num_pg;
+ } else {
+ mmp->num_pg += num_pg;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(mm_account_pinned_pages);
+
+void mm_unaccount_pinned_pages(struct mmpin *mmp)
+{
+ if (mmp->user) {
+ atomic_long_sub(mmp->num_pg, &mmp->user->locked_vm);
+ free_uid(mmp->user);
+ }
+}
+EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages);
+
+static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size,
+ bool devmem)
+{
+ struct ubuf_info_msgzc *uarg;
+ struct sk_buff *skb;
+
+ WARN_ON_ONCE(!in_task());
+
+ skb = sock_omalloc(sk, 0, GFP_KERNEL);
+ if (!skb)
+ return NULL;
+
+ BUILD_BUG_ON(sizeof(*uarg) > sizeof(skb->cb));
+ uarg = (void *)skb->cb;
+ uarg->mmp.user = NULL;
+
+ if (likely(!devmem) && mm_account_pinned_pages(&uarg->mmp, size)) {
+ kfree_skb(skb);
+ return NULL;
+ }
+
+ uarg->ubuf.ops = &msg_zerocopy_ubuf_ops;
+ uarg->id = ((u32)atomic_inc_return(&sk->sk_zckey)) - 1;
+ uarg->len = 1;
+ uarg->bytelen = size;
+ uarg->zerocopy = 1;
+ uarg->ubuf.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN;
+ refcount_set(&uarg->ubuf.refcnt, 1);
+ sock_hold(sk);
+
+ return &uarg->ubuf;
+}
+
+static inline struct sk_buff *skb_from_uarg(struct ubuf_info_msgzc *uarg)
+{
+ return container_of((void *)uarg, struct sk_buff, cb);
+}
+
+struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
+ struct ubuf_info *uarg, bool devmem)
+{
+ if (uarg) {
+ struct ubuf_info_msgzc *uarg_zc;
+ const u32 byte_limit = 1 << 19; /* limit to a few TSO */
+ u32 bytelen, next;
+
+ /* there might be non MSG_ZEROCOPY users */
+ if (uarg->ops != &msg_zerocopy_ubuf_ops)
+ return NULL;
+
+ /* realloc only when socket is locked (TCP, UDP cork),
+ * so uarg->len and sk_zckey access is serialized
+ */
+ if (!sock_owned_by_user(sk)) {
+ WARN_ON_ONCE(1);
+ return NULL;
+ }
+
+ uarg_zc = uarg_to_msgzc(uarg);
+ bytelen = uarg_zc->bytelen + size;
+ if (uarg_zc->len == USHRT_MAX - 1 || bytelen > byte_limit) {
+ /* TCP can create new skb to attach new uarg */
+ if (sk->sk_type == SOCK_STREAM)
+ goto new_alloc;
+ return NULL;
+ }
+
+ next = (u32)atomic_read(&sk->sk_zckey);
+ if ((u32)(uarg_zc->id + uarg_zc->len) == next) {
+ if (likely(!devmem) &&
+ mm_account_pinned_pages(&uarg_zc->mmp, size))
+ return NULL;
+ uarg_zc->len++;
+ uarg_zc->bytelen = bytelen;
+ atomic_set(&sk->sk_zckey, ++next);
+
+ /* no extra ref when appending to datagram (MSG_MORE) */
+ if (sk->sk_type == SOCK_STREAM)
+ net_zcopy_get(uarg);
+
+ return uarg;
+ }
+ }
+
+new_alloc:
+ return msg_zerocopy_alloc(sk, size, devmem);
+}
+EXPORT_SYMBOL_GPL(msg_zerocopy_realloc);
+
+static bool skb_zerocopy_notify_extend(struct sk_buff *skb, u32 lo, u16 len)
+{
+ struct sock_exterr_skb *serr = SKB_EXT_ERR(skb);
+ u32 old_lo, old_hi;
+ u64 sum_len;
+
+ old_lo = serr->ee.ee_info;
+ old_hi = serr->ee.ee_data;
+ sum_len = old_hi - old_lo + 1ULL + len;
+
+ if (sum_len >= (1ULL << 32))
+ return false;
+
+ if (lo != old_hi + 1)
+ return false;
+
+ serr->ee.ee_data += len;
+ return true;
}
+static void __msg_zerocopy_callback(struct ubuf_info_msgzc *uarg)
+{
+ struct sk_buff *tail, *skb = skb_from_uarg(uarg);
+ struct sock_exterr_skb *serr;
+ struct sock *sk = skb->sk;
+ struct sk_buff_head *q;
+ unsigned long flags;
+ bool is_zerocopy;
+ u32 lo, hi;
+ u16 len;
+
+ mm_unaccount_pinned_pages(&uarg->mmp);
+
+ /* if !len, there was only 1 call, and it was aborted
+ * so do not queue a completion notification
+ */
+ if (!uarg->len || sock_flag(sk, SOCK_DEAD))
+ goto release;
+
+ len = uarg->len;
+ lo = uarg->id;
+ hi = uarg->id + len - 1;
+ is_zerocopy = uarg->zerocopy;
+
+ serr = SKB_EXT_ERR(skb);
+ memset(serr, 0, sizeof(*serr));
+ serr->ee.ee_errno = 0;
+ serr->ee.ee_origin = SO_EE_ORIGIN_ZEROCOPY;
+ serr->ee.ee_data = hi;
+ serr->ee.ee_info = lo;
+ if (!is_zerocopy)
+ serr->ee.ee_code |= SO_EE_CODE_ZEROCOPY_COPIED;
+
+ q = &sk->sk_error_queue;
+ spin_lock_irqsave(&q->lock, flags);
+ tail = skb_peek_tail(q);
+ if (!tail || SKB_EXT_ERR(tail)->ee.ee_origin != SO_EE_ORIGIN_ZEROCOPY ||
+ !skb_zerocopy_notify_extend(tail, lo, len)) {
+ __skb_queue_tail(q, skb);
+ skb = NULL;
+ }
+ spin_unlock_irqrestore(&q->lock, flags);
+
+ sk_error_report(sk);
+
+release:
+ consume_skb(skb);
+ sock_put(sk);
+}
+
+static void msg_zerocopy_complete(struct sk_buff *skb, struct ubuf_info *uarg,
+ bool success)
+{
+ struct ubuf_info_msgzc *uarg_zc = uarg_to_msgzc(uarg);
+
+ uarg_zc->zerocopy = uarg_zc->zerocopy & success;
+
+ if (refcount_dec_and_test(&uarg->refcnt))
+ __msg_zerocopy_callback(uarg_zc);
+}
+
+void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref)
+{
+ struct sock *sk = skb_from_uarg(uarg_to_msgzc(uarg))->sk;
+
+ atomic_dec(&sk->sk_zckey);
+ uarg_to_msgzc(uarg)->len--;
+
+ if (have_uref)
+ msg_zerocopy_complete(NULL, uarg, true);
+}
+EXPORT_SYMBOL_GPL(msg_zerocopy_put_abort);
+
+const struct ubuf_info_ops msg_zerocopy_ubuf_ops = {
+ .complete = msg_zerocopy_complete,
+};
+EXPORT_SYMBOL_GPL(msg_zerocopy_ubuf_ops);
+
+int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
+ struct msghdr *msg, int len,
+ struct ubuf_info *uarg,
+ struct net_devmem_dmabuf_binding *binding)
+{
+ int err, orig_len = skb->len;
+
+ if (uarg->ops->link_skb) {
+ err = uarg->ops->link_skb(skb, uarg);
+ if (err)
+ return err;
+ } else {
+ struct ubuf_info *orig_uarg = skb_zcopy(skb);
+
+ /* An skb can only point to one uarg. This edge case happens
+ * when TCP appends to an skb, but zerocopy_realloc triggered
+ * a new alloc.
+ */
+ if (orig_uarg && uarg != orig_uarg)
+ return -EEXIST;
+ }
+
+ err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len,
+ binding);
+ if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) {
+ struct sock *save_sk = skb->sk;
+
+ /* Streams do not free skb on error. Reset to prev state. */
+ iov_iter_revert(&msg->msg_iter, skb->len - orig_len);
+ skb->sk = sk;
+ ___pskb_trim(skb, orig_len);
+ skb->sk = save_sk;
+ return err;
+ }
+
+ skb_zcopy_set(skb, uarg, NULL);
+ return skb->len - orig_len;
+}
+EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream);
+
+void __skb_zcopy_downgrade_managed(struct sk_buff *skb)
+{
+ int i;
+
+ skb_shinfo(skb)->flags &= ~SKBFL_MANAGED_FRAG_REFS;
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
+ skb_frag_ref(skb, i);
+}
+EXPORT_SYMBOL_GPL(__skb_zcopy_downgrade_managed);
+
+static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig,
+ gfp_t gfp_mask)
+{
+ if (skb_zcopy(orig)) {
+ if (skb_zcopy(nskb)) {
+ /* !gfp_mask callers are verified to !skb_zcopy(nskb) */
+ if (!gfp_mask) {
+ WARN_ON_ONCE(1);
+ return -ENOMEM;
+ }
+ if (skb_uarg(nskb) == skb_uarg(orig))
+ return 0;
+ if (skb_copy_ubufs(nskb, GFP_ATOMIC))
+ return -EIO;
+ }
+ skb_zcopy_set(nskb, skb_uarg(orig), NULL);
+ }
+ return 0;
+}
+
+/**
+ * skb_copy_ubufs - copy userspace skb frags buffers to kernel
+ * @skb: the skb to modify
+ * @gfp_mask: allocation priority
+ *
+ * This must be called on skb with SKBFL_ZEROCOPY_ENABLE.
+ * It will copy all frags into kernel and drop the reference
+ * to userspace pages.
+ *
+ * If this function is called from an interrupt gfp_mask() must be
+ * %GFP_ATOMIC.
+ *
+ * Returns 0 on success or a negative error code on failure
+ * to allocate kernel memory to copy to.
+ */
+int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
+{
+ int num_frags = skb_shinfo(skb)->nr_frags;
+ struct page *page, *head = NULL;
+ int i, order, psize, new_frags;
+ u32 d_off;
+
+ if (skb_shared(skb) || skb_unclone(skb, gfp_mask))
+ return -EINVAL;
+
+ if (!skb_frags_readable(skb))
+ return -EFAULT;
+
+ if (!num_frags)
+ goto release;
+
+ /* We might have to allocate high order pages, so compute what minimum
+ * page order is needed.
+ */
+ order = 0;
+ while ((PAGE_SIZE << order) * MAX_SKB_FRAGS < __skb_pagelen(skb))
+ order++;
+ psize = (PAGE_SIZE << order);
+
+ new_frags = (__skb_pagelen(skb) + psize - 1) >> (PAGE_SHIFT + order);
+ for (i = 0; i < new_frags; i++) {
+ page = alloc_pages(gfp_mask | __GFP_COMP, order);
+ if (!page) {
+ while (head) {
+ struct page *next = (struct page *)page_private(head);
+ put_page(head);
+ head = next;
+ }
+ return -ENOMEM;
+ }
+ set_page_private(page, (unsigned long)head);
+ head = page;
+ }
+
+ page = head;
+ d_off = 0;
+ for (i = 0; i < num_frags; i++) {
+ skb_frag_t *f = &skb_shinfo(skb)->frags[i];
+ u32 p_off, p_len, copied;
+ struct page *p;
+ u8 *vaddr;
+
+ skb_frag_foreach_page(f, skb_frag_off(f), skb_frag_size(f),
+ p, p_off, p_len, copied) {
+ u32 copy, done = 0;
+ vaddr = kmap_atomic(p);
+
+ while (done < p_len) {
+ if (d_off == psize) {
+ d_off = 0;
+ page = (struct page *)page_private(page);
+ }
+ copy = min_t(u32, psize - d_off, p_len - done);
+ memcpy(page_address(page) + d_off,
+ vaddr + p_off + done, copy);
+ done += copy;
+ d_off += copy;
+ }
+ kunmap_atomic(vaddr);
+ }
+ }
+
+ /* skb frags release userspace buffers */
+ for (i = 0; i < num_frags; i++)
+ skb_frag_unref(skb, i);
+
+ /* skb frags point to kernel buffers */
+ for (i = 0; i < new_frags - 1; i++) {
+ __skb_fill_netmem_desc(skb, i, page_to_netmem(head), 0, psize);
+ head = (struct page *)page_private(head);
+ }
+ __skb_fill_netmem_desc(skb, new_frags - 1, page_to_netmem(head), 0,
+ d_off);
+ skb_shinfo(skb)->nr_frags = new_frags;
+
+release:
+ skb_zcopy_clear(skb, false);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(skb_copy_ubufs);
+
/**
* skb_clone - duplicate an sk_buff
* @skb: buffer to clone
@@ -428,141 +2067,66 @@ void kfree_skb(struct sk_buff *skb)
struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
{
+ struct sk_buff_fclones *fclones = container_of(skb,
+ struct sk_buff_fclones,
+ skb1);
struct sk_buff *n;
- n = skb + 1;
+ if (skb_orphan_frags(skb, gfp_mask))
+ return NULL;
+
if (skb->fclone == SKB_FCLONE_ORIG &&
- n->fclone == SKB_FCLONE_UNAVAILABLE) {
- atomic_t *fclone_ref = (atomic_t *) (n + 1);
+ refcount_read(&fclones->fclone_ref) == 1) {
+ n = &fclones->skb2;
+ refcount_set(&fclones->fclone_ref, 2);
n->fclone = SKB_FCLONE_CLONE;
- atomic_inc(fclone_ref);
} else {
- n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
+ if (skb_pfmemalloc(skb))
+ gfp_mask |= __GFP_MEMALLOC;
+
+ n = kmem_cache_alloc(net_hotdata.skbuff_cache, gfp_mask);
if (!n)
return NULL;
+
n->fclone = SKB_FCLONE_UNAVAILABLE;
}
-#define C(x) n->x = skb->x
-
- n->next = n->prev = NULL;
- n->sk = NULL;
- C(tstamp);
- C(dev);
- C(h);
- C(nh);
- C(mac);
- C(dst);
- dst_clone(skb->dst);
- C(sp);
-#ifdef CONFIG_INET
- secpath_get(skb->sp);
-#endif
- memcpy(n->cb, skb->cb, sizeof(skb->cb));
- C(len);
- C(data_len);
- C(csum);
- C(local_df);
- n->cloned = 1;
- n->nohdr = 0;
- C(pkt_type);
- C(ip_summed);
- C(priority);
-#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
- C(ipvs_property);
-#endif
- C(protocol);
- n->destructor = NULL;
- C(mark);
-#ifdef CONFIG_NETFILTER
- C(nfct);
- nf_conntrack_get(skb->nfct);
- C(nfctinfo);
-#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
- C(nfct_reasm);
- nf_conntrack_get_reasm(skb->nfct_reasm);
-#endif
-#ifdef CONFIG_BRIDGE_NETFILTER
- C(nf_bridge);
- nf_bridge_get(skb->nf_bridge);
-#endif
-#endif /*CONFIG_NETFILTER*/
-#ifdef CONFIG_NET_SCHED
- C(tc_index);
-#ifdef CONFIG_NET_CLS_ACT
- n->tc_verd = SET_TC_VERD(skb->tc_verd,0);
- n->tc_verd = CLR_TC_OK2MUNGE(n->tc_verd);
- n->tc_verd = CLR_TC_MUNGED(n->tc_verd);
- C(input_dev);
-#endif
- skb_copy_secmark(n, skb);
-#endif
- C(truesize);
- atomic_set(&n->users, 1);
- C(head);
- C(data);
- C(tail);
- C(end);
-
- atomic_inc(&(skb_shinfo(skb)->dataref));
- skb->cloned = 1;
+ return __skb_clone(n, skb);
+}
+EXPORT_SYMBOL(skb_clone);
- return n;
+void skb_headers_offset_update(struct sk_buff *skb, int off)
+{
+ /* Only adjust this if it actually is csum_start rather than csum */
+ if (skb->ip_summed == CHECKSUM_PARTIAL)
+ skb->csum_start += off;
+ /* {transport,network,mac}_header and tail are relative to skb->head */
+ skb->transport_header += off;
+ skb->network_header += off;
+ if (skb_mac_header_was_set(skb))
+ skb->mac_header += off;
+ skb->inner_transport_header += off;
+ skb->inner_network_header += off;
+ skb->inner_mac_header += off;
}
+EXPORT_SYMBOL(skb_headers_offset_update);
-static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
+void skb_copy_header(struct sk_buff *new, const struct sk_buff *old)
{
- /*
- * Shift between the two data areas in bytes
- */
- unsigned long offset = new->data - old->data;
+ __copy_skb_header(new, old);
- new->sk = NULL;
- new->dev = old->dev;
- new->priority = old->priority;
- new->protocol = old->protocol;
- new->dst = dst_clone(old->dst);
-#ifdef CONFIG_INET
- new->sp = secpath_get(old->sp);
-#endif
- new->h.raw = old->h.raw + offset;
- new->nh.raw = old->nh.raw + offset;
- new->mac.raw = old->mac.raw + offset;
- memcpy(new->cb, old->cb, sizeof(old->cb));
- new->local_df = old->local_df;
- new->fclone = SKB_FCLONE_UNAVAILABLE;
- new->pkt_type = old->pkt_type;
- new->tstamp = old->tstamp;
- new->destructor = NULL;
- new->mark = old->mark;
-#ifdef CONFIG_NETFILTER
- new->nfct = old->nfct;
- nf_conntrack_get(old->nfct);
- new->nfctinfo = old->nfctinfo;
-#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
- new->nfct_reasm = old->nfct_reasm;
- nf_conntrack_get_reasm(old->nfct_reasm);
-#endif
-#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
- new->ipvs_property = old->ipvs_property;
-#endif
-#ifdef CONFIG_BRIDGE_NETFILTER
- new->nf_bridge = old->nf_bridge;
- nf_bridge_get(old->nf_bridge);
-#endif
-#endif
-#ifdef CONFIG_NET_SCHED
-#ifdef CONFIG_NET_CLS_ACT
- new->tc_verd = old->tc_verd;
-#endif
- new->tc_index = old->tc_index;
-#endif
- skb_copy_secmark(new, old);
- atomic_set(&new->users, 1);
skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size;
skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs;
skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type;
}
+EXPORT_SYMBOL(skb_copy_header);
+
+static inline int skb_alloc_rx_flag(const struct sk_buff *skb)
+{
+ if (skb_pfmemalloc(skb))
+ return SKB_ALLOC_RX;
+ return 0;
+}
/**
* skb_copy - create private copy of an sk_buff
@@ -583,12 +2147,20 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
{
- int headerlen = skb->data - skb->head;
- /*
- * Allocate the copy buffer
- */
- struct sk_buff *n = alloc_skb(skb->end - skb->head + skb->data_len,
- gfp_mask);
+ struct sk_buff *n;
+ unsigned int size;
+ int headerlen;
+
+ if (!skb_frags_readable(skb))
+ return NULL;
+
+ if (WARN_ON_ONCE(skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST))
+ return NULL;
+
+ headerlen = skb_headroom(skb);
+ size = skb_end_offset(skb) + skb->data_len;
+ n = __alloc_skb(size, gfp_mask,
+ skb_alloc_rx_flag(skb), NUMA_NO_NODE);
if (!n)
return NULL;
@@ -596,21 +2168,22 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
skb_reserve(n, headerlen);
/* Set the tail pointer and length */
skb_put(n, skb->len);
- n->csum = skb->csum;
- n->ip_summed = skb->ip_summed;
- if (skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len))
- BUG();
+ BUG_ON(skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len));
- copy_skb_header(n, skb);
+ skb_copy_header(n, skb);
return n;
}
-
+EXPORT_SYMBOL(skb_copy);
/**
- * pskb_copy - create copy of an sk_buff with private head.
+ * __pskb_copy_fclone - create copy of an sk_buff with private head.
* @skb: buffer to copy
+ * @headroom: headroom of new skb
* @gfp_mask: allocation priority
+ * @fclone: if true allocate the copy of the skb from the fclone
+ * cache instead of the head cache; it is recommended to set this
+ * to true for the cases where the copy will likely be cloned
*
* Make a copy of both an &sk_buff and part of its data, located
* in header. Fragmented data remain shared. This is used when
@@ -620,24 +2193,22 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
* The returned buffer has a reference count of 1.
*/
-struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask)
+struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom,
+ gfp_t gfp_mask, bool fclone)
{
- /*
- * Allocate the copy buffer
- */
- struct sk_buff *n = alloc_skb(skb->end - skb->head, gfp_mask);
+ unsigned int size = skb_headlen(skb) + headroom;
+ int flags = skb_alloc_rx_flag(skb) | (fclone ? SKB_ALLOC_FCLONE : 0);
+ struct sk_buff *n = __alloc_skb(size, gfp_mask, flags, NUMA_NO_NODE);
if (!n)
goto out;
/* Set the data pointer */
- skb_reserve(n, skb->data - skb->head);
+ skb_reserve(n, headroom);
/* Set the tail pointer and length */
skb_put(n, skb_headlen(skb));
/* Copy the bytes */
- memcpy(n->data, skb->data, n->len);
- n->csum = skb->csum;
- n->ip_summed = skb->ip_summed;
+ skb_copy_from_linear_data(skb, n->data, n->len);
n->truesize += skb->data_len;
n->data_len = skb->data_len;
@@ -646,22 +2217,29 @@ struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask)
if (skb_shinfo(skb)->nr_frags) {
int i;
+ if (skb_orphan_frags(skb, gfp_mask) ||
+ skb_zerocopy_clone(n, skb, gfp_mask)) {
+ kfree_skb(n);
+ n = NULL;
+ goto out;
+ }
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i];
- get_page(skb_shinfo(n)->frags[i].page);
+ skb_frag_ref(skb, i);
}
skb_shinfo(n)->nr_frags = i;
}
- if (skb_shinfo(skb)->frag_list) {
+ if (skb_has_frag_list(skb)) {
skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list;
skb_clone_fraglist(n);
}
- copy_skb_header(n, skb);
+ skb_copy_header(n, skb);
out:
return n;
}
+EXPORT_SYMBOL(__pskb_copy_fclone);
/**
* pskb_expand_head - reallocate header of &sk_buff
@@ -670,62 +2248,103 @@ out:
* @ntail: room to add at tail
* @gfp_mask: allocation priority
*
- * Expands (or creates identical copy, if &nhead and &ntail are zero)
- * header of skb. &sk_buff itself is not changed. &sk_buff MUST have
+ * Expands (or creates identical copy, if @nhead and @ntail are zero)
+ * header of @skb. &sk_buff itself is not changed. &sk_buff MUST have
* reference count of 1. Returns zero in the case of success or error,
* if expansion failed. In the last case, &sk_buff is not changed.
*
* All the pointers pointing into skb header may change and must be
* reloaded after call to this function.
+ *
+ * Note: If you skb_push() the start of the buffer after reallocating the
+ * header, call skb_postpush_data_move() first to move the metadata out of
+ * the way before writing to &sk_buff->data.
*/
int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
gfp_t gfp_mask)
{
- int i;
- u8 *data;
- int size = nhead + (skb->end - skb->head) + ntail;
+ unsigned int osize = skb_end_offset(skb);
+ unsigned int size = osize + nhead + ntail;
long off;
+ u8 *data;
+ int i;
+
+ BUG_ON(nhead < 0);
+
+ BUG_ON(skb_shared(skb));
- if (skb_shared(skb))
- BUG();
+ skb_zcopy_downgrade_managed(skb);
- size = SKB_DATA_ALIGN(size);
+ if (skb_pfmemalloc(skb))
+ gfp_mask |= __GFP_MEMALLOC;
- data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask);
+ data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL);
if (!data)
goto nodata;
+ size = SKB_WITH_OVERHEAD(size);
/* Copy only real data... and, alas, header. This should be
- * optimized for the cases when header is void. */
- memcpy(data + nhead, skb->head, skb->tail - skb->head);
- memcpy(data + size, skb->end, sizeof(struct skb_shared_info));
-
- for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
- get_page(skb_shinfo(skb)->frags[i].page);
-
- if (skb_shinfo(skb)->frag_list)
- skb_clone_fraglist(skb);
+ * optimized for the cases when header is void.
+ */
+ memcpy(data + nhead, skb->head, skb_tail_pointer(skb) - skb->head);
- skb_release_data(skb);
+ memcpy((struct skb_shared_info *)(data + size),
+ skb_shinfo(skb),
+ offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags]));
+ /*
+ * if shinfo is shared we must drop the old head gracefully, but if it
+ * is not we can just drop the old head and let the existing refcount
+ * be since all we did is relocate the values
+ */
+ if (skb_cloned(skb)) {
+ if (skb_orphan_frags(skb, gfp_mask))
+ goto nofrags;
+ if (skb_zcopy(skb))
+ refcount_inc(&skb_uarg(skb)->refcnt);
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
+ skb_frag_ref(skb, i);
+
+ if (skb_has_frag_list(skb))
+ skb_clone_fraglist(skb);
+
+ skb_release_data(skb, SKB_CONSUMED);
+ } else {
+ skb_free_head(skb);
+ }
off = (data + nhead) - skb->head;
skb->head = data;
- skb->end = data + size;
+ skb->head_frag = 0;
skb->data += off;
- skb->tail += off;
- skb->mac.raw += off;
- skb->h.raw += off;
- skb->nh.raw += off;
+
+ skb_set_end_offset(skb, size);
+#ifdef NET_SKBUFF_DATA_USES_OFFSET
+ off = nhead;
+#endif
+ skb->tail += off;
+ skb_headers_offset_update(skb, nhead);
skb->cloned = 0;
+ skb->hdr_len = 0;
skb->nohdr = 0;
atomic_set(&skb_shinfo(skb)->dataref, 1);
+
+ /* It is not generally safe to change skb->truesize.
+ * For the moment, we really care of rx path, or
+ * when skb is orphaned (not attached to a socket).
+ */
+ if (!skb->sk || skb->destructor == sock_edemux)
+ skb->truesize += size - osize;
+
return 0;
+nofrags:
+ skb_kfree_head(data, size);
nodata:
return -ENOMEM;
}
+EXPORT_SYMBOL(pskb_expand_head);
/* Make private copy of skb with writable head and some headroom */
@@ -746,7 +2365,105 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
}
return skb2;
}
+EXPORT_SYMBOL(skb_realloc_headroom);
+
+/* Note: We plan to rework this in linux-6.4 */
+int __skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri)
+{
+ unsigned int saved_end_offset, saved_truesize;
+ struct skb_shared_info *shinfo;
+ int res;
+
+ saved_end_offset = skb_end_offset(skb);
+ saved_truesize = skb->truesize;
+ res = pskb_expand_head(skb, 0, 0, pri);
+ if (res)
+ return res;
+
+ skb->truesize = saved_truesize;
+
+ if (likely(skb_end_offset(skb) == saved_end_offset))
+ return 0;
+
+ /* We can not change skb->end if the original or new value
+ * is SKB_SMALL_HEAD_HEADROOM, as it might break skb_kfree_head().
+ */
+ if (saved_end_offset == SKB_SMALL_HEAD_HEADROOM ||
+ skb_end_offset(skb) == SKB_SMALL_HEAD_HEADROOM) {
+ /* We think this path should not be taken.
+ * Add a temporary trace to warn us just in case.
+ */
+ pr_err_once("__skb_unclone_keeptruesize() skb_end_offset() %u -> %u\n",
+ saved_end_offset, skb_end_offset(skb));
+ WARN_ON_ONCE(1);
+ return 0;
+ }
+
+ shinfo = skb_shinfo(skb);
+
+ /* We are about to change back skb->end,
+ * we need to move skb_shinfo() to its new location.
+ */
+ memmove(skb->head + saved_end_offset,
+ shinfo,
+ offsetof(struct skb_shared_info, frags[shinfo->nr_frags]));
+
+ skb_set_end_offset(skb, saved_end_offset);
+
+ return 0;
+}
+
+/**
+ * skb_expand_head - reallocate header of &sk_buff
+ * @skb: buffer to reallocate
+ * @headroom: needed headroom
+ *
+ * Unlike skb_realloc_headroom, this one does not allocate a new skb
+ * if possible; copies skb->sk to new skb as needed
+ * and frees original skb in case of failures.
+ *
+ * It expect increased headroom and generates warning otherwise.
+ */
+
+struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
+{
+ int delta = headroom - skb_headroom(skb);
+ int osize = skb_end_offset(skb);
+ struct sock *sk = skb->sk;
+
+ if (WARN_ONCE(delta <= 0,
+ "%s is expecting an increase in the headroom", __func__))
+ return skb;
+
+ delta = SKB_DATA_ALIGN(delta);
+ /* pskb_expand_head() might crash, if skb is shared. */
+ if (skb_shared(skb) || !is_skb_wmem(skb)) {
+ struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
+
+ if (unlikely(!nskb))
+ goto fail;
+
+ if (sk)
+ skb_set_owner_w(nskb, sk);
+ consume_skb(skb);
+ skb = nskb;
+ }
+ if (pskb_expand_head(skb, delta, 0, GFP_ATOMIC))
+ goto fail;
+
+ if (sk && is_skb_wmem(skb)) {
+ delta = skb_end_offset(skb) - osize;
+ refcount_add(delta, &sk->sk_wmem_alloc);
+ skb->truesize += delta;
+ }
+ return skb;
+
+fail:
+ kfree_skb(skb);
+ return NULL;
+}
+EXPORT_SYMBOL(skb_expand_head);
/**
* skb_copy_expand - copy and expand sk_buff
@@ -765,9 +2482,6 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
*
* You must pass %GFP_ATOMIC as the allocation priority if this function
* is called from an interrupt.
- *
- * BUG ALERT: ip_summed is not copied. Why does this work? Is it used
- * only by netfilter in the cases when checksum is recalculated? --ANK
*/
struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
int newheadroom, int newtailroom,
@@ -776,10 +2490,20 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
/*
* Allocate the copy buffer
*/
- struct sk_buff *n = alloc_skb(newheadroom + skb->len + newtailroom,
- gfp_mask);
int head_copy_len, head_copy_off;
+ struct sk_buff *n;
+ int oldheadroom;
+
+ if (!skb_frags_readable(skb))
+ return NULL;
+ if (WARN_ON_ONCE(skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST))
+ return NULL;
+
+ oldheadroom = skb_headroom(skb);
+ n = __alloc_skb(newheadroom + skb->len + newtailroom,
+ gfp_mask, skb_alloc_rx_flag(skb),
+ NUMA_NO_NODE);
if (!n)
return NULL;
@@ -788,7 +2512,7 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
/* Set the tail pointer and length */
skb_put(n, skb->len);
- head_copy_len = skb_headroom(skb);
+ head_copy_len = oldheadroom;
head_copy_off = 0;
if (newheadroom <= head_copy_len)
head_copy_len = newheadroom;
@@ -796,32 +2520,36 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
head_copy_off = newheadroom - head_copy_len;
/* Copy the linear header and data. */
- if (skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off,
- skb->len + head_copy_len))
- BUG();
+ BUG_ON(skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off,
+ skb->len + head_copy_len));
- copy_skb_header(n, skb);
+ skb_copy_header(n, skb);
+
+ skb_headers_offset_update(n, newheadroom - oldheadroom);
return n;
}
+EXPORT_SYMBOL(skb_copy_expand);
/**
- * skb_pad - zero pad the tail of an skb
+ * __skb_pad - zero pad the tail of an skb
* @skb: buffer to pad
* @pad: space to pad
+ * @free_on_error: free buffer on error
*
* Ensure that a buffer is followed by a padding area that is zero
* filled. Used by network drivers which may DMA or transfer data
* beyond the buffer end onto the wire.
*
- * May return error in out of memory cases. The skb is freed on error.
+ * May return error in out of memory cases. The skb is freed on error
+ * if @free_on_error is true.
*/
-
-int skb_pad(struct sk_buff *skb, int pad)
+
+int __skb_pad(struct sk_buff *skb, int pad, bool free_on_error)
{
int err;
int ntail;
-
+
/* If the skbuff is non linear tailroom is always zero.. */
if (!skb_cloned(skb) && skb_tailroom(skb) >= pad) {
memset(skb->data+skb->len, 0, pad);
@@ -846,10 +2574,131 @@ int skb_pad(struct sk_buff *skb, int pad)
return 0;
free_skb:
- kfree_skb(skb);
+ if (free_on_error)
+ kfree_skb(skb);
return err;
-}
-
+}
+EXPORT_SYMBOL(__skb_pad);
+
+/**
+ * pskb_put - add data to the tail of a potentially fragmented buffer
+ * @skb: start of the buffer to use
+ * @tail: tail fragment of the buffer to use
+ * @len: amount of data to add
+ *
+ * This function extends the used data area of the potentially
+ * fragmented buffer. @tail must be the last fragment of @skb -- or
+ * @skb itself. If this would exceed the total buffer size the kernel
+ * will panic. A pointer to the first byte of the extra data is
+ * returned.
+ */
+
+void *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len)
+{
+ if (tail != skb) {
+ skb->data_len += len;
+ skb->len += len;
+ }
+ return skb_put(tail, len);
+}
+EXPORT_SYMBOL_GPL(pskb_put);
+
+/**
+ * skb_put - add data to a buffer
+ * @skb: buffer to use
+ * @len: amount of data to add
+ *
+ * This function extends the used data area of the buffer. If this would
+ * exceed the total buffer size the kernel will panic. A pointer to the
+ * first byte of the extra data is returned.
+ */
+void *skb_put(struct sk_buff *skb, unsigned int len)
+{
+ void *tmp = skb_tail_pointer(skb);
+ SKB_LINEAR_ASSERT(skb);
+ skb->tail += len;
+ skb->len += len;
+ if (unlikely(skb->tail > skb->end))
+ skb_over_panic(skb, len, __builtin_return_address(0));
+ return tmp;
+}
+EXPORT_SYMBOL(skb_put);
+
+/**
+ * skb_push - add data to the start of a buffer
+ * @skb: buffer to use
+ * @len: amount of data to add
+ *
+ * This function extends the used data area of the buffer at the buffer
+ * start. If this would exceed the total buffer headroom the kernel will
+ * panic. A pointer to the first byte of the extra data is returned.
+ */
+void *skb_push(struct sk_buff *skb, unsigned int len)
+{
+ skb->data -= len;
+ skb->len += len;
+ if (unlikely(skb->data < skb->head))
+ skb_under_panic(skb, len, __builtin_return_address(0));
+ return skb->data;
+}
+EXPORT_SYMBOL(skb_push);
+
+/**
+ * skb_pull - remove data from the start of a buffer
+ * @skb: buffer to use
+ * @len: amount of data to remove
+ *
+ * This function removes data from the start of a buffer, returning
+ * the memory to the headroom. A pointer to the next data in the buffer
+ * is returned. Once the data has been pulled future pushes will overwrite
+ * the old data.
+ */
+void *skb_pull(struct sk_buff *skb, unsigned int len)
+{
+ return skb_pull_inline(skb, len);
+}
+EXPORT_SYMBOL(skb_pull);
+
+/**
+ * skb_pull_data - remove data from the start of a buffer returning its
+ * original position.
+ * @skb: buffer to use
+ * @len: amount of data to remove
+ *
+ * This function removes data from the start of a buffer, returning
+ * the memory to the headroom. A pointer to the original data in the buffer
+ * is returned after checking if there is enough data to pull. Once the
+ * data has been pulled future pushes will overwrite the old data.
+ */
+void *skb_pull_data(struct sk_buff *skb, size_t len)
+{
+ void *data = skb->data;
+
+ if (skb->len < len)
+ return NULL;
+
+ skb_pull(skb, len);
+
+ return data;
+}
+EXPORT_SYMBOL(skb_pull_data);
+
+/**
+ * skb_trim - remove end from a buffer
+ * @skb: buffer to alter
+ * @len: new length
+ *
+ * Cut the length of a buffer down by removing data from the tail. If
+ * the buffer is already under the length specified it is not modified.
+ * The skb must be linear.
+ */
+void skb_trim(struct sk_buff *skb, unsigned int len)
+{
+ if (skb->len > len)
+ __skb_trim(skb, len);
+}
+EXPORT_SYMBOL(skb_trim);
+
/* Trims skb to length len. It can change skb pointers.
*/
@@ -871,22 +2720,22 @@ int ___pskb_trim(struct sk_buff *skb, unsigned int len)
goto drop_pages;
for (; i < nfrags; i++) {
- int end = offset + skb_shinfo(skb)->frags[i].size;
+ int end = offset + skb_frag_size(&skb_shinfo(skb)->frags[i]);
if (end < len) {
offset = end;
continue;
}
- skb_shinfo(skb)->frags[i++].size = len - offset;
+ skb_frag_size_set(&skb_shinfo(skb)->frags[i++], len - offset);
drop_pages:
skb_shinfo(skb)->nr_frags = i;
for (; i < nfrags; i++)
- put_page(skb_shinfo(skb)->frags[i].page);
+ skb_frag_unref(skb, i);
- if (skb_shinfo(skb)->frag_list)
+ if (skb_has_frag_list(skb))
skb_drop_fraglist(skb);
goto done;
}
@@ -903,7 +2752,7 @@ drop_pages:
return -ENOMEM;
nfrag->next = frag->next;
- kfree_skb(frag);
+ consume_skb(frag);
frag = nfrag;
*fragp = frag;
}
@@ -929,11 +2778,35 @@ done:
} else {
skb->len = len;
skb->data_len = 0;
- skb->tail = skb->data + len;
+ skb_set_tail_pointer(skb, len);
}
+ if (!skb->sk || skb->destructor == sock_edemux)
+ skb_condense(skb);
return 0;
}
+EXPORT_SYMBOL(___pskb_trim);
+
+/* Note : use pskb_trim_rcsum() instead of calling this directly
+ */
+int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len)
+{
+ if (skb->ip_summed == CHECKSUM_COMPLETE) {
+ int delta = skb->len - len;
+
+ skb->csum = csum_block_sub(skb->csum,
+ skb_checksum(skb, len, delta, 0),
+ len);
+ } else if (skb->ip_summed == CHECKSUM_PARTIAL) {
+ int hdlen = (len > skb_headlen(skb)) ? skb_headlen(skb) : len;
+ int offset = skb_checksum_start_offset(skb) + skb->csum_offset;
+
+ if (offset + sizeof(__sum16) > hdlen)
+ return -EINVAL;
+ }
+ return __pskb_trim(skb, len);
+}
+EXPORT_SYMBOL(pskb_trim_rcsum_slow);
/**
* __pskb_pull_tail - advance tail of skb header
@@ -960,7 +2833,7 @@ done:
*
* It is pretty complicated. Luckily, it is called only in exceptional cases.
*/
-unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta)
+void *__pskb_pull_tail(struct sk_buff *skb, int delta)
{
/* If skb has not enough free space at tail, get new one
* plus 128 bytes for future expansions. If we have enough
@@ -968,31 +2841,36 @@ unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta)
*/
int i, k, eat = (skb->tail + delta) - skb->end;
+ if (!skb_frags_readable(skb))
+ return NULL;
+
if (eat > 0 || skb_cloned(skb)) {
if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0,
GFP_ATOMIC))
return NULL;
}
- if (skb_copy_bits(skb, skb_headlen(skb), skb->tail, delta))
- BUG();
+ BUG_ON(skb_copy_bits(skb, skb_headlen(skb),
+ skb_tail_pointer(skb), delta));
/* Optimization: no fragments, no reasons to preestimate
* size of pulled pages. Superb.
*/
- if (!skb_shinfo(skb)->frag_list)
+ if (!skb_has_frag_list(skb))
goto pull_pages;
/* Estimate size of pulled pages. */
eat = delta;
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
- if (skb_shinfo(skb)->frags[i].size >= eat)
+ int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
+
+ if (size >= eat)
goto pull_pages;
- eat -= skb_shinfo(skb)->frags[i].size;
+ eat -= size;
}
/* If we need update frag list, we are in troubles.
- * Certainly, it possible to add an offset to skb data,
+ * Certainly, it is possible to add an offset to skb data,
* but taking into account that pulling is expected to
* be very rare operation, it is worth to fight against
* further bloating skb head and crucify ourselves here instead.
@@ -1004,8 +2882,6 @@ unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta)
struct sk_buff *insp = NULL;
do {
- BUG_ON(!list);
-
if (list->len <= eat) {
/* Eaten as whole. */
eat -= list->len;
@@ -1013,6 +2889,9 @@ unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta)
insp = list;
} else {
/* Eaten partially. */
+ if (skb_is_gso(skb) && !list->head_frag &&
+ skb_headlen(list))
+ skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
if (skb_shared(list)) {
/* Sucks! We need to fork list. :-( */
@@ -1027,8 +2906,7 @@ unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta)
insp = list;
}
if (!pskb_pull(list, eat)) {
- if (clone)
- kfree_skb(clone);
+ kfree_skb(clone);
return NULL;
}
break;
@@ -1038,7 +2916,7 @@ unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta)
/* Free pulled out fragments. */
while ((list = skb_shinfo(skb)->frag_list) != insp) {
skb_shinfo(skb)->frag_list = list->next;
- kfree_skb(list);
+ consume_skb(list);
}
/* And insert new clone at head. */
if (clone) {
@@ -1052,14 +2930,20 @@ pull_pages:
eat = delta;
k = 0;
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
- if (skb_shinfo(skb)->frags[i].size <= eat) {
- put_page(skb_shinfo(skb)->frags[i].page);
- eat -= skb_shinfo(skb)->frags[i].size;
+ int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
+
+ if (size <= eat) {
+ skb_frag_unref(skb, i);
+ eat -= size;
} else {
- skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[k];
+
+ *frag = skb_shinfo(skb)->frags[i];
if (eat) {
- skb_shinfo(skb)->frags[k].page_offset += eat;
- skb_shinfo(skb)->frags[k].size -= eat;
+ skb_frag_off_add(frag, eat);
+ skb_frag_size_sub(frag, eat);
+ if (!i)
+ goto end;
eat = 0;
}
k++;
@@ -1067,18 +2951,37 @@ pull_pages:
}
skb_shinfo(skb)->nr_frags = k;
+end:
skb->tail += delta;
skb->data_len -= delta;
- return skb->tail;
-}
+ if (!skb->data_len)
+ skb_zcopy_clear(skb, false);
-/* Copy some data bits from skb to kernel buffer. */
+ return skb_tail_pointer(skb);
+}
+EXPORT_SYMBOL(__pskb_pull_tail);
+/**
+ * skb_copy_bits - copy bits from skb to kernel buffer
+ * @skb: source skb
+ * @offset: offset in source
+ * @to: destination buffer
+ * @len: number of bytes to copy
+ *
+ * Copy the specified number of bytes from the source skb to the
+ * destination buffer.
+ *
+ * CAUTION ! :
+ * If its prototype is ever changed,
+ * check arch/{*}/net/{*}.S files,
+ * since it is called from BPF assembly code.
+ */
int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len)
{
- int i, copy;
int start = skb_headlen(skb);
+ struct sk_buff *frag_iter;
+ int i, copy;
if (offset > (int)skb->len - len)
goto fault;
@@ -1087,30 +2990,38 @@ int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len)
if ((copy = start - offset) > 0) {
if (copy > len)
copy = len;
- memcpy(to, skb->data + offset, copy);
+ skb_copy_from_linear_data_offset(skb, offset, to, copy);
if ((len -= copy) == 0)
return 0;
offset += copy;
to += copy;
}
+ if (!skb_frags_readable(skb))
+ goto fault;
+
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
int end;
+ skb_frag_t *f = &skb_shinfo(skb)->frags[i];
- BUG_TRAP(start <= offset + len);
+ WARN_ON(start > offset + len);
- end = start + skb_shinfo(skb)->frags[i].size;
+ end = start + skb_frag_size(f);
if ((copy = end - offset) > 0) {
+ u32 p_off, p_len, copied;
+ struct page *p;
u8 *vaddr;
if (copy > len)
copy = len;
- vaddr = kmap_skb_frag(&skb_shinfo(skb)->frags[i]);
- memcpy(to,
- vaddr + skb_shinfo(skb)->frags[i].page_offset+
- offset - start, copy);
- kunmap_skb_frag(vaddr);
+ skb_frag_foreach_page(f,
+ skb_frag_off(f) + offset - start,
+ copy, p, p_off, p_len, copied) {
+ vaddr = kmap_atomic(p);
+ memcpy(to + copied, vaddr + p_off, p_len);
+ kunmap_atomic(vaddr);
+ }
if ((len -= copy) == 0)
return 0;
@@ -1120,35 +3031,368 @@ int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len)
start = end;
}
- if (skb_shinfo(skb)->frag_list) {
- struct sk_buff *list = skb_shinfo(skb)->frag_list;
+ skb_walk_frags(skb, frag_iter) {
+ int end;
- for (; list; list = list->next) {
- int end;
-
- BUG_TRAP(start <= offset + len);
-
- end = start + list->len;
- if ((copy = end - offset) > 0) {
- if (copy > len)
- copy = len;
- if (skb_copy_bits(list, offset - start,
- to, copy))
- goto fault;
- if ((len -= copy) == 0)
- return 0;
- offset += copy;
- to += copy;
- }
- start = end;
+ WARN_ON(start > offset + len);
+
+ end = start + frag_iter->len;
+ if ((copy = end - offset) > 0) {
+ if (copy > len)
+ copy = len;
+ if (skb_copy_bits(frag_iter, offset - start, to, copy))
+ goto fault;
+ if ((len -= copy) == 0)
+ return 0;
+ offset += copy;
+ to += copy;
}
+ start = end;
}
+
if (!len)
return 0;
fault:
return -EFAULT;
}
+EXPORT_SYMBOL(skb_copy_bits);
+
+/*
+ * Callback from splice_to_pipe(), if we need to release some pages
+ * at the end of the spd in case we error'ed out in filling the pipe.
+ */
+static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i)
+{
+ put_page(spd->pages[i]);
+}
+
+static struct page *linear_to_page(struct page *page, unsigned int *len,
+ unsigned int *offset,
+ struct sock *sk)
+{
+ struct page_frag *pfrag = sk_page_frag(sk);
+
+ if (!sk_page_frag_refill(sk, pfrag))
+ return NULL;
+
+ *len = min_t(unsigned int, *len, pfrag->size - pfrag->offset);
+
+ memcpy(page_address(pfrag->page) + pfrag->offset,
+ page_address(page) + *offset, *len);
+ *offset = pfrag->offset;
+ pfrag->offset += *len;
+
+ return pfrag->page;
+}
+
+static bool spd_can_coalesce(const struct splice_pipe_desc *spd,
+ struct page *page,
+ unsigned int offset)
+{
+ return spd->nr_pages &&
+ spd->pages[spd->nr_pages - 1] == page &&
+ (spd->partial[spd->nr_pages - 1].offset +
+ spd->partial[spd->nr_pages - 1].len == offset);
+}
+
+/*
+ * Fill page/offset/length into spd, if it can hold more pages.
+ */
+static bool spd_fill_page(struct splice_pipe_desc *spd, struct page *page,
+ unsigned int *len, unsigned int offset, bool linear,
+ struct sock *sk)
+{
+ if (unlikely(spd->nr_pages == MAX_SKB_FRAGS))
+ return true;
+
+ if (linear) {
+ page = linear_to_page(page, len, &offset, sk);
+ if (!page)
+ return true;
+ }
+ if (spd_can_coalesce(spd, page, offset)) {
+ spd->partial[spd->nr_pages - 1].len += *len;
+ return false;
+ }
+ get_page(page);
+ spd->pages[spd->nr_pages] = page;
+ spd->partial[spd->nr_pages].len = *len;
+ spd->partial[spd->nr_pages].offset = offset;
+ spd->nr_pages++;
+
+ return false;
+}
+
+static bool __splice_segment(struct page *page, unsigned int poff,
+ unsigned int plen, unsigned int *off,
+ unsigned int *len,
+ struct splice_pipe_desc *spd, bool linear,
+ struct sock *sk)
+{
+ if (!*len)
+ return true;
+
+ /* skip this segment if already processed */
+ if (*off >= plen) {
+ *off -= plen;
+ return false;
+ }
+
+ /* ignore any bits we already processed */
+ poff += *off;
+ plen -= *off;
+ *off = 0;
+
+ do {
+ unsigned int flen = min(*len, plen);
+
+ if (spd_fill_page(spd, page, &flen, poff, linear, sk))
+ return true;
+ poff += flen;
+ plen -= flen;
+ *len -= flen;
+ if (!*len)
+ return true;
+ } while (plen);
+
+ return false;
+}
+
+/*
+ * Map linear and fragment data from the skb to spd. It reports true if the
+ * pipe is full or if we already spliced the requested length.
+ */
+static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
+ unsigned int *offset, unsigned int *len,
+ struct splice_pipe_desc *spd, struct sock *sk)
+{
+ struct sk_buff *iter;
+ int seg;
+
+ /* map the linear part :
+ * If skb->head_frag is set, this 'linear' part is backed by a
+ * fragment, and if the head is not shared with any clones then
+ * we can avoid a copy since we own the head portion of this page.
+ */
+ if (__splice_segment(virt_to_page(skb->data),
+ (unsigned long) skb->data & (PAGE_SIZE - 1),
+ skb_headlen(skb),
+ offset, len, spd,
+ skb_head_is_locked(skb),
+ sk))
+ return true;
+
+ /*
+ * then map the fragments
+ */
+ if (!skb_frags_readable(skb))
+ return false;
+
+ for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) {
+ const skb_frag_t *f = &skb_shinfo(skb)->frags[seg];
+
+ if (WARN_ON_ONCE(!skb_frag_page(f)))
+ return false;
+
+ if (__splice_segment(skb_frag_page(f),
+ skb_frag_off(f), skb_frag_size(f),
+ offset, len, spd, false, sk))
+ return true;
+ }
+
+ skb_walk_frags(skb, iter) {
+ if (*offset >= iter->len) {
+ *offset -= iter->len;
+ continue;
+ }
+ /* __skb_splice_bits() only fails if the output has no room
+ * left, so no point in going over the frag_list for the error
+ * case.
+ */
+ if (__skb_splice_bits(iter, pipe, offset, len, spd, sk))
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Map data from the skb to a pipe. Should handle both the linear part,
+ * the fragments, and the frag list.
+ */
+int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset,
+ struct pipe_inode_info *pipe, unsigned int tlen,
+ unsigned int flags)
+{
+ struct partial_page partial[MAX_SKB_FRAGS];
+ struct page *pages[MAX_SKB_FRAGS];
+ struct splice_pipe_desc spd = {
+ .pages = pages,
+ .partial = partial,
+ .nr_pages_max = MAX_SKB_FRAGS,
+ .ops = &nosteal_pipe_buf_ops,
+ .spd_release = sock_spd_release,
+ };
+ int ret = 0;
+
+ __skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk);
+
+ if (spd.nr_pages)
+ ret = splice_to_pipe(pipe, &spd);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(skb_splice_bits);
+
+static int sendmsg_locked(struct sock *sk, struct msghdr *msg)
+{
+ struct socket *sock = sk->sk_socket;
+ size_t size = msg_data_left(msg);
+
+ if (!sock)
+ return -EINVAL;
+
+ if (!sock->ops->sendmsg_locked)
+ return sock_no_sendmsg_locked(sk, msg, size);
+
+ return sock->ops->sendmsg_locked(sk, msg, size);
+}
+
+static int sendmsg_unlocked(struct sock *sk, struct msghdr *msg)
+{
+ struct socket *sock = sk->sk_socket;
+
+ if (!sock)
+ return -EINVAL;
+ return sock_sendmsg(sock, msg);
+}
+
+typedef int (*sendmsg_func)(struct sock *sk, struct msghdr *msg);
+static int __skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset,
+ int len, sendmsg_func sendmsg, int flags)
+{
+ int more_hint = sk_is_tcp(sk) ? MSG_MORE : 0;
+ unsigned int orig_len = len;
+ struct sk_buff *head = skb;
+ unsigned short fragidx;
+ int slen, ret;
+
+do_frag_list:
+
+ /* Deal with head data */
+ while (offset < skb_headlen(skb) && len) {
+ struct kvec kv;
+ struct msghdr msg;
+
+ slen = min_t(int, len, skb_headlen(skb) - offset);
+ kv.iov_base = skb->data + offset;
+ kv.iov_len = slen;
+ memset(&msg, 0, sizeof(msg));
+ msg.msg_flags = MSG_DONTWAIT | flags;
+ if (slen < len)
+ msg.msg_flags |= more_hint;
+
+ iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &kv, 1, slen);
+ ret = INDIRECT_CALL_2(sendmsg, sendmsg_locked,
+ sendmsg_unlocked, sk, &msg);
+ if (ret <= 0)
+ goto error;
+
+ offset += ret;
+ len -= ret;
+ }
+
+ /* All the data was skb head? */
+ if (!len)
+ goto out;
+
+ /* Make offset relative to start of frags */
+ offset -= skb_headlen(skb);
+
+ /* Find where we are in frag list */
+ for (fragidx = 0; fragidx < skb_shinfo(skb)->nr_frags; fragidx++) {
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[fragidx];
+
+ if (offset < skb_frag_size(frag))
+ break;
+
+ offset -= skb_frag_size(frag);
+ }
+
+ for (; len && fragidx < skb_shinfo(skb)->nr_frags; fragidx++) {
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[fragidx];
+
+ slen = min_t(size_t, len, skb_frag_size(frag) - offset);
+
+ while (slen) {
+ struct bio_vec bvec;
+ struct msghdr msg = {
+ .msg_flags = MSG_SPLICE_PAGES | MSG_DONTWAIT |
+ flags,
+ };
+
+ if (slen < len)
+ msg.msg_flags |= more_hint;
+ bvec_set_page(&bvec, skb_frag_page(frag), slen,
+ skb_frag_off(frag) + offset);
+ iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1,
+ slen);
+
+ ret = INDIRECT_CALL_2(sendmsg, sendmsg_locked,
+ sendmsg_unlocked, sk, &msg);
+ if (ret <= 0)
+ goto error;
+
+ len -= ret;
+ offset += ret;
+ slen -= ret;
+ }
+
+ offset = 0;
+ }
+
+ if (len) {
+ /* Process any frag lists */
+
+ if (skb == head) {
+ if (skb_has_frag_list(skb)) {
+ skb = skb_shinfo(skb)->frag_list;
+ goto do_frag_list;
+ }
+ } else if (skb->next) {
+ skb = skb->next;
+ goto do_frag_list;
+ }
+ }
+
+out:
+ return orig_len - len;
+
+error:
+ return orig_len == len ? ret : orig_len - len;
+}
+
+/* Send skb data on a socket. Socket must be locked. */
+int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset,
+ int len)
+{
+ return __skb_send_sock(sk, skb, offset, len, sendmsg_locked, 0);
+}
+EXPORT_SYMBOL_GPL(skb_send_sock_locked);
+
+int skb_send_sock_locked_with_flags(struct sock *sk, struct sk_buff *skb,
+ int offset, int len, int flags)
+{
+ return __skb_send_sock(sk, skb, offset, len, sendmsg_locked, flags);
+}
+EXPORT_SYMBOL_GPL(skb_send_sock_locked_with_flags);
+
+/* Send skb data on a socket. Socket must be unlocked. */
+int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len)
+{
+ return __skb_send_sock(sk, skb, offset, len, sendmsg_unlocked, 0);
+}
/**
* skb_store_bits - store bits from kernel buffer to skb
@@ -1162,10 +3406,11 @@ fault:
* traversing fragment lists and such.
*/
-int skb_store_bits(const struct sk_buff *skb, int offset, void *from, int len)
+int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len)
{
- int i, copy;
int start = skb_headlen(skb);
+ struct sk_buff *frag_iter;
+ int i, copy;
if (offset > (int)skb->len - len)
goto fault;
@@ -1173,30 +3418,38 @@ int skb_store_bits(const struct sk_buff *skb, int offset, void *from, int len)
if ((copy = start - offset) > 0) {
if (copy > len)
copy = len;
- memcpy(skb->data + offset, from, copy);
+ skb_copy_to_linear_data_offset(skb, offset, from, copy);
if ((len -= copy) == 0)
return 0;
offset += copy;
from += copy;
}
+ if (!skb_frags_readable(skb))
+ goto fault;
+
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
int end;
- BUG_TRAP(start <= offset + len);
+ WARN_ON(start > offset + len);
- end = start + frag->size;
+ end = start + skb_frag_size(frag);
if ((copy = end - offset) > 0) {
+ u32 p_off, p_len, copied;
+ struct page *p;
u8 *vaddr;
if (copy > len)
copy = len;
- vaddr = kmap_skb_frag(frag);
- memcpy(vaddr + frag->page_offset + offset - start,
- from, copy);
- kunmap_skb_frag(vaddr);
+ skb_frag_foreach_page(frag,
+ skb_frag_off(frag) + offset - start,
+ copy, p, p_off, p_len, copied) {
+ vaddr = kmap_atomic(p);
+ memcpy(vaddr + p_off, from + copied, p_len);
+ kunmap_atomic(vaddr);
+ }
if ((len -= copy) == 0)
return 0;
@@ -1206,28 +3459,24 @@ int skb_store_bits(const struct sk_buff *skb, int offset, void *from, int len)
start = end;
}
- if (skb_shinfo(skb)->frag_list) {
- struct sk_buff *list = skb_shinfo(skb)->frag_list;
+ skb_walk_frags(skb, frag_iter) {
+ int end;
- for (; list; list = list->next) {
- int end;
-
- BUG_TRAP(start <= offset + len);
-
- end = start + list->len;
- if ((copy = end - offset) > 0) {
- if (copy > len)
- copy = len;
- if (skb_store_bits(list, offset - start,
- from, copy))
- goto fault;
- if ((len -= copy) == 0)
- return 0;
- offset += copy;
- from += copy;
- }
- start = end;
+ WARN_ON(start > offset + len);
+
+ end = start + frag_iter->len;
+ if ((copy = end - offset) > 0) {
+ if (copy > len)
+ copy = len;
+ if (skb_store_bits(frag_iter, offset - start,
+ from, copy))
+ goto fault;
+ if ((len -= copy) == 0)
+ return 0;
+ offset += copy;
+ from += copy;
}
+ start = end;
}
if (!len)
return 0;
@@ -1235,16 +3484,14 @@ int skb_store_bits(const struct sk_buff *skb, int offset, void *from, int len)
fault:
return -EFAULT;
}
-
EXPORT_SYMBOL(skb_store_bits);
/* Checksum skb data. */
-
-__wsum skb_checksum(const struct sk_buff *skb, int offset,
- int len, __wsum csum)
+__wsum skb_checksum(const struct sk_buff *skb, int offset, int len, __wsum csum)
{
int start = skb_headlen(skb);
int i, copy = start - offset;
+ struct sk_buff *frag_iter;
int pos = 0;
/* Checksum header. */
@@ -1258,76 +3505,85 @@ __wsum skb_checksum(const struct sk_buff *skb, int offset,
pos = copy;
}
+ if (WARN_ON_ONCE(!skb_frags_readable(skb)))
+ return 0;
+
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
int end;
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
- BUG_TRAP(start <= offset + len);
+ WARN_ON(start > offset + len);
- end = start + skb_shinfo(skb)->frags[i].size;
+ end = start + skb_frag_size(frag);
if ((copy = end - offset) > 0) {
+ u32 p_off, p_len, copied;
+ struct page *p;
__wsum csum2;
u8 *vaddr;
- skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
if (copy > len)
copy = len;
- vaddr = kmap_skb_frag(frag);
- csum2 = csum_partial(vaddr + frag->page_offset +
- offset - start, copy, 0);
- kunmap_skb_frag(vaddr);
- csum = csum_block_add(csum, csum2, pos);
+
+ skb_frag_foreach_page(frag,
+ skb_frag_off(frag) + offset - start,
+ copy, p, p_off, p_len, copied) {
+ vaddr = kmap_atomic(p);
+ csum2 = csum_partial(vaddr + p_off, p_len, 0);
+ kunmap_atomic(vaddr);
+ csum = csum_block_add(csum, csum2, pos);
+ pos += p_len;
+ }
+
if (!(len -= copy))
return csum;
offset += copy;
- pos += copy;
}
start = end;
}
- if (skb_shinfo(skb)->frag_list) {
- struct sk_buff *list = skb_shinfo(skb)->frag_list;
-
- for (; list; list = list->next) {
- int end;
+ skb_walk_frags(skb, frag_iter) {
+ int end;
- BUG_TRAP(start <= offset + len);
+ WARN_ON(start > offset + len);
- end = start + list->len;
- if ((copy = end - offset) > 0) {
- __wsum csum2;
- if (copy > len)
- copy = len;
- csum2 = skb_checksum(list, offset - start,
- copy, 0);
- csum = csum_block_add(csum, csum2, pos);
- if ((len -= copy) == 0)
- return csum;
- offset += copy;
- pos += copy;
- }
- start = end;
+ end = start + frag_iter->len;
+ if ((copy = end - offset) > 0) {
+ __wsum csum2;
+ if (copy > len)
+ copy = len;
+ csum2 = skb_checksum(frag_iter, offset - start, copy,
+ 0);
+ csum = csum_block_add(csum, csum2, pos);
+ if ((len -= copy) == 0)
+ return csum;
+ offset += copy;
+ pos += copy;
}
+ start = end;
}
BUG_ON(len);
return csum;
}
+EXPORT_SYMBOL(skb_checksum);
/* Both of above in one bottle. */
__wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
- u8 *to, int len, __wsum csum)
+ u8 *to, int len)
{
int start = skb_headlen(skb);
int i, copy = start - offset;
+ struct sk_buff *frag_iter;
int pos = 0;
+ __wsum csum = 0;
/* Copy header. */
if (copy > 0) {
if (copy > len)
copy = len;
csum = csum_partial_copy_nocheck(skb->data + offset, to,
- copy, csum);
+ copy);
if ((len -= copy) == 0)
return csum;
offset += copy;
@@ -1335,64 +3591,307 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
pos = copy;
}
+ if (!skb_frags_readable(skb))
+ return 0;
+
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
int end;
- BUG_TRAP(start <= offset + len);
+ WARN_ON(start > offset + len);
- end = start + skb_shinfo(skb)->frags[i].size;
+ end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]);
if ((copy = end - offset) > 0) {
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+ u32 p_off, p_len, copied;
+ struct page *p;
__wsum csum2;
u8 *vaddr;
- skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
if (copy > len)
copy = len;
- vaddr = kmap_skb_frag(frag);
- csum2 = csum_partial_copy_nocheck(vaddr +
- frag->page_offset +
- offset - start, to,
- copy, 0);
- kunmap_skb_frag(vaddr);
- csum = csum_block_add(csum, csum2, pos);
+
+ skb_frag_foreach_page(frag,
+ skb_frag_off(frag) + offset - start,
+ copy, p, p_off, p_len, copied) {
+ vaddr = kmap_atomic(p);
+ csum2 = csum_partial_copy_nocheck(vaddr + p_off,
+ to + copied,
+ p_len);
+ kunmap_atomic(vaddr);
+ csum = csum_block_add(csum, csum2, pos);
+ pos += p_len;
+ }
+
if (!(len -= copy))
return csum;
offset += copy;
to += copy;
+ }
+ start = end;
+ }
+
+ skb_walk_frags(skb, frag_iter) {
+ __wsum csum2;
+ int end;
+
+ WARN_ON(start > offset + len);
+
+ end = start + frag_iter->len;
+ if ((copy = end - offset) > 0) {
+ if (copy > len)
+ copy = len;
+ csum2 = skb_copy_and_csum_bits(frag_iter,
+ offset - start,
+ to, copy);
+ csum = csum_block_add(csum, csum2, pos);
+ if ((len -= copy) == 0)
+ return csum;
+ offset += copy;
+ to += copy;
pos += copy;
}
start = end;
}
+ BUG_ON(len);
+ return csum;
+}
+EXPORT_SYMBOL(skb_copy_and_csum_bits);
- if (skb_shinfo(skb)->frag_list) {
- struct sk_buff *list = skb_shinfo(skb)->frag_list;
+#ifdef CONFIG_NET_CRC32C
+u32 skb_crc32c(const struct sk_buff *skb, int offset, int len, u32 crc)
+{
+ int start = skb_headlen(skb);
+ int i, copy = start - offset;
+ struct sk_buff *frag_iter;
- for (; list; list = list->next) {
- __wsum csum2;
- int end;
+ if (copy > 0) {
+ copy = min(copy, len);
+ crc = crc32c(crc, skb->data + offset, copy);
+ len -= copy;
+ if (len == 0)
+ return crc;
+ offset += copy;
+ }
- BUG_TRAP(start <= offset + len);
+ if (WARN_ON_ONCE(!skb_frags_readable(skb)))
+ return 0;
- end = start + list->len;
- if ((copy = end - offset) > 0) {
- if (copy > len)
- copy = len;
- csum2 = skb_copy_and_csum_bits(list,
- offset - start,
- to, copy, 0);
- csum = csum_block_add(csum, csum2, pos);
- if ((len -= copy) == 0)
- return csum;
- offset += copy;
- to += copy;
- pos += copy;
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ int end;
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+
+ WARN_ON(start > offset + len);
+
+ end = start + skb_frag_size(frag);
+ copy = end - offset;
+ if (copy > 0) {
+ u32 p_off, p_len, copied;
+ struct page *p;
+ u8 *vaddr;
+
+ copy = min(copy, len);
+ skb_frag_foreach_page(frag,
+ skb_frag_off(frag) + offset - start,
+ copy, p, p_off, p_len, copied) {
+ vaddr = kmap_atomic(p);
+ crc = crc32c(crc, vaddr + p_off, p_len);
+ kunmap_atomic(vaddr);
}
- start = end;
+ len -= copy;
+ if (len == 0)
+ return crc;
+ offset += copy;
}
+ start = end;
+ }
+
+ skb_walk_frags(skb, frag_iter) {
+ int end;
+
+ WARN_ON(start > offset + len);
+
+ end = start + frag_iter->len;
+ copy = end - offset;
+ if (copy > 0) {
+ copy = min(copy, len);
+ crc = skb_crc32c(frag_iter, offset - start, copy, crc);
+ len -= copy;
+ if (len == 0)
+ return crc;
+ offset += copy;
+ }
+ start = end;
}
BUG_ON(len);
- return csum;
+
+ return crc;
+}
+EXPORT_SYMBOL(skb_crc32c);
+#endif /* CONFIG_NET_CRC32C */
+
+__sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len)
+{
+ __sum16 sum;
+
+ sum = csum_fold(skb_checksum(skb, 0, len, skb->csum));
+ /* See comments in __skb_checksum_complete(). */
+ if (likely(!sum)) {
+ if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
+ !skb->csum_complete_sw)
+ netdev_rx_csum_fault(skb->dev, skb);
+ }
+ if (!skb_shared(skb))
+ skb->csum_valid = !sum;
+ return sum;
+}
+EXPORT_SYMBOL(__skb_checksum_complete_head);
+
+/* This function assumes skb->csum already holds pseudo header's checksum,
+ * which has been changed from the hardware checksum, for example, by
+ * __skb_checksum_validate_complete(). And, the original skb->csum must
+ * have been validated unsuccessfully for CHECKSUM_COMPLETE case.
+ *
+ * It returns non-zero if the recomputed checksum is still invalid, otherwise
+ * zero. The new checksum is stored back into skb->csum unless the skb is
+ * shared.
+ */
+__sum16 __skb_checksum_complete(struct sk_buff *skb)
+{
+ __wsum csum;
+ __sum16 sum;
+
+ csum = skb_checksum(skb, 0, skb->len, 0);
+
+ sum = csum_fold(csum_add(skb->csum, csum));
+ /* This check is inverted, because we already knew the hardware
+ * checksum is invalid before calling this function. So, if the
+ * re-computed checksum is valid instead, then we have a mismatch
+ * between the original skb->csum and skb_checksum(). This means either
+ * the original hardware checksum is incorrect or we screw up skb->csum
+ * when moving skb->data around.
+ */
+ if (likely(!sum)) {
+ if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
+ !skb->csum_complete_sw)
+ netdev_rx_csum_fault(skb->dev, skb);
+ }
+
+ if (!skb_shared(skb)) {
+ /* Save full packet checksum */
+ skb->csum = csum;
+ skb->ip_summed = CHECKSUM_COMPLETE;
+ skb->csum_complete_sw = 1;
+ skb->csum_valid = !sum;
+ }
+
+ return sum;
+}
+EXPORT_SYMBOL(__skb_checksum_complete);
+
+ /**
+ * skb_zerocopy_headlen - Calculate headroom needed for skb_zerocopy()
+ * @from: source buffer
+ *
+ * Calculates the amount of linear headroom needed in the 'to' skb passed
+ * into skb_zerocopy().
+ */
+unsigned int
+skb_zerocopy_headlen(const struct sk_buff *from)
+{
+ unsigned int hlen = 0;
+
+ if (!from->head_frag ||
+ skb_headlen(from) < L1_CACHE_BYTES ||
+ skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS) {
+ hlen = skb_headlen(from);
+ if (!hlen)
+ hlen = from->len;
+ }
+
+ if (skb_has_frag_list(from))
+ hlen = from->len;
+
+ return hlen;
+}
+EXPORT_SYMBOL_GPL(skb_zerocopy_headlen);
+
+/**
+ * skb_zerocopy - Zero copy skb to skb
+ * @to: destination buffer
+ * @from: source buffer
+ * @len: number of bytes to copy from source buffer
+ * @hlen: size of linear headroom in destination buffer
+ *
+ * Copies up to `len` bytes from `from` to `to` by creating references
+ * to the frags in the source buffer.
+ *
+ * The `hlen` as calculated by skb_zerocopy_headlen() specifies the
+ * headroom in the `to` buffer.
+ *
+ * Return value:
+ * 0: everything is OK
+ * -ENOMEM: couldn't orphan frags of @from due to lack of memory
+ * -EFAULT: skb_copy_bits() found some problem with skb geometry
+ */
+int
+skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen)
+{
+ int i, j = 0;
+ int plen = 0; /* length of skb->head fragment */
+ int ret;
+ struct page *page;
+ unsigned int offset;
+
+ BUG_ON(!from->head_frag && !hlen);
+
+ /* dont bother with small payloads */
+ if (len <= skb_tailroom(to))
+ return skb_copy_bits(from, 0, skb_put(to, len), len);
+
+ if (hlen) {
+ ret = skb_copy_bits(from, 0, skb_put(to, hlen), hlen);
+ if (unlikely(ret))
+ return ret;
+ len -= hlen;
+ } else {
+ plen = min_t(int, skb_headlen(from), len);
+ if (plen) {
+ page = virt_to_head_page(from->head);
+ offset = from->data - (unsigned char *)page_address(page);
+ __skb_fill_netmem_desc(to, 0, page_to_netmem(page),
+ offset, plen);
+ get_page(page);
+ j = 1;
+ len -= plen;
+ }
+ }
+
+ skb_len_add(to, len + plen);
+
+ if (unlikely(skb_orphan_frags(from, GFP_ATOMIC))) {
+ skb_tx_error(from);
+ return -ENOMEM;
+ }
+ skb_zerocopy_clone(to, from, GFP_ATOMIC);
+
+ for (i = 0; i < skb_shinfo(from)->nr_frags; i++) {
+ int size;
+
+ if (!len)
+ break;
+ skb_shinfo(to)->frags[j] = skb_shinfo(from)->frags[i];
+ size = min_t(int, skb_frag_size(&skb_shinfo(to)->frags[j]),
+ len);
+ skb_frag_size_set(&skb_shinfo(to)->frags[j], size);
+ len -= size;
+ skb_frag_ref(to, j);
+ j++;
+ }
+ skb_shinfo(to)->nr_frags = j;
+
+ return 0;
}
+EXPORT_SYMBOL_GPL(skb_zerocopy);
void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to)
{
@@ -1400,18 +3899,18 @@ void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to)
long csstart;
if (skb->ip_summed == CHECKSUM_PARTIAL)
- csstart = skb->h.raw - skb->data;
+ csstart = skb_checksum_start_offset(skb);
else
csstart = skb_headlen(skb);
BUG_ON(csstart > skb_headlen(skb));
- memcpy(to, skb->data, csstart);
+ skb_copy_from_linear_data(skb, to, csstart);
csum = 0;
if (csstart != skb->len)
csum = skb_copy_and_csum_bits(skb, csstart, to + csstart,
- skb->len - csstart, 0);
+ skb->len - csstart);
if (skb->ip_summed == CHECKSUM_PARTIAL) {
long csstuff = csstart + skb->csum_offset;
@@ -1419,6 +3918,7 @@ void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to)
*((__sum16 *)(to + csstuff)) = csum_fold(csum);
}
}
+EXPORT_SYMBOL(skb_copy_and_csum_dev);
/**
* skb_dequeue - remove from the head of the queue
@@ -1439,6 +3939,7 @@ struct sk_buff *skb_dequeue(struct sk_buff_head *list)
spin_unlock_irqrestore(&list->lock, flags);
return result;
}
+EXPORT_SYMBOL(skb_dequeue);
/**
* skb_dequeue_tail - remove from the tail of the queue
@@ -1458,22 +3959,83 @@ struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list)
spin_unlock_irqrestore(&list->lock, flags);
return result;
}
+EXPORT_SYMBOL(skb_dequeue_tail);
/**
- * skb_queue_purge - empty a list
+ * skb_queue_purge_reason - empty a list
* @list: list to empty
+ * @reason: drop reason
*
* Delete all buffers on an &sk_buff list. Each buffer is removed from
* the list and one reference dropped. This function takes the list
* lock and is atomic with respect to other list locking functions.
*/
-void skb_queue_purge(struct sk_buff_head *list)
+void skb_queue_purge_reason(struct sk_buff_head *list,
+ enum skb_drop_reason reason)
{
- struct sk_buff *skb;
- while ((skb = skb_dequeue(list)) != NULL)
+ struct sk_buff_head tmp;
+ unsigned long flags;
+
+ if (skb_queue_empty_lockless(list))
+ return;
+
+ __skb_queue_head_init(&tmp);
+
+ spin_lock_irqsave(&list->lock, flags);
+ skb_queue_splice_init(list, &tmp);
+ spin_unlock_irqrestore(&list->lock, flags);
+
+ __skb_queue_purge_reason(&tmp, reason);
+}
+EXPORT_SYMBOL(skb_queue_purge_reason);
+
+/**
+ * skb_rbtree_purge - empty a skb rbtree
+ * @root: root of the rbtree to empty
+ * Return value: the sum of truesizes of all purged skbs.
+ *
+ * Delete all buffers on an &sk_buff rbtree. Each buffer is removed from
+ * the list and one reference dropped. This function does not take
+ * any lock. Synchronization should be handled by the caller (e.g., TCP
+ * out-of-order queue is protected by the socket lock).
+ */
+unsigned int skb_rbtree_purge(struct rb_root *root)
+{
+ struct rb_node *p = rb_first(root);
+ unsigned int sum = 0;
+
+ while (p) {
+ struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode);
+
+ p = rb_next(p);
+ rb_erase(&skb->rbnode, root);
+ sum += skb->truesize;
kfree_skb(skb);
+ }
+ return sum;
}
+void skb_errqueue_purge(struct sk_buff_head *list)
+{
+ struct sk_buff *skb, *next;
+ struct sk_buff_head kill;
+ unsigned long flags;
+
+ __skb_queue_head_init(&kill);
+
+ spin_lock_irqsave(&list->lock, flags);
+ skb_queue_walk_safe(list, skb, next) {
+ if (SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ZEROCOPY ||
+ SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_TIMESTAMPING)
+ continue;
+ __skb_unlink(skb, list);
+ __skb_queue_tail(&kill, skb);
+ }
+ spin_unlock_irqrestore(&list->lock, flags);
+ __skb_queue_purge(&kill);
+}
+EXPORT_SYMBOL(skb_errqueue_purge);
+
/**
* skb_queue_head - queue a buffer at the list head
* @list: list to use
@@ -1493,6 +4055,7 @@ void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk)
__skb_queue_head(list, newsk);
spin_unlock_irqrestore(&list->lock, flags);
}
+EXPORT_SYMBOL(skb_queue_head);
/**
* skb_queue_tail - queue a buffer at the list tail
@@ -1513,6 +4076,7 @@ void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk)
__skb_queue_tail(list, newsk);
spin_unlock_irqrestore(&list->lock, flags);
}
+EXPORT_SYMBOL(skb_queue_tail);
/**
* skb_unlink - remove a buffer from a list
@@ -1532,6 +4096,7 @@ void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list)
__skb_unlink(skb, list);
spin_unlock_irqrestore(&list->lock, flags);
}
+EXPORT_SYMBOL(skb_unlink);
/**
* skb_append - append a buffer
@@ -1548,44 +4113,10 @@ void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head
unsigned long flags;
spin_lock_irqsave(&list->lock, flags);
- __skb_append(old, newsk, list);
+ __skb_queue_after(list, old, newsk);
spin_unlock_irqrestore(&list->lock, flags);
}
-
-
-/**
- * skb_insert - insert a buffer
- * @old: buffer to insert before
- * @newsk: buffer to insert
- * @list: list to use
- *
- * Place a packet before a given packet in a list. The list locks are
- * taken and this function is atomic with respect to other list locked
- * calls.
- *
- * A buffer cannot be placed on two lists at the same time.
- */
-void skb_insert(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&list->lock, flags);
- __skb_insert(newsk, old->prev, old, list);
- spin_unlock_irqrestore(&list->lock, flags);
-}
-
-#if 0
-/*
- * Tune the memory allocator for a new MTU size.
- */
-void skb_add_mtu(int mtu)
-{
- /* Must match allocation in alloc_skb */
- mtu = SKB_DATA_ALIGN(mtu) + sizeof(struct skb_shared_info);
-
- kmem_add_cache_size(mtu);
-}
-#endif
+EXPORT_SYMBOL(skb_append);
static inline void skb_split_inside_header(struct sk_buff *skb,
struct sk_buff* skb1,
@@ -1593,19 +4124,20 @@ static inline void skb_split_inside_header(struct sk_buff *skb,
{
int i;
- memcpy(skb_put(skb1, pos - len), skb->data + len, pos - len);
-
+ skb_copy_from_linear_data_offset(skb, len, skb_put(skb1, pos - len),
+ pos - len);
/* And move data appendix as is. */
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i];
skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags;
+ skb1->unreadable = skb->unreadable;
skb_shinfo(skb)->nr_frags = 0;
skb1->data_len = skb->data_len;
skb1->len += skb1->data_len;
skb->data_len = 0;
skb->len = len;
- skb->tail = skb->data + len;
+ skb_set_tail_pointer(skb, len);
}
static inline void skb_split_no_header(struct sk_buff *skb,
@@ -1621,7 +4153,7 @@ static inline void skb_split_no_header(struct sk_buff *skb,
skb->data_len = len - pos;
for (i = 0; i < nfrags; i++) {
- int size = skb_shinfo(skb)->frags[i].size;
+ int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
if (pos + size > len) {
skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i];
@@ -1635,10 +4167,10 @@ static inline void skb_split_no_header(struct sk_buff *skb,
* where splitting is expensive.
* 2. Split is accurately. We make this.
*/
- get_page(skb_shinfo(skb)->frags[i].page);
- skb_shinfo(skb1)->frags[0].page_offset += len - pos;
- skb_shinfo(skb1)->frags[0].size -= len - pos;
- skb_shinfo(skb)->frags[i].size = len - pos;
+ skb_frag_ref(skb, i);
+ skb_frag_off_add(&skb_shinfo(skb1)->frags[0], len - pos);
+ skb_frag_size_sub(&skb_shinfo(skb1)->frags[0], len - pos);
+ skb_frag_size_set(&skb_shinfo(skb)->frags[i], len - pos);
skb_shinfo(skb)->nr_frags++;
}
k++;
@@ -1647,6 +4179,8 @@ static inline void skb_split_no_header(struct sk_buff *skb,
pos += size;
}
skb_shinfo(skb1)->nr_frags = k;
+
+ skb1->unreadable = skb->unreadable;
}
/**
@@ -1658,12 +4192,162 @@ static inline void skb_split_no_header(struct sk_buff *skb,
void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)
{
int pos = skb_headlen(skb);
+ const int zc_flags = SKBFL_SHARED_FRAG | SKBFL_PURE_ZEROCOPY;
+
+ skb_zcopy_downgrade_managed(skb);
+ skb_shinfo(skb1)->flags |= skb_shinfo(skb)->flags & zc_flags;
+ skb_zerocopy_clone(skb1, skb, 0);
if (len < pos) /* Split line is inside header. */
skb_split_inside_header(skb, skb1, len, pos);
else /* Second chunk has no header, nothing to copy. */
skb_split_no_header(skb, skb1, len, pos);
}
+EXPORT_SYMBOL(skb_split);
+
+/* Shifting from/to a cloned skb is a no-go.
+ *
+ * Caller cannot keep skb_shinfo related pointers past calling here!
+ */
+static int skb_prepare_for_shift(struct sk_buff *skb)
+{
+ return skb_unclone_keeptruesize(skb, GFP_ATOMIC);
+}
+
+/**
+ * skb_shift - Shifts paged data partially from skb to another
+ * @tgt: buffer into which tail data gets added
+ * @skb: buffer from which the paged data comes from
+ * @shiftlen: shift up to this many bytes
+ *
+ * Attempts to shift up to shiftlen worth of bytes, which may be less than
+ * the length of the skb, from skb to tgt. Returns number bytes shifted.
+ * It's up to caller to free skb if everything was shifted.
+ *
+ * If @tgt runs out of frags, the whole operation is aborted.
+ *
+ * Skb cannot include anything else but paged data while tgt is allowed
+ * to have non-paged data as well.
+ *
+ * TODO: full sized shift could be optimized but that would need
+ * specialized skb free'er to handle frags without up-to-date nr_frags.
+ */
+int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
+{
+ int from, to, merge, todo;
+ skb_frag_t *fragfrom, *fragto;
+
+ BUG_ON(shiftlen > skb->len);
+
+ if (skb_headlen(skb))
+ return 0;
+ if (skb_zcopy(tgt) || skb_zcopy(skb))
+ return 0;
+
+ DEBUG_NET_WARN_ON_ONCE(tgt->pp_recycle != skb->pp_recycle);
+ DEBUG_NET_WARN_ON_ONCE(skb_cmp_decrypted(tgt, skb));
+
+ todo = shiftlen;
+ from = 0;
+ to = skb_shinfo(tgt)->nr_frags;
+ fragfrom = &skb_shinfo(skb)->frags[from];
+
+ /* Actual merge is delayed until the point when we know we can
+ * commit all, so that we don't have to undo partial changes
+ */
+ if (!skb_can_coalesce(tgt, to, skb_frag_page(fragfrom),
+ skb_frag_off(fragfrom))) {
+ merge = -1;
+ } else {
+ merge = to - 1;
+
+ todo -= skb_frag_size(fragfrom);
+ if (todo < 0) {
+ if (skb_prepare_for_shift(skb) ||
+ skb_prepare_for_shift(tgt))
+ return 0;
+
+ /* All previous frag pointers might be stale! */
+ fragfrom = &skb_shinfo(skb)->frags[from];
+ fragto = &skb_shinfo(tgt)->frags[merge];
+
+ skb_frag_size_add(fragto, shiftlen);
+ skb_frag_size_sub(fragfrom, shiftlen);
+ skb_frag_off_add(fragfrom, shiftlen);
+
+ goto onlymerged;
+ }
+
+ from++;
+ }
+
+ /* Skip full, not-fitting skb to avoid expensive operations */
+ if ((shiftlen == skb->len) &&
+ (skb_shinfo(skb)->nr_frags - from) > (MAX_SKB_FRAGS - to))
+ return 0;
+
+ if (skb_prepare_for_shift(skb) || skb_prepare_for_shift(tgt))
+ return 0;
+
+ while ((todo > 0) && (from < skb_shinfo(skb)->nr_frags)) {
+ if (to == MAX_SKB_FRAGS)
+ return 0;
+
+ fragfrom = &skb_shinfo(skb)->frags[from];
+ fragto = &skb_shinfo(tgt)->frags[to];
+
+ if (todo >= skb_frag_size(fragfrom)) {
+ *fragto = *fragfrom;
+ todo -= skb_frag_size(fragfrom);
+ from++;
+ to++;
+
+ } else {
+ __skb_frag_ref(fragfrom);
+ skb_frag_page_copy(fragto, fragfrom);
+ skb_frag_off_copy(fragto, fragfrom);
+ skb_frag_size_set(fragto, todo);
+
+ skb_frag_off_add(fragfrom, todo);
+ skb_frag_size_sub(fragfrom, todo);
+ todo = 0;
+
+ to++;
+ break;
+ }
+ }
+
+ /* Ready to "commit" this state change to tgt */
+ skb_shinfo(tgt)->nr_frags = to;
+
+ if (merge >= 0) {
+ fragfrom = &skb_shinfo(skb)->frags[0];
+ fragto = &skb_shinfo(tgt)->frags[merge];
+
+ skb_frag_size_add(fragto, skb_frag_size(fragfrom));
+ __skb_frag_unref(fragfrom, skb->pp_recycle);
+ }
+
+ /* Reposition in the original skb */
+ to = 0;
+ while (from < skb_shinfo(skb)->nr_frags)
+ skb_shinfo(skb)->frags[to++] = skb_shinfo(skb)->frags[from++];
+ skb_shinfo(skb)->nr_frags = to;
+
+ BUG_ON(todo > 0 && !skb_shinfo(skb)->nr_frags);
+
+onlymerged:
+ /* Most likely the tgt won't ever need its checksum anymore, skb on
+ * the other hand might need it if it needs to be resent
+ */
+ tgt->ip_summed = CHECKSUM_PARTIAL;
+ skb->ip_summed = CHECKSUM_PARTIAL;
+
+ skb_len_add(skb, -shiftlen);
+ skb_len_add(tgt, shiftlen);
+
+ return shiftlen;
+}
/**
* skb_prepare_seq_read - Prepare a sequential read of skb data
@@ -1683,7 +4367,9 @@ void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from,
st->root_skb = st->cur_skb = skb;
st->frag_idx = st->stepped_offset = 0;
st->frag_data = NULL;
+ st->frag_off = 0;
}
+EXPORT_SYMBOL(skb_prepare_seq_read);
/**
* skb_seq_read - Sequentially read skb data
@@ -1691,22 +4377,22 @@ void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from,
* @data: destination pointer for data to be returned
* @st: state variable
*
- * Reads a block of skb data at &consumed relative to the
+ * Reads a block of skb data at @consumed relative to the
* lower offset specified to skb_prepare_seq_read(). Assigns
- * the head of the data block to &data and returns the length
+ * the head of the data block to @data and returns the length
* of the block or 0 if the end of the skb data or the upper
* offset has been reached.
*
* The caller is not required to consume all of the data
- * returned, i.e. &consumed is typically set to the number
+ * returned, i.e. @consumed is typically set to the number
* of bytes already consumed and the next call to
* skb_seq_read() will return the remaining part of the block.
*
- * Note: The size of each block of data returned can be arbitary,
- * this limitation is the cost for zerocopy seqeuental
+ * Note 1: The size of each block of data returned can be arbitrary,
+ * this limitation is the cost for zerocopy sequential
* reads of potentially non linear data.
*
- * Note: Fragment lists within fragments are not implemented
+ * Note 2: Fragment lists within fragments are not implemented
* at the moment, state->root_skb could be replaced with
* a stack for this purpose.
*/
@@ -1716,55 +4402,86 @@ unsigned int skb_seq_read(unsigned int consumed, const u8 **data,
unsigned int block_limit, abs_offset = consumed + st->lower_offset;
skb_frag_t *frag;
- if (unlikely(abs_offset >= st->upper_offset))
+ if (unlikely(abs_offset >= st->upper_offset)) {
+ if (st->frag_data) {
+ kunmap_atomic(st->frag_data);
+ st->frag_data = NULL;
+ }
return 0;
+ }
next_skb:
- block_limit = skb_headlen(st->cur_skb);
+ block_limit = skb_headlen(st->cur_skb) + st->stepped_offset;
- if (abs_offset < block_limit) {
- *data = st->cur_skb->data + abs_offset;
+ if (abs_offset < block_limit && !st->frag_data) {
+ *data = st->cur_skb->data + (abs_offset - st->stepped_offset);
return block_limit - abs_offset;
}
+ if (!skb_frags_readable(st->cur_skb))
+ return 0;
+
if (st->frag_idx == 0 && !st->frag_data)
st->stepped_offset += skb_headlen(st->cur_skb);
while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) {
+ unsigned int pg_idx, pg_off, pg_sz;
+
frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx];
- block_limit = frag->size + st->stepped_offset;
+ pg_idx = 0;
+ pg_off = skb_frag_off(frag);
+ pg_sz = skb_frag_size(frag);
+
+ if (skb_frag_must_loop(skb_frag_page(frag))) {
+ pg_idx = (pg_off + st->frag_off) >> PAGE_SHIFT;
+ pg_off = offset_in_page(pg_off + st->frag_off);
+ pg_sz = min_t(unsigned int, pg_sz - st->frag_off,
+ PAGE_SIZE - pg_off);
+ }
+
+ block_limit = pg_sz + st->stepped_offset;
if (abs_offset < block_limit) {
if (!st->frag_data)
- st->frag_data = kmap_skb_frag(frag);
+ st->frag_data = kmap_atomic(skb_frag_page(frag) + pg_idx);
- *data = (u8 *) st->frag_data + frag->page_offset +
+ *data = (u8 *)st->frag_data + pg_off +
(abs_offset - st->stepped_offset);
return block_limit - abs_offset;
}
if (st->frag_data) {
- kunmap_skb_frag(st->frag_data);
+ kunmap_atomic(st->frag_data);
st->frag_data = NULL;
}
- st->frag_idx++;
- st->stepped_offset += frag->size;
+ st->stepped_offset += pg_sz;
+ st->frag_off += pg_sz;
+ if (st->frag_off == skb_frag_size(frag)) {
+ st->frag_off = 0;
+ st->frag_idx++;
+ }
}
- if (st->cur_skb->next) {
- st->cur_skb = st->cur_skb->next;
+ if (st->frag_data) {
+ kunmap_atomic(st->frag_data);
+ st->frag_data = NULL;
+ }
+
+ if (st->root_skb == st->cur_skb && skb_has_frag_list(st->root_skb)) {
+ st->cur_skb = skb_shinfo(st->root_skb)->frag_list;
st->frag_idx = 0;
goto next_skb;
- } else if (st->root_skb == st->cur_skb &&
- skb_shinfo(st->root_skb)->frag_list) {
- st->cur_skb = skb_shinfo(st->root_skb)->frag_list;
+ } else if (st->cur_skb->next) {
+ st->cur_skb = st->cur_skb->next;
+ st->frag_idx = 0;
goto next_skb;
}
return 0;
}
+EXPORT_SYMBOL(skb_seq_read);
/**
* skb_abort_seq_read - Abort a sequential read of skb data
@@ -1776,8 +4493,44 @@ next_skb:
void skb_abort_seq_read(struct skb_seq_state *st)
{
if (st->frag_data)
- kunmap_skb_frag(st->frag_data);
+ kunmap_atomic(st->frag_data);
+}
+EXPORT_SYMBOL(skb_abort_seq_read);
+
+/**
+ * skb_copy_seq_read() - copy from a skb_seq_state to a buffer
+ * @st: source skb_seq_state
+ * @offset: offset in source
+ * @to: destination buffer
+ * @len: number of bytes to copy
+ *
+ * Copy @len bytes from @offset bytes into the source @st to the destination
+ * buffer @to. `offset` should increase (or be unchanged) with each subsequent
+ * call to this function. If offset needs to decrease from the previous use `st`
+ * should be reset first.
+ *
+ * Return: 0 on success or -EINVAL if the copy ended early
+ */
+int skb_copy_seq_read(struct skb_seq_state *st, int offset, void *to, int len)
+{
+ const u8 *data;
+ u32 sqlen;
+
+ for (;;) {
+ sqlen = skb_seq_read(offset, &data, st);
+ if (sqlen == 0)
+ return -EINVAL;
+ if (sqlen >= len) {
+ memcpy(to, data, len);
+ return 0;
+ }
+ memcpy(to, data, sqlen);
+ to += sqlen;
+ offset += sqlen;
+ len -= sqlen;
+ }
}
+EXPORT_SYMBOL(skb_copy_seq_read);
#define TS_SKB_CB(state) ((struct skb_seq_state *) &((state)->cb))
@@ -1799,7 +4552,6 @@ static void skb_ts_finish(struct ts_config *conf, struct ts_state *state)
* @from: search offset
* @to: search limit
* @config: textsearch configuration
- * @state: uninitialized textsearch state variable
*
* Finds a pattern in the skb data according to the specified
* textsearch configuration. Use textsearch_next() to retrieve
@@ -1807,163 +4559,356 @@ static void skb_ts_finish(struct ts_config *conf, struct ts_state *state)
* to the first occurrence or UINT_MAX if no match was found.
*/
unsigned int skb_find_text(struct sk_buff *skb, unsigned int from,
- unsigned int to, struct ts_config *config,
- struct ts_state *state)
+ unsigned int to, struct ts_config *config)
{
+ unsigned int patlen = config->ops->get_pattern_len(config);
+ struct ts_state state;
unsigned int ret;
+ BUILD_BUG_ON(sizeof(struct skb_seq_state) > sizeof(state.cb));
+
config->get_next_block = skb_ts_get_next_block;
config->finish = skb_ts_finish;
- skb_prepare_seq_read(skb, from, to, TS_SKB_CB(state));
+ skb_prepare_seq_read(skb, from, to, TS_SKB_CB(&state));
- ret = textsearch_find(config, state);
- return (ret <= to - from ? ret : UINT_MAX);
+ ret = textsearch_find(config, &state);
+ return (ret + patlen <= to - from ? ret : UINT_MAX);
}
+EXPORT_SYMBOL(skb_find_text);
-/**
- * skb_append_datato_frags: - append the user data to a skb
- * @sk: sock structure
- * @skb: skb structure to be appened with user data.
- * @getfrag: call back function to be used for getting the user data
- * @from: pointer to user message iov
- * @length: length of the iov message
- *
- * Description: This procedure append the user data in the fragment part
- * of the skb if any page alloc fails user this procedure returns -ENOMEM
- */
-int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb,
- int (*getfrag)(void *from, char *to, int offset,
- int len, int odd, struct sk_buff *skb),
- void *from, int length)
+int skb_append_pagefrags(struct sk_buff *skb, struct page *page,
+ int offset, size_t size, size_t max_frags)
{
- int frg_cnt = 0;
- skb_frag_t *frag = NULL;
- struct page *page = NULL;
- int copy, left;
- int offset = 0;
- int ret;
-
- do {
- /* Return error if we don't have space for new frag */
- frg_cnt = skb_shinfo(skb)->nr_frags;
- if (frg_cnt >= MAX_SKB_FRAGS)
- return -EFAULT;
-
- /* allocate a new page for next frag */
- page = alloc_pages(sk->sk_allocation, 0);
-
- /* If alloc_page fails just return failure and caller will
- * free previous allocated pages by doing kfree_skb()
- */
- if (page == NULL)
- return -ENOMEM;
-
- /* initialize the next frag */
- sk->sk_sndmsg_page = page;
- sk->sk_sndmsg_off = 0;
- skb_fill_page_desc(skb, frg_cnt, page, 0, 0);
- skb->truesize += PAGE_SIZE;
- atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
-
- /* get the new initialized frag */
- frg_cnt = skb_shinfo(skb)->nr_frags;
- frag = &skb_shinfo(skb)->frags[frg_cnt - 1];
-
- /* copy the user data to page */
- left = PAGE_SIZE - frag->page_offset;
- copy = (length > left)? left : length;
-
- ret = getfrag(from, (page_address(frag->page) +
- frag->page_offset + frag->size),
- offset, copy, 0, skb);
- if (ret < 0)
- return -EFAULT;
-
- /* copy was successful so update the size parameters */
- sk->sk_sndmsg_off += copy;
- frag->size += copy;
- skb->len += copy;
- skb->data_len += copy;
- offset += copy;
- length -= copy;
-
- } while (length > 0);
+ int i = skb_shinfo(skb)->nr_frags;
+
+ if (skb_can_coalesce(skb, i, page, offset)) {
+ skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size);
+ } else if (i < max_frags) {
+ skb_zcopy_downgrade_managed(skb);
+ get_page(page);
+ skb_fill_page_desc_noacc(skb, i, page, offset, size);
+ } else {
+ return -EMSGSIZE;
+ }
return 0;
}
+EXPORT_SYMBOL_GPL(skb_append_pagefrags);
/**
* skb_pull_rcsum - pull skb and update receive checksum
* @skb: buffer to update
- * @start: start of data before pull
* @len: length of data pulled
*
* This function performs an skb_pull on the packet and updates
- * update the CHECKSUM_COMPLETE checksum. It should be used on
+ * the CHECKSUM_COMPLETE checksum. It should be used on
* receive path processing instead of skb_pull unless you know
* that the checksum difference is zero (e.g., a valid IP header)
* or you are setting ip_summed to CHECKSUM_NONE.
*/
-unsigned char *skb_pull_rcsum(struct sk_buff *skb, unsigned int len)
+void *skb_pull_rcsum(struct sk_buff *skb, unsigned int len)
{
+ unsigned char *data = skb->data;
+
BUG_ON(len > skb->len);
- skb->len -= len;
- BUG_ON(skb->len < skb->data_len);
- skb_postpull_rcsum(skb, skb->data, len);
- return skb->data += len;
+ __skb_pull(skb, len);
+ skb_postpull_rcsum(skb, data, len);
+ return skb->data;
}
-
EXPORT_SYMBOL_GPL(skb_pull_rcsum);
+static inline skb_frag_t skb_head_frag_to_page_desc(struct sk_buff *frag_skb)
+{
+ skb_frag_t head_frag;
+ struct page *page;
+
+ page = virt_to_head_page(frag_skb->head);
+ skb_frag_fill_page_desc(&head_frag, page, frag_skb->data -
+ (unsigned char *)page_address(page),
+ skb_headlen(frag_skb));
+ return head_frag;
+}
+
+struct sk_buff *skb_segment_list(struct sk_buff *skb,
+ netdev_features_t features,
+ unsigned int offset)
+{
+ struct sk_buff *list_skb = skb_shinfo(skb)->frag_list;
+ unsigned int tnl_hlen = skb_tnl_header_len(skb);
+ unsigned int delta_truesize = 0;
+ unsigned int delta_len = 0;
+ struct sk_buff *tail = NULL;
+ struct sk_buff *nskb, *tmp;
+ int len_diff, err;
+
+ skb_push(skb, -skb_network_offset(skb) + offset);
+
+ /* Ensure the head is writeable before touching the shared info */
+ err = skb_unclone(skb, GFP_ATOMIC);
+ if (err)
+ goto err_linearize;
+
+ skb_shinfo(skb)->frag_list = NULL;
+
+ while (list_skb) {
+ nskb = list_skb;
+ list_skb = list_skb->next;
+
+ err = 0;
+ delta_truesize += nskb->truesize;
+ if (skb_shared(nskb)) {
+ tmp = skb_clone(nskb, GFP_ATOMIC);
+ if (tmp) {
+ consume_skb(nskb);
+ nskb = tmp;
+ err = skb_unclone(nskb, GFP_ATOMIC);
+ } else {
+ err = -ENOMEM;
+ }
+ }
+
+ if (!tail)
+ skb->next = nskb;
+ else
+ tail->next = nskb;
+
+ if (unlikely(err)) {
+ nskb->next = list_skb;
+ goto err_linearize;
+ }
+
+ tail = nskb;
+
+ delta_len += nskb->len;
+
+ skb_push(nskb, -skb_network_offset(nskb) + offset);
+
+ skb_release_head_state(nskb);
+ len_diff = skb_network_header_len(nskb) - skb_network_header_len(skb);
+ __copy_skb_header(nskb, skb);
+
+ skb_headers_offset_update(nskb, skb_headroom(nskb) - skb_headroom(skb));
+ nskb->transport_header += len_diff;
+ skb_copy_from_linear_data_offset(skb, -tnl_hlen,
+ nskb->data - tnl_hlen,
+ offset + tnl_hlen);
+
+ if (skb_needs_linearize(nskb, features) &&
+ __skb_linearize(nskb))
+ goto err_linearize;
+ }
+
+ skb->truesize = skb->truesize - delta_truesize;
+ skb->data_len = skb->data_len - delta_len;
+ skb->len = skb->len - delta_len;
+
+ skb_gso_reset(skb);
+
+ skb->prev = tail;
+
+ if (skb_needs_linearize(skb, features) &&
+ __skb_linearize(skb))
+ goto err_linearize;
+
+ skb_get(skb);
+
+ return skb;
+
+err_linearize:
+ kfree_skb_list(skb->next);
+ skb->next = NULL;
+ return ERR_PTR(-ENOMEM);
+}
+EXPORT_SYMBOL_GPL(skb_segment_list);
+
/**
* skb_segment - Perform protocol segmentation on skb.
- * @skb: buffer to segment
+ * @head_skb: buffer to segment
* @features: features for the output path (see dev->features)
*
* This function performs segmentation on the given skb. It returns
- * the segment at the given position. It returns NULL if there are
- * no more segments to generate, or when an error is encountered.
+ * a pointer to the first in a list of new skbs for the segments.
+ * In case of error it returns ERR_PTR(err).
*/
-struct sk_buff *skb_segment(struct sk_buff *skb, int features)
+struct sk_buff *skb_segment(struct sk_buff *head_skb,
+ netdev_features_t features)
{
struct sk_buff *segs = NULL;
struct sk_buff *tail = NULL;
- unsigned int mss = skb_shinfo(skb)->gso_size;
- unsigned int doffset = skb->data - skb->mac.raw;
+ struct sk_buff *list_skb = skb_shinfo(head_skb)->frag_list;
+ unsigned int mss = skb_shinfo(head_skb)->gso_size;
+ unsigned int doffset = head_skb->data - skb_mac_header(head_skb);
unsigned int offset = doffset;
+ unsigned int tnl_hlen = skb_tnl_header_len(head_skb);
+ unsigned int partial_segs = 0;
unsigned int headroom;
- unsigned int len;
- int sg = features & NETIF_F_SG;
- int nfrags = skb_shinfo(skb)->nr_frags;
+ unsigned int len = head_skb->len;
+ struct sk_buff *frag_skb;
+ skb_frag_t *frag;
+ __be16 proto;
+ bool csum, sg;
int err = -ENOMEM;
int i = 0;
- int pos;
+ int nfrags, pos;
+
+ if ((skb_shinfo(head_skb)->gso_type & SKB_GSO_DODGY) &&
+ mss != GSO_BY_FRAGS && mss != skb_headlen(head_skb)) {
+ struct sk_buff *check_skb;
+
+ for (check_skb = list_skb; check_skb; check_skb = check_skb->next) {
+ if (skb_headlen(check_skb) && !check_skb->head_frag) {
+ /* gso_size is untrusted, and we have a frag_list with
+ * a linear non head_frag item.
+ *
+ * If head_skb's headlen does not fit requested gso_size,
+ * it means that the frag_list members do NOT terminate
+ * on exact gso_size boundaries. Hence we cannot perform
+ * skb_frag_t page sharing. Therefore we must fallback to
+ * copying the frag_list skbs; we do so by disabling SG.
+ */
+ features &= ~NETIF_F_SG;
+ break;
+ }
+ }
+ }
- __skb_push(skb, doffset);
- headroom = skb_headroom(skb);
- pos = skb_headlen(skb);
+ __skb_push(head_skb, doffset);
+ proto = skb_network_protocol(head_skb, NULL);
+ if (unlikely(!proto))
+ return ERR_PTR(-EINVAL);
+
+ sg = !!(features & NETIF_F_SG);
+ csum = !!can_checksum_protocol(features, proto);
+
+ if (sg && csum && (mss != GSO_BY_FRAGS)) {
+ if (!(features & NETIF_F_GSO_PARTIAL)) {
+ struct sk_buff *iter;
+ unsigned int frag_len;
+
+ if (!list_skb ||
+ !net_gso_ok(features, skb_shinfo(head_skb)->gso_type))
+ goto normal;
+
+ /* If we get here then all the required
+ * GSO features except frag_list are supported.
+ * Try to split the SKB to multiple GSO SKBs
+ * with no frag_list.
+ * Currently we can do that only when the buffers don't
+ * have a linear part and all the buffers except
+ * the last are of the same length.
+ */
+ frag_len = list_skb->len;
+ skb_walk_frags(head_skb, iter) {
+ if (frag_len != iter->len && iter->next)
+ goto normal;
+ if (skb_headlen(iter) && !iter->head_frag)
+ goto normal;
+
+ len -= iter->len;
+ }
+
+ if (len != frag_len)
+ goto normal;
+ }
+
+ /* GSO partial only requires that we trim off any excess that
+ * doesn't fit into an MSS sized block, so take care of that
+ * now.
+ * Cap len to not accidentally hit GSO_BY_FRAGS.
+ */
+ partial_segs = min(len, GSO_BY_FRAGS - 1) / mss;
+ if (partial_segs > 1)
+ mss *= partial_segs;
+ else
+ partial_segs = 0;
+ }
+
+normal:
+ headroom = skb_headroom(head_skb);
+ pos = skb_headlen(head_skb);
+
+ if (skb_orphan_frags(head_skb, GFP_ATOMIC))
+ return ERR_PTR(-ENOMEM);
+
+ nfrags = skb_shinfo(head_skb)->nr_frags;
+ frag = skb_shinfo(head_skb)->frags;
+ frag_skb = head_skb;
do {
struct sk_buff *nskb;
- skb_frag_t *frag;
+ skb_frag_t *nskb_frag;
int hsize;
- int k;
int size;
- len = skb->len - offset;
- if (len > mss)
- len = mss;
+ if (unlikely(mss == GSO_BY_FRAGS)) {
+ len = list_skb->len;
+ } else {
+ len = head_skb->len - offset;
+ if (len > mss)
+ len = mss;
+ }
- hsize = skb_headlen(skb) - offset;
- if (hsize < 0)
- hsize = 0;
- if (hsize > len || !sg)
- hsize = len;
+ hsize = skb_headlen(head_skb) - offset;
- nskb = alloc_skb(hsize + doffset + headroom, GFP_ATOMIC);
- if (unlikely(!nskb))
- goto err;
+ if (hsize <= 0 && i >= nfrags && skb_headlen(list_skb) &&
+ (skb_headlen(list_skb) == len || sg)) {
+ BUG_ON(skb_headlen(list_skb) > len);
+
+ nskb = skb_clone(list_skb, GFP_ATOMIC);
+ if (unlikely(!nskb))
+ goto err;
+
+ i = 0;
+ nfrags = skb_shinfo(list_skb)->nr_frags;
+ frag = skb_shinfo(list_skb)->frags;
+ frag_skb = list_skb;
+ pos += skb_headlen(list_skb);
+
+ while (pos < offset + len) {
+ BUG_ON(i >= nfrags);
+
+ size = skb_frag_size(frag);
+ if (pos + size > offset + len)
+ break;
+
+ i++;
+ pos += size;
+ frag++;
+ }
+
+ list_skb = list_skb->next;
+
+ if (unlikely(pskb_trim(nskb, len))) {
+ kfree_skb(nskb);
+ goto err;
+ }
+
+ hsize = skb_end_offset(nskb);
+ if (skb_cow_head(nskb, doffset + headroom)) {
+ kfree_skb(nskb);
+ goto err;
+ }
+
+ nskb->truesize += skb_end_offset(nskb) - hsize;
+ skb_release_head_state(nskb);
+ __skb_push(nskb, doffset);
+ } else {
+ if (hsize < 0)
+ hsize = 0;
+ if (hsize > len || !sg)
+ hsize = len;
+
+ nskb = __alloc_skb(hsize + doffset + headroom,
+ GFP_ATOMIC, skb_alloc_rx_flag(head_skb),
+ NUMA_NO_NODE);
+
+ if (unlikely(!nskb))
+ goto err;
+
+ skb_reserve(nskb, headroom);
+ __skb_put(nskb, doffset);
+ }
if (segs)
tail->next = nskb;
@@ -1971,123 +4916,2504 @@ struct sk_buff *skb_segment(struct sk_buff *skb, int features)
segs = nskb;
tail = nskb;
- nskb->dev = skb->dev;
- nskb->priority = skb->priority;
- nskb->protocol = skb->protocol;
- nskb->dst = dst_clone(skb->dst);
- memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
- nskb->pkt_type = skb->pkt_type;
- nskb->mac_len = skb->mac_len;
+ __copy_skb_header(nskb, head_skb);
- skb_reserve(nskb, headroom);
- nskb->mac.raw = nskb->data;
- nskb->nh.raw = nskb->data + skb->mac_len;
- nskb->h.raw = nskb->nh.raw + (skb->h.raw - skb->nh.raw);
- memcpy(skb_put(nskb, doffset), skb->data, doffset);
+ skb_headers_offset_update(nskb, skb_headroom(nskb) - headroom);
+ skb_reset_mac_len(nskb);
+
+ skb_copy_from_linear_data_offset(head_skb, -tnl_hlen,
+ nskb->data - tnl_hlen,
+ doffset + tnl_hlen);
+
+ if (nskb->len == len + doffset)
+ goto perform_csum_check;
if (!sg) {
- nskb->csum = skb_copy_and_csum_bits(skb, offset,
- skb_put(nskb, len),
- len, 0);
+ if (!csum) {
+ if (!nskb->remcsum_offload)
+ nskb->ip_summed = CHECKSUM_NONE;
+ SKB_GSO_CB(nskb)->csum =
+ skb_copy_and_csum_bits(head_skb, offset,
+ skb_put(nskb,
+ len),
+ len);
+ SKB_GSO_CB(nskb)->csum_start =
+ skb_headroom(nskb) + doffset;
+ } else {
+ if (skb_copy_bits(head_skb, offset, skb_put(nskb, len), len))
+ goto err;
+ }
continue;
}
- frag = skb_shinfo(nskb)->frags;
- k = 0;
+ nskb_frag = skb_shinfo(nskb)->frags;
+
+ skb_copy_from_linear_data_offset(head_skb, offset,
+ skb_put(nskb, hsize), hsize);
- nskb->ip_summed = CHECKSUM_PARTIAL;
- nskb->csum = skb->csum;
- memcpy(skb_put(nskb, hsize), skb->data + offset, hsize);
+ skb_shinfo(nskb)->flags |= skb_shinfo(head_skb)->flags &
+ SKBFL_SHARED_FRAG;
+
+ if (skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC))
+ goto err;
while (pos < offset + len) {
- BUG_ON(i >= nfrags);
+ if (i >= nfrags) {
+ if (skb_orphan_frags(list_skb, GFP_ATOMIC) ||
+ skb_zerocopy_clone(nskb, list_skb,
+ GFP_ATOMIC))
+ goto err;
+
+ i = 0;
+ nfrags = skb_shinfo(list_skb)->nr_frags;
+ frag = skb_shinfo(list_skb)->frags;
+ frag_skb = list_skb;
+ if (!skb_headlen(list_skb)) {
+ BUG_ON(!nfrags);
+ } else {
+ BUG_ON(!list_skb->head_frag);
- *frag = skb_shinfo(skb)->frags[i];
- get_page(frag->page);
- size = frag->size;
+ /* to make room for head_frag. */
+ i--;
+ frag--;
+ }
+
+ list_skb = list_skb->next;
+ }
+
+ if (unlikely(skb_shinfo(nskb)->nr_frags >=
+ MAX_SKB_FRAGS)) {
+ net_warn_ratelimited(
+ "skb_segment: too many frags: %u %u\n",
+ pos, mss);
+ err = -EINVAL;
+ goto err;
+ }
+
+ *nskb_frag = (i < 0) ? skb_head_frag_to_page_desc(frag_skb) : *frag;
+ __skb_frag_ref(nskb_frag);
+ size = skb_frag_size(nskb_frag);
if (pos < offset) {
- frag->page_offset += offset - pos;
- frag->size -= offset - pos;
+ skb_frag_off_add(nskb_frag, offset - pos);
+ skb_frag_size_sub(nskb_frag, offset - pos);
}
- k++;
+ skb_shinfo(nskb)->nr_frags++;
if (pos + size <= offset + len) {
i++;
+ frag++;
pos += size;
} else {
- frag->size -= pos + size - (offset + len);
- break;
+ skb_frag_size_sub(nskb_frag, pos + size - (offset + len));
+ goto skip_fraglist;
}
- frag++;
+ nskb_frag++;
}
- skb_shinfo(nskb)->nr_frags = k;
+skip_fraglist:
nskb->data_len = len - hsize;
nskb->len += nskb->data_len;
nskb->truesize += nskb->data_len;
- } while ((offset += len) < skb->len);
+perform_csum_check:
+ if (!csum) {
+ if (skb_has_shared_frag(nskb) &&
+ __skb_linearize(nskb))
+ goto err;
+
+ if (!nskb->remcsum_offload)
+ nskb->ip_summed = CHECKSUM_NONE;
+ SKB_GSO_CB(nskb)->csum =
+ skb_checksum(nskb, doffset,
+ nskb->len - doffset, 0);
+ SKB_GSO_CB(nskb)->csum_start =
+ skb_headroom(nskb) + doffset;
+ }
+ } while ((offset += len) < head_skb->len);
+
+ /* Some callers want to get the end of the list.
+ * Put it in segs->prev to avoid walking the list.
+ * (see validate_xmit_skb_list() for example)
+ */
+ segs->prev = tail;
+
+ if (partial_segs) {
+ struct sk_buff *iter;
+ int type = skb_shinfo(head_skb)->gso_type;
+ unsigned short gso_size = skb_shinfo(head_skb)->gso_size;
+
+ /* Update type to add partial and then remove dodgy if set */
+ type |= (features & NETIF_F_GSO_PARTIAL) / NETIF_F_GSO_PARTIAL * SKB_GSO_PARTIAL;
+ type &= ~SKB_GSO_DODGY;
+
+ /* Update GSO info and prepare to start updating headers on
+ * our way back down the stack of protocols.
+ */
+ for (iter = segs; iter; iter = iter->next) {
+ skb_shinfo(iter)->gso_size = gso_size;
+ skb_shinfo(iter)->gso_segs = partial_segs;
+ skb_shinfo(iter)->gso_type = type;
+ SKB_GSO_CB(iter)->data_offset = skb_headroom(iter) + doffset;
+ }
+
+ if (tail->len - doffset <= gso_size)
+ skb_shinfo(tail)->gso_size = 0;
+ else if (tail != segs)
+ skb_shinfo(tail)->gso_segs = DIV_ROUND_UP(tail->len - doffset, gso_size);
+ }
+
+ /* Following permits correct backpressure, for protocols
+ * using skb_set_owner_w().
+ * Idea is to tranfert ownership from head_skb to last segment.
+ */
+ if (head_skb->destructor == sock_wfree) {
+ swap(tail->truesize, head_skb->truesize);
+ swap(tail->destructor, head_skb->destructor);
+ swap(tail->sk, head_skb->sk);
+ }
return segs;
err:
- while ((skb = segs)) {
- segs = skb->next;
- kfree(skb);
- }
+ kfree_skb_list(segs);
return ERR_PTR(err);
}
-
EXPORT_SYMBOL_GPL(skb_segment);
+#ifdef CONFIG_SKB_EXTENSIONS
+#define SKB_EXT_ALIGN_VALUE 8
+#define SKB_EXT_CHUNKSIZEOF(x) (ALIGN((sizeof(x)), SKB_EXT_ALIGN_VALUE) / SKB_EXT_ALIGN_VALUE)
+
+static const u8 skb_ext_type_len[] = {
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+ [SKB_EXT_BRIDGE_NF] = SKB_EXT_CHUNKSIZEOF(struct nf_bridge_info),
+#endif
+#ifdef CONFIG_XFRM
+ [SKB_EXT_SEC_PATH] = SKB_EXT_CHUNKSIZEOF(struct sec_path),
+#endif
+#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
+ [TC_SKB_EXT] = SKB_EXT_CHUNKSIZEOF(struct tc_skb_ext),
+#endif
+#if IS_ENABLED(CONFIG_MPTCP)
+ [SKB_EXT_MPTCP] = SKB_EXT_CHUNKSIZEOF(struct mptcp_ext),
+#endif
+#if IS_ENABLED(CONFIG_MCTP_FLOWS)
+ [SKB_EXT_MCTP] = SKB_EXT_CHUNKSIZEOF(struct mctp_flow),
+#endif
+#if IS_ENABLED(CONFIG_INET_PSP)
+ [SKB_EXT_PSP] = SKB_EXT_CHUNKSIZEOF(struct psp_skb_ext),
+#endif
+};
+
+static __always_inline unsigned int skb_ext_total_length(void)
+{
+ unsigned int l = SKB_EXT_CHUNKSIZEOF(struct skb_ext);
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(skb_ext_type_len); i++)
+ l += skb_ext_type_len[i];
+
+ return l;
+}
+
+static void skb_extensions_init(void)
+{
+ BUILD_BUG_ON(SKB_EXT_NUM >= 8);
+#if !IS_ENABLED(CONFIG_KCOV_INSTRUMENT_ALL)
+ BUILD_BUG_ON(skb_ext_total_length() > 255);
+#endif
+
+ skbuff_ext_cache = kmem_cache_create("skbuff_ext_cache",
+ SKB_EXT_ALIGN_VALUE * skb_ext_total_length(),
+ 0,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC,
+ NULL);
+}
+#else
+static void skb_extensions_init(void) {}
+#endif
+
+/* The SKB kmem_cache slab is critical for network performance. Never
+ * merge/alias the slab with similar sized objects. This avoids fragmentation
+ * that hurts performance of kmem_cache_{alloc,free}_bulk APIs.
+ */
+#ifndef CONFIG_SLUB_TINY
+#define FLAG_SKB_NO_MERGE SLAB_NO_MERGE
+#else /* CONFIG_SLUB_TINY - simple loop in kmem_cache_alloc_bulk */
+#define FLAG_SKB_NO_MERGE 0
+#endif
+
void __init skb_init(void)
{
- skbuff_head_cache = kmem_cache_create("skbuff_head_cache",
+ net_hotdata.skbuff_cache = kmem_cache_create_usercopy("skbuff_head_cache",
sizeof(struct sk_buff),
0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC,
- NULL, NULL);
- skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache",
- (2*sizeof(struct sk_buff)) +
- sizeof(atomic_t),
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|
+ FLAG_SKB_NO_MERGE,
+ offsetof(struct sk_buff, cb),
+ sizeof_field(struct sk_buff, cb),
+ NULL);
+ skbuff_cache_size = kmem_cache_size(net_hotdata.skbuff_cache);
+
+ net_hotdata.skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache",
+ sizeof(struct sk_buff_fclones),
0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC,
- NULL, NULL);
+ NULL);
+ /* usercopy should only access first SKB_SMALL_HEAD_HEADROOM bytes.
+ * struct skb_shared_info is located at the end of skb->head,
+ * and should not be copied to/from user.
+ */
+ net_hotdata.skb_small_head_cache = kmem_cache_create_usercopy("skbuff_small_head",
+ SKB_SMALL_HEAD_CACHE_SIZE,
+ 0,
+ SLAB_HWCACHE_ALIGN | SLAB_PANIC,
+ 0,
+ SKB_SMALL_HEAD_HEADROOM,
+ NULL);
+ skb_extensions_init();
}
-EXPORT_SYMBOL(___pskb_trim);
-EXPORT_SYMBOL(__kfree_skb);
-EXPORT_SYMBOL(kfree_skb);
-EXPORT_SYMBOL(__pskb_pull_tail);
-EXPORT_SYMBOL(__alloc_skb);
-EXPORT_SYMBOL(__netdev_alloc_skb);
-EXPORT_SYMBOL(pskb_copy);
-EXPORT_SYMBOL(pskb_expand_head);
-EXPORT_SYMBOL(skb_checksum);
-EXPORT_SYMBOL(skb_clone);
-EXPORT_SYMBOL(skb_clone_fraglist);
-EXPORT_SYMBOL(skb_copy);
-EXPORT_SYMBOL(skb_copy_and_csum_bits);
-EXPORT_SYMBOL(skb_copy_and_csum_dev);
-EXPORT_SYMBOL(skb_copy_bits);
-EXPORT_SYMBOL(skb_copy_expand);
-EXPORT_SYMBOL(skb_over_panic);
-EXPORT_SYMBOL(skb_pad);
-EXPORT_SYMBOL(skb_realloc_headroom);
-EXPORT_SYMBOL(skb_under_panic);
-EXPORT_SYMBOL(skb_dequeue);
-EXPORT_SYMBOL(skb_dequeue_tail);
-EXPORT_SYMBOL(skb_insert);
-EXPORT_SYMBOL(skb_queue_purge);
-EXPORT_SYMBOL(skb_queue_head);
-EXPORT_SYMBOL(skb_queue_tail);
-EXPORT_SYMBOL(skb_unlink);
-EXPORT_SYMBOL(skb_append);
-EXPORT_SYMBOL(skb_split);
-EXPORT_SYMBOL(skb_prepare_seq_read);
-EXPORT_SYMBOL(skb_seq_read);
-EXPORT_SYMBOL(skb_abort_seq_read);
-EXPORT_SYMBOL(skb_find_text);
-EXPORT_SYMBOL(skb_append_datato_frags);
+static int
+__skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len,
+ unsigned int recursion_level)
+{
+ int start = skb_headlen(skb);
+ int i, copy = start - offset;
+ struct sk_buff *frag_iter;
+ int elt = 0;
+
+ if (unlikely(recursion_level >= 24))
+ return -EMSGSIZE;
+
+ if (copy > 0) {
+ if (copy > len)
+ copy = len;
+ sg_set_buf(sg, skb->data + offset, copy);
+ elt++;
+ if ((len -= copy) == 0)
+ return elt;
+ offset += copy;
+ }
+
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ int end;
+
+ WARN_ON(start > offset + len);
+
+ end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]);
+ if ((copy = end - offset) > 0) {
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+ if (unlikely(elt && sg_is_last(&sg[elt - 1])))
+ return -EMSGSIZE;
+
+ if (copy > len)
+ copy = len;
+ sg_set_page(&sg[elt], skb_frag_page(frag), copy,
+ skb_frag_off(frag) + offset - start);
+ elt++;
+ if (!(len -= copy))
+ return elt;
+ offset += copy;
+ }
+ start = end;
+ }
+
+ skb_walk_frags(skb, frag_iter) {
+ int end, ret;
+
+ WARN_ON(start > offset + len);
+
+ end = start + frag_iter->len;
+ if ((copy = end - offset) > 0) {
+ if (unlikely(elt && sg_is_last(&sg[elt - 1])))
+ return -EMSGSIZE;
+
+ if (copy > len)
+ copy = len;
+ ret = __skb_to_sgvec(frag_iter, sg+elt, offset - start,
+ copy, recursion_level + 1);
+ if (unlikely(ret < 0))
+ return ret;
+ elt += ret;
+ if ((len -= copy) == 0)
+ return elt;
+ offset += copy;
+ }
+ start = end;
+ }
+ BUG_ON(len);
+ return elt;
+}
+
+/**
+ * skb_to_sgvec - Fill a scatter-gather list from a socket buffer
+ * @skb: Socket buffer containing the buffers to be mapped
+ * @sg: The scatter-gather list to map into
+ * @offset: The offset into the buffer's contents to start mapping
+ * @len: Length of buffer space to be mapped
+ *
+ * Fill the specified scatter-gather list with mappings/pointers into a
+ * region of the buffer space attached to a socket buffer. Returns either
+ * the number of scatterlist items used, or -EMSGSIZE if the contents
+ * could not fit.
+ */
+int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
+{
+ int nsg = __skb_to_sgvec(skb, sg, offset, len, 0);
+
+ if (nsg <= 0)
+ return nsg;
+
+ sg_mark_end(&sg[nsg - 1]);
+
+ return nsg;
+}
+EXPORT_SYMBOL_GPL(skb_to_sgvec);
+
+/* As compared with skb_to_sgvec, skb_to_sgvec_nomark only map skb to given
+ * sglist without mark the sg which contain last skb data as the end.
+ * So the caller can mannipulate sg list as will when padding new data after
+ * the first call without calling sg_unmark_end to expend sg list.
+ *
+ * Scenario to use skb_to_sgvec_nomark:
+ * 1. sg_init_table
+ * 2. skb_to_sgvec_nomark(payload1)
+ * 3. skb_to_sgvec_nomark(payload2)
+ *
+ * This is equivalent to:
+ * 1. sg_init_table
+ * 2. skb_to_sgvec(payload1)
+ * 3. sg_unmark_end
+ * 4. skb_to_sgvec(payload2)
+ *
+ * When mapping multiple payload conditionally, skb_to_sgvec_nomark
+ * is more preferable.
+ */
+int skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg,
+ int offset, int len)
+{
+ return __skb_to_sgvec(skb, sg, offset, len, 0);
+}
+EXPORT_SYMBOL_GPL(skb_to_sgvec_nomark);
+
+
+
+/**
+ * skb_cow_data - Check that a socket buffer's data buffers are writable
+ * @skb: The socket buffer to check.
+ * @tailbits: Amount of trailing space to be added
+ * @trailer: Returned pointer to the skb where the @tailbits space begins
+ *
+ * Make sure that the data buffers attached to a socket buffer are
+ * writable. If they are not, private copies are made of the data buffers
+ * and the socket buffer is set to use these instead.
+ *
+ * If @tailbits is given, make sure that there is space to write @tailbits
+ * bytes of data beyond current end of socket buffer. @trailer will be
+ * set to point to the skb in which this space begins.
+ *
+ * The number of scatterlist elements required to completely map the
+ * COW'd and extended socket buffer will be returned.
+ */
+int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer)
+{
+ int copyflag;
+ int elt;
+ struct sk_buff *skb1, **skb_p;
+
+ /* If skb is cloned or its head is paged, reallocate
+ * head pulling out all the pages (pages are considered not writable
+ * at the moment even if they are anonymous).
+ */
+ if ((skb_cloned(skb) || skb_shinfo(skb)->nr_frags) &&
+ !__pskb_pull_tail(skb, __skb_pagelen(skb)))
+ return -ENOMEM;
+
+ /* Easy case. Most of packets will go this way. */
+ if (!skb_has_frag_list(skb)) {
+ /* A little of trouble, not enough of space for trailer.
+ * This should not happen, when stack is tuned to generate
+ * good frames. OK, on miss we reallocate and reserve even more
+ * space, 128 bytes is fair. */
+
+ if (skb_tailroom(skb) < tailbits &&
+ pskb_expand_head(skb, 0, tailbits-skb_tailroom(skb)+128, GFP_ATOMIC))
+ return -ENOMEM;
+
+ /* Voila! */
+ *trailer = skb;
+ return 1;
+ }
+
+ /* Misery. We are in troubles, going to mincer fragments... */
+
+ elt = 1;
+ skb_p = &skb_shinfo(skb)->frag_list;
+ copyflag = 0;
+
+ while ((skb1 = *skb_p) != NULL) {
+ int ntail = 0;
+
+ /* The fragment is partially pulled by someone,
+ * this can happen on input. Copy it and everything
+ * after it. */
+
+ if (skb_shared(skb1))
+ copyflag = 1;
+
+ /* If the skb is the last, worry about trailer. */
+
+ if (skb1->next == NULL && tailbits) {
+ if (skb_shinfo(skb1)->nr_frags ||
+ skb_has_frag_list(skb1) ||
+ skb_tailroom(skb1) < tailbits)
+ ntail = tailbits + 128;
+ }
+
+ if (copyflag ||
+ skb_cloned(skb1) ||
+ ntail ||
+ skb_shinfo(skb1)->nr_frags ||
+ skb_has_frag_list(skb1)) {
+ struct sk_buff *skb2;
+
+ /* Fuck, we are miserable poor guys... */
+ if (ntail == 0)
+ skb2 = skb_copy(skb1, GFP_ATOMIC);
+ else
+ skb2 = skb_copy_expand(skb1,
+ skb_headroom(skb1),
+ ntail,
+ GFP_ATOMIC);
+ if (unlikely(skb2 == NULL))
+ return -ENOMEM;
+
+ if (skb1->sk)
+ skb_set_owner_w(skb2, skb1->sk);
+
+ /* Looking around. Are we still alive?
+ * OK, link new skb, drop old one */
+
+ skb2->next = skb1->next;
+ *skb_p = skb2;
+ kfree_skb(skb1);
+ skb1 = skb2;
+ }
+ elt++;
+ *trailer = skb1;
+ skb_p = &skb1->next;
+ }
+
+ return elt;
+}
+EXPORT_SYMBOL_GPL(skb_cow_data);
+
+static void sock_rmem_free(struct sk_buff *skb)
+{
+ struct sock *sk = skb->sk;
+
+ atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
+}
+
+static void skb_set_err_queue(struct sk_buff *skb)
+{
+ /* pkt_type of skbs received on local sockets is never PACKET_OUTGOING.
+ * So, it is safe to (mis)use it to mark skbs on the error queue.
+ */
+ skb->pkt_type = PACKET_OUTGOING;
+ BUILD_BUG_ON(PACKET_OUTGOING == 0);
+}
+
+/*
+ * Note: We dont mem charge error packets (no sk_forward_alloc changes)
+ */
+int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
+{
+ if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
+ (unsigned int)READ_ONCE(sk->sk_rcvbuf))
+ return -ENOMEM;
+
+ skb_orphan(skb);
+ skb->sk = sk;
+ skb->destructor = sock_rmem_free;
+ atomic_add(skb->truesize, &sk->sk_rmem_alloc);
+ skb_set_err_queue(skb);
+
+ /* before exiting rcu section, make sure dst is refcounted */
+ skb_dst_force(skb);
+
+ skb_queue_tail(&sk->sk_error_queue, skb);
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk_error_report(sk);
+ return 0;
+}
+EXPORT_SYMBOL(sock_queue_err_skb);
+
+static bool is_icmp_err_skb(const struct sk_buff *skb)
+{
+ return skb && (SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP ||
+ SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP6);
+}
+
+struct sk_buff *sock_dequeue_err_skb(struct sock *sk)
+{
+ struct sk_buff_head *q = &sk->sk_error_queue;
+ struct sk_buff *skb, *skb_next = NULL;
+ bool icmp_next = false;
+ unsigned long flags;
+
+ if (skb_queue_empty_lockless(q))
+ return NULL;
+
+ spin_lock_irqsave(&q->lock, flags);
+ skb = __skb_dequeue(q);
+ if (skb && (skb_next = skb_peek(q))) {
+ icmp_next = is_icmp_err_skb(skb_next);
+ if (icmp_next)
+ sk->sk_err = SKB_EXT_ERR(skb_next)->ee.ee_errno;
+ }
+ spin_unlock_irqrestore(&q->lock, flags);
+
+ if (is_icmp_err_skb(skb) && !icmp_next)
+ sk->sk_err = 0;
+
+ if (skb_next)
+ sk_error_report(sk);
+
+ return skb;
+}
+EXPORT_SYMBOL(sock_dequeue_err_skb);
+
+/**
+ * skb_clone_sk - create clone of skb, and take reference to socket
+ * @skb: the skb to clone
+ *
+ * This function creates a clone of a buffer that holds a reference on
+ * sk_refcnt. Buffers created via this function are meant to be
+ * returned using sock_queue_err_skb, or free via kfree_skb.
+ *
+ * When passing buffers allocated with this function to sock_queue_err_skb
+ * it is necessary to wrap the call with sock_hold/sock_put in order to
+ * prevent the socket from being released prior to being enqueued on
+ * the sk_error_queue.
+ */
+struct sk_buff *skb_clone_sk(struct sk_buff *skb)
+{
+ struct sock *sk = skb->sk;
+ struct sk_buff *clone;
+
+ if (!sk || !refcount_inc_not_zero(&sk->sk_refcnt))
+ return NULL;
+
+ clone = skb_clone(skb, GFP_ATOMIC);
+ if (!clone) {
+ sock_put(sk);
+ return NULL;
+ }
+
+ clone->sk = sk;
+ clone->destructor = sock_efree;
+
+ return clone;
+}
+EXPORT_SYMBOL(skb_clone_sk);
+
+static void __skb_complete_tx_timestamp(struct sk_buff *skb,
+ struct sock *sk,
+ int tstype,
+ bool opt_stats)
+{
+ struct sock_exterr_skb *serr;
+ int err;
+
+ BUILD_BUG_ON(sizeof(struct sock_exterr_skb) > sizeof(skb->cb));
+
+ serr = SKB_EXT_ERR(skb);
+ memset(serr, 0, sizeof(*serr));
+ serr->ee.ee_errno = ENOMSG;
+ serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING;
+ serr->ee.ee_info = tstype;
+ serr->opt_stats = opt_stats;
+ serr->header.h4.iif = skb->dev ? skb->dev->ifindex : 0;
+ if (READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) {
+ serr->ee.ee_data = skb_shinfo(skb)->tskey;
+ if (sk_is_tcp(sk))
+ serr->ee.ee_data -= atomic_read(&sk->sk_tskey);
+ }
+
+ err = sock_queue_err_skb(sk, skb);
+
+ if (err)
+ kfree_skb(skb);
+}
+
+static bool skb_may_tx_timestamp(struct sock *sk, bool tsonly)
+{
+ bool ret;
+
+ if (likely(tsonly || READ_ONCE(sock_net(sk)->core.sysctl_tstamp_allow_data)))
+ return true;
+
+ read_lock_bh(&sk->sk_callback_lock);
+ ret = sk->sk_socket && sk->sk_socket->file &&
+ file_ns_capable(sk->sk_socket->file, &init_user_ns, CAP_NET_RAW);
+ read_unlock_bh(&sk->sk_callback_lock);
+ return ret;
+}
+
+void skb_complete_tx_timestamp(struct sk_buff *skb,
+ struct skb_shared_hwtstamps *hwtstamps)
+{
+ struct sock *sk = skb->sk;
+
+ if (!skb_may_tx_timestamp(sk, false))
+ goto err;
+
+ /* Take a reference to prevent skb_orphan() from freeing the socket,
+ * but only if the socket refcount is not zero.
+ */
+ if (likely(refcount_inc_not_zero(&sk->sk_refcnt))) {
+ *skb_hwtstamps(skb) = *hwtstamps;
+ __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND, false);
+ sock_put(sk);
+ return;
+ }
+
+err:
+ kfree_skb(skb);
+}
+EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
+
+static bool skb_tstamp_tx_report_so_timestamping(struct sk_buff *skb,
+ struct skb_shared_hwtstamps *hwtstamps,
+ int tstype)
+{
+ switch (tstype) {
+ case SCM_TSTAMP_SCHED:
+ return skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP;
+ case SCM_TSTAMP_SND:
+ return skb_shinfo(skb)->tx_flags & (hwtstamps ? SKBTX_HW_TSTAMP_NOBPF :
+ SKBTX_SW_TSTAMP);
+ case SCM_TSTAMP_ACK:
+ return TCP_SKB_CB(skb)->txstamp_ack & TSTAMP_ACK_SK;
+ case SCM_TSTAMP_COMPLETION:
+ return skb_shinfo(skb)->tx_flags & SKBTX_COMPLETION_TSTAMP;
+ }
+
+ return false;
+}
+
+static void skb_tstamp_tx_report_bpf_timestamping(struct sk_buff *skb,
+ struct skb_shared_hwtstamps *hwtstamps,
+ struct sock *sk,
+ int tstype)
+{
+ int op;
+
+ switch (tstype) {
+ case SCM_TSTAMP_SCHED:
+ op = BPF_SOCK_OPS_TSTAMP_SCHED_CB;
+ break;
+ case SCM_TSTAMP_SND:
+ if (hwtstamps) {
+ op = BPF_SOCK_OPS_TSTAMP_SND_HW_CB;
+ *skb_hwtstamps(skb) = *hwtstamps;
+ } else {
+ op = BPF_SOCK_OPS_TSTAMP_SND_SW_CB;
+ }
+ break;
+ case SCM_TSTAMP_ACK:
+ op = BPF_SOCK_OPS_TSTAMP_ACK_CB;
+ break;
+ default:
+ return;
+ }
+
+ bpf_skops_tx_timestamping(sk, skb, op);
+}
+
+void __skb_tstamp_tx(struct sk_buff *orig_skb,
+ const struct sk_buff *ack_skb,
+ struct skb_shared_hwtstamps *hwtstamps,
+ struct sock *sk, int tstype)
+{
+ struct sk_buff *skb;
+ bool tsonly, opt_stats = false;
+ u32 tsflags;
+
+ if (!sk)
+ return;
+
+ if (skb_shinfo(orig_skb)->tx_flags & SKBTX_BPF)
+ skb_tstamp_tx_report_bpf_timestamping(orig_skb, hwtstamps,
+ sk, tstype);
+
+ if (!skb_tstamp_tx_report_so_timestamping(orig_skb, hwtstamps, tstype))
+ return;
+
+ tsflags = READ_ONCE(sk->sk_tsflags);
+ if (!hwtstamps && !(tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) &&
+ skb_shinfo(orig_skb)->tx_flags & SKBTX_IN_PROGRESS)
+ return;
+
+ tsonly = tsflags & SOF_TIMESTAMPING_OPT_TSONLY;
+ if (!skb_may_tx_timestamp(sk, tsonly))
+ return;
+
+ if (tsonly) {
+#ifdef CONFIG_INET
+ if ((tsflags & SOF_TIMESTAMPING_OPT_STATS) &&
+ sk_is_tcp(sk)) {
+ skb = tcp_get_timestamping_opt_stats(sk, orig_skb,
+ ack_skb);
+ opt_stats = true;
+ } else
+#endif
+ skb = alloc_skb(0, GFP_ATOMIC);
+ } else {
+ skb = skb_clone(orig_skb, GFP_ATOMIC);
+
+ if (skb_orphan_frags_rx(skb, GFP_ATOMIC)) {
+ kfree_skb(skb);
+ return;
+ }
+ }
+ if (!skb)
+ return;
+
+ if (tsonly) {
+ skb_shinfo(skb)->tx_flags |= skb_shinfo(orig_skb)->tx_flags &
+ SKBTX_ANY_TSTAMP;
+ skb_shinfo(skb)->tskey = skb_shinfo(orig_skb)->tskey;
+ }
+
+ if (hwtstamps)
+ *skb_hwtstamps(skb) = *hwtstamps;
+ else
+ __net_timestamp(skb);
+
+ __skb_complete_tx_timestamp(skb, sk, tstype, opt_stats);
+}
+EXPORT_SYMBOL_GPL(__skb_tstamp_tx);
+
+void skb_tstamp_tx(struct sk_buff *orig_skb,
+ struct skb_shared_hwtstamps *hwtstamps)
+{
+ return __skb_tstamp_tx(orig_skb, NULL, hwtstamps, orig_skb->sk,
+ SCM_TSTAMP_SND);
+}
+EXPORT_SYMBOL_GPL(skb_tstamp_tx);
+
+#ifdef CONFIG_WIRELESS
+void skb_complete_wifi_ack(struct sk_buff *skb, bool acked)
+{
+ struct sock *sk = skb->sk;
+ struct sock_exterr_skb *serr;
+ int err = 1;
+
+ skb->wifi_acked_valid = 1;
+ skb->wifi_acked = acked;
+
+ serr = SKB_EXT_ERR(skb);
+ memset(serr, 0, sizeof(*serr));
+ serr->ee.ee_errno = ENOMSG;
+ serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS;
+
+ /* Take a reference to prevent skb_orphan() from freeing the socket,
+ * but only if the socket refcount is not zero.
+ */
+ if (likely(refcount_inc_not_zero(&sk->sk_refcnt))) {
+ err = sock_queue_err_skb(sk, skb);
+ sock_put(sk);
+ }
+ if (err)
+ kfree_skb(skb);
+}
+EXPORT_SYMBOL_GPL(skb_complete_wifi_ack);
+#endif /* CONFIG_WIRELESS */
+
+/**
+ * skb_partial_csum_set - set up and verify partial csum values for packet
+ * @skb: the skb to set
+ * @start: the number of bytes after skb->data to start checksumming.
+ * @off: the offset from start to place the checksum.
+ *
+ * For untrusted partially-checksummed packets, we need to make sure the values
+ * for skb->csum_start and skb->csum_offset are valid so we don't oops.
+ *
+ * This function checks and sets those values and skb->ip_summed: if this
+ * returns false you should drop the packet.
+ */
+bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off)
+{
+ u32 csum_end = (u32)start + (u32)off + sizeof(__sum16);
+ u32 csum_start = skb_headroom(skb) + (u32)start;
+
+ if (unlikely(csum_start >= U16_MAX || csum_end > skb_headlen(skb))) {
+ net_warn_ratelimited("bad partial csum: csum=%u/%u headroom=%u headlen=%u\n",
+ start, off, skb_headroom(skb), skb_headlen(skb));
+ return false;
+ }
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ skb->csum_start = csum_start;
+ skb->csum_offset = off;
+ skb->transport_header = csum_start;
+ return true;
+}
+EXPORT_SYMBOL_GPL(skb_partial_csum_set);
+
+static int skb_maybe_pull_tail(struct sk_buff *skb, unsigned int len,
+ unsigned int max)
+{
+ if (skb_headlen(skb) >= len)
+ return 0;
+
+ /* If we need to pullup then pullup to the max, so we
+ * won't need to do it again.
+ */
+ if (max > skb->len)
+ max = skb->len;
+
+ if (__pskb_pull_tail(skb, max - skb_headlen(skb)) == NULL)
+ return -ENOMEM;
+
+ if (skb_headlen(skb) < len)
+ return -EPROTO;
+
+ return 0;
+}
+
+#define MAX_TCP_HDR_LEN (15 * 4)
+
+static __sum16 *skb_checksum_setup_ip(struct sk_buff *skb,
+ typeof(IPPROTO_IP) proto,
+ unsigned int off)
+{
+ int err;
+
+ switch (proto) {
+ case IPPROTO_TCP:
+ err = skb_maybe_pull_tail(skb, off + sizeof(struct tcphdr),
+ off + MAX_TCP_HDR_LEN);
+ if (!err && !skb_partial_csum_set(skb, off,
+ offsetof(struct tcphdr,
+ check)))
+ err = -EPROTO;
+ return err ? ERR_PTR(err) : &tcp_hdr(skb)->check;
+
+ case IPPROTO_UDP:
+ err = skb_maybe_pull_tail(skb, off + sizeof(struct udphdr),
+ off + sizeof(struct udphdr));
+ if (!err && !skb_partial_csum_set(skb, off,
+ offsetof(struct udphdr,
+ check)))
+ err = -EPROTO;
+ return err ? ERR_PTR(err) : &udp_hdr(skb)->check;
+ }
+
+ return ERR_PTR(-EPROTO);
+}
+
+/* This value should be large enough to cover a tagged ethernet header plus
+ * maximally sized IP and TCP or UDP headers.
+ */
+#define MAX_IP_HDR_LEN 128
+
+static int skb_checksum_setup_ipv4(struct sk_buff *skb, bool recalculate)
+{
+ unsigned int off;
+ bool fragment;
+ __sum16 *csum;
+ int err;
+
+ fragment = false;
+
+ err = skb_maybe_pull_tail(skb,
+ sizeof(struct iphdr),
+ MAX_IP_HDR_LEN);
+ if (err < 0)
+ goto out;
+
+ if (ip_is_fragment(ip_hdr(skb)))
+ fragment = true;
+
+ off = ip_hdrlen(skb);
+
+ err = -EPROTO;
+
+ if (fragment)
+ goto out;
+
+ csum = skb_checksum_setup_ip(skb, ip_hdr(skb)->protocol, off);
+ if (IS_ERR(csum))
+ return PTR_ERR(csum);
+
+ if (recalculate)
+ *csum = ~csum_tcpudp_magic(ip_hdr(skb)->saddr,
+ ip_hdr(skb)->daddr,
+ skb->len - off,
+ ip_hdr(skb)->protocol, 0);
+ err = 0;
+
+out:
+ return err;
+}
+
+/* This value should be large enough to cover a tagged ethernet header plus
+ * an IPv6 header, all options, and a maximal TCP or UDP header.
+ */
+#define MAX_IPV6_HDR_LEN 256
+
+#define OPT_HDR(type, skb, off) \
+ (type *)(skb_network_header(skb) + (off))
+
+static int skb_checksum_setup_ipv6(struct sk_buff *skb, bool recalculate)
+{
+ int err;
+ u8 nexthdr;
+ unsigned int off;
+ unsigned int len;
+ bool fragment;
+ bool done;
+ __sum16 *csum;
+
+ fragment = false;
+ done = false;
+
+ off = sizeof(struct ipv6hdr);
+
+ err = skb_maybe_pull_tail(skb, off, MAX_IPV6_HDR_LEN);
+ if (err < 0)
+ goto out;
+
+ nexthdr = ipv6_hdr(skb)->nexthdr;
+
+ len = sizeof(struct ipv6hdr) + ntohs(ipv6_hdr(skb)->payload_len);
+ while (off <= len && !done) {
+ switch (nexthdr) {
+ case IPPROTO_DSTOPTS:
+ case IPPROTO_HOPOPTS:
+ case IPPROTO_ROUTING: {
+ struct ipv6_opt_hdr *hp;
+
+ err = skb_maybe_pull_tail(skb,
+ off +
+ sizeof(struct ipv6_opt_hdr),
+ MAX_IPV6_HDR_LEN);
+ if (err < 0)
+ goto out;
+
+ hp = OPT_HDR(struct ipv6_opt_hdr, skb, off);
+ nexthdr = hp->nexthdr;
+ off += ipv6_optlen(hp);
+ break;
+ }
+ case IPPROTO_AH: {
+ struct ip_auth_hdr *hp;
+
+ err = skb_maybe_pull_tail(skb,
+ off +
+ sizeof(struct ip_auth_hdr),
+ MAX_IPV6_HDR_LEN);
+ if (err < 0)
+ goto out;
+
+ hp = OPT_HDR(struct ip_auth_hdr, skb, off);
+ nexthdr = hp->nexthdr;
+ off += ipv6_authlen(hp);
+ break;
+ }
+ case IPPROTO_FRAGMENT: {
+ struct frag_hdr *hp;
+
+ err = skb_maybe_pull_tail(skb,
+ off +
+ sizeof(struct frag_hdr),
+ MAX_IPV6_HDR_LEN);
+ if (err < 0)
+ goto out;
+
+ hp = OPT_HDR(struct frag_hdr, skb, off);
+
+ if (hp->frag_off & htons(IP6_OFFSET | IP6_MF))
+ fragment = true;
+
+ nexthdr = hp->nexthdr;
+ off += sizeof(struct frag_hdr);
+ break;
+ }
+ default:
+ done = true;
+ break;
+ }
+ }
+
+ err = -EPROTO;
+
+ if (!done || fragment)
+ goto out;
+
+ csum = skb_checksum_setup_ip(skb, nexthdr, off);
+ if (IS_ERR(csum))
+ return PTR_ERR(csum);
+
+ if (recalculate)
+ *csum = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
+ &ipv6_hdr(skb)->daddr,
+ skb->len - off, nexthdr, 0);
+ err = 0;
+
+out:
+ return err;
+}
+
+/**
+ * skb_checksum_setup - set up partial checksum offset
+ * @skb: the skb to set up
+ * @recalculate: if true the pseudo-header checksum will be recalculated
+ */
+int skb_checksum_setup(struct sk_buff *skb, bool recalculate)
+{
+ int err;
+
+ switch (skb->protocol) {
+ case htons(ETH_P_IP):
+ err = skb_checksum_setup_ipv4(skb, recalculate);
+ break;
+
+ case htons(ETH_P_IPV6):
+ err = skb_checksum_setup_ipv6(skb, recalculate);
+ break;
+
+ default:
+ err = -EPROTO;
+ break;
+ }
+
+ return err;
+}
+EXPORT_SYMBOL(skb_checksum_setup);
+
+/**
+ * skb_checksum_maybe_trim - maybe trims the given skb
+ * @skb: the skb to check
+ * @transport_len: the data length beyond the network header
+ *
+ * Checks whether the given skb has data beyond the given transport length.
+ * If so, returns a cloned skb trimmed to this transport length.
+ * Otherwise returns the provided skb. Returns NULL in error cases
+ * (e.g. transport_len exceeds skb length or out-of-memory).
+ *
+ * Caller needs to set the skb transport header and free any returned skb if it
+ * differs from the provided skb.
+ */
+static struct sk_buff *skb_checksum_maybe_trim(struct sk_buff *skb,
+ unsigned int transport_len)
+{
+ struct sk_buff *skb_chk;
+ unsigned int len = skb_transport_offset(skb) + transport_len;
+ int ret;
+
+ if (skb->len < len)
+ return NULL;
+ else if (skb->len == len)
+ return skb;
+
+ skb_chk = skb_clone(skb, GFP_ATOMIC);
+ if (!skb_chk)
+ return NULL;
+
+ ret = pskb_trim_rcsum(skb_chk, len);
+ if (ret) {
+ kfree_skb(skb_chk);
+ return NULL;
+ }
+
+ return skb_chk;
+}
+
+/**
+ * skb_checksum_trimmed - validate checksum of an skb
+ * @skb: the skb to check
+ * @transport_len: the data length beyond the network header
+ * @skb_chkf: checksum function to use
+ *
+ * Applies the given checksum function skb_chkf to the provided skb.
+ * Returns a checked and maybe trimmed skb. Returns NULL on error.
+ *
+ * If the skb has data beyond the given transport length, then a
+ * trimmed & cloned skb is checked and returned.
+ *
+ * Caller needs to set the skb transport header and free any returned skb if it
+ * differs from the provided skb.
+ */
+struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb,
+ unsigned int transport_len,
+ __sum16(*skb_chkf)(struct sk_buff *skb))
+{
+ struct sk_buff *skb_chk;
+ unsigned int offset = skb_transport_offset(skb);
+ __sum16 ret;
+
+ skb_chk = skb_checksum_maybe_trim(skb, transport_len);
+ if (!skb_chk)
+ goto err;
+
+ if (!pskb_may_pull(skb_chk, offset))
+ goto err;
+
+ skb_pull_rcsum(skb_chk, offset);
+ ret = skb_chkf(skb_chk);
+ skb_push_rcsum(skb_chk, offset);
+
+ if (ret)
+ goto err;
+
+ return skb_chk;
+
+err:
+ if (skb_chk && skb_chk != skb)
+ kfree_skb(skb_chk);
+
+ return NULL;
+
+}
+EXPORT_SYMBOL(skb_checksum_trimmed);
+
+void __skb_warn_lro_forwarding(const struct sk_buff *skb)
+{
+ net_warn_ratelimited("%s: received packets cannot be forwarded while LRO is enabled\n",
+ skb->dev->name);
+}
+EXPORT_SYMBOL(__skb_warn_lro_forwarding);
+
+void kfree_skb_partial(struct sk_buff *skb, bool head_stolen)
+{
+ if (head_stolen) {
+ skb_release_head_state(skb);
+ kmem_cache_free(net_hotdata.skbuff_cache, skb);
+ } else {
+ __kfree_skb(skb);
+ }
+}
+EXPORT_SYMBOL(kfree_skb_partial);
+
+/**
+ * skb_try_coalesce - try to merge skb to prior one
+ * @to: prior buffer
+ * @from: buffer to add
+ * @fragstolen: pointer to boolean
+ * @delta_truesize: how much more was allocated than was requested
+ */
+bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
+ bool *fragstolen, int *delta_truesize)
+{
+ struct skb_shared_info *to_shinfo, *from_shinfo;
+ int i, delta, len = from->len;
+
+ *fragstolen = false;
+
+ if (skb_cloned(to))
+ return false;
+
+ /* In general, avoid mixing page_pool and non-page_pool allocated
+ * pages within the same SKB. In theory we could take full
+ * references if @from is cloned and !@to->pp_recycle but its
+ * tricky (due to potential race with the clone disappearing) and
+ * rare, so not worth dealing with.
+ */
+ if (to->pp_recycle != from->pp_recycle)
+ return false;
+
+ if (skb_frags_readable(from) != skb_frags_readable(to))
+ return false;
+
+ if (len <= skb_tailroom(to) && skb_frags_readable(from)) {
+ if (len)
+ BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len));
+ *delta_truesize = 0;
+ return true;
+ }
+
+ to_shinfo = skb_shinfo(to);
+ from_shinfo = skb_shinfo(from);
+ if (to_shinfo->frag_list || from_shinfo->frag_list)
+ return false;
+ if (skb_zcopy(to) || skb_zcopy(from))
+ return false;
+
+ if (skb_headlen(from) != 0) {
+ struct page *page;
+ unsigned int offset;
+
+ if (to_shinfo->nr_frags +
+ from_shinfo->nr_frags >= MAX_SKB_FRAGS)
+ return false;
+
+ if (skb_head_is_locked(from))
+ return false;
+
+ delta = from->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff));
+
+ page = virt_to_head_page(from->head);
+ offset = from->data - (unsigned char *)page_address(page);
+
+ skb_fill_page_desc(to, to_shinfo->nr_frags,
+ page, offset, skb_headlen(from));
+ *fragstolen = true;
+ } else {
+ if (to_shinfo->nr_frags +
+ from_shinfo->nr_frags > MAX_SKB_FRAGS)
+ return false;
+
+ delta = from->truesize - SKB_TRUESIZE(skb_end_offset(from));
+ }
+
+ WARN_ON_ONCE(delta < len);
+
+ memcpy(to_shinfo->frags + to_shinfo->nr_frags,
+ from_shinfo->frags,
+ from_shinfo->nr_frags * sizeof(skb_frag_t));
+ to_shinfo->nr_frags += from_shinfo->nr_frags;
+
+ if (!skb_cloned(from))
+ from_shinfo->nr_frags = 0;
+
+ /* if the skb is not cloned this does nothing
+ * since we set nr_frags to 0.
+ */
+ if (skb_pp_frag_ref(from)) {
+ for (i = 0; i < from_shinfo->nr_frags; i++)
+ __skb_frag_ref(&from_shinfo->frags[i]);
+ }
+
+ to->truesize += delta;
+ to->len += len;
+ to->data_len += len;
+
+ *delta_truesize = delta;
+ return true;
+}
+EXPORT_SYMBOL(skb_try_coalesce);
+
+/**
+ * skb_scrub_packet - scrub an skb
+ *
+ * @skb: buffer to clean
+ * @xnet: packet is crossing netns
+ *
+ * skb_scrub_packet can be used after encapsulating or decapsulating a packet
+ * into/from a tunnel. Some information have to be cleared during these
+ * operations.
+ * skb_scrub_packet can also be used to clean a skb before injecting it in
+ * another namespace (@xnet == true). We have to clear all information in the
+ * skb that could impact namespace isolation.
+ */
+void skb_scrub_packet(struct sk_buff *skb, bool xnet)
+{
+ skb->pkt_type = PACKET_HOST;
+ skb->skb_iif = 0;
+ skb->ignore_df = 0;
+ skb_dst_drop(skb);
+ skb_ext_reset(skb);
+ nf_reset_ct(skb);
+ nf_reset_trace(skb);
+
+#ifdef CONFIG_NET_SWITCHDEV
+ skb->offload_fwd_mark = 0;
+ skb->offload_l3_fwd_mark = 0;
+#endif
+ ipvs_reset(skb);
+
+ if (!xnet)
+ return;
+
+ skb->mark = 0;
+ skb_clear_tstamp(skb);
+}
+EXPORT_SYMBOL_GPL(skb_scrub_packet);
+
+static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb)
+{
+ int mac_len, meta_len;
+ void *meta;
+
+ if (skb_cow(skb, skb_headroom(skb)) < 0) {
+ kfree_skb(skb);
+ return NULL;
+ }
+
+ mac_len = skb->data - skb_mac_header(skb);
+ if (likely(mac_len > VLAN_HLEN + ETH_TLEN)) {
+ memmove(skb_mac_header(skb) + VLAN_HLEN, skb_mac_header(skb),
+ mac_len - VLAN_HLEN - ETH_TLEN);
+ }
+
+ meta_len = skb_metadata_len(skb);
+ if (meta_len) {
+ meta = skb_metadata_end(skb) - meta_len;
+ memmove(meta + VLAN_HLEN, meta, meta_len);
+ }
+
+ skb->mac_header += VLAN_HLEN;
+ return skb;
+}
+
+struct sk_buff *skb_vlan_untag(struct sk_buff *skb)
+{
+ struct vlan_hdr *vhdr;
+ u16 vlan_tci;
+
+ if (unlikely(skb_vlan_tag_present(skb))) {
+ /* vlan_tci is already set-up so leave this for another time */
+ return skb;
+ }
+
+ skb = skb_share_check(skb, GFP_ATOMIC);
+ if (unlikely(!skb))
+ goto err_free;
+ /* We may access the two bytes after vlan_hdr in vlan_set_encap_proto(). */
+ if (unlikely(!pskb_may_pull(skb, VLAN_HLEN + sizeof(unsigned short))))
+ goto err_free;
+
+ vhdr = (struct vlan_hdr *)skb->data;
+ vlan_tci = ntohs(vhdr->h_vlan_TCI);
+ __vlan_hwaccel_put_tag(skb, skb->protocol, vlan_tci);
+
+ skb_pull_rcsum(skb, VLAN_HLEN);
+ vlan_set_encap_proto(skb, vhdr);
+
+ skb = skb_reorder_vlan_header(skb);
+ if (unlikely(!skb))
+ goto err_free;
+
+ skb_reset_network_header(skb);
+ if (!skb_transport_header_was_set(skb))
+ skb_reset_transport_header(skb);
+ skb_reset_mac_len(skb);
+
+ return skb;
+
+err_free:
+ kfree_skb(skb);
+ return NULL;
+}
+EXPORT_SYMBOL(skb_vlan_untag);
+
+int skb_ensure_writable(struct sk_buff *skb, unsigned int write_len)
+{
+ if (!pskb_may_pull(skb, write_len))
+ return -ENOMEM;
+
+ if (!skb_cloned(skb) || skb_clone_writable(skb, write_len))
+ return 0;
+
+ return pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
+}
+EXPORT_SYMBOL(skb_ensure_writable);
+
+int skb_ensure_writable_head_tail(struct sk_buff *skb, struct net_device *dev)
+{
+ int needed_headroom = dev->needed_headroom;
+ int needed_tailroom = dev->needed_tailroom;
+
+ /* For tail taggers, we need to pad short frames ourselves, to ensure
+ * that the tail tag does not fail at its role of being at the end of
+ * the packet, once the conduit interface pads the frame. Account for
+ * that pad length here, and pad later.
+ */
+ if (unlikely(needed_tailroom && skb->len < ETH_ZLEN))
+ needed_tailroom += ETH_ZLEN - skb->len;
+ /* skb_headroom() returns unsigned int... */
+ needed_headroom = max_t(int, needed_headroom - skb_headroom(skb), 0);
+ needed_tailroom = max_t(int, needed_tailroom - skb_tailroom(skb), 0);
+
+ if (likely(!needed_headroom && !needed_tailroom && !skb_cloned(skb)))
+ /* No reallocation needed, yay! */
+ return 0;
+
+ return pskb_expand_head(skb, needed_headroom, needed_tailroom,
+ GFP_ATOMIC);
+}
+EXPORT_SYMBOL(skb_ensure_writable_head_tail);
+
+/* remove VLAN header from packet and update csum accordingly.
+ * expects a non skb_vlan_tag_present skb with a vlan tag payload
+ */
+int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci)
+{
+ int offset = skb->data - skb_mac_header(skb);
+ int err;
+
+ if (WARN_ONCE(offset,
+ "__skb_vlan_pop got skb with skb->data not at mac header (offset %d)\n",
+ offset)) {
+ return -EINVAL;
+ }
+
+ err = skb_ensure_writable(skb, VLAN_ETH_HLEN);
+ if (unlikely(err))
+ return err;
+
+ skb_postpull_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN);
+
+ vlan_remove_tag(skb, vlan_tci);
+
+ skb->mac_header += VLAN_HLEN;
+
+ if (skb_network_offset(skb) < ETH_HLEN)
+ skb_set_network_header(skb, ETH_HLEN);
+
+ skb_reset_mac_len(skb);
+
+ return err;
+}
+EXPORT_SYMBOL(__skb_vlan_pop);
+
+/* Pop a vlan tag either from hwaccel or from payload.
+ * Expects skb->data at mac header.
+ */
+int skb_vlan_pop(struct sk_buff *skb)
+{
+ u16 vlan_tci;
+ __be16 vlan_proto;
+ int err;
+
+ if (likely(skb_vlan_tag_present(skb))) {
+ __vlan_hwaccel_clear_tag(skb);
+ } else {
+ if (unlikely(!eth_type_vlan(skb->protocol)))
+ return 0;
+
+ err = __skb_vlan_pop(skb, &vlan_tci);
+ if (err)
+ return err;
+ }
+ /* move next vlan tag to hw accel tag */
+ if (likely(!eth_type_vlan(skb->protocol)))
+ return 0;
+
+ vlan_proto = skb->protocol;
+ err = __skb_vlan_pop(skb, &vlan_tci);
+ if (unlikely(err))
+ return err;
+
+ __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci);
+ return 0;
+}
+EXPORT_SYMBOL(skb_vlan_pop);
+
+/* Push a vlan tag either into hwaccel or into payload (if hwaccel tag present).
+ * Expects skb->data at mac header.
+ */
+int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci)
+{
+ if (skb_vlan_tag_present(skb)) {
+ int offset = skb->data - skb_mac_header(skb);
+ int err;
+
+ if (WARN_ONCE(offset,
+ "skb_vlan_push got skb with skb->data not at mac header (offset %d)\n",
+ offset)) {
+ return -EINVAL;
+ }
+
+ err = __vlan_insert_tag(skb, skb->vlan_proto,
+ skb_vlan_tag_get(skb));
+ if (err)
+ return err;
+
+ skb->protocol = skb->vlan_proto;
+ skb->network_header -= VLAN_HLEN;
+
+ skb_postpush_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN);
+ }
+ __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci);
+ return 0;
+}
+EXPORT_SYMBOL(skb_vlan_push);
+
+/**
+ * skb_eth_pop() - Drop the Ethernet header at the head of a packet
+ *
+ * @skb: Socket buffer to modify
+ *
+ * Drop the Ethernet header of @skb.
+ *
+ * Expects that skb->data points to the mac header and that no VLAN tags are
+ * present.
+ *
+ * Returns 0 on success, -errno otherwise.
+ */
+int skb_eth_pop(struct sk_buff *skb)
+{
+ if (!pskb_may_pull(skb, ETH_HLEN) || skb_vlan_tagged(skb) ||
+ skb_network_offset(skb) < ETH_HLEN)
+ return -EPROTO;
+
+ skb_pull_rcsum(skb, ETH_HLEN);
+ skb_reset_mac_header(skb);
+ skb_reset_mac_len(skb);
+
+ return 0;
+}
+EXPORT_SYMBOL(skb_eth_pop);
+
+/**
+ * skb_eth_push() - Add a new Ethernet header at the head of a packet
+ *
+ * @skb: Socket buffer to modify
+ * @dst: Destination MAC address of the new header
+ * @src: Source MAC address of the new header
+ *
+ * Prepend @skb with a new Ethernet header.
+ *
+ * Expects that skb->data points to the mac header, which must be empty.
+ *
+ * Returns 0 on success, -errno otherwise.
+ */
+int skb_eth_push(struct sk_buff *skb, const unsigned char *dst,
+ const unsigned char *src)
+{
+ struct ethhdr *eth;
+ int err;
+
+ if (skb_network_offset(skb) || skb_vlan_tag_present(skb))
+ return -EPROTO;
+
+ err = skb_cow_head(skb, sizeof(*eth));
+ if (err < 0)
+ return err;
+
+ skb_push(skb, sizeof(*eth));
+ skb_reset_mac_header(skb);
+ skb_reset_mac_len(skb);
+
+ eth = eth_hdr(skb);
+ ether_addr_copy(eth->h_dest, dst);
+ ether_addr_copy(eth->h_source, src);
+ eth->h_proto = skb->protocol;
+
+ skb_postpush_rcsum(skb, eth, sizeof(*eth));
+
+ return 0;
+}
+EXPORT_SYMBOL(skb_eth_push);
+
+/* Update the ethertype of hdr and the skb csum value if required. */
+static void skb_mod_eth_type(struct sk_buff *skb, struct ethhdr *hdr,
+ __be16 ethertype)
+{
+ if (skb->ip_summed == CHECKSUM_COMPLETE) {
+ __be16 diff[] = { ~hdr->h_proto, ethertype };
+
+ skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum);
+ }
+
+ hdr->h_proto = ethertype;
+}
+
+/**
+ * skb_mpls_push() - push a new MPLS header after mac_len bytes from start of
+ * the packet
+ *
+ * @skb: buffer
+ * @mpls_lse: MPLS label stack entry to push
+ * @mpls_proto: ethertype of the new MPLS header (expects 0x8847 or 0x8848)
+ * @mac_len: length of the MAC header
+ * @ethernet: flag to indicate if the resulting packet after skb_mpls_push is
+ * ethernet
+ *
+ * Expects skb->data at mac header.
+ *
+ * Returns 0 on success, -errno otherwise.
+ */
+int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto,
+ int mac_len, bool ethernet)
+{
+ struct mpls_shim_hdr *lse;
+ int err;
+
+ if (unlikely(!eth_p_mpls(mpls_proto)))
+ return -EINVAL;
+
+ /* Networking stack does not allow simultaneous Tunnel and MPLS GSO. */
+ if (skb->encapsulation)
+ return -EINVAL;
+
+ err = skb_cow_head(skb, MPLS_HLEN);
+ if (unlikely(err))
+ return err;
+
+ if (!skb->inner_protocol) {
+ skb_set_inner_network_header(skb, skb_network_offset(skb));
+ skb_set_inner_protocol(skb, skb->protocol);
+ }
+
+ skb_push(skb, MPLS_HLEN);
+ memmove(skb_mac_header(skb) - MPLS_HLEN, skb_mac_header(skb),
+ mac_len);
+ skb_reset_mac_header(skb);
+ skb_set_network_header(skb, mac_len);
+ skb_reset_mac_len(skb);
+
+ lse = mpls_hdr(skb);
+ lse->label_stack_entry = mpls_lse;
+ skb_postpush_rcsum(skb, lse, MPLS_HLEN);
+
+ if (ethernet && mac_len >= ETH_HLEN)
+ skb_mod_eth_type(skb, eth_hdr(skb), mpls_proto);
+ skb->protocol = mpls_proto;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(skb_mpls_push);
+
+/**
+ * skb_mpls_pop() - pop the outermost MPLS header
+ *
+ * @skb: buffer
+ * @next_proto: ethertype of header after popped MPLS header
+ * @mac_len: length of the MAC header
+ * @ethernet: flag to indicate if the packet is ethernet
+ *
+ * Expects skb->data at mac header.
+ *
+ * Returns 0 on success, -errno otherwise.
+ */
+int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto, int mac_len,
+ bool ethernet)
+{
+ int err;
+
+ if (unlikely(!eth_p_mpls(skb->protocol)))
+ return 0;
+
+ err = skb_ensure_writable(skb, mac_len + MPLS_HLEN);
+ if (unlikely(err))
+ return err;
+
+ skb_postpull_rcsum(skb, mpls_hdr(skb), MPLS_HLEN);
+ memmove(skb_mac_header(skb) + MPLS_HLEN, skb_mac_header(skb),
+ mac_len);
+
+ __skb_pull(skb, MPLS_HLEN);
+ skb_reset_mac_header(skb);
+ skb_set_network_header(skb, mac_len);
+
+ if (ethernet && mac_len >= ETH_HLEN) {
+ struct ethhdr *hdr;
+
+ /* use mpls_hdr() to get ethertype to account for VLANs. */
+ hdr = (struct ethhdr *)((void *)mpls_hdr(skb) - ETH_HLEN);
+ skb_mod_eth_type(skb, hdr, next_proto);
+ }
+ skb->protocol = next_proto;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(skb_mpls_pop);
+
+/**
+ * skb_mpls_update_lse() - modify outermost MPLS header and update csum
+ *
+ * @skb: buffer
+ * @mpls_lse: new MPLS label stack entry to update to
+ *
+ * Expects skb->data at mac header.
+ *
+ * Returns 0 on success, -errno otherwise.
+ */
+int skb_mpls_update_lse(struct sk_buff *skb, __be32 mpls_lse)
+{
+ int err;
+
+ if (unlikely(!eth_p_mpls(skb->protocol)))
+ return -EINVAL;
+
+ err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN);
+ if (unlikely(err))
+ return err;
+
+ if (skb->ip_summed == CHECKSUM_COMPLETE) {
+ __be32 diff[] = { ~mpls_hdr(skb)->label_stack_entry, mpls_lse };
+
+ skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum);
+ }
+
+ mpls_hdr(skb)->label_stack_entry = mpls_lse;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(skb_mpls_update_lse);
+
+/**
+ * skb_mpls_dec_ttl() - decrement the TTL of the outermost MPLS header
+ *
+ * @skb: buffer
+ *
+ * Expects skb->data at mac header.
+ *
+ * Returns 0 on success, -errno otherwise.
+ */
+int skb_mpls_dec_ttl(struct sk_buff *skb)
+{
+ u32 lse;
+ u8 ttl;
+
+ if (unlikely(!eth_p_mpls(skb->protocol)))
+ return -EINVAL;
+
+ if (!pskb_may_pull(skb, skb_network_offset(skb) + MPLS_HLEN))
+ return -ENOMEM;
+
+ lse = be32_to_cpu(mpls_hdr(skb)->label_stack_entry);
+ ttl = (lse & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT;
+ if (!--ttl)
+ return -EINVAL;
+
+ lse &= ~MPLS_LS_TTL_MASK;
+ lse |= ttl << MPLS_LS_TTL_SHIFT;
+
+ return skb_mpls_update_lse(skb, cpu_to_be32(lse));
+}
+EXPORT_SYMBOL_GPL(skb_mpls_dec_ttl);
+
+/**
+ * alloc_skb_with_frags - allocate skb with page frags
+ *
+ * @header_len: size of linear part
+ * @data_len: needed length in frags
+ * @order: max page order desired.
+ * @errcode: pointer to error code if any
+ * @gfp_mask: allocation mask
+ *
+ * This can be used to allocate a paged skb, given a maximal order for frags.
+ */
+struct sk_buff *alloc_skb_with_frags(unsigned long header_len,
+ unsigned long data_len,
+ int order,
+ int *errcode,
+ gfp_t gfp_mask)
+{
+ unsigned long chunk;
+ struct sk_buff *skb;
+ struct page *page;
+ int nr_frags = 0;
+
+ *errcode = -EMSGSIZE;
+ if (unlikely(data_len > MAX_SKB_FRAGS * (PAGE_SIZE << order)))
+ return NULL;
+
+ *errcode = -ENOBUFS;
+ skb = alloc_skb(header_len, gfp_mask);
+ if (!skb)
+ return NULL;
+
+ while (data_len) {
+ if (nr_frags == MAX_SKB_FRAGS)
+ goto failure;
+ while (order && PAGE_ALIGN(data_len) < (PAGE_SIZE << order))
+ order--;
+
+ if (order) {
+ page = alloc_pages((gfp_mask & ~__GFP_DIRECT_RECLAIM) |
+ __GFP_COMP |
+ __GFP_NOWARN,
+ order);
+ if (!page) {
+ order--;
+ continue;
+ }
+ } else {
+ page = alloc_page(gfp_mask);
+ if (!page)
+ goto failure;
+ }
+ chunk = min_t(unsigned long, data_len,
+ PAGE_SIZE << order);
+ skb_fill_page_desc(skb, nr_frags, page, 0, chunk);
+ nr_frags++;
+ skb->truesize += (PAGE_SIZE << order);
+ data_len -= chunk;
+ }
+ return skb;
+
+failure:
+ kfree_skb(skb);
+ return NULL;
+}
+EXPORT_SYMBOL(alloc_skb_with_frags);
+
+/* carve out the first off bytes from skb when off < headlen */
+static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off,
+ const int headlen, gfp_t gfp_mask)
+{
+ int i;
+ unsigned int size = skb_end_offset(skb);
+ int new_hlen = headlen - off;
+ u8 *data;
+
+ if (skb_pfmemalloc(skb))
+ gfp_mask |= __GFP_MEMALLOC;
+
+ data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL);
+ if (!data)
+ return -ENOMEM;
+ size = SKB_WITH_OVERHEAD(size);
+
+ /* Copy real data, and all frags */
+ skb_copy_from_linear_data_offset(skb, off, data, new_hlen);
+ skb->len -= off;
+
+ memcpy((struct skb_shared_info *)(data + size),
+ skb_shinfo(skb),
+ offsetof(struct skb_shared_info,
+ frags[skb_shinfo(skb)->nr_frags]));
+ if (skb_cloned(skb)) {
+ /* drop the old head gracefully */
+ if (skb_orphan_frags(skb, gfp_mask)) {
+ skb_kfree_head(data, size);
+ return -ENOMEM;
+ }
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
+ skb_frag_ref(skb, i);
+ if (skb_has_frag_list(skb))
+ skb_clone_fraglist(skb);
+ skb_release_data(skb, SKB_CONSUMED);
+ } else {
+ /* we can reuse existing recount- all we did was
+ * relocate values
+ */
+ skb_free_head(skb);
+ }
+
+ skb->head = data;
+ skb->data = data;
+ skb->head_frag = 0;
+ skb_set_end_offset(skb, size);
+ skb_set_tail_pointer(skb, skb_headlen(skb));
+ skb_headers_offset_update(skb, 0);
+ skb->cloned = 0;
+ skb->hdr_len = 0;
+ skb->nohdr = 0;
+ atomic_set(&skb_shinfo(skb)->dataref, 1);
+
+ return 0;
+}
+
+static int pskb_carve(struct sk_buff *skb, const u32 off, gfp_t gfp);
+
+/* carve out the first eat bytes from skb's frag_list. May recurse into
+ * pskb_carve()
+ */
+static int pskb_carve_frag_list(struct skb_shared_info *shinfo, int eat,
+ gfp_t gfp_mask)
+{
+ struct sk_buff *list = shinfo->frag_list;
+ struct sk_buff *clone = NULL;
+ struct sk_buff *insp = NULL;
+
+ do {
+ if (!list) {
+ pr_err("Not enough bytes to eat. Want %d\n", eat);
+ return -EFAULT;
+ }
+ if (list->len <= eat) {
+ /* Eaten as whole. */
+ eat -= list->len;
+ list = list->next;
+ insp = list;
+ } else {
+ /* Eaten partially. */
+ if (skb_shared(list)) {
+ clone = skb_clone(list, gfp_mask);
+ if (!clone)
+ return -ENOMEM;
+ insp = list->next;
+ list = clone;
+ } else {
+ /* This may be pulled without problems. */
+ insp = list;
+ }
+ if (pskb_carve(list, eat, gfp_mask) < 0) {
+ kfree_skb(clone);
+ return -ENOMEM;
+ }
+ break;
+ }
+ } while (eat);
+
+ /* Free pulled out fragments. */
+ while ((list = shinfo->frag_list) != insp) {
+ shinfo->frag_list = list->next;
+ consume_skb(list);
+ }
+ /* And insert new clone at head. */
+ if (clone) {
+ clone->next = list;
+ shinfo->frag_list = clone;
+ }
+ return 0;
+}
+
+/* carve off first len bytes from skb. Split line (off) is in the
+ * non-linear part of skb
+ */
+static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
+ int pos, gfp_t gfp_mask)
+{
+ int i, k = 0;
+ unsigned int size = skb_end_offset(skb);
+ u8 *data;
+ const int nfrags = skb_shinfo(skb)->nr_frags;
+ struct skb_shared_info *shinfo;
+
+ if (skb_pfmemalloc(skb))
+ gfp_mask |= __GFP_MEMALLOC;
+
+ data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL);
+ if (!data)
+ return -ENOMEM;
+ size = SKB_WITH_OVERHEAD(size);
+
+ memcpy((struct skb_shared_info *)(data + size),
+ skb_shinfo(skb), offsetof(struct skb_shared_info, frags[0]));
+ if (skb_orphan_frags(skb, gfp_mask)) {
+ skb_kfree_head(data, size);
+ return -ENOMEM;
+ }
+ shinfo = (struct skb_shared_info *)(data + size);
+ for (i = 0; i < nfrags; i++) {
+ int fsize = skb_frag_size(&skb_shinfo(skb)->frags[i]);
+
+ if (pos + fsize > off) {
+ shinfo->frags[k] = skb_shinfo(skb)->frags[i];
+
+ if (pos < off) {
+ /* Split frag.
+ * We have two variants in this case:
+ * 1. Move all the frag to the second
+ * part, if it is possible. F.e.
+ * this approach is mandatory for TUX,
+ * where splitting is expensive.
+ * 2. Split is accurately. We make this.
+ */
+ skb_frag_off_add(&shinfo->frags[0], off - pos);
+ skb_frag_size_sub(&shinfo->frags[0], off - pos);
+ }
+ skb_frag_ref(skb, i);
+ k++;
+ }
+ pos += fsize;
+ }
+ shinfo->nr_frags = k;
+ if (skb_has_frag_list(skb))
+ skb_clone_fraglist(skb);
+
+ /* split line is in frag list */
+ if (k == 0 && pskb_carve_frag_list(shinfo, off - pos, gfp_mask)) {
+ /* skb_frag_unref() is not needed here as shinfo->nr_frags = 0. */
+ if (skb_has_frag_list(skb))
+ kfree_skb_list(skb_shinfo(skb)->frag_list);
+ skb_kfree_head(data, size);
+ return -ENOMEM;
+ }
+ skb_release_data(skb, SKB_CONSUMED);
+
+ skb->head = data;
+ skb->head_frag = 0;
+ skb->data = data;
+ skb_set_end_offset(skb, size);
+ skb_reset_tail_pointer(skb);
+ skb_headers_offset_update(skb, 0);
+ skb->cloned = 0;
+ skb->hdr_len = 0;
+ skb->nohdr = 0;
+ skb->len -= off;
+ skb->data_len = skb->len;
+ atomic_set(&skb_shinfo(skb)->dataref, 1);
+ return 0;
+}
+
+/* remove len bytes from the beginning of the skb */
+static int pskb_carve(struct sk_buff *skb, const u32 len, gfp_t gfp)
+{
+ int headlen = skb_headlen(skb);
+
+ if (len < headlen)
+ return pskb_carve_inside_header(skb, len, headlen, gfp);
+ else
+ return pskb_carve_inside_nonlinear(skb, len, headlen, gfp);
+}
+
+/* Extract to_copy bytes starting at off from skb, and return this in
+ * a new skb
+ */
+struct sk_buff *pskb_extract(struct sk_buff *skb, int off,
+ int to_copy, gfp_t gfp)
+{
+ struct sk_buff *clone = skb_clone(skb, gfp);
+
+ if (!clone)
+ return NULL;
+
+ if (pskb_carve(clone, off, gfp) < 0 ||
+ pskb_trim(clone, to_copy)) {
+ kfree_skb(clone);
+ return NULL;
+ }
+ return clone;
+}
+EXPORT_SYMBOL(pskb_extract);
+
+/**
+ * skb_condense - try to get rid of fragments/frag_list if possible
+ * @skb: buffer
+ *
+ * Can be used to save memory before skb is added to a busy queue.
+ * If packet has bytes in frags and enough tail room in skb->head,
+ * pull all of them, so that we can free the frags right now and adjust
+ * truesize.
+ * Notes:
+ * We do not reallocate skb->head thus can not fail.
+ * Caller must re-evaluate skb->truesize if needed.
+ */
+void skb_condense(struct sk_buff *skb)
+{
+ if (skb->data_len) {
+ if (skb->data_len > skb->end - skb->tail ||
+ skb_cloned(skb) || !skb_frags_readable(skb))
+ return;
+
+ /* Nice, we can free page frag(s) right now */
+ __pskb_pull_tail(skb, skb->data_len);
+ }
+ /* At this point, skb->truesize might be over estimated,
+ * because skb had a fragment, and fragments do not tell
+ * their truesize.
+ * When we pulled its content into skb->head, fragment
+ * was freed, but __pskb_pull_tail() could not possibly
+ * adjust skb->truesize, not knowing the frag truesize.
+ */
+ skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
+}
+EXPORT_SYMBOL(skb_condense);
+
+#ifdef CONFIG_SKB_EXTENSIONS
+static void *skb_ext_get_ptr(struct skb_ext *ext, enum skb_ext_id id)
+{
+ return (void *)ext + (ext->offset[id] * SKB_EXT_ALIGN_VALUE);
+}
+
+/**
+ * __skb_ext_alloc - allocate a new skb extensions storage
+ *
+ * @flags: See kmalloc().
+ *
+ * Returns the newly allocated pointer. The pointer can later attached to a
+ * skb via __skb_ext_set().
+ * Note: caller must handle the skb_ext as an opaque data.
+ */
+struct skb_ext *__skb_ext_alloc(gfp_t flags)
+{
+ struct skb_ext *new = kmem_cache_alloc(skbuff_ext_cache, flags);
+
+ if (new) {
+ memset(new->offset, 0, sizeof(new->offset));
+ refcount_set(&new->refcnt, 1);
+ }
+
+ return new;
+}
+
+static struct skb_ext *skb_ext_maybe_cow(struct skb_ext *old,
+ unsigned int old_active)
+{
+ struct skb_ext *new;
+
+ if (refcount_read(&old->refcnt) == 1)
+ return old;
+
+ new = kmem_cache_alloc(skbuff_ext_cache, GFP_ATOMIC);
+ if (!new)
+ return NULL;
+
+ memcpy(new, old, old->chunks * SKB_EXT_ALIGN_VALUE);
+ refcount_set(&new->refcnt, 1);
+
+#ifdef CONFIG_XFRM
+ if (old_active & (1 << SKB_EXT_SEC_PATH)) {
+ struct sec_path *sp = skb_ext_get_ptr(old, SKB_EXT_SEC_PATH);
+ unsigned int i;
+
+ for (i = 0; i < sp->len; i++)
+ xfrm_state_hold(sp->xvec[i]);
+ }
+#endif
+#ifdef CONFIG_MCTP_FLOWS
+ if (old_active & (1 << SKB_EXT_MCTP)) {
+ struct mctp_flow *flow = skb_ext_get_ptr(old, SKB_EXT_MCTP);
+
+ if (flow->key)
+ refcount_inc(&flow->key->refs);
+ }
+#endif
+ __skb_ext_put(old);
+ return new;
+}
+
+/**
+ * __skb_ext_set - attach the specified extension storage to this skb
+ * @skb: buffer
+ * @id: extension id
+ * @ext: extension storage previously allocated via __skb_ext_alloc()
+ *
+ * Existing extensions, if any, are cleared.
+ *
+ * Returns the pointer to the extension.
+ */
+void *__skb_ext_set(struct sk_buff *skb, enum skb_ext_id id,
+ struct skb_ext *ext)
+{
+ unsigned int newlen, newoff = SKB_EXT_CHUNKSIZEOF(*ext);
+
+ skb_ext_put(skb);
+ newlen = newoff + skb_ext_type_len[id];
+ ext->chunks = newlen;
+ ext->offset[id] = newoff;
+ skb->extensions = ext;
+ skb->active_extensions = 1 << id;
+ return skb_ext_get_ptr(ext, id);
+}
+EXPORT_SYMBOL_NS_GPL(__skb_ext_set, "NETDEV_INTERNAL");
+
+/**
+ * skb_ext_add - allocate space for given extension, COW if needed
+ * @skb: buffer
+ * @id: extension to allocate space for
+ *
+ * Allocates enough space for the given extension.
+ * If the extension is already present, a pointer to that extension
+ * is returned.
+ *
+ * If the skb was cloned, COW applies and the returned memory can be
+ * modified without changing the extension space of clones buffers.
+ *
+ * Returns pointer to the extension or NULL on allocation failure.
+ */
+void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id)
+{
+ struct skb_ext *new, *old = NULL;
+ unsigned int newlen, newoff;
+
+ if (skb->active_extensions) {
+ old = skb->extensions;
+
+ new = skb_ext_maybe_cow(old, skb->active_extensions);
+ if (!new)
+ return NULL;
+
+ if (__skb_ext_exist(new, id))
+ goto set_active;
+
+ newoff = new->chunks;
+ } else {
+ newoff = SKB_EXT_CHUNKSIZEOF(*new);
+
+ new = __skb_ext_alloc(GFP_ATOMIC);
+ if (!new)
+ return NULL;
+ }
+
+ newlen = newoff + skb_ext_type_len[id];
+ new->chunks = newlen;
+ new->offset[id] = newoff;
+set_active:
+ skb->slow_gro = 1;
+ skb->extensions = new;
+ skb->active_extensions |= 1 << id;
+ return skb_ext_get_ptr(new, id);
+}
+EXPORT_SYMBOL(skb_ext_add);
+
+#ifdef CONFIG_XFRM
+static void skb_ext_put_sp(struct sec_path *sp)
+{
+ unsigned int i;
+
+ for (i = 0; i < sp->len; i++)
+ xfrm_state_put(sp->xvec[i]);
+}
+#endif
+
+#ifdef CONFIG_MCTP_FLOWS
+static void skb_ext_put_mctp(struct mctp_flow *flow)
+{
+ if (flow->key)
+ mctp_key_unref(flow->key);
+}
+#endif
+
+void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id)
+{
+ struct skb_ext *ext = skb->extensions;
+
+ skb->active_extensions &= ~(1 << id);
+ if (skb->active_extensions == 0) {
+ skb->extensions = NULL;
+ __skb_ext_put(ext);
+#ifdef CONFIG_XFRM
+ } else if (id == SKB_EXT_SEC_PATH &&
+ refcount_read(&ext->refcnt) == 1) {
+ struct sec_path *sp = skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH);
+
+ skb_ext_put_sp(sp);
+ sp->len = 0;
+#endif
+ }
+}
+EXPORT_SYMBOL(__skb_ext_del);
+
+void __skb_ext_put(struct skb_ext *ext)
+{
+ /* If this is last clone, nothing can increment
+ * it after check passes. Avoids one atomic op.
+ */
+ if (refcount_read(&ext->refcnt) == 1)
+ goto free_now;
+
+ if (!refcount_dec_and_test(&ext->refcnt))
+ return;
+free_now:
+#ifdef CONFIG_XFRM
+ if (__skb_ext_exist(ext, SKB_EXT_SEC_PATH))
+ skb_ext_put_sp(skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH));
+#endif
+#ifdef CONFIG_MCTP_FLOWS
+ if (__skb_ext_exist(ext, SKB_EXT_MCTP))
+ skb_ext_put_mctp(skb_ext_get_ptr(ext, SKB_EXT_MCTP));
+#endif
+
+ kmem_cache_free(skbuff_ext_cache, ext);
+}
+EXPORT_SYMBOL(__skb_ext_put);
+#endif /* CONFIG_SKB_EXTENSIONS */
+
+static void kfree_skb_napi_cache(struct sk_buff *skb)
+{
+ /* if SKB is a clone, don't handle this case */
+ if (skb->fclone != SKB_FCLONE_UNAVAILABLE) {
+ __kfree_skb(skb);
+ return;
+ }
+
+ local_bh_disable();
+ __napi_kfree_skb(skb, SKB_CONSUMED);
+ local_bh_enable();
+}
+
+/**
+ * skb_attempt_defer_free - queue skb for remote freeing
+ * @skb: buffer
+ *
+ * Put @skb in a per-cpu list, using the cpu which
+ * allocated the skb/pages to reduce false sharing
+ * and memory zone spinlock contention.
+ */
+void skb_attempt_defer_free(struct sk_buff *skb)
+{
+ struct skb_defer_node *sdn;
+ unsigned long defer_count;
+ int cpu = skb->alloc_cpu;
+ unsigned int defer_max;
+ bool kick;
+
+ if (cpu == raw_smp_processor_id() ||
+ WARN_ON_ONCE(cpu >= nr_cpu_ids) ||
+ !cpu_online(cpu)) {
+nodefer: kfree_skb_napi_cache(skb);
+ return;
+ }
+
+ DEBUG_NET_WARN_ON_ONCE(skb_dst(skb));
+ DEBUG_NET_WARN_ON_ONCE(skb->destructor);
+ DEBUG_NET_WARN_ON_ONCE(skb_nfct(skb));
+
+ sdn = per_cpu_ptr(net_hotdata.skb_defer_nodes, cpu) + numa_node_id();
+
+ defer_max = READ_ONCE(net_hotdata.sysctl_skb_defer_max);
+ defer_count = atomic_long_inc_return(&sdn->defer_count);
+
+ if (defer_count >= defer_max)
+ goto nodefer;
+
+ llist_add(&skb->ll_node, &sdn->defer_list);
+
+ /* Send an IPI every time queue reaches half capacity. */
+ kick = (defer_count - 1) == (defer_max >> 1);
+
+ /* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU
+ * if we are unlucky enough (this seems very unlikely).
+ */
+ if (unlikely(kick))
+ kick_defer_list_purge(cpu);
+}
+
+static void skb_splice_csum_page(struct sk_buff *skb, struct page *page,
+ size_t offset, size_t len)
+{
+ const char *kaddr;
+ __wsum csum;
+
+ kaddr = kmap_local_page(page);
+ csum = csum_partial(kaddr + offset, len, 0);
+ kunmap_local(kaddr);
+ skb->csum = csum_block_add(skb->csum, csum, skb->len);
+}
+
+/**
+ * skb_splice_from_iter - Splice (or copy) pages to skbuff
+ * @skb: The buffer to add pages to
+ * @iter: Iterator representing the pages to be added
+ * @maxsize: Maximum amount of pages to be added
+ *
+ * This is a common helper function for supporting MSG_SPLICE_PAGES. It
+ * extracts pages from an iterator and adds them to the socket buffer if
+ * possible, copying them to fragments if not possible (such as if they're slab
+ * pages).
+ *
+ * Returns the amount of data spliced/copied or -EMSGSIZE if there's
+ * insufficient space in the buffer to transfer anything.
+ */
+ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter,
+ ssize_t maxsize)
+{
+ size_t frag_limit = READ_ONCE(net_hotdata.sysctl_max_skb_frags);
+ struct page *pages[8], **ppages = pages;
+ ssize_t spliced = 0, ret = 0;
+ unsigned int i;
+
+ while (iter->count > 0) {
+ ssize_t space, nr, len;
+ size_t off;
+
+ ret = -EMSGSIZE;
+ space = frag_limit - skb_shinfo(skb)->nr_frags;
+ if (space < 0)
+ break;
+
+ /* We might be able to coalesce without increasing nr_frags */
+ nr = clamp_t(size_t, space, 1, ARRAY_SIZE(pages));
+
+ len = iov_iter_extract_pages(iter, &ppages, maxsize, nr, 0, &off);
+ if (len <= 0) {
+ ret = len ?: -EIO;
+ break;
+ }
+
+ i = 0;
+ do {
+ struct page *page = pages[i++];
+ size_t part = min_t(size_t, PAGE_SIZE - off, len);
+
+ ret = -EIO;
+ if (WARN_ON_ONCE(!sendpage_ok(page)))
+ goto out;
+
+ ret = skb_append_pagefrags(skb, page, off, part,
+ frag_limit);
+ if (ret < 0) {
+ iov_iter_revert(iter, len);
+ goto out;
+ }
+
+ if (skb->ip_summed == CHECKSUM_NONE)
+ skb_splice_csum_page(skb, page, off, part);
+
+ off = 0;
+ spliced += part;
+ maxsize -= part;
+ len -= part;
+ } while (len > 0);
+
+ if (maxsize <= 0)
+ break;
+ }
+
+out:
+ skb_len_add(skb, spliced);
+ return spliced ?: ret;
+}
+EXPORT_SYMBOL(skb_splice_from_iter);
+
+static __always_inline
+size_t memcpy_from_iter_csum(void *iter_from, size_t progress,
+ size_t len, void *to, void *priv2)
+{
+ __wsum *csum = priv2;
+ __wsum next = csum_partial_copy_nocheck(iter_from, to + progress, len);
+
+ *csum = csum_block_add(*csum, next, progress);
+ return 0;
+}
+
+static __always_inline
+size_t copy_from_user_iter_csum(void __user *iter_from, size_t progress,
+ size_t len, void *to, void *priv2)
+{
+ __wsum next, *csum = priv2;
+
+ next = csum_and_copy_from_user(iter_from, to + progress, len);
+ *csum = csum_block_add(*csum, next, progress);
+ return next ? 0 : len;
+}
+
+bool csum_and_copy_from_iter_full(void *addr, size_t bytes,
+ __wsum *csum, struct iov_iter *i)
+{
+ size_t copied;
+
+ if (WARN_ON_ONCE(!i->data_source))
+ return false;
+ copied = iterate_and_advance2(i, bytes, addr, csum,
+ copy_from_user_iter_csum,
+ memcpy_from_iter_csum);
+ if (likely(copied == bytes))
+ return true;
+ iov_iter_revert(i, copied);
+ return false;
+}
+EXPORT_SYMBOL(csum_and_copy_from_iter_full);
+
+void get_netmem(netmem_ref netmem)
+{
+ struct net_iov *niov;
+
+ if (netmem_is_net_iov(netmem)) {
+ niov = netmem_to_net_iov(netmem);
+ if (net_is_devmem_iov(niov))
+ net_devmem_get_net_iov(netmem_to_net_iov(netmem));
+ return;
+ }
+ get_page(netmem_to_page(netmem));
+}
+EXPORT_SYMBOL(get_netmem);
+
+void put_netmem(netmem_ref netmem)
+{
+ struct net_iov *niov;
+
+ if (netmem_is_net_iov(netmem)) {
+ niov = netmem_to_net_iov(netmem);
+ if (net_is_devmem_iov(niov))
+ net_devmem_put_net_iov(netmem_to_net_iov(netmem));
+ return;
+ }
+
+ put_page(netmem_to_page(netmem));
+}
+EXPORT_SYMBOL(put_netmem);
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
new file mode 100644
index 000000000000..2ac7731e1e0a
--- /dev/null
+++ b/net/core/skmsg.c
@@ -0,0 +1,1289 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */
+
+#include <linux/skmsg.h>
+#include <linux/skbuff.h>
+#include <linux/scatterlist.h>
+
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <net/tls.h>
+#include <trace/events/sock.h>
+
+static bool sk_msg_try_coalesce_ok(struct sk_msg *msg, int elem_first_coalesce)
+{
+ if (msg->sg.end > msg->sg.start &&
+ elem_first_coalesce < msg->sg.end)
+ return true;
+
+ if (msg->sg.end < msg->sg.start &&
+ (elem_first_coalesce > msg->sg.start ||
+ elem_first_coalesce < msg->sg.end))
+ return true;
+
+ return false;
+}
+
+int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len,
+ int elem_first_coalesce)
+{
+ struct page_frag *pfrag = sk_page_frag(sk);
+ u32 osize = msg->sg.size;
+ int ret = 0;
+
+ len -= msg->sg.size;
+ while (len > 0) {
+ struct scatterlist *sge;
+ u32 orig_offset;
+ int use, i;
+
+ if (!sk_page_frag_refill(sk, pfrag)) {
+ ret = -ENOMEM;
+ goto msg_trim;
+ }
+
+ orig_offset = pfrag->offset;
+ use = min_t(int, len, pfrag->size - orig_offset);
+ if (!sk_wmem_schedule(sk, use)) {
+ ret = -ENOMEM;
+ goto msg_trim;
+ }
+
+ i = msg->sg.end;
+ sk_msg_iter_var_prev(i);
+ sge = &msg->sg.data[i];
+
+ if (sk_msg_try_coalesce_ok(msg, elem_first_coalesce) &&
+ sg_page(sge) == pfrag->page &&
+ sge->offset + sge->length == orig_offset) {
+ sge->length += use;
+ } else {
+ if (sk_msg_full(msg)) {
+ ret = -ENOSPC;
+ break;
+ }
+
+ sge = &msg->sg.data[msg->sg.end];
+ sg_unmark_end(sge);
+ sg_set_page(sge, pfrag->page, use, orig_offset);
+ get_page(pfrag->page);
+ sk_msg_iter_next(msg, end);
+ }
+
+ sk_mem_charge(sk, use);
+ msg->sg.size += use;
+ pfrag->offset += use;
+ len -= use;
+ }
+
+ return ret;
+
+msg_trim:
+ sk_msg_trim(sk, msg, osize);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(sk_msg_alloc);
+
+int sk_msg_clone(struct sock *sk, struct sk_msg *dst, struct sk_msg *src,
+ u32 off, u32 len)
+{
+ int i = src->sg.start;
+ struct scatterlist *sge = sk_msg_elem(src, i);
+ struct scatterlist *sgd = NULL;
+ u32 sge_len, sge_off;
+
+ while (off) {
+ if (sge->length > off)
+ break;
+ off -= sge->length;
+ sk_msg_iter_var_next(i);
+ if (i == src->sg.end && off)
+ return -ENOSPC;
+ sge = sk_msg_elem(src, i);
+ }
+
+ while (len) {
+ sge_len = sge->length - off;
+ if (sge_len > len)
+ sge_len = len;
+
+ if (dst->sg.end)
+ sgd = sk_msg_elem(dst, dst->sg.end - 1);
+
+ if (sgd &&
+ (sg_page(sge) == sg_page(sgd)) &&
+ (sg_virt(sge) + off == sg_virt(sgd) + sgd->length)) {
+ sgd->length += sge_len;
+ dst->sg.size += sge_len;
+ } else if (!sk_msg_full(dst)) {
+ sge_off = sge->offset + off;
+ sk_msg_page_add(dst, sg_page(sge), sge_len, sge_off);
+ } else {
+ return -ENOSPC;
+ }
+
+ off = 0;
+ len -= sge_len;
+ sk_mem_charge(sk, sge_len);
+ sk_msg_iter_var_next(i);
+ if (i == src->sg.end && len)
+ return -ENOSPC;
+ sge = sk_msg_elem(src, i);
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(sk_msg_clone);
+
+void sk_msg_return_zero(struct sock *sk, struct sk_msg *msg, int bytes)
+{
+ int i = msg->sg.start;
+
+ do {
+ struct scatterlist *sge = sk_msg_elem(msg, i);
+
+ if (bytes < sge->length) {
+ sge->length -= bytes;
+ sge->offset += bytes;
+ sk_mem_uncharge(sk, bytes);
+ break;
+ }
+
+ sk_mem_uncharge(sk, sge->length);
+ bytes -= sge->length;
+ sge->length = 0;
+ sge->offset = 0;
+ sk_msg_iter_var_next(i);
+ } while (bytes && i != msg->sg.end);
+ msg->sg.start = i;
+}
+EXPORT_SYMBOL_GPL(sk_msg_return_zero);
+
+void sk_msg_return(struct sock *sk, struct sk_msg *msg, int bytes)
+{
+ int i = msg->sg.start;
+
+ do {
+ struct scatterlist *sge = &msg->sg.data[i];
+ int uncharge = (bytes < sge->length) ? bytes : sge->length;
+
+ sk_mem_uncharge(sk, uncharge);
+ bytes -= uncharge;
+ sk_msg_iter_var_next(i);
+ } while (i != msg->sg.end);
+}
+EXPORT_SYMBOL_GPL(sk_msg_return);
+
+static int sk_msg_free_elem(struct sock *sk, struct sk_msg *msg, u32 i,
+ bool charge)
+{
+ struct scatterlist *sge = sk_msg_elem(msg, i);
+ u32 len = sge->length;
+
+ /* When the skb owns the memory we free it from consume_skb path. */
+ if (!msg->skb) {
+ if (charge)
+ sk_mem_uncharge(sk, len);
+ put_page(sg_page(sge));
+ }
+ memset(sge, 0, sizeof(*sge));
+ return len;
+}
+
+static int __sk_msg_free(struct sock *sk, struct sk_msg *msg, u32 i,
+ bool charge)
+{
+ struct scatterlist *sge = sk_msg_elem(msg, i);
+ int freed = 0;
+
+ while (msg->sg.size) {
+ msg->sg.size -= sge->length;
+ freed += sk_msg_free_elem(sk, msg, i, charge);
+ sk_msg_iter_var_next(i);
+ sk_msg_check_to_free(msg, i, msg->sg.size);
+ sge = sk_msg_elem(msg, i);
+ }
+ consume_skb(msg->skb);
+ sk_msg_init(msg);
+ return freed;
+}
+
+int sk_msg_free_nocharge(struct sock *sk, struct sk_msg *msg)
+{
+ return __sk_msg_free(sk, msg, msg->sg.start, false);
+}
+EXPORT_SYMBOL_GPL(sk_msg_free_nocharge);
+
+int sk_msg_free(struct sock *sk, struct sk_msg *msg)
+{
+ return __sk_msg_free(sk, msg, msg->sg.start, true);
+}
+EXPORT_SYMBOL_GPL(sk_msg_free);
+
+static void __sk_msg_free_partial(struct sock *sk, struct sk_msg *msg,
+ u32 bytes, bool charge)
+{
+ struct scatterlist *sge;
+ u32 i = msg->sg.start;
+
+ while (bytes) {
+ sge = sk_msg_elem(msg, i);
+ if (!sge->length)
+ break;
+ if (bytes < sge->length) {
+ if (charge)
+ sk_mem_uncharge(sk, bytes);
+ sge->length -= bytes;
+ sge->offset += bytes;
+ msg->sg.size -= bytes;
+ break;
+ }
+
+ msg->sg.size -= sge->length;
+ bytes -= sge->length;
+ sk_msg_free_elem(sk, msg, i, charge);
+ sk_msg_iter_var_next(i);
+ sk_msg_check_to_free(msg, i, bytes);
+ }
+ msg->sg.start = i;
+}
+
+void sk_msg_free_partial(struct sock *sk, struct sk_msg *msg, u32 bytes)
+{
+ __sk_msg_free_partial(sk, msg, bytes, true);
+}
+EXPORT_SYMBOL_GPL(sk_msg_free_partial);
+
+void sk_msg_free_partial_nocharge(struct sock *sk, struct sk_msg *msg,
+ u32 bytes)
+{
+ __sk_msg_free_partial(sk, msg, bytes, false);
+}
+
+void sk_msg_trim(struct sock *sk, struct sk_msg *msg, int len)
+{
+ int trim = msg->sg.size - len;
+ u32 i = msg->sg.end;
+
+ if (trim <= 0) {
+ WARN_ON(trim < 0);
+ return;
+ }
+
+ sk_msg_iter_var_prev(i);
+ msg->sg.size = len;
+ while (msg->sg.data[i].length &&
+ trim >= msg->sg.data[i].length) {
+ trim -= msg->sg.data[i].length;
+ sk_msg_free_elem(sk, msg, i, true);
+ sk_msg_iter_var_prev(i);
+ if (!trim)
+ goto out;
+ }
+
+ msg->sg.data[i].length -= trim;
+ sk_mem_uncharge(sk, trim);
+ /* Adjust copybreak if it falls into the trimmed part of last buf */
+ if (msg->sg.curr == i && msg->sg.copybreak > msg->sg.data[i].length)
+ msg->sg.copybreak = msg->sg.data[i].length;
+out:
+ sk_msg_iter_var_next(i);
+ msg->sg.end = i;
+
+ /* If we trim data a full sg elem before curr pointer update
+ * copybreak and current so that any future copy operations
+ * start at new copy location.
+ * However trimmed data that has not yet been used in a copy op
+ * does not require an update.
+ */
+ if (!msg->sg.size) {
+ msg->sg.curr = msg->sg.start;
+ msg->sg.copybreak = 0;
+ } else if (sk_msg_iter_dist(msg->sg.start, msg->sg.curr) >=
+ sk_msg_iter_dist(msg->sg.start, msg->sg.end)) {
+ sk_msg_iter_var_prev(i);
+ msg->sg.curr = i;
+ msg->sg.copybreak = msg->sg.data[i].length;
+ }
+}
+EXPORT_SYMBOL_GPL(sk_msg_trim);
+
+int sk_msg_zerocopy_from_iter(struct sock *sk, struct iov_iter *from,
+ struct sk_msg *msg, u32 bytes)
+{
+ int i, maxpages, ret = 0, num_elems = sk_msg_elem_used(msg);
+ const int to_max_pages = MAX_MSG_FRAGS;
+ struct page *pages[MAX_MSG_FRAGS];
+ ssize_t orig, copied, use, offset;
+
+ orig = msg->sg.size;
+ while (bytes > 0) {
+ i = 0;
+ maxpages = to_max_pages - num_elems;
+ if (maxpages == 0) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ copied = iov_iter_get_pages2(from, pages, bytes, maxpages,
+ &offset);
+ if (copied <= 0) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ bytes -= copied;
+ msg->sg.size += copied;
+
+ while (copied) {
+ use = min_t(int, copied, PAGE_SIZE - offset);
+ sg_set_page(&msg->sg.data[msg->sg.end],
+ pages[i], use, offset);
+ sg_unmark_end(&msg->sg.data[msg->sg.end]);
+ sk_mem_charge(sk, use);
+
+ offset = 0;
+ copied -= use;
+ sk_msg_iter_next(msg, end);
+ num_elems++;
+ i++;
+ }
+ /* When zerocopy is mixed with sk_msg_*copy* operations we
+ * may have a copybreak set in this case clear and prefer
+ * zerocopy remainder when possible.
+ */
+ msg->sg.copybreak = 0;
+ msg->sg.curr = msg->sg.end;
+ }
+out:
+ /* Revert iov_iter updates, msg will need to use 'trim' later if it
+ * also needs to be cleared.
+ */
+ if (ret)
+ iov_iter_revert(from, msg->sg.size - orig);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(sk_msg_zerocopy_from_iter);
+
+int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from,
+ struct sk_msg *msg, u32 bytes)
+{
+ int ret = -ENOSPC, i = msg->sg.curr;
+ u32 copy, buf_size, copied = 0;
+ struct scatterlist *sge;
+ void *to;
+
+ do {
+ sge = sk_msg_elem(msg, i);
+ /* This is possible if a trim operation shrunk the buffer */
+ if (msg->sg.copybreak >= sge->length) {
+ msg->sg.copybreak = 0;
+ sk_msg_iter_var_next(i);
+ if (i == msg->sg.end)
+ break;
+ sge = sk_msg_elem(msg, i);
+ }
+
+ buf_size = sge->length - msg->sg.copybreak;
+ copy = (buf_size > bytes) ? bytes : buf_size;
+ to = sg_virt(sge) + msg->sg.copybreak;
+ msg->sg.copybreak += copy;
+ if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY)
+ ret = copy_from_iter_nocache(to, copy, from);
+ else
+ ret = copy_from_iter(to, copy, from);
+ if (ret != copy) {
+ ret = -EFAULT;
+ goto out;
+ }
+ bytes -= copy;
+ copied += copy;
+ if (!bytes)
+ break;
+ msg->sg.copybreak = 0;
+ sk_msg_iter_var_next(i);
+ } while (i != msg->sg.end);
+out:
+ msg->sg.curr = i;
+ return (ret < 0) ? ret : copied;
+}
+EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter);
+
+/* Receive sk_msg from psock->ingress_msg to @msg. */
+int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
+ int len, int flags)
+{
+ struct iov_iter *iter = &msg->msg_iter;
+ int peek = flags & MSG_PEEK;
+ struct sk_msg *msg_rx;
+ int i, copied = 0;
+
+ msg_rx = sk_psock_peek_msg(psock);
+ while (copied != len) {
+ struct scatterlist *sge;
+
+ if (unlikely(!msg_rx))
+ break;
+
+ i = msg_rx->sg.start;
+ do {
+ struct page *page;
+ int copy;
+
+ sge = sk_msg_elem(msg_rx, i);
+ copy = sge->length;
+ page = sg_page(sge);
+ if (copied + copy > len)
+ copy = len - copied;
+ if (copy)
+ copy = copy_page_to_iter(page, sge->offset, copy, iter);
+ if (!copy) {
+ copied = copied ? copied : -EFAULT;
+ goto out;
+ }
+
+ copied += copy;
+ if (likely(!peek)) {
+ sge->offset += copy;
+ sge->length -= copy;
+ if (!msg_rx->skb) {
+ sk_mem_uncharge(sk, copy);
+ atomic_sub(copy, &sk->sk_rmem_alloc);
+ }
+ msg_rx->sg.size -= copy;
+
+ if (!sge->length) {
+ sk_msg_iter_var_next(i);
+ if (!msg_rx->skb)
+ put_page(page);
+ }
+ } else {
+ /* Lets not optimize peek case if copy_page_to_iter
+ * didn't copy the entire length lets just break.
+ */
+ if (copy != sge->length)
+ goto out;
+ sk_msg_iter_var_next(i);
+ }
+
+ if (copied == len)
+ break;
+ } while ((i != msg_rx->sg.end) && !sg_is_last(sge));
+
+ if (unlikely(peek)) {
+ msg_rx = sk_psock_next_msg(psock, msg_rx);
+ if (!msg_rx)
+ break;
+ continue;
+ }
+
+ msg_rx->sg.start = i;
+ if (!sge->length && (i == msg_rx->sg.end || sg_is_last(sge))) {
+ msg_rx = sk_psock_dequeue_msg(psock);
+ kfree_sk_msg(msg_rx);
+ }
+ msg_rx = sk_psock_peek_msg(psock);
+ }
+out:
+ return copied;
+}
+EXPORT_SYMBOL_GPL(sk_msg_recvmsg);
+
+bool sk_msg_is_readable(struct sock *sk)
+{
+ struct sk_psock *psock;
+ bool empty = true;
+
+ rcu_read_lock();
+ psock = sk_psock(sk);
+ if (likely(psock))
+ empty = list_empty(&psock->ingress_msg);
+ rcu_read_unlock();
+ return !empty;
+}
+EXPORT_SYMBOL_GPL(sk_msg_is_readable);
+
+static struct sk_msg *alloc_sk_msg(gfp_t gfp)
+{
+ struct sk_msg *msg;
+
+ msg = kzalloc(sizeof(*msg), gfp | __GFP_NOWARN);
+ if (unlikely(!msg))
+ return NULL;
+ sg_init_marker(msg->sg.data, NR_MSG_FRAG_IDS);
+ return msg;
+}
+
+static struct sk_msg *sk_psock_create_ingress_msg(struct sock *sk,
+ struct sk_buff *skb)
+{
+ if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
+ return NULL;
+
+ if (!sk_rmem_schedule(sk, skb, skb->truesize))
+ return NULL;
+
+ return alloc_sk_msg(GFP_KERNEL);
+}
+
+static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb,
+ u32 off, u32 len,
+ struct sk_psock *psock,
+ struct sock *sk,
+ struct sk_msg *msg,
+ bool take_ref)
+{
+ int num_sge, copied;
+
+ /* skb_to_sgvec will fail when the total number of fragments in
+ * frag_list and frags exceeds MAX_MSG_FRAGS. For example, the
+ * caller may aggregate multiple skbs.
+ */
+ num_sge = skb_to_sgvec(skb, msg->sg.data, off, len);
+ if (num_sge < 0) {
+ /* skb linearize may fail with ENOMEM, but lets simply try again
+ * later if this happens. Under memory pressure we don't want to
+ * drop the skb. We need to linearize the skb so that the mapping
+ * in skb_to_sgvec can not error.
+ * Note that skb_linearize requires the skb not to be shared.
+ */
+ if (skb_linearize(skb))
+ return -EAGAIN;
+
+ num_sge = skb_to_sgvec(skb, msg->sg.data, off, len);
+ if (unlikely(num_sge < 0))
+ return num_sge;
+ }
+
+#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
+ psock->ingress_bytes += len;
+#endif
+ copied = len;
+ msg->sg.start = 0;
+ msg->sg.size = copied;
+ msg->sg.end = num_sge;
+ msg->skb = take_ref ? skb_get(skb) : skb;
+
+ sk_psock_queue_msg(psock, msg);
+ sk_psock_data_ready(sk, psock);
+ return copied;
+}
+
+static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb,
+ u32 off, u32 len, bool take_ref);
+
+static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb,
+ u32 off, u32 len)
+{
+ struct sock *sk = psock->sk;
+ struct sk_msg *msg;
+ int err;
+
+ /* If we are receiving on the same sock skb->sk is already assigned,
+ * skip memory accounting and owner transition seeing it already set
+ * correctly.
+ */
+ if (unlikely(skb->sk == sk))
+ return sk_psock_skb_ingress_self(psock, skb, off, len, true);
+ msg = sk_psock_create_ingress_msg(sk, skb);
+ if (!msg)
+ return -EAGAIN;
+
+ /* This will transition ownership of the data from the socket where
+ * the BPF program was run initiating the redirect to the socket
+ * we will eventually receive this data on. The data will be released
+ * from skb_consume found in __tcp_bpf_recvmsg() after its been copied
+ * into user buffers.
+ */
+ skb_set_owner_r(skb, sk);
+ err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg, true);
+ if (err < 0)
+ kfree(msg);
+ return err;
+}
+
+/* Puts an skb on the ingress queue of the socket already assigned to the
+ * skb. In this case we do not need to check memory limits or skb_set_owner_r
+ * because the skb is already accounted for here.
+ */
+static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb,
+ u32 off, u32 len, bool take_ref)
+{
+ struct sk_msg *msg = alloc_sk_msg(GFP_ATOMIC);
+ struct sock *sk = psock->sk;
+ int err;
+
+ if (unlikely(!msg))
+ return -EAGAIN;
+ skb_set_owner_r(skb, sk);
+ err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg, take_ref);
+ if (err < 0)
+ kfree(msg);
+ return err;
+}
+
+static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb,
+ u32 off, u32 len, bool ingress)
+{
+ if (!ingress) {
+ if (!sock_writeable(psock->sk))
+ return -EAGAIN;
+ return skb_send_sock(psock->sk, skb, off, len);
+ }
+
+ return sk_psock_skb_ingress(psock, skb, off, len);
+}
+
+static void sk_psock_skb_state(struct sk_psock *psock,
+ struct sk_psock_work_state *state,
+ int len, int off)
+{
+ spin_lock_bh(&psock->ingress_lock);
+ if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
+ state->len = len;
+ state->off = off;
+ }
+ spin_unlock_bh(&psock->ingress_lock);
+}
+
+static void sk_psock_backlog(struct work_struct *work)
+{
+ struct delayed_work *dwork = to_delayed_work(work);
+ struct sk_psock *psock = container_of(dwork, struct sk_psock, work);
+ struct sk_psock_work_state *state = &psock->work_state;
+ struct sk_buff *skb = NULL;
+ u32 len = 0, off = 0;
+ bool ingress;
+ int ret;
+
+ /* If sk is quickly removed from the map and then added back, the old
+ * psock should not be scheduled, because there are now two psocks
+ * pointing to the same sk.
+ */
+ if (!sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))
+ return;
+
+ /* Increment the psock refcnt to synchronize with close(fd) path in
+ * sock_map_close(), ensuring we wait for backlog thread completion
+ * before sk_socket freed. If refcnt increment fails, it indicates
+ * sock_map_close() completed with sk_socket potentially already freed.
+ */
+ if (!sk_psock_get(psock->sk))
+ return;
+ mutex_lock(&psock->work_mutex);
+ while ((skb = skb_peek(&psock->ingress_skb))) {
+ len = skb->len;
+ off = 0;
+ if (skb_bpf_strparser(skb)) {
+ struct strp_msg *stm = strp_msg(skb);
+
+ off = stm->offset;
+ len = stm->full_len;
+ }
+
+ /* Resume processing from previous partial state */
+ if (unlikely(state->len)) {
+ len = state->len;
+ off = state->off;
+ }
+
+ ingress = skb_bpf_ingress(skb);
+ skb_bpf_redirect_clear(skb);
+ do {
+ ret = -EIO;
+ if (!sock_flag(psock->sk, SOCK_DEAD))
+ ret = sk_psock_handle_skb(psock, skb, off,
+ len, ingress);
+ if (ret <= 0) {
+ if (ret == -EAGAIN) {
+ sk_psock_skb_state(psock, state, len, off);
+ /* Restore redir info we cleared before */
+ skb_bpf_set_redir(skb, psock->sk, ingress);
+ /* Delay slightly to prioritize any
+ * other work that might be here.
+ */
+ if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))
+ schedule_delayed_work(&psock->work, 1);
+ goto end;
+ }
+ /* Hard errors break pipe and stop xmit. */
+ sk_psock_report_error(psock, ret ? -ret : EPIPE);
+ sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
+ goto end;
+ }
+ off += ret;
+ len -= ret;
+ } while (len);
+
+ /* The entire skb sent, clear state */
+ sk_psock_skb_state(psock, state, 0, 0);
+ skb = skb_dequeue(&psock->ingress_skb);
+ kfree_skb(skb);
+ }
+end:
+ mutex_unlock(&psock->work_mutex);
+ sk_psock_put(psock->sk, psock);
+}
+
+struct sk_psock *sk_psock_init(struct sock *sk, int node)
+{
+ struct sk_psock *psock;
+ struct proto *prot;
+
+ write_lock_bh(&sk->sk_callback_lock);
+
+ if (sk_is_inet(sk) && inet_csk_has_ulp(sk)) {
+ psock = ERR_PTR(-EINVAL);
+ goto out;
+ }
+
+ if (sk->sk_user_data) {
+ psock = ERR_PTR(-EBUSY);
+ goto out;
+ }
+
+ psock = kzalloc_node(sizeof(*psock), GFP_ATOMIC | __GFP_NOWARN, node);
+ if (!psock) {
+ psock = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+
+ prot = READ_ONCE(sk->sk_prot);
+ psock->sk = sk;
+ psock->eval = __SK_NONE;
+ psock->sk_proto = prot;
+ psock->saved_unhash = prot->unhash;
+ psock->saved_destroy = prot->destroy;
+ psock->saved_close = prot->close;
+ psock->saved_write_space = sk->sk_write_space;
+
+ INIT_LIST_HEAD(&psock->link);
+ spin_lock_init(&psock->link_lock);
+
+ INIT_DELAYED_WORK(&psock->work, sk_psock_backlog);
+ mutex_init(&psock->work_mutex);
+ INIT_LIST_HEAD(&psock->ingress_msg);
+ spin_lock_init(&psock->ingress_lock);
+ skb_queue_head_init(&psock->ingress_skb);
+
+ sk_psock_set_state(psock, SK_PSOCK_TX_ENABLED);
+ refcount_set(&psock->refcnt, 1);
+
+ __rcu_assign_sk_user_data_with_flags(sk, psock,
+ SK_USER_DATA_NOCOPY |
+ SK_USER_DATA_PSOCK);
+ sock_hold(sk);
+
+out:
+ write_unlock_bh(&sk->sk_callback_lock);
+ return psock;
+}
+EXPORT_SYMBOL_GPL(sk_psock_init);
+
+struct sk_psock_link *sk_psock_link_pop(struct sk_psock *psock)
+{
+ struct sk_psock_link *link;
+
+ spin_lock_bh(&psock->link_lock);
+ link = list_first_entry_or_null(&psock->link, struct sk_psock_link,
+ list);
+ if (link)
+ list_del(&link->list);
+ spin_unlock_bh(&psock->link_lock);
+ return link;
+}
+
+static void __sk_psock_purge_ingress_msg(struct sk_psock *psock)
+{
+ struct sk_msg *msg, *tmp;
+
+ list_for_each_entry_safe(msg, tmp, &psock->ingress_msg, list) {
+ list_del(&msg->list);
+ if (!msg->skb)
+ atomic_sub(msg->sg.size, &psock->sk->sk_rmem_alloc);
+ sk_msg_free(psock->sk, msg);
+ kfree(msg);
+ }
+}
+
+static void __sk_psock_zap_ingress(struct sk_psock *psock)
+{
+ struct sk_buff *skb;
+
+ while ((skb = skb_dequeue(&psock->ingress_skb)) != NULL) {
+ skb_bpf_redirect_clear(skb);
+ sock_drop(psock->sk, skb);
+ }
+ __sk_psock_purge_ingress_msg(psock);
+}
+
+static void sk_psock_link_destroy(struct sk_psock *psock)
+{
+ struct sk_psock_link *link, *tmp;
+
+ list_for_each_entry_safe(link, tmp, &psock->link, list) {
+ list_del(&link->list);
+ sk_psock_free_link(link);
+ }
+}
+
+void sk_psock_stop(struct sk_psock *psock)
+{
+ spin_lock_bh(&psock->ingress_lock);
+ sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
+ sk_psock_cork_free(psock);
+ spin_unlock_bh(&psock->ingress_lock);
+}
+
+static void sk_psock_done_strp(struct sk_psock *psock);
+
+static void sk_psock_destroy(struct work_struct *work)
+{
+ struct sk_psock *psock = container_of(to_rcu_work(work),
+ struct sk_psock, rwork);
+ /* No sk_callback_lock since already detached. */
+
+ sk_psock_done_strp(psock);
+
+ cancel_delayed_work_sync(&psock->work);
+ __sk_psock_zap_ingress(psock);
+ mutex_destroy(&psock->work_mutex);
+
+ psock_progs_drop(&psock->progs);
+
+ sk_psock_link_destroy(psock);
+ sk_psock_cork_free(psock);
+
+ if (psock->sk_redir)
+ sock_put(psock->sk_redir);
+ if (psock->sk_pair)
+ sock_put(psock->sk_pair);
+ sock_put(psock->sk);
+ kfree(psock);
+}
+
+void sk_psock_drop(struct sock *sk, struct sk_psock *psock)
+{
+ write_lock_bh(&sk->sk_callback_lock);
+ sk_psock_restore_proto(sk, psock);
+ rcu_assign_sk_user_data(sk, NULL);
+ if (psock->progs.stream_parser)
+ sk_psock_stop_strp(sk, psock);
+ else if (psock->progs.stream_verdict || psock->progs.skb_verdict)
+ sk_psock_stop_verdict(sk, psock);
+ write_unlock_bh(&sk->sk_callback_lock);
+
+ sk_psock_stop(psock);
+
+ INIT_RCU_WORK(&psock->rwork, sk_psock_destroy);
+ queue_rcu_work(system_percpu_wq, &psock->rwork);
+}
+EXPORT_SYMBOL_GPL(sk_psock_drop);
+
+static int sk_psock_map_verd(int verdict, bool redir)
+{
+ switch (verdict) {
+ case SK_PASS:
+ return redir ? __SK_REDIRECT : __SK_PASS;
+ case SK_DROP:
+ default:
+ break;
+ }
+
+ return __SK_DROP;
+}
+
+int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock,
+ struct sk_msg *msg)
+{
+ struct bpf_prog *prog;
+ int ret;
+
+ rcu_read_lock();
+ prog = READ_ONCE(psock->progs.msg_parser);
+ if (unlikely(!prog)) {
+ ret = __SK_PASS;
+ goto out;
+ }
+
+ sk_msg_compute_data_pointers(msg);
+ msg->sk = sk;
+ ret = bpf_prog_run_pin_on_cpu(prog, msg);
+ ret = sk_psock_map_verd(ret, msg->sk_redir);
+ psock->apply_bytes = msg->apply_bytes;
+ if (ret == __SK_REDIRECT) {
+ if (psock->sk_redir) {
+ sock_put(psock->sk_redir);
+ psock->sk_redir = NULL;
+ }
+ if (!msg->sk_redir) {
+ ret = __SK_DROP;
+ goto out;
+ }
+ psock->redir_ingress = sk_msg_to_ingress(msg);
+ psock->sk_redir = msg->sk_redir;
+ sock_hold(psock->sk_redir);
+ }
+out:
+ rcu_read_unlock();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(sk_psock_msg_verdict);
+
+static int sk_psock_skb_redirect(struct sk_psock *from, struct sk_buff *skb)
+{
+ struct sk_psock *psock_other;
+ struct sock *sk_other;
+
+ sk_other = skb_bpf_redirect_fetch(skb);
+ /* This error is a buggy BPF program, it returned a redirect
+ * return code, but then didn't set a redirect interface.
+ */
+ if (unlikely(!sk_other)) {
+ skb_bpf_redirect_clear(skb);
+ sock_drop(from->sk, skb);
+ return -EIO;
+ }
+ psock_other = sk_psock(sk_other);
+ /* This error indicates the socket is being torn down or had another
+ * error that caused the pipe to break. We can't send a packet on
+ * a socket that is in this state so we drop the skb.
+ */
+ if (!psock_other || sock_flag(sk_other, SOCK_DEAD)) {
+ skb_bpf_redirect_clear(skb);
+ sock_drop(from->sk, skb);
+ return -EIO;
+ }
+ spin_lock_bh(&psock_other->ingress_lock);
+ if (!sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) {
+ spin_unlock_bh(&psock_other->ingress_lock);
+ skb_bpf_redirect_clear(skb);
+ sock_drop(from->sk, skb);
+ return -EIO;
+ }
+
+ skb_queue_tail(&psock_other->ingress_skb, skb);
+ schedule_delayed_work(&psock_other->work, 0);
+ spin_unlock_bh(&psock_other->ingress_lock);
+ return 0;
+}
+
+static void sk_psock_tls_verdict_apply(struct sk_buff *skb,
+ struct sk_psock *from, int verdict)
+{
+ switch (verdict) {
+ case __SK_REDIRECT:
+ sk_psock_skb_redirect(from, skb);
+ break;
+ case __SK_PASS:
+ case __SK_DROP:
+ default:
+ break;
+ }
+}
+
+int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb)
+{
+ struct bpf_prog *prog;
+ int ret = __SK_PASS;
+
+ rcu_read_lock();
+ prog = READ_ONCE(psock->progs.stream_verdict);
+ if (likely(prog)) {
+ skb->sk = psock->sk;
+ skb_dst_drop(skb);
+ skb_bpf_redirect_clear(skb);
+ ret = bpf_prog_run_pin_on_cpu(prog, skb);
+ ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb));
+ skb->sk = NULL;
+ }
+ sk_psock_tls_verdict_apply(skb, psock, ret);
+ rcu_read_unlock();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(sk_psock_tls_strp_read);
+
+static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb,
+ int verdict)
+{
+ struct sock *sk_other;
+ int err = 0;
+ u32 len, off;
+
+ switch (verdict) {
+ case __SK_PASS:
+ err = -EIO;
+ sk_other = psock->sk;
+ if (sock_flag(sk_other, SOCK_DEAD) ||
+ !sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))
+ goto out_free;
+
+ skb_bpf_set_ingress(skb);
+
+ /* If the queue is empty then we can submit directly
+ * into the msg queue. If its not empty we have to
+ * queue work otherwise we may get OOO data. Otherwise,
+ * if sk_psock_skb_ingress errors will be handled by
+ * retrying later from workqueue.
+ */
+ if (skb_queue_empty(&psock->ingress_skb)) {
+ len = skb->len;
+ off = 0;
+ if (skb_bpf_strparser(skb)) {
+ struct strp_msg *stm = strp_msg(skb);
+
+ off = stm->offset;
+ len = stm->full_len;
+ }
+ err = sk_psock_skb_ingress_self(psock, skb, off, len, false);
+ }
+ if (err < 0) {
+ spin_lock_bh(&psock->ingress_lock);
+ if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
+ skb_queue_tail(&psock->ingress_skb, skb);
+ schedule_delayed_work(&psock->work, 0);
+ err = 0;
+ }
+ spin_unlock_bh(&psock->ingress_lock);
+ if (err < 0)
+ goto out_free;
+ }
+ break;
+ case __SK_REDIRECT:
+ tcp_eat_skb(psock->sk, skb);
+ err = sk_psock_skb_redirect(psock, skb);
+ break;
+ case __SK_DROP:
+ default:
+out_free:
+ skb_bpf_redirect_clear(skb);
+ tcp_eat_skb(psock->sk, skb);
+ sock_drop(psock->sk, skb);
+ }
+
+ return err;
+}
+
+static void sk_psock_write_space(struct sock *sk)
+{
+ struct sk_psock *psock;
+ void (*write_space)(struct sock *sk) = NULL;
+
+ rcu_read_lock();
+ psock = sk_psock(sk);
+ if (likely(psock)) {
+ if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))
+ schedule_delayed_work(&psock->work, 0);
+ write_space = psock->saved_write_space;
+ }
+ rcu_read_unlock();
+ if (write_space)
+ write_space(sk);
+}
+
+#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
+static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb)
+{
+ struct sk_psock *psock;
+ struct bpf_prog *prog;
+ int ret = __SK_DROP;
+ struct sock *sk;
+
+ rcu_read_lock();
+ sk = strp->sk;
+ psock = sk_psock(sk);
+ if (unlikely(!psock)) {
+ sock_drop(sk, skb);
+ goto out;
+ }
+ prog = READ_ONCE(psock->progs.stream_verdict);
+ if (likely(prog)) {
+ skb->sk = sk;
+ skb_dst_drop(skb);
+ skb_bpf_redirect_clear(skb);
+ ret = bpf_prog_run_pin_on_cpu(prog, skb);
+ skb_bpf_set_strparser(skb);
+ ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb));
+ skb->sk = NULL;
+ }
+ sk_psock_verdict_apply(psock, skb, ret);
+out:
+ rcu_read_unlock();
+}
+
+static int sk_psock_strp_read_done(struct strparser *strp, int err)
+{
+ return err;
+}
+
+static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb)
+{
+ struct sk_psock *psock = container_of(strp, struct sk_psock, strp);
+ struct bpf_prog *prog;
+ int ret = skb->len;
+
+ rcu_read_lock();
+ prog = READ_ONCE(psock->progs.stream_parser);
+ if (likely(prog)) {
+ skb->sk = psock->sk;
+ ret = bpf_prog_run_pin_on_cpu(prog, skb);
+ skb->sk = NULL;
+ }
+ rcu_read_unlock();
+ return ret;
+}
+
+/* Called with socket lock held. */
+static void sk_psock_strp_data_ready(struct sock *sk)
+{
+ struct sk_psock *psock;
+
+ trace_sk_data_ready(sk);
+
+ rcu_read_lock();
+ psock = sk_psock(sk);
+ if (likely(psock)) {
+ if (tls_sw_has_ctx_rx(sk)) {
+ psock->saved_data_ready(sk);
+ } else {
+ read_lock_bh(&sk->sk_callback_lock);
+ strp_data_ready(&psock->strp);
+ read_unlock_bh(&sk->sk_callback_lock);
+ }
+ }
+ rcu_read_unlock();
+}
+
+int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock)
+{
+ int ret;
+
+ static const struct strp_callbacks cb = {
+ .rcv_msg = sk_psock_strp_read,
+ .read_sock_done = sk_psock_strp_read_done,
+ .parse_msg = sk_psock_strp_parse,
+ };
+
+ ret = strp_init(&psock->strp, sk, &cb);
+ if (!ret)
+ sk_psock_set_state(psock, SK_PSOCK_RX_STRP_ENABLED);
+
+ if (sk_is_tcp(sk)) {
+ psock->strp.cb.read_sock = tcp_bpf_strp_read_sock;
+ psock->copied_seq = tcp_sk(sk)->copied_seq;
+ }
+ return ret;
+}
+
+void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock)
+{
+ if (psock->saved_data_ready)
+ return;
+
+ psock->saved_data_ready = sk->sk_data_ready;
+ sk->sk_data_ready = sk_psock_strp_data_ready;
+ sk->sk_write_space = sk_psock_write_space;
+}
+
+void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock)
+{
+ psock_set_prog(&psock->progs.stream_parser, NULL);
+
+ if (!psock->saved_data_ready)
+ return;
+
+ sk->sk_data_ready = psock->saved_data_ready;
+ psock->saved_data_ready = NULL;
+ strp_stop(&psock->strp);
+}
+
+static void sk_psock_done_strp(struct sk_psock *psock)
+{
+ /* Parser has been stopped */
+ if (sk_psock_test_state(psock, SK_PSOCK_RX_STRP_ENABLED))
+ strp_done(&psock->strp);
+}
+#else
+static void sk_psock_done_strp(struct sk_psock *psock)
+{
+}
+#endif /* CONFIG_BPF_STREAM_PARSER */
+
+static int sk_psock_verdict_recv(struct sock *sk, struct sk_buff *skb)
+{
+ struct sk_psock *psock;
+ struct bpf_prog *prog;
+ int ret = __SK_DROP;
+ int len = skb->len;
+
+ rcu_read_lock();
+ psock = sk_psock(sk);
+ if (unlikely(!psock)) {
+ len = 0;
+ tcp_eat_skb(sk, skb);
+ sock_drop(sk, skb);
+ goto out;
+ }
+ prog = READ_ONCE(psock->progs.stream_verdict);
+ if (!prog)
+ prog = READ_ONCE(psock->progs.skb_verdict);
+ if (likely(prog)) {
+ skb_dst_drop(skb);
+ skb_bpf_redirect_clear(skb);
+ ret = bpf_prog_run_pin_on_cpu(prog, skb);
+ ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb));
+ }
+ ret = sk_psock_verdict_apply(psock, skb, ret);
+ if (ret < 0)
+ len = ret;
+out:
+ rcu_read_unlock();
+ return len;
+}
+
+static void sk_psock_verdict_data_ready(struct sock *sk)
+{
+ struct socket *sock = sk->sk_socket;
+ const struct proto_ops *ops;
+ int copied;
+
+ trace_sk_data_ready(sk);
+
+ if (unlikely(!sock))
+ return;
+ ops = READ_ONCE(sock->ops);
+ if (!ops || !ops->read_skb)
+ return;
+ copied = ops->read_skb(sk, sk_psock_verdict_recv);
+ if (copied >= 0) {
+ struct sk_psock *psock;
+
+ rcu_read_lock();
+ psock = sk_psock(sk);
+ if (psock)
+ sk_psock_data_ready(sk, psock);
+ rcu_read_unlock();
+ }
+}
+
+void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock)
+{
+ if (psock->saved_data_ready)
+ return;
+
+ psock->saved_data_ready = sk->sk_data_ready;
+ sk->sk_data_ready = sk_psock_verdict_data_ready;
+ sk->sk_write_space = sk_psock_write_space;
+}
+
+void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock)
+{
+ psock_set_prog(&psock->progs.stream_verdict, NULL);
+ psock_set_prog(&psock->progs.skb_verdict, NULL);
+
+ if (!psock->saved_data_ready)
+ return;
+
+ sk->sk_data_ready = psock->saved_data_ready;
+ psock->saved_data_ready = NULL;
+}
diff --git a/net/core/sock.c b/net/core/sock.c
index ab8fafadb4ba..45c98bf524b2 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -6,9 +7,6 @@
* Generic socket support routines. Memory allocators, socket lock/release
* handler for protocols to use and generic option handler.
*
- *
- * Version: $Id: sock.c,v 1.117 2002/02/01 22:01:03 davem Exp $
- *
* Authors: Ross Biro
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
* Florian La Roche, <flla@stud.uni-sb.de>
@@ -34,7 +32,7 @@
* Alan Cox : TCP ack handling is buggy, the DESTROY timer
* was buggy. Put a remove_sock() in the handler
* for memory when we hit 0. Also altered the timer
- * code. The ACK stuff can wait and needs major
+ * code. The ACK stuff can wait and needs major
* TCP layer surgery.
* Alan Cox : Fixed TCP ack bug, removed remove sock
* and fixed timer/inet_bh race.
@@ -83,16 +81,14 @@
* Arnaldo C. Melo : cleanups, use skb_queue_purge
*
* To Fix:
- *
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/unaligned.h>
#include <linux/capability.h>
#include <linux/errno.h>
+#include <linux/errqueue.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
@@ -101,6 +97,7 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
+#include <linux/sched/mm.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
@@ -110,175 +107,465 @@
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
+#include <linux/udp.h>
#include <linux/init.h>
-
-#include <asm/uaccess.h>
-#include <asm/system.h>
+#include <linux/highmem.h>
+#include <linux/user_namespace.h>
+#include <linux/static_key.h>
+#include <linux/memcontrol.h>
+#include <linux/prefetch.h>
+#include <linux/compat.h>
+#include <linux/mroute.h>
+#include <linux/mroute6.h>
+#include <linux/icmpv6.h>
+
+#include <linux/uaccess.h>
#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
+#include <linux/skbuff_ref.h>
+#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/sock.h>
+#include <net/proto_memory.h>
+#include <linux/net_tstamp.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>
+#include <net/cls_cgroup.h>
+#include <net/netprio_cgroup.h>
+#include <linux/sock_diag.h>
#include <linux/filter.h>
+#include <net/sock_reuseport.h>
+#include <net/bpf_sk_storage.h>
+
+#include <trace/events/sock.h>
-#ifdef CONFIG_INET
#include <net/tcp.h>
-#endif
+#include <net/busy_poll.h>
+#include <net/phonet/phonet.h>
+
+#include <linux/ethtool.h>
+
+#include <uapi/linux/pidfd.h>
+
+#include "dev.h"
+
+static DEFINE_MUTEX(proto_list_mutex);
+static LIST_HEAD(proto_list);
+
+static void sock_def_write_space_wfree(struct sock *sk, int wmem_alloc);
+static void sock_def_write_space(struct sock *sk);
+
+/**
+ * sk_ns_capable - General socket capability test
+ * @sk: Socket to use a capability on or through
+ * @user_ns: The user namespace of the capability to use
+ * @cap: The capability to use
+ *
+ * Test to see if the opener of the socket had when the socket was
+ * created and the current process has the capability @cap in the user
+ * namespace @user_ns.
+ */
+bool sk_ns_capable(const struct sock *sk,
+ struct user_namespace *user_ns, int cap)
+{
+ return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
+ ns_capable(user_ns, cap);
+}
+EXPORT_SYMBOL(sk_ns_capable);
+
+/**
+ * sk_capable - Socket global capability test
+ * @sk: Socket to use a capability on or through
+ * @cap: The global capability to use
+ *
+ * Test to see if the opener of the socket had when the socket was
+ * created and the current process has the capability @cap in all user
+ * namespaces.
+ */
+bool sk_capable(const struct sock *sk, int cap)
+{
+ return sk_ns_capable(sk, &init_user_ns, cap);
+}
+EXPORT_SYMBOL(sk_capable);
+
+/**
+ * sk_net_capable - Network namespace socket capability test
+ * @sk: Socket to use a capability on or through
+ * @cap: The capability to use
+ *
+ * Test to see if the opener of the socket had when the socket was created
+ * and the current process has the capability @cap over the network namespace
+ * the socket is a member of.
+ */
+bool sk_net_capable(const struct sock *sk, int cap)
+{
+ return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
+}
+EXPORT_SYMBOL(sk_net_capable);
/*
* Each address family might have different locking rules, so we have
- * one slock key per address family:
+ * one slock key per address family and separate keys for internal and
+ * userspace sockets.
*/
static struct lock_class_key af_family_keys[AF_MAX];
+static struct lock_class_key af_family_kern_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];
+static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
/*
* Make lock validator output more readable. (we pre-construct these
* strings build-time, so that runtime initialization of socket
* locks is fast):
*/
-static const char *af_family_key_strings[AF_MAX+1] = {
- "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX" , "sk_lock-AF_INET" ,
- "sk_lock-AF_AX25" , "sk_lock-AF_IPX" , "sk_lock-AF_APPLETALK",
- "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE" , "sk_lock-AF_ATMPVC" ,
- "sk_lock-AF_X25" , "sk_lock-AF_INET6" , "sk_lock-AF_ROSE" ,
- "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI" , "sk_lock-AF_SECURITY" ,
- "sk_lock-AF_KEY" , "sk_lock-AF_NETLINK" , "sk_lock-AF_PACKET" ,
- "sk_lock-AF_ASH" , "sk_lock-AF_ECONET" , "sk_lock-AF_ATMSVC" ,
- "sk_lock-21" , "sk_lock-AF_SNA" , "sk_lock-AF_IRDA" ,
- "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE" , "sk_lock-AF_LLC" ,
- "sk_lock-27" , "sk_lock-28" , "sk_lock-29" ,
- "sk_lock-AF_TIPC" , "sk_lock-AF_BLUETOOTH", "sk_lock-AF_MAX"
+
+#define _sock_locks(x) \
+ x "AF_UNSPEC", x "AF_UNIX" , x "AF_INET" , \
+ x "AF_AX25" , x "AF_IPX" , x "AF_APPLETALK", \
+ x "AF_NETROM", x "AF_BRIDGE" , x "AF_ATMPVC" , \
+ x "AF_X25" , x "AF_INET6" , x "AF_ROSE" , \
+ x "AF_DECnet", x "AF_NETBEUI" , x "AF_SECURITY" , \
+ x "AF_KEY" , x "AF_NETLINK" , x "AF_PACKET" , \
+ x "AF_ASH" , x "AF_ECONET" , x "AF_ATMSVC" , \
+ x "AF_RDS" , x "AF_SNA" , x "AF_IRDA" , \
+ x "AF_PPPOX" , x "AF_WANPIPE" , x "AF_LLC" , \
+ x "27" , x "28" , x "AF_CAN" , \
+ x "AF_TIPC" , x "AF_BLUETOOTH", x "IUCV" , \
+ x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \
+ x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \
+ x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \
+ x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \
+ x "AF_MCTP" , \
+ x "AF_MAX"
+
+static const char *const af_family_key_strings[AF_MAX+1] = {
+ _sock_locks("sk_lock-")
};
-static const char *af_family_slock_key_strings[AF_MAX+1] = {
- "slock-AF_UNSPEC", "slock-AF_UNIX" , "slock-AF_INET" ,
- "slock-AF_AX25" , "slock-AF_IPX" , "slock-AF_APPLETALK",
- "slock-AF_NETROM", "slock-AF_BRIDGE" , "slock-AF_ATMPVC" ,
- "slock-AF_X25" , "slock-AF_INET6" , "slock-AF_ROSE" ,
- "slock-AF_DECnet", "slock-AF_NETBEUI" , "slock-AF_SECURITY" ,
- "slock-AF_KEY" , "slock-AF_NETLINK" , "slock-AF_PACKET" ,
- "slock-AF_ASH" , "slock-AF_ECONET" , "slock-AF_ATMSVC" ,
- "slock-21" , "slock-AF_SNA" , "slock-AF_IRDA" ,
- "slock-AF_PPPOX" , "slock-AF_WANPIPE" , "slock-AF_LLC" ,
- "slock-27" , "slock-28" , "slock-29" ,
- "slock-AF_TIPC" , "slock-AF_BLUETOOTH", "slock-AF_MAX"
+static const char *const af_family_slock_key_strings[AF_MAX+1] = {
+ _sock_locks("slock-")
+};
+static const char *const af_family_clock_key_strings[AF_MAX+1] = {
+ _sock_locks("clock-")
+};
+
+static const char *const af_family_kern_key_strings[AF_MAX+1] = {
+ _sock_locks("k-sk_lock-")
+};
+static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
+ _sock_locks("k-slock-")
+};
+static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
+ _sock_locks("k-clock-")
+};
+static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
+ _sock_locks("rlock-")
+};
+static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
+ _sock_locks("wlock-")
+};
+static const char *const af_family_elock_key_strings[AF_MAX+1] = {
+ _sock_locks("elock-")
};
-#endif
/*
- * sk_callback_lock locking rules are per-address-family,
+ * sk_callback_lock and sk queues locking rules are per-address-family,
* so split the lock classes by using a per-AF key:
*/
static struct lock_class_key af_callback_keys[AF_MAX];
+static struct lock_class_key af_rlock_keys[AF_MAX];
+static struct lock_class_key af_wlock_keys[AF_MAX];
+static struct lock_class_key af_elock_keys[AF_MAX];
+static struct lock_class_key af_kern_callback_keys[AF_MAX];
+
+/* Run time adjustable parameters. */
+__u32 sysctl_wmem_max __read_mostly = 4 << 20;
+EXPORT_SYMBOL(sysctl_wmem_max);
+__u32 sysctl_rmem_max __read_mostly = 4 << 20;
+EXPORT_SYMBOL(sysctl_rmem_max);
+__u32 sysctl_wmem_default __read_mostly = SK_WMEM_DEFAULT;
+__u32 sysctl_rmem_default __read_mostly = SK_RMEM_DEFAULT;
+
+DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
+EXPORT_SYMBOL_GPL(memalloc_socks_key);
-/* Take into consideration the size of the struct sk_buff overhead in the
- * determination of these values, since that is non-constant across
- * platforms. This makes socket queueing behavior and performance
- * not depend upon such differences.
+/**
+ * sk_set_memalloc - sets %SOCK_MEMALLOC
+ * @sk: socket to set it on
+ *
+ * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
+ * It's the responsibility of the admin to adjust min_free_kbytes
+ * to meet the requirements
*/
-#define _SK_MEM_PACKETS 256
-#define _SK_MEM_OVERHEAD (sizeof(struct sk_buff) + 256)
-#define SK_WMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
-#define SK_RMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
+void sk_set_memalloc(struct sock *sk)
+{
+ sock_set_flag(sk, SOCK_MEMALLOC);
+ sk->sk_allocation |= __GFP_MEMALLOC;
+ static_branch_inc(&memalloc_socks_key);
+}
+EXPORT_SYMBOL_GPL(sk_set_memalloc);
-/* Run time adjustable parameters. */
-__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
-__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
-__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
-__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
+void sk_clear_memalloc(struct sock *sk)
+{
+ sock_reset_flag(sk, SOCK_MEMALLOC);
+ sk->sk_allocation &= ~__GFP_MEMALLOC;
+ static_branch_dec(&memalloc_socks_key);
-/* Maximal space eaten by iovec or ancilliary data plus some space */
-int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
+ /*
+ * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
+ * progress of swapping. SOCK_MEMALLOC may be cleared while
+ * it has rmem allocations due to the last swapfile being deactivated
+ * but there is a risk that the socket is unusable due to exceeding
+ * the rmem limits. Reclaim the reserves and obey rmem limits again.
+ */
+ sk_mem_reclaim(sk);
+}
+EXPORT_SYMBOL_GPL(sk_clear_memalloc);
-static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
+int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
{
- struct timeval tv;
+ int ret;
+ unsigned int noreclaim_flag;
- if (optlen < sizeof(tv))
- return -EINVAL;
- if (copy_from_user(&tv, optval, sizeof(tv)))
- return -EFAULT;
+ /* these should have been dropped before queueing */
+ BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
+
+ noreclaim_flag = memalloc_noreclaim_save();
+ ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
+ tcp_v6_do_rcv,
+ tcp_v4_do_rcv,
+ sk, skb);
+ memalloc_noreclaim_restore(noreclaim_flag);
+
+ return ret;
+}
+EXPORT_SYMBOL(__sk_backlog_rcv);
+
+void sk_error_report(struct sock *sk)
+{
+ sk->sk_error_report(sk);
+
+ switch (sk->sk_family) {
+ case AF_INET:
+ fallthrough;
+ case AF_INET6:
+ trace_inet_sk_error_report(sk);
+ break;
+ default:
+ break;
+ }
+}
+EXPORT_SYMBOL(sk_error_report);
+
+int sock_get_timeout(long timeo, void *optval, bool old_timeval)
+{
+ struct __kernel_sock_timeval tv;
+
+ if (timeo == MAX_SCHEDULE_TIMEOUT) {
+ tv.tv_sec = 0;
+ tv.tv_usec = 0;
+ } else {
+ tv.tv_sec = timeo / HZ;
+ tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
+ }
+
+ if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
+ struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
+ *(struct old_timeval32 *)optval = tv32;
+ return sizeof(tv32);
+ }
+
+ if (old_timeval) {
+ struct __kernel_old_timeval old_tv;
+ old_tv.tv_sec = tv.tv_sec;
+ old_tv.tv_usec = tv.tv_usec;
+ *(struct __kernel_old_timeval *)optval = old_tv;
+ return sizeof(old_tv);
+ }
+
+ *(struct __kernel_sock_timeval *)optval = tv;
+ return sizeof(tv);
+}
+EXPORT_SYMBOL(sock_get_timeout);
+
+int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
+ sockptr_t optval, int optlen, bool old_timeval)
+{
+ if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
+ struct old_timeval32 tv32;
+
+ if (optlen < sizeof(tv32))
+ return -EINVAL;
+
+ if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
+ return -EFAULT;
+ tv->tv_sec = tv32.tv_sec;
+ tv->tv_usec = tv32.tv_usec;
+ } else if (old_timeval) {
+ struct __kernel_old_timeval old_tv;
+
+ if (optlen < sizeof(old_tv))
+ return -EINVAL;
+ if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
+ return -EFAULT;
+ tv->tv_sec = old_tv.tv_sec;
+ tv->tv_usec = old_tv.tv_usec;
+ } else {
+ if (optlen < sizeof(*tv))
+ return -EINVAL;
+ if (copy_from_sockptr(tv, optval, sizeof(*tv)))
+ return -EFAULT;
+ }
- *timeo_p = MAX_SCHEDULE_TIMEOUT;
- if (tv.tv_sec == 0 && tv.tv_usec == 0)
+ return 0;
+}
+EXPORT_SYMBOL(sock_copy_user_timeval);
+
+static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
+ bool old_timeval)
+{
+ struct __kernel_sock_timeval tv;
+ int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
+ long val;
+
+ if (err)
+ return err;
+
+ if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
+ return -EDOM;
+
+ if (tv.tv_sec < 0) {
+ static int warned __read_mostly;
+
+ WRITE_ONCE(*timeo_p, 0);
+ if (warned < 10 && net_ratelimit()) {
+ warned++;
+ pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
+ __func__, current->comm, task_pid_nr(current));
+ }
return 0;
- if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
- *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
+ }
+ val = MAX_SCHEDULE_TIMEOUT;
+ if ((tv.tv_sec || tv.tv_usec) &&
+ (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)))
+ val = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec,
+ USEC_PER_SEC / HZ);
+ WRITE_ONCE(*timeo_p, val);
return 0;
}
-static void sock_warn_obsolete_bsdism(const char *name)
+static bool sk_set_prio_allowed(const struct sock *sk, int val)
{
- static int warned;
- static char warncomm[TASK_COMM_LEN];
- if (strcmp(warncomm, current->comm) && warned < 5) {
- strcpy(warncomm, current->comm);
- printk(KERN_WARNING "process `%s' is using obsolete "
- "%s SO_BSDCOMPAT\n", warncomm, name);
- warned++;
+ return ((val >= TC_PRIO_BESTEFFORT && val <= TC_PRIO_INTERACTIVE) ||
+ sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
+ sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN));
+}
+
+static bool sock_needs_netstamp(const struct sock *sk)
+{
+ switch (sk->sk_family) {
+ case AF_UNSPEC:
+ case AF_UNIX:
+ return false;
+ default:
+ return true;
}
}
-static void sock_disable_timestamp(struct sock *sk)
-{
- if (sock_flag(sk, SOCK_TIMESTAMP)) {
- sock_reset_flag(sk, SOCK_TIMESTAMP);
- net_disable_timestamp();
+static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
+{
+ if (sk->sk_flags & flags) {
+ sk->sk_flags &= ~flags;
+ if (sock_needs_netstamp(sk) &&
+ !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
+ net_disable_timestamp();
}
}
-int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
- int err = 0;
- int skb_len;
+ unsigned long flags;
+ struct sk_buff_head *list = &sk->sk_receive_queue;
- /* Cast skb->rcvbuf to unsigned... It's pointless, but reduces
- number of warnings when compiling with -W --ANK
- */
- if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
- (unsigned)sk->sk_rcvbuf) {
- err = -ENOMEM;
- goto out;
+ if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) {
+ sk_drops_inc(sk);
+ trace_sock_rcvqueue_full(sk, skb);
+ return -ENOMEM;
}
- err = sk_filter(sk, skb);
- if (err)
- goto out;
+ if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
+ sk_drops_inc(sk);
+ return -ENOBUFS;
+ }
skb->dev = NULL;
skb_set_owner_r(skb, sk);
- /* Cache the SKB length before we tack it onto the receive
- * queue. Once it is added it no longer belongs to us and
- * may be freed by other threads of control pulling packets
- * from the queue.
+ /* we escape from rcu protected region, make sure we dont leak
+ * a norefcounted dst
*/
- skb_len = skb->len;
+ skb_dst_force(skb);
- skb_queue_tail(&sk->sk_receive_queue, skb);
+ spin_lock_irqsave(&list->lock, flags);
+ sock_skb_set_dropcount(sk, skb);
+ __skb_queue_tail(list, skb);
+ spin_unlock_irqrestore(&list->lock, flags);
if (!sock_flag(sk, SOCK_DEAD))
- sk->sk_data_ready(sk, skb_len);
+ sk->sk_data_ready(sk);
+ return 0;
+}
+EXPORT_SYMBOL(__sock_queue_rcv_skb);
+
+int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
+ enum skb_drop_reason *reason)
+{
+ enum skb_drop_reason drop_reason;
+ int err;
+
+ err = sk_filter_reason(sk, skb, &drop_reason);
+ if (err)
+ goto out;
+
+ err = __sock_queue_rcv_skb(sk, skb);
+ switch (err) {
+ case -ENOMEM:
+ drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
+ break;
+ case -ENOBUFS:
+ drop_reason = SKB_DROP_REASON_PROTO_MEM;
+ break;
+ default:
+ drop_reason = SKB_NOT_DROPPED_YET;
+ break;
+ }
out:
+ if (reason)
+ *reason = drop_reason;
return err;
}
-EXPORT_SYMBOL(sock_queue_rcv_skb);
+EXPORT_SYMBOL(sock_queue_rcv_skb_reason);
-int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
+int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
+ const int nested, unsigned int trim_cap, bool refcounted)
{
+ enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
int rc = NET_RX_SUCCESS;
+ int err;
- if (sk_filter(sk, skb))
+ if (sk_filter_trim_cap(sk, skb, trim_cap, &reason))
goto discard_and_relse;
skb->dev = NULL;
+ if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) {
+ sk_drops_inc(sk);
+ reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
+ goto discard_and_relse;
+ }
if (nested)
bh_lock_sock_nested(sk);
else
@@ -289,27 +576,44 @@ int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
*/
mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
- rc = sk->sk_backlog_rcv(sk, skb);
+ rc = sk_backlog_rcv(sk, skb);
+
+ mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
+ } else if ((err = sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf)))) {
+ bh_unlock_sock(sk);
+ if (err == -ENOMEM)
+ reason = SKB_DROP_REASON_PFMEMALLOC;
+ if (err == -ENOBUFS)
+ reason = SKB_DROP_REASON_SOCKET_BACKLOG;
+ sk_drops_inc(sk);
+ goto discard_and_relse;
+ }
- mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
- } else
- sk_add_backlog(sk, skb);
bh_unlock_sock(sk);
out:
- sock_put(sk);
+ if (refcounted)
+ sock_put(sk);
return rc;
discard_and_relse:
- kfree_skb(skb);
+ sk_skb_reason_drop(sk, skb, reason);
goto out;
}
-EXPORT_SYMBOL(sk_receive_skb);
+EXPORT_SYMBOL(__sk_receive_skb);
+INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
+ u32));
+INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
+ u32));
struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
- struct dst_entry *dst = sk->sk_dst_cache;
-
- if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
- sk->sk_dst_cache = NULL;
+ struct dst_entry *dst = __sk_dst_get(sk);
+
+ if (dst && READ_ONCE(dst->obsolete) &&
+ INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
+ dst, cookie) == NULL) {
+ sk_tx_queue_clear(sk);
+ WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
+ RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
dst_release(dst);
return NULL;
}
@@ -322,7 +626,9 @@ struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
struct dst_entry *dst = sk_dst_get(sk);
- if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
+ if (dst && READ_ONCE(dst->obsolete) &&
+ INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
+ dst, cookie) == NULL) {
sk_dst_reset(sk);
dst_release(dst);
return NULL;
@@ -332,474 +638,1536 @@ struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
}
EXPORT_SYMBOL(sk_dst_check);
+static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
+{
+ int ret = -ENOPROTOOPT;
+#ifdef CONFIG_NETDEVICES
+ struct net *net = sock_net(sk);
+
+ /* Sorry... */
+ ret = -EPERM;
+ if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
+ goto out;
+
+ ret = -EINVAL;
+ if (ifindex < 0)
+ goto out;
+
+ /* Paired with all READ_ONCE() done locklessly. */
+ WRITE_ONCE(sk->sk_bound_dev_if, ifindex);
+
+ if (sk->sk_prot->rehash)
+ sk->sk_prot->rehash(sk);
+ sk_dst_reset(sk);
+
+ ret = 0;
+
+out:
+#endif
+
+ return ret;
+}
+
+int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
+{
+ int ret;
+
+ if (lock_sk)
+ lock_sock(sk);
+ ret = sock_bindtoindex_locked(sk, ifindex);
+ if (lock_sk)
+ release_sock(sk);
+
+ return ret;
+}
+EXPORT_SYMBOL(sock_bindtoindex);
+
+static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
+{
+ int ret = -ENOPROTOOPT;
+#ifdef CONFIG_NETDEVICES
+ struct net *net = sock_net(sk);
+ char devname[IFNAMSIZ];
+ int index;
+
+ ret = -EINVAL;
+ if (optlen < 0)
+ goto out;
+
+ /* Bind this socket to a particular device like "eth0",
+ * as specified in the passed interface name. If the
+ * name is "" or the option length is zero the socket
+ * is not bound.
+ */
+ if (optlen > IFNAMSIZ - 1)
+ optlen = IFNAMSIZ - 1;
+ memset(devname, 0, sizeof(devname));
+
+ ret = -EFAULT;
+ if (copy_from_sockptr(devname, optval, optlen))
+ goto out;
+
+ index = 0;
+ if (devname[0] != '\0') {
+ struct net_device *dev;
+
+ rcu_read_lock();
+ dev = dev_get_by_name_rcu(net, devname);
+ if (dev)
+ index = dev->ifindex;
+ rcu_read_unlock();
+ ret = -ENODEV;
+ if (!dev)
+ goto out;
+ }
+
+ sockopt_lock_sock(sk);
+ ret = sock_bindtoindex_locked(sk, index);
+ sockopt_release_sock(sk);
+out:
+#endif
+
+ return ret;
+}
+
+static int sock_getbindtodevice(struct sock *sk, sockptr_t optval,
+ sockptr_t optlen, int len)
+{
+ int ret = -ENOPROTOOPT;
+#ifdef CONFIG_NETDEVICES
+ int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
+ struct net *net = sock_net(sk);
+ char devname[IFNAMSIZ];
+
+ if (bound_dev_if == 0) {
+ len = 0;
+ goto zero;
+ }
+
+ ret = -EINVAL;
+ if (len < IFNAMSIZ)
+ goto out;
+
+ ret = netdev_get_name(net, devname, bound_dev_if);
+ if (ret)
+ goto out;
+
+ len = strlen(devname) + 1;
+
+ ret = -EFAULT;
+ if (copy_to_sockptr(optval, devname, len))
+ goto out;
+
+zero:
+ ret = -EFAULT;
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
+ goto out;
+
+ ret = 0;
+
+out:
+#endif
+
+ return ret;
+}
+
+bool sk_mc_loop(const struct sock *sk)
+{
+ if (dev_recursion_level())
+ return false;
+ if (!sk)
+ return true;
+ /* IPV6_ADDRFORM can change sk->sk_family under us. */
+ switch (READ_ONCE(sk->sk_family)) {
+ case AF_INET:
+ return inet_test_bit(MC_LOOP, sk);
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6:
+ return inet6_test_bit(MC6_LOOP, sk);
+#endif
+ }
+ WARN_ON_ONCE(1);
+ return true;
+}
+EXPORT_SYMBOL(sk_mc_loop);
+
+void sock_set_reuseaddr(struct sock *sk)
+{
+ lock_sock(sk);
+ sk->sk_reuse = SK_CAN_REUSE;
+ release_sock(sk);
+}
+EXPORT_SYMBOL(sock_set_reuseaddr);
+
+void sock_set_reuseport(struct sock *sk)
+{
+ lock_sock(sk);
+ sk->sk_reuseport = true;
+ release_sock(sk);
+}
+EXPORT_SYMBOL(sock_set_reuseport);
+
+void sock_no_linger(struct sock *sk)
+{
+ lock_sock(sk);
+ WRITE_ONCE(sk->sk_lingertime, 0);
+ sock_set_flag(sk, SOCK_LINGER);
+ release_sock(sk);
+}
+EXPORT_SYMBOL(sock_no_linger);
+
+void sock_set_priority(struct sock *sk, u32 priority)
+{
+ WRITE_ONCE(sk->sk_priority, priority);
+}
+EXPORT_SYMBOL(sock_set_priority);
+
+void sock_set_sndtimeo(struct sock *sk, s64 secs)
+{
+ if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
+ WRITE_ONCE(sk->sk_sndtimeo, secs * HZ);
+ else
+ WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT);
+}
+EXPORT_SYMBOL(sock_set_sndtimeo);
+
+static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
+{
+ sock_valbool_flag(sk, SOCK_RCVTSTAMP, val);
+ sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, val && ns);
+ if (val) {
+ sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
+ sock_enable_timestamp(sk, SOCK_TIMESTAMP);
+ }
+}
+
+void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
+{
+ switch (optname) {
+ case SO_TIMESTAMP_OLD:
+ __sock_set_timestamps(sk, valbool, false, false);
+ break;
+ case SO_TIMESTAMP_NEW:
+ __sock_set_timestamps(sk, valbool, true, false);
+ break;
+ case SO_TIMESTAMPNS_OLD:
+ __sock_set_timestamps(sk, valbool, false, true);
+ break;
+ case SO_TIMESTAMPNS_NEW:
+ __sock_set_timestamps(sk, valbool, true, true);
+ break;
+ }
+}
+
+static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
+{
+ struct net *net = sock_net(sk);
+ struct net_device *dev = NULL;
+ bool match = false;
+ int *vclock_index;
+ int i, num;
+
+ if (sk->sk_bound_dev_if)
+ dev = dev_get_by_index(net, sk->sk_bound_dev_if);
+
+ if (!dev) {
+ pr_err("%s: sock not bind to device\n", __func__);
+ return -EOPNOTSUPP;
+ }
+
+ num = ethtool_get_phc_vclocks(dev, &vclock_index);
+ dev_put(dev);
+
+ for (i = 0; i < num; i++) {
+ if (*(vclock_index + i) == phc_index) {
+ match = true;
+ break;
+ }
+ }
+
+ if (num > 0)
+ kfree(vclock_index);
+
+ if (!match)
+ return -EINVAL;
+
+ WRITE_ONCE(sk->sk_bind_phc, phc_index);
+
+ return 0;
+}
+
+int sock_set_timestamping(struct sock *sk, int optname,
+ struct so_timestamping timestamping)
+{
+ int val = timestamping.flags;
+ int ret;
+
+ if (val & ~SOF_TIMESTAMPING_MASK)
+ return -EINVAL;
+
+ if (val & SOF_TIMESTAMPING_OPT_ID_TCP &&
+ !(val & SOF_TIMESTAMPING_OPT_ID))
+ return -EINVAL;
+
+ if (val & SOF_TIMESTAMPING_OPT_ID &&
+ !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
+ if (sk_is_tcp(sk)) {
+ if ((1 << sk->sk_state) &
+ (TCPF_CLOSE | TCPF_LISTEN))
+ return -EINVAL;
+ if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
+ atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq);
+ else
+ atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
+ } else {
+ atomic_set(&sk->sk_tskey, 0);
+ }
+ }
+
+ if (val & SOF_TIMESTAMPING_OPT_STATS &&
+ !(val & SOF_TIMESTAMPING_OPT_TSONLY))
+ return -EINVAL;
+
+ if (val & SOF_TIMESTAMPING_BIND_PHC) {
+ ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
+ if (ret)
+ return ret;
+ }
+
+ WRITE_ONCE(sk->sk_tsflags, val);
+ sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
+ sock_valbool_flag(sk, SOCK_TIMESTAMPING_ANY, !!(val & TSFLAGS_ANY));
+
+ if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
+ sock_enable_timestamp(sk,
+ SOCK_TIMESTAMPING_RX_SOFTWARE);
+ else
+ sock_disable_timestamp(sk,
+ (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
+ return 0;
+}
+
+#if defined(CONFIG_CGROUP_BPF)
+void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op)
+{
+ struct bpf_sock_ops_kern sock_ops;
+
+ memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
+ sock_ops.op = op;
+ sock_ops.is_fullsock = 1;
+ sock_ops.sk = sk;
+ bpf_skops_init_skb(&sock_ops, skb, 0);
+ __cgroup_bpf_run_filter_sock_ops(sk, &sock_ops, CGROUP_SOCK_OPS);
+}
+#endif
+
+void sock_set_keepalive(struct sock *sk)
+{
+ lock_sock(sk);
+ if (sk->sk_prot->keepalive)
+ sk->sk_prot->keepalive(sk, true);
+ sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
+ release_sock(sk);
+}
+EXPORT_SYMBOL(sock_set_keepalive);
+
+static void __sock_set_rcvbuf(struct sock *sk, int val)
+{
+ /* Ensure val * 2 fits into an int, to prevent max_t() from treating it
+ * as a negative value.
+ */
+ val = min_t(int, val, INT_MAX / 2);
+ sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
+
+ /* We double it on the way in to account for "struct sk_buff" etc.
+ * overhead. Applications assume that the SO_RCVBUF setting they make
+ * will allow that much actual data to be received on that socket.
+ *
+ * Applications are unaware that "struct sk_buff" and other overheads
+ * allocate from the receive buffer during socket buffer allocation.
+ *
+ * And after considering the possible alternatives, returning the value
+ * we actually used in getsockopt is the most desirable behavior.
+ */
+ WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
+}
+
+void sock_set_rcvbuf(struct sock *sk, int val)
+{
+ lock_sock(sk);
+ __sock_set_rcvbuf(sk, val);
+ release_sock(sk);
+}
+EXPORT_SYMBOL(sock_set_rcvbuf);
+
+static void __sock_set_mark(struct sock *sk, u32 val)
+{
+ if (val != sk->sk_mark) {
+ WRITE_ONCE(sk->sk_mark, val);
+ sk_dst_reset(sk);
+ }
+}
+
+void sock_set_mark(struct sock *sk, u32 val)
+{
+ lock_sock(sk);
+ __sock_set_mark(sk, val);
+ release_sock(sk);
+}
+EXPORT_SYMBOL(sock_set_mark);
+
+static void sock_release_reserved_memory(struct sock *sk, int bytes)
+{
+ /* Round down bytes to multiple of pages */
+ bytes = round_down(bytes, PAGE_SIZE);
+
+ WARN_ON(bytes > sk->sk_reserved_mem);
+ WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem - bytes);
+ sk_mem_reclaim(sk);
+}
+
+static int sock_reserve_memory(struct sock *sk, int bytes)
+{
+ long allocated;
+ bool charged;
+ int pages;
+
+ if (!mem_cgroup_sk_enabled(sk) || !sk_has_account(sk))
+ return -EOPNOTSUPP;
+
+ if (!bytes)
+ return 0;
+
+ pages = sk_mem_pages(bytes);
+
+ /* pre-charge to memcg */
+ charged = mem_cgroup_sk_charge(sk, pages,
+ GFP_KERNEL | __GFP_RETRY_MAYFAIL);
+ if (!charged)
+ return -ENOMEM;
+
+ if (sk->sk_bypass_prot_mem)
+ goto success;
+
+ /* pre-charge to forward_alloc */
+ sk_memory_allocated_add(sk, pages);
+ allocated = sk_memory_allocated(sk);
+
+ /* If the system goes into memory pressure with this
+ * precharge, give up and return error.
+ */
+ if (allocated > sk_prot_mem_limits(sk, 1)) {
+ sk_memory_allocated_sub(sk, pages);
+ mem_cgroup_sk_uncharge(sk, pages);
+ return -ENOMEM;
+ }
+
+success:
+ sk_forward_alloc_add(sk, pages << PAGE_SHIFT);
+
+ WRITE_ONCE(sk->sk_reserved_mem,
+ sk->sk_reserved_mem + (pages << PAGE_SHIFT));
+
+ return 0;
+}
+
+#ifdef CONFIG_PAGE_POOL
+
+/* This is the number of tokens and frags that the user can SO_DEVMEM_DONTNEED
+ * in 1 syscall. The limit exists to limit the amount of memory the kernel
+ * allocates to copy these tokens, and to prevent looping over the frags for
+ * too long.
+ */
+#define MAX_DONTNEED_TOKENS 128
+#define MAX_DONTNEED_FRAGS 1024
+
+static noinline_for_stack int
+sock_devmem_dontneed(struct sock *sk, sockptr_t optval, unsigned int optlen)
+{
+ unsigned int num_tokens, i, j, k, netmem_num = 0;
+ struct dmabuf_token *tokens;
+ int ret = 0, num_frags = 0;
+ netmem_ref netmems[16];
+
+ if (!sk_is_tcp(sk))
+ return -EBADF;
+
+ if (optlen % sizeof(*tokens) ||
+ optlen > sizeof(*tokens) * MAX_DONTNEED_TOKENS)
+ return -EINVAL;
+
+ num_tokens = optlen / sizeof(*tokens);
+ tokens = kvmalloc_array(num_tokens, sizeof(*tokens), GFP_KERNEL);
+ if (!tokens)
+ return -ENOMEM;
+
+ if (copy_from_sockptr(tokens, optval, optlen)) {
+ kvfree(tokens);
+ return -EFAULT;
+ }
+
+ xa_lock_bh(&sk->sk_user_frags);
+ for (i = 0; i < num_tokens; i++) {
+ for (j = 0; j < tokens[i].token_count; j++) {
+ if (++num_frags > MAX_DONTNEED_FRAGS)
+ goto frag_limit_reached;
+
+ netmem_ref netmem = (__force netmem_ref)__xa_erase(
+ &sk->sk_user_frags, tokens[i].token_start + j);
+
+ if (!netmem || WARN_ON_ONCE(!netmem_is_net_iov(netmem)))
+ continue;
+
+ netmems[netmem_num++] = netmem;
+ if (netmem_num == ARRAY_SIZE(netmems)) {
+ xa_unlock_bh(&sk->sk_user_frags);
+ for (k = 0; k < netmem_num; k++)
+ WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
+ netmem_num = 0;
+ xa_lock_bh(&sk->sk_user_frags);
+ }
+ ret++;
+ }
+ }
+
+frag_limit_reached:
+ xa_unlock_bh(&sk->sk_user_frags);
+ for (k = 0; k < netmem_num; k++)
+ WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
+
+ kvfree(tokens);
+ return ret;
+}
+#endif
+
+void sockopt_lock_sock(struct sock *sk)
+{
+ /* When current->bpf_ctx is set, the setsockopt is called from
+ * a bpf prog. bpf has ensured the sk lock has been
+ * acquired before calling setsockopt().
+ */
+ if (has_current_bpf_ctx())
+ return;
+
+ lock_sock(sk);
+}
+EXPORT_SYMBOL(sockopt_lock_sock);
+
+void sockopt_release_sock(struct sock *sk)
+{
+ if (has_current_bpf_ctx())
+ return;
+
+ release_sock(sk);
+}
+EXPORT_SYMBOL(sockopt_release_sock);
+
+bool sockopt_ns_capable(struct user_namespace *ns, int cap)
+{
+ return has_current_bpf_ctx() || ns_capable(ns, cap);
+}
+EXPORT_SYMBOL(sockopt_ns_capable);
+
+bool sockopt_capable(int cap)
+{
+ return has_current_bpf_ctx() || capable(cap);
+}
+EXPORT_SYMBOL(sockopt_capable);
+
+static int sockopt_validate_clockid(__kernel_clockid_t value)
+{
+ switch (value) {
+ case CLOCK_REALTIME:
+ case CLOCK_MONOTONIC:
+ case CLOCK_TAI:
+ return 0;
+ }
+ return -EINVAL;
+}
+
/*
* This is meant for all protocols to use and covers goings on
* at the socket level. Everything here is generic.
*/
-int sock_setsockopt(struct socket *sock, int level, int optname,
- char __user *optval, int optlen)
+int sk_setsockopt(struct sock *sk, int level, int optname,
+ sockptr_t optval, unsigned int optlen)
{
- struct sock *sk=sock->sk;
- struct sk_filter *filter;
+ struct so_timestamping timestamping;
+ struct socket *sock = sk->sk_socket;
+ struct sock_txtime sk_txtime;
int val;
int valbool;
struct linger ling;
int ret = 0;
-
+
/*
* Options without arguments
*/
-#ifdef SO_DONTLINGER /* Compatibility item... */
- if (optname == SO_DONTLINGER) {
- lock_sock(sk);
- sock_reset_flag(sk, SOCK_LINGER);
- release_sock(sk);
+ if (optname == SO_BINDTODEVICE)
+ return sock_setbindtodevice(sk, optval, optlen);
+
+ if (optlen < sizeof(int))
+ return -EINVAL;
+
+ if (copy_from_sockptr(&val, optval, sizeof(val)))
+ return -EFAULT;
+
+ valbool = val ? 1 : 0;
+
+ /* handle options which do not require locking the socket. */
+ switch (optname) {
+ case SO_PRIORITY:
+ if (sk_set_prio_allowed(sk, val)) {
+ sock_set_priority(sk, val);
+ return 0;
+ }
+ return -EPERM;
+ case SO_TYPE:
+ case SO_PROTOCOL:
+ case SO_DOMAIN:
+ case SO_ERROR:
+ return -ENOPROTOOPT;
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ case SO_BUSY_POLL:
+ if (val < 0)
+ return -EINVAL;
+ WRITE_ONCE(sk->sk_ll_usec, val);
+ return 0;
+ case SO_PREFER_BUSY_POLL:
+ if (valbool && !sockopt_capable(CAP_NET_ADMIN))
+ return -EPERM;
+ WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
+ return 0;
+ case SO_BUSY_POLL_BUDGET:
+ if (val > READ_ONCE(sk->sk_busy_poll_budget) &&
+ !sockopt_capable(CAP_NET_ADMIN))
+ return -EPERM;
+ if (val < 0 || val > U16_MAX)
+ return -EINVAL;
+ WRITE_ONCE(sk->sk_busy_poll_budget, val);
return 0;
- }
#endif
-
- if(optlen<sizeof(int))
- return(-EINVAL);
-
- if (get_user(val, (int __user *)optval))
- return -EFAULT;
-
- valbool = val?1:0;
+ case SO_MAX_PACING_RATE:
+ {
+ unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
+ unsigned long pacing_rate;
- lock_sock(sk);
+ if (sizeof(ulval) != sizeof(val) &&
+ optlen >= sizeof(ulval) &&
+ copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
+ return -EFAULT;
+ }
+ if (ulval != ~0UL)
+ cmpxchg(&sk->sk_pacing_status,
+ SK_PACING_NONE,
+ SK_PACING_NEEDED);
+ /* Pairs with READ_ONCE() from sk_getsockopt() */
+ WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
+ pacing_rate = READ_ONCE(sk->sk_pacing_rate);
+ if (ulval < pacing_rate)
+ WRITE_ONCE(sk->sk_pacing_rate, ulval);
+ return 0;
+ }
+ case SO_TXREHASH:
+ if (!sk_is_tcp(sk))
+ return -EOPNOTSUPP;
+ if (val < -1 || val > 1)
+ return -EINVAL;
+ if ((u8)val == SOCK_TXREHASH_DEFAULT)
+ val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
+ /* Paired with READ_ONCE() in tcp_rtx_synack()
+ * and sk_getsockopt().
+ */
+ WRITE_ONCE(sk->sk_txrehash, (u8)val);
+ return 0;
+ case SO_PEEK_OFF:
+ {
+ int (*set_peek_off)(struct sock *sk, int val);
+
+ set_peek_off = READ_ONCE(sock->ops)->set_peek_off;
+ if (set_peek_off)
+ ret = set_peek_off(sk, val);
+ else
+ ret = -EOPNOTSUPP;
+ return ret;
+ }
+#ifdef CONFIG_PAGE_POOL
+ case SO_DEVMEM_DONTNEED:
+ return sock_devmem_dontneed(sk, optval, optlen);
+#endif
+ case SO_SNDTIMEO_OLD:
+ case SO_SNDTIMEO_NEW:
+ return sock_set_timeout(&sk->sk_sndtimeo, optval,
+ optlen, optname == SO_SNDTIMEO_OLD);
+ case SO_RCVTIMEO_OLD:
+ case SO_RCVTIMEO_NEW:
+ return sock_set_timeout(&sk->sk_rcvtimeo, optval,
+ optlen, optname == SO_RCVTIMEO_OLD);
+ }
- switch(optname)
- {
- case SO_DEBUG:
- if(val && !capable(CAP_NET_ADMIN))
- {
- ret = -EACCES;
- }
- else if (valbool)
- sock_set_flag(sk, SOCK_DBG);
- else
- sock_reset_flag(sk, SOCK_DBG);
- break;
- case SO_REUSEADDR:
- sk->sk_reuse = valbool;
- break;
- case SO_TYPE:
- case SO_ERROR:
- ret = -ENOPROTOOPT;
- break;
- case SO_DONTROUTE:
- if (valbool)
- sock_set_flag(sk, SOCK_LOCALROUTE);
- else
- sock_reset_flag(sk, SOCK_LOCALROUTE);
- break;
- case SO_BROADCAST:
- sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
- break;
- case SO_SNDBUF:
- /* Don't error on this BSD doesn't and if you think
- about it this is right. Otherwise apps have to
- play 'guess the biggest size' games. RCVBUF/SNDBUF
- are treated in BSD as hints */
-
- if (val > sysctl_wmem_max)
- val = sysctl_wmem_max;
+ sockopt_lock_sock(sk);
+
+ switch (optname) {
+ case SO_DEBUG:
+ if (val && !sockopt_capable(CAP_NET_ADMIN))
+ ret = -EACCES;
+ else
+ sock_valbool_flag(sk, SOCK_DBG, valbool);
+ break;
+ case SO_REUSEADDR:
+ sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
+ break;
+ case SO_REUSEPORT:
+ if (valbool && !sk_is_inet(sk))
+ ret = -EOPNOTSUPP;
+ else
+ sk->sk_reuseport = valbool;
+ break;
+ case SO_DONTROUTE:
+ sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
+ sk_dst_reset(sk);
+ break;
+ case SO_BROADCAST:
+ sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
+ break;
+ case SO_SNDBUF:
+ /* Don't error on this BSD doesn't and if you think
+ * about it this is right. Otherwise apps have to
+ * play 'guess the biggest size' games. RCVBUF/SNDBUF
+ * are treated in BSD as hints
+ */
+ val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
set_sndbuf:
- sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
- if ((val * 2) < SOCK_MIN_SNDBUF)
- sk->sk_sndbuf = SOCK_MIN_SNDBUF;
- else
- sk->sk_sndbuf = val * 2;
+ /* Ensure val * 2 fits into an int, to prevent max_t()
+ * from treating it as a negative value.
+ */
+ val = min_t(int, val, INT_MAX / 2);
+ sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
+ WRITE_ONCE(sk->sk_sndbuf,
+ max_t(int, val * 2, SOCK_MIN_SNDBUF));
+ /* Wake up sending tasks if we upped the value. */
+ sk->sk_write_space(sk);
+ break;
- /*
- * Wake up sending tasks if we
- * upped the value.
- */
- sk->sk_write_space(sk);
- break;
-
- case SO_SNDBUFFORCE:
- if (!capable(CAP_NET_ADMIN)) {
- ret = -EPERM;
- break;
- }
- goto set_sndbuf;
-
- case SO_RCVBUF:
- /* Don't error on this BSD doesn't and if you think
- about it this is right. Otherwise apps have to
- play 'guess the biggest size' games. RCVBUF/SNDBUF
- are treated in BSD as hints */
-
- if (val > sysctl_rmem_max)
- val = sysctl_rmem_max;
-set_rcvbuf:
- sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
- /*
- * We double it on the way in to account for
- * "struct sk_buff" etc. overhead. Applications
- * assume that the SO_RCVBUF setting they make will
- * allow that much actual data to be received on that
- * socket.
- *
- * Applications are unaware that "struct sk_buff" and
- * other overheads allocate from the receive buffer
- * during socket buffer allocation.
- *
- * And after considering the possible alternatives,
- * returning the value we actually used in getsockopt
- * is the most desirable behavior.
- */
- if ((val * 2) < SOCK_MIN_RCVBUF)
- sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
- else
- sk->sk_rcvbuf = val * 2;
+ case SO_SNDBUFFORCE:
+ if (!sockopt_capable(CAP_NET_ADMIN)) {
+ ret = -EPERM;
break;
+ }
- case SO_RCVBUFFORCE:
- if (!capable(CAP_NET_ADMIN)) {
- ret = -EPERM;
- break;
- }
- goto set_rcvbuf;
+ /* No negative values (to prevent underflow, as val will be
+ * multiplied by 2).
+ */
+ if (val < 0)
+ val = 0;
+ goto set_sndbuf;
+
+ case SO_RCVBUF:
+ /* Don't error on this BSD doesn't and if you think
+ * about it this is right. Otherwise apps have to
+ * play 'guess the biggest size' games. RCVBUF/SNDBUF
+ * are treated in BSD as hints
+ */
+ __sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
+ break;
- case SO_KEEPALIVE:
-#ifdef CONFIG_INET
- if (sk->sk_protocol == IPPROTO_TCP)
- tcp_set_keepalive(sk, valbool);
-#endif
- sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
+ case SO_RCVBUFFORCE:
+ if (!sockopt_capable(CAP_NET_ADMIN)) {
+ ret = -EPERM;
break;
+ }
- case SO_OOBINLINE:
- sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
+ /* No negative values (to prevent underflow, as val will be
+ * multiplied by 2).
+ */
+ __sock_set_rcvbuf(sk, max(val, 0));
+ break;
+
+ case SO_KEEPALIVE:
+ if (sk->sk_prot->keepalive)
+ sk->sk_prot->keepalive(sk, valbool);
+ sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
+ break;
+
+ case SO_OOBINLINE:
+ sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
+ break;
+
+ case SO_NO_CHECK:
+ sk->sk_no_check_tx = valbool;
+ break;
+
+ case SO_LINGER:
+ if (optlen < sizeof(ling)) {
+ ret = -EINVAL; /* 1003.1g */
break;
-
- case SO_NO_CHECK:
- sk->sk_no_check = valbool;
+ }
+ if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
+ ret = -EFAULT;
break;
+ }
+ if (!ling.l_onoff) {
+ sock_reset_flag(sk, SOCK_LINGER);
+ } else {
+ unsigned long t_sec = ling.l_linger;
- case SO_PRIORITY:
- if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
- sk->sk_priority = val;
+ if (t_sec >= MAX_SCHEDULE_TIMEOUT / HZ)
+ WRITE_ONCE(sk->sk_lingertime, MAX_SCHEDULE_TIMEOUT);
else
- ret = -EPERM;
- break;
-
- case SO_LINGER:
- if(optlen<sizeof(ling)) {
- ret = -EINVAL; /* 1003.1g */
- break;
- }
- if (copy_from_user(&ling,optval,sizeof(ling))) {
+ WRITE_ONCE(sk->sk_lingertime, t_sec * HZ);
+ sock_set_flag(sk, SOCK_LINGER);
+ }
+ break;
+
+ case SO_BSDCOMPAT:
+ break;
+
+ case SO_TIMESTAMP_OLD:
+ case SO_TIMESTAMP_NEW:
+ case SO_TIMESTAMPNS_OLD:
+ case SO_TIMESTAMPNS_NEW:
+ sock_set_timestamp(sk, optname, valbool);
+ break;
+
+ case SO_TIMESTAMPING_NEW:
+ case SO_TIMESTAMPING_OLD:
+ if (optlen == sizeof(timestamping)) {
+ if (copy_from_sockptr(&timestamping, optval,
+ sizeof(timestamping))) {
ret = -EFAULT;
break;
}
- if (!ling.l_onoff)
- sock_reset_flag(sk, SOCK_LINGER);
- else {
-#if (BITS_PER_LONG == 32)
- if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
- sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
- else
-#endif
- sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
- sock_set_flag(sk, SOCK_LINGER);
- }
- break;
+ } else {
+ memset(&timestamping, 0, sizeof(timestamping));
+ timestamping.flags = val;
+ }
+ ret = sock_set_timestamping(sk, optname, timestamping);
+ break;
- case SO_BSDCOMPAT:
- sock_warn_obsolete_bsdism("setsockopt");
+ case SO_RCVLOWAT:
+ {
+ int (*set_rcvlowat)(struct sock *sk, int val) = NULL;
+
+ if (val < 0)
+ val = INT_MAX;
+ if (sock)
+ set_rcvlowat = READ_ONCE(sock->ops)->set_rcvlowat;
+ if (set_rcvlowat)
+ ret = set_rcvlowat(sk, val);
+ else
+ WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
+ break;
+ }
+ case SO_ATTACH_FILTER: {
+ struct sock_fprog fprog;
+
+ ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
+ if (!ret)
+ ret = sk_attach_filter(&fprog, sk);
+ break;
+ }
+ case SO_ATTACH_BPF:
+ ret = -EINVAL;
+ if (optlen == sizeof(u32)) {
+ u32 ufd;
+
+ ret = -EFAULT;
+ if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
+ break;
+
+ ret = sk_attach_bpf(ufd, sk);
+ }
+ break;
+
+ case SO_ATTACH_REUSEPORT_CBPF: {
+ struct sock_fprog fprog;
+
+ ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
+ if (!ret)
+ ret = sk_reuseport_attach_filter(&fprog, sk);
+ break;
+ }
+ case SO_ATTACH_REUSEPORT_EBPF:
+ ret = -EINVAL;
+ if (optlen == sizeof(u32)) {
+ u32 ufd;
+
+ ret = -EFAULT;
+ if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
+ break;
+
+ ret = sk_reuseport_attach_bpf(ufd, sk);
+ }
+ break;
+
+ case SO_DETACH_REUSEPORT_BPF:
+ ret = reuseport_detach_prog(sk);
+ break;
+
+ case SO_DETACH_FILTER:
+ ret = sk_detach_filter(sk);
+ break;
+
+ case SO_LOCK_FILTER:
+ if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
+ ret = -EPERM;
+ else
+ sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
+ break;
+
+ case SO_MARK:
+ if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
+ !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
+ ret = -EPERM;
break;
+ }
- case SO_PASSCRED:
- if (valbool)
- set_bit(SOCK_PASSCRED, &sock->flags);
+ __sock_set_mark(sk, val);
+ break;
+ case SO_RCVMARK:
+ sock_valbool_flag(sk, SOCK_RCVMARK, valbool);
+ break;
+
+ case SO_RCVPRIORITY:
+ sock_valbool_flag(sk, SOCK_RCVPRIORITY, valbool);
+ break;
+
+ case SO_RXQ_OVFL:
+ sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
+ break;
+
+ case SO_WIFI_STATUS:
+ sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
+ break;
+
+ case SO_NOFCS:
+ sock_valbool_flag(sk, SOCK_NOFCS, valbool);
+ break;
+
+ case SO_SELECT_ERR_QUEUE:
+ sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
+ break;
+
+ case SO_PASSCRED:
+ if (sk_may_scm_recv(sk))
+ sk->sk_scm_credentials = valbool;
+ else
+ ret = -EOPNOTSUPP;
+ break;
+
+ case SO_PASSSEC:
+ if (IS_ENABLED(CONFIG_SECURITY_NETWORK) && sk_may_scm_recv(sk))
+ sk->sk_scm_security = valbool;
+ else
+ ret = -EOPNOTSUPP;
+ break;
+
+ case SO_PASSPIDFD:
+ if (sk_is_unix(sk))
+ sk->sk_scm_pidfd = valbool;
+ else
+ ret = -EOPNOTSUPP;
+ break;
+
+ case SO_PASSRIGHTS:
+ if (sk_is_unix(sk))
+ sk->sk_scm_rights = valbool;
+ else
+ ret = -EOPNOTSUPP;
+ break;
+
+ case SO_INCOMING_CPU:
+ reuseport_update_incoming_cpu(sk, val);
+ break;
+
+ case SO_CNX_ADVICE:
+ if (val == 1)
+ dst_negative_advice(sk);
+ break;
+
+ case SO_ZEROCOPY:
+ if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
+ if (!(sk_is_tcp(sk) ||
+ (sk->sk_type == SOCK_DGRAM &&
+ sk->sk_protocol == IPPROTO_UDP)))
+ ret = -EOPNOTSUPP;
+ } else if (sk->sk_family != PF_RDS) {
+ ret = -EOPNOTSUPP;
+ }
+ if (!ret) {
+ if (val < 0 || val > 1)
+ ret = -EINVAL;
else
- clear_bit(SOCK_PASSCRED, &sock->flags);
- break;
+ sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
+ }
+ break;
- case SO_TIMESTAMP:
- if (valbool) {
- sock_set_flag(sk, SOCK_RCVTSTAMP);
- sock_enable_timestamp(sk);
- } else
- sock_reset_flag(sk, SOCK_RCVTSTAMP);
+ case SO_TXTIME:
+ if (optlen != sizeof(struct sock_txtime)) {
+ ret = -EINVAL;
break;
-
- case SO_RCVLOWAT:
- if (val < 0)
- val = INT_MAX;
- sk->sk_rcvlowat = val ? : 1;
+ } else if (copy_from_sockptr(&sk_txtime, optval,
+ sizeof(struct sock_txtime))) {
+ ret = -EFAULT;
break;
-
- case SO_RCVTIMEO:
- ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
+ } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
+ ret = -EINVAL;
+ break;
+ }
+ /* CLOCK_MONOTONIC is only used by sch_fq, and this packet
+ * scheduler has enough safe guards.
+ */
+ if (sk_txtime.clockid != CLOCK_MONOTONIC &&
+ !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
+ ret = -EPERM;
break;
+ }
- case SO_SNDTIMEO:
- ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
+ ret = sockopt_validate_clockid(sk_txtime.clockid);
+ if (ret)
break;
-#ifdef CONFIG_NETDEVICES
- case SO_BINDTODEVICE:
- {
- char devname[IFNAMSIZ];
+ sock_valbool_flag(sk, SOCK_TXTIME, true);
+ sk->sk_clockid = sk_txtime.clockid;
+ sk->sk_txtime_deadline_mode =
+ !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
+ sk->sk_txtime_report_errors =
+ !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
+ break;
- /* Sorry... */
- if (!capable(CAP_NET_RAW)) {
- ret = -EPERM;
- break;
- }
+ case SO_BINDTOIFINDEX:
+ ret = sock_bindtoindex_locked(sk, val);
+ break;
- /* Bind this socket to a particular device like "eth0",
- * as specified in the passed interface name. If the
- * name is "" or the option length is zero the socket
- * is not bound.
- */
-
- if (!valbool) {
- sk->sk_bound_dev_if = 0;
- } else {
- if (optlen > IFNAMSIZ - 1)
- optlen = IFNAMSIZ - 1;
- memset(devname, 0, sizeof(devname));
- if (copy_from_user(devname, optval, optlen)) {
- ret = -EFAULT;
- break;
- }
-
- /* Remove any cached route for this socket. */
- sk_dst_reset(sk);
-
- if (devname[0] == '\0') {
- sk->sk_bound_dev_if = 0;
- } else {
- struct net_device *dev = dev_get_by_name(devname);
- if (!dev) {
- ret = -ENODEV;
- break;
- }
- sk->sk_bound_dev_if = dev->ifindex;
- dev_put(dev);
- }
- }
+ case SO_BUF_LOCK:
+ if (val & ~SOCK_BUF_LOCK_MASK) {
+ ret = -EINVAL;
break;
}
-#endif
+ sk->sk_userlocks = val | (sk->sk_userlocks &
+ ~SOCK_BUF_LOCK_MASK);
+ break;
+ case SO_RESERVE_MEM:
+ {
+ int delta;
- case SO_ATTACH_FILTER:
+ if (val < 0) {
ret = -EINVAL;
- if (optlen == sizeof(struct sock_fprog)) {
- struct sock_fprog fprog;
+ break;
+ }
- ret = -EFAULT;
- if (copy_from_user(&fprog, optval, sizeof(fprog)))
- break;
+ delta = val - sk->sk_reserved_mem;
+ if (delta < 0)
+ sock_release_reserved_memory(sk, -delta);
+ else
+ ret = sock_reserve_memory(sk, delta);
+ break;
+ }
- ret = sk_attach_filter(&fprog, sk);
- }
- break;
+ default:
+ ret = -ENOPROTOOPT;
+ break;
+ }
+ sockopt_release_sock(sk);
+ return ret;
+}
- case SO_DETACH_FILTER:
- rcu_read_lock_bh();
- filter = rcu_dereference(sk->sk_filter);
- if (filter) {
- rcu_assign_pointer(sk->sk_filter, NULL);
- sk_filter_release(sk, filter);
- rcu_read_unlock_bh();
- break;
- }
- rcu_read_unlock_bh();
- ret = -ENONET;
- break;
+int sock_setsockopt(struct socket *sock, int level, int optname,
+ sockptr_t optval, unsigned int optlen)
+{
+ return sk_setsockopt(sock->sk, level, optname,
+ optval, optlen);
+}
+EXPORT_SYMBOL(sock_setsockopt);
- case SO_PASSSEC:
- if (valbool)
- set_bit(SOCK_PASSSEC, &sock->flags);
- else
- clear_bit(SOCK_PASSSEC, &sock->flags);
- break;
+static const struct cred *sk_get_peer_cred(struct sock *sk)
+{
+ const struct cred *cred;
- /* We implement the SO_SNDLOWAT etc to
- not be settable (1003.1g 5.3) */
- default:
- ret = -ENOPROTOOPT;
- break;
- }
- release_sock(sk);
- return ret;
+ spin_lock(&sk->sk_peer_lock);
+ cred = get_cred(sk->sk_peer_cred);
+ spin_unlock(&sk->sk_peer_lock);
+
+ return cred;
}
+static void cred_to_ucred(struct pid *pid, const struct cred *cred,
+ struct ucred *ucred)
+{
+ ucred->pid = pid_vnr(pid);
+ ucred->uid = ucred->gid = -1;
+ if (cred) {
+ struct user_namespace *current_ns = current_user_ns();
-int sock_getsockopt(struct socket *sock, int level, int optname,
- char __user *optval, int __user *optlen)
+ ucred->uid = from_kuid_munged(current_ns, cred->euid);
+ ucred->gid = from_kgid_munged(current_ns, cred->egid);
+ }
+}
+
+static int groups_to_user(sockptr_t dst, const struct group_info *src)
{
- struct sock *sk = sock->sk;
-
- union
- {
- int val;
- struct linger ling;
- struct timeval tm;
+ struct user_namespace *user_ns = current_user_ns();
+ int i;
+
+ for (i = 0; i < src->ngroups; i++) {
+ gid_t gid = from_kgid_munged(user_ns, src->gid[i]);
+
+ if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid)))
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+int sk_getsockopt(struct sock *sk, int level, int optname,
+ sockptr_t optval, sockptr_t optlen)
+{
+ struct socket *sock = sk->sk_socket;
+
+ union {
+ int val;
+ u64 val64;
+ unsigned long ulval;
+ struct linger ling;
+ struct old_timeval32 tm32;
+ struct __kernel_old_timeval tm;
+ struct __kernel_sock_timeval stm;
+ struct sock_txtime txtime;
+ struct so_timestamping timestamping;
} v;
-
- unsigned int lv = sizeof(int);
+
+ int lv = sizeof(int);
int len;
-
- if(get_user(len,optlen))
- return -EFAULT;
- if(len < 0)
+
+ if (copy_from_sockptr(&len, optlen, sizeof(int)))
+ return -EFAULT;
+ if (len < 0)
return -EINVAL;
-
- switch(optname)
- {
- case SO_DEBUG:
- v.val = sock_flag(sk, SOCK_DBG);
- break;
-
- case SO_DONTROUTE:
- v.val = sock_flag(sk, SOCK_LOCALROUTE);
- break;
-
- case SO_BROADCAST:
- v.val = !!sock_flag(sk, SOCK_BROADCAST);
- break;
- case SO_SNDBUF:
- v.val = sk->sk_sndbuf;
- break;
-
- case SO_RCVBUF:
- v.val = sk->sk_rcvbuf;
- break;
+ memset(&v, 0, sizeof(v));
+
+ switch (optname) {
+ case SO_DEBUG:
+ v.val = sock_flag(sk, SOCK_DBG);
+ break;
+
+ case SO_DONTROUTE:
+ v.val = sock_flag(sk, SOCK_LOCALROUTE);
+ break;
+
+ case SO_BROADCAST:
+ v.val = sock_flag(sk, SOCK_BROADCAST);
+ break;
+
+ case SO_SNDBUF:
+ v.val = READ_ONCE(sk->sk_sndbuf);
+ break;
+
+ case SO_RCVBUF:
+ v.val = READ_ONCE(sk->sk_rcvbuf);
+ break;
+
+ case SO_REUSEADDR:
+ v.val = sk->sk_reuse;
+ break;
+
+ case SO_REUSEPORT:
+ v.val = sk->sk_reuseport;
+ break;
+
+ case SO_KEEPALIVE:
+ v.val = sock_flag(sk, SOCK_KEEPOPEN);
+ break;
+
+ case SO_TYPE:
+ v.val = sk->sk_type;
+ break;
+
+ case SO_PROTOCOL:
+ v.val = sk->sk_protocol;
+ break;
+
+ case SO_DOMAIN:
+ v.val = sk->sk_family;
+ break;
+
+ case SO_ERROR:
+ v.val = -sock_error(sk);
+ if (v.val == 0)
+ v.val = xchg(&sk->sk_err_soft, 0);
+ break;
+
+ case SO_OOBINLINE:
+ v.val = sock_flag(sk, SOCK_URGINLINE);
+ break;
+
+ case SO_NO_CHECK:
+ v.val = sk->sk_no_check_tx;
+ break;
+
+ case SO_PRIORITY:
+ v.val = READ_ONCE(sk->sk_priority);
+ break;
+
+ case SO_LINGER:
+ lv = sizeof(v.ling);
+ v.ling.l_onoff = sock_flag(sk, SOCK_LINGER);
+ v.ling.l_linger = READ_ONCE(sk->sk_lingertime) / HZ;
+ break;
+
+ case SO_BSDCOMPAT:
+ break;
+
+ case SO_TIMESTAMP_OLD:
+ v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
+ !sock_flag(sk, SOCK_TSTAMP_NEW) &&
+ !sock_flag(sk, SOCK_RCVTSTAMPNS);
+ break;
+
+ case SO_TIMESTAMPNS_OLD:
+ v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
+ break;
+
+ case SO_TIMESTAMP_NEW:
+ v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
+ break;
+
+ case SO_TIMESTAMPNS_NEW:
+ v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
+ break;
+
+ case SO_TIMESTAMPING_OLD:
+ case SO_TIMESTAMPING_NEW:
+ lv = sizeof(v.timestamping);
+ /* For the later-added case SO_TIMESTAMPING_NEW: Be strict about only
+ * returning the flags when they were set through the same option.
+ * Don't change the beviour for the old case SO_TIMESTAMPING_OLD.
+ */
+ if (optname == SO_TIMESTAMPING_OLD || sock_flag(sk, SOCK_TSTAMP_NEW)) {
+ v.timestamping.flags = READ_ONCE(sk->sk_tsflags);
+ v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc);
+ }
+ break;
- case SO_REUSEADDR:
- v.val = sk->sk_reuse;
- break;
+ case SO_RCVTIMEO_OLD:
+ case SO_RCVTIMEO_NEW:
+ lv = sock_get_timeout(READ_ONCE(sk->sk_rcvtimeo), &v,
+ SO_RCVTIMEO_OLD == optname);
+ break;
- case SO_KEEPALIVE:
- v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
- break;
+ case SO_SNDTIMEO_OLD:
+ case SO_SNDTIMEO_NEW:
+ lv = sock_get_timeout(READ_ONCE(sk->sk_sndtimeo), &v,
+ SO_SNDTIMEO_OLD == optname);
+ break;
- case SO_TYPE:
- v.val = sk->sk_type;
- break;
+ case SO_RCVLOWAT:
+ v.val = READ_ONCE(sk->sk_rcvlowat);
+ break;
- case SO_ERROR:
- v.val = -sock_error(sk);
- if(v.val==0)
- v.val = xchg(&sk->sk_err_soft, 0);
- break;
+ case SO_SNDLOWAT:
+ v.val = 1;
+ break;
- case SO_OOBINLINE:
- v.val = !!sock_flag(sk, SOCK_URGINLINE);
- break;
-
- case SO_NO_CHECK:
- v.val = sk->sk_no_check;
- break;
+ case SO_PASSCRED:
+ if (!sk_may_scm_recv(sk))
+ return -EOPNOTSUPP;
- case SO_PRIORITY:
- v.val = sk->sk_priority;
- break;
-
- case SO_LINGER:
- lv = sizeof(v.ling);
- v.ling.l_onoff = !!sock_flag(sk, SOCK_LINGER);
- v.ling.l_linger = sk->sk_lingertime / HZ;
- break;
-
- case SO_BSDCOMPAT:
- sock_warn_obsolete_bsdism("getsockopt");
- break;
+ v.val = sk->sk_scm_credentials;
+ break;
- case SO_TIMESTAMP:
- v.val = sock_flag(sk, SOCK_RCVTSTAMP);
- break;
+ case SO_PASSPIDFD:
+ if (!sk_is_unix(sk))
+ return -EOPNOTSUPP;
- case SO_RCVTIMEO:
- lv=sizeof(struct timeval);
- if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
- v.tm.tv_sec = 0;
- v.tm.tv_usec = 0;
- } else {
- v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
- v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
- }
- break;
+ v.val = sk->sk_scm_pidfd;
+ break;
- case SO_SNDTIMEO:
- lv=sizeof(struct timeval);
- if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
- v.tm.tv_sec = 0;
- v.tm.tv_usec = 0;
- } else {
- v.tm.tv_sec = sk->sk_sndtimeo / HZ;
- v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
- }
- break;
+ case SO_PASSRIGHTS:
+ if (!sk_is_unix(sk))
+ return -EOPNOTSUPP;
- case SO_RCVLOWAT:
- v.val = sk->sk_rcvlowat;
- break;
+ v.val = sk->sk_scm_rights;
+ break;
- case SO_SNDLOWAT:
- v.val=1;
- break;
+ case SO_PEERCRED:
+ {
+ struct ucred peercred;
+ if (len > sizeof(peercred))
+ len = sizeof(peercred);
- case SO_PASSCRED:
- v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
- break;
+ spin_lock(&sk->sk_peer_lock);
+ cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
+ spin_unlock(&sk->sk_peer_lock);
- case SO_PEERCRED:
- if (len > sizeof(sk->sk_peercred))
- len = sizeof(sk->sk_peercred);
- if (copy_to_user(optval, &sk->sk_peercred, len))
- return -EFAULT;
- goto lenout;
+ if (copy_to_sockptr(optval, &peercred, len))
+ return -EFAULT;
+ goto lenout;
+ }
- case SO_PEERNAME:
- {
- char address[128];
+ case SO_PEERPIDFD:
+ {
+ struct pid *peer_pid;
+ struct file *pidfd_file = NULL;
+ unsigned int flags = 0;
+ int pidfd;
- if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
- return -ENOTCONN;
- if (lv < len)
- return -EINVAL;
- if (copy_to_user(optval, address, len))
- return -EFAULT;
- goto lenout;
- }
+ if (len > sizeof(pidfd))
+ len = sizeof(pidfd);
- /* Dubious BSD thing... Probably nobody even uses it, but
- * the UNIX standard wants it for whatever reason... -DaveM
+ spin_lock(&sk->sk_peer_lock);
+ peer_pid = get_pid(sk->sk_peer_pid);
+ spin_unlock(&sk->sk_peer_lock);
+
+ if (!peer_pid)
+ return -ENODATA;
+
+ /* The use of PIDFD_STALE requires stashing of struct pid
+ * on pidfs with pidfs_register_pid() and only AF_UNIX
+ * were prepared for this.
*/
- case SO_ACCEPTCONN:
- v.val = sk->sk_state == TCP_LISTEN;
- break;
+ if (sk->sk_family == AF_UNIX)
+ flags = PIDFD_STALE;
- case SO_PASSSEC:
- v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0;
- break;
+ pidfd = pidfd_prepare(peer_pid, flags, &pidfd_file);
+ put_pid(peer_pid);
+ if (pidfd < 0)
+ return pidfd;
+
+ if (copy_to_sockptr(optval, &pidfd, len) ||
+ copy_to_sockptr(optlen, &len, sizeof(int))) {
+ put_unused_fd(pidfd);
+ fput(pidfd_file);
- case SO_PEERSEC:
- return security_socket_getpeersec_stream(sock, optval, optlen, len);
+ return -EFAULT;
+ }
+
+ fd_install(pidfd, pidfd_file);
+ return 0;
+ }
+
+ case SO_PEERGROUPS:
+ {
+ const struct cred *cred;
+ int ret, n;
+
+ cred = sk_get_peer_cred(sk);
+ if (!cred)
+ return -ENODATA;
+
+ n = cred->group_info->ngroups;
+ if (len < n * sizeof(gid_t)) {
+ len = n * sizeof(gid_t);
+ put_cred(cred);
+ return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE;
+ }
+ len = n * sizeof(gid_t);
- default:
- return(-ENOPROTOOPT);
+ ret = groups_to_user(optval, cred->group_info);
+ put_cred(cred);
+ if (ret)
+ return ret;
+ goto lenout;
+ }
+
+ case SO_PEERNAME:
+ {
+ struct sockaddr_storage address;
+
+ lv = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 2);
+ if (lv < 0)
+ return -ENOTCONN;
+ if (lv < len)
+ return -EINVAL;
+ if (copy_to_sockptr(optval, &address, len))
+ return -EFAULT;
+ goto lenout;
+ }
+
+ /* Dubious BSD thing... Probably nobody even uses it, but
+ * the UNIX standard wants it for whatever reason... -DaveM
+ */
+ case SO_ACCEPTCONN:
+ v.val = sk->sk_state == TCP_LISTEN;
+ break;
+
+ case SO_PASSSEC:
+ if (!IS_ENABLED(CONFIG_SECURITY_NETWORK) || !sk_may_scm_recv(sk))
+ return -EOPNOTSUPP;
+
+ v.val = sk->sk_scm_security;
+ break;
+
+ case SO_PEERSEC:
+ return security_socket_getpeersec_stream(sock,
+ optval, optlen, len);
+
+ case SO_MARK:
+ v.val = READ_ONCE(sk->sk_mark);
+ break;
+
+ case SO_RCVMARK:
+ v.val = sock_flag(sk, SOCK_RCVMARK);
+ break;
+
+ case SO_RCVPRIORITY:
+ v.val = sock_flag(sk, SOCK_RCVPRIORITY);
+ break;
+
+ case SO_RXQ_OVFL:
+ v.val = sock_flag(sk, SOCK_RXQ_OVFL);
+ break;
+
+ case SO_WIFI_STATUS:
+ v.val = sock_flag(sk, SOCK_WIFI_STATUS);
+ break;
+
+ case SO_PEEK_OFF:
+ if (!READ_ONCE(sock->ops)->set_peek_off)
+ return -EOPNOTSUPP;
+
+ v.val = READ_ONCE(sk->sk_peek_off);
+ break;
+ case SO_NOFCS:
+ v.val = sock_flag(sk, SOCK_NOFCS);
+ break;
+
+ case SO_BINDTODEVICE:
+ return sock_getbindtodevice(sk, optval, optlen, len);
+
+ case SO_GET_FILTER:
+ len = sk_get_filter(sk, optval, len);
+ if (len < 0)
+ return len;
+
+ goto lenout;
+
+ case SO_LOCK_FILTER:
+ v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
+ break;
+
+ case SO_BPF_EXTENSIONS:
+ v.val = bpf_tell_extensions();
+ break;
+
+ case SO_SELECT_ERR_QUEUE:
+ v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
+ break;
+
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ case SO_BUSY_POLL:
+ v.val = READ_ONCE(sk->sk_ll_usec);
+ break;
+ case SO_PREFER_BUSY_POLL:
+ v.val = READ_ONCE(sk->sk_prefer_busy_poll);
+ break;
+#endif
+
+ case SO_MAX_PACING_RATE:
+ /* The READ_ONCE() pair with the WRITE_ONCE() in sk_setsockopt() */
+ if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
+ lv = sizeof(v.ulval);
+ v.ulval = READ_ONCE(sk->sk_max_pacing_rate);
+ } else {
+ /* 32bit version */
+ v.val = min_t(unsigned long, ~0U,
+ READ_ONCE(sk->sk_max_pacing_rate));
+ }
+ break;
+
+ case SO_INCOMING_CPU:
+ v.val = READ_ONCE(sk->sk_incoming_cpu);
+ break;
+
+ case SO_MEMINFO:
+ {
+ u32 meminfo[SK_MEMINFO_VARS];
+
+ sk_get_meminfo(sk, meminfo);
+
+ len = min_t(unsigned int, len, sizeof(meminfo));
+ if (copy_to_sockptr(optval, &meminfo, len))
+ return -EFAULT;
+
+ goto lenout;
+ }
+
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ case SO_INCOMING_NAPI_ID:
+ v.val = READ_ONCE(sk->sk_napi_id);
+
+ /* aggregate non-NAPI IDs down to 0 */
+ if (!napi_id_valid(v.val))
+ v.val = 0;
+
+ break;
+#endif
+
+ case SO_COOKIE:
+ lv = sizeof(u64);
+ if (len < lv)
+ return -EINVAL;
+ v.val64 = sock_gen_cookie(sk);
+ break;
+
+ case SO_ZEROCOPY:
+ v.val = sock_flag(sk, SOCK_ZEROCOPY);
+ break;
+
+ case SO_TXTIME:
+ lv = sizeof(v.txtime);
+ v.txtime.clockid = sk->sk_clockid;
+ v.txtime.flags |= sk->sk_txtime_deadline_mode ?
+ SOF_TXTIME_DEADLINE_MODE : 0;
+ v.txtime.flags |= sk->sk_txtime_report_errors ?
+ SOF_TXTIME_REPORT_ERRORS : 0;
+ break;
+
+ case SO_BINDTOIFINDEX:
+ v.val = READ_ONCE(sk->sk_bound_dev_if);
+ break;
+
+ case SO_NETNS_COOKIE:
+ lv = sizeof(u64);
+ if (len != lv)
+ return -EINVAL;
+ v.val64 = sock_net(sk)->net_cookie;
+ break;
+
+ case SO_BUF_LOCK:
+ v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
+ break;
+
+ case SO_RESERVE_MEM:
+ v.val = READ_ONCE(sk->sk_reserved_mem);
+ break;
+
+ case SO_TXREHASH:
+ if (!sk_is_tcp(sk))
+ return -EOPNOTSUPP;
+
+ /* Paired with WRITE_ONCE() in sk_setsockopt() */
+ v.val = READ_ONCE(sk->sk_txrehash);
+ break;
+
+ default:
+ /* We implement the SO_SNDLOWAT etc to not be settable
+ * (1003.1g 7).
+ */
+ return -ENOPROTOOPT;
}
+
if (len > lv)
len = lv;
- if (copy_to_user(optval, &v, len))
+ if (copy_to_sockptr(optval, &v, len))
return -EFAULT;
lenout:
- if (put_user(len, optlen))
- return -EFAULT;
- return 0;
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
+ return -EFAULT;
+ return 0;
}
/*
@@ -807,66 +2175,87 @@ lenout:
*
* (We also register the sk_lock with the lock validator.)
*/
-static void inline sock_lock_init(struct sock *sk)
+static inline void sock_lock_init(struct sock *sk)
{
- spin_lock_init(&sk->sk_lock.slock);
- sk->sk_lock.owner = NULL;
- init_waitqueue_head(&sk->sk_lock.wq);
- /*
- * Make sure we are not reinitializing a held lock:
- */
- debug_check_no_locks_freed((void *)&sk->sk_lock, sizeof(sk->sk_lock));
-
- /*
- * Mark both the sk_lock and the sk_lock.slock as a
- * per-address-family lock class:
- */
- lockdep_set_class_and_name(&sk->sk_lock.slock,
- af_family_slock_keys + sk->sk_family,
- af_family_slock_key_strings[sk->sk_family]);
- lockdep_init_map(&sk->sk_lock.dep_map,
- af_family_key_strings[sk->sk_family],
- af_family_keys + sk->sk_family, 0);
+ sk_owner_clear(sk);
+
+ if (sk->sk_kern_sock)
+ sock_lock_init_class_and_name(
+ sk,
+ af_family_kern_slock_key_strings[sk->sk_family],
+ af_family_kern_slock_keys + sk->sk_family,
+ af_family_kern_key_strings[sk->sk_family],
+ af_family_kern_keys + sk->sk_family);
+ else
+ sock_lock_init_class_and_name(
+ sk,
+ af_family_slock_key_strings[sk->sk_family],
+ af_family_slock_keys + sk->sk_family,
+ af_family_key_strings[sk->sk_family],
+ af_family_keys + sk->sk_family);
}
-/**
- * sk_alloc - All socket objects are allocated here
- * @family: protocol family
- * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
- * @prot: struct proto associated with this new sock instance
- * @zero_it: if we should zero the newly allocated sock
+/*
+ * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
+ * even temporarily, because of RCU lookups. sk_node should also be left as is.
+ * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
*/
-struct sock *sk_alloc(int family, gfp_t priority,
- struct proto *prot, int zero_it)
+static void sock_copy(struct sock *nsk, const struct sock *osk)
{
- struct sock *sk = NULL;
- kmem_cache_t *slab = prot->slab;
+ const struct proto *prot = READ_ONCE(osk->sk_prot);
+#ifdef CONFIG_SECURITY_NETWORK
+ void *sptr = nsk->sk_security;
+#endif
- if (slab != NULL)
- sk = kmem_cache_alloc(slab, priority);
- else
+ /* If we move sk_tx_queue_mapping out of the private section,
+ * we must check if sk_tx_queue_clear() is called after
+ * sock_copy() in sk_clone_lock().
+ */
+ BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
+ offsetof(struct sock, sk_dontcopy_begin) ||
+ offsetof(struct sock, sk_tx_queue_mapping) >=
+ offsetof(struct sock, sk_dontcopy_end));
+
+ memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
+
+ unsafe_memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
+ prot->obj_size - offsetof(struct sock, sk_dontcopy_end),
+ /* alloc is larger than struct, see sk_prot_alloc() */);
+
+#ifdef CONFIG_SECURITY_NETWORK
+ nsk->sk_security = sptr;
+ security_sk_clone(osk, nsk);
+#endif
+}
+
+static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
+ int family)
+{
+ struct sock *sk;
+ struct kmem_cache *slab;
+
+ slab = prot->slab;
+ if (slab != NULL) {
+ sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
+ if (!sk)
+ return sk;
+ if (want_init_on_alloc(priority))
+ sk_prot_clear_nulls(sk, prot->obj_size);
+ } else
sk = kmalloc(prot->obj_size, priority);
- if (sk) {
- if (zero_it) {
- memset(sk, 0, prot->obj_size);
- sk->sk_family = family;
- /*
- * See comment in struct sock definition to understand
- * why we need sk_prot_creator -acme
- */
- sk->sk_prot = sk->sk_prot_creator = prot;
- sock_lock_init(sk);
- }
-
+ if (sk != NULL) {
if (security_sk_alloc(sk, family, priority))
goto out_free;
if (!try_module_get(prot->owner))
- goto out_free;
+ goto out_free_sec;
}
+
return sk;
+out_free_sec:
+ security_sk_free(sk);
out_free:
if (slab != NULL)
kmem_cache_free(slab, sk);
@@ -875,175 +2264,558 @@ out_free:
return NULL;
}
-void sk_free(struct sock *sk)
+static void sk_prot_free(struct proto *prot, struct sock *sk)
+{
+ struct kmem_cache *slab;
+ struct module *owner;
+
+ owner = prot->owner;
+ slab = prot->slab;
+
+ cgroup_sk_free(&sk->sk_cgrp_data);
+ mem_cgroup_sk_free(sk);
+ security_sk_free(sk);
+
+ sk_owner_put(sk);
+
+ if (slab != NULL)
+ kmem_cache_free(slab, sk);
+ else
+ kfree(sk);
+ module_put(owner);
+}
+
+/**
+ * sk_alloc - All socket objects are allocated here
+ * @net: the applicable net namespace
+ * @family: protocol family
+ * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
+ * @prot: struct proto associated with this new sock instance
+ * @kern: is this to be a kernel socket?
+ */
+struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
+ struct proto *prot, int kern)
{
+ struct sock *sk;
+
+ sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
+ if (sk) {
+ sk->sk_family = family;
+ /*
+ * See comment in struct sock definition to understand
+ * why we need sk_prot_creator -acme
+ */
+ sk->sk_prot = sk->sk_prot_creator = prot;
+
+ if (READ_ONCE(net->core.sysctl_bypass_prot_mem))
+ sk->sk_bypass_prot_mem = 1;
+
+ sk->sk_kern_sock = kern;
+ sock_lock_init(sk);
+
+ sk->sk_net_refcnt = kern ? 0 : 1;
+ if (likely(sk->sk_net_refcnt)) {
+ get_net_track(net, &sk->ns_tracker, priority);
+ sock_inuse_add(net, 1);
+ } else {
+ net_passive_inc(net);
+ __netns_tracker_alloc(net, &sk->ns_tracker,
+ false, priority);
+ }
+
+ sock_net_set(sk, net);
+ refcount_set(&sk->sk_wmem_alloc, SK_WMEM_ALLOC_BIAS);
+
+ mem_cgroup_sk_alloc(sk);
+ cgroup_sk_alloc(&sk->sk_cgrp_data);
+ sock_update_classid(&sk->sk_cgrp_data);
+ sock_update_netprioidx(&sk->sk_cgrp_data);
+ sk_tx_queue_clear(sk);
+ }
+
+ return sk;
+}
+EXPORT_SYMBOL(sk_alloc);
+
+/* Sockets having SOCK_RCU_FREE will call this function after one RCU
+ * grace period. This is the case for UDP sockets and TCP listeners.
+ */
+static void __sk_destruct(struct rcu_head *head)
+{
+ struct sock *sk = container_of(head, struct sock, sk_rcu);
+ struct net *net = sock_net(sk);
struct sk_filter *filter;
- struct module *owner = sk->sk_prot_creator->owner;
if (sk->sk_destruct)
sk->sk_destruct(sk);
- filter = rcu_dereference(sk->sk_filter);
+ filter = rcu_dereference_check(sk->sk_filter,
+ refcount_read(&sk->sk_wmem_alloc) == 0);
if (filter) {
- sk_filter_release(sk, filter);
- rcu_assign_pointer(sk->sk_filter, NULL);
+ sk_filter_uncharge(sk, filter);
+ RCU_INIT_POINTER(sk->sk_filter, NULL);
}
- sock_disable_timestamp(sk);
+ sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
+
+#ifdef CONFIG_BPF_SYSCALL
+ bpf_sk_storage_free(sk);
+#endif
if (atomic_read(&sk->sk_omem_alloc))
- printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
- __FUNCTION__, atomic_read(&sk->sk_omem_alloc));
+ pr_debug("%s: optmem leakage (%d bytes) detected\n",
+ __func__, atomic_read(&sk->sk_omem_alloc));
- security_sk_free(sk);
- if (sk->sk_prot_creator->slab != NULL)
- kmem_cache_free(sk->sk_prot_creator->slab, sk);
+ if (sk->sk_frag.page) {
+ put_page(sk->sk_frag.page);
+ sk->sk_frag.page = NULL;
+ }
+
+ /* We do not need to acquire sk->sk_peer_lock, we are the last user. */
+ put_cred(sk->sk_peer_cred);
+ put_pid(sk->sk_peer_pid);
+
+ if (likely(sk->sk_net_refcnt)) {
+ put_net_track(net, &sk->ns_tracker);
+ } else {
+ __netns_tracker_free(net, &sk->ns_tracker, false);
+ net_passive_dec(net);
+ }
+ sk_prot_free(sk->sk_prot_creator, sk);
+}
+
+void sk_net_refcnt_upgrade(struct sock *sk)
+{
+ struct net *net = sock_net(sk);
+
+ WARN_ON_ONCE(sk->sk_net_refcnt);
+ __netns_tracker_free(net, &sk->ns_tracker, false);
+ net_passive_dec(net);
+ sk->sk_net_refcnt = 1;
+ get_net_track(net, &sk->ns_tracker, GFP_KERNEL);
+ sock_inuse_add(net, 1);
+}
+EXPORT_SYMBOL_GPL(sk_net_refcnt_upgrade);
+
+void sk_destruct(struct sock *sk)
+{
+ bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
+
+ if (rcu_access_pointer(sk->sk_reuseport_cb)) {
+ reuseport_detach_sock(sk);
+ use_call_rcu = true;
+ }
+
+ if (use_call_rcu)
+ call_rcu(&sk->sk_rcu, __sk_destruct);
else
- kfree(sk);
- module_put(owner);
+ __sk_destruct(&sk->sk_rcu);
+}
+
+static void __sk_free(struct sock *sk)
+{
+ if (likely(sk->sk_net_refcnt))
+ sock_inuse_add(sock_net(sk), -1);
+
+ if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
+ sock_diag_broadcast_destroy(sk);
+ else
+ sk_destruct(sk);
+}
+
+void sk_free(struct sock *sk)
+{
+ /*
+ * We subtract one from sk_wmem_alloc and can know if
+ * some packets are still in some tx queue.
+ * If not null, sock_wfree() will call __sk_free(sk) later
+ */
+ if (refcount_dec_and_test(&sk->sk_wmem_alloc))
+ __sk_free(sk);
+}
+EXPORT_SYMBOL(sk_free);
+
+static void sk_init_common(struct sock *sk)
+{
+ skb_queue_head_init(&sk->sk_receive_queue);
+ skb_queue_head_init(&sk->sk_write_queue);
+ skb_queue_head_init(&sk->sk_error_queue);
+
+ rwlock_init(&sk->sk_callback_lock);
+ lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
+ af_rlock_keys + sk->sk_family,
+ af_family_rlock_key_strings[sk->sk_family]);
+ lockdep_set_class_and_name(&sk->sk_write_queue.lock,
+ af_wlock_keys + sk->sk_family,
+ af_family_wlock_key_strings[sk->sk_family]);
+ lockdep_set_class_and_name(&sk->sk_error_queue.lock,
+ af_elock_keys + sk->sk_family,
+ af_family_elock_key_strings[sk->sk_family]);
+ if (sk->sk_kern_sock)
+ lockdep_set_class_and_name(&sk->sk_callback_lock,
+ af_kern_callback_keys + sk->sk_family,
+ af_family_kern_clock_key_strings[sk->sk_family]);
+ else
+ lockdep_set_class_and_name(&sk->sk_callback_lock,
+ af_callback_keys + sk->sk_family,
+ af_family_clock_key_strings[sk->sk_family]);
}
-struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
+/**
+ * sk_clone - clone a socket
+ * @sk: the socket to clone
+ * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
+ * @lock: if true, lock the cloned sk
+ *
+ * If @lock is true, the clone is locked by bh_lock_sock(), and
+ * caller must unlock socket even in error path by bh_unlock_sock().
+ */
+struct sock *sk_clone(const struct sock *sk, const gfp_t priority,
+ bool lock)
{
- struct sock *newsk = sk_alloc(sk->sk_family, priority, sk->sk_prot, 0);
+ struct proto *prot = READ_ONCE(sk->sk_prot);
+ struct sk_filter *filter;
+ bool is_charged = true;
+ struct sock *newsk;
+
+ newsk = sk_prot_alloc(prot, priority, sk->sk_family);
+ if (!newsk)
+ goto out;
- if (newsk != NULL) {
- struct sk_filter *filter;
+ sock_copy(newsk, sk);
- sock_copy(newsk, sk);
+ newsk->sk_prot_creator = prot;
- /* SANITY */
- sk_node_init(&newsk->sk_node);
- sock_lock_init(newsk);
+ /* SANITY */
+ if (likely(newsk->sk_net_refcnt)) {
+ get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
+ sock_inuse_add(sock_net(newsk), 1);
+ } else {
+ /* Kernel sockets are not elevating the struct net refcount.
+ * Instead, use a tracker to more easily detect if a layer
+ * is not properly dismantling its kernel sockets at netns
+ * destroy time.
+ */
+ net_passive_inc(sock_net(newsk));
+ __netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker,
+ false, priority);
+ }
+
+ sk_node_init(&newsk->sk_node);
+ sock_lock_init(newsk);
+
+ if (lock)
bh_lock_sock(newsk);
- atomic_set(&newsk->sk_rmem_alloc, 0);
- atomic_set(&newsk->sk_wmem_alloc, 0);
- atomic_set(&newsk->sk_omem_alloc, 0);
- skb_queue_head_init(&newsk->sk_receive_queue);
- skb_queue_head_init(&newsk->sk_write_queue);
-#ifdef CONFIG_NET_DMA
- skb_queue_head_init(&newsk->sk_async_wait_queue);
-#endif
+ newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
+ newsk->sk_backlog.len = 0;
- rwlock_init(&newsk->sk_dst_lock);
- rwlock_init(&newsk->sk_callback_lock);
- lockdep_set_class(&newsk->sk_callback_lock,
- af_callback_keys + newsk->sk_family);
-
- newsk->sk_dst_cache = NULL;
- newsk->sk_wmem_queued = 0;
- newsk->sk_forward_alloc = 0;
- newsk->sk_send_head = NULL;
- newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
- newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
-
- sock_reset_flag(newsk, SOCK_DONE);
- skb_queue_head_init(&newsk->sk_error_queue);
-
- filter = newsk->sk_filter;
- if (filter != NULL)
- sk_filter_charge(newsk, filter);
-
- if (unlikely(xfrm_sk_clone_policy(newsk))) {
- /* It is still raw copy of parent, so invalidate
- * destructor and make plain sk_free() */
- newsk->sk_destruct = NULL;
- sk_free(newsk);
- newsk = NULL;
- goto out;
- }
+ atomic_set(&newsk->sk_rmem_alloc, 0);
- newsk->sk_err = 0;
- newsk->sk_priority = 0;
- atomic_set(&newsk->sk_refcnt, 2);
+ refcount_set(&newsk->sk_wmem_alloc, SK_WMEM_ALLOC_BIAS);
- /*
- * Increment the counter in the same struct proto as the master
- * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
- * is the same as sk->sk_prot->socks, as this field was copied
- * with memcpy).
- *
- * This _changes_ the previous behaviour, where
- * tcp_create_openreq_child always was incrementing the
- * equivalent to tcp_prot->socks (inet_sock_nr), so this have
- * to be taken into account in all callers. -acme
+ atomic_set(&newsk->sk_omem_alloc, 0);
+ sk_init_common(newsk);
+
+ newsk->sk_dst_cache = NULL;
+ newsk->sk_dst_pending_confirm = 0;
+ newsk->sk_wmem_queued = 0;
+ newsk->sk_forward_alloc = 0;
+ newsk->sk_reserved_mem = 0;
+ DEBUG_NET_WARN_ON_ONCE(newsk->sk_drop_counters);
+ sk_drops_reset(newsk);
+ newsk->sk_send_head = NULL;
+ newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
+ atomic_set(&newsk->sk_zckey, 0);
+
+ sock_reset_flag(newsk, SOCK_DONE);
+
+#ifdef CONFIG_MEMCG
+ /* sk->sk_memcg will be populated at accept() time */
+ newsk->sk_memcg = NULL;
+#endif
+
+ cgroup_sk_clone(&newsk->sk_cgrp_data);
+
+ rcu_read_lock();
+ filter = rcu_dereference(sk->sk_filter);
+ if (filter != NULL)
+ /* though it's an empty new sock, the charging may fail
+ * if sysctl_optmem_max was changed between creation of
+ * original socket and cloning
+ */
+ is_charged = sk_filter_charge(newsk, filter);
+ RCU_INIT_POINTER(newsk->sk_filter, filter);
+ rcu_read_unlock();
+
+ if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
+ /* We need to make sure that we don't uncharge the new
+ * socket if we couldn't charge it in the first place
+ * as otherwise we uncharge the parent's filter.
*/
- sk_refcnt_debug_inc(newsk);
- newsk->sk_socket = NULL;
- newsk->sk_sleep = NULL;
+ if (!is_charged)
+ RCU_INIT_POINTER(newsk->sk_filter, NULL);
- if (newsk->sk_prot->sockets_allocated)
- atomic_inc(newsk->sk_prot->sockets_allocated);
+ goto free;
}
+
+ RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
+
+ if (bpf_sk_storage_clone(sk, newsk))
+ goto free;
+
+ /* Clear sk_user_data if parent had the pointer tagged
+ * as not suitable for copying when cloning.
+ */
+ if (sk_user_data_is_nocopy(newsk))
+ newsk->sk_user_data = NULL;
+
+ newsk->sk_err = 0;
+ newsk->sk_err_soft = 0;
+ newsk->sk_priority = 0;
+ newsk->sk_incoming_cpu = raw_smp_processor_id();
+
+ /* Before updating sk_refcnt, we must commit prior changes to memory
+ * (Documentation/RCU/rculist_nulls.rst for details)
+ */
+ smp_wmb();
+ refcount_set(&newsk->sk_refcnt, 2);
+
+ sk_set_socket(newsk, NULL);
+ sk_tx_queue_clear(newsk);
+ RCU_INIT_POINTER(newsk->sk_wq, NULL);
+
+ if (newsk->sk_prot->sockets_allocated)
+ sk_sockets_allocated_inc(newsk);
+
+ if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
+ net_enable_timestamp();
out:
return newsk;
+free:
+ /* It is still raw copy of parent, so invalidate
+ * destructor and make plain sk_free()
+ */
+ newsk->sk_destruct = NULL;
+ if (lock)
+ bh_unlock_sock(newsk);
+ sk_free(newsk);
+ newsk = NULL;
+ goto out;
}
-
EXPORT_SYMBOL_GPL(sk_clone);
-void __init sk_init(void)
+static u32 sk_dst_gso_max_size(struct sock *sk, const struct net_device *dev)
{
- if (num_physpages <= 4096) {
- sysctl_wmem_max = 32767;
- sysctl_rmem_max = 32767;
- sysctl_wmem_default = 32767;
- sysctl_rmem_default = 32767;
- } else if (num_physpages >= 131072) {
- sysctl_wmem_max = 131071;
- sysctl_rmem_max = 131071;
+ bool is_ipv6 = false;
+ u32 max_size;
+
+#if IS_ENABLED(CONFIG_IPV6)
+ is_ipv6 = (sk->sk_family == AF_INET6 &&
+ !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr));
+#endif
+ /* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */
+ max_size = is_ipv6 ? READ_ONCE(dev->gso_max_size) :
+ READ_ONCE(dev->gso_ipv4_max_size);
+ if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk))
+ max_size = GSO_LEGACY_MAX_SIZE;
+
+ return max_size - (MAX_TCP_HEADER + 1);
+}
+
+void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
+{
+ const struct net_device *dev;
+ u32 max_segs = 1;
+
+ rcu_read_lock();
+ dev = dst_dev_rcu(dst);
+ sk->sk_route_caps = dev->features;
+ if (sk_is_tcp(sk)) {
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ sk->sk_route_caps |= NETIF_F_GSO;
+ icsk->icsk_ack.dst_quick_ack = dst_metric(dst, RTAX_QUICKACK);
}
+ if (sk->sk_route_caps & NETIF_F_GSO)
+ sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
+ if (unlikely(sk->sk_gso_disabled))
+ sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
+ if (sk_can_gso(sk)) {
+ if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
+ sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
+ } else {
+ sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
+ sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dev);
+ /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
+ max_segs = max_t(u32, READ_ONCE(dev->gso_max_segs), 1);
+ }
+ }
+ sk->sk_gso_max_segs = max_segs;
+ sk_dst_set(sk, dst);
+ rcu_read_unlock();
}
+EXPORT_SYMBOL_GPL(sk_setup_caps);
/*
* Simple resource managers for sockets.
*/
-/*
- * Write buffer destructor automatically called from kfree_skb.
+/*
+ * Write buffer destructor automatically called from kfree_skb.
*/
void sock_wfree(struct sk_buff *skb)
{
+ unsigned int len = skb->truesize;
struct sock *sk = skb->sk;
+ bool free;
+ int old;
+
+ if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
+ if (sock_flag(sk, SOCK_RCU_FREE) &&
+ sk->sk_write_space == sock_def_write_space) {
+ rcu_read_lock();
+ free = __refcount_sub_and_test(len, &sk->sk_wmem_alloc,
+ &old);
+ sock_def_write_space_wfree(sk, old - len);
+ rcu_read_unlock();
+ if (unlikely(free))
+ __sk_free(sk);
+ return;
+ }
- /* In case it might be waiting for more memory. */
- atomic_sub(skb->truesize, &sk->sk_wmem_alloc);
- if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE))
+ /*
+ * Keep a reference on sk_wmem_alloc, this will be released
+ * after sk_write_space() call
+ */
+ WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
sk->sk_write_space(sk);
- sock_put(sk);
+ len = 1;
+ }
+ /*
+ * if sk_wmem_alloc reaches 0, we must finish what sk_free()
+ * could not do because of in-flight packets
+ */
+ if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
+ __sk_free(sk);
}
+EXPORT_SYMBOL(sock_wfree);
-/*
- * Read buffer destructor automatically called from kfree_skb.
+/* This variant of sock_wfree() is used by TCP,
+ * since it sets SOCK_USE_WRITE_QUEUE.
*/
-void sock_rfree(struct sk_buff *skb)
+void __sock_wfree(struct sk_buff *skb)
{
struct sock *sk = skb->sk;
- atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
+ if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
+ __sk_free(sk);
+}
+
+void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
+{
+ int old_wmem;
+
+ skb_orphan(skb);
+#ifdef CONFIG_INET
+ if (unlikely(!sk_fullsock(sk)))
+ return skb_set_owner_edemux(skb, sk);
+#endif
+ skb->sk = sk;
+ skb->destructor = sock_wfree;
+ skb_set_hash_from_sk(skb, sk);
+ /*
+ * We used to take a refcount on sk, but following operation
+ * is enough to guarantee sk_free() won't free this sock until
+ * all in-flight packets are completed
+ */
+ __refcount_add(skb->truesize, &sk->sk_wmem_alloc, &old_wmem);
+
+ /* (old_wmem == SK_WMEM_ALLOC_BIAS) if no other TX packet for this socket
+ * is in a host queue (qdisc, NIC queue).
+ * Set skb->ooo_okay so that netdev_pick_tx() can choose a TX queue
+ * based on XPS for better performance.
+ * Otherwise clear ooo_okay to not risk Out Of Order delivery.
+ */
+ skb->ooo_okay = (old_wmem == SK_WMEM_ALLOC_BIAS);
}
+EXPORT_SYMBOL(skb_set_owner_w);
+
+static bool can_skb_orphan_partial(const struct sk_buff *skb)
+{
+ /* Drivers depend on in-order delivery for crypto offload,
+ * partial orphan breaks out-of-order-OK logic.
+ */
+ if (skb_is_decrypted(skb))
+ return false;
+ return (skb->destructor == sock_wfree ||
+ (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
+}
-int sock_i_uid(struct sock *sk)
+/* This helper is used by netem, as it can hold packets in its
+ * delay queue. We want to allow the owner socket to send more
+ * packets, as if they were already TX completed by a typical driver.
+ * But we also want to keep skb->sk set because some packet schedulers
+ * rely on it (sch_fq for example).
+ */
+void skb_orphan_partial(struct sk_buff *skb)
{
- int uid;
+ if (skb_is_tcp_pure_ack(skb))
+ return;
+
+ if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
+ return;
- read_lock(&sk->sk_callback_lock);
- uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
- read_unlock(&sk->sk_callback_lock);
- return uid;
+ skb_orphan(skb);
}
+EXPORT_SYMBOL(skb_orphan_partial);
-unsigned long sock_i_ino(struct sock *sk)
+/*
+ * Read buffer destructor automatically called from kfree_skb.
+ */
+void sock_rfree(struct sk_buff *skb)
{
- unsigned long ino;
+ struct sock *sk = skb->sk;
+ unsigned int len = skb->truesize;
- read_lock(&sk->sk_callback_lock);
- ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
- read_unlock(&sk->sk_callback_lock);
- return ino;
+ atomic_sub(len, &sk->sk_rmem_alloc);
+ sk_mem_uncharge(sk, len);
}
+EXPORT_SYMBOL(sock_rfree);
+
+/*
+ * Buffer destructor for skbs that are not used directly in read or write
+ * path, e.g. for error handler skbs. Automatically called from kfree_skb.
+ */
+void sock_efree(struct sk_buff *skb)
+{
+ sock_put(skb->sk);
+}
+EXPORT_SYMBOL(sock_efree);
+
+/* Buffer destructor for prefetch/receive path where reference count may
+ * not be held, e.g. for listen sockets.
+ */
+#ifdef CONFIG_INET
+void sock_pfree(struct sk_buff *skb)
+{
+ struct sock *sk = skb->sk;
+
+ if (!sk_is_refcounted(sk))
+ return;
+
+ if (sk->sk_state == TCP_NEW_SYN_RECV && inet_reqsk(sk)->syncookie) {
+ inet_reqsk(sk)->rsk_listener = NULL;
+ reqsk_free(inet_reqsk(sk));
+ return;
+ }
+
+ sock_gen_put(sk);
+}
+EXPORT_SYMBOL(sock_pfree);
+#endif /* CONFIG_INET */
/*
* Allocate a skb from the socket's send buffer.
@@ -1051,8 +2823,10 @@ unsigned long sock_i_ino(struct sock *sk)
struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
gfp_t priority)
{
- if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
- struct sk_buff * skb = alloc_skb(size, priority);
+ if (force ||
+ refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
+ struct sk_buff *skb = alloc_skb(size, priority);
+
if (skb) {
skb_set_owner_w(skb, sk);
return skb;
@@ -1060,33 +2834,47 @@ struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
}
return NULL;
}
+EXPORT_SYMBOL(sock_wmalloc);
-/*
- * Allocate a skb from the socket's receive buffer.
- */
-struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
+static void sock_ofree(struct sk_buff *skb)
+{
+ struct sock *sk = skb->sk;
+
+ atomic_sub(skb->truesize, &sk->sk_omem_alloc);
+}
+
+struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
gfp_t priority)
{
- if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
- struct sk_buff *skb = alloc_skb(size, priority);
- if (skb) {
- skb_set_owner_r(skb, sk);
- return skb;
- }
- }
- return NULL;
+ struct sk_buff *skb;
+
+ /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
+ if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
+ READ_ONCE(sock_net(sk)->core.sysctl_optmem_max))
+ return NULL;
+
+ skb = alloc_skb(size, priority);
+ if (!skb)
+ return NULL;
+
+ atomic_add(skb->truesize, &sk->sk_omem_alloc);
+ skb->sk = sk;
+ skb->destructor = sock_ofree;
+ return skb;
}
-/*
+/*
* Allocate a memory block from the socket's option memory buffer.
- */
+ */
void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
{
- if ((unsigned)size <= sysctl_optmem_max &&
- atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
+ int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);
+
+ if ((unsigned int)size <= optmem_max &&
+ atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
void *mem;
/* First do the add, to avoid the race if kmalloc
- * might sleep.
+ * might sleep.
*/
atomic_add(size, &sk->sk_omem_alloc);
mem = kmalloc(size, priority);
@@ -1096,40 +2884,76 @@ void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
}
return NULL;
}
+EXPORT_SYMBOL(sock_kmalloc);
/*
- * Free an option memory block.
+ * Duplicate the input "src" memory block using the socket's
+ * option memory buffer.
*/
-void sock_kfree_s(struct sock *sk, void *mem, int size)
+void *sock_kmemdup(struct sock *sk, const void *src,
+ int size, gfp_t priority)
{
- kfree(mem);
+ void *mem;
+
+ mem = sock_kmalloc(sk, size, priority);
+ if (mem)
+ memcpy(mem, src, size);
+ return mem;
+}
+EXPORT_SYMBOL(sock_kmemdup);
+
+/* Free an option memory block. Note, we actually want the inline
+ * here as this allows gcc to detect the nullify and fold away the
+ * condition entirely.
+ */
+static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
+ const bool nullify)
+{
+ if (WARN_ON_ONCE(!mem))
+ return;
+ if (nullify)
+ kfree_sensitive(mem);
+ else
+ kfree(mem);
atomic_sub(size, &sk->sk_omem_alloc);
}
+void sock_kfree_s(struct sock *sk, void *mem, int size)
+{
+ __sock_kfree_s(sk, mem, size, false);
+}
+EXPORT_SYMBOL(sock_kfree_s);
+
+void sock_kzfree_s(struct sock *sk, void *mem, int size)
+{
+ __sock_kfree_s(sk, mem, size, true);
+}
+EXPORT_SYMBOL(sock_kzfree_s);
+
/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
I think, these locks should be removed for datagram sockets.
*/
-static long sock_wait_for_wmem(struct sock * sk, long timeo)
+static long sock_wait_for_wmem(struct sock *sk, long timeo)
{
DEFINE_WAIT(wait);
- clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+ sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
for (;;) {
if (!timeo)
break;
if (signal_pending(current))
break;
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
- prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
- if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
break;
- if (sk->sk_shutdown & SEND_SHUTDOWN)
+ if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
break;
- if (sk->sk_err)
+ if (READ_ONCE(sk->sk_err))
break;
timeo = schedule_timeout(timeo);
}
- finish_wait(sk->sk_sleep, &wait);
+ finish_wait(sk_sleep(sk), &wait);
return timeo;
}
@@ -1138,71 +2962,28 @@ static long sock_wait_for_wmem(struct sock * sk, long timeo)
* Generic send/receive buffer handlers
*/
-static struct sk_buff *sock_alloc_send_pskb(struct sock *sk,
- unsigned long header_len,
- unsigned long data_len,
- int noblock, int *errcode)
+struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
+ unsigned long data_len, int noblock,
+ int *errcode, int max_page_order)
{
struct sk_buff *skb;
- gfp_t gfp_mask;
long timeo;
int err;
- gfp_mask = sk->sk_allocation;
- if (gfp_mask & __GFP_WAIT)
- gfp_mask |= __GFP_REPEAT;
-
timeo = sock_sndtimeo(sk, noblock);
- while (1) {
+ for (;;) {
err = sock_error(sk);
if (err != 0)
goto failure;
err = -EPIPE;
- if (sk->sk_shutdown & SEND_SHUTDOWN)
+ if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
goto failure;
- if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
- skb = alloc_skb(header_len, gfp_mask);
- if (skb) {
- int npages;
- int i;
-
- /* No pages, we're done... */
- if (!data_len)
- break;
-
- npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
- skb->truesize += data_len;
- skb_shinfo(skb)->nr_frags = npages;
- for (i = 0; i < npages; i++) {
- struct page *page;
- skb_frag_t *frag;
-
- page = alloc_pages(sk->sk_allocation, 0);
- if (!page) {
- err = -ENOBUFS;
- skb_shinfo(skb)->nr_frags = i;
- kfree_skb(skb);
- goto failure;
- }
-
- frag = &skb_shinfo(skb)->frags[i];
- frag->page = page;
- frag->page_offset = 0;
- frag->size = (data_len >= PAGE_SIZE ?
- PAGE_SIZE :
- data_len);
- data_len -= PAGE_SIZE;
- }
-
- /* Full success... */
- break;
- }
- err = -ENOBUFS;
- goto failure;
- }
- set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+ if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
+ break;
+
+ sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
err = -EAGAIN;
if (!timeo)
@@ -1211,8 +2992,10 @@ static struct sk_buff *sock_alloc_send_pskb(struct sock *sk,
goto interrupted;
timeo = sock_wait_for_wmem(sk, timeo);
}
-
- skb_set_owner_w(skb, sk);
+ skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
+ errcode, sk->sk_allocation);
+ if (skb)
+ skb_set_owner_w(skb, sk);
return skb;
interrupted:
@@ -1221,83 +3004,475 @@ failure:
*errcode = err;
return NULL;
}
+EXPORT_SYMBOL(sock_alloc_send_pskb);
+
+int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
+ struct sockcm_cookie *sockc)
+{
+ u32 tsflags;
+
+ BUILD_BUG_ON(SOF_TIMESTAMPING_LAST == (1 << 31));
+
+ switch (cmsg->cmsg_type) {
+ case SO_MARK:
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
+ !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
+ return -EINVAL;
+ sockc->mark = *(u32 *)CMSG_DATA(cmsg);
+ break;
+ case SO_TIMESTAMPING_OLD:
+ case SO_TIMESTAMPING_NEW:
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
+ return -EINVAL;
+
+ tsflags = *(u32 *)CMSG_DATA(cmsg);
+ if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
+ return -EINVAL;
+
+ sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
+ sockc->tsflags |= tsflags;
+ break;
+ case SCM_TXTIME:
+ if (!sock_flag(sk, SOCK_TXTIME))
+ return -EINVAL;
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
+ return -EINVAL;
+ sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
+ break;
+ case SCM_TS_OPT_ID:
+ if (sk_is_tcp(sk))
+ return -EINVAL;
+ tsflags = READ_ONCE(sk->sk_tsflags);
+ if (!(tsflags & SOF_TIMESTAMPING_OPT_ID))
+ return -EINVAL;
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
+ return -EINVAL;
+ sockc->ts_opt_id = *(u32 *)CMSG_DATA(cmsg);
+ sockc->tsflags |= SOCKCM_FLAG_TS_OPT_ID;
+ break;
+ /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
+ case SCM_RIGHTS:
+ case SCM_CREDENTIALS:
+ break;
+ case SO_PRIORITY:
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
+ return -EINVAL;
+ if (!sk_set_prio_allowed(sk, *(u32 *)CMSG_DATA(cmsg)))
+ return -EPERM;
+ sockc->priority = *(u32 *)CMSG_DATA(cmsg);
+ break;
+ case SCM_DEVMEM_DMABUF:
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
+ return -EINVAL;
+ sockc->dmabuf_id = *(u32 *)CMSG_DATA(cmsg);
+ break;
+ default:
+ return -EINVAL;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(__sock_cmsg_send);
-struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
- int noblock, int *errcode)
+int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
+ struct sockcm_cookie *sockc)
{
- return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
+ struct cmsghdr *cmsg;
+ int ret;
+
+ for_each_cmsghdr(cmsg, msg) {
+ if (!CMSG_OK(msg, cmsg))
+ return -EINVAL;
+ if (cmsg->cmsg_level != SOL_SOCKET)
+ continue;
+ ret = __sock_cmsg_send(sk, cmsg, sockc);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(sock_cmsg_send);
+
+static void sk_enter_memory_pressure(struct sock *sk)
+{
+ if (!sk->sk_prot->enter_memory_pressure)
+ return;
+
+ sk->sk_prot->enter_memory_pressure(sk);
+}
+
+static void sk_leave_memory_pressure(struct sock *sk)
+{
+ if (sk->sk_prot->leave_memory_pressure) {
+ INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure,
+ tcp_leave_memory_pressure, sk);
+ } else {
+ unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
+
+ if (memory_pressure && READ_ONCE(*memory_pressure))
+ WRITE_ONCE(*memory_pressure, 0);
+ }
}
-static void __lock_sock(struct sock *sk)
+DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
+
+/**
+ * skb_page_frag_refill - check that a page_frag contains enough room
+ * @sz: minimum size of the fragment we want to get
+ * @pfrag: pointer to page_frag
+ * @gfp: priority for memory allocation
+ *
+ * Note: While this allocator tries to use high order pages, there is
+ * no guarantee that allocations succeed. Therefore, @sz MUST be
+ * less or equal than PAGE_SIZE.
+ */
+bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
+{
+ if (pfrag->page) {
+ if (page_ref_count(pfrag->page) == 1) {
+ pfrag->offset = 0;
+ return true;
+ }
+ if (pfrag->offset + sz <= pfrag->size)
+ return true;
+ put_page(pfrag->page);
+ }
+
+ pfrag->offset = 0;
+ if (SKB_FRAG_PAGE_ORDER &&
+ !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
+ /* Avoid direct reclaim but allow kswapd to wake */
+ pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
+ __GFP_COMP | __GFP_NOWARN |
+ __GFP_NORETRY,
+ SKB_FRAG_PAGE_ORDER);
+ if (likely(pfrag->page)) {
+ pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
+ return true;
+ }
+ }
+ pfrag->page = alloc_page(gfp);
+ if (likely(pfrag->page)) {
+ pfrag->size = PAGE_SIZE;
+ return true;
+ }
+ return false;
+}
+EXPORT_SYMBOL(skb_page_frag_refill);
+
+bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
+{
+ if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
+ return true;
+
+ if (!sk->sk_bypass_prot_mem)
+ sk_enter_memory_pressure(sk);
+
+ sk_stream_moderate_sndbuf(sk);
+
+ return false;
+}
+EXPORT_SYMBOL(sk_page_frag_refill);
+
+void __lock_sock(struct sock *sk)
+ __releases(&sk->sk_lock.slock)
+ __acquires(&sk->sk_lock.slock)
{
DEFINE_WAIT(wait);
- for(;;) {
+ for (;;) {
prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
TASK_UNINTERRUPTIBLE);
spin_unlock_bh(&sk->sk_lock.slock);
schedule();
spin_lock_bh(&sk->sk_lock.slock);
- if(!sock_owned_by_user(sk))
+ if (!sock_owned_by_user(sk))
break;
}
finish_wait(&sk->sk_lock.wq, &wait);
}
-static void __release_sock(struct sock *sk)
+void __release_sock(struct sock *sk)
+ __releases(&sk->sk_lock.slock)
+ __acquires(&sk->sk_lock.slock)
{
- struct sk_buff *skb = sk->sk_backlog.head;
+ struct sk_buff *skb, *next;
+ int nb = 0;
- do {
+ while ((skb = sk->sk_backlog.head) != NULL) {
sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
- bh_unlock_sock(sk);
-
- do {
- struct sk_buff *next = skb->next;
- skb->next = NULL;
- sk->sk_backlog_rcv(sk, skb);
+ spin_unlock_bh(&sk->sk_lock.slock);
- /*
- * We are in process context here with softirqs
- * disabled, use cond_resched_softirq() to preempt.
- * This is safe to do because we've taken the backlog
- * queue private:
- */
- cond_resched_softirq();
+ while (1) {
+ next = skb->next;
+ prefetch(next);
+ DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb));
+ skb_mark_not_on_list(skb);
+ sk_backlog_rcv(sk, skb);
skb = next;
- } while (skb != NULL);
+ if (!skb)
+ break;
- bh_lock_sock(sk);
- } while((skb = sk->sk_backlog.head) != NULL);
+ if (!(++nb & 15))
+ cond_resched();
+ }
+
+ spin_lock_bh(&sk->sk_lock.slock);
+ }
+
+ /*
+ * Doing the zeroing here guarantee we can not loop forever
+ * while a wild producer attempts to flood us.
+ */
+ sk->sk_backlog.len = 0;
+}
+
+void __sk_flush_backlog(struct sock *sk)
+{
+ spin_lock_bh(&sk->sk_lock.slock);
+ __release_sock(sk);
+
+ if (sk->sk_prot->release_cb)
+ INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
+ tcp_release_cb, sk);
+
+ spin_unlock_bh(&sk->sk_lock.slock);
}
+EXPORT_SYMBOL_GPL(__sk_flush_backlog);
/**
* sk_wait_data - wait for data to arrive at sk_receive_queue
* @sk: sock to wait on
* @timeo: for how long
+ * @skb: last skb seen on sk_receive_queue
*
* Now socket state including sk->sk_err is changed only under lock,
* hence we may omit checks after joining wait queue.
* We check receive queue before schedule() only as optimization;
* it is very likely that release_sock() added new data.
*/
-int sk_wait_data(struct sock *sk, long *timeo)
+int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
{
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
int rc;
- DEFINE_WAIT(wait);
- prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
- set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
- rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
- clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
- finish_wait(sk->sk_sleep, &wait);
+ add_wait_queue(sk_sleep(sk), &wait);
+ sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+ rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
+ sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+ remove_wait_queue(sk_sleep(sk), &wait);
return rc;
}
-
EXPORT_SYMBOL(sk_wait_data);
+/**
+ * __sk_mem_raise_allocated - increase memory_allocated
+ * @sk: socket
+ * @size: memory size to allocate
+ * @amt: pages to allocate
+ * @kind: allocation type
+ *
+ * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc.
+ *
+ * Unlike the globally shared limits among the sockets under same protocol,
+ * consuming the budget of a memcg won't have direct effect on other ones.
+ * So be optimistic about memcg's tolerance, and leave the callers to decide
+ * whether or not to raise allocated through sk_under_memory_pressure() or
+ * its variants.
+ */
+int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
+{
+ bool memcg_enabled = false, charged = false;
+ struct proto *prot = sk->sk_prot;
+ long allocated = 0;
+
+ if (!sk->sk_bypass_prot_mem) {
+ sk_memory_allocated_add(sk, amt);
+ allocated = sk_memory_allocated(sk);
+ }
+
+ if (mem_cgroup_sk_enabled(sk)) {
+ memcg_enabled = true;
+ charged = mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge());
+ if (!charged)
+ goto suppress_allocation;
+ }
+
+ if (!allocated)
+ return 1;
+
+ /* Under limit. */
+ if (allocated <= sk_prot_mem_limits(sk, 0)) {
+ sk_leave_memory_pressure(sk);
+ return 1;
+ }
+
+ /* Under pressure. */
+ if (allocated > sk_prot_mem_limits(sk, 1))
+ sk_enter_memory_pressure(sk);
+
+ /* Over hard limit. */
+ if (allocated > sk_prot_mem_limits(sk, 2))
+ goto suppress_allocation;
+
+ /* Guarantee minimum buffer size under pressure (either global
+ * or memcg) to make sure features described in RFC 7323 (TCP
+ * Extensions for High Performance) work properly.
+ *
+ * This rule does NOT stand when exceeds global or memcg's hard
+ * limit, or else a DoS attack can be taken place by spawning
+ * lots of sockets whose usage are under minimum buffer size.
+ */
+ if (kind == SK_MEM_RECV) {
+ if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
+ return 1;
+
+ } else { /* SK_MEM_SEND */
+ int wmem0 = sk_get_wmem0(sk, prot);
+
+ if (sk->sk_type == SOCK_STREAM) {
+ if (sk->sk_wmem_queued < wmem0)
+ return 1;
+ } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
+ return 1;
+ }
+ }
+
+ if (sk_has_memory_pressure(sk)) {
+ u64 alloc;
+
+ /* The following 'average' heuristic is within the
+ * scope of global accounting, so it only makes
+ * sense for global memory pressure.
+ */
+ if (!sk_under_global_memory_pressure(sk))
+ return 1;
+
+ /* Try to be fair among all the sockets under global
+ * pressure by allowing the ones that below average
+ * usage to raise.
+ */
+ alloc = sk_sockets_allocated_read_positive(sk);
+ if (sk_prot_mem_limits(sk, 2) > alloc *
+ sk_mem_pages(sk->sk_wmem_queued +
+ atomic_read(&sk->sk_rmem_alloc) +
+ sk->sk_forward_alloc))
+ return 1;
+ }
+
+suppress_allocation:
+
+ if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
+ sk_stream_moderate_sndbuf(sk);
+
+ /* Fail only if socket is _under_ its sndbuf.
+ * In this case we cannot block, so that we have to fail.
+ */
+ if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
+ /* Force charge with __GFP_NOFAIL */
+ if (memcg_enabled && !charged)
+ mem_cgroup_sk_charge(sk, amt,
+ gfp_memcg_charge() | __GFP_NOFAIL);
+ return 1;
+ }
+ }
+
+ trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
+
+ if (allocated)
+ sk_memory_allocated_sub(sk, amt);
+
+ if (charged)
+ mem_cgroup_sk_uncharge(sk, amt);
+
+ return 0;
+}
+
+/**
+ * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
+ * @sk: socket
+ * @size: memory size to allocate
+ * @kind: allocation type
+ *
+ * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
+ * rmem allocation. This function assumes that protocols which have
+ * memory_pressure use sk_wmem_queued as write buffer accounting.
+ */
+int __sk_mem_schedule(struct sock *sk, int size, int kind)
+{
+ int ret, amt = sk_mem_pages(size);
+
+ sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
+ ret = __sk_mem_raise_allocated(sk, size, amt, kind);
+ if (!ret)
+ sk_forward_alloc_add(sk, -(amt << PAGE_SHIFT));
+ return ret;
+}
+EXPORT_SYMBOL(__sk_mem_schedule);
+
+/**
+ * __sk_mem_reduce_allocated - reclaim memory_allocated
+ * @sk: socket
+ * @amount: number of quanta
+ *
+ * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
+ */
+void __sk_mem_reduce_allocated(struct sock *sk, int amount)
+{
+ if (mem_cgroup_sk_enabled(sk))
+ mem_cgroup_sk_uncharge(sk, amount);
+
+ if (sk->sk_bypass_prot_mem)
+ return;
+
+ sk_memory_allocated_sub(sk, amount);
+
+ if (sk_under_global_memory_pressure(sk) &&
+ (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
+ sk_leave_memory_pressure(sk);
+}
+
+/**
+ * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
+ * @sk: socket
+ * @amount: number of bytes (rounded down to a PAGE_SIZE multiple)
+ */
+void __sk_mem_reclaim(struct sock *sk, int amount)
+{
+ amount >>= PAGE_SHIFT;
+ sk_forward_alloc_add(sk, -(amount << PAGE_SHIFT));
+ __sk_mem_reduce_allocated(sk, amount);
+}
+EXPORT_SYMBOL(__sk_mem_reclaim);
+
+void __sk_charge(struct sock *sk, gfp_t gfp)
+{
+ int amt;
+
+ gfp |= __GFP_NOFAIL;
+ if (mem_cgroup_from_sk(sk)) {
+ /* The socket has not been accepted yet, no need
+ * to look at newsk->sk_wmem_queued.
+ */
+ amt = sk_mem_pages(sk->sk_forward_alloc +
+ atomic_read(&sk->sk_rmem_alloc));
+ if (amt)
+ mem_cgroup_sk_charge(sk, amt, gfp);
+ }
+
+ kmem_cache_charge(sk, gfp);
+}
+
+int sk_set_peek_off(struct sock *sk, int val)
+{
+ WRITE_ONCE(sk->sk_peek_off, val);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(sk_set_peek_off);
+
/*
* Set of default routines for initialising struct proto_ops when
* the protocol does not support a particular function. In certain
@@ -1305,94 +3480,96 @@ EXPORT_SYMBOL(sk_wait_data);
* function, some default processing is provided.
*/
-int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
+int sock_no_bind(struct socket *sock, struct sockaddr_unsized *saddr, int len)
{
return -EOPNOTSUPP;
}
+EXPORT_SYMBOL(sock_no_bind);
-int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
+int sock_no_connect(struct socket *sock, struct sockaddr_unsized *saddr,
int len, int flags)
{
return -EOPNOTSUPP;
}
+EXPORT_SYMBOL(sock_no_connect);
int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
return -EOPNOTSUPP;
}
+EXPORT_SYMBOL(sock_no_socketpair);
-int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
+int sock_no_accept(struct socket *sock, struct socket *newsock,
+ struct proto_accept_arg *arg)
{
return -EOPNOTSUPP;
}
+EXPORT_SYMBOL(sock_no_accept);
-int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
- int *len, int peer)
+int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
+ int peer)
{
return -EOPNOTSUPP;
}
-
-unsigned int sock_no_poll(struct file * file, struct socket *sock, poll_table *pt)
-{
- return 0;
-}
+EXPORT_SYMBOL(sock_no_getname);
int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
return -EOPNOTSUPP;
}
+EXPORT_SYMBOL(sock_no_ioctl);
int sock_no_listen(struct socket *sock, int backlog)
{
return -EOPNOTSUPP;
}
+EXPORT_SYMBOL(sock_no_listen);
int sock_no_shutdown(struct socket *sock, int how)
{
return -EOPNOTSUPP;
}
+EXPORT_SYMBOL(sock_no_shutdown);
-int sock_no_setsockopt(struct socket *sock, int level, int optname,
- char __user *optval, int optlen)
-{
- return -EOPNOTSUPP;
-}
-
-int sock_no_getsockopt(struct socket *sock, int level, int optname,
- char __user *optval, int __user *optlen)
+int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
{
return -EOPNOTSUPP;
}
+EXPORT_SYMBOL(sock_no_sendmsg);
-int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
- size_t len)
+int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
{
return -EOPNOTSUPP;
}
+EXPORT_SYMBOL(sock_no_sendmsg_locked);
-int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
- size_t len, int flags)
+int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
+ int flags)
{
return -EOPNOTSUPP;
}
+EXPORT_SYMBOL(sock_no_recvmsg);
int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
/* Mirror missing mmap method error code */
return -ENODEV;
}
+EXPORT_SYMBOL(sock_no_mmap);
-ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
+/*
+ * When a file is received (via SCM_RIGHTS, etc), we must bump the
+ * various sock-based usage counts.
+ */
+void __receive_sock(struct file *file)
{
- ssize_t res;
- struct msghdr msg = {.msg_flags = flags};
- struct kvec iov;
- char *kaddr = kmap(page);
- iov.iov_base = kaddr + offset;
- iov.iov_len = size;
- res = kernel_sendmsg(sock, &msg, &iov, 1, size);
- kunmap(page);
- return res;
+ struct socket *sock;
+
+ sock = sock_from_file(file);
+ if (sock) {
+ sock_update_netprioidx(&sock->sk->sk_cgrp_data);
+ sock_update_classid(&sock->sk->sk_cgrp_data);
+ }
}
/*
@@ -1401,60 +3578,98 @@ ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, siz
static void sock_def_wakeup(struct sock *sk)
{
- read_lock(&sk->sk_callback_lock);
- if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
- wake_up_interruptible_all(sk->sk_sleep);
- read_unlock(&sk->sk_callback_lock);
+ struct socket_wq *wq;
+
+ rcu_read_lock();
+ wq = rcu_dereference(sk->sk_wq);
+ if (skwq_has_sleeper(wq))
+ wake_up_interruptible_all(&wq->wait);
+ rcu_read_unlock();
}
static void sock_def_error_report(struct sock *sk)
{
- read_lock(&sk->sk_callback_lock);
- if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
- wake_up_interruptible(sk->sk_sleep);
- sk_wake_async(sk,0,POLL_ERR);
- read_unlock(&sk->sk_callback_lock);
+ struct socket_wq *wq;
+
+ rcu_read_lock();
+ wq = rcu_dereference(sk->sk_wq);
+ if (skwq_has_sleeper(wq))
+ wake_up_interruptible_poll(&wq->wait, EPOLLERR);
+ sk_wake_async_rcu(sk, SOCK_WAKE_IO, POLL_ERR);
+ rcu_read_unlock();
}
-static void sock_def_readable(struct sock *sk, int len)
+void sock_def_readable(struct sock *sk)
{
- read_lock(&sk->sk_callback_lock);
- if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
- wake_up_interruptible(sk->sk_sleep);
- sk_wake_async(sk,1,POLL_IN);
- read_unlock(&sk->sk_callback_lock);
+ struct socket_wq *wq;
+
+ trace_sk_data_ready(sk);
+
+ rcu_read_lock();
+ wq = rcu_dereference(sk->sk_wq);
+ if (skwq_has_sleeper(wq))
+ wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
+ EPOLLRDNORM | EPOLLRDBAND);
+ sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN);
+ rcu_read_unlock();
}
static void sock_def_write_space(struct sock *sk)
{
- read_lock(&sk->sk_callback_lock);
+ struct socket_wq *wq;
+
+ rcu_read_lock();
/* Do not wake up a writer until he can make "significant"
* progress. --DaveM
*/
- if((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
- if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
- wake_up_interruptible(sk->sk_sleep);
+ if (sock_writeable(sk)) {
+ wq = rcu_dereference(sk->sk_wq);
+ if (skwq_has_sleeper(wq))
+ wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
+ EPOLLWRNORM | EPOLLWRBAND);
/* Should agree with poll, otherwise some programs break */
- if (sock_writeable(sk))
- sk_wake_async(sk, 2, POLL_OUT);
+ sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
}
- read_unlock(&sk->sk_callback_lock);
+ rcu_read_unlock();
+}
+
+/* An optimised version of sock_def_write_space(), should only be called
+ * for SOCK_RCU_FREE sockets under RCU read section and after putting
+ * ->sk_wmem_alloc.
+ */
+static void sock_def_write_space_wfree(struct sock *sk, int wmem_alloc)
+{
+ /* Do not wake up a writer until he can make "significant"
+ * progress. --DaveM
+ */
+ if (__sock_writeable(sk, wmem_alloc)) {
+ struct socket_wq *wq = rcu_dereference(sk->sk_wq);
+
+ /* rely on refcount_sub from sock_wfree() */
+ smp_mb__after_atomic();
+ if (wq && waitqueue_active(&wq->wait))
+ wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
+ EPOLLWRNORM | EPOLLWRBAND);
+
+ /* Should agree with poll, otherwise some programs break */
+ sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
+ }
}
static void sock_def_destruct(struct sock *sk)
{
- kfree(sk->sk_protinfo);
}
void sk_send_sigurg(struct sock *sk)
{
if (sk->sk_socket && sk->sk_socket->file)
- if (send_sigurg(&sk->sk_socket->file->f_owner))
- sk_wake_async(sk, 3, POLL_PRI);
+ if (send_sigurg(sk->sk_socket->file))
+ sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
}
+EXPORT_SYMBOL(sk_send_sigurg);
void sk_reset_timer(struct sock *sk, struct timer_list* timer,
unsigned long expires)
@@ -1462,50 +3677,46 @@ void sk_reset_timer(struct sock *sk, struct timer_list* timer,
if (!mod_timer(timer, expires))
sock_hold(sk);
}
-
EXPORT_SYMBOL(sk_reset_timer);
void sk_stop_timer(struct sock *sk, struct timer_list* timer)
{
- if (timer_pending(timer) && del_timer(timer))
+ if (timer_delete(timer))
__sock_put(sk);
}
-
EXPORT_SYMBOL(sk_stop_timer);
-void sock_init_data(struct socket *sock, struct sock *sk)
+void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
{
- skb_queue_head_init(&sk->sk_receive_queue);
- skb_queue_head_init(&sk->sk_write_queue);
- skb_queue_head_init(&sk->sk_error_queue);
-#ifdef CONFIG_NET_DMA
- skb_queue_head_init(&sk->sk_async_wait_queue);
-#endif
+ if (timer_delete_sync(timer))
+ __sock_put(sk);
+}
+EXPORT_SYMBOL(sk_stop_timer_sync);
+void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
+{
+ sk_init_common(sk);
sk->sk_send_head = NULL;
- init_timer(&sk->sk_timer);
-
+ timer_setup(&sk->sk_timer, NULL, 0);
+
sk->sk_allocation = GFP_KERNEL;
- sk->sk_rcvbuf = sysctl_rmem_default;
- sk->sk_sndbuf = sysctl_wmem_default;
+ sk->sk_rcvbuf = READ_ONCE(sysctl_rmem_default);
+ sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default);
sk->sk_state = TCP_CLOSE;
- sk->sk_socket = sock;
+ sk->sk_use_task_frag = true;
+ sk_set_socket(sk, sock);
sock_set_flag(sk, SOCK_ZAPPED);
- if(sock)
- {
+ if (sock) {
sk->sk_type = sock->type;
- sk->sk_sleep = &sock->wait;
+ RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
sock->sk = sk;
- } else
- sk->sk_sleep = NULL;
-
- rwlock_init(&sk->sk_dst_lock);
- rwlock_init(&sk->sk_callback_lock);
- lockdep_set_class(&sk->sk_callback_lock,
- af_callback_keys + sk->sk_family);
+ } else {
+ RCU_INIT_POINTER(sk->sk_wq, NULL);
+ }
+ sk->sk_uid = uid;
sk->sk_state_change = sock_def_wakeup;
sk->sk_data_ready = sock_def_readable;
@@ -1513,160 +3724,262 @@ void sock_init_data(struct socket *sock, struct sock *sk)
sk->sk_error_report = sock_def_error_report;
sk->sk_destruct = sock_def_destruct;
- sk->sk_sndmsg_page = NULL;
- sk->sk_sndmsg_off = 0;
+ sk->sk_frag.page = NULL;
+ sk->sk_frag.offset = 0;
+ sk->sk_peek_off = -1;
+
+ sk->sk_peer_pid = NULL;
+ sk->sk_peer_cred = NULL;
+ spin_lock_init(&sk->sk_peer_lock);
- sk->sk_peercred.pid = 0;
- sk->sk_peercred.uid = -1;
- sk->sk_peercred.gid = -1;
sk->sk_write_pending = 0;
sk->sk_rcvlowat = 1;
sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
- sk->sk_stamp.tv_sec = -1L;
- sk->sk_stamp.tv_usec = -1L;
+ sk->sk_stamp = SK_DEFAULT_STAMP;
+#if BITS_PER_LONG==32
+ seqlock_init(&sk->sk_stamp_seq);
+#endif
+ atomic_set(&sk->sk_zckey, 0);
+
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ sk->sk_napi_id = 0;
+ sk->sk_ll_usec = READ_ONCE(sysctl_net_busy_read);
+#endif
+
+ sk->sk_max_pacing_rate = ~0UL;
+ sk->sk_pacing_rate = ~0UL;
+ WRITE_ONCE(sk->sk_pacing_shift, 10);
+ sk->sk_incoming_cpu = -1;
+
+ sk_rx_queue_clear(sk);
+ /*
+ * Before updating sk_refcnt, we must commit prior changes to memory
+ * (Documentation/RCU/rculist_nulls.rst for details)
+ */
+ smp_wmb();
+ refcount_set(&sk->sk_refcnt, 1);
+ sk_drops_reset(sk);
+}
+EXPORT_SYMBOL(sock_init_data_uid);
+
+void sock_init_data(struct socket *sock, struct sock *sk)
+{
+ kuid_t uid = sock ?
+ SOCK_INODE(sock)->i_uid :
+ make_kuid(sock_net(sk)->user_ns, 0);
- atomic_set(&sk->sk_refcnt, 1);
+ sock_init_data_uid(sock, sk, uid);
}
+EXPORT_SYMBOL(sock_init_data);
-void fastcall lock_sock_nested(struct sock *sk, int subclass)
+void lock_sock_nested(struct sock *sk, int subclass)
{
+ /* The sk_lock has mutex_lock() semantics here. */
+ mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
+
might_sleep();
spin_lock_bh(&sk->sk_lock.slock);
- if (sk->sk_lock.owner)
+ if (sock_owned_by_user_nocheck(sk))
__lock_sock(sk);
- sk->sk_lock.owner = (void *)1;
- spin_unlock(&sk->sk_lock.slock);
- /*
- * The sk_lock has mutex_lock() semantics here:
- */
- mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
- local_bh_enable();
+ sk->sk_lock.owned = 1;
+ spin_unlock_bh(&sk->sk_lock.slock);
}
-
EXPORT_SYMBOL(lock_sock_nested);
-void fastcall release_sock(struct sock *sk)
+void release_sock(struct sock *sk)
{
- /*
- * The sk_lock has mutex_unlock() semantics:
- */
- mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
-
spin_lock_bh(&sk->sk_lock.slock);
if (sk->sk_backlog.tail)
__release_sock(sk);
- sk->sk_lock.owner = NULL;
+
+ if (sk->sk_prot->release_cb)
+ INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
+ tcp_release_cb, sk);
+
+ sock_release_ownership(sk);
if (waitqueue_active(&sk->sk_lock.wq))
wake_up(&sk->sk_lock.wq);
spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL(release_sock);
-int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
-{
- if (!sock_flag(sk, SOCK_TIMESTAMP))
- sock_enable_timestamp(sk);
- if (sk->sk_stamp.tv_sec == -1)
+bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
+{
+ might_sleep();
+ spin_lock_bh(&sk->sk_lock.slock);
+
+ if (!sock_owned_by_user_nocheck(sk)) {
+ /*
+ * Fast path return with bottom halves disabled and
+ * sock::sk_lock.slock held.
+ *
+ * The 'mutex' is not contended and holding
+ * sock::sk_lock.slock prevents all other lockers to
+ * proceed so the corresponding unlock_sock_fast() can
+ * avoid the slow path of release_sock() completely and
+ * just release slock.
+ *
+ * From a semantical POV this is equivalent to 'acquiring'
+ * the 'mutex', hence the corresponding lockdep
+ * mutex_release() has to happen in the fast path of
+ * unlock_sock_fast().
+ */
+ return false;
+ }
+
+ __lock_sock(sk);
+ sk->sk_lock.owned = 1;
+ __acquire(&sk->sk_lock.slock);
+ spin_unlock_bh(&sk->sk_lock.slock);
+ return true;
+}
+EXPORT_SYMBOL(__lock_sock_fast);
+
+int sock_gettstamp(struct socket *sock, void __user *userstamp,
+ bool timeval, bool time32)
+{
+ struct sock *sk = sock->sk;
+ struct timespec64 ts;
+
+ sock_enable_timestamp(sk, SOCK_TIMESTAMP);
+ ts = ktime_to_timespec64(sock_read_timestamp(sk));
+ if (ts.tv_sec == -1)
return -ENOENT;
- if (sk->sk_stamp.tv_sec == 0)
- do_gettimeofday(&sk->sk_stamp);
- return copy_to_user(userstamp, &sk->sk_stamp, sizeof(struct timeval)) ?
- -EFAULT : 0;
-}
-EXPORT_SYMBOL(sock_get_timestamp);
-
-void sock_enable_timestamp(struct sock *sk)
-{
- if (!sock_flag(sk, SOCK_TIMESTAMP)) {
- sock_set_flag(sk, SOCK_TIMESTAMP);
- net_enable_timestamp();
+ if (ts.tv_sec == 0) {
+ ktime_t kt = ktime_get_real();
+ sock_write_timestamp(sk, kt);
+ ts = ktime_to_timespec64(kt);
+ }
+
+ if (timeval)
+ ts.tv_nsec /= 1000;
+
+#ifdef CONFIG_COMPAT_32BIT_TIME
+ if (time32)
+ return put_old_timespec32(&ts, userstamp);
+#endif
+#ifdef CONFIG_SPARC64
+ /* beware of padding in sparc64 timeval */
+ if (timeval && !in_compat_syscall()) {
+ struct __kernel_old_timeval __user tv = {
+ .tv_sec = ts.tv_sec,
+ .tv_usec = ts.tv_nsec,
+ };
+ if (copy_to_user(userstamp, &tv, sizeof(tv)))
+ return -EFAULT;
+ return 0;
+ }
+#endif
+ return put_timespec64(&ts, userstamp);
+}
+EXPORT_SYMBOL(sock_gettstamp);
+
+void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
+{
+ if (!sock_flag(sk, flag)) {
+ unsigned long previous_flags = sk->sk_flags;
+
+ sock_set_flag(sk, flag);
+ /*
+ * we just set one of the two flags which require net
+ * time stamping, but time stamping might have been on
+ * already because of the other one
+ */
+ if (sock_needs_netstamp(sk) &&
+ !(previous_flags & SK_FLAGS_TIMESTAMP))
+ net_enable_timestamp();
}
}
-EXPORT_SYMBOL(sock_enable_timestamp);
+
+int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
+ int level, int type)
+{
+ struct sock_exterr_skb *serr;
+ struct sk_buff *skb;
+ int copied, err;
+
+ err = -EAGAIN;
+ skb = sock_dequeue_err_skb(sk);
+ if (skb == NULL)
+ goto out;
+
+ copied = skb->len;
+ if (copied > len) {
+ msg->msg_flags |= MSG_TRUNC;
+ copied = len;
+ }
+ err = skb_copy_datagram_msg(skb, 0, msg, copied);
+ if (err)
+ goto out_free_skb;
+
+ sock_recv_timestamp(msg, sk, skb);
+
+ serr = SKB_EXT_ERR(skb);
+ put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
+
+ msg->msg_flags |= MSG_ERRQUEUE;
+ err = copied;
+
+out_free_skb:
+ kfree_skb(skb);
+out:
+ return err;
+}
+EXPORT_SYMBOL(sock_recv_errqueue);
/*
* Get a socket option on an socket.
*
* FIX: POSIX 1003.1g is very ambiguous here. It states that
* asynchronous errors should be reported by getsockopt. We assume
- * this means if you specify SO_ERROR (otherwise whats the point of it).
+ * this means if you specify SO_ERROR (otherwise what is the point of it).
*/
int sock_common_getsockopt(struct socket *sock, int level, int optname,
char __user *optval, int __user *optlen)
{
struct sock *sk = sock->sk;
- return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
+ /* IPV6_ADDRFORM can change sk->sk_prot under us. */
+ return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen);
}
-
EXPORT_SYMBOL(sock_common_getsockopt);
-#ifdef CONFIG_COMPAT
-int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
- char __user *optval, int __user *optlen)
-{
- struct sock *sk = sock->sk;
-
- if (sk->sk_prot->compat_setsockopt != NULL)
- return sk->sk_prot->compat_getsockopt(sk, level, optname,
- optval, optlen);
- return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
-}
-EXPORT_SYMBOL(compat_sock_common_getsockopt);
-#endif
-
-int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
- struct msghdr *msg, size_t size, int flags)
+int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
+ int flags)
{
struct sock *sk = sock->sk;
int addr_len = 0;
int err;
- err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
- flags & ~MSG_DONTWAIT, &addr_len);
+ err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len);
if (err >= 0)
msg->msg_namelen = addr_len;
return err;
}
-
EXPORT_SYMBOL(sock_common_recvmsg);
/*
* Set socket options on an inet socket.
*/
int sock_common_setsockopt(struct socket *sock, int level, int optname,
- char __user *optval, int optlen)
+ sockptr_t optval, unsigned int optlen)
{
struct sock *sk = sock->sk;
- return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
+ /* IPV6_ADDRFORM can change sk->sk_prot under us. */
+ return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen);
}
-
EXPORT_SYMBOL(sock_common_setsockopt);
-#ifdef CONFIG_COMPAT
-int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
- char __user *optval, int optlen)
-{
- struct sock *sk = sock->sk;
-
- if (sk->sk_prot->compat_setsockopt != NULL)
- return sk->sk_prot->compat_setsockopt(sk, level, optname,
- optval, optlen);
- return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
-}
-EXPORT_SYMBOL(compat_sock_common_setsockopt);
-#endif
-
void sk_common_release(struct sock *sk)
{
if (sk->sk_prot->destroy)
sk->sk_prot->destroy(sk);
/*
- * Observation: when sock_common_release is called, processes have
+ * Observation: when sk_common_release is called, processes have
* no access to socket. But net still has.
* Step one, detach it from networking:
*
@@ -1691,184 +4004,321 @@ void sk_common_release(struct sock *sk)
xfrm_sk_free_policy(sk);
- sk_refcnt_debug_release(sk);
sock_put(sk);
}
-
EXPORT_SYMBOL(sk_common_release);
-static DEFINE_RWLOCK(proto_list_lock);
-static LIST_HEAD(proto_list);
+void sk_get_meminfo(const struct sock *sk, u32 *mem)
+{
+ memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
+
+ mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
+ mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
+ mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
+ mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
+ mem[SK_MEMINFO_FWD_ALLOC] = READ_ONCE(sk->sk_forward_alloc);
+ mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
+ mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
+ mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
+ mem[SK_MEMINFO_DROPS] = sk_drops_read(sk);
+}
-int proto_register(struct proto *prot, int alloc_slab)
+#ifdef CONFIG_PROC_FS
+static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
+
+int sock_prot_inuse_get(struct net *net, struct proto *prot)
{
- char *request_sock_slab_name = NULL;
- char *timewait_sock_slab_name;
- int rc = -ENOBUFS;
+ int cpu, idx = prot->inuse_idx;
+ int res = 0;
- if (alloc_slab) {
- prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
- SLAB_HWCACHE_ALIGN, NULL, NULL);
+ for_each_possible_cpu(cpu)
+ res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
- if (prot->slab == NULL) {
- printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
- prot->name);
- goto out;
- }
+ return res >= 0 ? res : 0;
+}
+EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
- if (prot->rsk_prot != NULL) {
- static const char mask[] = "request_sock_%s";
+int sock_inuse_get(struct net *net)
+{
+ int cpu, res = 0;
- request_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
- if (request_sock_slab_name == NULL)
- goto out_free_sock_slab;
+ for_each_possible_cpu(cpu)
+ res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;
- sprintf(request_sock_slab_name, mask, prot->name);
- prot->rsk_prot->slab = kmem_cache_create(request_sock_slab_name,
- prot->rsk_prot->obj_size, 0,
- SLAB_HWCACHE_ALIGN, NULL, NULL);
+ return res;
+}
- if (prot->rsk_prot->slab == NULL) {
- printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
- prot->name);
- goto out_free_request_sock_slab_name;
- }
- }
+EXPORT_SYMBOL_GPL(sock_inuse_get);
- if (prot->twsk_prot != NULL) {
- static const char mask[] = "tw_sock_%s";
+static int __net_init sock_inuse_init_net(struct net *net)
+{
+ net->core.prot_inuse = alloc_percpu(struct prot_inuse);
+ if (net->core.prot_inuse == NULL)
+ return -ENOMEM;
+ return 0;
+}
- timewait_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
+static void __net_exit sock_inuse_exit_net(struct net *net)
+{
+ free_percpu(net->core.prot_inuse);
+}
- if (timewait_sock_slab_name == NULL)
- goto out_free_request_sock_slab;
+static struct pernet_operations net_inuse_ops = {
+ .init = sock_inuse_init_net,
+ .exit = sock_inuse_exit_net,
+};
- sprintf(timewait_sock_slab_name, mask, prot->name);
- prot->twsk_prot->twsk_slab =
- kmem_cache_create(timewait_sock_slab_name,
- prot->twsk_prot->twsk_obj_size,
- 0, SLAB_HWCACHE_ALIGN,
- NULL, NULL);
- if (prot->twsk_prot->twsk_slab == NULL)
- goto out_free_timewait_sock_slab_name;
- }
- }
+static __init int net_inuse_init(void)
+{
+ if (register_pernet_subsys(&net_inuse_ops))
+ panic("Cannot initialize net inuse counters");
- write_lock(&proto_list_lock);
- list_add(&prot->node, &proto_list);
- write_unlock(&proto_list_lock);
- rc = 0;
-out:
- return rc;
-out_free_timewait_sock_slab_name:
- kfree(timewait_sock_slab_name);
-out_free_request_sock_slab:
- if (prot->rsk_prot && prot->rsk_prot->slab) {
- kmem_cache_destroy(prot->rsk_prot->slab);
- prot->rsk_prot->slab = NULL;
- }
-out_free_request_sock_slab_name:
- kfree(request_sock_slab_name);
-out_free_sock_slab:
- kmem_cache_destroy(prot->slab);
- prot->slab = NULL;
- goto out;
+ return 0;
}
-EXPORT_SYMBOL(proto_register);
+core_initcall(net_inuse_init);
-void proto_unregister(struct proto *prot)
+static int assign_proto_idx(struct proto *prot)
{
- write_lock(&proto_list_lock);
- list_del(&prot->node);
- write_unlock(&proto_list_lock);
+ prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
- if (prot->slab != NULL) {
- kmem_cache_destroy(prot->slab);
- prot->slab = NULL;
+ if (unlikely(prot->inuse_idx == PROTO_INUSE_NR)) {
+ pr_err("PROTO_INUSE_NR exhausted\n");
+ return -ENOSPC;
}
- if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
- const char *name = kmem_cache_name(prot->rsk_prot->slab);
-
- kmem_cache_destroy(prot->rsk_prot->slab);
- kfree(name);
- prot->rsk_prot->slab = NULL;
- }
+ set_bit(prot->inuse_idx, proto_inuse_idx);
+ return 0;
+}
- if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
- const char *name = kmem_cache_name(prot->twsk_prot->twsk_slab);
+static void release_proto_idx(struct proto *prot)
+{
+ if (prot->inuse_idx != PROTO_INUSE_NR)
+ clear_bit(prot->inuse_idx, proto_inuse_idx);
+}
+#else
+static inline int assign_proto_idx(struct proto *prot)
+{
+ return 0;
+}
- kmem_cache_destroy(prot->twsk_prot->twsk_slab);
- kfree(name);
- prot->twsk_prot->twsk_slab = NULL;
- }
+static inline void release_proto_idx(struct proto *prot)
+{
}
-EXPORT_SYMBOL(proto_unregister);
+#endif
-#ifdef CONFIG_PROC_FS
-static inline struct proto *__proto_head(void)
+static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
{
- return list_entry(proto_list.next, struct proto, node);
+ if (!twsk_prot)
+ return;
+ kfree(twsk_prot->twsk_slab_name);
+ twsk_prot->twsk_slab_name = NULL;
+ kmem_cache_destroy(twsk_prot->twsk_slab);
+ twsk_prot->twsk_slab = NULL;
}
-static inline struct proto *proto_head(void)
+static int tw_prot_init(const struct proto *prot)
{
- return list_empty(&proto_list) ? NULL : __proto_head();
+ struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
+
+ if (!twsk_prot)
+ return 0;
+
+ twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
+ prot->name);
+ if (!twsk_prot->twsk_slab_name)
+ return -ENOMEM;
+
+ twsk_prot->twsk_slab =
+ kmem_cache_create(twsk_prot->twsk_slab_name,
+ twsk_prot->twsk_obj_size, 0,
+ SLAB_ACCOUNT | prot->slab_flags,
+ NULL);
+ if (!twsk_prot->twsk_slab) {
+ pr_crit("%s: Can't create timewait sock SLAB cache!\n",
+ prot->name);
+ return -ENOMEM;
+ }
+
+ return 0;
}
-static inline struct proto *proto_next(struct proto *proto)
+static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
{
- return proto->node.next == &proto_list ? NULL :
- list_entry(proto->node.next, struct proto, node);
+ if (!rsk_prot)
+ return;
+ kfree(rsk_prot->slab_name);
+ rsk_prot->slab_name = NULL;
+ kmem_cache_destroy(rsk_prot->slab);
+ rsk_prot->slab = NULL;
}
-static inline struct proto *proto_get_idx(loff_t pos)
+static int req_prot_init(const struct proto *prot)
{
- struct proto *proto;
- loff_t i = 0;
+ struct request_sock_ops *rsk_prot = prot->rsk_prot;
- list_for_each_entry(proto, &proto_list, node)
- if (i++ == pos)
+ if (!rsk_prot)
+ return 0;
+
+ rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
+ prot->name);
+ if (!rsk_prot->slab_name)
+ return -ENOMEM;
+
+ rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
+ rsk_prot->obj_size, 0,
+ SLAB_ACCOUNT | prot->slab_flags,
+ NULL);
+
+ if (!rsk_prot->slab) {
+ pr_crit("%s: Can't create request sock SLAB cache!\n",
+ prot->name);
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+int proto_register(struct proto *prot, int alloc_slab)
+{
+ int ret = -ENOBUFS;
+
+ if (prot->memory_allocated && !prot->sysctl_mem) {
+ pr_err("%s: missing sysctl_mem\n", prot->name);
+ return -EINVAL;
+ }
+ if (prot->memory_allocated && !prot->per_cpu_fw_alloc) {
+ pr_err("%s: missing per_cpu_fw_alloc\n", prot->name);
+ return -EINVAL;
+ }
+ if (alloc_slab) {
+ prot->slab = kmem_cache_create_usercopy(prot->name,
+ prot->obj_size, 0,
+ SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
+ prot->slab_flags,
+ prot->useroffset, prot->usersize,
+ NULL);
+
+ if (prot->slab == NULL) {
+ pr_crit("%s: Can't create sock SLAB cache!\n",
+ prot->name);
goto out;
+ }
+
+ if (req_prot_init(prot))
+ goto out_free_request_sock_slab;
- proto = NULL;
+ if (tw_prot_init(prot))
+ goto out_free_timewait_sock_slab;
+ }
+
+ mutex_lock(&proto_list_mutex);
+ ret = assign_proto_idx(prot);
+ if (ret) {
+ mutex_unlock(&proto_list_mutex);
+ goto out_free_timewait_sock_slab;
+ }
+ list_add(&prot->node, &proto_list);
+ mutex_unlock(&proto_list_mutex);
+ return ret;
+
+out_free_timewait_sock_slab:
+ if (alloc_slab)
+ tw_prot_cleanup(prot->twsk_prot);
+out_free_request_sock_slab:
+ if (alloc_slab) {
+ req_prot_cleanup(prot->rsk_prot);
+
+ kmem_cache_destroy(prot->slab);
+ prot->slab = NULL;
+ }
out:
- return proto;
+ return ret;
}
+EXPORT_SYMBOL(proto_register);
+
+void proto_unregister(struct proto *prot)
+{
+ mutex_lock(&proto_list_mutex);
+ release_proto_idx(prot);
+ list_del(&prot->node);
+ mutex_unlock(&proto_list_mutex);
+ kmem_cache_destroy(prot->slab);
+ prot->slab = NULL;
+
+ req_prot_cleanup(prot->rsk_prot);
+ tw_prot_cleanup(prot->twsk_prot);
+}
+EXPORT_SYMBOL(proto_unregister);
+
+int sock_load_diag_module(int family, int protocol)
+{
+ if (!protocol) {
+ if (!sock_is_registered(family))
+ return -ENOENT;
+
+ return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
+ NETLINK_SOCK_DIAG, family);
+ }
+
+#ifdef CONFIG_INET
+ if (family == AF_INET &&
+ protocol != IPPROTO_RAW &&
+ protocol < MAX_INET_PROTOS &&
+ !rcu_access_pointer(inet_protos[protocol]))
+ return -ENOENT;
+#endif
+
+ return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
+ NETLINK_SOCK_DIAG, family, protocol);
+}
+EXPORT_SYMBOL(sock_load_diag_module);
+
+#ifdef CONFIG_PROC_FS
static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(proto_list_mutex)
{
- read_lock(&proto_list_lock);
- return *pos ? proto_get_idx(*pos - 1) : SEQ_START_TOKEN;
+ mutex_lock(&proto_list_mutex);
+ return seq_list_start_head(&proto_list, *pos);
}
static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
- ++*pos;
- return v == SEQ_START_TOKEN ? proto_head() : proto_next(v);
+ return seq_list_next(v, &proto_list, pos);
}
static void proto_seq_stop(struct seq_file *seq, void *v)
+ __releases(proto_list_mutex)
{
- read_unlock(&proto_list_lock);
+ mutex_unlock(&proto_list_mutex);
}
static char proto_method_implemented(const void *method)
{
return method == NULL ? 'n' : 'y';
}
+static long sock_prot_memory_allocated(struct proto *proto)
+{
+ return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
+}
+
+static const char *sock_prot_memory_pressure(struct proto *proto)
+{
+ return proto->memory_pressure != NULL ?
+ proto_memory_pressure(proto) ? "yes" : "no" : "NI";
+}
static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
{
- seq_printf(seq, "%-9s %4u %6d %6d %-3s %6u %-3s %-10s "
- "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
+
+ seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
+ "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
proto->name,
proto->obj_size,
- proto->sockets_allocated != NULL ? atomic_read(proto->sockets_allocated) : -1,
- proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
- proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
+ sock_prot_inuse_get(seq_file_net(seq), proto),
+ sock_prot_memory_allocated(proto),
+ sock_prot_memory_pressure(proto),
proto->max_header,
proto->slab == NULL ? "no" : "yes",
module_name(proto->owner),
@@ -1884,7 +4334,6 @@ static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
proto_method_implemented(proto->getsockopt),
proto_method_implemented(proto->sendmsg),
proto_method_implemented(proto->recvmsg),
- proto_method_implemented(proto->sendpage),
proto_method_implemented(proto->bind),
proto_method_implemented(proto->backlog_rcv),
proto_method_implemented(proto->hash),
@@ -1895,7 +4344,7 @@ static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
static int proto_seq_show(struct seq_file *seq, void *v)
{
- if (v == SEQ_START_TOKEN)
+ if (v == &proto_list)
seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
"protocol",
"size",
@@ -1905,72 +4354,196 @@ static int proto_seq_show(struct seq_file *seq, void *v)
"maxhdr",
"slab",
"module",
- "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
+ "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n");
else
- proto_seq_printf(seq, v);
+ proto_seq_printf(seq, list_entry(v, struct proto, node));
return 0;
}
-static struct seq_operations proto_seq_ops = {
+static const struct seq_operations proto_seq_ops = {
.start = proto_seq_start,
.next = proto_seq_next,
.stop = proto_seq_stop,
.show = proto_seq_show,
};
-static int proto_seq_open(struct inode *inode, struct file *file)
+static __net_init int proto_init_net(struct net *net)
+{
+ if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
+ sizeof(struct seq_net_private)))
+ return -ENOMEM;
+
+ return 0;
+}
+
+static __net_exit void proto_exit_net(struct net *net)
{
- return seq_open(file, &proto_seq_ops);
+ remove_proc_entry("protocols", net->proc_net);
}
-static struct file_operations proto_seq_fops = {
- .owner = THIS_MODULE,
- .open = proto_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
+
+static __net_initdata struct pernet_operations proto_net_ops = {
+ .init = proto_init_net,
+ .exit = proto_exit_net,
};
static int __init proto_init(void)
{
- /* register /proc/net/protocols */
- return proc_net_fops_create("protocols", S_IRUGO, &proto_seq_fops) == NULL ? -ENOBUFS : 0;
+ return register_pernet_subsys(&proto_net_ops);
}
subsys_initcall(proto_init);
#endif /* PROC_FS */
-EXPORT_SYMBOL(sk_alloc);
-EXPORT_SYMBOL(sk_free);
-EXPORT_SYMBOL(sk_send_sigurg);
-EXPORT_SYMBOL(sock_alloc_send_skb);
-EXPORT_SYMBOL(sock_init_data);
-EXPORT_SYMBOL(sock_kfree_s);
-EXPORT_SYMBOL(sock_kmalloc);
-EXPORT_SYMBOL(sock_no_accept);
-EXPORT_SYMBOL(sock_no_bind);
-EXPORT_SYMBOL(sock_no_connect);
-EXPORT_SYMBOL(sock_no_getname);
-EXPORT_SYMBOL(sock_no_getsockopt);
-EXPORT_SYMBOL(sock_no_ioctl);
-EXPORT_SYMBOL(sock_no_listen);
-EXPORT_SYMBOL(sock_no_mmap);
-EXPORT_SYMBOL(sock_no_poll);
-EXPORT_SYMBOL(sock_no_recvmsg);
-EXPORT_SYMBOL(sock_no_sendmsg);
-EXPORT_SYMBOL(sock_no_sendpage);
-EXPORT_SYMBOL(sock_no_setsockopt);
-EXPORT_SYMBOL(sock_no_shutdown);
-EXPORT_SYMBOL(sock_no_socketpair);
-EXPORT_SYMBOL(sock_rfree);
-EXPORT_SYMBOL(sock_setsockopt);
-EXPORT_SYMBOL(sock_wfree);
-EXPORT_SYMBOL(sock_wmalloc);
-EXPORT_SYMBOL(sock_i_uid);
-EXPORT_SYMBOL(sock_i_ino);
-EXPORT_SYMBOL(sysctl_optmem_max);
-#ifdef CONFIG_SYSCTL
-EXPORT_SYMBOL(sysctl_rmem_max);
-EXPORT_SYMBOL(sysctl_wmem_max);
+#ifdef CONFIG_NET_RX_BUSY_POLL
+bool sk_busy_loop_end(void *p, unsigned long start_time)
+{
+ struct sock *sk = p;
+
+ if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
+ return true;
+
+ if (sk_is_udp(sk) &&
+ !skb_queue_empty_lockless(&udp_sk(sk)->reader_queue))
+ return true;
+
+ return sk_busy_loop_timeout(sk, start_time);
+}
+EXPORT_SYMBOL(sk_busy_loop_end);
+#endif /* CONFIG_NET_RX_BUSY_POLL */
+
+int sock_bind_add(struct sock *sk, struct sockaddr_unsized *addr, int addr_len)
+{
+ if (!sk->sk_prot->bind_add)
+ return -EOPNOTSUPP;
+ return sk->sk_prot->bind_add(sk, addr, addr_len);
+}
+EXPORT_SYMBOL(sock_bind_add);
+
+/* Copy 'size' bytes from userspace and return `size` back to userspace */
+int sock_ioctl_inout(struct sock *sk, unsigned int cmd,
+ void __user *arg, void *karg, size_t size)
+{
+ int ret;
+
+ if (copy_from_user(karg, arg, size))
+ return -EFAULT;
+
+ ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg);
+ if (ret)
+ return ret;
+
+ if (copy_to_user(arg, karg, size))
+ return -EFAULT;
+
+ return 0;
+}
+EXPORT_SYMBOL(sock_ioctl_inout);
+
+/* This is the most common ioctl prep function, where the result (4 bytes) is
+ * copied back to userspace if the ioctl() returns successfully. No input is
+ * copied from userspace as input argument.
+ */
+static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg)
+{
+ int ret, karg = 0;
+
+ ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg);
+ if (ret)
+ return ret;
+
+ return put_user(karg, (int __user *)arg);
+}
+
+/* A wrapper around sock ioctls, which copies the data from userspace
+ * (depending on the protocol/ioctl), and copies back the result to userspace.
+ * The main motivation for this function is to pass kernel memory to the
+ * protocol ioctl callbacks, instead of userspace memory.
+ */
+int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
+{
+ int rc = 1;
+
+ if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET)
+ rc = ipmr_sk_ioctl(sk, cmd, arg);
+ else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6)
+ rc = ip6mr_sk_ioctl(sk, cmd, arg);
+ else if (sk_is_phonet(sk))
+ rc = phonet_sk_ioctl(sk, cmd, arg);
+
+ /* If ioctl was processed, returns its value */
+ if (rc <= 0)
+ return rc;
+
+ /* Otherwise call the default handler */
+ return sock_ioctl_out(sk, cmd, arg);
+}
+EXPORT_SYMBOL(sk_ioctl);
+
+static int __init sock_struct_check(void)
+{
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_drops);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_peek_off);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_error_queue);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_receive_queue);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_backlog);
+
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_ifindex);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_cookie);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvbuf);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_filter);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_wq);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_data_ready);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvtimeo);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvlowat);
+
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_err);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_socket);
+#ifdef CONFIG_MEMCG
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg);
#endif
+
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_reserved_mem);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_forward_alloc);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_tsflags);
+
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_err_soft);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_queued);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_alloc);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tsq_flags);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_send_head);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_queue);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_pending);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_frag);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_timer);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_rate);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_zckey);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tskey);
+
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_pending_confirm);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_status);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_max_pacing_rate);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndtimeo);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_priority);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_mark);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_uid);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_protocol);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_cache);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_route_caps);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_type);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_size);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_allocation);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_txhash);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndbuf);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_segs);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_shift);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_use_task_frag);
+ return 0;
+}
+
+core_initcall(sock_struct_check);
diff --git a/net/core/sock_destructor.h b/net/core/sock_destructor.h
new file mode 100644
index 000000000000..2f396e6bfba5
--- /dev/null
+++ b/net/core/sock_destructor.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _NET_CORE_SOCK_DESTRUCTOR_H
+#define _NET_CORE_SOCK_DESTRUCTOR_H
+#include <net/tcp.h>
+
+static inline bool is_skb_wmem(const struct sk_buff *skb)
+{
+ return skb->destructor == sock_wfree ||
+ skb->destructor == __sock_wfree ||
+ (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree);
+}
+#endif
diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c
new file mode 100644
index 000000000000..026ce9bd9e5e
--- /dev/null
+++ b/net/core/sock_diag.c
@@ -0,0 +1,355 @@
+/* License: GPL */
+
+#include <linux/filter.h>
+#include <linux/mutex.h>
+#include <linux/socket.h>
+#include <linux/skbuff.h>
+#include <net/netlink.h>
+#include <net/net_namespace.h>
+#include <linux/module.h>
+#include <net/sock.h>
+#include <linux/kernel.h>
+#include <linux/tcp.h>
+#include <linux/workqueue.h>
+#include <linux/nospec.h>
+#include <linux/cookie.h>
+#include <linux/inet_diag.h>
+#include <linux/sock_diag.h>
+
+static const struct sock_diag_handler __rcu *sock_diag_handlers[AF_MAX];
+
+static const struct sock_diag_inet_compat __rcu *inet_rcv_compat;
+
+static struct workqueue_struct *broadcast_wq;
+
+DEFINE_COOKIE(sock_cookie);
+
+u64 __sock_gen_cookie(struct sock *sk)
+{
+ u64 res = atomic64_read(&sk->sk_cookie);
+
+ if (!res) {
+ u64 new = gen_cookie_next(&sock_cookie);
+
+ atomic64_cmpxchg(&sk->sk_cookie, res, new);
+
+ /* Another thread might have changed sk_cookie before us. */
+ res = atomic64_read(&sk->sk_cookie);
+ }
+ return res;
+}
+
+int sock_diag_check_cookie(struct sock *sk, const __u32 *cookie)
+{
+ u64 res;
+
+ if (cookie[0] == INET_DIAG_NOCOOKIE && cookie[1] == INET_DIAG_NOCOOKIE)
+ return 0;
+
+ res = sock_gen_cookie(sk);
+ if ((u32)res != cookie[0] || (u32)(res >> 32) != cookie[1])
+ return -ESTALE;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(sock_diag_check_cookie);
+
+void sock_diag_save_cookie(struct sock *sk, __u32 *cookie)
+{
+ u64 res = sock_gen_cookie(sk);
+
+ cookie[0] = (u32)res;
+ cookie[1] = (u32)(res >> 32);
+}
+EXPORT_SYMBOL_GPL(sock_diag_save_cookie);
+
+int sock_diag_put_meminfo(struct sock *sk, struct sk_buff *skb, int attrtype)
+{
+ u32 mem[SK_MEMINFO_VARS];
+
+ sk_get_meminfo(sk, mem);
+
+ return nla_put(skb, attrtype, sizeof(mem), &mem);
+}
+EXPORT_SYMBOL_GPL(sock_diag_put_meminfo);
+
+int sock_diag_put_filterinfo(bool may_report_filterinfo, struct sock *sk,
+ struct sk_buff *skb, int attrtype)
+{
+ struct sock_fprog_kern *fprog;
+ struct sk_filter *filter;
+ struct nlattr *attr;
+ unsigned int flen;
+ int err = 0;
+
+ if (!may_report_filterinfo) {
+ nla_reserve(skb, attrtype, 0);
+ return 0;
+ }
+
+ rcu_read_lock();
+ filter = rcu_dereference(sk->sk_filter);
+ if (!filter)
+ goto out;
+
+ fprog = filter->prog->orig_prog;
+ if (!fprog)
+ goto out;
+
+ flen = bpf_classic_proglen(fprog);
+
+ attr = nla_reserve(skb, attrtype, flen);
+ if (attr == NULL) {
+ err = -EMSGSIZE;
+ goto out;
+ }
+
+ memcpy(nla_data(attr), fprog->filter, flen);
+out:
+ rcu_read_unlock();
+ return err;
+}
+EXPORT_SYMBOL(sock_diag_put_filterinfo);
+
+struct broadcast_sk {
+ struct sock *sk;
+ struct work_struct work;
+};
+
+static size_t sock_diag_nlmsg_size(void)
+{
+ return NLMSG_ALIGN(sizeof(struct inet_diag_msg)
+ + nla_total_size(sizeof(u8)) /* INET_DIAG_PROTOCOL */
+ + nla_total_size_64bit(sizeof(struct tcp_info))); /* INET_DIAG_INFO */
+}
+
+static const struct sock_diag_handler *sock_diag_lock_handler(int family)
+{
+ const struct sock_diag_handler *handler;
+
+ rcu_read_lock();
+ handler = rcu_dereference(sock_diag_handlers[family]);
+ if (handler && !try_module_get(handler->owner))
+ handler = NULL;
+ rcu_read_unlock();
+
+ return handler;
+}
+
+static void sock_diag_unlock_handler(const struct sock_diag_handler *handler)
+{
+ module_put(handler->owner);
+}
+
+static void sock_diag_broadcast_destroy_work(struct work_struct *work)
+{
+ struct broadcast_sk *bsk =
+ container_of(work, struct broadcast_sk, work);
+ struct sock *sk = bsk->sk;
+ const struct sock_diag_handler *hndl;
+ struct sk_buff *skb;
+ const enum sknetlink_groups group = sock_diag_destroy_group(sk);
+ int err = -1;
+
+ WARN_ON(group == SKNLGRP_NONE);
+
+ skb = nlmsg_new(sock_diag_nlmsg_size(), GFP_KERNEL);
+ if (!skb)
+ goto out;
+
+ hndl = sock_diag_lock_handler(sk->sk_family);
+ if (hndl) {
+ if (hndl->get_info)
+ err = hndl->get_info(skb, sk);
+ sock_diag_unlock_handler(hndl);
+ }
+ if (!err)
+ nlmsg_multicast(sock_net(sk)->diag_nlsk, skb, 0, group,
+ GFP_KERNEL);
+ else
+ kfree_skb(skb);
+out:
+ sk_destruct(sk);
+ kfree(bsk);
+}
+
+void sock_diag_broadcast_destroy(struct sock *sk)
+{
+ /* Note, this function is often called from an interrupt context. */
+ struct broadcast_sk *bsk =
+ kmalloc(sizeof(struct broadcast_sk), GFP_ATOMIC);
+ if (!bsk)
+ return sk_destruct(sk);
+ bsk->sk = sk;
+ INIT_WORK(&bsk->work, sock_diag_broadcast_destroy_work);
+ queue_work(broadcast_wq, &bsk->work);
+}
+
+void sock_diag_register_inet_compat(const struct sock_diag_inet_compat *ptr)
+{
+ xchg(&inet_rcv_compat, RCU_INITIALIZER(ptr));
+}
+EXPORT_SYMBOL_GPL(sock_diag_register_inet_compat);
+
+void sock_diag_unregister_inet_compat(const struct sock_diag_inet_compat *ptr)
+{
+ const struct sock_diag_inet_compat *old;
+
+ old = unrcu_pointer(xchg(&inet_rcv_compat, NULL));
+ WARN_ON_ONCE(old != ptr);
+}
+EXPORT_SYMBOL_GPL(sock_diag_unregister_inet_compat);
+
+int sock_diag_register(const struct sock_diag_handler *hndl)
+{
+ int family = hndl->family;
+
+ if (family >= AF_MAX)
+ return -EINVAL;
+
+ return !cmpxchg((const struct sock_diag_handler **)
+ &sock_diag_handlers[family],
+ NULL, hndl) ? 0 : -EBUSY;
+}
+EXPORT_SYMBOL_GPL(sock_diag_register);
+
+void sock_diag_unregister(const struct sock_diag_handler *hndl)
+{
+ int family = hndl->family;
+
+ if (family >= AF_MAX)
+ return;
+
+ xchg((const struct sock_diag_handler **)&sock_diag_handlers[family],
+ NULL);
+}
+EXPORT_SYMBOL_GPL(sock_diag_unregister);
+
+static int __sock_diag_cmd(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ int err;
+ struct sock_diag_req *req = nlmsg_data(nlh);
+ const struct sock_diag_handler *hndl;
+
+ if (nlmsg_len(nlh) < sizeof(*req))
+ return -EINVAL;
+
+ if (req->sdiag_family >= AF_MAX)
+ return -EINVAL;
+ req->sdiag_family = array_index_nospec(req->sdiag_family, AF_MAX);
+
+ if (!rcu_access_pointer(sock_diag_handlers[req->sdiag_family]))
+ sock_load_diag_module(req->sdiag_family, 0);
+
+ hndl = sock_diag_lock_handler(req->sdiag_family);
+ if (hndl == NULL)
+ return -ENOENT;
+
+ if (nlh->nlmsg_type == SOCK_DIAG_BY_FAMILY)
+ err = hndl->dump(skb, nlh);
+ else if (nlh->nlmsg_type == SOCK_DESTROY && hndl->destroy)
+ err = hndl->destroy(skb, nlh);
+ else
+ err = -EOPNOTSUPP;
+ sock_diag_unlock_handler(hndl);
+
+ return err;
+}
+
+static int sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ const struct sock_diag_inet_compat *ptr;
+ int ret;
+
+ switch (nlh->nlmsg_type) {
+ case TCPDIAG_GETSOCK:
+ if (!rcu_access_pointer(inet_rcv_compat))
+ sock_load_diag_module(AF_INET, 0);
+
+ rcu_read_lock();
+ ptr = rcu_dereference(inet_rcv_compat);
+ if (ptr && !try_module_get(ptr->owner))
+ ptr = NULL;
+ rcu_read_unlock();
+
+ ret = -EOPNOTSUPP;
+ if (ptr) {
+ ret = ptr->fn(skb, nlh);
+ module_put(ptr->owner);
+ }
+
+ return ret;
+ case SOCK_DIAG_BY_FAMILY:
+ case SOCK_DESTROY:
+ return __sock_diag_cmd(skb, nlh);
+ default:
+ return -EINVAL;
+ }
+}
+
+static void sock_diag_rcv(struct sk_buff *skb)
+{
+ netlink_rcv_skb(skb, &sock_diag_rcv_msg);
+}
+
+static int sock_diag_bind(struct net *net, int group)
+{
+ switch (group) {
+ case SKNLGRP_INET_TCP_DESTROY:
+ case SKNLGRP_INET_UDP_DESTROY:
+ if (!rcu_access_pointer(sock_diag_handlers[AF_INET]))
+ sock_load_diag_module(AF_INET, 0);
+ break;
+ case SKNLGRP_INET6_TCP_DESTROY:
+ case SKNLGRP_INET6_UDP_DESTROY:
+ if (!rcu_access_pointer(sock_diag_handlers[AF_INET6]))
+ sock_load_diag_module(AF_INET6, 0);
+ break;
+ }
+ return 0;
+}
+
+int sock_diag_destroy(struct sock *sk, int err)
+{
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (!sk->sk_prot->diag_destroy)
+ return -EOPNOTSUPP;
+
+ return sk->sk_prot->diag_destroy(sk, err);
+}
+EXPORT_SYMBOL_GPL(sock_diag_destroy);
+
+static int __net_init diag_net_init(struct net *net)
+{
+ struct netlink_kernel_cfg cfg = {
+ .groups = SKNLGRP_MAX,
+ .input = sock_diag_rcv,
+ .bind = sock_diag_bind,
+ .flags = NL_CFG_F_NONROOT_RECV,
+ };
+
+ net->diag_nlsk = netlink_kernel_create(net, NETLINK_SOCK_DIAG, &cfg);
+ return net->diag_nlsk == NULL ? -ENOMEM : 0;
+}
+
+static void __net_exit diag_net_exit(struct net *net)
+{
+ netlink_kernel_release(net->diag_nlsk);
+ net->diag_nlsk = NULL;
+}
+
+static struct pernet_operations diag_net_ops = {
+ .init = diag_net_init,
+ .exit = diag_net_exit,
+};
+
+static int __init sock_diag_init(void)
+{
+ broadcast_wq = alloc_workqueue("sock_diag_events", WQ_PERCPU, 0);
+ BUG_ON(!broadcast_wq);
+ return register_pernet_subsys(&diag_net_ops);
+}
+device_initcall(sock_diag_init);
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
new file mode 100644
index 000000000000..5947b38e4f8b
--- /dev/null
+++ b/net/core/sock_map.c
@@ -0,0 +1,1959 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */
+
+#include <linux/bpf.h>
+#include <linux/btf_ids.h>
+#include <linux/filter.h>
+#include <linux/errno.h>
+#include <linux/file.h>
+#include <linux/net.h>
+#include <linux/workqueue.h>
+#include <linux/skmsg.h>
+#include <linux/list.h>
+#include <linux/jhash.h>
+#include <linux/sock_diag.h>
+#include <net/udp.h>
+
+struct bpf_stab {
+ struct bpf_map map;
+ struct sock **sks;
+ struct sk_psock_progs progs;
+ spinlock_t lock;
+};
+
+#define SOCK_CREATE_FLAG_MASK \
+ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+
+/* This mutex is used to
+ * - protect race between prog/link attach/detach and link prog update, and
+ * - protect race between releasing and accessing map in bpf_link.
+ * A single global mutex lock is used since it is expected contention is low.
+ */
+static DEFINE_MUTEX(sockmap_mutex);
+
+static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
+ struct bpf_prog *old, struct bpf_link *link,
+ u32 which);
+static struct sk_psock_progs *sock_map_progs(struct bpf_map *map);
+
+static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
+{
+ struct bpf_stab *stab;
+
+ if (attr->max_entries == 0 ||
+ attr->key_size != 4 ||
+ (attr->value_size != sizeof(u32) &&
+ attr->value_size != sizeof(u64)) ||
+ attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
+ return ERR_PTR(-EINVAL);
+
+ stab = bpf_map_area_alloc(sizeof(*stab), NUMA_NO_NODE);
+ if (!stab)
+ return ERR_PTR(-ENOMEM);
+
+ bpf_map_init_from_attr(&stab->map, attr);
+ spin_lock_init(&stab->lock);
+
+ stab->sks = bpf_map_area_alloc((u64) stab->map.max_entries *
+ sizeof(struct sock *),
+ stab->map.numa_node);
+ if (!stab->sks) {
+ bpf_map_area_free(stab);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ return &stab->map;
+}
+
+int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ struct bpf_map *map;
+ int ret;
+
+ if (attr->attach_flags || attr->replace_bpf_fd)
+ return -EINVAL;
+
+ CLASS(fd, f)(attr->target_fd);
+ map = __bpf_map_get(f);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+ mutex_lock(&sockmap_mutex);
+ ret = sock_map_prog_update(map, prog, NULL, NULL, attr->attach_type);
+ mutex_unlock(&sockmap_mutex);
+ return ret;
+}
+
+int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype)
+{
+ struct bpf_prog *prog;
+ struct bpf_map *map;
+ int ret;
+
+ if (attr->attach_flags || attr->replace_bpf_fd)
+ return -EINVAL;
+
+ CLASS(fd, f)(attr->target_fd);
+ map = __bpf_map_get(f);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ prog = bpf_prog_get(attr->attach_bpf_fd);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ if (prog->type != ptype) {
+ ret = -EINVAL;
+ goto put_prog;
+ }
+
+ mutex_lock(&sockmap_mutex);
+ ret = sock_map_prog_update(map, NULL, prog, NULL, attr->attach_type);
+ mutex_unlock(&sockmap_mutex);
+put_prog:
+ bpf_prog_put(prog);
+ return ret;
+}
+
+static void sock_map_sk_acquire(struct sock *sk)
+ __acquires(&sk->sk_lock.slock)
+{
+ lock_sock(sk);
+ rcu_read_lock();
+}
+
+static void sock_map_sk_release(struct sock *sk)
+ __releases(&sk->sk_lock.slock)
+{
+ rcu_read_unlock();
+ release_sock(sk);
+}
+
+static void sock_map_add_link(struct sk_psock *psock,
+ struct sk_psock_link *link,
+ struct bpf_map *map, void *link_raw)
+{
+ link->link_raw = link_raw;
+ link->map = map;
+ spin_lock_bh(&psock->link_lock);
+ list_add_tail(&link->list, &psock->link);
+ spin_unlock_bh(&psock->link_lock);
+}
+
+static void sock_map_del_link(struct sock *sk,
+ struct sk_psock *psock, void *link_raw)
+{
+ bool strp_stop = false, verdict_stop = false;
+ struct sk_psock_link *link, *tmp;
+
+ spin_lock_bh(&psock->link_lock);
+ list_for_each_entry_safe(link, tmp, &psock->link, list) {
+ if (link->link_raw == link_raw) {
+ struct bpf_map *map = link->map;
+ struct sk_psock_progs *progs = sock_map_progs(map);
+
+ if (psock->saved_data_ready && progs->stream_parser)
+ strp_stop = true;
+ if (psock->saved_data_ready && progs->stream_verdict)
+ verdict_stop = true;
+ if (psock->saved_data_ready && progs->skb_verdict)
+ verdict_stop = true;
+ list_del(&link->list);
+ sk_psock_free_link(link);
+ break;
+ }
+ }
+ spin_unlock_bh(&psock->link_lock);
+ if (strp_stop || verdict_stop) {
+ write_lock_bh(&sk->sk_callback_lock);
+ if (strp_stop)
+ sk_psock_stop_strp(sk, psock);
+ if (verdict_stop)
+ sk_psock_stop_verdict(sk, psock);
+
+ if (psock->psock_update_sk_prot)
+ psock->psock_update_sk_prot(sk, psock, false);
+ write_unlock_bh(&sk->sk_callback_lock);
+ }
+}
+
+static void sock_map_unref(struct sock *sk, void *link_raw)
+{
+ struct sk_psock *psock = sk_psock(sk);
+
+ if (likely(psock)) {
+ sock_map_del_link(sk, psock, link_raw);
+ sk_psock_put(sk, psock);
+ }
+}
+
+static int sock_map_init_proto(struct sock *sk, struct sk_psock *psock)
+{
+ if (!sk->sk_prot->psock_update_sk_prot)
+ return -EINVAL;
+ psock->psock_update_sk_prot = sk->sk_prot->psock_update_sk_prot;
+ return sk->sk_prot->psock_update_sk_prot(sk, psock, false);
+}
+
+static struct sk_psock *sock_map_psock_get_checked(struct sock *sk)
+{
+ struct sk_psock *psock;
+
+ rcu_read_lock();
+ psock = sk_psock(sk);
+ if (psock) {
+ if (sk->sk_prot->close != sock_map_close) {
+ psock = ERR_PTR(-EBUSY);
+ goto out;
+ }
+
+ if (!refcount_inc_not_zero(&psock->refcnt))
+ psock = ERR_PTR(-EBUSY);
+ }
+out:
+ rcu_read_unlock();
+ return psock;
+}
+
+static int sock_map_link(struct bpf_map *map, struct sock *sk)
+{
+ struct sk_psock_progs *progs = sock_map_progs(map);
+ struct bpf_prog *stream_verdict = NULL;
+ struct bpf_prog *stream_parser = NULL;
+ struct bpf_prog *skb_verdict = NULL;
+ struct bpf_prog *msg_parser = NULL;
+ struct sk_psock *psock;
+ int ret;
+
+ stream_verdict = READ_ONCE(progs->stream_verdict);
+ if (stream_verdict) {
+ stream_verdict = bpf_prog_inc_not_zero(stream_verdict);
+ if (IS_ERR(stream_verdict))
+ return PTR_ERR(stream_verdict);
+ }
+
+ stream_parser = READ_ONCE(progs->stream_parser);
+ if (stream_parser) {
+ stream_parser = bpf_prog_inc_not_zero(stream_parser);
+ if (IS_ERR(stream_parser)) {
+ ret = PTR_ERR(stream_parser);
+ goto out_put_stream_verdict;
+ }
+ }
+
+ msg_parser = READ_ONCE(progs->msg_parser);
+ if (msg_parser) {
+ msg_parser = bpf_prog_inc_not_zero(msg_parser);
+ if (IS_ERR(msg_parser)) {
+ ret = PTR_ERR(msg_parser);
+ goto out_put_stream_parser;
+ }
+ }
+
+ skb_verdict = READ_ONCE(progs->skb_verdict);
+ if (skb_verdict) {
+ skb_verdict = bpf_prog_inc_not_zero(skb_verdict);
+ if (IS_ERR(skb_verdict)) {
+ ret = PTR_ERR(skb_verdict);
+ goto out_put_msg_parser;
+ }
+ }
+
+ psock = sock_map_psock_get_checked(sk);
+ if (IS_ERR(psock)) {
+ ret = PTR_ERR(psock);
+ goto out_progs;
+ }
+
+ if (psock) {
+ if ((msg_parser && READ_ONCE(psock->progs.msg_parser)) ||
+ (stream_parser && READ_ONCE(psock->progs.stream_parser)) ||
+ (skb_verdict && READ_ONCE(psock->progs.skb_verdict)) ||
+ (skb_verdict && READ_ONCE(psock->progs.stream_verdict)) ||
+ (stream_verdict && READ_ONCE(psock->progs.skb_verdict)) ||
+ (stream_verdict && READ_ONCE(psock->progs.stream_verdict))) {
+ sk_psock_put(sk, psock);
+ ret = -EBUSY;
+ goto out_progs;
+ }
+ } else {
+ psock = sk_psock_init(sk, map->numa_node);
+ if (IS_ERR(psock)) {
+ ret = PTR_ERR(psock);
+ goto out_progs;
+ }
+ }
+
+ if (msg_parser)
+ psock_set_prog(&psock->progs.msg_parser, msg_parser);
+ if (stream_parser)
+ psock_set_prog(&psock->progs.stream_parser, stream_parser);
+ if (stream_verdict)
+ psock_set_prog(&psock->progs.stream_verdict, stream_verdict);
+ if (skb_verdict)
+ psock_set_prog(&psock->progs.skb_verdict, skb_verdict);
+
+ /* msg_* and stream_* programs references tracked in psock after this
+ * point. Reference dec and cleanup will occur through psock destructor
+ */
+ ret = sock_map_init_proto(sk, psock);
+ if (ret < 0) {
+ sk_psock_put(sk, psock);
+ goto out;
+ }
+
+ write_lock_bh(&sk->sk_callback_lock);
+ if (stream_parser && stream_verdict && !psock->saved_data_ready) {
+ if (sk_is_tcp(sk))
+ ret = sk_psock_init_strp(sk, psock);
+ else
+ ret = -EOPNOTSUPP;
+ if (ret) {
+ write_unlock_bh(&sk->sk_callback_lock);
+ sk_psock_put(sk, psock);
+ goto out;
+ }
+ sk_psock_start_strp(sk, psock);
+ } else if (!stream_parser && stream_verdict && !psock->saved_data_ready) {
+ sk_psock_start_verdict(sk,psock);
+ } else if (!stream_verdict && skb_verdict && !psock->saved_data_ready) {
+ sk_psock_start_verdict(sk, psock);
+ }
+ write_unlock_bh(&sk->sk_callback_lock);
+ return 0;
+out_progs:
+ if (skb_verdict)
+ bpf_prog_put(skb_verdict);
+out_put_msg_parser:
+ if (msg_parser)
+ bpf_prog_put(msg_parser);
+out_put_stream_parser:
+ if (stream_parser)
+ bpf_prog_put(stream_parser);
+out_put_stream_verdict:
+ if (stream_verdict)
+ bpf_prog_put(stream_verdict);
+out:
+ return ret;
+}
+
+static void sock_map_free(struct bpf_map *map)
+{
+ struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+ int i;
+
+ /* After the sync no updates or deletes will be in-flight so it
+ * is safe to walk map and remove entries without risking a race
+ * in EEXIST update case.
+ */
+ synchronize_rcu();
+ for (i = 0; i < stab->map.max_entries; i++) {
+ struct sock **psk = &stab->sks[i];
+ struct sock *sk;
+
+ sk = xchg(psk, NULL);
+ if (sk) {
+ sock_hold(sk);
+ lock_sock(sk);
+ rcu_read_lock();
+ sock_map_unref(sk, psk);
+ rcu_read_unlock();
+ release_sock(sk);
+ sock_put(sk);
+ }
+ }
+
+ /* wait for psock readers accessing its map link */
+ synchronize_rcu();
+
+ bpf_map_area_free(stab->sks);
+ bpf_map_area_free(stab);
+}
+
+static void sock_map_release_progs(struct bpf_map *map)
+{
+ psock_progs_drop(&container_of(map, struct bpf_stab, map)->progs);
+}
+
+static struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key)
+{
+ struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ if (unlikely(key >= map->max_entries))
+ return NULL;
+ return READ_ONCE(stab->sks[key]);
+}
+
+static void *sock_map_lookup(struct bpf_map *map, void *key)
+{
+ struct sock *sk;
+
+ sk = __sock_map_lookup_elem(map, *(u32 *)key);
+ if (!sk)
+ return NULL;
+ if (sk_is_refcounted(sk) && !refcount_inc_not_zero(&sk->sk_refcnt))
+ return NULL;
+ return sk;
+}
+
+static void *sock_map_lookup_sys(struct bpf_map *map, void *key)
+{
+ struct sock *sk;
+
+ if (map->value_size != sizeof(u64))
+ return ERR_PTR(-ENOSPC);
+
+ sk = __sock_map_lookup_elem(map, *(u32 *)key);
+ if (!sk)
+ return ERR_PTR(-ENOENT);
+
+ __sock_gen_cookie(sk);
+ return &sk->sk_cookie;
+}
+
+static int __sock_map_delete(struct bpf_stab *stab, struct sock *sk_test,
+ struct sock **psk)
+{
+ struct sock *sk = NULL;
+ int err = 0;
+
+ spin_lock_bh(&stab->lock);
+ if (!sk_test || sk_test == *psk)
+ sk = xchg(psk, NULL);
+
+ if (likely(sk))
+ sock_map_unref(sk, psk);
+ else
+ err = -EINVAL;
+
+ spin_unlock_bh(&stab->lock);
+ return err;
+}
+
+static void sock_map_delete_from_link(struct bpf_map *map, struct sock *sk,
+ void *link_raw)
+{
+ struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+
+ __sock_map_delete(stab, sk, link_raw);
+}
+
+static long sock_map_delete_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+ u32 i = *(u32 *)key;
+ struct sock **psk;
+
+ if (unlikely(i >= map->max_entries))
+ return -EINVAL;
+
+ psk = &stab->sks[i];
+ return __sock_map_delete(stab, NULL, psk);
+}
+
+static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next)
+{
+ struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+ u32 i = key ? *(u32 *)key : U32_MAX;
+ u32 *key_next = next;
+
+ if (i == stab->map.max_entries - 1)
+ return -ENOENT;
+ if (i >= stab->map.max_entries)
+ *key_next = 0;
+ else
+ *key_next = i + 1;
+ return 0;
+}
+
+static int sock_map_update_common(struct bpf_map *map, u32 idx,
+ struct sock *sk, u64 flags)
+{
+ struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+ struct sk_psock_link *link;
+ struct sk_psock *psock;
+ struct sock *osk;
+ int ret;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+ if (unlikely(flags > BPF_EXIST))
+ return -EINVAL;
+ if (unlikely(idx >= map->max_entries))
+ return -E2BIG;
+
+ link = sk_psock_init_link();
+ if (!link)
+ return -ENOMEM;
+
+ ret = sock_map_link(map, sk);
+ if (ret < 0)
+ goto out_free;
+
+ psock = sk_psock(sk);
+ WARN_ON_ONCE(!psock);
+
+ spin_lock_bh(&stab->lock);
+ osk = stab->sks[idx];
+ if (osk && flags == BPF_NOEXIST) {
+ ret = -EEXIST;
+ goto out_unlock;
+ } else if (!osk && flags == BPF_EXIST) {
+ ret = -ENOENT;
+ goto out_unlock;
+ }
+
+ sock_map_add_link(psock, link, map, &stab->sks[idx]);
+ stab->sks[idx] = sk;
+ if (osk)
+ sock_map_unref(osk, &stab->sks[idx]);
+ spin_unlock_bh(&stab->lock);
+ return 0;
+out_unlock:
+ spin_unlock_bh(&stab->lock);
+ if (psock)
+ sk_psock_put(sk, psock);
+out_free:
+ sk_psock_free_link(link);
+ return ret;
+}
+
+static bool sock_map_op_okay(const struct bpf_sock_ops_kern *ops)
+{
+ return ops->op == BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB ||
+ ops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB ||
+ ops->op == BPF_SOCK_OPS_TCP_LISTEN_CB;
+}
+
+static bool sock_map_redirect_allowed(const struct sock *sk)
+{
+ if (sk_is_tcp(sk))
+ return sk->sk_state != TCP_LISTEN;
+ else
+ return sk->sk_state == TCP_ESTABLISHED;
+}
+
+static bool sock_map_sk_is_suitable(const struct sock *sk)
+{
+ return !!sk->sk_prot->psock_update_sk_prot;
+}
+
+static bool sock_map_sk_state_allowed(const struct sock *sk)
+{
+ if (sk_is_tcp(sk))
+ return (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_LISTEN);
+ if (sk_is_stream_unix(sk))
+ return (1 << sk->sk_state) & TCPF_ESTABLISHED;
+ if (sk_is_vsock(sk) &&
+ (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET))
+ return (1 << sk->sk_state) & TCPF_ESTABLISHED;
+ return true;
+}
+
+static int sock_hash_update_common(struct bpf_map *map, void *key,
+ struct sock *sk, u64 flags);
+
+int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value,
+ u64 flags)
+{
+ struct socket *sock;
+ struct sock *sk;
+ int ret;
+ u64 ufd;
+
+ if (map->value_size == sizeof(u64))
+ ufd = *(u64 *)value;
+ else
+ ufd = *(u32 *)value;
+ if (ufd > S32_MAX)
+ return -EINVAL;
+
+ sock = sockfd_lookup(ufd, &ret);
+ if (!sock)
+ return ret;
+ sk = sock->sk;
+ if (!sk) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (!sock_map_sk_is_suitable(sk)) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ sock_map_sk_acquire(sk);
+ if (!sock_map_sk_state_allowed(sk))
+ ret = -EOPNOTSUPP;
+ else if (map->map_type == BPF_MAP_TYPE_SOCKMAP)
+ ret = sock_map_update_common(map, *(u32 *)key, sk, flags);
+ else
+ ret = sock_hash_update_common(map, key, sk, flags);
+ sock_map_sk_release(sk);
+out:
+ sockfd_put(sock);
+ return ret;
+}
+
+static long sock_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 flags)
+{
+ struct sock *sk = (struct sock *)value;
+ int ret;
+
+ if (unlikely(!sk || !sk_fullsock(sk)))
+ return -EINVAL;
+
+ if (!sock_map_sk_is_suitable(sk))
+ return -EOPNOTSUPP;
+
+ local_bh_disable();
+ bh_lock_sock(sk);
+ if (!sock_map_sk_state_allowed(sk))
+ ret = -EOPNOTSUPP;
+ else if (map->map_type == BPF_MAP_TYPE_SOCKMAP)
+ ret = sock_map_update_common(map, *(u32 *)key, sk, flags);
+ else
+ ret = sock_hash_update_common(map, key, sk, flags);
+ bh_unlock_sock(sk);
+ local_bh_enable();
+ return ret;
+}
+
+BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, sops,
+ struct bpf_map *, map, void *, key, u64, flags)
+{
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ if (likely(sock_map_sk_is_suitable(sops->sk) &&
+ sock_map_op_okay(sops)))
+ return sock_map_update_common(map, *(u32 *)key, sops->sk,
+ flags);
+ return -EOPNOTSUPP;
+}
+
+const struct bpf_func_proto bpf_sock_map_update_proto = {
+ .func = bpf_sock_map_update,
+ .gpl_only = false,
+ .pkt_access = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_PTR_TO_MAP_KEY,
+ .arg4_type = ARG_ANYTHING,
+};
+
+BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,
+ struct bpf_map *, map, u32, key, u64, flags)
+{
+ struct sock *sk;
+
+ if (unlikely(flags & ~(BPF_F_INGRESS)))
+ return SK_DROP;
+
+ sk = __sock_map_lookup_elem(map, key);
+ if (unlikely(!sk || !sock_map_redirect_allowed(sk)))
+ return SK_DROP;
+ if ((flags & BPF_F_INGRESS) && sk_is_vsock(sk))
+ return SK_DROP;
+
+ skb_bpf_set_redir(skb, sk, flags & BPF_F_INGRESS);
+ return SK_PASS;
+}
+
+const struct bpf_func_proto bpf_sk_redirect_map_proto = {
+ .func = bpf_sk_redirect_map,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_ANYTHING,
+};
+
+BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg *, msg,
+ struct bpf_map *, map, u32, key, u64, flags)
+{
+ struct sock *sk;
+
+ if (unlikely(flags & ~(BPF_F_INGRESS)))
+ return SK_DROP;
+
+ sk = __sock_map_lookup_elem(map, key);
+ if (unlikely(!sk || !sock_map_redirect_allowed(sk)))
+ return SK_DROP;
+ if (!(flags & BPF_F_INGRESS) && !sk_is_tcp(sk))
+ return SK_DROP;
+ if (sk_is_vsock(sk))
+ return SK_DROP;
+
+ msg->flags = flags;
+ msg->sk_redir = sk;
+ return SK_PASS;
+}
+
+const struct bpf_func_proto bpf_msg_redirect_map_proto = {
+ .func = bpf_msg_redirect_map,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_ANYTHING,
+};
+
+struct sock_map_seq_info {
+ struct bpf_map *map;
+ struct sock *sk;
+ u32 index;
+};
+
+struct bpf_iter__sockmap {
+ __bpf_md_ptr(struct bpf_iter_meta *, meta);
+ __bpf_md_ptr(struct bpf_map *, map);
+ __bpf_md_ptr(void *, key);
+ __bpf_md_ptr(struct sock *, sk);
+};
+
+DEFINE_BPF_ITER_FUNC(sockmap, struct bpf_iter_meta *meta,
+ struct bpf_map *map, void *key,
+ struct sock *sk)
+
+static void *sock_map_seq_lookup_elem(struct sock_map_seq_info *info)
+{
+ if (unlikely(info->index >= info->map->max_entries))
+ return NULL;
+
+ info->sk = __sock_map_lookup_elem(info->map, info->index);
+
+ /* can't return sk directly, since that might be NULL */
+ return info;
+}
+
+static void *sock_map_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(rcu)
+{
+ struct sock_map_seq_info *info = seq->private;
+
+ if (*pos == 0)
+ ++*pos;
+
+ /* pairs with sock_map_seq_stop */
+ rcu_read_lock();
+ return sock_map_seq_lookup_elem(info);
+}
+
+static void *sock_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+ __must_hold(rcu)
+{
+ struct sock_map_seq_info *info = seq->private;
+
+ ++*pos;
+ ++info->index;
+
+ return sock_map_seq_lookup_elem(info);
+}
+
+static int sock_map_seq_show(struct seq_file *seq, void *v)
+ __must_hold(rcu)
+{
+ struct sock_map_seq_info *info = seq->private;
+ struct bpf_iter__sockmap ctx = {};
+ struct bpf_iter_meta meta;
+ struct bpf_prog *prog;
+
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, !v);
+ if (!prog)
+ return 0;
+
+ ctx.meta = &meta;
+ ctx.map = info->map;
+ if (v) {
+ ctx.key = &info->index;
+ ctx.sk = info->sk;
+ }
+
+ return bpf_iter_run_prog(prog, &ctx);
+}
+
+static void sock_map_seq_stop(struct seq_file *seq, void *v)
+ __releases(rcu)
+{
+ if (!v)
+ (void)sock_map_seq_show(seq, NULL);
+
+ /* pairs with sock_map_seq_start */
+ rcu_read_unlock();
+}
+
+static const struct seq_operations sock_map_seq_ops = {
+ .start = sock_map_seq_start,
+ .next = sock_map_seq_next,
+ .stop = sock_map_seq_stop,
+ .show = sock_map_seq_show,
+};
+
+static int sock_map_init_seq_private(void *priv_data,
+ struct bpf_iter_aux_info *aux)
+{
+ struct sock_map_seq_info *info = priv_data;
+
+ bpf_map_inc_with_uref(aux->map);
+ info->map = aux->map;
+ return 0;
+}
+
+static void sock_map_fini_seq_private(void *priv_data)
+{
+ struct sock_map_seq_info *info = priv_data;
+
+ bpf_map_put_with_uref(info->map);
+}
+
+static u64 sock_map_mem_usage(const struct bpf_map *map)
+{
+ u64 usage = sizeof(struct bpf_stab);
+
+ usage += (u64)map->max_entries * sizeof(struct sock *);
+ return usage;
+}
+
+static const struct bpf_iter_seq_info sock_map_iter_seq_info = {
+ .seq_ops = &sock_map_seq_ops,
+ .init_seq_private = sock_map_init_seq_private,
+ .fini_seq_private = sock_map_fini_seq_private,
+ .seq_priv_size = sizeof(struct sock_map_seq_info),
+};
+
+BTF_ID_LIST_SINGLE(sock_map_btf_ids, struct, bpf_stab)
+const struct bpf_map_ops sock_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
+ .map_alloc = sock_map_alloc,
+ .map_free = sock_map_free,
+ .map_get_next_key = sock_map_get_next_key,
+ .map_lookup_elem_sys_only = sock_map_lookup_sys,
+ .map_update_elem = sock_map_update_elem,
+ .map_delete_elem = sock_map_delete_elem,
+ .map_lookup_elem = sock_map_lookup,
+ .map_release_uref = sock_map_release_progs,
+ .map_check_btf = map_check_no_btf,
+ .map_mem_usage = sock_map_mem_usage,
+ .map_btf_id = &sock_map_btf_ids[0],
+ .iter_seq_info = &sock_map_iter_seq_info,
+};
+
+struct bpf_shtab_elem {
+ struct rcu_head rcu;
+ u32 hash;
+ struct sock *sk;
+ struct hlist_node node;
+ u8 key[];
+};
+
+struct bpf_shtab_bucket {
+ struct hlist_head head;
+ spinlock_t lock;
+};
+
+struct bpf_shtab {
+ struct bpf_map map;
+ struct bpf_shtab_bucket *buckets;
+ u32 buckets_num;
+ u32 elem_size;
+ struct sk_psock_progs progs;
+ atomic_t count;
+};
+
+static inline u32 sock_hash_bucket_hash(const void *key, u32 len)
+{
+ return jhash(key, len, 0);
+}
+
+static struct bpf_shtab_bucket *sock_hash_select_bucket(struct bpf_shtab *htab,
+ u32 hash)
+{
+ return &htab->buckets[hash & (htab->buckets_num - 1)];
+}
+
+static struct bpf_shtab_elem *
+sock_hash_lookup_elem_raw(struct hlist_head *head, u32 hash, void *key,
+ u32 key_size)
+{
+ struct bpf_shtab_elem *elem;
+
+ hlist_for_each_entry_rcu(elem, head, node) {
+ if (elem->hash == hash &&
+ !memcmp(&elem->key, key, key_size))
+ return elem;
+ }
+
+ return NULL;
+}
+
+static struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map);
+ u32 key_size = map->key_size, hash;
+ struct bpf_shtab_bucket *bucket;
+ struct bpf_shtab_elem *elem;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ hash = sock_hash_bucket_hash(key, key_size);
+ bucket = sock_hash_select_bucket(htab, hash);
+ elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size);
+
+ return elem ? elem->sk : NULL;
+}
+
+static void sock_hash_free_elem(struct bpf_shtab *htab,
+ struct bpf_shtab_elem *elem)
+{
+ atomic_dec(&htab->count);
+ kfree_rcu(elem, rcu);
+}
+
+static void sock_hash_delete_from_link(struct bpf_map *map, struct sock *sk,
+ void *link_raw)
+{
+ struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map);
+ struct bpf_shtab_elem *elem_probe, *elem = link_raw;
+ struct bpf_shtab_bucket *bucket;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+ bucket = sock_hash_select_bucket(htab, elem->hash);
+
+ /* elem may be deleted in parallel from the map, but access here
+ * is okay since it's going away only after RCU grace period.
+ * However, we need to check whether it's still present.
+ */
+ spin_lock_bh(&bucket->lock);
+ elem_probe = sock_hash_lookup_elem_raw(&bucket->head, elem->hash,
+ elem->key, map->key_size);
+ if (elem_probe && elem_probe == elem) {
+ hlist_del_rcu(&elem->node);
+ sock_map_unref(elem->sk, elem);
+ sock_hash_free_elem(htab, elem);
+ }
+ spin_unlock_bh(&bucket->lock);
+}
+
+static long sock_hash_delete_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map);
+ u32 hash, key_size = map->key_size;
+ struct bpf_shtab_bucket *bucket;
+ struct bpf_shtab_elem *elem;
+ int ret = -ENOENT;
+
+ hash = sock_hash_bucket_hash(key, key_size);
+ bucket = sock_hash_select_bucket(htab, hash);
+
+ spin_lock_bh(&bucket->lock);
+ elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size);
+ if (elem) {
+ hlist_del_rcu(&elem->node);
+ sock_map_unref(elem->sk, elem);
+ sock_hash_free_elem(htab, elem);
+ ret = 0;
+ }
+ spin_unlock_bh(&bucket->lock);
+ return ret;
+}
+
+static struct bpf_shtab_elem *sock_hash_alloc_elem(struct bpf_shtab *htab,
+ void *key, u32 key_size,
+ u32 hash, struct sock *sk,
+ struct bpf_shtab_elem *old)
+{
+ struct bpf_shtab_elem *new;
+
+ if (atomic_inc_return(&htab->count) > htab->map.max_entries) {
+ if (!old) {
+ atomic_dec(&htab->count);
+ return ERR_PTR(-E2BIG);
+ }
+ }
+
+ new = bpf_map_kmalloc_node(&htab->map, htab->elem_size,
+ GFP_ATOMIC | __GFP_NOWARN,
+ htab->map.numa_node);
+ if (!new) {
+ atomic_dec(&htab->count);
+ return ERR_PTR(-ENOMEM);
+ }
+ memcpy(new->key, key, key_size);
+ new->sk = sk;
+ new->hash = hash;
+ return new;
+}
+
+static int sock_hash_update_common(struct bpf_map *map, void *key,
+ struct sock *sk, u64 flags)
+{
+ struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map);
+ u32 key_size = map->key_size, hash;
+ struct bpf_shtab_elem *elem, *elem_new;
+ struct bpf_shtab_bucket *bucket;
+ struct sk_psock_link *link;
+ struct sk_psock *psock;
+ int ret;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+ if (unlikely(flags > BPF_EXIST))
+ return -EINVAL;
+
+ link = sk_psock_init_link();
+ if (!link)
+ return -ENOMEM;
+
+ ret = sock_map_link(map, sk);
+ if (ret < 0)
+ goto out_free;
+
+ psock = sk_psock(sk);
+ WARN_ON_ONCE(!psock);
+
+ hash = sock_hash_bucket_hash(key, key_size);
+ bucket = sock_hash_select_bucket(htab, hash);
+
+ spin_lock_bh(&bucket->lock);
+ elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size);
+ if (elem && flags == BPF_NOEXIST) {
+ ret = -EEXIST;
+ goto out_unlock;
+ } else if (!elem && flags == BPF_EXIST) {
+ ret = -ENOENT;
+ goto out_unlock;
+ }
+
+ elem_new = sock_hash_alloc_elem(htab, key, key_size, hash, sk, elem);
+ if (IS_ERR(elem_new)) {
+ ret = PTR_ERR(elem_new);
+ goto out_unlock;
+ }
+
+ sock_map_add_link(psock, link, map, elem_new);
+ /* Add new element to the head of the list, so that
+ * concurrent search will find it before old elem.
+ */
+ hlist_add_head_rcu(&elem_new->node, &bucket->head);
+ if (elem) {
+ hlist_del_rcu(&elem->node);
+ sock_map_unref(elem->sk, elem);
+ sock_hash_free_elem(htab, elem);
+ }
+ spin_unlock_bh(&bucket->lock);
+ return 0;
+out_unlock:
+ spin_unlock_bh(&bucket->lock);
+ sk_psock_put(sk, psock);
+out_free:
+ sk_psock_free_link(link);
+ return ret;
+}
+
+static int sock_hash_get_next_key(struct bpf_map *map, void *key,
+ void *key_next)
+{
+ struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map);
+ struct bpf_shtab_elem *elem, *elem_next;
+ u32 hash, key_size = map->key_size;
+ struct hlist_head *head;
+ int i = 0;
+
+ if (!key)
+ goto find_first_elem;
+ hash = sock_hash_bucket_hash(key, key_size);
+ head = &sock_hash_select_bucket(htab, hash)->head;
+ elem = sock_hash_lookup_elem_raw(head, hash, key, key_size);
+ if (!elem)
+ goto find_first_elem;
+
+ elem_next = hlist_entry_safe(rcu_dereference(hlist_next_rcu(&elem->node)),
+ struct bpf_shtab_elem, node);
+ if (elem_next) {
+ memcpy(key_next, elem_next->key, key_size);
+ return 0;
+ }
+
+ i = hash & (htab->buckets_num - 1);
+ i++;
+find_first_elem:
+ for (; i < htab->buckets_num; i++) {
+ head = &sock_hash_select_bucket(htab, i)->head;
+ elem_next = hlist_entry_safe(rcu_dereference(hlist_first_rcu(head)),
+ struct bpf_shtab_elem, node);
+ if (elem_next) {
+ memcpy(key_next, elem_next->key, key_size);
+ return 0;
+ }
+ }
+
+ return -ENOENT;
+}
+
+static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)
+{
+ struct bpf_shtab *htab;
+ int i, err;
+
+ if (attr->max_entries == 0 ||
+ attr->key_size == 0 ||
+ (attr->value_size != sizeof(u32) &&
+ attr->value_size != sizeof(u64)) ||
+ attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
+ return ERR_PTR(-EINVAL);
+ if (attr->key_size > MAX_BPF_STACK)
+ return ERR_PTR(-E2BIG);
+
+ htab = bpf_map_area_alloc(sizeof(*htab), NUMA_NO_NODE);
+ if (!htab)
+ return ERR_PTR(-ENOMEM);
+
+ bpf_map_init_from_attr(&htab->map, attr);
+
+ htab->buckets_num = roundup_pow_of_two(htab->map.max_entries);
+ htab->elem_size = sizeof(struct bpf_shtab_elem) +
+ round_up(htab->map.key_size, 8);
+ if (htab->buckets_num == 0 ||
+ htab->buckets_num > U32_MAX / sizeof(struct bpf_shtab_bucket)) {
+ err = -EINVAL;
+ goto free_htab;
+ }
+
+ htab->buckets = bpf_map_area_alloc(htab->buckets_num *
+ sizeof(struct bpf_shtab_bucket),
+ htab->map.numa_node);
+ if (!htab->buckets) {
+ err = -ENOMEM;
+ goto free_htab;
+ }
+
+ for (i = 0; i < htab->buckets_num; i++) {
+ INIT_HLIST_HEAD(&htab->buckets[i].head);
+ spin_lock_init(&htab->buckets[i].lock);
+ }
+
+ return &htab->map;
+free_htab:
+ bpf_map_area_free(htab);
+ return ERR_PTR(err);
+}
+
+static void sock_hash_free(struct bpf_map *map)
+{
+ struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map);
+ struct bpf_shtab_bucket *bucket;
+ struct hlist_head unlink_list;
+ struct bpf_shtab_elem *elem;
+ struct hlist_node *node;
+ int i;
+
+ /* After the sync no updates or deletes will be in-flight so it
+ * is safe to walk map and remove entries without risking a race
+ * in EEXIST update case.
+ */
+ synchronize_rcu();
+ for (i = 0; i < htab->buckets_num; i++) {
+ bucket = sock_hash_select_bucket(htab, i);
+
+ /* We are racing with sock_hash_delete_from_link to
+ * enter the spin-lock critical section. Every socket on
+ * the list is still linked to sockhash. Since link
+ * exists, psock exists and holds a ref to socket. That
+ * lets us to grab a socket ref too.
+ */
+ spin_lock_bh(&bucket->lock);
+ hlist_for_each_entry(elem, &bucket->head, node)
+ sock_hold(elem->sk);
+ hlist_move_list(&bucket->head, &unlink_list);
+ spin_unlock_bh(&bucket->lock);
+
+ /* Process removed entries out of atomic context to
+ * block for socket lock before deleting the psock's
+ * link to sockhash.
+ */
+ hlist_for_each_entry_safe(elem, node, &unlink_list, node) {
+ hlist_del(&elem->node);
+ lock_sock(elem->sk);
+ rcu_read_lock();
+ sock_map_unref(elem->sk, elem);
+ rcu_read_unlock();
+ release_sock(elem->sk);
+ sock_put(elem->sk);
+ sock_hash_free_elem(htab, elem);
+ }
+ cond_resched();
+ }
+
+ /* wait for psock readers accessing its map link */
+ synchronize_rcu();
+
+ bpf_map_area_free(htab->buckets);
+ bpf_map_area_free(htab);
+}
+
+static void *sock_hash_lookup_sys(struct bpf_map *map, void *key)
+{
+ struct sock *sk;
+
+ if (map->value_size != sizeof(u64))
+ return ERR_PTR(-ENOSPC);
+
+ sk = __sock_hash_lookup_elem(map, key);
+ if (!sk)
+ return ERR_PTR(-ENOENT);
+
+ __sock_gen_cookie(sk);
+ return &sk->sk_cookie;
+}
+
+static void *sock_hash_lookup(struct bpf_map *map, void *key)
+{
+ struct sock *sk;
+
+ sk = __sock_hash_lookup_elem(map, key);
+ if (!sk)
+ return NULL;
+ if (sk_is_refcounted(sk) && !refcount_inc_not_zero(&sk->sk_refcnt))
+ return NULL;
+ return sk;
+}
+
+static void sock_hash_release_progs(struct bpf_map *map)
+{
+ psock_progs_drop(&container_of(map, struct bpf_shtab, map)->progs);
+}
+
+BPF_CALL_4(bpf_sock_hash_update, struct bpf_sock_ops_kern *, sops,
+ struct bpf_map *, map, void *, key, u64, flags)
+{
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ if (likely(sock_map_sk_is_suitable(sops->sk) &&
+ sock_map_op_okay(sops)))
+ return sock_hash_update_common(map, key, sops->sk, flags);
+ return -EOPNOTSUPP;
+}
+
+const struct bpf_func_proto bpf_sock_hash_update_proto = {
+ .func = bpf_sock_hash_update,
+ .gpl_only = false,
+ .pkt_access = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_PTR_TO_MAP_KEY,
+ .arg4_type = ARG_ANYTHING,
+};
+
+BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb,
+ struct bpf_map *, map, void *, key, u64, flags)
+{
+ struct sock *sk;
+
+ if (unlikely(flags & ~(BPF_F_INGRESS)))
+ return SK_DROP;
+
+ sk = __sock_hash_lookup_elem(map, key);
+ if (unlikely(!sk || !sock_map_redirect_allowed(sk)))
+ return SK_DROP;
+ if ((flags & BPF_F_INGRESS) && sk_is_vsock(sk))
+ return SK_DROP;
+
+ skb_bpf_set_redir(skb, sk, flags & BPF_F_INGRESS);
+ return SK_PASS;
+}
+
+const struct bpf_func_proto bpf_sk_redirect_hash_proto = {
+ .func = bpf_sk_redirect_hash,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_PTR_TO_MAP_KEY,
+ .arg4_type = ARG_ANYTHING,
+};
+
+BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg *, msg,
+ struct bpf_map *, map, void *, key, u64, flags)
+{
+ struct sock *sk;
+
+ if (unlikely(flags & ~(BPF_F_INGRESS)))
+ return SK_DROP;
+
+ sk = __sock_hash_lookup_elem(map, key);
+ if (unlikely(!sk || !sock_map_redirect_allowed(sk)))
+ return SK_DROP;
+ if (!(flags & BPF_F_INGRESS) && !sk_is_tcp(sk))
+ return SK_DROP;
+ if (sk_is_vsock(sk))
+ return SK_DROP;
+
+ msg->flags = flags;
+ msg->sk_redir = sk;
+ return SK_PASS;
+}
+
+const struct bpf_func_proto bpf_msg_redirect_hash_proto = {
+ .func = bpf_msg_redirect_hash,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_PTR_TO_MAP_KEY,
+ .arg4_type = ARG_ANYTHING,
+};
+
+struct sock_hash_seq_info {
+ struct bpf_map *map;
+ struct bpf_shtab *htab;
+ u32 bucket_id;
+};
+
+static void *sock_hash_seq_find_next(struct sock_hash_seq_info *info,
+ struct bpf_shtab_elem *prev_elem)
+{
+ const struct bpf_shtab *htab = info->htab;
+ struct bpf_shtab_bucket *bucket;
+ struct bpf_shtab_elem *elem;
+ struct hlist_node *node;
+
+ /* try to find next elem in the same bucket */
+ if (prev_elem) {
+ node = rcu_dereference(hlist_next_rcu(&prev_elem->node));
+ elem = hlist_entry_safe(node, struct bpf_shtab_elem, node);
+ if (elem)
+ return elem;
+
+ /* no more elements, continue in the next bucket */
+ info->bucket_id++;
+ }
+
+ for (; info->bucket_id < htab->buckets_num; info->bucket_id++) {
+ bucket = &htab->buckets[info->bucket_id];
+ node = rcu_dereference(hlist_first_rcu(&bucket->head));
+ elem = hlist_entry_safe(node, struct bpf_shtab_elem, node);
+ if (elem)
+ return elem;
+ }
+
+ return NULL;
+}
+
+static void *sock_hash_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(rcu)
+{
+ struct sock_hash_seq_info *info = seq->private;
+
+ if (*pos == 0)
+ ++*pos;
+
+ /* pairs with sock_hash_seq_stop */
+ rcu_read_lock();
+ return sock_hash_seq_find_next(info, NULL);
+}
+
+static void *sock_hash_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+ __must_hold(rcu)
+{
+ struct sock_hash_seq_info *info = seq->private;
+
+ ++*pos;
+ return sock_hash_seq_find_next(info, v);
+}
+
+static int sock_hash_seq_show(struct seq_file *seq, void *v)
+ __must_hold(rcu)
+{
+ struct sock_hash_seq_info *info = seq->private;
+ struct bpf_iter__sockmap ctx = {};
+ struct bpf_shtab_elem *elem = v;
+ struct bpf_iter_meta meta;
+ struct bpf_prog *prog;
+
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, !elem);
+ if (!prog)
+ return 0;
+
+ ctx.meta = &meta;
+ ctx.map = info->map;
+ if (elem) {
+ ctx.key = elem->key;
+ ctx.sk = elem->sk;
+ }
+
+ return bpf_iter_run_prog(prog, &ctx);
+}
+
+static void sock_hash_seq_stop(struct seq_file *seq, void *v)
+ __releases(rcu)
+{
+ if (!v)
+ (void)sock_hash_seq_show(seq, NULL);
+
+ /* pairs with sock_hash_seq_start */
+ rcu_read_unlock();
+}
+
+static const struct seq_operations sock_hash_seq_ops = {
+ .start = sock_hash_seq_start,
+ .next = sock_hash_seq_next,
+ .stop = sock_hash_seq_stop,
+ .show = sock_hash_seq_show,
+};
+
+static int sock_hash_init_seq_private(void *priv_data,
+ struct bpf_iter_aux_info *aux)
+{
+ struct sock_hash_seq_info *info = priv_data;
+
+ bpf_map_inc_with_uref(aux->map);
+ info->map = aux->map;
+ info->htab = container_of(aux->map, struct bpf_shtab, map);
+ return 0;
+}
+
+static void sock_hash_fini_seq_private(void *priv_data)
+{
+ struct sock_hash_seq_info *info = priv_data;
+
+ bpf_map_put_with_uref(info->map);
+}
+
+static u64 sock_hash_mem_usage(const struct bpf_map *map)
+{
+ struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map);
+ u64 usage = sizeof(*htab);
+
+ usage += htab->buckets_num * sizeof(struct bpf_shtab_bucket);
+ usage += atomic_read(&htab->count) * (u64)htab->elem_size;
+ return usage;
+}
+
+static const struct bpf_iter_seq_info sock_hash_iter_seq_info = {
+ .seq_ops = &sock_hash_seq_ops,
+ .init_seq_private = sock_hash_init_seq_private,
+ .fini_seq_private = sock_hash_fini_seq_private,
+ .seq_priv_size = sizeof(struct sock_hash_seq_info),
+};
+
+BTF_ID_LIST_SINGLE(sock_hash_map_btf_ids, struct, bpf_shtab)
+const struct bpf_map_ops sock_hash_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
+ .map_alloc = sock_hash_alloc,
+ .map_free = sock_hash_free,
+ .map_get_next_key = sock_hash_get_next_key,
+ .map_update_elem = sock_map_update_elem,
+ .map_delete_elem = sock_hash_delete_elem,
+ .map_lookup_elem = sock_hash_lookup,
+ .map_lookup_elem_sys_only = sock_hash_lookup_sys,
+ .map_release_uref = sock_hash_release_progs,
+ .map_check_btf = map_check_no_btf,
+ .map_mem_usage = sock_hash_mem_usage,
+ .map_btf_id = &sock_hash_map_btf_ids[0],
+ .iter_seq_info = &sock_hash_iter_seq_info,
+};
+
+static struct sk_psock_progs *sock_map_progs(struct bpf_map *map)
+{
+ switch (map->map_type) {
+ case BPF_MAP_TYPE_SOCKMAP:
+ return &container_of(map, struct bpf_stab, map)->progs;
+ case BPF_MAP_TYPE_SOCKHASH:
+ return &container_of(map, struct bpf_shtab, map)->progs;
+ default:
+ break;
+ }
+
+ return NULL;
+}
+
+static int sock_map_prog_link_lookup(struct bpf_map *map, struct bpf_prog ***pprog,
+ struct bpf_link ***plink, u32 which)
+{
+ struct sk_psock_progs *progs = sock_map_progs(map);
+ struct bpf_prog **cur_pprog;
+ struct bpf_link **cur_plink;
+
+ if (!progs)
+ return -EOPNOTSUPP;
+
+ switch (which) {
+ case BPF_SK_MSG_VERDICT:
+ cur_pprog = &progs->msg_parser;
+ cur_plink = &progs->msg_parser_link;
+ break;
+#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
+ case BPF_SK_SKB_STREAM_PARSER:
+ cur_pprog = &progs->stream_parser;
+ cur_plink = &progs->stream_parser_link;
+ break;
+#endif
+ case BPF_SK_SKB_STREAM_VERDICT:
+ if (progs->skb_verdict)
+ return -EBUSY;
+ cur_pprog = &progs->stream_verdict;
+ cur_plink = &progs->stream_verdict_link;
+ break;
+ case BPF_SK_SKB_VERDICT:
+ if (progs->stream_verdict)
+ return -EBUSY;
+ cur_pprog = &progs->skb_verdict;
+ cur_plink = &progs->skb_verdict_link;
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ *pprog = cur_pprog;
+ if (plink)
+ *plink = cur_plink;
+ return 0;
+}
+
+/* Handle the following four cases:
+ * prog_attach: prog != NULL, old == NULL, link == NULL
+ * prog_detach: prog == NULL, old != NULL, link == NULL
+ * link_attach: prog != NULL, old == NULL, link != NULL
+ * link_detach: prog == NULL, old != NULL, link != NULL
+ */
+static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
+ struct bpf_prog *old, struct bpf_link *link,
+ u32 which)
+{
+ struct bpf_prog **pprog;
+ struct bpf_link **plink;
+ int ret;
+
+ ret = sock_map_prog_link_lookup(map, &pprog, &plink, which);
+ if (ret)
+ return ret;
+
+ /* for prog_attach/prog_detach/link_attach, return error if a bpf_link
+ * exists for that prog.
+ */
+ if ((!link || prog) && *plink)
+ return -EBUSY;
+
+ if (old) {
+ ret = psock_replace_prog(pprog, prog, old);
+ if (!ret)
+ *plink = NULL;
+ } else {
+ psock_set_prog(pprog, prog);
+ if (link)
+ *plink = link;
+ }
+
+ return ret;
+}
+
+int sock_map_bpf_prog_query(const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
+ u32 prog_cnt = 0, flags = 0;
+ struct bpf_prog **pprog;
+ struct bpf_prog *prog;
+ struct bpf_map *map;
+ u32 id = 0;
+ int ret;
+
+ if (attr->query.query_flags)
+ return -EINVAL;
+
+ CLASS(fd, f)(attr->target_fd);
+ map = __bpf_map_get(f);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ rcu_read_lock();
+
+ ret = sock_map_prog_link_lookup(map, &pprog, NULL, attr->query.attach_type);
+ if (ret)
+ goto end;
+
+ prog = *pprog;
+ prog_cnt = !prog ? 0 : 1;
+
+ if (!attr->query.prog_cnt || !prog_ids || !prog_cnt)
+ goto end;
+
+ /* we do not hold the refcnt, the bpf prog may be released
+ * asynchronously and the id would be set to 0.
+ */
+ id = data_race(prog->aux->id);
+ if (id == 0)
+ prog_cnt = 0;
+
+end:
+ rcu_read_unlock();
+
+ if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)) ||
+ (id != 0 && copy_to_user(prog_ids, &id, sizeof(u32))) ||
+ copy_to_user(&uattr->query.prog_cnt, &prog_cnt, sizeof(prog_cnt)))
+ ret = -EFAULT;
+
+ return ret;
+}
+
+static void sock_map_unlink(struct sock *sk, struct sk_psock_link *link)
+{
+ switch (link->map->map_type) {
+ case BPF_MAP_TYPE_SOCKMAP:
+ return sock_map_delete_from_link(link->map, sk,
+ link->link_raw);
+ case BPF_MAP_TYPE_SOCKHASH:
+ return sock_hash_delete_from_link(link->map, sk,
+ link->link_raw);
+ default:
+ break;
+ }
+}
+
+static void sock_map_remove_links(struct sock *sk, struct sk_psock *psock)
+{
+ struct sk_psock_link *link;
+
+ while ((link = sk_psock_link_pop(psock))) {
+ sock_map_unlink(sk, link);
+ sk_psock_free_link(link);
+ }
+}
+
+void sock_map_unhash(struct sock *sk)
+{
+ void (*saved_unhash)(struct sock *sk);
+ struct sk_psock *psock;
+
+ rcu_read_lock();
+ psock = sk_psock(sk);
+ if (unlikely(!psock)) {
+ rcu_read_unlock();
+ saved_unhash = READ_ONCE(sk->sk_prot)->unhash;
+ } else {
+ saved_unhash = psock->saved_unhash;
+ sock_map_remove_links(sk, psock);
+ rcu_read_unlock();
+ }
+ if (WARN_ON_ONCE(saved_unhash == sock_map_unhash))
+ return;
+ if (saved_unhash)
+ saved_unhash(sk);
+}
+EXPORT_SYMBOL_GPL(sock_map_unhash);
+
+void sock_map_destroy(struct sock *sk)
+{
+ void (*saved_destroy)(struct sock *sk);
+ struct sk_psock *psock;
+
+ rcu_read_lock();
+ psock = sk_psock_get(sk);
+ if (unlikely(!psock)) {
+ rcu_read_unlock();
+ saved_destroy = READ_ONCE(sk->sk_prot)->destroy;
+ } else {
+ saved_destroy = psock->saved_destroy;
+ sock_map_remove_links(sk, psock);
+ rcu_read_unlock();
+ sk_psock_stop(psock);
+ sk_psock_put(sk, psock);
+ }
+ if (WARN_ON_ONCE(saved_destroy == sock_map_destroy))
+ return;
+ if (saved_destroy)
+ saved_destroy(sk);
+}
+EXPORT_SYMBOL_GPL(sock_map_destroy);
+
+void sock_map_close(struct sock *sk, long timeout)
+{
+ void (*saved_close)(struct sock *sk, long timeout);
+ struct sk_psock *psock;
+
+ lock_sock(sk);
+ rcu_read_lock();
+ psock = sk_psock(sk);
+ if (likely(psock)) {
+ saved_close = psock->saved_close;
+ sock_map_remove_links(sk, psock);
+ psock = sk_psock_get(sk);
+ if (unlikely(!psock))
+ goto no_psock;
+ rcu_read_unlock();
+ sk_psock_stop(psock);
+ release_sock(sk);
+ cancel_delayed_work_sync(&psock->work);
+ sk_psock_put(sk, psock);
+ } else {
+ saved_close = READ_ONCE(sk->sk_prot)->close;
+no_psock:
+ rcu_read_unlock();
+ release_sock(sk);
+ }
+
+ /* Make sure we do not recurse. This is a bug.
+ * Leak the socket instead of crashing on a stack overflow.
+ */
+ if (WARN_ON_ONCE(saved_close == sock_map_close))
+ return;
+ saved_close(sk, timeout);
+}
+EXPORT_SYMBOL_GPL(sock_map_close);
+
+struct sockmap_link {
+ struct bpf_link link;
+ struct bpf_map *map;
+};
+
+static void sock_map_link_release(struct bpf_link *link)
+{
+ struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link);
+
+ mutex_lock(&sockmap_mutex);
+ if (!sockmap_link->map)
+ goto out;
+
+ WARN_ON_ONCE(sock_map_prog_update(sockmap_link->map, NULL, link->prog, link,
+ link->attach_type));
+
+ bpf_map_put_with_uref(sockmap_link->map);
+ sockmap_link->map = NULL;
+out:
+ mutex_unlock(&sockmap_mutex);
+}
+
+static int sock_map_link_detach(struct bpf_link *link)
+{
+ sock_map_link_release(link);
+ return 0;
+}
+
+static void sock_map_link_dealloc(struct bpf_link *link)
+{
+ kfree(link);
+}
+
+/* Handle the following two cases:
+ * case 1: link != NULL, prog != NULL, old != NULL
+ * case 2: link != NULL, prog != NULL, old == NULL
+ */
+static int sock_map_link_update_prog(struct bpf_link *link,
+ struct bpf_prog *prog,
+ struct bpf_prog *old)
+{
+ const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link);
+ struct bpf_prog **pprog, *old_link_prog;
+ struct bpf_link **plink;
+ int ret = 0;
+
+ mutex_lock(&sockmap_mutex);
+
+ /* If old prog is not NULL, ensure old prog is the same as link->prog. */
+ if (old && link->prog != old) {
+ ret = -EPERM;
+ goto out;
+ }
+ /* Ensure link->prog has the same type/attach_type as the new prog. */
+ if (link->prog->type != prog->type ||
+ link->prog->expected_attach_type != prog->expected_attach_type) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (!sockmap_link->map) {
+ ret = -ENOLINK;
+ goto out;
+ }
+
+ ret = sock_map_prog_link_lookup(sockmap_link->map, &pprog, &plink,
+ link->attach_type);
+ if (ret)
+ goto out;
+
+ /* return error if the stored bpf_link does not match the incoming bpf_link. */
+ if (link != *plink) {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ if (old) {
+ ret = psock_replace_prog(pprog, prog, old);
+ if (ret)
+ goto out;
+ } else {
+ psock_set_prog(pprog, prog);
+ }
+
+ bpf_prog_inc(prog);
+ old_link_prog = xchg(&link->prog, prog);
+ bpf_prog_put(old_link_prog);
+
+out:
+ mutex_unlock(&sockmap_mutex);
+ return ret;
+}
+
+static u32 sock_map_link_get_map_id(const struct sockmap_link *sockmap_link)
+{
+ u32 map_id = 0;
+
+ mutex_lock(&sockmap_mutex);
+ if (sockmap_link->map)
+ map_id = sockmap_link->map->id;
+ mutex_unlock(&sockmap_mutex);
+ return map_id;
+}
+
+static int sock_map_link_fill_info(const struct bpf_link *link,
+ struct bpf_link_info *info)
+{
+ const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link);
+ u32 map_id = sock_map_link_get_map_id(sockmap_link);
+
+ info->sockmap.map_id = map_id;
+ info->sockmap.attach_type = link->attach_type;
+ return 0;
+}
+
+static void sock_map_link_show_fdinfo(const struct bpf_link *link,
+ struct seq_file *seq)
+{
+ const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link);
+ u32 map_id = sock_map_link_get_map_id(sockmap_link);
+
+ seq_printf(seq, "map_id:\t%u\n", map_id);
+ seq_printf(seq, "attach_type:\t%u\n", link->attach_type);
+}
+
+static const struct bpf_link_ops sock_map_link_ops = {
+ .release = sock_map_link_release,
+ .dealloc = sock_map_link_dealloc,
+ .detach = sock_map_link_detach,
+ .update_prog = sock_map_link_update_prog,
+ .fill_link_info = sock_map_link_fill_info,
+ .show_fdinfo = sock_map_link_show_fdinfo,
+};
+
+int sock_map_link_create(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ struct bpf_link_primer link_primer;
+ struct sockmap_link *sockmap_link;
+ enum bpf_attach_type attach_type;
+ struct bpf_map *map;
+ int ret;
+
+ if (attr->link_create.flags)
+ return -EINVAL;
+
+ map = bpf_map_get_with_uref(attr->link_create.target_fd);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+ if (map->map_type != BPF_MAP_TYPE_SOCKMAP && map->map_type != BPF_MAP_TYPE_SOCKHASH) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ sockmap_link = kzalloc(sizeof(*sockmap_link), GFP_USER);
+ if (!sockmap_link) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ attach_type = attr->link_create.attach_type;
+ bpf_link_init(&sockmap_link->link, BPF_LINK_TYPE_SOCKMAP, &sock_map_link_ops, prog,
+ attach_type);
+ sockmap_link->map = map;
+
+ ret = bpf_link_prime(&sockmap_link->link, &link_primer);
+ if (ret) {
+ kfree(sockmap_link);
+ goto out;
+ }
+
+ mutex_lock(&sockmap_mutex);
+ ret = sock_map_prog_update(map, prog, NULL, &sockmap_link->link, attach_type);
+ mutex_unlock(&sockmap_mutex);
+ if (ret) {
+ bpf_link_cleanup(&link_primer);
+ goto out;
+ }
+
+ /* Increase refcnt for the prog since when old prog is replaced with
+ * psock_replace_prog() and psock_set_prog() its refcnt will be decreased.
+ *
+ * Actually, we do not need to increase refcnt for the prog since bpf_link
+ * will hold a reference. But in order to have less complexity w.r.t.
+ * replacing/setting prog, let us increase the refcnt to make things simpler.
+ */
+ bpf_prog_inc(prog);
+
+ return bpf_link_settle(&link_primer);
+
+out:
+ bpf_map_put_with_uref(map);
+ return ret;
+}
+
+static int sock_map_iter_attach_target(struct bpf_prog *prog,
+ union bpf_iter_link_info *linfo,
+ struct bpf_iter_aux_info *aux)
+{
+ struct bpf_map *map;
+ int err = -EINVAL;
+
+ if (!linfo->map.map_fd)
+ return -EBADF;
+
+ map = bpf_map_get_with_uref(linfo->map.map_fd);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ if (map->map_type != BPF_MAP_TYPE_SOCKMAP &&
+ map->map_type != BPF_MAP_TYPE_SOCKHASH)
+ goto put_map;
+
+ if (prog->aux->max_rdonly_access > map->key_size) {
+ err = -EACCES;
+ goto put_map;
+ }
+
+ aux->map = map;
+ return 0;
+
+put_map:
+ bpf_map_put_with_uref(map);
+ return err;
+}
+
+static void sock_map_iter_detach_target(struct bpf_iter_aux_info *aux)
+{
+ bpf_map_put_with_uref(aux->map);
+}
+
+static struct bpf_iter_reg sock_map_iter_reg = {
+ .target = "sockmap",
+ .attach_target = sock_map_iter_attach_target,
+ .detach_target = sock_map_iter_detach_target,
+ .show_fdinfo = bpf_iter_map_show_fdinfo,
+ .fill_link_info = bpf_iter_map_fill_link_info,
+ .ctx_arg_info_size = 2,
+ .ctx_arg_info = {
+ { offsetof(struct bpf_iter__sockmap, key),
+ PTR_TO_BUF | PTR_MAYBE_NULL | MEM_RDONLY },
+ { offsetof(struct bpf_iter__sockmap, sk),
+ PTR_TO_BTF_ID_OR_NULL },
+ },
+};
+
+static int __init bpf_sockmap_iter_init(void)
+{
+ sock_map_iter_reg.ctx_arg_info[1].btf_id =
+ btf_sock_ids[BTF_SOCK_TYPE_SOCK];
+ return bpf_iter_reg_target(&sock_map_iter_reg);
+}
+late_initcall(bpf_sockmap_iter_init);
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
new file mode 100644
index 000000000000..4211710393a8
--- /dev/null
+++ b/net/core/sock_reuseport.c
@@ -0,0 +1,748 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * To speed up listener socket lookup, create an array to store all sockets
+ * listening on the same port. This allows a decision to be made after finding
+ * the first socket. An optional BPF program can also be configured for
+ * selecting the socket index from the array of available sockets.
+ */
+
+#include <net/ip.h>
+#include <net/sock_reuseport.h>
+#include <linux/bpf.h>
+#include <linux/idr.h>
+#include <linux/filter.h>
+#include <linux/rcupdate.h>
+
+#define INIT_SOCKS 128
+
+DEFINE_SPINLOCK(reuseport_lock);
+
+static DEFINE_IDA(reuseport_ida);
+static int reuseport_resurrect(struct sock *sk, struct sock_reuseport *old_reuse,
+ struct sock_reuseport *reuse, bool bind_inany);
+
+void reuseport_has_conns_set(struct sock *sk)
+{
+ struct sock_reuseport *reuse;
+
+ if (!rcu_access_pointer(sk->sk_reuseport_cb))
+ return;
+
+ spin_lock_bh(&reuseport_lock);
+ reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
+ lockdep_is_held(&reuseport_lock));
+ if (likely(reuse))
+ reuse->has_conns = 1;
+ spin_unlock_bh(&reuseport_lock);
+}
+EXPORT_SYMBOL(reuseport_has_conns_set);
+
+static void __reuseport_get_incoming_cpu(struct sock_reuseport *reuse)
+{
+ /* Paired with READ_ONCE() in reuseport_select_sock_by_hash(). */
+ WRITE_ONCE(reuse->incoming_cpu, reuse->incoming_cpu + 1);
+}
+
+static void __reuseport_put_incoming_cpu(struct sock_reuseport *reuse)
+{
+ /* Paired with READ_ONCE() in reuseport_select_sock_by_hash(). */
+ WRITE_ONCE(reuse->incoming_cpu, reuse->incoming_cpu - 1);
+}
+
+static void reuseport_get_incoming_cpu(struct sock *sk, struct sock_reuseport *reuse)
+{
+ if (sk->sk_incoming_cpu >= 0)
+ __reuseport_get_incoming_cpu(reuse);
+}
+
+static void reuseport_put_incoming_cpu(struct sock *sk, struct sock_reuseport *reuse)
+{
+ if (sk->sk_incoming_cpu >= 0)
+ __reuseport_put_incoming_cpu(reuse);
+}
+
+void reuseport_update_incoming_cpu(struct sock *sk, int val)
+{
+ struct sock_reuseport *reuse;
+ int old_sk_incoming_cpu;
+
+ if (unlikely(!rcu_access_pointer(sk->sk_reuseport_cb))) {
+ /* Paired with REAE_ONCE() in sk_incoming_cpu_update()
+ * and compute_score().
+ */
+ WRITE_ONCE(sk->sk_incoming_cpu, val);
+ return;
+ }
+
+ spin_lock_bh(&reuseport_lock);
+
+ /* This must be done under reuseport_lock to avoid a race with
+ * reuseport_grow(), which accesses sk->sk_incoming_cpu without
+ * lock_sock() when detaching a shutdown()ed sk.
+ *
+ * Paired with READ_ONCE() in reuseport_select_sock_by_hash().
+ */
+ old_sk_incoming_cpu = sk->sk_incoming_cpu;
+ WRITE_ONCE(sk->sk_incoming_cpu, val);
+
+ reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
+ lockdep_is_held(&reuseport_lock));
+
+ /* reuseport_grow() has detached a closed sk. */
+ if (!reuse)
+ goto out;
+
+ if (old_sk_incoming_cpu < 0 && val >= 0)
+ __reuseport_get_incoming_cpu(reuse);
+ else if (old_sk_incoming_cpu >= 0 && val < 0)
+ __reuseport_put_incoming_cpu(reuse);
+
+out:
+ spin_unlock_bh(&reuseport_lock);
+}
+
+static int reuseport_sock_index(struct sock *sk,
+ const struct sock_reuseport *reuse,
+ bool closed)
+{
+ int left, right;
+
+ if (!closed) {
+ left = 0;
+ right = reuse->num_socks;
+ } else {
+ left = reuse->max_socks - reuse->num_closed_socks;
+ right = reuse->max_socks;
+ }
+
+ for (; left < right; left++)
+ if (reuse->socks[left] == sk)
+ return left;
+ return -1;
+}
+
+static void __reuseport_add_sock(struct sock *sk,
+ struct sock_reuseport *reuse)
+{
+ reuse->socks[reuse->num_socks] = sk;
+ /* paired with smp_rmb() in reuseport_(select|migrate)_sock() */
+ smp_wmb();
+ reuse->num_socks++;
+ reuseport_get_incoming_cpu(sk, reuse);
+}
+
+static bool __reuseport_detach_sock(struct sock *sk,
+ struct sock_reuseport *reuse)
+{
+ int i = reuseport_sock_index(sk, reuse, false);
+
+ if (i == -1)
+ return false;
+
+ reuse->socks[i] = reuse->socks[reuse->num_socks - 1];
+ reuse->num_socks--;
+ reuseport_put_incoming_cpu(sk, reuse);
+
+ return true;
+}
+
+static void __reuseport_add_closed_sock(struct sock *sk,
+ struct sock_reuseport *reuse)
+{
+ reuse->socks[reuse->max_socks - reuse->num_closed_socks - 1] = sk;
+ /* paired with READ_ONCE() in inet_csk_bind_conflict() */
+ WRITE_ONCE(reuse->num_closed_socks, reuse->num_closed_socks + 1);
+ reuseport_get_incoming_cpu(sk, reuse);
+}
+
+static bool __reuseport_detach_closed_sock(struct sock *sk,
+ struct sock_reuseport *reuse)
+{
+ int i = reuseport_sock_index(sk, reuse, true);
+
+ if (i == -1)
+ return false;
+
+ reuse->socks[i] = reuse->socks[reuse->max_socks - reuse->num_closed_socks];
+ /* paired with READ_ONCE() in inet_csk_bind_conflict() */
+ WRITE_ONCE(reuse->num_closed_socks, reuse->num_closed_socks - 1);
+ reuseport_put_incoming_cpu(sk, reuse);
+
+ return true;
+}
+
+static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks)
+{
+ struct sock_reuseport *reuse;
+
+ reuse = kzalloc(struct_size(reuse, socks, max_socks), GFP_ATOMIC);
+ if (!reuse)
+ return NULL;
+
+ reuse->max_socks = max_socks;
+
+ RCU_INIT_POINTER(reuse->prog, NULL);
+ return reuse;
+}
+
+int reuseport_alloc(struct sock *sk, bool bind_inany)
+{
+ struct sock_reuseport *reuse;
+ int id, ret = 0;
+
+ /* bh lock used since this function call may precede hlist lock in
+ * soft irq of receive path or setsockopt from process context
+ */
+ spin_lock_bh(&reuseport_lock);
+
+ /* Allocation attempts can occur concurrently via the setsockopt path
+ * and the bind/hash path. Nothing to do when we lose the race.
+ */
+ reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
+ lockdep_is_held(&reuseport_lock));
+ if (reuse) {
+ if (reuse->num_closed_socks) {
+ /* sk was shutdown()ed before */
+ ret = reuseport_resurrect(sk, reuse, NULL, bind_inany);
+ goto out;
+ }
+
+ /* Only set reuse->bind_inany if the bind_inany is true.
+ * Otherwise, it will overwrite the reuse->bind_inany
+ * which was set by the bind/hash path.
+ */
+ if (bind_inany)
+ reuse->bind_inany = bind_inany;
+ goto out;
+ }
+
+ reuse = __reuseport_alloc(INIT_SOCKS);
+ if (!reuse) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ id = ida_alloc(&reuseport_ida, GFP_ATOMIC);
+ if (id < 0) {
+ kfree(reuse);
+ ret = id;
+ goto out;
+ }
+
+ reuse->reuseport_id = id;
+ reuse->bind_inany = bind_inany;
+ reuse->socks[0] = sk;
+ reuse->num_socks = 1;
+ reuseport_get_incoming_cpu(sk, reuse);
+ rcu_assign_pointer(sk->sk_reuseport_cb, reuse);
+
+out:
+ spin_unlock_bh(&reuseport_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL(reuseport_alloc);
+
+static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse)
+{
+ struct sock_reuseport *more_reuse;
+ u32 more_socks_size, i;
+
+ more_socks_size = reuse->max_socks * 2U;
+ if (more_socks_size > U16_MAX) {
+ if (reuse->num_closed_socks) {
+ /* Make room by removing a closed sk.
+ * The child has already been migrated.
+ * Only reqsk left at this point.
+ */
+ struct sock *sk;
+
+ sk = reuse->socks[reuse->max_socks - reuse->num_closed_socks];
+ RCU_INIT_POINTER(sk->sk_reuseport_cb, NULL);
+ __reuseport_detach_closed_sock(sk, reuse);
+
+ return reuse;
+ }
+
+ return NULL;
+ }
+
+ more_reuse = __reuseport_alloc(more_socks_size);
+ if (!more_reuse)
+ return NULL;
+
+ more_reuse->num_socks = reuse->num_socks;
+ more_reuse->num_closed_socks = reuse->num_closed_socks;
+ more_reuse->prog = reuse->prog;
+ more_reuse->reuseport_id = reuse->reuseport_id;
+ more_reuse->bind_inany = reuse->bind_inany;
+ more_reuse->has_conns = reuse->has_conns;
+ more_reuse->incoming_cpu = reuse->incoming_cpu;
+
+ memcpy(more_reuse->socks, reuse->socks,
+ reuse->num_socks * sizeof(struct sock *));
+ memcpy(more_reuse->socks +
+ (more_reuse->max_socks - more_reuse->num_closed_socks),
+ reuse->socks + (reuse->max_socks - reuse->num_closed_socks),
+ reuse->num_closed_socks * sizeof(struct sock *));
+ more_reuse->synq_overflow_ts = READ_ONCE(reuse->synq_overflow_ts);
+
+ for (i = 0; i < reuse->max_socks; ++i)
+ rcu_assign_pointer(reuse->socks[i]->sk_reuseport_cb,
+ more_reuse);
+
+ /* Note: we use kfree_rcu here instead of reuseport_free_rcu so
+ * that reuse and more_reuse can temporarily share a reference
+ * to prog.
+ */
+ kfree_rcu(reuse, rcu);
+ return more_reuse;
+}
+
+static void reuseport_free_rcu(struct rcu_head *head)
+{
+ struct sock_reuseport *reuse;
+
+ reuse = container_of(head, struct sock_reuseport, rcu);
+ sk_reuseport_prog_free(rcu_dereference_protected(reuse->prog, 1));
+ ida_free(&reuseport_ida, reuse->reuseport_id);
+ kfree(reuse);
+}
+
+/**
+ * reuseport_add_sock - Add a socket to the reuseport group of another.
+ * @sk: New socket to add to the group.
+ * @sk2: Socket belonging to the existing reuseport group.
+ * @bind_inany: Whether or not the group is bound to a local INANY address.
+ *
+ * May return ENOMEM and not add socket to group under memory pressure.
+ */
+int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany)
+{
+ struct sock_reuseport *old_reuse, *reuse;
+
+ if (!rcu_access_pointer(sk2->sk_reuseport_cb)) {
+ int err = reuseport_alloc(sk2, bind_inany);
+
+ if (err)
+ return err;
+ }
+
+ spin_lock_bh(&reuseport_lock);
+ reuse = rcu_dereference_protected(sk2->sk_reuseport_cb,
+ lockdep_is_held(&reuseport_lock));
+ old_reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
+ lockdep_is_held(&reuseport_lock));
+ if (old_reuse && old_reuse->num_closed_socks) {
+ /* sk was shutdown()ed before */
+ int err = reuseport_resurrect(sk, old_reuse, reuse, reuse->bind_inany);
+
+ spin_unlock_bh(&reuseport_lock);
+ return err;
+ }
+
+ if (old_reuse && old_reuse->num_socks != 1) {
+ spin_unlock_bh(&reuseport_lock);
+ return -EBUSY;
+ }
+
+ if (reuse->num_socks + reuse->num_closed_socks == reuse->max_socks) {
+ reuse = reuseport_grow(reuse);
+ if (!reuse) {
+ spin_unlock_bh(&reuseport_lock);
+ return -ENOMEM;
+ }
+ }
+
+ __reuseport_add_sock(sk, reuse);
+ rcu_assign_pointer(sk->sk_reuseport_cb, reuse);
+
+ spin_unlock_bh(&reuseport_lock);
+
+ if (old_reuse)
+ call_rcu(&old_reuse->rcu, reuseport_free_rcu);
+ return 0;
+}
+EXPORT_SYMBOL(reuseport_add_sock);
+
+static int reuseport_resurrect(struct sock *sk, struct sock_reuseport *old_reuse,
+ struct sock_reuseport *reuse, bool bind_inany)
+{
+ if (old_reuse == reuse) {
+ /* If sk was in the same reuseport group, just pop sk out of
+ * the closed section and push sk into the listening section.
+ */
+ __reuseport_detach_closed_sock(sk, old_reuse);
+ __reuseport_add_sock(sk, old_reuse);
+ return 0;
+ }
+
+ if (!reuse) {
+ /* In bind()/listen() path, we cannot carry over the eBPF prog
+ * for the shutdown()ed socket. In setsockopt() path, we should
+ * not change the eBPF prog of listening sockets by attaching a
+ * prog to the shutdown()ed socket. Thus, we will allocate a new
+ * reuseport group and detach sk from the old group.
+ */
+ int id;
+
+ reuse = __reuseport_alloc(INIT_SOCKS);
+ if (!reuse)
+ return -ENOMEM;
+
+ id = ida_alloc(&reuseport_ida, GFP_ATOMIC);
+ if (id < 0) {
+ kfree(reuse);
+ return id;
+ }
+
+ reuse->reuseport_id = id;
+ reuse->bind_inany = bind_inany;
+ } else {
+ /* Move sk from the old group to the new one if
+ * - all the other listeners in the old group were close()d or
+ * shutdown()ed, and then sk2 has listen()ed on the same port
+ * OR
+ * - sk listen()ed without bind() (or with autobind), was
+ * shutdown()ed, and then listen()s on another port which
+ * sk2 listen()s on.
+ */
+ if (reuse->num_socks + reuse->num_closed_socks == reuse->max_socks) {
+ reuse = reuseport_grow(reuse);
+ if (!reuse)
+ return -ENOMEM;
+ }
+ }
+
+ __reuseport_detach_closed_sock(sk, old_reuse);
+ __reuseport_add_sock(sk, reuse);
+ rcu_assign_pointer(sk->sk_reuseport_cb, reuse);
+
+ if (old_reuse->num_socks + old_reuse->num_closed_socks == 0)
+ call_rcu(&old_reuse->rcu, reuseport_free_rcu);
+
+ return 0;
+}
+
+void reuseport_detach_sock(struct sock *sk)
+{
+ struct sock_reuseport *reuse;
+
+ spin_lock_bh(&reuseport_lock);
+ reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
+ lockdep_is_held(&reuseport_lock));
+
+ /* reuseport_grow() has detached a closed sk */
+ if (!reuse)
+ goto out;
+
+ /* Notify the bpf side. The sk may be added to a sockarray
+ * map. If so, sockarray logic will remove it from the map.
+ *
+ * Other bpf map types that work with reuseport, like sockmap,
+ * don't need an explicit callback from here. They override sk
+ * unhash/close ops to remove the sk from the map before we
+ * get to this point.
+ */
+ bpf_sk_reuseport_detach(sk);
+
+ rcu_assign_pointer(sk->sk_reuseport_cb, NULL);
+
+ if (!__reuseport_detach_closed_sock(sk, reuse))
+ __reuseport_detach_sock(sk, reuse);
+
+ if (reuse->num_socks + reuse->num_closed_socks == 0)
+ call_rcu(&reuse->rcu, reuseport_free_rcu);
+
+out:
+ spin_unlock_bh(&reuseport_lock);
+}
+EXPORT_SYMBOL(reuseport_detach_sock);
+
+void reuseport_stop_listen_sock(struct sock *sk)
+{
+ if (sk->sk_protocol == IPPROTO_TCP) {
+ struct sock_reuseport *reuse;
+ struct bpf_prog *prog;
+
+ spin_lock_bh(&reuseport_lock);
+
+ reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
+ lockdep_is_held(&reuseport_lock));
+ prog = rcu_dereference_protected(reuse->prog,
+ lockdep_is_held(&reuseport_lock));
+
+ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_migrate_req) ||
+ (prog && prog->expected_attach_type == BPF_SK_REUSEPORT_SELECT_OR_MIGRATE)) {
+ /* Migration capable, move sk from the listening section
+ * to the closed section.
+ */
+ bpf_sk_reuseport_detach(sk);
+
+ __reuseport_detach_sock(sk, reuse);
+ __reuseport_add_closed_sock(sk, reuse);
+
+ spin_unlock_bh(&reuseport_lock);
+ return;
+ }
+
+ spin_unlock_bh(&reuseport_lock);
+ }
+
+ /* Not capable to do migration, detach immediately */
+ reuseport_detach_sock(sk);
+}
+EXPORT_SYMBOL(reuseport_stop_listen_sock);
+
+static struct sock *run_bpf_filter(struct sock_reuseport *reuse, u16 socks,
+ struct bpf_prog *prog, struct sk_buff *skb,
+ int hdr_len)
+{
+ struct sk_buff *nskb = NULL;
+ u32 index;
+
+ if (skb_shared(skb)) {
+ nskb = skb_clone(skb, GFP_ATOMIC);
+ if (!nskb)
+ return NULL;
+ skb = nskb;
+ }
+
+ /* temporarily advance data past protocol header */
+ if (!pskb_pull(skb, hdr_len)) {
+ kfree_skb(nskb);
+ return NULL;
+ }
+ index = bpf_prog_run_save_cb(prog, skb);
+ __skb_push(skb, hdr_len);
+
+ consume_skb(nskb);
+
+ if (index >= socks)
+ return NULL;
+
+ return reuse->socks[index];
+}
+
+static struct sock *reuseport_select_sock_by_hash(struct sock_reuseport *reuse,
+ u32 hash, u16 num_socks)
+{
+ struct sock *first_valid_sk = NULL;
+ int i, j;
+
+ i = j = reciprocal_scale(hash, num_socks);
+ do {
+ struct sock *sk = reuse->socks[i];
+
+ if (sk->sk_state != TCP_ESTABLISHED) {
+ /* Paired with WRITE_ONCE() in __reuseport_(get|put)_incoming_cpu(). */
+ if (!READ_ONCE(reuse->incoming_cpu))
+ return sk;
+
+ /* Paired with WRITE_ONCE() in reuseport_update_incoming_cpu(). */
+ if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id())
+ return sk;
+
+ if (!first_valid_sk)
+ first_valid_sk = sk;
+ }
+
+ i++;
+ if (i >= num_socks)
+ i = 0;
+ } while (i != j);
+
+ return first_valid_sk;
+}
+
+/**
+ * reuseport_select_sock - Select a socket from an SO_REUSEPORT group.
+ * @sk: First socket in the group.
+ * @hash: When no BPF filter is available, use this hash to select.
+ * @skb: skb to run through BPF filter.
+ * @hdr_len: BPF filter expects skb data pointer at payload data. If
+ * the skb does not yet point at the payload, this parameter represents
+ * how far the pointer needs to advance to reach the payload.
+ * Returns a socket that should receive the packet (or NULL on error).
+ */
+struct sock *reuseport_select_sock(struct sock *sk,
+ u32 hash,
+ struct sk_buff *skb,
+ int hdr_len)
+{
+ struct sock_reuseport *reuse;
+ struct bpf_prog *prog;
+ struct sock *sk2 = NULL;
+ u16 socks;
+
+ rcu_read_lock();
+ reuse = rcu_dereference(sk->sk_reuseport_cb);
+
+ /* if memory allocation failed or add call is not yet complete */
+ if (!reuse)
+ goto out;
+
+ prog = rcu_dereference(reuse->prog);
+ socks = READ_ONCE(reuse->num_socks);
+ if (likely(socks)) {
+ /* paired with smp_wmb() in __reuseport_add_sock() */
+ smp_rmb();
+
+ if (!prog || !skb)
+ goto select_by_hash;
+
+ if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT)
+ sk2 = bpf_run_sk_reuseport(reuse, sk, prog, skb, NULL, hash);
+ else
+ sk2 = run_bpf_filter(reuse, socks, prog, skb, hdr_len);
+
+select_by_hash:
+ /* no bpf or invalid bpf result: fall back to hash usage */
+ if (!sk2)
+ sk2 = reuseport_select_sock_by_hash(reuse, hash, socks);
+ }
+
+out:
+ rcu_read_unlock();
+ return sk2;
+}
+EXPORT_SYMBOL(reuseport_select_sock);
+
+/**
+ * reuseport_migrate_sock - Select a socket from an SO_REUSEPORT group.
+ * @sk: close()ed or shutdown()ed socket in the group.
+ * @migrating_sk: ESTABLISHED/SYN_RECV full socket in the accept queue or
+ * NEW_SYN_RECV request socket during 3WHS.
+ * @skb: skb to run through BPF filter.
+ * Returns a socket (with sk_refcnt +1) that should accept the child socket
+ * (or NULL on error).
+ */
+struct sock *reuseport_migrate_sock(struct sock *sk,
+ struct sock *migrating_sk,
+ struct sk_buff *skb)
+{
+ struct sock_reuseport *reuse;
+ struct sock *nsk = NULL;
+ bool allocated = false;
+ struct bpf_prog *prog;
+ u16 socks;
+ u32 hash;
+
+ rcu_read_lock();
+
+ reuse = rcu_dereference(sk->sk_reuseport_cb);
+ if (!reuse)
+ goto out;
+
+ socks = READ_ONCE(reuse->num_socks);
+ if (unlikely(!socks))
+ goto failure;
+
+ /* paired with smp_wmb() in __reuseport_add_sock() */
+ smp_rmb();
+
+ hash = migrating_sk->sk_hash;
+ prog = rcu_dereference(reuse->prog);
+ if (!prog || prog->expected_attach_type != BPF_SK_REUSEPORT_SELECT_OR_MIGRATE) {
+ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_migrate_req))
+ goto select_by_hash;
+ goto failure;
+ }
+
+ if (!skb) {
+ skb = alloc_skb(0, GFP_ATOMIC);
+ if (!skb)
+ goto failure;
+ allocated = true;
+ }
+
+ nsk = bpf_run_sk_reuseport(reuse, sk, prog, skb, migrating_sk, hash);
+
+ if (allocated)
+ kfree_skb(skb);
+
+select_by_hash:
+ if (!nsk)
+ nsk = reuseport_select_sock_by_hash(reuse, hash, socks);
+
+ if (IS_ERR_OR_NULL(nsk) || unlikely(!refcount_inc_not_zero(&nsk->sk_refcnt))) {
+ nsk = NULL;
+ goto failure;
+ }
+
+out:
+ rcu_read_unlock();
+ return nsk;
+
+failure:
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE);
+ goto out;
+}
+EXPORT_SYMBOL(reuseport_migrate_sock);
+
+int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog)
+{
+ struct sock_reuseport *reuse;
+ struct bpf_prog *old_prog;
+
+ if (sk_unhashed(sk)) {
+ int err;
+
+ if (!sk->sk_reuseport)
+ return -EINVAL;
+
+ err = reuseport_alloc(sk, false);
+ if (err)
+ return err;
+ } else if (!rcu_access_pointer(sk->sk_reuseport_cb)) {
+ /* The socket wasn't bound with SO_REUSEPORT */
+ return -EINVAL;
+ }
+
+ spin_lock_bh(&reuseport_lock);
+ reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
+ lockdep_is_held(&reuseport_lock));
+ old_prog = rcu_dereference_protected(reuse->prog,
+ lockdep_is_held(&reuseport_lock));
+ rcu_assign_pointer(reuse->prog, prog);
+ spin_unlock_bh(&reuseport_lock);
+
+ sk_reuseport_prog_free(old_prog);
+ return 0;
+}
+EXPORT_SYMBOL(reuseport_attach_prog);
+
+int reuseport_detach_prog(struct sock *sk)
+{
+ struct sock_reuseport *reuse;
+ struct bpf_prog *old_prog;
+
+ old_prog = NULL;
+ spin_lock_bh(&reuseport_lock);
+ reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
+ lockdep_is_held(&reuseport_lock));
+
+ /* reuse must be checked after acquiring the reuseport_lock
+ * because reuseport_grow() can detach a closed sk.
+ */
+ if (!reuse) {
+ spin_unlock_bh(&reuseport_lock);
+ return sk->sk_reuseport ? -ENOENT : -EINVAL;
+ }
+
+ if (sk_unhashed(sk) && reuse->num_closed_socks) {
+ spin_unlock_bh(&reuseport_lock);
+ return -ENOENT;
+ }
+
+ old_prog = rcu_replace_pointer(reuse->prog, old_prog,
+ lockdep_is_held(&reuseport_lock));
+ spin_unlock_bh(&reuseport_lock);
+
+ if (!old_prog)
+ return -ENOENT;
+
+ sk_reuseport_prog_free(old_prog);
+ return 0;
+}
+EXPORT_SYMBOL(reuseport_detach_prog);
diff --git a/net/core/stream.c b/net/core/stream.c
index d1d7decf70b0..7a37e7dd2c43 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* SUCS NET3:
*
@@ -9,10 +10,11 @@
*
* Authors: Arnaldo Carvalho de Melo <acme@conectiva.com.br>
* (from old tcp.c code)
- * Alan Cox <alan@redhat.com> (Borrowed comments 8-))
+ * Alan Cox <alan@lxorguk.ukuu.org.uk> (Borrowed comments 8-))
*/
#include <linux/module.h>
+#include <linux/sched/signal.h>
#include <linux/net.h>
#include <linux/signal.h>
#include <linux/tcp.h>
@@ -21,26 +23,33 @@
/**
* sk_stream_write_space - stream socket write_space callback.
- * @sk: socket
+ * @sk: pointer to the socket structure
*
- * FIXME: write proper description
+ * This function is invoked when there's space available in the socket's
+ * send buffer for writing. It first checks if the socket is writable,
+ * clears the SOCK_NOSPACE flag indicating that memory for writing
+ * is now available, wakes up any processes waiting for write operations
+ * and sends asynchronous notifications if needed.
*/
void sk_stream_write_space(struct sock *sk)
{
struct socket *sock = sk->sk_socket;
+ struct socket_wq *wq;
- if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) && sock) {
+ if (__sk_stream_is_writeable(sk, 1) && sock) {
clear_bit(SOCK_NOSPACE, &sock->flags);
- if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
- wake_up_interruptible(sk->sk_sleep);
- if (sock->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN))
- sock_wake_async(sock, 2, POLL_OUT);
+ rcu_read_lock();
+ wq = rcu_dereference(sk->sk_wq);
+ if (skwq_has_sleeper(wq))
+ wake_up_interruptible_poll(&wq->wait, EPOLLOUT |
+ EPOLLWRNORM | EPOLLWRBAND);
+ if (wq && wq->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN))
+ sock_wake_async(wq, SOCK_WAKE_SPACE, POLL_OUT);
+ rcu_read_unlock();
}
}
-EXPORT_SYMBOL(sk_stream_write_space);
-
/**
* sk_stream_wait_connect - Wait for a socket to get into the connected state
* @sk: sock to wait on
@@ -50,8 +59,8 @@ EXPORT_SYMBOL(sk_stream_write_space);
*/
int sk_stream_wait_connect(struct sock *sk, long *timeo_p)
{
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
struct task_struct *tsk = current;
- DEFINE_WAIT(wait);
int done;
do {
@@ -65,46 +74,44 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p)
if (signal_pending(tsk))
return sock_intr_errno(*timeo_p);
- prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+ add_wait_queue(sk_sleep(sk), &wait);
sk->sk_write_pending++;
done = sk_wait_event(sk, timeo_p,
- !sk->sk_err &&
- !((1 << sk->sk_state) &
- ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)));
- finish_wait(sk->sk_sleep, &wait);
+ !READ_ONCE(sk->sk_err) &&
+ !((1 << READ_ONCE(sk->sk_state)) &
+ ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)), &wait);
+ remove_wait_queue(sk_sleep(sk), &wait);
sk->sk_write_pending--;
} while (!done);
- return 0;
+ return done < 0 ? done : 0;
}
-
EXPORT_SYMBOL(sk_stream_wait_connect);
/**
* sk_stream_closing - Return 1 if we still have things to send in our buffers.
* @sk: socket to verify
*/
-static inline int sk_stream_closing(struct sock *sk)
+static int sk_stream_closing(const struct sock *sk)
{
- return (1 << sk->sk_state) &
+ return (1 << READ_ONCE(sk->sk_state)) &
(TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK);
}
void sk_stream_wait_close(struct sock *sk, long timeout)
{
if (timeout) {
- DEFINE_WAIT(wait);
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+
+ add_wait_queue(sk_sleep(sk), &wait);
do {
- prepare_to_wait(sk->sk_sleep, &wait,
- TASK_INTERRUPTIBLE);
- if (sk_wait_event(sk, &timeout, !sk_stream_closing(sk)))
+ if (sk_wait_event(sk, &timeout, !sk_stream_closing(sk), &wait))
break;
} while (!signal_pending(current) && timeout);
- finish_wait(sk->sk_sleep, &wait);
+ remove_wait_queue(sk_sleep(sk), &wait);
}
}
-
EXPORT_SYMBOL(sk_stream_wait_close);
/**
@@ -114,36 +121,38 @@ EXPORT_SYMBOL(sk_stream_wait_close);
*/
int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
{
- int err = 0;
+ int ret, err = 0;
long vm_wait = 0;
long current_timeo = *timeo_p;
- DEFINE_WAIT(wait);
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
if (sk_stream_memory_free(sk))
- current_timeo = vm_wait = (net_random() % (HZ / 5)) + 2;
+ current_timeo = vm_wait = get_random_u32_below(HZ / 5) + 2;
- while (1) {
- set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+ add_wait_queue(sk_sleep(sk), &wait);
- prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+ while (1) {
+ sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
goto do_error;
if (!*timeo_p)
- goto do_nonblock;
+ goto do_eagain;
if (signal_pending(current))
goto do_interrupted;
- clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+ sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
if (sk_stream_memory_free(sk) && !vm_wait)
break;
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
sk->sk_write_pending++;
- sk_wait_event(sk, &current_timeo, !sk->sk_err &&
- !(sk->sk_shutdown & SEND_SHUTDOWN) &&
- sk_stream_memory_free(sk) &&
- vm_wait);
+ ret = sk_wait_event(sk, &current_timeo, READ_ONCE(sk->sk_err) ||
+ (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) ||
+ (sk_stream_memory_free(sk) && !vm_wait),
+ &wait);
sk->sk_write_pending--;
+ if (ret < 0)
+ goto do_error;
if (vm_wait) {
vm_wait -= current_timeo;
@@ -156,33 +165,28 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
*timeo_p = current_timeo;
}
out:
- finish_wait(sk->sk_sleep, &wait);
+ if (!sock_flag(sk, SOCK_DEAD))
+ remove_wait_queue(sk_sleep(sk), &wait);
return err;
do_error:
err = -EPIPE;
goto out;
-do_nonblock:
+do_eagain:
+ /* Make sure that whenever EAGAIN is returned, EPOLLOUT event can
+ * be generated later.
+ * When TCP receives ACK packets that make room, tcp_check_space()
+ * only calls tcp_new_space() if SOCK_NOSPACE is set.
+ */
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
err = -EAGAIN;
goto out;
do_interrupted:
err = sock_intr_errno(*timeo_p);
goto out;
}
-
EXPORT_SYMBOL(sk_stream_wait_memory);
-void sk_stream_rfree(struct sk_buff *skb)
-{
- struct sock *sk = skb->sk;
-
- skb_truesize_check(skb);
- atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
- sk->sk_forward_alloc += skb->truesize;
-}
-
-EXPORT_SYMBOL(sk_stream_rfree);
-
int sk_stream_error(struct sock *sk, int flags, int err)
{
if (err == -EPIPE)
@@ -191,100 +195,30 @@ int sk_stream_error(struct sock *sk, int flags, int err)
send_sig(SIGPIPE, current, 0);
return err;
}
-
EXPORT_SYMBOL(sk_stream_error);
-void __sk_stream_mem_reclaim(struct sock *sk)
-{
- atomic_sub(sk->sk_forward_alloc / SK_STREAM_MEM_QUANTUM,
- sk->sk_prot->memory_allocated);
- sk->sk_forward_alloc &= SK_STREAM_MEM_QUANTUM - 1;
- if (*sk->sk_prot->memory_pressure &&
- (atomic_read(sk->sk_prot->memory_allocated) <
- sk->sk_prot->sysctl_mem[0]))
- *sk->sk_prot->memory_pressure = 0;
-}
-
-EXPORT_SYMBOL(__sk_stream_mem_reclaim);
-
-int sk_stream_mem_schedule(struct sock *sk, int size, int kind)
-{
- int amt = sk_stream_pages(size);
-
- sk->sk_forward_alloc += amt * SK_STREAM_MEM_QUANTUM;
- atomic_add(amt, sk->sk_prot->memory_allocated);
-
- /* Under limit. */
- if (atomic_read(sk->sk_prot->memory_allocated) < sk->sk_prot->sysctl_mem[0]) {
- if (*sk->sk_prot->memory_pressure)
- *sk->sk_prot->memory_pressure = 0;
- return 1;
- }
-
- /* Over hard limit. */
- if (atomic_read(sk->sk_prot->memory_allocated) > sk->sk_prot->sysctl_mem[2]) {
- sk->sk_prot->enter_memory_pressure();
- goto suppress_allocation;
- }
-
- /* Under pressure. */
- if (atomic_read(sk->sk_prot->memory_allocated) > sk->sk_prot->sysctl_mem[1])
- sk->sk_prot->enter_memory_pressure();
-
- if (kind) {
- if (atomic_read(&sk->sk_rmem_alloc) < sk->sk_prot->sysctl_rmem[0])
- return 1;
- } else if (sk->sk_wmem_queued < sk->sk_prot->sysctl_wmem[0])
- return 1;
-
- if (!*sk->sk_prot->memory_pressure ||
- sk->sk_prot->sysctl_mem[2] > atomic_read(sk->sk_prot->sockets_allocated) *
- sk_stream_pages(sk->sk_wmem_queued +
- atomic_read(&sk->sk_rmem_alloc) +
- sk->sk_forward_alloc))
- return 1;
-
-suppress_allocation:
-
- if (!kind) {
- sk_stream_moderate_sndbuf(sk);
-
- /* Fail only if socket is _under_ its sndbuf.
- * In this case we cannot block, so that we have to fail.
- */
- if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
- return 1;
- }
-
- /* Alas. Undo changes. */
- sk->sk_forward_alloc -= amt * SK_STREAM_MEM_QUANTUM;
- atomic_sub(amt, sk->sk_prot->memory_allocated);
- return 0;
-}
-
-EXPORT_SYMBOL(sk_stream_mem_schedule);
-
void sk_stream_kill_queues(struct sock *sk)
{
/* First the read buffer. */
__skb_queue_purge(&sk->sk_receive_queue);
- /* Next, the error queue. */
- __skb_queue_purge(&sk->sk_error_queue);
+ /* Next, the error queue.
+ * We need to use queue lock, because other threads might
+ * add packets to the queue without socket lock being held.
+ */
+ skb_queue_purge(&sk->sk_error_queue);
/* Next, the write queue. */
- BUG_TRAP(skb_queue_empty(&sk->sk_write_queue));
+ WARN_ON_ONCE(!skb_queue_empty(&sk->sk_write_queue));
/* Account for returned memory. */
- sk_stream_mem_reclaim(sk);
+ sk_mem_reclaim_final(sk);
- BUG_TRAP(!sk->sk_wmem_queued);
- BUG_TRAP(!sk->sk_forward_alloc);
+ WARN_ON_ONCE(sk->sk_wmem_queued);
/* It is _impossible_ for the backlog to contain anything
* when we get here. All user references to this socket
* have gone away, only the net layer knows can touch it.
*/
}
-
EXPORT_SYMBOL(sk_stream_kill_queues);
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 1e75b1585460..8d4decb2606f 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/* -*- linux-c -*-
* sysctl_net_core.c: sysctl interface to net core subsystem.
*
@@ -5,138 +6,799 @@
* Added /proc/sys/net/core directory entry (empty =) ). [MS]
*/
+#include <linux/filter.h>
#include <linux/mm.h>
#include <linux/sysctl.h>
#include <linux/module.h>
#include <linux/socket.h>
+#include <linux/netdevice.h>
+#include <linux/ratelimit.h>
+#include <linux/vmalloc.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/sched/isolation.h>
+
+#include <net/ip.h>
#include <net/sock.h>
+#include <net/net_ratelimit.h>
+#include <net/busy_poll.h>
+#include <net/pkt_sched.h>
+#include <net/hotdata.h>
+#include <net/proto_memory.h>
+#include <net/rps.h>
+
+#include "dev.h"
+#include "net-sysfs.h"
+
+static int int_3600 = 3600;
+static int min_sndbuf = SOCK_MIN_SNDBUF;
+static int min_rcvbuf = SOCK_MIN_RCVBUF;
+static int max_skb_frags = MAX_SKB_FRAGS;
+static int min_mem_pcpu_rsv = SK_MEMORY_PCPU_RESERVE;
+static int netdev_budget_usecs_min = 2 * USEC_PER_SEC / HZ;
+
+static int net_msg_warn; /* Unused, but still a sysctl */
+
+int sysctl_fb_tunnels_only_for_init_net __read_mostly = 0;
+EXPORT_SYMBOL(sysctl_fb_tunnels_only_for_init_net);
+
+/* 0 - Keep current behavior:
+ * IPv4: inherit all current settings from init_net
+ * IPv6: reset all settings to default
+ * 1 - Both inherit all current settings from init_net
+ * 2 - Both reset all settings to default
+ * 3 - Both inherit all settings from current netns
+ */
+int sysctl_devconf_inherit_init_net __read_mostly;
+EXPORT_SYMBOL(sysctl_devconf_inherit_init_net);
-#ifdef CONFIG_SYSCTL
+#if IS_ENABLED(CONFIG_NET_FLOW_LIMIT) || IS_ENABLED(CONFIG_RPS)
+static int dump_cpumask(void *buffer, size_t *lenp, loff_t *ppos,
+ struct cpumask *mask)
+{
+ char *kbuf;
+ int len;
-extern int netdev_max_backlog;
-extern int weight_p;
+ if (*ppos || !*lenp) {
+ *lenp = 0;
+ return 0;
+ }
-extern __u32 sysctl_wmem_max;
-extern __u32 sysctl_rmem_max;
+ /* CPUs are displayed as a hex bitmap + a comma between each groups of 8
+ * nibbles (except the last one which has a newline instead).
+ * Guesstimate the buffer size at the group granularity level.
+ */
+ len = min(DIV_ROUND_UP(nr_cpumask_bits, 32) * (8 + 1), *lenp);
+ kbuf = kmalloc(len, GFP_KERNEL);
+ if (!kbuf) {
+ *lenp = 0;
+ return -ENOMEM;
+ }
-extern int sysctl_core_destroy_delay;
+ len = scnprintf(kbuf, len, "%*pb", cpumask_pr_args(mask));
+ if (!len) {
+ *lenp = 0;
+ goto free_buf;
+ }
-#ifdef CONFIG_XFRM
-extern u32 sysctl_xfrm_aevent_etime;
-extern u32 sysctl_xfrm_aevent_rseqth;
+ /* scnprintf writes a trailing null char not counted in the returned
+ * length, override it with a newline.
+ */
+ kbuf[len++] = '\n';
+ memcpy(buffer, kbuf, len);
+ *lenp = len;
+ *ppos += len;
+
+free_buf:
+ kfree(kbuf);
+ return 0;
+}
#endif
-ctl_table core_table[] = {
-#ifdef CONFIG_NET
+#ifdef CONFIG_RPS
+
+DEFINE_MUTEX(rps_default_mask_mutex);
+
+static int rps_default_mask_sysctl(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct net *net = (struct net *)table->data;
+ struct cpumask *mask;
+ int err = 0;
+
+ mutex_lock(&rps_default_mask_mutex);
+ mask = net->core.rps_default_mask;
+ if (write) {
+ if (!mask) {
+ mask = kzalloc(cpumask_size(), GFP_KERNEL);
+ net->core.rps_default_mask = mask;
+ }
+ err = -ENOMEM;
+ if (!mask)
+ goto done;
+
+ err = cpumask_parse(buffer, mask);
+ if (err)
+ goto done;
+
+ err = rps_cpumask_housekeeping(mask);
+ if (err)
+ goto done;
+ } else {
+ err = dump_cpumask(buffer, lenp, ppos,
+ mask ?: cpu_none_mask);
+ }
+
+done:
+ mutex_unlock(&rps_default_mask_mutex);
+ return err;
+}
+
+static int rps_sock_flow_sysctl(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ unsigned int orig_size, size;
+ int ret, i;
+ struct ctl_table tmp = {
+ .data = &size,
+ .maxlen = sizeof(size),
+ .mode = table->mode
+ };
+ struct rps_sock_flow_table *orig_sock_table, *sock_table;
+ static DEFINE_MUTEX(sock_flow_mutex);
+
+ mutex_lock(&sock_flow_mutex);
+
+ orig_sock_table = rcu_dereference_protected(
+ net_hotdata.rps_sock_flow_table,
+ lockdep_is_held(&sock_flow_mutex));
+ size = orig_size = orig_sock_table ? orig_sock_table->mask + 1 : 0;
+
+ ret = proc_dointvec(&tmp, write, buffer, lenp, ppos);
+
+ if (write) {
+ if (size) {
+ if (size > 1<<29) {
+ /* Enforce limit to prevent overflow */
+ mutex_unlock(&sock_flow_mutex);
+ return -EINVAL;
+ }
+ size = roundup_pow_of_two(size);
+ if (size != orig_size) {
+ sock_table =
+ vmalloc(RPS_SOCK_FLOW_TABLE_SIZE(size));
+ if (!sock_table) {
+ mutex_unlock(&sock_flow_mutex);
+ return -ENOMEM;
+ }
+ net_hotdata.rps_cpu_mask =
+ roundup_pow_of_two(nr_cpu_ids) - 1;
+ sock_table->mask = size - 1;
+ } else
+ sock_table = orig_sock_table;
+
+ for (i = 0; i < size; i++)
+ sock_table->ents[i] = RPS_NO_CPU;
+ } else
+ sock_table = NULL;
+
+ if (sock_table != orig_sock_table) {
+ rcu_assign_pointer(net_hotdata.rps_sock_flow_table,
+ sock_table);
+ if (sock_table) {
+ static_branch_inc(&rps_needed);
+ static_branch_inc(&rfs_needed);
+ }
+ if (orig_sock_table) {
+ static_branch_dec(&rps_needed);
+ static_branch_dec(&rfs_needed);
+ kvfree_rcu(orig_sock_table, rcu);
+ }
+ }
+ }
+
+ mutex_unlock(&sock_flow_mutex);
+
+ return ret;
+}
+#endif /* CONFIG_RPS */
+
+#ifdef CONFIG_NET_FLOW_LIMIT
+static DEFINE_MUTEX(flow_limit_update_mutex);
+
+static int flow_limit_cpu_sysctl(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct sd_flow_limit *cur;
+ struct softnet_data *sd;
+ cpumask_var_t mask;
+ int i, len, ret = 0;
+
+ if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ if (write) {
+ ret = cpumask_parse(buffer, mask);
+ if (ret)
+ goto done;
+
+ mutex_lock(&flow_limit_update_mutex);
+ len = sizeof(*cur) + netdev_flow_limit_table_len;
+ for_each_possible_cpu(i) {
+ sd = &per_cpu(softnet_data, i);
+ cur = rcu_dereference_protected(sd->flow_limit,
+ lockdep_is_held(&flow_limit_update_mutex));
+ if (cur && !cpumask_test_cpu(i, mask)) {
+ RCU_INIT_POINTER(sd->flow_limit, NULL);
+ kfree_rcu(cur, rcu);
+ } else if (!cur && cpumask_test_cpu(i, mask)) {
+ cur = kzalloc_node(len, GFP_KERNEL,
+ cpu_to_node(i));
+ if (!cur) {
+ /* not unwinding previous changes */
+ ret = -ENOMEM;
+ goto write_unlock;
+ }
+ cur->log_buckets = ilog2(netdev_flow_limit_table_len);
+ rcu_assign_pointer(sd->flow_limit, cur);
+ }
+ }
+write_unlock:
+ mutex_unlock(&flow_limit_update_mutex);
+ } else {
+ cpumask_clear(mask);
+ rcu_read_lock();
+ for_each_possible_cpu(i) {
+ sd = &per_cpu(softnet_data, i);
+ if (rcu_dereference(sd->flow_limit))
+ cpumask_set_cpu(i, mask);
+ }
+ rcu_read_unlock();
+
+ ret = dump_cpumask(buffer, lenp, ppos, mask);
+ }
+
+done:
+ free_cpumask_var(mask);
+ return ret;
+}
+
+static int flow_limit_table_len_sysctl(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ unsigned int old, *ptr;
+ int ret;
+
+ mutex_lock(&flow_limit_update_mutex);
+
+ ptr = table->data;
+ old = *ptr;
+ ret = proc_dointvec(table, write, buffer, lenp, ppos);
+ if (!ret && write && !is_power_of_2(*ptr)) {
+ *ptr = old;
+ ret = -EINVAL;
+ }
+
+ mutex_unlock(&flow_limit_update_mutex);
+ return ret;
+}
+#endif /* CONFIG_NET_FLOW_LIMIT */
+
+#ifdef CONFIG_NET_SCHED
+static int set_default_qdisc(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ char id[IFNAMSIZ];
+ struct ctl_table tbl = {
+ .data = id,
+ .maxlen = IFNAMSIZ,
+ };
+ int ret;
+
+ qdisc_get_default(id, IFNAMSIZ);
+
+ ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
+ if (write && ret == 0)
+ ret = qdisc_set_default(id);
+ return ret;
+}
+#endif
+
+static int proc_do_dev_weight(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ static DEFINE_MUTEX(dev_weight_mutex);
+ int ret, weight;
+
+ mutex_lock(&dev_weight_mutex);
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ if (!ret && write) {
+ weight = READ_ONCE(weight_p);
+ WRITE_ONCE(net_hotdata.dev_rx_weight, weight * dev_weight_rx_bias);
+ WRITE_ONCE(net_hotdata.dev_tx_weight, weight * dev_weight_tx_bias);
+ }
+ mutex_unlock(&dev_weight_mutex);
+
+ return ret;
+}
+
+static int proc_do_rss_key(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table fake_table;
+ char buf[NETDEV_RSS_KEY_LEN * 3];
+
+ snprintf(buf, sizeof(buf), "%*phC", NETDEV_RSS_KEY_LEN, netdev_rss_key);
+ fake_table.data = buf;
+ fake_table.maxlen = sizeof(buf);
+ return proc_dostring(&fake_table, write, buffer, lenp, ppos);
+}
+
+#ifdef CONFIG_BPF_JIT
+static int proc_dointvec_minmax_bpf_enable(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int ret, jit_enable = *(int *)table->data;
+ int min = *(int *)table->extra1;
+ int max = *(int *)table->extra2;
+ struct ctl_table tmp = *table;
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ tmp.data = &jit_enable;
+ ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+ if (write && !ret) {
+ if (jit_enable < 2 ||
+ (jit_enable == 2 && bpf_dump_raw_ok(current_cred()))) {
+ *(int *)table->data = jit_enable;
+ if (jit_enable == 2)
+ pr_warn("bpf_jit_enable = 2 was set! NEVER use this in production, only for JIT debugging!\n");
+ } else {
+ ret = -EPERM;
+ }
+ }
+
+ if (write && ret && min == max)
+ pr_info_once("CONFIG_BPF_JIT_ALWAYS_ON is enabled, bpf_jit_enable is permanently set to 1.\n");
+
+ return ret;
+}
+
+# ifdef CONFIG_HAVE_EBPF_JIT
+static int
+proc_dointvec_minmax_bpf_restricted(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+}
+# endif /* CONFIG_HAVE_EBPF_JIT */
+
+static int
+proc_dolongvec_minmax_bpf_restricted(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
+}
+#endif
+
+static struct ctl_table net_core_table[] = {
{
- .ctl_name = NET_CORE_WMEM_MAX,
- .procname = "wmem_max",
- .data = &sysctl_wmem_max,
+ .procname = "mem_pcpu_rsv",
+ .data = &net_hotdata.sysctl_mem_pcpu_rsv,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_mem_pcpu_rsv,
},
{
- .ctl_name = NET_CORE_RMEM_MAX,
- .procname = "rmem_max",
- .data = &sysctl_rmem_max,
+ .procname = "dev_weight",
+ .data = &weight_p,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = proc_do_dev_weight,
+ .extra1 = SYSCTL_ONE,
},
{
- .ctl_name = NET_CORE_WMEM_DEFAULT,
- .procname = "wmem_default",
- .data = &sysctl_wmem_default,
+ .procname = "dev_weight_rx_bias",
+ .data = &dev_weight_rx_bias,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = proc_do_dev_weight,
+ .extra1 = SYSCTL_ONE,
},
{
- .ctl_name = NET_CORE_RMEM_DEFAULT,
- .procname = "rmem_default",
- .data = &sysctl_rmem_default,
+ .procname = "dev_weight_tx_bias",
+ .data = &dev_weight_tx_bias,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = proc_do_dev_weight,
+ .extra1 = SYSCTL_ONE,
},
{
- .ctl_name = NET_CORE_DEV_WEIGHT,
- .procname = "dev_weight",
- .data = &weight_p,
+ .procname = "netdev_max_backlog",
+ .data = &net_hotdata.max_backlog,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = proc_dointvec
},
{
- .ctl_name = NET_CORE_MAX_BACKLOG,
- .procname = "netdev_max_backlog",
- .data = &netdev_max_backlog,
+ .procname = "netdev_rss_key",
+ .data = &netdev_rss_key,
+ .maxlen = sizeof(int),
+ .mode = 0444,
+ .proc_handler = proc_do_rss_key,
+ },
+#ifdef CONFIG_BPF_JIT
+ {
+ .procname = "bpf_jit_enable",
+ .data = &bpf_jit_enable,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax_bpf_enable,
+# ifdef CONFIG_BPF_JIT_ALWAYS_ON
+ .extra1 = SYSCTL_ONE,
+ .extra2 = SYSCTL_ONE,
+# else
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_TWO,
+# endif
+ },
+# ifdef CONFIG_HAVE_EBPF_JIT
+ {
+ .procname = "bpf_jit_harden",
+ .data = &bpf_jit_harden,
+ .maxlen = sizeof(int),
+ .mode = 0600,
+ .proc_handler = proc_dointvec_minmax_bpf_restricted,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_TWO,
+ },
+ {
+ .procname = "bpf_jit_kallsyms",
+ .data = &bpf_jit_kallsyms,
+ .maxlen = sizeof(int),
+ .mode = 0600,
+ .proc_handler = proc_dointvec_minmax_bpf_restricted,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+# endif
+ {
+ .procname = "bpf_jit_limit",
+ .data = &bpf_jit_limit,
+ .maxlen = sizeof(long),
+ .mode = 0600,
+ .proc_handler = proc_dolongvec_minmax_bpf_restricted,
+ .extra1 = SYSCTL_LONG_ONE,
+ .extra2 = &bpf_jit_limit_max,
+ },
+#endif
+ {
+ .procname = "netdev_tstamp_prequeue",
+ .data = &net_hotdata.tstamp_prequeue,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = proc_dointvec
},
{
- .ctl_name = NET_CORE_MSG_COST,
.procname = "message_cost",
- .data = &net_msg_cost,
+ .data = &net_ratelimit_state.interval,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- .strategy = &sysctl_jiffies,
+ .proc_handler = proc_dointvec_jiffies,
},
{
- .ctl_name = NET_CORE_MSG_BURST,
.procname = "message_burst",
- .data = &net_msg_burst,
+ .data = &net_ratelimit_state.burst,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec,
+ .proc_handler = proc_dointvec,
},
+#ifdef CONFIG_RPS
{
- .ctl_name = NET_CORE_OPTMEM_MAX,
- .procname = "optmem_max",
- .data = &sysctl_optmem_max,
+ .procname = "rps_sock_flow_entries",
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = rps_sock_flow_sysctl
},
-#ifdef CONFIG_XFRM
+#endif
+#ifdef CONFIG_NET_FLOW_LIMIT
+ {
+ .procname = "flow_limit_cpu_bitmap",
+ .mode = 0644,
+ .proc_handler = flow_limit_cpu_sysctl
+ },
+ {
+ .procname = "flow_limit_table_len",
+ .data = &netdev_flow_limit_table_len,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = flow_limit_table_len_sysctl
+ },
+#endif /* CONFIG_NET_FLOW_LIMIT */
+#ifdef CONFIG_NET_RX_BUSY_POLL
{
- .ctl_name = NET_CORE_AEVENT_ETIME,
- .procname = "xfrm_aevent_etime",
- .data = &sysctl_xfrm_aevent_etime,
- .maxlen = sizeof(u32),
+ .procname = "busy_poll",
+ .data = &sysctl_net_busy_poll,
+ .maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
},
{
- .ctl_name = NET_CORE_AEVENT_RSEQTH,
- .procname = "xfrm_aevent_rseqth",
- .data = &sysctl_xfrm_aevent_rseqth,
- .maxlen = sizeof(u32),
+ .procname = "busy_read",
+ .data = &sysctl_net_busy_read,
+ .maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
},
-#endif /* CONFIG_XFRM */
-#endif /* CONFIG_NET */
+#endif
+#ifdef CONFIG_NET_SCHED
+ {
+ .procname = "default_qdisc",
+ .mode = 0644,
+ .maxlen = IFNAMSIZ,
+ .proc_handler = set_default_qdisc
+ },
+#endif
+ {
+ .procname = "netdev_budget",
+ .data = &net_hotdata.netdev_budget,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "warnings",
+ .data = &net_msg_warn,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "max_skb_frags",
+ .data = &net_hotdata.sysctl_max_skb_frags,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ONE,
+ .extra2 = &max_skb_frags,
+ },
+ {
+ .procname = "netdev_budget_usecs",
+ .data = &net_hotdata.netdev_budget_usecs,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &netdev_budget_usecs_min,
+ },
+ {
+ .procname = "fb_tunnels_only_for_init_net",
+ .data = &sysctl_fb_tunnels_only_for_init_net,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_TWO,
+ },
+ {
+ .procname = "devconf_inherit_init_net",
+ .data = &sysctl_devconf_inherit_init_net,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_THREE,
+ },
+ {
+ .procname = "high_order_alloc_disable",
+ .data = &net_high_order_alloc_disable_key.key,
+ .maxlen = sizeof(net_high_order_alloc_disable_key),
+ .mode = 0644,
+ .proc_handler = proc_do_static_key,
+ },
+ {
+ .procname = "gro_normal_batch",
+ .data = &net_hotdata.gro_normal_batch,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ONE,
+ },
+ {
+ .procname = "netdev_unregister_timeout_secs",
+ .data = &netdev_unregister_timeout_secs,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ONE,
+ .extra2 = &int_3600,
+ },
+ {
+ .procname = "skb_defer_max",
+ .data = &net_hotdata.sysctl_skb_defer_max,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ },
+};
+
+static struct ctl_table netns_core_table[] = {
+#if IS_ENABLED(CONFIG_RPS)
+ {
+ .procname = "rps_default_mask",
+ .data = &init_net,
+ .mode = 0644,
+ .proc_handler = rps_default_mask_sysctl
+ },
+#endif
{
- .ctl_name = NET_CORE_SOMAXCONN,
.procname = "somaxconn",
- .data = &sysctl_somaxconn,
+ .data = &init_net.core.sysctl_somaxconn,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .extra1 = SYSCTL_ZERO,
+ .proc_handler = proc_dointvec_minmax
},
{
- .ctl_name = NET_CORE_BUDGET,
- .procname = "netdev_budget",
- .data = &netdev_budget,
+ .procname = "optmem_max",
+ .data = &init_net.core.sysctl_optmem_max,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .extra1 = SYSCTL_ZERO,
+ .proc_handler = proc_dointvec_minmax
+ },
+ {
+ .procname = "txrehash",
+ .data = &init_net.core.sysctl_txrehash,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ .proc_handler = proc_dou8vec_minmax,
+ },
+ {
+ .procname = "txq_reselection_ms",
+ .data = &init_net.core.sysctl_txq_reselection,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_ms_jiffies,
+ },
+ {
+ .procname = "tstamp_allow_data",
+ .data = &init_net.core.sysctl_tstamp_allow_data,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE
+ },
+ {
+ .procname = "bypass_prot_mem",
+ .data = &init_net.core.sysctl_bypass_prot_mem,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE
+ },
+ /* sysctl_core_net_init() will set the values after this
+ * to readonly in network namespaces
+ */
+ {
+ .procname = "wmem_max",
+ .data = &sysctl_wmem_max,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_sndbuf,
+ },
+ {
+ .procname = "rmem_max",
+ .data = &sysctl_rmem_max,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_rcvbuf,
+ },
+ {
+ .procname = "wmem_default",
+ .data = &sysctl_wmem_default,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_sndbuf,
+ },
+ {
+ .procname = "rmem_default",
+ .data = &sysctl_rmem_default,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_rcvbuf,
},
- { .ctl_name = 0 }
};
+static int __init fb_tunnels_only_for_init_net_sysctl_setup(char *str)
+{
+ /* fallback tunnels for initns only */
+ if (!strncmp(str, "initns", 6))
+ sysctl_fb_tunnels_only_for_init_net = 1;
+ /* no fallback tunnels anywhere */
+ else if (!strncmp(str, "none", 4))
+ sysctl_fb_tunnels_only_for_init_net = 2;
+
+ return 1;
+}
+__setup("fb_tunnels=", fb_tunnels_only_for_init_net_sysctl_setup);
+
+static __net_init int sysctl_core_net_init(struct net *net)
+{
+ size_t table_size = ARRAY_SIZE(netns_core_table);
+ struct ctl_table *tbl;
+
+ tbl = netns_core_table;
+ if (!net_eq(net, &init_net)) {
+ int i;
+ tbl = kmemdup(tbl, sizeof(netns_core_table), GFP_KERNEL);
+ if (tbl == NULL)
+ goto err_dup;
+
+ for (i = 0; i < table_size; ++i) {
+ if (tbl[i].data == &sysctl_wmem_max)
+ break;
+
+ tbl[i].data += (char *)net - (char *)&init_net;
+ }
+ for (; i < table_size; ++i)
+ tbl[i].mode &= ~0222;
+ }
+
+ net->core.sysctl_hdr = register_net_sysctl_sz(net, "net/core", tbl, table_size);
+ if (net->core.sysctl_hdr == NULL)
+ goto err_reg;
+
+ return 0;
+
+err_reg:
+ if (tbl != netns_core_table)
+ kfree(tbl);
+err_dup:
+ return -ENOMEM;
+}
+
+static __net_exit void sysctl_core_net_exit(struct net *net)
+{
+ const struct ctl_table *tbl;
+
+ tbl = net->core.sysctl_hdr->ctl_table_arg;
+ unregister_net_sysctl_table(net->core.sysctl_hdr);
+ BUG_ON(tbl == netns_core_table);
+#if IS_ENABLED(CONFIG_RPS)
+ kfree(net->core.rps_default_mask);
#endif
+ kfree(tbl);
+}
+
+static __net_initdata struct pernet_operations sysctl_core_ops = {
+ .init = sysctl_core_net_init,
+ .exit = sysctl_core_net_exit,
+};
+
+static __init int sysctl_core_init(void)
+{
+ register_net_sysctl(&init_net, "net/core", net_core_table);
+ return register_pernet_subsys(&sysctl_core_ops);
+}
+
+fs_initcall(sysctl_core_init);
diff --git a/net/core/timestamping.c b/net/core/timestamping.c
new file mode 100644
index 000000000000..a50a7ef49ae8
--- /dev/null
+++ b/net/core/timestamping.c
@@ -0,0 +1,114 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * PTP 1588 clock support - support for timestamping in PHY devices
+ *
+ * Copyright (C) 2010 OMICRON electronics GmbH
+ */
+#include <linux/errqueue.h>
+#include <linux/phy.h>
+#include <linux/ptp_classify.h>
+#include <linux/skbuff.h>
+#include <linux/export.h>
+#include <linux/ptp_clock_kernel.h>
+
+static unsigned int classify(const struct sk_buff *skb)
+{
+ if (likely(skb->dev && skb->dev->phydev &&
+ skb->dev->phydev->mii_ts))
+ return ptp_classify_raw(skb);
+ else
+ return PTP_CLASS_NONE;
+}
+
+void skb_clone_tx_timestamp(struct sk_buff *skb)
+{
+ struct hwtstamp_provider *hwprov;
+ struct mii_timestamper *mii_ts;
+ struct phy_device *phydev;
+ struct sk_buff *clone;
+ unsigned int type;
+
+ if (!skb->sk || !skb->dev)
+ return;
+
+ rcu_read_lock();
+ hwprov = rcu_dereference(skb->dev->hwprov);
+ if (hwprov) {
+ if (hwprov->source != HWTSTAMP_SOURCE_PHYLIB ||
+ !hwprov->phydev) {
+ rcu_read_unlock();
+ return;
+ }
+
+ phydev = hwprov->phydev;
+ } else {
+ phydev = skb->dev->phydev;
+ if (!phy_is_default_hwtstamp(phydev)) {
+ rcu_read_unlock();
+ return;
+ }
+ }
+ rcu_read_unlock();
+
+ type = classify(skb);
+ if (type == PTP_CLASS_NONE)
+ return;
+
+ mii_ts = phydev->mii_ts;
+ if (likely(mii_ts->txtstamp)) {
+ clone = skb_clone_sk(skb);
+ if (!clone)
+ return;
+ mii_ts->txtstamp(mii_ts, clone, type);
+ }
+}
+EXPORT_SYMBOL_GPL(skb_clone_tx_timestamp);
+
+bool skb_defer_rx_timestamp(struct sk_buff *skb)
+{
+ struct hwtstamp_provider *hwprov;
+ struct mii_timestamper *mii_ts;
+ struct phy_device *phydev;
+ unsigned int type;
+
+ if (!skb->dev)
+ return false;
+
+ rcu_read_lock();
+ hwprov = rcu_dereference(skb->dev->hwprov);
+ if (hwprov) {
+ if (hwprov->source != HWTSTAMP_SOURCE_PHYLIB ||
+ !hwprov->phydev) {
+ rcu_read_unlock();
+ return false;
+ }
+
+ phydev = hwprov->phydev;
+ } else {
+ phydev = skb->dev->phydev;
+ if (!phy_is_default_hwtstamp(phydev)) {
+ rcu_read_unlock();
+ return false;
+ }
+ }
+ rcu_read_unlock();
+
+ if (skb_headroom(skb) < ETH_HLEN)
+ return false;
+
+ __skb_push(skb, ETH_HLEN);
+
+ type = ptp_classify_raw(skb);
+
+ __skb_pull(skb, ETH_HLEN);
+
+ if (type == PTP_CLASS_NONE)
+ return false;
+
+ mii_ts = phydev->mii_ts;
+ if (likely(mii_ts->rxtstamp))
+ return mii_ts->rxtstamp(mii_ts, skb, type);
+
+ return false;
+}
+EXPORT_SYMBOL_GPL(skb_defer_rx_timestamp);
diff --git a/net/core/tso.c b/net/core/tso.c
new file mode 100644
index 000000000000..6df997b9076e
--- /dev/null
+++ b/net/core/tso.c
@@ -0,0 +1,89 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/export.h>
+#include <linux/if_vlan.h>
+#include <net/ip.h>
+#include <net/tso.h>
+#include <linux/unaligned.h>
+
+void tso_build_hdr(const struct sk_buff *skb, char *hdr, struct tso_t *tso,
+ int size, bool is_last)
+{
+ int hdr_len = skb_transport_offset(skb) + tso->tlen;
+ int mac_hdr_len = skb_network_offset(skb);
+
+ memcpy(hdr, skb->data, hdr_len);
+ if (!tso->ipv6) {
+ struct iphdr *iph = (void *)(hdr + mac_hdr_len);
+
+ iph->id = htons(tso->ip_id);
+ iph->tot_len = htons(size + hdr_len - mac_hdr_len);
+ tso->ip_id++;
+ } else {
+ struct ipv6hdr *iph = (void *)(hdr + mac_hdr_len);
+
+ iph->payload_len = htons(size + tso->tlen);
+ }
+ hdr += skb_transport_offset(skb);
+ if (tso->tlen != sizeof(struct udphdr)) {
+ struct tcphdr *tcph = (struct tcphdr *)hdr;
+
+ put_unaligned_be32(tso->tcp_seq, &tcph->seq);
+
+ if (!is_last) {
+ /* Clear all special flags for not last packet */
+ tcph->psh = 0;
+ tcph->fin = 0;
+ tcph->rst = 0;
+ }
+ } else {
+ struct udphdr *uh = (struct udphdr *)hdr;
+
+ uh->len = htons(sizeof(*uh) + size);
+ }
+}
+EXPORT_SYMBOL(tso_build_hdr);
+
+void tso_build_data(const struct sk_buff *skb, struct tso_t *tso, int size)
+{
+ tso->tcp_seq += size; /* not worth avoiding this operation for UDP */
+ tso->size -= size;
+ tso->data += size;
+
+ if ((tso->size == 0) &&
+ (tso->next_frag_idx < skb_shinfo(skb)->nr_frags)) {
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[tso->next_frag_idx];
+
+ /* Move to next segment */
+ tso->size = skb_frag_size(frag);
+ tso->data = skb_frag_address(frag);
+ tso->next_frag_idx++;
+ }
+}
+EXPORT_SYMBOL(tso_build_data);
+
+int tso_start(struct sk_buff *skb, struct tso_t *tso)
+{
+ int tlen = skb_is_gso_tcp(skb) ? tcp_hdrlen(skb) : sizeof(struct udphdr);
+ int hdr_len = skb_transport_offset(skb) + tlen;
+
+ tso->tlen = tlen;
+ tso->ip_id = ntohs(ip_hdr(skb)->id);
+ tso->tcp_seq = (tlen != sizeof(struct udphdr)) ? ntohl(tcp_hdr(skb)->seq) : 0;
+ tso->next_frag_idx = 0;
+ tso->ipv6 = vlan_get_protocol(skb) == htons(ETH_P_IPV6);
+
+ /* Build first data */
+ tso->size = skb_headlen(skb) - hdr_len;
+ tso->data = skb->data + hdr_len;
+ if ((tso->size == 0) &&
+ (tso->next_frag_idx < skb_shinfo(skb)->nr_frags)) {
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[tso->next_frag_idx];
+
+ /* Move to next segment */
+ tso->size = skb_frag_size(frag);
+ tso->data = skb_frag_address(frag);
+ tso->next_frag_idx++;
+ }
+ return hdr_len;
+}
+EXPORT_SYMBOL(tso_start);
diff --git a/net/core/user_dma.c b/net/core/user_dma.c
deleted file mode 100644
index 248a6b666aff..000000000000
--- a/net/core/user_dma.c
+++ /dev/null
@@ -1,132 +0,0 @@
-/*
- * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved.
- * Portions based on net/core/datagram.c and copyrighted by their authors.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59
- * Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- *
- * The full GNU General Public License is included in this distribution in the
- * file called COPYING.
- */
-
-/*
- * This code allows the net stack to make use of a DMA engine for
- * skb to iovec copies.
- */
-
-#include <linux/dmaengine.h>
-#include <linux/socket.h>
-#include <linux/rtnetlink.h> /* for BUG_TRAP */
-#include <net/tcp.h>
-#include <net/netdma.h>
-
-#define NET_DMA_DEFAULT_COPYBREAK 4096
-
-int sysctl_tcp_dma_copybreak = NET_DMA_DEFAULT_COPYBREAK;
-
-/**
- * dma_skb_copy_datagram_iovec - Copy a datagram to an iovec.
- * @skb - buffer to copy
- * @offset - offset in the buffer to start copying from
- * @iovec - io vector to copy to
- * @len - amount of data to copy from buffer to iovec
- * @pinned_list - locked iovec buffer data
- *
- * Note: the iovec is modified during the copy.
- */
-int dma_skb_copy_datagram_iovec(struct dma_chan *chan,
- struct sk_buff *skb, int offset, struct iovec *to,
- size_t len, struct dma_pinned_list *pinned_list)
-{
- int start = skb_headlen(skb);
- int i, copy = start - offset;
- dma_cookie_t cookie = 0;
-
- /* Copy header. */
- if (copy > 0) {
- if (copy > len)
- copy = len;
- cookie = dma_memcpy_to_iovec(chan, to, pinned_list,
- skb->data + offset, copy);
- if (cookie < 0)
- goto fault;
- len -= copy;
- if (len == 0)
- goto end;
- offset += copy;
- }
-
- /* Copy paged appendix. Hmm... why does this look so complicated? */
- for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
- int end;
-
- BUG_TRAP(start <= offset + len);
-
- end = start + skb_shinfo(skb)->frags[i].size;
- copy = end - offset;
- if ((copy = end - offset) > 0) {
- skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
- struct page *page = frag->page;
-
- if (copy > len)
- copy = len;
-
- cookie = dma_memcpy_pg_to_iovec(chan, to, pinned_list, page,
- frag->page_offset + offset - start, copy);
- if (cookie < 0)
- goto fault;
- len -= copy;
- if (len == 0)
- goto end;
- offset += copy;
- }
- start = end;
- }
-
- if (skb_shinfo(skb)->frag_list) {
- struct sk_buff *list = skb_shinfo(skb)->frag_list;
-
- for (; list; list = list->next) {
- int end;
-
- BUG_TRAP(start <= offset + len);
-
- end = start + list->len;
- copy = end - offset;
- if (copy > 0) {
- if (copy > len)
- copy = len;
- cookie = dma_skb_copy_datagram_iovec(chan, list,
- offset - start, to, copy,
- pinned_list);
- if (cookie < 0)
- goto fault;
- len -= copy;
- if (len == 0)
- goto end;
- offset += copy;
- }
- start = end;
- }
- }
-
-end:
- if (!len) {
- skb->dma_cookie = cookie;
- return cookie;
- }
-
-fault:
- return -EFAULT;
-}
diff --git a/net/core/utils.c b/net/core/utils.c
index d93fe64f6693..5e63b0ea21f3 100644
--- a/net/core/utils.c
+++ b/net/core/utils.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * Generic address resultion entity
+ * Generic address resolution entity
*
* Authors:
* net_random Alan Cox
@@ -7,38 +8,36 @@
* in{4,6}_pton YOSHIFUJI Hideaki, Copyright (C)2006 USAGI/WIDE Project
*
* Created by Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/module.h>
#include <linux/jiffies.h>
#include <linux/kernel.h>
+#include <linux/ctype.h>
#include <linux/inet.h>
#include <linux/mm.h>
#include <linux/net.h>
#include <linux/string.h>
#include <linux/types.h>
-#include <linux/random.h>
#include <linux/percpu.h>
#include <linux/init.h>
+#include <linux/ratelimit.h>
+#include <linux/socket.h>
-#include <asm/byteorder.h>
-#include <asm/system.h>
-#include <asm/uaccess.h>
+#include <net/sock.h>
+#include <net/net_ratelimit.h>
+#include <net/ipv6.h>
-int net_msg_cost = 5*HZ;
-int net_msg_burst = 10;
+#include <asm/byteorder.h>
+#include <linux/uaccess.h>
-/*
+DEFINE_RATELIMIT_STATE(net_ratelimit_state, 5 * HZ, 10);
+/*
* All net warning printk()s should be guarded by this function.
- */
+ */
int net_ratelimit(void)
{
- return __printk_ratelimit(net_msg_cost, net_msg_burst);
+ return __ratelimit(&net_ratelimit_state);
}
EXPORT_SYMBOL(net_ratelimit);
@@ -50,19 +49,16 @@ EXPORT_SYMBOL(net_ratelimit);
__be32 in_aton(const char *str)
{
- unsigned long l;
+ unsigned int l;
unsigned int val;
int i;
l = 0;
- for (i = 0; i < 4; i++)
- {
+ for (i = 0; i < 4; i++) {
l <<= 8;
- if (*str != '\0')
- {
+ if (*str != '\0') {
val = 0;
- while (*str != '\0' && *str != '.' && *str != '\n')
- {
+ while (*str != '\0' && *str != '.' && *str != '\n') {
val *= 10;
val += *str - '0';
str++;
@@ -72,9 +68,8 @@ __be32 in_aton(const char *str)
str++;
}
}
- return(htonl(l));
+ return htonl(l);
}
-
EXPORT_SYMBOL(in_aton);
#define IN6PTON_XDIGIT 0x00010000
@@ -88,37 +83,41 @@ EXPORT_SYMBOL(in_aton);
#define IN6PTON_NULL 0x20000000 /* first/tail */
#define IN6PTON_UNKNOWN 0x40000000
-static inline int digit2bin(char c, char delim)
+static inline int xdigit2bin(char c, int delim)
{
- if (c == delim || c == '\0')
- return IN6PTON_DELIM;
- if (c == '.')
- return IN6PTON_DOT;
- if (c >= '0' && c <= '9')
- return (IN6PTON_DIGIT | (c - '0'));
- return IN6PTON_UNKNOWN;
-}
+ int val;
-static inline int xdigit2bin(char c, char delim)
-{
if (c == delim || c == '\0')
return IN6PTON_DELIM;
if (c == ':')
return IN6PTON_COLON_MASK;
if (c == '.')
return IN6PTON_DOT;
- if (c >= '0' && c <= '9')
- return (IN6PTON_XDIGIT | IN6PTON_DIGIT| (c - '0'));
- if (c >= 'a' && c <= 'f')
- return (IN6PTON_XDIGIT | (c - 'a' + 10));
- if (c >= 'A' && c <= 'F')
- return (IN6PTON_XDIGIT | (c - 'A' + 10));
+
+ val = hex_to_bin(c);
+ if (val >= 0)
+ return val | IN6PTON_XDIGIT | (val < 10 ? IN6PTON_DIGIT : 0);
+
+ if (delim == -1)
+ return IN6PTON_DELIM;
return IN6PTON_UNKNOWN;
}
+/**
+ * in4_pton - convert an IPv4 address from literal to binary representation
+ * @src: the start of the IPv4 address string
+ * @srclen: the length of the string, -1 means strlen(src)
+ * @dst: the binary (u8[4] array) representation of the IPv4 address
+ * @delim: the delimiter of the IPv4 address in @src, -1 means no delimiter
+ * @end: A pointer to the end of the parsed string will be placed here
+ *
+ * Return one on success, return zero when any error occurs
+ * and @end will point to the end of the parsed string.
+ *
+ */
int in4_pton(const char *src, int srclen,
u8 *dst,
- char delim, const char **end)
+ int delim, const char **end)
{
const char *s;
u8 *d;
@@ -132,19 +131,19 @@ int in4_pton(const char *src, int srclen,
s = src;
d = dbuf;
i = 0;
- while(1) {
+ while (1) {
int c;
c = xdigit2bin(srclen > 0 ? *s : '\0', delim);
- if (!(c & (IN6PTON_DIGIT | IN6PTON_DOT | IN6PTON_DELIM))) {
+ if (!(c & (IN6PTON_DIGIT | IN6PTON_DOT | IN6PTON_DELIM | IN6PTON_COLON_MASK))) {
goto out;
}
- if (c & (IN6PTON_DOT | IN6PTON_DELIM)) {
+ if (c & (IN6PTON_DOT | IN6PTON_DELIM | IN6PTON_COLON_MASK)) {
if (w == 0)
goto out;
*d++ = w & 0xff;
w = 0;
i++;
- if (c & IN6PTON_DELIM) {
+ if (c & (IN6PTON_DELIM | IN6PTON_COLON_MASK)) {
if (i != 4)
goto out;
break;
@@ -168,12 +167,23 @@ out:
*end = s;
return ret;
}
-
EXPORT_SYMBOL(in4_pton);
+/**
+ * in6_pton - convert an IPv6 address from literal to binary representation
+ * @src: the start of the IPv6 address string
+ * @srclen: the length of the string, -1 means strlen(src)
+ * @dst: the binary (u8[16] array) representation of the IPv6 address
+ * @delim: the delimiter of the IPv6 address in @src, -1 means no delimiter
+ * @end: A pointer to the end of the parsed string will be placed here
+ *
+ * Return one on success, return zero when any error occurs
+ * and @end will point to the end of the parsed string.
+ *
+ */
int in6_pton(const char *src, int srclen,
u8 *dst,
- char delim, const char **end)
+ int delim, const char **end)
{
const char *s, *tok = NULL;
u8 *d, *dc = NULL;
@@ -271,11 +281,11 @@ cont:
i = 15; d--;
if (dc) {
- while(d >= dc)
+ while (d >= dc)
dst[i--] = *d--;
- while(i >= dc - dbuf)
+ while (i >= dc - dbuf)
dst[i--] = 0;
- while(i >= 0)
+ while (i >= 0)
dst[i--] = *d--;
} else
memcpy(dst, dbuf, sizeof(dbuf));
@@ -286,5 +296,191 @@ out:
*end = s;
return ret;
}
-
EXPORT_SYMBOL(in6_pton);
+
+static int inet4_pton(const char *src, u16 port_num,
+ struct sockaddr_storage *addr)
+{
+ struct sockaddr_in *addr4 = (struct sockaddr_in *)addr;
+ size_t srclen = strlen(src);
+
+ if (srclen > INET_ADDRSTRLEN)
+ return -EINVAL;
+
+ if (in4_pton(src, srclen, (u8 *)&addr4->sin_addr.s_addr,
+ '\n', NULL) == 0)
+ return -EINVAL;
+
+ addr4->sin_family = AF_INET;
+ addr4->sin_port = htons(port_num);
+
+ return 0;
+}
+
+static int inet6_pton(struct net *net, const char *src, u16 port_num,
+ struct sockaddr_storage *addr)
+{
+ struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)addr;
+ const char *scope_delim;
+ size_t srclen = strlen(src);
+
+ if (srclen > INET6_ADDRSTRLEN)
+ return -EINVAL;
+
+ if (in6_pton(src, srclen, (u8 *)&addr6->sin6_addr.s6_addr,
+ '%', &scope_delim) == 0)
+ return -EINVAL;
+
+ if (ipv6_addr_type(&addr6->sin6_addr) & IPV6_ADDR_LINKLOCAL &&
+ src + srclen != scope_delim && *scope_delim == '%') {
+ struct net_device *dev;
+ char scope_id[16];
+ size_t scope_len = min_t(size_t, sizeof(scope_id) - 1,
+ src + srclen - scope_delim - 1);
+
+ memcpy(scope_id, scope_delim + 1, scope_len);
+ scope_id[scope_len] = '\0';
+
+ dev = dev_get_by_name(net, scope_id);
+ if (dev) {
+ addr6->sin6_scope_id = dev->ifindex;
+ dev_put(dev);
+ } else if (kstrtouint(scope_id, 0, &addr6->sin6_scope_id)) {
+ return -EINVAL;
+ }
+ }
+
+ addr6->sin6_family = AF_INET6;
+ addr6->sin6_port = htons(port_num);
+
+ return 0;
+}
+
+/**
+ * inet_pton_with_scope - convert an IPv4/IPv6 and port to socket address
+ * @net: net namespace (used for scope handling)
+ * @af: address family, AF_INET, AF_INET6 or AF_UNSPEC for either
+ * @src: the start of the address string
+ * @port: the start of the port string (or NULL for none)
+ * @addr: output socket address
+ *
+ * Return zero on success, return errno when any error occurs.
+ */
+int inet_pton_with_scope(struct net *net, __kernel_sa_family_t af,
+ const char *src, const char *port, struct sockaddr_storage *addr)
+{
+ u16 port_num;
+ int ret = -EINVAL;
+
+ if (port) {
+ if (kstrtou16(port, 0, &port_num))
+ return -EINVAL;
+ } else {
+ port_num = 0;
+ }
+
+ switch (af) {
+ case AF_INET:
+ ret = inet4_pton(src, port_num, addr);
+ break;
+ case AF_INET6:
+ ret = inet6_pton(net, src, port_num, addr);
+ break;
+ case AF_UNSPEC:
+ ret = inet4_pton(src, port_num, addr);
+ if (ret)
+ ret = inet6_pton(net, src, port_num, addr);
+ break;
+ default:
+ pr_err("unexpected address family %d\n", af);
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(inet_pton_with_scope);
+
+bool inet_addr_is_any(struct sockaddr_storage *addr)
+{
+ if (addr->ss_family == AF_INET6) {
+ struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)addr;
+ const struct sockaddr_in6 in6_any =
+ { .sin6_addr = IN6ADDR_ANY_INIT };
+
+ if (!memcmp(in6->sin6_addr.s6_addr,
+ in6_any.sin6_addr.s6_addr, 16))
+ return true;
+ } else if (addr->ss_family == AF_INET) {
+ struct sockaddr_in *in = (struct sockaddr_in *)addr;
+
+ if (in->sin_addr.s_addr == htonl(INADDR_ANY))
+ return true;
+ } else {
+ pr_warn("unexpected address family %u\n", addr->ss_family);
+ }
+
+ return false;
+}
+EXPORT_SYMBOL(inet_addr_is_any);
+
+void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb,
+ __be32 from, __be32 to, bool pseudohdr)
+{
+ if (skb->ip_summed != CHECKSUM_PARTIAL) {
+ csum_replace4(sum, from, to);
+ if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr)
+ skb->csum = ~csum_add(csum_sub(~(skb->csum),
+ (__force __wsum)from),
+ (__force __wsum)to);
+ } else if (pseudohdr)
+ *sum = ~csum_fold(csum_add(csum_sub(csum_unfold(*sum),
+ (__force __wsum)from),
+ (__force __wsum)to));
+}
+EXPORT_SYMBOL(inet_proto_csum_replace4);
+
+/**
+ * inet_proto_csum_replace16 - update layer 4 header checksum field
+ * @sum: Layer 4 header checksum field
+ * @skb: sk_buff for the packet
+ * @from: old IPv6 address
+ * @to: new IPv6 address
+ * @pseudohdr: True if layer 4 header checksum includes pseudoheader
+ *
+ * Update layer 4 header as per the update in IPv6 src/dst address.
+ *
+ * There is no need to update skb->csum in this function, because update in two
+ * fields a.) IPv6 src/dst address and b.) L4 header checksum cancels each other
+ * for skb->csum calculation. Whereas inet_proto_csum_replace4 function needs to
+ * update skb->csum, because update in 3 fields a.) IPv4 src/dst address,
+ * b.) IPv4 Header checksum and c.) L4 header checksum results in same diff as
+ * L4 Header checksum for skb->csum calculation.
+ */
+void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb,
+ const __be32 *from, const __be32 *to,
+ bool pseudohdr)
+{
+ __be32 diff[] = {
+ ~from[0], ~from[1], ~from[2], ~from[3],
+ to[0], to[1], to[2], to[3],
+ };
+ if (skb->ip_summed != CHECKSUM_PARTIAL) {
+ *sum = csum_fold(csum_partial(diff, sizeof(diff),
+ ~csum_unfold(*sum)));
+ } else if (pseudohdr)
+ *sum = ~csum_fold(csum_partial(diff, sizeof(diff),
+ csum_unfold(*sum)));
+}
+EXPORT_SYMBOL(inet_proto_csum_replace16);
+
+void inet_proto_csum_replace_by_diff(__sum16 *sum, struct sk_buff *skb,
+ __wsum diff, bool pseudohdr, bool ipv6)
+{
+ if (skb->ip_summed != CHECKSUM_PARTIAL) {
+ csum_replace_by_diff(sum, diff);
+ if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr && !ipv6)
+ skb->csum = ~csum_sub(diff, skb->csum);
+ } else if (pseudohdr) {
+ *sum = ~csum_fold(csum_add(diff, csum_unfold(*sum)));
+ }
+}
+EXPORT_SYMBOL(inet_proto_csum_replace_by_diff);
diff --git a/net/core/wireless.c b/net/core/wireless.c
deleted file mode 100644
index cb1b8728d7ee..000000000000
--- a/net/core/wireless.c
+++ /dev/null
@@ -1,2353 +0,0 @@
-/*
- * This file implement the Wireless Extensions APIs.
- *
- * Authors : Jean Tourrilhes - HPL - <jt@hpl.hp.com>
- * Copyright (c) 1997-2006 Jean Tourrilhes, All Rights Reserved.
- *
- * (As all part of the Linux kernel, this file is GPL)
- */
-
-/************************** DOCUMENTATION **************************/
-/*
- * API definition :
- * --------------
- * See <linux/wireless.h> for details of the APIs and the rest.
- *
- * History :
- * -------
- *
- * v1 - 5.12.01 - Jean II
- * o Created this file.
- *
- * v2 - 13.12.01 - Jean II
- * o Move /proc/net/wireless stuff from net/core/dev.c to here
- * o Make Wireless Extension IOCTLs go through here
- * o Added iw_handler handling ;-)
- * o Added standard ioctl description
- * o Initial dumb commit strategy based on orinoco.c
- *
- * v3 - 19.12.01 - Jean II
- * o Make sure we don't go out of standard_ioctl[] in ioctl_standard_call
- * o Add event dispatcher function
- * o Add event description
- * o Propagate events as rtnetlink IFLA_WIRELESS option
- * o Generate event on selected SET requests
- *
- * v4 - 18.04.02 - Jean II
- * o Fix stupid off by one in iw_ioctl_description : IW_ESSID_MAX_SIZE + 1
- *
- * v5 - 21.06.02 - Jean II
- * o Add IW_PRIV_TYPE_ADDR in priv_type_size (+cleanup)
- * o Reshuffle IW_HEADER_TYPE_XXX to map IW_PRIV_TYPE_XXX changes
- * o Add IWEVCUSTOM for driver specific event/scanning token
- * o Turn on WE_STRICT_WRITE by default + kernel warning
- * o Fix WE_STRICT_WRITE in ioctl_export_private() (32 => iw_num)
- * o Fix off-by-one in test (extra_size <= IFNAMSIZ)
- *
- * v6 - 9.01.03 - Jean II
- * o Add common spy support : iw_handler_set_spy(), wireless_spy_update()
- * o Add enhanced spy support : iw_handler_set_thrspy() and event.
- * o Add WIRELESS_EXT version display in /proc/net/wireless
- *
- * v6 - 18.06.04 - Jean II
- * o Change get_spydata() method for added safety
- * o Remove spy #ifdef, they are always on -> cleaner code
- * o Allow any size GET request if user specifies length > max
- * and if request has IW_DESCR_FLAG_NOMAX flag or is SIOCGIWPRIV
- * o Start migrating get_wireless_stats to struct iw_handler_def
- * o Add wmb() in iw_handler_set_spy() for non-coherent archs/cpus
- * Based on patch from Pavel Roskin <proski@gnu.org> :
- * o Fix kernel data leak to user space in private handler handling
- *
- * v7 - 18.3.05 - Jean II
- * o Remove (struct iw_point *)->pointer from events and streams
- * o Remove spy_offset from struct iw_handler_def
- * o Start deprecating dev->get_wireless_stats, output a warning
- * o If IW_QUAL_DBM is set, show dBm values in /proc/net/wireless
- * o Don't loose INVALID/DBM flags when clearing UPDATED flags (iwstats)
- *
- * v8 - 17.02.06 - Jean II
- * o RtNetlink requests support (SET/GET)
- *
- * v8b - 03.08.06 - Herbert Xu
- * o Fix Wireless Event locking issues.
- *
- * v9 - 14.3.06 - Jean II
- * o Change length in ESSID and NICK to strlen() instead of strlen()+1
- * o Make standard_ioctl_num and standard_event_num unsigned
- * o Remove (struct net_device *)->get_wireless_stats()
- */
-
-/***************************** INCLUDES *****************************/
-
-#include <linux/module.h>
-#include <linux/types.h> /* off_t */
-#include <linux/netdevice.h> /* struct ifreq, dev_get_by_name() */
-#include <linux/proc_fs.h>
-#include <linux/rtnetlink.h> /* rtnetlink stuff */
-#include <linux/seq_file.h>
-#include <linux/init.h> /* for __init */
-#include <linux/if_arp.h> /* ARPHRD_ETHER */
-#include <linux/etherdevice.h> /* compare_ether_addr */
-#include <linux/interrupt.h>
-
-#include <linux/wireless.h> /* Pretty obvious */
-#include <net/iw_handler.h> /* New driver API */
-#include <net/netlink.h>
-
-#include <asm/uaccess.h> /* copy_to_user() */
-
-/**************************** CONSTANTS ****************************/
-
-/* Debugging stuff */
-#undef WE_IOCTL_DEBUG /* Debug IOCTL API */
-#undef WE_RTNETLINK_DEBUG /* Debug RtNetlink API */
-#undef WE_EVENT_DEBUG /* Debug Event dispatcher */
-#undef WE_SPY_DEBUG /* Debug enhanced spy support */
-
-/* Options */
-//CONFIG_NET_WIRELESS_RTNETLINK /* Wireless requests over RtNetlink */
-#define WE_EVENT_RTNETLINK /* Propagate events using RtNetlink */
-#define WE_SET_EVENT /* Generate an event on some set commands */
-
-/************************* GLOBAL VARIABLES *************************/
-/*
- * You should not use global variables, because of re-entrancy.
- * On our case, it's only const, so it's OK...
- */
-/*
- * Meta-data about all the standard Wireless Extension request we
- * know about.
- */
-static const struct iw_ioctl_description standard_ioctl[] = {
- [SIOCSIWCOMMIT - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_NULL,
- },
- [SIOCGIWNAME - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_CHAR,
- .flags = IW_DESCR_FLAG_DUMP,
- },
- [SIOCSIWNWID - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_PARAM,
- .flags = IW_DESCR_FLAG_EVENT,
- },
- [SIOCGIWNWID - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_PARAM,
- .flags = IW_DESCR_FLAG_DUMP,
- },
- [SIOCSIWFREQ - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_FREQ,
- .flags = IW_DESCR_FLAG_EVENT,
- },
- [SIOCGIWFREQ - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_FREQ,
- .flags = IW_DESCR_FLAG_DUMP,
- },
- [SIOCSIWMODE - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_UINT,
- .flags = IW_DESCR_FLAG_EVENT,
- },
- [SIOCGIWMODE - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_UINT,
- .flags = IW_DESCR_FLAG_DUMP,
- },
- [SIOCSIWSENS - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_PARAM,
- },
- [SIOCGIWSENS - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_PARAM,
- },
- [SIOCSIWRANGE - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_NULL,
- },
- [SIOCGIWRANGE - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_POINT,
- .token_size = 1,
- .max_tokens = sizeof(struct iw_range),
- .flags = IW_DESCR_FLAG_DUMP,
- },
- [SIOCSIWPRIV - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_NULL,
- },
- [SIOCGIWPRIV - SIOCIWFIRST] = { /* (handled directly by us) */
- .header_type = IW_HEADER_TYPE_POINT,
- .token_size = sizeof(struct iw_priv_args),
- .max_tokens = 16,
- .flags = IW_DESCR_FLAG_NOMAX,
- },
- [SIOCSIWSTATS - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_NULL,
- },
- [SIOCGIWSTATS - SIOCIWFIRST] = { /* (handled directly by us) */
- .header_type = IW_HEADER_TYPE_POINT,
- .token_size = 1,
- .max_tokens = sizeof(struct iw_statistics),
- .flags = IW_DESCR_FLAG_DUMP,
- },
- [SIOCSIWSPY - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_POINT,
- .token_size = sizeof(struct sockaddr),
- .max_tokens = IW_MAX_SPY,
- },
- [SIOCGIWSPY - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_POINT,
- .token_size = sizeof(struct sockaddr) +
- sizeof(struct iw_quality),
- .max_tokens = IW_MAX_SPY,
- },
- [SIOCSIWTHRSPY - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_POINT,
- .token_size = sizeof(struct iw_thrspy),
- .min_tokens = 1,
- .max_tokens = 1,
- },
- [SIOCGIWTHRSPY - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_POINT,
- .token_size = sizeof(struct iw_thrspy),
- .min_tokens = 1,
- .max_tokens = 1,
- },
- [SIOCSIWAP - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_ADDR,
- },
- [SIOCGIWAP - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_ADDR,
- .flags = IW_DESCR_FLAG_DUMP,
- },
- [SIOCSIWMLME - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_POINT,
- .token_size = 1,
- .min_tokens = sizeof(struct iw_mlme),
- .max_tokens = sizeof(struct iw_mlme),
- },
- [SIOCGIWAPLIST - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_POINT,
- .token_size = sizeof(struct sockaddr) +
- sizeof(struct iw_quality),
- .max_tokens = IW_MAX_AP,
- .flags = IW_DESCR_FLAG_NOMAX,
- },
- [SIOCSIWSCAN - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_POINT,
- .token_size = 1,
- .min_tokens = 0,
- .max_tokens = sizeof(struct iw_scan_req),
- },
- [SIOCGIWSCAN - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_POINT,
- .token_size = 1,
- .max_tokens = IW_SCAN_MAX_DATA,
- .flags = IW_DESCR_FLAG_NOMAX,
- },
- [SIOCSIWESSID - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_POINT,
- .token_size = 1,
- .max_tokens = IW_ESSID_MAX_SIZE,
- .flags = IW_DESCR_FLAG_EVENT,
- },
- [SIOCGIWESSID - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_POINT,
- .token_size = 1,
- .max_tokens = IW_ESSID_MAX_SIZE,
- .flags = IW_DESCR_FLAG_DUMP,
- },
- [SIOCSIWNICKN - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_POINT,
- .token_size = 1,
- .max_tokens = IW_ESSID_MAX_SIZE,
- },
- [SIOCGIWNICKN - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_POINT,
- .token_size = 1,
- .max_tokens = IW_ESSID_MAX_SIZE,
- },
- [SIOCSIWRATE - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_PARAM,
- },
- [SIOCGIWRATE - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_PARAM,
- },
- [SIOCSIWRTS - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_PARAM,
- },
- [SIOCGIWRTS - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_PARAM,
- },
- [SIOCSIWFRAG - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_PARAM,
- },
- [SIOCGIWFRAG - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_PARAM,
- },
- [SIOCSIWTXPOW - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_PARAM,
- },
- [SIOCGIWTXPOW - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_PARAM,
- },
- [SIOCSIWRETRY - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_PARAM,
- },
- [SIOCGIWRETRY - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_PARAM,
- },
- [SIOCSIWENCODE - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_POINT,
- .token_size = 1,
- .max_tokens = IW_ENCODING_TOKEN_MAX,
- .flags = IW_DESCR_FLAG_EVENT | IW_DESCR_FLAG_RESTRICT,
- },
- [SIOCGIWENCODE - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_POINT,
- .token_size = 1,
- .max_tokens = IW_ENCODING_TOKEN_MAX,
- .flags = IW_DESCR_FLAG_DUMP | IW_DESCR_FLAG_RESTRICT,
- },
- [SIOCSIWPOWER - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_PARAM,
- },
- [SIOCGIWPOWER - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_PARAM,
- },
- [SIOCSIWGENIE - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_POINT,
- .token_size = 1,
- .max_tokens = IW_GENERIC_IE_MAX,
- },
- [SIOCGIWGENIE - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_POINT,
- .token_size = 1,
- .max_tokens = IW_GENERIC_IE_MAX,
- },
- [SIOCSIWAUTH - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_PARAM,
- },
- [SIOCGIWAUTH - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_PARAM,
- },
- [SIOCSIWENCODEEXT - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_POINT,
- .token_size = 1,
- .min_tokens = sizeof(struct iw_encode_ext),
- .max_tokens = sizeof(struct iw_encode_ext) +
- IW_ENCODING_TOKEN_MAX,
- },
- [SIOCGIWENCODEEXT - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_POINT,
- .token_size = 1,
- .min_tokens = sizeof(struct iw_encode_ext),
- .max_tokens = sizeof(struct iw_encode_ext) +
- IW_ENCODING_TOKEN_MAX,
- },
- [SIOCSIWPMKSA - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_POINT,
- .token_size = 1,
- .min_tokens = sizeof(struct iw_pmksa),
- .max_tokens = sizeof(struct iw_pmksa),
- },
-};
-static const unsigned standard_ioctl_num = (sizeof(standard_ioctl) /
- sizeof(struct iw_ioctl_description));
-
-/*
- * Meta-data about all the additional standard Wireless Extension events
- * we know about.
- */
-static const struct iw_ioctl_description standard_event[] = {
- [IWEVTXDROP - IWEVFIRST] = {
- .header_type = IW_HEADER_TYPE_ADDR,
- },
- [IWEVQUAL - IWEVFIRST] = {
- .header_type = IW_HEADER_TYPE_QUAL,
- },
- [IWEVCUSTOM - IWEVFIRST] = {
- .header_type = IW_HEADER_TYPE_POINT,
- .token_size = 1,
- .max_tokens = IW_CUSTOM_MAX,
- },
- [IWEVREGISTERED - IWEVFIRST] = {
- .header_type = IW_HEADER_TYPE_ADDR,
- },
- [IWEVEXPIRED - IWEVFIRST] = {
- .header_type = IW_HEADER_TYPE_ADDR,
- },
- [IWEVGENIE - IWEVFIRST] = {
- .header_type = IW_HEADER_TYPE_POINT,
- .token_size = 1,
- .max_tokens = IW_GENERIC_IE_MAX,
- },
- [IWEVMICHAELMICFAILURE - IWEVFIRST] = {
- .header_type = IW_HEADER_TYPE_POINT,
- .token_size = 1,
- .max_tokens = sizeof(struct iw_michaelmicfailure),
- },
- [IWEVASSOCREQIE - IWEVFIRST] = {
- .header_type = IW_HEADER_TYPE_POINT,
- .token_size = 1,
- .max_tokens = IW_GENERIC_IE_MAX,
- },
- [IWEVASSOCRESPIE - IWEVFIRST] = {
- .header_type = IW_HEADER_TYPE_POINT,
- .token_size = 1,
- .max_tokens = IW_GENERIC_IE_MAX,
- },
- [IWEVPMKIDCAND - IWEVFIRST] = {
- .header_type = IW_HEADER_TYPE_POINT,
- .token_size = 1,
- .max_tokens = sizeof(struct iw_pmkid_cand),
- },
-};
-static const unsigned standard_event_num = (sizeof(standard_event) /
- sizeof(struct iw_ioctl_description));
-
-/* Size (in bytes) of the various private data types */
-static const char iw_priv_type_size[] = {
- 0, /* IW_PRIV_TYPE_NONE */
- 1, /* IW_PRIV_TYPE_BYTE */
- 1, /* IW_PRIV_TYPE_CHAR */
- 0, /* Not defined */
- sizeof(__u32), /* IW_PRIV_TYPE_INT */
- sizeof(struct iw_freq), /* IW_PRIV_TYPE_FLOAT */
- sizeof(struct sockaddr), /* IW_PRIV_TYPE_ADDR */
- 0, /* Not defined */
-};
-
-/* Size (in bytes) of various events */
-static const int event_type_size[] = {
- IW_EV_LCP_LEN, /* IW_HEADER_TYPE_NULL */
- 0,
- IW_EV_CHAR_LEN, /* IW_HEADER_TYPE_CHAR */
- 0,
- IW_EV_UINT_LEN, /* IW_HEADER_TYPE_UINT */
- IW_EV_FREQ_LEN, /* IW_HEADER_TYPE_FREQ */
- IW_EV_ADDR_LEN, /* IW_HEADER_TYPE_ADDR */
- 0,
- IW_EV_POINT_LEN, /* Without variable payload */
- IW_EV_PARAM_LEN, /* IW_HEADER_TYPE_PARAM */
- IW_EV_QUAL_LEN, /* IW_HEADER_TYPE_QUAL */
-};
-
-/************************ COMMON SUBROUTINES ************************/
-/*
- * Stuff that may be used in various place or doesn't fit in one
- * of the section below.
- */
-
-/* ---------------------------------------------------------------- */
-/*
- * Return the driver handler associated with a specific Wireless Extension.
- * Called from various place, so make sure it remains efficient.
- */
-static inline iw_handler get_handler(struct net_device *dev,
- unsigned int cmd)
-{
- /* Don't "optimise" the following variable, it will crash */
- unsigned int index; /* *MUST* be unsigned */
-
- /* Check if we have some wireless handlers defined */
- if(dev->wireless_handlers == NULL)
- return NULL;
-
- /* Try as a standard command */
- index = cmd - SIOCIWFIRST;
- if(index < dev->wireless_handlers->num_standard)
- return dev->wireless_handlers->standard[index];
-
- /* Try as a private command */
- index = cmd - SIOCIWFIRSTPRIV;
- if(index < dev->wireless_handlers->num_private)
- return dev->wireless_handlers->private[index];
-
- /* Not found */
- return NULL;
-}
-
-/* ---------------------------------------------------------------- */
-/*
- * Get statistics out of the driver
- */
-static inline struct iw_statistics *get_wireless_stats(struct net_device *dev)
-{
- /* New location */
- if((dev->wireless_handlers != NULL) &&
- (dev->wireless_handlers->get_wireless_stats != NULL))
- return dev->wireless_handlers->get_wireless_stats(dev);
-
- /* Not found */
- return (struct iw_statistics *) NULL;
-}
-
-/* ---------------------------------------------------------------- */
-/*
- * Call the commit handler in the driver
- * (if exist and if conditions are right)
- *
- * Note : our current commit strategy is currently pretty dumb,
- * but we will be able to improve on that...
- * The goal is to try to agreagate as many changes as possible
- * before doing the commit. Drivers that will define a commit handler
- * are usually those that need a reset after changing parameters, so
- * we want to minimise the number of reset.
- * A cool idea is to use a timer : at each "set" command, we re-set the
- * timer, when the timer eventually fires, we call the driver.
- * Hopefully, more on that later.
- *
- * Also, I'm waiting to see how many people will complain about the
- * netif_running(dev) test. I'm open on that one...
- * Hopefully, the driver will remember to do a commit in "open()" ;-)
- */
-static inline int call_commit_handler(struct net_device * dev)
-{
- if((netif_running(dev)) &&
- (dev->wireless_handlers->standard[0] != NULL)) {
- /* Call the commit handler on the driver */
- return dev->wireless_handlers->standard[0](dev, NULL,
- NULL, NULL);
- } else
- return 0; /* Command completed successfully */
-}
-
-/* ---------------------------------------------------------------- */
-/*
- * Calculate size of private arguments
- */
-static inline int get_priv_size(__u16 args)
-{
- int num = args & IW_PRIV_SIZE_MASK;
- int type = (args & IW_PRIV_TYPE_MASK) >> 12;
-
- return num * iw_priv_type_size[type];
-}
-
-/* ---------------------------------------------------------------- */
-/*
- * Re-calculate the size of private arguments
- */
-static inline int adjust_priv_size(__u16 args,
- union iwreq_data * wrqu)
-{
- int num = wrqu->data.length;
- int max = args & IW_PRIV_SIZE_MASK;
- int type = (args & IW_PRIV_TYPE_MASK) >> 12;
-
- /* Make sure the driver doesn't goof up */
- if (max < num)
- num = max;
-
- return num * iw_priv_type_size[type];
-}
-
-/* ---------------------------------------------------------------- */
-/*
- * Standard Wireless Handler : get wireless stats
- * Allow programatic access to /proc/net/wireless even if /proc
- * doesn't exist... Also more efficient...
- */
-static int iw_handler_get_iwstats(struct net_device * dev,
- struct iw_request_info * info,
- union iwreq_data * wrqu,
- char * extra)
-{
- /* Get stats from the driver */
- struct iw_statistics *stats;
-
- stats = get_wireless_stats(dev);
- if (stats != (struct iw_statistics *) NULL) {
-
- /* Copy statistics to extra */
- memcpy(extra, stats, sizeof(struct iw_statistics));
- wrqu->data.length = sizeof(struct iw_statistics);
-
- /* Check if we need to clear the updated flag */
- if(wrqu->data.flags != 0)
- stats->qual.updated &= ~IW_QUAL_ALL_UPDATED;
- return 0;
- } else
- return -EOPNOTSUPP;
-}
-
-/* ---------------------------------------------------------------- */
-/*
- * Standard Wireless Handler : get iwpriv definitions
- * Export the driver private handler definition
- * They will be picked up by tools like iwpriv...
- */
-static int iw_handler_get_private(struct net_device * dev,
- struct iw_request_info * info,
- union iwreq_data * wrqu,
- char * extra)
-{
- /* Check if the driver has something to export */
- if((dev->wireless_handlers->num_private_args == 0) ||
- (dev->wireless_handlers->private_args == NULL))
- return -EOPNOTSUPP;
-
- /* Check if there is enough buffer up there */
- if(wrqu->data.length < dev->wireless_handlers->num_private_args) {
- /* User space can't know in advance how large the buffer
- * needs to be. Give it a hint, so that we can support
- * any size buffer we want somewhat efficiently... */
- wrqu->data.length = dev->wireless_handlers->num_private_args;
- return -E2BIG;
- }
-
- /* Set the number of available ioctls. */
- wrqu->data.length = dev->wireless_handlers->num_private_args;
-
- /* Copy structure to the user buffer. */
- memcpy(extra, dev->wireless_handlers->private_args,
- sizeof(struct iw_priv_args) * wrqu->data.length);
-
- return 0;
-}
-
-
-/******************** /proc/net/wireless SUPPORT ********************/
-/*
- * The /proc/net/wireless file is a human readable user-space interface
- * exporting various wireless specific statistics from the wireless devices.
- * This is the most popular part of the Wireless Extensions ;-)
- *
- * This interface is a pure clone of /proc/net/dev (in net/core/dev.c).
- * The content of the file is basically the content of "struct iw_statistics".
- */
-
-#ifdef CONFIG_PROC_FS
-
-/* ---------------------------------------------------------------- */
-/*
- * Print one entry (line) of /proc/net/wireless
- */
-static __inline__ void wireless_seq_printf_stats(struct seq_file *seq,
- struct net_device *dev)
-{
- /* Get stats from the driver */
- struct iw_statistics *stats = get_wireless_stats(dev);
-
- if (stats) {
- seq_printf(seq, "%6s: %04x %3d%c %3d%c %3d%c %6d %6d %6d "
- "%6d %6d %6d\n",
- dev->name, stats->status, stats->qual.qual,
- stats->qual.updated & IW_QUAL_QUAL_UPDATED
- ? '.' : ' ',
- ((__s32) stats->qual.level) -
- ((stats->qual.updated & IW_QUAL_DBM) ? 0x100 : 0),
- stats->qual.updated & IW_QUAL_LEVEL_UPDATED
- ? '.' : ' ',
- ((__s32) stats->qual.noise) -
- ((stats->qual.updated & IW_QUAL_DBM) ? 0x100 : 0),
- stats->qual.updated & IW_QUAL_NOISE_UPDATED
- ? '.' : ' ',
- stats->discard.nwid, stats->discard.code,
- stats->discard.fragment, stats->discard.retries,
- stats->discard.misc, stats->miss.beacon);
- stats->qual.updated &= ~IW_QUAL_ALL_UPDATED;
- }
-}
-
-/* ---------------------------------------------------------------- */
-/*
- * Print info for /proc/net/wireless (print all entries)
- */
-static int wireless_seq_show(struct seq_file *seq, void *v)
-{
- if (v == SEQ_START_TOKEN)
- seq_printf(seq, "Inter-| sta-| Quality | Discarded "
- "packets | Missed | WE\n"
- " face | tus | link level noise | nwid "
- "crypt frag retry misc | beacon | %d\n",
- WIRELESS_EXT);
- else
- wireless_seq_printf_stats(seq, v);
- return 0;
-}
-
-static struct seq_operations wireless_seq_ops = {
- .start = dev_seq_start,
- .next = dev_seq_next,
- .stop = dev_seq_stop,
- .show = wireless_seq_show,
-};
-
-static int wireless_seq_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &wireless_seq_ops);
-}
-
-static struct file_operations wireless_seq_fops = {
- .owner = THIS_MODULE,
- .open = wireless_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
-int __init wireless_proc_init(void)
-{
- /* Create /proc/net/wireless entry */
- if (!proc_net_fops_create("wireless", S_IRUGO, &wireless_seq_fops))
- return -ENOMEM;
-
- return 0;
-}
-#endif /* CONFIG_PROC_FS */
-
-/************************** IOCTL SUPPORT **************************/
-/*
- * The original user space API to configure all those Wireless Extensions
- * is through IOCTLs.
- * In there, we check if we need to call the new driver API (iw_handler)
- * or just call the driver ioctl handler.
- */
-
-/* ---------------------------------------------------------------- */
-/*
- * Wrapper to call a standard Wireless Extension handler.
- * We do various checks and also take care of moving data between
- * user space and kernel space.
- */
-static int ioctl_standard_call(struct net_device * dev,
- struct ifreq * ifr,
- unsigned int cmd,
- iw_handler handler)
-{
- struct iwreq * iwr = (struct iwreq *) ifr;
- const struct iw_ioctl_description * descr;
- struct iw_request_info info;
- int ret = -EINVAL;
-
- /* Get the description of the IOCTL */
- if((cmd - SIOCIWFIRST) >= standard_ioctl_num)
- return -EOPNOTSUPP;
- descr = &(standard_ioctl[cmd - SIOCIWFIRST]);
-
-#ifdef WE_IOCTL_DEBUG
- printk(KERN_DEBUG "%s (WE) : Found standard handler for 0x%04X\n",
- ifr->ifr_name, cmd);
- printk(KERN_DEBUG "%s (WE) : Header type : %d, Token type : %d, size : %d, token : %d\n", dev->name, descr->header_type, descr->token_type, descr->token_size, descr->max_tokens);
-#endif /* WE_IOCTL_DEBUG */
-
- /* Prepare the call */
- info.cmd = cmd;
- info.flags = 0;
-
- /* Check if we have a pointer to user space data or not */
- if(descr->header_type != IW_HEADER_TYPE_POINT) {
-
- /* No extra arguments. Trivial to handle */
- ret = handler(dev, &info, &(iwr->u), NULL);
-
-#ifdef WE_SET_EVENT
- /* Generate an event to notify listeners of the change */
- if((descr->flags & IW_DESCR_FLAG_EVENT) &&
- ((ret == 0) || (ret == -EIWCOMMIT)))
- wireless_send_event(dev, cmd, &(iwr->u), NULL);
-#endif /* WE_SET_EVENT */
- } else {
- char * extra;
- int extra_size;
- int user_length = 0;
- int err;
- int essid_compat = 0;
-
- /* Calculate space needed by arguments. Always allocate
- * for max space. Easier, and won't last long... */
- extra_size = descr->max_tokens * descr->token_size;
-
- /* Check need for ESSID compatibility for WE < 21 */
- switch (cmd) {
- case SIOCSIWESSID:
- case SIOCGIWESSID:
- case SIOCSIWNICKN:
- case SIOCGIWNICKN:
- if (iwr->u.data.length == descr->max_tokens + 1)
- essid_compat = 1;
- else if (IW_IS_SET(cmd) && (iwr->u.data.length != 0)) {
- char essid[IW_ESSID_MAX_SIZE + 1];
-
- err = copy_from_user(essid, iwr->u.data.pointer,
- iwr->u.data.length *
- descr->token_size);
- if (err)
- return -EFAULT;
-
- if (essid[iwr->u.data.length - 1] == '\0')
- essid_compat = 1;
- }
- break;
- default:
- break;
- }
-
- iwr->u.data.length -= essid_compat;
-
- /* Check what user space is giving us */
- if(IW_IS_SET(cmd)) {
- /* Check NULL pointer */
- if((iwr->u.data.pointer == NULL) &&
- (iwr->u.data.length != 0))
- return -EFAULT;
- /* Check if number of token fits within bounds */
- if(iwr->u.data.length > descr->max_tokens)
- return -E2BIG;
- if(iwr->u.data.length < descr->min_tokens)
- return -EINVAL;
- } else {
- /* Check NULL pointer */
- if(iwr->u.data.pointer == NULL)
- return -EFAULT;
- /* Save user space buffer size for checking */
- user_length = iwr->u.data.length;
-
- /* Don't check if user_length > max to allow forward
- * compatibility. The test user_length < min is
- * implied by the test at the end. */
-
- /* Support for very large requests */
- if((descr->flags & IW_DESCR_FLAG_NOMAX) &&
- (user_length > descr->max_tokens)) {
- /* Allow userspace to GET more than max so
- * we can support any size GET requests.
- * There is still a limit : -ENOMEM. */
- extra_size = user_length * descr->token_size;
- /* Note : user_length is originally a __u16,
- * and token_size is controlled by us,
- * so extra_size won't get negative and
- * won't overflow... */
- }
- }
-
-#ifdef WE_IOCTL_DEBUG
- printk(KERN_DEBUG "%s (WE) : Malloc %d bytes\n",
- dev->name, extra_size);
-#endif /* WE_IOCTL_DEBUG */
-
- /* Create the kernel buffer */
- /* kzalloc ensures NULL-termination for essid_compat */
- extra = kzalloc(extra_size, GFP_KERNEL);
- if (extra == NULL) {
- return -ENOMEM;
- }
-
- /* If it is a SET, get all the extra data in here */
- if(IW_IS_SET(cmd) && (iwr->u.data.length != 0)) {
- err = copy_from_user(extra, iwr->u.data.pointer,
- iwr->u.data.length *
- descr->token_size);
- if (err) {
- kfree(extra);
- return -EFAULT;
- }
-#ifdef WE_IOCTL_DEBUG
- printk(KERN_DEBUG "%s (WE) : Got %d bytes\n",
- dev->name,
- iwr->u.data.length * descr->token_size);
-#endif /* WE_IOCTL_DEBUG */
- }
-
- /* Call the handler */
- ret = handler(dev, &info, &(iwr->u), extra);
-
- iwr->u.data.length += essid_compat;
-
- /* If we have something to return to the user */
- if (!ret && IW_IS_GET(cmd)) {
- /* Check if there is enough buffer up there */
- if(user_length < iwr->u.data.length) {
- kfree(extra);
- return -E2BIG;
- }
-
- err = copy_to_user(iwr->u.data.pointer, extra,
- iwr->u.data.length *
- descr->token_size);
- if (err)
- ret = -EFAULT;
-#ifdef WE_IOCTL_DEBUG
- printk(KERN_DEBUG "%s (WE) : Wrote %d bytes\n",
- dev->name,
- iwr->u.data.length * descr->token_size);
-#endif /* WE_IOCTL_DEBUG */
- }
-
-#ifdef WE_SET_EVENT
- /* Generate an event to notify listeners of the change */
- if((descr->flags & IW_DESCR_FLAG_EVENT) &&
- ((ret == 0) || (ret == -EIWCOMMIT))) {
- if(descr->flags & IW_DESCR_FLAG_RESTRICT)
- /* If the event is restricted, don't
- * export the payload */
- wireless_send_event(dev, cmd, &(iwr->u), NULL);
- else
- wireless_send_event(dev, cmd, &(iwr->u),
- extra);
- }
-#endif /* WE_SET_EVENT */
-
- /* Cleanup - I told you it wasn't that long ;-) */
- kfree(extra);
- }
-
- /* Call commit handler if needed and defined */
- if(ret == -EIWCOMMIT)
- ret = call_commit_handler(dev);
-
- /* Here, we will generate the appropriate event if needed */
-
- return ret;
-}
-
-/* ---------------------------------------------------------------- */
-/*
- * Wrapper to call a private Wireless Extension handler.
- * We do various checks and also take care of moving data between
- * user space and kernel space.
- * It's not as nice and slimline as the standard wrapper. The cause
- * is struct iw_priv_args, which was not really designed for the
- * job we are going here.
- *
- * IMPORTANT : This function prevent to set and get data on the same
- * IOCTL and enforce the SET/GET convention. Not doing it would be
- * far too hairy...
- * If you need to set and get data at the same time, please don't use
- * a iw_handler but process it in your ioctl handler (i.e. use the
- * old driver API).
- */
-static inline int ioctl_private_call(struct net_device * dev,
- struct ifreq * ifr,
- unsigned int cmd,
- iw_handler handler)
-{
- struct iwreq * iwr = (struct iwreq *) ifr;
- const struct iw_priv_args * descr = NULL;
- struct iw_request_info info;
- int extra_size = 0;
- int i;
- int ret = -EINVAL;
-
- /* Get the description of the IOCTL */
- for(i = 0; i < dev->wireless_handlers->num_private_args; i++)
- if(cmd == dev->wireless_handlers->private_args[i].cmd) {
- descr = &(dev->wireless_handlers->private_args[i]);
- break;
- }
-
-#ifdef WE_IOCTL_DEBUG
- printk(KERN_DEBUG "%s (WE) : Found private handler for 0x%04X\n",
- ifr->ifr_name, cmd);
- if(descr) {
- printk(KERN_DEBUG "%s (WE) : Name %s, set %X, get %X\n",
- dev->name, descr->name,
- descr->set_args, descr->get_args);
- }
-#endif /* WE_IOCTL_DEBUG */
-
- /* Compute the size of the set/get arguments */
- if(descr != NULL) {
- if(IW_IS_SET(cmd)) {
- int offset = 0; /* For sub-ioctls */
- /* Check for sub-ioctl handler */
- if(descr->name[0] == '\0')
- /* Reserve one int for sub-ioctl index */
- offset = sizeof(__u32);
-
- /* Size of set arguments */
- extra_size = get_priv_size(descr->set_args);
-
- /* Does it fits in iwr ? */
- if((descr->set_args & IW_PRIV_SIZE_FIXED) &&
- ((extra_size + offset) <= IFNAMSIZ))
- extra_size = 0;
- } else {
- /* Size of get arguments */
- extra_size = get_priv_size(descr->get_args);
-
- /* Does it fits in iwr ? */
- if((descr->get_args & IW_PRIV_SIZE_FIXED) &&
- (extra_size <= IFNAMSIZ))
- extra_size = 0;
- }
- }
-
- /* Prepare the call */
- info.cmd = cmd;
- info.flags = 0;
-
- /* Check if we have a pointer to user space data or not. */
- if(extra_size == 0) {
- /* No extra arguments. Trivial to handle */
- ret = handler(dev, &info, &(iwr->u), (char *) &(iwr->u));
- } else {
- char * extra;
- int err;
-
- /* Check what user space is giving us */
- if(IW_IS_SET(cmd)) {
- /* Check NULL pointer */
- if((iwr->u.data.pointer == NULL) &&
- (iwr->u.data.length != 0))
- return -EFAULT;
-
- /* Does it fits within bounds ? */
- if(iwr->u.data.length > (descr->set_args &
- IW_PRIV_SIZE_MASK))
- return -E2BIG;
- } else {
- /* Check NULL pointer */
- if(iwr->u.data.pointer == NULL)
- return -EFAULT;
- }
-
-#ifdef WE_IOCTL_DEBUG
- printk(KERN_DEBUG "%s (WE) : Malloc %d bytes\n",
- dev->name, extra_size);
-#endif /* WE_IOCTL_DEBUG */
-
- /* Always allocate for max space. Easier, and won't last
- * long... */
- extra = kmalloc(extra_size, GFP_KERNEL);
- if (extra == NULL) {
- return -ENOMEM;
- }
-
- /* If it is a SET, get all the extra data in here */
- if(IW_IS_SET(cmd) && (iwr->u.data.length != 0)) {
- err = copy_from_user(extra, iwr->u.data.pointer,
- extra_size);
- if (err) {
- kfree(extra);
- return -EFAULT;
- }
-#ifdef WE_IOCTL_DEBUG
- printk(KERN_DEBUG "%s (WE) : Got %d elem\n",
- dev->name, iwr->u.data.length);
-#endif /* WE_IOCTL_DEBUG */
- }
-
- /* Call the handler */
- ret = handler(dev, &info, &(iwr->u), extra);
-
- /* If we have something to return to the user */
- if (!ret && IW_IS_GET(cmd)) {
-
- /* Adjust for the actual length if it's variable,
- * avoid leaking kernel bits outside. */
- if (!(descr->get_args & IW_PRIV_SIZE_FIXED)) {
- extra_size = adjust_priv_size(descr->get_args,
- &(iwr->u));
- }
-
- err = copy_to_user(iwr->u.data.pointer, extra,
- extra_size);
- if (err)
- ret = -EFAULT;
-#ifdef WE_IOCTL_DEBUG
- printk(KERN_DEBUG "%s (WE) : Wrote %d elem\n",
- dev->name, iwr->u.data.length);
-#endif /* WE_IOCTL_DEBUG */
- }
-
- /* Cleanup - I told you it wasn't that long ;-) */
- kfree(extra);
- }
-
-
- /* Call commit handler if needed and defined */
- if(ret == -EIWCOMMIT)
- ret = call_commit_handler(dev);
-
- return ret;
-}
-
-/* ---------------------------------------------------------------- */
-/*
- * Main IOCTl dispatcher. Called from the main networking code
- * (dev_ioctl() in net/core/dev.c).
- * Check the type of IOCTL and call the appropriate wrapper...
- */
-int wireless_process_ioctl(struct ifreq *ifr, unsigned int cmd)
-{
- struct net_device *dev;
- iw_handler handler;
-
- /* Permissions are already checked in dev_ioctl() before calling us.
- * The copy_to/from_user() of ifr is also dealt with in there */
-
- /* Make sure the device exist */
- if ((dev = __dev_get_by_name(ifr->ifr_name)) == NULL)
- return -ENODEV;
-
- /* A bunch of special cases, then the generic case...
- * Note that 'cmd' is already filtered in dev_ioctl() with
- * (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) */
- switch(cmd)
- {
- case SIOCGIWSTATS:
- /* Get Wireless Stats */
- return ioctl_standard_call(dev,
- ifr,
- cmd,
- &iw_handler_get_iwstats);
-
- case SIOCGIWPRIV:
- /* Check if we have some wireless handlers defined */
- if(dev->wireless_handlers != NULL) {
- /* We export to user space the definition of
- * the private handler ourselves */
- return ioctl_standard_call(dev,
- ifr,
- cmd,
- &iw_handler_get_private);
- }
- // ## Fall-through for old API ##
- default:
- /* Generic IOCTL */
- /* Basic check */
- if (!netif_device_present(dev))
- return -ENODEV;
- /* New driver API : try to find the handler */
- handler = get_handler(dev, cmd);
- if(handler != NULL) {
- /* Standard and private are not the same */
- if(cmd < SIOCIWFIRSTPRIV)
- return ioctl_standard_call(dev,
- ifr,
- cmd,
- handler);
- else
- return ioctl_private_call(dev,
- ifr,
- cmd,
- handler);
- }
- /* Old driver API : call driver ioctl handler */
- if (dev->do_ioctl) {
- return dev->do_ioctl(dev, ifr, cmd);
- }
- return -EOPNOTSUPP;
- }
- /* Not reached */
- return -EINVAL;
-}
-
-/********************** RTNETLINK REQUEST API **********************/
-/*
- * The alternate user space API to configure all those Wireless Extensions
- * is through RtNetlink.
- * This API support only the new driver API (iw_handler).
- *
- * This RtNetlink API use the same query/reply model as the ioctl API.
- * Maximum effort has been done to fit in the RtNetlink model, and
- * we support both RtNetlink Set and RtNelink Get operations.
- * On the other hand, we don't offer Dump operations because of the
- * following reasons :
- * o Large number of parameters, most optional
- * o Large size of some parameters (> 100 bytes)
- * o Each parameters need to be extracted from hardware
- * o Scan requests can take seconds and disable network activity.
- * Because of this high cost/overhead, we want to return only the
- * parameters the user application is really interested in.
- * We could offer partial Dump using the IW_DESCR_FLAG_DUMP flag.
- *
- * The API uses the standard RtNetlink socket. When the RtNetlink code
- * find a IFLA_WIRELESS field in a RtNetlink SET_LINK request,
- * it calls here.
- */
-
-#ifdef CONFIG_NET_WIRELESS_RTNETLINK
-/* ---------------------------------------------------------------- */
-/*
- * Wrapper to call a standard Wireless Extension GET handler.
- * We do various checks and call the handler with the proper args.
- */
-static int rtnetlink_standard_get(struct net_device * dev,
- struct iw_event * request,
- int request_len,
- iw_handler handler,
- char ** p_buf,
- int * p_len)
-{
- const struct iw_ioctl_description * descr = NULL;
- unsigned int cmd;
- union iwreq_data * wrqu;
- int hdr_len;
- struct iw_request_info info;
- char * buffer = NULL;
- int buffer_size = 0;
- int ret = -EINVAL;
-
- /* Get the description of the Request */
- cmd = request->cmd;
- if((cmd - SIOCIWFIRST) >= standard_ioctl_num)
- return -EOPNOTSUPP;
- descr = &(standard_ioctl[cmd - SIOCIWFIRST]);
-
-#ifdef WE_RTNETLINK_DEBUG
- printk(KERN_DEBUG "%s (WE.r) : Found standard handler for 0x%04X\n",
- dev->name, cmd);
- printk(KERN_DEBUG "%s (WE.r) : Header type : %d, Token type : %d, size : %d, token : %d\n", dev->name, descr->header_type, descr->token_type, descr->token_size, descr->max_tokens);
-#endif /* WE_RTNETLINK_DEBUG */
-
- /* Check if wrqu is complete */
- hdr_len = event_type_size[descr->header_type];
- if(request_len < hdr_len) {
-#ifdef WE_RTNETLINK_DEBUG
- printk(KERN_DEBUG
- "%s (WE.r) : Wireless request too short (%d)\n",
- dev->name, request_len);
-#endif /* WE_RTNETLINK_DEBUG */
- return -EINVAL;
- }
-
- /* Prepare the call */
- info.cmd = cmd;
- info.flags = 0;
-
- /* Check if we have extra data in the reply or not */
- if(descr->header_type != IW_HEADER_TYPE_POINT) {
-
- /* Create the kernel buffer that we will return.
- * It's at an offset to match the TYPE_POINT case... */
- buffer_size = request_len + IW_EV_POINT_OFF;
- buffer = kmalloc(buffer_size, GFP_KERNEL);
- if (buffer == NULL) {
- return -ENOMEM;
- }
- /* Copy event data */
- memcpy(buffer + IW_EV_POINT_OFF, request, request_len);
- /* Use our own copy of wrqu */
- wrqu = (union iwreq_data *) (buffer + IW_EV_POINT_OFF
- + IW_EV_LCP_LEN);
-
- /* No extra arguments. Trivial to handle */
- ret = handler(dev, &info, wrqu, NULL);
-
- } else {
- union iwreq_data wrqu_point;
- char * extra = NULL;
- int extra_size = 0;
-
- /* Get a temp copy of wrqu (skip pointer) */
- memcpy(((char *) &wrqu_point) + IW_EV_POINT_OFF,
- ((char *) request) + IW_EV_LCP_LEN,
- IW_EV_POINT_LEN - IW_EV_LCP_LEN);
-
- /* Calculate space needed by arguments. Always allocate
- * for max space. Easier, and won't last long... */
- extra_size = descr->max_tokens * descr->token_size;
- /* Support for very large requests */
- if((descr->flags & IW_DESCR_FLAG_NOMAX) &&
- (wrqu_point.data.length > descr->max_tokens))
- extra_size = (wrqu_point.data.length
- * descr->token_size);
- buffer_size = extra_size + IW_EV_POINT_LEN + IW_EV_POINT_OFF;
-#ifdef WE_RTNETLINK_DEBUG
- printk(KERN_DEBUG "%s (WE.r) : Malloc %d bytes (%d bytes)\n",
- dev->name, extra_size, buffer_size);
-#endif /* WE_RTNETLINK_DEBUG */
-
- /* Create the kernel buffer that we will return */
- buffer = kmalloc(buffer_size, GFP_KERNEL);
- if (buffer == NULL) {
- return -ENOMEM;
- }
-
- /* Put wrqu in the right place (just before extra).
- * Leave space for IWE header and dummy pointer...
- * Note that IW_EV_LCP_LEN==4 bytes, so it's still aligned...
- */
- memcpy(buffer + IW_EV_LCP_LEN + IW_EV_POINT_OFF,
- ((char *) &wrqu_point) + IW_EV_POINT_OFF,
- IW_EV_POINT_LEN - IW_EV_LCP_LEN);
- wrqu = (union iwreq_data *) (buffer + IW_EV_LCP_LEN);
-
- /* Extra comes logically after that. Offset +12 bytes. */
- extra = buffer + IW_EV_POINT_OFF + IW_EV_POINT_LEN;
-
- /* Call the handler */
- ret = handler(dev, &info, wrqu, extra);
-
- /* Calculate real returned length */
- extra_size = (wrqu->data.length * descr->token_size);
- /* Re-adjust reply size */
- request->len = extra_size + IW_EV_POINT_LEN;
-
- /* Put the iwe header where it should, i.e. scrap the
- * dummy pointer. */
- memcpy(buffer + IW_EV_POINT_OFF, request, IW_EV_LCP_LEN);
-
-#ifdef WE_RTNETLINK_DEBUG
- printk(KERN_DEBUG "%s (WE.r) : Reply 0x%04X, hdr_len %d, tokens %d, extra_size %d, buffer_size %d\n", dev->name, cmd, hdr_len, wrqu->data.length, extra_size, buffer_size);
-#endif /* WE_RTNETLINK_DEBUG */
-
- /* Check if there is enough buffer up there */
- if(wrqu_point.data.length < wrqu->data.length)
- ret = -E2BIG;
- }
-
- /* Return the buffer to the caller */
- if (!ret) {
- *p_buf = buffer;
- *p_len = request->len;
- } else {
- /* Cleanup */
- if(buffer)
- kfree(buffer);
- }
-
- return ret;
-}
-
-/* ---------------------------------------------------------------- */
-/*
- * Wrapper to call a standard Wireless Extension SET handler.
- * We do various checks and call the handler with the proper args.
- */
-static inline int rtnetlink_standard_set(struct net_device * dev,
- struct iw_event * request,
- int request_len,
- iw_handler handler)
-{
- const struct iw_ioctl_description * descr = NULL;
- unsigned int cmd;
- union iwreq_data * wrqu;
- union iwreq_data wrqu_point;
- int hdr_len;
- char * extra = NULL;
- int extra_size = 0;
- struct iw_request_info info;
- int ret = -EINVAL;
-
- /* Get the description of the Request */
- cmd = request->cmd;
- if((cmd - SIOCIWFIRST) >= standard_ioctl_num)
- return -EOPNOTSUPP;
- descr = &(standard_ioctl[cmd - SIOCIWFIRST]);
-
-#ifdef WE_RTNETLINK_DEBUG
- printk(KERN_DEBUG "%s (WE.r) : Found standard SET handler for 0x%04X\n",
- dev->name, cmd);
- printk(KERN_DEBUG "%s (WE.r) : Header type : %d, Token type : %d, size : %d, token : %d\n", dev->name, descr->header_type, descr->token_type, descr->token_size, descr->max_tokens);
-#endif /* WE_RTNETLINK_DEBUG */
-
- /* Extract fixed header from request. This is properly aligned. */
- wrqu = &request->u;
-
- /* Check if wrqu is complete */
- hdr_len = event_type_size[descr->header_type];
- if(request_len < hdr_len) {
-#ifdef WE_RTNETLINK_DEBUG
- printk(KERN_DEBUG
- "%s (WE.r) : Wireless request too short (%d)\n",
- dev->name, request_len);
-#endif /* WE_RTNETLINK_DEBUG */
- return -EINVAL;
- }
-
- /* Prepare the call */
- info.cmd = cmd;
- info.flags = 0;
-
- /* Check if we have extra data in the request or not */
- if(descr->header_type != IW_HEADER_TYPE_POINT) {
-
- /* No extra arguments. Trivial to handle */
- ret = handler(dev, &info, wrqu, NULL);
-
- } else {
- int extra_len;
-
- /* Put wrqu in the right place (skip pointer) */
- memcpy(((char *) &wrqu_point) + IW_EV_POINT_OFF,
- wrqu, IW_EV_POINT_LEN - IW_EV_LCP_LEN);
- /* Don't forget about the event code... */
- wrqu = &wrqu_point;
-
- /* Check if number of token fits within bounds */
- if(wrqu_point.data.length > descr->max_tokens)
- return -E2BIG;
- if(wrqu_point.data.length < descr->min_tokens)
- return -EINVAL;
-
- /* Real length of payload */
- extra_len = wrqu_point.data.length * descr->token_size;
-
- /* Check if request is self consistent */
- if((request_len - hdr_len) < extra_len) {
-#ifdef WE_RTNETLINK_DEBUG
- printk(KERN_DEBUG "%s (WE.r) : Wireless request data too short (%d)\n",
- dev->name, extra_size);
-#endif /* WE_RTNETLINK_DEBUG */
- return -EINVAL;
- }
-
-#ifdef WE_RTNETLINK_DEBUG
- printk(KERN_DEBUG "%s (WE.r) : Malloc %d bytes\n",
- dev->name, extra_size);
-#endif /* WE_RTNETLINK_DEBUG */
-
- /* Always allocate for max space. Easier, and won't last
- * long... */
- extra_size = descr->max_tokens * descr->token_size;
- extra = kmalloc(extra_size, GFP_KERNEL);
- if (extra == NULL)
- return -ENOMEM;
-
- /* Copy extra in aligned buffer */
- memcpy(extra, ((char *) request) + hdr_len, extra_len);
-
- /* Call the handler */
- ret = handler(dev, &info, &wrqu_point, extra);
- }
-
-#ifdef WE_SET_EVENT
- /* Generate an event to notify listeners of the change */
- if((descr->flags & IW_DESCR_FLAG_EVENT) &&
- ((ret == 0) || (ret == -EIWCOMMIT))) {
- if(descr->flags & IW_DESCR_FLAG_RESTRICT)
- /* If the event is restricted, don't
- * export the payload */
- wireless_send_event(dev, cmd, wrqu, NULL);
- else
- wireless_send_event(dev, cmd, wrqu, extra);
- }
-#endif /* WE_SET_EVENT */
-
- /* Cleanup - I told you it wasn't that long ;-) */
- if(extra)
- kfree(extra);
-
- /* Call commit handler if needed and defined */
- if(ret == -EIWCOMMIT)
- ret = call_commit_handler(dev);
-
- return ret;
-}
-
-/* ---------------------------------------------------------------- */
-/*
- * Wrapper to call a private Wireless Extension GET handler.
- * Same as above...
- * It's not as nice and slimline as the standard wrapper. The cause
- * is struct iw_priv_args, which was not really designed for the
- * job we are going here.
- *
- * IMPORTANT : This function prevent to set and get data on the same
- * IOCTL and enforce the SET/GET convention. Not doing it would be
- * far too hairy...
- * If you need to set and get data at the same time, please don't use
- * a iw_handler but process it in your ioctl handler (i.e. use the
- * old driver API).
- */
-static inline int rtnetlink_private_get(struct net_device * dev,
- struct iw_event * request,
- int request_len,
- iw_handler handler,
- char ** p_buf,
- int * p_len)
-{
- const struct iw_priv_args * descr = NULL;
- unsigned int cmd;
- union iwreq_data * wrqu;
- int hdr_len;
- struct iw_request_info info;
- int extra_size = 0;
- int i;
- char * buffer = NULL;
- int buffer_size = 0;
- int ret = -EINVAL;
-
- /* Get the description of the Request */
- cmd = request->cmd;
- for(i = 0; i < dev->wireless_handlers->num_private_args; i++)
- if(cmd == dev->wireless_handlers->private_args[i].cmd) {
- descr = &(dev->wireless_handlers->private_args[i]);
- break;
- }
- if(descr == NULL)
- return -EOPNOTSUPP;
-
-#ifdef WE_RTNETLINK_DEBUG
- printk(KERN_DEBUG "%s (WE.r) : Found private handler for 0x%04X\n",
- dev->name, cmd);
- printk(KERN_DEBUG "%s (WE.r) : Name %s, set %X, get %X\n",
- dev->name, descr->name, descr->set_args, descr->get_args);
-#endif /* WE_RTNETLINK_DEBUG */
-
- /* Compute the max size of the get arguments */
- extra_size = get_priv_size(descr->get_args);
-
- /* Does it fits in wrqu ? */
- if((descr->get_args & IW_PRIV_SIZE_FIXED) &&
- (extra_size <= IFNAMSIZ)) {
- hdr_len = extra_size;
- extra_size = 0;
- } else {
- hdr_len = IW_EV_POINT_LEN;
- }
-
- /* Check if wrqu is complete */
- if(request_len < hdr_len) {
-#ifdef WE_RTNETLINK_DEBUG
- printk(KERN_DEBUG
- "%s (WE.r) : Wireless request too short (%d)\n",
- dev->name, request_len);
-#endif /* WE_RTNETLINK_DEBUG */
- return -EINVAL;
- }
-
- /* Prepare the call */
- info.cmd = cmd;
- info.flags = 0;
-
- /* Check if we have a pointer to user space data or not. */
- if(extra_size == 0) {
-
- /* Create the kernel buffer that we will return.
- * It's at an offset to match the TYPE_POINT case... */
- buffer_size = request_len + IW_EV_POINT_OFF;
- buffer = kmalloc(buffer_size, GFP_KERNEL);
- if (buffer == NULL) {
- return -ENOMEM;
- }
- /* Copy event data */
- memcpy(buffer + IW_EV_POINT_OFF, request, request_len);
- /* Use our own copy of wrqu */
- wrqu = (union iwreq_data *) (buffer + IW_EV_POINT_OFF
- + IW_EV_LCP_LEN);
-
- /* No extra arguments. Trivial to handle */
- ret = handler(dev, &info, wrqu, (char *) wrqu);
-
- } else {
- char * extra;
-
- /* Buffer for full reply */
- buffer_size = extra_size + IW_EV_POINT_LEN + IW_EV_POINT_OFF;
-
-#ifdef WE_RTNETLINK_DEBUG
- printk(KERN_DEBUG "%s (WE.r) : Malloc %d bytes (%d bytes)\n",
- dev->name, extra_size, buffer_size);
-#endif /* WE_RTNETLINK_DEBUG */
-
- /* Create the kernel buffer that we will return */
- buffer = kmalloc(buffer_size, GFP_KERNEL);
- if (buffer == NULL) {
- return -ENOMEM;
- }
-
- /* Put wrqu in the right place (just before extra).
- * Leave space for IWE header and dummy pointer...
- * Note that IW_EV_LCP_LEN==4 bytes, so it's still aligned...
- */
- memcpy(buffer + IW_EV_LCP_LEN + IW_EV_POINT_OFF,
- ((char *) request) + IW_EV_LCP_LEN,
- IW_EV_POINT_LEN - IW_EV_LCP_LEN);
- wrqu = (union iwreq_data *) (buffer + IW_EV_LCP_LEN);
-
- /* Extra comes logically after that. Offset +12 bytes. */
- extra = buffer + IW_EV_POINT_OFF + IW_EV_POINT_LEN;
-
- /* Call the handler */
- ret = handler(dev, &info, wrqu, extra);
-
- /* Adjust for the actual length if it's variable,
- * avoid leaking kernel bits outside. */
- if (!(descr->get_args & IW_PRIV_SIZE_FIXED))
- extra_size = adjust_priv_size(descr->get_args, wrqu);
- /* Re-adjust reply size */
- request->len = extra_size + IW_EV_POINT_LEN;
-
- /* Put the iwe header where it should, i.e. scrap the
- * dummy pointer. */
- memcpy(buffer + IW_EV_POINT_OFF, request, IW_EV_LCP_LEN);
-
-#ifdef WE_RTNETLINK_DEBUG
- printk(KERN_DEBUG "%s (WE.r) : Reply 0x%04X, hdr_len %d, tokens %d, extra_size %d, buffer_size %d\n", dev->name, cmd, hdr_len, wrqu->data.length, extra_size, buffer_size);
-#endif /* WE_RTNETLINK_DEBUG */
- }
-
- /* Return the buffer to the caller */
- if (!ret) {
- *p_buf = buffer;
- *p_len = request->len;
- } else {
- /* Cleanup */
- if(buffer)
- kfree(buffer);
- }
-
- return ret;
-}
-
-/* ---------------------------------------------------------------- */
-/*
- * Wrapper to call a private Wireless Extension SET handler.
- * Same as above...
- * It's not as nice and slimline as the standard wrapper. The cause
- * is struct iw_priv_args, which was not really designed for the
- * job we are going here.
- *
- * IMPORTANT : This function prevent to set and get data on the same
- * IOCTL and enforce the SET/GET convention. Not doing it would be
- * far too hairy...
- * If you need to set and get data at the same time, please don't use
- * a iw_handler but process it in your ioctl handler (i.e. use the
- * old driver API).
- */
-static inline int rtnetlink_private_set(struct net_device * dev,
- struct iw_event * request,
- int request_len,
- iw_handler handler)
-{
- const struct iw_priv_args * descr = NULL;
- unsigned int cmd;
- union iwreq_data * wrqu;
- union iwreq_data wrqu_point;
- int hdr_len;
- char * extra = NULL;
- int extra_size = 0;
- int offset = 0; /* For sub-ioctls */
- struct iw_request_info info;
- int i;
- int ret = -EINVAL;
-
- /* Get the description of the Request */
- cmd = request->cmd;
- for(i = 0; i < dev->wireless_handlers->num_private_args; i++)
- if(cmd == dev->wireless_handlers->private_args[i].cmd) {
- descr = &(dev->wireless_handlers->private_args[i]);
- break;
- }
- if(descr == NULL)
- return -EOPNOTSUPP;
-
-#ifdef WE_RTNETLINK_DEBUG
- printk(KERN_DEBUG "%s (WE.r) : Found private handler for 0x%04X\n",
- ifr->ifr_name, cmd);
- printk(KERN_DEBUG "%s (WE.r) : Name %s, set %X, get %X\n",
- dev->name, descr->name, descr->set_args, descr->get_args);
-#endif /* WE_RTNETLINK_DEBUG */
-
- /* Compute the size of the set arguments */
- /* Check for sub-ioctl handler */
- if(descr->name[0] == '\0')
- /* Reserve one int for sub-ioctl index */
- offset = sizeof(__u32);
-
- /* Size of set arguments */
- extra_size = get_priv_size(descr->set_args);
-
- /* Does it fits in wrqu ? */
- if((descr->set_args & IW_PRIV_SIZE_FIXED) &&
- (extra_size <= IFNAMSIZ)) {
- hdr_len = IW_EV_LCP_LEN + extra_size;
- extra_size = 0;
- } else {
- hdr_len = IW_EV_POINT_LEN;
- }
-
- /* Extract fixed header from request. This is properly aligned. */
- wrqu = &request->u;
-
- /* Check if wrqu is complete */
- if(request_len < hdr_len) {
-#ifdef WE_RTNETLINK_DEBUG
- printk(KERN_DEBUG
- "%s (WE.r) : Wireless request too short (%d)\n",
- dev->name, request_len);
-#endif /* WE_RTNETLINK_DEBUG */
- return -EINVAL;
- }
-
- /* Prepare the call */
- info.cmd = cmd;
- info.flags = 0;
-
- /* Check if we have a pointer to user space data or not. */
- if(extra_size == 0) {
-
- /* No extra arguments. Trivial to handle */
- ret = handler(dev, &info, wrqu, (char *) wrqu);
-
- } else {
- int extra_len;
-
- /* Put wrqu in the right place (skip pointer) */
- memcpy(((char *) &wrqu_point) + IW_EV_POINT_OFF,
- wrqu, IW_EV_POINT_LEN - IW_EV_LCP_LEN);
-
- /* Does it fits within bounds ? */
- if(wrqu_point.data.length > (descr->set_args &
- IW_PRIV_SIZE_MASK))
- return -E2BIG;
-
- /* Real length of payload */
- extra_len = adjust_priv_size(descr->set_args, &wrqu_point);
-
- /* Check if request is self consistent */
- if((request_len - hdr_len) < extra_len) {
-#ifdef WE_RTNETLINK_DEBUG
- printk(KERN_DEBUG "%s (WE.r) : Wireless request data too short (%d)\n",
- dev->name, extra_size);
-#endif /* WE_RTNETLINK_DEBUG */
- return -EINVAL;
- }
-
-#ifdef WE_RTNETLINK_DEBUG
- printk(KERN_DEBUG "%s (WE.r) : Malloc %d bytes\n",
- dev->name, extra_size);
-#endif /* WE_RTNETLINK_DEBUG */
-
- /* Always allocate for max space. Easier, and won't last
- * long... */
- extra = kmalloc(extra_size, GFP_KERNEL);
- if (extra == NULL)
- return -ENOMEM;
-
- /* Copy extra in aligned buffer */
- memcpy(extra, ((char *) request) + hdr_len, extra_len);
-
- /* Call the handler */
- ret = handler(dev, &info, &wrqu_point, extra);
-
- /* Cleanup - I told you it wasn't that long ;-) */
- kfree(extra);
- }
-
- /* Call commit handler if needed and defined */
- if(ret == -EIWCOMMIT)
- ret = call_commit_handler(dev);
-
- return ret;
-}
-
-/* ---------------------------------------------------------------- */
-/*
- * Main RtNetlink dispatcher. Called from the main networking code
- * (do_getlink() in net/core/rtnetlink.c).
- * Check the type of Request and call the appropriate wrapper...
- */
-int wireless_rtnetlink_get(struct net_device * dev,
- char * data,
- int len,
- char ** p_buf,
- int * p_len)
-{
- struct iw_event * request = (struct iw_event *) data;
- iw_handler handler;
-
- /* Check length */
- if(len < IW_EV_LCP_LEN) {
- printk(KERN_DEBUG "%s (WE.r) : RtNetlink request too short (%d)\n",
- dev->name, len);
- return -EINVAL;
- }
-
- /* ReCheck length (len may have padding) */
- if(request->len > len) {
- printk(KERN_DEBUG "%s (WE.r) : RtNetlink request len invalid (%d-%d)\n",
- dev->name, request->len, len);
- return -EINVAL;
- }
-
- /* Only accept GET requests in here */
- if(!IW_IS_GET(request->cmd))
- return -EOPNOTSUPP;
-
- /* If command is `get the encoding parameters', check if
- * the user has the right to do it */
- if (request->cmd == SIOCGIWENCODE ||
- request->cmd == SIOCGIWENCODEEXT) {
- if (!capable(CAP_NET_ADMIN))
- return -EPERM;
- }
-
- /* Special cases */
- if(request->cmd == SIOCGIWSTATS)
- /* Get Wireless Stats */
- return rtnetlink_standard_get(dev,
- request,
- request->len,
- &iw_handler_get_iwstats,
- p_buf, p_len);
- if(request->cmd == SIOCGIWPRIV) {
- /* Check if we have some wireless handlers defined */
- if(dev->wireless_handlers == NULL)
- return -EOPNOTSUPP;
- /* Get Wireless Stats */
- return rtnetlink_standard_get(dev,
- request,
- request->len,
- &iw_handler_get_private,
- p_buf, p_len);
- }
-
- /* Basic check */
- if (!netif_device_present(dev))
- return -ENODEV;
-
- /* Try to find the handler */
- handler = get_handler(dev, request->cmd);
- if(handler != NULL) {
- /* Standard and private are not the same */
- if(request->cmd < SIOCIWFIRSTPRIV)
- return rtnetlink_standard_get(dev,
- request,
- request->len,
- handler,
- p_buf, p_len);
- else
- return rtnetlink_private_get(dev,
- request,
- request->len,
- handler,
- p_buf, p_len);
- }
-
- return -EOPNOTSUPP;
-}
-
-/* ---------------------------------------------------------------- */
-/*
- * Main RtNetlink dispatcher. Called from the main networking code
- * (do_setlink() in net/core/rtnetlink.c).
- * Check the type of Request and call the appropriate wrapper...
- */
-int wireless_rtnetlink_set(struct net_device * dev,
- char * data,
- int len)
-{
- struct iw_event * request = (struct iw_event *) data;
- iw_handler handler;
-
- /* Check length */
- if(len < IW_EV_LCP_LEN) {
- printk(KERN_DEBUG "%s (WE.r) : RtNetlink request too short (%d)\n",
- dev->name, len);
- return -EINVAL;
- }
-
- /* ReCheck length (len may have padding) */
- if(request->len > len) {
- printk(KERN_DEBUG "%s (WE.r) : RtNetlink request len invalid (%d-%d)\n",
- dev->name, request->len, len);
- return -EINVAL;
- }
-
- /* Only accept SET requests in here */
- if(!IW_IS_SET(request->cmd))
- return -EOPNOTSUPP;
-
- /* Basic check */
- if (!netif_device_present(dev))
- return -ENODEV;
-
- /* New driver API : try to find the handler */
- handler = get_handler(dev, request->cmd);
- if(handler != NULL) {
- /* Standard and private are not the same */
- if(request->cmd < SIOCIWFIRSTPRIV)
- return rtnetlink_standard_set(dev,
- request,
- request->len,
- handler);
- else
- return rtnetlink_private_set(dev,
- request,
- request->len,
- handler);
- }
-
- return -EOPNOTSUPP;
-}
-#endif /* CONFIG_NET_WIRELESS_RTNETLINK */
-
-
-/************************* EVENT PROCESSING *************************/
-/*
- * Process events generated by the wireless layer or the driver.
- * Most often, the event will be propagated through rtnetlink
- */
-
-#ifdef WE_EVENT_RTNETLINK
-/* ---------------------------------------------------------------- */
-/*
- * Locking...
- * ----------
- *
- * Thanks to Herbert Xu <herbert@gondor.apana.org.au> for fixing
- * the locking issue in here and implementing this code !
- *
- * The issue : wireless_send_event() is often called in interrupt context,
- * while the Netlink layer can never be called in interrupt context.
- * The fully formed RtNetlink events are queued, and then a tasklet is run
- * to feed those to Netlink.
- * The skb_queue is interrupt safe, and its lock is not held while calling
- * Netlink, so there is no possibility of dealock.
- * Jean II
- */
-
-static struct sk_buff_head wireless_nlevent_queue;
-
-static int __init wireless_nlevent_init(void)
-{
- skb_queue_head_init(&wireless_nlevent_queue);
- return 0;
-}
-
-subsys_initcall(wireless_nlevent_init);
-
-static void wireless_nlevent_process(unsigned long data)
-{
- struct sk_buff *skb;
-
- while ((skb = skb_dequeue(&wireless_nlevent_queue)))
- rtnl_notify(skb, 0, RTNLGRP_LINK, NULL, GFP_ATOMIC);
-}
-
-static DECLARE_TASKLET(wireless_nlevent_tasklet, wireless_nlevent_process, 0);
-
-/* ---------------------------------------------------------------- */
-/*
- * Fill a rtnetlink message with our event data.
- * Note that we propage only the specified event and don't dump the
- * current wireless config. Dumping the wireless config is far too
- * expensive (for each parameter, the driver need to query the hardware).
- */
-static inline int rtnetlink_fill_iwinfo(struct sk_buff * skb,
- struct net_device * dev,
- int type,
- char * event,
- int event_len)
-{
- struct ifinfomsg *r;
- struct nlmsghdr *nlh;
- unsigned char *b = skb->tail;
-
- nlh = NLMSG_PUT(skb, 0, 0, type, sizeof(*r));
- r = NLMSG_DATA(nlh);
- r->ifi_family = AF_UNSPEC;
- r->__ifi_pad = 0;
- r->ifi_type = dev->type;
- r->ifi_index = dev->ifindex;
- r->ifi_flags = dev_get_flags(dev);
- r->ifi_change = 0; /* Wireless changes don't affect those flags */
-
- /* Add the wireless events in the netlink packet */
- RTA_PUT(skb, IFLA_WIRELESS, event_len, event);
-
- nlh->nlmsg_len = skb->tail - b;
- return skb->len;
-
-nlmsg_failure:
-rtattr_failure:
- skb_trim(skb, b - skb->data);
- return -1;
-}
-
-/* ---------------------------------------------------------------- */
-/*
- * Create and broadcast and send it on the standard rtnetlink socket
- * This is a pure clone rtmsg_ifinfo() in net/core/rtnetlink.c
- * Andrzej Krzysztofowicz mandated that I used a IFLA_XXX field
- * within a RTM_NEWLINK event.
- */
-static inline void rtmsg_iwinfo(struct net_device * dev,
- char * event,
- int event_len)
-{
- struct sk_buff *skb;
- int size = NLMSG_GOODSIZE;
-
- skb = alloc_skb(size, GFP_ATOMIC);
- if (!skb)
- return;
-
- if (rtnetlink_fill_iwinfo(skb, dev, RTM_NEWLINK,
- event, event_len) < 0) {
- kfree_skb(skb);
- return;
- }
- NETLINK_CB(skb).dst_group = RTNLGRP_LINK;
- skb_queue_tail(&wireless_nlevent_queue, skb);
- tasklet_schedule(&wireless_nlevent_tasklet);
-}
-
-#endif /* WE_EVENT_RTNETLINK */
-
-/* ---------------------------------------------------------------- */
-/*
- * Main event dispatcher. Called from other parts and drivers.
- * Send the event on the appropriate channels.
- * May be called from interrupt context.
- */
-void wireless_send_event(struct net_device * dev,
- unsigned int cmd,
- union iwreq_data * wrqu,
- char * extra)
-{
- const struct iw_ioctl_description * descr = NULL;
- int extra_len = 0;
- struct iw_event *event; /* Mallocated whole event */
- int event_len; /* Its size */
- int hdr_len; /* Size of the event header */
- int wrqu_off = 0; /* Offset in wrqu */
- /* Don't "optimise" the following variable, it will crash */
- unsigned cmd_index; /* *MUST* be unsigned */
-
- /* Get the description of the Event */
- if(cmd <= SIOCIWLAST) {
- cmd_index = cmd - SIOCIWFIRST;
- if(cmd_index < standard_ioctl_num)
- descr = &(standard_ioctl[cmd_index]);
- } else {
- cmd_index = cmd - IWEVFIRST;
- if(cmd_index < standard_event_num)
- descr = &(standard_event[cmd_index]);
- }
- /* Don't accept unknown events */
- if(descr == NULL) {
- /* Note : we don't return an error to the driver, because
- * the driver would not know what to do about it. It can't
- * return an error to the user, because the event is not
- * initiated by a user request.
- * The best the driver could do is to log an error message.
- * We will do it ourselves instead...
- */
- printk(KERN_ERR "%s (WE) : Invalid/Unknown Wireless Event (0x%04X)\n",
- dev->name, cmd);
- return;
- }
-#ifdef WE_EVENT_DEBUG
- printk(KERN_DEBUG "%s (WE) : Got event 0x%04X\n",
- dev->name, cmd);
- printk(KERN_DEBUG "%s (WE) : Header type : %d, Token type : %d, size : %d, token : %d\n", dev->name, descr->header_type, descr->token_type, descr->token_size, descr->max_tokens);
-#endif /* WE_EVENT_DEBUG */
-
- /* Check extra parameters and set extra_len */
- if(descr->header_type == IW_HEADER_TYPE_POINT) {
- /* Check if number of token fits within bounds */
- if(wrqu->data.length > descr->max_tokens) {
- printk(KERN_ERR "%s (WE) : Wireless Event too big (%d)\n", dev->name, wrqu->data.length);
- return;
- }
- if(wrqu->data.length < descr->min_tokens) {
- printk(KERN_ERR "%s (WE) : Wireless Event too small (%d)\n", dev->name, wrqu->data.length);
- return;
- }
- /* Calculate extra_len - extra is NULL for restricted events */
- if(extra != NULL)
- extra_len = wrqu->data.length * descr->token_size;
- /* Always at an offset in wrqu */
- wrqu_off = IW_EV_POINT_OFF;
-#ifdef WE_EVENT_DEBUG
- printk(KERN_DEBUG "%s (WE) : Event 0x%04X, tokens %d, extra_len %d\n", dev->name, cmd, wrqu->data.length, extra_len);
-#endif /* WE_EVENT_DEBUG */
- }
-
- /* Total length of the event */
- hdr_len = event_type_size[descr->header_type];
- event_len = hdr_len + extra_len;
-
-#ifdef WE_EVENT_DEBUG
- printk(KERN_DEBUG "%s (WE) : Event 0x%04X, hdr_len %d, wrqu_off %d, event_len %d\n", dev->name, cmd, hdr_len, wrqu_off, event_len);
-#endif /* WE_EVENT_DEBUG */
-
- /* Create temporary buffer to hold the event */
- event = kmalloc(event_len, GFP_ATOMIC);
- if(event == NULL)
- return;
-
- /* Fill event */
- event->len = event_len;
- event->cmd = cmd;
- memcpy(&event->u, ((char *) wrqu) + wrqu_off, hdr_len - IW_EV_LCP_LEN);
- if(extra != NULL)
- memcpy(((char *) event) + hdr_len, extra, extra_len);
-
-#ifdef WE_EVENT_RTNETLINK
- /* Send via the RtNetlink event channel */
- rtmsg_iwinfo(dev, (char *) event, event_len);
-#endif /* WE_EVENT_RTNETLINK */
-
- /* Cleanup */
- kfree(event);
-
- return; /* Always success, I guess ;-) */
-}
-
-/********************** ENHANCED IWSPY SUPPORT **********************/
-/*
- * In the old days, the driver was handling spy support all by itself.
- * Now, the driver can delegate this task to Wireless Extensions.
- * It needs to use those standard spy iw_handler in struct iw_handler_def,
- * push data to us via wireless_spy_update() and include struct iw_spy_data
- * in its private part (and export it in net_device->wireless_data->spy_data).
- * One of the main advantage of centralising spy support here is that
- * it becomes much easier to improve and extend it without having to touch
- * the drivers. One example is the addition of the Spy-Threshold events.
- */
-
-/* ---------------------------------------------------------------- */
-/*
- * Return the pointer to the spy data in the driver.
- * Because this is called on the Rx path via wireless_spy_update(),
- * we want it to be efficient...
- */
-static inline struct iw_spy_data * get_spydata(struct net_device *dev)
-{
- /* This is the new way */
- if(dev->wireless_data)
- return(dev->wireless_data->spy_data);
- return NULL;
-}
-
-/*------------------------------------------------------------------*/
-/*
- * Standard Wireless Handler : set Spy List
- */
-int iw_handler_set_spy(struct net_device * dev,
- struct iw_request_info * info,
- union iwreq_data * wrqu,
- char * extra)
-{
- struct iw_spy_data * spydata = get_spydata(dev);
- struct sockaddr * address = (struct sockaddr *) extra;
-
- /* Make sure driver is not buggy or using the old API */
- if(!spydata)
- return -EOPNOTSUPP;
-
- /* Disable spy collection while we copy the addresses.
- * While we copy addresses, any call to wireless_spy_update()
- * will NOP. This is OK, as anyway the addresses are changing. */
- spydata->spy_number = 0;
-
- /* We want to operate without locking, because wireless_spy_update()
- * most likely will happen in the interrupt handler, and therefore
- * have its own locking constraints and needs performance.
- * The rtnl_lock() make sure we don't race with the other iw_handlers.
- * This make sure wireless_spy_update() "see" that the spy list
- * is temporarily disabled. */
- wmb();
-
- /* Are there are addresses to copy? */
- if(wrqu->data.length > 0) {
- int i;
-
- /* Copy addresses */
- for(i = 0; i < wrqu->data.length; i++)
- memcpy(spydata->spy_address[i], address[i].sa_data,
- ETH_ALEN);
- /* Reset stats */
- memset(spydata->spy_stat, 0,
- sizeof(struct iw_quality) * IW_MAX_SPY);
-
-#ifdef WE_SPY_DEBUG
- printk(KERN_DEBUG "iw_handler_set_spy() : wireless_data %p, spydata %p, num %d\n", dev->wireless_data, spydata, wrqu->data.length);
- for (i = 0; i < wrqu->data.length; i++)
- printk(KERN_DEBUG
- "%02X:%02X:%02X:%02X:%02X:%02X \n",
- spydata->spy_address[i][0],
- spydata->spy_address[i][1],
- spydata->spy_address[i][2],
- spydata->spy_address[i][3],
- spydata->spy_address[i][4],
- spydata->spy_address[i][5]);
-#endif /* WE_SPY_DEBUG */
- }
-
- /* Make sure above is updated before re-enabling */
- wmb();
-
- /* Enable addresses */
- spydata->spy_number = wrqu->data.length;
-
- return 0;
-}
-
-/*------------------------------------------------------------------*/
-/*
- * Standard Wireless Handler : get Spy List
- */
-int iw_handler_get_spy(struct net_device * dev,
- struct iw_request_info * info,
- union iwreq_data * wrqu,
- char * extra)
-{
- struct iw_spy_data * spydata = get_spydata(dev);
- struct sockaddr * address = (struct sockaddr *) extra;
- int i;
-
- /* Make sure driver is not buggy or using the old API */
- if(!spydata)
- return -EOPNOTSUPP;
-
- wrqu->data.length = spydata->spy_number;
-
- /* Copy addresses. */
- for(i = 0; i < spydata->spy_number; i++) {
- memcpy(address[i].sa_data, spydata->spy_address[i], ETH_ALEN);
- address[i].sa_family = AF_UNIX;
- }
- /* Copy stats to the user buffer (just after). */
- if(spydata->spy_number > 0)
- memcpy(extra + (sizeof(struct sockaddr) *spydata->spy_number),
- spydata->spy_stat,
- sizeof(struct iw_quality) * spydata->spy_number);
- /* Reset updated flags. */
- for(i = 0; i < spydata->spy_number; i++)
- spydata->spy_stat[i].updated &= ~IW_QUAL_ALL_UPDATED;
- return 0;
-}
-
-/*------------------------------------------------------------------*/
-/*
- * Standard Wireless Handler : set spy threshold
- */
-int iw_handler_set_thrspy(struct net_device * dev,
- struct iw_request_info *info,
- union iwreq_data * wrqu,
- char * extra)
-{
- struct iw_spy_data * spydata = get_spydata(dev);
- struct iw_thrspy * threshold = (struct iw_thrspy *) extra;
-
- /* Make sure driver is not buggy or using the old API */
- if(!spydata)
- return -EOPNOTSUPP;
-
- /* Just do it */
- memcpy(&(spydata->spy_thr_low), &(threshold->low),
- 2 * sizeof(struct iw_quality));
-
- /* Clear flag */
- memset(spydata->spy_thr_under, '\0', sizeof(spydata->spy_thr_under));
-
-#ifdef WE_SPY_DEBUG
- printk(KERN_DEBUG "iw_handler_set_thrspy() : low %d ; high %d\n", spydata->spy_thr_low.level, spydata->spy_thr_high.level);
-#endif /* WE_SPY_DEBUG */
-
- return 0;
-}
-
-/*------------------------------------------------------------------*/
-/*
- * Standard Wireless Handler : get spy threshold
- */
-int iw_handler_get_thrspy(struct net_device * dev,
- struct iw_request_info *info,
- union iwreq_data * wrqu,
- char * extra)
-{
- struct iw_spy_data * spydata = get_spydata(dev);
- struct iw_thrspy * threshold = (struct iw_thrspy *) extra;
-
- /* Make sure driver is not buggy or using the old API */
- if(!spydata)
- return -EOPNOTSUPP;
-
- /* Just do it */
- memcpy(&(threshold->low), &(spydata->spy_thr_low),
- 2 * sizeof(struct iw_quality));
-
- return 0;
-}
-
-/*------------------------------------------------------------------*/
-/*
- * Prepare and send a Spy Threshold event
- */
-static void iw_send_thrspy_event(struct net_device * dev,
- struct iw_spy_data * spydata,
- unsigned char * address,
- struct iw_quality * wstats)
-{
- union iwreq_data wrqu;
- struct iw_thrspy threshold;
-
- /* Init */
- wrqu.data.length = 1;
- wrqu.data.flags = 0;
- /* Copy address */
- memcpy(threshold.addr.sa_data, address, ETH_ALEN);
- threshold.addr.sa_family = ARPHRD_ETHER;
- /* Copy stats */
- memcpy(&(threshold.qual), wstats, sizeof(struct iw_quality));
- /* Copy also thresholds */
- memcpy(&(threshold.low), &(spydata->spy_thr_low),
- 2 * sizeof(struct iw_quality));
-
-#ifdef WE_SPY_DEBUG
- printk(KERN_DEBUG "iw_send_thrspy_event() : address %02X:%02X:%02X:%02X:%02X:%02X, level %d, up = %d\n",
- threshold.addr.sa_data[0],
- threshold.addr.sa_data[1],
- threshold.addr.sa_data[2],
- threshold.addr.sa_data[3],
- threshold.addr.sa_data[4],
- threshold.addr.sa_data[5], threshold.qual.level);
-#endif /* WE_SPY_DEBUG */
-
- /* Send event to user space */
- wireless_send_event(dev, SIOCGIWTHRSPY, &wrqu, (char *) &threshold);
-}
-
-/* ---------------------------------------------------------------- */
-/*
- * Call for the driver to update the spy data.
- * For now, the spy data is a simple array. As the size of the array is
- * small, this is good enough. If we wanted to support larger number of
- * spy addresses, we should use something more efficient...
- */
-void wireless_spy_update(struct net_device * dev,
- unsigned char * address,
- struct iw_quality * wstats)
-{
- struct iw_spy_data * spydata = get_spydata(dev);
- int i;
- int match = -1;
-
- /* Make sure driver is not buggy or using the old API */
- if(!spydata)
- return;
-
-#ifdef WE_SPY_DEBUG
- printk(KERN_DEBUG "wireless_spy_update() : wireless_data %p, spydata %p, address %02X:%02X:%02X:%02X:%02X:%02X\n", dev->wireless_data, spydata, address[0], address[1], address[2], address[3], address[4], address[5]);
-#endif /* WE_SPY_DEBUG */
-
- /* Update all records that match */
- for(i = 0; i < spydata->spy_number; i++)
- if(!compare_ether_addr(address, spydata->spy_address[i])) {
- memcpy(&(spydata->spy_stat[i]), wstats,
- sizeof(struct iw_quality));
- match = i;
- }
-
- /* Generate an event if we cross the spy threshold.
- * To avoid event storms, we have a simple hysteresis : we generate
- * event only when we go under the low threshold or above the
- * high threshold. */
- if(match >= 0) {
- if(spydata->spy_thr_under[match]) {
- if(wstats->level > spydata->spy_thr_high.level) {
- spydata->spy_thr_under[match] = 0;
- iw_send_thrspy_event(dev, spydata,
- address, wstats);
- }
- } else {
- if(wstats->level < spydata->spy_thr_low.level) {
- spydata->spy_thr_under[match] = 1;
- iw_send_thrspy_event(dev, spydata,
- address, wstats);
- }
- }
- }
-}
-
-EXPORT_SYMBOL(iw_handler_get_spy);
-EXPORT_SYMBOL(iw_handler_get_thrspy);
-EXPORT_SYMBOL(iw_handler_set_spy);
-EXPORT_SYMBOL(iw_handler_set_thrspy);
-EXPORT_SYMBOL(wireless_send_event);
-EXPORT_SYMBOL(wireless_spy_update);
diff --git a/net/core/xdp.c b/net/core/xdp.c
new file mode 100644
index 000000000000..9100e160113a
--- /dev/null
+++ b/net/core/xdp.c
@@ -0,0 +1,1055 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* net/core/xdp.c
+ *
+ * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc.
+ */
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/btf_ids.h>
+#include <linux/filter.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/netdevice.h>
+#include <linux/slab.h>
+#include <linux/idr.h>
+#include <linux/rhashtable.h>
+#include <linux/bug.h>
+#include <net/page_pool/helpers.h>
+
+#include <net/hotdata.h>
+#include <net/netdev_lock.h>
+#include <net/xdp.h>
+#include <net/xdp_priv.h> /* struct xdp_mem_allocator */
+#include <trace/events/xdp.h>
+#include <net/xdp_sock_drv.h>
+
+#define REG_STATE_NEW 0x0
+#define REG_STATE_REGISTERED 0x1
+#define REG_STATE_UNREGISTERED 0x2
+#define REG_STATE_UNUSED 0x3
+
+static DEFINE_IDA(mem_id_pool);
+static DEFINE_MUTEX(mem_id_lock);
+#define MEM_ID_MAX 0xFFFE
+#define MEM_ID_MIN 1
+static int mem_id_next = MEM_ID_MIN;
+
+static bool mem_id_init; /* false */
+static struct rhashtable *mem_id_ht;
+
+static u32 xdp_mem_id_hashfn(const void *data, u32 len, u32 seed)
+{
+ const u32 *k = data;
+ const u32 key = *k;
+
+ BUILD_BUG_ON(sizeof_field(struct xdp_mem_allocator, mem.id)
+ != sizeof(u32));
+
+ /* Use cyclic increasing ID as direct hash key */
+ return key;
+}
+
+static int xdp_mem_id_cmp(struct rhashtable_compare_arg *arg,
+ const void *ptr)
+{
+ const struct xdp_mem_allocator *xa = ptr;
+ u32 mem_id = *(u32 *)arg->key;
+
+ return xa->mem.id != mem_id;
+}
+
+static const struct rhashtable_params mem_id_rht_params = {
+ .nelem_hint = 64,
+ .head_offset = offsetof(struct xdp_mem_allocator, node),
+ .key_offset = offsetof(struct xdp_mem_allocator, mem.id),
+ .key_len = sizeof_field(struct xdp_mem_allocator, mem.id),
+ .max_size = MEM_ID_MAX,
+ .min_size = 8,
+ .automatic_shrinking = true,
+ .hashfn = xdp_mem_id_hashfn,
+ .obj_cmpfn = xdp_mem_id_cmp,
+};
+
+static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu)
+{
+ struct xdp_mem_allocator *xa;
+
+ xa = container_of(rcu, struct xdp_mem_allocator, rcu);
+
+ /* Allow this ID to be reused */
+ ida_free(&mem_id_pool, xa->mem.id);
+
+ kfree(xa);
+}
+
+static void mem_xa_remove(struct xdp_mem_allocator *xa)
+{
+ trace_mem_disconnect(xa);
+
+ if (!rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params))
+ call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free);
+}
+
+static void mem_allocator_disconnect(void *allocator)
+{
+ struct xdp_mem_allocator *xa;
+ struct rhashtable_iter iter;
+
+ mutex_lock(&mem_id_lock);
+
+ rhashtable_walk_enter(mem_id_ht, &iter);
+ do {
+ rhashtable_walk_start(&iter);
+
+ while ((xa = rhashtable_walk_next(&iter)) && !IS_ERR(xa)) {
+ if (xa->allocator == allocator)
+ mem_xa_remove(xa);
+ }
+
+ rhashtable_walk_stop(&iter);
+
+ } while (xa == ERR_PTR(-EAGAIN));
+ rhashtable_walk_exit(&iter);
+
+ mutex_unlock(&mem_id_lock);
+}
+
+void xdp_unreg_mem_model(struct xdp_mem_info *mem)
+{
+ struct xdp_mem_allocator *xa;
+ int type = mem->type;
+ int id = mem->id;
+
+ /* Reset mem info to defaults */
+ mem->id = 0;
+ mem->type = 0;
+
+ if (id == 0)
+ return;
+
+ if (type == MEM_TYPE_PAGE_POOL) {
+ xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params);
+ page_pool_destroy(xa->page_pool);
+ }
+}
+EXPORT_SYMBOL_GPL(xdp_unreg_mem_model);
+
+void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
+{
+ if (xdp_rxq->reg_state != REG_STATE_REGISTERED) {
+ WARN(1, "Missing register, driver bug");
+ return;
+ }
+
+ xdp_unreg_mem_model(&xdp_rxq->mem);
+}
+EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg_mem_model);
+
+void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq)
+{
+ /* Simplify driver cleanup code paths, allow unreg "unused" */
+ if (xdp_rxq->reg_state == REG_STATE_UNUSED)
+ return;
+
+ xdp_rxq_info_unreg_mem_model(xdp_rxq);
+
+ xdp_rxq->reg_state = REG_STATE_UNREGISTERED;
+ xdp_rxq->dev = NULL;
+}
+EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg);
+
+static void xdp_rxq_info_init(struct xdp_rxq_info *xdp_rxq)
+{
+ memset(xdp_rxq, 0, sizeof(*xdp_rxq));
+}
+
+/* Returns 0 on success, negative on failure */
+int __xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
+ struct net_device *dev, u32 queue_index,
+ unsigned int napi_id, u32 frag_size)
+{
+ if (!dev) {
+ WARN(1, "Missing net_device from driver");
+ return -ENODEV;
+ }
+
+ if (xdp_rxq->reg_state == REG_STATE_UNUSED) {
+ WARN(1, "Driver promised not to register this");
+ return -EINVAL;
+ }
+
+ if (xdp_rxq->reg_state == REG_STATE_REGISTERED) {
+ WARN(1, "Missing unregister, handled but fix driver");
+ xdp_rxq_info_unreg(xdp_rxq);
+ }
+
+ /* State either UNREGISTERED or NEW */
+ xdp_rxq_info_init(xdp_rxq);
+ xdp_rxq->dev = dev;
+ xdp_rxq->queue_index = queue_index;
+ xdp_rxq->frag_size = frag_size;
+
+ xdp_rxq->reg_state = REG_STATE_REGISTERED;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(__xdp_rxq_info_reg);
+
+void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq)
+{
+ xdp_rxq->reg_state = REG_STATE_UNUSED;
+}
+EXPORT_SYMBOL_GPL(xdp_rxq_info_unused);
+
+bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq)
+{
+ return (xdp_rxq->reg_state == REG_STATE_REGISTERED);
+}
+EXPORT_SYMBOL_GPL(xdp_rxq_info_is_reg);
+
+static int __mem_id_init_hash_table(void)
+{
+ struct rhashtable *rht;
+ int ret;
+
+ if (unlikely(mem_id_init))
+ return 0;
+
+ rht = kzalloc(sizeof(*rht), GFP_KERNEL);
+ if (!rht)
+ return -ENOMEM;
+
+ ret = rhashtable_init(rht, &mem_id_rht_params);
+ if (ret < 0) {
+ kfree(rht);
+ return ret;
+ }
+ mem_id_ht = rht;
+ smp_mb(); /* mutex lock should provide enough pairing */
+ mem_id_init = true;
+
+ return 0;
+}
+
+/* Allocate a cyclic ID that maps to allocator pointer.
+ * See: https://www.kernel.org/doc/html/latest/core-api/idr.html
+ *
+ * Caller must lock mem_id_lock.
+ */
+static int __mem_id_cyclic_get(gfp_t gfp)
+{
+ int retries = 1;
+ int id;
+
+again:
+ id = ida_alloc_range(&mem_id_pool, mem_id_next, MEM_ID_MAX - 1, gfp);
+ if (id < 0) {
+ if (id == -ENOSPC) {
+ /* Cyclic allocator, reset next id */
+ if (retries--) {
+ mem_id_next = MEM_ID_MIN;
+ goto again;
+ }
+ }
+ return id; /* errno */
+ }
+ mem_id_next = id + 1;
+
+ return id;
+}
+
+static bool __is_supported_mem_type(enum xdp_mem_type type)
+{
+ if (type == MEM_TYPE_PAGE_POOL)
+ return is_page_pool_compiled_in();
+
+ if (type >= MEM_TYPE_MAX)
+ return false;
+
+ return true;
+}
+
+static struct xdp_mem_allocator *__xdp_reg_mem_model(struct xdp_mem_info *mem,
+ enum xdp_mem_type type,
+ void *allocator)
+{
+ struct xdp_mem_allocator *xdp_alloc;
+ gfp_t gfp = GFP_KERNEL;
+ int id, errno, ret;
+ void *ptr;
+
+ if (!__is_supported_mem_type(type))
+ return ERR_PTR(-EOPNOTSUPP);
+
+ mem->type = type;
+
+ if (!allocator) {
+ if (type == MEM_TYPE_PAGE_POOL)
+ return ERR_PTR(-EINVAL); /* Setup time check page_pool req */
+ return NULL;
+ }
+
+ /* Delay init of rhashtable to save memory if feature isn't used */
+ if (!mem_id_init) {
+ mutex_lock(&mem_id_lock);
+ ret = __mem_id_init_hash_table();
+ mutex_unlock(&mem_id_lock);
+ if (ret < 0)
+ return ERR_PTR(ret);
+ }
+
+ xdp_alloc = kzalloc(sizeof(*xdp_alloc), gfp);
+ if (!xdp_alloc)
+ return ERR_PTR(-ENOMEM);
+
+ mutex_lock(&mem_id_lock);
+ id = __mem_id_cyclic_get(gfp);
+ if (id < 0) {
+ errno = id;
+ goto err;
+ }
+ mem->id = id;
+ xdp_alloc->mem = *mem;
+ xdp_alloc->allocator = allocator;
+
+ /* Insert allocator into ID lookup table */
+ ptr = rhashtable_insert_slow(mem_id_ht, &id, &xdp_alloc->node);
+ if (IS_ERR(ptr)) {
+ ida_free(&mem_id_pool, mem->id);
+ mem->id = 0;
+ errno = PTR_ERR(ptr);
+ goto err;
+ }
+
+ if (type == MEM_TYPE_PAGE_POOL)
+ page_pool_use_xdp_mem(allocator, mem_allocator_disconnect, mem);
+
+ mutex_unlock(&mem_id_lock);
+
+ return xdp_alloc;
+err:
+ mutex_unlock(&mem_id_lock);
+ kfree(xdp_alloc);
+ return ERR_PTR(errno);
+}
+
+int xdp_reg_mem_model(struct xdp_mem_info *mem,
+ enum xdp_mem_type type, void *allocator)
+{
+ struct xdp_mem_allocator *xdp_alloc;
+
+ xdp_alloc = __xdp_reg_mem_model(mem, type, allocator);
+ if (IS_ERR(xdp_alloc))
+ return PTR_ERR(xdp_alloc);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(xdp_reg_mem_model);
+
+int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
+ enum xdp_mem_type type, void *allocator)
+{
+ struct xdp_mem_allocator *xdp_alloc;
+
+ if (xdp_rxq->reg_state != REG_STATE_REGISTERED) {
+ WARN(1, "Missing register, driver bug");
+ return -EFAULT;
+ }
+
+ xdp_alloc = __xdp_reg_mem_model(&xdp_rxq->mem, type, allocator);
+ if (IS_ERR(xdp_alloc))
+ return PTR_ERR(xdp_alloc);
+
+ if (type == MEM_TYPE_XSK_BUFF_POOL && allocator)
+ xsk_pool_set_rxq_info(allocator, xdp_rxq);
+
+ if (trace_mem_connect_enabled() && xdp_alloc)
+ trace_mem_connect(xdp_alloc, xdp_rxq);
+ return 0;
+}
+
+EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model);
+
+/**
+ * xdp_reg_page_pool - register &page_pool as a memory provider for XDP
+ * @pool: &page_pool to register
+ *
+ * Can be used to register pools manually without connecting to any XDP RxQ
+ * info, so that the XDP layer will be aware of them. Then, they can be
+ * attached to an RxQ info manually via xdp_rxq_info_attach_page_pool().
+ *
+ * Return: %0 on success, -errno on error.
+ */
+int xdp_reg_page_pool(struct page_pool *pool)
+{
+ struct xdp_mem_info mem;
+
+ return xdp_reg_mem_model(&mem, MEM_TYPE_PAGE_POOL, pool);
+}
+EXPORT_SYMBOL_GPL(xdp_reg_page_pool);
+
+/**
+ * xdp_unreg_page_pool - unregister &page_pool from the memory providers list
+ * @pool: &page_pool to unregister
+ *
+ * A shorthand for manual unregistering page pools. If the pool was previously
+ * attached to an RxQ info, it must be detached first.
+ */
+void xdp_unreg_page_pool(const struct page_pool *pool)
+{
+ struct xdp_mem_info mem = {
+ .type = MEM_TYPE_PAGE_POOL,
+ .id = pool->xdp_mem_id,
+ };
+
+ xdp_unreg_mem_model(&mem);
+}
+EXPORT_SYMBOL_GPL(xdp_unreg_page_pool);
+
+/**
+ * xdp_rxq_info_attach_page_pool - attach registered pool to RxQ info
+ * @xdp_rxq: XDP RxQ info to attach the pool to
+ * @pool: pool to attach
+ *
+ * If the pool was registered manually, this function must be called instead
+ * of xdp_rxq_info_reg_mem_model() to connect it to the RxQ info.
+ */
+void xdp_rxq_info_attach_page_pool(struct xdp_rxq_info *xdp_rxq,
+ const struct page_pool *pool)
+{
+ struct xdp_mem_info mem = {
+ .type = MEM_TYPE_PAGE_POOL,
+ .id = pool->xdp_mem_id,
+ };
+
+ xdp_rxq_info_attach_mem_model(xdp_rxq, &mem);
+}
+EXPORT_SYMBOL_GPL(xdp_rxq_info_attach_page_pool);
+
+/* XDP RX runs under NAPI protection, and in different delivery error
+ * scenarios (e.g. queue full), it is possible to return the xdp_frame
+ * while still leveraging this protection. The @napi_direct boolean
+ * is used for those calls sites. Thus, allowing for faster recycling
+ * of xdp_frames/pages in those cases.
+ */
+void __xdp_return(netmem_ref netmem, enum xdp_mem_type mem_type,
+ bool napi_direct, struct xdp_buff *xdp)
+{
+ switch (mem_type) {
+ case MEM_TYPE_PAGE_POOL:
+ netmem = netmem_compound_head(netmem);
+ if (napi_direct && xdp_return_frame_no_direct())
+ napi_direct = false;
+ /* No need to check netmem_is_pp() as mem->type knows this a
+ * page_pool page
+ */
+ page_pool_put_full_netmem(netmem_get_pp(netmem), netmem,
+ napi_direct);
+ break;
+ case MEM_TYPE_PAGE_SHARED:
+ page_frag_free(__netmem_address(netmem));
+ break;
+ case MEM_TYPE_PAGE_ORDER0:
+ put_page(__netmem_to_page(netmem));
+ break;
+ case MEM_TYPE_XSK_BUFF_POOL:
+ /* NB! Only valid from an xdp_buff! */
+ xsk_buff_free(xdp);
+ break;
+ default:
+ /* Not possible, checked in xdp_rxq_info_reg_mem_model() */
+ WARN(1, "Incorrect XDP memory type (%d) usage", mem_type);
+ break;
+ }
+}
+
+void xdp_return_frame(struct xdp_frame *xdpf)
+{
+ struct skb_shared_info *sinfo;
+
+ if (likely(!xdp_frame_has_frags(xdpf)))
+ goto out;
+
+ sinfo = xdp_get_shared_info_from_frame(xdpf);
+ for (u32 i = 0; i < sinfo->nr_frags; i++)
+ __xdp_return(skb_frag_netmem(&sinfo->frags[i]), xdpf->mem_type,
+ false, NULL);
+
+out:
+ __xdp_return(virt_to_netmem(xdpf->data), xdpf->mem_type, false, NULL);
+}
+EXPORT_SYMBOL_GPL(xdp_return_frame);
+
+void xdp_return_frame_rx_napi(struct xdp_frame *xdpf)
+{
+ struct skb_shared_info *sinfo;
+
+ if (likely(!xdp_frame_has_frags(xdpf)))
+ goto out;
+
+ sinfo = xdp_get_shared_info_from_frame(xdpf);
+ for (u32 i = 0; i < sinfo->nr_frags; i++)
+ __xdp_return(skb_frag_netmem(&sinfo->frags[i]), xdpf->mem_type,
+ true, NULL);
+
+out:
+ __xdp_return(virt_to_netmem(xdpf->data), xdpf->mem_type, true, NULL);
+}
+EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi);
+
+/* XDP bulk APIs introduce a defer/flush mechanism to return
+ * pages belonging to the same xdp_mem_allocator object
+ * (identified via the mem.id field) in bulk to optimize
+ * I-cache and D-cache.
+ * The bulk queue size is set to 16 to be aligned to how
+ * XDP_REDIRECT bulking works. The bulk is flushed when
+ * it is full or when mem.id changes.
+ * xdp_frame_bulk is usually stored/allocated on the function
+ * call-stack to avoid locking penalties.
+ */
+
+/* Must be called with rcu_read_lock held */
+void xdp_return_frame_bulk(struct xdp_frame *xdpf,
+ struct xdp_frame_bulk *bq)
+{
+ if (xdpf->mem_type != MEM_TYPE_PAGE_POOL) {
+ xdp_return_frame(xdpf);
+ return;
+ }
+
+ if (bq->count == XDP_BULK_QUEUE_SIZE)
+ xdp_flush_frame_bulk(bq);
+
+ if (unlikely(xdp_frame_has_frags(xdpf))) {
+ struct skb_shared_info *sinfo;
+ int i;
+
+ sinfo = xdp_get_shared_info_from_frame(xdpf);
+ for (i = 0; i < sinfo->nr_frags; i++) {
+ skb_frag_t *frag = &sinfo->frags[i];
+
+ bq->q[bq->count++] = skb_frag_netmem(frag);
+ if (bq->count == XDP_BULK_QUEUE_SIZE)
+ xdp_flush_frame_bulk(bq);
+ }
+ }
+ bq->q[bq->count++] = virt_to_netmem(xdpf->data);
+}
+EXPORT_SYMBOL_GPL(xdp_return_frame_bulk);
+
+/**
+ * xdp_return_frag -- free one XDP frag or decrement its refcount
+ * @netmem: network memory reference to release
+ * @xdp: &xdp_buff to release the frag for
+ */
+void xdp_return_frag(netmem_ref netmem, const struct xdp_buff *xdp)
+{
+ __xdp_return(netmem, xdp->rxq->mem.type, true, NULL);
+}
+EXPORT_SYMBOL_GPL(xdp_return_frag);
+
+void xdp_return_buff(struct xdp_buff *xdp)
+{
+ struct skb_shared_info *sinfo;
+
+ if (likely(!xdp_buff_has_frags(xdp)))
+ goto out;
+
+ sinfo = xdp_get_shared_info_from_buff(xdp);
+ for (u32 i = 0; i < sinfo->nr_frags; i++)
+ __xdp_return(skb_frag_netmem(&sinfo->frags[i]),
+ xdp->rxq->mem.type, true, xdp);
+
+out:
+ __xdp_return(virt_to_netmem(xdp->data), xdp->rxq->mem.type, true, xdp);
+}
+EXPORT_SYMBOL_GPL(xdp_return_buff);
+
+void xdp_attachment_setup(struct xdp_attachment_info *info,
+ struct netdev_bpf *bpf)
+{
+ if (info->prog)
+ bpf_prog_put(info->prog);
+ info->prog = bpf->prog;
+ info->flags = bpf->flags;
+}
+EXPORT_SYMBOL_GPL(xdp_attachment_setup);
+
+struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp)
+{
+ unsigned int metasize, totsize;
+ void *addr, *data_to_copy;
+ struct xdp_frame *xdpf;
+ struct page *page;
+
+ /* Clone into a MEM_TYPE_PAGE_ORDER0 xdp_frame. */
+ metasize = xdp_data_meta_unsupported(xdp) ? 0 :
+ xdp->data - xdp->data_meta;
+ totsize = xdp->data_end - xdp->data + metasize;
+
+ if (sizeof(*xdpf) + totsize > PAGE_SIZE)
+ return NULL;
+
+ page = dev_alloc_page();
+ if (!page)
+ return NULL;
+
+ addr = page_to_virt(page);
+ xdpf = addr;
+ memset(xdpf, 0, sizeof(*xdpf));
+
+ addr += sizeof(*xdpf);
+ data_to_copy = metasize ? xdp->data_meta : xdp->data;
+ memcpy(addr, data_to_copy, totsize);
+
+ xdpf->data = addr + metasize;
+ xdpf->len = totsize - metasize;
+ xdpf->headroom = 0;
+ xdpf->metasize = metasize;
+ xdpf->frame_sz = PAGE_SIZE;
+ xdpf->mem_type = MEM_TYPE_PAGE_ORDER0;
+
+ xsk_buff_free(xdp);
+ return xdpf;
+}
+EXPORT_SYMBOL_GPL(xdp_convert_zc_to_xdp_frame);
+
+/* Used by XDP_WARN macro, to avoid inlining WARN() in fast-path */
+void xdp_warn(const char *msg, const char *func, const int line)
+{
+ WARN(1, "XDP_WARN: %s(line:%d): %s\n", func, line, msg);
+};
+EXPORT_SYMBOL_GPL(xdp_warn);
+
+/**
+ * xdp_build_skb_from_buff - create an skb from &xdp_buff
+ * @xdp: &xdp_buff to convert to an skb
+ *
+ * Perform common operations to create a new skb to pass up the stack from
+ * &xdp_buff: allocate an skb head from the NAPI percpu cache, initialize
+ * skb data pointers and offsets, set the recycle bit if the buff is
+ * PP-backed, Rx queue index, protocol and update frags info.
+ *
+ * Return: new &sk_buff on success, %NULL on error.
+ */
+struct sk_buff *xdp_build_skb_from_buff(const struct xdp_buff *xdp)
+{
+ const struct xdp_rxq_info *rxq = xdp->rxq;
+ const struct skb_shared_info *sinfo;
+ struct sk_buff *skb;
+ u32 nr_frags = 0;
+ int metalen;
+
+ if (unlikely(xdp_buff_has_frags(xdp))) {
+ sinfo = xdp_get_shared_info_from_buff(xdp);
+ nr_frags = sinfo->nr_frags;
+ }
+
+ skb = napi_build_skb(xdp->data_hard_start, xdp->frame_sz);
+ if (unlikely(!skb))
+ return NULL;
+
+ skb_reserve(skb, xdp->data - xdp->data_hard_start);
+ __skb_put(skb, xdp->data_end - xdp->data);
+
+ metalen = xdp->data - xdp->data_meta;
+ if (metalen > 0)
+ skb_metadata_set(skb, metalen);
+
+ if (rxq->mem.type == MEM_TYPE_PAGE_POOL)
+ skb_mark_for_recycle(skb);
+
+ skb_record_rx_queue(skb, rxq->queue_index);
+
+ if (unlikely(nr_frags)) {
+ u32 tsize;
+
+ tsize = sinfo->xdp_frags_truesize ? : nr_frags * xdp->frame_sz;
+ xdp_update_skb_frags_info(skb, nr_frags, sinfo->xdp_frags_size,
+ tsize, xdp_buff_get_skb_flags(xdp));
+ }
+
+ skb->protocol = eth_type_trans(skb, rxq->dev);
+
+ return skb;
+}
+EXPORT_SYMBOL_GPL(xdp_build_skb_from_buff);
+
+/**
+ * xdp_copy_frags_from_zc - copy frags from XSk buff to skb
+ * @skb: skb to copy frags to
+ * @xdp: XSk &xdp_buff from which the frags will be copied
+ * @pp: &page_pool backing page allocation, if available
+ *
+ * Copy all frags from XSk &xdp_buff to the skb to pass it up the stack.
+ * Allocate a new buffer for each frag, copy it and attach to the skb.
+ *
+ * Return: true on success, false on netmem allocation fail.
+ */
+static noinline bool xdp_copy_frags_from_zc(struct sk_buff *skb,
+ const struct xdp_buff *xdp,
+ struct page_pool *pp)
+{
+ struct skb_shared_info *sinfo = skb_shinfo(skb);
+ const struct skb_shared_info *xinfo;
+ u32 nr_frags, tsize = 0;
+ u32 flags = 0;
+
+ xinfo = xdp_get_shared_info_from_buff(xdp);
+ nr_frags = xinfo->nr_frags;
+
+ for (u32 i = 0; i < nr_frags; i++) {
+ const skb_frag_t *frag = &xinfo->frags[i];
+ u32 len = skb_frag_size(frag);
+ u32 offset, truesize = len;
+ struct page *page;
+
+ page = page_pool_dev_alloc(pp, &offset, &truesize);
+ if (unlikely(!page)) {
+ sinfo->nr_frags = i;
+ return false;
+ }
+
+ memcpy(page_address(page) + offset, skb_frag_address(frag),
+ LARGEST_ALIGN(len));
+ __skb_fill_page_desc_noacc(sinfo, i, page, offset, len);
+
+ tsize += truesize;
+ if (page_is_pfmemalloc(page))
+ flags |= XDP_FLAGS_FRAGS_PF_MEMALLOC;
+ }
+
+ xdp_update_skb_frags_info(skb, nr_frags, xinfo->xdp_frags_size, tsize,
+ flags);
+
+ return true;
+}
+
+/**
+ * xdp_build_skb_from_zc - create an skb from XSk &xdp_buff
+ * @xdp: source XSk buff
+ *
+ * Similar to xdp_build_skb_from_buff(), but for XSk frames. Allocate an skb
+ * head, new buffer for the head, copy the data and initialize the skb fields.
+ * If there are frags, allocate new buffers for them and copy.
+ * Buffers are allocated from the system percpu pools to try recycling them.
+ * If new skb was built successfully, @xdp is returned to XSk pool's freelist.
+ * On error, it remains untouched and the caller must take care of this.
+ *
+ * Return: new &sk_buff on success, %NULL on error.
+ */
+struct sk_buff *xdp_build_skb_from_zc(struct xdp_buff *xdp)
+{
+ const struct xdp_rxq_info *rxq = xdp->rxq;
+ u32 len = xdp->data_end - xdp->data_meta;
+ u32 truesize = xdp->frame_sz;
+ struct sk_buff *skb = NULL;
+ struct page_pool *pp;
+ int metalen;
+ void *data;
+
+ if (!IS_ENABLED(CONFIG_PAGE_POOL))
+ return NULL;
+
+ local_lock_nested_bh(&system_page_pool.bh_lock);
+ pp = this_cpu_read(system_page_pool.pool);
+ data = page_pool_dev_alloc_va(pp, &truesize);
+ if (unlikely(!data))
+ goto out;
+
+ skb = napi_build_skb(data, truesize);
+ if (unlikely(!skb)) {
+ page_pool_free_va(pp, data, true);
+ goto out;
+ }
+
+ skb_mark_for_recycle(skb);
+ skb_reserve(skb, xdp->data_meta - xdp->data_hard_start);
+
+ memcpy(__skb_put(skb, len), xdp->data_meta, LARGEST_ALIGN(len));
+
+ metalen = xdp->data - xdp->data_meta;
+ if (metalen > 0) {
+ skb_metadata_set(skb, metalen);
+ __skb_pull(skb, metalen);
+ }
+
+ skb_record_rx_queue(skb, rxq->queue_index);
+
+ if (unlikely(xdp_buff_has_frags(xdp)) &&
+ unlikely(!xdp_copy_frags_from_zc(skb, xdp, pp))) {
+ napi_consume_skb(skb, true);
+ skb = NULL;
+ goto out;
+ }
+
+ xsk_buff_free(xdp);
+
+ skb->protocol = eth_type_trans(skb, rxq->dev);
+
+out:
+ local_unlock_nested_bh(&system_page_pool.bh_lock);
+ return skb;
+}
+EXPORT_SYMBOL_GPL(xdp_build_skb_from_zc);
+
+struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf,
+ struct sk_buff *skb,
+ struct net_device *dev)
+{
+ struct skb_shared_info *sinfo = xdp_get_shared_info_from_frame(xdpf);
+ unsigned int headroom, frame_size;
+ void *hard_start;
+ u8 nr_frags;
+
+ /* xdp frags frame */
+ if (unlikely(xdp_frame_has_frags(xdpf)))
+ nr_frags = sinfo->nr_frags;
+
+ /* Part of headroom was reserved to xdpf */
+ headroom = sizeof(*xdpf) + xdpf->headroom;
+
+ /* Memory size backing xdp_frame data already have reserved
+ * room for build_skb to place skb_shared_info in tailroom.
+ */
+ frame_size = xdpf->frame_sz;
+
+ hard_start = xdpf->data - headroom;
+ skb = build_skb_around(skb, hard_start, frame_size);
+ if (unlikely(!skb))
+ return NULL;
+
+ skb_reserve(skb, headroom);
+ __skb_put(skb, xdpf->len);
+ if (xdpf->metasize)
+ skb_metadata_set(skb, xdpf->metasize);
+
+ if (unlikely(xdp_frame_has_frags(xdpf)))
+ xdp_update_skb_frags_info(skb, nr_frags, sinfo->xdp_frags_size,
+ nr_frags * xdpf->frame_sz,
+ xdp_frame_get_skb_flags(xdpf));
+
+ /* Essential SKB info: protocol and skb->dev */
+ skb->protocol = eth_type_trans(skb, dev);
+
+ /* Optional SKB info, currently missing:
+ * - HW checksum info (skb->ip_summed)
+ * - HW RX hash (skb_set_hash)
+ * - RX ring dev queue index (skb_record_rx_queue)
+ */
+
+ if (xdpf->mem_type == MEM_TYPE_PAGE_POOL)
+ skb_mark_for_recycle(skb);
+
+ /* Allow SKB to reuse area used by xdp_frame */
+ xdp_scrub_frame(xdpf);
+
+ return skb;
+}
+EXPORT_SYMBOL_GPL(__xdp_build_skb_from_frame);
+
+struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf,
+ struct net_device *dev)
+{
+ struct sk_buff *skb;
+
+ skb = kmem_cache_alloc(net_hotdata.skbuff_cache, GFP_ATOMIC);
+ if (unlikely(!skb))
+ return NULL;
+
+ memset(skb, 0, offsetof(struct sk_buff, tail));
+
+ return __xdp_build_skb_from_frame(xdpf, skb, dev);
+}
+EXPORT_SYMBOL_GPL(xdp_build_skb_from_frame);
+
+struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf)
+{
+ unsigned int headroom, totalsize;
+ struct xdp_frame *nxdpf;
+ struct page *page;
+ void *addr;
+
+ headroom = xdpf->headroom + sizeof(*xdpf);
+ totalsize = headroom + xdpf->len;
+
+ if (unlikely(totalsize > PAGE_SIZE))
+ return NULL;
+ page = dev_alloc_page();
+ if (!page)
+ return NULL;
+ addr = page_to_virt(page);
+
+ memcpy(addr, xdpf, totalsize);
+
+ nxdpf = addr;
+ nxdpf->data = addr + headroom;
+ nxdpf->frame_sz = PAGE_SIZE;
+ nxdpf->mem_type = MEM_TYPE_PAGE_ORDER0;
+
+ return nxdpf;
+}
+
+__bpf_kfunc_start_defs();
+
+/**
+ * bpf_xdp_metadata_rx_timestamp - Read XDP frame RX timestamp.
+ * @ctx: XDP context pointer.
+ * @timestamp: Return value pointer.
+ *
+ * Return:
+ * * Returns 0 on success or ``-errno`` on error.
+ * * ``-EOPNOTSUPP`` : means device driver does not implement kfunc
+ * * ``-ENODATA`` : means no RX-timestamp available for this frame
+ */
+__bpf_kfunc int bpf_xdp_metadata_rx_timestamp(const struct xdp_md *ctx, u64 *timestamp)
+{
+ return -EOPNOTSUPP;
+}
+
+/**
+ * bpf_xdp_metadata_rx_hash - Read XDP frame RX hash.
+ * @ctx: XDP context pointer.
+ * @hash: Return value pointer.
+ * @rss_type: Return value pointer for RSS type.
+ *
+ * The RSS hash type (@rss_type) specifies what portion of packet headers NIC
+ * hardware used when calculating RSS hash value. The RSS type can be decoded
+ * via &enum xdp_rss_hash_type either matching on individual L3/L4 bits
+ * ``XDP_RSS_L*`` or by combined traditional *RSS Hashing Types*
+ * ``XDP_RSS_TYPE_L*``.
+ *
+ * Return:
+ * * Returns 0 on success or ``-errno`` on error.
+ * * ``-EOPNOTSUPP`` : means device driver doesn't implement kfunc
+ * * ``-ENODATA`` : means no RX-hash available for this frame
+ */
+__bpf_kfunc int bpf_xdp_metadata_rx_hash(const struct xdp_md *ctx, u32 *hash,
+ enum xdp_rss_hash_type *rss_type)
+{
+ return -EOPNOTSUPP;
+}
+
+/**
+ * bpf_xdp_metadata_rx_vlan_tag - Get XDP packet outermost VLAN tag
+ * @ctx: XDP context pointer.
+ * @vlan_proto: Destination pointer for VLAN Tag protocol identifier (TPID).
+ * @vlan_tci: Destination pointer for VLAN TCI (VID + DEI + PCP)
+ *
+ * In case of success, ``vlan_proto`` contains *Tag protocol identifier (TPID)*,
+ * usually ``ETH_P_8021Q`` or ``ETH_P_8021AD``, but some networks can use
+ * custom TPIDs. ``vlan_proto`` is stored in **network byte order (BE)**
+ * and should be used as follows:
+ * ``if (vlan_proto == bpf_htons(ETH_P_8021Q)) do_something();``
+ *
+ * ``vlan_tci`` contains the remaining 16 bits of a VLAN tag.
+ * Driver is expected to provide those in **host byte order (usually LE)**,
+ * so the bpf program should not perform byte conversion.
+ * According to 802.1Q standard, *VLAN TCI (Tag control information)*
+ * is a bit field that contains:
+ * *VLAN identifier (VID)* that can be read with ``vlan_tci & 0xfff``,
+ * *Drop eligible indicator (DEI)* - 1 bit,
+ * *Priority code point (PCP)* - 3 bits.
+ * For detailed meaning of DEI and PCP, please refer to other sources.
+ *
+ * Return:
+ * * Returns 0 on success or ``-errno`` on error.
+ * * ``-EOPNOTSUPP`` : device driver doesn't implement kfunc
+ * * ``-ENODATA`` : VLAN tag was not stripped or is not available
+ */
+__bpf_kfunc int bpf_xdp_metadata_rx_vlan_tag(const struct xdp_md *ctx,
+ __be16 *vlan_proto, u16 *vlan_tci)
+{
+ return -EOPNOTSUPP;
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(xdp_metadata_kfunc_ids)
+#define XDP_METADATA_KFUNC(_, __, name, ___) BTF_ID_FLAGS(func, name, KF_TRUSTED_ARGS)
+XDP_METADATA_KFUNC_xxx
+#undef XDP_METADATA_KFUNC
+BTF_KFUNCS_END(xdp_metadata_kfunc_ids)
+
+static const struct btf_kfunc_id_set xdp_metadata_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &xdp_metadata_kfunc_ids,
+};
+
+BTF_ID_LIST(xdp_metadata_kfunc_ids_unsorted)
+#define XDP_METADATA_KFUNC(name, _, str, __) BTF_ID(func, str)
+XDP_METADATA_KFUNC_xxx
+#undef XDP_METADATA_KFUNC
+
+u32 bpf_xdp_metadata_kfunc_id(int id)
+{
+ /* xdp_metadata_kfunc_ids is sorted and can't be used */
+ return xdp_metadata_kfunc_ids_unsorted[id];
+}
+
+bool bpf_dev_bound_kfunc_id(u32 btf_id)
+{
+ return btf_id_set8_contains(&xdp_metadata_kfunc_ids, btf_id);
+}
+
+static int __init xdp_metadata_init(void)
+{
+ return register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &xdp_metadata_kfunc_set);
+}
+late_initcall(xdp_metadata_init);
+
+void xdp_set_features_flag_locked(struct net_device *dev, xdp_features_t val)
+{
+ val &= NETDEV_XDP_ACT_MASK;
+ if (dev->xdp_features == val)
+ return;
+
+ netdev_assert_locked_or_invisible(dev);
+ dev->xdp_features = val;
+
+ if (dev->reg_state == NETREG_REGISTERED)
+ call_netdevice_notifiers(NETDEV_XDP_FEAT_CHANGE, dev);
+}
+EXPORT_SYMBOL_GPL(xdp_set_features_flag_locked);
+
+void xdp_set_features_flag(struct net_device *dev, xdp_features_t val)
+{
+ netdev_lock(dev);
+ xdp_set_features_flag_locked(dev, val);
+ netdev_unlock(dev);
+}
+EXPORT_SYMBOL_GPL(xdp_set_features_flag);
+
+void xdp_features_set_redirect_target_locked(struct net_device *dev,
+ bool support_sg)
+{
+ xdp_features_t val = (dev->xdp_features | NETDEV_XDP_ACT_NDO_XMIT);
+
+ if (support_sg)
+ val |= NETDEV_XDP_ACT_NDO_XMIT_SG;
+ xdp_set_features_flag_locked(dev, val);
+}
+EXPORT_SYMBOL_GPL(xdp_features_set_redirect_target_locked);
+
+void xdp_features_set_redirect_target(struct net_device *dev, bool support_sg)
+{
+ netdev_lock(dev);
+ xdp_features_set_redirect_target_locked(dev, support_sg);
+ netdev_unlock(dev);
+}
+EXPORT_SYMBOL_GPL(xdp_features_set_redirect_target);
+
+void xdp_features_clear_redirect_target_locked(struct net_device *dev)
+{
+ xdp_features_t val = dev->xdp_features;
+
+ val &= ~(NETDEV_XDP_ACT_NDO_XMIT | NETDEV_XDP_ACT_NDO_XMIT_SG);
+ xdp_set_features_flag_locked(dev, val);
+}
+EXPORT_SYMBOL_GPL(xdp_features_clear_redirect_target_locked);
+
+void xdp_features_clear_redirect_target(struct net_device *dev)
+{
+ netdev_lock(dev);
+ xdp_features_clear_redirect_target_locked(dev);
+ netdev_unlock(dev);
+}
+EXPORT_SYMBOL_GPL(xdp_features_clear_redirect_target);
diff --git a/net/dcb/Kconfig b/net/dcb/Kconfig
new file mode 100644
index 000000000000..efee8b9fe1d4
--- /dev/null
+++ b/net/dcb/Kconfig
@@ -0,0 +1,23 @@
+# SPDX-License-Identifier: GPL-2.0-only
+config DCB
+ bool "Data Center Bridging support"
+ default n
+ help
+ This enables support for configuring Data Center Bridging (DCB)
+ features on DCB capable Ethernet adapters via rtnetlink. Say 'Y'
+ if you have a DCB capable Ethernet adapter which supports this
+ interface and you are connected to a DCB capable switch.
+
+ DCB is a collection of Ethernet enhancements which allow DCB capable
+ NICs and switches to support network traffic with differing
+ requirements (highly reliable, no drops vs. best effort vs. low
+ latency) to co-exist on Ethernet.
+
+ DCB features include:
+ Enhanced Transmission Selection (aka Priority Grouping) - provides a
+ framework for assigning bandwidth guarantees to traffic classes.
+ Priority-based Flow Control (PFC) - a MAC control pause frame which
+ works at the granularity of the 802.1p priority instead of the
+ link (802.3x).
+
+ If unsure, say N.
diff --git a/net/dcb/Makefile b/net/dcb/Makefile
new file mode 100644
index 000000000000..2c0fa16ee2a9
--- /dev/null
+++ b/net/dcb/Makefile
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+obj-y += dcbnl.o dcbevent.o
diff --git a/net/dcb/dcbevent.c b/net/dcb/dcbevent.c
new file mode 100644
index 000000000000..8620564c2b0b
--- /dev/null
+++ b/net/dcb/dcbevent.c
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2010, Intel Corporation.
+ *
+ * Author: John Fastabend <john.r.fastabend@intel.com>
+ */
+
+#include <linux/rtnetlink.h>
+#include <linux/notifier.h>
+#include <linux/export.h>
+#include <net/dcbevent.h>
+
+static ATOMIC_NOTIFIER_HEAD(dcbevent_notif_chain);
+
+int register_dcbevent_notifier(struct notifier_block *nb)
+{
+ return atomic_notifier_chain_register(&dcbevent_notif_chain, nb);
+}
+EXPORT_SYMBOL(register_dcbevent_notifier);
+
+int unregister_dcbevent_notifier(struct notifier_block *nb)
+{
+ return atomic_notifier_chain_unregister(&dcbevent_notif_chain, nb);
+}
+EXPORT_SYMBOL(unregister_dcbevent_notifier);
+
+int call_dcbevent_notifiers(unsigned long val, void *v)
+{
+ return atomic_notifier_call_chain(&dcbevent_notif_chain, val, v);
+}
diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c
new file mode 100644
index 000000000000..03eb1d941fca
--- /dev/null
+++ b/net/dcb/dcbnl.c
@@ -0,0 +1,2428 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2008-2011, Intel Corporation.
+ *
+ * Description: Data Center Bridging netlink interface
+ * Author: Lucy Liu <lucy.liu@intel.com>
+ */
+
+#include <linux/netdevice.h>
+#include <linux/netlink.h>
+#include <linux/slab.h>
+#include <net/netlink.h>
+#include <net/rtnetlink.h>
+#include <linux/dcbnl.h>
+#include <net/dcbevent.h>
+#include <linux/rtnetlink.h>
+#include <linux/init.h>
+#include <net/sock.h>
+
+/* Data Center Bridging (DCB) is a collection of Ethernet enhancements
+ * intended to allow network traffic with differing requirements
+ * (highly reliable, no drops vs. best effort vs. low latency) to operate
+ * and co-exist on Ethernet. Current DCB features are:
+ *
+ * Enhanced Transmission Selection (aka Priority Grouping [PG]) - provides a
+ * framework for assigning bandwidth guarantees to traffic classes.
+ *
+ * Priority-based Flow Control (PFC) - provides a flow control mechanism which
+ * can work independently for each 802.1p priority.
+ *
+ * Congestion Notification - provides a mechanism for end-to-end congestion
+ * control for protocols which do not have built-in congestion management.
+ *
+ * More information about the emerging standards for these Ethernet features
+ * can be found at: http://www.ieee802.org/1/pages/dcbridges.html
+ *
+ * This file implements an rtnetlink interface to allow configuration of DCB
+ * features for capable devices.
+ */
+
+/**************** DCB attribute policies *************************************/
+
+/* DCB netlink attributes policy */
+static const struct nla_policy dcbnl_rtnl_policy[DCB_ATTR_MAX + 1] = {
+ [DCB_ATTR_IFNAME] = {.type = NLA_NUL_STRING, .len = IFNAMSIZ - 1},
+ [DCB_ATTR_STATE] = {.type = NLA_U8},
+ [DCB_ATTR_PFC_CFG] = {.type = NLA_NESTED},
+ [DCB_ATTR_PG_CFG] = {.type = NLA_NESTED},
+ [DCB_ATTR_SET_ALL] = {.type = NLA_U8},
+ [DCB_ATTR_PERM_HWADDR] = {.type = NLA_FLAG},
+ [DCB_ATTR_CAP] = {.type = NLA_NESTED},
+ [DCB_ATTR_PFC_STATE] = {.type = NLA_U8},
+ [DCB_ATTR_BCN] = {.type = NLA_NESTED},
+ [DCB_ATTR_APP] = {.type = NLA_NESTED},
+ [DCB_ATTR_IEEE] = {.type = NLA_NESTED},
+ [DCB_ATTR_DCBX] = {.type = NLA_U8},
+ [DCB_ATTR_FEATCFG] = {.type = NLA_NESTED},
+};
+
+/* DCB priority flow control to User Priority nested attributes */
+static const struct nla_policy dcbnl_pfc_up_nest[DCB_PFC_UP_ATTR_MAX + 1] = {
+ [DCB_PFC_UP_ATTR_0] = {.type = NLA_U8},
+ [DCB_PFC_UP_ATTR_1] = {.type = NLA_U8},
+ [DCB_PFC_UP_ATTR_2] = {.type = NLA_U8},
+ [DCB_PFC_UP_ATTR_3] = {.type = NLA_U8},
+ [DCB_PFC_UP_ATTR_4] = {.type = NLA_U8},
+ [DCB_PFC_UP_ATTR_5] = {.type = NLA_U8},
+ [DCB_PFC_UP_ATTR_6] = {.type = NLA_U8},
+ [DCB_PFC_UP_ATTR_7] = {.type = NLA_U8},
+ [DCB_PFC_UP_ATTR_ALL] = {.type = NLA_FLAG},
+};
+
+/* DCB priority grouping nested attributes */
+static const struct nla_policy dcbnl_pg_nest[DCB_PG_ATTR_MAX + 1] = {
+ [DCB_PG_ATTR_TC_0] = {.type = NLA_NESTED},
+ [DCB_PG_ATTR_TC_1] = {.type = NLA_NESTED},
+ [DCB_PG_ATTR_TC_2] = {.type = NLA_NESTED},
+ [DCB_PG_ATTR_TC_3] = {.type = NLA_NESTED},
+ [DCB_PG_ATTR_TC_4] = {.type = NLA_NESTED},
+ [DCB_PG_ATTR_TC_5] = {.type = NLA_NESTED},
+ [DCB_PG_ATTR_TC_6] = {.type = NLA_NESTED},
+ [DCB_PG_ATTR_TC_7] = {.type = NLA_NESTED},
+ [DCB_PG_ATTR_TC_ALL] = {.type = NLA_NESTED},
+ [DCB_PG_ATTR_BW_ID_0] = {.type = NLA_U8},
+ [DCB_PG_ATTR_BW_ID_1] = {.type = NLA_U8},
+ [DCB_PG_ATTR_BW_ID_2] = {.type = NLA_U8},
+ [DCB_PG_ATTR_BW_ID_3] = {.type = NLA_U8},
+ [DCB_PG_ATTR_BW_ID_4] = {.type = NLA_U8},
+ [DCB_PG_ATTR_BW_ID_5] = {.type = NLA_U8},
+ [DCB_PG_ATTR_BW_ID_6] = {.type = NLA_U8},
+ [DCB_PG_ATTR_BW_ID_7] = {.type = NLA_U8},
+ [DCB_PG_ATTR_BW_ID_ALL] = {.type = NLA_FLAG},
+};
+
+/* DCB traffic class nested attributes. */
+static const struct nla_policy dcbnl_tc_param_nest[DCB_TC_ATTR_PARAM_MAX + 1] = {
+ [DCB_TC_ATTR_PARAM_PGID] = {.type = NLA_U8},
+ [DCB_TC_ATTR_PARAM_UP_MAPPING] = {.type = NLA_U8},
+ [DCB_TC_ATTR_PARAM_STRICT_PRIO] = {.type = NLA_U8},
+ [DCB_TC_ATTR_PARAM_BW_PCT] = {.type = NLA_U8},
+ [DCB_TC_ATTR_PARAM_ALL] = {.type = NLA_FLAG},
+};
+
+/* DCB capabilities nested attributes. */
+static const struct nla_policy dcbnl_cap_nest[DCB_CAP_ATTR_MAX + 1] = {
+ [DCB_CAP_ATTR_ALL] = {.type = NLA_FLAG},
+ [DCB_CAP_ATTR_PG] = {.type = NLA_U8},
+ [DCB_CAP_ATTR_PFC] = {.type = NLA_U8},
+ [DCB_CAP_ATTR_UP2TC] = {.type = NLA_U8},
+ [DCB_CAP_ATTR_PG_TCS] = {.type = NLA_U8},
+ [DCB_CAP_ATTR_PFC_TCS] = {.type = NLA_U8},
+ [DCB_CAP_ATTR_GSP] = {.type = NLA_U8},
+ [DCB_CAP_ATTR_BCN] = {.type = NLA_U8},
+ [DCB_CAP_ATTR_DCBX] = {.type = NLA_U8},
+};
+
+/* DCB capabilities nested attributes. */
+static const struct nla_policy dcbnl_numtcs_nest[DCB_NUMTCS_ATTR_MAX + 1] = {
+ [DCB_NUMTCS_ATTR_ALL] = {.type = NLA_FLAG},
+ [DCB_NUMTCS_ATTR_PG] = {.type = NLA_U8},
+ [DCB_NUMTCS_ATTR_PFC] = {.type = NLA_U8},
+};
+
+/* DCB BCN nested attributes. */
+static const struct nla_policy dcbnl_bcn_nest[DCB_BCN_ATTR_MAX + 1] = {
+ [DCB_BCN_ATTR_RP_0] = {.type = NLA_U8},
+ [DCB_BCN_ATTR_RP_1] = {.type = NLA_U8},
+ [DCB_BCN_ATTR_RP_2] = {.type = NLA_U8},
+ [DCB_BCN_ATTR_RP_3] = {.type = NLA_U8},
+ [DCB_BCN_ATTR_RP_4] = {.type = NLA_U8},
+ [DCB_BCN_ATTR_RP_5] = {.type = NLA_U8},
+ [DCB_BCN_ATTR_RP_6] = {.type = NLA_U8},
+ [DCB_BCN_ATTR_RP_7] = {.type = NLA_U8},
+ [DCB_BCN_ATTR_RP_ALL] = {.type = NLA_FLAG},
+ [DCB_BCN_ATTR_BCNA_0] = {.type = NLA_U32},
+ [DCB_BCN_ATTR_BCNA_1] = {.type = NLA_U32},
+ [DCB_BCN_ATTR_ALPHA] = {.type = NLA_U32},
+ [DCB_BCN_ATTR_BETA] = {.type = NLA_U32},
+ [DCB_BCN_ATTR_GD] = {.type = NLA_U32},
+ [DCB_BCN_ATTR_GI] = {.type = NLA_U32},
+ [DCB_BCN_ATTR_TMAX] = {.type = NLA_U32},
+ [DCB_BCN_ATTR_TD] = {.type = NLA_U32},
+ [DCB_BCN_ATTR_RMIN] = {.type = NLA_U32},
+ [DCB_BCN_ATTR_W] = {.type = NLA_U32},
+ [DCB_BCN_ATTR_RD] = {.type = NLA_U32},
+ [DCB_BCN_ATTR_RU] = {.type = NLA_U32},
+ [DCB_BCN_ATTR_WRTT] = {.type = NLA_U32},
+ [DCB_BCN_ATTR_RI] = {.type = NLA_U32},
+ [DCB_BCN_ATTR_C] = {.type = NLA_U32},
+ [DCB_BCN_ATTR_ALL] = {.type = NLA_FLAG},
+};
+
+/* DCB APP nested attributes. */
+static const struct nla_policy dcbnl_app_nest[DCB_APP_ATTR_MAX + 1] = {
+ [DCB_APP_ATTR_IDTYPE] = {.type = NLA_U8},
+ [DCB_APP_ATTR_ID] = {.type = NLA_U16},
+ [DCB_APP_ATTR_PRIORITY] = {.type = NLA_U8},
+};
+
+/* IEEE 802.1Qaz nested attributes. */
+static const struct nla_policy dcbnl_ieee_policy[DCB_ATTR_IEEE_MAX + 1] = {
+ [DCB_ATTR_IEEE_ETS] = {.len = sizeof(struct ieee_ets)},
+ [DCB_ATTR_IEEE_PFC] = {.len = sizeof(struct ieee_pfc)},
+ [DCB_ATTR_IEEE_APP_TABLE] = {.type = NLA_NESTED},
+ [DCB_ATTR_IEEE_MAXRATE] = {.len = sizeof(struct ieee_maxrate)},
+ [DCB_ATTR_IEEE_QCN] = {.len = sizeof(struct ieee_qcn)},
+ [DCB_ATTR_IEEE_QCN_STATS] = {.len = sizeof(struct ieee_qcn_stats)},
+ [DCB_ATTR_DCB_BUFFER] = {.len = sizeof(struct dcbnl_buffer)},
+ [DCB_ATTR_DCB_APP_TRUST_TABLE] = {.type = NLA_NESTED},
+};
+
+/* DCB number of traffic classes nested attributes. */
+static const struct nla_policy dcbnl_featcfg_nest[DCB_FEATCFG_ATTR_MAX + 1] = {
+ [DCB_FEATCFG_ATTR_ALL] = {.type = NLA_FLAG},
+ [DCB_FEATCFG_ATTR_PG] = {.type = NLA_U8},
+ [DCB_FEATCFG_ATTR_PFC] = {.type = NLA_U8},
+ [DCB_FEATCFG_ATTR_APP] = {.type = NLA_U8},
+};
+
+static LIST_HEAD(dcb_app_list);
+static LIST_HEAD(dcb_rewr_list);
+static DEFINE_SPINLOCK(dcb_lock);
+
+static enum ieee_attrs_app dcbnl_app_attr_type_get(u8 selector)
+{
+ switch (selector) {
+ case IEEE_8021QAZ_APP_SEL_ETHERTYPE:
+ case IEEE_8021QAZ_APP_SEL_STREAM:
+ case IEEE_8021QAZ_APP_SEL_DGRAM:
+ case IEEE_8021QAZ_APP_SEL_ANY:
+ case IEEE_8021QAZ_APP_SEL_DSCP:
+ return DCB_ATTR_IEEE_APP;
+ case DCB_APP_SEL_PCP:
+ return DCB_ATTR_DCB_APP;
+ default:
+ return DCB_ATTR_IEEE_APP_UNSPEC;
+ }
+}
+
+static bool dcbnl_app_attr_type_validate(enum ieee_attrs_app type)
+{
+ switch (type) {
+ case DCB_ATTR_IEEE_APP:
+ case DCB_ATTR_DCB_APP:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static bool dcbnl_app_selector_validate(enum ieee_attrs_app type, u8 selector)
+{
+ return dcbnl_app_attr_type_get(selector) == type;
+}
+
+static struct sk_buff *dcbnl_newmsg(int type, u8 cmd, u32 port, u32 seq,
+ u32 flags, struct nlmsghdr **nlhp)
+{
+ struct sk_buff *skb;
+ struct dcbmsg *dcb;
+ struct nlmsghdr *nlh;
+
+ skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!skb)
+ return NULL;
+
+ nlh = nlmsg_put(skb, port, seq, type, sizeof(*dcb), flags);
+ BUG_ON(!nlh);
+
+ dcb = nlmsg_data(nlh);
+ dcb->dcb_family = AF_UNSPEC;
+ dcb->cmd = cmd;
+ dcb->dcb_pad = 0;
+
+ if (nlhp)
+ *nlhp = nlh;
+
+ return skb;
+}
+
+static int dcbnl_getstate(struct net_device *netdev, struct nlmsghdr *nlh,
+ u32 seq, struct nlattr **tb, struct sk_buff *skb)
+{
+ /* if (!tb[DCB_ATTR_STATE] || !netdev->dcbnl_ops->getstate) */
+ if (!netdev->dcbnl_ops->getstate)
+ return -EOPNOTSUPP;
+
+ return nla_put_u8(skb, DCB_ATTR_STATE,
+ netdev->dcbnl_ops->getstate(netdev));
+}
+
+static int dcbnl_getpfccfg(struct net_device *netdev, struct nlmsghdr *nlh,
+ u32 seq, struct nlattr **tb, struct sk_buff *skb)
+{
+ struct nlattr *data[DCB_PFC_UP_ATTR_MAX + 1], *nest;
+ u8 value;
+ int ret;
+ int i;
+ int getall = 0;
+
+ if (!tb[DCB_ATTR_PFC_CFG])
+ return -EINVAL;
+
+ if (!netdev->dcbnl_ops->getpfccfg)
+ return -EOPNOTSUPP;
+
+ ret = nla_parse_nested_deprecated(data, DCB_PFC_UP_ATTR_MAX,
+ tb[DCB_ATTR_PFC_CFG],
+ dcbnl_pfc_up_nest, NULL);
+ if (ret)
+ return ret;
+
+ nest = nla_nest_start_noflag(skb, DCB_ATTR_PFC_CFG);
+ if (!nest)
+ return -EMSGSIZE;
+
+ if (data[DCB_PFC_UP_ATTR_ALL])
+ getall = 1;
+
+ for (i = DCB_PFC_UP_ATTR_0; i <= DCB_PFC_UP_ATTR_7; i++) {
+ if (!getall && !data[i])
+ continue;
+
+ netdev->dcbnl_ops->getpfccfg(netdev, i - DCB_PFC_UP_ATTR_0,
+ &value);
+ ret = nla_put_u8(skb, i, value);
+ if (ret) {
+ nla_nest_cancel(skb, nest);
+ return ret;
+ }
+ }
+ nla_nest_end(skb, nest);
+
+ return 0;
+}
+
+static int dcbnl_getperm_hwaddr(struct net_device *netdev, struct nlmsghdr *nlh,
+ u32 seq, struct nlattr **tb, struct sk_buff *skb)
+{
+ u8 perm_addr[MAX_ADDR_LEN];
+
+ if (!netdev->dcbnl_ops->getpermhwaddr)
+ return -EOPNOTSUPP;
+
+ memset(perm_addr, 0, sizeof(perm_addr));
+ netdev->dcbnl_ops->getpermhwaddr(netdev, perm_addr);
+
+ return nla_put(skb, DCB_ATTR_PERM_HWADDR, sizeof(perm_addr), perm_addr);
+}
+
+static int dcbnl_getcap(struct net_device *netdev, struct nlmsghdr *nlh,
+ u32 seq, struct nlattr **tb, struct sk_buff *skb)
+{
+ struct nlattr *data[DCB_CAP_ATTR_MAX + 1], *nest;
+ u8 value;
+ int ret;
+ int i;
+ int getall = 0;
+
+ if (!tb[DCB_ATTR_CAP])
+ return -EINVAL;
+
+ if (!netdev->dcbnl_ops->getcap)
+ return -EOPNOTSUPP;
+
+ ret = nla_parse_nested_deprecated(data, DCB_CAP_ATTR_MAX,
+ tb[DCB_ATTR_CAP], dcbnl_cap_nest,
+ NULL);
+ if (ret)
+ return ret;
+
+ nest = nla_nest_start_noflag(skb, DCB_ATTR_CAP);
+ if (!nest)
+ return -EMSGSIZE;
+
+ if (data[DCB_CAP_ATTR_ALL])
+ getall = 1;
+
+ for (i = DCB_CAP_ATTR_ALL+1; i <= DCB_CAP_ATTR_MAX; i++) {
+ if (!getall && !data[i])
+ continue;
+
+ if (!netdev->dcbnl_ops->getcap(netdev, i, &value)) {
+ ret = nla_put_u8(skb, i, value);
+ if (ret) {
+ nla_nest_cancel(skb, nest);
+ return ret;
+ }
+ }
+ }
+ nla_nest_end(skb, nest);
+
+ return 0;
+}
+
+static int dcbnl_getnumtcs(struct net_device *netdev, struct nlmsghdr *nlh,
+ u32 seq, struct nlattr **tb, struct sk_buff *skb)
+{
+ struct nlattr *data[DCB_NUMTCS_ATTR_MAX + 1], *nest;
+ u8 value;
+ int ret;
+ int i;
+ int getall = 0;
+
+ if (!tb[DCB_ATTR_NUMTCS])
+ return -EINVAL;
+
+ if (!netdev->dcbnl_ops->getnumtcs)
+ return -EOPNOTSUPP;
+
+ ret = nla_parse_nested_deprecated(data, DCB_NUMTCS_ATTR_MAX,
+ tb[DCB_ATTR_NUMTCS],
+ dcbnl_numtcs_nest, NULL);
+ if (ret)
+ return ret;
+
+ nest = nla_nest_start_noflag(skb, DCB_ATTR_NUMTCS);
+ if (!nest)
+ return -EMSGSIZE;
+
+ if (data[DCB_NUMTCS_ATTR_ALL])
+ getall = 1;
+
+ for (i = DCB_NUMTCS_ATTR_ALL+1; i <= DCB_NUMTCS_ATTR_MAX; i++) {
+ if (!getall && !data[i])
+ continue;
+
+ ret = netdev->dcbnl_ops->getnumtcs(netdev, i, &value);
+ if (!ret) {
+ ret = nla_put_u8(skb, i, value);
+ if (ret) {
+ nla_nest_cancel(skb, nest);
+ return ret;
+ }
+ } else
+ return -EINVAL;
+ }
+ nla_nest_end(skb, nest);
+
+ return 0;
+}
+
+static int dcbnl_setnumtcs(struct net_device *netdev, struct nlmsghdr *nlh,
+ u32 seq, struct nlattr **tb, struct sk_buff *skb)
+{
+ struct nlattr *data[DCB_NUMTCS_ATTR_MAX + 1];
+ int ret;
+ u8 value;
+ int i;
+
+ if (!tb[DCB_ATTR_NUMTCS])
+ return -EINVAL;
+
+ if (!netdev->dcbnl_ops->setnumtcs)
+ return -EOPNOTSUPP;
+
+ ret = nla_parse_nested_deprecated(data, DCB_NUMTCS_ATTR_MAX,
+ tb[DCB_ATTR_NUMTCS],
+ dcbnl_numtcs_nest, NULL);
+ if (ret)
+ return ret;
+
+ for (i = DCB_NUMTCS_ATTR_ALL+1; i <= DCB_NUMTCS_ATTR_MAX; i++) {
+ if (data[i] == NULL)
+ continue;
+
+ value = nla_get_u8(data[i]);
+
+ ret = netdev->dcbnl_ops->setnumtcs(netdev, i, value);
+ if (ret)
+ break;
+ }
+
+ return nla_put_u8(skb, DCB_ATTR_NUMTCS, !!ret);
+}
+
+static int dcbnl_getpfcstate(struct net_device *netdev, struct nlmsghdr *nlh,
+ u32 seq, struct nlattr **tb, struct sk_buff *skb)
+{
+ if (!netdev->dcbnl_ops->getpfcstate)
+ return -EOPNOTSUPP;
+
+ return nla_put_u8(skb, DCB_ATTR_PFC_STATE,
+ netdev->dcbnl_ops->getpfcstate(netdev));
+}
+
+static int dcbnl_setpfcstate(struct net_device *netdev, struct nlmsghdr *nlh,
+ u32 seq, struct nlattr **tb, struct sk_buff *skb)
+{
+ u8 value;
+
+ if (!tb[DCB_ATTR_PFC_STATE])
+ return -EINVAL;
+
+ if (!netdev->dcbnl_ops->setpfcstate)
+ return -EOPNOTSUPP;
+
+ value = nla_get_u8(tb[DCB_ATTR_PFC_STATE]);
+
+ netdev->dcbnl_ops->setpfcstate(netdev, value);
+
+ return nla_put_u8(skb, DCB_ATTR_PFC_STATE, 0);
+}
+
+static int dcbnl_getapp(struct net_device *netdev, struct nlmsghdr *nlh,
+ u32 seq, struct nlattr **tb, struct sk_buff *skb)
+{
+ struct nlattr *app_nest;
+ struct nlattr *app_tb[DCB_APP_ATTR_MAX + 1];
+ u16 id;
+ u8 up, idtype;
+ int ret;
+
+ if (!tb[DCB_ATTR_APP])
+ return -EINVAL;
+
+ ret = nla_parse_nested_deprecated(app_tb, DCB_APP_ATTR_MAX,
+ tb[DCB_ATTR_APP], dcbnl_app_nest,
+ NULL);
+ if (ret)
+ return ret;
+
+ /* all must be non-null */
+ if ((!app_tb[DCB_APP_ATTR_IDTYPE]) ||
+ (!app_tb[DCB_APP_ATTR_ID]))
+ return -EINVAL;
+
+ /* either by eth type or by socket number */
+ idtype = nla_get_u8(app_tb[DCB_APP_ATTR_IDTYPE]);
+ if ((idtype != DCB_APP_IDTYPE_ETHTYPE) &&
+ (idtype != DCB_APP_IDTYPE_PORTNUM))
+ return -EINVAL;
+
+ id = nla_get_u16(app_tb[DCB_APP_ATTR_ID]);
+
+ if (netdev->dcbnl_ops->getapp) {
+ ret = netdev->dcbnl_ops->getapp(netdev, idtype, id);
+ if (ret < 0)
+ return ret;
+ else
+ up = ret;
+ } else {
+ struct dcb_app app = {
+ .selector = idtype,
+ .protocol = id,
+ };
+ up = dcb_getapp(netdev, &app);
+ }
+
+ app_nest = nla_nest_start_noflag(skb, DCB_ATTR_APP);
+ if (!app_nest)
+ return -EMSGSIZE;
+
+ ret = nla_put_u8(skb, DCB_APP_ATTR_IDTYPE, idtype);
+ if (ret)
+ goto out_cancel;
+
+ ret = nla_put_u16(skb, DCB_APP_ATTR_ID, id);
+ if (ret)
+ goto out_cancel;
+
+ ret = nla_put_u8(skb, DCB_APP_ATTR_PRIORITY, up);
+ if (ret)
+ goto out_cancel;
+
+ nla_nest_end(skb, app_nest);
+
+ return 0;
+
+out_cancel:
+ nla_nest_cancel(skb, app_nest);
+ return ret;
+}
+
+static int dcbnl_setapp(struct net_device *netdev, struct nlmsghdr *nlh,
+ u32 seq, struct nlattr **tb, struct sk_buff *skb)
+{
+ int ret;
+ u16 id;
+ u8 up, idtype;
+ struct nlattr *app_tb[DCB_APP_ATTR_MAX + 1];
+
+ if (!tb[DCB_ATTR_APP])
+ return -EINVAL;
+
+ ret = nla_parse_nested_deprecated(app_tb, DCB_APP_ATTR_MAX,
+ tb[DCB_ATTR_APP], dcbnl_app_nest,
+ NULL);
+ if (ret)
+ return ret;
+
+ /* all must be non-null */
+ if ((!app_tb[DCB_APP_ATTR_IDTYPE]) ||
+ (!app_tb[DCB_APP_ATTR_ID]) ||
+ (!app_tb[DCB_APP_ATTR_PRIORITY]))
+ return -EINVAL;
+
+ /* either by eth type or by socket number */
+ idtype = nla_get_u8(app_tb[DCB_APP_ATTR_IDTYPE]);
+ if ((idtype != DCB_APP_IDTYPE_ETHTYPE) &&
+ (idtype != DCB_APP_IDTYPE_PORTNUM))
+ return -EINVAL;
+
+ id = nla_get_u16(app_tb[DCB_APP_ATTR_ID]);
+ up = nla_get_u8(app_tb[DCB_APP_ATTR_PRIORITY]);
+
+ if (netdev->dcbnl_ops->setapp) {
+ ret = netdev->dcbnl_ops->setapp(netdev, idtype, id, up);
+ if (ret < 0)
+ return ret;
+ } else {
+ struct dcb_app app;
+ app.selector = idtype;
+ app.protocol = id;
+ app.priority = up;
+ ret = dcb_setapp(netdev, &app);
+ }
+
+ ret = nla_put_u8(skb, DCB_ATTR_APP, ret);
+ dcbnl_cee_notify(netdev, RTM_SETDCB, DCB_CMD_SAPP, seq, 0);
+
+ return ret;
+}
+
+static int __dcbnl_pg_getcfg(struct net_device *netdev, struct nlmsghdr *nlh,
+ struct nlattr **tb, struct sk_buff *skb, int dir)
+{
+ struct nlattr *pg_nest, *param_nest, *data;
+ struct nlattr *pg_tb[DCB_PG_ATTR_MAX + 1];
+ struct nlattr *param_tb[DCB_TC_ATTR_PARAM_MAX + 1];
+ u8 prio, pgid, tc_pct, up_map;
+ int ret;
+ int getall = 0;
+ int i;
+
+ if (!tb[DCB_ATTR_PG_CFG])
+ return -EINVAL;
+
+ if (!netdev->dcbnl_ops->getpgtccfgtx ||
+ !netdev->dcbnl_ops->getpgtccfgrx ||
+ !netdev->dcbnl_ops->getpgbwgcfgtx ||
+ !netdev->dcbnl_ops->getpgbwgcfgrx)
+ return -EOPNOTSUPP;
+
+ ret = nla_parse_nested_deprecated(pg_tb, DCB_PG_ATTR_MAX,
+ tb[DCB_ATTR_PG_CFG], dcbnl_pg_nest,
+ NULL);
+ if (ret)
+ return ret;
+
+ pg_nest = nla_nest_start_noflag(skb, DCB_ATTR_PG_CFG);
+ if (!pg_nest)
+ return -EMSGSIZE;
+
+ if (pg_tb[DCB_PG_ATTR_TC_ALL])
+ getall = 1;
+
+ for (i = DCB_PG_ATTR_TC_0; i <= DCB_PG_ATTR_TC_7; i++) {
+ if (!getall && !pg_tb[i])
+ continue;
+
+ if (pg_tb[DCB_PG_ATTR_TC_ALL])
+ data = pg_tb[DCB_PG_ATTR_TC_ALL];
+ else
+ data = pg_tb[i];
+ ret = nla_parse_nested_deprecated(param_tb,
+ DCB_TC_ATTR_PARAM_MAX, data,
+ dcbnl_tc_param_nest, NULL);
+ if (ret)
+ goto err_pg;
+
+ param_nest = nla_nest_start_noflag(skb, i);
+ if (!param_nest)
+ goto err_pg;
+
+ pgid = DCB_ATTR_VALUE_UNDEFINED;
+ prio = DCB_ATTR_VALUE_UNDEFINED;
+ tc_pct = DCB_ATTR_VALUE_UNDEFINED;
+ up_map = DCB_ATTR_VALUE_UNDEFINED;
+
+ if (dir) {
+ /* Rx */
+ netdev->dcbnl_ops->getpgtccfgrx(netdev,
+ i - DCB_PG_ATTR_TC_0, &prio,
+ &pgid, &tc_pct, &up_map);
+ } else {
+ /* Tx */
+ netdev->dcbnl_ops->getpgtccfgtx(netdev,
+ i - DCB_PG_ATTR_TC_0, &prio,
+ &pgid, &tc_pct, &up_map);
+ }
+
+ if (param_tb[DCB_TC_ATTR_PARAM_PGID] ||
+ param_tb[DCB_TC_ATTR_PARAM_ALL]) {
+ ret = nla_put_u8(skb,
+ DCB_TC_ATTR_PARAM_PGID, pgid);
+ if (ret)
+ goto err_param;
+ }
+ if (param_tb[DCB_TC_ATTR_PARAM_UP_MAPPING] ||
+ param_tb[DCB_TC_ATTR_PARAM_ALL]) {
+ ret = nla_put_u8(skb,
+ DCB_TC_ATTR_PARAM_UP_MAPPING, up_map);
+ if (ret)
+ goto err_param;
+ }
+ if (param_tb[DCB_TC_ATTR_PARAM_STRICT_PRIO] ||
+ param_tb[DCB_TC_ATTR_PARAM_ALL]) {
+ ret = nla_put_u8(skb,
+ DCB_TC_ATTR_PARAM_STRICT_PRIO, prio);
+ if (ret)
+ goto err_param;
+ }
+ if (param_tb[DCB_TC_ATTR_PARAM_BW_PCT] ||
+ param_tb[DCB_TC_ATTR_PARAM_ALL]) {
+ ret = nla_put_u8(skb, DCB_TC_ATTR_PARAM_BW_PCT,
+ tc_pct);
+ if (ret)
+ goto err_param;
+ }
+ nla_nest_end(skb, param_nest);
+ }
+
+ if (pg_tb[DCB_PG_ATTR_BW_ID_ALL])
+ getall = 1;
+ else
+ getall = 0;
+
+ for (i = DCB_PG_ATTR_BW_ID_0; i <= DCB_PG_ATTR_BW_ID_7; i++) {
+ if (!getall && !pg_tb[i])
+ continue;
+
+ tc_pct = DCB_ATTR_VALUE_UNDEFINED;
+
+ if (dir) {
+ /* Rx */
+ netdev->dcbnl_ops->getpgbwgcfgrx(netdev,
+ i - DCB_PG_ATTR_BW_ID_0, &tc_pct);
+ } else {
+ /* Tx */
+ netdev->dcbnl_ops->getpgbwgcfgtx(netdev,
+ i - DCB_PG_ATTR_BW_ID_0, &tc_pct);
+ }
+ ret = nla_put_u8(skb, i, tc_pct);
+ if (ret)
+ goto err_pg;
+ }
+
+ nla_nest_end(skb, pg_nest);
+
+ return 0;
+
+err_param:
+ nla_nest_cancel(skb, param_nest);
+err_pg:
+ nla_nest_cancel(skb, pg_nest);
+
+ return -EMSGSIZE;
+}
+
+static int dcbnl_pgtx_getcfg(struct net_device *netdev, struct nlmsghdr *nlh,
+ u32 seq, struct nlattr **tb, struct sk_buff *skb)
+{
+ return __dcbnl_pg_getcfg(netdev, nlh, tb, skb, 0);
+}
+
+static int dcbnl_pgrx_getcfg(struct net_device *netdev, struct nlmsghdr *nlh,
+ u32 seq, struct nlattr **tb, struct sk_buff *skb)
+{
+ return __dcbnl_pg_getcfg(netdev, nlh, tb, skb, 1);
+}
+
+static int dcbnl_setstate(struct net_device *netdev, struct nlmsghdr *nlh,
+ u32 seq, struct nlattr **tb, struct sk_buff *skb)
+{
+ u8 value;
+
+ if (!tb[DCB_ATTR_STATE])
+ return -EINVAL;
+
+ if (!netdev->dcbnl_ops->setstate)
+ return -EOPNOTSUPP;
+
+ value = nla_get_u8(tb[DCB_ATTR_STATE]);
+
+ return nla_put_u8(skb, DCB_ATTR_STATE,
+ netdev->dcbnl_ops->setstate(netdev, value));
+}
+
+static int dcbnl_setpfccfg(struct net_device *netdev, struct nlmsghdr *nlh,
+ u32 seq, struct nlattr **tb, struct sk_buff *skb)
+{
+ struct nlattr *data[DCB_PFC_UP_ATTR_MAX + 1];
+ int i;
+ int ret;
+ u8 value;
+
+ if (!tb[DCB_ATTR_PFC_CFG])
+ return -EINVAL;
+
+ if (!netdev->dcbnl_ops->setpfccfg)
+ return -EOPNOTSUPP;
+
+ ret = nla_parse_nested_deprecated(data, DCB_PFC_UP_ATTR_MAX,
+ tb[DCB_ATTR_PFC_CFG],
+ dcbnl_pfc_up_nest, NULL);
+ if (ret)
+ return ret;
+
+ for (i = DCB_PFC_UP_ATTR_0; i <= DCB_PFC_UP_ATTR_7; i++) {
+ if (data[i] == NULL)
+ continue;
+ value = nla_get_u8(data[i]);
+ netdev->dcbnl_ops->setpfccfg(netdev,
+ data[i]->nla_type - DCB_PFC_UP_ATTR_0, value);
+ }
+
+ return nla_put_u8(skb, DCB_ATTR_PFC_CFG, 0);
+}
+
+static int dcbnl_setall(struct net_device *netdev, struct nlmsghdr *nlh,
+ u32 seq, struct nlattr **tb, struct sk_buff *skb)
+{
+ int ret;
+
+ if (!tb[DCB_ATTR_SET_ALL])
+ return -EINVAL;
+
+ if (!netdev->dcbnl_ops->setall)
+ return -EOPNOTSUPP;
+
+ ret = nla_put_u8(skb, DCB_ATTR_SET_ALL,
+ netdev->dcbnl_ops->setall(netdev));
+ dcbnl_cee_notify(netdev, RTM_SETDCB, DCB_CMD_SET_ALL, seq, 0);
+
+ return ret;
+}
+
+static int __dcbnl_pg_setcfg(struct net_device *netdev, struct nlmsghdr *nlh,
+ u32 seq, struct nlattr **tb, struct sk_buff *skb,
+ int dir)
+{
+ struct nlattr *pg_tb[DCB_PG_ATTR_MAX + 1];
+ struct nlattr *param_tb[DCB_TC_ATTR_PARAM_MAX + 1];
+ int ret;
+ int i;
+ u8 pgid;
+ u8 up_map;
+ u8 prio;
+ u8 tc_pct;
+
+ if (!tb[DCB_ATTR_PG_CFG])
+ return -EINVAL;
+
+ if (!netdev->dcbnl_ops->setpgtccfgtx ||
+ !netdev->dcbnl_ops->setpgtccfgrx ||
+ !netdev->dcbnl_ops->setpgbwgcfgtx ||
+ !netdev->dcbnl_ops->setpgbwgcfgrx)
+ return -EOPNOTSUPP;
+
+ ret = nla_parse_nested_deprecated(pg_tb, DCB_PG_ATTR_MAX,
+ tb[DCB_ATTR_PG_CFG], dcbnl_pg_nest,
+ NULL);
+ if (ret)
+ return ret;
+
+ for (i = DCB_PG_ATTR_TC_0; i <= DCB_PG_ATTR_TC_7; i++) {
+ if (!pg_tb[i])
+ continue;
+
+ ret = nla_parse_nested_deprecated(param_tb,
+ DCB_TC_ATTR_PARAM_MAX,
+ pg_tb[i],
+ dcbnl_tc_param_nest, NULL);
+ if (ret)
+ return ret;
+
+ pgid = DCB_ATTR_VALUE_UNDEFINED;
+ prio = DCB_ATTR_VALUE_UNDEFINED;
+ tc_pct = DCB_ATTR_VALUE_UNDEFINED;
+ up_map = DCB_ATTR_VALUE_UNDEFINED;
+
+ if (param_tb[DCB_TC_ATTR_PARAM_STRICT_PRIO])
+ prio =
+ nla_get_u8(param_tb[DCB_TC_ATTR_PARAM_STRICT_PRIO]);
+
+ if (param_tb[DCB_TC_ATTR_PARAM_PGID])
+ pgid = nla_get_u8(param_tb[DCB_TC_ATTR_PARAM_PGID]);
+
+ if (param_tb[DCB_TC_ATTR_PARAM_BW_PCT])
+ tc_pct = nla_get_u8(param_tb[DCB_TC_ATTR_PARAM_BW_PCT]);
+
+ if (param_tb[DCB_TC_ATTR_PARAM_UP_MAPPING])
+ up_map =
+ nla_get_u8(param_tb[DCB_TC_ATTR_PARAM_UP_MAPPING]);
+
+ /* dir: Tx = 0, Rx = 1 */
+ if (dir) {
+ /* Rx */
+ netdev->dcbnl_ops->setpgtccfgrx(netdev,
+ i - DCB_PG_ATTR_TC_0,
+ prio, pgid, tc_pct, up_map);
+ } else {
+ /* Tx */
+ netdev->dcbnl_ops->setpgtccfgtx(netdev,
+ i - DCB_PG_ATTR_TC_0,
+ prio, pgid, tc_pct, up_map);
+ }
+ }
+
+ for (i = DCB_PG_ATTR_BW_ID_0; i <= DCB_PG_ATTR_BW_ID_7; i++) {
+ if (!pg_tb[i])
+ continue;
+
+ tc_pct = nla_get_u8(pg_tb[i]);
+
+ /* dir: Tx = 0, Rx = 1 */
+ if (dir) {
+ /* Rx */
+ netdev->dcbnl_ops->setpgbwgcfgrx(netdev,
+ i - DCB_PG_ATTR_BW_ID_0, tc_pct);
+ } else {
+ /* Tx */
+ netdev->dcbnl_ops->setpgbwgcfgtx(netdev,
+ i - DCB_PG_ATTR_BW_ID_0, tc_pct);
+ }
+ }
+
+ return nla_put_u8(skb, DCB_ATTR_PG_CFG, 0);
+}
+
+static int dcbnl_pgtx_setcfg(struct net_device *netdev, struct nlmsghdr *nlh,
+ u32 seq, struct nlattr **tb, struct sk_buff *skb)
+{
+ return __dcbnl_pg_setcfg(netdev, nlh, seq, tb, skb, 0);
+}
+
+static int dcbnl_pgrx_setcfg(struct net_device *netdev, struct nlmsghdr *nlh,
+ u32 seq, struct nlattr **tb, struct sk_buff *skb)
+{
+ return __dcbnl_pg_setcfg(netdev, nlh, seq, tb, skb, 1);
+}
+
+static int dcbnl_bcn_getcfg(struct net_device *netdev, struct nlmsghdr *nlh,
+ u32 seq, struct nlattr **tb, struct sk_buff *skb)
+{
+ struct nlattr *bcn_nest;
+ struct nlattr *bcn_tb[DCB_BCN_ATTR_MAX + 1];
+ u8 value_byte;
+ u32 value_integer;
+ int ret;
+ bool getall = false;
+ int i;
+
+ if (!tb[DCB_ATTR_BCN])
+ return -EINVAL;
+
+ if (!netdev->dcbnl_ops->getbcnrp ||
+ !netdev->dcbnl_ops->getbcncfg)
+ return -EOPNOTSUPP;
+
+ ret = nla_parse_nested_deprecated(bcn_tb, DCB_BCN_ATTR_MAX,
+ tb[DCB_ATTR_BCN], dcbnl_bcn_nest,
+ NULL);
+ if (ret)
+ return ret;
+
+ bcn_nest = nla_nest_start_noflag(skb, DCB_ATTR_BCN);
+ if (!bcn_nest)
+ return -EMSGSIZE;
+
+ if (bcn_tb[DCB_BCN_ATTR_ALL])
+ getall = true;
+
+ for (i = DCB_BCN_ATTR_RP_0; i <= DCB_BCN_ATTR_RP_7; i++) {
+ if (!getall && !bcn_tb[i])
+ continue;
+
+ netdev->dcbnl_ops->getbcnrp(netdev, i - DCB_BCN_ATTR_RP_0,
+ &value_byte);
+ ret = nla_put_u8(skb, i, value_byte);
+ if (ret)
+ goto err_bcn;
+ }
+
+ for (i = DCB_BCN_ATTR_BCNA_0; i <= DCB_BCN_ATTR_RI; i++) {
+ if (!getall && !bcn_tb[i])
+ continue;
+
+ netdev->dcbnl_ops->getbcncfg(netdev, i,
+ &value_integer);
+ ret = nla_put_u32(skb, i, value_integer);
+ if (ret)
+ goto err_bcn;
+ }
+
+ nla_nest_end(skb, bcn_nest);
+
+ return 0;
+
+err_bcn:
+ nla_nest_cancel(skb, bcn_nest);
+ return ret;
+}
+
+static int dcbnl_bcn_setcfg(struct net_device *netdev, struct nlmsghdr *nlh,
+ u32 seq, struct nlattr **tb, struct sk_buff *skb)
+{
+ struct nlattr *data[DCB_BCN_ATTR_MAX + 1];
+ int i;
+ int ret;
+ u8 value_byte;
+ u32 value_int;
+
+ if (!tb[DCB_ATTR_BCN])
+ return -EINVAL;
+
+ if (!netdev->dcbnl_ops->setbcncfg ||
+ !netdev->dcbnl_ops->setbcnrp)
+ return -EOPNOTSUPP;
+
+ ret = nla_parse_nested_deprecated(data, DCB_BCN_ATTR_MAX,
+ tb[DCB_ATTR_BCN], dcbnl_bcn_nest,
+ NULL);
+ if (ret)
+ return ret;
+
+ for (i = DCB_BCN_ATTR_RP_0; i <= DCB_BCN_ATTR_RP_7; i++) {
+ if (data[i] == NULL)
+ continue;
+ value_byte = nla_get_u8(data[i]);
+ netdev->dcbnl_ops->setbcnrp(netdev,
+ data[i]->nla_type - DCB_BCN_ATTR_RP_0, value_byte);
+ }
+
+ for (i = DCB_BCN_ATTR_BCNA_0; i <= DCB_BCN_ATTR_RI; i++) {
+ if (data[i] == NULL)
+ continue;
+ value_int = nla_get_u32(data[i]);
+ netdev->dcbnl_ops->setbcncfg(netdev,
+ i, value_int);
+ }
+
+ return nla_put_u8(skb, DCB_ATTR_BCN, 0);
+}
+
+static int dcbnl_build_peer_app(struct net_device *netdev, struct sk_buff* skb,
+ int app_nested_type, int app_info_type,
+ int app_entry_type)
+{
+ struct dcb_peer_app_info info;
+ struct dcb_app *table = NULL;
+ const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops;
+ u16 app_count;
+ int err;
+
+
+ /**
+ * retrieve the peer app configuration form the driver. If the driver
+ * handlers fail exit without doing anything
+ */
+ err = ops->peer_getappinfo(netdev, &info, &app_count);
+ if (!err && app_count) {
+ table = kmalloc_array(app_count, sizeof(struct dcb_app),
+ GFP_KERNEL);
+ if (!table)
+ return -ENOMEM;
+
+ err = ops->peer_getapptable(netdev, table);
+ }
+
+ if (!err) {
+ u16 i;
+ struct nlattr *app;
+
+ /**
+ * build the message, from here on the only possible failure
+ * is due to the skb size
+ */
+ err = -EMSGSIZE;
+
+ app = nla_nest_start_noflag(skb, app_nested_type);
+ if (!app)
+ goto nla_put_failure;
+
+ if (app_info_type &&
+ nla_put(skb, app_info_type, sizeof(info), &info))
+ goto nla_put_failure;
+
+ for (i = 0; i < app_count; i++) {
+ if (nla_put(skb, app_entry_type, sizeof(struct dcb_app),
+ &table[i]))
+ goto nla_put_failure;
+ }
+ nla_nest_end(skb, app);
+ }
+ err = 0;
+
+nla_put_failure:
+ kfree(table);
+ return err;
+}
+
+static int dcbnl_getapptrust(struct net_device *netdev, struct sk_buff *skb)
+{
+ const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops;
+ enum ieee_attrs_app type;
+ struct nlattr *apptrust;
+ int nselectors, err, i;
+ u8 *selectors;
+
+ selectors = kzalloc(IEEE_8021QAZ_APP_SEL_MAX + 1, GFP_KERNEL);
+ if (!selectors)
+ return -ENOMEM;
+
+ err = ops->dcbnl_getapptrust(netdev, selectors, &nselectors);
+ if (err) {
+ err = 0;
+ goto out;
+ }
+
+ apptrust = nla_nest_start(skb, DCB_ATTR_DCB_APP_TRUST_TABLE);
+ if (!apptrust) {
+ err = -EMSGSIZE;
+ goto out;
+ }
+
+ for (i = 0; i < nselectors; i++) {
+ type = dcbnl_app_attr_type_get(selectors[i]);
+ err = nla_put_u8(skb, type, selectors[i]);
+ if (err) {
+ nla_nest_cancel(skb, apptrust);
+ goto out;
+ }
+ }
+ nla_nest_end(skb, apptrust);
+
+out:
+ kfree(selectors);
+ return err;
+}
+
+/* Set or delete APP table or rewrite table entries. The APP struct is validated
+ * and the appropriate callback function is called.
+ */
+static int dcbnl_app_table_setdel(struct nlattr *attr,
+ struct net_device *netdev,
+ int (*setdel)(struct net_device *dev,
+ struct dcb_app *app))
+{
+ struct dcb_app *app_data;
+ enum ieee_attrs_app type;
+ struct nlattr *attr_itr;
+ int rem, err;
+
+ nla_for_each_nested(attr_itr, attr, rem) {
+ type = nla_type(attr_itr);
+
+ if (!dcbnl_app_attr_type_validate(type))
+ continue;
+
+ if (nla_len(attr_itr) < sizeof(struct dcb_app))
+ return -ERANGE;
+
+ app_data = nla_data(attr_itr);
+
+ if (!dcbnl_app_selector_validate(type, app_data->selector))
+ return -EINVAL;
+
+ err = setdel(netdev, app_data);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+/* Handle IEEE 802.1Qaz/802.1Qau/802.1Qbb GET commands. */
+static int dcbnl_ieee_fill(struct sk_buff *skb, struct net_device *netdev)
+{
+ const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops;
+ struct nlattr *ieee, *app, *rewr;
+ struct dcb_app_type *itr;
+ int dcbx;
+ int err;
+
+ if (nla_put_string(skb, DCB_ATTR_IFNAME, netdev->name))
+ return -EMSGSIZE;
+
+ ieee = nla_nest_start_noflag(skb, DCB_ATTR_IEEE);
+ if (!ieee)
+ return -EMSGSIZE;
+
+ if (ops->ieee_getets) {
+ struct ieee_ets ets;
+ memset(&ets, 0, sizeof(ets));
+ err = ops->ieee_getets(netdev, &ets);
+ if (!err &&
+ nla_put(skb, DCB_ATTR_IEEE_ETS, sizeof(ets), &ets))
+ return -EMSGSIZE;
+ }
+
+ if (ops->ieee_getmaxrate) {
+ struct ieee_maxrate maxrate;
+ memset(&maxrate, 0, sizeof(maxrate));
+ err = ops->ieee_getmaxrate(netdev, &maxrate);
+ if (!err) {
+ err = nla_put(skb, DCB_ATTR_IEEE_MAXRATE,
+ sizeof(maxrate), &maxrate);
+ if (err)
+ return -EMSGSIZE;
+ }
+ }
+
+ if (ops->ieee_getqcn) {
+ struct ieee_qcn qcn;
+
+ memset(&qcn, 0, sizeof(qcn));
+ err = ops->ieee_getqcn(netdev, &qcn);
+ if (!err) {
+ err = nla_put(skb, DCB_ATTR_IEEE_QCN,
+ sizeof(qcn), &qcn);
+ if (err)
+ return -EMSGSIZE;
+ }
+ }
+
+ if (ops->ieee_getqcnstats) {
+ struct ieee_qcn_stats qcn_stats;
+
+ memset(&qcn_stats, 0, sizeof(qcn_stats));
+ err = ops->ieee_getqcnstats(netdev, &qcn_stats);
+ if (!err) {
+ err = nla_put(skb, DCB_ATTR_IEEE_QCN_STATS,
+ sizeof(qcn_stats), &qcn_stats);
+ if (err)
+ return -EMSGSIZE;
+ }
+ }
+
+ if (ops->ieee_getpfc) {
+ struct ieee_pfc pfc;
+ memset(&pfc, 0, sizeof(pfc));
+ err = ops->ieee_getpfc(netdev, &pfc);
+ if (!err &&
+ nla_put(skb, DCB_ATTR_IEEE_PFC, sizeof(pfc), &pfc))
+ return -EMSGSIZE;
+ }
+
+ if (ops->dcbnl_getbuffer) {
+ struct dcbnl_buffer buffer;
+
+ memset(&buffer, 0, sizeof(buffer));
+ err = ops->dcbnl_getbuffer(netdev, &buffer);
+ if (!err &&
+ nla_put(skb, DCB_ATTR_DCB_BUFFER, sizeof(buffer), &buffer))
+ return -EMSGSIZE;
+ }
+
+ app = nla_nest_start_noflag(skb, DCB_ATTR_IEEE_APP_TABLE);
+ if (!app)
+ return -EMSGSIZE;
+
+ spin_lock_bh(&dcb_lock);
+ list_for_each_entry(itr, &dcb_app_list, list) {
+ if (itr->ifindex == netdev->ifindex) {
+ enum ieee_attrs_app type =
+ dcbnl_app_attr_type_get(itr->app.selector);
+ err = nla_put(skb, type, sizeof(itr->app), &itr->app);
+ if (err) {
+ spin_unlock_bh(&dcb_lock);
+ return -EMSGSIZE;
+ }
+ }
+ }
+
+ if (netdev->dcbnl_ops->getdcbx)
+ dcbx = netdev->dcbnl_ops->getdcbx(netdev);
+ else
+ dcbx = -EOPNOTSUPP;
+
+ spin_unlock_bh(&dcb_lock);
+ nla_nest_end(skb, app);
+
+ rewr = nla_nest_start(skb, DCB_ATTR_DCB_REWR_TABLE);
+ if (!rewr)
+ return -EMSGSIZE;
+
+ spin_lock_bh(&dcb_lock);
+ list_for_each_entry(itr, &dcb_rewr_list, list) {
+ if (itr->ifindex == netdev->ifindex) {
+ enum ieee_attrs_app type =
+ dcbnl_app_attr_type_get(itr->app.selector);
+ err = nla_put(skb, type, sizeof(itr->app), &itr->app);
+ if (err) {
+ spin_unlock_bh(&dcb_lock);
+ nla_nest_cancel(skb, rewr);
+ return -EMSGSIZE;
+ }
+ }
+ }
+
+ spin_unlock_bh(&dcb_lock);
+ nla_nest_end(skb, rewr);
+
+ if (ops->dcbnl_getapptrust) {
+ err = dcbnl_getapptrust(netdev, skb);
+ if (err)
+ return err;
+ }
+
+ /* get peer info if available */
+ if (ops->ieee_peer_getets) {
+ struct ieee_ets ets;
+ memset(&ets, 0, sizeof(ets));
+ err = ops->ieee_peer_getets(netdev, &ets);
+ if (!err &&
+ nla_put(skb, DCB_ATTR_IEEE_PEER_ETS, sizeof(ets), &ets))
+ return -EMSGSIZE;
+ }
+
+ if (ops->ieee_peer_getpfc) {
+ struct ieee_pfc pfc;
+ memset(&pfc, 0, sizeof(pfc));
+ err = ops->ieee_peer_getpfc(netdev, &pfc);
+ if (!err &&
+ nla_put(skb, DCB_ATTR_IEEE_PEER_PFC, sizeof(pfc), &pfc))
+ return -EMSGSIZE;
+ }
+
+ if (ops->peer_getappinfo && ops->peer_getapptable) {
+ err = dcbnl_build_peer_app(netdev, skb,
+ DCB_ATTR_IEEE_PEER_APP,
+ DCB_ATTR_IEEE_APP_UNSPEC,
+ DCB_ATTR_IEEE_APP);
+ if (err)
+ return -EMSGSIZE;
+ }
+
+ nla_nest_end(skb, ieee);
+ if (dcbx >= 0) {
+ err = nla_put_u8(skb, DCB_ATTR_DCBX, dcbx);
+ if (err)
+ return -EMSGSIZE;
+ }
+
+ return 0;
+}
+
+static int dcbnl_cee_pg_fill(struct sk_buff *skb, struct net_device *dev,
+ int dir)
+{
+ u8 pgid, up_map, prio, tc_pct;
+ const struct dcbnl_rtnl_ops *ops = dev->dcbnl_ops;
+ int i = dir ? DCB_ATTR_CEE_TX_PG : DCB_ATTR_CEE_RX_PG;
+ struct nlattr *pg = nla_nest_start_noflag(skb, i);
+
+ if (!pg)
+ return -EMSGSIZE;
+
+ for (i = DCB_PG_ATTR_TC_0; i <= DCB_PG_ATTR_TC_7; i++) {
+ struct nlattr *tc_nest = nla_nest_start_noflag(skb, i);
+
+ if (!tc_nest)
+ return -EMSGSIZE;
+
+ pgid = DCB_ATTR_VALUE_UNDEFINED;
+ prio = DCB_ATTR_VALUE_UNDEFINED;
+ tc_pct = DCB_ATTR_VALUE_UNDEFINED;
+ up_map = DCB_ATTR_VALUE_UNDEFINED;
+
+ if (!dir)
+ ops->getpgtccfgrx(dev, i - DCB_PG_ATTR_TC_0,
+ &prio, &pgid, &tc_pct, &up_map);
+ else
+ ops->getpgtccfgtx(dev, i - DCB_PG_ATTR_TC_0,
+ &prio, &pgid, &tc_pct, &up_map);
+
+ if (nla_put_u8(skb, DCB_TC_ATTR_PARAM_PGID, pgid) ||
+ nla_put_u8(skb, DCB_TC_ATTR_PARAM_UP_MAPPING, up_map) ||
+ nla_put_u8(skb, DCB_TC_ATTR_PARAM_STRICT_PRIO, prio) ||
+ nla_put_u8(skb, DCB_TC_ATTR_PARAM_BW_PCT, tc_pct))
+ return -EMSGSIZE;
+ nla_nest_end(skb, tc_nest);
+ }
+
+ for (i = DCB_PG_ATTR_BW_ID_0; i <= DCB_PG_ATTR_BW_ID_7; i++) {
+ tc_pct = DCB_ATTR_VALUE_UNDEFINED;
+
+ if (!dir)
+ ops->getpgbwgcfgrx(dev, i - DCB_PG_ATTR_BW_ID_0,
+ &tc_pct);
+ else
+ ops->getpgbwgcfgtx(dev, i - DCB_PG_ATTR_BW_ID_0,
+ &tc_pct);
+ if (nla_put_u8(skb, i, tc_pct))
+ return -EMSGSIZE;
+ }
+ nla_nest_end(skb, pg);
+ return 0;
+}
+
+static int dcbnl_cee_fill(struct sk_buff *skb, struct net_device *netdev)
+{
+ struct nlattr *cee, *app;
+ struct dcb_app_type *itr;
+ const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops;
+ int dcbx, i, err = -EMSGSIZE;
+ u8 value;
+
+ if (nla_put_string(skb, DCB_ATTR_IFNAME, netdev->name))
+ goto nla_put_failure;
+ cee = nla_nest_start_noflag(skb, DCB_ATTR_CEE);
+ if (!cee)
+ goto nla_put_failure;
+
+ /* local pg */
+ if (ops->getpgtccfgtx && ops->getpgbwgcfgtx) {
+ err = dcbnl_cee_pg_fill(skb, netdev, 1);
+ if (err)
+ goto nla_put_failure;
+ }
+
+ if (ops->getpgtccfgrx && ops->getpgbwgcfgrx) {
+ err = dcbnl_cee_pg_fill(skb, netdev, 0);
+ if (err)
+ goto nla_put_failure;
+ }
+
+ /* local pfc */
+ if (ops->getpfccfg) {
+ struct nlattr *pfc_nest = nla_nest_start_noflag(skb,
+ DCB_ATTR_CEE_PFC);
+
+ if (!pfc_nest)
+ goto nla_put_failure;
+
+ for (i = DCB_PFC_UP_ATTR_0; i <= DCB_PFC_UP_ATTR_7; i++) {
+ ops->getpfccfg(netdev, i - DCB_PFC_UP_ATTR_0, &value);
+ if (nla_put_u8(skb, i, value))
+ goto nla_put_failure;
+ }
+ nla_nest_end(skb, pfc_nest);
+ }
+
+ /* local app */
+ spin_lock_bh(&dcb_lock);
+ app = nla_nest_start_noflag(skb, DCB_ATTR_CEE_APP_TABLE);
+ if (!app)
+ goto dcb_unlock;
+
+ list_for_each_entry(itr, &dcb_app_list, list) {
+ if (itr->ifindex == netdev->ifindex) {
+ struct nlattr *app_nest = nla_nest_start_noflag(skb,
+ DCB_ATTR_APP);
+ if (!app_nest)
+ goto dcb_unlock;
+
+ err = nla_put_u8(skb, DCB_APP_ATTR_IDTYPE,
+ itr->app.selector);
+ if (err)
+ goto dcb_unlock;
+
+ err = nla_put_u16(skb, DCB_APP_ATTR_ID,
+ itr->app.protocol);
+ if (err)
+ goto dcb_unlock;
+
+ err = nla_put_u8(skb, DCB_APP_ATTR_PRIORITY,
+ itr->app.priority);
+ if (err)
+ goto dcb_unlock;
+
+ nla_nest_end(skb, app_nest);
+ }
+ }
+ nla_nest_end(skb, app);
+
+ if (netdev->dcbnl_ops->getdcbx)
+ dcbx = netdev->dcbnl_ops->getdcbx(netdev);
+ else
+ dcbx = -EOPNOTSUPP;
+
+ spin_unlock_bh(&dcb_lock);
+
+ /* features flags */
+ if (ops->getfeatcfg) {
+ struct nlattr *feat = nla_nest_start_noflag(skb,
+ DCB_ATTR_CEE_FEAT);
+ if (!feat)
+ goto nla_put_failure;
+
+ for (i = DCB_FEATCFG_ATTR_ALL + 1; i <= DCB_FEATCFG_ATTR_MAX;
+ i++)
+ if (!ops->getfeatcfg(netdev, i, &value) &&
+ nla_put_u8(skb, i, value))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, feat);
+ }
+
+ /* peer info if available */
+ if (ops->cee_peer_getpg) {
+ struct cee_pg pg;
+ memset(&pg, 0, sizeof(pg));
+ err = ops->cee_peer_getpg(netdev, &pg);
+ if (!err &&
+ nla_put(skb, DCB_ATTR_CEE_PEER_PG, sizeof(pg), &pg))
+ goto nla_put_failure;
+ }
+
+ if (ops->cee_peer_getpfc) {
+ struct cee_pfc pfc;
+ memset(&pfc, 0, sizeof(pfc));
+ err = ops->cee_peer_getpfc(netdev, &pfc);
+ if (!err &&
+ nla_put(skb, DCB_ATTR_CEE_PEER_PFC, sizeof(pfc), &pfc))
+ goto nla_put_failure;
+ }
+
+ if (ops->peer_getappinfo && ops->peer_getapptable) {
+ err = dcbnl_build_peer_app(netdev, skb,
+ DCB_ATTR_CEE_PEER_APP_TABLE,
+ DCB_ATTR_CEE_PEER_APP_INFO,
+ DCB_ATTR_CEE_PEER_APP);
+ if (err)
+ goto nla_put_failure;
+ }
+ nla_nest_end(skb, cee);
+
+ /* DCBX state */
+ if (dcbx >= 0) {
+ err = nla_put_u8(skb, DCB_ATTR_DCBX, dcbx);
+ if (err)
+ goto nla_put_failure;
+ }
+ return 0;
+
+dcb_unlock:
+ spin_unlock_bh(&dcb_lock);
+nla_put_failure:
+ err = -EMSGSIZE;
+ return err;
+}
+
+static int dcbnl_notify(struct net_device *dev, int event, int cmd,
+ u32 seq, u32 portid, int dcbx_ver)
+{
+ struct net *net = dev_net(dev);
+ struct sk_buff *skb;
+ struct nlmsghdr *nlh;
+ const struct dcbnl_rtnl_ops *ops = dev->dcbnl_ops;
+ int err;
+
+ if (!ops)
+ return -EOPNOTSUPP;
+
+ skb = dcbnl_newmsg(event, cmd, portid, seq, 0, &nlh);
+ if (!skb)
+ return -ENOMEM;
+
+ if (dcbx_ver == DCB_CAP_DCBX_VER_IEEE)
+ err = dcbnl_ieee_fill(skb, dev);
+ else
+ err = dcbnl_cee_fill(skb, dev);
+
+ if (err < 0) {
+ /* Report error to broadcast listeners */
+ nlmsg_free(skb);
+ rtnl_set_sk_err(net, RTNLGRP_DCB, err);
+ } else {
+ /* End nlmsg and notify broadcast listeners */
+ nlmsg_end(skb, nlh);
+ rtnl_notify(skb, net, 0, RTNLGRP_DCB, NULL, GFP_KERNEL);
+ }
+
+ return err;
+}
+
+int dcbnl_ieee_notify(struct net_device *dev, int event, int cmd,
+ u32 seq, u32 portid)
+{
+ return dcbnl_notify(dev, event, cmd, seq, portid, DCB_CAP_DCBX_VER_IEEE);
+}
+EXPORT_SYMBOL(dcbnl_ieee_notify);
+
+int dcbnl_cee_notify(struct net_device *dev, int event, int cmd,
+ u32 seq, u32 portid)
+{
+ return dcbnl_notify(dev, event, cmd, seq, portid, DCB_CAP_DCBX_VER_CEE);
+}
+EXPORT_SYMBOL(dcbnl_cee_notify);
+
+/* Handle IEEE 802.1Qaz/802.1Qau/802.1Qbb SET commands.
+ * If any requested operation can not be completed
+ * the entire msg is aborted and error value is returned.
+ * No attempt is made to reconcile the case where only part of the
+ * cmd can be completed.
+ */
+static int dcbnl_ieee_set(struct net_device *netdev, struct nlmsghdr *nlh,
+ u32 seq, struct nlattr **tb, struct sk_buff *skb)
+{
+ const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops;
+ struct nlattr *ieee[DCB_ATTR_IEEE_MAX + 1];
+ int prio;
+ int err;
+
+ if (!ops)
+ return -EOPNOTSUPP;
+
+ if (!tb[DCB_ATTR_IEEE])
+ return -EINVAL;
+
+ err = nla_parse_nested_deprecated(ieee, DCB_ATTR_IEEE_MAX,
+ tb[DCB_ATTR_IEEE],
+ dcbnl_ieee_policy, NULL);
+ if (err)
+ return err;
+
+ if (ieee[DCB_ATTR_IEEE_ETS] && ops->ieee_setets) {
+ struct ieee_ets *ets = nla_data(ieee[DCB_ATTR_IEEE_ETS]);
+ err = ops->ieee_setets(netdev, ets);
+ if (err)
+ goto err;
+ }
+
+ if (ieee[DCB_ATTR_IEEE_MAXRATE] && ops->ieee_setmaxrate) {
+ struct ieee_maxrate *maxrate =
+ nla_data(ieee[DCB_ATTR_IEEE_MAXRATE]);
+ err = ops->ieee_setmaxrate(netdev, maxrate);
+ if (err)
+ goto err;
+ }
+
+ if (ieee[DCB_ATTR_IEEE_QCN] && ops->ieee_setqcn) {
+ struct ieee_qcn *qcn =
+ nla_data(ieee[DCB_ATTR_IEEE_QCN]);
+
+ err = ops->ieee_setqcn(netdev, qcn);
+ if (err)
+ goto err;
+ }
+
+ if (ieee[DCB_ATTR_IEEE_PFC] && ops->ieee_setpfc) {
+ struct ieee_pfc *pfc = nla_data(ieee[DCB_ATTR_IEEE_PFC]);
+ err = ops->ieee_setpfc(netdev, pfc);
+ if (err)
+ goto err;
+ }
+
+ if (ieee[DCB_ATTR_DCB_BUFFER] && ops->dcbnl_setbuffer) {
+ struct dcbnl_buffer *buffer =
+ nla_data(ieee[DCB_ATTR_DCB_BUFFER]);
+
+ for (prio = 0; prio < ARRAY_SIZE(buffer->prio2buffer); prio++) {
+ if (buffer->prio2buffer[prio] >= DCBX_MAX_BUFFERS) {
+ err = -EINVAL;
+ goto err;
+ }
+ }
+
+ err = ops->dcbnl_setbuffer(netdev, buffer);
+ if (err)
+ goto err;
+ }
+
+ if (ieee[DCB_ATTR_DCB_REWR_TABLE]) {
+ err = dcbnl_app_table_setdel(ieee[DCB_ATTR_DCB_REWR_TABLE],
+ netdev,
+ ops->dcbnl_setrewr ?: dcb_setrewr);
+ if (err)
+ goto err;
+ }
+
+ if (ieee[DCB_ATTR_IEEE_APP_TABLE]) {
+ err = dcbnl_app_table_setdel(ieee[DCB_ATTR_IEEE_APP_TABLE],
+ netdev, ops->ieee_setapp ?:
+ dcb_ieee_setapp);
+ if (err)
+ goto err;
+ }
+
+ if (ieee[DCB_ATTR_DCB_APP_TRUST_TABLE]) {
+ u8 selectors[IEEE_8021QAZ_APP_SEL_MAX + 1] = {0};
+ struct nlattr *attr;
+ int nselectors = 0;
+ int rem;
+
+ if (!ops->dcbnl_setapptrust) {
+ err = -EOPNOTSUPP;
+ goto err;
+ }
+
+ nla_for_each_nested(attr, ieee[DCB_ATTR_DCB_APP_TRUST_TABLE],
+ rem) {
+ enum ieee_attrs_app type = nla_type(attr);
+ u8 selector;
+ int i;
+
+ if (!dcbnl_app_attr_type_validate(type) ||
+ nla_len(attr) != 1 ||
+ nselectors >= sizeof(selectors)) {
+ err = -EINVAL;
+ goto err;
+ }
+
+ selector = nla_get_u8(attr);
+
+ if (!dcbnl_app_selector_validate(type, selector)) {
+ err = -EINVAL;
+ goto err;
+ }
+
+ /* Duplicate selector ? */
+ for (i = 0; i < nselectors; i++) {
+ if (selectors[i] == selector) {
+ err = -EINVAL;
+ goto err;
+ }
+ }
+
+ selectors[nselectors++] = selector;
+ }
+
+ err = ops->dcbnl_setapptrust(netdev, selectors, nselectors);
+ if (err)
+ goto err;
+ }
+
+err:
+ err = nla_put_u8(skb, DCB_ATTR_IEEE, err);
+ dcbnl_ieee_notify(netdev, RTM_SETDCB, DCB_CMD_IEEE_SET, seq, 0);
+ return err;
+}
+
+static int dcbnl_ieee_get(struct net_device *netdev, struct nlmsghdr *nlh,
+ u32 seq, struct nlattr **tb, struct sk_buff *skb)
+{
+ const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops;
+
+ if (!ops)
+ return -EOPNOTSUPP;
+
+ return dcbnl_ieee_fill(skb, netdev);
+}
+
+static int dcbnl_ieee_del(struct net_device *netdev, struct nlmsghdr *nlh,
+ u32 seq, struct nlattr **tb, struct sk_buff *skb)
+{
+ const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops;
+ struct nlattr *ieee[DCB_ATTR_IEEE_MAX + 1];
+ int err;
+
+ if (!ops)
+ return -EOPNOTSUPP;
+
+ if (!tb[DCB_ATTR_IEEE])
+ return -EINVAL;
+
+ err = nla_parse_nested_deprecated(ieee, DCB_ATTR_IEEE_MAX,
+ tb[DCB_ATTR_IEEE],
+ dcbnl_ieee_policy, NULL);
+ if (err)
+ return err;
+
+ if (ieee[DCB_ATTR_IEEE_APP_TABLE]) {
+ err = dcbnl_app_table_setdel(ieee[DCB_ATTR_IEEE_APP_TABLE],
+ netdev, ops->ieee_delapp ?:
+ dcb_ieee_delapp);
+ if (err)
+ goto err;
+ }
+
+ if (ieee[DCB_ATTR_DCB_REWR_TABLE]) {
+ err = dcbnl_app_table_setdel(ieee[DCB_ATTR_DCB_REWR_TABLE],
+ netdev,
+ ops->dcbnl_delrewr ?: dcb_delrewr);
+ if (err)
+ goto err;
+ }
+
+err:
+ err = nla_put_u8(skb, DCB_ATTR_IEEE, err);
+ dcbnl_ieee_notify(netdev, RTM_SETDCB, DCB_CMD_IEEE_DEL, seq, 0);
+ return err;
+}
+
+
+/* DCBX configuration */
+static int dcbnl_getdcbx(struct net_device *netdev, struct nlmsghdr *nlh,
+ u32 seq, struct nlattr **tb, struct sk_buff *skb)
+{
+ if (!netdev->dcbnl_ops->getdcbx)
+ return -EOPNOTSUPP;
+
+ return nla_put_u8(skb, DCB_ATTR_DCBX,
+ netdev->dcbnl_ops->getdcbx(netdev));
+}
+
+static int dcbnl_setdcbx(struct net_device *netdev, struct nlmsghdr *nlh,
+ u32 seq, struct nlattr **tb, struct sk_buff *skb)
+{
+ u8 value;
+
+ if (!netdev->dcbnl_ops->setdcbx)
+ return -EOPNOTSUPP;
+
+ if (!tb[DCB_ATTR_DCBX])
+ return -EINVAL;
+
+ value = nla_get_u8(tb[DCB_ATTR_DCBX]);
+
+ return nla_put_u8(skb, DCB_ATTR_DCBX,
+ netdev->dcbnl_ops->setdcbx(netdev, value));
+}
+
+static int dcbnl_getfeatcfg(struct net_device *netdev, struct nlmsghdr *nlh,
+ u32 seq, struct nlattr **tb, struct sk_buff *skb)
+{
+ struct nlattr *data[DCB_FEATCFG_ATTR_MAX + 1], *nest;
+ u8 value;
+ int ret, i;
+ int getall = 0;
+
+ if (!netdev->dcbnl_ops->getfeatcfg)
+ return -EOPNOTSUPP;
+
+ if (!tb[DCB_ATTR_FEATCFG])
+ return -EINVAL;
+
+ ret = nla_parse_nested_deprecated(data, DCB_FEATCFG_ATTR_MAX,
+ tb[DCB_ATTR_FEATCFG],
+ dcbnl_featcfg_nest, NULL);
+ if (ret)
+ return ret;
+
+ nest = nla_nest_start_noflag(skb, DCB_ATTR_FEATCFG);
+ if (!nest)
+ return -EMSGSIZE;
+
+ if (data[DCB_FEATCFG_ATTR_ALL])
+ getall = 1;
+
+ for (i = DCB_FEATCFG_ATTR_ALL+1; i <= DCB_FEATCFG_ATTR_MAX; i++) {
+ if (!getall && !data[i])
+ continue;
+
+ ret = netdev->dcbnl_ops->getfeatcfg(netdev, i, &value);
+ if (!ret)
+ ret = nla_put_u8(skb, i, value);
+
+ if (ret) {
+ nla_nest_cancel(skb, nest);
+ goto nla_put_failure;
+ }
+ }
+ nla_nest_end(skb, nest);
+
+nla_put_failure:
+ return ret;
+}
+
+static int dcbnl_setfeatcfg(struct net_device *netdev, struct nlmsghdr *nlh,
+ u32 seq, struct nlattr **tb, struct sk_buff *skb)
+{
+ struct nlattr *data[DCB_FEATCFG_ATTR_MAX + 1];
+ int ret, i;
+ u8 value;
+
+ if (!netdev->dcbnl_ops->setfeatcfg)
+ return -ENOTSUPP;
+
+ if (!tb[DCB_ATTR_FEATCFG])
+ return -EINVAL;
+
+ ret = nla_parse_nested_deprecated(data, DCB_FEATCFG_ATTR_MAX,
+ tb[DCB_ATTR_FEATCFG],
+ dcbnl_featcfg_nest, NULL);
+
+ if (ret)
+ goto err;
+
+ for (i = DCB_FEATCFG_ATTR_ALL+1; i <= DCB_FEATCFG_ATTR_MAX; i++) {
+ if (data[i] == NULL)
+ continue;
+
+ value = nla_get_u8(data[i]);
+
+ ret = netdev->dcbnl_ops->setfeatcfg(netdev, i, value);
+
+ if (ret)
+ goto err;
+ }
+err:
+ ret = nla_put_u8(skb, DCB_ATTR_FEATCFG, ret);
+
+ return ret;
+}
+
+/* Handle CEE DCBX GET commands. */
+static int dcbnl_cee_get(struct net_device *netdev, struct nlmsghdr *nlh,
+ u32 seq, struct nlattr **tb, struct sk_buff *skb)
+{
+ const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops;
+
+ if (!ops)
+ return -EOPNOTSUPP;
+
+ return dcbnl_cee_fill(skb, netdev);
+}
+
+struct reply_func {
+ /* reply netlink message type */
+ int type;
+
+ /* function to fill message contents */
+ int (*cb)(struct net_device *, struct nlmsghdr *, u32,
+ struct nlattr **, struct sk_buff *);
+};
+
+static const struct reply_func reply_funcs[DCB_CMD_MAX+1] = {
+ [DCB_CMD_GSTATE] = { RTM_GETDCB, dcbnl_getstate },
+ [DCB_CMD_SSTATE] = { RTM_SETDCB, dcbnl_setstate },
+ [DCB_CMD_PFC_GCFG] = { RTM_GETDCB, dcbnl_getpfccfg },
+ [DCB_CMD_PFC_SCFG] = { RTM_SETDCB, dcbnl_setpfccfg },
+ [DCB_CMD_GPERM_HWADDR] = { RTM_GETDCB, dcbnl_getperm_hwaddr },
+ [DCB_CMD_GCAP] = { RTM_GETDCB, dcbnl_getcap },
+ [DCB_CMD_GNUMTCS] = { RTM_GETDCB, dcbnl_getnumtcs },
+ [DCB_CMD_SNUMTCS] = { RTM_SETDCB, dcbnl_setnumtcs },
+ [DCB_CMD_PFC_GSTATE] = { RTM_GETDCB, dcbnl_getpfcstate },
+ [DCB_CMD_PFC_SSTATE] = { RTM_SETDCB, dcbnl_setpfcstate },
+ [DCB_CMD_GAPP] = { RTM_GETDCB, dcbnl_getapp },
+ [DCB_CMD_SAPP] = { RTM_SETDCB, dcbnl_setapp },
+ [DCB_CMD_PGTX_GCFG] = { RTM_GETDCB, dcbnl_pgtx_getcfg },
+ [DCB_CMD_PGTX_SCFG] = { RTM_SETDCB, dcbnl_pgtx_setcfg },
+ [DCB_CMD_PGRX_GCFG] = { RTM_GETDCB, dcbnl_pgrx_getcfg },
+ [DCB_CMD_PGRX_SCFG] = { RTM_SETDCB, dcbnl_pgrx_setcfg },
+ [DCB_CMD_SET_ALL] = { RTM_SETDCB, dcbnl_setall },
+ [DCB_CMD_BCN_GCFG] = { RTM_GETDCB, dcbnl_bcn_getcfg },
+ [DCB_CMD_BCN_SCFG] = { RTM_SETDCB, dcbnl_bcn_setcfg },
+ [DCB_CMD_IEEE_GET] = { RTM_GETDCB, dcbnl_ieee_get },
+ [DCB_CMD_IEEE_SET] = { RTM_SETDCB, dcbnl_ieee_set },
+ [DCB_CMD_IEEE_DEL] = { RTM_SETDCB, dcbnl_ieee_del },
+ [DCB_CMD_GDCBX] = { RTM_GETDCB, dcbnl_getdcbx },
+ [DCB_CMD_SDCBX] = { RTM_SETDCB, dcbnl_setdcbx },
+ [DCB_CMD_GFEATCFG] = { RTM_GETDCB, dcbnl_getfeatcfg },
+ [DCB_CMD_SFEATCFG] = { RTM_SETDCB, dcbnl_setfeatcfg },
+ [DCB_CMD_CEE_GET] = { RTM_GETDCB, dcbnl_cee_get },
+};
+
+static int dcb_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(skb->sk);
+ struct net_device *netdev;
+ struct dcbmsg *dcb = nlmsg_data(nlh);
+ struct nlattr *tb[DCB_ATTR_MAX + 1];
+ u32 portid = NETLINK_CB(skb).portid;
+ int ret = -EINVAL;
+ struct sk_buff *reply_skb;
+ struct nlmsghdr *reply_nlh = NULL;
+ const struct reply_func *fn;
+
+ if ((nlh->nlmsg_type == RTM_SETDCB) && !netlink_capable(skb, CAP_NET_ADMIN))
+ return -EPERM;
+
+ ret = nlmsg_parse_deprecated(nlh, sizeof(*dcb), tb, DCB_ATTR_MAX,
+ dcbnl_rtnl_policy, extack);
+ if (ret < 0)
+ return ret;
+
+ if (dcb->cmd > DCB_CMD_MAX)
+ return -EINVAL;
+
+ /* check if a reply function has been defined for the command */
+ fn = &reply_funcs[dcb->cmd];
+ if (!fn->cb)
+ return -EOPNOTSUPP;
+ if (fn->type == RTM_SETDCB && !netlink_capable(skb, CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (!tb[DCB_ATTR_IFNAME])
+ return -EINVAL;
+
+ netdev = __dev_get_by_name(net, nla_data(tb[DCB_ATTR_IFNAME]));
+ if (!netdev)
+ return -ENODEV;
+
+ if (!netdev->dcbnl_ops)
+ return -EOPNOTSUPP;
+
+ reply_skb = dcbnl_newmsg(fn->type, dcb->cmd, portid, nlh->nlmsg_seq,
+ nlh->nlmsg_flags, &reply_nlh);
+ if (!reply_skb)
+ return -ENOMEM;
+
+ ret = fn->cb(netdev, nlh, nlh->nlmsg_seq, tb, reply_skb);
+ if (ret < 0) {
+ nlmsg_free(reply_skb);
+ goto out;
+ }
+
+ nlmsg_end(reply_skb, reply_nlh);
+
+ ret = rtnl_unicast(reply_skb, net, portid);
+out:
+ return ret;
+}
+
+static struct dcb_app_type *dcb_rewr_lookup(const struct dcb_app *app,
+ int ifindex, int proto)
+{
+ struct dcb_app_type *itr;
+
+ list_for_each_entry(itr, &dcb_rewr_list, list) {
+ if (itr->app.selector == app->selector &&
+ itr->app.priority == app->priority &&
+ itr->ifindex == ifindex &&
+ ((proto == -1) || itr->app.protocol == proto))
+ return itr;
+ }
+
+ return NULL;
+}
+
+static struct dcb_app_type *dcb_app_lookup(const struct dcb_app *app,
+ int ifindex, int prio)
+{
+ struct dcb_app_type *itr;
+
+ list_for_each_entry(itr, &dcb_app_list, list) {
+ if (itr->app.selector == app->selector &&
+ itr->app.protocol == app->protocol &&
+ itr->ifindex == ifindex &&
+ ((prio == -1) || itr->app.priority == prio))
+ return itr;
+ }
+
+ return NULL;
+}
+
+static int dcb_app_add(struct list_head *list, const struct dcb_app *app,
+ int ifindex)
+{
+ struct dcb_app_type *entry;
+
+ entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
+ if (!entry)
+ return -ENOMEM;
+
+ memcpy(&entry->app, app, sizeof(*app));
+ entry->ifindex = ifindex;
+ list_add(&entry->list, list);
+
+ return 0;
+}
+
+/**
+ * dcb_getapp - retrieve the DCBX application user priority
+ * @dev: network interface
+ * @app: application to get user priority of
+ *
+ * On success returns a non-zero 802.1p user priority bitmap
+ * otherwise returns 0 as the invalid user priority bitmap to
+ * indicate an error.
+ */
+u8 dcb_getapp(struct net_device *dev, struct dcb_app *app)
+{
+ struct dcb_app_type *itr;
+ u8 prio = 0;
+
+ spin_lock_bh(&dcb_lock);
+ itr = dcb_app_lookup(app, dev->ifindex, -1);
+ if (itr)
+ prio = itr->app.priority;
+ spin_unlock_bh(&dcb_lock);
+
+ return prio;
+}
+EXPORT_SYMBOL(dcb_getapp);
+
+/**
+ * dcb_setapp - add CEE dcb application data to app list
+ * @dev: network interface
+ * @new: application data to add
+ *
+ * Priority 0 is an invalid priority in CEE spec. This routine
+ * removes applications from the app list if the priority is
+ * set to zero. Priority is expected to be 8-bit 802.1p user priority bitmap
+ */
+int dcb_setapp(struct net_device *dev, struct dcb_app *new)
+{
+ struct dcb_app_type *itr;
+ struct dcb_app_type event;
+ int err = 0;
+
+ event.ifindex = dev->ifindex;
+ memcpy(&event.app, new, sizeof(event.app));
+ if (dev->dcbnl_ops->getdcbx)
+ event.dcbx = dev->dcbnl_ops->getdcbx(dev);
+
+ spin_lock_bh(&dcb_lock);
+ /* Search for existing match and replace */
+ itr = dcb_app_lookup(new, dev->ifindex, -1);
+ if (itr) {
+ if (new->priority)
+ itr->app.priority = new->priority;
+ else {
+ list_del(&itr->list);
+ kfree(itr);
+ }
+ goto out;
+ }
+ /* App type does not exist add new application type */
+ if (new->priority)
+ err = dcb_app_add(&dcb_app_list, new, dev->ifindex);
+out:
+ spin_unlock_bh(&dcb_lock);
+ if (!err)
+ call_dcbevent_notifiers(DCB_APP_EVENT, &event);
+ return err;
+}
+EXPORT_SYMBOL(dcb_setapp);
+
+/**
+ * dcb_ieee_getapp_mask - retrieve the IEEE DCB application priority
+ * @dev: network interface
+ * @app: where to store the retrieve application data
+ *
+ * Helper routine which on success returns a non-zero 802.1Qaz user
+ * priority bitmap otherwise returns 0 to indicate the dcb_app was
+ * not found in APP list.
+ */
+u8 dcb_ieee_getapp_mask(struct net_device *dev, struct dcb_app *app)
+{
+ struct dcb_app_type *itr;
+ u8 prio = 0;
+
+ spin_lock_bh(&dcb_lock);
+ itr = dcb_app_lookup(app, dev->ifindex, -1);
+ if (itr)
+ prio |= 1 << itr->app.priority;
+ spin_unlock_bh(&dcb_lock);
+
+ return prio;
+}
+EXPORT_SYMBOL(dcb_ieee_getapp_mask);
+
+/* Get protocol value from rewrite entry. */
+u16 dcb_getrewr(struct net_device *dev, struct dcb_app *app)
+{
+ struct dcb_app_type *itr;
+ u16 proto = 0;
+
+ spin_lock_bh(&dcb_lock);
+ itr = dcb_rewr_lookup(app, dev->ifindex, -1);
+ if (itr)
+ proto = itr->app.protocol;
+ spin_unlock_bh(&dcb_lock);
+
+ return proto;
+}
+EXPORT_SYMBOL(dcb_getrewr);
+
+ /* Add rewrite entry to the rewrite list. */
+int dcb_setrewr(struct net_device *dev, struct dcb_app *new)
+{
+ int err;
+
+ spin_lock_bh(&dcb_lock);
+ /* Search for existing match and abort if found. */
+ if (dcb_rewr_lookup(new, dev->ifindex, new->protocol)) {
+ err = -EEXIST;
+ goto out;
+ }
+
+ err = dcb_app_add(&dcb_rewr_list, new, dev->ifindex);
+out:
+ spin_unlock_bh(&dcb_lock);
+
+ return err;
+}
+EXPORT_SYMBOL(dcb_setrewr);
+
+/* Delete rewrite entry from the rewrite list. */
+int dcb_delrewr(struct net_device *dev, struct dcb_app *del)
+{
+ struct dcb_app_type *itr;
+ int err = -ENOENT;
+
+ spin_lock_bh(&dcb_lock);
+ /* Search for existing match and remove it. */
+ itr = dcb_rewr_lookup(del, dev->ifindex, del->protocol);
+ if (itr) {
+ list_del(&itr->list);
+ kfree(itr);
+ err = 0;
+ }
+
+ spin_unlock_bh(&dcb_lock);
+
+ return err;
+}
+EXPORT_SYMBOL(dcb_delrewr);
+
+/**
+ * dcb_ieee_setapp - add IEEE dcb application data to app list
+ * @dev: network interface
+ * @new: application data to add
+ *
+ * This adds Application data to the list. Multiple application
+ * entries may exists for the same selector and protocol as long
+ * as the priorities are different. Priority is expected to be a
+ * 3-bit unsigned integer
+ */
+int dcb_ieee_setapp(struct net_device *dev, struct dcb_app *new)
+{
+ struct dcb_app_type event;
+ int err = 0;
+
+ event.ifindex = dev->ifindex;
+ memcpy(&event.app, new, sizeof(event.app));
+ if (dev->dcbnl_ops->getdcbx)
+ event.dcbx = dev->dcbnl_ops->getdcbx(dev);
+
+ spin_lock_bh(&dcb_lock);
+ /* Search for existing match and abort if found */
+ if (dcb_app_lookup(new, dev->ifindex, new->priority)) {
+ err = -EEXIST;
+ goto out;
+ }
+
+ err = dcb_app_add(&dcb_app_list, new, dev->ifindex);
+out:
+ spin_unlock_bh(&dcb_lock);
+ if (!err)
+ call_dcbevent_notifiers(DCB_APP_EVENT, &event);
+ return err;
+}
+EXPORT_SYMBOL(dcb_ieee_setapp);
+
+/**
+ * dcb_ieee_delapp - delete IEEE dcb application data from list
+ * @dev: network interface
+ * @del: application data to delete
+ *
+ * This removes a matching APP data from the APP list
+ */
+int dcb_ieee_delapp(struct net_device *dev, struct dcb_app *del)
+{
+ struct dcb_app_type *itr;
+ struct dcb_app_type event;
+ int err = -ENOENT;
+
+ event.ifindex = dev->ifindex;
+ memcpy(&event.app, del, sizeof(event.app));
+ if (dev->dcbnl_ops->getdcbx)
+ event.dcbx = dev->dcbnl_ops->getdcbx(dev);
+
+ spin_lock_bh(&dcb_lock);
+ /* Search for existing match and remove it. */
+ if ((itr = dcb_app_lookup(del, dev->ifindex, del->priority))) {
+ list_del(&itr->list);
+ kfree(itr);
+ err = 0;
+ }
+
+ spin_unlock_bh(&dcb_lock);
+ if (!err)
+ call_dcbevent_notifiers(DCB_APP_EVENT, &event);
+ return err;
+}
+EXPORT_SYMBOL(dcb_ieee_delapp);
+
+/* dcb_getrewr_prio_pcp_mask_map - For a given device, find mapping from
+ * priorities to the PCP and DEI values assigned to that priority.
+ */
+void dcb_getrewr_prio_pcp_mask_map(const struct net_device *dev,
+ struct dcb_rewr_prio_pcp_map *p_map)
+{
+ int ifindex = dev->ifindex;
+ struct dcb_app_type *itr;
+ u8 prio;
+
+ memset(p_map->map, 0, sizeof(p_map->map));
+
+ spin_lock_bh(&dcb_lock);
+ list_for_each_entry(itr, &dcb_rewr_list, list) {
+ if (itr->ifindex == ifindex &&
+ itr->app.selector == DCB_APP_SEL_PCP &&
+ itr->app.protocol < 16 &&
+ itr->app.priority < IEEE_8021QAZ_MAX_TCS) {
+ prio = itr->app.priority;
+ p_map->map[prio] |= 1 << itr->app.protocol;
+ }
+ }
+ spin_unlock_bh(&dcb_lock);
+}
+EXPORT_SYMBOL(dcb_getrewr_prio_pcp_mask_map);
+
+/* dcb_getrewr_prio_dscp_mask_map - For a given device, find mapping from
+ * priorities to the DSCP values assigned to that priority.
+ */
+void dcb_getrewr_prio_dscp_mask_map(const struct net_device *dev,
+ struct dcb_ieee_app_prio_map *p_map)
+{
+ int ifindex = dev->ifindex;
+ struct dcb_app_type *itr;
+ u8 prio;
+
+ memset(p_map->map, 0, sizeof(p_map->map));
+
+ spin_lock_bh(&dcb_lock);
+ list_for_each_entry(itr, &dcb_rewr_list, list) {
+ if (itr->ifindex == ifindex &&
+ itr->app.selector == IEEE_8021QAZ_APP_SEL_DSCP &&
+ itr->app.protocol < 64 &&
+ itr->app.priority < IEEE_8021QAZ_MAX_TCS) {
+ prio = itr->app.priority;
+ p_map->map[prio] |= 1ULL << itr->app.protocol;
+ }
+ }
+ spin_unlock_bh(&dcb_lock);
+}
+EXPORT_SYMBOL(dcb_getrewr_prio_dscp_mask_map);
+
+/*
+ * dcb_ieee_getapp_prio_dscp_mask_map - For a given device, find mapping from
+ * priorities to the DSCP values assigned to that priority. Initialize p_map
+ * such that each map element holds a bit mask of DSCP values configured for
+ * that priority by APP entries.
+ */
+void dcb_ieee_getapp_prio_dscp_mask_map(const struct net_device *dev,
+ struct dcb_ieee_app_prio_map *p_map)
+{
+ int ifindex = dev->ifindex;
+ struct dcb_app_type *itr;
+ u8 prio;
+
+ memset(p_map->map, 0, sizeof(p_map->map));
+
+ spin_lock_bh(&dcb_lock);
+ list_for_each_entry(itr, &dcb_app_list, list) {
+ if (itr->ifindex == ifindex &&
+ itr->app.selector == IEEE_8021QAZ_APP_SEL_DSCP &&
+ itr->app.protocol < 64 &&
+ itr->app.priority < IEEE_8021QAZ_MAX_TCS) {
+ prio = itr->app.priority;
+ p_map->map[prio] |= 1ULL << itr->app.protocol;
+ }
+ }
+ spin_unlock_bh(&dcb_lock);
+}
+EXPORT_SYMBOL(dcb_ieee_getapp_prio_dscp_mask_map);
+
+/*
+ * dcb_ieee_getapp_dscp_prio_mask_map - For a given device, find mapping from
+ * DSCP values to the priorities assigned to that DSCP value. Initialize p_map
+ * such that each map element holds a bit mask of priorities configured for a
+ * given DSCP value by APP entries.
+ */
+void
+dcb_ieee_getapp_dscp_prio_mask_map(const struct net_device *dev,
+ struct dcb_ieee_app_dscp_map *p_map)
+{
+ int ifindex = dev->ifindex;
+ struct dcb_app_type *itr;
+
+ memset(p_map->map, 0, sizeof(p_map->map));
+
+ spin_lock_bh(&dcb_lock);
+ list_for_each_entry(itr, &dcb_app_list, list) {
+ if (itr->ifindex == ifindex &&
+ itr->app.selector == IEEE_8021QAZ_APP_SEL_DSCP &&
+ itr->app.protocol < 64 &&
+ itr->app.priority < IEEE_8021QAZ_MAX_TCS)
+ p_map->map[itr->app.protocol] |= 1 << itr->app.priority;
+ }
+ spin_unlock_bh(&dcb_lock);
+}
+EXPORT_SYMBOL(dcb_ieee_getapp_dscp_prio_mask_map);
+
+/*
+ * Per 802.1Q-2014, the selector value of 1 is used for matching on Ethernet
+ * type, with valid PID values >= 1536. A special meaning is then assigned to
+ * protocol value of 0: "default priority. For use when priority is not
+ * otherwise specified".
+ *
+ * dcb_ieee_getapp_default_prio_mask - For a given device, find all APP entries
+ * of the form {$PRIO, ETHERTYPE, 0} and construct a bit mask of all default
+ * priorities set by these entries.
+ */
+u8 dcb_ieee_getapp_default_prio_mask(const struct net_device *dev)
+{
+ int ifindex = dev->ifindex;
+ struct dcb_app_type *itr;
+ u8 mask = 0;
+
+ spin_lock_bh(&dcb_lock);
+ list_for_each_entry(itr, &dcb_app_list, list) {
+ if (itr->ifindex == ifindex &&
+ itr->app.selector == IEEE_8021QAZ_APP_SEL_ETHERTYPE &&
+ itr->app.protocol == 0 &&
+ itr->app.priority < IEEE_8021QAZ_MAX_TCS)
+ mask |= 1 << itr->app.priority;
+ }
+ spin_unlock_bh(&dcb_lock);
+
+ return mask;
+}
+EXPORT_SYMBOL(dcb_ieee_getapp_default_prio_mask);
+
+static void dcbnl_flush_dev(struct net_device *dev)
+{
+ struct dcb_app_type *itr, *tmp;
+
+ spin_lock_bh(&dcb_lock);
+
+ list_for_each_entry_safe(itr, tmp, &dcb_app_list, list) {
+ if (itr->ifindex == dev->ifindex) {
+ list_del(&itr->list);
+ kfree(itr);
+ }
+ }
+
+ spin_unlock_bh(&dcb_lock);
+}
+
+static int dcbnl_netdevice_event(struct notifier_block *nb,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+
+ switch (event) {
+ case NETDEV_UNREGISTER:
+ if (!dev->dcbnl_ops)
+ return NOTIFY_DONE;
+
+ dcbnl_flush_dev(dev);
+
+ return NOTIFY_OK;
+ default:
+ return NOTIFY_DONE;
+ }
+}
+
+static struct notifier_block dcbnl_nb __read_mostly = {
+ .notifier_call = dcbnl_netdevice_event,
+};
+
+static const struct rtnl_msg_handler dcbnl_rtnl_msg_handlers[] __initconst = {
+ {.msgtype = RTM_GETDCB, .doit = dcb_doit},
+ {.msgtype = RTM_SETDCB, .doit = dcb_doit},
+};
+
+static int __init dcbnl_init(void)
+{
+ int err;
+
+ err = register_netdevice_notifier(&dcbnl_nb);
+ if (err)
+ return err;
+
+ rtnl_register_many(dcbnl_rtnl_msg_handlers);
+
+ return 0;
+}
+device_initcall(dcbnl_init);
diff --git a/net/dccp/Kconfig b/net/dccp/Kconfig
deleted file mode 100644
index b8a68dd41000..000000000000
--- a/net/dccp/Kconfig
+++ /dev/null
@@ -1,64 +0,0 @@
-menu "DCCP Configuration (EXPERIMENTAL)"
- depends on INET && EXPERIMENTAL
-
-config IP_DCCP
- tristate "The DCCP Protocol (EXPERIMENTAL)"
- ---help---
- Datagram Congestion Control Protocol (RFC 4340)
-
- From http://www.ietf.org/rfc/rfc4340.txt:
-
- The Datagram Congestion Control Protocol (DCCP) is a transport
- protocol that implements bidirectional, unicast connections of
- congestion-controlled, unreliable datagrams. It should be suitable
- for use by applications such as streaming media, Internet telephony,
- and on-line games.
-
- To compile this protocol support as a module, choose M here: the
- module will be called dccp.
-
- If in doubt, say N.
-
-config INET_DCCP_DIAG
- depends on IP_DCCP && INET_DIAG
- def_tristate y if (IP_DCCP = y && INET_DIAG = y)
- def_tristate m
-
-config IP_DCCP_ACKVEC
- depends on IP_DCCP
- bool
-
-source "net/dccp/ccids/Kconfig"
-
-menu "DCCP Kernel Hacking"
- depends on IP_DCCP && DEBUG_KERNEL=y
-
-config IP_DCCP_DEBUG
- bool "DCCP debug messages"
- ---help---
- Only use this if you're hacking DCCP.
-
- When compiling DCCP as a module, this debugging output can be toggled
- by setting the parameter dccp_debug of the `dccp' module to 0 or 1.
-
- Just say N.
-
-config NET_DCCPPROBE
- tristate "DCCP connection probing"
- depends on PROC_FS && KPROBES
- ---help---
- This module allows for capturing the changes to DCCP connection
- state in response to incoming packets. It is used for debugging
- DCCP congestion avoidance modules. If you don't understand
- what was just said, you don't need it: say N.
-
- Documentation on how to use DCCP connection probing can be found
- at http://linux-net.osdl.org/index.php/DccpProbe
-
- To compile this code as a module, choose M here: the
- module will be called dccp_probe.
-
-
-endmenu
-
-endmenu
diff --git a/net/dccp/Makefile b/net/dccp/Makefile
deleted file mode 100644
index f4f8793aafff..000000000000
--- a/net/dccp/Makefile
+++ /dev/null
@@ -1,21 +0,0 @@
-obj-$(CONFIG_IP_DCCP) += dccp.o dccp_ipv4.o
-
-dccp-y := ccid.o feat.o input.o minisocks.o options.o output.o proto.o timer.o
-
-dccp_ipv4-y := ipv4.o
-
-# build dccp_ipv6 as module whenever either IPv6 or DCCP is a module
-obj-$(subst y,$(CONFIG_IP_DCCP),$(CONFIG_IPV6)) += dccp_ipv6.o
-dccp_ipv6-y := ipv6.o
-
-dccp-$(CONFIG_IP_DCCP_ACKVEC) += ackvec.o
-
-obj-$(CONFIG_INET_DCCP_DIAG) += dccp_diag.o
-obj-$(CONFIG_NET_DCCPPROBE) += dccp_probe.o
-
-dccp-$(CONFIG_SYSCTL) += sysctl.o
-
-dccp_diag-y := diag.o
-dccp_probe-y := probe.o
-
-obj-y += ccids/
diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c
deleted file mode 100644
index bdf1bb7a82c0..000000000000
--- a/net/dccp/ackvec.c
+++ /dev/null
@@ -1,515 +0,0 @@
-/*
- * net/dccp/ackvec.c
- *
- * An implementation of the DCCP protocol
- * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the
- * Free Software Foundation; version 2 of the License;
- */
-
-#include "ackvec.h"
-#include "dccp.h"
-
-#include <linux/dccp.h>
-#include <linux/init.h>
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/skbuff.h>
-#include <linux/slab.h>
-
-#include <net/sock.h>
-
-static kmem_cache_t *dccp_ackvec_slab;
-static kmem_cache_t *dccp_ackvec_record_slab;
-
-static struct dccp_ackvec_record *dccp_ackvec_record_new(void)
-{
- struct dccp_ackvec_record *avr =
- kmem_cache_alloc(dccp_ackvec_record_slab, GFP_ATOMIC);
-
- if (avr != NULL)
- INIT_LIST_HEAD(&avr->dccpavr_node);
-
- return avr;
-}
-
-static void dccp_ackvec_record_delete(struct dccp_ackvec_record *avr)
-{
- if (unlikely(avr == NULL))
- return;
- /* Check if deleting a linked record */
- WARN_ON(!list_empty(&avr->dccpavr_node));
- kmem_cache_free(dccp_ackvec_record_slab, avr);
-}
-
-static void dccp_ackvec_insert_avr(struct dccp_ackvec *av,
- struct dccp_ackvec_record *avr)
-{
- /*
- * AVRs are sorted by seqno. Since we are sending them in order, we
- * just add the AVR at the head of the list.
- * -sorbo.
- */
- if (!list_empty(&av->dccpav_records)) {
- const struct dccp_ackvec_record *head =
- list_entry(av->dccpav_records.next,
- struct dccp_ackvec_record,
- dccpavr_node);
- BUG_ON(before48(avr->dccpavr_ack_seqno,
- head->dccpavr_ack_seqno));
- }
-
- list_add(&avr->dccpavr_node, &av->dccpav_records);
-}
-
-int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb)
-{
- struct dccp_sock *dp = dccp_sk(sk);
- struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec;
- /* Figure out how many options do we need to represent the ackvec */
- const u16 nr_opts = (av->dccpav_vec_len +
- DCCP_MAX_ACKVEC_OPT_LEN - 1) /
- DCCP_MAX_ACKVEC_OPT_LEN;
- u16 len = av->dccpav_vec_len + 2 * nr_opts, i;
- struct timeval now;
- u32 elapsed_time;
- const unsigned char *tail, *from;
- unsigned char *to;
- struct dccp_ackvec_record *avr;
-
- if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN)
- return -1;
-
- dccp_timestamp(sk, &now);
- elapsed_time = timeval_delta(&now, &av->dccpav_time) / 10;
-
- if (elapsed_time != 0 &&
- dccp_insert_option_elapsed_time(sk, skb, elapsed_time))
- return -1;
-
- avr = dccp_ackvec_record_new();
- if (avr == NULL)
- return -1;
-
- DCCP_SKB_CB(skb)->dccpd_opt_len += len;
-
- to = skb_push(skb, len);
- len = av->dccpav_vec_len;
- from = av->dccpav_buf + av->dccpav_buf_head;
- tail = av->dccpav_buf + DCCP_MAX_ACKVEC_LEN;
-
- for (i = 0; i < nr_opts; ++i) {
- int copylen = len;
-
- if (len > DCCP_MAX_ACKVEC_OPT_LEN)
- copylen = DCCP_MAX_ACKVEC_OPT_LEN;
-
- *to++ = DCCPO_ACK_VECTOR_0;
- *to++ = copylen + 2;
-
- /* Check if buf_head wraps */
- if (from + copylen > tail) {
- const u16 tailsize = tail - from;
-
- memcpy(to, from, tailsize);
- to += tailsize;
- len -= tailsize;
- copylen -= tailsize;
- from = av->dccpav_buf;
- }
-
- memcpy(to, from, copylen);
- from += copylen;
- to += copylen;
- len -= copylen;
- }
-
- /*
- * From RFC 4340, A.2:
- *
- * For each acknowledgement it sends, the HC-Receiver will add an
- * acknowledgement record. ack_seqno will equal the HC-Receiver
- * sequence number it used for the ack packet; ack_ptr will equal
- * buf_head; ack_ackno will equal buf_ackno; and ack_nonce will
- * equal buf_nonce.
- */
- avr->dccpavr_ack_seqno = DCCP_SKB_CB(skb)->dccpd_seq;
- avr->dccpavr_ack_ptr = av->dccpav_buf_head;
- avr->dccpavr_ack_ackno = av->dccpav_buf_ackno;
- avr->dccpavr_ack_nonce = av->dccpav_buf_nonce;
- avr->dccpavr_sent_len = av->dccpav_vec_len;
-
- dccp_ackvec_insert_avr(av, avr);
-
- dccp_pr_debug("%s ACK Vector 0, len=%d, ack_seqno=%llu, "
- "ack_ackno=%llu\n",
- dccp_role(sk), avr->dccpavr_sent_len,
- (unsigned long long)avr->dccpavr_ack_seqno,
- (unsigned long long)avr->dccpavr_ack_ackno);
- return 0;
-}
-
-struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority)
-{
- struct dccp_ackvec *av = kmem_cache_alloc(dccp_ackvec_slab, priority);
-
- if (av != NULL) {
- av->dccpav_buf_head = DCCP_MAX_ACKVEC_LEN - 1;
- av->dccpav_buf_ackno = DCCP_MAX_SEQNO + 1;
- av->dccpav_buf_nonce = av->dccpav_buf_nonce = 0;
- av->dccpav_time.tv_sec = 0;
- av->dccpav_time.tv_usec = 0;
- av->dccpav_vec_len = 0;
- INIT_LIST_HEAD(&av->dccpav_records);
- }
-
- return av;
-}
-
-void dccp_ackvec_free(struct dccp_ackvec *av)
-{
- if (unlikely(av == NULL))
- return;
-
- if (!list_empty(&av->dccpav_records)) {
- struct dccp_ackvec_record *avr, *next;
-
- list_for_each_entry_safe(avr, next, &av->dccpav_records,
- dccpavr_node) {
- list_del_init(&avr->dccpavr_node);
- dccp_ackvec_record_delete(avr);
- }
- }
-
- kmem_cache_free(dccp_ackvec_slab, av);
-}
-
-static inline u8 dccp_ackvec_state(const struct dccp_ackvec *av,
- const u32 index)
-{
- return av->dccpav_buf[index] & DCCP_ACKVEC_STATE_MASK;
-}
-
-static inline u8 dccp_ackvec_len(const struct dccp_ackvec *av,
- const u32 index)
-{
- return av->dccpav_buf[index] & DCCP_ACKVEC_LEN_MASK;
-}
-
-/*
- * If several packets are missing, the HC-Receiver may prefer to enter multiple
- * bytes with run length 0, rather than a single byte with a larger run length;
- * this simplifies table updates if one of the missing packets arrives.
- */
-static inline int dccp_ackvec_set_buf_head_state(struct dccp_ackvec *av,
- const unsigned int packets,
- const unsigned char state)
-{
- unsigned int gap;
- long new_head;
-
- if (av->dccpav_vec_len + packets > DCCP_MAX_ACKVEC_LEN)
- return -ENOBUFS;
-
- gap = packets - 1;
- new_head = av->dccpav_buf_head - packets;
-
- if (new_head < 0) {
- if (gap > 0) {
- memset(av->dccpav_buf, DCCP_ACKVEC_STATE_NOT_RECEIVED,
- gap + new_head + 1);
- gap = -new_head;
- }
- new_head += DCCP_MAX_ACKVEC_LEN;
- }
-
- av->dccpav_buf_head = new_head;
-
- if (gap > 0)
- memset(av->dccpav_buf + av->dccpav_buf_head + 1,
- DCCP_ACKVEC_STATE_NOT_RECEIVED, gap);
-
- av->dccpav_buf[av->dccpav_buf_head] = state;
- av->dccpav_vec_len += packets;
- return 0;
-}
-
-/*
- * Implements the RFC 4340, Appendix A
- */
-int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk,
- const u64 ackno, const u8 state)
-{
- /*
- * Check at the right places if the buffer is full, if it is, tell the
- * caller to start dropping packets till the HC-Sender acks our ACK
- * vectors, when we will free up space in dccpav_buf.
- *
- * We may well decide to do buffer compression, etc, but for now lets
- * just drop.
- *
- * From Appendix A.1.1 (`New Packets'):
- *
- * Of course, the circular buffer may overflow, either when the
- * HC-Sender is sending data at a very high rate, when the
- * HC-Receiver's acknowledgements are not reaching the HC-Sender,
- * or when the HC-Sender is forgetting to acknowledge those acks
- * (so the HC-Receiver is unable to clean up old state). In this
- * case, the HC-Receiver should either compress the buffer (by
- * increasing run lengths when possible), transfer its state to
- * a larger buffer, or, as a last resort, drop all received
- * packets, without processing them whatsoever, until its buffer
- * shrinks again.
- */
-
- /* See if this is the first ackno being inserted */
- if (av->dccpav_vec_len == 0) {
- av->dccpav_buf[av->dccpav_buf_head] = state;
- av->dccpav_vec_len = 1;
- } else if (after48(ackno, av->dccpav_buf_ackno)) {
- const u64 delta = dccp_delta_seqno(av->dccpav_buf_ackno,
- ackno);
-
- /*
- * Look if the state of this packet is the same as the
- * previous ackno and if so if we can bump the head len.
- */
- if (delta == 1 &&
- dccp_ackvec_state(av, av->dccpav_buf_head) == state &&
- (dccp_ackvec_len(av, av->dccpav_buf_head) <
- DCCP_ACKVEC_LEN_MASK))
- av->dccpav_buf[av->dccpav_buf_head]++;
- else if (dccp_ackvec_set_buf_head_state(av, delta, state))
- return -ENOBUFS;
- } else {
- /*
- * A.1.2. Old Packets
- *
- * When a packet with Sequence Number S <= buf_ackno
- * arrives, the HC-Receiver will scan the table for
- * the byte corresponding to S. (Indexing structures
- * could reduce the complexity of this scan.)
- */
- u64 delta = dccp_delta_seqno(ackno, av->dccpav_buf_ackno);
- u32 index = av->dccpav_buf_head;
-
- while (1) {
- const u8 len = dccp_ackvec_len(av, index);
- const u8 state = dccp_ackvec_state(av, index);
- /*
- * valid packets not yet in dccpav_buf have a reserved
- * entry, with a len equal to 0.
- */
- if (state == DCCP_ACKVEC_STATE_NOT_RECEIVED &&
- len == 0 && delta == 0) { /* Found our
- reserved seat! */
- dccp_pr_debug("Found %llu reserved seat!\n",
- (unsigned long long)ackno);
- av->dccpav_buf[index] = state;
- goto out;
- }
- /* len == 0 means one packet */
- if (delta < len + 1)
- goto out_duplicate;
-
- delta -= len + 1;
- if (++index == DCCP_MAX_ACKVEC_LEN)
- index = 0;
- }
- }
-
- av->dccpav_buf_ackno = ackno;
- dccp_timestamp(sk, &av->dccpav_time);
-out:
- return 0;
-
-out_duplicate:
- /* Duplicate packet */
- dccp_pr_debug("Received a dup or already considered lost "
- "packet: %llu\n", (unsigned long long)ackno);
- return -EILSEQ;
-}
-
-#ifdef CONFIG_IP_DCCP_DEBUG
-void dccp_ackvector_print(const u64 ackno, const unsigned char *vector, int len)
-{
- dccp_pr_debug_cat("ACK vector len=%d, ackno=%llu |", len,
- (unsigned long long)ackno);
-
- while (len--) {
- const u8 state = (*vector & DCCP_ACKVEC_STATE_MASK) >> 6;
- const u8 rl = *vector & DCCP_ACKVEC_LEN_MASK;
-
- dccp_pr_debug_cat("%d,%d|", state, rl);
- ++vector;
- }
-
- dccp_pr_debug_cat("\n");
-}
-
-void dccp_ackvec_print(const struct dccp_ackvec *av)
-{
- dccp_ackvector_print(av->dccpav_buf_ackno,
- av->dccpav_buf + av->dccpav_buf_head,
- av->dccpav_vec_len);
-}
-#endif
-
-static void dccp_ackvec_throw_record(struct dccp_ackvec *av,
- struct dccp_ackvec_record *avr)
-{
- struct dccp_ackvec_record *next;
-
- /* sort out vector length */
- if (av->dccpav_buf_head <= avr->dccpavr_ack_ptr)
- av->dccpav_vec_len = avr->dccpavr_ack_ptr - av->dccpav_buf_head;
- else
- av->dccpav_vec_len = DCCP_MAX_ACKVEC_LEN - 1
- - av->dccpav_buf_head
- + avr->dccpavr_ack_ptr;
-
- /* free records */
- list_for_each_entry_safe_from(avr, next, &av->dccpav_records,
- dccpavr_node) {
- list_del_init(&avr->dccpavr_node);
- dccp_ackvec_record_delete(avr);
- }
-}
-
-void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av, struct sock *sk,
- const u64 ackno)
-{
- struct dccp_ackvec_record *avr;
-
- /*
- * If we traverse backwards, it should be faster when we have large
- * windows. We will be receiving ACKs for stuff we sent a while back
- * -sorbo.
- */
- list_for_each_entry_reverse(avr, &av->dccpav_records, dccpavr_node) {
- if (ackno == avr->dccpavr_ack_seqno) {
- dccp_pr_debug("%s ACK packet 0, len=%d, ack_seqno=%llu, "
- "ack_ackno=%llu, ACKED!\n",
- dccp_role(sk), 1,
- (unsigned long long)avr->dccpavr_ack_seqno,
- (unsigned long long)avr->dccpavr_ack_ackno);
- dccp_ackvec_throw_record(av, avr);
- break;
- } else if (avr->dccpavr_ack_seqno > ackno)
- break; /* old news */
- }
-}
-
-static void dccp_ackvec_check_rcv_ackvector(struct dccp_ackvec *av,
- struct sock *sk, u64 *ackno,
- const unsigned char len,
- const unsigned char *vector)
-{
- unsigned char i;
- struct dccp_ackvec_record *avr;
-
- /* Check if we actually sent an ACK vector */
- if (list_empty(&av->dccpav_records))
- return;
-
- i = len;
- /*
- * XXX
- * I think it might be more efficient to work backwards. See comment on
- * rcv_ackno. -sorbo.
- */
- avr = list_entry(av->dccpav_records.next, struct dccp_ackvec_record,
- dccpavr_node);
- while (i--) {
- const u8 rl = *vector & DCCP_ACKVEC_LEN_MASK;
- u64 ackno_end_rl;
-
- dccp_set_seqno(&ackno_end_rl, *ackno - rl);
-
- /*
- * If our AVR sequence number is greater than the ack, go
- * forward in the AVR list until it is not so.
- */
- list_for_each_entry_from(avr, &av->dccpav_records,
- dccpavr_node) {
- if (!after48(avr->dccpavr_ack_seqno, *ackno))
- goto found;
- }
- /* End of the dccpav_records list, not found, exit */
- break;
-found:
- if (between48(avr->dccpavr_ack_seqno, ackno_end_rl, *ackno)) {
- const u8 state = *vector & DCCP_ACKVEC_STATE_MASK;
- if (state != DCCP_ACKVEC_STATE_NOT_RECEIVED) {
- dccp_pr_debug("%s ACK vector 0, len=%d, "
- "ack_seqno=%llu, ack_ackno=%llu, "
- "ACKED!\n",
- dccp_role(sk), len,
- (unsigned long long)
- avr->dccpavr_ack_seqno,
- (unsigned long long)
- avr->dccpavr_ack_ackno);
- dccp_ackvec_throw_record(av, avr);
- break;
- }
- /*
- * If it wasn't received, continue scanning... we might
- * find another one.
- */
- }
-
- dccp_set_seqno(ackno, ackno_end_rl - 1);
- ++vector;
- }
-}
-
-int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb,
- u64 *ackno, const u8 opt, const u8 *value, const u8 len)
-{
- if (len > DCCP_MAX_ACKVEC_OPT_LEN)
- return -1;
-
- /* dccp_ackvector_print(DCCP_SKB_CB(skb)->dccpd_ack_seq, value, len); */
- dccp_ackvec_check_rcv_ackvector(dccp_sk(sk)->dccps_hc_rx_ackvec, sk,
- ackno, len, value);
- return 0;
-}
-
-int __init dccp_ackvec_init(void)
-{
- dccp_ackvec_slab = kmem_cache_create("dccp_ackvec",
- sizeof(struct dccp_ackvec), 0,
- SLAB_HWCACHE_ALIGN, NULL, NULL);
- if (dccp_ackvec_slab == NULL)
- goto out_err;
-
- dccp_ackvec_record_slab =
- kmem_cache_create("dccp_ackvec_record",
- sizeof(struct dccp_ackvec_record),
- 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
- if (dccp_ackvec_record_slab == NULL)
- goto out_destroy_slab;
-
- return 0;
-
-out_destroy_slab:
- kmem_cache_destroy(dccp_ackvec_slab);
- dccp_ackvec_slab = NULL;
-out_err:
- DCCP_CRIT("Unable to create Ack Vector slab cache");
- return -ENOBUFS;
-}
-
-void dccp_ackvec_exit(void)
-{
- if (dccp_ackvec_slab != NULL) {
- kmem_cache_destroy(dccp_ackvec_slab);
- dccp_ackvec_slab = NULL;
- }
- if (dccp_ackvec_record_slab != NULL) {
- kmem_cache_destroy(dccp_ackvec_record_slab);
- dccp_ackvec_record_slab = NULL;
- }
-}
diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h
deleted file mode 100644
index 96504a3b16e4..000000000000
--- a/net/dccp/ackvec.h
+++ /dev/null
@@ -1,158 +0,0 @@
-#ifndef _ACKVEC_H
-#define _ACKVEC_H
-/*
- * net/dccp/ackvec.h
- *
- * An implementation of the DCCP protocol
- * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@mandriva.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/compiler.h>
-#include <linux/list.h>
-#include <linux/time.h>
-#include <linux/types.h>
-
-/* Read about the ECN nonce to see why it is 253 */
-#define DCCP_MAX_ACKVEC_OPT_LEN 253
-/* We can spread an ack vector across multiple options */
-#define DCCP_MAX_ACKVEC_LEN (DCCP_MAX_ACKVEC_OPT_LEN * 2)
-
-#define DCCP_ACKVEC_STATE_RECEIVED 0
-#define DCCP_ACKVEC_STATE_ECN_MARKED (1 << 6)
-#define DCCP_ACKVEC_STATE_NOT_RECEIVED (3 << 6)
-
-#define DCCP_ACKVEC_STATE_MASK 0xC0 /* 11000000 */
-#define DCCP_ACKVEC_LEN_MASK 0x3F /* 00111111 */
-
-/** struct dccp_ackvec - ack vector
- *
- * This data structure is the one defined in RFC 4340, Appendix A.
- *
- * @dccpav_buf_head - circular buffer head
- * @dccpav_buf_tail - circular buffer tail
- * @dccpav_buf_ackno - ack # of the most recent packet acknowledgeable in the
- * buffer (i.e. %dccpav_buf_head)
- * @dccpav_buf_nonce - the one-bit sum of the ECN Nonces on all packets acked
- * by the buffer with State 0
- *
- * Additionally, the HC-Receiver must keep some information about the
- * Ack Vectors it has recently sent. For each packet sent carrying an
- * Ack Vector, it remembers four variables:
- *
- * @dccpav_records - list of dccp_ackvec_record
- * @dccpav_ack_nonce - the one-bit sum of the ECN Nonces for all State 0.
- *
- * @dccpav_time - the time in usecs
- * @dccpav_buf - circular buffer of acknowledgeable packets
- */
-struct dccp_ackvec {
- u64 dccpav_buf_ackno;
- struct list_head dccpav_records;
- struct timeval dccpav_time;
- u16 dccpav_buf_head;
- u16 dccpav_vec_len;
- u8 dccpav_buf_nonce;
- u8 dccpav_ack_nonce;
- u8 dccpav_buf[DCCP_MAX_ACKVEC_LEN];
-};
-
-/** struct dccp_ackvec_record - ack vector record
- *
- * ACK vector record as defined in Appendix A of spec.
- *
- * The list is sorted by dccpavr_ack_seqno
- *
- * @dccpavr_node - node in dccpav_records
- * @dccpavr_ack_seqno - sequence number of the packet this record was sent on
- * @dccpavr_ack_ackno - sequence number being acknowledged
- * @dccpavr_ack_ptr - pointer into dccpav_buf where this record starts
- * @dccpavr_ack_nonce - dccpav_ack_nonce at the time this record was sent
- * @dccpavr_sent_len - lenght of the record in dccpav_buf
- */
-struct dccp_ackvec_record {
- struct list_head dccpavr_node;
- u64 dccpavr_ack_seqno;
- u64 dccpavr_ack_ackno;
- u16 dccpavr_ack_ptr;
- u16 dccpavr_sent_len;
- u8 dccpavr_ack_nonce;
-};
-
-struct sock;
-struct sk_buff;
-
-#ifdef CONFIG_IP_DCCP_ACKVEC
-extern int dccp_ackvec_init(void);
-extern void dccp_ackvec_exit(void);
-
-extern struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority);
-extern void dccp_ackvec_free(struct dccp_ackvec *av);
-
-extern int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk,
- const u64 ackno, const u8 state);
-
-extern void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av,
- struct sock *sk, const u64 ackno);
-extern int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb,
- u64 *ackno, const u8 opt,
- const u8 *value, const u8 len);
-
-extern int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb);
-
-static inline int dccp_ackvec_pending(const struct dccp_ackvec *av)
-{
- return av->dccpav_vec_len;
-}
-#else /* CONFIG_IP_DCCP_ACKVEC */
-static inline int dccp_ackvec_init(void)
-{
- return 0;
-}
-
-static inline void dccp_ackvec_exit(void)
-{
-}
-
-static inline struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority)
-{
- return NULL;
-}
-
-static inline void dccp_ackvec_free(struct dccp_ackvec *av)
-{
-}
-
-static inline int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk,
- const u64 ackno, const u8 state)
-{
- return -1;
-}
-
-static inline void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av,
- struct sock *sk, const u64 ackno)
-{
-}
-
-static inline int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb,
- const u64 *ackno, const u8 opt,
- const u8 *value, const u8 len)
-{
- return -1;
-}
-
-static inline int dccp_insert_option_ackvec(const struct sock *sk,
- const struct sk_buff *skb)
-{
- return -1;
-}
-
-static inline int dccp_ackvec_pending(const struct dccp_ackvec *av)
-{
- return 0;
-}
-#endif /* CONFIG_IP_DCCP_ACKVEC */
-#endif /* _ACKVEC_H */
diff --git a/net/dccp/ccid.c b/net/dccp/ccid.c
deleted file mode 100644
index ff05e59043cd..000000000000
--- a/net/dccp/ccid.c
+++ /dev/null
@@ -1,256 +0,0 @@
-/*
- * net/dccp/ccid.c
- *
- * An implementation of the DCCP protocol
- * Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- *
- * CCID infrastructure
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include "ccid.h"
-
-static struct ccid_operations *ccids[CCID_MAX];
-#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
-static atomic_t ccids_lockct = ATOMIC_INIT(0);
-static DEFINE_SPINLOCK(ccids_lock);
-
-/*
- * The strategy is: modifications ccids vector are short, do not sleep and
- * veeery rare, but read access should be free of any exclusive locks.
- */
-static void ccids_write_lock(void)
-{
- spin_lock(&ccids_lock);
- while (atomic_read(&ccids_lockct) != 0) {
- spin_unlock(&ccids_lock);
- yield();
- spin_lock(&ccids_lock);
- }
-}
-
-static inline void ccids_write_unlock(void)
-{
- spin_unlock(&ccids_lock);
-}
-
-static inline void ccids_read_lock(void)
-{
- atomic_inc(&ccids_lockct);
- spin_unlock_wait(&ccids_lock);
-}
-
-static inline void ccids_read_unlock(void)
-{
- atomic_dec(&ccids_lockct);
-}
-
-#else
-#define ccids_write_lock() do { } while(0)
-#define ccids_write_unlock() do { } while(0)
-#define ccids_read_lock() do { } while(0)
-#define ccids_read_unlock() do { } while(0)
-#endif
-
-static kmem_cache_t *ccid_kmem_cache_create(int obj_size, const char *fmt,...)
-{
- kmem_cache_t *slab;
- char slab_name_fmt[32], *slab_name;
- va_list args;
-
- va_start(args, fmt);
- vsnprintf(slab_name_fmt, sizeof(slab_name_fmt), fmt, args);
- va_end(args);
-
- slab_name = kstrdup(slab_name_fmt, GFP_KERNEL);
- if (slab_name == NULL)
- return NULL;
- slab = kmem_cache_create(slab_name, sizeof(struct ccid) + obj_size, 0,
- SLAB_HWCACHE_ALIGN, NULL, NULL);
- if (slab == NULL)
- kfree(slab_name);
- return slab;
-}
-
-static void ccid_kmem_cache_destroy(kmem_cache_t *slab)
-{
- if (slab != NULL) {
- const char *name = kmem_cache_name(slab);
-
- kmem_cache_destroy(slab);
- kfree(name);
- }
-}
-
-int ccid_register(struct ccid_operations *ccid_ops)
-{
- int err = -ENOBUFS;
-
- ccid_ops->ccid_hc_rx_slab =
- ccid_kmem_cache_create(ccid_ops->ccid_hc_rx_obj_size,
- "%s_hc_rx_sock",
- ccid_ops->ccid_name);
- if (ccid_ops->ccid_hc_rx_slab == NULL)
- goto out;
-
- ccid_ops->ccid_hc_tx_slab =
- ccid_kmem_cache_create(ccid_ops->ccid_hc_tx_obj_size,
- "%s_hc_tx_sock",
- ccid_ops->ccid_name);
- if (ccid_ops->ccid_hc_tx_slab == NULL)
- goto out_free_rx_slab;
-
- ccids_write_lock();
- err = -EEXIST;
- if (ccids[ccid_ops->ccid_id] == NULL) {
- ccids[ccid_ops->ccid_id] = ccid_ops;
- err = 0;
- }
- ccids_write_unlock();
- if (err != 0)
- goto out_free_tx_slab;
-
- pr_info("CCID: Registered CCID %d (%s)\n",
- ccid_ops->ccid_id, ccid_ops->ccid_name);
-out:
- return err;
-out_free_tx_slab:
- ccid_kmem_cache_destroy(ccid_ops->ccid_hc_tx_slab);
- ccid_ops->ccid_hc_tx_slab = NULL;
- goto out;
-out_free_rx_slab:
- ccid_kmem_cache_destroy(ccid_ops->ccid_hc_rx_slab);
- ccid_ops->ccid_hc_rx_slab = NULL;
- goto out;
-}
-
-EXPORT_SYMBOL_GPL(ccid_register);
-
-int ccid_unregister(struct ccid_operations *ccid_ops)
-{
- ccids_write_lock();
- ccids[ccid_ops->ccid_id] = NULL;
- ccids_write_unlock();
-
- ccid_kmem_cache_destroy(ccid_ops->ccid_hc_tx_slab);
- ccid_ops->ccid_hc_tx_slab = NULL;
- ccid_kmem_cache_destroy(ccid_ops->ccid_hc_rx_slab);
- ccid_ops->ccid_hc_rx_slab = NULL;
-
- pr_info("CCID: Unregistered CCID %d (%s)\n",
- ccid_ops->ccid_id, ccid_ops->ccid_name);
- return 0;
-}
-
-EXPORT_SYMBOL_GPL(ccid_unregister);
-
-struct ccid *ccid_new(unsigned char id, struct sock *sk, int rx, gfp_t gfp)
-{
- struct ccid_operations *ccid_ops;
- struct ccid *ccid = NULL;
-
- ccids_read_lock();
-#ifdef CONFIG_KMOD
- if (ccids[id] == NULL) {
- /* We only try to load if in process context */
- ccids_read_unlock();
- if (gfp & GFP_ATOMIC)
- goto out;
- request_module("net-dccp-ccid-%d", id);
- ccids_read_lock();
- }
-#endif
- ccid_ops = ccids[id];
- if (ccid_ops == NULL)
- goto out_unlock;
-
- if (!try_module_get(ccid_ops->ccid_owner))
- goto out_unlock;
-
- ccids_read_unlock();
-
- ccid = kmem_cache_alloc(rx ? ccid_ops->ccid_hc_rx_slab :
- ccid_ops->ccid_hc_tx_slab, gfp);
- if (ccid == NULL)
- goto out_module_put;
- ccid->ccid_ops = ccid_ops;
- if (rx) {
- memset(ccid + 1, 0, ccid_ops->ccid_hc_rx_obj_size);
- if (ccid->ccid_ops->ccid_hc_rx_init != NULL &&
- ccid->ccid_ops->ccid_hc_rx_init(ccid, sk) != 0)
- goto out_free_ccid;
- } else {
- memset(ccid + 1, 0, ccid_ops->ccid_hc_tx_obj_size);
- if (ccid->ccid_ops->ccid_hc_tx_init != NULL &&
- ccid->ccid_ops->ccid_hc_tx_init(ccid, sk) != 0)
- goto out_free_ccid;
- }
-out:
- return ccid;
-out_unlock:
- ccids_read_unlock();
- goto out;
-out_free_ccid:
- kmem_cache_free(rx ? ccid_ops->ccid_hc_rx_slab :
- ccid_ops->ccid_hc_tx_slab, ccid);
- ccid = NULL;
-out_module_put:
- module_put(ccid_ops->ccid_owner);
- goto out;
-}
-
-EXPORT_SYMBOL_GPL(ccid_new);
-
-struct ccid *ccid_hc_rx_new(unsigned char id, struct sock *sk, gfp_t gfp)
-{
- return ccid_new(id, sk, 1, gfp);
-}
-
-EXPORT_SYMBOL_GPL(ccid_hc_rx_new);
-
-struct ccid *ccid_hc_tx_new(unsigned char id,struct sock *sk, gfp_t gfp)
-{
- return ccid_new(id, sk, 0, gfp);
-}
-
-EXPORT_SYMBOL_GPL(ccid_hc_tx_new);
-
-static void ccid_delete(struct ccid *ccid, struct sock *sk, int rx)
-{
- struct ccid_operations *ccid_ops;
-
- if (ccid == NULL)
- return;
-
- ccid_ops = ccid->ccid_ops;
- if (rx) {
- if (ccid_ops->ccid_hc_rx_exit != NULL)
- ccid_ops->ccid_hc_rx_exit(sk);
- kmem_cache_free(ccid_ops->ccid_hc_rx_slab, ccid);
- } else {
- if (ccid_ops->ccid_hc_tx_exit != NULL)
- ccid_ops->ccid_hc_tx_exit(sk);
- kmem_cache_free(ccid_ops->ccid_hc_tx_slab, ccid);
- }
- ccids_read_lock();
- if (ccids[ccid_ops->ccid_id] != NULL)
- module_put(ccid_ops->ccid_owner);
- ccids_read_unlock();
-}
-
-void ccid_hc_rx_delete(struct ccid *ccid, struct sock *sk)
-{
- ccid_delete(ccid, sk, 1);
-}
-
-EXPORT_SYMBOL_GPL(ccid_hc_rx_delete);
-
-void ccid_hc_tx_delete(struct ccid *ccid, struct sock *sk)
-{
- ccid_delete(ccid, sk, 0);
-}
-
-EXPORT_SYMBOL_GPL(ccid_hc_tx_delete);
diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h
deleted file mode 100644
index c7c29514dce8..000000000000
--- a/net/dccp/ccid.h
+++ /dev/null
@@ -1,200 +0,0 @@
-#ifndef _CCID_H
-#define _CCID_H
-/*
- * net/dccp/ccid.h
- *
- * An implementation of the DCCP protocol
- * Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- *
- * CCID infrastructure
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <net/sock.h>
-#include <linux/compiler.h>
-#include <linux/dccp.h>
-#include <linux/list.h>
-#include <linux/module.h>
-
-#define CCID_MAX 255
-
-struct tcp_info;
-
-struct ccid_operations {
- unsigned char ccid_id;
- const char *ccid_name;
- struct module *ccid_owner;
- kmem_cache_t *ccid_hc_rx_slab;
- __u32 ccid_hc_rx_obj_size;
- kmem_cache_t *ccid_hc_tx_slab;
- __u32 ccid_hc_tx_obj_size;
- int (*ccid_hc_rx_init)(struct ccid *ccid, struct sock *sk);
- int (*ccid_hc_tx_init)(struct ccid *ccid, struct sock *sk);
- void (*ccid_hc_rx_exit)(struct sock *sk);
- void (*ccid_hc_tx_exit)(struct sock *sk);
- void (*ccid_hc_rx_packet_recv)(struct sock *sk,
- struct sk_buff *skb);
- int (*ccid_hc_rx_parse_options)(struct sock *sk,
- unsigned char option,
- unsigned char len, u16 idx,
- unsigned char* value);
- int (*ccid_hc_rx_insert_options)(struct sock *sk,
- struct sk_buff *skb);
- int (*ccid_hc_tx_insert_options)(struct sock *sk,
- struct sk_buff *skb);
- void (*ccid_hc_tx_packet_recv)(struct sock *sk,
- struct sk_buff *skb);
- int (*ccid_hc_tx_parse_options)(struct sock *sk,
- unsigned char option,
- unsigned char len, u16 idx,
- unsigned char* value);
- int (*ccid_hc_tx_send_packet)(struct sock *sk,
- struct sk_buff *skb);
- void (*ccid_hc_tx_packet_sent)(struct sock *sk,
- int more, unsigned int len);
- void (*ccid_hc_rx_get_info)(struct sock *sk,
- struct tcp_info *info);
- void (*ccid_hc_tx_get_info)(struct sock *sk,
- struct tcp_info *info);
- int (*ccid_hc_rx_getsockopt)(struct sock *sk,
- const int optname, int len,
- u32 __user *optval,
- int __user *optlen);
- int (*ccid_hc_tx_getsockopt)(struct sock *sk,
- const int optname, int len,
- u32 __user *optval,
- int __user *optlen);
-};
-
-extern int ccid_register(struct ccid_operations *ccid_ops);
-extern int ccid_unregister(struct ccid_operations *ccid_ops);
-
-struct ccid {
- struct ccid_operations *ccid_ops;
- char ccid_priv[0];
-};
-
-static inline void *ccid_priv(const struct ccid *ccid)
-{
- return (void *)ccid->ccid_priv;
-}
-
-extern struct ccid *ccid_new(unsigned char id, struct sock *sk, int rx,
- gfp_t gfp);
-
-extern struct ccid *ccid_hc_rx_new(unsigned char id, struct sock *sk,
- gfp_t gfp);
-extern struct ccid *ccid_hc_tx_new(unsigned char id, struct sock *sk,
- gfp_t gfp);
-
-extern void ccid_hc_rx_delete(struct ccid *ccid, struct sock *sk);
-extern void ccid_hc_tx_delete(struct ccid *ccid, struct sock *sk);
-
-static inline int ccid_hc_tx_send_packet(struct ccid *ccid, struct sock *sk,
- struct sk_buff *skb)
-{
- int rc = 0;
- if (ccid->ccid_ops->ccid_hc_tx_send_packet != NULL)
- rc = ccid->ccid_ops->ccid_hc_tx_send_packet(sk, skb);
- return rc;
-}
-
-static inline void ccid_hc_tx_packet_sent(struct ccid *ccid, struct sock *sk,
- int more, unsigned int len)
-{
- if (ccid->ccid_ops->ccid_hc_tx_packet_sent != NULL)
- ccid->ccid_ops->ccid_hc_tx_packet_sent(sk, more, len);
-}
-
-static inline void ccid_hc_rx_packet_recv(struct ccid *ccid, struct sock *sk,
- struct sk_buff *skb)
-{
- if (ccid->ccid_ops->ccid_hc_rx_packet_recv != NULL)
- ccid->ccid_ops->ccid_hc_rx_packet_recv(sk, skb);
-}
-
-static inline void ccid_hc_tx_packet_recv(struct ccid *ccid, struct sock *sk,
- struct sk_buff *skb)
-{
- if (ccid->ccid_ops->ccid_hc_tx_packet_recv != NULL)
- ccid->ccid_ops->ccid_hc_tx_packet_recv(sk, skb);
-}
-
-static inline int ccid_hc_tx_parse_options(struct ccid *ccid, struct sock *sk,
- unsigned char option,
- unsigned char len, u16 idx,
- unsigned char* value)
-{
- int rc = 0;
- if (ccid->ccid_ops->ccid_hc_tx_parse_options != NULL)
- rc = ccid->ccid_ops->ccid_hc_tx_parse_options(sk, option, len, idx,
- value);
- return rc;
-}
-
-static inline int ccid_hc_rx_parse_options(struct ccid *ccid, struct sock *sk,
- unsigned char option,
- unsigned char len, u16 idx,
- unsigned char* value)
-{
- int rc = 0;
- if (ccid->ccid_ops->ccid_hc_rx_parse_options != NULL)
- rc = ccid->ccid_ops->ccid_hc_rx_parse_options(sk, option, len, idx, value);
- return rc;
-}
-
-static inline int ccid_hc_tx_insert_options(struct ccid *ccid, struct sock *sk,
- struct sk_buff *skb)
-{
- if (ccid->ccid_ops->ccid_hc_tx_insert_options != NULL)
- return ccid->ccid_ops->ccid_hc_tx_insert_options(sk, skb);
- return 0;
-}
-
-static inline int ccid_hc_rx_insert_options(struct ccid *ccid, struct sock *sk,
- struct sk_buff *skb)
-{
- if (ccid->ccid_ops->ccid_hc_rx_insert_options != NULL)
- return ccid->ccid_ops->ccid_hc_rx_insert_options(sk, skb);
- return 0;
-}
-
-static inline void ccid_hc_rx_get_info(struct ccid *ccid, struct sock *sk,
- struct tcp_info *info)
-{
- if (ccid->ccid_ops->ccid_hc_rx_get_info != NULL)
- ccid->ccid_ops->ccid_hc_rx_get_info(sk, info);
-}
-
-static inline void ccid_hc_tx_get_info(struct ccid *ccid, struct sock *sk,
- struct tcp_info *info)
-{
- if (ccid->ccid_ops->ccid_hc_tx_get_info != NULL)
- ccid->ccid_ops->ccid_hc_tx_get_info(sk, info);
-}
-
-static inline int ccid_hc_rx_getsockopt(struct ccid *ccid, struct sock *sk,
- const int optname, int len,
- u32 __user *optval, int __user *optlen)
-{
- int rc = -ENOPROTOOPT;
- if (ccid->ccid_ops->ccid_hc_rx_getsockopt != NULL)
- rc = ccid->ccid_ops->ccid_hc_rx_getsockopt(sk, optname, len,
- optval, optlen);
- return rc;
-}
-
-static inline int ccid_hc_tx_getsockopt(struct ccid *ccid, struct sock *sk,
- const int optname, int len,
- u32 __user *optval, int __user *optlen)
-{
- int rc = -ENOPROTOOPT;
- if (ccid->ccid_ops->ccid_hc_tx_getsockopt != NULL)
- rc = ccid->ccid_ops->ccid_hc_tx_getsockopt(sk, optname, len,
- optval, optlen);
- return rc;
-}
-#endif /* _CCID_H */
diff --git a/net/dccp/ccids/Kconfig b/net/dccp/ccids/Kconfig
deleted file mode 100644
index dac89166eb18..000000000000
--- a/net/dccp/ccids/Kconfig
+++ /dev/null
@@ -1,92 +0,0 @@
-menu "DCCP CCIDs Configuration (EXPERIMENTAL)"
- depends on IP_DCCP && EXPERIMENTAL
-
-config IP_DCCP_CCID2
- tristate "CCID2 (TCP-Like) (EXPERIMENTAL)"
- depends on IP_DCCP
- def_tristate IP_DCCP
- select IP_DCCP_ACKVEC
- ---help---
- CCID 2, TCP-like Congestion Control, denotes Additive Increase,
- Multiplicative Decrease (AIMD) congestion control with behavior
- modelled directly on TCP, including congestion window, slow start,
- timeouts, and so forth [RFC 2581]. CCID 2 achieves maximum
- bandwidth over the long term, consistent with the use of end-to-end
- congestion control, but halves its congestion window in response to
- each congestion event. This leads to the abrupt rate changes
- typical of TCP. Applications should use CCID 2 if they prefer
- maximum bandwidth utilization to steadiness of rate. This is often
- the case for applications that are not playing their data directly
- to the user. For example, a hypothetical application that
- transferred files over DCCP, using application-level retransmissions
- for lost packets, would prefer CCID 2 to CCID 3. On-line games may
- also prefer CCID 2.
-
- CCID 2 is further described in RFC 4341,
- http://www.ietf.org/rfc/rfc4341.txt
-
- This text was extracted from RFC 4340 (sec. 10.1),
- http://www.ietf.org/rfc/rfc4340.txt
-
- To compile this CCID as a module, choose M here: the module will be
- called dccp_ccid2.
-
- If in doubt, say M.
-
-config IP_DCCP_CCID2_DEBUG
- bool "CCID2 debugging messages"
- depends on IP_DCCP_CCID2
- ---help---
- Enable CCID2-specific debugging messages.
-
- When compiling CCID2 as a module, this debugging output can
- additionally be toggled by setting the ccid2_debug module
- parameter to 0 or 1.
-
- If in doubt, say N.
-
-config IP_DCCP_CCID3
- tristate "CCID3 (TCP-Friendly) (EXPERIMENTAL)"
- depends on IP_DCCP
- def_tristate IP_DCCP
- ---help---
- CCID 3 denotes TCP-Friendly Rate Control (TFRC), an equation-based
- rate-controlled congestion control mechanism. TFRC is designed to
- be reasonably fair when competing for bandwidth with TCP-like flows,
- where a flow is "reasonably fair" if its sending rate is generally
- within a factor of two of the sending rate of a TCP flow under the
- same conditions. However, TFRC has a much lower variation of
- throughput over time compared with TCP, which makes CCID 3 more
- suitable than CCID 2 for applications such streaming media where a
- relatively smooth sending rate is of importance.
-
- CCID 3 is further described in RFC 4342,
- http://www.ietf.org/rfc/rfc4342.txt
-
- The TFRC congestion control algorithms were initially described in
- RFC 3448.
-
- This text was extracted from RFC 4340 (sec. 10.2),
- http://www.ietf.org/rfc/rfc4340.txt
-
- To compile this CCID as a module, choose M here: the module will be
- called dccp_ccid3.
-
- If in doubt, say M.
-
-config IP_DCCP_TFRC_LIB
- depends on IP_DCCP_CCID3
- def_tristate IP_DCCP_CCID3
-
-config IP_DCCP_CCID3_DEBUG
- bool "CCID3 debugging messages"
- depends on IP_DCCP_CCID3
- ---help---
- Enable CCID3-specific debugging messages.
-
- When compiling CCID3 as a module, this debugging output can
- additionally be toggled by setting the ccid3_debug module
- parameter to 0 or 1.
-
- If in doubt, say N.
-endmenu
diff --git a/net/dccp/ccids/Makefile b/net/dccp/ccids/Makefile
deleted file mode 100644
index 438f20bccff7..000000000000
--- a/net/dccp/ccids/Makefile
+++ /dev/null
@@ -1,9 +0,0 @@
-obj-$(CONFIG_IP_DCCP_CCID3) += dccp_ccid3.o
-
-dccp_ccid3-y := ccid3.o
-
-obj-$(CONFIG_IP_DCCP_CCID2) += dccp_ccid2.o
-
-dccp_ccid2-y := ccid2.o
-
-obj-y += lib/
diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c
deleted file mode 100644
index 2555be8f4790..000000000000
--- a/net/dccp/ccids/ccid2.c
+++ /dev/null
@@ -1,857 +0,0 @@
-/*
- * net/dccp/ccids/ccid2.c
- *
- * Copyright (c) 2005, 2006 Andrea Bittau <a.bittau@cs.ucl.ac.uk>
- *
- * Changes to meet Linux coding standards, and DCCP infrastructure fixes.
- *
- * Copyright (c) 2006 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-/*
- * This implementation should follow RFC 4341
- *
- * BUGS:
- * - sequence number wrapping
- */
-
-#include "../ccid.h"
-#include "../dccp.h"
-#include "ccid2.h"
-
-
-#ifdef CONFIG_IP_DCCP_CCID2_DEBUG
-static int ccid2_debug;
-#define ccid2_pr_debug(format, a...) DCCP_PR_DEBUG(ccid2_debug, format, ##a)
-
-static void ccid2_hc_tx_check_sanity(const struct ccid2_hc_tx_sock *hctx)
-{
- int len = 0;
- int pipe = 0;
- struct ccid2_seq *seqp = hctx->ccid2hctx_seqh;
-
- /* there is data in the chain */
- if (seqp != hctx->ccid2hctx_seqt) {
- seqp = seqp->ccid2s_prev;
- len++;
- if (!seqp->ccid2s_acked)
- pipe++;
-
- while (seqp != hctx->ccid2hctx_seqt) {
- struct ccid2_seq *prev = seqp->ccid2s_prev;
-
- len++;
- if (!prev->ccid2s_acked)
- pipe++;
-
- /* packets are sent sequentially */
- BUG_ON(seqp->ccid2s_seq <= prev->ccid2s_seq);
- BUG_ON(time_before(seqp->ccid2s_sent,
- prev->ccid2s_sent));
-
- seqp = prev;
- }
- }
-
- BUG_ON(pipe != hctx->ccid2hctx_pipe);
- ccid2_pr_debug("len of chain=%d\n", len);
-
- do {
- seqp = seqp->ccid2s_prev;
- len++;
- } while (seqp != hctx->ccid2hctx_seqh);
-
- ccid2_pr_debug("total len=%d\n", len);
- BUG_ON(len != hctx->ccid2hctx_seqbufc * CCID2_SEQBUF_LEN);
-}
-#else
-#define ccid2_pr_debug(format, a...)
-#define ccid2_hc_tx_check_sanity(hctx)
-#endif
-
-static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hctx, int num,
- gfp_t gfp)
-{
- struct ccid2_seq *seqp;
- int i;
-
- /* check if we have space to preserve the pointer to the buffer */
- if (hctx->ccid2hctx_seqbufc >= (sizeof(hctx->ccid2hctx_seqbuf) /
- sizeof(struct ccid2_seq*)))
- return -ENOMEM;
-
- /* allocate buffer and initialize linked list */
- seqp = kmalloc(sizeof(*seqp) * num, gfp);
- if (seqp == NULL)
- return -ENOMEM;
-
- for (i = 0; i < (num - 1); i++) {
- seqp[i].ccid2s_next = &seqp[i + 1];
- seqp[i + 1].ccid2s_prev = &seqp[i];
- }
- seqp[num - 1].ccid2s_next = seqp;
- seqp->ccid2s_prev = &seqp[num - 1];
-
- /* This is the first allocation. Initiate the head and tail. */
- if (hctx->ccid2hctx_seqbufc == 0)
- hctx->ccid2hctx_seqh = hctx->ccid2hctx_seqt = seqp;
- else {
- /* link the existing list with the one we just created */
- hctx->ccid2hctx_seqh->ccid2s_next = seqp;
- seqp->ccid2s_prev = hctx->ccid2hctx_seqh;
-
- hctx->ccid2hctx_seqt->ccid2s_prev = &seqp[num - 1];
- seqp[num - 1].ccid2s_next = hctx->ccid2hctx_seqt;
- }
-
- /* store the original pointer to the buffer so we can free it */
- hctx->ccid2hctx_seqbuf[hctx->ccid2hctx_seqbufc] = seqp;
- hctx->ccid2hctx_seqbufc++;
-
- return 0;
-}
-
-static int ccid2_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
-{
- struct ccid2_hc_tx_sock *hctx;
-
- switch (DCCP_SKB_CB(skb)->dccpd_type) {
- case 0: /* XXX data packets from userland come through like this */
- case DCCP_PKT_DATA:
- case DCCP_PKT_DATAACK:
- break;
- /* No congestion control on other packets */
- default:
- return 0;
- }
-
- hctx = ccid2_hc_tx_sk(sk);
-
- ccid2_pr_debug("pipe=%d cwnd=%d\n", hctx->ccid2hctx_pipe,
- hctx->ccid2hctx_cwnd);
-
- if (hctx->ccid2hctx_pipe < hctx->ccid2hctx_cwnd) {
- /* OK we can send... make sure previous packet was sent off */
- if (!hctx->ccid2hctx_sendwait) {
- hctx->ccid2hctx_sendwait = 1;
- return 0;
- }
- }
-
- return 1; /* XXX CCID should dequeue when ready instead of polling */
-}
-
-static void ccid2_change_l_ack_ratio(struct sock *sk, int val)
-{
- struct dccp_sock *dp = dccp_sk(sk);
- /*
- * XXX I don't really agree with val != 2. If cwnd is 1, ack ratio
- * should be 1... it shouldn't be allowed to become 2.
- * -sorbo.
- */
- if (val != 2) {
- const struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
- int max = hctx->ccid2hctx_cwnd / 2;
-
- /* round up */
- if (hctx->ccid2hctx_cwnd & 1)
- max++;
-
- if (val > max)
- val = max;
- }
-
- ccid2_pr_debug("changing local ack ratio to %d\n", val);
- WARN_ON(val <= 0);
- dp->dccps_l_ack_ratio = val;
-}
-
-static void ccid2_change_cwnd(struct ccid2_hc_tx_sock *hctx, int val)
-{
- if (val == 0)
- val = 1;
-
- /* XXX do we need to change ack ratio? */
- ccid2_pr_debug("change cwnd to %d\n", val);
-
- BUG_ON(val < 1);
- hctx->ccid2hctx_cwnd = val;
-}
-
-static void ccid2_change_srtt(struct ccid2_hc_tx_sock *hctx, long val)
-{
- ccid2_pr_debug("change SRTT to %ld\n", val);
- hctx->ccid2hctx_srtt = val;
-}
-
-static void ccid2_change_pipe(struct ccid2_hc_tx_sock *hctx, long val)
-{
- hctx->ccid2hctx_pipe = val;
-}
-
-static void ccid2_start_rto_timer(struct sock *sk);
-
-static void ccid2_hc_tx_rto_expire(unsigned long data)
-{
- struct sock *sk = (struct sock *)data;
- struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
- long s;
-
- bh_lock_sock(sk);
- if (sock_owned_by_user(sk)) {
- sk_reset_timer(sk, &hctx->ccid2hctx_rtotimer,
- jiffies + HZ / 5);
- goto out;
- }
-
- ccid2_pr_debug("RTO_EXPIRE\n");
-
- ccid2_hc_tx_check_sanity(hctx);
-
- /* back-off timer */
- hctx->ccid2hctx_rto <<= 1;
-
- s = hctx->ccid2hctx_rto / HZ;
- if (s > 60)
- hctx->ccid2hctx_rto = 60 * HZ;
-
- ccid2_start_rto_timer(sk);
-
- /* adjust pipe, cwnd etc */
- ccid2_change_pipe(hctx, 0);
- hctx->ccid2hctx_ssthresh = hctx->ccid2hctx_cwnd >> 1;
- if (hctx->ccid2hctx_ssthresh < 2)
- hctx->ccid2hctx_ssthresh = 2;
- ccid2_change_cwnd(hctx, 1);
-
- /* clear state about stuff we sent */
- hctx->ccid2hctx_seqt = hctx->ccid2hctx_seqh;
- hctx->ccid2hctx_ssacks = 0;
- hctx->ccid2hctx_acks = 0;
- hctx->ccid2hctx_sent = 0;
-
- /* clear ack ratio state. */
- hctx->ccid2hctx_arsent = 0;
- hctx->ccid2hctx_ackloss = 0;
- hctx->ccid2hctx_rpseq = 0;
- hctx->ccid2hctx_rpdupack = -1;
- ccid2_change_l_ack_ratio(sk, 1);
- ccid2_hc_tx_check_sanity(hctx);
-out:
- bh_unlock_sock(sk);
- sock_put(sk);
-}
-
-static void ccid2_start_rto_timer(struct sock *sk)
-{
- struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
-
- ccid2_pr_debug("setting RTO timeout=%ld\n", hctx->ccid2hctx_rto);
-
- BUG_ON(timer_pending(&hctx->ccid2hctx_rtotimer));
- sk_reset_timer(sk, &hctx->ccid2hctx_rtotimer,
- jiffies + hctx->ccid2hctx_rto);
-}
-
-static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len)
-{
- struct dccp_sock *dp = dccp_sk(sk);
- struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
- struct ccid2_seq *next;
- u64 seq;
-
- ccid2_hc_tx_check_sanity(hctx);
-
- BUG_ON(!hctx->ccid2hctx_sendwait);
- hctx->ccid2hctx_sendwait = 0;
- ccid2_change_pipe(hctx, hctx->ccid2hctx_pipe + 1);
- BUG_ON(hctx->ccid2hctx_pipe < 0);
-
- /* There is an issue. What if another packet is sent between
- * packet_send() and packet_sent(). Then the sequence number would be
- * wrong.
- * -sorbo.
- */
- seq = dp->dccps_gss;
-
- hctx->ccid2hctx_seqh->ccid2s_seq = seq;
- hctx->ccid2hctx_seqh->ccid2s_acked = 0;
- hctx->ccid2hctx_seqh->ccid2s_sent = jiffies;
-
- next = hctx->ccid2hctx_seqh->ccid2s_next;
- /* check if we need to alloc more space */
- if (next == hctx->ccid2hctx_seqt) {
- int rc;
-
- ccid2_pr_debug("allocating more space in history\n");
- rc = ccid2_hc_tx_alloc_seq(hctx, CCID2_SEQBUF_LEN, GFP_KERNEL);
- BUG_ON(rc); /* XXX what do we do? */
-
- next = hctx->ccid2hctx_seqh->ccid2s_next;
- BUG_ON(next == hctx->ccid2hctx_seqt);
- }
- hctx->ccid2hctx_seqh = next;
-
- ccid2_pr_debug("cwnd=%d pipe=%d\n", hctx->ccid2hctx_cwnd,
- hctx->ccid2hctx_pipe);
-
- hctx->ccid2hctx_sent++;
-
- /* Ack Ratio. Need to maintain a concept of how many windows we sent */
- hctx->ccid2hctx_arsent++;
- /* We had an ack loss in this window... */
- if (hctx->ccid2hctx_ackloss) {
- if (hctx->ccid2hctx_arsent >= hctx->ccid2hctx_cwnd) {
- hctx->ccid2hctx_arsent = 0;
- hctx->ccid2hctx_ackloss = 0;
- }
- } else {
- /* No acks lost up to now... */
- /* decrease ack ratio if enough packets were sent */
- if (dp->dccps_l_ack_ratio > 1) {
- /* XXX don't calculate denominator each time */
- int denom = dp->dccps_l_ack_ratio * dp->dccps_l_ack_ratio -
- dp->dccps_l_ack_ratio;
-
- denom = hctx->ccid2hctx_cwnd * hctx->ccid2hctx_cwnd / denom;
-
- if (hctx->ccid2hctx_arsent >= denom) {
- ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio - 1);
- hctx->ccid2hctx_arsent = 0;
- }
- } else {
- /* we can't increase ack ratio further [1] */
- hctx->ccid2hctx_arsent = 0; /* or maybe set it to cwnd*/
- }
- }
-
- /* setup RTO timer */
- if (!timer_pending(&hctx->ccid2hctx_rtotimer))
- ccid2_start_rto_timer(sk);
-
-#ifdef CONFIG_IP_DCCP_CCID2_DEBUG
- ccid2_pr_debug("pipe=%d\n", hctx->ccid2hctx_pipe);
- ccid2_pr_debug("Sent: seq=%llu\n", (unsigned long long)seq);
- do {
- struct ccid2_seq *seqp = hctx->ccid2hctx_seqt;
-
- while (seqp != hctx->ccid2hctx_seqh) {
- ccid2_pr_debug("out seq=%llu acked=%d time=%lu\n",
- (unsigned long long)seqp->ccid2s_seq,
- seqp->ccid2s_acked, seqp->ccid2s_sent);
- seqp = seqp->ccid2s_next;
- }
- } while (0);
- ccid2_pr_debug("=========\n");
- ccid2_hc_tx_check_sanity(hctx);
-#endif
-}
-
-/* XXX Lame code duplication!
- * returns -1 if none was found.
- * else returns the next offset to use in the function call.
- */
-static int ccid2_ackvector(struct sock *sk, struct sk_buff *skb, int offset,
- unsigned char **vec, unsigned char *veclen)
-{
- const struct dccp_hdr *dh = dccp_hdr(skb);
- unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb);
- unsigned char *opt_ptr;
- const unsigned char *opt_end = (unsigned char *)dh +
- (dh->dccph_doff * 4);
- unsigned char opt, len;
- unsigned char *value;
-
- BUG_ON(offset < 0);
- options += offset;
- opt_ptr = options;
- if (opt_ptr >= opt_end)
- return -1;
-
- while (opt_ptr != opt_end) {
- opt = *opt_ptr++;
- len = 0;
- value = NULL;
-
- /* Check if this isn't a single byte option */
- if (opt > DCCPO_MAX_RESERVED) {
- if (opt_ptr == opt_end)
- goto out_invalid_option;
-
- len = *opt_ptr++;
- if (len < 3)
- goto out_invalid_option;
- /*
- * Remove the type and len fields, leaving
- * just the value size
- */
- len -= 2;
- value = opt_ptr;
- opt_ptr += len;
-
- if (opt_ptr > opt_end)
- goto out_invalid_option;
- }
-
- switch (opt) {
- case DCCPO_ACK_VECTOR_0:
- case DCCPO_ACK_VECTOR_1:
- *vec = value;
- *veclen = len;
- return offset + (opt_ptr - options);
- }
- }
-
- return -1;
-
-out_invalid_option:
- DCCP_BUG("Invalid option - this should not happen (previous parsing)!");
- return -1;
-}
-
-static void ccid2_hc_tx_kill_rto_timer(struct sock *sk)
-{
- struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
-
- sk_stop_timer(sk, &hctx->ccid2hctx_rtotimer);
- ccid2_pr_debug("deleted RTO timer\n");
-}
-
-static inline void ccid2_new_ack(struct sock *sk,
- struct ccid2_seq *seqp,
- unsigned int *maxincr)
-{
- struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
-
- /* slow start */
- if (hctx->ccid2hctx_cwnd < hctx->ccid2hctx_ssthresh) {
- hctx->ccid2hctx_acks = 0;
-
- /* We can increase cwnd at most maxincr [ack_ratio/2] */
- if (*maxincr) {
- /* increase every 2 acks */
- hctx->ccid2hctx_ssacks++;
- if (hctx->ccid2hctx_ssacks == 2) {
- ccid2_change_cwnd(hctx, hctx->ccid2hctx_cwnd+1);
- hctx->ccid2hctx_ssacks = 0;
- *maxincr = *maxincr - 1;
- }
- } else {
- /* increased cwnd enough for this single ack */
- hctx->ccid2hctx_ssacks = 0;
- }
- } else {
- hctx->ccid2hctx_ssacks = 0;
- hctx->ccid2hctx_acks++;
-
- if (hctx->ccid2hctx_acks >= hctx->ccid2hctx_cwnd) {
- ccid2_change_cwnd(hctx, hctx->ccid2hctx_cwnd + 1);
- hctx->ccid2hctx_acks = 0;
- }
- }
-
- /* update RTO */
- if (hctx->ccid2hctx_srtt == -1 ||
- time_after(jiffies, hctx->ccid2hctx_lastrtt + hctx->ccid2hctx_srtt)) {
- unsigned long r = (long)jiffies - (long)seqp->ccid2s_sent;
- int s;
-
- /* first measurement */
- if (hctx->ccid2hctx_srtt == -1) {
- ccid2_pr_debug("R: %lu Time=%lu seq=%llu\n",
- r, jiffies,
- (unsigned long long)seqp->ccid2s_seq);
- ccid2_change_srtt(hctx, r);
- hctx->ccid2hctx_rttvar = r >> 1;
- } else {
- /* RTTVAR */
- long tmp = hctx->ccid2hctx_srtt - r;
- long srtt;
-
- if (tmp < 0)
- tmp *= -1;
-
- tmp >>= 2;
- hctx->ccid2hctx_rttvar *= 3;
- hctx->ccid2hctx_rttvar >>= 2;
- hctx->ccid2hctx_rttvar += tmp;
-
- /* SRTT */
- srtt = hctx->ccid2hctx_srtt;
- srtt *= 7;
- srtt >>= 3;
- tmp = r >> 3;
- srtt += tmp;
- ccid2_change_srtt(hctx, srtt);
- }
- s = hctx->ccid2hctx_rttvar << 2;
- /* clock granularity is 1 when based on jiffies */
- if (!s)
- s = 1;
- hctx->ccid2hctx_rto = hctx->ccid2hctx_srtt + s;
-
- /* must be at least a second */
- s = hctx->ccid2hctx_rto / HZ;
- /* DCCP doesn't require this [but I like it cuz my code sux] */
-#if 1
- if (s < 1)
- hctx->ccid2hctx_rto = HZ;
-#endif
- /* max 60 seconds */
- if (s > 60)
- hctx->ccid2hctx_rto = HZ * 60;
-
- hctx->ccid2hctx_lastrtt = jiffies;
-
- ccid2_pr_debug("srtt: %ld rttvar: %ld rto: %ld (HZ=%d) R=%lu\n",
- hctx->ccid2hctx_srtt, hctx->ccid2hctx_rttvar,
- hctx->ccid2hctx_rto, HZ, r);
- hctx->ccid2hctx_sent = 0;
- }
-
- /* we got a new ack, so re-start RTO timer */
- ccid2_hc_tx_kill_rto_timer(sk);
- ccid2_start_rto_timer(sk);
-}
-
-static void ccid2_hc_tx_dec_pipe(struct sock *sk)
-{
- struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
-
- ccid2_change_pipe(hctx, hctx->ccid2hctx_pipe-1);
- BUG_ON(hctx->ccid2hctx_pipe < 0);
-
- if (hctx->ccid2hctx_pipe == 0)
- ccid2_hc_tx_kill_rto_timer(sk);
-}
-
-static void ccid2_congestion_event(struct ccid2_hc_tx_sock *hctx,
- struct ccid2_seq *seqp)
-{
- if (time_before(seqp->ccid2s_sent, hctx->ccid2hctx_last_cong)) {
- ccid2_pr_debug("Multiple losses in an RTT---treating as one\n");
- return;
- }
-
- hctx->ccid2hctx_last_cong = jiffies;
-
- ccid2_change_cwnd(hctx, hctx->ccid2hctx_cwnd >> 1);
- hctx->ccid2hctx_ssthresh = hctx->ccid2hctx_cwnd;
- if (hctx->ccid2hctx_ssthresh < 2)
- hctx->ccid2hctx_ssthresh = 2;
-}
-
-static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
-{
- struct dccp_sock *dp = dccp_sk(sk);
- struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
- u64 ackno, seqno;
- struct ccid2_seq *seqp;
- unsigned char *vector;
- unsigned char veclen;
- int offset = 0;
- int done = 0;
- unsigned int maxincr = 0;
-
- ccid2_hc_tx_check_sanity(hctx);
- /* check reverse path congestion */
- seqno = DCCP_SKB_CB(skb)->dccpd_seq;
-
- /* XXX this whole "algorithm" is broken. Need to fix it to keep track
- * of the seqnos of the dupacks so that rpseq and rpdupack are correct
- * -sorbo.
- */
- /* need to bootstrap */
- if (hctx->ccid2hctx_rpdupack == -1) {
- hctx->ccid2hctx_rpdupack = 0;
- hctx->ccid2hctx_rpseq = seqno;
- } else {
- /* check if packet is consecutive */
- if ((hctx->ccid2hctx_rpseq + 1) == seqno)
- hctx->ccid2hctx_rpseq++;
- /* it's a later packet */
- else if (after48(seqno, hctx->ccid2hctx_rpseq)) {
- hctx->ccid2hctx_rpdupack++;
-
- /* check if we got enough dupacks */
- if (hctx->ccid2hctx_rpdupack >=
- hctx->ccid2hctx_numdupack) {
- hctx->ccid2hctx_rpdupack = -1; /* XXX lame */
- hctx->ccid2hctx_rpseq = 0;
-
- ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio << 1);
- }
- }
- }
-
- /* check forward path congestion */
- /* still didn't send out new data packets */
- if (hctx->ccid2hctx_seqh == hctx->ccid2hctx_seqt)
- return;
-
- switch (DCCP_SKB_CB(skb)->dccpd_type) {
- case DCCP_PKT_ACK:
- case DCCP_PKT_DATAACK:
- break;
- default:
- return;
- }
-
- ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq;
- if (after48(ackno, hctx->ccid2hctx_high_ack))
- hctx->ccid2hctx_high_ack = ackno;
-
- seqp = hctx->ccid2hctx_seqt;
- while (before48(seqp->ccid2s_seq, ackno)) {
- seqp = seqp->ccid2s_next;
- if (seqp == hctx->ccid2hctx_seqh) {
- seqp = hctx->ccid2hctx_seqh->ccid2s_prev;
- break;
- }
- }
-
- /* If in slow-start, cwnd can increase at most Ack Ratio / 2 packets for
- * this single ack. I round up.
- * -sorbo.
- */
- maxincr = dp->dccps_l_ack_ratio >> 1;
- maxincr++;
-
- /* go through all ack vectors */
- while ((offset = ccid2_ackvector(sk, skb, offset,
- &vector, &veclen)) != -1) {
- /* go through this ack vector */
- while (veclen--) {
- const u8 rl = *vector & DCCP_ACKVEC_LEN_MASK;
- u64 ackno_end_rl;
-
- dccp_set_seqno(&ackno_end_rl, ackno - rl);
- ccid2_pr_debug("ackvec start:%llu end:%llu\n",
- (unsigned long long)ackno,
- (unsigned long long)ackno_end_rl);
- /* if the seqno we are analyzing is larger than the
- * current ackno, then move towards the tail of our
- * seqnos.
- */
- while (after48(seqp->ccid2s_seq, ackno)) {
- if (seqp == hctx->ccid2hctx_seqt) {
- done = 1;
- break;
- }
- seqp = seqp->ccid2s_prev;
- }
- if (done)
- break;
-
- /* check all seqnos in the range of the vector
- * run length
- */
- while (between48(seqp->ccid2s_seq,ackno_end_rl,ackno)) {
- const u8 state = *vector &
- DCCP_ACKVEC_STATE_MASK;
-
- /* new packet received or marked */
- if (state != DCCP_ACKVEC_STATE_NOT_RECEIVED &&
- !seqp->ccid2s_acked) {
- if (state ==
- DCCP_ACKVEC_STATE_ECN_MARKED) {
- ccid2_congestion_event(hctx,
- seqp);
- } else
- ccid2_new_ack(sk, seqp,
- &maxincr);
-
- seqp->ccid2s_acked = 1;
- ccid2_pr_debug("Got ack for %llu\n",
- (unsigned long long)seqp->ccid2s_seq);
- ccid2_hc_tx_dec_pipe(sk);
- }
- if (seqp == hctx->ccid2hctx_seqt) {
- done = 1;
- break;
- }
- seqp = seqp->ccid2s_next;
- }
- if (done)
- break;
-
-
- dccp_set_seqno(&ackno, ackno_end_rl - 1);
- vector++;
- }
- if (done)
- break;
- }
-
- /* The state about what is acked should be correct now
- * Check for NUMDUPACK
- */
- seqp = hctx->ccid2hctx_seqt;
- while (before48(seqp->ccid2s_seq, hctx->ccid2hctx_high_ack)) {
- seqp = seqp->ccid2s_next;
- if (seqp == hctx->ccid2hctx_seqh) {
- seqp = hctx->ccid2hctx_seqh->ccid2s_prev;
- break;
- }
- }
- done = 0;
- while (1) {
- if (seqp->ccid2s_acked) {
- done++;
- if (done == hctx->ccid2hctx_numdupack)
- break;
- }
- if (seqp == hctx->ccid2hctx_seqt)
- break;
- seqp = seqp->ccid2s_prev;
- }
-
- /* If there are at least 3 acknowledgements, anything unacknowledged
- * below the last sequence number is considered lost
- */
- if (done == hctx->ccid2hctx_numdupack) {
- struct ccid2_seq *last_acked = seqp;
-
- /* check for lost packets */
- while (1) {
- if (!seqp->ccid2s_acked) {
- ccid2_pr_debug("Packet lost: %llu\n",
- (unsigned long long)seqp->ccid2s_seq);
- /* XXX need to traverse from tail -> head in
- * order to detect multiple congestion events in
- * one ack vector.
- */
- ccid2_congestion_event(hctx, seqp);
- ccid2_hc_tx_dec_pipe(sk);
- }
- if (seqp == hctx->ccid2hctx_seqt)
- break;
- seqp = seqp->ccid2s_prev;
- }
-
- hctx->ccid2hctx_seqt = last_acked;
- }
-
- /* trim acked packets in tail */
- while (hctx->ccid2hctx_seqt != hctx->ccid2hctx_seqh) {
- if (!hctx->ccid2hctx_seqt->ccid2s_acked)
- break;
-
- hctx->ccid2hctx_seqt = hctx->ccid2hctx_seqt->ccid2s_next;
- }
-
- ccid2_hc_tx_check_sanity(hctx);
-}
-
-static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk)
-{
- struct ccid2_hc_tx_sock *hctx = ccid_priv(ccid);
-
- ccid2_change_cwnd(hctx, 1);
- /* Initialize ssthresh to infinity. This means that we will exit the
- * initial slow-start after the first packet loss. This is what we
- * want.
- */
- hctx->ccid2hctx_ssthresh = ~0;
- hctx->ccid2hctx_numdupack = 3;
- hctx->ccid2hctx_seqbufc = 0;
-
- /* XXX init ~ to window size... */
- if (ccid2_hc_tx_alloc_seq(hctx, CCID2_SEQBUF_LEN, GFP_ATOMIC) != 0)
- return -ENOMEM;
-
- hctx->ccid2hctx_sent = 0;
- hctx->ccid2hctx_rto = 3 * HZ;
- ccid2_change_srtt(hctx, -1);
- hctx->ccid2hctx_rttvar = -1;
- hctx->ccid2hctx_lastrtt = 0;
- hctx->ccid2hctx_rpdupack = -1;
- hctx->ccid2hctx_last_cong = jiffies;
- hctx->ccid2hctx_high_ack = 0;
-
- hctx->ccid2hctx_rtotimer.function = &ccid2_hc_tx_rto_expire;
- hctx->ccid2hctx_rtotimer.data = (unsigned long)sk;
- init_timer(&hctx->ccid2hctx_rtotimer);
-
- ccid2_hc_tx_check_sanity(hctx);
- return 0;
-}
-
-static void ccid2_hc_tx_exit(struct sock *sk)
-{
- struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
- int i;
-
- ccid2_hc_tx_kill_rto_timer(sk);
-
- for (i = 0; i < hctx->ccid2hctx_seqbufc; i++)
- kfree(hctx->ccid2hctx_seqbuf[i]);
- hctx->ccid2hctx_seqbufc = 0;
-}
-
-static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
-{
- const struct dccp_sock *dp = dccp_sk(sk);
- struct ccid2_hc_rx_sock *hcrx = ccid2_hc_rx_sk(sk);
-
- switch (DCCP_SKB_CB(skb)->dccpd_type) {
- case DCCP_PKT_DATA:
- case DCCP_PKT_DATAACK:
- hcrx->ccid2hcrx_data++;
- if (hcrx->ccid2hcrx_data >= dp->dccps_r_ack_ratio) {
- dccp_send_ack(sk);
- hcrx->ccid2hcrx_data = 0;
- }
- break;
- }
-}
-
-static struct ccid_operations ccid2 = {
- .ccid_id = DCCPC_CCID2,
- .ccid_name = "ccid2",
- .ccid_owner = THIS_MODULE,
- .ccid_hc_tx_obj_size = sizeof(struct ccid2_hc_tx_sock),
- .ccid_hc_tx_init = ccid2_hc_tx_init,
- .ccid_hc_tx_exit = ccid2_hc_tx_exit,
- .ccid_hc_tx_send_packet = ccid2_hc_tx_send_packet,
- .ccid_hc_tx_packet_sent = ccid2_hc_tx_packet_sent,
- .ccid_hc_tx_packet_recv = ccid2_hc_tx_packet_recv,
- .ccid_hc_rx_obj_size = sizeof(struct ccid2_hc_rx_sock),
- .ccid_hc_rx_packet_recv = ccid2_hc_rx_packet_recv,
-};
-
-#ifdef CONFIG_IP_DCCP_CCID2_DEBUG
-module_param(ccid2_debug, int, 0444);
-MODULE_PARM_DESC(ccid2_debug, "Enable debug messages");
-#endif
-
-static __init int ccid2_module_init(void)
-{
- return ccid_register(&ccid2);
-}
-module_init(ccid2_module_init);
-
-static __exit void ccid2_module_exit(void)
-{
- ccid_unregister(&ccid2);
-}
-module_exit(ccid2_module_exit);
-
-MODULE_AUTHOR("Andrea Bittau <a.bittau@cs.ucl.ac.uk>");
-MODULE_DESCRIPTION("DCCP TCP-Like (CCID2) CCID");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS("net-dccp-ccid-2");
diff --git a/net/dccp/ccids/ccid2.h b/net/dccp/ccids/ccid2.h
deleted file mode 100644
index ebd79499c85a..000000000000
--- a/net/dccp/ccids/ccid2.h
+++ /dev/null
@@ -1,91 +0,0 @@
-/*
- * net/dccp/ccids/ccid2.h
- *
- * Copyright (c) 2005 Andrea Bittau <a.bittau@cs.ucl.ac.uk>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-#ifndef _DCCP_CCID2_H_
-#define _DCCP_CCID2_H_
-
-#include <linux/dccp.h>
-#include <linux/timer.h>
-#include <linux/types.h>
-#include "../ccid.h"
-
-struct sock;
-
-struct ccid2_seq {
- u64 ccid2s_seq;
- unsigned long ccid2s_sent;
- int ccid2s_acked;
- struct ccid2_seq *ccid2s_prev;
- struct ccid2_seq *ccid2s_next;
-};
-
-#define CCID2_SEQBUF_LEN 1024
-#define CCID2_SEQBUF_MAX 128
-
-/** struct ccid2_hc_tx_sock - CCID2 TX half connection
- *
- * @ccid2hctx_ssacks - ACKs recv in slow start
- * @ccid2hctx_acks - ACKS recv in AI phase
- * @ccid2hctx_sent - packets sent in this window
- * @ccid2hctx_lastrtt -time RTT was last measured
- * @ccid2hctx_arsent - packets sent [ack ratio]
- * @ccid2hctx_ackloss - ack was lost in this win
- * @ccid2hctx_rpseq - last consecutive seqno
- * @ccid2hctx_rpdupack - dupacks since rpseq
-*/
-struct ccid2_hc_tx_sock {
- int ccid2hctx_cwnd;
- int ccid2hctx_ssacks;
- int ccid2hctx_acks;
- unsigned int ccid2hctx_ssthresh;
- int ccid2hctx_pipe;
- int ccid2hctx_numdupack;
- struct ccid2_seq *ccid2hctx_seqbuf[CCID2_SEQBUF_MAX];
- int ccid2hctx_seqbufc;
- struct ccid2_seq *ccid2hctx_seqh;
- struct ccid2_seq *ccid2hctx_seqt;
- long ccid2hctx_rto;
- long ccid2hctx_srtt;
- long ccid2hctx_rttvar;
- int ccid2hctx_sent;
- unsigned long ccid2hctx_lastrtt;
- struct timer_list ccid2hctx_rtotimer;
- unsigned long ccid2hctx_arsent;
- int ccid2hctx_ackloss;
- u64 ccid2hctx_rpseq;
- int ccid2hctx_rpdupack;
- int ccid2hctx_sendwait;
- unsigned long ccid2hctx_last_cong;
- u64 ccid2hctx_high_ack;
-};
-
-struct ccid2_hc_rx_sock {
- int ccid2hcrx_data;
-};
-
-static inline struct ccid2_hc_tx_sock *ccid2_hc_tx_sk(const struct sock *sk)
-{
- return ccid_priv(dccp_sk(sk)->dccps_hc_tx_ccid);
-}
-
-static inline struct ccid2_hc_rx_sock *ccid2_hc_rx_sk(const struct sock *sk)
-{
- return ccid_priv(dccp_sk(sk)->dccps_hc_rx_ccid);
-}
-#endif /* _DCCP_CCID2_H_ */
diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c
deleted file mode 100644
index 70ebe705eb75..000000000000
--- a/net/dccp/ccids/ccid3.c
+++ /dev/null
@@ -1,1266 +0,0 @@
-/*
- * net/dccp/ccids/ccid3.c
- *
- * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand.
- * Copyright (c) 2005-6 Ian McDonald <ian.mcdonald@jandi.co.nz>
- *
- * An implementation of the DCCP protocol
- *
- * This code has been developed by the University of Waikato WAND
- * research group. For further information please see http://www.wand.net.nz/
- *
- * This code also uses code from Lulea University, rereleased as GPL by its
- * authors:
- * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
- *
- * Changes to meet Linux coding standards, to make it meet latest ccid3 draft
- * and to make it work as a loadable module in the DCCP stack written by
- * Arnaldo Carvalho de Melo <acme@conectiva.com.br>.
- *
- * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#include "../ccid.h"
-#include "../dccp.h"
-#include "lib/packet_history.h"
-#include "lib/loss_interval.h"
-#include "lib/tfrc.h"
-#include "ccid3.h"
-
-/*
- * Reason for maths here is to avoid 32 bit overflow when a is big.
- * With this we get close to the limit.
- */
-static u32 usecs_div(const u32 a, const u32 b)
-{
- const u32 div = a < (UINT_MAX / (USEC_PER_SEC / 10)) ? 10 :
- a < (UINT_MAX / (USEC_PER_SEC / 50)) ? 50 :
- a < (UINT_MAX / (USEC_PER_SEC / 100)) ? 100 :
- a < (UINT_MAX / (USEC_PER_SEC / 500)) ? 500 :
- a < (UINT_MAX / (USEC_PER_SEC / 1000)) ? 1000 :
- a < (UINT_MAX / (USEC_PER_SEC / 5000)) ? 5000 :
- a < (UINT_MAX / (USEC_PER_SEC / 10000)) ? 10000 :
- a < (UINT_MAX / (USEC_PER_SEC / 50000)) ? 50000 :
- 100000;
- const u32 tmp = a * (USEC_PER_SEC / div);
- return (b >= 2 * div) ? tmp / (b / div) : tmp;
-}
-
-
-
-#ifdef CONFIG_IP_DCCP_CCID3_DEBUG
-static int ccid3_debug;
-#define ccid3_pr_debug(format, a...) DCCP_PR_DEBUG(ccid3_debug, format, ##a)
-#else
-#define ccid3_pr_debug(format, a...)
-#endif
-
-static struct dccp_tx_hist *ccid3_tx_hist;
-static struct dccp_rx_hist *ccid3_rx_hist;
-static struct dccp_li_hist *ccid3_li_hist;
-
-#ifdef CONFIG_IP_DCCP_CCID3_DEBUG
-static const char *ccid3_tx_state_name(enum ccid3_hc_tx_states state)
-{
- static char *ccid3_state_names[] = {
- [TFRC_SSTATE_NO_SENT] = "NO_SENT",
- [TFRC_SSTATE_NO_FBACK] = "NO_FBACK",
- [TFRC_SSTATE_FBACK] = "FBACK",
- [TFRC_SSTATE_TERM] = "TERM",
- };
-
- return ccid3_state_names[state];
-}
-#endif
-
-static void ccid3_hc_tx_set_state(struct sock *sk,
- enum ccid3_hc_tx_states state)
-{
- struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
- enum ccid3_hc_tx_states oldstate = hctx->ccid3hctx_state;
-
- ccid3_pr_debug("%s(%p) %-8.8s -> %s\n",
- dccp_role(sk), sk, ccid3_tx_state_name(oldstate),
- ccid3_tx_state_name(state));
- WARN_ON(state == oldstate);
- hctx->ccid3hctx_state = state;
-}
-
-/*
- * Recalculate scheduled nominal send time t_nom, inter-packet interval
- * t_ipi, and delta value. Should be called after each change to X.
- */
-static inline void ccid3_update_send_time(struct ccid3_hc_tx_sock *hctx)
-{
- timeval_sub_usecs(&hctx->ccid3hctx_t_nom, hctx->ccid3hctx_t_ipi);
-
- /* Calculate new t_ipi (inter packet interval) by t_ipi = s / X_inst */
- hctx->ccid3hctx_t_ipi = usecs_div(hctx->ccid3hctx_s, hctx->ccid3hctx_x);
-
- /* Update nominal send time with regard to the new t_ipi */
- timeval_add_usecs(&hctx->ccid3hctx_t_nom, hctx->ccid3hctx_t_ipi);
-
- /* Calculate new delta by delta = min(t_ipi / 2, t_gran / 2) */
- hctx->ccid3hctx_delta = min_t(u32, hctx->ccid3hctx_t_ipi / 2,
- TFRC_OPSYS_HALF_TIME_GRAN);
-}
-/*
- * Update X by
- * If (p > 0)
- * x_calc = calcX(s, R, p);
- * X = max(min(X_calc, 2 * X_recv), s / t_mbi);
- * Else
- * If (now - tld >= R)
- * X = max(min(2 * X, 2 * X_recv), s / R);
- * tld = now;
- */
-static void ccid3_hc_tx_update_x(struct sock *sk, struct timeval *now)
-
-{
- struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
- const __u32 old_x = hctx->ccid3hctx_x;
-
- /* To avoid large error in calcX */
- if (hctx->ccid3hctx_p >= TFRC_SMALLEST_P) {
- hctx->ccid3hctx_x_calc = tfrc_calc_x(hctx->ccid3hctx_s,
- hctx->ccid3hctx_rtt,
- hctx->ccid3hctx_p);
- hctx->ccid3hctx_x = max_t(u32, min(hctx->ccid3hctx_x_calc,
- hctx->ccid3hctx_x_recv * 2),
- hctx->ccid3hctx_s / TFRC_T_MBI);
-
- } else if (timeval_delta(now, &hctx->ccid3hctx_t_ld) >=
- hctx->ccid3hctx_rtt) {
- hctx->ccid3hctx_x = max(min(hctx->ccid3hctx_x_recv,
- hctx->ccid3hctx_x ) * 2,
- usecs_div(hctx->ccid3hctx_s,
- hctx->ccid3hctx_rtt) );
- hctx->ccid3hctx_t_ld = *now;
- } else
- ccid3_pr_debug("Not changing X\n");
-
- if (hctx->ccid3hctx_x != old_x)
- ccid3_update_send_time(hctx);
-}
-
-/*
- * Track the mean packet size `s' (cf. RFC 4342, 5.3 and RFC 3448, 4.1)
- * @len: DCCP packet payload size in bytes
- */
-static inline void ccid3_hc_tx_update_s(struct ccid3_hc_tx_sock *hctx, int len)
-{
- if (unlikely(len == 0))
- ccid3_pr_debug("Packet payload length is 0 - not updating\n");
- else
- hctx->ccid3hctx_s = hctx->ccid3hctx_s == 0 ? len :
- (9 * hctx->ccid3hctx_s + len) / 10;
- /*
- * Note: We could do a potential optimisation here - when `s' changes,
- * recalculate sending rate and consequently t_ipi, t_delta, and
- * t_now. This is however non-standard, and the benefits are not
- * clear, so it is currently left out.
- */
-}
-
-static void ccid3_hc_tx_no_feedback_timer(unsigned long data)
-{
- struct sock *sk = (struct sock *)data;
- struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
- unsigned long t_nfb = USEC_PER_SEC / 5;
-
- bh_lock_sock(sk);
- if (sock_owned_by_user(sk)) {
- /* Try again later. */
- /* XXX: set some sensible MIB */
- goto restart_timer;
- }
-
- ccid3_pr_debug("%s, sk=%p, state=%s\n", dccp_role(sk), sk,
- ccid3_tx_state_name(hctx->ccid3hctx_state));
-
- switch (hctx->ccid3hctx_state) {
- case TFRC_SSTATE_NO_FBACK:
- /* RFC 3448, 4.4: Halve send rate directly */
- hctx->ccid3hctx_x = min_t(u32, hctx->ccid3hctx_x / 2,
- hctx->ccid3hctx_s / TFRC_T_MBI);
-
- ccid3_pr_debug("%s, sk=%p, state=%s, updated tx rate to %d "
- "bytes/s\n",
- dccp_role(sk), sk,
- ccid3_tx_state_name(hctx->ccid3hctx_state),
- hctx->ccid3hctx_x);
- /* The value of R is still undefined and so we can not recompute
- * the timout value. Keep initial value as per [RFC 4342, 5]. */
- t_nfb = TFRC_INITIAL_TIMEOUT;
- ccid3_update_send_time(hctx);
- break;
- case TFRC_SSTATE_FBACK:
- /*
- * Check if IDLE since last timeout and recv rate is less than
- * 4 packets per RTT
- */
- if (!hctx->ccid3hctx_idle ||
- (hctx->ccid3hctx_x_recv >=
- 4 * usecs_div(hctx->ccid3hctx_s, hctx->ccid3hctx_rtt))) {
- struct timeval now;
-
- ccid3_pr_debug("%s, sk=%p, state=%s, not idle\n",
- dccp_role(sk), sk,
- ccid3_tx_state_name(hctx->ccid3hctx_state));
- /* Halve sending rate */
-
- /* If (X_calc > 2 * X_recv)
- * X_recv = max(X_recv / 2, s / (2 * t_mbi));
- * Else
- * X_recv = X_calc / 4;
- */
- BUG_ON(hctx->ccid3hctx_p >= TFRC_SMALLEST_P &&
- hctx->ccid3hctx_x_calc == 0);
-
- /* check also if p is zero -> x_calc is infinity? */
- if (hctx->ccid3hctx_p < TFRC_SMALLEST_P ||
- hctx->ccid3hctx_x_calc > 2 * hctx->ccid3hctx_x_recv)
- hctx->ccid3hctx_x_recv = max_t(u32, hctx->ccid3hctx_x_recv / 2,
- hctx->ccid3hctx_s / (2 * TFRC_T_MBI));
- else
- hctx->ccid3hctx_x_recv = hctx->ccid3hctx_x_calc / 4;
-
- /* Update sending rate */
- dccp_timestamp(sk, &now);
- ccid3_hc_tx_update_x(sk, &now);
- }
- /*
- * Schedule no feedback timer to expire in
- * max(4 * R, 2 * s/X) = max(4 * R, 2 * t_ipi)
- */
- t_nfb = max(4 * hctx->ccid3hctx_rtt, 2 * hctx->ccid3hctx_t_ipi);
- break;
- case TFRC_SSTATE_NO_SENT:
- DCCP_BUG("Illegal %s state NO_SENT, sk=%p", dccp_role(sk), sk);
- /* fall through */
- case TFRC_SSTATE_TERM:
- goto out;
- }
-
- hctx->ccid3hctx_idle = 1;
-
-restart_timer:
- sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer,
- jiffies + usecs_to_jiffies(t_nfb));
-out:
- bh_unlock_sock(sk);
- sock_put(sk);
-}
-
-/*
- * returns
- * > 0: delay (in msecs) that should pass before actually sending
- * = 0: can send immediately
- * < 0: error condition; do not send packet
- */
-static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
-{
- struct dccp_sock *dp = dccp_sk(sk);
- struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
- struct dccp_tx_hist_entry *new_packet;
- struct timeval now;
- long delay;
-
- BUG_ON(hctx == NULL);
-
- /*
- * This function is called only for Data and DataAck packets. Sending
- * zero-sized Data(Ack)s is theoretically possible, but for congestion
- * control this case is pathological - ignore it.
- */
- if (unlikely(skb->len == 0))
- return -EBADMSG;
-
- /* See if last packet allocated was not sent */
- new_packet = dccp_tx_hist_head(&hctx->ccid3hctx_hist);
- if (new_packet == NULL || new_packet->dccphtx_sent) {
- new_packet = dccp_tx_hist_entry_new(ccid3_tx_hist,
- SLAB_ATOMIC);
-
- if (unlikely(new_packet == NULL)) {
- DCCP_WARN("%s, sk=%p, not enough mem to add to history,"
- "send refused\n", dccp_role(sk), sk);
- return -ENOBUFS;
- }
-
- dccp_tx_hist_add_entry(&hctx->ccid3hctx_hist, new_packet);
- }
-
- dccp_timestamp(sk, &now);
-
- switch (hctx->ccid3hctx_state) {
- case TFRC_SSTATE_NO_SENT:
- sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer,
- jiffies + usecs_to_jiffies(TFRC_INITIAL_TIMEOUT));
- hctx->ccid3hctx_last_win_count = 0;
- hctx->ccid3hctx_t_last_win_count = now;
- ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK);
-
- /* Set initial sending rate to 1 packet per second */
- ccid3_hc_tx_update_s(hctx, skb->len);
- hctx->ccid3hctx_x = hctx->ccid3hctx_s;
-
- /* First timeout, according to [RFC 3448, 4.2], is 1 second */
- hctx->ccid3hctx_t_ipi = USEC_PER_SEC;
- /* Initial delta: minimum of 0.5 sec and t_gran/2 */
- hctx->ccid3hctx_delta = TFRC_OPSYS_HALF_TIME_GRAN;
-
- /* Set t_0 for initial packet */
- hctx->ccid3hctx_t_nom = now;
- break;
- case TFRC_SSTATE_NO_FBACK:
- case TFRC_SSTATE_FBACK:
- delay = timeval_delta(&hctx->ccid3hctx_t_nom, &now);
- /*
- * Scheduling of packet transmissions [RFC 3448, 4.6]
- *
- * if (t_now > t_nom - delta)
- * // send the packet now
- * else
- * // send the packet in (t_nom - t_now) milliseconds.
- */
- if (delay >= hctx->ccid3hctx_delta)
- return delay / 1000L;
- break;
- case TFRC_SSTATE_TERM:
- DCCP_BUG("Illegal %s state TERM, sk=%p", dccp_role(sk), sk);
- return -EINVAL;
- }
-
- /* prepare to send now (add options etc.) */
- dp->dccps_hc_tx_insert_options = 1;
- new_packet->dccphtx_ccval = DCCP_SKB_CB(skb)->dccpd_ccval =
- hctx->ccid3hctx_last_win_count;
- timeval_add_usecs(&hctx->ccid3hctx_t_nom, hctx->ccid3hctx_t_ipi);
-
- return 0;
-}
-
-static void ccid3_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len)
-{
- const struct dccp_sock *dp = dccp_sk(sk);
- struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
- struct timeval now;
- unsigned long quarter_rtt;
- struct dccp_tx_hist_entry *packet;
-
- BUG_ON(hctx == NULL);
-
- dccp_timestamp(sk, &now);
-
- ccid3_hc_tx_update_s(hctx, len);
-
- packet = dccp_tx_hist_head(&hctx->ccid3hctx_hist);
- if (unlikely(packet == NULL)) {
- DCCP_WARN("packet doesn't exist in history!\n");
- return;
- }
- if (unlikely(packet->dccphtx_sent)) {
- DCCP_WARN("no unsent packet in history!\n");
- return;
- }
- packet->dccphtx_tstamp = now;
- packet->dccphtx_seqno = dp->dccps_gss;
- /*
- * Check if win_count have changed
- * Algorithm in "8.1. Window Counter Value" in RFC 4342.
- */
- quarter_rtt = timeval_delta(&now, &hctx->ccid3hctx_t_last_win_count);
- if (likely(hctx->ccid3hctx_rtt > 8))
- quarter_rtt /= hctx->ccid3hctx_rtt / 4;
-
- if (quarter_rtt > 0) {
- hctx->ccid3hctx_t_last_win_count = now;
- hctx->ccid3hctx_last_win_count = (hctx->ccid3hctx_last_win_count +
- min_t(unsigned long, quarter_rtt, 5)) % 16;
- ccid3_pr_debug("%s, sk=%p, window changed from "
- "%u to %u!\n",
- dccp_role(sk), sk,
- packet->dccphtx_ccval,
- hctx->ccid3hctx_last_win_count);
- }
-
- hctx->ccid3hctx_idle = 0;
- packet->dccphtx_rtt = hctx->ccid3hctx_rtt;
- packet->dccphtx_sent = 1;
-}
-
-static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
-{
- const struct dccp_sock *dp = dccp_sk(sk);
- struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
- struct ccid3_options_received *opt_recv;
- struct dccp_tx_hist_entry *packet;
- struct timeval now;
- unsigned long t_nfb;
- u32 t_elapsed;
- u32 pinv;
- u32 x_recv;
- u32 r_sample;
-
- BUG_ON(hctx == NULL);
-
- /* we are only interested in ACKs */
- if (!(DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK ||
- DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_DATAACK))
- return;
-
- opt_recv = &hctx->ccid3hctx_options_received;
-
- t_elapsed = dp->dccps_options_received.dccpor_elapsed_time * 10;
- x_recv = opt_recv->ccid3or_receive_rate;
- pinv = opt_recv->ccid3or_loss_event_rate;
-
- switch (hctx->ccid3hctx_state) {
- case TFRC_SSTATE_NO_FBACK:
- case TFRC_SSTATE_FBACK:
- /* Calculate new round trip sample by
- * R_sample = (now - t_recvdata) - t_delay */
- /* get t_recvdata from history */
- packet = dccp_tx_hist_find_entry(&hctx->ccid3hctx_hist,
- DCCP_SKB_CB(skb)->dccpd_ack_seq);
- if (unlikely(packet == NULL)) {
- DCCP_WARN("%s, sk=%p, seqno %llu(%s) does't exist "
- "in history!\n", dccp_role(sk), sk,
- (unsigned long long)DCCP_SKB_CB(skb)->dccpd_ack_seq,
- dccp_packet_name(DCCP_SKB_CB(skb)->dccpd_type));
- return;
- }
-
- /* Update RTT */
- dccp_timestamp(sk, &now);
- r_sample = timeval_delta(&now, &packet->dccphtx_tstamp);
- if (unlikely(r_sample <= t_elapsed))
- DCCP_WARN("r_sample=%uus,t_elapsed=%uus\n",
- r_sample, t_elapsed);
- else
- r_sample -= t_elapsed;
-
- /* Update RTT estimate by
- * If (No feedback recv)
- * R = R_sample;
- * Else
- * R = q * R + (1 - q) * R_sample;
- *
- * q is a constant, RFC 3448 recomments 0.9
- */
- if (hctx->ccid3hctx_state == TFRC_SSTATE_NO_FBACK) {
- /* Use Larger Initial Windows [RFC 4342, sec. 5]
- * We deviate in that we use `s' instead of `MSS'. */
- u16 w_init = max( 4 * hctx->ccid3hctx_s,
- max(2 * hctx->ccid3hctx_s, 4380));
- hctx->ccid3hctx_rtt = r_sample;
- hctx->ccid3hctx_x = usecs_div(w_init, r_sample);
- hctx->ccid3hctx_t_ld = now;
-
- ccid3_update_send_time(hctx);
- ccid3_hc_tx_set_state(sk, TFRC_SSTATE_FBACK);
- } else {
- hctx->ccid3hctx_rtt = (hctx->ccid3hctx_rtt * 9) / 10 +
- r_sample / 10;
- ccid3_hc_tx_update_x(sk, &now);
- }
-
- ccid3_pr_debug("%s, sk=%p, New RTT estimate=%uus, "
- "r_sample=%us\n", dccp_role(sk), sk,
- hctx->ccid3hctx_rtt, r_sample);
-
- /* Update receive rate */
- hctx->ccid3hctx_x_recv = x_recv;/* X_recv in bytes per sec */
-
- /* Update loss event rate */
- if (pinv == ~0 || pinv == 0)
- hctx->ccid3hctx_p = 0;
- else {
- hctx->ccid3hctx_p = 1000000 / pinv;
-
- if (hctx->ccid3hctx_p < TFRC_SMALLEST_P) {
- hctx->ccid3hctx_p = TFRC_SMALLEST_P;
- ccid3_pr_debug("%s, sk=%p, Smallest p used!\n",
- dccp_role(sk), sk);
- }
- }
-
- /* unschedule no feedback timer */
- sk_stop_timer(sk, &hctx->ccid3hctx_no_feedback_timer);
-
- /* remove all packets older than the one acked from history */
- dccp_tx_hist_purge_older(ccid3_tx_hist,
- &hctx->ccid3hctx_hist, packet);
- /*
- * As we have calculated new ipi, delta, t_nom it is possible that
- * we now can send a packet, so wake up dccp_wait_for_ccid
- */
- sk->sk_write_space(sk);
-
- /* Update timeout interval. We use the alternative variant of
- * [RFC 3448, 3.1] which sets the upper bound of t_rto to one
- * second, as it is suggested for TCP (see RFC 2988, 2.4). */
- hctx->ccid3hctx_t_rto = max_t(u32, 4 * hctx->ccid3hctx_rtt,
- USEC_PER_SEC );
- /*
- * Schedule no feedback timer to expire in
- * max(4 * R, 2 * s/X) = max(4 * R, 2 * t_ipi)
- */
- t_nfb = max(4 * hctx->ccid3hctx_rtt, 2 * hctx->ccid3hctx_t_ipi);
-
- ccid3_pr_debug("%s, sk=%p, Scheduled no feedback timer to "
- "expire in %lu jiffies (%luus)\n",
- dccp_role(sk), sk,
- usecs_to_jiffies(t_nfb), t_nfb);
-
- sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer,
- jiffies + usecs_to_jiffies(t_nfb));
-
- /* set idle flag */
- hctx->ccid3hctx_idle = 1;
- break;
- case TFRC_SSTATE_NO_SENT:
- DCCP_WARN("Illegal ACK received - no packet has been sent\n");
- /* fall through */
- case TFRC_SSTATE_TERM: /* ignore feedback when closing */
- break;
- }
-}
-
-static int ccid3_hc_tx_insert_options(struct sock *sk, struct sk_buff *skb)
-{
- const struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
-
- BUG_ON(hctx == NULL);
-
- if (sk->sk_state == DCCP_OPEN || sk->sk_state == DCCP_PARTOPEN)
- DCCP_SKB_CB(skb)->dccpd_ccval = hctx->ccid3hctx_last_win_count;
- return 0;
-}
-
-static int ccid3_hc_tx_parse_options(struct sock *sk, unsigned char option,
- unsigned char len, u16 idx,
- unsigned char *value)
-{
- int rc = 0;
- const struct dccp_sock *dp = dccp_sk(sk);
- struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
- struct ccid3_options_received *opt_recv;
-
- BUG_ON(hctx == NULL);
-
- opt_recv = &hctx->ccid3hctx_options_received;
-
- if (opt_recv->ccid3or_seqno != dp->dccps_gsr) {
- opt_recv->ccid3or_seqno = dp->dccps_gsr;
- opt_recv->ccid3or_loss_event_rate = ~0;
- opt_recv->ccid3or_loss_intervals_idx = 0;
- opt_recv->ccid3or_loss_intervals_len = 0;
- opt_recv->ccid3or_receive_rate = 0;
- }
-
- switch (option) {
- case TFRC_OPT_LOSS_EVENT_RATE:
- if (unlikely(len != 4)) {
- DCCP_WARN("%s, sk=%p, invalid len %d "
- "for TFRC_OPT_LOSS_EVENT_RATE\n",
- dccp_role(sk), sk, len);
- rc = -EINVAL;
- } else {
- opt_recv->ccid3or_loss_event_rate = ntohl(*(__be32 *)value);
- ccid3_pr_debug("%s, sk=%p, LOSS_EVENT_RATE=%u\n",
- dccp_role(sk), sk,
- opt_recv->ccid3or_loss_event_rate);
- }
- break;
- case TFRC_OPT_LOSS_INTERVALS:
- opt_recv->ccid3or_loss_intervals_idx = idx;
- opt_recv->ccid3or_loss_intervals_len = len;
- ccid3_pr_debug("%s, sk=%p, LOSS_INTERVALS=(%u, %u)\n",
- dccp_role(sk), sk,
- opt_recv->ccid3or_loss_intervals_idx,
- opt_recv->ccid3or_loss_intervals_len);
- break;
- case TFRC_OPT_RECEIVE_RATE:
- if (unlikely(len != 4)) {
- DCCP_WARN("%s, sk=%p, invalid len %d "
- "for TFRC_OPT_RECEIVE_RATE\n",
- dccp_role(sk), sk, len);
- rc = -EINVAL;
- } else {
- opt_recv->ccid3or_receive_rate = ntohl(*(__be32 *)value);
- ccid3_pr_debug("%s, sk=%p, RECEIVE_RATE=%u\n",
- dccp_role(sk), sk,
- opt_recv->ccid3or_receive_rate);
- }
- break;
- }
-
- return rc;
-}
-
-static int ccid3_hc_tx_init(struct ccid *ccid, struct sock *sk)
-{
- struct ccid3_hc_tx_sock *hctx = ccid_priv(ccid);
-
- hctx->ccid3hctx_s = 0;
- hctx->ccid3hctx_state = TFRC_SSTATE_NO_SENT;
- INIT_LIST_HEAD(&hctx->ccid3hctx_hist);
-
- hctx->ccid3hctx_no_feedback_timer.function = ccid3_hc_tx_no_feedback_timer;
- hctx->ccid3hctx_no_feedback_timer.data = (unsigned long)sk;
- init_timer(&hctx->ccid3hctx_no_feedback_timer);
-
- return 0;
-}
-
-static void ccid3_hc_tx_exit(struct sock *sk)
-{
- struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
-
- BUG_ON(hctx == NULL);
-
- ccid3_hc_tx_set_state(sk, TFRC_SSTATE_TERM);
- sk_stop_timer(sk, &hctx->ccid3hctx_no_feedback_timer);
-
- /* Empty packet history */
- dccp_tx_hist_purge(ccid3_tx_hist, &hctx->ccid3hctx_hist);
-}
-
-/*
- * RX Half Connection methods
- */
-
-#ifdef CONFIG_IP_DCCP_CCID3_DEBUG
-static const char *ccid3_rx_state_name(enum ccid3_hc_rx_states state)
-{
- static char *ccid3_rx_state_names[] = {
- [TFRC_RSTATE_NO_DATA] = "NO_DATA",
- [TFRC_RSTATE_DATA] = "DATA",
- [TFRC_RSTATE_TERM] = "TERM",
- };
-
- return ccid3_rx_state_names[state];
-}
-#endif
-
-static void ccid3_hc_rx_set_state(struct sock *sk,
- enum ccid3_hc_rx_states state)
-{
- struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
- enum ccid3_hc_rx_states oldstate = hcrx->ccid3hcrx_state;
-
- ccid3_pr_debug("%s(%p) %-8.8s -> %s\n",
- dccp_role(sk), sk, ccid3_rx_state_name(oldstate),
- ccid3_rx_state_name(state));
- WARN_ON(state == oldstate);
- hcrx->ccid3hcrx_state = state;
-}
-
-static inline void ccid3_hc_rx_update_s(struct ccid3_hc_rx_sock *hcrx, int len)
-{
- if (unlikely(len == 0)) /* don't update on empty packets (e.g. ACKs) */
- ccid3_pr_debug("Packet payload length is 0 - not updating\n");
- else
- hcrx->ccid3hcrx_s = hcrx->ccid3hcrx_s == 0 ? len :
- (9 * hcrx->ccid3hcrx_s + len) / 10;
-}
-
-static void ccid3_hc_rx_send_feedback(struct sock *sk)
-{
- struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
- struct dccp_sock *dp = dccp_sk(sk);
- struct dccp_rx_hist_entry *packet;
- struct timeval now;
-
- ccid3_pr_debug("%s, sk=%p\n", dccp_role(sk), sk);
-
- dccp_timestamp(sk, &now);
-
- switch (hcrx->ccid3hcrx_state) {
- case TFRC_RSTATE_NO_DATA:
- hcrx->ccid3hcrx_x_recv = 0;
- break;
- case TFRC_RSTATE_DATA: {
- const u32 delta = timeval_delta(&now,
- &hcrx->ccid3hcrx_tstamp_last_feedback);
- hcrx->ccid3hcrx_x_recv = usecs_div(hcrx->ccid3hcrx_bytes_recv,
- delta);
- }
- break;
- case TFRC_RSTATE_TERM:
- DCCP_BUG("Illegal %s state TERM, sk=%p", dccp_role(sk), sk);
- return;
- }
-
- packet = dccp_rx_hist_find_data_packet(&hcrx->ccid3hcrx_hist);
- if (unlikely(packet == NULL)) {
- DCCP_WARN("%s, sk=%p, no data packet in history!\n",
- dccp_role(sk), sk);
- return;
- }
-
- hcrx->ccid3hcrx_tstamp_last_feedback = now;
- hcrx->ccid3hcrx_ccval_last_counter = packet->dccphrx_ccval;
- hcrx->ccid3hcrx_bytes_recv = 0;
-
- /* Convert to multiples of 10us */
- hcrx->ccid3hcrx_elapsed_time =
- timeval_delta(&now, &packet->dccphrx_tstamp) / 10;
- if (hcrx->ccid3hcrx_p == 0)
- hcrx->ccid3hcrx_pinv = ~0;
- else
- hcrx->ccid3hcrx_pinv = 1000000 / hcrx->ccid3hcrx_p;
- dp->dccps_hc_rx_insert_options = 1;
- dccp_send_ack(sk);
-}
-
-static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb)
-{
- const struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
- __be32 x_recv, pinv;
-
- BUG_ON(hcrx == NULL);
-
- if (!(sk->sk_state == DCCP_OPEN || sk->sk_state == DCCP_PARTOPEN))
- return 0;
-
- DCCP_SKB_CB(skb)->dccpd_ccval = hcrx->ccid3hcrx_ccval_last_counter;
-
- if (dccp_packet_without_ack(skb))
- return 0;
-
- x_recv = htonl(hcrx->ccid3hcrx_x_recv);
- pinv = htonl(hcrx->ccid3hcrx_pinv);
-
- if ((hcrx->ccid3hcrx_elapsed_time != 0 &&
- dccp_insert_option_elapsed_time(sk, skb,
- hcrx->ccid3hcrx_elapsed_time)) ||
- dccp_insert_option_timestamp(sk, skb) ||
- dccp_insert_option(sk, skb, TFRC_OPT_LOSS_EVENT_RATE,
- &pinv, sizeof(pinv)) ||
- dccp_insert_option(sk, skb, TFRC_OPT_RECEIVE_RATE,
- &x_recv, sizeof(x_recv)))
- return -1;
-
- return 0;
-}
-
-/* calculate first loss interval
- *
- * returns estimated loss interval in usecs */
-
-static u32 ccid3_hc_rx_calc_first_li(struct sock *sk)
-{
- struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
- struct dccp_rx_hist_entry *entry, *next, *tail = NULL;
- u32 rtt, delta, x_recv, fval, p, tmp2;
- struct timeval tstamp = { 0, };
- int interval = 0;
- int win_count = 0;
- int step = 0;
- u64 tmp1;
-
- list_for_each_entry_safe(entry, next, &hcrx->ccid3hcrx_hist,
- dccphrx_node) {
- if (dccp_rx_hist_entry_data_packet(entry)) {
- tail = entry;
-
- switch (step) {
- case 0:
- tstamp = entry->dccphrx_tstamp;
- win_count = entry->dccphrx_ccval;
- step = 1;
- break;
- case 1:
- interval = win_count - entry->dccphrx_ccval;
- if (interval < 0)
- interval += TFRC_WIN_COUNT_LIMIT;
- if (interval > 4)
- goto found;
- break;
- }
- }
- }
-
- if (unlikely(step == 0)) {
- DCCP_WARN("%s, sk=%p, packet history has no data packets!\n",
- dccp_role(sk), sk);
- return ~0;
- }
-
- if (unlikely(interval == 0)) {
- DCCP_WARN("%s, sk=%p, Could not find a win_count interval > 0."
- "Defaulting to 1\n", dccp_role(sk), sk);
- interval = 1;
- }
-found:
- if (!tail) {
- DCCP_CRIT("tail is null\n");
- return ~0;
- }
- rtt = timeval_delta(&tstamp, &tail->dccphrx_tstamp) * 4 / interval;
- ccid3_pr_debug("%s, sk=%p, approximated RTT to %uus\n",
- dccp_role(sk), sk, rtt);
-
- if (rtt == 0) {
- DCCP_WARN("RTT==0, setting to 1\n");
- rtt = 1;
- }
-
- dccp_timestamp(sk, &tstamp);
- delta = timeval_delta(&tstamp, &hcrx->ccid3hcrx_tstamp_last_feedback);
- x_recv = usecs_div(hcrx->ccid3hcrx_bytes_recv, delta);
-
- if (x_recv == 0)
- x_recv = hcrx->ccid3hcrx_x_recv;
-
- tmp1 = (u64)x_recv * (u64)rtt;
- do_div(tmp1,10000000);
- tmp2 = (u32)tmp1;
-
- if (!tmp2) {
- DCCP_CRIT("tmp2 = 0, x_recv = %u, rtt =%u\n", x_recv, rtt);
- return ~0;
- }
-
- fval = (hcrx->ccid3hcrx_s * 100000) / tmp2;
- /* do not alter order above or you will get overflow on 32 bit */
- p = tfrc_calc_x_reverse_lookup(fval);
- ccid3_pr_debug("%s, sk=%p, receive rate=%u bytes/s, implied "
- "loss rate=%u\n", dccp_role(sk), sk, x_recv, p);
-
- if (p == 0)
- return ~0;
- else
- return 1000000 / p;
-}
-
-static void ccid3_hc_rx_update_li(struct sock *sk, u64 seq_loss, u8 win_loss)
-{
- struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
- struct dccp_li_hist_entry *head;
- u64 seq_temp;
-
- if (list_empty(&hcrx->ccid3hcrx_li_hist)) {
- if (!dccp_li_hist_interval_new(ccid3_li_hist,
- &hcrx->ccid3hcrx_li_hist, seq_loss, win_loss))
- return;
-
- head = list_entry(hcrx->ccid3hcrx_li_hist.next,
- struct dccp_li_hist_entry, dccplih_node);
- head->dccplih_interval = ccid3_hc_rx_calc_first_li(sk);
- } else {
- struct dccp_li_hist_entry *entry;
- struct list_head *tail;
-
- head = list_entry(hcrx->ccid3hcrx_li_hist.next,
- struct dccp_li_hist_entry, dccplih_node);
- /* FIXME win count check removed as was wrong */
- /* should make this check with receive history */
- /* and compare there as per section 10.2 of RFC4342 */
-
- /* new loss event detected */
- /* calculate last interval length */
- seq_temp = dccp_delta_seqno(head->dccplih_seqno, seq_loss);
- entry = dccp_li_hist_entry_new(ccid3_li_hist, SLAB_ATOMIC);
-
- if (entry == NULL) {
- DCCP_BUG("out of memory - can not allocate entry");
- return;
- }
-
- list_add(&entry->dccplih_node, &hcrx->ccid3hcrx_li_hist);
-
- tail = hcrx->ccid3hcrx_li_hist.prev;
- list_del(tail);
- kmem_cache_free(ccid3_li_hist->dccplih_slab, tail);
-
- /* Create the newest interval */
- entry->dccplih_seqno = seq_loss;
- entry->dccplih_interval = seq_temp;
- entry->dccplih_win_count = win_loss;
- }
-}
-
-static int ccid3_hc_rx_detect_loss(struct sock *sk,
- struct dccp_rx_hist_entry *packet)
-{
- struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
- struct dccp_rx_hist_entry *rx_hist = dccp_rx_hist_head(&hcrx->ccid3hcrx_hist);
- u64 seqno = packet->dccphrx_seqno;
- u64 tmp_seqno;
- int loss = 0;
- u8 ccval;
-
-
- tmp_seqno = hcrx->ccid3hcrx_seqno_nonloss;
-
- if (!rx_hist ||
- follows48(packet->dccphrx_seqno, hcrx->ccid3hcrx_seqno_nonloss)) {
- hcrx->ccid3hcrx_seqno_nonloss = seqno;
- hcrx->ccid3hcrx_ccval_nonloss = packet->dccphrx_ccval;
- goto detect_out;
- }
-
-
- while (dccp_delta_seqno(hcrx->ccid3hcrx_seqno_nonloss, seqno)
- > TFRC_RECV_NUM_LATE_LOSS) {
- loss = 1;
- ccid3_hc_rx_update_li(sk, hcrx->ccid3hcrx_seqno_nonloss,
- hcrx->ccid3hcrx_ccval_nonloss);
- tmp_seqno = hcrx->ccid3hcrx_seqno_nonloss;
- dccp_inc_seqno(&tmp_seqno);
- hcrx->ccid3hcrx_seqno_nonloss = tmp_seqno;
- dccp_inc_seqno(&tmp_seqno);
- while (dccp_rx_hist_find_entry(&hcrx->ccid3hcrx_hist,
- tmp_seqno, &ccval)) {
- hcrx->ccid3hcrx_seqno_nonloss = tmp_seqno;
- hcrx->ccid3hcrx_ccval_nonloss = ccval;
- dccp_inc_seqno(&tmp_seqno);
- }
- }
-
- /* FIXME - this code could be simplified with above while */
- /* but works at moment */
- if (follows48(packet->dccphrx_seqno, hcrx->ccid3hcrx_seqno_nonloss)) {
- hcrx->ccid3hcrx_seqno_nonloss = seqno;
- hcrx->ccid3hcrx_ccval_nonloss = packet->dccphrx_ccval;
- }
-
-detect_out:
- dccp_rx_hist_add_packet(ccid3_rx_hist, &hcrx->ccid3hcrx_hist,
- &hcrx->ccid3hcrx_li_hist, packet,
- hcrx->ccid3hcrx_seqno_nonloss);
- return loss;
-}
-
-static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
-{
- struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
- const struct dccp_options_received *opt_recv;
- struct dccp_rx_hist_entry *packet;
- struct timeval now;
- u32 p_prev, rtt_prev, r_sample, t_elapsed;
- int loss, payload_size;
-
- BUG_ON(hcrx == NULL);
-
- opt_recv = &dccp_sk(sk)->dccps_options_received;
-
- switch (DCCP_SKB_CB(skb)->dccpd_type) {
- case DCCP_PKT_ACK:
- if (hcrx->ccid3hcrx_state == TFRC_RSTATE_NO_DATA)
- return;
- case DCCP_PKT_DATAACK:
- if (opt_recv->dccpor_timestamp_echo == 0)
- break;
- rtt_prev = hcrx->ccid3hcrx_rtt;
- dccp_timestamp(sk, &now);
- timeval_sub_usecs(&now, opt_recv->dccpor_timestamp_echo * 10);
- r_sample = timeval_usecs(&now);
- t_elapsed = opt_recv->dccpor_elapsed_time * 10;
-
- if (unlikely(r_sample <= t_elapsed))
- DCCP_WARN("r_sample=%uus, t_elapsed=%uus\n",
- r_sample, t_elapsed);
- else
- r_sample -= t_elapsed;
-
- if (hcrx->ccid3hcrx_state == TFRC_RSTATE_NO_DATA)
- hcrx->ccid3hcrx_rtt = r_sample;
- else
- hcrx->ccid3hcrx_rtt = (hcrx->ccid3hcrx_rtt * 9) / 10 +
- r_sample / 10;
-
- if (rtt_prev != hcrx->ccid3hcrx_rtt)
- ccid3_pr_debug("%s, New RTT=%uus, elapsed time=%u\n",
- dccp_role(sk), hcrx->ccid3hcrx_rtt,
- opt_recv->dccpor_elapsed_time);
- break;
- case DCCP_PKT_DATA:
- break;
- default: /* We're not interested in other packet types, move along */
- return;
- }
-
- packet = dccp_rx_hist_entry_new(ccid3_rx_hist, sk, opt_recv->dccpor_ndp,
- skb, SLAB_ATOMIC);
- if (unlikely(packet == NULL)) {
- DCCP_WARN("%s, sk=%p, Not enough mem to add rx packet "
- "to history, consider it lost!\n", dccp_role(sk), sk);
- return;
- }
-
- loss = ccid3_hc_rx_detect_loss(sk, packet);
-
- if (DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK)
- return;
-
- payload_size = skb->len - dccp_hdr(skb)->dccph_doff * 4;
- ccid3_hc_rx_update_s(hcrx, payload_size);
-
- switch (hcrx->ccid3hcrx_state) {
- case TFRC_RSTATE_NO_DATA:
- ccid3_pr_debug("%s, sk=%p(%s), skb=%p, sending initial "
- "feedback\n",
- dccp_role(sk), sk,
- dccp_state_name(sk->sk_state), skb);
- ccid3_hc_rx_send_feedback(sk);
- ccid3_hc_rx_set_state(sk, TFRC_RSTATE_DATA);
- return;
- case TFRC_RSTATE_DATA:
- hcrx->ccid3hcrx_bytes_recv += payload_size;
- if (loss)
- break;
-
- dccp_timestamp(sk, &now);
- if (timeval_delta(&now, &hcrx->ccid3hcrx_tstamp_last_ack) >=
- hcrx->ccid3hcrx_rtt) {
- hcrx->ccid3hcrx_tstamp_last_ack = now;
- ccid3_hc_rx_send_feedback(sk);
- }
- return;
- case TFRC_RSTATE_TERM:
- DCCP_BUG("Illegal %s state TERM, sk=%p", dccp_role(sk), sk);
- return;
- }
-
- /* Dealing with packet loss */
- ccid3_pr_debug("%s, sk=%p(%s), data loss! Reacting...\n",
- dccp_role(sk), sk, dccp_state_name(sk->sk_state));
-
- p_prev = hcrx->ccid3hcrx_p;
-
- /* Calculate loss event rate */
- if (!list_empty(&hcrx->ccid3hcrx_li_hist)) {
- u32 i_mean = dccp_li_hist_calc_i_mean(&hcrx->ccid3hcrx_li_hist);
-
- /* Scaling up by 1000000 as fixed decimal */
- if (i_mean != 0)
- hcrx->ccid3hcrx_p = 1000000 / i_mean;
- } else
- DCCP_BUG("empty loss history");
-
- if (hcrx->ccid3hcrx_p > p_prev) {
- ccid3_hc_rx_send_feedback(sk);
- return;
- }
-}
-
-static int ccid3_hc_rx_init(struct ccid *ccid, struct sock *sk)
-{
- struct ccid3_hc_rx_sock *hcrx = ccid_priv(ccid);
-
- ccid3_pr_debug("%s, sk=%p\n", dccp_role(sk), sk);
-
- hcrx->ccid3hcrx_state = TFRC_RSTATE_NO_DATA;
- INIT_LIST_HEAD(&hcrx->ccid3hcrx_hist);
- INIT_LIST_HEAD(&hcrx->ccid3hcrx_li_hist);
- dccp_timestamp(sk, &hcrx->ccid3hcrx_tstamp_last_ack);
- hcrx->ccid3hcrx_tstamp_last_feedback = hcrx->ccid3hcrx_tstamp_last_ack;
- hcrx->ccid3hcrx_s = 0;
- hcrx->ccid3hcrx_rtt = 5000; /* XXX 5ms for now... */
- return 0;
-}
-
-static void ccid3_hc_rx_exit(struct sock *sk)
-{
- struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
-
- BUG_ON(hcrx == NULL);
-
- ccid3_hc_rx_set_state(sk, TFRC_RSTATE_TERM);
-
- /* Empty packet history */
- dccp_rx_hist_purge(ccid3_rx_hist, &hcrx->ccid3hcrx_hist);
-
- /* Empty loss interval history */
- dccp_li_hist_purge(ccid3_li_hist, &hcrx->ccid3hcrx_li_hist);
-}
-
-static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info)
-{
- const struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
-
- /* Listen socks doesn't have a private CCID block */
- if (sk->sk_state == DCCP_LISTEN)
- return;
-
- BUG_ON(hcrx == NULL);
-
- info->tcpi_ca_state = hcrx->ccid3hcrx_state;
- info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
- info->tcpi_rcv_rtt = hcrx->ccid3hcrx_rtt;
-}
-
-static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info)
-{
- const struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
-
- /* Listen socks doesn't have a private CCID block */
- if (sk->sk_state == DCCP_LISTEN)
- return;
-
- BUG_ON(hctx == NULL);
-
- info->tcpi_rto = hctx->ccid3hctx_t_rto;
- info->tcpi_rtt = hctx->ccid3hctx_rtt;
-}
-
-static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len,
- u32 __user *optval, int __user *optlen)
-{
- const struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
- const void *val;
-
- /* Listen socks doesn't have a private CCID block */
- if (sk->sk_state == DCCP_LISTEN)
- return -EINVAL;
-
- switch (optname) {
- case DCCP_SOCKOPT_CCID_RX_INFO:
- if (len < sizeof(hcrx->ccid3hcrx_tfrc))
- return -EINVAL;
- len = sizeof(hcrx->ccid3hcrx_tfrc);
- val = &hcrx->ccid3hcrx_tfrc;
- break;
- default:
- return -ENOPROTOOPT;
- }
-
- if (put_user(len, optlen) || copy_to_user(optval, val, len))
- return -EFAULT;
-
- return 0;
-}
-
-static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len,
- u32 __user *optval, int __user *optlen)
-{
- const struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
- const void *val;
-
- /* Listen socks doesn't have a private CCID block */
- if (sk->sk_state == DCCP_LISTEN)
- return -EINVAL;
-
- switch (optname) {
- case DCCP_SOCKOPT_CCID_TX_INFO:
- if (len < sizeof(hctx->ccid3hctx_tfrc))
- return -EINVAL;
- len = sizeof(hctx->ccid3hctx_tfrc);
- val = &hctx->ccid3hctx_tfrc;
- break;
- default:
- return -ENOPROTOOPT;
- }
-
- if (put_user(len, optlen) || copy_to_user(optval, val, len))
- return -EFAULT;
-
- return 0;
-}
-
-static struct ccid_operations ccid3 = {
- .ccid_id = DCCPC_CCID3,
- .ccid_name = "ccid3",
- .ccid_owner = THIS_MODULE,
- .ccid_hc_tx_obj_size = sizeof(struct ccid3_hc_tx_sock),
- .ccid_hc_tx_init = ccid3_hc_tx_init,
- .ccid_hc_tx_exit = ccid3_hc_tx_exit,
- .ccid_hc_tx_send_packet = ccid3_hc_tx_send_packet,
- .ccid_hc_tx_packet_sent = ccid3_hc_tx_packet_sent,
- .ccid_hc_tx_packet_recv = ccid3_hc_tx_packet_recv,
- .ccid_hc_tx_insert_options = ccid3_hc_tx_insert_options,
- .ccid_hc_tx_parse_options = ccid3_hc_tx_parse_options,
- .ccid_hc_rx_obj_size = sizeof(struct ccid3_hc_rx_sock),
- .ccid_hc_rx_init = ccid3_hc_rx_init,
- .ccid_hc_rx_exit = ccid3_hc_rx_exit,
- .ccid_hc_rx_insert_options = ccid3_hc_rx_insert_options,
- .ccid_hc_rx_packet_recv = ccid3_hc_rx_packet_recv,
- .ccid_hc_rx_get_info = ccid3_hc_rx_get_info,
- .ccid_hc_tx_get_info = ccid3_hc_tx_get_info,
- .ccid_hc_rx_getsockopt = ccid3_hc_rx_getsockopt,
- .ccid_hc_tx_getsockopt = ccid3_hc_tx_getsockopt,
-};
-
-#ifdef CONFIG_IP_DCCP_CCID3_DEBUG
-module_param(ccid3_debug, int, 0444);
-MODULE_PARM_DESC(ccid3_debug, "Enable debug messages");
-#endif
-
-static __init int ccid3_module_init(void)
-{
- int rc = -ENOBUFS;
-
- ccid3_rx_hist = dccp_rx_hist_new("ccid3");
- if (ccid3_rx_hist == NULL)
- goto out;
-
- ccid3_tx_hist = dccp_tx_hist_new("ccid3");
- if (ccid3_tx_hist == NULL)
- goto out_free_rx;
-
- ccid3_li_hist = dccp_li_hist_new("ccid3");
- if (ccid3_li_hist == NULL)
- goto out_free_tx;
-
- rc = ccid_register(&ccid3);
- if (rc != 0)
- goto out_free_loss_interval_history;
-out:
- return rc;
-
-out_free_loss_interval_history:
- dccp_li_hist_delete(ccid3_li_hist);
- ccid3_li_hist = NULL;
-out_free_tx:
- dccp_tx_hist_delete(ccid3_tx_hist);
- ccid3_tx_hist = NULL;
-out_free_rx:
- dccp_rx_hist_delete(ccid3_rx_hist);
- ccid3_rx_hist = NULL;
- goto out;
-}
-module_init(ccid3_module_init);
-
-static __exit void ccid3_module_exit(void)
-{
- ccid_unregister(&ccid3);
-
- if (ccid3_tx_hist != NULL) {
- dccp_tx_hist_delete(ccid3_tx_hist);
- ccid3_tx_hist = NULL;
- }
- if (ccid3_rx_hist != NULL) {
- dccp_rx_hist_delete(ccid3_rx_hist);
- ccid3_rx_hist = NULL;
- }
- if (ccid3_li_hist != NULL) {
- dccp_li_hist_delete(ccid3_li_hist);
- ccid3_li_hist = NULL;
- }
-}
-module_exit(ccid3_module_exit);
-
-MODULE_AUTHOR("Ian McDonald <ian.mcdonald@jandi.co.nz>, "
- "Arnaldo Carvalho de Melo <acme@ghostprotocols.net>");
-MODULE_DESCRIPTION("DCCP TFRC CCID3 CCID");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS("net-dccp-ccid-3");
diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h
deleted file mode 100644
index 27cb20ae1da8..000000000000
--- a/net/dccp/ccids/ccid3.h
+++ /dev/null
@@ -1,176 +0,0 @@
-/*
- * net/dccp/ccids/ccid3.h
- *
- * Copyright (c) 2005-6 The University of Waikato, Hamilton, New Zealand.
- *
- * An implementation of the DCCP protocol
- *
- * This code has been developed by the University of Waikato WAND
- * research group. For further information please see http://www.wand.net.nz/
- * or e-mail Ian McDonald - ian.mcdonald@jandi.co.nz
- *
- * This code also uses code from Lulea University, rereleased as GPL by its
- * authors:
- * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
- *
- * Changes to meet Linux coding standards, to make it meet latest ccid3 draft
- * and to make it work as a loadable module in the DCCP stack written by
- * Arnaldo Carvalho de Melo <acme@conectiva.com.br>.
- *
- * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-#ifndef _DCCP_CCID3_H_
-#define _DCCP_CCID3_H_
-
-#include <linux/list.h>
-#include <linux/time.h>
-#include <linux/types.h>
-#include <linux/tfrc.h>
-#include "../ccid.h"
-
-/* Two seconds as per RFC 3448 4.2 */
-#define TFRC_INITIAL_TIMEOUT (2 * USEC_PER_SEC)
-
-/* In usecs - half the scheduling granularity as per RFC3448 4.6 */
-#define TFRC_OPSYS_HALF_TIME_GRAN (USEC_PER_SEC / (2 * HZ))
-
-/* Parameter t_mbi from [RFC 3448, 4.3]: backoff interval in seconds */
-#define TFRC_T_MBI 64
-
-#define TFRC_SMALLEST_P 40
-
-enum ccid3_options {
- TFRC_OPT_LOSS_EVENT_RATE = 192,
- TFRC_OPT_LOSS_INTERVALS = 193,
- TFRC_OPT_RECEIVE_RATE = 194,
-};
-
-struct ccid3_options_received {
- u64 ccid3or_seqno:48,
- ccid3or_loss_intervals_idx:16;
- u16 ccid3or_loss_intervals_len;
- u32 ccid3or_loss_event_rate;
- u32 ccid3or_receive_rate;
-};
-
-/* TFRC sender states */
-enum ccid3_hc_tx_states {
- TFRC_SSTATE_NO_SENT = 1,
- TFRC_SSTATE_NO_FBACK,
- TFRC_SSTATE_FBACK,
- TFRC_SSTATE_TERM,
-};
-
-/** struct ccid3_hc_tx_sock - CCID3 sender half-connection socket
- *
- * @ccid3hctx_x - Current sending rate
- * @ccid3hctx_x_recv - Receive rate
- * @ccid3hctx_x_calc - Calculated send rate (RFC 3448, 3.1)
- * @ccid3hctx_rtt - Estimate of current round trip time in usecs
- * @ccid3hctx_p - Current loss event rate (0-1) scaled by 1000000
- * @ccid3hctx_s - Packet size
- * @ccid3hctx_t_rto - Retransmission Timeout (RFC 3448, 3.1)
- * @ccid3hctx_t_ipi - Interpacket (send) interval (RFC 3448, 4.6)
- * @ccid3hctx_state - Sender state, one of %ccid3_hc_tx_states
- * @ccid3hctx_last_win_count - Last window counter sent
- * @ccid3hctx_t_last_win_count - Timestamp of earliest packet
- * with last_win_count value sent
- * @ccid3hctx_no_feedback_timer - Handle to no feedback timer
- * @ccid3hctx_idle - Flag indicating that sender is idling
- * @ccid3hctx_t_ld - Time last doubled during slow start
- * @ccid3hctx_t_nom - Nominal send time of next packet
- * @ccid3hctx_delta - Send timer delta
- * @ccid3hctx_hist - Packet history
- * @ccid3hctx_options_received - Parsed set of retrieved options
- */
-struct ccid3_hc_tx_sock {
- struct tfrc_tx_info ccid3hctx_tfrc;
-#define ccid3hctx_x ccid3hctx_tfrc.tfrctx_x
-#define ccid3hctx_x_recv ccid3hctx_tfrc.tfrctx_x_recv
-#define ccid3hctx_x_calc ccid3hctx_tfrc.tfrctx_x_calc
-#define ccid3hctx_rtt ccid3hctx_tfrc.tfrctx_rtt
-#define ccid3hctx_p ccid3hctx_tfrc.tfrctx_p
-#define ccid3hctx_t_rto ccid3hctx_tfrc.tfrctx_rto
-#define ccid3hctx_t_ipi ccid3hctx_tfrc.tfrctx_ipi
- u16 ccid3hctx_s;
- enum ccid3_hc_tx_states ccid3hctx_state:8;
- u8 ccid3hctx_last_win_count;
- u8 ccid3hctx_idle;
- struct timeval ccid3hctx_t_last_win_count;
- struct timer_list ccid3hctx_no_feedback_timer;
- struct timeval ccid3hctx_t_ld;
- struct timeval ccid3hctx_t_nom;
- u32 ccid3hctx_delta;
- struct list_head ccid3hctx_hist;
- struct ccid3_options_received ccid3hctx_options_received;
-};
-
-/* TFRC receiver states */
-enum ccid3_hc_rx_states {
- TFRC_RSTATE_NO_DATA = 1,
- TFRC_RSTATE_DATA,
- TFRC_RSTATE_TERM = 127,
-};
-
-/** struct ccid3_hc_rx_sock - CCID3 receiver half-connection socket
- *
- * @ccid3hcrx_x_recv - Receiver estimate of send rate (RFC 3448 4.3)
- * @ccid3hcrx_rtt - Receiver estimate of rtt (non-standard)
- * @ccid3hcrx_p - current loss event rate (RFC 3448 5.4)
- * @ccid3hcrx_seqno_nonloss - Last received non-loss sequence number
- * @ccid3hcrx_ccval_nonloss - Last received non-loss Window CCVal
- * @ccid3hcrx_ccval_last_counter - Tracks window counter (RFC 4342, 8.1)
- * @ccid3hcrx_state - receiver state, one of %ccid3_hc_rx_states
- * @ccid3hcrx_bytes_recv - Total sum of DCCP payload bytes
- * @ccid3hcrx_tstamp_last_feedback - Time at which last feedback was sent
- * @ccid3hcrx_tstamp_last_ack - Time at which last feedback was sent
- * @ccid3hcrx_hist - Packet history
- * @ccid3hcrx_li_hist - Loss Interval History
- * @ccid3hcrx_s - Received packet size in bytes
- * @ccid3hcrx_pinv - Inverse of Loss Event Rate (RFC 4342, sec. 8.5)
- * @ccid3hcrx_elapsed_time - Time since packet reception
- */
-struct ccid3_hc_rx_sock {
- struct tfrc_rx_info ccid3hcrx_tfrc;
-#define ccid3hcrx_x_recv ccid3hcrx_tfrc.tfrcrx_x_recv
-#define ccid3hcrx_rtt ccid3hcrx_tfrc.tfrcrx_rtt
-#define ccid3hcrx_p ccid3hcrx_tfrc.tfrcrx_p
- u64 ccid3hcrx_seqno_nonloss:48,
- ccid3hcrx_ccval_nonloss:4,
- ccid3hcrx_ccval_last_counter:4;
- enum ccid3_hc_rx_states ccid3hcrx_state:8;
- u32 ccid3hcrx_bytes_recv;
- struct timeval ccid3hcrx_tstamp_last_feedback;
- struct timeval ccid3hcrx_tstamp_last_ack;
- struct list_head ccid3hcrx_hist;
- struct list_head ccid3hcrx_li_hist;
- u16 ccid3hcrx_s;
- u32 ccid3hcrx_pinv;
- u32 ccid3hcrx_elapsed_time;
-};
-
-static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk)
-{
- return ccid_priv(dccp_sk(sk)->dccps_hc_tx_ccid);
-}
-
-static inline struct ccid3_hc_rx_sock *ccid3_hc_rx_sk(const struct sock *sk)
-{
- return ccid_priv(dccp_sk(sk)->dccps_hc_rx_ccid);
-}
-
-#endif /* _DCCP_CCID3_H_ */
diff --git a/net/dccp/ccids/lib/Makefile b/net/dccp/ccids/lib/Makefile
deleted file mode 100644
index 5f940a6cbaca..000000000000
--- a/net/dccp/ccids/lib/Makefile
+++ /dev/null
@@ -1,3 +0,0 @@
-obj-$(CONFIG_IP_DCCP_TFRC_LIB) += dccp_tfrc_lib.o
-
-dccp_tfrc_lib-y := loss_interval.o packet_history.o tfrc_equation.o
diff --git a/net/dccp/ccids/lib/loss_interval.c b/net/dccp/ccids/lib/loss_interval.c
deleted file mode 100644
index 48b9b93f8acb..000000000000
--- a/net/dccp/ccids/lib/loss_interval.c
+++ /dev/null
@@ -1,143 +0,0 @@
-/*
- * net/dccp/ccids/lib/loss_interval.c
- *
- * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand.
- * Copyright (c) 2005-6 Ian McDonald <ian.mcdonald@jandi.co.nz>
- * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
-#include <linux/module.h>
-#include <net/sock.h>
-#include "../../dccp.h"
-#include "loss_interval.h"
-
-struct dccp_li_hist *dccp_li_hist_new(const char *name)
-{
- struct dccp_li_hist *hist = kmalloc(sizeof(*hist), GFP_ATOMIC);
- static const char dccp_li_hist_mask[] = "li_hist_%s";
- char *slab_name;
-
- if (hist == NULL)
- goto out;
-
- slab_name = kmalloc(strlen(name) + sizeof(dccp_li_hist_mask) - 1,
- GFP_ATOMIC);
- if (slab_name == NULL)
- goto out_free_hist;
-
- sprintf(slab_name, dccp_li_hist_mask, name);
- hist->dccplih_slab = kmem_cache_create(slab_name,
- sizeof(struct dccp_li_hist_entry),
- 0, SLAB_HWCACHE_ALIGN,
- NULL, NULL);
- if (hist->dccplih_slab == NULL)
- goto out_free_slab_name;
-out:
- return hist;
-out_free_slab_name:
- kfree(slab_name);
-out_free_hist:
- kfree(hist);
- hist = NULL;
- goto out;
-}
-
-EXPORT_SYMBOL_GPL(dccp_li_hist_new);
-
-void dccp_li_hist_delete(struct dccp_li_hist *hist)
-{
- const char* name = kmem_cache_name(hist->dccplih_slab);
-
- kmem_cache_destroy(hist->dccplih_slab);
- kfree(name);
- kfree(hist);
-}
-
-EXPORT_SYMBOL_GPL(dccp_li_hist_delete);
-
-void dccp_li_hist_purge(struct dccp_li_hist *hist, struct list_head *list)
-{
- struct dccp_li_hist_entry *entry, *next;
-
- list_for_each_entry_safe(entry, next, list, dccplih_node) {
- list_del_init(&entry->dccplih_node);
- kmem_cache_free(hist->dccplih_slab, entry);
- }
-}
-
-EXPORT_SYMBOL_GPL(dccp_li_hist_purge);
-
-/* Weights used to calculate loss event rate */
-/*
- * These are integers as per section 8 of RFC3448. We can then divide by 4 *
- * when we use it.
- */
-static const int dccp_li_hist_w[DCCP_LI_HIST_IVAL_F_LENGTH] = {
- 4, 4, 4, 4, 3, 2, 1, 1,
-};
-
-u32 dccp_li_hist_calc_i_mean(struct list_head *list)
-{
- struct dccp_li_hist_entry *li_entry, *li_next;
- int i = 0;
- u32 i_tot;
- u32 i_tot0 = 0;
- u32 i_tot1 = 0;
- u32 w_tot = 0;
-
- list_for_each_entry_safe(li_entry, li_next, list, dccplih_node) {
- if (li_entry->dccplih_interval != ~0) {
- i_tot0 += li_entry->dccplih_interval * dccp_li_hist_w[i];
- w_tot += dccp_li_hist_w[i];
- if (i != 0)
- i_tot1 += li_entry->dccplih_interval * dccp_li_hist_w[i - 1];
- }
-
-
- if (++i > DCCP_LI_HIST_IVAL_F_LENGTH)
- break;
- }
-
- if (i != DCCP_LI_HIST_IVAL_F_LENGTH)
- return 0;
-
- i_tot = max(i_tot0, i_tot1);
-
- if (!w_tot) {
- DCCP_WARN("w_tot = 0\n");
- return 1;
- }
-
- return i_tot / w_tot;
-}
-
-EXPORT_SYMBOL_GPL(dccp_li_hist_calc_i_mean);
-
-int dccp_li_hist_interval_new(struct dccp_li_hist *hist,
- struct list_head *list, const u64 seq_loss, const u8 win_loss)
-{
- struct dccp_li_hist_entry *entry;
- int i;
-
- for (i = 0; i < DCCP_LI_HIST_IVAL_F_LENGTH; i++) {
- entry = dccp_li_hist_entry_new(hist, SLAB_ATOMIC);
- if (entry == NULL) {
- dccp_li_hist_purge(hist, list);
- DCCP_BUG("loss interval list entry is NULL");
- return 0;
- }
- entry->dccplih_interval = ~0;
- list_add(&entry->dccplih_node, list);
- }
-
- entry->dccplih_seqno = seq_loss;
- entry->dccplih_win_count = win_loss;
- return 1;
-}
-
-EXPORT_SYMBOL_GPL(dccp_li_hist_interval_new);
diff --git a/net/dccp/ccids/lib/loss_interval.h b/net/dccp/ccids/lib/loss_interval.h
deleted file mode 100644
index 0ae85f0340b2..000000000000
--- a/net/dccp/ccids/lib/loss_interval.h
+++ /dev/null
@@ -1,57 +0,0 @@
-#ifndef _DCCP_LI_HIST_
-#define _DCCP_LI_HIST_
-/*
- * net/dccp/ccids/lib/loss_interval.h
- *
- * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand.
- * Copyright (c) 2005 Ian McDonald <ian.mcdonald@jandi.co.nz>
- * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- */
-
-#include <linux/list.h>
-#include <linux/slab.h>
-#include <linux/time.h>
-
-#define DCCP_LI_HIST_IVAL_F_LENGTH 8
-
-struct dccp_li_hist {
- kmem_cache_t *dccplih_slab;
-};
-
-extern struct dccp_li_hist *dccp_li_hist_new(const char *name);
-extern void dccp_li_hist_delete(struct dccp_li_hist *hist);
-
-struct dccp_li_hist_entry {
- struct list_head dccplih_node;
- u64 dccplih_seqno:48,
- dccplih_win_count:4;
- u32 dccplih_interval;
-};
-
-static inline struct dccp_li_hist_entry *
- dccp_li_hist_entry_new(struct dccp_li_hist *hist,
- const gfp_t prio)
-{
- return kmem_cache_alloc(hist->dccplih_slab, prio);
-}
-
-static inline void dccp_li_hist_entry_delete(struct dccp_li_hist *hist,
- struct dccp_li_hist_entry *entry)
-{
- if (entry != NULL)
- kmem_cache_free(hist->dccplih_slab, entry);
-}
-
-extern void dccp_li_hist_purge(struct dccp_li_hist *hist,
- struct list_head *list);
-
-extern u32 dccp_li_hist_calc_i_mean(struct list_head *list);
-
-extern int dccp_li_hist_interval_new(struct dccp_li_hist *hist,
- struct list_head *list, const u64 seq_loss, const u8 win_loss);
-#endif /* _DCCP_LI_HIST_ */
diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c
deleted file mode 100644
index b876c9c81c65..000000000000
--- a/net/dccp/ccids/lib/packet_history.c
+++ /dev/null
@@ -1,295 +0,0 @@
-/*
- * net/dccp/packet_history.c
- *
- * Copyright (c) 2005-6 The University of Waikato, Hamilton, New Zealand.
- *
- * An implementation of the DCCP protocol
- *
- * This code has been developed by the University of Waikato WAND
- * research group. For further information please see http://www.wand.net.nz/
- * or e-mail Ian McDonald - ian.mcdonald@jandi.co.nz
- *
- * This code also uses code from Lulea University, rereleased as GPL by its
- * authors:
- * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
- *
- * Changes to meet Linux coding standards, to make it meet latest ccid3 draft
- * and to make it work as a loadable module in the DCCP stack written by
- * Arnaldo Carvalho de Melo <acme@conectiva.com.br>.
- *
- * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#include <linux/module.h>
-#include <linux/string.h>
-
-#include "packet_history.h"
-
-struct dccp_rx_hist *dccp_rx_hist_new(const char *name)
-{
- struct dccp_rx_hist *hist = kmalloc(sizeof(*hist), GFP_ATOMIC);
- static const char dccp_rx_hist_mask[] = "rx_hist_%s";
- char *slab_name;
-
- if (hist == NULL)
- goto out;
-
- slab_name = kmalloc(strlen(name) + sizeof(dccp_rx_hist_mask) - 1,
- GFP_ATOMIC);
- if (slab_name == NULL)
- goto out_free_hist;
-
- sprintf(slab_name, dccp_rx_hist_mask, name);
- hist->dccprxh_slab = kmem_cache_create(slab_name,
- sizeof(struct dccp_rx_hist_entry),
- 0, SLAB_HWCACHE_ALIGN,
- NULL, NULL);
- if (hist->dccprxh_slab == NULL)
- goto out_free_slab_name;
-out:
- return hist;
-out_free_slab_name:
- kfree(slab_name);
-out_free_hist:
- kfree(hist);
- hist = NULL;
- goto out;
-}
-
-EXPORT_SYMBOL_GPL(dccp_rx_hist_new);
-
-void dccp_rx_hist_delete(struct dccp_rx_hist *hist)
-{
- const char* name = kmem_cache_name(hist->dccprxh_slab);
-
- kmem_cache_destroy(hist->dccprxh_slab);
- kfree(name);
- kfree(hist);
-}
-
-EXPORT_SYMBOL_GPL(dccp_rx_hist_delete);
-
-void dccp_rx_hist_purge(struct dccp_rx_hist *hist, struct list_head *list)
-{
- struct dccp_rx_hist_entry *entry, *next;
-
- list_for_each_entry_safe(entry, next, list, dccphrx_node) {
- list_del_init(&entry->dccphrx_node);
- kmem_cache_free(hist->dccprxh_slab, entry);
- }
-}
-
-EXPORT_SYMBOL_GPL(dccp_rx_hist_purge);
-
-struct dccp_rx_hist_entry *
- dccp_rx_hist_find_data_packet(const struct list_head *list)
-{
- struct dccp_rx_hist_entry *entry, *packet = NULL;
-
- list_for_each_entry(entry, list, dccphrx_node)
- if (entry->dccphrx_type == DCCP_PKT_DATA ||
- entry->dccphrx_type == DCCP_PKT_DATAACK) {
- packet = entry;
- break;
- }
-
- return packet;
-}
-
-EXPORT_SYMBOL_GPL(dccp_rx_hist_find_data_packet);
-
-void dccp_rx_hist_add_packet(struct dccp_rx_hist *hist,
- struct list_head *rx_list,
- struct list_head *li_list,
- struct dccp_rx_hist_entry *packet,
- u64 nonloss_seqno)
-{
- struct dccp_rx_hist_entry *entry, *next;
- u8 num_later = 0;
-
- list_add(&packet->dccphrx_node, rx_list);
-
- num_later = TFRC_RECV_NUM_LATE_LOSS + 1;
-
- if (!list_empty(li_list)) {
- list_for_each_entry_safe(entry, next, rx_list, dccphrx_node) {
- if (num_later == 0) {
- if (after48(nonloss_seqno,
- entry->dccphrx_seqno)) {
- list_del_init(&entry->dccphrx_node);
- dccp_rx_hist_entry_delete(hist, entry);
- }
- } else if (dccp_rx_hist_entry_data_packet(entry))
- --num_later;
- }
- } else {
- int step = 0;
- u8 win_count = 0; /* Not needed, but lets shut up gcc */
- int tmp;
- /*
- * We have no loss interval history so we need at least one
- * rtt:s of data packets to approximate rtt.
- */
- list_for_each_entry_safe(entry, next, rx_list, dccphrx_node) {
- if (num_later == 0) {
- switch (step) {
- case 0:
- step = 1;
- /* OK, find next data packet */
- num_later = 1;
- break;
- case 1:
- step = 2;
- /* OK, find next data packet */
- num_later = 1;
- win_count = entry->dccphrx_ccval;
- break;
- case 2:
- tmp = win_count - entry->dccphrx_ccval;
- if (tmp < 0)
- tmp += TFRC_WIN_COUNT_LIMIT;
- if (tmp > TFRC_WIN_COUNT_PER_RTT + 1) {
- /*
- * We have found a packet older
- * than one rtt remove the rest
- */
- step = 3;
- } else /* OK, find next data packet */
- num_later = 1;
- break;
- case 3:
- list_del_init(&entry->dccphrx_node);
- dccp_rx_hist_entry_delete(hist, entry);
- break;
- }
- } else if (dccp_rx_hist_entry_data_packet(entry))
- --num_later;
- }
- }
-}
-
-EXPORT_SYMBOL_GPL(dccp_rx_hist_add_packet);
-
-struct dccp_tx_hist *dccp_tx_hist_new(const char *name)
-{
- struct dccp_tx_hist *hist = kmalloc(sizeof(*hist), GFP_ATOMIC);
- static const char dccp_tx_hist_mask[] = "tx_hist_%s";
- char *slab_name;
-
- if (hist == NULL)
- goto out;
-
- slab_name = kmalloc(strlen(name) + sizeof(dccp_tx_hist_mask) - 1,
- GFP_ATOMIC);
- if (slab_name == NULL)
- goto out_free_hist;
-
- sprintf(slab_name, dccp_tx_hist_mask, name);
- hist->dccptxh_slab = kmem_cache_create(slab_name,
- sizeof(struct dccp_tx_hist_entry),
- 0, SLAB_HWCACHE_ALIGN,
- NULL, NULL);
- if (hist->dccptxh_slab == NULL)
- goto out_free_slab_name;
-out:
- return hist;
-out_free_slab_name:
- kfree(slab_name);
-out_free_hist:
- kfree(hist);
- hist = NULL;
- goto out;
-}
-
-EXPORT_SYMBOL_GPL(dccp_tx_hist_new);
-
-void dccp_tx_hist_delete(struct dccp_tx_hist *hist)
-{
- const char* name = kmem_cache_name(hist->dccptxh_slab);
-
- kmem_cache_destroy(hist->dccptxh_slab);
- kfree(name);
- kfree(hist);
-}
-
-EXPORT_SYMBOL_GPL(dccp_tx_hist_delete);
-
-struct dccp_tx_hist_entry *
- dccp_tx_hist_find_entry(const struct list_head *list, const u64 seq)
-{
- struct dccp_tx_hist_entry *packet = NULL, *entry;
-
- list_for_each_entry(entry, list, dccphtx_node)
- if (entry->dccphtx_seqno == seq) {
- packet = entry;
- break;
- }
-
- return packet;
-}
-
-EXPORT_SYMBOL_GPL(dccp_tx_hist_find_entry);
-
-int dccp_rx_hist_find_entry(const struct list_head *list, const u64 seq,
- u8 *ccval)
-{
- struct dccp_rx_hist_entry *packet = NULL, *entry;
-
- list_for_each_entry(entry, list, dccphrx_node)
- if (entry->dccphrx_seqno == seq) {
- packet = entry;
- break;
- }
-
- if (packet)
- *ccval = packet->dccphrx_ccval;
-
- return packet != NULL;
-}
-
-EXPORT_SYMBOL_GPL(dccp_rx_hist_find_entry);
-
-void dccp_tx_hist_purge_older(struct dccp_tx_hist *hist,
- struct list_head *list,
- struct dccp_tx_hist_entry *packet)
-{
- struct dccp_tx_hist_entry *next;
-
- list_for_each_entry_safe_continue(packet, next, list, dccphtx_node) {
- list_del_init(&packet->dccphtx_node);
- dccp_tx_hist_entry_delete(hist, packet);
- }
-}
-
-EXPORT_SYMBOL_GPL(dccp_tx_hist_purge_older);
-
-void dccp_tx_hist_purge(struct dccp_tx_hist *hist, struct list_head *list)
-{
- struct dccp_tx_hist_entry *entry, *next;
-
- list_for_each_entry_safe(entry, next, list, dccphtx_node) {
- list_del_init(&entry->dccphtx_node);
- dccp_tx_hist_entry_delete(hist, entry);
- }
-}
-
-EXPORT_SYMBOL_GPL(dccp_tx_hist_purge);
-
-MODULE_AUTHOR("Ian McDonald <ian.mcdonald@jandi.co.nz>, "
- "Arnaldo Carvalho de Melo <acme@ghostprotocols.net>");
-MODULE_DESCRIPTION("DCCP TFRC library");
-MODULE_LICENSE("GPL");
diff --git a/net/dccp/ccids/lib/packet_history.h b/net/dccp/ccids/lib/packet_history.h
deleted file mode 100644
index 067cf1c85a37..000000000000
--- a/net/dccp/ccids/lib/packet_history.h
+++ /dev/null
@@ -1,196 +0,0 @@
-/*
- * net/dccp/packet_history.h
- *
- * Copyright (c) 2005-6 The University of Waikato, Hamilton, New Zealand.
- *
- * An implementation of the DCCP protocol
- *
- * This code has been developed by the University of Waikato WAND
- * research group. For further information please see http://www.wand.net.nz/
- * or e-mail Ian McDonald - ian.mcdonald@jandi.co.nz
- *
- * This code also uses code from Lulea University, rereleased as GPL by its
- * authors:
- * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
- *
- * Changes to meet Linux coding standards, to make it meet latest ccid3 draft
- * and to make it work as a loadable module in the DCCP stack written by
- * Arnaldo Carvalho de Melo <acme@conectiva.com.br>.
- *
- * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#ifndef _DCCP_PKT_HIST_
-#define _DCCP_PKT_HIST_
-
-#include <linux/list.h>
-#include <linux/slab.h>
-#include <linux/time.h>
-
-#include "../../dccp.h"
-
-/* Number of later packets received before one is considered lost */
-#define TFRC_RECV_NUM_LATE_LOSS 3
-
-#define TFRC_WIN_COUNT_PER_RTT 4
-#define TFRC_WIN_COUNT_LIMIT 16
-
-struct dccp_tx_hist_entry {
- struct list_head dccphtx_node;
- u64 dccphtx_seqno:48,
- dccphtx_ccval:4,
- dccphtx_sent:1;
- u32 dccphtx_rtt;
- struct timeval dccphtx_tstamp;
-};
-
-struct dccp_rx_hist_entry {
- struct list_head dccphrx_node;
- u64 dccphrx_seqno:48,
- dccphrx_ccval:4,
- dccphrx_type:4;
- u32 dccphrx_ndp; /* In fact it is from 8 to 24 bits */
- struct timeval dccphrx_tstamp;
-};
-
-struct dccp_tx_hist {
- kmem_cache_t *dccptxh_slab;
-};
-
-extern struct dccp_tx_hist *dccp_tx_hist_new(const char *name);
-extern void dccp_tx_hist_delete(struct dccp_tx_hist *hist);
-
-struct dccp_rx_hist {
- kmem_cache_t *dccprxh_slab;
-};
-
-extern struct dccp_rx_hist *dccp_rx_hist_new(const char *name);
-extern void dccp_rx_hist_delete(struct dccp_rx_hist *hist);
-extern struct dccp_rx_hist_entry *
- dccp_rx_hist_find_data_packet(const struct list_head *list);
-
-static inline struct dccp_tx_hist_entry *
- dccp_tx_hist_entry_new(struct dccp_tx_hist *hist,
- const gfp_t prio)
-{
- struct dccp_tx_hist_entry *entry = kmem_cache_alloc(hist->dccptxh_slab,
- prio);
-
- if (entry != NULL)
- entry->dccphtx_sent = 0;
-
- return entry;
-}
-
-static inline void dccp_tx_hist_entry_delete(struct dccp_tx_hist *hist,
- struct dccp_tx_hist_entry *entry)
-{
- if (entry != NULL)
- kmem_cache_free(hist->dccptxh_slab, entry);
-}
-
-extern struct dccp_tx_hist_entry *
- dccp_tx_hist_find_entry(const struct list_head *list,
- const u64 seq);
-extern int dccp_rx_hist_find_entry(const struct list_head *list, const u64 seq,
- u8 *ccval);
-
-static inline void dccp_tx_hist_add_entry(struct list_head *list,
- struct dccp_tx_hist_entry *entry)
-{
- list_add(&entry->dccphtx_node, list);
-}
-
-extern void dccp_tx_hist_purge_older(struct dccp_tx_hist *hist,
- struct list_head *list,
- struct dccp_tx_hist_entry *next);
-
-extern void dccp_tx_hist_purge(struct dccp_tx_hist *hist,
- struct list_head *list);
-
-static inline struct dccp_tx_hist_entry *
- dccp_tx_hist_head(struct list_head *list)
-{
- struct dccp_tx_hist_entry *head = NULL;
-
- if (!list_empty(list))
- head = list_entry(list->next, struct dccp_tx_hist_entry,
- dccphtx_node);
- return head;
-}
-
-static inline struct dccp_rx_hist_entry *
- dccp_rx_hist_entry_new(struct dccp_rx_hist *hist,
- const struct sock *sk,
- const u32 ndp,
- const struct sk_buff *skb,
- const gfp_t prio)
-{
- struct dccp_rx_hist_entry *entry = kmem_cache_alloc(hist->dccprxh_slab,
- prio);
-
- if (entry != NULL) {
- const struct dccp_hdr *dh = dccp_hdr(skb);
-
- entry->dccphrx_seqno = DCCP_SKB_CB(skb)->dccpd_seq;
- entry->dccphrx_ccval = dh->dccph_ccval;
- entry->dccphrx_type = dh->dccph_type;
- entry->dccphrx_ndp = ndp;
- dccp_timestamp(sk, &entry->dccphrx_tstamp);
- }
-
- return entry;
-}
-
-static inline void dccp_rx_hist_entry_delete(struct dccp_rx_hist *hist,
- struct dccp_rx_hist_entry *entry)
-{
- if (entry != NULL)
- kmem_cache_free(hist->dccprxh_slab, entry);
-}
-
-extern void dccp_rx_hist_purge(struct dccp_rx_hist *hist,
- struct list_head *list);
-
-static inline struct dccp_rx_hist_entry *
- dccp_rx_hist_head(struct list_head *list)
-{
- struct dccp_rx_hist_entry *head = NULL;
-
- if (!list_empty(list))
- head = list_entry(list->next, struct dccp_rx_hist_entry,
- dccphrx_node);
- return head;
-}
-
-static inline int
- dccp_rx_hist_entry_data_packet(const struct dccp_rx_hist_entry *entry)
-{
- return entry->dccphrx_type == DCCP_PKT_DATA ||
- entry->dccphrx_type == DCCP_PKT_DATAACK;
-}
-
-extern void dccp_rx_hist_add_packet(struct dccp_rx_hist *hist,
- struct list_head *rx_list,
- struct list_head *li_list,
- struct dccp_rx_hist_entry *packet,
- u64 nonloss_seqno);
-
-extern u64 dccp_rx_hist_detect_loss(struct list_head *rx_list,
- struct list_head *li_list, u8 *win_loss);
-
-#endif /* _DCCP_PKT_HIST_ */
diff --git a/net/dccp/ccids/lib/tfrc.h b/net/dccp/ccids/lib/tfrc.h
deleted file mode 100644
index 45f30f59ea2a..000000000000
--- a/net/dccp/ccids/lib/tfrc.h
+++ /dev/null
@@ -1,22 +0,0 @@
-#ifndef _TFRC_H_
-#define _TFRC_H_
-/*
- * net/dccp/ccids/lib/tfrc.h
- *
- * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand.
- * Copyright (c) 2005 Ian McDonald <ian.mcdonald@jandi.co.nz>
- * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
-#include <linux/types.h>
-
-extern u32 tfrc_calc_x(u16 s, u32 R, u32 p);
-extern u32 tfrc_calc_x_reverse_lookup(u32 fvalue);
-
-#endif /* _TFRC_H_ */
diff --git a/net/dccp/ccids/lib/tfrc_equation.c b/net/dccp/ccids/lib/tfrc_equation.c
deleted file mode 100644
index 2601012383fb..000000000000
--- a/net/dccp/ccids/lib/tfrc_equation.c
+++ /dev/null
@@ -1,643 +0,0 @@
-/*
- * net/dccp/ccids/lib/tfrc_equation.c
- *
- * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand.
- * Copyright (c) 2005 Ian McDonald <ian.mcdonald@jandi.co.nz>
- * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
-#include <linux/module.h>
-#include <asm/div64.h>
-#include "../../dccp.h"
-#include "tfrc.h"
-
-#define TFRC_CALC_X_ARRSIZE 500
-
-#define TFRC_CALC_X_SPLIT 50000
-/* equivalent to 0.05 */
-
-static const u32 tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE][2] = {
- { 37172, 8172 },
- { 53499, 11567 },
- { 66664, 14180 },
- { 78298, 16388 },
- { 89021, 18339 },
- { 99147, 20108 },
- { 108858, 21738 },
- { 118273, 23260 },
- { 127474, 24693 },
- { 136520, 26052 },
- { 145456, 27348 },
- { 154316, 28589 },
- { 163130, 29783 },
- { 171919, 30935 },
- { 180704, 32049 },
- { 189502, 33130 },
- { 198328, 34180 },
- { 207194, 35202 },
- { 216114, 36198 },
- { 225097, 37172 },
- { 234153, 38123 },
- { 243294, 39055 },
- { 252527, 39968 },
- { 261861, 40864 },
- { 271305, 41743 },
- { 280866, 42607 },
- { 290553, 43457 },
- { 300372, 44293 },
- { 310333, 45117 },
- { 320441, 45929 },
- { 330705, 46729 },
- { 341131, 47518 },
- { 351728, 48297 },
- { 362501, 49066 },
- { 373460, 49826 },
- { 384609, 50577 },
- { 395958, 51320 },
- { 407513, 52054 },
- { 419281, 52780 },
- { 431270, 53499 },
- { 443487, 54211 },
- { 455940, 54916 },
- { 468635, 55614 },
- { 481581, 56306 },
- { 494785, 56991 },
- { 508254, 57671 },
- { 521996, 58345 },
- { 536019, 59014 },
- { 550331, 59677 },
- { 564939, 60335 },
- { 579851, 60988 },
- { 595075, 61636 },
- { 610619, 62279 },
- { 626491, 62918 },
- { 642700, 63553 },
- { 659253, 64183 },
- { 676158, 64809 },
- { 693424, 65431 },
- { 711060, 66050 },
- { 729073, 66664 },
- { 747472, 67275 },
- { 766266, 67882 },
- { 785464, 68486 },
- { 805073, 69087 },
- { 825103, 69684 },
- { 845562, 70278 },
- { 866460, 70868 },
- { 887805, 71456 },
- { 909606, 72041 },
- { 931873, 72623 },
- { 954614, 73202 },
- { 977839, 73778 },
- { 1001557, 74352 },
- { 1025777, 74923 },
- { 1050508, 75492 },
- { 1075761, 76058 },
- { 1101544, 76621 },
- { 1127867, 77183 },
- { 1154739, 77741 },
- { 1182172, 78298 },
- { 1210173, 78852 },
- { 1238753, 79405 },
- { 1267922, 79955 },
- { 1297689, 80503 },
- { 1328066, 81049 },
- { 1359060, 81593 },
- { 1390684, 82135 },
- { 1422947, 82675 },
- { 1455859, 83213 },
- { 1489430, 83750 },
- { 1523671, 84284 },
- { 1558593, 84817 },
- { 1594205, 85348 },
- { 1630518, 85878 },
- { 1667543, 86406 },
- { 1705290, 86932 },
- { 1743770, 87457 },
- { 1782994, 87980 },
- { 1822973, 88501 },
- { 1863717, 89021 },
- { 1905237, 89540 },
- { 1947545, 90057 },
- { 1990650, 90573 },
- { 2034566, 91087 },
- { 2079301, 91600 },
- { 2124869, 92111 },
- { 2171279, 92622 },
- { 2218543, 93131 },
- { 2266673, 93639 },
- { 2315680, 94145 },
- { 2365575, 94650 },
- { 2416371, 95154 },
- { 2468077, 95657 },
- { 2520707, 96159 },
- { 2574271, 96660 },
- { 2628782, 97159 },
- { 2684250, 97658 },
- { 2740689, 98155 },
- { 2798110, 98651 },
- { 2856524, 99147 },
- { 2915944, 99641 },
- { 2976382, 100134 },
- { 3037850, 100626 },
- { 3100360, 101117 },
- { 3163924, 101608 },
- { 3228554, 102097 },
- { 3294263, 102586 },
- { 3361063, 103073 },
- { 3428966, 103560 },
- { 3497984, 104045 },
- { 3568131, 104530 },
- { 3639419, 105014 },
- { 3711860, 105498 },
- { 3785467, 105980 },
- { 3860253, 106462 },
- { 3936229, 106942 },
- { 4013410, 107422 },
- { 4091808, 107902 },
- { 4171435, 108380 },
- { 4252306, 108858 },
- { 4334431, 109335 },
- { 4417825, 109811 },
- { 4502501, 110287 },
- { 4588472, 110762 },
- { 4675750, 111236 },
- { 4764349, 111709 },
- { 4854283, 112182 },
- { 4945564, 112654 },
- { 5038206, 113126 },
- { 5132223, 113597 },
- { 5227627, 114067 },
- { 5324432, 114537 },
- { 5422652, 115006 },
- { 5522299, 115474 },
- { 5623389, 115942 },
- { 5725934, 116409 },
- { 5829948, 116876 },
- { 5935446, 117342 },
- { 6042439, 117808 },
- { 6150943, 118273 },
- { 6260972, 118738 },
- { 6372538, 119202 },
- { 6485657, 119665 },
- { 6600342, 120128 },
- { 6716607, 120591 },
- { 6834467, 121053 },
- { 6953935, 121514 },
- { 7075025, 121976 },
- { 7197752, 122436 },
- { 7322131, 122896 },
- { 7448175, 123356 },
- { 7575898, 123815 },
- { 7705316, 124274 },
- { 7836442, 124733 },
- { 7969291, 125191 },
- { 8103877, 125648 },
- { 8240216, 126105 },
- { 8378321, 126562 },
- { 8518208, 127018 },
- { 8659890, 127474 },
- { 8803384, 127930 },
- { 8948702, 128385 },
- { 9095861, 128840 },
- { 9244875, 129294 },
- { 9395760, 129748 },
- { 9548529, 130202 },
- { 9703198, 130655 },
- { 9859782, 131108 },
- { 10018296, 131561 },
- { 10178755, 132014 },
- { 10341174, 132466 },
- { 10505569, 132917 },
- { 10671954, 133369 },
- { 10840345, 133820 },
- { 11010757, 134271 },
- { 11183206, 134721 },
- { 11357706, 135171 },
- { 11534274, 135621 },
- { 11712924, 136071 },
- { 11893673, 136520 },
- { 12076536, 136969 },
- { 12261527, 137418 },
- { 12448664, 137867 },
- { 12637961, 138315 },
- { 12829435, 138763 },
- { 13023101, 139211 },
- { 13218974, 139658 },
- { 13417071, 140106 },
- { 13617407, 140553 },
- { 13819999, 140999 },
- { 14024862, 141446 },
- { 14232012, 141892 },
- { 14441465, 142339 },
- { 14653238, 142785 },
- { 14867346, 143230 },
- { 15083805, 143676 },
- { 15302632, 144121 },
- { 15523842, 144566 },
- { 15747453, 145011 },
- { 15973479, 145456 },
- { 16201939, 145900 },
- { 16432847, 146345 },
- { 16666221, 146789 },
- { 16902076, 147233 },
- { 17140429, 147677 },
- { 17381297, 148121 },
- { 17624696, 148564 },
- { 17870643, 149007 },
- { 18119154, 149451 },
- { 18370247, 149894 },
- { 18623936, 150336 },
- { 18880241, 150779 },
- { 19139176, 151222 },
- { 19400759, 151664 },
- { 19665007, 152107 },
- { 19931936, 152549 },
- { 20201564, 152991 },
- { 20473907, 153433 },
- { 20748982, 153875 },
- { 21026807, 154316 },
- { 21307399, 154758 },
- { 21590773, 155199 },
- { 21876949, 155641 },
- { 22165941, 156082 },
- { 22457769, 156523 },
- { 22752449, 156964 },
- { 23049999, 157405 },
- { 23350435, 157846 },
- { 23653774, 158287 },
- { 23960036, 158727 },
- { 24269236, 159168 },
- { 24581392, 159608 },
- { 24896521, 160049 },
- { 25214642, 160489 },
- { 25535772, 160929 },
- { 25859927, 161370 },
- { 26187127, 161810 },
- { 26517388, 162250 },
- { 26850728, 162690 },
- { 27187165, 163130 },
- { 27526716, 163569 },
- { 27869400, 164009 },
- { 28215234, 164449 },
- { 28564236, 164889 },
- { 28916423, 165328 },
- { 29271815, 165768 },
- { 29630428, 166208 },
- { 29992281, 166647 },
- { 30357392, 167087 },
- { 30725779, 167526 },
- { 31097459, 167965 },
- { 31472452, 168405 },
- { 31850774, 168844 },
- { 32232445, 169283 },
- { 32617482, 169723 },
- { 33005904, 170162 },
- { 33397730, 170601 },
- { 33792976, 171041 },
- { 34191663, 171480 },
- { 34593807, 171919 },
- { 34999428, 172358 },
- { 35408544, 172797 },
- { 35821174, 173237 },
- { 36237335, 173676 },
- { 36657047, 174115 },
- { 37080329, 174554 },
- { 37507197, 174993 },
- { 37937673, 175433 },
- { 38371773, 175872 },
- { 38809517, 176311 },
- { 39250924, 176750 },
- { 39696012, 177190 },
- { 40144800, 177629 },
- { 40597308, 178068 },
- { 41053553, 178507 },
- { 41513554, 178947 },
- { 41977332, 179386 },
- { 42444904, 179825 },
- { 42916290, 180265 },
- { 43391509, 180704 },
- { 43870579, 181144 },
- { 44353520, 181583 },
- { 44840352, 182023 },
- { 45331092, 182462 },
- { 45825761, 182902 },
- { 46324378, 183342 },
- { 46826961, 183781 },
- { 47333531, 184221 },
- { 47844106, 184661 },
- { 48358706, 185101 },
- { 48877350, 185541 },
- { 49400058, 185981 },
- { 49926849, 186421 },
- { 50457743, 186861 },
- { 50992759, 187301 },
- { 51531916, 187741 },
- { 52075235, 188181 },
- { 52622735, 188622 },
- { 53174435, 189062 },
- { 53730355, 189502 },
- { 54290515, 189943 },
- { 54854935, 190383 },
- { 55423634, 190824 },
- { 55996633, 191265 },
- { 56573950, 191706 },
- { 57155606, 192146 },
- { 57741621, 192587 },
- { 58332014, 193028 },
- { 58926806, 193470 },
- { 59526017, 193911 },
- { 60129666, 194352 },
- { 60737774, 194793 },
- { 61350361, 195235 },
- { 61967446, 195677 },
- { 62589050, 196118 },
- { 63215194, 196560 },
- { 63845897, 197002 },
- { 64481179, 197444 },
- { 65121061, 197886 },
- { 65765563, 198328 },
- { 66414705, 198770 },
- { 67068508, 199213 },
- { 67726992, 199655 },
- { 68390177, 200098 },
- { 69058085, 200540 },
- { 69730735, 200983 },
- { 70408147, 201426 },
- { 71090343, 201869 },
- { 71777343, 202312 },
- { 72469168, 202755 },
- { 73165837, 203199 },
- { 73867373, 203642 },
- { 74573795, 204086 },
- { 75285124, 204529 },
- { 76001380, 204973 },
- { 76722586, 205417 },
- { 77448761, 205861 },
- { 78179926, 206306 },
- { 78916102, 206750 },
- { 79657310, 207194 },
- { 80403571, 207639 },
- { 81154906, 208084 },
- { 81911335, 208529 },
- { 82672880, 208974 },
- { 83439562, 209419 },
- { 84211402, 209864 },
- { 84988421, 210309 },
- { 85770640, 210755 },
- { 86558080, 211201 },
- { 87350762, 211647 },
- { 88148708, 212093 },
- { 88951938, 212539 },
- { 89760475, 212985 },
- { 90574339, 213432 },
- { 91393551, 213878 },
- { 92218133, 214325 },
- { 93048107, 214772 },
- { 93883493, 215219 },
- { 94724314, 215666 },
- { 95570590, 216114 },
- { 96422343, 216561 },
- { 97279594, 217009 },
- { 98142366, 217457 },
- { 99010679, 217905 },
- { 99884556, 218353 },
- { 100764018, 218801 },
- { 101649086, 219250 },
- { 102539782, 219698 },
- { 103436128, 220147 },
- { 104338146, 220596 },
- { 105245857, 221046 },
- { 106159284, 221495 },
- { 107078448, 221945 },
- { 108003370, 222394 },
- { 108934074, 222844 },
- { 109870580, 223294 },
- { 110812910, 223745 },
- { 111761087, 224195 },
- { 112715133, 224646 },
- { 113675069, 225097 },
- { 114640918, 225548 },
- { 115612702, 225999 },
- { 116590442, 226450 },
- { 117574162, 226902 },
- { 118563882, 227353 },
- { 119559626, 227805 },
- { 120561415, 228258 },
- { 121569272, 228710 },
- { 122583219, 229162 },
- { 123603278, 229615 },
- { 124629471, 230068 },
- { 125661822, 230521 },
- { 126700352, 230974 },
- { 127745083, 231428 },
- { 128796039, 231882 },
- { 129853241, 232336 },
- { 130916713, 232790 },
- { 131986475, 233244 },
- { 133062553, 233699 },
- { 134144966, 234153 },
- { 135233739, 234608 },
- { 136328894, 235064 },
- { 137430453, 235519 },
- { 138538440, 235975 },
- { 139652876, 236430 },
- { 140773786, 236886 },
- { 141901190, 237343 },
- { 143035113, 237799 },
- { 144175576, 238256 },
- { 145322604, 238713 },
- { 146476218, 239170 },
- { 147636442, 239627 },
- { 148803298, 240085 },
- { 149976809, 240542 },
- { 151156999, 241000 },
- { 152343890, 241459 },
- { 153537506, 241917 },
- { 154737869, 242376 },
- { 155945002, 242835 },
- { 157158929, 243294 },
- { 158379673, 243753 },
- { 159607257, 244213 },
- { 160841704, 244673 },
- { 162083037, 245133 },
- { 163331279, 245593 },
- { 164586455, 246054 },
- { 165848586, 246514 },
- { 167117696, 246975 },
- { 168393810, 247437 },
- { 169676949, 247898 },
- { 170967138, 248360 },
- { 172264399, 248822 },
- { 173568757, 249284 },
- { 174880235, 249747 },
- { 176198856, 250209 },
- { 177524643, 250672 },
- { 178857621, 251136 },
- { 180197813, 251599 },
- { 181545242, 252063 },
- { 182899933, 252527 },
- { 184261908, 252991 },
- { 185631191, 253456 },
- { 187007807, 253920 },
- { 188391778, 254385 },
- { 189783129, 254851 },
- { 191181884, 255316 },
- { 192588065, 255782 },
- { 194001698, 256248 },
- { 195422805, 256714 },
- { 196851411, 257181 },
- { 198287540, 257648 },
- { 199731215, 258115 },
- { 201182461, 258582 },
- { 202641302, 259050 },
- { 204107760, 259518 },
- { 205581862, 259986 },
- { 207063630, 260454 },
- { 208553088, 260923 },
- { 210050262, 261392 },
- { 211555174, 261861 },
- { 213067849, 262331 },
- { 214588312, 262800 },
- { 216116586, 263270 },
- { 217652696, 263741 },
- { 219196666, 264211 },
- { 220748520, 264682 },
- { 222308282, 265153 },
- { 223875978, 265625 },
- { 225451630, 266097 },
- { 227035265, 266569 },
- { 228626905, 267041 },
- { 230226576, 267514 },
- { 231834302, 267986 },
- { 233450107, 268460 },
- { 235074016, 268933 },
- { 236706054, 269407 },
- { 238346244, 269881 },
- { 239994613, 270355 },
- { 241651183, 270830 },
- { 243315981, 271305 }
-};
-
-/* Calculate the send rate as per section 3.1 of RFC3448
-
-Returns send rate in bytes per second
-
-Integer maths and lookups are used as not allowed floating point in kernel
-
-The function for Xcalc as per section 3.1 of RFC3448 is:
-
-X = s
- -------------------------------------------------------------
- R*sqrt(2*b*p/3) + (t_RTO * (3*sqrt(3*b*p/8) * p * (1+32*p^2)))
-
-where
-X is the trasmit rate in bytes/second
-s is the packet size in bytes
-R is the round trip time in seconds
-p is the loss event rate, between 0 and 1.0, of the number of loss events
- as a fraction of the number of packets transmitted
-t_RTO is the TCP retransmission timeout value in seconds
-b is the number of packets acknowledged by a single TCP acknowledgement
-
-we can assume that b = 1 and t_RTO is 4 * R. With this the equation becomes:
-
-X = s
- -----------------------------------------------------------------------
- R * sqrt(2 * p / 3) + (12 * R * (sqrt(3 * p / 8) * p * (1 + 32 * p^2)))
-
-
-which we can break down into:
-
-X = s
- --------
- R * f(p)
-
-where f(p) = sqrt(2 * p / 3) + (12 * sqrt(3 * p / 8) * p * (1 + 32 * p * p))
-
-Function parameters:
-s - bytes
-R - RTT in usecs
-p - loss rate (decimal fraction multiplied by 1,000,000)
-
-Returns Xcalc in bytes per second
-
-DON'T alter this code unless you run test cases against it as the code
-has been manipulated to stop underflow/overlow.
-
-*/
-u32 tfrc_calc_x(u16 s, u32 R, u32 p)
-{
- int index;
- u32 f;
- u64 tmp1, tmp2;
-
- if (p < TFRC_CALC_X_SPLIT)
- index = (p / (TFRC_CALC_X_SPLIT / TFRC_CALC_X_ARRSIZE)) - 1;
- else
- index = (p / (1000000 / TFRC_CALC_X_ARRSIZE)) - 1;
-
- if (index < 0)
- /* p should be 0 unless there is a bug in my code */
- index = 0;
-
- if (R == 0) {
- DCCP_WARN("RTT==0, setting to 1\n");
- R = 1; /* RTT can't be zero or else divide by zero */
- }
-
- BUG_ON(index >= TFRC_CALC_X_ARRSIZE);
-
- if (p >= TFRC_CALC_X_SPLIT)
- f = tfrc_calc_x_lookup[index][0];
- else
- f = tfrc_calc_x_lookup[index][1];
-
- tmp1 = ((u64)s * 100000000);
- tmp2 = ((u64)R * (u64)f);
- do_div(tmp2, 10000);
- do_div(tmp1, tmp2);
- /* Don't alter above math unless you test due to overflow on 32 bit */
-
- return (u32)tmp1;
-}
-
-EXPORT_SYMBOL_GPL(tfrc_calc_x);
-
-/*
- * args: fvalue - function value to match
- * returns: p closest to that value
- *
- * both fvalue and p are multiplied by 1,000,000 to use ints
- */
-u32 tfrc_calc_x_reverse_lookup(u32 fvalue)
-{
- int ctr = 0;
- int small;
-
- if (fvalue < tfrc_calc_x_lookup[0][1])
- return 0;
-
- if (fvalue <= tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][1])
- small = 1;
- else if (fvalue > tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][0])
- return 1000000;
- else
- small = 0;
-
- while (fvalue > tfrc_calc_x_lookup[ctr][small])
- ctr++;
-
- if (small)
- return TFRC_CALC_X_SPLIT * ctr / TFRC_CALC_X_ARRSIZE;
- else
- return 1000000 * ctr / TFRC_CALC_X_ARRSIZE;
-}
-
-EXPORT_SYMBOL_GPL(tfrc_calc_x_reverse_lookup);
diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
deleted file mode 100644
index 68886986c8e4..000000000000
--- a/net/dccp/dccp.h
+++ /dev/null
@@ -1,453 +0,0 @@
-#ifndef _DCCP_H
-#define _DCCP_H
-/*
- * net/dccp/dccp.h
- *
- * An implementation of the DCCP protocol
- * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- * Copyright (c) 2005-6 Ian McDonald <ian.mcdonald@jandi.co.nz>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/dccp.h>
-#include <net/snmp.h>
-#include <net/sock.h>
-#include <net/tcp.h>
-#include "ackvec.h"
-
-/*
- * DCCP - specific warning and debugging macros.
- */
-#define DCCP_WARN(fmt, a...) LIMIT_NETDEBUG(KERN_WARNING "%s: " fmt, \
- __FUNCTION__, ##a)
-#define DCCP_CRIT(fmt, a...) printk(KERN_CRIT fmt " at %s:%d/%s()\n", ##a, \
- __FILE__, __LINE__, __FUNCTION__)
-#define DCCP_BUG(a...) do { DCCP_CRIT("BUG: " a); dump_stack(); } while(0)
-#define DCCP_BUG_ON(cond) do { if (unlikely((cond) != 0)) \
- DCCP_BUG("\"%s\" holds (exception!)", \
- __stringify(cond)); \
- } while (0)
-
-#ifdef MODULE
-#define DCCP_PRINTK(enable, fmt, args...) do { if (enable) \
- printk(fmt, ##args); \
- } while(0)
-#else
-#define DCCP_PRINTK(enable, fmt, args...) printk(fmt, ##args)
-#endif
-#define DCCP_PR_DEBUG(enable, fmt, a...) DCCP_PRINTK(enable, KERN_DEBUG \
- "%s: " fmt, __FUNCTION__, ##a)
-
-#ifdef CONFIG_IP_DCCP_DEBUG
-extern int dccp_debug;
-#define dccp_pr_debug(format, a...) DCCP_PR_DEBUG(dccp_debug, format, ##a)
-#define dccp_pr_debug_cat(format, a...) DCCP_PRINTK(dccp_debug, format, ##a)
-#else
-#define dccp_pr_debug(format, a...)
-#define dccp_pr_debug_cat(format, a...)
-#endif
-
-extern struct inet_hashinfo dccp_hashinfo;
-
-extern atomic_t dccp_orphan_count;
-
-extern void dccp_time_wait(struct sock *sk, int state, int timeo);
-
-/*
- * Set safe upper bounds for header and option length. Since Data Offset is 8
- * bits (RFC 4340, sec. 5.1), the total header length can never be more than
- * 4 * 255 = 1020 bytes. The largest possible header length is 28 bytes (X=1):
- * - DCCP-Response with ACK Subheader and 4 bytes of Service code OR
- * - DCCP-Reset with ACK Subheader and 4 bytes of Reset Code fields
- * Hence a safe upper bound for the maximum option length is 1020-28 = 992
- */
-#define MAX_DCCP_SPECIFIC_HEADER (255 * sizeof(int))
-#define DCCP_MAX_PACKET_HDR 28
-#define DCCP_MAX_OPT_LEN (MAX_DCCP_SPECIFIC_HEADER - DCCP_MAX_PACKET_HDR)
-#define MAX_DCCP_HEADER (MAX_DCCP_SPECIFIC_HEADER + MAX_HEADER)
-
-#define DCCP_TIMEWAIT_LEN (60 * HZ) /* how long to wait to destroy TIME-WAIT
- * state, about 60 seconds */
-
-/* RFC 1122, 4.2.3.1 initial RTO value */
-#define DCCP_TIMEOUT_INIT ((unsigned)(3 * HZ))
-
-/* Maximal interval between probes for local resources. */
-#define DCCP_RESOURCE_PROBE_INTERVAL ((unsigned)(HZ / 2U))
-
-#define DCCP_RTO_MAX ((unsigned)(120 * HZ)) /* FIXME: using TCP value */
-
-#define DCCP_XMIT_TIMEO 30000 /* Time/msecs for blocking transmit per packet */
-
-/* sysctl variables for DCCP */
-extern int sysctl_dccp_request_retries;
-extern int sysctl_dccp_retries1;
-extern int sysctl_dccp_retries2;
-extern int sysctl_dccp_feat_sequence_window;
-extern int sysctl_dccp_feat_rx_ccid;
-extern int sysctl_dccp_feat_tx_ccid;
-extern int sysctl_dccp_feat_ack_ratio;
-extern int sysctl_dccp_feat_send_ack_vector;
-extern int sysctl_dccp_feat_send_ndp_count;
-extern int sysctl_dccp_tx_qlen;
-
-/* is seq1 < seq2 ? */
-static inline int before48(const u64 seq1, const u64 seq2)
-{
- return (s64)((seq1 << 16) - (seq2 << 16)) < 0;
-}
-
-/* is seq1 > seq2 ? */
-static inline int after48(const u64 seq1, const u64 seq2)
-{
- return (s64)((seq2 << 16) - (seq1 << 16)) < 0;
-}
-
-/* is seq2 <= seq1 <= seq3 ? */
-static inline int between48(const u64 seq1, const u64 seq2, const u64 seq3)
-{
- return (seq3 << 16) - (seq2 << 16) >= (seq1 << 16) - (seq2 << 16);
-}
-
-static inline u64 max48(const u64 seq1, const u64 seq2)
-{
- return after48(seq1, seq2) ? seq1 : seq2;
-}
-
-/* is seq1 next seqno after seq2 */
-static inline int follows48(const u64 seq1, const u64 seq2)
-{
- int diff = (seq1 & 0xFFFF) - (seq2 & 0xFFFF);
-
- return diff==1;
-}
-
-enum {
- DCCP_MIB_NUM = 0,
- DCCP_MIB_ACTIVEOPENS, /* ActiveOpens */
- DCCP_MIB_ESTABRESETS, /* EstabResets */
- DCCP_MIB_CURRESTAB, /* CurrEstab */
- DCCP_MIB_OUTSEGS, /* OutSegs */
- DCCP_MIB_OUTRSTS,
- DCCP_MIB_ABORTONTIMEOUT,
- DCCP_MIB_TIMEOUTS,
- DCCP_MIB_ABORTFAILED,
- DCCP_MIB_PASSIVEOPENS,
- DCCP_MIB_ATTEMPTFAILS,
- DCCP_MIB_OUTDATAGRAMS,
- DCCP_MIB_INERRS,
- DCCP_MIB_OPTMANDATORYERROR,
- DCCP_MIB_INVALIDOPT,
- __DCCP_MIB_MAX
-};
-
-#define DCCP_MIB_MAX __DCCP_MIB_MAX
-struct dccp_mib {
- unsigned long mibs[DCCP_MIB_MAX];
-} __SNMP_MIB_ALIGN__;
-
-DECLARE_SNMP_STAT(struct dccp_mib, dccp_statistics);
-#define DCCP_INC_STATS(field) SNMP_INC_STATS(dccp_statistics, field)
-#define DCCP_INC_STATS_BH(field) SNMP_INC_STATS_BH(dccp_statistics, field)
-#define DCCP_INC_STATS_USER(field) SNMP_INC_STATS_USER(dccp_statistics, field)
-#define DCCP_DEC_STATS(field) SNMP_DEC_STATS(dccp_statistics, field)
-#define DCCP_ADD_STATS_BH(field, val) \
- SNMP_ADD_STATS_BH(dccp_statistics, field, val)
-#define DCCP_ADD_STATS_USER(field, val) \
- SNMP_ADD_STATS_USER(dccp_statistics, field, val)
-
-/*
- * Checksumming routines
- */
-static inline int dccp_csum_coverage(const struct sk_buff *skb)
-{
- const struct dccp_hdr* dh = dccp_hdr(skb);
-
- if (dh->dccph_cscov == 0)
- return skb->len;
- return (dh->dccph_doff + dh->dccph_cscov - 1) * sizeof(u32);
-}
-
-static inline void dccp_csum_outgoing(struct sk_buff *skb)
-{
- int cov = dccp_csum_coverage(skb);
-
- if (cov >= skb->len)
- dccp_hdr(skb)->dccph_cscov = 0;
-
- skb->csum = skb_checksum(skb, 0, (cov > skb->len)? skb->len : cov, 0);
-}
-
-extern void dccp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb);
-
-extern int dccp_retransmit_skb(struct sock *sk, struct sk_buff *skb);
-
-extern void dccp_send_ack(struct sock *sk);
-extern void dccp_send_delayed_ack(struct sock *sk);
-extern void dccp_reqsk_send_ack(struct sk_buff *sk, struct request_sock *rsk);
-
-extern void dccp_send_sync(struct sock *sk, const u64 seq,
- const enum dccp_pkt_type pkt_type);
-
-extern void dccp_write_xmit(struct sock *sk, int block);
-extern void dccp_write_space(struct sock *sk);
-
-extern void dccp_init_xmit_timers(struct sock *sk);
-static inline void dccp_clear_xmit_timers(struct sock *sk)
-{
- inet_csk_clear_xmit_timers(sk);
-}
-
-extern unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu);
-
-extern const char *dccp_packet_name(const int type);
-extern const char *dccp_state_name(const int state);
-
-extern void dccp_set_state(struct sock *sk, const int state);
-extern void dccp_done(struct sock *sk);
-
-extern void dccp_reqsk_init(struct request_sock *req, struct sk_buff *skb);
-
-extern int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb);
-
-extern struct sock *dccp_create_openreq_child(struct sock *sk,
- const struct request_sock *req,
- const struct sk_buff *skb);
-
-extern int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb);
-
-extern struct sock *dccp_v4_request_recv_sock(struct sock *sk,
- struct sk_buff *skb,
- struct request_sock *req,
- struct dst_entry *dst);
-extern struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb,
- struct request_sock *req,
- struct request_sock **prev);
-
-extern int dccp_child_process(struct sock *parent, struct sock *child,
- struct sk_buff *skb);
-extern int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
- struct dccp_hdr *dh, unsigned len);
-extern int dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
- const struct dccp_hdr *dh, const unsigned len);
-
-extern int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized);
-extern int dccp_destroy_sock(struct sock *sk);
-
-extern void dccp_close(struct sock *sk, long timeout);
-extern struct sk_buff *dccp_make_response(struct sock *sk,
- struct dst_entry *dst,
- struct request_sock *req);
-
-extern int dccp_connect(struct sock *sk);
-extern int dccp_disconnect(struct sock *sk, int flags);
-extern void dccp_hash(struct sock *sk);
-extern void dccp_unhash(struct sock *sk);
-extern int dccp_getsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int __user *optlen);
-extern int dccp_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int optlen);
-#ifdef CONFIG_COMPAT
-extern int compat_dccp_getsockopt(struct sock *sk,
- int level, int optname,
- char __user *optval, int __user *optlen);
-extern int compat_dccp_setsockopt(struct sock *sk,
- int level, int optname,
- char __user *optval, int optlen);
-#endif
-extern int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg);
-extern int dccp_sendmsg(struct kiocb *iocb, struct sock *sk,
- struct msghdr *msg, size_t size);
-extern int dccp_recvmsg(struct kiocb *iocb, struct sock *sk,
- struct msghdr *msg, size_t len, int nonblock,
- int flags, int *addr_len);
-extern void dccp_shutdown(struct sock *sk, int how);
-extern int inet_dccp_listen(struct socket *sock, int backlog);
-extern unsigned int dccp_poll(struct file *file, struct socket *sock,
- poll_table *wait);
-extern int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr,
- int addr_len);
-
-extern int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code);
-extern void dccp_send_close(struct sock *sk, const int active);
-extern int dccp_invalid_packet(struct sk_buff *skb);
-
-static inline int dccp_bad_service_code(const struct sock *sk,
- const __be32 service)
-{
- const struct dccp_sock *dp = dccp_sk(sk);
-
- if (dp->dccps_service == service)
- return 0;
- return !dccp_list_has_service(dp->dccps_service_list, service);
-}
-
-struct dccp_skb_cb {
- __u8 dccpd_type:4;
- __u8 dccpd_ccval:4;
- __u8 dccpd_reset_code;
- __u16 dccpd_opt_len;
- __u64 dccpd_seq;
- __u64 dccpd_ack_seq;
-};
-
-#define DCCP_SKB_CB(__skb) ((struct dccp_skb_cb *)&((__skb)->cb[0]))
-
-static inline int dccp_non_data_packet(const struct sk_buff *skb)
-{
- const __u8 type = DCCP_SKB_CB(skb)->dccpd_type;
-
- return type == DCCP_PKT_ACK ||
- type == DCCP_PKT_CLOSE ||
- type == DCCP_PKT_CLOSEREQ ||
- type == DCCP_PKT_RESET ||
- type == DCCP_PKT_SYNC ||
- type == DCCP_PKT_SYNCACK;
-}
-
-static inline int dccp_packet_without_ack(const struct sk_buff *skb)
-{
- const __u8 type = DCCP_SKB_CB(skb)->dccpd_type;
-
- return type == DCCP_PKT_DATA || type == DCCP_PKT_REQUEST;
-}
-
-#define DCCP_MAX_SEQNO ((((u64)1) << 48) - 1)
-#define DCCP_PKT_WITHOUT_ACK_SEQ (DCCP_MAX_SEQNO << 2)
-
-static inline void dccp_set_seqno(u64 *seqno, u64 value)
-{
- if (value > DCCP_MAX_SEQNO)
- value -= DCCP_MAX_SEQNO + 1;
- *seqno = value;
-}
-
-static inline u64 dccp_delta_seqno(u64 seqno1, u64 seqno2)
-{
- return ((seqno2 << 16) - (seqno1 << 16)) >> 16;
-}
-
-static inline void dccp_inc_seqno(u64 *seqno)
-{
- if (++*seqno > DCCP_MAX_SEQNO)
- *seqno = 0;
-}
-
-static inline void dccp_hdr_set_seq(struct dccp_hdr *dh, const u64 gss)
-{
- struct dccp_hdr_ext *dhx = (struct dccp_hdr_ext *)((void *)dh +
- sizeof(*dh));
- dh->dccph_seq2 = 0;
- dh->dccph_seq = htons((gss >> 32) & 0xfffff);
- dhx->dccph_seq_low = htonl(gss & 0xffffffff);
-}
-
-static inline void dccp_hdr_set_ack(struct dccp_hdr_ack_bits *dhack,
- const u64 gsr)
-{
- dhack->dccph_reserved1 = 0;
- dhack->dccph_ack_nr_high = htons(gsr >> 32);
- dhack->dccph_ack_nr_low = htonl(gsr & 0xffffffff);
-}
-
-static inline void dccp_update_gsr(struct sock *sk, u64 seq)
-{
- struct dccp_sock *dp = dccp_sk(sk);
- const struct dccp_minisock *dmsk = dccp_msk(sk);
-
- dp->dccps_gsr = seq;
- dccp_set_seqno(&dp->dccps_swl,
- dp->dccps_gsr + 1 - (dmsk->dccpms_sequence_window / 4));
- dccp_set_seqno(&dp->dccps_swh,
- dp->dccps_gsr + (3 * dmsk->dccpms_sequence_window) / 4);
-}
-
-static inline void dccp_update_gss(struct sock *sk, u64 seq)
-{
- struct dccp_sock *dp = dccp_sk(sk);
-
- dp->dccps_awh = dp->dccps_gss = seq;
- dccp_set_seqno(&dp->dccps_awl,
- (dp->dccps_gss -
- dccp_msk(sk)->dccpms_sequence_window + 1));
-}
-
-static inline int dccp_ack_pending(const struct sock *sk)
-{
- const struct dccp_sock *dp = dccp_sk(sk);
- return dp->dccps_timestamp_echo != 0 ||
-#ifdef CONFIG_IP_DCCP_ACKVEC
- (dccp_msk(sk)->dccpms_send_ack_vector &&
- dccp_ackvec_pending(dp->dccps_hc_rx_ackvec)) ||
-#endif
- inet_csk_ack_scheduled(sk);
-}
-
-extern int dccp_insert_options(struct sock *sk, struct sk_buff *skb);
-extern int dccp_insert_option_elapsed_time(struct sock *sk,
- struct sk_buff *skb,
- u32 elapsed_time);
-extern int dccp_insert_option_timestamp(struct sock *sk,
- struct sk_buff *skb);
-extern int dccp_insert_option(struct sock *sk, struct sk_buff *skb,
- unsigned char option,
- const void *value, unsigned char len);
-
-extern void dccp_timestamp(const struct sock *sk, struct timeval *tv);
-
-static inline suseconds_t timeval_usecs(const struct timeval *tv)
-{
- return tv->tv_sec * USEC_PER_SEC + tv->tv_usec;
-}
-
-static inline suseconds_t timeval_delta(const struct timeval *large,
- const struct timeval *small)
-{
- time_t secs = large->tv_sec - small->tv_sec;
- suseconds_t usecs = large->tv_usec - small->tv_usec;
-
- if (usecs < 0) {
- secs--;
- usecs += USEC_PER_SEC;
- }
- return secs * USEC_PER_SEC + usecs;
-}
-
-static inline void timeval_add_usecs(struct timeval *tv,
- const suseconds_t usecs)
-{
- tv->tv_usec += usecs;
- while (tv->tv_usec >= USEC_PER_SEC) {
- tv->tv_sec++;
- tv->tv_usec -= USEC_PER_SEC;
- }
-}
-
-static inline void timeval_sub_usecs(struct timeval *tv,
- const suseconds_t usecs)
-{
- tv->tv_usec -= usecs;
- while (tv->tv_usec < 0) {
- tv->tv_sec--;
- tv->tv_usec += USEC_PER_SEC;
- }
-}
-
-#ifdef CONFIG_SYSCTL
-extern int dccp_sysctl_init(void);
-extern void dccp_sysctl_exit(void);
-#else
-static inline int dccp_sysctl_init(void)
-{
- return 0;
-}
-
-static inline void dccp_sysctl_exit(void)
-{
-}
-#endif
-
-#endif /* _DCCP_H */
diff --git a/net/dccp/diag.c b/net/dccp/diag.c
deleted file mode 100644
index 0f3745585a94..000000000000
--- a/net/dccp/diag.c
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * net/dccp/diag.c
- *
- * An implementation of the DCCP protocol
- * Arnaldo Carvalho de Melo <acme@mandriva.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-
-#include <linux/module.h>
-#include <linux/inet_diag.h>
-
-#include "ccid.h"
-#include "dccp.h"
-
-static void dccp_get_info(struct sock *sk, struct tcp_info *info)
-{
- struct dccp_sock *dp = dccp_sk(sk);
- const struct inet_connection_sock *icsk = inet_csk(sk);
-
- memset(info, 0, sizeof(*info));
-
- info->tcpi_state = sk->sk_state;
- info->tcpi_retransmits = icsk->icsk_retransmits;
- info->tcpi_probes = icsk->icsk_probes_out;
- info->tcpi_backoff = icsk->icsk_backoff;
- info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
-
- if (dccp_msk(sk)->dccpms_send_ack_vector)
- info->tcpi_options |= TCPI_OPT_SACK;
-
- ccid_hc_rx_get_info(dp->dccps_hc_rx_ccid, sk, info);
- ccid_hc_tx_get_info(dp->dccps_hc_tx_ccid, sk, info);
-}
-
-static void dccp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
- void *_info)
-{
- r->idiag_rqueue = r->idiag_wqueue = 0;
-
- if (_info != NULL)
- dccp_get_info(sk, _info);
-}
-
-static struct inet_diag_handler dccp_diag_handler = {
- .idiag_hashinfo = &dccp_hashinfo,
- .idiag_get_info = dccp_diag_get_info,
- .idiag_type = DCCPDIAG_GETSOCK,
- .idiag_info_size = sizeof(struct tcp_info),
-};
-
-static int __init dccp_diag_init(void)
-{
- return inet_diag_register(&dccp_diag_handler);
-}
-
-static void __exit dccp_diag_fini(void)
-{
- inet_diag_unregister(&dccp_diag_handler);
-}
-
-module_init(dccp_diag_init);
-module_exit(dccp_diag_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@mandriva.com>");
-MODULE_DESCRIPTION("DCCP inet_diag handler");
diff --git a/net/dccp/feat.c b/net/dccp/feat.c
deleted file mode 100644
index 4dc487f27a1f..000000000000
--- a/net/dccp/feat.c
+++ /dev/null
@@ -1,644 +0,0 @@
-/*
- * net/dccp/feat.c
- *
- * An implementation of the DCCP protocol
- * Andrea Bittau <a.bittau@cs.ucl.ac.uk>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <linux/module.h>
-
-#include "ccid.h"
-#include "feat.h"
-
-#define DCCP_FEAT_SP_NOAGREE (-123)
-
-int dccp_feat_change(struct dccp_minisock *dmsk, u8 type, u8 feature,
- u8 *val, u8 len, gfp_t gfp)
-{
- struct dccp_opt_pend *opt;
-
- dccp_feat_debug(type, feature, *val);
-
- if (!dccp_feat_is_valid_type(type)) {
- DCCP_WARN("option type %d invalid in negotiation\n", type);
- return 1;
- }
- if (!dccp_feat_is_valid_length(type, feature, len)) {
- DCCP_WARN("invalid length %d\n", len);
- return 1;
- }
- /* XXX add further sanity checks */
-
- /* check if that feature is already being negotiated */
- list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) {
- /* ok we found a negotiation for this option already */
- if (opt->dccpop_feat == feature && opt->dccpop_type == type) {
- dccp_pr_debug("Replacing old\n");
- /* replace */
- BUG_ON(opt->dccpop_val == NULL);
- kfree(opt->dccpop_val);
- opt->dccpop_val = val;
- opt->dccpop_len = len;
- opt->dccpop_conf = 0;
- return 0;
- }
- }
-
- /* negotiation for a new feature */
- opt = kmalloc(sizeof(*opt), gfp);
- if (opt == NULL)
- return -ENOMEM;
-
- opt->dccpop_type = type;
- opt->dccpop_feat = feature;
- opt->dccpop_len = len;
- opt->dccpop_val = val;
- opt->dccpop_conf = 0;
- opt->dccpop_sc = NULL;
-
- BUG_ON(opt->dccpop_val == NULL);
-
- list_add_tail(&opt->dccpop_node, &dmsk->dccpms_pending);
- return 0;
-}
-
-EXPORT_SYMBOL_GPL(dccp_feat_change);
-
-static int dccp_feat_update_ccid(struct sock *sk, u8 type, u8 new_ccid_nr)
-{
- struct dccp_sock *dp = dccp_sk(sk);
- struct dccp_minisock *dmsk = dccp_msk(sk);
- /* figure out if we are changing our CCID or the peer's */
- const int rx = type == DCCPO_CHANGE_R;
- const u8 ccid_nr = rx ? dmsk->dccpms_rx_ccid : dmsk->dccpms_tx_ccid;
- struct ccid *new_ccid;
-
- /* Check if nothing is being changed. */
- if (ccid_nr == new_ccid_nr)
- return 0;
-
- new_ccid = ccid_new(new_ccid_nr, sk, rx, GFP_ATOMIC);
- if (new_ccid == NULL)
- return -ENOMEM;
-
- if (rx) {
- ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk);
- dp->dccps_hc_rx_ccid = new_ccid;
- dmsk->dccpms_rx_ccid = new_ccid_nr;
- } else {
- ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk);
- dp->dccps_hc_tx_ccid = new_ccid;
- dmsk->dccpms_tx_ccid = new_ccid_nr;
- }
-
- return 0;
-}
-
-/* XXX taking only u8 vals */
-static int dccp_feat_update(struct sock *sk, u8 type, u8 feat, u8 val)
-{
- dccp_feat_debug(type, feat, val);
-
- switch (feat) {
- case DCCPF_CCID:
- return dccp_feat_update_ccid(sk, type, val);
- default:
- dccp_pr_debug("UNIMPLEMENTED: %s(%d, ...)\n",
- dccp_feat_typename(type), feat);
- break;
- }
- return 0;
-}
-
-static int dccp_feat_reconcile(struct sock *sk, struct dccp_opt_pend *opt,
- u8 *rpref, u8 rlen)
-{
- struct dccp_sock *dp = dccp_sk(sk);
- u8 *spref, slen, *res = NULL;
- int i, j, rc, agree = 1;
-
- BUG_ON(rpref == NULL);
-
- /* check if we are the black sheep */
- if (dp->dccps_role == DCCP_ROLE_CLIENT) {
- spref = rpref;
- slen = rlen;
- rpref = opt->dccpop_val;
- rlen = opt->dccpop_len;
- } else {
- spref = opt->dccpop_val;
- slen = opt->dccpop_len;
- }
- /*
- * Now we have server preference list in spref and client preference in
- * rpref
- */
- BUG_ON(spref == NULL);
- BUG_ON(rpref == NULL);
-
- /* FIXME sanity check vals */
-
- /* Are values in any order? XXX Lame "algorithm" here */
- /* XXX assume values are 1 byte */
- for (i = 0; i < slen; i++) {
- for (j = 0; j < rlen; j++) {
- if (spref[i] == rpref[j]) {
- res = &spref[i];
- break;
- }
- }
- if (res)
- break;
- }
-
- /* we didn't agree on anything */
- if (res == NULL) {
- /* confirm previous value */
- switch (opt->dccpop_feat) {
- case DCCPF_CCID:
- /* XXX did i get this right? =P */
- if (opt->dccpop_type == DCCPO_CHANGE_L)
- res = &dccp_msk(sk)->dccpms_tx_ccid;
- else
- res = &dccp_msk(sk)->dccpms_rx_ccid;
- break;
-
- default:
- DCCP_BUG("Fell through, feat=%d", opt->dccpop_feat);
- /* XXX implement res */
- return -EFAULT;
- }
-
- dccp_pr_debug("Don't agree... reconfirming %d\n", *res);
- agree = 0; /* this is used for mandatory options... */
- }
-
- /* need to put result and our preference list */
- /* XXX assume 1 byte vals */
- rlen = 1 + opt->dccpop_len;
- rpref = kmalloc(rlen, GFP_ATOMIC);
- if (rpref == NULL)
- return -ENOMEM;
-
- *rpref = *res;
- memcpy(&rpref[1], opt->dccpop_val, opt->dccpop_len);
-
- /* put it in the "confirm queue" */
- if (opt->dccpop_sc == NULL) {
- opt->dccpop_sc = kmalloc(sizeof(*opt->dccpop_sc), GFP_ATOMIC);
- if (opt->dccpop_sc == NULL) {
- kfree(rpref);
- return -ENOMEM;
- }
- } else {
- /* recycle the confirm slot */
- BUG_ON(opt->dccpop_sc->dccpoc_val == NULL);
- kfree(opt->dccpop_sc->dccpoc_val);
- dccp_pr_debug("recycling confirm slot\n");
- }
- memset(opt->dccpop_sc, 0, sizeof(*opt->dccpop_sc));
-
- opt->dccpop_sc->dccpoc_val = rpref;
- opt->dccpop_sc->dccpoc_len = rlen;
-
- /* update the option on our side [we are about to send the confirm] */
- rc = dccp_feat_update(sk, opt->dccpop_type, opt->dccpop_feat, *res);
- if (rc) {
- kfree(opt->dccpop_sc->dccpoc_val);
- kfree(opt->dccpop_sc);
- opt->dccpop_sc = NULL;
- return rc;
- }
-
- dccp_pr_debug("Will confirm %d\n", *rpref);
-
- /* say we want to change to X but we just got a confirm X, suppress our
- * change
- */
- if (!opt->dccpop_conf) {
- if (*opt->dccpop_val == *res)
- opt->dccpop_conf = 1;
- dccp_pr_debug("won't ask for change of same feature\n");
- }
-
- return agree ? 0 : DCCP_FEAT_SP_NOAGREE; /* used for mandatory opts */
-}
-
-static int dccp_feat_sp(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len)
-{
- struct dccp_minisock *dmsk = dccp_msk(sk);
- struct dccp_opt_pend *opt;
- int rc = 1;
- u8 t;
-
- /*
- * We received a CHANGE. We gotta match it against our own preference
- * list. If we got a CHANGE_R it means it's a change for us, so we need
- * to compare our CHANGE_L list.
- */
- if (type == DCCPO_CHANGE_L)
- t = DCCPO_CHANGE_R;
- else
- t = DCCPO_CHANGE_L;
-
- /* find our preference list for this feature */
- list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) {
- if (opt->dccpop_type != t || opt->dccpop_feat != feature)
- continue;
-
- /* find the winner from the two preference lists */
- rc = dccp_feat_reconcile(sk, opt, val, len);
- break;
- }
-
- /* We didn't deal with the change. This can happen if we have no
- * preference list for the feature. In fact, it just shouldn't
- * happen---if we understand a feature, we should have a preference list
- * with at least the default value.
- */
- BUG_ON(rc == 1);
-
- return rc;
-}
-
-static int dccp_feat_nn(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len)
-{
- struct dccp_opt_pend *opt;
- struct dccp_minisock *dmsk = dccp_msk(sk);
- u8 *copy;
- int rc;
-
- /* NN features must be Change L (sec. 6.3.2) */
- if (type != DCCPO_CHANGE_L) {
- dccp_pr_debug("received %s for NN feature %d\n",
- dccp_feat_typename(type), feature);
- return -EFAULT;
- }
-
- /* XXX sanity check opt val */
-
- /* copy option so we can confirm it */
- opt = kzalloc(sizeof(*opt), GFP_ATOMIC);
- if (opt == NULL)
- return -ENOMEM;
-
- copy = kmemdup(val, len, GFP_ATOMIC);
- if (copy == NULL) {
- kfree(opt);
- return -ENOMEM;
- }
-
- opt->dccpop_type = DCCPO_CONFIRM_R; /* NN can only confirm R */
- opt->dccpop_feat = feature;
- opt->dccpop_val = copy;
- opt->dccpop_len = len;
-
- /* change feature */
- rc = dccp_feat_update(sk, type, feature, *val);
- if (rc) {
- kfree(opt->dccpop_val);
- kfree(opt);
- return rc;
- }
-
- dccp_feat_debug(type, feature, *copy);
-
- list_add_tail(&opt->dccpop_node, &dmsk->dccpms_conf);
-
- return 0;
-}
-
-static void dccp_feat_empty_confirm(struct dccp_minisock *dmsk,
- u8 type, u8 feature)
-{
- /* XXX check if other confirms for that are queued and recycle slot */
- struct dccp_opt_pend *opt = kzalloc(sizeof(*opt), GFP_ATOMIC);
-
- if (opt == NULL) {
- /* XXX what do we do? Ignoring should be fine. It's a change
- * after all =P
- */
- return;
- }
-
- switch (type) {
- case DCCPO_CHANGE_L: opt->dccpop_type = DCCPO_CONFIRM_R; break;
- case DCCPO_CHANGE_R: opt->dccpop_type = DCCPO_CONFIRM_L; break;
- default: DCCP_WARN("invalid type %d\n", type); return;
-
- }
- opt->dccpop_feat = feature;
- opt->dccpop_val = NULL;
- opt->dccpop_len = 0;
-
- /* change feature */
- dccp_pr_debug("Empty %s(%d)\n", dccp_feat_typename(type), feature);
-
- list_add_tail(&opt->dccpop_node, &dmsk->dccpms_conf);
-}
-
-static void dccp_feat_flush_confirm(struct sock *sk)
-{
- struct dccp_minisock *dmsk = dccp_msk(sk);
- /* Check if there is anything to confirm in the first place */
- int yes = !list_empty(&dmsk->dccpms_conf);
-
- if (!yes) {
- struct dccp_opt_pend *opt;
-
- list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) {
- if (opt->dccpop_conf) {
- yes = 1;
- break;
- }
- }
- }
-
- if (!yes)
- return;
-
- /* OK there is something to confirm... */
- /* XXX check if packet is in flight? Send delayed ack?? */
- if (sk->sk_state == DCCP_OPEN)
- dccp_send_ack(sk);
-}
-
-int dccp_feat_change_recv(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len)
-{
- int rc;
-
- dccp_feat_debug(type, feature, *val);
-
- /* figure out if it's SP or NN feature */
- switch (feature) {
- /* deal with SP features */
- case DCCPF_CCID:
- rc = dccp_feat_sp(sk, type, feature, val, len);
- break;
-
- /* deal with NN features */
- case DCCPF_ACK_RATIO:
- rc = dccp_feat_nn(sk, type, feature, val, len);
- break;
-
- /* XXX implement other features */
- default:
- dccp_pr_debug("UNIMPLEMENTED: not handling %s(%d, ...)\n",
- dccp_feat_typename(type), feature);
- rc = -EFAULT;
- break;
- }
-
- /* check if there were problems changing features */
- if (rc) {
- /* If we don't agree on SP, we sent a confirm for old value.
- * However we propagate rc to caller in case option was
- * mandatory
- */
- if (rc != DCCP_FEAT_SP_NOAGREE)
- dccp_feat_empty_confirm(dccp_msk(sk), type, feature);
- }
-
- /* generate the confirm [if required] */
- dccp_feat_flush_confirm(sk);
-
- return rc;
-}
-
-EXPORT_SYMBOL_GPL(dccp_feat_change_recv);
-
-int dccp_feat_confirm_recv(struct sock *sk, u8 type, u8 feature,
- u8 *val, u8 len)
-{
- u8 t;
- struct dccp_opt_pend *opt;
- struct dccp_minisock *dmsk = dccp_msk(sk);
- int found = 0;
- int all_confirmed = 1;
-
- dccp_feat_debug(type, feature, *val);
-
- /* locate our change request */
- switch (type) {
- case DCCPO_CONFIRM_L: t = DCCPO_CHANGE_R; break;
- case DCCPO_CONFIRM_R: t = DCCPO_CHANGE_L; break;
- default: DCCP_WARN("invalid type %d\n", type);
- return 1;
-
- }
- /* XXX sanity check feature value */
-
- list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) {
- if (!opt->dccpop_conf && opt->dccpop_type == t &&
- opt->dccpop_feat == feature) {
- found = 1;
- dccp_pr_debug("feature %d found\n", opt->dccpop_feat);
-
- /* XXX do sanity check */
-
- opt->dccpop_conf = 1;
-
- /* We got a confirmation---change the option */
- dccp_feat_update(sk, opt->dccpop_type,
- opt->dccpop_feat, *val);
-
- /* XXX check the return value of dccp_feat_update */
- break;
- }
-
- if (!opt->dccpop_conf)
- all_confirmed = 0;
- }
-
- /* fix re-transmit timer */
- /* XXX gotta make sure that no option negotiation occurs during
- * connection shutdown. Consider that the CLOSEREQ is sent and timer is
- * on. if all options are confirmed it might kill timer which should
- * remain alive until close is received.
- */
- if (all_confirmed) {
- dccp_pr_debug("clear feat negotiation timer %p\n", sk);
- inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
- }
-
- if (!found)
- dccp_pr_debug("%s(%d, ...) never requested\n",
- dccp_feat_typename(type), feature);
- return 0;
-}
-
-EXPORT_SYMBOL_GPL(dccp_feat_confirm_recv);
-
-void dccp_feat_clean(struct dccp_minisock *dmsk)
-{
- struct dccp_opt_pend *opt, *next;
-
- list_for_each_entry_safe(opt, next, &dmsk->dccpms_pending,
- dccpop_node) {
- BUG_ON(opt->dccpop_val == NULL);
- kfree(opt->dccpop_val);
-
- if (opt->dccpop_sc != NULL) {
- BUG_ON(opt->dccpop_sc->dccpoc_val == NULL);
- kfree(opt->dccpop_sc->dccpoc_val);
- kfree(opt->dccpop_sc);
- }
-
- kfree(opt);
- }
- INIT_LIST_HEAD(&dmsk->dccpms_pending);
-
- list_for_each_entry_safe(opt, next, &dmsk->dccpms_conf, dccpop_node) {
- BUG_ON(opt == NULL);
- if (opt->dccpop_val != NULL)
- kfree(opt->dccpop_val);
- kfree(opt);
- }
- INIT_LIST_HEAD(&dmsk->dccpms_conf);
-}
-
-EXPORT_SYMBOL_GPL(dccp_feat_clean);
-
-/* this is to be called only when a listening sock creates its child. It is
- * assumed by the function---the confirm is not duplicated, but rather it is
- * "passed on".
- */
-int dccp_feat_clone(struct sock *oldsk, struct sock *newsk)
-{
- struct dccp_minisock *olddmsk = dccp_msk(oldsk);
- struct dccp_minisock *newdmsk = dccp_msk(newsk);
- struct dccp_opt_pend *opt;
- int rc = 0;
-
- INIT_LIST_HEAD(&newdmsk->dccpms_pending);
- INIT_LIST_HEAD(&newdmsk->dccpms_conf);
-
- list_for_each_entry(opt, &olddmsk->dccpms_pending, dccpop_node) {
- struct dccp_opt_pend *newopt;
- /* copy the value of the option */
- u8 *val = kmemdup(opt->dccpop_val, opt->dccpop_len, GFP_ATOMIC);
-
- if (val == NULL)
- goto out_clean;
-
- newopt = kmemdup(opt, sizeof(*newopt), GFP_ATOMIC);
- if (newopt == NULL) {
- kfree(val);
- goto out_clean;
- }
-
- /* insert the option */
- newopt->dccpop_val = val;
- list_add_tail(&newopt->dccpop_node, &newdmsk->dccpms_pending);
-
- /* XXX what happens with backlogs and multiple connections at
- * once...
- */
- /* the master socket no longer needs to worry about confirms */
- opt->dccpop_sc = NULL; /* it's not a memleak---new socket has it */
-
- /* reset state for a new socket */
- opt->dccpop_conf = 0;
- }
-
- /* XXX not doing anything about the conf queue */
-
-out:
- return rc;
-
-out_clean:
- dccp_feat_clean(newdmsk);
- rc = -ENOMEM;
- goto out;
-}
-
-EXPORT_SYMBOL_GPL(dccp_feat_clone);
-
-static int __dccp_feat_init(struct dccp_minisock *dmsk, u8 type, u8 feat,
- u8 *val, u8 len)
-{
- int rc = -ENOMEM;
- u8 *copy = kmemdup(val, len, GFP_KERNEL);
-
- if (copy != NULL) {
- rc = dccp_feat_change(dmsk, type, feat, copy, len, GFP_KERNEL);
- if (rc)
- kfree(copy);
- }
- return rc;
-}
-
-int dccp_feat_init(struct dccp_minisock *dmsk)
-{
- int rc;
-
- INIT_LIST_HEAD(&dmsk->dccpms_pending);
- INIT_LIST_HEAD(&dmsk->dccpms_conf);
-
- /* CCID L */
- rc = __dccp_feat_init(dmsk, DCCPO_CHANGE_L, DCCPF_CCID,
- &dmsk->dccpms_tx_ccid, 1);
- if (rc)
- goto out;
-
- /* CCID R */
- rc = __dccp_feat_init(dmsk, DCCPO_CHANGE_R, DCCPF_CCID,
- &dmsk->dccpms_rx_ccid, 1);
- if (rc)
- goto out;
-
- /* Ack ratio */
- rc = __dccp_feat_init(dmsk, DCCPO_CHANGE_L, DCCPF_ACK_RATIO,
- &dmsk->dccpms_ack_ratio, 1);
-out:
- return rc;
-}
-
-EXPORT_SYMBOL_GPL(dccp_feat_init);
-
-#ifdef CONFIG_IP_DCCP_DEBUG
-const char *dccp_feat_typename(const u8 type)
-{
- switch(type) {
- case DCCPO_CHANGE_L: return("ChangeL");
- case DCCPO_CONFIRM_L: return("ConfirmL");
- case DCCPO_CHANGE_R: return("ChangeR");
- case DCCPO_CONFIRM_R: return("ConfirmR");
- /* the following case must not appear in feature negotation */
- default: dccp_pr_debug("unknown type %d [BUG!]\n", type);
- }
- return NULL;
-}
-
-EXPORT_SYMBOL_GPL(dccp_feat_typename);
-
-const char *dccp_feat_name(const u8 feat)
-{
- static const char *feature_names[] = {
- [DCCPF_RESERVED] = "Reserved",
- [DCCPF_CCID] = "CCID",
- [DCCPF_SHORT_SEQNOS] = "Allow Short Seqnos",
- [DCCPF_SEQUENCE_WINDOW] = "Sequence Window",
- [DCCPF_ECN_INCAPABLE] = "ECN Incapable",
- [DCCPF_ACK_RATIO] = "Ack Ratio",
- [DCCPF_SEND_ACK_VECTOR] = "Send ACK Vector",
- [DCCPF_SEND_NDP_COUNT] = "Send NDP Count",
- [DCCPF_MIN_CSUM_COVER] = "Min. Csum Coverage",
- [DCCPF_DATA_CHECKSUM] = "Send Data Checksum",
- };
- if (feat >= DCCPF_MIN_CCID_SPECIFIC)
- return "CCID-specific";
-
- if (dccp_feat_is_reserved(feat))
- return feature_names[DCCPF_RESERVED];
-
- return feature_names[feat];
-}
-
-EXPORT_SYMBOL_GPL(dccp_feat_name);
-#endif /* CONFIG_IP_DCCP_DEBUG */
diff --git a/net/dccp/feat.h b/net/dccp/feat.h
deleted file mode 100644
index 2c373ad7edcf..000000000000
--- a/net/dccp/feat.h
+++ /dev/null
@@ -1,66 +0,0 @@
-#ifndef _DCCP_FEAT_H
-#define _DCCP_FEAT_H
-/*
- * net/dccp/feat.h
- *
- * An implementation of the DCCP protocol
- * Copyright (c) 2005 Andrea Bittau <a.bittau@cs.ucl.ac.uk>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/types.h>
-#include "dccp.h"
-
-static inline int dccp_feat_is_valid_length(u8 type, u8 feature, u8 len)
-{
- /* sec. 6.1: Confirm has at least length 3,
- * sec. 6.2: Change has at least length 4 */
- if (len < 3)
- return 1;
- if (len < 4 && (type == DCCPO_CHANGE_L || type == DCCPO_CHANGE_R))
- return 1;
- /* XXX: add per-feature length validation (sec. 6.6.8) */
- return 0;
-}
-
-static inline int dccp_feat_is_reserved(const u8 feat)
-{
- return (feat > DCCPF_DATA_CHECKSUM &&
- feat < DCCPF_MIN_CCID_SPECIFIC) ||
- feat == DCCPF_RESERVED;
-}
-
-/* feature negotiation knows only these four option types (RFC 4340, sec. 6) */
-static inline int dccp_feat_is_valid_type(const u8 optnum)
-{
- return optnum >= DCCPO_CHANGE_L && optnum <= DCCPO_CONFIRM_R;
-
-}
-
-#ifdef CONFIG_IP_DCCP_DEBUG
-extern const char *dccp_feat_typename(const u8 type);
-extern const char *dccp_feat_name(const u8 feat);
-
-static inline void dccp_feat_debug(const u8 type, const u8 feat, const u8 val)
-{
- dccp_pr_debug("%s(%s (%d), %d)\n", dccp_feat_typename(type),
- dccp_feat_name(feat), feat, val);
-}
-#else
-#define dccp_feat_debug(type, feat, val)
-#endif /* CONFIG_IP_DCCP_DEBUG */
-
-extern int dccp_feat_change(struct dccp_minisock *dmsk, u8 type, u8 feature,
- u8 *val, u8 len, gfp_t gfp);
-extern int dccp_feat_change_recv(struct sock *sk, u8 type, u8 feature,
- u8 *val, u8 len);
-extern int dccp_feat_confirm_recv(struct sock *sk, u8 type, u8 feature,
- u8 *val, u8 len);
-extern void dccp_feat_clean(struct dccp_minisock *dmsk);
-extern int dccp_feat_clone(struct sock *oldsk, struct sock *newsk);
-extern int dccp_feat_init(struct dccp_minisock *dmsk);
-
-#endif /* _DCCP_FEAT_H */
diff --git a/net/dccp/input.c b/net/dccp/input.c
deleted file mode 100644
index 7371a2f3acf4..000000000000
--- a/net/dccp/input.c
+++ /dev/null
@@ -1,577 +0,0 @@
-/*
- * net/dccp/input.c
- *
- * An implementation of the DCCP protocol
- * Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <linux/dccp.h>
-#include <linux/skbuff.h>
-
-#include <net/sock.h>
-
-#include "ackvec.h"
-#include "ccid.h"
-#include "dccp.h"
-
-static void dccp_fin(struct sock *sk, struct sk_buff *skb)
-{
- sk->sk_shutdown |= RCV_SHUTDOWN;
- sock_set_flag(sk, SOCK_DONE);
- __skb_pull(skb, dccp_hdr(skb)->dccph_doff * 4);
- __skb_queue_tail(&sk->sk_receive_queue, skb);
- skb_set_owner_r(skb, sk);
- sk->sk_data_ready(sk, 0);
-}
-
-static void dccp_rcv_close(struct sock *sk, struct sk_buff *skb)
-{
- dccp_send_reset(sk, DCCP_RESET_CODE_CLOSED);
- dccp_fin(sk, skb);
- dccp_set_state(sk, DCCP_CLOSED);
- sk_wake_async(sk, 1, POLL_HUP);
-}
-
-static void dccp_rcv_closereq(struct sock *sk, struct sk_buff *skb)
-{
- /*
- * Step 7: Check for unexpected packet types
- * If (S.is_server and P.type == CloseReq)
- * Send Sync packet acknowledging P.seqno
- * Drop packet and return
- */
- if (dccp_sk(sk)->dccps_role != DCCP_ROLE_CLIENT) {
- dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq, DCCP_PKT_SYNC);
- return;
- }
-
- if (sk->sk_state != DCCP_CLOSING)
- dccp_set_state(sk, DCCP_CLOSING);
- dccp_send_close(sk, 0);
-}
-
-static void dccp_event_ack_recv(struct sock *sk, struct sk_buff *skb)
-{
- struct dccp_sock *dp = dccp_sk(sk);
-
- if (dccp_msk(sk)->dccpms_send_ack_vector)
- dccp_ackvec_check_rcv_ackno(dp->dccps_hc_rx_ackvec, sk,
- DCCP_SKB_CB(skb)->dccpd_ack_seq);
-}
-
-static int dccp_check_seqno(struct sock *sk, struct sk_buff *skb)
-{
- const struct dccp_hdr *dh = dccp_hdr(skb);
- struct dccp_sock *dp = dccp_sk(sk);
- u64 lswl, lawl;
-
- /*
- * Step 5: Prepare sequence numbers for Sync
- * If P.type == Sync or P.type == SyncAck,
- * If S.AWL <= P.ackno <= S.AWH and P.seqno >= S.SWL,
- * / * P is valid, so update sequence number variables
- * accordingly. After this update, P will pass the tests
- * in Step 6. A SyncAck is generated if necessary in
- * Step 15 * /
- * Update S.GSR, S.SWL, S.SWH
- * Otherwise,
- * Drop packet and return
- */
- if (dh->dccph_type == DCCP_PKT_SYNC ||
- dh->dccph_type == DCCP_PKT_SYNCACK) {
- if (between48(DCCP_SKB_CB(skb)->dccpd_ack_seq,
- dp->dccps_awl, dp->dccps_awh) &&
- !before48(DCCP_SKB_CB(skb)->dccpd_seq, dp->dccps_swl))
- dccp_update_gsr(sk, DCCP_SKB_CB(skb)->dccpd_seq);
- else
- return -1;
- }
-
- /*
- * Step 6: Check sequence numbers
- * Let LSWL = S.SWL and LAWL = S.AWL
- * If P.type == CloseReq or P.type == Close or P.type == Reset,
- * LSWL := S.GSR + 1, LAWL := S.GAR
- * If LSWL <= P.seqno <= S.SWH
- * and (P.ackno does not exist or LAWL <= P.ackno <= S.AWH),
- * Update S.GSR, S.SWL, S.SWH
- * If P.type != Sync,
- * Update S.GAR
- * Otherwise,
- * Send Sync packet acknowledging P.seqno
- * Drop packet and return
- */
- lswl = dp->dccps_swl;
- lawl = dp->dccps_awl;
-
- if (dh->dccph_type == DCCP_PKT_CLOSEREQ ||
- dh->dccph_type == DCCP_PKT_CLOSE ||
- dh->dccph_type == DCCP_PKT_RESET) {
- lswl = dp->dccps_gsr;
- dccp_inc_seqno(&lswl);
- lawl = dp->dccps_gar;
- }
-
- if (between48(DCCP_SKB_CB(skb)->dccpd_seq, lswl, dp->dccps_swh) &&
- (DCCP_SKB_CB(skb)->dccpd_ack_seq == DCCP_PKT_WITHOUT_ACK_SEQ ||
- between48(DCCP_SKB_CB(skb)->dccpd_ack_seq,
- lawl, dp->dccps_awh))) {
- dccp_update_gsr(sk, DCCP_SKB_CB(skb)->dccpd_seq);
-
- if (dh->dccph_type != DCCP_PKT_SYNC &&
- (DCCP_SKB_CB(skb)->dccpd_ack_seq !=
- DCCP_PKT_WITHOUT_ACK_SEQ))
- dp->dccps_gar = DCCP_SKB_CB(skb)->dccpd_ack_seq;
- } else {
- DCCP_WARN("DCCP: Step 6 failed for %s packet, "
- "(LSWL(%llu) <= P.seqno(%llu) <= S.SWH(%llu)) and "
- "(P.ackno %s or LAWL(%llu) <= P.ackno(%llu) <= S.AWH(%llu), "
- "sending SYNC...\n", dccp_packet_name(dh->dccph_type),
- (unsigned long long) lswl,
- (unsigned long long) DCCP_SKB_CB(skb)->dccpd_seq,
- (unsigned long long) dp->dccps_swh,
- (DCCP_SKB_CB(skb)->dccpd_ack_seq ==
- DCCP_PKT_WITHOUT_ACK_SEQ) ? "doesn't exist" : "exists",
- (unsigned long long) lawl,
- (unsigned long long) DCCP_SKB_CB(skb)->dccpd_ack_seq,
- (unsigned long long) dp->dccps_awh);
- dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq, DCCP_PKT_SYNC);
- return -1;
- }
-
- return 0;
-}
-
-static int __dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
- const struct dccp_hdr *dh, const unsigned len)
-{
- struct dccp_sock *dp = dccp_sk(sk);
-
- switch (dccp_hdr(skb)->dccph_type) {
- case DCCP_PKT_DATAACK:
- case DCCP_PKT_DATA:
- /*
- * FIXME: check if sk_receive_queue is full, schedule DATA_DROPPED
- * option if it is.
- */
- __skb_pull(skb, dh->dccph_doff * 4);
- __skb_queue_tail(&sk->sk_receive_queue, skb);
- skb_set_owner_r(skb, sk);
- sk->sk_data_ready(sk, 0);
- return 0;
- case DCCP_PKT_ACK:
- goto discard;
- case DCCP_PKT_RESET:
- /*
- * Step 9: Process Reset
- * If P.type == Reset,
- * Tear down connection
- * S.state := TIMEWAIT
- * Set TIMEWAIT timer
- * Drop packet and return
- */
- dccp_fin(sk, skb);
- dccp_time_wait(sk, DCCP_TIME_WAIT, 0);
- return 0;
- case DCCP_PKT_CLOSEREQ:
- dccp_rcv_closereq(sk, skb);
- goto discard;
- case DCCP_PKT_CLOSE:
- dccp_rcv_close(sk, skb);
- return 0;
- case DCCP_PKT_REQUEST:
- /* Step 7
- * or (S.is_server and P.type == Response)
- * or (S.is_client and P.type == Request)
- * or (S.state >= OPEN and P.type == Request
- * and P.seqno >= S.OSR)
- * or (S.state >= OPEN and P.type == Response
- * and P.seqno >= S.OSR)
- * or (S.state == RESPOND and P.type == Data),
- * Send Sync packet acknowledging P.seqno
- * Drop packet and return
- */
- if (dp->dccps_role != DCCP_ROLE_LISTEN)
- goto send_sync;
- goto check_seq;
- case DCCP_PKT_RESPONSE:
- if (dp->dccps_role != DCCP_ROLE_CLIENT)
- goto send_sync;
-check_seq:
- if (!before48(DCCP_SKB_CB(skb)->dccpd_seq, dp->dccps_osr)) {
-send_sync:
- dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq,
- DCCP_PKT_SYNC);
- }
- break;
- case DCCP_PKT_SYNC:
- dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq,
- DCCP_PKT_SYNCACK);
- /*
- * From RFC 4340, sec. 5.7
- *
- * As with DCCP-Ack packets, DCCP-Sync and DCCP-SyncAck packets
- * MAY have non-zero-length application data areas, whose
- * contents receivers MUST ignore.
- */
- goto discard;
- }
-
- DCCP_INC_STATS_BH(DCCP_MIB_INERRS);
-discard:
- __kfree_skb(skb);
- return 0;
-}
-
-int dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
- const struct dccp_hdr *dh, const unsigned len)
-{
- struct dccp_sock *dp = dccp_sk(sk);
-
- if (dccp_check_seqno(sk, skb))
- goto discard;
-
- if (dccp_parse_options(sk, skb))
- goto discard;
-
- if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
- dccp_event_ack_recv(sk, skb);
-
- if (dccp_msk(sk)->dccpms_send_ack_vector &&
- dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk,
- DCCP_SKB_CB(skb)->dccpd_seq,
- DCCP_ACKVEC_STATE_RECEIVED))
- goto discard;
-
- ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb);
- ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb);
-
- return __dccp_rcv_established(sk, skb, dh, len);
-discard:
- __kfree_skb(skb);
- return 0;
-}
-
-EXPORT_SYMBOL_GPL(dccp_rcv_established);
-
-static int dccp_rcv_request_sent_state_process(struct sock *sk,
- struct sk_buff *skb,
- const struct dccp_hdr *dh,
- const unsigned len)
-{
- /*
- * Step 4: Prepare sequence numbers in REQUEST
- * If S.state == REQUEST,
- * If (P.type == Response or P.type == Reset)
- * and S.AWL <= P.ackno <= S.AWH,
- * / * Set sequence number variables corresponding to the
- * other endpoint, so P will pass the tests in Step 6 * /
- * Set S.GSR, S.ISR, S.SWL, S.SWH
- * / * Response processing continues in Step 10; Reset
- * processing continues in Step 9 * /
- */
- if (dh->dccph_type == DCCP_PKT_RESPONSE) {
- const struct inet_connection_sock *icsk = inet_csk(sk);
- struct dccp_sock *dp = dccp_sk(sk);
-
- /* Stop the REQUEST timer */
- inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
- BUG_TRAP(sk->sk_send_head != NULL);
- __kfree_skb(sk->sk_send_head);
- sk->sk_send_head = NULL;
-
- if (!between48(DCCP_SKB_CB(skb)->dccpd_ack_seq,
- dp->dccps_awl, dp->dccps_awh)) {
- dccp_pr_debug("invalid ackno: S.AWL=%llu, "
- "P.ackno=%llu, S.AWH=%llu \n",
- (unsigned long long)dp->dccps_awl,
- (unsigned long long)DCCP_SKB_CB(skb)->dccpd_ack_seq,
- (unsigned long long)dp->dccps_awh);
- goto out_invalid_packet;
- }
-
- if (dccp_parse_options(sk, skb))
- goto out_invalid_packet;
-
- if (dccp_msk(sk)->dccpms_send_ack_vector &&
- dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk,
- DCCP_SKB_CB(skb)->dccpd_seq,
- DCCP_ACKVEC_STATE_RECEIVED))
- goto out_invalid_packet; /* FIXME: change error code */
-
- dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq;
- dccp_update_gsr(sk, dp->dccps_isr);
- /*
- * SWL and AWL are initially adjusted so that they are not less than
- * the initial Sequence Numbers received and sent, respectively:
- * SWL := max(GSR + 1 - floor(W/4), ISR),
- * AWL := max(GSS - W' + 1, ISS).
- * These adjustments MUST be applied only at the beginning of the
- * connection.
- *
- * AWL was adjusted in dccp_v4_connect -acme
- */
- dccp_set_seqno(&dp->dccps_swl,
- max48(dp->dccps_swl, dp->dccps_isr));
-
- dccp_sync_mss(sk, icsk->icsk_pmtu_cookie);
-
- /*
- * Step 10: Process REQUEST state (second part)
- * If S.state == REQUEST,
- * / * If we get here, P is a valid Response from the
- * server (see Step 4), and we should move to
- * PARTOPEN state. PARTOPEN means send an Ack,
- * don't send Data packets, retransmit Acks
- * periodically, and always include any Init Cookie
- * from the Response * /
- * S.state := PARTOPEN
- * Set PARTOPEN timer
- * Continue with S.state == PARTOPEN
- * / * Step 12 will send the Ack completing the
- * three-way handshake * /
- */
- dccp_set_state(sk, DCCP_PARTOPEN);
-
- /* Make sure socket is routed, for correct metrics. */
- icsk->icsk_af_ops->rebuild_header(sk);
-
- if (!sock_flag(sk, SOCK_DEAD)) {
- sk->sk_state_change(sk);
- sk_wake_async(sk, 0, POLL_OUT);
- }
-
- if (sk->sk_write_pending || icsk->icsk_ack.pingpong ||
- icsk->icsk_accept_queue.rskq_defer_accept) {
- /* Save one ACK. Data will be ready after
- * several ticks, if write_pending is set.
- *
- * It may be deleted, but with this feature tcpdumps
- * look so _wonderfully_ clever, that I was not able
- * to stand against the temptation 8) --ANK
- */
- /*
- * OK, in DCCP we can as well do a similar trick, its
- * even in the draft, but there is no need for us to
- * schedule an ack here, as dccp_sendmsg does this for
- * us, also stated in the draft. -acme
- */
- __kfree_skb(skb);
- return 0;
- }
- dccp_send_ack(sk);
- return -1;
- }
-
-out_invalid_packet:
- /* dccp_v4_do_rcv will send a reset */
- DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_PACKET_ERROR;
- return 1;
-}
-
-static int dccp_rcv_respond_partopen_state_process(struct sock *sk,
- struct sk_buff *skb,
- const struct dccp_hdr *dh,
- const unsigned len)
-{
- int queued = 0;
-
- switch (dh->dccph_type) {
- case DCCP_PKT_RESET:
- inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
- break;
- case DCCP_PKT_DATA:
- if (sk->sk_state == DCCP_RESPOND)
- break;
- case DCCP_PKT_DATAACK:
- case DCCP_PKT_ACK:
- /*
- * FIXME: we should be reseting the PARTOPEN (DELACK) timer
- * here but only if we haven't used the DELACK timer for
- * something else, like sending a delayed ack for a TIMESTAMP
- * echo, etc, for now were not clearing it, sending an extra
- * ACK when there is nothing else to do in DELACK is not a big
- * deal after all.
- */
-
- /* Stop the PARTOPEN timer */
- if (sk->sk_state == DCCP_PARTOPEN)
- inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
-
- dccp_sk(sk)->dccps_osr = DCCP_SKB_CB(skb)->dccpd_seq;
- dccp_set_state(sk, DCCP_OPEN);
-
- if (dh->dccph_type == DCCP_PKT_DATAACK ||
- dh->dccph_type == DCCP_PKT_DATA) {
- __dccp_rcv_established(sk, skb, dh, len);
- queued = 1; /* packet was queued
- (by __dccp_rcv_established) */
- }
- break;
- }
-
- return queued;
-}
-
-int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
- struct dccp_hdr *dh, unsigned len)
-{
- struct dccp_sock *dp = dccp_sk(sk);
- struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
- const int old_state = sk->sk_state;
- int queued = 0;
-
- /*
- * Step 3: Process LISTEN state
- *
- * If S.state == LISTEN,
- * If P.type == Request or P contains a valid Init Cookie option,
- * (* Must scan the packet's options to check for Init
- * Cookies. Only Init Cookies are processed here,
- * however; other options are processed in Step 8. This
- * scan need only be performed if the endpoint uses Init
- * Cookies *)
- * (* Generate a new socket and switch to that socket *)
- * Set S := new socket for this port pair
- * S.state = RESPOND
- * Choose S.ISS (initial seqno) or set from Init Cookies
- * Initialize S.GAR := S.ISS
- * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init
- * Cookies Continue with S.state == RESPOND
- * (* A Response packet will be generated in Step 11 *)
- * Otherwise,
- * Generate Reset(No Connection) unless P.type == Reset
- * Drop packet and return
- */
- if (sk->sk_state == DCCP_LISTEN) {
- if (dh->dccph_type == DCCP_PKT_REQUEST) {
- if (inet_csk(sk)->icsk_af_ops->conn_request(sk,
- skb) < 0)
- return 1;
-
- /* FIXME: do congestion control initialization */
- goto discard;
- }
- if (dh->dccph_type == DCCP_PKT_RESET)
- goto discard;
-
- /* Caller (dccp_v4_do_rcv) will send Reset */
- dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION;
- return 1;
- }
-
- if (sk->sk_state != DCCP_REQUESTING) {
- if (dccp_check_seqno(sk, skb))
- goto discard;
-
- /*
- * Step 8: Process options and mark acknowledgeable
- */
- if (dccp_parse_options(sk, skb))
- goto discard;
-
- if (dcb->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
- dccp_event_ack_recv(sk, skb);
-
- if (dccp_msk(sk)->dccpms_send_ack_vector &&
- dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk,
- DCCP_SKB_CB(skb)->dccpd_seq,
- DCCP_ACKVEC_STATE_RECEIVED))
- goto discard;
-
- ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb);
- ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb);
- }
-
- /*
- * Step 9: Process Reset
- * If P.type == Reset,
- * Tear down connection
- * S.state := TIMEWAIT
- * Set TIMEWAIT timer
- * Drop packet and return
- */
- if (dh->dccph_type == DCCP_PKT_RESET) {
- /*
- * Queue the equivalent of TCP fin so that dccp_recvmsg
- * exits the loop
- */
- dccp_fin(sk, skb);
- dccp_time_wait(sk, DCCP_TIME_WAIT, 0);
- return 0;
- /*
- * Step 7: Check for unexpected packet types
- * If (S.is_server and P.type == CloseReq)
- * or (S.is_server and P.type == Response)
- * or (S.is_client and P.type == Request)
- * or (S.state == RESPOND and P.type == Data),
- * Send Sync packet acknowledging P.seqno
- * Drop packet and return
- */
- } else if ((dp->dccps_role != DCCP_ROLE_CLIENT &&
- (dh->dccph_type == DCCP_PKT_RESPONSE ||
- dh->dccph_type == DCCP_PKT_CLOSEREQ)) ||
- (dp->dccps_role == DCCP_ROLE_CLIENT &&
- dh->dccph_type == DCCP_PKT_REQUEST) ||
- (sk->sk_state == DCCP_RESPOND &&
- dh->dccph_type == DCCP_PKT_DATA)) {
- dccp_send_sync(sk, dcb->dccpd_seq, DCCP_PKT_SYNC);
- goto discard;
- } else if (dh->dccph_type == DCCP_PKT_CLOSEREQ) {
- dccp_rcv_closereq(sk, skb);
- goto discard;
- } else if (dh->dccph_type == DCCP_PKT_CLOSE) {
- dccp_rcv_close(sk, skb);
- return 0;
- }
-
- if (unlikely(dh->dccph_type == DCCP_PKT_SYNC)) {
- dccp_send_sync(sk, dcb->dccpd_seq, DCCP_PKT_SYNCACK);
- goto discard;
- }
-
- switch (sk->sk_state) {
- case DCCP_CLOSED:
- dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION;
- return 1;
-
- case DCCP_REQUESTING:
- /* FIXME: do congestion control initialization */
-
- queued = dccp_rcv_request_sent_state_process(sk, skb, dh, len);
- if (queued >= 0)
- return queued;
-
- __kfree_skb(skb);
- return 0;
-
- case DCCP_RESPOND:
- case DCCP_PARTOPEN:
- queued = dccp_rcv_respond_partopen_state_process(sk, skb,
- dh, len);
- break;
- }
-
- if (dh->dccph_type == DCCP_PKT_ACK ||
- dh->dccph_type == DCCP_PKT_DATAACK) {
- switch (old_state) {
- case DCCP_PARTOPEN:
- sk->sk_state_change(sk);
- sk_wake_async(sk, 0, POLL_OUT);
- break;
- }
- }
-
- if (!queued) {
-discard:
- __kfree_skb(skb);
- }
- return 0;
-}
-
-EXPORT_SYMBOL_GPL(dccp_rcv_state_process);
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
deleted file mode 100644
index ff81679c9f17..000000000000
--- a/net/dccp/ipv4.c
+++ /dev/null
@@ -1,1076 +0,0 @@
-/*
- * net/dccp/ipv4.c
- *
- * An implementation of the DCCP protocol
- * Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <linux/dccp.h>
-#include <linux/icmp.h>
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/random.h>
-
-#include <net/icmp.h>
-#include <net/inet_common.h>
-#include <net/inet_hashtables.h>
-#include <net/inet_sock.h>
-#include <net/protocol.h>
-#include <net/sock.h>
-#include <net/timewait_sock.h>
-#include <net/tcp_states.h>
-#include <net/xfrm.h>
-
-#include "ackvec.h"
-#include "ccid.h"
-#include "dccp.h"
-#include "feat.h"
-
-/*
- * This is the global socket data structure used for responding to
- * the Out-of-the-blue (OOTB) packets. A control sock will be created
- * for this socket at the initialization time.
- */
-static struct socket *dccp_v4_ctl_socket;
-
-static int dccp_v4_get_port(struct sock *sk, const unsigned short snum)
-{
- return inet_csk_get_port(&dccp_hashinfo, sk, snum,
- inet_csk_bind_conflict);
-}
-
-int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
-{
- struct inet_sock *inet = inet_sk(sk);
- struct dccp_sock *dp = dccp_sk(sk);
- const struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
- struct rtable *rt;
- __be32 daddr, nexthop;
- int tmp;
- int err;
-
- dp->dccps_role = DCCP_ROLE_CLIENT;
-
- if (addr_len < sizeof(struct sockaddr_in))
- return -EINVAL;
-
- if (usin->sin_family != AF_INET)
- return -EAFNOSUPPORT;
-
- nexthop = daddr = usin->sin_addr.s_addr;
- if (inet->opt != NULL && inet->opt->srr) {
- if (daddr == 0)
- return -EINVAL;
- nexthop = inet->opt->faddr;
- }
-
- tmp = ip_route_connect(&rt, nexthop, inet->saddr,
- RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
- IPPROTO_DCCP,
- inet->sport, usin->sin_port, sk);
- if (tmp < 0)
- return tmp;
-
- if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
- ip_rt_put(rt);
- return -ENETUNREACH;
- }
-
- if (inet->opt == NULL || !inet->opt->srr)
- daddr = rt->rt_dst;
-
- if (inet->saddr == 0)
- inet->saddr = rt->rt_src;
- inet->rcv_saddr = inet->saddr;
-
- inet->dport = usin->sin_port;
- inet->daddr = daddr;
-
- inet_csk(sk)->icsk_ext_hdr_len = 0;
- if (inet->opt != NULL)
- inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
- /*
- * Socket identity is still unknown (sport may be zero).
- * However we set state to DCCP_REQUESTING and not releasing socket
- * lock select source port, enter ourselves into the hash tables and
- * complete initialization after this.
- */
- dccp_set_state(sk, DCCP_REQUESTING);
- err = inet_hash_connect(&dccp_death_row, sk);
- if (err != 0)
- goto failure;
-
- err = ip_route_newports(&rt, IPPROTO_DCCP, inet->sport, inet->dport,
- sk);
- if (err != 0)
- goto failure;
-
- /* OK, now commit destination to socket. */
- sk_setup_caps(sk, &rt->u.dst);
-
- dp->dccps_iss = secure_dccp_sequence_number(inet->saddr, inet->daddr,
- inet->sport, inet->dport);
- inet->id = dp->dccps_iss ^ jiffies;
-
- err = dccp_connect(sk);
- rt = NULL;
- if (err != 0)
- goto failure;
-out:
- return err;
-failure:
- /*
- * This unhashes the socket and releases the local port, if necessary.
- */
- dccp_set_state(sk, DCCP_CLOSED);
- ip_rt_put(rt);
- sk->sk_route_caps = 0;
- inet->dport = 0;
- goto out;
-}
-
-EXPORT_SYMBOL_GPL(dccp_v4_connect);
-
-/*
- * This routine does path mtu discovery as defined in RFC1191.
- */
-static inline void dccp_do_pmtu_discovery(struct sock *sk,
- const struct iphdr *iph,
- u32 mtu)
-{
- struct dst_entry *dst;
- const struct inet_sock *inet = inet_sk(sk);
- const struct dccp_sock *dp = dccp_sk(sk);
-
- /* We are not interested in DCCP_LISTEN and request_socks (RESPONSEs
- * send out by Linux are always < 576bytes so they should go through
- * unfragmented).
- */
- if (sk->sk_state == DCCP_LISTEN)
- return;
-
- /* We don't check in the destentry if pmtu discovery is forbidden
- * on this route. We just assume that no packet_to_big packets
- * are send back when pmtu discovery is not active.
- * There is a small race when the user changes this flag in the
- * route, but I think that's acceptable.
- */
- if ((dst = __sk_dst_check(sk, 0)) == NULL)
- return;
-
- dst->ops->update_pmtu(dst, mtu);
-
- /* Something is about to be wrong... Remember soft error
- * for the case, if this connection will not able to recover.
- */
- if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
- sk->sk_err_soft = EMSGSIZE;
-
- mtu = dst_mtu(dst);
-
- if (inet->pmtudisc != IP_PMTUDISC_DONT &&
- inet_csk(sk)->icsk_pmtu_cookie > mtu) {
- dccp_sync_mss(sk, mtu);
-
- /*
- * From RFC 4340, sec. 14.1:
- *
- * DCCP-Sync packets are the best choice for upward
- * probing, since DCCP-Sync probes do not risk application
- * data loss.
- */
- dccp_send_sync(sk, dp->dccps_gsr, DCCP_PKT_SYNC);
- } /* else let the usual retransmit timer handle it */
-}
-
-/*
- * This routine is called by the ICMP module when it gets some sort of error
- * condition. If err < 0 then the socket should be closed and the error
- * returned to the user. If err > 0 it's just the icmp type << 8 | icmp code.
- * After adjustment header points to the first 8 bytes of the tcp header. We
- * need to find the appropriate port.
- *
- * The locking strategy used here is very "optimistic". When someone else
- * accesses the socket the ICMP is just dropped and for some paths there is no
- * check at all. A more general error queue to queue errors for later handling
- * is probably better.
- */
-static void dccp_v4_err(struct sk_buff *skb, u32 info)
-{
- const struct iphdr *iph = (struct iphdr *)skb->data;
- const struct dccp_hdr *dh = (struct dccp_hdr *)(skb->data +
- (iph->ihl << 2));
- struct dccp_sock *dp;
- struct inet_sock *inet;
- const int type = skb->h.icmph->type;
- const int code = skb->h.icmph->code;
- struct sock *sk;
- __u64 seq;
- int err;
-
- if (skb->len < (iph->ihl << 2) + 8) {
- ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
- return;
- }
-
- sk = inet_lookup(&dccp_hashinfo, iph->daddr, dh->dccph_dport,
- iph->saddr, dh->dccph_sport, inet_iif(skb));
- if (sk == NULL) {
- ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
- return;
- }
-
- if (sk->sk_state == DCCP_TIME_WAIT) {
- inet_twsk_put(inet_twsk(sk));
- return;
- }
-
- bh_lock_sock(sk);
- /* If too many ICMPs get dropped on busy
- * servers this needs to be solved differently.
- */
- if (sock_owned_by_user(sk))
- NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
-
- if (sk->sk_state == DCCP_CLOSED)
- goto out;
-
- dp = dccp_sk(sk);
- seq = dccp_hdr_seq(skb);
- if (sk->sk_state != DCCP_LISTEN &&
- !between48(seq, dp->dccps_swl, dp->dccps_swh)) {
- NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
- goto out;
- }
-
- switch (type) {
- case ICMP_SOURCE_QUENCH:
- /* Just silently ignore these. */
- goto out;
- case ICMP_PARAMETERPROB:
- err = EPROTO;
- break;
- case ICMP_DEST_UNREACH:
- if (code > NR_ICMP_UNREACH)
- goto out;
-
- if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
- if (!sock_owned_by_user(sk))
- dccp_do_pmtu_discovery(sk, iph, info);
- goto out;
- }
-
- err = icmp_err_convert[code].errno;
- break;
- case ICMP_TIME_EXCEEDED:
- err = EHOSTUNREACH;
- break;
- default:
- goto out;
- }
-
- switch (sk->sk_state) {
- struct request_sock *req , **prev;
- case DCCP_LISTEN:
- if (sock_owned_by_user(sk))
- goto out;
- req = inet_csk_search_req(sk, &prev, dh->dccph_dport,
- iph->daddr, iph->saddr);
- if (!req)
- goto out;
-
- /*
- * ICMPs are not backlogged, hence we cannot get an established
- * socket here.
- */
- BUG_TRAP(!req->sk);
-
- if (seq != dccp_rsk(req)->dreq_iss) {
- NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
- goto out;
- }
- /*
- * Still in RESPOND, just remove it silently.
- * There is no good way to pass the error to the newly
- * created socket, and POSIX does not want network
- * errors returned from accept().
- */
- inet_csk_reqsk_queue_drop(sk, req, prev);
- goto out;
-
- case DCCP_REQUESTING:
- case DCCP_RESPOND:
- if (!sock_owned_by_user(sk)) {
- DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS);
- sk->sk_err = err;
-
- sk->sk_error_report(sk);
-
- dccp_done(sk);
- } else
- sk->sk_err_soft = err;
- goto out;
- }
-
- /* If we've already connected we will keep trying
- * until we time out, or the user gives up.
- *
- * rfc1122 4.2.3.9 allows to consider as hard errors
- * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
- * but it is obsoleted by pmtu discovery).
- *
- * Note, that in modern internet, where routing is unreliable
- * and in each dark corner broken firewalls sit, sending random
- * errors ordered by their masters even this two messages finally lose
- * their original sense (even Linux sends invalid PORT_UNREACHs)
- *
- * Now we are in compliance with RFCs.
- * --ANK (980905)
- */
-
- inet = inet_sk(sk);
- if (!sock_owned_by_user(sk) && inet->recverr) {
- sk->sk_err = err;
- sk->sk_error_report(sk);
- } else /* Only an error on timeout */
- sk->sk_err_soft = err;
-out:
- bh_unlock_sock(sk);
- sock_put(sk);
-}
-
-static inline __sum16 dccp_v4_csum_finish(struct sk_buff *skb,
- __be32 src, __be32 dst)
-{
- return csum_tcpudp_magic(src, dst, skb->len, IPPROTO_DCCP, skb->csum);
-}
-
-void dccp_v4_send_check(struct sock *sk, int unused, struct sk_buff *skb)
-{
- const struct inet_sock *inet = inet_sk(sk);
- struct dccp_hdr *dh = dccp_hdr(skb);
-
- dccp_csum_outgoing(skb);
- dh->dccph_checksum = dccp_v4_csum_finish(skb, inet->saddr, inet->daddr);
-}
-
-EXPORT_SYMBOL_GPL(dccp_v4_send_check);
-
-static inline u64 dccp_v4_init_sequence(const struct sk_buff *skb)
-{
- return secure_dccp_sequence_number(skb->nh.iph->daddr,
- skb->nh.iph->saddr,
- dccp_hdr(skb)->dccph_dport,
- dccp_hdr(skb)->dccph_sport);
-}
-
-/*
- * The three way handshake has completed - we got a valid ACK or DATAACK -
- * now create the new socket.
- *
- * This is the equivalent of TCP's tcp_v4_syn_recv_sock
- */
-struct sock *dccp_v4_request_recv_sock(struct sock *sk, struct sk_buff *skb,
- struct request_sock *req,
- struct dst_entry *dst)
-{
- struct inet_request_sock *ireq;
- struct inet_sock *newinet;
- struct dccp_sock *newdp;
- struct sock *newsk;
-
- if (sk_acceptq_is_full(sk))
- goto exit_overflow;
-
- if (dst == NULL && (dst = inet_csk_route_req(sk, req)) == NULL)
- goto exit;
-
- newsk = dccp_create_openreq_child(sk, req, skb);
- if (newsk == NULL)
- goto exit;
-
- sk_setup_caps(newsk, dst);
-
- newdp = dccp_sk(newsk);
- newinet = inet_sk(newsk);
- ireq = inet_rsk(req);
- newinet->daddr = ireq->rmt_addr;
- newinet->rcv_saddr = ireq->loc_addr;
- newinet->saddr = ireq->loc_addr;
- newinet->opt = ireq->opt;
- ireq->opt = NULL;
- newinet->mc_index = inet_iif(skb);
- newinet->mc_ttl = skb->nh.iph->ttl;
- newinet->id = jiffies;
-
- dccp_sync_mss(newsk, dst_mtu(dst));
-
- __inet_hash(&dccp_hashinfo, newsk, 0);
- __inet_inherit_port(&dccp_hashinfo, sk, newsk);
-
- return newsk;
-
-exit_overflow:
- NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
-exit:
- NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
- dst_release(dst);
- return NULL;
-}
-
-EXPORT_SYMBOL_GPL(dccp_v4_request_recv_sock);
-
-static struct sock *dccp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
-{
- const struct dccp_hdr *dh = dccp_hdr(skb);
- const struct iphdr *iph = skb->nh.iph;
- struct sock *nsk;
- struct request_sock **prev;
- /* Find possible connection requests. */
- struct request_sock *req = inet_csk_search_req(sk, &prev,
- dh->dccph_sport,
- iph->saddr, iph->daddr);
- if (req != NULL)
- return dccp_check_req(sk, skb, req, prev);
-
- nsk = inet_lookup_established(&dccp_hashinfo,
- iph->saddr, dh->dccph_sport,
- iph->daddr, dh->dccph_dport,
- inet_iif(skb));
- if (nsk != NULL) {
- if (nsk->sk_state != DCCP_TIME_WAIT) {
- bh_lock_sock(nsk);
- return nsk;
- }
- inet_twsk_put(inet_twsk(nsk));
- return NULL;
- }
-
- return sk;
-}
-
-static struct dst_entry* dccp_v4_route_skb(struct sock *sk,
- struct sk_buff *skb)
-{
- struct rtable *rt;
- struct flowi fl = { .oif = ((struct rtable *)skb->dst)->rt_iif,
- .nl_u = { .ip4_u =
- { .daddr = skb->nh.iph->saddr,
- .saddr = skb->nh.iph->daddr,
- .tos = RT_CONN_FLAGS(sk) } },
- .proto = sk->sk_protocol,
- .uli_u = { .ports =
- { .sport = dccp_hdr(skb)->dccph_dport,
- .dport = dccp_hdr(skb)->dccph_sport }
- }
- };
-
- security_skb_classify_flow(skb, &fl);
- if (ip_route_output_flow(&rt, &fl, sk, 0)) {
- IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
- return NULL;
- }
-
- return &rt->u.dst;
-}
-
-static int dccp_v4_send_response(struct sock *sk, struct request_sock *req,
- struct dst_entry *dst)
-{
- int err = -1;
- struct sk_buff *skb;
-
- /* First, grab a route. */
-
- if (dst == NULL && (dst = inet_csk_route_req(sk, req)) == NULL)
- goto out;
-
- skb = dccp_make_response(sk, dst, req);
- if (skb != NULL) {
- const struct inet_request_sock *ireq = inet_rsk(req);
- struct dccp_hdr *dh = dccp_hdr(skb);
-
- dh->dccph_checksum = dccp_v4_csum_finish(skb, ireq->loc_addr,
- ireq->rmt_addr);
- memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
- err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
- ireq->rmt_addr,
- ireq->opt);
- err = net_xmit_eval(err);
- }
-
-out:
- dst_release(dst);
- return err;
-}
-
-static void dccp_v4_ctl_send_reset(struct sock *sk, struct sk_buff *rxskb)
-{
- int err;
- struct dccp_hdr *rxdh = dccp_hdr(rxskb), *dh;
- const int dccp_hdr_reset_len = sizeof(struct dccp_hdr) +
- sizeof(struct dccp_hdr_ext) +
- sizeof(struct dccp_hdr_reset);
- struct sk_buff *skb;
- struct dst_entry *dst;
- u64 seqno = 0;
-
- /* Never send a reset in response to a reset. */
- if (rxdh->dccph_type == DCCP_PKT_RESET)
- return;
-
- if (((struct rtable *)rxskb->dst)->rt_type != RTN_LOCAL)
- return;
-
- dst = dccp_v4_route_skb(dccp_v4_ctl_socket->sk, rxskb);
- if (dst == NULL)
- return;
-
- skb = alloc_skb(dccp_v4_ctl_socket->sk->sk_prot->max_header,
- GFP_ATOMIC);
- if (skb == NULL)
- goto out;
-
- /* Reserve space for headers. */
- skb_reserve(skb, dccp_v4_ctl_socket->sk->sk_prot->max_header);
- skb->dst = dst_clone(dst);
-
- dh = dccp_zeroed_hdr(skb, dccp_hdr_reset_len);
-
- /* Build DCCP header and checksum it. */
- dh->dccph_type = DCCP_PKT_RESET;
- dh->dccph_sport = rxdh->dccph_dport;
- dh->dccph_dport = rxdh->dccph_sport;
- dh->dccph_doff = dccp_hdr_reset_len / 4;
- dh->dccph_x = 1;
- dccp_hdr_reset(skb)->dccph_reset_code =
- DCCP_SKB_CB(rxskb)->dccpd_reset_code;
-
- /* See "8.3.1. Abnormal Termination" in RFC 4340 */
- if (DCCP_SKB_CB(rxskb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
- dccp_set_seqno(&seqno, DCCP_SKB_CB(rxskb)->dccpd_ack_seq + 1);
-
- dccp_hdr_set_seq(dh, seqno);
- dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), DCCP_SKB_CB(rxskb)->dccpd_seq);
-
- dccp_csum_outgoing(skb);
- dh->dccph_checksum = dccp_v4_csum_finish(skb, rxskb->nh.iph->saddr,
- rxskb->nh.iph->daddr);
-
- bh_lock_sock(dccp_v4_ctl_socket->sk);
- err = ip_build_and_send_pkt(skb, dccp_v4_ctl_socket->sk,
- rxskb->nh.iph->daddr,
- rxskb->nh.iph->saddr, NULL);
- bh_unlock_sock(dccp_v4_ctl_socket->sk);
-
- if (net_xmit_eval(err) == 0) {
- DCCP_INC_STATS_BH(DCCP_MIB_OUTSEGS);
- DCCP_INC_STATS_BH(DCCP_MIB_OUTRSTS);
- }
-out:
- dst_release(dst);
-}
-
-static void dccp_v4_reqsk_destructor(struct request_sock *req)
-{
- kfree(inet_rsk(req)->opt);
-}
-
-static struct request_sock_ops dccp_request_sock_ops __read_mostly = {
- .family = PF_INET,
- .obj_size = sizeof(struct dccp_request_sock),
- .rtx_syn_ack = dccp_v4_send_response,
- .send_ack = dccp_reqsk_send_ack,
- .destructor = dccp_v4_reqsk_destructor,
- .send_reset = dccp_v4_ctl_send_reset,
-};
-
-int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
-{
- struct inet_request_sock *ireq;
- struct request_sock *req;
- struct dccp_request_sock *dreq;
- const __be32 service = dccp_hdr_request(skb)->dccph_req_service;
- struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
- __u8 reset_code = DCCP_RESET_CODE_TOO_BUSY;
-
- /* Never answer to DCCP_PKT_REQUESTs send to broadcast or multicast */
- if (((struct rtable *)skb->dst)->rt_flags &
- (RTCF_BROADCAST | RTCF_MULTICAST)) {
- reset_code = DCCP_RESET_CODE_NO_CONNECTION;
- goto drop;
- }
-
- if (dccp_bad_service_code(sk, service)) {
- reset_code = DCCP_RESET_CODE_BAD_SERVICE_CODE;
- goto drop;
- }
- /*
- * TW buckets are converted to open requests without
- * limitations, they conserve resources and peer is
- * evidently real one.
- */
- if (inet_csk_reqsk_queue_is_full(sk))
- goto drop;
-
- /*
- * Accept backlog is full. If we have already queued enough
- * of warm entries in syn queue, drop request. It is better than
- * clogging syn queue with openreqs with exponentially increasing
- * timeout.
- */
- if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
- goto drop;
-
- req = reqsk_alloc(&dccp_request_sock_ops);
- if (req == NULL)
- goto drop;
-
- if (dccp_parse_options(sk, skb))
- goto drop_and_free;
-
- dccp_reqsk_init(req, skb);
-
- if (security_inet_conn_request(sk, skb, req))
- goto drop_and_free;
-
- ireq = inet_rsk(req);
- ireq->loc_addr = skb->nh.iph->daddr;
- ireq->rmt_addr = skb->nh.iph->saddr;
- ireq->opt = NULL;
-
- /*
- * Step 3: Process LISTEN state
- *
- * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookie
- *
- * In fact we defer setting S.GSR, S.SWL, S.SWH to
- * dccp_create_openreq_child.
- */
- dreq = dccp_rsk(req);
- dreq->dreq_isr = dcb->dccpd_seq;
- dreq->dreq_iss = dccp_v4_init_sequence(skb);
- dreq->dreq_service = service;
-
- if (dccp_v4_send_response(sk, req, NULL))
- goto drop_and_free;
-
- inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT);
- return 0;
-
-drop_and_free:
- reqsk_free(req);
-drop:
- DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS);
- dcb->dccpd_reset_code = reset_code;
- return -1;
-}
-
-EXPORT_SYMBOL_GPL(dccp_v4_conn_request);
-
-int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
-{
- struct dccp_hdr *dh = dccp_hdr(skb);
-
- if (sk->sk_state == DCCP_OPEN) { /* Fast path */
- if (dccp_rcv_established(sk, skb, dh, skb->len))
- goto reset;
- return 0;
- }
-
- /*
- * Step 3: Process LISTEN state
- * If P.type == Request or P contains a valid Init Cookie option,
- * (* Must scan the packet's options to check for Init
- * Cookies. Only Init Cookies are processed here,
- * however; other options are processed in Step 8. This
- * scan need only be performed if the endpoint uses Init
- * Cookies *)
- * (* Generate a new socket and switch to that socket *)
- * Set S := new socket for this port pair
- * S.state = RESPOND
- * Choose S.ISS (initial seqno) or set from Init Cookies
- * Initialize S.GAR := S.ISS
- * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookies
- * Continue with S.state == RESPOND
- * (* A Response packet will be generated in Step 11 *)
- * Otherwise,
- * Generate Reset(No Connection) unless P.type == Reset
- * Drop packet and return
- *
- * NOTE: the check for the packet types is done in
- * dccp_rcv_state_process
- */
- if (sk->sk_state == DCCP_LISTEN) {
- struct sock *nsk = dccp_v4_hnd_req(sk, skb);
-
- if (nsk == NULL)
- goto discard;
-
- if (nsk != sk) {
- if (dccp_child_process(sk, nsk, skb))
- goto reset;
- return 0;
- }
- }
-
- if (dccp_rcv_state_process(sk, skb, dh, skb->len))
- goto reset;
- return 0;
-
-reset:
- dccp_v4_ctl_send_reset(sk, skb);
-discard:
- kfree_skb(skb);
- return 0;
-}
-
-EXPORT_SYMBOL_GPL(dccp_v4_do_rcv);
-
-/**
- * dccp_invalid_packet - check for malformed packets
- * Implements RFC 4340, 8.5: Step 1: Check header basics
- * Packets that fail these checks are ignored and do not receive Resets.
- */
-int dccp_invalid_packet(struct sk_buff *skb)
-{
- const struct dccp_hdr *dh;
- unsigned int cscov;
-
- if (skb->pkt_type != PACKET_HOST)
- return 1;
-
- /* If the packet is shorter than 12 bytes, drop packet and return */
- if (!pskb_may_pull(skb, sizeof(struct dccp_hdr))) {
- DCCP_WARN("pskb_may_pull failed\n");
- return 1;
- }
-
- dh = dccp_hdr(skb);
-
- /* If P.type is not understood, drop packet and return */
- if (dh->dccph_type >= DCCP_PKT_INVALID) {
- DCCP_WARN("invalid packet type\n");
- return 1;
- }
-
- /*
- * If P.Data Offset is too small for packet type, drop packet and return
- */
- if (dh->dccph_doff < dccp_hdr_len(skb) / sizeof(u32)) {
- DCCP_WARN("P.Data Offset(%u) too small\n", dh->dccph_doff);
- return 1;
- }
- /*
- * If P.Data Offset is too too large for packet, drop packet and return
- */
- if (!pskb_may_pull(skb, dh->dccph_doff * sizeof(u32))) {
- DCCP_WARN("P.Data Offset(%u) too large\n", dh->dccph_doff);
- return 1;
- }
-
- /*
- * If P.type is not Data, Ack, or DataAck and P.X == 0 (the packet
- * has short sequence numbers), drop packet and return
- */
- if (dh->dccph_type >= DCCP_PKT_DATA &&
- dh->dccph_type <= DCCP_PKT_DATAACK && dh->dccph_x == 0) {
- DCCP_WARN("P.type (%s) not Data || [Data]Ack, while P.X == 0\n",
- dccp_packet_name(dh->dccph_type));
- return 1;
- }
-
- /*
- * If P.CsCov is too large for the packet size, drop packet and return.
- * This must come _before_ checksumming (not as RFC 4340 suggests).
- */
- cscov = dccp_csum_coverage(skb);
- if (cscov > skb->len) {
- DCCP_WARN("P.CsCov %u exceeds packet length %d\n",
- dh->dccph_cscov, skb->len);
- return 1;
- }
-
- /* If header checksum is incorrect, drop packet and return.
- * (This step is completed in the AF-dependent functions.) */
- skb->csum = skb_checksum(skb, 0, cscov, 0);
-
- return 0;
-}
-
-EXPORT_SYMBOL_GPL(dccp_invalid_packet);
-
-/* this is called when real data arrives */
-static int dccp_v4_rcv(struct sk_buff *skb)
-{
- const struct dccp_hdr *dh;
- struct sock *sk;
- int min_cov;
-
- /* Step 1: Check header basics */
-
- if (dccp_invalid_packet(skb))
- goto discard_it;
-
- /* Step 1: If header checksum is incorrect, drop packet and return */
- if (dccp_v4_csum_finish(skb, skb->nh.iph->saddr, skb->nh.iph->daddr)) {
- DCCP_WARN("dropped packet with invalid checksum\n");
- goto discard_it;
- }
-
- dh = dccp_hdr(skb);
-
- DCCP_SKB_CB(skb)->dccpd_seq = dccp_hdr_seq(skb);
- DCCP_SKB_CB(skb)->dccpd_type = dh->dccph_type;
-
- dccp_pr_debug("%8.8s "
- "src=%u.%u.%u.%u@%-5d "
- "dst=%u.%u.%u.%u@%-5d seq=%llu",
- dccp_packet_name(dh->dccph_type),
- NIPQUAD(skb->nh.iph->saddr), ntohs(dh->dccph_sport),
- NIPQUAD(skb->nh.iph->daddr), ntohs(dh->dccph_dport),
- (unsigned long long) DCCP_SKB_CB(skb)->dccpd_seq);
-
- if (dccp_packet_without_ack(skb)) {
- DCCP_SKB_CB(skb)->dccpd_ack_seq = DCCP_PKT_WITHOUT_ACK_SEQ;
- dccp_pr_debug_cat("\n");
- } else {
- DCCP_SKB_CB(skb)->dccpd_ack_seq = dccp_hdr_ack_seq(skb);
- dccp_pr_debug_cat(", ack=%llu\n", (unsigned long long)
- DCCP_SKB_CB(skb)->dccpd_ack_seq);
- }
-
- /* Step 2:
- * Look up flow ID in table and get corresponding socket */
- sk = __inet_lookup(&dccp_hashinfo,
- skb->nh.iph->saddr, dh->dccph_sport,
- skb->nh.iph->daddr, dh->dccph_dport,
- inet_iif(skb));
-
- /*
- * Step 2:
- * If no socket ...
- */
- if (sk == NULL) {
- dccp_pr_debug("failed to look up flow ID in table and "
- "get corresponding socket\n");
- goto no_dccp_socket;
- }
-
- /*
- * Step 2:
- * ... or S.state == TIMEWAIT,
- * Generate Reset(No Connection) unless P.type == Reset
- * Drop packet and return
- */
- if (sk->sk_state == DCCP_TIME_WAIT) {
- dccp_pr_debug("sk->sk_state == DCCP_TIME_WAIT: do_time_wait\n");
- inet_twsk_put(inet_twsk(sk));
- goto no_dccp_socket;
- }
-
- /*
- * RFC 4340, sec. 9.2.1: Minimum Checksum Coverage
- * o if MinCsCov = 0, only packets with CsCov = 0 are accepted
- * o if MinCsCov > 0, also accept packets with CsCov >= MinCsCov
- */
- min_cov = dccp_sk(sk)->dccps_pcrlen;
- if (dh->dccph_cscov && (min_cov == 0 || dh->dccph_cscov < min_cov)) {
- dccp_pr_debug("Packet CsCov %d does not satisfy MinCsCov %d\n",
- dh->dccph_cscov, min_cov);
- /* FIXME: "Such packets SHOULD be reported using Data Dropped
- * options (Section 11.7) with Drop Code 0, Protocol
- * Constraints." */
- goto discard_and_relse;
- }
-
- if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
- goto discard_and_relse;
- nf_reset(skb);
-
- return sk_receive_skb(sk, skb, 1);
-
-no_dccp_socket:
- if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
- goto discard_it;
- /*
- * Step 2:
- * If no socket ...
- * Generate Reset(No Connection) unless P.type == Reset
- * Drop packet and return
- */
- if (dh->dccph_type != DCCP_PKT_RESET) {
- DCCP_SKB_CB(skb)->dccpd_reset_code =
- DCCP_RESET_CODE_NO_CONNECTION;
- dccp_v4_ctl_send_reset(sk, skb);
- }
-
-discard_it:
- kfree_skb(skb);
- return 0;
-
-discard_and_relse:
- sock_put(sk);
- goto discard_it;
-}
-
-static struct inet_connection_sock_af_ops dccp_ipv4_af_ops = {
- .queue_xmit = ip_queue_xmit,
- .send_check = dccp_v4_send_check,
- .rebuild_header = inet_sk_rebuild_header,
- .conn_request = dccp_v4_conn_request,
- .syn_recv_sock = dccp_v4_request_recv_sock,
- .net_header_len = sizeof(struct iphdr),
- .setsockopt = ip_setsockopt,
- .getsockopt = ip_getsockopt,
- .addr2sockaddr = inet_csk_addr2sockaddr,
- .sockaddr_len = sizeof(struct sockaddr_in),
-#ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_ip_setsockopt,
- .compat_getsockopt = compat_ip_getsockopt,
-#endif
-};
-
-static int dccp_v4_init_sock(struct sock *sk)
-{
- static __u8 dccp_v4_ctl_sock_initialized;
- int err = dccp_init_sock(sk, dccp_v4_ctl_sock_initialized);
-
- if (err == 0) {
- if (unlikely(!dccp_v4_ctl_sock_initialized))
- dccp_v4_ctl_sock_initialized = 1;
- inet_csk(sk)->icsk_af_ops = &dccp_ipv4_af_ops;
- }
-
- return err;
-}
-
-static struct timewait_sock_ops dccp_timewait_sock_ops = {
- .twsk_obj_size = sizeof(struct inet_timewait_sock),
-};
-
-static struct proto dccp_v4_prot = {
- .name = "DCCP",
- .owner = THIS_MODULE,
- .close = dccp_close,
- .connect = dccp_v4_connect,
- .disconnect = dccp_disconnect,
- .ioctl = dccp_ioctl,
- .init = dccp_v4_init_sock,
- .setsockopt = dccp_setsockopt,
- .getsockopt = dccp_getsockopt,
- .sendmsg = dccp_sendmsg,
- .recvmsg = dccp_recvmsg,
- .backlog_rcv = dccp_v4_do_rcv,
- .hash = dccp_hash,
- .unhash = dccp_unhash,
- .accept = inet_csk_accept,
- .get_port = dccp_v4_get_port,
- .shutdown = dccp_shutdown,
- .destroy = dccp_destroy_sock,
- .orphan_count = &dccp_orphan_count,
- .max_header = MAX_DCCP_HEADER,
- .obj_size = sizeof(struct dccp_sock),
- .rsk_prot = &dccp_request_sock_ops,
- .twsk_prot = &dccp_timewait_sock_ops,
-#ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_dccp_setsockopt,
- .compat_getsockopt = compat_dccp_getsockopt,
-#endif
-};
-
-static struct net_protocol dccp_v4_protocol = {
- .handler = dccp_v4_rcv,
- .err_handler = dccp_v4_err,
- .no_policy = 1,
-};
-
-static const struct proto_ops inet_dccp_ops = {
- .family = PF_INET,
- .owner = THIS_MODULE,
- .release = inet_release,
- .bind = inet_bind,
- .connect = inet_stream_connect,
- .socketpair = sock_no_socketpair,
- .accept = inet_accept,
- .getname = inet_getname,
- /* FIXME: work on tcp_poll to rename it to inet_csk_poll */
- .poll = dccp_poll,
- .ioctl = inet_ioctl,
- /* FIXME: work on inet_listen to rename it to sock_common_listen */
- .listen = inet_dccp_listen,
- .shutdown = inet_shutdown,
- .setsockopt = sock_common_setsockopt,
- .getsockopt = sock_common_getsockopt,
- .sendmsg = inet_sendmsg,
- .recvmsg = sock_common_recvmsg,
- .mmap = sock_no_mmap,
- .sendpage = sock_no_sendpage,
-#ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_sock_common_setsockopt,
- .compat_getsockopt = compat_sock_common_getsockopt,
-#endif
-};
-
-static struct inet_protosw dccp_v4_protosw = {
- .type = SOCK_DCCP,
- .protocol = IPPROTO_DCCP,
- .prot = &dccp_v4_prot,
- .ops = &inet_dccp_ops,
- .capability = -1,
- .no_check = 0,
- .flags = INET_PROTOSW_ICSK,
-};
-
-static int __init dccp_v4_init(void)
-{
- int err = proto_register(&dccp_v4_prot, 1);
-
- if (err != 0)
- goto out;
-
- err = inet_add_protocol(&dccp_v4_protocol, IPPROTO_DCCP);
- if (err != 0)
- goto out_proto_unregister;
-
- inet_register_protosw(&dccp_v4_protosw);
-
- err = inet_csk_ctl_sock_create(&dccp_v4_ctl_socket, PF_INET,
- SOCK_DCCP, IPPROTO_DCCP);
- if (err)
- goto out_unregister_protosw;
-out:
- return err;
-out_unregister_protosw:
- inet_unregister_protosw(&dccp_v4_protosw);
- inet_del_protocol(&dccp_v4_protocol, IPPROTO_DCCP);
-out_proto_unregister:
- proto_unregister(&dccp_v4_prot);
- goto out;
-}
-
-static void __exit dccp_v4_exit(void)
-{
- inet_unregister_protosw(&dccp_v4_protosw);
- inet_del_protocol(&dccp_v4_protocol, IPPROTO_DCCP);
- proto_unregister(&dccp_v4_prot);
-}
-
-module_init(dccp_v4_init);
-module_exit(dccp_v4_exit);
-
-/*
- * __stringify doesn't likes enums, so use SOCK_DCCP (6) and IPPROTO_DCCP (33)
- * values directly, Also cover the case where the protocol is not specified,
- * i.e. net-pf-PF_INET-proto-0-type-SOCK_DCCP
- */
-MODULE_ALIAS("net-pf-" __stringify(PF_INET) "-proto-33-type-6");
-MODULE_ALIAS("net-pf-" __stringify(PF_INET) "-proto-0-type-6");
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@mandriva.com>");
-MODULE_DESCRIPTION("DCCP - Datagram Congestion Controlled Protocol");
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
deleted file mode 100644
index c7aaa2574f52..000000000000
--- a/net/dccp/ipv6.c
+++ /dev/null
@@ -1,1269 +0,0 @@
-/*
- * DCCP over IPv6
- * Linux INET6 implementation
- *
- * Based on net/dccp6/ipv6.c
- *
- * Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <linux/module.h>
-#include <linux/random.h>
-#include <linux/xfrm.h>
-
-#include <net/addrconf.h>
-#include <net/inet_common.h>
-#include <net/inet_hashtables.h>
-#include <net/inet_sock.h>
-#include <net/inet6_connection_sock.h>
-#include <net/inet6_hashtables.h>
-#include <net/ip6_route.h>
-#include <net/ipv6.h>
-#include <net/protocol.h>
-#include <net/transp_v6.h>
-#include <net/ip6_checksum.h>
-#include <net/xfrm.h>
-
-#include "dccp.h"
-#include "ipv6.h"
-#include "feat.h"
-
-/* Socket used for sending RSTs and ACKs */
-static struct socket *dccp_v6_ctl_socket;
-
-static struct inet_connection_sock_af_ops dccp_ipv6_mapped;
-static struct inet_connection_sock_af_ops dccp_ipv6_af_ops;
-
-static int dccp_v6_get_port(struct sock *sk, unsigned short snum)
-{
- return inet_csk_get_port(&dccp_hashinfo, sk, snum,
- inet6_csk_bind_conflict);
-}
-
-static void dccp_v6_hash(struct sock *sk)
-{
- if (sk->sk_state != DCCP_CLOSED) {
- if (inet_csk(sk)->icsk_af_ops == &dccp_ipv6_mapped) {
- dccp_hash(sk);
- return;
- }
- local_bh_disable();
- __inet6_hash(&dccp_hashinfo, sk);
- local_bh_enable();
- }
-}
-
-/* add pseudo-header to DCCP checksum stored in skb->csum */
-static inline __sum16 dccp_v6_csum_finish(struct sk_buff *skb,
- struct in6_addr *saddr,
- struct in6_addr *daddr)
-{
- return csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_DCCP, skb->csum);
-}
-
-static inline void dccp_v6_send_check(struct sock *sk, int unused_value,
- struct sk_buff *skb)
-{
- struct ipv6_pinfo *np = inet6_sk(sk);
- struct dccp_hdr *dh = dccp_hdr(skb);
-
- dccp_csum_outgoing(skb);
- dh->dccph_checksum = dccp_v6_csum_finish(skb, &np->saddr, &np->daddr);
-}
-
-static inline __u32 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr,
- __be16 sport, __be16 dport )
-{
- return secure_tcpv6_sequence_number(saddr, daddr, sport, dport);
-}
-
-static inline __u32 dccp_v6_init_sequence(struct sk_buff *skb)
-{
- return secure_dccpv6_sequence_number(skb->nh.ipv6h->daddr.s6_addr32,
- skb->nh.ipv6h->saddr.s6_addr32,
- dccp_hdr(skb)->dccph_dport,
- dccp_hdr(skb)->dccph_sport );
-
-}
-
-static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
- int type, int code, int offset, __be32 info)
-{
- struct ipv6hdr *hdr = (struct ipv6hdr *)skb->data;
- const struct dccp_hdr *dh = (struct dccp_hdr *)(skb->data + offset);
- struct ipv6_pinfo *np;
- struct sock *sk;
- int err;
- __u64 seq;
-
- sk = inet6_lookup(&dccp_hashinfo, &hdr->daddr, dh->dccph_dport,
- &hdr->saddr, dh->dccph_sport, inet6_iif(skb));
-
- if (sk == NULL) {
- ICMP6_INC_STATS_BH(__in6_dev_get(skb->dev), ICMP6_MIB_INERRORS);
- return;
- }
-
- if (sk->sk_state == DCCP_TIME_WAIT) {
- inet_twsk_put(inet_twsk(sk));
- return;
- }
-
- bh_lock_sock(sk);
- if (sock_owned_by_user(sk))
- NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
-
- if (sk->sk_state == DCCP_CLOSED)
- goto out;
-
- np = inet6_sk(sk);
-
- if (type == ICMPV6_PKT_TOOBIG) {
- struct dst_entry *dst = NULL;
-
- if (sock_owned_by_user(sk))
- goto out;
- if ((1 << sk->sk_state) & (DCCPF_LISTEN | DCCPF_CLOSED))
- goto out;
-
- /* icmp should have updated the destination cache entry */
- dst = __sk_dst_check(sk, np->dst_cookie);
- if (dst == NULL) {
- struct inet_sock *inet = inet_sk(sk);
- struct flowi fl;
-
- /* BUGGG_FUTURE: Again, it is not clear how
- to handle rthdr case. Ignore this complexity
- for now.
- */
- memset(&fl, 0, sizeof(fl));
- fl.proto = IPPROTO_DCCP;
- ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
- ipv6_addr_copy(&fl.fl6_src, &np->saddr);
- fl.oif = sk->sk_bound_dev_if;
- fl.fl_ip_dport = inet->dport;
- fl.fl_ip_sport = inet->sport;
- security_sk_classify_flow(sk, &fl);
-
- err = ip6_dst_lookup(sk, &dst, &fl);
- if (err) {
- sk->sk_err_soft = -err;
- goto out;
- }
-
- err = xfrm_lookup(&dst, &fl, sk, 0);
- if (err < 0) {
- sk->sk_err_soft = -err;
- goto out;
- }
- } else
- dst_hold(dst);
-
- if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) {
- dccp_sync_mss(sk, dst_mtu(dst));
- } /* else let the usual retransmit timer handle it */
- dst_release(dst);
- goto out;
- }
-
- icmpv6_err_convert(type, code, &err);
-
- seq = DCCP_SKB_CB(skb)->dccpd_seq;
- /* Might be for an request_sock */
- switch (sk->sk_state) {
- struct request_sock *req, **prev;
- case DCCP_LISTEN:
- if (sock_owned_by_user(sk))
- goto out;
-
- req = inet6_csk_search_req(sk, &prev, dh->dccph_dport,
- &hdr->daddr, &hdr->saddr,
- inet6_iif(skb));
- if (req == NULL)
- goto out;
-
- /*
- * ICMPs are not backlogged, hence we cannot get an established
- * socket here.
- */
- BUG_TRAP(req->sk == NULL);
-
- if (seq != dccp_rsk(req)->dreq_iss) {
- NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
- goto out;
- }
-
- inet_csk_reqsk_queue_drop(sk, req, prev);
- goto out;
-
- case DCCP_REQUESTING:
- case DCCP_RESPOND: /* Cannot happen.
- It can, it SYNs are crossed. --ANK */
- if (!sock_owned_by_user(sk)) {
- DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS);
- sk->sk_err = err;
- /*
- * Wake people up to see the error
- * (see connect in sock.c)
- */
- sk->sk_error_report(sk);
- dccp_done(sk);
- } else
- sk->sk_err_soft = err;
- goto out;
- }
-
- if (!sock_owned_by_user(sk) && np->recverr) {
- sk->sk_err = err;
- sk->sk_error_report(sk);
- } else
- sk->sk_err_soft = err;
-
-out:
- bh_unlock_sock(sk);
- sock_put(sk);
-}
-
-
-static int dccp_v6_send_response(struct sock *sk, struct request_sock *req,
- struct dst_entry *dst)
-{
- struct inet6_request_sock *ireq6 = inet6_rsk(req);
- struct ipv6_pinfo *np = inet6_sk(sk);
- struct sk_buff *skb;
- struct ipv6_txoptions *opt = NULL;
- struct in6_addr *final_p = NULL, final;
- struct flowi fl;
- int err = -1;
-
- memset(&fl, 0, sizeof(fl));
- fl.proto = IPPROTO_DCCP;
- ipv6_addr_copy(&fl.fl6_dst, &ireq6->rmt_addr);
- ipv6_addr_copy(&fl.fl6_src, &ireq6->loc_addr);
- fl.fl6_flowlabel = 0;
- fl.oif = ireq6->iif;
- fl.fl_ip_dport = inet_rsk(req)->rmt_port;
- fl.fl_ip_sport = inet_sk(sk)->sport;
- security_req_classify_flow(req, &fl);
-
- if (dst == NULL) {
- opt = np->opt;
- if (opt == NULL &&
- np->rxopt.bits.osrcrt == 2 &&
- ireq6->pktopts) {
- struct sk_buff *pktopts = ireq6->pktopts;
- struct inet6_skb_parm *rxopt = IP6CB(pktopts);
-
- if (rxopt->srcrt)
- opt = ipv6_invert_rthdr(sk,
- (struct ipv6_rt_hdr *)(pktopts->nh.raw +
- rxopt->srcrt));
- }
-
- if (opt != NULL && opt->srcrt != NULL) {
- const struct rt0_hdr *rt0 = (struct rt0_hdr *)opt->srcrt;
-
- ipv6_addr_copy(&final, &fl.fl6_dst);
- ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
- final_p = &final;
- }
-
- err = ip6_dst_lookup(sk, &dst, &fl);
- if (err)
- goto done;
-
- if (final_p)
- ipv6_addr_copy(&fl.fl6_dst, final_p);
-
- err = xfrm_lookup(&dst, &fl, sk, 0);
- if (err < 0)
- goto done;
- }
-
- skb = dccp_make_response(sk, dst, req);
- if (skb != NULL) {
- struct dccp_hdr *dh = dccp_hdr(skb);
-
- dh->dccph_checksum = dccp_v6_csum_finish(skb,
- &ireq6->loc_addr,
- &ireq6->rmt_addr);
- ipv6_addr_copy(&fl.fl6_dst, &ireq6->rmt_addr);
- err = ip6_xmit(sk, skb, &fl, opt, 0);
- err = net_xmit_eval(err);
- }
-
-done:
- if (opt != NULL && opt != np->opt)
- sock_kfree_s(sk, opt, opt->tot_len);
- dst_release(dst);
- return err;
-}
-
-static void dccp_v6_reqsk_destructor(struct request_sock *req)
-{
- if (inet6_rsk(req)->pktopts != NULL)
- kfree_skb(inet6_rsk(req)->pktopts);
-}
-
-static void dccp_v6_ctl_send_reset(struct sock *sk, struct sk_buff *rxskb)
-{
- struct dccp_hdr *rxdh = dccp_hdr(rxskb), *dh;
- const u32 dccp_hdr_reset_len = sizeof(struct dccp_hdr) +
- sizeof(struct dccp_hdr_ext) +
- sizeof(struct dccp_hdr_reset);
- struct sk_buff *skb;
- struct flowi fl;
- u64 seqno = 0;
-
- if (rxdh->dccph_type == DCCP_PKT_RESET)
- return;
-
- if (!ipv6_unicast_destination(rxskb))
- return;
-
- skb = alloc_skb(dccp_v6_ctl_socket->sk->sk_prot->max_header,
- GFP_ATOMIC);
- if (skb == NULL)
- return;
-
- skb_reserve(skb, dccp_v6_ctl_socket->sk->sk_prot->max_header);
-
- dh = dccp_zeroed_hdr(skb, dccp_hdr_reset_len);
-
- /* Swap the send and the receive. */
- dh->dccph_type = DCCP_PKT_RESET;
- dh->dccph_sport = rxdh->dccph_dport;
- dh->dccph_dport = rxdh->dccph_sport;
- dh->dccph_doff = dccp_hdr_reset_len / 4;
- dh->dccph_x = 1;
- dccp_hdr_reset(skb)->dccph_reset_code =
- DCCP_SKB_CB(rxskb)->dccpd_reset_code;
-
- /* See "8.3.1. Abnormal Termination" in RFC 4340 */
- if (DCCP_SKB_CB(rxskb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
- dccp_set_seqno(&seqno, DCCP_SKB_CB(rxskb)->dccpd_ack_seq + 1);
-
- dccp_hdr_set_seq(dh, seqno);
- dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), DCCP_SKB_CB(rxskb)->dccpd_seq);
-
- dccp_csum_outgoing(skb);
- dh->dccph_checksum = dccp_v6_csum_finish(skb, &rxskb->nh.ipv6h->saddr,
- &rxskb->nh.ipv6h->daddr);
-
- memset(&fl, 0, sizeof(fl));
- ipv6_addr_copy(&fl.fl6_dst, &rxskb->nh.ipv6h->saddr);
- ipv6_addr_copy(&fl.fl6_src, &rxskb->nh.ipv6h->daddr);
-
- fl.proto = IPPROTO_DCCP;
- fl.oif = inet6_iif(rxskb);
- fl.fl_ip_dport = dh->dccph_dport;
- fl.fl_ip_sport = dh->dccph_sport;
- security_skb_classify_flow(rxskb, &fl);
-
- /* sk = NULL, but it is safe for now. RST socket required. */
- if (!ip6_dst_lookup(NULL, &skb->dst, &fl)) {
- if (xfrm_lookup(&skb->dst, &fl, NULL, 0) >= 0) {
- ip6_xmit(dccp_v6_ctl_socket->sk, skb, &fl, NULL, 0);
- DCCP_INC_STATS_BH(DCCP_MIB_OUTSEGS);
- DCCP_INC_STATS_BH(DCCP_MIB_OUTRSTS);
- return;
- }
- }
-
- kfree_skb(skb);
-}
-
-static struct request_sock_ops dccp6_request_sock_ops = {
- .family = AF_INET6,
- .obj_size = sizeof(struct dccp6_request_sock),
- .rtx_syn_ack = dccp_v6_send_response,
- .send_ack = dccp_reqsk_send_ack,
- .destructor = dccp_v6_reqsk_destructor,
- .send_reset = dccp_v6_ctl_send_reset,
-};
-
-static struct sock *dccp_v6_hnd_req(struct sock *sk,struct sk_buff *skb)
-{
- const struct dccp_hdr *dh = dccp_hdr(skb);
- const struct ipv6hdr *iph = skb->nh.ipv6h;
- struct sock *nsk;
- struct request_sock **prev;
- /* Find possible connection requests. */
- struct request_sock *req = inet6_csk_search_req(sk, &prev,
- dh->dccph_sport,
- &iph->saddr,
- &iph->daddr,
- inet6_iif(skb));
- if (req != NULL)
- return dccp_check_req(sk, skb, req, prev);
-
- nsk = __inet6_lookup_established(&dccp_hashinfo,
- &iph->saddr, dh->dccph_sport,
- &iph->daddr, ntohs(dh->dccph_dport),
- inet6_iif(skb));
- if (nsk != NULL) {
- if (nsk->sk_state != DCCP_TIME_WAIT) {
- bh_lock_sock(nsk);
- return nsk;
- }
- inet_twsk_put(inet_twsk(nsk));
- return NULL;
- }
-
- return sk;
-}
-
-static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
-{
- struct request_sock *req;
- struct dccp_request_sock *dreq;
- struct inet6_request_sock *ireq6;
- struct ipv6_pinfo *np = inet6_sk(sk);
- const __be32 service = dccp_hdr_request(skb)->dccph_req_service;
- struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
- __u8 reset_code = DCCP_RESET_CODE_TOO_BUSY;
-
- if (skb->protocol == htons(ETH_P_IP))
- return dccp_v4_conn_request(sk, skb);
-
- if (!ipv6_unicast_destination(skb))
- goto drop;
-
- if (dccp_bad_service_code(sk, service)) {
- reset_code = DCCP_RESET_CODE_BAD_SERVICE_CODE;
- goto drop;
- }
- /*
- * There are no SYN attacks on IPv6, yet...
- */
- if (inet_csk_reqsk_queue_is_full(sk))
- goto drop;
-
- if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
- goto drop;
-
- req = inet6_reqsk_alloc(&dccp6_request_sock_ops);
- if (req == NULL)
- goto drop;
-
- if (dccp_parse_options(sk, skb))
- goto drop_and_free;
-
- dccp_reqsk_init(req, skb);
-
- if (security_inet_conn_request(sk, skb, req))
- goto drop_and_free;
-
- ireq6 = inet6_rsk(req);
- ipv6_addr_copy(&ireq6->rmt_addr, &skb->nh.ipv6h->saddr);
- ipv6_addr_copy(&ireq6->loc_addr, &skb->nh.ipv6h->daddr);
- ireq6->pktopts = NULL;
-
- if (ipv6_opt_accepted(sk, skb) ||
- np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo ||
- np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) {
- atomic_inc(&skb->users);
- ireq6->pktopts = skb;
- }
- ireq6->iif = sk->sk_bound_dev_if;
-
- /* So that link locals have meaning */
- if (!sk->sk_bound_dev_if &&
- ipv6_addr_type(&ireq6->rmt_addr) & IPV6_ADDR_LINKLOCAL)
- ireq6->iif = inet6_iif(skb);
-
- /*
- * Step 3: Process LISTEN state
- *
- * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookie
- *
- * In fact we defer setting S.GSR, S.SWL, S.SWH to
- * dccp_create_openreq_child.
- */
- dreq = dccp_rsk(req);
- dreq->dreq_isr = dcb->dccpd_seq;
- dreq->dreq_iss = dccp_v6_init_sequence(skb);
- dreq->dreq_service = service;
-
- if (dccp_v6_send_response(sk, req, NULL))
- goto drop_and_free;
-
- inet6_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT);
- return 0;
-
-drop_and_free:
- reqsk_free(req);
-drop:
- DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS);
- dcb->dccpd_reset_code = reset_code;
- return -1;
-}
-
-static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
- struct sk_buff *skb,
- struct request_sock *req,
- struct dst_entry *dst)
-{
- struct inet6_request_sock *ireq6 = inet6_rsk(req);
- struct ipv6_pinfo *newnp, *np = inet6_sk(sk);
- struct inet_sock *newinet;
- struct dccp_sock *newdp;
- struct dccp6_sock *newdp6;
- struct sock *newsk;
- struct ipv6_txoptions *opt;
-
- if (skb->protocol == htons(ETH_P_IP)) {
- /*
- * v6 mapped
- */
- newsk = dccp_v4_request_recv_sock(sk, skb, req, dst);
- if (newsk == NULL)
- return NULL;
-
- newdp6 = (struct dccp6_sock *)newsk;
- newdp = dccp_sk(newsk);
- newinet = inet_sk(newsk);
- newinet->pinet6 = &newdp6->inet6;
- newnp = inet6_sk(newsk);
-
- memcpy(newnp, np, sizeof(struct ipv6_pinfo));
-
- ipv6_addr_set(&newnp->daddr, 0, 0, htonl(0x0000FFFF),
- newinet->daddr);
-
- ipv6_addr_set(&newnp->saddr, 0, 0, htonl(0x0000FFFF),
- newinet->saddr);
-
- ipv6_addr_copy(&newnp->rcv_saddr, &newnp->saddr);
-
- inet_csk(newsk)->icsk_af_ops = &dccp_ipv6_mapped;
- newsk->sk_backlog_rcv = dccp_v4_do_rcv;
- newnp->pktoptions = NULL;
- newnp->opt = NULL;
- newnp->mcast_oif = inet6_iif(skb);
- newnp->mcast_hops = skb->nh.ipv6h->hop_limit;
-
- /*
- * No need to charge this sock to the relevant IPv6 refcnt debug socks count
- * here, dccp_create_openreq_child now does this for us, see the comment in
- * that function for the gory details. -acme
- */
-
- /* It is tricky place. Until this moment IPv4 tcp
- worked with IPv6 icsk.icsk_af_ops.
- Sync it now.
- */
- dccp_sync_mss(newsk, inet_csk(newsk)->icsk_pmtu_cookie);
-
- return newsk;
- }
-
- opt = np->opt;
-
- if (sk_acceptq_is_full(sk))
- goto out_overflow;
-
- if (np->rxopt.bits.osrcrt == 2 && opt == NULL && ireq6->pktopts) {
- const struct inet6_skb_parm *rxopt = IP6CB(ireq6->pktopts);
-
- if (rxopt->srcrt)
- opt = ipv6_invert_rthdr(sk,
- (struct ipv6_rt_hdr *)(ireq6->pktopts->nh.raw +
- rxopt->srcrt));
- }
-
- if (dst == NULL) {
- struct in6_addr *final_p = NULL, final;
- struct flowi fl;
-
- memset(&fl, 0, sizeof(fl));
- fl.proto = IPPROTO_DCCP;
- ipv6_addr_copy(&fl.fl6_dst, &ireq6->rmt_addr);
- if (opt != NULL && opt->srcrt != NULL) {
- const struct rt0_hdr *rt0 = (struct rt0_hdr *)opt->srcrt;
-
- ipv6_addr_copy(&final, &fl.fl6_dst);
- ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
- final_p = &final;
- }
- ipv6_addr_copy(&fl.fl6_src, &ireq6->loc_addr);
- fl.oif = sk->sk_bound_dev_if;
- fl.fl_ip_dport = inet_rsk(req)->rmt_port;
- fl.fl_ip_sport = inet_sk(sk)->sport;
- security_sk_classify_flow(sk, &fl);
-
- if (ip6_dst_lookup(sk, &dst, &fl))
- goto out;
-
- if (final_p)
- ipv6_addr_copy(&fl.fl6_dst, final_p);
-
- if ((xfrm_lookup(&dst, &fl, sk, 0)) < 0)
- goto out;
- }
-
- newsk = dccp_create_openreq_child(sk, req, skb);
- if (newsk == NULL)
- goto out;
-
- /*
- * No need to charge this sock to the relevant IPv6 refcnt debug socks
- * count here, dccp_create_openreq_child now does this for us, see the
- * comment in that function for the gory details. -acme
- */
-
- __ip6_dst_store(newsk, dst, NULL, NULL);
- newsk->sk_route_caps = dst->dev->features & ~(NETIF_F_IP_CSUM |
- NETIF_F_TSO);
- newdp6 = (struct dccp6_sock *)newsk;
- newinet = inet_sk(newsk);
- newinet->pinet6 = &newdp6->inet6;
- newdp = dccp_sk(newsk);
- newnp = inet6_sk(newsk);
-
- memcpy(newnp, np, sizeof(struct ipv6_pinfo));
-
- ipv6_addr_copy(&newnp->daddr, &ireq6->rmt_addr);
- ipv6_addr_copy(&newnp->saddr, &ireq6->loc_addr);
- ipv6_addr_copy(&newnp->rcv_saddr, &ireq6->loc_addr);
- newsk->sk_bound_dev_if = ireq6->iif;
-
- /* Now IPv6 options...
-
- First: no IPv4 options.
- */
- newinet->opt = NULL;
-
- /* Clone RX bits */
- newnp->rxopt.all = np->rxopt.all;
-
- /* Clone pktoptions received with SYN */
- newnp->pktoptions = NULL;
- if (ireq6->pktopts != NULL) {
- newnp->pktoptions = skb_clone(ireq6->pktopts, GFP_ATOMIC);
- kfree_skb(ireq6->pktopts);
- ireq6->pktopts = NULL;
- if (newnp->pktoptions)
- skb_set_owner_r(newnp->pktoptions, newsk);
- }
- newnp->opt = NULL;
- newnp->mcast_oif = inet6_iif(skb);
- newnp->mcast_hops = skb->nh.ipv6h->hop_limit;
-
- /*
- * Clone native IPv6 options from listening socket (if any)
- *
- * Yes, keeping reference count would be much more clever, but we make
- * one more one thing there: reattach optmem to newsk.
- */
- if (opt != NULL) {
- newnp->opt = ipv6_dup_options(newsk, opt);
- if (opt != np->opt)
- sock_kfree_s(sk, opt, opt->tot_len);
- }
-
- inet_csk(newsk)->icsk_ext_hdr_len = 0;
- if (newnp->opt != NULL)
- inet_csk(newsk)->icsk_ext_hdr_len = (newnp->opt->opt_nflen +
- newnp->opt->opt_flen);
-
- dccp_sync_mss(newsk, dst_mtu(dst));
-
- newinet->daddr = newinet->saddr = newinet->rcv_saddr = LOOPBACK4_IPV6;
-
- __inet6_hash(&dccp_hashinfo, newsk);
- inet_inherit_port(&dccp_hashinfo, sk, newsk);
-
- return newsk;
-
-out_overflow:
- NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
-out:
- NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
- if (opt != NULL && opt != np->opt)
- sock_kfree_s(sk, opt, opt->tot_len);
- dst_release(dst);
- return NULL;
-}
-
-/* The socket must have it's spinlock held when we get
- * here.
- *
- * We have a potential double-lock case here, so even when
- * doing backlog processing we use the BH locking scheme.
- * This is because we cannot sleep with the original spinlock
- * held.
- */
-static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
-{
- struct ipv6_pinfo *np = inet6_sk(sk);
- struct sk_buff *opt_skb = NULL;
-
- /* Imagine: socket is IPv6. IPv4 packet arrives,
- goes to IPv4 receive handler and backlogged.
- From backlog it always goes here. Kerboom...
- Fortunately, dccp_rcv_established and rcv_established
- handle them correctly, but it is not case with
- dccp_v6_hnd_req and dccp_v6_ctl_send_reset(). --ANK
- */
-
- if (skb->protocol == htons(ETH_P_IP))
- return dccp_v4_do_rcv(sk, skb);
-
- if (sk_filter(sk, skb))
- goto discard;
-
- /*
- * socket locking is here for SMP purposes as backlog rcv is currently
- * called with bh processing disabled.
- */
-
- /* Do Stevens' IPV6_PKTOPTIONS.
-
- Yes, guys, it is the only place in our code, where we
- may make it not affecting IPv4.
- The rest of code is protocol independent,
- and I do not like idea to uglify IPv4.
-
- Actually, all the idea behind IPV6_PKTOPTIONS
- looks not very well thought. For now we latch
- options, received in the last packet, enqueued
- by tcp. Feel free to propose better solution.
- --ANK (980728)
- */
- if (np->rxopt.all)
- /*
- * FIXME: Add handling of IPV6_PKTOPTIONS skb. See the comments below
- * (wrt ipv6_pktopions) and net/ipv6/tcp_ipv6.c for an example.
- */
- opt_skb = skb_clone(skb, GFP_ATOMIC);
-
- if (sk->sk_state == DCCP_OPEN) { /* Fast path */
- if (dccp_rcv_established(sk, skb, dccp_hdr(skb), skb->len))
- goto reset;
- if (opt_skb) {
- /* XXX This is where we would goto ipv6_pktoptions. */
- __kfree_skb(opt_skb);
- }
- return 0;
- }
-
- /*
- * Step 3: Process LISTEN state
- * If S.state == LISTEN,
- * If P.type == Request or P contains a valid Init Cookie option,
- * (* Must scan the packet's options to check for Init
- * Cookies. Only Init Cookies are processed here,
- * however; other options are processed in Step 8. This
- * scan need only be performed if the endpoint uses Init
- * Cookies *)
- * (* Generate a new socket and switch to that socket *)
- * Set S := new socket for this port pair
- * S.state = RESPOND
- * Choose S.ISS (initial seqno) or set from Init Cookies
- * Initialize S.GAR := S.ISS
- * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookies
- * Continue with S.state == RESPOND
- * (* A Response packet will be generated in Step 11 *)
- * Otherwise,
- * Generate Reset(No Connection) unless P.type == Reset
- * Drop packet and return
- *
- * NOTE: the check for the packet types is done in
- * dccp_rcv_state_process
- */
- if (sk->sk_state == DCCP_LISTEN) {
- struct sock *nsk = dccp_v6_hnd_req(sk, skb);
-
- if (nsk == NULL)
- goto discard;
- /*
- * Queue it on the new socket if the new socket is active,
- * otherwise we just shortcircuit this and continue with
- * the new socket..
- */
- if (nsk != sk) {
- if (dccp_child_process(sk, nsk, skb))
- goto reset;
- if (opt_skb != NULL)
- __kfree_skb(opt_skb);
- return 0;
- }
- }
-
- if (dccp_rcv_state_process(sk, skb, dccp_hdr(skb), skb->len))
- goto reset;
- if (opt_skb) {
- /* XXX This is where we would goto ipv6_pktoptions. */
- __kfree_skb(opt_skb);
- }
- return 0;
-
-reset:
- dccp_v6_ctl_send_reset(sk, skb);
-discard:
- if (opt_skb != NULL)
- __kfree_skb(opt_skb);
- kfree_skb(skb);
- return 0;
-}
-
-static int dccp_v6_rcv(struct sk_buff **pskb)
-{
- const struct dccp_hdr *dh;
- struct sk_buff *skb = *pskb;
- struct sock *sk;
- int min_cov;
-
- /* Step 1: Check header basics */
-
- if (dccp_invalid_packet(skb))
- goto discard_it;
-
- /* Step 1: If header checksum is incorrect, drop packet and return. */
- if (dccp_v6_csum_finish(skb, &skb->nh.ipv6h->saddr,
- &skb->nh.ipv6h->daddr)) {
- DCCP_WARN("dropped packet with invalid checksum\n");
- goto discard_it;
- }
-
- dh = dccp_hdr(skb);
-
- DCCP_SKB_CB(skb)->dccpd_seq = dccp_hdr_seq(skb);
- DCCP_SKB_CB(skb)->dccpd_type = dh->dccph_type;
-
- if (dccp_packet_without_ack(skb))
- DCCP_SKB_CB(skb)->dccpd_ack_seq = DCCP_PKT_WITHOUT_ACK_SEQ;
- else
- DCCP_SKB_CB(skb)->dccpd_ack_seq = dccp_hdr_ack_seq(skb);
-
- /* Step 2:
- * Look up flow ID in table and get corresponding socket */
- sk = __inet6_lookup(&dccp_hashinfo, &skb->nh.ipv6h->saddr,
- dh->dccph_sport,
- &skb->nh.ipv6h->daddr, ntohs(dh->dccph_dport),
- inet6_iif(skb));
- /*
- * Step 2:
- * If no socket ...
- */
- if (sk == NULL) {
- dccp_pr_debug("failed to look up flow ID in table and "
- "get corresponding socket\n");
- goto no_dccp_socket;
- }
-
- /*
- * Step 2:
- * ... or S.state == TIMEWAIT,
- * Generate Reset(No Connection) unless P.type == Reset
- * Drop packet and return
- */
- if (sk->sk_state == DCCP_TIME_WAIT) {
- dccp_pr_debug("sk->sk_state == DCCP_TIME_WAIT: do_time_wait\n");
- inet_twsk_put(inet_twsk(sk));
- goto no_dccp_socket;
- }
-
- /*
- * RFC 4340, sec. 9.2.1: Minimum Checksum Coverage
- * o if MinCsCov = 0, only packets with CsCov = 0 are accepted
- * o if MinCsCov > 0, also accept packets with CsCov >= MinCsCov
- */
- min_cov = dccp_sk(sk)->dccps_pcrlen;
- if (dh->dccph_cscov && (min_cov == 0 || dh->dccph_cscov < min_cov)) {
- dccp_pr_debug("Packet CsCov %d does not satisfy MinCsCov %d\n",
- dh->dccph_cscov, min_cov);
- /* FIXME: send Data Dropped option (see also dccp_v4_rcv) */
- goto discard_and_relse;
- }
-
- if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb))
- goto discard_and_relse;
-
- return sk_receive_skb(sk, skb, 1) ? -1 : 0;
-
-no_dccp_socket:
- if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb))
- goto discard_it;
- /*
- * Step 2:
- * If no socket ...
- * Generate Reset(No Connection) unless P.type == Reset
- * Drop packet and return
- */
- if (dh->dccph_type != DCCP_PKT_RESET) {
- DCCP_SKB_CB(skb)->dccpd_reset_code =
- DCCP_RESET_CODE_NO_CONNECTION;
- dccp_v6_ctl_send_reset(sk, skb);
- }
-
-discard_it:
- kfree_skb(skb);
- return 0;
-
-discard_and_relse:
- sock_put(sk);
- goto discard_it;
-}
-
-static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
- int addr_len)
-{
- struct sockaddr_in6 *usin = (struct sockaddr_in6 *)uaddr;
- struct inet_connection_sock *icsk = inet_csk(sk);
- struct inet_sock *inet = inet_sk(sk);
- struct ipv6_pinfo *np = inet6_sk(sk);
- struct dccp_sock *dp = dccp_sk(sk);
- struct in6_addr *saddr = NULL, *final_p = NULL, final;
- struct flowi fl;
- struct dst_entry *dst;
- int addr_type;
- int err;
-
- dp->dccps_role = DCCP_ROLE_CLIENT;
-
- if (addr_len < SIN6_LEN_RFC2133)
- return -EINVAL;
-
- if (usin->sin6_family != AF_INET6)
- return -EAFNOSUPPORT;
-
- memset(&fl, 0, sizeof(fl));
-
- if (np->sndflow) {
- fl.fl6_flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK;
- IP6_ECN_flow_init(fl.fl6_flowlabel);
- if (fl.fl6_flowlabel & IPV6_FLOWLABEL_MASK) {
- struct ip6_flowlabel *flowlabel;
- flowlabel = fl6_sock_lookup(sk, fl.fl6_flowlabel);
- if (flowlabel == NULL)
- return -EINVAL;
- ipv6_addr_copy(&usin->sin6_addr, &flowlabel->dst);
- fl6_sock_release(flowlabel);
- }
- }
- /*
- * connect() to INADDR_ANY means loopback (BSD'ism).
- */
- if (ipv6_addr_any(&usin->sin6_addr))
- usin->sin6_addr.s6_addr[15] = 1;
-
- addr_type = ipv6_addr_type(&usin->sin6_addr);
-
- if (addr_type & IPV6_ADDR_MULTICAST)
- return -ENETUNREACH;
-
- if (addr_type & IPV6_ADDR_LINKLOCAL) {
- if (addr_len >= sizeof(struct sockaddr_in6) &&
- usin->sin6_scope_id) {
- /* If interface is set while binding, indices
- * must coincide.
- */
- if (sk->sk_bound_dev_if &&
- sk->sk_bound_dev_if != usin->sin6_scope_id)
- return -EINVAL;
-
- sk->sk_bound_dev_if = usin->sin6_scope_id;
- }
-
- /* Connect to link-local address requires an interface */
- if (!sk->sk_bound_dev_if)
- return -EINVAL;
- }
-
- ipv6_addr_copy(&np->daddr, &usin->sin6_addr);
- np->flow_label = fl.fl6_flowlabel;
-
- /*
- * DCCP over IPv4
- */
- if (addr_type == IPV6_ADDR_MAPPED) {
- u32 exthdrlen = icsk->icsk_ext_hdr_len;
- struct sockaddr_in sin;
-
- SOCK_DEBUG(sk, "connect: ipv4 mapped\n");
-
- if (__ipv6_only_sock(sk))
- return -ENETUNREACH;
-
- sin.sin_family = AF_INET;
- sin.sin_port = usin->sin6_port;
- sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3];
-
- icsk->icsk_af_ops = &dccp_ipv6_mapped;
- sk->sk_backlog_rcv = dccp_v4_do_rcv;
-
- err = dccp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin));
- if (err) {
- icsk->icsk_ext_hdr_len = exthdrlen;
- icsk->icsk_af_ops = &dccp_ipv6_af_ops;
- sk->sk_backlog_rcv = dccp_v6_do_rcv;
- goto failure;
- } else {
- ipv6_addr_set(&np->saddr, 0, 0, htonl(0x0000FFFF),
- inet->saddr);
- ipv6_addr_set(&np->rcv_saddr, 0, 0, htonl(0x0000FFFF),
- inet->rcv_saddr);
- }
-
- return err;
- }
-
- if (!ipv6_addr_any(&np->rcv_saddr))
- saddr = &np->rcv_saddr;
-
- fl.proto = IPPROTO_DCCP;
- ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
- ipv6_addr_copy(&fl.fl6_src, saddr ? saddr : &np->saddr);
- fl.oif = sk->sk_bound_dev_if;
- fl.fl_ip_dport = usin->sin6_port;
- fl.fl_ip_sport = inet->sport;
- security_sk_classify_flow(sk, &fl);
-
- if (np->opt != NULL && np->opt->srcrt != NULL) {
- const struct rt0_hdr *rt0 = (struct rt0_hdr *)np->opt->srcrt;
-
- ipv6_addr_copy(&final, &fl.fl6_dst);
- ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
- final_p = &final;
- }
-
- err = ip6_dst_lookup(sk, &dst, &fl);
- if (err)
- goto failure;
-
- if (final_p)
- ipv6_addr_copy(&fl.fl6_dst, final_p);
-
- err = xfrm_lookup(&dst, &fl, sk, 0);
- if (err < 0)
- goto failure;
-
- if (saddr == NULL) {
- saddr = &fl.fl6_src;
- ipv6_addr_copy(&np->rcv_saddr, saddr);
- }
-
- /* set the source address */
- ipv6_addr_copy(&np->saddr, saddr);
- inet->rcv_saddr = LOOPBACK4_IPV6;
-
- __ip6_dst_store(sk, dst, NULL, NULL);
-
- icsk->icsk_ext_hdr_len = 0;
- if (np->opt != NULL)
- icsk->icsk_ext_hdr_len = (np->opt->opt_flen +
- np->opt->opt_nflen);
-
- inet->dport = usin->sin6_port;
-
- dccp_set_state(sk, DCCP_REQUESTING);
- err = inet6_hash_connect(&dccp_death_row, sk);
- if (err)
- goto late_failure;
-
- dp->dccps_iss = secure_dccpv6_sequence_number(np->saddr.s6_addr32,
- np->daddr.s6_addr32,
- inet->sport, inet->dport);
- err = dccp_connect(sk);
- if (err)
- goto late_failure;
-
- return 0;
-
-late_failure:
- dccp_set_state(sk, DCCP_CLOSED);
- __sk_dst_reset(sk);
-failure:
- inet->dport = 0;
- sk->sk_route_caps = 0;
- return err;
-}
-
-static struct inet_connection_sock_af_ops dccp_ipv6_af_ops = {
- .queue_xmit = inet6_csk_xmit,
- .send_check = dccp_v6_send_check,
- .rebuild_header = inet6_sk_rebuild_header,
- .conn_request = dccp_v6_conn_request,
- .syn_recv_sock = dccp_v6_request_recv_sock,
- .net_header_len = sizeof(struct ipv6hdr),
- .setsockopt = ipv6_setsockopt,
- .getsockopt = ipv6_getsockopt,
- .addr2sockaddr = inet6_csk_addr2sockaddr,
- .sockaddr_len = sizeof(struct sockaddr_in6),
-#ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_ipv6_setsockopt,
- .compat_getsockopt = compat_ipv6_getsockopt,
-#endif
-};
-
-/*
- * DCCP over IPv4 via INET6 API
- */
-static struct inet_connection_sock_af_ops dccp_ipv6_mapped = {
- .queue_xmit = ip_queue_xmit,
- .send_check = dccp_v4_send_check,
- .rebuild_header = inet_sk_rebuild_header,
- .conn_request = dccp_v6_conn_request,
- .syn_recv_sock = dccp_v6_request_recv_sock,
- .net_header_len = sizeof(struct iphdr),
- .setsockopt = ipv6_setsockopt,
- .getsockopt = ipv6_getsockopt,
- .addr2sockaddr = inet6_csk_addr2sockaddr,
- .sockaddr_len = sizeof(struct sockaddr_in6),
-#ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_ipv6_setsockopt,
- .compat_getsockopt = compat_ipv6_getsockopt,
-#endif
-};
-
-/* NOTE: A lot of things set to zero explicitly by call to
- * sk_alloc() so need not be done here.
- */
-static int dccp_v6_init_sock(struct sock *sk)
-{
- static __u8 dccp_v6_ctl_sock_initialized;
- int err = dccp_init_sock(sk, dccp_v6_ctl_sock_initialized);
-
- if (err == 0) {
- if (unlikely(!dccp_v6_ctl_sock_initialized))
- dccp_v6_ctl_sock_initialized = 1;
- inet_csk(sk)->icsk_af_ops = &dccp_ipv6_af_ops;
- }
-
- return err;
-}
-
-static int dccp_v6_destroy_sock(struct sock *sk)
-{
- dccp_destroy_sock(sk);
- return inet6_destroy_sock(sk);
-}
-
-static struct timewait_sock_ops dccp6_timewait_sock_ops = {
- .twsk_obj_size = sizeof(struct dccp6_timewait_sock),
-};
-
-static struct proto dccp_v6_prot = {
- .name = "DCCPv6",
- .owner = THIS_MODULE,
- .close = dccp_close,
- .connect = dccp_v6_connect,
- .disconnect = dccp_disconnect,
- .ioctl = dccp_ioctl,
- .init = dccp_v6_init_sock,
- .setsockopt = dccp_setsockopt,
- .getsockopt = dccp_getsockopt,
- .sendmsg = dccp_sendmsg,
- .recvmsg = dccp_recvmsg,
- .backlog_rcv = dccp_v6_do_rcv,
- .hash = dccp_v6_hash,
- .unhash = dccp_unhash,
- .accept = inet_csk_accept,
- .get_port = dccp_v6_get_port,
- .shutdown = dccp_shutdown,
- .destroy = dccp_v6_destroy_sock,
- .orphan_count = &dccp_orphan_count,
- .max_header = MAX_DCCP_HEADER,
- .obj_size = sizeof(struct dccp6_sock),
- .rsk_prot = &dccp6_request_sock_ops,
- .twsk_prot = &dccp6_timewait_sock_ops,
-#ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_dccp_setsockopt,
- .compat_getsockopt = compat_dccp_getsockopt,
-#endif
-};
-
-static struct inet6_protocol dccp_v6_protocol = {
- .handler = dccp_v6_rcv,
- .err_handler = dccp_v6_err,
- .flags = INET6_PROTO_NOPOLICY | INET6_PROTO_FINAL,
-};
-
-static struct proto_ops inet6_dccp_ops = {
- .family = PF_INET6,
- .owner = THIS_MODULE,
- .release = inet6_release,
- .bind = inet6_bind,
- .connect = inet_stream_connect,
- .socketpair = sock_no_socketpair,
- .accept = inet_accept,
- .getname = inet6_getname,
- .poll = dccp_poll,
- .ioctl = inet6_ioctl,
- .listen = inet_dccp_listen,
- .shutdown = inet_shutdown,
- .setsockopt = sock_common_setsockopt,
- .getsockopt = sock_common_getsockopt,
- .sendmsg = inet_sendmsg,
- .recvmsg = sock_common_recvmsg,
- .mmap = sock_no_mmap,
- .sendpage = sock_no_sendpage,
-#ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_sock_common_setsockopt,
- .compat_getsockopt = compat_sock_common_getsockopt,
-#endif
-};
-
-static struct inet_protosw dccp_v6_protosw = {
- .type = SOCK_DCCP,
- .protocol = IPPROTO_DCCP,
- .prot = &dccp_v6_prot,
- .ops = &inet6_dccp_ops,
- .capability = -1,
- .flags = INET_PROTOSW_ICSK,
-};
-
-static int __init dccp_v6_init(void)
-{
- int err = proto_register(&dccp_v6_prot, 1);
-
- if (err != 0)
- goto out;
-
- err = inet6_add_protocol(&dccp_v6_protocol, IPPROTO_DCCP);
- if (err != 0)
- goto out_unregister_proto;
-
- inet6_register_protosw(&dccp_v6_protosw);
-
- err = inet_csk_ctl_sock_create(&dccp_v6_ctl_socket, PF_INET6,
- SOCK_DCCP, IPPROTO_DCCP);
- if (err != 0)
- goto out_unregister_protosw;
-out:
- return err;
-out_unregister_protosw:
- inet6_del_protocol(&dccp_v6_protocol, IPPROTO_DCCP);
- inet6_unregister_protosw(&dccp_v6_protosw);
-out_unregister_proto:
- proto_unregister(&dccp_v6_prot);
- goto out;
-}
-
-static void __exit dccp_v6_exit(void)
-{
- inet6_del_protocol(&dccp_v6_protocol, IPPROTO_DCCP);
- inet6_unregister_protosw(&dccp_v6_protosw);
- proto_unregister(&dccp_v6_prot);
-}
-
-module_init(dccp_v6_init);
-module_exit(dccp_v6_exit);
-
-/*
- * __stringify doesn't likes enums, so use SOCK_DCCP (6) and IPPROTO_DCCP (33)
- * values directly, Also cover the case where the protocol is not specified,
- * i.e. net-pf-PF_INET6-proto-0-type-SOCK_DCCP
- */
-MODULE_ALIAS("net-pf-" __stringify(PF_INET6) "-proto-33-type-6");
-MODULE_ALIAS("net-pf-" __stringify(PF_INET6) "-proto-0-type-6");
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@mandriva.com>");
-MODULE_DESCRIPTION("DCCPv6 - Datagram Congestion Controlled Protocol");
diff --git a/net/dccp/ipv6.h b/net/dccp/ipv6.h
deleted file mode 100644
index 6eef81fdbe56..000000000000
--- a/net/dccp/ipv6.h
+++ /dev/null
@@ -1,36 +0,0 @@
-#ifndef _DCCP_IPV6_H
-#define _DCCP_IPV6_H
-/*
- * net/dccp/ipv6.h
- *
- * An implementation of the DCCP protocol
- * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/dccp.h>
-#include <linux/ipv6.h>
-
-struct dccp6_sock {
- struct dccp_sock dccp;
- /*
- * ipv6_pinfo has to be the last member of dccp6_sock,
- * see inet6_sk_generic.
- */
- struct ipv6_pinfo inet6;
-};
-
-struct dccp6_request_sock {
- struct dccp_request_sock dccp;
- struct inet6_request_sock inet6;
-};
-
-struct dccp6_timewait_sock {
- struct inet_timewait_sock inet;
- struct inet6_timewait_sock tw6;
-};
-
-#endif /* _DCCP_IPV6_H */
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
deleted file mode 100644
index 7b52f2a03eef..000000000000
--- a/net/dccp/minisocks.c
+++ /dev/null
@@ -1,303 +0,0 @@
-/*
- * net/dccp/minisocks.c
- *
- * An implementation of the DCCP protocol
- * Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <linux/dccp.h>
-#include <linux/kernel.h>
-#include <linux/skbuff.h>
-#include <linux/timer.h>
-
-#include <net/sock.h>
-#include <net/xfrm.h>
-#include <net/inet_timewait_sock.h>
-
-#include "ackvec.h"
-#include "ccid.h"
-#include "dccp.h"
-#include "feat.h"
-
-struct inet_timewait_death_row dccp_death_row = {
- .sysctl_max_tw_buckets = NR_FILE * 2,
- .period = DCCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS,
- .death_lock = SPIN_LOCK_UNLOCKED,
- .hashinfo = &dccp_hashinfo,
- .tw_timer = TIMER_INITIALIZER(inet_twdr_hangman, 0,
- (unsigned long)&dccp_death_row),
- .twkill_work = __WORK_INITIALIZER(dccp_death_row.twkill_work,
- inet_twdr_twkill_work,
- &dccp_death_row),
-/* Short-time timewait calendar */
-
- .twcal_hand = -1,
- .twcal_timer = TIMER_INITIALIZER(inet_twdr_twcal_tick, 0,
- (unsigned long)&dccp_death_row),
-};
-
-EXPORT_SYMBOL_GPL(dccp_death_row);
-
-void dccp_time_wait(struct sock *sk, int state, int timeo)
-{
- struct inet_timewait_sock *tw = NULL;
-
- if (dccp_death_row.tw_count < dccp_death_row.sysctl_max_tw_buckets)
- tw = inet_twsk_alloc(sk, state);
-
- if (tw != NULL) {
- const struct inet_connection_sock *icsk = inet_csk(sk);
- const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
- if (tw->tw_family == PF_INET6) {
- const struct ipv6_pinfo *np = inet6_sk(sk);
- struct inet6_timewait_sock *tw6;
-
- tw->tw_ipv6_offset = inet6_tw_offset(sk->sk_prot);
- tw6 = inet6_twsk((struct sock *)tw);
- ipv6_addr_copy(&tw6->tw_v6_daddr, &np->daddr);
- ipv6_addr_copy(&tw6->tw_v6_rcv_saddr, &np->rcv_saddr);
- tw->tw_ipv6only = np->ipv6only;
- }
-#endif
- /* Linkage updates. */
- __inet_twsk_hashdance(tw, sk, &dccp_hashinfo);
-
- /* Get the TIME_WAIT timeout firing. */
- if (timeo < rto)
- timeo = rto;
-
- tw->tw_timeout = DCCP_TIMEWAIT_LEN;
- if (state == DCCP_TIME_WAIT)
- timeo = DCCP_TIMEWAIT_LEN;
-
- inet_twsk_schedule(tw, &dccp_death_row, timeo,
- DCCP_TIMEWAIT_LEN);
- inet_twsk_put(tw);
- } else {
- /* Sorry, if we're out of memory, just CLOSE this
- * socket up. We've got bigger problems than
- * non-graceful socket closings.
- */
- DCCP_WARN("time wait bucket table overflow\n");
- }
-
- dccp_done(sk);
-}
-
-struct sock *dccp_create_openreq_child(struct sock *sk,
- const struct request_sock *req,
- const struct sk_buff *skb)
-{
- /*
- * Step 3: Process LISTEN state
- *
- * (* Generate a new socket and switch to that socket *)
- * Set S := new socket for this port pair
- */
- struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC);
-
- if (newsk != NULL) {
- const struct dccp_request_sock *dreq = dccp_rsk(req);
- struct inet_connection_sock *newicsk = inet_csk(sk);
- struct dccp_sock *newdp = dccp_sk(newsk);
- struct dccp_minisock *newdmsk = dccp_msk(newsk);
-
- newdp->dccps_role = DCCP_ROLE_SERVER;
- newdp->dccps_hc_rx_ackvec = NULL;
- newdp->dccps_service_list = NULL;
- newdp->dccps_service = dreq->dreq_service;
- newicsk->icsk_rto = DCCP_TIMEOUT_INIT;
- do_gettimeofday(&newdp->dccps_epoch);
-
- if (dccp_feat_clone(sk, newsk))
- goto out_free;
-
- if (newdmsk->dccpms_send_ack_vector) {
- newdp->dccps_hc_rx_ackvec =
- dccp_ackvec_alloc(GFP_ATOMIC);
- if (unlikely(newdp->dccps_hc_rx_ackvec == NULL))
- goto out_free;
- }
-
- newdp->dccps_hc_rx_ccid =
- ccid_hc_rx_new(newdmsk->dccpms_rx_ccid,
- newsk, GFP_ATOMIC);
- newdp->dccps_hc_tx_ccid =
- ccid_hc_tx_new(newdmsk->dccpms_tx_ccid,
- newsk, GFP_ATOMIC);
- if (unlikely(newdp->dccps_hc_rx_ccid == NULL ||
- newdp->dccps_hc_tx_ccid == NULL)) {
- dccp_ackvec_free(newdp->dccps_hc_rx_ackvec);
- ccid_hc_rx_delete(newdp->dccps_hc_rx_ccid, newsk);
- ccid_hc_tx_delete(newdp->dccps_hc_tx_ccid, newsk);
-out_free:
- /* It is still raw copy of parent, so invalidate
- * destructor and make plain sk_free() */
- newsk->sk_destruct = NULL;
- sk_free(newsk);
- return NULL;
- }
-
- /*
- * Step 3: Process LISTEN state
- *
- * Choose S.ISS (initial seqno) or set from Init Cookies
- * Initialize S.GAR := S.ISS
- * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookies
- */
-
- /* See dccp_v4_conn_request */
- newdmsk->dccpms_sequence_window = req->rcv_wnd;
-
- newdp->dccps_gar = newdp->dccps_isr = dreq->dreq_isr;
- dccp_update_gsr(newsk, dreq->dreq_isr);
-
- newdp->dccps_iss = dreq->dreq_iss;
- dccp_update_gss(newsk, dreq->dreq_iss);
-
- /*
- * SWL and AWL are initially adjusted so that they are not less than
- * the initial Sequence Numbers received and sent, respectively:
- * SWL := max(GSR + 1 - floor(W/4), ISR),
- * AWL := max(GSS - W' + 1, ISS).
- * These adjustments MUST be applied only at the beginning of the
- * connection.
- */
- dccp_set_seqno(&newdp->dccps_swl,
- max48(newdp->dccps_swl, newdp->dccps_isr));
- dccp_set_seqno(&newdp->dccps_awl,
- max48(newdp->dccps_awl, newdp->dccps_iss));
-
- dccp_init_xmit_timers(newsk);
-
- DCCP_INC_STATS_BH(DCCP_MIB_PASSIVEOPENS);
- }
- return newsk;
-}
-
-EXPORT_SYMBOL_GPL(dccp_create_openreq_child);
-
-/*
- * Process an incoming packet for RESPOND sockets represented
- * as an request_sock.
- */
-struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb,
- struct request_sock *req,
- struct request_sock **prev)
-{
- struct sock *child = NULL;
-
- /* Check for retransmitted REQUEST */
- if (dccp_hdr(skb)->dccph_type == DCCP_PKT_REQUEST) {
- struct dccp_request_sock *dreq = dccp_rsk(req);
-
- if (after48(DCCP_SKB_CB(skb)->dccpd_seq, dreq->dreq_isr)) {
- dccp_pr_debug("Retransmitted REQUEST\n");
- dreq->dreq_isr = DCCP_SKB_CB(skb)->dccpd_seq;
- /*
- * Send another RESPONSE packet
- * To protect against Request floods, increment retrans
- * counter (backoff, monitored by dccp_response_timer).
- */
- req->retrans++;
- req->rsk_ops->rtx_syn_ack(sk, req, NULL);
- }
- /* Network Duplicate, discard packet */
- return NULL;
- }
-
- DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_PACKET_ERROR;
-
- if (dccp_hdr(skb)->dccph_type != DCCP_PKT_ACK &&
- dccp_hdr(skb)->dccph_type != DCCP_PKT_DATAACK)
- goto drop;
-
- /* Invalid ACK */
- if (DCCP_SKB_CB(skb)->dccpd_ack_seq != dccp_rsk(req)->dreq_iss) {
- dccp_pr_debug("Invalid ACK number: ack_seq=%llu, "
- "dreq_iss=%llu\n",
- (unsigned long long)
- DCCP_SKB_CB(skb)->dccpd_ack_seq,
- (unsigned long long)
- dccp_rsk(req)->dreq_iss);
- goto drop;
- }
-
- child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
- if (child == NULL)
- goto listen_overflow;
-
- /* FIXME: deal with options */
-
- inet_csk_reqsk_queue_unlink(sk, req, prev);
- inet_csk_reqsk_queue_removed(sk, req);
- inet_csk_reqsk_queue_add(sk, req, child);
-out:
- return child;
-listen_overflow:
- dccp_pr_debug("listen_overflow!\n");
- DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_TOO_BUSY;
-drop:
- if (dccp_hdr(skb)->dccph_type != DCCP_PKT_RESET)
- req->rsk_ops->send_reset(sk, skb);
-
- inet_csk_reqsk_queue_drop(sk, req, prev);
- goto out;
-}
-
-EXPORT_SYMBOL_GPL(dccp_check_req);
-
-/*
- * Queue segment on the new socket if the new socket is active,
- * otherwise we just shortcircuit this and continue with
- * the new socket.
- */
-int dccp_child_process(struct sock *parent, struct sock *child,
- struct sk_buff *skb)
-{
- int ret = 0;
- const int state = child->sk_state;
-
- if (!sock_owned_by_user(child)) {
- ret = dccp_rcv_state_process(child, skb, dccp_hdr(skb),
- skb->len);
-
- /* Wakeup parent, send SIGIO */
- if (state == DCCP_RESPOND && child->sk_state != state)
- parent->sk_data_ready(parent, 0);
- } else {
- /* Alas, it is possible again, because we do lookup
- * in main socket hash table and lock on listening
- * socket does not protect us more.
- */
- sk_add_backlog(child, skb);
- }
-
- bh_unlock_sock(child);
- sock_put(child);
- return ret;
-}
-
-EXPORT_SYMBOL_GPL(dccp_child_process);
-
-void dccp_reqsk_send_ack(struct sk_buff *skb, struct request_sock *rsk)
-{
- DCCP_BUG("DCCP-ACK packets are never sent in LISTEN/RESPOND state");
-}
-
-EXPORT_SYMBOL_GPL(dccp_reqsk_send_ack);
-
-void dccp_reqsk_init(struct request_sock *req, struct sk_buff *skb)
-{
- inet_rsk(req)->rmt_port = dccp_hdr(skb)->dccph_sport;
- inet_rsk(req)->acked = 0;
- req->rcv_wnd = sysctl_dccp_feat_sequence_window;
-}
-
-EXPORT_SYMBOL_GPL(dccp_reqsk_init);
diff --git a/net/dccp/options.c b/net/dccp/options.c
deleted file mode 100644
index f398b43bc055..000000000000
--- a/net/dccp/options.c
+++ /dev/null
@@ -1,587 +0,0 @@
-/*
- * net/dccp/options.c
- *
- * An implementation of the DCCP protocol
- * Copyright (c) 2005 Aristeu Sergio Rozanski Filho <aris@cathedrallabs.org>
- * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
- * Copyright (c) 2005 Ian McDonald <ian.mcdonald@jandi.co.nz>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-#include <linux/dccp.h>
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/skbuff.h>
-
-#include "ackvec.h"
-#include "ccid.h"
-#include "dccp.h"
-#include "feat.h"
-
-int sysctl_dccp_feat_sequence_window = DCCPF_INITIAL_SEQUENCE_WINDOW;
-int sysctl_dccp_feat_rx_ccid = DCCPF_INITIAL_CCID;
-int sysctl_dccp_feat_tx_ccid = DCCPF_INITIAL_CCID;
-int sysctl_dccp_feat_ack_ratio = DCCPF_INITIAL_ACK_RATIO;
-int sysctl_dccp_feat_send_ack_vector = DCCPF_INITIAL_SEND_ACK_VECTOR;
-int sysctl_dccp_feat_send_ndp_count = DCCPF_INITIAL_SEND_NDP_COUNT;
-
-EXPORT_SYMBOL_GPL(sysctl_dccp_feat_sequence_window);
-
-void dccp_minisock_init(struct dccp_minisock *dmsk)
-{
- dmsk->dccpms_sequence_window = sysctl_dccp_feat_sequence_window;
- dmsk->dccpms_rx_ccid = sysctl_dccp_feat_rx_ccid;
- dmsk->dccpms_tx_ccid = sysctl_dccp_feat_tx_ccid;
- dmsk->dccpms_ack_ratio = sysctl_dccp_feat_ack_ratio;
- dmsk->dccpms_send_ack_vector = sysctl_dccp_feat_send_ack_vector;
- dmsk->dccpms_send_ndp_count = sysctl_dccp_feat_send_ndp_count;
-}
-
-static u32 dccp_decode_value_var(const unsigned char *bf, const u8 len)
-{
- u32 value = 0;
-
- if (len > 3)
- value += *bf++ << 24;
- if (len > 2)
- value += *bf++ << 16;
- if (len > 1)
- value += *bf++ << 8;
- if (len > 0)
- value += *bf;
-
- return value;
-}
-
-int dccp_parse_options(struct sock *sk, struct sk_buff *skb)
-{
- struct dccp_sock *dp = dccp_sk(sk);
- const struct dccp_hdr *dh = dccp_hdr(skb);
- const u8 pkt_type = DCCP_SKB_CB(skb)->dccpd_type;
- u64 ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq;
- unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb);
- unsigned char *opt_ptr = options;
- const unsigned char *opt_end = (unsigned char *)dh +
- (dh->dccph_doff * 4);
- struct dccp_options_received *opt_recv = &dp->dccps_options_received;
- unsigned char opt, len;
- unsigned char *value;
- u32 elapsed_time;
- int rc;
- int mandatory = 0;
-
- memset(opt_recv, 0, sizeof(*opt_recv));
-
- opt = len = 0;
- while (opt_ptr != opt_end) {
- opt = *opt_ptr++;
- len = 0;
- value = NULL;
-
- /* Check if this isn't a single byte option */
- if (opt > DCCPO_MAX_RESERVED) {
- if (opt_ptr == opt_end)
- goto out_invalid_option;
-
- len = *opt_ptr++;
- if (len < 3)
- goto out_invalid_option;
- /*
- * Remove the type and len fields, leaving
- * just the value size
- */
- len -= 2;
- value = opt_ptr;
- opt_ptr += len;
-
- if (opt_ptr > opt_end)
- goto out_invalid_option;
- }
-
- switch (opt) {
- case DCCPO_PADDING:
- break;
- case DCCPO_MANDATORY:
- if (mandatory)
- goto out_invalid_option;
- if (pkt_type != DCCP_PKT_DATA)
- mandatory = 1;
- break;
- case DCCPO_NDP_COUNT:
- if (len > 3)
- goto out_invalid_option;
-
- opt_recv->dccpor_ndp = dccp_decode_value_var(value, len);
- dccp_pr_debug("%s rx opt: NDP count=%d\n", dccp_role(sk),
- opt_recv->dccpor_ndp);
- break;
- case DCCPO_CHANGE_L:
- /* fall through */
- case DCCPO_CHANGE_R:
- if (len < 2)
- goto out_invalid_option;
- rc = dccp_feat_change_recv(sk, opt, *value, value + 1,
- len - 1);
- /*
- * When there is a change error, change_recv is
- * responsible for dealing with it. i.e. reply with an
- * empty confirm.
- * If the change was mandatory, then we need to die.
- */
- if (rc && mandatory)
- goto out_invalid_option;
- break;
- case DCCPO_CONFIRM_L:
- /* fall through */
- case DCCPO_CONFIRM_R:
- if (len < 2)
- goto out_invalid_option;
- if (dccp_feat_confirm_recv(sk, opt, *value,
- value + 1, len - 1))
- goto out_invalid_option;
- break;
- case DCCPO_ACK_VECTOR_0:
- case DCCPO_ACK_VECTOR_1:
- if (pkt_type == DCCP_PKT_DATA)
- break;
-
- if (dccp_msk(sk)->dccpms_send_ack_vector &&
- dccp_ackvec_parse(sk, skb, &ackno, opt, value, len))
- goto out_invalid_option;
- break;
- case DCCPO_TIMESTAMP:
- if (len != 4)
- goto out_invalid_option;
-
- opt_recv->dccpor_timestamp = ntohl(*(__be32 *)value);
-
- dp->dccps_timestamp_echo = opt_recv->dccpor_timestamp;
- dccp_timestamp(sk, &dp->dccps_timestamp_time);
-
- dccp_pr_debug("%s rx opt: TIMESTAMP=%u, ackno=%llu\n",
- dccp_role(sk), opt_recv->dccpor_timestamp,
- (unsigned long long)
- DCCP_SKB_CB(skb)->dccpd_ack_seq);
- break;
- case DCCPO_TIMESTAMP_ECHO:
- if (len != 4 && len != 6 && len != 8)
- goto out_invalid_option;
-
- opt_recv->dccpor_timestamp_echo = ntohl(*(__be32 *)value);
-
- dccp_pr_debug("%s rx opt: TIMESTAMP_ECHO=%u, len=%d, "
- "ackno=%llu, ", dccp_role(sk),
- opt_recv->dccpor_timestamp_echo,
- len + 2,
- (unsigned long long)
- DCCP_SKB_CB(skb)->dccpd_ack_seq);
-
-
- if (len == 4)
- break;
-
- if (len == 6)
- elapsed_time = ntohs(*(__be16 *)(value + 4));
- else
- elapsed_time = ntohl(*(__be32 *)(value + 4));
-
- /* Give precedence to the biggest ELAPSED_TIME */
- if (elapsed_time > opt_recv->dccpor_elapsed_time)
- opt_recv->dccpor_elapsed_time = elapsed_time;
- break;
- case DCCPO_ELAPSED_TIME:
- if (len != 2 && len != 4)
- goto out_invalid_option;
-
- if (pkt_type == DCCP_PKT_DATA)
- continue;
-
- if (len == 2)
- elapsed_time = ntohs(*(__be16 *)value);
- else
- elapsed_time = ntohl(*(__be32 *)value);
-
- if (elapsed_time > opt_recv->dccpor_elapsed_time)
- opt_recv->dccpor_elapsed_time = elapsed_time;
-
- dccp_pr_debug("%s rx opt: ELAPSED_TIME=%d\n",
- dccp_role(sk), elapsed_time);
- break;
- /*
- * From RFC 4340, sec. 10.3:
- *
- * Option numbers 128 through 191 are for
- * options sent from the HC-Sender to the
- * HC-Receiver; option numbers 192 through 255
- * are for options sent from the HC-Receiver to
- * the HC-Sender.
- */
- case 128 ... 191: {
- const u16 idx = value - options;
-
- if (ccid_hc_rx_parse_options(dp->dccps_hc_rx_ccid, sk,
- opt, len, idx,
- value) != 0)
- goto out_invalid_option;
- }
- break;
- case 192 ... 255: {
- const u16 idx = value - options;
-
- if (ccid_hc_tx_parse_options(dp->dccps_hc_tx_ccid, sk,
- opt, len, idx,
- value) != 0)
- goto out_invalid_option;
- }
- break;
- default:
- DCCP_CRIT("DCCP(%p): option %d(len=%d) not "
- "implemented, ignoring", sk, opt, len);
- break;
- }
-
- if (opt != DCCPO_MANDATORY)
- mandatory = 0;
- }
-
- /* mandatory was the last byte in option list -> reset connection */
- if (mandatory)
- goto out_invalid_option;
-
- return 0;
-
-out_invalid_option:
- DCCP_INC_STATS_BH(DCCP_MIB_INVALIDOPT);
- DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_OPTION_ERROR;
- DCCP_WARN("DCCP(%p): invalid option %d, len=%d", sk, opt, len);
- return -1;
-}
-
-EXPORT_SYMBOL_GPL(dccp_parse_options);
-
-static void dccp_encode_value_var(const u32 value, unsigned char *to,
- const unsigned int len)
-{
- if (len > 3)
- *to++ = (value & 0xFF000000) >> 24;
- if (len > 2)
- *to++ = (value & 0xFF0000) >> 16;
- if (len > 1)
- *to++ = (value & 0xFF00) >> 8;
- if (len > 0)
- *to++ = (value & 0xFF);
-}
-
-static inline int dccp_ndp_len(const int ndp)
-{
- return likely(ndp <= 0xFF) ? 1 : ndp <= 0xFFFF ? 2 : 3;
-}
-
-int dccp_insert_option(struct sock *sk, struct sk_buff *skb,
- const unsigned char option,
- const void *value, const unsigned char len)
-{
- unsigned char *to;
-
- if (DCCP_SKB_CB(skb)->dccpd_opt_len + len + 2 > DCCP_MAX_OPT_LEN)
- return -1;
-
- DCCP_SKB_CB(skb)->dccpd_opt_len += len + 2;
-
- to = skb_push(skb, len + 2);
- *to++ = option;
- *to++ = len + 2;
-
- memcpy(to, value, len);
- return 0;
-}
-
-EXPORT_SYMBOL_GPL(dccp_insert_option);
-
-static int dccp_insert_option_ndp(struct sock *sk, struct sk_buff *skb)
-{
- struct dccp_sock *dp = dccp_sk(sk);
- int ndp = dp->dccps_ndp_count;
-
- if (dccp_non_data_packet(skb))
- ++dp->dccps_ndp_count;
- else
- dp->dccps_ndp_count = 0;
-
- if (ndp > 0) {
- unsigned char *ptr;
- const int ndp_len = dccp_ndp_len(ndp);
- const int len = ndp_len + 2;
-
- if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN)
- return -1;
-
- DCCP_SKB_CB(skb)->dccpd_opt_len += len;
-
- ptr = skb_push(skb, len);
- *ptr++ = DCCPO_NDP_COUNT;
- *ptr++ = len;
- dccp_encode_value_var(ndp, ptr, ndp_len);
- }
-
- return 0;
-}
-
-static inline int dccp_elapsed_time_len(const u32 elapsed_time)
-{
- return elapsed_time == 0 ? 0 : elapsed_time <= 0xFFFF ? 2 : 4;
-}
-
-int dccp_insert_option_elapsed_time(struct sock *sk, struct sk_buff *skb,
- u32 elapsed_time)
-{
- const int elapsed_time_len = dccp_elapsed_time_len(elapsed_time);
- const int len = 2 + elapsed_time_len;
- unsigned char *to;
-
- if (elapsed_time_len == 0)
- return 0;
-
- if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN)
- return -1;
-
- DCCP_SKB_CB(skb)->dccpd_opt_len += len;
-
- to = skb_push(skb, len);
- *to++ = DCCPO_ELAPSED_TIME;
- *to++ = len;
-
- if (elapsed_time_len == 2) {
- const __be16 var16 = htons((u16)elapsed_time);
- memcpy(to, &var16, 2);
- } else {
- const __be32 var32 = htonl(elapsed_time);
- memcpy(to, &var32, 4);
- }
-
- return 0;
-}
-
-EXPORT_SYMBOL_GPL(dccp_insert_option_elapsed_time);
-
-void dccp_timestamp(const struct sock *sk, struct timeval *tv)
-{
- const struct dccp_sock *dp = dccp_sk(sk);
-
- do_gettimeofday(tv);
- tv->tv_sec -= dp->dccps_epoch.tv_sec;
- tv->tv_usec -= dp->dccps_epoch.tv_usec;
-
- while (tv->tv_usec < 0) {
- tv->tv_sec--;
- tv->tv_usec += USEC_PER_SEC;
- }
-}
-
-EXPORT_SYMBOL_GPL(dccp_timestamp);
-
-int dccp_insert_option_timestamp(struct sock *sk, struct sk_buff *skb)
-{
- struct timeval tv;
- __be32 now;
-
- dccp_timestamp(sk, &tv);
- now = htonl(timeval_usecs(&tv) / 10);
- /* yes this will overflow but that is the point as we want a
- * 10 usec 32 bit timer which mean it wraps every 11.9 hours */
-
- return dccp_insert_option(sk, skb, DCCPO_TIMESTAMP, &now, sizeof(now));
-}
-
-EXPORT_SYMBOL_GPL(dccp_insert_option_timestamp);
-
-static int dccp_insert_option_timestamp_echo(struct sock *sk,
- struct sk_buff *skb)
-{
- struct dccp_sock *dp = dccp_sk(sk);
- struct timeval now;
- __be32 tstamp_echo;
- u32 elapsed_time;
- int len, elapsed_time_len;
- unsigned char *to;
-
- dccp_timestamp(sk, &now);
- elapsed_time = timeval_delta(&now, &dp->dccps_timestamp_time) / 10;
- elapsed_time_len = dccp_elapsed_time_len(elapsed_time);
- len = 6 + elapsed_time_len;
-
- if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN)
- return -1;
-
- DCCP_SKB_CB(skb)->dccpd_opt_len += len;
-
- to = skb_push(skb, len);
- *to++ = DCCPO_TIMESTAMP_ECHO;
- *to++ = len;
-
- tstamp_echo = htonl(dp->dccps_timestamp_echo);
- memcpy(to, &tstamp_echo, 4);
- to += 4;
-
- if (elapsed_time_len == 2) {
- const __be16 var16 = htons((u16)elapsed_time);
- memcpy(to, &var16, 2);
- } else if (elapsed_time_len == 4) {
- const __be32 var32 = htonl(elapsed_time);
- memcpy(to, &var32, 4);
- }
-
- dp->dccps_timestamp_echo = 0;
- dp->dccps_timestamp_time.tv_sec = 0;
- dp->dccps_timestamp_time.tv_usec = 0;
- return 0;
-}
-
-static int dccp_insert_feat_opt(struct sk_buff *skb, u8 type, u8 feat,
- u8 *val, u8 len)
-{
- u8 *to;
-
- if (DCCP_SKB_CB(skb)->dccpd_opt_len + len + 3 > DCCP_MAX_OPT_LEN) {
- DCCP_WARN("packet too small for feature %d option!\n", feat);
- return -1;
- }
-
- DCCP_SKB_CB(skb)->dccpd_opt_len += len + 3;
-
- to = skb_push(skb, len + 3);
- *to++ = type;
- *to++ = len + 3;
- *to++ = feat;
-
- if (len)
- memcpy(to, val, len);
-
- dccp_pr_debug("%s(%s (%d), ...), length %d\n",
- dccp_feat_typename(type),
- dccp_feat_name(feat), feat, len);
- return 0;
-}
-
-static int dccp_insert_options_feat(struct sock *sk, struct sk_buff *skb)
-{
- struct dccp_sock *dp = dccp_sk(sk);
- struct dccp_minisock *dmsk = dccp_msk(sk);
- struct dccp_opt_pend *opt, *next;
- int change = 0;
-
- /* confirm any options [NN opts] */
- list_for_each_entry_safe(opt, next, &dmsk->dccpms_conf, dccpop_node) {
- dccp_insert_feat_opt(skb, opt->dccpop_type,
- opt->dccpop_feat, opt->dccpop_val,
- opt->dccpop_len);
- /* fear empty confirms */
- if (opt->dccpop_val)
- kfree(opt->dccpop_val);
- kfree(opt);
- }
- INIT_LIST_HEAD(&dmsk->dccpms_conf);
-
- /* see which features we need to send */
- list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) {
- /* see if we need to send any confirm */
- if (opt->dccpop_sc) {
- dccp_insert_feat_opt(skb, opt->dccpop_type + 1,
- opt->dccpop_feat,
- opt->dccpop_sc->dccpoc_val,
- opt->dccpop_sc->dccpoc_len);
-
- BUG_ON(!opt->dccpop_sc->dccpoc_val);
- kfree(opt->dccpop_sc->dccpoc_val);
- kfree(opt->dccpop_sc);
- opt->dccpop_sc = NULL;
- }
-
- /* any option not confirmed, re-send it */
- if (!opt->dccpop_conf) {
- dccp_insert_feat_opt(skb, opt->dccpop_type,
- opt->dccpop_feat, opt->dccpop_val,
- opt->dccpop_len);
- change++;
- }
- }
-
- /* Retransmit timer.
- * If this is the master listening sock, we don't set a timer on it. It
- * should be fine because if the dude doesn't receive our RESPONSE
- * [which will contain the CHANGE] he will send another REQUEST which
- * will "retrnasmit" the change.
- */
- if (change && dp->dccps_role != DCCP_ROLE_LISTEN) {
- dccp_pr_debug("reset feat negotiation timer %p\n", sk);
-
- /* XXX don't reset the timer on re-transmissions. I.e. reset it
- * only when sending new stuff i guess. Currently the timer
- * never backs off because on re-transmission it just resets it!
- */
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
- inet_csk(sk)->icsk_rto, DCCP_RTO_MAX);
- }
-
- return 0;
-}
-
-int dccp_insert_options(struct sock *sk, struct sk_buff *skb)
-{
- struct dccp_sock *dp = dccp_sk(sk);
- struct dccp_minisock *dmsk = dccp_msk(sk);
-
- DCCP_SKB_CB(skb)->dccpd_opt_len = 0;
-
- if (dmsk->dccpms_send_ndp_count &&
- dccp_insert_option_ndp(sk, skb))
- return -1;
-
- if (!dccp_packet_without_ack(skb)) {
- if (dmsk->dccpms_send_ack_vector &&
- dccp_ackvec_pending(dp->dccps_hc_rx_ackvec) &&
- dccp_insert_option_ackvec(sk, skb))
- return -1;
-
- if (dp->dccps_timestamp_echo != 0 &&
- dccp_insert_option_timestamp_echo(sk, skb))
- return -1;
- }
-
- if (dp->dccps_hc_rx_insert_options) {
- if (ccid_hc_rx_insert_options(dp->dccps_hc_rx_ccid, sk, skb))
- return -1;
- dp->dccps_hc_rx_insert_options = 0;
- }
- if (dp->dccps_hc_tx_insert_options) {
- if (ccid_hc_tx_insert_options(dp->dccps_hc_tx_ccid, sk, skb))
- return -1;
- dp->dccps_hc_tx_insert_options = 0;
- }
-
- /* Feature negotiation */
- /* Data packets can't do feat negotiation */
- if (DCCP_SKB_CB(skb)->dccpd_type != DCCP_PKT_DATA &&
- DCCP_SKB_CB(skb)->dccpd_type != DCCP_PKT_DATAACK &&
- dccp_insert_options_feat(sk, skb))
- return -1;
-
- /* XXX: insert other options when appropriate */
-
- if (DCCP_SKB_CB(skb)->dccpd_opt_len != 0) {
- /* The length of all options has to be a multiple of 4 */
- int padding = DCCP_SKB_CB(skb)->dccpd_opt_len % 4;
-
- if (padding != 0) {
- padding = 4 - padding;
- memset(skb_push(skb, padding), 0, padding);
- DCCP_SKB_CB(skb)->dccpd_opt_len += padding;
- }
- }
-
- return 0;
-}
diff --git a/net/dccp/output.c b/net/dccp/output.c
deleted file mode 100644
index 400c30b6fcae..000000000000
--- a/net/dccp/output.c
+++ /dev/null
@@ -1,589 +0,0 @@
-/*
- * net/dccp/output.c
- *
- * An implementation of the DCCP protocol
- * Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <linux/dccp.h>
-#include <linux/kernel.h>
-#include <linux/skbuff.h>
-
-#include <net/inet_sock.h>
-#include <net/sock.h>
-
-#include "ackvec.h"
-#include "ccid.h"
-#include "dccp.h"
-
-static inline void dccp_event_ack_sent(struct sock *sk)
-{
- inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
-}
-
-static void dccp_skb_entail(struct sock *sk, struct sk_buff *skb)
-{
- skb_set_owner_w(skb, sk);
- WARN_ON(sk->sk_send_head);
- sk->sk_send_head = skb;
-}
-
-/*
- * All SKB's seen here are completely headerless. It is our
- * job to build the DCCP header, and pass the packet down to
- * IP so it can do the same plus pass the packet off to the
- * device.
- */
-static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb)
-{
- if (likely(skb != NULL)) {
- const struct inet_sock *inet = inet_sk(sk);
- const struct inet_connection_sock *icsk = inet_csk(sk);
- struct dccp_sock *dp = dccp_sk(sk);
- struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
- struct dccp_hdr *dh;
- /* XXX For now we're using only 48 bits sequence numbers */
- const u32 dccp_header_size = sizeof(*dh) +
- sizeof(struct dccp_hdr_ext) +
- dccp_packet_hdr_len(dcb->dccpd_type);
- int err, set_ack = 1;
- u64 ackno = dp->dccps_gsr;
-
- dccp_inc_seqno(&dp->dccps_gss);
-
- switch (dcb->dccpd_type) {
- case DCCP_PKT_DATA:
- set_ack = 0;
- /* fall through */
- case DCCP_PKT_DATAACK:
- break;
-
- case DCCP_PKT_REQUEST:
- set_ack = 0;
- /* fall through */
-
- case DCCP_PKT_SYNC:
- case DCCP_PKT_SYNCACK:
- ackno = dcb->dccpd_seq;
- /* fall through */
- default:
- /*
- * Only data packets should come through with skb->sk
- * set.
- */
- WARN_ON(skb->sk);
- skb_set_owner_w(skb, sk);
- break;
- }
-
- dcb->dccpd_seq = dp->dccps_gss;
-
- if (dccp_insert_options(sk, skb)) {
- kfree_skb(skb);
- return -EPROTO;
- }
-
-
- /* Build DCCP header and checksum it. */
- dh = dccp_zeroed_hdr(skb, dccp_header_size);
- dh->dccph_type = dcb->dccpd_type;
- dh->dccph_sport = inet->sport;
- dh->dccph_dport = inet->dport;
- dh->dccph_doff = (dccp_header_size + dcb->dccpd_opt_len) / 4;
- dh->dccph_ccval = dcb->dccpd_ccval;
- dh->dccph_cscov = dp->dccps_pcslen;
- /* XXX For now we're using only 48 bits sequence numbers */
- dh->dccph_x = 1;
-
- dp->dccps_awh = dp->dccps_gss;
- dccp_hdr_set_seq(dh, dp->dccps_gss);
- if (set_ack)
- dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), ackno);
-
- switch (dcb->dccpd_type) {
- case DCCP_PKT_REQUEST:
- dccp_hdr_request(skb)->dccph_req_service =
- dp->dccps_service;
- break;
- case DCCP_PKT_RESET:
- dccp_hdr_reset(skb)->dccph_reset_code =
- dcb->dccpd_reset_code;
- break;
- }
-
- icsk->icsk_af_ops->send_check(sk, 0, skb);
-
- if (set_ack)
- dccp_event_ack_sent(sk);
-
- DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
-
- memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
- err = icsk->icsk_af_ops->queue_xmit(skb, sk, 0);
- return net_xmit_eval(err);
- }
- return -ENOBUFS;
-}
-
-unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu)
-{
- struct inet_connection_sock *icsk = inet_csk(sk);
- struct dccp_sock *dp = dccp_sk(sk);
- int mss_now = (pmtu - icsk->icsk_af_ops->net_header_len -
- sizeof(struct dccp_hdr) - sizeof(struct dccp_hdr_ext));
-
- /* Now subtract optional transport overhead */
- mss_now -= icsk->icsk_ext_hdr_len;
-
- /*
- * FIXME: this should come from the CCID infrastructure, where, say,
- * TFRC will say it wants TIMESTAMPS, ELAPSED time, etc, for now lets
- * put a rough estimate for NDP + TIMESTAMP + TIMESTAMP_ECHO + ELAPSED
- * TIME + TFRC_OPT_LOSS_EVENT_RATE + TFRC_OPT_RECEIVE_RATE + padding to
- * make it a multiple of 4
- */
-
- mss_now -= ((5 + 6 + 10 + 6 + 6 + 6 + 3) / 4) * 4;
-
- /* And store cached results */
- icsk->icsk_pmtu_cookie = pmtu;
- dp->dccps_mss_cache = mss_now;
-
- return mss_now;
-}
-
-EXPORT_SYMBOL_GPL(dccp_sync_mss);
-
-void dccp_write_space(struct sock *sk)
-{
- read_lock(&sk->sk_callback_lock);
-
- if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
- wake_up_interruptible(sk->sk_sleep);
- /* Should agree with poll, otherwise some programs break */
- if (sock_writeable(sk))
- sk_wake_async(sk, 2, POLL_OUT);
-
- read_unlock(&sk->sk_callback_lock);
-}
-
-/**
- * dccp_wait_for_ccid - Wait for ccid to tell us we can send a packet
- * @sk: socket to wait for
- * @timeo: for how long
- */
-static int dccp_wait_for_ccid(struct sock *sk, struct sk_buff *skb,
- long *timeo)
-{
- struct dccp_sock *dp = dccp_sk(sk);
- DEFINE_WAIT(wait);
- long delay;
- int rc;
-
- while (1) {
- prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
-
- if (sk->sk_err)
- goto do_error;
- if (!*timeo)
- goto do_nonblock;
- if (signal_pending(current))
- goto do_interrupted;
-
- rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb);
- if (rc <= 0)
- break;
- delay = msecs_to_jiffies(rc);
- if (delay > *timeo || delay < 0)
- goto do_nonblock;
-
- sk->sk_write_pending++;
- release_sock(sk);
- *timeo -= schedule_timeout(delay);
- lock_sock(sk);
- sk->sk_write_pending--;
- }
-out:
- finish_wait(sk->sk_sleep, &wait);
- return rc;
-
-do_error:
- rc = -EPIPE;
- goto out;
-do_nonblock:
- rc = -EAGAIN;
- goto out;
-do_interrupted:
- rc = sock_intr_errno(*timeo);
- goto out;
-}
-
-static void dccp_write_xmit_timer(unsigned long data) {
- struct sock *sk = (struct sock *)data;
- struct dccp_sock *dp = dccp_sk(sk);
-
- bh_lock_sock(sk);
- if (sock_owned_by_user(sk))
- sk_reset_timer(sk, &dp->dccps_xmit_timer, jiffies+1);
- else
- dccp_write_xmit(sk, 0);
- bh_unlock_sock(sk);
- sock_put(sk);
-}
-
-void dccp_write_xmit(struct sock *sk, int block)
-{
- struct dccp_sock *dp = dccp_sk(sk);
- struct sk_buff *skb;
- long timeo = DCCP_XMIT_TIMEO; /* If a packet is taking longer than
- this we have other issues */
-
- while ((skb = skb_peek(&sk->sk_write_queue))) {
- int err = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb);
-
- if (err > 0) {
- if (!block) {
- sk_reset_timer(sk, &dp->dccps_xmit_timer,
- msecs_to_jiffies(err)+jiffies);
- break;
- } else {
- err = dccp_wait_for_ccid(sk, skb, &timeo);
- timeo = DCCP_XMIT_TIMEO;
- }
- if (err)
- DCCP_BUG("err=%d after dccp_wait_for_ccid", err);
- }
-
- skb_dequeue(&sk->sk_write_queue);
- if (err == 0) {
- struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
- const int len = skb->len;
-
- if (sk->sk_state == DCCP_PARTOPEN) {
- /* See 8.1.5. Handshake Completion */
- inet_csk_schedule_ack(sk);
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
- inet_csk(sk)->icsk_rto,
- DCCP_RTO_MAX);
- dcb->dccpd_type = DCCP_PKT_DATAACK;
- } else if (dccp_ack_pending(sk))
- dcb->dccpd_type = DCCP_PKT_DATAACK;
- else
- dcb->dccpd_type = DCCP_PKT_DATA;
-
- err = dccp_transmit_skb(sk, skb);
- ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, 0, len);
- if (err)
- DCCP_BUG("err=%d after ccid_hc_tx_packet_sent",
- err);
- } else
- kfree(skb);
- }
-}
-
-int dccp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
-{
- if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk) != 0)
- return -EHOSTUNREACH; /* Routing failure or similar. */
-
- return dccp_transmit_skb(sk, (skb_cloned(skb) ?
- pskb_copy(skb, GFP_ATOMIC):
- skb_clone(skb, GFP_ATOMIC)));
-}
-
-struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst,
- struct request_sock *req)
-{
- struct dccp_hdr *dh;
- struct dccp_request_sock *dreq;
- const u32 dccp_header_size = sizeof(struct dccp_hdr) +
- sizeof(struct dccp_hdr_ext) +
- sizeof(struct dccp_hdr_response);
- struct sk_buff *skb = sock_wmalloc(sk, sk->sk_prot->max_header, 1,
- GFP_ATOMIC);
- if (skb == NULL)
- return NULL;
-
- /* Reserve space for headers. */
- skb_reserve(skb, sk->sk_prot->max_header);
-
- skb->dst = dst_clone(dst);
-
- dreq = dccp_rsk(req);
- if (inet_rsk(req)->acked) /* increase ISS upon retransmission */
- dccp_inc_seqno(&dreq->dreq_iss);
- DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_RESPONSE;
- DCCP_SKB_CB(skb)->dccpd_seq = dreq->dreq_iss;
-
- if (dccp_insert_options(sk, skb)) {
- kfree_skb(skb);
- return NULL;
- }
-
- /* Build and checksum header */
- dh = dccp_zeroed_hdr(skb, dccp_header_size);
-
- dh->dccph_sport = inet_sk(sk)->sport;
- dh->dccph_dport = inet_rsk(req)->rmt_port;
- dh->dccph_doff = (dccp_header_size +
- DCCP_SKB_CB(skb)->dccpd_opt_len) / 4;
- dh->dccph_type = DCCP_PKT_RESPONSE;
- dh->dccph_x = 1;
- dccp_hdr_set_seq(dh, dreq->dreq_iss);
- dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), dreq->dreq_isr);
- dccp_hdr_response(skb)->dccph_resp_service = dreq->dreq_service;
-
- dccp_csum_outgoing(skb);
-
- /* We use `acked' to remember that a Response was already sent. */
- inet_rsk(req)->acked = 1;
- DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
- return skb;
-}
-
-EXPORT_SYMBOL_GPL(dccp_make_response);
-
-static struct sk_buff *dccp_make_reset(struct sock *sk, struct dst_entry *dst,
- const enum dccp_reset_codes code)
-
-{
- struct dccp_hdr *dh;
- struct dccp_sock *dp = dccp_sk(sk);
- const u32 dccp_header_size = sizeof(struct dccp_hdr) +
- sizeof(struct dccp_hdr_ext) +
- sizeof(struct dccp_hdr_reset);
- struct sk_buff *skb = sock_wmalloc(sk, sk->sk_prot->max_header, 1,
- GFP_ATOMIC);
- if (skb == NULL)
- return NULL;
-
- /* Reserve space for headers. */
- skb_reserve(skb, sk->sk_prot->max_header);
-
- skb->dst = dst_clone(dst);
-
- dccp_inc_seqno(&dp->dccps_gss);
-
- DCCP_SKB_CB(skb)->dccpd_reset_code = code;
- DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_RESET;
- DCCP_SKB_CB(skb)->dccpd_seq = dp->dccps_gss;
-
- if (dccp_insert_options(sk, skb)) {
- kfree_skb(skb);
- return NULL;
- }
-
- dh = dccp_zeroed_hdr(skb, dccp_header_size);
-
- dh->dccph_sport = inet_sk(sk)->sport;
- dh->dccph_dport = inet_sk(sk)->dport;
- dh->dccph_doff = (dccp_header_size +
- DCCP_SKB_CB(skb)->dccpd_opt_len) / 4;
- dh->dccph_type = DCCP_PKT_RESET;
- dh->dccph_x = 1;
- dccp_hdr_set_seq(dh, dp->dccps_gss);
- dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), dp->dccps_gsr);
-
- dccp_hdr_reset(skb)->dccph_reset_code = code;
- inet_csk(sk)->icsk_af_ops->send_check(sk, 0, skb);
-
- DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
- return skb;
-}
-
-int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code)
-{
- /*
- * FIXME: what if rebuild_header fails?
- * Should we be doing a rebuild_header here?
- */
- int err = inet_sk_rebuild_header(sk);
-
- if (err == 0) {
- struct sk_buff *skb = dccp_make_reset(sk, sk->sk_dst_cache,
- code);
- if (skb != NULL) {
- memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
- err = inet_csk(sk)->icsk_af_ops->queue_xmit(skb, sk, 0);
- return net_xmit_eval(err);
- }
- }
-
- return err;
-}
-
-/*
- * Do all connect socket setups that can be done AF independent.
- */
-static inline void dccp_connect_init(struct sock *sk)
-{
- struct dccp_sock *dp = dccp_sk(sk);
- struct dst_entry *dst = __sk_dst_get(sk);
- struct inet_connection_sock *icsk = inet_csk(sk);
-
- sk->sk_err = 0;
- sock_reset_flag(sk, SOCK_DONE);
-
- dccp_sync_mss(sk, dst_mtu(dst));
-
- /*
- * SWL and AWL are initially adjusted so that they are not less than
- * the initial Sequence Numbers received and sent, respectively:
- * SWL := max(GSR + 1 - floor(W/4), ISR),
- * AWL := max(GSS - W' + 1, ISS).
- * These adjustments MUST be applied only at the beginning of the
- * connection.
- */
- dccp_update_gss(sk, dp->dccps_iss);
- dccp_set_seqno(&dp->dccps_awl, max48(dp->dccps_awl, dp->dccps_iss));
-
- /* S.GAR - greatest valid acknowledgement number received on a non-Sync;
- * initialized to S.ISS (sec. 8.5) */
- dp->dccps_gar = dp->dccps_iss;
-
- icsk->icsk_retransmits = 0;
- init_timer(&dp->dccps_xmit_timer);
- dp->dccps_xmit_timer.data = (unsigned long)sk;
- dp->dccps_xmit_timer.function = dccp_write_xmit_timer;
-}
-
-int dccp_connect(struct sock *sk)
-{
- struct sk_buff *skb;
- struct inet_connection_sock *icsk = inet_csk(sk);
-
- dccp_connect_init(sk);
-
- skb = alloc_skb(sk->sk_prot->max_header, sk->sk_allocation);
- if (unlikely(skb == NULL))
- return -ENOBUFS;
-
- /* Reserve space for headers. */
- skb_reserve(skb, sk->sk_prot->max_header);
-
- DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_REQUEST;
-
- dccp_skb_entail(sk, skb);
- dccp_transmit_skb(sk, skb_clone(skb, GFP_KERNEL));
- DCCP_INC_STATS(DCCP_MIB_ACTIVEOPENS);
-
- /* Timer for repeating the REQUEST until an answer. */
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
- icsk->icsk_rto, DCCP_RTO_MAX);
- return 0;
-}
-
-EXPORT_SYMBOL_GPL(dccp_connect);
-
-void dccp_send_ack(struct sock *sk)
-{
- /* If we have been reset, we may not send again. */
- if (sk->sk_state != DCCP_CLOSED) {
- struct sk_buff *skb = alloc_skb(sk->sk_prot->max_header,
- GFP_ATOMIC);
-
- if (skb == NULL) {
- inet_csk_schedule_ack(sk);
- inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
- TCP_DELACK_MAX,
- DCCP_RTO_MAX);
- return;
- }
-
- /* Reserve space for headers */
- skb_reserve(skb, sk->sk_prot->max_header);
- DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_ACK;
- dccp_transmit_skb(sk, skb);
- }
-}
-
-EXPORT_SYMBOL_GPL(dccp_send_ack);
-
-void dccp_send_delayed_ack(struct sock *sk)
-{
- struct inet_connection_sock *icsk = inet_csk(sk);
- /*
- * FIXME: tune this timer. elapsed time fixes the skew, so no problem
- * with using 2s, and active senders also piggyback the ACK into a
- * DATAACK packet, so this is really for quiescent senders.
- */
- unsigned long timeout = jiffies + 2 * HZ;
-
- /* Use new timeout only if there wasn't a older one earlier. */
- if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) {
- /* If delack timer was blocked or is about to expire,
- * send ACK now.
- *
- * FIXME: check the "about to expire" part
- */
- if (icsk->icsk_ack.blocked) {
- dccp_send_ack(sk);
- return;
- }
-
- if (!time_before(timeout, icsk->icsk_ack.timeout))
- timeout = icsk->icsk_ack.timeout;
- }
- icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER;
- icsk->icsk_ack.timeout = timeout;
- sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
-}
-
-void dccp_send_sync(struct sock *sk, const u64 seq,
- const enum dccp_pkt_type pkt_type)
-{
- /*
- * We are not putting this on the write queue, so
- * dccp_transmit_skb() will set the ownership to this
- * sock.
- */
- struct sk_buff *skb = alloc_skb(sk->sk_prot->max_header, GFP_ATOMIC);
-
- if (skb == NULL)
- /* FIXME: how to make sure the sync is sent? */
- return;
-
- /* Reserve space for headers and prepare control bits. */
- skb_reserve(skb, sk->sk_prot->max_header);
- DCCP_SKB_CB(skb)->dccpd_type = pkt_type;
- DCCP_SKB_CB(skb)->dccpd_seq = seq;
-
- dccp_transmit_skb(sk, skb);
-}
-
-EXPORT_SYMBOL_GPL(dccp_send_sync);
-
-/*
- * Send a DCCP_PKT_CLOSE/CLOSEREQ. The caller locks the socket for us. This
- * cannot be allowed to fail queueing a DCCP_PKT_CLOSE/CLOSEREQ frame under
- * any circumstances.
- */
-void dccp_send_close(struct sock *sk, const int active)
-{
- struct dccp_sock *dp = dccp_sk(sk);
- struct sk_buff *skb;
- const gfp_t prio = active ? GFP_KERNEL : GFP_ATOMIC;
-
- skb = alloc_skb(sk->sk_prot->max_header, prio);
- if (skb == NULL)
- return;
-
- /* Reserve space for headers and prepare control bits. */
- skb_reserve(skb, sk->sk_prot->max_header);
- DCCP_SKB_CB(skb)->dccpd_type = dp->dccps_role == DCCP_ROLE_CLIENT ?
- DCCP_PKT_CLOSE : DCCP_PKT_CLOSEREQ;
-
- if (active) {
- dccp_write_xmit(sk, 1);
- dccp_skb_entail(sk, skb);
- dccp_transmit_skb(sk, skb_clone(skb, prio));
- /* FIXME do we need a retransmit timer here? */
- } else
- dccp_transmit_skb(sk, skb);
-}
diff --git a/net/dccp/probe.c b/net/dccp/probe.c
deleted file mode 100644
index f81e37de35d5..000000000000
--- a/net/dccp/probe.c
+++ /dev/null
@@ -1,202 +0,0 @@
-/*
- * dccp_probe - Observe the DCCP flow with kprobes.
- *
- * The idea for this came from Werner Almesberger's umlsim
- * Copyright (C) 2004, Stephen Hemminger <shemminger@osdl.org>
- *
- * Modified for DCCP from Stephen Hemminger's code
- * Copyright (C) 2006, Ian McDonald <ian.mcdonald@jandi.co.nz>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#include <linux/kernel.h>
-#include <linux/kprobes.h>
-#include <linux/socket.h>
-#include <linux/dccp.h>
-#include <linux/proc_fs.h>
-#include <linux/module.h>
-#include <linux/kfifo.h>
-#include <linux/vmalloc.h>
-
-#include "dccp.h"
-#include "ccid.h"
-#include "ccids/ccid3.h"
-
-static int port;
-
-static int bufsize = 64 * 1024;
-
-static const char procname[] = "dccpprobe";
-
-struct {
- struct kfifo *fifo;
- spinlock_t lock;
- wait_queue_head_t wait;
- struct timeval tstart;
-} dccpw;
-
-static void printl(const char *fmt, ...)
-{
- va_list args;
- int len;
- struct timeval now;
- char tbuf[256];
-
- va_start(args, fmt);
- do_gettimeofday(&now);
-
- now.tv_sec -= dccpw.tstart.tv_sec;
- now.tv_usec -= dccpw.tstart.tv_usec;
- if (now.tv_usec < 0) {
- --now.tv_sec;
- now.tv_usec += 1000000;
- }
-
- len = sprintf(tbuf, "%lu.%06lu ",
- (unsigned long) now.tv_sec,
- (unsigned long) now.tv_usec);
- len += vscnprintf(tbuf+len, sizeof(tbuf)-len, fmt, args);
- va_end(args);
-
- kfifo_put(dccpw.fifo, tbuf, len);
- wake_up(&dccpw.wait);
-}
-
-static int jdccp_sendmsg(struct kiocb *iocb, struct sock *sk,
- struct msghdr *msg, size_t size)
-{
- const struct dccp_minisock *dmsk = dccp_msk(sk);
- const struct inet_sock *inet = inet_sk(sk);
- const struct ccid3_hc_tx_sock *hctx;
-
- if (dmsk->dccpms_tx_ccid == DCCPC_CCID3)
- hctx = ccid3_hc_tx_sk(sk);
- else
- hctx = NULL;
-
- if (port == 0 || ntohs(inet->dport) == port ||
- ntohs(inet->sport) == port) {
- if (hctx)
- printl("%d.%d.%d.%d:%u %d.%d.%d.%d:%u %d %d %d %d %d\n",
- NIPQUAD(inet->saddr), ntohs(inet->sport),
- NIPQUAD(inet->daddr), ntohs(inet->dport), size,
- hctx->ccid3hctx_s, hctx->ccid3hctx_rtt,
- hctx->ccid3hctx_p, hctx->ccid3hctx_t_ipi);
- else
- printl("%d.%d.%d.%d:%u %d.%d.%d.%d:%u %d\n",
- NIPQUAD(inet->saddr), ntohs(inet->sport),
- NIPQUAD(inet->daddr), ntohs(inet->dport), size);
- }
-
- jprobe_return();
- return 0;
-}
-
-static struct jprobe dccp_send_probe = {
- .kp = {
- .symbol_name = "dccp_sendmsg",
- },
- .entry = JPROBE_ENTRY(jdccp_sendmsg),
-};
-
-static int dccpprobe_open(struct inode *inode, struct file *file)
-{
- kfifo_reset(dccpw.fifo);
- do_gettimeofday(&dccpw.tstart);
- return 0;
-}
-
-static ssize_t dccpprobe_read(struct file *file, char __user *buf,
- size_t len, loff_t *ppos)
-{
- int error = 0, cnt = 0;
- unsigned char *tbuf;
-
- if (!buf || len < 0)
- return -EINVAL;
-
- if (len == 0)
- return 0;
-
- tbuf = vmalloc(len);
- if (!tbuf)
- return -ENOMEM;
-
- error = wait_event_interruptible(dccpw.wait,
- __kfifo_len(dccpw.fifo) != 0);
- if (error)
- goto out_free;
-
- cnt = kfifo_get(dccpw.fifo, tbuf, len);
- error = copy_to_user(buf, tbuf, cnt);
-
-out_free:
- vfree(tbuf);
-
- return error ? error : cnt;
-}
-
-static struct file_operations dccpprobe_fops = {
- .owner = THIS_MODULE,
- .open = dccpprobe_open,
- .read = dccpprobe_read,
-};
-
-static __init int dccpprobe_init(void)
-{
- int ret = -ENOMEM;
-
- init_waitqueue_head(&dccpw.wait);
- spin_lock_init(&dccpw.lock);
- dccpw.fifo = kfifo_alloc(bufsize, GFP_KERNEL, &dccpw.lock);
- if (IS_ERR(dccpw.fifo))
- return PTR_ERR(dccpw.fifo);
-
- if (!proc_net_fops_create(procname, S_IRUSR, &dccpprobe_fops))
- goto err0;
-
- ret = register_jprobe(&dccp_send_probe);
- if (ret)
- goto err1;
-
- pr_info("DCCP watch registered (port=%d)\n", port);
- return 0;
-err1:
- proc_net_remove(procname);
-err0:
- kfifo_free(dccpw.fifo);
- return ret;
-}
-module_init(dccpprobe_init);
-
-static __exit void dccpprobe_exit(void)
-{
- kfifo_free(dccpw.fifo);
- proc_net_remove(procname);
- unregister_jprobe(&dccp_send_probe);
-
-}
-module_exit(dccpprobe_exit);
-
-MODULE_PARM_DESC(port, "Port to match (0=all)");
-module_param(port, int, 0);
-
-MODULE_PARM_DESC(bufsize, "Log buffer size (default 64k)");
-module_param(bufsize, int, 0);
-
-MODULE_AUTHOR("Ian McDonald <ian.mcdonald@jandi.co.nz>");
-MODULE_DESCRIPTION("DCCP snooper");
-MODULE_LICENSE("GPL");
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
deleted file mode 100644
index 5ec47d9ee447..000000000000
--- a/net/dccp/proto.c
+++ /dev/null
@@ -1,1115 +0,0 @@
-/*
- * net/dccp/proto.c
- *
- * An implementation of the DCCP protocol
- * Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/dccp.h>
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/skbuff.h>
-#include <linux/netdevice.h>
-#include <linux/in.h>
-#include <linux/if_arp.h>
-#include <linux/init.h>
-#include <linux/random.h>
-#include <net/checksum.h>
-
-#include <net/inet_sock.h>
-#include <net/sock.h>
-#include <net/xfrm.h>
-
-#include <asm/semaphore.h>
-#include <linux/spinlock.h>
-#include <linux/timer.h>
-#include <linux/delay.h>
-#include <linux/poll.h>
-
-#include "ccid.h"
-#include "dccp.h"
-#include "feat.h"
-
-DEFINE_SNMP_STAT(struct dccp_mib, dccp_statistics) __read_mostly;
-
-EXPORT_SYMBOL_GPL(dccp_statistics);
-
-atomic_t dccp_orphan_count = ATOMIC_INIT(0);
-
-EXPORT_SYMBOL_GPL(dccp_orphan_count);
-
-struct inet_hashinfo __cacheline_aligned dccp_hashinfo = {
- .lhash_lock = RW_LOCK_UNLOCKED,
- .lhash_users = ATOMIC_INIT(0),
- .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(dccp_hashinfo.lhash_wait),
-};
-
-EXPORT_SYMBOL_GPL(dccp_hashinfo);
-
-/* the maximum queue length for tx in packets. 0 is no limit */
-int sysctl_dccp_tx_qlen __read_mostly = 5;
-
-void dccp_set_state(struct sock *sk, const int state)
-{
- const int oldstate = sk->sk_state;
-
- dccp_pr_debug("%s(%p) %-10.10s -> %s\n",
- dccp_role(sk), sk,
- dccp_state_name(oldstate), dccp_state_name(state));
- WARN_ON(state == oldstate);
-
- switch (state) {
- case DCCP_OPEN:
- if (oldstate != DCCP_OPEN)
- DCCP_INC_STATS(DCCP_MIB_CURRESTAB);
- break;
-
- case DCCP_CLOSED:
- if (oldstate == DCCP_CLOSING || oldstate == DCCP_OPEN)
- DCCP_INC_STATS(DCCP_MIB_ESTABRESETS);
-
- sk->sk_prot->unhash(sk);
- if (inet_csk(sk)->icsk_bind_hash != NULL &&
- !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
- inet_put_port(&dccp_hashinfo, sk);
- /* fall through */
- default:
- if (oldstate == DCCP_OPEN)
- DCCP_DEC_STATS(DCCP_MIB_CURRESTAB);
- }
-
- /* Change state AFTER socket is unhashed to avoid closed
- * socket sitting in hash tables.
- */
- sk->sk_state = state;
-}
-
-EXPORT_SYMBOL_GPL(dccp_set_state);
-
-void dccp_done(struct sock *sk)
-{
- dccp_set_state(sk, DCCP_CLOSED);
- dccp_clear_xmit_timers(sk);
-
- sk->sk_shutdown = SHUTDOWN_MASK;
-
- if (!sock_flag(sk, SOCK_DEAD))
- sk->sk_state_change(sk);
- else
- inet_csk_destroy_sock(sk);
-}
-
-EXPORT_SYMBOL_GPL(dccp_done);
-
-const char *dccp_packet_name(const int type)
-{
- static const char *dccp_packet_names[] = {
- [DCCP_PKT_REQUEST] = "REQUEST",
- [DCCP_PKT_RESPONSE] = "RESPONSE",
- [DCCP_PKT_DATA] = "DATA",
- [DCCP_PKT_ACK] = "ACK",
- [DCCP_PKT_DATAACK] = "DATAACK",
- [DCCP_PKT_CLOSEREQ] = "CLOSEREQ",
- [DCCP_PKT_CLOSE] = "CLOSE",
- [DCCP_PKT_RESET] = "RESET",
- [DCCP_PKT_SYNC] = "SYNC",
- [DCCP_PKT_SYNCACK] = "SYNCACK",
- };
-
- if (type >= DCCP_NR_PKT_TYPES)
- return "INVALID";
- else
- return dccp_packet_names[type];
-}
-
-EXPORT_SYMBOL_GPL(dccp_packet_name);
-
-const char *dccp_state_name(const int state)
-{
- static char *dccp_state_names[] = {
- [DCCP_OPEN] = "OPEN",
- [DCCP_REQUESTING] = "REQUESTING",
- [DCCP_PARTOPEN] = "PARTOPEN",
- [DCCP_LISTEN] = "LISTEN",
- [DCCP_RESPOND] = "RESPOND",
- [DCCP_CLOSING] = "CLOSING",
- [DCCP_TIME_WAIT] = "TIME_WAIT",
- [DCCP_CLOSED] = "CLOSED",
- };
-
- if (state >= DCCP_MAX_STATES)
- return "INVALID STATE!";
- else
- return dccp_state_names[state];
-}
-
-EXPORT_SYMBOL_GPL(dccp_state_name);
-
-void dccp_hash(struct sock *sk)
-{
- inet_hash(&dccp_hashinfo, sk);
-}
-
-EXPORT_SYMBOL_GPL(dccp_hash);
-
-void dccp_unhash(struct sock *sk)
-{
- inet_unhash(&dccp_hashinfo, sk);
-}
-
-EXPORT_SYMBOL_GPL(dccp_unhash);
-
-int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized)
-{
- struct dccp_sock *dp = dccp_sk(sk);
- struct dccp_minisock *dmsk = dccp_msk(sk);
- struct inet_connection_sock *icsk = inet_csk(sk);
-
- dccp_minisock_init(&dp->dccps_minisock);
- do_gettimeofday(&dp->dccps_epoch);
-
- /*
- * FIXME: We're hardcoding the CCID, and doing this at this point makes
- * the listening (master) sock get CCID control blocks, which is not
- * necessary, but for now, to not mess with the test userspace apps,
- * lets leave it here, later the real solution is to do this in a
- * setsockopt(CCIDs-I-want/accept). -acme
- */
- if (likely(ctl_sock_initialized)) {
- int rc = dccp_feat_init(dmsk);
-
- if (rc)
- return rc;
-
- if (dmsk->dccpms_send_ack_vector) {
- dp->dccps_hc_rx_ackvec = dccp_ackvec_alloc(GFP_KERNEL);
- if (dp->dccps_hc_rx_ackvec == NULL)
- return -ENOMEM;
- }
- dp->dccps_hc_rx_ccid = ccid_hc_rx_new(dmsk->dccpms_rx_ccid,
- sk, GFP_KERNEL);
- dp->dccps_hc_tx_ccid = ccid_hc_tx_new(dmsk->dccpms_tx_ccid,
- sk, GFP_KERNEL);
- if (unlikely(dp->dccps_hc_rx_ccid == NULL ||
- dp->dccps_hc_tx_ccid == NULL)) {
- ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk);
- ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk);
- if (dmsk->dccpms_send_ack_vector) {
- dccp_ackvec_free(dp->dccps_hc_rx_ackvec);
- dp->dccps_hc_rx_ackvec = NULL;
- }
- dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL;
- return -ENOMEM;
- }
- } else {
- /* control socket doesn't need feat nego */
- INIT_LIST_HEAD(&dmsk->dccpms_pending);
- INIT_LIST_HEAD(&dmsk->dccpms_conf);
- }
-
- dccp_init_xmit_timers(sk);
- icsk->icsk_rto = DCCP_TIMEOUT_INIT;
- icsk->icsk_syn_retries = sysctl_dccp_request_retries;
- sk->sk_state = DCCP_CLOSED;
- sk->sk_write_space = dccp_write_space;
- icsk->icsk_sync_mss = dccp_sync_mss;
- dp->dccps_mss_cache = 536;
- dp->dccps_role = DCCP_ROLE_UNDEFINED;
- dp->dccps_service = DCCP_SERVICE_CODE_IS_ABSENT;
- dp->dccps_l_ack_ratio = dp->dccps_r_ack_ratio = 1;
-
- return 0;
-}
-
-EXPORT_SYMBOL_GPL(dccp_init_sock);
-
-int dccp_destroy_sock(struct sock *sk)
-{
- struct dccp_sock *dp = dccp_sk(sk);
- struct dccp_minisock *dmsk = dccp_msk(sk);
-
- /*
- * DCCP doesn't use sk_write_queue, just sk_send_head
- * for retransmissions
- */
- if (sk->sk_send_head != NULL) {
- kfree_skb(sk->sk_send_head);
- sk->sk_send_head = NULL;
- }
-
- /* Clean up a referenced DCCP bind bucket. */
- if (inet_csk(sk)->icsk_bind_hash != NULL)
- inet_put_port(&dccp_hashinfo, sk);
-
- kfree(dp->dccps_service_list);
- dp->dccps_service_list = NULL;
-
- if (dmsk->dccpms_send_ack_vector) {
- dccp_ackvec_free(dp->dccps_hc_rx_ackvec);
- dp->dccps_hc_rx_ackvec = NULL;
- }
- ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk);
- ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk);
- dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL;
-
- /* clean up feature negotiation state */
- dccp_feat_clean(dmsk);
-
- return 0;
-}
-
-EXPORT_SYMBOL_GPL(dccp_destroy_sock);
-
-static inline int dccp_listen_start(struct sock *sk, int backlog)
-{
- struct dccp_sock *dp = dccp_sk(sk);
-
- dp->dccps_role = DCCP_ROLE_LISTEN;
- return inet_csk_listen_start(sk, backlog);
-}
-
-int dccp_disconnect(struct sock *sk, int flags)
-{
- struct inet_connection_sock *icsk = inet_csk(sk);
- struct inet_sock *inet = inet_sk(sk);
- int err = 0;
- const int old_state = sk->sk_state;
-
- if (old_state != DCCP_CLOSED)
- dccp_set_state(sk, DCCP_CLOSED);
-
- /* ABORT function of RFC793 */
- if (old_state == DCCP_LISTEN) {
- inet_csk_listen_stop(sk);
- /* FIXME: do the active reset thing */
- } else if (old_state == DCCP_REQUESTING)
- sk->sk_err = ECONNRESET;
-
- dccp_clear_xmit_timers(sk);
- __skb_queue_purge(&sk->sk_receive_queue);
- if (sk->sk_send_head != NULL) {
- __kfree_skb(sk->sk_send_head);
- sk->sk_send_head = NULL;
- }
-
- inet->dport = 0;
-
- if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
- inet_reset_saddr(sk);
-
- sk->sk_shutdown = 0;
- sock_reset_flag(sk, SOCK_DONE);
-
- icsk->icsk_backoff = 0;
- inet_csk_delack_init(sk);
- __sk_dst_reset(sk);
-
- BUG_TRAP(!inet->num || icsk->icsk_bind_hash);
-
- sk->sk_error_report(sk);
- return err;
-}
-
-EXPORT_SYMBOL_GPL(dccp_disconnect);
-
-/*
- * Wait for a DCCP event.
- *
- * Note that we don't need to lock the socket, as the upper poll layers
- * take care of normal races (between the test and the event) and we don't
- * go look at any of the socket buffers directly.
- */
-unsigned int dccp_poll(struct file *file, struct socket *sock,
- poll_table *wait)
-{
- unsigned int mask;
- struct sock *sk = sock->sk;
-
- poll_wait(file, sk->sk_sleep, wait);
- if (sk->sk_state == DCCP_LISTEN)
- return inet_csk_listen_poll(sk);
-
- /* Socket is not locked. We are protected from async events
- by poll logic and correct handling of state changes
- made by another threads is impossible in any case.
- */
-
- mask = 0;
- if (sk->sk_err)
- mask = POLLERR;
-
- if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == DCCP_CLOSED)
- mask |= POLLHUP;
- if (sk->sk_shutdown & RCV_SHUTDOWN)
- mask |= POLLIN | POLLRDNORM | POLLRDHUP;
-
- /* Connected? */
- if ((1 << sk->sk_state) & ~(DCCPF_REQUESTING | DCCPF_RESPOND)) {
- if (atomic_read(&sk->sk_rmem_alloc) > 0)
- mask |= POLLIN | POLLRDNORM;
-
- if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
- if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
- mask |= POLLOUT | POLLWRNORM;
- } else { /* send SIGIO later */
- set_bit(SOCK_ASYNC_NOSPACE,
- &sk->sk_socket->flags);
- set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
-
- /* Race breaker. If space is freed after
- * wspace test but before the flags are set,
- * IO signal will be lost.
- */
- if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
- mask |= POLLOUT | POLLWRNORM;
- }
- }
- }
- return mask;
-}
-
-EXPORT_SYMBOL_GPL(dccp_poll);
-
-int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg)
-{
- dccp_pr_debug("entry\n");
- return -ENOIOCTLCMD;
-}
-
-EXPORT_SYMBOL_GPL(dccp_ioctl);
-
-static int dccp_setsockopt_service(struct sock *sk, const __be32 service,
- char __user *optval, int optlen)
-{
- struct dccp_sock *dp = dccp_sk(sk);
- struct dccp_service_list *sl = NULL;
-
- if (service == DCCP_SERVICE_INVALID_VALUE ||
- optlen > DCCP_SERVICE_LIST_MAX_LEN * sizeof(u32))
- return -EINVAL;
-
- if (optlen > sizeof(service)) {
- sl = kmalloc(optlen, GFP_KERNEL);
- if (sl == NULL)
- return -ENOMEM;
-
- sl->dccpsl_nr = optlen / sizeof(u32) - 1;
- if (copy_from_user(sl->dccpsl_list,
- optval + sizeof(service),
- optlen - sizeof(service)) ||
- dccp_list_has_service(sl, DCCP_SERVICE_INVALID_VALUE)) {
- kfree(sl);
- return -EFAULT;
- }
- }
-
- lock_sock(sk);
- dp->dccps_service = service;
-
- kfree(dp->dccps_service_list);
-
- dp->dccps_service_list = sl;
- release_sock(sk);
- return 0;
-}
-
-/* byte 1 is feature. the rest is the preference list */
-static int dccp_setsockopt_change(struct sock *sk, int type,
- struct dccp_so_feat __user *optval)
-{
- struct dccp_so_feat opt;
- u8 *val;
- int rc;
-
- if (copy_from_user(&opt, optval, sizeof(opt)))
- return -EFAULT;
-
- val = kmalloc(opt.dccpsf_len, GFP_KERNEL);
- if (!val)
- return -ENOMEM;
-
- if (copy_from_user(val, opt.dccpsf_val, opt.dccpsf_len)) {
- rc = -EFAULT;
- goto out_free_val;
- }
-
- rc = dccp_feat_change(dccp_msk(sk), type, opt.dccpsf_feat,
- val, opt.dccpsf_len, GFP_KERNEL);
- if (rc)
- goto out_free_val;
-
-out:
- return rc;
-
-out_free_val:
- kfree(val);
- goto out;
-}
-
-static int do_dccp_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int optlen)
-{
- struct dccp_sock *dp = dccp_sk(sk);
- int val, err = 0;
-
- if (optlen < sizeof(int))
- return -EINVAL;
-
- if (get_user(val, (int __user *)optval))
- return -EFAULT;
-
- if (optname == DCCP_SOCKOPT_SERVICE)
- return dccp_setsockopt_service(sk, val, optval, optlen);
-
- lock_sock(sk);
- switch (optname) {
- case DCCP_SOCKOPT_PACKET_SIZE:
- DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n");
- err = 0;
- break;
- case DCCP_SOCKOPT_CHANGE_L:
- if (optlen != sizeof(struct dccp_so_feat))
- err = -EINVAL;
- else
- err = dccp_setsockopt_change(sk, DCCPO_CHANGE_L,
- (struct dccp_so_feat __user *)
- optval);
- break;
- case DCCP_SOCKOPT_CHANGE_R:
- if (optlen != sizeof(struct dccp_so_feat))
- err = -EINVAL;
- else
- err = dccp_setsockopt_change(sk, DCCPO_CHANGE_R,
- (struct dccp_so_feat __user *)
- optval);
- break;
- case DCCP_SOCKOPT_SEND_CSCOV: /* sender side, RFC 4340, sec. 9.2 */
- if (val < 0 || val > 15)
- err = -EINVAL;
- else
- dp->dccps_pcslen = val;
- break;
- case DCCP_SOCKOPT_RECV_CSCOV: /* receiver side, RFC 4340 sec. 9.2.1 */
- if (val < 0 || val > 15)
- err = -EINVAL;
- else {
- dp->dccps_pcrlen = val;
- /* FIXME: add feature negotiation,
- * ChangeL(MinimumChecksumCoverage, val) */
- }
- break;
- default:
- err = -ENOPROTOOPT;
- break;
- }
-
- release_sock(sk);
- return err;
-}
-
-int dccp_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int optlen)
-{
- if (level != SOL_DCCP)
- return inet_csk(sk)->icsk_af_ops->setsockopt(sk, level,
- optname, optval,
- optlen);
- return do_dccp_setsockopt(sk, level, optname, optval, optlen);
-}
-
-EXPORT_SYMBOL_GPL(dccp_setsockopt);
-
-#ifdef CONFIG_COMPAT
-int compat_dccp_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int optlen)
-{
- if (level != SOL_DCCP)
- return inet_csk_compat_setsockopt(sk, level, optname,
- optval, optlen);
- return do_dccp_setsockopt(sk, level, optname, optval, optlen);
-}
-
-EXPORT_SYMBOL_GPL(compat_dccp_setsockopt);
-#endif
-
-static int dccp_getsockopt_service(struct sock *sk, int len,
- __be32 __user *optval,
- int __user *optlen)
-{
- const struct dccp_sock *dp = dccp_sk(sk);
- const struct dccp_service_list *sl;
- int err = -ENOENT, slen = 0, total_len = sizeof(u32);
-
- lock_sock(sk);
- if ((sl = dp->dccps_service_list) != NULL) {
- slen = sl->dccpsl_nr * sizeof(u32);
- total_len += slen;
- }
-
- err = -EINVAL;
- if (total_len > len)
- goto out;
-
- err = 0;
- if (put_user(total_len, optlen) ||
- put_user(dp->dccps_service, optval) ||
- (sl != NULL && copy_to_user(optval + 1, sl->dccpsl_list, slen)))
- err = -EFAULT;
-out:
- release_sock(sk);
- return err;
-}
-
-static int do_dccp_getsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int __user *optlen)
-{
- struct dccp_sock *dp;
- int val, len;
-
- if (get_user(len, optlen))
- return -EFAULT;
-
- if (len < sizeof(int))
- return -EINVAL;
-
- dp = dccp_sk(sk);
-
- switch (optname) {
- case DCCP_SOCKOPT_PACKET_SIZE:
- DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n");
- return 0;
- case DCCP_SOCKOPT_SERVICE:
- return dccp_getsockopt_service(sk, len,
- (__be32 __user *)optval, optlen);
- case DCCP_SOCKOPT_SEND_CSCOV:
- val = dp->dccps_pcslen;
- break;
- case DCCP_SOCKOPT_RECV_CSCOV:
- val = dp->dccps_pcrlen;
- break;
- case 128 ... 191:
- return ccid_hc_rx_getsockopt(dp->dccps_hc_rx_ccid, sk, optname,
- len, (u32 __user *)optval, optlen);
- case 192 ... 255:
- return ccid_hc_tx_getsockopt(dp->dccps_hc_tx_ccid, sk, optname,
- len, (u32 __user *)optval, optlen);
- default:
- return -ENOPROTOOPT;
- }
-
- if (put_user(len, optlen) || copy_to_user(optval, &val, len))
- return -EFAULT;
-
- return 0;
-}
-
-int dccp_getsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int __user *optlen)
-{
- if (level != SOL_DCCP)
- return inet_csk(sk)->icsk_af_ops->getsockopt(sk, level,
- optname, optval,
- optlen);
- return do_dccp_getsockopt(sk, level, optname, optval, optlen);
-}
-
-EXPORT_SYMBOL_GPL(dccp_getsockopt);
-
-#ifdef CONFIG_COMPAT
-int compat_dccp_getsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int __user *optlen)
-{
- if (level != SOL_DCCP)
- return inet_csk_compat_getsockopt(sk, level, optname,
- optval, optlen);
- return do_dccp_getsockopt(sk, level, optname, optval, optlen);
-}
-
-EXPORT_SYMBOL_GPL(compat_dccp_getsockopt);
-#endif
-
-int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
- size_t len)
-{
- const struct dccp_sock *dp = dccp_sk(sk);
- const int flags = msg->msg_flags;
- const int noblock = flags & MSG_DONTWAIT;
- struct sk_buff *skb;
- int rc, size;
- long timeo;
-
- if (len > dp->dccps_mss_cache)
- return -EMSGSIZE;
-
- lock_sock(sk);
-
- if (sysctl_dccp_tx_qlen &&
- (sk->sk_write_queue.qlen >= sysctl_dccp_tx_qlen)) {
- rc = -EAGAIN;
- goto out_release;
- }
-
- timeo = sock_sndtimeo(sk, noblock);
-
- /*
- * We have to use sk_stream_wait_connect here to set sk_write_pending,
- * so that the trick in dccp_rcv_request_sent_state_process.
- */
- /* Wait for a connection to finish. */
- if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN | DCCPF_CLOSING))
- if ((rc = sk_stream_wait_connect(sk, &timeo)) != 0)
- goto out_release;
-
- size = sk->sk_prot->max_header + len;
- release_sock(sk);
- skb = sock_alloc_send_skb(sk, size, noblock, &rc);
- lock_sock(sk);
- if (skb == NULL)
- goto out_release;
-
- skb_reserve(skb, sk->sk_prot->max_header);
- rc = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
- if (rc != 0)
- goto out_discard;
-
- skb_queue_tail(&sk->sk_write_queue, skb);
- dccp_write_xmit(sk,0);
-out_release:
- release_sock(sk);
- return rc ? : len;
-out_discard:
- kfree_skb(skb);
- goto out_release;
-}
-
-EXPORT_SYMBOL_GPL(dccp_sendmsg);
-
-int dccp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
- size_t len, int nonblock, int flags, int *addr_len)
-{
- const struct dccp_hdr *dh;
- long timeo;
-
- lock_sock(sk);
-
- if (sk->sk_state == DCCP_LISTEN) {
- len = -ENOTCONN;
- goto out;
- }
-
- timeo = sock_rcvtimeo(sk, nonblock);
-
- do {
- struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
-
- if (skb == NULL)
- goto verify_sock_status;
-
- dh = dccp_hdr(skb);
-
- if (dh->dccph_type == DCCP_PKT_DATA ||
- dh->dccph_type == DCCP_PKT_DATAACK)
- goto found_ok_skb;
-
- if (dh->dccph_type == DCCP_PKT_RESET ||
- dh->dccph_type == DCCP_PKT_CLOSE) {
- dccp_pr_debug("found fin ok!\n");
- len = 0;
- goto found_fin_ok;
- }
- dccp_pr_debug("packet_type=%s\n",
- dccp_packet_name(dh->dccph_type));
- sk_eat_skb(sk, skb, 0);
-verify_sock_status:
- if (sock_flag(sk, SOCK_DONE)) {
- len = 0;
- break;
- }
-
- if (sk->sk_err) {
- len = sock_error(sk);
- break;
- }
-
- if (sk->sk_shutdown & RCV_SHUTDOWN) {
- len = 0;
- break;
- }
-
- if (sk->sk_state == DCCP_CLOSED) {
- if (!sock_flag(sk, SOCK_DONE)) {
- /* This occurs when user tries to read
- * from never connected socket.
- */
- len = -ENOTCONN;
- break;
- }
- len = 0;
- break;
- }
-
- if (!timeo) {
- len = -EAGAIN;
- break;
- }
-
- if (signal_pending(current)) {
- len = sock_intr_errno(timeo);
- break;
- }
-
- sk_wait_data(sk, &timeo);
- continue;
- found_ok_skb:
- if (len > skb->len)
- len = skb->len;
- else if (len < skb->len)
- msg->msg_flags |= MSG_TRUNC;
-
- if (skb_copy_datagram_iovec(skb, 0, msg->msg_iov, len)) {
- /* Exception. Bailout! */
- len = -EFAULT;
- break;
- }
- found_fin_ok:
- if (!(flags & MSG_PEEK))
- sk_eat_skb(sk, skb, 0);
- break;
- } while (1);
-out:
- release_sock(sk);
- return len;
-}
-
-EXPORT_SYMBOL_GPL(dccp_recvmsg);
-
-int inet_dccp_listen(struct socket *sock, int backlog)
-{
- struct sock *sk = sock->sk;
- unsigned char old_state;
- int err;
-
- lock_sock(sk);
-
- err = -EINVAL;
- if (sock->state != SS_UNCONNECTED || sock->type != SOCK_DCCP)
- goto out;
-
- old_state = sk->sk_state;
- if (!((1 << old_state) & (DCCPF_CLOSED | DCCPF_LISTEN)))
- goto out;
-
- /* Really, if the socket is already in listen state
- * we can only allow the backlog to be adjusted.
- */
- if (old_state != DCCP_LISTEN) {
- /*
- * FIXME: here it probably should be sk->sk_prot->listen_start
- * see tcp_listen_start
- */
- err = dccp_listen_start(sk, backlog);
- if (err)
- goto out;
- }
- sk->sk_max_ack_backlog = backlog;
- err = 0;
-
-out:
- release_sock(sk);
- return err;
-}
-
-EXPORT_SYMBOL_GPL(inet_dccp_listen);
-
-static const unsigned char dccp_new_state[] = {
- /* current state: new state: action: */
- [0] = DCCP_CLOSED,
- [DCCP_OPEN] = DCCP_CLOSING | DCCP_ACTION_FIN,
- [DCCP_REQUESTING] = DCCP_CLOSED,
- [DCCP_PARTOPEN] = DCCP_CLOSING | DCCP_ACTION_FIN,
- [DCCP_LISTEN] = DCCP_CLOSED,
- [DCCP_RESPOND] = DCCP_CLOSED,
- [DCCP_CLOSING] = DCCP_CLOSED,
- [DCCP_TIME_WAIT] = DCCP_CLOSED,
- [DCCP_CLOSED] = DCCP_CLOSED,
-};
-
-static int dccp_close_state(struct sock *sk)
-{
- const int next = dccp_new_state[sk->sk_state];
- const int ns = next & DCCP_STATE_MASK;
-
- if (ns != sk->sk_state)
- dccp_set_state(sk, ns);
-
- return next & DCCP_ACTION_FIN;
-}
-
-void dccp_close(struct sock *sk, long timeout)
-{
- struct dccp_sock *dp = dccp_sk(sk);
- struct sk_buff *skb;
- int state;
-
- lock_sock(sk);
-
- sk->sk_shutdown = SHUTDOWN_MASK;
-
- if (sk->sk_state == DCCP_LISTEN) {
- dccp_set_state(sk, DCCP_CLOSED);
-
- /* Special case. */
- inet_csk_listen_stop(sk);
-
- goto adjudge_to_death;
- }
-
- sk_stop_timer(sk, &dp->dccps_xmit_timer);
-
- /*
- * We need to flush the recv. buffs. We do this only on the
- * descriptor close, not protocol-sourced closes, because the
- *reader process may not have drained the data yet!
- */
- /* FIXME: check for unread data */
- while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
- __kfree_skb(skb);
- }
-
- if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
- /* Check zero linger _after_ checking for unread data. */
- sk->sk_prot->disconnect(sk, 0);
- } else if (dccp_close_state(sk)) {
- dccp_send_close(sk, 1);
- }
-
- sk_stream_wait_close(sk, timeout);
-
-adjudge_to_death:
- state = sk->sk_state;
- sock_hold(sk);
- sock_orphan(sk);
- atomic_inc(sk->sk_prot->orphan_count);
-
- /*
- * It is the last release_sock in its life. It will remove backlog.
- */
- release_sock(sk);
- /*
- * Now socket is owned by kernel and we acquire BH lock
- * to finish close. No need to check for user refs.
- */
- local_bh_disable();
- bh_lock_sock(sk);
- BUG_TRAP(!sock_owned_by_user(sk));
-
- /* Have we already been destroyed by a softirq or backlog? */
- if (state != DCCP_CLOSED && sk->sk_state == DCCP_CLOSED)
- goto out;
-
- /*
- * The last release_sock may have processed the CLOSE or RESET
- * packet moving sock to CLOSED state, if not we have to fire
- * the CLOSE/CLOSEREQ retransmission timer, see "8.3. Termination"
- * in draft-ietf-dccp-spec-11. -acme
- */
- if (sk->sk_state == DCCP_CLOSING) {
- /* FIXME: should start at 2 * RTT */
- /* Timer for repeating the CLOSE/CLOSEREQ until an answer. */
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
- inet_csk(sk)->icsk_rto,
- DCCP_RTO_MAX);
-#if 0
- /* Yeah, we should use sk->sk_prot->orphan_count, etc */
- dccp_set_state(sk, DCCP_CLOSED);
-#endif
- }
-
- if (sk->sk_state == DCCP_CLOSED)
- inet_csk_destroy_sock(sk);
-
- /* Otherwise, socket is reprieved until protocol close. */
-
-out:
- bh_unlock_sock(sk);
- local_bh_enable();
- sock_put(sk);
-}
-
-EXPORT_SYMBOL_GPL(dccp_close);
-
-void dccp_shutdown(struct sock *sk, int how)
-{
- dccp_pr_debug("entry\n");
-}
-
-EXPORT_SYMBOL_GPL(dccp_shutdown);
-
-static int __init dccp_mib_init(void)
-{
- int rc = -ENOMEM;
-
- dccp_statistics[0] = alloc_percpu(struct dccp_mib);
- if (dccp_statistics[0] == NULL)
- goto out;
-
- dccp_statistics[1] = alloc_percpu(struct dccp_mib);
- if (dccp_statistics[1] == NULL)
- goto out_free_one;
-
- rc = 0;
-out:
- return rc;
-out_free_one:
- free_percpu(dccp_statistics[0]);
- dccp_statistics[0] = NULL;
- goto out;
-
-}
-
-static void dccp_mib_exit(void)
-{
- free_percpu(dccp_statistics[0]);
- free_percpu(dccp_statistics[1]);
- dccp_statistics[0] = dccp_statistics[1] = NULL;
-}
-
-static int thash_entries;
-module_param(thash_entries, int, 0444);
-MODULE_PARM_DESC(thash_entries, "Number of ehash buckets");
-
-#ifdef CONFIG_IP_DCCP_DEBUG
-int dccp_debug;
-module_param(dccp_debug, int, 0444);
-MODULE_PARM_DESC(dccp_debug, "Enable debug messages");
-
-EXPORT_SYMBOL_GPL(dccp_debug);
-#endif
-
-static int __init dccp_init(void)
-{
- unsigned long goal;
- int ehash_order, bhash_order, i;
- int rc = -ENOBUFS;
-
- dccp_hashinfo.bind_bucket_cachep =
- kmem_cache_create("dccp_bind_bucket",
- sizeof(struct inet_bind_bucket), 0,
- SLAB_HWCACHE_ALIGN, NULL, NULL);
- if (!dccp_hashinfo.bind_bucket_cachep)
- goto out;
-
- /*
- * Size and allocate the main established and bind bucket
- * hash tables.
- *
- * The methodology is similar to that of the buffer cache.
- */
- if (num_physpages >= (128 * 1024))
- goal = num_physpages >> (21 - PAGE_SHIFT);
- else
- goal = num_physpages >> (23 - PAGE_SHIFT);
-
- if (thash_entries)
- goal = (thash_entries *
- sizeof(struct inet_ehash_bucket)) >> PAGE_SHIFT;
- for (ehash_order = 0; (1UL << ehash_order) < goal; ehash_order++)
- ;
- do {
- dccp_hashinfo.ehash_size = (1UL << ehash_order) * PAGE_SIZE /
- sizeof(struct inet_ehash_bucket);
- dccp_hashinfo.ehash_size >>= 1;
- while (dccp_hashinfo.ehash_size &
- (dccp_hashinfo.ehash_size - 1))
- dccp_hashinfo.ehash_size--;
- dccp_hashinfo.ehash = (struct inet_ehash_bucket *)
- __get_free_pages(GFP_ATOMIC, ehash_order);
- } while (!dccp_hashinfo.ehash && --ehash_order > 0);
-
- if (!dccp_hashinfo.ehash) {
- DCCP_CRIT("Failed to allocate DCCP established hash table");
- goto out_free_bind_bucket_cachep;
- }
-
- for (i = 0; i < (dccp_hashinfo.ehash_size << 1); i++) {
- rwlock_init(&dccp_hashinfo.ehash[i].lock);
- INIT_HLIST_HEAD(&dccp_hashinfo.ehash[i].chain);
- }
-
- bhash_order = ehash_order;
-
- do {
- dccp_hashinfo.bhash_size = (1UL << bhash_order) * PAGE_SIZE /
- sizeof(struct inet_bind_hashbucket);
- if ((dccp_hashinfo.bhash_size > (64 * 1024)) &&
- bhash_order > 0)
- continue;
- dccp_hashinfo.bhash = (struct inet_bind_hashbucket *)
- __get_free_pages(GFP_ATOMIC, bhash_order);
- } while (!dccp_hashinfo.bhash && --bhash_order >= 0);
-
- if (!dccp_hashinfo.bhash) {
- DCCP_CRIT("Failed to allocate DCCP bind hash table");
- goto out_free_dccp_ehash;
- }
-
- for (i = 0; i < dccp_hashinfo.bhash_size; i++) {
- spin_lock_init(&dccp_hashinfo.bhash[i].lock);
- INIT_HLIST_HEAD(&dccp_hashinfo.bhash[i].chain);
- }
-
- rc = dccp_mib_init();
- if (rc)
- goto out_free_dccp_bhash;
-
- rc = dccp_ackvec_init();
- if (rc)
- goto out_free_dccp_mib;
-
- rc = dccp_sysctl_init();
- if (rc)
- goto out_ackvec_exit;
-out:
- return rc;
-out_ackvec_exit:
- dccp_ackvec_exit();
-out_free_dccp_mib:
- dccp_mib_exit();
-out_free_dccp_bhash:
- free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order);
- dccp_hashinfo.bhash = NULL;
-out_free_dccp_ehash:
- free_pages((unsigned long)dccp_hashinfo.ehash, ehash_order);
- dccp_hashinfo.ehash = NULL;
-out_free_bind_bucket_cachep:
- kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
- dccp_hashinfo.bind_bucket_cachep = NULL;
- goto out;
-}
-
-static void __exit dccp_fini(void)
-{
- dccp_mib_exit();
- free_pages((unsigned long)dccp_hashinfo.bhash,
- get_order(dccp_hashinfo.bhash_size *
- sizeof(struct inet_bind_hashbucket)));
- free_pages((unsigned long)dccp_hashinfo.ehash,
- get_order(dccp_hashinfo.ehash_size *
- sizeof(struct inet_ehash_bucket)));
- kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
- dccp_ackvec_exit();
- dccp_sysctl_exit();
-}
-
-module_init(dccp_init);
-module_exit(dccp_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@conectiva.com.br>");
-MODULE_DESCRIPTION("DCCP - Datagram Congestion Controlled Protocol");
diff --git a/net/dccp/sysctl.c b/net/dccp/sysctl.c
deleted file mode 100644
index fdcfca3e9208..000000000000
--- a/net/dccp/sysctl.c
+++ /dev/null
@@ -1,141 +0,0 @@
-/*
- * net/dccp/sysctl.c
- *
- * An implementation of the DCCP protocol
- * Arnaldo Carvalho de Melo <acme@mandriva.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License v2
- * as published by the Free Software Foundation.
- */
-
-#include <linux/mm.h>
-#include <linux/sysctl.h>
-#include "dccp.h"
-#include "feat.h"
-
-#ifndef CONFIG_SYSCTL
-#error This file should not be compiled without CONFIG_SYSCTL defined
-#endif
-
-static struct ctl_table dccp_default_table[] = {
- {
- .procname = "seq_window",
- .data = &sysctl_dccp_feat_sequence_window,
- .maxlen = sizeof(sysctl_dccp_feat_sequence_window),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "rx_ccid",
- .data = &sysctl_dccp_feat_rx_ccid,
- .maxlen = sizeof(sysctl_dccp_feat_rx_ccid),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "tx_ccid",
- .data = &sysctl_dccp_feat_tx_ccid,
- .maxlen = sizeof(sysctl_dccp_feat_tx_ccid),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "ack_ratio",
- .data = &sysctl_dccp_feat_ack_ratio,
- .maxlen = sizeof(sysctl_dccp_feat_ack_ratio),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "send_ackvec",
- .data = &sysctl_dccp_feat_send_ack_vector,
- .maxlen = sizeof(sysctl_dccp_feat_send_ack_vector),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "send_ndp",
- .data = &sysctl_dccp_feat_send_ndp_count,
- .maxlen = sizeof(sysctl_dccp_feat_send_ndp_count),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "request_retries",
- .data = &sysctl_dccp_request_retries,
- .maxlen = sizeof(sysctl_dccp_request_retries),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "retries1",
- .data = &sysctl_dccp_retries1,
- .maxlen = sizeof(sysctl_dccp_retries1),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "retries2",
- .data = &sysctl_dccp_retries2,
- .maxlen = sizeof(sysctl_dccp_retries2),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "tx_qlen",
- .data = &sysctl_dccp_tx_qlen,
- .maxlen = sizeof(sysctl_dccp_tx_qlen),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
-
- { .ctl_name = 0, }
-};
-
-static struct ctl_table dccp_table[] = {
- {
- .ctl_name = NET_DCCP_DEFAULT,
- .procname = "default",
- .mode = 0555,
- .child = dccp_default_table,
- },
- { .ctl_name = 0, },
-};
-
-static struct ctl_table dccp_dir_table[] = {
- {
- .ctl_name = NET_DCCP,
- .procname = "dccp",
- .mode = 0555,
- .child = dccp_table,
- },
- { .ctl_name = 0, },
-};
-
-static struct ctl_table dccp_root_table[] = {
- {
- .ctl_name = CTL_NET,
- .procname = "net",
- .mode = 0555,
- .child = dccp_dir_table,
- },
- { .ctl_name = 0, },
-};
-
-static struct ctl_table_header *dccp_table_header;
-
-int __init dccp_sysctl_init(void)
-{
- dccp_table_header = register_sysctl_table(dccp_root_table, 1);
-
- return dccp_table_header != NULL ? 0 : -ENOMEM;
-}
-
-void dccp_sysctl_exit(void)
-{
- if (dccp_table_header != NULL) {
- unregister_sysctl_table(dccp_table_header);
- dccp_table_header = NULL;
- }
-}
diff --git a/net/dccp/timer.c b/net/dccp/timer.c
deleted file mode 100644
index e8f519e7f481..000000000000
--- a/net/dccp/timer.c
+++ /dev/null
@@ -1,268 +0,0 @@
-/*
- * net/dccp/timer.c
- *
- * An implementation of the DCCP protocol
- * Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <linux/dccp.h>
-#include <linux/skbuff.h>
-
-#include "dccp.h"
-
-/* sysctl variables governing numbers of retransmission attempts */
-int sysctl_dccp_request_retries __read_mostly = TCP_SYN_RETRIES;
-int sysctl_dccp_retries1 __read_mostly = TCP_RETR1;
-int sysctl_dccp_retries2 __read_mostly = TCP_RETR2;
-
-static void dccp_write_err(struct sock *sk)
-{
- sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT;
- sk->sk_error_report(sk);
-
- dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED);
- dccp_done(sk);
- DCCP_INC_STATS_BH(DCCP_MIB_ABORTONTIMEOUT);
-}
-
-/* A write timeout has occurred. Process the after effects. */
-static int dccp_write_timeout(struct sock *sk)
-{
- const struct inet_connection_sock *icsk = inet_csk(sk);
- int retry_until;
-
- if (sk->sk_state == DCCP_REQUESTING || sk->sk_state == DCCP_PARTOPEN) {
- if (icsk->icsk_retransmits != 0)
- dst_negative_advice(&sk->sk_dst_cache);
- retry_until = icsk->icsk_syn_retries ?
- : sysctl_dccp_request_retries;
- } else {
- if (icsk->icsk_retransmits >= sysctl_dccp_retries1) {
- /* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu
- black hole detection. :-(
-
- It is place to make it. It is not made. I do not want
- to make it. It is disguisting. It does not work in any
- case. Let me to cite the same draft, which requires for
- us to implement this:
-
- "The one security concern raised by this memo is that ICMP black holes
- are often caused by over-zealous security administrators who block
- all ICMP messages. It is vitally important that those who design and
- deploy security systems understand the impact of strict filtering on
- upper-layer protocols. The safest web site in the world is worthless
- if most TCP implementations cannot transfer data from it. It would
- be far nicer to have all of the black holes fixed rather than fixing
- all of the TCP implementations."
-
- Golden words :-).
- */
-
- dst_negative_advice(&sk->sk_dst_cache);
- }
-
- retry_until = sysctl_dccp_retries2;
- /*
- * FIXME: see tcp_write_timout and tcp_out_of_resources
- */
- }
-
- if (icsk->icsk_retransmits >= retry_until) {
- /* Has it gone just too far? */
- dccp_write_err(sk);
- return 1;
- }
- return 0;
-}
-
-/*
- * The DCCP retransmit timer.
- */
-static void dccp_retransmit_timer(struct sock *sk)
-{
- struct inet_connection_sock *icsk = inet_csk(sk);
-
- /* retransmit timer is used for feature negotiation throughout
- * connection. In this case, no packet is re-transmitted, but rather an
- * ack is generated and pending changes are placed into its options.
- */
- if (sk->sk_send_head == NULL) {
- dccp_pr_debug("feat negotiation retransmit timeout %p\n", sk);
- if (sk->sk_state == DCCP_OPEN)
- dccp_send_ack(sk);
- goto backoff;
- }
-
- /*
- * sk->sk_send_head has to have one skb with
- * DCCP_SKB_CB(skb)->dccpd_type set to one of the retransmittable DCCP
- * packet types. The only packets eligible for retransmission are:
- * -- Requests in client-REQUEST state (sec. 8.1.1)
- * -- Acks in client-PARTOPEN state (sec. 8.1.5)
- * -- CloseReq in server-CLOSEREQ state (sec. 8.3)
- * -- Close in node-CLOSING state (sec. 8.3) */
- BUG_TRAP(sk->sk_send_head != NULL);
-
- /*
- * More than than 4MSL (8 minutes) has passed, a RESET(aborted) was
- * sent, no need to retransmit, this sock is dead.
- */
- if (dccp_write_timeout(sk))
- goto out;
-
- /*
- * We want to know the number of packets retransmitted, not the
- * total number of retransmissions of clones of original packets.
- */
- if (icsk->icsk_retransmits == 0)
- DCCP_INC_STATS_BH(DCCP_MIB_TIMEOUTS);
-
- if (dccp_retransmit_skb(sk, sk->sk_send_head) < 0) {
- /*
- * Retransmission failed because of local congestion,
- * do not backoff.
- */
- if (icsk->icsk_retransmits == 0)
- icsk->icsk_retransmits = 1;
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
- min(icsk->icsk_rto,
- TCP_RESOURCE_PROBE_INTERVAL),
- DCCP_RTO_MAX);
- goto out;
- }
-
-backoff:
- icsk->icsk_backoff++;
- icsk->icsk_retransmits++;
-
- icsk->icsk_rto = min(icsk->icsk_rto << 1, DCCP_RTO_MAX);
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto,
- DCCP_RTO_MAX);
- if (icsk->icsk_retransmits > sysctl_dccp_retries1)
- __sk_dst_reset(sk);
-out:;
-}
-
-static void dccp_write_timer(unsigned long data)
-{
- struct sock *sk = (struct sock *)data;
- struct inet_connection_sock *icsk = inet_csk(sk);
- int event = 0;
-
- bh_lock_sock(sk);
- if (sock_owned_by_user(sk)) {
- /* Try again later */
- sk_reset_timer(sk, &icsk->icsk_retransmit_timer,
- jiffies + (HZ / 20));
- goto out;
- }
-
- if (sk->sk_state == DCCP_CLOSED || !icsk->icsk_pending)
- goto out;
-
- if (time_after(icsk->icsk_timeout, jiffies)) {
- sk_reset_timer(sk, &icsk->icsk_retransmit_timer,
- icsk->icsk_timeout);
- goto out;
- }
-
- event = icsk->icsk_pending;
- icsk->icsk_pending = 0;
-
- switch (event) {
- case ICSK_TIME_RETRANS:
- dccp_retransmit_timer(sk);
- break;
- }
-out:
- bh_unlock_sock(sk);
- sock_put(sk);
-}
-
-/*
- * Timer for listening sockets
- */
-static void dccp_response_timer(struct sock *sk)
-{
- inet_csk_reqsk_queue_prune(sk, TCP_SYNQ_INTERVAL, DCCP_TIMEOUT_INIT,
- DCCP_RTO_MAX);
-}
-
-static void dccp_keepalive_timer(unsigned long data)
-{
- struct sock *sk = (struct sock *)data;
-
- /* Only process if socket is not in use. */
- bh_lock_sock(sk);
- if (sock_owned_by_user(sk)) {
- /* Try again later. */
- inet_csk_reset_keepalive_timer(sk, HZ / 20);
- goto out;
- }
-
- if (sk->sk_state == DCCP_LISTEN) {
- dccp_response_timer(sk);
- goto out;
- }
-out:
- bh_unlock_sock(sk);
- sock_put(sk);
-}
-
-/* This is the same as tcp_delack_timer, sans prequeue & mem_reclaim stuff */
-static void dccp_delack_timer(unsigned long data)
-{
- struct sock *sk = (struct sock *)data;
- struct inet_connection_sock *icsk = inet_csk(sk);
-
- bh_lock_sock(sk);
- if (sock_owned_by_user(sk)) {
- /* Try again later. */
- icsk->icsk_ack.blocked = 1;
- NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOCKED);
- sk_reset_timer(sk, &icsk->icsk_delack_timer,
- jiffies + TCP_DELACK_MIN);
- goto out;
- }
-
- if (sk->sk_state == DCCP_CLOSED ||
- !(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
- goto out;
- if (time_after(icsk->icsk_ack.timeout, jiffies)) {
- sk_reset_timer(sk, &icsk->icsk_delack_timer,
- icsk->icsk_ack.timeout);
- goto out;
- }
-
- icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER;
-
- if (inet_csk_ack_scheduled(sk)) {
- if (!icsk->icsk_ack.pingpong) {
- /* Delayed ACK missed: inflate ATO. */
- icsk->icsk_ack.ato = min(icsk->icsk_ack.ato << 1,
- icsk->icsk_rto);
- } else {
- /* Delayed ACK missed: leave pingpong mode and
- * deflate ATO.
- */
- icsk->icsk_ack.pingpong = 0;
- icsk->icsk_ack.ato = TCP_ATO_MIN;
- }
- dccp_send_ack(sk);
- NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKS);
- }
-out:
- bh_unlock_sock(sk);
- sock_put(sk);
-}
-
-void dccp_init_xmit_timers(struct sock *sk)
-{
- inet_csk_init_xmit_timers(sk, &dccp_write_timer, &dccp_delack_timer,
- &dccp_keepalive_timer);
-}
diff --git a/net/decnet/Kconfig b/net/decnet/Kconfig
deleted file mode 100644
index 7914fd619c5c..000000000000
--- a/net/decnet/Kconfig
+++ /dev/null
@@ -1,43 +0,0 @@
-#
-# DECnet configuration
-#
-config DECNET
- tristate "DECnet Support"
- ---help---
- The DECnet networking protocol was used in many products made by
- Digital (now Compaq). It provides reliable stream and sequenced
- packet communications over which run a variety of services similar
- to those which run over TCP/IP.
-
- To find some tools to use with the kernel layer support, please
- look at Patrick Caulfield's web site:
- <http://linux-decnet.sourceforge.net/>.
-
- More detailed documentation is available in
- <file:Documentation/networking/decnet.txt>.
-
- Be sure to say Y to "/proc file system support" and "Sysctl support"
- below when using DECnet, since you will need sysctl support to aid
- in configuration at run time.
-
- The DECnet code is also available as a module ( = code which can be
- inserted in and removed from the running kernel whenever you want).
- The module is called decnet.
-
-config DECNET_ROUTER
- bool "DECnet: router support (EXPERIMENTAL)"
- depends on DECNET && EXPERIMENTAL
- select FIB_RULES
- ---help---
- Add support for turning your DECnet Endnode into a level 1 or 2
- router. This is an experimental, but functional option. If you
- do say Y here, then make sure that you also say Y to "Kernel/User
- network link driver", "Routing messages" and "Network packet
- filtering". The first two are required to allow configuration via
- rtnetlink (you will need Alexey Kuznetsov's iproute2 package
- from <ftp://ftp.tux.org/pub/net/ip-routing/>). The "Network packet
- filtering" option will be required for the forthcoming routing daemon
- to work.
-
- See <file:Documentation/networking/decnet.txt> for more information.
-
diff --git a/net/decnet/Makefile b/net/decnet/Makefile
deleted file mode 100644
index e44003af71f6..000000000000
--- a/net/decnet/Makefile
+++ /dev/null
@@ -1,10 +0,0 @@
-
-obj-$(CONFIG_DECNET) += decnet.o
-
-decnet-y := af_decnet.o dn_nsp_in.o dn_nsp_out.o \
- dn_route.o dn_dev.o dn_neigh.o dn_timer.o
-decnet-$(CONFIG_DECNET_ROUTER) += dn_fib.o dn_rules.o dn_table.o
-decnet-y += sysctl_net_decnet.o
-
-obj-$(CONFIG_NETFILTER) += netfilter/
-
diff --git a/net/decnet/README b/net/decnet/README
deleted file mode 100644
index 60e7ec88c81f..000000000000
--- a/net/decnet/README
+++ /dev/null
@@ -1,8 +0,0 @@
- Linux DECnet Project
- ======================
-
-The documentation for this kernel subsystem is available in the
-Documentation/networking subdirectory of this distribution and also
-on line at http://www.chygwyn.com/DECnet/
-
-Steve Whitehouse <SteveW@ACM.org>
diff --git a/net/decnet/TODO b/net/decnet/TODO
deleted file mode 100644
index ebb5ac69d128..000000000000
--- a/net/decnet/TODO
+++ /dev/null
@@ -1,41 +0,0 @@
-Steve's quick list of things that need finishing off:
-[they are in no particular order and range from the trivial to the long winded]
-
- o Proper timeouts on each neighbour (in routing mode) rather than
- just the 60 second On-Ethernet cache value.
-
- o Support for X.25 linklayer
-
- o Support for DDCMP link layer
-
- o The DDCMP device itself
-
- o PPP support (rfc1762)
-
- o Lots of testing with real applications
-
- o Verify errors etc. against POSIX 1003.1g (draft)
-
- o Using send/recvmsg() to get at connect/disconnect data (POSIX 1003.1g)
- [maybe this should be done at socket level... the control data in the
- send/recvmsg() calls should simply be a vector of set/getsockopt()
- calls]
-
- o check MSG_CTRUNC is set where it should be.
-
- o Find all the commonality between DECnet and IPv4 routing code and extract
- it into a small library of routines. [probably a project for 2.7.xx]
-
- o Add perfect socket hashing - an idea suggested by Paul Koning. Currently
- we have a half-way house scheme which seems to work reasonably well, but
- the full scheme is still worth implementing, its not not top of my list
- right now.
-
- o Add session control message flow control
-
- o Add NSP message flow control
-
- o DECnet sendpages() function
-
- o AIO for DECnet
-
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
deleted file mode 100644
index 21f20f21dd32..000000000000
--- a/net/decnet/af_decnet.c
+++ /dev/null
@@ -1,2432 +0,0 @@
-
-/*
- * DECnet An implementation of the DECnet protocol suite for the LINUX
- * operating system. DECnet is implemented using the BSD Socket
- * interface as the means of communication with the user level.
- *
- * DECnet Socket Layer Interface
- *
- * Authors: Eduardo Marcelo Serrat <emserrat@geocities.com>
- * Patrick Caulfield <patrick@pandh.demon.co.uk>
- *
- * Changes:
- * Steve Whitehouse: Copied from Eduardo Serrat and Patrick Caulfield's
- * version of the code. Original copyright preserved
- * below.
- * Steve Whitehouse: Some bug fixes, cleaning up some code to make it
- * compatible with my routing layer.
- * Steve Whitehouse: Merging changes from Eduardo Serrat and Patrick
- * Caulfield.
- * Steve Whitehouse: Further bug fixes, checking module code still works
- * with new routing layer.
- * Steve Whitehouse: Additional set/get_sockopt() calls.
- * Steve Whitehouse: Fixed TIOCINQ ioctl to be same as Eduardo's new
- * code.
- * Steve Whitehouse: recvmsg() changed to try and behave in a POSIX like
- * way. Didn't manage it entirely, but its better.
- * Steve Whitehouse: ditto for sendmsg().
- * Steve Whitehouse: A selection of bug fixes to various things.
- * Steve Whitehouse: Added TIOCOUTQ ioctl.
- * Steve Whitehouse: Fixes to username2sockaddr & sockaddr2username.
- * Steve Whitehouse: Fixes to connect() error returns.
- * Patrick Caulfield: Fixes to delayed acceptance logic.
- * David S. Miller: New socket locking
- * Steve Whitehouse: Socket list hashing/locking
- * Arnaldo C. Melo: use capable, not suser
- * Steve Whitehouse: Removed unused code. Fix to use sk->allocation
- * when required.
- * Patrick Caulfield: /proc/net/decnet now has object name/number
- * Steve Whitehouse: Fixed local port allocation, hashed sk list
- * Matthew Wilcox: Fixes for dn_ioctl()
- * Steve Whitehouse: New connect/accept logic to allow timeouts and
- * prepare for sendpage etc.
- */
-
-
-/******************************************************************************
- (c) 1995-1998 E.M. Serrat emserrat@geocities.com
-
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
-HISTORY:
-
-Version Kernel Date Author/Comments
-------- ------ ---- ---------------
-Version 0.0.1 2.0.30 01-dic-97 Eduardo Marcelo Serrat
- (emserrat@geocities.com)
-
- First Development of DECnet Socket La-
- yer for Linux. Only supports outgoing
- connections.
-
-Version 0.0.2 2.1.105 20-jun-98 Patrick J. Caulfield
- (patrick@pandh.demon.co.uk)
-
- Port to new kernel development version.
-
-Version 0.0.3 2.1.106 25-jun-98 Eduardo Marcelo Serrat
- (emserrat@geocities.com)
- _
- Added support for incoming connections
- so we can start developing server apps
- on Linux.
- -
- Module Support
-Version 0.0.4 2.1.109 21-jul-98 Eduardo Marcelo Serrat
- (emserrat@geocities.com)
- _
- Added support for X11R6.4. Now we can
- use DECnet transport for X on Linux!!!
- -
-Version 0.0.5 2.1.110 01-aug-98 Eduardo Marcelo Serrat
- (emserrat@geocities.com)
- Removed bugs on flow control
- Removed bugs on incoming accessdata
- order
- -
-Version 0.0.6 2.1.110 07-aug-98 Eduardo Marcelo Serrat
- dn_recvmsg fixes
-
- Patrick J. Caulfield
- dn_bind fixes
-*******************************************************************************/
-
-#include <linux/module.h>
-#include <linux/errno.h>
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/socket.h>
-#include <linux/in.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/timer.h>
-#include <linux/string.h>
-#include <linux/sockios.h>
-#include <linux/net.h>
-#include <linux/netdevice.h>
-#include <linux/inet.h>
-#include <linux/route.h>
-#include <linux/netfilter.h>
-#include <linux/seq_file.h>
-#include <net/sock.h>
-#include <net/tcp_states.h>
-#include <net/flow.h>
-#include <asm/system.h>
-#include <asm/ioctls.h>
-#include <linux/capability.h>
-#include <linux/mm.h>
-#include <linux/interrupt.h>
-#include <linux/proc_fs.h>
-#include <linux/stat.h>
-#include <linux/init.h>
-#include <linux/poll.h>
-#include <net/neighbour.h>
-#include <net/dst.h>
-#include <net/fib_rules.h>
-#include <net/dn.h>
-#include <net/dn_nsp.h>
-#include <net/dn_dev.h>
-#include <net/dn_route.h>
-#include <net/dn_fib.h>
-#include <net/dn_neigh.h>
-
-struct dn_sock {
- struct sock sk;
- struct dn_scp scp;
-};
-
-static void dn_keepalive(struct sock *sk);
-
-#define DN_SK_HASH_SHIFT 8
-#define DN_SK_HASH_SIZE (1 << DN_SK_HASH_SHIFT)
-#define DN_SK_HASH_MASK (DN_SK_HASH_SIZE - 1)
-
-
-static const struct proto_ops dn_proto_ops;
-static DEFINE_RWLOCK(dn_hash_lock);
-static struct hlist_head dn_sk_hash[DN_SK_HASH_SIZE];
-static struct hlist_head dn_wild_sk;
-static atomic_t decnet_memory_allocated;
-
-static int __dn_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen, int flags);
-static int __dn_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen, int flags);
-
-static struct hlist_head *dn_find_list(struct sock *sk)
-{
- struct dn_scp *scp = DN_SK(sk);
-
- if (scp->addr.sdn_flags & SDF_WILD)
- return hlist_empty(&dn_wild_sk) ? &dn_wild_sk : NULL;
-
- return &dn_sk_hash[dn_ntohs(scp->addrloc) & DN_SK_HASH_MASK];
-}
-
-/*
- * Valid ports are those greater than zero and not already in use.
- */
-static int check_port(__le16 port)
-{
- struct sock *sk;
- struct hlist_node *node;
-
- if (port == 0)
- return -1;
-
- sk_for_each(sk, node, &dn_sk_hash[dn_ntohs(port) & DN_SK_HASH_MASK]) {
- struct dn_scp *scp = DN_SK(sk);
- if (scp->addrloc == port)
- return -1;
- }
- return 0;
-}
-
-static unsigned short port_alloc(struct sock *sk)
-{
- struct dn_scp *scp = DN_SK(sk);
-static unsigned short port = 0x2000;
- unsigned short i_port = port;
-
- while(check_port(dn_htons(++port)) != 0) {
- if (port == i_port)
- return 0;
- }
-
- scp->addrloc = dn_htons(port);
-
- return 1;
-}
-
-/*
- * Since this is only ever called from user
- * level, we don't need a write_lock() version
- * of this.
- */
-static int dn_hash_sock(struct sock *sk)
-{
- struct dn_scp *scp = DN_SK(sk);
- struct hlist_head *list;
- int rv = -EUSERS;
-
- BUG_ON(sk_hashed(sk));
-
- write_lock_bh(&dn_hash_lock);
-
- if (!scp->addrloc && !port_alloc(sk))
- goto out;
-
- rv = -EADDRINUSE;
- if ((list = dn_find_list(sk)) == NULL)
- goto out;
-
- sk_add_node(sk, list);
- rv = 0;
-out:
- write_unlock_bh(&dn_hash_lock);
- return rv;
-}
-
-static void dn_unhash_sock(struct sock *sk)
-{
- write_lock(&dn_hash_lock);
- sk_del_node_init(sk);
- write_unlock(&dn_hash_lock);
-}
-
-static void dn_unhash_sock_bh(struct sock *sk)
-{
- write_lock_bh(&dn_hash_lock);
- sk_del_node_init(sk);
- write_unlock_bh(&dn_hash_lock);
-}
-
-static struct hlist_head *listen_hash(struct sockaddr_dn *addr)
-{
- int i;
- unsigned hash = addr->sdn_objnum;
-
- if (hash == 0) {
- hash = addr->sdn_objnamel;
- for(i = 0; i < dn_ntohs(addr->sdn_objnamel); i++) {
- hash ^= addr->sdn_objname[i];
- hash ^= (hash << 3);
- }
- }
-
- return &dn_sk_hash[hash & DN_SK_HASH_MASK];
-}
-
-/*
- * Called to transform a socket from bound (i.e. with a local address)
- * into a listening socket (doesn't need a local port number) and rehashes
- * based upon the object name/number.
- */
-static void dn_rehash_sock(struct sock *sk)
-{
- struct hlist_head *list;
- struct dn_scp *scp = DN_SK(sk);
-
- if (scp->addr.sdn_flags & SDF_WILD)
- return;
-
- write_lock_bh(&dn_hash_lock);
- sk_del_node_init(sk);
- DN_SK(sk)->addrloc = 0;
- list = listen_hash(&DN_SK(sk)->addr);
- sk_add_node(sk, list);
- write_unlock_bh(&dn_hash_lock);
-}
-
-int dn_sockaddr2username(struct sockaddr_dn *sdn, unsigned char *buf, unsigned char type)
-{
- int len = 2;
-
- *buf++ = type;
-
- switch(type) {
- case 0:
- *buf++ = sdn->sdn_objnum;
- break;
- case 1:
- *buf++ = 0;
- *buf++ = dn_ntohs(sdn->sdn_objnamel);
- memcpy(buf, sdn->sdn_objname, dn_ntohs(sdn->sdn_objnamel));
- len = 3 + dn_ntohs(sdn->sdn_objnamel);
- break;
- case 2:
- memset(buf, 0, 5);
- buf += 5;
- *buf++ = dn_ntohs(sdn->sdn_objnamel);
- memcpy(buf, sdn->sdn_objname, dn_ntohs(sdn->sdn_objnamel));
- len = 7 + dn_ntohs(sdn->sdn_objnamel);
- break;
- }
-
- return len;
-}
-
-/*
- * On reception of usernames, we handle types 1 and 0 for destination
- * addresses only. Types 2 and 4 are used for source addresses, but the
- * UIC, GIC are ignored and they are both treated the same way. Type 3
- * is never used as I've no idea what its purpose might be or what its
- * format is.
- */
-int dn_username2sockaddr(unsigned char *data, int len, struct sockaddr_dn *sdn, unsigned char *fmt)
-{
- unsigned char type;
- int size = len;
- int namel = 12;
-
- sdn->sdn_objnum = 0;
- sdn->sdn_objnamel = dn_htons(0);
- memset(sdn->sdn_objname, 0, DN_MAXOBJL);
-
- if (len < 2)
- return -1;
-
- len -= 2;
- *fmt = *data++;
- type = *data++;
-
- switch(*fmt) {
- case 0:
- sdn->sdn_objnum = type;
- return 2;
- case 1:
- namel = 16;
- break;
- case 2:
- len -= 4;
- data += 4;
- break;
- case 4:
- len -= 8;
- data += 8;
- break;
- default:
- return -1;
- }
-
- len -= 1;
-
- if (len < 0)
- return -1;
-
- sdn->sdn_objnamel = dn_htons(*data++);
- len -= dn_ntohs(sdn->sdn_objnamel);
-
- if ((len < 0) || (dn_ntohs(sdn->sdn_objnamel) > namel))
- return -1;
-
- memcpy(sdn->sdn_objname, data, dn_ntohs(sdn->sdn_objnamel));
-
- return size - len;
-}
-
-struct sock *dn_sklist_find_listener(struct sockaddr_dn *addr)
-{
- struct hlist_head *list = listen_hash(addr);
- struct hlist_node *node;
- struct sock *sk;
-
- read_lock(&dn_hash_lock);
- sk_for_each(sk, node, list) {
- struct dn_scp *scp = DN_SK(sk);
- if (sk->sk_state != TCP_LISTEN)
- continue;
- if (scp->addr.sdn_objnum) {
- if (scp->addr.sdn_objnum != addr->sdn_objnum)
- continue;
- } else {
- if (addr->sdn_objnum)
- continue;
- if (scp->addr.sdn_objnamel != addr->sdn_objnamel)
- continue;
- if (memcmp(scp->addr.sdn_objname, addr->sdn_objname, dn_ntohs(addr->sdn_objnamel)) != 0)
- continue;
- }
- sock_hold(sk);
- read_unlock(&dn_hash_lock);
- return sk;
- }
-
- sk = sk_head(&dn_wild_sk);
- if (sk) {
- if (sk->sk_state == TCP_LISTEN)
- sock_hold(sk);
- else
- sk = NULL;
- }
-
- read_unlock(&dn_hash_lock);
- return sk;
-}
-
-struct sock *dn_find_by_skb(struct sk_buff *skb)
-{
- struct dn_skb_cb *cb = DN_SKB_CB(skb);
- struct sock *sk;
- struct hlist_node *node;
- struct dn_scp *scp;
-
- read_lock(&dn_hash_lock);
- sk_for_each(sk, node, &dn_sk_hash[dn_ntohs(cb->dst_port) & DN_SK_HASH_MASK]) {
- scp = DN_SK(sk);
- if (cb->src != dn_saddr2dn(&scp->peer))
- continue;
- if (cb->dst_port != scp->addrloc)
- continue;
- if (scp->addrrem && (cb->src_port != scp->addrrem))
- continue;
- sock_hold(sk);
- goto found;
- }
- sk = NULL;
-found:
- read_unlock(&dn_hash_lock);
- return sk;
-}
-
-
-
-static void dn_destruct(struct sock *sk)
-{
- struct dn_scp *scp = DN_SK(sk);
-
- skb_queue_purge(&scp->data_xmit_queue);
- skb_queue_purge(&scp->other_xmit_queue);
- skb_queue_purge(&scp->other_receive_queue);
-
- dst_release(xchg(&sk->sk_dst_cache, NULL));
-}
-
-static int dn_memory_pressure;
-
-static void dn_enter_memory_pressure(void)
-{
- if (!dn_memory_pressure) {
- dn_memory_pressure = 1;
- }
-}
-
-static struct proto dn_proto = {
- .name = "NSP",
- .owner = THIS_MODULE,
- .enter_memory_pressure = dn_enter_memory_pressure,
- .memory_pressure = &dn_memory_pressure,
- .memory_allocated = &decnet_memory_allocated,
- .sysctl_mem = sysctl_decnet_mem,
- .sysctl_wmem = sysctl_decnet_wmem,
- .sysctl_rmem = sysctl_decnet_rmem,
- .max_header = DN_MAX_NSP_DATA_HEADER + 64,
- .obj_size = sizeof(struct dn_sock),
-};
-
-static struct sock *dn_alloc_sock(struct socket *sock, gfp_t gfp)
-{
- struct dn_scp *scp;
- struct sock *sk = sk_alloc(PF_DECnet, gfp, &dn_proto, 1);
-
- if (!sk)
- goto out;
-
- if (sock)
- sock->ops = &dn_proto_ops;
- sock_init_data(sock, sk);
-
- sk->sk_backlog_rcv = dn_nsp_backlog_rcv;
- sk->sk_destruct = dn_destruct;
- sk->sk_no_check = 1;
- sk->sk_family = PF_DECnet;
- sk->sk_protocol = 0;
- sk->sk_allocation = gfp;
- sk->sk_sndbuf = sysctl_decnet_wmem[1];
- sk->sk_rcvbuf = sysctl_decnet_rmem[1];
-
- /* Initialization of DECnet Session Control Port */
- scp = DN_SK(sk);
- scp->state = DN_O; /* Open */
- scp->numdat = 1; /* Next data seg to tx */
- scp->numoth = 1; /* Next oth data to tx */
- scp->ackxmt_dat = 0; /* Last data seg ack'ed */
- scp->ackxmt_oth = 0; /* Last oth data ack'ed */
- scp->ackrcv_dat = 0; /* Highest data ack recv*/
- scp->ackrcv_oth = 0; /* Last oth data ack rec*/
- scp->flowrem_sw = DN_SEND;
- scp->flowloc_sw = DN_SEND;
- scp->flowrem_dat = 0;
- scp->flowrem_oth = 1;
- scp->flowloc_dat = 0;
- scp->flowloc_oth = 1;
- scp->services_rem = 0;
- scp->services_loc = 1 | NSP_FC_NONE;
- scp->info_rem = 0;
- scp->info_loc = 0x03; /* NSP version 4.1 */
- scp->segsize_rem = 230 - DN_MAX_NSP_DATA_HEADER; /* Default: Updated by remote segsize */
- scp->nonagle = 0;
- scp->multi_ireq = 1;
- scp->accept_mode = ACC_IMMED;
- scp->addr.sdn_family = AF_DECnet;
- scp->peer.sdn_family = AF_DECnet;
- scp->accessdata.acc_accl = 5;
- memcpy(scp->accessdata.acc_acc, "LINUX", 5);
-
- scp->max_window = NSP_MAX_WINDOW;
- scp->snd_window = NSP_MIN_WINDOW;
- scp->nsp_srtt = NSP_INITIAL_SRTT;
- scp->nsp_rttvar = NSP_INITIAL_RTTVAR;
- scp->nsp_rxtshift = 0;
-
- skb_queue_head_init(&scp->data_xmit_queue);
- skb_queue_head_init(&scp->other_xmit_queue);
- skb_queue_head_init(&scp->other_receive_queue);
-
- scp->persist = 0;
- scp->persist_fxn = NULL;
- scp->keepalive = 10 * HZ;
- scp->keepalive_fxn = dn_keepalive;
-
- init_timer(&scp->delack_timer);
- scp->delack_pending = 0;
- scp->delack_fxn = dn_nsp_delayed_ack;
-
- dn_start_slow_timer(sk);
-out:
- return sk;
-}
-
-/*
- * Keepalive timer.
- * FIXME: Should respond to SO_KEEPALIVE etc.
- */
-static void dn_keepalive(struct sock *sk)
-{
- struct dn_scp *scp = DN_SK(sk);
-
- /*
- * By checking the other_data transmit queue is empty
- * we are double checking that we are not sending too
- * many of these keepalive frames.
- */
- if (skb_queue_empty(&scp->other_xmit_queue))
- dn_nsp_send_link(sk, DN_NOCHANGE, 0);
-}
-
-
-/*
- * Timer for shutdown/destroyed sockets.
- * When socket is dead & no packets have been sent for a
- * certain amount of time, they are removed by this
- * routine. Also takes care of sending out DI & DC
- * frames at correct times.
- */
-int dn_destroy_timer(struct sock *sk)
-{
- struct dn_scp *scp = DN_SK(sk);
-
- scp->persist = dn_nsp_persist(sk);
-
- switch(scp->state) {
- case DN_DI:
- dn_nsp_send_disc(sk, NSP_DISCINIT, 0, GFP_ATOMIC);
- if (scp->nsp_rxtshift >= decnet_di_count)
- scp->state = DN_CN;
- return 0;
-
- case DN_DR:
- dn_nsp_send_disc(sk, NSP_DISCINIT, 0, GFP_ATOMIC);
- if (scp->nsp_rxtshift >= decnet_dr_count)
- scp->state = DN_DRC;
- return 0;
-
- case DN_DN:
- if (scp->nsp_rxtshift < decnet_dn_count) {
- /* printk(KERN_DEBUG "dn_destroy_timer: DN\n"); */
- dn_nsp_send_disc(sk, NSP_DISCCONF, NSP_REASON_DC, GFP_ATOMIC);
- return 0;
- }
- }
-
- scp->persist = (HZ * decnet_time_wait);
-
- if (sk->sk_socket)
- return 0;
-
- if ((jiffies - scp->stamp) >= (HZ * decnet_time_wait)) {
- dn_unhash_sock(sk);
- sock_put(sk);
- return 1;
- }
-
- return 0;
-}
-
-static void dn_destroy_sock(struct sock *sk)
-{
- struct dn_scp *scp = DN_SK(sk);
-
- scp->nsp_rxtshift = 0; /* reset back off */
-
- if (sk->sk_socket) {
- if (sk->sk_socket->state != SS_UNCONNECTED)
- sk->sk_socket->state = SS_DISCONNECTING;
- }
-
- sk->sk_state = TCP_CLOSE;
-
- switch(scp->state) {
- case DN_DN:
- dn_nsp_send_disc(sk, NSP_DISCCONF, NSP_REASON_DC,
- sk->sk_allocation);
- scp->persist_fxn = dn_destroy_timer;
- scp->persist = dn_nsp_persist(sk);
- break;
- case DN_CR:
- scp->state = DN_DR;
- goto disc_reject;
- case DN_RUN:
- scp->state = DN_DI;
- case DN_DI:
- case DN_DR:
-disc_reject:
- dn_nsp_send_disc(sk, NSP_DISCINIT, 0, sk->sk_allocation);
- case DN_NC:
- case DN_NR:
- case DN_RJ:
- case DN_DIC:
- case DN_CN:
- case DN_DRC:
- case DN_CI:
- case DN_CD:
- scp->persist_fxn = dn_destroy_timer;
- scp->persist = dn_nsp_persist(sk);
- break;
- default:
- printk(KERN_DEBUG "DECnet: dn_destroy_sock passed socket in invalid state\n");
- case DN_O:
- dn_stop_slow_timer(sk);
-
- dn_unhash_sock_bh(sk);
- sock_put(sk);
-
- break;
- }
-}
-
-char *dn_addr2asc(__u16 addr, char *buf)
-{
- unsigned short node, area;
-
- node = addr & 0x03ff;
- area = addr >> 10;
- sprintf(buf, "%hd.%hd", area, node);
-
- return buf;
-}
-
-
-
-static int dn_create(struct socket *sock, int protocol)
-{
- struct sock *sk;
-
- switch(sock->type) {
- case SOCK_SEQPACKET:
- if (protocol != DNPROTO_NSP)
- return -EPROTONOSUPPORT;
- break;
- case SOCK_STREAM:
- break;
- default:
- return -ESOCKTNOSUPPORT;
- }
-
-
- if ((sk = dn_alloc_sock(sock, GFP_KERNEL)) == NULL)
- return -ENOBUFS;
-
- sk->sk_protocol = protocol;
-
- return 0;
-}
-
-
-static int
-dn_release(struct socket *sock)
-{
- struct sock *sk = sock->sk;
-
- if (sk) {
- sock_orphan(sk);
- sock_hold(sk);
- lock_sock(sk);
- dn_destroy_sock(sk);
- release_sock(sk);
- sock_put(sk);
- }
-
- return 0;
-}
-
-static int dn_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
-{
- struct sock *sk = sock->sk;
- struct dn_scp *scp = DN_SK(sk);
- struct sockaddr_dn *saddr = (struct sockaddr_dn *)uaddr;
- struct net_device *dev;
- int rv;
-
- if (addr_len != sizeof(struct sockaddr_dn))
- return -EINVAL;
-
- if (saddr->sdn_family != AF_DECnet)
- return -EINVAL;
-
- if (dn_ntohs(saddr->sdn_nodeaddrl) && (dn_ntohs(saddr->sdn_nodeaddrl) != 2))
- return -EINVAL;
-
- if (dn_ntohs(saddr->sdn_objnamel) > DN_MAXOBJL)
- return -EINVAL;
-
- if (saddr->sdn_flags & ~SDF_WILD)
- return -EINVAL;
-
- if (!capable(CAP_NET_BIND_SERVICE) && (saddr->sdn_objnum ||
- (saddr->sdn_flags & SDF_WILD)))
- return -EACCES;
-
- if (!(saddr->sdn_flags & SDF_WILD)) {
- if (dn_ntohs(saddr->sdn_nodeaddrl)) {
- read_lock(&dev_base_lock);
- for(dev = dev_base; dev; dev = dev->next) {
- if (!dev->dn_ptr)
- continue;
- if (dn_dev_islocal(dev, dn_saddr2dn(saddr)))
- break;
- }
- read_unlock(&dev_base_lock);
- if (dev == NULL)
- return -EADDRNOTAVAIL;
- }
- }
-
- rv = -EINVAL;
- lock_sock(sk);
- if (sock_flag(sk, SOCK_ZAPPED)) {
- memcpy(&scp->addr, saddr, addr_len);
- sock_reset_flag(sk, SOCK_ZAPPED);
-
- rv = dn_hash_sock(sk);
- if (rv)
- sock_set_flag(sk, SOCK_ZAPPED);
- }
- release_sock(sk);
-
- return rv;
-}
-
-
-static int dn_auto_bind(struct socket *sock)
-{
- struct sock *sk = sock->sk;
- struct dn_scp *scp = DN_SK(sk);
- int rv;
-
- sock_reset_flag(sk, SOCK_ZAPPED);
-
- scp->addr.sdn_flags = 0;
- scp->addr.sdn_objnum = 0;
-
- /*
- * This stuff is to keep compatibility with Eduardo's
- * patch. I hope I can dispense with it shortly...
- */
- if ((scp->accessdata.acc_accl != 0) &&
- (scp->accessdata.acc_accl <= 12)) {
-
- scp->addr.sdn_objnamel = dn_htons(scp->accessdata.acc_accl);
- memcpy(scp->addr.sdn_objname, scp->accessdata.acc_acc, dn_ntohs(scp->addr.sdn_objnamel));
-
- scp->accessdata.acc_accl = 0;
- memset(scp->accessdata.acc_acc, 0, 40);
- }
- /* End of compatibility stuff */
-
- scp->addr.sdn_add.a_len = dn_htons(2);
- rv = dn_dev_bind_default((__le16 *)scp->addr.sdn_add.a_addr);
- if (rv == 0) {
- rv = dn_hash_sock(sk);
- if (rv)
- sock_set_flag(sk, SOCK_ZAPPED);
- }
-
- return rv;
-}
-
-static int dn_confirm_accept(struct sock *sk, long *timeo, gfp_t allocation)
-{
- struct dn_scp *scp = DN_SK(sk);
- DEFINE_WAIT(wait);
- int err;
-
- if (scp->state != DN_CR)
- return -EINVAL;
-
- scp->state = DN_CC;
- scp->segsize_loc = dst_metric(__sk_dst_get(sk), RTAX_ADVMSS);
- dn_send_conn_conf(sk, allocation);
-
- prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
- for(;;) {
- release_sock(sk);
- if (scp->state == DN_CC)
- *timeo = schedule_timeout(*timeo);
- lock_sock(sk);
- err = 0;
- if (scp->state == DN_RUN)
- break;
- err = sock_error(sk);
- if (err)
- break;
- err = sock_intr_errno(*timeo);
- if (signal_pending(current))
- break;
- err = -EAGAIN;
- if (!*timeo)
- break;
- prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
- }
- finish_wait(sk->sk_sleep, &wait);
- if (err == 0) {
- sk->sk_socket->state = SS_CONNECTED;
- } else if (scp->state != DN_CC) {
- sk->sk_socket->state = SS_UNCONNECTED;
- }
- return err;
-}
-
-static int dn_wait_run(struct sock *sk, long *timeo)
-{
- struct dn_scp *scp = DN_SK(sk);
- DEFINE_WAIT(wait);
- int err = 0;
-
- if (scp->state == DN_RUN)
- goto out;
-
- if (!*timeo)
- return -EALREADY;
-
- prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
- for(;;) {
- release_sock(sk);
- if (scp->state == DN_CI || scp->state == DN_CC)
- *timeo = schedule_timeout(*timeo);
- lock_sock(sk);
- err = 0;
- if (scp->state == DN_RUN)
- break;
- err = sock_error(sk);
- if (err)
- break;
- err = sock_intr_errno(*timeo);
- if (signal_pending(current))
- break;
- err = -ETIMEDOUT;
- if (!*timeo)
- break;
- prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
- }
- finish_wait(sk->sk_sleep, &wait);
-out:
- if (err == 0) {
- sk->sk_socket->state = SS_CONNECTED;
- } else if (scp->state != DN_CI && scp->state != DN_CC) {
- sk->sk_socket->state = SS_UNCONNECTED;
- }
- return err;
-}
-
-static int __dn_connect(struct sock *sk, struct sockaddr_dn *addr, int addrlen, long *timeo, int flags)
-{
- struct socket *sock = sk->sk_socket;
- struct dn_scp *scp = DN_SK(sk);
- int err = -EISCONN;
- struct flowi fl;
-
- if (sock->state == SS_CONNECTED)
- goto out;
-
- if (sock->state == SS_CONNECTING) {
- err = 0;
- if (scp->state == DN_RUN) {
- sock->state = SS_CONNECTED;
- goto out;
- }
- err = -ECONNREFUSED;
- if (scp->state != DN_CI && scp->state != DN_CC) {
- sock->state = SS_UNCONNECTED;
- goto out;
- }
- return dn_wait_run(sk, timeo);
- }
-
- err = -EINVAL;
- if (scp->state != DN_O)
- goto out;
-
- if (addr == NULL || addrlen != sizeof(struct sockaddr_dn))
- goto out;
- if (addr->sdn_family != AF_DECnet)
- goto out;
- if (addr->sdn_flags & SDF_WILD)
- goto out;
-
- if (sock_flag(sk, SOCK_ZAPPED)) {
- err = dn_auto_bind(sk->sk_socket);
- if (err)
- goto out;
- }
-
- memcpy(&scp->peer, addr, sizeof(struct sockaddr_dn));
-
- err = -EHOSTUNREACH;
- memset(&fl, 0, sizeof(fl));
- fl.oif = sk->sk_bound_dev_if;
- fl.fld_dst = dn_saddr2dn(&scp->peer);
- fl.fld_src = dn_saddr2dn(&scp->addr);
- dn_sk_ports_copy(&fl, scp);
- fl.proto = DNPROTO_NSP;
- if (dn_route_output_sock(&sk->sk_dst_cache, &fl, sk, flags) < 0)
- goto out;
- sk->sk_route_caps = sk->sk_dst_cache->dev->features;
- sock->state = SS_CONNECTING;
- scp->state = DN_CI;
- scp->segsize_loc = dst_metric(sk->sk_dst_cache, RTAX_ADVMSS);
-
- dn_nsp_send_conninit(sk, NSP_CI);
- err = -EINPROGRESS;
- if (*timeo) {
- err = dn_wait_run(sk, timeo);
- }
-out:
- return err;
-}
-
-static int dn_connect(struct socket *sock, struct sockaddr *uaddr, int addrlen, int flags)
-{
- struct sockaddr_dn *addr = (struct sockaddr_dn *)uaddr;
- struct sock *sk = sock->sk;
- int err;
- long timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
-
- lock_sock(sk);
- err = __dn_connect(sk, addr, addrlen, &timeo, 0);
- release_sock(sk);
-
- return err;
-}
-
-static inline int dn_check_state(struct sock *sk, struct sockaddr_dn *addr, int addrlen, long *timeo, int flags)
-{
- struct dn_scp *scp = DN_SK(sk);
-
- switch(scp->state) {
- case DN_RUN:
- return 0;
- case DN_CR:
- return dn_confirm_accept(sk, timeo, sk->sk_allocation);
- case DN_CI:
- case DN_CC:
- return dn_wait_run(sk, timeo);
- case DN_O:
- return __dn_connect(sk, addr, addrlen, timeo, flags);
- }
-
- return -EINVAL;
-}
-
-
-static void dn_access_copy(struct sk_buff *skb, struct accessdata_dn *acc)
-{
- unsigned char *ptr = skb->data;
-
- acc->acc_userl = *ptr++;
- memcpy(&acc->acc_user, ptr, acc->acc_userl);
- ptr += acc->acc_userl;
-
- acc->acc_passl = *ptr++;
- memcpy(&acc->acc_pass, ptr, acc->acc_passl);
- ptr += acc->acc_passl;
-
- acc->acc_accl = *ptr++;
- memcpy(&acc->acc_acc, ptr, acc->acc_accl);
-
- skb_pull(skb, acc->acc_accl + acc->acc_passl + acc->acc_userl + 3);
-
-}
-
-static void dn_user_copy(struct sk_buff *skb, struct optdata_dn *opt)
-{
- unsigned char *ptr = skb->data;
- u16 len = *ptr++; /* yes, it's 8bit on the wire */
-
- BUG_ON(len > 16); /* we've checked the contents earlier */
- opt->opt_optl = dn_htons(len);
- opt->opt_status = 0;
- memcpy(opt->opt_data, ptr, len);
- skb_pull(skb, len + 1);
-}
-
-static struct sk_buff *dn_wait_for_connect(struct sock *sk, long *timeo)
-{
- DEFINE_WAIT(wait);
- struct sk_buff *skb = NULL;
- int err = 0;
-
- prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
- for(;;) {
- release_sock(sk);
- skb = skb_dequeue(&sk->sk_receive_queue);
- if (skb == NULL) {
- *timeo = schedule_timeout(*timeo);
- skb = skb_dequeue(&sk->sk_receive_queue);
- }
- lock_sock(sk);
- if (skb != NULL)
- break;
- err = -EINVAL;
- if (sk->sk_state != TCP_LISTEN)
- break;
- err = sock_intr_errno(*timeo);
- if (signal_pending(current))
- break;
- err = -EAGAIN;
- if (!*timeo)
- break;
- prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
- }
- finish_wait(sk->sk_sleep, &wait);
-
- return skb == NULL ? ERR_PTR(err) : skb;
-}
-
-static int dn_accept(struct socket *sock, struct socket *newsock, int flags)
-{
- struct sock *sk = sock->sk, *newsk;
- struct sk_buff *skb = NULL;
- struct dn_skb_cb *cb;
- unsigned char menuver;
- int err = 0;
- unsigned char type;
- long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
-
- lock_sock(sk);
-
- if (sk->sk_state != TCP_LISTEN || DN_SK(sk)->state != DN_O) {
- release_sock(sk);
- return -EINVAL;
- }
-
- skb = skb_dequeue(&sk->sk_receive_queue);
- if (skb == NULL) {
- skb = dn_wait_for_connect(sk, &timeo);
- if (IS_ERR(skb)) {
- release_sock(sk);
- return PTR_ERR(skb);
- }
- }
-
- cb = DN_SKB_CB(skb);
- sk->sk_ack_backlog--;
- newsk = dn_alloc_sock(newsock, sk->sk_allocation);
- if (newsk == NULL) {
- release_sock(sk);
- kfree_skb(skb);
- return -ENOBUFS;
- }
- release_sock(sk);
-
- dst_release(xchg(&newsk->sk_dst_cache, skb->dst));
- skb->dst = NULL;
-
- DN_SK(newsk)->state = DN_CR;
- DN_SK(newsk)->addrrem = cb->src_port;
- DN_SK(newsk)->services_rem = cb->services;
- DN_SK(newsk)->info_rem = cb->info;
- DN_SK(newsk)->segsize_rem = cb->segsize;
- DN_SK(newsk)->accept_mode = DN_SK(sk)->accept_mode;
-
- if (DN_SK(newsk)->segsize_rem < 230)
- DN_SK(newsk)->segsize_rem = 230;
-
- if ((DN_SK(newsk)->services_rem & NSP_FC_MASK) == NSP_FC_NONE)
- DN_SK(newsk)->max_window = decnet_no_fc_max_cwnd;
-
- newsk->sk_state = TCP_LISTEN;
- memcpy(&(DN_SK(newsk)->addr), &(DN_SK(sk)->addr), sizeof(struct sockaddr_dn));
-
- /*
- * If we are listening on a wild socket, we don't want
- * the newly created socket on the wrong hash queue.
- */
- DN_SK(newsk)->addr.sdn_flags &= ~SDF_WILD;
-
- skb_pull(skb, dn_username2sockaddr(skb->data, skb->len, &(DN_SK(newsk)->addr), &type));
- skb_pull(skb, dn_username2sockaddr(skb->data, skb->len, &(DN_SK(newsk)->peer), &type));
- *(__le16 *)(DN_SK(newsk)->peer.sdn_add.a_addr) = cb->src;
- *(__le16 *)(DN_SK(newsk)->addr.sdn_add.a_addr) = cb->dst;
-
- menuver = *skb->data;
- skb_pull(skb, 1);
-
- if (menuver & DN_MENUVER_ACC)
- dn_access_copy(skb, &(DN_SK(newsk)->accessdata));
-
- if (menuver & DN_MENUVER_USR)
- dn_user_copy(skb, &(DN_SK(newsk)->conndata_in));
-
- if (menuver & DN_MENUVER_PRX)
- DN_SK(newsk)->peer.sdn_flags |= SDF_PROXY;
-
- if (menuver & DN_MENUVER_UIC)
- DN_SK(newsk)->peer.sdn_flags |= SDF_UICPROXY;
-
- kfree_skb(skb);
-
- memcpy(&(DN_SK(newsk)->conndata_out), &(DN_SK(sk)->conndata_out),
- sizeof(struct optdata_dn));
- memcpy(&(DN_SK(newsk)->discdata_out), &(DN_SK(sk)->discdata_out),
- sizeof(struct optdata_dn));
-
- lock_sock(newsk);
- err = dn_hash_sock(newsk);
- if (err == 0) {
- sock_reset_flag(newsk, SOCK_ZAPPED);
- dn_send_conn_ack(newsk);
-
- /*
- * Here we use sk->sk_allocation since although the conn conf is
- * for the newsk, the context is the old socket.
- */
- if (DN_SK(newsk)->accept_mode == ACC_IMMED)
- err = dn_confirm_accept(newsk, &timeo,
- sk->sk_allocation);
- }
- release_sock(newsk);
- return err;
-}
-
-
-static int dn_getname(struct socket *sock, struct sockaddr *uaddr,int *uaddr_len,int peer)
-{
- struct sockaddr_dn *sa = (struct sockaddr_dn *)uaddr;
- struct sock *sk = sock->sk;
- struct dn_scp *scp = DN_SK(sk);
-
- *uaddr_len = sizeof(struct sockaddr_dn);
-
- lock_sock(sk);
-
- if (peer) {
- if ((sock->state != SS_CONNECTED &&
- sock->state != SS_CONNECTING) &&
- scp->accept_mode == ACC_IMMED) {
- release_sock(sk);
- return -ENOTCONN;
- }
-
- memcpy(sa, &scp->peer, sizeof(struct sockaddr_dn));
- } else {
- memcpy(sa, &scp->addr, sizeof(struct sockaddr_dn));
- }
-
- release_sock(sk);
-
- return 0;
-}
-
-
-static unsigned int dn_poll(struct file *file, struct socket *sock, poll_table *wait)
-{
- struct sock *sk = sock->sk;
- struct dn_scp *scp = DN_SK(sk);
- int mask = datagram_poll(file, sock, wait);
-
- if (!skb_queue_empty(&scp->other_receive_queue))
- mask |= POLLRDBAND;
-
- return mask;
-}
-
-static int dn_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
-{
- struct sock *sk = sock->sk;
- struct dn_scp *scp = DN_SK(sk);
- int err = -EOPNOTSUPP;
- long amount = 0;
- struct sk_buff *skb;
- int val;
-
- switch(cmd)
- {
- case SIOCGIFADDR:
- case SIOCSIFADDR:
- return dn_dev_ioctl(cmd, (void __user *)arg);
-
- case SIOCATMARK:
- lock_sock(sk);
- val = !skb_queue_empty(&scp->other_receive_queue);
- if (scp->state != DN_RUN)
- val = -ENOTCONN;
- release_sock(sk);
- return val;
-
- case TIOCOUTQ:
- amount = sk->sk_sndbuf - atomic_read(&sk->sk_wmem_alloc);
- if (amount < 0)
- amount = 0;
- err = put_user(amount, (int __user *)arg);
- break;
-
- case TIOCINQ:
- lock_sock(sk);
- if ((skb = skb_peek(&scp->other_receive_queue)) != NULL) {
- amount = skb->len;
- } else {
- struct sk_buff *skb = sk->sk_receive_queue.next;
- for(;;) {
- if (skb ==
- (struct sk_buff *)&sk->sk_receive_queue)
- break;
- amount += skb->len;
- skb = skb->next;
- }
- }
- release_sock(sk);
- err = put_user(amount, (int __user *)arg);
- break;
-
- default:
- err = -ENOIOCTLCMD;
- break;
- }
-
- return err;
-}
-
-static int dn_listen(struct socket *sock, int backlog)
-{
- struct sock *sk = sock->sk;
- int err = -EINVAL;
-
- lock_sock(sk);
-
- if (sock_flag(sk, SOCK_ZAPPED))
- goto out;
-
- if ((DN_SK(sk)->state != DN_O) || (sk->sk_state == TCP_LISTEN))
- goto out;
-
- sk->sk_max_ack_backlog = backlog;
- sk->sk_ack_backlog = 0;
- sk->sk_state = TCP_LISTEN;
- err = 0;
- dn_rehash_sock(sk);
-
-out:
- release_sock(sk);
-
- return err;
-}
-
-
-static int dn_shutdown(struct socket *sock, int how)
-{
- struct sock *sk = sock->sk;
- struct dn_scp *scp = DN_SK(sk);
- int err = -ENOTCONN;
-
- lock_sock(sk);
-
- if (sock->state == SS_UNCONNECTED)
- goto out;
-
- err = 0;
- if (sock->state == SS_DISCONNECTING)
- goto out;
-
- err = -EINVAL;
- if (scp->state == DN_O)
- goto out;
-
- if (how != SHUTDOWN_MASK)
- goto out;
-
- sk->sk_shutdown = how;
- dn_destroy_sock(sk);
- err = 0;
-
-out:
- release_sock(sk);
-
- return err;
-}
-
-static int dn_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen)
-{
- struct sock *sk = sock->sk;
- int err;
-
- lock_sock(sk);
- err = __dn_setsockopt(sock, level, optname, optval, optlen, 0);
- release_sock(sk);
-
- return err;
-}
-
-static int __dn_setsockopt(struct socket *sock, int level,int optname, char __user *optval, int optlen, int flags)
-{
- struct sock *sk = sock->sk;
- struct dn_scp *scp = DN_SK(sk);
- long timeo;
- union {
- struct optdata_dn opt;
- struct accessdata_dn acc;
- int mode;
- unsigned long win;
- int val;
- unsigned char services;
- unsigned char info;
- } u;
- int err;
-
- if (optlen && !optval)
- return -EINVAL;
-
- if (optlen > sizeof(u))
- return -EINVAL;
-
- if (copy_from_user(&u, optval, optlen))
- return -EFAULT;
-
- switch(optname) {
- case DSO_CONDATA:
- if (sock->state == SS_CONNECTED)
- return -EISCONN;
- if ((scp->state != DN_O) && (scp->state != DN_CR))
- return -EINVAL;
-
- if (optlen != sizeof(struct optdata_dn))
- return -EINVAL;
-
- if (dn_ntohs(u.opt.opt_optl) > 16)
- return -EINVAL;
-
- memcpy(&scp->conndata_out, &u.opt, optlen);
- break;
-
- case DSO_DISDATA:
- if (sock->state != SS_CONNECTED && scp->accept_mode == ACC_IMMED)
- return -ENOTCONN;
-
- if (optlen != sizeof(struct optdata_dn))
- return -EINVAL;
-
- if (dn_ntohs(u.opt.opt_optl) > 16)
- return -EINVAL;
-
- memcpy(&scp->discdata_out, &u.opt, optlen);
- break;
-
- case DSO_CONACCESS:
- if (sock->state == SS_CONNECTED)
- return -EISCONN;
- if (scp->state != DN_O)
- return -EINVAL;
-
- if (optlen != sizeof(struct accessdata_dn))
- return -EINVAL;
-
- if ((u.acc.acc_accl > DN_MAXACCL) ||
- (u.acc.acc_passl > DN_MAXACCL) ||
- (u.acc.acc_userl > DN_MAXACCL))
- return -EINVAL;
-
- memcpy(&scp->accessdata, &u.acc, optlen);
- break;
-
- case DSO_ACCEPTMODE:
- if (sock->state == SS_CONNECTED)
- return -EISCONN;
- if (scp->state != DN_O)
- return -EINVAL;
-
- if (optlen != sizeof(int))
- return -EINVAL;
-
- if ((u.mode != ACC_IMMED) && (u.mode != ACC_DEFER))
- return -EINVAL;
-
- scp->accept_mode = (unsigned char)u.mode;
- break;
-
- case DSO_CONACCEPT:
-
- if (scp->state != DN_CR)
- return -EINVAL;
- timeo = sock_rcvtimeo(sk, 0);
- err = dn_confirm_accept(sk, &timeo, sk->sk_allocation);
- return err;
-
- case DSO_CONREJECT:
-
- if (scp->state != DN_CR)
- return -EINVAL;
-
- scp->state = DN_DR;
- sk->sk_shutdown = SHUTDOWN_MASK;
- dn_nsp_send_disc(sk, 0x38, 0, sk->sk_allocation);
- break;
-
- default:
-#ifdef CONFIG_NETFILTER
- return nf_setsockopt(sk, PF_DECnet, optname, optval, optlen);
-#endif
- case DSO_LINKINFO:
- case DSO_STREAM:
- case DSO_SEQPACKET:
- return -ENOPROTOOPT;
-
- case DSO_MAXWINDOW:
- if (optlen != sizeof(unsigned long))
- return -EINVAL;
- if (u.win > NSP_MAX_WINDOW)
- u.win = NSP_MAX_WINDOW;
- if (u.win == 0)
- return -EINVAL;
- scp->max_window = u.win;
- if (scp->snd_window > u.win)
- scp->snd_window = u.win;
- break;
-
- case DSO_NODELAY:
- if (optlen != sizeof(int))
- return -EINVAL;
- if (scp->nonagle == 2)
- return -EINVAL;
- scp->nonagle = (u.val == 0) ? 0 : 1;
- /* if (scp->nonagle == 1) { Push pending frames } */
- break;
-
- case DSO_CORK:
- if (optlen != sizeof(int))
- return -EINVAL;
- if (scp->nonagle == 1)
- return -EINVAL;
- scp->nonagle = (u.val == 0) ? 0 : 2;
- /* if (scp->nonagle == 0) { Push pending frames } */
- break;
-
- case DSO_SERVICES:
- if (optlen != sizeof(unsigned char))
- return -EINVAL;
- if ((u.services & ~NSP_FC_MASK) != 0x01)
- return -EINVAL;
- if ((u.services & NSP_FC_MASK) == NSP_FC_MASK)
- return -EINVAL;
- scp->services_loc = u.services;
- break;
-
- case DSO_INFO:
- if (optlen != sizeof(unsigned char))
- return -EINVAL;
- if (u.info & 0xfc)
- return -EINVAL;
- scp->info_loc = u.info;
- break;
- }
-
- return 0;
-}
-
-static int dn_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen)
-{
- struct sock *sk = sock->sk;
- int err;
-
- lock_sock(sk);
- err = __dn_getsockopt(sock, level, optname, optval, optlen, 0);
- release_sock(sk);
-
- return err;
-}
-
-static int __dn_getsockopt(struct socket *sock, int level,int optname, char __user *optval,int __user *optlen, int flags)
-{
- struct sock *sk = sock->sk;
- struct dn_scp *scp = DN_SK(sk);
- struct linkinfo_dn link;
- unsigned int r_len;
- void *r_data = NULL;
- unsigned int val;
-
- if(get_user(r_len , optlen))
- return -EFAULT;
-
- switch(optname) {
- case DSO_CONDATA:
- if (r_len > sizeof(struct optdata_dn))
- r_len = sizeof(struct optdata_dn);
- r_data = &scp->conndata_in;
- break;
-
- case DSO_DISDATA:
- if (r_len > sizeof(struct optdata_dn))
- r_len = sizeof(struct optdata_dn);
- r_data = &scp->discdata_in;
- break;
-
- case DSO_CONACCESS:
- if (r_len > sizeof(struct accessdata_dn))
- r_len = sizeof(struct accessdata_dn);
- r_data = &scp->accessdata;
- break;
-
- case DSO_ACCEPTMODE:
- if (r_len > sizeof(unsigned char))
- r_len = sizeof(unsigned char);
- r_data = &scp->accept_mode;
- break;
-
- case DSO_LINKINFO:
- if (r_len > sizeof(struct linkinfo_dn))
- r_len = sizeof(struct linkinfo_dn);
-
- switch(sock->state) {
- case SS_CONNECTING:
- link.idn_linkstate = LL_CONNECTING;
- break;
- case SS_DISCONNECTING:
- link.idn_linkstate = LL_DISCONNECTING;
- break;
- case SS_CONNECTED:
- link.idn_linkstate = LL_RUNNING;
- break;
- default:
- link.idn_linkstate = LL_INACTIVE;
- }
-
- link.idn_segsize = scp->segsize_rem;
- r_data = &link;
- break;
-
- default:
-#ifdef CONFIG_NETFILTER
- {
- int val, len;
-
- if(get_user(len, optlen))
- return -EFAULT;
-
- val = nf_getsockopt(sk, PF_DECnet, optname,
- optval, &len);
- if (val >= 0)
- val = put_user(len, optlen);
- return val;
- }
-#endif
- case DSO_STREAM:
- case DSO_SEQPACKET:
- case DSO_CONACCEPT:
- case DSO_CONREJECT:
- return -ENOPROTOOPT;
-
- case DSO_MAXWINDOW:
- if (r_len > sizeof(unsigned long))
- r_len = sizeof(unsigned long);
- r_data = &scp->max_window;
- break;
-
- case DSO_NODELAY:
- if (r_len > sizeof(int))
- r_len = sizeof(int);
- val = (scp->nonagle == 1);
- r_data = &val;
- break;
-
- case DSO_CORK:
- if (r_len > sizeof(int))
- r_len = sizeof(int);
- val = (scp->nonagle == 2);
- r_data = &val;
- break;
-
- case DSO_SERVICES:
- if (r_len > sizeof(unsigned char))
- r_len = sizeof(unsigned char);
- r_data = &scp->services_rem;
- break;
-
- case DSO_INFO:
- if (r_len > sizeof(unsigned char))
- r_len = sizeof(unsigned char);
- r_data = &scp->info_rem;
- break;
- }
-
- if (r_data) {
- if (copy_to_user(optval, r_data, r_len))
- return -EFAULT;
- if (put_user(r_len, optlen))
- return -EFAULT;
- }
-
- return 0;
-}
-
-
-static int dn_data_ready(struct sock *sk, struct sk_buff_head *q, int flags, int target)
-{
- struct sk_buff *skb = q->next;
- int len = 0;
-
- if (flags & MSG_OOB)
- return !skb_queue_empty(q) ? 1 : 0;
-
- while(skb != (struct sk_buff *)q) {
- struct dn_skb_cb *cb = DN_SKB_CB(skb);
- len += skb->len;
-
- if (cb->nsp_flags & 0x40) {
- /* SOCK_SEQPACKET reads to EOM */
- if (sk->sk_type == SOCK_SEQPACKET)
- return 1;
- /* so does SOCK_STREAM unless WAITALL is specified */
- if (!(flags & MSG_WAITALL))
- return 1;
- }
-
- /* minimum data length for read exceeded */
- if (len >= target)
- return 1;
-
- skb = skb->next;
- }
-
- return 0;
-}
-
-
-static int dn_recvmsg(struct kiocb *iocb, struct socket *sock,
- struct msghdr *msg, size_t size, int flags)
-{
- struct sock *sk = sock->sk;
- struct dn_scp *scp = DN_SK(sk);
- struct sk_buff_head *queue = &sk->sk_receive_queue;
- size_t target = size > 1 ? 1 : 0;
- size_t copied = 0;
- int rv = 0;
- struct sk_buff *skb, *nskb;
- struct dn_skb_cb *cb = NULL;
- unsigned char eor = 0;
- long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
-
- lock_sock(sk);
-
- if (sock_flag(sk, SOCK_ZAPPED)) {
- rv = -EADDRNOTAVAIL;
- goto out;
- }
-
- if (sk->sk_shutdown & RCV_SHUTDOWN) {
- rv = 0;
- goto out;
- }
-
- rv = dn_check_state(sk, NULL, 0, &timeo, flags);
- if (rv)
- goto out;
-
- if (flags & ~(MSG_CMSG_COMPAT|MSG_PEEK|MSG_OOB|MSG_WAITALL|MSG_DONTWAIT|MSG_NOSIGNAL)) {
- rv = -EOPNOTSUPP;
- goto out;
- }
-
- if (flags & MSG_OOB)
- queue = &scp->other_receive_queue;
-
- if (flags & MSG_WAITALL)
- target = size;
-
-
- /*
- * See if there is data ready to read, sleep if there isn't
- */
- for(;;) {
- if (sk->sk_err)
- goto out;
-
- if (!skb_queue_empty(&scp->other_receive_queue)) {
- if (!(flags & MSG_OOB)) {
- msg->msg_flags |= MSG_OOB;
- if (!scp->other_report) {
- scp->other_report = 1;
- goto out;
- }
- }
- }
-
- if (scp->state != DN_RUN)
- goto out;
-
- if (signal_pending(current)) {
- rv = sock_intr_errno(timeo);
- goto out;
- }
-
- if (dn_data_ready(sk, queue, flags, target))
- break;
-
- if (flags & MSG_DONTWAIT) {
- rv = -EWOULDBLOCK;
- goto out;
- }
-
- set_bit(SOCK_ASYNC_WAITDATA, &sock->flags);
- SOCK_SLEEP_PRE(sk)
-
- if (!dn_data_ready(sk, queue, flags, target))
- schedule();
-
- SOCK_SLEEP_POST(sk)
- clear_bit(SOCK_ASYNC_WAITDATA, &sock->flags);
- }
-
- for(skb = queue->next; skb != (struct sk_buff *)queue; skb = nskb) {
- unsigned int chunk = skb->len;
- cb = DN_SKB_CB(skb);
-
- if ((chunk + copied) > size)
- chunk = size - copied;
-
- if (memcpy_toiovec(msg->msg_iov, skb->data, chunk)) {
- rv = -EFAULT;
- break;
- }
- copied += chunk;
-
- if (!(flags & MSG_PEEK))
- skb_pull(skb, chunk);
-
- eor = cb->nsp_flags & 0x40;
- nskb = skb->next;
-
- if (skb->len == 0) {
- skb_unlink(skb, queue);
- kfree_skb(skb);
- /*
- * N.B. Don't refer to skb or cb after this point
- * in loop.
- */
- if ((scp->flowloc_sw == DN_DONTSEND) && !dn_congested(sk)) {
- scp->flowloc_sw = DN_SEND;
- dn_nsp_send_link(sk, DN_SEND, 0);
- }
- }
-
- if (eor) {
- if (sk->sk_type == SOCK_SEQPACKET)
- break;
- if (!(flags & MSG_WAITALL))
- break;
- }
-
- if (flags & MSG_OOB)
- break;
-
- if (copied >= target)
- break;
- }
-
- rv = copied;
-
-
- if (eor && (sk->sk_type == SOCK_SEQPACKET))
- msg->msg_flags |= MSG_EOR;
-
-out:
- if (rv == 0)
- rv = (flags & MSG_PEEK) ? -sk->sk_err : sock_error(sk);
-
- if ((rv >= 0) && msg->msg_name) {
- memcpy(msg->msg_name, &scp->peer, sizeof(struct sockaddr_dn));
- msg->msg_namelen = sizeof(struct sockaddr_dn);
- }
-
- release_sock(sk);
-
- return rv;
-}
-
-
-static inline int dn_queue_too_long(struct dn_scp *scp, struct sk_buff_head *queue, int flags)
-{
- unsigned char fctype = scp->services_rem & NSP_FC_MASK;
- if (skb_queue_len(queue) >= scp->snd_window)
- return 1;
- if (fctype != NSP_FC_NONE) {
- if (flags & MSG_OOB) {
- if (scp->flowrem_oth == 0)
- return 1;
- } else {
- if (scp->flowrem_dat == 0)
- return 1;
- }
- }
- return 0;
-}
-
-/*
- * The DECnet spec requires the the "routing layer" accepts packets which
- * are at least 230 bytes in size. This excludes any headers which the NSP
- * layer might add, so we always assume that we'll be using the maximal
- * length header on data packets. The variation in length is due to the
- * inclusion (or not) of the two 16 bit acknowledgement fields so it doesn't
- * make much practical difference.
- */
-unsigned dn_mss_from_pmtu(struct net_device *dev, int mtu)
-{
- unsigned mss = 230 - DN_MAX_NSP_DATA_HEADER;
- if (dev) {
- struct dn_dev *dn_db = dev->dn_ptr;
- mtu -= LL_RESERVED_SPACE(dev);
- if (dn_db->use_long)
- mtu -= 21;
- else
- mtu -= 6;
- mtu -= DN_MAX_NSP_DATA_HEADER;
- } else {
- /*
- * 21 = long header, 16 = guess at MAC header length
- */
- mtu -= (21 + DN_MAX_NSP_DATA_HEADER + 16);
- }
- if (mtu > mss)
- mss = mtu;
- return mss;
-}
-
-static inline unsigned int dn_current_mss(struct sock *sk, int flags)
-{
- struct dst_entry *dst = __sk_dst_get(sk);
- struct dn_scp *scp = DN_SK(sk);
- int mss_now = min_t(int, scp->segsize_loc, scp->segsize_rem);
-
- /* Other data messages are limited to 16 bytes per packet */
- if (flags & MSG_OOB)
- return 16;
-
- /* This works out the maximum size of segment we can send out */
- if (dst) {
- u32 mtu = dst_mtu(dst);
- mss_now = min_t(int, dn_mss_from_pmtu(dst->dev, mtu), mss_now);
- }
-
- return mss_now;
-}
-
-/*
- * N.B. We get the timeout wrong here, but then we always did get it
- * wrong before and this is another step along the road to correcting
- * it. It ought to get updated each time we pass through the routine,
- * but in practise it probably doesn't matter too much for now.
- */
-static inline struct sk_buff *dn_alloc_send_pskb(struct sock *sk,
- unsigned long datalen, int noblock,
- int *errcode)
-{
- struct sk_buff *skb = sock_alloc_send_skb(sk, datalen,
- noblock, errcode);
- if (skb) {
- skb->protocol = __constant_htons(ETH_P_DNA_RT);
- skb->pkt_type = PACKET_OUTGOING;
- }
- return skb;
-}
-
-static int dn_sendmsg(struct kiocb *iocb, struct socket *sock,
- struct msghdr *msg, size_t size)
-{
- struct sock *sk = sock->sk;
- struct dn_scp *scp = DN_SK(sk);
- size_t mss;
- struct sk_buff_head *queue = &scp->data_xmit_queue;
- int flags = msg->msg_flags;
- int err = 0;
- size_t sent = 0;
- int addr_len = msg->msg_namelen;
- struct sockaddr_dn *addr = (struct sockaddr_dn *)msg->msg_name;
- struct sk_buff *skb = NULL;
- struct dn_skb_cb *cb;
- size_t len;
- unsigned char fctype;
- long timeo;
-
- if (flags & ~(MSG_TRYHARD|MSG_OOB|MSG_DONTWAIT|MSG_EOR|MSG_NOSIGNAL|MSG_MORE|MSG_CMSG_COMPAT))
- return -EOPNOTSUPP;
-
- if (addr_len && (addr_len != sizeof(struct sockaddr_dn)))
- return -EINVAL;
-
- lock_sock(sk);
- timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
- /*
- * The only difference between stream sockets and sequenced packet
- * sockets is that the stream sockets always behave as if MSG_EOR
- * has been set.
- */
- if (sock->type == SOCK_STREAM) {
- if (flags & MSG_EOR) {
- err = -EINVAL;
- goto out;
- }
- flags |= MSG_EOR;
- }
-
-
- err = dn_check_state(sk, addr, addr_len, &timeo, flags);
- if (err)
- goto out_err;
-
- if (sk->sk_shutdown & SEND_SHUTDOWN) {
- err = -EPIPE;
- if (!(flags & MSG_NOSIGNAL))
- send_sig(SIGPIPE, current, 0);
- goto out_err;
- }
-
- if ((flags & MSG_TRYHARD) && sk->sk_dst_cache)
- dst_negative_advice(&sk->sk_dst_cache);
-
- mss = scp->segsize_rem;
- fctype = scp->services_rem & NSP_FC_MASK;
-
- mss = dn_current_mss(sk, flags);
-
- if (flags & MSG_OOB) {
- queue = &scp->other_xmit_queue;
- if (size > mss) {
- err = -EMSGSIZE;
- goto out;
- }
- }
-
- scp->persist_fxn = dn_nsp_xmit_timeout;
-
- while(sent < size) {
- err = sock_error(sk);
- if (err)
- goto out;
-
- if (signal_pending(current)) {
- err = sock_intr_errno(timeo);
- goto out;
- }
-
- /*
- * Calculate size that we wish to send.
- */
- len = size - sent;
-
- if (len > mss)
- len = mss;
-
- /*
- * Wait for queue size to go down below the window
- * size.
- */
- if (dn_queue_too_long(scp, queue, flags)) {
- if (flags & MSG_DONTWAIT) {
- err = -EWOULDBLOCK;
- goto out;
- }
-
- SOCK_SLEEP_PRE(sk)
-
- if (dn_queue_too_long(scp, queue, flags))
- schedule();
-
- SOCK_SLEEP_POST(sk)
-
- continue;
- }
-
- /*
- * Get a suitably sized skb.
- * 64 is a bit of a hack really, but its larger than any
- * link-layer headers and has served us well as a good
- * guess as to their real length.
- */
- skb = dn_alloc_send_pskb(sk, len + 64 + DN_MAX_NSP_DATA_HEADER,
- flags & MSG_DONTWAIT, &err);
-
- if (err)
- break;
-
- if (!skb)
- continue;
-
- cb = DN_SKB_CB(skb);
-
- skb_reserve(skb, 64 + DN_MAX_NSP_DATA_HEADER);
-
- if (memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len)) {
- err = -EFAULT;
- goto out;
- }
-
- if (flags & MSG_OOB) {
- cb->nsp_flags = 0x30;
- if (fctype != NSP_FC_NONE)
- scp->flowrem_oth--;
- } else {
- cb->nsp_flags = 0x00;
- if (scp->seg_total == 0)
- cb->nsp_flags |= 0x20;
-
- scp->seg_total += len;
-
- if (((sent + len) == size) && (flags & MSG_EOR)) {
- cb->nsp_flags |= 0x40;
- scp->seg_total = 0;
- if (fctype == NSP_FC_SCMC)
- scp->flowrem_dat--;
- }
- if (fctype == NSP_FC_SRC)
- scp->flowrem_dat--;
- }
-
- sent += len;
- dn_nsp_queue_xmit(sk, skb, sk->sk_allocation, flags & MSG_OOB);
- skb = NULL;
-
- scp->persist = dn_nsp_persist(sk);
-
- }
-out:
-
- if (skb)
- kfree_skb(skb);
-
- release_sock(sk);
-
- return sent ? sent : err;
-
-out_err:
- err = sk_stream_error(sk, flags, err);
- release_sock(sk);
- return err;
-}
-
-static int dn_device_event(struct notifier_block *this, unsigned long event,
- void *ptr)
-{
- struct net_device *dev = (struct net_device *)ptr;
-
- switch(event) {
- case NETDEV_UP:
- dn_dev_up(dev);
- break;
- case NETDEV_DOWN:
- dn_dev_down(dev);
- break;
- default:
- break;
- }
-
- return NOTIFY_DONE;
-}
-
-static struct notifier_block dn_dev_notifier = {
- .notifier_call = dn_device_event,
-};
-
-extern int dn_route_rcv(struct sk_buff *, struct net_device *, struct packet_type *, struct net_device *);
-
-static struct packet_type dn_dix_packet_type = {
- .type = __constant_htons(ETH_P_DNA_RT),
- .dev = NULL, /* All devices */
- .func = dn_route_rcv,
-};
-
-#ifdef CONFIG_PROC_FS
-struct dn_iter_state {
- int bucket;
-};
-
-static struct sock *dn_socket_get_first(struct seq_file *seq)
-{
- struct dn_iter_state *state = seq->private;
- struct sock *n = NULL;
-
- for(state->bucket = 0;
- state->bucket < DN_SK_HASH_SIZE;
- ++state->bucket) {
- n = sk_head(&dn_sk_hash[state->bucket]);
- if (n)
- break;
- }
-
- return n;
-}
-
-static struct sock *dn_socket_get_next(struct seq_file *seq,
- struct sock *n)
-{
- struct dn_iter_state *state = seq->private;
-
- n = sk_next(n);
-try_again:
- if (n)
- goto out;
- if (++state->bucket >= DN_SK_HASH_SIZE)
- goto out;
- n = sk_head(&dn_sk_hash[state->bucket]);
- goto try_again;
-out:
- return n;
-}
-
-static struct sock *socket_get_idx(struct seq_file *seq, loff_t *pos)
-{
- struct sock *sk = dn_socket_get_first(seq);
-
- if (sk) {
- while(*pos && (sk = dn_socket_get_next(seq, sk)))
- --*pos;
- }
- return *pos ? NULL : sk;
-}
-
-static void *dn_socket_get_idx(struct seq_file *seq, loff_t pos)
-{
- void *rc;
- read_lock_bh(&dn_hash_lock);
- rc = socket_get_idx(seq, &pos);
- if (!rc) {
- read_unlock_bh(&dn_hash_lock);
- }
- return rc;
-}
-
-static void *dn_socket_seq_start(struct seq_file *seq, loff_t *pos)
-{
- return *pos ? dn_socket_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
-}
-
-static void *dn_socket_seq_next(struct seq_file *seq, void *v, loff_t *pos)
-{
- void *rc;
-
- if (v == SEQ_START_TOKEN) {
- rc = dn_socket_get_idx(seq, 0);
- goto out;
- }
-
- rc = dn_socket_get_next(seq, v);
- if (rc)
- goto out;
- read_unlock_bh(&dn_hash_lock);
-out:
- ++*pos;
- return rc;
-}
-
-static void dn_socket_seq_stop(struct seq_file *seq, void *v)
-{
- if (v && v != SEQ_START_TOKEN)
- read_unlock_bh(&dn_hash_lock);
-}
-
-#define IS_NOT_PRINTABLE(x) ((x) < 32 || (x) > 126)
-
-static void dn_printable_object(struct sockaddr_dn *dn, unsigned char *buf)
-{
- int i;
-
- switch (dn_ntohs(dn->sdn_objnamel)) {
- case 0:
- sprintf(buf, "%d", dn->sdn_objnum);
- break;
- default:
- for (i = 0; i < dn_ntohs(dn->sdn_objnamel); i++) {
- buf[i] = dn->sdn_objname[i];
- if (IS_NOT_PRINTABLE(buf[i]))
- buf[i] = '.';
- }
- buf[i] = 0;
- }
-}
-
-static char *dn_state2asc(unsigned char state)
-{
- switch(state) {
- case DN_O:
- return "OPEN";
- case DN_CR:
- return " CR";
- case DN_DR:
- return " DR";
- case DN_DRC:
- return " DRC";
- case DN_CC:
- return " CC";
- case DN_CI:
- return " CI";
- case DN_NR:
- return " NR";
- case DN_NC:
- return " NC";
- case DN_CD:
- return " CD";
- case DN_RJ:
- return " RJ";
- case DN_RUN:
- return " RUN";
- case DN_DI:
- return " DI";
- case DN_DIC:
- return " DIC";
- case DN_DN:
- return " DN";
- case DN_CL:
- return " CL";
- case DN_CN:
- return " CN";
- }
-
- return "????";
-}
-
-static inline void dn_socket_format_entry(struct seq_file *seq, struct sock *sk)
-{
- struct dn_scp *scp = DN_SK(sk);
- char buf1[DN_ASCBUF_LEN];
- char buf2[DN_ASCBUF_LEN];
- char local_object[DN_MAXOBJL+3];
- char remote_object[DN_MAXOBJL+3];
-
- dn_printable_object(&scp->addr, local_object);
- dn_printable_object(&scp->peer, remote_object);
-
- seq_printf(seq,
- "%6s/%04X %04d:%04d %04d:%04d %01d %-16s "
- "%6s/%04X %04d:%04d %04d:%04d %01d %-16s %4s %s\n",
- dn_addr2asc(dn_ntohs(dn_saddr2dn(&scp->addr)), buf1),
- scp->addrloc,
- scp->numdat,
- scp->numoth,
- scp->ackxmt_dat,
- scp->ackxmt_oth,
- scp->flowloc_sw,
- local_object,
- dn_addr2asc(dn_ntohs(dn_saddr2dn(&scp->peer)), buf2),
- scp->addrrem,
- scp->numdat_rcv,
- scp->numoth_rcv,
- scp->ackrcv_dat,
- scp->ackrcv_oth,
- scp->flowrem_sw,
- remote_object,
- dn_state2asc(scp->state),
- ((scp->accept_mode == ACC_IMMED) ? "IMMED" : "DEFER"));
-}
-
-static int dn_socket_seq_show(struct seq_file *seq, void *v)
-{
- if (v == SEQ_START_TOKEN) {
- seq_puts(seq, "Local Remote\n");
- } else {
- dn_socket_format_entry(seq, v);
- }
- return 0;
-}
-
-static struct seq_operations dn_socket_seq_ops = {
- .start = dn_socket_seq_start,
- .next = dn_socket_seq_next,
- .stop = dn_socket_seq_stop,
- .show = dn_socket_seq_show,
-};
-
-static int dn_socket_seq_open(struct inode *inode, struct file *file)
-{
- struct seq_file *seq;
- int rc = -ENOMEM;
- struct dn_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
-
- if (!s)
- goto out;
-
- rc = seq_open(file, &dn_socket_seq_ops);
- if (rc)
- goto out_kfree;
-
- seq = file->private_data;
- seq->private = s;
- memset(s, 0, sizeof(*s));
-out:
- return rc;
-out_kfree:
- kfree(s);
- goto out;
-}
-
-static struct file_operations dn_socket_seq_fops = {
- .owner = THIS_MODULE,
- .open = dn_socket_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release_private,
-};
-#endif
-
-static struct net_proto_family dn_family_ops = {
- .family = AF_DECnet,
- .create = dn_create,
- .owner = THIS_MODULE,
-};
-
-static const struct proto_ops dn_proto_ops = {
- .family = AF_DECnet,
- .owner = THIS_MODULE,
- .release = dn_release,
- .bind = dn_bind,
- .connect = dn_connect,
- .socketpair = sock_no_socketpair,
- .accept = dn_accept,
- .getname = dn_getname,
- .poll = dn_poll,
- .ioctl = dn_ioctl,
- .listen = dn_listen,
- .shutdown = dn_shutdown,
- .setsockopt = dn_setsockopt,
- .getsockopt = dn_getsockopt,
- .sendmsg = dn_sendmsg,
- .recvmsg = dn_recvmsg,
- .mmap = sock_no_mmap,
- .sendpage = sock_no_sendpage,
-};
-
-void dn_register_sysctl(void);
-void dn_unregister_sysctl(void);
-
-MODULE_DESCRIPTION("The Linux DECnet Network Protocol");
-MODULE_AUTHOR("Linux DECnet Project Team");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS_NETPROTO(PF_DECnet);
-
-static char banner[] __initdata = KERN_INFO "NET4: DECnet for Linux: V.2.5.68s (C) 1995-2003 Linux DECnet Project Team\n";
-
-static int __init decnet_init(void)
-{
- int rc;
-
- printk(banner);
-
- rc = proto_register(&dn_proto, 1);
- if (rc != 0)
- goto out;
-
- dn_neigh_init();
- dn_dev_init();
- dn_route_init();
- dn_fib_init();
-
- sock_register(&dn_family_ops);
- dev_add_pack(&dn_dix_packet_type);
- register_netdevice_notifier(&dn_dev_notifier);
-
- proc_net_fops_create("decnet", S_IRUGO, &dn_socket_seq_fops);
- dn_register_sysctl();
-out:
- return rc;
-
-}
-module_init(decnet_init);
-
-/*
- * Prevent DECnet module unloading until its fixed properly.
- * Requires an audit of the code to check for memory leaks and
- * initialisation problems etc.
- */
-#if 0
-static void __exit decnet_exit(void)
-{
- sock_unregister(AF_DECnet);
- dev_remove_pack(&dn_dix_packet_type);
-
- dn_unregister_sysctl();
-
- unregister_netdevice_notifier(&dn_dev_notifier);
-
- dn_route_cleanup();
- dn_dev_cleanup();
- dn_neigh_cleanup();
- dn_fib_cleanup();
-
- proc_net_remove("decnet");
-
- proto_unregister(&dn_proto);
-}
-module_exit(decnet_exit);
-#endif
diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c
deleted file mode 100644
index 0b9d4c955154..000000000000
--- a/net/decnet/dn_dev.c
+++ /dev/null
@@ -1,1512 +0,0 @@
-/*
- * DECnet An implementation of the DECnet protocol suite for the LINUX
- * operating system. DECnet is implemented using the BSD Socket
- * interface as the means of communication with the user level.
- *
- * DECnet Device Layer
- *
- * Authors: Steve Whitehouse <SteveW@ACM.org>
- * Eduardo Marcelo Serrat <emserrat@geocities.com>
- *
- * Changes:
- * Steve Whitehouse : Devices now see incoming frames so they
- * can mark on who it came from.
- * Steve Whitehouse : Fixed bug in creating neighbours. Each neighbour
- * can now have a device specific setup func.
- * Steve Whitehouse : Added /proc/sys/net/decnet/conf/<dev>/
- * Steve Whitehouse : Fixed bug which sometimes killed timer
- * Steve Whitehouse : Multiple ifaddr support
- * Steve Whitehouse : SIOCGIFCONF is now a compile time option
- * Steve Whitehouse : /proc/sys/net/decnet/conf/<sys>/forwarding
- * Steve Whitehouse : Removed timer1 - it's a user space issue now
- * Patrick Caulfield : Fixed router hello message format
- * Steve Whitehouse : Got rid of constant sizes for blksize for
- * devices. All mtu based now.
- */
-
-#include <linux/capability.h>
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/init.h>
-#include <linux/net.h>
-#include <linux/netdevice.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-#include <linux/timer.h>
-#include <linux/string.h>
-#include <linux/if_addr.h>
-#include <linux/if_arp.h>
-#include <linux/if_ether.h>
-#include <linux/skbuff.h>
-#include <linux/sysctl.h>
-#include <linux/notifier.h>
-#include <asm/uaccess.h>
-#include <asm/system.h>
-#include <net/neighbour.h>
-#include <net/dst.h>
-#include <net/flow.h>
-#include <net/fib_rules.h>
-#include <net/netlink.h>
-#include <net/dn.h>
-#include <net/dn_dev.h>
-#include <net/dn_route.h>
-#include <net/dn_neigh.h>
-#include <net/dn_fib.h>
-
-#define DN_IFREQ_SIZE (sizeof(struct ifreq) - sizeof(struct sockaddr) + sizeof(struct sockaddr_dn))
-
-static char dn_rt_all_end_mcast[ETH_ALEN] = {0xAB,0x00,0x00,0x04,0x00,0x00};
-static char dn_rt_all_rt_mcast[ETH_ALEN] = {0xAB,0x00,0x00,0x03,0x00,0x00};
-static char dn_hiord[ETH_ALEN] = {0xAA,0x00,0x04,0x00,0x00,0x00};
-static unsigned char dn_eco_version[3] = {0x02,0x00,0x00};
-
-extern struct neigh_table dn_neigh_table;
-
-/*
- * decnet_address is kept in network order.
- */
-__le16 decnet_address = 0;
-
-static DEFINE_RWLOCK(dndev_lock);
-static struct net_device *decnet_default_device;
-static BLOCKING_NOTIFIER_HEAD(dnaddr_chain);
-
-static struct dn_dev *dn_dev_create(struct net_device *dev, int *err);
-static void dn_dev_delete(struct net_device *dev);
-static void dn_ifaddr_notify(int event, struct dn_ifaddr *ifa);
-
-static int dn_eth_up(struct net_device *);
-static void dn_eth_down(struct net_device *);
-static void dn_send_brd_hello(struct net_device *dev, struct dn_ifaddr *ifa);
-static void dn_send_ptp_hello(struct net_device *dev, struct dn_ifaddr *ifa);
-
-static struct dn_dev_parms dn_dev_list[] = {
-{
- .type = ARPHRD_ETHER, /* Ethernet */
- .mode = DN_DEV_BCAST,
- .state = DN_DEV_S_RU,
- .t2 = 1,
- .t3 = 10,
- .name = "ethernet",
- .ctl_name = NET_DECNET_CONF_ETHER,
- .up = dn_eth_up,
- .down = dn_eth_down,
- .timer3 = dn_send_brd_hello,
-},
-{
- .type = ARPHRD_IPGRE, /* DECnet tunneled over GRE in IP */
- .mode = DN_DEV_BCAST,
- .state = DN_DEV_S_RU,
- .t2 = 1,
- .t3 = 10,
- .name = "ipgre",
- .ctl_name = NET_DECNET_CONF_GRE,
- .timer3 = dn_send_brd_hello,
-},
-#if 0
-{
- .type = ARPHRD_X25, /* Bog standard X.25 */
- .mode = DN_DEV_UCAST,
- .state = DN_DEV_S_DS,
- .t2 = 1,
- .t3 = 120,
- .name = "x25",
- .ctl_name = NET_DECNET_CONF_X25,
- .timer3 = dn_send_ptp_hello,
-},
-#endif
-#if 0
-{
- .type = ARPHRD_PPP, /* DECnet over PPP */
- .mode = DN_DEV_BCAST,
- .state = DN_DEV_S_RU,
- .t2 = 1,
- .t3 = 10,
- .name = "ppp",
- .ctl_name = NET_DECNET_CONF_PPP,
- .timer3 = dn_send_brd_hello,
-},
-#endif
-{
- .type = ARPHRD_DDCMP, /* DECnet over DDCMP */
- .mode = DN_DEV_UCAST,
- .state = DN_DEV_S_DS,
- .t2 = 1,
- .t3 = 120,
- .name = "ddcmp",
- .ctl_name = NET_DECNET_CONF_DDCMP,
- .timer3 = dn_send_ptp_hello,
-},
-{
- .type = ARPHRD_LOOPBACK, /* Loopback interface - always last */
- .mode = DN_DEV_BCAST,
- .state = DN_DEV_S_RU,
- .t2 = 1,
- .t3 = 10,
- .name = "loopback",
- .ctl_name = NET_DECNET_CONF_LOOPBACK,
- .timer3 = dn_send_brd_hello,
-}
-};
-
-#define DN_DEV_LIST_SIZE (sizeof(dn_dev_list)/sizeof(struct dn_dev_parms))
-
-#define DN_DEV_PARMS_OFFSET(x) ((int) ((char *) &((struct dn_dev_parms *)0)->x))
-
-#ifdef CONFIG_SYSCTL
-
-static int min_t2[] = { 1 };
-static int max_t2[] = { 60 }; /* No max specified, but this seems sensible */
-static int min_t3[] = { 1 };
-static int max_t3[] = { 8191 }; /* Must fit in 16 bits when multiplied by BCT3MULT or T3MULT */
-
-static int min_priority[1];
-static int max_priority[] = { 127 }; /* From DECnet spec */
-
-static int dn_forwarding_proc(ctl_table *, int, struct file *,
- void __user *, size_t *, loff_t *);
-static int dn_forwarding_sysctl(ctl_table *table, int __user *name, int nlen,
- void __user *oldval, size_t __user *oldlenp,
- void __user *newval, size_t newlen,
- void **context);
-
-static struct dn_dev_sysctl_table {
- struct ctl_table_header *sysctl_header;
- ctl_table dn_dev_vars[5];
- ctl_table dn_dev_dev[2];
- ctl_table dn_dev_conf_dir[2];
- ctl_table dn_dev_proto_dir[2];
- ctl_table dn_dev_root_dir[2];
-} dn_dev_sysctl = {
- NULL,
- {
- {
- .ctl_name = NET_DECNET_CONF_DEV_FORWARDING,
- .procname = "forwarding",
- .data = (void *)DN_DEV_PARMS_OFFSET(forwarding),
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = dn_forwarding_proc,
- .strategy = dn_forwarding_sysctl,
- },
- {
- .ctl_name = NET_DECNET_CONF_DEV_PRIORITY,
- .procname = "priority",
- .data = (void *)DN_DEV_PARMS_OFFSET(priority),
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .strategy = sysctl_intvec,
- .extra1 = &min_priority,
- .extra2 = &max_priority
- },
- {
- .ctl_name = NET_DECNET_CONF_DEV_T2,
- .procname = "t2",
- .data = (void *)DN_DEV_PARMS_OFFSET(t2),
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .strategy = sysctl_intvec,
- .extra1 = &min_t2,
- .extra2 = &max_t2
- },
- {
- .ctl_name = NET_DECNET_CONF_DEV_T3,
- .procname = "t3",
- .data = (void *)DN_DEV_PARMS_OFFSET(t3),
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .strategy = sysctl_intvec,
- .extra1 = &min_t3,
- .extra2 = &max_t3
- },
- {0}
- },
- {{
- .ctl_name = 0,
- .procname = "",
- .mode = 0555,
- .child = dn_dev_sysctl.dn_dev_vars
- }, {0}},
- {{
- .ctl_name = NET_DECNET_CONF,
- .procname = "conf",
- .mode = 0555,
- .child = dn_dev_sysctl.dn_dev_dev
- }, {0}},
- {{
- .ctl_name = NET_DECNET,
- .procname = "decnet",
- .mode = 0555,
- .child = dn_dev_sysctl.dn_dev_conf_dir
- }, {0}},
- {{
- .ctl_name = CTL_NET,
- .procname = "net",
- .mode = 0555,
- .child = dn_dev_sysctl.dn_dev_proto_dir
- }, {0}}
-};
-
-static void dn_dev_sysctl_register(struct net_device *dev, struct dn_dev_parms *parms)
-{
- struct dn_dev_sysctl_table *t;
- int i;
-
- t = kmemdup(&dn_dev_sysctl, sizeof(*t), GFP_KERNEL);
- if (t == NULL)
- return;
-
- for(i = 0; i < ARRAY_SIZE(t->dn_dev_vars) - 1; i++) {
- long offset = (long)t->dn_dev_vars[i].data;
- t->dn_dev_vars[i].data = ((char *)parms) + offset;
- t->dn_dev_vars[i].de = NULL;
- }
-
- if (dev) {
- t->dn_dev_dev[0].procname = dev->name;
- t->dn_dev_dev[0].ctl_name = dev->ifindex;
- } else {
- t->dn_dev_dev[0].procname = parms->name;
- t->dn_dev_dev[0].ctl_name = parms->ctl_name;
- }
-
- t->dn_dev_dev[0].child = t->dn_dev_vars;
- t->dn_dev_dev[0].de = NULL;
- t->dn_dev_conf_dir[0].child = t->dn_dev_dev;
- t->dn_dev_conf_dir[0].de = NULL;
- t->dn_dev_proto_dir[0].child = t->dn_dev_conf_dir;
- t->dn_dev_proto_dir[0].de = NULL;
- t->dn_dev_root_dir[0].child = t->dn_dev_proto_dir;
- t->dn_dev_root_dir[0].de = NULL;
- t->dn_dev_vars[0].extra1 = (void *)dev;
-
- t->sysctl_header = register_sysctl_table(t->dn_dev_root_dir, 0);
- if (t->sysctl_header == NULL)
- kfree(t);
- else
- parms->sysctl = t;
-}
-
-static void dn_dev_sysctl_unregister(struct dn_dev_parms *parms)
-{
- if (parms->sysctl) {
- struct dn_dev_sysctl_table *t = parms->sysctl;
- parms->sysctl = NULL;
- unregister_sysctl_table(t->sysctl_header);
- kfree(t);
- }
-}
-
-static int dn_forwarding_proc(ctl_table *table, int write,
- struct file *filep,
- void __user *buffer,
- size_t *lenp, loff_t *ppos)
-{
-#ifdef CONFIG_DECNET_ROUTER
- struct net_device *dev = table->extra1;
- struct dn_dev *dn_db;
- int err;
- int tmp, old;
-
- if (table->extra1 == NULL)
- return -EINVAL;
-
- dn_db = dev->dn_ptr;
- old = dn_db->parms.forwarding;
-
- err = proc_dointvec(table, write, filep, buffer, lenp, ppos);
-
- if ((err >= 0) && write) {
- if (dn_db->parms.forwarding < 0)
- dn_db->parms.forwarding = 0;
- if (dn_db->parms.forwarding > 2)
- dn_db->parms.forwarding = 2;
- /*
- * What an ugly hack this is... its works, just. It
- * would be nice if sysctl/proc were just that little
- * bit more flexible so I don't have to write a special
- * routine, or suffer hacks like this - SJW
- */
- tmp = dn_db->parms.forwarding;
- dn_db->parms.forwarding = old;
- if (dn_db->parms.down)
- dn_db->parms.down(dev);
- dn_db->parms.forwarding = tmp;
- if (dn_db->parms.up)
- dn_db->parms.up(dev);
- }
-
- return err;
-#else
- return -EINVAL;
-#endif
-}
-
-static int dn_forwarding_sysctl(ctl_table *table, int __user *name, int nlen,
- void __user *oldval, size_t __user *oldlenp,
- void __user *newval, size_t newlen,
- void **context)
-{
-#ifdef CONFIG_DECNET_ROUTER
- struct net_device *dev = table->extra1;
- struct dn_dev *dn_db;
- int value;
-
- if (table->extra1 == NULL)
- return -EINVAL;
-
- dn_db = dev->dn_ptr;
-
- if (newval && newlen) {
- if (newlen != sizeof(int))
- return -EINVAL;
-
- if (get_user(value, (int __user *)newval))
- return -EFAULT;
- if (value < 0)
- return -EINVAL;
- if (value > 2)
- return -EINVAL;
-
- if (dn_db->parms.down)
- dn_db->parms.down(dev);
- dn_db->parms.forwarding = value;
- if (dn_db->parms.up)
- dn_db->parms.up(dev);
- }
-
- return 0;
-#else
- return -EINVAL;
-#endif
-}
-
-#else /* CONFIG_SYSCTL */
-static void dn_dev_sysctl_unregister(struct dn_dev_parms *parms)
-{
-}
-static void dn_dev_sysctl_register(struct net_device *dev, struct dn_dev_parms *parms)
-{
-}
-
-#endif /* CONFIG_SYSCTL */
-
-static inline __u16 mtu2blksize(struct net_device *dev)
-{
- u32 blksize = dev->mtu;
- if (blksize > 0xffff)
- blksize = 0xffff;
-
- if (dev->type == ARPHRD_ETHER ||
- dev->type == ARPHRD_PPP ||
- dev->type == ARPHRD_IPGRE ||
- dev->type == ARPHRD_LOOPBACK)
- blksize -= 2;
-
- return (__u16)blksize;
-}
-
-static struct dn_ifaddr *dn_dev_alloc_ifa(void)
-{
- struct dn_ifaddr *ifa;
-
- ifa = kzalloc(sizeof(*ifa), GFP_KERNEL);
-
- return ifa;
-}
-
-static __inline__ void dn_dev_free_ifa(struct dn_ifaddr *ifa)
-{
- kfree(ifa);
-}
-
-static void dn_dev_del_ifa(struct dn_dev *dn_db, struct dn_ifaddr **ifap, int destroy)
-{
- struct dn_ifaddr *ifa1 = *ifap;
- unsigned char mac_addr[6];
- struct net_device *dev = dn_db->dev;
-
- ASSERT_RTNL();
-
- *ifap = ifa1->ifa_next;
-
- if (dn_db->dev->type == ARPHRD_ETHER) {
- if (ifa1->ifa_local != dn_eth2dn(dev->dev_addr)) {
- dn_dn2eth(mac_addr, ifa1->ifa_local);
- dev_mc_delete(dev, mac_addr, ETH_ALEN, 0);
- }
- }
-
- dn_ifaddr_notify(RTM_DELADDR, ifa1);
- blocking_notifier_call_chain(&dnaddr_chain, NETDEV_DOWN, ifa1);
- if (destroy) {
- dn_dev_free_ifa(ifa1);
-
- if (dn_db->ifa_list == NULL)
- dn_dev_delete(dn_db->dev);
- }
-}
-
-static int dn_dev_insert_ifa(struct dn_dev *dn_db, struct dn_ifaddr *ifa)
-{
- struct net_device *dev = dn_db->dev;
- struct dn_ifaddr *ifa1;
- unsigned char mac_addr[6];
-
- ASSERT_RTNL();
-
- /* Check for duplicates */
- for(ifa1 = dn_db->ifa_list; ifa1; ifa1 = ifa1->ifa_next) {
- if (ifa1->ifa_local == ifa->ifa_local)
- return -EEXIST;
- }
-
- if (dev->type == ARPHRD_ETHER) {
- if (ifa->ifa_local != dn_eth2dn(dev->dev_addr)) {
- dn_dn2eth(mac_addr, ifa->ifa_local);
- dev_mc_add(dev, mac_addr, ETH_ALEN, 0);
- dev_mc_upload(dev);
- }
- }
-
- ifa->ifa_next = dn_db->ifa_list;
- dn_db->ifa_list = ifa;
-
- dn_ifaddr_notify(RTM_NEWADDR, ifa);
- blocking_notifier_call_chain(&dnaddr_chain, NETDEV_UP, ifa);
-
- return 0;
-}
-
-static int dn_dev_set_ifa(struct net_device *dev, struct dn_ifaddr *ifa)
-{
- struct dn_dev *dn_db = dev->dn_ptr;
- int rv;
-
- if (dn_db == NULL) {
- int err;
- dn_db = dn_dev_create(dev, &err);
- if (dn_db == NULL)
- return err;
- }
-
- ifa->ifa_dev = dn_db;
-
- if (dev->flags & IFF_LOOPBACK)
- ifa->ifa_scope = RT_SCOPE_HOST;
-
- rv = dn_dev_insert_ifa(dn_db, ifa);
- if (rv)
- dn_dev_free_ifa(ifa);
- return rv;
-}
-
-
-int dn_dev_ioctl(unsigned int cmd, void __user *arg)
-{
- char buffer[DN_IFREQ_SIZE];
- struct ifreq *ifr = (struct ifreq *)buffer;
- struct sockaddr_dn *sdn = (struct sockaddr_dn *)&ifr->ifr_addr;
- struct dn_dev *dn_db;
- struct net_device *dev;
- struct dn_ifaddr *ifa = NULL, **ifap = NULL;
- int ret = 0;
-
- if (copy_from_user(ifr, arg, DN_IFREQ_SIZE))
- return -EFAULT;
- ifr->ifr_name[IFNAMSIZ-1] = 0;
-
-#ifdef CONFIG_KMOD
- dev_load(ifr->ifr_name);
-#endif
-
- switch(cmd) {
- case SIOCGIFADDR:
- break;
- case SIOCSIFADDR:
- if (!capable(CAP_NET_ADMIN))
- return -EACCES;
- if (sdn->sdn_family != AF_DECnet)
- return -EINVAL;
- break;
- default:
- return -EINVAL;
- }
-
- rtnl_lock();
-
- if ((dev = __dev_get_by_name(ifr->ifr_name)) == NULL) {
- ret = -ENODEV;
- goto done;
- }
-
- if ((dn_db = dev->dn_ptr) != NULL) {
- for (ifap = &dn_db->ifa_list; (ifa=*ifap) != NULL; ifap = &ifa->ifa_next)
- if (strcmp(ifr->ifr_name, ifa->ifa_label) == 0)
- break;
- }
-
- if (ifa == NULL && cmd != SIOCSIFADDR) {
- ret = -EADDRNOTAVAIL;
- goto done;
- }
-
- switch(cmd) {
- case SIOCGIFADDR:
- *((__le16 *)sdn->sdn_nodeaddr) = ifa->ifa_local;
- goto rarok;
-
- case SIOCSIFADDR:
- if (!ifa) {
- if ((ifa = dn_dev_alloc_ifa()) == NULL) {
- ret = -ENOBUFS;
- break;
- }
- memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
- } else {
- if (ifa->ifa_local == dn_saddr2dn(sdn))
- break;
- dn_dev_del_ifa(dn_db, ifap, 0);
- }
-
- ifa->ifa_local = ifa->ifa_address = dn_saddr2dn(sdn);
-
- ret = dn_dev_set_ifa(dev, ifa);
- }
-done:
- rtnl_unlock();
-
- return ret;
-rarok:
- if (copy_to_user(arg, ifr, DN_IFREQ_SIZE))
- ret = -EFAULT;
- goto done;
-}
-
-struct net_device *dn_dev_get_default(void)
-{
- struct net_device *dev;
- read_lock(&dndev_lock);
- dev = decnet_default_device;
- if (dev) {
- if (dev->dn_ptr)
- dev_hold(dev);
- else
- dev = NULL;
- }
- read_unlock(&dndev_lock);
- return dev;
-}
-
-int dn_dev_set_default(struct net_device *dev, int force)
-{
- struct net_device *old = NULL;
- int rv = -EBUSY;
- if (!dev->dn_ptr)
- return -ENODEV;
- write_lock(&dndev_lock);
- if (force || decnet_default_device == NULL) {
- old = decnet_default_device;
- decnet_default_device = dev;
- rv = 0;
- }
- write_unlock(&dndev_lock);
- if (old)
- dev_put(old);
- return rv;
-}
-
-static void dn_dev_check_default(struct net_device *dev)
-{
- write_lock(&dndev_lock);
- if (dev == decnet_default_device) {
- decnet_default_device = NULL;
- } else {
- dev = NULL;
- }
- write_unlock(&dndev_lock);
- if (dev)
- dev_put(dev);
-}
-
-static struct dn_dev *dn_dev_by_index(int ifindex)
-{
- struct net_device *dev;
- struct dn_dev *dn_dev = NULL;
- dev = dev_get_by_index(ifindex);
- if (dev) {
- dn_dev = dev->dn_ptr;
- dev_put(dev);
- }
-
- return dn_dev;
-}
-
-static struct nla_policy dn_ifa_policy[IFA_MAX+1] __read_mostly = {
- [IFA_ADDRESS] = { .type = NLA_U16 },
- [IFA_LOCAL] = { .type = NLA_U16 },
- [IFA_LABEL] = { .type = NLA_STRING,
- .len = IFNAMSIZ - 1 },
-};
-
-static int dn_nl_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
-{
- struct nlattr *tb[IFA_MAX+1];
- struct dn_dev *dn_db;
- struct ifaddrmsg *ifm;
- struct dn_ifaddr *ifa, **ifap;
- int err = -EADDRNOTAVAIL;
-
- err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, dn_ifa_policy);
- if (err < 0)
- goto errout;
-
- ifm = nlmsg_data(nlh);
- if ((dn_db = dn_dev_by_index(ifm->ifa_index)) == NULL)
- goto errout;
-
- for (ifap = &dn_db->ifa_list; (ifa = *ifap); ifap = &ifa->ifa_next) {
- if (tb[IFA_LOCAL] &&
- nla_memcmp(tb[IFA_LOCAL], &ifa->ifa_local, 2))
- continue;
-
- if (tb[IFA_LABEL] && nla_strcmp(tb[IFA_LABEL], ifa->ifa_label))
- continue;
-
- dn_dev_del_ifa(dn_db, ifap, 1);
- return 0;
- }
-
-errout:
- return err;
-}
-
-static int dn_nl_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
-{
- struct nlattr *tb[IFA_MAX+1];
- struct net_device *dev;
- struct dn_dev *dn_db;
- struct ifaddrmsg *ifm;
- struct dn_ifaddr *ifa;
- int err;
-
- err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, dn_ifa_policy);
- if (err < 0)
- return err;
-
- if (tb[IFA_LOCAL] == NULL)
- return -EINVAL;
-
- ifm = nlmsg_data(nlh);
- if ((dev = __dev_get_by_index(ifm->ifa_index)) == NULL)
- return -ENODEV;
-
- if ((dn_db = dev->dn_ptr) == NULL) {
- int err;
- dn_db = dn_dev_create(dev, &err);
- if (!dn_db)
- return err;
- }
-
- if ((ifa = dn_dev_alloc_ifa()) == NULL)
- return -ENOBUFS;
-
- if (tb[IFA_ADDRESS] == NULL)
- tb[IFA_ADDRESS] = tb[IFA_LOCAL];
-
- ifa->ifa_local = nla_get_le16(tb[IFA_LOCAL]);
- ifa->ifa_address = nla_get_le16(tb[IFA_ADDRESS]);
- ifa->ifa_flags = ifm->ifa_flags;
- ifa->ifa_scope = ifm->ifa_scope;
- ifa->ifa_dev = dn_db;
-
- if (tb[IFA_LABEL])
- nla_strlcpy(ifa->ifa_label, tb[IFA_LABEL], IFNAMSIZ);
- else
- memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
-
- err = dn_dev_insert_ifa(dn_db, ifa);
- if (err)
- dn_dev_free_ifa(ifa);
-
- return err;
-}
-
-static inline size_t dn_ifaddr_nlmsg_size(void)
-{
- return NLMSG_ALIGN(sizeof(struct ifaddrmsg))
- + nla_total_size(IFNAMSIZ) /* IFA_LABEL */
- + nla_total_size(2) /* IFA_ADDRESS */
- + nla_total_size(2); /* IFA_LOCAL */
-}
-
-static int dn_nl_fill_ifaddr(struct sk_buff *skb, struct dn_ifaddr *ifa,
- u32 pid, u32 seq, int event, unsigned int flags)
-{
- struct ifaddrmsg *ifm;
- struct nlmsghdr *nlh;
-
- nlh = nlmsg_put(skb, pid, seq, event, sizeof(*ifm), flags);
- if (nlh == NULL)
- return -ENOBUFS;
-
- ifm = nlmsg_data(nlh);
- ifm->ifa_family = AF_DECnet;
- ifm->ifa_prefixlen = 16;
- ifm->ifa_flags = ifa->ifa_flags | IFA_F_PERMANENT;
- ifm->ifa_scope = ifa->ifa_scope;
- ifm->ifa_index = ifa->ifa_dev->dev->ifindex;
-
- if (ifa->ifa_address)
- NLA_PUT_LE16(skb, IFA_ADDRESS, ifa->ifa_address);
- if (ifa->ifa_local)
- NLA_PUT_LE16(skb, IFA_LOCAL, ifa->ifa_local);
- if (ifa->ifa_label[0])
- NLA_PUT_STRING(skb, IFA_LABEL, ifa->ifa_label);
-
- return nlmsg_end(skb, nlh);
-
-nla_put_failure:
- return nlmsg_cancel(skb, nlh);
-}
-
-static void dn_ifaddr_notify(int event, struct dn_ifaddr *ifa)
-{
- struct sk_buff *skb;
- int err = -ENOBUFS;
-
- skb = alloc_skb(dn_ifaddr_nlmsg_size(), GFP_KERNEL);
- if (skb == NULL)
- goto errout;
-
- err = dn_nl_fill_ifaddr(skb, ifa, 0, 0, event, 0);
- /* failure implies BUG in dn_ifaddr_nlmsg_size() */
- BUG_ON(err < 0);
-
- err = rtnl_notify(skb, 0, RTNLGRP_DECnet_IFADDR, NULL, GFP_KERNEL);
-errout:
- if (err < 0)
- rtnl_set_sk_err(RTNLGRP_DECnet_IFADDR, err);
-}
-
-static int dn_nl_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
-{
- int idx, dn_idx = 0, skip_ndevs, skip_naddr;
- struct net_device *dev;
- struct dn_dev *dn_db;
- struct dn_ifaddr *ifa;
-
- skip_ndevs = cb->args[0];
- skip_naddr = cb->args[1];
-
- read_lock(&dev_base_lock);
- for (dev = dev_base, idx = 0; dev; dev = dev->next, idx++) {
- if (idx < skip_ndevs)
- continue;
- else if (idx > skip_ndevs) {
- /* Only skip over addresses for first dev dumped
- * in this iteration (idx == skip_ndevs) */
- skip_naddr = 0;
- }
-
- if ((dn_db = dev->dn_ptr) == NULL)
- continue;
-
- for (ifa = dn_db->ifa_list, dn_idx = 0; ifa;
- ifa = ifa->ifa_next, dn_idx++) {
- if (dn_idx < skip_naddr)
- continue;
-
- if (dn_nl_fill_ifaddr(skb, ifa, NETLINK_CB(cb->skb).pid,
- cb->nlh->nlmsg_seq, RTM_NEWADDR,
- NLM_F_MULTI) < 0)
- goto done;
- }
- }
-done:
- read_unlock(&dev_base_lock);
-
- cb->args[0] = idx;
- cb->args[1] = dn_idx;
-
- return skb->len;
-}
-
-static int dn_dev_get_first(struct net_device *dev, __le16 *addr)
-{
- struct dn_dev *dn_db = (struct dn_dev *)dev->dn_ptr;
- struct dn_ifaddr *ifa;
- int rv = -ENODEV;
- if (dn_db == NULL)
- goto out;
- ifa = dn_db->ifa_list;
- if (ifa != NULL) {
- *addr = ifa->ifa_local;
- rv = 0;
- }
-out:
- return rv;
-}
-
-/*
- * Find a default address to bind to.
- *
- * This is one of those areas where the initial VMS concepts don't really
- * map onto the Linux concepts, and since we introduced multiple addresses
- * per interface we have to cope with slightly odd ways of finding out what
- * "our address" really is. Mostly it's not a problem; for this we just guess
- * a sensible default. Eventually the routing code will take care of all the
- * nasties for us I hope.
- */
-int dn_dev_bind_default(__le16 *addr)
-{
- struct net_device *dev;
- int rv;
- dev = dn_dev_get_default();
-last_chance:
- if (dev) {
- read_lock(&dev_base_lock);
- rv = dn_dev_get_first(dev, addr);
- read_unlock(&dev_base_lock);
- dev_put(dev);
- if (rv == 0 || dev == &loopback_dev)
- return rv;
- }
- dev = &loopback_dev;
- dev_hold(dev);
- goto last_chance;
-}
-
-static void dn_send_endnode_hello(struct net_device *dev, struct dn_ifaddr *ifa)
-{
- struct endnode_hello_message *msg;
- struct sk_buff *skb = NULL;
- __le16 *pktlen;
- struct dn_dev *dn_db = (struct dn_dev *)dev->dn_ptr;
-
- if ((skb = dn_alloc_skb(NULL, sizeof(*msg), GFP_ATOMIC)) == NULL)
- return;
-
- skb->dev = dev;
-
- msg = (struct endnode_hello_message *)skb_put(skb,sizeof(*msg));
-
- msg->msgflg = 0x0D;
- memcpy(msg->tiver, dn_eco_version, 3);
- dn_dn2eth(msg->id, ifa->ifa_local);
- msg->iinfo = DN_RT_INFO_ENDN;
- msg->blksize = dn_htons(mtu2blksize(dev));
- msg->area = 0x00;
- memset(msg->seed, 0, 8);
- memcpy(msg->neighbor, dn_hiord, ETH_ALEN);
-
- if (dn_db->router) {
- struct dn_neigh *dn = (struct dn_neigh *)dn_db->router;
- dn_dn2eth(msg->neighbor, dn->addr);
- }
-
- msg->timer = dn_htons((unsigned short)dn_db->parms.t3);
- msg->mpd = 0x00;
- msg->datalen = 0x02;
- memset(msg->data, 0xAA, 2);
-
- pktlen = (__le16 *)skb_push(skb,2);
- *pktlen = dn_htons(skb->len - 2);
-
- skb->nh.raw = skb->data;
-
- dn_rt_finish_output(skb, dn_rt_all_rt_mcast, msg->id);
-}
-
-
-#define DRDELAY (5 * HZ)
-
-static int dn_am_i_a_router(struct dn_neigh *dn, struct dn_dev *dn_db, struct dn_ifaddr *ifa)
-{
- /* First check time since device went up */
- if ((jiffies - dn_db->uptime) < DRDELAY)
- return 0;
-
- /* If there is no router, then yes... */
- if (!dn_db->router)
- return 1;
-
- /* otherwise only if we have a higher priority or.. */
- if (dn->priority < dn_db->parms.priority)
- return 1;
-
- /* if we have equal priority and a higher node number */
- if (dn->priority != dn_db->parms.priority)
- return 0;
-
- if (dn_ntohs(dn->addr) < dn_ntohs(ifa->ifa_local))
- return 1;
-
- return 0;
-}
-
-static void dn_send_router_hello(struct net_device *dev, struct dn_ifaddr *ifa)
-{
- int n;
- struct dn_dev *dn_db = dev->dn_ptr;
- struct dn_neigh *dn = (struct dn_neigh *)dn_db->router;
- struct sk_buff *skb;
- size_t size;
- unsigned char *ptr;
- unsigned char *i1, *i2;
- __le16 *pktlen;
- char *src;
-
- if (mtu2blksize(dev) < (26 + 7))
- return;
-
- n = mtu2blksize(dev) - 26;
- n /= 7;
-
- if (n > 32)
- n = 32;
-
- size = 2 + 26 + 7 * n;
-
- if ((skb = dn_alloc_skb(NULL, size, GFP_ATOMIC)) == NULL)
- return;
-
- skb->dev = dev;
- ptr = skb_put(skb, size);
-
- *ptr++ = DN_RT_PKT_CNTL | DN_RT_PKT_ERTH;
- *ptr++ = 2; /* ECO */
- *ptr++ = 0;
- *ptr++ = 0;
- dn_dn2eth(ptr, ifa->ifa_local);
- src = ptr;
- ptr += ETH_ALEN;
- *ptr++ = dn_db->parms.forwarding == 1 ?
- DN_RT_INFO_L1RT : DN_RT_INFO_L2RT;
- *((__le16 *)ptr) = dn_htons(mtu2blksize(dev));
- ptr += 2;
- *ptr++ = dn_db->parms.priority; /* Priority */
- *ptr++ = 0; /* Area: Reserved */
- *((__le16 *)ptr) = dn_htons((unsigned short)dn_db->parms.t3);
- ptr += 2;
- *ptr++ = 0; /* MPD: Reserved */
- i1 = ptr++;
- memset(ptr, 0, 7); /* Name: Reserved */
- ptr += 7;
- i2 = ptr++;
-
- n = dn_neigh_elist(dev, ptr, n);
-
- *i2 = 7 * n;
- *i1 = 8 + *i2;
-
- skb_trim(skb, (27 + *i2));
-
- pktlen = (__le16 *)skb_push(skb, 2);
- *pktlen = dn_htons(skb->len - 2);
-
- skb->nh.raw = skb->data;
-
- if (dn_am_i_a_router(dn, dn_db, ifa)) {
- struct sk_buff *skb2 = skb_copy(skb, GFP_ATOMIC);
- if (skb2) {
- dn_rt_finish_output(skb2, dn_rt_all_end_mcast, src);
- }
- }
-
- dn_rt_finish_output(skb, dn_rt_all_rt_mcast, src);
-}
-
-static void dn_send_brd_hello(struct net_device *dev, struct dn_ifaddr *ifa)
-{
- struct dn_dev *dn_db = (struct dn_dev *)dev->dn_ptr;
-
- if (dn_db->parms.forwarding == 0)
- dn_send_endnode_hello(dev, ifa);
- else
- dn_send_router_hello(dev, ifa);
-}
-
-static void dn_send_ptp_hello(struct net_device *dev, struct dn_ifaddr *ifa)
-{
- int tdlen = 16;
- int size = dev->hard_header_len + 2 + 4 + tdlen;
- struct sk_buff *skb = dn_alloc_skb(NULL, size, GFP_ATOMIC);
- int i;
- unsigned char *ptr;
- char src[ETH_ALEN];
-
- if (skb == NULL)
- return ;
-
- skb->dev = dev;
- skb_push(skb, dev->hard_header_len);
- ptr = skb_put(skb, 2 + 4 + tdlen);
-
- *ptr++ = DN_RT_PKT_HELO;
- *((__le16 *)ptr) = ifa->ifa_local;
- ptr += 2;
- *ptr++ = tdlen;
-
- for(i = 0; i < tdlen; i++)
- *ptr++ = 0252;
-
- dn_dn2eth(src, ifa->ifa_local);
- dn_rt_finish_output(skb, dn_rt_all_rt_mcast, src);
-}
-
-static int dn_eth_up(struct net_device *dev)
-{
- struct dn_dev *dn_db = dev->dn_ptr;
-
- if (dn_db->parms.forwarding == 0)
- dev_mc_add(dev, dn_rt_all_end_mcast, ETH_ALEN, 0);
- else
- dev_mc_add(dev, dn_rt_all_rt_mcast, ETH_ALEN, 0);
-
- dev_mc_upload(dev);
-
- dn_db->use_long = 1;
-
- return 0;
-}
-
-static void dn_eth_down(struct net_device *dev)
-{
- struct dn_dev *dn_db = dev->dn_ptr;
-
- if (dn_db->parms.forwarding == 0)
- dev_mc_delete(dev, dn_rt_all_end_mcast, ETH_ALEN, 0);
- else
- dev_mc_delete(dev, dn_rt_all_rt_mcast, ETH_ALEN, 0);
-}
-
-static void dn_dev_set_timer(struct net_device *dev);
-
-static void dn_dev_timer_func(unsigned long arg)
-{
- struct net_device *dev = (struct net_device *)arg;
- struct dn_dev *dn_db = dev->dn_ptr;
- struct dn_ifaddr *ifa;
-
- if (dn_db->t3 <= dn_db->parms.t2) {
- if (dn_db->parms.timer3) {
- for(ifa = dn_db->ifa_list; ifa; ifa = ifa->ifa_next) {
- if (!(ifa->ifa_flags & IFA_F_SECONDARY))
- dn_db->parms.timer3(dev, ifa);
- }
- }
- dn_db->t3 = dn_db->parms.t3;
- } else {
- dn_db->t3 -= dn_db->parms.t2;
- }
-
- dn_dev_set_timer(dev);
-}
-
-static void dn_dev_set_timer(struct net_device *dev)
-{
- struct dn_dev *dn_db = dev->dn_ptr;
-
- if (dn_db->parms.t2 > dn_db->parms.t3)
- dn_db->parms.t2 = dn_db->parms.t3;
-
- dn_db->timer.data = (unsigned long)dev;
- dn_db->timer.function = dn_dev_timer_func;
- dn_db->timer.expires = jiffies + (dn_db->parms.t2 * HZ);
-
- add_timer(&dn_db->timer);
-}
-
-struct dn_dev *dn_dev_create(struct net_device *dev, int *err)
-{
- int i;
- struct dn_dev_parms *p = dn_dev_list;
- struct dn_dev *dn_db;
-
- for(i = 0; i < DN_DEV_LIST_SIZE; i++, p++) {
- if (p->type == dev->type)
- break;
- }
-
- *err = -ENODEV;
- if (i == DN_DEV_LIST_SIZE)
- return NULL;
-
- *err = -ENOBUFS;
- if ((dn_db = kzalloc(sizeof(struct dn_dev), GFP_ATOMIC)) == NULL)
- return NULL;
-
- memcpy(&dn_db->parms, p, sizeof(struct dn_dev_parms));
- smp_wmb();
- dev->dn_ptr = dn_db;
- dn_db->dev = dev;
- init_timer(&dn_db->timer);
-
- dn_db->uptime = jiffies;
- if (dn_db->parms.up) {
- if (dn_db->parms.up(dev) < 0) {
- dev->dn_ptr = NULL;
- kfree(dn_db);
- return NULL;
- }
- }
-
- dn_db->neigh_parms = neigh_parms_alloc(dev, &dn_neigh_table);
-
- dn_dev_sysctl_register(dev, &dn_db->parms);
-
- dn_dev_set_timer(dev);
-
- *err = 0;
- return dn_db;
-}
-
-
-/*
- * This processes a device up event. We only start up
- * the loopback device & ethernet devices with correct
- * MAC addreses automatically. Others must be started
- * specifically.
- *
- * FIXME: How should we configure the loopback address ? If we could dispense
- * with using decnet_address here and for autobind, it will be one less thing
- * for users to worry about setting up.
- */
-
-void dn_dev_up(struct net_device *dev)
-{
- struct dn_ifaddr *ifa;
- __le16 addr = decnet_address;
- int maybe_default = 0;
- struct dn_dev *dn_db = (struct dn_dev *)dev->dn_ptr;
-
- if ((dev->type != ARPHRD_ETHER) && (dev->type != ARPHRD_LOOPBACK))
- return;
-
- /*
- * Need to ensure that loopback device has a dn_db attached to it
- * to allow creation of neighbours against it, even though it might
- * not have a local address of its own. Might as well do the same for
- * all autoconfigured interfaces.
- */
- if (dn_db == NULL) {
- int err;
- dn_db = dn_dev_create(dev, &err);
- if (dn_db == NULL)
- return;
- }
-
- if (dev->type == ARPHRD_ETHER) {
- if (memcmp(dev->dev_addr, dn_hiord, 4) != 0)
- return;
- addr = dn_eth2dn(dev->dev_addr);
- maybe_default = 1;
- }
-
- if (addr == 0)
- return;
-
- if ((ifa = dn_dev_alloc_ifa()) == NULL)
- return;
-
- ifa->ifa_local = ifa->ifa_address = addr;
- ifa->ifa_flags = 0;
- ifa->ifa_scope = RT_SCOPE_UNIVERSE;
- strcpy(ifa->ifa_label, dev->name);
-
- dn_dev_set_ifa(dev, ifa);
-
- /*
- * Automagically set the default device to the first automatically
- * configured ethernet card in the system.
- */
- if (maybe_default) {
- dev_hold(dev);
- if (dn_dev_set_default(dev, 0))
- dev_put(dev);
- }
-}
-
-static void dn_dev_delete(struct net_device *dev)
-{
- struct dn_dev *dn_db = dev->dn_ptr;
-
- if (dn_db == NULL)
- return;
-
- del_timer_sync(&dn_db->timer);
- dn_dev_sysctl_unregister(&dn_db->parms);
- dn_dev_check_default(dev);
- neigh_ifdown(&dn_neigh_table, dev);
-
- if (dn_db->parms.down)
- dn_db->parms.down(dev);
-
- dev->dn_ptr = NULL;
-
- neigh_parms_release(&dn_neigh_table, dn_db->neigh_parms);
- neigh_ifdown(&dn_neigh_table, dev);
-
- if (dn_db->router)
- neigh_release(dn_db->router);
- if (dn_db->peer)
- neigh_release(dn_db->peer);
-
- kfree(dn_db);
-}
-
-void dn_dev_down(struct net_device *dev)
-{
- struct dn_dev *dn_db = dev->dn_ptr;
- struct dn_ifaddr *ifa;
-
- if (dn_db == NULL)
- return;
-
- while((ifa = dn_db->ifa_list) != NULL) {
- dn_dev_del_ifa(dn_db, &dn_db->ifa_list, 0);
- dn_dev_free_ifa(ifa);
- }
-
- dn_dev_delete(dev);
-}
-
-void dn_dev_init_pkt(struct sk_buff *skb)
-{
- return;
-}
-
-void dn_dev_veri_pkt(struct sk_buff *skb)
-{
- return;
-}
-
-void dn_dev_hello(struct sk_buff *skb)
-{
- return;
-}
-
-void dn_dev_devices_off(void)
-{
- struct net_device *dev;
-
- rtnl_lock();
- for(dev = dev_base; dev; dev = dev->next)
- dn_dev_down(dev);
- rtnl_unlock();
-
-}
-
-void dn_dev_devices_on(void)
-{
- struct net_device *dev;
-
- rtnl_lock();
- for(dev = dev_base; dev; dev = dev->next) {
- if (dev->flags & IFF_UP)
- dn_dev_up(dev);
- }
- rtnl_unlock();
-}
-
-int register_dnaddr_notifier(struct notifier_block *nb)
-{
- return blocking_notifier_chain_register(&dnaddr_chain, nb);
-}
-
-int unregister_dnaddr_notifier(struct notifier_block *nb)
-{
- return blocking_notifier_chain_unregister(&dnaddr_chain, nb);
-}
-
-#ifdef CONFIG_PROC_FS
-static inline struct net_device *dn_dev_get_next(struct seq_file *seq, struct net_device *dev)
-{
- do {
- dev = dev->next;
- } while(dev && !dev->dn_ptr);
-
- return dev;
-}
-
-static struct net_device *dn_dev_get_idx(struct seq_file *seq, loff_t pos)
-{
- struct net_device *dev;
-
- dev = dev_base;
- if (dev && !dev->dn_ptr)
- dev = dn_dev_get_next(seq, dev);
- if (pos) {
- while(dev && (dev = dn_dev_get_next(seq, dev)))
- --pos;
- }
- return dev;
-}
-
-static void *dn_dev_seq_start(struct seq_file *seq, loff_t *pos)
-{
- if (*pos) {
- struct net_device *dev;
- read_lock(&dev_base_lock);
- dev = dn_dev_get_idx(seq, *pos - 1);
- if (dev == NULL)
- read_unlock(&dev_base_lock);
- return dev;
- }
- return SEQ_START_TOKEN;
-}
-
-static void *dn_dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
-{
- struct net_device *dev = v;
- loff_t one = 1;
-
- if (v == SEQ_START_TOKEN) {
- dev = dn_dev_seq_start(seq, &one);
- } else {
- dev = dn_dev_get_next(seq, dev);
- if (dev == NULL)
- read_unlock(&dev_base_lock);
- }
- ++*pos;
- return dev;
-}
-
-static void dn_dev_seq_stop(struct seq_file *seq, void *v)
-{
- if (v && v != SEQ_START_TOKEN)
- read_unlock(&dev_base_lock);
-}
-
-static char *dn_type2asc(char type)
-{
- switch(type) {
- case DN_DEV_BCAST:
- return "B";
- case DN_DEV_UCAST:
- return "U";
- case DN_DEV_MPOINT:
- return "M";
- }
-
- return "?";
-}
-
-static int dn_dev_seq_show(struct seq_file *seq, void *v)
-{
- if (v == SEQ_START_TOKEN)
- seq_puts(seq, "Name Flags T1 Timer1 T3 Timer3 BlkSize Pri State DevType Router Peer\n");
- else {
- struct net_device *dev = v;
- char peer_buf[DN_ASCBUF_LEN];
- char router_buf[DN_ASCBUF_LEN];
- struct dn_dev *dn_db = dev->dn_ptr;
-
- seq_printf(seq, "%-8s %1s %04u %04u %04lu %04lu"
- " %04hu %03d %02x %-10s %-7s %-7s\n",
- dev->name ? dev->name : "???",
- dn_type2asc(dn_db->parms.mode),
- 0, 0,
- dn_db->t3, dn_db->parms.t3,
- mtu2blksize(dev),
- dn_db->parms.priority,
- dn_db->parms.state, dn_db->parms.name,
- dn_db->router ? dn_addr2asc(dn_ntohs(*(__le16 *)dn_db->router->primary_key), router_buf) : "",
- dn_db->peer ? dn_addr2asc(dn_ntohs(*(__le16 *)dn_db->peer->primary_key), peer_buf) : "");
- }
- return 0;
-}
-
-static struct seq_operations dn_dev_seq_ops = {
- .start = dn_dev_seq_start,
- .next = dn_dev_seq_next,
- .stop = dn_dev_seq_stop,
- .show = dn_dev_seq_show,
-};
-
-static int dn_dev_seq_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &dn_dev_seq_ops);
-}
-
-static struct file_operations dn_dev_seq_fops = {
- .owner = THIS_MODULE,
- .open = dn_dev_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
-#endif /* CONFIG_PROC_FS */
-
-static struct rtnetlink_link dnet_rtnetlink_table[RTM_NR_MSGTYPES] =
-{
- [RTM_NEWADDR - RTM_BASE] = { .doit = dn_nl_newaddr, },
- [RTM_DELADDR - RTM_BASE] = { .doit = dn_nl_deladdr, },
- [RTM_GETADDR - RTM_BASE] = { .dumpit = dn_nl_dump_ifaddr, },
-#ifdef CONFIG_DECNET_ROUTER
- [RTM_NEWROUTE - RTM_BASE] = { .doit = dn_fib_rtm_newroute, },
- [RTM_DELROUTE - RTM_BASE] = { .doit = dn_fib_rtm_delroute, },
- [RTM_GETROUTE - RTM_BASE] = { .doit = dn_cache_getroute,
- .dumpit = dn_fib_dump, },
- [RTM_GETRULE - RTM_BASE] = { .dumpit = dn_fib_dump_rules, },
-#else
- [RTM_GETROUTE - RTM_BASE] = { .doit = dn_cache_getroute,
- .dumpit = dn_cache_dump, },
-#endif
-
-};
-
-static int __initdata addr[2];
-module_param_array(addr, int, NULL, 0444);
-MODULE_PARM_DESC(addr, "The DECnet address of this machine: area,node");
-
-void __init dn_dev_init(void)
-{
- if (addr[0] > 63 || addr[0] < 0) {
- printk(KERN_ERR "DECnet: Area must be between 0 and 63");
- return;
- }
-
- if (addr[1] > 1023 || addr[1] < 0) {
- printk(KERN_ERR "DECnet: Node must be between 0 and 1023");
- return;
- }
-
- decnet_address = dn_htons((addr[0] << 10) | addr[1]);
-
- dn_dev_devices_on();
-
- rtnetlink_links[PF_DECnet] = dnet_rtnetlink_table;
-
- proc_net_fops_create("decnet_dev", S_IRUGO, &dn_dev_seq_fops);
-
-#ifdef CONFIG_SYSCTL
- {
- int i;
- for(i = 0; i < DN_DEV_LIST_SIZE; i++)
- dn_dev_sysctl_register(NULL, &dn_dev_list[i]);
- }
-#endif /* CONFIG_SYSCTL */
-}
-
-void __exit dn_dev_cleanup(void)
-{
- rtnetlink_links[PF_DECnet] = NULL;
-
-#ifdef CONFIG_SYSCTL
- {
- int i;
- for(i = 0; i < DN_DEV_LIST_SIZE; i++)
- dn_dev_sysctl_unregister(&dn_dev_list[i]);
- }
-#endif /* CONFIG_SYSCTL */
-
- proc_net_remove("decnet_dev");
-
- dn_dev_devices_off();
-}
diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c
deleted file mode 100644
index 1cf010124ec5..000000000000
--- a/net/decnet/dn_fib.c
+++ /dev/null
@@ -1,755 +0,0 @@
-/*
- * DECnet An implementation of the DECnet protocol suite for the LINUX
- * operating system. DECnet is implemented using the BSD Socket
- * interface as the means of communication with the user level.
- *
- * DECnet Routing Forwarding Information Base (Glue/Info List)
- *
- * Author: Steve Whitehouse <SteveW@ACM.org>
- *
- *
- * Changes:
- * Alexey Kuznetsov : SMP locking changes
- * Steve Whitehouse : Rewrote it... Well to be more correct, I
- * copied most of it from the ipv4 fib code.
- * Steve Whitehouse : Updated it in style and fixed a few bugs
- * which were fixed in the ipv4 code since
- * this code was copied from it.
- *
- */
-#include <linux/string.h>
-#include <linux/net.h>
-#include <linux/socket.h>
-#include <linux/sockios.h>
-#include <linux/init.h>
-#include <linux/skbuff.h>
-#include <linux/netlink.h>
-#include <linux/rtnetlink.h>
-#include <linux/proc_fs.h>
-#include <linux/netdevice.h>
-#include <linux/timer.h>
-#include <linux/spinlock.h>
-#include <asm/atomic.h>
-#include <asm/uaccess.h>
-#include <net/neighbour.h>
-#include <net/dst.h>
-#include <net/flow.h>
-#include <net/fib_rules.h>
-#include <net/dn.h>
-#include <net/dn_route.h>
-#include <net/dn_fib.h>
-#include <net/dn_neigh.h>
-#include <net/dn_dev.h>
-
-#define RT_MIN_TABLE 1
-
-#define for_fib_info() { struct dn_fib_info *fi;\
- for(fi = dn_fib_info_list; fi; fi = fi->fib_next)
-#define endfor_fib_info() }
-
-#define for_nexthops(fi) { int nhsel; const struct dn_fib_nh *nh;\
- for(nhsel = 0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++)
-
-#define change_nexthops(fi) { int nhsel; struct dn_fib_nh *nh;\
- for(nhsel = 0, nh = (struct dn_fib_nh *)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nh++, nhsel++)
-
-#define endfor_nexthops(fi) }
-
-static DEFINE_SPINLOCK(dn_fib_multipath_lock);
-static struct dn_fib_info *dn_fib_info_list;
-static DEFINE_SPINLOCK(dn_fib_info_lock);
-
-static struct
-{
- int error;
- u8 scope;
-} dn_fib_props[RTA_MAX+1] = {
- [RTN_UNSPEC] = { .error = 0, .scope = RT_SCOPE_NOWHERE },
- [RTN_UNICAST] = { .error = 0, .scope = RT_SCOPE_UNIVERSE },
- [RTN_LOCAL] = { .error = 0, .scope = RT_SCOPE_HOST },
- [RTN_BROADCAST] = { .error = -EINVAL, .scope = RT_SCOPE_NOWHERE },
- [RTN_ANYCAST] = { .error = -EINVAL, .scope = RT_SCOPE_NOWHERE },
- [RTN_MULTICAST] = { .error = -EINVAL, .scope = RT_SCOPE_NOWHERE },
- [RTN_BLACKHOLE] = { .error = -EINVAL, .scope = RT_SCOPE_UNIVERSE },
- [RTN_UNREACHABLE] = { .error = -EHOSTUNREACH, .scope = RT_SCOPE_UNIVERSE },
- [RTN_PROHIBIT] = { .error = -EACCES, .scope = RT_SCOPE_UNIVERSE },
- [RTN_THROW] = { .error = -EAGAIN, .scope = RT_SCOPE_UNIVERSE },
- [RTN_NAT] = { .error = 0, .scope = RT_SCOPE_NOWHERE },
- [RTN_XRESOLVE] = { .error = -EINVAL, .scope = RT_SCOPE_NOWHERE },
-};
-
-static int dn_fib_sync_down(__le16 local, struct net_device *dev, int force);
-static int dn_fib_sync_up(struct net_device *dev);
-
-void dn_fib_free_info(struct dn_fib_info *fi)
-{
- if (fi->fib_dead == 0) {
- printk(KERN_DEBUG "DECnet: BUG! Attempt to free alive dn_fib_info\n");
- return;
- }
-
- change_nexthops(fi) {
- if (nh->nh_dev)
- dev_put(nh->nh_dev);
- nh->nh_dev = NULL;
- } endfor_nexthops(fi);
- kfree(fi);
-}
-
-void dn_fib_release_info(struct dn_fib_info *fi)
-{
- spin_lock(&dn_fib_info_lock);
- if (fi && --fi->fib_treeref == 0) {
- if (fi->fib_next)
- fi->fib_next->fib_prev = fi->fib_prev;
- if (fi->fib_prev)
- fi->fib_prev->fib_next = fi->fib_next;
- if (fi == dn_fib_info_list)
- dn_fib_info_list = fi->fib_next;
- fi->fib_dead = 1;
- dn_fib_info_put(fi);
- }
- spin_unlock(&dn_fib_info_lock);
-}
-
-static inline int dn_fib_nh_comp(const struct dn_fib_info *fi, const struct dn_fib_info *ofi)
-{
- const struct dn_fib_nh *onh = ofi->fib_nh;
-
- for_nexthops(fi) {
- if (nh->nh_oif != onh->nh_oif ||
- nh->nh_gw != onh->nh_gw ||
- nh->nh_scope != onh->nh_scope ||
- nh->nh_weight != onh->nh_weight ||
- ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD))
- return -1;
- onh++;
- } endfor_nexthops(fi);
- return 0;
-}
-
-static inline struct dn_fib_info *dn_fib_find_info(const struct dn_fib_info *nfi)
-{
- for_fib_info() {
- if (fi->fib_nhs != nfi->fib_nhs)
- continue;
- if (nfi->fib_protocol == fi->fib_protocol &&
- nfi->fib_prefsrc == fi->fib_prefsrc &&
- nfi->fib_priority == fi->fib_priority &&
- memcmp(nfi->fib_metrics, fi->fib_metrics, sizeof(fi->fib_metrics)) == 0 &&
- ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 &&
- (nfi->fib_nhs == 0 || dn_fib_nh_comp(fi, nfi) == 0))
- return fi;
- } endfor_fib_info();
- return NULL;
-}
-
-__le16 dn_fib_get_attr16(struct rtattr *attr, int attrlen, int type)
-{
- while(RTA_OK(attr,attrlen)) {
- if (attr->rta_type == type)
- return *(__le16*)RTA_DATA(attr);
- attr = RTA_NEXT(attr, attrlen);
- }
-
- return 0;
-}
-
-static int dn_fib_count_nhs(struct rtattr *rta)
-{
- int nhs = 0;
- struct rtnexthop *nhp = RTA_DATA(rta);
- int nhlen = RTA_PAYLOAD(rta);
-
- while(nhlen >= (int)sizeof(struct rtnexthop)) {
- if ((nhlen -= nhp->rtnh_len) < 0)
- return 0;
- nhs++;
- nhp = RTNH_NEXT(nhp);
- }
-
- return nhs;
-}
-
-static int dn_fib_get_nhs(struct dn_fib_info *fi, const struct rtattr *rta, const struct rtmsg *r)
-{
- struct rtnexthop *nhp = RTA_DATA(rta);
- int nhlen = RTA_PAYLOAD(rta);
-
- change_nexthops(fi) {
- int attrlen = nhlen - sizeof(struct rtnexthop);
- if (attrlen < 0 || (nhlen -= nhp->rtnh_len) < 0)
- return -EINVAL;
-
- nh->nh_flags = (r->rtm_flags&~0xFF) | nhp->rtnh_flags;
- nh->nh_oif = nhp->rtnh_ifindex;
- nh->nh_weight = nhp->rtnh_hops + 1;
-
- if (attrlen) {
- nh->nh_gw = dn_fib_get_attr16(RTNH_DATA(nhp), attrlen, RTA_GATEWAY);
- }
- nhp = RTNH_NEXT(nhp);
- } endfor_nexthops(fi);
-
- return 0;
-}
-
-
-static int dn_fib_check_nh(const struct rtmsg *r, struct dn_fib_info *fi, struct dn_fib_nh *nh)
-{
- int err;
-
- if (nh->nh_gw) {
- struct flowi fl;
- struct dn_fib_res res;
-
- memset(&fl, 0, sizeof(fl));
-
- if (nh->nh_flags&RTNH_F_ONLINK) {
- struct net_device *dev;
-
- if (r->rtm_scope >= RT_SCOPE_LINK)
- return -EINVAL;
- if (dnet_addr_type(nh->nh_gw) != RTN_UNICAST)
- return -EINVAL;
- if ((dev = __dev_get_by_index(nh->nh_oif)) == NULL)
- return -ENODEV;
- if (!(dev->flags&IFF_UP))
- return -ENETDOWN;
- nh->nh_dev = dev;
- dev_hold(dev);
- nh->nh_scope = RT_SCOPE_LINK;
- return 0;
- }
-
- memset(&fl, 0, sizeof(fl));
- fl.fld_dst = nh->nh_gw;
- fl.oif = nh->nh_oif;
- fl.fld_scope = r->rtm_scope + 1;
-
- if (fl.fld_scope < RT_SCOPE_LINK)
- fl.fld_scope = RT_SCOPE_LINK;
-
- if ((err = dn_fib_lookup(&fl, &res)) != 0)
- return err;
-
- err = -EINVAL;
- if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
- goto out;
- nh->nh_scope = res.scope;
- nh->nh_oif = DN_FIB_RES_OIF(res);
- nh->nh_dev = DN_FIB_RES_DEV(res);
- if (nh->nh_dev == NULL)
- goto out;
- dev_hold(nh->nh_dev);
- err = -ENETDOWN;
- if (!(nh->nh_dev->flags & IFF_UP))
- goto out;
- err = 0;
-out:
- dn_fib_res_put(&res);
- return err;
- } else {
- struct net_device *dev;
-
- if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK))
- return -EINVAL;
-
- dev = __dev_get_by_index(nh->nh_oif);
- if (dev == NULL || dev->dn_ptr == NULL)
- return -ENODEV;
- if (!(dev->flags&IFF_UP))
- return -ENETDOWN;
- nh->nh_dev = dev;
- dev_hold(nh->nh_dev);
- nh->nh_scope = RT_SCOPE_HOST;
- }
-
- return 0;
-}
-
-
-struct dn_fib_info *dn_fib_create_info(const struct rtmsg *r, struct dn_kern_rta *rta, const struct nlmsghdr *nlh, int *errp)
-{
- int err;
- struct dn_fib_info *fi = NULL;
- struct dn_fib_info *ofi;
- int nhs = 1;
-
- if (dn_fib_props[r->rtm_type].scope > r->rtm_scope)
- goto err_inval;
-
- if (rta->rta_mp) {
- nhs = dn_fib_count_nhs(rta->rta_mp);
- if (nhs == 0)
- goto err_inval;
- }
-
- fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct dn_fib_nh), GFP_KERNEL);
- err = -ENOBUFS;
- if (fi == NULL)
- goto failure;
-
- fi->fib_protocol = r->rtm_protocol;
- fi->fib_nhs = nhs;
- fi->fib_flags = r->rtm_flags;
- if (rta->rta_priority)
- fi->fib_priority = *rta->rta_priority;
- if (rta->rta_mx) {
- int attrlen = RTA_PAYLOAD(rta->rta_mx);
- struct rtattr *attr = RTA_DATA(rta->rta_mx);
-
- while(RTA_OK(attr, attrlen)) {
- unsigned flavour = attr->rta_type;
- if (flavour) {
- if (flavour > RTAX_MAX)
- goto err_inval;
- fi->fib_metrics[flavour-1] = *(unsigned*)RTA_DATA(attr);
- }
- attr = RTA_NEXT(attr, attrlen);
- }
- }
- if (rta->rta_prefsrc)
- memcpy(&fi->fib_prefsrc, rta->rta_prefsrc, 2);
-
- if (rta->rta_mp) {
- if ((err = dn_fib_get_nhs(fi, rta->rta_mp, r)) != 0)
- goto failure;
- if (rta->rta_oif && fi->fib_nh->nh_oif != *rta->rta_oif)
- goto err_inval;
- if (rta->rta_gw && memcmp(&fi->fib_nh->nh_gw, rta->rta_gw, 2))
- goto err_inval;
- } else {
- struct dn_fib_nh *nh = fi->fib_nh;
- if (rta->rta_oif)
- nh->nh_oif = *rta->rta_oif;
- if (rta->rta_gw)
- memcpy(&nh->nh_gw, rta->rta_gw, 2);
- nh->nh_flags = r->rtm_flags;
- nh->nh_weight = 1;
- }
-
- if (r->rtm_type == RTN_NAT) {
- if (rta->rta_gw == NULL || nhs != 1 || rta->rta_oif)
- goto err_inval;
- memcpy(&fi->fib_nh->nh_gw, rta->rta_gw, 2);
- goto link_it;
- }
-
- if (dn_fib_props[r->rtm_type].error) {
- if (rta->rta_gw || rta->rta_oif || rta->rta_mp)
- goto err_inval;
- goto link_it;
- }
-
- if (r->rtm_scope > RT_SCOPE_HOST)
- goto err_inval;
-
- if (r->rtm_scope == RT_SCOPE_HOST) {
- struct dn_fib_nh *nh = fi->fib_nh;
-
- /* Local address is added */
- if (nhs != 1 || nh->nh_gw)
- goto err_inval;
- nh->nh_scope = RT_SCOPE_NOWHERE;
- nh->nh_dev = dev_get_by_index(fi->fib_nh->nh_oif);
- err = -ENODEV;
- if (nh->nh_dev == NULL)
- goto failure;
- } else {
- change_nexthops(fi) {
- if ((err = dn_fib_check_nh(r, fi, nh)) != 0)
- goto failure;
- } endfor_nexthops(fi)
- }
-
- if (fi->fib_prefsrc) {
- if (r->rtm_type != RTN_LOCAL || rta->rta_dst == NULL ||
- memcmp(&fi->fib_prefsrc, rta->rta_dst, 2))
- if (dnet_addr_type(fi->fib_prefsrc) != RTN_LOCAL)
- goto err_inval;
- }
-
-link_it:
- if ((ofi = dn_fib_find_info(fi)) != NULL) {
- fi->fib_dead = 1;
- dn_fib_free_info(fi);
- ofi->fib_treeref++;
- return ofi;
- }
-
- fi->fib_treeref++;
- atomic_inc(&fi->fib_clntref);
- spin_lock(&dn_fib_info_lock);
- fi->fib_next = dn_fib_info_list;
- fi->fib_prev = NULL;
- if (dn_fib_info_list)
- dn_fib_info_list->fib_prev = fi;
- dn_fib_info_list = fi;
- spin_unlock(&dn_fib_info_lock);
- return fi;
-
-err_inval:
- err = -EINVAL;
-
-failure:
- *errp = err;
- if (fi) {
- fi->fib_dead = 1;
- dn_fib_free_info(fi);
- }
-
- return NULL;
-}
-
-int dn_fib_semantic_match(int type, struct dn_fib_info *fi, const struct flowi *fl, struct dn_fib_res *res)
-{
- int err = dn_fib_props[type].error;
-
- if (err == 0) {
- if (fi->fib_flags & RTNH_F_DEAD)
- return 1;
-
- res->fi = fi;
-
- switch(type) {
- case RTN_NAT:
- DN_FIB_RES_RESET(*res);
- atomic_inc(&fi->fib_clntref);
- return 0;
- case RTN_UNICAST:
- case RTN_LOCAL:
- for_nexthops(fi) {
- if (nh->nh_flags & RTNH_F_DEAD)
- continue;
- if (!fl->oif || fl->oif == nh->nh_oif)
- break;
- }
- if (nhsel < fi->fib_nhs) {
- res->nh_sel = nhsel;
- atomic_inc(&fi->fib_clntref);
- return 0;
- }
- endfor_nexthops(fi);
- res->fi = NULL;
- return 1;
- default:
- if (net_ratelimit())
- printk("DECnet: impossible routing event : dn_fib_semantic_match type=%d\n", type);
- res->fi = NULL;
- return -EINVAL;
- }
- }
- return err;
-}
-
-void dn_fib_select_multipath(const struct flowi *fl, struct dn_fib_res *res)
-{
- struct dn_fib_info *fi = res->fi;
- int w;
-
- spin_lock_bh(&dn_fib_multipath_lock);
- if (fi->fib_power <= 0) {
- int power = 0;
- change_nexthops(fi) {
- if (!(nh->nh_flags&RTNH_F_DEAD)) {
- power += nh->nh_weight;
- nh->nh_power = nh->nh_weight;
- }
- } endfor_nexthops(fi);
- fi->fib_power = power;
- if (power < 0) {
- spin_unlock_bh(&dn_fib_multipath_lock);
- res->nh_sel = 0;
- return;
- }
- }
-
- w = jiffies % fi->fib_power;
-
- change_nexthops(fi) {
- if (!(nh->nh_flags&RTNH_F_DEAD) && nh->nh_power) {
- if ((w -= nh->nh_power) <= 0) {
- nh->nh_power--;
- fi->fib_power--;
- res->nh_sel = nhsel;
- spin_unlock_bh(&dn_fib_multipath_lock);
- return;
- }
- }
- } endfor_nexthops(fi);
- res->nh_sel = 0;
- spin_unlock_bh(&dn_fib_multipath_lock);
-}
-
-
-static int dn_fib_check_attr(struct rtmsg *r, struct rtattr **rta)
-{
- int i;
-
- for(i = 1; i <= RTA_MAX; i++) {
- struct rtattr *attr = rta[i-1];
- if (attr) {
- if (RTA_PAYLOAD(attr) < 4 && RTA_PAYLOAD(attr) != 2)
- return -EINVAL;
- if (i != RTA_MULTIPATH && i != RTA_METRICS &&
- i != RTA_TABLE)
- rta[i-1] = (struct rtattr *)RTA_DATA(attr);
- }
- }
-
- return 0;
-}
-
-int dn_fib_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
-{
- struct dn_fib_table *tb;
- struct rtattr **rta = arg;
- struct rtmsg *r = NLMSG_DATA(nlh);
-
- if (dn_fib_check_attr(r, rta))
- return -EINVAL;
-
- tb = dn_fib_get_table(rtm_get_table(rta, r->rtm_table), 0);
- if (tb)
- return tb->delete(tb, r, (struct dn_kern_rta *)rta, nlh, &NETLINK_CB(skb));
-
- return -ESRCH;
-}
-
-int dn_fib_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
-{
- struct dn_fib_table *tb;
- struct rtattr **rta = arg;
- struct rtmsg *r = NLMSG_DATA(nlh);
-
- if (dn_fib_check_attr(r, rta))
- return -EINVAL;
-
- tb = dn_fib_get_table(rtm_get_table(rta, r->rtm_table), 1);
- if (tb)
- return tb->insert(tb, r, (struct dn_kern_rta *)rta, nlh, &NETLINK_CB(skb));
-
- return -ENOBUFS;
-}
-
-static void fib_magic(int cmd, int type, __le16 dst, int dst_len, struct dn_ifaddr *ifa)
-{
- struct dn_fib_table *tb;
- struct {
- struct nlmsghdr nlh;
- struct rtmsg rtm;
- } req;
- struct dn_kern_rta rta;
-
- memset(&req.rtm, 0, sizeof(req.rtm));
- memset(&rta, 0, sizeof(rta));
-
- if (type == RTN_UNICAST)
- tb = dn_fib_get_table(RT_MIN_TABLE, 1);
- else
- tb = dn_fib_get_table(RT_TABLE_LOCAL, 1);
-
- if (tb == NULL)
- return;
-
- req.nlh.nlmsg_len = sizeof(req);
- req.nlh.nlmsg_type = cmd;
- req.nlh.nlmsg_flags = NLM_F_REQUEST|NLM_F_CREATE|NLM_F_APPEND;
- req.nlh.nlmsg_pid = 0;
- req.nlh.nlmsg_seq = 0;
-
- req.rtm.rtm_dst_len = dst_len;
- req.rtm.rtm_table = tb->n;
- req.rtm.rtm_protocol = RTPROT_KERNEL;
- req.rtm.rtm_scope = (type != RTN_LOCAL ? RT_SCOPE_LINK : RT_SCOPE_HOST);
- req.rtm.rtm_type = type;
-
- rta.rta_dst = &dst;
- rta.rta_prefsrc = &ifa->ifa_local;
- rta.rta_oif = &ifa->ifa_dev->dev->ifindex;
-
- if (cmd == RTM_NEWROUTE)
- tb->insert(tb, &req.rtm, &rta, &req.nlh, NULL);
- else
- tb->delete(tb, &req.rtm, &rta, &req.nlh, NULL);
-}
-
-static void dn_fib_add_ifaddr(struct dn_ifaddr *ifa)
-{
-
- fib_magic(RTM_NEWROUTE, RTN_LOCAL, ifa->ifa_local, 16, ifa);
-
-#if 0
- if (!(dev->flags&IFF_UP))
- return;
- /* In the future, we will want to add default routes here */
-
-#endif
-}
-
-static void dn_fib_del_ifaddr(struct dn_ifaddr *ifa)
-{
- int found_it = 0;
- struct net_device *dev;
- struct dn_dev *dn_db;
- struct dn_ifaddr *ifa2;
-
- ASSERT_RTNL();
-
- /* Scan device list */
- read_lock(&dev_base_lock);
- for(dev = dev_base; dev; dev = dev->next) {
- dn_db = dev->dn_ptr;
- if (dn_db == NULL)
- continue;
- for(ifa2 = dn_db->ifa_list; ifa2; ifa2 = ifa2->ifa_next) {
- if (ifa2->ifa_local == ifa->ifa_local) {
- found_it = 1;
- break;
- }
- }
- }
- read_unlock(&dev_base_lock);
-
- if (found_it == 0) {
- fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 16, ifa);
-
- if (dnet_addr_type(ifa->ifa_local) != RTN_LOCAL) {
- if (dn_fib_sync_down(ifa->ifa_local, NULL, 0))
- dn_fib_flush();
- }
- }
-}
-
-static void dn_fib_disable_addr(struct net_device *dev, int force)
-{
- if (dn_fib_sync_down(0, dev, force))
- dn_fib_flush();
- dn_rt_cache_flush(0);
- neigh_ifdown(&dn_neigh_table, dev);
-}
-
-static int dn_fib_dnaddr_event(struct notifier_block *this, unsigned long event, void *ptr)
-{
- struct dn_ifaddr *ifa = (struct dn_ifaddr *)ptr;
-
- switch(event) {
- case NETDEV_UP:
- dn_fib_add_ifaddr(ifa);
- dn_fib_sync_up(ifa->ifa_dev->dev);
- dn_rt_cache_flush(-1);
- break;
- case NETDEV_DOWN:
- dn_fib_del_ifaddr(ifa);
- if (ifa->ifa_dev && ifa->ifa_dev->ifa_list == NULL) {
- dn_fib_disable_addr(ifa->ifa_dev->dev, 1);
- } else {
- dn_rt_cache_flush(-1);
- }
- break;
- }
- return NOTIFY_DONE;
-}
-
-static int dn_fib_sync_down(__le16 local, struct net_device *dev, int force)
-{
- int ret = 0;
- int scope = RT_SCOPE_NOWHERE;
-
- if (force)
- scope = -1;
-
- for_fib_info() {
- /*
- * This makes no sense for DECnet.... we will almost
- * certainly have more than one local address the same
- * over all our interfaces. It needs thinking about
- * some more.
- */
- if (local && fi->fib_prefsrc == local) {
- fi->fib_flags |= RTNH_F_DEAD;
- ret++;
- } else if (dev && fi->fib_nhs) {
- int dead = 0;
-
- change_nexthops(fi) {
- if (nh->nh_flags&RTNH_F_DEAD)
- dead++;
- else if (nh->nh_dev == dev &&
- nh->nh_scope != scope) {
- spin_lock_bh(&dn_fib_multipath_lock);
- nh->nh_flags |= RTNH_F_DEAD;
- fi->fib_power -= nh->nh_power;
- nh->nh_power = 0;
- spin_unlock_bh(&dn_fib_multipath_lock);
- dead++;
- }
- } endfor_nexthops(fi)
- if (dead == fi->fib_nhs) {
- fi->fib_flags |= RTNH_F_DEAD;
- ret++;
- }
- }
- } endfor_fib_info();
- return ret;
-}
-
-
-static int dn_fib_sync_up(struct net_device *dev)
-{
- int ret = 0;
-
- if (!(dev->flags&IFF_UP))
- return 0;
-
- for_fib_info() {
- int alive = 0;
-
- change_nexthops(fi) {
- if (!(nh->nh_flags&RTNH_F_DEAD)) {
- alive++;
- continue;
- }
- if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP))
- continue;
- if (nh->nh_dev != dev || dev->dn_ptr == NULL)
- continue;
- alive++;
- spin_lock_bh(&dn_fib_multipath_lock);
- nh->nh_power = 0;
- nh->nh_flags &= ~RTNH_F_DEAD;
- spin_unlock_bh(&dn_fib_multipath_lock);
- } endfor_nexthops(fi);
-
- if (alive > 0) {
- fi->fib_flags &= ~RTNH_F_DEAD;
- ret++;
- }
- } endfor_fib_info();
- return ret;
-}
-
-static struct notifier_block dn_fib_dnaddr_notifier = {
- .notifier_call = dn_fib_dnaddr_event,
-};
-
-void __exit dn_fib_cleanup(void)
-{
- dn_fib_table_cleanup();
- dn_fib_rules_cleanup();
-
- unregister_dnaddr_notifier(&dn_fib_dnaddr_notifier);
-}
-
-
-void __init dn_fib_init(void)
-{
-
- dn_fib_table_init();
- dn_fib_rules_init();
-
- register_dnaddr_notifier(&dn_fib_dnaddr_notifier);
-}
-
-
diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c
deleted file mode 100644
index 7322bb36e825..000000000000
--- a/net/decnet/dn_neigh.c
+++ /dev/null
@@ -1,621 +0,0 @@
-/*
- * DECnet An implementation of the DECnet protocol suite for the LINUX
- * operating system. DECnet is implemented using the BSD Socket
- * interface as the means of communication with the user level.
- *
- * DECnet Neighbour Functions (Adjacency Database and
- * On-Ethernet Cache)
- *
- * Author: Steve Whitehouse <SteveW@ACM.org>
- *
- *
- * Changes:
- * Steve Whitehouse : Fixed router listing routine
- * Steve Whitehouse : Added error_report functions
- * Steve Whitehouse : Added default router detection
- * Steve Whitehouse : Hop counts in outgoing messages
- * Steve Whitehouse : Fixed src/dst in outgoing messages so
- * forwarding now stands a good chance of
- * working.
- * Steve Whitehouse : Fixed neighbour states (for now anyway).
- * Steve Whitehouse : Made error_report functions dummies. This
- * is not the right place to return skbs.
- * Steve Whitehouse : Convert to seq_file
- *
- */
-
-#include <linux/net.h>
-#include <linux/module.h>
-#include <linux/socket.h>
-#include <linux/if_arp.h>
-#include <linux/if_ether.h>
-#include <linux/init.h>
-#include <linux/proc_fs.h>
-#include <linux/string.h>
-#include <linux/netfilter_decnet.h>
-#include <linux/spinlock.h>
-#include <linux/seq_file.h>
-#include <linux/rcupdate.h>
-#include <linux/jhash.h>
-#include <asm/atomic.h>
-#include <net/neighbour.h>
-#include <net/dst.h>
-#include <net/flow.h>
-#include <net/dn.h>
-#include <net/dn_dev.h>
-#include <net/dn_neigh.h>
-#include <net/dn_route.h>
-
-static u32 dn_neigh_hash(const void *pkey, const struct net_device *dev);
-static int dn_neigh_construct(struct neighbour *);
-static void dn_long_error_report(struct neighbour *, struct sk_buff *);
-static void dn_short_error_report(struct neighbour *, struct sk_buff *);
-static int dn_long_output(struct sk_buff *);
-static int dn_short_output(struct sk_buff *);
-static int dn_phase3_output(struct sk_buff *);
-
-
-/*
- * For talking to broadcast devices: Ethernet & PPP
- */
-static struct neigh_ops dn_long_ops = {
- .family = AF_DECnet,
- .error_report = dn_long_error_report,
- .output = dn_long_output,
- .connected_output = dn_long_output,
- .hh_output = dev_queue_xmit,
- .queue_xmit = dev_queue_xmit,
-};
-
-/*
- * For talking to pointopoint and multidrop devices: DDCMP and X.25
- */
-static struct neigh_ops dn_short_ops = {
- .family = AF_DECnet,
- .error_report = dn_short_error_report,
- .output = dn_short_output,
- .connected_output = dn_short_output,
- .hh_output = dev_queue_xmit,
- .queue_xmit = dev_queue_xmit,
-};
-
-/*
- * For talking to DECnet phase III nodes
- */
-static struct neigh_ops dn_phase3_ops = {
- .family = AF_DECnet,
- .error_report = dn_short_error_report, /* Can use short version here */
- .output = dn_phase3_output,
- .connected_output = dn_phase3_output,
- .hh_output = dev_queue_xmit,
- .queue_xmit = dev_queue_xmit
-};
-
-struct neigh_table dn_neigh_table = {
- .family = PF_DECnet,
- .entry_size = sizeof(struct dn_neigh),
- .key_len = sizeof(__le16),
- .hash = dn_neigh_hash,
- .constructor = dn_neigh_construct,
- .id = "dn_neigh_cache",
- .parms ={
- .tbl = &dn_neigh_table,
- .base_reachable_time = 30 * HZ,
- .retrans_time = 1 * HZ,
- .gc_staletime = 60 * HZ,
- .reachable_time = 30 * HZ,
- .delay_probe_time = 5 * HZ,
- .queue_len = 3,
- .ucast_probes = 0,
- .app_probes = 0,
- .mcast_probes = 0,
- .anycast_delay = 0,
- .proxy_delay = 0,
- .proxy_qlen = 0,
- .locktime = 1 * HZ,
- },
- .gc_interval = 30 * HZ,
- .gc_thresh1 = 128,
- .gc_thresh2 = 512,
- .gc_thresh3 = 1024,
-};
-
-static u32 dn_neigh_hash(const void *pkey, const struct net_device *dev)
-{
- return jhash_2words(*(__u16 *)pkey, 0, dn_neigh_table.hash_rnd);
-}
-
-static int dn_neigh_construct(struct neighbour *neigh)
-{
- struct net_device *dev = neigh->dev;
- struct dn_neigh *dn = (struct dn_neigh *)neigh;
- struct dn_dev *dn_db;
- struct neigh_parms *parms;
-
- rcu_read_lock();
- dn_db = rcu_dereference(dev->dn_ptr);
- if (dn_db == NULL) {
- rcu_read_unlock();
- return -EINVAL;
- }
-
- parms = dn_db->neigh_parms;
- if (!parms) {
- rcu_read_unlock();
- return -EINVAL;
- }
-
- __neigh_parms_put(neigh->parms);
- neigh->parms = neigh_parms_clone(parms);
-
- if (dn_db->use_long)
- neigh->ops = &dn_long_ops;
- else
- neigh->ops = &dn_short_ops;
- rcu_read_unlock();
-
- if (dn->flags & DN_NDFLAG_P3)
- neigh->ops = &dn_phase3_ops;
-
- neigh->nud_state = NUD_NOARP;
- neigh->output = neigh->ops->connected_output;
-
- if ((dev->type == ARPHRD_IPGRE) || (dev->flags & IFF_POINTOPOINT))
- memcpy(neigh->ha, dev->broadcast, dev->addr_len);
- else if ((dev->type == ARPHRD_ETHER) || (dev->type == ARPHRD_LOOPBACK))
- dn_dn2eth(neigh->ha, dn->addr);
- else {
- if (net_ratelimit())
- printk(KERN_DEBUG "Trying to create neigh for hw %d\n", dev->type);
- return -EINVAL;
- }
-
- /*
- * Make an estimate of the remote block size by assuming that its
- * two less then the device mtu, which it true for ethernet (and
- * other things which support long format headers) since there is
- * an extra length field (of 16 bits) which isn't part of the
- * ethernet headers and which the DECnet specs won't admit is part
- * of the DECnet routing headers either.
- *
- * If we over estimate here its no big deal, the NSP negotiations
- * will prevent us from sending packets which are too large for the
- * remote node to handle. In any case this figure is normally updated
- * by a hello message in most cases.
- */
- dn->blksize = dev->mtu - 2;
-
- return 0;
-}
-
-static void dn_long_error_report(struct neighbour *neigh, struct sk_buff *skb)
-{
- printk(KERN_DEBUG "dn_long_error_report: called\n");
- kfree_skb(skb);
-}
-
-
-static void dn_short_error_report(struct neighbour *neigh, struct sk_buff *skb)
-{
- printk(KERN_DEBUG "dn_short_error_report: called\n");
- kfree_skb(skb);
-}
-
-static int dn_neigh_output_packet(struct sk_buff *skb)
-{
- struct dst_entry *dst = skb->dst;
- struct dn_route *rt = (struct dn_route *)dst;
- struct neighbour *neigh = dst->neighbour;
- struct net_device *dev = neigh->dev;
- char mac_addr[ETH_ALEN];
-
- dn_dn2eth(mac_addr, rt->rt_local_src);
- if (!dev->hard_header || dev->hard_header(skb, dev, ntohs(skb->protocol), neigh->ha, mac_addr, skb->len) >= 0)
- return neigh->ops->queue_xmit(skb);
-
- if (net_ratelimit())
- printk(KERN_DEBUG "dn_neigh_output_packet: oops, can't send packet\n");
-
- kfree_skb(skb);
- return -EINVAL;
-}
-
-static int dn_long_output(struct sk_buff *skb)
-{
- struct dst_entry *dst = skb->dst;
- struct neighbour *neigh = dst->neighbour;
- struct net_device *dev = neigh->dev;
- int headroom = dev->hard_header_len + sizeof(struct dn_long_packet) + 3;
- unsigned char *data;
- struct dn_long_packet *lp;
- struct dn_skb_cb *cb = DN_SKB_CB(skb);
-
-
- if (skb_headroom(skb) < headroom) {
- struct sk_buff *skb2 = skb_realloc_headroom(skb, headroom);
- if (skb2 == NULL) {
- if (net_ratelimit())
- printk(KERN_CRIT "dn_long_output: no memory\n");
- kfree_skb(skb);
- return -ENOBUFS;
- }
- kfree_skb(skb);
- skb = skb2;
- if (net_ratelimit())
- printk(KERN_INFO "dn_long_output: Increasing headroom\n");
- }
-
- data = skb_push(skb, sizeof(struct dn_long_packet) + 3);
- lp = (struct dn_long_packet *)(data+3);
-
- *((__le16 *)data) = dn_htons(skb->len - 2);
- *(data + 2) = 1 | DN_RT_F_PF; /* Padding */
-
- lp->msgflg = DN_RT_PKT_LONG|(cb->rt_flags&(DN_RT_F_IE|DN_RT_F_RQR|DN_RT_F_RTS));
- lp->d_area = lp->d_subarea = 0;
- dn_dn2eth(lp->d_id, cb->dst);
- lp->s_area = lp->s_subarea = 0;
- dn_dn2eth(lp->s_id, cb->src);
- lp->nl2 = 0;
- lp->visit_ct = cb->hops & 0x3f;
- lp->s_class = 0;
- lp->pt = 0;
-
- skb->nh.raw = skb->data;
-
- return NF_HOOK(PF_DECnet, NF_DN_POST_ROUTING, skb, NULL, neigh->dev, dn_neigh_output_packet);
-}
-
-static int dn_short_output(struct sk_buff *skb)
-{
- struct dst_entry *dst = skb->dst;
- struct neighbour *neigh = dst->neighbour;
- struct net_device *dev = neigh->dev;
- int headroom = dev->hard_header_len + sizeof(struct dn_short_packet) + 2;
- struct dn_short_packet *sp;
- unsigned char *data;
- struct dn_skb_cb *cb = DN_SKB_CB(skb);
-
-
- if (skb_headroom(skb) < headroom) {
- struct sk_buff *skb2 = skb_realloc_headroom(skb, headroom);
- if (skb2 == NULL) {
- if (net_ratelimit())
- printk(KERN_CRIT "dn_short_output: no memory\n");
- kfree_skb(skb);
- return -ENOBUFS;
- }
- kfree_skb(skb);
- skb = skb2;
- if (net_ratelimit())
- printk(KERN_INFO "dn_short_output: Increasing headroom\n");
- }
-
- data = skb_push(skb, sizeof(struct dn_short_packet) + 2);
- *((__le16 *)data) = dn_htons(skb->len - 2);
- sp = (struct dn_short_packet *)(data+2);
-
- sp->msgflg = DN_RT_PKT_SHORT|(cb->rt_flags&(DN_RT_F_RQR|DN_RT_F_RTS));
- sp->dstnode = cb->dst;
- sp->srcnode = cb->src;
- sp->forward = cb->hops & 0x3f;
-
- skb->nh.raw = skb->data;
-
- return NF_HOOK(PF_DECnet, NF_DN_POST_ROUTING, skb, NULL, neigh->dev, dn_neigh_output_packet);
-}
-
-/*
- * Phase 3 output is the same is short output, execpt that
- * it clears the area bits before transmission.
- */
-static int dn_phase3_output(struct sk_buff *skb)
-{
- struct dst_entry *dst = skb->dst;
- struct neighbour *neigh = dst->neighbour;
- struct net_device *dev = neigh->dev;
- int headroom = dev->hard_header_len + sizeof(struct dn_short_packet) + 2;
- struct dn_short_packet *sp;
- unsigned char *data;
- struct dn_skb_cb *cb = DN_SKB_CB(skb);
-
- if (skb_headroom(skb) < headroom) {
- struct sk_buff *skb2 = skb_realloc_headroom(skb, headroom);
- if (skb2 == NULL) {
- if (net_ratelimit())
- printk(KERN_CRIT "dn_phase3_output: no memory\n");
- kfree_skb(skb);
- return -ENOBUFS;
- }
- kfree_skb(skb);
- skb = skb2;
- if (net_ratelimit())
- printk(KERN_INFO "dn_phase3_output: Increasing headroom\n");
- }
-
- data = skb_push(skb, sizeof(struct dn_short_packet) + 2);
- *((__le16 *)data) = dn_htons(skb->len - 2);
- sp = (struct dn_short_packet *)(data + 2);
-
- sp->msgflg = DN_RT_PKT_SHORT|(cb->rt_flags&(DN_RT_F_RQR|DN_RT_F_RTS));
- sp->dstnode = cb->dst & dn_htons(0x03ff);
- sp->srcnode = cb->src & dn_htons(0x03ff);
- sp->forward = cb->hops & 0x3f;
-
- skb->nh.raw = skb->data;
-
- return NF_HOOK(PF_DECnet, NF_DN_POST_ROUTING, skb, NULL, neigh->dev, dn_neigh_output_packet);
-}
-
-/*
- * Unfortunately, the neighbour code uses the device in its hash
- * function, so we don't get any advantage from it. This function
- * basically does a neigh_lookup(), but without comparing the device
- * field. This is required for the On-Ethernet cache
- */
-
-/*
- * Pointopoint link receives a hello message
- */
-void dn_neigh_pointopoint_hello(struct sk_buff *skb)
-{
- kfree_skb(skb);
-}
-
-/*
- * Ethernet router hello message received
- */
-int dn_neigh_router_hello(struct sk_buff *skb)
-{
- struct rtnode_hello_message *msg = (struct rtnode_hello_message *)skb->data;
-
- struct neighbour *neigh;
- struct dn_neigh *dn;
- struct dn_dev *dn_db;
- __le16 src;
-
- src = dn_eth2dn(msg->id);
-
- neigh = __neigh_lookup(&dn_neigh_table, &src, skb->dev, 1);
-
- dn = (struct dn_neigh *)neigh;
-
- if (neigh) {
- write_lock(&neigh->lock);
-
- neigh->used = jiffies;
- dn_db = (struct dn_dev *)neigh->dev->dn_ptr;
-
- if (!(neigh->nud_state & NUD_PERMANENT)) {
- neigh->updated = jiffies;
-
- if (neigh->dev->type == ARPHRD_ETHER)
- memcpy(neigh->ha, &eth_hdr(skb)->h_source, ETH_ALEN);
-
- dn->blksize = dn_ntohs(msg->blksize);
- dn->priority = msg->priority;
-
- dn->flags &= ~DN_NDFLAG_P3;
-
- switch(msg->iinfo & DN_RT_INFO_TYPE) {
- case DN_RT_INFO_L1RT:
- dn->flags &=~DN_NDFLAG_R2;
- dn->flags |= DN_NDFLAG_R1;
- break;
- case DN_RT_INFO_L2RT:
- dn->flags |= DN_NDFLAG_R2;
- }
- }
-
- /* Only use routers in our area */
- if ((dn_ntohs(src)>>10) == (dn_ntohs((decnet_address))>>10)) {
- if (!dn_db->router) {
- dn_db->router = neigh_clone(neigh);
- } else {
- if (msg->priority > ((struct dn_neigh *)dn_db->router)->priority)
- neigh_release(xchg(&dn_db->router, neigh_clone(neigh)));
- }
- }
- write_unlock(&neigh->lock);
- neigh_release(neigh);
- }
-
- kfree_skb(skb);
- return 0;
-}
-
-/*
- * Endnode hello message received
- */
-int dn_neigh_endnode_hello(struct sk_buff *skb)
-{
- struct endnode_hello_message *msg = (struct endnode_hello_message *)skb->data;
- struct neighbour *neigh;
- struct dn_neigh *dn;
- __le16 src;
-
- src = dn_eth2dn(msg->id);
-
- neigh = __neigh_lookup(&dn_neigh_table, &src, skb->dev, 1);
-
- dn = (struct dn_neigh *)neigh;
-
- if (neigh) {
- write_lock(&neigh->lock);
-
- neigh->used = jiffies;
-
- if (!(neigh->nud_state & NUD_PERMANENT)) {
- neigh->updated = jiffies;
-
- if (neigh->dev->type == ARPHRD_ETHER)
- memcpy(neigh->ha, &eth_hdr(skb)->h_source, ETH_ALEN);
- dn->flags &= ~(DN_NDFLAG_R1 | DN_NDFLAG_R2);
- dn->blksize = dn_ntohs(msg->blksize);
- dn->priority = 0;
- }
-
- write_unlock(&neigh->lock);
- neigh_release(neigh);
- }
-
- kfree_skb(skb);
- return 0;
-}
-
-static char *dn_find_slot(char *base, int max, int priority)
-{
- int i;
- unsigned char *min = NULL;
-
- base += 6; /* skip first id */
-
- for(i = 0; i < max; i++) {
- if (!min || (*base < *min))
- min = base;
- base += 7; /* find next priority */
- }
-
- if (!min)
- return NULL;
-
- return (*min < priority) ? (min - 6) : NULL;
-}
-
-struct elist_cb_state {
- struct net_device *dev;
- unsigned char *ptr;
- unsigned char *rs;
- int t, n;
-};
-
-static void neigh_elist_cb(struct neighbour *neigh, void *_info)
-{
- struct elist_cb_state *s = _info;
- struct dn_neigh *dn;
-
- if (neigh->dev != s->dev)
- return;
-
- dn = (struct dn_neigh *) neigh;
- if (!(dn->flags & (DN_NDFLAG_R1|DN_NDFLAG_R2)))
- return;
-
- if (s->t == s->n)
- s->rs = dn_find_slot(s->ptr, s->n, dn->priority);
- else
- s->t++;
- if (s->rs == NULL)
- return;
-
- dn_dn2eth(s->rs, dn->addr);
- s->rs += 6;
- *(s->rs) = neigh->nud_state & NUD_CONNECTED ? 0x80 : 0x0;
- *(s->rs) |= dn->priority;
- s->rs++;
-}
-
-int dn_neigh_elist(struct net_device *dev, unsigned char *ptr, int n)
-{
- struct elist_cb_state state;
-
- state.dev = dev;
- state.t = 0;
- state.n = n;
- state.ptr = ptr;
- state.rs = ptr;
-
- neigh_for_each(&dn_neigh_table, neigh_elist_cb, &state);
-
- return state.t;
-}
-
-
-#ifdef CONFIG_PROC_FS
-
-static inline void dn_neigh_format_entry(struct seq_file *seq,
- struct neighbour *n)
-{
- struct dn_neigh *dn = (struct dn_neigh *) n;
- char buf[DN_ASCBUF_LEN];
-
- read_lock(&n->lock);
- seq_printf(seq, "%-7s %s%s%s %02x %02d %07ld %-8s\n",
- dn_addr2asc(dn_ntohs(dn->addr), buf),
- (dn->flags&DN_NDFLAG_R1) ? "1" : "-",
- (dn->flags&DN_NDFLAG_R2) ? "2" : "-",
- (dn->flags&DN_NDFLAG_P3) ? "3" : "-",
- dn->n.nud_state,
- atomic_read(&dn->n.refcnt),
- dn->blksize,
- (dn->n.dev) ? dn->n.dev->name : "?");
- read_unlock(&n->lock);
-}
-
-static int dn_neigh_seq_show(struct seq_file *seq, void *v)
-{
- if (v == SEQ_START_TOKEN) {
- seq_puts(seq, "Addr Flags State Use Blksize Dev\n");
- } else {
- dn_neigh_format_entry(seq, v);
- }
-
- return 0;
-}
-
-static void *dn_neigh_seq_start(struct seq_file *seq, loff_t *pos)
-{
- return neigh_seq_start(seq, pos, &dn_neigh_table,
- NEIGH_SEQ_NEIGH_ONLY);
-}
-
-static struct seq_operations dn_neigh_seq_ops = {
- .start = dn_neigh_seq_start,
- .next = neigh_seq_next,
- .stop = neigh_seq_stop,
- .show = dn_neigh_seq_show,
-};
-
-static int dn_neigh_seq_open(struct inode *inode, struct file *file)
-{
- struct seq_file *seq;
- int rc = -ENOMEM;
- struct neigh_seq_state *s = kzalloc(sizeof(*s), GFP_KERNEL);
-
- if (!s)
- goto out;
-
- rc = seq_open(file, &dn_neigh_seq_ops);
- if (rc)
- goto out_kfree;
-
- seq = file->private_data;
- seq->private = s;
-out:
- return rc;
-out_kfree:
- kfree(s);
- goto out;
-}
-
-static struct file_operations dn_neigh_seq_fops = {
- .owner = THIS_MODULE,
- .open = dn_neigh_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release_private,
-};
-
-#endif
-
-void __init dn_neigh_init(void)
-{
- neigh_table_init(&dn_neigh_table);
- proc_net_fops_create("decnet_neigh", S_IRUGO, &dn_neigh_seq_fops);
-}
-
-void __exit dn_neigh_cleanup(void)
-{
- proc_net_remove("decnet_neigh");
- neigh_table_clear(&dn_neigh_table);
-}
diff --git a/net/decnet/dn_nsp_in.c b/net/decnet/dn_nsp_in.c
deleted file mode 100644
index 39a6cf7fb566..000000000000
--- a/net/decnet/dn_nsp_in.c
+++ /dev/null
@@ -1,917 +0,0 @@
-/*
- * DECnet An implementation of the DECnet protocol suite for the LINUX
- * operating system. DECnet is implemented using the BSD Socket
- * interface as the means of communication with the user level.
- *
- * DECnet Network Services Protocol (Input)
- *
- * Author: Eduardo Marcelo Serrat <emserrat@geocities.com>
- *
- * Changes:
- *
- * Steve Whitehouse: Split into dn_nsp_in.c and dn_nsp_out.c from
- * original dn_nsp.c.
- * Steve Whitehouse: Updated to work with my new routing architecture.
- * Steve Whitehouse: Add changes from Eduardo Serrat's patches.
- * Steve Whitehouse: Put all ack handling code in a common routine.
- * Steve Whitehouse: Put other common bits into dn_nsp_rx()
- * Steve Whitehouse: More checks on skb->len to catch bogus packets
- * Fixed various race conditions and possible nasties.
- * Steve Whitehouse: Now handles returned conninit frames.
- * David S. Miller: New socket locking
- * Steve Whitehouse: Fixed lockup when socket filtering was enabled.
- * Paul Koning: Fix to push CC sockets into RUN when acks are
- * received.
- * Steve Whitehouse:
- * Patrick Caulfield: Checking conninits for correctness & sending of error
- * responses.
- * Steve Whitehouse: Added backlog congestion level return codes.
- * Patrick Caulfield:
- * Steve Whitehouse: Added flow control support (outbound)
- * Steve Whitehouse: Prepare for nonlinear skbs
- */
-
-/******************************************************************************
- (c) 1995-1998 E.M. Serrat emserrat@geocities.com
-
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-*******************************************************************************/
-
-#include <linux/errno.h>
-#include <linux/types.h>
-#include <linux/socket.h>
-#include <linux/in.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/timer.h>
-#include <linux/string.h>
-#include <linux/sockios.h>
-#include <linux/net.h>
-#include <linux/netdevice.h>
-#include <linux/inet.h>
-#include <linux/route.h>
-#include <net/sock.h>
-#include <net/tcp_states.h>
-#include <asm/system.h>
-#include <linux/fcntl.h>
-#include <linux/mm.h>
-#include <linux/termios.h>
-#include <linux/interrupt.h>
-#include <linux/proc_fs.h>
-#include <linux/stat.h>
-#include <linux/init.h>
-#include <linux/poll.h>
-#include <linux/netfilter_decnet.h>
-#include <net/neighbour.h>
-#include <net/dst.h>
-#include <net/dn.h>
-#include <net/dn_nsp.h>
-#include <net/dn_dev.h>
-#include <net/dn_route.h>
-
-extern int decnet_log_martians;
-
-static void dn_log_martian(struct sk_buff *skb, const char *msg)
-{
- if (decnet_log_martians && net_ratelimit()) {
- char *devname = skb->dev ? skb->dev->name : "???";
- struct dn_skb_cb *cb = DN_SKB_CB(skb);
- printk(KERN_INFO "DECnet: Martian packet (%s) dev=%s src=0x%04hx dst=0x%04hx srcport=0x%04hx dstport=0x%04hx\n", msg, devname, dn_ntohs(cb->src), dn_ntohs(cb->dst), dn_ntohs(cb->src_port), dn_ntohs(cb->dst_port));
- }
-}
-
-/*
- * For this function we've flipped the cross-subchannel bit
- * if the message is an otherdata or linkservice message. Thus
- * we can use it to work out what to update.
- */
-static void dn_ack(struct sock *sk, struct sk_buff *skb, unsigned short ack)
-{
- struct dn_scp *scp = DN_SK(sk);
- unsigned short type = ((ack >> 12) & 0x0003);
- int wakeup = 0;
-
- switch(type) {
- case 0: /* ACK - Data */
- if (dn_after(ack, scp->ackrcv_dat)) {
- scp->ackrcv_dat = ack & 0x0fff;
- wakeup |= dn_nsp_check_xmit_queue(sk, skb, &scp->data_xmit_queue, ack);
- }
- break;
- case 1: /* NAK - Data */
- break;
- case 2: /* ACK - OtherData */
- if (dn_after(ack, scp->ackrcv_oth)) {
- scp->ackrcv_oth = ack & 0x0fff;
- wakeup |= dn_nsp_check_xmit_queue(sk, skb, &scp->other_xmit_queue, ack);
- }
- break;
- case 3: /* NAK - OtherData */
- break;
- }
-
- if (wakeup && !sock_flag(sk, SOCK_DEAD))
- sk->sk_state_change(sk);
-}
-
-/*
- * This function is a universal ack processor.
- */
-static int dn_process_ack(struct sock *sk, struct sk_buff *skb, int oth)
-{
- __le16 *ptr = (__le16 *)skb->data;
- int len = 0;
- unsigned short ack;
-
- if (skb->len < 2)
- return len;
-
- if ((ack = dn_ntohs(*ptr)) & 0x8000) {
- skb_pull(skb, 2);
- ptr++;
- len += 2;
- if ((ack & 0x4000) == 0) {
- if (oth)
- ack ^= 0x2000;
- dn_ack(sk, skb, ack);
- }
- }
-
- if (skb->len < 2)
- return len;
-
- if ((ack = dn_ntohs(*ptr)) & 0x8000) {
- skb_pull(skb, 2);
- len += 2;
- if ((ack & 0x4000) == 0) {
- if (oth)
- ack ^= 0x2000;
- dn_ack(sk, skb, ack);
- }
- }
-
- return len;
-}
-
-
-/**
- * dn_check_idf - Check an image data field format is correct.
- * @pptr: Pointer to pointer to image data
- * @len: Pointer to length of image data
- * @max: The maximum allowed length of the data in the image data field
- * @follow_on: Check that this many bytes exist beyond the end of the image data
- *
- * Returns: 0 if ok, -1 on error
- */
-static inline int dn_check_idf(unsigned char **pptr, int *len, unsigned char max, unsigned char follow_on)
-{
- unsigned char *ptr = *pptr;
- unsigned char flen = *ptr++;
-
- (*len)--;
- if (flen > max)
- return -1;
- if ((flen + follow_on) > *len)
- return -1;
-
- *len -= flen;
- *pptr = ptr + flen;
- return 0;
-}
-
-/*
- * Table of reason codes to pass back to node which sent us a badly
- * formed message, plus text messages for the log. A zero entry in
- * the reason field means "don't reply" otherwise a disc init is sent with
- * the specified reason code.
- */
-static struct {
- unsigned short reason;
- const char *text;
-} ci_err_table[] = {
- { 0, "CI: Truncated message" },
- { NSP_REASON_ID, "CI: Destination username error" },
- { NSP_REASON_ID, "CI: Destination username type" },
- { NSP_REASON_US, "CI: Source username error" },
- { 0, "CI: Truncated at menuver" },
- { 0, "CI: Truncated before access or user data" },
- { NSP_REASON_IO, "CI: Access data format error" },
- { NSP_REASON_IO, "CI: User data format error" }
-};
-
-/*
- * This function uses a slightly different lookup method
- * to find its sockets, since it searches on object name/number
- * rather than port numbers. Various tests are done to ensure that
- * the incoming data is in the correct format before it is queued to
- * a socket.
- */
-static struct sock *dn_find_listener(struct sk_buff *skb, unsigned short *reason)
-{
- struct dn_skb_cb *cb = DN_SKB_CB(skb);
- struct nsp_conn_init_msg *msg = (struct nsp_conn_init_msg *)skb->data;
- struct sockaddr_dn dstaddr;
- struct sockaddr_dn srcaddr;
- unsigned char type = 0;
- int dstlen;
- int srclen;
- unsigned char *ptr;
- int len;
- int err = 0;
- unsigned char menuver;
-
- memset(&dstaddr, 0, sizeof(struct sockaddr_dn));
- memset(&srcaddr, 0, sizeof(struct sockaddr_dn));
-
- /*
- * 1. Decode & remove message header
- */
- cb->src_port = msg->srcaddr;
- cb->dst_port = msg->dstaddr;
- cb->services = msg->services;
- cb->info = msg->info;
- cb->segsize = dn_ntohs(msg->segsize);
-
- if (!pskb_may_pull(skb, sizeof(*msg)))
- goto err_out;
-
- skb_pull(skb, sizeof(*msg));
-
- len = skb->len;
- ptr = skb->data;
-
- /*
- * 2. Check destination end username format
- */
- dstlen = dn_username2sockaddr(ptr, len, &dstaddr, &type);
- err++;
- if (dstlen < 0)
- goto err_out;
-
- err++;
- if (type > 1)
- goto err_out;
-
- len -= dstlen;
- ptr += dstlen;
-
- /*
- * 3. Check source end username format
- */
- srclen = dn_username2sockaddr(ptr, len, &srcaddr, &type);
- err++;
- if (srclen < 0)
- goto err_out;
-
- len -= srclen;
- ptr += srclen;
- err++;
- if (len < 1)
- goto err_out;
-
- menuver = *ptr;
- ptr++;
- len--;
-
- /*
- * 4. Check that optional data actually exists if menuver says it does
- */
- err++;
- if ((menuver & (DN_MENUVER_ACC | DN_MENUVER_USR)) && (len < 1))
- goto err_out;
-
- /*
- * 5. Check optional access data format
- */
- err++;
- if (menuver & DN_MENUVER_ACC) {
- if (dn_check_idf(&ptr, &len, 39, 1))
- goto err_out;
- if (dn_check_idf(&ptr, &len, 39, 1))
- goto err_out;
- if (dn_check_idf(&ptr, &len, 39, (menuver & DN_MENUVER_USR) ? 1 : 0))
- goto err_out;
- }
-
- /*
- * 6. Check optional user data format
- */
- err++;
- if (menuver & DN_MENUVER_USR) {
- if (dn_check_idf(&ptr, &len, 16, 0))
- goto err_out;
- }
-
- /*
- * 7. Look up socket based on destination end username
- */
- return dn_sklist_find_listener(&dstaddr);
-err_out:
- dn_log_martian(skb, ci_err_table[err].text);
- *reason = ci_err_table[err].reason;
- return NULL;
-}
-
-
-static void dn_nsp_conn_init(struct sock *sk, struct sk_buff *skb)
-{
- if (sk_acceptq_is_full(sk)) {
- kfree_skb(skb);
- return;
- }
-
- sk->sk_ack_backlog++;
- skb_queue_tail(&sk->sk_receive_queue, skb);
- sk->sk_state_change(sk);
-}
-
-static void dn_nsp_conn_conf(struct sock *sk, struct sk_buff *skb)
-{
- struct dn_skb_cb *cb = DN_SKB_CB(skb);
- struct dn_scp *scp = DN_SK(sk);
- unsigned char *ptr;
-
- if (skb->len < 4)
- goto out;
-
- ptr = skb->data;
- cb->services = *ptr++;
- cb->info = *ptr++;
- cb->segsize = dn_ntohs(*(__le16 *)ptr);
-
- if ((scp->state == DN_CI) || (scp->state == DN_CD)) {
- scp->persist = 0;
- scp->addrrem = cb->src_port;
- sk->sk_state = TCP_ESTABLISHED;
- scp->state = DN_RUN;
- scp->services_rem = cb->services;
- scp->info_rem = cb->info;
- scp->segsize_rem = cb->segsize;
-
- if ((scp->services_rem & NSP_FC_MASK) == NSP_FC_NONE)
- scp->max_window = decnet_no_fc_max_cwnd;
-
- if (skb->len > 0) {
- u16 dlen = *skb->data;
- if ((dlen <= 16) && (dlen <= skb->len)) {
- scp->conndata_in.opt_optl = dn_htons(dlen);
- memcpy(scp->conndata_in.opt_data, skb->data + 1, dlen);
- }
- }
- dn_nsp_send_link(sk, DN_NOCHANGE, 0);
- if (!sock_flag(sk, SOCK_DEAD))
- sk->sk_state_change(sk);
- }
-
-out:
- kfree_skb(skb);
-}
-
-static void dn_nsp_conn_ack(struct sock *sk, struct sk_buff *skb)
-{
- struct dn_scp *scp = DN_SK(sk);
-
- if (scp->state == DN_CI) {
- scp->state = DN_CD;
- scp->persist = 0;
- }
-
- kfree_skb(skb);
-}
-
-static void dn_nsp_disc_init(struct sock *sk, struct sk_buff *skb)
-{
- struct dn_scp *scp = DN_SK(sk);
- struct dn_skb_cb *cb = DN_SKB_CB(skb);
- unsigned short reason;
-
- if (skb->len < 2)
- goto out;
-
- reason = dn_ntohs(*(__le16 *)skb->data);
- skb_pull(skb, 2);
-
- scp->discdata_in.opt_status = dn_htons(reason);
- scp->discdata_in.opt_optl = 0;
- memset(scp->discdata_in.opt_data, 0, 16);
-
- if (skb->len > 0) {
- u16 dlen = *skb->data;
- if ((dlen <= 16) && (dlen <= skb->len)) {
- scp->discdata_in.opt_optl = dn_htons(dlen);
- memcpy(scp->discdata_in.opt_data, skb->data + 1, dlen);
- }
- }
-
- scp->addrrem = cb->src_port;
- sk->sk_state = TCP_CLOSE;
-
- switch(scp->state) {
- case DN_CI:
- case DN_CD:
- scp->state = DN_RJ;
- sk->sk_err = ECONNREFUSED;
- break;
- case DN_RUN:
- sk->sk_shutdown |= SHUTDOWN_MASK;
- scp->state = DN_DN;
- break;
- case DN_DI:
- scp->state = DN_DIC;
- break;
- }
-
- if (!sock_flag(sk, SOCK_DEAD)) {
- if (sk->sk_socket->state != SS_UNCONNECTED)
- sk->sk_socket->state = SS_DISCONNECTING;
- sk->sk_state_change(sk);
- }
-
- /*
- * It appears that its possible for remote machines to send disc
- * init messages with no port identifier if we are in the CI and
- * possibly also the CD state. Obviously we shouldn't reply with
- * a message if we don't know what the end point is.
- */
- if (scp->addrrem) {
- dn_nsp_send_disc(sk, NSP_DISCCONF, NSP_REASON_DC, GFP_ATOMIC);
- }
- scp->persist_fxn = dn_destroy_timer;
- scp->persist = dn_nsp_persist(sk);
-
-out:
- kfree_skb(skb);
-}
-
-/*
- * disc_conf messages are also called no_resources or no_link
- * messages depending upon the "reason" field.
- */
-static void dn_nsp_disc_conf(struct sock *sk, struct sk_buff *skb)
-{
- struct dn_scp *scp = DN_SK(sk);
- unsigned short reason;
-
- if (skb->len != 2)
- goto out;
-
- reason = dn_ntohs(*(__le16 *)skb->data);
-
- sk->sk_state = TCP_CLOSE;
-
- switch(scp->state) {
- case DN_CI:
- scp->state = DN_NR;
- break;
- case DN_DR:
- if (reason == NSP_REASON_DC)
- scp->state = DN_DRC;
- if (reason == NSP_REASON_NL)
- scp->state = DN_CN;
- break;
- case DN_DI:
- scp->state = DN_DIC;
- break;
- case DN_RUN:
- sk->sk_shutdown |= SHUTDOWN_MASK;
- case DN_CC:
- scp->state = DN_CN;
- }
-
- if (!sock_flag(sk, SOCK_DEAD)) {
- if (sk->sk_socket->state != SS_UNCONNECTED)
- sk->sk_socket->state = SS_DISCONNECTING;
- sk->sk_state_change(sk);
- }
-
- scp->persist_fxn = dn_destroy_timer;
- scp->persist = dn_nsp_persist(sk);
-
-out:
- kfree_skb(skb);
-}
-
-static void dn_nsp_linkservice(struct sock *sk, struct sk_buff *skb)
-{
- struct dn_scp *scp = DN_SK(sk);
- unsigned short segnum;
- unsigned char lsflags;
- signed char fcval;
- int wake_up = 0;
- char *ptr = skb->data;
- unsigned char fctype = scp->services_rem & NSP_FC_MASK;
-
- if (skb->len != 4)
- goto out;
-
- segnum = dn_ntohs(*(__le16 *)ptr);
- ptr += 2;
- lsflags = *(unsigned char *)ptr++;
- fcval = *ptr;
-
- /*
- * Here we ignore erronous packets which should really
- * should cause a connection abort. It is not critical
- * for now though.
- */
- if (lsflags & 0xf8)
- goto out;
-
- if (seq_next(scp->numoth_rcv, segnum)) {
- seq_add(&scp->numoth_rcv, 1);
- switch(lsflags & 0x04) { /* FCVAL INT */
- case 0x00: /* Normal Request */
- switch(lsflags & 0x03) { /* FCVAL MOD */
- case 0x00: /* Request count */
- if (fcval < 0) {
- unsigned char p_fcval = -fcval;
- if ((scp->flowrem_dat > p_fcval) &&
- (fctype == NSP_FC_SCMC)) {
- scp->flowrem_dat -= p_fcval;
- }
- } else if (fcval > 0) {
- scp->flowrem_dat += fcval;
- wake_up = 1;
- }
- break;
- case 0x01: /* Stop outgoing data */
- scp->flowrem_sw = DN_DONTSEND;
- break;
- case 0x02: /* Ok to start again */
- scp->flowrem_sw = DN_SEND;
- dn_nsp_output(sk);
- wake_up = 1;
- }
- break;
- case 0x04: /* Interrupt Request */
- if (fcval > 0) {
- scp->flowrem_oth += fcval;
- wake_up = 1;
- }
- break;
- }
- if (wake_up && !sock_flag(sk, SOCK_DEAD))
- sk->sk_state_change(sk);
- }
-
- dn_nsp_send_oth_ack(sk);
-
-out:
- kfree_skb(skb);
-}
-
-/*
- * Copy of sock_queue_rcv_skb (from sock.h) without
- * bh_lock_sock() (its already held when this is called) which
- * also allows data and other data to be queued to a socket.
- */
-static __inline__ int dn_queue_skb(struct sock *sk, struct sk_buff *skb, int sig, struct sk_buff_head *queue)
-{
- int err;
-
- /* Cast skb->rcvbuf to unsigned... It's pointless, but reduces
- number of warnings when compiling with -W --ANK
- */
- if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
- (unsigned)sk->sk_rcvbuf) {
- err = -ENOMEM;
- goto out;
- }
-
- err = sk_filter(sk, skb);
- if (err)
- goto out;
-
- skb_set_owner_r(skb, sk);
- skb_queue_tail(queue, skb);
-
- /* This code only runs from BH or BH protected context.
- * Therefore the plain read_lock is ok here. -DaveM
- */
- read_lock(&sk->sk_callback_lock);
- if (!sock_flag(sk, SOCK_DEAD)) {
- struct socket *sock = sk->sk_socket;
- wake_up_interruptible(sk->sk_sleep);
- if (sock && sock->fasync_list &&
- !test_bit(SOCK_ASYNC_WAITDATA, &sock->flags))
- __kill_fasync(sock->fasync_list, sig,
- (sig == SIGURG) ? POLL_PRI : POLL_IN);
- }
- read_unlock(&sk->sk_callback_lock);
-out:
- return err;
-}
-
-static void dn_nsp_otherdata(struct sock *sk, struct sk_buff *skb)
-{
- struct dn_scp *scp = DN_SK(sk);
- unsigned short segnum;
- struct dn_skb_cb *cb = DN_SKB_CB(skb);
- int queued = 0;
-
- if (skb->len < 2)
- goto out;
-
- cb->segnum = segnum = dn_ntohs(*(__le16 *)skb->data);
- skb_pull(skb, 2);
-
- if (seq_next(scp->numoth_rcv, segnum)) {
-
- if (dn_queue_skb(sk, skb, SIGURG, &scp->other_receive_queue) == 0) {
- seq_add(&scp->numoth_rcv, 1);
- scp->other_report = 0;
- queued = 1;
- }
- }
-
- dn_nsp_send_oth_ack(sk);
-out:
- if (!queued)
- kfree_skb(skb);
-}
-
-static void dn_nsp_data(struct sock *sk, struct sk_buff *skb)
-{
- int queued = 0;
- unsigned short segnum;
- struct dn_skb_cb *cb = DN_SKB_CB(skb);
- struct dn_scp *scp = DN_SK(sk);
-
- if (skb->len < 2)
- goto out;
-
- cb->segnum = segnum = dn_ntohs(*(__le16 *)skb->data);
- skb_pull(skb, 2);
-
- if (seq_next(scp->numdat_rcv, segnum)) {
- if (dn_queue_skb(sk, skb, SIGIO, &sk->sk_receive_queue) == 0) {
- seq_add(&scp->numdat_rcv, 1);
- queued = 1;
- }
-
- if ((scp->flowloc_sw == DN_SEND) && dn_congested(sk)) {
- scp->flowloc_sw = DN_DONTSEND;
- dn_nsp_send_link(sk, DN_DONTSEND, 0);
- }
- }
-
- dn_nsp_send_data_ack(sk);
-out:
- if (!queued)
- kfree_skb(skb);
-}
-
-/*
- * If one of our conninit messages is returned, this function
- * deals with it. It puts the socket into the NO_COMMUNICATION
- * state.
- */
-static void dn_returned_conn_init(struct sock *sk, struct sk_buff *skb)
-{
- struct dn_scp *scp = DN_SK(sk);
-
- if (scp->state == DN_CI) {
- scp->state = DN_NC;
- sk->sk_state = TCP_CLOSE;
- if (!sock_flag(sk, SOCK_DEAD))
- sk->sk_state_change(sk);
- }
-
- kfree_skb(skb);
-}
-
-static int dn_nsp_no_socket(struct sk_buff *skb, unsigned short reason)
-{
- struct dn_skb_cb *cb = DN_SKB_CB(skb);
- int ret = NET_RX_DROP;
-
- /* Must not reply to returned packets */
- if (cb->rt_flags & DN_RT_F_RTS)
- goto out;
-
- if ((reason != NSP_REASON_OK) && ((cb->nsp_flags & 0x0c) == 0x08)) {
- switch(cb->nsp_flags & 0x70) {
- case 0x10:
- case 0x60: /* (Retransmitted) Connect Init */
- dn_nsp_return_disc(skb, NSP_DISCINIT, reason);
- ret = NET_RX_SUCCESS;
- break;
- case 0x20: /* Connect Confirm */
- dn_nsp_return_disc(skb, NSP_DISCCONF, reason);
- ret = NET_RX_SUCCESS;
- break;
- }
- }
-
-out:
- kfree_skb(skb);
- return ret;
-}
-
-static int dn_nsp_rx_packet(struct sk_buff *skb)
-{
- struct dn_skb_cb *cb = DN_SKB_CB(skb);
- struct sock *sk = NULL;
- unsigned char *ptr = (unsigned char *)skb->data;
- unsigned short reason = NSP_REASON_NL;
-
- if (!pskb_may_pull(skb, 2))
- goto free_out;
-
- skb->h.raw = skb->data;
- cb->nsp_flags = *ptr++;
-
- if (decnet_debug_level & 2)
- printk(KERN_DEBUG "dn_nsp_rx: Message type 0x%02x\n", (int)cb->nsp_flags);
-
- if (cb->nsp_flags & 0x83)
- goto free_out;
-
- /*
- * Filter out conninits and useless packet types
- */
- if ((cb->nsp_flags & 0x0c) == 0x08) {
- switch(cb->nsp_flags & 0x70) {
- case 0x00: /* NOP */
- case 0x70: /* Reserved */
- case 0x50: /* Reserved, Phase II node init */
- goto free_out;
- case 0x10:
- case 0x60:
- if (unlikely(cb->rt_flags & DN_RT_F_RTS))
- goto free_out;
- sk = dn_find_listener(skb, &reason);
- goto got_it;
- }
- }
-
- if (!pskb_may_pull(skb, 3))
- goto free_out;
-
- /*
- * Grab the destination address.
- */
- cb->dst_port = *(__le16 *)ptr;
- cb->src_port = 0;
- ptr += 2;
-
- /*
- * If not a connack, grab the source address too.
- */
- if (pskb_may_pull(skb, 5)) {
- cb->src_port = *(__le16 *)ptr;
- ptr += 2;
- skb_pull(skb, 5);
- }
-
- /*
- * Returned packets...
- * Swap src & dst and look up in the normal way.
- */
- if (unlikely(cb->rt_flags & DN_RT_F_RTS)) {
- __le16 tmp = cb->dst_port;
- cb->dst_port = cb->src_port;
- cb->src_port = tmp;
- tmp = cb->dst;
- cb->dst = cb->src;
- cb->src = tmp;
- }
-
- /*
- * Find the socket to which this skb is destined.
- */
- sk = dn_find_by_skb(skb);
-got_it:
- if (sk != NULL) {
- struct dn_scp *scp = DN_SK(sk);
-
- /* Reset backoff */
- scp->nsp_rxtshift = 0;
-
- /*
- * We linearize everything except data segments here.
- */
- if (cb->nsp_flags & ~0x60) {
- if (unlikely(skb_linearize(skb)))
- goto free_out;
- }
-
- return sk_receive_skb(sk, skb, 0);
- }
-
- return dn_nsp_no_socket(skb, reason);
-
-free_out:
- kfree_skb(skb);
- return NET_RX_DROP;
-}
-
-int dn_nsp_rx(struct sk_buff *skb)
-{
- return NF_HOOK(PF_DECnet, NF_DN_LOCAL_IN, skb, skb->dev, NULL, dn_nsp_rx_packet);
-}
-
-/*
- * This is the main receive routine for sockets. It is called
- * from the above when the socket is not busy, and also from
- * sock_release() when there is a backlog queued up.
- */
-int dn_nsp_backlog_rcv(struct sock *sk, struct sk_buff *skb)
-{
- struct dn_scp *scp = DN_SK(sk);
- struct dn_skb_cb *cb = DN_SKB_CB(skb);
-
- if (cb->rt_flags & DN_RT_F_RTS) {
- if (cb->nsp_flags == 0x18 || cb->nsp_flags == 0x68)
- dn_returned_conn_init(sk, skb);
- else
- kfree_skb(skb);
- return NET_RX_SUCCESS;
- }
-
- /*
- * Control packet.
- */
- if ((cb->nsp_flags & 0x0c) == 0x08) {
- switch(cb->nsp_flags & 0x70) {
- case 0x10:
- case 0x60:
- dn_nsp_conn_init(sk, skb);
- break;
- case 0x20:
- dn_nsp_conn_conf(sk, skb);
- break;
- case 0x30:
- dn_nsp_disc_init(sk, skb);
- break;
- case 0x40:
- dn_nsp_disc_conf(sk, skb);
- break;
- }
-
- } else if (cb->nsp_flags == 0x24) {
- /*
- * Special for connacks, 'cos they don't have
- * ack data or ack otherdata info.
- */
- dn_nsp_conn_ack(sk, skb);
- } else {
- int other = 1;
-
- /* both data and ack frames can kick a CC socket into RUN */
- if ((scp->state == DN_CC) && !sock_flag(sk, SOCK_DEAD)) {
- scp->state = DN_RUN;
- sk->sk_state = TCP_ESTABLISHED;
- sk->sk_state_change(sk);
- }
-
- if ((cb->nsp_flags & 0x1c) == 0)
- other = 0;
- if (cb->nsp_flags == 0x04)
- other = 0;
-
- /*
- * Read out ack data here, this applies equally
- * to data, other data, link serivce and both
- * ack data and ack otherdata.
- */
- dn_process_ack(sk, skb, other);
-
- /*
- * If we've some sort of data here then call a
- * suitable routine for dealing with it, otherwise
- * the packet is an ack and can be discarded.
- */
- if ((cb->nsp_flags & 0x0c) == 0) {
-
- if (scp->state != DN_RUN)
- goto free_out;
-
- switch(cb->nsp_flags) {
- case 0x10: /* LS */
- dn_nsp_linkservice(sk, skb);
- break;
- case 0x30: /* OD */
- dn_nsp_otherdata(sk, skb);
- break;
- default:
- dn_nsp_data(sk, skb);
- }
-
- } else { /* Ack, chuck it out here */
-free_out:
- kfree_skb(skb);
- }
- }
-
- return NET_RX_SUCCESS;
-}
-
diff --git a/net/decnet/dn_nsp_out.c b/net/decnet/dn_nsp_out.c
deleted file mode 100644
index b342e4e8f5f8..000000000000
--- a/net/decnet/dn_nsp_out.c
+++ /dev/null
@@ -1,723 +0,0 @@
-
-/*
- * DECnet An implementation of the DECnet protocol suite for the LINUX
- * operating system. DECnet is implemented using the BSD Socket
- * interface as the means of communication with the user level.
- *
- * DECnet Network Services Protocol (Output)
- *
- * Author: Eduardo Marcelo Serrat <emserrat@geocities.com>
- *
- * Changes:
- *
- * Steve Whitehouse: Split into dn_nsp_in.c and dn_nsp_out.c from
- * original dn_nsp.c.
- * Steve Whitehouse: Updated to work with my new routing architecture.
- * Steve Whitehouse: Added changes from Eduardo Serrat's patches.
- * Steve Whitehouse: Now conninits have the "return" bit set.
- * Steve Whitehouse: Fixes to check alloc'd skbs are non NULL!
- * Moved output state machine into one function
- * Steve Whitehouse: New output state machine
- * Paul Koning: Connect Confirm message fix.
- * Eduardo Serrat: Fix to stop dn_nsp_do_disc() sending malformed packets.
- * Steve Whitehouse: dn_nsp_output() and friends needed a spring clean
- * Steve Whitehouse: Moved dn_nsp_send() in here from route.h
- */
-
-/******************************************************************************
- (c) 1995-1998 E.M. Serrat emserrat@geocities.com
-
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-*******************************************************************************/
-
-#include <linux/errno.h>
-#include <linux/types.h>
-#include <linux/socket.h>
-#include <linux/in.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/timer.h>
-#include <linux/string.h>
-#include <linux/sockios.h>
-#include <linux/net.h>
-#include <linux/netdevice.h>
-#include <linux/inet.h>
-#include <linux/route.h>
-#include <net/sock.h>
-#include <asm/system.h>
-#include <linux/fcntl.h>
-#include <linux/mm.h>
-#include <linux/termios.h>
-#include <linux/interrupt.h>
-#include <linux/proc_fs.h>
-#include <linux/stat.h>
-#include <linux/init.h>
-#include <linux/poll.h>
-#include <linux/if_packet.h>
-#include <net/neighbour.h>
-#include <net/dst.h>
-#include <net/flow.h>
-#include <net/dn.h>
-#include <net/dn_nsp.h>
-#include <net/dn_dev.h>
-#include <net/dn_route.h>
-
-
-static int nsp_backoff[NSP_MAXRXTSHIFT + 1] = { 1, 2, 4, 8, 16, 32, 64, 64, 64, 64, 64, 64, 64 };
-
-static void dn_nsp_send(struct sk_buff *skb)
-{
- struct sock *sk = skb->sk;
- struct dn_scp *scp = DN_SK(sk);
- struct dst_entry *dst;
- struct flowi fl;
-
- skb->h.raw = skb->data;
- scp->stamp = jiffies;
-
- dst = sk_dst_check(sk, 0);
- if (dst) {
-try_again:
- skb->dst = dst;
- dst_output(skb);
- return;
- }
-
- memset(&fl, 0, sizeof(fl));
- fl.oif = sk->sk_bound_dev_if;
- fl.fld_src = dn_saddr2dn(&scp->addr);
- fl.fld_dst = dn_saddr2dn(&scp->peer);
- dn_sk_ports_copy(&fl, scp);
- fl.proto = DNPROTO_NSP;
- if (dn_route_output_sock(&sk->sk_dst_cache, &fl, sk, 0) == 0) {
- dst = sk_dst_get(sk);
- sk->sk_route_caps = dst->dev->features;
- goto try_again;
- }
-
- sk->sk_err = EHOSTUNREACH;
- if (!sock_flag(sk, SOCK_DEAD))
- sk->sk_state_change(sk);
-}
-
-
-/*
- * If sk == NULL, then we assume that we are supposed to be making
- * a routing layer skb. If sk != NULL, then we are supposed to be
- * creating an skb for the NSP layer.
- *
- * The eventual aim is for each socket to have a cached header size
- * for its outgoing packets, and to set hdr from this when sk != NULL.
- */
-struct sk_buff *dn_alloc_skb(struct sock *sk, int size, gfp_t pri)
-{
- struct sk_buff *skb;
- int hdr = 64;
-
- if ((skb = alloc_skb(size + hdr, pri)) == NULL)
- return NULL;
-
- skb->protocol = __constant_htons(ETH_P_DNA_RT);
- skb->pkt_type = PACKET_OUTGOING;
-
- if (sk)
- skb_set_owner_w(skb, sk);
-
- skb_reserve(skb, hdr);
-
- return skb;
-}
-
-/*
- * Calculate persist timer based upon the smoothed round
- * trip time and the variance. Backoff according to the
- * nsp_backoff[] array.
- */
-unsigned long dn_nsp_persist(struct sock *sk)
-{
- struct dn_scp *scp = DN_SK(sk);
-
- unsigned long t = ((scp->nsp_srtt >> 2) + scp->nsp_rttvar) >> 1;
-
- t *= nsp_backoff[scp->nsp_rxtshift];
-
- if (t < HZ) t = HZ;
- if (t > (600*HZ)) t = (600*HZ);
-
- if (scp->nsp_rxtshift < NSP_MAXRXTSHIFT)
- scp->nsp_rxtshift++;
-
- /* printk(KERN_DEBUG "rxtshift %lu, t=%lu\n", scp->nsp_rxtshift, t); */
-
- return t;
-}
-
-/*
- * This is called each time we get an estimate for the rtt
- * on the link.
- */
-static void dn_nsp_rtt(struct sock *sk, long rtt)
-{
- struct dn_scp *scp = DN_SK(sk);
- long srtt = (long)scp->nsp_srtt;
- long rttvar = (long)scp->nsp_rttvar;
- long delta;
-
- /*
- * If the jiffies clock flips over in the middle of timestamp
- * gathering this value might turn out negative, so we make sure
- * that is it always positive here.
- */
- if (rtt < 0)
- rtt = -rtt;
- /*
- * Add new rtt to smoothed average
- */
- delta = ((rtt << 3) - srtt);
- srtt += (delta >> 3);
- if (srtt >= 1)
- scp->nsp_srtt = (unsigned long)srtt;
- else
- scp->nsp_srtt = 1;
-
- /*
- * Add new rtt varience to smoothed varience
- */
- delta >>= 1;
- rttvar += ((((delta>0)?(delta):(-delta)) - rttvar) >> 2);
- if (rttvar >= 1)
- scp->nsp_rttvar = (unsigned long)rttvar;
- else
- scp->nsp_rttvar = 1;
-
- /* printk(KERN_DEBUG "srtt=%lu rttvar=%lu\n", scp->nsp_srtt, scp->nsp_rttvar); */
-}
-
-/**
- * dn_nsp_clone_and_send - Send a data packet by cloning it
- * @skb: The packet to clone and transmit
- * @gfp: memory allocation flag
- *
- * Clone a queued data or other data packet and transmit it.
- *
- * Returns: The number of times the packet has been sent previously
- */
-static inline unsigned dn_nsp_clone_and_send(struct sk_buff *skb,
- gfp_t gfp)
-{
- struct dn_skb_cb *cb = DN_SKB_CB(skb);
- struct sk_buff *skb2;
- int ret = 0;
-
- if ((skb2 = skb_clone(skb, gfp)) != NULL) {
- ret = cb->xmit_count;
- cb->xmit_count++;
- cb->stamp = jiffies;
- skb2->sk = skb->sk;
- dn_nsp_send(skb2);
- }
-
- return ret;
-}
-
-/**
- * dn_nsp_output - Try and send something from socket queues
- * @sk: The socket whose queues are to be investigated
- * @gfp: The memory allocation flags
- *
- * Try and send the packet on the end of the data and other data queues.
- * Other data gets priority over data, and if we retransmit a packet we
- * reduce the window by dividing it in two.
- *
- */
-void dn_nsp_output(struct sock *sk)
-{
- struct dn_scp *scp = DN_SK(sk);
- struct sk_buff *skb;
- unsigned reduce_win = 0;
-
- /*
- * First we check for otherdata/linkservice messages
- */
- if ((skb = skb_peek(&scp->other_xmit_queue)) != NULL)
- reduce_win = dn_nsp_clone_and_send(skb, GFP_ATOMIC);
-
- /*
- * If we may not send any data, we don't.
- * If we are still trying to get some other data down the
- * channel, we don't try and send any data.
- */
- if (reduce_win || (scp->flowrem_sw != DN_SEND))
- goto recalc_window;
-
- if ((skb = skb_peek(&scp->data_xmit_queue)) != NULL)
- reduce_win = dn_nsp_clone_and_send(skb, GFP_ATOMIC);
-
- /*
- * If we've sent any frame more than once, we cut the
- * send window size in half. There is always a minimum
- * window size of one available.
- */
-recalc_window:
- if (reduce_win) {
- scp->snd_window >>= 1;
- if (scp->snd_window < NSP_MIN_WINDOW)
- scp->snd_window = NSP_MIN_WINDOW;
- }
-}
-
-int dn_nsp_xmit_timeout(struct sock *sk)
-{
- struct dn_scp *scp = DN_SK(sk);
-
- dn_nsp_output(sk);
-
- if (!skb_queue_empty(&scp->data_xmit_queue) ||
- !skb_queue_empty(&scp->other_xmit_queue))
- scp->persist = dn_nsp_persist(sk);
-
- return 0;
-}
-
-static inline __le16 *dn_mk_common_header(struct dn_scp *scp, struct sk_buff *skb, unsigned char msgflag, int len)
-{
- unsigned char *ptr = skb_push(skb, len);
-
- BUG_ON(len < 5);
-
- *ptr++ = msgflag;
- *((__le16 *)ptr) = scp->addrrem;
- ptr += 2;
- *((__le16 *)ptr) = scp->addrloc;
- ptr += 2;
- return (__le16 __force *)ptr;
-}
-
-static __le16 *dn_mk_ack_header(struct sock *sk, struct sk_buff *skb, unsigned char msgflag, int hlen, int other)
-{
- struct dn_scp *scp = DN_SK(sk);
- unsigned short acknum = scp->numdat_rcv & 0x0FFF;
- unsigned short ackcrs = scp->numoth_rcv & 0x0FFF;
- __le16 *ptr;
-
- BUG_ON(hlen < 9);
-
- scp->ackxmt_dat = acknum;
- scp->ackxmt_oth = ackcrs;
- acknum |= 0x8000;
- ackcrs |= 0x8000;
-
- /* If this is an "other data/ack" message, swap acknum and ackcrs */
- if (other) {
- unsigned short tmp = acknum;
- acknum = ackcrs;
- ackcrs = tmp;
- }
-
- /* Set "cross subchannel" bit in ackcrs */
- ackcrs |= 0x2000;
-
- ptr = (__le16 *)dn_mk_common_header(scp, skb, msgflag, hlen);
-
- *ptr++ = dn_htons(acknum);
- *ptr++ = dn_htons(ackcrs);
-
- return ptr;
-}
-
-static __le16 *dn_nsp_mk_data_header(struct sock *sk, struct sk_buff *skb, int oth)
-{
- struct dn_scp *scp = DN_SK(sk);
- struct dn_skb_cb *cb = DN_SKB_CB(skb);
- __le16 *ptr = dn_mk_ack_header(sk, skb, cb->nsp_flags, 11, oth);
-
- if (unlikely(oth)) {
- cb->segnum = scp->numoth;
- seq_add(&scp->numoth, 1);
- } else {
- cb->segnum = scp->numdat;
- seq_add(&scp->numdat, 1);
- }
- *(ptr++) = dn_htons(cb->segnum);
-
- return ptr;
-}
-
-void dn_nsp_queue_xmit(struct sock *sk, struct sk_buff *skb,
- gfp_t gfp, int oth)
-{
- struct dn_scp *scp = DN_SK(sk);
- struct dn_skb_cb *cb = DN_SKB_CB(skb);
- unsigned long t = ((scp->nsp_srtt >> 2) + scp->nsp_rttvar) >> 1;
-
- cb->xmit_count = 0;
- dn_nsp_mk_data_header(sk, skb, oth);
-
- /*
- * Slow start: If we have been idle for more than
- * one RTT, then reset window to min size.
- */
- if ((jiffies - scp->stamp) > t)
- scp->snd_window = NSP_MIN_WINDOW;
-
- if (oth)
- skb_queue_tail(&scp->other_xmit_queue, skb);
- else
- skb_queue_tail(&scp->data_xmit_queue, skb);
-
- if (scp->flowrem_sw != DN_SEND)
- return;
-
- dn_nsp_clone_and_send(skb, gfp);
-}
-
-
-int dn_nsp_check_xmit_queue(struct sock *sk, struct sk_buff *skb, struct sk_buff_head *q, unsigned short acknum)
-{
- struct dn_skb_cb *cb = DN_SKB_CB(skb);
- struct dn_scp *scp = DN_SK(sk);
- struct sk_buff *skb2, *list, *ack = NULL;
- int wakeup = 0;
- int try_retrans = 0;
- unsigned long reftime = cb->stamp;
- unsigned long pkttime;
- unsigned short xmit_count;
- unsigned short segnum;
-
- skb2 = q->next;
- list = (struct sk_buff *)q;
- while(list != skb2) {
- struct dn_skb_cb *cb2 = DN_SKB_CB(skb2);
-
- if (dn_before_or_equal(cb2->segnum, acknum))
- ack = skb2;
-
- /* printk(KERN_DEBUG "ack: %s %04x %04x\n", ack ? "ACK" : "SKIP", (int)cb2->segnum, (int)acknum); */
-
- skb2 = skb2->next;
-
- if (ack == NULL)
- continue;
-
- /* printk(KERN_DEBUG "check_xmit_queue: %04x, %d\n", acknum, cb2->xmit_count); */
-
- /* Does _last_ packet acked have xmit_count > 1 */
- try_retrans = 0;
- /* Remember to wake up the sending process */
- wakeup = 1;
- /* Keep various statistics */
- pkttime = cb2->stamp;
- xmit_count = cb2->xmit_count;
- segnum = cb2->segnum;
- /* Remove and drop ack'ed packet */
- skb_unlink(ack, q);
- kfree_skb(ack);
- ack = NULL;
-
- /*
- * We don't expect to see acknowledgements for packets we
- * haven't sent yet.
- */
- WARN_ON(xmit_count == 0);
-
- /*
- * If the packet has only been sent once, we can use it
- * to calculate the RTT and also open the window a little
- * further.
- */
- if (xmit_count == 1) {
- if (dn_equal(segnum, acknum))
- dn_nsp_rtt(sk, (long)(pkttime - reftime));
-
- if (scp->snd_window < scp->max_window)
- scp->snd_window++;
- }
-
- /*
- * Packet has been sent more than once. If this is the last
- * packet to be acknowledged then we want to send the next
- * packet in the send queue again (assumes the remote host does
- * go-back-N error control).
- */
- if (xmit_count > 1)
- try_retrans = 1;
- }
-
- if (try_retrans)
- dn_nsp_output(sk);
-
- return wakeup;
-}
-
-void dn_nsp_send_data_ack(struct sock *sk)
-{
- struct sk_buff *skb = NULL;
-
- if ((skb = dn_alloc_skb(sk, 9, GFP_ATOMIC)) == NULL)
- return;
-
- skb_reserve(skb, 9);
- dn_mk_ack_header(sk, skb, 0x04, 9, 0);
- dn_nsp_send(skb);
-}
-
-void dn_nsp_send_oth_ack(struct sock *sk)
-{
- struct sk_buff *skb = NULL;
-
- if ((skb = dn_alloc_skb(sk, 9, GFP_ATOMIC)) == NULL)
- return;
-
- skb_reserve(skb, 9);
- dn_mk_ack_header(sk, skb, 0x14, 9, 1);
- dn_nsp_send(skb);
-}
-
-
-void dn_send_conn_ack (struct sock *sk)
-{
- struct dn_scp *scp = DN_SK(sk);
- struct sk_buff *skb = NULL;
- struct nsp_conn_ack_msg *msg;
-
- if ((skb = dn_alloc_skb(sk, 3, sk->sk_allocation)) == NULL)
- return;
-
- msg = (struct nsp_conn_ack_msg *)skb_put(skb, 3);
- msg->msgflg = 0x24;
- msg->dstaddr = scp->addrrem;
-
- dn_nsp_send(skb);
-}
-
-void dn_nsp_delayed_ack(struct sock *sk)
-{
- struct dn_scp *scp = DN_SK(sk);
-
- if (scp->ackxmt_oth != scp->numoth_rcv)
- dn_nsp_send_oth_ack(sk);
-
- if (scp->ackxmt_dat != scp->numdat_rcv)
- dn_nsp_send_data_ack(sk);
-}
-
-static int dn_nsp_retrans_conn_conf(struct sock *sk)
-{
- struct dn_scp *scp = DN_SK(sk);
-
- if (scp->state == DN_CC)
- dn_send_conn_conf(sk, GFP_ATOMIC);
-
- return 0;
-}
-
-void dn_send_conn_conf(struct sock *sk, gfp_t gfp)
-{
- struct dn_scp *scp = DN_SK(sk);
- struct sk_buff *skb = NULL;
- struct nsp_conn_init_msg *msg;
- __u8 len = (__u8)dn_ntohs(scp->conndata_out.opt_optl);
-
- if ((skb = dn_alloc_skb(sk, 50 + len, gfp)) == NULL)
- return;
-
- msg = (struct nsp_conn_init_msg *)skb_put(skb, sizeof(*msg));
- msg->msgflg = 0x28;
- msg->dstaddr = scp->addrrem;
- msg->srcaddr = scp->addrloc;
- msg->services = scp->services_loc;
- msg->info = scp->info_loc;
- msg->segsize = dn_htons(scp->segsize_loc);
-
- *skb_put(skb,1) = len;
-
- if (len > 0)
- memcpy(skb_put(skb, len), scp->conndata_out.opt_data, len);
-
-
- dn_nsp_send(skb);
-
- scp->persist = dn_nsp_persist(sk);
- scp->persist_fxn = dn_nsp_retrans_conn_conf;
-}
-
-
-static __inline__ void dn_nsp_do_disc(struct sock *sk, unsigned char msgflg,
- unsigned short reason, gfp_t gfp,
- struct dst_entry *dst,
- int ddl, unsigned char *dd, __le16 rem, __le16 loc)
-{
- struct sk_buff *skb = NULL;
- int size = 7 + ddl + ((msgflg == NSP_DISCINIT) ? 1 : 0);
- unsigned char *msg;
-
- if ((dst == NULL) || (rem == 0)) {
- if (net_ratelimit())
- printk(KERN_DEBUG "DECnet: dn_nsp_do_disc: BUG! Please report this to SteveW@ACM.org rem=%u dst=%p\n", dn_ntohs(rem), dst);
- return;
- }
-
- if ((skb = dn_alloc_skb(sk, size, gfp)) == NULL)
- return;
-
- msg = skb_put(skb, size);
- *msg++ = msgflg;
- *(__le16 *)msg = rem;
- msg += 2;
- *(__le16 *)msg = loc;
- msg += 2;
- *(__le16 *)msg = dn_htons(reason);
- msg += 2;
- if (msgflg == NSP_DISCINIT)
- *msg++ = ddl;
-
- if (ddl) {
- memcpy(msg, dd, ddl);
- }
-
- /*
- * This doesn't go via the dn_nsp_send() function since we need
- * to be able to send disc packets out which have no socket
- * associations.
- */
- skb->dst = dst_clone(dst);
- dst_output(skb);
-}
-
-
-void dn_nsp_send_disc(struct sock *sk, unsigned char msgflg,
- unsigned short reason, gfp_t gfp)
-{
- struct dn_scp *scp = DN_SK(sk);
- int ddl = 0;
-
- if (msgflg == NSP_DISCINIT)
- ddl = dn_ntohs(scp->discdata_out.opt_optl);
-
- if (reason == 0)
- reason = dn_ntohs(scp->discdata_out.opt_status);
-
- dn_nsp_do_disc(sk, msgflg, reason, gfp, sk->sk_dst_cache, ddl,
- scp->discdata_out.opt_data, scp->addrrem, scp->addrloc);
-}
-
-
-void dn_nsp_return_disc(struct sk_buff *skb, unsigned char msgflg,
- unsigned short reason)
-{
- struct dn_skb_cb *cb = DN_SKB_CB(skb);
- int ddl = 0;
- gfp_t gfp = GFP_ATOMIC;
-
- dn_nsp_do_disc(NULL, msgflg, reason, gfp, skb->dst, ddl,
- NULL, cb->src_port, cb->dst_port);
-}
-
-
-void dn_nsp_send_link(struct sock *sk, unsigned char lsflags, char fcval)
-{
- struct dn_scp *scp = DN_SK(sk);
- struct sk_buff *skb;
- unsigned char *ptr;
- gfp_t gfp = GFP_ATOMIC;
-
- if ((skb = dn_alloc_skb(sk, DN_MAX_NSP_DATA_HEADER + 2, gfp)) == NULL)
- return;
-
- skb_reserve(skb, DN_MAX_NSP_DATA_HEADER);
- ptr = skb_put(skb, 2);
- DN_SKB_CB(skb)->nsp_flags = 0x10;
- *ptr++ = lsflags;
- *ptr = fcval;
-
- dn_nsp_queue_xmit(sk, skb, gfp, 1);
-
- scp->persist = dn_nsp_persist(sk);
- scp->persist_fxn = dn_nsp_xmit_timeout;
-}
-
-static int dn_nsp_retrans_conninit(struct sock *sk)
-{
- struct dn_scp *scp = DN_SK(sk);
-
- if (scp->state == DN_CI)
- dn_nsp_send_conninit(sk, NSP_RCI);
-
- return 0;
-}
-
-void dn_nsp_send_conninit(struct sock *sk, unsigned char msgflg)
-{
- struct dn_scp *scp = DN_SK(sk);
- struct nsp_conn_init_msg *msg;
- unsigned char aux;
- unsigned char menuver;
- struct dn_skb_cb *cb;
- unsigned char type = 1;
- gfp_t allocation = (msgflg == NSP_CI) ? sk->sk_allocation : GFP_ATOMIC;
- struct sk_buff *skb = dn_alloc_skb(sk, 200, allocation);
-
- if (!skb)
- return;
-
- cb = DN_SKB_CB(skb);
- msg = (struct nsp_conn_init_msg *)skb_put(skb,sizeof(*msg));
-
- msg->msgflg = msgflg;
- msg->dstaddr = 0x0000; /* Remote Node will assign it*/
-
- msg->srcaddr = scp->addrloc;
- msg->services = scp->services_loc; /* Requested flow control */
- msg->info = scp->info_loc; /* Version Number */
- msg->segsize = dn_htons(scp->segsize_loc); /* Max segment size */
-
- if (scp->peer.sdn_objnum)
- type = 0;
-
- skb_put(skb, dn_sockaddr2username(&scp->peer, skb->tail, type));
- skb_put(skb, dn_sockaddr2username(&scp->addr, skb->tail, 2));
-
- menuver = DN_MENUVER_ACC | DN_MENUVER_USR;
- if (scp->peer.sdn_flags & SDF_PROXY)
- menuver |= DN_MENUVER_PRX;
- if (scp->peer.sdn_flags & SDF_UICPROXY)
- menuver |= DN_MENUVER_UIC;
-
- *skb_put(skb, 1) = menuver; /* Menu Version */
-
- aux = scp->accessdata.acc_userl;
- *skb_put(skb, 1) = aux;
- if (aux > 0)
- memcpy(skb_put(skb, aux), scp->accessdata.acc_user, aux);
-
- aux = scp->accessdata.acc_passl;
- *skb_put(skb, 1) = aux;
- if (aux > 0)
- memcpy(skb_put(skb, aux), scp->accessdata.acc_pass, aux);
-
- aux = scp->accessdata.acc_accl;
- *skb_put(skb, 1) = aux;
- if (aux > 0)
- memcpy(skb_put(skb, aux), scp->accessdata.acc_acc, aux);
-
- aux = (__u8)dn_ntohs(scp->conndata_out.opt_optl);
- *skb_put(skb, 1) = aux;
- if (aux > 0)
- memcpy(skb_put(skb,aux), scp->conndata_out.opt_data, aux);
-
- scp->persist = dn_nsp_persist(sk);
- scp->persist_fxn = dn_nsp_retrans_conninit;
-
- cb->rt_flags = DN_RT_F_RQR;
-
- dn_nsp_send(skb);
-}
-
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
deleted file mode 100644
index 9881933167bd..000000000000
--- a/net/decnet/dn_route.c
+++ /dev/null
@@ -1,1824 +0,0 @@
-/*
- * DECnet An implementation of the DECnet protocol suite for the LINUX
- * operating system. DECnet is implemented using the BSD Socket
- * interface as the means of communication with the user level.
- *
- * DECnet Routing Functions (Endnode and Router)
- *
- * Authors: Steve Whitehouse <SteveW@ACM.org>
- * Eduardo Marcelo Serrat <emserrat@geocities.com>
- *
- * Changes:
- * Steve Whitehouse : Fixes to allow "intra-ethernet" and
- * "return-to-sender" bits on outgoing
- * packets.
- * Steve Whitehouse : Timeouts for cached routes.
- * Steve Whitehouse : Use dst cache for input routes too.
- * Steve Whitehouse : Fixed error values in dn_send_skb.
- * Steve Whitehouse : Rework routing functions to better fit
- * DECnet routing design
- * Alexey Kuznetsov : New SMP locking
- * Steve Whitehouse : More SMP locking changes & dn_cache_dump()
- * Steve Whitehouse : Prerouting NF hook, now really is prerouting.
- * Fixed possible skb leak in rtnetlink funcs.
- * Steve Whitehouse : Dave Miller's dynamic hash table sizing and
- * Alexey Kuznetsov's finer grained locking
- * from ipv4/route.c.
- * Steve Whitehouse : Routing is now starting to look like a
- * sensible set of code now, mainly due to
- * my copying the IPv4 routing code. The
- * hooks here are modified and will continue
- * to evolve for a while.
- * Steve Whitehouse : Real SMP at last :-) Also new netfilter
- * stuff. Look out raw sockets your days
- * are numbered!
- * Steve Whitehouse : Added return-to-sender functions. Added
- * backlog congestion level return codes.
- * Steve Whitehouse : Fixed bug where routes were set up with
- * no ref count on net devices.
- * Steve Whitehouse : RCU for the route cache
- * Steve Whitehouse : Preparations for the flow cache
- * Steve Whitehouse : Prepare for nonlinear skbs
- */
-
-/******************************************************************************
- (c) 1995-1998 E.M. Serrat emserrat@geocities.com
-
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-*******************************************************************************/
-
-#include <linux/errno.h>
-#include <linux/types.h>
-#include <linux/socket.h>
-#include <linux/in.h>
-#include <linux/kernel.h>
-#include <linux/sockios.h>
-#include <linux/net.h>
-#include <linux/netdevice.h>
-#include <linux/inet.h>
-#include <linux/route.h>
-#include <linux/in_route.h>
-#include <net/sock.h>
-#include <linux/mm.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-#include <linux/init.h>
-#include <linux/rtnetlink.h>
-#include <linux/string.h>
-#include <linux/netfilter_decnet.h>
-#include <linux/rcupdate.h>
-#include <linux/times.h>
-#include <asm/errno.h>
-#include <net/neighbour.h>
-#include <net/dst.h>
-#include <net/flow.h>
-#include <net/fib_rules.h>
-#include <net/dn.h>
-#include <net/dn_dev.h>
-#include <net/dn_nsp.h>
-#include <net/dn_route.h>
-#include <net/dn_neigh.h>
-#include <net/dn_fib.h>
-
-struct dn_rt_hash_bucket
-{
- struct dn_route *chain;
- spinlock_t lock;
-} __attribute__((__aligned__(8)));
-
-extern struct neigh_table dn_neigh_table;
-
-
-static unsigned char dn_hiord_addr[6] = {0xAA,0x00,0x04,0x00,0x00,0x00};
-
-static const int dn_rt_min_delay = 2 * HZ;
-static const int dn_rt_max_delay = 10 * HZ;
-static const int dn_rt_mtu_expires = 10 * 60 * HZ;
-
-static unsigned long dn_rt_deadline;
-
-static int dn_dst_gc(void);
-static struct dst_entry *dn_dst_check(struct dst_entry *, __u32);
-static struct dst_entry *dn_dst_negative_advice(struct dst_entry *);
-static void dn_dst_link_failure(struct sk_buff *);
-static void dn_dst_update_pmtu(struct dst_entry *dst, u32 mtu);
-static int dn_route_input(struct sk_buff *);
-static void dn_run_flush(unsigned long dummy);
-
-static struct dn_rt_hash_bucket *dn_rt_hash_table;
-static unsigned dn_rt_hash_mask;
-
-static struct timer_list dn_route_timer;
-static DEFINE_TIMER(dn_rt_flush_timer, dn_run_flush, 0, 0);
-int decnet_dst_gc_interval = 2;
-
-static struct dst_ops dn_dst_ops = {
- .family = PF_DECnet,
- .protocol = __constant_htons(ETH_P_DNA_RT),
- .gc_thresh = 128,
- .gc = dn_dst_gc,
- .check = dn_dst_check,
- .negative_advice = dn_dst_negative_advice,
- .link_failure = dn_dst_link_failure,
- .update_pmtu = dn_dst_update_pmtu,
- .entry_size = sizeof(struct dn_route),
- .entries = ATOMIC_INIT(0),
-};
-
-static __inline__ unsigned dn_hash(__le16 src, __le16 dst)
-{
- __u16 tmp = (__u16 __force)(src ^ dst);
- tmp ^= (tmp >> 3);
- tmp ^= (tmp >> 5);
- tmp ^= (tmp >> 10);
- return dn_rt_hash_mask & (unsigned)tmp;
-}
-
-static inline void dnrt_free(struct dn_route *rt)
-{
- call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
-}
-
-static inline void dnrt_drop(struct dn_route *rt)
-{
- dst_release(&rt->u.dst);
- call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
-}
-
-static void dn_dst_check_expire(unsigned long dummy)
-{
- int i;
- struct dn_route *rt, **rtp;
- unsigned long now = jiffies;
- unsigned long expire = 120 * HZ;
-
- for(i = 0; i <= dn_rt_hash_mask; i++) {
- rtp = &dn_rt_hash_table[i].chain;
-
- spin_lock(&dn_rt_hash_table[i].lock);
- while((rt=*rtp) != NULL) {
- if (atomic_read(&rt->u.dst.__refcnt) ||
- (now - rt->u.dst.lastuse) < expire) {
- rtp = &rt->u.rt_next;
- continue;
- }
- *rtp = rt->u.rt_next;
- rt->u.rt_next = NULL;
- dnrt_free(rt);
- }
- spin_unlock(&dn_rt_hash_table[i].lock);
-
- if ((jiffies - now) > 0)
- break;
- }
-
- mod_timer(&dn_route_timer, now + decnet_dst_gc_interval * HZ);
-}
-
-static int dn_dst_gc(void)
-{
- struct dn_route *rt, **rtp;
- int i;
- unsigned long now = jiffies;
- unsigned long expire = 10 * HZ;
-
- for(i = 0; i <= dn_rt_hash_mask; i++) {
-
- spin_lock_bh(&dn_rt_hash_table[i].lock);
- rtp = &dn_rt_hash_table[i].chain;
-
- while((rt=*rtp) != NULL) {
- if (atomic_read(&rt->u.dst.__refcnt) ||
- (now - rt->u.dst.lastuse) < expire) {
- rtp = &rt->u.rt_next;
- continue;
- }
- *rtp = rt->u.rt_next;
- rt->u.rt_next = NULL;
- dnrt_drop(rt);
- break;
- }
- spin_unlock_bh(&dn_rt_hash_table[i].lock);
- }
-
- return 0;
-}
-
-/*
- * The decnet standards don't impose a particular minimum mtu, what they
- * do insist on is that the routing layer accepts a datagram of at least
- * 230 bytes long. Here we have to subtract the routing header length from
- * 230 to get the minimum acceptable mtu. If there is no neighbour, then we
- * assume the worst and use a long header size.
- *
- * We update both the mtu and the advertised mss (i.e. the segment size we
- * advertise to the other end).
- */
-static void dn_dst_update_pmtu(struct dst_entry *dst, u32 mtu)
-{
- u32 min_mtu = 230;
- struct dn_dev *dn = dst->neighbour ?
- (struct dn_dev *)dst->neighbour->dev->dn_ptr : NULL;
-
- if (dn && dn->use_long == 0)
- min_mtu -= 6;
- else
- min_mtu -= 21;
-
- if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= min_mtu) {
- if (!(dst_metric_locked(dst, RTAX_MTU))) {
- dst->metrics[RTAX_MTU-1] = mtu;
- dst_set_expires(dst, dn_rt_mtu_expires);
- }
- if (!(dst_metric_locked(dst, RTAX_ADVMSS))) {
- u32 mss = mtu - DN_MAX_NSP_DATA_HEADER;
- if (dst->metrics[RTAX_ADVMSS-1] > mss)
- dst->metrics[RTAX_ADVMSS-1] = mss;
- }
- }
-}
-
-/*
- * When a route has been marked obsolete. (e.g. routing cache flush)
- */
-static struct dst_entry *dn_dst_check(struct dst_entry *dst, __u32 cookie)
-{
- return NULL;
-}
-
-static struct dst_entry *dn_dst_negative_advice(struct dst_entry *dst)
-{
- dst_release(dst);
- return NULL;
-}
-
-static void dn_dst_link_failure(struct sk_buff *skb)
-{
- return;
-}
-
-static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
-{
- return ((fl1->nl_u.dn_u.daddr ^ fl2->nl_u.dn_u.daddr) |
- (fl1->nl_u.dn_u.saddr ^ fl2->nl_u.dn_u.saddr) |
- (fl1->mark ^ fl2->mark) |
- (fl1->nl_u.dn_u.scope ^ fl2->nl_u.dn_u.scope) |
- (fl1->oif ^ fl2->oif) |
- (fl1->iif ^ fl2->iif)) == 0;
-}
-
-static int dn_insert_route(struct dn_route *rt, unsigned hash, struct dn_route **rp)
-{
- struct dn_route *rth, **rthp;
- unsigned long now = jiffies;
-
- rthp = &dn_rt_hash_table[hash].chain;
-
- spin_lock_bh(&dn_rt_hash_table[hash].lock);
- while((rth = *rthp) != NULL) {
- if (compare_keys(&rth->fl, &rt->fl)) {
- /* Put it first */
- *rthp = rth->u.rt_next;
- rcu_assign_pointer(rth->u.rt_next,
- dn_rt_hash_table[hash].chain);
- rcu_assign_pointer(dn_rt_hash_table[hash].chain, rth);
-
- rth->u.dst.__use++;
- dst_hold(&rth->u.dst);
- rth->u.dst.lastuse = now;
- spin_unlock_bh(&dn_rt_hash_table[hash].lock);
-
- dnrt_drop(rt);
- *rp = rth;
- return 0;
- }
- rthp = &rth->u.rt_next;
- }
-
- rcu_assign_pointer(rt->u.rt_next, dn_rt_hash_table[hash].chain);
- rcu_assign_pointer(dn_rt_hash_table[hash].chain, rt);
-
- dst_hold(&rt->u.dst);
- rt->u.dst.__use++;
- rt->u.dst.lastuse = now;
- spin_unlock_bh(&dn_rt_hash_table[hash].lock);
- *rp = rt;
- return 0;
-}
-
-void dn_run_flush(unsigned long dummy)
-{
- int i;
- struct dn_route *rt, *next;
-
- for(i = 0; i < dn_rt_hash_mask; i++) {
- spin_lock_bh(&dn_rt_hash_table[i].lock);
-
- if ((rt = xchg(&dn_rt_hash_table[i].chain, NULL)) == NULL)
- goto nothing_to_declare;
-
- for(; rt; rt=next) {
- next = rt->u.rt_next;
- rt->u.rt_next = NULL;
- dst_free((struct dst_entry *)rt);
- }
-
-nothing_to_declare:
- spin_unlock_bh(&dn_rt_hash_table[i].lock);
- }
-}
-
-static DEFINE_SPINLOCK(dn_rt_flush_lock);
-
-void dn_rt_cache_flush(int delay)
-{
- unsigned long now = jiffies;
- int user_mode = !in_interrupt();
-
- if (delay < 0)
- delay = dn_rt_min_delay;
-
- spin_lock_bh(&dn_rt_flush_lock);
-
- if (del_timer(&dn_rt_flush_timer) && delay > 0 && dn_rt_deadline) {
- long tmo = (long)(dn_rt_deadline - now);
-
- if (user_mode && tmo < dn_rt_max_delay - dn_rt_min_delay)
- tmo = 0;
-
- if (delay > tmo)
- delay = tmo;
- }
-
- if (delay <= 0) {
- spin_unlock_bh(&dn_rt_flush_lock);
- dn_run_flush(0);
- return;
- }
-
- if (dn_rt_deadline == 0)
- dn_rt_deadline = now + dn_rt_max_delay;
-
- dn_rt_flush_timer.expires = now + delay;
- add_timer(&dn_rt_flush_timer);
- spin_unlock_bh(&dn_rt_flush_lock);
-}
-
-/**
- * dn_return_short - Return a short packet to its sender
- * @skb: The packet to return
- *
- */
-static int dn_return_short(struct sk_buff *skb)
-{
- struct dn_skb_cb *cb;
- unsigned char *ptr;
- __le16 *src;
- __le16 *dst;
- __le16 tmp;
-
- /* Add back headers */
- skb_push(skb, skb->data - skb->nh.raw);
-
- if ((skb = skb_unshare(skb, GFP_ATOMIC)) == NULL)
- return NET_RX_DROP;
-
- cb = DN_SKB_CB(skb);
- /* Skip packet length and point to flags */
- ptr = skb->data + 2;
- *ptr++ = (cb->rt_flags & ~DN_RT_F_RQR) | DN_RT_F_RTS;
-
- dst = (__le16 *)ptr;
- ptr += 2;
- src = (__le16 *)ptr;
- ptr += 2;
- *ptr = 0; /* Zero hop count */
-
- /* Swap source and destination */
- tmp = *src;
- *src = *dst;
- *dst = tmp;
-
- skb->pkt_type = PACKET_OUTGOING;
- dn_rt_finish_output(skb, NULL, NULL);
- return NET_RX_SUCCESS;
-}
-
-/**
- * dn_return_long - Return a long packet to its sender
- * @skb: The long format packet to return
- *
- */
-static int dn_return_long(struct sk_buff *skb)
-{
- struct dn_skb_cb *cb;
- unsigned char *ptr;
- unsigned char *src_addr, *dst_addr;
- unsigned char tmp[ETH_ALEN];
-
- /* Add back all headers */
- skb_push(skb, skb->data - skb->nh.raw);
-
- if ((skb = skb_unshare(skb, GFP_ATOMIC)) == NULL)
- return NET_RX_DROP;
-
- cb = DN_SKB_CB(skb);
- /* Ignore packet length and point to flags */
- ptr = skb->data + 2;
-
- /* Skip padding */
- if (*ptr & DN_RT_F_PF) {
- char padlen = (*ptr & ~DN_RT_F_PF);
- ptr += padlen;
- }
-
- *ptr++ = (cb->rt_flags & ~DN_RT_F_RQR) | DN_RT_F_RTS;
- ptr += 2;
- dst_addr = ptr;
- ptr += 8;
- src_addr = ptr;
- ptr += 6;
- *ptr = 0; /* Zero hop count */
-
- /* Swap source and destination */
- memcpy(tmp, src_addr, ETH_ALEN);
- memcpy(src_addr, dst_addr, ETH_ALEN);
- memcpy(dst_addr, tmp, ETH_ALEN);
-
- skb->pkt_type = PACKET_OUTGOING;
- dn_rt_finish_output(skb, dst_addr, src_addr);
- return NET_RX_SUCCESS;
-}
-
-/**
- * dn_route_rx_packet - Try and find a route for an incoming packet
- * @skb: The packet to find a route for
- *
- * Returns: result of input function if route is found, error code otherwise
- */
-static int dn_route_rx_packet(struct sk_buff *skb)
-{
- struct dn_skb_cb *cb = DN_SKB_CB(skb);
- int err;
-
- if ((err = dn_route_input(skb)) == 0)
- return dst_input(skb);
-
- if (decnet_debug_level & 4) {
- char *devname = skb->dev ? skb->dev->name : "???";
- struct dn_skb_cb *cb = DN_SKB_CB(skb);
- printk(KERN_DEBUG
- "DECnet: dn_route_rx_packet: rt_flags=0x%02x dev=%s len=%d src=0x%04hx dst=0x%04hx err=%d type=%d\n",
- (int)cb->rt_flags, devname, skb->len,
- dn_ntohs(cb->src), dn_ntohs(cb->dst),
- err, skb->pkt_type);
- }
-
- if ((skb->pkt_type == PACKET_HOST) && (cb->rt_flags & DN_RT_F_RQR)) {
- switch(cb->rt_flags & DN_RT_PKT_MSK) {
- case DN_RT_PKT_SHORT:
- return dn_return_short(skb);
- case DN_RT_PKT_LONG:
- return dn_return_long(skb);
- }
- }
-
- kfree_skb(skb);
- return NET_RX_DROP;
-}
-
-static int dn_route_rx_long(struct sk_buff *skb)
-{
- struct dn_skb_cb *cb = DN_SKB_CB(skb);
- unsigned char *ptr = skb->data;
-
- if (!pskb_may_pull(skb, 21)) /* 20 for long header, 1 for shortest nsp */
- goto drop_it;
-
- skb_pull(skb, 20);
- skb->h.raw = skb->data;
-
- /* Destination info */
- ptr += 2;
- cb->dst = dn_eth2dn(ptr);
- if (memcmp(ptr, dn_hiord_addr, 4) != 0)
- goto drop_it;
- ptr += 6;
-
-
- /* Source info */
- ptr += 2;
- cb->src = dn_eth2dn(ptr);
- if (memcmp(ptr, dn_hiord_addr, 4) != 0)
- goto drop_it;
- ptr += 6;
- /* Other junk */
- ptr++;
- cb->hops = *ptr++; /* Visit Count */
-
- return NF_HOOK(PF_DECnet, NF_DN_PRE_ROUTING, skb, skb->dev, NULL, dn_route_rx_packet);
-
-drop_it:
- kfree_skb(skb);
- return NET_RX_DROP;
-}
-
-
-
-static int dn_route_rx_short(struct sk_buff *skb)
-{
- struct dn_skb_cb *cb = DN_SKB_CB(skb);
- unsigned char *ptr = skb->data;
-
- if (!pskb_may_pull(skb, 6)) /* 5 for short header + 1 for shortest nsp */
- goto drop_it;
-
- skb_pull(skb, 5);
- skb->h.raw = skb->data;
-
- cb->dst = *(__le16 *)ptr;
- ptr += 2;
- cb->src = *(__le16 *)ptr;
- ptr += 2;
- cb->hops = *ptr & 0x3f;
-
- return NF_HOOK(PF_DECnet, NF_DN_PRE_ROUTING, skb, skb->dev, NULL, dn_route_rx_packet);
-
-drop_it:
- kfree_skb(skb);
- return NET_RX_DROP;
-}
-
-static int dn_route_discard(struct sk_buff *skb)
-{
- /*
- * I know we drop the packet here, but thats considered success in
- * this case
- */
- kfree_skb(skb);
- return NET_RX_SUCCESS;
-}
-
-static int dn_route_ptp_hello(struct sk_buff *skb)
-{
- dn_dev_hello(skb);
- dn_neigh_pointopoint_hello(skb);
- return NET_RX_SUCCESS;
-}
-
-int dn_route_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
-{
- struct dn_skb_cb *cb;
- unsigned char flags = 0;
- __u16 len = dn_ntohs(*(__le16 *)skb->data);
- struct dn_dev *dn = (struct dn_dev *)dev->dn_ptr;
- unsigned char padlen = 0;
-
- if (dn == NULL)
- goto dump_it;
-
- if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
- goto out;
-
- if (!pskb_may_pull(skb, 3))
- goto dump_it;
-
- skb_pull(skb, 2);
-
- if (len > skb->len)
- goto dump_it;
-
- skb_trim(skb, len);
-
- flags = *skb->data;
-
- cb = DN_SKB_CB(skb);
- cb->stamp = jiffies;
- cb->iif = dev->ifindex;
-
- /*
- * If we have padding, remove it.
- */
- if (flags & DN_RT_F_PF) {
- padlen = flags & ~DN_RT_F_PF;
- if (!pskb_may_pull(skb, padlen + 1))
- goto dump_it;
- skb_pull(skb, padlen);
- flags = *skb->data;
- }
-
- skb->nh.raw = skb->data;
-
- /*
- * Weed out future version DECnet
- */
- if (flags & DN_RT_F_VER)
- goto dump_it;
-
- cb->rt_flags = flags;
-
- if (decnet_debug_level & 1)
- printk(KERN_DEBUG
- "dn_route_rcv: got 0x%02x from %s [%d %d %d]\n",
- (int)flags, (dev) ? dev->name : "???", len, skb->len,
- padlen);
-
- if (flags & DN_RT_PKT_CNTL) {
- if (unlikely(skb_linearize(skb)))
- goto dump_it;
-
- switch(flags & DN_RT_CNTL_MSK) {
- case DN_RT_PKT_INIT:
- dn_dev_init_pkt(skb);
- break;
- case DN_RT_PKT_VERI:
- dn_dev_veri_pkt(skb);
- break;
- }
-
- if (dn->parms.state != DN_DEV_S_RU)
- goto dump_it;
-
- switch(flags & DN_RT_CNTL_MSK) {
- case DN_RT_PKT_HELO:
- return NF_HOOK(PF_DECnet, NF_DN_HELLO, skb, skb->dev, NULL, dn_route_ptp_hello);
-
- case DN_RT_PKT_L1RT:
- case DN_RT_PKT_L2RT:
- return NF_HOOK(PF_DECnet, NF_DN_ROUTE, skb, skb->dev, NULL, dn_route_discard);
- case DN_RT_PKT_ERTH:
- return NF_HOOK(PF_DECnet, NF_DN_HELLO, skb, skb->dev, NULL, dn_neigh_router_hello);
-
- case DN_RT_PKT_EEDH:
- return NF_HOOK(PF_DECnet, NF_DN_HELLO, skb, skb->dev, NULL, dn_neigh_endnode_hello);
- }
- } else {
- if (dn->parms.state != DN_DEV_S_RU)
- goto dump_it;
-
- skb_pull(skb, 1); /* Pull flags */
-
- switch(flags & DN_RT_PKT_MSK) {
- case DN_RT_PKT_LONG:
- return dn_route_rx_long(skb);
- case DN_RT_PKT_SHORT:
- return dn_route_rx_short(skb);
- }
- }
-
-dump_it:
- kfree_skb(skb);
-out:
- return NET_RX_DROP;
-}
-
-static int dn_output(struct sk_buff *skb)
-{
- struct dst_entry *dst = skb->dst;
- struct dn_route *rt = (struct dn_route *)dst;
- struct net_device *dev = dst->dev;
- struct dn_skb_cb *cb = DN_SKB_CB(skb);
- struct neighbour *neigh;
-
- int err = -EINVAL;
-
- if ((neigh = dst->neighbour) == NULL)
- goto error;
-
- skb->dev = dev;
-
- cb->src = rt->rt_saddr;
- cb->dst = rt->rt_daddr;
-
- /*
- * Always set the Intra-Ethernet bit on all outgoing packets
- * originated on this node. Only valid flag from upper layers
- * is return-to-sender-requested. Set hop count to 0 too.
- */
- cb->rt_flags &= ~DN_RT_F_RQR;
- cb->rt_flags |= DN_RT_F_IE;
- cb->hops = 0;
-
- return NF_HOOK(PF_DECnet, NF_DN_LOCAL_OUT, skb, NULL, dev, neigh->output);
-
-error:
- if (net_ratelimit())
- printk(KERN_DEBUG "dn_output: This should not happen\n");
-
- kfree_skb(skb);
-
- return err;
-}
-
-static int dn_forward(struct sk_buff *skb)
-{
- struct dn_skb_cb *cb = DN_SKB_CB(skb);
- struct dst_entry *dst = skb->dst;
- struct dn_dev *dn_db = dst->dev->dn_ptr;
- struct dn_route *rt;
- struct neighbour *neigh = dst->neighbour;
- int header_len;
-#ifdef CONFIG_NETFILTER
- struct net_device *dev = skb->dev;
-#endif
-
- if (skb->pkt_type != PACKET_HOST)
- goto drop;
-
- /* Ensure that we have enough space for headers */
- rt = (struct dn_route *)skb->dst;
- header_len = dn_db->use_long ? 21 : 6;
- if (skb_cow(skb, LL_RESERVED_SPACE(rt->u.dst.dev)+header_len))
- goto drop;
-
- /*
- * Hop count exceeded.
- */
- if (++cb->hops > 30)
- goto drop;
-
- skb->dev = rt->u.dst.dev;
-
- /*
- * If packet goes out same interface it came in on, then set
- * the Intra-Ethernet bit. This has no effect for short
- * packets, so we don't need to test for them here.
- */
- cb->rt_flags &= ~DN_RT_F_IE;
- if (rt->rt_flags & RTCF_DOREDIRECT)
- cb->rt_flags |= DN_RT_F_IE;
-
- return NF_HOOK(PF_DECnet, NF_DN_FORWARD, skb, dev, skb->dev, neigh->output);
-
-drop:
- kfree_skb(skb);
- return NET_RX_DROP;
-}
-
-/*
- * Drop packet. This is used for endnodes and for
- * when we should not be forwarding packets from
- * this dest.
- */
-static int dn_blackhole(struct sk_buff *skb)
-{
- kfree_skb(skb);
- return NET_RX_DROP;
-}
-
-/*
- * Used to catch bugs. This should never normally get
- * called.
- */
-static int dn_rt_bug(struct sk_buff *skb)
-{
- if (net_ratelimit()) {
- struct dn_skb_cb *cb = DN_SKB_CB(skb);
-
- printk(KERN_DEBUG "dn_rt_bug: skb from:%04x to:%04x\n",
- dn_ntohs(cb->src), dn_ntohs(cb->dst));
- }
-
- kfree_skb(skb);
-
- return NET_RX_BAD;
-}
-
-static int dn_rt_set_next_hop(struct dn_route *rt, struct dn_fib_res *res)
-{
- struct dn_fib_info *fi = res->fi;
- struct net_device *dev = rt->u.dst.dev;
- struct neighbour *n;
- unsigned mss;
-
- if (fi) {
- if (DN_FIB_RES_GW(*res) &&
- DN_FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
- rt->rt_gateway = DN_FIB_RES_GW(*res);
- memcpy(rt->u.dst.metrics, fi->fib_metrics,
- sizeof(rt->u.dst.metrics));
- }
- rt->rt_type = res->type;
-
- if (dev != NULL && rt->u.dst.neighbour == NULL) {
- n = __neigh_lookup_errno(&dn_neigh_table, &rt->rt_gateway, dev);
- if (IS_ERR(n))
- return PTR_ERR(n);
- rt->u.dst.neighbour = n;
- }
-
- if (rt->u.dst.metrics[RTAX_MTU-1] == 0 ||
- rt->u.dst.metrics[RTAX_MTU-1] > rt->u.dst.dev->mtu)
- rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
- mss = dn_mss_from_pmtu(dev, dst_mtu(&rt->u.dst));
- if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0 ||
- rt->u.dst.metrics[RTAX_ADVMSS-1] > mss)
- rt->u.dst.metrics[RTAX_ADVMSS-1] = mss;
- return 0;
-}
-
-static inline int dn_match_addr(__le16 addr1, __le16 addr2)
-{
- __u16 tmp = dn_ntohs(addr1) ^ dn_ntohs(addr2);
- int match = 16;
- while(tmp) {
- tmp >>= 1;
- match--;
- }
- return match;
-}
-
-static __le16 dnet_select_source(const struct net_device *dev, __le16 daddr, int scope)
-{
- __le16 saddr = 0;
- struct dn_dev *dn_db = dev->dn_ptr;
- struct dn_ifaddr *ifa;
- int best_match = 0;
- int ret;
-
- read_lock(&dev_base_lock);
- for(ifa = dn_db->ifa_list; ifa; ifa = ifa->ifa_next) {
- if (ifa->ifa_scope > scope)
- continue;
- if (!daddr) {
- saddr = ifa->ifa_local;
- break;
- }
- ret = dn_match_addr(daddr, ifa->ifa_local);
- if (ret > best_match)
- saddr = ifa->ifa_local;
- if (best_match == 0)
- saddr = ifa->ifa_local;
- }
- read_unlock(&dev_base_lock);
-
- return saddr;
-}
-
-static inline __le16 __dn_fib_res_prefsrc(struct dn_fib_res *res)
-{
- return dnet_select_source(DN_FIB_RES_DEV(*res), DN_FIB_RES_GW(*res), res->scope);
-}
-
-static inline __le16 dn_fib_rules_map_destination(__le16 daddr, struct dn_fib_res *res)
-{
- __le16 mask = dnet_make_mask(res->prefixlen);
- return (daddr&~mask)|res->fi->fib_nh->nh_gw;
-}
-
-static int dn_route_output_slow(struct dst_entry **pprt, const struct flowi *oldflp, int try_hard)
-{
- struct flowi fl = { .nl_u = { .dn_u =
- { .daddr = oldflp->fld_dst,
- .saddr = oldflp->fld_src,
- .scope = RT_SCOPE_UNIVERSE,
- } },
- .mark = oldflp->mark,
- .iif = loopback_dev.ifindex,
- .oif = oldflp->oif };
- struct dn_route *rt = NULL;
- struct net_device *dev_out = NULL;
- struct neighbour *neigh = NULL;
- unsigned hash;
- unsigned flags = 0;
- struct dn_fib_res res = { .fi = NULL, .type = RTN_UNICAST };
- int err;
- int free_res = 0;
- __le16 gateway = 0;
-
- if (decnet_debug_level & 16)
- printk(KERN_DEBUG
- "dn_route_output_slow: dst=%04x src=%04x mark=%d"
- " iif=%d oif=%d\n", dn_ntohs(oldflp->fld_dst),
- dn_ntohs(oldflp->fld_src),
- oldflp->mark, loopback_dev.ifindex, oldflp->oif);
-
- /* If we have an output interface, verify its a DECnet device */
- if (oldflp->oif) {
- dev_out = dev_get_by_index(oldflp->oif);
- err = -ENODEV;
- if (dev_out && dev_out->dn_ptr == NULL) {
- dev_put(dev_out);
- dev_out = NULL;
- }
- if (dev_out == NULL)
- goto out;
- }
-
- /* If we have a source address, verify that its a local address */
- if (oldflp->fld_src) {
- err = -EADDRNOTAVAIL;
-
- if (dev_out) {
- if (dn_dev_islocal(dev_out, oldflp->fld_src))
- goto source_ok;
- dev_put(dev_out);
- goto out;
- }
- read_lock(&dev_base_lock);
- for(dev_out = dev_base; dev_out; dev_out = dev_out->next) {
- if (!dev_out->dn_ptr)
- continue;
- if (!dn_dev_islocal(dev_out, oldflp->fld_src))
- continue;
- if ((dev_out->flags & IFF_LOOPBACK) &&
- oldflp->fld_dst &&
- !dn_dev_islocal(dev_out, oldflp->fld_dst))
- continue;
- break;
- }
- read_unlock(&dev_base_lock);
- if (dev_out == NULL)
- goto out;
- dev_hold(dev_out);
-source_ok:
- ;
- }
-
- /* No destination? Assume its local */
- if (!fl.fld_dst) {
- fl.fld_dst = fl.fld_src;
-
- err = -EADDRNOTAVAIL;
- if (dev_out)
- dev_put(dev_out);
- dev_out = &loopback_dev;
- dev_hold(dev_out);
- if (!fl.fld_dst) {
- fl.fld_dst =
- fl.fld_src = dnet_select_source(dev_out, 0,
- RT_SCOPE_HOST);
- if (!fl.fld_dst)
- goto out;
- }
- fl.oif = loopback_dev.ifindex;
- res.type = RTN_LOCAL;
- goto make_route;
- }
-
- if (decnet_debug_level & 16)
- printk(KERN_DEBUG
- "dn_route_output_slow: initial checks complete."
- " dst=%o4x src=%04x oif=%d try_hard=%d\n",
- dn_ntohs(fl.fld_dst), dn_ntohs(fl.fld_src),
- fl.oif, try_hard);
-
- /*
- * N.B. If the kernel is compiled without router support then
- * dn_fib_lookup() will evaluate to non-zero so this if () block
- * will always be executed.
- */
- err = -ESRCH;
- if (try_hard || (err = dn_fib_lookup(&fl, &res)) != 0) {
- struct dn_dev *dn_db;
- if (err != -ESRCH)
- goto out;
- /*
- * Here the fallback is basically the standard algorithm for
- * routing in endnodes which is described in the DECnet routing
- * docs
- *
- * If we are not trying hard, look in neighbour cache.
- * The result is tested to ensure that if a specific output
- * device/source address was requested, then we honour that
- * here
- */
- if (!try_hard) {
- neigh = neigh_lookup_nodev(&dn_neigh_table, &fl.fld_dst);
- if (neigh) {
- if ((oldflp->oif &&
- (neigh->dev->ifindex != oldflp->oif)) ||
- (oldflp->fld_src &&
- (!dn_dev_islocal(neigh->dev,
- oldflp->fld_src)))) {
- neigh_release(neigh);
- neigh = NULL;
- } else {
- if (dev_out)
- dev_put(dev_out);
- if (dn_dev_islocal(neigh->dev, fl.fld_dst)) {
- dev_out = &loopback_dev;
- res.type = RTN_LOCAL;
- } else {
- dev_out = neigh->dev;
- }
- dev_hold(dev_out);
- goto select_source;
- }
- }
- }
-
- /* Not there? Perhaps its a local address */
- if (dev_out == NULL)
- dev_out = dn_dev_get_default();
- err = -ENODEV;
- if (dev_out == NULL)
- goto out;
- dn_db = dev_out->dn_ptr;
- /* Possible improvement - check all devices for local addr */
- if (dn_dev_islocal(dev_out, fl.fld_dst)) {
- dev_put(dev_out);
- dev_out = &loopback_dev;
- dev_hold(dev_out);
- res.type = RTN_LOCAL;
- goto select_source;
- }
- /* Not local either.... try sending it to the default router */
- neigh = neigh_clone(dn_db->router);
- BUG_ON(neigh && neigh->dev != dev_out);
-
- /* Ok then, we assume its directly connected and move on */
-select_source:
- if (neigh)
- gateway = ((struct dn_neigh *)neigh)->addr;
- if (gateway == 0)
- gateway = fl.fld_dst;
- if (fl.fld_src == 0) {
- fl.fld_src = dnet_select_source(dev_out, gateway,
- res.type == RTN_LOCAL ?
- RT_SCOPE_HOST :
- RT_SCOPE_LINK);
- if (fl.fld_src == 0 && res.type != RTN_LOCAL)
- goto e_addr;
- }
- fl.oif = dev_out->ifindex;
- goto make_route;
- }
- free_res = 1;
-
- if (res.type == RTN_NAT)
- goto e_inval;
-
- if (res.type == RTN_LOCAL) {
- if (!fl.fld_src)
- fl.fld_src = fl.fld_dst;
- if (dev_out)
- dev_put(dev_out);
- dev_out = &loopback_dev;
- dev_hold(dev_out);
- fl.oif = dev_out->ifindex;
- if (res.fi)
- dn_fib_info_put(res.fi);
- res.fi = NULL;
- goto make_route;
- }
-
- if (res.fi->fib_nhs > 1 && fl.oif == 0)
- dn_fib_select_multipath(&fl, &res);
-
- /*
- * We could add some logic to deal with default routes here and
- * get rid of some of the special casing above.
- */
-
- if (!fl.fld_src)
- fl.fld_src = DN_FIB_RES_PREFSRC(res);
-
- if (dev_out)
- dev_put(dev_out);
- dev_out = DN_FIB_RES_DEV(res);
- dev_hold(dev_out);
- fl.oif = dev_out->ifindex;
- gateway = DN_FIB_RES_GW(res);
-
-make_route:
- if (dev_out->flags & IFF_LOOPBACK)
- flags |= RTCF_LOCAL;
-
- rt = dst_alloc(&dn_dst_ops);
- if (rt == NULL)
- goto e_nobufs;
-
- atomic_set(&rt->u.dst.__refcnt, 1);
- rt->u.dst.flags = DST_HOST;
-
- rt->fl.fld_src = oldflp->fld_src;
- rt->fl.fld_dst = oldflp->fld_dst;
- rt->fl.oif = oldflp->oif;
- rt->fl.iif = 0;
- rt->fl.mark = oldflp->mark;
-
- rt->rt_saddr = fl.fld_src;
- rt->rt_daddr = fl.fld_dst;
- rt->rt_gateway = gateway ? gateway : fl.fld_dst;
- rt->rt_local_src = fl.fld_src;
-
- rt->rt_dst_map = fl.fld_dst;
- rt->rt_src_map = fl.fld_src;
-
- rt->u.dst.dev = dev_out;
- dev_hold(dev_out);
- rt->u.dst.neighbour = neigh;
- neigh = NULL;
-
- rt->u.dst.lastuse = jiffies;
- rt->u.dst.output = dn_output;
- rt->u.dst.input = dn_rt_bug;
- rt->rt_flags = flags;
- if (flags & RTCF_LOCAL)
- rt->u.dst.input = dn_nsp_rx;
-
- err = dn_rt_set_next_hop(rt, &res);
- if (err)
- goto e_neighbour;
-
- hash = dn_hash(rt->fl.fld_src, rt->fl.fld_dst);
- dn_insert_route(rt, hash, (struct dn_route **)pprt);
-
-done:
- if (neigh)
- neigh_release(neigh);
- if (free_res)
- dn_fib_res_put(&res);
- if (dev_out)
- dev_put(dev_out);
-out:
- return err;
-
-e_addr:
- err = -EADDRNOTAVAIL;
- goto done;
-e_inval:
- err = -EINVAL;
- goto done;
-e_nobufs:
- err = -ENOBUFS;
- goto done;
-e_neighbour:
- dst_free(&rt->u.dst);
- goto e_nobufs;
-}
-
-
-/*
- * N.B. The flags may be moved into the flowi at some future stage.
- */
-static int __dn_route_output_key(struct dst_entry **pprt, const struct flowi *flp, int flags)
-{
- unsigned hash = dn_hash(flp->fld_src, flp->fld_dst);
- struct dn_route *rt = NULL;
-
- if (!(flags & MSG_TRYHARD)) {
- rcu_read_lock_bh();
- for(rt = rcu_dereference(dn_rt_hash_table[hash].chain); rt;
- rt = rcu_dereference(rt->u.rt_next)) {
- if ((flp->fld_dst == rt->fl.fld_dst) &&
- (flp->fld_src == rt->fl.fld_src) &&
- (flp->mark == rt->fl.mark) &&
- (rt->fl.iif == 0) &&
- (rt->fl.oif == flp->oif)) {
- rt->u.dst.lastuse = jiffies;
- dst_hold(&rt->u.dst);
- rt->u.dst.__use++;
- rcu_read_unlock_bh();
- *pprt = &rt->u.dst;
- return 0;
- }
- }
- rcu_read_unlock_bh();
- }
-
- return dn_route_output_slow(pprt, flp, flags);
-}
-
-static int dn_route_output_key(struct dst_entry **pprt, struct flowi *flp, int flags)
-{
- int err;
-
- err = __dn_route_output_key(pprt, flp, flags);
- if (err == 0 && flp->proto) {
- err = xfrm_lookup(pprt, flp, NULL, 0);
- }
- return err;
-}
-
-int dn_route_output_sock(struct dst_entry **pprt, struct flowi *fl, struct sock *sk, int flags)
-{
- int err;
-
- err = __dn_route_output_key(pprt, fl, flags & MSG_TRYHARD);
- if (err == 0 && fl->proto) {
- err = xfrm_lookup(pprt, fl, sk, !(flags & MSG_DONTWAIT));
- }
- return err;
-}
-
-static int dn_route_input_slow(struct sk_buff *skb)
-{
- struct dn_route *rt = NULL;
- struct dn_skb_cb *cb = DN_SKB_CB(skb);
- struct net_device *in_dev = skb->dev;
- struct net_device *out_dev = NULL;
- struct dn_dev *dn_db;
- struct neighbour *neigh = NULL;
- unsigned hash;
- int flags = 0;
- __le16 gateway = 0;
- __le16 local_src = 0;
- struct flowi fl = { .nl_u = { .dn_u =
- { .daddr = cb->dst,
- .saddr = cb->src,
- .scope = RT_SCOPE_UNIVERSE,
- } },
- .mark = skb->mark,
- .iif = skb->dev->ifindex };
- struct dn_fib_res res = { .fi = NULL, .type = RTN_UNREACHABLE };
- int err = -EINVAL;
- int free_res = 0;
-
- dev_hold(in_dev);
-
- if ((dn_db = in_dev->dn_ptr) == NULL)
- goto out;
-
- /* Zero source addresses are not allowed */
- if (fl.fld_src == 0)
- goto out;
-
- /*
- * In this case we've just received a packet from a source
- * outside ourselves pretending to come from us. We don't
- * allow it any further to prevent routing loops, spoofing and
- * other nasties. Loopback packets already have the dst attached
- * so this only affects packets which have originated elsewhere.
- */
- err = -ENOTUNIQ;
- if (dn_dev_islocal(in_dev, cb->src))
- goto out;
-
- err = dn_fib_lookup(&fl, &res);
- if (err) {
- if (err != -ESRCH)
- goto out;
- /*
- * Is the destination us ?
- */
- if (!dn_dev_islocal(in_dev, cb->dst))
- goto e_inval;
-
- res.type = RTN_LOCAL;
- } else {
- __le16 src_map = fl.fld_src;
- free_res = 1;
-
- out_dev = DN_FIB_RES_DEV(res);
- if (out_dev == NULL) {
- if (net_ratelimit())
- printk(KERN_CRIT "Bug in dn_route_input_slow() "
- "No output device\n");
- goto e_inval;
- }
- dev_hold(out_dev);
-
- if (res.r)
- src_map = fl.fld_src; /* no NAT support for now */
-
- gateway = DN_FIB_RES_GW(res);
- if (res.type == RTN_NAT) {
- fl.fld_dst = dn_fib_rules_map_destination(fl.fld_dst, &res);
- dn_fib_res_put(&res);
- free_res = 0;
- if (dn_fib_lookup(&fl, &res))
- goto e_inval;
- free_res = 1;
- if (res.type != RTN_UNICAST)
- goto e_inval;
- flags |= RTCF_DNAT;
- gateway = fl.fld_dst;
- }
- fl.fld_src = src_map;
- }
-
- switch(res.type) {
- case RTN_UNICAST:
- /*
- * Forwarding check here, we only check for forwarding
- * being turned off, if you want to only forward intra
- * area, its up to you to set the routing tables up
- * correctly.
- */
- if (dn_db->parms.forwarding == 0)
- goto e_inval;
-
- if (res.fi->fib_nhs > 1 && fl.oif == 0)
- dn_fib_select_multipath(&fl, &res);
-
- /*
- * Check for out_dev == in_dev. We use the RTCF_DOREDIRECT
- * flag as a hint to set the intra-ethernet bit when
- * forwarding. If we've got NAT in operation, we don't do
- * this optimisation.
- */
- if (out_dev == in_dev && !(flags & RTCF_NAT))
- flags |= RTCF_DOREDIRECT;
-
- local_src = DN_FIB_RES_PREFSRC(res);
-
- case RTN_BLACKHOLE:
- case RTN_UNREACHABLE:
- break;
- case RTN_LOCAL:
- flags |= RTCF_LOCAL;
- fl.fld_src = cb->dst;
- fl.fld_dst = cb->src;
-
- /* Routing tables gave us a gateway */
- if (gateway)
- goto make_route;
-
- /* Packet was intra-ethernet, so we know its on-link */
- if (cb->rt_flags & DN_RT_F_IE) {
- gateway = cb->src;
- flags |= RTCF_DIRECTSRC;
- goto make_route;
- }
-
- /* Use the default router if there is one */
- neigh = neigh_clone(dn_db->router);
- if (neigh) {
- gateway = ((struct dn_neigh *)neigh)->addr;
- goto make_route;
- }
-
- /* Close eyes and pray */
- gateway = cb->src;
- flags |= RTCF_DIRECTSRC;
- goto make_route;
- default:
- goto e_inval;
- }
-
-make_route:
- rt = dst_alloc(&dn_dst_ops);
- if (rt == NULL)
- goto e_nobufs;
-
- rt->rt_saddr = fl.fld_src;
- rt->rt_daddr = fl.fld_dst;
- rt->rt_gateway = fl.fld_dst;
- if (gateway)
- rt->rt_gateway = gateway;
- rt->rt_local_src = local_src ? local_src : rt->rt_saddr;
-
- rt->rt_dst_map = fl.fld_dst;
- rt->rt_src_map = fl.fld_src;
-
- rt->fl.fld_src = cb->src;
- rt->fl.fld_dst = cb->dst;
- rt->fl.oif = 0;
- rt->fl.iif = in_dev->ifindex;
- rt->fl.mark = fl.mark;
-
- rt->u.dst.flags = DST_HOST;
- rt->u.dst.neighbour = neigh;
- rt->u.dst.dev = out_dev;
- rt->u.dst.lastuse = jiffies;
- rt->u.dst.output = dn_rt_bug;
- switch(res.type) {
- case RTN_UNICAST:
- rt->u.dst.input = dn_forward;
- break;
- case RTN_LOCAL:
- rt->u.dst.output = dn_output;
- rt->u.dst.input = dn_nsp_rx;
- rt->u.dst.dev = in_dev;
- flags |= RTCF_LOCAL;
- break;
- default:
- case RTN_UNREACHABLE:
- case RTN_BLACKHOLE:
- rt->u.dst.input = dn_blackhole;
- }
- rt->rt_flags = flags;
- if (rt->u.dst.dev)
- dev_hold(rt->u.dst.dev);
-
- err = dn_rt_set_next_hop(rt, &res);
- if (err)
- goto e_neighbour;
-
- hash = dn_hash(rt->fl.fld_src, rt->fl.fld_dst);
- dn_insert_route(rt, hash, (struct dn_route **)&skb->dst);
-
-done:
- if (neigh)
- neigh_release(neigh);
- if (free_res)
- dn_fib_res_put(&res);
- dev_put(in_dev);
- if (out_dev)
- dev_put(out_dev);
-out:
- return err;
-
-e_inval:
- err = -EINVAL;
- goto done;
-
-e_nobufs:
- err = -ENOBUFS;
- goto done;
-
-e_neighbour:
- dst_free(&rt->u.dst);
- goto done;
-}
-
-int dn_route_input(struct sk_buff *skb)
-{
- struct dn_route *rt;
- struct dn_skb_cb *cb = DN_SKB_CB(skb);
- unsigned hash = dn_hash(cb->src, cb->dst);
-
- if (skb->dst)
- return 0;
-
- rcu_read_lock();
- for(rt = rcu_dereference(dn_rt_hash_table[hash].chain); rt != NULL;
- rt = rcu_dereference(rt->u.rt_next)) {
- if ((rt->fl.fld_src == cb->src) &&
- (rt->fl.fld_dst == cb->dst) &&
- (rt->fl.oif == 0) &&
- (rt->fl.mark == skb->mark) &&
- (rt->fl.iif == cb->iif)) {
- rt->u.dst.lastuse = jiffies;
- dst_hold(&rt->u.dst);
- rt->u.dst.__use++;
- rcu_read_unlock();
- skb->dst = (struct dst_entry *)rt;
- return 0;
- }
- }
- rcu_read_unlock();
-
- return dn_route_input_slow(skb);
-}
-
-static int dn_rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq,
- int event, int nowait, unsigned int flags)
-{
- struct dn_route *rt = (struct dn_route *)skb->dst;
- struct rtmsg *r;
- struct nlmsghdr *nlh;
- unsigned char *b = skb->tail;
- long expires;
-
- nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags);
- r = NLMSG_DATA(nlh);
- r->rtm_family = AF_DECnet;
- r->rtm_dst_len = 16;
- r->rtm_src_len = 0;
- r->rtm_tos = 0;
- r->rtm_table = RT_TABLE_MAIN;
- RTA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
- r->rtm_type = rt->rt_type;
- r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
- r->rtm_scope = RT_SCOPE_UNIVERSE;
- r->rtm_protocol = RTPROT_UNSPEC;
- if (rt->rt_flags & RTCF_NOTIFY)
- r->rtm_flags |= RTM_F_NOTIFY;
- RTA_PUT(skb, RTA_DST, 2, &rt->rt_daddr);
- if (rt->fl.fld_src) {
- r->rtm_src_len = 16;
- RTA_PUT(skb, RTA_SRC, 2, &rt->fl.fld_src);
- }
- if (rt->u.dst.dev)
- RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
- /*
- * Note to self - change this if input routes reverse direction when
- * they deal only with inputs and not with replies like they do
- * currently.
- */
- RTA_PUT(skb, RTA_PREFSRC, 2, &rt->rt_local_src);
- if (rt->rt_daddr != rt->rt_gateway)
- RTA_PUT(skb, RTA_GATEWAY, 2, &rt->rt_gateway);
- if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
- goto rtattr_failure;
- expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
- if (rtnl_put_cacheinfo(skb, &rt->u.dst, 0, 0, 0, expires,
- rt->u.dst.error) < 0)
- goto rtattr_failure;
- if (rt->fl.iif)
- RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->fl.iif);
-
- nlh->nlmsg_len = skb->tail - b;
- return skb->len;
-
-nlmsg_failure:
-rtattr_failure:
- skb_trim(skb, b - skb->data);
- return -1;
-}
-
-/*
- * This is called by both endnodes and routers now.
- */
-int dn_cache_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
-{
- struct rtattr **rta = arg;
- struct rtmsg *rtm = NLMSG_DATA(nlh);
- struct dn_route *rt = NULL;
- struct dn_skb_cb *cb;
- int err;
- struct sk_buff *skb;
- struct flowi fl;
-
- memset(&fl, 0, sizeof(fl));
- fl.proto = DNPROTO_NSP;
-
- skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
- if (skb == NULL)
- return -ENOBUFS;
- skb->mac.raw = skb->data;
- cb = DN_SKB_CB(skb);
-
- if (rta[RTA_SRC-1])
- memcpy(&fl.fld_src, RTA_DATA(rta[RTA_SRC-1]), 2);
- if (rta[RTA_DST-1])
- memcpy(&fl.fld_dst, RTA_DATA(rta[RTA_DST-1]), 2);
- if (rta[RTA_IIF-1])
- memcpy(&fl.iif, RTA_DATA(rta[RTA_IIF-1]), sizeof(int));
-
- if (fl.iif) {
- struct net_device *dev;
- if ((dev = dev_get_by_index(fl.iif)) == NULL) {
- kfree_skb(skb);
- return -ENODEV;
- }
- if (!dev->dn_ptr) {
- dev_put(dev);
- kfree_skb(skb);
- return -ENODEV;
- }
- skb->protocol = __constant_htons(ETH_P_DNA_RT);
- skb->dev = dev;
- cb->src = fl.fld_src;
- cb->dst = fl.fld_dst;
- local_bh_disable();
- err = dn_route_input(skb);
- local_bh_enable();
- memset(cb, 0, sizeof(struct dn_skb_cb));
- rt = (struct dn_route *)skb->dst;
- if (!err && -rt->u.dst.error)
- err = rt->u.dst.error;
- } else {
- int oif = 0;
- if (rta[RTA_OIF - 1])
- memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int));
- fl.oif = oif;
- err = dn_route_output_key((struct dst_entry **)&rt, &fl, 0);
- }
-
- if (skb->dev)
- dev_put(skb->dev);
- skb->dev = NULL;
- if (err)
- goto out_free;
- skb->dst = &rt->u.dst;
- if (rtm->rtm_flags & RTM_F_NOTIFY)
- rt->rt_flags |= RTCF_NOTIFY;
-
- err = dn_rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, RTM_NEWROUTE, 0, 0);
-
- if (err == 0)
- goto out_free;
- if (err < 0) {
- err = -EMSGSIZE;
- goto out_free;
- }
-
- return rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
-
-out_free:
- kfree_skb(skb);
- return err;
-}
-
-/*
- * For routers, this is called from dn_fib_dump, but for endnodes its
- * called directly from the rtnetlink dispatch table.
- */
-int dn_cache_dump(struct sk_buff *skb, struct netlink_callback *cb)
-{
- struct dn_route *rt;
- int h, s_h;
- int idx, s_idx;
-
- if (NLMSG_PAYLOAD(cb->nlh, 0) < sizeof(struct rtmsg))
- return -EINVAL;
- if (!(((struct rtmsg *)NLMSG_DATA(cb->nlh))->rtm_flags&RTM_F_CLONED))
- return 0;
-
- s_h = cb->args[0];
- s_idx = idx = cb->args[1];
- for(h = 0; h <= dn_rt_hash_mask; h++) {
- if (h < s_h)
- continue;
- if (h > s_h)
- s_idx = 0;
- rcu_read_lock_bh();
- for(rt = rcu_dereference(dn_rt_hash_table[h].chain), idx = 0;
- rt;
- rt = rcu_dereference(rt->u.rt_next), idx++) {
- if (idx < s_idx)
- continue;
- skb->dst = dst_clone(&rt->u.dst);
- if (dn_rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
- cb->nlh->nlmsg_seq, RTM_NEWROUTE,
- 1, NLM_F_MULTI) <= 0) {
- dst_release(xchg(&skb->dst, NULL));
- rcu_read_unlock_bh();
- goto done;
- }
- dst_release(xchg(&skb->dst, NULL));
- }
- rcu_read_unlock_bh();
- }
-
-done:
- cb->args[0] = h;
- cb->args[1] = idx;
- return skb->len;
-}
-
-#ifdef CONFIG_PROC_FS
-struct dn_rt_cache_iter_state {
- int bucket;
-};
-
-static struct dn_route *dn_rt_cache_get_first(struct seq_file *seq)
-{
- struct dn_route *rt = NULL;
- struct dn_rt_cache_iter_state *s = seq->private;
-
- for(s->bucket = dn_rt_hash_mask; s->bucket >= 0; --s->bucket) {
- rcu_read_lock_bh();
- rt = dn_rt_hash_table[s->bucket].chain;
- if (rt)
- break;
- rcu_read_unlock_bh();
- }
- return rt;
-}
-
-static struct dn_route *dn_rt_cache_get_next(struct seq_file *seq, struct dn_route *rt)
-{
- struct dn_rt_cache_iter_state *s = rcu_dereference(seq->private);
-
- rt = rt->u.rt_next;
- while(!rt) {
- rcu_read_unlock_bh();
- if (--s->bucket < 0)
- break;
- rcu_read_lock_bh();
- rt = dn_rt_hash_table[s->bucket].chain;
- }
- return rt;
-}
-
-static void *dn_rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
-{
- struct dn_route *rt = dn_rt_cache_get_first(seq);
-
- if (rt) {
- while(*pos && (rt = dn_rt_cache_get_next(seq, rt)))
- --*pos;
- }
- return *pos ? NULL : rt;
-}
-
-static void *dn_rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
-{
- struct dn_route *rt = dn_rt_cache_get_next(seq, v);
- ++*pos;
- return rt;
-}
-
-static void dn_rt_cache_seq_stop(struct seq_file *seq, void *v)
-{
- if (v)
- rcu_read_unlock_bh();
-}
-
-static int dn_rt_cache_seq_show(struct seq_file *seq, void *v)
-{
- struct dn_route *rt = v;
- char buf1[DN_ASCBUF_LEN], buf2[DN_ASCBUF_LEN];
-
- seq_printf(seq, "%-8s %-7s %-7s %04d %04d %04d\n",
- rt->u.dst.dev ? rt->u.dst.dev->name : "*",
- dn_addr2asc(dn_ntohs(rt->rt_daddr), buf1),
- dn_addr2asc(dn_ntohs(rt->rt_saddr), buf2),
- atomic_read(&rt->u.dst.__refcnt),
- rt->u.dst.__use,
- (int) dst_metric(&rt->u.dst, RTAX_RTT));
- return 0;
-}
-
-static struct seq_operations dn_rt_cache_seq_ops = {
- .start = dn_rt_cache_seq_start,
- .next = dn_rt_cache_seq_next,
- .stop = dn_rt_cache_seq_stop,
- .show = dn_rt_cache_seq_show,
-};
-
-static int dn_rt_cache_seq_open(struct inode *inode, struct file *file)
-{
- struct seq_file *seq;
- int rc = -ENOMEM;
- struct dn_rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
-
- if (!s)
- goto out;
- rc = seq_open(file, &dn_rt_cache_seq_ops);
- if (rc)
- goto out_kfree;
- seq = file->private_data;
- seq->private = s;
- memset(s, 0, sizeof(*s));
-out:
- return rc;
-out_kfree:
- kfree(s);
- goto out;
-}
-
-static struct file_operations dn_rt_cache_seq_fops = {
- .owner = THIS_MODULE,
- .open = dn_rt_cache_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release_private,
-};
-
-#endif /* CONFIG_PROC_FS */
-
-void __init dn_route_init(void)
-{
- int i, goal, order;
-
- dn_dst_ops.kmem_cachep =
- kmem_cache_create("dn_dst_cache", sizeof(struct dn_route), 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
- init_timer(&dn_route_timer);
- dn_route_timer.function = dn_dst_check_expire;
- dn_route_timer.expires = jiffies + decnet_dst_gc_interval * HZ;
- add_timer(&dn_route_timer);
-
- goal = num_physpages >> (26 - PAGE_SHIFT);
-
- for(order = 0; (1UL << order) < goal; order++)
- /* NOTHING */;
-
- /*
- * Only want 1024 entries max, since the table is very, very unlikely
- * to be larger than that.
- */
- while(order && ((((1UL << order) * PAGE_SIZE) /
- sizeof(struct dn_rt_hash_bucket)) >= 2048))
- order--;
-
- do {
- dn_rt_hash_mask = (1UL << order) * PAGE_SIZE /
- sizeof(struct dn_rt_hash_bucket);
- while(dn_rt_hash_mask & (dn_rt_hash_mask - 1))
- dn_rt_hash_mask--;
- dn_rt_hash_table = (struct dn_rt_hash_bucket *)
- __get_free_pages(GFP_ATOMIC, order);
- } while (dn_rt_hash_table == NULL && --order > 0);
-
- if (!dn_rt_hash_table)
- panic("Failed to allocate DECnet route cache hash table\n");
-
- printk(KERN_INFO
- "DECnet: Routing cache hash table of %u buckets, %ldKbytes\n",
- dn_rt_hash_mask,
- (long)(dn_rt_hash_mask*sizeof(struct dn_rt_hash_bucket))/1024);
-
- dn_rt_hash_mask--;
- for(i = 0; i <= dn_rt_hash_mask; i++) {
- spin_lock_init(&dn_rt_hash_table[i].lock);
- dn_rt_hash_table[i].chain = NULL;
- }
-
- dn_dst_ops.gc_thresh = (dn_rt_hash_mask + 1);
-
- proc_net_fops_create("decnet_cache", S_IRUGO, &dn_rt_cache_seq_fops);
-}
-
-void __exit dn_route_cleanup(void)
-{
- del_timer(&dn_route_timer);
- dn_run_flush(0);
-
- proc_net_remove("decnet_cache");
-}
-
diff --git a/net/decnet/dn_rules.c b/net/decnet/dn_rules.c
deleted file mode 100644
index e32d0c3d5a96..000000000000
--- a/net/decnet/dn_rules.c
+++ /dev/null
@@ -1,275 +0,0 @@
-
-/*
- * DECnet An implementation of the DECnet protocol suite for the LINUX
- * operating system. DECnet is implemented using the BSD Socket
- * interface as the means of communication with the user level.
- *
- * DECnet Routing Forwarding Information Base (Rules)
- *
- * Author: Steve Whitehouse <SteveW@ACM.org>
- * Mostly copied from Alexey Kuznetsov's ipv4/fib_rules.c
- *
- *
- * Changes:
- * Steve Whitehouse <steve@chygwyn.com>
- * Updated for Thomas Graf's generic rules
- *
- */
-#include <linux/net.h>
-#include <linux/init.h>
-#include <linux/netlink.h>
-#include <linux/rtnetlink.h>
-#include <linux/netdevice.h>
-#include <linux/spinlock.h>
-#include <linux/list.h>
-#include <linux/rcupdate.h>
-#include <net/neighbour.h>
-#include <net/dst.h>
-#include <net/flow.h>
-#include <net/fib_rules.h>
-#include <net/dn.h>
-#include <net/dn_fib.h>
-#include <net/dn_neigh.h>
-#include <net/dn_dev.h>
-
-static struct fib_rules_ops dn_fib_rules_ops;
-
-struct dn_fib_rule
-{
- struct fib_rule common;
- unsigned char dst_len;
- unsigned char src_len;
- __le16 src;
- __le16 srcmask;
- __le16 dst;
- __le16 dstmask;
- __le16 srcmap;
- u8 flags;
-};
-
-static struct dn_fib_rule default_rule = {
- .common = {
- .refcnt = ATOMIC_INIT(2),
- .pref = 0x7fff,
- .table = RT_TABLE_MAIN,
- .action = FR_ACT_TO_TBL,
- },
-};
-
-static LIST_HEAD(dn_fib_rules);
-
-
-int dn_fib_lookup(struct flowi *flp, struct dn_fib_res *res)
-{
- struct fib_lookup_arg arg = {
- .result = res,
- };
- int err;
-
- err = fib_rules_lookup(&dn_fib_rules_ops, flp, 0, &arg);
- res->r = arg.rule;
-
- return err;
-}
-
-static int dn_fib_rule_action(struct fib_rule *rule, struct flowi *flp,
- int flags, struct fib_lookup_arg *arg)
-{
- int err = -EAGAIN;
- struct dn_fib_table *tbl;
-
- switch(rule->action) {
- case FR_ACT_TO_TBL:
- break;
-
- case FR_ACT_UNREACHABLE:
- err = -ENETUNREACH;
- goto errout;
-
- case FR_ACT_PROHIBIT:
- err = -EACCES;
- goto errout;
-
- case FR_ACT_BLACKHOLE:
- default:
- err = -EINVAL;
- goto errout;
- }
-
- tbl = dn_fib_get_table(rule->table, 0);
- if (tbl == NULL)
- goto errout;
-
- err = tbl->lookup(tbl, flp, (struct dn_fib_res *)arg->result);
- if (err > 0)
- err = -EAGAIN;
-errout:
- return err;
-}
-
-static struct nla_policy dn_fib_rule_policy[FRA_MAX+1] __read_mostly = {
- FRA_GENERIC_POLICY,
- [FRA_SRC] = { .type = NLA_U16 },
- [FRA_DST] = { .type = NLA_U16 },
-};
-
-static int dn_fib_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
-{
- struct dn_fib_rule *r = (struct dn_fib_rule *)rule;
- __le16 daddr = fl->fld_dst;
- __le16 saddr = fl->fld_src;
-
- if (((saddr ^ r->src) & r->srcmask) ||
- ((daddr ^ r->dst) & r->dstmask))
- return 0;
-
- return 1;
-}
-
-static int dn_fib_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
- struct nlmsghdr *nlh, struct fib_rule_hdr *frh,
- struct nlattr **tb)
-{
- int err = -EINVAL;
- struct dn_fib_rule *r = (struct dn_fib_rule *)rule;
-
- if (frh->src_len > 16 || frh->dst_len > 16 || frh->tos)
- goto errout;
-
- if (rule->table == RT_TABLE_UNSPEC) {
- if (rule->action == FR_ACT_TO_TBL) {
- struct dn_fib_table *table;
-
- table = dn_fib_empty_table();
- if (table == NULL) {
- err = -ENOBUFS;
- goto errout;
- }
-
- rule->table = table->n;
- }
- }
-
- if (tb[FRA_SRC])
- r->src = nla_get_u16(tb[FRA_SRC]);
-
- if (tb[FRA_DST])
- r->dst = nla_get_u16(tb[FRA_DST]);
-
- r->src_len = frh->src_len;
- r->srcmask = dnet_make_mask(r->src_len);
- r->dst_len = frh->dst_len;
- r->dstmask = dnet_make_mask(r->dst_len);
- err = 0;
-errout:
- return err;
-}
-
-static int dn_fib_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
- struct nlattr **tb)
-{
- struct dn_fib_rule *r = (struct dn_fib_rule *)rule;
-
- if (frh->src_len && (r->src_len != frh->src_len))
- return 0;
-
- if (frh->dst_len && (r->dst_len != frh->dst_len))
- return 0;
-
- if (tb[FRA_SRC] && (r->src != nla_get_u16(tb[FRA_SRC])))
- return 0;
-
- if (tb[FRA_DST] && (r->dst != nla_get_u16(tb[FRA_DST])))
- return 0;
-
- return 1;
-}
-
-unsigned dnet_addr_type(__le16 addr)
-{
- struct flowi fl = { .nl_u = { .dn_u = { .daddr = addr } } };
- struct dn_fib_res res;
- unsigned ret = RTN_UNICAST;
- struct dn_fib_table *tb = dn_fib_get_table(RT_TABLE_LOCAL, 0);
-
- res.r = NULL;
-
- if (tb) {
- if (!tb->lookup(tb, &fl, &res)) {
- ret = res.type;
- dn_fib_res_put(&res);
- }
- }
- return ret;
-}
-
-static int dn_fib_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
- struct nlmsghdr *nlh, struct fib_rule_hdr *frh)
-{
- struct dn_fib_rule *r = (struct dn_fib_rule *)rule;
-
- frh->family = AF_DECnet;
- frh->dst_len = r->dst_len;
- frh->src_len = r->src_len;
- frh->tos = 0;
-
- if (r->dst_len)
- NLA_PUT_U16(skb, FRA_DST, r->dst);
- if (r->src_len)
- NLA_PUT_U16(skb, FRA_SRC, r->src);
-
- return 0;
-
-nla_put_failure:
- return -ENOBUFS;
-}
-
-static u32 dn_fib_rule_default_pref(void)
-{
- struct list_head *pos;
- struct fib_rule *rule;
-
- if (!list_empty(&dn_fib_rules)) {
- pos = dn_fib_rules.next;
- if (pos->next != &dn_fib_rules) {
- rule = list_entry(pos->next, struct fib_rule, list);
- if (rule->pref)
- return rule->pref - 1;
- }
- }
-
- return 0;
-}
-
-int dn_fib_dump_rules(struct sk_buff *skb, struct netlink_callback *cb)
-{
- return fib_rules_dump(skb, cb, AF_DECnet);
-}
-
-static struct fib_rules_ops dn_fib_rules_ops = {
- .family = AF_DECnet,
- .rule_size = sizeof(struct dn_fib_rule),
- .action = dn_fib_rule_action,
- .match = dn_fib_rule_match,
- .configure = dn_fib_rule_configure,
- .compare = dn_fib_rule_compare,
- .fill = dn_fib_rule_fill,
- .default_pref = dn_fib_rule_default_pref,
- .nlgroup = RTNLGRP_DECnet_RULE,
- .policy = dn_fib_rule_policy,
- .rules_list = &dn_fib_rules,
- .owner = THIS_MODULE,
-};
-
-void __init dn_fib_rules_init(void)
-{
- list_add_tail(&default_rule.common.list, &dn_fib_rules);
- fib_rules_register(&dn_fib_rules_ops);
-}
-
-void __exit dn_fib_rules_cleanup(void)
-{
- fib_rules_unregister(&dn_fib_rules_ops);
-}
-
-
diff --git a/net/decnet/dn_table.c b/net/decnet/dn_table.c
deleted file mode 100644
index bdbc3f431668..000000000000
--- a/net/decnet/dn_table.c
+++ /dev/null
@@ -1,900 +0,0 @@
-/*
- * DECnet An implementation of the DECnet protocol suite for the LINUX
- * operating system. DECnet is implemented using the BSD Socket
- * interface as the means of communication with the user level.
- *
- * DECnet Routing Forwarding Information Base (Routing Tables)
- *
- * Author: Steve Whitehouse <SteveW@ACM.org>
- * Mostly copied from the IPv4 routing code
- *
- *
- * Changes:
- *
- */
-#include <linux/string.h>
-#include <linux/net.h>
-#include <linux/socket.h>
-#include <linux/sockios.h>
-#include <linux/init.h>
-#include <linux/skbuff.h>
-#include <linux/netlink.h>
-#include <linux/rtnetlink.h>
-#include <linux/proc_fs.h>
-#include <linux/netdevice.h>
-#include <linux/timer.h>
-#include <linux/spinlock.h>
-#include <asm/atomic.h>
-#include <asm/uaccess.h>
-#include <linux/route.h> /* RTF_xxx */
-#include <net/neighbour.h>
-#include <net/dst.h>
-#include <net/flow.h>
-#include <net/fib_rules.h>
-#include <net/dn.h>
-#include <net/dn_route.h>
-#include <net/dn_fib.h>
-#include <net/dn_neigh.h>
-#include <net/dn_dev.h>
-
-struct dn_zone
-{
- struct dn_zone *dz_next;
- struct dn_fib_node **dz_hash;
- int dz_nent;
- int dz_divisor;
- u32 dz_hashmask;
-#define DZ_HASHMASK(dz) ((dz)->dz_hashmask)
- int dz_order;
- __le16 dz_mask;
-#define DZ_MASK(dz) ((dz)->dz_mask)
-};
-
-struct dn_hash
-{
- struct dn_zone *dh_zones[17];
- struct dn_zone *dh_zone_list;
-};
-
-#define dz_key_0(key) ((key).datum = 0)
-#define dz_prefix(key,dz) ((key).datum)
-
-#define for_nexthops(fi) { int nhsel; const struct dn_fib_nh *nh;\
- for(nhsel = 0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++)
-
-#define endfor_nexthops(fi) }
-
-#define DN_MAX_DIVISOR 1024
-#define DN_S_ZOMBIE 1
-#define DN_S_ACCESSED 2
-
-#define DN_FIB_SCAN(f, fp) \
-for( ; ((f) = *(fp)) != NULL; (fp) = &(f)->fn_next)
-
-#define DN_FIB_SCAN_KEY(f, fp, key) \
-for( ; ((f) = *(fp)) != NULL && dn_key_eq((f)->fn_key, (key)); (fp) = &(f)->fn_next)
-
-#define RT_TABLE_MIN 1
-#define DN_FIB_TABLE_HASHSZ 256
-static struct hlist_head dn_fib_table_hash[DN_FIB_TABLE_HASHSZ];
-static DEFINE_RWLOCK(dn_fib_tables_lock);
-
-static kmem_cache_t *dn_hash_kmem __read_mostly;
-static int dn_fib_hash_zombies;
-
-static inline dn_fib_idx_t dn_hash(dn_fib_key_t key, struct dn_zone *dz)
-{
- u16 h = dn_ntohs(key.datum)>>(16 - dz->dz_order);
- h ^= (h >> 10);
- h ^= (h >> 6);
- h &= DZ_HASHMASK(dz);
- return *(dn_fib_idx_t *)&h;
-}
-
-static inline dn_fib_key_t dz_key(__le16 dst, struct dn_zone *dz)
-{
- dn_fib_key_t k;
- k.datum = dst & DZ_MASK(dz);
- return k;
-}
-
-static inline struct dn_fib_node **dn_chain_p(dn_fib_key_t key, struct dn_zone *dz)
-{
- return &dz->dz_hash[dn_hash(key, dz).datum];
-}
-
-static inline struct dn_fib_node *dz_chain(dn_fib_key_t key, struct dn_zone *dz)
-{
- return dz->dz_hash[dn_hash(key, dz).datum];
-}
-
-static inline int dn_key_eq(dn_fib_key_t a, dn_fib_key_t b)
-{
- return a.datum == b.datum;
-}
-
-static inline int dn_key_leq(dn_fib_key_t a, dn_fib_key_t b)
-{
- return a.datum <= b.datum;
-}
-
-static inline void dn_rebuild_zone(struct dn_zone *dz,
- struct dn_fib_node **old_ht,
- int old_divisor)
-{
- int i;
- struct dn_fib_node *f, **fp, *next;
-
- for(i = 0; i < old_divisor; i++) {
- for(f = old_ht[i]; f; f = f->fn_next) {
- next = f->fn_next;
- for(fp = dn_chain_p(f->fn_key, dz);
- *fp && dn_key_leq((*fp)->fn_key, f->fn_key);
- fp = &(*fp)->fn_next)
- /* NOTHING */;
- f->fn_next = *fp;
- *fp = f;
- }
- }
-}
-
-static void dn_rehash_zone(struct dn_zone *dz)
-{
- struct dn_fib_node **ht, **old_ht;
- int old_divisor, new_divisor;
- u32 new_hashmask;
-
- old_divisor = dz->dz_divisor;
-
- switch(old_divisor) {
- case 16:
- new_divisor = 256;
- new_hashmask = 0xFF;
- break;
- default:
- printk(KERN_DEBUG "DECnet: dn_rehash_zone: BUG! %d\n", old_divisor);
- case 256:
- new_divisor = 1024;
- new_hashmask = 0x3FF;
- break;
- }
-
- ht = kcalloc(new_divisor, sizeof(struct dn_fib_node*), GFP_KERNEL);
- if (ht == NULL)
- return;
-
- write_lock_bh(&dn_fib_tables_lock);
- old_ht = dz->dz_hash;
- dz->dz_hash = ht;
- dz->dz_hashmask = new_hashmask;
- dz->dz_divisor = new_divisor;
- dn_rebuild_zone(dz, old_ht, old_divisor);
- write_unlock_bh(&dn_fib_tables_lock);
- kfree(old_ht);
-}
-
-static void dn_free_node(struct dn_fib_node *f)
-{
- dn_fib_release_info(DN_FIB_INFO(f));
- kmem_cache_free(dn_hash_kmem, f);
-}
-
-
-static struct dn_zone *dn_new_zone(struct dn_hash *table, int z)
-{
- int i;
- struct dn_zone *dz = kzalloc(sizeof(struct dn_zone), GFP_KERNEL);
- if (!dz)
- return NULL;
-
- if (z) {
- dz->dz_divisor = 16;
- dz->dz_hashmask = 0x0F;
- } else {
- dz->dz_divisor = 1;
- dz->dz_hashmask = 0;
- }
-
- dz->dz_hash = kcalloc(dz->dz_divisor, sizeof(struct dn_fib_node *), GFP_KERNEL);
- if (!dz->dz_hash) {
- kfree(dz);
- return NULL;
- }
-
- dz->dz_order = z;
- dz->dz_mask = dnet_make_mask(z);
-
- for(i = z + 1; i <= 16; i++)
- if (table->dh_zones[i])
- break;
-
- write_lock_bh(&dn_fib_tables_lock);
- if (i>16) {
- dz->dz_next = table->dh_zone_list;
- table->dh_zone_list = dz;
- } else {
- dz->dz_next = table->dh_zones[i]->dz_next;
- table->dh_zones[i]->dz_next = dz;
- }
- table->dh_zones[z] = dz;
- write_unlock_bh(&dn_fib_tables_lock);
- return dz;
-}
-
-
-static int dn_fib_nh_match(struct rtmsg *r, struct nlmsghdr *nlh, struct dn_kern_rta *rta, struct dn_fib_info *fi)
-{
- struct rtnexthop *nhp;
- int nhlen;
-
- if (rta->rta_priority && *rta->rta_priority != fi->fib_priority)
- return 1;
-
- if (rta->rta_oif || rta->rta_gw) {
- if ((!rta->rta_oif || *rta->rta_oif == fi->fib_nh->nh_oif) &&
- (!rta->rta_gw || memcmp(rta->rta_gw, &fi->fib_nh->nh_gw, 2) == 0))
- return 0;
- return 1;
- }
-
- if (rta->rta_mp == NULL)
- return 0;
-
- nhp = RTA_DATA(rta->rta_mp);
- nhlen = RTA_PAYLOAD(rta->rta_mp);
-
- for_nexthops(fi) {
- int attrlen = nhlen - sizeof(struct rtnexthop);
- __le16 gw;
-
- if (attrlen < 0 || (nhlen -= nhp->rtnh_len) < 0)
- return -EINVAL;
- if (nhp->rtnh_ifindex && nhp->rtnh_ifindex != nh->nh_oif)
- return 1;
- if (attrlen) {
- gw = dn_fib_get_attr16(RTNH_DATA(nhp), attrlen, RTA_GATEWAY);
-
- if (gw && gw != nh->nh_gw)
- return 1;
- }
- nhp = RTNH_NEXT(nhp);
- } endfor_nexthops(fi);
-
- return 0;
-}
-
-static inline size_t dn_fib_nlmsg_size(struct dn_fib_info *fi)
-{
- size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg))
- + nla_total_size(4) /* RTA_TABLE */
- + nla_total_size(2) /* RTA_DST */
- + nla_total_size(4); /* RTA_PRIORITY */
-
- /* space for nested metrics */
- payload += nla_total_size((RTAX_MAX * nla_total_size(4)));
-
- if (fi->fib_nhs) {
- /* Also handles the special case fib_nhs == 1 */
-
- /* each nexthop is packed in an attribute */
- size_t nhsize = nla_total_size(sizeof(struct rtnexthop));
-
- /* may contain a gateway attribute */
- nhsize += nla_total_size(4);
-
- /* all nexthops are packed in a nested attribute */
- payload += nla_total_size(fi->fib_nhs * nhsize);
- }
-
- return payload;
-}
-
-static int dn_fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
- u32 tb_id, u8 type, u8 scope, void *dst, int dst_len,
- struct dn_fib_info *fi, unsigned int flags)
-{
- struct rtmsg *rtm;
- struct nlmsghdr *nlh;
- unsigned char *b = skb->tail;
-
- nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*rtm), flags);
- rtm = NLMSG_DATA(nlh);
- rtm->rtm_family = AF_DECnet;
- rtm->rtm_dst_len = dst_len;
- rtm->rtm_src_len = 0;
- rtm->rtm_tos = 0;
- rtm->rtm_table = tb_id;
- RTA_PUT_U32(skb, RTA_TABLE, tb_id);
- rtm->rtm_flags = fi->fib_flags;
- rtm->rtm_scope = scope;
- rtm->rtm_type = type;
- if (rtm->rtm_dst_len)
- RTA_PUT(skb, RTA_DST, 2, dst);
- rtm->rtm_protocol = fi->fib_protocol;
- if (fi->fib_priority)
- RTA_PUT(skb, RTA_PRIORITY, 4, &fi->fib_priority);
- if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
- goto rtattr_failure;
- if (fi->fib_nhs == 1) {
- if (fi->fib_nh->nh_gw)
- RTA_PUT(skb, RTA_GATEWAY, 2, &fi->fib_nh->nh_gw);
- if (fi->fib_nh->nh_oif)
- RTA_PUT(skb, RTA_OIF, sizeof(int), &fi->fib_nh->nh_oif);
- }
- if (fi->fib_nhs > 1) {
- struct rtnexthop *nhp;
- struct rtattr *mp_head;
- if (skb_tailroom(skb) <= RTA_SPACE(0))
- goto rtattr_failure;
- mp_head = (struct rtattr *)skb_put(skb, RTA_SPACE(0));
-
- for_nexthops(fi) {
- if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
- goto rtattr_failure;
- nhp = (struct rtnexthop *)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
- nhp->rtnh_flags = nh->nh_flags & 0xFF;
- nhp->rtnh_hops = nh->nh_weight - 1;
- nhp->rtnh_ifindex = nh->nh_oif;
- if (nh->nh_gw)
- RTA_PUT(skb, RTA_GATEWAY, 2, &nh->nh_gw);
- nhp->rtnh_len = skb->tail - (unsigned char *)nhp;
- } endfor_nexthops(fi);
- mp_head->rta_type = RTA_MULTIPATH;
- mp_head->rta_len = skb->tail - (u8*)mp_head;
- }
-
- nlh->nlmsg_len = skb->tail - b;
- return skb->len;
-
-
-nlmsg_failure:
-rtattr_failure:
- skb_trim(skb, b - skb->data);
- return -1;
-}
-
-
-static void dn_rtmsg_fib(int event, struct dn_fib_node *f, int z, u32 tb_id,
- struct nlmsghdr *nlh, struct netlink_skb_parms *req)
-{
- struct sk_buff *skb;
- u32 pid = req ? req->pid : 0;
- int err = -ENOBUFS;
-
- skb = nlmsg_new(dn_fib_nlmsg_size(DN_FIB_INFO(f)), GFP_KERNEL);
- if (skb == NULL)
- goto errout;
-
- err = dn_fib_dump_info(skb, pid, nlh->nlmsg_seq, event, tb_id,
- f->fn_type, f->fn_scope, &f->fn_key, z,
- DN_FIB_INFO(f), 0);
- /* failure implies BUG in dn_fib_nlmsg_size() */
- BUG_ON(err < 0);
-
- err = rtnl_notify(skb, pid, RTNLGRP_DECnet_ROUTE, nlh, GFP_KERNEL);
-errout:
- if (err < 0)
- rtnl_set_sk_err(RTNLGRP_DECnet_ROUTE, err);
-}
-
-static __inline__ int dn_hash_dump_bucket(struct sk_buff *skb,
- struct netlink_callback *cb,
- struct dn_fib_table *tb,
- struct dn_zone *dz,
- struct dn_fib_node *f)
-{
- int i, s_i;
-
- s_i = cb->args[4];
- for(i = 0; f; i++, f = f->fn_next) {
- if (i < s_i)
- continue;
- if (f->fn_state & DN_S_ZOMBIE)
- continue;
- if (dn_fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
- cb->nlh->nlmsg_seq,
- RTM_NEWROUTE,
- tb->n,
- (f->fn_state & DN_S_ZOMBIE) ? 0 : f->fn_type,
- f->fn_scope, &f->fn_key, dz->dz_order,
- f->fn_info, NLM_F_MULTI) < 0) {
- cb->args[4] = i;
- return -1;
- }
- }
- cb->args[4] = i;
- return skb->len;
-}
-
-static __inline__ int dn_hash_dump_zone(struct sk_buff *skb,
- struct netlink_callback *cb,
- struct dn_fib_table *tb,
- struct dn_zone *dz)
-{
- int h, s_h;
-
- s_h = cb->args[3];
- for(h = 0; h < dz->dz_divisor; h++) {
- if (h < s_h)
- continue;
- if (h > s_h)
- memset(&cb->args[4], 0, sizeof(cb->args) - 4*sizeof(cb->args[0]));
- if (dz->dz_hash == NULL || dz->dz_hash[h] == NULL)
- continue;
- if (dn_hash_dump_bucket(skb, cb, tb, dz, dz->dz_hash[h]) < 0) {
- cb->args[3] = h;
- return -1;
- }
- }
- cb->args[3] = h;
- return skb->len;
-}
-
-static int dn_fib_table_dump(struct dn_fib_table *tb, struct sk_buff *skb,
- struct netlink_callback *cb)
-{
- int m, s_m;
- struct dn_zone *dz;
- struct dn_hash *table = (struct dn_hash *)tb->data;
-
- s_m = cb->args[2];
- read_lock(&dn_fib_tables_lock);
- for(dz = table->dh_zone_list, m = 0; dz; dz = dz->dz_next, m++) {
- if (m < s_m)
- continue;
- if (m > s_m)
- memset(&cb->args[3], 0, sizeof(cb->args) - 3*sizeof(cb->args[0]));
-
- if (dn_hash_dump_zone(skb, cb, tb, dz) < 0) {
- cb->args[2] = m;
- read_unlock(&dn_fib_tables_lock);
- return -1;
- }
- }
- read_unlock(&dn_fib_tables_lock);
- cb->args[2] = m;
-
- return skb->len;
-}
-
-int dn_fib_dump(struct sk_buff *skb, struct netlink_callback *cb)
-{
- unsigned int h, s_h;
- unsigned int e = 0, s_e;
- struct dn_fib_table *tb;
- struct hlist_node *node;
- int dumped = 0;
-
- if (NLMSG_PAYLOAD(cb->nlh, 0) >= sizeof(struct rtmsg) &&
- ((struct rtmsg *)NLMSG_DATA(cb->nlh))->rtm_flags&RTM_F_CLONED)
- return dn_cache_dump(skb, cb);
-
- s_h = cb->args[0];
- s_e = cb->args[1];
-
- for (h = s_h; h < DN_FIB_TABLE_HASHSZ; h++, s_h = 0) {
- e = 0;
- hlist_for_each_entry(tb, node, &dn_fib_table_hash[h], hlist) {
- if (e < s_e)
- goto next;
- if (dumped)
- memset(&cb->args[2], 0, sizeof(cb->args) -
- 2 * sizeof(cb->args[0]));
- if (tb->dump(tb, skb, cb) < 0)
- goto out;
- dumped = 1;
-next:
- e++;
- }
- }
-out:
- cb->args[1] = e;
- cb->args[0] = h;
-
- return skb->len;
-}
-
-static int dn_fib_table_insert(struct dn_fib_table *tb, struct rtmsg *r, struct dn_kern_rta *rta, struct nlmsghdr *n, struct netlink_skb_parms *req)
-{
- struct dn_hash *table = (struct dn_hash *)tb->data;
- struct dn_fib_node *new_f, *f, **fp, **del_fp;
- struct dn_zone *dz;
- struct dn_fib_info *fi;
- int z = r->rtm_dst_len;
- int type = r->rtm_type;
- dn_fib_key_t key;
- int err;
-
- if (z > 16)
- return -EINVAL;
-
- dz = table->dh_zones[z];
- if (!dz && !(dz = dn_new_zone(table, z)))
- return -ENOBUFS;
-
- dz_key_0(key);
- if (rta->rta_dst) {
- __le16 dst;
- memcpy(&dst, rta->rta_dst, 2);
- if (dst & ~DZ_MASK(dz))
- return -EINVAL;
- key = dz_key(dst, dz);
- }
-
- if ((fi = dn_fib_create_info(r, rta, n, &err)) == NULL)
- return err;
-
- if (dz->dz_nent > (dz->dz_divisor << 2) &&
- dz->dz_divisor > DN_MAX_DIVISOR &&
- (z==16 || (1<<z) > dz->dz_divisor))
- dn_rehash_zone(dz);
-
- fp = dn_chain_p(key, dz);
-
- DN_FIB_SCAN(f, fp) {
- if (dn_key_leq(key, f->fn_key))
- break;
- }
-
- del_fp = NULL;
-
- if (f && (f->fn_state & DN_S_ZOMBIE) &&
- dn_key_eq(f->fn_key, key)) {
- del_fp = fp;
- fp = &f->fn_next;
- f = *fp;
- goto create;
- }
-
- DN_FIB_SCAN_KEY(f, fp, key) {
- if (fi->fib_priority <= DN_FIB_INFO(f)->fib_priority)
- break;
- }
-
- if (f && dn_key_eq(f->fn_key, key) &&
- fi->fib_priority == DN_FIB_INFO(f)->fib_priority) {
- struct dn_fib_node **ins_fp;
-
- err = -EEXIST;
- if (n->nlmsg_flags & NLM_F_EXCL)
- goto out;
-
- if (n->nlmsg_flags & NLM_F_REPLACE) {
- del_fp = fp;
- fp = &f->fn_next;
- f = *fp;
- goto replace;
- }
-
- ins_fp = fp;
- err = -EEXIST;
-
- DN_FIB_SCAN_KEY(f, fp, key) {
- if (fi->fib_priority != DN_FIB_INFO(f)->fib_priority)
- break;
- if (f->fn_type == type && f->fn_scope == r->rtm_scope
- && DN_FIB_INFO(f) == fi)
- goto out;
- }
-
- if (!(n->nlmsg_flags & NLM_F_APPEND)) {
- fp = ins_fp;
- f = *fp;
- }
- }
-
-create:
- err = -ENOENT;
- if (!(n->nlmsg_flags & NLM_F_CREATE))
- goto out;
-
-replace:
- err = -ENOBUFS;
- new_f = kmem_cache_alloc(dn_hash_kmem, SLAB_KERNEL);
- if (new_f == NULL)
- goto out;
-
- memset(new_f, 0, sizeof(struct dn_fib_node));
-
- new_f->fn_key = key;
- new_f->fn_type = type;
- new_f->fn_scope = r->rtm_scope;
- DN_FIB_INFO(new_f) = fi;
-
- new_f->fn_next = f;
- write_lock_bh(&dn_fib_tables_lock);
- *fp = new_f;
- write_unlock_bh(&dn_fib_tables_lock);
- dz->dz_nent++;
-
- if (del_fp) {
- f = *del_fp;
- write_lock_bh(&dn_fib_tables_lock);
- *del_fp = f->fn_next;
- write_unlock_bh(&dn_fib_tables_lock);
-
- if (!(f->fn_state & DN_S_ZOMBIE))
- dn_rtmsg_fib(RTM_DELROUTE, f, z, tb->n, n, req);
- if (f->fn_state & DN_S_ACCESSED)
- dn_rt_cache_flush(-1);
- dn_free_node(f);
- dz->dz_nent--;
- } else {
- dn_rt_cache_flush(-1);
- }
-
- dn_rtmsg_fib(RTM_NEWROUTE, new_f, z, tb->n, n, req);
-
- return 0;
-out:
- dn_fib_release_info(fi);
- return err;
-}
-
-
-static int dn_fib_table_delete(struct dn_fib_table *tb, struct rtmsg *r, struct dn_kern_rta *rta, struct nlmsghdr *n, struct netlink_skb_parms *req)
-{
- struct dn_hash *table = (struct dn_hash*)tb->data;
- struct dn_fib_node **fp, **del_fp, *f;
- int z = r->rtm_dst_len;
- struct dn_zone *dz;
- dn_fib_key_t key;
- int matched;
-
-
- if (z > 16)
- return -EINVAL;
-
- if ((dz = table->dh_zones[z]) == NULL)
- return -ESRCH;
-
- dz_key_0(key);
- if (rta->rta_dst) {
- __le16 dst;
- memcpy(&dst, rta->rta_dst, 2);
- if (dst & ~DZ_MASK(dz))
- return -EINVAL;
- key = dz_key(dst, dz);
- }
-
- fp = dn_chain_p(key, dz);
-
- DN_FIB_SCAN(f, fp) {
- if (dn_key_eq(f->fn_key, key))
- break;
- if (dn_key_leq(key, f->fn_key))
- return -ESRCH;
- }
-
- matched = 0;
- del_fp = NULL;
- DN_FIB_SCAN_KEY(f, fp, key) {
- struct dn_fib_info *fi = DN_FIB_INFO(f);
-
- if (f->fn_state & DN_S_ZOMBIE)
- return -ESRCH;
-
- matched++;
-
- if (del_fp == NULL &&
- (!r->rtm_type || f->fn_type == r->rtm_type) &&
- (r->rtm_scope == RT_SCOPE_NOWHERE || f->fn_scope == r->rtm_scope) &&
- (!r->rtm_protocol ||
- fi->fib_protocol == r->rtm_protocol) &&
- dn_fib_nh_match(r, n, rta, fi) == 0)
- del_fp = fp;
- }
-
- if (del_fp) {
- f = *del_fp;
- dn_rtmsg_fib(RTM_DELROUTE, f, z, tb->n, n, req);
-
- if (matched != 1) {
- write_lock_bh(&dn_fib_tables_lock);
- *del_fp = f->fn_next;
- write_unlock_bh(&dn_fib_tables_lock);
-
- if (f->fn_state & DN_S_ACCESSED)
- dn_rt_cache_flush(-1);
- dn_free_node(f);
- dz->dz_nent--;
- } else {
- f->fn_state |= DN_S_ZOMBIE;
- if (f->fn_state & DN_S_ACCESSED) {
- f->fn_state &= ~DN_S_ACCESSED;
- dn_rt_cache_flush(-1);
- }
- if (++dn_fib_hash_zombies > 128)
- dn_fib_flush();
- }
-
- return 0;
- }
-
- return -ESRCH;
-}
-
-static inline int dn_flush_list(struct dn_fib_node **fp, int z, struct dn_hash *table)
-{
- int found = 0;
- struct dn_fib_node *f;
-
- while((f = *fp) != NULL) {
- struct dn_fib_info *fi = DN_FIB_INFO(f);
-
- if (fi && ((f->fn_state & DN_S_ZOMBIE) || (fi->fib_flags & RTNH_F_DEAD))) {
- write_lock_bh(&dn_fib_tables_lock);
- *fp = f->fn_next;
- write_unlock_bh(&dn_fib_tables_lock);
-
- dn_free_node(f);
- found++;
- continue;
- }
- fp = &f->fn_next;
- }
-
- return found;
-}
-
-static int dn_fib_table_flush(struct dn_fib_table *tb)
-{
- struct dn_hash *table = (struct dn_hash *)tb->data;
- struct dn_zone *dz;
- int found = 0;
-
- dn_fib_hash_zombies = 0;
- for(dz = table->dh_zone_list; dz; dz = dz->dz_next) {
- int i;
- int tmp = 0;
- for(i = dz->dz_divisor-1; i >= 0; i--)
- tmp += dn_flush_list(&dz->dz_hash[i], dz->dz_order, table);
- dz->dz_nent -= tmp;
- found += tmp;
- }
-
- return found;
-}
-
-static int dn_fib_table_lookup(struct dn_fib_table *tb, const struct flowi *flp, struct dn_fib_res *res)
-{
- int err;
- struct dn_zone *dz;
- struct dn_hash *t = (struct dn_hash *)tb->data;
-
- read_lock(&dn_fib_tables_lock);
- for(dz = t->dh_zone_list; dz; dz = dz->dz_next) {
- struct dn_fib_node *f;
- dn_fib_key_t k = dz_key(flp->fld_dst, dz);
-
- for(f = dz_chain(k, dz); f; f = f->fn_next) {
- if (!dn_key_eq(k, f->fn_key)) {
- if (dn_key_leq(k, f->fn_key))
- break;
- else
- continue;
- }
-
- f->fn_state |= DN_S_ACCESSED;
-
- if (f->fn_state&DN_S_ZOMBIE)
- continue;
-
- if (f->fn_scope < flp->fld_scope)
- continue;
-
- err = dn_fib_semantic_match(f->fn_type, DN_FIB_INFO(f), flp, res);
-
- if (err == 0) {
- res->type = f->fn_type;
- res->scope = f->fn_scope;
- res->prefixlen = dz->dz_order;
- goto out;
- }
- if (err < 0)
- goto out;
- }
- }
- err = 1;
-out:
- read_unlock(&dn_fib_tables_lock);
- return err;
-}
-
-
-struct dn_fib_table *dn_fib_get_table(u32 n, int create)
-{
- struct dn_fib_table *t;
- struct hlist_node *node;
- unsigned int h;
-
- if (n < RT_TABLE_MIN)
- return NULL;
-
- if (n > RT_TABLE_MAX)
- return NULL;
-
- h = n & (DN_FIB_TABLE_HASHSZ - 1);
- rcu_read_lock();
- hlist_for_each_entry_rcu(t, node, &dn_fib_table_hash[h], hlist) {
- if (t->n == n) {
- rcu_read_unlock();
- return t;
- }
- }
- rcu_read_unlock();
-
- if (!create)
- return NULL;
-
- if (in_interrupt() && net_ratelimit()) {
- printk(KERN_DEBUG "DECnet: BUG! Attempt to create routing table from interrupt\n");
- return NULL;
- }
-
- t = kzalloc(sizeof(struct dn_fib_table) + sizeof(struct dn_hash),
- GFP_KERNEL);
- if (t == NULL)
- return NULL;
-
- t->n = n;
- t->insert = dn_fib_table_insert;
- t->delete = dn_fib_table_delete;
- t->lookup = dn_fib_table_lookup;
- t->flush = dn_fib_table_flush;
- t->dump = dn_fib_table_dump;
- hlist_add_head_rcu(&t->hlist, &dn_fib_table_hash[h]);
-
- return t;
-}
-
-struct dn_fib_table *dn_fib_empty_table(void)
-{
- u32 id;
-
- for(id = RT_TABLE_MIN; id <= RT_TABLE_MAX; id++)
- if (dn_fib_get_table(id, 0) == NULL)
- return dn_fib_get_table(id, 1);
- return NULL;
-}
-
-void dn_fib_flush(void)
-{
- int flushed = 0;
- struct dn_fib_table *tb;
- struct hlist_node *node;
- unsigned int h;
-
- for (h = 0; h < DN_FIB_TABLE_HASHSZ; h++) {
- hlist_for_each_entry(tb, node, &dn_fib_table_hash[h], hlist)
- flushed += tb->flush(tb);
- }
-
- if (flushed)
- dn_rt_cache_flush(-1);
-}
-
-void __init dn_fib_table_init(void)
-{
- dn_hash_kmem = kmem_cache_create("dn_fib_info_cache",
- sizeof(struct dn_fib_info),
- 0, SLAB_HWCACHE_ALIGN,
- NULL, NULL);
-}
-
-void __exit dn_fib_table_cleanup(void)
-{
- struct dn_fib_table *t;
- struct hlist_node *node, *next;
- unsigned int h;
-
- write_lock(&dn_fib_tables_lock);
- for (h = 0; h < DN_FIB_TABLE_HASHSZ; h++) {
- hlist_for_each_entry_safe(t, node, next, &dn_fib_table_hash[h],
- hlist) {
- hlist_del(&t->hlist);
- kfree(t);
- }
- }
- write_unlock(&dn_fib_tables_lock);
-}
diff --git a/net/decnet/dn_timer.c b/net/decnet/dn_timer.c
deleted file mode 100644
index 09825711d58a..000000000000
--- a/net/decnet/dn_timer.c
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- * DECnet An implementation of the DECnet protocol suite for the LINUX
- * operating system. DECnet is implemented using the BSD Socket
- * interface as the means of communication with the user level.
- *
- * DECnet Socket Timer Functions
- *
- * Author: Steve Whitehouse <SteveW@ACM.org>
- *
- *
- * Changes:
- * Steve Whitehouse : Made keepalive timer part of the same
- * timer idea.
- * Steve Whitehouse : Added checks for sk->sock_readers
- * David S. Miller : New socket locking
- * Steve Whitehouse : Timer grabs socket ref.
- */
-#include <linux/net.h>
-#include <linux/socket.h>
-#include <linux/skbuff.h>
-#include <linux/netdevice.h>
-#include <linux/timer.h>
-#include <linux/spinlock.h>
-#include <net/sock.h>
-#include <asm/atomic.h>
-#include <net/flow.h>
-#include <net/dn.h>
-
-/*
- * Slow timer is for everything else (n * 500mS)
- */
-
-#define SLOW_INTERVAL (HZ/2)
-
-static void dn_slow_timer(unsigned long arg);
-
-void dn_start_slow_timer(struct sock *sk)
-{
- sk->sk_timer.expires = jiffies + SLOW_INTERVAL;
- sk->sk_timer.function = dn_slow_timer;
- sk->sk_timer.data = (unsigned long)sk;
-
- add_timer(&sk->sk_timer);
-}
-
-void dn_stop_slow_timer(struct sock *sk)
-{
- del_timer(&sk->sk_timer);
-}
-
-static void dn_slow_timer(unsigned long arg)
-{
- struct sock *sk = (struct sock *)arg;
- struct dn_scp *scp = DN_SK(sk);
-
- sock_hold(sk);
- bh_lock_sock(sk);
-
- if (sock_owned_by_user(sk)) {
- sk->sk_timer.expires = jiffies + HZ / 10;
- add_timer(&sk->sk_timer);
- goto out;
- }
-
- /*
- * The persist timer is the standard slow timer used for retransmits
- * in both connection establishment and disconnection as well as
- * in the RUN state. The different states are catered for by changing
- * the function pointer in the socket. Setting the timer to a value
- * of zero turns it off. We allow the persist_fxn to turn the
- * timer off in a permant way by returning non-zero, so that
- * timer based routines may remove sockets. This is why we have a
- * sock_hold()/sock_put() around the timer to prevent the socket
- * going away in the middle.
- */
- if (scp->persist && scp->persist_fxn) {
- if (scp->persist <= SLOW_INTERVAL) {
- scp->persist = 0;
-
- if (scp->persist_fxn(sk))
- goto out;
- } else {
- scp->persist -= SLOW_INTERVAL;
- }
- }
-
- /*
- * Check for keepalive timeout. After the other timer 'cos if
- * the previous timer caused a retransmit, we don't need to
- * do this. scp->stamp is the last time that we sent a packet.
- * The keepalive function sends a link service packet to the
- * other end. If it remains unacknowledged, the standard
- * socket timers will eventually shut the socket down. Each
- * time we do this, scp->stamp will be updated, thus
- * we won't try and send another until scp->keepalive has passed
- * since the last successful transmission.
- */
- if (scp->keepalive && scp->keepalive_fxn && (scp->state == DN_RUN)) {
- if ((jiffies - scp->stamp) >= scp->keepalive)
- scp->keepalive_fxn(sk);
- }
-
- sk->sk_timer.expires = jiffies + SLOW_INTERVAL;
-
- add_timer(&sk->sk_timer);
-out:
- bh_unlock_sock(sk);
- sock_put(sk);
-}
diff --git a/net/decnet/netfilter/Kconfig b/net/decnet/netfilter/Kconfig
deleted file mode 100644
index ecdb3f9f14ca..000000000000
--- a/net/decnet/netfilter/Kconfig
+++ /dev/null
@@ -1,15 +0,0 @@
-#
-# DECnet netfilter configuration
-#
-
-menu "DECnet: Netfilter Configuration"
- depends on DECNET && NETFILTER && EXPERIMENTAL
-
-config DECNET_NF_GRABULATOR
- tristate "Routing message grabulator (for userland routing daemon)"
- help
- Enable this module if you want to use the userland DECnet routing
- daemon. You will also need to enable routing support for DECnet
- unless you just want to monitor routing messages from other nodes.
-
-endmenu
diff --git a/net/decnet/netfilter/Makefile b/net/decnet/netfilter/Makefile
deleted file mode 100644
index 255c1ae9daeb..000000000000
--- a/net/decnet/netfilter/Makefile
+++ /dev/null
@@ -1,6 +0,0 @@
-#
-# Makefile for DECnet netfilter modules
-#
-
-obj-$(CONFIG_DECNET_NF_GRABULATOR) += dn_rtmsg.o
-
diff --git a/net/decnet/netfilter/dn_rtmsg.c b/net/decnet/netfilter/dn_rtmsg.c
deleted file mode 100644
index 8b99bd33540d..000000000000
--- a/net/decnet/netfilter/dn_rtmsg.c
+++ /dev/null
@@ -1,169 +0,0 @@
-/*
- * DECnet An implementation of the DECnet protocol suite for the LINUX
- * operating system. DECnet is implemented using the BSD Socket
- * interface as the means of communication with the user level.
- *
- * DECnet Routing Message Grabulator
- *
- * (C) 2000 ChyGwyn Limited - http://www.chygwyn.com/
- * This code may be copied under the GPL v.2 or at your option
- * any later version.
- *
- * Author: Steven Whitehouse <steve@chygwyn.com>
- *
- */
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/init.h>
-#include <linux/netdevice.h>
-#include <linux/netfilter.h>
-#include <linux/spinlock.h>
-#include <linux/netlink.h>
-#include <linux/netfilter_decnet.h>
-
-#include <net/sock.h>
-#include <net/flow.h>
-#include <net/dn.h>
-#include <net/dn_route.h>
-
-static struct sock *dnrmg = NULL;
-
-
-static struct sk_buff *dnrmg_build_message(struct sk_buff *rt_skb, int *errp)
-{
- struct sk_buff *skb = NULL;
- size_t size;
- unsigned char *old_tail;
- struct nlmsghdr *nlh;
- unsigned char *ptr;
- struct nf_dn_rtmsg *rtm;
-
- size = NLMSG_SPACE(rt_skb->len);
- size += NLMSG_ALIGN(sizeof(struct nf_dn_rtmsg));
- skb = alloc_skb(size, GFP_ATOMIC);
- if (!skb)
- goto nlmsg_failure;
- old_tail = skb->tail;
- nlh = NLMSG_PUT(skb, 0, 0, 0, size - sizeof(*nlh));
- rtm = (struct nf_dn_rtmsg *)NLMSG_DATA(nlh);
- rtm->nfdn_ifindex = rt_skb->dev->ifindex;
- ptr = NFDN_RTMSG(rtm);
- memcpy(ptr, rt_skb->data, rt_skb->len);
- nlh->nlmsg_len = skb->tail - old_tail;
- return skb;
-
-nlmsg_failure:
- if (skb)
- kfree_skb(skb);
- *errp = -ENOMEM;
- if (net_ratelimit())
- printk(KERN_ERR "dn_rtmsg: error creating netlink message\n");
- return NULL;
-}
-
-static void dnrmg_send_peer(struct sk_buff *skb)
-{
- struct sk_buff *skb2;
- int status = 0;
- int group = 0;
- unsigned char flags = *skb->data;
-
- switch(flags & DN_RT_CNTL_MSK) {
- case DN_RT_PKT_L1RT:
- group = DNRNG_NLGRP_L1;
- break;
- case DN_RT_PKT_L2RT:
- group = DNRNG_NLGRP_L2;
- break;
- default:
- return;
- }
-
- skb2 = dnrmg_build_message(skb, &status);
- if (skb2 == NULL)
- return;
- NETLINK_CB(skb2).dst_group = group;
- netlink_broadcast(dnrmg, skb2, 0, group, GFP_ATOMIC);
-}
-
-
-static unsigned int dnrmg_hook(unsigned int hook,
- struct sk_buff **pskb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
-{
- dnrmg_send_peer(*pskb);
- return NF_ACCEPT;
-}
-
-
-#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0)
-
-static inline void dnrmg_receive_user_skb(struct sk_buff *skb)
-{
- struct nlmsghdr *nlh = (struct nlmsghdr *)skb->data;
-
- if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len)
- return;
-
- if (security_netlink_recv(skb, CAP_NET_ADMIN))
- RCV_SKB_FAIL(-EPERM);
-
- /* Eventually we might send routing messages too */
-
- RCV_SKB_FAIL(-EINVAL);
-}
-
-static void dnrmg_receive_user_sk(struct sock *sk, int len)
-{
- struct sk_buff *skb;
- unsigned int qlen = skb_queue_len(&sk->sk_receive_queue);
-
- for (; qlen && (skb = skb_dequeue(&sk->sk_receive_queue)); qlen--) {
- dnrmg_receive_user_skb(skb);
- kfree_skb(skb);
- }
-}
-
-static struct nf_hook_ops dnrmg_ops = {
- .hook = dnrmg_hook,
- .pf = PF_DECnet,
- .hooknum = NF_DN_ROUTE,
- .priority = NF_DN_PRI_DNRTMSG,
-};
-
-static int __init dn_rtmsg_init(void)
-{
- int rv = 0;
-
- dnrmg = netlink_kernel_create(NETLINK_DNRTMSG, DNRNG_NLGRP_MAX,
- dnrmg_receive_user_sk, THIS_MODULE);
- if (dnrmg == NULL) {
- printk(KERN_ERR "dn_rtmsg: Cannot create netlink socket");
- return -ENOMEM;
- }
-
- rv = nf_register_hook(&dnrmg_ops);
- if (rv) {
- sock_release(dnrmg->sk_socket);
- }
-
- return rv;
-}
-
-static void __exit dn_rtmsg_fini(void)
-{
- nf_unregister_hook(&dnrmg_ops);
- sock_release(dnrmg->sk_socket);
-}
-
-
-MODULE_DESCRIPTION("DECnet Routing Message Grabulator");
-MODULE_AUTHOR("Steven Whitehouse <steve@chygwyn.com>");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_DNRTMSG);
-
-module_init(dn_rtmsg_init);
-module_exit(dn_rtmsg_fini);
-
diff --git a/net/decnet/sysctl_net_decnet.c b/net/decnet/sysctl_net_decnet.c
deleted file mode 100644
index e246f054f368..000000000000
--- a/net/decnet/sysctl_net_decnet.c
+++ /dev/null
@@ -1,512 +0,0 @@
-/*
- * DECnet An implementation of the DECnet protocol suite for the LINUX
- * operating system. DECnet is implemented using the BSD Socket
- * interface as the means of communication with the user level.
- *
- * DECnet sysctl support functions
- *
- * Author: Steve Whitehouse <SteveW@ACM.org>
- *
- *
- * Changes:
- * Steve Whitehouse - C99 changes and default device handling
- * Steve Whitehouse - Memory buffer settings, like the tcp ones
- *
- */
-#include <linux/mm.h>
-#include <linux/sysctl.h>
-#include <linux/fs.h>
-#include <linux/netdevice.h>
-#include <linux/string.h>
-#include <net/neighbour.h>
-#include <net/dst.h>
-#include <net/flow.h>
-
-#include <asm/uaccess.h>
-
-#include <net/dn.h>
-#include <net/dn_dev.h>
-#include <net/dn_route.h>
-
-
-int decnet_debug_level;
-int decnet_time_wait = 30;
-int decnet_dn_count = 1;
-int decnet_di_count = 3;
-int decnet_dr_count = 3;
-int decnet_log_martians = 1;
-int decnet_no_fc_max_cwnd = NSP_MIN_WINDOW;
-
-/* Reasonable defaults, I hope, based on tcp's defaults */
-int sysctl_decnet_mem[3] = { 768 << 3, 1024 << 3, 1536 << 3 };
-int sysctl_decnet_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
-int sysctl_decnet_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };
-
-#ifdef CONFIG_SYSCTL
-extern int decnet_dst_gc_interval;
-static int min_decnet_time_wait[] = { 5 };
-static int max_decnet_time_wait[] = { 600 };
-static int min_state_count[] = { 1 };
-static int max_state_count[] = { NSP_MAXRXTSHIFT };
-static int min_decnet_dst_gc_interval[] = { 1 };
-static int max_decnet_dst_gc_interval[] = { 60 };
-static int min_decnet_no_fc_max_cwnd[] = { NSP_MIN_WINDOW };
-static int max_decnet_no_fc_max_cwnd[] = { NSP_MAX_WINDOW };
-static char node_name[7] = "???";
-
-static struct ctl_table_header *dn_table_header = NULL;
-
-/*
- * ctype.h :-)
- */
-#define ISNUM(x) (((x) >= '0') && ((x) <= '9'))
-#define ISLOWER(x) (((x) >= 'a') && ((x) <= 'z'))
-#define ISUPPER(x) (((x) >= 'A') && ((x) <= 'Z'))
-#define ISALPHA(x) (ISLOWER(x) || ISUPPER(x))
-#define INVALID_END_CHAR(x) (ISNUM(x) || ISALPHA(x))
-
-static void strip_it(char *str)
-{
- for(;;) {
- switch(*str) {
- case ' ':
- case '\n':
- case '\r':
- case ':':
- *str = 0;
- case 0:
- return;
- }
- str++;
- }
-}
-
-/*
- * Simple routine to parse an ascii DECnet address
- * into a network order address.
- */
-static int parse_addr(__le16 *addr, char *str)
-{
- __u16 area, node;
-
- while(*str && !ISNUM(*str)) str++;
-
- if (*str == 0)
- return -1;
-
- area = (*str++ - '0');
- if (ISNUM(*str)) {
- area *= 10;
- area += (*str++ - '0');
- }
-
- if (*str++ != '.')
- return -1;
-
- if (!ISNUM(*str))
- return -1;
-
- node = *str++ - '0';
- if (ISNUM(*str)) {
- node *= 10;
- node += (*str++ - '0');
- }
- if (ISNUM(*str)) {
- node *= 10;
- node += (*str++ - '0');
- }
- if (ISNUM(*str)) {
- node *= 10;
- node += (*str++ - '0');
- }
-
- if ((node > 1023) || (area > 63))
- return -1;
-
- if (INVALID_END_CHAR(*str))
- return -1;
-
- *addr = dn_htons((area << 10) | node);
-
- return 0;
-}
-
-
-static int dn_node_address_strategy(ctl_table *table, int __user *name, int nlen,
- void __user *oldval, size_t __user *oldlenp,
- void __user *newval, size_t newlen,
- void **context)
-{
- size_t len;
- __le16 addr;
-
- if (oldval && oldlenp) {
- if (get_user(len, oldlenp))
- return -EFAULT;
- if (len) {
- if (len != sizeof(unsigned short))
- return -EINVAL;
- if (put_user(decnet_address, (__le16 __user *)oldval))
- return -EFAULT;
- }
- }
- if (newval && newlen) {
- if (newlen != sizeof(unsigned short))
- return -EINVAL;
- if (get_user(addr, (__le16 __user *)newval))
- return -EFAULT;
-
- dn_dev_devices_off();
-
- decnet_address = addr;
-
- dn_dev_devices_on();
- }
- return 0;
-}
-
-static int dn_node_address_handler(ctl_table *table, int write,
- struct file *filp,
- void __user *buffer,
- size_t *lenp, loff_t *ppos)
-{
- char addr[DN_ASCBUF_LEN];
- size_t len;
- __le16 dnaddr;
-
- if (!*lenp || (*ppos && !write)) {
- *lenp = 0;
- return 0;
- }
-
- if (write) {
- int len = (*lenp < DN_ASCBUF_LEN) ? *lenp : (DN_ASCBUF_LEN-1);
-
- if (copy_from_user(addr, buffer, len))
- return -EFAULT;
-
- addr[len] = 0;
- strip_it(addr);
-
- if (parse_addr(&dnaddr, addr))
- return -EINVAL;
-
- dn_dev_devices_off();
-
- decnet_address = dnaddr;
-
- dn_dev_devices_on();
-
- *ppos += len;
-
- return 0;
- }
-
- dn_addr2asc(dn_ntohs(decnet_address), addr);
- len = strlen(addr);
- addr[len++] = '\n';
-
- if (len > *lenp) len = *lenp;
-
- if (copy_to_user(buffer, addr, len))
- return -EFAULT;
-
- *lenp = len;
- *ppos += len;
-
- return 0;
-}
-
-
-static int dn_def_dev_strategy(ctl_table *table, int __user *name, int nlen,
- void __user *oldval, size_t __user *oldlenp,
- void __user *newval, size_t newlen,
- void **context)
-{
- size_t len;
- struct net_device *dev;
- char devname[17];
- size_t namel;
- int rv = 0;
-
- devname[0] = 0;
-
- if (oldval && oldlenp) {
- if (get_user(len, oldlenp))
- return -EFAULT;
- if (len) {
- dev = dn_dev_get_default();
- if (dev) {
- strcpy(devname, dev->name);
- dev_put(dev);
- }
-
- namel = strlen(devname) + 1;
- if (len > namel) len = namel;
-
- if (copy_to_user(oldval, devname, len))
- return -EFAULT;
-
- if (put_user(len, oldlenp))
- return -EFAULT;
- }
- }
-
- if (newval && newlen) {
- if (newlen > 16)
- return -E2BIG;
-
- if (copy_from_user(devname, newval, newlen))
- return -EFAULT;
-
- devname[newlen] = 0;
-
- dev = dev_get_by_name(devname);
- if (dev == NULL)
- return -ENODEV;
-
- rv = -ENODEV;
- if (dev->dn_ptr != NULL) {
- rv = dn_dev_set_default(dev, 1);
- if (rv)
- dev_put(dev);
- }
- }
-
- return rv;
-}
-
-
-static int dn_def_dev_handler(ctl_table *table, int write,
- struct file * filp,
- void __user *buffer,
- size_t *lenp, loff_t *ppos)
-{
- size_t len;
- struct net_device *dev;
- char devname[17];
-
- if (!*lenp || (*ppos && !write)) {
- *lenp = 0;
- return 0;
- }
-
- if (write) {
- if (*lenp > 16)
- return -E2BIG;
-
- if (copy_from_user(devname, buffer, *lenp))
- return -EFAULT;
-
- devname[*lenp] = 0;
- strip_it(devname);
-
- dev = dev_get_by_name(devname);
- if (dev == NULL)
- return -ENODEV;
-
- if (dev->dn_ptr == NULL) {
- dev_put(dev);
- return -ENODEV;
- }
-
- if (dn_dev_set_default(dev, 1)) {
- dev_put(dev);
- return -ENODEV;
- }
- *ppos += *lenp;
-
- return 0;
- }
-
- dev = dn_dev_get_default();
- if (dev == NULL) {
- *lenp = 0;
- return 0;
- }
-
- strcpy(devname, dev->name);
- dev_put(dev);
- len = strlen(devname);
- devname[len++] = '\n';
-
- if (len > *lenp) len = *lenp;
-
- if (copy_to_user(buffer, devname, len))
- return -EFAULT;
-
- *lenp = len;
- *ppos += len;
-
- return 0;
-}
-
-static ctl_table dn_table[] = {
- {
- .ctl_name = NET_DECNET_NODE_ADDRESS,
- .procname = "node_address",
- .maxlen = 7,
- .mode = 0644,
- .proc_handler = dn_node_address_handler,
- .strategy = dn_node_address_strategy,
- },
- {
- .ctl_name = NET_DECNET_NODE_NAME,
- .procname = "node_name",
- .data = node_name,
- .maxlen = 7,
- .mode = 0644,
- .proc_handler = &proc_dostring,
- .strategy = &sysctl_string,
- },
- {
- .ctl_name = NET_DECNET_DEFAULT_DEVICE,
- .procname = "default_device",
- .maxlen = 16,
- .mode = 0644,
- .proc_handler = dn_def_dev_handler,
- .strategy = dn_def_dev_strategy,
- },
- {
- .ctl_name = NET_DECNET_TIME_WAIT,
- .procname = "time_wait",
- .data = &decnet_time_wait,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_minmax,
- .strategy = &sysctl_intvec,
- .extra1 = &min_decnet_time_wait,
- .extra2 = &max_decnet_time_wait
- },
- {
- .ctl_name = NET_DECNET_DN_COUNT,
- .procname = "dn_count",
- .data = &decnet_dn_count,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_minmax,
- .strategy = &sysctl_intvec,
- .extra1 = &min_state_count,
- .extra2 = &max_state_count
- },
- {
- .ctl_name = NET_DECNET_DI_COUNT,
- .procname = "di_count",
- .data = &decnet_di_count,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_minmax,
- .strategy = &sysctl_intvec,
- .extra1 = &min_state_count,
- .extra2 = &max_state_count
- },
- {
- .ctl_name = NET_DECNET_DR_COUNT,
- .procname = "dr_count",
- .data = &decnet_dr_count,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_minmax,
- .strategy = &sysctl_intvec,
- .extra1 = &min_state_count,
- .extra2 = &max_state_count
- },
- {
- .ctl_name = NET_DECNET_DST_GC_INTERVAL,
- .procname = "dst_gc_interval",
- .data = &decnet_dst_gc_interval,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_minmax,
- .strategy = &sysctl_intvec,
- .extra1 = &min_decnet_dst_gc_interval,
- .extra2 = &max_decnet_dst_gc_interval
- },
- {
- .ctl_name = NET_DECNET_NO_FC_MAX_CWND,
- .procname = "no_fc_max_cwnd",
- .data = &decnet_no_fc_max_cwnd,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_minmax,
- .strategy = &sysctl_intvec,
- .extra1 = &min_decnet_no_fc_max_cwnd,
- .extra2 = &max_decnet_no_fc_max_cwnd
- },
- {
- .ctl_name = NET_DECNET_MEM,
- .procname = "decnet_mem",
- .data = &sysctl_decnet_mem,
- .maxlen = sizeof(sysctl_decnet_mem),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- .strategy = &sysctl_intvec,
- },
- {
- .ctl_name = NET_DECNET_RMEM,
- .procname = "decnet_rmem",
- .data = &sysctl_decnet_rmem,
- .maxlen = sizeof(sysctl_decnet_rmem),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- .strategy = &sysctl_intvec,
- },
- {
- .ctl_name = NET_DECNET_WMEM,
- .procname = "decnet_wmem",
- .data = &sysctl_decnet_wmem,
- .maxlen = sizeof(sysctl_decnet_wmem),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- .strategy = &sysctl_intvec,
- },
- {
- .ctl_name = NET_DECNET_DEBUG_LEVEL,
- .procname = "debug",
- .data = &decnet_debug_level,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- .strategy = &sysctl_intvec,
- },
- {0}
-};
-
-static ctl_table dn_dir_table[] = {
- {
- .ctl_name = NET_DECNET,
- .procname = "decnet",
- .mode = 0555,
- .child = dn_table},
- {0}
-};
-
-static ctl_table dn_root_table[] = {
- {
- .ctl_name = CTL_NET,
- .procname = "net",
- .mode = 0555,
- .child = dn_dir_table
- },
- {0}
-};
-
-void dn_register_sysctl(void)
-{
- dn_table_header = register_sysctl_table(dn_root_table, 1);
-}
-
-void dn_unregister_sysctl(void)
-{
- unregister_sysctl_table(dn_table_header);
-}
-
-#else /* CONFIG_SYSCTL */
-void dn_unregister_sysctl(void)
-{
-}
-void dn_register_sysctl(void)
-{
-}
-
-#endif
diff --git a/net/devlink/Makefile b/net/devlink/Makefile
new file mode 100644
index 000000000000..000da622116a
--- /dev/null
+++ b/net/devlink/Makefile
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-y := core.o netlink.o netlink_gen.o dev.o port.o sb.o dpipe.o \
+ resource.o param.o region.o health.o trap.o rate.o linecard.o
diff --git a/net/devlink/core.c b/net/devlink/core.c
new file mode 100644
index 000000000000..58093f49c090
--- /dev/null
+++ b/net/devlink/core.c
@@ -0,0 +1,551 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com>
+ */
+
+#include <net/genetlink.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/devlink.h>
+
+#include "devl_internal.h"
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwmsg);
+EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwerr);
+EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_trap_report);
+
+DEFINE_XARRAY_FLAGS(devlinks, XA_FLAGS_ALLOC);
+
+static struct devlink *devlinks_xa_get(unsigned long index)
+{
+ struct devlink *devlink;
+
+ rcu_read_lock();
+ devlink = xa_find(&devlinks, &index, index, DEVLINK_REGISTERED);
+ if (!devlink || !devlink_try_get(devlink))
+ devlink = NULL;
+ rcu_read_unlock();
+ return devlink;
+}
+
+/* devlink_rels xarray contains 1:1 relationships between
+ * devlink object and related nested devlink instance.
+ * The xarray index is used to get the nested object from
+ * the nested-in object code.
+ */
+static DEFINE_XARRAY_FLAGS(devlink_rels, XA_FLAGS_ALLOC1);
+
+#define DEVLINK_REL_IN_USE XA_MARK_0
+
+struct devlink_rel {
+ u32 index;
+ refcount_t refcount;
+ u32 devlink_index;
+ struct {
+ u32 devlink_index;
+ u32 obj_index;
+ devlink_rel_notify_cb_t *notify_cb;
+ devlink_rel_cleanup_cb_t *cleanup_cb;
+ struct delayed_work notify_work;
+ } nested_in;
+};
+
+static void devlink_rel_free(struct devlink_rel *rel)
+{
+ xa_erase(&devlink_rels, rel->index);
+ kfree(rel);
+}
+
+static void __devlink_rel_get(struct devlink_rel *rel)
+{
+ refcount_inc(&rel->refcount);
+}
+
+static void __devlink_rel_put(struct devlink_rel *rel)
+{
+ if (refcount_dec_and_test(&rel->refcount))
+ devlink_rel_free(rel);
+}
+
+static void devlink_rel_nested_in_notify_work(struct work_struct *work)
+{
+ struct devlink_rel *rel = container_of(work, struct devlink_rel,
+ nested_in.notify_work.work);
+ struct devlink *devlink;
+
+ devlink = devlinks_xa_get(rel->nested_in.devlink_index);
+ if (!devlink)
+ goto rel_put;
+ if (!devl_trylock(devlink)) {
+ devlink_put(devlink);
+ goto reschedule_work;
+ }
+ if (!devl_is_registered(devlink)) {
+ devl_unlock(devlink);
+ devlink_put(devlink);
+ goto rel_put;
+ }
+ if (!xa_get_mark(&devlink_rels, rel->index, DEVLINK_REL_IN_USE))
+ rel->nested_in.cleanup_cb(devlink, rel->nested_in.obj_index, rel->index);
+ rel->nested_in.notify_cb(devlink, rel->nested_in.obj_index);
+ devl_unlock(devlink);
+ devlink_put(devlink);
+
+rel_put:
+ __devlink_rel_put(rel);
+ return;
+
+reschedule_work:
+ schedule_delayed_work(&rel->nested_in.notify_work, 1);
+}
+
+static void devlink_rel_nested_in_notify_work_schedule(struct devlink_rel *rel)
+{
+ __devlink_rel_get(rel);
+ schedule_delayed_work(&rel->nested_in.notify_work, 0);
+}
+
+static struct devlink_rel *devlink_rel_alloc(void)
+{
+ struct devlink_rel *rel;
+ static u32 next;
+ int err;
+
+ rel = kzalloc(sizeof(*rel), GFP_KERNEL);
+ if (!rel)
+ return ERR_PTR(-ENOMEM);
+
+ err = xa_alloc_cyclic(&devlink_rels, &rel->index, rel,
+ xa_limit_32b, &next, GFP_KERNEL);
+ if (err < 0) {
+ kfree(rel);
+ return ERR_PTR(err);
+ }
+
+ refcount_set(&rel->refcount, 1);
+ INIT_DELAYED_WORK(&rel->nested_in.notify_work,
+ &devlink_rel_nested_in_notify_work);
+ return rel;
+}
+
+static void devlink_rel_put(struct devlink *devlink)
+{
+ struct devlink_rel *rel = devlink->rel;
+
+ if (!rel)
+ return;
+ xa_clear_mark(&devlink_rels, rel->index, DEVLINK_REL_IN_USE);
+ devlink_rel_nested_in_notify_work_schedule(rel);
+ __devlink_rel_put(rel);
+ devlink->rel = NULL;
+}
+
+void devlink_rel_nested_in_clear(u32 rel_index)
+{
+ xa_clear_mark(&devlink_rels, rel_index, DEVLINK_REL_IN_USE);
+}
+
+int devlink_rel_nested_in_add(u32 *rel_index, u32 devlink_index,
+ u32 obj_index, devlink_rel_notify_cb_t *notify_cb,
+ devlink_rel_cleanup_cb_t *cleanup_cb,
+ struct devlink *devlink)
+{
+ struct devlink_rel *rel = devlink_rel_alloc();
+
+ ASSERT_DEVLINK_NOT_REGISTERED(devlink);
+
+ if (IS_ERR(rel))
+ return PTR_ERR(rel);
+
+ rel->devlink_index = devlink->index;
+ rel->nested_in.devlink_index = devlink_index;
+ rel->nested_in.obj_index = obj_index;
+ rel->nested_in.notify_cb = notify_cb;
+ rel->nested_in.cleanup_cb = cleanup_cb;
+ *rel_index = rel->index;
+ xa_set_mark(&devlink_rels, rel->index, DEVLINK_REL_IN_USE);
+ devlink->rel = rel;
+ return 0;
+}
+
+/**
+ * devlink_rel_nested_in_notify - Notify the object this devlink
+ * instance is nested in.
+ * @devlink: devlink
+ *
+ * This is called upon network namespace change of devlink instance.
+ * In case this devlink instance is nested in another devlink object,
+ * a notification of a change of this object should be sent
+ * over netlink. The parent devlink instance lock needs to be
+ * taken during the notification preparation.
+ * However, since the devlink lock of nested instance is held here,
+ * we would end with wrong devlink instance lock ordering and
+ * deadlock. Therefore the work is utilized to avoid that.
+ */
+void devlink_rel_nested_in_notify(struct devlink *devlink)
+{
+ struct devlink_rel *rel = devlink->rel;
+
+ if (!rel)
+ return;
+ devlink_rel_nested_in_notify_work_schedule(rel);
+}
+
+static struct devlink_rel *devlink_rel_find(unsigned long rel_index)
+{
+ return xa_find(&devlink_rels, &rel_index, rel_index,
+ DEVLINK_REL_IN_USE);
+}
+
+static struct devlink *devlink_rel_devlink_get(u32 rel_index)
+{
+ struct devlink_rel *rel;
+ u32 devlink_index;
+
+ if (!rel_index)
+ return NULL;
+ xa_lock(&devlink_rels);
+ rel = devlink_rel_find(rel_index);
+ if (rel)
+ devlink_index = rel->devlink_index;
+ xa_unlock(&devlink_rels);
+ if (!rel)
+ return NULL;
+ return devlinks_xa_get(devlink_index);
+}
+
+int devlink_rel_devlink_handle_put(struct sk_buff *msg, struct devlink *devlink,
+ u32 rel_index, int attrtype,
+ bool *msg_updated)
+{
+ struct net *net = devlink_net(devlink);
+ struct devlink *rel_devlink;
+ int err;
+
+ rel_devlink = devlink_rel_devlink_get(rel_index);
+ if (!rel_devlink)
+ return 0;
+ err = devlink_nl_put_nested_handle(msg, net, rel_devlink, attrtype);
+ devlink_put(rel_devlink);
+ if (!err && msg_updated)
+ *msg_updated = true;
+ return err;
+}
+
+void *devlink_priv(struct devlink *devlink)
+{
+ return &devlink->priv;
+}
+EXPORT_SYMBOL_GPL(devlink_priv);
+
+struct devlink *priv_to_devlink(void *priv)
+{
+ return container_of(priv, struct devlink, priv);
+}
+EXPORT_SYMBOL_GPL(priv_to_devlink);
+
+struct device *devlink_to_dev(const struct devlink *devlink)
+{
+ return devlink->dev;
+}
+EXPORT_SYMBOL_GPL(devlink_to_dev);
+
+struct net *devlink_net(const struct devlink *devlink)
+{
+ return read_pnet(&devlink->_net);
+}
+EXPORT_SYMBOL_GPL(devlink_net);
+
+void devl_assert_locked(struct devlink *devlink)
+{
+ lockdep_assert_held(&devlink->lock);
+}
+EXPORT_SYMBOL_GPL(devl_assert_locked);
+
+#ifdef CONFIG_LOCKDEP
+/* For use in conjunction with LOCKDEP only e.g. rcu_dereference_protected() */
+bool devl_lock_is_held(struct devlink *devlink)
+{
+ return lockdep_is_held(&devlink->lock);
+}
+EXPORT_SYMBOL_GPL(devl_lock_is_held);
+#endif
+
+void devl_lock(struct devlink *devlink)
+{
+ mutex_lock(&devlink->lock);
+}
+EXPORT_SYMBOL_GPL(devl_lock);
+
+int devl_trylock(struct devlink *devlink)
+{
+ return mutex_trylock(&devlink->lock);
+}
+EXPORT_SYMBOL_GPL(devl_trylock);
+
+void devl_unlock(struct devlink *devlink)
+{
+ mutex_unlock(&devlink->lock);
+}
+EXPORT_SYMBOL_GPL(devl_unlock);
+
+/**
+ * devlink_try_get() - try to obtain a reference on a devlink instance
+ * @devlink: instance to reference
+ *
+ * Obtain a reference on a devlink instance. A reference on a devlink instance
+ * only implies that it's safe to take the instance lock. It does not imply
+ * that the instance is registered, use devl_is_registered() after taking
+ * the instance lock to check registration status.
+ */
+struct devlink *__must_check devlink_try_get(struct devlink *devlink)
+{
+ if (refcount_inc_not_zero(&devlink->refcount))
+ return devlink;
+ return NULL;
+}
+
+static void devlink_release(struct work_struct *work)
+{
+ struct devlink *devlink;
+
+ devlink = container_of(to_rcu_work(work), struct devlink, rwork);
+
+ mutex_destroy(&devlink->lock);
+ lockdep_unregister_key(&devlink->lock_key);
+ put_device(devlink->dev);
+ kvfree(devlink);
+}
+
+void devlink_put(struct devlink *devlink)
+{
+ if (refcount_dec_and_test(&devlink->refcount))
+ queue_rcu_work(system_percpu_wq, &devlink->rwork);
+}
+
+struct devlink *devlinks_xa_find_get(struct net *net, unsigned long *indexp)
+{
+ struct devlink *devlink = NULL;
+
+ rcu_read_lock();
+retry:
+ devlink = xa_find(&devlinks, indexp, ULONG_MAX, DEVLINK_REGISTERED);
+ if (!devlink)
+ goto unlock;
+
+ if (!devlink_try_get(devlink))
+ goto next;
+ if (!net_eq(devlink_net(devlink), net)) {
+ devlink_put(devlink);
+ goto next;
+ }
+unlock:
+ rcu_read_unlock();
+ return devlink;
+
+next:
+ (*indexp)++;
+ goto retry;
+}
+
+/**
+ * devl_register - Register devlink instance
+ * @devlink: devlink
+ */
+int devl_register(struct devlink *devlink)
+{
+ ASSERT_DEVLINK_NOT_REGISTERED(devlink);
+ devl_assert_locked(devlink);
+
+ xa_set_mark(&devlinks, devlink->index, DEVLINK_REGISTERED);
+ devlink_notify_register(devlink);
+ devlink_rel_nested_in_notify(devlink);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(devl_register);
+
+void devlink_register(struct devlink *devlink)
+{
+ devl_lock(devlink);
+ devl_register(devlink);
+ devl_unlock(devlink);
+}
+EXPORT_SYMBOL_GPL(devlink_register);
+
+/**
+ * devl_unregister - Unregister devlink instance
+ * @devlink: devlink
+ */
+void devl_unregister(struct devlink *devlink)
+{
+ ASSERT_DEVLINK_REGISTERED(devlink);
+ devl_assert_locked(devlink);
+
+ devlink_notify_unregister(devlink);
+ xa_clear_mark(&devlinks, devlink->index, DEVLINK_REGISTERED);
+ devlink_rel_put(devlink);
+}
+EXPORT_SYMBOL_GPL(devl_unregister);
+
+void devlink_unregister(struct devlink *devlink)
+{
+ devl_lock(devlink);
+ devl_unregister(devlink);
+ devl_unlock(devlink);
+}
+EXPORT_SYMBOL_GPL(devlink_unregister);
+
+/**
+ * devlink_alloc_ns - Allocate new devlink instance resources
+ * in specific namespace
+ *
+ * @ops: ops
+ * @priv_size: size of user private data
+ * @net: net namespace
+ * @dev: parent device
+ *
+ * Allocate new devlink instance resources, including devlink index
+ * and name.
+ */
+struct devlink *devlink_alloc_ns(const struct devlink_ops *ops,
+ size_t priv_size, struct net *net,
+ struct device *dev)
+{
+ struct devlink *devlink;
+ static u32 last_id;
+ int ret;
+
+ WARN_ON(!ops || !dev);
+ if (!devlink_reload_actions_valid(ops))
+ return NULL;
+
+ devlink = kvzalloc(struct_size(devlink, priv, priv_size), GFP_KERNEL);
+ if (!devlink)
+ return NULL;
+
+ ret = xa_alloc_cyclic(&devlinks, &devlink->index, devlink, xa_limit_31b,
+ &last_id, GFP_KERNEL);
+ if (ret < 0)
+ goto err_xa_alloc;
+
+ devlink->dev = get_device(dev);
+ devlink->ops = ops;
+ xa_init_flags(&devlink->ports, XA_FLAGS_ALLOC);
+ xa_init_flags(&devlink->params, XA_FLAGS_ALLOC);
+ xa_init_flags(&devlink->snapshot_ids, XA_FLAGS_ALLOC);
+ xa_init_flags(&devlink->nested_rels, XA_FLAGS_ALLOC);
+ write_pnet(&devlink->_net, net);
+ INIT_LIST_HEAD(&devlink->rate_list);
+ INIT_LIST_HEAD(&devlink->linecard_list);
+ INIT_LIST_HEAD(&devlink->sb_list);
+ INIT_LIST_HEAD_RCU(&devlink->dpipe_table_list);
+ INIT_LIST_HEAD(&devlink->resource_list);
+ INIT_LIST_HEAD(&devlink->region_list);
+ INIT_LIST_HEAD(&devlink->reporter_list);
+ INIT_LIST_HEAD(&devlink->trap_list);
+ INIT_LIST_HEAD(&devlink->trap_group_list);
+ INIT_LIST_HEAD(&devlink->trap_policer_list);
+ INIT_RCU_WORK(&devlink->rwork, devlink_release);
+ lockdep_register_key(&devlink->lock_key);
+ mutex_init(&devlink->lock);
+ lockdep_set_class(&devlink->lock, &devlink->lock_key);
+ refcount_set(&devlink->refcount, 1);
+
+ return devlink;
+
+err_xa_alloc:
+ kvfree(devlink);
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(devlink_alloc_ns);
+
+/**
+ * devlink_free - Free devlink instance resources
+ *
+ * @devlink: devlink
+ */
+void devlink_free(struct devlink *devlink)
+{
+ ASSERT_DEVLINK_NOT_REGISTERED(devlink);
+
+ WARN_ON(!list_empty(&devlink->trap_policer_list));
+ WARN_ON(!list_empty(&devlink->trap_group_list));
+ WARN_ON(!list_empty(&devlink->trap_list));
+ WARN_ON(!list_empty(&devlink->reporter_list));
+ WARN_ON(!list_empty(&devlink->region_list));
+ WARN_ON(!list_empty(&devlink->resource_list));
+ WARN_ON(!list_empty(&devlink->dpipe_table_list));
+ WARN_ON(!list_empty(&devlink->sb_list));
+ WARN_ON(!list_empty(&devlink->rate_list));
+ WARN_ON(!list_empty(&devlink->linecard_list));
+ WARN_ON(!xa_empty(&devlink->ports));
+
+ xa_destroy(&devlink->nested_rels);
+ xa_destroy(&devlink->snapshot_ids);
+ xa_destroy(&devlink->params);
+ xa_destroy(&devlink->ports);
+
+ xa_erase(&devlinks, devlink->index);
+
+ devlink_put(devlink);
+}
+EXPORT_SYMBOL_GPL(devlink_free);
+
+static void __net_exit devlink_pernet_pre_exit(struct net *net)
+{
+ struct devlink *devlink;
+ u32 actions_performed;
+ unsigned long index;
+ int err;
+
+ /* In case network namespace is getting destroyed, reload
+ * all devlink instances from this namespace into init_net.
+ */
+ devlinks_xa_for_each_registered_get(net, index, devlink) {
+ devl_dev_lock(devlink, true);
+ err = 0;
+ if (devl_is_registered(devlink))
+ err = devlink_reload(devlink, &init_net,
+ DEVLINK_RELOAD_ACTION_DRIVER_REINIT,
+ DEVLINK_RELOAD_LIMIT_UNSPEC,
+ &actions_performed, NULL);
+ devl_dev_unlock(devlink, true);
+ devlink_put(devlink);
+ if (err && err != -EOPNOTSUPP)
+ pr_warn("Failed to reload devlink instance into init_net\n");
+ }
+}
+
+static struct pernet_operations devlink_pernet_ops __net_initdata = {
+ .pre_exit = devlink_pernet_pre_exit,
+};
+
+static struct notifier_block devlink_port_netdevice_nb = {
+ .notifier_call = devlink_port_netdevice_event,
+};
+
+static int __init devlink_init(void)
+{
+ int err;
+
+ err = register_pernet_subsys(&devlink_pernet_ops);
+ if (err)
+ goto out;
+ err = genl_register_family(&devlink_nl_family);
+ if (err)
+ goto out_unreg_pernet_subsys;
+ err = register_netdevice_notifier(&devlink_port_netdevice_nb);
+ if (!err)
+ return 0;
+
+ genl_unregister_family(&devlink_nl_family);
+
+out_unreg_pernet_subsys:
+ unregister_pernet_subsys(&devlink_pernet_ops);
+out:
+ WARN_ON(err);
+ return err;
+}
+
+subsys_initcall(devlink_init);
diff --git a/net/devlink/dev.c b/net/devlink/dev.c
new file mode 100644
index 000000000000..02602704bdea
--- /dev/null
+++ b/net/devlink/dev.c
@@ -0,0 +1,1442 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com>
+ */
+
+#include <linux/device.h>
+#include <net/genetlink.h>
+#include <net/sock.h>
+#include "devl_internal.h"
+
+struct devlink_info_req {
+ struct sk_buff *msg;
+ void (*version_cb)(const char *version_name,
+ enum devlink_info_version_type version_type,
+ void *version_cb_priv);
+ void *version_cb_priv;
+};
+
+struct devlink_reload_combination {
+ enum devlink_reload_action action;
+ enum devlink_reload_limit limit;
+};
+
+static const struct devlink_reload_combination devlink_reload_invalid_combinations[] = {
+ {
+ /* can't reinitialize driver with no down time */
+ .action = DEVLINK_RELOAD_ACTION_DRIVER_REINIT,
+ .limit = DEVLINK_RELOAD_LIMIT_NO_RESET,
+ },
+};
+
+static bool
+devlink_reload_combination_is_invalid(enum devlink_reload_action action,
+ enum devlink_reload_limit limit)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(devlink_reload_invalid_combinations); i++)
+ if (devlink_reload_invalid_combinations[i].action == action &&
+ devlink_reload_invalid_combinations[i].limit == limit)
+ return true;
+ return false;
+}
+
+static bool
+devlink_reload_action_is_supported(struct devlink *devlink, enum devlink_reload_action action)
+{
+ return test_bit(action, &devlink->ops->reload_actions);
+}
+
+static bool
+devlink_reload_limit_is_supported(struct devlink *devlink, enum devlink_reload_limit limit)
+{
+ return test_bit(limit, &devlink->ops->reload_limits);
+}
+
+static int devlink_reload_stat_put(struct sk_buff *msg,
+ enum devlink_reload_limit limit, u32 value)
+{
+ struct nlattr *reload_stats_entry;
+
+ reload_stats_entry = nla_nest_start(msg, DEVLINK_ATTR_RELOAD_STATS_ENTRY);
+ if (!reload_stats_entry)
+ return -EMSGSIZE;
+
+ if (nla_put_u8(msg, DEVLINK_ATTR_RELOAD_STATS_LIMIT, limit) ||
+ nla_put_u32(msg, DEVLINK_ATTR_RELOAD_STATS_VALUE, value))
+ goto nla_put_failure;
+ nla_nest_end(msg, reload_stats_entry);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(msg, reload_stats_entry);
+ return -EMSGSIZE;
+}
+
+static int
+devlink_reload_stats_put(struct sk_buff *msg, struct devlink *devlink, bool is_remote)
+{
+ struct nlattr *reload_stats_attr, *act_info, *act_stats;
+ int i, j, stat_idx;
+ u32 value;
+
+ if (!is_remote)
+ reload_stats_attr = nla_nest_start(msg, DEVLINK_ATTR_RELOAD_STATS);
+ else
+ reload_stats_attr = nla_nest_start(msg, DEVLINK_ATTR_REMOTE_RELOAD_STATS);
+
+ if (!reload_stats_attr)
+ return -EMSGSIZE;
+
+ for (i = 0; i <= DEVLINK_RELOAD_ACTION_MAX; i++) {
+ if ((!is_remote &&
+ !devlink_reload_action_is_supported(devlink, i)) ||
+ i == DEVLINK_RELOAD_ACTION_UNSPEC)
+ continue;
+ act_info = nla_nest_start(msg, DEVLINK_ATTR_RELOAD_ACTION_INFO);
+ if (!act_info)
+ goto nla_put_failure;
+
+ if (nla_put_u8(msg, DEVLINK_ATTR_RELOAD_ACTION, i))
+ goto action_info_nest_cancel;
+ act_stats = nla_nest_start(msg, DEVLINK_ATTR_RELOAD_ACTION_STATS);
+ if (!act_stats)
+ goto action_info_nest_cancel;
+
+ for (j = 0; j <= DEVLINK_RELOAD_LIMIT_MAX; j++) {
+ /* Remote stats are shown even if not locally supported.
+ * Stats of actions with unspecified limit are shown
+ * though drivers don't need to register unspecified
+ * limit.
+ */
+ if ((!is_remote && j != DEVLINK_RELOAD_LIMIT_UNSPEC &&
+ !devlink_reload_limit_is_supported(devlink, j)) ||
+ devlink_reload_combination_is_invalid(i, j))
+ continue;
+
+ stat_idx = j * __DEVLINK_RELOAD_ACTION_MAX + i;
+ if (!is_remote)
+ value = devlink->stats.reload_stats[stat_idx];
+ else
+ value = devlink->stats.remote_reload_stats[stat_idx];
+ if (devlink_reload_stat_put(msg, j, value))
+ goto action_stats_nest_cancel;
+ }
+ nla_nest_end(msg, act_stats);
+ nla_nest_end(msg, act_info);
+ }
+ nla_nest_end(msg, reload_stats_attr);
+ return 0;
+
+action_stats_nest_cancel:
+ nla_nest_cancel(msg, act_stats);
+action_info_nest_cancel:
+ nla_nest_cancel(msg, act_info);
+nla_put_failure:
+ nla_nest_cancel(msg, reload_stats_attr);
+ return -EMSGSIZE;
+}
+
+static int devlink_nl_nested_fill(struct sk_buff *msg, struct devlink *devlink)
+{
+ unsigned long rel_index;
+ void *unused;
+ int err;
+
+ xa_for_each(&devlink->nested_rels, rel_index, unused) {
+ err = devlink_rel_devlink_handle_put(msg, devlink,
+ rel_index,
+ DEVLINK_ATTR_NESTED_DEVLINK,
+ NULL);
+ if (err)
+ return err;
+ }
+ return 0;
+}
+
+static int devlink_nl_fill(struct sk_buff *msg, struct devlink *devlink,
+ enum devlink_command cmd, u32 portid,
+ u32 seq, int flags)
+{
+ struct nlattr *dev_stats;
+ void *hdr;
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto nla_put_failure;
+ if (nla_put_u8(msg, DEVLINK_ATTR_RELOAD_FAILED, devlink->reload_failed))
+ goto nla_put_failure;
+
+ dev_stats = nla_nest_start(msg, DEVLINK_ATTR_DEV_STATS);
+ if (!dev_stats)
+ goto nla_put_failure;
+
+ if (devlink_reload_stats_put(msg, devlink, false))
+ goto dev_stats_nest_cancel;
+ if (devlink_reload_stats_put(msg, devlink, true))
+ goto dev_stats_nest_cancel;
+
+ nla_nest_end(msg, dev_stats);
+
+ if (devlink_nl_nested_fill(msg, devlink))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+dev_stats_nest_cancel:
+ nla_nest_cancel(msg, dev_stats);
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static void devlink_notify(struct devlink *devlink, enum devlink_command cmd)
+{
+ struct sk_buff *msg;
+ int err;
+
+ WARN_ON(cmd != DEVLINK_CMD_NEW && cmd != DEVLINK_CMD_DEL);
+ WARN_ON(!devl_is_registered(devlink));
+
+ if (!devlink_nl_notify_need(devlink))
+ return;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return;
+
+ err = devlink_nl_fill(msg, devlink, cmd, 0, 0, 0);
+ if (err) {
+ nlmsg_free(msg);
+ return;
+ }
+
+ devlink_nl_notify_send(devlink, msg);
+}
+
+int devlink_nl_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct sk_buff *msg;
+ int err;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ err = devlink_nl_fill(msg, devlink, DEVLINK_CMD_NEW,
+ info->snd_portid, info->snd_seq, 0);
+ if (err) {
+ nlmsg_free(msg);
+ return err;
+ }
+
+ return genlmsg_reply(msg, info);
+}
+
+static int
+devlink_nl_get_dump_one(struct sk_buff *msg, struct devlink *devlink,
+ struct netlink_callback *cb, int flags)
+{
+ return devlink_nl_fill(msg, devlink, DEVLINK_CMD_NEW,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, flags);
+}
+
+int devlink_nl_get_dumpit(struct sk_buff *msg, struct netlink_callback *cb)
+{
+ return devlink_nl_dumpit(msg, cb, devlink_nl_get_dump_one);
+}
+
+static void devlink_rel_notify_cb(struct devlink *devlink, u32 obj_index)
+{
+ devlink_notify(devlink, DEVLINK_CMD_NEW);
+}
+
+static void devlink_rel_cleanup_cb(struct devlink *devlink, u32 obj_index,
+ u32 rel_index)
+{
+ xa_erase(&devlink->nested_rels, rel_index);
+}
+
+int devl_nested_devlink_set(struct devlink *devlink,
+ struct devlink *nested_devlink)
+{
+ u32 rel_index;
+ int err;
+
+ err = devlink_rel_nested_in_add(&rel_index, devlink->index, 0,
+ devlink_rel_notify_cb,
+ devlink_rel_cleanup_cb,
+ nested_devlink);
+ if (err)
+ return err;
+ return xa_insert(&devlink->nested_rels, rel_index,
+ xa_mk_value(0), GFP_KERNEL);
+}
+EXPORT_SYMBOL_GPL(devl_nested_devlink_set);
+
+void devlink_notify_register(struct devlink *devlink)
+{
+ devlink_notify(devlink, DEVLINK_CMD_NEW);
+ devlink_linecards_notify_register(devlink);
+ devlink_ports_notify_register(devlink);
+ devlink_trap_policers_notify_register(devlink);
+ devlink_trap_groups_notify_register(devlink);
+ devlink_traps_notify_register(devlink);
+ devlink_rates_notify_register(devlink);
+ devlink_regions_notify_register(devlink);
+ devlink_params_notify_register(devlink);
+}
+
+void devlink_notify_unregister(struct devlink *devlink)
+{
+ devlink_params_notify_unregister(devlink);
+ devlink_regions_notify_unregister(devlink);
+ devlink_rates_notify_unregister(devlink);
+ devlink_traps_notify_unregister(devlink);
+ devlink_trap_groups_notify_unregister(devlink);
+ devlink_trap_policers_notify_unregister(devlink);
+ devlink_ports_notify_unregister(devlink);
+ devlink_linecards_notify_unregister(devlink);
+ devlink_notify(devlink, DEVLINK_CMD_DEL);
+}
+
+static void devlink_reload_failed_set(struct devlink *devlink,
+ bool reload_failed)
+{
+ if (devlink->reload_failed == reload_failed)
+ return;
+ devlink->reload_failed = reload_failed;
+ devlink_notify(devlink, DEVLINK_CMD_NEW);
+}
+
+bool devlink_is_reload_failed(const struct devlink *devlink)
+{
+ return devlink->reload_failed;
+}
+EXPORT_SYMBOL_GPL(devlink_is_reload_failed);
+
+static void
+__devlink_reload_stats_update(struct devlink *devlink, u32 *reload_stats,
+ enum devlink_reload_limit limit, u32 actions_performed)
+{
+ unsigned long actions = actions_performed;
+ int stat_idx;
+ int action;
+
+ for_each_set_bit(action, &actions, __DEVLINK_RELOAD_ACTION_MAX) {
+ stat_idx = limit * __DEVLINK_RELOAD_ACTION_MAX + action;
+ reload_stats[stat_idx]++;
+ }
+ devlink_notify(devlink, DEVLINK_CMD_NEW);
+}
+
+static void
+devlink_reload_stats_update(struct devlink *devlink, enum devlink_reload_limit limit,
+ u32 actions_performed)
+{
+ __devlink_reload_stats_update(devlink, devlink->stats.reload_stats, limit,
+ actions_performed);
+}
+
+/**
+ * devlink_remote_reload_actions_performed - Update devlink on reload actions
+ * performed which are not a direct result of devlink reload call.
+ *
+ * This should be called by a driver after performing reload actions in case it was not
+ * a result of devlink reload call. For example fw_activate was performed as a result
+ * of devlink reload triggered fw_activate on another host.
+ * The motivation for this function is to keep data on reload actions performed on this
+ * function whether it was done due to direct devlink reload call or not.
+ *
+ * @devlink: devlink
+ * @limit: reload limit
+ * @actions_performed: bitmask of actions performed
+ */
+void devlink_remote_reload_actions_performed(struct devlink *devlink,
+ enum devlink_reload_limit limit,
+ u32 actions_performed)
+{
+ if (WARN_ON(!actions_performed ||
+ actions_performed & BIT(DEVLINK_RELOAD_ACTION_UNSPEC) ||
+ actions_performed >= BIT(__DEVLINK_RELOAD_ACTION_MAX) ||
+ limit > DEVLINK_RELOAD_LIMIT_MAX))
+ return;
+
+ __devlink_reload_stats_update(devlink, devlink->stats.remote_reload_stats, limit,
+ actions_performed);
+}
+EXPORT_SYMBOL_GPL(devlink_remote_reload_actions_performed);
+
+static struct net *devlink_netns_get(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct nlattr *netns_pid_attr = info->attrs[DEVLINK_ATTR_NETNS_PID];
+ struct nlattr *netns_fd_attr = info->attrs[DEVLINK_ATTR_NETNS_FD];
+ struct nlattr *netns_id_attr = info->attrs[DEVLINK_ATTR_NETNS_ID];
+ struct net *net;
+
+ if (!!netns_pid_attr + !!netns_fd_attr + !!netns_id_attr > 1) {
+ NL_SET_ERR_MSG(info->extack, "multiple netns identifying attributes specified");
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (netns_pid_attr) {
+ net = get_net_ns_by_pid(nla_get_u32(netns_pid_attr));
+ } else if (netns_fd_attr) {
+ net = get_net_ns_by_fd(nla_get_u32(netns_fd_attr));
+ } else if (netns_id_attr) {
+ net = get_net_ns_by_id(sock_net(skb->sk),
+ nla_get_u32(netns_id_attr));
+ if (!net)
+ net = ERR_PTR(-EINVAL);
+ } else {
+ WARN_ON(1);
+ net = ERR_PTR(-EINVAL);
+ }
+ if (IS_ERR(net)) {
+ NL_SET_ERR_MSG(info->extack, "Unknown network namespace");
+ return ERR_PTR(-EINVAL);
+ }
+ if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) {
+ put_net(net);
+ return ERR_PTR(-EPERM);
+ }
+ return net;
+}
+
+static void devlink_reload_netns_change(struct devlink *devlink,
+ struct net *curr_net,
+ struct net *dest_net)
+{
+ /* Userspace needs to be notified about devlink objects
+ * removed from original and entering new network namespace.
+ * The rest of the devlink objects are re-created during
+ * reload process so the notifications are generated separatelly.
+ */
+ devlink_notify_unregister(devlink);
+ write_pnet(&devlink->_net, dest_net);
+ devlink_notify_register(devlink);
+ devlink_rel_nested_in_notify(devlink);
+}
+
+static void devlink_reload_reinit_sanity_check(struct devlink *devlink)
+{
+ WARN_ON(!list_empty(&devlink->trap_policer_list));
+ WARN_ON(!list_empty(&devlink->trap_group_list));
+ WARN_ON(!list_empty(&devlink->trap_list));
+ WARN_ON(!list_empty(&devlink->dpipe_table_list));
+ WARN_ON(!list_empty(&devlink->sb_list));
+ WARN_ON(!list_empty(&devlink->rate_list));
+ WARN_ON(!list_empty(&devlink->linecard_list));
+ WARN_ON(!xa_empty(&devlink->ports));
+}
+
+int devlink_reload(struct devlink *devlink, struct net *dest_net,
+ enum devlink_reload_action action,
+ enum devlink_reload_limit limit,
+ u32 *actions_performed, struct netlink_ext_ack *extack)
+{
+ u32 remote_reload_stats[DEVLINK_RELOAD_STATS_ARRAY_SIZE];
+ struct net *curr_net;
+ int err;
+
+ /* Make sure the reload operations are invoked with the device lock
+ * held to allow drivers to trigger functionality that expects it
+ * (e.g., PCI reset) and to close possible races between these
+ * operations and probe/remove.
+ */
+ device_lock_assert(devlink->dev);
+
+ memcpy(remote_reload_stats, devlink->stats.remote_reload_stats,
+ sizeof(remote_reload_stats));
+
+ err = devlink->ops->reload_down(devlink, !!dest_net, action, limit, extack);
+ if (err)
+ return err;
+
+ curr_net = devlink_net(devlink);
+ if (dest_net && !net_eq(dest_net, curr_net))
+ devlink_reload_netns_change(devlink, curr_net, dest_net);
+
+ if (action == DEVLINK_RELOAD_ACTION_DRIVER_REINIT) {
+ devlink_params_driverinit_load_new(devlink);
+ devlink_reload_reinit_sanity_check(devlink);
+ }
+
+ err = devlink->ops->reload_up(devlink, action, limit, actions_performed, extack);
+ devlink_reload_failed_set(devlink, !!err);
+ if (err)
+ return err;
+
+ WARN_ON(!(*actions_performed & BIT(action)));
+ /* Catch driver on updating the remote action within devlink reload */
+ WARN_ON(memcmp(remote_reload_stats, devlink->stats.remote_reload_stats,
+ sizeof(remote_reload_stats)));
+ devlink_reload_stats_update(devlink, limit, *actions_performed);
+ return 0;
+}
+
+static int
+devlink_nl_reload_actions_performed_snd(struct devlink *devlink, u32 actions_performed,
+ enum devlink_command cmd, struct genl_info *info)
+{
+ struct sk_buff *msg;
+ void *hdr;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq, &devlink_nl_family, 0, cmd);
+ if (!hdr)
+ goto free_msg;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto nla_put_failure;
+
+ if (nla_put_bitfield32(msg, DEVLINK_ATTR_RELOAD_ACTIONS_PERFORMED, actions_performed,
+ actions_performed))
+ goto nla_put_failure;
+ genlmsg_end(msg, hdr);
+
+ return genlmsg_reply(msg, info);
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+free_msg:
+ nlmsg_free(msg);
+ return -EMSGSIZE;
+}
+
+int devlink_nl_reload_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ enum devlink_reload_action action;
+ enum devlink_reload_limit limit;
+ struct net *dest_net = NULL;
+ u32 actions_performed;
+ int err;
+
+ err = devlink_resources_validate(devlink, NULL, info);
+ if (err) {
+ NL_SET_ERR_MSG(info->extack, "resources size validation failed");
+ return err;
+ }
+
+ action = nla_get_u8_default(info->attrs[DEVLINK_ATTR_RELOAD_ACTION],
+ DEVLINK_RELOAD_ACTION_DRIVER_REINIT);
+
+ if (!devlink_reload_action_is_supported(devlink, action)) {
+ NL_SET_ERR_MSG(info->extack, "Requested reload action is not supported by the driver");
+ return -EOPNOTSUPP;
+ }
+
+ limit = DEVLINK_RELOAD_LIMIT_UNSPEC;
+ if (info->attrs[DEVLINK_ATTR_RELOAD_LIMITS]) {
+ struct nla_bitfield32 limits;
+ u32 limits_selected;
+
+ limits = nla_get_bitfield32(info->attrs[DEVLINK_ATTR_RELOAD_LIMITS]);
+ limits_selected = limits.value & limits.selector;
+ if (!limits_selected) {
+ NL_SET_ERR_MSG(info->extack, "Invalid limit selected");
+ return -EINVAL;
+ }
+ for (limit = 0 ; limit <= DEVLINK_RELOAD_LIMIT_MAX ; limit++)
+ if (limits_selected & BIT(limit))
+ break;
+ /* UAPI enables multiselection, but currently it is not used */
+ if (limits_selected != BIT(limit)) {
+ NL_SET_ERR_MSG(info->extack, "Multiselection of limit is not supported");
+ return -EOPNOTSUPP;
+ }
+ if (!devlink_reload_limit_is_supported(devlink, limit)) {
+ NL_SET_ERR_MSG(info->extack, "Requested limit is not supported by the driver");
+ return -EOPNOTSUPP;
+ }
+ if (devlink_reload_combination_is_invalid(action, limit)) {
+ NL_SET_ERR_MSG(info->extack, "Requested limit is invalid for this action");
+ return -EINVAL;
+ }
+ }
+ if (info->attrs[DEVLINK_ATTR_NETNS_PID] ||
+ info->attrs[DEVLINK_ATTR_NETNS_FD] ||
+ info->attrs[DEVLINK_ATTR_NETNS_ID]) {
+ dest_net = devlink_netns_get(skb, info);
+ if (IS_ERR(dest_net))
+ return PTR_ERR(dest_net);
+ if (!net_eq(dest_net, devlink_net(devlink)) &&
+ action != DEVLINK_RELOAD_ACTION_DRIVER_REINIT) {
+ NL_SET_ERR_MSG_MOD(info->extack,
+ "Changing namespace is only supported for reinit action");
+ return -EOPNOTSUPP;
+ }
+ }
+
+ err = devlink_reload(devlink, dest_net, action, limit, &actions_performed, info->extack);
+
+ if (dest_net)
+ put_net(dest_net);
+
+ if (err)
+ return err;
+ /* For backward compatibility generate reply only if attributes used by user */
+ if (!info->attrs[DEVLINK_ATTR_RELOAD_ACTION] && !info->attrs[DEVLINK_ATTR_RELOAD_LIMITS])
+ return 0;
+
+ return devlink_nl_reload_actions_performed_snd(devlink, actions_performed,
+ DEVLINK_CMD_RELOAD, info);
+}
+
+bool devlink_reload_actions_valid(const struct devlink_ops *ops)
+{
+ const struct devlink_reload_combination *comb;
+ int i;
+
+ if (!devlink_reload_supported(ops)) {
+ if (WARN_ON(ops->reload_actions))
+ return false;
+ return true;
+ }
+
+ if (WARN_ON(!ops->reload_actions ||
+ ops->reload_actions & BIT(DEVLINK_RELOAD_ACTION_UNSPEC) ||
+ ops->reload_actions >= BIT(__DEVLINK_RELOAD_ACTION_MAX)))
+ return false;
+
+ if (WARN_ON(ops->reload_limits & BIT(DEVLINK_RELOAD_LIMIT_UNSPEC) ||
+ ops->reload_limits >= BIT(__DEVLINK_RELOAD_LIMIT_MAX)))
+ return false;
+
+ for (i = 0; i < ARRAY_SIZE(devlink_reload_invalid_combinations); i++) {
+ comb = &devlink_reload_invalid_combinations[i];
+ if (ops->reload_actions == BIT(comb->action) &&
+ ops->reload_limits == BIT(comb->limit))
+ return false;
+ }
+ return true;
+}
+
+static int devlink_nl_eswitch_fill(struct sk_buff *msg, struct devlink *devlink,
+ enum devlink_command cmd, u32 portid,
+ u32 seq, int flags)
+{
+ const struct devlink_ops *ops = devlink->ops;
+ enum devlink_eswitch_encap_mode encap_mode;
+ u8 inline_mode;
+ void *hdr;
+ int err = 0;
+ u16 mode;
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ err = devlink_nl_put_handle(msg, devlink);
+ if (err)
+ goto nla_put_failure;
+
+ if (ops->eswitch_mode_get) {
+ err = ops->eswitch_mode_get(devlink, &mode);
+ if (err)
+ goto nla_put_failure;
+ err = nla_put_u16(msg, DEVLINK_ATTR_ESWITCH_MODE, mode);
+ if (err)
+ goto nla_put_failure;
+ }
+
+ if (ops->eswitch_inline_mode_get) {
+ err = ops->eswitch_inline_mode_get(devlink, &inline_mode);
+ if (err)
+ goto nla_put_failure;
+ err = nla_put_u8(msg, DEVLINK_ATTR_ESWITCH_INLINE_MODE,
+ inline_mode);
+ if (err)
+ goto nla_put_failure;
+ }
+
+ if (ops->eswitch_encap_mode_get) {
+ err = ops->eswitch_encap_mode_get(devlink, &encap_mode);
+ if (err)
+ goto nla_put_failure;
+ err = nla_put_u8(msg, DEVLINK_ATTR_ESWITCH_ENCAP_MODE, encap_mode);
+ if (err)
+ goto nla_put_failure;
+ }
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return err;
+}
+
+int devlink_nl_eswitch_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct sk_buff *msg;
+ int err;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ err = devlink_nl_eswitch_fill(msg, devlink, DEVLINK_CMD_ESWITCH_GET,
+ info->snd_portid, info->snd_seq, 0);
+
+ if (err) {
+ nlmsg_free(msg);
+ return err;
+ }
+
+ return genlmsg_reply(msg, info);
+}
+
+int devlink_nl_eswitch_set_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ const struct devlink_ops *ops = devlink->ops;
+ enum devlink_eswitch_encap_mode encap_mode;
+ u8 inline_mode;
+ int err = 0;
+ u16 mode;
+
+ if (info->attrs[DEVLINK_ATTR_ESWITCH_MODE]) {
+ if (!ops->eswitch_mode_set)
+ return -EOPNOTSUPP;
+ mode = nla_get_u16(info->attrs[DEVLINK_ATTR_ESWITCH_MODE]);
+ err = devlink_rate_nodes_check(devlink, mode, info->extack);
+ if (err)
+ return err;
+ err = ops->eswitch_mode_set(devlink, mode, info->extack);
+ if (err)
+ return err;
+ }
+
+ if (info->attrs[DEVLINK_ATTR_ESWITCH_INLINE_MODE]) {
+ if (!ops->eswitch_inline_mode_set)
+ return -EOPNOTSUPP;
+ inline_mode = nla_get_u8(info->attrs[DEVLINK_ATTR_ESWITCH_INLINE_MODE]);
+ err = ops->eswitch_inline_mode_set(devlink, inline_mode,
+ info->extack);
+ if (err)
+ return err;
+ }
+
+ if (info->attrs[DEVLINK_ATTR_ESWITCH_ENCAP_MODE]) {
+ if (!ops->eswitch_encap_mode_set)
+ return -EOPNOTSUPP;
+ encap_mode = nla_get_u8(info->attrs[DEVLINK_ATTR_ESWITCH_ENCAP_MODE]);
+ err = ops->eswitch_encap_mode_set(devlink, encap_mode,
+ info->extack);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+int devlink_info_serial_number_put(struct devlink_info_req *req, const char *sn)
+{
+ if (!req->msg)
+ return 0;
+ return nla_put_string(req->msg, DEVLINK_ATTR_INFO_SERIAL_NUMBER, sn);
+}
+EXPORT_SYMBOL_GPL(devlink_info_serial_number_put);
+
+int devlink_info_board_serial_number_put(struct devlink_info_req *req,
+ const char *bsn)
+{
+ if (!req->msg)
+ return 0;
+ return nla_put_string(req->msg, DEVLINK_ATTR_INFO_BOARD_SERIAL_NUMBER,
+ bsn);
+}
+EXPORT_SYMBOL_GPL(devlink_info_board_serial_number_put);
+
+static int devlink_info_version_put(struct devlink_info_req *req, int attr,
+ const char *version_name,
+ const char *version_value,
+ enum devlink_info_version_type version_type)
+{
+ struct nlattr *nest;
+ int err;
+
+ if (req->version_cb)
+ req->version_cb(version_name, version_type,
+ req->version_cb_priv);
+
+ if (!req->msg || !*version_value)
+ return 0;
+
+ nest = nla_nest_start_noflag(req->msg, attr);
+ if (!nest)
+ return -EMSGSIZE;
+
+ err = nla_put_string(req->msg, DEVLINK_ATTR_INFO_VERSION_NAME,
+ version_name);
+ if (err)
+ goto nla_put_failure;
+
+ err = nla_put_string(req->msg, DEVLINK_ATTR_INFO_VERSION_VALUE,
+ version_value);
+ if (err)
+ goto nla_put_failure;
+
+ nla_nest_end(req->msg, nest);
+
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(req->msg, nest);
+ return err;
+}
+
+int devlink_info_version_fixed_put(struct devlink_info_req *req,
+ const char *version_name,
+ const char *version_value)
+{
+ return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_FIXED,
+ version_name, version_value,
+ DEVLINK_INFO_VERSION_TYPE_NONE);
+}
+EXPORT_SYMBOL_GPL(devlink_info_version_fixed_put);
+
+int devlink_info_version_stored_put(struct devlink_info_req *req,
+ const char *version_name,
+ const char *version_value)
+{
+ return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_STORED,
+ version_name, version_value,
+ DEVLINK_INFO_VERSION_TYPE_NONE);
+}
+EXPORT_SYMBOL_GPL(devlink_info_version_stored_put);
+
+int devlink_info_version_stored_put_ext(struct devlink_info_req *req,
+ const char *version_name,
+ const char *version_value,
+ enum devlink_info_version_type version_type)
+{
+ return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_STORED,
+ version_name, version_value,
+ version_type);
+}
+EXPORT_SYMBOL_GPL(devlink_info_version_stored_put_ext);
+
+int devlink_info_version_running_put(struct devlink_info_req *req,
+ const char *version_name,
+ const char *version_value)
+{
+ return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_RUNNING,
+ version_name, version_value,
+ DEVLINK_INFO_VERSION_TYPE_NONE);
+}
+EXPORT_SYMBOL_GPL(devlink_info_version_running_put);
+
+int devlink_info_version_running_put_ext(struct devlink_info_req *req,
+ const char *version_name,
+ const char *version_value,
+ enum devlink_info_version_type version_type)
+{
+ return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_RUNNING,
+ version_name, version_value,
+ version_type);
+}
+EXPORT_SYMBOL_GPL(devlink_info_version_running_put_ext);
+
+static int devlink_nl_driver_info_get(struct device_driver *drv,
+ struct devlink_info_req *req)
+{
+ if (!drv)
+ return 0;
+
+ if (drv->name[0])
+ return nla_put_string(req->msg, DEVLINK_ATTR_INFO_DRIVER_NAME,
+ drv->name);
+
+ return 0;
+}
+
+static int
+devlink_nl_info_fill(struct sk_buff *msg, struct devlink *devlink,
+ enum devlink_command cmd, u32 portid,
+ u32 seq, int flags, struct netlink_ext_ack *extack)
+{
+ struct device *dev = devlink_to_dev(devlink);
+ struct devlink_info_req req = {};
+ void *hdr;
+ int err;
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ err = -EMSGSIZE;
+ if (devlink_nl_put_handle(msg, devlink))
+ goto err_cancel_msg;
+
+ req.msg = msg;
+ if (devlink->ops->info_get) {
+ err = devlink->ops->info_get(devlink, &req, extack);
+ if (err)
+ goto err_cancel_msg;
+ }
+
+ err = devlink_nl_driver_info_get(dev->driver, &req);
+ if (err)
+ goto err_cancel_msg;
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+err_cancel_msg:
+ genlmsg_cancel(msg, hdr);
+ return err;
+}
+
+int devlink_nl_info_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct sk_buff *msg;
+ int err;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ err = devlink_nl_info_fill(msg, devlink, DEVLINK_CMD_INFO_GET,
+ info->snd_portid, info->snd_seq, 0,
+ info->extack);
+ if (err) {
+ nlmsg_free(msg);
+ return err;
+ }
+
+ return genlmsg_reply(msg, info);
+}
+
+static int
+devlink_nl_info_get_dump_one(struct sk_buff *msg, struct devlink *devlink,
+ struct netlink_callback *cb, int flags)
+{
+ int err;
+
+ err = devlink_nl_info_fill(msg, devlink, DEVLINK_CMD_INFO_GET,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, flags,
+ cb->extack);
+ if (err == -EOPNOTSUPP)
+ err = 0;
+ return err;
+}
+
+int devlink_nl_info_get_dumpit(struct sk_buff *msg, struct netlink_callback *cb)
+{
+ return devlink_nl_dumpit(msg, cb, devlink_nl_info_get_dump_one);
+}
+
+static int devlink_nl_flash_update_fill(struct sk_buff *msg,
+ struct devlink *devlink,
+ enum devlink_command cmd,
+ struct devlink_flash_notify *params)
+{
+ void *hdr;
+
+ hdr = genlmsg_put(msg, 0, 0, &devlink_nl_family, 0, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto nla_put_failure;
+
+ if (cmd != DEVLINK_CMD_FLASH_UPDATE_STATUS)
+ goto out;
+
+ if (params->status_msg &&
+ nla_put_string(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_MSG,
+ params->status_msg))
+ goto nla_put_failure;
+ if (params->component &&
+ nla_put_string(msg, DEVLINK_ATTR_FLASH_UPDATE_COMPONENT,
+ params->component))
+ goto nla_put_failure;
+ if (devlink_nl_put_u64(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_DONE,
+ params->done))
+ goto nla_put_failure;
+ if (devlink_nl_put_u64(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_TOTAL,
+ params->total))
+ goto nla_put_failure;
+ if (devlink_nl_put_u64(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_TIMEOUT,
+ params->timeout))
+ goto nla_put_failure;
+
+out:
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static void __devlink_flash_update_notify(struct devlink *devlink,
+ enum devlink_command cmd,
+ struct devlink_flash_notify *params)
+{
+ struct sk_buff *msg;
+ int err;
+
+ WARN_ON(cmd != DEVLINK_CMD_FLASH_UPDATE &&
+ cmd != DEVLINK_CMD_FLASH_UPDATE_END &&
+ cmd != DEVLINK_CMD_FLASH_UPDATE_STATUS);
+
+ if (!devl_is_registered(devlink) || !devlink_nl_notify_need(devlink))
+ return;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return;
+
+ err = devlink_nl_flash_update_fill(msg, devlink, cmd, params);
+ if (err)
+ goto out_free_msg;
+
+ devlink_nl_notify_send(devlink, msg);
+ return;
+
+out_free_msg:
+ nlmsg_free(msg);
+}
+
+static void devlink_flash_update_begin_notify(struct devlink *devlink)
+{
+ struct devlink_flash_notify params = {};
+
+ __devlink_flash_update_notify(devlink,
+ DEVLINK_CMD_FLASH_UPDATE,
+ &params);
+}
+
+static void devlink_flash_update_end_notify(struct devlink *devlink)
+{
+ struct devlink_flash_notify params = {};
+
+ __devlink_flash_update_notify(devlink,
+ DEVLINK_CMD_FLASH_UPDATE_END,
+ &params);
+}
+
+void devlink_flash_update_status_notify(struct devlink *devlink,
+ const char *status_msg,
+ const char *component,
+ unsigned long done,
+ unsigned long total)
+{
+ struct devlink_flash_notify params = {
+ .status_msg = status_msg,
+ .component = component,
+ .done = done,
+ .total = total,
+ };
+
+ __devlink_flash_update_notify(devlink,
+ DEVLINK_CMD_FLASH_UPDATE_STATUS,
+ &params);
+}
+EXPORT_SYMBOL_GPL(devlink_flash_update_status_notify);
+
+void devlink_flash_update_timeout_notify(struct devlink *devlink,
+ const char *status_msg,
+ const char *component,
+ unsigned long timeout)
+{
+ struct devlink_flash_notify params = {
+ .status_msg = status_msg,
+ .component = component,
+ .timeout = timeout,
+ };
+
+ __devlink_flash_update_notify(devlink,
+ DEVLINK_CMD_FLASH_UPDATE_STATUS,
+ &params);
+}
+EXPORT_SYMBOL_GPL(devlink_flash_update_timeout_notify);
+
+struct devlink_flash_component_lookup_ctx {
+ const char *lookup_name;
+ bool lookup_name_found;
+};
+
+static void
+devlink_flash_component_lookup_cb(const char *version_name,
+ enum devlink_info_version_type version_type,
+ void *version_cb_priv)
+{
+ struct devlink_flash_component_lookup_ctx *lookup_ctx = version_cb_priv;
+
+ if (version_type != DEVLINK_INFO_VERSION_TYPE_COMPONENT ||
+ lookup_ctx->lookup_name_found)
+ return;
+
+ lookup_ctx->lookup_name_found =
+ !strcmp(lookup_ctx->lookup_name, version_name);
+}
+
+static int devlink_flash_component_get(struct devlink *devlink,
+ struct nlattr *nla_component,
+ const char **p_component,
+ struct netlink_ext_ack *extack)
+{
+ struct devlink_flash_component_lookup_ctx lookup_ctx = {};
+ struct devlink_info_req req = {};
+ const char *component;
+ int ret;
+
+ if (!nla_component)
+ return 0;
+
+ component = nla_data(nla_component);
+
+ if (!devlink->ops->info_get) {
+ NL_SET_ERR_MSG_ATTR(extack, nla_component,
+ "component update is not supported by this device");
+ return -EOPNOTSUPP;
+ }
+
+ lookup_ctx.lookup_name = component;
+ req.version_cb = devlink_flash_component_lookup_cb;
+ req.version_cb_priv = &lookup_ctx;
+
+ ret = devlink->ops->info_get(devlink, &req, NULL);
+ if (ret)
+ return ret;
+
+ if (!lookup_ctx.lookup_name_found) {
+ NL_SET_ERR_MSG_ATTR(extack, nla_component,
+ "selected component is not supported by this device");
+ return -EINVAL;
+ }
+ *p_component = component;
+ return 0;
+}
+
+int devlink_nl_flash_update_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nlattr *nla_overwrite_mask, *nla_file_name;
+ struct devlink_flash_update_params params = {};
+ struct devlink *devlink = info->user_ptr[0];
+ const char *file_name;
+ u32 supported_params;
+ int ret;
+
+ if (!devlink->ops->flash_update)
+ return -EOPNOTSUPP;
+
+ if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME))
+ return -EINVAL;
+
+ ret = devlink_flash_component_get(devlink,
+ info->attrs[DEVLINK_ATTR_FLASH_UPDATE_COMPONENT],
+ &params.component, info->extack);
+ if (ret)
+ return ret;
+
+ supported_params = devlink->ops->supported_flash_update_params;
+
+ nla_overwrite_mask = info->attrs[DEVLINK_ATTR_FLASH_UPDATE_OVERWRITE_MASK];
+ if (nla_overwrite_mask) {
+ struct nla_bitfield32 sections;
+
+ if (!(supported_params & DEVLINK_SUPPORT_FLASH_UPDATE_OVERWRITE_MASK)) {
+ NL_SET_ERR_MSG_ATTR(info->extack, nla_overwrite_mask,
+ "overwrite settings are not supported by this device");
+ return -EOPNOTSUPP;
+ }
+ sections = nla_get_bitfield32(nla_overwrite_mask);
+ params.overwrite_mask = sections.value & sections.selector;
+ }
+
+ nla_file_name = info->attrs[DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME];
+ file_name = nla_data(nla_file_name);
+ ret = request_firmware(&params.fw, file_name, devlink->dev);
+ if (ret) {
+ NL_SET_ERR_MSG_ATTR(info->extack, nla_file_name,
+ "failed to locate the requested firmware file");
+ return ret;
+ }
+
+ devlink_flash_update_begin_notify(devlink);
+ ret = devlink->ops->flash_update(devlink, &params, info->extack);
+ devlink_flash_update_end_notify(devlink);
+
+ release_firmware(params.fw);
+
+ return ret;
+}
+
+static void __devlink_compat_running_version(struct devlink *devlink,
+ char *buf, size_t len)
+{
+ struct devlink_info_req req = {};
+ const struct nlattr *nlattr;
+ struct sk_buff *msg;
+ int rem, err;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return;
+
+ req.msg = msg;
+ err = devlink->ops->info_get(devlink, &req, NULL);
+ if (err)
+ goto free_msg;
+
+ nla_for_each_attr_type(nlattr, DEVLINK_ATTR_INFO_VERSION_RUNNING,
+ (void *)msg->data, msg->len, rem) {
+ const struct nlattr *kv;
+ int rem_kv;
+
+ nla_for_each_nested_type(kv, DEVLINK_ATTR_INFO_VERSION_VALUE,
+ nlattr, rem_kv) {
+ strlcat(buf, nla_data(kv), len);
+ strlcat(buf, " ", len);
+ }
+ }
+free_msg:
+ nlmsg_consume(msg);
+}
+
+void devlink_compat_running_version(struct devlink *devlink,
+ char *buf, size_t len)
+{
+ if (!devlink->ops->info_get)
+ return;
+
+ devl_lock(devlink);
+ if (devl_is_registered(devlink))
+ __devlink_compat_running_version(devlink, buf, len);
+ devl_unlock(devlink);
+}
+
+int devlink_compat_flash_update(struct devlink *devlink, const char *file_name)
+{
+ struct devlink_flash_update_params params = {};
+ int ret;
+
+ devl_lock(devlink);
+ if (!devl_is_registered(devlink)) {
+ ret = -ENODEV;
+ goto out_unlock;
+ }
+
+ if (!devlink->ops->flash_update) {
+ ret = -EOPNOTSUPP;
+ goto out_unlock;
+ }
+
+ ret = request_firmware(&params.fw, file_name, devlink->dev);
+ if (ret)
+ goto out_unlock;
+
+ devlink_flash_update_begin_notify(devlink);
+ ret = devlink->ops->flash_update(devlink, &params, NULL);
+ devlink_flash_update_end_notify(devlink);
+
+ release_firmware(params.fw);
+out_unlock:
+ devl_unlock(devlink);
+
+ return ret;
+}
+
+static int
+devlink_nl_selftests_fill(struct sk_buff *msg, struct devlink *devlink,
+ u32 portid, u32 seq, int flags,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *selftests;
+ void *hdr;
+ int err;
+ int i;
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags,
+ DEVLINK_CMD_SELFTESTS_GET);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ err = -EMSGSIZE;
+ if (devlink_nl_put_handle(msg, devlink))
+ goto err_cancel_msg;
+
+ selftests = nla_nest_start(msg, DEVLINK_ATTR_SELFTESTS);
+ if (!selftests)
+ goto err_cancel_msg;
+
+ for (i = DEVLINK_ATTR_SELFTEST_ID_UNSPEC + 1;
+ i <= DEVLINK_ATTR_SELFTEST_ID_MAX; i++) {
+ if (devlink->ops->selftest_check(devlink, i, extack)) {
+ err = nla_put_flag(msg, i);
+ if (err)
+ goto err_cancel_msg;
+ }
+ }
+
+ nla_nest_end(msg, selftests);
+ genlmsg_end(msg, hdr);
+ return 0;
+
+err_cancel_msg:
+ genlmsg_cancel(msg, hdr);
+ return err;
+}
+
+int devlink_nl_selftests_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct sk_buff *msg;
+ int err;
+
+ if (!devlink->ops->selftest_check)
+ return -EOPNOTSUPP;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ err = devlink_nl_selftests_fill(msg, devlink, info->snd_portid,
+ info->snd_seq, 0, info->extack);
+ if (err) {
+ nlmsg_free(msg);
+ return err;
+ }
+
+ return genlmsg_reply(msg, info);
+}
+
+static int devlink_nl_selftests_get_dump_one(struct sk_buff *msg,
+ struct devlink *devlink,
+ struct netlink_callback *cb,
+ int flags)
+{
+ if (!devlink->ops->selftest_check)
+ return 0;
+
+ return devlink_nl_selftests_fill(msg, devlink,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, flags,
+ cb->extack);
+}
+
+int devlink_nl_selftests_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ return devlink_nl_dumpit(skb, cb, devlink_nl_selftests_get_dump_one);
+}
+
+static int devlink_selftest_result_put(struct sk_buff *skb, unsigned int id,
+ enum devlink_selftest_status test_status)
+{
+ struct nlattr *result_attr;
+
+ result_attr = nla_nest_start(skb, DEVLINK_ATTR_SELFTEST_RESULT);
+ if (!result_attr)
+ return -EMSGSIZE;
+
+ if (nla_put_u32(skb, DEVLINK_ATTR_SELFTEST_RESULT_ID, id) ||
+ nla_put_u8(skb, DEVLINK_ATTR_SELFTEST_RESULT_STATUS,
+ test_status))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, result_attr);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, result_attr);
+ return -EMSGSIZE;
+}
+
+static const struct nla_policy devlink_selftest_nl_policy[DEVLINK_ATTR_SELFTEST_ID_MAX + 1] = {
+ [DEVLINK_ATTR_SELFTEST_ID_FLASH] = { .type = NLA_FLAG },
+};
+
+int devlink_nl_selftests_run_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nlattr *tb[DEVLINK_ATTR_SELFTEST_ID_MAX + 1];
+ struct devlink *devlink = info->user_ptr[0];
+ struct nlattr *attrs, *selftests;
+ struct sk_buff *msg;
+ void *hdr;
+ int err;
+ int i;
+
+ if (!devlink->ops->selftest_run || !devlink->ops->selftest_check)
+ return -EOPNOTSUPP;
+
+ if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_SELFTESTS))
+ return -EINVAL;
+
+ attrs = info->attrs[DEVLINK_ATTR_SELFTESTS];
+
+ err = nla_parse_nested(tb, DEVLINK_ATTR_SELFTEST_ID_MAX, attrs,
+ devlink_selftest_nl_policy, info->extack);
+ if (err < 0)
+ return err;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ err = -EMSGSIZE;
+ hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq,
+ &devlink_nl_family, 0, DEVLINK_CMD_SELFTESTS_RUN);
+ if (!hdr)
+ goto free_msg;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto genlmsg_cancel;
+
+ selftests = nla_nest_start(msg, DEVLINK_ATTR_SELFTESTS);
+ if (!selftests)
+ goto genlmsg_cancel;
+
+ for (i = DEVLINK_ATTR_SELFTEST_ID_UNSPEC + 1;
+ i <= DEVLINK_ATTR_SELFTEST_ID_MAX; i++) {
+ enum devlink_selftest_status test_status;
+
+ if (nla_get_flag(tb[i])) {
+ if (!devlink->ops->selftest_check(devlink, i,
+ info->extack)) {
+ if (devlink_selftest_result_put(msg, i,
+ DEVLINK_SELFTEST_STATUS_SKIP))
+ goto selftests_nest_cancel;
+ continue;
+ }
+
+ test_status = devlink->ops->selftest_run(devlink, i,
+ info->extack);
+ if (devlink_selftest_result_put(msg, i, test_status))
+ goto selftests_nest_cancel;
+ }
+ }
+
+ nla_nest_end(msg, selftests);
+ genlmsg_end(msg, hdr);
+ return genlmsg_reply(msg, info);
+
+selftests_nest_cancel:
+ nla_nest_cancel(msg, selftests);
+genlmsg_cancel:
+ genlmsg_cancel(msg, hdr);
+free_msg:
+ nlmsg_free(msg);
+ return err;
+}
diff --git a/net/devlink/devl_internal.h b/net/devlink/devl_internal.h
new file mode 100644
index 000000000000..14eaad9cfe35
--- /dev/null
+++ b/net/devlink/devl_internal.h
@@ -0,0 +1,304 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* Copyright (c) 2016 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com>
+ */
+
+#include <linux/device.h>
+#include <linux/etherdevice.h>
+#include <linux/mutex.h>
+#include <linux/netdevice.h>
+#include <linux/notifier.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+#include <linux/xarray.h>
+#include <net/devlink.h>
+#include <net/net_namespace.h>
+#include <net/rtnetlink.h>
+#include <rdma/ib_verbs.h>
+
+#include "netlink_gen.h"
+
+struct devlink_rel;
+
+#define DEVLINK_REGISTERED XA_MARK_1
+
+#define DEVLINK_RELOAD_STATS_ARRAY_SIZE \
+ (__DEVLINK_RELOAD_LIMIT_MAX * __DEVLINK_RELOAD_ACTION_MAX)
+
+struct devlink_dev_stats {
+ u32 reload_stats[DEVLINK_RELOAD_STATS_ARRAY_SIZE];
+ u32 remote_reload_stats[DEVLINK_RELOAD_STATS_ARRAY_SIZE];
+};
+
+struct devlink {
+ u32 index;
+ struct xarray ports;
+ struct list_head rate_list;
+ struct list_head sb_list;
+ struct list_head dpipe_table_list;
+ struct list_head resource_list;
+ struct xarray params;
+ struct list_head region_list;
+ struct list_head reporter_list;
+ struct devlink_dpipe_headers *dpipe_headers;
+ struct list_head trap_list;
+ struct list_head trap_group_list;
+ struct list_head trap_policer_list;
+ struct list_head linecard_list;
+ const struct devlink_ops *ops;
+ struct xarray snapshot_ids;
+ struct devlink_dev_stats stats;
+ struct device *dev;
+ possible_net_t _net;
+ /* Serializes access to devlink instance specific objects such as
+ * port, sb, dpipe, resource, params, region, traps and more.
+ */
+ struct mutex lock;
+ struct lock_class_key lock_key;
+ u8 reload_failed:1;
+ refcount_t refcount;
+ struct rcu_work rwork;
+ struct devlink_rel *rel;
+ struct xarray nested_rels;
+ char priv[] __aligned(NETDEV_ALIGN);
+};
+
+extern struct xarray devlinks;
+extern struct genl_family devlink_nl_family;
+
+/* devlink instances are open to the access from the user space after
+ * devlink_register() call. Such logical barrier allows us to have certain
+ * expectations related to locking.
+ *
+ * Before *_register() - we are in initialization stage and no parallel
+ * access possible to the devlink instance. All drivers perform that phase
+ * by implicitly holding device_lock.
+ *
+ * After *_register() - users and driver can access devlink instance at
+ * the same time.
+ */
+#define ASSERT_DEVLINK_REGISTERED(d) \
+ WARN_ON_ONCE(!xa_get_mark(&devlinks, (d)->index, DEVLINK_REGISTERED))
+#define ASSERT_DEVLINK_NOT_REGISTERED(d) \
+ WARN_ON_ONCE(xa_get_mark(&devlinks, (d)->index, DEVLINK_REGISTERED))
+
+/* Iterate over devlink pointers which were possible to get reference to.
+ * devlink_put() needs to be called for each iterated devlink pointer
+ * in loop body in order to release the reference.
+ */
+#define devlinks_xa_for_each_registered_get(net, index, devlink) \
+ for (index = 0; (devlink = devlinks_xa_find_get(net, &index)); index++)
+
+struct devlink *devlinks_xa_find_get(struct net *net, unsigned long *indexp);
+
+static inline bool __devl_is_registered(struct devlink *devlink)
+{
+ return xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED);
+}
+
+static inline bool devl_is_registered(struct devlink *devlink)
+{
+ devl_assert_locked(devlink);
+ return __devl_is_registered(devlink);
+}
+
+static inline void devl_dev_lock(struct devlink *devlink, bool dev_lock)
+{
+ if (dev_lock)
+ device_lock(devlink->dev);
+ devl_lock(devlink);
+}
+
+static inline void devl_dev_unlock(struct devlink *devlink, bool dev_lock)
+{
+ devl_unlock(devlink);
+ if (dev_lock)
+ device_unlock(devlink->dev);
+}
+
+typedef void devlink_rel_notify_cb_t(struct devlink *devlink, u32 obj_index);
+typedef void devlink_rel_cleanup_cb_t(struct devlink *devlink, u32 obj_index,
+ u32 rel_index);
+
+void devlink_rel_nested_in_clear(u32 rel_index);
+int devlink_rel_nested_in_add(u32 *rel_index, u32 devlink_index,
+ u32 obj_index, devlink_rel_notify_cb_t *notify_cb,
+ devlink_rel_cleanup_cb_t *cleanup_cb,
+ struct devlink *devlink);
+void devlink_rel_nested_in_notify(struct devlink *devlink);
+int devlink_rel_devlink_handle_put(struct sk_buff *msg, struct devlink *devlink,
+ u32 rel_index, int attrtype,
+ bool *msg_updated);
+
+/* Netlink */
+enum devlink_multicast_groups {
+ DEVLINK_MCGRP_CONFIG,
+};
+
+/* state held across netlink dumps */
+struct devlink_nl_dump_state {
+ unsigned long instance;
+ int idx;
+ union {
+ /* DEVLINK_CMD_REGION_READ */
+ struct {
+ u64 start_offset;
+ };
+ /* DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET */
+ struct {
+ u64 dump_ts;
+ };
+ };
+};
+
+typedef int devlink_nl_dump_one_func_t(struct sk_buff *msg,
+ struct devlink *devlink,
+ struct netlink_callback *cb,
+ int flags);
+
+struct devlink *
+devlink_get_from_attrs_lock(struct net *net, struct nlattr **attrs,
+ bool dev_lock);
+
+int devlink_nl_dumpit(struct sk_buff *msg, struct netlink_callback *cb,
+ devlink_nl_dump_one_func_t *dump_one);
+
+static inline struct devlink_nl_dump_state *
+devlink_dump_state(struct netlink_callback *cb)
+{
+ NL_ASSERT_CTX_FITS(struct devlink_nl_dump_state);
+
+ return (struct devlink_nl_dump_state *)cb->ctx;
+}
+
+static inline int
+devlink_nl_put_handle(struct sk_buff *msg, struct devlink *devlink)
+{
+ if (nla_put_string(msg, DEVLINK_ATTR_BUS_NAME, devlink->dev->bus->name))
+ return -EMSGSIZE;
+ if (nla_put_string(msg, DEVLINK_ATTR_DEV_NAME, dev_name(devlink->dev)))
+ return -EMSGSIZE;
+ return 0;
+}
+
+static inline int devlink_nl_put_u64(struct sk_buff *msg, int attrtype, u64 val)
+{
+ return nla_put_u64_64bit(msg, attrtype, val, DEVLINK_ATTR_PAD);
+}
+
+int devlink_nl_put_nested_handle(struct sk_buff *msg, struct net *net,
+ struct devlink *devlink, int attrtype);
+int devlink_nl_msg_reply_and_new(struct sk_buff **msg, struct genl_info *info);
+
+static inline bool devlink_nl_notify_need(struct devlink *devlink)
+{
+ return genl_has_listeners(&devlink_nl_family, devlink_net(devlink),
+ DEVLINK_MCGRP_CONFIG);
+}
+
+struct devlink_obj_desc {
+ struct rcu_head rcu;
+ const char *bus_name;
+ const char *dev_name;
+ unsigned int port_index;
+ bool port_index_valid;
+ long data[];
+};
+
+static inline void devlink_nl_obj_desc_init(struct devlink_obj_desc *desc,
+ struct devlink *devlink)
+{
+ memset(desc, 0, sizeof(*desc));
+ desc->bus_name = devlink->dev->bus->name;
+ desc->dev_name = dev_name(devlink->dev);
+}
+
+static inline void devlink_nl_obj_desc_port_set(struct devlink_obj_desc *desc,
+ struct devlink_port *devlink_port)
+{
+ desc->port_index = devlink_port->index;
+ desc->port_index_valid = true;
+}
+
+int devlink_nl_notify_filter(struct sock *dsk, struct sk_buff *skb, void *data);
+
+static inline void devlink_nl_notify_send_desc(struct devlink *devlink,
+ struct sk_buff *msg,
+ struct devlink_obj_desc *desc)
+{
+ genlmsg_multicast_netns_filtered(&devlink_nl_family,
+ devlink_net(devlink),
+ msg, 0, DEVLINK_MCGRP_CONFIG,
+ GFP_KERNEL,
+ devlink_nl_notify_filter, desc);
+}
+
+static inline void devlink_nl_notify_send(struct devlink *devlink,
+ struct sk_buff *msg)
+{
+ struct devlink_obj_desc desc;
+
+ devlink_nl_obj_desc_init(&desc, devlink);
+ devlink_nl_notify_send_desc(devlink, msg, &desc);
+}
+
+/* Notify */
+void devlink_notify_register(struct devlink *devlink);
+void devlink_notify_unregister(struct devlink *devlink);
+void devlink_ports_notify_register(struct devlink *devlink);
+void devlink_ports_notify_unregister(struct devlink *devlink);
+void devlink_params_notify_register(struct devlink *devlink);
+void devlink_params_notify_unregister(struct devlink *devlink);
+void devlink_regions_notify_register(struct devlink *devlink);
+void devlink_regions_notify_unregister(struct devlink *devlink);
+void devlink_trap_policers_notify_register(struct devlink *devlink);
+void devlink_trap_policers_notify_unregister(struct devlink *devlink);
+void devlink_trap_groups_notify_register(struct devlink *devlink);
+void devlink_trap_groups_notify_unregister(struct devlink *devlink);
+void devlink_traps_notify_register(struct devlink *devlink);
+void devlink_traps_notify_unregister(struct devlink *devlink);
+void devlink_rates_notify_register(struct devlink *devlink);
+void devlink_rates_notify_unregister(struct devlink *devlink);
+void devlink_linecards_notify_register(struct devlink *devlink);
+void devlink_linecards_notify_unregister(struct devlink *devlink);
+
+/* Ports */
+#define ASSERT_DEVLINK_PORT_INITIALIZED(devlink_port) \
+ WARN_ON_ONCE(!(devlink_port)->initialized)
+
+struct devlink_port *devlink_port_get_by_index(struct devlink *devlink,
+ unsigned int port_index);
+int devlink_port_netdevice_event(struct notifier_block *nb,
+ unsigned long event, void *ptr);
+struct devlink_port *
+devlink_port_get_from_info(struct devlink *devlink, struct genl_info *info);
+struct devlink_port *devlink_port_get_from_attrs(struct devlink *devlink,
+ struct nlattr **attrs);
+
+/* Reload */
+bool devlink_reload_actions_valid(const struct devlink_ops *ops);
+int devlink_reload(struct devlink *devlink, struct net *dest_net,
+ enum devlink_reload_action action,
+ enum devlink_reload_limit limit,
+ u32 *actions_performed, struct netlink_ext_ack *extack);
+
+static inline bool devlink_reload_supported(const struct devlink_ops *ops)
+{
+ return ops->reload_down && ops->reload_up;
+}
+
+/* Params */
+void devlink_params_driverinit_load_new(struct devlink *devlink);
+
+/* Resources */
+struct devlink_resource;
+int devlink_resources_validate(struct devlink *devlink,
+ struct devlink_resource *resource,
+ struct genl_info *info);
+
+/* Rates */
+int devlink_rate_nodes_check(struct devlink *devlink, u16 mode,
+ struct netlink_ext_ack *extack);
+
+/* Linecards */
+unsigned int devlink_linecard_index(struct devlink_linecard *linecard);
diff --git a/net/devlink/dpipe.c b/net/devlink/dpipe.c
new file mode 100644
index 000000000000..e55701b007f0
--- /dev/null
+++ b/net/devlink/dpipe.c
@@ -0,0 +1,915 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com>
+ */
+
+#include "devl_internal.h"
+
+static struct devlink_dpipe_field devlink_dpipe_fields_ethernet[] = {
+ {
+ .name = "destination mac",
+ .id = DEVLINK_DPIPE_FIELD_ETHERNET_DST_MAC,
+ .bitwidth = 48,
+ },
+};
+
+struct devlink_dpipe_header devlink_dpipe_header_ethernet = {
+ .name = "ethernet",
+ .id = DEVLINK_DPIPE_HEADER_ETHERNET,
+ .fields = devlink_dpipe_fields_ethernet,
+ .fields_count = ARRAY_SIZE(devlink_dpipe_fields_ethernet),
+ .global = true,
+};
+EXPORT_SYMBOL_GPL(devlink_dpipe_header_ethernet);
+
+static struct devlink_dpipe_field devlink_dpipe_fields_ipv4[] = {
+ {
+ .name = "destination ip",
+ .id = DEVLINK_DPIPE_FIELD_IPV4_DST_IP,
+ .bitwidth = 32,
+ },
+};
+
+struct devlink_dpipe_header devlink_dpipe_header_ipv4 = {
+ .name = "ipv4",
+ .id = DEVLINK_DPIPE_HEADER_IPV4,
+ .fields = devlink_dpipe_fields_ipv4,
+ .fields_count = ARRAY_SIZE(devlink_dpipe_fields_ipv4),
+ .global = true,
+};
+EXPORT_SYMBOL_GPL(devlink_dpipe_header_ipv4);
+
+static struct devlink_dpipe_field devlink_dpipe_fields_ipv6[] = {
+ {
+ .name = "destination ip",
+ .id = DEVLINK_DPIPE_FIELD_IPV6_DST_IP,
+ .bitwidth = 128,
+ },
+};
+
+struct devlink_dpipe_header devlink_dpipe_header_ipv6 = {
+ .name = "ipv6",
+ .id = DEVLINK_DPIPE_HEADER_IPV6,
+ .fields = devlink_dpipe_fields_ipv6,
+ .fields_count = ARRAY_SIZE(devlink_dpipe_fields_ipv6),
+ .global = true,
+};
+EXPORT_SYMBOL_GPL(devlink_dpipe_header_ipv6);
+
+int devlink_dpipe_match_put(struct sk_buff *skb,
+ struct devlink_dpipe_match *match)
+{
+ struct devlink_dpipe_header *header = match->header;
+ struct devlink_dpipe_field *field = &header->fields[match->field_id];
+ struct nlattr *match_attr;
+
+ match_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_DPIPE_MATCH);
+ if (!match_attr)
+ return -EMSGSIZE;
+
+ if (nla_put_u32(skb, DEVLINK_ATTR_DPIPE_MATCH_TYPE, match->type) ||
+ nla_put_u32(skb, DEVLINK_ATTR_DPIPE_HEADER_INDEX, match->header_index) ||
+ nla_put_u32(skb, DEVLINK_ATTR_DPIPE_HEADER_ID, header->id) ||
+ nla_put_u32(skb, DEVLINK_ATTR_DPIPE_FIELD_ID, field->id) ||
+ nla_put_u8(skb, DEVLINK_ATTR_DPIPE_HEADER_GLOBAL, header->global))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, match_attr);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, match_attr);
+ return -EMSGSIZE;
+}
+EXPORT_SYMBOL_GPL(devlink_dpipe_match_put);
+
+static int devlink_dpipe_matches_put(struct devlink_dpipe_table *table,
+ struct sk_buff *skb)
+{
+ struct nlattr *matches_attr;
+
+ matches_attr = nla_nest_start_noflag(skb,
+ DEVLINK_ATTR_DPIPE_TABLE_MATCHES);
+ if (!matches_attr)
+ return -EMSGSIZE;
+
+ if (table->table_ops->matches_dump(table->priv, skb))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, matches_attr);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, matches_attr);
+ return -EMSGSIZE;
+}
+
+int devlink_dpipe_action_put(struct sk_buff *skb,
+ struct devlink_dpipe_action *action)
+{
+ struct devlink_dpipe_header *header = action->header;
+ struct devlink_dpipe_field *field = &header->fields[action->field_id];
+ struct nlattr *action_attr;
+
+ action_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_DPIPE_ACTION);
+ if (!action_attr)
+ return -EMSGSIZE;
+
+ if (nla_put_u32(skb, DEVLINK_ATTR_DPIPE_ACTION_TYPE, action->type) ||
+ nla_put_u32(skb, DEVLINK_ATTR_DPIPE_HEADER_INDEX, action->header_index) ||
+ nla_put_u32(skb, DEVLINK_ATTR_DPIPE_HEADER_ID, header->id) ||
+ nla_put_u32(skb, DEVLINK_ATTR_DPIPE_FIELD_ID, field->id) ||
+ nla_put_u8(skb, DEVLINK_ATTR_DPIPE_HEADER_GLOBAL, header->global))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, action_attr);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, action_attr);
+ return -EMSGSIZE;
+}
+EXPORT_SYMBOL_GPL(devlink_dpipe_action_put);
+
+static int devlink_dpipe_actions_put(struct devlink_dpipe_table *table,
+ struct sk_buff *skb)
+{
+ struct nlattr *actions_attr;
+
+ actions_attr = nla_nest_start_noflag(skb,
+ DEVLINK_ATTR_DPIPE_TABLE_ACTIONS);
+ if (!actions_attr)
+ return -EMSGSIZE;
+
+ if (table->table_ops->actions_dump(table->priv, skb))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, actions_attr);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, actions_attr);
+ return -EMSGSIZE;
+}
+
+static int devlink_dpipe_table_put(struct sk_buff *skb,
+ struct devlink_dpipe_table *table)
+{
+ struct nlattr *table_attr;
+ u64 table_size;
+
+ table_size = table->table_ops->size_get(table->priv);
+ table_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_DPIPE_TABLE);
+ if (!table_attr)
+ return -EMSGSIZE;
+
+ if (nla_put_string(skb, DEVLINK_ATTR_DPIPE_TABLE_NAME, table->name) ||
+ devlink_nl_put_u64(skb, DEVLINK_ATTR_DPIPE_TABLE_SIZE, table_size))
+ goto nla_put_failure;
+ if (nla_put_u8(skb, DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED,
+ table->counters_enabled))
+ goto nla_put_failure;
+
+ if (table->resource_valid) {
+ if (devlink_nl_put_u64(skb, DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_ID,
+ table->resource_id) ||
+ devlink_nl_put_u64(skb, DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_UNITS,
+ table->resource_units))
+ goto nla_put_failure;
+ }
+ if (devlink_dpipe_matches_put(table, skb))
+ goto nla_put_failure;
+
+ if (devlink_dpipe_actions_put(table, skb))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, table_attr);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, table_attr);
+ return -EMSGSIZE;
+}
+
+static int devlink_dpipe_send_and_alloc_skb(struct sk_buff **pskb,
+ struct genl_info *info)
+{
+ int err;
+
+ if (*pskb) {
+ err = genlmsg_reply(*pskb, info);
+ if (err)
+ return err;
+ }
+ *pskb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!*pskb)
+ return -ENOMEM;
+ return 0;
+}
+
+static int devlink_dpipe_tables_fill(struct genl_info *info,
+ enum devlink_command cmd, int flags,
+ struct list_head *dpipe_tables,
+ const char *table_name)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_dpipe_table *table;
+ struct nlattr *tables_attr;
+ struct sk_buff *skb = NULL;
+ struct nlmsghdr *nlh;
+ bool incomplete;
+ void *hdr;
+ int i;
+ int err;
+
+ table = list_first_entry(dpipe_tables,
+ struct devlink_dpipe_table, list);
+start_again:
+ err = devlink_dpipe_send_and_alloc_skb(&skb, info);
+ if (err)
+ return err;
+
+ hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq,
+ &devlink_nl_family, NLM_F_MULTI, cmd);
+ if (!hdr) {
+ nlmsg_free(skb);
+ return -EMSGSIZE;
+ }
+
+ if (devlink_nl_put_handle(skb, devlink))
+ goto nla_put_failure;
+ tables_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_DPIPE_TABLES);
+ if (!tables_attr)
+ goto nla_put_failure;
+
+ i = 0;
+ incomplete = false;
+ list_for_each_entry_from(table, dpipe_tables, list) {
+ if (!table_name) {
+ err = devlink_dpipe_table_put(skb, table);
+ if (err) {
+ if (!i)
+ goto err_table_put;
+ incomplete = true;
+ break;
+ }
+ } else {
+ if (!strcmp(table->name, table_name)) {
+ err = devlink_dpipe_table_put(skb, table);
+ if (err)
+ break;
+ }
+ }
+ i++;
+ }
+
+ nla_nest_end(skb, tables_attr);
+ genlmsg_end(skb, hdr);
+ if (incomplete)
+ goto start_again;
+
+send_done:
+ nlh = nlmsg_put(skb, info->snd_portid, info->snd_seq,
+ NLMSG_DONE, 0, flags | NLM_F_MULTI);
+ if (!nlh) {
+ err = devlink_dpipe_send_and_alloc_skb(&skb, info);
+ if (err)
+ return err;
+ goto send_done;
+ }
+
+ return genlmsg_reply(skb, info);
+
+nla_put_failure:
+ err = -EMSGSIZE;
+err_table_put:
+ nlmsg_free(skb);
+ return err;
+}
+
+int devlink_nl_dpipe_table_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ const char *table_name = NULL;
+
+ if (info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME])
+ table_name = nla_data(info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME]);
+
+ return devlink_dpipe_tables_fill(info, DEVLINK_CMD_DPIPE_TABLE_GET, 0,
+ &devlink->dpipe_table_list,
+ table_name);
+}
+
+static int devlink_dpipe_value_put(struct sk_buff *skb,
+ struct devlink_dpipe_value *value)
+{
+ if (nla_put(skb, DEVLINK_ATTR_DPIPE_VALUE,
+ value->value_size, value->value))
+ return -EMSGSIZE;
+ if (value->mask)
+ if (nla_put(skb, DEVLINK_ATTR_DPIPE_VALUE_MASK,
+ value->value_size, value->mask))
+ return -EMSGSIZE;
+ if (value->mapping_valid)
+ if (nla_put_u32(skb, DEVLINK_ATTR_DPIPE_VALUE_MAPPING,
+ value->mapping_value))
+ return -EMSGSIZE;
+ return 0;
+}
+
+static int devlink_dpipe_action_value_put(struct sk_buff *skb,
+ struct devlink_dpipe_value *value)
+{
+ if (!value->action)
+ return -EINVAL;
+ if (devlink_dpipe_action_put(skb, value->action))
+ return -EMSGSIZE;
+ if (devlink_dpipe_value_put(skb, value))
+ return -EMSGSIZE;
+ return 0;
+}
+
+static int devlink_dpipe_action_values_put(struct sk_buff *skb,
+ struct devlink_dpipe_value *values,
+ unsigned int values_count)
+{
+ struct nlattr *action_attr;
+ int i;
+ int err;
+
+ for (i = 0; i < values_count; i++) {
+ action_attr = nla_nest_start_noflag(skb,
+ DEVLINK_ATTR_DPIPE_ACTION_VALUE);
+ if (!action_attr)
+ return -EMSGSIZE;
+ err = devlink_dpipe_action_value_put(skb, &values[i]);
+ if (err)
+ goto err_action_value_put;
+ nla_nest_end(skb, action_attr);
+ }
+ return 0;
+
+err_action_value_put:
+ nla_nest_cancel(skb, action_attr);
+ return err;
+}
+
+static int devlink_dpipe_match_value_put(struct sk_buff *skb,
+ struct devlink_dpipe_value *value)
+{
+ if (!value->match)
+ return -EINVAL;
+ if (devlink_dpipe_match_put(skb, value->match))
+ return -EMSGSIZE;
+ if (devlink_dpipe_value_put(skb, value))
+ return -EMSGSIZE;
+ return 0;
+}
+
+static int devlink_dpipe_match_values_put(struct sk_buff *skb,
+ struct devlink_dpipe_value *values,
+ unsigned int values_count)
+{
+ struct nlattr *match_attr;
+ int i;
+ int err;
+
+ for (i = 0; i < values_count; i++) {
+ match_attr = nla_nest_start_noflag(skb,
+ DEVLINK_ATTR_DPIPE_MATCH_VALUE);
+ if (!match_attr)
+ return -EMSGSIZE;
+ err = devlink_dpipe_match_value_put(skb, &values[i]);
+ if (err)
+ goto err_match_value_put;
+ nla_nest_end(skb, match_attr);
+ }
+ return 0;
+
+err_match_value_put:
+ nla_nest_cancel(skb, match_attr);
+ return err;
+}
+
+static int devlink_dpipe_entry_put(struct sk_buff *skb,
+ struct devlink_dpipe_entry *entry)
+{
+ struct nlattr *entry_attr, *matches_attr, *actions_attr;
+ int err;
+
+ entry_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_DPIPE_ENTRY);
+ if (!entry_attr)
+ return -EMSGSIZE;
+
+ if (devlink_nl_put_u64(skb, DEVLINK_ATTR_DPIPE_ENTRY_INDEX, entry->index))
+ goto nla_put_failure;
+ if (entry->counter_valid)
+ if (devlink_nl_put_u64(skb, DEVLINK_ATTR_DPIPE_ENTRY_COUNTER,
+ entry->counter))
+ goto nla_put_failure;
+
+ matches_attr = nla_nest_start_noflag(skb,
+ DEVLINK_ATTR_DPIPE_ENTRY_MATCH_VALUES);
+ if (!matches_attr)
+ goto nla_put_failure;
+
+ err = devlink_dpipe_match_values_put(skb, entry->match_values,
+ entry->match_values_count);
+ if (err) {
+ nla_nest_cancel(skb, matches_attr);
+ goto err_match_values_put;
+ }
+ nla_nest_end(skb, matches_attr);
+
+ actions_attr = nla_nest_start_noflag(skb,
+ DEVLINK_ATTR_DPIPE_ENTRY_ACTION_VALUES);
+ if (!actions_attr)
+ goto nla_put_failure;
+
+ err = devlink_dpipe_action_values_put(skb, entry->action_values,
+ entry->action_values_count);
+ if (err) {
+ nla_nest_cancel(skb, actions_attr);
+ goto err_action_values_put;
+ }
+ nla_nest_end(skb, actions_attr);
+
+ nla_nest_end(skb, entry_attr);
+ return 0;
+
+nla_put_failure:
+ err = -EMSGSIZE;
+err_match_values_put:
+err_action_values_put:
+ nla_nest_cancel(skb, entry_attr);
+ return err;
+}
+
+static struct devlink_dpipe_table *
+devlink_dpipe_table_find(struct list_head *dpipe_tables,
+ const char *table_name, struct devlink *devlink)
+{
+ struct devlink_dpipe_table *table;
+
+ list_for_each_entry_rcu(table, dpipe_tables, list,
+ lockdep_is_held(&devlink->lock)) {
+ if (!strcmp(table->name, table_name))
+ return table;
+ }
+ return NULL;
+}
+
+int devlink_dpipe_entry_ctx_prepare(struct devlink_dpipe_dump_ctx *dump_ctx)
+{
+ struct devlink *devlink;
+ int err;
+
+ err = devlink_dpipe_send_and_alloc_skb(&dump_ctx->skb,
+ dump_ctx->info);
+ if (err)
+ return err;
+
+ dump_ctx->hdr = genlmsg_put(dump_ctx->skb,
+ dump_ctx->info->snd_portid,
+ dump_ctx->info->snd_seq,
+ &devlink_nl_family, NLM_F_MULTI,
+ dump_ctx->cmd);
+ if (!dump_ctx->hdr)
+ goto nla_put_failure;
+
+ devlink = dump_ctx->info->user_ptr[0];
+ if (devlink_nl_put_handle(dump_ctx->skb, devlink))
+ goto nla_put_failure;
+ dump_ctx->nest = nla_nest_start_noflag(dump_ctx->skb,
+ DEVLINK_ATTR_DPIPE_ENTRIES);
+ if (!dump_ctx->nest)
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ nlmsg_free(dump_ctx->skb);
+ return -EMSGSIZE;
+}
+EXPORT_SYMBOL_GPL(devlink_dpipe_entry_ctx_prepare);
+
+int devlink_dpipe_entry_ctx_append(struct devlink_dpipe_dump_ctx *dump_ctx,
+ struct devlink_dpipe_entry *entry)
+{
+ return devlink_dpipe_entry_put(dump_ctx->skb, entry);
+}
+EXPORT_SYMBOL_GPL(devlink_dpipe_entry_ctx_append);
+
+int devlink_dpipe_entry_ctx_close(struct devlink_dpipe_dump_ctx *dump_ctx)
+{
+ nla_nest_end(dump_ctx->skb, dump_ctx->nest);
+ genlmsg_end(dump_ctx->skb, dump_ctx->hdr);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(devlink_dpipe_entry_ctx_close);
+
+void devlink_dpipe_entry_clear(struct devlink_dpipe_entry *entry)
+
+{
+ unsigned int value_count, value_index;
+ struct devlink_dpipe_value *value;
+
+ value = entry->action_values;
+ value_count = entry->action_values_count;
+ for (value_index = 0; value_index < value_count; value_index++) {
+ kfree(value[value_index].value);
+ kfree(value[value_index].mask);
+ }
+
+ value = entry->match_values;
+ value_count = entry->match_values_count;
+ for (value_index = 0; value_index < value_count; value_index++) {
+ kfree(value[value_index].value);
+ kfree(value[value_index].mask);
+ }
+}
+EXPORT_SYMBOL_GPL(devlink_dpipe_entry_clear);
+
+static int devlink_dpipe_entries_fill(struct genl_info *info,
+ enum devlink_command cmd, int flags,
+ struct devlink_dpipe_table *table)
+{
+ struct devlink_dpipe_dump_ctx dump_ctx;
+ struct nlmsghdr *nlh;
+ int err;
+
+ dump_ctx.skb = NULL;
+ dump_ctx.cmd = cmd;
+ dump_ctx.info = info;
+
+ err = table->table_ops->entries_dump(table->priv,
+ table->counters_enabled,
+ &dump_ctx);
+ if (err)
+ return err;
+
+send_done:
+ nlh = nlmsg_put(dump_ctx.skb, info->snd_portid, info->snd_seq,
+ NLMSG_DONE, 0, flags | NLM_F_MULTI);
+ if (!nlh) {
+ err = devlink_dpipe_send_and_alloc_skb(&dump_ctx.skb, info);
+ if (err)
+ return err;
+ goto send_done;
+ }
+ return genlmsg_reply(dump_ctx.skb, info);
+}
+
+int devlink_nl_dpipe_entries_get_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_dpipe_table *table;
+ const char *table_name;
+
+ if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_DPIPE_TABLE_NAME))
+ return -EINVAL;
+
+ table_name = nla_data(info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME]);
+ table = devlink_dpipe_table_find(&devlink->dpipe_table_list,
+ table_name, devlink);
+ if (!table)
+ return -EINVAL;
+
+ if (!table->table_ops->entries_dump)
+ return -EINVAL;
+
+ return devlink_dpipe_entries_fill(info, DEVLINK_CMD_DPIPE_ENTRIES_GET,
+ 0, table);
+}
+
+static int devlink_dpipe_fields_put(struct sk_buff *skb,
+ const struct devlink_dpipe_header *header)
+{
+ struct devlink_dpipe_field *field;
+ struct nlattr *field_attr;
+ int i;
+
+ for (i = 0; i < header->fields_count; i++) {
+ field = &header->fields[i];
+ field_attr = nla_nest_start_noflag(skb,
+ DEVLINK_ATTR_DPIPE_FIELD);
+ if (!field_attr)
+ return -EMSGSIZE;
+ if (nla_put_string(skb, DEVLINK_ATTR_DPIPE_FIELD_NAME, field->name) ||
+ nla_put_u32(skb, DEVLINK_ATTR_DPIPE_FIELD_ID, field->id) ||
+ nla_put_u32(skb, DEVLINK_ATTR_DPIPE_FIELD_BITWIDTH, field->bitwidth) ||
+ nla_put_u32(skb, DEVLINK_ATTR_DPIPE_FIELD_MAPPING_TYPE, field->mapping_type))
+ goto nla_put_failure;
+ nla_nest_end(skb, field_attr);
+ }
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, field_attr);
+ return -EMSGSIZE;
+}
+
+static int devlink_dpipe_header_put(struct sk_buff *skb,
+ struct devlink_dpipe_header *header)
+{
+ struct nlattr *fields_attr, *header_attr;
+ int err;
+
+ header_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_DPIPE_HEADER);
+ if (!header_attr)
+ return -EMSGSIZE;
+
+ if (nla_put_string(skb, DEVLINK_ATTR_DPIPE_HEADER_NAME, header->name) ||
+ nla_put_u32(skb, DEVLINK_ATTR_DPIPE_HEADER_ID, header->id) ||
+ nla_put_u8(skb, DEVLINK_ATTR_DPIPE_HEADER_GLOBAL, header->global))
+ goto nla_put_failure;
+
+ fields_attr = nla_nest_start_noflag(skb,
+ DEVLINK_ATTR_DPIPE_HEADER_FIELDS);
+ if (!fields_attr)
+ goto nla_put_failure;
+
+ err = devlink_dpipe_fields_put(skb, header);
+ if (err) {
+ nla_nest_cancel(skb, fields_attr);
+ goto nla_put_failure;
+ }
+ nla_nest_end(skb, fields_attr);
+ nla_nest_end(skb, header_attr);
+ return 0;
+
+nla_put_failure:
+ err = -EMSGSIZE;
+ nla_nest_cancel(skb, header_attr);
+ return err;
+}
+
+static int devlink_dpipe_headers_fill(struct genl_info *info,
+ enum devlink_command cmd, int flags,
+ struct devlink_dpipe_headers *
+ dpipe_headers)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct nlattr *headers_attr;
+ struct sk_buff *skb = NULL;
+ struct nlmsghdr *nlh;
+ void *hdr;
+ int i, j;
+ int err;
+
+ i = 0;
+start_again:
+ err = devlink_dpipe_send_and_alloc_skb(&skb, info);
+ if (err)
+ return err;
+
+ hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq,
+ &devlink_nl_family, NLM_F_MULTI, cmd);
+ if (!hdr) {
+ nlmsg_free(skb);
+ return -EMSGSIZE;
+ }
+
+ if (devlink_nl_put_handle(skb, devlink))
+ goto nla_put_failure;
+ headers_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_DPIPE_HEADERS);
+ if (!headers_attr)
+ goto nla_put_failure;
+
+ j = 0;
+ for (; i < dpipe_headers->headers_count; i++) {
+ err = devlink_dpipe_header_put(skb, dpipe_headers->headers[i]);
+ if (err) {
+ if (!j)
+ goto err_table_put;
+ break;
+ }
+ j++;
+ }
+ nla_nest_end(skb, headers_attr);
+ genlmsg_end(skb, hdr);
+ if (i != dpipe_headers->headers_count)
+ goto start_again;
+
+send_done:
+ nlh = nlmsg_put(skb, info->snd_portid, info->snd_seq,
+ NLMSG_DONE, 0, flags | NLM_F_MULTI);
+ if (!nlh) {
+ err = devlink_dpipe_send_and_alloc_skb(&skb, info);
+ if (err)
+ return err;
+ goto send_done;
+ }
+ return genlmsg_reply(skb, info);
+
+nla_put_failure:
+ err = -EMSGSIZE;
+err_table_put:
+ nlmsg_free(skb);
+ return err;
+}
+
+int devlink_nl_dpipe_headers_get_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+
+ if (!devlink->dpipe_headers)
+ return -EOPNOTSUPP;
+ return devlink_dpipe_headers_fill(info, DEVLINK_CMD_DPIPE_HEADERS_GET,
+ 0, devlink->dpipe_headers);
+}
+
+static int devlink_dpipe_table_counters_set(struct devlink *devlink,
+ const char *table_name,
+ bool enable)
+{
+ struct devlink_dpipe_table *table;
+
+ table = devlink_dpipe_table_find(&devlink->dpipe_table_list,
+ table_name, devlink);
+ if (!table)
+ return -EINVAL;
+
+ if (table->counter_control_extern)
+ return -EOPNOTSUPP;
+
+ if (!(table->counters_enabled ^ enable))
+ return 0;
+
+ table->counters_enabled = enable;
+ if (table->table_ops->counters_set_update)
+ table->table_ops->counters_set_update(table->priv, enable);
+ return 0;
+}
+
+int devlink_nl_dpipe_table_counters_set_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ const char *table_name;
+ bool counters_enable;
+
+ if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_DPIPE_TABLE_NAME) ||
+ GENL_REQ_ATTR_CHECK(info,
+ DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED))
+ return -EINVAL;
+
+ table_name = nla_data(info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME]);
+ counters_enable = !!nla_get_u8(info->attrs[DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED]);
+
+ return devlink_dpipe_table_counters_set(devlink, table_name,
+ counters_enable);
+}
+
+/**
+ * devl_dpipe_headers_register - register dpipe headers
+ *
+ * @devlink: devlink
+ * @dpipe_headers: dpipe header array
+ *
+ * Register the headers supported by hardware.
+ */
+void devl_dpipe_headers_register(struct devlink *devlink,
+ struct devlink_dpipe_headers *dpipe_headers)
+{
+ lockdep_assert_held(&devlink->lock);
+
+ devlink->dpipe_headers = dpipe_headers;
+}
+EXPORT_SYMBOL_GPL(devl_dpipe_headers_register);
+
+/**
+ * devl_dpipe_headers_unregister - unregister dpipe headers
+ *
+ * @devlink: devlink
+ *
+ * Unregister the headers supported by hardware.
+ */
+void devl_dpipe_headers_unregister(struct devlink *devlink)
+{
+ lockdep_assert_held(&devlink->lock);
+
+ devlink->dpipe_headers = NULL;
+}
+EXPORT_SYMBOL_GPL(devl_dpipe_headers_unregister);
+
+/**
+ * devlink_dpipe_table_counter_enabled - check if counter allocation
+ * required
+ * @devlink: devlink
+ * @table_name: tables name
+ *
+ * Used by driver to check if counter allocation is required.
+ * After counter allocation is turned on the table entries
+ * are updated to include counter statistics.
+ *
+ * After that point on the driver must respect the counter
+ * state so that each entry added to the table is added
+ * with a counter.
+ */
+bool devlink_dpipe_table_counter_enabled(struct devlink *devlink,
+ const char *table_name)
+{
+ struct devlink_dpipe_table *table;
+ bool enabled;
+
+ rcu_read_lock();
+ table = devlink_dpipe_table_find(&devlink->dpipe_table_list,
+ table_name, devlink);
+ enabled = false;
+ if (table)
+ enabled = table->counters_enabled;
+ rcu_read_unlock();
+ return enabled;
+}
+EXPORT_SYMBOL_GPL(devlink_dpipe_table_counter_enabled);
+
+/**
+ * devl_dpipe_table_register - register dpipe table
+ *
+ * @devlink: devlink
+ * @table_name: table name
+ * @table_ops: table ops
+ * @priv: priv
+ * @counter_control_extern: external control for counters
+ */
+int devl_dpipe_table_register(struct devlink *devlink,
+ const char *table_name,
+ const struct devlink_dpipe_table_ops *table_ops,
+ void *priv, bool counter_control_extern)
+{
+ struct devlink_dpipe_table *table;
+
+ lockdep_assert_held(&devlink->lock);
+
+ if (WARN_ON(!table_ops->size_get))
+ return -EINVAL;
+
+ if (devlink_dpipe_table_find(&devlink->dpipe_table_list, table_name,
+ devlink))
+ return -EEXIST;
+
+ table = kzalloc(sizeof(*table), GFP_KERNEL);
+ if (!table)
+ return -ENOMEM;
+
+ table->name = table_name;
+ table->table_ops = table_ops;
+ table->priv = priv;
+ table->counter_control_extern = counter_control_extern;
+
+ list_add_tail_rcu(&table->list, &devlink->dpipe_table_list);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(devl_dpipe_table_register);
+
+/**
+ * devl_dpipe_table_unregister - unregister dpipe table
+ *
+ * @devlink: devlink
+ * @table_name: table name
+ */
+void devl_dpipe_table_unregister(struct devlink *devlink,
+ const char *table_name)
+{
+ struct devlink_dpipe_table *table;
+
+ lockdep_assert_held(&devlink->lock);
+
+ table = devlink_dpipe_table_find(&devlink->dpipe_table_list,
+ table_name, devlink);
+ if (!table)
+ return;
+ list_del_rcu(&table->list);
+ kfree_rcu(table, rcu);
+}
+EXPORT_SYMBOL_GPL(devl_dpipe_table_unregister);
+
+/**
+ * devl_dpipe_table_resource_set - set the resource id
+ *
+ * @devlink: devlink
+ * @table_name: table name
+ * @resource_id: resource id
+ * @resource_units: number of resource's units consumed per table's entry
+ */
+int devl_dpipe_table_resource_set(struct devlink *devlink,
+ const char *table_name, u64 resource_id,
+ u64 resource_units)
+{
+ struct devlink_dpipe_table *table;
+
+ table = devlink_dpipe_table_find(&devlink->dpipe_table_list,
+ table_name, devlink);
+ if (!table)
+ return -EINVAL;
+
+ table->resource_id = resource_id;
+ table->resource_units = resource_units;
+ table->resource_valid = true;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(devl_dpipe_table_resource_set);
diff --git a/net/devlink/health.c b/net/devlink/health.c
new file mode 100644
index 000000000000..136a67c36a20
--- /dev/null
+++ b/net/devlink/health.c
@@ -0,0 +1,1350 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com>
+ */
+
+#include <net/genetlink.h>
+#include <net/sock.h>
+#include <trace/events/devlink.h>
+#include "devl_internal.h"
+
+struct devlink_fmsg_item {
+ struct list_head list;
+ int attrtype;
+ u8 nla_type;
+ u16 len;
+ int value[];
+};
+
+struct devlink_fmsg {
+ struct list_head item_list;
+ int err; /* first error encountered on some devlink_fmsg_XXX() call */
+ bool putting_binary; /* This flag forces enclosing of binary data
+ * in an array brackets. It forces using
+ * of designated API:
+ * devlink_fmsg_binary_pair_nest_start()
+ * devlink_fmsg_binary_pair_nest_end()
+ */
+};
+
+static struct devlink_fmsg *devlink_fmsg_alloc(void)
+{
+ struct devlink_fmsg *fmsg;
+
+ fmsg = kzalloc(sizeof(*fmsg), GFP_KERNEL);
+ if (!fmsg)
+ return NULL;
+
+ INIT_LIST_HEAD(&fmsg->item_list);
+
+ return fmsg;
+}
+
+static void devlink_fmsg_free(struct devlink_fmsg *fmsg)
+{
+ struct devlink_fmsg_item *item, *tmp;
+
+ list_for_each_entry_safe(item, tmp, &fmsg->item_list, list) {
+ list_del(&item->list);
+ kfree(item);
+ }
+ kfree(fmsg);
+}
+
+struct devlink_health_reporter {
+ struct list_head list;
+ void *priv;
+ const struct devlink_health_reporter_ops *ops;
+ struct devlink *devlink;
+ struct devlink_port *devlink_port;
+ struct devlink_fmsg *dump_fmsg;
+ u64 graceful_period;
+ u64 burst_period;
+ bool auto_recover;
+ bool auto_dump;
+ u8 health_state;
+ u64 dump_ts;
+ u64 dump_real_ts;
+ u64 error_count;
+ u64 recovery_count;
+ u64 last_recovery_ts;
+};
+
+void *
+devlink_health_reporter_priv(struct devlink_health_reporter *reporter)
+{
+ return reporter->priv;
+}
+EXPORT_SYMBOL_GPL(devlink_health_reporter_priv);
+
+static struct devlink_health_reporter *
+__devlink_health_reporter_find_by_name(struct list_head *reporter_list,
+ const char *reporter_name)
+{
+ struct devlink_health_reporter *reporter;
+
+ list_for_each_entry(reporter, reporter_list, list)
+ if (!strcmp(reporter->ops->name, reporter_name))
+ return reporter;
+ return NULL;
+}
+
+static struct devlink_health_reporter *
+devlink_health_reporter_find_by_name(struct devlink *devlink,
+ const char *reporter_name)
+{
+ return __devlink_health_reporter_find_by_name(&devlink->reporter_list,
+ reporter_name);
+}
+
+static struct devlink_health_reporter *
+devlink_port_health_reporter_find_by_name(struct devlink_port *devlink_port,
+ const char *reporter_name)
+{
+ return __devlink_health_reporter_find_by_name(&devlink_port->reporter_list,
+ reporter_name);
+}
+
+static struct devlink_health_reporter *
+__devlink_health_reporter_create(struct devlink *devlink,
+ const struct devlink_health_reporter_ops *ops,
+ void *priv)
+{
+ struct devlink_health_reporter *reporter;
+
+ if (WARN_ON(ops->default_graceful_period && !ops->recover))
+ return ERR_PTR(-EINVAL);
+
+ if (WARN_ON(ops->default_burst_period && !ops->default_graceful_period))
+ return ERR_PTR(-EINVAL);
+
+ reporter = kzalloc(sizeof(*reporter), GFP_KERNEL);
+ if (!reporter)
+ return ERR_PTR(-ENOMEM);
+
+ reporter->priv = priv;
+ reporter->ops = ops;
+ reporter->devlink = devlink;
+ reporter->graceful_period = ops->default_graceful_period;
+ reporter->burst_period = ops->default_burst_period;
+ reporter->auto_recover = !!ops->recover;
+ reporter->auto_dump = !!ops->dump;
+ return reporter;
+}
+
+/**
+ * devl_port_health_reporter_create() - create devlink health reporter for
+ * specified port instance
+ *
+ * @port: devlink_port to which health reports will relate
+ * @ops: devlink health reporter ops
+ * @priv: driver priv pointer
+ */
+struct devlink_health_reporter *
+devl_port_health_reporter_create(struct devlink_port *port,
+ const struct devlink_health_reporter_ops *ops,
+ void *priv)
+{
+ struct devlink_health_reporter *reporter;
+
+ devl_assert_locked(port->devlink);
+
+ if (__devlink_health_reporter_find_by_name(&port->reporter_list,
+ ops->name))
+ return ERR_PTR(-EEXIST);
+
+ reporter = __devlink_health_reporter_create(port->devlink, ops, priv);
+ if (IS_ERR(reporter))
+ return reporter;
+
+ reporter->devlink_port = port;
+ list_add_tail(&reporter->list, &port->reporter_list);
+ return reporter;
+}
+EXPORT_SYMBOL_GPL(devl_port_health_reporter_create);
+
+struct devlink_health_reporter *
+devlink_port_health_reporter_create(struct devlink_port *port,
+ const struct devlink_health_reporter_ops *ops,
+ void *priv)
+{
+ struct devlink_health_reporter *reporter;
+ struct devlink *devlink = port->devlink;
+
+ devl_lock(devlink);
+ reporter = devl_port_health_reporter_create(port, ops, priv);
+ devl_unlock(devlink);
+ return reporter;
+}
+EXPORT_SYMBOL_GPL(devlink_port_health_reporter_create);
+
+/**
+ * devl_health_reporter_create - create devlink health reporter
+ *
+ * @devlink: devlink instance which the health reports will relate
+ * @ops: devlink health reporter ops
+ * @priv: driver priv pointer
+ */
+struct devlink_health_reporter *
+devl_health_reporter_create(struct devlink *devlink,
+ const struct devlink_health_reporter_ops *ops,
+ void *priv)
+{
+ struct devlink_health_reporter *reporter;
+
+ devl_assert_locked(devlink);
+
+ if (devlink_health_reporter_find_by_name(devlink, ops->name))
+ return ERR_PTR(-EEXIST);
+
+ reporter = __devlink_health_reporter_create(devlink, ops, priv);
+ if (IS_ERR(reporter))
+ return reporter;
+
+ list_add_tail(&reporter->list, &devlink->reporter_list);
+ return reporter;
+}
+EXPORT_SYMBOL_GPL(devl_health_reporter_create);
+
+struct devlink_health_reporter *
+devlink_health_reporter_create(struct devlink *devlink,
+ const struct devlink_health_reporter_ops *ops,
+ void *priv)
+{
+ struct devlink_health_reporter *reporter;
+
+ devl_lock(devlink);
+ reporter = devl_health_reporter_create(devlink, ops, priv);
+ devl_unlock(devlink);
+ return reporter;
+}
+EXPORT_SYMBOL_GPL(devlink_health_reporter_create);
+
+static void
+devlink_health_reporter_free(struct devlink_health_reporter *reporter)
+{
+ if (reporter->dump_fmsg)
+ devlink_fmsg_free(reporter->dump_fmsg);
+ kfree(reporter);
+}
+
+/**
+ * devl_health_reporter_destroy() - destroy devlink health reporter
+ *
+ * @reporter: devlink health reporter to destroy
+ */
+void
+devl_health_reporter_destroy(struct devlink_health_reporter *reporter)
+{
+ devl_assert_locked(reporter->devlink);
+
+ list_del(&reporter->list);
+ devlink_health_reporter_free(reporter);
+}
+EXPORT_SYMBOL_GPL(devl_health_reporter_destroy);
+
+void
+devlink_health_reporter_destroy(struct devlink_health_reporter *reporter)
+{
+ struct devlink *devlink = reporter->devlink;
+
+ devl_lock(devlink);
+ devl_health_reporter_destroy(reporter);
+ devl_unlock(devlink);
+}
+EXPORT_SYMBOL_GPL(devlink_health_reporter_destroy);
+
+static int
+devlink_nl_health_reporter_fill(struct sk_buff *msg,
+ struct devlink_health_reporter *reporter,
+ enum devlink_command cmd, u32 portid,
+ u32 seq, int flags)
+{
+ struct devlink *devlink = reporter->devlink;
+ struct nlattr *reporter_attr;
+ void *hdr;
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto genlmsg_cancel;
+
+ if (reporter->devlink_port) {
+ if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, reporter->devlink_port->index))
+ goto genlmsg_cancel;
+ }
+ reporter_attr = nla_nest_start_noflag(msg,
+ DEVLINK_ATTR_HEALTH_REPORTER);
+ if (!reporter_attr)
+ goto genlmsg_cancel;
+ if (nla_put_string(msg, DEVLINK_ATTR_HEALTH_REPORTER_NAME,
+ reporter->ops->name))
+ goto reporter_nest_cancel;
+ if (nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_STATE,
+ reporter->health_state))
+ goto reporter_nest_cancel;
+ if (devlink_nl_put_u64(msg, DEVLINK_ATTR_HEALTH_REPORTER_ERR_COUNT,
+ reporter->error_count))
+ goto reporter_nest_cancel;
+ if (devlink_nl_put_u64(msg, DEVLINK_ATTR_HEALTH_REPORTER_RECOVER_COUNT,
+ reporter->recovery_count))
+ goto reporter_nest_cancel;
+ if (reporter->ops->recover &&
+ devlink_nl_put_u64(msg, DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD,
+ reporter->graceful_period))
+ goto reporter_nest_cancel;
+ if (reporter->ops->recover &&
+ devlink_nl_put_u64(msg, DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD,
+ reporter->burst_period))
+ goto reporter_nest_cancel;
+ if (reporter->ops->recover &&
+ nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER,
+ reporter->auto_recover))
+ goto reporter_nest_cancel;
+ if (reporter->dump_fmsg &&
+ devlink_nl_put_u64(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS,
+ jiffies_to_msecs(reporter->dump_ts)))
+ goto reporter_nest_cancel;
+ if (reporter->dump_fmsg &&
+ devlink_nl_put_u64(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS_NS,
+ reporter->dump_real_ts))
+ goto reporter_nest_cancel;
+ if (reporter->ops->dump &&
+ nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP,
+ reporter->auto_dump))
+ goto reporter_nest_cancel;
+
+ nla_nest_end(msg, reporter_attr);
+ genlmsg_end(msg, hdr);
+ return 0;
+
+reporter_nest_cancel:
+ nla_nest_cancel(msg, reporter_attr);
+genlmsg_cancel:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static struct devlink_health_reporter *
+devlink_health_reporter_get_from_attrs(struct devlink *devlink,
+ struct nlattr **attrs)
+{
+ struct devlink_port *devlink_port;
+ char *reporter_name;
+
+ if (!attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME])
+ return NULL;
+
+ reporter_name = nla_data(attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME]);
+ devlink_port = devlink_port_get_from_attrs(devlink, attrs);
+ if (IS_ERR(devlink_port))
+ return devlink_health_reporter_find_by_name(devlink,
+ reporter_name);
+ else
+ return devlink_port_health_reporter_find_by_name(devlink_port,
+ reporter_name);
+}
+
+static struct devlink_health_reporter *
+devlink_health_reporter_get_from_info(struct devlink *devlink,
+ struct genl_info *info)
+{
+ return devlink_health_reporter_get_from_attrs(devlink, info->attrs);
+}
+
+int devlink_nl_health_reporter_get_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_health_reporter *reporter;
+ struct sk_buff *msg;
+ int err;
+
+ reporter = devlink_health_reporter_get_from_info(devlink, info);
+ if (!reporter)
+ return -EINVAL;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ err = devlink_nl_health_reporter_fill(msg, reporter,
+ DEVLINK_CMD_HEALTH_REPORTER_GET,
+ info->snd_portid, info->snd_seq,
+ 0);
+ if (err) {
+ nlmsg_free(msg);
+ return err;
+ }
+
+ return genlmsg_reply(msg, info);
+}
+
+static int devlink_nl_health_reporter_get_dump_one(struct sk_buff *msg,
+ struct devlink *devlink,
+ struct netlink_callback *cb,
+ int flags)
+{
+ struct devlink_nl_dump_state *state = devlink_dump_state(cb);
+ const struct genl_info *info = genl_info_dump(cb);
+ struct devlink_health_reporter *reporter;
+ unsigned long port_index_end = ULONG_MAX;
+ struct nlattr **attrs = info->attrs;
+ unsigned long port_index_start = 0;
+ struct devlink_port *port;
+ unsigned long port_index;
+ int idx = 0;
+ int err;
+
+ if (attrs && attrs[DEVLINK_ATTR_PORT_INDEX]) {
+ port_index_start = nla_get_u32(attrs[DEVLINK_ATTR_PORT_INDEX]);
+ port_index_end = port_index_start;
+ flags |= NLM_F_DUMP_FILTERED;
+ goto per_port_dump;
+ }
+
+ list_for_each_entry(reporter, &devlink->reporter_list, list) {
+ if (idx < state->idx) {
+ idx++;
+ continue;
+ }
+ err = devlink_nl_health_reporter_fill(msg, reporter,
+ DEVLINK_CMD_HEALTH_REPORTER_GET,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ flags);
+ if (err) {
+ state->idx = idx;
+ return err;
+ }
+ idx++;
+ }
+per_port_dump:
+ xa_for_each_range(&devlink->ports, port_index, port,
+ port_index_start, port_index_end) {
+ list_for_each_entry(reporter, &port->reporter_list, list) {
+ if (idx < state->idx) {
+ idx++;
+ continue;
+ }
+ err = devlink_nl_health_reporter_fill(msg, reporter,
+ DEVLINK_CMD_HEALTH_REPORTER_GET,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ flags);
+ if (err) {
+ state->idx = idx;
+ return err;
+ }
+ idx++;
+ }
+ }
+
+ return 0;
+}
+
+int devlink_nl_health_reporter_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ return devlink_nl_dumpit(skb, cb,
+ devlink_nl_health_reporter_get_dump_one);
+}
+
+int devlink_nl_health_reporter_set_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_health_reporter *reporter;
+
+ reporter = devlink_health_reporter_get_from_info(devlink, info);
+ if (!reporter)
+ return -EINVAL;
+
+ if (!reporter->ops->recover &&
+ (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] ||
+ info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER] ||
+ info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD]))
+ return -EOPNOTSUPP;
+
+ if (!reporter->ops->dump &&
+ info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP])
+ return -EOPNOTSUPP;
+
+ if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]) {
+ reporter->graceful_period =
+ nla_get_u64(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]);
+ if (!reporter->graceful_period)
+ reporter->burst_period = 0;
+ }
+
+ if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD]) {
+ u64 burst_period =
+ nla_get_u64(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD]);
+
+ if (!reporter->graceful_period && burst_period) {
+ NL_SET_ERR_MSG_MOD(info->extack,
+ "Cannot set burst period without a grace period.");
+ return -EINVAL;
+ }
+
+ reporter->burst_period = burst_period;
+ }
+
+ if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER])
+ reporter->auto_recover =
+ nla_get_u8(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER]);
+
+ if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP])
+ reporter->auto_dump =
+ nla_get_u8(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP]);
+
+ return 0;
+}
+
+static void devlink_recover_notify(struct devlink_health_reporter *reporter,
+ enum devlink_command cmd)
+{
+ struct devlink *devlink = reporter->devlink;
+ struct devlink_obj_desc desc;
+ struct sk_buff *msg;
+ int err;
+
+ WARN_ON(cmd != DEVLINK_CMD_HEALTH_REPORTER_RECOVER);
+ ASSERT_DEVLINK_REGISTERED(devlink);
+
+ if (!devlink_nl_notify_need(devlink))
+ return;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return;
+
+ err = devlink_nl_health_reporter_fill(msg, reporter, cmd, 0, 0, 0);
+ if (err) {
+ nlmsg_free(msg);
+ return;
+ }
+
+ devlink_nl_obj_desc_init(&desc, devlink);
+ if (reporter->devlink_port)
+ devlink_nl_obj_desc_port_set(&desc, reporter->devlink_port);
+ devlink_nl_notify_send_desc(devlink, msg, &desc);
+}
+
+static bool
+devlink_health_reporter_in_burst(struct devlink_health_reporter *reporter)
+{
+ unsigned long burst_threshold = reporter->last_recovery_ts +
+ msecs_to_jiffies(reporter->burst_period);
+
+ return time_is_after_jiffies(burst_threshold);
+}
+
+void
+devlink_health_reporter_recovery_done(struct devlink_health_reporter *reporter)
+{
+ reporter->recovery_count++;
+ if (!devlink_health_reporter_in_burst(reporter))
+ /* When burst period is set, last_recovery_ts marks the first
+ * recovery within the burst period, not necessarily the last
+ * one.
+ */
+ reporter->last_recovery_ts = jiffies;
+}
+EXPORT_SYMBOL_GPL(devlink_health_reporter_recovery_done);
+
+static int
+devlink_health_reporter_recover(struct devlink_health_reporter *reporter,
+ void *priv_ctx, struct netlink_ext_ack *extack)
+{
+ int err;
+
+ if (reporter->health_state == DEVLINK_HEALTH_REPORTER_STATE_HEALTHY)
+ return 0;
+
+ if (!reporter->ops->recover)
+ return -EOPNOTSUPP;
+
+ err = reporter->ops->recover(reporter, priv_ctx, extack);
+ if (err)
+ return err;
+
+ devlink_health_reporter_recovery_done(reporter);
+ reporter->health_state = DEVLINK_HEALTH_REPORTER_STATE_HEALTHY;
+ devlink_recover_notify(reporter, DEVLINK_CMD_HEALTH_REPORTER_RECOVER);
+
+ return 0;
+}
+
+static void
+devlink_health_dump_clear(struct devlink_health_reporter *reporter)
+{
+ if (!reporter->dump_fmsg)
+ return;
+ devlink_fmsg_free(reporter->dump_fmsg);
+ reporter->dump_fmsg = NULL;
+}
+
+static int devlink_health_do_dump(struct devlink_health_reporter *reporter,
+ void *priv_ctx,
+ struct netlink_ext_ack *extack)
+{
+ int err;
+
+ if (!reporter->ops->dump)
+ return 0;
+
+ if (reporter->dump_fmsg)
+ return 0;
+
+ reporter->dump_fmsg = devlink_fmsg_alloc();
+ if (!reporter->dump_fmsg)
+ return -ENOMEM;
+
+ devlink_fmsg_obj_nest_start(reporter->dump_fmsg);
+
+ err = reporter->ops->dump(reporter, reporter->dump_fmsg,
+ priv_ctx, extack);
+ if (err)
+ goto dump_err;
+
+ devlink_fmsg_obj_nest_end(reporter->dump_fmsg);
+ err = reporter->dump_fmsg->err;
+ if (err)
+ goto dump_err;
+
+ reporter->dump_ts = jiffies;
+ reporter->dump_real_ts = ktime_get_real_ns();
+
+ return 0;
+
+dump_err:
+ devlink_health_dump_clear(reporter);
+ return err;
+}
+
+static bool
+devlink_health_recover_abort(struct devlink_health_reporter *reporter,
+ enum devlink_health_reporter_state prev_state)
+{
+ unsigned long recover_ts_threshold;
+
+ if (!reporter->auto_recover)
+ return false;
+
+ /* abort if the previous error wasn't recovered */
+ if (prev_state != DEVLINK_HEALTH_REPORTER_STATE_HEALTHY)
+ return true;
+
+ if (devlink_health_reporter_in_burst(reporter))
+ return false;
+
+ recover_ts_threshold = reporter->last_recovery_ts +
+ msecs_to_jiffies(reporter->burst_period) +
+ msecs_to_jiffies(reporter->graceful_period);
+ if (reporter->last_recovery_ts && reporter->recovery_count &&
+ time_is_after_jiffies(recover_ts_threshold))
+ return true;
+
+ return false;
+}
+
+int devlink_health_report(struct devlink_health_reporter *reporter,
+ const char *msg, void *priv_ctx)
+{
+ enum devlink_health_reporter_state prev_health_state;
+ struct devlink *devlink = reporter->devlink;
+ int ret;
+
+ /* write a log message of the current error */
+ WARN_ON(!msg);
+ trace_devlink_health_report(devlink, reporter->ops->name, msg);
+ reporter->error_count++;
+ prev_health_state = reporter->health_state;
+ reporter->health_state = DEVLINK_HEALTH_REPORTER_STATE_ERROR;
+ devlink_recover_notify(reporter, DEVLINK_CMD_HEALTH_REPORTER_RECOVER);
+
+ if (devlink_health_recover_abort(reporter, prev_health_state)) {
+ trace_devlink_health_recover_aborted(devlink,
+ reporter->ops->name,
+ reporter->health_state,
+ jiffies -
+ reporter->last_recovery_ts);
+ return -ECANCELED;
+ }
+
+ if (reporter->auto_dump) {
+ devl_lock(devlink);
+ /* store current dump of current error, for later analysis */
+ devlink_health_do_dump(reporter, priv_ctx, NULL);
+ devl_unlock(devlink);
+ }
+
+ if (!reporter->auto_recover)
+ return 0;
+
+ devl_lock(devlink);
+ ret = devlink_health_reporter_recover(reporter, priv_ctx, NULL);
+ devl_unlock(devlink);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(devlink_health_report);
+
+void
+devlink_health_reporter_state_update(struct devlink_health_reporter *reporter,
+ enum devlink_health_reporter_state state)
+{
+ if (WARN_ON(state != DEVLINK_HEALTH_REPORTER_STATE_HEALTHY &&
+ state != DEVLINK_HEALTH_REPORTER_STATE_ERROR))
+ return;
+
+ if (reporter->health_state == state)
+ return;
+
+ reporter->health_state = state;
+ trace_devlink_health_reporter_state_update(reporter->devlink,
+ reporter->ops->name, state);
+ devlink_recover_notify(reporter, DEVLINK_CMD_HEALTH_REPORTER_RECOVER);
+}
+EXPORT_SYMBOL_GPL(devlink_health_reporter_state_update);
+
+int devlink_nl_health_reporter_recover_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_health_reporter *reporter;
+
+ reporter = devlink_health_reporter_get_from_info(devlink, info);
+ if (!reporter)
+ return -EINVAL;
+
+ return devlink_health_reporter_recover(reporter, NULL, info->extack);
+}
+
+static void devlink_fmsg_err_if_binary(struct devlink_fmsg *fmsg)
+{
+ if (!fmsg->err && fmsg->putting_binary)
+ fmsg->err = -EINVAL;
+}
+
+static void devlink_fmsg_nest_common(struct devlink_fmsg *fmsg, int attrtype)
+{
+ struct devlink_fmsg_item *item;
+
+ if (fmsg->err)
+ return;
+
+ item = kzalloc(sizeof(*item), GFP_KERNEL);
+ if (!item) {
+ fmsg->err = -ENOMEM;
+ return;
+ }
+
+ item->attrtype = attrtype;
+ list_add_tail(&item->list, &fmsg->item_list);
+}
+
+void devlink_fmsg_obj_nest_start(struct devlink_fmsg *fmsg)
+{
+ devlink_fmsg_err_if_binary(fmsg);
+ devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_OBJ_NEST_START);
+}
+EXPORT_SYMBOL_GPL(devlink_fmsg_obj_nest_start);
+
+static void devlink_fmsg_nest_end(struct devlink_fmsg *fmsg)
+{
+ devlink_fmsg_err_if_binary(fmsg);
+ devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_NEST_END);
+}
+
+void devlink_fmsg_obj_nest_end(struct devlink_fmsg *fmsg)
+{
+ devlink_fmsg_nest_end(fmsg);
+}
+EXPORT_SYMBOL_GPL(devlink_fmsg_obj_nest_end);
+
+#define DEVLINK_FMSG_MAX_SIZE (GENLMSG_DEFAULT_SIZE - GENL_HDRLEN - NLA_HDRLEN)
+
+static void devlink_fmsg_put_name(struct devlink_fmsg *fmsg, const char *name)
+{
+ struct devlink_fmsg_item *item;
+
+ devlink_fmsg_err_if_binary(fmsg);
+ if (fmsg->err)
+ return;
+
+ if (strlen(name) + 1 > DEVLINK_FMSG_MAX_SIZE) {
+ fmsg->err = -EMSGSIZE;
+ return;
+ }
+
+ item = kzalloc(sizeof(*item) + strlen(name) + 1, GFP_KERNEL);
+ if (!item) {
+ fmsg->err = -ENOMEM;
+ return;
+ }
+
+ item->nla_type = DEVLINK_VAR_ATTR_TYPE_NUL_STRING;
+ item->len = strlen(name) + 1;
+ item->attrtype = DEVLINK_ATTR_FMSG_OBJ_NAME;
+ memcpy(&item->value, name, item->len);
+ list_add_tail(&item->list, &fmsg->item_list);
+}
+
+void devlink_fmsg_pair_nest_start(struct devlink_fmsg *fmsg, const char *name)
+{
+ devlink_fmsg_err_if_binary(fmsg);
+ devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_PAIR_NEST_START);
+ devlink_fmsg_put_name(fmsg, name);
+}
+EXPORT_SYMBOL_GPL(devlink_fmsg_pair_nest_start);
+
+void devlink_fmsg_pair_nest_end(struct devlink_fmsg *fmsg)
+{
+ devlink_fmsg_nest_end(fmsg);
+}
+EXPORT_SYMBOL_GPL(devlink_fmsg_pair_nest_end);
+
+void devlink_fmsg_arr_pair_nest_start(struct devlink_fmsg *fmsg,
+ const char *name)
+{
+ devlink_fmsg_pair_nest_start(fmsg, name);
+ devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_ARR_NEST_START);
+}
+EXPORT_SYMBOL_GPL(devlink_fmsg_arr_pair_nest_start);
+
+void devlink_fmsg_arr_pair_nest_end(struct devlink_fmsg *fmsg)
+{
+ devlink_fmsg_nest_end(fmsg);
+ devlink_fmsg_nest_end(fmsg);
+}
+EXPORT_SYMBOL_GPL(devlink_fmsg_arr_pair_nest_end);
+
+void devlink_fmsg_binary_pair_nest_start(struct devlink_fmsg *fmsg,
+ const char *name)
+{
+ devlink_fmsg_arr_pair_nest_start(fmsg, name);
+ fmsg->putting_binary = true;
+}
+EXPORT_SYMBOL_GPL(devlink_fmsg_binary_pair_nest_start);
+
+void devlink_fmsg_binary_pair_nest_end(struct devlink_fmsg *fmsg)
+{
+ if (fmsg->err)
+ return;
+
+ if (!fmsg->putting_binary)
+ fmsg->err = -EINVAL;
+
+ fmsg->putting_binary = false;
+ devlink_fmsg_arr_pair_nest_end(fmsg);
+}
+EXPORT_SYMBOL_GPL(devlink_fmsg_binary_pair_nest_end);
+
+static void devlink_fmsg_put_value(struct devlink_fmsg *fmsg,
+ const void *value, u16 value_len,
+ u8 value_nla_type)
+{
+ struct devlink_fmsg_item *item;
+
+ if (fmsg->err)
+ return;
+
+ if (value_len > DEVLINK_FMSG_MAX_SIZE) {
+ fmsg->err = -EMSGSIZE;
+ return;
+ }
+
+ item = kzalloc(sizeof(*item) + value_len, GFP_KERNEL);
+ if (!item) {
+ fmsg->err = -ENOMEM;
+ return;
+ }
+
+ item->nla_type = value_nla_type;
+ item->len = value_len;
+ item->attrtype = DEVLINK_ATTR_FMSG_OBJ_VALUE_DATA;
+ memcpy(&item->value, value, item->len);
+ list_add_tail(&item->list, &fmsg->item_list);
+}
+
+static void devlink_fmsg_bool_put(struct devlink_fmsg *fmsg, bool value)
+{
+ devlink_fmsg_err_if_binary(fmsg);
+ devlink_fmsg_put_value(fmsg, &value, sizeof(value),
+ DEVLINK_VAR_ATTR_TYPE_FLAG);
+}
+
+static void devlink_fmsg_u8_put(struct devlink_fmsg *fmsg, u8 value)
+{
+ devlink_fmsg_err_if_binary(fmsg);
+ devlink_fmsg_put_value(fmsg, &value, sizeof(value),
+ DEVLINK_VAR_ATTR_TYPE_U8);
+}
+
+void devlink_fmsg_u32_put(struct devlink_fmsg *fmsg, u32 value)
+{
+ devlink_fmsg_err_if_binary(fmsg);
+ devlink_fmsg_put_value(fmsg, &value, sizeof(value),
+ DEVLINK_VAR_ATTR_TYPE_U32);
+}
+EXPORT_SYMBOL_GPL(devlink_fmsg_u32_put);
+
+static void devlink_fmsg_u64_put(struct devlink_fmsg *fmsg, u64 value)
+{
+ devlink_fmsg_err_if_binary(fmsg);
+ devlink_fmsg_put_value(fmsg, &value, sizeof(value),
+ DEVLINK_VAR_ATTR_TYPE_U64);
+}
+
+void devlink_fmsg_string_put(struct devlink_fmsg *fmsg, const char *value)
+{
+ devlink_fmsg_err_if_binary(fmsg);
+ devlink_fmsg_put_value(fmsg, value, strlen(value) + 1,
+ DEVLINK_VAR_ATTR_TYPE_NUL_STRING);
+}
+EXPORT_SYMBOL_GPL(devlink_fmsg_string_put);
+
+void devlink_fmsg_binary_put(struct devlink_fmsg *fmsg, const void *value,
+ u16 value_len)
+{
+ if (!fmsg->err && !fmsg->putting_binary)
+ fmsg->err = -EINVAL;
+
+ devlink_fmsg_put_value(fmsg, value, value_len,
+ DEVLINK_VAR_ATTR_TYPE_BINARY);
+}
+EXPORT_SYMBOL_GPL(devlink_fmsg_binary_put);
+
+void devlink_fmsg_bool_pair_put(struct devlink_fmsg *fmsg, const char *name,
+ bool value)
+{
+ devlink_fmsg_pair_nest_start(fmsg, name);
+ devlink_fmsg_bool_put(fmsg, value);
+ devlink_fmsg_pair_nest_end(fmsg);
+}
+EXPORT_SYMBOL_GPL(devlink_fmsg_bool_pair_put);
+
+void devlink_fmsg_u8_pair_put(struct devlink_fmsg *fmsg, const char *name,
+ u8 value)
+{
+ devlink_fmsg_pair_nest_start(fmsg, name);
+ devlink_fmsg_u8_put(fmsg, value);
+ devlink_fmsg_pair_nest_end(fmsg);
+}
+EXPORT_SYMBOL_GPL(devlink_fmsg_u8_pair_put);
+
+void devlink_fmsg_u32_pair_put(struct devlink_fmsg *fmsg, const char *name,
+ u32 value)
+{
+ devlink_fmsg_pair_nest_start(fmsg, name);
+ devlink_fmsg_u32_put(fmsg, value);
+ devlink_fmsg_pair_nest_end(fmsg);
+}
+EXPORT_SYMBOL_GPL(devlink_fmsg_u32_pair_put);
+
+void devlink_fmsg_u64_pair_put(struct devlink_fmsg *fmsg, const char *name,
+ u64 value)
+{
+ devlink_fmsg_pair_nest_start(fmsg, name);
+ devlink_fmsg_u64_put(fmsg, value);
+ devlink_fmsg_pair_nest_end(fmsg);
+}
+EXPORT_SYMBOL_GPL(devlink_fmsg_u64_pair_put);
+
+void devlink_fmsg_string_pair_put(struct devlink_fmsg *fmsg, const char *name,
+ const char *value)
+{
+ devlink_fmsg_pair_nest_start(fmsg, name);
+ devlink_fmsg_string_put(fmsg, value);
+ devlink_fmsg_pair_nest_end(fmsg);
+}
+EXPORT_SYMBOL_GPL(devlink_fmsg_string_pair_put);
+
+void devlink_fmsg_binary_pair_put(struct devlink_fmsg *fmsg, const char *name,
+ const void *value, u32 value_len)
+{
+ u32 data_size;
+ u32 offset;
+
+ devlink_fmsg_binary_pair_nest_start(fmsg, name);
+
+ for (offset = 0; offset < value_len; offset += data_size) {
+ data_size = value_len - offset;
+ if (data_size > DEVLINK_FMSG_MAX_SIZE)
+ data_size = DEVLINK_FMSG_MAX_SIZE;
+
+ devlink_fmsg_binary_put(fmsg, value + offset, data_size);
+ }
+
+ devlink_fmsg_binary_pair_nest_end(fmsg);
+ fmsg->putting_binary = false;
+}
+EXPORT_SYMBOL_GPL(devlink_fmsg_binary_pair_put);
+
+static int
+devlink_fmsg_item_fill_data(struct devlink_fmsg_item *msg, struct sk_buff *skb)
+{
+ int attrtype = DEVLINK_ATTR_FMSG_OBJ_VALUE_DATA;
+ u8 tmp;
+
+ switch (msg->nla_type) {
+ case DEVLINK_VAR_ATTR_TYPE_FLAG:
+ /* Always provide flag data, regardless of its value */
+ tmp = *(bool *)msg->value;
+
+ return nla_put_u8(skb, attrtype, tmp);
+ case DEVLINK_VAR_ATTR_TYPE_U8:
+ return nla_put_u8(skb, attrtype, *(u8 *)msg->value);
+ case DEVLINK_VAR_ATTR_TYPE_U32:
+ return nla_put_u32(skb, attrtype, *(u32 *)msg->value);
+ case DEVLINK_VAR_ATTR_TYPE_U64:
+ return devlink_nl_put_u64(skb, attrtype, *(u64 *)msg->value);
+ case DEVLINK_VAR_ATTR_TYPE_NUL_STRING:
+ return nla_put_string(skb, attrtype, (char *)&msg->value);
+ case DEVLINK_VAR_ATTR_TYPE_BINARY:
+ return nla_put(skb, attrtype, msg->len, (void *)&msg->value);
+ default:
+ return -EINVAL;
+ }
+}
+
+static int
+devlink_fmsg_prepare_skb(struct devlink_fmsg *fmsg, struct sk_buff *skb,
+ int *start)
+{
+ struct devlink_fmsg_item *item;
+ struct nlattr *fmsg_nlattr;
+ int err = 0;
+ int i = 0;
+
+ fmsg_nlattr = nla_nest_start_noflag(skb, DEVLINK_ATTR_FMSG);
+ if (!fmsg_nlattr)
+ return -EMSGSIZE;
+
+ list_for_each_entry(item, &fmsg->item_list, list) {
+ if (i < *start) {
+ i++;
+ continue;
+ }
+
+ switch (item->attrtype) {
+ case DEVLINK_ATTR_FMSG_OBJ_NEST_START:
+ case DEVLINK_ATTR_FMSG_PAIR_NEST_START:
+ case DEVLINK_ATTR_FMSG_ARR_NEST_START:
+ case DEVLINK_ATTR_FMSG_NEST_END:
+ err = nla_put_flag(skb, item->attrtype);
+ break;
+ case DEVLINK_ATTR_FMSG_OBJ_VALUE_DATA:
+ err = nla_put_u8(skb, DEVLINK_ATTR_FMSG_OBJ_VALUE_TYPE,
+ item->nla_type);
+ if (err)
+ break;
+ err = devlink_fmsg_item_fill_data(item, skb);
+ break;
+ case DEVLINK_ATTR_FMSG_OBJ_NAME:
+ err = nla_put_string(skb, item->attrtype,
+ (char *)&item->value);
+ break;
+ default:
+ err = -EINVAL;
+ break;
+ }
+ if (!err)
+ *start = ++i;
+ else
+ break;
+ }
+
+ nla_nest_end(skb, fmsg_nlattr);
+ return err;
+}
+
+static int devlink_fmsg_snd(struct devlink_fmsg *fmsg,
+ struct genl_info *info,
+ enum devlink_command cmd, int flags)
+{
+ struct nlmsghdr *nlh;
+ struct sk_buff *skb;
+ bool last = false;
+ int index = 0;
+ void *hdr;
+ int err;
+
+ if (fmsg->err)
+ return fmsg->err;
+
+ while (!last) {
+ int tmp_index = index;
+
+ skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!skb)
+ return -ENOMEM;
+
+ hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq,
+ &devlink_nl_family, flags | NLM_F_MULTI, cmd);
+ if (!hdr) {
+ err = -EMSGSIZE;
+ goto nla_put_failure;
+ }
+
+ err = devlink_fmsg_prepare_skb(fmsg, skb, &index);
+ if (!err)
+ last = true;
+ else if (err != -EMSGSIZE || tmp_index == index)
+ goto nla_put_failure;
+
+ genlmsg_end(skb, hdr);
+ err = genlmsg_reply(skb, info);
+ if (err)
+ return err;
+ }
+
+ skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!skb)
+ return -ENOMEM;
+ nlh = nlmsg_put(skb, info->snd_portid, info->snd_seq,
+ NLMSG_DONE, 0, flags | NLM_F_MULTI);
+ if (!nlh) {
+ err = -EMSGSIZE;
+ goto nla_put_failure;
+ }
+
+ return genlmsg_reply(skb, info);
+
+nla_put_failure:
+ nlmsg_free(skb);
+ return err;
+}
+
+static int devlink_fmsg_dumpit(struct devlink_fmsg *fmsg, struct sk_buff *skb,
+ struct netlink_callback *cb,
+ enum devlink_command cmd)
+{
+ struct devlink_nl_dump_state *state = devlink_dump_state(cb);
+ int index = state->idx;
+ int tmp_index = index;
+ void *hdr;
+ int err;
+
+ if (fmsg->err)
+ return fmsg->err;
+
+ hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ &devlink_nl_family, NLM_F_ACK | NLM_F_MULTI, cmd);
+ if (!hdr) {
+ err = -EMSGSIZE;
+ goto nla_put_failure;
+ }
+
+ err = devlink_fmsg_prepare_skb(fmsg, skb, &index);
+ if ((err && err != -EMSGSIZE) || tmp_index == index)
+ goto nla_put_failure;
+
+ state->idx = index;
+ genlmsg_end(skb, hdr);
+ return skb->len;
+
+nla_put_failure:
+ genlmsg_cancel(skb, hdr);
+ return err;
+}
+
+int devlink_nl_health_reporter_diagnose_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_health_reporter *reporter;
+ struct devlink_fmsg *fmsg;
+ int err;
+
+ reporter = devlink_health_reporter_get_from_info(devlink, info);
+ if (!reporter)
+ return -EINVAL;
+
+ if (!reporter->ops->diagnose)
+ return -EOPNOTSUPP;
+
+ fmsg = devlink_fmsg_alloc();
+ if (!fmsg)
+ return -ENOMEM;
+
+ devlink_fmsg_obj_nest_start(fmsg);
+
+ err = reporter->ops->diagnose(reporter, fmsg, info->extack);
+ if (err)
+ goto out;
+
+ devlink_fmsg_obj_nest_end(fmsg);
+
+ err = devlink_fmsg_snd(fmsg, info,
+ DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE, 0);
+
+out:
+ devlink_fmsg_free(fmsg);
+ return err;
+}
+
+static struct devlink_health_reporter *
+devlink_health_reporter_get_from_cb_lock(struct netlink_callback *cb)
+{
+ const struct genl_info *info = genl_info_dump(cb);
+ struct devlink_health_reporter *reporter;
+ struct nlattr **attrs = info->attrs;
+ struct devlink *devlink;
+
+ devlink = devlink_get_from_attrs_lock(sock_net(cb->skb->sk), attrs,
+ false);
+ if (IS_ERR(devlink))
+ return NULL;
+
+ reporter = devlink_health_reporter_get_from_attrs(devlink, attrs);
+ if (!reporter) {
+ devl_unlock(devlink);
+ devlink_put(devlink);
+ }
+ return reporter;
+}
+
+int devlink_nl_health_reporter_dump_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct devlink_nl_dump_state *state = devlink_dump_state(cb);
+ struct devlink_health_reporter *reporter;
+ struct devlink *devlink;
+ int err;
+
+ reporter = devlink_health_reporter_get_from_cb_lock(cb);
+ if (!reporter)
+ return -EINVAL;
+
+ devlink = reporter->devlink;
+ if (!reporter->ops->dump) {
+ devl_unlock(devlink);
+ devlink_put(devlink);
+ return -EOPNOTSUPP;
+ }
+
+ if (!state->idx) {
+ err = devlink_health_do_dump(reporter, NULL, cb->extack);
+ if (err)
+ goto unlock;
+ state->dump_ts = reporter->dump_ts;
+ }
+ if (!reporter->dump_fmsg || state->dump_ts != reporter->dump_ts) {
+ NL_SET_ERR_MSG(cb->extack, "Dump trampled, please retry");
+ err = -EAGAIN;
+ goto unlock;
+ }
+
+ err = devlink_fmsg_dumpit(reporter->dump_fmsg, skb, cb,
+ DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET);
+unlock:
+ devl_unlock(devlink);
+ devlink_put(devlink);
+ return err;
+}
+
+int devlink_nl_health_reporter_dump_clear_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_health_reporter *reporter;
+
+ reporter = devlink_health_reporter_get_from_info(devlink, info);
+ if (!reporter)
+ return -EINVAL;
+
+ if (!reporter->ops->dump)
+ return -EOPNOTSUPP;
+
+ devlink_health_dump_clear(reporter);
+ return 0;
+}
+
+int devlink_nl_health_reporter_test_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_health_reporter *reporter;
+
+ reporter = devlink_health_reporter_get_from_info(devlink, info);
+ if (!reporter)
+ return -EINVAL;
+
+ if (!reporter->ops->test)
+ return -EOPNOTSUPP;
+
+ return reporter->ops->test(reporter, info->extack);
+}
+
+/**
+ * devlink_fmsg_dump_skb - Dump sk_buffer structure
+ * @fmsg: devlink formatted message pointer
+ * @skb: pointer to skb
+ *
+ * Dump diagnostic information about sk_buff structure, like headroom, length,
+ * tailroom, MAC, etc.
+ */
+void devlink_fmsg_dump_skb(struct devlink_fmsg *fmsg, const struct sk_buff *skb)
+{
+ struct skb_shared_info *sh = skb_shinfo(skb);
+ struct sock *sk = skb->sk;
+ bool has_mac, has_trans;
+
+ has_mac = skb_mac_header_was_set(skb);
+ has_trans = skb_transport_header_was_set(skb);
+
+ devlink_fmsg_pair_nest_start(fmsg, "skb");
+ devlink_fmsg_obj_nest_start(fmsg);
+ devlink_fmsg_put(fmsg, "actual len", skb->len);
+ devlink_fmsg_put(fmsg, "head len", skb_headlen(skb));
+ devlink_fmsg_put(fmsg, "data len", skb->data_len);
+ devlink_fmsg_put(fmsg, "tail len", skb_tailroom(skb));
+ devlink_fmsg_put(fmsg, "MAC", has_mac ? skb->mac_header : -1);
+ devlink_fmsg_put(fmsg, "MAC len",
+ has_mac ? skb_mac_header_len(skb) : -1);
+ devlink_fmsg_put(fmsg, "network hdr", skb->network_header);
+ devlink_fmsg_put(fmsg, "network hdr len",
+ has_trans ? skb_network_header_len(skb) : -1);
+ devlink_fmsg_put(fmsg, "transport hdr",
+ has_trans ? skb->transport_header : -1);
+ devlink_fmsg_put(fmsg, "csum", (__force u32)skb->csum);
+ devlink_fmsg_put(fmsg, "csum_ip_summed", (u8)skb->ip_summed);
+ devlink_fmsg_put(fmsg, "csum_complete_sw", !!skb->csum_complete_sw);
+ devlink_fmsg_put(fmsg, "csum_valid", !!skb->csum_valid);
+ devlink_fmsg_put(fmsg, "csum_level", (u8)skb->csum_level);
+ devlink_fmsg_put(fmsg, "sw_hash", !!skb->sw_hash);
+ devlink_fmsg_put(fmsg, "l4_hash", !!skb->l4_hash);
+ devlink_fmsg_put(fmsg, "proto", ntohs(skb->protocol));
+ devlink_fmsg_put(fmsg, "pkt_type", (u8)skb->pkt_type);
+ devlink_fmsg_put(fmsg, "iif", skb->skb_iif);
+
+ if (sk) {
+ devlink_fmsg_pair_nest_start(fmsg, "sk");
+ devlink_fmsg_obj_nest_start(fmsg);
+ devlink_fmsg_put(fmsg, "family", sk->sk_type);
+ devlink_fmsg_put(fmsg, "type", sk->sk_type);
+ devlink_fmsg_put(fmsg, "proto", sk->sk_protocol);
+ devlink_fmsg_obj_nest_end(fmsg);
+ devlink_fmsg_pair_nest_end(fmsg);
+ }
+
+ devlink_fmsg_obj_nest_end(fmsg);
+ devlink_fmsg_pair_nest_end(fmsg);
+
+ devlink_fmsg_pair_nest_start(fmsg, "shinfo");
+ devlink_fmsg_obj_nest_start(fmsg);
+ devlink_fmsg_put(fmsg, "tx_flags", sh->tx_flags);
+ devlink_fmsg_put(fmsg, "nr_frags", sh->nr_frags);
+ devlink_fmsg_put(fmsg, "gso_size", sh->gso_size);
+ devlink_fmsg_put(fmsg, "gso_type", sh->gso_type);
+ devlink_fmsg_put(fmsg, "gso_segs", sh->gso_segs);
+ devlink_fmsg_obj_nest_end(fmsg);
+ devlink_fmsg_pair_nest_end(fmsg);
+}
+EXPORT_SYMBOL_GPL(devlink_fmsg_dump_skb);
diff --git a/net/devlink/linecard.c b/net/devlink/linecard.c
new file mode 100644
index 000000000000..67f70a621d27
--- /dev/null
+++ b/net/devlink/linecard.c
@@ -0,0 +1,628 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com>
+ */
+
+#include "devl_internal.h"
+
+struct devlink_linecard {
+ struct list_head list;
+ struct devlink *devlink;
+ unsigned int index;
+ const struct devlink_linecard_ops *ops;
+ void *priv;
+ enum devlink_linecard_state state;
+ struct mutex state_lock; /* Protects state */
+ const char *type;
+ struct devlink_linecard_type *types;
+ unsigned int types_count;
+ u32 rel_index;
+};
+
+unsigned int devlink_linecard_index(struct devlink_linecard *linecard)
+{
+ return linecard->index;
+}
+
+static struct devlink_linecard *
+devlink_linecard_get_by_index(struct devlink *devlink,
+ unsigned int linecard_index)
+{
+ struct devlink_linecard *devlink_linecard;
+
+ list_for_each_entry(devlink_linecard, &devlink->linecard_list, list) {
+ if (devlink_linecard->index == linecard_index)
+ return devlink_linecard;
+ }
+ return NULL;
+}
+
+static bool devlink_linecard_index_exists(struct devlink *devlink,
+ unsigned int linecard_index)
+{
+ return devlink_linecard_get_by_index(devlink, linecard_index);
+}
+
+static struct devlink_linecard *
+devlink_linecard_get_from_attrs(struct devlink *devlink, struct nlattr **attrs)
+{
+ if (attrs[DEVLINK_ATTR_LINECARD_INDEX]) {
+ u32 linecard_index = nla_get_u32(attrs[DEVLINK_ATTR_LINECARD_INDEX]);
+ struct devlink_linecard *linecard;
+
+ linecard = devlink_linecard_get_by_index(devlink, linecard_index);
+ if (!linecard)
+ return ERR_PTR(-ENODEV);
+ return linecard;
+ }
+ return ERR_PTR(-EINVAL);
+}
+
+static struct devlink_linecard *
+devlink_linecard_get_from_info(struct devlink *devlink, struct genl_info *info)
+{
+ return devlink_linecard_get_from_attrs(devlink, info->attrs);
+}
+
+struct devlink_linecard_type {
+ const char *type;
+ const void *priv;
+};
+
+static int devlink_nl_linecard_fill(struct sk_buff *msg,
+ struct devlink *devlink,
+ struct devlink_linecard *linecard,
+ enum devlink_command cmd, u32 portid,
+ u32 seq, int flags,
+ struct netlink_ext_ack *extack)
+{
+ struct devlink_linecard_type *linecard_type;
+ struct nlattr *attr;
+ void *hdr;
+ int i;
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, DEVLINK_ATTR_LINECARD_INDEX, linecard->index))
+ goto nla_put_failure;
+ if (nla_put_u8(msg, DEVLINK_ATTR_LINECARD_STATE, linecard->state))
+ goto nla_put_failure;
+ if (linecard->type &&
+ nla_put_string(msg, DEVLINK_ATTR_LINECARD_TYPE, linecard->type))
+ goto nla_put_failure;
+
+ if (linecard->types_count) {
+ attr = nla_nest_start(msg,
+ DEVLINK_ATTR_LINECARD_SUPPORTED_TYPES);
+ if (!attr)
+ goto nla_put_failure;
+ for (i = 0; i < linecard->types_count; i++) {
+ linecard_type = &linecard->types[i];
+ if (nla_put_string(msg, DEVLINK_ATTR_LINECARD_TYPE,
+ linecard_type->type)) {
+ nla_nest_cancel(msg, attr);
+ goto nla_put_failure;
+ }
+ }
+ nla_nest_end(msg, attr);
+ }
+
+ if (devlink_rel_devlink_handle_put(msg, devlink,
+ linecard->rel_index,
+ DEVLINK_ATTR_NESTED_DEVLINK,
+ NULL))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static void devlink_linecard_notify(struct devlink_linecard *linecard,
+ enum devlink_command cmd)
+{
+ struct devlink *devlink = linecard->devlink;
+ struct sk_buff *msg;
+ int err;
+
+ WARN_ON(cmd != DEVLINK_CMD_LINECARD_NEW &&
+ cmd != DEVLINK_CMD_LINECARD_DEL);
+
+ if (!__devl_is_registered(devlink) || !devlink_nl_notify_need(devlink))
+ return;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return;
+
+ err = devlink_nl_linecard_fill(msg, devlink, linecard, cmd, 0, 0, 0,
+ NULL);
+ if (err) {
+ nlmsg_free(msg);
+ return;
+ }
+
+ devlink_nl_notify_send(devlink, msg);
+}
+
+void devlink_linecards_notify_register(struct devlink *devlink)
+{
+ struct devlink_linecard *linecard;
+
+ list_for_each_entry(linecard, &devlink->linecard_list, list)
+ devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
+}
+
+void devlink_linecards_notify_unregister(struct devlink *devlink)
+{
+ struct devlink_linecard *linecard;
+
+ list_for_each_entry_reverse(linecard, &devlink->linecard_list, list)
+ devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_DEL);
+}
+
+int devlink_nl_linecard_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_linecard *linecard;
+ struct sk_buff *msg;
+ int err;
+
+ linecard = devlink_linecard_get_from_info(devlink, info);
+ if (IS_ERR(linecard))
+ return PTR_ERR(linecard);
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ mutex_lock(&linecard->state_lock);
+ err = devlink_nl_linecard_fill(msg, devlink, linecard,
+ DEVLINK_CMD_LINECARD_NEW,
+ info->snd_portid, info->snd_seq, 0,
+ info->extack);
+ mutex_unlock(&linecard->state_lock);
+ if (err) {
+ nlmsg_free(msg);
+ return err;
+ }
+
+ return genlmsg_reply(msg, info);
+}
+
+static int devlink_nl_linecard_get_dump_one(struct sk_buff *msg,
+ struct devlink *devlink,
+ struct netlink_callback *cb,
+ int flags)
+{
+ struct devlink_nl_dump_state *state = devlink_dump_state(cb);
+ struct devlink_linecard *linecard;
+ int idx = 0;
+ int err = 0;
+
+ list_for_each_entry(linecard, &devlink->linecard_list, list) {
+ if (idx < state->idx) {
+ idx++;
+ continue;
+ }
+ mutex_lock(&linecard->state_lock);
+ err = devlink_nl_linecard_fill(msg, devlink, linecard,
+ DEVLINK_CMD_LINECARD_NEW,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, flags,
+ cb->extack);
+ mutex_unlock(&linecard->state_lock);
+ if (err) {
+ state->idx = idx;
+ break;
+ }
+ idx++;
+ }
+
+ return err;
+}
+
+int devlink_nl_linecard_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ return devlink_nl_dumpit(skb, cb, devlink_nl_linecard_get_dump_one);
+}
+
+static struct devlink_linecard_type *
+devlink_linecard_type_lookup(struct devlink_linecard *linecard,
+ const char *type)
+{
+ struct devlink_linecard_type *linecard_type;
+ int i;
+
+ for (i = 0; i < linecard->types_count; i++) {
+ linecard_type = &linecard->types[i];
+ if (!strcmp(type, linecard_type->type))
+ return linecard_type;
+ }
+ return NULL;
+}
+
+static int devlink_linecard_type_set(struct devlink_linecard *linecard,
+ const char *type,
+ struct netlink_ext_ack *extack)
+{
+ const struct devlink_linecard_ops *ops = linecard->ops;
+ struct devlink_linecard_type *linecard_type;
+ int err;
+
+ mutex_lock(&linecard->state_lock);
+ if (linecard->state == DEVLINK_LINECARD_STATE_PROVISIONING) {
+ NL_SET_ERR_MSG(extack, "Line card is currently being provisioned");
+ err = -EBUSY;
+ goto out;
+ }
+ if (linecard->state == DEVLINK_LINECARD_STATE_UNPROVISIONING) {
+ NL_SET_ERR_MSG(extack, "Line card is currently being unprovisioned");
+ err = -EBUSY;
+ goto out;
+ }
+
+ linecard_type = devlink_linecard_type_lookup(linecard, type);
+ if (!linecard_type) {
+ NL_SET_ERR_MSG(extack, "Unsupported line card type provided");
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (linecard->state != DEVLINK_LINECARD_STATE_UNPROVISIONED &&
+ linecard->state != DEVLINK_LINECARD_STATE_PROVISIONING_FAILED) {
+ NL_SET_ERR_MSG(extack, "Line card already provisioned");
+ err = -EBUSY;
+ /* Check if the line card is provisioned in the same
+ * way the user asks. In case it is, make the operation
+ * to return success.
+ */
+ if (ops->same_provision &&
+ ops->same_provision(linecard, linecard->priv,
+ linecard_type->type,
+ linecard_type->priv))
+ err = 0;
+ goto out;
+ }
+
+ linecard->state = DEVLINK_LINECARD_STATE_PROVISIONING;
+ linecard->type = linecard_type->type;
+ devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
+ mutex_unlock(&linecard->state_lock);
+ err = ops->provision(linecard, linecard->priv, linecard_type->type,
+ linecard_type->priv, extack);
+ if (err) {
+ /* Provisioning failed. Assume the linecard is unprovisioned
+ * for future operations.
+ */
+ mutex_lock(&linecard->state_lock);
+ linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED;
+ linecard->type = NULL;
+ devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
+ mutex_unlock(&linecard->state_lock);
+ }
+ return err;
+
+out:
+ mutex_unlock(&linecard->state_lock);
+ return err;
+}
+
+static int devlink_linecard_type_unset(struct devlink_linecard *linecard,
+ struct netlink_ext_ack *extack)
+{
+ int err;
+
+ mutex_lock(&linecard->state_lock);
+ if (linecard->state == DEVLINK_LINECARD_STATE_PROVISIONING) {
+ NL_SET_ERR_MSG(extack, "Line card is currently being provisioned");
+ err = -EBUSY;
+ goto out;
+ }
+ if (linecard->state == DEVLINK_LINECARD_STATE_UNPROVISIONING) {
+ NL_SET_ERR_MSG(extack, "Line card is currently being unprovisioned");
+ err = -EBUSY;
+ goto out;
+ }
+ if (linecard->state == DEVLINK_LINECARD_STATE_PROVISIONING_FAILED) {
+ linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED;
+ linecard->type = NULL;
+ devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
+ err = 0;
+ goto out;
+ }
+
+ if (linecard->state == DEVLINK_LINECARD_STATE_UNPROVISIONED) {
+ NL_SET_ERR_MSG(extack, "Line card is not provisioned");
+ err = 0;
+ goto out;
+ }
+ linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONING;
+ devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
+ mutex_unlock(&linecard->state_lock);
+ err = linecard->ops->unprovision(linecard, linecard->priv,
+ extack);
+ if (err) {
+ /* Unprovisioning failed. Assume the linecard is unprovisioned
+ * for future operations.
+ */
+ mutex_lock(&linecard->state_lock);
+ linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED;
+ linecard->type = NULL;
+ devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
+ mutex_unlock(&linecard->state_lock);
+ }
+ return err;
+
+out:
+ mutex_unlock(&linecard->state_lock);
+ return err;
+}
+
+int devlink_nl_linecard_set_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct netlink_ext_ack *extack = info->extack;
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_linecard *linecard;
+ int err;
+
+ linecard = devlink_linecard_get_from_info(devlink, info);
+ if (IS_ERR(linecard))
+ return PTR_ERR(linecard);
+
+ if (info->attrs[DEVLINK_ATTR_LINECARD_TYPE]) {
+ const char *type;
+
+ type = nla_data(info->attrs[DEVLINK_ATTR_LINECARD_TYPE]);
+ if (strcmp(type, "")) {
+ err = devlink_linecard_type_set(linecard, type, extack);
+ if (err)
+ return err;
+ } else {
+ err = devlink_linecard_type_unset(linecard, extack);
+ if (err)
+ return err;
+ }
+ }
+
+ return 0;
+}
+
+static int devlink_linecard_types_init(struct devlink_linecard *linecard)
+{
+ struct devlink_linecard_type *linecard_type;
+ unsigned int count;
+ int i;
+
+ count = linecard->ops->types_count(linecard, linecard->priv);
+ linecard->types = kmalloc_array(count, sizeof(*linecard_type),
+ GFP_KERNEL);
+ if (!linecard->types)
+ return -ENOMEM;
+ linecard->types_count = count;
+
+ for (i = 0; i < count; i++) {
+ linecard_type = &linecard->types[i];
+ linecard->ops->types_get(linecard, linecard->priv, i,
+ &linecard_type->type,
+ &linecard_type->priv);
+ }
+ return 0;
+}
+
+static void devlink_linecard_types_fini(struct devlink_linecard *linecard)
+{
+ kfree(linecard->types);
+}
+
+/**
+ * devl_linecard_create - Create devlink linecard
+ *
+ * @devlink: devlink
+ * @linecard_index: driver-specific numerical identifier of the linecard
+ * @ops: linecards ops
+ * @priv: user priv pointer
+ *
+ * Create devlink linecard instance with provided linecard index.
+ * Caller can use any indexing, even hw-related one.
+ *
+ * Return: Line card structure or an ERR_PTR() encoded error code.
+ */
+struct devlink_linecard *
+devl_linecard_create(struct devlink *devlink, unsigned int linecard_index,
+ const struct devlink_linecard_ops *ops, void *priv)
+{
+ struct devlink_linecard *linecard;
+ int err;
+
+ if (WARN_ON(!ops || !ops->provision || !ops->unprovision ||
+ !ops->types_count || !ops->types_get))
+ return ERR_PTR(-EINVAL);
+
+ if (devlink_linecard_index_exists(devlink, linecard_index))
+ return ERR_PTR(-EEXIST);
+
+ linecard = kzalloc(sizeof(*linecard), GFP_KERNEL);
+ if (!linecard)
+ return ERR_PTR(-ENOMEM);
+
+ linecard->devlink = devlink;
+ linecard->index = linecard_index;
+ linecard->ops = ops;
+ linecard->priv = priv;
+ linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED;
+ mutex_init(&linecard->state_lock);
+
+ err = devlink_linecard_types_init(linecard);
+ if (err) {
+ mutex_destroy(&linecard->state_lock);
+ kfree(linecard);
+ return ERR_PTR(err);
+ }
+
+ list_add_tail(&linecard->list, &devlink->linecard_list);
+ devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
+ return linecard;
+}
+EXPORT_SYMBOL_GPL(devl_linecard_create);
+
+/**
+ * devl_linecard_destroy - Destroy devlink linecard
+ *
+ * @linecard: devlink linecard
+ */
+void devl_linecard_destroy(struct devlink_linecard *linecard)
+{
+ devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_DEL);
+ list_del(&linecard->list);
+ devlink_linecard_types_fini(linecard);
+ mutex_destroy(&linecard->state_lock);
+ kfree(linecard);
+}
+EXPORT_SYMBOL_GPL(devl_linecard_destroy);
+
+/**
+ * devlink_linecard_provision_set - Set provisioning on linecard
+ *
+ * @linecard: devlink linecard
+ * @type: linecard type
+ *
+ * This is either called directly from the provision() op call or
+ * as a result of the provision() op call asynchronously.
+ */
+void devlink_linecard_provision_set(struct devlink_linecard *linecard,
+ const char *type)
+{
+ mutex_lock(&linecard->state_lock);
+ WARN_ON(linecard->type && strcmp(linecard->type, type));
+ linecard->state = DEVLINK_LINECARD_STATE_PROVISIONED;
+ linecard->type = type;
+ devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
+ mutex_unlock(&linecard->state_lock);
+}
+EXPORT_SYMBOL_GPL(devlink_linecard_provision_set);
+
+/**
+ * devlink_linecard_provision_clear - Clear provisioning on linecard
+ *
+ * @linecard: devlink linecard
+ *
+ * This is either called directly from the unprovision() op call or
+ * as a result of the unprovision() op call asynchronously.
+ */
+void devlink_linecard_provision_clear(struct devlink_linecard *linecard)
+{
+ mutex_lock(&linecard->state_lock);
+ linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED;
+ linecard->type = NULL;
+ devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
+ mutex_unlock(&linecard->state_lock);
+}
+EXPORT_SYMBOL_GPL(devlink_linecard_provision_clear);
+
+/**
+ * devlink_linecard_provision_fail - Fail provisioning on linecard
+ *
+ * @linecard: devlink linecard
+ *
+ * This is either called directly from the provision() op call or
+ * as a result of the provision() op call asynchronously.
+ */
+void devlink_linecard_provision_fail(struct devlink_linecard *linecard)
+{
+ mutex_lock(&linecard->state_lock);
+ linecard->state = DEVLINK_LINECARD_STATE_PROVISIONING_FAILED;
+ devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
+ mutex_unlock(&linecard->state_lock);
+}
+EXPORT_SYMBOL_GPL(devlink_linecard_provision_fail);
+
+/**
+ * devlink_linecard_activate - Set linecard active
+ *
+ * @linecard: devlink linecard
+ */
+void devlink_linecard_activate(struct devlink_linecard *linecard)
+{
+ mutex_lock(&linecard->state_lock);
+ WARN_ON(linecard->state != DEVLINK_LINECARD_STATE_PROVISIONED);
+ linecard->state = DEVLINK_LINECARD_STATE_ACTIVE;
+ devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
+ mutex_unlock(&linecard->state_lock);
+}
+EXPORT_SYMBOL_GPL(devlink_linecard_activate);
+
+/**
+ * devlink_linecard_deactivate - Set linecard inactive
+ *
+ * @linecard: devlink linecard
+ */
+void devlink_linecard_deactivate(struct devlink_linecard *linecard)
+{
+ mutex_lock(&linecard->state_lock);
+ switch (linecard->state) {
+ case DEVLINK_LINECARD_STATE_ACTIVE:
+ linecard->state = DEVLINK_LINECARD_STATE_PROVISIONED;
+ devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
+ break;
+ case DEVLINK_LINECARD_STATE_UNPROVISIONING:
+ /* Line card is being deactivated as part
+ * of unprovisioning flow.
+ */
+ break;
+ default:
+ WARN_ON(1);
+ break;
+ }
+ mutex_unlock(&linecard->state_lock);
+}
+EXPORT_SYMBOL_GPL(devlink_linecard_deactivate);
+
+static void devlink_linecard_rel_notify_cb(struct devlink *devlink,
+ u32 linecard_index)
+{
+ struct devlink_linecard *linecard;
+
+ linecard = devlink_linecard_get_by_index(devlink, linecard_index);
+ if (!linecard)
+ return;
+ devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
+}
+
+static void devlink_linecard_rel_cleanup_cb(struct devlink *devlink,
+ u32 linecard_index, u32 rel_index)
+{
+ struct devlink_linecard *linecard;
+
+ linecard = devlink_linecard_get_by_index(devlink, linecard_index);
+ if (linecard && linecard->rel_index == rel_index)
+ linecard->rel_index = 0;
+}
+
+/**
+ * devlink_linecard_nested_dl_set - Attach/detach nested devlink
+ * instance to linecard.
+ *
+ * @linecard: devlink linecard
+ * @nested_devlink: devlink instance to attach or NULL to detach
+ */
+int devlink_linecard_nested_dl_set(struct devlink_linecard *linecard,
+ struct devlink *nested_devlink)
+{
+ return devlink_rel_nested_in_add(&linecard->rel_index,
+ linecard->devlink->index,
+ linecard->index,
+ devlink_linecard_rel_notify_cb,
+ devlink_linecard_rel_cleanup_cb,
+ nested_devlink);
+}
+EXPORT_SYMBOL_GPL(devlink_linecard_nested_dl_set);
diff --git a/net/devlink/netlink.c b/net/devlink/netlink.c
new file mode 100644
index 000000000000..593605c1b1ef
--- /dev/null
+++ b/net/devlink/netlink.c
@@ -0,0 +1,376 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com>
+ */
+
+#include <net/genetlink.h>
+#include <net/sock.h>
+
+#include "devl_internal.h"
+
+#define DEVLINK_NL_FLAG_NEED_PORT BIT(0)
+#define DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT BIT(1)
+#define DEVLINK_NL_FLAG_NEED_DEV_LOCK BIT(2)
+
+static const struct genl_multicast_group devlink_nl_mcgrps[] = {
+ [DEVLINK_MCGRP_CONFIG] = { .name = DEVLINK_GENL_MCGRP_CONFIG_NAME },
+};
+
+struct devlink_nl_sock_priv {
+ struct devlink_obj_desc __rcu *flt;
+ spinlock_t flt_lock; /* Protects flt. */
+};
+
+static void devlink_nl_sock_priv_init(void *priv)
+{
+ struct devlink_nl_sock_priv *sk_priv = priv;
+
+ spin_lock_init(&sk_priv->flt_lock);
+}
+
+static void devlink_nl_sock_priv_destroy(void *priv)
+{
+ struct devlink_nl_sock_priv *sk_priv = priv;
+ struct devlink_obj_desc *flt;
+
+ flt = rcu_dereference_protected(sk_priv->flt, true);
+ kfree_rcu(flt, rcu);
+}
+
+int devlink_nl_notify_filter_set_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink_nl_sock_priv *sk_priv;
+ struct nlattr **attrs = info->attrs;
+ struct devlink_obj_desc *flt;
+ size_t data_offset = 0;
+ size_t data_size = 0;
+ char *pos;
+
+ if (attrs[DEVLINK_ATTR_BUS_NAME])
+ data_size = size_add(data_size,
+ nla_len(attrs[DEVLINK_ATTR_BUS_NAME]) + 1);
+ if (attrs[DEVLINK_ATTR_DEV_NAME])
+ data_size = size_add(data_size,
+ nla_len(attrs[DEVLINK_ATTR_DEV_NAME]) + 1);
+
+ flt = kzalloc(size_add(sizeof(*flt), data_size), GFP_KERNEL);
+ if (!flt)
+ return -ENOMEM;
+
+ pos = (char *) flt->data;
+ if (attrs[DEVLINK_ATTR_BUS_NAME]) {
+ data_offset += nla_strscpy(pos,
+ attrs[DEVLINK_ATTR_BUS_NAME],
+ data_size) + 1;
+ flt->bus_name = pos;
+ pos += data_offset;
+ }
+ if (attrs[DEVLINK_ATTR_DEV_NAME]) {
+ nla_strscpy(pos, attrs[DEVLINK_ATTR_DEV_NAME],
+ data_size - data_offset);
+ flt->dev_name = pos;
+ }
+
+ if (attrs[DEVLINK_ATTR_PORT_INDEX]) {
+ flt->port_index = nla_get_u32(attrs[DEVLINK_ATTR_PORT_INDEX]);
+ flt->port_index_valid = true;
+ }
+
+ /* Don't attach empty filter. */
+ if (!flt->bus_name && !flt->dev_name && !flt->port_index_valid) {
+ kfree(flt);
+ flt = NULL;
+ }
+
+ sk_priv = genl_sk_priv_get(&devlink_nl_family, NETLINK_CB(skb).sk);
+ if (IS_ERR(sk_priv)) {
+ kfree(flt);
+ return PTR_ERR(sk_priv);
+ }
+ spin_lock(&sk_priv->flt_lock);
+ flt = rcu_replace_pointer(sk_priv->flt, flt,
+ lockdep_is_held(&sk_priv->flt_lock));
+ spin_unlock(&sk_priv->flt_lock);
+ kfree_rcu(flt, rcu);
+ return 0;
+}
+
+static bool devlink_obj_desc_match(const struct devlink_obj_desc *desc,
+ const struct devlink_obj_desc *flt)
+{
+ if (desc->bus_name && flt->bus_name &&
+ strcmp(desc->bus_name, flt->bus_name))
+ return false;
+ if (desc->dev_name && flt->dev_name &&
+ strcmp(desc->dev_name, flt->dev_name))
+ return false;
+ if (desc->port_index_valid && flt->port_index_valid &&
+ desc->port_index != flt->port_index)
+ return false;
+ return true;
+}
+
+int devlink_nl_notify_filter(struct sock *dsk, struct sk_buff *skb, void *data)
+{
+ struct devlink_obj_desc *desc = data;
+ struct devlink_nl_sock_priv *sk_priv;
+ struct devlink_obj_desc *flt;
+ int ret = 0;
+
+ rcu_read_lock();
+ sk_priv = __genl_sk_priv_get(&devlink_nl_family, dsk);
+ if (!IS_ERR_OR_NULL(sk_priv)) {
+ flt = rcu_dereference(sk_priv->flt);
+ if (flt)
+ ret = !devlink_obj_desc_match(desc, flt);
+ }
+ rcu_read_unlock();
+ return ret;
+}
+
+int devlink_nl_put_nested_handle(struct sk_buff *msg, struct net *net,
+ struct devlink *devlink, int attrtype)
+{
+ struct nlattr *nested_attr;
+ struct net *devl_net;
+
+ nested_attr = nla_nest_start(msg, attrtype);
+ if (!nested_attr)
+ return -EMSGSIZE;
+ if (devlink_nl_put_handle(msg, devlink))
+ goto nla_put_failure;
+
+ rcu_read_lock();
+ devl_net = read_pnet_rcu(&devlink->_net);
+ if (!net_eq(net, devl_net)) {
+ int id = peernet2id_alloc(net, devl_net, GFP_ATOMIC);
+
+ rcu_read_unlock();
+ if (nla_put_s32(msg, DEVLINK_ATTR_NETNS_ID, id))
+ return -EMSGSIZE;
+ } else {
+ rcu_read_unlock();
+ }
+
+ nla_nest_end(msg, nested_attr);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(msg, nested_attr);
+ return -EMSGSIZE;
+}
+
+int devlink_nl_msg_reply_and_new(struct sk_buff **msg, struct genl_info *info)
+{
+ int err;
+
+ if (*msg) {
+ err = genlmsg_reply(*msg, info);
+ if (err)
+ return err;
+ }
+ *msg = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!*msg)
+ return -ENOMEM;
+ return 0;
+}
+
+struct devlink *
+devlink_get_from_attrs_lock(struct net *net, struct nlattr **attrs,
+ bool dev_lock)
+{
+ struct devlink *devlink;
+ unsigned long index;
+ char *busname;
+ char *devname;
+
+ if (!attrs[DEVLINK_ATTR_BUS_NAME] || !attrs[DEVLINK_ATTR_DEV_NAME])
+ return ERR_PTR(-EINVAL);
+
+ busname = nla_data(attrs[DEVLINK_ATTR_BUS_NAME]);
+ devname = nla_data(attrs[DEVLINK_ATTR_DEV_NAME]);
+
+ devlinks_xa_for_each_registered_get(net, index, devlink) {
+ if (strcmp(devlink->dev->bus->name, busname) == 0 &&
+ strcmp(dev_name(devlink->dev), devname) == 0) {
+ devl_dev_lock(devlink, dev_lock);
+ if (devl_is_registered(devlink))
+ return devlink;
+ devl_dev_unlock(devlink, dev_lock);
+ }
+ devlink_put(devlink);
+ }
+
+ return ERR_PTR(-ENODEV);
+}
+
+static int __devlink_nl_pre_doit(struct sk_buff *skb, struct genl_info *info,
+ u8 flags)
+{
+ bool dev_lock = flags & DEVLINK_NL_FLAG_NEED_DEV_LOCK;
+ struct devlink_port *devlink_port;
+ struct devlink *devlink;
+ int err;
+
+ devlink = devlink_get_from_attrs_lock(genl_info_net(info), info->attrs,
+ dev_lock);
+ if (IS_ERR(devlink))
+ return PTR_ERR(devlink);
+
+ info->user_ptr[0] = devlink;
+ if (flags & DEVLINK_NL_FLAG_NEED_PORT) {
+ devlink_port = devlink_port_get_from_info(devlink, info);
+ if (IS_ERR(devlink_port)) {
+ err = PTR_ERR(devlink_port);
+ goto unlock;
+ }
+ info->user_ptr[1] = devlink_port;
+ } else if (flags & DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT) {
+ devlink_port = devlink_port_get_from_info(devlink, info);
+ if (!IS_ERR(devlink_port))
+ info->user_ptr[1] = devlink_port;
+ }
+ return 0;
+
+unlock:
+ devl_dev_unlock(devlink, dev_lock);
+ devlink_put(devlink);
+ return err;
+}
+
+int devlink_nl_pre_doit(const struct genl_split_ops *ops,
+ struct sk_buff *skb, struct genl_info *info)
+{
+ return __devlink_nl_pre_doit(skb, info, 0);
+}
+
+int devlink_nl_pre_doit_port(const struct genl_split_ops *ops,
+ struct sk_buff *skb, struct genl_info *info)
+{
+ return __devlink_nl_pre_doit(skb, info, DEVLINK_NL_FLAG_NEED_PORT);
+}
+
+int devlink_nl_pre_doit_dev_lock(const struct genl_split_ops *ops,
+ struct sk_buff *skb, struct genl_info *info)
+{
+ return __devlink_nl_pre_doit(skb, info, DEVLINK_NL_FLAG_NEED_DEV_LOCK);
+}
+
+int devlink_nl_pre_doit_port_optional(const struct genl_split_ops *ops,
+ struct sk_buff *skb,
+ struct genl_info *info)
+{
+ return __devlink_nl_pre_doit(skb, info, DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT);
+}
+
+static void __devlink_nl_post_doit(struct sk_buff *skb, struct genl_info *info,
+ u8 flags)
+{
+ bool dev_lock = flags & DEVLINK_NL_FLAG_NEED_DEV_LOCK;
+ struct devlink *devlink;
+
+ devlink = info->user_ptr[0];
+ devl_dev_unlock(devlink, dev_lock);
+ devlink_put(devlink);
+}
+
+void devlink_nl_post_doit(const struct genl_split_ops *ops,
+ struct sk_buff *skb, struct genl_info *info)
+{
+ __devlink_nl_post_doit(skb, info, 0);
+}
+
+void
+devlink_nl_post_doit_dev_lock(const struct genl_split_ops *ops,
+ struct sk_buff *skb, struct genl_info *info)
+{
+ __devlink_nl_post_doit(skb, info, DEVLINK_NL_FLAG_NEED_DEV_LOCK);
+}
+
+static int devlink_nl_inst_single_dumpit(struct sk_buff *msg,
+ struct netlink_callback *cb, int flags,
+ devlink_nl_dump_one_func_t *dump_one,
+ struct nlattr **attrs)
+{
+ struct devlink *devlink;
+ int err;
+
+ devlink = devlink_get_from_attrs_lock(sock_net(msg->sk), attrs, false);
+ if (IS_ERR(devlink))
+ return PTR_ERR(devlink);
+ err = dump_one(msg, devlink, cb, flags | NLM_F_DUMP_FILTERED);
+
+ devl_unlock(devlink);
+ devlink_put(devlink);
+
+ if (err != -EMSGSIZE)
+ return err;
+ return msg->len;
+}
+
+static int devlink_nl_inst_iter_dumpit(struct sk_buff *msg,
+ struct netlink_callback *cb, int flags,
+ devlink_nl_dump_one_func_t *dump_one)
+{
+ struct devlink_nl_dump_state *state = devlink_dump_state(cb);
+ struct devlink *devlink;
+ int err = 0;
+
+ while ((devlink = devlinks_xa_find_get(sock_net(msg->sk),
+ &state->instance))) {
+ devl_lock(devlink);
+
+ if (devl_is_registered(devlink))
+ err = dump_one(msg, devlink, cb, flags);
+ else
+ err = 0;
+
+ devl_unlock(devlink);
+ devlink_put(devlink);
+
+ if (err)
+ break;
+
+ state->instance++;
+
+ /* restart sub-object walk for the next instance */
+ state->idx = 0;
+ }
+
+ if (err != -EMSGSIZE)
+ return err;
+ return msg->len;
+}
+
+int devlink_nl_dumpit(struct sk_buff *msg, struct netlink_callback *cb,
+ devlink_nl_dump_one_func_t *dump_one)
+{
+ const struct genl_info *info = genl_info_dump(cb);
+ struct nlattr **attrs = info->attrs;
+ int flags = NLM_F_MULTI;
+
+ if (attrs &&
+ (attrs[DEVLINK_ATTR_BUS_NAME] || attrs[DEVLINK_ATTR_DEV_NAME]))
+ return devlink_nl_inst_single_dumpit(msg, cb, flags, dump_one,
+ attrs);
+ else
+ return devlink_nl_inst_iter_dumpit(msg, cb, flags, dump_one);
+}
+
+struct genl_family devlink_nl_family __ro_after_init = {
+ .name = DEVLINK_GENL_NAME,
+ .version = DEVLINK_GENL_VERSION,
+ .netnsok = true,
+ .parallel_ops = true,
+ .module = THIS_MODULE,
+ .split_ops = devlink_nl_ops,
+ .n_split_ops = ARRAY_SIZE(devlink_nl_ops),
+ .resv_start_op = DEVLINK_CMD_SELFTESTS_RUN + 1,
+ .mcgrps = devlink_nl_mcgrps,
+ .n_mcgrps = ARRAY_SIZE(devlink_nl_mcgrps),
+ .sock_priv_size = sizeof(struct devlink_nl_sock_priv),
+ .sock_priv_init = devlink_nl_sock_priv_init,
+ .sock_priv_destroy = devlink_nl_sock_priv_destroy,
+};
diff --git a/net/devlink/netlink_gen.c b/net/devlink/netlink_gen.c
new file mode 100644
index 000000000000..f4c61c2b4f22
--- /dev/null
+++ b/net/devlink/netlink_gen.c
@@ -0,0 +1,1287 @@
+// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause)
+/* Do not edit directly, auto-generated from: */
+/* Documentation/netlink/specs/devlink.yaml */
+/* YNL-GEN kernel source */
+/* To regenerate run: tools/net/ynl/ynl-regen.sh */
+
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+#include "netlink_gen.h"
+
+#include <uapi/linux/devlink.h>
+
+/* Sparse enums validation callbacks */
+static int
+devlink_attr_param_type_validate(const struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ switch (nla_get_u8(attr)) {
+ case DEVLINK_VAR_ATTR_TYPE_U8:
+ fallthrough;
+ case DEVLINK_VAR_ATTR_TYPE_U16:
+ fallthrough;
+ case DEVLINK_VAR_ATTR_TYPE_U32:
+ fallthrough;
+ case DEVLINK_VAR_ATTR_TYPE_U64:
+ fallthrough;
+ case DEVLINK_VAR_ATTR_TYPE_STRING:
+ fallthrough;
+ case DEVLINK_VAR_ATTR_TYPE_FLAG:
+ fallthrough;
+ case DEVLINK_VAR_ATTR_TYPE_NUL_STRING:
+ fallthrough;
+ case DEVLINK_VAR_ATTR_TYPE_BINARY:
+ return 0;
+ }
+ NL_SET_ERR_MSG_ATTR(extack, attr, "invalid enum value");
+ return -EINVAL;
+}
+
+/* Common nested types */
+const struct nla_policy devlink_dl_port_function_nl_policy[DEVLINK_PORT_FN_ATTR_CAPS + 1] = {
+ [DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR] = { .type = NLA_BINARY, },
+ [DEVLINK_PORT_FN_ATTR_STATE] = NLA_POLICY_MAX(NLA_U8, 1),
+ [DEVLINK_PORT_FN_ATTR_OPSTATE] = NLA_POLICY_MAX(NLA_U8, 1),
+ [DEVLINK_PORT_FN_ATTR_CAPS] = NLA_POLICY_BITFIELD32(15),
+};
+
+const struct nla_policy devlink_dl_rate_tc_bws_nl_policy[DEVLINK_RATE_TC_ATTR_BW + 1] = {
+ [DEVLINK_RATE_TC_ATTR_INDEX] = NLA_POLICY_MAX(NLA_U8, DEVLINK_RATE_TC_INDEX_MAX),
+ [DEVLINK_RATE_TC_ATTR_BW] = { .type = NLA_U32, },
+};
+
+const struct nla_policy devlink_dl_selftest_id_nl_policy[DEVLINK_ATTR_SELFTEST_ID_FLASH + 1] = {
+ [DEVLINK_ATTR_SELFTEST_ID_FLASH] = { .type = NLA_FLAG, },
+};
+
+/* DEVLINK_CMD_GET - do */
+static const struct nla_policy devlink_get_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+};
+
+/* DEVLINK_CMD_PORT_GET - do */
+static const struct nla_policy devlink_port_get_do_nl_policy[DEVLINK_ATTR_PORT_INDEX + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, },
+};
+
+/* DEVLINK_CMD_PORT_GET - dump */
+static const struct nla_policy devlink_port_get_dump_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+};
+
+/* DEVLINK_CMD_PORT_SET - do */
+static const struct nla_policy devlink_port_set_nl_policy[DEVLINK_ATTR_PORT_FUNCTION + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_PORT_TYPE] = NLA_POLICY_MAX(NLA_U16, 3),
+ [DEVLINK_ATTR_PORT_FUNCTION] = NLA_POLICY_NESTED(devlink_dl_port_function_nl_policy),
+};
+
+/* DEVLINK_CMD_PORT_NEW - do */
+static const struct nla_policy devlink_port_new_nl_policy[DEVLINK_ATTR_PORT_PCI_SF_NUMBER + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_PORT_FLAVOUR] = NLA_POLICY_MAX(NLA_U16, 7),
+ [DEVLINK_ATTR_PORT_PCI_PF_NUMBER] = { .type = NLA_U16, },
+ [DEVLINK_ATTR_PORT_PCI_SF_NUMBER] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_PORT_CONTROLLER_NUMBER] = { .type = NLA_U32, },
+};
+
+/* DEVLINK_CMD_PORT_DEL - do */
+static const struct nla_policy devlink_port_del_nl_policy[DEVLINK_ATTR_PORT_INDEX + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, },
+};
+
+/* DEVLINK_CMD_PORT_SPLIT - do */
+static const struct nla_policy devlink_port_split_nl_policy[DEVLINK_ATTR_PORT_SPLIT_COUNT + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_PORT_SPLIT_COUNT] = { .type = NLA_U32, },
+};
+
+/* DEVLINK_CMD_PORT_UNSPLIT - do */
+static const struct nla_policy devlink_port_unsplit_nl_policy[DEVLINK_ATTR_PORT_INDEX + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, },
+};
+
+/* DEVLINK_CMD_SB_GET - do */
+static const struct nla_policy devlink_sb_get_do_nl_policy[DEVLINK_ATTR_SB_INDEX + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_SB_INDEX] = { .type = NLA_U32, },
+};
+
+/* DEVLINK_CMD_SB_GET - dump */
+static const struct nla_policy devlink_sb_get_dump_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+};
+
+/* DEVLINK_CMD_SB_POOL_GET - do */
+static const struct nla_policy devlink_sb_pool_get_do_nl_policy[DEVLINK_ATTR_SB_POOL_INDEX + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_SB_INDEX] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_SB_POOL_INDEX] = { .type = NLA_U16, },
+};
+
+/* DEVLINK_CMD_SB_POOL_GET - dump */
+static const struct nla_policy devlink_sb_pool_get_dump_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+};
+
+/* DEVLINK_CMD_SB_POOL_SET - do */
+static const struct nla_policy devlink_sb_pool_set_nl_policy[DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_SB_INDEX] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_SB_POOL_INDEX] = { .type = NLA_U16, },
+ [DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE] = NLA_POLICY_MAX(NLA_U8, 1),
+ [DEVLINK_ATTR_SB_POOL_SIZE] = { .type = NLA_U32, },
+};
+
+/* DEVLINK_CMD_SB_PORT_POOL_GET - do */
+static const struct nla_policy devlink_sb_port_pool_get_do_nl_policy[DEVLINK_ATTR_SB_POOL_INDEX + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_SB_INDEX] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_SB_POOL_INDEX] = { .type = NLA_U16, },
+};
+
+/* DEVLINK_CMD_SB_PORT_POOL_GET - dump */
+static const struct nla_policy devlink_sb_port_pool_get_dump_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+};
+
+/* DEVLINK_CMD_SB_PORT_POOL_SET - do */
+static const struct nla_policy devlink_sb_port_pool_set_nl_policy[DEVLINK_ATTR_SB_THRESHOLD + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_SB_INDEX] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_SB_POOL_INDEX] = { .type = NLA_U16, },
+ [DEVLINK_ATTR_SB_THRESHOLD] = { .type = NLA_U32, },
+};
+
+/* DEVLINK_CMD_SB_TC_POOL_BIND_GET - do */
+static const struct nla_policy devlink_sb_tc_pool_bind_get_do_nl_policy[DEVLINK_ATTR_SB_TC_INDEX + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_SB_INDEX] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_SB_POOL_TYPE] = NLA_POLICY_MAX(NLA_U8, 1),
+ [DEVLINK_ATTR_SB_TC_INDEX] = { .type = NLA_U16, },
+};
+
+/* DEVLINK_CMD_SB_TC_POOL_BIND_GET - dump */
+static const struct nla_policy devlink_sb_tc_pool_bind_get_dump_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+};
+
+/* DEVLINK_CMD_SB_TC_POOL_BIND_SET - do */
+static const struct nla_policy devlink_sb_tc_pool_bind_set_nl_policy[DEVLINK_ATTR_SB_TC_INDEX + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_SB_INDEX] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_SB_POOL_INDEX] = { .type = NLA_U16, },
+ [DEVLINK_ATTR_SB_POOL_TYPE] = NLA_POLICY_MAX(NLA_U8, 1),
+ [DEVLINK_ATTR_SB_TC_INDEX] = { .type = NLA_U16, },
+ [DEVLINK_ATTR_SB_THRESHOLD] = { .type = NLA_U32, },
+};
+
+/* DEVLINK_CMD_SB_OCC_SNAPSHOT - do */
+static const struct nla_policy devlink_sb_occ_snapshot_nl_policy[DEVLINK_ATTR_SB_INDEX + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_SB_INDEX] = { .type = NLA_U32, },
+};
+
+/* DEVLINK_CMD_SB_OCC_MAX_CLEAR - do */
+static const struct nla_policy devlink_sb_occ_max_clear_nl_policy[DEVLINK_ATTR_SB_INDEX + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_SB_INDEX] = { .type = NLA_U32, },
+};
+
+/* DEVLINK_CMD_ESWITCH_GET - do */
+static const struct nla_policy devlink_eswitch_get_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+};
+
+/* DEVLINK_CMD_ESWITCH_SET - do */
+static const struct nla_policy devlink_eswitch_set_nl_policy[DEVLINK_ATTR_ESWITCH_ENCAP_MODE + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_ESWITCH_MODE] = NLA_POLICY_MAX(NLA_U16, 2),
+ [DEVLINK_ATTR_ESWITCH_INLINE_MODE] = NLA_POLICY_MAX(NLA_U8, 3),
+ [DEVLINK_ATTR_ESWITCH_ENCAP_MODE] = NLA_POLICY_MAX(NLA_U8, 1),
+};
+
+/* DEVLINK_CMD_DPIPE_TABLE_GET - do */
+static const struct nla_policy devlink_dpipe_table_get_nl_policy[DEVLINK_ATTR_DPIPE_TABLE_NAME + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DPIPE_TABLE_NAME] = { .type = NLA_NUL_STRING, },
+};
+
+/* DEVLINK_CMD_DPIPE_ENTRIES_GET - do */
+static const struct nla_policy devlink_dpipe_entries_get_nl_policy[DEVLINK_ATTR_DPIPE_TABLE_NAME + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DPIPE_TABLE_NAME] = { .type = NLA_NUL_STRING, },
+};
+
+/* DEVLINK_CMD_DPIPE_HEADERS_GET - do */
+static const struct nla_policy devlink_dpipe_headers_get_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+};
+
+/* DEVLINK_CMD_DPIPE_TABLE_COUNTERS_SET - do */
+static const struct nla_policy devlink_dpipe_table_counters_set_nl_policy[DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DPIPE_TABLE_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED] = { .type = NLA_U8, },
+};
+
+/* DEVLINK_CMD_RESOURCE_SET - do */
+static const struct nla_policy devlink_resource_set_nl_policy[DEVLINK_ATTR_RESOURCE_SIZE + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_RESOURCE_ID] = { .type = NLA_U64, },
+ [DEVLINK_ATTR_RESOURCE_SIZE] = { .type = NLA_U64, },
+};
+
+/* DEVLINK_CMD_RESOURCE_DUMP - do */
+static const struct nla_policy devlink_resource_dump_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+};
+
+/* DEVLINK_CMD_RELOAD - do */
+static const struct nla_policy devlink_reload_nl_policy[DEVLINK_ATTR_RELOAD_LIMITS + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_RELOAD_ACTION] = NLA_POLICY_RANGE(NLA_U8, 1, 2),
+ [DEVLINK_ATTR_RELOAD_LIMITS] = NLA_POLICY_BITFIELD32(6),
+ [DEVLINK_ATTR_NETNS_PID] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_NETNS_FD] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_NETNS_ID] = { .type = NLA_U32, },
+};
+
+/* DEVLINK_CMD_PARAM_GET - do */
+static const struct nla_policy devlink_param_get_do_nl_policy[DEVLINK_ATTR_PARAM_NAME + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_PARAM_NAME] = { .type = NLA_NUL_STRING, },
+};
+
+/* DEVLINK_CMD_PARAM_GET - dump */
+static const struct nla_policy devlink_param_get_dump_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+};
+
+/* DEVLINK_CMD_PARAM_SET - do */
+static const struct nla_policy devlink_param_set_nl_policy[DEVLINK_ATTR_PARAM_RESET_DEFAULT + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_PARAM_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_PARAM_TYPE] = NLA_POLICY_VALIDATE_FN(NLA_U8, &devlink_attr_param_type_validate),
+ [DEVLINK_ATTR_PARAM_VALUE_CMODE] = NLA_POLICY_MAX(NLA_U8, 2),
+ [DEVLINK_ATTR_PARAM_RESET_DEFAULT] = { .type = NLA_FLAG, },
+};
+
+/* DEVLINK_CMD_REGION_GET - do */
+static const struct nla_policy devlink_region_get_do_nl_policy[DEVLINK_ATTR_REGION_NAME + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_REGION_NAME] = { .type = NLA_NUL_STRING, },
+};
+
+/* DEVLINK_CMD_REGION_GET - dump */
+static const struct nla_policy devlink_region_get_dump_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+};
+
+/* DEVLINK_CMD_REGION_NEW - do */
+static const struct nla_policy devlink_region_new_nl_policy[DEVLINK_ATTR_REGION_SNAPSHOT_ID + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_REGION_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_REGION_SNAPSHOT_ID] = { .type = NLA_U32, },
+};
+
+/* DEVLINK_CMD_REGION_DEL - do */
+static const struct nla_policy devlink_region_del_nl_policy[DEVLINK_ATTR_REGION_SNAPSHOT_ID + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_REGION_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_REGION_SNAPSHOT_ID] = { .type = NLA_U32, },
+};
+
+/* DEVLINK_CMD_REGION_READ - dump */
+static const struct nla_policy devlink_region_read_nl_policy[DEVLINK_ATTR_REGION_DIRECT + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_REGION_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_REGION_SNAPSHOT_ID] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_REGION_DIRECT] = { .type = NLA_FLAG, },
+ [DEVLINK_ATTR_REGION_CHUNK_ADDR] = { .type = NLA_U64, },
+ [DEVLINK_ATTR_REGION_CHUNK_LEN] = { .type = NLA_U64, },
+};
+
+/* DEVLINK_CMD_PORT_PARAM_GET - do */
+static const struct nla_policy devlink_port_param_get_nl_policy[DEVLINK_ATTR_PORT_INDEX + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, },
+};
+
+/* DEVLINK_CMD_PORT_PARAM_SET - do */
+static const struct nla_policy devlink_port_param_set_nl_policy[DEVLINK_ATTR_PORT_INDEX + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, },
+};
+
+/* DEVLINK_CMD_INFO_GET - do */
+static const struct nla_policy devlink_info_get_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+};
+
+/* DEVLINK_CMD_HEALTH_REPORTER_GET - do */
+static const struct nla_policy devlink_health_reporter_get_do_nl_policy[DEVLINK_ATTR_HEALTH_REPORTER_NAME + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_HEALTH_REPORTER_NAME] = { .type = NLA_NUL_STRING, },
+};
+
+/* DEVLINK_CMD_HEALTH_REPORTER_GET - dump */
+static const struct nla_policy devlink_health_reporter_get_dump_nl_policy[DEVLINK_ATTR_PORT_INDEX + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, },
+};
+
+/* DEVLINK_CMD_HEALTH_REPORTER_SET - do */
+static const struct nla_policy devlink_health_reporter_set_nl_policy[DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_HEALTH_REPORTER_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] = { .type = NLA_U64, },
+ [DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER] = { .type = NLA_U8, },
+ [DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP] = { .type = NLA_U8, },
+ [DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD] = { .type = NLA_U64, },
+};
+
+/* DEVLINK_CMD_HEALTH_REPORTER_RECOVER - do */
+static const struct nla_policy devlink_health_reporter_recover_nl_policy[DEVLINK_ATTR_HEALTH_REPORTER_NAME + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_HEALTH_REPORTER_NAME] = { .type = NLA_NUL_STRING, },
+};
+
+/* DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE - do */
+static const struct nla_policy devlink_health_reporter_diagnose_nl_policy[DEVLINK_ATTR_HEALTH_REPORTER_NAME + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_HEALTH_REPORTER_NAME] = { .type = NLA_NUL_STRING, },
+};
+
+/* DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET - dump */
+static const struct nla_policy devlink_health_reporter_dump_get_nl_policy[DEVLINK_ATTR_HEALTH_REPORTER_NAME + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_HEALTH_REPORTER_NAME] = { .type = NLA_NUL_STRING, },
+};
+
+/* DEVLINK_CMD_HEALTH_REPORTER_DUMP_CLEAR - do */
+static const struct nla_policy devlink_health_reporter_dump_clear_nl_policy[DEVLINK_ATTR_HEALTH_REPORTER_NAME + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_HEALTH_REPORTER_NAME] = { .type = NLA_NUL_STRING, },
+};
+
+/* DEVLINK_CMD_FLASH_UPDATE - do */
+static const struct nla_policy devlink_flash_update_nl_policy[DEVLINK_ATTR_FLASH_UPDATE_OVERWRITE_MASK + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_FLASH_UPDATE_COMPONENT] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_FLASH_UPDATE_OVERWRITE_MASK] = NLA_POLICY_BITFIELD32(3),
+};
+
+/* DEVLINK_CMD_TRAP_GET - do */
+static const struct nla_policy devlink_trap_get_do_nl_policy[DEVLINK_ATTR_TRAP_NAME + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_TRAP_NAME] = { .type = NLA_NUL_STRING, },
+};
+
+/* DEVLINK_CMD_TRAP_GET - dump */
+static const struct nla_policy devlink_trap_get_dump_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+};
+
+/* DEVLINK_CMD_TRAP_SET - do */
+static const struct nla_policy devlink_trap_set_nl_policy[DEVLINK_ATTR_TRAP_ACTION + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_TRAP_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_TRAP_ACTION] = NLA_POLICY_MAX(NLA_U8, 2),
+};
+
+/* DEVLINK_CMD_TRAP_GROUP_GET - do */
+static const struct nla_policy devlink_trap_group_get_do_nl_policy[DEVLINK_ATTR_TRAP_GROUP_NAME + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_TRAP_GROUP_NAME] = { .type = NLA_NUL_STRING, },
+};
+
+/* DEVLINK_CMD_TRAP_GROUP_GET - dump */
+static const struct nla_policy devlink_trap_group_get_dump_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+};
+
+/* DEVLINK_CMD_TRAP_GROUP_SET - do */
+static const struct nla_policy devlink_trap_group_set_nl_policy[DEVLINK_ATTR_TRAP_POLICER_ID + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_TRAP_GROUP_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_TRAP_ACTION] = NLA_POLICY_MAX(NLA_U8, 2),
+ [DEVLINK_ATTR_TRAP_POLICER_ID] = { .type = NLA_U32, },
+};
+
+/* DEVLINK_CMD_TRAP_POLICER_GET - do */
+static const struct nla_policy devlink_trap_policer_get_do_nl_policy[DEVLINK_ATTR_TRAP_POLICER_ID + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_TRAP_POLICER_ID] = { .type = NLA_U32, },
+};
+
+/* DEVLINK_CMD_TRAP_POLICER_GET - dump */
+static const struct nla_policy devlink_trap_policer_get_dump_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+};
+
+/* DEVLINK_CMD_TRAP_POLICER_SET - do */
+static const struct nla_policy devlink_trap_policer_set_nl_policy[DEVLINK_ATTR_TRAP_POLICER_BURST + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_TRAP_POLICER_ID] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_TRAP_POLICER_RATE] = { .type = NLA_U64, },
+ [DEVLINK_ATTR_TRAP_POLICER_BURST] = { .type = NLA_U64, },
+};
+
+/* DEVLINK_CMD_HEALTH_REPORTER_TEST - do */
+static const struct nla_policy devlink_health_reporter_test_nl_policy[DEVLINK_ATTR_HEALTH_REPORTER_NAME + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_HEALTH_REPORTER_NAME] = { .type = NLA_NUL_STRING, },
+};
+
+/* DEVLINK_CMD_RATE_GET - do */
+static const struct nla_policy devlink_rate_get_do_nl_policy[DEVLINK_ATTR_RATE_NODE_NAME + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_RATE_NODE_NAME] = { .type = NLA_NUL_STRING, },
+};
+
+/* DEVLINK_CMD_RATE_GET - dump */
+static const struct nla_policy devlink_rate_get_dump_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+};
+
+/* DEVLINK_CMD_RATE_SET - do */
+static const struct nla_policy devlink_rate_set_nl_policy[DEVLINK_ATTR_RATE_TC_BWS + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_RATE_NODE_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_RATE_TX_SHARE] = { .type = NLA_U64, },
+ [DEVLINK_ATTR_RATE_TX_MAX] = { .type = NLA_U64, },
+ [DEVLINK_ATTR_RATE_TX_PRIORITY] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_RATE_TX_WEIGHT] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_RATE_PARENT_NODE_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_RATE_TC_BWS] = NLA_POLICY_NESTED(devlink_dl_rate_tc_bws_nl_policy),
+};
+
+/* DEVLINK_CMD_RATE_NEW - do */
+static const struct nla_policy devlink_rate_new_nl_policy[DEVLINK_ATTR_RATE_TC_BWS + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_RATE_NODE_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_RATE_TX_SHARE] = { .type = NLA_U64, },
+ [DEVLINK_ATTR_RATE_TX_MAX] = { .type = NLA_U64, },
+ [DEVLINK_ATTR_RATE_TX_PRIORITY] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_RATE_TX_WEIGHT] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_RATE_PARENT_NODE_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_RATE_TC_BWS] = NLA_POLICY_NESTED(devlink_dl_rate_tc_bws_nl_policy),
+};
+
+/* DEVLINK_CMD_RATE_DEL - do */
+static const struct nla_policy devlink_rate_del_nl_policy[DEVLINK_ATTR_RATE_NODE_NAME + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_RATE_NODE_NAME] = { .type = NLA_NUL_STRING, },
+};
+
+/* DEVLINK_CMD_LINECARD_GET - do */
+static const struct nla_policy devlink_linecard_get_do_nl_policy[DEVLINK_ATTR_LINECARD_INDEX + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_LINECARD_INDEX] = { .type = NLA_U32, },
+};
+
+/* DEVLINK_CMD_LINECARD_GET - dump */
+static const struct nla_policy devlink_linecard_get_dump_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+};
+
+/* DEVLINK_CMD_LINECARD_SET - do */
+static const struct nla_policy devlink_linecard_set_nl_policy[DEVLINK_ATTR_LINECARD_TYPE + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_LINECARD_INDEX] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_LINECARD_TYPE] = { .type = NLA_NUL_STRING, },
+};
+
+/* DEVLINK_CMD_SELFTESTS_GET - do */
+static const struct nla_policy devlink_selftests_get_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+};
+
+/* DEVLINK_CMD_SELFTESTS_RUN - do */
+static const struct nla_policy devlink_selftests_run_nl_policy[DEVLINK_ATTR_SELFTESTS + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_SELFTESTS] = NLA_POLICY_NESTED(devlink_dl_selftest_id_nl_policy),
+};
+
+/* DEVLINK_CMD_NOTIFY_FILTER_SET - do */
+static const struct nla_policy devlink_notify_filter_set_nl_policy[DEVLINK_ATTR_PORT_INDEX + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, },
+};
+
+/* Ops table for devlink */
+const struct genl_split_ops devlink_nl_ops[74] = {
+ {
+ .cmd = DEVLINK_CMD_GET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit,
+ .doit = devlink_nl_get_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_get_nl_policy,
+ .maxattr = DEVLINK_ATTR_DEV_NAME,
+ .flags = GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_GET,
+ .validate = GENL_DONT_VALIDATE_DUMP,
+ .dumpit = devlink_nl_get_dumpit,
+ .flags = GENL_CMD_CAP_DUMP,
+ },
+ {
+ .cmd = DEVLINK_CMD_PORT_GET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit_port,
+ .doit = devlink_nl_port_get_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_port_get_do_nl_policy,
+ .maxattr = DEVLINK_ATTR_PORT_INDEX,
+ .flags = GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_PORT_GET,
+ .dumpit = devlink_nl_port_get_dumpit,
+ .policy = devlink_port_get_dump_nl_policy,
+ .maxattr = DEVLINK_ATTR_DEV_NAME,
+ .flags = GENL_CMD_CAP_DUMP,
+ },
+ {
+ .cmd = DEVLINK_CMD_PORT_SET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit_port,
+ .doit = devlink_nl_port_set_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_port_set_nl_policy,
+ .maxattr = DEVLINK_ATTR_PORT_FUNCTION,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_PORT_NEW,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit,
+ .doit = devlink_nl_port_new_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_port_new_nl_policy,
+ .maxattr = DEVLINK_ATTR_PORT_PCI_SF_NUMBER,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_PORT_DEL,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit_port,
+ .doit = devlink_nl_port_del_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_port_del_nl_policy,
+ .maxattr = DEVLINK_ATTR_PORT_INDEX,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_PORT_SPLIT,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit_port,
+ .doit = devlink_nl_port_split_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_port_split_nl_policy,
+ .maxattr = DEVLINK_ATTR_PORT_SPLIT_COUNT,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_PORT_UNSPLIT,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit_port,
+ .doit = devlink_nl_port_unsplit_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_port_unsplit_nl_policy,
+ .maxattr = DEVLINK_ATTR_PORT_INDEX,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_SB_GET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit,
+ .doit = devlink_nl_sb_get_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_sb_get_do_nl_policy,
+ .maxattr = DEVLINK_ATTR_SB_INDEX,
+ .flags = GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_SB_GET,
+ .dumpit = devlink_nl_sb_get_dumpit,
+ .policy = devlink_sb_get_dump_nl_policy,
+ .maxattr = DEVLINK_ATTR_DEV_NAME,
+ .flags = GENL_CMD_CAP_DUMP,
+ },
+ {
+ .cmd = DEVLINK_CMD_SB_POOL_GET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit,
+ .doit = devlink_nl_sb_pool_get_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_sb_pool_get_do_nl_policy,
+ .maxattr = DEVLINK_ATTR_SB_POOL_INDEX,
+ .flags = GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_SB_POOL_GET,
+ .dumpit = devlink_nl_sb_pool_get_dumpit,
+ .policy = devlink_sb_pool_get_dump_nl_policy,
+ .maxattr = DEVLINK_ATTR_DEV_NAME,
+ .flags = GENL_CMD_CAP_DUMP,
+ },
+ {
+ .cmd = DEVLINK_CMD_SB_POOL_SET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit,
+ .doit = devlink_nl_sb_pool_set_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_sb_pool_set_nl_policy,
+ .maxattr = DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_SB_PORT_POOL_GET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit_port,
+ .doit = devlink_nl_sb_port_pool_get_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_sb_port_pool_get_do_nl_policy,
+ .maxattr = DEVLINK_ATTR_SB_POOL_INDEX,
+ .flags = GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_SB_PORT_POOL_GET,
+ .dumpit = devlink_nl_sb_port_pool_get_dumpit,
+ .policy = devlink_sb_port_pool_get_dump_nl_policy,
+ .maxattr = DEVLINK_ATTR_DEV_NAME,
+ .flags = GENL_CMD_CAP_DUMP,
+ },
+ {
+ .cmd = DEVLINK_CMD_SB_PORT_POOL_SET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit_port,
+ .doit = devlink_nl_sb_port_pool_set_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_sb_port_pool_set_nl_policy,
+ .maxattr = DEVLINK_ATTR_SB_THRESHOLD,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_SB_TC_POOL_BIND_GET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit_port,
+ .doit = devlink_nl_sb_tc_pool_bind_get_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_sb_tc_pool_bind_get_do_nl_policy,
+ .maxattr = DEVLINK_ATTR_SB_TC_INDEX,
+ .flags = GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_SB_TC_POOL_BIND_GET,
+ .dumpit = devlink_nl_sb_tc_pool_bind_get_dumpit,
+ .policy = devlink_sb_tc_pool_bind_get_dump_nl_policy,
+ .maxattr = DEVLINK_ATTR_DEV_NAME,
+ .flags = GENL_CMD_CAP_DUMP,
+ },
+ {
+ .cmd = DEVLINK_CMD_SB_TC_POOL_BIND_SET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit_port,
+ .doit = devlink_nl_sb_tc_pool_bind_set_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_sb_tc_pool_bind_set_nl_policy,
+ .maxattr = DEVLINK_ATTR_SB_TC_INDEX,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_SB_OCC_SNAPSHOT,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit,
+ .doit = devlink_nl_sb_occ_snapshot_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_sb_occ_snapshot_nl_policy,
+ .maxattr = DEVLINK_ATTR_SB_INDEX,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_SB_OCC_MAX_CLEAR,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit,
+ .doit = devlink_nl_sb_occ_max_clear_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_sb_occ_max_clear_nl_policy,
+ .maxattr = DEVLINK_ATTR_SB_INDEX,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_ESWITCH_GET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit,
+ .doit = devlink_nl_eswitch_get_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_eswitch_get_nl_policy,
+ .maxattr = DEVLINK_ATTR_DEV_NAME,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_ESWITCH_SET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit,
+ .doit = devlink_nl_eswitch_set_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_eswitch_set_nl_policy,
+ .maxattr = DEVLINK_ATTR_ESWITCH_ENCAP_MODE,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_DPIPE_TABLE_GET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit,
+ .doit = devlink_nl_dpipe_table_get_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_dpipe_table_get_nl_policy,
+ .maxattr = DEVLINK_ATTR_DPIPE_TABLE_NAME,
+ .flags = GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_DPIPE_ENTRIES_GET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit,
+ .doit = devlink_nl_dpipe_entries_get_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_dpipe_entries_get_nl_policy,
+ .maxattr = DEVLINK_ATTR_DPIPE_TABLE_NAME,
+ .flags = GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_DPIPE_HEADERS_GET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit,
+ .doit = devlink_nl_dpipe_headers_get_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_dpipe_headers_get_nl_policy,
+ .maxattr = DEVLINK_ATTR_DEV_NAME,
+ .flags = GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_DPIPE_TABLE_COUNTERS_SET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit,
+ .doit = devlink_nl_dpipe_table_counters_set_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_dpipe_table_counters_set_nl_policy,
+ .maxattr = DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_RESOURCE_SET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit,
+ .doit = devlink_nl_resource_set_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_resource_set_nl_policy,
+ .maxattr = DEVLINK_ATTR_RESOURCE_SIZE,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_RESOURCE_DUMP,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit,
+ .doit = devlink_nl_resource_dump_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_resource_dump_nl_policy,
+ .maxattr = DEVLINK_ATTR_DEV_NAME,
+ .flags = GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_RELOAD,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit_dev_lock,
+ .doit = devlink_nl_reload_doit,
+ .post_doit = devlink_nl_post_doit_dev_lock,
+ .policy = devlink_reload_nl_policy,
+ .maxattr = DEVLINK_ATTR_RELOAD_LIMITS,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_PARAM_GET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit,
+ .doit = devlink_nl_param_get_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_param_get_do_nl_policy,
+ .maxattr = DEVLINK_ATTR_PARAM_NAME,
+ .flags = GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_PARAM_GET,
+ .dumpit = devlink_nl_param_get_dumpit,
+ .policy = devlink_param_get_dump_nl_policy,
+ .maxattr = DEVLINK_ATTR_DEV_NAME,
+ .flags = GENL_CMD_CAP_DUMP,
+ },
+ {
+ .cmd = DEVLINK_CMD_PARAM_SET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit,
+ .doit = devlink_nl_param_set_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_param_set_nl_policy,
+ .maxattr = DEVLINK_ATTR_PARAM_RESET_DEFAULT,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_REGION_GET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit_port_optional,
+ .doit = devlink_nl_region_get_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_region_get_do_nl_policy,
+ .maxattr = DEVLINK_ATTR_REGION_NAME,
+ .flags = GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_REGION_GET,
+ .dumpit = devlink_nl_region_get_dumpit,
+ .policy = devlink_region_get_dump_nl_policy,
+ .maxattr = DEVLINK_ATTR_DEV_NAME,
+ .flags = GENL_CMD_CAP_DUMP,
+ },
+ {
+ .cmd = DEVLINK_CMD_REGION_NEW,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit_port_optional,
+ .doit = devlink_nl_region_new_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_region_new_nl_policy,
+ .maxattr = DEVLINK_ATTR_REGION_SNAPSHOT_ID,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_REGION_DEL,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit_port_optional,
+ .doit = devlink_nl_region_del_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_region_del_nl_policy,
+ .maxattr = DEVLINK_ATTR_REGION_SNAPSHOT_ID,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_REGION_READ,
+ .validate = GENL_DONT_VALIDATE_DUMP_STRICT,
+ .dumpit = devlink_nl_region_read_dumpit,
+ .policy = devlink_region_read_nl_policy,
+ .maxattr = DEVLINK_ATTR_REGION_DIRECT,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DUMP,
+ },
+ {
+ .cmd = DEVLINK_CMD_PORT_PARAM_GET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit_port,
+ .doit = devlink_nl_port_param_get_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_port_param_get_nl_policy,
+ .maxattr = DEVLINK_ATTR_PORT_INDEX,
+ .flags = GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_PORT_PARAM_GET,
+ .validate = GENL_DONT_VALIDATE_DUMP_STRICT,
+ .dumpit = devlink_nl_port_param_get_dumpit,
+ .flags = GENL_CMD_CAP_DUMP,
+ },
+ {
+ .cmd = DEVLINK_CMD_PORT_PARAM_SET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit_port,
+ .doit = devlink_nl_port_param_set_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_port_param_set_nl_policy,
+ .maxattr = DEVLINK_ATTR_PORT_INDEX,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_INFO_GET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit,
+ .doit = devlink_nl_info_get_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_info_get_nl_policy,
+ .maxattr = DEVLINK_ATTR_DEV_NAME,
+ .flags = GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_INFO_GET,
+ .validate = GENL_DONT_VALIDATE_DUMP,
+ .dumpit = devlink_nl_info_get_dumpit,
+ .flags = GENL_CMD_CAP_DUMP,
+ },
+ {
+ .cmd = DEVLINK_CMD_HEALTH_REPORTER_GET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit_port_optional,
+ .doit = devlink_nl_health_reporter_get_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_health_reporter_get_do_nl_policy,
+ .maxattr = DEVLINK_ATTR_HEALTH_REPORTER_NAME,
+ .flags = GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_HEALTH_REPORTER_GET,
+ .dumpit = devlink_nl_health_reporter_get_dumpit,
+ .policy = devlink_health_reporter_get_dump_nl_policy,
+ .maxattr = DEVLINK_ATTR_PORT_INDEX,
+ .flags = GENL_CMD_CAP_DUMP,
+ },
+ {
+ .cmd = DEVLINK_CMD_HEALTH_REPORTER_SET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit_port_optional,
+ .doit = devlink_nl_health_reporter_set_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_health_reporter_set_nl_policy,
+ .maxattr = DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_HEALTH_REPORTER_RECOVER,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit_port_optional,
+ .doit = devlink_nl_health_reporter_recover_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_health_reporter_recover_nl_policy,
+ .maxattr = DEVLINK_ATTR_HEALTH_REPORTER_NAME,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit_port_optional,
+ .doit = devlink_nl_health_reporter_diagnose_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_health_reporter_diagnose_nl_policy,
+ .maxattr = DEVLINK_ATTR_HEALTH_REPORTER_NAME,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET,
+ .validate = GENL_DONT_VALIDATE_DUMP_STRICT,
+ .dumpit = devlink_nl_health_reporter_dump_get_dumpit,
+ .policy = devlink_health_reporter_dump_get_nl_policy,
+ .maxattr = DEVLINK_ATTR_HEALTH_REPORTER_NAME,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DUMP,
+ },
+ {
+ .cmd = DEVLINK_CMD_HEALTH_REPORTER_DUMP_CLEAR,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit_port_optional,
+ .doit = devlink_nl_health_reporter_dump_clear_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_health_reporter_dump_clear_nl_policy,
+ .maxattr = DEVLINK_ATTR_HEALTH_REPORTER_NAME,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_FLASH_UPDATE,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit,
+ .doit = devlink_nl_flash_update_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_flash_update_nl_policy,
+ .maxattr = DEVLINK_ATTR_FLASH_UPDATE_OVERWRITE_MASK,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_TRAP_GET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit,
+ .doit = devlink_nl_trap_get_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_trap_get_do_nl_policy,
+ .maxattr = DEVLINK_ATTR_TRAP_NAME,
+ .flags = GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_TRAP_GET,
+ .dumpit = devlink_nl_trap_get_dumpit,
+ .policy = devlink_trap_get_dump_nl_policy,
+ .maxattr = DEVLINK_ATTR_DEV_NAME,
+ .flags = GENL_CMD_CAP_DUMP,
+ },
+ {
+ .cmd = DEVLINK_CMD_TRAP_SET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit,
+ .doit = devlink_nl_trap_set_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_trap_set_nl_policy,
+ .maxattr = DEVLINK_ATTR_TRAP_ACTION,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_TRAP_GROUP_GET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit,
+ .doit = devlink_nl_trap_group_get_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_trap_group_get_do_nl_policy,
+ .maxattr = DEVLINK_ATTR_TRAP_GROUP_NAME,
+ .flags = GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_TRAP_GROUP_GET,
+ .dumpit = devlink_nl_trap_group_get_dumpit,
+ .policy = devlink_trap_group_get_dump_nl_policy,
+ .maxattr = DEVLINK_ATTR_DEV_NAME,
+ .flags = GENL_CMD_CAP_DUMP,
+ },
+ {
+ .cmd = DEVLINK_CMD_TRAP_GROUP_SET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit,
+ .doit = devlink_nl_trap_group_set_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_trap_group_set_nl_policy,
+ .maxattr = DEVLINK_ATTR_TRAP_POLICER_ID,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_TRAP_POLICER_GET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit,
+ .doit = devlink_nl_trap_policer_get_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_trap_policer_get_do_nl_policy,
+ .maxattr = DEVLINK_ATTR_TRAP_POLICER_ID,
+ .flags = GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_TRAP_POLICER_GET,
+ .dumpit = devlink_nl_trap_policer_get_dumpit,
+ .policy = devlink_trap_policer_get_dump_nl_policy,
+ .maxattr = DEVLINK_ATTR_DEV_NAME,
+ .flags = GENL_CMD_CAP_DUMP,
+ },
+ {
+ .cmd = DEVLINK_CMD_TRAP_POLICER_SET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit,
+ .doit = devlink_nl_trap_policer_set_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_trap_policer_set_nl_policy,
+ .maxattr = DEVLINK_ATTR_TRAP_POLICER_BURST,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_HEALTH_REPORTER_TEST,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit_port_optional,
+ .doit = devlink_nl_health_reporter_test_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_health_reporter_test_nl_policy,
+ .maxattr = DEVLINK_ATTR_HEALTH_REPORTER_NAME,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_RATE_GET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit,
+ .doit = devlink_nl_rate_get_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_rate_get_do_nl_policy,
+ .maxattr = DEVLINK_ATTR_RATE_NODE_NAME,
+ .flags = GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_RATE_GET,
+ .dumpit = devlink_nl_rate_get_dumpit,
+ .policy = devlink_rate_get_dump_nl_policy,
+ .maxattr = DEVLINK_ATTR_DEV_NAME,
+ .flags = GENL_CMD_CAP_DUMP,
+ },
+ {
+ .cmd = DEVLINK_CMD_RATE_SET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit,
+ .doit = devlink_nl_rate_set_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_rate_set_nl_policy,
+ .maxattr = DEVLINK_ATTR_RATE_TC_BWS,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_RATE_NEW,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit,
+ .doit = devlink_nl_rate_new_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_rate_new_nl_policy,
+ .maxattr = DEVLINK_ATTR_RATE_TC_BWS,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_RATE_DEL,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit,
+ .doit = devlink_nl_rate_del_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_rate_del_nl_policy,
+ .maxattr = DEVLINK_ATTR_RATE_NODE_NAME,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_LINECARD_GET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit,
+ .doit = devlink_nl_linecard_get_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_linecard_get_do_nl_policy,
+ .maxattr = DEVLINK_ATTR_LINECARD_INDEX,
+ .flags = GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_LINECARD_GET,
+ .dumpit = devlink_nl_linecard_get_dumpit,
+ .policy = devlink_linecard_get_dump_nl_policy,
+ .maxattr = DEVLINK_ATTR_DEV_NAME,
+ .flags = GENL_CMD_CAP_DUMP,
+ },
+ {
+ .cmd = DEVLINK_CMD_LINECARD_SET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit,
+ .doit = devlink_nl_linecard_set_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_linecard_set_nl_policy,
+ .maxattr = DEVLINK_ATTR_LINECARD_TYPE,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_SELFTESTS_GET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit,
+ .doit = devlink_nl_selftests_get_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_selftests_get_nl_policy,
+ .maxattr = DEVLINK_ATTR_DEV_NAME,
+ .flags = GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_SELFTESTS_GET,
+ .validate = GENL_DONT_VALIDATE_DUMP,
+ .dumpit = devlink_nl_selftests_get_dumpit,
+ .flags = GENL_CMD_CAP_DUMP,
+ },
+ {
+ .cmd = DEVLINK_CMD_SELFTESTS_RUN,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit,
+ .doit = devlink_nl_selftests_run_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_selftests_run_nl_policy,
+ .maxattr = DEVLINK_ATTR_SELFTESTS,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_NOTIFY_FILTER_SET,
+ .doit = devlink_nl_notify_filter_set_doit,
+ .policy = devlink_notify_filter_set_nl_policy,
+ .maxattr = DEVLINK_ATTR_PORT_INDEX,
+ .flags = GENL_CMD_CAP_DO,
+ },
+};
diff --git a/net/devlink/netlink_gen.h b/net/devlink/netlink_gen.h
new file mode 100644
index 000000000000..2817d53a0eba
--- /dev/null
+++ b/net/devlink/netlink_gen.h
@@ -0,0 +1,150 @@
+/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */
+/* Do not edit directly, auto-generated from: */
+/* Documentation/netlink/specs/devlink.yaml */
+/* YNL-GEN kernel header */
+/* To regenerate run: tools/net/ynl/ynl-regen.sh */
+
+#ifndef _LINUX_DEVLINK_GEN_H
+#define _LINUX_DEVLINK_GEN_H
+
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+#include <uapi/linux/devlink.h>
+
+/* Common nested types */
+extern const struct nla_policy devlink_dl_port_function_nl_policy[DEVLINK_PORT_FN_ATTR_CAPS + 1];
+extern const struct nla_policy devlink_dl_rate_tc_bws_nl_policy[DEVLINK_RATE_TC_ATTR_BW + 1];
+extern const struct nla_policy devlink_dl_selftest_id_nl_policy[DEVLINK_ATTR_SELFTEST_ID_FLASH + 1];
+
+/* Ops table for devlink */
+extern const struct genl_split_ops devlink_nl_ops[74];
+
+int devlink_nl_pre_doit(const struct genl_split_ops *ops, struct sk_buff *skb,
+ struct genl_info *info);
+int devlink_nl_pre_doit_port(const struct genl_split_ops *ops,
+ struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_pre_doit_dev_lock(const struct genl_split_ops *ops,
+ struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_pre_doit_port_optional(const struct genl_split_ops *ops,
+ struct sk_buff *skb,
+ struct genl_info *info);
+void
+devlink_nl_post_doit(const struct genl_split_ops *ops, struct sk_buff *skb,
+ struct genl_info *info);
+void
+devlink_nl_post_doit_dev_lock(const struct genl_split_ops *ops,
+ struct sk_buff *skb, struct genl_info *info);
+
+int devlink_nl_get_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb);
+int devlink_nl_port_get_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_port_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb);
+int devlink_nl_port_set_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_port_new_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_port_del_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_port_split_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_port_unsplit_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_sb_get_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_sb_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb);
+int devlink_nl_sb_pool_get_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_sb_pool_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb);
+int devlink_nl_sb_pool_set_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_sb_port_pool_get_doit(struct sk_buff *skb,
+ struct genl_info *info);
+int devlink_nl_sb_port_pool_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb);
+int devlink_nl_sb_port_pool_set_doit(struct sk_buff *skb,
+ struct genl_info *info);
+int devlink_nl_sb_tc_pool_bind_get_doit(struct sk_buff *skb,
+ struct genl_info *info);
+int devlink_nl_sb_tc_pool_bind_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb);
+int devlink_nl_sb_tc_pool_bind_set_doit(struct sk_buff *skb,
+ struct genl_info *info);
+int devlink_nl_sb_occ_snapshot_doit(struct sk_buff *skb,
+ struct genl_info *info);
+int devlink_nl_sb_occ_max_clear_doit(struct sk_buff *skb,
+ struct genl_info *info);
+int devlink_nl_eswitch_get_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_eswitch_set_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_dpipe_table_get_doit(struct sk_buff *skb,
+ struct genl_info *info);
+int devlink_nl_dpipe_entries_get_doit(struct sk_buff *skb,
+ struct genl_info *info);
+int devlink_nl_dpipe_headers_get_doit(struct sk_buff *skb,
+ struct genl_info *info);
+int devlink_nl_dpipe_table_counters_set_doit(struct sk_buff *skb,
+ struct genl_info *info);
+int devlink_nl_resource_set_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_resource_dump_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_reload_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_param_get_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_param_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb);
+int devlink_nl_param_set_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_region_get_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_region_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb);
+int devlink_nl_region_new_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_region_del_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_region_read_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb);
+int devlink_nl_port_param_get_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_port_param_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb);
+int devlink_nl_port_param_set_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_info_get_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_info_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb);
+int devlink_nl_health_reporter_get_doit(struct sk_buff *skb,
+ struct genl_info *info);
+int devlink_nl_health_reporter_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb);
+int devlink_nl_health_reporter_set_doit(struct sk_buff *skb,
+ struct genl_info *info);
+int devlink_nl_health_reporter_recover_doit(struct sk_buff *skb,
+ struct genl_info *info);
+int devlink_nl_health_reporter_diagnose_doit(struct sk_buff *skb,
+ struct genl_info *info);
+int devlink_nl_health_reporter_dump_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb);
+int devlink_nl_health_reporter_dump_clear_doit(struct sk_buff *skb,
+ struct genl_info *info);
+int devlink_nl_flash_update_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_trap_get_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_trap_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb);
+int devlink_nl_trap_set_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_trap_group_get_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_trap_group_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb);
+int devlink_nl_trap_group_set_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_trap_policer_get_doit(struct sk_buff *skb,
+ struct genl_info *info);
+int devlink_nl_trap_policer_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb);
+int devlink_nl_trap_policer_set_doit(struct sk_buff *skb,
+ struct genl_info *info);
+int devlink_nl_health_reporter_test_doit(struct sk_buff *skb,
+ struct genl_info *info);
+int devlink_nl_rate_get_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_rate_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb);
+int devlink_nl_rate_set_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_rate_new_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_rate_del_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_linecard_get_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_linecard_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb);
+int devlink_nl_linecard_set_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_selftests_get_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_selftests_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb);
+int devlink_nl_selftests_run_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_notify_filter_set_doit(struct sk_buff *skb,
+ struct genl_info *info);
+
+#endif /* _LINUX_DEVLINK_GEN_H */
diff --git a/net/devlink/param.c b/net/devlink/param.c
new file mode 100644
index 000000000000..e0ea93eded43
--- /dev/null
+++ b/net/devlink/param.c
@@ -0,0 +1,950 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com>
+ */
+
+#include "devl_internal.h"
+
+static const struct devlink_param devlink_param_generic[] = {
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_INT_ERR_RESET,
+ .name = DEVLINK_PARAM_GENERIC_INT_ERR_RESET_NAME,
+ .type = DEVLINK_PARAM_GENERIC_INT_ERR_RESET_TYPE,
+ },
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_MAX_MACS,
+ .name = DEVLINK_PARAM_GENERIC_MAX_MACS_NAME,
+ .type = DEVLINK_PARAM_GENERIC_MAX_MACS_TYPE,
+ },
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_SRIOV,
+ .name = DEVLINK_PARAM_GENERIC_ENABLE_SRIOV_NAME,
+ .type = DEVLINK_PARAM_GENERIC_ENABLE_SRIOV_TYPE,
+ },
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_REGION_SNAPSHOT,
+ .name = DEVLINK_PARAM_GENERIC_REGION_SNAPSHOT_NAME,
+ .type = DEVLINK_PARAM_GENERIC_REGION_SNAPSHOT_TYPE,
+ },
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_IGNORE_ARI,
+ .name = DEVLINK_PARAM_GENERIC_IGNORE_ARI_NAME,
+ .type = DEVLINK_PARAM_GENERIC_IGNORE_ARI_TYPE,
+ },
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MAX,
+ .name = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MAX_NAME,
+ .type = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MAX_TYPE,
+ },
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MIN,
+ .name = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MIN_NAME,
+ .type = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MIN_TYPE,
+ },
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_FW_LOAD_POLICY,
+ .name = DEVLINK_PARAM_GENERIC_FW_LOAD_POLICY_NAME,
+ .type = DEVLINK_PARAM_GENERIC_FW_LOAD_POLICY_TYPE,
+ },
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_RESET_DEV_ON_DRV_PROBE,
+ .name = DEVLINK_PARAM_GENERIC_RESET_DEV_ON_DRV_PROBE_NAME,
+ .type = DEVLINK_PARAM_GENERIC_RESET_DEV_ON_DRV_PROBE_TYPE,
+ },
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_ROCE,
+ .name = DEVLINK_PARAM_GENERIC_ENABLE_ROCE_NAME,
+ .type = DEVLINK_PARAM_GENERIC_ENABLE_ROCE_TYPE,
+ },
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_REMOTE_DEV_RESET,
+ .name = DEVLINK_PARAM_GENERIC_ENABLE_REMOTE_DEV_RESET_NAME,
+ .type = DEVLINK_PARAM_GENERIC_ENABLE_REMOTE_DEV_RESET_TYPE,
+ },
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_ETH,
+ .name = DEVLINK_PARAM_GENERIC_ENABLE_ETH_NAME,
+ .type = DEVLINK_PARAM_GENERIC_ENABLE_ETH_TYPE,
+ },
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_RDMA,
+ .name = DEVLINK_PARAM_GENERIC_ENABLE_RDMA_NAME,
+ .type = DEVLINK_PARAM_GENERIC_ENABLE_RDMA_TYPE,
+ },
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_VNET,
+ .name = DEVLINK_PARAM_GENERIC_ENABLE_VNET_NAME,
+ .type = DEVLINK_PARAM_GENERIC_ENABLE_VNET_TYPE,
+ },
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_IWARP,
+ .name = DEVLINK_PARAM_GENERIC_ENABLE_IWARP_NAME,
+ .type = DEVLINK_PARAM_GENERIC_ENABLE_IWARP_TYPE,
+ },
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_IO_EQ_SIZE,
+ .name = DEVLINK_PARAM_GENERIC_IO_EQ_SIZE_NAME,
+ .type = DEVLINK_PARAM_GENERIC_IO_EQ_SIZE_TYPE,
+ },
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_EVENT_EQ_SIZE,
+ .name = DEVLINK_PARAM_GENERIC_EVENT_EQ_SIZE_NAME,
+ .type = DEVLINK_PARAM_GENERIC_EVENT_EQ_SIZE_TYPE,
+ },
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_PHC,
+ .name = DEVLINK_PARAM_GENERIC_ENABLE_PHC_NAME,
+ .type = DEVLINK_PARAM_GENERIC_ENABLE_PHC_TYPE,
+ },
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_CLOCK_ID,
+ .name = DEVLINK_PARAM_GENERIC_CLOCK_ID_NAME,
+ .type = DEVLINK_PARAM_GENERIC_CLOCK_ID_TYPE,
+ },
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_TOTAL_VFS,
+ .name = DEVLINK_PARAM_GENERIC_TOTAL_VFS_NAME,
+ .type = DEVLINK_PARAM_GENERIC_TOTAL_VFS_TYPE,
+ },
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_NUM_DOORBELLS,
+ .name = DEVLINK_PARAM_GENERIC_NUM_DOORBELLS_NAME,
+ .type = DEVLINK_PARAM_GENERIC_NUM_DOORBELLS_TYPE,
+ },
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_MAX_MAC_PER_VF,
+ .name = DEVLINK_PARAM_GENERIC_MAX_MAC_PER_VF_NAME,
+ .type = DEVLINK_PARAM_GENERIC_MAX_MAC_PER_VF_TYPE,
+ },
+};
+
+static int devlink_param_generic_verify(const struct devlink_param *param)
+{
+ /* verify it match generic parameter by id and name */
+ if (param->id > DEVLINK_PARAM_GENERIC_ID_MAX)
+ return -EINVAL;
+ if (strcmp(param->name, devlink_param_generic[param->id].name))
+ return -ENOENT;
+
+ WARN_ON(param->type != devlink_param_generic[param->id].type);
+
+ return 0;
+}
+
+static int devlink_param_driver_verify(const struct devlink_param *param)
+{
+ int i;
+
+ if (param->id <= DEVLINK_PARAM_GENERIC_ID_MAX)
+ return -EINVAL;
+ /* verify no such name in generic params */
+ for (i = 0; i <= DEVLINK_PARAM_GENERIC_ID_MAX; i++)
+ if (!strcmp(param->name, devlink_param_generic[i].name))
+ return -EEXIST;
+
+ return 0;
+}
+
+static struct devlink_param_item *
+devlink_param_find_by_name(struct xarray *params, const char *param_name)
+{
+ struct devlink_param_item *param_item;
+ unsigned long param_id;
+
+ xa_for_each(params, param_id, param_item) {
+ if (!strcmp(param_item->param->name, param_name))
+ return param_item;
+ }
+ return NULL;
+}
+
+static struct devlink_param_item *
+devlink_param_find_by_id(struct xarray *params, u32 param_id)
+{
+ return xa_load(params, param_id);
+}
+
+static bool
+devlink_param_cmode_is_supported(const struct devlink_param *param,
+ enum devlink_param_cmode cmode)
+{
+ return test_bit(cmode, &param->supported_cmodes);
+}
+
+static int devlink_param_get(struct devlink *devlink,
+ const struct devlink_param *param,
+ struct devlink_param_gset_ctx *ctx,
+ struct netlink_ext_ack *extack)
+{
+ if (!param->get)
+ return -EOPNOTSUPP;
+ return param->get(devlink, param->id, ctx, extack);
+}
+
+static int devlink_param_set(struct devlink *devlink,
+ const struct devlink_param *param,
+ struct devlink_param_gset_ctx *ctx,
+ struct netlink_ext_ack *extack)
+{
+ if (!param->set)
+ return -EOPNOTSUPP;
+ return param->set(devlink, param->id, ctx, extack);
+}
+
+static int devlink_param_get_default(struct devlink *devlink,
+ const struct devlink_param *param,
+ struct devlink_param_gset_ctx *ctx,
+ struct netlink_ext_ack *extack)
+{
+ if (!param->get_default)
+ return -EOPNOTSUPP;
+
+ return param->get_default(devlink, param->id, ctx, extack);
+}
+
+static int devlink_param_reset_default(struct devlink *devlink,
+ const struct devlink_param *param,
+ enum devlink_param_cmode cmode,
+ struct netlink_ext_ack *extack)
+{
+ if (!param->reset_default)
+ return -EOPNOTSUPP;
+
+ return param->reset_default(devlink, param->id, cmode, extack);
+}
+
+static int
+devlink_nl_param_value_put(struct sk_buff *msg, enum devlink_param_type type,
+ int nla_type, union devlink_param_value val,
+ bool flag_as_u8)
+{
+ switch (type) {
+ case DEVLINK_PARAM_TYPE_U8:
+ if (nla_put_u8(msg, nla_type, val.vu8))
+ return -EMSGSIZE;
+ break;
+ case DEVLINK_PARAM_TYPE_U16:
+ if (nla_put_u16(msg, nla_type, val.vu16))
+ return -EMSGSIZE;
+ break;
+ case DEVLINK_PARAM_TYPE_U32:
+ if (nla_put_u32(msg, nla_type, val.vu32))
+ return -EMSGSIZE;
+ break;
+ case DEVLINK_PARAM_TYPE_U64:
+ if (devlink_nl_put_u64(msg, nla_type, val.vu64))
+ return -EMSGSIZE;
+ break;
+ case DEVLINK_PARAM_TYPE_STRING:
+ if (nla_put_string(msg, nla_type, val.vstr))
+ return -EMSGSIZE;
+ break;
+ case DEVLINK_PARAM_TYPE_BOOL:
+ /* default values of type bool are encoded with u8, so that
+ * false can be distinguished from not present
+ */
+ if (flag_as_u8) {
+ if (nla_put_u8(msg, nla_type, val.vbool))
+ return -EMSGSIZE;
+ } else {
+ if (val.vbool && nla_put_flag(msg, nla_type))
+ return -EMSGSIZE;
+ }
+ break;
+ }
+ return 0;
+}
+
+static int
+devlink_nl_param_value_fill_one(struct sk_buff *msg,
+ enum devlink_param_type type,
+ enum devlink_param_cmode cmode,
+ union devlink_param_value val,
+ union devlink_param_value default_val,
+ bool has_default)
+{
+ struct nlattr *param_value_attr;
+ int err = -EMSGSIZE;
+
+ param_value_attr = nla_nest_start_noflag(msg,
+ DEVLINK_ATTR_PARAM_VALUE);
+ if (!param_value_attr)
+ return -EMSGSIZE;
+
+ if (nla_put_u8(msg, DEVLINK_ATTR_PARAM_VALUE_CMODE, cmode))
+ goto value_nest_cancel;
+
+ err = devlink_nl_param_value_put(msg, type,
+ DEVLINK_ATTR_PARAM_VALUE_DATA,
+ val, false);
+ if (err)
+ goto value_nest_cancel;
+
+ if (has_default) {
+ err = devlink_nl_param_value_put(msg, type,
+ DEVLINK_ATTR_PARAM_VALUE_DEFAULT,
+ default_val, true);
+ if (err)
+ goto value_nest_cancel;
+ }
+
+ nla_nest_end(msg, param_value_attr);
+ return 0;
+
+value_nest_cancel:
+ nla_nest_cancel(msg, param_value_attr);
+ return err;
+}
+
+static int devlink_nl_param_fill(struct sk_buff *msg, struct devlink *devlink,
+ unsigned int port_index,
+ struct devlink_param_item *param_item,
+ enum devlink_command cmd,
+ u32 portid, u32 seq, int flags,
+ struct netlink_ext_ack *extack)
+{
+ union devlink_param_value default_value[DEVLINK_PARAM_CMODE_MAX + 1];
+ union devlink_param_value param_value[DEVLINK_PARAM_CMODE_MAX + 1];
+ bool default_value_set[DEVLINK_PARAM_CMODE_MAX + 1] = {};
+ bool param_value_set[DEVLINK_PARAM_CMODE_MAX + 1] = {};
+ const struct devlink_param *param = param_item->param;
+ struct devlink_param_gset_ctx ctx;
+ struct nlattr *param_values_list;
+ struct nlattr *param_attr;
+ void *hdr;
+ int err;
+ int i;
+
+ /* Get value from driver part to driverinit configuration mode */
+ for (i = 0; i <= DEVLINK_PARAM_CMODE_MAX; i++) {
+ if (!devlink_param_cmode_is_supported(param, i))
+ continue;
+ if (i == DEVLINK_PARAM_CMODE_DRIVERINIT) {
+ if (param_item->driverinit_value_new_valid)
+ param_value[i] = param_item->driverinit_value_new;
+ else if (param_item->driverinit_value_valid)
+ param_value[i] = param_item->driverinit_value;
+ else
+ return -EOPNOTSUPP;
+
+ if (param_item->driverinit_value_valid) {
+ default_value[i] = param_item->driverinit_default;
+ default_value_set[i] = true;
+ }
+ } else {
+ ctx.cmode = i;
+ err = devlink_param_get(devlink, param, &ctx, extack);
+ if (err)
+ return err;
+ param_value[i] = ctx.val;
+
+ err = devlink_param_get_default(devlink, param, &ctx,
+ extack);
+ if (!err) {
+ default_value[i] = ctx.val;
+ default_value_set[i] = true;
+ } else if (err != -EOPNOTSUPP) {
+ return err;
+ }
+ }
+ param_value_set[i] = true;
+ }
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto genlmsg_cancel;
+
+ if (cmd == DEVLINK_CMD_PORT_PARAM_GET ||
+ cmd == DEVLINK_CMD_PORT_PARAM_NEW ||
+ cmd == DEVLINK_CMD_PORT_PARAM_DEL)
+ if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, port_index))
+ goto genlmsg_cancel;
+
+ param_attr = nla_nest_start_noflag(msg, DEVLINK_ATTR_PARAM);
+ if (!param_attr)
+ goto genlmsg_cancel;
+ if (nla_put_string(msg, DEVLINK_ATTR_PARAM_NAME, param->name))
+ goto param_nest_cancel;
+ if (param->generic && nla_put_flag(msg, DEVLINK_ATTR_PARAM_GENERIC))
+ goto param_nest_cancel;
+ if (nla_put_u8(msg, DEVLINK_ATTR_PARAM_TYPE, param->type))
+ goto param_nest_cancel;
+
+ param_values_list = nla_nest_start_noflag(msg,
+ DEVLINK_ATTR_PARAM_VALUES_LIST);
+ if (!param_values_list)
+ goto param_nest_cancel;
+
+ for (i = 0; i <= DEVLINK_PARAM_CMODE_MAX; i++) {
+ if (!param_value_set[i])
+ continue;
+ err = devlink_nl_param_value_fill_one(msg, param->type,
+ i, param_value[i],
+ default_value[i],
+ default_value_set[i]);
+ if (err)
+ goto values_list_nest_cancel;
+ }
+
+ nla_nest_end(msg, param_values_list);
+ nla_nest_end(msg, param_attr);
+ genlmsg_end(msg, hdr);
+ return 0;
+
+values_list_nest_cancel:
+ nla_nest_end(msg, param_values_list);
+param_nest_cancel:
+ nla_nest_cancel(msg, param_attr);
+genlmsg_cancel:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static void devlink_param_notify(struct devlink *devlink,
+ unsigned int port_index,
+ struct devlink_param_item *param_item,
+ enum devlink_command cmd)
+{
+ struct sk_buff *msg;
+ int err;
+
+ WARN_ON(cmd != DEVLINK_CMD_PARAM_NEW && cmd != DEVLINK_CMD_PARAM_DEL &&
+ cmd != DEVLINK_CMD_PORT_PARAM_NEW &&
+ cmd != DEVLINK_CMD_PORT_PARAM_DEL);
+
+ /* devlink_notify_register() / devlink_notify_unregister()
+ * will replay the notifications if the params are added/removed
+ * outside of the lifetime of the instance.
+ */
+ if (!devl_is_registered(devlink) || !devlink_nl_notify_need(devlink))
+ return;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return;
+ err = devlink_nl_param_fill(msg, devlink, port_index, param_item, cmd,
+ 0, 0, 0, NULL);
+ if (err) {
+ nlmsg_free(msg);
+ return;
+ }
+
+ devlink_nl_notify_send(devlink, msg);
+}
+
+static void devlink_params_notify(struct devlink *devlink,
+ enum devlink_command cmd)
+{
+ struct devlink_param_item *param_item;
+ unsigned long param_id;
+
+ xa_for_each(&devlink->params, param_id, param_item)
+ devlink_param_notify(devlink, 0, param_item, cmd);
+}
+
+void devlink_params_notify_register(struct devlink *devlink)
+{
+ devlink_params_notify(devlink, DEVLINK_CMD_PARAM_NEW);
+}
+
+void devlink_params_notify_unregister(struct devlink *devlink)
+{
+ devlink_params_notify(devlink, DEVLINK_CMD_PARAM_DEL);
+}
+
+static int devlink_nl_param_get_dump_one(struct sk_buff *msg,
+ struct devlink *devlink,
+ struct netlink_callback *cb,
+ int flags)
+{
+ struct devlink_nl_dump_state *state = devlink_dump_state(cb);
+ struct devlink_param_item *param_item;
+ unsigned long param_id;
+ int err = 0;
+
+ xa_for_each_start(&devlink->params, param_id, param_item, state->idx) {
+ err = devlink_nl_param_fill(msg, devlink, 0, param_item,
+ DEVLINK_CMD_PARAM_GET,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, flags,
+ cb->extack);
+ if (err == -EOPNOTSUPP) {
+ err = 0;
+ } else if (err) {
+ state->idx = param_id;
+ break;
+ }
+ }
+
+ return err;
+}
+
+int devlink_nl_param_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ return devlink_nl_dumpit(skb, cb, devlink_nl_param_get_dump_one);
+}
+
+static int
+devlink_param_type_get_from_info(struct genl_info *info,
+ enum devlink_param_type *param_type)
+{
+ if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PARAM_TYPE))
+ return -EINVAL;
+
+ *param_type = nla_get_u8(info->attrs[DEVLINK_ATTR_PARAM_TYPE]);
+
+ return 0;
+}
+
+static int
+devlink_param_value_get_from_info(const struct devlink_param *param,
+ struct genl_info *info,
+ union devlink_param_value *value)
+{
+ struct nlattr *param_data;
+ int len;
+
+ param_data = info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA];
+
+ if (param->type != DEVLINK_PARAM_TYPE_BOOL && !param_data)
+ return -EINVAL;
+
+ switch (param->type) {
+ case DEVLINK_PARAM_TYPE_U8:
+ if (nla_len(param_data) != sizeof(u8))
+ return -EINVAL;
+ value->vu8 = nla_get_u8(param_data);
+ break;
+ case DEVLINK_PARAM_TYPE_U16:
+ if (nla_len(param_data) != sizeof(u16))
+ return -EINVAL;
+ value->vu16 = nla_get_u16(param_data);
+ break;
+ case DEVLINK_PARAM_TYPE_U32:
+ if (nla_len(param_data) != sizeof(u32))
+ return -EINVAL;
+ value->vu32 = nla_get_u32(param_data);
+ break;
+ case DEVLINK_PARAM_TYPE_U64:
+ if (nla_len(param_data) != sizeof(u64))
+ return -EINVAL;
+ value->vu64 = nla_get_u64(param_data);
+ break;
+ case DEVLINK_PARAM_TYPE_STRING:
+ len = strnlen(nla_data(param_data), nla_len(param_data));
+ if (len == nla_len(param_data) ||
+ len >= __DEVLINK_PARAM_MAX_STRING_VALUE)
+ return -EINVAL;
+ strcpy(value->vstr, nla_data(param_data));
+ break;
+ case DEVLINK_PARAM_TYPE_BOOL:
+ if (param_data && nla_len(param_data))
+ return -EINVAL;
+ value->vbool = nla_get_flag(param_data);
+ break;
+ }
+ return 0;
+}
+
+static struct devlink_param_item *
+devlink_param_get_from_info(struct xarray *params, struct genl_info *info)
+{
+ char *param_name;
+
+ if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PARAM_NAME))
+ return NULL;
+
+ param_name = nla_data(info->attrs[DEVLINK_ATTR_PARAM_NAME]);
+ return devlink_param_find_by_name(params, param_name);
+}
+
+int devlink_nl_param_get_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_param_item *param_item;
+ struct sk_buff *msg;
+ int err;
+
+ param_item = devlink_param_get_from_info(&devlink->params, info);
+ if (!param_item)
+ return -EINVAL;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ err = devlink_nl_param_fill(msg, devlink, 0, param_item,
+ DEVLINK_CMD_PARAM_GET, info->snd_portid,
+ info->snd_seq, 0, info->extack);
+ if (err) {
+ nlmsg_free(msg);
+ return err;
+ }
+
+ return genlmsg_reply(msg, info);
+}
+
+static int __devlink_nl_cmd_param_set_doit(struct devlink *devlink,
+ unsigned int port_index,
+ struct xarray *params,
+ struct genl_info *info,
+ enum devlink_command cmd)
+{
+ enum devlink_param_type param_type;
+ struct devlink_param_gset_ctx ctx;
+ enum devlink_param_cmode cmode;
+ struct devlink_param_item *param_item;
+ const struct devlink_param *param;
+ union devlink_param_value value;
+ bool reset_default;
+ int err = 0;
+
+ param_item = devlink_param_get_from_info(params, info);
+ if (!param_item)
+ return -EINVAL;
+ param = param_item->param;
+ err = devlink_param_type_get_from_info(info, &param_type);
+ if (err)
+ return err;
+ if (param_type != param->type)
+ return -EINVAL;
+
+ reset_default = info->attrs[DEVLINK_ATTR_PARAM_RESET_DEFAULT];
+ if (!reset_default) {
+ err = devlink_param_value_get_from_info(param, info, &value);
+ if (err)
+ return err;
+ if (param->validate) {
+ err = param->validate(devlink, param->id, value,
+ info->extack);
+ if (err)
+ return err;
+ }
+ }
+
+ if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PARAM_VALUE_CMODE))
+ return -EINVAL;
+ cmode = nla_get_u8(info->attrs[DEVLINK_ATTR_PARAM_VALUE_CMODE]);
+ if (!devlink_param_cmode_is_supported(param, cmode))
+ return -EOPNOTSUPP;
+
+ if (cmode == DEVLINK_PARAM_CMODE_DRIVERINIT) {
+ if (reset_default) {
+ if (!param_item->driverinit_value_valid) {
+ NL_SET_ERR_MSG(info->extack,
+ "Default value not available");
+ return -EOPNOTSUPP;
+ }
+ value = param_item->driverinit_default;
+ }
+
+ param_item->driverinit_value_new = value;
+ param_item->driverinit_value_new_valid = true;
+ } else {
+ if (!param->set)
+ return -EOPNOTSUPP;
+ ctx.val = value;
+ ctx.cmode = cmode;
+ if (reset_default)
+ err = devlink_param_reset_default(devlink, param, cmode,
+ info->extack);
+ else
+ err = devlink_param_set(devlink, param, &ctx,
+ info->extack);
+ if (err)
+ return err;
+ }
+
+ devlink_param_notify(devlink, port_index, param_item, cmd);
+ return 0;
+}
+
+int devlink_nl_param_set_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+
+ return __devlink_nl_cmd_param_set_doit(devlink, 0, &devlink->params,
+ info, DEVLINK_CMD_PARAM_NEW);
+}
+
+int devlink_nl_port_param_get_dumpit(struct sk_buff *msg,
+ struct netlink_callback *cb)
+{
+ NL_SET_ERR_MSG(cb->extack, "Port params are not supported");
+ return msg->len;
+}
+
+int devlink_nl_port_param_get_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ NL_SET_ERR_MSG(info->extack, "Port params are not supported");
+ return -EINVAL;
+}
+
+int devlink_nl_port_param_set_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ NL_SET_ERR_MSG(info->extack, "Port params are not supported");
+ return -EINVAL;
+}
+
+static int devlink_param_verify(const struct devlink_param *param)
+{
+ if (!param || !param->name || !param->supported_cmodes)
+ return -EINVAL;
+ if (param->generic)
+ return devlink_param_generic_verify(param);
+ else
+ return devlink_param_driver_verify(param);
+}
+
+static int devlink_param_register(struct devlink *devlink,
+ const struct devlink_param *param)
+{
+ struct devlink_param_item *param_item;
+ int err;
+
+ WARN_ON(devlink_param_verify(param));
+ WARN_ON(devlink_param_find_by_name(&devlink->params, param->name));
+
+ if (param->supported_cmodes == BIT(DEVLINK_PARAM_CMODE_DRIVERINIT))
+ WARN_ON(param->get || param->set);
+ else
+ WARN_ON(!param->get || !param->set);
+
+ param_item = kzalloc(sizeof(*param_item), GFP_KERNEL);
+ if (!param_item)
+ return -ENOMEM;
+
+ param_item->param = param;
+
+ err = xa_insert(&devlink->params, param->id, param_item, GFP_KERNEL);
+ if (err)
+ goto err_xa_insert;
+
+ devlink_param_notify(devlink, 0, param_item, DEVLINK_CMD_PARAM_NEW);
+ return 0;
+
+err_xa_insert:
+ kfree(param_item);
+ return err;
+}
+
+static void devlink_param_unregister(struct devlink *devlink,
+ const struct devlink_param *param)
+{
+ struct devlink_param_item *param_item;
+
+ param_item = devlink_param_find_by_id(&devlink->params, param->id);
+ if (WARN_ON(!param_item))
+ return;
+ devlink_param_notify(devlink, 0, param_item, DEVLINK_CMD_PARAM_DEL);
+ xa_erase(&devlink->params, param->id);
+ kfree(param_item);
+}
+
+/**
+ * devl_params_register - register configuration parameters
+ *
+ * @devlink: devlink
+ * @params: configuration parameters array
+ * @params_count: number of parameters provided
+ *
+ * Register the configuration parameters supported by the driver.
+ */
+int devl_params_register(struct devlink *devlink,
+ const struct devlink_param *params,
+ size_t params_count)
+{
+ const struct devlink_param *param = params;
+ int i, err;
+
+ lockdep_assert_held(&devlink->lock);
+
+ for (i = 0; i < params_count; i++, param++) {
+ err = devlink_param_register(devlink, param);
+ if (err)
+ goto rollback;
+ }
+ return 0;
+
+rollback:
+ if (!i)
+ return err;
+
+ for (param--; i > 0; i--, param--)
+ devlink_param_unregister(devlink, param);
+ return err;
+}
+EXPORT_SYMBOL_GPL(devl_params_register);
+
+int devlink_params_register(struct devlink *devlink,
+ const struct devlink_param *params,
+ size_t params_count)
+{
+ int err;
+
+ devl_lock(devlink);
+ err = devl_params_register(devlink, params, params_count);
+ devl_unlock(devlink);
+ return err;
+}
+EXPORT_SYMBOL_GPL(devlink_params_register);
+
+/**
+ * devl_params_unregister - unregister configuration parameters
+ * @devlink: devlink
+ * @params: configuration parameters to unregister
+ * @params_count: number of parameters provided
+ */
+void devl_params_unregister(struct devlink *devlink,
+ const struct devlink_param *params,
+ size_t params_count)
+{
+ const struct devlink_param *param = params;
+ int i;
+
+ lockdep_assert_held(&devlink->lock);
+
+ for (i = 0; i < params_count; i++, param++)
+ devlink_param_unregister(devlink, param);
+}
+EXPORT_SYMBOL_GPL(devl_params_unregister);
+
+void devlink_params_unregister(struct devlink *devlink,
+ const struct devlink_param *params,
+ size_t params_count)
+{
+ devl_lock(devlink);
+ devl_params_unregister(devlink, params, params_count);
+ devl_unlock(devlink);
+}
+EXPORT_SYMBOL_GPL(devlink_params_unregister);
+
+/**
+ * devl_param_driverinit_value_get - get configuration parameter
+ * value for driver initializing
+ *
+ * @devlink: devlink
+ * @param_id: parameter ID
+ * @val: pointer to store the value of parameter in driverinit
+ * configuration mode
+ *
+ * This function should be used by the driver to get driverinit
+ * configuration for initialization after reload command.
+ *
+ * Note that lockless call of this function relies on the
+ * driver to maintain following basic sane behavior:
+ * 1) Driver ensures a call to this function cannot race with
+ * registering/unregistering the parameter with the same parameter ID.
+ * 2) Driver ensures a call to this function cannot race with
+ * devl_param_driverinit_value_set() call with the same parameter ID.
+ * 3) Driver ensures a call to this function cannot race with
+ * reload operation.
+ * If the driver is not able to comply, it has to take the devlink->lock
+ * while calling this.
+ */
+int devl_param_driverinit_value_get(struct devlink *devlink, u32 param_id,
+ union devlink_param_value *val)
+{
+ struct devlink_param_item *param_item;
+
+ if (WARN_ON(!devlink_reload_supported(devlink->ops)))
+ return -EOPNOTSUPP;
+
+ param_item = devlink_param_find_by_id(&devlink->params, param_id);
+ if (!param_item)
+ return -EINVAL;
+
+ if (!param_item->driverinit_value_valid)
+ return -EOPNOTSUPP;
+
+ if (WARN_ON(!devlink_param_cmode_is_supported(param_item->param,
+ DEVLINK_PARAM_CMODE_DRIVERINIT)))
+ return -EOPNOTSUPP;
+
+ *val = param_item->driverinit_value;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(devl_param_driverinit_value_get);
+
+/**
+ * devl_param_driverinit_value_set - set value of configuration
+ * parameter for driverinit
+ * configuration mode
+ *
+ * @devlink: devlink
+ * @param_id: parameter ID
+ * @init_val: value of parameter to set for driverinit configuration mode
+ *
+ * This function should be used by the driver to set driverinit
+ * configuration mode default value.
+ */
+void devl_param_driverinit_value_set(struct devlink *devlink, u32 param_id,
+ union devlink_param_value init_val)
+{
+ struct devlink_param_item *param_item;
+
+ devl_assert_locked(devlink);
+
+ param_item = devlink_param_find_by_id(&devlink->params, param_id);
+ if (WARN_ON(!param_item))
+ return;
+
+ if (WARN_ON(!devlink_param_cmode_is_supported(param_item->param,
+ DEVLINK_PARAM_CMODE_DRIVERINIT)))
+ return;
+
+ param_item->driverinit_value = init_val;
+ param_item->driverinit_value_valid = true;
+ param_item->driverinit_default = init_val;
+
+ devlink_param_notify(devlink, 0, param_item, DEVLINK_CMD_PARAM_NEW);
+}
+EXPORT_SYMBOL_GPL(devl_param_driverinit_value_set);
+
+void devlink_params_driverinit_load_new(struct devlink *devlink)
+{
+ struct devlink_param_item *param_item;
+ unsigned long param_id;
+
+ xa_for_each(&devlink->params, param_id, param_item) {
+ if (!devlink_param_cmode_is_supported(param_item->param,
+ DEVLINK_PARAM_CMODE_DRIVERINIT) ||
+ !param_item->driverinit_value_new_valid)
+ continue;
+ param_item->driverinit_value = param_item->driverinit_value_new;
+ param_item->driverinit_value_valid = true;
+ param_item->driverinit_value_new_valid = false;
+ }
+}
+
+/**
+ * devl_param_value_changed - notify devlink on a parameter's value
+ * change. Should be called by the driver
+ * right after the change.
+ *
+ * @devlink: devlink
+ * @param_id: parameter ID
+ *
+ * This function should be used by the driver to notify devlink on value
+ * change, excluding driverinit configuration mode.
+ * For driverinit configuration mode driver should use the function
+ */
+void devl_param_value_changed(struct devlink *devlink, u32 param_id)
+{
+ struct devlink_param_item *param_item;
+
+ param_item = devlink_param_find_by_id(&devlink->params, param_id);
+ WARN_ON(!param_item);
+
+ devlink_param_notify(devlink, 0, param_item, DEVLINK_CMD_PARAM_NEW);
+}
+EXPORT_SYMBOL_GPL(devl_param_value_changed);
diff --git a/net/devlink/port.c b/net/devlink/port.c
new file mode 100644
index 000000000000..93d8a25bb920
--- /dev/null
+++ b/net/devlink/port.c
@@ -0,0 +1,1604 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com>
+ */
+
+#include "devl_internal.h"
+
+#define DEVLINK_PORT_FN_CAPS_VALID_MASK \
+ (_BITUL(__DEVLINK_PORT_FN_ATTR_CAPS_MAX) - 1)
+
+static const struct nla_policy devlink_function_nl_policy[DEVLINK_PORT_FUNCTION_ATTR_MAX + 1] = {
+ [DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR] = { .type = NLA_BINARY },
+ [DEVLINK_PORT_FN_ATTR_STATE] =
+ NLA_POLICY_RANGE(NLA_U8, DEVLINK_PORT_FN_STATE_INACTIVE,
+ DEVLINK_PORT_FN_STATE_ACTIVE),
+ [DEVLINK_PORT_FN_ATTR_CAPS] =
+ NLA_POLICY_BITFIELD32(DEVLINK_PORT_FN_CAPS_VALID_MASK),
+ [DEVLINK_PORT_FN_ATTR_MAX_IO_EQS] = { .type = NLA_U32 },
+};
+
+#define ASSERT_DEVLINK_PORT_REGISTERED(devlink_port) \
+ WARN_ON_ONCE(!(devlink_port)->registered)
+#define ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port) \
+ WARN_ON_ONCE((devlink_port)->registered)
+
+struct devlink_port *devlink_port_get_by_index(struct devlink *devlink,
+ unsigned int port_index)
+{
+ return xa_load(&devlink->ports, port_index);
+}
+
+struct devlink_port *devlink_port_get_from_attrs(struct devlink *devlink,
+ struct nlattr **attrs)
+{
+ if (attrs[DEVLINK_ATTR_PORT_INDEX]) {
+ u32 port_index = nla_get_u32(attrs[DEVLINK_ATTR_PORT_INDEX]);
+ struct devlink_port *devlink_port;
+
+ devlink_port = devlink_port_get_by_index(devlink, port_index);
+ if (!devlink_port)
+ return ERR_PTR(-ENODEV);
+ return devlink_port;
+ }
+ return ERR_PTR(-EINVAL);
+}
+
+struct devlink_port *devlink_port_get_from_info(struct devlink *devlink,
+ struct genl_info *info)
+{
+ return devlink_port_get_from_attrs(devlink, info->attrs);
+}
+
+static void devlink_port_fn_cap_fill(struct nla_bitfield32 *caps,
+ u32 cap, bool is_enable)
+{
+ caps->selector |= cap;
+ if (is_enable)
+ caps->value |= cap;
+}
+
+static int devlink_port_fn_roce_fill(struct devlink_port *devlink_port,
+ struct nla_bitfield32 *caps,
+ struct netlink_ext_ack *extack)
+{
+ bool is_enable;
+ int err;
+
+ if (!devlink_port->ops->port_fn_roce_get)
+ return 0;
+
+ err = devlink_port->ops->port_fn_roce_get(devlink_port, &is_enable,
+ extack);
+ if (err) {
+ if (err == -EOPNOTSUPP)
+ return 0;
+ return err;
+ }
+
+ devlink_port_fn_cap_fill(caps, DEVLINK_PORT_FN_CAP_ROCE, is_enable);
+ return 0;
+}
+
+static int devlink_port_fn_migratable_fill(struct devlink_port *devlink_port,
+ struct nla_bitfield32 *caps,
+ struct netlink_ext_ack *extack)
+{
+ bool is_enable;
+ int err;
+
+ if (!devlink_port->ops->port_fn_migratable_get ||
+ devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF)
+ return 0;
+
+ err = devlink_port->ops->port_fn_migratable_get(devlink_port,
+ &is_enable, extack);
+ if (err) {
+ if (err == -EOPNOTSUPP)
+ return 0;
+ return err;
+ }
+
+ devlink_port_fn_cap_fill(caps, DEVLINK_PORT_FN_CAP_MIGRATABLE, is_enable);
+ return 0;
+}
+
+static int devlink_port_fn_ipsec_crypto_fill(struct devlink_port *devlink_port,
+ struct nla_bitfield32 *caps,
+ struct netlink_ext_ack *extack)
+{
+ bool is_enable;
+ int err;
+
+ if (!devlink_port->ops->port_fn_ipsec_crypto_get ||
+ devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF)
+ return 0;
+
+ err = devlink_port->ops->port_fn_ipsec_crypto_get(devlink_port, &is_enable, extack);
+ if (err) {
+ if (err == -EOPNOTSUPP)
+ return 0;
+ return err;
+ }
+
+ devlink_port_fn_cap_fill(caps, DEVLINK_PORT_FN_CAP_IPSEC_CRYPTO, is_enable);
+ return 0;
+}
+
+static int devlink_port_fn_ipsec_packet_fill(struct devlink_port *devlink_port,
+ struct nla_bitfield32 *caps,
+ struct netlink_ext_ack *extack)
+{
+ bool is_enable;
+ int err;
+
+ if (!devlink_port->ops->port_fn_ipsec_packet_get ||
+ devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF)
+ return 0;
+
+ err = devlink_port->ops->port_fn_ipsec_packet_get(devlink_port, &is_enable, extack);
+ if (err) {
+ if (err == -EOPNOTSUPP)
+ return 0;
+ return err;
+ }
+
+ devlink_port_fn_cap_fill(caps, DEVLINK_PORT_FN_CAP_IPSEC_PACKET, is_enable);
+ return 0;
+}
+
+static int devlink_port_fn_caps_fill(struct devlink_port *devlink_port,
+ struct sk_buff *msg,
+ struct netlink_ext_ack *extack,
+ bool *msg_updated)
+{
+ struct nla_bitfield32 caps = {};
+ int err;
+
+ err = devlink_port_fn_roce_fill(devlink_port, &caps, extack);
+ if (err)
+ return err;
+
+ err = devlink_port_fn_migratable_fill(devlink_port, &caps, extack);
+ if (err)
+ return err;
+
+ err = devlink_port_fn_ipsec_crypto_fill(devlink_port, &caps, extack);
+ if (err)
+ return err;
+
+ err = devlink_port_fn_ipsec_packet_fill(devlink_port, &caps, extack);
+ if (err)
+ return err;
+
+ if (!caps.selector)
+ return 0;
+ err = nla_put_bitfield32(msg, DEVLINK_PORT_FN_ATTR_CAPS, caps.value,
+ caps.selector);
+ if (err)
+ return err;
+
+ *msg_updated = true;
+ return 0;
+}
+
+static int devlink_port_fn_max_io_eqs_fill(struct devlink_port *port,
+ struct sk_buff *msg,
+ struct netlink_ext_ack *extack,
+ bool *msg_updated)
+{
+ u32 max_io_eqs;
+ int err;
+
+ if (!port->ops->port_fn_max_io_eqs_get)
+ return 0;
+
+ err = port->ops->port_fn_max_io_eqs_get(port, &max_io_eqs, extack);
+ if (err) {
+ if (err == -EOPNOTSUPP)
+ return 0;
+ return err;
+ }
+ err = nla_put_u32(msg, DEVLINK_PORT_FN_ATTR_MAX_IO_EQS, max_io_eqs);
+ if (err)
+ return err;
+ *msg_updated = true;
+ return 0;
+}
+
+int devlink_nl_port_handle_fill(struct sk_buff *msg, struct devlink_port *devlink_port)
+{
+ if (devlink_nl_put_handle(msg, devlink_port->devlink))
+ return -EMSGSIZE;
+ if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index))
+ return -EMSGSIZE;
+ return 0;
+}
+
+size_t devlink_nl_port_handle_size(struct devlink_port *devlink_port)
+{
+ struct devlink *devlink = devlink_port->devlink;
+
+ return nla_total_size(strlen(devlink->dev->bus->name) + 1) /* DEVLINK_ATTR_BUS_NAME */
+ + nla_total_size(strlen(dev_name(devlink->dev)) + 1) /* DEVLINK_ATTR_DEV_NAME */
+ + nla_total_size(4); /* DEVLINK_ATTR_PORT_INDEX */
+}
+
+static int devlink_nl_port_attrs_put(struct sk_buff *msg,
+ struct devlink_port *devlink_port)
+{
+ struct devlink_port_attrs *attrs = &devlink_port->attrs;
+
+ if (!devlink_port->attrs_set)
+ return 0;
+ if (attrs->lanes) {
+ if (nla_put_u32(msg, DEVLINK_ATTR_PORT_LANES, attrs->lanes))
+ return -EMSGSIZE;
+ }
+ if (nla_put_u8(msg, DEVLINK_ATTR_PORT_SPLITTABLE, attrs->splittable))
+ return -EMSGSIZE;
+ if (nla_put_u16(msg, DEVLINK_ATTR_PORT_FLAVOUR, attrs->flavour))
+ return -EMSGSIZE;
+ switch (devlink_port->attrs.flavour) {
+ case DEVLINK_PORT_FLAVOUR_PCI_PF:
+ if (nla_put_u32(msg, DEVLINK_ATTR_PORT_CONTROLLER_NUMBER,
+ attrs->pci_pf.controller) ||
+ nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, attrs->pci_pf.pf))
+ return -EMSGSIZE;
+ if (nla_put_u8(msg, DEVLINK_ATTR_PORT_EXTERNAL, attrs->pci_pf.external))
+ return -EMSGSIZE;
+ break;
+ case DEVLINK_PORT_FLAVOUR_PCI_VF:
+ if (nla_put_u32(msg, DEVLINK_ATTR_PORT_CONTROLLER_NUMBER,
+ attrs->pci_vf.controller) ||
+ nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, attrs->pci_vf.pf) ||
+ nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_VF_NUMBER, attrs->pci_vf.vf))
+ return -EMSGSIZE;
+ if (nla_put_u8(msg, DEVLINK_ATTR_PORT_EXTERNAL, attrs->pci_vf.external))
+ return -EMSGSIZE;
+ break;
+ case DEVLINK_PORT_FLAVOUR_PCI_SF:
+ if (nla_put_u32(msg, DEVLINK_ATTR_PORT_CONTROLLER_NUMBER,
+ attrs->pci_sf.controller) ||
+ nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER,
+ attrs->pci_sf.pf) ||
+ nla_put_u32(msg, DEVLINK_ATTR_PORT_PCI_SF_NUMBER,
+ attrs->pci_sf.sf))
+ return -EMSGSIZE;
+ break;
+ case DEVLINK_PORT_FLAVOUR_PHYSICAL:
+ case DEVLINK_PORT_FLAVOUR_CPU:
+ case DEVLINK_PORT_FLAVOUR_DSA:
+ if (nla_put_u32(msg, DEVLINK_ATTR_PORT_NUMBER,
+ attrs->phys.port_number))
+ return -EMSGSIZE;
+ if (!attrs->split)
+ return 0;
+ if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_GROUP,
+ attrs->phys.port_number))
+ return -EMSGSIZE;
+ if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_SUBPORT_NUMBER,
+ attrs->phys.split_subport_number))
+ return -EMSGSIZE;
+ break;
+ default:
+ break;
+ }
+ return 0;
+}
+
+static int devlink_port_fn_hw_addr_fill(struct devlink_port *port,
+ struct sk_buff *msg,
+ struct netlink_ext_ack *extack,
+ bool *msg_updated)
+{
+ u8 hw_addr[MAX_ADDR_LEN];
+ int hw_addr_len;
+ int err;
+
+ if (!port->ops->port_fn_hw_addr_get)
+ return 0;
+
+ err = port->ops->port_fn_hw_addr_get(port, hw_addr, &hw_addr_len,
+ extack);
+ if (err) {
+ if (err == -EOPNOTSUPP)
+ return 0;
+ return err;
+ }
+ err = nla_put(msg, DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR, hw_addr_len, hw_addr);
+ if (err)
+ return err;
+ *msg_updated = true;
+ return 0;
+}
+
+static bool
+devlink_port_fn_state_valid(enum devlink_port_fn_state state)
+{
+ return state == DEVLINK_PORT_FN_STATE_INACTIVE ||
+ state == DEVLINK_PORT_FN_STATE_ACTIVE;
+}
+
+static bool
+devlink_port_fn_opstate_valid(enum devlink_port_fn_opstate opstate)
+{
+ return opstate == DEVLINK_PORT_FN_OPSTATE_DETACHED ||
+ opstate == DEVLINK_PORT_FN_OPSTATE_ATTACHED;
+}
+
+static int devlink_port_fn_state_fill(struct devlink_port *port,
+ struct sk_buff *msg,
+ struct netlink_ext_ack *extack,
+ bool *msg_updated)
+{
+ enum devlink_port_fn_opstate opstate;
+ enum devlink_port_fn_state state;
+ int err;
+
+ if (!port->ops->port_fn_state_get)
+ return 0;
+
+ err = port->ops->port_fn_state_get(port, &state, &opstate, extack);
+ if (err) {
+ if (err == -EOPNOTSUPP)
+ return 0;
+ return err;
+ }
+ if (!devlink_port_fn_state_valid(state)) {
+ WARN_ON_ONCE(1);
+ NL_SET_ERR_MSG(extack, "Invalid state read from driver");
+ return -EINVAL;
+ }
+ if (!devlink_port_fn_opstate_valid(opstate)) {
+ WARN_ON_ONCE(1);
+ NL_SET_ERR_MSG(extack, "Invalid operational state read from driver");
+ return -EINVAL;
+ }
+ if (nla_put_u8(msg, DEVLINK_PORT_FN_ATTR_STATE, state) ||
+ nla_put_u8(msg, DEVLINK_PORT_FN_ATTR_OPSTATE, opstate))
+ return -EMSGSIZE;
+ *msg_updated = true;
+ return 0;
+}
+
+static int
+devlink_port_fn_mig_set(struct devlink_port *devlink_port, bool enable,
+ struct netlink_ext_ack *extack)
+{
+ return devlink_port->ops->port_fn_migratable_set(devlink_port, enable,
+ extack);
+}
+
+static int
+devlink_port_fn_roce_set(struct devlink_port *devlink_port, bool enable,
+ struct netlink_ext_ack *extack)
+{
+ return devlink_port->ops->port_fn_roce_set(devlink_port, enable,
+ extack);
+}
+
+static int
+devlink_port_fn_ipsec_crypto_set(struct devlink_port *devlink_port, bool enable,
+ struct netlink_ext_ack *extack)
+{
+ return devlink_port->ops->port_fn_ipsec_crypto_set(devlink_port, enable, extack);
+}
+
+static int
+devlink_port_fn_ipsec_packet_set(struct devlink_port *devlink_port, bool enable,
+ struct netlink_ext_ack *extack)
+{
+ return devlink_port->ops->port_fn_ipsec_packet_set(devlink_port, enable, extack);
+}
+
+static int devlink_port_fn_caps_set(struct devlink_port *devlink_port,
+ const struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ struct nla_bitfield32 caps;
+ u32 caps_value;
+ int err;
+
+ caps = nla_get_bitfield32(attr);
+ caps_value = caps.value & caps.selector;
+ if (caps.selector & DEVLINK_PORT_FN_CAP_ROCE) {
+ err = devlink_port_fn_roce_set(devlink_port,
+ caps_value & DEVLINK_PORT_FN_CAP_ROCE,
+ extack);
+ if (err)
+ return err;
+ }
+ if (caps.selector & DEVLINK_PORT_FN_CAP_MIGRATABLE) {
+ err = devlink_port_fn_mig_set(devlink_port, caps_value &
+ DEVLINK_PORT_FN_CAP_MIGRATABLE,
+ extack);
+ if (err)
+ return err;
+ }
+ if (caps.selector & DEVLINK_PORT_FN_CAP_IPSEC_CRYPTO) {
+ err = devlink_port_fn_ipsec_crypto_set(devlink_port, caps_value &
+ DEVLINK_PORT_FN_CAP_IPSEC_CRYPTO,
+ extack);
+ if (err)
+ return err;
+ }
+ if (caps.selector & DEVLINK_PORT_FN_CAP_IPSEC_PACKET) {
+ err = devlink_port_fn_ipsec_packet_set(devlink_port, caps_value &
+ DEVLINK_PORT_FN_CAP_IPSEC_PACKET,
+ extack);
+ if (err)
+ return err;
+ }
+ return 0;
+}
+
+static int
+devlink_port_fn_max_io_eqs_set(struct devlink_port *devlink_port,
+ const struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ u32 max_io_eqs;
+
+ max_io_eqs = nla_get_u32(attr);
+ return devlink_port->ops->port_fn_max_io_eqs_set(devlink_port,
+ max_io_eqs, extack);
+}
+
+static int
+devlink_nl_port_function_attrs_put(struct sk_buff *msg, struct devlink_port *port,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *function_attr;
+ bool msg_updated = false;
+ int err;
+
+ function_attr = nla_nest_start_noflag(msg, DEVLINK_ATTR_PORT_FUNCTION);
+ if (!function_attr)
+ return -EMSGSIZE;
+
+ err = devlink_port_fn_hw_addr_fill(port, msg, extack, &msg_updated);
+ if (err)
+ goto out;
+ err = devlink_port_fn_caps_fill(port, msg, extack, &msg_updated);
+ if (err)
+ goto out;
+ err = devlink_port_fn_state_fill(port, msg, extack, &msg_updated);
+ if (err)
+ goto out;
+ err = devlink_port_fn_max_io_eqs_fill(port, msg, extack, &msg_updated);
+ if (err)
+ goto out;
+ err = devlink_rel_devlink_handle_put(msg, port->devlink,
+ port->rel_index,
+ DEVLINK_PORT_FN_ATTR_DEVLINK,
+ &msg_updated);
+
+out:
+ if (err || !msg_updated)
+ nla_nest_cancel(msg, function_attr);
+ else
+ nla_nest_end(msg, function_attr);
+ return err;
+}
+
+static int devlink_nl_port_fill(struct sk_buff *msg,
+ struct devlink_port *devlink_port,
+ enum devlink_command cmd, u32 portid, u32 seq,
+ int flags, struct netlink_ext_ack *extack)
+{
+ struct devlink *devlink = devlink_port->devlink;
+ void *hdr;
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index))
+ goto nla_put_failure;
+
+ spin_lock_bh(&devlink_port->type_lock);
+ if (nla_put_u16(msg, DEVLINK_ATTR_PORT_TYPE, devlink_port->type))
+ goto nla_put_failure_type_locked;
+ if (devlink_port->desired_type != DEVLINK_PORT_TYPE_NOTSET &&
+ nla_put_u16(msg, DEVLINK_ATTR_PORT_DESIRED_TYPE,
+ devlink_port->desired_type))
+ goto nla_put_failure_type_locked;
+ if (devlink_port->type == DEVLINK_PORT_TYPE_ETH) {
+ if (devlink_port->type_eth.netdev &&
+ (nla_put_u32(msg, DEVLINK_ATTR_PORT_NETDEV_IFINDEX,
+ devlink_port->type_eth.ifindex) ||
+ nla_put_string(msg, DEVLINK_ATTR_PORT_NETDEV_NAME,
+ devlink_port->type_eth.ifname)))
+ goto nla_put_failure_type_locked;
+ }
+ if (devlink_port->type == DEVLINK_PORT_TYPE_IB) {
+ struct ib_device *ibdev = devlink_port->type_ib.ibdev;
+
+ if (ibdev &&
+ nla_put_string(msg, DEVLINK_ATTR_PORT_IBDEV_NAME,
+ ibdev->name))
+ goto nla_put_failure_type_locked;
+ }
+ spin_unlock_bh(&devlink_port->type_lock);
+ if (devlink_nl_port_attrs_put(msg, devlink_port))
+ goto nla_put_failure;
+ if (devlink_nl_port_function_attrs_put(msg, devlink_port, extack))
+ goto nla_put_failure;
+ if (devlink_port->linecard &&
+ nla_put_u32(msg, DEVLINK_ATTR_LINECARD_INDEX,
+ devlink_linecard_index(devlink_port->linecard)))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure_type_locked:
+ spin_unlock_bh(&devlink_port->type_lock);
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static void devlink_port_notify(struct devlink_port *devlink_port,
+ enum devlink_command cmd)
+{
+ struct devlink *devlink = devlink_port->devlink;
+ struct devlink_obj_desc desc;
+ struct sk_buff *msg;
+ int err;
+
+ WARN_ON(cmd != DEVLINK_CMD_PORT_NEW && cmd != DEVLINK_CMD_PORT_DEL);
+
+ if (!__devl_is_registered(devlink) || !devlink_nl_notify_need(devlink))
+ return;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return;
+
+ err = devlink_nl_port_fill(msg, devlink_port, cmd, 0, 0, 0, NULL);
+ if (err) {
+ nlmsg_free(msg);
+ return;
+ }
+
+ devlink_nl_obj_desc_init(&desc, devlink);
+ devlink_nl_obj_desc_port_set(&desc, devlink_port);
+ devlink_nl_notify_send_desc(devlink, msg, &desc);
+}
+
+static void devlink_ports_notify(struct devlink *devlink,
+ enum devlink_command cmd)
+{
+ struct devlink_port *devlink_port;
+ unsigned long port_index;
+
+ xa_for_each(&devlink->ports, port_index, devlink_port)
+ devlink_port_notify(devlink_port, cmd);
+}
+
+void devlink_ports_notify_register(struct devlink *devlink)
+{
+ devlink_ports_notify(devlink, DEVLINK_CMD_PORT_NEW);
+}
+
+void devlink_ports_notify_unregister(struct devlink *devlink)
+{
+ devlink_ports_notify(devlink, DEVLINK_CMD_PORT_DEL);
+}
+
+int devlink_nl_port_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct devlink_port *devlink_port = info->user_ptr[1];
+ struct sk_buff *msg;
+ int err;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ err = devlink_nl_port_fill(msg, devlink_port, DEVLINK_CMD_PORT_NEW,
+ info->snd_portid, info->snd_seq, 0,
+ info->extack);
+ if (err) {
+ nlmsg_free(msg);
+ return err;
+ }
+
+ return genlmsg_reply(msg, info);
+}
+
+static int
+devlink_nl_port_get_dump_one(struct sk_buff *msg, struct devlink *devlink,
+ struct netlink_callback *cb, int flags)
+{
+ struct devlink_nl_dump_state *state = devlink_dump_state(cb);
+ struct devlink_port *devlink_port;
+ unsigned long port_index;
+ int err = 0;
+
+ xa_for_each_start(&devlink->ports, port_index, devlink_port, state->idx) {
+ err = devlink_nl_port_fill(msg, devlink_port,
+ DEVLINK_CMD_PORT_NEW,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, flags,
+ cb->extack);
+ if (err) {
+ state->idx = port_index;
+ break;
+ }
+ }
+
+ return err;
+}
+
+int devlink_nl_port_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ return devlink_nl_dumpit(skb, cb, devlink_nl_port_get_dump_one);
+}
+
+static int devlink_port_type_set(struct devlink_port *devlink_port,
+ enum devlink_port_type port_type)
+
+{
+ int err;
+
+ if (!devlink_port->ops->port_type_set)
+ return -EOPNOTSUPP;
+
+ if (port_type == devlink_port->type)
+ return 0;
+
+ err = devlink_port->ops->port_type_set(devlink_port, port_type);
+ if (err)
+ return err;
+
+ devlink_port->desired_type = port_type;
+ devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
+ return 0;
+}
+
+static int devlink_port_function_hw_addr_set(struct devlink_port *port,
+ const struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ const u8 *hw_addr;
+ int hw_addr_len;
+
+ hw_addr = nla_data(attr);
+ hw_addr_len = nla_len(attr);
+ if (hw_addr_len > MAX_ADDR_LEN) {
+ NL_SET_ERR_MSG(extack, "Port function hardware address too long");
+ return -EINVAL;
+ }
+ if (port->type == DEVLINK_PORT_TYPE_ETH) {
+ if (hw_addr_len != ETH_ALEN) {
+ NL_SET_ERR_MSG(extack, "Address must be 6 bytes for Ethernet device");
+ return -EINVAL;
+ }
+ if (!is_unicast_ether_addr(hw_addr)) {
+ NL_SET_ERR_MSG(extack, "Non-unicast hardware address unsupported");
+ return -EINVAL;
+ }
+ }
+
+ return port->ops->port_fn_hw_addr_set(port, hw_addr, hw_addr_len,
+ extack);
+}
+
+static int devlink_port_fn_state_set(struct devlink_port *port,
+ const struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ enum devlink_port_fn_state state;
+
+ state = nla_get_u8(attr);
+ return port->ops->port_fn_state_set(port, state, extack);
+}
+
+static int devlink_port_function_validate(struct devlink_port *devlink_port,
+ struct nlattr **tb,
+ struct netlink_ext_ack *extack)
+{
+ const struct devlink_port_ops *ops = devlink_port->ops;
+ struct nlattr *attr;
+
+ if (tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR] &&
+ !ops->port_fn_hw_addr_set) {
+ NL_SET_ERR_MSG_ATTR(extack, tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR],
+ "Port doesn't support function attributes");
+ return -EOPNOTSUPP;
+ }
+ if (tb[DEVLINK_PORT_FN_ATTR_STATE] && !ops->port_fn_state_set) {
+ NL_SET_ERR_MSG_ATTR(extack, tb[DEVLINK_PORT_FN_ATTR_STATE],
+ "Function does not support state setting");
+ return -EOPNOTSUPP;
+ }
+ attr = tb[DEVLINK_PORT_FN_ATTR_CAPS];
+ if (attr) {
+ struct nla_bitfield32 caps;
+
+ caps = nla_get_bitfield32(attr);
+ if (caps.selector & DEVLINK_PORT_FN_CAP_ROCE &&
+ !ops->port_fn_roce_set) {
+ NL_SET_ERR_MSG_ATTR(extack, attr,
+ "Port doesn't support RoCE function attribute");
+ return -EOPNOTSUPP;
+ }
+ if (caps.selector & DEVLINK_PORT_FN_CAP_MIGRATABLE) {
+ if (!ops->port_fn_migratable_set) {
+ NL_SET_ERR_MSG_ATTR(extack, attr,
+ "Port doesn't support migratable function attribute");
+ return -EOPNOTSUPP;
+ }
+ if (devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) {
+ NL_SET_ERR_MSG_ATTR(extack, attr,
+ "migratable function attribute supported for VFs only");
+ return -EOPNOTSUPP;
+ }
+ }
+ if (caps.selector & DEVLINK_PORT_FN_CAP_IPSEC_CRYPTO) {
+ if (!ops->port_fn_ipsec_crypto_set) {
+ NL_SET_ERR_MSG_ATTR(extack, attr,
+ "Port doesn't support ipsec_crypto function attribute");
+ return -EOPNOTSUPP;
+ }
+ if (devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) {
+ NL_SET_ERR_MSG_ATTR(extack, attr,
+ "ipsec_crypto function attribute supported for VFs only");
+ return -EOPNOTSUPP;
+ }
+ }
+ if (caps.selector & DEVLINK_PORT_FN_CAP_IPSEC_PACKET) {
+ if (!ops->port_fn_ipsec_packet_set) {
+ NL_SET_ERR_MSG_ATTR(extack, attr,
+ "Port doesn't support ipsec_packet function attribute");
+ return -EOPNOTSUPP;
+ }
+ if (devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) {
+ NL_SET_ERR_MSG_ATTR(extack, attr,
+ "ipsec_packet function attribute supported for VFs only");
+ return -EOPNOTSUPP;
+ }
+ }
+ }
+ if (tb[DEVLINK_PORT_FN_ATTR_MAX_IO_EQS] &&
+ !ops->port_fn_max_io_eqs_set) {
+ NL_SET_ERR_MSG_ATTR(extack, tb[DEVLINK_PORT_FN_ATTR_MAX_IO_EQS],
+ "Function does not support max_io_eqs setting");
+ return -EOPNOTSUPP;
+ }
+ return 0;
+}
+
+static int devlink_port_function_set(struct devlink_port *port,
+ const struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[DEVLINK_PORT_FUNCTION_ATTR_MAX + 1];
+ int err;
+
+ err = nla_parse_nested(tb, DEVLINK_PORT_FUNCTION_ATTR_MAX, attr,
+ devlink_function_nl_policy, extack);
+ if (err < 0) {
+ NL_SET_ERR_MSG(extack, "Fail to parse port function attributes");
+ return err;
+ }
+
+ err = devlink_port_function_validate(port, tb, extack);
+ if (err)
+ return err;
+
+ attr = tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR];
+ if (attr) {
+ err = devlink_port_function_hw_addr_set(port, attr, extack);
+ if (err)
+ return err;
+ }
+
+ attr = tb[DEVLINK_PORT_FN_ATTR_CAPS];
+ if (attr) {
+ err = devlink_port_fn_caps_set(port, attr, extack);
+ if (err)
+ return err;
+ }
+
+ attr = tb[DEVLINK_PORT_FN_ATTR_MAX_IO_EQS];
+ if (attr) {
+ err = devlink_port_fn_max_io_eqs_set(port, attr, extack);
+ if (err)
+ return err;
+ }
+
+ /* Keep this as the last function attribute set, so that when
+ * multiple port function attributes are set along with state,
+ * Those can be applied first before activating the state.
+ */
+ attr = tb[DEVLINK_PORT_FN_ATTR_STATE];
+ if (attr)
+ err = devlink_port_fn_state_set(port, attr, extack);
+
+ if (!err)
+ devlink_port_notify(port, DEVLINK_CMD_PORT_NEW);
+ return err;
+}
+
+int devlink_nl_port_set_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct devlink_port *devlink_port = info->user_ptr[1];
+ int err;
+
+ if (info->attrs[DEVLINK_ATTR_PORT_TYPE]) {
+ enum devlink_port_type port_type;
+
+ port_type = nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_TYPE]);
+ err = devlink_port_type_set(devlink_port, port_type);
+ if (err)
+ return err;
+ }
+
+ if (info->attrs[DEVLINK_ATTR_PORT_FUNCTION]) {
+ struct nlattr *attr = info->attrs[DEVLINK_ATTR_PORT_FUNCTION];
+ struct netlink_ext_ack *extack = info->extack;
+
+ err = devlink_port_function_set(devlink_port, attr, extack);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+int devlink_nl_port_split_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct devlink_port *devlink_port = info->user_ptr[1];
+ struct devlink *devlink = info->user_ptr[0];
+ u32 count;
+
+ if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PORT_SPLIT_COUNT))
+ return -EINVAL;
+ if (!devlink_port->ops->port_split)
+ return -EOPNOTSUPP;
+
+ count = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_SPLIT_COUNT]);
+
+ if (!devlink_port->attrs.splittable) {
+ /* Split ports cannot be split. */
+ if (devlink_port->attrs.split)
+ NL_SET_ERR_MSG(info->extack, "Port cannot be split further");
+ else
+ NL_SET_ERR_MSG(info->extack, "Port cannot be split");
+ return -EINVAL;
+ }
+
+ if (count < 2 || !is_power_of_2(count) || count > devlink_port->attrs.lanes) {
+ NL_SET_ERR_MSG(info->extack, "Invalid split count");
+ return -EINVAL;
+ }
+
+ return devlink_port->ops->port_split(devlink, devlink_port, count,
+ info->extack);
+}
+
+int devlink_nl_port_unsplit_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct devlink_port *devlink_port = info->user_ptr[1];
+ struct devlink *devlink = info->user_ptr[0];
+
+ if (!devlink_port->ops->port_unsplit)
+ return -EOPNOTSUPP;
+ return devlink_port->ops->port_unsplit(devlink, devlink_port, info->extack);
+}
+
+int devlink_nl_port_new_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct netlink_ext_ack *extack = info->extack;
+ struct devlink_port_new_attrs new_attrs = {};
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_port *devlink_port;
+ struct sk_buff *msg;
+ int err;
+
+ if (!devlink->ops->port_new)
+ return -EOPNOTSUPP;
+
+ if (!info->attrs[DEVLINK_ATTR_PORT_FLAVOUR] ||
+ !info->attrs[DEVLINK_ATTR_PORT_PCI_PF_NUMBER]) {
+ NL_SET_ERR_MSG(extack, "Port flavour or PCI PF are not specified");
+ return -EINVAL;
+ }
+ new_attrs.flavour = nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_FLAVOUR]);
+ new_attrs.pfnum =
+ nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_PCI_PF_NUMBER]);
+
+ if (info->attrs[DEVLINK_ATTR_PORT_INDEX]) {
+ /* Port index of the new port being created by driver. */
+ new_attrs.port_index =
+ nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]);
+ new_attrs.port_index_valid = true;
+ }
+ if (info->attrs[DEVLINK_ATTR_PORT_CONTROLLER_NUMBER]) {
+ new_attrs.controller =
+ nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_CONTROLLER_NUMBER]);
+ new_attrs.controller_valid = true;
+ }
+ if (new_attrs.flavour == DEVLINK_PORT_FLAVOUR_PCI_SF &&
+ info->attrs[DEVLINK_ATTR_PORT_PCI_SF_NUMBER]) {
+ new_attrs.sfnum = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_PCI_SF_NUMBER]);
+ new_attrs.sfnum_valid = true;
+ }
+
+ err = devlink->ops->port_new(devlink, &new_attrs,
+ extack, &devlink_port);
+ if (err)
+ return err;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg) {
+ err = -ENOMEM;
+ goto err_out_port_del;
+ }
+ err = devlink_nl_port_fill(msg, devlink_port, DEVLINK_CMD_PORT_NEW,
+ info->snd_portid, info->snd_seq, 0, NULL);
+ if (WARN_ON_ONCE(err))
+ goto err_out_msg_free;
+ err = genlmsg_reply(msg, info);
+ if (err)
+ goto err_out_port_del;
+ return 0;
+
+err_out_msg_free:
+ nlmsg_free(msg);
+err_out_port_del:
+ devlink_port->ops->port_del(devlink, devlink_port, NULL);
+ return err;
+}
+
+int devlink_nl_port_del_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct devlink_port *devlink_port = info->user_ptr[1];
+ struct netlink_ext_ack *extack = info->extack;
+ struct devlink *devlink = info->user_ptr[0];
+
+ if (!devlink_port->ops->port_del)
+ return -EOPNOTSUPP;
+
+ return devlink_port->ops->port_del(devlink, devlink_port, extack);
+}
+
+static void devlink_port_type_warn(struct work_struct *work)
+{
+ struct devlink_port *port = container_of(to_delayed_work(work),
+ struct devlink_port,
+ type_warn_dw);
+ dev_warn(port->devlink->dev, "Type was not set for devlink port.");
+}
+
+static bool devlink_port_type_should_warn(struct devlink_port *devlink_port)
+{
+ /* Ignore CPU and DSA flavours. */
+ return devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_CPU &&
+ devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_DSA &&
+ devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_UNUSED;
+}
+
+#define DEVLINK_PORT_TYPE_WARN_TIMEOUT (HZ * 3600)
+
+static void devlink_port_type_warn_schedule(struct devlink_port *devlink_port)
+{
+ if (!devlink_port_type_should_warn(devlink_port))
+ return;
+ /* Schedule a work to WARN in case driver does not set port
+ * type within timeout.
+ */
+ schedule_delayed_work(&devlink_port->type_warn_dw,
+ DEVLINK_PORT_TYPE_WARN_TIMEOUT);
+}
+
+static void devlink_port_type_warn_cancel(struct devlink_port *devlink_port)
+{
+ if (!devlink_port_type_should_warn(devlink_port))
+ return;
+ cancel_delayed_work_sync(&devlink_port->type_warn_dw);
+}
+
+/**
+ * devlink_port_init() - Init devlink port
+ *
+ * @devlink: devlink
+ * @devlink_port: devlink port
+ *
+ * Initialize essential stuff that is needed for functions
+ * that may be called before devlink port registration.
+ * Call to this function is optional and not needed
+ * in case the driver does not use such functions.
+ */
+void devlink_port_init(struct devlink *devlink,
+ struct devlink_port *devlink_port)
+{
+ if (devlink_port->initialized)
+ return;
+ devlink_port->devlink = devlink;
+ INIT_LIST_HEAD(&devlink_port->region_list);
+ devlink_port->initialized = true;
+}
+EXPORT_SYMBOL_GPL(devlink_port_init);
+
+/**
+ * devlink_port_fini() - Deinitialize devlink port
+ *
+ * @devlink_port: devlink port
+ *
+ * Deinitialize essential stuff that is in use for functions
+ * that may be called after devlink port unregistration.
+ * Call to this function is optional and not needed
+ * in case the driver does not use such functions.
+ */
+void devlink_port_fini(struct devlink_port *devlink_port)
+{
+ WARN_ON(!list_empty(&devlink_port->region_list));
+}
+EXPORT_SYMBOL_GPL(devlink_port_fini);
+
+static const struct devlink_port_ops devlink_port_dummy_ops = {};
+
+/**
+ * devl_port_register_with_ops() - Register devlink port
+ *
+ * @devlink: devlink
+ * @devlink_port: devlink port
+ * @port_index: driver-specific numerical identifier of the port
+ * @ops: port ops
+ *
+ * Register devlink port with provided port index. User can use
+ * any indexing, even hw-related one. devlink_port structure
+ * is convenient to be embedded inside user driver private structure.
+ * Note that the caller should take care of zeroing the devlink_port
+ * structure.
+ */
+int devl_port_register_with_ops(struct devlink *devlink,
+ struct devlink_port *devlink_port,
+ unsigned int port_index,
+ const struct devlink_port_ops *ops)
+{
+ int err;
+
+ devl_assert_locked(devlink);
+
+ ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port);
+
+ devlink_port_init(devlink, devlink_port);
+ devlink_port->registered = true;
+ devlink_port->index = port_index;
+ devlink_port->ops = ops ? ops : &devlink_port_dummy_ops;
+ spin_lock_init(&devlink_port->type_lock);
+ INIT_LIST_HEAD(&devlink_port->reporter_list);
+ err = xa_insert(&devlink->ports, port_index, devlink_port, GFP_KERNEL);
+ if (err) {
+ devlink_port->registered = false;
+ return err;
+ }
+
+ INIT_DELAYED_WORK(&devlink_port->type_warn_dw, &devlink_port_type_warn);
+ devlink_port_type_warn_schedule(devlink_port);
+ devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(devl_port_register_with_ops);
+
+/**
+ * devlink_port_register_with_ops - Register devlink port
+ *
+ * @devlink: devlink
+ * @devlink_port: devlink port
+ * @port_index: driver-specific numerical identifier of the port
+ * @ops: port ops
+ *
+ * Register devlink port with provided port index. User can use
+ * any indexing, even hw-related one. devlink_port structure
+ * is convenient to be embedded inside user driver private structure.
+ * Note that the caller should take care of zeroing the devlink_port
+ * structure.
+ *
+ * Context: Takes and release devlink->lock <mutex>.
+ */
+int devlink_port_register_with_ops(struct devlink *devlink,
+ struct devlink_port *devlink_port,
+ unsigned int port_index,
+ const struct devlink_port_ops *ops)
+{
+ int err;
+
+ devl_lock(devlink);
+ err = devl_port_register_with_ops(devlink, devlink_port,
+ port_index, ops);
+ devl_unlock(devlink);
+ return err;
+}
+EXPORT_SYMBOL_GPL(devlink_port_register_with_ops);
+
+/**
+ * devl_port_unregister() - Unregister devlink port
+ *
+ * @devlink_port: devlink port
+ */
+void devl_port_unregister(struct devlink_port *devlink_port)
+{
+ lockdep_assert_held(&devlink_port->devlink->lock);
+ WARN_ON(devlink_port->type != DEVLINK_PORT_TYPE_NOTSET);
+
+ devlink_port_type_warn_cancel(devlink_port);
+ devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL);
+ xa_erase(&devlink_port->devlink->ports, devlink_port->index);
+ WARN_ON(!list_empty(&devlink_port->reporter_list));
+ devlink_port->registered = false;
+}
+EXPORT_SYMBOL_GPL(devl_port_unregister);
+
+/**
+ * devlink_port_unregister - Unregister devlink port
+ *
+ * @devlink_port: devlink port
+ *
+ * Context: Takes and release devlink->lock <mutex>.
+ */
+void devlink_port_unregister(struct devlink_port *devlink_port)
+{
+ struct devlink *devlink = devlink_port->devlink;
+
+ devl_lock(devlink);
+ devl_port_unregister(devlink_port);
+ devl_unlock(devlink);
+}
+EXPORT_SYMBOL_GPL(devlink_port_unregister);
+
+static void devlink_port_type_netdev_checks(struct devlink_port *devlink_port,
+ struct net_device *netdev)
+{
+ const struct net_device_ops *ops = netdev->netdev_ops;
+
+ /* If driver registers devlink port, it should set devlink port
+ * attributes accordingly so the compat functions are called
+ * and the original ops are not used.
+ */
+ if (ops->ndo_get_phys_port_name) {
+ /* Some drivers use the same set of ndos for netdevs
+ * that have devlink_port registered and also for
+ * those who don't. Make sure that ndo_get_phys_port_name
+ * returns -EOPNOTSUPP here in case it is defined.
+ * Warn if not.
+ */
+ char name[IFNAMSIZ];
+ int err;
+
+ err = ops->ndo_get_phys_port_name(netdev, name, sizeof(name));
+ WARN_ON(err != -EOPNOTSUPP);
+ }
+ if (ops->ndo_get_port_parent_id) {
+ /* Some drivers use the same set of ndos for netdevs
+ * that have devlink_port registered and also for
+ * those who don't. Make sure that ndo_get_port_parent_id
+ * returns -EOPNOTSUPP here in case it is defined.
+ * Warn if not.
+ */
+ struct netdev_phys_item_id ppid;
+ int err;
+
+ err = ops->ndo_get_port_parent_id(netdev, &ppid);
+ WARN_ON(err != -EOPNOTSUPP);
+ }
+}
+
+static void __devlink_port_type_set(struct devlink_port *devlink_port,
+ enum devlink_port_type type,
+ void *type_dev)
+{
+ struct net_device *netdev = type_dev;
+
+ ASSERT_DEVLINK_PORT_REGISTERED(devlink_port);
+
+ if (type == DEVLINK_PORT_TYPE_NOTSET) {
+ devlink_port_type_warn_schedule(devlink_port);
+ } else {
+ devlink_port_type_warn_cancel(devlink_port);
+ if (type == DEVLINK_PORT_TYPE_ETH && netdev)
+ devlink_port_type_netdev_checks(devlink_port, netdev);
+ }
+
+ spin_lock_bh(&devlink_port->type_lock);
+ devlink_port->type = type;
+ switch (type) {
+ case DEVLINK_PORT_TYPE_ETH:
+ devlink_port->type_eth.netdev = netdev;
+ if (netdev) {
+ ASSERT_RTNL();
+ devlink_port->type_eth.ifindex = netdev->ifindex;
+ BUILD_BUG_ON(sizeof(devlink_port->type_eth.ifname) !=
+ sizeof(netdev->name));
+ strcpy(devlink_port->type_eth.ifname, netdev->name);
+ }
+ break;
+ case DEVLINK_PORT_TYPE_IB:
+ devlink_port->type_ib.ibdev = type_dev;
+ break;
+ default:
+ break;
+ }
+ spin_unlock_bh(&devlink_port->type_lock);
+ devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
+}
+
+/**
+ * devlink_port_type_eth_set - Set port type to Ethernet
+ *
+ * @devlink_port: devlink port
+ *
+ * If driver is calling this, most likely it is doing something wrong.
+ */
+void devlink_port_type_eth_set(struct devlink_port *devlink_port)
+{
+ dev_warn(devlink_port->devlink->dev,
+ "devlink port type for port %d set to Ethernet without a software interface reference, device type not supported by the kernel?\n",
+ devlink_port->index);
+ __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_ETH, NULL);
+}
+EXPORT_SYMBOL_GPL(devlink_port_type_eth_set);
+
+/**
+ * devlink_port_type_ib_set - Set port type to InfiniBand
+ *
+ * @devlink_port: devlink port
+ * @ibdev: related IB device
+ */
+void devlink_port_type_ib_set(struct devlink_port *devlink_port,
+ struct ib_device *ibdev)
+{
+ __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_IB, ibdev);
+}
+EXPORT_SYMBOL_GPL(devlink_port_type_ib_set);
+
+/**
+ * devlink_port_type_clear - Clear port type
+ *
+ * @devlink_port: devlink port
+ *
+ * If driver is calling this for clearing Ethernet type, most likely
+ * it is doing something wrong.
+ */
+void devlink_port_type_clear(struct devlink_port *devlink_port)
+{
+ if (devlink_port->type == DEVLINK_PORT_TYPE_ETH)
+ dev_warn(devlink_port->devlink->dev,
+ "devlink port type for port %d cleared without a software interface reference, device type not supported by the kernel?\n",
+ devlink_port->index);
+ __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_NOTSET, NULL);
+}
+EXPORT_SYMBOL_GPL(devlink_port_type_clear);
+
+int devlink_port_netdevice_event(struct notifier_block *nb,
+ unsigned long event, void *ptr)
+{
+ struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
+ struct devlink_port *devlink_port = netdev->devlink_port;
+ struct devlink *devlink;
+
+ if (!devlink_port)
+ return NOTIFY_OK;
+ devlink = devlink_port->devlink;
+
+ switch (event) {
+ case NETDEV_POST_INIT:
+ /* Set the type but not netdev pointer. It is going to be set
+ * later on by NETDEV_REGISTER event. Happens once during
+ * netdevice register
+ */
+ __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_ETH,
+ NULL);
+ break;
+ case NETDEV_REGISTER:
+ case NETDEV_CHANGENAME:
+ if (devlink_net(devlink) != dev_net(netdev))
+ return NOTIFY_OK;
+ /* Set the netdev on top of previously set type. Note this
+ * event happens also during net namespace change so here
+ * we take into account netdev pointer appearing in this
+ * namespace.
+ */
+ __devlink_port_type_set(devlink_port, devlink_port->type,
+ netdev);
+ break;
+ case NETDEV_UNREGISTER:
+ if (devlink_net(devlink) != dev_net(netdev))
+ return NOTIFY_OK;
+ /* Clear netdev pointer, but not the type. This event happens
+ * also during net namespace change so we need to clear
+ * pointer to netdev that is going to another net namespace.
+ */
+ __devlink_port_type_set(devlink_port, devlink_port->type,
+ NULL);
+ break;
+ case NETDEV_PRE_UNINIT:
+ /* Clear the type and the netdev pointer. Happens one during
+ * netdevice unregister.
+ */
+ __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_NOTSET,
+ NULL);
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
+static void __devlink_port_attrs_set(struct devlink_port *devlink_port,
+ enum devlink_port_flavour flavour)
+{
+ struct devlink_port_attrs *attrs = &devlink_port->attrs;
+
+ devlink_port->attrs_set = true;
+ attrs->flavour = flavour;
+ if (attrs->switch_id.id_len) {
+ devlink_port->switch_port = true;
+ if (WARN_ON(attrs->switch_id.id_len > MAX_PHYS_ITEM_ID_LEN))
+ attrs->switch_id.id_len = MAX_PHYS_ITEM_ID_LEN;
+ } else {
+ devlink_port->switch_port = false;
+ }
+}
+
+/**
+ * devlink_port_attrs_set - Set port attributes
+ *
+ * @devlink_port: devlink port
+ * @attrs: devlink port attrs
+ */
+void devlink_port_attrs_set(struct devlink_port *devlink_port,
+ const struct devlink_port_attrs *attrs)
+{
+ ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port);
+ WARN_ON(attrs->splittable && attrs->split);
+
+ devlink_port->attrs = *attrs;
+ __devlink_port_attrs_set(devlink_port, attrs->flavour);
+}
+EXPORT_SYMBOL_GPL(devlink_port_attrs_set);
+
+/**
+ * devlink_port_attrs_pci_pf_set - Set PCI PF port attributes
+ *
+ * @devlink_port: devlink port
+ * @controller: associated controller number for the devlink port instance
+ * @pf: associated PCI function number for the devlink port instance
+ * @external: indicates if the port is for an external controller
+ */
+void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u32 controller,
+ u16 pf, bool external)
+{
+ struct devlink_port_attrs *attrs = &devlink_port->attrs;
+
+ ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port);
+
+ __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_PF);
+ attrs->pci_pf.controller = controller;
+ attrs->pci_pf.pf = pf;
+ attrs->pci_pf.external = external;
+}
+EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_pf_set);
+
+/**
+ * devlink_port_attrs_pci_vf_set - Set PCI VF port attributes
+ *
+ * @devlink_port: devlink port
+ * @controller: associated controller number for the devlink port instance
+ * @pf: associated PCI function number for the devlink port instance
+ * @vf: associated PCI VF number of a PF for the devlink port instance;
+ * VF number starts from 0 for the first PCI virtual function
+ * @external: indicates if the port is for an external controller
+ */
+void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 controller,
+ u16 pf, u16 vf, bool external)
+{
+ struct devlink_port_attrs *attrs = &devlink_port->attrs;
+
+ ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port);
+
+ __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_VF);
+ attrs->pci_vf.controller = controller;
+ attrs->pci_vf.pf = pf;
+ attrs->pci_vf.vf = vf;
+ attrs->pci_vf.external = external;
+}
+EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_vf_set);
+
+/**
+ * devlink_port_attrs_pci_sf_set - Set PCI SF port attributes
+ *
+ * @devlink_port: devlink port
+ * @controller: associated controller number for the devlink port instance
+ * @pf: associated PCI function number for the devlink port instance
+ * @sf: associated SF number of a PF for the devlink port instance
+ * @external: indicates if the port is for an external controller
+ */
+void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 controller,
+ u16 pf, u32 sf, bool external)
+{
+ struct devlink_port_attrs *attrs = &devlink_port->attrs;
+
+ ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port);
+
+ __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_SF);
+ attrs->pci_sf.controller = controller;
+ attrs->pci_sf.pf = pf;
+ attrs->pci_sf.sf = sf;
+ attrs->pci_sf.external = external;
+}
+EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_sf_set);
+
+static void devlink_port_rel_notify_cb(struct devlink *devlink, u32 port_index)
+{
+ struct devlink_port *devlink_port;
+
+ devlink_port = devlink_port_get_by_index(devlink, port_index);
+ if (!devlink_port)
+ return;
+ devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
+}
+
+static void devlink_port_rel_cleanup_cb(struct devlink *devlink, u32 port_index,
+ u32 rel_index)
+{
+ struct devlink_port *devlink_port;
+
+ devlink_port = devlink_port_get_by_index(devlink, port_index);
+ if (devlink_port && devlink_port->rel_index == rel_index)
+ devlink_port->rel_index = 0;
+}
+
+/**
+ * devl_port_fn_devlink_set - Attach peer devlink
+ * instance to port function.
+ * @devlink_port: devlink port
+ * @fn_devlink: devlink instance to attach
+ */
+int devl_port_fn_devlink_set(struct devlink_port *devlink_port,
+ struct devlink *fn_devlink)
+{
+ ASSERT_DEVLINK_PORT_REGISTERED(devlink_port);
+
+ if (WARN_ON(devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_SF ||
+ devlink_port->attrs.pci_sf.external))
+ return -EINVAL;
+
+ return devlink_rel_nested_in_add(&devlink_port->rel_index,
+ devlink_port->devlink->index,
+ devlink_port->index,
+ devlink_port_rel_notify_cb,
+ devlink_port_rel_cleanup_cb,
+ fn_devlink);
+}
+EXPORT_SYMBOL_GPL(devl_port_fn_devlink_set);
+
+/**
+ * devlink_port_linecard_set - Link port with a linecard
+ *
+ * @devlink_port: devlink port
+ * @linecard: devlink linecard
+ */
+void devlink_port_linecard_set(struct devlink_port *devlink_port,
+ struct devlink_linecard *linecard)
+{
+ ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port);
+
+ devlink_port->linecard = linecard;
+}
+EXPORT_SYMBOL_GPL(devlink_port_linecard_set);
+
+static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port,
+ char *name, size_t len)
+{
+ struct devlink_port_attrs *attrs = &devlink_port->attrs;
+ int n = 0;
+
+ if (!devlink_port->attrs_set || devlink_port->attrs.no_phys_port_name)
+ return -EOPNOTSUPP;
+
+ switch (attrs->flavour) {
+ case DEVLINK_PORT_FLAVOUR_PHYSICAL:
+ if (devlink_port->linecard)
+ n = snprintf(name, len, "l%u",
+ devlink_linecard_index(devlink_port->linecard));
+ if (n < len)
+ n += snprintf(name + n, len - n, "p%u",
+ attrs->phys.port_number);
+ if (n < len && attrs->split)
+ n += snprintf(name + n, len - n, "s%u",
+ attrs->phys.split_subport_number);
+ break;
+ case DEVLINK_PORT_FLAVOUR_CPU:
+ case DEVLINK_PORT_FLAVOUR_DSA:
+ case DEVLINK_PORT_FLAVOUR_UNUSED:
+ /* As CPU and DSA ports do not have a netdevice associated
+ * case should not ever happen.
+ */
+ WARN_ON(1);
+ return -EINVAL;
+ case DEVLINK_PORT_FLAVOUR_PCI_PF:
+ if (attrs->pci_pf.external) {
+ n = snprintf(name, len, "c%u", attrs->pci_pf.controller);
+ if (n >= len)
+ return -EINVAL;
+ len -= n;
+ name += n;
+ }
+ n = snprintf(name, len, "pf%u", attrs->pci_pf.pf);
+ break;
+ case DEVLINK_PORT_FLAVOUR_PCI_VF:
+ if (attrs->pci_vf.external) {
+ n = snprintf(name, len, "c%u", attrs->pci_vf.controller);
+ if (n >= len)
+ return -EINVAL;
+ len -= n;
+ name += n;
+ }
+ n = snprintf(name, len, "pf%uvf%u",
+ attrs->pci_vf.pf, attrs->pci_vf.vf);
+ break;
+ case DEVLINK_PORT_FLAVOUR_PCI_SF:
+ if (attrs->pci_sf.external) {
+ n = snprintf(name, len, "c%u", attrs->pci_sf.controller);
+ if (n >= len)
+ return -EINVAL;
+ len -= n;
+ name += n;
+ }
+ n = snprintf(name, len, "pf%usf%u", attrs->pci_sf.pf,
+ attrs->pci_sf.sf);
+ break;
+ case DEVLINK_PORT_FLAVOUR_VIRTUAL:
+ return -EOPNOTSUPP;
+ }
+
+ if (n >= len)
+ return -EINVAL;
+
+ return 0;
+}
+
+int devlink_compat_phys_port_name_get(struct net_device *dev,
+ char *name, size_t len)
+{
+ struct devlink_port *devlink_port;
+
+ /* RTNL mutex is held here which ensures that devlink_port
+ * instance cannot disappear in the middle. No need to take
+ * any devlink lock as only permanent values are accessed.
+ */
+ ASSERT_RTNL();
+
+ devlink_port = dev->devlink_port;
+ if (!devlink_port)
+ return -EOPNOTSUPP;
+
+ return __devlink_port_phys_port_name_get(devlink_port, name, len);
+}
+
+int devlink_compat_switch_id_get(struct net_device *dev,
+ struct netdev_phys_item_id *ppid)
+{
+ struct devlink_port *devlink_port;
+
+ /* Caller must hold RTNL mutex or reference to dev, which ensures that
+ * devlink_port instance cannot disappear in the middle. No need to take
+ * any devlink lock as only permanent values are accessed.
+ */
+ devlink_port = dev->devlink_port;
+ if (!devlink_port || !devlink_port->switch_port)
+ return -EOPNOTSUPP;
+
+ memcpy(ppid, &devlink_port->attrs.switch_id, sizeof(*ppid));
+
+ return 0;
+}
diff --git a/net/devlink/rate.c b/net/devlink/rate.c
new file mode 100644
index 000000000000..d157a8419bca
--- /dev/null
+++ b/net/devlink/rate.c
@@ -0,0 +1,850 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com>
+ */
+
+#include "devl_internal.h"
+
+static inline bool
+devlink_rate_is_leaf(struct devlink_rate *devlink_rate)
+{
+ return devlink_rate->type == DEVLINK_RATE_TYPE_LEAF;
+}
+
+static inline bool
+devlink_rate_is_node(struct devlink_rate *devlink_rate)
+{
+ return devlink_rate->type == DEVLINK_RATE_TYPE_NODE;
+}
+
+static struct devlink_rate *
+devlink_rate_leaf_get_from_info(struct devlink *devlink, struct genl_info *info)
+{
+ struct devlink_rate *devlink_rate;
+ struct devlink_port *devlink_port;
+
+ devlink_port = devlink_port_get_from_attrs(devlink, info->attrs);
+ if (IS_ERR(devlink_port))
+ return ERR_CAST(devlink_port);
+ devlink_rate = devlink_port->devlink_rate;
+ return devlink_rate ?: ERR_PTR(-ENODEV);
+}
+
+static struct devlink_rate *
+devlink_rate_node_get_by_name(struct devlink *devlink, const char *node_name)
+{
+ struct devlink_rate *devlink_rate;
+
+ list_for_each_entry(devlink_rate, &devlink->rate_list, list) {
+ if (devlink_rate_is_node(devlink_rate) &&
+ !strcmp(node_name, devlink_rate->name))
+ return devlink_rate;
+ }
+ return ERR_PTR(-ENODEV);
+}
+
+static struct devlink_rate *
+devlink_rate_node_get_from_attrs(struct devlink *devlink, struct nlattr **attrs)
+{
+ const char *rate_node_name;
+ size_t len;
+
+ if (!attrs[DEVLINK_ATTR_RATE_NODE_NAME])
+ return ERR_PTR(-EINVAL);
+ rate_node_name = nla_data(attrs[DEVLINK_ATTR_RATE_NODE_NAME]);
+ len = strlen(rate_node_name);
+ /* Name cannot be empty or decimal number */
+ if (!len || strspn(rate_node_name, "0123456789") == len)
+ return ERR_PTR(-EINVAL);
+
+ return devlink_rate_node_get_by_name(devlink, rate_node_name);
+}
+
+static struct devlink_rate *
+devlink_rate_node_get_from_info(struct devlink *devlink, struct genl_info *info)
+{
+ return devlink_rate_node_get_from_attrs(devlink, info->attrs);
+}
+
+static struct devlink_rate *
+devlink_rate_get_from_info(struct devlink *devlink, struct genl_info *info)
+{
+ struct nlattr **attrs = info->attrs;
+
+ if (attrs[DEVLINK_ATTR_PORT_INDEX])
+ return devlink_rate_leaf_get_from_info(devlink, info);
+ else if (attrs[DEVLINK_ATTR_RATE_NODE_NAME])
+ return devlink_rate_node_get_from_info(devlink, info);
+ else
+ return ERR_PTR(-EINVAL);
+}
+
+static int devlink_rate_put_tc_bws(struct sk_buff *msg, u32 *tc_bw)
+{
+ struct nlattr *nla_tc_bw;
+ int i;
+
+ for (i = 0; i < DEVLINK_RATE_TCS_MAX; i++) {
+ nla_tc_bw = nla_nest_start(msg, DEVLINK_ATTR_RATE_TC_BWS);
+ if (!nla_tc_bw)
+ return -EMSGSIZE;
+
+ if (nla_put_u8(msg, DEVLINK_RATE_TC_ATTR_INDEX, i) ||
+ nla_put_u32(msg, DEVLINK_RATE_TC_ATTR_BW, tc_bw[i]))
+ goto nla_put_failure;
+
+ nla_nest_end(msg, nla_tc_bw);
+ }
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(msg, nla_tc_bw);
+ return -EMSGSIZE;
+}
+
+static int devlink_nl_rate_fill(struct sk_buff *msg,
+ struct devlink_rate *devlink_rate,
+ enum devlink_command cmd, u32 portid, u32 seq,
+ int flags, struct netlink_ext_ack *extack)
+{
+ struct devlink *devlink = devlink_rate->devlink;
+ void *hdr;
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto nla_put_failure;
+
+ if (nla_put_u16(msg, DEVLINK_ATTR_RATE_TYPE, devlink_rate->type))
+ goto nla_put_failure;
+
+ if (devlink_rate_is_leaf(devlink_rate)) {
+ if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX,
+ devlink_rate->devlink_port->index))
+ goto nla_put_failure;
+ } else if (devlink_rate_is_node(devlink_rate)) {
+ if (nla_put_string(msg, DEVLINK_ATTR_RATE_NODE_NAME,
+ devlink_rate->name))
+ goto nla_put_failure;
+ }
+
+ if (devlink_nl_put_u64(msg, DEVLINK_ATTR_RATE_TX_SHARE,
+ devlink_rate->tx_share))
+ goto nla_put_failure;
+
+ if (devlink_nl_put_u64(msg, DEVLINK_ATTR_RATE_TX_MAX,
+ devlink_rate->tx_max))
+ goto nla_put_failure;
+
+ if (nla_put_u32(msg, DEVLINK_ATTR_RATE_TX_PRIORITY,
+ devlink_rate->tx_priority))
+ goto nla_put_failure;
+
+ if (nla_put_u32(msg, DEVLINK_ATTR_RATE_TX_WEIGHT,
+ devlink_rate->tx_weight))
+ goto nla_put_failure;
+
+ if (devlink_rate->parent)
+ if (nla_put_string(msg, DEVLINK_ATTR_RATE_PARENT_NODE_NAME,
+ devlink_rate->parent->name))
+ goto nla_put_failure;
+
+ if (devlink_rate_put_tc_bws(msg, devlink_rate->tc_bw))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static void devlink_rate_notify(struct devlink_rate *devlink_rate,
+ enum devlink_command cmd)
+{
+ struct devlink *devlink = devlink_rate->devlink;
+ struct sk_buff *msg;
+ int err;
+
+ WARN_ON(cmd != DEVLINK_CMD_RATE_NEW && cmd != DEVLINK_CMD_RATE_DEL);
+
+ if (!devl_is_registered(devlink) || !devlink_nl_notify_need(devlink))
+ return;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return;
+
+ err = devlink_nl_rate_fill(msg, devlink_rate, cmd, 0, 0, 0, NULL);
+ if (err) {
+ nlmsg_free(msg);
+ return;
+ }
+
+ devlink_nl_notify_send(devlink, msg);
+}
+
+void devlink_rates_notify_register(struct devlink *devlink)
+{
+ struct devlink_rate *rate_node;
+
+ list_for_each_entry(rate_node, &devlink->rate_list, list)
+ devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_NEW);
+}
+
+void devlink_rates_notify_unregister(struct devlink *devlink)
+{
+ struct devlink_rate *rate_node;
+
+ list_for_each_entry_reverse(rate_node, &devlink->rate_list, list)
+ devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_DEL);
+}
+
+static int
+devlink_nl_rate_get_dump_one(struct sk_buff *msg, struct devlink *devlink,
+ struct netlink_callback *cb, int flags)
+{
+ struct devlink_nl_dump_state *state = devlink_dump_state(cb);
+ struct devlink_rate *devlink_rate;
+ int idx = 0;
+ int err = 0;
+
+ list_for_each_entry(devlink_rate, &devlink->rate_list, list) {
+ enum devlink_command cmd = DEVLINK_CMD_RATE_NEW;
+ u32 id = NETLINK_CB(cb->skb).portid;
+
+ if (idx < state->idx) {
+ idx++;
+ continue;
+ }
+ err = devlink_nl_rate_fill(msg, devlink_rate, cmd, id,
+ cb->nlh->nlmsg_seq, flags, NULL);
+ if (err) {
+ state->idx = idx;
+ break;
+ }
+ idx++;
+ }
+
+ return err;
+}
+
+int devlink_nl_rate_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ return devlink_nl_dumpit(skb, cb, devlink_nl_rate_get_dump_one);
+}
+
+int devlink_nl_rate_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_rate *devlink_rate;
+ struct sk_buff *msg;
+ int err;
+
+ devlink_rate = devlink_rate_get_from_info(devlink, info);
+ if (IS_ERR(devlink_rate))
+ return PTR_ERR(devlink_rate);
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ err = devlink_nl_rate_fill(msg, devlink_rate, DEVLINK_CMD_RATE_NEW,
+ info->snd_portid, info->snd_seq, 0,
+ info->extack);
+ if (err) {
+ nlmsg_free(msg);
+ return err;
+ }
+
+ return genlmsg_reply(msg, info);
+}
+
+static bool
+devlink_rate_is_parent_node(struct devlink_rate *devlink_rate,
+ struct devlink_rate *parent)
+{
+ while (parent) {
+ if (parent == devlink_rate)
+ return true;
+ parent = parent->parent;
+ }
+ return false;
+}
+
+static int
+devlink_nl_rate_parent_node_set(struct devlink_rate *devlink_rate,
+ struct genl_info *info,
+ struct nlattr *nla_parent)
+{
+ struct devlink *devlink = devlink_rate->devlink;
+ const char *parent_name = nla_data(nla_parent);
+ const struct devlink_ops *ops = devlink->ops;
+ size_t len = strlen(parent_name);
+ struct devlink_rate *parent;
+ int err = -EOPNOTSUPP;
+
+ parent = devlink_rate->parent;
+
+ if (parent && !len) {
+ if (devlink_rate_is_leaf(devlink_rate))
+ err = ops->rate_leaf_parent_set(devlink_rate, NULL,
+ devlink_rate->priv, NULL,
+ info->extack);
+ else if (devlink_rate_is_node(devlink_rate))
+ err = ops->rate_node_parent_set(devlink_rate, NULL,
+ devlink_rate->priv, NULL,
+ info->extack);
+ if (err)
+ return err;
+
+ refcount_dec(&parent->refcnt);
+ devlink_rate->parent = NULL;
+ } else if (len) {
+ parent = devlink_rate_node_get_by_name(devlink, parent_name);
+ if (IS_ERR(parent))
+ return -ENODEV;
+
+ if (parent == devlink_rate) {
+ NL_SET_ERR_MSG(info->extack, "Parent to self is not allowed");
+ return -EINVAL;
+ }
+
+ if (devlink_rate_is_node(devlink_rate) &&
+ devlink_rate_is_parent_node(devlink_rate, parent->parent)) {
+ NL_SET_ERR_MSG(info->extack, "Node is already a parent of parent node.");
+ return -EEXIST;
+ }
+
+ if (devlink_rate_is_leaf(devlink_rate))
+ err = ops->rate_leaf_parent_set(devlink_rate, parent,
+ devlink_rate->priv, parent->priv,
+ info->extack);
+ else if (devlink_rate_is_node(devlink_rate))
+ err = ops->rate_node_parent_set(devlink_rate, parent,
+ devlink_rate->priv, parent->priv,
+ info->extack);
+ if (err)
+ return err;
+
+ if (devlink_rate->parent)
+ /* we're reassigning to other parent in this case */
+ refcount_dec(&devlink_rate->parent->refcnt);
+
+ refcount_inc(&parent->refcnt);
+ devlink_rate->parent = parent;
+ }
+
+ return 0;
+}
+
+static int devlink_nl_rate_tc_bw_parse(struct nlattr *parent_nest, u32 *tc_bw,
+ unsigned long *bitmap,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[DEVLINK_RATE_TC_ATTR_MAX + 1];
+ u8 tc_index;
+ int err;
+
+ err = nla_parse_nested(tb, DEVLINK_RATE_TC_ATTR_MAX, parent_nest,
+ devlink_dl_rate_tc_bws_nl_policy, extack);
+ if (err)
+ return err;
+
+ if (!tb[DEVLINK_RATE_TC_ATTR_INDEX]) {
+ NL_SET_ERR_ATTR_MISS(extack, parent_nest,
+ DEVLINK_RATE_TC_ATTR_INDEX);
+ return -EINVAL;
+ }
+
+ tc_index = nla_get_u8(tb[DEVLINK_RATE_TC_ATTR_INDEX]);
+
+ if (!tb[DEVLINK_RATE_TC_ATTR_BW]) {
+ NL_SET_ERR_ATTR_MISS(extack, parent_nest,
+ DEVLINK_RATE_TC_ATTR_BW);
+ return -EINVAL;
+ }
+
+ if (test_and_set_bit(tc_index, bitmap)) {
+ NL_SET_ERR_MSG_FMT(extack,
+ "Duplicate traffic class index specified (%u)",
+ tc_index);
+ return -EINVAL;
+ }
+
+ tc_bw[tc_index] = nla_get_u32(tb[DEVLINK_RATE_TC_ATTR_BW]);
+
+ return 0;
+}
+
+static int devlink_nl_rate_tc_bw_set(struct devlink_rate *devlink_rate,
+ struct genl_info *info)
+{
+ DECLARE_BITMAP(bitmap, DEVLINK_RATE_TCS_MAX) = {};
+ struct devlink *devlink = devlink_rate->devlink;
+ const struct devlink_ops *ops = devlink->ops;
+ u32 tc_bw[DEVLINK_RATE_TCS_MAX] = {};
+ int rem, err = -EOPNOTSUPP, i;
+ struct nlattr *attr;
+
+ nlmsg_for_each_attr_type(attr, DEVLINK_ATTR_RATE_TC_BWS, info->nlhdr,
+ GENL_HDRLEN, rem) {
+ err = devlink_nl_rate_tc_bw_parse(attr, tc_bw, bitmap,
+ info->extack);
+ if (err)
+ return err;
+ }
+
+ for (i = 0; i < DEVLINK_RATE_TCS_MAX; i++) {
+ if (!test_bit(i, bitmap)) {
+ NL_SET_ERR_MSG_FMT(info->extack,
+ "Bandwidth values must be specified for all %u traffic classes",
+ DEVLINK_RATE_TCS_MAX);
+ return -EINVAL;
+ }
+ }
+
+ if (devlink_rate_is_leaf(devlink_rate))
+ err = ops->rate_leaf_tc_bw_set(devlink_rate, devlink_rate->priv,
+ tc_bw, info->extack);
+ else if (devlink_rate_is_node(devlink_rate))
+ err = ops->rate_node_tc_bw_set(devlink_rate, devlink_rate->priv,
+ tc_bw, info->extack);
+
+ if (err)
+ return err;
+
+ memcpy(devlink_rate->tc_bw, tc_bw, sizeof(tc_bw));
+
+ return 0;
+}
+
+static int devlink_nl_rate_set(struct devlink_rate *devlink_rate,
+ const struct devlink_ops *ops,
+ struct genl_info *info)
+{
+ struct nlattr *nla_parent, **attrs = info->attrs;
+ int err = -EOPNOTSUPP;
+ u32 priority;
+ u32 weight;
+ u64 rate;
+
+ if (attrs[DEVLINK_ATTR_RATE_TX_SHARE]) {
+ rate = nla_get_u64(attrs[DEVLINK_ATTR_RATE_TX_SHARE]);
+ if (devlink_rate_is_leaf(devlink_rate))
+ err = ops->rate_leaf_tx_share_set(devlink_rate, devlink_rate->priv,
+ rate, info->extack);
+ else if (devlink_rate_is_node(devlink_rate))
+ err = ops->rate_node_tx_share_set(devlink_rate, devlink_rate->priv,
+ rate, info->extack);
+ if (err)
+ return err;
+ devlink_rate->tx_share = rate;
+ }
+
+ if (attrs[DEVLINK_ATTR_RATE_TX_MAX]) {
+ rate = nla_get_u64(attrs[DEVLINK_ATTR_RATE_TX_MAX]);
+ if (devlink_rate_is_leaf(devlink_rate))
+ err = ops->rate_leaf_tx_max_set(devlink_rate, devlink_rate->priv,
+ rate, info->extack);
+ else if (devlink_rate_is_node(devlink_rate))
+ err = ops->rate_node_tx_max_set(devlink_rate, devlink_rate->priv,
+ rate, info->extack);
+ if (err)
+ return err;
+ devlink_rate->tx_max = rate;
+ }
+
+ if (attrs[DEVLINK_ATTR_RATE_TX_PRIORITY]) {
+ priority = nla_get_u32(attrs[DEVLINK_ATTR_RATE_TX_PRIORITY]);
+ if (devlink_rate_is_leaf(devlink_rate))
+ err = ops->rate_leaf_tx_priority_set(devlink_rate, devlink_rate->priv,
+ priority, info->extack);
+ else if (devlink_rate_is_node(devlink_rate))
+ err = ops->rate_node_tx_priority_set(devlink_rate, devlink_rate->priv,
+ priority, info->extack);
+
+ if (err)
+ return err;
+ devlink_rate->tx_priority = priority;
+ }
+
+ if (attrs[DEVLINK_ATTR_RATE_TX_WEIGHT]) {
+ weight = nla_get_u32(attrs[DEVLINK_ATTR_RATE_TX_WEIGHT]);
+ if (devlink_rate_is_leaf(devlink_rate))
+ err = ops->rate_leaf_tx_weight_set(devlink_rate, devlink_rate->priv,
+ weight, info->extack);
+ else if (devlink_rate_is_node(devlink_rate))
+ err = ops->rate_node_tx_weight_set(devlink_rate, devlink_rate->priv,
+ weight, info->extack);
+
+ if (err)
+ return err;
+ devlink_rate->tx_weight = weight;
+ }
+
+ nla_parent = attrs[DEVLINK_ATTR_RATE_PARENT_NODE_NAME];
+ if (nla_parent) {
+ err = devlink_nl_rate_parent_node_set(devlink_rate, info,
+ nla_parent);
+ if (err)
+ return err;
+ }
+
+ if (attrs[DEVLINK_ATTR_RATE_TC_BWS]) {
+ err = devlink_nl_rate_tc_bw_set(devlink_rate, info);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+static bool devlink_rate_set_ops_supported(const struct devlink_ops *ops,
+ struct genl_info *info,
+ enum devlink_rate_type type)
+{
+ struct nlattr **attrs = info->attrs;
+
+ if (type == DEVLINK_RATE_TYPE_LEAF) {
+ if (attrs[DEVLINK_ATTR_RATE_TX_SHARE] && !ops->rate_leaf_tx_share_set) {
+ NL_SET_ERR_MSG(info->extack, "TX share set isn't supported for the leafs");
+ return false;
+ }
+ if (attrs[DEVLINK_ATTR_RATE_TX_MAX] && !ops->rate_leaf_tx_max_set) {
+ NL_SET_ERR_MSG(info->extack, "TX max set isn't supported for the leafs");
+ return false;
+ }
+ if (attrs[DEVLINK_ATTR_RATE_PARENT_NODE_NAME] &&
+ !ops->rate_leaf_parent_set) {
+ NL_SET_ERR_MSG(info->extack, "Parent set isn't supported for the leafs");
+ return false;
+ }
+ if (attrs[DEVLINK_ATTR_RATE_TX_PRIORITY] && !ops->rate_leaf_tx_priority_set) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ attrs[DEVLINK_ATTR_RATE_TX_PRIORITY],
+ "TX priority set isn't supported for the leafs");
+ return false;
+ }
+ if (attrs[DEVLINK_ATTR_RATE_TX_WEIGHT] && !ops->rate_leaf_tx_weight_set) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ attrs[DEVLINK_ATTR_RATE_TX_WEIGHT],
+ "TX weight set isn't supported for the leafs");
+ return false;
+ }
+ if (attrs[DEVLINK_ATTR_RATE_TC_BWS] &&
+ !ops->rate_leaf_tc_bw_set) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ attrs[DEVLINK_ATTR_RATE_TC_BWS],
+ "TC bandwidth set isn't supported for the leafs");
+ return false;
+ }
+ } else if (type == DEVLINK_RATE_TYPE_NODE) {
+ if (attrs[DEVLINK_ATTR_RATE_TX_SHARE] && !ops->rate_node_tx_share_set) {
+ NL_SET_ERR_MSG(info->extack, "TX share set isn't supported for the nodes");
+ return false;
+ }
+ if (attrs[DEVLINK_ATTR_RATE_TX_MAX] && !ops->rate_node_tx_max_set) {
+ NL_SET_ERR_MSG(info->extack, "TX max set isn't supported for the nodes");
+ return false;
+ }
+ if (attrs[DEVLINK_ATTR_RATE_PARENT_NODE_NAME] &&
+ !ops->rate_node_parent_set) {
+ NL_SET_ERR_MSG(info->extack, "Parent set isn't supported for the nodes");
+ return false;
+ }
+ if (attrs[DEVLINK_ATTR_RATE_TX_PRIORITY] && !ops->rate_node_tx_priority_set) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ attrs[DEVLINK_ATTR_RATE_TX_PRIORITY],
+ "TX priority set isn't supported for the nodes");
+ return false;
+ }
+ if (attrs[DEVLINK_ATTR_RATE_TX_WEIGHT] && !ops->rate_node_tx_weight_set) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ attrs[DEVLINK_ATTR_RATE_TX_WEIGHT],
+ "TX weight set isn't supported for the nodes");
+ return false;
+ }
+ if (attrs[DEVLINK_ATTR_RATE_TC_BWS] &&
+ !ops->rate_node_tc_bw_set) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ attrs[DEVLINK_ATTR_RATE_TC_BWS],
+ "TC bandwidth set isn't supported for the nodes");
+ return false;
+ }
+ } else {
+ WARN(1, "Unknown type of rate object");
+ return false;
+ }
+
+ return true;
+}
+
+int devlink_nl_rate_set_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_rate *devlink_rate;
+ const struct devlink_ops *ops;
+ int err;
+
+ devlink_rate = devlink_rate_get_from_info(devlink, info);
+ if (IS_ERR(devlink_rate))
+ return PTR_ERR(devlink_rate);
+
+ ops = devlink->ops;
+ if (!ops || !devlink_rate_set_ops_supported(ops, info, devlink_rate->type))
+ return -EOPNOTSUPP;
+
+ err = devlink_nl_rate_set(devlink_rate, ops, info);
+
+ if (!err)
+ devlink_rate_notify(devlink_rate, DEVLINK_CMD_RATE_NEW);
+ return err;
+}
+
+int devlink_nl_rate_new_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_rate *rate_node;
+ const struct devlink_ops *ops;
+ int err;
+
+ ops = devlink->ops;
+ if (!ops || !ops->rate_node_new || !ops->rate_node_del) {
+ NL_SET_ERR_MSG(info->extack, "Rate nodes aren't supported");
+ return -EOPNOTSUPP;
+ }
+
+ if (!devlink_rate_set_ops_supported(ops, info, DEVLINK_RATE_TYPE_NODE))
+ return -EOPNOTSUPP;
+
+ rate_node = devlink_rate_node_get_from_attrs(devlink, info->attrs);
+ if (!IS_ERR(rate_node))
+ return -EEXIST;
+ else if (rate_node == ERR_PTR(-EINVAL))
+ return -EINVAL;
+
+ rate_node = kzalloc(sizeof(*rate_node), GFP_KERNEL);
+ if (!rate_node)
+ return -ENOMEM;
+
+ rate_node->devlink = devlink;
+ rate_node->type = DEVLINK_RATE_TYPE_NODE;
+ rate_node->name = nla_strdup(info->attrs[DEVLINK_ATTR_RATE_NODE_NAME], GFP_KERNEL);
+ if (!rate_node->name) {
+ err = -ENOMEM;
+ goto err_strdup;
+ }
+
+ err = ops->rate_node_new(rate_node, &rate_node->priv, info->extack);
+ if (err)
+ goto err_node_new;
+
+ err = devlink_nl_rate_set(rate_node, ops, info);
+ if (err)
+ goto err_rate_set;
+
+ refcount_set(&rate_node->refcnt, 1);
+ list_add(&rate_node->list, &devlink->rate_list);
+ devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_NEW);
+ return 0;
+
+err_rate_set:
+ ops->rate_node_del(rate_node, rate_node->priv, info->extack);
+err_node_new:
+ kfree(rate_node->name);
+err_strdup:
+ kfree(rate_node);
+ return err;
+}
+
+int devlink_nl_rate_del_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_rate *rate_node;
+ int err;
+
+ rate_node = devlink_rate_node_get_from_info(devlink, info);
+ if (IS_ERR(rate_node))
+ return PTR_ERR(rate_node);
+
+ if (refcount_read(&rate_node->refcnt) > 1) {
+ NL_SET_ERR_MSG(info->extack, "Node has children. Cannot delete node.");
+ return -EBUSY;
+ }
+
+ devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_DEL);
+ err = devlink->ops->rate_node_del(rate_node, rate_node->priv,
+ info->extack);
+ if (rate_node->parent)
+ refcount_dec(&rate_node->parent->refcnt);
+ list_del(&rate_node->list);
+ kfree(rate_node->name);
+ kfree(rate_node);
+ return err;
+}
+
+int devlink_rate_nodes_check(struct devlink *devlink, u16 mode,
+ struct netlink_ext_ack *extack)
+{
+ struct devlink_rate *devlink_rate;
+
+ list_for_each_entry(devlink_rate, &devlink->rate_list, list)
+ if (devlink_rate_is_node(devlink_rate)) {
+ NL_SET_ERR_MSG(extack, "Rate node(s) exists.");
+ return -EBUSY;
+ }
+ return 0;
+}
+
+/**
+ * devl_rate_node_create - create devlink rate node
+ * @devlink: devlink instance
+ * @priv: driver private data
+ * @node_name: name of the resulting node
+ * @parent: parent devlink_rate struct
+ *
+ * Create devlink rate object of type node
+ */
+struct devlink_rate *
+devl_rate_node_create(struct devlink *devlink, void *priv, char *node_name,
+ struct devlink_rate *parent)
+{
+ struct devlink_rate *rate_node;
+
+ rate_node = devlink_rate_node_get_by_name(devlink, node_name);
+ if (!IS_ERR(rate_node))
+ return ERR_PTR(-EEXIST);
+
+ rate_node = kzalloc(sizeof(*rate_node), GFP_KERNEL);
+ if (!rate_node)
+ return ERR_PTR(-ENOMEM);
+
+ if (parent) {
+ rate_node->parent = parent;
+ refcount_inc(&rate_node->parent->refcnt);
+ }
+
+ rate_node->type = DEVLINK_RATE_TYPE_NODE;
+ rate_node->devlink = devlink;
+ rate_node->priv = priv;
+
+ rate_node->name = kstrdup(node_name, GFP_KERNEL);
+ if (!rate_node->name) {
+ kfree(rate_node);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ refcount_set(&rate_node->refcnt, 1);
+ list_add(&rate_node->list, &devlink->rate_list);
+ devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_NEW);
+ return rate_node;
+}
+EXPORT_SYMBOL_GPL(devl_rate_node_create);
+
+/**
+ * devl_rate_leaf_create - create devlink rate leaf
+ * @devlink_port: devlink port object to create rate object on
+ * @priv: driver private data
+ * @parent: parent devlink_rate struct
+ *
+ * Create devlink rate object of type leaf on provided @devlink_port.
+ */
+int devl_rate_leaf_create(struct devlink_port *devlink_port, void *priv,
+ struct devlink_rate *parent)
+{
+ struct devlink *devlink = devlink_port->devlink;
+ struct devlink_rate *devlink_rate;
+
+ devl_assert_locked(devlink_port->devlink);
+
+ if (WARN_ON(devlink_port->devlink_rate))
+ return -EBUSY;
+
+ devlink_rate = kzalloc(sizeof(*devlink_rate), GFP_KERNEL);
+ if (!devlink_rate)
+ return -ENOMEM;
+
+ if (parent) {
+ devlink_rate->parent = parent;
+ refcount_inc(&devlink_rate->parent->refcnt);
+ }
+
+ devlink_rate->type = DEVLINK_RATE_TYPE_LEAF;
+ devlink_rate->devlink = devlink;
+ devlink_rate->devlink_port = devlink_port;
+ devlink_rate->priv = priv;
+ list_add_tail(&devlink_rate->list, &devlink->rate_list);
+ devlink_port->devlink_rate = devlink_rate;
+ devlink_rate_notify(devlink_rate, DEVLINK_CMD_RATE_NEW);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(devl_rate_leaf_create);
+
+/**
+ * devl_rate_leaf_destroy - destroy devlink rate leaf
+ *
+ * @devlink_port: devlink port linked to the rate object
+ *
+ * Destroy the devlink rate object of type leaf on provided @devlink_port.
+ */
+void devl_rate_leaf_destroy(struct devlink_port *devlink_port)
+{
+ struct devlink_rate *devlink_rate = devlink_port->devlink_rate;
+
+ devl_assert_locked(devlink_port->devlink);
+ if (!devlink_rate)
+ return;
+
+ devlink_rate_notify(devlink_rate, DEVLINK_CMD_RATE_DEL);
+ if (devlink_rate->parent)
+ refcount_dec(&devlink_rate->parent->refcnt);
+ list_del(&devlink_rate->list);
+ devlink_port->devlink_rate = NULL;
+ kfree(devlink_rate);
+}
+EXPORT_SYMBOL_GPL(devl_rate_leaf_destroy);
+
+/**
+ * devl_rate_nodes_destroy - destroy all devlink rate nodes on device
+ * @devlink: devlink instance
+ *
+ * Unset parent for all rate objects and destroy all rate nodes
+ * on specified device.
+ */
+void devl_rate_nodes_destroy(struct devlink *devlink)
+{
+ const struct devlink_ops *ops = devlink->ops;
+ struct devlink_rate *devlink_rate, *tmp;
+
+ devl_assert_locked(devlink);
+
+ list_for_each_entry(devlink_rate, &devlink->rate_list, list) {
+ if (!devlink_rate->parent)
+ continue;
+
+ if (devlink_rate_is_leaf(devlink_rate))
+ ops->rate_leaf_parent_set(devlink_rate, NULL, devlink_rate->priv,
+ NULL, NULL);
+ else if (devlink_rate_is_node(devlink_rate))
+ ops->rate_node_parent_set(devlink_rate, NULL, devlink_rate->priv,
+ NULL, NULL);
+
+ refcount_dec(&devlink_rate->parent->refcnt);
+ devlink_rate->parent = NULL;
+ }
+ list_for_each_entry_safe(devlink_rate, tmp, &devlink->rate_list, list) {
+ if (devlink_rate_is_node(devlink_rate)) {
+ ops->rate_node_del(devlink_rate, devlink_rate->priv, NULL);
+ list_del(&devlink_rate->list);
+ kfree(devlink_rate->name);
+ kfree(devlink_rate);
+ }
+ }
+}
+EXPORT_SYMBOL_GPL(devl_rate_nodes_destroy);
diff --git a/net/devlink/region.c b/net/devlink/region.c
new file mode 100644
index 000000000000..d6e5805cf3a0
--- /dev/null
+++ b/net/devlink/region.c
@@ -0,0 +1,1258 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com>
+ */
+
+#include "devl_internal.h"
+
+struct devlink_region {
+ struct devlink *devlink;
+ struct devlink_port *port;
+ struct list_head list;
+ union {
+ const struct devlink_region_ops *ops;
+ const struct devlink_port_region_ops *port_ops;
+ };
+ struct mutex snapshot_lock; /* protects snapshot_list,
+ * max_snapshots and cur_snapshots
+ * consistency.
+ */
+ struct list_head snapshot_list;
+ u32 max_snapshots;
+ u32 cur_snapshots;
+ u64 size;
+};
+
+struct devlink_snapshot {
+ struct list_head list;
+ struct devlink_region *region;
+ u8 *data;
+ u32 id;
+};
+
+static struct devlink_region *
+devlink_region_get_by_name(struct devlink *devlink, const char *region_name)
+{
+ struct devlink_region *region;
+
+ list_for_each_entry(region, &devlink->region_list, list)
+ if (!strcmp(region->ops->name, region_name))
+ return region;
+
+ return NULL;
+}
+
+static struct devlink_region *
+devlink_port_region_get_by_name(struct devlink_port *port,
+ const char *region_name)
+{
+ struct devlink_region *region;
+
+ list_for_each_entry(region, &port->region_list, list)
+ if (!strcmp(region->port_ops->name, region_name))
+ return region;
+
+ return NULL;
+}
+
+static struct devlink_snapshot *
+devlink_region_snapshot_get_by_id(struct devlink_region *region, u32 id)
+{
+ struct devlink_snapshot *snapshot;
+
+ list_for_each_entry(snapshot, &region->snapshot_list, list)
+ if (snapshot->id == id)
+ return snapshot;
+
+ return NULL;
+}
+
+static int devlink_nl_region_snapshot_id_put(struct sk_buff *msg,
+ struct devlink *devlink,
+ struct devlink_snapshot *snapshot)
+{
+ struct nlattr *snap_attr;
+ int err;
+
+ snap_attr = nla_nest_start_noflag(msg, DEVLINK_ATTR_REGION_SNAPSHOT);
+ if (!snap_attr)
+ return -EMSGSIZE;
+
+ err = nla_put_u32(msg, DEVLINK_ATTR_REGION_SNAPSHOT_ID, snapshot->id);
+ if (err)
+ goto nla_put_failure;
+
+ nla_nest_end(msg, snap_attr);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(msg, snap_attr);
+ return err;
+}
+
+static int devlink_nl_region_snapshots_id_put(struct sk_buff *msg,
+ struct devlink *devlink,
+ struct devlink_region *region)
+{
+ struct devlink_snapshot *snapshot;
+ struct nlattr *snapshots_attr;
+ int err;
+
+ snapshots_attr = nla_nest_start_noflag(msg,
+ DEVLINK_ATTR_REGION_SNAPSHOTS);
+ if (!snapshots_attr)
+ return -EMSGSIZE;
+
+ list_for_each_entry(snapshot, &region->snapshot_list, list) {
+ err = devlink_nl_region_snapshot_id_put(msg, devlink, snapshot);
+ if (err)
+ goto nla_put_failure;
+ }
+
+ nla_nest_end(msg, snapshots_attr);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(msg, snapshots_attr);
+ return err;
+}
+
+static int devlink_nl_region_fill(struct sk_buff *msg, struct devlink *devlink,
+ enum devlink_command cmd, u32 portid,
+ u32 seq, int flags,
+ struct devlink_region *region)
+{
+ void *hdr;
+ int err;
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ err = devlink_nl_put_handle(msg, devlink);
+ if (err)
+ goto nla_put_failure;
+
+ if (region->port) {
+ err = nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX,
+ region->port->index);
+ if (err)
+ goto nla_put_failure;
+ }
+
+ err = nla_put_string(msg, DEVLINK_ATTR_REGION_NAME, region->ops->name);
+ if (err)
+ goto nla_put_failure;
+
+ err = devlink_nl_put_u64(msg, DEVLINK_ATTR_REGION_SIZE, region->size);
+ if (err)
+ goto nla_put_failure;
+
+ err = nla_put_u32(msg, DEVLINK_ATTR_REGION_MAX_SNAPSHOTS,
+ region->max_snapshots);
+ if (err)
+ goto nla_put_failure;
+
+ err = devlink_nl_region_snapshots_id_put(msg, devlink, region);
+ if (err)
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return err;
+}
+
+static struct sk_buff *
+devlink_nl_region_notify_build(struct devlink_region *region,
+ struct devlink_snapshot *snapshot,
+ enum devlink_command cmd, u32 portid, u32 seq)
+{
+ struct devlink *devlink = region->devlink;
+ struct sk_buff *msg;
+ void *hdr;
+ int err;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return ERR_PTR(-ENOMEM);
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, 0, cmd);
+ if (!hdr) {
+ err = -EMSGSIZE;
+ goto out_free_msg;
+ }
+
+ err = devlink_nl_put_handle(msg, devlink);
+ if (err)
+ goto out_cancel_msg;
+
+ if (region->port) {
+ err = nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX,
+ region->port->index);
+ if (err)
+ goto out_cancel_msg;
+ }
+
+ err = nla_put_string(msg, DEVLINK_ATTR_REGION_NAME,
+ region->ops->name);
+ if (err)
+ goto out_cancel_msg;
+
+ if (snapshot) {
+ err = nla_put_u32(msg, DEVLINK_ATTR_REGION_SNAPSHOT_ID,
+ snapshot->id);
+ if (err)
+ goto out_cancel_msg;
+ } else {
+ err = devlink_nl_put_u64(msg, DEVLINK_ATTR_REGION_SIZE,
+ region->size);
+ if (err)
+ goto out_cancel_msg;
+ }
+ genlmsg_end(msg, hdr);
+
+ return msg;
+
+out_cancel_msg:
+ genlmsg_cancel(msg, hdr);
+out_free_msg:
+ nlmsg_free(msg);
+ return ERR_PTR(err);
+}
+
+static void devlink_nl_region_notify(struct devlink_region *region,
+ struct devlink_snapshot *snapshot,
+ enum devlink_command cmd)
+{
+ struct devlink *devlink = region->devlink;
+ struct sk_buff *msg;
+
+ WARN_ON(cmd != DEVLINK_CMD_REGION_NEW && cmd != DEVLINK_CMD_REGION_DEL);
+
+ if (!__devl_is_registered(devlink) || !devlink_nl_notify_need(devlink))
+ return;
+
+ msg = devlink_nl_region_notify_build(region, snapshot, cmd, 0, 0);
+ if (IS_ERR(msg))
+ return;
+
+ devlink_nl_notify_send(devlink, msg);
+}
+
+void devlink_regions_notify_register(struct devlink *devlink)
+{
+ struct devlink_region *region;
+
+ list_for_each_entry(region, &devlink->region_list, list)
+ devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_NEW);
+}
+
+void devlink_regions_notify_unregister(struct devlink *devlink)
+{
+ struct devlink_region *region;
+
+ list_for_each_entry_reverse(region, &devlink->region_list, list)
+ devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_DEL);
+}
+
+/**
+ * __devlink_snapshot_id_increment - Increment number of snapshots using an id
+ * @devlink: devlink instance
+ * @id: the snapshot id
+ *
+ * Track when a new snapshot begins using an id. Load the count for the
+ * given id from the snapshot xarray, increment it, and store it back.
+ *
+ * Called when a new snapshot is created with the given id.
+ *
+ * The id *must* have been previously allocated by
+ * devlink_region_snapshot_id_get().
+ *
+ * Returns 0 on success, or an error on failure.
+ */
+static int __devlink_snapshot_id_increment(struct devlink *devlink, u32 id)
+{
+ unsigned long count;
+ void *p;
+ int err;
+
+ xa_lock(&devlink->snapshot_ids);
+ p = xa_load(&devlink->snapshot_ids, id);
+ if (WARN_ON(!p)) {
+ err = -EINVAL;
+ goto unlock;
+ }
+
+ if (WARN_ON(!xa_is_value(p))) {
+ err = -EINVAL;
+ goto unlock;
+ }
+
+ count = xa_to_value(p);
+ count++;
+
+ err = xa_err(__xa_store(&devlink->snapshot_ids, id, xa_mk_value(count),
+ GFP_ATOMIC));
+unlock:
+ xa_unlock(&devlink->snapshot_ids);
+ return err;
+}
+
+/**
+ * __devlink_snapshot_id_decrement - Decrease number of snapshots using an id
+ * @devlink: devlink instance
+ * @id: the snapshot id
+ *
+ * Track when a snapshot is deleted and stops using an id. Load the count
+ * for the given id from the snapshot xarray, decrement it, and store it
+ * back.
+ *
+ * If the count reaches zero, erase this id from the xarray, freeing it
+ * up for future re-use by devlink_region_snapshot_id_get().
+ *
+ * Called when a snapshot using the given id is deleted, and when the
+ * initial allocator of the id is finished using it.
+ */
+static void __devlink_snapshot_id_decrement(struct devlink *devlink, u32 id)
+{
+ unsigned long count;
+ void *p;
+
+ xa_lock(&devlink->snapshot_ids);
+ p = xa_load(&devlink->snapshot_ids, id);
+ if (WARN_ON(!p))
+ goto unlock;
+
+ if (WARN_ON(!xa_is_value(p)))
+ goto unlock;
+
+ count = xa_to_value(p);
+
+ if (count > 1) {
+ count--;
+ __xa_store(&devlink->snapshot_ids, id, xa_mk_value(count),
+ GFP_ATOMIC);
+ } else {
+ /* If this was the last user, we can erase this id */
+ __xa_erase(&devlink->snapshot_ids, id);
+ }
+unlock:
+ xa_unlock(&devlink->snapshot_ids);
+}
+
+/**
+ * __devlink_snapshot_id_insert - Insert a specific snapshot ID
+ * @devlink: devlink instance
+ * @id: the snapshot id
+ *
+ * Mark the given snapshot id as used by inserting a zero value into the
+ * snapshot xarray.
+ *
+ * This must be called while holding the devlink instance lock. Unlike
+ * devlink_snapshot_id_get, the initial reference count is zero, not one.
+ * It is expected that the id will immediately be used before
+ * releasing the devlink instance lock.
+ *
+ * Returns zero on success, or an error code if the snapshot id could not
+ * be inserted.
+ */
+static int __devlink_snapshot_id_insert(struct devlink *devlink, u32 id)
+{
+ int err;
+
+ xa_lock(&devlink->snapshot_ids);
+ if (xa_load(&devlink->snapshot_ids, id)) {
+ xa_unlock(&devlink->snapshot_ids);
+ return -EEXIST;
+ }
+ err = xa_err(__xa_store(&devlink->snapshot_ids, id, xa_mk_value(0),
+ GFP_ATOMIC));
+ xa_unlock(&devlink->snapshot_ids);
+ return err;
+}
+
+/**
+ * __devlink_region_snapshot_id_get - get snapshot ID
+ * @devlink: devlink instance
+ * @id: storage to return snapshot id
+ *
+ * Allocates a new snapshot id. Returns zero on success, or a negative
+ * error on failure. Must be called while holding the devlink instance
+ * lock.
+ *
+ * Snapshot IDs are tracked using an xarray which stores the number of
+ * users of the snapshot id.
+ *
+ * Note that the caller of this function counts as a 'user', in order to
+ * avoid race conditions. The caller must release its hold on the
+ * snapshot by using devlink_region_snapshot_id_put.
+ */
+static int __devlink_region_snapshot_id_get(struct devlink *devlink, u32 *id)
+{
+ return xa_alloc(&devlink->snapshot_ids, id, xa_mk_value(1),
+ xa_limit_32b, GFP_KERNEL);
+}
+
+/**
+ * __devlink_region_snapshot_create - create a new snapshot
+ * This will add a new snapshot of a region. The snapshot
+ * will be stored on the region struct and can be accessed
+ * from devlink. This is useful for future analyses of snapshots.
+ * Multiple snapshots can be created on a region.
+ * The @snapshot_id should be obtained using the getter function.
+ *
+ * Must be called only while holding the region snapshot lock.
+ *
+ * @region: devlink region of the snapshot
+ * @data: snapshot data
+ * @snapshot_id: snapshot id to be created
+ */
+static int
+__devlink_region_snapshot_create(struct devlink_region *region,
+ u8 *data, u32 snapshot_id)
+{
+ struct devlink *devlink = region->devlink;
+ struct devlink_snapshot *snapshot;
+ int err;
+
+ lockdep_assert_held(&region->snapshot_lock);
+
+ /* check if region can hold one more snapshot */
+ if (region->cur_snapshots == region->max_snapshots)
+ return -ENOSPC;
+
+ if (devlink_region_snapshot_get_by_id(region, snapshot_id))
+ return -EEXIST;
+
+ snapshot = kzalloc(sizeof(*snapshot), GFP_KERNEL);
+ if (!snapshot)
+ return -ENOMEM;
+
+ err = __devlink_snapshot_id_increment(devlink, snapshot_id);
+ if (err)
+ goto err_snapshot_id_increment;
+
+ snapshot->id = snapshot_id;
+ snapshot->region = region;
+ snapshot->data = data;
+
+ list_add_tail(&snapshot->list, &region->snapshot_list);
+
+ region->cur_snapshots++;
+
+ devlink_nl_region_notify(region, snapshot, DEVLINK_CMD_REGION_NEW);
+ return 0;
+
+err_snapshot_id_increment:
+ kfree(snapshot);
+ return err;
+}
+
+static void devlink_region_snapshot_del(struct devlink_region *region,
+ struct devlink_snapshot *snapshot)
+{
+ struct devlink *devlink = region->devlink;
+
+ lockdep_assert_held(&region->snapshot_lock);
+
+ devlink_nl_region_notify(region, snapshot, DEVLINK_CMD_REGION_DEL);
+ region->cur_snapshots--;
+ list_del(&snapshot->list);
+ region->ops->destructor(snapshot->data);
+ __devlink_snapshot_id_decrement(devlink, snapshot->id);
+ kfree(snapshot);
+}
+
+int devlink_nl_region_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_port *port = NULL;
+ struct devlink_region *region;
+ const char *region_name;
+ struct sk_buff *msg;
+ unsigned int index;
+ int err;
+
+ if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_REGION_NAME))
+ return -EINVAL;
+
+ if (info->attrs[DEVLINK_ATTR_PORT_INDEX]) {
+ index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]);
+
+ port = devlink_port_get_by_index(devlink, index);
+ if (!port)
+ return -ENODEV;
+ }
+
+ region_name = nla_data(info->attrs[DEVLINK_ATTR_REGION_NAME]);
+ if (port)
+ region = devlink_port_region_get_by_name(port, region_name);
+ else
+ region = devlink_region_get_by_name(devlink, region_name);
+
+ if (!region)
+ return -EINVAL;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ err = devlink_nl_region_fill(msg, devlink, DEVLINK_CMD_REGION_GET,
+ info->snd_portid, info->snd_seq, 0,
+ region);
+ if (err) {
+ nlmsg_free(msg);
+ return err;
+ }
+
+ return genlmsg_reply(msg, info);
+}
+
+static int devlink_nl_cmd_region_get_port_dumpit(struct sk_buff *msg,
+ struct netlink_callback *cb,
+ struct devlink_port *port,
+ int *idx, int start, int flags)
+{
+ struct devlink_region *region;
+ int err = 0;
+
+ list_for_each_entry(region, &port->region_list, list) {
+ if (*idx < start) {
+ (*idx)++;
+ continue;
+ }
+ err = devlink_nl_region_fill(msg, port->devlink,
+ DEVLINK_CMD_REGION_GET,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ flags, region);
+ if (err)
+ goto out;
+ (*idx)++;
+ }
+
+out:
+ return err;
+}
+
+static int devlink_nl_region_get_dump_one(struct sk_buff *msg,
+ struct devlink *devlink,
+ struct netlink_callback *cb,
+ int flags)
+{
+ struct devlink_nl_dump_state *state = devlink_dump_state(cb);
+ struct devlink_region *region;
+ struct devlink_port *port;
+ unsigned long port_index;
+ int idx = 0;
+ int err;
+
+ list_for_each_entry(region, &devlink->region_list, list) {
+ if (idx < state->idx) {
+ idx++;
+ continue;
+ }
+ err = devlink_nl_region_fill(msg, devlink,
+ DEVLINK_CMD_REGION_GET,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, flags,
+ region);
+ if (err) {
+ state->idx = idx;
+ return err;
+ }
+ idx++;
+ }
+
+ xa_for_each(&devlink->ports, port_index, port) {
+ err = devlink_nl_cmd_region_get_port_dumpit(msg, cb, port, &idx,
+ state->idx, flags);
+ if (err) {
+ state->idx = idx;
+ return err;
+ }
+ }
+
+ return 0;
+}
+
+int devlink_nl_region_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ return devlink_nl_dumpit(skb, cb, devlink_nl_region_get_dump_one);
+}
+
+int devlink_nl_region_del_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_snapshot *snapshot;
+ struct devlink_port *port = NULL;
+ struct devlink_region *region;
+ const char *region_name;
+ unsigned int index;
+ u32 snapshot_id;
+
+ if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_REGION_NAME) ||
+ GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_REGION_SNAPSHOT_ID))
+ return -EINVAL;
+
+ region_name = nla_data(info->attrs[DEVLINK_ATTR_REGION_NAME]);
+ snapshot_id = nla_get_u32(info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]);
+
+ if (info->attrs[DEVLINK_ATTR_PORT_INDEX]) {
+ index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]);
+
+ port = devlink_port_get_by_index(devlink, index);
+ if (!port)
+ return -ENODEV;
+ }
+
+ if (port)
+ region = devlink_port_region_get_by_name(port, region_name);
+ else
+ region = devlink_region_get_by_name(devlink, region_name);
+
+ if (!region)
+ return -EINVAL;
+
+ mutex_lock(&region->snapshot_lock);
+ snapshot = devlink_region_snapshot_get_by_id(region, snapshot_id);
+ if (!snapshot) {
+ mutex_unlock(&region->snapshot_lock);
+ return -EINVAL;
+ }
+
+ devlink_region_snapshot_del(region, snapshot);
+ mutex_unlock(&region->snapshot_lock);
+ return 0;
+}
+
+int devlink_nl_region_new_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_snapshot *snapshot;
+ struct devlink_port *port = NULL;
+ struct nlattr *snapshot_id_attr;
+ struct devlink_region *region;
+ const char *region_name;
+ unsigned int index;
+ u32 snapshot_id;
+ u8 *data;
+ int err;
+
+ if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_REGION_NAME)) {
+ NL_SET_ERR_MSG(info->extack, "No region name provided");
+ return -EINVAL;
+ }
+
+ region_name = nla_data(info->attrs[DEVLINK_ATTR_REGION_NAME]);
+
+ if (info->attrs[DEVLINK_ATTR_PORT_INDEX]) {
+ index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]);
+
+ port = devlink_port_get_by_index(devlink, index);
+ if (!port)
+ return -ENODEV;
+ }
+
+ if (port)
+ region = devlink_port_region_get_by_name(port, region_name);
+ else
+ region = devlink_region_get_by_name(devlink, region_name);
+
+ if (!region) {
+ NL_SET_ERR_MSG(info->extack, "The requested region does not exist");
+ return -EINVAL;
+ }
+
+ if (!region->ops->snapshot) {
+ NL_SET_ERR_MSG(info->extack, "The requested region does not support taking an immediate snapshot");
+ return -EOPNOTSUPP;
+ }
+
+ mutex_lock(&region->snapshot_lock);
+
+ if (region->cur_snapshots == region->max_snapshots) {
+ NL_SET_ERR_MSG(info->extack, "The region has reached the maximum number of stored snapshots");
+ err = -ENOSPC;
+ goto unlock;
+ }
+
+ snapshot_id_attr = info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID];
+ if (snapshot_id_attr) {
+ snapshot_id = nla_get_u32(snapshot_id_attr);
+
+ if (devlink_region_snapshot_get_by_id(region, snapshot_id)) {
+ NL_SET_ERR_MSG(info->extack, "The requested snapshot id is already in use");
+ err = -EEXIST;
+ goto unlock;
+ }
+
+ err = __devlink_snapshot_id_insert(devlink, snapshot_id);
+ if (err)
+ goto unlock;
+ } else {
+ err = __devlink_region_snapshot_id_get(devlink, &snapshot_id);
+ if (err) {
+ NL_SET_ERR_MSG(info->extack, "Failed to allocate a new snapshot id");
+ goto unlock;
+ }
+ }
+
+ if (port)
+ err = region->port_ops->snapshot(port, region->port_ops,
+ info->extack, &data);
+ else
+ err = region->ops->snapshot(devlink, region->ops,
+ info->extack, &data);
+ if (err)
+ goto err_snapshot_capture;
+
+ err = __devlink_region_snapshot_create(region, data, snapshot_id);
+ if (err)
+ goto err_snapshot_create;
+
+ if (!snapshot_id_attr) {
+ struct sk_buff *msg;
+
+ snapshot = devlink_region_snapshot_get_by_id(region,
+ snapshot_id);
+ if (WARN_ON(!snapshot)) {
+ err = -EINVAL;
+ goto unlock;
+ }
+
+ msg = devlink_nl_region_notify_build(region, snapshot,
+ DEVLINK_CMD_REGION_NEW,
+ info->snd_portid,
+ info->snd_seq);
+ err = PTR_ERR_OR_ZERO(msg);
+ if (err)
+ goto err_notify;
+
+ err = genlmsg_reply(msg, info);
+ if (err)
+ goto err_notify;
+ }
+
+ mutex_unlock(&region->snapshot_lock);
+ return 0;
+
+err_snapshot_create:
+ region->ops->destructor(data);
+err_snapshot_capture:
+ __devlink_snapshot_id_decrement(devlink, snapshot_id);
+ mutex_unlock(&region->snapshot_lock);
+ return err;
+
+err_notify:
+ devlink_region_snapshot_del(region, snapshot);
+unlock:
+ mutex_unlock(&region->snapshot_lock);
+ return err;
+}
+
+static int devlink_nl_cmd_region_read_chunk_fill(struct sk_buff *msg,
+ u8 *chunk, u32 chunk_size,
+ u64 addr)
+{
+ struct nlattr *chunk_attr;
+ int err;
+
+ chunk_attr = nla_nest_start_noflag(msg, DEVLINK_ATTR_REGION_CHUNK);
+ if (!chunk_attr)
+ return -EINVAL;
+
+ err = nla_put(msg, DEVLINK_ATTR_REGION_CHUNK_DATA, chunk_size, chunk);
+ if (err)
+ goto nla_put_failure;
+
+ err = devlink_nl_put_u64(msg, DEVLINK_ATTR_REGION_CHUNK_ADDR, addr);
+ if (err)
+ goto nla_put_failure;
+
+ nla_nest_end(msg, chunk_attr);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(msg, chunk_attr);
+ return err;
+}
+
+#define DEVLINK_REGION_READ_CHUNK_SIZE 256
+
+typedef int devlink_chunk_fill_t(void *cb_priv, u8 *chunk, u32 chunk_size,
+ u64 curr_offset,
+ struct netlink_ext_ack *extack);
+
+static int
+devlink_nl_region_read_fill(struct sk_buff *skb, devlink_chunk_fill_t *cb,
+ void *cb_priv, u64 start_offset, u64 end_offset,
+ u64 *new_offset, struct netlink_ext_ack *extack)
+{
+ u64 curr_offset = start_offset;
+ int err = 0;
+ u8 *data;
+
+ /* Allocate and re-use a single buffer */
+ data = kmalloc(DEVLINK_REGION_READ_CHUNK_SIZE, GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+
+ *new_offset = start_offset;
+
+ while (curr_offset < end_offset) {
+ u32 data_size;
+
+ data_size = min_t(u32, end_offset - curr_offset,
+ DEVLINK_REGION_READ_CHUNK_SIZE);
+
+ err = cb(cb_priv, data, data_size, curr_offset, extack);
+ if (err)
+ break;
+
+ err = devlink_nl_cmd_region_read_chunk_fill(skb, data, data_size, curr_offset);
+ if (err)
+ break;
+
+ curr_offset += data_size;
+ }
+ *new_offset = curr_offset;
+
+ kfree(data);
+
+ return err;
+}
+
+static int
+devlink_region_snapshot_fill(void *cb_priv, u8 *chunk, u32 chunk_size,
+ u64 curr_offset,
+ struct netlink_ext_ack __always_unused *extack)
+{
+ struct devlink_snapshot *snapshot = cb_priv;
+
+ memcpy(chunk, &snapshot->data[curr_offset], chunk_size);
+
+ return 0;
+}
+
+static int
+devlink_region_port_direct_fill(void *cb_priv, u8 *chunk, u32 chunk_size,
+ u64 curr_offset, struct netlink_ext_ack *extack)
+{
+ struct devlink_region *region = cb_priv;
+
+ return region->port_ops->read(region->port, region->port_ops, extack,
+ curr_offset, chunk_size, chunk);
+}
+
+static int
+devlink_region_direct_fill(void *cb_priv, u8 *chunk, u32 chunk_size,
+ u64 curr_offset, struct netlink_ext_ack *extack)
+{
+ struct devlink_region *region = cb_priv;
+
+ return region->ops->read(region->devlink, region->ops, extack,
+ curr_offset, chunk_size, chunk);
+}
+
+int devlink_nl_region_read_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ const struct genl_dumpit_info *info = genl_dumpit_info(cb);
+ struct devlink_nl_dump_state *state = devlink_dump_state(cb);
+ struct nlattr *chunks_attr, *region_attr, *snapshot_attr;
+ u64 ret_offset, start_offset, end_offset = U64_MAX;
+ struct nlattr **attrs = info->info.attrs;
+ struct devlink_port *port = NULL;
+ devlink_chunk_fill_t *region_cb;
+ struct devlink_region *region;
+ const char *region_name;
+ struct devlink *devlink;
+ unsigned int index;
+ void *region_cb_priv;
+ void *hdr;
+ int err;
+
+ start_offset = state->start_offset;
+
+ devlink = devlink_get_from_attrs_lock(sock_net(cb->skb->sk), attrs,
+ false);
+ if (IS_ERR(devlink))
+ return PTR_ERR(devlink);
+
+ if (!attrs[DEVLINK_ATTR_REGION_NAME]) {
+ NL_SET_ERR_MSG(cb->extack, "No region name provided");
+ err = -EINVAL;
+ goto out_unlock;
+ }
+
+ if (attrs[DEVLINK_ATTR_PORT_INDEX]) {
+ index = nla_get_u32(attrs[DEVLINK_ATTR_PORT_INDEX]);
+
+ port = devlink_port_get_by_index(devlink, index);
+ if (!port) {
+ err = -ENODEV;
+ goto out_unlock;
+ }
+ }
+
+ region_attr = attrs[DEVLINK_ATTR_REGION_NAME];
+ region_name = nla_data(region_attr);
+
+ if (port)
+ region = devlink_port_region_get_by_name(port, region_name);
+ else
+ region = devlink_region_get_by_name(devlink, region_name);
+
+ if (!region) {
+ NL_SET_ERR_MSG_ATTR(cb->extack, region_attr, "Requested region does not exist");
+ err = -EINVAL;
+ goto out_unlock;
+ }
+
+ snapshot_attr = attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID];
+ if (!snapshot_attr) {
+ if (!nla_get_flag(attrs[DEVLINK_ATTR_REGION_DIRECT])) {
+ NL_SET_ERR_MSG(cb->extack, "No snapshot id provided");
+ err = -EINVAL;
+ goto out_unlock;
+ }
+
+ if (!region->ops->read) {
+ NL_SET_ERR_MSG(cb->extack, "Requested region does not support direct read");
+ err = -EOPNOTSUPP;
+ goto out_unlock;
+ }
+
+ if (port)
+ region_cb = &devlink_region_port_direct_fill;
+ else
+ region_cb = &devlink_region_direct_fill;
+ region_cb_priv = region;
+ } else {
+ struct devlink_snapshot *snapshot;
+ u32 snapshot_id;
+
+ if (nla_get_flag(attrs[DEVLINK_ATTR_REGION_DIRECT])) {
+ NL_SET_ERR_MSG_ATTR(cb->extack, snapshot_attr, "Direct region read does not use snapshot");
+ err = -EINVAL;
+ goto out_unlock;
+ }
+
+ snapshot_id = nla_get_u32(snapshot_attr);
+ snapshot = devlink_region_snapshot_get_by_id(region, snapshot_id);
+ if (!snapshot) {
+ NL_SET_ERR_MSG_ATTR(cb->extack, snapshot_attr, "Requested snapshot does not exist");
+ err = -EINVAL;
+ goto out_unlock;
+ }
+ region_cb = &devlink_region_snapshot_fill;
+ region_cb_priv = snapshot;
+ }
+
+ if (attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR] &&
+ attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]) {
+ if (!start_offset)
+ start_offset =
+ nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]);
+
+ end_offset = nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]);
+ end_offset += nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]);
+ }
+
+ if (end_offset > region->size)
+ end_offset = region->size;
+
+ /* return 0 if there is no further data to read */
+ if (start_offset == end_offset) {
+ err = 0;
+ goto out_unlock;
+ }
+
+ hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ &devlink_nl_family, NLM_F_ACK | NLM_F_MULTI,
+ DEVLINK_CMD_REGION_READ);
+ if (!hdr) {
+ err = -EMSGSIZE;
+ goto out_unlock;
+ }
+
+ err = devlink_nl_put_handle(skb, devlink);
+ if (err)
+ goto nla_put_failure;
+
+ if (region->port) {
+ err = nla_put_u32(skb, DEVLINK_ATTR_PORT_INDEX,
+ region->port->index);
+ if (err)
+ goto nla_put_failure;
+ }
+
+ err = nla_put_string(skb, DEVLINK_ATTR_REGION_NAME, region_name);
+ if (err)
+ goto nla_put_failure;
+
+ chunks_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_REGION_CHUNKS);
+ if (!chunks_attr) {
+ err = -EMSGSIZE;
+ goto nla_put_failure;
+ }
+
+ err = devlink_nl_region_read_fill(skb, region_cb, region_cb_priv,
+ start_offset, end_offset, &ret_offset,
+ cb->extack);
+
+ if (err && err != -EMSGSIZE)
+ goto nla_put_failure;
+
+ /* Check if there was any progress done to prevent infinite loop */
+ if (ret_offset == start_offset) {
+ err = -EINVAL;
+ goto nla_put_failure;
+ }
+
+ state->start_offset = ret_offset;
+
+ nla_nest_end(skb, chunks_attr);
+ genlmsg_end(skb, hdr);
+ devl_unlock(devlink);
+ devlink_put(devlink);
+ return skb->len;
+
+nla_put_failure:
+ genlmsg_cancel(skb, hdr);
+out_unlock:
+ devl_unlock(devlink);
+ devlink_put(devlink);
+ return err;
+}
+
+/**
+ * devl_region_create - create a new address region
+ *
+ * @devlink: devlink
+ * @ops: region operations and name
+ * @region_max_snapshots: Maximum supported number of snapshots for region
+ * @region_size: size of region
+ */
+struct devlink_region *devl_region_create(struct devlink *devlink,
+ const struct devlink_region_ops *ops,
+ u32 region_max_snapshots,
+ u64 region_size)
+{
+ struct devlink_region *region;
+
+ devl_assert_locked(devlink);
+
+ if (WARN_ON(!ops) || WARN_ON(!ops->destructor))
+ return ERR_PTR(-EINVAL);
+
+ if (devlink_region_get_by_name(devlink, ops->name))
+ return ERR_PTR(-EEXIST);
+
+ region = kzalloc(sizeof(*region), GFP_KERNEL);
+ if (!region)
+ return ERR_PTR(-ENOMEM);
+
+ region->devlink = devlink;
+ region->max_snapshots = region_max_snapshots;
+ region->ops = ops;
+ region->size = region_size;
+ INIT_LIST_HEAD(&region->snapshot_list);
+ mutex_init(&region->snapshot_lock);
+ list_add_tail(&region->list, &devlink->region_list);
+ devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_NEW);
+
+ return region;
+}
+EXPORT_SYMBOL_GPL(devl_region_create);
+
+/**
+ * devlink_region_create - create a new address region
+ *
+ * @devlink: devlink
+ * @ops: region operations and name
+ * @region_max_snapshots: Maximum supported number of snapshots for region
+ * @region_size: size of region
+ *
+ * Context: Takes and release devlink->lock <mutex>.
+ */
+struct devlink_region *
+devlink_region_create(struct devlink *devlink,
+ const struct devlink_region_ops *ops,
+ u32 region_max_snapshots, u64 region_size)
+{
+ struct devlink_region *region;
+
+ devl_lock(devlink);
+ region = devl_region_create(devlink, ops, region_max_snapshots,
+ region_size);
+ devl_unlock(devlink);
+ return region;
+}
+EXPORT_SYMBOL_GPL(devlink_region_create);
+
+/**
+ * devlink_port_region_create - create a new address region for a port
+ *
+ * @port: devlink port
+ * @ops: region operations and name
+ * @region_max_snapshots: Maximum supported number of snapshots for region
+ * @region_size: size of region
+ *
+ * Context: Takes and release devlink->lock <mutex>.
+ */
+struct devlink_region *
+devlink_port_region_create(struct devlink_port *port,
+ const struct devlink_port_region_ops *ops,
+ u32 region_max_snapshots, u64 region_size)
+{
+ struct devlink *devlink = port->devlink;
+ struct devlink_region *region;
+ int err = 0;
+
+ ASSERT_DEVLINK_PORT_INITIALIZED(port);
+
+ if (WARN_ON(!ops) || WARN_ON(!ops->destructor))
+ return ERR_PTR(-EINVAL);
+
+ devl_lock(devlink);
+
+ if (devlink_port_region_get_by_name(port, ops->name)) {
+ err = -EEXIST;
+ goto unlock;
+ }
+
+ region = kzalloc(sizeof(*region), GFP_KERNEL);
+ if (!region) {
+ err = -ENOMEM;
+ goto unlock;
+ }
+
+ region->devlink = devlink;
+ region->port = port;
+ region->max_snapshots = region_max_snapshots;
+ region->port_ops = ops;
+ region->size = region_size;
+ INIT_LIST_HEAD(&region->snapshot_list);
+ mutex_init(&region->snapshot_lock);
+ list_add_tail(&region->list, &port->region_list);
+ devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_NEW);
+
+ devl_unlock(devlink);
+ return region;
+
+unlock:
+ devl_unlock(devlink);
+ return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(devlink_port_region_create);
+
+/**
+ * devl_region_destroy - destroy address region
+ *
+ * @region: devlink region to destroy
+ */
+void devl_region_destroy(struct devlink_region *region)
+{
+ struct devlink *devlink = region->devlink;
+ struct devlink_snapshot *snapshot, *ts;
+
+ devl_assert_locked(devlink);
+
+ /* Free all snapshots of region */
+ mutex_lock(&region->snapshot_lock);
+ list_for_each_entry_safe(snapshot, ts, &region->snapshot_list, list)
+ devlink_region_snapshot_del(region, snapshot);
+ mutex_unlock(&region->snapshot_lock);
+
+ list_del(&region->list);
+ mutex_destroy(&region->snapshot_lock);
+
+ devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_DEL);
+ kfree(region);
+}
+EXPORT_SYMBOL_GPL(devl_region_destroy);
+
+/**
+ * devlink_region_destroy - destroy address region
+ *
+ * @region: devlink region to destroy
+ *
+ * Context: Takes and release devlink->lock <mutex>.
+ */
+void devlink_region_destroy(struct devlink_region *region)
+{
+ struct devlink *devlink = region->devlink;
+
+ devl_lock(devlink);
+ devl_region_destroy(region);
+ devl_unlock(devlink);
+}
+EXPORT_SYMBOL_GPL(devlink_region_destroy);
+
+/**
+ * devlink_region_snapshot_id_get - get snapshot ID
+ *
+ * This callback should be called when adding a new snapshot,
+ * Driver should use the same id for multiple snapshots taken
+ * on multiple regions at the same time/by the same trigger.
+ *
+ * The caller of this function must use devlink_region_snapshot_id_put
+ * when finished creating regions using this id.
+ *
+ * Returns zero on success, or a negative error code on failure.
+ *
+ * @devlink: devlink
+ * @id: storage to return id
+ */
+int devlink_region_snapshot_id_get(struct devlink *devlink, u32 *id)
+{
+ return __devlink_region_snapshot_id_get(devlink, id);
+}
+EXPORT_SYMBOL_GPL(devlink_region_snapshot_id_get);
+
+/**
+ * devlink_region_snapshot_id_put - put snapshot ID reference
+ *
+ * This should be called by a driver after finishing creating snapshots
+ * with an id. Doing so ensures that the ID can later be released in the
+ * event that all snapshots using it have been destroyed.
+ *
+ * @devlink: devlink
+ * @id: id to release reference on
+ */
+void devlink_region_snapshot_id_put(struct devlink *devlink, u32 id)
+{
+ __devlink_snapshot_id_decrement(devlink, id);
+}
+EXPORT_SYMBOL_GPL(devlink_region_snapshot_id_put);
+
+/**
+ * devlink_region_snapshot_create - create a new snapshot
+ * This will add a new snapshot of a region. The snapshot
+ * will be stored on the region struct and can be accessed
+ * from devlink. This is useful for future analyses of snapshots.
+ * Multiple snapshots can be created on a region.
+ * The @snapshot_id should be obtained using the getter function.
+ *
+ * @region: devlink region of the snapshot
+ * @data: snapshot data
+ * @snapshot_id: snapshot id to be created
+ */
+int devlink_region_snapshot_create(struct devlink_region *region,
+ u8 *data, u32 snapshot_id)
+{
+ int err;
+
+ mutex_lock(&region->snapshot_lock);
+ err = __devlink_region_snapshot_create(region, data, snapshot_id);
+ mutex_unlock(&region->snapshot_lock);
+ return err;
+}
+EXPORT_SYMBOL_GPL(devlink_region_snapshot_create);
diff --git a/net/devlink/resource.c b/net/devlink/resource.c
new file mode 100644
index 000000000000..2d6324f3d91f
--- /dev/null
+++ b/net/devlink/resource.c
@@ -0,0 +1,504 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com>
+ */
+
+#include "devl_internal.h"
+
+/**
+ * struct devlink_resource - devlink resource
+ * @name: name of the resource
+ * @id: id, per devlink instance
+ * @size: size of the resource
+ * @size_new: updated size of the resource, reload is needed
+ * @size_valid: valid in case the total size of the resource is valid
+ * including its children
+ * @parent: parent resource
+ * @size_params: size parameters
+ * @list: parent list
+ * @resource_list: list of child resources
+ * @occ_get: occupancy getter callback
+ * @occ_get_priv: occupancy getter callback priv
+ */
+struct devlink_resource {
+ const char *name;
+ u64 id;
+ u64 size;
+ u64 size_new;
+ bool size_valid;
+ struct devlink_resource *parent;
+ struct devlink_resource_size_params size_params;
+ struct list_head list;
+ struct list_head resource_list;
+ devlink_resource_occ_get_t *occ_get;
+ void *occ_get_priv;
+};
+
+static struct devlink_resource *
+devlink_resource_find(struct devlink *devlink,
+ struct devlink_resource *resource, u64 resource_id)
+{
+ struct list_head *resource_list;
+
+ if (resource)
+ resource_list = &resource->resource_list;
+ else
+ resource_list = &devlink->resource_list;
+
+ list_for_each_entry(resource, resource_list, list) {
+ struct devlink_resource *child_resource;
+
+ if (resource->id == resource_id)
+ return resource;
+
+ child_resource = devlink_resource_find(devlink, resource,
+ resource_id);
+ if (child_resource)
+ return child_resource;
+ }
+ return NULL;
+}
+
+static void
+devlink_resource_validate_children(struct devlink_resource *resource)
+{
+ struct devlink_resource *child_resource;
+ bool size_valid = true;
+ u64 parts_size = 0;
+
+ if (list_empty(&resource->resource_list))
+ goto out;
+
+ list_for_each_entry(child_resource, &resource->resource_list, list)
+ parts_size += child_resource->size_new;
+
+ if (parts_size > resource->size_new)
+ size_valid = false;
+out:
+ resource->size_valid = size_valid;
+}
+
+static int
+devlink_resource_validate_size(struct devlink_resource *resource, u64 size,
+ struct netlink_ext_ack *extack)
+{
+ u64 reminder;
+ int err = 0;
+
+ if (size > resource->size_params.size_max) {
+ NL_SET_ERR_MSG(extack, "Size larger than maximum");
+ err = -EINVAL;
+ }
+
+ if (size < resource->size_params.size_min) {
+ NL_SET_ERR_MSG(extack, "Size smaller than minimum");
+ err = -EINVAL;
+ }
+
+ div64_u64_rem(size, resource->size_params.size_granularity, &reminder);
+ if (reminder) {
+ NL_SET_ERR_MSG(extack, "Wrong granularity");
+ err = -EINVAL;
+ }
+
+ return err;
+}
+
+int devlink_nl_resource_set_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_resource *resource;
+ u64 resource_id;
+ u64 size;
+ int err;
+
+ if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_RESOURCE_ID) ||
+ GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_RESOURCE_SIZE))
+ return -EINVAL;
+ resource_id = nla_get_u64(info->attrs[DEVLINK_ATTR_RESOURCE_ID]);
+
+ resource = devlink_resource_find(devlink, NULL, resource_id);
+ if (!resource)
+ return -EINVAL;
+
+ size = nla_get_u64(info->attrs[DEVLINK_ATTR_RESOURCE_SIZE]);
+ err = devlink_resource_validate_size(resource, size, info->extack);
+ if (err)
+ return err;
+
+ resource->size_new = size;
+ devlink_resource_validate_children(resource);
+ if (resource->parent)
+ devlink_resource_validate_children(resource->parent);
+ return 0;
+}
+
+static int
+devlink_resource_size_params_put(struct devlink_resource *resource,
+ struct sk_buff *skb)
+{
+ struct devlink_resource_size_params *size_params;
+
+ size_params = &resource->size_params;
+ if (devlink_nl_put_u64(skb, DEVLINK_ATTR_RESOURCE_SIZE_GRAN,
+ size_params->size_granularity) ||
+ devlink_nl_put_u64(skb, DEVLINK_ATTR_RESOURCE_SIZE_MAX,
+ size_params->size_max) ||
+ devlink_nl_put_u64(skb, DEVLINK_ATTR_RESOURCE_SIZE_MIN,
+ size_params->size_min) ||
+ nla_put_u8(skb, DEVLINK_ATTR_RESOURCE_UNIT, size_params->unit))
+ return -EMSGSIZE;
+ return 0;
+}
+
+static int devlink_resource_occ_put(struct devlink_resource *resource,
+ struct sk_buff *skb)
+{
+ if (!resource->occ_get)
+ return 0;
+ return devlink_nl_put_u64(skb, DEVLINK_ATTR_RESOURCE_OCC,
+ resource->occ_get(resource->occ_get_priv));
+}
+
+static int devlink_resource_put(struct devlink *devlink, struct sk_buff *skb,
+ struct devlink_resource *resource)
+{
+ struct devlink_resource *child_resource;
+ struct nlattr *child_resource_attr;
+ struct nlattr *resource_attr;
+
+ resource_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_RESOURCE);
+ if (!resource_attr)
+ return -EMSGSIZE;
+
+ if (nla_put_string(skb, DEVLINK_ATTR_RESOURCE_NAME, resource->name) ||
+ devlink_nl_put_u64(skb, DEVLINK_ATTR_RESOURCE_SIZE, resource->size) ||
+ devlink_nl_put_u64(skb, DEVLINK_ATTR_RESOURCE_ID, resource->id))
+ goto nla_put_failure;
+ if (resource->size != resource->size_new &&
+ devlink_nl_put_u64(skb, DEVLINK_ATTR_RESOURCE_SIZE_NEW,
+ resource->size_new))
+ goto nla_put_failure;
+ if (devlink_resource_occ_put(resource, skb))
+ goto nla_put_failure;
+ if (devlink_resource_size_params_put(resource, skb))
+ goto nla_put_failure;
+ if (list_empty(&resource->resource_list))
+ goto out;
+
+ if (nla_put_u8(skb, DEVLINK_ATTR_RESOURCE_SIZE_VALID,
+ resource->size_valid))
+ goto nla_put_failure;
+
+ child_resource_attr = nla_nest_start_noflag(skb,
+ DEVLINK_ATTR_RESOURCE_LIST);
+ if (!child_resource_attr)
+ goto nla_put_failure;
+
+ list_for_each_entry(child_resource, &resource->resource_list, list) {
+ if (devlink_resource_put(devlink, skb, child_resource))
+ goto resource_put_failure;
+ }
+
+ nla_nest_end(skb, child_resource_attr);
+out:
+ nla_nest_end(skb, resource_attr);
+ return 0;
+
+resource_put_failure:
+ nla_nest_cancel(skb, child_resource_attr);
+nla_put_failure:
+ nla_nest_cancel(skb, resource_attr);
+ return -EMSGSIZE;
+}
+
+static int devlink_resource_fill(struct genl_info *info,
+ enum devlink_command cmd, int flags)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_resource *resource;
+ struct nlattr *resources_attr;
+ struct sk_buff *skb = NULL;
+ struct nlmsghdr *nlh;
+ bool incomplete;
+ void *hdr;
+ int i;
+ int err;
+
+ resource = list_first_entry(&devlink->resource_list,
+ struct devlink_resource, list);
+start_again:
+ err = devlink_nl_msg_reply_and_new(&skb, info);
+ if (err)
+ return err;
+
+ hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq,
+ &devlink_nl_family, NLM_F_MULTI, cmd);
+ if (!hdr) {
+ nlmsg_free(skb);
+ return -EMSGSIZE;
+ }
+
+ if (devlink_nl_put_handle(skb, devlink))
+ goto nla_put_failure;
+
+ resources_attr = nla_nest_start_noflag(skb,
+ DEVLINK_ATTR_RESOURCE_LIST);
+ if (!resources_attr)
+ goto nla_put_failure;
+
+ incomplete = false;
+ i = 0;
+ list_for_each_entry_from(resource, &devlink->resource_list, list) {
+ err = devlink_resource_put(devlink, skb, resource);
+ if (err) {
+ if (!i)
+ goto err_resource_put;
+ incomplete = true;
+ break;
+ }
+ i++;
+ }
+ nla_nest_end(skb, resources_attr);
+ genlmsg_end(skb, hdr);
+ if (incomplete)
+ goto start_again;
+send_done:
+ nlh = nlmsg_put(skb, info->snd_portid, info->snd_seq,
+ NLMSG_DONE, 0, flags | NLM_F_MULTI);
+ if (!nlh) {
+ err = devlink_nl_msg_reply_and_new(&skb, info);
+ if (err)
+ return err;
+ goto send_done;
+ }
+ return genlmsg_reply(skb, info);
+
+nla_put_failure:
+ err = -EMSGSIZE;
+err_resource_put:
+ nlmsg_free(skb);
+ return err;
+}
+
+int devlink_nl_resource_dump_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+
+ if (list_empty(&devlink->resource_list))
+ return -EOPNOTSUPP;
+
+ return devlink_resource_fill(info, DEVLINK_CMD_RESOURCE_DUMP, 0);
+}
+
+int devlink_resources_validate(struct devlink *devlink,
+ struct devlink_resource *resource,
+ struct genl_info *info)
+{
+ struct list_head *resource_list;
+ int err = 0;
+
+ if (resource)
+ resource_list = &resource->resource_list;
+ else
+ resource_list = &devlink->resource_list;
+
+ list_for_each_entry(resource, resource_list, list) {
+ if (!resource->size_valid)
+ return -EINVAL;
+ err = devlink_resources_validate(devlink, resource, info);
+ if (err)
+ return err;
+ }
+ return err;
+}
+
+/**
+ * devl_resource_register - devlink resource register
+ *
+ * @devlink: devlink
+ * @resource_name: resource's name
+ * @resource_size: resource's size
+ * @resource_id: resource's id
+ * @parent_resource_id: resource's parent id
+ * @size_params: size parameters
+ *
+ * Generic resources should reuse the same names across drivers.
+ * Please see the generic resources list at:
+ * Documentation/networking/devlink/devlink-resource.rst
+ */
+int devl_resource_register(struct devlink *devlink,
+ const char *resource_name,
+ u64 resource_size,
+ u64 resource_id,
+ u64 parent_resource_id,
+ const struct devlink_resource_size_params *size_params)
+{
+ struct devlink_resource *resource;
+ struct list_head *resource_list;
+ bool top_hierarchy;
+
+ lockdep_assert_held(&devlink->lock);
+
+ top_hierarchy = parent_resource_id == DEVLINK_RESOURCE_ID_PARENT_TOP;
+
+ resource = devlink_resource_find(devlink, NULL, resource_id);
+ if (resource)
+ return -EEXIST;
+
+ resource = kzalloc(sizeof(*resource), GFP_KERNEL);
+ if (!resource)
+ return -ENOMEM;
+
+ if (top_hierarchy) {
+ resource_list = &devlink->resource_list;
+ } else {
+ struct devlink_resource *parent_resource;
+
+ parent_resource = devlink_resource_find(devlink, NULL,
+ parent_resource_id);
+ if (parent_resource) {
+ resource_list = &parent_resource->resource_list;
+ resource->parent = parent_resource;
+ } else {
+ kfree(resource);
+ return -EINVAL;
+ }
+ }
+
+ resource->name = resource_name;
+ resource->size = resource_size;
+ resource->size_new = resource_size;
+ resource->id = resource_id;
+ resource->size_valid = true;
+ memcpy(&resource->size_params, size_params,
+ sizeof(resource->size_params));
+ INIT_LIST_HEAD(&resource->resource_list);
+ list_add_tail(&resource->list, resource_list);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(devl_resource_register);
+
+static void devlink_resource_unregister(struct devlink *devlink,
+ struct devlink_resource *resource)
+{
+ struct devlink_resource *tmp, *child_resource;
+
+ list_for_each_entry_safe(child_resource, tmp, &resource->resource_list,
+ list) {
+ devlink_resource_unregister(devlink, child_resource);
+ list_del(&child_resource->list);
+ kfree(child_resource);
+ }
+}
+
+/**
+ * devl_resources_unregister - free all resources
+ *
+ * @devlink: devlink
+ */
+void devl_resources_unregister(struct devlink *devlink)
+{
+ struct devlink_resource *tmp, *child_resource;
+
+ lockdep_assert_held(&devlink->lock);
+
+ list_for_each_entry_safe(child_resource, tmp, &devlink->resource_list,
+ list) {
+ devlink_resource_unregister(devlink, child_resource);
+ list_del(&child_resource->list);
+ kfree(child_resource);
+ }
+}
+EXPORT_SYMBOL_GPL(devl_resources_unregister);
+
+/**
+ * devlink_resources_unregister - free all resources
+ *
+ * @devlink: devlink
+ *
+ * Context: Takes and release devlink->lock <mutex>.
+ */
+void devlink_resources_unregister(struct devlink *devlink)
+{
+ devl_lock(devlink);
+ devl_resources_unregister(devlink);
+ devl_unlock(devlink);
+}
+EXPORT_SYMBOL_GPL(devlink_resources_unregister);
+
+/**
+ * devl_resource_size_get - get and update size
+ *
+ * @devlink: devlink
+ * @resource_id: the requested resource id
+ * @p_resource_size: ptr to update
+ */
+int devl_resource_size_get(struct devlink *devlink,
+ u64 resource_id,
+ u64 *p_resource_size)
+{
+ struct devlink_resource *resource;
+
+ lockdep_assert_held(&devlink->lock);
+
+ resource = devlink_resource_find(devlink, NULL, resource_id);
+ if (!resource)
+ return -EINVAL;
+ *p_resource_size = resource->size_new;
+ resource->size = resource->size_new;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(devl_resource_size_get);
+
+/**
+ * devl_resource_occ_get_register - register occupancy getter
+ *
+ * @devlink: devlink
+ * @resource_id: resource id
+ * @occ_get: occupancy getter callback
+ * @occ_get_priv: occupancy getter callback priv
+ */
+void devl_resource_occ_get_register(struct devlink *devlink,
+ u64 resource_id,
+ devlink_resource_occ_get_t *occ_get,
+ void *occ_get_priv)
+{
+ struct devlink_resource *resource;
+
+ lockdep_assert_held(&devlink->lock);
+
+ resource = devlink_resource_find(devlink, NULL, resource_id);
+ if (WARN_ON(!resource))
+ return;
+ WARN_ON(resource->occ_get);
+
+ resource->occ_get = occ_get;
+ resource->occ_get_priv = occ_get_priv;
+}
+EXPORT_SYMBOL_GPL(devl_resource_occ_get_register);
+
+/**
+ * devl_resource_occ_get_unregister - unregister occupancy getter
+ *
+ * @devlink: devlink
+ * @resource_id: resource id
+ */
+void devl_resource_occ_get_unregister(struct devlink *devlink,
+ u64 resource_id)
+{
+ struct devlink_resource *resource;
+
+ lockdep_assert_held(&devlink->lock);
+
+ resource = devlink_resource_find(devlink, NULL, resource_id);
+ if (WARN_ON(!resource))
+ return;
+ WARN_ON(!resource->occ_get);
+
+ resource->occ_get = NULL;
+ resource->occ_get_priv = NULL;
+}
+EXPORT_SYMBOL_GPL(devl_resource_occ_get_unregister);
diff --git a/net/devlink/sb.c b/net/devlink/sb.c
new file mode 100644
index 000000000000..0a76bb32502b
--- /dev/null
+++ b/net/devlink/sb.c
@@ -0,0 +1,995 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com>
+ */
+
+#include "devl_internal.h"
+
+struct devlink_sb {
+ struct list_head list;
+ unsigned int index;
+ u32 size;
+ u16 ingress_pools_count;
+ u16 egress_pools_count;
+ u16 ingress_tc_count;
+ u16 egress_tc_count;
+};
+
+static u16 devlink_sb_pool_count(struct devlink_sb *devlink_sb)
+{
+ return devlink_sb->ingress_pools_count + devlink_sb->egress_pools_count;
+}
+
+static struct devlink_sb *devlink_sb_get_by_index(struct devlink *devlink,
+ unsigned int sb_index)
+{
+ struct devlink_sb *devlink_sb;
+
+ list_for_each_entry(devlink_sb, &devlink->sb_list, list) {
+ if (devlink_sb->index == sb_index)
+ return devlink_sb;
+ }
+ return NULL;
+}
+
+static bool devlink_sb_index_exists(struct devlink *devlink,
+ unsigned int sb_index)
+{
+ return devlink_sb_get_by_index(devlink, sb_index);
+}
+
+static struct devlink_sb *devlink_sb_get_from_attrs(struct devlink *devlink,
+ struct nlattr **attrs)
+{
+ if (attrs[DEVLINK_ATTR_SB_INDEX]) {
+ u32 sb_index = nla_get_u32(attrs[DEVLINK_ATTR_SB_INDEX]);
+ struct devlink_sb *devlink_sb;
+
+ devlink_sb = devlink_sb_get_by_index(devlink, sb_index);
+ if (!devlink_sb)
+ return ERR_PTR(-ENODEV);
+ return devlink_sb;
+ }
+ return ERR_PTR(-EINVAL);
+}
+
+static struct devlink_sb *devlink_sb_get_from_info(struct devlink *devlink,
+ struct genl_info *info)
+{
+ return devlink_sb_get_from_attrs(devlink, info->attrs);
+}
+
+static int devlink_sb_pool_index_get_from_attrs(struct devlink_sb *devlink_sb,
+ struct nlattr **attrs,
+ u16 *p_pool_index)
+{
+ u16 val;
+
+ if (!attrs[DEVLINK_ATTR_SB_POOL_INDEX])
+ return -EINVAL;
+
+ val = nla_get_u16(attrs[DEVLINK_ATTR_SB_POOL_INDEX]);
+ if (val >= devlink_sb_pool_count(devlink_sb))
+ return -EINVAL;
+ *p_pool_index = val;
+ return 0;
+}
+
+static int devlink_sb_pool_index_get_from_info(struct devlink_sb *devlink_sb,
+ struct genl_info *info,
+ u16 *p_pool_index)
+{
+ return devlink_sb_pool_index_get_from_attrs(devlink_sb, info->attrs,
+ p_pool_index);
+}
+
+static int
+devlink_sb_pool_type_get_from_attrs(struct nlattr **attrs,
+ enum devlink_sb_pool_type *p_pool_type)
+{
+ u8 val;
+
+ if (!attrs[DEVLINK_ATTR_SB_POOL_TYPE])
+ return -EINVAL;
+
+ val = nla_get_u8(attrs[DEVLINK_ATTR_SB_POOL_TYPE]);
+ if (val != DEVLINK_SB_POOL_TYPE_INGRESS &&
+ val != DEVLINK_SB_POOL_TYPE_EGRESS)
+ return -EINVAL;
+ *p_pool_type = val;
+ return 0;
+}
+
+static int
+devlink_sb_pool_type_get_from_info(struct genl_info *info,
+ enum devlink_sb_pool_type *p_pool_type)
+{
+ return devlink_sb_pool_type_get_from_attrs(info->attrs, p_pool_type);
+}
+
+static int
+devlink_sb_th_type_get_from_attrs(struct nlattr **attrs,
+ enum devlink_sb_threshold_type *p_th_type)
+{
+ u8 val;
+
+ if (!attrs[DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE])
+ return -EINVAL;
+
+ val = nla_get_u8(attrs[DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE]);
+ if (val != DEVLINK_SB_THRESHOLD_TYPE_STATIC &&
+ val != DEVLINK_SB_THRESHOLD_TYPE_DYNAMIC)
+ return -EINVAL;
+ *p_th_type = val;
+ return 0;
+}
+
+static int
+devlink_sb_th_type_get_from_info(struct genl_info *info,
+ enum devlink_sb_threshold_type *p_th_type)
+{
+ return devlink_sb_th_type_get_from_attrs(info->attrs, p_th_type);
+}
+
+static int
+devlink_sb_tc_index_get_from_attrs(struct devlink_sb *devlink_sb,
+ struct nlattr **attrs,
+ enum devlink_sb_pool_type pool_type,
+ u16 *p_tc_index)
+{
+ u16 val;
+
+ if (!attrs[DEVLINK_ATTR_SB_TC_INDEX])
+ return -EINVAL;
+
+ val = nla_get_u16(attrs[DEVLINK_ATTR_SB_TC_INDEX]);
+ if (pool_type == DEVLINK_SB_POOL_TYPE_INGRESS &&
+ val >= devlink_sb->ingress_tc_count)
+ return -EINVAL;
+ if (pool_type == DEVLINK_SB_POOL_TYPE_EGRESS &&
+ val >= devlink_sb->egress_tc_count)
+ return -EINVAL;
+ *p_tc_index = val;
+ return 0;
+}
+
+static int
+devlink_sb_tc_index_get_from_info(struct devlink_sb *devlink_sb,
+ struct genl_info *info,
+ enum devlink_sb_pool_type pool_type,
+ u16 *p_tc_index)
+{
+ return devlink_sb_tc_index_get_from_attrs(devlink_sb, info->attrs,
+ pool_type, p_tc_index);
+}
+
+static int devlink_nl_sb_fill(struct sk_buff *msg, struct devlink *devlink,
+ struct devlink_sb *devlink_sb,
+ enum devlink_command cmd, u32 portid,
+ u32 seq, int flags)
+{
+ void *hdr;
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, DEVLINK_ATTR_SB_INDEX, devlink_sb->index))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, DEVLINK_ATTR_SB_SIZE, devlink_sb->size))
+ goto nla_put_failure;
+ if (nla_put_u16(msg, DEVLINK_ATTR_SB_INGRESS_POOL_COUNT,
+ devlink_sb->ingress_pools_count))
+ goto nla_put_failure;
+ if (nla_put_u16(msg, DEVLINK_ATTR_SB_EGRESS_POOL_COUNT,
+ devlink_sb->egress_pools_count))
+ goto nla_put_failure;
+ if (nla_put_u16(msg, DEVLINK_ATTR_SB_INGRESS_TC_COUNT,
+ devlink_sb->ingress_tc_count))
+ goto nla_put_failure;
+ if (nla_put_u16(msg, DEVLINK_ATTR_SB_EGRESS_TC_COUNT,
+ devlink_sb->egress_tc_count))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+int devlink_nl_sb_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_sb *devlink_sb;
+ struct sk_buff *msg;
+ int err;
+
+ devlink_sb = devlink_sb_get_from_info(devlink, info);
+ if (IS_ERR(devlink_sb))
+ return PTR_ERR(devlink_sb);
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ err = devlink_nl_sb_fill(msg, devlink, devlink_sb,
+ DEVLINK_CMD_SB_NEW,
+ info->snd_portid, info->snd_seq, 0);
+ if (err) {
+ nlmsg_free(msg);
+ return err;
+ }
+
+ return genlmsg_reply(msg, info);
+}
+
+static int
+devlink_nl_sb_get_dump_one(struct sk_buff *msg, struct devlink *devlink,
+ struct netlink_callback *cb, int flags)
+{
+ struct devlink_nl_dump_state *state = devlink_dump_state(cb);
+ struct devlink_sb *devlink_sb;
+ int idx = 0;
+ int err = 0;
+
+ list_for_each_entry(devlink_sb, &devlink->sb_list, list) {
+ if (idx < state->idx) {
+ idx++;
+ continue;
+ }
+ err = devlink_nl_sb_fill(msg, devlink, devlink_sb,
+ DEVLINK_CMD_SB_NEW,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, flags);
+ if (err) {
+ state->idx = idx;
+ break;
+ }
+ idx++;
+ }
+
+ return err;
+}
+
+int devlink_nl_sb_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ return devlink_nl_dumpit(skb, cb, devlink_nl_sb_get_dump_one);
+}
+
+static int devlink_nl_sb_pool_fill(struct sk_buff *msg, struct devlink *devlink,
+ struct devlink_sb *devlink_sb,
+ u16 pool_index, enum devlink_command cmd,
+ u32 portid, u32 seq, int flags)
+{
+ struct devlink_sb_pool_info pool_info;
+ void *hdr;
+ int err;
+
+ err = devlink->ops->sb_pool_get(devlink, devlink_sb->index,
+ pool_index, &pool_info);
+ if (err)
+ return err;
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, DEVLINK_ATTR_SB_INDEX, devlink_sb->index))
+ goto nla_put_failure;
+ if (nla_put_u16(msg, DEVLINK_ATTR_SB_POOL_INDEX, pool_index))
+ goto nla_put_failure;
+ if (nla_put_u8(msg, DEVLINK_ATTR_SB_POOL_TYPE, pool_info.pool_type))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, DEVLINK_ATTR_SB_POOL_SIZE, pool_info.size))
+ goto nla_put_failure;
+ if (nla_put_u8(msg, DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE,
+ pool_info.threshold_type))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, DEVLINK_ATTR_SB_POOL_CELL_SIZE,
+ pool_info.cell_size))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+int devlink_nl_sb_pool_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_sb *devlink_sb;
+ struct sk_buff *msg;
+ u16 pool_index;
+ int err;
+
+ devlink_sb = devlink_sb_get_from_info(devlink, info);
+ if (IS_ERR(devlink_sb))
+ return PTR_ERR(devlink_sb);
+
+ err = devlink_sb_pool_index_get_from_info(devlink_sb, info,
+ &pool_index);
+ if (err)
+ return err;
+
+ if (!devlink->ops->sb_pool_get)
+ return -EOPNOTSUPP;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ err = devlink_nl_sb_pool_fill(msg, devlink, devlink_sb, pool_index,
+ DEVLINK_CMD_SB_POOL_NEW,
+ info->snd_portid, info->snd_seq, 0);
+ if (err) {
+ nlmsg_free(msg);
+ return err;
+ }
+
+ return genlmsg_reply(msg, info);
+}
+
+static int __sb_pool_get_dumpit(struct sk_buff *msg, int start, int *p_idx,
+ struct devlink *devlink,
+ struct devlink_sb *devlink_sb,
+ u32 portid, u32 seq, int flags)
+{
+ u16 pool_count = devlink_sb_pool_count(devlink_sb);
+ u16 pool_index;
+ int err;
+
+ for (pool_index = 0; pool_index < pool_count; pool_index++) {
+ if (*p_idx < start) {
+ (*p_idx)++;
+ continue;
+ }
+ err = devlink_nl_sb_pool_fill(msg, devlink,
+ devlink_sb,
+ pool_index,
+ DEVLINK_CMD_SB_POOL_NEW,
+ portid, seq, flags);
+ if (err)
+ return err;
+ (*p_idx)++;
+ }
+ return 0;
+}
+
+static int
+devlink_nl_sb_pool_get_dump_one(struct sk_buff *msg, struct devlink *devlink,
+ struct netlink_callback *cb, int flags)
+{
+ struct devlink_nl_dump_state *state = devlink_dump_state(cb);
+ struct devlink_sb *devlink_sb;
+ int err = 0;
+ int idx = 0;
+
+ if (!devlink->ops->sb_pool_get)
+ return 0;
+
+ list_for_each_entry(devlink_sb, &devlink->sb_list, list) {
+ err = __sb_pool_get_dumpit(msg, state->idx, &idx,
+ devlink, devlink_sb,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, flags);
+ if (err == -EOPNOTSUPP) {
+ err = 0;
+ } else if (err) {
+ state->idx = idx;
+ break;
+ }
+ }
+
+ return err;
+}
+
+int devlink_nl_sb_pool_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ return devlink_nl_dumpit(skb, cb, devlink_nl_sb_pool_get_dump_one);
+}
+
+static int devlink_sb_pool_set(struct devlink *devlink, unsigned int sb_index,
+ u16 pool_index, u32 size,
+ enum devlink_sb_threshold_type threshold_type,
+ struct netlink_ext_ack *extack)
+
+{
+ const struct devlink_ops *ops = devlink->ops;
+
+ if (ops->sb_pool_set)
+ return ops->sb_pool_set(devlink, sb_index, pool_index,
+ size, threshold_type, extack);
+ return -EOPNOTSUPP;
+}
+
+int devlink_nl_sb_pool_set_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ enum devlink_sb_threshold_type threshold_type;
+ struct devlink_sb *devlink_sb;
+ u16 pool_index;
+ u32 size;
+ int err;
+
+ devlink_sb = devlink_sb_get_from_info(devlink, info);
+ if (IS_ERR(devlink_sb))
+ return PTR_ERR(devlink_sb);
+
+ err = devlink_sb_pool_index_get_from_info(devlink_sb, info,
+ &pool_index);
+ if (err)
+ return err;
+
+ err = devlink_sb_th_type_get_from_info(info, &threshold_type);
+ if (err)
+ return err;
+
+ if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_SB_POOL_SIZE))
+ return -EINVAL;
+
+ size = nla_get_u32(info->attrs[DEVLINK_ATTR_SB_POOL_SIZE]);
+ return devlink_sb_pool_set(devlink, devlink_sb->index,
+ pool_index, size, threshold_type,
+ info->extack);
+}
+
+static int devlink_nl_sb_port_pool_fill(struct sk_buff *msg,
+ struct devlink *devlink,
+ struct devlink_port *devlink_port,
+ struct devlink_sb *devlink_sb,
+ u16 pool_index,
+ enum devlink_command cmd,
+ u32 portid, u32 seq, int flags)
+{
+ const struct devlink_ops *ops = devlink->ops;
+ u32 threshold;
+ void *hdr;
+ int err;
+
+ err = ops->sb_port_pool_get(devlink_port, devlink_sb->index,
+ pool_index, &threshold);
+ if (err)
+ return err;
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, DEVLINK_ATTR_SB_INDEX, devlink_sb->index))
+ goto nla_put_failure;
+ if (nla_put_u16(msg, DEVLINK_ATTR_SB_POOL_INDEX, pool_index))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, DEVLINK_ATTR_SB_THRESHOLD, threshold))
+ goto nla_put_failure;
+
+ if (ops->sb_occ_port_pool_get) {
+ u32 cur;
+ u32 max;
+
+ err = ops->sb_occ_port_pool_get(devlink_port, devlink_sb->index,
+ pool_index, &cur, &max);
+ if (err && err != -EOPNOTSUPP)
+ goto sb_occ_get_failure;
+ if (!err) {
+ if (nla_put_u32(msg, DEVLINK_ATTR_SB_OCC_CUR, cur))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, DEVLINK_ATTR_SB_OCC_MAX, max))
+ goto nla_put_failure;
+ }
+ }
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ err = -EMSGSIZE;
+sb_occ_get_failure:
+ genlmsg_cancel(msg, hdr);
+ return err;
+}
+
+int devlink_nl_sb_port_pool_get_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink_port *devlink_port = info->user_ptr[1];
+ struct devlink *devlink = devlink_port->devlink;
+ struct devlink_sb *devlink_sb;
+ struct sk_buff *msg;
+ u16 pool_index;
+ int err;
+
+ devlink_sb = devlink_sb_get_from_info(devlink, info);
+ if (IS_ERR(devlink_sb))
+ return PTR_ERR(devlink_sb);
+
+ err = devlink_sb_pool_index_get_from_info(devlink_sb, info,
+ &pool_index);
+ if (err)
+ return err;
+
+ if (!devlink->ops->sb_port_pool_get)
+ return -EOPNOTSUPP;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ err = devlink_nl_sb_port_pool_fill(msg, devlink, devlink_port,
+ devlink_sb, pool_index,
+ DEVLINK_CMD_SB_PORT_POOL_NEW,
+ info->snd_portid, info->snd_seq, 0);
+ if (err) {
+ nlmsg_free(msg);
+ return err;
+ }
+
+ return genlmsg_reply(msg, info);
+}
+
+static int __sb_port_pool_get_dumpit(struct sk_buff *msg, int start, int *p_idx,
+ struct devlink *devlink,
+ struct devlink_sb *devlink_sb,
+ u32 portid, u32 seq, int flags)
+{
+ struct devlink_port *devlink_port;
+ u16 pool_count = devlink_sb_pool_count(devlink_sb);
+ unsigned long port_index;
+ u16 pool_index;
+ int err;
+
+ xa_for_each(&devlink->ports, port_index, devlink_port) {
+ for (pool_index = 0; pool_index < pool_count; pool_index++) {
+ if (*p_idx < start) {
+ (*p_idx)++;
+ continue;
+ }
+ err = devlink_nl_sb_port_pool_fill(msg, devlink,
+ devlink_port,
+ devlink_sb,
+ pool_index,
+ DEVLINK_CMD_SB_PORT_POOL_NEW,
+ portid, seq, flags);
+ if (err)
+ return err;
+ (*p_idx)++;
+ }
+ }
+ return 0;
+}
+
+static int
+devlink_nl_sb_port_pool_get_dump_one(struct sk_buff *msg,
+ struct devlink *devlink,
+ struct netlink_callback *cb, int flags)
+{
+ struct devlink_nl_dump_state *state = devlink_dump_state(cb);
+ struct devlink_sb *devlink_sb;
+ int idx = 0;
+ int err = 0;
+
+ if (!devlink->ops->sb_port_pool_get)
+ return 0;
+
+ list_for_each_entry(devlink_sb, &devlink->sb_list, list) {
+ err = __sb_port_pool_get_dumpit(msg, state->idx, &idx,
+ devlink, devlink_sb,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, flags);
+ if (err == -EOPNOTSUPP) {
+ err = 0;
+ } else if (err) {
+ state->idx = idx;
+ break;
+ }
+ }
+
+ return err;
+}
+
+int devlink_nl_sb_port_pool_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ return devlink_nl_dumpit(skb, cb, devlink_nl_sb_port_pool_get_dump_one);
+}
+
+static int devlink_sb_port_pool_set(struct devlink_port *devlink_port,
+ unsigned int sb_index, u16 pool_index,
+ u32 threshold,
+ struct netlink_ext_ack *extack)
+
+{
+ const struct devlink_ops *ops = devlink_port->devlink->ops;
+
+ if (ops->sb_port_pool_set)
+ return ops->sb_port_pool_set(devlink_port, sb_index,
+ pool_index, threshold, extack);
+ return -EOPNOTSUPP;
+}
+
+int devlink_nl_sb_port_pool_set_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink_port *devlink_port = info->user_ptr[1];
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_sb *devlink_sb;
+ u16 pool_index;
+ u32 threshold;
+ int err;
+
+ devlink_sb = devlink_sb_get_from_info(devlink, info);
+ if (IS_ERR(devlink_sb))
+ return PTR_ERR(devlink_sb);
+
+ err = devlink_sb_pool_index_get_from_info(devlink_sb, info,
+ &pool_index);
+ if (err)
+ return err;
+
+ if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_SB_THRESHOLD))
+ return -EINVAL;
+
+ threshold = nla_get_u32(info->attrs[DEVLINK_ATTR_SB_THRESHOLD]);
+ return devlink_sb_port_pool_set(devlink_port, devlink_sb->index,
+ pool_index, threshold, info->extack);
+}
+
+static int
+devlink_nl_sb_tc_pool_bind_fill(struct sk_buff *msg, struct devlink *devlink,
+ struct devlink_port *devlink_port,
+ struct devlink_sb *devlink_sb, u16 tc_index,
+ enum devlink_sb_pool_type pool_type,
+ enum devlink_command cmd,
+ u32 portid, u32 seq, int flags)
+{
+ const struct devlink_ops *ops = devlink->ops;
+ u16 pool_index;
+ u32 threshold;
+ void *hdr;
+ int err;
+
+ err = ops->sb_tc_pool_bind_get(devlink_port, devlink_sb->index,
+ tc_index, pool_type,
+ &pool_index, &threshold);
+ if (err)
+ return err;
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, DEVLINK_ATTR_SB_INDEX, devlink_sb->index))
+ goto nla_put_failure;
+ if (nla_put_u16(msg, DEVLINK_ATTR_SB_TC_INDEX, tc_index))
+ goto nla_put_failure;
+ if (nla_put_u8(msg, DEVLINK_ATTR_SB_POOL_TYPE, pool_type))
+ goto nla_put_failure;
+ if (nla_put_u16(msg, DEVLINK_ATTR_SB_POOL_INDEX, pool_index))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, DEVLINK_ATTR_SB_THRESHOLD, threshold))
+ goto nla_put_failure;
+
+ if (ops->sb_occ_tc_port_bind_get) {
+ u32 cur;
+ u32 max;
+
+ err = ops->sb_occ_tc_port_bind_get(devlink_port,
+ devlink_sb->index,
+ tc_index, pool_type,
+ &cur, &max);
+ if (err && err != -EOPNOTSUPP)
+ return err;
+ if (!err) {
+ if (nla_put_u32(msg, DEVLINK_ATTR_SB_OCC_CUR, cur))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, DEVLINK_ATTR_SB_OCC_MAX, max))
+ goto nla_put_failure;
+ }
+ }
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+int devlink_nl_sb_tc_pool_bind_get_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink_port *devlink_port = info->user_ptr[1];
+ struct devlink *devlink = devlink_port->devlink;
+ struct devlink_sb *devlink_sb;
+ struct sk_buff *msg;
+ enum devlink_sb_pool_type pool_type;
+ u16 tc_index;
+ int err;
+
+ devlink_sb = devlink_sb_get_from_info(devlink, info);
+ if (IS_ERR(devlink_sb))
+ return PTR_ERR(devlink_sb);
+
+ err = devlink_sb_pool_type_get_from_info(info, &pool_type);
+ if (err)
+ return err;
+
+ err = devlink_sb_tc_index_get_from_info(devlink_sb, info,
+ pool_type, &tc_index);
+ if (err)
+ return err;
+
+ if (!devlink->ops->sb_tc_pool_bind_get)
+ return -EOPNOTSUPP;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ err = devlink_nl_sb_tc_pool_bind_fill(msg, devlink, devlink_port,
+ devlink_sb, tc_index, pool_type,
+ DEVLINK_CMD_SB_TC_POOL_BIND_NEW,
+ info->snd_portid,
+ info->snd_seq, 0);
+ if (err) {
+ nlmsg_free(msg);
+ return err;
+ }
+
+ return genlmsg_reply(msg, info);
+}
+
+static int __sb_tc_pool_bind_get_dumpit(struct sk_buff *msg,
+ int start, int *p_idx,
+ struct devlink *devlink,
+ struct devlink_sb *devlink_sb,
+ u32 portid, u32 seq, int flags)
+{
+ struct devlink_port *devlink_port;
+ unsigned long port_index;
+ u16 tc_index;
+ int err;
+
+ xa_for_each(&devlink->ports, port_index, devlink_port) {
+ for (tc_index = 0;
+ tc_index < devlink_sb->ingress_tc_count; tc_index++) {
+ if (*p_idx < start) {
+ (*p_idx)++;
+ continue;
+ }
+ err = devlink_nl_sb_tc_pool_bind_fill(msg, devlink,
+ devlink_port,
+ devlink_sb,
+ tc_index,
+ DEVLINK_SB_POOL_TYPE_INGRESS,
+ DEVLINK_CMD_SB_TC_POOL_BIND_NEW,
+ portid, seq,
+ flags);
+ if (err)
+ return err;
+ (*p_idx)++;
+ }
+ for (tc_index = 0;
+ tc_index < devlink_sb->egress_tc_count; tc_index++) {
+ if (*p_idx < start) {
+ (*p_idx)++;
+ continue;
+ }
+ err = devlink_nl_sb_tc_pool_bind_fill(msg, devlink,
+ devlink_port,
+ devlink_sb,
+ tc_index,
+ DEVLINK_SB_POOL_TYPE_EGRESS,
+ DEVLINK_CMD_SB_TC_POOL_BIND_NEW,
+ portid, seq,
+ flags);
+ if (err)
+ return err;
+ (*p_idx)++;
+ }
+ }
+ return 0;
+}
+
+static int devlink_nl_sb_tc_pool_bind_get_dump_one(struct sk_buff *msg,
+ struct devlink *devlink,
+ struct netlink_callback *cb,
+ int flags)
+{
+ struct devlink_nl_dump_state *state = devlink_dump_state(cb);
+ struct devlink_sb *devlink_sb;
+ int idx = 0;
+ int err = 0;
+
+ if (!devlink->ops->sb_tc_pool_bind_get)
+ return 0;
+
+ list_for_each_entry(devlink_sb, &devlink->sb_list, list) {
+ err = __sb_tc_pool_bind_get_dumpit(msg, state->idx, &idx,
+ devlink, devlink_sb,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, flags);
+ if (err == -EOPNOTSUPP) {
+ err = 0;
+ } else if (err) {
+ state->idx = idx;
+ break;
+ }
+ }
+
+ return err;
+}
+
+int devlink_nl_sb_tc_pool_bind_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ return devlink_nl_dumpit(skb, cb,
+ devlink_nl_sb_tc_pool_bind_get_dump_one);
+}
+
+static int devlink_sb_tc_pool_bind_set(struct devlink_port *devlink_port,
+ unsigned int sb_index, u16 tc_index,
+ enum devlink_sb_pool_type pool_type,
+ u16 pool_index, u32 threshold,
+ struct netlink_ext_ack *extack)
+
+{
+ const struct devlink_ops *ops = devlink_port->devlink->ops;
+
+ if (ops->sb_tc_pool_bind_set)
+ return ops->sb_tc_pool_bind_set(devlink_port, sb_index,
+ tc_index, pool_type,
+ pool_index, threshold, extack);
+ return -EOPNOTSUPP;
+}
+
+int devlink_nl_sb_tc_pool_bind_set_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink_port *devlink_port = info->user_ptr[1];
+ struct devlink *devlink = info->user_ptr[0];
+ enum devlink_sb_pool_type pool_type;
+ struct devlink_sb *devlink_sb;
+ u16 tc_index;
+ u16 pool_index;
+ u32 threshold;
+ int err;
+
+ devlink_sb = devlink_sb_get_from_info(devlink, info);
+ if (IS_ERR(devlink_sb))
+ return PTR_ERR(devlink_sb);
+
+ err = devlink_sb_pool_type_get_from_info(info, &pool_type);
+ if (err)
+ return err;
+
+ err = devlink_sb_tc_index_get_from_info(devlink_sb, info,
+ pool_type, &tc_index);
+ if (err)
+ return err;
+
+ err = devlink_sb_pool_index_get_from_info(devlink_sb, info,
+ &pool_index);
+ if (err)
+ return err;
+
+ if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_SB_THRESHOLD))
+ return -EINVAL;
+
+ threshold = nla_get_u32(info->attrs[DEVLINK_ATTR_SB_THRESHOLD]);
+ return devlink_sb_tc_pool_bind_set(devlink_port, devlink_sb->index,
+ tc_index, pool_type,
+ pool_index, threshold, info->extack);
+}
+
+int devlink_nl_sb_occ_snapshot_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ const struct devlink_ops *ops = devlink->ops;
+ struct devlink_sb *devlink_sb;
+
+ devlink_sb = devlink_sb_get_from_info(devlink, info);
+ if (IS_ERR(devlink_sb))
+ return PTR_ERR(devlink_sb);
+
+ if (ops->sb_occ_snapshot)
+ return ops->sb_occ_snapshot(devlink, devlink_sb->index);
+ return -EOPNOTSUPP;
+}
+
+int devlink_nl_sb_occ_max_clear_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ const struct devlink_ops *ops = devlink->ops;
+ struct devlink_sb *devlink_sb;
+
+ devlink_sb = devlink_sb_get_from_info(devlink, info);
+ if (IS_ERR(devlink_sb))
+ return PTR_ERR(devlink_sb);
+
+ if (ops->sb_occ_max_clear)
+ return ops->sb_occ_max_clear(devlink, devlink_sb->index);
+ return -EOPNOTSUPP;
+}
+
+int devl_sb_register(struct devlink *devlink, unsigned int sb_index,
+ u32 size, u16 ingress_pools_count,
+ u16 egress_pools_count, u16 ingress_tc_count,
+ u16 egress_tc_count)
+{
+ struct devlink_sb *devlink_sb;
+
+ lockdep_assert_held(&devlink->lock);
+
+ if (devlink_sb_index_exists(devlink, sb_index))
+ return -EEXIST;
+
+ devlink_sb = kzalloc(sizeof(*devlink_sb), GFP_KERNEL);
+ if (!devlink_sb)
+ return -ENOMEM;
+ devlink_sb->index = sb_index;
+ devlink_sb->size = size;
+ devlink_sb->ingress_pools_count = ingress_pools_count;
+ devlink_sb->egress_pools_count = egress_pools_count;
+ devlink_sb->ingress_tc_count = ingress_tc_count;
+ devlink_sb->egress_tc_count = egress_tc_count;
+ list_add_tail(&devlink_sb->list, &devlink->sb_list);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(devl_sb_register);
+
+int devlink_sb_register(struct devlink *devlink, unsigned int sb_index,
+ u32 size, u16 ingress_pools_count,
+ u16 egress_pools_count, u16 ingress_tc_count,
+ u16 egress_tc_count)
+{
+ int err;
+
+ devl_lock(devlink);
+ err = devl_sb_register(devlink, sb_index, size, ingress_pools_count,
+ egress_pools_count, ingress_tc_count,
+ egress_tc_count);
+ devl_unlock(devlink);
+ return err;
+}
+EXPORT_SYMBOL_GPL(devlink_sb_register);
+
+void devl_sb_unregister(struct devlink *devlink, unsigned int sb_index)
+{
+ struct devlink_sb *devlink_sb;
+
+ lockdep_assert_held(&devlink->lock);
+
+ devlink_sb = devlink_sb_get_by_index(devlink, sb_index);
+ WARN_ON(!devlink_sb);
+ list_del(&devlink_sb->list);
+ kfree(devlink_sb);
+}
+EXPORT_SYMBOL_GPL(devl_sb_unregister);
+
+void devlink_sb_unregister(struct devlink *devlink, unsigned int sb_index)
+{
+ devl_lock(devlink);
+ devl_sb_unregister(devlink, sb_index);
+ devl_unlock(devlink);
+}
+EXPORT_SYMBOL_GPL(devlink_sb_unregister);
diff --git a/net/devlink/trap.c b/net/devlink/trap.c
new file mode 100644
index 000000000000..f36087f90db5
--- /dev/null
+++ b/net/devlink/trap.c
@@ -0,0 +1,1854 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com>
+ */
+
+#include <trace/events/devlink.h>
+
+#include "devl_internal.h"
+
+struct devlink_stats {
+ u64_stats_t rx_bytes;
+ u64_stats_t rx_packets;
+ struct u64_stats_sync syncp;
+};
+
+/**
+ * struct devlink_trap_policer_item - Packet trap policer attributes.
+ * @policer: Immutable packet trap policer attributes.
+ * @rate: Rate in packets / sec.
+ * @burst: Burst size in packets.
+ * @list: trap_policer_list member.
+ *
+ * Describes packet trap policer attributes. Created by devlink during trap
+ * policer registration.
+ */
+struct devlink_trap_policer_item {
+ const struct devlink_trap_policer *policer;
+ u64 rate;
+ u64 burst;
+ struct list_head list;
+};
+
+/**
+ * struct devlink_trap_group_item - Packet trap group attributes.
+ * @group: Immutable packet trap group attributes.
+ * @policer_item: Associated policer item. Can be NULL.
+ * @list: trap_group_list member.
+ * @stats: Trap group statistics.
+ *
+ * Describes packet trap group attributes. Created by devlink during trap
+ * group registration.
+ */
+struct devlink_trap_group_item {
+ const struct devlink_trap_group *group;
+ struct devlink_trap_policer_item *policer_item;
+ struct list_head list;
+ struct devlink_stats __percpu *stats;
+};
+
+/**
+ * struct devlink_trap_item - Packet trap attributes.
+ * @trap: Immutable packet trap attributes.
+ * @group_item: Associated group item.
+ * @list: trap_list member.
+ * @action: Trap action.
+ * @stats: Trap statistics.
+ * @priv: Driver private information.
+ *
+ * Describes both mutable and immutable packet trap attributes. Created by
+ * devlink during trap registration and used for all trap related operations.
+ */
+struct devlink_trap_item {
+ const struct devlink_trap *trap;
+ struct devlink_trap_group_item *group_item;
+ struct list_head list;
+ enum devlink_trap_action action;
+ struct devlink_stats __percpu *stats;
+ void *priv;
+};
+
+static struct devlink_trap_policer_item *
+devlink_trap_policer_item_lookup(struct devlink *devlink, u32 id)
+{
+ struct devlink_trap_policer_item *policer_item;
+
+ list_for_each_entry(policer_item, &devlink->trap_policer_list, list) {
+ if (policer_item->policer->id == id)
+ return policer_item;
+ }
+
+ return NULL;
+}
+
+static struct devlink_trap_item *
+devlink_trap_item_lookup(struct devlink *devlink, const char *name)
+{
+ struct devlink_trap_item *trap_item;
+
+ list_for_each_entry(trap_item, &devlink->trap_list, list) {
+ if (!strcmp(trap_item->trap->name, name))
+ return trap_item;
+ }
+
+ return NULL;
+}
+
+static struct devlink_trap_item *
+devlink_trap_item_get_from_info(struct devlink *devlink,
+ struct genl_info *info)
+{
+ struct nlattr *attr;
+
+ if (!info->attrs[DEVLINK_ATTR_TRAP_NAME])
+ return NULL;
+ attr = info->attrs[DEVLINK_ATTR_TRAP_NAME];
+
+ return devlink_trap_item_lookup(devlink, nla_data(attr));
+}
+
+static int
+devlink_trap_action_get_from_info(struct genl_info *info,
+ enum devlink_trap_action *p_trap_action)
+{
+ u8 val;
+
+ val = nla_get_u8(info->attrs[DEVLINK_ATTR_TRAP_ACTION]);
+ switch (val) {
+ case DEVLINK_TRAP_ACTION_DROP:
+ case DEVLINK_TRAP_ACTION_TRAP:
+ case DEVLINK_TRAP_ACTION_MIRROR:
+ *p_trap_action = val;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int devlink_trap_metadata_put(struct sk_buff *msg,
+ const struct devlink_trap *trap)
+{
+ struct nlattr *attr;
+
+ attr = nla_nest_start(msg, DEVLINK_ATTR_TRAP_METADATA);
+ if (!attr)
+ return -EMSGSIZE;
+
+ if ((trap->metadata_cap & DEVLINK_TRAP_METADATA_TYPE_F_IN_PORT) &&
+ nla_put_flag(msg, DEVLINK_ATTR_TRAP_METADATA_TYPE_IN_PORT))
+ goto nla_put_failure;
+ if ((trap->metadata_cap & DEVLINK_TRAP_METADATA_TYPE_F_FA_COOKIE) &&
+ nla_put_flag(msg, DEVLINK_ATTR_TRAP_METADATA_TYPE_FA_COOKIE))
+ goto nla_put_failure;
+
+ nla_nest_end(msg, attr);
+
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(msg, attr);
+ return -EMSGSIZE;
+}
+
+static void devlink_trap_stats_read(struct devlink_stats __percpu *trap_stats,
+ struct devlink_stats *stats)
+{
+ int i;
+
+ memset(stats, 0, sizeof(*stats));
+ for_each_possible_cpu(i) {
+ struct devlink_stats *cpu_stats;
+ u64 rx_packets, rx_bytes;
+ unsigned int start;
+
+ cpu_stats = per_cpu_ptr(trap_stats, i);
+ do {
+ start = u64_stats_fetch_begin(&cpu_stats->syncp);
+ rx_packets = u64_stats_read(&cpu_stats->rx_packets);
+ rx_bytes = u64_stats_read(&cpu_stats->rx_bytes);
+ } while (u64_stats_fetch_retry(&cpu_stats->syncp, start));
+
+ u64_stats_add(&stats->rx_packets, rx_packets);
+ u64_stats_add(&stats->rx_bytes, rx_bytes);
+ }
+}
+
+static int
+devlink_trap_group_stats_put(struct sk_buff *msg,
+ struct devlink_stats __percpu *trap_stats)
+{
+ struct devlink_stats stats;
+ struct nlattr *attr;
+
+ devlink_trap_stats_read(trap_stats, &stats);
+
+ attr = nla_nest_start(msg, DEVLINK_ATTR_STATS);
+ if (!attr)
+ return -EMSGSIZE;
+
+ if (devlink_nl_put_u64(msg, DEVLINK_ATTR_STATS_RX_PACKETS,
+ u64_stats_read(&stats.rx_packets)))
+ goto nla_put_failure;
+
+ if (devlink_nl_put_u64(msg, DEVLINK_ATTR_STATS_RX_BYTES,
+ u64_stats_read(&stats.rx_bytes)))
+ goto nla_put_failure;
+
+ nla_nest_end(msg, attr);
+
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(msg, attr);
+ return -EMSGSIZE;
+}
+
+static int devlink_trap_stats_put(struct sk_buff *msg, struct devlink *devlink,
+ const struct devlink_trap_item *trap_item)
+{
+ struct devlink_stats stats;
+ struct nlattr *attr;
+ u64 drops = 0;
+ int err;
+
+ if (devlink->ops->trap_drop_counter_get) {
+ err = devlink->ops->trap_drop_counter_get(devlink,
+ trap_item->trap,
+ &drops);
+ if (err)
+ return err;
+ }
+
+ devlink_trap_stats_read(trap_item->stats, &stats);
+
+ attr = nla_nest_start(msg, DEVLINK_ATTR_STATS);
+ if (!attr)
+ return -EMSGSIZE;
+
+ if (devlink->ops->trap_drop_counter_get &&
+ devlink_nl_put_u64(msg, DEVLINK_ATTR_STATS_RX_DROPPED, drops))
+ goto nla_put_failure;
+
+ if (devlink_nl_put_u64(msg, DEVLINK_ATTR_STATS_RX_PACKETS,
+ u64_stats_read(&stats.rx_packets)))
+ goto nla_put_failure;
+
+ if (devlink_nl_put_u64(msg, DEVLINK_ATTR_STATS_RX_BYTES,
+ u64_stats_read(&stats.rx_bytes)))
+ goto nla_put_failure;
+
+ nla_nest_end(msg, attr);
+
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(msg, attr);
+ return -EMSGSIZE;
+}
+
+static int devlink_nl_trap_fill(struct sk_buff *msg, struct devlink *devlink,
+ const struct devlink_trap_item *trap_item,
+ enum devlink_command cmd, u32 portid, u32 seq,
+ int flags)
+{
+ struct devlink_trap_group_item *group_item = trap_item->group_item;
+ void *hdr;
+ int err;
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto nla_put_failure;
+
+ if (nla_put_string(msg, DEVLINK_ATTR_TRAP_GROUP_NAME,
+ group_item->group->name))
+ goto nla_put_failure;
+
+ if (nla_put_string(msg, DEVLINK_ATTR_TRAP_NAME, trap_item->trap->name))
+ goto nla_put_failure;
+
+ if (nla_put_u8(msg, DEVLINK_ATTR_TRAP_TYPE, trap_item->trap->type))
+ goto nla_put_failure;
+
+ if (trap_item->trap->generic &&
+ nla_put_flag(msg, DEVLINK_ATTR_TRAP_GENERIC))
+ goto nla_put_failure;
+
+ if (nla_put_u8(msg, DEVLINK_ATTR_TRAP_ACTION, trap_item->action))
+ goto nla_put_failure;
+
+ err = devlink_trap_metadata_put(msg, trap_item->trap);
+ if (err)
+ goto nla_put_failure;
+
+ err = devlink_trap_stats_put(msg, devlink, trap_item);
+ if (err)
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+int devlink_nl_trap_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct netlink_ext_ack *extack = info->extack;
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_trap_item *trap_item;
+ struct sk_buff *msg;
+ int err;
+
+ if (list_empty(&devlink->trap_list))
+ return -EOPNOTSUPP;
+
+ trap_item = devlink_trap_item_get_from_info(devlink, info);
+ if (!trap_item) {
+ NL_SET_ERR_MSG(extack, "Device did not register this trap");
+ return -ENOENT;
+ }
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ err = devlink_nl_trap_fill(msg, devlink, trap_item,
+ DEVLINK_CMD_TRAP_NEW, info->snd_portid,
+ info->snd_seq, 0);
+ if (err)
+ goto err_trap_fill;
+
+ return genlmsg_reply(msg, info);
+
+err_trap_fill:
+ nlmsg_free(msg);
+ return err;
+}
+
+static int devlink_nl_trap_get_dump_one(struct sk_buff *msg,
+ struct devlink *devlink,
+ struct netlink_callback *cb, int flags)
+{
+ struct devlink_nl_dump_state *state = devlink_dump_state(cb);
+ struct devlink_trap_item *trap_item;
+ int idx = 0;
+ int err = 0;
+
+ list_for_each_entry(trap_item, &devlink->trap_list, list) {
+ if (idx < state->idx) {
+ idx++;
+ continue;
+ }
+ err = devlink_nl_trap_fill(msg, devlink, trap_item,
+ DEVLINK_CMD_TRAP_NEW,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, flags);
+ if (err) {
+ state->idx = idx;
+ break;
+ }
+ idx++;
+ }
+
+ return err;
+}
+
+int devlink_nl_trap_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ return devlink_nl_dumpit(skb, cb, devlink_nl_trap_get_dump_one);
+}
+
+static int __devlink_trap_action_set(struct devlink *devlink,
+ struct devlink_trap_item *trap_item,
+ enum devlink_trap_action trap_action,
+ struct netlink_ext_ack *extack)
+{
+ int err;
+
+ if (trap_item->action != trap_action &&
+ trap_item->trap->type != DEVLINK_TRAP_TYPE_DROP) {
+ NL_SET_ERR_MSG(extack, "Cannot change action of non-drop traps. Skipping");
+ return 0;
+ }
+
+ err = devlink->ops->trap_action_set(devlink, trap_item->trap,
+ trap_action, extack);
+ if (err)
+ return err;
+
+ trap_item->action = trap_action;
+
+ return 0;
+}
+
+static int devlink_trap_action_set(struct devlink *devlink,
+ struct devlink_trap_item *trap_item,
+ struct genl_info *info)
+{
+ enum devlink_trap_action trap_action;
+ int err;
+
+ if (!info->attrs[DEVLINK_ATTR_TRAP_ACTION])
+ return 0;
+
+ err = devlink_trap_action_get_from_info(info, &trap_action);
+ if (err) {
+ NL_SET_ERR_MSG(info->extack, "Invalid trap action");
+ return -EINVAL;
+ }
+
+ return __devlink_trap_action_set(devlink, trap_item, trap_action,
+ info->extack);
+}
+
+int devlink_nl_trap_set_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct netlink_ext_ack *extack = info->extack;
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_trap_item *trap_item;
+
+ if (list_empty(&devlink->trap_list))
+ return -EOPNOTSUPP;
+
+ trap_item = devlink_trap_item_get_from_info(devlink, info);
+ if (!trap_item) {
+ NL_SET_ERR_MSG(extack, "Device did not register this trap");
+ return -ENOENT;
+ }
+
+ return devlink_trap_action_set(devlink, trap_item, info);
+}
+
+static struct devlink_trap_group_item *
+devlink_trap_group_item_lookup(struct devlink *devlink, const char *name)
+{
+ struct devlink_trap_group_item *group_item;
+
+ list_for_each_entry(group_item, &devlink->trap_group_list, list) {
+ if (!strcmp(group_item->group->name, name))
+ return group_item;
+ }
+
+ return NULL;
+}
+
+static struct devlink_trap_group_item *
+devlink_trap_group_item_lookup_by_id(struct devlink *devlink, u16 id)
+{
+ struct devlink_trap_group_item *group_item;
+
+ list_for_each_entry(group_item, &devlink->trap_group_list, list) {
+ if (group_item->group->id == id)
+ return group_item;
+ }
+
+ return NULL;
+}
+
+static struct devlink_trap_group_item *
+devlink_trap_group_item_get_from_info(struct devlink *devlink,
+ struct genl_info *info)
+{
+ char *name;
+
+ if (!info->attrs[DEVLINK_ATTR_TRAP_GROUP_NAME])
+ return NULL;
+ name = nla_data(info->attrs[DEVLINK_ATTR_TRAP_GROUP_NAME]);
+
+ return devlink_trap_group_item_lookup(devlink, name);
+}
+
+static int
+devlink_nl_trap_group_fill(struct sk_buff *msg, struct devlink *devlink,
+ const struct devlink_trap_group_item *group_item,
+ enum devlink_command cmd, u32 portid, u32 seq,
+ int flags)
+{
+ void *hdr;
+ int err;
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto nla_put_failure;
+
+ if (nla_put_string(msg, DEVLINK_ATTR_TRAP_GROUP_NAME,
+ group_item->group->name))
+ goto nla_put_failure;
+
+ if (group_item->group->generic &&
+ nla_put_flag(msg, DEVLINK_ATTR_TRAP_GENERIC))
+ goto nla_put_failure;
+
+ if (group_item->policer_item &&
+ nla_put_u32(msg, DEVLINK_ATTR_TRAP_POLICER_ID,
+ group_item->policer_item->policer->id))
+ goto nla_put_failure;
+
+ err = devlink_trap_group_stats_put(msg, group_item->stats);
+ if (err)
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+int devlink_nl_trap_group_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct netlink_ext_ack *extack = info->extack;
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_trap_group_item *group_item;
+ struct sk_buff *msg;
+ int err;
+
+ if (list_empty(&devlink->trap_group_list))
+ return -EOPNOTSUPP;
+
+ group_item = devlink_trap_group_item_get_from_info(devlink, info);
+ if (!group_item) {
+ NL_SET_ERR_MSG(extack, "Device did not register this trap group");
+ return -ENOENT;
+ }
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ err = devlink_nl_trap_group_fill(msg, devlink, group_item,
+ DEVLINK_CMD_TRAP_GROUP_NEW,
+ info->snd_portid, info->snd_seq, 0);
+ if (err)
+ goto err_trap_group_fill;
+
+ return genlmsg_reply(msg, info);
+
+err_trap_group_fill:
+ nlmsg_free(msg);
+ return err;
+}
+
+static int devlink_nl_trap_group_get_dump_one(struct sk_buff *msg,
+ struct devlink *devlink,
+ struct netlink_callback *cb,
+ int flags)
+{
+ struct devlink_nl_dump_state *state = devlink_dump_state(cb);
+ struct devlink_trap_group_item *group_item;
+ int idx = 0;
+ int err = 0;
+
+ list_for_each_entry(group_item, &devlink->trap_group_list, list) {
+ if (idx < state->idx) {
+ idx++;
+ continue;
+ }
+ err = devlink_nl_trap_group_fill(msg, devlink, group_item,
+ DEVLINK_CMD_TRAP_GROUP_NEW,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, flags);
+ if (err) {
+ state->idx = idx;
+ break;
+ }
+ idx++;
+ }
+
+ return err;
+}
+
+int devlink_nl_trap_group_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ return devlink_nl_dumpit(skb, cb, devlink_nl_trap_group_get_dump_one);
+}
+
+static int
+__devlink_trap_group_action_set(struct devlink *devlink,
+ struct devlink_trap_group_item *group_item,
+ enum devlink_trap_action trap_action,
+ struct netlink_ext_ack *extack)
+{
+ const char *group_name = group_item->group->name;
+ struct devlink_trap_item *trap_item;
+ int err;
+
+ if (devlink->ops->trap_group_action_set) {
+ err = devlink->ops->trap_group_action_set(devlink, group_item->group,
+ trap_action, extack);
+ if (err)
+ return err;
+
+ list_for_each_entry(trap_item, &devlink->trap_list, list) {
+ if (strcmp(trap_item->group_item->group->name, group_name))
+ continue;
+ if (trap_item->action != trap_action &&
+ trap_item->trap->type != DEVLINK_TRAP_TYPE_DROP)
+ continue;
+ trap_item->action = trap_action;
+ }
+
+ return 0;
+ }
+
+ list_for_each_entry(trap_item, &devlink->trap_list, list) {
+ if (strcmp(trap_item->group_item->group->name, group_name))
+ continue;
+ err = __devlink_trap_action_set(devlink, trap_item,
+ trap_action, extack);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+static int
+devlink_trap_group_action_set(struct devlink *devlink,
+ struct devlink_trap_group_item *group_item,
+ struct genl_info *info, bool *p_modified)
+{
+ enum devlink_trap_action trap_action;
+ int err;
+
+ if (!info->attrs[DEVLINK_ATTR_TRAP_ACTION])
+ return 0;
+
+ err = devlink_trap_action_get_from_info(info, &trap_action);
+ if (err) {
+ NL_SET_ERR_MSG(info->extack, "Invalid trap action");
+ return -EINVAL;
+ }
+
+ err = __devlink_trap_group_action_set(devlink, group_item, trap_action,
+ info->extack);
+ if (err)
+ return err;
+
+ *p_modified = true;
+
+ return 0;
+}
+
+static int devlink_trap_group_set(struct devlink *devlink,
+ struct devlink_trap_group_item *group_item,
+ struct genl_info *info)
+{
+ struct devlink_trap_policer_item *policer_item;
+ struct netlink_ext_ack *extack = info->extack;
+ const struct devlink_trap_policer *policer;
+ struct nlattr **attrs = info->attrs;
+ u32 policer_id;
+ int err;
+
+ if (!attrs[DEVLINK_ATTR_TRAP_POLICER_ID])
+ return 0;
+
+ if (!devlink->ops->trap_group_set)
+ return -EOPNOTSUPP;
+
+ policer_id = nla_get_u32(attrs[DEVLINK_ATTR_TRAP_POLICER_ID]);
+ policer_item = devlink_trap_policer_item_lookup(devlink, policer_id);
+ if (policer_id && !policer_item) {
+ NL_SET_ERR_MSG(extack, "Device did not register this trap policer");
+ return -ENOENT;
+ }
+ policer = policer_item ? policer_item->policer : NULL;
+
+ err = devlink->ops->trap_group_set(devlink, group_item->group, policer,
+ extack);
+ if (err)
+ return err;
+
+ group_item->policer_item = policer_item;
+
+ return 0;
+}
+
+int devlink_nl_trap_group_set_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct netlink_ext_ack *extack = info->extack;
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_trap_group_item *group_item;
+ bool modified = false;
+ int err;
+
+ if (list_empty(&devlink->trap_group_list))
+ return -EOPNOTSUPP;
+
+ group_item = devlink_trap_group_item_get_from_info(devlink, info);
+ if (!group_item) {
+ NL_SET_ERR_MSG(extack, "Device did not register this trap group");
+ return -ENOENT;
+ }
+
+ err = devlink_trap_group_action_set(devlink, group_item, info,
+ &modified);
+ if (err)
+ return err;
+
+ err = devlink_trap_group_set(devlink, group_item, info);
+ if (err)
+ goto err_trap_group_set;
+
+ return 0;
+
+err_trap_group_set:
+ if (modified)
+ NL_SET_ERR_MSG(extack, "Trap group set failed, but some changes were committed already");
+ return err;
+}
+
+static struct devlink_trap_policer_item *
+devlink_trap_policer_item_get_from_info(struct devlink *devlink,
+ struct genl_info *info)
+{
+ u32 id;
+
+ if (!info->attrs[DEVLINK_ATTR_TRAP_POLICER_ID])
+ return NULL;
+ id = nla_get_u32(info->attrs[DEVLINK_ATTR_TRAP_POLICER_ID]);
+
+ return devlink_trap_policer_item_lookup(devlink, id);
+}
+
+static int
+devlink_trap_policer_stats_put(struct sk_buff *msg, struct devlink *devlink,
+ const struct devlink_trap_policer *policer)
+{
+ struct nlattr *attr;
+ u64 drops;
+ int err;
+
+ if (!devlink->ops->trap_policer_counter_get)
+ return 0;
+
+ err = devlink->ops->trap_policer_counter_get(devlink, policer, &drops);
+ if (err)
+ return err;
+
+ attr = nla_nest_start(msg, DEVLINK_ATTR_STATS);
+ if (!attr)
+ return -EMSGSIZE;
+
+ if (devlink_nl_put_u64(msg, DEVLINK_ATTR_STATS_RX_DROPPED, drops))
+ goto nla_put_failure;
+
+ nla_nest_end(msg, attr);
+
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(msg, attr);
+ return -EMSGSIZE;
+}
+
+static int
+devlink_nl_trap_policer_fill(struct sk_buff *msg, struct devlink *devlink,
+ const struct devlink_trap_policer_item *policer_item,
+ enum devlink_command cmd, u32 portid, u32 seq,
+ int flags)
+{
+ void *hdr;
+ int err;
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto nla_put_failure;
+
+ if (nla_put_u32(msg, DEVLINK_ATTR_TRAP_POLICER_ID,
+ policer_item->policer->id))
+ goto nla_put_failure;
+
+ if (devlink_nl_put_u64(msg, DEVLINK_ATTR_TRAP_POLICER_RATE,
+ policer_item->rate))
+ goto nla_put_failure;
+
+ if (devlink_nl_put_u64(msg, DEVLINK_ATTR_TRAP_POLICER_BURST,
+ policer_item->burst))
+ goto nla_put_failure;
+
+ err = devlink_trap_policer_stats_put(msg, devlink,
+ policer_item->policer);
+ if (err)
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+int devlink_nl_trap_policer_get_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink_trap_policer_item *policer_item;
+ struct netlink_ext_ack *extack = info->extack;
+ struct devlink *devlink = info->user_ptr[0];
+ struct sk_buff *msg;
+ int err;
+
+ if (list_empty(&devlink->trap_policer_list))
+ return -EOPNOTSUPP;
+
+ policer_item = devlink_trap_policer_item_get_from_info(devlink, info);
+ if (!policer_item) {
+ NL_SET_ERR_MSG(extack, "Device did not register this trap policer");
+ return -ENOENT;
+ }
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ err = devlink_nl_trap_policer_fill(msg, devlink, policer_item,
+ DEVLINK_CMD_TRAP_POLICER_NEW,
+ info->snd_portid, info->snd_seq, 0);
+ if (err)
+ goto err_trap_policer_fill;
+
+ return genlmsg_reply(msg, info);
+
+err_trap_policer_fill:
+ nlmsg_free(msg);
+ return err;
+}
+
+static int devlink_nl_trap_policer_get_dump_one(struct sk_buff *msg,
+ struct devlink *devlink,
+ struct netlink_callback *cb,
+ int flags)
+{
+ struct devlink_nl_dump_state *state = devlink_dump_state(cb);
+ struct devlink_trap_policer_item *policer_item;
+ int idx = 0;
+ int err = 0;
+
+ list_for_each_entry(policer_item, &devlink->trap_policer_list, list) {
+ if (idx < state->idx) {
+ idx++;
+ continue;
+ }
+ err = devlink_nl_trap_policer_fill(msg, devlink, policer_item,
+ DEVLINK_CMD_TRAP_POLICER_NEW,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, flags);
+ if (err) {
+ state->idx = idx;
+ break;
+ }
+ idx++;
+ }
+
+ return err;
+}
+
+int devlink_nl_trap_policer_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ return devlink_nl_dumpit(skb, cb, devlink_nl_trap_policer_get_dump_one);
+}
+
+static int
+devlink_trap_policer_set(struct devlink *devlink,
+ struct devlink_trap_policer_item *policer_item,
+ struct genl_info *info)
+{
+ struct netlink_ext_ack *extack = info->extack;
+ struct nlattr **attrs = info->attrs;
+ u64 rate, burst;
+ int err;
+
+ rate = policer_item->rate;
+ burst = policer_item->burst;
+
+ if (attrs[DEVLINK_ATTR_TRAP_POLICER_RATE])
+ rate = nla_get_u64(attrs[DEVLINK_ATTR_TRAP_POLICER_RATE]);
+
+ if (attrs[DEVLINK_ATTR_TRAP_POLICER_BURST])
+ burst = nla_get_u64(attrs[DEVLINK_ATTR_TRAP_POLICER_BURST]);
+
+ if (rate < policer_item->policer->min_rate) {
+ NL_SET_ERR_MSG(extack, "Policer rate lower than limit");
+ return -EINVAL;
+ }
+
+ if (rate > policer_item->policer->max_rate) {
+ NL_SET_ERR_MSG(extack, "Policer rate higher than limit");
+ return -EINVAL;
+ }
+
+ if (burst < policer_item->policer->min_burst) {
+ NL_SET_ERR_MSG(extack, "Policer burst size lower than limit");
+ return -EINVAL;
+ }
+
+ if (burst > policer_item->policer->max_burst) {
+ NL_SET_ERR_MSG(extack, "Policer burst size higher than limit");
+ return -EINVAL;
+ }
+
+ err = devlink->ops->trap_policer_set(devlink, policer_item->policer,
+ rate, burst, info->extack);
+ if (err)
+ return err;
+
+ policer_item->rate = rate;
+ policer_item->burst = burst;
+
+ return 0;
+}
+
+int devlink_nl_trap_policer_set_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink_trap_policer_item *policer_item;
+ struct netlink_ext_ack *extack = info->extack;
+ struct devlink *devlink = info->user_ptr[0];
+
+ if (list_empty(&devlink->trap_policer_list))
+ return -EOPNOTSUPP;
+
+ if (!devlink->ops->trap_policer_set)
+ return -EOPNOTSUPP;
+
+ policer_item = devlink_trap_policer_item_get_from_info(devlink, info);
+ if (!policer_item) {
+ NL_SET_ERR_MSG(extack, "Device did not register this trap policer");
+ return -ENOENT;
+ }
+
+ return devlink_trap_policer_set(devlink, policer_item, info);
+}
+
+#define DEVLINK_TRAP(_id, _type) \
+ { \
+ .type = DEVLINK_TRAP_TYPE_##_type, \
+ .id = DEVLINK_TRAP_GENERIC_ID_##_id, \
+ .name = DEVLINK_TRAP_GENERIC_NAME_##_id, \
+ }
+
+static const struct devlink_trap devlink_trap_generic[] = {
+ DEVLINK_TRAP(SMAC_MC, DROP),
+ DEVLINK_TRAP(VLAN_TAG_MISMATCH, DROP),
+ DEVLINK_TRAP(INGRESS_VLAN_FILTER, DROP),
+ DEVLINK_TRAP(INGRESS_STP_FILTER, DROP),
+ DEVLINK_TRAP(EMPTY_TX_LIST, DROP),
+ DEVLINK_TRAP(PORT_LOOPBACK_FILTER, DROP),
+ DEVLINK_TRAP(BLACKHOLE_ROUTE, DROP),
+ DEVLINK_TRAP(TTL_ERROR, EXCEPTION),
+ DEVLINK_TRAP(TAIL_DROP, DROP),
+ DEVLINK_TRAP(NON_IP_PACKET, DROP),
+ DEVLINK_TRAP(UC_DIP_MC_DMAC, DROP),
+ DEVLINK_TRAP(DIP_LB, DROP),
+ DEVLINK_TRAP(SIP_MC, DROP),
+ DEVLINK_TRAP(SIP_LB, DROP),
+ DEVLINK_TRAP(CORRUPTED_IP_HDR, DROP),
+ DEVLINK_TRAP(IPV4_SIP_BC, DROP),
+ DEVLINK_TRAP(IPV6_MC_DIP_RESERVED_SCOPE, DROP),
+ DEVLINK_TRAP(IPV6_MC_DIP_INTERFACE_LOCAL_SCOPE, DROP),
+ DEVLINK_TRAP(MTU_ERROR, EXCEPTION),
+ DEVLINK_TRAP(UNRESOLVED_NEIGH, EXCEPTION),
+ DEVLINK_TRAP(RPF, EXCEPTION),
+ DEVLINK_TRAP(REJECT_ROUTE, EXCEPTION),
+ DEVLINK_TRAP(IPV4_LPM_UNICAST_MISS, EXCEPTION),
+ DEVLINK_TRAP(IPV6_LPM_UNICAST_MISS, EXCEPTION),
+ DEVLINK_TRAP(NON_ROUTABLE, DROP),
+ DEVLINK_TRAP(DECAP_ERROR, EXCEPTION),
+ DEVLINK_TRAP(OVERLAY_SMAC_MC, DROP),
+ DEVLINK_TRAP(INGRESS_FLOW_ACTION_DROP, DROP),
+ DEVLINK_TRAP(EGRESS_FLOW_ACTION_DROP, DROP),
+ DEVLINK_TRAP(STP, CONTROL),
+ DEVLINK_TRAP(LACP, CONTROL),
+ DEVLINK_TRAP(LLDP, CONTROL),
+ DEVLINK_TRAP(IGMP_QUERY, CONTROL),
+ DEVLINK_TRAP(IGMP_V1_REPORT, CONTROL),
+ DEVLINK_TRAP(IGMP_V2_REPORT, CONTROL),
+ DEVLINK_TRAP(IGMP_V3_REPORT, CONTROL),
+ DEVLINK_TRAP(IGMP_V2_LEAVE, CONTROL),
+ DEVLINK_TRAP(MLD_QUERY, CONTROL),
+ DEVLINK_TRAP(MLD_V1_REPORT, CONTROL),
+ DEVLINK_TRAP(MLD_V2_REPORT, CONTROL),
+ DEVLINK_TRAP(MLD_V1_DONE, CONTROL),
+ DEVLINK_TRAP(IPV4_DHCP, CONTROL),
+ DEVLINK_TRAP(IPV6_DHCP, CONTROL),
+ DEVLINK_TRAP(ARP_REQUEST, CONTROL),
+ DEVLINK_TRAP(ARP_RESPONSE, CONTROL),
+ DEVLINK_TRAP(ARP_OVERLAY, CONTROL),
+ DEVLINK_TRAP(IPV6_NEIGH_SOLICIT, CONTROL),
+ DEVLINK_TRAP(IPV6_NEIGH_ADVERT, CONTROL),
+ DEVLINK_TRAP(IPV4_BFD, CONTROL),
+ DEVLINK_TRAP(IPV6_BFD, CONTROL),
+ DEVLINK_TRAP(IPV4_OSPF, CONTROL),
+ DEVLINK_TRAP(IPV6_OSPF, CONTROL),
+ DEVLINK_TRAP(IPV4_BGP, CONTROL),
+ DEVLINK_TRAP(IPV6_BGP, CONTROL),
+ DEVLINK_TRAP(IPV4_VRRP, CONTROL),
+ DEVLINK_TRAP(IPV6_VRRP, CONTROL),
+ DEVLINK_TRAP(IPV4_PIM, CONTROL),
+ DEVLINK_TRAP(IPV6_PIM, CONTROL),
+ DEVLINK_TRAP(UC_LB, CONTROL),
+ DEVLINK_TRAP(LOCAL_ROUTE, CONTROL),
+ DEVLINK_TRAP(EXTERNAL_ROUTE, CONTROL),
+ DEVLINK_TRAP(IPV6_UC_DIP_LINK_LOCAL_SCOPE, CONTROL),
+ DEVLINK_TRAP(IPV6_DIP_ALL_NODES, CONTROL),
+ DEVLINK_TRAP(IPV6_DIP_ALL_ROUTERS, CONTROL),
+ DEVLINK_TRAP(IPV6_ROUTER_SOLICIT, CONTROL),
+ DEVLINK_TRAP(IPV6_ROUTER_ADVERT, CONTROL),
+ DEVLINK_TRAP(IPV6_REDIRECT, CONTROL),
+ DEVLINK_TRAP(IPV4_ROUTER_ALERT, CONTROL),
+ DEVLINK_TRAP(IPV6_ROUTER_ALERT, CONTROL),
+ DEVLINK_TRAP(PTP_EVENT, CONTROL),
+ DEVLINK_TRAP(PTP_GENERAL, CONTROL),
+ DEVLINK_TRAP(FLOW_ACTION_SAMPLE, CONTROL),
+ DEVLINK_TRAP(FLOW_ACTION_TRAP, CONTROL),
+ DEVLINK_TRAP(EARLY_DROP, DROP),
+ DEVLINK_TRAP(VXLAN_PARSING, DROP),
+ DEVLINK_TRAP(LLC_SNAP_PARSING, DROP),
+ DEVLINK_TRAP(VLAN_PARSING, DROP),
+ DEVLINK_TRAP(PPPOE_PPP_PARSING, DROP),
+ DEVLINK_TRAP(MPLS_PARSING, DROP),
+ DEVLINK_TRAP(ARP_PARSING, DROP),
+ DEVLINK_TRAP(IP_1_PARSING, DROP),
+ DEVLINK_TRAP(IP_N_PARSING, DROP),
+ DEVLINK_TRAP(GRE_PARSING, DROP),
+ DEVLINK_TRAP(UDP_PARSING, DROP),
+ DEVLINK_TRAP(TCP_PARSING, DROP),
+ DEVLINK_TRAP(IPSEC_PARSING, DROP),
+ DEVLINK_TRAP(SCTP_PARSING, DROP),
+ DEVLINK_TRAP(DCCP_PARSING, DROP),
+ DEVLINK_TRAP(GTP_PARSING, DROP),
+ DEVLINK_TRAP(ESP_PARSING, DROP),
+ DEVLINK_TRAP(BLACKHOLE_NEXTHOP, DROP),
+ DEVLINK_TRAP(DMAC_FILTER, DROP),
+ DEVLINK_TRAP(EAPOL, CONTROL),
+ DEVLINK_TRAP(LOCKED_PORT, DROP),
+};
+
+#define DEVLINK_TRAP_GROUP(_id) \
+ { \
+ .id = DEVLINK_TRAP_GROUP_GENERIC_ID_##_id, \
+ .name = DEVLINK_TRAP_GROUP_GENERIC_NAME_##_id, \
+ }
+
+static const struct devlink_trap_group devlink_trap_group_generic[] = {
+ DEVLINK_TRAP_GROUP(L2_DROPS),
+ DEVLINK_TRAP_GROUP(L3_DROPS),
+ DEVLINK_TRAP_GROUP(L3_EXCEPTIONS),
+ DEVLINK_TRAP_GROUP(BUFFER_DROPS),
+ DEVLINK_TRAP_GROUP(TUNNEL_DROPS),
+ DEVLINK_TRAP_GROUP(ACL_DROPS),
+ DEVLINK_TRAP_GROUP(STP),
+ DEVLINK_TRAP_GROUP(LACP),
+ DEVLINK_TRAP_GROUP(LLDP),
+ DEVLINK_TRAP_GROUP(MC_SNOOPING),
+ DEVLINK_TRAP_GROUP(DHCP),
+ DEVLINK_TRAP_GROUP(NEIGH_DISCOVERY),
+ DEVLINK_TRAP_GROUP(BFD),
+ DEVLINK_TRAP_GROUP(OSPF),
+ DEVLINK_TRAP_GROUP(BGP),
+ DEVLINK_TRAP_GROUP(VRRP),
+ DEVLINK_TRAP_GROUP(PIM),
+ DEVLINK_TRAP_GROUP(UC_LB),
+ DEVLINK_TRAP_GROUP(LOCAL_DELIVERY),
+ DEVLINK_TRAP_GROUP(EXTERNAL_DELIVERY),
+ DEVLINK_TRAP_GROUP(IPV6),
+ DEVLINK_TRAP_GROUP(PTP_EVENT),
+ DEVLINK_TRAP_GROUP(PTP_GENERAL),
+ DEVLINK_TRAP_GROUP(ACL_SAMPLE),
+ DEVLINK_TRAP_GROUP(ACL_TRAP),
+ DEVLINK_TRAP_GROUP(PARSER_ERROR_DROPS),
+ DEVLINK_TRAP_GROUP(EAPOL),
+};
+
+static int devlink_trap_generic_verify(const struct devlink_trap *trap)
+{
+ if (trap->id > DEVLINK_TRAP_GENERIC_ID_MAX)
+ return -EINVAL;
+
+ if (strcmp(trap->name, devlink_trap_generic[trap->id].name))
+ return -EINVAL;
+
+ if (trap->type != devlink_trap_generic[trap->id].type)
+ return -EINVAL;
+
+ return 0;
+}
+
+static int devlink_trap_driver_verify(const struct devlink_trap *trap)
+{
+ int i;
+
+ if (trap->id <= DEVLINK_TRAP_GENERIC_ID_MAX)
+ return -EINVAL;
+
+ for (i = 0; i < ARRAY_SIZE(devlink_trap_generic); i++) {
+ if (!strcmp(trap->name, devlink_trap_generic[i].name))
+ return -EEXIST;
+ }
+
+ return 0;
+}
+
+static int devlink_trap_verify(const struct devlink_trap *trap)
+{
+ if (!trap || !trap->name)
+ return -EINVAL;
+
+ if (trap->generic)
+ return devlink_trap_generic_verify(trap);
+ else
+ return devlink_trap_driver_verify(trap);
+}
+
+static int
+devlink_trap_group_generic_verify(const struct devlink_trap_group *group)
+{
+ if (group->id > DEVLINK_TRAP_GROUP_GENERIC_ID_MAX)
+ return -EINVAL;
+
+ if (strcmp(group->name, devlink_trap_group_generic[group->id].name))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int
+devlink_trap_group_driver_verify(const struct devlink_trap_group *group)
+{
+ int i;
+
+ if (group->id <= DEVLINK_TRAP_GROUP_GENERIC_ID_MAX)
+ return -EINVAL;
+
+ for (i = 0; i < ARRAY_SIZE(devlink_trap_group_generic); i++) {
+ if (!strcmp(group->name, devlink_trap_group_generic[i].name))
+ return -EEXIST;
+ }
+
+ return 0;
+}
+
+static int devlink_trap_group_verify(const struct devlink_trap_group *group)
+{
+ if (group->generic)
+ return devlink_trap_group_generic_verify(group);
+ else
+ return devlink_trap_group_driver_verify(group);
+}
+
+static void
+devlink_trap_group_notify(struct devlink *devlink,
+ const struct devlink_trap_group_item *group_item,
+ enum devlink_command cmd)
+{
+ struct sk_buff *msg;
+ int err;
+
+ WARN_ON_ONCE(cmd != DEVLINK_CMD_TRAP_GROUP_NEW &&
+ cmd != DEVLINK_CMD_TRAP_GROUP_DEL);
+
+ if (!devl_is_registered(devlink) || !devlink_nl_notify_need(devlink))
+ return;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return;
+
+ err = devlink_nl_trap_group_fill(msg, devlink, group_item, cmd, 0, 0,
+ 0);
+ if (err) {
+ nlmsg_free(msg);
+ return;
+ }
+
+ devlink_nl_notify_send(devlink, msg);
+}
+
+void devlink_trap_groups_notify_register(struct devlink *devlink)
+{
+ struct devlink_trap_group_item *group_item;
+
+ list_for_each_entry(group_item, &devlink->trap_group_list, list)
+ devlink_trap_group_notify(devlink, group_item,
+ DEVLINK_CMD_TRAP_GROUP_NEW);
+}
+
+void devlink_trap_groups_notify_unregister(struct devlink *devlink)
+{
+ struct devlink_trap_group_item *group_item;
+
+ list_for_each_entry_reverse(group_item, &devlink->trap_group_list, list)
+ devlink_trap_group_notify(devlink, group_item,
+ DEVLINK_CMD_TRAP_GROUP_DEL);
+}
+
+static int
+devlink_trap_item_group_link(struct devlink *devlink,
+ struct devlink_trap_item *trap_item)
+{
+ u16 group_id = trap_item->trap->init_group_id;
+ struct devlink_trap_group_item *group_item;
+
+ group_item = devlink_trap_group_item_lookup_by_id(devlink, group_id);
+ if (WARN_ON_ONCE(!group_item))
+ return -EINVAL;
+
+ trap_item->group_item = group_item;
+
+ return 0;
+}
+
+static void devlink_trap_notify(struct devlink *devlink,
+ const struct devlink_trap_item *trap_item,
+ enum devlink_command cmd)
+{
+ struct sk_buff *msg;
+ int err;
+
+ WARN_ON_ONCE(cmd != DEVLINK_CMD_TRAP_NEW &&
+ cmd != DEVLINK_CMD_TRAP_DEL);
+
+ if (!devl_is_registered(devlink) || !devlink_nl_notify_need(devlink))
+ return;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return;
+
+ err = devlink_nl_trap_fill(msg, devlink, trap_item, cmd, 0, 0, 0);
+ if (err) {
+ nlmsg_free(msg);
+ return;
+ }
+
+ devlink_nl_notify_send(devlink, msg);
+}
+
+void devlink_traps_notify_register(struct devlink *devlink)
+{
+ struct devlink_trap_item *trap_item;
+
+ list_for_each_entry(trap_item, &devlink->trap_list, list)
+ devlink_trap_notify(devlink, trap_item, DEVLINK_CMD_TRAP_NEW);
+}
+
+void devlink_traps_notify_unregister(struct devlink *devlink)
+{
+ struct devlink_trap_item *trap_item;
+
+ list_for_each_entry_reverse(trap_item, &devlink->trap_list, list)
+ devlink_trap_notify(devlink, trap_item, DEVLINK_CMD_TRAP_DEL);
+}
+
+static int
+devlink_trap_register(struct devlink *devlink,
+ const struct devlink_trap *trap, void *priv)
+{
+ struct devlink_trap_item *trap_item;
+ int err;
+
+ if (devlink_trap_item_lookup(devlink, trap->name))
+ return -EEXIST;
+
+ trap_item = kzalloc(sizeof(*trap_item), GFP_KERNEL);
+ if (!trap_item)
+ return -ENOMEM;
+
+ trap_item->stats = netdev_alloc_pcpu_stats(struct devlink_stats);
+ if (!trap_item->stats) {
+ err = -ENOMEM;
+ goto err_stats_alloc;
+ }
+
+ trap_item->trap = trap;
+ trap_item->action = trap->init_action;
+ trap_item->priv = priv;
+
+ err = devlink_trap_item_group_link(devlink, trap_item);
+ if (err)
+ goto err_group_link;
+
+ err = devlink->ops->trap_init(devlink, trap, trap_item);
+ if (err)
+ goto err_trap_init;
+
+ list_add_tail(&trap_item->list, &devlink->trap_list);
+ devlink_trap_notify(devlink, trap_item, DEVLINK_CMD_TRAP_NEW);
+
+ return 0;
+
+err_trap_init:
+err_group_link:
+ free_percpu(trap_item->stats);
+err_stats_alloc:
+ kfree(trap_item);
+ return err;
+}
+
+static void devlink_trap_unregister(struct devlink *devlink,
+ const struct devlink_trap *trap)
+{
+ struct devlink_trap_item *trap_item;
+
+ trap_item = devlink_trap_item_lookup(devlink, trap->name);
+ if (WARN_ON_ONCE(!trap_item))
+ return;
+
+ devlink_trap_notify(devlink, trap_item, DEVLINK_CMD_TRAP_DEL);
+ list_del(&trap_item->list);
+ if (devlink->ops->trap_fini)
+ devlink->ops->trap_fini(devlink, trap, trap_item);
+ free_percpu(trap_item->stats);
+ kfree(trap_item);
+}
+
+static void devlink_trap_disable(struct devlink *devlink,
+ const struct devlink_trap *trap)
+{
+ struct devlink_trap_item *trap_item;
+
+ trap_item = devlink_trap_item_lookup(devlink, trap->name);
+ if (WARN_ON_ONCE(!trap_item))
+ return;
+
+ devlink->ops->trap_action_set(devlink, trap, DEVLINK_TRAP_ACTION_DROP,
+ NULL);
+ trap_item->action = DEVLINK_TRAP_ACTION_DROP;
+}
+
+/**
+ * devl_traps_register - Register packet traps with devlink.
+ * @devlink: devlink.
+ * @traps: Packet traps.
+ * @traps_count: Count of provided packet traps.
+ * @priv: Driver private information.
+ *
+ * Return: Non-zero value on failure.
+ */
+int devl_traps_register(struct devlink *devlink,
+ const struct devlink_trap *traps,
+ size_t traps_count, void *priv)
+{
+ int i, err;
+
+ if (!devlink->ops->trap_init || !devlink->ops->trap_action_set)
+ return -EINVAL;
+
+ devl_assert_locked(devlink);
+ for (i = 0; i < traps_count; i++) {
+ const struct devlink_trap *trap = &traps[i];
+
+ err = devlink_trap_verify(trap);
+ if (err)
+ goto err_trap_verify;
+
+ err = devlink_trap_register(devlink, trap, priv);
+ if (err)
+ goto err_trap_register;
+ }
+
+ return 0;
+
+err_trap_register:
+err_trap_verify:
+ for (i--; i >= 0; i--)
+ devlink_trap_unregister(devlink, &traps[i]);
+ return err;
+}
+EXPORT_SYMBOL_GPL(devl_traps_register);
+
+/**
+ * devlink_traps_register - Register packet traps with devlink.
+ * @devlink: devlink.
+ * @traps: Packet traps.
+ * @traps_count: Count of provided packet traps.
+ * @priv: Driver private information.
+ *
+ * Context: Takes and release devlink->lock <mutex>.
+ *
+ * Return: Non-zero value on failure.
+ */
+int devlink_traps_register(struct devlink *devlink,
+ const struct devlink_trap *traps,
+ size_t traps_count, void *priv)
+{
+ int err;
+
+ devl_lock(devlink);
+ err = devl_traps_register(devlink, traps, traps_count, priv);
+ devl_unlock(devlink);
+ return err;
+}
+EXPORT_SYMBOL_GPL(devlink_traps_register);
+
+/**
+ * devl_traps_unregister - Unregister packet traps from devlink.
+ * @devlink: devlink.
+ * @traps: Packet traps.
+ * @traps_count: Count of provided packet traps.
+ */
+void devl_traps_unregister(struct devlink *devlink,
+ const struct devlink_trap *traps,
+ size_t traps_count)
+{
+ int i;
+
+ devl_assert_locked(devlink);
+ /* Make sure we do not have any packets in-flight while unregistering
+ * traps by disabling all of them and waiting for a grace period.
+ */
+ for (i = traps_count - 1; i >= 0; i--)
+ devlink_trap_disable(devlink, &traps[i]);
+ synchronize_rcu();
+ for (i = traps_count - 1; i >= 0; i--)
+ devlink_trap_unregister(devlink, &traps[i]);
+}
+EXPORT_SYMBOL_GPL(devl_traps_unregister);
+
+/**
+ * devlink_traps_unregister - Unregister packet traps from devlink.
+ * @devlink: devlink.
+ * @traps: Packet traps.
+ * @traps_count: Count of provided packet traps.
+ *
+ * Context: Takes and release devlink->lock <mutex>.
+ */
+void devlink_traps_unregister(struct devlink *devlink,
+ const struct devlink_trap *traps,
+ size_t traps_count)
+{
+ devl_lock(devlink);
+ devl_traps_unregister(devlink, traps, traps_count);
+ devl_unlock(devlink);
+}
+EXPORT_SYMBOL_GPL(devlink_traps_unregister);
+
+static void
+devlink_trap_stats_update(struct devlink_stats __percpu *trap_stats,
+ size_t skb_len)
+{
+ struct devlink_stats *stats;
+
+ stats = this_cpu_ptr(trap_stats);
+ u64_stats_update_begin(&stats->syncp);
+ u64_stats_add(&stats->rx_bytes, skb_len);
+ u64_stats_inc(&stats->rx_packets);
+ u64_stats_update_end(&stats->syncp);
+}
+
+static void
+devlink_trap_report_metadata_set(struct devlink_trap_metadata *metadata,
+ const struct devlink_trap_item *trap_item,
+ struct devlink_port *in_devlink_port,
+ const struct flow_action_cookie *fa_cookie)
+{
+ metadata->trap_name = trap_item->trap->name;
+ metadata->trap_group_name = trap_item->group_item->group->name;
+ metadata->fa_cookie = fa_cookie;
+ metadata->trap_type = trap_item->trap->type;
+
+ spin_lock(&in_devlink_port->type_lock);
+ if (in_devlink_port->type == DEVLINK_PORT_TYPE_ETH)
+ metadata->input_dev = in_devlink_port->type_eth.netdev;
+ spin_unlock(&in_devlink_port->type_lock);
+}
+
+/**
+ * devlink_trap_report - Report trapped packet to drop monitor.
+ * @devlink: devlink.
+ * @skb: Trapped packet.
+ * @trap_ctx: Trap context.
+ * @in_devlink_port: Input devlink port.
+ * @fa_cookie: Flow action cookie. Could be NULL.
+ */
+void devlink_trap_report(struct devlink *devlink, struct sk_buff *skb,
+ void *trap_ctx, struct devlink_port *in_devlink_port,
+ const struct flow_action_cookie *fa_cookie)
+
+{
+ struct devlink_trap_item *trap_item = trap_ctx;
+
+ devlink_trap_stats_update(trap_item->stats, skb->len);
+ devlink_trap_stats_update(trap_item->group_item->stats, skb->len);
+
+ if (tracepoint_enabled(devlink_trap_report)) {
+ struct devlink_trap_metadata metadata = {};
+
+ devlink_trap_report_metadata_set(&metadata, trap_item,
+ in_devlink_port, fa_cookie);
+ trace_devlink_trap_report(devlink, skb, &metadata);
+ }
+}
+EXPORT_SYMBOL_GPL(devlink_trap_report);
+
+/**
+ * devlink_trap_ctx_priv - Trap context to driver private information.
+ * @trap_ctx: Trap context.
+ *
+ * Return: Driver private information passed during registration.
+ */
+void *devlink_trap_ctx_priv(void *trap_ctx)
+{
+ struct devlink_trap_item *trap_item = trap_ctx;
+
+ return trap_item->priv;
+}
+EXPORT_SYMBOL_GPL(devlink_trap_ctx_priv);
+
+static int
+devlink_trap_group_item_policer_link(struct devlink *devlink,
+ struct devlink_trap_group_item *group_item)
+{
+ u32 policer_id = group_item->group->init_policer_id;
+ struct devlink_trap_policer_item *policer_item;
+
+ if (policer_id == 0)
+ return 0;
+
+ policer_item = devlink_trap_policer_item_lookup(devlink, policer_id);
+ if (WARN_ON_ONCE(!policer_item))
+ return -EINVAL;
+
+ group_item->policer_item = policer_item;
+
+ return 0;
+}
+
+static int
+devlink_trap_group_register(struct devlink *devlink,
+ const struct devlink_trap_group *group)
+{
+ struct devlink_trap_group_item *group_item;
+ int err;
+
+ if (devlink_trap_group_item_lookup(devlink, group->name))
+ return -EEXIST;
+
+ group_item = kzalloc(sizeof(*group_item), GFP_KERNEL);
+ if (!group_item)
+ return -ENOMEM;
+
+ group_item->stats = netdev_alloc_pcpu_stats(struct devlink_stats);
+ if (!group_item->stats) {
+ err = -ENOMEM;
+ goto err_stats_alloc;
+ }
+
+ group_item->group = group;
+
+ err = devlink_trap_group_item_policer_link(devlink, group_item);
+ if (err)
+ goto err_policer_link;
+
+ if (devlink->ops->trap_group_init) {
+ err = devlink->ops->trap_group_init(devlink, group);
+ if (err)
+ goto err_group_init;
+ }
+
+ list_add_tail(&group_item->list, &devlink->trap_group_list);
+ devlink_trap_group_notify(devlink, group_item,
+ DEVLINK_CMD_TRAP_GROUP_NEW);
+
+ return 0;
+
+err_group_init:
+err_policer_link:
+ free_percpu(group_item->stats);
+err_stats_alloc:
+ kfree(group_item);
+ return err;
+}
+
+static void
+devlink_trap_group_unregister(struct devlink *devlink,
+ const struct devlink_trap_group *group)
+{
+ struct devlink_trap_group_item *group_item;
+
+ group_item = devlink_trap_group_item_lookup(devlink, group->name);
+ if (WARN_ON_ONCE(!group_item))
+ return;
+
+ devlink_trap_group_notify(devlink, group_item,
+ DEVLINK_CMD_TRAP_GROUP_DEL);
+ list_del(&group_item->list);
+ free_percpu(group_item->stats);
+ kfree(group_item);
+}
+
+/**
+ * devl_trap_groups_register - Register packet trap groups with devlink.
+ * @devlink: devlink.
+ * @groups: Packet trap groups.
+ * @groups_count: Count of provided packet trap groups.
+ *
+ * Return: Non-zero value on failure.
+ */
+int devl_trap_groups_register(struct devlink *devlink,
+ const struct devlink_trap_group *groups,
+ size_t groups_count)
+{
+ int i, err;
+
+ devl_assert_locked(devlink);
+ for (i = 0; i < groups_count; i++) {
+ const struct devlink_trap_group *group = &groups[i];
+
+ err = devlink_trap_group_verify(group);
+ if (err)
+ goto err_trap_group_verify;
+
+ err = devlink_trap_group_register(devlink, group);
+ if (err)
+ goto err_trap_group_register;
+ }
+
+ return 0;
+
+err_trap_group_register:
+err_trap_group_verify:
+ for (i--; i >= 0; i--)
+ devlink_trap_group_unregister(devlink, &groups[i]);
+ return err;
+}
+EXPORT_SYMBOL_GPL(devl_trap_groups_register);
+
+/**
+ * devlink_trap_groups_register - Register packet trap groups with devlink.
+ * @devlink: devlink.
+ * @groups: Packet trap groups.
+ * @groups_count: Count of provided packet trap groups.
+ *
+ * Context: Takes and release devlink->lock <mutex>.
+ *
+ * Return: Non-zero value on failure.
+ */
+int devlink_trap_groups_register(struct devlink *devlink,
+ const struct devlink_trap_group *groups,
+ size_t groups_count)
+{
+ int err;
+
+ devl_lock(devlink);
+ err = devl_trap_groups_register(devlink, groups, groups_count);
+ devl_unlock(devlink);
+ return err;
+}
+EXPORT_SYMBOL_GPL(devlink_trap_groups_register);
+
+/**
+ * devl_trap_groups_unregister - Unregister packet trap groups from devlink.
+ * @devlink: devlink.
+ * @groups: Packet trap groups.
+ * @groups_count: Count of provided packet trap groups.
+ */
+void devl_trap_groups_unregister(struct devlink *devlink,
+ const struct devlink_trap_group *groups,
+ size_t groups_count)
+{
+ int i;
+
+ devl_assert_locked(devlink);
+ for (i = groups_count - 1; i >= 0; i--)
+ devlink_trap_group_unregister(devlink, &groups[i]);
+}
+EXPORT_SYMBOL_GPL(devl_trap_groups_unregister);
+
+/**
+ * devlink_trap_groups_unregister - Unregister packet trap groups from devlink.
+ * @devlink: devlink.
+ * @groups: Packet trap groups.
+ * @groups_count: Count of provided packet trap groups.
+ *
+ * Context: Takes and release devlink->lock <mutex>.
+ */
+void devlink_trap_groups_unregister(struct devlink *devlink,
+ const struct devlink_trap_group *groups,
+ size_t groups_count)
+{
+ devl_lock(devlink);
+ devl_trap_groups_unregister(devlink, groups, groups_count);
+ devl_unlock(devlink);
+}
+EXPORT_SYMBOL_GPL(devlink_trap_groups_unregister);
+
+static void
+devlink_trap_policer_notify(struct devlink *devlink,
+ const struct devlink_trap_policer_item *policer_item,
+ enum devlink_command cmd)
+{
+ struct sk_buff *msg;
+ int err;
+
+ WARN_ON_ONCE(cmd != DEVLINK_CMD_TRAP_POLICER_NEW &&
+ cmd != DEVLINK_CMD_TRAP_POLICER_DEL);
+
+ if (!devl_is_registered(devlink) || !devlink_nl_notify_need(devlink))
+ return;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return;
+
+ err = devlink_nl_trap_policer_fill(msg, devlink, policer_item, cmd, 0,
+ 0, 0);
+ if (err) {
+ nlmsg_free(msg);
+ return;
+ }
+
+ devlink_nl_notify_send(devlink, msg);
+}
+
+void devlink_trap_policers_notify_register(struct devlink *devlink)
+{
+ struct devlink_trap_policer_item *policer_item;
+
+ list_for_each_entry(policer_item, &devlink->trap_policer_list, list)
+ devlink_trap_policer_notify(devlink, policer_item,
+ DEVLINK_CMD_TRAP_POLICER_NEW);
+}
+
+void devlink_trap_policers_notify_unregister(struct devlink *devlink)
+{
+ struct devlink_trap_policer_item *policer_item;
+
+ list_for_each_entry_reverse(policer_item, &devlink->trap_policer_list,
+ list)
+ devlink_trap_policer_notify(devlink, policer_item,
+ DEVLINK_CMD_TRAP_POLICER_DEL);
+}
+
+static int
+devlink_trap_policer_register(struct devlink *devlink,
+ const struct devlink_trap_policer *policer)
+{
+ struct devlink_trap_policer_item *policer_item;
+ int err;
+
+ if (devlink_trap_policer_item_lookup(devlink, policer->id))
+ return -EEXIST;
+
+ policer_item = kzalloc(sizeof(*policer_item), GFP_KERNEL);
+ if (!policer_item)
+ return -ENOMEM;
+
+ policer_item->policer = policer;
+ policer_item->rate = policer->init_rate;
+ policer_item->burst = policer->init_burst;
+
+ if (devlink->ops->trap_policer_init) {
+ err = devlink->ops->trap_policer_init(devlink, policer);
+ if (err)
+ goto err_policer_init;
+ }
+
+ list_add_tail(&policer_item->list, &devlink->trap_policer_list);
+ devlink_trap_policer_notify(devlink, policer_item,
+ DEVLINK_CMD_TRAP_POLICER_NEW);
+
+ return 0;
+
+err_policer_init:
+ kfree(policer_item);
+ return err;
+}
+
+static void
+devlink_trap_policer_unregister(struct devlink *devlink,
+ const struct devlink_trap_policer *policer)
+{
+ struct devlink_trap_policer_item *policer_item;
+
+ policer_item = devlink_trap_policer_item_lookup(devlink, policer->id);
+ if (WARN_ON_ONCE(!policer_item))
+ return;
+
+ devlink_trap_policer_notify(devlink, policer_item,
+ DEVLINK_CMD_TRAP_POLICER_DEL);
+ list_del(&policer_item->list);
+ if (devlink->ops->trap_policer_fini)
+ devlink->ops->trap_policer_fini(devlink, policer);
+ kfree(policer_item);
+}
+
+/**
+ * devl_trap_policers_register - Register packet trap policers with devlink.
+ * @devlink: devlink.
+ * @policers: Packet trap policers.
+ * @policers_count: Count of provided packet trap policers.
+ *
+ * Return: Non-zero value on failure.
+ */
+int
+devl_trap_policers_register(struct devlink *devlink,
+ const struct devlink_trap_policer *policers,
+ size_t policers_count)
+{
+ int i, err;
+
+ devl_assert_locked(devlink);
+ for (i = 0; i < policers_count; i++) {
+ const struct devlink_trap_policer *policer = &policers[i];
+
+ if (WARN_ON(policer->id == 0 ||
+ policer->max_rate < policer->min_rate ||
+ policer->max_burst < policer->min_burst)) {
+ err = -EINVAL;
+ goto err_trap_policer_verify;
+ }
+
+ err = devlink_trap_policer_register(devlink, policer);
+ if (err)
+ goto err_trap_policer_register;
+ }
+ return 0;
+
+err_trap_policer_register:
+err_trap_policer_verify:
+ for (i--; i >= 0; i--)
+ devlink_trap_policer_unregister(devlink, &policers[i]);
+ return err;
+}
+EXPORT_SYMBOL_GPL(devl_trap_policers_register);
+
+/**
+ * devl_trap_policers_unregister - Unregister packet trap policers from devlink.
+ * @devlink: devlink.
+ * @policers: Packet trap policers.
+ * @policers_count: Count of provided packet trap policers.
+ */
+void
+devl_trap_policers_unregister(struct devlink *devlink,
+ const struct devlink_trap_policer *policers,
+ size_t policers_count)
+{
+ int i;
+
+ devl_assert_locked(devlink);
+ for (i = policers_count - 1; i >= 0; i--)
+ devlink_trap_policer_unregister(devlink, &policers[i]);
+}
+EXPORT_SYMBOL_GPL(devl_trap_policers_unregister);
diff --git a/net/devres.c b/net/devres.c
new file mode 100644
index 000000000000..5ccf6ca311dc
--- /dev/null
+++ b/net/devres.c
@@ -0,0 +1,95 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * This file contains all networking devres helpers.
+ */
+
+#include <linux/device.h>
+#include <linux/etherdevice.h>
+#include <linux/netdevice.h>
+
+struct net_device_devres {
+ struct net_device *ndev;
+};
+
+static void devm_free_netdev(struct device *dev, void *this)
+{
+ struct net_device_devres *res = this;
+
+ free_netdev(res->ndev);
+}
+
+struct net_device *devm_alloc_etherdev_mqs(struct device *dev, int sizeof_priv,
+ unsigned int txqs, unsigned int rxqs)
+{
+ struct net_device_devres *dr;
+
+ dr = devres_alloc(devm_free_netdev, sizeof(*dr), GFP_KERNEL);
+ if (!dr)
+ return NULL;
+
+ dr->ndev = alloc_etherdev_mqs(sizeof_priv, txqs, rxqs);
+ if (!dr->ndev) {
+ devres_free(dr);
+ return NULL;
+ }
+
+ devres_add(dev, dr);
+
+ return dr->ndev;
+}
+EXPORT_SYMBOL(devm_alloc_etherdev_mqs);
+
+static void devm_unregister_netdev(struct device *dev, void *this)
+{
+ struct net_device_devres *res = this;
+
+ unregister_netdev(res->ndev);
+}
+
+static int netdev_devres_match(struct device *dev, void *this, void *match_data)
+{
+ struct net_device_devres *res = this;
+ struct net_device *ndev = match_data;
+
+ return ndev == res->ndev;
+}
+
+/**
+ * devm_register_netdev - resource managed variant of register_netdev()
+ * @dev: managing device for this netdev - usually the parent device
+ * @ndev: device to register
+ *
+ * This is a devres variant of register_netdev() for which the unregister
+ * function will be called automatically when the managing device is
+ * detached. Note: the net_device used must also be resource managed by
+ * the same struct device.
+ */
+int devm_register_netdev(struct device *dev, struct net_device *ndev)
+{
+ struct net_device_devres *dr;
+ int ret;
+
+ /* struct net_device must itself be managed. For now a managed netdev
+ * can only be allocated by devm_alloc_etherdev_mqs() so the check is
+ * straightforward.
+ */
+ if (WARN_ON(!devres_find(dev, devm_free_netdev,
+ netdev_devres_match, ndev)))
+ return -EINVAL;
+
+ dr = devres_alloc(devm_unregister_netdev, sizeof(*dr), GFP_KERNEL);
+ if (!dr)
+ return -ENOMEM;
+
+ ret = register_netdev(ndev);
+ if (ret) {
+ devres_free(dr);
+ return ret;
+ }
+
+ dr->ndev = ndev;
+ devres_add(ndev->dev.parent, dr);
+
+ return 0;
+}
+EXPORT_SYMBOL(devm_register_netdev);
diff --git a/net/dns_resolver/Kconfig b/net/dns_resolver/Kconfig
new file mode 100644
index 000000000000..7c2dba273e35
--- /dev/null
+++ b/net/dns_resolver/Kconfig
@@ -0,0 +1,28 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# Configuration for DNS Resolver
+#
+config DNS_RESOLVER
+ tristate "DNS Resolver support"
+ depends on KEYS
+ help
+ Saying Y here will include support for the DNS Resolver key type
+ which can be used to make upcalls to perform DNS lookups in
+ userspace.
+
+ DNS Resolver is used to query DNS server for information. Examples
+ being resolving a UNC hostname element to an IP address for CIFS or
+ performing a DNS query for AFSDB records so that AFS can locate a
+ cell's volume location database servers.
+
+ DNS Resolver is used by the CIFS and AFS modules, and would support
+ SMB2 later. DNS Resolver is supported by the userspace upcall
+ helper "/sbin/dns.resolver" via /etc/request-key.conf.
+
+ See <file:Documentation/networking/dns_resolver.rst> for further
+ information.
+
+ To compile this as a module, choose M here: the module will be called
+ dns_resolver.
+
+ If unsure, say N.
diff --git a/net/dns_resolver/Makefile b/net/dns_resolver/Makefile
new file mode 100644
index 000000000000..877532d662d0
--- /dev/null
+++ b/net/dns_resolver/Makefile
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# Makefile for the Linux DNS Resolver.
+#
+
+obj-$(CONFIG_DNS_RESOLVER) += dns_resolver.o
+
+dns_resolver-y := dns_key.o dns_query.o
diff --git a/net/dns_resolver/dns_key.c b/net/dns_resolver/dns_key.c
new file mode 100644
index 000000000000..c42ddd85ff1f
--- /dev/null
+++ b/net/dns_resolver/dns_key.c
@@ -0,0 +1,391 @@
+/* Key type used to cache DNS lookups made by the kernel
+ *
+ * See Documentation/networking/dns_resolver.rst
+ *
+ * Copyright (c) 2007 Igor Mammedov
+ * Author(s): Igor Mammedov (niallain@gmail.com)
+ * Steve French (sfrench@us.ibm.com)
+ * Wang Lei (wang840925@gmail.com)
+ * David Howells (dhowells@redhat.com)
+ *
+ * This library is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/keyctl.h>
+#include <linux/err.h>
+#include <linux/seq_file.h>
+#include <linux/dns_resolver.h>
+#include <keys/dns_resolver-type.h>
+#include <keys/user-type.h>
+#include "internal.h"
+
+MODULE_DESCRIPTION("DNS Resolver");
+MODULE_AUTHOR("Wang Lei");
+MODULE_LICENSE("GPL");
+
+unsigned int dns_resolver_debug;
+module_param_named(debug, dns_resolver_debug, uint, 0644);
+MODULE_PARM_DESC(debug, "DNS Resolver debugging mask");
+
+const struct cred *dns_resolver_cache;
+
+#define DNS_ERRORNO_OPTION "dnserror"
+
+/*
+ * Preparse instantiation data for a dns_resolver key.
+ *
+ * For normal hostname lookups, the data must be a NUL-terminated string, with
+ * the NUL char accounted in datalen.
+ *
+ * If the data contains a '#' characters, then we take the clause after each
+ * one to be an option of the form 'key=value'. The actual data of interest is
+ * the string leading up to the first '#'. For instance:
+ *
+ * "ip1,ip2,...#foo=bar"
+ *
+ * For server list requests, the data must begin with a NUL char and be
+ * followed by a byte indicating the version of the data format. Version 1
+ * looks something like (note this is packed):
+ *
+ * u8 Non-string marker (ie. 0)
+ * u8 Content (DNS_PAYLOAD_IS_*)
+ * u8 Version (e.g. 1)
+ * u8 Source of server list
+ * u8 Lookup status of server list
+ * u8 Number of servers
+ * foreach-server {
+ * __le16 Name length
+ * __le16 Priority (as per SRV record, low first)
+ * __le16 Weight (as per SRV record, higher first)
+ * __le16 Port
+ * u8 Source of address list
+ * u8 Lookup status of address list
+ * u8 Protocol (DNS_SERVER_PROTOCOL_*)
+ * u8 Number of addresses
+ * char[] Name (not NUL-terminated)
+ * foreach-address {
+ * u8 Family (DNS_ADDRESS_IS_*)
+ * union {
+ * u8[4] ipv4_addr
+ * u8[16] ipv6_addr
+ * }
+ * }
+ * }
+ *
+ */
+static int
+dns_resolver_preparse(struct key_preparsed_payload *prep)
+{
+ struct user_key_payload *upayload;
+ unsigned long derrno;
+ int ret;
+ int datalen = prep->datalen, result_len = 0;
+ const char *data = prep->data, *end, *opt;
+
+ if (datalen <= 1 || !data)
+ return -EINVAL;
+
+ if (data[0] == 0) {
+ const struct dns_server_list_v1_header *v1;
+
+ /* It may be a server list. */
+ if (datalen < sizeof(*v1))
+ return -EINVAL;
+
+ v1 = (const struct dns_server_list_v1_header *)data;
+ kenter("[%u,%u],%u", v1->hdr.content, v1->hdr.version, datalen);
+ if (v1->hdr.content != DNS_PAYLOAD_IS_SERVER_LIST) {
+ pr_warn_ratelimited(
+ "dns_resolver: Unsupported content type (%u)\n",
+ v1->hdr.content);
+ return -EINVAL;
+ }
+
+ if (v1->hdr.version != 1) {
+ pr_warn_ratelimited(
+ "dns_resolver: Unsupported server list version (%u)\n",
+ v1->hdr.version);
+ return -EINVAL;
+ }
+
+ if ((v1->status != DNS_LOOKUP_GOOD &&
+ v1->status != DNS_LOOKUP_GOOD_WITH_BAD)) {
+ if (prep->expiry == TIME64_MAX)
+ prep->expiry = ktime_get_real_seconds() + 1;
+ }
+
+ result_len = datalen;
+ goto store_result;
+ }
+
+ kenter("'%*.*s',%u", datalen, datalen, data, datalen);
+
+ if (!data || data[datalen - 1] != '\0')
+ return -EINVAL;
+ datalen--;
+
+ /* deal with any options embedded in the data */
+ end = data + datalen;
+ opt = memchr(data, '#', datalen);
+ if (!opt) {
+ /* no options: the entire data is the result */
+ kdebug("no options");
+ result_len = datalen;
+ } else {
+ const char *next_opt;
+
+ result_len = opt - data;
+ opt++;
+ kdebug("options: '%s'", opt);
+ do {
+ int opt_len, opt_nlen;
+ const char *eq;
+ char optval[128];
+
+ next_opt = memchr(opt, '#', end - opt) ?: end;
+ opt_len = next_opt - opt;
+ if (opt_len <= 0 || opt_len > sizeof(optval)) {
+ pr_warn_ratelimited("Invalid option length (%d) for dns_resolver key\n",
+ opt_len);
+ return -EINVAL;
+ }
+
+ eq = memchr(opt, '=', opt_len);
+ if (eq) {
+ opt_nlen = eq - opt;
+ eq++;
+ memcpy(optval, eq, next_opt - eq);
+ optval[next_opt - eq] = '\0';
+ } else {
+ opt_nlen = opt_len;
+ optval[0] = '\0';
+ }
+
+ kdebug("option '%*.*s' val '%s'",
+ opt_nlen, opt_nlen, opt, optval);
+
+ /* see if it's an error number representing a DNS error
+ * that's to be recorded as the result in this key */
+ if (opt_nlen == sizeof(DNS_ERRORNO_OPTION) - 1 &&
+ memcmp(opt, DNS_ERRORNO_OPTION, opt_nlen) == 0) {
+ kdebug("dns error number option");
+
+ ret = kstrtoul(optval, 10, &derrno);
+ if (ret < 0)
+ goto bad_option_value;
+
+ if (derrno < 1 || derrno > 511)
+ goto bad_option_value;
+
+ kdebug("dns error no. = %lu", derrno);
+ prep->payload.data[dns_key_error] = ERR_PTR(-derrno);
+ continue;
+ }
+
+ bad_option_value:
+ pr_warn_ratelimited("Option '%*.*s' to dns_resolver key: bad/missing value\n",
+ opt_nlen, opt_nlen, opt);
+ return -EINVAL;
+ } while (opt = next_opt + 1, opt < end);
+ }
+
+ /* don't cache the result if we're caching an error saying there's no
+ * result */
+ if (prep->payload.data[dns_key_error]) {
+ kleave(" = 0 [h_error %ld]", PTR_ERR(prep->payload.data[dns_key_error]));
+ return 0;
+ }
+
+store_result:
+ kdebug("store result");
+ prep->quotalen = result_len;
+
+ upayload = kmalloc(sizeof(*upayload) + result_len + 1, GFP_KERNEL);
+ if (!upayload) {
+ kleave(" = -ENOMEM");
+ return -ENOMEM;
+ }
+
+ upayload->datalen = result_len;
+ memcpy(upayload->data, data, result_len);
+ upayload->data[result_len] = '\0';
+
+ prep->payload.data[dns_key_data] = upayload;
+ kleave(" = 0");
+ return 0;
+}
+
+/*
+ * Clean up the preparse data
+ */
+static void dns_resolver_free_preparse(struct key_preparsed_payload *prep)
+{
+ pr_devel("==>%s()\n", __func__);
+
+ kfree(prep->payload.data[dns_key_data]);
+}
+
+/*
+ * The description is of the form "[<type>:]<domain_name>"
+ *
+ * The domain name may be a simple name or an absolute domain name (which
+ * should end with a period). The domain name is case-independent.
+ */
+static bool dns_resolver_cmp(const struct key *key,
+ const struct key_match_data *match_data)
+{
+ int slen, dlen, ret = 0;
+ const char *src = key->description, *dsp = match_data->raw_data;
+
+ kenter("%s,%s", src, dsp);
+
+ if (!src || !dsp)
+ goto no_match;
+
+ if (strcasecmp(src, dsp) == 0)
+ goto matched;
+
+ slen = strlen(src);
+ dlen = strlen(dsp);
+ if (slen <= 0 || dlen <= 0)
+ goto no_match;
+ if (src[slen - 1] == '.')
+ slen--;
+ if (dsp[dlen - 1] == '.')
+ dlen--;
+ if (slen != dlen || strncasecmp(src, dsp, slen) != 0)
+ goto no_match;
+
+matched:
+ ret = 1;
+no_match:
+ kleave(" = %d", ret);
+ return ret;
+}
+
+/*
+ * Preparse the match criterion.
+ */
+static int dns_resolver_match_preparse(struct key_match_data *match_data)
+{
+ match_data->lookup_type = KEYRING_SEARCH_LOOKUP_ITERATE;
+ match_data->cmp = dns_resolver_cmp;
+ return 0;
+}
+
+/*
+ * Describe a DNS key
+ */
+static void dns_resolver_describe(const struct key *key, struct seq_file *m)
+{
+ seq_puts(m, key->description);
+ if (key_is_positive(key)) {
+ int err = PTR_ERR(key->payload.data[dns_key_error]);
+
+ if (err)
+ seq_printf(m, ": %d", err);
+ else
+ seq_printf(m, ": %u", key->datalen);
+ }
+}
+
+/*
+ * read the DNS data
+ * - the key's semaphore is read-locked
+ */
+static long dns_resolver_read(const struct key *key,
+ char *buffer, size_t buflen)
+{
+ int err = PTR_ERR(key->payload.data[dns_key_error]);
+
+ if (err)
+ return err;
+
+ return user_read(key, buffer, buflen);
+}
+
+struct key_type key_type_dns_resolver = {
+ .name = "dns_resolver",
+ .flags = KEY_TYPE_NET_DOMAIN | KEY_TYPE_INSTANT_REAP,
+ .preparse = dns_resolver_preparse,
+ .free_preparse = dns_resolver_free_preparse,
+ .instantiate = generic_key_instantiate,
+ .match_preparse = dns_resolver_match_preparse,
+ .revoke = user_revoke,
+ .destroy = user_destroy,
+ .describe = dns_resolver_describe,
+ .read = dns_resolver_read,
+};
+
+static int __init init_dns_resolver(void)
+{
+ struct cred *cred;
+ struct key *keyring;
+ int ret;
+
+ /* create an override credential set with a special thread keyring in
+ * which DNS requests are cached
+ *
+ * this is used to prevent malicious redirections from being installed
+ * with add_key().
+ */
+ cred = prepare_kernel_cred(&init_task);
+ if (!cred)
+ return -ENOMEM;
+
+ keyring = keyring_alloc(".dns_resolver",
+ GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred,
+ (KEY_POS_ALL & ~KEY_POS_SETATTR) |
+ KEY_USR_VIEW | KEY_USR_READ,
+ KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL);
+ if (IS_ERR(keyring)) {
+ ret = PTR_ERR(keyring);
+ goto failed_put_cred;
+ }
+
+ ret = register_key_type(&key_type_dns_resolver);
+ if (ret < 0)
+ goto failed_put_key;
+
+ /* instruct request_key() to use this special keyring as a cache for
+ * the results it looks up */
+ set_bit(KEY_FLAG_ROOT_CAN_CLEAR, &keyring->flags);
+ cred->thread_keyring = keyring;
+ cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
+ dns_resolver_cache = cred;
+
+ kdebug("DNS resolver keyring: %d\n", key_serial(keyring));
+ return 0;
+
+failed_put_key:
+ key_put(keyring);
+failed_put_cred:
+ put_cred(cred);
+ return ret;
+}
+
+static void __exit exit_dns_resolver(void)
+{
+ key_revoke(dns_resolver_cache->thread_keyring);
+ unregister_key_type(&key_type_dns_resolver);
+ put_cred(dns_resolver_cache);
+}
+
+module_init(init_dns_resolver)
+module_exit(exit_dns_resolver)
+MODULE_LICENSE("GPL");
diff --git a/net/dns_resolver/dns_query.c b/net/dns_resolver/dns_query.c
new file mode 100644
index 000000000000..53da62984447
--- /dev/null
+++ b/net/dns_resolver/dns_query.c
@@ -0,0 +1,170 @@
+/* Upcall routine, designed to work as a key type and working through
+ * /sbin/request-key to contact userspace when handling DNS queries.
+ *
+ * See Documentation/networking/dns_resolver.rst
+ *
+ * Copyright (c) 2007 Igor Mammedov
+ * Author(s): Igor Mammedov (niallain@gmail.com)
+ * Steve French (sfrench@us.ibm.com)
+ * Wang Lei (wang840925@gmail.com)
+ * David Howells (dhowells@redhat.com)
+ *
+ * The upcall wrapper used to make an arbitrary DNS query.
+ *
+ * This function requires the appropriate userspace tool dns.upcall to be
+ * installed and something like the following lines should be added to the
+ * /etc/request-key.conf file:
+ *
+ * create dns_resolver * * /sbin/dns.upcall %k
+ *
+ * For example to use this module to query AFSDB RR:
+ *
+ * create dns_resolver afsdb:* * /sbin/dns.afsdb %k
+ *
+ * This library is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/cred.h>
+#include <linux/dns_resolver.h>
+#include <linux/err.h>
+#include <net/net_namespace.h>
+
+#include <keys/dns_resolver-type.h>
+#include <keys/user-type.h>
+
+#include "internal.h"
+
+/**
+ * dns_query - Query the DNS
+ * @net: The network namespace to operate in.
+ * @type: Query type (or NULL for straight host->IP lookup)
+ * @name: Name to look up
+ * @namelen: Length of name
+ * @options: Request options (or NULL if no options)
+ * @_result: Where to place the returned data (or NULL)
+ * @_expiry: Where to store the result expiry time (or NULL)
+ * @invalidate: Always invalidate the key after use
+ *
+ * The data will be returned in the pointer at *result, if provided, and the
+ * caller is responsible for freeing it.
+ *
+ * The description should be of the form "[<query_type>:]<domain_name>", and
+ * the options need to be appropriate for the query type requested. If no
+ * query_type is given, then the query is a straight hostname to IP address
+ * lookup.
+ *
+ * The DNS resolution lookup is performed by upcalling to userspace by way of
+ * requesting a key of type dns_resolver.
+ *
+ * Returns the size of the result on success, -ve error code otherwise.
+ */
+int dns_query(struct net *net,
+ const char *type, const char *name, size_t namelen,
+ const char *options, char **_result, time64_t *_expiry,
+ bool invalidate)
+{
+ struct key *rkey;
+ struct user_key_payload *upayload;
+ size_t typelen, desclen;
+ char *desc, *cp;
+ int ret, len;
+
+ kenter("%s,%*.*s,%zu,%s",
+ type, (int)namelen, (int)namelen, name, namelen, options);
+
+ if (!name || namelen == 0)
+ return -EINVAL;
+
+ /* construct the query key description as "[<type>:]<name>" */
+ typelen = 0;
+ desclen = 0;
+ if (type) {
+ typelen = strlen(type);
+ if (typelen < 1)
+ return -EINVAL;
+ desclen += typelen + 1;
+ }
+
+ if (namelen < 3 || namelen > 255)
+ return -EINVAL;
+ desclen += namelen + 1;
+
+ desc = kmalloc(desclen, GFP_KERNEL);
+ if (!desc)
+ return -ENOMEM;
+
+ cp = desc;
+ if (type) {
+ memcpy(cp, type, typelen);
+ cp += typelen;
+ *cp++ = ':';
+ }
+ memcpy(cp, name, namelen);
+ cp += namelen;
+ *cp = '\0';
+
+ if (!options)
+ options = "";
+ kdebug("call request_key(,%s,%s)", desc, options);
+
+ /* make the upcall, using special credentials to prevent the use of
+ * add_key() to preinstall malicious redirections
+ */
+ scoped_with_creds(dns_resolver_cache)
+ rkey = request_key_net(&key_type_dns_resolver, desc, net, options);
+ kfree(desc);
+ if (IS_ERR(rkey)) {
+ ret = PTR_ERR(rkey);
+ goto out;
+ }
+
+ down_read(&rkey->sem);
+ set_bit(KEY_FLAG_ROOT_CAN_INVAL, &rkey->flags);
+ rkey->perm |= KEY_USR_VIEW;
+
+ ret = key_validate(rkey);
+ if (ret < 0)
+ goto put;
+
+ /* If the DNS server gave an error, return that to the caller */
+ ret = PTR_ERR(rkey->payload.data[dns_key_error]);
+ if (ret)
+ goto put;
+
+ upayload = user_key_payload_locked(rkey);
+ len = upayload->datalen;
+
+ if (_result) {
+ ret = -ENOMEM;
+ *_result = kmemdup_nul(upayload->data, len, GFP_KERNEL);
+ if (!*_result)
+ goto put;
+ }
+
+ if (_expiry)
+ *_expiry = rkey->expiry;
+
+ ret = len;
+put:
+ up_read(&rkey->sem);
+ if (invalidate)
+ key_invalidate(rkey);
+ key_put(rkey);
+out:
+ kleave(" = %d", ret);
+ return ret;
+}
+EXPORT_SYMBOL(dns_query);
diff --git a/net/dns_resolver/internal.h b/net/dns_resolver/internal.h
new file mode 100644
index 000000000000..0c570d40e4d6
--- /dev/null
+++ b/net/dns_resolver/internal.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2010 Wang Lei
+ * Author(s): Wang Lei (wang840925@gmail.com). All Rights Reserved.
+ *
+ * Internal DNS Rsolver stuff
+ *
+ * This library is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/compiler.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+
+/*
+ * Layout of key payload words.
+ */
+enum {
+ dns_key_data,
+ dns_key_error,
+};
+
+/*
+ * dns_key.c
+ */
+extern const struct cred *dns_resolver_cache;
+
+/*
+ * debug tracing
+ */
+extern unsigned int dns_resolver_debug;
+
+#define kdebug(FMT, ...) \
+do { \
+ if (unlikely(dns_resolver_debug)) \
+ printk(KERN_DEBUG "[%-6.6s] "FMT"\n", \
+ current->comm, ##__VA_ARGS__); \
+} while (0)
+
+#define kenter(FMT, ...) kdebug("==> %s("FMT")", __func__, ##__VA_ARGS__)
+#define kleave(FMT, ...) kdebug("<== %s()"FMT"", __func__, ##__VA_ARGS__)
diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig
new file mode 100644
index 000000000000..f86b30742122
--- /dev/null
+++ b/net/dsa/Kconfig
@@ -0,0 +1,207 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+menuconfig NET_DSA
+ tristate "Distributed Switch Architecture"
+ depends on BRIDGE || BRIDGE=n
+ depends on HSR || HSR=n
+ depends on INET && NETDEVICES
+ select GRO_CELLS
+ select NET_SWITCHDEV
+ select PHYLINK
+ select NET_DEVLINK
+ imply NET_SELFTESTS
+ help
+ Say Y if you want to enable support for the hardware switches supported
+ by the Distributed Switch Architecture.
+
+if NET_DSA
+
+# Drivers must select the appropriate tagging format(s)
+
+config NET_DSA_TAG_NONE
+ tristate "No-op tag driver"
+ help
+ Say Y or M if you want to enable support for switches which don't tag
+ frames over the CPU port.
+
+config NET_DSA_TAG_AR9331
+ tristate "Tag driver for Atheros AR9331 SoC with built-in switch"
+ help
+ Say Y or M if you want to enable support for tagging frames for
+ the Atheros AR9331 SoC with built-in switch.
+
+config NET_DSA_TAG_BRCM_COMMON
+ tristate
+ default n
+
+config NET_DSA_TAG_BRCM
+ tristate "Tag driver for Broadcom switches using in-frame headers"
+ select NET_DSA_TAG_BRCM_COMMON
+ help
+ Say Y if you want to enable support for tagging frames for the
+ Broadcom switches which place the tag after the MAC source address.
+
+config NET_DSA_TAG_BRCM_LEGACY
+ tristate "Tag driver for BCM63xx legacy switches using in-frame headers"
+ select NET_DSA_TAG_BRCM_COMMON
+ help
+ Say Y if you want to enable support for tagging frames for the
+ BCM63xx legacy switches which place the tag after the MAC source
+ address.
+ This tag is used in BCM63xx legacy switches which work without the
+ original FCS and length before the tag insertion.
+
+config NET_DSA_TAG_BRCM_LEGACY_FCS
+ tristate "Tag driver for BCM53xx legacy switches using in-frame headers"
+ select NET_DSA_TAG_BRCM_COMMON
+ help
+ Say Y if you want to enable support for tagging frames for the
+ BCM53xx legacy switches which place the tag after the MAC source
+ address.
+ This tag is used in BCM53xx legacy switches which expect original
+ FCS and length before the tag insertion to be present.
+
+config NET_DSA_TAG_BRCM_PREPEND
+ tristate "Tag driver for Broadcom switches using prepended headers"
+ select NET_DSA_TAG_BRCM_COMMON
+ help
+ Say Y if you want to enable support for tagging frames for the
+ Broadcom switches which places the tag before the Ethernet header
+ (prepended).
+
+config NET_DSA_TAG_HELLCREEK
+ tristate "Tag driver for Hirschmann Hellcreek TSN switches"
+ help
+ Say Y or M if you want to enable support for tagging frames
+ for the Hirschmann Hellcreek TSN switches.
+
+config NET_DSA_TAG_GSWIP
+ tristate "Tag driver for Lantiq / Intel GSWIP switches"
+ help
+ Say Y or M if you want to enable support for tagging frames for the
+ Lantiq / Intel GSWIP switches.
+
+config NET_DSA_TAG_DSA_COMMON
+ tristate
+
+config NET_DSA_TAG_DSA
+ tristate "Tag driver for Marvell switches using DSA headers"
+ select NET_DSA_TAG_DSA_COMMON
+ help
+ Say Y or M if you want to enable support for tagging frames for the
+ Marvell switches which use DSA headers.
+
+config NET_DSA_TAG_EDSA
+ tristate "Tag driver for Marvell switches using EtherType DSA headers"
+ select NET_DSA_TAG_DSA_COMMON
+ help
+ Say Y or M if you want to enable support for tagging frames for the
+ Marvell switches which use EtherType DSA headers.
+
+config NET_DSA_TAG_MTK
+ tristate "Tag driver for Mediatek switches"
+ help
+ Say Y or M if you want to enable support for tagging frames for
+ Mediatek switches.
+
+config NET_DSA_TAG_MXL_GSW1XX
+ tristate "Tag driver for MaxLinear GSW1xx switches"
+ help
+ The GSW1xx family of switches supports an 8-byte special tag which
+ can be used on the CPU port of the switch.
+ Say Y or M if you want to enable support for tagging frames for
+ MaxLinear GSW1xx switches.
+
+config NET_DSA_TAG_KSZ
+ tristate "Tag driver for Microchip 8795/937x/9477/9893 families of switches"
+ help
+ Say Y if you want to enable support for tagging frames for the
+ Microchip 8795/937x/9477/9893 families of switches.
+
+config NET_DSA_TAG_OCELOT
+ tristate "Tag driver for Ocelot family of switches, using NPI port"
+ select PACKING
+ help
+ Say Y or M if you want to enable NPI tagging for the Ocelot switches
+ (VSC7511, VSC7512, VSC7513, VSC7514, VSC9953, VSC9959). In this mode,
+ the frames over the Ethernet CPU port are prepended with a
+ hardware-defined injection/extraction frame header. Flow control
+ (PAUSE frames) over the CPU port is not supported when operating in
+ this mode.
+
+config NET_DSA_TAG_OCELOT_8021Q
+ tristate "Tag driver for Ocelot family of switches, using VLAN"
+ help
+ Say Y or M if you want to enable support for tagging frames with a
+ custom VLAN-based header. Frames that require timestamping, such as
+ PTP, are not delivered over Ethernet but over register-based MMIO.
+ Flow control over the CPU port is functional in this mode. When using
+ this mode, less TCAM resources (VCAP IS1, IS2, ES0) are available for
+ use with tc-flower.
+
+config NET_DSA_TAG_QCA
+ tristate "Tag driver for Qualcomm Atheros QCA8K switches"
+ help
+ Say Y or M if you want to enable support for tagging frames for
+ the Qualcomm Atheros QCA8K switches.
+
+config NET_DSA_TAG_RTL4_A
+ tristate "Tag driver for Realtek 4 byte protocol A tags"
+ help
+ Say Y or M if you want to enable support for tagging frames for the
+ Realtek switches with 4 byte protocol A tags, such as found in
+ the Realtek RTL8366RB.
+
+config NET_DSA_TAG_RTL8_4
+ tristate "Tag driver for Realtek 8 byte protocol 4 tags"
+ help
+ Say Y or M if you want to enable support for tagging frames for Realtek
+ switches with 8 byte protocol 4 tags, such as the Realtek RTL8365MB-VC.
+
+config NET_DSA_TAG_RZN1_A5PSW
+ tristate "Tag driver for Renesas RZ/N1 A5PSW switch"
+ help
+ Say Y or M if you want to enable support for tagging frames for
+ Renesas RZ/N1 embedded switch that uses an 8 byte tag located after
+ destination MAC address.
+
+config NET_DSA_TAG_LAN9303
+ tristate "Tag driver for SMSC/Microchip LAN9303 family of switches"
+ help
+ Say Y or M if you want to enable support for tagging frames for the
+ SMSC/Microchip LAN9303 family of switches.
+
+config NET_DSA_TAG_SJA1105
+ tristate "Tag driver for NXP SJA1105 switches"
+ select PACKING
+ help
+ Say Y or M if you want to enable support for tagging frames with the
+ NXP SJA1105 switch family. Both the native tagging protocol (which
+ is only for link-local traffic) as well as non-native tagging (based
+ on a custom 802.1Q VLAN header) are available.
+
+config NET_DSA_TAG_TRAILER
+ tristate "Tag driver for switches using a trailer tag"
+ help
+ Say Y or M if you want to enable support for tagging frames at
+ with a trailed. e.g. Marvell 88E6060.
+
+config NET_DSA_TAG_VSC73XX_8021Q
+ tristate "Tag driver for Microchip/Vitesse VSC73xx family of switches, using VLAN"
+ help
+ Say Y or M if you want to enable support for tagging frames with a
+ custom VLAN-based header.
+
+config NET_DSA_TAG_XRS700X
+ tristate "Tag driver for XRS700x switches"
+ help
+ Say Y or M if you want to enable support for tagging frames for
+ Arrow SpeedChips XRS700x switches that use a single byte tag trailer.
+
+config NET_DSA_TAG_YT921X
+ tristate "Tag driver for Motorcomm YT921x switches"
+ help
+ Say Y or M if you want to enable support for tagging frames for
+ Motorcomm YT921x switches.
+
+endif
diff --git a/net/dsa/Makefile b/net/dsa/Makefile
new file mode 100644
index 000000000000..42d173f5a701
--- /dev/null
+++ b/net/dsa/Makefile
@@ -0,0 +1,46 @@
+# SPDX-License-Identifier: GPL-2.0
+
+# the stubs are built-in whenever DSA is built-in or module
+ifdef CONFIG_NET_DSA
+obj-y := stubs.o
+endif
+
+# the core
+obj-$(CONFIG_NET_DSA) += dsa_core.o
+dsa_core-y += \
+ conduit.o \
+ devlink.o \
+ dsa.o \
+ netlink.o \
+ port.o \
+ switch.o \
+ tag.o \
+ tag_8021q.o \
+ trace.o \
+ user.o
+
+# tagging formats
+obj-$(CONFIG_NET_DSA_TAG_AR9331) += tag_ar9331.o
+obj-$(CONFIG_NET_DSA_TAG_BRCM_COMMON) += tag_brcm.o
+obj-$(CONFIG_NET_DSA_TAG_DSA_COMMON) += tag_dsa.o
+obj-$(CONFIG_NET_DSA_TAG_GSWIP) += tag_gswip.o
+obj-$(CONFIG_NET_DSA_TAG_HELLCREEK) += tag_hellcreek.o
+obj-$(CONFIG_NET_DSA_TAG_KSZ) += tag_ksz.o
+obj-$(CONFIG_NET_DSA_TAG_LAN9303) += tag_lan9303.o
+obj-$(CONFIG_NET_DSA_TAG_MTK) += tag_mtk.o
+obj-$(CONFIG_NET_DSA_TAG_MXL_GSW1XX) += tag_mxl-gsw1xx.o
+obj-$(CONFIG_NET_DSA_TAG_NONE) += tag_none.o
+obj-$(CONFIG_NET_DSA_TAG_OCELOT) += tag_ocelot.o
+obj-$(CONFIG_NET_DSA_TAG_OCELOT_8021Q) += tag_ocelot_8021q.o
+obj-$(CONFIG_NET_DSA_TAG_QCA) += tag_qca.o
+obj-$(CONFIG_NET_DSA_TAG_RTL4_A) += tag_rtl4_a.o
+obj-$(CONFIG_NET_DSA_TAG_RTL8_4) += tag_rtl8_4.o
+obj-$(CONFIG_NET_DSA_TAG_RZN1_A5PSW) += tag_rzn1_a5psw.o
+obj-$(CONFIG_NET_DSA_TAG_SJA1105) += tag_sja1105.o
+obj-$(CONFIG_NET_DSA_TAG_TRAILER) += tag_trailer.o
+obj-$(CONFIG_NET_DSA_TAG_VSC73XX_8021Q) += tag_vsc73xx_8021q.o
+obj-$(CONFIG_NET_DSA_TAG_XRS700X) += tag_xrs700x.o
+obj-$(CONFIG_NET_DSA_TAG_YT921X) += tag_yt921x.o
+
+# for tracing framework to find trace.h
+CFLAGS_trace.o := -I$(src)
diff --git a/net/dsa/conduit.c b/net/dsa/conduit.c
new file mode 100644
index 000000000000..a1b044467bd6
--- /dev/null
+++ b/net/dsa/conduit.c
@@ -0,0 +1,549 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Handling of a conduit device, switching frames via its switch fabric CPU port
+ *
+ * Copyright (c) 2017 Savoir-faire Linux Inc.
+ * Vivien Didelot <vivien.didelot@savoirfairelinux.com>
+ */
+
+#include <linux/ethtool.h>
+#include <linux/netdevice.h>
+#include <linux/netlink.h>
+#include <net/dsa.h>
+#include <net/netdev_lock.h>
+
+#include "conduit.h"
+#include "dsa.h"
+#include "port.h"
+#include "tag.h"
+
+static int dsa_conduit_get_regs_len(struct net_device *dev)
+{
+ struct dsa_port *cpu_dp = dev->dsa_ptr;
+ const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops;
+ struct dsa_switch *ds = cpu_dp->ds;
+ int port = cpu_dp->index;
+ int ret = 0;
+ int len;
+
+ if (ops && ops->get_regs_len) {
+ netdev_lock_ops(dev);
+ len = ops->get_regs_len(dev);
+ netdev_unlock_ops(dev);
+ if (len < 0)
+ return len;
+ ret += len;
+ }
+
+ ret += sizeof(struct ethtool_drvinfo);
+ ret += sizeof(struct ethtool_regs);
+
+ if (ds->ops->get_regs_len) {
+ len = ds->ops->get_regs_len(ds, port);
+ if (len < 0)
+ return len;
+ ret += len;
+ }
+
+ return ret;
+}
+
+static void dsa_conduit_get_regs(struct net_device *dev,
+ struct ethtool_regs *regs, void *data)
+{
+ struct dsa_port *cpu_dp = dev->dsa_ptr;
+ const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops;
+ struct dsa_switch *ds = cpu_dp->ds;
+ struct ethtool_drvinfo *cpu_info;
+ struct ethtool_regs *cpu_regs;
+ int port = cpu_dp->index;
+ int len;
+
+ if (ops && ops->get_regs_len && ops->get_regs) {
+ netdev_lock_ops(dev);
+ len = ops->get_regs_len(dev);
+ if (len < 0) {
+ netdev_unlock_ops(dev);
+ return;
+ }
+ regs->len = len;
+ ops->get_regs(dev, regs, data);
+ netdev_unlock_ops(dev);
+ data += regs->len;
+ }
+
+ cpu_info = (struct ethtool_drvinfo *)data;
+ strscpy(cpu_info->driver, "dsa", sizeof(cpu_info->driver));
+ data += sizeof(*cpu_info);
+ cpu_regs = (struct ethtool_regs *)data;
+ data += sizeof(*cpu_regs);
+
+ if (ds->ops->get_regs_len && ds->ops->get_regs) {
+ len = ds->ops->get_regs_len(ds, port);
+ if (len < 0)
+ return;
+ cpu_regs->len = len;
+ ds->ops->get_regs(ds, port, cpu_regs, data);
+ }
+}
+
+static ssize_t dsa_conduit_append_port_stats(struct dsa_switch *ds, int port,
+ u64 *data, size_t start)
+{
+ int count;
+
+ if (!ds->ops->get_sset_count)
+ return 0;
+
+ count = ds->ops->get_sset_count(ds, port, ETH_SS_STATS);
+ if (count < 0)
+ return count;
+
+ if (ds->ops->get_ethtool_stats)
+ ds->ops->get_ethtool_stats(ds, port, data + start);
+
+ return count;
+}
+
+static void dsa_conduit_get_ethtool_stats(struct net_device *dev,
+ struct ethtool_stats *stats,
+ u64 *data)
+{
+ struct dsa_port *dp, *cpu_dp = dev->dsa_ptr;
+ const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops;
+ struct dsa_switch_tree *dst = cpu_dp->dst;
+ int count, mcount = 0;
+
+ if (ops && ops->get_sset_count && ops->get_ethtool_stats) {
+ netdev_lock_ops(dev);
+ mcount = ops->get_sset_count(dev, ETH_SS_STATS);
+ ops->get_ethtool_stats(dev, stats, data);
+ netdev_unlock_ops(dev);
+ }
+
+ list_for_each_entry(dp, &dst->ports, list) {
+ if (!dsa_port_is_dsa(dp) && !dsa_port_is_cpu(dp))
+ continue;
+
+ count = dsa_conduit_append_port_stats(dp->ds, dp->index,
+ data, mcount);
+ if (count < 0)
+ return;
+
+ mcount += count;
+ }
+}
+
+static void dsa_conduit_get_ethtool_phy_stats(struct net_device *dev,
+ struct ethtool_stats *stats,
+ u64 *data)
+{
+ struct dsa_port *cpu_dp = dev->dsa_ptr;
+ const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops;
+ struct dsa_switch *ds = cpu_dp->ds;
+ int port = cpu_dp->index;
+ int count = 0;
+
+ if (dev->phydev && (!ops || !ops->get_ethtool_phy_stats)) {
+ count = phy_ethtool_get_sset_count(dev->phydev);
+ if (count >= 0)
+ phy_ethtool_get_stats(dev->phydev, stats, data);
+ } else if (ops && ops->get_sset_count && ops->get_ethtool_phy_stats) {
+ netdev_lock_ops(dev);
+ count = ops->get_sset_count(dev, ETH_SS_PHY_STATS);
+ ops->get_ethtool_phy_stats(dev, stats, data);
+ netdev_unlock_ops(dev);
+ }
+
+ if (count < 0)
+ count = 0;
+
+ if (ds->ops->get_ethtool_phy_stats)
+ ds->ops->get_ethtool_phy_stats(ds, port, data + count);
+}
+
+static void dsa_conduit_append_port_sset_count(struct dsa_switch *ds, int port,
+ int sset, int *count)
+{
+ if (ds->ops->get_sset_count)
+ *count += ds->ops->get_sset_count(ds, port, sset);
+}
+
+static int dsa_conduit_get_sset_count(struct net_device *dev, int sset)
+{
+ struct dsa_port *dp, *cpu_dp = dev->dsa_ptr;
+ const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops;
+ struct dsa_switch_tree *dst = cpu_dp->dst;
+ int count = 0;
+
+ netdev_lock_ops(dev);
+ if (sset == ETH_SS_PHY_STATS && dev->phydev &&
+ (!ops || !ops->get_ethtool_phy_stats))
+ count = phy_ethtool_get_sset_count(dev->phydev);
+ else if (ops && ops->get_sset_count)
+ count = ops->get_sset_count(dev, sset);
+ netdev_unlock_ops(dev);
+
+ if (count < 0)
+ count = 0;
+
+ list_for_each_entry(dp, &dst->ports, list) {
+ if (!dsa_port_is_dsa(dp) && !dsa_port_is_cpu(dp))
+ continue;
+
+ dsa_conduit_append_port_sset_count(dp->ds, dp->index, sset,
+ &count);
+ }
+
+ return count;
+}
+
+static ssize_t dsa_conduit_append_port_strings(struct dsa_switch *ds, int port,
+ u32 stringset, u8 *data,
+ size_t start)
+{
+ int len = ETH_GSTRING_LEN;
+ u8 pfx[8], *ndata;
+ int count, i;
+
+ if (!ds->ops->get_strings)
+ return 0;
+
+ snprintf(pfx, sizeof(pfx), "s%.2d_p%.2d", ds->index, port);
+ /* We do not want to be NULL-terminated, since this is a prefix */
+ pfx[sizeof(pfx) - 1] = '_';
+ ndata = data + start * len;
+ /* This function copies ETH_GSTRINGS_LEN bytes, we will mangle
+ * the output after to prepend our CPU port prefix we
+ * constructed earlier
+ */
+ ds->ops->get_strings(ds, port, stringset, ndata);
+ count = ds->ops->get_sset_count(ds, port, stringset);
+ if (count < 0)
+ return count;
+
+ for (i = 0; i < count; i++) {
+ memmove(ndata + (i * len + sizeof(pfx)),
+ ndata + i * len, len - sizeof(pfx));
+ memcpy(ndata + i * len, pfx, sizeof(pfx));
+ }
+
+ return count;
+}
+
+static void dsa_conduit_get_strings(struct net_device *dev, u32 stringset,
+ u8 *data)
+{
+ struct dsa_port *dp, *cpu_dp = dev->dsa_ptr;
+ const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops;
+ struct dsa_switch_tree *dst = cpu_dp->dst;
+ int count, mcount = 0;
+
+ netdev_lock_ops(dev);
+ if (stringset == ETH_SS_PHY_STATS && dev->phydev &&
+ !ops->get_ethtool_phy_stats) {
+ mcount = phy_ethtool_get_sset_count(dev->phydev);
+ if (mcount < 0)
+ mcount = 0;
+ else
+ phy_ethtool_get_strings(dev->phydev, data);
+ } else if (ops->get_sset_count && ops->get_strings) {
+ mcount = ops->get_sset_count(dev, stringset);
+ if (mcount < 0)
+ mcount = 0;
+ ops->get_strings(dev, stringset, data);
+ }
+ netdev_unlock_ops(dev);
+
+ list_for_each_entry(dp, &dst->ports, list) {
+ if (!dsa_port_is_dsa(dp) && !dsa_port_is_cpu(dp))
+ continue;
+
+ count = dsa_conduit_append_port_strings(dp->ds, dp->index,
+ stringset, data,
+ mcount);
+ if (count < 0)
+ return;
+
+ mcount += count;
+ }
+}
+
+/* Deny PTP operations on conduit if there is at least one switch in the tree
+ * that is PTP capable.
+ */
+int __dsa_conduit_hwtstamp_validate(struct net_device *dev,
+ const struct kernel_hwtstamp_config *config,
+ struct netlink_ext_ack *extack)
+{
+ struct dsa_port *cpu_dp = dev->dsa_ptr;
+ struct dsa_switch *ds = cpu_dp->ds;
+ struct dsa_switch_tree *dst;
+ struct dsa_port *dp;
+
+ dst = ds->dst;
+
+ list_for_each_entry(dp, &dst->ports, list) {
+ if (dsa_port_supports_hwtstamp(dp)) {
+ NL_SET_ERR_MSG(extack,
+ "HW timestamping not allowed on DSA conduit when switch supports the operation");
+ return -EBUSY;
+ }
+ }
+
+ return 0;
+}
+
+static int dsa_conduit_ethtool_setup(struct net_device *dev)
+{
+ struct dsa_port *cpu_dp = dev->dsa_ptr;
+ struct dsa_switch *ds = cpu_dp->ds;
+ struct ethtool_ops *ops;
+
+ if (netif_is_lag_master(dev))
+ return 0;
+
+ ops = devm_kzalloc(ds->dev, sizeof(*ops), GFP_KERNEL);
+ if (!ops)
+ return -ENOMEM;
+
+ cpu_dp->orig_ethtool_ops = dev->ethtool_ops;
+ if (cpu_dp->orig_ethtool_ops)
+ memcpy(ops, cpu_dp->orig_ethtool_ops, sizeof(*ops));
+
+ ops->get_regs_len = dsa_conduit_get_regs_len;
+ ops->get_regs = dsa_conduit_get_regs;
+ ops->get_sset_count = dsa_conduit_get_sset_count;
+ ops->get_ethtool_stats = dsa_conduit_get_ethtool_stats;
+ ops->get_strings = dsa_conduit_get_strings;
+ ops->get_ethtool_phy_stats = dsa_conduit_get_ethtool_phy_stats;
+
+ dev->ethtool_ops = ops;
+
+ return 0;
+}
+
+static void dsa_conduit_ethtool_teardown(struct net_device *dev)
+{
+ struct dsa_port *cpu_dp = dev->dsa_ptr;
+
+ if (netif_is_lag_master(dev))
+ return;
+
+ dev->ethtool_ops = cpu_dp->orig_ethtool_ops;
+ cpu_dp->orig_ethtool_ops = NULL;
+}
+
+/* Keep the conduit always promiscuous if the tagging protocol requires that
+ * (garbles MAC DA) or if it doesn't support unicast filtering, case in which
+ * it would revert to promiscuous mode as soon as we call dev_uc_add() on it
+ * anyway.
+ */
+static void dsa_conduit_set_promiscuity(struct net_device *dev, int inc)
+{
+ const struct dsa_device_ops *ops = dev->dsa_ptr->tag_ops;
+
+ if ((dev->priv_flags & IFF_UNICAST_FLT) && !ops->promisc_on_conduit)
+ return;
+
+ ASSERT_RTNL();
+
+ dev_set_promiscuity(dev, inc);
+}
+
+static ssize_t tagging_show(struct device *d, struct device_attribute *attr,
+ char *buf)
+{
+ struct net_device *dev = to_net_dev(d);
+ struct dsa_port *cpu_dp = dev->dsa_ptr;
+
+ return sysfs_emit(buf, "%s\n",
+ dsa_tag_protocol_to_str(cpu_dp->tag_ops));
+}
+
+static ssize_t tagging_store(struct device *d, struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ const struct dsa_device_ops *new_tag_ops, *old_tag_ops;
+ const char *end = strchrnul(buf, '\n'), *name;
+ struct net_device *dev = to_net_dev(d);
+ struct dsa_port *cpu_dp = dev->dsa_ptr;
+ size_t len = end - buf;
+ int err;
+
+ /* Empty string passed */
+ if (!len)
+ return -ENOPROTOOPT;
+
+ name = kstrndup(buf, len, GFP_KERNEL);
+ if (!name)
+ return -ENOMEM;
+
+ old_tag_ops = cpu_dp->tag_ops;
+ new_tag_ops = dsa_tag_driver_get_by_name(name);
+ kfree(name);
+ /* Bad tagger name? */
+ if (IS_ERR(new_tag_ops))
+ return PTR_ERR(new_tag_ops);
+
+ if (new_tag_ops == old_tag_ops)
+ /* Drop the temporarily held duplicate reference, since
+ * the DSA switch tree uses this tagger.
+ */
+ goto out;
+
+ err = dsa_tree_change_tag_proto(cpu_dp->ds->dst, new_tag_ops,
+ old_tag_ops);
+ if (err) {
+ /* On failure the old tagger is restored, so we don't need the
+ * driver for the new one.
+ */
+ dsa_tag_driver_put(new_tag_ops);
+ return err;
+ }
+
+ /* On success we no longer need the module for the old tagging protocol
+ */
+out:
+ dsa_tag_driver_put(old_tag_ops);
+ return count;
+}
+static DEVICE_ATTR_RW(tagging);
+
+static struct attribute *dsa_user_attrs[] = {
+ &dev_attr_tagging.attr,
+ NULL
+};
+
+static const struct attribute_group dsa_group = {
+ .name = "dsa",
+ .attrs = dsa_user_attrs,
+};
+
+static void dsa_conduit_reset_mtu(struct net_device *dev)
+{
+ int err;
+
+ err = dev_set_mtu(dev, ETH_DATA_LEN);
+ if (err)
+ netdev_dbg(dev,
+ "Unable to reset MTU to exclude DSA overheads\n");
+}
+
+int dsa_conduit_setup(struct net_device *dev, struct dsa_port *cpu_dp)
+{
+ const struct dsa_device_ops *tag_ops = cpu_dp->tag_ops;
+ struct dsa_switch *ds = cpu_dp->ds;
+ struct device_link *consumer_link;
+ int mtu, ret;
+
+ mtu = ETH_DATA_LEN + dsa_tag_protocol_overhead(tag_ops);
+
+ /* The DSA conduit must use SET_NETDEV_DEV for this to work. */
+ if (!netif_is_lag_master(dev)) {
+ consumer_link = device_link_add(ds->dev, dev->dev.parent,
+ DL_FLAG_AUTOREMOVE_CONSUMER);
+ if (!consumer_link)
+ netdev_err(dev,
+ "Failed to create a device link to DSA switch %s\n",
+ dev_name(ds->dev));
+ }
+
+ /* The switch driver may not implement ->port_change_mtu(), case in
+ * which dsa_user_change_mtu() will not update the conduit MTU either,
+ * so we need to do that here.
+ */
+ ret = dev_set_mtu(dev, mtu);
+ if (ret)
+ netdev_warn(dev, "error %d setting MTU to %d to include DSA overhead\n",
+ ret, mtu);
+
+ /* If we use a tagging format that doesn't have an ethertype
+ * field, make sure that all packets from this point on get
+ * sent to the tag format's receive function.
+ */
+ wmb();
+
+ dev->dsa_ptr = cpu_dp;
+
+ dsa_conduit_set_promiscuity(dev, 1);
+
+ ret = dsa_conduit_ethtool_setup(dev);
+ if (ret)
+ goto out_err_reset_promisc;
+
+ ret = sysfs_create_group(&dev->dev.kobj, &dsa_group);
+ if (ret)
+ goto out_err_ethtool_teardown;
+
+ return ret;
+
+out_err_ethtool_teardown:
+ dsa_conduit_ethtool_teardown(dev);
+out_err_reset_promisc:
+ dsa_conduit_set_promiscuity(dev, -1);
+ return ret;
+}
+
+void dsa_conduit_teardown(struct net_device *dev)
+{
+ sysfs_remove_group(&dev->dev.kobj, &dsa_group);
+ dsa_conduit_ethtool_teardown(dev);
+ dsa_conduit_reset_mtu(dev);
+ dsa_conduit_set_promiscuity(dev, -1);
+
+ dev->dsa_ptr = NULL;
+
+ /* If we used a tagging format that doesn't have an ethertype
+ * field, make sure that all packets from this point get sent
+ * without the tag and go through the regular receive path.
+ */
+ wmb();
+}
+
+int dsa_conduit_lag_setup(struct net_device *lag_dev, struct dsa_port *cpu_dp,
+ struct netdev_lag_upper_info *uinfo,
+ struct netlink_ext_ack *extack)
+{
+ bool conduit_setup = false;
+ int err;
+
+ if (!netdev_uses_dsa(lag_dev)) {
+ err = dsa_conduit_setup(lag_dev, cpu_dp);
+ if (err)
+ return err;
+
+ conduit_setup = true;
+ }
+
+ err = dsa_port_lag_join(cpu_dp, lag_dev, uinfo, extack);
+ if (err) {
+ NL_SET_ERR_MSG_WEAK_MOD(extack, "CPU port failed to join LAG");
+ goto out_conduit_teardown;
+ }
+
+ return 0;
+
+out_conduit_teardown:
+ if (conduit_setup)
+ dsa_conduit_teardown(lag_dev);
+ return err;
+}
+
+/* Tear down a conduit if there isn't any other user port on it,
+ * optionally also destroying LAG information.
+ */
+void dsa_conduit_lag_teardown(struct net_device *lag_dev,
+ struct dsa_port *cpu_dp)
+{
+ struct net_device *upper;
+ struct list_head *iter;
+
+ dsa_port_lag_leave(cpu_dp, lag_dev);
+
+ netdev_for_each_upper_dev_rcu(lag_dev, upper, iter)
+ if (dsa_user_dev_check(upper))
+ return;
+
+ dsa_conduit_teardown(lag_dev);
+}
diff --git a/net/dsa/conduit.h b/net/dsa/conduit.h
new file mode 100644
index 000000000000..31f8834f54bb
--- /dev/null
+++ b/net/dsa/conduit.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifndef __DSA_CONDUIT_H
+#define __DSA_CONDUIT_H
+
+struct dsa_port;
+struct net_device;
+struct netdev_lag_upper_info;
+struct netlink_ext_ack;
+
+int dsa_conduit_setup(struct net_device *dev, struct dsa_port *cpu_dp);
+void dsa_conduit_teardown(struct net_device *dev);
+int dsa_conduit_lag_setup(struct net_device *lag_dev, struct dsa_port *cpu_dp,
+ struct netdev_lag_upper_info *uinfo,
+ struct netlink_ext_ack *extack);
+void dsa_conduit_lag_teardown(struct net_device *lag_dev,
+ struct dsa_port *cpu_dp);
+int __dsa_conduit_hwtstamp_validate(struct net_device *dev,
+ const struct kernel_hwtstamp_config *config,
+ struct netlink_ext_ack *extack);
+
+#endif
diff --git a/net/dsa/devlink.c b/net/dsa/devlink.c
new file mode 100644
index 000000000000..ed342f345692
--- /dev/null
+++ b/net/dsa/devlink.c
@@ -0,0 +1,402 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * DSA devlink handling
+ */
+
+#include <net/dsa.h>
+#include <net/devlink.h>
+
+#include "devlink.h"
+
+static int dsa_devlink_info_get(struct devlink *dl,
+ struct devlink_info_req *req,
+ struct netlink_ext_ack *extack)
+{
+ struct dsa_switch *ds = dsa_devlink_to_ds(dl);
+
+ if (ds->ops->devlink_info_get)
+ return ds->ops->devlink_info_get(ds, req, extack);
+
+ return -EOPNOTSUPP;
+}
+
+static int dsa_devlink_sb_pool_get(struct devlink *dl,
+ unsigned int sb_index, u16 pool_index,
+ struct devlink_sb_pool_info *pool_info)
+{
+ struct dsa_switch *ds = dsa_devlink_to_ds(dl);
+
+ if (!ds->ops->devlink_sb_pool_get)
+ return -EOPNOTSUPP;
+
+ return ds->ops->devlink_sb_pool_get(ds, sb_index, pool_index,
+ pool_info);
+}
+
+static int dsa_devlink_sb_pool_set(struct devlink *dl, unsigned int sb_index,
+ u16 pool_index, u32 size,
+ enum devlink_sb_threshold_type threshold_type,
+ struct netlink_ext_ack *extack)
+{
+ struct dsa_switch *ds = dsa_devlink_to_ds(dl);
+
+ if (!ds->ops->devlink_sb_pool_set)
+ return -EOPNOTSUPP;
+
+ return ds->ops->devlink_sb_pool_set(ds, sb_index, pool_index, size,
+ threshold_type, extack);
+}
+
+static int dsa_devlink_sb_port_pool_get(struct devlink_port *dlp,
+ unsigned int sb_index, u16 pool_index,
+ u32 *p_threshold)
+{
+ struct dsa_switch *ds = dsa_devlink_port_to_ds(dlp);
+ int port = dsa_devlink_port_to_port(dlp);
+
+ if (!ds->ops->devlink_sb_port_pool_get)
+ return -EOPNOTSUPP;
+
+ return ds->ops->devlink_sb_port_pool_get(ds, port, sb_index,
+ pool_index, p_threshold);
+}
+
+static int dsa_devlink_sb_port_pool_set(struct devlink_port *dlp,
+ unsigned int sb_index, u16 pool_index,
+ u32 threshold,
+ struct netlink_ext_ack *extack)
+{
+ struct dsa_switch *ds = dsa_devlink_port_to_ds(dlp);
+ int port = dsa_devlink_port_to_port(dlp);
+
+ if (!ds->ops->devlink_sb_port_pool_set)
+ return -EOPNOTSUPP;
+
+ return ds->ops->devlink_sb_port_pool_set(ds, port, sb_index,
+ pool_index, threshold, extack);
+}
+
+static int
+dsa_devlink_sb_tc_pool_bind_get(struct devlink_port *dlp,
+ unsigned int sb_index, u16 tc_index,
+ enum devlink_sb_pool_type pool_type,
+ u16 *p_pool_index, u32 *p_threshold)
+{
+ struct dsa_switch *ds = dsa_devlink_port_to_ds(dlp);
+ int port = dsa_devlink_port_to_port(dlp);
+
+ if (!ds->ops->devlink_sb_tc_pool_bind_get)
+ return -EOPNOTSUPP;
+
+ return ds->ops->devlink_sb_tc_pool_bind_get(ds, port, sb_index,
+ tc_index, pool_type,
+ p_pool_index, p_threshold);
+}
+
+static int
+dsa_devlink_sb_tc_pool_bind_set(struct devlink_port *dlp,
+ unsigned int sb_index, u16 tc_index,
+ enum devlink_sb_pool_type pool_type,
+ u16 pool_index, u32 threshold,
+ struct netlink_ext_ack *extack)
+{
+ struct dsa_switch *ds = dsa_devlink_port_to_ds(dlp);
+ int port = dsa_devlink_port_to_port(dlp);
+
+ if (!ds->ops->devlink_sb_tc_pool_bind_set)
+ return -EOPNOTSUPP;
+
+ return ds->ops->devlink_sb_tc_pool_bind_set(ds, port, sb_index,
+ tc_index, pool_type,
+ pool_index, threshold,
+ extack);
+}
+
+static int dsa_devlink_sb_occ_snapshot(struct devlink *dl,
+ unsigned int sb_index)
+{
+ struct dsa_switch *ds = dsa_devlink_to_ds(dl);
+
+ if (!ds->ops->devlink_sb_occ_snapshot)
+ return -EOPNOTSUPP;
+
+ return ds->ops->devlink_sb_occ_snapshot(ds, sb_index);
+}
+
+static int dsa_devlink_sb_occ_max_clear(struct devlink *dl,
+ unsigned int sb_index)
+{
+ struct dsa_switch *ds = dsa_devlink_to_ds(dl);
+
+ if (!ds->ops->devlink_sb_occ_max_clear)
+ return -EOPNOTSUPP;
+
+ return ds->ops->devlink_sb_occ_max_clear(ds, sb_index);
+}
+
+static int dsa_devlink_sb_occ_port_pool_get(struct devlink_port *dlp,
+ unsigned int sb_index,
+ u16 pool_index, u32 *p_cur,
+ u32 *p_max)
+{
+ struct dsa_switch *ds = dsa_devlink_port_to_ds(dlp);
+ int port = dsa_devlink_port_to_port(dlp);
+
+ if (!ds->ops->devlink_sb_occ_port_pool_get)
+ return -EOPNOTSUPP;
+
+ return ds->ops->devlink_sb_occ_port_pool_get(ds, port, sb_index,
+ pool_index, p_cur, p_max);
+}
+
+static int
+dsa_devlink_sb_occ_tc_port_bind_get(struct devlink_port *dlp,
+ unsigned int sb_index, u16 tc_index,
+ enum devlink_sb_pool_type pool_type,
+ u32 *p_cur, u32 *p_max)
+{
+ struct dsa_switch *ds = dsa_devlink_port_to_ds(dlp);
+ int port = dsa_devlink_port_to_port(dlp);
+
+ if (!ds->ops->devlink_sb_occ_tc_port_bind_get)
+ return -EOPNOTSUPP;
+
+ return ds->ops->devlink_sb_occ_tc_port_bind_get(ds, port,
+ sb_index, tc_index,
+ pool_type, p_cur,
+ p_max);
+}
+
+static const struct devlink_ops dsa_devlink_ops = {
+ .info_get = dsa_devlink_info_get,
+ .sb_pool_get = dsa_devlink_sb_pool_get,
+ .sb_pool_set = dsa_devlink_sb_pool_set,
+ .sb_port_pool_get = dsa_devlink_sb_port_pool_get,
+ .sb_port_pool_set = dsa_devlink_sb_port_pool_set,
+ .sb_tc_pool_bind_get = dsa_devlink_sb_tc_pool_bind_get,
+ .sb_tc_pool_bind_set = dsa_devlink_sb_tc_pool_bind_set,
+ .sb_occ_snapshot = dsa_devlink_sb_occ_snapshot,
+ .sb_occ_max_clear = dsa_devlink_sb_occ_max_clear,
+ .sb_occ_port_pool_get = dsa_devlink_sb_occ_port_pool_get,
+ .sb_occ_tc_port_bind_get = dsa_devlink_sb_occ_tc_port_bind_get,
+};
+
+int dsa_devlink_param_get(struct devlink *dl, u32 id,
+ struct devlink_param_gset_ctx *ctx,
+ struct netlink_ext_ack *extack)
+{
+ struct dsa_switch *ds = dsa_devlink_to_ds(dl);
+
+ if (!ds->ops->devlink_param_get)
+ return -EOPNOTSUPP;
+
+ return ds->ops->devlink_param_get(ds, id, ctx);
+}
+EXPORT_SYMBOL_GPL(dsa_devlink_param_get);
+
+int dsa_devlink_param_set(struct devlink *dl, u32 id,
+ struct devlink_param_gset_ctx *ctx,
+ struct netlink_ext_ack *extack)
+{
+ struct dsa_switch *ds = dsa_devlink_to_ds(dl);
+
+ if (!ds->ops->devlink_param_set)
+ return -EOPNOTSUPP;
+
+ return ds->ops->devlink_param_set(ds, id, ctx);
+}
+EXPORT_SYMBOL_GPL(dsa_devlink_param_set);
+
+int dsa_devlink_params_register(struct dsa_switch *ds,
+ const struct devlink_param *params,
+ size_t params_count)
+{
+ return devlink_params_register(ds->devlink, params, params_count);
+}
+EXPORT_SYMBOL_GPL(dsa_devlink_params_register);
+
+void dsa_devlink_params_unregister(struct dsa_switch *ds,
+ const struct devlink_param *params,
+ size_t params_count)
+{
+ devlink_params_unregister(ds->devlink, params, params_count);
+}
+EXPORT_SYMBOL_GPL(dsa_devlink_params_unregister);
+
+int dsa_devlink_resource_register(struct dsa_switch *ds,
+ const char *resource_name,
+ u64 resource_size,
+ u64 resource_id,
+ u64 parent_resource_id,
+ const struct devlink_resource_size_params *size_params)
+{
+ int ret;
+
+ devl_lock(ds->devlink);
+ ret = devl_resource_register(ds->devlink, resource_name, resource_size,
+ resource_id, parent_resource_id,
+ size_params);
+ devl_unlock(ds->devlink);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(dsa_devlink_resource_register);
+
+void dsa_devlink_resources_unregister(struct dsa_switch *ds)
+{
+ devlink_resources_unregister(ds->devlink);
+}
+EXPORT_SYMBOL_GPL(dsa_devlink_resources_unregister);
+
+void dsa_devlink_resource_occ_get_register(struct dsa_switch *ds,
+ u64 resource_id,
+ devlink_resource_occ_get_t *occ_get,
+ void *occ_get_priv)
+{
+ devl_lock(ds->devlink);
+ devl_resource_occ_get_register(ds->devlink, resource_id, occ_get,
+ occ_get_priv);
+ devl_unlock(ds->devlink);
+}
+EXPORT_SYMBOL_GPL(dsa_devlink_resource_occ_get_register);
+
+void dsa_devlink_resource_occ_get_unregister(struct dsa_switch *ds,
+ u64 resource_id)
+{
+ devl_lock(ds->devlink);
+ devl_resource_occ_get_unregister(ds->devlink, resource_id);
+ devl_unlock(ds->devlink);
+}
+EXPORT_SYMBOL_GPL(dsa_devlink_resource_occ_get_unregister);
+
+struct devlink_region *
+dsa_devlink_region_create(struct dsa_switch *ds,
+ const struct devlink_region_ops *ops,
+ u32 region_max_snapshots, u64 region_size)
+{
+ return devlink_region_create(ds->devlink, ops, region_max_snapshots,
+ region_size);
+}
+EXPORT_SYMBOL_GPL(dsa_devlink_region_create);
+
+struct devlink_region *
+dsa_devlink_port_region_create(struct dsa_switch *ds,
+ int port,
+ const struct devlink_port_region_ops *ops,
+ u32 region_max_snapshots, u64 region_size)
+{
+ struct dsa_port *dp = dsa_to_port(ds, port);
+
+ return devlink_port_region_create(&dp->devlink_port, ops,
+ region_max_snapshots,
+ region_size);
+}
+EXPORT_SYMBOL_GPL(dsa_devlink_port_region_create);
+
+void dsa_devlink_region_destroy(struct devlink_region *region)
+{
+ devlink_region_destroy(region);
+}
+EXPORT_SYMBOL_GPL(dsa_devlink_region_destroy);
+
+int dsa_port_devlink_setup(struct dsa_port *dp)
+{
+ struct devlink_port *dlp = &dp->devlink_port;
+ struct dsa_switch_tree *dst = dp->ds->dst;
+ struct devlink_port_attrs attrs = {};
+ struct devlink *dl = dp->ds->devlink;
+ struct dsa_switch *ds = dp->ds;
+ const unsigned char *id;
+ unsigned char len;
+ int err;
+
+ memset(dlp, 0, sizeof(*dlp));
+ devlink_port_init(dl, dlp);
+
+ if (ds->ops->port_setup) {
+ err = ds->ops->port_setup(ds, dp->index);
+ if (err)
+ return err;
+ }
+
+ id = (const unsigned char *)&dst->index;
+ len = sizeof(dst->index);
+
+ attrs.phys.port_number = dp->index;
+ memcpy(attrs.switch_id.id, id, len);
+ attrs.switch_id.id_len = len;
+
+ switch (dp->type) {
+ case DSA_PORT_TYPE_UNUSED:
+ attrs.flavour = DEVLINK_PORT_FLAVOUR_UNUSED;
+ break;
+ case DSA_PORT_TYPE_CPU:
+ attrs.flavour = DEVLINK_PORT_FLAVOUR_CPU;
+ break;
+ case DSA_PORT_TYPE_DSA:
+ attrs.flavour = DEVLINK_PORT_FLAVOUR_DSA;
+ break;
+ case DSA_PORT_TYPE_USER:
+ attrs.flavour = DEVLINK_PORT_FLAVOUR_PHYSICAL;
+ break;
+ }
+
+ devlink_port_attrs_set(dlp, &attrs);
+ err = devlink_port_register(dl, dlp, dp->index);
+ if (err) {
+ if (ds->ops->port_teardown)
+ ds->ops->port_teardown(ds, dp->index);
+ return err;
+ }
+
+ return 0;
+}
+
+void dsa_port_devlink_teardown(struct dsa_port *dp)
+{
+ struct devlink_port *dlp = &dp->devlink_port;
+ struct dsa_switch *ds = dp->ds;
+
+ devlink_port_unregister(dlp);
+
+ if (ds->ops->port_teardown)
+ ds->ops->port_teardown(ds, dp->index);
+
+ devlink_port_fini(dlp);
+}
+
+void dsa_switch_devlink_register(struct dsa_switch *ds)
+{
+ devlink_register(ds->devlink);
+}
+
+void dsa_switch_devlink_unregister(struct dsa_switch *ds)
+{
+ devlink_unregister(ds->devlink);
+}
+
+int dsa_switch_devlink_alloc(struct dsa_switch *ds)
+{
+ struct dsa_devlink_priv *dl_priv;
+ struct devlink *dl;
+
+ /* Add the switch to devlink before calling setup, so that setup can
+ * add dpipe tables
+ */
+ dl = devlink_alloc(&dsa_devlink_ops, sizeof(*dl_priv), ds->dev);
+ if (!dl)
+ return -ENOMEM;
+
+ ds->devlink = dl;
+
+ dl_priv = devlink_priv(ds->devlink);
+ dl_priv->ds = ds;
+
+ return 0;
+}
+
+void dsa_switch_devlink_free(struct dsa_switch *ds)
+{
+ devlink_free(ds->devlink);
+ ds->devlink = NULL;
+}
diff --git a/net/dsa/devlink.h b/net/dsa/devlink.h
new file mode 100644
index 000000000000..4d9f4f23705b
--- /dev/null
+++ b/net/dsa/devlink.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifndef __DSA_DEVLINK_H
+#define __DSA_DEVLINK_H
+
+struct dsa_port;
+struct dsa_switch;
+
+int dsa_port_devlink_setup(struct dsa_port *dp);
+void dsa_port_devlink_teardown(struct dsa_port *dp);
+void dsa_switch_devlink_register(struct dsa_switch *ds);
+void dsa_switch_devlink_unregister(struct dsa_switch *ds);
+int dsa_switch_devlink_alloc(struct dsa_switch *ds);
+void dsa_switch_devlink_free(struct dsa_switch *ds);
+
+#endif
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
new file mode 100644
index 000000000000..a20efabe778f
--- /dev/null
+++ b/net/dsa/dsa.c
@@ -0,0 +1,1897 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * DSA topology and switch handling
+ *
+ * Copyright (c) 2008-2009 Marvell Semiconductor
+ * Copyright (c) 2013 Florian Fainelli <florian@openwrt.org>
+ * Copyright (c) 2016 Andrew Lunn <andrew@lunn.ch>
+ */
+
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/if_hsr.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/slab.h>
+#include <linux/rtnetlink.h>
+#include <linux/of.h>
+#include <linux/of_net.h>
+#include <net/dsa_stubs.h>
+#include <net/sch_generic.h>
+
+#include "conduit.h"
+#include "devlink.h"
+#include "dsa.h"
+#include "netlink.h"
+#include "port.h"
+#include "switch.h"
+#include "tag.h"
+#include "user.h"
+
+#define DSA_MAX_NUM_OFFLOADING_BRIDGES BITS_PER_LONG
+
+static DEFINE_MUTEX(dsa2_mutex);
+LIST_HEAD(dsa_tree_list);
+
+static struct workqueue_struct *dsa_owq;
+
+/* Track the bridges with forwarding offload enabled */
+static unsigned long dsa_fwd_offloading_bridges;
+
+bool dsa_schedule_work(struct work_struct *work)
+{
+ return queue_work(dsa_owq, work);
+}
+
+void dsa_flush_workqueue(void)
+{
+ flush_workqueue(dsa_owq);
+}
+EXPORT_SYMBOL_GPL(dsa_flush_workqueue);
+
+/**
+ * dsa_lag_map() - Map LAG structure to a linear LAG array
+ * @dst: Tree in which to record the mapping.
+ * @lag: LAG structure that is to be mapped to the tree's array.
+ *
+ * dsa_lag_id/dsa_lag_by_id can then be used to translate between the
+ * two spaces. The size of the mapping space is determined by the
+ * driver by setting ds->num_lag_ids. It is perfectly legal to leave
+ * it unset if it is not needed, in which case these functions become
+ * no-ops.
+ */
+void dsa_lag_map(struct dsa_switch_tree *dst, struct dsa_lag *lag)
+{
+ unsigned int id;
+
+ for (id = 1; id <= dst->lags_len; id++) {
+ if (!dsa_lag_by_id(dst, id)) {
+ dst->lags[id - 1] = lag;
+ lag->id = id;
+ return;
+ }
+ }
+
+ /* No IDs left, which is OK. Some drivers do not need it. The
+ * ones that do, e.g. mv88e6xxx, will discover that dsa_lag_id
+ * returns an error for this device when joining the LAG. The
+ * driver can then return -EOPNOTSUPP back to DSA, which will
+ * fall back to a software LAG.
+ */
+}
+
+/**
+ * dsa_lag_unmap() - Remove a LAG ID mapping
+ * @dst: Tree in which the mapping is recorded.
+ * @lag: LAG structure that was mapped.
+ *
+ * As there may be multiple users of the mapping, it is only removed
+ * if there are no other references to it.
+ */
+void dsa_lag_unmap(struct dsa_switch_tree *dst, struct dsa_lag *lag)
+{
+ unsigned int id;
+
+ dsa_lags_foreach_id(id, dst) {
+ if (dsa_lag_by_id(dst, id) == lag) {
+ dst->lags[id - 1] = NULL;
+ lag->id = 0;
+ break;
+ }
+ }
+}
+
+struct dsa_lag *dsa_tree_lag_find(struct dsa_switch_tree *dst,
+ const struct net_device *lag_dev)
+{
+ struct dsa_port *dp;
+
+ list_for_each_entry(dp, &dst->ports, list)
+ if (dsa_port_lag_dev_get(dp) == lag_dev)
+ return dp->lag;
+
+ return NULL;
+}
+
+struct dsa_bridge *dsa_tree_bridge_find(struct dsa_switch_tree *dst,
+ const struct net_device *br)
+{
+ struct dsa_port *dp;
+
+ list_for_each_entry(dp, &dst->ports, list)
+ if (dsa_port_bridge_dev_get(dp) == br)
+ return dp->bridge;
+
+ return NULL;
+}
+
+static int dsa_bridge_num_find(const struct net_device *bridge_dev)
+{
+ struct dsa_switch_tree *dst;
+
+ list_for_each_entry(dst, &dsa_tree_list, list) {
+ struct dsa_bridge *bridge;
+
+ bridge = dsa_tree_bridge_find(dst, bridge_dev);
+ if (bridge)
+ return bridge->num;
+ }
+
+ return 0;
+}
+
+unsigned int dsa_bridge_num_get(const struct net_device *bridge_dev, int max)
+{
+ unsigned int bridge_num = dsa_bridge_num_find(bridge_dev);
+
+ /* Switches without FDB isolation support don't get unique
+ * bridge numbering
+ */
+ if (!max)
+ return 0;
+
+ if (!bridge_num) {
+ /* First port that requests FDB isolation or TX forwarding
+ * offload for this bridge
+ */
+ bridge_num = find_next_zero_bit(&dsa_fwd_offloading_bridges,
+ DSA_MAX_NUM_OFFLOADING_BRIDGES,
+ 1);
+ if (bridge_num >= max)
+ return 0;
+
+ set_bit(bridge_num, &dsa_fwd_offloading_bridges);
+ }
+
+ return bridge_num;
+}
+
+void dsa_bridge_num_put(const struct net_device *bridge_dev,
+ unsigned int bridge_num)
+{
+ /* Since we refcount bridges, we know that when we call this function
+ * it is no longer in use, so we can just go ahead and remove it from
+ * the bit mask.
+ */
+ clear_bit(bridge_num, &dsa_fwd_offloading_bridges);
+}
+
+struct dsa_switch *dsa_switch_find(int tree_index, int sw_index)
+{
+ struct dsa_switch_tree *dst;
+ struct dsa_port *dp;
+
+ list_for_each_entry(dst, &dsa_tree_list, list) {
+ if (dst->index != tree_index)
+ continue;
+
+ list_for_each_entry(dp, &dst->ports, list) {
+ if (dp->ds->index != sw_index)
+ continue;
+
+ return dp->ds;
+ }
+ }
+
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(dsa_switch_find);
+
+static struct dsa_switch_tree *dsa_tree_find(int index)
+{
+ struct dsa_switch_tree *dst;
+
+ list_for_each_entry(dst, &dsa_tree_list, list)
+ if (dst->index == index)
+ return dst;
+
+ return NULL;
+}
+
+static struct dsa_switch_tree *dsa_tree_alloc(int index)
+{
+ struct dsa_switch_tree *dst;
+
+ dst = kzalloc(sizeof(*dst), GFP_KERNEL);
+ if (!dst)
+ return NULL;
+
+ dst->index = index;
+
+ INIT_LIST_HEAD(&dst->rtable);
+
+ INIT_LIST_HEAD(&dst->ports);
+
+ INIT_LIST_HEAD(&dst->list);
+ list_add_tail(&dst->list, &dsa_tree_list);
+
+ kref_init(&dst->refcount);
+
+ return dst;
+}
+
+static void dsa_tree_free(struct dsa_switch_tree *dst)
+{
+ if (dst->tag_ops)
+ dsa_tag_driver_put(dst->tag_ops);
+ list_del(&dst->list);
+ kfree(dst);
+}
+
+static struct dsa_switch_tree *dsa_tree_get(struct dsa_switch_tree *dst)
+{
+ if (dst)
+ kref_get(&dst->refcount);
+
+ return dst;
+}
+
+static struct dsa_switch_tree *dsa_tree_touch(int index)
+{
+ struct dsa_switch_tree *dst;
+
+ dst = dsa_tree_find(index);
+ if (dst)
+ return dsa_tree_get(dst);
+ else
+ return dsa_tree_alloc(index);
+}
+
+static void dsa_tree_release(struct kref *ref)
+{
+ struct dsa_switch_tree *dst;
+
+ dst = container_of(ref, struct dsa_switch_tree, refcount);
+
+ dsa_tree_free(dst);
+}
+
+static void dsa_tree_put(struct dsa_switch_tree *dst)
+{
+ if (dst)
+ kref_put(&dst->refcount, dsa_tree_release);
+}
+
+static struct dsa_port *dsa_tree_find_port_by_node(struct dsa_switch_tree *dst,
+ struct device_node *dn)
+{
+ struct dsa_port *dp;
+
+ list_for_each_entry(dp, &dst->ports, list)
+ if (dp->dn == dn)
+ return dp;
+
+ return NULL;
+}
+
+static struct dsa_link *dsa_link_touch(struct dsa_port *dp,
+ struct dsa_port *link_dp)
+{
+ struct dsa_switch *ds = dp->ds;
+ struct dsa_switch_tree *dst;
+ struct dsa_link *dl;
+
+ dst = ds->dst;
+
+ list_for_each_entry(dl, &dst->rtable, list)
+ if (dl->dp == dp && dl->link_dp == link_dp)
+ return dl;
+
+ dl = kzalloc(sizeof(*dl), GFP_KERNEL);
+ if (!dl)
+ return NULL;
+
+ dl->dp = dp;
+ dl->link_dp = link_dp;
+
+ INIT_LIST_HEAD(&dl->list);
+ list_add_tail(&dl->list, &dst->rtable);
+
+ return dl;
+}
+
+static bool dsa_port_setup_routing_table(struct dsa_port *dp)
+{
+ struct dsa_switch *ds = dp->ds;
+ struct dsa_switch_tree *dst = ds->dst;
+ struct device_node *dn = dp->dn;
+ struct of_phandle_iterator it;
+ struct dsa_port *link_dp;
+ struct dsa_link *dl;
+ int err;
+
+ of_for_each_phandle(&it, err, dn, "link", NULL, 0) {
+ link_dp = dsa_tree_find_port_by_node(dst, it.node);
+ if (!link_dp) {
+ of_node_put(it.node);
+ return false;
+ }
+
+ dl = dsa_link_touch(dp, link_dp);
+ if (!dl) {
+ of_node_put(it.node);
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static bool dsa_tree_setup_routing_table(struct dsa_switch_tree *dst)
+{
+ bool complete = true;
+ struct dsa_port *dp;
+
+ list_for_each_entry(dp, &dst->ports, list) {
+ if (dsa_port_is_dsa(dp)) {
+ complete = dsa_port_setup_routing_table(dp);
+ if (!complete)
+ break;
+ }
+ }
+
+ return complete;
+}
+
+static struct dsa_port *dsa_tree_find_first_cpu(struct dsa_switch_tree *dst)
+{
+ struct dsa_port *dp;
+
+ list_for_each_entry(dp, &dst->ports, list)
+ if (dsa_port_is_cpu(dp))
+ return dp;
+
+ return NULL;
+}
+
+struct net_device *dsa_tree_find_first_conduit(struct dsa_switch_tree *dst)
+{
+ struct device_node *ethernet;
+ struct net_device *conduit;
+ struct dsa_port *cpu_dp;
+
+ cpu_dp = dsa_tree_find_first_cpu(dst);
+ ethernet = of_parse_phandle(cpu_dp->dn, "ethernet", 0);
+ conduit = of_find_net_device_by_node(ethernet);
+ of_node_put(ethernet);
+
+ return conduit;
+}
+
+/* Assign the default CPU port (the first one in the tree) to all ports of the
+ * fabric which don't already have one as part of their own switch.
+ */
+static int dsa_tree_setup_default_cpu(struct dsa_switch_tree *dst)
+{
+ struct dsa_port *cpu_dp, *dp;
+
+ cpu_dp = dsa_tree_find_first_cpu(dst);
+ if (!cpu_dp) {
+ pr_err("DSA: tree %d has no CPU port\n", dst->index);
+ return -EINVAL;
+ }
+
+ list_for_each_entry(dp, &dst->ports, list) {
+ if (dp->cpu_dp)
+ continue;
+
+ if (dsa_port_is_user(dp) || dsa_port_is_dsa(dp))
+ dp->cpu_dp = cpu_dp;
+ }
+
+ return 0;
+}
+
+static struct dsa_port *
+dsa_switch_preferred_default_local_cpu_port(struct dsa_switch *ds)
+{
+ struct dsa_port *cpu_dp;
+
+ if (!ds->ops->preferred_default_local_cpu_port)
+ return NULL;
+
+ cpu_dp = ds->ops->preferred_default_local_cpu_port(ds);
+ if (!cpu_dp)
+ return NULL;
+
+ if (WARN_ON(!dsa_port_is_cpu(cpu_dp) || cpu_dp->ds != ds))
+ return NULL;
+
+ return cpu_dp;
+}
+
+/* Perform initial assignment of CPU ports to user ports and DSA links in the
+ * fabric, giving preference to CPU ports local to each switch. Default to
+ * using the first CPU port in the switch tree if the port does not have a CPU
+ * port local to this switch.
+ */
+static int dsa_tree_setup_cpu_ports(struct dsa_switch_tree *dst)
+{
+ struct dsa_port *preferred_cpu_dp, *cpu_dp, *dp;
+
+ list_for_each_entry(cpu_dp, &dst->ports, list) {
+ if (!dsa_port_is_cpu(cpu_dp))
+ continue;
+
+ preferred_cpu_dp = dsa_switch_preferred_default_local_cpu_port(cpu_dp->ds);
+ if (preferred_cpu_dp && preferred_cpu_dp != cpu_dp)
+ continue;
+
+ /* Prefer a local CPU port */
+ dsa_switch_for_each_port(dp, cpu_dp->ds) {
+ /* Prefer the first local CPU port found */
+ if (dp->cpu_dp)
+ continue;
+
+ if (dsa_port_is_user(dp) || dsa_port_is_dsa(dp))
+ dp->cpu_dp = cpu_dp;
+ }
+ }
+
+ return dsa_tree_setup_default_cpu(dst);
+}
+
+static void dsa_tree_teardown_cpu_ports(struct dsa_switch_tree *dst)
+{
+ struct dsa_port *dp;
+
+ list_for_each_entry(dp, &dst->ports, list)
+ if (dsa_port_is_user(dp) || dsa_port_is_dsa(dp))
+ dp->cpu_dp = NULL;
+}
+
+static int dsa_port_setup(struct dsa_port *dp)
+{
+ bool dsa_port_link_registered = false;
+ struct dsa_switch *ds = dp->ds;
+ bool dsa_port_enabled = false;
+ int err = 0;
+
+ if (dp->setup)
+ return 0;
+
+ err = dsa_port_devlink_setup(dp);
+ if (err)
+ return err;
+
+ switch (dp->type) {
+ case DSA_PORT_TYPE_UNUSED:
+ dsa_port_disable(dp);
+ break;
+ case DSA_PORT_TYPE_CPU:
+ if (dp->dn) {
+ err = dsa_shared_port_link_register_of(dp);
+ if (err)
+ break;
+ dsa_port_link_registered = true;
+ } else {
+ dev_warn(ds->dev,
+ "skipping link registration for CPU port %d\n",
+ dp->index);
+ }
+
+ err = dsa_port_enable(dp, NULL);
+ if (err)
+ break;
+ dsa_port_enabled = true;
+
+ break;
+ case DSA_PORT_TYPE_DSA:
+ if (dp->dn) {
+ err = dsa_shared_port_link_register_of(dp);
+ if (err)
+ break;
+ dsa_port_link_registered = true;
+ } else {
+ dev_warn(ds->dev,
+ "skipping link registration for DSA port %d\n",
+ dp->index);
+ }
+
+ err = dsa_port_enable(dp, NULL);
+ if (err)
+ break;
+ dsa_port_enabled = true;
+
+ break;
+ case DSA_PORT_TYPE_USER:
+ of_get_mac_address(dp->dn, dp->mac);
+ err = dsa_user_create(dp);
+ break;
+ }
+
+ if (err && dsa_port_enabled)
+ dsa_port_disable(dp);
+ if (err && dsa_port_link_registered)
+ dsa_shared_port_link_unregister_of(dp);
+ if (err) {
+ dsa_port_devlink_teardown(dp);
+ return err;
+ }
+
+ dp->setup = true;
+
+ return 0;
+}
+
+static void dsa_port_teardown(struct dsa_port *dp)
+{
+ if (!dp->setup)
+ return;
+
+ switch (dp->type) {
+ case DSA_PORT_TYPE_UNUSED:
+ break;
+ case DSA_PORT_TYPE_CPU:
+ dsa_port_disable(dp);
+ if (dp->dn)
+ dsa_shared_port_link_unregister_of(dp);
+ break;
+ case DSA_PORT_TYPE_DSA:
+ dsa_port_disable(dp);
+ if (dp->dn)
+ dsa_shared_port_link_unregister_of(dp);
+ break;
+ case DSA_PORT_TYPE_USER:
+ if (dp->user) {
+ dsa_user_destroy(dp->user);
+ dp->user = NULL;
+ }
+ break;
+ }
+
+ dsa_port_devlink_teardown(dp);
+
+ dp->setup = false;
+}
+
+static int dsa_port_setup_as_unused(struct dsa_port *dp)
+{
+ dp->type = DSA_PORT_TYPE_UNUSED;
+ return dsa_port_setup(dp);
+}
+
+static int dsa_switch_setup_tag_protocol(struct dsa_switch *ds)
+{
+ const struct dsa_device_ops *tag_ops = ds->dst->tag_ops;
+ struct dsa_switch_tree *dst = ds->dst;
+ int err;
+
+ if (tag_ops->proto == dst->default_proto)
+ goto connect;
+
+ rtnl_lock();
+ err = ds->ops->change_tag_protocol(ds, tag_ops->proto);
+ rtnl_unlock();
+ if (err) {
+ dev_err(ds->dev, "Unable to use tag protocol \"%s\": %pe\n",
+ tag_ops->name, ERR_PTR(err));
+ return err;
+ }
+
+connect:
+ if (tag_ops->connect) {
+ err = tag_ops->connect(ds);
+ if (err)
+ return err;
+ }
+
+ if (ds->ops->connect_tag_protocol) {
+ err = ds->ops->connect_tag_protocol(ds, tag_ops->proto);
+ if (err) {
+ dev_err(ds->dev,
+ "Unable to connect to tag protocol \"%s\": %pe\n",
+ tag_ops->name, ERR_PTR(err));
+ goto disconnect;
+ }
+ }
+
+ return 0;
+
+disconnect:
+ if (tag_ops->disconnect)
+ tag_ops->disconnect(ds);
+
+ return err;
+}
+
+static void dsa_switch_teardown_tag_protocol(struct dsa_switch *ds)
+{
+ const struct dsa_device_ops *tag_ops = ds->dst->tag_ops;
+
+ if (tag_ops->disconnect)
+ tag_ops->disconnect(ds);
+}
+
+static int dsa_switch_setup(struct dsa_switch *ds)
+{
+ int err;
+
+ if (ds->setup)
+ return 0;
+
+ /* Initialize ds->phys_mii_mask before registering the user MDIO bus
+ * driver and before ops->setup() has run, since the switch drivers and
+ * the user MDIO bus driver rely on these values for probing PHY
+ * devices or not
+ */
+ ds->phys_mii_mask |= dsa_user_ports(ds);
+
+ err = dsa_switch_devlink_alloc(ds);
+ if (err)
+ return err;
+
+ err = dsa_switch_register_notifier(ds);
+ if (err)
+ goto devlink_free;
+
+ ds->configure_vlan_while_not_filtering = true;
+
+ err = ds->ops->setup(ds);
+ if (err < 0)
+ goto unregister_notifier;
+
+ err = dsa_switch_setup_tag_protocol(ds);
+ if (err)
+ goto teardown;
+
+ if (!ds->user_mii_bus && ds->ops->phy_read) {
+ ds->user_mii_bus = mdiobus_alloc();
+ if (!ds->user_mii_bus) {
+ err = -ENOMEM;
+ goto teardown;
+ }
+
+ dsa_user_mii_bus_init(ds);
+
+ err = mdiobus_register(ds->user_mii_bus);
+ if (err < 0)
+ goto free_user_mii_bus;
+ }
+
+ dsa_switch_devlink_register(ds);
+
+ ds->setup = true;
+ return 0;
+
+free_user_mii_bus:
+ if (ds->user_mii_bus && ds->ops->phy_read)
+ mdiobus_free(ds->user_mii_bus);
+teardown:
+ if (ds->ops->teardown)
+ ds->ops->teardown(ds);
+unregister_notifier:
+ dsa_switch_unregister_notifier(ds);
+devlink_free:
+ dsa_switch_devlink_free(ds);
+ return err;
+}
+
+static void dsa_switch_teardown(struct dsa_switch *ds)
+{
+ if (!ds->setup)
+ return;
+
+ dsa_switch_devlink_unregister(ds);
+
+ if (ds->user_mii_bus && ds->ops->phy_read) {
+ mdiobus_unregister(ds->user_mii_bus);
+ mdiobus_free(ds->user_mii_bus);
+ ds->user_mii_bus = NULL;
+ }
+
+ dsa_switch_teardown_tag_protocol(ds);
+
+ if (ds->ops->teardown)
+ ds->ops->teardown(ds);
+
+ dsa_switch_unregister_notifier(ds);
+
+ dsa_switch_devlink_free(ds);
+
+ ds->setup = false;
+}
+
+/* First tear down the non-shared, then the shared ports. This ensures that
+ * all work items scheduled by our switchdev handlers for user ports have
+ * completed before we destroy the refcounting kept on the shared ports.
+ */
+static void dsa_tree_teardown_ports(struct dsa_switch_tree *dst)
+{
+ struct dsa_port *dp;
+
+ list_for_each_entry(dp, &dst->ports, list)
+ if (dsa_port_is_user(dp) || dsa_port_is_unused(dp))
+ dsa_port_teardown(dp);
+
+ dsa_flush_workqueue();
+
+ list_for_each_entry(dp, &dst->ports, list)
+ if (dsa_port_is_dsa(dp) || dsa_port_is_cpu(dp))
+ dsa_port_teardown(dp);
+}
+
+static void dsa_tree_teardown_switches(struct dsa_switch_tree *dst)
+{
+ struct dsa_port *dp;
+
+ list_for_each_entry(dp, &dst->ports, list)
+ dsa_switch_teardown(dp->ds);
+}
+
+/* Bring shared ports up first, then non-shared ports */
+static int dsa_tree_setup_ports(struct dsa_switch_tree *dst)
+{
+ struct dsa_port *dp;
+ int err = 0;
+
+ list_for_each_entry(dp, &dst->ports, list) {
+ if (dsa_port_is_dsa(dp) || dsa_port_is_cpu(dp)) {
+ err = dsa_port_setup(dp);
+ if (err)
+ goto teardown;
+ }
+ }
+
+ list_for_each_entry(dp, &dst->ports, list) {
+ if (dsa_port_is_user(dp) || dsa_port_is_unused(dp)) {
+ err = dsa_port_setup(dp);
+ if (err) {
+ err = dsa_port_setup_as_unused(dp);
+ if (err)
+ goto teardown;
+ }
+ }
+ }
+
+ return 0;
+
+teardown:
+ dsa_tree_teardown_ports(dst);
+
+ return err;
+}
+
+static int dsa_tree_setup_switches(struct dsa_switch_tree *dst)
+{
+ struct dsa_port *dp;
+ int err = 0;
+
+ list_for_each_entry(dp, &dst->ports, list) {
+ err = dsa_switch_setup(dp->ds);
+ if (err) {
+ dsa_tree_teardown_switches(dst);
+ break;
+ }
+ }
+
+ return err;
+}
+
+static int dsa_tree_setup_conduit(struct dsa_switch_tree *dst)
+{
+ struct dsa_port *cpu_dp;
+ int err = 0;
+
+ rtnl_lock();
+
+ dsa_tree_for_each_cpu_port(cpu_dp, dst) {
+ struct net_device *conduit = cpu_dp->conduit;
+ bool admin_up = (conduit->flags & IFF_UP) &&
+ !qdisc_tx_is_noop(conduit);
+
+ err = dsa_conduit_setup(conduit, cpu_dp);
+ if (err)
+ break;
+
+ /* Replay conduit state event */
+ dsa_tree_conduit_admin_state_change(dst, conduit, admin_up);
+ dsa_tree_conduit_oper_state_change(dst, conduit,
+ netif_oper_up(conduit));
+ }
+
+ rtnl_unlock();
+
+ return err;
+}
+
+static void dsa_tree_teardown_conduit(struct dsa_switch_tree *dst)
+{
+ struct dsa_port *cpu_dp;
+
+ rtnl_lock();
+
+ dsa_tree_for_each_cpu_port(cpu_dp, dst) {
+ struct net_device *conduit = cpu_dp->conduit;
+
+ /* Synthesizing an "admin down" state is sufficient for
+ * the switches to get a notification if the conduit is
+ * currently up and running.
+ */
+ dsa_tree_conduit_admin_state_change(dst, conduit, false);
+
+ dsa_conduit_teardown(conduit);
+ }
+
+ rtnl_unlock();
+}
+
+static int dsa_tree_setup_lags(struct dsa_switch_tree *dst)
+{
+ unsigned int len = 0;
+ struct dsa_port *dp;
+
+ list_for_each_entry(dp, &dst->ports, list) {
+ if (dp->ds->num_lag_ids > len)
+ len = dp->ds->num_lag_ids;
+ }
+
+ if (!len)
+ return 0;
+
+ dst->lags = kcalloc(len, sizeof(*dst->lags), GFP_KERNEL);
+ if (!dst->lags)
+ return -ENOMEM;
+
+ dst->lags_len = len;
+ return 0;
+}
+
+static void dsa_tree_teardown_lags(struct dsa_switch_tree *dst)
+{
+ kfree(dst->lags);
+}
+
+static void dsa_tree_teardown_routing_table(struct dsa_switch_tree *dst)
+{
+ struct dsa_link *dl, *next;
+
+ list_for_each_entry_safe(dl, next, &dst->rtable, list) {
+ list_del(&dl->list);
+ kfree(dl);
+ }
+}
+
+static int dsa_tree_setup(struct dsa_switch_tree *dst)
+{
+ bool complete;
+ int err;
+
+ if (dst->setup) {
+ pr_err("DSA: tree %d already setup! Disjoint trees?\n",
+ dst->index);
+ return -EEXIST;
+ }
+
+ complete = dsa_tree_setup_routing_table(dst);
+ if (!complete)
+ return 0;
+
+ err = dsa_tree_setup_cpu_ports(dst);
+ if (err)
+ goto teardown_rtable;
+
+ err = dsa_tree_setup_switches(dst);
+ if (err)
+ goto teardown_cpu_ports;
+
+ err = dsa_tree_setup_ports(dst);
+ if (err)
+ goto teardown_switches;
+
+ err = dsa_tree_setup_conduit(dst);
+ if (err)
+ goto teardown_ports;
+
+ err = dsa_tree_setup_lags(dst);
+ if (err)
+ goto teardown_conduit;
+
+ dst->setup = true;
+
+ pr_info("DSA: tree %d setup\n", dst->index);
+
+ return 0;
+
+teardown_conduit:
+ dsa_tree_teardown_conduit(dst);
+teardown_ports:
+ dsa_tree_teardown_ports(dst);
+teardown_switches:
+ dsa_tree_teardown_switches(dst);
+teardown_cpu_ports:
+ dsa_tree_teardown_cpu_ports(dst);
+teardown_rtable:
+ dsa_tree_teardown_routing_table(dst);
+
+ return err;
+}
+
+static void dsa_tree_teardown(struct dsa_switch_tree *dst)
+{
+ if (!dst->setup)
+ return;
+
+ dsa_tree_teardown_lags(dst);
+
+ dsa_tree_teardown_conduit(dst);
+
+ dsa_tree_teardown_ports(dst);
+
+ dsa_tree_teardown_switches(dst);
+
+ dsa_tree_teardown_cpu_ports(dst);
+
+ dsa_tree_teardown_routing_table(dst);
+
+ pr_info("DSA: tree %d torn down\n", dst->index);
+
+ dst->setup = false;
+}
+
+static int dsa_tree_bind_tag_proto(struct dsa_switch_tree *dst,
+ const struct dsa_device_ops *tag_ops)
+{
+ const struct dsa_device_ops *old_tag_ops = dst->tag_ops;
+ struct dsa_notifier_tag_proto_info info;
+ int err;
+
+ dst->tag_ops = tag_ops;
+
+ /* Notify the switches from this tree about the connection
+ * to the new tagger
+ */
+ info.tag_ops = tag_ops;
+ err = dsa_tree_notify(dst, DSA_NOTIFIER_TAG_PROTO_CONNECT, &info);
+ if (err && err != -EOPNOTSUPP)
+ goto out_disconnect;
+
+ /* Notify the old tagger about the disconnection from this tree */
+ info.tag_ops = old_tag_ops;
+ dsa_tree_notify(dst, DSA_NOTIFIER_TAG_PROTO_DISCONNECT, &info);
+
+ return 0;
+
+out_disconnect:
+ info.tag_ops = tag_ops;
+ dsa_tree_notify(dst, DSA_NOTIFIER_TAG_PROTO_DISCONNECT, &info);
+ dst->tag_ops = old_tag_ops;
+
+ return err;
+}
+
+/* Since the dsa/tagging sysfs device attribute is per conduit, the assumption
+ * is that all DSA switches within a tree share the same tagger, otherwise
+ * they would have formed disjoint trees (different "dsa,member" values).
+ */
+int dsa_tree_change_tag_proto(struct dsa_switch_tree *dst,
+ const struct dsa_device_ops *tag_ops,
+ const struct dsa_device_ops *old_tag_ops)
+{
+ struct dsa_notifier_tag_proto_info info;
+ struct dsa_port *dp;
+ int err = -EBUSY;
+
+ if (!rtnl_trylock())
+ return restart_syscall();
+
+ /* At the moment we don't allow changing the tag protocol under
+ * traffic. The rtnl_mutex also happens to serialize concurrent
+ * attempts to change the tagging protocol. If we ever lift the IFF_UP
+ * restriction, there needs to be another mutex which serializes this.
+ */
+ dsa_tree_for_each_user_port(dp, dst) {
+ if (dsa_port_to_conduit(dp)->flags & IFF_UP)
+ goto out_unlock;
+
+ if (dp->user->flags & IFF_UP)
+ goto out_unlock;
+ }
+
+ /* Notify the tag protocol change */
+ info.tag_ops = tag_ops;
+ err = dsa_tree_notify(dst, DSA_NOTIFIER_TAG_PROTO, &info);
+ if (err)
+ goto out_unwind_tagger;
+
+ err = dsa_tree_bind_tag_proto(dst, tag_ops);
+ if (err)
+ goto out_unwind_tagger;
+
+ rtnl_unlock();
+
+ return 0;
+
+out_unwind_tagger:
+ info.tag_ops = old_tag_ops;
+ dsa_tree_notify(dst, DSA_NOTIFIER_TAG_PROTO, &info);
+out_unlock:
+ rtnl_unlock();
+ return err;
+}
+
+static void dsa_tree_conduit_state_change(struct dsa_switch_tree *dst,
+ struct net_device *conduit)
+{
+ struct dsa_notifier_conduit_state_info info;
+ struct dsa_port *cpu_dp = conduit->dsa_ptr;
+
+ info.conduit = conduit;
+ info.operational = dsa_port_conduit_is_operational(cpu_dp);
+
+ dsa_tree_notify(dst, DSA_NOTIFIER_CONDUIT_STATE_CHANGE, &info);
+}
+
+void dsa_tree_conduit_admin_state_change(struct dsa_switch_tree *dst,
+ struct net_device *conduit,
+ bool up)
+{
+ struct dsa_port *cpu_dp = conduit->dsa_ptr;
+ bool notify = false;
+
+ /* Don't keep track of admin state on LAG DSA conduits,
+ * but rather just of physical DSA conduits
+ */
+ if (netif_is_lag_master(conduit))
+ return;
+
+ if ((dsa_port_conduit_is_operational(cpu_dp)) !=
+ (up && cpu_dp->conduit_oper_up))
+ notify = true;
+
+ cpu_dp->conduit_admin_up = up;
+
+ if (notify)
+ dsa_tree_conduit_state_change(dst, conduit);
+}
+
+void dsa_tree_conduit_oper_state_change(struct dsa_switch_tree *dst,
+ struct net_device *conduit,
+ bool up)
+{
+ struct dsa_port *cpu_dp = conduit->dsa_ptr;
+ bool notify = false;
+
+ /* Don't keep track of oper state on LAG DSA conduits,
+ * but rather just of physical DSA conduits
+ */
+ if (netif_is_lag_master(conduit))
+ return;
+
+ if ((dsa_port_conduit_is_operational(cpu_dp)) !=
+ (cpu_dp->conduit_admin_up && up))
+ notify = true;
+
+ cpu_dp->conduit_oper_up = up;
+
+ if (notify)
+ dsa_tree_conduit_state_change(dst, conduit);
+}
+
+static struct dsa_port *dsa_port_touch(struct dsa_switch *ds, int index)
+{
+ struct dsa_switch_tree *dst = ds->dst;
+ struct dsa_port *dp;
+
+ dsa_switch_for_each_port(dp, ds)
+ if (dp->index == index)
+ return dp;
+
+ dp = kzalloc(sizeof(*dp), GFP_KERNEL);
+ if (!dp)
+ return NULL;
+
+ dp->ds = ds;
+ dp->index = index;
+
+ mutex_init(&dp->addr_lists_lock);
+ mutex_init(&dp->vlans_lock);
+ INIT_LIST_HEAD(&dp->fdbs);
+ INIT_LIST_HEAD(&dp->mdbs);
+ INIT_LIST_HEAD(&dp->vlans); /* also initializes &dp->user_vlans */
+ INIT_LIST_HEAD(&dp->list);
+ list_add_tail(&dp->list, &dst->ports);
+
+ return dp;
+}
+
+static int dsa_port_parse_user(struct dsa_port *dp, const char *name)
+{
+ dp->type = DSA_PORT_TYPE_USER;
+ dp->name = name;
+
+ return 0;
+}
+
+static int dsa_port_parse_dsa(struct dsa_port *dp)
+{
+ dp->type = DSA_PORT_TYPE_DSA;
+
+ return 0;
+}
+
+static enum dsa_tag_protocol dsa_get_tag_protocol(struct dsa_port *dp,
+ struct net_device *conduit)
+{
+ enum dsa_tag_protocol tag_protocol = DSA_TAG_PROTO_NONE;
+ struct dsa_switch *mds, *ds = dp->ds;
+ unsigned int mdp_upstream;
+ struct dsa_port *mdp;
+
+ /* It is possible to stack DSA switches onto one another when that
+ * happens the switch driver may want to know if its tagging protocol
+ * is going to work in such a configuration.
+ */
+ if (dsa_user_dev_check(conduit)) {
+ mdp = dsa_user_to_port(conduit);
+ mds = mdp->ds;
+ mdp_upstream = dsa_upstream_port(mds, mdp->index);
+ tag_protocol = mds->ops->get_tag_protocol(mds, mdp_upstream,
+ DSA_TAG_PROTO_NONE);
+ }
+
+ /* If the conduit device is not itself a DSA user in a disjoint DSA
+ * tree, then return immediately.
+ */
+ return ds->ops->get_tag_protocol(ds, dp->index, tag_protocol);
+}
+
+static int dsa_port_parse_cpu(struct dsa_port *dp, struct net_device *conduit,
+ const char *user_protocol)
+{
+ const struct dsa_device_ops *tag_ops = NULL;
+ struct dsa_switch *ds = dp->ds;
+ struct dsa_switch_tree *dst = ds->dst;
+ enum dsa_tag_protocol default_proto;
+
+ /* Find out which protocol the switch would prefer. */
+ default_proto = dsa_get_tag_protocol(dp, conduit);
+ if (dst->default_proto) {
+ if (dst->default_proto != default_proto) {
+ dev_err(ds->dev,
+ "A DSA switch tree can have only one tagging protocol\n");
+ return -EINVAL;
+ }
+ } else {
+ dst->default_proto = default_proto;
+ }
+
+ /* See if the user wants to override that preference. */
+ if (user_protocol) {
+ if (!ds->ops->change_tag_protocol) {
+ dev_err(ds->dev, "Tag protocol cannot be modified\n");
+ return -EINVAL;
+ }
+
+ tag_ops = dsa_tag_driver_get_by_name(user_protocol);
+ if (IS_ERR(tag_ops)) {
+ dev_warn(ds->dev,
+ "Failed to find a tagging driver for protocol %s, using default\n",
+ user_protocol);
+ tag_ops = NULL;
+ }
+ }
+
+ if (!tag_ops)
+ tag_ops = dsa_tag_driver_get_by_id(default_proto);
+
+ if (IS_ERR(tag_ops)) {
+ if (PTR_ERR(tag_ops) == -ENOPROTOOPT)
+ return -EPROBE_DEFER;
+
+ dev_warn(ds->dev, "No tagger for this switch\n");
+ return PTR_ERR(tag_ops);
+ }
+
+ if (dst->tag_ops) {
+ if (dst->tag_ops != tag_ops) {
+ dev_err(ds->dev,
+ "A DSA switch tree can have only one tagging protocol\n");
+
+ dsa_tag_driver_put(tag_ops);
+ return -EINVAL;
+ }
+
+ /* In the case of multiple CPU ports per switch, the tagging
+ * protocol is still reference-counted only per switch tree.
+ */
+ dsa_tag_driver_put(tag_ops);
+ } else {
+ dst->tag_ops = tag_ops;
+ }
+
+ dp->conduit = conduit;
+ dp->type = DSA_PORT_TYPE_CPU;
+ dsa_port_set_tag_protocol(dp, dst->tag_ops);
+ dp->dst = dst;
+
+ /* At this point, the tree may be configured to use a different
+ * tagger than the one chosen by the switch driver during
+ * .setup, in the case when a user selects a custom protocol
+ * through the DT.
+ *
+ * This is resolved by syncing the driver with the tree in
+ * dsa_switch_setup_tag_protocol once .setup has run and the
+ * driver is ready to accept calls to .change_tag_protocol. If
+ * the driver does not support the custom protocol at that
+ * point, the tree is wholly rejected, thereby ensuring that the
+ * tree and driver are always in agreement on the protocol to
+ * use.
+ */
+ return 0;
+}
+
+static int dsa_port_parse_of(struct dsa_port *dp, struct device_node *dn)
+{
+ struct device_node *ethernet = of_parse_phandle(dn, "ethernet", 0);
+ const char *name = of_get_property(dn, "label", NULL);
+ bool link = of_property_read_bool(dn, "link");
+
+ dp->dn = dn;
+
+ if (ethernet) {
+ struct net_device *conduit;
+ const char *user_protocol;
+
+ conduit = of_find_net_device_by_node(ethernet);
+ of_node_put(ethernet);
+ if (!conduit)
+ return -EPROBE_DEFER;
+
+ user_protocol = of_get_property(dn, "dsa-tag-protocol", NULL);
+ return dsa_port_parse_cpu(dp, conduit, user_protocol);
+ }
+
+ if (link)
+ return dsa_port_parse_dsa(dp);
+
+ return dsa_port_parse_user(dp, name);
+}
+
+static int dsa_switch_parse_ports_of(struct dsa_switch *ds,
+ struct device_node *dn)
+{
+ struct device_node *ports, *port;
+ struct dsa_port *dp;
+ int err = 0;
+ u32 reg;
+
+ ports = of_get_child_by_name(dn, "ports");
+ if (!ports) {
+ /* The second possibility is "ethernet-ports" */
+ ports = of_get_child_by_name(dn, "ethernet-ports");
+ if (!ports) {
+ dev_err(ds->dev, "no ports child node found\n");
+ return -EINVAL;
+ }
+ }
+
+ for_each_available_child_of_node(ports, port) {
+ err = of_property_read_u32(port, "reg", &reg);
+ if (err) {
+ of_node_put(port);
+ goto out_put_node;
+ }
+
+ if (reg >= ds->num_ports) {
+ dev_err(ds->dev, "port %pOF index %u exceeds num_ports (%u)\n",
+ port, reg, ds->num_ports);
+ of_node_put(port);
+ err = -EINVAL;
+ goto out_put_node;
+ }
+
+ dp = dsa_to_port(ds, reg);
+
+ err = dsa_port_parse_of(dp, port);
+ if (err) {
+ of_node_put(port);
+ goto out_put_node;
+ }
+ }
+
+out_put_node:
+ of_node_put(ports);
+ return err;
+}
+
+static int dsa_switch_parse_member_of(struct dsa_switch *ds,
+ struct device_node *dn)
+{
+ u32 m[2] = { 0, 0 };
+ int sz;
+
+ /* Don't error out if this optional property isn't found */
+ sz = of_property_read_variable_u32_array(dn, "dsa,member", m, 2, 2);
+ if (sz < 0 && sz != -EINVAL)
+ return sz;
+
+ ds->index = m[1];
+
+ ds->dst = dsa_tree_touch(m[0]);
+ if (!ds->dst)
+ return -ENOMEM;
+
+ if (dsa_switch_find(ds->dst->index, ds->index)) {
+ dev_err(ds->dev,
+ "A DSA switch with index %d already exists in tree %d\n",
+ ds->index, ds->dst->index);
+ return -EEXIST;
+ }
+
+ if (ds->dst->last_switch < ds->index)
+ ds->dst->last_switch = ds->index;
+
+ return 0;
+}
+
+static int dsa_switch_touch_ports(struct dsa_switch *ds)
+{
+ struct dsa_port *dp;
+ int port;
+
+ for (port = 0; port < ds->num_ports; port++) {
+ dp = dsa_port_touch(ds, port);
+ if (!dp)
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static int dsa_switch_parse_of(struct dsa_switch *ds, struct device_node *dn)
+{
+ int err;
+
+ err = dsa_switch_parse_member_of(ds, dn);
+ if (err)
+ return err;
+
+ err = dsa_switch_touch_ports(ds);
+ if (err)
+ return err;
+
+ return dsa_switch_parse_ports_of(ds, dn);
+}
+
+static int dev_is_class(struct device *dev, const void *class)
+{
+ if (dev->class != NULL && !strcmp(dev->class->name, class))
+ return 1;
+
+ return 0;
+}
+
+static struct device *dev_find_class(struct device *parent, char *class)
+{
+ if (dev_is_class(parent, class)) {
+ get_device(parent);
+ return parent;
+ }
+
+ return device_find_child(parent, class, dev_is_class);
+}
+
+static struct net_device *dsa_dev_to_net_device(struct device *dev)
+{
+ struct device *d;
+
+ d = dev_find_class(dev, "net");
+ if (d != NULL) {
+ struct net_device *nd;
+
+ nd = to_net_dev(d);
+ dev_hold(nd);
+ put_device(d);
+
+ return nd;
+ }
+
+ return NULL;
+}
+
+static int dsa_port_parse(struct dsa_port *dp, const char *name,
+ struct device *dev)
+{
+ if (!strcmp(name, "cpu")) {
+ struct net_device *conduit;
+
+ conduit = dsa_dev_to_net_device(dev);
+ if (!conduit)
+ return -EPROBE_DEFER;
+
+ dev_put(conduit);
+
+ return dsa_port_parse_cpu(dp, conduit, NULL);
+ }
+
+ if (!strcmp(name, "dsa"))
+ return dsa_port_parse_dsa(dp);
+
+ return dsa_port_parse_user(dp, name);
+}
+
+static int dsa_switch_parse_ports(struct dsa_switch *ds,
+ struct dsa_chip_data *cd)
+{
+ bool valid_name_found = false;
+ struct dsa_port *dp;
+ struct device *dev;
+ const char *name;
+ unsigned int i;
+ int err;
+
+ for (i = 0; i < DSA_MAX_PORTS; i++) {
+ name = cd->port_names[i];
+ dev = cd->netdev[i];
+ dp = dsa_to_port(ds, i);
+
+ if (!name)
+ continue;
+
+ err = dsa_port_parse(dp, name, dev);
+ if (err)
+ return err;
+
+ valid_name_found = true;
+ }
+
+ if (!valid_name_found && i == DSA_MAX_PORTS)
+ return -EINVAL;
+
+ return 0;
+}
+
+static int dsa_switch_parse(struct dsa_switch *ds, struct dsa_chip_data *cd)
+{
+ int err;
+
+ ds->cd = cd;
+
+ /* We don't support interconnected switches nor multiple trees via
+ * platform data, so this is the unique switch of the tree.
+ */
+ ds->index = 0;
+ ds->dst = dsa_tree_touch(0);
+ if (!ds->dst)
+ return -ENOMEM;
+
+ err = dsa_switch_touch_ports(ds);
+ if (err)
+ return err;
+
+ return dsa_switch_parse_ports(ds, cd);
+}
+
+static void dsa_switch_release_ports(struct dsa_switch *ds)
+{
+ struct dsa_mac_addr *a, *tmp;
+ struct dsa_port *dp, *next;
+ struct dsa_vlan *v, *n;
+
+ dsa_switch_for_each_port_safe(dp, next, ds) {
+ /* These are either entries that upper layers lost track of
+ * (probably due to bugs), or installed through interfaces
+ * where one does not necessarily have to remove them, like
+ * ndo_dflt_fdb_add().
+ */
+ list_for_each_entry_safe(a, tmp, &dp->fdbs, list) {
+ dev_info(ds->dev,
+ "Cleaning up unicast address %pM vid %u from port %d\n",
+ a->addr, a->vid, dp->index);
+ list_del(&a->list);
+ kfree(a);
+ }
+
+ list_for_each_entry_safe(a, tmp, &dp->mdbs, list) {
+ dev_info(ds->dev,
+ "Cleaning up multicast address %pM vid %u from port %d\n",
+ a->addr, a->vid, dp->index);
+ list_del(&a->list);
+ kfree(a);
+ }
+
+ /* These are entries that upper layers have lost track of,
+ * probably due to bugs, but also due to dsa_port_do_vlan_del()
+ * having failed and the VLAN entry still lingering on.
+ */
+ list_for_each_entry_safe(v, n, &dp->vlans, list) {
+ dev_info(ds->dev,
+ "Cleaning up vid %u from port %d\n",
+ v->vid, dp->index);
+ list_del(&v->list);
+ kfree(v);
+ }
+
+ list_del(&dp->list);
+ kfree(dp);
+ }
+}
+
+static int dsa_switch_probe(struct dsa_switch *ds)
+{
+ struct dsa_switch_tree *dst;
+ struct dsa_chip_data *pdata;
+ struct device_node *np;
+ int err;
+
+ if (!ds->dev)
+ return -ENODEV;
+
+ pdata = ds->dev->platform_data;
+ np = ds->dev->of_node;
+
+ if (!ds->num_ports)
+ return -EINVAL;
+
+ if (np) {
+ err = dsa_switch_parse_of(ds, np);
+ if (err)
+ dsa_switch_release_ports(ds);
+ } else if (pdata) {
+ err = dsa_switch_parse(ds, pdata);
+ if (err)
+ dsa_switch_release_ports(ds);
+ } else {
+ err = -ENODEV;
+ }
+
+ if (err)
+ return err;
+
+ dst = ds->dst;
+ dsa_tree_get(dst);
+ err = dsa_tree_setup(dst);
+ if (err) {
+ dsa_switch_release_ports(ds);
+ dsa_tree_put(dst);
+ }
+
+ return err;
+}
+
+int dsa_register_switch(struct dsa_switch *ds)
+{
+ int err;
+
+ mutex_lock(&dsa2_mutex);
+ err = dsa_switch_probe(ds);
+ dsa_tree_put(ds->dst);
+ mutex_unlock(&dsa2_mutex);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(dsa_register_switch);
+
+static void dsa_switch_remove(struct dsa_switch *ds)
+{
+ struct dsa_switch_tree *dst = ds->dst;
+
+ dsa_tree_teardown(dst);
+ dsa_switch_release_ports(ds);
+ dsa_tree_put(dst);
+}
+
+void dsa_unregister_switch(struct dsa_switch *ds)
+{
+ mutex_lock(&dsa2_mutex);
+ dsa_switch_remove(ds);
+ mutex_unlock(&dsa2_mutex);
+}
+EXPORT_SYMBOL_GPL(dsa_unregister_switch);
+
+/* If the DSA conduit chooses to unregister its net_device on .shutdown, DSA is
+ * blocking that operation from completion, due to the dev_hold taken inside
+ * netdev_upper_dev_link. Unlink the DSA user interfaces from being uppers of
+ * the DSA conduit, so that the system can reboot successfully.
+ */
+void dsa_switch_shutdown(struct dsa_switch *ds)
+{
+ struct net_device *conduit, *user_dev;
+ LIST_HEAD(close_list);
+ struct dsa_port *dp;
+
+ mutex_lock(&dsa2_mutex);
+
+ if (!ds->setup)
+ goto out;
+
+ rtnl_lock();
+
+ dsa_switch_for_each_cpu_port(dp, ds)
+ list_add(&dp->conduit->close_list, &close_list);
+
+ netif_close_many(&close_list, true);
+
+ dsa_switch_for_each_user_port(dp, ds) {
+ conduit = dsa_port_to_conduit(dp);
+ user_dev = dp->user;
+
+ netif_device_detach(user_dev);
+ netdev_upper_dev_unlink(conduit, user_dev);
+ }
+
+ /* Disconnect from further netdevice notifiers on the conduit,
+ * since netdev_uses_dsa() will now return false.
+ */
+ dsa_switch_for_each_cpu_port(dp, ds)
+ dp->conduit->dsa_ptr = NULL;
+
+ rtnl_unlock();
+out:
+ mutex_unlock(&dsa2_mutex);
+}
+EXPORT_SYMBOL_GPL(dsa_switch_shutdown);
+
+#ifdef CONFIG_PM_SLEEP
+static bool dsa_port_is_initialized(const struct dsa_port *dp)
+{
+ return dp->type == DSA_PORT_TYPE_USER && dp->user;
+}
+
+int dsa_switch_suspend(struct dsa_switch *ds)
+{
+ struct dsa_port *dp;
+ int ret = 0;
+
+ /* Suspend user network devices */
+ dsa_switch_for_each_port(dp, ds) {
+ if (!dsa_port_is_initialized(dp))
+ continue;
+
+ ret = dsa_user_suspend(dp->user);
+ if (ret)
+ return ret;
+ }
+
+ if (ds->ops->suspend)
+ ret = ds->ops->suspend(ds);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(dsa_switch_suspend);
+
+int dsa_switch_resume(struct dsa_switch *ds)
+{
+ struct dsa_port *dp;
+ int ret = 0;
+
+ if (ds->ops->resume)
+ ret = ds->ops->resume(ds);
+
+ if (ret)
+ return ret;
+
+ /* Resume user network devices */
+ dsa_switch_for_each_port(dp, ds) {
+ if (!dsa_port_is_initialized(dp))
+ continue;
+
+ ret = dsa_user_resume(dp->user);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(dsa_switch_resume);
+#endif
+
+struct dsa_port *dsa_port_from_netdev(struct net_device *netdev)
+{
+ if (!netdev || !dsa_user_dev_check(netdev))
+ return ERR_PTR(-ENODEV);
+
+ return dsa_user_to_port(netdev);
+}
+EXPORT_SYMBOL_GPL(dsa_port_from_netdev);
+
+bool dsa_db_equal(const struct dsa_db *a, const struct dsa_db *b)
+{
+ if (a->type != b->type)
+ return false;
+
+ switch (a->type) {
+ case DSA_DB_PORT:
+ return a->dp == b->dp;
+ case DSA_DB_LAG:
+ return a->lag.dev == b->lag.dev;
+ case DSA_DB_BRIDGE:
+ return a->bridge.num == b->bridge.num;
+ default:
+ WARN_ON(1);
+ return false;
+ }
+}
+
+bool dsa_fdb_present_in_other_db(struct dsa_switch *ds, int port,
+ const unsigned char *addr, u16 vid,
+ struct dsa_db db)
+{
+ struct dsa_port *dp = dsa_to_port(ds, port);
+ struct dsa_mac_addr *a;
+
+ lockdep_assert_held(&dp->addr_lists_lock);
+
+ list_for_each_entry(a, &dp->fdbs, list) {
+ if (!ether_addr_equal(a->addr, addr) || a->vid != vid)
+ continue;
+
+ if (a->db.type == db.type && !dsa_db_equal(&a->db, &db))
+ return true;
+ }
+
+ return false;
+}
+EXPORT_SYMBOL_GPL(dsa_fdb_present_in_other_db);
+
+bool dsa_mdb_present_in_other_db(struct dsa_switch *ds, int port,
+ const struct switchdev_obj_port_mdb *mdb,
+ struct dsa_db db)
+{
+ struct dsa_port *dp = dsa_to_port(ds, port);
+ struct dsa_mac_addr *a;
+
+ lockdep_assert_held(&dp->addr_lists_lock);
+
+ list_for_each_entry(a, &dp->mdbs, list) {
+ if (!ether_addr_equal(a->addr, mdb->addr) || a->vid != mdb->vid)
+ continue;
+
+ if (a->db.type == db.type && !dsa_db_equal(&a->db, &db))
+ return true;
+ }
+
+ return false;
+}
+EXPORT_SYMBOL_GPL(dsa_mdb_present_in_other_db);
+
+/* Helpers for switches without specific HSR offloads, but which can implement
+ * NETIF_F_HW_HSR_DUP because their tagger uses dsa_xmit_port_mask()
+ */
+int dsa_port_simple_hsr_validate(struct dsa_switch *ds, int port,
+ struct net_device *hsr,
+ struct netlink_ext_ack *extack)
+{
+ enum hsr_port_type type;
+ int err;
+
+ err = hsr_get_port_type(hsr, dsa_to_port(ds, port)->user, &type);
+ if (err)
+ return err;
+
+ if (type != HSR_PT_SLAVE_A && type != HSR_PT_SLAVE_B) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Only HSR slave ports can be offloaded");
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(dsa_port_simple_hsr_validate);
+
+int dsa_port_simple_hsr_join(struct dsa_switch *ds, int port,
+ struct net_device *hsr,
+ struct netlink_ext_ack *extack)
+{
+ struct dsa_port *dp = dsa_to_port(ds, port), *other_dp;
+ int err;
+
+ err = dsa_port_simple_hsr_validate(ds, port, hsr, extack);
+ if (err)
+ return err;
+
+ dsa_hsr_foreach_port(other_dp, ds, hsr) {
+ if (other_dp != dp) {
+ dp->user->features |= NETIF_F_HW_HSR_DUP;
+ other_dp->user->features |= NETIF_F_HW_HSR_DUP;
+ break;
+ }
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(dsa_port_simple_hsr_join);
+
+int dsa_port_simple_hsr_leave(struct dsa_switch *ds, int port,
+ struct net_device *hsr)
+{
+ struct dsa_port *dp = dsa_to_port(ds, port), *other_dp;
+
+ dsa_hsr_foreach_port(other_dp, ds, hsr) {
+ if (other_dp != dp) {
+ dp->user->features &= ~NETIF_F_HW_HSR_DUP;
+ other_dp->user->features &= ~NETIF_F_HW_HSR_DUP;
+ break;
+ }
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(dsa_port_simple_hsr_leave);
+
+static const struct dsa_stubs __dsa_stubs = {
+ .conduit_hwtstamp_validate = __dsa_conduit_hwtstamp_validate,
+};
+
+static void dsa_register_stubs(void)
+{
+ dsa_stubs = &__dsa_stubs;
+}
+
+static void dsa_unregister_stubs(void)
+{
+ dsa_stubs = NULL;
+}
+
+static int __init dsa_init_module(void)
+{
+ int rc;
+
+ dsa_owq = alloc_ordered_workqueue("dsa_ordered",
+ WQ_MEM_RECLAIM);
+ if (!dsa_owq)
+ return -ENOMEM;
+
+ rc = dsa_user_register_notifier();
+ if (rc)
+ goto register_notifier_fail;
+
+ dev_add_pack(&dsa_pack_type);
+
+ rc = rtnl_link_register(&dsa_link_ops);
+ if (rc)
+ goto netlink_register_fail;
+
+ dsa_register_stubs();
+
+ return 0;
+
+netlink_register_fail:
+ dsa_user_unregister_notifier();
+ dev_remove_pack(&dsa_pack_type);
+register_notifier_fail:
+ destroy_workqueue(dsa_owq);
+
+ return rc;
+}
+module_init(dsa_init_module);
+
+static void __exit dsa_cleanup_module(void)
+{
+ dsa_unregister_stubs();
+
+ rtnl_link_unregister(&dsa_link_ops);
+
+ dsa_user_unregister_notifier();
+ dev_remove_pack(&dsa_pack_type);
+ destroy_workqueue(dsa_owq);
+}
+module_exit(dsa_cleanup_module);
+
+MODULE_AUTHOR("Lennert Buytenhek <buytenh@wantstofly.org>");
+MODULE_DESCRIPTION("Driver for Distributed Switch Architecture switch chips");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:dsa");
+MODULE_IMPORT_NS("NETDEV_INTERNAL");
diff --git a/net/dsa/dsa.h b/net/dsa/dsa.h
new file mode 100644
index 000000000000..3cc7823e9ef3
--- /dev/null
+++ b/net/dsa/dsa.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifndef __DSA_H
+#define __DSA_H
+
+#include <linux/list.h>
+#include <linux/types.h>
+
+struct dsa_db;
+struct dsa_device_ops;
+struct dsa_lag;
+struct dsa_switch_tree;
+struct net_device;
+struct work_struct;
+
+extern struct list_head dsa_tree_list;
+
+bool dsa_db_equal(const struct dsa_db *a, const struct dsa_db *b);
+bool dsa_schedule_work(struct work_struct *work);
+void dsa_lag_map(struct dsa_switch_tree *dst, struct dsa_lag *lag);
+void dsa_lag_unmap(struct dsa_switch_tree *dst, struct dsa_lag *lag);
+struct dsa_lag *dsa_tree_lag_find(struct dsa_switch_tree *dst,
+ const struct net_device *lag_dev);
+struct net_device *dsa_tree_find_first_conduit(struct dsa_switch_tree *dst);
+int dsa_tree_change_tag_proto(struct dsa_switch_tree *dst,
+ const struct dsa_device_ops *tag_ops,
+ const struct dsa_device_ops *old_tag_ops);
+void dsa_tree_conduit_admin_state_change(struct dsa_switch_tree *dst,
+ struct net_device *conduit,
+ bool up);
+void dsa_tree_conduit_oper_state_change(struct dsa_switch_tree *dst,
+ struct net_device *conduit,
+ bool up);
+unsigned int dsa_bridge_num_get(const struct net_device *bridge_dev, int max);
+void dsa_bridge_num_put(const struct net_device *bridge_dev,
+ unsigned int bridge_num);
+struct dsa_bridge *dsa_tree_bridge_find(struct dsa_switch_tree *dst,
+ const struct net_device *br);
+
+#endif
diff --git a/net/dsa/netlink.c b/net/dsa/netlink.c
new file mode 100644
index 000000000000..1332e56349e5
--- /dev/null
+++ b/net/dsa/netlink.c
@@ -0,0 +1,64 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright 2022 NXP
+ */
+#include <linux/netdevice.h>
+#include <net/rtnetlink.h>
+
+#include "netlink.h"
+#include "user.h"
+
+static const struct nla_policy dsa_policy[IFLA_DSA_MAX + 1] = {
+ [IFLA_DSA_CONDUIT] = { .type = NLA_U32 },
+};
+
+static int dsa_changelink(struct net_device *dev, struct nlattr *tb[],
+ struct nlattr *data[],
+ struct netlink_ext_ack *extack)
+{
+ int err;
+
+ if (!data)
+ return 0;
+
+ if (data[IFLA_DSA_CONDUIT]) {
+ u32 ifindex = nla_get_u32(data[IFLA_DSA_CONDUIT]);
+ struct net_device *conduit;
+
+ conduit = __dev_get_by_index(dev_net(dev), ifindex);
+ if (!conduit)
+ return -EINVAL;
+
+ err = dsa_user_change_conduit(dev, conduit, extack);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+static size_t dsa_get_size(const struct net_device *dev)
+{
+ return nla_total_size(sizeof(u32)) + /* IFLA_DSA_CONDUIT */
+ 0;
+}
+
+static int dsa_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{
+ struct net_device *conduit = dsa_user_to_conduit(dev);
+
+ if (nla_put_u32(skb, IFLA_DSA_CONDUIT, conduit->ifindex))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+struct rtnl_link_ops dsa_link_ops __read_mostly = {
+ .kind = "dsa",
+ .priv_size = sizeof(struct dsa_port),
+ .maxtype = IFLA_DSA_MAX,
+ .policy = dsa_policy,
+ .changelink = dsa_changelink,
+ .get_size = dsa_get_size,
+ .fill_info = dsa_fill_info,
+ .netns_refund = true,
+};
diff --git a/net/dsa/netlink.h b/net/dsa/netlink.h
new file mode 100644
index 000000000000..7eda2fa15722
--- /dev/null
+++ b/net/dsa/netlink.h
@@ -0,0 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifndef __DSA_NETLINK_H
+#define __DSA_NETLINK_H
+
+extern struct rtnl_link_ops dsa_link_ops __read_mostly;
+
+#endif
diff --git a/net/dsa/port.c b/net/dsa/port.c
new file mode 100644
index 000000000000..ca3a7f52229b
--- /dev/null
+++ b/net/dsa/port.c
@@ -0,0 +1,1955 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Handling of a single switch port
+ *
+ * Copyright (c) 2017 Savoir-faire Linux Inc.
+ * Vivien Didelot <vivien.didelot@savoirfairelinux.com>
+ */
+
+#include <linux/if_bridge.h>
+#include <linux/netdevice.h>
+#include <linux/notifier.h>
+#include <linux/of_mdio.h>
+#include <linux/of_net.h>
+
+#include "dsa.h"
+#include "port.h"
+#include "switch.h"
+#include "tag_8021q.h"
+#include "user.h"
+
+/**
+ * dsa_port_notify - Notify the switching fabric of changes to a port
+ * @dp: port on which change occurred
+ * @e: event, must be of type DSA_NOTIFIER_*
+ * @v: event-specific value.
+ *
+ * Notify all switches in the DSA tree that this port's switch belongs to,
+ * including this switch itself, of an event. Allows the other switches to
+ * reconfigure themselves for cross-chip operations. Can also be used to
+ * reconfigure ports without net_devices (CPU ports, DSA links) whenever
+ * a user port's state changes.
+ */
+static int dsa_port_notify(const struct dsa_port *dp, unsigned long e, void *v)
+{
+ return dsa_tree_notify(dp->ds->dst, e, v);
+}
+
+static void dsa_port_notify_bridge_fdb_flush(const struct dsa_port *dp, u16 vid)
+{
+ struct net_device *brport_dev = dsa_port_to_bridge_port(dp);
+ struct switchdev_notifier_fdb_info info = {
+ .vid = vid,
+ };
+
+ /* When the port becomes standalone it has already left the bridge.
+ * Don't notify the bridge in that case.
+ */
+ if (!brport_dev)
+ return;
+
+ call_switchdev_notifiers(SWITCHDEV_FDB_FLUSH_TO_BRIDGE,
+ brport_dev, &info.info, NULL);
+}
+
+static void dsa_port_fast_age(const struct dsa_port *dp)
+{
+ struct dsa_switch *ds = dp->ds;
+
+ if (!ds->ops->port_fast_age)
+ return;
+
+ ds->ops->port_fast_age(ds, dp->index);
+
+ /* flush all VLANs */
+ dsa_port_notify_bridge_fdb_flush(dp, 0);
+}
+
+static int dsa_port_vlan_fast_age(const struct dsa_port *dp, u16 vid)
+{
+ struct dsa_switch *ds = dp->ds;
+ int err;
+
+ if (!ds->ops->port_vlan_fast_age)
+ return -EOPNOTSUPP;
+
+ err = ds->ops->port_vlan_fast_age(ds, dp->index, vid);
+
+ if (!err)
+ dsa_port_notify_bridge_fdb_flush(dp, vid);
+
+ return err;
+}
+
+static int dsa_port_msti_fast_age(const struct dsa_port *dp, u16 msti)
+{
+ DECLARE_BITMAP(vids, VLAN_N_VID) = { 0 };
+ int err, vid;
+
+ err = br_mst_get_info(dsa_port_bridge_dev_get(dp), msti, vids);
+ if (err)
+ return err;
+
+ for_each_set_bit(vid, vids, VLAN_N_VID) {
+ err = dsa_port_vlan_fast_age(dp, vid);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+static bool dsa_port_can_configure_learning(struct dsa_port *dp)
+{
+ struct switchdev_brport_flags flags = {
+ .mask = BR_LEARNING,
+ };
+ struct dsa_switch *ds = dp->ds;
+ int err;
+
+ if (!ds->ops->port_bridge_flags || !ds->ops->port_pre_bridge_flags)
+ return false;
+
+ err = ds->ops->port_pre_bridge_flags(ds, dp->index, flags, NULL);
+ return !err;
+}
+
+bool dsa_port_supports_hwtstamp(struct dsa_port *dp)
+{
+ struct kernel_hwtstamp_config config = {};
+ struct dsa_switch *ds = dp->ds;
+ int err;
+
+ if (!ds->ops->port_hwtstamp_get || !ds->ops->port_hwtstamp_set)
+ return false;
+
+ /* "See through" shim implementations of the "get" method. */
+ err = ds->ops->port_hwtstamp_get(ds, dp->index, &config);
+ return err != -EOPNOTSUPP;
+}
+
+int dsa_port_set_state(struct dsa_port *dp, u8 state, bool do_fast_age)
+{
+ struct dsa_switch *ds = dp->ds;
+ int port = dp->index;
+
+ if (!ds->ops->port_stp_state_set)
+ return -EOPNOTSUPP;
+
+ ds->ops->port_stp_state_set(ds, port, state);
+
+ if (!dsa_port_can_configure_learning(dp) ||
+ (do_fast_age && dp->learning)) {
+ /* Fast age FDB entries or flush appropriate forwarding database
+ * for the given port, if we are moving it from Learning or
+ * Forwarding state, to Disabled or Blocking or Listening state.
+ * Ports that were standalone before the STP state change don't
+ * need to fast age the FDB, since address learning is off in
+ * standalone mode.
+ */
+
+ if ((dp->stp_state == BR_STATE_LEARNING ||
+ dp->stp_state == BR_STATE_FORWARDING) &&
+ (state == BR_STATE_DISABLED ||
+ state == BR_STATE_BLOCKING ||
+ state == BR_STATE_LISTENING))
+ dsa_port_fast_age(dp);
+ }
+
+ dp->stp_state = state;
+
+ return 0;
+}
+
+static void dsa_port_set_state_now(struct dsa_port *dp, u8 state,
+ bool do_fast_age)
+{
+ struct dsa_switch *ds = dp->ds;
+ int err;
+
+ err = dsa_port_set_state(dp, state, do_fast_age);
+ if (err && err != -EOPNOTSUPP) {
+ dev_err(ds->dev, "port %d failed to set STP state %u: %pe\n",
+ dp->index, state, ERR_PTR(err));
+ }
+}
+
+int dsa_port_set_mst_state(struct dsa_port *dp,
+ const struct switchdev_mst_state *state,
+ struct netlink_ext_ack *extack)
+{
+ struct dsa_switch *ds = dp->ds;
+ u8 prev_state;
+ int err;
+
+ if (!ds->ops->port_mst_state_set)
+ return -EOPNOTSUPP;
+
+ err = br_mst_get_state(dsa_port_to_bridge_port(dp), state->msti,
+ &prev_state);
+ if (err)
+ return err;
+
+ err = ds->ops->port_mst_state_set(ds, dp->index, state);
+ if (err)
+ return err;
+
+ if (!(dp->learning &&
+ (prev_state == BR_STATE_LEARNING ||
+ prev_state == BR_STATE_FORWARDING) &&
+ (state->state == BR_STATE_DISABLED ||
+ state->state == BR_STATE_BLOCKING ||
+ state->state == BR_STATE_LISTENING)))
+ return 0;
+
+ err = dsa_port_msti_fast_age(dp, state->msti);
+ if (err)
+ NL_SET_ERR_MSG_MOD(extack,
+ "Unable to flush associated VLANs");
+
+ return 0;
+}
+
+int dsa_port_enable_rt(struct dsa_port *dp, struct phy_device *phy)
+{
+ struct dsa_switch *ds = dp->ds;
+ int port = dp->index;
+ int err;
+
+ if (ds->ops->port_enable) {
+ err = ds->ops->port_enable(ds, port, phy);
+ if (err)
+ return err;
+ }
+
+ if (!dp->bridge)
+ dsa_port_set_state_now(dp, BR_STATE_FORWARDING, false);
+
+ if (dp->pl)
+ phylink_start(dp->pl);
+
+ return 0;
+}
+
+int dsa_port_enable(struct dsa_port *dp, struct phy_device *phy)
+{
+ int err;
+
+ rtnl_lock();
+ err = dsa_port_enable_rt(dp, phy);
+ rtnl_unlock();
+
+ return err;
+}
+
+void dsa_port_disable_rt(struct dsa_port *dp)
+{
+ struct dsa_switch *ds = dp->ds;
+ int port = dp->index;
+
+ if (dp->pl)
+ phylink_stop(dp->pl);
+
+ if (!dp->bridge)
+ dsa_port_set_state_now(dp, BR_STATE_DISABLED, false);
+
+ if (ds->ops->port_disable)
+ ds->ops->port_disable(ds, port);
+}
+
+void dsa_port_disable(struct dsa_port *dp)
+{
+ rtnl_lock();
+ dsa_port_disable_rt(dp);
+ rtnl_unlock();
+}
+
+static void dsa_port_reset_vlan_filtering(struct dsa_port *dp,
+ struct dsa_bridge bridge)
+{
+ struct netlink_ext_ack extack = {0};
+ bool change_vlan_filtering = false;
+ struct dsa_switch *ds = dp->ds;
+ struct dsa_port *other_dp;
+ bool vlan_filtering;
+ int err;
+
+ if (ds->needs_standalone_vlan_filtering &&
+ !br_vlan_enabled(bridge.dev)) {
+ change_vlan_filtering = true;
+ vlan_filtering = true;
+ } else if (!ds->needs_standalone_vlan_filtering &&
+ br_vlan_enabled(bridge.dev)) {
+ change_vlan_filtering = true;
+ vlan_filtering = false;
+ }
+
+ /* If the bridge was vlan_filtering, the bridge core doesn't trigger an
+ * event for changing vlan_filtering setting upon user ports leaving
+ * it. That is a good thing, because that lets us handle it and also
+ * handle the case where the switch's vlan_filtering setting is global
+ * (not per port). When that happens, the correct moment to trigger the
+ * vlan_filtering callback is only when the last port leaves the last
+ * VLAN-aware bridge.
+ */
+ if (change_vlan_filtering && ds->vlan_filtering_is_global) {
+ dsa_switch_for_each_port(other_dp, ds) {
+ struct net_device *br = dsa_port_bridge_dev_get(other_dp);
+
+ if (br && br_vlan_enabled(br)) {
+ change_vlan_filtering = false;
+ break;
+ }
+ }
+ }
+
+ if (!change_vlan_filtering)
+ return;
+
+ err = dsa_port_vlan_filtering(dp, vlan_filtering, &extack);
+ if (extack._msg) {
+ dev_err(ds->dev, "port %d: %s\n", dp->index,
+ extack._msg);
+ }
+ if (err && err != -EOPNOTSUPP) {
+ dev_err(ds->dev,
+ "port %d failed to reset VLAN filtering to %d: %pe\n",
+ dp->index, vlan_filtering, ERR_PTR(err));
+ }
+}
+
+static int dsa_port_inherit_brport_flags(struct dsa_port *dp,
+ struct netlink_ext_ack *extack)
+{
+ const unsigned long mask = BR_LEARNING | BR_FLOOD | BR_MCAST_FLOOD |
+ BR_BCAST_FLOOD | BR_PORT_LOCKED;
+ struct net_device *brport_dev = dsa_port_to_bridge_port(dp);
+ int flag, err;
+
+ for_each_set_bit(flag, &mask, 32) {
+ struct switchdev_brport_flags flags = {0};
+
+ flags.mask = BIT(flag);
+
+ if (br_port_flag_is_set(brport_dev, BIT(flag)))
+ flags.val = BIT(flag);
+
+ err = dsa_port_bridge_flags(dp, flags, extack);
+ if (err && err != -EOPNOTSUPP)
+ return err;
+ }
+
+ return 0;
+}
+
+static void dsa_port_clear_brport_flags(struct dsa_port *dp)
+{
+ const unsigned long val = BR_FLOOD | BR_MCAST_FLOOD | BR_BCAST_FLOOD;
+ const unsigned long mask = BR_LEARNING | BR_FLOOD | BR_MCAST_FLOOD |
+ BR_BCAST_FLOOD | BR_PORT_LOCKED;
+ int flag, err;
+
+ for_each_set_bit(flag, &mask, 32) {
+ struct switchdev_brport_flags flags = {0};
+
+ flags.mask = BIT(flag);
+ flags.val = val & BIT(flag);
+
+ err = dsa_port_bridge_flags(dp, flags, NULL);
+ if (err && err != -EOPNOTSUPP)
+ dev_err(dp->ds->dev,
+ "failed to clear bridge port flag %lu: %pe\n",
+ flags.val, ERR_PTR(err));
+ }
+}
+
+static int dsa_port_switchdev_sync_attrs(struct dsa_port *dp,
+ struct netlink_ext_ack *extack)
+{
+ struct net_device *brport_dev = dsa_port_to_bridge_port(dp);
+ struct net_device *br = dsa_port_bridge_dev_get(dp);
+ int err;
+
+ err = dsa_port_inherit_brport_flags(dp, extack);
+ if (err)
+ return err;
+
+ err = dsa_port_set_state(dp, br_port_get_stp_state(brport_dev), false);
+ if (err && err != -EOPNOTSUPP)
+ return err;
+
+ err = dsa_port_vlan_filtering(dp, br_vlan_enabled(br), extack);
+ if (err && err != -EOPNOTSUPP)
+ return err;
+
+ err = dsa_port_ageing_time(dp, br_get_ageing_time(br));
+ if (err && err != -EOPNOTSUPP)
+ return err;
+
+ return 0;
+}
+
+static void dsa_port_switchdev_unsync_attrs(struct dsa_port *dp,
+ struct dsa_bridge bridge)
+{
+ /* Configure the port for standalone mode (no address learning,
+ * flood everything).
+ * The bridge only emits SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS events
+ * when the user requests it through netlink or sysfs, but not
+ * automatically at port join or leave, so we need to handle resetting
+ * the brport flags ourselves. But we even prefer it that way, because
+ * otherwise, some setups might never get the notification they need,
+ * for example, when a port leaves a LAG that offloads the bridge,
+ * it becomes standalone, but as far as the bridge is concerned, no
+ * port ever left.
+ */
+ dsa_port_clear_brport_flags(dp);
+
+ /* Port left the bridge, put in BR_STATE_DISABLED by the bridge layer,
+ * so allow it to be in BR_STATE_FORWARDING to be kept functional
+ */
+ dsa_port_set_state_now(dp, BR_STATE_FORWARDING, true);
+
+ dsa_port_reset_vlan_filtering(dp, bridge);
+
+ /* Ageing time may be global to the switch chip, so don't change it
+ * here because we have no good reason (or value) to change it to.
+ */
+}
+
+static int dsa_port_bridge_create(struct dsa_port *dp,
+ struct net_device *br,
+ struct netlink_ext_ack *extack)
+{
+ struct dsa_switch *ds = dp->ds;
+ struct dsa_bridge *bridge;
+
+ bridge = dsa_tree_bridge_find(ds->dst, br);
+ if (bridge) {
+ refcount_inc(&bridge->refcount);
+ dp->bridge = bridge;
+ return 0;
+ }
+
+ bridge = kzalloc(sizeof(*bridge), GFP_KERNEL);
+ if (!bridge)
+ return -ENOMEM;
+
+ refcount_set(&bridge->refcount, 1);
+
+ bridge->dev = br;
+
+ bridge->num = dsa_bridge_num_get(br, ds->max_num_bridges);
+ if (ds->max_num_bridges && !bridge->num) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Range of offloadable bridges exceeded");
+ kfree(bridge);
+ return -EOPNOTSUPP;
+ }
+
+ dp->bridge = bridge;
+
+ return 0;
+}
+
+static void dsa_port_bridge_destroy(struct dsa_port *dp,
+ const struct net_device *br)
+{
+ struct dsa_bridge *bridge = dp->bridge;
+
+ dp->bridge = NULL;
+
+ if (!refcount_dec_and_test(&bridge->refcount))
+ return;
+
+ if (bridge->num)
+ dsa_bridge_num_put(br, bridge->num);
+
+ kfree(bridge);
+}
+
+static bool dsa_port_supports_mst(struct dsa_port *dp)
+{
+ struct dsa_switch *ds = dp->ds;
+
+ return ds->ops->vlan_msti_set &&
+ ds->ops->port_mst_state_set &&
+ ds->ops->port_vlan_fast_age &&
+ dsa_port_can_configure_learning(dp);
+}
+
+int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br,
+ struct netlink_ext_ack *extack)
+{
+ struct dsa_notifier_bridge_info info = {
+ .dp = dp,
+ .extack = extack,
+ };
+ struct net_device *dev = dp->user;
+ struct net_device *brport_dev;
+ int err;
+
+ if (br_mst_enabled(br) && !dsa_port_supports_mst(dp))
+ return -EOPNOTSUPP;
+
+ /* Here the interface is already bridged. Reflect the current
+ * configuration so that drivers can program their chips accordingly.
+ */
+ err = dsa_port_bridge_create(dp, br, extack);
+ if (err)
+ return err;
+
+ brport_dev = dsa_port_to_bridge_port(dp);
+
+ info.bridge = *dp->bridge;
+ err = dsa_broadcast(DSA_NOTIFIER_BRIDGE_JOIN, &info);
+ if (err)
+ goto out_rollback;
+
+ /* Drivers which support bridge TX forwarding should set this */
+ dp->bridge->tx_fwd_offload = info.tx_fwd_offload;
+
+ err = switchdev_bridge_port_offload(brport_dev, dev, dp,
+ &dsa_user_switchdev_notifier,
+ &dsa_user_switchdev_blocking_notifier,
+ dp->bridge->tx_fwd_offload, extack);
+ if (err)
+ goto out_rollback_unbridge;
+
+ err = dsa_port_switchdev_sync_attrs(dp, extack);
+ if (err)
+ goto out_rollback_unoffload;
+
+ return 0;
+
+out_rollback_unoffload:
+ switchdev_bridge_port_unoffload(brport_dev, dp,
+ &dsa_user_switchdev_notifier,
+ &dsa_user_switchdev_blocking_notifier);
+ dsa_flush_workqueue();
+out_rollback_unbridge:
+ dsa_broadcast(DSA_NOTIFIER_BRIDGE_LEAVE, &info);
+out_rollback:
+ dsa_port_bridge_destroy(dp, br);
+ return err;
+}
+
+void dsa_port_pre_bridge_leave(struct dsa_port *dp, struct net_device *br)
+{
+ struct net_device *brport_dev = dsa_port_to_bridge_port(dp);
+
+ /* Don't try to unoffload something that is not offloaded */
+ if (!brport_dev)
+ return;
+
+ switchdev_bridge_port_unoffload(brport_dev, dp,
+ &dsa_user_switchdev_notifier,
+ &dsa_user_switchdev_blocking_notifier);
+
+ dsa_flush_workqueue();
+}
+
+void dsa_port_bridge_leave(struct dsa_port *dp, struct net_device *br)
+{
+ struct dsa_notifier_bridge_info info = {
+ .dp = dp,
+ };
+ int err;
+
+ /* If the port could not be offloaded to begin with, then
+ * there is nothing to do.
+ */
+ if (!dp->bridge)
+ return;
+
+ info.bridge = *dp->bridge;
+
+ /* Here the port is already unbridged. Reflect the current configuration
+ * so that drivers can program their chips accordingly.
+ */
+ dsa_port_bridge_destroy(dp, br);
+
+ err = dsa_broadcast(DSA_NOTIFIER_BRIDGE_LEAVE, &info);
+ if (err)
+ dev_err(dp->ds->dev,
+ "port %d failed to notify DSA_NOTIFIER_BRIDGE_LEAVE: %pe\n",
+ dp->index, ERR_PTR(err));
+
+ dsa_port_switchdev_unsync_attrs(dp, info.bridge);
+}
+
+int dsa_port_lag_change(struct dsa_port *dp,
+ struct netdev_lag_lower_state_info *linfo)
+{
+ struct dsa_notifier_lag_info info = {
+ .dp = dp,
+ };
+ bool tx_enabled;
+
+ if (!dp->lag)
+ return 0;
+
+ /* On statically configured aggregates (e.g. loadbalance
+ * without LACP) ports will always be tx_enabled, even if the
+ * link is down. Thus we require both link_up and tx_enabled
+ * in order to include it in the tx set.
+ */
+ tx_enabled = linfo->link_up && linfo->tx_enabled;
+
+ if (tx_enabled == dp->lag_tx_enabled)
+ return 0;
+
+ dp->lag_tx_enabled = tx_enabled;
+
+ return dsa_port_notify(dp, DSA_NOTIFIER_LAG_CHANGE, &info);
+}
+
+static int dsa_port_lag_create(struct dsa_port *dp,
+ struct net_device *lag_dev)
+{
+ struct dsa_switch *ds = dp->ds;
+ struct dsa_lag *lag;
+
+ lag = dsa_tree_lag_find(ds->dst, lag_dev);
+ if (lag) {
+ refcount_inc(&lag->refcount);
+ dp->lag = lag;
+ return 0;
+ }
+
+ lag = kzalloc(sizeof(*lag), GFP_KERNEL);
+ if (!lag)
+ return -ENOMEM;
+
+ refcount_set(&lag->refcount, 1);
+ mutex_init(&lag->fdb_lock);
+ INIT_LIST_HEAD(&lag->fdbs);
+ lag->dev = lag_dev;
+ dsa_lag_map(ds->dst, lag);
+ dp->lag = lag;
+
+ return 0;
+}
+
+static void dsa_port_lag_destroy(struct dsa_port *dp)
+{
+ struct dsa_lag *lag = dp->lag;
+
+ dp->lag = NULL;
+ dp->lag_tx_enabled = false;
+
+ if (!refcount_dec_and_test(&lag->refcount))
+ return;
+
+ WARN_ON(!list_empty(&lag->fdbs));
+ dsa_lag_unmap(dp->ds->dst, lag);
+ kfree(lag);
+}
+
+int dsa_port_lag_join(struct dsa_port *dp, struct net_device *lag_dev,
+ struct netdev_lag_upper_info *uinfo,
+ struct netlink_ext_ack *extack)
+{
+ struct dsa_notifier_lag_info info = {
+ .dp = dp,
+ .info = uinfo,
+ .extack = extack,
+ };
+ struct net_device *bridge_dev;
+ int err;
+
+ err = dsa_port_lag_create(dp, lag_dev);
+ if (err)
+ goto err_lag_create;
+
+ info.lag = *dp->lag;
+ err = dsa_port_notify(dp, DSA_NOTIFIER_LAG_JOIN, &info);
+ if (err)
+ goto err_lag_join;
+
+ bridge_dev = netdev_master_upper_dev_get(lag_dev);
+ if (!bridge_dev || !netif_is_bridge_master(bridge_dev))
+ return 0;
+
+ err = dsa_port_bridge_join(dp, bridge_dev, extack);
+ if (err)
+ goto err_bridge_join;
+
+ return 0;
+
+err_bridge_join:
+ dsa_port_notify(dp, DSA_NOTIFIER_LAG_LEAVE, &info);
+err_lag_join:
+ dsa_port_lag_destroy(dp);
+err_lag_create:
+ return err;
+}
+
+void dsa_port_pre_lag_leave(struct dsa_port *dp, struct net_device *lag_dev)
+{
+ struct net_device *br = dsa_port_bridge_dev_get(dp);
+
+ if (br)
+ dsa_port_pre_bridge_leave(dp, br);
+}
+
+void dsa_port_lag_leave(struct dsa_port *dp, struct net_device *lag_dev)
+{
+ struct net_device *br = dsa_port_bridge_dev_get(dp);
+ struct dsa_notifier_lag_info info = {
+ .dp = dp,
+ };
+ int err;
+
+ if (!dp->lag)
+ return;
+
+ /* Port might have been part of a LAG that in turn was
+ * attached to a bridge.
+ */
+ if (br)
+ dsa_port_bridge_leave(dp, br);
+
+ info.lag = *dp->lag;
+
+ dsa_port_lag_destroy(dp);
+
+ err = dsa_port_notify(dp, DSA_NOTIFIER_LAG_LEAVE, &info);
+ if (err)
+ dev_err(dp->ds->dev,
+ "port %d failed to notify DSA_NOTIFIER_LAG_LEAVE: %pe\n",
+ dp->index, ERR_PTR(err));
+}
+
+/* Must be called under rcu_read_lock() */
+static bool dsa_port_can_apply_vlan_filtering(struct dsa_port *dp,
+ bool vlan_filtering,
+ struct netlink_ext_ack *extack)
+{
+ struct dsa_switch *ds = dp->ds;
+ struct dsa_port *other_dp;
+ int err;
+
+ /* VLAN awareness was off, so the question is "can we turn it on".
+ * We may have had 8021q uppers, those need to go. Make sure we don't
+ * enter an inconsistent state: deny changing the VLAN awareness state
+ * as long as we have 8021q uppers.
+ */
+ if (vlan_filtering && dsa_port_is_user(dp)) {
+ struct net_device *br = dsa_port_bridge_dev_get(dp);
+ struct net_device *upper_dev, *user = dp->user;
+ struct list_head *iter;
+
+ netdev_for_each_upper_dev_rcu(user, upper_dev, iter) {
+ struct bridge_vlan_info br_info;
+ u16 vid;
+
+ if (!is_vlan_dev(upper_dev))
+ continue;
+
+ vid = vlan_dev_vlan_id(upper_dev);
+
+ /* br_vlan_get_info() returns -EINVAL or -ENOENT if the
+ * device, respectively the VID is not found, returning
+ * 0 means success, which is a failure for us here.
+ */
+ err = br_vlan_get_info(br, vid, &br_info);
+ if (err == 0) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Must first remove VLAN uppers having VIDs also present in bridge");
+ return false;
+ }
+ }
+ }
+
+ if (!ds->vlan_filtering_is_global)
+ return true;
+
+ /* For cases where enabling/disabling VLAN awareness is global to the
+ * switch, we need to handle the case where multiple bridges span
+ * different ports of the same switch device and one of them has a
+ * different setting than what is being requested.
+ */
+ dsa_switch_for_each_port(other_dp, ds) {
+ struct net_device *other_br = dsa_port_bridge_dev_get(other_dp);
+
+ /* If it's the same bridge, it also has same
+ * vlan_filtering setting => no need to check
+ */
+ if (!other_br || other_br == dsa_port_bridge_dev_get(dp))
+ continue;
+
+ if (br_vlan_enabled(other_br) != vlan_filtering) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "VLAN filtering is a global setting");
+ return false;
+ }
+ }
+ return true;
+}
+
+int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering,
+ struct netlink_ext_ack *extack)
+{
+ bool old_vlan_filtering = dsa_port_is_vlan_filtering(dp);
+ struct dsa_switch *ds = dp->ds;
+ bool apply;
+ int err;
+
+ if (!ds->ops->port_vlan_filtering)
+ return -EOPNOTSUPP;
+
+ /* We are called from dsa_user_switchdev_blocking_event(),
+ * which is not under rcu_read_lock(), unlike
+ * dsa_user_switchdev_event().
+ */
+ rcu_read_lock();
+ apply = dsa_port_can_apply_vlan_filtering(dp, vlan_filtering, extack);
+ rcu_read_unlock();
+ if (!apply)
+ return -EINVAL;
+
+ if (dsa_port_is_vlan_filtering(dp) == vlan_filtering)
+ return 0;
+
+ err = ds->ops->port_vlan_filtering(ds, dp->index, vlan_filtering,
+ extack);
+ if (err)
+ return err;
+
+ if (ds->vlan_filtering_is_global) {
+ struct dsa_port *other_dp;
+
+ ds->vlan_filtering = vlan_filtering;
+
+ dsa_switch_for_each_user_port(other_dp, ds) {
+ struct net_device *user = other_dp->user;
+
+ /* We might be called in the unbind path, so not
+ * all user devices might still be registered.
+ */
+ if (!user)
+ continue;
+
+ err = dsa_user_manage_vlan_filtering(user,
+ vlan_filtering);
+ if (err)
+ goto restore;
+ }
+ } else {
+ dp->vlan_filtering = vlan_filtering;
+
+ err = dsa_user_manage_vlan_filtering(dp->user,
+ vlan_filtering);
+ if (err)
+ goto restore;
+ }
+
+ return 0;
+
+restore:
+ ds->ops->port_vlan_filtering(ds, dp->index, old_vlan_filtering, NULL);
+
+ if (ds->vlan_filtering_is_global)
+ ds->vlan_filtering = old_vlan_filtering;
+ else
+ dp->vlan_filtering = old_vlan_filtering;
+
+ return err;
+}
+
+/* This enforces legacy behavior for switch drivers which assume they can't
+ * receive VLAN configuration when joining a bridge with vlan_filtering=0
+ */
+bool dsa_port_skip_vlan_configuration(struct dsa_port *dp)
+{
+ struct net_device *br = dsa_port_bridge_dev_get(dp);
+ struct dsa_switch *ds = dp->ds;
+
+ if (!br)
+ return false;
+
+ return !ds->configure_vlan_while_not_filtering && !br_vlan_enabled(br);
+}
+
+int dsa_port_ageing_time(struct dsa_port *dp, clock_t ageing_clock)
+{
+ unsigned long ageing_jiffies = clock_t_to_jiffies(ageing_clock);
+ unsigned int ageing_time = jiffies_to_msecs(ageing_jiffies);
+ struct dsa_notifier_ageing_time_info info;
+ int err;
+
+ info.ageing_time = ageing_time;
+
+ err = dsa_port_notify(dp, DSA_NOTIFIER_AGEING_TIME, &info);
+ if (err)
+ return err;
+
+ dp->ageing_time = ageing_time;
+
+ return 0;
+}
+
+int dsa_port_mst_enable(struct dsa_port *dp, bool on,
+ struct netlink_ext_ack *extack)
+{
+ if (on && !dsa_port_supports_mst(dp)) {
+ NL_SET_ERR_MSG_MOD(extack, "Hardware does not support MST");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+int dsa_port_pre_bridge_flags(const struct dsa_port *dp,
+ struct switchdev_brport_flags flags,
+ struct netlink_ext_ack *extack)
+{
+ struct dsa_switch *ds = dp->ds;
+
+ if (!ds->ops->port_pre_bridge_flags)
+ return -EINVAL;
+
+ return ds->ops->port_pre_bridge_flags(ds, dp->index, flags, extack);
+}
+
+int dsa_port_bridge_flags(struct dsa_port *dp,
+ struct switchdev_brport_flags flags,
+ struct netlink_ext_ack *extack)
+{
+ struct dsa_switch *ds = dp->ds;
+ int err;
+
+ if (!ds->ops->port_bridge_flags)
+ return -EOPNOTSUPP;
+
+ err = ds->ops->port_bridge_flags(ds, dp->index, flags, extack);
+ if (err)
+ return err;
+
+ if (flags.mask & BR_LEARNING) {
+ bool learning = flags.val & BR_LEARNING;
+
+ if (learning == dp->learning)
+ return 0;
+
+ if ((dp->learning && !learning) &&
+ (dp->stp_state == BR_STATE_LEARNING ||
+ dp->stp_state == BR_STATE_FORWARDING))
+ dsa_port_fast_age(dp);
+
+ dp->learning = learning;
+ }
+
+ return 0;
+}
+
+void dsa_port_set_host_flood(struct dsa_port *dp, bool uc, bool mc)
+{
+ struct dsa_switch *ds = dp->ds;
+
+ if (ds->ops->port_set_host_flood)
+ ds->ops->port_set_host_flood(ds, dp->index, uc, mc);
+}
+
+int dsa_port_vlan_msti(struct dsa_port *dp,
+ const struct switchdev_vlan_msti *msti)
+{
+ struct dsa_switch *ds = dp->ds;
+
+ if (!ds->ops->vlan_msti_set)
+ return -EOPNOTSUPP;
+
+ return ds->ops->vlan_msti_set(ds, *dp->bridge, msti);
+}
+
+int dsa_port_mtu_change(struct dsa_port *dp, int new_mtu)
+{
+ struct dsa_notifier_mtu_info info = {
+ .dp = dp,
+ .mtu = new_mtu,
+ };
+
+ return dsa_port_notify(dp, DSA_NOTIFIER_MTU, &info);
+}
+
+int dsa_port_fdb_add(struct dsa_port *dp, const unsigned char *addr,
+ u16 vid)
+{
+ struct dsa_notifier_fdb_info info = {
+ .dp = dp,
+ .addr = addr,
+ .vid = vid,
+ .db = {
+ .type = DSA_DB_BRIDGE,
+ .bridge = *dp->bridge,
+ },
+ };
+
+ /* Refcounting takes bridge.num as a key, and should be global for all
+ * bridges in the absence of FDB isolation, and per bridge otherwise.
+ * Force the bridge.num to zero here in the absence of FDB isolation.
+ */
+ if (!dp->ds->fdb_isolation)
+ info.db.bridge.num = 0;
+
+ return dsa_port_notify(dp, DSA_NOTIFIER_FDB_ADD, &info);
+}
+
+int dsa_port_fdb_del(struct dsa_port *dp, const unsigned char *addr,
+ u16 vid)
+{
+ struct dsa_notifier_fdb_info info = {
+ .dp = dp,
+ .addr = addr,
+ .vid = vid,
+ .db = {
+ .type = DSA_DB_BRIDGE,
+ .bridge = *dp->bridge,
+ },
+ };
+
+ if (!dp->ds->fdb_isolation)
+ info.db.bridge.num = 0;
+
+ return dsa_port_notify(dp, DSA_NOTIFIER_FDB_DEL, &info);
+}
+
+static int dsa_port_host_fdb_add(struct dsa_port *dp,
+ const unsigned char *addr, u16 vid,
+ struct dsa_db db)
+{
+ struct dsa_notifier_fdb_info info = {
+ .dp = dp,
+ .addr = addr,
+ .vid = vid,
+ .db = db,
+ };
+
+ return dsa_port_notify(dp, DSA_NOTIFIER_HOST_FDB_ADD, &info);
+}
+
+int dsa_port_standalone_host_fdb_add(struct dsa_port *dp,
+ const unsigned char *addr, u16 vid)
+{
+ struct dsa_db db = {
+ .type = DSA_DB_PORT,
+ .dp = dp,
+ };
+
+ return dsa_port_host_fdb_add(dp, addr, vid, db);
+}
+
+int dsa_port_bridge_host_fdb_add(struct dsa_port *dp,
+ const unsigned char *addr, u16 vid)
+{
+ struct net_device *conduit = dsa_port_to_conduit(dp);
+ struct dsa_db db = {
+ .type = DSA_DB_BRIDGE,
+ .bridge = *dp->bridge,
+ };
+ int err;
+
+ if (!dp->ds->fdb_isolation)
+ db.bridge.num = 0;
+
+ /* Avoid a call to __dev_set_promiscuity() on the conduit, which
+ * requires rtnl_lock(), since we can't guarantee that is held here,
+ * and we can't take it either.
+ */
+ if (conduit->priv_flags & IFF_UNICAST_FLT) {
+ err = dev_uc_add(conduit, addr);
+ if (err)
+ return err;
+ }
+
+ return dsa_port_host_fdb_add(dp, addr, vid, db);
+}
+
+static int dsa_port_host_fdb_del(struct dsa_port *dp,
+ const unsigned char *addr, u16 vid,
+ struct dsa_db db)
+{
+ struct dsa_notifier_fdb_info info = {
+ .dp = dp,
+ .addr = addr,
+ .vid = vid,
+ .db = db,
+ };
+
+ return dsa_port_notify(dp, DSA_NOTIFIER_HOST_FDB_DEL, &info);
+}
+
+int dsa_port_standalone_host_fdb_del(struct dsa_port *dp,
+ const unsigned char *addr, u16 vid)
+{
+ struct dsa_db db = {
+ .type = DSA_DB_PORT,
+ .dp = dp,
+ };
+
+ return dsa_port_host_fdb_del(dp, addr, vid, db);
+}
+
+int dsa_port_bridge_host_fdb_del(struct dsa_port *dp,
+ const unsigned char *addr, u16 vid)
+{
+ struct net_device *conduit = dsa_port_to_conduit(dp);
+ struct dsa_db db = {
+ .type = DSA_DB_BRIDGE,
+ .bridge = *dp->bridge,
+ };
+ int err;
+
+ if (!dp->ds->fdb_isolation)
+ db.bridge.num = 0;
+
+ if (conduit->priv_flags & IFF_UNICAST_FLT) {
+ err = dev_uc_del(conduit, addr);
+ if (err)
+ return err;
+ }
+
+ return dsa_port_host_fdb_del(dp, addr, vid, db);
+}
+
+int dsa_port_lag_fdb_add(struct dsa_port *dp, const unsigned char *addr,
+ u16 vid)
+{
+ struct dsa_notifier_lag_fdb_info info = {
+ .lag = dp->lag,
+ .addr = addr,
+ .vid = vid,
+ .db = {
+ .type = DSA_DB_BRIDGE,
+ .bridge = *dp->bridge,
+ },
+ };
+
+ if (!dp->ds->fdb_isolation)
+ info.db.bridge.num = 0;
+
+ return dsa_port_notify(dp, DSA_NOTIFIER_LAG_FDB_ADD, &info);
+}
+
+int dsa_port_lag_fdb_del(struct dsa_port *dp, const unsigned char *addr,
+ u16 vid)
+{
+ struct dsa_notifier_lag_fdb_info info = {
+ .lag = dp->lag,
+ .addr = addr,
+ .vid = vid,
+ .db = {
+ .type = DSA_DB_BRIDGE,
+ .bridge = *dp->bridge,
+ },
+ };
+
+ if (!dp->ds->fdb_isolation)
+ info.db.bridge.num = 0;
+
+ return dsa_port_notify(dp, DSA_NOTIFIER_LAG_FDB_DEL, &info);
+}
+
+int dsa_port_fdb_dump(struct dsa_port *dp, dsa_fdb_dump_cb_t *cb, void *data)
+{
+ struct dsa_switch *ds = dp->ds;
+ int port = dp->index;
+
+ if (!ds->ops->port_fdb_dump)
+ return -EOPNOTSUPP;
+
+ return ds->ops->port_fdb_dump(ds, port, cb, data);
+}
+
+int dsa_port_mdb_add(const struct dsa_port *dp,
+ const struct switchdev_obj_port_mdb *mdb)
+{
+ struct dsa_notifier_mdb_info info = {
+ .dp = dp,
+ .mdb = mdb,
+ .db = {
+ .type = DSA_DB_BRIDGE,
+ .bridge = *dp->bridge,
+ },
+ };
+
+ if (!dp->ds->fdb_isolation)
+ info.db.bridge.num = 0;
+
+ return dsa_port_notify(dp, DSA_NOTIFIER_MDB_ADD, &info);
+}
+
+int dsa_port_mdb_del(const struct dsa_port *dp,
+ const struct switchdev_obj_port_mdb *mdb)
+{
+ struct dsa_notifier_mdb_info info = {
+ .dp = dp,
+ .mdb = mdb,
+ .db = {
+ .type = DSA_DB_BRIDGE,
+ .bridge = *dp->bridge,
+ },
+ };
+
+ if (!dp->ds->fdb_isolation)
+ info.db.bridge.num = 0;
+
+ return dsa_port_notify(dp, DSA_NOTIFIER_MDB_DEL, &info);
+}
+
+static int dsa_port_host_mdb_add(const struct dsa_port *dp,
+ const struct switchdev_obj_port_mdb *mdb,
+ struct dsa_db db)
+{
+ struct dsa_notifier_mdb_info info = {
+ .dp = dp,
+ .mdb = mdb,
+ .db = db,
+ };
+
+ return dsa_port_notify(dp, DSA_NOTIFIER_HOST_MDB_ADD, &info);
+}
+
+int dsa_port_standalone_host_mdb_add(const struct dsa_port *dp,
+ const struct switchdev_obj_port_mdb *mdb)
+{
+ struct dsa_db db = {
+ .type = DSA_DB_PORT,
+ .dp = dp,
+ };
+
+ return dsa_port_host_mdb_add(dp, mdb, db);
+}
+
+int dsa_port_bridge_host_mdb_add(const struct dsa_port *dp,
+ const struct switchdev_obj_port_mdb *mdb)
+{
+ struct net_device *conduit = dsa_port_to_conduit(dp);
+ struct dsa_db db = {
+ .type = DSA_DB_BRIDGE,
+ .bridge = *dp->bridge,
+ };
+ int err;
+
+ if (!dp->ds->fdb_isolation)
+ db.bridge.num = 0;
+
+ err = dev_mc_add(conduit, mdb->addr);
+ if (err)
+ return err;
+
+ return dsa_port_host_mdb_add(dp, mdb, db);
+}
+
+static int dsa_port_host_mdb_del(const struct dsa_port *dp,
+ const struct switchdev_obj_port_mdb *mdb,
+ struct dsa_db db)
+{
+ struct dsa_notifier_mdb_info info = {
+ .dp = dp,
+ .mdb = mdb,
+ .db = db,
+ };
+
+ return dsa_port_notify(dp, DSA_NOTIFIER_HOST_MDB_DEL, &info);
+}
+
+int dsa_port_standalone_host_mdb_del(const struct dsa_port *dp,
+ const struct switchdev_obj_port_mdb *mdb)
+{
+ struct dsa_db db = {
+ .type = DSA_DB_PORT,
+ .dp = dp,
+ };
+
+ return dsa_port_host_mdb_del(dp, mdb, db);
+}
+
+int dsa_port_bridge_host_mdb_del(const struct dsa_port *dp,
+ const struct switchdev_obj_port_mdb *mdb)
+{
+ struct net_device *conduit = dsa_port_to_conduit(dp);
+ struct dsa_db db = {
+ .type = DSA_DB_BRIDGE,
+ .bridge = *dp->bridge,
+ };
+ int err;
+
+ if (!dp->ds->fdb_isolation)
+ db.bridge.num = 0;
+
+ err = dev_mc_del(conduit, mdb->addr);
+ if (err)
+ return err;
+
+ return dsa_port_host_mdb_del(dp, mdb, db);
+}
+
+int dsa_port_vlan_add(struct dsa_port *dp,
+ const struct switchdev_obj_port_vlan *vlan,
+ struct netlink_ext_ack *extack)
+{
+ struct dsa_notifier_vlan_info info = {
+ .dp = dp,
+ .vlan = vlan,
+ .extack = extack,
+ };
+
+ return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_ADD, &info);
+}
+
+int dsa_port_vlan_del(struct dsa_port *dp,
+ const struct switchdev_obj_port_vlan *vlan)
+{
+ struct dsa_notifier_vlan_info info = {
+ .dp = dp,
+ .vlan = vlan,
+ };
+
+ return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_DEL, &info);
+}
+
+int dsa_port_host_vlan_add(struct dsa_port *dp,
+ const struct switchdev_obj_port_vlan *vlan,
+ struct netlink_ext_ack *extack)
+{
+ struct net_device *conduit = dsa_port_to_conduit(dp);
+ struct dsa_notifier_vlan_info info = {
+ .dp = dp,
+ .vlan = vlan,
+ .extack = extack,
+ };
+ int err;
+
+ err = dsa_port_notify(dp, DSA_NOTIFIER_HOST_VLAN_ADD, &info);
+ if (err && err != -EOPNOTSUPP)
+ return err;
+
+ vlan_vid_add(conduit, htons(ETH_P_8021Q), vlan->vid);
+
+ return err;
+}
+
+int dsa_port_host_vlan_del(struct dsa_port *dp,
+ const struct switchdev_obj_port_vlan *vlan)
+{
+ struct net_device *conduit = dsa_port_to_conduit(dp);
+ struct dsa_notifier_vlan_info info = {
+ .dp = dp,
+ .vlan = vlan,
+ };
+ int err;
+
+ err = dsa_port_notify(dp, DSA_NOTIFIER_HOST_VLAN_DEL, &info);
+ if (err && err != -EOPNOTSUPP)
+ return err;
+
+ vlan_vid_del(conduit, htons(ETH_P_8021Q), vlan->vid);
+
+ return err;
+}
+
+int dsa_port_mrp_add(const struct dsa_port *dp,
+ const struct switchdev_obj_mrp *mrp)
+{
+ struct dsa_switch *ds = dp->ds;
+
+ if (!ds->ops->port_mrp_add)
+ return -EOPNOTSUPP;
+
+ return ds->ops->port_mrp_add(ds, dp->index, mrp);
+}
+
+int dsa_port_mrp_del(const struct dsa_port *dp,
+ const struct switchdev_obj_mrp *mrp)
+{
+ struct dsa_switch *ds = dp->ds;
+
+ if (!ds->ops->port_mrp_del)
+ return -EOPNOTSUPP;
+
+ return ds->ops->port_mrp_del(ds, dp->index, mrp);
+}
+
+int dsa_port_mrp_add_ring_role(const struct dsa_port *dp,
+ const struct switchdev_obj_ring_role_mrp *mrp)
+{
+ struct dsa_switch *ds = dp->ds;
+
+ if (!ds->ops->port_mrp_add_ring_role)
+ return -EOPNOTSUPP;
+
+ return ds->ops->port_mrp_add_ring_role(ds, dp->index, mrp);
+}
+
+int dsa_port_mrp_del_ring_role(const struct dsa_port *dp,
+ const struct switchdev_obj_ring_role_mrp *mrp)
+{
+ struct dsa_switch *ds = dp->ds;
+
+ if (!ds->ops->port_mrp_del_ring_role)
+ return -EOPNOTSUPP;
+
+ return ds->ops->port_mrp_del_ring_role(ds, dp->index, mrp);
+}
+
+static int dsa_port_assign_conduit(struct dsa_port *dp,
+ struct net_device *conduit,
+ struct netlink_ext_ack *extack,
+ bool fail_on_err)
+{
+ struct dsa_switch *ds = dp->ds;
+ int port = dp->index, err;
+
+ err = ds->ops->port_change_conduit(ds, port, conduit, extack);
+ if (err && !fail_on_err)
+ dev_err(ds->dev, "port %d failed to assign conduit %s: %pe\n",
+ port, conduit->name, ERR_PTR(err));
+
+ if (err && fail_on_err)
+ return err;
+
+ dp->cpu_dp = conduit->dsa_ptr;
+ dp->cpu_port_in_lag = netif_is_lag_master(conduit);
+
+ return 0;
+}
+
+/* Change the dp->cpu_dp affinity for a user port. Note that both cross-chip
+ * notifiers and drivers have implicit assumptions about user-to-CPU-port
+ * mappings, so we unfortunately cannot delay the deletion of the objects
+ * (switchdev, standalone addresses, standalone VLANs) on the old CPU port
+ * until the new CPU port has been set up. So we need to completely tear down
+ * the old CPU port before changing it, and restore it on errors during the
+ * bringup of the new one.
+ */
+int dsa_port_change_conduit(struct dsa_port *dp, struct net_device *conduit,
+ struct netlink_ext_ack *extack)
+{
+ struct net_device *bridge_dev = dsa_port_bridge_dev_get(dp);
+ struct net_device *old_conduit = dsa_port_to_conduit(dp);
+ struct net_device *dev = dp->user;
+ struct dsa_switch *ds = dp->ds;
+ bool vlan_filtering;
+ int err, tmp;
+
+ /* Bridges may hold host FDB, MDB and VLAN objects. These need to be
+ * migrated, so dynamically unoffload and later reoffload the bridge
+ * port.
+ */
+ if (bridge_dev) {
+ dsa_port_pre_bridge_leave(dp, bridge_dev);
+ dsa_port_bridge_leave(dp, bridge_dev);
+ }
+
+ /* The port might still be VLAN filtering even if it's no longer
+ * under a bridge, either due to ds->vlan_filtering_is_global or
+ * ds->needs_standalone_vlan_filtering. In turn this means VLANs
+ * on the CPU port.
+ */
+ vlan_filtering = dsa_port_is_vlan_filtering(dp);
+ if (vlan_filtering) {
+ err = dsa_user_manage_vlan_filtering(dev, false);
+ if (err) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Failed to remove standalone VLANs");
+ goto rewind_old_bridge;
+ }
+ }
+
+ /* Standalone addresses, and addresses of upper interfaces like
+ * VLAN, LAG, HSR need to be migrated.
+ */
+ dsa_user_unsync_ha(dev);
+
+ /* If live-changing, we also need to uninstall the user device address
+ * from the port FDB and the conduit interface.
+ */
+ if (dev->flags & IFF_UP)
+ dsa_user_host_uc_uninstall(dev);
+
+ err = dsa_port_assign_conduit(dp, conduit, extack, true);
+ if (err)
+ goto rewind_old_addrs;
+
+ /* If the port doesn't have its own MAC address and relies on the DSA
+ * conduit's one, inherit it again from the new DSA conduit.
+ */
+ if (is_zero_ether_addr(dp->mac))
+ eth_hw_addr_inherit(dev, conduit);
+
+ /* If live-changing, we need to install the user device address to the
+ * port FDB and the conduit interface.
+ */
+ if (dev->flags & IFF_UP) {
+ err = dsa_user_host_uc_install(dev, dev->dev_addr);
+ if (err) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Failed to install host UC address");
+ goto rewind_addr_inherit;
+ }
+ }
+
+ dsa_user_sync_ha(dev);
+
+ if (vlan_filtering) {
+ err = dsa_user_manage_vlan_filtering(dev, true);
+ if (err) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Failed to restore standalone VLANs");
+ goto rewind_new_addrs;
+ }
+ }
+
+ if (bridge_dev) {
+ err = dsa_port_bridge_join(dp, bridge_dev, extack);
+ if (err && err == -EOPNOTSUPP) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Failed to reoffload bridge");
+ goto rewind_new_vlan;
+ }
+ }
+
+ return 0;
+
+rewind_new_vlan:
+ if (vlan_filtering)
+ dsa_user_manage_vlan_filtering(dev, false);
+
+rewind_new_addrs:
+ dsa_user_unsync_ha(dev);
+
+ if (dev->flags & IFF_UP)
+ dsa_user_host_uc_uninstall(dev);
+
+rewind_addr_inherit:
+ if (is_zero_ether_addr(dp->mac))
+ eth_hw_addr_inherit(dev, old_conduit);
+
+ dsa_port_assign_conduit(dp, old_conduit, NULL, false);
+
+/* Restore the objects on the old CPU port */
+rewind_old_addrs:
+ if (dev->flags & IFF_UP) {
+ tmp = dsa_user_host_uc_install(dev, dev->dev_addr);
+ if (tmp) {
+ dev_err(ds->dev,
+ "port %d failed to restore host UC address: %pe\n",
+ dp->index, ERR_PTR(tmp));
+ }
+ }
+
+ dsa_user_sync_ha(dev);
+
+ if (vlan_filtering) {
+ tmp = dsa_user_manage_vlan_filtering(dev, true);
+ if (tmp) {
+ dev_err(ds->dev,
+ "port %d failed to restore standalone VLANs: %pe\n",
+ dp->index, ERR_PTR(tmp));
+ }
+ }
+
+rewind_old_bridge:
+ if (bridge_dev) {
+ tmp = dsa_port_bridge_join(dp, bridge_dev, extack);
+ if (tmp) {
+ dev_err(ds->dev,
+ "port %d failed to rejoin bridge %s: %pe\n",
+ dp->index, bridge_dev->name, ERR_PTR(tmp));
+ }
+ }
+
+ return err;
+}
+
+void dsa_port_set_tag_protocol(struct dsa_port *cpu_dp,
+ const struct dsa_device_ops *tag_ops)
+{
+ cpu_dp->rcv = tag_ops->rcv;
+ cpu_dp->tag_ops = tag_ops;
+}
+
+/* dsa_supports_eee - indicate that EEE is supported
+ * @ds: pointer to &struct dsa_switch
+ * @port: port index
+ *
+ * A default implementation for the .support_eee() DSA operations member,
+ * which drivers can use to indicate that they support EEE on all of their
+ * user ports.
+ *
+ * Returns: true
+ */
+bool dsa_supports_eee(struct dsa_switch *ds, int port)
+{
+ return true;
+}
+EXPORT_SYMBOL_GPL(dsa_supports_eee);
+
+static void dsa_port_phylink_mac_config(struct phylink_config *config,
+ unsigned int mode,
+ const struct phylink_link_state *state)
+{
+}
+
+static void dsa_port_phylink_mac_link_down(struct phylink_config *config,
+ unsigned int mode,
+ phy_interface_t interface)
+{
+}
+
+static void dsa_port_phylink_mac_link_up(struct phylink_config *config,
+ struct phy_device *phydev,
+ unsigned int mode,
+ phy_interface_t interface,
+ int speed, int duplex,
+ bool tx_pause, bool rx_pause)
+{
+}
+
+static const struct phylink_mac_ops dsa_port_phylink_mac_ops = {
+ .mac_config = dsa_port_phylink_mac_config,
+ .mac_link_down = dsa_port_phylink_mac_link_down,
+ .mac_link_up = dsa_port_phylink_mac_link_up,
+};
+
+int dsa_port_phylink_create(struct dsa_port *dp)
+{
+ const struct phylink_mac_ops *mac_ops;
+ struct dsa_switch *ds = dp->ds;
+ phy_interface_t mode;
+ struct phylink *pl;
+ int err;
+
+ err = of_get_phy_mode(dp->dn, &mode);
+ if (err)
+ mode = PHY_INTERFACE_MODE_NA;
+
+ if (ds->ops->phylink_get_caps) {
+ ds->ops->phylink_get_caps(ds, dp->index, &dp->pl_config);
+ } else {
+ /* For legacy drivers */
+ if (mode != PHY_INTERFACE_MODE_NA) {
+ __set_bit(mode, dp->pl_config.supported_interfaces);
+ } else {
+ __set_bit(PHY_INTERFACE_MODE_INTERNAL,
+ dp->pl_config.supported_interfaces);
+ __set_bit(PHY_INTERFACE_MODE_GMII,
+ dp->pl_config.supported_interfaces);
+ }
+ }
+
+ mac_ops = &dsa_port_phylink_mac_ops;
+ if (ds->phylink_mac_ops)
+ mac_ops = ds->phylink_mac_ops;
+
+ pl = phylink_create(&dp->pl_config, of_fwnode_handle(dp->dn), mode,
+ mac_ops);
+ if (IS_ERR(pl)) {
+ pr_err("error creating PHYLINK: %ld\n", PTR_ERR(pl));
+ return PTR_ERR(pl);
+ }
+
+ dp->pl = pl;
+
+ return 0;
+}
+
+void dsa_port_phylink_destroy(struct dsa_port *dp)
+{
+ phylink_destroy(dp->pl);
+ dp->pl = NULL;
+}
+
+static int dsa_shared_port_phylink_register(struct dsa_port *dp)
+{
+ struct dsa_switch *ds = dp->ds;
+ struct device_node *port_dn = dp->dn;
+ int err;
+
+ dp->pl_config.dev = ds->dev;
+ dp->pl_config.type = PHYLINK_DEV;
+
+ err = dsa_port_phylink_create(dp);
+ if (err)
+ return err;
+
+ err = phylink_of_phy_connect(dp->pl, port_dn, 0);
+ if (err && err != -ENODEV) {
+ pr_err("could not attach to PHY: %d\n", err);
+ goto err_phy_connect;
+ }
+
+ return 0;
+
+err_phy_connect:
+ dsa_port_phylink_destroy(dp);
+ return err;
+}
+
+/* During the initial DSA driver migration to OF, port nodes were sometimes
+ * added to device trees with no indication of how they should operate from a
+ * link management perspective (phy-handle, fixed-link, etc). Additionally, the
+ * phy-mode may be absent. The interpretation of these port OF nodes depends on
+ * their type.
+ *
+ * User ports with no phy-handle or fixed-link are expected to connect to an
+ * internal PHY located on the ds->user_mii_bus at an MDIO address equal to
+ * the port number. This description is still actively supported.
+ *
+ * Shared (CPU and DSA) ports with no phy-handle or fixed-link are expected to
+ * operate at the maximum speed that their phy-mode is capable of. If the
+ * phy-mode is absent, they are expected to operate using the phy-mode
+ * supported by the port that gives the highest link speed. It is unspecified
+ * if the port should use flow control or not, half duplex or full duplex, or
+ * if the phy-mode is a SERDES link, whether in-band autoneg is expected to be
+ * enabled or not.
+ *
+ * In the latter case of shared ports, omitting the link management description
+ * from the firmware node is deprecated and strongly discouraged. DSA uses
+ * phylink, which rejects the firmware nodes of these ports for lacking
+ * required properties.
+ *
+ * For switches in this table, DSA will skip enforcing validation and will
+ * later omit registering a phylink instance for the shared ports, if they lack
+ * a fixed-link, a phy-handle, or a managed = "in-band-status" property.
+ * It becomes the responsibility of the driver to ensure that these ports
+ * operate at the maximum speed (whatever this means) and will interoperate
+ * with the DSA conduit or other cascade port, since phylink methods will not be
+ * invoked for them.
+ *
+ * If you are considering expanding this table for newly introduced switches,
+ * think again. It is OK to remove switches from this table if there aren't DT
+ * blobs in circulation which rely on defaulting the shared ports.
+ */
+static const char * const dsa_switches_apply_workarounds[] = {
+#if IS_ENABLED(CONFIG_NET_DSA_XRS700X)
+ "arrow,xrs7003e",
+ "arrow,xrs7003f",
+ "arrow,xrs7004e",
+ "arrow,xrs7004f",
+#endif
+#if IS_ENABLED(CONFIG_B53)
+ "brcm,bcm5325",
+ "brcm,bcm53115",
+ "brcm,bcm53125",
+ "brcm,bcm53128",
+ "brcm,bcm5365",
+ "brcm,bcm5389",
+ "brcm,bcm5395",
+ "brcm,bcm5397",
+ "brcm,bcm5398",
+ "brcm,bcm53010-srab",
+ "brcm,bcm53011-srab",
+ "brcm,bcm53012-srab",
+ "brcm,bcm53018-srab",
+ "brcm,bcm53019-srab",
+ "brcm,bcm5301x-srab",
+ "brcm,bcm11360-srab",
+ "brcm,bcm58522-srab",
+ "brcm,bcm58525-srab",
+ "brcm,bcm58535-srab",
+ "brcm,bcm58622-srab",
+ "brcm,bcm58623-srab",
+ "brcm,bcm58625-srab",
+ "brcm,bcm88312-srab",
+ "brcm,cygnus-srab",
+ "brcm,nsp-srab",
+ "brcm,omega-srab",
+ "brcm,bcm3384-switch",
+ "brcm,bcm6328-switch",
+ "brcm,bcm6368-switch",
+ "brcm,bcm63xx-switch",
+#endif
+#if IS_ENABLED(CONFIG_NET_DSA_BCM_SF2)
+ "brcm,bcm7445-switch-v4.0",
+ "brcm,bcm7278-switch-v4.0",
+ "brcm,bcm7278-switch-v4.8",
+#endif
+#if IS_ENABLED(CONFIG_NET_DSA_LANTIQ_GSWIP)
+ "lantiq,xrx200-gswip",
+ "lantiq,xrx300-gswip",
+ "lantiq,xrx330-gswip",
+#endif
+#if IS_ENABLED(CONFIG_NET_DSA_MV88E6060)
+ "marvell,mv88e6060",
+#endif
+#if IS_ENABLED(CONFIG_NET_DSA_MV88E6XXX)
+ "marvell,mv88e6085",
+ "marvell,mv88e6190",
+ "marvell,mv88e6250",
+#endif
+#if IS_ENABLED(CONFIG_NET_DSA_MICROCHIP_KSZ_COMMON)
+ "microchip,ksz8765",
+ "microchip,ksz8794",
+ "microchip,ksz8795",
+ "microchip,ksz8863",
+ "microchip,ksz8873",
+ "microchip,ksz9477",
+ "microchip,ksz9897",
+ "microchip,ksz9893",
+ "microchip,ksz9563",
+ "microchip,ksz8563",
+ "microchip,ksz9567",
+#endif
+#if IS_ENABLED(CONFIG_NET_DSA_SMSC_LAN9303_MDIO)
+ "smsc,lan9303-mdio",
+#endif
+#if IS_ENABLED(CONFIG_NET_DSA_SMSC_LAN9303_I2C)
+ "smsc,lan9303-i2c",
+#endif
+ NULL,
+};
+
+static void dsa_shared_port_validate_of(struct dsa_port *dp,
+ bool *missing_phy_mode,
+ bool *missing_link_description)
+{
+ struct device_node *dn = dp->dn, *phy_np;
+ struct dsa_switch *ds = dp->ds;
+ phy_interface_t mode;
+
+ *missing_phy_mode = false;
+ *missing_link_description = false;
+
+ if (of_get_phy_mode(dn, &mode)) {
+ *missing_phy_mode = true;
+ dev_err(ds->dev,
+ "OF node %pOF of %s port %d lacks the required \"phy-mode\" property\n",
+ dn, dsa_port_is_cpu(dp) ? "CPU" : "DSA", dp->index);
+ }
+
+ /* Note: of_phy_is_fixed_link() also returns true for
+ * managed = "in-band-status"
+ */
+ if (of_phy_is_fixed_link(dn))
+ return;
+
+ phy_np = of_parse_phandle(dn, "phy-handle", 0);
+ if (phy_np) {
+ of_node_put(phy_np);
+ return;
+ }
+
+ *missing_link_description = true;
+
+ dev_err(ds->dev,
+ "OF node %pOF of %s port %d lacks the required \"phy-handle\", \"fixed-link\" or \"managed\" properties\n",
+ dn, dsa_port_is_cpu(dp) ? "CPU" : "DSA", dp->index);
+}
+
+static void dsa_shared_port_link_down(struct dsa_port *dp)
+{
+ struct dsa_switch *ds = dp->ds;
+
+ if (ds->phylink_mac_ops && ds->phylink_mac_ops->mac_link_down)
+ ds->phylink_mac_ops->mac_link_down(&dp->pl_config, MLO_AN_FIXED,
+ PHY_INTERFACE_MODE_NA);
+}
+
+int dsa_shared_port_link_register_of(struct dsa_port *dp)
+{
+ struct dsa_switch *ds = dp->ds;
+ bool missing_link_description;
+ bool missing_phy_mode;
+
+ dsa_shared_port_validate_of(dp, &missing_phy_mode,
+ &missing_link_description);
+
+ if ((missing_phy_mode || missing_link_description) &&
+ !of_device_compatible_match(ds->dev->of_node,
+ dsa_switches_apply_workarounds))
+ return -EINVAL;
+
+ if (missing_link_description) {
+ dev_warn(ds->dev,
+ "Skipping phylink registration for %s port %d\n",
+ dsa_port_is_cpu(dp) ? "CPU" : "DSA", dp->index);
+ } else {
+ dsa_shared_port_link_down(dp);
+
+ return dsa_shared_port_phylink_register(dp);
+ }
+
+ return 0;
+}
+
+void dsa_shared_port_link_unregister_of(struct dsa_port *dp)
+{
+ if (dp->pl) {
+ rtnl_lock();
+ phylink_disconnect_phy(dp->pl);
+ rtnl_unlock();
+ dsa_port_phylink_destroy(dp);
+ return;
+ }
+}
+
+int dsa_port_hsr_join(struct dsa_port *dp, struct net_device *hsr,
+ struct netlink_ext_ack *extack)
+{
+ struct dsa_switch *ds = dp->ds;
+ int err;
+
+ if (!ds->ops->port_hsr_join)
+ return -EOPNOTSUPP;
+
+ dp->hsr_dev = hsr;
+
+ err = ds->ops->port_hsr_join(ds, dp->index, hsr, extack);
+ if (err)
+ dp->hsr_dev = NULL;
+
+ return err;
+}
+
+void dsa_port_hsr_leave(struct dsa_port *dp, struct net_device *hsr)
+{
+ struct dsa_switch *ds = dp->ds;
+ int err;
+
+ if (!dp->hsr_dev)
+ return;
+
+ dp->hsr_dev = NULL;
+
+ if (ds->ops->port_hsr_leave) {
+ err = ds->ops->port_hsr_leave(ds, dp->index, hsr);
+ if (err)
+ dev_err(dp->ds->dev,
+ "port %d failed to leave HSR %s: %pe\n",
+ dp->index, hsr->name, ERR_PTR(err));
+ }
+}
+
+int dsa_port_tag_8021q_vlan_add(struct dsa_port *dp, u16 vid, bool broadcast)
+{
+ struct dsa_notifier_tag_8021q_vlan_info info = {
+ .dp = dp,
+ .vid = vid,
+ };
+
+ if (broadcast)
+ return dsa_broadcast(DSA_NOTIFIER_TAG_8021Q_VLAN_ADD, &info);
+
+ return dsa_port_notify(dp, DSA_NOTIFIER_TAG_8021Q_VLAN_ADD, &info);
+}
+
+void dsa_port_tag_8021q_vlan_del(struct dsa_port *dp, u16 vid, bool broadcast)
+{
+ struct dsa_notifier_tag_8021q_vlan_info info = {
+ .dp = dp,
+ .vid = vid,
+ };
+ int err;
+
+ if (broadcast)
+ err = dsa_broadcast(DSA_NOTIFIER_TAG_8021Q_VLAN_DEL, &info);
+ else
+ err = dsa_port_notify(dp, DSA_NOTIFIER_TAG_8021Q_VLAN_DEL, &info);
+ if (err)
+ dev_err(dp->ds->dev,
+ "port %d failed to notify tag_8021q VLAN %d deletion: %pe\n",
+ dp->index, vid, ERR_PTR(err));
+}
diff --git a/net/dsa/port.h b/net/dsa/port.h
new file mode 100644
index 000000000000..6bc3291573c0
--- /dev/null
+++ b/net/dsa/port.h
@@ -0,0 +1,115 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifndef __DSA_PORT_H
+#define __DSA_PORT_H
+
+#include <linux/types.h>
+#include <net/dsa.h>
+
+struct ifreq;
+struct netdev_lag_lower_state_info;
+struct netdev_lag_upper_info;
+struct netlink_ext_ack;
+struct switchdev_mst_state;
+struct switchdev_obj_port_mdb;
+struct switchdev_vlan_msti;
+struct phy_device;
+
+bool dsa_port_supports_hwtstamp(struct dsa_port *dp);
+void dsa_port_set_tag_protocol(struct dsa_port *cpu_dp,
+ const struct dsa_device_ops *tag_ops);
+int dsa_port_set_state(struct dsa_port *dp, u8 state, bool do_fast_age);
+int dsa_port_set_mst_state(struct dsa_port *dp,
+ const struct switchdev_mst_state *state,
+ struct netlink_ext_ack *extack);
+int dsa_port_enable_rt(struct dsa_port *dp, struct phy_device *phy);
+int dsa_port_enable(struct dsa_port *dp, struct phy_device *phy);
+void dsa_port_disable_rt(struct dsa_port *dp);
+void dsa_port_disable(struct dsa_port *dp);
+int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br,
+ struct netlink_ext_ack *extack);
+void dsa_port_pre_bridge_leave(struct dsa_port *dp, struct net_device *br);
+void dsa_port_bridge_leave(struct dsa_port *dp, struct net_device *br);
+int dsa_port_lag_change(struct dsa_port *dp,
+ struct netdev_lag_lower_state_info *linfo);
+int dsa_port_lag_join(struct dsa_port *dp, struct net_device *lag_dev,
+ struct netdev_lag_upper_info *uinfo,
+ struct netlink_ext_ack *extack);
+void dsa_port_pre_lag_leave(struct dsa_port *dp, struct net_device *lag_dev);
+void dsa_port_lag_leave(struct dsa_port *dp, struct net_device *lag_dev);
+int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering,
+ struct netlink_ext_ack *extack);
+bool dsa_port_skip_vlan_configuration(struct dsa_port *dp);
+int dsa_port_ageing_time(struct dsa_port *dp, clock_t ageing_clock);
+int dsa_port_mst_enable(struct dsa_port *dp, bool on,
+ struct netlink_ext_ack *extack);
+int dsa_port_vlan_msti(struct dsa_port *dp,
+ const struct switchdev_vlan_msti *msti);
+int dsa_port_mtu_change(struct dsa_port *dp, int new_mtu);
+int dsa_port_fdb_add(struct dsa_port *dp, const unsigned char *addr,
+ u16 vid);
+int dsa_port_fdb_del(struct dsa_port *dp, const unsigned char *addr,
+ u16 vid);
+int dsa_port_standalone_host_fdb_add(struct dsa_port *dp,
+ const unsigned char *addr, u16 vid);
+int dsa_port_standalone_host_fdb_del(struct dsa_port *dp,
+ const unsigned char *addr, u16 vid);
+int dsa_port_bridge_host_fdb_add(struct dsa_port *dp, const unsigned char *addr,
+ u16 vid);
+int dsa_port_bridge_host_fdb_del(struct dsa_port *dp, const unsigned char *addr,
+ u16 vid);
+int dsa_port_lag_fdb_add(struct dsa_port *dp, const unsigned char *addr,
+ u16 vid);
+int dsa_port_lag_fdb_del(struct dsa_port *dp, const unsigned char *addr,
+ u16 vid);
+int dsa_port_fdb_dump(struct dsa_port *dp, dsa_fdb_dump_cb_t *cb, void *data);
+int dsa_port_mdb_add(const struct dsa_port *dp,
+ const struct switchdev_obj_port_mdb *mdb);
+int dsa_port_mdb_del(const struct dsa_port *dp,
+ const struct switchdev_obj_port_mdb *mdb);
+int dsa_port_standalone_host_mdb_add(const struct dsa_port *dp,
+ const struct switchdev_obj_port_mdb *mdb);
+int dsa_port_standalone_host_mdb_del(const struct dsa_port *dp,
+ const struct switchdev_obj_port_mdb *mdb);
+int dsa_port_bridge_host_mdb_add(const struct dsa_port *dp,
+ const struct switchdev_obj_port_mdb *mdb);
+int dsa_port_bridge_host_mdb_del(const struct dsa_port *dp,
+ const struct switchdev_obj_port_mdb *mdb);
+int dsa_port_pre_bridge_flags(const struct dsa_port *dp,
+ struct switchdev_brport_flags flags,
+ struct netlink_ext_ack *extack);
+int dsa_port_bridge_flags(struct dsa_port *dp,
+ struct switchdev_brport_flags flags,
+ struct netlink_ext_ack *extack);
+int dsa_port_vlan_add(struct dsa_port *dp,
+ const struct switchdev_obj_port_vlan *vlan,
+ struct netlink_ext_ack *extack);
+int dsa_port_vlan_del(struct dsa_port *dp,
+ const struct switchdev_obj_port_vlan *vlan);
+int dsa_port_host_vlan_add(struct dsa_port *dp,
+ const struct switchdev_obj_port_vlan *vlan,
+ struct netlink_ext_ack *extack);
+int dsa_port_host_vlan_del(struct dsa_port *dp,
+ const struct switchdev_obj_port_vlan *vlan);
+int dsa_port_mrp_add(const struct dsa_port *dp,
+ const struct switchdev_obj_mrp *mrp);
+int dsa_port_mrp_del(const struct dsa_port *dp,
+ const struct switchdev_obj_mrp *mrp);
+int dsa_port_mrp_add_ring_role(const struct dsa_port *dp,
+ const struct switchdev_obj_ring_role_mrp *mrp);
+int dsa_port_mrp_del_ring_role(const struct dsa_port *dp,
+ const struct switchdev_obj_ring_role_mrp *mrp);
+int dsa_port_phylink_create(struct dsa_port *dp);
+void dsa_port_phylink_destroy(struct dsa_port *dp);
+int dsa_shared_port_link_register_of(struct dsa_port *dp);
+void dsa_shared_port_link_unregister_of(struct dsa_port *dp);
+int dsa_port_hsr_join(struct dsa_port *dp, struct net_device *hsr,
+ struct netlink_ext_ack *extack);
+void dsa_port_hsr_leave(struct dsa_port *dp, struct net_device *hsr);
+int dsa_port_tag_8021q_vlan_add(struct dsa_port *dp, u16 vid, bool broadcast);
+void dsa_port_tag_8021q_vlan_del(struct dsa_port *dp, u16 vid, bool broadcast);
+void dsa_port_set_host_flood(struct dsa_port *dp, bool uc, bool mc);
+int dsa_port_change_conduit(struct dsa_port *dp, struct net_device *conduit,
+ struct netlink_ext_ack *extack);
+
+#endif
diff --git a/net/dsa/stubs.c b/net/dsa/stubs.c
new file mode 100644
index 000000000000..2ed8a6c85fbf
--- /dev/null
+++ b/net/dsa/stubs.c
@@ -0,0 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Stubs for DSA functionality called by the core network stack.
+ * These are necessary because CONFIG_NET_DSA can be a module, and built-in
+ * code cannot directly call symbols exported by modules.
+ */
+#include <net/dsa_stubs.h>
+
+const struct dsa_stubs *dsa_stubs;
+EXPORT_SYMBOL_GPL(dsa_stubs);
diff --git a/net/dsa/switch.c b/net/dsa/switch.c
new file mode 100644
index 000000000000..3d2feeea897b
--- /dev/null
+++ b/net/dsa/switch.c
@@ -0,0 +1,1134 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Handling of a single switch chip, part of a switch fabric
+ *
+ * Copyright (c) 2017 Savoir-faire Linux Inc.
+ * Vivien Didelot <vivien.didelot@savoirfairelinux.com>
+ */
+
+#include <linux/if_bridge.h>
+#include <linux/netdevice.h>
+#include <linux/notifier.h>
+#include <linux/if_vlan.h>
+#include <net/switchdev.h>
+
+#include "dsa.h"
+#include "netlink.h"
+#include "port.h"
+#include "switch.h"
+#include "tag_8021q.h"
+#include "trace.h"
+#include "user.h"
+
+static unsigned int dsa_switch_fastest_ageing_time(struct dsa_switch *ds,
+ unsigned int ageing_time)
+{
+ struct dsa_port *dp;
+
+ dsa_switch_for_each_port(dp, ds)
+ if (dp->ageing_time && dp->ageing_time < ageing_time)
+ ageing_time = dp->ageing_time;
+
+ return ageing_time;
+}
+
+static int dsa_switch_ageing_time(struct dsa_switch *ds,
+ struct dsa_notifier_ageing_time_info *info)
+{
+ unsigned int ageing_time = info->ageing_time;
+
+ if (ds->ageing_time_min && ageing_time < ds->ageing_time_min)
+ return -ERANGE;
+
+ if (ds->ageing_time_max && ageing_time > ds->ageing_time_max)
+ return -ERANGE;
+
+ /* Program the fastest ageing time in case of multiple bridges */
+ ageing_time = dsa_switch_fastest_ageing_time(ds, ageing_time);
+
+ if (ds->ops->set_ageing_time)
+ return ds->ops->set_ageing_time(ds, ageing_time);
+
+ return 0;
+}
+
+static bool dsa_port_mtu_match(struct dsa_port *dp,
+ struct dsa_notifier_mtu_info *info)
+{
+ return dp == info->dp || dsa_port_is_dsa(dp) || dsa_port_is_cpu(dp);
+}
+
+static int dsa_switch_mtu(struct dsa_switch *ds,
+ struct dsa_notifier_mtu_info *info)
+{
+ struct dsa_port *dp;
+ int ret;
+
+ if (!ds->ops->port_change_mtu)
+ return -EOPNOTSUPP;
+
+ dsa_switch_for_each_port(dp, ds) {
+ if (dsa_port_mtu_match(dp, info)) {
+ ret = ds->ops->port_change_mtu(ds, dp->index,
+ info->mtu);
+ if (ret)
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+static int dsa_switch_bridge_join(struct dsa_switch *ds,
+ struct dsa_notifier_bridge_info *info)
+{
+ int err;
+
+ if (info->dp->ds == ds) {
+ if (!ds->ops->port_bridge_join)
+ return -EOPNOTSUPP;
+
+ err = ds->ops->port_bridge_join(ds, info->dp->index,
+ info->bridge,
+ &info->tx_fwd_offload,
+ info->extack);
+ if (err)
+ return err;
+ }
+
+ if (info->dp->ds != ds && ds->ops->crosschip_bridge_join) {
+ err = ds->ops->crosschip_bridge_join(ds,
+ info->dp->ds->dst->index,
+ info->dp->ds->index,
+ info->dp->index,
+ info->bridge,
+ info->extack);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+static int dsa_switch_bridge_leave(struct dsa_switch *ds,
+ struct dsa_notifier_bridge_info *info)
+{
+ if (info->dp->ds == ds && ds->ops->port_bridge_leave)
+ ds->ops->port_bridge_leave(ds, info->dp->index, info->bridge);
+
+ if (info->dp->ds != ds && ds->ops->crosschip_bridge_leave)
+ ds->ops->crosschip_bridge_leave(ds, info->dp->ds->dst->index,
+ info->dp->ds->index,
+ info->dp->index,
+ info->bridge);
+
+ return 0;
+}
+
+/* Matches for all upstream-facing ports (the CPU port and all upstream-facing
+ * DSA links) that sit between the targeted port on which the notifier was
+ * emitted and its dedicated CPU port.
+ */
+static bool dsa_port_host_address_match(struct dsa_port *dp,
+ const struct dsa_port *targeted_dp)
+{
+ struct dsa_port *cpu_dp = targeted_dp->cpu_dp;
+
+ if (dsa_switch_is_upstream_of(dp->ds, targeted_dp->ds))
+ return dp->index == dsa_towards_port(dp->ds, cpu_dp->ds->index,
+ cpu_dp->index);
+
+ return false;
+}
+
+static struct dsa_mac_addr *dsa_mac_addr_find(struct list_head *addr_list,
+ const unsigned char *addr, u16 vid,
+ struct dsa_db db)
+{
+ struct dsa_mac_addr *a;
+
+ list_for_each_entry(a, addr_list, list)
+ if (ether_addr_equal(a->addr, addr) && a->vid == vid &&
+ dsa_db_equal(&a->db, &db))
+ return a;
+
+ return NULL;
+}
+
+static int dsa_port_do_mdb_add(struct dsa_port *dp,
+ const struct switchdev_obj_port_mdb *mdb,
+ struct dsa_db db)
+{
+ struct dsa_switch *ds = dp->ds;
+ struct dsa_mac_addr *a;
+ int port = dp->index;
+ int err = 0;
+
+ /* No need to bother with refcounting for user ports */
+ if (!(dsa_port_is_cpu(dp) || dsa_port_is_dsa(dp))) {
+ err = ds->ops->port_mdb_add(ds, port, mdb, db);
+ trace_dsa_mdb_add_hw(dp, mdb->addr, mdb->vid, &db, err);
+
+ return err;
+ }
+
+ mutex_lock(&dp->addr_lists_lock);
+
+ a = dsa_mac_addr_find(&dp->mdbs, mdb->addr, mdb->vid, db);
+ if (a) {
+ refcount_inc(&a->refcount);
+ trace_dsa_mdb_add_bump(dp, mdb->addr, mdb->vid, &db,
+ &a->refcount);
+ goto out;
+ }
+
+ a = kzalloc(sizeof(*a), GFP_KERNEL);
+ if (!a) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = ds->ops->port_mdb_add(ds, port, mdb, db);
+ trace_dsa_mdb_add_hw(dp, mdb->addr, mdb->vid, &db, err);
+ if (err) {
+ kfree(a);
+ goto out;
+ }
+
+ ether_addr_copy(a->addr, mdb->addr);
+ a->vid = mdb->vid;
+ a->db = db;
+ refcount_set(&a->refcount, 1);
+ list_add_tail(&a->list, &dp->mdbs);
+
+out:
+ mutex_unlock(&dp->addr_lists_lock);
+
+ return err;
+}
+
+static int dsa_port_do_mdb_del(struct dsa_port *dp,
+ const struct switchdev_obj_port_mdb *mdb,
+ struct dsa_db db)
+{
+ struct dsa_switch *ds = dp->ds;
+ struct dsa_mac_addr *a;
+ int port = dp->index;
+ int err = 0;
+
+ /* No need to bother with refcounting for user ports */
+ if (!(dsa_port_is_cpu(dp) || dsa_port_is_dsa(dp))) {
+ err = ds->ops->port_mdb_del(ds, port, mdb, db);
+ trace_dsa_mdb_del_hw(dp, mdb->addr, mdb->vid, &db, err);
+
+ return err;
+ }
+
+ mutex_lock(&dp->addr_lists_lock);
+
+ a = dsa_mac_addr_find(&dp->mdbs, mdb->addr, mdb->vid, db);
+ if (!a) {
+ trace_dsa_mdb_del_not_found(dp, mdb->addr, mdb->vid, &db);
+ err = -ENOENT;
+ goto out;
+ }
+
+ if (!refcount_dec_and_test(&a->refcount)) {
+ trace_dsa_mdb_del_drop(dp, mdb->addr, mdb->vid, &db,
+ &a->refcount);
+ goto out;
+ }
+
+ err = ds->ops->port_mdb_del(ds, port, mdb, db);
+ trace_dsa_mdb_del_hw(dp, mdb->addr, mdb->vid, &db, err);
+ if (err) {
+ refcount_set(&a->refcount, 1);
+ goto out;
+ }
+
+ list_del(&a->list);
+ kfree(a);
+
+out:
+ mutex_unlock(&dp->addr_lists_lock);
+
+ return err;
+}
+
+static int dsa_port_do_fdb_add(struct dsa_port *dp, const unsigned char *addr,
+ u16 vid, struct dsa_db db)
+{
+ struct dsa_switch *ds = dp->ds;
+ struct dsa_mac_addr *a;
+ int port = dp->index;
+ int err = 0;
+
+ /* No need to bother with refcounting for user ports */
+ if (!(dsa_port_is_cpu(dp) || dsa_port_is_dsa(dp))) {
+ err = ds->ops->port_fdb_add(ds, port, addr, vid, db);
+ trace_dsa_fdb_add_hw(dp, addr, vid, &db, err);
+
+ return err;
+ }
+
+ mutex_lock(&dp->addr_lists_lock);
+
+ a = dsa_mac_addr_find(&dp->fdbs, addr, vid, db);
+ if (a) {
+ refcount_inc(&a->refcount);
+ trace_dsa_fdb_add_bump(dp, addr, vid, &db, &a->refcount);
+ goto out;
+ }
+
+ a = kzalloc(sizeof(*a), GFP_KERNEL);
+ if (!a) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = ds->ops->port_fdb_add(ds, port, addr, vid, db);
+ trace_dsa_fdb_add_hw(dp, addr, vid, &db, err);
+ if (err) {
+ kfree(a);
+ goto out;
+ }
+
+ ether_addr_copy(a->addr, addr);
+ a->vid = vid;
+ a->db = db;
+ refcount_set(&a->refcount, 1);
+ list_add_tail(&a->list, &dp->fdbs);
+
+out:
+ mutex_unlock(&dp->addr_lists_lock);
+
+ return err;
+}
+
+static int dsa_port_do_fdb_del(struct dsa_port *dp, const unsigned char *addr,
+ u16 vid, struct dsa_db db)
+{
+ struct dsa_switch *ds = dp->ds;
+ struct dsa_mac_addr *a;
+ int port = dp->index;
+ int err = 0;
+
+ /* No need to bother with refcounting for user ports */
+ if (!(dsa_port_is_cpu(dp) || dsa_port_is_dsa(dp))) {
+ err = ds->ops->port_fdb_del(ds, port, addr, vid, db);
+ trace_dsa_fdb_del_hw(dp, addr, vid, &db, err);
+
+ return err;
+ }
+
+ mutex_lock(&dp->addr_lists_lock);
+
+ a = dsa_mac_addr_find(&dp->fdbs, addr, vid, db);
+ if (!a) {
+ trace_dsa_fdb_del_not_found(dp, addr, vid, &db);
+ err = -ENOENT;
+ goto out;
+ }
+
+ if (!refcount_dec_and_test(&a->refcount)) {
+ trace_dsa_fdb_del_drop(dp, addr, vid, &db, &a->refcount);
+ goto out;
+ }
+
+ err = ds->ops->port_fdb_del(ds, port, addr, vid, db);
+ trace_dsa_fdb_del_hw(dp, addr, vid, &db, err);
+ if (err) {
+ refcount_set(&a->refcount, 1);
+ goto out;
+ }
+
+ list_del(&a->list);
+ kfree(a);
+
+out:
+ mutex_unlock(&dp->addr_lists_lock);
+
+ return err;
+}
+
+static int dsa_switch_do_lag_fdb_add(struct dsa_switch *ds, struct dsa_lag *lag,
+ const unsigned char *addr, u16 vid,
+ struct dsa_db db)
+{
+ struct dsa_mac_addr *a;
+ int err = 0;
+
+ mutex_lock(&lag->fdb_lock);
+
+ a = dsa_mac_addr_find(&lag->fdbs, addr, vid, db);
+ if (a) {
+ refcount_inc(&a->refcount);
+ trace_dsa_lag_fdb_add_bump(lag->dev, addr, vid, &db,
+ &a->refcount);
+ goto out;
+ }
+
+ a = kzalloc(sizeof(*a), GFP_KERNEL);
+ if (!a) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = ds->ops->lag_fdb_add(ds, *lag, addr, vid, db);
+ trace_dsa_lag_fdb_add_hw(lag->dev, addr, vid, &db, err);
+ if (err) {
+ kfree(a);
+ goto out;
+ }
+
+ ether_addr_copy(a->addr, addr);
+ a->vid = vid;
+ a->db = db;
+ refcount_set(&a->refcount, 1);
+ list_add_tail(&a->list, &lag->fdbs);
+
+out:
+ mutex_unlock(&lag->fdb_lock);
+
+ return err;
+}
+
+static int dsa_switch_do_lag_fdb_del(struct dsa_switch *ds, struct dsa_lag *lag,
+ const unsigned char *addr, u16 vid,
+ struct dsa_db db)
+{
+ struct dsa_mac_addr *a;
+ int err = 0;
+
+ mutex_lock(&lag->fdb_lock);
+
+ a = dsa_mac_addr_find(&lag->fdbs, addr, vid, db);
+ if (!a) {
+ trace_dsa_lag_fdb_del_not_found(lag->dev, addr, vid, &db);
+ err = -ENOENT;
+ goto out;
+ }
+
+ if (!refcount_dec_and_test(&a->refcount)) {
+ trace_dsa_lag_fdb_del_drop(lag->dev, addr, vid, &db,
+ &a->refcount);
+ goto out;
+ }
+
+ err = ds->ops->lag_fdb_del(ds, *lag, addr, vid, db);
+ trace_dsa_lag_fdb_del_hw(lag->dev, addr, vid, &db, err);
+ if (err) {
+ refcount_set(&a->refcount, 1);
+ goto out;
+ }
+
+ list_del(&a->list);
+ kfree(a);
+
+out:
+ mutex_unlock(&lag->fdb_lock);
+
+ return err;
+}
+
+static int dsa_switch_host_fdb_add(struct dsa_switch *ds,
+ struct dsa_notifier_fdb_info *info)
+{
+ struct dsa_port *dp;
+ int err = 0;
+
+ if (!ds->ops->port_fdb_add)
+ return -EOPNOTSUPP;
+
+ dsa_switch_for_each_port(dp, ds) {
+ if (dsa_port_host_address_match(dp, info->dp)) {
+ if (dsa_port_is_cpu(dp) && info->dp->cpu_port_in_lag) {
+ err = dsa_switch_do_lag_fdb_add(ds, dp->lag,
+ info->addr,
+ info->vid,
+ info->db);
+ } else {
+ err = dsa_port_do_fdb_add(dp, info->addr,
+ info->vid, info->db);
+ }
+ if (err)
+ break;
+ }
+ }
+
+ return err;
+}
+
+static int dsa_switch_host_fdb_del(struct dsa_switch *ds,
+ struct dsa_notifier_fdb_info *info)
+{
+ struct dsa_port *dp;
+ int err = 0;
+
+ if (!ds->ops->port_fdb_del)
+ return -EOPNOTSUPP;
+
+ dsa_switch_for_each_port(dp, ds) {
+ if (dsa_port_host_address_match(dp, info->dp)) {
+ if (dsa_port_is_cpu(dp) && info->dp->cpu_port_in_lag) {
+ err = dsa_switch_do_lag_fdb_del(ds, dp->lag,
+ info->addr,
+ info->vid,
+ info->db);
+ } else {
+ err = dsa_port_do_fdb_del(dp, info->addr,
+ info->vid, info->db);
+ }
+ if (err)
+ break;
+ }
+ }
+
+ return err;
+}
+
+static int dsa_switch_fdb_add(struct dsa_switch *ds,
+ struct dsa_notifier_fdb_info *info)
+{
+ int port = dsa_towards_port(ds, info->dp->ds->index, info->dp->index);
+ struct dsa_port *dp = dsa_to_port(ds, port);
+
+ if (!ds->ops->port_fdb_add)
+ return -EOPNOTSUPP;
+
+ return dsa_port_do_fdb_add(dp, info->addr, info->vid, info->db);
+}
+
+static int dsa_switch_fdb_del(struct dsa_switch *ds,
+ struct dsa_notifier_fdb_info *info)
+{
+ int port = dsa_towards_port(ds, info->dp->ds->index, info->dp->index);
+ struct dsa_port *dp = dsa_to_port(ds, port);
+
+ if (!ds->ops->port_fdb_del)
+ return -EOPNOTSUPP;
+
+ return dsa_port_do_fdb_del(dp, info->addr, info->vid, info->db);
+}
+
+static int dsa_switch_lag_fdb_add(struct dsa_switch *ds,
+ struct dsa_notifier_lag_fdb_info *info)
+{
+ struct dsa_port *dp;
+
+ if (!ds->ops->lag_fdb_add)
+ return -EOPNOTSUPP;
+
+ /* Notify switch only if it has a port in this LAG */
+ dsa_switch_for_each_port(dp, ds)
+ if (dsa_port_offloads_lag(dp, info->lag))
+ return dsa_switch_do_lag_fdb_add(ds, info->lag,
+ info->addr, info->vid,
+ info->db);
+
+ return 0;
+}
+
+static int dsa_switch_lag_fdb_del(struct dsa_switch *ds,
+ struct dsa_notifier_lag_fdb_info *info)
+{
+ struct dsa_port *dp;
+
+ if (!ds->ops->lag_fdb_del)
+ return -EOPNOTSUPP;
+
+ /* Notify switch only if it has a port in this LAG */
+ dsa_switch_for_each_port(dp, ds)
+ if (dsa_port_offloads_lag(dp, info->lag))
+ return dsa_switch_do_lag_fdb_del(ds, info->lag,
+ info->addr, info->vid,
+ info->db);
+
+ return 0;
+}
+
+static int dsa_switch_lag_change(struct dsa_switch *ds,
+ struct dsa_notifier_lag_info *info)
+{
+ if (info->dp->ds == ds && ds->ops->port_lag_change)
+ return ds->ops->port_lag_change(ds, info->dp->index);
+
+ if (info->dp->ds != ds && ds->ops->crosschip_lag_change)
+ return ds->ops->crosschip_lag_change(ds, info->dp->ds->index,
+ info->dp->index);
+
+ return 0;
+}
+
+static int dsa_switch_lag_join(struct dsa_switch *ds,
+ struct dsa_notifier_lag_info *info)
+{
+ if (info->dp->ds == ds && ds->ops->port_lag_join)
+ return ds->ops->port_lag_join(ds, info->dp->index, info->lag,
+ info->info, info->extack);
+
+ if (info->dp->ds != ds && ds->ops->crosschip_lag_join)
+ return ds->ops->crosschip_lag_join(ds, info->dp->ds->index,
+ info->dp->index, info->lag,
+ info->info, info->extack);
+
+ return -EOPNOTSUPP;
+}
+
+static int dsa_switch_lag_leave(struct dsa_switch *ds,
+ struct dsa_notifier_lag_info *info)
+{
+ if (info->dp->ds == ds && ds->ops->port_lag_leave)
+ return ds->ops->port_lag_leave(ds, info->dp->index, info->lag);
+
+ if (info->dp->ds != ds && ds->ops->crosschip_lag_leave)
+ return ds->ops->crosschip_lag_leave(ds, info->dp->ds->index,
+ info->dp->index, info->lag);
+
+ return -EOPNOTSUPP;
+}
+
+static int dsa_switch_mdb_add(struct dsa_switch *ds,
+ struct dsa_notifier_mdb_info *info)
+{
+ int port = dsa_towards_port(ds, info->dp->ds->index, info->dp->index);
+ struct dsa_port *dp = dsa_to_port(ds, port);
+
+ if (!ds->ops->port_mdb_add)
+ return -EOPNOTSUPP;
+
+ return dsa_port_do_mdb_add(dp, info->mdb, info->db);
+}
+
+static int dsa_switch_mdb_del(struct dsa_switch *ds,
+ struct dsa_notifier_mdb_info *info)
+{
+ int port = dsa_towards_port(ds, info->dp->ds->index, info->dp->index);
+ struct dsa_port *dp = dsa_to_port(ds, port);
+
+ if (!ds->ops->port_mdb_del)
+ return -EOPNOTSUPP;
+
+ return dsa_port_do_mdb_del(dp, info->mdb, info->db);
+}
+
+static int dsa_switch_host_mdb_add(struct dsa_switch *ds,
+ struct dsa_notifier_mdb_info *info)
+{
+ struct dsa_port *dp;
+ int err = 0;
+
+ if (!ds->ops->port_mdb_add)
+ return -EOPNOTSUPP;
+
+ dsa_switch_for_each_port(dp, ds) {
+ if (dsa_port_host_address_match(dp, info->dp)) {
+ err = dsa_port_do_mdb_add(dp, info->mdb, info->db);
+ if (err)
+ break;
+ }
+ }
+
+ return err;
+}
+
+static int dsa_switch_host_mdb_del(struct dsa_switch *ds,
+ struct dsa_notifier_mdb_info *info)
+{
+ struct dsa_port *dp;
+ int err = 0;
+
+ if (!ds->ops->port_mdb_del)
+ return -EOPNOTSUPP;
+
+ dsa_switch_for_each_port(dp, ds) {
+ if (dsa_port_host_address_match(dp, info->dp)) {
+ err = dsa_port_do_mdb_del(dp, info->mdb, info->db);
+ if (err)
+ break;
+ }
+ }
+
+ return err;
+}
+
+/* Port VLANs match on the targeted port and on all DSA ports */
+static bool dsa_port_vlan_match(struct dsa_port *dp,
+ struct dsa_notifier_vlan_info *info)
+{
+ return dsa_port_is_dsa(dp) || dp == info->dp;
+}
+
+/* Host VLANs match on the targeted port's CPU port, and on all DSA ports
+ * (upstream and downstream) of that switch and its upstream switches.
+ */
+static bool dsa_port_host_vlan_match(struct dsa_port *dp,
+ const struct dsa_port *targeted_dp)
+{
+ struct dsa_port *cpu_dp = targeted_dp->cpu_dp;
+
+ if (dsa_switch_is_upstream_of(dp->ds, targeted_dp->ds))
+ return dsa_port_is_dsa(dp) || dp == cpu_dp;
+
+ return false;
+}
+
+struct dsa_vlan *dsa_vlan_find(struct list_head *vlan_list,
+ const struct switchdev_obj_port_vlan *vlan)
+{
+ struct dsa_vlan *v;
+
+ list_for_each_entry(v, vlan_list, list)
+ if (v->vid == vlan->vid)
+ return v;
+
+ return NULL;
+}
+
+static int dsa_port_do_vlan_add(struct dsa_port *dp,
+ const struct switchdev_obj_port_vlan *vlan,
+ struct netlink_ext_ack *extack)
+{
+ struct dsa_switch *ds = dp->ds;
+ int port = dp->index;
+ struct dsa_vlan *v;
+ int err = 0;
+
+ /* No need to bother with refcounting for user ports. */
+ if (!(dsa_port_is_cpu(dp) || dsa_port_is_dsa(dp))) {
+ err = ds->ops->port_vlan_add(ds, port, vlan, extack);
+ trace_dsa_vlan_add_hw(dp, vlan, err);
+
+ return err;
+ }
+
+ /* No need to propagate on shared ports the existing VLANs that were
+ * re-notified after just the flags have changed. This would cause a
+ * refcount bump which we need to avoid, since it unbalances the
+ * additions with the deletions.
+ */
+ if (vlan->changed)
+ return 0;
+
+ mutex_lock(&dp->vlans_lock);
+
+ v = dsa_vlan_find(&dp->vlans, vlan);
+ if (v) {
+ refcount_inc(&v->refcount);
+ trace_dsa_vlan_add_bump(dp, vlan, &v->refcount);
+ goto out;
+ }
+
+ v = kzalloc(sizeof(*v), GFP_KERNEL);
+ if (!v) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = ds->ops->port_vlan_add(ds, port, vlan, extack);
+ trace_dsa_vlan_add_hw(dp, vlan, err);
+ if (err) {
+ kfree(v);
+ goto out;
+ }
+
+ v->vid = vlan->vid;
+ refcount_set(&v->refcount, 1);
+ list_add_tail(&v->list, &dp->vlans);
+
+out:
+ mutex_unlock(&dp->vlans_lock);
+
+ return err;
+}
+
+static int dsa_port_do_vlan_del(struct dsa_port *dp,
+ const struct switchdev_obj_port_vlan *vlan)
+{
+ struct dsa_switch *ds = dp->ds;
+ int port = dp->index;
+ struct dsa_vlan *v;
+ int err = 0;
+
+ /* No need to bother with refcounting for user ports */
+ if (!(dsa_port_is_cpu(dp) || dsa_port_is_dsa(dp))) {
+ err = ds->ops->port_vlan_del(ds, port, vlan);
+ trace_dsa_vlan_del_hw(dp, vlan, err);
+
+ return err;
+ }
+
+ mutex_lock(&dp->vlans_lock);
+
+ v = dsa_vlan_find(&dp->vlans, vlan);
+ if (!v) {
+ trace_dsa_vlan_del_not_found(dp, vlan);
+ err = -ENOENT;
+ goto out;
+ }
+
+ if (!refcount_dec_and_test(&v->refcount)) {
+ trace_dsa_vlan_del_drop(dp, vlan, &v->refcount);
+ goto out;
+ }
+
+ err = ds->ops->port_vlan_del(ds, port, vlan);
+ trace_dsa_vlan_del_hw(dp, vlan, err);
+ if (err) {
+ refcount_set(&v->refcount, 1);
+ goto out;
+ }
+
+ list_del(&v->list);
+ kfree(v);
+
+out:
+ mutex_unlock(&dp->vlans_lock);
+
+ return err;
+}
+
+static int dsa_switch_vlan_add(struct dsa_switch *ds,
+ struct dsa_notifier_vlan_info *info)
+{
+ struct dsa_port *dp;
+ int err;
+
+ if (!ds->ops->port_vlan_add)
+ return -EOPNOTSUPP;
+
+ dsa_switch_for_each_port(dp, ds) {
+ if (dsa_port_vlan_match(dp, info)) {
+ err = dsa_port_do_vlan_add(dp, info->vlan,
+ info->extack);
+ if (err)
+ return err;
+ }
+ }
+
+ return 0;
+}
+
+static int dsa_switch_vlan_del(struct dsa_switch *ds,
+ struct dsa_notifier_vlan_info *info)
+{
+ struct dsa_port *dp;
+ int err;
+
+ if (!ds->ops->port_vlan_del)
+ return -EOPNOTSUPP;
+
+ dsa_switch_for_each_port(dp, ds) {
+ if (dsa_port_vlan_match(dp, info)) {
+ err = dsa_port_do_vlan_del(dp, info->vlan);
+ if (err)
+ return err;
+ }
+ }
+
+ return 0;
+}
+
+static int dsa_switch_host_vlan_add(struct dsa_switch *ds,
+ struct dsa_notifier_vlan_info *info)
+{
+ struct dsa_port *dp;
+ int err;
+
+ if (!ds->ops->port_vlan_add)
+ return -EOPNOTSUPP;
+
+ dsa_switch_for_each_port(dp, ds) {
+ if (dsa_port_host_vlan_match(dp, info->dp)) {
+ err = dsa_port_do_vlan_add(dp, info->vlan,
+ info->extack);
+ if (err)
+ return err;
+ }
+ }
+
+ return 0;
+}
+
+static int dsa_switch_host_vlan_del(struct dsa_switch *ds,
+ struct dsa_notifier_vlan_info *info)
+{
+ struct dsa_port *dp;
+ int err;
+
+ if (!ds->ops->port_vlan_del)
+ return -EOPNOTSUPP;
+
+ dsa_switch_for_each_port(dp, ds) {
+ if (dsa_port_host_vlan_match(dp, info->dp)) {
+ err = dsa_port_do_vlan_del(dp, info->vlan);
+ if (err)
+ return err;
+ }
+ }
+
+ return 0;
+}
+
+static int dsa_switch_change_tag_proto(struct dsa_switch *ds,
+ struct dsa_notifier_tag_proto_info *info)
+{
+ const struct dsa_device_ops *tag_ops = info->tag_ops;
+ struct dsa_port *dp, *cpu_dp;
+ int err;
+
+ if (!ds->ops->change_tag_protocol)
+ return -EOPNOTSUPP;
+
+ ASSERT_RTNL();
+
+ err = ds->ops->change_tag_protocol(ds, tag_ops->proto);
+ if (err)
+ return err;
+
+ dsa_switch_for_each_cpu_port(cpu_dp, ds)
+ dsa_port_set_tag_protocol(cpu_dp, tag_ops);
+
+ /* Now that changing the tag protocol can no longer fail, let's update
+ * the remaining bits which are "duplicated for faster access", and the
+ * bits that depend on the tagger, such as the MTU.
+ */
+ dsa_switch_for_each_user_port(dp, ds) {
+ struct net_device *user = dp->user;
+
+ dsa_user_setup_tagger(user);
+
+ /* rtnl_mutex is held in dsa_tree_change_tag_proto */
+ dsa_user_change_mtu(user, user->mtu);
+ }
+
+ return 0;
+}
+
+/* We use the same cross-chip notifiers to inform both the tagger side, as well
+ * as the switch side, of connection and disconnection events.
+ * Since ds->tagger_data is owned by the tagger, it isn't a hard error if the
+ * switch side doesn't support connecting to this tagger, and therefore, the
+ * fact that we don't disconnect the tagger side doesn't constitute a memory
+ * leak: the tagger will still operate with persistent per-switch memory, just
+ * with the switch side unconnected to it. What does constitute a hard error is
+ * when the switch side supports connecting but fails.
+ */
+static int
+dsa_switch_connect_tag_proto(struct dsa_switch *ds,
+ struct dsa_notifier_tag_proto_info *info)
+{
+ const struct dsa_device_ops *tag_ops = info->tag_ops;
+ int err;
+
+ /* Notify the new tagger about the connection to this switch */
+ if (tag_ops->connect) {
+ err = tag_ops->connect(ds);
+ if (err)
+ return err;
+ }
+
+ if (!ds->ops->connect_tag_protocol)
+ return -EOPNOTSUPP;
+
+ /* Notify the switch about the connection to the new tagger */
+ err = ds->ops->connect_tag_protocol(ds, tag_ops->proto);
+ if (err) {
+ /* Revert the new tagger's connection to this tree */
+ if (tag_ops->disconnect)
+ tag_ops->disconnect(ds);
+ return err;
+ }
+
+ return 0;
+}
+
+static int
+dsa_switch_disconnect_tag_proto(struct dsa_switch *ds,
+ struct dsa_notifier_tag_proto_info *info)
+{
+ const struct dsa_device_ops *tag_ops = info->tag_ops;
+
+ /* Notify the tagger about the disconnection from this switch */
+ if (tag_ops->disconnect && ds->tagger_data)
+ tag_ops->disconnect(ds);
+
+ /* No need to notify the switch, since it shouldn't have any
+ * resources to tear down
+ */
+ return 0;
+}
+
+static int
+dsa_switch_conduit_state_change(struct dsa_switch *ds,
+ struct dsa_notifier_conduit_state_info *info)
+{
+ if (!ds->ops->conduit_state_change)
+ return 0;
+
+ ds->ops->conduit_state_change(ds, info->conduit, info->operational);
+
+ return 0;
+}
+
+static int dsa_switch_event(struct notifier_block *nb,
+ unsigned long event, void *info)
+{
+ struct dsa_switch *ds = container_of(nb, struct dsa_switch, nb);
+ int err;
+
+ switch (event) {
+ case DSA_NOTIFIER_AGEING_TIME:
+ err = dsa_switch_ageing_time(ds, info);
+ break;
+ case DSA_NOTIFIER_BRIDGE_JOIN:
+ err = dsa_switch_bridge_join(ds, info);
+ break;
+ case DSA_NOTIFIER_BRIDGE_LEAVE:
+ err = dsa_switch_bridge_leave(ds, info);
+ break;
+ case DSA_NOTIFIER_FDB_ADD:
+ err = dsa_switch_fdb_add(ds, info);
+ break;
+ case DSA_NOTIFIER_FDB_DEL:
+ err = dsa_switch_fdb_del(ds, info);
+ break;
+ case DSA_NOTIFIER_HOST_FDB_ADD:
+ err = dsa_switch_host_fdb_add(ds, info);
+ break;
+ case DSA_NOTIFIER_HOST_FDB_DEL:
+ err = dsa_switch_host_fdb_del(ds, info);
+ break;
+ case DSA_NOTIFIER_LAG_FDB_ADD:
+ err = dsa_switch_lag_fdb_add(ds, info);
+ break;
+ case DSA_NOTIFIER_LAG_FDB_DEL:
+ err = dsa_switch_lag_fdb_del(ds, info);
+ break;
+ case DSA_NOTIFIER_LAG_CHANGE:
+ err = dsa_switch_lag_change(ds, info);
+ break;
+ case DSA_NOTIFIER_LAG_JOIN:
+ err = dsa_switch_lag_join(ds, info);
+ break;
+ case DSA_NOTIFIER_LAG_LEAVE:
+ err = dsa_switch_lag_leave(ds, info);
+ break;
+ case DSA_NOTIFIER_MDB_ADD:
+ err = dsa_switch_mdb_add(ds, info);
+ break;
+ case DSA_NOTIFIER_MDB_DEL:
+ err = dsa_switch_mdb_del(ds, info);
+ break;
+ case DSA_NOTIFIER_HOST_MDB_ADD:
+ err = dsa_switch_host_mdb_add(ds, info);
+ break;
+ case DSA_NOTIFIER_HOST_MDB_DEL:
+ err = dsa_switch_host_mdb_del(ds, info);
+ break;
+ case DSA_NOTIFIER_VLAN_ADD:
+ err = dsa_switch_vlan_add(ds, info);
+ break;
+ case DSA_NOTIFIER_VLAN_DEL:
+ err = dsa_switch_vlan_del(ds, info);
+ break;
+ case DSA_NOTIFIER_HOST_VLAN_ADD:
+ err = dsa_switch_host_vlan_add(ds, info);
+ break;
+ case DSA_NOTIFIER_HOST_VLAN_DEL:
+ err = dsa_switch_host_vlan_del(ds, info);
+ break;
+ case DSA_NOTIFIER_MTU:
+ err = dsa_switch_mtu(ds, info);
+ break;
+ case DSA_NOTIFIER_TAG_PROTO:
+ err = dsa_switch_change_tag_proto(ds, info);
+ break;
+ case DSA_NOTIFIER_TAG_PROTO_CONNECT:
+ err = dsa_switch_connect_tag_proto(ds, info);
+ break;
+ case DSA_NOTIFIER_TAG_PROTO_DISCONNECT:
+ err = dsa_switch_disconnect_tag_proto(ds, info);
+ break;
+ case DSA_NOTIFIER_TAG_8021Q_VLAN_ADD:
+ err = dsa_switch_tag_8021q_vlan_add(ds, info);
+ break;
+ case DSA_NOTIFIER_TAG_8021Q_VLAN_DEL:
+ err = dsa_switch_tag_8021q_vlan_del(ds, info);
+ break;
+ case DSA_NOTIFIER_CONDUIT_STATE_CHANGE:
+ err = dsa_switch_conduit_state_change(ds, info);
+ break;
+ default:
+ err = -EOPNOTSUPP;
+ break;
+ }
+
+ if (err)
+ dev_dbg(ds->dev, "breaking chain for DSA event %lu (%d)\n",
+ event, err);
+
+ return notifier_from_errno(err);
+}
+
+/**
+ * dsa_tree_notify - Execute code for all switches in a DSA switch tree.
+ * @dst: collection of struct dsa_switch devices to notify.
+ * @e: event, must be of type DSA_NOTIFIER_*
+ * @v: event-specific value.
+ *
+ * Given a struct dsa_switch_tree, this can be used to run a function once for
+ * each member DSA switch. The other alternative of traversing the tree is only
+ * through its ports list, which does not uniquely list the switches.
+ */
+int dsa_tree_notify(struct dsa_switch_tree *dst, unsigned long e, void *v)
+{
+ struct raw_notifier_head *nh = &dst->nh;
+ int err;
+
+ err = raw_notifier_call_chain(nh, e, v);
+
+ return notifier_to_errno(err);
+}
+
+/**
+ * dsa_broadcast - Notify all DSA trees in the system.
+ * @e: event, must be of type DSA_NOTIFIER_*
+ * @v: event-specific value.
+ *
+ * Can be used to notify the switching fabric of events such as cross-chip
+ * bridging between disjoint trees (such as islands of tagger-compatible
+ * switches bridged by an incompatible middle switch).
+ *
+ * WARNING: this function is not reliable during probe time, because probing
+ * between trees is asynchronous and not all DSA trees might have probed.
+ */
+int dsa_broadcast(unsigned long e, void *v)
+{
+ struct dsa_switch_tree *dst;
+ int err = 0;
+
+ list_for_each_entry(dst, &dsa_tree_list, list) {
+ err = dsa_tree_notify(dst, e, v);
+ if (err)
+ break;
+ }
+
+ return err;
+}
+
+int dsa_switch_register_notifier(struct dsa_switch *ds)
+{
+ ds->nb.notifier_call = dsa_switch_event;
+
+ return raw_notifier_chain_register(&ds->dst->nh, &ds->nb);
+}
+
+void dsa_switch_unregister_notifier(struct dsa_switch *ds)
+{
+ int err;
+
+ err = raw_notifier_chain_unregister(&ds->dst->nh, &ds->nb);
+ if (err)
+ dev_err(ds->dev, "failed to unregister notifier (%d)\n", err);
+}
diff --git a/net/dsa/switch.h b/net/dsa/switch.h
new file mode 100644
index 000000000000..be0a2749cd97
--- /dev/null
+++ b/net/dsa/switch.h
@@ -0,0 +1,123 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifndef __DSA_SWITCH_H
+#define __DSA_SWITCH_H
+
+#include <net/dsa.h>
+
+struct netlink_ext_ack;
+
+enum {
+ DSA_NOTIFIER_AGEING_TIME,
+ DSA_NOTIFIER_BRIDGE_JOIN,
+ DSA_NOTIFIER_BRIDGE_LEAVE,
+ DSA_NOTIFIER_FDB_ADD,
+ DSA_NOTIFIER_FDB_DEL,
+ DSA_NOTIFIER_HOST_FDB_ADD,
+ DSA_NOTIFIER_HOST_FDB_DEL,
+ DSA_NOTIFIER_LAG_FDB_ADD,
+ DSA_NOTIFIER_LAG_FDB_DEL,
+ DSA_NOTIFIER_LAG_CHANGE,
+ DSA_NOTIFIER_LAG_JOIN,
+ DSA_NOTIFIER_LAG_LEAVE,
+ DSA_NOTIFIER_MDB_ADD,
+ DSA_NOTIFIER_MDB_DEL,
+ DSA_NOTIFIER_HOST_MDB_ADD,
+ DSA_NOTIFIER_HOST_MDB_DEL,
+ DSA_NOTIFIER_VLAN_ADD,
+ DSA_NOTIFIER_VLAN_DEL,
+ DSA_NOTIFIER_HOST_VLAN_ADD,
+ DSA_NOTIFIER_HOST_VLAN_DEL,
+ DSA_NOTIFIER_MTU,
+ DSA_NOTIFIER_TAG_PROTO,
+ DSA_NOTIFIER_TAG_PROTO_CONNECT,
+ DSA_NOTIFIER_TAG_PROTO_DISCONNECT,
+ DSA_NOTIFIER_TAG_8021Q_VLAN_ADD,
+ DSA_NOTIFIER_TAG_8021Q_VLAN_DEL,
+ DSA_NOTIFIER_CONDUIT_STATE_CHANGE,
+};
+
+/* DSA_NOTIFIER_AGEING_TIME */
+struct dsa_notifier_ageing_time_info {
+ unsigned int ageing_time;
+};
+
+/* DSA_NOTIFIER_BRIDGE_* */
+struct dsa_notifier_bridge_info {
+ const struct dsa_port *dp;
+ struct dsa_bridge bridge;
+ bool tx_fwd_offload;
+ struct netlink_ext_ack *extack;
+};
+
+/* DSA_NOTIFIER_FDB_* */
+struct dsa_notifier_fdb_info {
+ const struct dsa_port *dp;
+ const unsigned char *addr;
+ u16 vid;
+ struct dsa_db db;
+};
+
+/* DSA_NOTIFIER_LAG_FDB_* */
+struct dsa_notifier_lag_fdb_info {
+ struct dsa_lag *lag;
+ const unsigned char *addr;
+ u16 vid;
+ struct dsa_db db;
+};
+
+/* DSA_NOTIFIER_MDB_* */
+struct dsa_notifier_mdb_info {
+ const struct dsa_port *dp;
+ const struct switchdev_obj_port_mdb *mdb;
+ struct dsa_db db;
+};
+
+/* DSA_NOTIFIER_LAG_* */
+struct dsa_notifier_lag_info {
+ const struct dsa_port *dp;
+ struct dsa_lag lag;
+ struct netdev_lag_upper_info *info;
+ struct netlink_ext_ack *extack;
+};
+
+/* DSA_NOTIFIER_VLAN_* */
+struct dsa_notifier_vlan_info {
+ const struct dsa_port *dp;
+ const struct switchdev_obj_port_vlan *vlan;
+ struct netlink_ext_ack *extack;
+};
+
+/* DSA_NOTIFIER_MTU */
+struct dsa_notifier_mtu_info {
+ const struct dsa_port *dp;
+ int mtu;
+};
+
+/* DSA_NOTIFIER_TAG_PROTO_* */
+struct dsa_notifier_tag_proto_info {
+ const struct dsa_device_ops *tag_ops;
+};
+
+/* DSA_NOTIFIER_TAG_8021Q_VLAN_* */
+struct dsa_notifier_tag_8021q_vlan_info {
+ const struct dsa_port *dp;
+ u16 vid;
+};
+
+/* DSA_NOTIFIER_CONDUIT_STATE_CHANGE */
+struct dsa_notifier_conduit_state_info {
+ const struct net_device *conduit;
+ bool operational;
+};
+
+struct dsa_vlan *dsa_vlan_find(struct list_head *vlan_list,
+ const struct switchdev_obj_port_vlan *vlan);
+
+int dsa_tree_notify(struct dsa_switch_tree *dst, unsigned long e, void *v);
+int dsa_broadcast(unsigned long e, void *v);
+
+int dsa_switch_register_notifier(struct dsa_switch *ds);
+void dsa_switch_unregister_notifier(struct dsa_switch *ds);
+
+#endif
diff --git a/net/dsa/tag.c b/net/dsa/tag.c
new file mode 100644
index 000000000000..79ad105902d9
--- /dev/null
+++ b/net/dsa/tag.c
@@ -0,0 +1,244 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * DSA tagging protocol handling
+ *
+ * Copyright (c) 2008-2009 Marvell Semiconductor
+ * Copyright (c) 2013 Florian Fainelli <florian@openwrt.org>
+ * Copyright (c) 2016 Andrew Lunn <andrew@lunn.ch>
+ */
+
+#include <linux/netdevice.h>
+#include <linux/ptp_classify.h>
+#include <linux/skbuff.h>
+#include <net/dsa.h>
+#include <net/dst_metadata.h>
+
+#include "tag.h"
+#include "user.h"
+
+static LIST_HEAD(dsa_tag_drivers_list);
+static DEFINE_MUTEX(dsa_tag_drivers_lock);
+
+/* Determine if we should defer delivery of skb until we have a rx timestamp.
+ *
+ * Called from dsa_switch_rcv. For now, this will only work if tagging is
+ * enabled on the switch. Normally the MAC driver would retrieve the hardware
+ * timestamp when it reads the packet out of the hardware. However in a DSA
+ * switch, the DSA driver owning the interface to which the packet is
+ * delivered is never notified unless we do so here.
+ */
+static bool dsa_skb_defer_rx_timestamp(struct dsa_user_priv *p,
+ struct sk_buff *skb)
+{
+ struct dsa_switch *ds = p->dp->ds;
+ unsigned int type;
+
+ if (!ds->ops->port_rxtstamp)
+ return false;
+
+ if (skb_headroom(skb) < ETH_HLEN)
+ return false;
+
+ __skb_push(skb, ETH_HLEN);
+
+ type = ptp_classify_raw(skb);
+
+ __skb_pull(skb, ETH_HLEN);
+
+ if (type == PTP_CLASS_NONE)
+ return false;
+
+ return ds->ops->port_rxtstamp(ds, p->dp->index, skb, type);
+}
+
+static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *pt, struct net_device *unused)
+{
+ struct metadata_dst *md_dst = skb_metadata_dst(skb);
+ struct dsa_port *cpu_dp = dev->dsa_ptr;
+ struct sk_buff *nskb = NULL;
+ struct dsa_user_priv *p;
+
+ if (unlikely(!cpu_dp)) {
+ kfree_skb(skb);
+ return 0;
+ }
+
+ skb = skb_unshare(skb, GFP_ATOMIC);
+ if (!skb)
+ return 0;
+
+ if (md_dst && md_dst->type == METADATA_HW_PORT_MUX) {
+ unsigned int port = md_dst->u.port_info.port_id;
+
+ skb_dst_drop(skb);
+ if (!skb_has_extensions(skb))
+ skb->slow_gro = 0;
+
+ skb->dev = dsa_conduit_find_user(dev, 0, port);
+ if (likely(skb->dev)) {
+ dsa_default_offload_fwd_mark(skb);
+ nskb = skb;
+ }
+ } else {
+ nskb = cpu_dp->rcv(skb, dev);
+ }
+
+ if (!nskb) {
+ kfree_skb(skb);
+ return 0;
+ }
+
+ skb = nskb;
+ skb_push(skb, ETH_HLEN);
+ skb->pkt_type = PACKET_HOST;
+ skb->protocol = eth_type_trans(skb, skb->dev);
+
+ if (unlikely(!dsa_user_dev_check(skb->dev))) {
+ /* Packet is to be injected directly on an upper
+ * device, e.g. a team/bond, so skip all DSA-port
+ * specific actions.
+ */
+ netif_rx(skb);
+ return 0;
+ }
+
+ p = netdev_priv(skb->dev);
+
+ if (unlikely(cpu_dp->ds->untag_bridge_pvid ||
+ cpu_dp->ds->untag_vlan_aware_bridge_pvid)) {
+ nskb = dsa_software_vlan_untag(skb);
+ if (!nskb) {
+ kfree_skb(skb);
+ return 0;
+ }
+ skb = nskb;
+ }
+
+ dev_sw_netstats_rx_add(skb->dev, skb->len + ETH_HLEN);
+
+ if (dsa_skb_defer_rx_timestamp(p, skb))
+ return 0;
+
+ gro_cells_receive(&p->gcells, skb);
+
+ return 0;
+}
+
+struct packet_type dsa_pack_type __read_mostly = {
+ .type = cpu_to_be16(ETH_P_XDSA),
+ .func = dsa_switch_rcv,
+};
+
+static void dsa_tag_driver_register(struct dsa_tag_driver *dsa_tag_driver,
+ struct module *owner)
+{
+ dsa_tag_driver->owner = owner;
+
+ mutex_lock(&dsa_tag_drivers_lock);
+ list_add_tail(&dsa_tag_driver->list, &dsa_tag_drivers_list);
+ mutex_unlock(&dsa_tag_drivers_lock);
+}
+
+void dsa_tag_drivers_register(struct dsa_tag_driver *dsa_tag_driver_array[],
+ unsigned int count, struct module *owner)
+{
+ unsigned int i;
+
+ for (i = 0; i < count; i++)
+ dsa_tag_driver_register(dsa_tag_driver_array[i], owner);
+}
+
+static void dsa_tag_driver_unregister(struct dsa_tag_driver *dsa_tag_driver)
+{
+ mutex_lock(&dsa_tag_drivers_lock);
+ list_del(&dsa_tag_driver->list);
+ mutex_unlock(&dsa_tag_drivers_lock);
+}
+EXPORT_SYMBOL_GPL(dsa_tag_drivers_register);
+
+void dsa_tag_drivers_unregister(struct dsa_tag_driver *dsa_tag_driver_array[],
+ unsigned int count)
+{
+ unsigned int i;
+
+ for (i = 0; i < count; i++)
+ dsa_tag_driver_unregister(dsa_tag_driver_array[i]);
+}
+EXPORT_SYMBOL_GPL(dsa_tag_drivers_unregister);
+
+const char *dsa_tag_protocol_to_str(const struct dsa_device_ops *ops)
+{
+ return ops->name;
+};
+
+/* Function takes a reference on the module owning the tagger,
+ * so dsa_tag_driver_put must be called afterwards.
+ */
+const struct dsa_device_ops *dsa_tag_driver_get_by_name(const char *name)
+{
+ const struct dsa_device_ops *ops = ERR_PTR(-ENOPROTOOPT);
+ struct dsa_tag_driver *dsa_tag_driver;
+
+ request_module("%s%s", DSA_TAG_DRIVER_ALIAS, name);
+
+ mutex_lock(&dsa_tag_drivers_lock);
+ list_for_each_entry(dsa_tag_driver, &dsa_tag_drivers_list, list) {
+ const struct dsa_device_ops *tmp = dsa_tag_driver->ops;
+
+ if (strcmp(name, tmp->name))
+ continue;
+
+ if (!try_module_get(dsa_tag_driver->owner))
+ break;
+
+ ops = tmp;
+ break;
+ }
+ mutex_unlock(&dsa_tag_drivers_lock);
+
+ return ops;
+}
+
+const struct dsa_device_ops *dsa_tag_driver_get_by_id(int tag_protocol)
+{
+ struct dsa_tag_driver *dsa_tag_driver;
+ const struct dsa_device_ops *ops;
+ bool found = false;
+
+ request_module("%sid-%d", DSA_TAG_DRIVER_ALIAS, tag_protocol);
+
+ mutex_lock(&dsa_tag_drivers_lock);
+ list_for_each_entry(dsa_tag_driver, &dsa_tag_drivers_list, list) {
+ ops = dsa_tag_driver->ops;
+ if (ops->proto == tag_protocol) {
+ found = true;
+ break;
+ }
+ }
+
+ if (found) {
+ if (!try_module_get(dsa_tag_driver->owner))
+ ops = ERR_PTR(-ENOPROTOOPT);
+ } else {
+ ops = ERR_PTR(-ENOPROTOOPT);
+ }
+
+ mutex_unlock(&dsa_tag_drivers_lock);
+
+ return ops;
+}
+
+void dsa_tag_driver_put(const struct dsa_device_ops *ops)
+{
+ struct dsa_tag_driver *dsa_tag_driver;
+
+ mutex_lock(&dsa_tag_drivers_lock);
+ list_for_each_entry(dsa_tag_driver, &dsa_tag_drivers_list, list) {
+ if (dsa_tag_driver->ops == ops) {
+ module_put(dsa_tag_driver->owner);
+ break;
+ }
+ }
+ mutex_unlock(&dsa_tag_drivers_lock);
+}
diff --git a/net/dsa/tag.h b/net/dsa/tag.h
new file mode 100644
index 000000000000..cf52283fe9df
--- /dev/null
+++ b/net/dsa/tag.h
@@ -0,0 +1,409 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifndef __DSA_TAG_H
+#define __DSA_TAG_H
+
+#include <linux/if_vlan.h>
+#include <linux/list.h>
+#include <linux/types.h>
+#include <net/dsa.h>
+
+#include "port.h"
+#include "user.h"
+
+struct dsa_tag_driver {
+ const struct dsa_device_ops *ops;
+ struct list_head list;
+ struct module *owner;
+};
+
+extern struct packet_type dsa_pack_type;
+
+const struct dsa_device_ops *dsa_tag_driver_get_by_id(int tag_protocol);
+const struct dsa_device_ops *dsa_tag_driver_get_by_name(const char *name);
+void dsa_tag_driver_put(const struct dsa_device_ops *ops);
+const char *dsa_tag_protocol_to_str(const struct dsa_device_ops *ops);
+
+static inline int dsa_tag_protocol_overhead(const struct dsa_device_ops *ops)
+{
+ return ops->needed_headroom + ops->needed_tailroom;
+}
+
+static inline struct net_device *dsa_conduit_find_user(struct net_device *dev,
+ int device, int port)
+{
+ struct dsa_port *cpu_dp = dev->dsa_ptr;
+ struct dsa_switch_tree *dst = cpu_dp->dst;
+ struct dsa_port *dp;
+
+ list_for_each_entry(dp, &dst->ports, list)
+ if (dp->ds->index == device && dp->index == port &&
+ dp->type == DSA_PORT_TYPE_USER)
+ return dp->user;
+
+ return NULL;
+}
+
+/**
+ * dsa_software_untag_vlan_aware_bridge: Software untagging for VLAN-aware bridge
+ * @skb: Pointer to received socket buffer (packet)
+ * @br: Pointer to bridge upper interface of ingress port
+ * @vid: Parsed VID from packet
+ *
+ * The bridge can process tagged packets. Software like STP/PTP may not. The
+ * bridge can also process untagged packets, to the same effect as if they were
+ * tagged with the PVID of the ingress port. So packets tagged with the PVID of
+ * the bridge port must be software-untagged, to support both use cases.
+ */
+static inline void dsa_software_untag_vlan_aware_bridge(struct sk_buff *skb,
+ struct net_device *br,
+ u16 vid)
+{
+ u16 pvid, proto;
+ int err;
+
+ err = br_vlan_get_proto(br, &proto);
+ if (err)
+ return;
+
+ err = br_vlan_get_pvid_rcu(skb->dev, &pvid);
+ if (err)
+ return;
+
+ if (vid == pvid && skb->vlan_proto == htons(proto))
+ __vlan_hwaccel_clear_tag(skb);
+}
+
+/**
+ * dsa_software_untag_vlan_unaware_bridge: Software untagging for VLAN-unaware bridge
+ * @skb: Pointer to received socket buffer (packet)
+ * @br: Pointer to bridge upper interface of ingress port
+ * @vid: Parsed VID from packet
+ *
+ * The bridge ignores all VLAN tags. Software like STP/PTP may not (it may run
+ * on the plain port, or on a VLAN upper interface). Maybe packets are coming
+ * to software as tagged with a driver-defined VID which is NOT equal to the
+ * PVID of the bridge port (since the bridge is VLAN-unaware, its configuration
+ * should NOT be committed to hardware). DSA needs a method for this private
+ * VID to be communicated by software to it, and if packets are tagged with it,
+ * software-untag them. Note: the private VID may be different per bridge, to
+ * support the FDB isolation use case.
+ *
+ * FIXME: this is currently implemented based on the broken assumption that
+ * the "private VID" used by the driver in VLAN-unaware mode is equal to the
+ * bridge PVID. It should not be, except for a coincidence; the bridge PVID is
+ * irrelevant to the data path in the VLAN-unaware mode. Thus, the VID that
+ * this function removes is wrong.
+ *
+ * All users of ds->untag_bridge_pvid should fix their drivers, if necessary,
+ * to make the two independent. Only then, if there still remains a need to
+ * strip the private VID from packets, then a new ds->ops->get_private_vid()
+ * API shall be introduced to communicate to DSA what this VID is, which needs
+ * to be stripped here.
+ */
+static inline void dsa_software_untag_vlan_unaware_bridge(struct sk_buff *skb,
+ struct net_device *br,
+ u16 vid)
+{
+ struct net_device *upper_dev;
+ u16 pvid, proto;
+ int err;
+
+ err = br_vlan_get_proto(br, &proto);
+ if (err)
+ return;
+
+ err = br_vlan_get_pvid_rcu(skb->dev, &pvid);
+ if (err)
+ return;
+
+ if (vid != pvid || skb->vlan_proto != htons(proto))
+ return;
+
+ /* The sad part about attempting to untag from DSA is that we
+ * don't know, unless we check, if the skb will end up in
+ * the bridge's data path - br_allowed_ingress() - or not.
+ * For example, there might be an 8021q upper for the
+ * default_pvid of the bridge, which will steal VLAN-tagged traffic
+ * from the bridge's data path. This is a configuration that DSA
+ * supports because vlan_filtering is 0. In that case, we should
+ * definitely keep the tag, to make sure it keeps working.
+ */
+ upper_dev = __vlan_find_dev_deep_rcu(br, htons(proto), vid);
+ if (!upper_dev)
+ __vlan_hwaccel_clear_tag(skb);
+}
+
+/**
+ * dsa_software_vlan_untag: Software VLAN untagging in DSA receive path
+ * @skb: Pointer to socket buffer (packet)
+ *
+ * Receive path method for switches which send some packets as VLAN-tagged
+ * towards the CPU port (generally from VLAN-aware bridge ports) even when the
+ * packet was not tagged on the wire. Called when ds->untag_bridge_pvid
+ * (legacy) or ds->untag_vlan_aware_bridge_pvid is set to true.
+ *
+ * As a side effect of this method, any VLAN tag from the skb head is moved
+ * to hwaccel.
+ */
+static inline struct sk_buff *dsa_software_vlan_untag(struct sk_buff *skb)
+{
+ struct dsa_port *dp = dsa_user_to_port(skb->dev);
+ struct net_device *br = dsa_port_bridge_dev_get(dp);
+ u16 vid, proto;
+ int err;
+
+ /* software untagging for standalone ports not yet necessary */
+ if (!br)
+ return skb;
+
+ err = br_vlan_get_proto(br, &proto);
+ if (err)
+ return skb;
+
+ /* Move VLAN tag from data to hwaccel */
+ if (!skb_vlan_tag_present(skb) && skb->protocol == htons(proto)) {
+ skb = skb_vlan_untag(skb);
+ if (!skb)
+ return NULL;
+ }
+
+ if (!skb_vlan_tag_present(skb))
+ return skb;
+
+ vid = skb_vlan_tag_get_id(skb);
+
+ if (br_vlan_enabled(br)) {
+ if (dp->ds->untag_vlan_aware_bridge_pvid)
+ dsa_software_untag_vlan_aware_bridge(skb, br, vid);
+ } else {
+ if (dp->ds->untag_bridge_pvid)
+ dsa_software_untag_vlan_unaware_bridge(skb, br, vid);
+ }
+
+ return skb;
+}
+
+/* For switches without hardware support for DSA tagging to be able
+ * to support termination through the bridge.
+ */
+static inline struct net_device *
+dsa_find_designated_bridge_port_by_vid(struct net_device *conduit, u16 vid)
+{
+ struct dsa_port *cpu_dp = conduit->dsa_ptr;
+ struct dsa_switch_tree *dst = cpu_dp->dst;
+ struct bridge_vlan_info vinfo;
+ struct net_device *user;
+ struct dsa_port *dp;
+ int err;
+
+ list_for_each_entry(dp, &dst->ports, list) {
+ if (dp->type != DSA_PORT_TYPE_USER)
+ continue;
+
+ if (!dp->bridge)
+ continue;
+
+ if (dp->stp_state != BR_STATE_LEARNING &&
+ dp->stp_state != BR_STATE_FORWARDING)
+ continue;
+
+ /* Since the bridge might learn this packet, keep the CPU port
+ * affinity with the port that will be used for the reply on
+ * xmit.
+ */
+ if (dp->cpu_dp != cpu_dp)
+ continue;
+
+ user = dp->user;
+
+ err = br_vlan_get_info_rcu(user, vid, &vinfo);
+ if (err)
+ continue;
+
+ return user;
+ }
+
+ return NULL;
+}
+
+/* If the ingress port offloads the bridge, we mark the frame as autonomously
+ * forwarded by hardware, so the software bridge doesn't forward in twice, back
+ * to us, because we already did. However, if we're in fallback mode and we do
+ * software bridging, we are not offloading it, therefore the dp->bridge
+ * pointer is not populated, and flooding needs to be done by software (we are
+ * effectively operating in standalone ports mode).
+ */
+static inline void dsa_default_offload_fwd_mark(struct sk_buff *skb)
+{
+ struct dsa_port *dp = dsa_user_to_port(skb->dev);
+
+ skb->offload_fwd_mark = !!(dp->bridge);
+}
+
+/* Helper for removing DSA header tags from packets in the RX path.
+ * Must not be called before skb_pull(len).
+ * skb->data
+ * |
+ * v
+ * | | | | | | | | | | | | | | | | | | |
+ * +-----------------------+-----------------------+---------------+-------+
+ * | Destination MAC | Source MAC | DSA header | EType |
+ * +-----------------------+-----------------------+---------------+-------+
+ * | |
+ * <----- len -----> <----- len ----->
+ * |
+ * >>>>>>> v
+ * >>>>>>> | | | | | | | | | | | | | | |
+ * >>>>>>> +-----------------------+-----------------------+-------+
+ * >>>>>>> | Destination MAC | Source MAC | EType |
+ * +-----------------------+-----------------------+-------+
+ * ^
+ * |
+ * skb->data
+ */
+static inline void dsa_strip_etype_header(struct sk_buff *skb, int len)
+{
+ memmove(skb->data - ETH_HLEN, skb->data - ETH_HLEN - len, 2 * ETH_ALEN);
+}
+
+/* Helper for creating space for DSA header tags in TX path packets.
+ * Must not be called before skb_push(len).
+ *
+ * Before:
+ *
+ * <<<<<<< | | | | | | | | | | | | | | |
+ * ^ <<<<<<< +-----------------------+-----------------------+-------+
+ * | <<<<<<< | Destination MAC | Source MAC | EType |
+ * | +-----------------------+-----------------------+-------+
+ * <----- len ----->
+ * |
+ * |
+ * skb->data
+ *
+ * After:
+ *
+ * | | | | | | | | | | | | | | | | | | |
+ * +-----------------------+-----------------------+---------------+-------+
+ * | Destination MAC | Source MAC | DSA header | EType |
+ * +-----------------------+-----------------------+---------------+-------+
+ * ^ | |
+ * | <----- len ----->
+ * skb->data
+ */
+static inline void dsa_alloc_etype_header(struct sk_buff *skb, int len)
+{
+ memmove(skb->data, skb->data + len, 2 * ETH_ALEN);
+}
+
+/* On RX, eth_type_trans() on the DSA conduit pulls ETH_HLEN bytes starting from
+ * skb_mac_header(skb), which leaves skb->data pointing at the first byte after
+ * what the DSA conduit perceives as the EtherType (the beginning of the L3
+ * protocol). Since DSA EtherType header taggers treat the EtherType as part of
+ * the DSA tag itself, and the EtherType is 2 bytes in length, the DSA header
+ * is located 2 bytes behind skb->data. Note that EtherType in this context
+ * means the first 2 bytes of the DSA header, not the encapsulated EtherType
+ * that will become visible after the DSA header is stripped.
+ */
+static inline void *dsa_etype_header_pos_rx(struct sk_buff *skb)
+{
+ return skb->data - 2;
+}
+
+/* On TX, skb->data points to the MAC header, which means that EtherType
+ * header taggers start exactly where the EtherType is (the EtherType is
+ * treated as part of the DSA header).
+ */
+static inline void *dsa_etype_header_pos_tx(struct sk_buff *skb)
+{
+ return skb->data + 2 * ETH_ALEN;
+}
+
+static inline unsigned long dsa_xmit_port_mask(const struct sk_buff *skb,
+ const struct net_device *dev)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ unsigned long mask = BIT(dp->index);
+
+ if (IS_ENABLED(CONFIG_HSR) &&
+ unlikely(dev->features & NETIF_F_HW_HSR_DUP)) {
+ struct net_device *hsr_dev = dp->hsr_dev;
+ struct dsa_port *other_dp;
+
+ dsa_hsr_foreach_port(other_dp, dp->ds, hsr_dev)
+ mask |= BIT(other_dp->index);
+ }
+
+ return mask;
+}
+
+/* Create 2 modaliases per tagging protocol, one to auto-load the module
+ * given the ID reported by get_tag_protocol(), and the other by name.
+ */
+#define DSA_TAG_DRIVER_ALIAS "dsa_tag:"
+#define MODULE_ALIAS_DSA_TAG_DRIVER(__proto, __name) \
+ MODULE_ALIAS(DSA_TAG_DRIVER_ALIAS __name); \
+ MODULE_ALIAS(DSA_TAG_DRIVER_ALIAS "id-" \
+ __stringify(__proto##_VALUE))
+
+void dsa_tag_drivers_register(struct dsa_tag_driver *dsa_tag_driver_array[],
+ unsigned int count,
+ struct module *owner);
+void dsa_tag_drivers_unregister(struct dsa_tag_driver *dsa_tag_driver_array[],
+ unsigned int count);
+
+#define dsa_tag_driver_module_drivers(__dsa_tag_drivers_array, __count) \
+static int __init dsa_tag_driver_module_init(void) \
+{ \
+ dsa_tag_drivers_register(__dsa_tag_drivers_array, __count, \
+ THIS_MODULE); \
+ return 0; \
+} \
+module_init(dsa_tag_driver_module_init); \
+ \
+static void __exit dsa_tag_driver_module_exit(void) \
+{ \
+ dsa_tag_drivers_unregister(__dsa_tag_drivers_array, __count); \
+} \
+module_exit(dsa_tag_driver_module_exit)
+
+/**
+ * module_dsa_tag_drivers() - Helper macro for registering DSA tag
+ * drivers
+ * @__ops_array: Array of tag driver structures
+ *
+ * Helper macro for DSA tag drivers which do not do anything special
+ * in module init/exit. Each module may only use this macro once, and
+ * calling it replaces module_init() and module_exit().
+ */
+#define module_dsa_tag_drivers(__ops_array) \
+dsa_tag_driver_module_drivers(__ops_array, ARRAY_SIZE(__ops_array))
+
+#define DSA_TAG_DRIVER_NAME(__ops) dsa_tag_driver ## _ ## __ops
+
+/* Create a static structure we can build a linked list of dsa_tag
+ * drivers
+ */
+#define DSA_TAG_DRIVER(__ops) \
+static struct dsa_tag_driver DSA_TAG_DRIVER_NAME(__ops) = { \
+ .ops = &__ops, \
+}
+
+/**
+ * module_dsa_tag_driver() - Helper macro for registering a single DSA tag
+ * driver
+ * @__ops: Single tag driver structures
+ *
+ * Helper macro for DSA tag drivers which do not do anything special
+ * in module init/exit. Each module may only use this macro once, and
+ * calling it replaces module_init() and module_exit().
+ */
+#define module_dsa_tag_driver(__ops) \
+DSA_TAG_DRIVER(__ops); \
+ \
+static struct dsa_tag_driver *dsa_tag_driver_array[] = { \
+ &DSA_TAG_DRIVER_NAME(__ops) \
+}; \
+module_dsa_tag_drivers(dsa_tag_driver_array)
+
+#endif
diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c
new file mode 100644
index 000000000000..53e03fd8071b
--- /dev/null
+++ b/net/dsa/tag_8021q.c
@@ -0,0 +1,588 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019, Vladimir Oltean <olteanv@gmail.com>
+ *
+ * This module is not a complete tagger implementation. It only provides
+ * primitives for taggers that rely on 802.1Q VLAN tags to use.
+ */
+#include <linux/if_vlan.h>
+#include <linux/dsa/8021q.h>
+
+#include "port.h"
+#include "switch.h"
+#include "tag.h"
+#include "tag_8021q.h"
+
+/* Binary structure of the fake 12-bit VID field (when the TPID is
+ * ETH_P_DSA_8021Q):
+ *
+ * | 11 | 10 | 9 | 8 | 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 |
+ * +-----------+-----+-----------------+-----------+-----------------------+
+ * | RSV | VBID| SWITCH_ID | VBID | PORT |
+ * +-----------+-----+-----------------+-----------+-----------------------+
+ *
+ * RSV - VID[11:10]:
+ * Reserved. Must be set to 3 (0b11).
+ *
+ * SWITCH_ID - VID[8:6]:
+ * Index of switch within DSA tree. Must be between 0 and 7.
+ *
+ * VBID - { VID[9], VID[5:4] }:
+ * Virtual bridge ID. If between 1 and 7, packet targets the broadcast
+ * domain of a bridge. If transmitted as zero, packet targets a single
+ * port.
+ *
+ * PORT - VID[3:0]:
+ * Index of switch port. Must be between 0 and 15.
+ */
+
+#define DSA_8021Q_RSV_VAL 3
+#define DSA_8021Q_RSV_SHIFT 10
+#define DSA_8021Q_RSV_MASK GENMASK(11, 10)
+#define DSA_8021Q_RSV ((DSA_8021Q_RSV_VAL << DSA_8021Q_RSV_SHIFT) & \
+ DSA_8021Q_RSV_MASK)
+
+#define DSA_8021Q_SWITCH_ID_SHIFT 6
+#define DSA_8021Q_SWITCH_ID_MASK GENMASK(8, 6)
+#define DSA_8021Q_SWITCH_ID(x) (((x) << DSA_8021Q_SWITCH_ID_SHIFT) & \
+ DSA_8021Q_SWITCH_ID_MASK)
+
+#define DSA_8021Q_VBID_HI_SHIFT 9
+#define DSA_8021Q_VBID_HI_MASK GENMASK(9, 9)
+#define DSA_8021Q_VBID_LO_SHIFT 4
+#define DSA_8021Q_VBID_LO_MASK GENMASK(5, 4)
+#define DSA_8021Q_VBID_HI(x) (((x) & GENMASK(2, 2)) >> 2)
+#define DSA_8021Q_VBID_LO(x) ((x) & GENMASK(1, 0))
+#define DSA_8021Q_VBID(x) \
+ (((DSA_8021Q_VBID_LO(x) << DSA_8021Q_VBID_LO_SHIFT) & \
+ DSA_8021Q_VBID_LO_MASK) | \
+ ((DSA_8021Q_VBID_HI(x) << DSA_8021Q_VBID_HI_SHIFT) & \
+ DSA_8021Q_VBID_HI_MASK))
+
+#define DSA_8021Q_PORT_SHIFT 0
+#define DSA_8021Q_PORT_MASK GENMASK(3, 0)
+#define DSA_8021Q_PORT(x) (((x) << DSA_8021Q_PORT_SHIFT) & \
+ DSA_8021Q_PORT_MASK)
+
+struct dsa_tag_8021q_vlan {
+ struct list_head list;
+ int port;
+ u16 vid;
+ refcount_t refcount;
+};
+
+struct dsa_8021q_context {
+ struct dsa_switch *ds;
+ struct list_head vlans;
+ /* EtherType of RX VID, used for filtering on conduit interface */
+ __be16 proto;
+};
+
+u16 dsa_tag_8021q_bridge_vid(unsigned int bridge_num)
+{
+ /* The VBID value of 0 is reserved for precise TX, but it is also
+ * reserved/invalid for the bridge_num, so all is well.
+ */
+ return DSA_8021Q_RSV | DSA_8021Q_VBID(bridge_num);
+}
+EXPORT_SYMBOL_GPL(dsa_tag_8021q_bridge_vid);
+
+/* Returns the VID that will be installed as pvid for this switch port, sent as
+ * tagged egress towards the CPU port and decoded by the rcv function.
+ */
+u16 dsa_tag_8021q_standalone_vid(const struct dsa_port *dp)
+{
+ return DSA_8021Q_RSV | DSA_8021Q_SWITCH_ID(dp->ds->index) |
+ DSA_8021Q_PORT(dp->index);
+}
+EXPORT_SYMBOL_GPL(dsa_tag_8021q_standalone_vid);
+
+/* Returns the decoded switch ID from the RX VID. */
+int dsa_8021q_rx_switch_id(u16 vid)
+{
+ return (vid & DSA_8021Q_SWITCH_ID_MASK) >> DSA_8021Q_SWITCH_ID_SHIFT;
+}
+EXPORT_SYMBOL_GPL(dsa_8021q_rx_switch_id);
+
+/* Returns the decoded port ID from the RX VID. */
+int dsa_8021q_rx_source_port(u16 vid)
+{
+ return (vid & DSA_8021Q_PORT_MASK) >> DSA_8021Q_PORT_SHIFT;
+}
+EXPORT_SYMBOL_GPL(dsa_8021q_rx_source_port);
+
+/* Returns the decoded VBID from the RX VID. */
+static int dsa_tag_8021q_rx_vbid(u16 vid)
+{
+ u16 vbid_hi = (vid & DSA_8021Q_VBID_HI_MASK) >> DSA_8021Q_VBID_HI_SHIFT;
+ u16 vbid_lo = (vid & DSA_8021Q_VBID_LO_MASK) >> DSA_8021Q_VBID_LO_SHIFT;
+
+ return (vbid_hi << 2) | vbid_lo;
+}
+
+bool vid_is_dsa_8021q(u16 vid)
+{
+ u16 rsv = (vid & DSA_8021Q_RSV_MASK) >> DSA_8021Q_RSV_SHIFT;
+
+ return rsv == DSA_8021Q_RSV_VAL;
+}
+EXPORT_SYMBOL_GPL(vid_is_dsa_8021q);
+
+static struct dsa_tag_8021q_vlan *
+dsa_tag_8021q_vlan_find(struct dsa_8021q_context *ctx, int port, u16 vid)
+{
+ struct dsa_tag_8021q_vlan *v;
+
+ list_for_each_entry(v, &ctx->vlans, list)
+ if (v->vid == vid && v->port == port)
+ return v;
+
+ return NULL;
+}
+
+static int dsa_port_do_tag_8021q_vlan_add(struct dsa_port *dp, u16 vid,
+ u16 flags)
+{
+ struct dsa_8021q_context *ctx = dp->ds->tag_8021q_ctx;
+ struct dsa_switch *ds = dp->ds;
+ struct dsa_tag_8021q_vlan *v;
+ int port = dp->index;
+ int err;
+
+ /* No need to bother with refcounting for user ports */
+ if (!(dsa_port_is_cpu(dp) || dsa_port_is_dsa(dp)))
+ return ds->ops->tag_8021q_vlan_add(ds, port, vid, flags);
+
+ v = dsa_tag_8021q_vlan_find(ctx, port, vid);
+ if (v) {
+ refcount_inc(&v->refcount);
+ return 0;
+ }
+
+ v = kzalloc(sizeof(*v), GFP_KERNEL);
+ if (!v)
+ return -ENOMEM;
+
+ err = ds->ops->tag_8021q_vlan_add(ds, port, vid, flags);
+ if (err) {
+ kfree(v);
+ return err;
+ }
+
+ v->vid = vid;
+ v->port = port;
+ refcount_set(&v->refcount, 1);
+ list_add_tail(&v->list, &ctx->vlans);
+
+ return 0;
+}
+
+static int dsa_port_do_tag_8021q_vlan_del(struct dsa_port *dp, u16 vid)
+{
+ struct dsa_8021q_context *ctx = dp->ds->tag_8021q_ctx;
+ struct dsa_switch *ds = dp->ds;
+ struct dsa_tag_8021q_vlan *v;
+ int port = dp->index;
+ int err;
+
+ /* No need to bother with refcounting for user ports */
+ if (!(dsa_port_is_cpu(dp) || dsa_port_is_dsa(dp)))
+ return ds->ops->tag_8021q_vlan_del(ds, port, vid);
+
+ v = dsa_tag_8021q_vlan_find(ctx, port, vid);
+ if (!v)
+ return -ENOENT;
+
+ if (!refcount_dec_and_test(&v->refcount))
+ return 0;
+
+ err = ds->ops->tag_8021q_vlan_del(ds, port, vid);
+ if (err) {
+ refcount_set(&v->refcount, 1);
+ return err;
+ }
+
+ list_del(&v->list);
+ kfree(v);
+
+ return 0;
+}
+
+static bool
+dsa_port_tag_8021q_vlan_match(struct dsa_port *dp,
+ struct dsa_notifier_tag_8021q_vlan_info *info)
+{
+ return dsa_port_is_dsa(dp) || dsa_port_is_cpu(dp) || dp == info->dp;
+}
+
+int dsa_switch_tag_8021q_vlan_add(struct dsa_switch *ds,
+ struct dsa_notifier_tag_8021q_vlan_info *info)
+{
+ struct dsa_port *dp;
+ int err;
+
+ /* Since we use dsa_broadcast(), there might be other switches in other
+ * trees which don't support tag_8021q, so don't return an error.
+ * Or they might even support tag_8021q but have not registered yet to
+ * use it (maybe they use another tagger currently).
+ */
+ if (!ds->ops->tag_8021q_vlan_add || !ds->tag_8021q_ctx)
+ return 0;
+
+ dsa_switch_for_each_port(dp, ds) {
+ if (dsa_port_tag_8021q_vlan_match(dp, info)) {
+ u16 flags = 0;
+
+ if (dsa_port_is_user(dp))
+ flags |= BRIDGE_VLAN_INFO_UNTAGGED |
+ BRIDGE_VLAN_INFO_PVID;
+
+ err = dsa_port_do_tag_8021q_vlan_add(dp, info->vid,
+ flags);
+ if (err)
+ return err;
+ }
+ }
+
+ return 0;
+}
+
+int dsa_switch_tag_8021q_vlan_del(struct dsa_switch *ds,
+ struct dsa_notifier_tag_8021q_vlan_info *info)
+{
+ struct dsa_port *dp;
+ int err;
+
+ if (!ds->ops->tag_8021q_vlan_del || !ds->tag_8021q_ctx)
+ return 0;
+
+ dsa_switch_for_each_port(dp, ds) {
+ if (dsa_port_tag_8021q_vlan_match(dp, info)) {
+ err = dsa_port_do_tag_8021q_vlan_del(dp, info->vid);
+ if (err)
+ return err;
+ }
+ }
+
+ return 0;
+}
+
+/* There are 2 ways of offloading tag_8021q VLANs.
+ *
+ * One is to use a hardware TCAM to push the port's standalone VLAN into the
+ * frame when forwarding it to the CPU, as an egress modification rule on the
+ * CPU port. This is preferable because it has no side effects for the
+ * autonomous forwarding path, and accomplishes tag_8021q's primary goal of
+ * identifying the source port of each packet based on VLAN ID.
+ *
+ * The other is to commit the tag_8021q VLAN as a PVID to the VLAN table, and
+ * to configure the port as VLAN-unaware. This is less preferable because
+ * unique source port identification can only be done for standalone ports;
+ * under a VLAN-unaware bridge, all ports share the same tag_8021q VLAN as
+ * PVID, and under a VLAN-aware bridge, packets received by software will not
+ * have tag_8021q VLANs appended, just bridge VLANs.
+ *
+ * For tag_8021q implementations of the second type, this method is used to
+ * replace the standalone tag_8021q VLAN of a port with the tag_8021q VLAN to
+ * be used for VLAN-unaware bridging.
+ */
+int dsa_tag_8021q_bridge_join(struct dsa_switch *ds, int port,
+ struct dsa_bridge bridge, bool *tx_fwd_offload,
+ struct netlink_ext_ack *extack)
+{
+ struct dsa_port *dp = dsa_to_port(ds, port);
+ u16 standalone_vid, bridge_vid;
+ int err;
+
+ /* Delete the standalone VLAN of the port and replace it with a
+ * bridging VLAN
+ */
+ standalone_vid = dsa_tag_8021q_standalone_vid(dp);
+ bridge_vid = dsa_tag_8021q_bridge_vid(bridge.num);
+
+ err = dsa_port_tag_8021q_vlan_add(dp, bridge_vid, true);
+ if (err)
+ return err;
+
+ dsa_port_tag_8021q_vlan_del(dp, standalone_vid, false);
+
+ *tx_fwd_offload = true;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(dsa_tag_8021q_bridge_join);
+
+void dsa_tag_8021q_bridge_leave(struct dsa_switch *ds, int port,
+ struct dsa_bridge bridge)
+{
+ struct dsa_port *dp = dsa_to_port(ds, port);
+ u16 standalone_vid, bridge_vid;
+ int err;
+
+ /* Delete the bridging VLAN of the port and replace it with a
+ * standalone VLAN
+ */
+ standalone_vid = dsa_tag_8021q_standalone_vid(dp);
+ bridge_vid = dsa_tag_8021q_bridge_vid(bridge.num);
+
+ err = dsa_port_tag_8021q_vlan_add(dp, standalone_vid, false);
+ if (err) {
+ dev_err(ds->dev,
+ "Failed to delete tag_8021q standalone VLAN %d from port %d: %pe\n",
+ standalone_vid, port, ERR_PTR(err));
+ }
+
+ dsa_port_tag_8021q_vlan_del(dp, bridge_vid, true);
+}
+EXPORT_SYMBOL_GPL(dsa_tag_8021q_bridge_leave);
+
+/* Set up a port's standalone tag_8021q VLAN */
+static int dsa_tag_8021q_port_setup(struct dsa_switch *ds, int port)
+{
+ struct dsa_8021q_context *ctx = ds->tag_8021q_ctx;
+ struct dsa_port *dp = dsa_to_port(ds, port);
+ u16 vid = dsa_tag_8021q_standalone_vid(dp);
+ struct net_device *conduit;
+ int err;
+
+ /* The CPU port is implicitly configured by
+ * configuring the front-panel ports
+ */
+ if (!dsa_port_is_user(dp))
+ return 0;
+
+ conduit = dsa_port_to_conduit(dp);
+
+ err = dsa_port_tag_8021q_vlan_add(dp, vid, false);
+ if (err) {
+ dev_err(ds->dev,
+ "Failed to apply standalone VID %d to port %d: %pe\n",
+ vid, port, ERR_PTR(err));
+ return err;
+ }
+
+ /* Add the VLAN to the conduit's RX filter. */
+ vlan_vid_add(conduit, ctx->proto, vid);
+
+ return err;
+}
+
+static void dsa_tag_8021q_port_teardown(struct dsa_switch *ds, int port)
+{
+ struct dsa_8021q_context *ctx = ds->tag_8021q_ctx;
+ struct dsa_port *dp = dsa_to_port(ds, port);
+ u16 vid = dsa_tag_8021q_standalone_vid(dp);
+ struct net_device *conduit;
+
+ /* The CPU port is implicitly configured by
+ * configuring the front-panel ports
+ */
+ if (!dsa_port_is_user(dp))
+ return;
+
+ conduit = dsa_port_to_conduit(dp);
+
+ dsa_port_tag_8021q_vlan_del(dp, vid, false);
+
+ vlan_vid_del(conduit, ctx->proto, vid);
+}
+
+static int dsa_tag_8021q_setup(struct dsa_switch *ds)
+{
+ int err, port;
+
+ ASSERT_RTNL();
+
+ for (port = 0; port < ds->num_ports; port++) {
+ err = dsa_tag_8021q_port_setup(ds, port);
+ if (err < 0) {
+ dev_err(ds->dev,
+ "Failed to setup VLAN tagging for port %d: %pe\n",
+ port, ERR_PTR(err));
+ return err;
+ }
+ }
+
+ return 0;
+}
+
+static void dsa_tag_8021q_teardown(struct dsa_switch *ds)
+{
+ int port;
+
+ ASSERT_RTNL();
+
+ for (port = 0; port < ds->num_ports; port++)
+ dsa_tag_8021q_port_teardown(ds, port);
+}
+
+int dsa_tag_8021q_register(struct dsa_switch *ds, __be16 proto)
+{
+ struct dsa_8021q_context *ctx;
+ int err;
+
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->proto = proto;
+ ctx->ds = ds;
+
+ INIT_LIST_HEAD(&ctx->vlans);
+
+ ds->tag_8021q_ctx = ctx;
+
+ err = dsa_tag_8021q_setup(ds);
+ if (err)
+ goto err_free;
+
+ return 0;
+
+err_free:
+ kfree(ctx);
+ return err;
+}
+EXPORT_SYMBOL_GPL(dsa_tag_8021q_register);
+
+void dsa_tag_8021q_unregister(struct dsa_switch *ds)
+{
+ struct dsa_8021q_context *ctx = ds->tag_8021q_ctx;
+ struct dsa_tag_8021q_vlan *v, *n;
+
+ dsa_tag_8021q_teardown(ds);
+
+ list_for_each_entry_safe(v, n, &ctx->vlans, list) {
+ list_del(&v->list);
+ kfree(v);
+ }
+
+ ds->tag_8021q_ctx = NULL;
+
+ kfree(ctx);
+}
+EXPORT_SYMBOL_GPL(dsa_tag_8021q_unregister);
+
+struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev,
+ u16 tpid, u16 tci)
+{
+ /* skb->data points at the MAC header, which is fine
+ * for vlan_insert_tag().
+ */
+ return vlan_insert_tag(skb, htons(tpid), tci);
+}
+EXPORT_SYMBOL_GPL(dsa_8021q_xmit);
+
+static struct net_device *
+dsa_tag_8021q_find_port_by_vbid(struct net_device *conduit, int vbid)
+{
+ struct dsa_port *cpu_dp = conduit->dsa_ptr;
+ struct dsa_switch_tree *dst = cpu_dp->dst;
+ struct dsa_port *dp;
+
+ if (WARN_ON(!vbid))
+ return NULL;
+
+ dsa_tree_for_each_user_port(dp, dst) {
+ if (!dp->bridge)
+ continue;
+
+ if (dp->stp_state != BR_STATE_LEARNING &&
+ dp->stp_state != BR_STATE_FORWARDING)
+ continue;
+
+ if (dp->cpu_dp != cpu_dp)
+ continue;
+
+ if (dsa_port_bridge_num_get(dp) == vbid)
+ return dp->user;
+ }
+
+ return NULL;
+}
+
+struct net_device *dsa_tag_8021q_find_user(struct net_device *conduit,
+ int source_port, int switch_id,
+ int vid, int vbid)
+{
+ /* Always prefer precise source port information, if available */
+ if (source_port != -1 && switch_id != -1)
+ return dsa_conduit_find_user(conduit, switch_id, source_port);
+ else if (vbid >= 1)
+ return dsa_tag_8021q_find_port_by_vbid(conduit, vbid);
+
+ return dsa_find_designated_bridge_port_by_vid(conduit, vid);
+}
+EXPORT_SYMBOL_GPL(dsa_tag_8021q_find_user);
+
+/**
+ * dsa_8021q_rcv - Decode source information from tag_8021q header
+ * @skb: RX socket buffer
+ * @source_port: pointer to storage for precise source port information.
+ * If this is known already from outside tag_8021q, the pre-initialized
+ * value is preserved. If not known, pass -1.
+ * @switch_id: similar to source_port.
+ * @vbid: pointer to storage for imprecise bridge ID. Must be pre-initialized
+ * with -1. If a positive value is returned, the source_port and switch_id
+ * are invalid.
+ * @vid: pointer to storage for original VID, in case tag_8021q decoding failed.
+ *
+ * If the packet has a tag_8021q header, decode it and set @source_port,
+ * @switch_id and @vbid, and strip the header. Otherwise set @vid and keep the
+ * header in the hwaccel area of the packet.
+ */
+void dsa_8021q_rcv(struct sk_buff *skb, int *source_port, int *switch_id,
+ int *vbid, int *vid)
+{
+ int tmp_source_port, tmp_switch_id, tmp_vbid;
+ __be16 vlan_proto;
+ u16 tmp_vid, tci;
+
+ if (skb_vlan_tag_present(skb)) {
+ vlan_proto = skb->vlan_proto;
+ tci = skb_vlan_tag_get(skb);
+ __vlan_hwaccel_clear_tag(skb);
+ } else {
+ struct vlan_ethhdr *hdr = vlan_eth_hdr(skb);
+
+ vlan_proto = hdr->h_vlan_proto;
+ skb_push_rcsum(skb, ETH_HLEN);
+ __skb_vlan_pop(skb, &tci);
+ skb_pull_rcsum(skb, ETH_HLEN);
+ }
+
+ tmp_vid = tci & VLAN_VID_MASK;
+ if (!vid_is_dsa_8021q(tmp_vid)) {
+ /* Not a tag_8021q frame, so return the VID to the
+ * caller for further processing, and put the tag back
+ */
+ if (vid)
+ *vid = tmp_vid;
+
+ __vlan_hwaccel_put_tag(skb, vlan_proto, tci);
+
+ return;
+ }
+
+ tmp_source_port = dsa_8021q_rx_source_port(tmp_vid);
+ tmp_switch_id = dsa_8021q_rx_switch_id(tmp_vid);
+ tmp_vbid = dsa_tag_8021q_rx_vbid(tmp_vid);
+
+ /* Precise source port information is unknown when receiving from a
+ * VLAN-unaware bridging domain, and tmp_source_port and tmp_switch_id
+ * are zeroes in this case.
+ *
+ * Preserve the source information from hardware-specific mechanisms,
+ * if available. This allows us to not overwrite a valid source port
+ * and switch ID with less precise values.
+ */
+ if (tmp_vbid == 0 && *source_port == -1)
+ *source_port = tmp_source_port;
+ if (tmp_vbid == 0 && *switch_id == -1)
+ *switch_id = tmp_switch_id;
+
+ if (vbid)
+ *vbid = tmp_vbid;
+
+ skb->priority = (tci & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT;
+ return;
+}
+EXPORT_SYMBOL_GPL(dsa_8021q_rcv);
diff --git a/net/dsa/tag_8021q.h b/net/dsa/tag_8021q.h
new file mode 100644
index 000000000000..27b8906f99ec
--- /dev/null
+++ b/net/dsa/tag_8021q.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifndef __DSA_TAG_8021Q_H
+#define __DSA_TAG_8021Q_H
+
+#include <net/dsa.h>
+
+#include "switch.h"
+
+struct sk_buff;
+struct net_device;
+
+struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev,
+ u16 tpid, u16 tci);
+
+void dsa_8021q_rcv(struct sk_buff *skb, int *source_port, int *switch_id,
+ int *vbid, int *vid);
+
+struct net_device *dsa_tag_8021q_find_user(struct net_device *conduit,
+ int source_port, int switch_id,
+ int vid, int vbid);
+
+int dsa_switch_tag_8021q_vlan_add(struct dsa_switch *ds,
+ struct dsa_notifier_tag_8021q_vlan_info *info);
+int dsa_switch_tag_8021q_vlan_del(struct dsa_switch *ds,
+ struct dsa_notifier_tag_8021q_vlan_info *info);
+
+#endif
diff --git a/net/dsa/tag_ar9331.c b/net/dsa/tag_ar9331.c
new file mode 100644
index 000000000000..cbb588ca73aa
--- /dev/null
+++ b/net/dsa/tag_ar9331.c
@@ -0,0 +1,95 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2019 Pengutronix, Oleksij Rempel <kernel@pengutronix.de>
+ */
+
+
+#include <linux/bitfield.h>
+#include <linux/etherdevice.h>
+
+#include "tag.h"
+
+#define AR9331_NAME "ar9331"
+
+#define AR9331_HDR_LEN 2
+#define AR9331_HDR_VERSION 1
+
+#define AR9331_HDR_VERSION_MASK GENMASK(15, 14)
+#define AR9331_HDR_PRIORITY_MASK GENMASK(13, 12)
+#define AR9331_HDR_TYPE_MASK GENMASK(10, 8)
+#define AR9331_HDR_BROADCAST BIT(7)
+#define AR9331_HDR_FROM_CPU BIT(6)
+/* AR9331_HDR_RESERVED - not used or may be version field.
+ * According to the AR8216 doc it should 0b10. On AR9331 it is 0b11 on RX path
+ * and should be set to 0b11 to make it work.
+ */
+#define AR9331_HDR_RESERVED_MASK GENMASK(5, 4)
+#define AR9331_HDR_PORT_NUM_MASK GENMASK(3, 0)
+
+static struct sk_buff *ar9331_tag_xmit(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ __le16 *phdr;
+ u16 hdr;
+
+ phdr = skb_push(skb, AR9331_HDR_LEN);
+
+ hdr = FIELD_PREP(AR9331_HDR_VERSION_MASK, AR9331_HDR_VERSION);
+ hdr |= AR9331_HDR_FROM_CPU | dp->index;
+ /* 0b10 for AR8216 and 0b11 for AR9331 */
+ hdr |= AR9331_HDR_RESERVED_MASK;
+
+ phdr[0] = cpu_to_le16(hdr);
+
+ return skb;
+}
+
+static struct sk_buff *ar9331_tag_rcv(struct sk_buff *skb,
+ struct net_device *ndev)
+{
+ u8 ver, port;
+ u16 hdr;
+
+ if (unlikely(!pskb_may_pull(skb, AR9331_HDR_LEN)))
+ return NULL;
+
+ hdr = le16_to_cpu(*(__le16 *)skb_mac_header(skb));
+
+ ver = FIELD_GET(AR9331_HDR_VERSION_MASK, hdr);
+ if (unlikely(ver != AR9331_HDR_VERSION)) {
+ netdev_warn_once(ndev, "%s:%i wrong header version 0x%2x\n",
+ __func__, __LINE__, hdr);
+ return NULL;
+ }
+
+ if (unlikely(hdr & AR9331_HDR_FROM_CPU)) {
+ netdev_warn_once(ndev, "%s:%i packet should not be from cpu 0x%2x\n",
+ __func__, __LINE__, hdr);
+ return NULL;
+ }
+
+ skb_pull_rcsum(skb, AR9331_HDR_LEN);
+
+ /* Get source port information */
+ port = FIELD_GET(AR9331_HDR_PORT_NUM_MASK, hdr);
+
+ skb->dev = dsa_conduit_find_user(ndev, 0, port);
+ if (!skb->dev)
+ return NULL;
+
+ return skb;
+}
+
+static const struct dsa_device_ops ar9331_netdev_ops = {
+ .name = AR9331_NAME,
+ .proto = DSA_TAG_PROTO_AR9331,
+ .xmit = ar9331_tag_xmit,
+ .rcv = ar9331_tag_rcv,
+ .needed_headroom = AR9331_HDR_LEN,
+};
+
+MODULE_DESCRIPTION("DSA tag driver for Atheros AR9331 SoC with built-in switch");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_AR9331, AR9331_NAME);
+module_dsa_tag_driver(ar9331_netdev_ops);
diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c
new file mode 100644
index 000000000000..cf9420439054
--- /dev/null
+++ b/net/dsa/tag_brcm.c
@@ -0,0 +1,418 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Broadcom tag support
+ *
+ * Copyright (C) 2014 Broadcom Corporation
+ */
+
+#include <linux/dsa/brcm.h>
+#include <linux/etherdevice.h>
+#include <linux/if_vlan.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+
+#include "tag.h"
+
+#define BRCM_NAME "brcm"
+#define BRCM_LEGACY_NAME "brcm-legacy"
+#define BRCM_LEGACY_FCS_NAME "brcm-legacy-fcs"
+#define BRCM_PREPEND_NAME "brcm-prepend"
+
+/* Legacy Broadcom tag (6 bytes) */
+#define BRCM_LEG_TAG_LEN 6
+
+/* Type fields */
+/* 1st byte in the tag */
+#define BRCM_LEG_TYPE_HI 0x88
+/* 2nd byte in the tag */
+#define BRCM_LEG_TYPE_LO 0x74
+
+/* Tag fields */
+/* 3rd byte in the tag */
+#define BRCM_LEG_UNICAST (0 << 5)
+#define BRCM_LEG_MULTICAST (1 << 5)
+#define BRCM_LEG_EGRESS (2 << 5)
+#define BRCM_LEG_INGRESS (3 << 5)
+#define BRCM_LEG_LEN_HI(x) (((x) >> 8) & 0x7)
+
+/* 4th byte in the tag */
+#define BRCM_LEG_LEN_LO(x) ((x) & 0xff)
+
+/* 6th byte in the tag */
+#define BRCM_LEG_PORT_ID (0xf)
+
+/* Newer Broadcom tag (4 bytes) */
+#define BRCM_TAG_LEN 4
+
+/* Tag is constructed and deconstructed using byte by byte access
+ * because the tag is placed after the MAC Source Address, which does
+ * not make it 4-bytes aligned, so this might cause unaligned accesses
+ * on most systems where this is used.
+ */
+
+/* Ingress and egress opcodes */
+#define BRCM_OPCODE_SHIFT 5
+#define BRCM_OPCODE_MASK 0x7
+
+/* Ingress fields */
+/* 1st byte in the tag */
+#define BRCM_IG_TC_SHIFT 2
+#define BRCM_IG_TC_MASK 0x7
+/* 2nd byte in the tag */
+#define BRCM_IG_TE_MASK 0x3
+#define BRCM_IG_TS_SHIFT 7
+/* 3rd byte in the tag */
+#define BRCM_IG_DSTMAP2_MASK 1
+#define BRCM_IG_DSTMAP1_MASK 0xff
+
+/* Egress fields */
+
+/* 2nd byte in the tag */
+#define BRCM_EG_CID_MASK 0xff
+
+/* 3rd byte in the tag */
+#define BRCM_EG_RC_MASK 0xff
+#define BRCM_EG_RC_RSVD (3 << 6)
+#define BRCM_EG_RC_EXCEPTION (1 << 5)
+#define BRCM_EG_RC_PROT_SNOOP (1 << 4)
+#define BRCM_EG_RC_PROT_TERM (1 << 3)
+#define BRCM_EG_RC_SWITCH (1 << 2)
+#define BRCM_EG_RC_MAC_LEARN (1 << 1)
+#define BRCM_EG_RC_MIRROR (1 << 0)
+#define BRCM_EG_TC_SHIFT 5
+#define BRCM_EG_TC_MASK 0x7
+#define BRCM_EG_PID_MASK 0x1f
+
+#if IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM) || \
+ IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM_PREPEND)
+
+static struct sk_buff *brcm_tag_xmit_ll(struct sk_buff *skb,
+ struct net_device *dev,
+ unsigned int offset)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ u16 queue = skb_get_queue_mapping(skb);
+ u16 port_mask;
+ u8 *brcm_tag;
+
+ /* The Ethernet switch we are interfaced with needs packets to be at
+ * least 64 bytes (including FCS) otherwise they will be discarded when
+ * they enter the switch port logic. When Broadcom tags are enabled, we
+ * need to make sure that packets are at least 68 bytes
+ * (including FCS and tag) because the length verification is done after
+ * the Broadcom tag is stripped off the ingress packet.
+ *
+ * Let dsa_user_xmit() free the SKB
+ */
+ if (__skb_put_padto(skb, ETH_ZLEN + BRCM_TAG_LEN, false))
+ return NULL;
+
+ skb_push(skb, BRCM_TAG_LEN);
+
+ if (offset)
+ dsa_alloc_etype_header(skb, BRCM_TAG_LEN);
+
+ brcm_tag = skb->data + offset;
+
+ /* Set the ingress opcode, traffic class, tag enforcement is
+ * deprecated
+ */
+ brcm_tag[0] = (1 << BRCM_OPCODE_SHIFT) |
+ ((queue & BRCM_IG_TC_MASK) << BRCM_IG_TC_SHIFT);
+ brcm_tag[1] = 0;
+ port_mask = dsa_xmit_port_mask(skb, dev);
+ brcm_tag[2] = (port_mask >> 8) & BRCM_IG_DSTMAP2_MASK;
+ brcm_tag[3] = port_mask & BRCM_IG_DSTMAP1_MASK;
+
+ /* Now tell the conduit network device about the desired output queue
+ * as well
+ */
+ skb_set_queue_mapping(skb, BRCM_TAG_SET_PORT_QUEUE(dp->index, queue));
+
+ return skb;
+}
+
+/* Frames with this tag have one of these two layouts:
+ * -----------------------------------
+ * | MAC DA | MAC SA | 4b tag | Type | DSA_TAG_PROTO_BRCM
+ * -----------------------------------
+ * -----------------------------------
+ * | 4b tag | MAC DA | MAC SA | Type | DSA_TAG_PROTO_BRCM_PREPEND
+ * -----------------------------------
+ * In both cases, at receive time, skb->data points 2 bytes before the actual
+ * Ethernet type field and we have an offset of 4bytes between where skb->data
+ * and where the payload starts. So the same low-level receive function can be
+ * used.
+ */
+static struct sk_buff *brcm_tag_rcv_ll(struct sk_buff *skb,
+ struct net_device *dev,
+ unsigned int offset)
+{
+ int source_port;
+ u8 *brcm_tag;
+
+ if (unlikely(!pskb_may_pull(skb, BRCM_TAG_LEN)))
+ return NULL;
+
+ brcm_tag = skb->data - offset;
+
+ /* The opcode should never be different than 0b000 */
+ if (unlikely((brcm_tag[0] >> BRCM_OPCODE_SHIFT) & BRCM_OPCODE_MASK))
+ return NULL;
+
+ /* We should never see a reserved reason code without knowing how to
+ * handle it
+ */
+ if (unlikely(brcm_tag[2] & BRCM_EG_RC_RSVD))
+ return NULL;
+
+ /* Locate which port this is coming from */
+ source_port = brcm_tag[3] & BRCM_EG_PID_MASK;
+
+ skb->dev = dsa_conduit_find_user(dev, 0, source_port);
+ if (!skb->dev)
+ return NULL;
+
+ /* Remove Broadcom tag and update checksum */
+ skb_pull_rcsum(skb, BRCM_TAG_LEN);
+
+ if (likely(!is_link_local_ether_addr(eth_hdr(skb)->h_dest)))
+ dsa_default_offload_fwd_mark(skb);
+
+ return skb;
+}
+#endif
+
+#if IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM)
+static struct sk_buff *brcm_tag_xmit(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ /* Build the tag after the MAC Source Address */
+ return brcm_tag_xmit_ll(skb, dev, 2 * ETH_ALEN);
+}
+
+
+static struct sk_buff *brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev)
+{
+ struct sk_buff *nskb;
+
+ /* skb->data points to the EtherType, the tag is right before it */
+ nskb = brcm_tag_rcv_ll(skb, dev, 2);
+ if (!nskb)
+ return nskb;
+
+ dsa_strip_etype_header(skb, BRCM_TAG_LEN);
+
+ return nskb;
+}
+
+static const struct dsa_device_ops brcm_netdev_ops = {
+ .name = BRCM_NAME,
+ .proto = DSA_TAG_PROTO_BRCM,
+ .xmit = brcm_tag_xmit,
+ .rcv = brcm_tag_rcv,
+ .needed_headroom = BRCM_TAG_LEN,
+};
+
+DSA_TAG_DRIVER(brcm_netdev_ops);
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_BRCM, BRCM_NAME);
+#endif
+
+#if IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM_LEGACY) || \
+ IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM_LEGACY_FCS)
+static struct sk_buff *brcm_leg_tag_rcv(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ int len = BRCM_LEG_TAG_LEN;
+ int source_port;
+ __be16 *proto;
+ u8 *brcm_tag;
+
+ if (unlikely(!pskb_may_pull(skb, BRCM_LEG_TAG_LEN + VLAN_HLEN)))
+ return NULL;
+
+ brcm_tag = dsa_etype_header_pos_rx(skb);
+ proto = (__be16 *)(brcm_tag + BRCM_LEG_TAG_LEN);
+
+ source_port = brcm_tag[5] & BRCM_LEG_PORT_ID;
+
+ skb->dev = dsa_conduit_find_user(dev, 0, source_port);
+ if (!skb->dev)
+ return NULL;
+
+ /* The internal switch in BCM63XX SoCs always tags on egress on the CPU
+ * port. We use VID 0 internally for untagged traffic, so strip the tag
+ * if the TCI field is all 0, and keep it otherwise to also retain
+ * e.g. 802.1p tagged packets.
+ */
+ if (proto[0] == htons(ETH_P_8021Q) && proto[1] == 0)
+ len += VLAN_HLEN;
+
+ /* Remove Broadcom tag and update checksum */
+ skb_pull_rcsum(skb, len);
+
+ if (likely(!is_link_local_ether_addr(eth_hdr(skb)->h_dest)))
+ dsa_default_offload_fwd_mark(skb);
+
+ dsa_strip_etype_header(skb, len);
+
+ return skb;
+}
+#endif /* CONFIG_NET_DSA_TAG_BRCM_LEGACY || CONFIG_NET_DSA_TAG_BRCM_LEGACY_FCS */
+
+#if IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM_LEGACY)
+static struct sk_buff *brcm_leg_tag_xmit(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ u8 *brcm_tag;
+
+ /* The Ethernet switch we are interfaced with needs packets to be at
+ * least 64 bytes (including FCS) otherwise they will be discarded when
+ * they enter the switch port logic. When Broadcom tags are enabled, we
+ * need to make sure that packets are at least 70 bytes
+ * (including FCS and tag) because the length verification is done after
+ * the Broadcom tag is stripped off the ingress packet.
+ *
+ * Let dsa_user_xmit() free the SKB
+ */
+ if (__skb_put_padto(skb, ETH_ZLEN + BRCM_LEG_TAG_LEN, false))
+ return NULL;
+
+ skb_push(skb, BRCM_LEG_TAG_LEN);
+
+ dsa_alloc_etype_header(skb, BRCM_LEG_TAG_LEN);
+
+ brcm_tag = skb->data + 2 * ETH_ALEN;
+
+ /* Broadcom tag type */
+ brcm_tag[0] = BRCM_LEG_TYPE_HI;
+ brcm_tag[1] = BRCM_LEG_TYPE_LO;
+
+ /* Broadcom tag value */
+ brcm_tag[2] = BRCM_LEG_EGRESS;
+ brcm_tag[3] = 0;
+ brcm_tag[4] = 0;
+ brcm_tag[5] = dp->index & BRCM_LEG_PORT_ID;
+
+ return skb;
+}
+
+static const struct dsa_device_ops brcm_legacy_netdev_ops = {
+ .name = BRCM_LEGACY_NAME,
+ .proto = DSA_TAG_PROTO_BRCM_LEGACY,
+ .xmit = brcm_leg_tag_xmit,
+ .rcv = brcm_leg_tag_rcv,
+ .needed_headroom = BRCM_LEG_TAG_LEN,
+};
+
+DSA_TAG_DRIVER(brcm_legacy_netdev_ops);
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_BRCM_LEGACY, BRCM_LEGACY_NAME);
+#endif /* CONFIG_NET_DSA_TAG_BRCM_LEGACY */
+
+#if IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM_LEGACY_FCS)
+static struct sk_buff *brcm_leg_fcs_tag_xmit(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ unsigned int fcs_len;
+ __le32 fcs_val;
+ u8 *brcm_tag;
+
+ /* The Ethernet switch we are interfaced with needs packets to be at
+ * least 64 bytes (including FCS) otherwise they will be discarded when
+ * they enter the switch port logic. When Broadcom tags are enabled, we
+ * need to make sure that packets are at least 70 bytes (including FCS
+ * and tag) because the length verification is done after the Broadcom
+ * tag is stripped off the ingress packet.
+ *
+ * Let dsa_user_xmit() free the SKB.
+ */
+ if (__skb_put_padto(skb, ETH_ZLEN + BRCM_LEG_TAG_LEN, false))
+ return NULL;
+
+ fcs_len = skb->len;
+ fcs_val = cpu_to_le32(crc32_le(~0, skb->data, fcs_len) ^ ~0);
+
+ skb_push(skb, BRCM_LEG_TAG_LEN);
+
+ dsa_alloc_etype_header(skb, BRCM_LEG_TAG_LEN);
+
+ brcm_tag = skb->data + 2 * ETH_ALEN;
+
+ /* Broadcom tag type */
+ brcm_tag[0] = BRCM_LEG_TYPE_HI;
+ brcm_tag[1] = BRCM_LEG_TYPE_LO;
+
+ /* Broadcom tag value */
+ brcm_tag[2] = BRCM_LEG_EGRESS | BRCM_LEG_LEN_HI(fcs_len);
+ brcm_tag[3] = BRCM_LEG_LEN_LO(fcs_len);
+ brcm_tag[4] = 0;
+ brcm_tag[5] = dp->index & BRCM_LEG_PORT_ID;
+
+ /* Original FCS value */
+ if (__skb_pad(skb, ETH_FCS_LEN, false))
+ return NULL;
+ skb_put_data(skb, &fcs_val, ETH_FCS_LEN);
+
+ return skb;
+}
+
+static const struct dsa_device_ops brcm_legacy_fcs_netdev_ops = {
+ .name = BRCM_LEGACY_FCS_NAME,
+ .proto = DSA_TAG_PROTO_BRCM_LEGACY_FCS,
+ .xmit = brcm_leg_fcs_tag_xmit,
+ .rcv = brcm_leg_tag_rcv,
+ .needed_headroom = BRCM_LEG_TAG_LEN,
+};
+
+DSA_TAG_DRIVER(brcm_legacy_fcs_netdev_ops);
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_BRCM_LEGACY_FCS, BRCM_LEGACY_FCS_NAME);
+#endif /* CONFIG_NET_DSA_TAG_BRCM_LEGACY_FCS */
+
+#if IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM_PREPEND)
+static struct sk_buff *brcm_tag_xmit_prepend(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ /* tag is prepended to the packet */
+ return brcm_tag_xmit_ll(skb, dev, 0);
+}
+
+static struct sk_buff *brcm_tag_rcv_prepend(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ /* tag is prepended to the packet */
+ return brcm_tag_rcv_ll(skb, dev, ETH_HLEN);
+}
+
+static const struct dsa_device_ops brcm_prepend_netdev_ops = {
+ .name = BRCM_PREPEND_NAME,
+ .proto = DSA_TAG_PROTO_BRCM_PREPEND,
+ .xmit = brcm_tag_xmit_prepend,
+ .rcv = brcm_tag_rcv_prepend,
+ .needed_headroom = BRCM_TAG_LEN,
+};
+
+DSA_TAG_DRIVER(brcm_prepend_netdev_ops);
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_BRCM_PREPEND, BRCM_PREPEND_NAME);
+#endif
+
+static struct dsa_tag_driver *dsa_tag_driver_array[] = {
+#if IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM)
+ &DSA_TAG_DRIVER_NAME(brcm_netdev_ops),
+#endif
+#if IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM_LEGACY)
+ &DSA_TAG_DRIVER_NAME(brcm_legacy_netdev_ops),
+#endif
+#if IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM_LEGACY_FCS)
+ &DSA_TAG_DRIVER_NAME(brcm_legacy_fcs_netdev_ops),
+#endif
+#if IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM_PREPEND)
+ &DSA_TAG_DRIVER_NAME(brcm_prepend_netdev_ops),
+#endif
+};
+
+module_dsa_tag_drivers(dsa_tag_driver_array);
+
+MODULE_DESCRIPTION("DSA tag driver for Broadcom switches using in-frame headers");
+MODULE_LICENSE("GPL");
diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c
new file mode 100644
index 000000000000..2a2c4fb61a65
--- /dev/null
+++ b/net/dsa/tag_dsa.c
@@ -0,0 +1,410 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Regular and Ethertype DSA tagging
+ * Copyright (c) 2008-2009 Marvell Semiconductor
+ *
+ * Regular DSA
+ * -----------
+
+ * For untagged (in 802.1Q terms) packets, the switch will splice in
+ * the tag between the SA and the ethertype of the original
+ * packet. Tagged frames will instead have their outermost .1Q tag
+ * converted to a DSA tag. It expects the same layout when receiving
+ * packets from the CPU.
+ *
+ * Example:
+ *
+ * .----.----.----.---------
+ * Pu: | DA | SA | ET | Payload ...
+ * '----'----'----'---------
+ * 6 6 2 N
+ * .----.----.--------.-----.----.---------
+ * Pt: | DA | SA | 0x8100 | TCI | ET | Payload ...
+ * '----'----'--------'-----'----'---------
+ * 6 6 2 2 2 N
+ * .----.----.-----.----.---------
+ * Pd: | DA | SA | DSA | ET | Payload ...
+ * '----'----'-----'----'---------
+ * 6 6 4 2 N
+ *
+ * No matter if a packet is received untagged (Pu) or tagged (Pt),
+ * they will both have the same layout (Pd) when they are sent to the
+ * CPU. This is done by ignoring 802.3, replacing the ethertype field
+ * with more metadata, among which is a bit to signal if the original
+ * packet was tagged or not.
+ *
+ * Ethertype DSA
+ * -------------
+ * Uses the exact same tag format as regular DSA, but also includes a
+ * proper ethertype field (which the mv88e6xxx driver sets to
+ * ETH_P_EDSA/0xdada) followed by two zero bytes:
+ *
+ * .----.----.--------.--------.-----.----.---------
+ * | DA | SA | 0xdada | 0x0000 | DSA | ET | Payload ...
+ * '----'----'--------'--------'-----'----'---------
+ * 6 6 2 2 4 2 N
+ */
+
+#include <linux/dsa/mv88e6xxx.h>
+#include <linux/etherdevice.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+
+#include "tag.h"
+
+#define DSA_NAME "dsa"
+#define EDSA_NAME "edsa"
+
+#define DSA_HLEN 4
+
+/**
+ * enum dsa_cmd - DSA Command
+ * @DSA_CMD_TO_CPU: Set on packets that were trapped or mirrored to
+ * the CPU port. This is needed to implement control protocols,
+ * e.g. STP and LLDP, that must not allow those control packets to
+ * be switched according to the normal rules.
+ * @DSA_CMD_FROM_CPU: Used by the CPU to send a packet to a specific
+ * port, ignoring all the barriers that the switch normally
+ * enforces (VLANs, STP port states etc.). No source address
+ * learning takes place. "sudo send packet"
+ * @DSA_CMD_TO_SNIFFER: Set on the copies of packets that matched some
+ * user configured ingress or egress monitor criteria. These are
+ * forwarded by the switch tree to the user configured ingress or
+ * egress monitor port, which can be set to the CPU port or a
+ * regular port. If the destination is a regular port, the tag
+ * will be removed before egressing the port. If the destination
+ * is the CPU port, the tag will not be removed.
+ * @DSA_CMD_FORWARD: This tag is used on all bulk traffic passing
+ * through the switch tree, including the flows that are directed
+ * towards the CPU. Its device/port tuple encodes the original
+ * source port on which the packet ingressed. It can also be used
+ * on transmit by the CPU to defer the forwarding decision to the
+ * hardware, based on the current config of PVT/VTU/ATU
+ * etc. Source address learning takes places if enabled on the
+ * receiving DSA/CPU port.
+ */
+enum dsa_cmd {
+ DSA_CMD_TO_CPU = 0,
+ DSA_CMD_FROM_CPU = 1,
+ DSA_CMD_TO_SNIFFER = 2,
+ DSA_CMD_FORWARD = 3
+};
+
+/**
+ * enum dsa_code - TO_CPU Code
+ *
+ * @DSA_CODE_MGMT_TRAP: DA was classified as a management
+ * address. Typical examples include STP BPDUs and LLDP.
+ * @DSA_CODE_FRAME2REG: Response to a "remote management" request.
+ * @DSA_CODE_IGMP_MLD_TRAP: IGMP/MLD signaling.
+ * @DSA_CODE_POLICY_TRAP: Frame matched some policy configuration on
+ * the device. Typical examples are matching on DA/SA/VID and DHCP
+ * snooping.
+ * @DSA_CODE_ARP_MIRROR: The name says it all really.
+ * @DSA_CODE_POLICY_MIRROR: Same as @DSA_CODE_POLICY_TRAP, but the
+ * particular policy was set to trigger a mirror instead of a
+ * trap.
+ * @DSA_CODE_RESERVED_6: Unused on all devices up to at least 6393X.
+ * @DSA_CODE_RESERVED_7: Unused on all devices up to at least 6393X.
+ *
+ * A 3-bit code is used to relay why a particular frame was sent to
+ * the CPU. We only use this to determine if the packet was mirrored
+ * or trapped, i.e. whether the packet has been forwarded by hardware
+ * or not.
+ *
+ * This is the superset of all possible codes. Any particular device
+ * may only implement a subset.
+ */
+enum dsa_code {
+ DSA_CODE_MGMT_TRAP = 0,
+ DSA_CODE_FRAME2REG = 1,
+ DSA_CODE_IGMP_MLD_TRAP = 2,
+ DSA_CODE_POLICY_TRAP = 3,
+ DSA_CODE_ARP_MIRROR = 4,
+ DSA_CODE_POLICY_MIRROR = 5,
+ DSA_CODE_RESERVED_6 = 6,
+ DSA_CODE_RESERVED_7 = 7
+};
+
+static struct sk_buff *dsa_xmit_ll(struct sk_buff *skb, struct net_device *dev,
+ u8 extra)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct net_device *br_dev;
+ u8 tag_dev, tag_port;
+ enum dsa_cmd cmd;
+ u8 *dsa_header;
+
+ if (skb->offload_fwd_mark) {
+ unsigned int bridge_num = dsa_port_bridge_num_get(dp);
+ struct dsa_switch_tree *dst = dp->ds->dst;
+
+ cmd = DSA_CMD_FORWARD;
+
+ /* When offloading forwarding for a bridge, inject FORWARD
+ * packets on behalf of a virtual switch device with an index
+ * past the physical switches.
+ */
+ tag_dev = dst->last_switch + bridge_num;
+ tag_port = 0;
+ } else {
+ cmd = DSA_CMD_FROM_CPU;
+ tag_dev = dp->ds->index;
+ tag_port = dp->index;
+ }
+
+ br_dev = dsa_port_bridge_dev_get(dp);
+
+ /* If frame is already 802.1Q tagged, we can convert it to a DSA
+ * tag (avoiding a memmove), but only if the port is standalone
+ * (in which case we always send FROM_CPU) or if the port's
+ * bridge has VLAN filtering enabled (in which case the CPU port
+ * will be a member of the VLAN).
+ */
+ if (skb->protocol == htons(ETH_P_8021Q) &&
+ (!br_dev || br_vlan_enabled(br_dev))) {
+ if (extra) {
+ skb_push(skb, extra);
+ dsa_alloc_etype_header(skb, extra);
+ }
+
+ /* Construct tagged DSA tag from 802.1Q tag. */
+ dsa_header = dsa_etype_header_pos_tx(skb) + extra;
+ dsa_header[0] = (cmd << 6) | 0x20 | tag_dev;
+ dsa_header[1] = tag_port << 3;
+
+ /* Move CFI field from byte 2 to byte 1. */
+ if (dsa_header[2] & 0x10) {
+ dsa_header[1] |= 0x01;
+ dsa_header[2] &= ~0x10;
+ }
+ } else {
+ u16 vid;
+
+ vid = br_dev ? MV88E6XXX_VID_BRIDGED : MV88E6XXX_VID_STANDALONE;
+
+ skb_push(skb, DSA_HLEN + extra);
+ dsa_alloc_etype_header(skb, DSA_HLEN + extra);
+
+ /* Construct DSA header from untagged frame. */
+ dsa_header = dsa_etype_header_pos_tx(skb) + extra;
+
+ dsa_header[0] = (cmd << 6) | tag_dev;
+ dsa_header[1] = tag_port << 3;
+ dsa_header[2] = vid >> 8;
+ dsa_header[3] = vid & 0xff;
+ }
+
+ return skb;
+}
+
+static struct sk_buff *dsa_rcv_ll(struct sk_buff *skb, struct net_device *dev,
+ u8 extra)
+{
+ bool trap = false, trunk = false;
+ int source_device, source_port;
+ enum dsa_code code;
+ enum dsa_cmd cmd;
+ u8 *dsa_header;
+
+ /* The ethertype field is part of the DSA header. */
+ dsa_header = dsa_etype_header_pos_rx(skb);
+
+ cmd = dsa_header[0] >> 6;
+ switch (cmd) {
+ case DSA_CMD_FORWARD:
+ trunk = !!(dsa_header[1] & 4);
+ break;
+
+ case DSA_CMD_TO_CPU:
+ code = (dsa_header[1] & 0x6) | ((dsa_header[2] >> 4) & 1);
+
+ switch (code) {
+ case DSA_CODE_FRAME2REG:
+ /* Remote management is not implemented yet,
+ * drop.
+ */
+ return NULL;
+ case DSA_CODE_ARP_MIRROR:
+ case DSA_CODE_POLICY_MIRROR:
+ /* Mark mirrored packets to notify any upper
+ * device (like a bridge) that forwarding has
+ * already been done by hardware.
+ */
+ break;
+ case DSA_CODE_MGMT_TRAP:
+ case DSA_CODE_IGMP_MLD_TRAP:
+ case DSA_CODE_POLICY_TRAP:
+ /* Traps have, by definition, not been
+ * forwarded by hardware, so don't mark them.
+ */
+ trap = true;
+ break;
+ default:
+ /* Reserved code, this could be anything. Drop
+ * seems like the safest option.
+ */
+ return NULL;
+ }
+
+ break;
+
+ default:
+ return NULL;
+ }
+
+ source_device = dsa_header[0] & 0x1f;
+ source_port = (dsa_header[1] >> 3) & 0x1f;
+
+ if (trunk) {
+ struct dsa_port *cpu_dp = dev->dsa_ptr;
+ struct dsa_lag *lag;
+
+ /* The exact source port is not available in the tag,
+ * so we inject the frame directly on the upper
+ * team/bond.
+ */
+ lag = dsa_lag_by_id(cpu_dp->dst, source_port + 1);
+ skb->dev = lag ? lag->dev : NULL;
+ } else {
+ skb->dev = dsa_conduit_find_user(dev, source_device,
+ source_port);
+ }
+
+ if (!skb->dev)
+ return NULL;
+
+ /* When using LAG offload, skb->dev is not a DSA user interface,
+ * so we cannot call dsa_default_offload_fwd_mark and we need to
+ * special-case it.
+ */
+ if (trunk)
+ skb->offload_fwd_mark = true;
+ else if (!trap)
+ dsa_default_offload_fwd_mark(skb);
+
+ /* If the 'tagged' bit is set; convert the DSA tag to a 802.1Q
+ * tag, and delete the ethertype (extra) if applicable. If the
+ * 'tagged' bit is cleared; delete the DSA tag, and ethertype
+ * if applicable.
+ */
+ if (dsa_header[0] & 0x20) {
+ u8 new_header[4];
+
+ /* Insert 802.1Q ethertype and copy the VLAN-related
+ * fields, but clear the bit that will hold CFI (since
+ * DSA uses that bit location for another purpose).
+ */
+ new_header[0] = (ETH_P_8021Q >> 8) & 0xff;
+ new_header[1] = ETH_P_8021Q & 0xff;
+ new_header[2] = dsa_header[2] & ~0x10;
+ new_header[3] = dsa_header[3];
+
+ /* Move CFI bit from its place in the DSA header to
+ * its 802.1Q-designated place.
+ */
+ if (dsa_header[1] & 0x01)
+ new_header[2] |= 0x10;
+
+ /* Update packet checksum if skb is CHECKSUM_COMPLETE. */
+ if (skb->ip_summed == CHECKSUM_COMPLETE) {
+ __wsum c = skb->csum;
+ c = csum_add(c, csum_partial(new_header + 2, 2, 0));
+ c = csum_sub(c, csum_partial(dsa_header + 2, 2, 0));
+ skb->csum = c;
+ }
+
+ memcpy(dsa_header, new_header, DSA_HLEN);
+
+ if (extra)
+ dsa_strip_etype_header(skb, extra);
+ } else {
+ skb_pull_rcsum(skb, DSA_HLEN);
+ dsa_strip_etype_header(skb, DSA_HLEN + extra);
+ }
+
+ return skb;
+}
+
+#if IS_ENABLED(CONFIG_NET_DSA_TAG_DSA)
+
+static struct sk_buff *dsa_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ return dsa_xmit_ll(skb, dev, 0);
+}
+
+static struct sk_buff *dsa_rcv(struct sk_buff *skb, struct net_device *dev)
+{
+ if (unlikely(!pskb_may_pull(skb, DSA_HLEN)))
+ return NULL;
+
+ return dsa_rcv_ll(skb, dev, 0);
+}
+
+static const struct dsa_device_ops dsa_netdev_ops = {
+ .name = DSA_NAME,
+ .proto = DSA_TAG_PROTO_DSA,
+ .xmit = dsa_xmit,
+ .rcv = dsa_rcv,
+ .needed_headroom = DSA_HLEN,
+};
+
+DSA_TAG_DRIVER(dsa_netdev_ops);
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_DSA, DSA_NAME);
+#endif /* CONFIG_NET_DSA_TAG_DSA */
+
+#if IS_ENABLED(CONFIG_NET_DSA_TAG_EDSA)
+
+#define EDSA_HLEN 8
+
+static struct sk_buff *edsa_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ u8 *edsa_header;
+
+ skb = dsa_xmit_ll(skb, dev, EDSA_HLEN - DSA_HLEN);
+ if (!skb)
+ return NULL;
+
+ edsa_header = dsa_etype_header_pos_tx(skb);
+ edsa_header[0] = (ETH_P_EDSA >> 8) & 0xff;
+ edsa_header[1] = ETH_P_EDSA & 0xff;
+ edsa_header[2] = 0x00;
+ edsa_header[3] = 0x00;
+ return skb;
+}
+
+static struct sk_buff *edsa_rcv(struct sk_buff *skb, struct net_device *dev)
+{
+ if (unlikely(!pskb_may_pull(skb, EDSA_HLEN)))
+ return NULL;
+
+ skb_pull_rcsum(skb, EDSA_HLEN - DSA_HLEN);
+
+ return dsa_rcv_ll(skb, dev, EDSA_HLEN - DSA_HLEN);
+}
+
+static const struct dsa_device_ops edsa_netdev_ops = {
+ .name = EDSA_NAME,
+ .proto = DSA_TAG_PROTO_EDSA,
+ .xmit = edsa_xmit,
+ .rcv = edsa_rcv,
+ .needed_headroom = EDSA_HLEN,
+};
+
+DSA_TAG_DRIVER(edsa_netdev_ops);
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_EDSA, EDSA_NAME);
+#endif /* CONFIG_NET_DSA_TAG_EDSA */
+
+static struct dsa_tag_driver *dsa_tag_drivers[] = {
+#if IS_ENABLED(CONFIG_NET_DSA_TAG_DSA)
+ &DSA_TAG_DRIVER_NAME(dsa_netdev_ops),
+#endif
+#if IS_ENABLED(CONFIG_NET_DSA_TAG_EDSA)
+ &DSA_TAG_DRIVER_NAME(edsa_netdev_ops),
+#endif
+};
+
+module_dsa_tag_drivers(dsa_tag_drivers);
+
+MODULE_DESCRIPTION("DSA tag driver for Marvell switches using DSA headers");
+MODULE_LICENSE("GPL");
diff --git a/net/dsa/tag_gswip.c b/net/dsa/tag_gswip.c
new file mode 100644
index 000000000000..5fa436121087
--- /dev/null
+++ b/net/dsa/tag_gswip.c
@@ -0,0 +1,112 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Intel / Lantiq GSWIP V2.0 PMAC tag support
+ *
+ * Copyright (C) 2017 - 2018 Hauke Mehrtens <hauke@hauke-m.de>
+ */
+
+#include <linux/bitops.h>
+#include <linux/etherdevice.h>
+#include <linux/skbuff.h>
+#include <net/dsa.h>
+
+#include "tag.h"
+
+#define GSWIP_NAME "gswip"
+
+#define GSWIP_TX_HEADER_LEN 4
+
+/* special tag in TX path header */
+/* Byte 0 */
+#define GSWIP_TX_SLPID_SHIFT 0 /* source port ID */
+#define GSWIP_TX_SLPID_CPU 2
+#define GSWIP_TX_SLPID_APP1 3
+#define GSWIP_TX_SLPID_APP2 4
+#define GSWIP_TX_SLPID_APP3 5
+#define GSWIP_TX_SLPID_APP4 6
+#define GSWIP_TX_SLPID_APP5 7
+
+/* Byte 1 */
+#define GSWIP_TX_CRCGEN_DIS BIT(7)
+#define GSWIP_TX_DPID_SHIFT 0 /* destination group ID */
+#define GSWIP_TX_DPID_ELAN 0
+#define GSWIP_TX_DPID_EWAN 1
+#define GSWIP_TX_DPID_CPU 2
+#define GSWIP_TX_DPID_APP1 3
+#define GSWIP_TX_DPID_APP2 4
+#define GSWIP_TX_DPID_APP3 5
+#define GSWIP_TX_DPID_APP4 6
+#define GSWIP_TX_DPID_APP5 7
+
+/* Byte 2 */
+#define GSWIP_TX_PORT_MAP_EN BIT(7)
+#define GSWIP_TX_PORT_MAP_SEL BIT(6)
+#define GSWIP_TX_LRN_DIS BIT(5)
+#define GSWIP_TX_CLASS_EN BIT(4)
+#define GSWIP_TX_CLASS_SHIFT 0
+#define GSWIP_TX_CLASS_MASK GENMASK(3, 0)
+
+/* Byte 3 */
+#define GSWIP_TX_DPID_EN BIT(0)
+#define GSWIP_TX_PORT_MAP GENMASK(6, 1)
+
+#define GSWIP_RX_HEADER_LEN 8
+
+/* special tag in RX path header */
+/* Byte 7 */
+#define GSWIP_RX_SPPID_SHIFT 4
+#define GSWIP_RX_SPPID_MASK GENMASK(6, 4)
+
+static struct sk_buff *gswip_tag_xmit(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ u8 *gswip_tag;
+
+ skb_push(skb, GSWIP_TX_HEADER_LEN);
+
+ gswip_tag = skb->data;
+ gswip_tag[0] = GSWIP_TX_SLPID_CPU;
+ gswip_tag[1] = GSWIP_TX_DPID_ELAN;
+ gswip_tag[2] = GSWIP_TX_PORT_MAP_EN | GSWIP_TX_PORT_MAP_SEL;
+ gswip_tag[3] = FIELD_PREP(GSWIP_TX_PORT_MAP, dsa_xmit_port_mask(skb, dev));
+ gswip_tag[3] |= GSWIP_TX_DPID_EN;
+
+ return skb;
+}
+
+static struct sk_buff *gswip_tag_rcv(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ int port;
+ u8 *gswip_tag;
+
+ if (unlikely(!pskb_may_pull(skb, GSWIP_RX_HEADER_LEN)))
+ return NULL;
+
+ gswip_tag = skb->data - ETH_HLEN;
+
+ /* Get source port information */
+ port = (gswip_tag[7] & GSWIP_RX_SPPID_MASK) >> GSWIP_RX_SPPID_SHIFT;
+ skb->dev = dsa_conduit_find_user(dev, 0, port);
+ if (!skb->dev)
+ return NULL;
+
+ /* remove GSWIP tag */
+ skb_pull_rcsum(skb, GSWIP_RX_HEADER_LEN);
+
+ return skb;
+}
+
+static const struct dsa_device_ops gswip_netdev_ops = {
+ .name = GSWIP_NAME,
+ .proto = DSA_TAG_PROTO_GSWIP,
+ .xmit = gswip_tag_xmit,
+ .rcv = gswip_tag_rcv,
+ .needed_headroom = GSWIP_RX_HEADER_LEN,
+};
+
+MODULE_DESCRIPTION("DSA tag driver for Lantiq / Intel GSWIP switches");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_GSWIP, GSWIP_NAME);
+
+module_dsa_tag_driver(gswip_netdev_ops);
diff --git a/net/dsa/tag_hellcreek.c b/net/dsa/tag_hellcreek.c
new file mode 100644
index 000000000000..544ab15685a2
--- /dev/null
+++ b/net/dsa/tag_hellcreek.c
@@ -0,0 +1,73 @@
+// SPDX-License-Identifier: (GPL-2.0 OR MIT)
+/*
+ * net/dsa/tag_hellcreek.c - Hirschmann Hellcreek switch tag format handling
+ *
+ * Copyright (C) 2019,2020 Linutronix GmbH
+ * Author Kurt Kanzenbach <kurt@linutronix.de>
+ *
+ * Based on tag_ksz.c.
+ */
+
+#include <linux/skbuff.h>
+#include <net/dsa.h>
+
+#include "tag.h"
+
+#define HELLCREEK_NAME "hellcreek"
+
+#define HELLCREEK_TAG_LEN 1
+
+static struct sk_buff *hellcreek_xmit(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ u8 *tag;
+
+ /* Calculate checksums (if required) before adding the trailer tag to
+ * avoid including it in calculations. That would lead to wrong
+ * checksums after the switch strips the tag.
+ */
+ if (skb->ip_summed == CHECKSUM_PARTIAL &&
+ skb_checksum_help(skb))
+ return NULL;
+
+ /* Tag encoding */
+ tag = skb_put(skb, HELLCREEK_TAG_LEN);
+ *tag = dsa_xmit_port_mask(skb, dev);
+
+ return skb;
+}
+
+static struct sk_buff *hellcreek_rcv(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ /* Tag decoding */
+ u8 *tag = skb_tail_pointer(skb) - HELLCREEK_TAG_LEN;
+ unsigned int port = tag[0] & 0x03;
+
+ skb->dev = dsa_conduit_find_user(dev, 0, port);
+ if (!skb->dev) {
+ netdev_warn_once(dev, "Failed to get source port: %d\n", port);
+ return NULL;
+ }
+
+ if (pskb_trim_rcsum(skb, skb->len - HELLCREEK_TAG_LEN))
+ return NULL;
+
+ dsa_default_offload_fwd_mark(skb);
+
+ return skb;
+}
+
+static const struct dsa_device_ops hellcreek_netdev_ops = {
+ .name = HELLCREEK_NAME,
+ .proto = DSA_TAG_PROTO_HELLCREEK,
+ .xmit = hellcreek_xmit,
+ .rcv = hellcreek_rcv,
+ .needed_tailroom = HELLCREEK_TAG_LEN,
+};
+
+MODULE_DESCRIPTION("DSA tag driver for Hirschmann Hellcreek TSN switches");
+MODULE_LICENSE("Dual MIT/GPL");
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_HELLCREEK, HELLCREEK_NAME);
+
+module_dsa_tag_driver(hellcreek_netdev_ops);
diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c
new file mode 100644
index 000000000000..9170a0148cc4
--- /dev/null
+++ b/net/dsa/tag_ksz.c
@@ -0,0 +1,465 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * net/dsa/tag_ksz.c - Microchip KSZ Switch tag format handling
+ * Copyright (c) 2017 Microchip Technology
+ */
+
+#include <linux/dsa/ksz_common.h>
+#include <linux/etherdevice.h>
+#include <linux/list.h>
+#include <linux/ptp_classify.h>
+#include <net/dsa.h>
+
+#include "tag.h"
+
+#define KSZ8795_NAME "ksz8795"
+#define KSZ9477_NAME "ksz9477"
+#define KSZ9893_NAME "ksz9893"
+#define LAN937X_NAME "lan937x"
+
+/* Typically only one byte is used for tail tag. */
+#define KSZ_PTP_TAG_LEN 4
+#define KSZ_EGRESS_TAG_LEN 1
+#define KSZ_INGRESS_TAG_LEN 1
+
+#define KSZ_HWTS_EN 0
+
+struct ksz_tagger_private {
+ struct ksz_tagger_data data; /* Must be first */
+ unsigned long state;
+ struct kthread_worker *xmit_worker;
+};
+
+static struct ksz_tagger_private *
+ksz_tagger_private(struct dsa_switch *ds)
+{
+ return ds->tagger_data;
+}
+
+static void ksz_hwtstamp_set_state(struct dsa_switch *ds, bool on)
+{
+ struct ksz_tagger_private *priv = ksz_tagger_private(ds);
+
+ if (on)
+ set_bit(KSZ_HWTS_EN, &priv->state);
+ else
+ clear_bit(KSZ_HWTS_EN, &priv->state);
+}
+
+static void ksz_disconnect(struct dsa_switch *ds)
+{
+ struct ksz_tagger_private *priv = ds->tagger_data;
+
+ kthread_destroy_worker(priv->xmit_worker);
+ kfree(priv);
+ ds->tagger_data = NULL;
+}
+
+static int ksz_connect(struct dsa_switch *ds)
+{
+ struct ksz_tagger_data *tagger_data;
+ struct kthread_worker *xmit_worker;
+ struct ksz_tagger_private *priv;
+ int ret;
+
+ priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+ if (!priv)
+ return -ENOMEM;
+
+ xmit_worker = kthread_run_worker(0, "dsa%d:%d_xmit",
+ ds->dst->index, ds->index);
+ if (IS_ERR(xmit_worker)) {
+ ret = PTR_ERR(xmit_worker);
+ kfree(priv);
+ return ret;
+ }
+
+ priv->xmit_worker = xmit_worker;
+ /* Export functions for switch driver use */
+ tagger_data = &priv->data;
+ tagger_data->hwtstamp_set_state = ksz_hwtstamp_set_state;
+ ds->tagger_data = priv;
+
+ return 0;
+}
+
+static struct sk_buff *ksz_common_rcv(struct sk_buff *skb,
+ struct net_device *dev,
+ unsigned int port, unsigned int len)
+{
+ skb->dev = dsa_conduit_find_user(dev, 0, port);
+ if (!skb->dev)
+ return NULL;
+
+ if (pskb_trim_rcsum(skb, skb->len - len))
+ return NULL;
+
+ dsa_default_offload_fwd_mark(skb);
+
+ return skb;
+}
+
+/*
+ * For Ingress (Host -> KSZ8795), 1 byte is added before FCS.
+ * ---------------------------------------------------------------------------
+ * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|tag(1byte)|FCS(4bytes)
+ * ---------------------------------------------------------------------------
+ * tag : each bit represents port (eg, 0x01=port1, 0x02=port2, 0x10=port5)
+ *
+ * For Egress (KSZ8795 -> Host), 1 byte is added before FCS.
+ * ---------------------------------------------------------------------------
+ * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|tag0(1byte)|FCS(4bytes)
+ * ---------------------------------------------------------------------------
+ * tag0 : zero-based value represents port
+ * (eg, 0x0=port1, 0x2=port3, 0x3=port4)
+ */
+
+#define KSZ8795_TAIL_TAG_EG_PORT_M GENMASK(1, 0)
+#define KSZ8795_TAIL_TAG_OVERRIDE BIT(6)
+#define KSZ8795_TAIL_TAG_LOOKUP BIT(7)
+
+static struct sk_buff *ksz8795_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct ethhdr *hdr;
+ u8 *tag;
+
+ if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb))
+ return NULL;
+
+ /* Tag encoding */
+ tag = skb_put(skb, KSZ_INGRESS_TAG_LEN);
+ hdr = skb_eth_hdr(skb);
+
+ *tag = dsa_xmit_port_mask(skb, dev);
+ if (is_link_local_ether_addr(hdr->h_dest))
+ *tag |= KSZ8795_TAIL_TAG_OVERRIDE;
+
+ return skb;
+}
+
+static struct sk_buff *ksz8795_rcv(struct sk_buff *skb, struct net_device *dev)
+{
+ u8 *tag;
+
+ if (skb_linearize(skb))
+ return NULL;
+
+ tag = skb_tail_pointer(skb) - KSZ_EGRESS_TAG_LEN;
+
+ return ksz_common_rcv(skb, dev, tag[0] & KSZ8795_TAIL_TAG_EG_PORT_M,
+ KSZ_EGRESS_TAG_LEN);
+}
+
+static const struct dsa_device_ops ksz8795_netdev_ops = {
+ .name = KSZ8795_NAME,
+ .proto = DSA_TAG_PROTO_KSZ8795,
+ .xmit = ksz8795_xmit,
+ .rcv = ksz8795_rcv,
+ .needed_tailroom = KSZ_INGRESS_TAG_LEN,
+};
+
+DSA_TAG_DRIVER(ksz8795_netdev_ops);
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_KSZ8795, KSZ8795_NAME);
+
+/*
+ * For Ingress (Host -> KSZ9477), 2/6 bytes are added before FCS.
+ * ---------------------------------------------------------------------------
+ * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|ts(4bytes)|tag0(1byte)|tag1(1byte)|
+ * FCS(4bytes)
+ * ---------------------------------------------------------------------------
+ * ts : time stamp (Present only if PTP is enabled in the Hardware)
+ * tag0 : Prioritization (not used now)
+ * tag1 : each bit represents port (eg, 0x01=port1, 0x02=port2, 0x10=port5)
+ *
+ * For Egress (KSZ9477 -> Host), 1/5 bytes is added before FCS.
+ * ---------------------------------------------------------------------------
+ * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|ts(4bytes)|tag0(1byte)|FCS(4bytes)
+ * ---------------------------------------------------------------------------
+ * ts : time stamp (Present only if bit 7 of tag0 is set)
+ * tag0 : zero-based value represents port
+ * (eg, 0x00=port1, 0x02=port3, 0x06=port7)
+ */
+
+#define KSZ9477_INGRESS_TAG_LEN 2
+#define KSZ9477_PTP_TAG_LEN 4
+#define KSZ9477_PTP_TAG_INDICATION BIT(7)
+
+#define KSZ9477_TAIL_TAG_EG_PORT_M GENMASK(2, 0)
+#define KSZ9477_TAIL_TAG_PRIO GENMASK(8, 7)
+#define KSZ9477_TAIL_TAG_OVERRIDE BIT(9)
+#define KSZ9477_TAIL_TAG_LOOKUP BIT(10)
+
+static void ksz_rcv_timestamp(struct sk_buff *skb, u8 *tag)
+{
+ u8 *tstamp_raw = tag - KSZ_PTP_TAG_LEN;
+ ktime_t tstamp;
+
+ tstamp = ksz_decode_tstamp(get_unaligned_be32(tstamp_raw));
+ KSZ_SKB_CB(skb)->tstamp = tstamp;
+}
+
+/* Time stamp tag *needs* to be inserted if PTP is enabled in hardware.
+ * Regardless of Whether it is a PTP frame or not.
+ */
+static void ksz_xmit_timestamp(struct dsa_port *dp, struct sk_buff *skb)
+{
+ struct ksz_tagger_private *priv;
+ struct ptp_header *ptp_hdr;
+ unsigned int ptp_type;
+ u32 tstamp_raw = 0;
+ s64 correction;
+
+ priv = ksz_tagger_private(dp->ds);
+
+ if (!test_bit(KSZ_HWTS_EN, &priv->state))
+ return;
+
+ if (!KSZ_SKB_CB(skb)->update_correction)
+ goto output_tag;
+
+ ptp_type = KSZ_SKB_CB(skb)->ptp_type;
+
+ ptp_hdr = ptp_parse_header(skb, ptp_type);
+ if (!ptp_hdr)
+ goto output_tag;
+
+ correction = (s64)get_unaligned_be64(&ptp_hdr->correction);
+
+ if (correction < 0) {
+ struct timespec64 ts;
+
+ ts = ns_to_timespec64(-correction >> 16);
+ tstamp_raw = ((ts.tv_sec & 3) << 30) | ts.tv_nsec;
+
+ /* Set correction field to 0 and update UDP checksum */
+ ptp_header_update_correction(skb, ptp_type, ptp_hdr, 0);
+ }
+
+output_tag:
+ put_unaligned_be32(tstamp_raw, skb_put(skb, KSZ_PTP_TAG_LEN));
+}
+
+/* Defer transmit if waiting for egress time stamp is required. */
+static struct sk_buff *ksz_defer_xmit(struct dsa_port *dp, struct sk_buff *skb)
+{
+ struct ksz_tagger_data *tagger_data = ksz_tagger_data(dp->ds);
+ struct ksz_tagger_private *priv = ksz_tagger_private(dp->ds);
+ void (*xmit_work_fn)(struct kthread_work *work);
+ struct sk_buff *clone = KSZ_SKB_CB(skb)->clone;
+ struct ksz_deferred_xmit_work *xmit_work;
+ struct kthread_worker *xmit_worker;
+
+ if (!clone)
+ return skb; /* no deferred xmit for this packet */
+
+ xmit_work_fn = tagger_data->xmit_work_fn;
+ xmit_worker = priv->xmit_worker;
+
+ if (!xmit_work_fn || !xmit_worker)
+ return NULL;
+
+ xmit_work = kzalloc(sizeof(*xmit_work), GFP_ATOMIC);
+ if (!xmit_work)
+ return NULL;
+
+ kthread_init_work(&xmit_work->work, xmit_work_fn);
+ /* Increase refcount so the kfree_skb in dsa_user_xmit
+ * won't really free the packet.
+ */
+ xmit_work->dp = dp;
+ xmit_work->skb = skb_get(skb);
+
+ kthread_queue_work(xmit_worker, &xmit_work->work);
+
+ return NULL;
+}
+
+static struct sk_buff *ksz9477_xmit(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ u16 queue_mapping = skb_get_queue_mapping(skb);
+ u8 prio = netdev_txq_to_tc(dev, queue_mapping);
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct ethhdr *hdr;
+ __be16 *tag;
+ u16 val;
+
+ if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb))
+ return NULL;
+
+ /* Tag encoding */
+ ksz_xmit_timestamp(dp, skb);
+
+ tag = skb_put(skb, KSZ9477_INGRESS_TAG_LEN);
+ hdr = skb_eth_hdr(skb);
+
+ val = dsa_xmit_port_mask(skb, dev);
+ val |= FIELD_PREP(KSZ9477_TAIL_TAG_PRIO, prio);
+
+ if (is_link_local_ether_addr(hdr->h_dest))
+ val |= KSZ9477_TAIL_TAG_OVERRIDE;
+
+ *tag = cpu_to_be16(val);
+
+ return ksz_defer_xmit(dp, skb);
+}
+
+static struct sk_buff *ksz9477_rcv(struct sk_buff *skb, struct net_device *dev)
+{
+ unsigned int len = KSZ_EGRESS_TAG_LEN;
+ unsigned int port;
+ u8 *tag;
+
+ if (skb_linearize(skb))
+ return NULL;
+
+ /* Tag decoding */
+ tag = skb_tail_pointer(skb) - KSZ_EGRESS_TAG_LEN;
+ port = tag[0] & KSZ9477_TAIL_TAG_EG_PORT_M;
+
+ /* Extra 4-bytes PTP timestamp */
+ if (tag[0] & KSZ9477_PTP_TAG_INDICATION) {
+ ksz_rcv_timestamp(skb, tag);
+ len += KSZ_PTP_TAG_LEN;
+ }
+
+ return ksz_common_rcv(skb, dev, port, len);
+}
+
+static const struct dsa_device_ops ksz9477_netdev_ops = {
+ .name = KSZ9477_NAME,
+ .proto = DSA_TAG_PROTO_KSZ9477,
+ .xmit = ksz9477_xmit,
+ .rcv = ksz9477_rcv,
+ .connect = ksz_connect,
+ .disconnect = ksz_disconnect,
+ .needed_tailroom = KSZ9477_INGRESS_TAG_LEN + KSZ_PTP_TAG_LEN,
+};
+
+DSA_TAG_DRIVER(ksz9477_netdev_ops);
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_KSZ9477, KSZ9477_NAME);
+
+#define KSZ9893_TAIL_TAG_PRIO GENMASK(4, 3)
+#define KSZ9893_TAIL_TAG_OVERRIDE BIT(5)
+#define KSZ9893_TAIL_TAG_LOOKUP BIT(6)
+
+static struct sk_buff *ksz9893_xmit(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ u16 queue_mapping = skb_get_queue_mapping(skb);
+ u8 prio = netdev_txq_to_tc(dev, queue_mapping);
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct ethhdr *hdr;
+ u8 *tag;
+
+ if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb))
+ return NULL;
+
+ /* Tag encoding */
+ ksz_xmit_timestamp(dp, skb);
+
+ tag = skb_put(skb, KSZ_INGRESS_TAG_LEN);
+ hdr = skb_eth_hdr(skb);
+
+ *tag = dsa_xmit_port_mask(skb, dev);
+ *tag |= FIELD_PREP(KSZ9893_TAIL_TAG_PRIO, prio);
+
+ if (is_link_local_ether_addr(hdr->h_dest))
+ *tag |= KSZ9893_TAIL_TAG_OVERRIDE;
+
+ return ksz_defer_xmit(dp, skb);
+}
+
+static const struct dsa_device_ops ksz9893_netdev_ops = {
+ .name = KSZ9893_NAME,
+ .proto = DSA_TAG_PROTO_KSZ9893,
+ .xmit = ksz9893_xmit,
+ .rcv = ksz9477_rcv,
+ .connect = ksz_connect,
+ .disconnect = ksz_disconnect,
+ .needed_tailroom = KSZ_INGRESS_TAG_LEN + KSZ_PTP_TAG_LEN,
+};
+
+DSA_TAG_DRIVER(ksz9893_netdev_ops);
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_KSZ9893, KSZ9893_NAME);
+
+/* For xmit, 2/6 bytes are added before FCS.
+ * ---------------------------------------------------------------------------
+ * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|ts(4bytes)|tag0(1byte)|tag1(1byte)|
+ * FCS(4bytes)
+ * ---------------------------------------------------------------------------
+ * ts : time stamp (Present only if PTP is enabled in the Hardware)
+ * tag0 : represents tag override, lookup and valid
+ * tag1 : each bit represents port (eg, 0x01=port1, 0x02=port2, 0x80=port8)
+ *
+ * For rcv, 1/5 bytes is added before FCS.
+ * ---------------------------------------------------------------------------
+ * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|ts(4bytes)|tag0(1byte)|FCS(4bytes)
+ * ---------------------------------------------------------------------------
+ * ts : time stamp (Present only if bit 7 of tag0 is set)
+ * tag0 : zero-based value represents port
+ * (eg, 0x00=port1, 0x02=port3, 0x07=port8)
+ */
+#define LAN937X_EGRESS_TAG_LEN 2
+
+#define LAN937X_TAIL_TAG_BLOCKING_OVERRIDE BIT(11)
+#define LAN937X_TAIL_TAG_LOOKUP BIT(12)
+#define LAN937X_TAIL_TAG_VALID BIT(13)
+#define LAN937X_TAIL_TAG_PRIO GENMASK(10, 8)
+#define LAN937X_TAIL_TAG_PORT_MASK 7
+
+static struct sk_buff *lan937x_xmit(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ u16 queue_mapping = skb_get_queue_mapping(skb);
+ u8 prio = netdev_txq_to_tc(dev, queue_mapping);
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ const struct ethhdr *hdr = eth_hdr(skb);
+ __be16 *tag;
+ u16 val;
+
+ if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb))
+ return NULL;
+
+ ksz_xmit_timestamp(dp, skb);
+
+ tag = skb_put(skb, LAN937X_EGRESS_TAG_LEN);
+
+ val = dsa_xmit_port_mask(skb, dev);
+ val |= FIELD_PREP(LAN937X_TAIL_TAG_PRIO, prio);
+
+ if (is_link_local_ether_addr(hdr->h_dest))
+ val |= LAN937X_TAIL_TAG_BLOCKING_OVERRIDE;
+
+ /* Tail tag valid bit - This bit should always be set by the CPU */
+ val |= LAN937X_TAIL_TAG_VALID;
+
+ put_unaligned_be16(val, tag);
+
+ return ksz_defer_xmit(dp, skb);
+}
+
+static const struct dsa_device_ops lan937x_netdev_ops = {
+ .name = LAN937X_NAME,
+ .proto = DSA_TAG_PROTO_LAN937X,
+ .xmit = lan937x_xmit,
+ .rcv = ksz9477_rcv,
+ .connect = ksz_connect,
+ .disconnect = ksz_disconnect,
+ .needed_tailroom = LAN937X_EGRESS_TAG_LEN + KSZ_PTP_TAG_LEN,
+};
+
+DSA_TAG_DRIVER(lan937x_netdev_ops);
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_LAN937X, LAN937X_NAME);
+
+static struct dsa_tag_driver *dsa_tag_driver_array[] = {
+ &DSA_TAG_DRIVER_NAME(ksz8795_netdev_ops),
+ &DSA_TAG_DRIVER_NAME(ksz9477_netdev_ops),
+ &DSA_TAG_DRIVER_NAME(ksz9893_netdev_ops),
+ &DSA_TAG_DRIVER_NAME(lan937x_netdev_ops),
+};
+
+module_dsa_tag_drivers(dsa_tag_driver_array);
+
+MODULE_DESCRIPTION("DSA tag driver for Microchip 8795/937x/9477/9893 families of switches");
+MODULE_LICENSE("GPL");
diff --git a/net/dsa/tag_lan9303.c b/net/dsa/tag_lan9303.c
new file mode 100644
index 000000000000..258e5d7dc5ef
--- /dev/null
+++ b/net/dsa/tag_lan9303.c
@@ -0,0 +1,126 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2017 Pengutronix, Juergen Borleis <jbe@pengutronix.de>
+ */
+#include <linux/dsa/lan9303.h>
+#include <linux/etherdevice.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+
+#include "tag.h"
+
+/* To define the outgoing port and to discover the incoming port a regular
+ * VLAN tag is used by the LAN9303. But its VID meaning is 'special':
+ *
+ * Dest MAC Src MAC TAG Type
+ * ...| 1 2 3 4 5 6 | 1 2 3 4 5 6 | 1 2 3 4 | 1 2 |...
+ * |<------->|
+ * TAG:
+ * |<------------->|
+ * | 1 2 | 3 4 |
+ * TPID VID
+ * 0x8100
+ *
+ * VID bit 3 indicates a request for an ALR lookup.
+ *
+ * If VID bit 3 is zero, then bits 0 and 1 specify the destination port
+ * (0, 1, 2) or broadcast (3) or the source port (1, 2).
+ *
+ * VID bit 4 is used to specify if the STP port state should be overridden.
+ * Required when no forwarding between the external ports should happen.
+ */
+
+#define LAN9303_NAME "lan9303"
+
+#define LAN9303_TAG_LEN 4
+# define LAN9303_TAG_TX_USE_ALR BIT(3)
+# define LAN9303_TAG_TX_STP_OVERRIDE BIT(4)
+# define LAN9303_TAG_RX_IGMP BIT(3)
+# define LAN9303_TAG_RX_STP BIT(4)
+# define LAN9303_TAG_RX_TRAPPED_TO_CPU (LAN9303_TAG_RX_IGMP | \
+ LAN9303_TAG_RX_STP)
+
+/* Decide whether to transmit using ALR lookup, or transmit directly to
+ * port using tag. ALR learning is performed only when using ALR lookup.
+ * If the two external ports are bridged and the frame is unicast,
+ * then use ALR lookup to allow ALR learning on CPU port.
+ * Otherwise transmit directly to port with STP state override.
+ * See also: lan9303_separate_ports() and lan9303.pdf 6.4.10.1
+ */
+static int lan9303_xmit_use_arl(struct dsa_port *dp, u8 *dest_addr)
+{
+ struct lan9303 *chip = dp->ds->priv;
+
+ return chip->is_bridged && !is_multicast_ether_addr(dest_addr);
+}
+
+static struct sk_buff *lan9303_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ __be16 *lan9303_tag;
+ u16 tag;
+
+ /* provide 'LAN9303_TAG_LEN' bytes additional space */
+ skb_push(skb, LAN9303_TAG_LEN);
+
+ /* make room between MACs and Ether-Type */
+ dsa_alloc_etype_header(skb, LAN9303_TAG_LEN);
+
+ lan9303_tag = dsa_etype_header_pos_tx(skb);
+
+ tag = lan9303_xmit_use_arl(dp, skb->data) ?
+ LAN9303_TAG_TX_USE_ALR :
+ dp->index | LAN9303_TAG_TX_STP_OVERRIDE;
+ lan9303_tag[0] = htons(ETH_P_8021Q);
+ lan9303_tag[1] = htons(tag);
+
+ return skb;
+}
+
+static struct sk_buff *lan9303_rcv(struct sk_buff *skb, struct net_device *dev)
+{
+ u16 lan9303_tag1;
+ unsigned int source_port;
+
+ if (unlikely(!pskb_may_pull(skb, LAN9303_TAG_LEN))) {
+ dev_warn_ratelimited(&dev->dev,
+ "Dropping packet, cannot pull\n");
+ return NULL;
+ }
+
+ if (skb_vlan_tag_present(skb)) {
+ lan9303_tag1 = skb_vlan_tag_get(skb);
+ __vlan_hwaccel_clear_tag(skb);
+ } else {
+ skb_push_rcsum(skb, ETH_HLEN);
+ __skb_vlan_pop(skb, &lan9303_tag1);
+ skb_pull_rcsum(skb, ETH_HLEN);
+ }
+
+ source_port = lan9303_tag1 & 0x3;
+
+ skb->dev = dsa_conduit_find_user(dev, 0, source_port);
+ if (!skb->dev) {
+ dev_warn_ratelimited(&dev->dev, "Dropping packet due to invalid source port\n");
+ return NULL;
+ }
+
+ if (!(lan9303_tag1 & LAN9303_TAG_RX_TRAPPED_TO_CPU))
+ dsa_default_offload_fwd_mark(skb);
+
+ return skb;
+}
+
+static const struct dsa_device_ops lan9303_netdev_ops = {
+ .name = LAN9303_NAME,
+ .proto = DSA_TAG_PROTO_LAN9303,
+ .xmit = lan9303_xmit,
+ .rcv = lan9303_rcv,
+ .needed_headroom = LAN9303_TAG_LEN,
+};
+
+MODULE_DESCRIPTION("DSA tag driver for SMSC/Microchip LAN9303 family of switches");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_LAN9303, LAN9303_NAME);
+
+module_dsa_tag_driver(lan9303_netdev_ops);
diff --git a/net/dsa/tag_mtk.c b/net/dsa/tag_mtk.c
new file mode 100644
index 000000000000..dea3eecaf093
--- /dev/null
+++ b/net/dsa/tag_mtk.c
@@ -0,0 +1,110 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Mediatek DSA Tag support
+ * Copyright (C) 2017 Landen Chao <landen.chao@mediatek.com>
+ * Sean Wang <sean.wang@mediatek.com>
+ */
+
+#include <linux/etherdevice.h>
+#include <linux/if_vlan.h>
+
+#include "tag.h"
+
+#define MTK_NAME "mtk"
+
+#define MTK_HDR_LEN 4
+#define MTK_HDR_XMIT_UNTAGGED 0
+#define MTK_HDR_XMIT_TAGGED_TPID_8100 1
+#define MTK_HDR_XMIT_TAGGED_TPID_88A8 2
+#define MTK_HDR_RECV_SOURCE_PORT_MASK GENMASK(2, 0)
+#define MTK_HDR_XMIT_DP_BIT_MASK GENMASK(5, 0)
+#define MTK_HDR_XMIT_SA_DIS BIT(6)
+
+static struct sk_buff *mtk_tag_xmit(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ u8 xmit_tpid;
+ u8 *mtk_tag;
+
+ skb_set_queue_mapping(skb, dp->index);
+
+ /* Build the special tag after the MAC Source Address. If VLAN header
+ * is present, it's required that VLAN header and special tag is
+ * being combined. Only in this way we can allow the switch can parse
+ * the both special and VLAN tag at the same time and then look up VLAN
+ * table with VID.
+ */
+ switch (skb->protocol) {
+ case htons(ETH_P_8021Q):
+ xmit_tpid = MTK_HDR_XMIT_TAGGED_TPID_8100;
+ break;
+ case htons(ETH_P_8021AD):
+ xmit_tpid = MTK_HDR_XMIT_TAGGED_TPID_88A8;
+ break;
+ default:
+ xmit_tpid = MTK_HDR_XMIT_UNTAGGED;
+ skb_push(skb, MTK_HDR_LEN);
+ dsa_alloc_etype_header(skb, MTK_HDR_LEN);
+ }
+
+ mtk_tag = dsa_etype_header_pos_tx(skb);
+
+ /* Mark tag attribute on special tag insertion to notify hardware
+ * whether that's a combined special tag with 802.1Q header.
+ */
+ mtk_tag[0] = xmit_tpid;
+ mtk_tag[1] = FIELD_PREP(MTK_HDR_XMIT_DP_BIT_MASK,
+ dsa_xmit_port_mask(skb, dev));
+
+ /* Tag control information is kept for 802.1Q */
+ if (xmit_tpid == MTK_HDR_XMIT_UNTAGGED) {
+ mtk_tag[2] = 0;
+ mtk_tag[3] = 0;
+ }
+
+ return skb;
+}
+
+static struct sk_buff *mtk_tag_rcv(struct sk_buff *skb, struct net_device *dev)
+{
+ u16 hdr;
+ int port;
+ __be16 *phdr;
+
+ if (unlikely(!pskb_may_pull(skb, MTK_HDR_LEN)))
+ return NULL;
+
+ phdr = dsa_etype_header_pos_rx(skb);
+ hdr = ntohs(*phdr);
+
+ /* Remove MTK tag and recalculate checksum. */
+ skb_pull_rcsum(skb, MTK_HDR_LEN);
+
+ dsa_strip_etype_header(skb, MTK_HDR_LEN);
+
+ /* Get source port information */
+ port = (hdr & MTK_HDR_RECV_SOURCE_PORT_MASK);
+
+ skb->dev = dsa_conduit_find_user(dev, 0, port);
+ if (!skb->dev)
+ return NULL;
+
+ dsa_default_offload_fwd_mark(skb);
+
+ return skb;
+}
+
+static const struct dsa_device_ops mtk_netdev_ops = {
+ .name = MTK_NAME,
+ .proto = DSA_TAG_PROTO_MTK,
+ .xmit = mtk_tag_xmit,
+ .rcv = mtk_tag_rcv,
+ .needed_headroom = MTK_HDR_LEN,
+};
+
+MODULE_DESCRIPTION("DSA tag driver for Mediatek switches");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_MTK, MTK_NAME);
+
+module_dsa_tag_driver(mtk_netdev_ops);
diff --git a/net/dsa/tag_mxl-gsw1xx.c b/net/dsa/tag_mxl-gsw1xx.c
new file mode 100644
index 000000000000..60f7c445e656
--- /dev/null
+++ b/net/dsa/tag_mxl-gsw1xx.c
@@ -0,0 +1,117 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * DSA driver Special Tag support for MaxLinear GSW1xx switch chips
+ *
+ * Copyright (C) 2025 Daniel Golle <daniel@makrotopia.org>
+ * Copyright (C) 2023 - 2024 MaxLinear Inc.
+ */
+
+#include <linux/bitops.h>
+#include <linux/etherdevice.h>
+#include <linux/skbuff.h>
+#include <net/dsa.h>
+
+#include "tag.h"
+
+/* To define the outgoing port and to discover the incoming port a special
+ * tag is used by the GSW1xx.
+ *
+ * Dest MAC Src MAC special TAG EtherType
+ * ...| 1 2 3 4 5 6 | 1 2 3 4 5 6 | 1 2 3 4 5 6 7 8 | 1 2 |...
+ * |<--------------->|
+ */
+
+#define GSW1XX_TAG_NAME "gsw1xx"
+
+/* special tag header length (RX and TX) */
+#define GSW1XX_HEADER_LEN 8
+
+/* Word 0 = Ethertype -> 0x88C3 */
+
+/* Word 1 */
+#define GSW1XX_TX_PORT_MAP GENMASK(7, 0)
+#define GSW1XX_TX_PORT_MAP_EN BIT(15)
+#define GSW1XX_TX_CLASS_EN BIT(14)
+#define GSW1XX_TX_TIME_STAMP_EN BIT(13)
+#define GSW1XX_TX_LRN_DIS BIT(12)
+#define GSW1XX_TX_CLASS GENMASK(11, 8)
+
+/* special tag in RX path header */
+/* Word 2 */
+#define GSW1XX_RX_PORT_MAP GENMASK(15, 8)
+
+static struct sk_buff *gsw1xx_tag_xmit(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ __be16 *gsw1xx_tag;
+ u16 tag;
+
+ /* provide additional space 'GSW1XX_HEADER_LEN' bytes */
+ skb_push(skb, GSW1XX_HEADER_LEN);
+
+ /* add space between MAC address and Ethertype */
+ dsa_alloc_etype_header(skb, GSW1XX_HEADER_LEN);
+
+ /* special tag ingress */
+ gsw1xx_tag = dsa_etype_header_pos_tx(skb);
+ gsw1xx_tag[0] = htons(ETH_P_MXLGSW);
+
+ tag = FIELD_PREP(GSW1XX_TX_PORT_MAP, dsa_xmit_port_mask(skb, dev)) |
+ GSW1XX_TX_PORT_MAP_EN | GSW1XX_TX_LRN_DIS;
+ gsw1xx_tag[1] = htons(tag);
+ gsw1xx_tag[2] = 0;
+ gsw1xx_tag[3] = 0;
+
+ return skb;
+}
+
+static struct sk_buff *gsw1xx_tag_rcv(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ int port;
+ __be16 *gsw1xx_tag;
+
+ if (unlikely(!pskb_may_pull(skb, GSW1XX_HEADER_LEN))) {
+ dev_warn_ratelimited(&dev->dev, "Dropping packet, cannot pull SKB\n");
+ return NULL;
+ }
+
+ gsw1xx_tag = dsa_etype_header_pos_rx(skb);
+
+ if (unlikely(ntohs(gsw1xx_tag[0]) != ETH_P_MXLGSW)) {
+ dev_warn_ratelimited(&dev->dev, "Dropping packet due to invalid special tag\n");
+ dev_warn_ratelimited(&dev->dev, "Tag: %8ph\n", gsw1xx_tag);
+ return NULL;
+ }
+
+ /* Get source port information */
+ port = FIELD_GET(GSW1XX_RX_PORT_MAP, ntohs(gsw1xx_tag[1]));
+ skb->dev = dsa_conduit_find_user(dev, 0, port);
+ if (!skb->dev) {
+ dev_warn_ratelimited(&dev->dev, "Dropping packet due to invalid source port\n");
+ dev_warn_ratelimited(&dev->dev, "Tag: %8ph\n", gsw1xx_tag);
+ return NULL;
+ }
+
+ /* remove the GSW1xx special tag between MAC addresses and the current
+ * ethertype field.
+ */
+ skb_pull_rcsum(skb, GSW1XX_HEADER_LEN);
+ dsa_strip_etype_header(skb, GSW1XX_HEADER_LEN);
+
+ return skb;
+}
+
+static const struct dsa_device_ops gsw1xx_netdev_ops = {
+ .name = GSW1XX_TAG_NAME,
+ .proto = DSA_TAG_PROTO_MXL_GSW1XX,
+ .xmit = gsw1xx_tag_xmit,
+ .rcv = gsw1xx_tag_rcv,
+ .needed_headroom = GSW1XX_HEADER_LEN,
+};
+
+MODULE_DESCRIPTION("DSA tag driver for MaxLinear GSW1xx 8 byte protocol");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_MXL_GSW1XX, GSW1XX_TAG_NAME);
+
+module_dsa_tag_driver(gsw1xx_netdev_ops);
diff --git a/net/dsa/tag_none.c b/net/dsa/tag_none.c
new file mode 100644
index 000000000000..e9c9670a9c44
--- /dev/null
+++ b/net/dsa/tag_none.c
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * net/dsa/tag_none.c - Traffic handling for switches with no tag
+ * Copyright (c) 2008-2009 Marvell Semiconductor
+ * Copyright (c) 2013 Florian Fainelli <florian@openwrt.org>
+ *
+ * WARNING: do not use this for new switches. In case of no hardware
+ * tagging support, look at tag_8021q.c instead.
+ */
+
+#include "tag.h"
+
+#define NONE_NAME "none"
+
+static struct sk_buff *dsa_user_notag_xmit(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ /* Just return the original SKB */
+ return skb;
+}
+
+static const struct dsa_device_ops none_ops = {
+ .name = NONE_NAME,
+ .proto = DSA_TAG_PROTO_NONE,
+ .xmit = dsa_user_notag_xmit,
+};
+
+module_dsa_tag_driver(none_ops);
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_NONE, NONE_NAME);
+MODULE_DESCRIPTION("DSA no-op tag driver");
+MODULE_LICENSE("GPL");
diff --git a/net/dsa/tag_ocelot.c b/net/dsa/tag_ocelot.c
new file mode 100644
index 000000000000..3405def79c2d
--- /dev/null
+++ b/net/dsa/tag_ocelot.c
@@ -0,0 +1,186 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright 2019 NXP
+ */
+#include <linux/dsa/ocelot.h>
+
+#include "tag.h"
+
+#define OCELOT_NAME "ocelot"
+#define SEVILLE_NAME "seville"
+
+static void ocelot_xmit_common(struct sk_buff *skb, struct net_device *netdev,
+ __be32 ifh_prefix, void **ifh)
+{
+ struct dsa_port *dp = dsa_user_to_port(netdev);
+ struct dsa_switch *ds = dp->ds;
+ u64 vlan_tci, tag_type;
+ void *injection;
+ __be32 *prefix;
+ u32 rew_op = 0;
+ u64 qos_class;
+
+ ocelot_xmit_get_vlan_info(skb, dsa_port_bridge_dev_get(dp), &vlan_tci,
+ &tag_type);
+
+ qos_class = netdev_get_num_tc(netdev) ?
+ netdev_get_prio_tc_map(netdev, skb->priority) : skb->priority;
+
+ injection = skb_push(skb, OCELOT_TAG_LEN);
+ prefix = skb_push(skb, OCELOT_SHORT_PREFIX_LEN);
+
+ *prefix = ifh_prefix;
+ memset(injection, 0, OCELOT_TAG_LEN);
+ ocelot_ifh_set_bypass(injection, 1);
+ ocelot_ifh_set_src(injection, ds->num_ports);
+ ocelot_ifh_set_qos_class(injection, qos_class);
+ ocelot_ifh_set_vlan_tci(injection, vlan_tci);
+ ocelot_ifh_set_tag_type(injection, tag_type);
+
+ rew_op = ocelot_ptp_rew_op(skb);
+ if (rew_op)
+ ocelot_ifh_set_rew_op(injection, rew_op);
+
+ *ifh = injection;
+}
+
+static struct sk_buff *ocelot_xmit(struct sk_buff *skb,
+ struct net_device *netdev)
+{
+ void *injection;
+
+ ocelot_xmit_common(skb, netdev, cpu_to_be32(0x8880000a), &injection);
+ ocelot_ifh_set_dest(injection, dsa_xmit_port_mask(skb, netdev));
+
+ return skb;
+}
+
+static struct sk_buff *seville_xmit(struct sk_buff *skb,
+ struct net_device *netdev)
+{
+ void *injection;
+
+ ocelot_xmit_common(skb, netdev, cpu_to_be32(0x88800005), &injection);
+ seville_ifh_set_dest(injection, dsa_xmit_port_mask(skb, netdev));
+
+ return skb;
+}
+
+static struct sk_buff *ocelot_rcv(struct sk_buff *skb,
+ struct net_device *netdev)
+{
+ u64 src_port, qos_class;
+ u64 vlan_tci, tag_type;
+ u8 *start = skb->data;
+ struct dsa_port *dp;
+ u8 *extraction;
+ u16 vlan_tpid;
+ u64 rew_val;
+
+ /* Revert skb->data by the amount consumed by the DSA conduit,
+ * so it points to the beginning of the frame.
+ */
+ skb_push(skb, ETH_HLEN);
+ /* We don't care about the short prefix, it is just for easy entrance
+ * into the DSA conduit's RX filter. Discard it now by moving it into
+ * the headroom.
+ */
+ skb_pull(skb, OCELOT_SHORT_PREFIX_LEN);
+ /* And skb->data now points to the extraction frame header.
+ * Keep a pointer to it.
+ */
+ extraction = skb->data;
+ /* Now the EFH is part of the headroom as well */
+ skb_pull(skb, OCELOT_TAG_LEN);
+ /* Reset the pointer to the real MAC header */
+ skb_reset_mac_header(skb);
+ skb_reset_mac_len(skb);
+ /* And move skb->data to the correct location again */
+ skb_pull(skb, ETH_HLEN);
+
+ /* Remove from inet csum the extraction header */
+ skb_postpull_rcsum(skb, start, OCELOT_TOTAL_TAG_LEN);
+
+ ocelot_xfh_get_src_port(extraction, &src_port);
+ ocelot_xfh_get_qos_class(extraction, &qos_class);
+ ocelot_xfh_get_tag_type(extraction, &tag_type);
+ ocelot_xfh_get_vlan_tci(extraction, &vlan_tci);
+ ocelot_xfh_get_rew_val(extraction, &rew_val);
+
+ skb->dev = dsa_conduit_find_user(netdev, 0, src_port);
+ if (!skb->dev)
+ /* The switch will reflect back some frames sent through
+ * sockets opened on the bare DSA conduit. These will come back
+ * with src_port equal to the index of the CPU port, for which
+ * there is no user registered. So don't print any error
+ * message here (ignore and drop those frames).
+ */
+ return NULL;
+
+ dsa_default_offload_fwd_mark(skb);
+ skb->priority = qos_class;
+ OCELOT_SKB_CB(skb)->tstamp_lo = rew_val;
+
+ /* Ocelot switches copy frames unmodified to the CPU. However, it is
+ * possible for the user to request a VLAN modification through
+ * VCAP_IS1_ACT_VID_REPLACE_ENA. In this case, what will happen is that
+ * the VLAN ID field from the Extraction Header gets updated, but the
+ * 802.1Q header does not (the classified VLAN only becomes visible on
+ * egress through the "port tag" of front-panel ports).
+ * So, for traffic extracted by the CPU, we want to pick up the
+ * classified VLAN and manually replace the existing 802.1Q header from
+ * the packet with it, so that the operating system is always up to
+ * date with the result of tc-vlan actions.
+ * NOTE: In VLAN-unaware mode, we don't want to do that, we want the
+ * frame to remain unmodified, because the classified VLAN is always
+ * equal to the pvid of the ingress port and should not be used for
+ * processing.
+ */
+ dp = dsa_user_to_port(skb->dev);
+ vlan_tpid = tag_type ? ETH_P_8021AD : ETH_P_8021Q;
+
+ if (dsa_port_is_vlan_filtering(dp) &&
+ eth_hdr(skb)->h_proto == htons(vlan_tpid)) {
+ u16 dummy_vlan_tci;
+
+ skb_push_rcsum(skb, ETH_HLEN);
+ __skb_vlan_pop(skb, &dummy_vlan_tci);
+ skb_pull_rcsum(skb, ETH_HLEN);
+ __vlan_hwaccel_put_tag(skb, htons(vlan_tpid), vlan_tci);
+ }
+
+ return skb;
+}
+
+static const struct dsa_device_ops ocelot_netdev_ops = {
+ .name = OCELOT_NAME,
+ .proto = DSA_TAG_PROTO_OCELOT,
+ .xmit = ocelot_xmit,
+ .rcv = ocelot_rcv,
+ .needed_headroom = OCELOT_TOTAL_TAG_LEN,
+ .promisc_on_conduit = true,
+};
+
+DSA_TAG_DRIVER(ocelot_netdev_ops);
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_OCELOT, OCELOT_NAME);
+
+static const struct dsa_device_ops seville_netdev_ops = {
+ .name = SEVILLE_NAME,
+ .proto = DSA_TAG_PROTO_SEVILLE,
+ .xmit = seville_xmit,
+ .rcv = ocelot_rcv,
+ .needed_headroom = OCELOT_TOTAL_TAG_LEN,
+ .promisc_on_conduit = true,
+};
+
+DSA_TAG_DRIVER(seville_netdev_ops);
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_SEVILLE, SEVILLE_NAME);
+
+static struct dsa_tag_driver *ocelot_tag_driver_array[] = {
+ &DSA_TAG_DRIVER_NAME(ocelot_netdev_ops),
+ &DSA_TAG_DRIVER_NAME(seville_netdev_ops),
+};
+
+module_dsa_tag_drivers(ocelot_tag_driver_array);
+
+MODULE_DESCRIPTION("DSA tag driver for Ocelot family of switches, using NPI port");
+MODULE_LICENSE("GPL v2");
diff --git a/net/dsa/tag_ocelot_8021q.c b/net/dsa/tag_ocelot_8021q.c
new file mode 100644
index 000000000000..3929584791e4
--- /dev/null
+++ b/net/dsa/tag_ocelot_8021q.c
@@ -0,0 +1,140 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright 2020-2021 NXP
+ *
+ * An implementation of the software-defined tag_8021q.c tagger format, which
+ * also preserves full functionality under a vlan_filtering bridge. It does
+ * this by using the TCAM engines for:
+ * - pushing the RX VLAN as a second, outer tag, on egress towards the CPU port
+ * - redirecting towards the correct front port based on TX VLAN and popping
+ * that on egress
+ */
+#include <linux/dsa/8021q.h>
+#include <linux/dsa/ocelot.h>
+
+#include "tag.h"
+#include "tag_8021q.h"
+
+#define OCELOT_8021Q_NAME "ocelot-8021q"
+
+struct ocelot_8021q_tagger_private {
+ struct ocelot_8021q_tagger_data data; /* Must be first */
+ struct kthread_worker *xmit_worker;
+};
+
+static struct sk_buff *ocelot_defer_xmit(struct dsa_port *dp,
+ struct sk_buff *skb)
+{
+ struct ocelot_8021q_tagger_private *priv = dp->ds->tagger_data;
+ struct ocelot_8021q_tagger_data *data = &priv->data;
+ void (*xmit_work_fn)(struct kthread_work *work);
+ struct felix_deferred_xmit_work *xmit_work;
+ struct kthread_worker *xmit_worker;
+
+ xmit_work_fn = data->xmit_work_fn;
+ xmit_worker = priv->xmit_worker;
+
+ if (!xmit_work_fn || !xmit_worker)
+ return NULL;
+
+ /* PTP over IP packets need UDP checksumming. We may have inherited
+ * NETIF_F_HW_CSUM from the DSA conduit, but these packets are not sent
+ * through the DSA conduit, so calculate the checksum here.
+ */
+ if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb))
+ return NULL;
+
+ xmit_work = kzalloc(sizeof(*xmit_work), GFP_ATOMIC);
+ if (!xmit_work)
+ return NULL;
+
+ /* Calls felix_port_deferred_xmit in felix.c */
+ kthread_init_work(&xmit_work->work, xmit_work_fn);
+ /* Increase refcount so the kfree_skb in dsa_user_xmit
+ * won't really free the packet.
+ */
+ xmit_work->dp = dp;
+ xmit_work->skb = skb_get(skb);
+
+ kthread_queue_work(xmit_worker, &xmit_work->work);
+
+ return NULL;
+}
+
+static struct sk_buff *ocelot_xmit(struct sk_buff *skb,
+ struct net_device *netdev)
+{
+ struct dsa_port *dp = dsa_user_to_port(netdev);
+ u16 queue_mapping = skb_get_queue_mapping(skb);
+ u8 pcp = netdev_txq_to_tc(netdev, queue_mapping);
+ u16 tx_vid = dsa_tag_8021q_standalone_vid(dp);
+ struct ethhdr *hdr = eth_hdr(skb);
+
+ if (ocelot_ptp_rew_op(skb) || is_link_local_ether_addr(hdr->h_dest))
+ return ocelot_defer_xmit(dp, skb);
+
+ return dsa_8021q_xmit(skb, netdev, ETH_P_8021Q,
+ ((pcp << VLAN_PRIO_SHIFT) | tx_vid));
+}
+
+static struct sk_buff *ocelot_rcv(struct sk_buff *skb,
+ struct net_device *netdev)
+{
+ int src_port = -1, switch_id = -1;
+
+ dsa_8021q_rcv(skb, &src_port, &switch_id, NULL, NULL);
+
+ skb->dev = dsa_conduit_find_user(netdev, switch_id, src_port);
+ if (!skb->dev)
+ return NULL;
+
+ dsa_default_offload_fwd_mark(skb);
+
+ return skb;
+}
+
+static void ocelot_disconnect(struct dsa_switch *ds)
+{
+ struct ocelot_8021q_tagger_private *priv = ds->tagger_data;
+
+ kthread_destroy_worker(priv->xmit_worker);
+ kfree(priv);
+ ds->tagger_data = NULL;
+}
+
+static int ocelot_connect(struct dsa_switch *ds)
+{
+ struct ocelot_8021q_tagger_private *priv;
+ int err;
+
+ priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+ if (!priv)
+ return -ENOMEM;
+
+ priv->xmit_worker = kthread_run_worker(0, "felix_xmit");
+ if (IS_ERR(priv->xmit_worker)) {
+ err = PTR_ERR(priv->xmit_worker);
+ kfree(priv);
+ return err;
+ }
+
+ ds->tagger_data = priv;
+
+ return 0;
+}
+
+static const struct dsa_device_ops ocelot_8021q_netdev_ops = {
+ .name = OCELOT_8021Q_NAME,
+ .proto = DSA_TAG_PROTO_OCELOT_8021Q,
+ .xmit = ocelot_xmit,
+ .rcv = ocelot_rcv,
+ .connect = ocelot_connect,
+ .disconnect = ocelot_disconnect,
+ .needed_headroom = VLAN_HLEN,
+ .promisc_on_conduit = true,
+};
+
+MODULE_DESCRIPTION("DSA tag driver for Ocelot family of switches, using VLAN");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_OCELOT_8021Q, OCELOT_8021Q_NAME);
+
+module_dsa_tag_driver(ocelot_8021q_netdev_ops);
diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c
new file mode 100644
index 000000000000..6d56a28c914c
--- /dev/null
+++ b/net/dsa/tag_qca.c
@@ -0,0 +1,125 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2015, The Linux Foundation. All rights reserved.
+ */
+
+#include <linux/etherdevice.h>
+#include <linux/bitfield.h>
+#include <net/dsa.h>
+#include <linux/dsa/tag_qca.h>
+
+#include "tag.h"
+
+#define QCA_NAME "qca"
+
+static struct sk_buff *qca_tag_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ __be16 *phdr;
+ u16 hdr;
+
+ skb_push(skb, QCA_HDR_LEN);
+
+ dsa_alloc_etype_header(skb, QCA_HDR_LEN);
+ phdr = dsa_etype_header_pos_tx(skb);
+
+ /* Set the version field, and set destination port information */
+ hdr = FIELD_PREP(QCA_HDR_XMIT_VERSION, QCA_HDR_VERSION);
+ hdr |= QCA_HDR_XMIT_FROM_CPU;
+ hdr |= FIELD_PREP(QCA_HDR_XMIT_DP_BIT, dsa_xmit_port_mask(skb, dev));
+
+ *phdr = htons(hdr);
+
+ return skb;
+}
+
+static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev)
+{
+ struct qca_tagger_data *tagger_data;
+ struct dsa_port *dp = dev->dsa_ptr;
+ struct dsa_switch *ds = dp->ds;
+ u8 ver, pk_type;
+ __be16 *phdr;
+ int port;
+ u16 hdr;
+
+ BUILD_BUG_ON(sizeof(struct qca_mgmt_ethhdr) != QCA_HDR_MGMT_HEADER_LEN + QCA_HDR_LEN);
+
+ tagger_data = ds->tagger_data;
+
+ if (unlikely(!pskb_may_pull(skb, QCA_HDR_LEN)))
+ return NULL;
+
+ phdr = dsa_etype_header_pos_rx(skb);
+ hdr = ntohs(*phdr);
+
+ /* Make sure the version is correct */
+ ver = FIELD_GET(QCA_HDR_RECV_VERSION, hdr);
+ if (unlikely(ver != QCA_HDR_VERSION))
+ return NULL;
+
+ /* Get pk type */
+ pk_type = FIELD_GET(QCA_HDR_RECV_TYPE, hdr);
+
+ /* Ethernet mgmt read/write packet */
+ if (pk_type == QCA_HDR_RECV_TYPE_RW_REG_ACK) {
+ if (likely(tagger_data->rw_reg_ack_handler))
+ tagger_data->rw_reg_ack_handler(ds, skb);
+ return NULL;
+ }
+
+ /* Ethernet MIB counter packet */
+ if (pk_type == QCA_HDR_RECV_TYPE_MIB) {
+ if (likely(tagger_data->mib_autocast_handler))
+ tagger_data->mib_autocast_handler(ds, skb);
+ return NULL;
+ }
+
+ /* Get source port information */
+ port = FIELD_GET(QCA_HDR_RECV_SOURCE_PORT, hdr);
+
+ skb->dev = dsa_conduit_find_user(dev, 0, port);
+ if (!skb->dev)
+ return NULL;
+
+ /* Remove QCA tag and recalculate checksum */
+ skb_pull_rcsum(skb, QCA_HDR_LEN);
+ dsa_strip_etype_header(skb, QCA_HDR_LEN);
+
+ return skb;
+}
+
+static int qca_tag_connect(struct dsa_switch *ds)
+{
+ struct qca_tagger_data *tagger_data;
+
+ tagger_data = kzalloc(sizeof(*tagger_data), GFP_KERNEL);
+ if (!tagger_data)
+ return -ENOMEM;
+
+ ds->tagger_data = tagger_data;
+
+ return 0;
+}
+
+static void qca_tag_disconnect(struct dsa_switch *ds)
+{
+ kfree(ds->tagger_data);
+ ds->tagger_data = NULL;
+}
+
+static const struct dsa_device_ops qca_netdev_ops = {
+ .name = QCA_NAME,
+ .proto = DSA_TAG_PROTO_QCA,
+ .connect = qca_tag_connect,
+ .disconnect = qca_tag_disconnect,
+ .xmit = qca_tag_xmit,
+ .rcv = qca_tag_rcv,
+ .needed_headroom = QCA_HDR_LEN,
+ .promisc_on_conduit = true,
+};
+
+MODULE_DESCRIPTION("DSA tag driver for Qualcomm Atheros QCA8K switches");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_QCA, QCA_NAME);
+
+module_dsa_tag_driver(qca_netdev_ops);
diff --git a/net/dsa/tag_rtl4_a.c b/net/dsa/tag_rtl4_a.c
new file mode 100644
index 000000000000..3cc63eacfa03
--- /dev/null
+++ b/net/dsa/tag_rtl4_a.c
@@ -0,0 +1,126 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Handler for Realtek 4 byte DSA switch tags
+ * Currently only supports protocol "A" found in RTL8366RB
+ * Copyright (c) 2020 Linus Walleij <linus.walleij@linaro.org>
+ *
+ * This "proprietary tag" header looks like so:
+ *
+ * -------------------------------------------------
+ * | MAC DA | MAC SA | 0x8899 | 2 bytes tag | Type |
+ * -------------------------------------------------
+ *
+ * The 2 bytes tag form a 16 bit big endian word. The exact
+ * meaning has been guessed from packet dumps from ingress
+ * frames.
+ */
+
+#include <linux/etherdevice.h>
+#include <linux/bits.h>
+
+#include "tag.h"
+
+#define RTL4_A_NAME "rtl4a"
+
+#define RTL4_A_HDR_LEN 4
+#define RTL4_A_PROTOCOL_SHIFT 12
+/*
+ * 0x1 = Realtek Remote Control protocol (RRCP)
+ * 0x2/0x3 seems to be used for loopback testing
+ * 0x9 = RTL8306 DSA protocol
+ * 0xa = RTL8366RB DSA protocol
+ */
+#define RTL4_A_PROTOCOL_RTL8366RB 0xa
+
+static struct sk_buff *rtl4a_tag_xmit(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ __be16 *p;
+ u8 *tag;
+ u16 out;
+
+ /* Pad out to at least 60 bytes */
+ if (unlikely(__skb_put_padto(skb, ETH_ZLEN, false)))
+ return NULL;
+
+ netdev_dbg(dev, "add realtek tag to package to port %d\n",
+ dp->index);
+ skb_push(skb, RTL4_A_HDR_LEN);
+
+ dsa_alloc_etype_header(skb, RTL4_A_HDR_LEN);
+ tag = dsa_etype_header_pos_tx(skb);
+
+ /* Set Ethertype */
+ p = (__be16 *)tag;
+ *p = htons(ETH_P_REALTEK);
+
+ out = (RTL4_A_PROTOCOL_RTL8366RB << RTL4_A_PROTOCOL_SHIFT);
+ /* The lower bits indicate the port number */
+ out |= dsa_xmit_port_mask(skb, dev);
+
+ p = (__be16 *)(tag + 2);
+ *p = htons(out);
+
+ return skb;
+}
+
+static struct sk_buff *rtl4a_tag_rcv(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ u16 protport;
+ __be16 *p;
+ u16 etype;
+ u8 *tag;
+ u8 prot;
+ u8 port;
+
+ if (unlikely(!pskb_may_pull(skb, RTL4_A_HDR_LEN)))
+ return NULL;
+
+ tag = dsa_etype_header_pos_rx(skb);
+ p = (__be16 *)tag;
+ etype = ntohs(*p);
+ if (etype != ETH_P_REALTEK) {
+ /* Not custom, just pass through */
+ netdev_dbg(dev, "non-realtek ethertype 0x%04x\n", etype);
+ return skb;
+ }
+ p = (__be16 *)(tag + 2);
+ protport = ntohs(*p);
+ /* The 4 upper bits are the protocol */
+ prot = (protport >> RTL4_A_PROTOCOL_SHIFT) & 0x0f;
+ if (prot != RTL4_A_PROTOCOL_RTL8366RB) {
+ netdev_err(dev, "unknown realtek protocol 0x%01x\n", prot);
+ return NULL;
+ }
+ port = protport & 0xff;
+
+ skb->dev = dsa_conduit_find_user(dev, 0, port);
+ if (!skb->dev) {
+ netdev_dbg(dev, "could not find user for port %d\n", port);
+ return NULL;
+ }
+
+ /* Remove RTL4 tag and recalculate checksum */
+ skb_pull_rcsum(skb, RTL4_A_HDR_LEN);
+
+ dsa_strip_etype_header(skb, RTL4_A_HDR_LEN);
+
+ dsa_default_offload_fwd_mark(skb);
+
+ return skb;
+}
+
+static const struct dsa_device_ops rtl4a_netdev_ops = {
+ .name = RTL4_A_NAME,
+ .proto = DSA_TAG_PROTO_RTL4_A,
+ .xmit = rtl4a_tag_xmit,
+ .rcv = rtl4a_tag_rcv,
+ .needed_headroom = RTL4_A_HDR_LEN,
+};
+module_dsa_tag_driver(rtl4a_netdev_ops);
+
+MODULE_DESCRIPTION("DSA tag driver for Realtek 4 byte protocol A tags");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_RTL4_A, RTL4_A_NAME);
diff --git a/net/dsa/tag_rtl8_4.c b/net/dsa/tag_rtl8_4.c
new file mode 100644
index 000000000000..2464545da4d2
--- /dev/null
+++ b/net/dsa/tag_rtl8_4.c
@@ -0,0 +1,261 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Handler for Realtek 8 byte switch tags
+ *
+ * Copyright (C) 2021 Alvin Å ipraga <alsi@bang-olufsen.dk>
+ *
+ * NOTE: Currently only supports protocol "4" found in the RTL8365MB, hence
+ * named tag_rtl8_4.
+ *
+ * This tag has the following format:
+ *
+ * 0 7|8 15
+ * |-----------------------------------+-----------------------------------|---
+ * | (16-bit) | ^
+ * | Realtek EtherType [0x8899] | |
+ * |-----------------------------------+-----------------------------------| 8
+ * | (8-bit) | (8-bit) |
+ * | Protocol [0x04] | REASON | b
+ * |-----------------------------------+-----------------------------------| y
+ * | (1) | (1) | (2) | (1) | (3) | (1) | (1) | (1) | (5) | t
+ * | FID_EN | X | FID | PRI_EN | PRI | KEEP | X | LEARN_DIS | X | e
+ * |-----------------------------------+-----------------------------------| s
+ * | (1) | (15-bit) | |
+ * | ALLOW | TX/RX | v
+ * |-----------------------------------+-----------------------------------|---
+ *
+ * With the following field descriptions:
+ *
+ * field | description
+ * ------------+-------------
+ * Realtek | 0x8899: indicates that this is a proprietary Realtek tag;
+ * EtherType | note that Realtek uses the same EtherType for
+ * | other incompatible tag formats (e.g. tag_rtl4_a.c)
+ * Protocol | 0x04: indicates that this tag conforms to this format
+ * X | reserved
+ * ------------+-------------
+ * REASON | reason for forwarding packet to CPU
+ * | 0: packet was forwarded or flooded to CPU
+ * | 80: packet was trapped to CPU
+ * FID_EN | 1: packet has an FID
+ * | 0: no FID
+ * FID | FID of packet (if FID_EN=1)
+ * PRI_EN | 1: force priority of packet
+ * | 0: don't force priority
+ * PRI | priority of packet (if PRI_EN=1)
+ * KEEP | preserve packet VLAN tag format
+ * LEARN_DIS | don't learn the source MAC address of the packet
+ * ALLOW | 1: treat TX/RX field as an allowance port mask, meaning the
+ * | packet may only be forwarded to ports specified in the
+ * | mask
+ * | 0: no allowance port mask, TX/RX field is the forwarding
+ * | port mask
+ * TX/RX | TX (switch->CPU): port number the packet was received on
+ * | RX (CPU->switch): forwarding port mask (if ALLOW=0)
+ * | allowance port mask (if ALLOW=1)
+ *
+ * The tag can be positioned before Ethertype, using tag "rtl8_4":
+ *
+ * +--------+--------+------------+------+-----
+ * | MAC DA | MAC SA | 8 byte tag | Type | ...
+ * +--------+--------+------------+------+-----
+ *
+ * The tag can also appear between the end of the payload and before the CRC,
+ * using tag "rtl8_4t":
+ *
+ * +--------+--------+------+-----+---------+------------+-----+
+ * | MAC DA | MAC SA | TYPE | ... | payload | 8-byte tag | CRC |
+ * +--------+--------+------+-----+---------+------------+-----+
+ *
+ * The added bytes after the payload will break most checksums, either in
+ * software or hardware. To avoid this issue, if the checksum is still pending,
+ * this tagger checksums the packet in software before adding the tag.
+ *
+ */
+
+#include <linux/bitfield.h>
+#include <linux/bits.h>
+#include <linux/etherdevice.h>
+
+#include "tag.h"
+
+/* Protocols supported:
+ *
+ * 0x04 = RTL8365MB DSA protocol
+ */
+
+#define RTL8_4_NAME "rtl8_4"
+#define RTL8_4T_NAME "rtl8_4t"
+
+#define RTL8_4_TAG_LEN 8
+
+#define RTL8_4_PROTOCOL GENMASK(15, 8)
+#define RTL8_4_PROTOCOL_RTL8365MB 0x04
+#define RTL8_4_REASON GENMASK(7, 0)
+#define RTL8_4_REASON_FORWARD 0
+#define RTL8_4_REASON_TRAP 80
+
+#define RTL8_4_LEARN_DIS BIT(5)
+
+#define RTL8_4_TX GENMASK(3, 0)
+#define RTL8_4_RX GENMASK(10, 0)
+
+static void rtl8_4_write_tag(struct sk_buff *skb, struct net_device *dev,
+ void *tag)
+{
+ __be16 tag16[RTL8_4_TAG_LEN / 2];
+
+ /* Set Realtek EtherType */
+ tag16[0] = htons(ETH_P_REALTEK);
+
+ /* Set Protocol; zero REASON */
+ tag16[1] = htons(FIELD_PREP(RTL8_4_PROTOCOL, RTL8_4_PROTOCOL_RTL8365MB));
+
+ /* Zero FID_EN, FID, PRI_EN, PRI, KEEP; set LEARN_DIS */
+ tag16[2] = htons(FIELD_PREP(RTL8_4_LEARN_DIS, 1));
+
+ /* Zero ALLOW; set RX (CPU->switch) forwarding port mask */
+ tag16[3] = htons(FIELD_PREP(RTL8_4_RX, dsa_xmit_port_mask(skb, dev)));
+
+ memcpy(tag, tag16, RTL8_4_TAG_LEN);
+}
+
+static struct sk_buff *rtl8_4_tag_xmit(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ skb_push(skb, RTL8_4_TAG_LEN);
+
+ dsa_alloc_etype_header(skb, RTL8_4_TAG_LEN);
+
+ rtl8_4_write_tag(skb, dev, dsa_etype_header_pos_tx(skb));
+
+ return skb;
+}
+
+static struct sk_buff *rtl8_4t_tag_xmit(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ /* Calculate the checksum here if not done yet as trailing tags will
+ * break either software or hardware based checksum
+ */
+ if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb))
+ return NULL;
+
+ rtl8_4_write_tag(skb, dev, skb_put(skb, RTL8_4_TAG_LEN));
+
+ return skb;
+}
+
+static int rtl8_4_read_tag(struct sk_buff *skb, struct net_device *dev,
+ void *tag)
+{
+ __be16 tag16[RTL8_4_TAG_LEN / 2];
+ u16 etype;
+ u8 reason;
+ u8 proto;
+ u8 port;
+
+ memcpy(tag16, tag, RTL8_4_TAG_LEN);
+
+ /* Parse Realtek EtherType */
+ etype = ntohs(tag16[0]);
+ if (unlikely(etype != ETH_P_REALTEK)) {
+ dev_warn_ratelimited(&dev->dev,
+ "non-realtek ethertype 0x%04x\n", etype);
+ return -EPROTO;
+ }
+
+ /* Parse Protocol */
+ proto = FIELD_GET(RTL8_4_PROTOCOL, ntohs(tag16[1]));
+ if (unlikely(proto != RTL8_4_PROTOCOL_RTL8365MB)) {
+ dev_warn_ratelimited(&dev->dev,
+ "unknown realtek protocol 0x%02x\n",
+ proto);
+ return -EPROTO;
+ }
+
+ /* Parse REASON */
+ reason = FIELD_GET(RTL8_4_REASON, ntohs(tag16[1]));
+
+ /* Parse TX (switch->CPU) */
+ port = FIELD_GET(RTL8_4_TX, ntohs(tag16[3]));
+ skb->dev = dsa_conduit_find_user(dev, 0, port);
+ if (!skb->dev) {
+ dev_warn_ratelimited(&dev->dev,
+ "could not find user for port %d\n",
+ port);
+ return -ENOENT;
+ }
+
+ if (reason != RTL8_4_REASON_TRAP)
+ dsa_default_offload_fwd_mark(skb);
+
+ return 0;
+}
+
+static struct sk_buff *rtl8_4_tag_rcv(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ if (unlikely(!pskb_may_pull(skb, RTL8_4_TAG_LEN)))
+ return NULL;
+
+ if (unlikely(rtl8_4_read_tag(skb, dev, dsa_etype_header_pos_rx(skb))))
+ return NULL;
+
+ /* Remove tag and recalculate checksum */
+ skb_pull_rcsum(skb, RTL8_4_TAG_LEN);
+
+ dsa_strip_etype_header(skb, RTL8_4_TAG_LEN);
+
+ return skb;
+}
+
+static struct sk_buff *rtl8_4t_tag_rcv(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ if (skb_linearize(skb))
+ return NULL;
+
+ if (unlikely(rtl8_4_read_tag(skb, dev, skb_tail_pointer(skb) - RTL8_4_TAG_LEN)))
+ return NULL;
+
+ if (pskb_trim_rcsum(skb, skb->len - RTL8_4_TAG_LEN))
+ return NULL;
+
+ return skb;
+}
+
+/* Ethertype version */
+static const struct dsa_device_ops rtl8_4_netdev_ops = {
+ .name = "rtl8_4",
+ .proto = DSA_TAG_PROTO_RTL8_4,
+ .xmit = rtl8_4_tag_xmit,
+ .rcv = rtl8_4_tag_rcv,
+ .needed_headroom = RTL8_4_TAG_LEN,
+};
+
+DSA_TAG_DRIVER(rtl8_4_netdev_ops);
+
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_RTL8_4, RTL8_4_NAME);
+
+/* Tail version */
+static const struct dsa_device_ops rtl8_4t_netdev_ops = {
+ .name = "rtl8_4t",
+ .proto = DSA_TAG_PROTO_RTL8_4T,
+ .xmit = rtl8_4t_tag_xmit,
+ .rcv = rtl8_4t_tag_rcv,
+ .needed_tailroom = RTL8_4_TAG_LEN,
+};
+
+DSA_TAG_DRIVER(rtl8_4t_netdev_ops);
+
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_RTL8_4T, RTL8_4T_NAME);
+
+static struct dsa_tag_driver *dsa_tag_drivers[] = {
+ &DSA_TAG_DRIVER_NAME(rtl8_4_netdev_ops),
+ &DSA_TAG_DRIVER_NAME(rtl8_4t_netdev_ops),
+};
+module_dsa_tag_drivers(dsa_tag_drivers);
+
+MODULE_DESCRIPTION("DSA tag driver for Realtek 8 byte protocol 4 tags");
+MODULE_LICENSE("GPL");
diff --git a/net/dsa/tag_rzn1_a5psw.c b/net/dsa/tag_rzn1_a5psw.c
new file mode 100644
index 000000000000..10994b3470f6
--- /dev/null
+++ b/net/dsa/tag_rzn1_a5psw.c
@@ -0,0 +1,115 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2022 Schneider Electric
+ *
+ * Clément Léger <clement.leger@bootlin.com>
+ */
+
+#include <linux/bitfield.h>
+#include <linux/etherdevice.h>
+#include <linux/if_ether.h>
+#include <net/dsa.h>
+
+#include "tag.h"
+
+/* To define the outgoing port and to discover the incoming port a TAG is
+ * inserted after Src MAC :
+ *
+ * Dest MAC Src MAC TAG Type
+ * ...| 1 2 3 4 5 6 | 1 2 3 4 5 6 | 1 2 3 4 5 6 7 8 | 1 2 |...
+ * |<--------------->|
+ *
+ * See struct a5psw_tag for layout
+ */
+
+#define A5PSW_NAME "a5psw"
+
+#define ETH_P_DSA_A5PSW 0xE001
+#define A5PSW_TAG_LEN 8
+#define A5PSW_CTRL_DATA_FORCE_FORWARD BIT(0)
+/* This is both used for xmit tag and rcv tagging */
+#define A5PSW_CTRL_DATA_PORT GENMASK(3, 0)
+
+struct a5psw_tag {
+ __be16 ctrl_tag;
+ __be16 ctrl_data;
+ __be16 ctrl_data2_hi;
+ __be16 ctrl_data2_lo;
+};
+
+static struct sk_buff *a5psw_tag_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct a5psw_tag *ptag;
+ u32 data2_val;
+
+ BUILD_BUG_ON(sizeof(*ptag) != A5PSW_TAG_LEN);
+
+ /* The Ethernet switch we are interfaced with needs packets to be at
+ * least 60 bytes otherwise they will be discarded when they enter the
+ * switch port logic.
+ */
+ if (__skb_put_padto(skb, ETH_ZLEN, false))
+ return NULL;
+
+ /* provide 'A5PSW_TAG_LEN' bytes additional space */
+ skb_push(skb, A5PSW_TAG_LEN);
+
+ /* make room between MACs and Ether-Type to insert tag */
+ dsa_alloc_etype_header(skb, A5PSW_TAG_LEN);
+
+ ptag = dsa_etype_header_pos_tx(skb);
+
+ data2_val = FIELD_PREP(A5PSW_CTRL_DATA_PORT, dsa_xmit_port_mask(skb, dev));
+ ptag->ctrl_tag = htons(ETH_P_DSA_A5PSW);
+ ptag->ctrl_data = htons(A5PSW_CTRL_DATA_FORCE_FORWARD);
+ ptag->ctrl_data2_lo = htons(data2_val);
+ ptag->ctrl_data2_hi = 0;
+
+ return skb;
+}
+
+static struct sk_buff *a5psw_tag_rcv(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ struct a5psw_tag *tag;
+ int port;
+
+ if (unlikely(!pskb_may_pull(skb, A5PSW_TAG_LEN))) {
+ dev_warn_ratelimited(&dev->dev,
+ "Dropping packet, cannot pull\n");
+ return NULL;
+ }
+
+ tag = dsa_etype_header_pos_rx(skb);
+
+ if (tag->ctrl_tag != htons(ETH_P_DSA_A5PSW)) {
+ dev_warn_ratelimited(&dev->dev, "Dropping packet due to invalid TAG marker\n");
+ return NULL;
+ }
+
+ port = FIELD_GET(A5PSW_CTRL_DATA_PORT, ntohs(tag->ctrl_data));
+
+ skb->dev = dsa_conduit_find_user(dev, 0, port);
+ if (!skb->dev)
+ return NULL;
+
+ skb_pull_rcsum(skb, A5PSW_TAG_LEN);
+ dsa_strip_etype_header(skb, A5PSW_TAG_LEN);
+
+ dsa_default_offload_fwd_mark(skb);
+
+ return skb;
+}
+
+static const struct dsa_device_ops a5psw_netdev_ops = {
+ .name = A5PSW_NAME,
+ .proto = DSA_TAG_PROTO_RZN1_A5PSW,
+ .xmit = a5psw_tag_xmit,
+ .rcv = a5psw_tag_rcv,
+ .needed_headroom = A5PSW_TAG_LEN,
+};
+
+MODULE_DESCRIPTION("DSA tag driver for Renesas RZ/N1 A5PSW switch");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_A5PSW, A5PSW_NAME);
+module_dsa_tag_driver(a5psw_netdev_ops);
diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c
new file mode 100644
index 000000000000..02adec693811
--- /dev/null
+++ b/net/dsa/tag_sja1105.c
@@ -0,0 +1,762 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019, Vladimir Oltean <olteanv@gmail.com>
+ */
+#include <linux/if_vlan.h>
+#include <linux/dsa/sja1105.h>
+#include <linux/dsa/8021q.h>
+#include <linux/packing.h>
+
+#include "tag.h"
+#include "tag_8021q.h"
+
+#define SJA1105_NAME "sja1105"
+#define SJA1110_NAME "sja1110"
+
+/* Is this a TX or an RX header? */
+#define SJA1110_HEADER_HOST_TO_SWITCH BIT(15)
+
+/* RX header */
+#define SJA1110_RX_HEADER_IS_METADATA BIT(14)
+#define SJA1110_RX_HEADER_HOST_ONLY BIT(13)
+#define SJA1110_RX_HEADER_HAS_TRAILER BIT(12)
+
+/* Trap-to-host format (no trailer present) */
+#define SJA1110_RX_HEADER_SRC_PORT(x) (((x) & GENMASK(7, 4)) >> 4)
+#define SJA1110_RX_HEADER_SWITCH_ID(x) ((x) & GENMASK(3, 0))
+
+/* Timestamp format (trailer present) */
+#define SJA1110_RX_HEADER_TRAILER_POS(x) ((x) & GENMASK(11, 0))
+
+#define SJA1110_RX_TRAILER_SWITCH_ID(x) (((x) & GENMASK(7, 4)) >> 4)
+#define SJA1110_RX_TRAILER_SRC_PORT(x) ((x) & GENMASK(3, 0))
+
+/* Meta frame format (for 2-step TX timestamps) */
+#define SJA1110_RX_HEADER_N_TS(x) (((x) & GENMASK(8, 4)) >> 4)
+
+/* TX header */
+#define SJA1110_TX_HEADER_UPDATE_TC BIT(14)
+#define SJA1110_TX_HEADER_TAKE_TS BIT(13)
+#define SJA1110_TX_HEADER_TAKE_TS_CASC BIT(12)
+#define SJA1110_TX_HEADER_HAS_TRAILER BIT(11)
+
+/* Only valid if SJA1110_TX_HEADER_HAS_TRAILER is false */
+#define SJA1110_TX_HEADER_PRIO(x) (((x) << 7) & GENMASK(10, 7))
+#define SJA1110_TX_HEADER_TSTAMP_ID(x) ((x) & GENMASK(7, 0))
+
+/* Only valid if SJA1110_TX_HEADER_HAS_TRAILER is true */
+#define SJA1110_TX_HEADER_TRAILER_POS(x) ((x) & GENMASK(10, 0))
+
+#define SJA1110_TX_TRAILER_TSTAMP_ID(x) (((x) << 24) & GENMASK(31, 24))
+#define SJA1110_TX_TRAILER_PRIO(x) (((x) << 21) & GENMASK(23, 21))
+#define SJA1110_TX_TRAILER_SWITCHID(x) (((x) << 12) & GENMASK(15, 12))
+#define SJA1110_TX_TRAILER_DESTPORTS(x) (((x) << 1) & GENMASK(11, 1))
+
+#define SJA1110_META_TSTAMP_SIZE 10
+
+#define SJA1110_HEADER_LEN 4
+#define SJA1110_RX_TRAILER_LEN 13
+#define SJA1110_TX_TRAILER_LEN 4
+#define SJA1110_MAX_PADDING_LEN 15
+
+struct sja1105_tagger_private {
+ struct sja1105_tagger_data data; /* Must be first */
+ /* Protects concurrent access to the meta state machine
+ * from taggers running on multiple ports on SMP systems
+ */
+ spinlock_t meta_lock;
+ struct sk_buff *stampable_skb;
+ struct kthread_worker *xmit_worker;
+};
+
+static struct sja1105_tagger_private *
+sja1105_tagger_private(struct dsa_switch *ds)
+{
+ return ds->tagger_data;
+}
+
+/* Similar to is_link_local_ether_addr(hdr->h_dest) but also covers PTP */
+static bool sja1105_is_link_local(const struct sk_buff *skb)
+{
+ const struct ethhdr *hdr = eth_hdr(skb);
+ u64 dmac = ether_addr_to_u64(hdr->h_dest);
+
+ if (ntohs(hdr->h_proto) == ETH_P_SJA1105_META)
+ return false;
+ if ((dmac & SJA1105_LINKLOCAL_FILTER_A_MASK) ==
+ SJA1105_LINKLOCAL_FILTER_A)
+ return true;
+ if ((dmac & SJA1105_LINKLOCAL_FILTER_B_MASK) ==
+ SJA1105_LINKLOCAL_FILTER_B)
+ return true;
+ return false;
+}
+
+struct sja1105_meta {
+ u64 tstamp;
+ u64 dmac_byte_4;
+ u64 dmac_byte_3;
+ u64 source_port;
+ u64 switch_id;
+};
+
+static void sja1105_meta_unpack(const struct sk_buff *skb,
+ struct sja1105_meta *meta)
+{
+ u8 *buf = skb_mac_header(skb) + ETH_HLEN;
+
+ /* UM10944.pdf section 4.2.17 AVB Parameters:
+ * Structure of the meta-data follow-up frame.
+ * It is in network byte order, so there are no quirks
+ * while unpacking the meta frame.
+ *
+ * Also SJA1105 E/T only populates bits 23:0 of the timestamp
+ * whereas P/Q/R/S does 32 bits. Since the structure is the
+ * same and the E/T puts zeroes in the high-order byte, use
+ * a unified unpacking command for both device series.
+ */
+ packing(buf, &meta->tstamp, 31, 0, 4, UNPACK, 0);
+ packing(buf + 4, &meta->dmac_byte_3, 7, 0, 1, UNPACK, 0);
+ packing(buf + 5, &meta->dmac_byte_4, 7, 0, 1, UNPACK, 0);
+ packing(buf + 6, &meta->source_port, 7, 0, 1, UNPACK, 0);
+ packing(buf + 7, &meta->switch_id, 7, 0, 1, UNPACK, 0);
+}
+
+static bool sja1105_is_meta_frame(const struct sk_buff *skb)
+{
+ const struct ethhdr *hdr = eth_hdr(skb);
+ u64 smac = ether_addr_to_u64(hdr->h_source);
+ u64 dmac = ether_addr_to_u64(hdr->h_dest);
+
+ if (smac != SJA1105_META_SMAC)
+ return false;
+ if (dmac != SJA1105_META_DMAC)
+ return false;
+ if (ntohs(hdr->h_proto) != ETH_P_SJA1105_META)
+ return false;
+ return true;
+}
+
+/* Calls sja1105_port_deferred_xmit in sja1105_main.c */
+static struct sk_buff *sja1105_defer_xmit(struct dsa_port *dp,
+ struct sk_buff *skb)
+{
+ struct sja1105_tagger_data *tagger_data = sja1105_tagger_data(dp->ds);
+ struct sja1105_tagger_private *priv = sja1105_tagger_private(dp->ds);
+ void (*xmit_work_fn)(struct kthread_work *work);
+ struct sja1105_deferred_xmit_work *xmit_work;
+ struct kthread_worker *xmit_worker;
+
+ xmit_work_fn = tagger_data->xmit_work_fn;
+ xmit_worker = priv->xmit_worker;
+
+ if (!xmit_work_fn || !xmit_worker)
+ return NULL;
+
+ xmit_work = kzalloc(sizeof(*xmit_work), GFP_ATOMIC);
+ if (!xmit_work)
+ return NULL;
+
+ kthread_init_work(&xmit_work->work, xmit_work_fn);
+ /* Increase refcount so the kfree_skb in dsa_user_xmit
+ * won't really free the packet.
+ */
+ xmit_work->dp = dp;
+ xmit_work->skb = skb_get(skb);
+
+ kthread_queue_work(xmit_worker, &xmit_work->work);
+
+ return NULL;
+}
+
+/* Send VLAN tags with a TPID that blends in with whatever VLAN protocol a
+ * bridge spanning ports of this switch might have.
+ */
+static u16 sja1105_xmit_tpid(struct dsa_port *dp)
+{
+ struct dsa_switch *ds = dp->ds;
+ struct dsa_port *other_dp;
+ u16 proto;
+
+ /* Since VLAN awareness is global, then if this port is VLAN-unaware,
+ * all ports are. Use the VLAN-unaware TPID used for tag_8021q.
+ */
+ if (!dsa_port_is_vlan_filtering(dp))
+ return ETH_P_SJA1105;
+
+ /* Port is VLAN-aware, so there is a bridge somewhere (a single one,
+ * we're sure about that). It may not be on this port though, so we
+ * need to find it.
+ */
+ dsa_switch_for_each_port(other_dp, ds) {
+ struct net_device *br = dsa_port_bridge_dev_get(other_dp);
+
+ if (!br)
+ continue;
+
+ /* Error is returned only if CONFIG_BRIDGE_VLAN_FILTERING,
+ * which seems pointless to handle, as our port cannot become
+ * VLAN-aware in that case.
+ */
+ br_vlan_get_proto(br, &proto);
+
+ return proto;
+ }
+
+ WARN_ONCE(1, "Port is VLAN-aware but cannot find associated bridge!\n");
+
+ return ETH_P_SJA1105;
+}
+
+static struct sk_buff *sja1105_imprecise_xmit(struct sk_buff *skb,
+ struct net_device *netdev)
+{
+ struct dsa_port *dp = dsa_user_to_port(netdev);
+ unsigned int bridge_num = dsa_port_bridge_num_get(dp);
+ struct net_device *br = dsa_port_bridge_dev_get(dp);
+ u16 tx_vid;
+
+ /* If the port is under a VLAN-aware bridge, just slide the
+ * VLAN-tagged packet into the FDB and hope for the best.
+ * This works because we support a single VLAN-aware bridge
+ * across the entire dst, and its VLANs cannot be shared with
+ * any standalone port.
+ */
+ if (br_vlan_enabled(br))
+ return skb;
+
+ /* If the port is under a VLAN-unaware bridge, use an imprecise
+ * TX VLAN that targets the bridge's entire broadcast domain,
+ * instead of just the specific port.
+ */
+ tx_vid = dsa_tag_8021q_bridge_vid(bridge_num);
+
+ return dsa_8021q_xmit(skb, netdev, sja1105_xmit_tpid(dp), tx_vid);
+}
+
+/* Transform untagged control packets into pvid-tagged control packets so that
+ * all packets sent by this tagger are VLAN-tagged and we can configure the
+ * switch to drop untagged packets coming from the DSA conduit.
+ */
+static struct sk_buff *sja1105_pvid_tag_control_pkt(struct dsa_port *dp,
+ struct sk_buff *skb, u8 pcp)
+{
+ __be16 xmit_tpid = htons(sja1105_xmit_tpid(dp));
+ struct vlan_ethhdr *hdr;
+
+ /* If VLAN tag is in hwaccel area, move it to the payload
+ * to deal with both cases uniformly and to ensure that
+ * the VLANs are added in the right order.
+ */
+ if (unlikely(skb_vlan_tag_present(skb))) {
+ skb = __vlan_hwaccel_push_inside(skb);
+ if (!skb)
+ return NULL;
+ }
+
+ hdr = skb_vlan_eth_hdr(skb);
+
+ /* If skb is already VLAN-tagged, leave that VLAN ID in place */
+ if (hdr->h_vlan_proto == xmit_tpid)
+ return skb;
+
+ return vlan_insert_tag(skb, xmit_tpid, (pcp << VLAN_PRIO_SHIFT) |
+ SJA1105_DEFAULT_VLAN);
+}
+
+static struct sk_buff *sja1105_xmit(struct sk_buff *skb,
+ struct net_device *netdev)
+{
+ struct dsa_port *dp = dsa_user_to_port(netdev);
+ u16 queue_mapping = skb_get_queue_mapping(skb);
+ u8 pcp = netdev_txq_to_tc(netdev, queue_mapping);
+ u16 tx_vid = dsa_tag_8021q_standalone_vid(dp);
+
+ if (skb->offload_fwd_mark)
+ return sja1105_imprecise_xmit(skb, netdev);
+
+ /* Transmitting management traffic does not rely upon switch tagging,
+ * but instead SPI-installed management routes. Part 2 of this
+ * is the .port_deferred_xmit driver callback.
+ */
+ if (unlikely(sja1105_is_link_local(skb))) {
+ skb = sja1105_pvid_tag_control_pkt(dp, skb, pcp);
+ if (!skb)
+ return NULL;
+
+ return sja1105_defer_xmit(dp, skb);
+ }
+
+ return dsa_8021q_xmit(skb, netdev, sja1105_xmit_tpid(dp),
+ ((pcp << VLAN_PRIO_SHIFT) | tx_vid));
+}
+
+static struct sk_buff *sja1110_xmit(struct sk_buff *skb,
+ struct net_device *netdev)
+{
+ struct sk_buff *clone = SJA1105_SKB_CB(skb)->clone;
+ struct dsa_port *dp = dsa_user_to_port(netdev);
+ u16 queue_mapping = skb_get_queue_mapping(skb);
+ u8 pcp = netdev_txq_to_tc(netdev, queue_mapping);
+ u16 tx_vid = dsa_tag_8021q_standalone_vid(dp);
+ __be32 *tx_trailer;
+ __be16 *tx_header;
+ int trailer_pos;
+
+ if (skb->offload_fwd_mark)
+ return sja1105_imprecise_xmit(skb, netdev);
+
+ /* Transmitting control packets is done using in-band control
+ * extensions, while data packets are transmitted using
+ * tag_8021q TX VLANs.
+ */
+ if (likely(!sja1105_is_link_local(skb)))
+ return dsa_8021q_xmit(skb, netdev, sja1105_xmit_tpid(dp),
+ ((pcp << VLAN_PRIO_SHIFT) | tx_vid));
+
+ skb = sja1105_pvid_tag_control_pkt(dp, skb, pcp);
+ if (!skb)
+ return NULL;
+
+ skb_push(skb, SJA1110_HEADER_LEN);
+
+ dsa_alloc_etype_header(skb, SJA1110_HEADER_LEN);
+
+ trailer_pos = skb->len;
+
+ tx_header = dsa_etype_header_pos_tx(skb);
+ tx_trailer = skb_put(skb, SJA1110_TX_TRAILER_LEN);
+
+ tx_header[0] = htons(ETH_P_SJA1110);
+ tx_header[1] = htons(SJA1110_HEADER_HOST_TO_SWITCH |
+ SJA1110_TX_HEADER_HAS_TRAILER |
+ SJA1110_TX_HEADER_TRAILER_POS(trailer_pos));
+ *tx_trailer = cpu_to_be32(SJA1110_TX_TRAILER_PRIO(pcp) |
+ SJA1110_TX_TRAILER_SWITCHID(dp->ds->index) |
+ SJA1110_TX_TRAILER_DESTPORTS(BIT(dp->index)));
+ if (clone) {
+ u8 ts_id = SJA1105_SKB_CB(clone)->ts_id;
+
+ tx_header[1] |= htons(SJA1110_TX_HEADER_TAKE_TS);
+ *tx_trailer |= cpu_to_be32(SJA1110_TX_TRAILER_TSTAMP_ID(ts_id));
+ }
+
+ return skb;
+}
+
+static void sja1105_transfer_meta(struct sk_buff *skb,
+ const struct sja1105_meta *meta)
+{
+ struct ethhdr *hdr = eth_hdr(skb);
+
+ hdr->h_dest[3] = meta->dmac_byte_3;
+ hdr->h_dest[4] = meta->dmac_byte_4;
+ SJA1105_SKB_CB(skb)->tstamp = meta->tstamp;
+}
+
+/* This is a simple state machine which follows the hardware mechanism of
+ * generating RX timestamps:
+ *
+ * After each timestampable skb (all traffic for which send_meta1 and
+ * send_meta0 is true, aka all MAC-filtered link-local traffic) a meta frame
+ * containing a partial timestamp is immediately generated by the switch and
+ * sent as a follow-up to the link-local frame on the CPU port.
+ *
+ * The meta frames have no unique identifier (such as sequence number) by which
+ * one may pair them to the correct timestampable frame.
+ * Instead, the switch has internal logic that ensures no frames are sent on
+ * the CPU port between a link-local timestampable frame and its corresponding
+ * meta follow-up. It also ensures strict ordering between ports (lower ports
+ * have higher priority towards the CPU port). For this reason, a per-port
+ * data structure is not needed/desirable.
+ *
+ * This function pairs the link-local frame with its partial timestamp from the
+ * meta follow-up frame. The full timestamp will be reconstructed later in a
+ * work queue.
+ */
+static struct sk_buff
+*sja1105_rcv_meta_state_machine(struct sk_buff *skb,
+ struct sja1105_meta *meta,
+ bool is_link_local,
+ bool is_meta)
+{
+ /* Step 1: A timestampable frame was received.
+ * Buffer it until we get its meta frame.
+ */
+ if (is_link_local) {
+ struct dsa_port *dp = dsa_user_to_port(skb->dev);
+ struct sja1105_tagger_private *priv;
+ struct dsa_switch *ds = dp->ds;
+
+ priv = sja1105_tagger_private(ds);
+
+ spin_lock(&priv->meta_lock);
+ /* Was this a link-local frame instead of the meta
+ * that we were expecting?
+ */
+ if (priv->stampable_skb) {
+ dev_err_ratelimited(ds->dev,
+ "Expected meta frame, is %12llx "
+ "in the DSA conduit multicast filter?\n",
+ SJA1105_META_DMAC);
+ kfree_skb(priv->stampable_skb);
+ }
+
+ /* Hold a reference to avoid dsa_switch_rcv
+ * from freeing the skb.
+ */
+ priv->stampable_skb = skb_get(skb);
+ spin_unlock(&priv->meta_lock);
+
+ /* Tell DSA we got nothing */
+ return NULL;
+
+ /* Step 2: The meta frame arrived.
+ * Time to take the stampable skb out of the closet, annotate it
+ * with the partial timestamp, and pretend that we received it
+ * just now (basically masquerade the buffered frame as the meta
+ * frame, which serves no further purpose).
+ */
+ } else if (is_meta) {
+ struct dsa_port *dp = dsa_user_to_port(skb->dev);
+ struct sja1105_tagger_private *priv;
+ struct dsa_switch *ds = dp->ds;
+ struct sk_buff *stampable_skb;
+
+ priv = sja1105_tagger_private(ds);
+
+ spin_lock(&priv->meta_lock);
+
+ stampable_skb = priv->stampable_skb;
+ priv->stampable_skb = NULL;
+
+ /* Was this a meta frame instead of the link-local
+ * that we were expecting?
+ */
+ if (!stampable_skb) {
+ dev_err_ratelimited(ds->dev,
+ "Unexpected meta frame\n");
+ spin_unlock(&priv->meta_lock);
+ return NULL;
+ }
+
+ if (stampable_skb->dev != skb->dev) {
+ dev_err_ratelimited(ds->dev,
+ "Meta frame on wrong port\n");
+ spin_unlock(&priv->meta_lock);
+ return NULL;
+ }
+
+ /* Free the meta frame and give DSA the buffered stampable_skb
+ * for further processing up the network stack.
+ */
+ kfree_skb(skb);
+ skb = stampable_skb;
+ sja1105_transfer_meta(skb, meta);
+
+ spin_unlock(&priv->meta_lock);
+ }
+
+ return skb;
+}
+
+static bool sja1105_skb_has_tag_8021q(const struct sk_buff *skb)
+{
+ u16 tpid = ntohs(eth_hdr(skb)->h_proto);
+
+ return tpid == ETH_P_SJA1105 || tpid == ETH_P_8021Q ||
+ skb_vlan_tag_present(skb);
+}
+
+static bool sja1110_skb_has_inband_control_extension(const struct sk_buff *skb)
+{
+ return ntohs(eth_hdr(skb)->h_proto) == ETH_P_SJA1110;
+}
+
+static struct sk_buff *sja1105_rcv(struct sk_buff *skb,
+ struct net_device *netdev)
+{
+ int source_port = -1, switch_id = -1, vbid = -1, vid = -1;
+ struct sja1105_meta meta = {0};
+ struct ethhdr *hdr;
+ bool is_link_local;
+ bool is_meta;
+
+ hdr = eth_hdr(skb);
+ is_link_local = sja1105_is_link_local(skb);
+ is_meta = sja1105_is_meta_frame(skb);
+
+ if (is_link_local) {
+ /* Management traffic path. Switch embeds the switch ID and
+ * port ID into bytes of the destination MAC, courtesy of
+ * the incl_srcpt options.
+ */
+ source_port = hdr->h_dest[3];
+ switch_id = hdr->h_dest[4];
+ } else if (is_meta) {
+ sja1105_meta_unpack(skb, &meta);
+ source_port = meta.source_port;
+ switch_id = meta.switch_id;
+ }
+
+ /* Normal data plane traffic and link-local frames are tagged with
+ * a tag_8021q VLAN which we have to strip
+ */
+ if (sja1105_skb_has_tag_8021q(skb))
+ dsa_8021q_rcv(skb, &source_port, &switch_id, &vbid, &vid);
+ else if (source_port == -1 && switch_id == -1)
+ /* Packets with no source information have no chance of
+ * getting accepted, drop them straight away.
+ */
+ return NULL;
+
+ skb->dev = dsa_tag_8021q_find_user(netdev, source_port, switch_id,
+ vid, vbid);
+ if (!skb->dev) {
+ netdev_warn(netdev, "Couldn't decode source port\n");
+ return NULL;
+ }
+
+ if (!is_link_local)
+ dsa_default_offload_fwd_mark(skb);
+
+ return sja1105_rcv_meta_state_machine(skb, &meta, is_link_local,
+ is_meta);
+}
+
+static struct sk_buff *sja1110_rcv_meta(struct sk_buff *skb, u16 rx_header)
+{
+ u8 *buf = dsa_etype_header_pos_rx(skb) + SJA1110_HEADER_LEN;
+ int switch_id = SJA1110_RX_HEADER_SWITCH_ID(rx_header);
+ int n_ts = SJA1110_RX_HEADER_N_TS(rx_header);
+ struct sja1105_tagger_data *tagger_data;
+ struct net_device *conduit = skb->dev;
+ struct dsa_port *cpu_dp;
+ struct dsa_switch *ds;
+ int i;
+
+ cpu_dp = conduit->dsa_ptr;
+ ds = dsa_switch_find(cpu_dp->dst->index, switch_id);
+ if (!ds) {
+ net_err_ratelimited("%s: cannot find switch id %d\n",
+ conduit->name, switch_id);
+ return NULL;
+ }
+
+ tagger_data = sja1105_tagger_data(ds);
+ if (!tagger_data->meta_tstamp_handler)
+ return NULL;
+
+ for (i = 0; i <= n_ts; i++) {
+ u8 ts_id, source_port, dir;
+ u64 tstamp;
+
+ ts_id = buf[0];
+ source_port = (buf[1] & GENMASK(7, 4)) >> 4;
+ dir = (buf[1] & BIT(3)) >> 3;
+ tstamp = be64_to_cpu(*(__be64 *)(buf + 2));
+
+ tagger_data->meta_tstamp_handler(ds, source_port, ts_id, dir,
+ tstamp);
+
+ buf += SJA1110_META_TSTAMP_SIZE;
+ }
+
+ /* Discard the meta frame, we've consumed the timestamps it contained */
+ return NULL;
+}
+
+static struct sk_buff *sja1110_rcv_inband_control_extension(struct sk_buff *skb,
+ int *source_port,
+ int *switch_id,
+ bool *host_only)
+{
+ u16 rx_header;
+
+ if (unlikely(!pskb_may_pull(skb, SJA1110_HEADER_LEN)))
+ return NULL;
+
+ /* skb->data points to skb_mac_header(skb) + ETH_HLEN, which is exactly
+ * what we need because the caller has checked the EtherType (which is
+ * located 2 bytes back) and we just need a pointer to the header that
+ * comes afterwards.
+ */
+ rx_header = ntohs(*(__be16 *)skb->data);
+
+ if (rx_header & SJA1110_RX_HEADER_HOST_ONLY)
+ *host_only = true;
+
+ if (rx_header & SJA1110_RX_HEADER_IS_METADATA)
+ return sja1110_rcv_meta(skb, rx_header);
+
+ /* Timestamp frame, we have a trailer */
+ if (rx_header & SJA1110_RX_HEADER_HAS_TRAILER) {
+ int start_of_padding = SJA1110_RX_HEADER_TRAILER_POS(rx_header);
+ u8 *rx_trailer = skb_tail_pointer(skb) - SJA1110_RX_TRAILER_LEN;
+ u64 *tstamp = &SJA1105_SKB_CB(skb)->tstamp;
+ u8 last_byte = rx_trailer[12];
+
+ /* The timestamp is unaligned, so we need to use packing()
+ * to get it
+ */
+ packing(rx_trailer, tstamp, 63, 0, 8, UNPACK, 0);
+
+ *source_port = SJA1110_RX_TRAILER_SRC_PORT(last_byte);
+ *switch_id = SJA1110_RX_TRAILER_SWITCH_ID(last_byte);
+
+ /* skb->len counts from skb->data, while start_of_padding
+ * counts from the destination MAC address. Right now skb->data
+ * is still as set by the DSA conduit, so to trim away the
+ * padding and trailer we need to account for the fact that
+ * skb->data points to skb_mac_header(skb) + ETH_HLEN.
+ */
+ if (pskb_trim_rcsum(skb, start_of_padding - ETH_HLEN))
+ return NULL;
+ /* Trap-to-host frame, no timestamp trailer */
+ } else {
+ *source_port = SJA1110_RX_HEADER_SRC_PORT(rx_header);
+ *switch_id = SJA1110_RX_HEADER_SWITCH_ID(rx_header);
+ }
+
+ /* Advance skb->data past the DSA header */
+ skb_pull_rcsum(skb, SJA1110_HEADER_LEN);
+
+ dsa_strip_etype_header(skb, SJA1110_HEADER_LEN);
+
+ /* With skb->data in its final place, update the MAC header
+ * so that eth_hdr() continues to works properly.
+ */
+ skb_set_mac_header(skb, -ETH_HLEN);
+
+ return skb;
+}
+
+static struct sk_buff *sja1110_rcv(struct sk_buff *skb,
+ struct net_device *netdev)
+{
+ int source_port = -1, switch_id = -1, vbid = -1, vid = -1;
+ bool host_only = false;
+
+ if (sja1110_skb_has_inband_control_extension(skb)) {
+ skb = sja1110_rcv_inband_control_extension(skb, &source_port,
+ &switch_id,
+ &host_only);
+ if (!skb)
+ return NULL;
+ }
+
+ /* Packets with in-band control extensions might still have RX VLANs */
+ if (likely(sja1105_skb_has_tag_8021q(skb)))
+ dsa_8021q_rcv(skb, &source_port, &switch_id, &vbid, &vid);
+
+ skb->dev = dsa_tag_8021q_find_user(netdev, source_port, switch_id,
+ vid, vbid);
+
+ if (!skb->dev) {
+ netdev_warn(netdev, "Couldn't decode source port\n");
+ return NULL;
+ }
+
+ if (!host_only)
+ dsa_default_offload_fwd_mark(skb);
+
+ return skb;
+}
+
+static void sja1105_flow_dissect(const struct sk_buff *skb, __be16 *proto,
+ int *offset)
+{
+ /* No tag added for management frames, all ok */
+ if (unlikely(sja1105_is_link_local(skb)))
+ return;
+
+ dsa_tag_generic_flow_dissect(skb, proto, offset);
+}
+
+static void sja1110_flow_dissect(const struct sk_buff *skb, __be16 *proto,
+ int *offset)
+{
+ /* Management frames have 2 DSA tags on RX, so the needed_headroom we
+ * declared is fine for the generic dissector adjustment procedure.
+ */
+ if (unlikely(sja1105_is_link_local(skb)))
+ return dsa_tag_generic_flow_dissect(skb, proto, offset);
+
+ /* For the rest, there is a single DSA tag, the tag_8021q one */
+ *offset = VLAN_HLEN;
+ *proto = ((__be16 *)skb->data)[(VLAN_HLEN / 2) - 1];
+}
+
+static void sja1105_disconnect(struct dsa_switch *ds)
+{
+ struct sja1105_tagger_private *priv = ds->tagger_data;
+
+ kthread_destroy_worker(priv->xmit_worker);
+ kfree(priv);
+ ds->tagger_data = NULL;
+}
+
+static int sja1105_connect(struct dsa_switch *ds)
+{
+ struct sja1105_tagger_private *priv;
+ struct kthread_worker *xmit_worker;
+ int err;
+
+ priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+ if (!priv)
+ return -ENOMEM;
+
+ spin_lock_init(&priv->meta_lock);
+
+ xmit_worker = kthread_run_worker(0, "dsa%d:%d_xmit",
+ ds->dst->index, ds->index);
+ if (IS_ERR(xmit_worker)) {
+ err = PTR_ERR(xmit_worker);
+ kfree(priv);
+ return err;
+ }
+
+ priv->xmit_worker = xmit_worker;
+ ds->tagger_data = priv;
+
+ return 0;
+}
+
+static const struct dsa_device_ops sja1105_netdev_ops = {
+ .name = SJA1105_NAME,
+ .proto = DSA_TAG_PROTO_SJA1105,
+ .xmit = sja1105_xmit,
+ .rcv = sja1105_rcv,
+ .connect = sja1105_connect,
+ .disconnect = sja1105_disconnect,
+ .needed_headroom = VLAN_HLEN,
+ .flow_dissect = sja1105_flow_dissect,
+ .promisc_on_conduit = true,
+};
+
+DSA_TAG_DRIVER(sja1105_netdev_ops);
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_SJA1105, SJA1105_NAME);
+
+static const struct dsa_device_ops sja1110_netdev_ops = {
+ .name = SJA1110_NAME,
+ .proto = DSA_TAG_PROTO_SJA1110,
+ .xmit = sja1110_xmit,
+ .rcv = sja1110_rcv,
+ .connect = sja1105_connect,
+ .disconnect = sja1105_disconnect,
+ .flow_dissect = sja1110_flow_dissect,
+ .needed_headroom = SJA1110_HEADER_LEN + VLAN_HLEN,
+ .needed_tailroom = SJA1110_RX_TRAILER_LEN + SJA1110_MAX_PADDING_LEN,
+};
+
+DSA_TAG_DRIVER(sja1110_netdev_ops);
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_SJA1110, SJA1110_NAME);
+
+static struct dsa_tag_driver *sja1105_tag_driver_array[] = {
+ &DSA_TAG_DRIVER_NAME(sja1105_netdev_ops),
+ &DSA_TAG_DRIVER_NAME(sja1110_netdev_ops),
+};
+
+module_dsa_tag_drivers(sja1105_tag_driver_array);
+
+MODULE_DESCRIPTION("DSA tag driver for NXP SJA1105 switches");
+MODULE_LICENSE("GPL v2");
diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c
new file mode 100644
index 000000000000..4dce24cfe6a7
--- /dev/null
+++ b/net/dsa/tag_trailer.c
@@ -0,0 +1,65 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * net/dsa/tag_trailer.c - Trailer tag format handling
+ * Copyright (c) 2008-2009 Marvell Semiconductor
+ */
+
+#include <linux/etherdevice.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+
+#include "tag.h"
+
+#define TRAILER_NAME "trailer"
+
+static struct sk_buff *trailer_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ u8 *trailer;
+
+ trailer = skb_put(skb, 4);
+ trailer[0] = 0x80;
+ trailer[1] = dsa_xmit_port_mask(skb, dev);
+ trailer[2] = 0x10;
+ trailer[3] = 0x00;
+
+ return skb;
+}
+
+static struct sk_buff *trailer_rcv(struct sk_buff *skb, struct net_device *dev)
+{
+ u8 *trailer;
+ int source_port;
+
+ if (skb_linearize(skb))
+ return NULL;
+
+ trailer = skb_tail_pointer(skb) - 4;
+ if (trailer[0] != 0x80 || (trailer[1] & 0xf8) != 0x00 ||
+ (trailer[2] & 0xef) != 0x00 || trailer[3] != 0x00)
+ return NULL;
+
+ source_port = trailer[1] & 7;
+
+ skb->dev = dsa_conduit_find_user(dev, 0, source_port);
+ if (!skb->dev)
+ return NULL;
+
+ if (pskb_trim_rcsum(skb, skb->len - 4))
+ return NULL;
+
+ return skb;
+}
+
+static const struct dsa_device_ops trailer_netdev_ops = {
+ .name = TRAILER_NAME,
+ .proto = DSA_TAG_PROTO_TRAILER,
+ .xmit = trailer_xmit,
+ .rcv = trailer_rcv,
+ .needed_tailroom = 4,
+};
+
+MODULE_DESCRIPTION("DSA tag driver for switches using a trailer tag");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_TRAILER, TRAILER_NAME);
+
+module_dsa_tag_driver(trailer_netdev_ops);
diff --git a/net/dsa/tag_vsc73xx_8021q.c b/net/dsa/tag_vsc73xx_8021q.c
new file mode 100644
index 000000000000..af121a9aff7f
--- /dev/null
+++ b/net/dsa/tag_vsc73xx_8021q.c
@@ -0,0 +1,68 @@
+// SPDX-License-Identifier: GPL-2.0 OR MIT
+/* Copyright (C) 2024 Pawel Dembicki <paweldembicki@gmail.com>
+ */
+#include <linux/dsa/8021q.h>
+
+#include "tag.h"
+#include "tag_8021q.h"
+
+#define VSC73XX_8021Q_NAME "vsc73xx-8021q"
+
+static struct sk_buff *
+vsc73xx_xmit(struct sk_buff *skb, struct net_device *netdev)
+{
+ struct dsa_port *dp = dsa_user_to_port(netdev);
+ u16 queue_mapping = skb_get_queue_mapping(skb);
+ u16 tx_vid = dsa_tag_8021q_standalone_vid(dp);
+ u8 pcp;
+
+ if (skb->offload_fwd_mark) {
+ unsigned int bridge_num = dsa_port_bridge_num_get(dp);
+ struct net_device *br = dsa_port_bridge_dev_get(dp);
+
+ if (br_vlan_enabled(br))
+ return skb;
+
+ tx_vid = dsa_tag_8021q_bridge_vid(bridge_num);
+ }
+
+ pcp = netdev_txq_to_tc(netdev, queue_mapping);
+
+ return dsa_8021q_xmit(skb, netdev, ETH_P_8021Q,
+ ((pcp << VLAN_PRIO_SHIFT) | tx_vid));
+}
+
+static struct sk_buff *
+vsc73xx_rcv(struct sk_buff *skb, struct net_device *netdev)
+{
+ int src_port = -1, switch_id = -1, vbid = -1, vid = -1;
+
+ dsa_8021q_rcv(skb, &src_port, &switch_id, &vbid, &vid);
+
+ skb->dev = dsa_tag_8021q_find_user(netdev, src_port, switch_id,
+ vid, vbid);
+ if (!skb->dev) {
+ dev_warn_ratelimited(&netdev->dev,
+ "Couldn't decode source port\n");
+ return NULL;
+ }
+
+ dsa_default_offload_fwd_mark(skb);
+
+ return skb;
+}
+
+static const struct dsa_device_ops vsc73xx_8021q_netdev_ops = {
+ .name = VSC73XX_8021Q_NAME,
+ .proto = DSA_TAG_PROTO_VSC73XX_8021Q,
+ .xmit = vsc73xx_xmit,
+ .rcv = vsc73xx_rcv,
+ .needed_headroom = VLAN_HLEN,
+ .promisc_on_conduit = true,
+};
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("DSA tag driver for VSC73XX family of switches, using VLAN");
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_VSC73XX_8021Q, VSC73XX_8021Q_NAME);
+
+module_dsa_tag_driver(vsc73xx_8021q_netdev_ops);
diff --git a/net/dsa/tag_xrs700x.c b/net/dsa/tag_xrs700x.c
new file mode 100644
index 000000000000..a05219f702c6
--- /dev/null
+++ b/net/dsa/tag_xrs700x.c
@@ -0,0 +1,61 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * XRS700x tag format handling
+ * Copyright (c) 2008-2009 Marvell Semiconductor
+ * Copyright (c) 2020 NovaTech LLC
+ */
+
+#include <linux/bitops.h>
+
+#include "tag.h"
+
+#define XRS700X_NAME "xrs700x"
+
+static struct sk_buff *xrs700x_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ u8 *trailer;
+
+ trailer = skb_put(skb, 1);
+ trailer[0] = dsa_xmit_port_mask(skb, dev);
+
+ return skb;
+}
+
+static struct sk_buff *xrs700x_rcv(struct sk_buff *skb, struct net_device *dev)
+{
+ int source_port;
+ u8 *trailer;
+
+ trailer = skb_tail_pointer(skb) - 1;
+
+ source_port = ffs((int)trailer[0]) - 1;
+
+ if (source_port < 0)
+ return NULL;
+
+ skb->dev = dsa_conduit_find_user(dev, 0, source_port);
+ if (!skb->dev)
+ return NULL;
+
+ if (pskb_trim_rcsum(skb, skb->len - 1))
+ return NULL;
+
+ /* Frame is forwarded by hardware, don't forward in software. */
+ dsa_default_offload_fwd_mark(skb);
+
+ return skb;
+}
+
+static const struct dsa_device_ops xrs700x_netdev_ops = {
+ .name = XRS700X_NAME,
+ .proto = DSA_TAG_PROTO_XRS700X,
+ .xmit = xrs700x_xmit,
+ .rcv = xrs700x_rcv,
+ .needed_tailroom = 1,
+};
+
+MODULE_DESCRIPTION("DSA tag driver for XRS700x switches");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_XRS700X, XRS700X_NAME);
+
+module_dsa_tag_driver(xrs700x_netdev_ops);
diff --git a/net/dsa/tag_yt921x.c b/net/dsa/tag_yt921x.c
new file mode 100644
index 000000000000..6bbfd42dc5df
--- /dev/null
+++ b/net/dsa/tag_yt921x.c
@@ -0,0 +1,139 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Motorcomm YT921x Switch Extended CPU Port Tagging
+ *
+ * Copyright (c) 2025 David Yang <mmyangfl@gmail.com>
+ *
+ * +----+----+-------+-----+----+---------
+ * | DA | SA | TagET | Tag | ET | Payload ...
+ * +----+----+-------+-----+----+---------
+ * 6 6 2 6 2 N
+ *
+ * Tag Ethertype: CPU_TAG_TPID_TPID (default: ETH_P_YT921X = 0x9988)
+ * * Hardcoded for the moment, but still configurable. Discuss it if there
+ * are conflicts somewhere and/or you want to change it for some reason.
+ * Tag:
+ * 2: VLAN Tag
+ * 2: Rx Port
+ * 15b: Rx Port Valid
+ * 14b-11b: Rx Port
+ * 10b-0b: Cmd?
+ * 2: Tx Port(s)
+ * 15b: Tx Port(s) Valid
+ * 10b-0b: Tx Port(s) Mask
+ */
+
+#include <linux/etherdevice.h>
+
+#include "tag.h"
+
+#define YT921X_TAG_NAME "yt921x"
+
+#define YT921X_TAG_LEN 8
+
+#define YT921X_TAG_PORT_EN BIT(15)
+#define YT921X_TAG_RX_PORT_M GENMASK(14, 11)
+#define YT921X_TAG_RX_CMD_M GENMASK(10, 0)
+#define YT921X_TAG_RX_CMD(x) FIELD_PREP(YT921X_TAG_RX_CMD_M, (x))
+#define YT921X_TAG_RX_CMD_FORWARDED 0x80
+#define YT921X_TAG_RX_CMD_UNK_UCAST 0xb2
+#define YT921X_TAG_RX_CMD_UNK_MCAST 0xb4
+#define YT921X_TAG_TX_PORTS GENMASK(10, 0)
+
+static struct sk_buff *
+yt921x_tag_xmit(struct sk_buff *skb, struct net_device *netdev)
+{
+ __be16 *tag;
+ u16 tx;
+
+ skb_push(skb, YT921X_TAG_LEN);
+ dsa_alloc_etype_header(skb, YT921X_TAG_LEN);
+
+ tag = dsa_etype_header_pos_tx(skb);
+
+ tag[0] = htons(ETH_P_YT921X);
+ /* VLAN tag unrelated when TX */
+ tag[1] = 0;
+ tag[2] = 0;
+ tx = FIELD_PREP(YT921X_TAG_TX_PORTS, dsa_xmit_port_mask(skb, netdev)) |
+ YT921X_TAG_PORT_EN;
+ tag[3] = htons(tx);
+
+ return skb;
+}
+
+static struct sk_buff *
+yt921x_tag_rcv(struct sk_buff *skb, struct net_device *netdev)
+{
+ unsigned int port;
+ __be16 *tag;
+ u16 cmd;
+ u16 rx;
+
+ if (unlikely(!pskb_may_pull(skb, YT921X_TAG_LEN)))
+ return NULL;
+
+ tag = dsa_etype_header_pos_rx(skb);
+
+ if (unlikely(tag[0] != htons(ETH_P_YT921X))) {
+ dev_warn_ratelimited(&netdev->dev,
+ "Unexpected EtherType 0x%04x\n",
+ ntohs(tag[0]));
+ return NULL;
+ }
+
+ /* Locate which port this is coming from */
+ rx = ntohs(tag[2]);
+ if (unlikely((rx & YT921X_TAG_PORT_EN) == 0)) {
+ dev_warn_ratelimited(&netdev->dev,
+ "Unexpected rx tag 0x%04x\n", rx);
+ return NULL;
+ }
+
+ port = FIELD_GET(YT921X_TAG_RX_PORT_M, rx);
+ skb->dev = dsa_conduit_find_user(netdev, 0, port);
+ if (unlikely(!skb->dev)) {
+ dev_warn_ratelimited(&netdev->dev,
+ "Couldn't decode source port %u\n", port);
+ return NULL;
+ }
+
+ cmd = FIELD_GET(YT921X_TAG_RX_CMD_M, rx);
+ switch (cmd) {
+ case YT921X_TAG_RX_CMD_FORWARDED:
+ /* Already forwarded by hardware */
+ dsa_default_offload_fwd_mark(skb);
+ break;
+ case YT921X_TAG_RX_CMD_UNK_UCAST:
+ case YT921X_TAG_RX_CMD_UNK_MCAST:
+ /* NOTE: hardware doesn't distinguish between TRAP (copy to CPU
+ * only) and COPY (forward and copy to CPU). In order to perform
+ * a soft switch, NEVER use COPY action in the switch driver.
+ */
+ break;
+ default:
+ dev_warn_ratelimited(&netdev->dev,
+ "Unexpected rx cmd 0x%02x\n", cmd);
+ break;
+ }
+
+ /* Remove YT921x tag and update checksum */
+ skb_pull_rcsum(skb, YT921X_TAG_LEN);
+ dsa_strip_etype_header(skb, YT921X_TAG_LEN);
+
+ return skb;
+}
+
+static const struct dsa_device_ops yt921x_netdev_ops = {
+ .name = YT921X_TAG_NAME,
+ .proto = DSA_TAG_PROTO_YT921X,
+ .xmit = yt921x_tag_xmit,
+ .rcv = yt921x_tag_rcv,
+ .needed_headroom = YT921X_TAG_LEN,
+};
+
+MODULE_DESCRIPTION("DSA tag driver for Motorcomm YT921x switches");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_YT921X, YT921X_TAG_NAME);
+
+module_dsa_tag_driver(yt921x_netdev_ops);
diff --git a/net/dsa/trace.c b/net/dsa/trace.c
new file mode 100644
index 000000000000..1b107165d331
--- /dev/null
+++ b/net/dsa/trace.c
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Copyright 2022-2023 NXP
+ */
+
+#define CREATE_TRACE_POINTS
+#include "trace.h"
+
+void dsa_db_print(const struct dsa_db *db, char buf[DSA_DB_BUFSIZ])
+{
+ switch (db->type) {
+ case DSA_DB_PORT:
+ sprintf(buf, "port %s", db->dp->name);
+ break;
+ case DSA_DB_LAG:
+ sprintf(buf, "lag %s id %d", db->lag.dev->name, db->lag.id);
+ break;
+ case DSA_DB_BRIDGE:
+ sprintf(buf, "bridge %s num %d", db->bridge.dev->name,
+ db->bridge.num);
+ break;
+ default:
+ sprintf(buf, "unknown");
+ break;
+ }
+}
+
+const char *dsa_port_kind(const struct dsa_port *dp)
+{
+ switch (dp->type) {
+ case DSA_PORT_TYPE_USER:
+ return "user";
+ case DSA_PORT_TYPE_CPU:
+ return "cpu";
+ case DSA_PORT_TYPE_DSA:
+ return "dsa";
+ default:
+ return "unused";
+ }
+}
diff --git a/net/dsa/trace.h b/net/dsa/trace.h
new file mode 100644
index 000000000000..83f3e5f78491
--- /dev/null
+++ b/net/dsa/trace.h
@@ -0,0 +1,447 @@
+/* SPDX-License-Identifier: GPL-2.0
+ * Copyright 2022-2023 NXP
+ */
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM dsa
+
+#if !defined(_NET_DSA_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _NET_DSA_TRACE_H
+
+#include <net/dsa.h>
+#include <net/switchdev.h>
+#include <linux/etherdevice.h>
+#include <linux/if_bridge.h>
+#include <linux/refcount.h>
+#include <linux/tracepoint.h>
+
+/* Enough to fit "bridge %s num %d" where num has 3 digits */
+#define DSA_DB_BUFSIZ (IFNAMSIZ + 16)
+
+void dsa_db_print(const struct dsa_db *db, char buf[DSA_DB_BUFSIZ]);
+const char *dsa_port_kind(const struct dsa_port *dp);
+
+DECLARE_EVENT_CLASS(dsa_port_addr_op_hw,
+
+ TP_PROTO(const struct dsa_port *dp, const unsigned char *addr, u16 vid,
+ const struct dsa_db *db, int err),
+
+ TP_ARGS(dp, addr, vid, db, err),
+
+ TP_STRUCT__entry(
+ __string(dev, dev_name(dp->ds->dev))
+ __string(kind, dsa_port_kind(dp))
+ __field(int, port)
+ __array(unsigned char, addr, ETH_ALEN)
+ __field(u16, vid)
+ __array(char, db_buf, DSA_DB_BUFSIZ)
+ __field(int, err)
+ ),
+
+ TP_fast_assign(
+ __assign_str(dev);
+ __assign_str(kind);
+ __entry->port = dp->index;
+ ether_addr_copy(__entry->addr, addr);
+ __entry->vid = vid;
+ dsa_db_print(db, __entry->db_buf);
+ __entry->err = err;
+ ),
+
+ TP_printk("%s %s port %d addr %pM vid %u db \"%s\" err %d",
+ __get_str(dev), __get_str(kind), __entry->port, __entry->addr,
+ __entry->vid, __entry->db_buf, __entry->err)
+);
+
+/* Add unicast/multicast address to hardware, either on user ports
+ * (where no refcounting is kept), or on shared ports when the entry
+ * is first seen and its refcount is 1.
+ */
+DEFINE_EVENT(dsa_port_addr_op_hw, dsa_fdb_add_hw,
+ TP_PROTO(const struct dsa_port *dp, const unsigned char *addr,
+ u16 vid, const struct dsa_db *db, int err),
+ TP_ARGS(dp, addr, vid, db, err));
+
+DEFINE_EVENT(dsa_port_addr_op_hw, dsa_mdb_add_hw,
+ TP_PROTO(const struct dsa_port *dp, const unsigned char *addr,
+ u16 vid, const struct dsa_db *db, int err),
+ TP_ARGS(dp, addr, vid, db, err));
+
+/* Delete unicast/multicast address from hardware, either on user ports or
+ * when the refcount on shared ports reaches 0
+ */
+DEFINE_EVENT(dsa_port_addr_op_hw, dsa_fdb_del_hw,
+ TP_PROTO(const struct dsa_port *dp, const unsigned char *addr,
+ u16 vid, const struct dsa_db *db, int err),
+ TP_ARGS(dp, addr, vid, db, err));
+
+DEFINE_EVENT(dsa_port_addr_op_hw, dsa_mdb_del_hw,
+ TP_PROTO(const struct dsa_port *dp, const unsigned char *addr,
+ u16 vid, const struct dsa_db *db, int err),
+ TP_ARGS(dp, addr, vid, db, err));
+
+DECLARE_EVENT_CLASS(dsa_port_addr_op_refcount,
+
+ TP_PROTO(const struct dsa_port *dp, const unsigned char *addr, u16 vid,
+ const struct dsa_db *db, const refcount_t *refcount),
+
+ TP_ARGS(dp, addr, vid, db, refcount),
+
+ TP_STRUCT__entry(
+ __string(dev, dev_name(dp->ds->dev))
+ __string(kind, dsa_port_kind(dp))
+ __field(int, port)
+ __array(unsigned char, addr, ETH_ALEN)
+ __field(u16, vid)
+ __array(char, db_buf, DSA_DB_BUFSIZ)
+ __field(unsigned int, refcount)
+ ),
+
+ TP_fast_assign(
+ __assign_str(dev);
+ __assign_str(kind);
+ __entry->port = dp->index;
+ ether_addr_copy(__entry->addr, addr);
+ __entry->vid = vid;
+ dsa_db_print(db, __entry->db_buf);
+ __entry->refcount = refcount_read(refcount);
+ ),
+
+ TP_printk("%s %s port %d addr %pM vid %u db \"%s\" refcount %u",
+ __get_str(dev), __get_str(kind), __entry->port, __entry->addr,
+ __entry->vid, __entry->db_buf, __entry->refcount)
+);
+
+/* Bump the refcount of an existing unicast/multicast address on shared ports */
+DEFINE_EVENT(dsa_port_addr_op_refcount, dsa_fdb_add_bump,
+ TP_PROTO(const struct dsa_port *dp, const unsigned char *addr,
+ u16 vid, const struct dsa_db *db,
+ const refcount_t *refcount),
+ TP_ARGS(dp, addr, vid, db, refcount));
+
+DEFINE_EVENT(dsa_port_addr_op_refcount, dsa_mdb_add_bump,
+ TP_PROTO(const struct dsa_port *dp, const unsigned char *addr,
+ u16 vid, const struct dsa_db *db,
+ const refcount_t *refcount),
+ TP_ARGS(dp, addr, vid, db, refcount));
+
+/* Drop the refcount of a multicast address that we still keep on
+ * shared ports
+ */
+DEFINE_EVENT(dsa_port_addr_op_refcount, dsa_fdb_del_drop,
+ TP_PROTO(const struct dsa_port *dp, const unsigned char *addr,
+ u16 vid, const struct dsa_db *db,
+ const refcount_t *refcount),
+ TP_ARGS(dp, addr, vid, db, refcount));
+
+DEFINE_EVENT(dsa_port_addr_op_refcount, dsa_mdb_del_drop,
+ TP_PROTO(const struct dsa_port *dp, const unsigned char *addr,
+ u16 vid, const struct dsa_db *db,
+ const refcount_t *refcount),
+ TP_ARGS(dp, addr, vid, db, refcount));
+
+DECLARE_EVENT_CLASS(dsa_port_addr_del_not_found,
+
+ TP_PROTO(const struct dsa_port *dp, const unsigned char *addr, u16 vid,
+ const struct dsa_db *db),
+
+ TP_ARGS(dp, addr, vid, db),
+
+ TP_STRUCT__entry(
+ __string(dev, dev_name(dp->ds->dev))
+ __string(kind, dsa_port_kind(dp))
+ __field(int, port)
+ __array(unsigned char, addr, ETH_ALEN)
+ __field(u16, vid)
+ __array(char, db_buf, DSA_DB_BUFSIZ)
+ ),
+
+ TP_fast_assign(
+ __assign_str(dev);
+ __assign_str(kind);
+ __entry->port = dp->index;
+ ether_addr_copy(__entry->addr, addr);
+ __entry->vid = vid;
+ dsa_db_print(db, __entry->db_buf);
+ ),
+
+ TP_printk("%s %s port %d addr %pM vid %u db \"%s\"",
+ __get_str(dev), __get_str(kind), __entry->port,
+ __entry->addr, __entry->vid, __entry->db_buf)
+);
+
+/* Attempt to delete a unicast/multicast address on shared ports for which
+ * the delete operation was called more times than the addition
+ */
+DEFINE_EVENT(dsa_port_addr_del_not_found, dsa_fdb_del_not_found,
+ TP_PROTO(const struct dsa_port *dp, const unsigned char *addr,
+ u16 vid, const struct dsa_db *db),
+ TP_ARGS(dp, addr, vid, db));
+
+DEFINE_EVENT(dsa_port_addr_del_not_found, dsa_mdb_del_not_found,
+ TP_PROTO(const struct dsa_port *dp, const unsigned char *addr,
+ u16 vid, const struct dsa_db *db),
+ TP_ARGS(dp, addr, vid, db));
+
+TRACE_EVENT(dsa_lag_fdb_add_hw,
+
+ TP_PROTO(const struct net_device *lag_dev, const unsigned char *addr,
+ u16 vid, const struct dsa_db *db, int err),
+
+ TP_ARGS(lag_dev, addr, vid, db, err),
+
+ TP_STRUCT__entry(
+ __string(dev, lag_dev->name)
+ __array(unsigned char, addr, ETH_ALEN)
+ __field(u16, vid)
+ __array(char, db_buf, DSA_DB_BUFSIZ)
+ __field(int, err)
+ ),
+
+ TP_fast_assign(
+ __assign_str(dev);
+ ether_addr_copy(__entry->addr, addr);
+ __entry->vid = vid;
+ dsa_db_print(db, __entry->db_buf);
+ __entry->err = err;
+ ),
+
+ TP_printk("%s addr %pM vid %u db \"%s\" err %d",
+ __get_str(dev), __entry->addr, __entry->vid,
+ __entry->db_buf, __entry->err)
+);
+
+TRACE_EVENT(dsa_lag_fdb_add_bump,
+
+ TP_PROTO(const struct net_device *lag_dev, const unsigned char *addr,
+ u16 vid, const struct dsa_db *db, const refcount_t *refcount),
+
+ TP_ARGS(lag_dev, addr, vid, db, refcount),
+
+ TP_STRUCT__entry(
+ __string(dev, lag_dev->name)
+ __array(unsigned char, addr, ETH_ALEN)
+ __field(u16, vid)
+ __array(char, db_buf, DSA_DB_BUFSIZ)
+ __field(unsigned int, refcount)
+ ),
+
+ TP_fast_assign(
+ __assign_str(dev);
+ ether_addr_copy(__entry->addr, addr);
+ __entry->vid = vid;
+ dsa_db_print(db, __entry->db_buf);
+ __entry->refcount = refcount_read(refcount);
+ ),
+
+ TP_printk("%s addr %pM vid %u db \"%s\" refcount %u",
+ __get_str(dev), __entry->addr, __entry->vid,
+ __entry->db_buf, __entry->refcount)
+);
+
+TRACE_EVENT(dsa_lag_fdb_del_hw,
+
+ TP_PROTO(const struct net_device *lag_dev, const unsigned char *addr,
+ u16 vid, const struct dsa_db *db, int err),
+
+ TP_ARGS(lag_dev, addr, vid, db, err),
+
+ TP_STRUCT__entry(
+ __string(dev, lag_dev->name)
+ __array(unsigned char, addr, ETH_ALEN)
+ __field(u16, vid)
+ __array(char, db_buf, DSA_DB_BUFSIZ)
+ __field(int, err)
+ ),
+
+ TP_fast_assign(
+ __assign_str(dev);
+ ether_addr_copy(__entry->addr, addr);
+ __entry->vid = vid;
+ dsa_db_print(db, __entry->db_buf);
+ __entry->err = err;
+ ),
+
+ TP_printk("%s addr %pM vid %u db \"%s\" err %d",
+ __get_str(dev), __entry->addr, __entry->vid,
+ __entry->db_buf, __entry->err)
+);
+
+TRACE_EVENT(dsa_lag_fdb_del_drop,
+
+ TP_PROTO(const struct net_device *lag_dev, const unsigned char *addr,
+ u16 vid, const struct dsa_db *db, const refcount_t *refcount),
+
+ TP_ARGS(lag_dev, addr, vid, db, refcount),
+
+ TP_STRUCT__entry(
+ __string(dev, lag_dev->name)
+ __array(unsigned char, addr, ETH_ALEN)
+ __field(u16, vid)
+ __array(char, db_buf, DSA_DB_BUFSIZ)
+ __field(unsigned int, refcount)
+ ),
+
+ TP_fast_assign(
+ __assign_str(dev);
+ ether_addr_copy(__entry->addr, addr);
+ __entry->vid = vid;
+ dsa_db_print(db, __entry->db_buf);
+ __entry->refcount = refcount_read(refcount);
+ ),
+
+ TP_printk("%s addr %pM vid %u db \"%s\" refcount %u",
+ __get_str(dev), __entry->addr, __entry->vid,
+ __entry->db_buf, __entry->refcount)
+);
+
+TRACE_EVENT(dsa_lag_fdb_del_not_found,
+
+ TP_PROTO(const struct net_device *lag_dev, const unsigned char *addr,
+ u16 vid, const struct dsa_db *db),
+
+ TP_ARGS(lag_dev, addr, vid, db),
+
+ TP_STRUCT__entry(
+ __string(dev, lag_dev->name)
+ __array(unsigned char, addr, ETH_ALEN)
+ __field(u16, vid)
+ __array(char, db_buf, DSA_DB_BUFSIZ)
+ ),
+
+ TP_fast_assign(
+ __assign_str(dev);
+ ether_addr_copy(__entry->addr, addr);
+ __entry->vid = vid;
+ dsa_db_print(db, __entry->db_buf);
+ ),
+
+ TP_printk("%s addr %pM vid %u db \"%s\"",
+ __get_str(dev), __entry->addr, __entry->vid, __entry->db_buf)
+);
+
+DECLARE_EVENT_CLASS(dsa_vlan_op_hw,
+
+ TP_PROTO(const struct dsa_port *dp,
+ const struct switchdev_obj_port_vlan *vlan, int err),
+
+ TP_ARGS(dp, vlan, err),
+
+ TP_STRUCT__entry(
+ __string(dev, dev_name(dp->ds->dev))
+ __string(kind, dsa_port_kind(dp))
+ __field(int, port)
+ __field(u16, vid)
+ __field(u16, flags)
+ __field(bool, changed)
+ __field(int, err)
+ ),
+
+ TP_fast_assign(
+ __assign_str(dev);
+ __assign_str(kind);
+ __entry->port = dp->index;
+ __entry->vid = vlan->vid;
+ __entry->flags = vlan->flags;
+ __entry->changed = vlan->changed;
+ __entry->err = err;
+ ),
+
+ TP_printk("%s %s port %d vid %u%s%s%s",
+ __get_str(dev), __get_str(kind), __entry->port, __entry->vid,
+ __entry->flags & BRIDGE_VLAN_INFO_PVID ? " pvid" : "",
+ __entry->flags & BRIDGE_VLAN_INFO_UNTAGGED ? " untagged" : "",
+ __entry->changed ? " (changed)" : "")
+);
+
+DEFINE_EVENT(dsa_vlan_op_hw, dsa_vlan_add_hw,
+ TP_PROTO(const struct dsa_port *dp,
+ const struct switchdev_obj_port_vlan *vlan, int err),
+ TP_ARGS(dp, vlan, err));
+
+DEFINE_EVENT(dsa_vlan_op_hw, dsa_vlan_del_hw,
+ TP_PROTO(const struct dsa_port *dp,
+ const struct switchdev_obj_port_vlan *vlan, int err),
+ TP_ARGS(dp, vlan, err));
+
+DECLARE_EVENT_CLASS(dsa_vlan_op_refcount,
+
+ TP_PROTO(const struct dsa_port *dp,
+ const struct switchdev_obj_port_vlan *vlan,
+ const refcount_t *refcount),
+
+ TP_ARGS(dp, vlan, refcount),
+
+ TP_STRUCT__entry(
+ __string(dev, dev_name(dp->ds->dev))
+ __string(kind, dsa_port_kind(dp))
+ __field(int, port)
+ __field(u16, vid)
+ __field(u16, flags)
+ __field(bool, changed)
+ __field(unsigned int, refcount)
+ ),
+
+ TP_fast_assign(
+ __assign_str(dev);
+ __assign_str(kind);
+ __entry->port = dp->index;
+ __entry->vid = vlan->vid;
+ __entry->flags = vlan->flags;
+ __entry->changed = vlan->changed;
+ __entry->refcount = refcount_read(refcount);
+ ),
+
+ TP_printk("%s %s port %d vid %u%s%s%s refcount %u",
+ __get_str(dev), __get_str(kind), __entry->port, __entry->vid,
+ __entry->flags & BRIDGE_VLAN_INFO_PVID ? " pvid" : "",
+ __entry->flags & BRIDGE_VLAN_INFO_UNTAGGED ? " untagged" : "",
+ __entry->changed ? " (changed)" : "", __entry->refcount)
+);
+
+DEFINE_EVENT(dsa_vlan_op_refcount, dsa_vlan_add_bump,
+ TP_PROTO(const struct dsa_port *dp,
+ const struct switchdev_obj_port_vlan *vlan,
+ const refcount_t *refcount),
+ TP_ARGS(dp, vlan, refcount));
+
+DEFINE_EVENT(dsa_vlan_op_refcount, dsa_vlan_del_drop,
+ TP_PROTO(const struct dsa_port *dp,
+ const struct switchdev_obj_port_vlan *vlan,
+ const refcount_t *refcount),
+ TP_ARGS(dp, vlan, refcount));
+
+TRACE_EVENT(dsa_vlan_del_not_found,
+
+ TP_PROTO(const struct dsa_port *dp,
+ const struct switchdev_obj_port_vlan *vlan),
+
+ TP_ARGS(dp, vlan),
+
+ TP_STRUCT__entry(
+ __string(dev, dev_name(dp->ds->dev))
+ __string(kind, dsa_port_kind(dp))
+ __field(int, port)
+ __field(u16, vid)
+ ),
+
+ TP_fast_assign(
+ __assign_str(dev);
+ __assign_str(kind);
+ __entry->port = dp->index;
+ __entry->vid = vlan->vid;
+ ),
+
+ TP_printk("%s %s port %d vid %u",
+ __get_str(dev), __get_str(kind), __entry->port, __entry->vid)
+);
+
+#endif /* _NET_DSA_TRACE_H */
+
+/* We don't want to use include/trace/events */
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/net/dsa/user.c b/net/dsa/user.c
new file mode 100644
index 000000000000..f59d66f0975d
--- /dev/null
+++ b/net/dsa/user.c
@@ -0,0 +1,3877 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * net/dsa/user.c - user device handling
+ * Copyright (c) 2008-2009 Marvell Semiconductor
+ */
+
+#include <linux/list.h>
+#include <linux/etherdevice.h>
+#include <linux/netdevice.h>
+#include <linux/phy.h>
+#include <linux/phy_fixed.h>
+#include <linux/phylink.h>
+#include <linux/of_net.h>
+#include <linux/of_mdio.h>
+#include <linux/mdio.h>
+#include <net/rtnetlink.h>
+#include <net/pkt_cls.h>
+#include <net/selftests.h>
+#include <net/tc_act/tc_mirred.h>
+#include <linux/if_bridge.h>
+#include <linux/if_hsr.h>
+#include <net/dcbnl.h>
+#include <linux/netpoll.h>
+#include <linux/string.h>
+
+#include "conduit.h"
+#include "dsa.h"
+#include "netlink.h"
+#include "port.h"
+#include "switch.h"
+#include "tag.h"
+#include "user.h"
+
+struct dsa_switchdev_event_work {
+ struct net_device *dev;
+ struct net_device *orig_dev;
+ struct work_struct work;
+ unsigned long event;
+ /* Specific for SWITCHDEV_FDB_ADD_TO_DEVICE and
+ * SWITCHDEV_FDB_DEL_TO_DEVICE
+ */
+ unsigned char addr[ETH_ALEN];
+ u16 vid;
+ bool host_addr;
+};
+
+enum dsa_standalone_event {
+ DSA_UC_ADD,
+ DSA_UC_DEL,
+ DSA_MC_ADD,
+ DSA_MC_DEL,
+};
+
+struct dsa_standalone_event_work {
+ struct work_struct work;
+ struct net_device *dev;
+ enum dsa_standalone_event event;
+ unsigned char addr[ETH_ALEN];
+ u16 vid;
+};
+
+struct dsa_host_vlan_rx_filtering_ctx {
+ struct net_device *dev;
+ const unsigned char *addr;
+ enum dsa_standalone_event event;
+};
+
+static bool dsa_switch_supports_uc_filtering(struct dsa_switch *ds)
+{
+ return ds->ops->port_fdb_add && ds->ops->port_fdb_del &&
+ ds->fdb_isolation && !ds->vlan_filtering_is_global &&
+ !ds->needs_standalone_vlan_filtering;
+}
+
+static bool dsa_switch_supports_mc_filtering(struct dsa_switch *ds)
+{
+ return ds->ops->port_mdb_add && ds->ops->port_mdb_del &&
+ ds->fdb_isolation && !ds->vlan_filtering_is_global &&
+ !ds->needs_standalone_vlan_filtering;
+}
+
+static void dsa_user_standalone_event_work(struct work_struct *work)
+{
+ struct dsa_standalone_event_work *standalone_work =
+ container_of(work, struct dsa_standalone_event_work, work);
+ const unsigned char *addr = standalone_work->addr;
+ struct net_device *dev = standalone_work->dev;
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct switchdev_obj_port_mdb mdb;
+ struct dsa_switch *ds = dp->ds;
+ u16 vid = standalone_work->vid;
+ int err;
+
+ switch (standalone_work->event) {
+ case DSA_UC_ADD:
+ err = dsa_port_standalone_host_fdb_add(dp, addr, vid);
+ if (err) {
+ dev_err(ds->dev,
+ "port %d failed to add %pM vid %d to fdb: %d\n",
+ dp->index, addr, vid, err);
+ break;
+ }
+ break;
+
+ case DSA_UC_DEL:
+ err = dsa_port_standalone_host_fdb_del(dp, addr, vid);
+ if (err) {
+ dev_err(ds->dev,
+ "port %d failed to delete %pM vid %d from fdb: %d\n",
+ dp->index, addr, vid, err);
+ }
+
+ break;
+ case DSA_MC_ADD:
+ ether_addr_copy(mdb.addr, addr);
+ mdb.vid = vid;
+
+ err = dsa_port_standalone_host_mdb_add(dp, &mdb);
+ if (err) {
+ dev_err(ds->dev,
+ "port %d failed to add %pM vid %d to mdb: %d\n",
+ dp->index, addr, vid, err);
+ break;
+ }
+ break;
+ case DSA_MC_DEL:
+ ether_addr_copy(mdb.addr, addr);
+ mdb.vid = vid;
+
+ err = dsa_port_standalone_host_mdb_del(dp, &mdb);
+ if (err) {
+ dev_err(ds->dev,
+ "port %d failed to delete %pM vid %d from mdb: %d\n",
+ dp->index, addr, vid, err);
+ }
+
+ break;
+ }
+
+ kfree(standalone_work);
+}
+
+static int dsa_user_schedule_standalone_work(struct net_device *dev,
+ enum dsa_standalone_event event,
+ const unsigned char *addr,
+ u16 vid)
+{
+ struct dsa_standalone_event_work *standalone_work;
+
+ standalone_work = kzalloc(sizeof(*standalone_work), GFP_ATOMIC);
+ if (!standalone_work)
+ return -ENOMEM;
+
+ INIT_WORK(&standalone_work->work, dsa_user_standalone_event_work);
+ standalone_work->event = event;
+ standalone_work->dev = dev;
+
+ ether_addr_copy(standalone_work->addr, addr);
+ standalone_work->vid = vid;
+
+ dsa_schedule_work(&standalone_work->work);
+
+ return 0;
+}
+
+static int dsa_user_host_vlan_rx_filtering(void *arg, int vid)
+{
+ struct dsa_host_vlan_rx_filtering_ctx *ctx = arg;
+
+ return dsa_user_schedule_standalone_work(ctx->dev, ctx->event,
+ ctx->addr, vid);
+}
+
+static int dsa_user_vlan_for_each(struct net_device *dev,
+ int (*cb)(void *arg, int vid), void *arg)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct dsa_vlan *v;
+ int err;
+
+ lockdep_assert_held(&dev->addr_list_lock);
+
+ err = cb(arg, 0);
+ if (err)
+ return err;
+
+ list_for_each_entry(v, &dp->user_vlans, list) {
+ err = cb(arg, v->vid);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+static int dsa_user_sync_uc(struct net_device *dev,
+ const unsigned char *addr)
+{
+ struct net_device *conduit = dsa_user_to_conduit(dev);
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct dsa_host_vlan_rx_filtering_ctx ctx = {
+ .dev = dev,
+ .addr = addr,
+ .event = DSA_UC_ADD,
+ };
+
+ dev_uc_add(conduit, addr);
+
+ if (!dsa_switch_supports_uc_filtering(dp->ds))
+ return 0;
+
+ return dsa_user_vlan_for_each(dev, dsa_user_host_vlan_rx_filtering,
+ &ctx);
+}
+
+static int dsa_user_unsync_uc(struct net_device *dev,
+ const unsigned char *addr)
+{
+ struct net_device *conduit = dsa_user_to_conduit(dev);
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct dsa_host_vlan_rx_filtering_ctx ctx = {
+ .dev = dev,
+ .addr = addr,
+ .event = DSA_UC_DEL,
+ };
+
+ dev_uc_del(conduit, addr);
+
+ if (!dsa_switch_supports_uc_filtering(dp->ds))
+ return 0;
+
+ return dsa_user_vlan_for_each(dev, dsa_user_host_vlan_rx_filtering,
+ &ctx);
+}
+
+static int dsa_user_sync_mc(struct net_device *dev,
+ const unsigned char *addr)
+{
+ struct net_device *conduit = dsa_user_to_conduit(dev);
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct dsa_host_vlan_rx_filtering_ctx ctx = {
+ .dev = dev,
+ .addr = addr,
+ .event = DSA_MC_ADD,
+ };
+
+ dev_mc_add(conduit, addr);
+
+ if (!dsa_switch_supports_mc_filtering(dp->ds))
+ return 0;
+
+ return dsa_user_vlan_for_each(dev, dsa_user_host_vlan_rx_filtering,
+ &ctx);
+}
+
+static int dsa_user_unsync_mc(struct net_device *dev,
+ const unsigned char *addr)
+{
+ struct net_device *conduit = dsa_user_to_conduit(dev);
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct dsa_host_vlan_rx_filtering_ctx ctx = {
+ .dev = dev,
+ .addr = addr,
+ .event = DSA_MC_DEL,
+ };
+
+ dev_mc_del(conduit, addr);
+
+ if (!dsa_switch_supports_mc_filtering(dp->ds))
+ return 0;
+
+ return dsa_user_vlan_for_each(dev, dsa_user_host_vlan_rx_filtering,
+ &ctx);
+}
+
+void dsa_user_sync_ha(struct net_device *dev)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct dsa_switch *ds = dp->ds;
+ struct netdev_hw_addr *ha;
+
+ netif_addr_lock_bh(dev);
+
+ netdev_for_each_synced_mc_addr(ha, dev)
+ dsa_user_sync_mc(dev, ha->addr);
+
+ netdev_for_each_synced_uc_addr(ha, dev)
+ dsa_user_sync_uc(dev, ha->addr);
+
+ netif_addr_unlock_bh(dev);
+
+ if (dsa_switch_supports_uc_filtering(ds) ||
+ dsa_switch_supports_mc_filtering(ds))
+ dsa_flush_workqueue();
+}
+
+void dsa_user_unsync_ha(struct net_device *dev)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct dsa_switch *ds = dp->ds;
+ struct netdev_hw_addr *ha;
+
+ netif_addr_lock_bh(dev);
+
+ netdev_for_each_synced_uc_addr(ha, dev)
+ dsa_user_unsync_uc(dev, ha->addr);
+
+ netdev_for_each_synced_mc_addr(ha, dev)
+ dsa_user_unsync_mc(dev, ha->addr);
+
+ netif_addr_unlock_bh(dev);
+
+ if (dsa_switch_supports_uc_filtering(ds) ||
+ dsa_switch_supports_mc_filtering(ds))
+ dsa_flush_workqueue();
+}
+
+/* user mii_bus handling ***************************************************/
+static int dsa_user_phy_read(struct mii_bus *bus, int addr, int reg)
+{
+ struct dsa_switch *ds = bus->priv;
+
+ if (ds->phys_mii_mask & (1 << addr))
+ return ds->ops->phy_read(ds, addr, reg);
+
+ return 0xffff;
+}
+
+static int dsa_user_phy_write(struct mii_bus *bus, int addr, int reg, u16 val)
+{
+ struct dsa_switch *ds = bus->priv;
+
+ if (ds->phys_mii_mask & (1 << addr))
+ return ds->ops->phy_write(ds, addr, reg, val);
+
+ return 0;
+}
+
+void dsa_user_mii_bus_init(struct dsa_switch *ds)
+{
+ ds->user_mii_bus->priv = (void *)ds;
+ ds->user_mii_bus->name = "dsa user smi";
+ ds->user_mii_bus->read = dsa_user_phy_read;
+ ds->user_mii_bus->write = dsa_user_phy_write;
+ snprintf(ds->user_mii_bus->id, MII_BUS_ID_SIZE, "dsa-%d.%d",
+ ds->dst->index, ds->index);
+ ds->user_mii_bus->parent = ds->dev;
+ ds->user_mii_bus->phy_mask = ~ds->phys_mii_mask;
+}
+
+
+/* user device handling ****************************************************/
+static int dsa_user_get_iflink(const struct net_device *dev)
+{
+ return READ_ONCE(dsa_user_to_conduit(dev)->ifindex);
+}
+
+int dsa_user_host_uc_install(struct net_device *dev, const u8 *addr)
+{
+ struct net_device *conduit = dsa_user_to_conduit(dev);
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct dsa_switch *ds = dp->ds;
+ int err;
+
+ if (dsa_switch_supports_uc_filtering(ds)) {
+ err = dsa_port_standalone_host_fdb_add(dp, addr, 0);
+ if (err)
+ goto out;
+ }
+
+ if (!ether_addr_equal(addr, conduit->dev_addr)) {
+ err = dev_uc_add(conduit, addr);
+ if (err < 0)
+ goto del_host_addr;
+ }
+
+ return 0;
+
+del_host_addr:
+ if (dsa_switch_supports_uc_filtering(ds))
+ dsa_port_standalone_host_fdb_del(dp, addr, 0);
+out:
+ return err;
+}
+
+void dsa_user_host_uc_uninstall(struct net_device *dev)
+{
+ struct net_device *conduit = dsa_user_to_conduit(dev);
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct dsa_switch *ds = dp->ds;
+
+ if (!ether_addr_equal(dev->dev_addr, conduit->dev_addr))
+ dev_uc_del(conduit, dev->dev_addr);
+
+ if (dsa_switch_supports_uc_filtering(ds))
+ dsa_port_standalone_host_fdb_del(dp, dev->dev_addr, 0);
+}
+
+static int dsa_user_open(struct net_device *dev)
+{
+ struct net_device *conduit = dsa_user_to_conduit(dev);
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ int err;
+
+ err = dev_open(conduit, NULL);
+ if (err < 0) {
+ netdev_err(dev, "failed to open conduit %s\n", conduit->name);
+ goto out;
+ }
+
+ err = dsa_user_host_uc_install(dev, dev->dev_addr);
+ if (err)
+ goto out;
+
+ err = dsa_port_enable_rt(dp, dev->phydev);
+ if (err)
+ goto out_del_host_uc;
+
+ return 0;
+
+out_del_host_uc:
+ dsa_user_host_uc_uninstall(dev);
+out:
+ return err;
+}
+
+static int dsa_user_close(struct net_device *dev)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+
+ dsa_port_disable_rt(dp);
+
+ dsa_user_host_uc_uninstall(dev);
+
+ return 0;
+}
+
+static void dsa_user_manage_host_flood(struct net_device *dev)
+{
+ bool mc = dev->flags & (IFF_PROMISC | IFF_ALLMULTI);
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ bool uc = dev->flags & IFF_PROMISC;
+
+ dsa_port_set_host_flood(dp, uc, mc);
+}
+
+static void dsa_user_change_rx_flags(struct net_device *dev, int change)
+{
+ struct net_device *conduit = dsa_user_to_conduit(dev);
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct dsa_switch *ds = dp->ds;
+
+ if (change & IFF_ALLMULTI)
+ dev_set_allmulti(conduit,
+ dev->flags & IFF_ALLMULTI ? 1 : -1);
+ if (change & IFF_PROMISC)
+ dev_set_promiscuity(conduit,
+ dev->flags & IFF_PROMISC ? 1 : -1);
+
+ if (dsa_switch_supports_uc_filtering(ds) &&
+ dsa_switch_supports_mc_filtering(ds))
+ dsa_user_manage_host_flood(dev);
+}
+
+static void dsa_user_set_rx_mode(struct net_device *dev)
+{
+ __dev_mc_sync(dev, dsa_user_sync_mc, dsa_user_unsync_mc);
+ __dev_uc_sync(dev, dsa_user_sync_uc, dsa_user_unsync_uc);
+}
+
+static int dsa_user_set_mac_address(struct net_device *dev, void *a)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct dsa_switch *ds = dp->ds;
+ struct sockaddr *addr = a;
+ int err;
+
+ if (!is_valid_ether_addr(addr->sa_data))
+ return -EADDRNOTAVAIL;
+
+ if (ds->ops->port_set_mac_address) {
+ err = ds->ops->port_set_mac_address(ds, dp->index,
+ addr->sa_data);
+ if (err)
+ return err;
+ }
+
+ /* If the port is down, the address isn't synced yet to hardware or
+ * to the DSA conduit, so there is nothing to change.
+ */
+ if (!(dev->flags & IFF_UP))
+ goto out_change_dev_addr;
+
+ err = dsa_user_host_uc_install(dev, addr->sa_data);
+ if (err)
+ return err;
+
+ dsa_user_host_uc_uninstall(dev);
+
+out_change_dev_addr:
+ eth_hw_addr_set(dev, addr->sa_data);
+
+ return 0;
+}
+
+struct dsa_user_dump_ctx {
+ struct net_device *dev;
+ struct sk_buff *skb;
+ struct netlink_callback *cb;
+ int idx;
+};
+
+static int
+dsa_user_port_fdb_do_dump(const unsigned char *addr, u16 vid,
+ bool is_static, void *data)
+{
+ struct dsa_user_dump_ctx *dump = data;
+ struct ndo_fdb_dump_context *ctx = (void *)dump->cb->ctx;
+ u32 portid = NETLINK_CB(dump->cb->skb).portid;
+ u32 seq = dump->cb->nlh->nlmsg_seq;
+ struct nlmsghdr *nlh;
+ struct ndmsg *ndm;
+
+ if (dump->idx < ctx->fdb_idx)
+ goto skip;
+
+ nlh = nlmsg_put(dump->skb, portid, seq, RTM_NEWNEIGH,
+ sizeof(*ndm), NLM_F_MULTI);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ ndm = nlmsg_data(nlh);
+ ndm->ndm_family = AF_BRIDGE;
+ ndm->ndm_pad1 = 0;
+ ndm->ndm_pad2 = 0;
+ ndm->ndm_flags = NTF_SELF;
+ ndm->ndm_type = 0;
+ ndm->ndm_ifindex = dump->dev->ifindex;
+ ndm->ndm_state = is_static ? NUD_NOARP : NUD_REACHABLE;
+
+ if (nla_put(dump->skb, NDA_LLADDR, ETH_ALEN, addr))
+ goto nla_put_failure;
+
+ if (vid && nla_put_u16(dump->skb, NDA_VLAN, vid))
+ goto nla_put_failure;
+
+ nlmsg_end(dump->skb, nlh);
+
+skip:
+ dump->idx++;
+ return 0;
+
+nla_put_failure:
+ nlmsg_cancel(dump->skb, nlh);
+ return -EMSGSIZE;
+}
+
+static int
+dsa_user_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
+ struct net_device *dev, struct net_device *filter_dev,
+ int *idx)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct dsa_user_dump_ctx dump = {
+ .dev = dev,
+ .skb = skb,
+ .cb = cb,
+ .idx = *idx,
+ };
+ int err;
+
+ err = dsa_port_fdb_dump(dp, dsa_user_port_fdb_do_dump, &dump);
+ *idx = dump.idx;
+
+ return err;
+}
+
+static int dsa_user_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+{
+ struct dsa_user_priv *p = netdev_priv(dev);
+
+ return phylink_mii_ioctl(p->dp->pl, ifr, cmd);
+}
+
+static int dsa_user_port_attr_set(struct net_device *dev, const void *ctx,
+ const struct switchdev_attr *attr,
+ struct netlink_ext_ack *extack)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ int ret;
+
+ if (ctx && ctx != dp)
+ return 0;
+
+ switch (attr->id) {
+ case SWITCHDEV_ATTR_ID_PORT_STP_STATE:
+ if (!dsa_port_offloads_bridge_port(dp, attr->orig_dev))
+ return -EOPNOTSUPP;
+
+ ret = dsa_port_set_state(dp, attr->u.stp_state, true);
+ break;
+ case SWITCHDEV_ATTR_ID_PORT_MST_STATE:
+ if (!dsa_port_offloads_bridge_port(dp, attr->orig_dev))
+ return -EOPNOTSUPP;
+
+ ret = dsa_port_set_mst_state(dp, &attr->u.mst_state, extack);
+ break;
+ case SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING:
+ if (!dsa_port_offloads_bridge_dev(dp, attr->orig_dev))
+ return -EOPNOTSUPP;
+
+ ret = dsa_port_vlan_filtering(dp, attr->u.vlan_filtering,
+ extack);
+ break;
+ case SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME:
+ if (!dsa_port_offloads_bridge_dev(dp, attr->orig_dev))
+ return -EOPNOTSUPP;
+
+ ret = dsa_port_ageing_time(dp, attr->u.ageing_time);
+ break;
+ case SWITCHDEV_ATTR_ID_BRIDGE_MST:
+ if (!dsa_port_offloads_bridge_dev(dp, attr->orig_dev))
+ return -EOPNOTSUPP;
+
+ ret = dsa_port_mst_enable(dp, attr->u.mst, extack);
+ break;
+ case SWITCHDEV_ATTR_ID_PORT_PRE_BRIDGE_FLAGS:
+ if (!dsa_port_offloads_bridge_port(dp, attr->orig_dev))
+ return -EOPNOTSUPP;
+
+ ret = dsa_port_pre_bridge_flags(dp, attr->u.brport_flags,
+ extack);
+ break;
+ case SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS:
+ if (!dsa_port_offloads_bridge_port(dp, attr->orig_dev))
+ return -EOPNOTSUPP;
+
+ ret = dsa_port_bridge_flags(dp, attr->u.brport_flags, extack);
+ break;
+ case SWITCHDEV_ATTR_ID_VLAN_MSTI:
+ if (!dsa_port_offloads_bridge_dev(dp, attr->orig_dev))
+ return -EOPNOTSUPP;
+
+ ret = dsa_port_vlan_msti(dp, &attr->u.vlan_msti);
+ break;
+ default:
+ ret = -EOPNOTSUPP;
+ break;
+ }
+
+ return ret;
+}
+
+/* Must be called under rcu_read_lock() */
+static int
+dsa_user_vlan_check_for_8021q_uppers(struct net_device *user,
+ const struct switchdev_obj_port_vlan *vlan)
+{
+ struct net_device *upper_dev;
+ struct list_head *iter;
+
+ netdev_for_each_upper_dev_rcu(user, upper_dev, iter) {
+ u16 vid;
+
+ if (!is_vlan_dev(upper_dev))
+ continue;
+
+ vid = vlan_dev_vlan_id(upper_dev);
+ if (vid == vlan->vid)
+ return -EBUSY;
+ }
+
+ return 0;
+}
+
+static int dsa_user_vlan_add(struct net_device *dev,
+ const struct switchdev_obj *obj,
+ struct netlink_ext_ack *extack)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct switchdev_obj_port_vlan *vlan;
+ int err;
+
+ if (dsa_port_skip_vlan_configuration(dp)) {
+ NL_SET_ERR_MSG_MOD(extack, "skipping configuration of VLAN");
+ return 0;
+ }
+
+ vlan = SWITCHDEV_OBJ_PORT_VLAN(obj);
+
+ /* Deny adding a bridge VLAN when there is already an 802.1Q upper with
+ * the same VID.
+ */
+ if (br_vlan_enabled(dsa_port_bridge_dev_get(dp))) {
+ rcu_read_lock();
+ err = dsa_user_vlan_check_for_8021q_uppers(dev, vlan);
+ rcu_read_unlock();
+ if (err) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Port already has a VLAN upper with this VID");
+ return err;
+ }
+ }
+
+ return dsa_port_vlan_add(dp, vlan, extack);
+}
+
+/* Offload a VLAN installed on the bridge or on a foreign interface by
+ * installing it as a VLAN towards the CPU port.
+ */
+static int dsa_user_host_vlan_add(struct net_device *dev,
+ const struct switchdev_obj *obj,
+ struct netlink_ext_ack *extack)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct switchdev_obj_port_vlan vlan;
+
+ /* Do nothing if this is a software bridge */
+ if (!dp->bridge)
+ return -EOPNOTSUPP;
+
+ if (dsa_port_skip_vlan_configuration(dp)) {
+ NL_SET_ERR_MSG_MOD(extack, "skipping configuration of VLAN");
+ return 0;
+ }
+
+ vlan = *SWITCHDEV_OBJ_PORT_VLAN(obj);
+
+ /* Even though drivers often handle CPU membership in special ways,
+ * it doesn't make sense to program a PVID, so clear this flag.
+ */
+ vlan.flags &= ~BRIDGE_VLAN_INFO_PVID;
+
+ return dsa_port_host_vlan_add(dp, &vlan, extack);
+}
+
+static int dsa_user_port_obj_add(struct net_device *dev, const void *ctx,
+ const struct switchdev_obj *obj,
+ struct netlink_ext_ack *extack)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ int err;
+
+ if (ctx && ctx != dp)
+ return 0;
+
+ switch (obj->id) {
+ case SWITCHDEV_OBJ_ID_PORT_MDB:
+ if (!dsa_port_offloads_bridge_port(dp, obj->orig_dev))
+ return -EOPNOTSUPP;
+
+ err = dsa_port_mdb_add(dp, SWITCHDEV_OBJ_PORT_MDB(obj));
+ break;
+ case SWITCHDEV_OBJ_ID_HOST_MDB:
+ if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev))
+ return -EOPNOTSUPP;
+
+ err = dsa_port_bridge_host_mdb_add(dp, SWITCHDEV_OBJ_PORT_MDB(obj));
+ break;
+ case SWITCHDEV_OBJ_ID_PORT_VLAN:
+ if (dsa_port_offloads_bridge_port(dp, obj->orig_dev))
+ err = dsa_user_vlan_add(dev, obj, extack);
+ else
+ err = dsa_user_host_vlan_add(dev, obj, extack);
+ break;
+ case SWITCHDEV_OBJ_ID_MRP:
+ if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev))
+ return -EOPNOTSUPP;
+
+ err = dsa_port_mrp_add(dp, SWITCHDEV_OBJ_MRP(obj));
+ break;
+ case SWITCHDEV_OBJ_ID_RING_ROLE_MRP:
+ if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev))
+ return -EOPNOTSUPP;
+
+ err = dsa_port_mrp_add_ring_role(dp,
+ SWITCHDEV_OBJ_RING_ROLE_MRP(obj));
+ break;
+ default:
+ err = -EOPNOTSUPP;
+ break;
+ }
+
+ return err;
+}
+
+static int dsa_user_vlan_del(struct net_device *dev,
+ const struct switchdev_obj *obj)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct switchdev_obj_port_vlan *vlan;
+
+ if (dsa_port_skip_vlan_configuration(dp))
+ return 0;
+
+ vlan = SWITCHDEV_OBJ_PORT_VLAN(obj);
+
+ return dsa_port_vlan_del(dp, vlan);
+}
+
+static int dsa_user_host_vlan_del(struct net_device *dev,
+ const struct switchdev_obj *obj)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct switchdev_obj_port_vlan *vlan;
+
+ /* Do nothing if this is a software bridge */
+ if (!dp->bridge)
+ return -EOPNOTSUPP;
+
+ if (dsa_port_skip_vlan_configuration(dp))
+ return 0;
+
+ vlan = SWITCHDEV_OBJ_PORT_VLAN(obj);
+
+ return dsa_port_host_vlan_del(dp, vlan);
+}
+
+static int dsa_user_port_obj_del(struct net_device *dev, const void *ctx,
+ const struct switchdev_obj *obj)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ int err;
+
+ if (ctx && ctx != dp)
+ return 0;
+
+ switch (obj->id) {
+ case SWITCHDEV_OBJ_ID_PORT_MDB:
+ if (!dsa_port_offloads_bridge_port(dp, obj->orig_dev))
+ return -EOPNOTSUPP;
+
+ err = dsa_port_mdb_del(dp, SWITCHDEV_OBJ_PORT_MDB(obj));
+ break;
+ case SWITCHDEV_OBJ_ID_HOST_MDB:
+ if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev))
+ return -EOPNOTSUPP;
+
+ err = dsa_port_bridge_host_mdb_del(dp, SWITCHDEV_OBJ_PORT_MDB(obj));
+ break;
+ case SWITCHDEV_OBJ_ID_PORT_VLAN:
+ if (dsa_port_offloads_bridge_port(dp, obj->orig_dev))
+ err = dsa_user_vlan_del(dev, obj);
+ else
+ err = dsa_user_host_vlan_del(dev, obj);
+ break;
+ case SWITCHDEV_OBJ_ID_MRP:
+ if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev))
+ return -EOPNOTSUPP;
+
+ err = dsa_port_mrp_del(dp, SWITCHDEV_OBJ_MRP(obj));
+ break;
+ case SWITCHDEV_OBJ_ID_RING_ROLE_MRP:
+ if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev))
+ return -EOPNOTSUPP;
+
+ err = dsa_port_mrp_del_ring_role(dp,
+ SWITCHDEV_OBJ_RING_ROLE_MRP(obj));
+ break;
+ default:
+ err = -EOPNOTSUPP;
+ break;
+ }
+
+ return err;
+}
+
+static netdev_tx_t dsa_user_netpoll_send_skb(struct net_device *dev,
+ struct sk_buff *skb)
+{
+#ifdef CONFIG_NET_POLL_CONTROLLER
+ struct dsa_user_priv *p = netdev_priv(dev);
+
+ return netpoll_send_skb(p->netpoll, skb);
+#else
+ BUG();
+ return NETDEV_TX_OK;
+#endif
+}
+
+static void dsa_skb_tx_timestamp(struct dsa_user_priv *p,
+ struct sk_buff *skb)
+{
+ struct dsa_switch *ds = p->dp->ds;
+
+ if (!(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP_NOBPF))
+ return;
+
+ if (!ds->ops->port_txtstamp)
+ return;
+
+ ds->ops->port_txtstamp(ds, p->dp->index, skb);
+}
+
+netdev_tx_t dsa_enqueue_skb(struct sk_buff *skb, struct net_device *dev)
+{
+ /* SKB for netpoll still need to be mangled with the protocol-specific
+ * tag to be successfully transmitted
+ */
+ if (unlikely(netpoll_tx_running(dev)))
+ return dsa_user_netpoll_send_skb(dev, skb);
+
+ /* Queue the SKB for transmission on the parent interface, but
+ * do not modify its EtherType
+ */
+ skb->dev = dsa_user_to_conduit(dev);
+ dev_queue_xmit(skb);
+
+ return NETDEV_TX_OK;
+}
+EXPORT_SYMBOL_GPL(dsa_enqueue_skb);
+
+static netdev_tx_t dsa_user_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct dsa_user_priv *p = netdev_priv(dev);
+ struct sk_buff *nskb;
+
+ dev_sw_netstats_tx_add(dev, 1, skb->len);
+
+ memset(skb->cb, 0, sizeof(skb->cb));
+
+ /* Handle tx timestamp if any */
+ dsa_skb_tx_timestamp(p, skb);
+
+ if (skb_ensure_writable_head_tail(skb, dev)) {
+ dev_kfree_skb_any(skb);
+ return NETDEV_TX_OK;
+ }
+
+ /* needed_tailroom should still be 'warm' in the cache line from
+ * skb_ensure_writable_head_tail(), which has also ensured that
+ * padding is safe.
+ */
+ if (dev->needed_tailroom)
+ eth_skb_pad(skb);
+
+ /* Transmit function may have to reallocate the original SKB,
+ * in which case it must have freed it. Only free it here on error.
+ */
+ nskb = p->xmit(skb, dev);
+ if (!nskb) {
+ kfree_skb(skb);
+ return NETDEV_TX_OK;
+ }
+
+ return dsa_enqueue_skb(nskb, dev);
+}
+
+/* ethtool operations *******************************************************/
+
+static void dsa_user_get_drvinfo(struct net_device *dev,
+ struct ethtool_drvinfo *drvinfo)
+{
+ strscpy(drvinfo->driver, "dsa", sizeof(drvinfo->driver));
+ strscpy(drvinfo->fw_version, "N/A", sizeof(drvinfo->fw_version));
+ strscpy(drvinfo->bus_info, "platform", sizeof(drvinfo->bus_info));
+}
+
+static int dsa_user_get_regs_len(struct net_device *dev)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct dsa_switch *ds = dp->ds;
+
+ if (ds->ops->get_regs_len)
+ return ds->ops->get_regs_len(ds, dp->index);
+
+ return -EOPNOTSUPP;
+}
+
+static void
+dsa_user_get_regs(struct net_device *dev, struct ethtool_regs *regs, void *_p)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct dsa_switch *ds = dp->ds;
+
+ if (ds->ops->get_regs)
+ ds->ops->get_regs(ds, dp->index, regs, _p);
+}
+
+static int dsa_user_nway_reset(struct net_device *dev)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+
+ return phylink_ethtool_nway_reset(dp->pl);
+}
+
+static int dsa_user_get_eeprom_len(struct net_device *dev)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct dsa_switch *ds = dp->ds;
+
+ if (ds->cd && ds->cd->eeprom_len)
+ return ds->cd->eeprom_len;
+
+ if (ds->ops->get_eeprom_len)
+ return ds->ops->get_eeprom_len(ds);
+
+ return 0;
+}
+
+static int dsa_user_get_eeprom(struct net_device *dev,
+ struct ethtool_eeprom *eeprom, u8 *data)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct dsa_switch *ds = dp->ds;
+
+ if (ds->ops->get_eeprom)
+ return ds->ops->get_eeprom(ds, eeprom, data);
+
+ return -EOPNOTSUPP;
+}
+
+static int dsa_user_set_eeprom(struct net_device *dev,
+ struct ethtool_eeprom *eeprom, u8 *data)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct dsa_switch *ds = dp->ds;
+
+ if (ds->ops->set_eeprom)
+ return ds->ops->set_eeprom(ds, eeprom, data);
+
+ return -EOPNOTSUPP;
+}
+
+static void dsa_user_get_strings(struct net_device *dev,
+ uint32_t stringset, uint8_t *data)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct dsa_switch *ds = dp->ds;
+
+ if (stringset == ETH_SS_STATS) {
+ ethtool_puts(&data, "tx_packets");
+ ethtool_puts(&data, "tx_bytes");
+ ethtool_puts(&data, "rx_packets");
+ ethtool_puts(&data, "rx_bytes");
+ if (ds->ops->get_strings)
+ ds->ops->get_strings(ds, dp->index, stringset, data);
+ } else if (stringset == ETH_SS_TEST) {
+ net_selftest_get_strings(data);
+ }
+
+}
+
+static void dsa_user_get_ethtool_stats(struct net_device *dev,
+ struct ethtool_stats *stats,
+ uint64_t *data)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct dsa_switch *ds = dp->ds;
+ struct pcpu_sw_netstats *s;
+ unsigned int start;
+ int i;
+
+ for_each_possible_cpu(i) {
+ u64 tx_packets, tx_bytes, rx_packets, rx_bytes;
+
+ s = per_cpu_ptr(dev->tstats, i);
+ do {
+ start = u64_stats_fetch_begin(&s->syncp);
+ tx_packets = u64_stats_read(&s->tx_packets);
+ tx_bytes = u64_stats_read(&s->tx_bytes);
+ rx_packets = u64_stats_read(&s->rx_packets);
+ rx_bytes = u64_stats_read(&s->rx_bytes);
+ } while (u64_stats_fetch_retry(&s->syncp, start));
+ data[0] += tx_packets;
+ data[1] += tx_bytes;
+ data[2] += rx_packets;
+ data[3] += rx_bytes;
+ }
+ if (ds->ops->get_ethtool_stats)
+ ds->ops->get_ethtool_stats(ds, dp->index, data + 4);
+}
+
+static int dsa_user_get_sset_count(struct net_device *dev, int sset)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct dsa_switch *ds = dp->ds;
+
+ if (sset == ETH_SS_STATS) {
+ int count = 0;
+
+ if (ds->ops->get_sset_count) {
+ count = ds->ops->get_sset_count(ds, dp->index, sset);
+ if (count < 0)
+ return count;
+ }
+
+ return count + 4;
+ } else if (sset == ETH_SS_TEST) {
+ return net_selftest_get_count();
+ }
+
+ return -EOPNOTSUPP;
+}
+
+static void dsa_user_get_eth_phy_stats(struct net_device *dev,
+ struct ethtool_eth_phy_stats *phy_stats)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct dsa_switch *ds = dp->ds;
+
+ if (ds->ops->get_eth_phy_stats)
+ ds->ops->get_eth_phy_stats(ds, dp->index, phy_stats);
+}
+
+static void dsa_user_get_eth_mac_stats(struct net_device *dev,
+ struct ethtool_eth_mac_stats *mac_stats)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct dsa_switch *ds = dp->ds;
+
+ if (ds->ops->get_eth_mac_stats)
+ ds->ops->get_eth_mac_stats(ds, dp->index, mac_stats);
+}
+
+static void
+dsa_user_get_eth_ctrl_stats(struct net_device *dev,
+ struct ethtool_eth_ctrl_stats *ctrl_stats)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct dsa_switch *ds = dp->ds;
+
+ if (ds->ops->get_eth_ctrl_stats)
+ ds->ops->get_eth_ctrl_stats(ds, dp->index, ctrl_stats);
+}
+
+static void
+dsa_user_get_rmon_stats(struct net_device *dev,
+ struct ethtool_rmon_stats *rmon_stats,
+ const struct ethtool_rmon_hist_range **ranges)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct dsa_switch *ds = dp->ds;
+
+ if (ds->ops->get_rmon_stats)
+ ds->ops->get_rmon_stats(ds, dp->index, rmon_stats, ranges);
+}
+
+static void dsa_user_get_ts_stats(struct net_device *dev,
+ struct ethtool_ts_stats *ts_stats)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct dsa_switch *ds = dp->ds;
+
+ if (ds->ops->get_ts_stats)
+ ds->ops->get_ts_stats(ds, dp->index, ts_stats);
+}
+
+static void dsa_user_net_selftest(struct net_device *ndev,
+ struct ethtool_test *etest, u64 *buf)
+{
+ struct dsa_port *dp = dsa_user_to_port(ndev);
+ struct dsa_switch *ds = dp->ds;
+
+ if (ds->ops->self_test) {
+ ds->ops->self_test(ds, dp->index, etest, buf);
+ return;
+ }
+
+ net_selftest(ndev, etest, buf);
+}
+
+static int dsa_user_get_mm(struct net_device *dev,
+ struct ethtool_mm_state *state)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct dsa_switch *ds = dp->ds;
+
+ if (!ds->ops->get_mm)
+ return -EOPNOTSUPP;
+
+ return ds->ops->get_mm(ds, dp->index, state);
+}
+
+static int dsa_user_set_mm(struct net_device *dev, struct ethtool_mm_cfg *cfg,
+ struct netlink_ext_ack *extack)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct dsa_switch *ds = dp->ds;
+
+ if (!ds->ops->set_mm)
+ return -EOPNOTSUPP;
+
+ return ds->ops->set_mm(ds, dp->index, cfg, extack);
+}
+
+static void dsa_user_get_mm_stats(struct net_device *dev,
+ struct ethtool_mm_stats *stats)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct dsa_switch *ds = dp->ds;
+
+ if (ds->ops->get_mm_stats)
+ ds->ops->get_mm_stats(ds, dp->index, stats);
+}
+
+static void dsa_user_get_wol(struct net_device *dev, struct ethtool_wolinfo *w)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct dsa_switch *ds = dp->ds;
+
+ phylink_ethtool_get_wol(dp->pl, w);
+
+ if (ds->ops->get_wol)
+ ds->ops->get_wol(ds, dp->index, w);
+}
+
+static int dsa_user_set_wol(struct net_device *dev, struct ethtool_wolinfo *w)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct dsa_switch *ds = dp->ds;
+ int ret = -EOPNOTSUPP;
+
+ phylink_ethtool_set_wol(dp->pl, w);
+
+ if (ds->ops->set_wol)
+ ret = ds->ops->set_wol(ds, dp->index, w);
+
+ return ret;
+}
+
+static int dsa_user_set_eee(struct net_device *dev, struct ethtool_keee *e)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct dsa_switch *ds = dp->ds;
+ int ret;
+
+ /* Check whether the switch supports EEE */
+ if (!ds->ops->support_eee || !ds->ops->support_eee(ds, dp->index))
+ return -EOPNOTSUPP;
+
+ /* If the port is using phylink managed EEE, then an unimplemented
+ * set_mac_eee() is permissible.
+ */
+ if (!phylink_mac_implements_lpi(ds->phylink_mac_ops)) {
+ /* Port's PHY and MAC both need to be EEE capable */
+ if (!dev->phydev)
+ return -ENODEV;
+
+ if (!ds->ops->set_mac_eee)
+ return -EOPNOTSUPP;
+
+ ret = ds->ops->set_mac_eee(ds, dp->index, e);
+ if (ret)
+ return ret;
+ } else if (ds->ops->set_mac_eee) {
+ ret = ds->ops->set_mac_eee(ds, dp->index, e);
+ if (ret)
+ return ret;
+ }
+
+ return phylink_ethtool_set_eee(dp->pl, e);
+}
+
+static int dsa_user_get_eee(struct net_device *dev, struct ethtool_keee *e)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct dsa_switch *ds = dp->ds;
+
+ /* Check whether the switch supports EEE */
+ if (!ds->ops->support_eee || !ds->ops->support_eee(ds, dp->index))
+ return -EOPNOTSUPP;
+
+ /* Port's PHY and MAC both need to be EEE capable */
+ if (!dev->phydev)
+ return -ENODEV;
+
+ return phylink_ethtool_get_eee(dp->pl, e);
+}
+
+static int dsa_user_get_link_ksettings(struct net_device *dev,
+ struct ethtool_link_ksettings *cmd)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+
+ return phylink_ethtool_ksettings_get(dp->pl, cmd);
+}
+
+static int dsa_user_set_link_ksettings(struct net_device *dev,
+ const struct ethtool_link_ksettings *cmd)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+
+ return phylink_ethtool_ksettings_set(dp->pl, cmd);
+}
+
+static void dsa_user_get_pause_stats(struct net_device *dev,
+ struct ethtool_pause_stats *pause_stats)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct dsa_switch *ds = dp->ds;
+
+ if (ds->ops->get_pause_stats)
+ ds->ops->get_pause_stats(ds, dp->index, pause_stats);
+}
+
+static void dsa_user_get_pauseparam(struct net_device *dev,
+ struct ethtool_pauseparam *pause)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+
+ phylink_ethtool_get_pauseparam(dp->pl, pause);
+}
+
+static int dsa_user_set_pauseparam(struct net_device *dev,
+ struct ethtool_pauseparam *pause)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+
+ return phylink_ethtool_set_pauseparam(dp->pl, pause);
+}
+
+#ifdef CONFIG_NET_POLL_CONTROLLER
+static int dsa_user_netpoll_setup(struct net_device *dev)
+{
+ struct net_device *conduit = dsa_user_to_conduit(dev);
+ struct dsa_user_priv *p = netdev_priv(dev);
+ struct netpoll *netpoll;
+ int err = 0;
+
+ netpoll = kzalloc(sizeof(*netpoll), GFP_KERNEL);
+ if (!netpoll)
+ return -ENOMEM;
+
+ err = __netpoll_setup(netpoll, conduit);
+ if (err) {
+ kfree(netpoll);
+ goto out;
+ }
+
+ p->netpoll = netpoll;
+out:
+ return err;
+}
+
+static void dsa_user_netpoll_cleanup(struct net_device *dev)
+{
+ struct dsa_user_priv *p = netdev_priv(dev);
+ struct netpoll *netpoll = p->netpoll;
+
+ if (!netpoll)
+ return;
+
+ p->netpoll = NULL;
+
+ __netpoll_free(netpoll);
+}
+
+static void dsa_user_poll_controller(struct net_device *dev)
+{
+}
+#endif
+
+static struct dsa_mall_tc_entry *
+dsa_user_mall_tc_entry_find(struct net_device *dev, unsigned long cookie)
+{
+ struct dsa_user_priv *p = netdev_priv(dev);
+ struct dsa_mall_tc_entry *mall_tc_entry;
+
+ list_for_each_entry(mall_tc_entry, &p->mall_tc_list, list)
+ if (mall_tc_entry->cookie == cookie)
+ return mall_tc_entry;
+
+ return NULL;
+}
+
+static int
+dsa_user_add_cls_matchall_mirred(struct net_device *dev,
+ struct tc_cls_matchall_offload *cls,
+ bool ingress, bool ingress_target)
+{
+ struct netlink_ext_ack *extack = cls->common.extack;
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct dsa_user_priv *p = netdev_priv(dev);
+ struct dsa_mall_mirror_tc_entry *mirror;
+ struct dsa_mall_tc_entry *mall_tc_entry;
+ struct dsa_switch *ds = dp->ds;
+ struct flow_action_entry *act;
+ struct dsa_port *to_dp;
+ int err;
+
+ if (cls->common.protocol != htons(ETH_P_ALL)) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Can only offload \"protocol all\" matchall filter");
+ return -EOPNOTSUPP;
+ }
+
+ if (!ds->ops->port_mirror_add) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Switch does not support mirroring operation");
+ return -EOPNOTSUPP;
+ }
+
+ if (!flow_action_basic_hw_stats_check(&cls->rule->action, extack))
+ return -EOPNOTSUPP;
+
+ act = &cls->rule->action.entries[0];
+
+ if (!act->dev)
+ return -EINVAL;
+
+ if (dsa_user_dev_check(act->dev)) {
+ if (ingress_target) {
+ /* We can only fulfill this using software assist */
+ if (cls->common.skip_sw) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Can only mirred to ingress of DSA user port if filter also runs in software");
+ return -EOPNOTSUPP;
+ }
+ to_dp = dp->cpu_dp;
+ } else {
+ to_dp = dsa_user_to_port(act->dev);
+ }
+ } else {
+ /* Handle mirroring to foreign target ports as a mirror towards
+ * the CPU. The software tc rule will take the packets from
+ * there.
+ */
+ if (cls->common.skip_sw) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Can only mirred to CPU if filter also runs in software");
+ return -EOPNOTSUPP;
+ }
+ to_dp = dp->cpu_dp;
+ }
+
+ if (dp->ds != to_dp->ds) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Cross-chip mirroring not implemented");
+ return -EOPNOTSUPP;
+ }
+
+ mall_tc_entry = kzalloc(sizeof(*mall_tc_entry), GFP_KERNEL);
+ if (!mall_tc_entry)
+ return -ENOMEM;
+
+ mall_tc_entry->cookie = cls->cookie;
+ mall_tc_entry->type = DSA_PORT_MALL_MIRROR;
+ mirror = &mall_tc_entry->mirror;
+ mirror->to_local_port = to_dp->index;
+ mirror->ingress = ingress;
+
+ err = ds->ops->port_mirror_add(ds, dp->index, mirror, ingress, extack);
+ if (err) {
+ kfree(mall_tc_entry);
+ return err;
+ }
+
+ list_add_tail(&mall_tc_entry->list, &p->mall_tc_list);
+
+ return err;
+}
+
+static int
+dsa_user_add_cls_matchall_police(struct net_device *dev,
+ struct tc_cls_matchall_offload *cls,
+ bool ingress)
+{
+ struct netlink_ext_ack *extack = cls->common.extack;
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct dsa_user_priv *p = netdev_priv(dev);
+ struct dsa_mall_policer_tc_entry *policer;
+ struct dsa_mall_tc_entry *mall_tc_entry;
+ struct dsa_switch *ds = dp->ds;
+ struct flow_action_entry *act;
+ int err;
+
+ if (!ds->ops->port_policer_add) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Policing offload not implemented");
+ return -EOPNOTSUPP;
+ }
+
+ if (!ingress) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Only supported on ingress qdisc");
+ return -EOPNOTSUPP;
+ }
+
+ if (!flow_action_basic_hw_stats_check(&cls->rule->action, extack))
+ return -EOPNOTSUPP;
+
+ list_for_each_entry(mall_tc_entry, &p->mall_tc_list, list) {
+ if (mall_tc_entry->type == DSA_PORT_MALL_POLICER) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Only one port policer allowed");
+ return -EEXIST;
+ }
+ }
+
+ act = &cls->rule->action.entries[0];
+
+ mall_tc_entry = kzalloc(sizeof(*mall_tc_entry), GFP_KERNEL);
+ if (!mall_tc_entry)
+ return -ENOMEM;
+
+ mall_tc_entry->cookie = cls->cookie;
+ mall_tc_entry->type = DSA_PORT_MALL_POLICER;
+ policer = &mall_tc_entry->policer;
+ policer->rate_bytes_per_sec = act->police.rate_bytes_ps;
+ policer->burst = act->police.burst;
+
+ err = ds->ops->port_policer_add(ds, dp->index, policer);
+ if (err) {
+ kfree(mall_tc_entry);
+ return err;
+ }
+
+ list_add_tail(&mall_tc_entry->list, &p->mall_tc_list);
+
+ return err;
+}
+
+static int dsa_user_add_cls_matchall(struct net_device *dev,
+ struct tc_cls_matchall_offload *cls,
+ bool ingress)
+{
+ const struct flow_action *action = &cls->rule->action;
+ struct netlink_ext_ack *extack = cls->common.extack;
+
+ if (!flow_offload_has_one_action(action)) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Cannot offload matchall filter with more than one action");
+ return -EOPNOTSUPP;
+ }
+
+ switch (action->entries[0].id) {
+ case FLOW_ACTION_MIRRED:
+ return dsa_user_add_cls_matchall_mirred(dev, cls, ingress,
+ false);
+ case FLOW_ACTION_MIRRED_INGRESS:
+ return dsa_user_add_cls_matchall_mirred(dev, cls, ingress,
+ true);
+ case FLOW_ACTION_POLICE:
+ return dsa_user_add_cls_matchall_police(dev, cls, ingress);
+ default:
+ NL_SET_ERR_MSG_MOD(extack, "Unknown action");
+ break;
+ }
+
+ return -EOPNOTSUPP;
+}
+
+static void dsa_user_del_cls_matchall(struct net_device *dev,
+ struct tc_cls_matchall_offload *cls)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct dsa_mall_tc_entry *mall_tc_entry;
+ struct dsa_switch *ds = dp->ds;
+
+ mall_tc_entry = dsa_user_mall_tc_entry_find(dev, cls->cookie);
+ if (!mall_tc_entry)
+ return;
+
+ list_del(&mall_tc_entry->list);
+
+ switch (mall_tc_entry->type) {
+ case DSA_PORT_MALL_MIRROR:
+ if (ds->ops->port_mirror_del)
+ ds->ops->port_mirror_del(ds, dp->index,
+ &mall_tc_entry->mirror);
+ break;
+ case DSA_PORT_MALL_POLICER:
+ if (ds->ops->port_policer_del)
+ ds->ops->port_policer_del(ds, dp->index);
+ break;
+ default:
+ WARN_ON(1);
+ }
+
+ kfree(mall_tc_entry);
+}
+
+static int dsa_user_setup_tc_cls_matchall(struct net_device *dev,
+ struct tc_cls_matchall_offload *cls,
+ bool ingress)
+{
+ if (cls->common.chain_index)
+ return -EOPNOTSUPP;
+
+ switch (cls->command) {
+ case TC_CLSMATCHALL_REPLACE:
+ return dsa_user_add_cls_matchall(dev, cls, ingress);
+ case TC_CLSMATCHALL_DESTROY:
+ dsa_user_del_cls_matchall(dev, cls);
+ return 0;
+ default:
+ return -EOPNOTSUPP;
+ }
+}
+
+static int dsa_user_add_cls_flower(struct net_device *dev,
+ struct flow_cls_offload *cls,
+ bool ingress)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct dsa_switch *ds = dp->ds;
+ int port = dp->index;
+
+ if (!ds->ops->cls_flower_add)
+ return -EOPNOTSUPP;
+
+ return ds->ops->cls_flower_add(ds, port, cls, ingress);
+}
+
+static int dsa_user_del_cls_flower(struct net_device *dev,
+ struct flow_cls_offload *cls,
+ bool ingress)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct dsa_switch *ds = dp->ds;
+ int port = dp->index;
+
+ if (!ds->ops->cls_flower_del)
+ return -EOPNOTSUPP;
+
+ return ds->ops->cls_flower_del(ds, port, cls, ingress);
+}
+
+static int dsa_user_stats_cls_flower(struct net_device *dev,
+ struct flow_cls_offload *cls,
+ bool ingress)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct dsa_switch *ds = dp->ds;
+ int port = dp->index;
+
+ if (!ds->ops->cls_flower_stats)
+ return -EOPNOTSUPP;
+
+ return ds->ops->cls_flower_stats(ds, port, cls, ingress);
+}
+
+static int dsa_user_setup_tc_cls_flower(struct net_device *dev,
+ struct flow_cls_offload *cls,
+ bool ingress)
+{
+ switch (cls->command) {
+ case FLOW_CLS_REPLACE:
+ return dsa_user_add_cls_flower(dev, cls, ingress);
+ case FLOW_CLS_DESTROY:
+ return dsa_user_del_cls_flower(dev, cls, ingress);
+ case FLOW_CLS_STATS:
+ return dsa_user_stats_cls_flower(dev, cls, ingress);
+ default:
+ return -EOPNOTSUPP;
+ }
+}
+
+static int dsa_user_setup_tc_block_cb(enum tc_setup_type type, void *type_data,
+ void *cb_priv, bool ingress)
+{
+ struct net_device *dev = cb_priv;
+
+ if (!tc_can_offload(dev))
+ return -EOPNOTSUPP;
+
+ switch (type) {
+ case TC_SETUP_CLSMATCHALL:
+ return dsa_user_setup_tc_cls_matchall(dev, type_data, ingress);
+ case TC_SETUP_CLSFLOWER:
+ return dsa_user_setup_tc_cls_flower(dev, type_data, ingress);
+ default:
+ return -EOPNOTSUPP;
+ }
+}
+
+static int dsa_user_setup_tc_block_cb_ig(enum tc_setup_type type,
+ void *type_data, void *cb_priv)
+{
+ return dsa_user_setup_tc_block_cb(type, type_data, cb_priv, true);
+}
+
+static int dsa_user_setup_tc_block_cb_eg(enum tc_setup_type type,
+ void *type_data, void *cb_priv)
+{
+ return dsa_user_setup_tc_block_cb(type, type_data, cb_priv, false);
+}
+
+static LIST_HEAD(dsa_user_block_cb_list);
+
+static int dsa_user_setup_tc_block(struct net_device *dev,
+ struct flow_block_offload *f)
+{
+ struct flow_block_cb *block_cb;
+ flow_setup_cb_t *cb;
+
+ if (f->binder_type == FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS)
+ cb = dsa_user_setup_tc_block_cb_ig;
+ else if (f->binder_type == FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS)
+ cb = dsa_user_setup_tc_block_cb_eg;
+ else
+ return -EOPNOTSUPP;
+
+ f->driver_block_list = &dsa_user_block_cb_list;
+
+ switch (f->command) {
+ case FLOW_BLOCK_BIND:
+ if (flow_block_cb_is_busy(cb, dev, &dsa_user_block_cb_list))
+ return -EBUSY;
+
+ block_cb = flow_block_cb_alloc(cb, dev, dev, NULL);
+ if (IS_ERR(block_cb))
+ return PTR_ERR(block_cb);
+
+ flow_block_cb_add(block_cb, f);
+ list_add_tail(&block_cb->driver_list, &dsa_user_block_cb_list);
+ return 0;
+ case FLOW_BLOCK_UNBIND:
+ block_cb = flow_block_cb_lookup(f->block, cb, dev);
+ if (!block_cb)
+ return -ENOENT;
+
+ flow_block_cb_remove(block_cb, f);
+ list_del(&block_cb->driver_list);
+ return 0;
+ default:
+ return -EOPNOTSUPP;
+ }
+}
+
+static int dsa_user_setup_ft_block(struct dsa_switch *ds, int port,
+ void *type_data)
+{
+ struct net_device *conduit = dsa_port_to_conduit(dsa_to_port(ds, port));
+
+ if (!conduit->netdev_ops->ndo_setup_tc)
+ return -EOPNOTSUPP;
+
+ return conduit->netdev_ops->ndo_setup_tc(conduit, TC_SETUP_FT, type_data);
+}
+
+static int dsa_user_setup_tc(struct net_device *dev, enum tc_setup_type type,
+ void *type_data)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct dsa_switch *ds = dp->ds;
+
+ switch (type) {
+ case TC_SETUP_BLOCK:
+ return dsa_user_setup_tc_block(dev, type_data);
+ case TC_SETUP_FT:
+ return dsa_user_setup_ft_block(ds, dp->index, type_data);
+ default:
+ break;
+ }
+
+ if (!ds->ops->port_setup_tc)
+ return -EOPNOTSUPP;
+
+ return ds->ops->port_setup_tc(ds, dp->index, type, type_data);
+}
+
+static int dsa_user_get_rxnfc(struct net_device *dev,
+ struct ethtool_rxnfc *nfc, u32 *rule_locs)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct dsa_switch *ds = dp->ds;
+
+ if (!ds->ops->get_rxnfc)
+ return -EOPNOTSUPP;
+
+ return ds->ops->get_rxnfc(ds, dp->index, nfc, rule_locs);
+}
+
+static int dsa_user_set_rxnfc(struct net_device *dev,
+ struct ethtool_rxnfc *nfc)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct dsa_switch *ds = dp->ds;
+
+ if (!ds->ops->set_rxnfc)
+ return -EOPNOTSUPP;
+
+ return ds->ops->set_rxnfc(ds, dp->index, nfc);
+}
+
+static int dsa_user_get_ts_info(struct net_device *dev,
+ struct kernel_ethtool_ts_info *ts)
+{
+ struct dsa_user_priv *p = netdev_priv(dev);
+ struct dsa_switch *ds = p->dp->ds;
+
+ if (!ds->ops->get_ts_info)
+ return -EOPNOTSUPP;
+
+ return ds->ops->get_ts_info(ds, p->dp->index, ts);
+}
+
+static int dsa_user_vlan_rx_add_vid(struct net_device *dev, __be16 proto,
+ u16 vid)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct switchdev_obj_port_vlan vlan = {
+ .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN,
+ .vid = vid,
+ /* This API only allows programming tagged, non-PVID VIDs */
+ .flags = 0,
+ };
+ struct netlink_ext_ack extack = {0};
+ struct dsa_switch *ds = dp->ds;
+ struct netdev_hw_addr *ha;
+ struct dsa_vlan *v;
+ int ret;
+
+ /* User port... */
+ ret = dsa_port_vlan_add(dp, &vlan, &extack);
+ if (ret) {
+ if (extack._msg)
+ netdev_err(dev, "%s\n", extack._msg);
+ return ret;
+ }
+
+ /* And CPU port... */
+ ret = dsa_port_host_vlan_add(dp, &vlan, &extack);
+ if (ret) {
+ if (extack._msg)
+ netdev_err(dev, "CPU port %d: %s\n", dp->cpu_dp->index,
+ extack._msg);
+ return ret;
+ }
+
+ if (!dsa_switch_supports_uc_filtering(ds) &&
+ !dsa_switch_supports_mc_filtering(ds))
+ return 0;
+
+ v = kzalloc(sizeof(*v), GFP_KERNEL);
+ if (!v) {
+ ret = -ENOMEM;
+ goto rollback;
+ }
+
+ netif_addr_lock_bh(dev);
+
+ v->vid = vid;
+ list_add_tail(&v->list, &dp->user_vlans);
+
+ if (dsa_switch_supports_mc_filtering(ds)) {
+ netdev_for_each_synced_mc_addr(ha, dev) {
+ dsa_user_schedule_standalone_work(dev, DSA_MC_ADD,
+ ha->addr, vid);
+ }
+ }
+
+ if (dsa_switch_supports_uc_filtering(ds)) {
+ netdev_for_each_synced_uc_addr(ha, dev) {
+ dsa_user_schedule_standalone_work(dev, DSA_UC_ADD,
+ ha->addr, vid);
+ }
+ }
+
+ netif_addr_unlock_bh(dev);
+
+ dsa_flush_workqueue();
+
+ return 0;
+
+rollback:
+ dsa_port_host_vlan_del(dp, &vlan);
+ dsa_port_vlan_del(dp, &vlan);
+
+ return ret;
+}
+
+static int dsa_user_vlan_rx_kill_vid(struct net_device *dev, __be16 proto,
+ u16 vid)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct switchdev_obj_port_vlan vlan = {
+ .vid = vid,
+ /* This API only allows programming tagged, non-PVID VIDs */
+ .flags = 0,
+ };
+ struct dsa_switch *ds = dp->ds;
+ struct netdev_hw_addr *ha;
+ struct dsa_vlan *v;
+ int err;
+
+ err = dsa_port_vlan_del(dp, &vlan);
+ if (err)
+ return err;
+
+ err = dsa_port_host_vlan_del(dp, &vlan);
+ if (err)
+ return err;
+
+ if (!dsa_switch_supports_uc_filtering(ds) &&
+ !dsa_switch_supports_mc_filtering(ds))
+ return 0;
+
+ netif_addr_lock_bh(dev);
+
+ v = dsa_vlan_find(&dp->user_vlans, &vlan);
+ if (!v) {
+ netif_addr_unlock_bh(dev);
+ return -ENOENT;
+ }
+
+ list_del(&v->list);
+ kfree(v);
+
+ if (dsa_switch_supports_mc_filtering(ds)) {
+ netdev_for_each_synced_mc_addr(ha, dev) {
+ dsa_user_schedule_standalone_work(dev, DSA_MC_DEL,
+ ha->addr, vid);
+ }
+ }
+
+ if (dsa_switch_supports_uc_filtering(ds)) {
+ netdev_for_each_synced_uc_addr(ha, dev) {
+ dsa_user_schedule_standalone_work(dev, DSA_UC_DEL,
+ ha->addr, vid);
+ }
+ }
+
+ netif_addr_unlock_bh(dev);
+
+ dsa_flush_workqueue();
+
+ return 0;
+}
+
+static int dsa_user_restore_vlan(struct net_device *vdev, int vid, void *arg)
+{
+ __be16 proto = vdev ? vlan_dev_vlan_proto(vdev) : htons(ETH_P_8021Q);
+
+ return dsa_user_vlan_rx_add_vid(arg, proto, vid);
+}
+
+static int dsa_user_clear_vlan(struct net_device *vdev, int vid, void *arg)
+{
+ __be16 proto = vdev ? vlan_dev_vlan_proto(vdev) : htons(ETH_P_8021Q);
+
+ return dsa_user_vlan_rx_kill_vid(arg, proto, vid);
+}
+
+/* Keep the VLAN RX filtering list in sync with the hardware only if VLAN
+ * filtering is enabled. The baseline is that only ports that offload a
+ * VLAN-aware bridge are VLAN-aware, and standalone ports are VLAN-unaware,
+ * but there are exceptions for quirky hardware.
+ *
+ * If ds->vlan_filtering_is_global = true, then standalone ports which share
+ * the same switch with other ports that offload a VLAN-aware bridge are also
+ * inevitably VLAN-aware.
+ *
+ * To summarize, a DSA switch port offloads:
+ *
+ * - If standalone (this includes software bridge, software LAG):
+ * - if ds->needs_standalone_vlan_filtering = true, OR if
+ * (ds->vlan_filtering_is_global = true AND there are bridges spanning
+ * this switch chip which have vlan_filtering=1)
+ * - the 8021q upper VLANs
+ * - else (standalone VLAN filtering is not needed, VLAN filtering is not
+ * global, or it is, but no port is under a VLAN-aware bridge):
+ * - no VLAN (any 8021q upper is a software VLAN)
+ *
+ * - If under a vlan_filtering=0 bridge which it offload:
+ * - if ds->configure_vlan_while_not_filtering = true (default):
+ * - the bridge VLANs. These VLANs are committed to hardware but inactive.
+ * - else (deprecated):
+ * - no VLAN. The bridge VLANs are not restored when VLAN awareness is
+ * enabled, so this behavior is broken and discouraged.
+ *
+ * - If under a vlan_filtering=1 bridge which it offload:
+ * - the bridge VLANs
+ * - the 8021q upper VLANs
+ */
+int dsa_user_manage_vlan_filtering(struct net_device *user,
+ bool vlan_filtering)
+{
+ int err;
+
+ if (vlan_filtering) {
+ user->features |= NETIF_F_HW_VLAN_CTAG_FILTER;
+
+ err = vlan_for_each(user, dsa_user_restore_vlan, user);
+ if (err) {
+ vlan_for_each(user, dsa_user_clear_vlan, user);
+ user->features &= ~NETIF_F_HW_VLAN_CTAG_FILTER;
+ return err;
+ }
+ } else {
+ err = vlan_for_each(user, dsa_user_clear_vlan, user);
+ if (err)
+ return err;
+
+ user->features &= ~NETIF_F_HW_VLAN_CTAG_FILTER;
+ }
+
+ return 0;
+}
+
+struct dsa_hw_port {
+ struct list_head list;
+ struct net_device *dev;
+ int old_mtu;
+};
+
+static int dsa_hw_port_list_set_mtu(struct list_head *hw_port_list, int mtu)
+{
+ const struct dsa_hw_port *p;
+ int err;
+
+ list_for_each_entry(p, hw_port_list, list) {
+ if (p->dev->mtu == mtu)
+ continue;
+
+ err = dev_set_mtu(p->dev, mtu);
+ if (err)
+ goto rollback;
+ }
+
+ return 0;
+
+rollback:
+ list_for_each_entry_continue_reverse(p, hw_port_list, list) {
+ if (p->dev->mtu == p->old_mtu)
+ continue;
+
+ if (dev_set_mtu(p->dev, p->old_mtu))
+ netdev_err(p->dev, "Failed to restore MTU\n");
+ }
+
+ return err;
+}
+
+static void dsa_hw_port_list_free(struct list_head *hw_port_list)
+{
+ struct dsa_hw_port *p, *n;
+
+ list_for_each_entry_safe(p, n, hw_port_list, list)
+ kfree(p);
+}
+
+/* Make the hardware datapath to/from @dev limited to a common MTU */
+static void dsa_bridge_mtu_normalization(struct dsa_port *dp)
+{
+ struct list_head hw_port_list;
+ struct dsa_switch_tree *dst;
+ int min_mtu = ETH_MAX_MTU;
+ struct dsa_port *other_dp;
+ int err;
+
+ if (!dp->ds->mtu_enforcement_ingress)
+ return;
+
+ if (!dp->bridge)
+ return;
+
+ INIT_LIST_HEAD(&hw_port_list);
+
+ /* Populate the list of ports that are part of the same bridge
+ * as the newly added/modified port
+ */
+ list_for_each_entry(dst, &dsa_tree_list, list) {
+ list_for_each_entry(other_dp, &dst->ports, list) {
+ struct dsa_hw_port *hw_port;
+ struct net_device *user;
+
+ if (other_dp->type != DSA_PORT_TYPE_USER)
+ continue;
+
+ if (!dsa_port_bridge_same(dp, other_dp))
+ continue;
+
+ if (!other_dp->ds->mtu_enforcement_ingress)
+ continue;
+
+ user = other_dp->user;
+
+ if (min_mtu > user->mtu)
+ min_mtu = user->mtu;
+
+ hw_port = kzalloc(sizeof(*hw_port), GFP_KERNEL);
+ if (!hw_port)
+ goto out;
+
+ hw_port->dev = user;
+ hw_port->old_mtu = user->mtu;
+
+ list_add(&hw_port->list, &hw_port_list);
+ }
+ }
+
+ /* Attempt to configure the entire hardware bridge to the newly added
+ * interface's MTU first, regardless of whether the intention of the
+ * user was to raise or lower it.
+ */
+ err = dsa_hw_port_list_set_mtu(&hw_port_list, dp->user->mtu);
+ if (!err)
+ goto out;
+
+ /* Clearly that didn't work out so well, so just set the minimum MTU on
+ * all hardware bridge ports now. If this fails too, then all ports will
+ * still have their old MTU rolled back anyway.
+ */
+ dsa_hw_port_list_set_mtu(&hw_port_list, min_mtu);
+
+out:
+ dsa_hw_port_list_free(&hw_port_list);
+}
+
+int dsa_user_change_mtu(struct net_device *dev, int new_mtu)
+{
+ struct net_device *conduit = dsa_user_to_conduit(dev);
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct dsa_port *cpu_dp = dp->cpu_dp;
+ struct dsa_switch *ds = dp->ds;
+ struct dsa_port *other_dp;
+ int largest_mtu = 0;
+ int new_conduit_mtu;
+ int old_conduit_mtu;
+ int mtu_limit;
+ int overhead;
+ int cpu_mtu;
+ int err;
+
+ if (!ds->ops->port_change_mtu)
+ return -EOPNOTSUPP;
+
+ dsa_tree_for_each_user_port(other_dp, ds->dst) {
+ int user_mtu;
+
+ /* During probe, this function will be called for each user
+ * device, while not all of them have been allocated. That's
+ * ok, it doesn't change what the maximum is, so ignore it.
+ */
+ if (!other_dp->user)
+ continue;
+
+ /* Pretend that we already applied the setting, which we
+ * actually haven't (still haven't done all integrity checks)
+ */
+ if (dp == other_dp)
+ user_mtu = new_mtu;
+ else
+ user_mtu = other_dp->user->mtu;
+
+ if (largest_mtu < user_mtu)
+ largest_mtu = user_mtu;
+ }
+
+ overhead = dsa_tag_protocol_overhead(cpu_dp->tag_ops);
+ mtu_limit = min_t(int, conduit->max_mtu, dev->max_mtu + overhead);
+ old_conduit_mtu = conduit->mtu;
+ new_conduit_mtu = largest_mtu + overhead;
+ if (new_conduit_mtu > mtu_limit)
+ return -ERANGE;
+
+ /* If the conduit MTU isn't over limit, there's no need to check the CPU
+ * MTU, since that surely isn't either.
+ */
+ cpu_mtu = largest_mtu;
+
+ /* Start applying stuff */
+ if (new_conduit_mtu != old_conduit_mtu) {
+ err = dev_set_mtu(conduit, new_conduit_mtu);
+ if (err < 0)
+ goto out_conduit_failed;
+
+ /* We only need to propagate the MTU of the CPU port to
+ * upstream switches, so emit a notifier which updates them.
+ */
+ err = dsa_port_mtu_change(cpu_dp, cpu_mtu);
+ if (err)
+ goto out_cpu_failed;
+ }
+
+ err = ds->ops->port_change_mtu(ds, dp->index, new_mtu);
+ if (err)
+ goto out_port_failed;
+
+ WRITE_ONCE(dev->mtu, new_mtu);
+
+ dsa_bridge_mtu_normalization(dp);
+
+ return 0;
+
+out_port_failed:
+ if (new_conduit_mtu != old_conduit_mtu)
+ dsa_port_mtu_change(cpu_dp, old_conduit_mtu - overhead);
+out_cpu_failed:
+ if (new_conduit_mtu != old_conduit_mtu)
+ dev_set_mtu(conduit, old_conduit_mtu);
+out_conduit_failed:
+ return err;
+}
+
+static int __maybe_unused
+dsa_user_dcbnl_set_apptrust(struct net_device *dev, u8 *sel, int nsel)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct dsa_switch *ds = dp->ds;
+ int port = dp->index;
+
+ if (!ds->ops->port_set_apptrust)
+ return -EOPNOTSUPP;
+
+ return ds->ops->port_set_apptrust(ds, port, sel, nsel);
+}
+
+static int __maybe_unused
+dsa_user_dcbnl_get_apptrust(struct net_device *dev, u8 *sel, int *nsel)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct dsa_switch *ds = dp->ds;
+ int port = dp->index;
+
+ if (!ds->ops->port_get_apptrust)
+ return -EOPNOTSUPP;
+
+ return ds->ops->port_get_apptrust(ds, port, sel, nsel);
+}
+
+static int __maybe_unused
+dsa_user_dcbnl_set_default_prio(struct net_device *dev, struct dcb_app *app)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct dsa_switch *ds = dp->ds;
+ unsigned long mask, new_prio;
+ int err, port = dp->index;
+
+ if (!ds->ops->port_set_default_prio)
+ return -EOPNOTSUPP;
+
+ err = dcb_ieee_setapp(dev, app);
+ if (err)
+ return err;
+
+ mask = dcb_ieee_getapp_mask(dev, app);
+ new_prio = __fls(mask);
+
+ err = ds->ops->port_set_default_prio(ds, port, new_prio);
+ if (err) {
+ dcb_ieee_delapp(dev, app);
+ return err;
+ }
+
+ return 0;
+}
+
+/* Update the DSCP prio entries on all user ports of the switch in case
+ * the switch supports global DSCP prio instead of per port DSCP prios.
+ */
+static int dsa_user_dcbnl_ieee_global_dscp_setdel(struct net_device *dev,
+ struct dcb_app *app, bool del)
+{
+ int (*setdel)(struct net_device *dev, struct dcb_app *app);
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct dsa_switch *ds = dp->ds;
+ struct dsa_port *other_dp;
+ int err, restore_err;
+
+ if (del)
+ setdel = dcb_ieee_delapp;
+ else
+ setdel = dcb_ieee_setapp;
+
+ dsa_switch_for_each_user_port(other_dp, ds) {
+ struct net_device *user = other_dp->user;
+
+ if (!user || user == dev)
+ continue;
+
+ err = setdel(user, app);
+ if (err)
+ goto err_try_to_restore;
+ }
+
+ return 0;
+
+err_try_to_restore:
+
+ /* Revert logic to restore previous state of app entries */
+ if (!del)
+ setdel = dcb_ieee_delapp;
+ else
+ setdel = dcb_ieee_setapp;
+
+ dsa_switch_for_each_user_port_continue_reverse(other_dp, ds) {
+ struct net_device *user = other_dp->user;
+
+ if (!user || user == dev)
+ continue;
+
+ restore_err = setdel(user, app);
+ if (restore_err)
+ netdev_err(user, "Failed to restore DSCP prio entry configuration\n");
+ }
+
+ return err;
+}
+
+static int __maybe_unused
+dsa_user_dcbnl_add_dscp_prio(struct net_device *dev, struct dcb_app *app)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct dsa_switch *ds = dp->ds;
+ unsigned long mask, new_prio;
+ int err, port = dp->index;
+ u8 dscp = app->protocol;
+
+ if (!ds->ops->port_add_dscp_prio)
+ return -EOPNOTSUPP;
+
+ if (dscp >= 64) {
+ netdev_err(dev, "DSCP APP entry with protocol value %u is invalid\n",
+ dscp);
+ return -EINVAL;
+ }
+
+ err = dcb_ieee_setapp(dev, app);
+ if (err)
+ return err;
+
+ mask = dcb_ieee_getapp_mask(dev, app);
+ new_prio = __fls(mask);
+
+ err = ds->ops->port_add_dscp_prio(ds, port, dscp, new_prio);
+ if (err) {
+ dcb_ieee_delapp(dev, app);
+ return err;
+ }
+
+ if (!ds->dscp_prio_mapping_is_global)
+ return 0;
+
+ err = dsa_user_dcbnl_ieee_global_dscp_setdel(dev, app, false);
+ if (err) {
+ if (ds->ops->port_del_dscp_prio)
+ ds->ops->port_del_dscp_prio(ds, port, dscp, new_prio);
+ dcb_ieee_delapp(dev, app);
+ return err;
+ }
+
+ return 0;
+}
+
+static int __maybe_unused dsa_user_dcbnl_ieee_setapp(struct net_device *dev,
+ struct dcb_app *app)
+{
+ switch (app->selector) {
+ case IEEE_8021QAZ_APP_SEL_ETHERTYPE:
+ switch (app->protocol) {
+ case 0:
+ return dsa_user_dcbnl_set_default_prio(dev, app);
+ default:
+ return -EOPNOTSUPP;
+ }
+ break;
+ case IEEE_8021QAZ_APP_SEL_DSCP:
+ return dsa_user_dcbnl_add_dscp_prio(dev, app);
+ default:
+ return -EOPNOTSUPP;
+ }
+}
+
+static int __maybe_unused
+dsa_user_dcbnl_del_default_prio(struct net_device *dev, struct dcb_app *app)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct dsa_switch *ds = dp->ds;
+ unsigned long mask, new_prio;
+ int err, port = dp->index;
+
+ if (!ds->ops->port_set_default_prio)
+ return -EOPNOTSUPP;
+
+ err = dcb_ieee_delapp(dev, app);
+ if (err)
+ return err;
+
+ mask = dcb_ieee_getapp_mask(dev, app);
+ new_prio = mask ? __fls(mask) : 0;
+
+ err = ds->ops->port_set_default_prio(ds, port, new_prio);
+ if (err) {
+ dcb_ieee_setapp(dev, app);
+ return err;
+ }
+
+ return 0;
+}
+
+static int __maybe_unused
+dsa_user_dcbnl_del_dscp_prio(struct net_device *dev, struct dcb_app *app)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct dsa_switch *ds = dp->ds;
+ int err, port = dp->index;
+ u8 dscp = app->protocol;
+
+ if (!ds->ops->port_del_dscp_prio)
+ return -EOPNOTSUPP;
+
+ err = dcb_ieee_delapp(dev, app);
+ if (err)
+ return err;
+
+ err = ds->ops->port_del_dscp_prio(ds, port, dscp, app->priority);
+ if (err) {
+ dcb_ieee_setapp(dev, app);
+ return err;
+ }
+
+ if (!ds->dscp_prio_mapping_is_global)
+ return 0;
+
+ err = dsa_user_dcbnl_ieee_global_dscp_setdel(dev, app, true);
+ if (err) {
+ if (ds->ops->port_add_dscp_prio)
+ ds->ops->port_add_dscp_prio(ds, port, dscp,
+ app->priority);
+ dcb_ieee_setapp(dev, app);
+ return err;
+ }
+
+ return 0;
+}
+
+static int __maybe_unused dsa_user_dcbnl_ieee_delapp(struct net_device *dev,
+ struct dcb_app *app)
+{
+ switch (app->selector) {
+ case IEEE_8021QAZ_APP_SEL_ETHERTYPE:
+ switch (app->protocol) {
+ case 0:
+ return dsa_user_dcbnl_del_default_prio(dev, app);
+ default:
+ return -EOPNOTSUPP;
+ }
+ break;
+ case IEEE_8021QAZ_APP_SEL_DSCP:
+ return dsa_user_dcbnl_del_dscp_prio(dev, app);
+ default:
+ return -EOPNOTSUPP;
+ }
+}
+
+/* Pre-populate the DCB application priority table with the priorities
+ * configured during switch setup, which we read from hardware here.
+ */
+static int dsa_user_dcbnl_init(struct net_device *dev)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct dsa_switch *ds = dp->ds;
+ int port = dp->index;
+ int err;
+
+ if (ds->ops->port_get_default_prio) {
+ int prio = ds->ops->port_get_default_prio(ds, port);
+ struct dcb_app app = {
+ .selector = IEEE_8021QAZ_APP_SEL_ETHERTYPE,
+ .protocol = 0,
+ .priority = prio,
+ };
+
+ if (prio < 0)
+ return prio;
+
+ err = dcb_ieee_setapp(dev, &app);
+ if (err)
+ return err;
+ }
+
+ if (ds->ops->port_get_dscp_prio) {
+ int protocol;
+
+ for (protocol = 0; protocol < 64; protocol++) {
+ struct dcb_app app = {
+ .selector = IEEE_8021QAZ_APP_SEL_DSCP,
+ .protocol = protocol,
+ };
+ int prio;
+
+ prio = ds->ops->port_get_dscp_prio(ds, port, protocol);
+ if (prio == -EOPNOTSUPP)
+ continue;
+ if (prio < 0)
+ return prio;
+
+ app.priority = prio;
+
+ err = dcb_ieee_setapp(dev, &app);
+ if (err)
+ return err;
+ }
+ }
+
+ return 0;
+}
+
+static const struct ethtool_ops dsa_user_ethtool_ops = {
+ .get_drvinfo = dsa_user_get_drvinfo,
+ .get_regs_len = dsa_user_get_regs_len,
+ .get_regs = dsa_user_get_regs,
+ .nway_reset = dsa_user_nway_reset,
+ .get_link = ethtool_op_get_link,
+ .get_eeprom_len = dsa_user_get_eeprom_len,
+ .get_eeprom = dsa_user_get_eeprom,
+ .set_eeprom = dsa_user_set_eeprom,
+ .get_strings = dsa_user_get_strings,
+ .get_ethtool_stats = dsa_user_get_ethtool_stats,
+ .get_sset_count = dsa_user_get_sset_count,
+ .get_eth_phy_stats = dsa_user_get_eth_phy_stats,
+ .get_eth_mac_stats = dsa_user_get_eth_mac_stats,
+ .get_eth_ctrl_stats = dsa_user_get_eth_ctrl_stats,
+ .get_rmon_stats = dsa_user_get_rmon_stats,
+ .get_ts_stats = dsa_user_get_ts_stats,
+ .set_wol = dsa_user_set_wol,
+ .get_wol = dsa_user_get_wol,
+ .set_eee = dsa_user_set_eee,
+ .get_eee = dsa_user_get_eee,
+ .get_link_ksettings = dsa_user_get_link_ksettings,
+ .set_link_ksettings = dsa_user_set_link_ksettings,
+ .get_pause_stats = dsa_user_get_pause_stats,
+ .get_pauseparam = dsa_user_get_pauseparam,
+ .set_pauseparam = dsa_user_set_pauseparam,
+ .get_rxnfc = dsa_user_get_rxnfc,
+ .set_rxnfc = dsa_user_set_rxnfc,
+ .get_ts_info = dsa_user_get_ts_info,
+ .self_test = dsa_user_net_selftest,
+ .get_mm = dsa_user_get_mm,
+ .set_mm = dsa_user_set_mm,
+ .get_mm_stats = dsa_user_get_mm_stats,
+};
+
+static const struct dcbnl_rtnl_ops __maybe_unused dsa_user_dcbnl_ops = {
+ .ieee_setapp = dsa_user_dcbnl_ieee_setapp,
+ .ieee_delapp = dsa_user_dcbnl_ieee_delapp,
+ .dcbnl_setapptrust = dsa_user_dcbnl_set_apptrust,
+ .dcbnl_getapptrust = dsa_user_dcbnl_get_apptrust,
+};
+
+static void dsa_user_get_stats64(struct net_device *dev,
+ struct rtnl_link_stats64 *s)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct dsa_switch *ds = dp->ds;
+
+ if (ds->ops->get_stats64)
+ ds->ops->get_stats64(ds, dp->index, s);
+ else
+ dev_get_tstats64(dev, s);
+}
+
+static int dsa_user_fill_forward_path(struct net_device_path_ctx *ctx,
+ struct net_device_path *path)
+{
+ struct dsa_port *dp = dsa_user_to_port(ctx->dev);
+ struct net_device *conduit = dsa_port_to_conduit(dp);
+ struct dsa_port *cpu_dp = dp->cpu_dp;
+
+ path->dev = ctx->dev;
+ path->type = DEV_PATH_DSA;
+ path->dsa.proto = cpu_dp->tag_ops->proto;
+ path->dsa.port = dp->index;
+ ctx->dev = conduit;
+
+ return 0;
+}
+
+static int dsa_user_hwtstamp_get(struct net_device *dev,
+ struct kernel_hwtstamp_config *cfg)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct dsa_switch *ds = dp->ds;
+
+ if (!ds->ops->port_hwtstamp_get)
+ return -EOPNOTSUPP;
+
+ return ds->ops->port_hwtstamp_get(ds, dp->index, cfg);
+}
+
+static int dsa_user_hwtstamp_set(struct net_device *dev,
+ struct kernel_hwtstamp_config *cfg,
+ struct netlink_ext_ack *extack)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct dsa_switch *ds = dp->ds;
+
+ if (!ds->ops->port_hwtstamp_set)
+ return -EOPNOTSUPP;
+
+ return ds->ops->port_hwtstamp_set(ds, dp->index, cfg, extack);
+}
+
+static const struct net_device_ops dsa_user_netdev_ops = {
+ .ndo_open = dsa_user_open,
+ .ndo_stop = dsa_user_close,
+ .ndo_start_xmit = dsa_user_xmit,
+ .ndo_change_rx_flags = dsa_user_change_rx_flags,
+ .ndo_set_rx_mode = dsa_user_set_rx_mode,
+ .ndo_set_mac_address = dsa_user_set_mac_address,
+ .ndo_fdb_dump = dsa_user_fdb_dump,
+ .ndo_eth_ioctl = dsa_user_ioctl,
+ .ndo_get_iflink = dsa_user_get_iflink,
+#ifdef CONFIG_NET_POLL_CONTROLLER
+ .ndo_netpoll_setup = dsa_user_netpoll_setup,
+ .ndo_netpoll_cleanup = dsa_user_netpoll_cleanup,
+ .ndo_poll_controller = dsa_user_poll_controller,
+#endif
+ .ndo_setup_tc = dsa_user_setup_tc,
+ .ndo_get_stats64 = dsa_user_get_stats64,
+ .ndo_vlan_rx_add_vid = dsa_user_vlan_rx_add_vid,
+ .ndo_vlan_rx_kill_vid = dsa_user_vlan_rx_kill_vid,
+ .ndo_change_mtu = dsa_user_change_mtu,
+ .ndo_fill_forward_path = dsa_user_fill_forward_path,
+ .ndo_hwtstamp_get = dsa_user_hwtstamp_get,
+ .ndo_hwtstamp_set = dsa_user_hwtstamp_set,
+};
+
+static const struct device_type dsa_type = {
+ .name = "dsa",
+};
+
+void dsa_port_phylink_mac_change(struct dsa_switch *ds, int port, bool up)
+{
+ const struct dsa_port *dp = dsa_to_port(ds, port);
+
+ if (dp->pl)
+ phylink_mac_change(dp->pl, up);
+}
+EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_change);
+
+static void dsa_user_phylink_fixed_state(struct phylink_config *config,
+ struct phylink_link_state *state)
+{
+ struct dsa_port *dp = dsa_phylink_to_port(config);
+ struct dsa_switch *ds = dp->ds;
+
+ /* No need to check that this operation is valid, the callback would
+ * not be called if it was not.
+ */
+ ds->ops->phylink_fixed_state(ds, dp->index, state);
+}
+
+/* user device setup *******************************************************/
+static int dsa_user_phy_connect(struct net_device *user_dev, int addr,
+ u32 flags)
+{
+ struct dsa_port *dp = dsa_user_to_port(user_dev);
+ struct dsa_switch *ds = dp->ds;
+
+ user_dev->phydev = mdiobus_get_phy(ds->user_mii_bus, addr);
+ if (!user_dev->phydev) {
+ netdev_err(user_dev, "no phy at %d\n", addr);
+ return -ENODEV;
+ }
+
+ user_dev->phydev->dev_flags |= flags;
+
+ return phylink_connect_phy(dp->pl, user_dev->phydev);
+}
+
+static int dsa_user_phy_setup(struct net_device *user_dev)
+{
+ struct dsa_port *dp = dsa_user_to_port(user_dev);
+ struct device_node *port_dn = dp->dn;
+ struct dsa_switch *ds = dp->ds;
+ u32 phy_flags = 0;
+ int ret;
+
+ dp->pl_config.dev = &user_dev->dev;
+ dp->pl_config.type = PHYLINK_NETDEV;
+
+ /* The get_fixed_state callback takes precedence over polling the
+ * link GPIO in PHYLINK (see phylink_get_fixed_state). Only set
+ * this if the switch provides such a callback.
+ */
+ if (ds->ops->phylink_fixed_state) {
+ dp->pl_config.get_fixed_state = dsa_user_phylink_fixed_state;
+ dp->pl_config.poll_fixed_state = true;
+ }
+
+ ret = dsa_port_phylink_create(dp);
+ if (ret)
+ return ret;
+
+ if (ds->ops->get_phy_flags)
+ phy_flags = ds->ops->get_phy_flags(ds, dp->index);
+
+ ret = phylink_of_phy_connect(dp->pl, port_dn, phy_flags);
+ if (ret == -ENODEV && ds->user_mii_bus) {
+ /* We could not connect to a designated PHY or SFP, so try to
+ * use the switch internal MDIO bus instead
+ */
+ ret = dsa_user_phy_connect(user_dev, dp->index, phy_flags);
+ }
+ if (ret) {
+ netdev_err(user_dev, "failed to connect to PHY: %pe\n",
+ ERR_PTR(ret));
+ dsa_port_phylink_destroy(dp);
+ }
+
+ return ret;
+}
+
+void dsa_user_setup_tagger(struct net_device *user)
+{
+ struct dsa_port *dp = dsa_user_to_port(user);
+ struct net_device *conduit = dsa_port_to_conduit(dp);
+ struct dsa_user_priv *p = netdev_priv(user);
+ const struct dsa_port *cpu_dp = dp->cpu_dp;
+ const struct dsa_switch *ds = dp->ds;
+
+ user->needed_headroom = cpu_dp->tag_ops->needed_headroom;
+ user->needed_tailroom = cpu_dp->tag_ops->needed_tailroom;
+ /* Try to save one extra realloc later in the TX path (in the conduit)
+ * by also inheriting the conduit's needed headroom and tailroom.
+ * The 8021q driver also does this.
+ */
+ user->needed_headroom += conduit->needed_headroom;
+ user->needed_tailroom += conduit->needed_tailroom;
+
+ p->xmit = cpu_dp->tag_ops->xmit;
+
+ user->features = conduit->vlan_features | NETIF_F_HW_TC;
+ user->hw_features |= NETIF_F_HW_TC;
+ if (user->needed_tailroom)
+ user->features &= ~(NETIF_F_SG | NETIF_F_FRAGLIST);
+ if (ds->needs_standalone_vlan_filtering)
+ user->features |= NETIF_F_HW_VLAN_CTAG_FILTER;
+
+ user->lltx = true;
+}
+
+int dsa_user_suspend(struct net_device *user_dev)
+{
+ struct dsa_port *dp = dsa_user_to_port(user_dev);
+
+ if (!netif_running(user_dev))
+ return 0;
+
+ netif_device_detach(user_dev);
+
+ rtnl_lock();
+ phylink_stop(dp->pl);
+ rtnl_unlock();
+
+ return 0;
+}
+
+int dsa_user_resume(struct net_device *user_dev)
+{
+ struct dsa_port *dp = dsa_user_to_port(user_dev);
+
+ if (!netif_running(user_dev))
+ return 0;
+
+ netif_device_attach(user_dev);
+
+ rtnl_lock();
+ phylink_start(dp->pl);
+ rtnl_unlock();
+
+ return 0;
+}
+
+int dsa_user_create(struct dsa_port *port)
+{
+ struct net_device *conduit = dsa_port_to_conduit(port);
+ struct dsa_switch *ds = port->ds;
+ struct net_device *user_dev;
+ struct dsa_user_priv *p;
+ const char *name;
+ int assign_type;
+ int ret;
+
+ if (!ds->num_tx_queues)
+ ds->num_tx_queues = 1;
+
+ if (port->name) {
+ name = port->name;
+ assign_type = NET_NAME_PREDICTABLE;
+ } else {
+ name = "eth%d";
+ assign_type = NET_NAME_ENUM;
+ }
+
+ user_dev = alloc_netdev_mqs(sizeof(struct dsa_user_priv), name,
+ assign_type, ether_setup,
+ ds->num_tx_queues, 1);
+ if (user_dev == NULL)
+ return -ENOMEM;
+
+ user_dev->rtnl_link_ops = &dsa_link_ops;
+ user_dev->ethtool_ops = &dsa_user_ethtool_ops;
+#if IS_ENABLED(CONFIG_DCB)
+ user_dev->dcbnl_ops = &dsa_user_dcbnl_ops;
+#endif
+ if (!is_zero_ether_addr(port->mac))
+ eth_hw_addr_set(user_dev, port->mac);
+ else
+ eth_hw_addr_inherit(user_dev, conduit);
+ user_dev->priv_flags |= IFF_NO_QUEUE;
+ if (dsa_switch_supports_uc_filtering(ds))
+ user_dev->priv_flags |= IFF_UNICAST_FLT;
+ user_dev->netdev_ops = &dsa_user_netdev_ops;
+ if (ds->ops->port_max_mtu)
+ user_dev->max_mtu = ds->ops->port_max_mtu(ds, port->index);
+ SET_NETDEV_DEVTYPE(user_dev, &dsa_type);
+
+ SET_NETDEV_DEV(user_dev, port->ds->dev);
+ SET_NETDEV_DEVLINK_PORT(user_dev, &port->devlink_port);
+ user_dev->dev.of_node = port->dn;
+ user_dev->vlan_features = conduit->vlan_features;
+
+ p = netdev_priv(user_dev);
+ user_dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS;
+
+ ret = gro_cells_init(&p->gcells, user_dev);
+ if (ret)
+ goto out_free;
+
+ p->dp = port;
+ INIT_LIST_HEAD(&p->mall_tc_list);
+ port->user = user_dev;
+ dsa_user_setup_tagger(user_dev);
+
+ netif_carrier_off(user_dev);
+
+ ret = dsa_user_phy_setup(user_dev);
+ if (ret) {
+ netdev_err(user_dev,
+ "error %d setting up PHY for tree %d, switch %d, port %d\n",
+ ret, ds->dst->index, ds->index, port->index);
+ goto out_gcells;
+ }
+
+ rtnl_lock();
+
+ ret = dsa_user_change_mtu(user_dev, ETH_DATA_LEN);
+ if (ret && ret != -EOPNOTSUPP)
+ dev_warn(ds->dev, "nonfatal error %d setting MTU to %d on port %d\n",
+ ret, ETH_DATA_LEN, port->index);
+
+ ret = register_netdevice(user_dev);
+ if (ret) {
+ netdev_err(conduit, "error %d registering interface %s\n",
+ ret, user_dev->name);
+ rtnl_unlock();
+ goto out_phy;
+ }
+
+ if (IS_ENABLED(CONFIG_DCB)) {
+ ret = dsa_user_dcbnl_init(user_dev);
+ if (ret) {
+ netdev_err(user_dev,
+ "failed to initialize DCB: %pe\n",
+ ERR_PTR(ret));
+ rtnl_unlock();
+ goto out_unregister;
+ }
+ }
+
+ ret = netdev_upper_dev_link(conduit, user_dev, NULL);
+
+ rtnl_unlock();
+
+ if (ret)
+ goto out_unregister;
+
+ return 0;
+
+out_unregister:
+ unregister_netdev(user_dev);
+out_phy:
+ rtnl_lock();
+ phylink_disconnect_phy(p->dp->pl);
+ rtnl_unlock();
+ dsa_port_phylink_destroy(p->dp);
+out_gcells:
+ gro_cells_destroy(&p->gcells);
+out_free:
+ free_netdev(user_dev);
+ port->user = NULL;
+ return ret;
+}
+
+void dsa_user_destroy(struct net_device *user_dev)
+{
+ struct net_device *conduit = dsa_user_to_conduit(user_dev);
+ struct dsa_port *dp = dsa_user_to_port(user_dev);
+ struct dsa_user_priv *p = netdev_priv(user_dev);
+
+ netif_carrier_off(user_dev);
+ rtnl_lock();
+ netdev_upper_dev_unlink(conduit, user_dev);
+ unregister_netdevice(user_dev);
+ phylink_disconnect_phy(dp->pl);
+ rtnl_unlock();
+
+ dsa_port_phylink_destroy(dp);
+ gro_cells_destroy(&p->gcells);
+ free_netdev(user_dev);
+}
+
+int dsa_user_change_conduit(struct net_device *dev, struct net_device *conduit,
+ struct netlink_ext_ack *extack)
+{
+ struct net_device *old_conduit = dsa_user_to_conduit(dev);
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct dsa_switch *ds = dp->ds;
+ struct net_device *upper;
+ struct list_head *iter;
+ int err;
+
+ if (conduit == old_conduit)
+ return 0;
+
+ if (!ds->ops->port_change_conduit) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Driver does not support changing DSA conduit");
+ return -EOPNOTSUPP;
+ }
+
+ if (!netdev_uses_dsa(conduit)) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Interface not eligible as DSA conduit");
+ return -EOPNOTSUPP;
+ }
+
+ netdev_for_each_upper_dev_rcu(conduit, upper, iter) {
+ if (dsa_user_dev_check(upper))
+ continue;
+ if (netif_is_bridge_master(upper))
+ continue;
+ NL_SET_ERR_MSG_MOD(extack, "Cannot join conduit with unknown uppers");
+ return -EOPNOTSUPP;
+ }
+
+ /* Since we allow live-changing the DSA conduit, plus we auto-open the
+ * DSA conduit when the user port opens => we need to ensure that the
+ * new DSA conduit is open too.
+ */
+ if (dev->flags & IFF_UP) {
+ err = dev_open(conduit, extack);
+ if (err)
+ return err;
+ }
+
+ netdev_upper_dev_unlink(old_conduit, dev);
+
+ err = netdev_upper_dev_link(conduit, dev, extack);
+ if (err)
+ goto out_revert_old_conduit_unlink;
+
+ err = dsa_port_change_conduit(dp, conduit, extack);
+ if (err)
+ goto out_revert_conduit_link;
+
+ /* Update the MTU of the new CPU port through cross-chip notifiers */
+ err = dsa_user_change_mtu(dev, dev->mtu);
+ if (err && err != -EOPNOTSUPP) {
+ netdev_warn(dev,
+ "nonfatal error updating MTU with new conduit: %pe\n",
+ ERR_PTR(err));
+ }
+
+ return 0;
+
+out_revert_conduit_link:
+ netdev_upper_dev_unlink(conduit, dev);
+out_revert_old_conduit_unlink:
+ netdev_upper_dev_link(old_conduit, dev, NULL);
+ return err;
+}
+
+bool dsa_user_dev_check(const struct net_device *dev)
+{
+ return dev->netdev_ops == &dsa_user_netdev_ops;
+}
+EXPORT_SYMBOL_GPL(dsa_user_dev_check);
+
+static int dsa_user_changeupper(struct net_device *dev,
+ struct netdev_notifier_changeupper_info *info)
+{
+ struct netlink_ext_ack *extack;
+ int err = NOTIFY_DONE;
+ struct dsa_port *dp;
+
+ if (!dsa_user_dev_check(dev))
+ return err;
+
+ dp = dsa_user_to_port(dev);
+ extack = netdev_notifier_info_to_extack(&info->info);
+
+ if (netif_is_bridge_master(info->upper_dev)) {
+ if (info->linking) {
+ err = dsa_port_bridge_join(dp, info->upper_dev, extack);
+ if (!err)
+ dsa_bridge_mtu_normalization(dp);
+ if (err == -EOPNOTSUPP) {
+ NL_SET_ERR_MSG_WEAK_MOD(extack,
+ "Offloading not supported");
+ err = 0;
+ }
+ err = notifier_from_errno(err);
+ } else {
+ dsa_port_bridge_leave(dp, info->upper_dev);
+ err = NOTIFY_OK;
+ }
+ } else if (netif_is_lag_master(info->upper_dev)) {
+ if (info->linking) {
+ err = dsa_port_lag_join(dp, info->upper_dev,
+ info->upper_info, extack);
+ if (err == -EOPNOTSUPP) {
+ NL_SET_ERR_MSG_WEAK_MOD(extack,
+ "Offloading not supported");
+ err = 0;
+ }
+ err = notifier_from_errno(err);
+ } else {
+ dsa_port_lag_leave(dp, info->upper_dev);
+ err = NOTIFY_OK;
+ }
+ } else if (is_hsr_master(info->upper_dev)) {
+ if (info->linking) {
+ err = dsa_port_hsr_join(dp, info->upper_dev, extack);
+ if (err == -EOPNOTSUPP) {
+ NL_SET_ERR_MSG_WEAK_MOD(extack,
+ "Offloading not supported");
+ err = 0;
+ }
+ err = notifier_from_errno(err);
+ } else {
+ dsa_port_hsr_leave(dp, info->upper_dev);
+ err = NOTIFY_OK;
+ }
+ }
+
+ return err;
+}
+
+static int dsa_user_prechangeupper(struct net_device *dev,
+ struct netdev_notifier_changeupper_info *info)
+{
+ struct dsa_port *dp;
+
+ if (!dsa_user_dev_check(dev))
+ return NOTIFY_DONE;
+
+ dp = dsa_user_to_port(dev);
+
+ if (netif_is_bridge_master(info->upper_dev) && !info->linking)
+ dsa_port_pre_bridge_leave(dp, info->upper_dev);
+ else if (netif_is_lag_master(info->upper_dev) && !info->linking)
+ dsa_port_pre_lag_leave(dp, info->upper_dev);
+ /* dsa_port_pre_hsr_leave is not yet necessary since hsr devices cannot
+ * meaningfully placed under a bridge yet
+ */
+
+ return NOTIFY_DONE;
+}
+
+static int
+dsa_user_lag_changeupper(struct net_device *dev,
+ struct netdev_notifier_changeupper_info *info)
+{
+ struct net_device *lower;
+ struct list_head *iter;
+ int err = NOTIFY_DONE;
+ struct dsa_port *dp;
+
+ if (!netif_is_lag_master(dev))
+ return err;
+
+ netdev_for_each_lower_dev(dev, lower, iter) {
+ if (!dsa_user_dev_check(lower))
+ continue;
+
+ dp = dsa_user_to_port(lower);
+ if (!dp->lag)
+ /* Software LAG */
+ continue;
+
+ err = dsa_user_changeupper(lower, info);
+ if (notifier_to_errno(err))
+ break;
+ }
+
+ return err;
+}
+
+/* Same as dsa_user_lag_changeupper() except that it calls
+ * dsa_user_prechangeupper()
+ */
+static int
+dsa_user_lag_prechangeupper(struct net_device *dev,
+ struct netdev_notifier_changeupper_info *info)
+{
+ struct net_device *lower;
+ struct list_head *iter;
+ int err = NOTIFY_DONE;
+ struct dsa_port *dp;
+
+ if (!netif_is_lag_master(dev))
+ return err;
+
+ netdev_for_each_lower_dev(dev, lower, iter) {
+ if (!dsa_user_dev_check(lower))
+ continue;
+
+ dp = dsa_user_to_port(lower);
+ if (!dp->lag)
+ /* Software LAG */
+ continue;
+
+ err = dsa_user_prechangeupper(lower, info);
+ if (notifier_to_errno(err))
+ break;
+ }
+
+ return err;
+}
+
+static int
+dsa_prevent_bridging_8021q_upper(struct net_device *dev,
+ struct netdev_notifier_changeupper_info *info)
+{
+ struct netlink_ext_ack *ext_ack;
+ struct net_device *user, *br;
+ struct dsa_port *dp;
+
+ ext_ack = netdev_notifier_info_to_extack(&info->info);
+
+ if (!is_vlan_dev(dev))
+ return NOTIFY_DONE;
+
+ user = vlan_dev_real_dev(dev);
+ if (!dsa_user_dev_check(user))
+ return NOTIFY_DONE;
+
+ dp = dsa_user_to_port(user);
+ br = dsa_port_bridge_dev_get(dp);
+ if (!br)
+ return NOTIFY_DONE;
+
+ /* Deny enslaving a VLAN device into a VLAN-aware bridge */
+ if (br_vlan_enabled(br) &&
+ netif_is_bridge_master(info->upper_dev) && info->linking) {
+ NL_SET_ERR_MSG_MOD(ext_ack,
+ "Cannot make VLAN device join VLAN-aware bridge");
+ return notifier_from_errno(-EINVAL);
+ }
+
+ return NOTIFY_DONE;
+}
+
+static int
+dsa_user_check_8021q_upper(struct net_device *dev,
+ struct netdev_notifier_changeupper_info *info)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct net_device *br = dsa_port_bridge_dev_get(dp);
+ struct bridge_vlan_info br_info;
+ struct netlink_ext_ack *extack;
+ int err = NOTIFY_DONE;
+ u16 vid;
+
+ if (!br || !br_vlan_enabled(br))
+ return NOTIFY_DONE;
+
+ extack = netdev_notifier_info_to_extack(&info->info);
+ vid = vlan_dev_vlan_id(info->upper_dev);
+
+ /* br_vlan_get_info() returns -EINVAL or -ENOENT if the
+ * device, respectively the VID is not found, returning
+ * 0 means success, which is a failure for us here.
+ */
+ err = br_vlan_get_info(br, vid, &br_info);
+ if (err == 0) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "This VLAN is already configured by the bridge");
+ return notifier_from_errno(-EBUSY);
+ }
+
+ return NOTIFY_DONE;
+}
+
+static int
+dsa_user_prechangeupper_sanity_check(struct net_device *dev,
+ struct netdev_notifier_changeupper_info *info)
+{
+ struct dsa_switch *ds;
+ struct dsa_port *dp;
+ int err;
+
+ if (!dsa_user_dev_check(dev))
+ return dsa_prevent_bridging_8021q_upper(dev, info);
+
+ dp = dsa_user_to_port(dev);
+ ds = dp->ds;
+
+ if (ds->ops->port_prechangeupper) {
+ err = ds->ops->port_prechangeupper(ds, dp->index, info);
+ if (err)
+ return notifier_from_errno(err);
+ }
+
+ if (is_vlan_dev(info->upper_dev))
+ return dsa_user_check_8021q_upper(dev, info);
+
+ return NOTIFY_DONE;
+}
+
+/* To be eligible as a DSA conduit, a LAG must have all lower interfaces be
+ * eligible DSA conduits. Additionally, all LAG slaves must be DSA conduits of
+ * switches in the same switch tree.
+ */
+static int dsa_lag_conduit_validate(struct net_device *lag_dev,
+ struct netlink_ext_ack *extack)
+{
+ struct net_device *lower1, *lower2;
+ struct list_head *iter1, *iter2;
+
+ netdev_for_each_lower_dev(lag_dev, lower1, iter1) {
+ netdev_for_each_lower_dev(lag_dev, lower2, iter2) {
+ if (!netdev_uses_dsa(lower1) ||
+ !netdev_uses_dsa(lower2)) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "All LAG ports must be eligible as DSA conduits");
+ return notifier_from_errno(-EINVAL);
+ }
+
+ if (lower1 == lower2)
+ continue;
+
+ if (!dsa_port_tree_same(lower1->dsa_ptr,
+ lower2->dsa_ptr)) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "LAG contains DSA conduits of disjoint switch trees");
+ return notifier_from_errno(-EINVAL);
+ }
+ }
+ }
+
+ return NOTIFY_DONE;
+}
+
+static int
+dsa_conduit_prechangeupper_sanity_check(struct net_device *conduit,
+ struct netdev_notifier_changeupper_info *info)
+{
+ struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(&info->info);
+
+ if (!netdev_uses_dsa(conduit))
+ return NOTIFY_DONE;
+
+ if (!info->linking)
+ return NOTIFY_DONE;
+
+ /* Allow DSA switch uppers */
+ if (dsa_user_dev_check(info->upper_dev))
+ return NOTIFY_DONE;
+
+ /* Allow bridge uppers of DSA conduits, subject to further
+ * restrictions in dsa_bridge_prechangelower_sanity_check()
+ */
+ if (netif_is_bridge_master(info->upper_dev))
+ return NOTIFY_DONE;
+
+ /* Allow LAG uppers, subject to further restrictions in
+ * dsa_lag_conduit_prechangelower_sanity_check()
+ */
+ if (netif_is_lag_master(info->upper_dev))
+ return dsa_lag_conduit_validate(info->upper_dev, extack);
+
+ NL_SET_ERR_MSG_MOD(extack,
+ "DSA conduit cannot join unknown upper interfaces");
+ return notifier_from_errno(-EBUSY);
+}
+
+static int
+dsa_lag_conduit_prechangelower_sanity_check(struct net_device *dev,
+ struct netdev_notifier_changeupper_info *info)
+{
+ struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(&info->info);
+ struct net_device *lag_dev = info->upper_dev;
+ struct net_device *lower;
+ struct list_head *iter;
+
+ if (!netdev_uses_dsa(lag_dev) || !netif_is_lag_master(lag_dev))
+ return NOTIFY_DONE;
+
+ if (!info->linking)
+ return NOTIFY_DONE;
+
+ if (!netdev_uses_dsa(dev)) {
+ NL_SET_ERR_MSG(extack,
+ "Only DSA conduits can join a LAG DSA conduit");
+ return notifier_from_errno(-EINVAL);
+ }
+
+ netdev_for_each_lower_dev(lag_dev, lower, iter) {
+ if (!dsa_port_tree_same(dev->dsa_ptr, lower->dsa_ptr)) {
+ NL_SET_ERR_MSG(extack,
+ "Interface is DSA conduit for a different switch tree than this LAG");
+ return notifier_from_errno(-EINVAL);
+ }
+
+ break;
+ }
+
+ return NOTIFY_DONE;
+}
+
+/* Don't allow bridging of DSA conduits, since the bridge layer rx_handler
+ * prevents the DSA fake ethertype handler to be invoked, so we don't get the
+ * chance to strip off and parse the DSA switch tag protocol header (the bridge
+ * layer just returns RX_HANDLER_CONSUMED, stopping RX processing for these
+ * frames).
+ * The only case where that would not be an issue is when bridging can already
+ * be offloaded, such as when the DSA conduit is itself a DSA or plain switchdev
+ * port, and is bridged only with other ports from the same hardware device.
+ */
+static int
+dsa_bridge_prechangelower_sanity_check(struct net_device *new_lower,
+ struct netdev_notifier_changeupper_info *info)
+{
+ struct net_device *br = info->upper_dev;
+ struct netlink_ext_ack *extack;
+ struct net_device *lower;
+ struct list_head *iter;
+
+ if (!netif_is_bridge_master(br))
+ return NOTIFY_DONE;
+
+ if (!info->linking)
+ return NOTIFY_DONE;
+
+ extack = netdev_notifier_info_to_extack(&info->info);
+
+ netdev_for_each_lower_dev(br, lower, iter) {
+ if (!netdev_uses_dsa(new_lower) && !netdev_uses_dsa(lower))
+ continue;
+
+ if (!netdev_port_same_parent_id(lower, new_lower)) {
+ NL_SET_ERR_MSG(extack,
+ "Cannot do software bridging with a DSA conduit");
+ return notifier_from_errno(-EINVAL);
+ }
+ }
+
+ return NOTIFY_DONE;
+}
+
+static void dsa_tree_migrate_ports_from_lag_conduit(struct dsa_switch_tree *dst,
+ struct net_device *lag_dev)
+{
+ struct net_device *new_conduit = dsa_tree_find_first_conduit(dst);
+ struct dsa_port *dp;
+ int err;
+
+ dsa_tree_for_each_user_port(dp, dst) {
+ if (dsa_port_to_conduit(dp) != lag_dev)
+ continue;
+
+ err = dsa_user_change_conduit(dp->user, new_conduit, NULL);
+ if (err) {
+ netdev_err(dp->user,
+ "failed to restore conduit to %s: %pe\n",
+ new_conduit->name, ERR_PTR(err));
+ }
+ }
+}
+
+static int dsa_conduit_lag_join(struct net_device *conduit,
+ struct net_device *lag_dev,
+ struct netdev_lag_upper_info *uinfo,
+ struct netlink_ext_ack *extack)
+{
+ struct dsa_port *cpu_dp = conduit->dsa_ptr;
+ struct dsa_switch_tree *dst = cpu_dp->dst;
+ struct dsa_port *dp;
+ int err;
+
+ err = dsa_conduit_lag_setup(lag_dev, cpu_dp, uinfo, extack);
+ if (err)
+ return err;
+
+ dsa_tree_for_each_user_port(dp, dst) {
+ if (dsa_port_to_conduit(dp) != conduit)
+ continue;
+
+ err = dsa_user_change_conduit(dp->user, lag_dev, extack);
+ if (err)
+ goto restore;
+ }
+
+ return 0;
+
+restore:
+ dsa_tree_for_each_user_port_continue_reverse(dp, dst) {
+ if (dsa_port_to_conduit(dp) != lag_dev)
+ continue;
+
+ err = dsa_user_change_conduit(dp->user, conduit, NULL);
+ if (err) {
+ netdev_err(dp->user,
+ "failed to restore conduit to %s: %pe\n",
+ conduit->name, ERR_PTR(err));
+ }
+ }
+
+ dsa_conduit_lag_teardown(lag_dev, conduit->dsa_ptr);
+
+ return err;
+}
+
+static void dsa_conduit_lag_leave(struct net_device *conduit,
+ struct net_device *lag_dev)
+{
+ struct dsa_port *dp, *cpu_dp = lag_dev->dsa_ptr;
+ struct dsa_switch_tree *dst = cpu_dp->dst;
+ struct dsa_port *new_cpu_dp = NULL;
+ struct net_device *lower;
+ struct list_head *iter;
+
+ netdev_for_each_lower_dev(lag_dev, lower, iter) {
+ if (netdev_uses_dsa(lower)) {
+ new_cpu_dp = lower->dsa_ptr;
+ break;
+ }
+ }
+
+ if (new_cpu_dp) {
+ /* Update the CPU port of the user ports still under the LAG
+ * so that dsa_port_to_conduit() continues to work properly
+ */
+ dsa_tree_for_each_user_port(dp, dst)
+ if (dsa_port_to_conduit(dp) == lag_dev)
+ dp->cpu_dp = new_cpu_dp;
+
+ /* Update the index of the virtual CPU port to match the lowest
+ * physical CPU port
+ */
+ lag_dev->dsa_ptr = new_cpu_dp;
+ wmb();
+ } else {
+ /* If the LAG DSA conduit has no ports left, migrate back all
+ * user ports to the first physical CPU port
+ */
+ dsa_tree_migrate_ports_from_lag_conduit(dst, lag_dev);
+ }
+
+ /* This DSA conduit has left its LAG in any case, so let
+ * the CPU port leave the hardware LAG as well
+ */
+ dsa_conduit_lag_teardown(lag_dev, conduit->dsa_ptr);
+}
+
+static int dsa_conduit_changeupper(struct net_device *dev,
+ struct netdev_notifier_changeupper_info *info)
+{
+ struct netlink_ext_ack *extack;
+ int err = NOTIFY_DONE;
+
+ if (!netdev_uses_dsa(dev))
+ return err;
+
+ extack = netdev_notifier_info_to_extack(&info->info);
+
+ if (netif_is_lag_master(info->upper_dev)) {
+ if (info->linking) {
+ err = dsa_conduit_lag_join(dev, info->upper_dev,
+ info->upper_info, extack);
+ err = notifier_from_errno(err);
+ } else {
+ dsa_conduit_lag_leave(dev, info->upper_dev);
+ err = NOTIFY_OK;
+ }
+ }
+
+ return err;
+}
+
+static int dsa_user_netdevice_event(struct notifier_block *nb,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+
+ switch (event) {
+ case NETDEV_PRECHANGEUPPER: {
+ struct netdev_notifier_changeupper_info *info = ptr;
+ int err;
+
+ err = dsa_user_prechangeupper_sanity_check(dev, info);
+ if (notifier_to_errno(err))
+ return err;
+
+ err = dsa_conduit_prechangeupper_sanity_check(dev, info);
+ if (notifier_to_errno(err))
+ return err;
+
+ err = dsa_lag_conduit_prechangelower_sanity_check(dev, info);
+ if (notifier_to_errno(err))
+ return err;
+
+ err = dsa_bridge_prechangelower_sanity_check(dev, info);
+ if (notifier_to_errno(err))
+ return err;
+
+ err = dsa_user_prechangeupper(dev, ptr);
+ if (notifier_to_errno(err))
+ return err;
+
+ err = dsa_user_lag_prechangeupper(dev, ptr);
+ if (notifier_to_errno(err))
+ return err;
+
+ break;
+ }
+ case NETDEV_CHANGEUPPER: {
+ int err;
+
+ err = dsa_user_changeupper(dev, ptr);
+ if (notifier_to_errno(err))
+ return err;
+
+ err = dsa_user_lag_changeupper(dev, ptr);
+ if (notifier_to_errno(err))
+ return err;
+
+ err = dsa_conduit_changeupper(dev, ptr);
+ if (notifier_to_errno(err))
+ return err;
+
+ break;
+ }
+ case NETDEV_CHANGELOWERSTATE: {
+ struct netdev_notifier_changelowerstate_info *info = ptr;
+ struct dsa_port *dp;
+ int err = 0;
+
+ if (dsa_user_dev_check(dev)) {
+ dp = dsa_user_to_port(dev);
+
+ err = dsa_port_lag_change(dp, info->lower_state_info);
+ }
+
+ /* Mirror LAG port events on DSA conduits that are in
+ * a LAG towards their respective switch CPU ports
+ */
+ if (netdev_uses_dsa(dev)) {
+ dp = dev->dsa_ptr;
+
+ err = dsa_port_lag_change(dp, info->lower_state_info);
+ }
+
+ return notifier_from_errno(err);
+ }
+ case NETDEV_CHANGE:
+ case NETDEV_UP: {
+ /* Track state of conduit port.
+ * DSA driver may require the conduit port (and indirectly
+ * the tagger) to be available for some special operation.
+ */
+ if (netdev_uses_dsa(dev)) {
+ struct dsa_port *cpu_dp = dev->dsa_ptr;
+ struct dsa_switch_tree *dst = cpu_dp->ds->dst;
+
+ /* Track when the conduit port is UP */
+ dsa_tree_conduit_oper_state_change(dst, dev,
+ netif_oper_up(dev));
+
+ /* Track when the conduit port is ready and can accept
+ * packet.
+ * NETDEV_UP event is not enough to flag a port as ready.
+ * We also have to wait for linkwatch_do_dev to dev_activate
+ * and emit a NETDEV_CHANGE event.
+ * We check if a conduit port is ready by checking if the dev
+ * have a qdisc assigned and is not noop.
+ */
+ dsa_tree_conduit_admin_state_change(dst, dev,
+ !qdisc_tx_is_noop(dev));
+
+ return NOTIFY_OK;
+ }
+
+ return NOTIFY_DONE;
+ }
+ case NETDEV_GOING_DOWN: {
+ struct dsa_port *dp, *cpu_dp;
+ struct dsa_switch_tree *dst;
+ LIST_HEAD(close_list);
+
+ if (!netdev_uses_dsa(dev))
+ return NOTIFY_DONE;
+
+ cpu_dp = dev->dsa_ptr;
+ dst = cpu_dp->ds->dst;
+
+ dsa_tree_conduit_admin_state_change(dst, dev, false);
+
+ list_for_each_entry(dp, &dst->ports, list) {
+ if (!dsa_port_is_user(dp))
+ continue;
+
+ if (dp->cpu_dp != cpu_dp)
+ continue;
+
+ list_add(&dp->user->close_list, &close_list);
+ }
+
+ netif_close_many(&close_list, true);
+
+ return NOTIFY_OK;
+ }
+ default:
+ break;
+ }
+
+ return NOTIFY_DONE;
+}
+
+static void
+dsa_fdb_offload_notify(struct dsa_switchdev_event_work *switchdev_work)
+{
+ struct switchdev_notifier_fdb_info info = {};
+
+ info.addr = switchdev_work->addr;
+ info.vid = switchdev_work->vid;
+ info.offloaded = true;
+ call_switchdev_notifiers(SWITCHDEV_FDB_OFFLOADED,
+ switchdev_work->orig_dev, &info.info, NULL);
+}
+
+static void dsa_user_switchdev_event_work(struct work_struct *work)
+{
+ struct dsa_switchdev_event_work *switchdev_work =
+ container_of(work, struct dsa_switchdev_event_work, work);
+ const unsigned char *addr = switchdev_work->addr;
+ struct net_device *dev = switchdev_work->dev;
+ u16 vid = switchdev_work->vid;
+ struct dsa_switch *ds;
+ struct dsa_port *dp;
+ int err;
+
+ dp = dsa_user_to_port(dev);
+ ds = dp->ds;
+
+ switch (switchdev_work->event) {
+ case SWITCHDEV_FDB_ADD_TO_DEVICE:
+ if (switchdev_work->host_addr)
+ err = dsa_port_bridge_host_fdb_add(dp, addr, vid);
+ else if (dp->lag)
+ err = dsa_port_lag_fdb_add(dp, addr, vid);
+ else
+ err = dsa_port_fdb_add(dp, addr, vid);
+ if (err) {
+ dev_err(ds->dev,
+ "port %d failed to add %pM vid %d to fdb: %d\n",
+ dp->index, addr, vid, err);
+ break;
+ }
+ dsa_fdb_offload_notify(switchdev_work);
+ break;
+
+ case SWITCHDEV_FDB_DEL_TO_DEVICE:
+ if (switchdev_work->host_addr)
+ err = dsa_port_bridge_host_fdb_del(dp, addr, vid);
+ else if (dp->lag)
+ err = dsa_port_lag_fdb_del(dp, addr, vid);
+ else
+ err = dsa_port_fdb_del(dp, addr, vid);
+ if (err) {
+ dev_err(ds->dev,
+ "port %d failed to delete %pM vid %d from fdb: %d\n",
+ dp->index, addr, vid, err);
+ }
+
+ break;
+ }
+
+ kfree(switchdev_work);
+}
+
+static bool dsa_foreign_dev_check(const struct net_device *dev,
+ const struct net_device *foreign_dev)
+{
+ const struct dsa_port *dp = dsa_user_to_port(dev);
+ struct dsa_switch_tree *dst = dp->ds->dst;
+
+ if (netif_is_bridge_master(foreign_dev))
+ return !dsa_tree_offloads_bridge_dev(dst, foreign_dev);
+
+ if (netif_is_bridge_port(foreign_dev))
+ return !dsa_tree_offloads_bridge_port(dst, foreign_dev);
+
+ /* Everything else is foreign */
+ return true;
+}
+
+static int dsa_user_fdb_event(struct net_device *dev,
+ struct net_device *orig_dev,
+ unsigned long event, const void *ctx,
+ const struct switchdev_notifier_fdb_info *fdb_info)
+{
+ struct dsa_switchdev_event_work *switchdev_work;
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ bool host_addr = fdb_info->is_local;
+ struct dsa_switch *ds = dp->ds;
+
+ if (ctx && ctx != dp)
+ return 0;
+
+ if (!dp->bridge)
+ return 0;
+
+ if (switchdev_fdb_is_dynamically_learned(fdb_info)) {
+ if (dsa_port_offloads_bridge_port(dp, orig_dev))
+ return 0;
+
+ /* FDB entries learned by the software bridge or by foreign
+ * bridge ports should be installed as host addresses only if
+ * the driver requests assisted learning.
+ */
+ if (!ds->assisted_learning_on_cpu_port)
+ return 0;
+ }
+
+ /* Also treat FDB entries on foreign interfaces bridged with us as host
+ * addresses.
+ */
+ if (dsa_foreign_dev_check(dev, orig_dev))
+ host_addr = true;
+
+ /* Check early that we're not doing work in vain.
+ * Host addresses on LAG ports still require regular FDB ops,
+ * since the CPU port isn't in a LAG.
+ */
+ if (dp->lag && !host_addr) {
+ if (!ds->ops->lag_fdb_add || !ds->ops->lag_fdb_del)
+ return -EOPNOTSUPP;
+ } else {
+ if (!ds->ops->port_fdb_add || !ds->ops->port_fdb_del)
+ return -EOPNOTSUPP;
+ }
+
+ switchdev_work = kzalloc(sizeof(*switchdev_work), GFP_ATOMIC);
+ if (!switchdev_work)
+ return -ENOMEM;
+
+ netdev_dbg(dev, "%s FDB entry towards %s, addr %pM vid %d%s\n",
+ event == SWITCHDEV_FDB_ADD_TO_DEVICE ? "Adding" : "Deleting",
+ orig_dev->name, fdb_info->addr, fdb_info->vid,
+ host_addr ? " as host address" : "");
+
+ INIT_WORK(&switchdev_work->work, dsa_user_switchdev_event_work);
+ switchdev_work->event = event;
+ switchdev_work->dev = dev;
+ switchdev_work->orig_dev = orig_dev;
+
+ ether_addr_copy(switchdev_work->addr, fdb_info->addr);
+ switchdev_work->vid = fdb_info->vid;
+ switchdev_work->host_addr = host_addr;
+
+ dsa_schedule_work(&switchdev_work->work);
+
+ return 0;
+}
+
+/* Called under rcu_read_lock() */
+static int dsa_user_switchdev_event(struct notifier_block *unused,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = switchdev_notifier_info_to_dev(ptr);
+ int err;
+
+ switch (event) {
+ case SWITCHDEV_PORT_ATTR_SET:
+ err = switchdev_handle_port_attr_set(dev, ptr,
+ dsa_user_dev_check,
+ dsa_user_port_attr_set);
+ return notifier_from_errno(err);
+ case SWITCHDEV_FDB_ADD_TO_DEVICE:
+ case SWITCHDEV_FDB_DEL_TO_DEVICE:
+ err = switchdev_handle_fdb_event_to_device(dev, event, ptr,
+ dsa_user_dev_check,
+ dsa_foreign_dev_check,
+ dsa_user_fdb_event);
+ return notifier_from_errno(err);
+ default:
+ return NOTIFY_DONE;
+ }
+
+ return NOTIFY_OK;
+}
+
+static int dsa_user_switchdev_blocking_event(struct notifier_block *unused,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = switchdev_notifier_info_to_dev(ptr);
+ int err;
+
+ switch (event) {
+ case SWITCHDEV_PORT_OBJ_ADD:
+ err = switchdev_handle_port_obj_add_foreign(dev, ptr,
+ dsa_user_dev_check,
+ dsa_foreign_dev_check,
+ dsa_user_port_obj_add);
+ return notifier_from_errno(err);
+ case SWITCHDEV_PORT_OBJ_DEL:
+ err = switchdev_handle_port_obj_del_foreign(dev, ptr,
+ dsa_user_dev_check,
+ dsa_foreign_dev_check,
+ dsa_user_port_obj_del);
+ return notifier_from_errno(err);
+ case SWITCHDEV_PORT_ATTR_SET:
+ err = switchdev_handle_port_attr_set(dev, ptr,
+ dsa_user_dev_check,
+ dsa_user_port_attr_set);
+ return notifier_from_errno(err);
+ }
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block dsa_user_nb __read_mostly = {
+ .notifier_call = dsa_user_netdevice_event,
+};
+
+struct notifier_block dsa_user_switchdev_notifier = {
+ .notifier_call = dsa_user_switchdev_event,
+};
+
+struct notifier_block dsa_user_switchdev_blocking_notifier = {
+ .notifier_call = dsa_user_switchdev_blocking_event,
+};
+
+int dsa_user_register_notifier(void)
+{
+ struct notifier_block *nb;
+ int err;
+
+ err = register_netdevice_notifier(&dsa_user_nb);
+ if (err)
+ return err;
+
+ err = register_switchdev_notifier(&dsa_user_switchdev_notifier);
+ if (err)
+ goto err_switchdev_nb;
+
+ nb = &dsa_user_switchdev_blocking_notifier;
+ err = register_switchdev_blocking_notifier(nb);
+ if (err)
+ goto err_switchdev_blocking_nb;
+
+ return 0;
+
+err_switchdev_blocking_nb:
+ unregister_switchdev_notifier(&dsa_user_switchdev_notifier);
+err_switchdev_nb:
+ unregister_netdevice_notifier(&dsa_user_nb);
+ return err;
+}
+
+void dsa_user_unregister_notifier(void)
+{
+ struct notifier_block *nb;
+ int err;
+
+ nb = &dsa_user_switchdev_blocking_notifier;
+ err = unregister_switchdev_blocking_notifier(nb);
+ if (err)
+ pr_err("DSA: failed to unregister switchdev blocking notifier (%d)\n", err);
+
+ err = unregister_switchdev_notifier(&dsa_user_switchdev_notifier);
+ if (err)
+ pr_err("DSA: failed to unregister switchdev notifier (%d)\n", err);
+
+ err = unregister_netdevice_notifier(&dsa_user_nb);
+ if (err)
+ pr_err("DSA: failed to unregister user notifier (%d)\n", err);
+}
diff --git a/net/dsa/user.h b/net/dsa/user.h
new file mode 100644
index 000000000000..016884bead3c
--- /dev/null
+++ b/net/dsa/user.h
@@ -0,0 +1,71 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifndef __DSA_USER_H
+#define __DSA_USER_H
+
+#include <linux/if_bridge.h>
+#include <linux/if_vlan.h>
+#include <linux/list.h>
+#include <linux/netpoll.h>
+#include <linux/types.h>
+#include <net/dsa.h>
+#include <net/gro_cells.h>
+
+struct net_device;
+struct netlink_ext_ack;
+
+extern struct notifier_block dsa_user_switchdev_notifier;
+extern struct notifier_block dsa_user_switchdev_blocking_notifier;
+
+struct dsa_user_priv {
+ /* Copy of CPU port xmit for faster access in user transmit hot path */
+ struct sk_buff * (*xmit)(struct sk_buff *skb,
+ struct net_device *dev);
+
+ struct gro_cells gcells;
+
+ /* DSA port data, such as switch, port index, etc. */
+ struct dsa_port *dp;
+
+#ifdef CONFIG_NET_POLL_CONTROLLER
+ struct netpoll *netpoll;
+#endif
+
+ /* TC context */
+ struct list_head mall_tc_list;
+};
+
+void dsa_user_mii_bus_init(struct dsa_switch *ds);
+int dsa_user_create(struct dsa_port *dp);
+void dsa_user_destroy(struct net_device *user_dev);
+int dsa_user_suspend(struct net_device *user_dev);
+int dsa_user_resume(struct net_device *user_dev);
+int dsa_user_register_notifier(void);
+void dsa_user_unregister_notifier(void);
+int dsa_user_host_uc_install(struct net_device *dev, const u8 *addr);
+void dsa_user_host_uc_uninstall(struct net_device *dev);
+void dsa_user_sync_ha(struct net_device *dev);
+void dsa_user_unsync_ha(struct net_device *dev);
+void dsa_user_setup_tagger(struct net_device *user);
+int dsa_user_change_mtu(struct net_device *dev, int new_mtu);
+int dsa_user_change_conduit(struct net_device *dev, struct net_device *conduit,
+ struct netlink_ext_ack *extack);
+int dsa_user_manage_vlan_filtering(struct net_device *dev,
+ bool vlan_filtering);
+
+static inline struct dsa_port *dsa_user_to_port(const struct net_device *dev)
+{
+ struct dsa_user_priv *p = netdev_priv(dev);
+
+ return p->dp;
+}
+
+static inline struct net_device *
+dsa_user_to_conduit(const struct net_device *dev)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+
+ return dsa_port_to_conduit(dp);
+}
+
+#endif
diff --git a/net/econet/Kconfig b/net/econet/Kconfig
deleted file mode 100644
index 39a2d2975e0e..000000000000
--- a/net/econet/Kconfig
+++ /dev/null
@@ -1,36 +0,0 @@
-#
-# Acorn Econet/AUN protocols
-#
-
-config ECONET
- tristate "Acorn Econet/AUN protocols (EXPERIMENTAL)"
- depends on EXPERIMENTAL && INET
- ---help---
- Econet is a fairly old and slow networking protocol mainly used by
- Acorn computers to access file and print servers. It uses native
- Econet network cards. AUN is an implementation of the higher level
- parts of Econet that runs over ordinary Ethernet connections, on
- top of the UDP packet protocol, which in turn runs on top of the
- Internet protocol IP.
-
- If you say Y here, you can choose with the next two options whether
- to send Econet/AUN traffic over a UDP Ethernet connection or over
- a native Econet network card.
-
- To compile this driver as a module, choose M here: the module
- will be called econet.
-
-config ECONET_AUNUDP
- bool "AUN over UDP"
- depends on ECONET
- help
- Say Y here if you want to send Econet/AUN traffic over a UDP
- connection (UDP is a packet based protocol that runs on top of the
- Internet protocol IP) using an ordinary Ethernet network card.
-
-config ECONET_NATIVE
- bool "Native Econet"
- depends on ECONET
- help
- Say Y here if you have a native Econet network card installed in
- your computer.
diff --git a/net/econet/Makefile b/net/econet/Makefile
deleted file mode 100644
index 39f0a77abdbd..000000000000
--- a/net/econet/Makefile
+++ /dev/null
@@ -1,7 +0,0 @@
-#
-# Makefile for Econet support code.
-#
-
-obj-$(CONFIG_ECONET) += econet.o
-
-econet-objs := af_econet.o
diff --git a/net/econet/af_econet.c b/net/econet/af_econet.c
deleted file mode 100644
index 4d66aac13483..000000000000
--- a/net/econet/af_econet.c
+++ /dev/null
@@ -1,1174 +0,0 @@
-/*
- * An implementation of the Acorn Econet and AUN protocols.
- * Philip Blundell <philb@gnu.org>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- */
-
-#include <linux/module.h>
-
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/string.h>
-#include <linux/mm.h>
-#include <linux/socket.h>
-#include <linux/sockios.h>
-#include <linux/in.h>
-#include <linux/errno.h>
-#include <linux/interrupt.h>
-#include <linux/if_ether.h>
-#include <linux/netdevice.h>
-#include <linux/inetdevice.h>
-#include <linux/route.h>
-#include <linux/inet.h>
-#include <linux/etherdevice.h>
-#include <linux/if_arp.h>
-#include <linux/wireless.h>
-#include <linux/skbuff.h>
-#include <linux/udp.h>
-#include <net/sock.h>
-#include <net/inet_common.h>
-#include <linux/stat.h>
-#include <linux/init.h>
-#include <linux/if_ec.h>
-#include <net/udp.h>
-#include <net/ip.h>
-#include <linux/spinlock.h>
-#include <linux/rcupdate.h>
-#include <linux/bitops.h>
-#include <linux/mutex.h>
-
-#include <asm/uaccess.h>
-#include <asm/system.h>
-
-static const struct proto_ops econet_ops;
-static struct hlist_head econet_sklist;
-static DEFINE_RWLOCK(econet_lock);
-static DEFINE_MUTEX(econet_mutex);
-
-/* Since there are only 256 possible network numbers (or fewer, depends
- how you count) it makes sense to use a simple lookup table. */
-static struct net_device *net2dev_map[256];
-
-#define EC_PORT_IP 0xd2
-
-#ifdef CONFIG_ECONET_AUNUDP
-static DEFINE_SPINLOCK(aun_queue_lock);
-static struct socket *udpsock;
-#define AUN_PORT 0x8000
-
-
-struct aunhdr
-{
- unsigned char code; /* AUN magic protocol byte */
- unsigned char port;
- unsigned char cb;
- unsigned char pad;
- unsigned long handle;
-};
-
-static unsigned long aun_seq;
-
-/* Queue of packets waiting to be transmitted. */
-static struct sk_buff_head aun_queue;
-static struct timer_list ab_cleanup_timer;
-
-#endif /* CONFIG_ECONET_AUNUDP */
-
-/* Per-packet information */
-struct ec_cb
-{
- struct sockaddr_ec sec;
- unsigned long cookie; /* Supplied by user. */
-#ifdef CONFIG_ECONET_AUNUDP
- int done;
- unsigned long seq; /* Sequencing */
- unsigned long timeout; /* Timeout */
- unsigned long start; /* jiffies */
-#endif
-#ifdef CONFIG_ECONET_NATIVE
- void (*sent)(struct sk_buff *, int result);
-#endif
-};
-
-static void econet_remove_socket(struct hlist_head *list, struct sock *sk)
-{
- write_lock_bh(&econet_lock);
- sk_del_node_init(sk);
- write_unlock_bh(&econet_lock);
-}
-
-static void econet_insert_socket(struct hlist_head *list, struct sock *sk)
-{
- write_lock_bh(&econet_lock);
- sk_add_node(sk, list);
- write_unlock_bh(&econet_lock);
-}
-
-/*
- * Pull a packet from our receive queue and hand it to the user.
- * If necessary we block.
- */
-
-static int econet_recvmsg(struct kiocb *iocb, struct socket *sock,
- struct msghdr *msg, size_t len, int flags)
-{
- struct sock *sk = sock->sk;
- struct sk_buff *skb;
- size_t copied;
- int err;
-
- msg->msg_namelen = sizeof(struct sockaddr_ec);
-
- mutex_lock(&econet_mutex);
-
- /*
- * Call the generic datagram receiver. This handles all sorts
- * of horrible races and re-entrancy so we can forget about it
- * in the protocol layers.
- *
- * Now it will return ENETDOWN, if device have just gone down,
- * but then it will block.
- */
-
- skb=skb_recv_datagram(sk,flags,flags&MSG_DONTWAIT,&err);
-
- /*
- * An error occurred so return it. Because skb_recv_datagram()
- * handles the blocking we don't see and worry about blocking
- * retries.
- */
-
- if(skb==NULL)
- goto out;
-
- /*
- * You lose any data beyond the buffer you gave. If it worries a
- * user program they can ask the device for its MTU anyway.
- */
-
- copied = skb->len;
- if (copied > len)
- {
- copied=len;
- msg->msg_flags|=MSG_TRUNC;
- }
-
- /* We can't use skb_copy_datagram here */
- err = memcpy_toiovec(msg->msg_iov, skb->data, copied);
- if (err)
- goto out_free;
- skb_get_timestamp(skb, &sk->sk_stamp);
-
- if (msg->msg_name)
- memcpy(msg->msg_name, skb->cb, msg->msg_namelen);
-
- /*
- * Free or return the buffer as appropriate. Again this
- * hides all the races and re-entrancy issues from us.
- */
- err = copied;
-
-out_free:
- skb_free_datagram(sk, skb);
-out:
- mutex_unlock(&econet_mutex);
- return err;
-}
-
-/*
- * Bind an Econet socket.
- */
-
-static int econet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
-{
- struct sockaddr_ec *sec = (struct sockaddr_ec *)uaddr;
- struct sock *sk;
- struct econet_sock *eo;
-
- /*
- * Check legality
- */
-
- if (addr_len < sizeof(struct sockaddr_ec) ||
- sec->sec_family != AF_ECONET)
- return -EINVAL;
-
- mutex_lock(&econet_mutex);
-
- sk = sock->sk;
- eo = ec_sk(sk);
-
- eo->cb = sec->cb;
- eo->port = sec->port;
- eo->station = sec->addr.station;
- eo->net = sec->addr.net;
-
- mutex_unlock(&econet_mutex);
-
- return 0;
-}
-
-#if defined(CONFIG_ECONET_AUNUDP) || defined(CONFIG_ECONET_NATIVE)
-/*
- * Queue a transmit result for the user to be told about.
- */
-
-static void tx_result(struct sock *sk, unsigned long cookie, int result)
-{
- struct sk_buff *skb = alloc_skb(0, GFP_ATOMIC);
- struct ec_cb *eb;
- struct sockaddr_ec *sec;
-
- if (skb == NULL)
- {
- printk(KERN_DEBUG "ec: memory squeeze, transmit result dropped.\n");
- return;
- }
-
- eb = (struct ec_cb *)&skb->cb;
- sec = (struct sockaddr_ec *)&eb->sec;
- memset(sec, 0, sizeof(struct sockaddr_ec));
- sec->cookie = cookie;
- sec->type = ECTYPE_TRANSMIT_STATUS | result;
- sec->sec_family = AF_ECONET;
-
- if (sock_queue_rcv_skb(sk, skb) < 0)
- kfree_skb(skb);
-}
-#endif
-
-#ifdef CONFIG_ECONET_NATIVE
-/*
- * Called by the Econet hardware driver when a packet transmit
- * has completed. Tell the user.
- */
-
-static void ec_tx_done(struct sk_buff *skb, int result)
-{
- struct ec_cb *eb = (struct ec_cb *)&skb->cb;
- tx_result(skb->sk, eb->cookie, result);
-}
-#endif
-
-/*
- * Send a packet. We have to work out which device it's going out on
- * and hence whether to use real Econet or the UDP emulation.
- */
-
-static int econet_sendmsg(struct kiocb *iocb, struct socket *sock,
- struct msghdr *msg, size_t len)
-{
- struct sock *sk = sock->sk;
- struct sockaddr_ec *saddr=(struct sockaddr_ec *)msg->msg_name;
- struct net_device *dev;
- struct ec_addr addr;
- int err;
- unsigned char port, cb;
-#if defined(CONFIG_ECONET_AUNUDP) || defined(CONFIG_ECONET_NATIVE)
- struct sk_buff *skb;
- struct ec_cb *eb;
-#endif
-#ifdef CONFIG_ECONET_AUNUDP
- struct msghdr udpmsg;
- struct iovec iov[msg->msg_iovlen+1];
- struct aunhdr ah;
- struct sockaddr_in udpdest;
- __kernel_size_t size;
- int i;
- mm_segment_t oldfs;
-#endif
-
- /*
- * Check the flags.
- */
-
- if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_CMSG_COMPAT))
- return -EINVAL;
-
- /*
- * Get and verify the address.
- */
-
- mutex_lock(&econet_mutex);
-
- if (saddr == NULL) {
- struct econet_sock *eo = ec_sk(sk);
-
- addr.station = eo->station;
- addr.net = eo->net;
- port = eo->port;
- cb = eo->cb;
- } else {
- if (msg->msg_namelen < sizeof(struct sockaddr_ec)) {
- mutex_unlock(&econet_mutex);
- return -EINVAL;
- }
- addr.station = saddr->addr.station;
- addr.net = saddr->addr.net;
- port = saddr->port;
- cb = saddr->cb;
- }
-
- /* Look for a device with the right network number. */
- dev = net2dev_map[addr.net];
-
- /* If not directly reachable, use some default */
- if (dev == NULL) {
- dev = net2dev_map[0];
- /* No interfaces at all? */
- if (dev == NULL) {
- mutex_unlock(&econet_mutex);
- return -ENETDOWN;
- }
- }
-
- if (len + 15 > dev->mtu) {
- mutex_unlock(&econet_mutex);
- return -EMSGSIZE;
- }
-
- if (dev->type == ARPHRD_ECONET) {
- /* Real hardware Econet. We're not worthy etc. */
-#ifdef CONFIG_ECONET_NATIVE
- unsigned short proto = 0;
-
- dev_hold(dev);
-
- skb = sock_alloc_send_skb(sk, len+LL_RESERVED_SPACE(dev),
- msg->msg_flags & MSG_DONTWAIT, &err);
- if (skb==NULL)
- goto out_unlock;
-
- skb_reserve(skb, LL_RESERVED_SPACE(dev));
- skb->nh.raw = skb->data;
-
- eb = (struct ec_cb *)&skb->cb;
-
- /* BUG: saddr may be NULL */
- eb->cookie = saddr->cookie;
- eb->sec = *saddr;
- eb->sent = ec_tx_done;
-
- if (dev->hard_header) {
- int res;
- struct ec_framehdr *fh;
- err = -EINVAL;
- res = dev->hard_header(skb, dev, ntohs(proto),
- &addr, NULL, len);
- /* Poke in our control byte and
- port number. Hack, hack. */
- fh = (struct ec_framehdr *)(skb->data);
- fh->cb = cb;
- fh->port = port;
- if (sock->type != SOCK_DGRAM) {
- skb->tail = skb->data;
- skb->len = 0;
- } else if (res < 0)
- goto out_free;
- }
-
- /* Copy the data. Returns -EFAULT on error */
- err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
- skb->protocol = proto;
- skb->dev = dev;
- skb->priority = sk->sk_priority;
- if (err)
- goto out_free;
-
- err = -ENETDOWN;
- if (!(dev->flags & IFF_UP))
- goto out_free;
-
- /*
- * Now send it
- */
-
- dev_queue_xmit(skb);
- dev_put(dev);
- mutex_unlock(&econet_mutex);
- return(len);
-
- out_free:
- kfree_skb(skb);
- out_unlock:
- if (dev)
- dev_put(dev);
-#else
- err = -EPROTOTYPE;
-#endif
- mutex_unlock(&econet_mutex);
-
- return err;
- }
-
-#ifdef CONFIG_ECONET_AUNUDP
- /* AUN virtual Econet. */
-
- if (udpsock == NULL) {
- mutex_unlock(&econet_mutex);
- return -ENETDOWN; /* No socket - can't send */
- }
-
- /* Make up a UDP datagram and hand it off to some higher intellect. */
-
- memset(&udpdest, 0, sizeof(udpdest));
- udpdest.sin_family = AF_INET;
- udpdest.sin_port = htons(AUN_PORT);
-
- /* At the moment we use the stupid Acorn scheme of Econet address
- y.x maps to IP a.b.c.x. This should be replaced with something
- more flexible and more aware of subnet masks. */
- {
- struct in_device *idev;
- unsigned long network = 0;
-
- rcu_read_lock();
- idev = __in_dev_get_rcu(dev);
- if (idev) {
- if (idev->ifa_list)
- network = ntohl(idev->ifa_list->ifa_address) &
- 0xffffff00; /* !!! */
- }
- rcu_read_unlock();
- udpdest.sin_addr.s_addr = htonl(network | addr.station);
- }
-
- ah.port = port;
- ah.cb = cb & 0x7f;
- ah.code = 2; /* magic */
- ah.pad = 0;
-
- /* tack our header on the front of the iovec */
- size = sizeof(struct aunhdr);
- /*
- * XXX: that is b0rken. We can't mix userland and kernel pointers
- * in iovec, since on a lot of platforms copy_from_user() will
- * *not* work with the kernel and userland ones at the same time,
- * regardless of what we do with set_fs(). And we are talking about
- * econet-over-ethernet here, so "it's only ARM anyway" doesn't
- * apply. Any suggestions on fixing that code? -- AV
- */
- iov[0].iov_base = (void *)&ah;
- iov[0].iov_len = size;
- for (i = 0; i < msg->msg_iovlen; i++) {
- void __user *base = msg->msg_iov[i].iov_base;
- size_t len = msg->msg_iov[i].iov_len;
- /* Check it now since we switch to KERNEL_DS later. */
- if (!access_ok(VERIFY_READ, base, len)) {
- mutex_unlock(&econet_mutex);
- return -EFAULT;
- }
- iov[i+1].iov_base = base;
- iov[i+1].iov_len = len;
- size += len;
- }
-
- /* Get a skbuff (no data, just holds our cb information) */
- if ((skb = sock_alloc_send_skb(sk, 0,
- msg->msg_flags & MSG_DONTWAIT,
- &err)) == NULL) {
- mutex_unlock(&econet_mutex);
- return err;
- }
-
- eb = (struct ec_cb *)&skb->cb;
-
- eb->cookie = saddr->cookie;
- eb->timeout = (5*HZ);
- eb->start = jiffies;
- ah.handle = aun_seq;
- eb->seq = (aun_seq++);
- eb->sec = *saddr;
-
- skb_queue_tail(&aun_queue, skb);
-
- udpmsg.msg_name = (void *)&udpdest;
- udpmsg.msg_namelen = sizeof(udpdest);
- udpmsg.msg_iov = &iov[0];
- udpmsg.msg_iovlen = msg->msg_iovlen + 1;
- udpmsg.msg_control = NULL;
- udpmsg.msg_controllen = 0;
- udpmsg.msg_flags=0;
-
- oldfs = get_fs(); set_fs(KERNEL_DS); /* More privs :-) */
- err = sock_sendmsg(udpsock, &udpmsg, size);
- set_fs(oldfs);
-#else
- err = -EPROTOTYPE;
-#endif
- mutex_unlock(&econet_mutex);
-
- return err;
-}
-
-/*
- * Look up the address of a socket.
- */
-
-static int econet_getname(struct socket *sock, struct sockaddr *uaddr,
- int *uaddr_len, int peer)
-{
- struct sock *sk;
- struct econet_sock *eo;
- struct sockaddr_ec *sec = (struct sockaddr_ec *)uaddr;
-
- if (peer)
- return -EOPNOTSUPP;
-
- mutex_lock(&econet_mutex);
-
- sk = sock->sk;
- eo = ec_sk(sk);
-
- sec->sec_family = AF_ECONET;
- sec->port = eo->port;
- sec->addr.station = eo->station;
- sec->addr.net = eo->net;
-
- mutex_unlock(&econet_mutex);
-
- *uaddr_len = sizeof(*sec);
- return 0;
-}
-
-static void econet_destroy_timer(unsigned long data)
-{
- struct sock *sk=(struct sock *)data;
-
- if (!atomic_read(&sk->sk_wmem_alloc) &&
- !atomic_read(&sk->sk_rmem_alloc)) {
- sk_free(sk);
- return;
- }
-
- sk->sk_timer.expires = jiffies + 10 * HZ;
- add_timer(&sk->sk_timer);
- printk(KERN_DEBUG "econet socket destroy delayed\n");
-}
-
-/*
- * Close an econet socket.
- */
-
-static int econet_release(struct socket *sock)
-{
- struct sock *sk;
-
- mutex_lock(&econet_mutex);
-
- sk = sock->sk;
- if (!sk)
- goto out_unlock;
-
- econet_remove_socket(&econet_sklist, sk);
-
- /*
- * Now the socket is dead. No more input will appear.
- */
-
- sk->sk_state_change(sk); /* It is useless. Just for sanity. */
-
- sock->sk = NULL;
- sk->sk_socket = NULL;
- sock_set_flag(sk, SOCK_DEAD);
-
- /* Purge queues */
-
- skb_queue_purge(&sk->sk_receive_queue);
-
- if (atomic_read(&sk->sk_rmem_alloc) ||
- atomic_read(&sk->sk_wmem_alloc)) {
- sk->sk_timer.data = (unsigned long)sk;
- sk->sk_timer.expires = jiffies + HZ;
- sk->sk_timer.function = econet_destroy_timer;
- add_timer(&sk->sk_timer);
-
- goto out_unlock;
- }
-
- sk_free(sk);
-
-out_unlock:
- mutex_unlock(&econet_mutex);
- return 0;
-}
-
-static struct proto econet_proto = {
- .name = "ECONET",
- .owner = THIS_MODULE,
- .obj_size = sizeof(struct econet_sock),
-};
-
-/*
- * Create an Econet socket
- */
-
-static int econet_create(struct socket *sock, int protocol)
-{
- struct sock *sk;
- struct econet_sock *eo;
- int err;
-
- /* Econet only provides datagram services. */
- if (sock->type != SOCK_DGRAM)
- return -ESOCKTNOSUPPORT;
-
- sock->state = SS_UNCONNECTED;
-
- err = -ENOBUFS;
- sk = sk_alloc(PF_ECONET, GFP_KERNEL, &econet_proto, 1);
- if (sk == NULL)
- goto out;
-
- sk->sk_reuse = 1;
- sock->ops = &econet_ops;
- sock_init_data(sock, sk);
-
- eo = ec_sk(sk);
- sock_reset_flag(sk, SOCK_ZAPPED);
- sk->sk_family = PF_ECONET;
- eo->num = protocol;
-
- econet_insert_socket(&econet_sklist, sk);
- return(0);
-out:
- return err;
-}
-
-/*
- * Handle Econet specific ioctls
- */
-
-static int ec_dev_ioctl(struct socket *sock, unsigned int cmd, void __user *arg)
-{
- struct ifreq ifr;
- struct ec_device *edev;
- struct net_device *dev;
- struct sockaddr_ec *sec;
- int err;
-
- /*
- * Fetch the caller's info block into kernel space
- */
-
- if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
- return -EFAULT;
-
- if ((dev = dev_get_by_name(ifr.ifr_name)) == NULL)
- return -ENODEV;
-
- sec = (struct sockaddr_ec *)&ifr.ifr_addr;
-
- mutex_lock(&econet_mutex);
-
- err = 0;
- switch (cmd) {
- case SIOCSIFADDR:
- edev = dev->ec_ptr;
- if (edev == NULL) {
- /* Magic up a new one. */
- edev = kzalloc(sizeof(struct ec_device), GFP_KERNEL);
- if (edev == NULL) {
- err = -ENOMEM;
- break;
- }
- dev->ec_ptr = edev;
- } else
- net2dev_map[edev->net] = NULL;
- edev->station = sec->addr.station;
- edev->net = sec->addr.net;
- net2dev_map[sec->addr.net] = dev;
- if (!net2dev_map[0])
- net2dev_map[0] = dev;
- break;
-
- case SIOCGIFADDR:
- edev = dev->ec_ptr;
- if (edev == NULL) {
- err = -ENODEV;
- break;
- }
- memset(sec, 0, sizeof(struct sockaddr_ec));
- sec->addr.station = edev->station;
- sec->addr.net = edev->net;
- sec->sec_family = AF_ECONET;
- dev_put(dev);
- if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
- err = -EFAULT;
- break;
-
- default:
- err = -EINVAL;
- break;
- }
-
- mutex_unlock(&econet_mutex);
-
- dev_put(dev);
-
- return err;
-}
-
-/*
- * Handle generic ioctls
- */
-
-static int econet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
-{
- struct sock *sk = sock->sk;
- void __user *argp = (void __user *)arg;
-
- switch(cmd) {
- case SIOCGSTAMP:
- return sock_get_timestamp(sk, argp);
-
- case SIOCSIFADDR:
- case SIOCGIFADDR:
- return ec_dev_ioctl(sock, cmd, argp);
- break;
-
- default:
- return -ENOIOCTLCMD;
- }
- /*NOTREACHED*/
- return 0;
-}
-
-static struct net_proto_family econet_family_ops = {
- .family = PF_ECONET,
- .create = econet_create,
- .owner = THIS_MODULE,
-};
-
-static const struct proto_ops econet_ops = {
- .family = PF_ECONET,
- .owner = THIS_MODULE,
- .release = econet_release,
- .bind = econet_bind,
- .connect = sock_no_connect,
- .socketpair = sock_no_socketpair,
- .accept = sock_no_accept,
- .getname = econet_getname,
- .poll = datagram_poll,
- .ioctl = econet_ioctl,
- .listen = sock_no_listen,
- .shutdown = sock_no_shutdown,
- .setsockopt = sock_no_setsockopt,
- .getsockopt = sock_no_getsockopt,
- .sendmsg = econet_sendmsg,
- .recvmsg = econet_recvmsg,
- .mmap = sock_no_mmap,
- .sendpage = sock_no_sendpage,
-};
-
-#if defined(CONFIG_ECONET_AUNUDP) || defined(CONFIG_ECONET_NATIVE)
-/*
- * Find the listening socket, if any, for the given data.
- */
-
-static struct sock *ec_listening_socket(unsigned char port, unsigned char
- station, unsigned char net)
-{
- struct sock *sk;
- struct hlist_node *node;
-
- sk_for_each(sk, node, &econet_sklist) {
- struct econet_sock *opt = ec_sk(sk);
- if ((opt->port == port || opt->port == 0) &&
- (opt->station == station || opt->station == 0) &&
- (opt->net == net || opt->net == 0))
- goto found;
- }
- sk = NULL;
-found:
- return sk;
-}
-
-/*
- * Queue a received packet for a socket.
- */
-
-static int ec_queue_packet(struct sock *sk, struct sk_buff *skb,
- unsigned char stn, unsigned char net,
- unsigned char cb, unsigned char port)
-{
- struct ec_cb *eb = (struct ec_cb *)&skb->cb;
- struct sockaddr_ec *sec = (struct sockaddr_ec *)&eb->sec;
-
- memset(sec, 0, sizeof(struct sockaddr_ec));
- sec->sec_family = AF_ECONET;
- sec->type = ECTYPE_PACKET_RECEIVED;
- sec->port = port;
- sec->cb = cb;
- sec->addr.net = net;
- sec->addr.station = stn;
-
- return sock_queue_rcv_skb(sk, skb);
-}
-#endif
-
-#ifdef CONFIG_ECONET_AUNUDP
-/*
- * Send an AUN protocol response.
- */
-
-static void aun_send_response(__u32 addr, unsigned long seq, int code, int cb)
-{
- struct sockaddr_in sin = {
- .sin_family = AF_INET,
- .sin_port = htons(AUN_PORT),
- .sin_addr = {.s_addr = addr}
- };
- struct aunhdr ah = {.code = code, .cb = cb, .handle = seq};
- struct kvec iov = {.iov_base = (void *)&ah, .iov_len = sizeof(ah)};
- struct msghdr udpmsg;
-
- udpmsg.msg_name = (void *)&sin;
- udpmsg.msg_namelen = sizeof(sin);
- udpmsg.msg_control = NULL;
- udpmsg.msg_controllen = 0;
- udpmsg.msg_flags=0;
-
- kernel_sendmsg(udpsock, &udpmsg, &iov, 1, sizeof(ah));
-}
-
-
-/*
- * Handle incoming AUN packets. Work out if anybody wants them,
- * and send positive or negative acknowledgements as appropriate.
- */
-
-static void aun_incoming(struct sk_buff *skb, struct aunhdr *ah, size_t len)
-{
- struct iphdr *ip = skb->nh.iph;
- unsigned char stn = ntohl(ip->saddr) & 0xff;
- struct sock *sk;
- struct sk_buff *newskb;
- struct ec_device *edev = skb->dev->ec_ptr;
-
- if (! edev)
- goto bad;
-
- if ((sk = ec_listening_socket(ah->port, stn, edev->net)) == NULL)
- goto bad; /* Nobody wants it */
-
- newskb = alloc_skb((len - sizeof(struct aunhdr) + 15) & ~15,
- GFP_ATOMIC);
- if (newskb == NULL)
- {
- printk(KERN_DEBUG "AUN: memory squeeze, dropping packet.\n");
- /* Send nack and hope sender tries again */
- goto bad;
- }
-
- memcpy(skb_put(newskb, len - sizeof(struct aunhdr)), (void *)(ah+1),
- len - sizeof(struct aunhdr));
-
- if (ec_queue_packet(sk, newskb, stn, edev->net, ah->cb, ah->port))
- {
- /* Socket is bankrupt. */
- kfree_skb(newskb);
- goto bad;
- }
-
- aun_send_response(ip->saddr, ah->handle, 3, 0);
- return;
-
-bad:
- aun_send_response(ip->saddr, ah->handle, 4, 0);
-}
-
-/*
- * Handle incoming AUN transmit acknowledgements. If the sequence
- * number matches something in our backlog then kill it and tell
- * the user. If the remote took too long to reply then we may have
- * dropped the packet already.
- */
-
-static void aun_tx_ack(unsigned long seq, int result)
-{
- struct sk_buff *skb;
- unsigned long flags;
- struct ec_cb *eb;
-
- spin_lock_irqsave(&aun_queue_lock, flags);
- skb = skb_peek(&aun_queue);
- while (skb && skb != (struct sk_buff *)&aun_queue)
- {
- struct sk_buff *newskb = skb->next;
- eb = (struct ec_cb *)&skb->cb;
- if (eb->seq == seq)
- goto foundit;
-
- skb = newskb;
- }
- spin_unlock_irqrestore(&aun_queue_lock, flags);
- printk(KERN_DEBUG "AUN: unknown sequence %ld\n", seq);
- return;
-
-foundit:
- tx_result(skb->sk, eb->cookie, result);
- skb_unlink(skb, &aun_queue);
- spin_unlock_irqrestore(&aun_queue_lock, flags);
- kfree_skb(skb);
-}
-
-/*
- * Deal with received AUN frames - sort out what type of thing it is
- * and hand it to the right function.
- */
-
-static void aun_data_available(struct sock *sk, int slen)
-{
- int err;
- struct sk_buff *skb;
- unsigned char *data;
- struct aunhdr *ah;
- struct iphdr *ip;
- size_t len;
-
- while ((skb = skb_recv_datagram(sk, 0, 1, &err)) == NULL) {
- if (err == -EAGAIN) {
- printk(KERN_ERR "AUN: no data available?!");
- return;
- }
- printk(KERN_DEBUG "AUN: recvfrom() error %d\n", -err);
- }
-
- data = skb->h.raw + sizeof(struct udphdr);
- ah = (struct aunhdr *)data;
- len = skb->len - sizeof(struct udphdr);
- ip = skb->nh.iph;
-
- switch (ah->code)
- {
- case 2:
- aun_incoming(skb, ah, len);
- break;
- case 3:
- aun_tx_ack(ah->handle, ECTYPE_TRANSMIT_OK);
- break;
- case 4:
- aun_tx_ack(ah->handle, ECTYPE_TRANSMIT_NOT_LISTENING);
- break;
-#if 0
- /* This isn't quite right yet. */
- case 5:
- aun_send_response(ip->saddr, ah->handle, 6, ah->cb);
- break;
-#endif
- default:
- printk(KERN_DEBUG "unknown AUN packet (type %d)\n", data[0]);
- }
-
- skb_free_datagram(sk, skb);
-}
-
-/*
- * Called by the timer to manage the AUN transmit queue. If a packet
- * was sent to a dead or nonexistent host then we will never get an
- * acknowledgement back. After a few seconds we need to spot this and
- * drop the packet.
- */
-
-static void ab_cleanup(unsigned long h)
-{
- struct sk_buff *skb;
- unsigned long flags;
-
- spin_lock_irqsave(&aun_queue_lock, flags);
- skb = skb_peek(&aun_queue);
- while (skb && skb != (struct sk_buff *)&aun_queue)
- {
- struct sk_buff *newskb = skb->next;
- struct ec_cb *eb = (struct ec_cb *)&skb->cb;
- if ((jiffies - eb->start) > eb->timeout)
- {
- tx_result(skb->sk, eb->cookie,
- ECTYPE_TRANSMIT_NOT_PRESENT);
- skb_unlink(skb, &aun_queue);
- kfree_skb(skb);
- }
- skb = newskb;
- }
- spin_unlock_irqrestore(&aun_queue_lock, flags);
-
- mod_timer(&ab_cleanup_timer, jiffies + (HZ*2));
-}
-
-static int __init aun_udp_initialise(void)
-{
- int error;
- struct sockaddr_in sin;
-
- skb_queue_head_init(&aun_queue);
- spin_lock_init(&aun_queue_lock);
- init_timer(&ab_cleanup_timer);
- ab_cleanup_timer.expires = jiffies + (HZ*2);
- ab_cleanup_timer.function = ab_cleanup;
- add_timer(&ab_cleanup_timer);
-
- memset(&sin, 0, sizeof(sin));
- sin.sin_port = htons(AUN_PORT);
-
- /* We can count ourselves lucky Acorn machines are too dim to
- speak IPv6. :-) */
- if ((error = sock_create_kern(PF_INET, SOCK_DGRAM, 0, &udpsock)) < 0)
- {
- printk("AUN: socket error %d\n", -error);
- return error;
- }
-
- udpsock->sk->sk_reuse = 1;
- udpsock->sk->sk_allocation = GFP_ATOMIC; /* we're going to call it
- from interrupts */
-
- error = udpsock->ops->bind(udpsock, (struct sockaddr *)&sin,
- sizeof(sin));
- if (error < 0)
- {
- printk("AUN: bind error %d\n", -error);
- goto release;
- }
-
- udpsock->sk->sk_data_ready = aun_data_available;
-
- return 0;
-
-release:
- sock_release(udpsock);
- udpsock = NULL;
- return error;
-}
-#endif
-
-#ifdef CONFIG_ECONET_NATIVE
-
-/*
- * Receive an Econet frame from a device.
- */
-
-static int econet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
-{
- struct ec_framehdr *hdr;
- struct sock *sk;
- struct ec_device *edev = dev->ec_ptr;
-
- if (skb->pkt_type == PACKET_OTHERHOST)
- goto drop;
-
- if (!edev)
- goto drop;
-
- if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
- return NET_RX_DROP;
-
- if (!pskb_may_pull(skb, sizeof(struct ec_framehdr)))
- goto drop;
-
- hdr = (struct ec_framehdr *) skb->data;
-
- /* First check for encapsulated IP */
- if (hdr->port == EC_PORT_IP) {
- skb->protocol = htons(ETH_P_IP);
- skb_pull(skb, sizeof(struct ec_framehdr));
- netif_rx(skb);
- return 0;
- }
-
- sk = ec_listening_socket(hdr->port, hdr->src_stn, hdr->src_net);
- if (!sk)
- goto drop;
-
- if (ec_queue_packet(sk, skb, edev->net, hdr->src_stn, hdr->cb,
- hdr->port))
- goto drop;
-
- return 0;
-
-drop:
- kfree_skb(skb);
- return NET_RX_DROP;
-}
-
-static struct packet_type econet_packet_type = {
- .type = __constant_htons(ETH_P_ECONET),
- .func = econet_rcv,
-};
-
-static void econet_hw_initialise(void)
-{
- dev_add_pack(&econet_packet_type);
-}
-
-#endif
-
-static int econet_notifier(struct notifier_block *this, unsigned long msg, void *data)
-{
- struct net_device *dev = (struct net_device *)data;
- struct ec_device *edev;
-
- switch (msg) {
- case NETDEV_UNREGISTER:
- /* A device has gone down - kill any data we hold for it. */
- edev = dev->ec_ptr;
- if (edev)
- {
- if (net2dev_map[0] == dev)
- net2dev_map[0] = NULL;
- net2dev_map[edev->net] = NULL;
- kfree(edev);
- dev->ec_ptr = NULL;
- }
- break;
- }
-
- return NOTIFY_DONE;
-}
-
-static struct notifier_block econet_netdev_notifier = {
- .notifier_call =econet_notifier,
-};
-
-static void __exit econet_proto_exit(void)
-{
-#ifdef CONFIG_ECONET_AUNUDP
- del_timer(&ab_cleanup_timer);
- if (udpsock)
- sock_release(udpsock);
-#endif
- unregister_netdevice_notifier(&econet_netdev_notifier);
- sock_unregister(econet_family_ops.family);
- proto_unregister(&econet_proto);
-}
-
-static int __init econet_proto_init(void)
-{
- int err = proto_register(&econet_proto, 0);
-
- if (err != 0)
- goto out;
- sock_register(&econet_family_ops);
-#ifdef CONFIG_ECONET_AUNUDP
- spin_lock_init(&aun_queue_lock);
- aun_udp_initialise();
-#endif
-#ifdef CONFIG_ECONET_NATIVE
- econet_hw_initialise();
-#endif
- register_netdevice_notifier(&econet_netdev_notifier);
-out:
- return err;
-}
-
-module_init(econet_proto_init);
-module_exit(econet_proto_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_ALIAS_NETPROTO(PF_ECONET);
diff --git a/net/ethernet/Makefile b/net/ethernet/Makefile
index 7cef1d8ace27..e03eff94e0db 100644
--- a/net/ethernet/Makefile
+++ b/net/ethernet/Makefile
@@ -1,7 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# Makefile for the Linux Ethernet layer.
#
obj-y += eth.o
-obj-$(subst m,y,$(CONFIG_IPX)) += pe2.o
-obj-$(subst m,y,$(CONFIG_ATALK)) += pe2.o
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index 2d31bf3f05c5..13a63b48b7ee 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -12,14 +13,14 @@
* Mark Evans, <evansmp@uhura.aston.ac.uk>
* Florian La Roche, <rzsfl@rz.uni-sb.de>
* Alan Cox, <gw4pts@gw4pts.ampr.org>
- *
+ *
* Fixes:
* Mr Linux : Arp problems
* Alan Cox : Generic queue tidyup (very tiny here)
* Alan Cox : eth_header ntohs should be htons
* Alan Cox : eth_rebuild_header missing an htons and
* minor other things.
- * Tegge : Arp bug fixes.
+ * Tegge : Arp bug fixes.
* Florian : Removed many unnecessary functions, code cleanup
* and changes for new arp and skbuff.
* Alan Cox : Redid header building to reflect new format.
@@ -31,16 +32,10 @@
* older network drivers and IFF_ALLMULTI.
* Christer Weinigel : Better rebuild header message.
* Andrew Morton : 26Feb01: kill ether_setup() - use netdev_boot_setup().
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
-#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
@@ -48,20 +43,25 @@
#include <linux/inet.h>
#include <linux/ip.h>
#include <linux/netdevice.h>
+#include <linux/nvmem-consumer.h>
#include <linux/etherdevice.h>
#include <linux/skbuff.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/if_ether.h>
+#include <linux/of_net.h>
+#include <linux/pci.h>
+#include <linux/property.h>
#include <net/dst.h>
#include <net/arp.h>
#include <net/sock.h>
#include <net/ipv6.h>
#include <net/ip.h>
-#include <asm/uaccess.h>
-#include <asm/system.h>
-
-__setup("ether=", netdev_boot_setup);
+#include <net/dsa.h>
+#include <net/flow_dissector.h>
+#include <net/gro.h>
+#include <linux/uaccess.h>
+#include <net/pkt_sched.h>
/**
* eth_header - create the Ethernet header
@@ -73,15 +73,16 @@ __setup("ether=", netdev_boot_setup);
* @len: packet length (<= skb->len)
*
*
- * Set the protocol type. For a packet of type ETH_P_802_3 we put the length
- * in here instead. It is up to the 802.2 layer to carry protocol information.
+ * Set the protocol type. For a packet of type ETH_P_802_3/2 we put the length
+ * in here instead.
*/
-int eth_header(struct sk_buff *skb, struct net_device *dev, unsigned short type,
- void *daddr, void *saddr, unsigned len)
+int eth_header(struct sk_buff *skb, struct net_device *dev,
+ unsigned short type,
+ const void *daddr, const void *saddr, unsigned int len)
{
- struct ethhdr *eth = (struct ethhdr *)skb_push(skb, ETH_HLEN);
+ struct ethhdr *eth = skb_push(skb, ETH_HLEN);
- if (type != ETH_P_802_3)
+ if (type != ETH_P_802_3 && type != ETH_P_802_2)
eth->h_proto = htons(type);
else
eth->h_proto = htons(len);
@@ -92,10 +93,10 @@ int eth_header(struct sk_buff *skb, struct net_device *dev, unsigned short type,
if (!saddr)
saddr = dev->dev_addr;
- memcpy(eth->h_source, saddr, dev->addr_len);
+ memcpy(eth->h_source, saddr, ETH_ALEN);
if (daddr) {
- memcpy(eth->h_dest, daddr, dev->addr_len);
+ memcpy(eth->h_dest, daddr, ETH_ALEN);
return ETH_HLEN;
}
@@ -104,44 +105,43 @@ int eth_header(struct sk_buff *skb, struct net_device *dev, unsigned short type,
*/
if (dev->flags & (IFF_LOOPBACK | IFF_NOARP)) {
- memset(eth->h_dest, 0, dev->addr_len);
+ eth_zero_addr(eth->h_dest);
return ETH_HLEN;
}
return -ETH_HLEN;
}
+EXPORT_SYMBOL(eth_header);
/**
- * eth_rebuild_header- rebuild the Ethernet MAC header.
- * @skb: socket buffer to update
+ * eth_get_headlen - determine the length of header for an ethernet frame
+ * @dev: pointer to network device
+ * @data: pointer to start of frame
+ * @len: total length of frame
*
- * This is called after an ARP or IPV6 ndisc it's resolution on this
- * sk_buff. We now let protocol (ARP) fill in the other fields.
- *
- * This routine CANNOT use cached dst->neigh!
- * Really, it is used only when dst->neigh is wrong.
+ * Make a best effort attempt to pull the length for all of the headers for
+ * a given frame in a linear buffer.
*/
-int eth_rebuild_header(struct sk_buff *skb)
+u32 eth_get_headlen(const struct net_device *dev, const void *data, u32 len)
{
- struct ethhdr *eth = (struct ethhdr *)skb->data;
- struct net_device *dev = skb->dev;
-
- switch (eth->h_proto) {
-#ifdef CONFIG_INET
- case __constant_htons(ETH_P_IP):
- return arp_find(eth->h_dest, skb);
-#endif
- default:
- printk(KERN_DEBUG
- "%s: unable to resolve type %X addresses.\n",
- dev->name, (int)eth->h_proto);
-
- memcpy(eth->h_source, dev->dev_addr, dev->addr_len);
- break;
- }
-
- return 0;
+ const unsigned int flags = FLOW_DISSECTOR_F_PARSE_1ST_FRAG;
+ const struct ethhdr *eth = (const struct ethhdr *)data;
+ struct flow_keys_basic keys;
+
+ /* this should never happen, but better safe than sorry */
+ if (unlikely(len < sizeof(*eth)))
+ return len;
+
+ /* parse any remaining L2/L3 headers, check for L4 */
+ if (!skb_flow_dissect_flow_keys_basic(dev_net(dev), NULL, &keys, data,
+ eth->h_proto, sizeof(*eth),
+ len, flags))
+ return max_t(u32, keys.control.thoff, sizeof(*eth));
+
+ /* parse for any L4 headers */
+ return min_t(u32, __skb_get_poff(NULL, data, &keys, len), len);
}
+EXPORT_SYMBOL(eth_get_headlen);
/**
* eth_type_trans - determine the packet's protocol ID.
@@ -154,51 +154,42 @@ int eth_rebuild_header(struct sk_buff *skb)
*/
__be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev)
{
- struct ethhdr *eth;
- unsigned char *rawp;
+ const unsigned short *sap;
+ const struct ethhdr *eth;
+ __be16 res;
- skb->mac.raw = skb->data;
- skb_pull(skb, ETH_HLEN);
- eth = eth_hdr(skb);
+ skb->dev = dev;
+ skb_reset_mac_header(skb);
- if (is_multicast_ether_addr(eth->h_dest)) {
- if (!compare_ether_addr(eth->h_dest, dev->broadcast))
- skb->pkt_type = PACKET_BROADCAST;
- else
- skb->pkt_type = PACKET_MULTICAST;
- }
+ eth = eth_skb_pull_mac(skb);
+ eth_skb_pkt_type(skb, dev);
/*
- * This ALLMULTI check should be redundant by 1.4
- * so don't forget to remove it.
- *
- * Seems, you forgot to remove it. All silly devices
- * seems to set IFF_PROMISC.
+ * Some variants of DSA tagging don't have an ethertype field
+ * at all, so we check here whether one of those tagging
+ * variants has been configured on the receiving interface,
+ * and if so, set skb->protocol without looking at the packet.
*/
+ if (unlikely(netdev_uses_dsa(dev)))
+ return htons(ETH_P_XDSA);
- else if (1 /*dev->flags&IFF_PROMISC */ ) {
- if (unlikely(compare_ether_addr(eth->h_dest, dev->dev_addr)))
- skb->pkt_type = PACKET_OTHERHOST;
- }
-
- if (ntohs(eth->h_proto) >= 1536)
+ if (likely(eth_proto_is_802_3(eth->h_proto)))
return eth->h_proto;
- rawp = skb->data;
-
/*
* This is a magic hack to spot IPX packets. Older Novell breaks
* the protocol design and runs IPX over 802.3 without an 802.2 LLC
* layer. We look for FFFF which isn't a used 802.2 SSAP/DSAP. This
* won't work for fault tolerant netware but does for the rest.
+ * We use skb->dev as temporary storage to not hit
+ * CONFIG_STACKPROTECTOR_STRONG=y costs on some platforms.
*/
- if (*(unsigned short *)rawp == 0xFFFF)
- return htons(ETH_P_802_3);
+ sap = skb_header_pointer(skb, 0, sizeof(*sap), &skb->dev);
+ res = (sap && *sap == 0xFFFF) ? htons(ETH_P_802_3) : htons(ETH_P_802_2);
- /*
- * Real 802.2 LLC
- */
- return htons(ETH_P_802_2);
+ /* restore skb->dev in case it was mangled by skb_header_pointer(). */
+ skb->dev = dev;
+ return res;
}
EXPORT_SYMBOL(eth_type_trans);
@@ -207,37 +198,45 @@ EXPORT_SYMBOL(eth_type_trans);
* @skb: packet to extract header from
* @haddr: destination buffer
*/
-static int eth_header_parse(struct sk_buff *skb, unsigned char *haddr)
+int eth_header_parse(const struct sk_buff *skb, unsigned char *haddr)
{
- struct ethhdr *eth = eth_hdr(skb);
+ const struct ethhdr *eth = eth_hdr(skb);
memcpy(haddr, eth->h_source, ETH_ALEN);
return ETH_ALEN;
}
+EXPORT_SYMBOL(eth_header_parse);
/**
* eth_header_cache - fill cache entry from neighbour
* @neigh: source neighbour
* @hh: destination cache entry
+ * @type: Ethernet type field
+ *
* Create an Ethernet header template from the neighbour.
*/
-int eth_header_cache(struct neighbour *neigh, struct hh_cache *hh)
+int eth_header_cache(const struct neighbour *neigh, struct hh_cache *hh, __be16 type)
{
- __be16 type = hh->hh_type;
struct ethhdr *eth;
- struct net_device *dev = neigh->dev;
+ const struct net_device *dev = neigh->dev;
eth = (struct ethhdr *)
(((u8 *) hh->hh_data) + (HH_DATA_OFF(sizeof(*eth))));
- if (type == __constant_htons(ETH_P_802_3))
+ if (type == htons(ETH_P_802_3))
return -1;
eth->h_proto = type;
- memcpy(eth->h_source, dev->dev_addr, dev->addr_len);
- memcpy(eth->h_dest, neigh->ha, dev->addr_len);
- hh->hh_len = ETH_HLEN;
+ memcpy(eth->h_source, dev->dev_addr, ETH_ALEN);
+ memcpy(eth->h_dest, neigh->ha, ETH_ALEN);
+
+ /* Pairs with READ_ONCE() in neigh_resolve_output(),
+ * neigh_hh_output() and neigh_update_hhs().
+ */
+ smp_store_release(&hh->hh_len, ETH_HLEN);
+
return 0;
}
+EXPORT_SYMBOL(eth_header_cache);
/**
* eth_header_cache_update - update cache entry
@@ -247,78 +246,127 @@ int eth_header_cache(struct neighbour *neigh, struct hh_cache *hh)
*
* Called by Address Resolution module to notify changes in address.
*/
-void eth_header_cache_update(struct hh_cache *hh, struct net_device *dev,
- unsigned char *haddr)
+void eth_header_cache_update(struct hh_cache *hh,
+ const struct net_device *dev,
+ const unsigned char *haddr)
{
memcpy(((u8 *) hh->hh_data) + HH_DATA_OFF(sizeof(struct ethhdr)),
- haddr, dev->addr_len);
+ haddr, ETH_ALEN);
}
+EXPORT_SYMBOL(eth_header_cache_update);
/**
- * eth_mac_addr - set new Ethernet hardware address
+ * eth_header_parse_protocol - extract protocol from L2 header
+ * @skb: packet to extract protocol from
+ */
+__be16 eth_header_parse_protocol(const struct sk_buff *skb)
+{
+ const struct ethhdr *eth = eth_hdr(skb);
+
+ return eth->h_proto;
+}
+EXPORT_SYMBOL(eth_header_parse_protocol);
+
+/**
+ * eth_prepare_mac_addr_change - prepare for mac change
* @dev: network device
* @p: socket address
- * Change hardware address of device.
- *
- * This doesn't change hardware matching, so needs to be overridden
- * for most real devices.
*/
-static int eth_mac_addr(struct net_device *dev, void *p)
+int eth_prepare_mac_addr_change(struct net_device *dev, void *p)
{
struct sockaddr *addr = p;
- if (netif_running(dev))
+
+ if (!(dev->priv_flags & IFF_LIVE_ADDR_CHANGE) && netif_running(dev))
return -EBUSY;
- memcpy(dev->dev_addr, addr->sa_data, dev->addr_len);
+ if (!is_valid_ether_addr(addr->sa_data))
+ return -EADDRNOTAVAIL;
return 0;
}
+EXPORT_SYMBOL(eth_prepare_mac_addr_change);
+
+/**
+ * eth_commit_mac_addr_change - commit mac change
+ * @dev: network device
+ * @p: socket address
+ */
+void eth_commit_mac_addr_change(struct net_device *dev, void *p)
+{
+ struct sockaddr *addr = p;
+
+ eth_hw_addr_set(dev, addr->sa_data);
+}
+EXPORT_SYMBOL(eth_commit_mac_addr_change);
/**
- * eth_change_mtu - set new MTU size
+ * eth_mac_addr - set new Ethernet hardware address
* @dev: network device
- * @new_mtu: new Maximum Transfer Unit
+ * @p: socket address
*
- * Allow changing MTU size. Needs to be overridden for devices
- * supporting jumbo frames.
+ * Change hardware address of device.
+ *
+ * This doesn't change hardware matching, so needs to be overridden
+ * for most real devices.
*/
-static int eth_change_mtu(struct net_device *dev, int new_mtu)
+int eth_mac_addr(struct net_device *dev, void *p)
{
- if (new_mtu < 68 || new_mtu > ETH_DATA_LEN)
- return -EINVAL;
- dev->mtu = new_mtu;
+ int ret;
+
+ ret = eth_prepare_mac_addr_change(dev, p);
+ if (ret < 0)
+ return ret;
+ eth_commit_mac_addr_change(dev, p);
return 0;
}
+EXPORT_SYMBOL(eth_mac_addr);
+
+int eth_validate_addr(struct net_device *dev)
+{
+ if (!is_valid_ether_addr(dev->dev_addr))
+ return -EADDRNOTAVAIL;
+
+ return 0;
+}
+EXPORT_SYMBOL(eth_validate_addr);
+
+const struct header_ops eth_header_ops ____cacheline_aligned = {
+ .create = eth_header,
+ .parse = eth_header_parse,
+ .cache = eth_header_cache,
+ .cache_update = eth_header_cache_update,
+ .parse_protocol = eth_header_parse_protocol,
+};
/**
* ether_setup - setup Ethernet network device
* @dev: network device
+ *
* Fill in the fields of the device structure with Ethernet-generic values.
*/
void ether_setup(struct net_device *dev)
{
- dev->change_mtu = eth_change_mtu;
- dev->hard_header = eth_header;
- dev->rebuild_header = eth_rebuild_header;
- dev->set_mac_address = eth_mac_addr;
- dev->hard_header_cache = eth_header_cache;
- dev->header_cache_update= eth_header_cache_update;
- dev->hard_header_parse = eth_header_parse;
-
+ dev->header_ops = &eth_header_ops;
dev->type = ARPHRD_ETHER;
dev->hard_header_len = ETH_HLEN;
+ dev->min_header_len = ETH_HLEN;
dev->mtu = ETH_DATA_LEN;
+ dev->min_mtu = ETH_MIN_MTU;
+ dev->max_mtu = ETH_DATA_LEN;
dev->addr_len = ETH_ALEN;
- dev->tx_queue_len = 1000; /* Ethernet wants good queues */
+ dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
dev->flags = IFF_BROADCAST|IFF_MULTICAST;
-
- memset(dev->broadcast, 0xFF, ETH_ALEN);
+ dev->priv_flags |= IFF_TX_SKB_SHARING;
+
+ eth_broadcast_addr(dev->broadcast);
}
EXPORT_SYMBOL(ether_setup);
/**
- * alloc_etherdev - Allocates and sets up an Ethernet device
+ * alloc_etherdev_mqs - Allocates and sets up an Ethernet device
* @sizeof_priv: Size of additional driver-private structure to be allocated
* for this Ethernet device
+ * @txqs: The number of TX queues this device has.
+ * @rxqs: The number of RX queues this device has.
*
* Fill in the fields of the device structure with Ethernet-generic
* values. Basically does everything except registering the device.
@@ -328,8 +376,266 @@ EXPORT_SYMBOL(ether_setup);
* this private data area.
*/
-struct net_device *alloc_etherdev(int sizeof_priv)
+struct net_device *alloc_etherdev_mqs(int sizeof_priv, unsigned int txqs,
+ unsigned int rxqs)
+{
+ return alloc_netdev_mqs(sizeof_priv, "eth%d", NET_NAME_ENUM,
+ ether_setup, txqs, rxqs);
+}
+EXPORT_SYMBOL(alloc_etherdev_mqs);
+
+ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len)
+{
+ return sysfs_emit(buf, "%*phC\n", len, addr);
+}
+EXPORT_SYMBOL(sysfs_format_mac);
+
+struct sk_buff *eth_gro_receive(struct list_head *head, struct sk_buff *skb)
+{
+ const struct packet_offload *ptype;
+ unsigned int hlen, off_eth;
+ struct sk_buff *pp = NULL;
+ struct ethhdr *eh, *eh2;
+ struct sk_buff *p;
+ __be16 type;
+ int flush = 1;
+
+ off_eth = skb_gro_offset(skb);
+ hlen = off_eth + sizeof(*eh);
+ eh = skb_gro_header(skb, hlen, off_eth);
+ if (unlikely(!eh))
+ goto out;
+
+ flush = 0;
+
+ list_for_each_entry(p, head, list) {
+ if (!NAPI_GRO_CB(p)->same_flow)
+ continue;
+
+ eh2 = (struct ethhdr *)(p->data + off_eth);
+ if (compare_ether_header(eh, eh2)) {
+ NAPI_GRO_CB(p)->same_flow = 0;
+ continue;
+ }
+ }
+
+ type = eh->h_proto;
+
+ ptype = gro_find_receive_by_type(type);
+ if (ptype == NULL) {
+ flush = 1;
+ goto out;
+ }
+
+ skb_gro_pull(skb, sizeof(*eh));
+ skb_gro_postpull_rcsum(skb, eh, sizeof(*eh));
+
+ pp = indirect_call_gro_receive_inet(ptype->callbacks.gro_receive,
+ ipv6_gro_receive, inet_gro_receive,
+ head, skb);
+
+out:
+ skb_gro_flush_final(skb, pp, flush);
+
+ return pp;
+}
+EXPORT_SYMBOL(eth_gro_receive);
+
+int eth_gro_complete(struct sk_buff *skb, int nhoff)
+{
+ struct ethhdr *eh = (struct ethhdr *)(skb->data + nhoff);
+ __be16 type = eh->h_proto;
+ struct packet_offload *ptype;
+ int err = -ENOSYS;
+
+ if (skb->encapsulation)
+ skb_set_inner_mac_header(skb, nhoff);
+
+ ptype = gro_find_complete_by_type(type);
+ if (ptype != NULL)
+ err = INDIRECT_CALL_INET(ptype->callbacks.gro_complete,
+ ipv6_gro_complete, inet_gro_complete,
+ skb, nhoff + sizeof(*eh));
+
+ return err;
+}
+EXPORT_SYMBOL(eth_gro_complete);
+
+static struct packet_offload eth_packet_offload __read_mostly = {
+ .type = cpu_to_be16(ETH_P_TEB),
+ .priority = 10,
+ .callbacks = {
+ .gro_receive = eth_gro_receive,
+ .gro_complete = eth_gro_complete,
+ },
+};
+
+static int __init eth_offload_init(void)
+{
+ dev_add_offload(&eth_packet_offload);
+
+ return 0;
+}
+
+fs_initcall(eth_offload_init);
+
+unsigned char * __weak arch_get_platform_mac_address(void)
+{
+ return NULL;
+}
+
+int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr)
+{
+ unsigned char *addr;
+ int ret;
+
+ ret = of_get_mac_address(dev->of_node, mac_addr);
+ if (!ret)
+ return 0;
+
+ addr = arch_get_platform_mac_address();
+ if (!addr)
+ return -ENODEV;
+
+ ether_addr_copy(mac_addr, addr);
+
+ return 0;
+}
+EXPORT_SYMBOL(eth_platform_get_mac_address);
+
+/**
+ * platform_get_ethdev_address - Set netdev's MAC address from a given device
+ * @dev: Pointer to the device
+ * @netdev: Pointer to netdev to write the address to
+ *
+ * Wrapper around eth_platform_get_mac_address() which writes the address
+ * directly to netdev->dev_addr.
+ */
+int platform_get_ethdev_address(struct device *dev, struct net_device *netdev)
+{
+ u8 addr[ETH_ALEN] __aligned(2);
+ int ret;
+
+ ret = eth_platform_get_mac_address(dev, addr);
+ if (!ret)
+ eth_hw_addr_set(netdev, addr);
+ return ret;
+}
+EXPORT_SYMBOL(platform_get_ethdev_address);
+
+/**
+ * nvmem_get_mac_address - Obtain the MAC address from an nvmem cell named
+ * 'mac-address' associated with given device.
+ *
+ * @dev: Device with which the mac-address cell is associated.
+ * @addrbuf: Buffer to which the MAC address will be copied on success.
+ *
+ * Returns 0 on success or a negative error number on failure.
+ */
+int nvmem_get_mac_address(struct device *dev, void *addrbuf)
+{
+ struct nvmem_cell *cell;
+ const void *mac;
+ size_t len;
+
+ cell = nvmem_cell_get(dev, "mac-address");
+ if (IS_ERR(cell))
+ return PTR_ERR(cell);
+
+ mac = nvmem_cell_read(cell, &len);
+ nvmem_cell_put(cell);
+
+ if (IS_ERR(mac))
+ return PTR_ERR(mac);
+
+ if (len != ETH_ALEN || !is_valid_ether_addr(mac)) {
+ kfree(mac);
+ return -EINVAL;
+ }
+
+ ether_addr_copy(addrbuf, mac);
+ kfree(mac);
+
+ return 0;
+}
+
+static int fwnode_get_mac_addr(struct fwnode_handle *fwnode,
+ const char *name, char *addr)
+{
+ int ret;
+
+ ret = fwnode_property_read_u8_array(fwnode, name, addr, ETH_ALEN);
+ if (ret)
+ return ret;
+
+ if (!is_valid_ether_addr(addr))
+ return -EINVAL;
+ return 0;
+}
+
+/**
+ * fwnode_get_mac_address - Get the MAC from the firmware node
+ * @fwnode: Pointer to the firmware node
+ * @addr: Address of buffer to store the MAC in
+ *
+ * Search the firmware node for the best MAC address to use. 'mac-address' is
+ * checked first, because that is supposed to contain to "most recent" MAC
+ * address. If that isn't set, then 'local-mac-address' is checked next,
+ * because that is the default address. If that isn't set, then the obsolete
+ * 'address' is checked, just in case we're using an old device tree.
+ *
+ * Note that the 'address' property is supposed to contain a virtual address of
+ * the register set, but some DTS files have redefined that property to be the
+ * MAC address.
+ *
+ * All-zero MAC addresses are rejected, because those could be properties that
+ * exist in the firmware tables, but were not updated by the firmware. For
+ * example, the DTS could define 'mac-address' and 'local-mac-address', with
+ * zero MAC addresses. Some older U-Boots only initialized 'local-mac-address'.
+ * In this case, the real MAC is in 'local-mac-address', and 'mac-address'
+ * exists but is all zeros.
+ */
+int fwnode_get_mac_address(struct fwnode_handle *fwnode, char *addr)
+{
+ if (!fwnode_get_mac_addr(fwnode, "mac-address", addr) ||
+ !fwnode_get_mac_addr(fwnode, "local-mac-address", addr) ||
+ !fwnode_get_mac_addr(fwnode, "address", addr))
+ return 0;
+
+ return -ENOENT;
+}
+EXPORT_SYMBOL(fwnode_get_mac_address);
+
+/**
+ * device_get_mac_address - Get the MAC for a given device
+ * @dev: Pointer to the device
+ * @addr: Address of buffer to store the MAC in
+ */
+int device_get_mac_address(struct device *dev, char *addr)
+{
+ if (!fwnode_get_mac_address(dev_fwnode(dev), addr))
+ return 0;
+
+ return nvmem_get_mac_address(dev, addr);
+}
+EXPORT_SYMBOL(device_get_mac_address);
+
+/**
+ * device_get_ethdev_address - Set netdev's MAC address from a given device
+ * @dev: Pointer to the device
+ * @netdev: Pointer to netdev to write the address to
+ *
+ * Wrapper around device_get_mac_address() which writes the address
+ * directly to netdev->dev_addr.
+ */
+int device_get_ethdev_address(struct device *dev, struct net_device *netdev)
{
- return alloc_netdev(sizeof_priv, "eth%d", ether_setup);
+ u8 addr[ETH_ALEN];
+ int ret;
+
+ ret = device_get_mac_address(dev, addr);
+ if (!ret)
+ eth_hw_addr_set(netdev, addr);
+ return ret;
}
-EXPORT_SYMBOL(alloc_etherdev);
+EXPORT_SYMBOL(device_get_ethdev_address);
diff --git a/net/ethernet/pe2.c b/net/ethernet/pe2.c
deleted file mode 100644
index 9d57b4fb6440..000000000000
--- a/net/ethernet/pe2.c
+++ /dev/null
@@ -1,39 +0,0 @@
-#include <linux/in.h>
-#include <linux/mm.h>
-#include <linux/module.h>
-#include <linux/netdevice.h>
-#include <linux/skbuff.h>
-
-#include <net/datalink.h>
-
-static int pEII_request(struct datalink_proto *dl,
- struct sk_buff *skb, unsigned char *dest_node)
-{
- struct net_device *dev = skb->dev;
-
- skb->protocol = htons(ETH_P_IPX);
- if (dev->hard_header)
- dev->hard_header(skb, dev, ETH_P_IPX,
- dest_node, NULL, skb->len);
- return dev_queue_xmit(skb);
-}
-
-struct datalink_proto *make_EII_client(void)
-{
- struct datalink_proto *proto = kmalloc(sizeof(*proto), GFP_ATOMIC);
-
- if (proto) {
- proto->header_length = 0;
- proto->request = pEII_request;
- }
-
- return proto;
-}
-
-void destroy_EII_client(struct datalink_proto *dl)
-{
- kfree(dl);
-}
-
-EXPORT_SYMBOL(destroy_EII_client);
-EXPORT_SYMBOL(make_EII_client);
diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile
new file mode 100644
index 000000000000..629c10916670
--- /dev/null
+++ b/net/ethtool/Makefile
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+obj-y += ioctl.o common.o
+
+obj-$(CONFIG_ETHTOOL_NETLINK) += ethtool_nl.o
+
+ethtool_nl-y := netlink.o bitset.o strset.o linkinfo.o linkmodes.o rss.o \
+ linkstate.o debug.o wol.o features.o privflags.o rings.o \
+ channels.o coalesce.o pause.o eee.o tsinfo.o cabletest.o \
+ tunnels.o fec.o eeprom.o stats.o phc_vclocks.o mm.o \
+ module.o cmis_fw_update.o cmis_cdb.o pse-pd.o plca.o \
+ phy.o tsconfig.o mse.o
diff --git a/net/ethtool/bitset.c b/net/ethtool/bitset.c
new file mode 100644
index 000000000000..f0883357d12e
--- /dev/null
+++ b/net/ethtool/bitset.c
@@ -0,0 +1,873 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/ethtool_netlink.h>
+#include <linux/bitmap.h>
+#include "netlink.h"
+#include "bitset.h"
+
+/* Some bitmaps are internally represented as an array of unsigned long, some
+ * as an array of u32 (some even as single u32 for now). To avoid the need of
+ * wrappers on caller side, we provide two set of functions: those with "32"
+ * suffix in their names expect u32 based bitmaps, those without it expect
+ * unsigned long bitmaps.
+ */
+
+static u32 ethnl_lower_bits(unsigned int n)
+{
+ return ~(u32)0 >> (32 - n % 32);
+}
+
+static u32 ethnl_upper_bits(unsigned int n)
+{
+ return ~(u32)0 << (n % 32);
+}
+
+/**
+ * ethnl_bitmap32_clear() - Clear u32 based bitmap
+ * @dst: bitmap to clear
+ * @start: beginning of the interval
+ * @end: end of the interval
+ * @mod: set if bitmap was modified
+ *
+ * Clear @nbits bits of a bitmap with indices @start <= i < @end
+ */
+static void ethnl_bitmap32_clear(u32 *dst, unsigned int start, unsigned int end,
+ bool *mod)
+{
+ unsigned int start_word = start / 32;
+ unsigned int end_word = end / 32;
+ unsigned int i;
+ u32 mask;
+
+ if (end <= start)
+ return;
+
+ if (start % 32) {
+ mask = ethnl_upper_bits(start);
+ if (end_word == start_word) {
+ mask &= ethnl_lower_bits(end);
+ if (dst[start_word] & mask) {
+ dst[start_word] &= ~mask;
+ *mod = true;
+ }
+ return;
+ }
+ if (dst[start_word] & mask) {
+ dst[start_word] &= ~mask;
+ *mod = true;
+ }
+ start_word++;
+ }
+
+ for (i = start_word; i < end_word; i++) {
+ if (dst[i]) {
+ dst[i] = 0;
+ *mod = true;
+ }
+ }
+ if (end % 32) {
+ mask = ethnl_lower_bits(end);
+ if (dst[end_word] & mask) {
+ dst[end_word] &= ~mask;
+ *mod = true;
+ }
+ }
+}
+
+/**
+ * ethnl_bitmap32_not_zero() - Check if any bit is set in an interval
+ * @map: bitmap to test
+ * @start: beginning of the interval
+ * @end: end of the interval
+ *
+ * Return: true if there is non-zero bit with index @start <= i < @end,
+ * false if the whole interval is zero
+ */
+static bool ethnl_bitmap32_not_zero(const u32 *map, unsigned int start,
+ unsigned int end)
+{
+ unsigned int start_word = start / 32;
+ unsigned int end_word = end / 32;
+ u32 mask;
+
+ if (end <= start)
+ return true;
+
+ if (start % 32) {
+ mask = ethnl_upper_bits(start);
+ if (end_word == start_word) {
+ mask &= ethnl_lower_bits(end);
+ return map[start_word] & mask;
+ }
+ if (map[start_word] & mask)
+ return true;
+ start_word++;
+ }
+
+ if (!memchr_inv(map + start_word, '\0',
+ (end_word - start_word) * sizeof(u32)))
+ return true;
+ if (end % 32 == 0)
+ return true;
+ return map[end_word] & ethnl_lower_bits(end);
+}
+
+/**
+ * ethnl_bitmap32_update() - Modify u32 based bitmap according to value/mask
+ * pair
+ * @dst: bitmap to update
+ * @nbits: bit size of the bitmap
+ * @value: values to set
+ * @mask: mask of bits to set
+ * @mod: set to true if bitmap is modified, preserve if not
+ *
+ * Set bits in @dst bitmap which are set in @mask to values from @value, leave
+ * the rest untouched. If destination bitmap was modified, set @mod to true,
+ * leave as it is if not.
+ */
+static void ethnl_bitmap32_update(u32 *dst, unsigned int nbits,
+ const u32 *value, const u32 *mask, bool *mod)
+{
+ while (nbits > 0) {
+ u32 real_mask = mask ? *mask : ~(u32)0;
+ u32 new_value;
+
+ if (nbits < 32)
+ real_mask &= ethnl_lower_bits(nbits);
+ new_value = (*dst & ~real_mask) | (*value & real_mask);
+ if (new_value != *dst) {
+ *dst = new_value;
+ *mod = true;
+ }
+
+ if (nbits <= 32)
+ break;
+ dst++;
+ nbits -= 32;
+ value++;
+ if (mask)
+ mask++;
+ }
+}
+
+static bool ethnl_bitmap32_test_bit(const u32 *map, unsigned int index)
+{
+ return map[index / 32] & (1U << (index % 32));
+}
+
+/**
+ * ethnl_bitset32_size() - Calculate size of bitset nested attribute
+ * @val: value bitmap (u32 based)
+ * @mask: mask bitmap (u32 based, optional)
+ * @nbits: bit length of the bitset
+ * @names: array of bit names (optional)
+ * @compact: assume compact format for output
+ *
+ * Estimate length of netlink attribute composed by a later call to
+ * ethnl_put_bitset32() call with the same arguments.
+ *
+ * Return: negative error code or attribute length estimate
+ */
+int ethnl_bitset32_size(const u32 *val, const u32 *mask, unsigned int nbits,
+ ethnl_string_array_t names, bool compact)
+{
+ unsigned int len = 0;
+
+ /* list flag */
+ if (!mask)
+ len += nla_total_size(sizeof(u32));
+ /* size */
+ len += nla_total_size(sizeof(u32));
+
+ if (compact) {
+ unsigned int nwords = DIV_ROUND_UP(nbits, 32);
+
+ /* value, mask */
+ len += (mask ? 2 : 1) * nla_total_size(nwords * sizeof(u32));
+ } else {
+ unsigned int bits_len = 0;
+ unsigned int bit_len, i;
+
+ for (i = 0; i < nbits; i++) {
+ const char *name = names ? names[i] : NULL;
+
+ if (!ethnl_bitmap32_test_bit(mask ?: val, i))
+ continue;
+ /* index */
+ bit_len = nla_total_size(sizeof(u32));
+ /* name */
+ if (name)
+ bit_len += ethnl_strz_size(name);
+ /* value */
+ if (mask && ethnl_bitmap32_test_bit(val, i))
+ bit_len += nla_total_size(0);
+
+ /* bit nest */
+ bits_len += nla_total_size(bit_len);
+ }
+ /* bits nest */
+ len += nla_total_size(bits_len);
+ }
+
+ /* outermost nest */
+ return nla_total_size(len);
+}
+
+/**
+ * ethnl_put_bitset32() - Put a bitset nest into a message
+ * @skb: skb with the message
+ * @attrtype: attribute type for the bitset nest
+ * @val: value bitmap (u32 based)
+ * @mask: mask bitmap (u32 based, optional)
+ * @nbits: bit length of the bitset
+ * @names: array of bit names (optional)
+ * @compact: use compact format for the output
+ *
+ * Compose a nested attribute representing a bitset. If @mask is null, simple
+ * bitmap (bit list) is created, if @mask is provided, represent a value/mask
+ * pair. Bit names are only used in verbose mode and when provided by calller.
+ *
+ * Return: 0 on success, negative error value on error
+ */
+int ethnl_put_bitset32(struct sk_buff *skb, int attrtype, const u32 *val,
+ const u32 *mask, unsigned int nbits,
+ ethnl_string_array_t names, bool compact)
+{
+ struct nlattr *nest;
+ struct nlattr *attr;
+
+ nest = nla_nest_start(skb, attrtype);
+ if (!nest)
+ return -EMSGSIZE;
+
+ if (!mask && nla_put_flag(skb, ETHTOOL_A_BITSET_NOMASK))
+ goto nla_put_failure;
+ if (nla_put_u32(skb, ETHTOOL_A_BITSET_SIZE, nbits))
+ goto nla_put_failure;
+ if (compact) {
+ unsigned int nwords = DIV_ROUND_UP(nbits, 32);
+ unsigned int nbytes = nwords * sizeof(u32);
+ u32 *dst;
+
+ attr = nla_reserve(skb, ETHTOOL_A_BITSET_VALUE, nbytes);
+ if (!attr)
+ goto nla_put_failure;
+ dst = nla_data(attr);
+ memcpy(dst, val, nbytes);
+ if (nbits % 32)
+ dst[nwords - 1] &= ethnl_lower_bits(nbits);
+
+ if (mask) {
+ attr = nla_reserve(skb, ETHTOOL_A_BITSET_MASK, nbytes);
+ if (!attr)
+ goto nla_put_failure;
+ dst = nla_data(attr);
+ memcpy(dst, mask, nbytes);
+ if (nbits % 32)
+ dst[nwords - 1] &= ethnl_lower_bits(nbits);
+ }
+ } else {
+ struct nlattr *bits;
+ unsigned int i;
+
+ bits = nla_nest_start(skb, ETHTOOL_A_BITSET_BITS);
+ if (!bits)
+ goto nla_put_failure;
+ for (i = 0; i < nbits; i++) {
+ const char *name = names ? names[i] : NULL;
+
+ if (!ethnl_bitmap32_test_bit(mask ?: val, i))
+ continue;
+ attr = nla_nest_start(skb, ETHTOOL_A_BITSET_BITS_BIT);
+ if (!attr)
+ goto nla_put_failure;
+ if (nla_put_u32(skb, ETHTOOL_A_BITSET_BIT_INDEX, i))
+ goto nla_put_failure;
+ if (name &&
+ ethnl_put_strz(skb, ETHTOOL_A_BITSET_BIT_NAME, name))
+ goto nla_put_failure;
+ if (mask && ethnl_bitmap32_test_bit(val, i) &&
+ nla_put_flag(skb, ETHTOOL_A_BITSET_BIT_VALUE))
+ goto nla_put_failure;
+ nla_nest_end(skb, attr);
+ }
+ nla_nest_end(skb, bits);
+ }
+
+ nla_nest_end(skb, nest);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+}
+
+static const struct nla_policy bitset_policy[] = {
+ [ETHTOOL_A_BITSET_NOMASK] = { .type = NLA_FLAG },
+ [ETHTOOL_A_BITSET_SIZE] = NLA_POLICY_MAX(NLA_U32,
+ ETHNL_MAX_BITSET_SIZE),
+ [ETHTOOL_A_BITSET_BITS] = { .type = NLA_NESTED },
+ [ETHTOOL_A_BITSET_VALUE] = { .type = NLA_BINARY },
+ [ETHTOOL_A_BITSET_MASK] = { .type = NLA_BINARY },
+};
+
+static const struct nla_policy bit_policy[] = {
+ [ETHTOOL_A_BITSET_BIT_INDEX] = { .type = NLA_U32 },
+ [ETHTOOL_A_BITSET_BIT_NAME] = { .type = NLA_NUL_STRING },
+ [ETHTOOL_A_BITSET_BIT_VALUE] = { .type = NLA_FLAG },
+};
+
+/**
+ * ethnl_bitset_is_compact() - check if bitset attribute represents a compact
+ * bitset
+ * @bitset: nested attribute representing a bitset
+ * @compact: pointer for return value
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+int ethnl_bitset_is_compact(const struct nlattr *bitset, bool *compact)
+{
+ struct nlattr *tb[ARRAY_SIZE(bitset_policy)];
+ int ret;
+
+ ret = nla_parse_nested(tb, ARRAY_SIZE(bitset_policy) - 1, bitset,
+ bitset_policy, NULL);
+ if (ret < 0)
+ return ret;
+
+ if (tb[ETHTOOL_A_BITSET_BITS]) {
+ if (tb[ETHTOOL_A_BITSET_VALUE] || tb[ETHTOOL_A_BITSET_MASK])
+ return -EINVAL;
+ *compact = false;
+ return 0;
+ }
+ if (!tb[ETHTOOL_A_BITSET_SIZE] || !tb[ETHTOOL_A_BITSET_VALUE])
+ return -EINVAL;
+
+ *compact = true;
+ return 0;
+}
+
+/**
+ * ethnl_name_to_idx() - look up string index for a name
+ * @names: array of ETH_GSTRING_LEN sized strings
+ * @n_names: number of strings in the array
+ * @name: name to look up
+ *
+ * Return: index of the string if found, -ENOENT if not found
+ */
+static int ethnl_name_to_idx(ethnl_string_array_t names, unsigned int n_names,
+ const char *name)
+{
+ unsigned int i;
+
+ if (!names)
+ return -ENOENT;
+
+ for (i = 0; i < n_names; i++) {
+ /* names[i] may not be null terminated */
+ if (!strncmp(names[i], name, ETH_GSTRING_LEN) &&
+ strlen(name) <= ETH_GSTRING_LEN)
+ return i;
+ }
+
+ return -ENOENT;
+}
+
+static int ethnl_parse_bit(unsigned int *index, bool *val, unsigned int nbits,
+ const struct nlattr *bit_attr, bool no_mask,
+ ethnl_string_array_t names,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[ARRAY_SIZE(bit_policy)];
+ int ret, idx;
+
+ ret = nla_parse_nested(tb, ARRAY_SIZE(bit_policy) - 1, bit_attr,
+ bit_policy, extack);
+ if (ret < 0)
+ return ret;
+
+ if (tb[ETHTOOL_A_BITSET_BIT_INDEX]) {
+ const char *name;
+
+ idx = nla_get_u32(tb[ETHTOOL_A_BITSET_BIT_INDEX]);
+ if (idx >= nbits) {
+ NL_SET_ERR_MSG_ATTR(extack,
+ tb[ETHTOOL_A_BITSET_BIT_INDEX],
+ "bit index too high");
+ return -EOPNOTSUPP;
+ }
+ name = names ? names[idx] : NULL;
+ if (tb[ETHTOOL_A_BITSET_BIT_NAME] && name &&
+ strncmp(nla_data(tb[ETHTOOL_A_BITSET_BIT_NAME]), name,
+ nla_len(tb[ETHTOOL_A_BITSET_BIT_NAME]))) {
+ NL_SET_ERR_MSG_ATTR(extack, bit_attr,
+ "bit index and name mismatch");
+ return -EINVAL;
+ }
+ } else if (tb[ETHTOOL_A_BITSET_BIT_NAME]) {
+ idx = ethnl_name_to_idx(names, nbits,
+ nla_data(tb[ETHTOOL_A_BITSET_BIT_NAME]));
+ if (idx < 0) {
+ NL_SET_ERR_MSG_ATTR(extack,
+ tb[ETHTOOL_A_BITSET_BIT_NAME],
+ "bit name not found");
+ return -EOPNOTSUPP;
+ }
+ } else {
+ NL_SET_ERR_MSG_ATTR(extack, bit_attr,
+ "neither bit index nor name specified");
+ return -EINVAL;
+ }
+
+ *index = idx;
+ *val = no_mask || tb[ETHTOOL_A_BITSET_BIT_VALUE];
+ return 0;
+}
+
+/**
+ * ethnl_bitmap32_equal() - Compare two bitmaps
+ * @map1: first bitmap
+ * @map2: second bitmap
+ * @nbits: bit size to compare
+ *
+ * Return: true if first @nbits are equal, false if not
+ */
+static bool ethnl_bitmap32_equal(const u32 *map1, const u32 *map2,
+ unsigned int nbits)
+{
+ if (memcmp(map1, map2, nbits / 32 * sizeof(u32)))
+ return false;
+ if (nbits % 32 == 0)
+ return true;
+ return !((map1[nbits / 32] ^ map2[nbits / 32]) &
+ ethnl_lower_bits(nbits % 32));
+}
+
+static int
+ethnl_update_bitset32_verbose(u32 *bitmap, unsigned int nbits,
+ const struct nlattr *attr, struct nlattr **tb,
+ ethnl_string_array_t names,
+ struct netlink_ext_ack *extack, bool *mod)
+{
+ u32 *saved_bitmap = NULL;
+ struct nlattr *bit_attr;
+ bool no_mask;
+ int rem;
+ int ret;
+
+ if (tb[ETHTOOL_A_BITSET_VALUE]) {
+ NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_VALUE],
+ "value only allowed in compact bitset");
+ return -EINVAL;
+ }
+ if (tb[ETHTOOL_A_BITSET_MASK]) {
+ NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_MASK],
+ "mask only allowed in compact bitset");
+ return -EINVAL;
+ }
+
+ no_mask = tb[ETHTOOL_A_BITSET_NOMASK];
+ if (no_mask) {
+ unsigned int nwords = DIV_ROUND_UP(nbits, 32);
+ unsigned int nbytes = nwords * sizeof(u32);
+ bool dummy;
+
+ /* The bitmap size is only the size of the map part without
+ * its mask part.
+ */
+ saved_bitmap = kcalloc(nwords, sizeof(u32), GFP_KERNEL);
+ if (!saved_bitmap)
+ return -ENOMEM;
+ memcpy(saved_bitmap, bitmap, nbytes);
+ ethnl_bitmap32_clear(bitmap, 0, nbits, &dummy);
+ }
+
+ nla_for_each_nested(bit_attr, tb[ETHTOOL_A_BITSET_BITS], rem) {
+ bool old_val, new_val;
+ unsigned int idx;
+
+ if (nla_type(bit_attr) != ETHTOOL_A_BITSET_BITS_BIT) {
+ NL_SET_ERR_MSG_ATTR(extack, bit_attr,
+ "only ETHTOOL_A_BITSET_BITS_BIT allowed in ETHTOOL_A_BITSET_BITS");
+ kfree(saved_bitmap);
+ return -EINVAL;
+ }
+ ret = ethnl_parse_bit(&idx, &new_val, nbits, bit_attr, no_mask,
+ names, extack);
+ if (ret < 0) {
+ kfree(saved_bitmap);
+ return ret;
+ }
+ old_val = bitmap[idx / 32] & ((u32)1 << (idx % 32));
+ if (new_val != old_val) {
+ if (new_val)
+ bitmap[idx / 32] |= ((u32)1 << (idx % 32));
+ else
+ bitmap[idx / 32] &= ~((u32)1 << (idx % 32));
+ if (!no_mask)
+ *mod = true;
+ }
+ }
+
+ if (no_mask && !ethnl_bitmap32_equal(saved_bitmap, bitmap, nbits))
+ *mod = true;
+
+ kfree(saved_bitmap);
+ return 0;
+}
+
+static int ethnl_compact_sanity_checks(unsigned int nbits,
+ const struct nlattr *nest,
+ struct nlattr **tb,
+ struct netlink_ext_ack *extack)
+{
+ bool no_mask = tb[ETHTOOL_A_BITSET_NOMASK];
+ unsigned int attr_nbits, attr_nwords;
+ const struct nlattr *test_attr;
+
+ if (no_mask && tb[ETHTOOL_A_BITSET_MASK]) {
+ NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_MASK],
+ "mask not allowed in list bitset");
+ return -EINVAL;
+ }
+ if (!tb[ETHTOOL_A_BITSET_SIZE]) {
+ NL_SET_ERR_MSG_ATTR(extack, nest,
+ "missing size in compact bitset");
+ return -EINVAL;
+ }
+ if (!tb[ETHTOOL_A_BITSET_VALUE]) {
+ NL_SET_ERR_MSG_ATTR(extack, nest,
+ "missing value in compact bitset");
+ return -EINVAL;
+ }
+ if (!no_mask && !tb[ETHTOOL_A_BITSET_MASK]) {
+ NL_SET_ERR_MSG_ATTR(extack, nest,
+ "missing mask in compact nonlist bitset");
+ return -EINVAL;
+ }
+
+ attr_nbits = nla_get_u32(tb[ETHTOOL_A_BITSET_SIZE]);
+ attr_nwords = DIV_ROUND_UP(attr_nbits, 32);
+ if (nla_len(tb[ETHTOOL_A_BITSET_VALUE]) != attr_nwords * sizeof(u32)) {
+ NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_VALUE],
+ "bitset value length does not match size");
+ return -EINVAL;
+ }
+ if (tb[ETHTOOL_A_BITSET_MASK] &&
+ nla_len(tb[ETHTOOL_A_BITSET_MASK]) != attr_nwords * sizeof(u32)) {
+ NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_MASK],
+ "bitset mask length does not match size");
+ return -EINVAL;
+ }
+ if (attr_nbits <= nbits)
+ return 0;
+
+ test_attr = no_mask ? tb[ETHTOOL_A_BITSET_VALUE] :
+ tb[ETHTOOL_A_BITSET_MASK];
+ if (ethnl_bitmap32_not_zero(nla_data(test_attr), nbits, attr_nbits)) {
+ NL_SET_ERR_MSG_ATTR(extack, test_attr,
+ "cannot modify bits past kernel bitset size");
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/**
+ * ethnl_update_bitset32() - Apply a bitset nest to a u32 based bitmap
+ * @bitmap: bitmap to update
+ * @nbits: size of the updated bitmap in bits
+ * @attr: nest attribute to parse and apply
+ * @names: array of bit names; may be null for compact format
+ * @extack: extack for error reporting
+ * @mod: set this to true if bitmap is modified, leave as it is if not
+ *
+ * Apply bitset netsted attribute to a bitmap. If the attribute represents
+ * a bit list, @bitmap is set to its contents; otherwise, bits in mask are
+ * set to values from value. Bitmaps in the attribute may be longer than
+ * @nbits but the message must not request modifying any bits past @nbits.
+ *
+ * Return: negative error code on failure, 0 on success
+ */
+int ethnl_update_bitset32(u32 *bitmap, unsigned int nbits,
+ const struct nlattr *attr, ethnl_string_array_t names,
+ struct netlink_ext_ack *extack, bool *mod)
+{
+ struct nlattr *tb[ARRAY_SIZE(bitset_policy)];
+ unsigned int change_bits;
+ bool no_mask;
+ int ret;
+
+ if (!attr)
+ return 0;
+ ret = nla_parse_nested(tb, ARRAY_SIZE(bitset_policy) - 1, attr,
+ bitset_policy, extack);
+ if (ret < 0)
+ return ret;
+
+ if (tb[ETHTOOL_A_BITSET_BITS])
+ return ethnl_update_bitset32_verbose(bitmap, nbits, attr, tb,
+ names, extack, mod);
+ ret = ethnl_compact_sanity_checks(nbits, attr, tb, extack);
+ if (ret < 0)
+ return ret;
+
+ no_mask = tb[ETHTOOL_A_BITSET_NOMASK];
+ change_bits = min_t(unsigned int,
+ nla_get_u32(tb[ETHTOOL_A_BITSET_SIZE]), nbits);
+ ethnl_bitmap32_update(bitmap, change_bits,
+ nla_data(tb[ETHTOOL_A_BITSET_VALUE]),
+ no_mask ? NULL :
+ nla_data(tb[ETHTOOL_A_BITSET_MASK]),
+ mod);
+ if (no_mask && change_bits < nbits)
+ ethnl_bitmap32_clear(bitmap, change_bits, nbits, mod);
+
+ return 0;
+}
+
+/**
+ * ethnl_parse_bitset() - Compute effective value and mask from bitset nest
+ * @val: unsigned long based bitmap to put value into
+ * @mask: unsigned long based bitmap to put mask into
+ * @nbits: size of @val and @mask bitmaps
+ * @attr: nest attribute to parse and apply
+ * @names: array of bit names; may be null for compact format
+ * @extack: extack for error reporting
+ *
+ * Provide @nbits size long bitmaps for value and mask so that
+ * x = (val & mask) | (x & ~mask) would modify any @nbits sized bitmap x
+ * the same way ethnl_update_bitset() with the same bitset attribute would.
+ *
+ * Return: negative error code on failure, 0 on success
+ */
+int ethnl_parse_bitset(unsigned long *val, unsigned long *mask,
+ unsigned int nbits, const struct nlattr *attr,
+ ethnl_string_array_t names,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[ARRAY_SIZE(bitset_policy)];
+ const struct nlattr *bit_attr;
+ bool no_mask;
+ int rem;
+ int ret;
+
+ if (!attr)
+ return 0;
+ ret = nla_parse_nested(tb, ARRAY_SIZE(bitset_policy) - 1, attr,
+ bitset_policy, extack);
+ if (ret < 0)
+ return ret;
+ no_mask = tb[ETHTOOL_A_BITSET_NOMASK];
+
+ if (!tb[ETHTOOL_A_BITSET_BITS]) {
+ unsigned int change_bits;
+
+ ret = ethnl_compact_sanity_checks(nbits, attr, tb, extack);
+ if (ret < 0)
+ return ret;
+
+ change_bits = nla_get_u32(tb[ETHTOOL_A_BITSET_SIZE]);
+ if (change_bits > nbits)
+ change_bits = nbits;
+ bitmap_from_arr32(val, nla_data(tb[ETHTOOL_A_BITSET_VALUE]),
+ change_bits);
+ if (change_bits < nbits)
+ bitmap_clear(val, change_bits, nbits - change_bits);
+ if (no_mask) {
+ bitmap_fill(mask, nbits);
+ } else {
+ bitmap_from_arr32(mask,
+ nla_data(tb[ETHTOOL_A_BITSET_MASK]),
+ change_bits);
+ if (change_bits < nbits)
+ bitmap_clear(mask, change_bits,
+ nbits - change_bits);
+ }
+
+ return 0;
+ }
+
+ if (tb[ETHTOOL_A_BITSET_VALUE]) {
+ NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_VALUE],
+ "value only allowed in compact bitset");
+ return -EINVAL;
+ }
+ if (tb[ETHTOOL_A_BITSET_MASK]) {
+ NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_MASK],
+ "mask only allowed in compact bitset");
+ return -EINVAL;
+ }
+
+ bitmap_zero(val, nbits);
+ if (no_mask)
+ bitmap_fill(mask, nbits);
+ else
+ bitmap_zero(mask, nbits);
+
+ nla_for_each_nested(bit_attr, tb[ETHTOOL_A_BITSET_BITS], rem) {
+ unsigned int idx;
+ bool bit_val;
+
+ ret = ethnl_parse_bit(&idx, &bit_val, nbits, bit_attr, no_mask,
+ names, extack);
+ if (ret < 0)
+ return ret;
+ if (bit_val)
+ __set_bit(idx, val);
+ if (!no_mask)
+ __set_bit(idx, mask);
+ }
+
+ return 0;
+}
+
+#if BITS_PER_LONG == 64 && defined(__BIG_ENDIAN)
+
+/* 64-bit big endian architectures are the only case when u32 based bitmaps
+ * and unsigned long based bitmaps have different memory layout so that we
+ * cannot simply cast the latter to the former and need actual wrappers
+ * converting the latter to the former.
+ *
+ * To reduce the number of slab allocations, the wrappers use fixed size local
+ * variables for bitmaps up to ETHNL_SMALL_BITMAP_BITS bits which is the
+ * majority of bitmaps used by ethtool.
+ */
+#define ETHNL_SMALL_BITMAP_BITS 128
+#define ETHNL_SMALL_BITMAP_WORDS DIV_ROUND_UP(ETHNL_SMALL_BITMAP_BITS, 32)
+
+int ethnl_bitset_size(const unsigned long *val, const unsigned long *mask,
+ unsigned int nbits, ethnl_string_array_t names,
+ bool compact)
+{
+ u32 small_mask32[ETHNL_SMALL_BITMAP_WORDS];
+ u32 small_val32[ETHNL_SMALL_BITMAP_WORDS];
+ u32 *mask32;
+ u32 *val32;
+ int ret;
+
+ if (nbits > ETHNL_SMALL_BITMAP_BITS) {
+ unsigned int nwords = DIV_ROUND_UP(nbits, 32);
+
+ val32 = kmalloc_array(2 * nwords, sizeof(u32), GFP_KERNEL);
+ if (!val32)
+ return -ENOMEM;
+ mask32 = val32 + nwords;
+ } else {
+ val32 = small_val32;
+ mask32 = small_mask32;
+ }
+
+ bitmap_to_arr32(val32, val, nbits);
+ if (mask)
+ bitmap_to_arr32(mask32, mask, nbits);
+ else
+ mask32 = NULL;
+ ret = ethnl_bitset32_size(val32, mask32, nbits, names, compact);
+
+ if (nbits > ETHNL_SMALL_BITMAP_BITS)
+ kfree(val32);
+
+ return ret;
+}
+
+int ethnl_put_bitset(struct sk_buff *skb, int attrtype,
+ const unsigned long *val, const unsigned long *mask,
+ unsigned int nbits, ethnl_string_array_t names,
+ bool compact)
+{
+ u32 small_mask32[ETHNL_SMALL_BITMAP_WORDS];
+ u32 small_val32[ETHNL_SMALL_BITMAP_WORDS];
+ u32 *mask32;
+ u32 *val32;
+ int ret;
+
+ if (nbits > ETHNL_SMALL_BITMAP_BITS) {
+ unsigned int nwords = DIV_ROUND_UP(nbits, 32);
+
+ val32 = kmalloc_array(2 * nwords, sizeof(u32), GFP_KERNEL);
+ if (!val32)
+ return -ENOMEM;
+ mask32 = val32 + nwords;
+ } else {
+ val32 = small_val32;
+ mask32 = small_mask32;
+ }
+
+ bitmap_to_arr32(val32, val, nbits);
+ if (mask)
+ bitmap_to_arr32(mask32, mask, nbits);
+ else
+ mask32 = NULL;
+ ret = ethnl_put_bitset32(skb, attrtype, val32, mask32, nbits, names,
+ compact);
+
+ if (nbits > ETHNL_SMALL_BITMAP_BITS)
+ kfree(val32);
+
+ return ret;
+}
+
+int ethnl_update_bitset(unsigned long *bitmap, unsigned int nbits,
+ const struct nlattr *attr, ethnl_string_array_t names,
+ struct netlink_ext_ack *extack, bool *mod)
+{
+ u32 small_bitmap32[ETHNL_SMALL_BITMAP_WORDS];
+ u32 *bitmap32 = small_bitmap32;
+ bool u32_mod = false;
+ int ret;
+
+ if (nbits > ETHNL_SMALL_BITMAP_BITS) {
+ unsigned int dst_words = DIV_ROUND_UP(nbits, 32);
+
+ bitmap32 = kmalloc_array(dst_words, sizeof(u32), GFP_KERNEL);
+ if (!bitmap32)
+ return -ENOMEM;
+ }
+
+ bitmap_to_arr32(bitmap32, bitmap, nbits);
+ ret = ethnl_update_bitset32(bitmap32, nbits, attr, names, extack,
+ &u32_mod);
+ if (u32_mod) {
+ bitmap_from_arr32(bitmap, bitmap32, nbits);
+ *mod = true;
+ }
+
+ if (nbits > ETHNL_SMALL_BITMAP_BITS)
+ kfree(bitmap32);
+
+ return ret;
+}
+
+#else
+
+/* On little endian 64-bit and all 32-bit architectures, an unsigned long
+ * based bitmap can be interpreted as u32 based one using a simple cast.
+ */
+
+int ethnl_bitset_size(const unsigned long *val, const unsigned long *mask,
+ unsigned int nbits, ethnl_string_array_t names,
+ bool compact)
+{
+ return ethnl_bitset32_size((const u32 *)val, (const u32 *)mask, nbits,
+ names, compact);
+}
+
+int ethnl_put_bitset(struct sk_buff *skb, int attrtype,
+ const unsigned long *val, const unsigned long *mask,
+ unsigned int nbits, ethnl_string_array_t names,
+ bool compact)
+{
+ return ethnl_put_bitset32(skb, attrtype, (const u32 *)val,
+ (const u32 *)mask, nbits, names, compact);
+}
+
+int ethnl_update_bitset(unsigned long *bitmap, unsigned int nbits,
+ const struct nlattr *attr, ethnl_string_array_t names,
+ struct netlink_ext_ack *extack, bool *mod)
+{
+ return ethnl_update_bitset32((u32 *)bitmap, nbits, attr, names, extack,
+ mod);
+}
+
+#endif /* BITS_PER_LONG == 64 && defined(__BIG_ENDIAN) */
diff --git a/net/ethtool/bitset.h b/net/ethtool/bitset.h
new file mode 100644
index 000000000000..c2c2e0051d00
--- /dev/null
+++ b/net/ethtool/bitset.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef _NET_ETHTOOL_BITSET_H
+#define _NET_ETHTOOL_BITSET_H
+
+#define ETHNL_MAX_BITSET_SIZE S16_MAX
+
+typedef const char (*const ethnl_string_array_t)[ETH_GSTRING_LEN];
+
+int ethnl_bitset_is_compact(const struct nlattr *bitset, bool *compact);
+int ethnl_bitset_size(const unsigned long *val, const unsigned long *mask,
+ unsigned int nbits, ethnl_string_array_t names,
+ bool compact);
+int ethnl_bitset32_size(const u32 *val, const u32 *mask, unsigned int nbits,
+ ethnl_string_array_t names, bool compact);
+int ethnl_put_bitset(struct sk_buff *skb, int attrtype,
+ const unsigned long *val, const unsigned long *mask,
+ unsigned int nbits, ethnl_string_array_t names,
+ bool compact);
+int ethnl_put_bitset32(struct sk_buff *skb, int attrtype, const u32 *val,
+ const u32 *mask, unsigned int nbits,
+ ethnl_string_array_t names, bool compact);
+int ethnl_update_bitset(unsigned long *bitmap, unsigned int nbits,
+ const struct nlattr *attr, ethnl_string_array_t names,
+ struct netlink_ext_ack *extack, bool *mod);
+int ethnl_update_bitset32(u32 *bitmap, unsigned int nbits,
+ const struct nlattr *attr, ethnl_string_array_t names,
+ struct netlink_ext_ack *extack, bool *mod);
+int ethnl_parse_bitset(unsigned long *val, unsigned long *mask,
+ unsigned int nbits, const struct nlattr *attr,
+ ethnl_string_array_t names,
+ struct netlink_ext_ack *extack);
+
+#endif /* _NET_ETHTOOL_BITSET_H */
diff --git a/net/ethtool/cabletest.c b/net/ethtool/cabletest.c
new file mode 100644
index 000000000000..0364b8fb577b
--- /dev/null
+++ b/net/ethtool/cabletest.c
@@ -0,0 +1,453 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/phy.h>
+#include <linux/ethtool_netlink.h>
+#include <net/netdev_lock.h>
+#include "netlink.h"
+#include "common.h"
+
+/* 802.3 standard allows 100 meters for BaseT cables. However longer
+ * cables might work, depending on the quality of the cables and the
+ * PHY. So allow testing for up to 150 meters.
+ */
+#define MAX_CABLE_LENGTH_CM (150 * 100)
+
+const struct nla_policy ethnl_cable_test_act_policy[] = {
+ [ETHTOOL_A_CABLE_TEST_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy_phy),
+};
+
+static int ethnl_cable_test_started(struct phy_device *phydev, u8 cmd)
+{
+ struct sk_buff *skb;
+ int err = -ENOMEM;
+ void *ehdr;
+
+ skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb)
+ goto out;
+
+ ehdr = ethnl_bcastmsg_put(skb, cmd);
+ if (!ehdr) {
+ err = -EMSGSIZE;
+ goto out;
+ }
+
+ err = ethnl_fill_reply_header(skb, phydev->attached_dev,
+ ETHTOOL_A_CABLE_TEST_NTF_HEADER);
+ if (err)
+ goto out;
+
+ err = nla_put_u8(skb, ETHTOOL_A_CABLE_TEST_NTF_STATUS,
+ ETHTOOL_A_CABLE_TEST_NTF_STATUS_STARTED);
+ if (err)
+ goto out;
+
+ genlmsg_end(skb, ehdr);
+
+ return ethnl_multicast(skb, phydev->attached_dev);
+
+out:
+ nlmsg_free(skb);
+ phydev_err(phydev, "%s: Error %pe\n", __func__, ERR_PTR(err));
+
+ return err;
+}
+
+int ethnl_act_cable_test(struct sk_buff *skb, struct genl_info *info)
+{
+ struct ethnl_req_info req_info = {};
+ const struct ethtool_phy_ops *ops;
+ struct nlattr **tb = info->attrs;
+ struct phy_device *phydev;
+ struct net_device *dev;
+ int ret;
+
+ ret = ethnl_parse_header_dev_get(&req_info,
+ tb[ETHTOOL_A_CABLE_TEST_HEADER],
+ genl_info_net(info), info->extack,
+ true);
+ if (ret < 0)
+ return ret;
+
+ dev = req_info.dev;
+
+ rtnl_lock();
+ netdev_lock_ops(dev);
+ phydev = ethnl_req_get_phydev(&req_info, tb,
+ ETHTOOL_A_CABLE_TEST_HEADER,
+ info->extack);
+ if (IS_ERR_OR_NULL(phydev)) {
+ ret = -EOPNOTSUPP;
+ goto out_unlock;
+ }
+
+ ops = ethtool_phy_ops;
+ if (!ops || !ops->start_cable_test) {
+ ret = -EOPNOTSUPP;
+ goto out_unlock;
+ }
+
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ goto out_unlock;
+
+ ret = ops->start_cable_test(phydev, info->extack);
+
+ ethnl_ops_complete(dev);
+
+ if (!ret)
+ ethnl_cable_test_started(phydev, ETHTOOL_MSG_CABLE_TEST_NTF);
+
+out_unlock:
+ netdev_unlock_ops(dev);
+ rtnl_unlock();
+ ethnl_parse_header_dev_put(&req_info);
+ return ret;
+}
+
+int ethnl_cable_test_alloc(struct phy_device *phydev, u8 cmd)
+{
+ int err = -ENOMEM;
+
+ /* One TDR sample occupies 20 bytes. For a 150 meter cable,
+ * with four pairs, around 12K is needed.
+ */
+ phydev->skb = genlmsg_new(SZ_16K, GFP_KERNEL);
+ if (!phydev->skb)
+ goto out;
+
+ phydev->ehdr = ethnl_bcastmsg_put(phydev->skb, cmd);
+ if (!phydev->ehdr) {
+ err = -EMSGSIZE;
+ goto out;
+ }
+
+ err = ethnl_fill_reply_header(phydev->skb, phydev->attached_dev,
+ ETHTOOL_A_CABLE_TEST_NTF_HEADER);
+ if (err)
+ goto out;
+
+ err = nla_put_u8(phydev->skb, ETHTOOL_A_CABLE_TEST_NTF_STATUS,
+ ETHTOOL_A_CABLE_TEST_NTF_STATUS_COMPLETED);
+ if (err)
+ goto out;
+
+ phydev->nest = nla_nest_start(phydev->skb,
+ ETHTOOL_A_CABLE_TEST_NTF_NEST);
+ if (!phydev->nest) {
+ err = -EMSGSIZE;
+ goto out;
+ }
+
+ return 0;
+
+out:
+ nlmsg_free(phydev->skb);
+ phydev->skb = NULL;
+ return err;
+}
+EXPORT_SYMBOL_GPL(ethnl_cable_test_alloc);
+
+void ethnl_cable_test_free(struct phy_device *phydev)
+{
+ nlmsg_free(phydev->skb);
+ phydev->skb = NULL;
+}
+EXPORT_SYMBOL_GPL(ethnl_cable_test_free);
+
+void ethnl_cable_test_finished(struct phy_device *phydev)
+{
+ nla_nest_end(phydev->skb, phydev->nest);
+
+ genlmsg_end(phydev->skb, phydev->ehdr);
+
+ ethnl_multicast(phydev->skb, phydev->attached_dev);
+}
+EXPORT_SYMBOL_GPL(ethnl_cable_test_finished);
+
+int ethnl_cable_test_result_with_src(struct phy_device *phydev, u8 pair,
+ u8 result, u32 src)
+{
+ struct nlattr *nest;
+ int ret = -EMSGSIZE;
+
+ nest = nla_nest_start(phydev->skb, ETHTOOL_A_CABLE_NEST_RESULT);
+ if (!nest)
+ return -EMSGSIZE;
+
+ if (nla_put_u8(phydev->skb, ETHTOOL_A_CABLE_RESULT_PAIR, pair))
+ goto err;
+ if (nla_put_u8(phydev->skb, ETHTOOL_A_CABLE_RESULT_CODE, result))
+ goto err;
+ if (src != ETHTOOL_A_CABLE_INF_SRC_UNSPEC) {
+ if (nla_put_u32(phydev->skb, ETHTOOL_A_CABLE_RESULT_SRC, src))
+ goto err;
+ }
+
+ nla_nest_end(phydev->skb, nest);
+ return 0;
+
+err:
+ nla_nest_cancel(phydev->skb, nest);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ethnl_cable_test_result_with_src);
+
+int ethnl_cable_test_fault_length_with_src(struct phy_device *phydev, u8 pair,
+ u32 cm, u32 src)
+{
+ struct nlattr *nest;
+ int ret = -EMSGSIZE;
+
+ nest = nla_nest_start(phydev->skb,
+ ETHTOOL_A_CABLE_NEST_FAULT_LENGTH);
+ if (!nest)
+ return -EMSGSIZE;
+
+ if (nla_put_u8(phydev->skb, ETHTOOL_A_CABLE_FAULT_LENGTH_PAIR, pair))
+ goto err;
+ if (nla_put_u32(phydev->skb, ETHTOOL_A_CABLE_FAULT_LENGTH_CM, cm))
+ goto err;
+ if (src != ETHTOOL_A_CABLE_INF_SRC_UNSPEC) {
+ if (nla_put_u32(phydev->skb, ETHTOOL_A_CABLE_FAULT_LENGTH_SRC,
+ src))
+ goto err;
+ }
+
+ nla_nest_end(phydev->skb, nest);
+ return 0;
+
+err:
+ nla_nest_cancel(phydev->skb, nest);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ethnl_cable_test_fault_length_with_src);
+
+static const struct nla_policy cable_test_tdr_act_cfg_policy[] = {
+ [ETHTOOL_A_CABLE_TEST_TDR_CFG_FIRST] = { .type = NLA_U32 },
+ [ETHTOOL_A_CABLE_TEST_TDR_CFG_LAST] = { .type = NLA_U32 },
+ [ETHTOOL_A_CABLE_TEST_TDR_CFG_STEP] = { .type = NLA_U32 },
+ [ETHTOOL_A_CABLE_TEST_TDR_CFG_PAIR] = { .type = NLA_U8 },
+};
+
+const struct nla_policy ethnl_cable_test_tdr_act_policy[] = {
+ [ETHTOOL_A_CABLE_TEST_TDR_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy_phy),
+ [ETHTOOL_A_CABLE_TEST_TDR_CFG] = { .type = NLA_NESTED },
+};
+
+/* CABLE_TEST_TDR_ACT */
+static int ethnl_act_cable_test_tdr_cfg(const struct nlattr *nest,
+ struct genl_info *info,
+ struct phy_tdr_config *cfg)
+{
+ struct nlattr *tb[ARRAY_SIZE(cable_test_tdr_act_cfg_policy)];
+ int ret;
+
+ cfg->first = 100;
+ cfg->step = 100;
+ cfg->last = MAX_CABLE_LENGTH_CM;
+ cfg->pair = PHY_PAIR_ALL;
+
+ if (!nest)
+ return 0;
+
+ ret = nla_parse_nested(tb,
+ ARRAY_SIZE(cable_test_tdr_act_cfg_policy) - 1,
+ nest, cable_test_tdr_act_cfg_policy,
+ info->extack);
+ if (ret < 0)
+ return ret;
+
+ if (tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_FIRST])
+ cfg->first = nla_get_u32(
+ tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_FIRST]);
+
+ if (tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_LAST])
+ cfg->last = nla_get_u32(tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_LAST]);
+
+ if (tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_STEP])
+ cfg->step = nla_get_u32(tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_STEP]);
+
+ if (tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_PAIR]) {
+ cfg->pair = nla_get_u8(tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_PAIR]);
+ if (cfg->pair > ETHTOOL_A_CABLE_PAIR_D) {
+ NL_SET_ERR_MSG_ATTR(
+ info->extack,
+ tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_PAIR],
+ "invalid pair parameter");
+ return -EINVAL;
+ }
+ }
+
+ if (cfg->first > MAX_CABLE_LENGTH_CM) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_FIRST],
+ "invalid first parameter");
+ return -EINVAL;
+ }
+
+ if (cfg->last > MAX_CABLE_LENGTH_CM) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_LAST],
+ "invalid last parameter");
+ return -EINVAL;
+ }
+
+ if (cfg->first > cfg->last) {
+ NL_SET_ERR_MSG(info->extack, "invalid first/last parameter");
+ return -EINVAL;
+ }
+
+ if (!cfg->step) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_STEP],
+ "invalid step parameter");
+ return -EINVAL;
+ }
+
+ if (cfg->step > (cfg->last - cfg->first)) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_STEP],
+ "step parameter too big");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+int ethnl_act_cable_test_tdr(struct sk_buff *skb, struct genl_info *info)
+{
+ struct ethnl_req_info req_info = {};
+ const struct ethtool_phy_ops *ops;
+ struct nlattr **tb = info->attrs;
+ struct phy_device *phydev;
+ struct phy_tdr_config cfg;
+ struct net_device *dev;
+ int ret;
+
+ ret = ethnl_parse_header_dev_get(&req_info,
+ tb[ETHTOOL_A_CABLE_TEST_TDR_HEADER],
+ genl_info_net(info), info->extack,
+ true);
+ if (ret < 0)
+ return ret;
+
+ dev = req_info.dev;
+
+ ret = ethnl_act_cable_test_tdr_cfg(tb[ETHTOOL_A_CABLE_TEST_TDR_CFG],
+ info, &cfg);
+ if (ret)
+ goto out_dev_put;
+
+ rtnl_lock();
+ netdev_lock_ops(dev);
+ phydev = ethnl_req_get_phydev(&req_info, tb,
+ ETHTOOL_A_CABLE_TEST_TDR_HEADER,
+ info->extack);
+ if (IS_ERR_OR_NULL(phydev)) {
+ ret = -EOPNOTSUPP;
+ goto out_unlock;
+ }
+
+ ops = ethtool_phy_ops;
+ if (!ops || !ops->start_cable_test_tdr) {
+ ret = -EOPNOTSUPP;
+ goto out_unlock;
+ }
+
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ goto out_unlock;
+
+ ret = ops->start_cable_test_tdr(phydev, info->extack, &cfg);
+
+ ethnl_ops_complete(dev);
+
+ if (!ret)
+ ethnl_cable_test_started(phydev,
+ ETHTOOL_MSG_CABLE_TEST_TDR_NTF);
+
+out_unlock:
+ netdev_unlock_ops(dev);
+ rtnl_unlock();
+out_dev_put:
+ ethnl_parse_header_dev_put(&req_info);
+ return ret;
+}
+
+int ethnl_cable_test_amplitude(struct phy_device *phydev,
+ u8 pair, s16 mV)
+{
+ struct nlattr *nest;
+ int ret = -EMSGSIZE;
+
+ nest = nla_nest_start(phydev->skb,
+ ETHTOOL_A_CABLE_TDR_NEST_AMPLITUDE);
+ if (!nest)
+ return -EMSGSIZE;
+
+ if (nla_put_u8(phydev->skb, ETHTOOL_A_CABLE_AMPLITUDE_PAIR, pair))
+ goto err;
+ if (nla_put_u16(phydev->skb, ETHTOOL_A_CABLE_AMPLITUDE_mV, mV))
+ goto err;
+
+ nla_nest_end(phydev->skb, nest);
+ return 0;
+
+err:
+ nla_nest_cancel(phydev->skb, nest);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ethnl_cable_test_amplitude);
+
+int ethnl_cable_test_pulse(struct phy_device *phydev, u16 mV)
+{
+ struct nlattr *nest;
+ int ret = -EMSGSIZE;
+
+ nest = nla_nest_start(phydev->skb, ETHTOOL_A_CABLE_TDR_NEST_PULSE);
+ if (!nest)
+ return -EMSGSIZE;
+
+ if (nla_put_u16(phydev->skb, ETHTOOL_A_CABLE_PULSE_mV, mV))
+ goto err;
+
+ nla_nest_end(phydev->skb, nest);
+ return 0;
+
+err:
+ nla_nest_cancel(phydev->skb, nest);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ethnl_cable_test_pulse);
+
+int ethnl_cable_test_step(struct phy_device *phydev, u32 first, u32 last,
+ u32 step)
+{
+ struct nlattr *nest;
+ int ret = -EMSGSIZE;
+
+ nest = nla_nest_start(phydev->skb, ETHTOOL_A_CABLE_TDR_NEST_STEP);
+ if (!nest)
+ return -EMSGSIZE;
+
+ if (nla_put_u32(phydev->skb, ETHTOOL_A_CABLE_STEP_FIRST_DISTANCE,
+ first))
+ goto err;
+
+ if (nla_put_u32(phydev->skb, ETHTOOL_A_CABLE_STEP_LAST_DISTANCE, last))
+ goto err;
+
+ if (nla_put_u32(phydev->skb, ETHTOOL_A_CABLE_STEP_STEP_DISTANCE, step))
+ goto err;
+
+ nla_nest_end(phydev->skb, nest);
+ return 0;
+
+err:
+ nla_nest_cancel(phydev->skb, nest);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ethnl_cable_test_step);
diff --git a/net/ethtool/channels.c b/net/ethtool/channels.c
new file mode 100644
index 000000000000..ca4f80282448
--- /dev/null
+++ b/net/ethtool/channels.c
@@ -0,0 +1,199 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <net/xdp_sock_drv.h>
+
+#include "netlink.h"
+#include "common.h"
+
+struct channels_req_info {
+ struct ethnl_req_info base;
+};
+
+struct channels_reply_data {
+ struct ethnl_reply_data base;
+ struct ethtool_channels channels;
+};
+
+#define CHANNELS_REPDATA(__reply_base) \
+ container_of(__reply_base, struct channels_reply_data, base)
+
+const struct nla_policy ethnl_channels_get_policy[] = {
+ [ETHTOOL_A_CHANNELS_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
+};
+
+static int channels_prepare_data(const struct ethnl_req_info *req_base,
+ struct ethnl_reply_data *reply_base,
+ const struct genl_info *info)
+{
+ struct channels_reply_data *data = CHANNELS_REPDATA(reply_base);
+ struct net_device *dev = reply_base->dev;
+ int ret;
+
+ if (!dev->ethtool_ops->get_channels)
+ return -EOPNOTSUPP;
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ return ret;
+ dev->ethtool_ops->get_channels(dev, &data->channels);
+ ethnl_ops_complete(dev);
+
+ return 0;
+}
+
+static int channels_reply_size(const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ return nla_total_size(sizeof(u32)) + /* _CHANNELS_RX_MAX */
+ nla_total_size(sizeof(u32)) + /* _CHANNELS_TX_MAX */
+ nla_total_size(sizeof(u32)) + /* _CHANNELS_OTHER_MAX */
+ nla_total_size(sizeof(u32)) + /* _CHANNELS_COMBINED_MAX */
+ nla_total_size(sizeof(u32)) + /* _CHANNELS_RX_COUNT */
+ nla_total_size(sizeof(u32)) + /* _CHANNELS_TX_COUNT */
+ nla_total_size(sizeof(u32)) + /* _CHANNELS_OTHER_COUNT */
+ nla_total_size(sizeof(u32)); /* _CHANNELS_COMBINED_COUNT */
+}
+
+static int channels_fill_reply(struct sk_buff *skb,
+ const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ const struct channels_reply_data *data = CHANNELS_REPDATA(reply_base);
+ const struct ethtool_channels *channels = &data->channels;
+
+ if ((channels->max_rx &&
+ (nla_put_u32(skb, ETHTOOL_A_CHANNELS_RX_MAX,
+ channels->max_rx) ||
+ nla_put_u32(skb, ETHTOOL_A_CHANNELS_RX_COUNT,
+ channels->rx_count))) ||
+ (channels->max_tx &&
+ (nla_put_u32(skb, ETHTOOL_A_CHANNELS_TX_MAX,
+ channels->max_tx) ||
+ nla_put_u32(skb, ETHTOOL_A_CHANNELS_TX_COUNT,
+ channels->tx_count))) ||
+ (channels->max_other &&
+ (nla_put_u32(skb, ETHTOOL_A_CHANNELS_OTHER_MAX,
+ channels->max_other) ||
+ nla_put_u32(skb, ETHTOOL_A_CHANNELS_OTHER_COUNT,
+ channels->other_count))) ||
+ (channels->max_combined &&
+ (nla_put_u32(skb, ETHTOOL_A_CHANNELS_COMBINED_MAX,
+ channels->max_combined) ||
+ nla_put_u32(skb, ETHTOOL_A_CHANNELS_COMBINED_COUNT,
+ channels->combined_count))))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+/* CHANNELS_SET */
+
+const struct nla_policy ethnl_channels_set_policy[] = {
+ [ETHTOOL_A_CHANNELS_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
+ [ETHTOOL_A_CHANNELS_RX_COUNT] = { .type = NLA_U32 },
+ [ETHTOOL_A_CHANNELS_TX_COUNT] = { .type = NLA_U32 },
+ [ETHTOOL_A_CHANNELS_OTHER_COUNT] = { .type = NLA_U32 },
+ [ETHTOOL_A_CHANNELS_COMBINED_COUNT] = { .type = NLA_U32 },
+};
+
+static int
+ethnl_set_channels_validate(struct ethnl_req_info *req_info,
+ struct genl_info *info)
+{
+ const struct ethtool_ops *ops = req_info->dev->ethtool_ops;
+
+ return ops->get_channels && ops->set_channels ? 1 : -EOPNOTSUPP;
+}
+
+static int
+ethnl_set_channels(struct ethnl_req_info *req_info, struct genl_info *info)
+{
+ unsigned int from_channel, old_total, i;
+ bool mod = false, mod_combined = false;
+ struct net_device *dev = req_info->dev;
+ struct ethtool_channels channels = {};
+ struct nlattr **tb = info->attrs;
+ u32 err_attr;
+ int ret;
+
+ dev->ethtool_ops->get_channels(dev, &channels);
+ old_total = channels.combined_count +
+ max(channels.rx_count, channels.tx_count);
+
+ ethnl_update_u32(&channels.rx_count, tb[ETHTOOL_A_CHANNELS_RX_COUNT],
+ &mod);
+ ethnl_update_u32(&channels.tx_count, tb[ETHTOOL_A_CHANNELS_TX_COUNT],
+ &mod);
+ ethnl_update_u32(&channels.other_count,
+ tb[ETHTOOL_A_CHANNELS_OTHER_COUNT], &mod);
+ ethnl_update_u32(&channels.combined_count,
+ tb[ETHTOOL_A_CHANNELS_COMBINED_COUNT], &mod_combined);
+ mod |= mod_combined;
+ if (!mod)
+ return 0;
+
+ /* ensure new channel counts are within limits */
+ if (channels.rx_count > channels.max_rx)
+ err_attr = ETHTOOL_A_CHANNELS_RX_COUNT;
+ else if (channels.tx_count > channels.max_tx)
+ err_attr = ETHTOOL_A_CHANNELS_TX_COUNT;
+ else if (channels.other_count > channels.max_other)
+ err_attr = ETHTOOL_A_CHANNELS_OTHER_COUNT;
+ else if (channels.combined_count > channels.max_combined)
+ err_attr = ETHTOOL_A_CHANNELS_COMBINED_COUNT;
+ else
+ err_attr = 0;
+ if (err_attr) {
+ NL_SET_ERR_MSG_ATTR(info->extack, tb[err_attr],
+ "requested channel count exceeds maximum");
+ return -EINVAL;
+ }
+
+ /* ensure there is at least one RX and one TX channel */
+ if (!channels.combined_count && !channels.rx_count)
+ err_attr = ETHTOOL_A_CHANNELS_RX_COUNT;
+ else if (!channels.combined_count && !channels.tx_count)
+ err_attr = ETHTOOL_A_CHANNELS_TX_COUNT;
+ else
+ err_attr = 0;
+ if (err_attr) {
+ if (mod_combined)
+ err_attr = ETHTOOL_A_CHANNELS_COMBINED_COUNT;
+ NL_SET_ERR_MSG_ATTR(info->extack, tb[err_attr],
+ "requested channel counts would result in no RX or TX channel being configured");
+ return -EINVAL;
+ }
+
+ ret = ethtool_check_max_channel(dev, channels, info);
+ if (ret)
+ return ret;
+
+ /* Disabling channels, query zero-copy AF_XDP sockets */
+ from_channel = channels.combined_count +
+ min(channels.rx_count, channels.tx_count);
+ for (i = from_channel; i < old_total; i++)
+ if (xsk_get_pool_from_qid(dev, i)) {
+ GENL_SET_ERR_MSG(info, "requested channel counts are too low for existing zerocopy AF_XDP sockets");
+ return -EINVAL;
+ }
+
+ ret = dev->ethtool_ops->set_channels(dev, &channels);
+ return ret < 0 ? ret : 1;
+}
+
+const struct ethnl_request_ops ethnl_channels_request_ops = {
+ .request_cmd = ETHTOOL_MSG_CHANNELS_GET,
+ .reply_cmd = ETHTOOL_MSG_CHANNELS_GET_REPLY,
+ .hdr_attr = ETHTOOL_A_CHANNELS_HEADER,
+ .req_info_size = sizeof(struct channels_req_info),
+ .reply_data_size = sizeof(struct channels_reply_data),
+
+ .prepare_data = channels_prepare_data,
+ .reply_size = channels_reply_size,
+ .fill_reply = channels_fill_reply,
+
+ .set_validate = ethnl_set_channels_validate,
+ .set = ethnl_set_channels,
+ .set_ntf_cmd = ETHTOOL_MSG_CHANNELS_NTF,
+};
diff --git a/net/ethtool/cmis.h b/net/ethtool/cmis.h
new file mode 100644
index 000000000000..4a9a946cabf0
--- /dev/null
+++ b/net/ethtool/cmis.h
@@ -0,0 +1,128 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#define ETHTOOL_CMIS_CDB_LPL_MAX_PL_LENGTH 120
+#define ETHTOOL_CMIS_CDB_EPL_MAX_PL_LENGTH 2048
+#define ETHTOOL_CMIS_CDB_CMD_PAGE 0x9F
+#define ETHTOOL_CMIS_CDB_PAGE_I2C_ADDR 0x50
+
+/**
+ * struct ethtool_cmis_cdb - CDB commands parameters
+ * @cmis_rev: CMIS revision major.
+ * @read_write_len_ext: Allowable additional number of byte octets to the LPL
+ * in a READ or a WRITE CDB commands.
+ * @max_completion_time: Maximum CDB command completion time in msec.
+ */
+struct ethtool_cmis_cdb {
+ u8 cmis_rev;
+ u8 read_write_len_ext;
+ u16 max_completion_time;
+};
+
+enum ethtool_cmis_cdb_cmd_id {
+ ETHTOOL_CMIS_CDB_CMD_QUERY_STATUS = 0x0000,
+ ETHTOOL_CMIS_CDB_CMD_MODULE_FEATURES = 0x0040,
+ ETHTOOL_CMIS_CDB_CMD_FW_MANAGMENT_FEATURES = 0x0041,
+ ETHTOOL_CMIS_CDB_CMD_START_FW_DOWNLOAD = 0x0101,
+ ETHTOOL_CMIS_CDB_CMD_WRITE_FW_BLOCK_LPL = 0x0103,
+ ETHTOOL_CMIS_CDB_CMD_WRITE_FW_BLOCK_EPL = 0x0104,
+ ETHTOOL_CMIS_CDB_CMD_COMPLETE_FW_DOWNLOAD = 0x0107,
+ ETHTOOL_CMIS_CDB_CMD_RUN_FW_IMAGE = 0x0109,
+ ETHTOOL_CMIS_CDB_CMD_COMMIT_FW_IMAGE = 0x010A,
+};
+
+/**
+ * struct ethtool_cmis_cdb_request - CDB commands request fields as decribed in
+ * the CMIS standard
+ * @id: Command ID.
+ * @epl_len: EPL memory length.
+ * @lpl_len: LPL memory length.
+ * @chk_code: Check code for the previous field and the payload.
+ * @resv1: Added to match the CMIS standard request continuity.
+ * @resv2: Added to match the CMIS standard request continuity.
+ * @payload: Payload for the CDB commands.
+ * @epl: Extended payload for the CDB commands.
+ */
+struct ethtool_cmis_cdb_request {
+ __be16 id;
+ struct_group(body,
+ __be16 epl_len;
+ u8 lpl_len;
+ u8 chk_code;
+ u8 resv1;
+ u8 resv2;
+ u8 payload[ETHTOOL_CMIS_CDB_LPL_MAX_PL_LENGTH];
+ );
+ u8 *epl; /* Everything above this field checksummed. */
+};
+
+#define CDB_F_COMPLETION_VALID BIT(0)
+#define CDB_F_STATUS_VALID BIT(1)
+#define CDB_F_MODULE_STATE_VALID BIT(2)
+
+/**
+ * struct ethtool_cmis_cdb_cmd_args - CDB commands execution arguments
+ * @req: CDB command fields as described in the CMIS standard.
+ * @max_duration: Maximum duration time for command completion in msec.
+ * @read_write_len_ext: Allowable additional number of byte octets to the LPL
+ * in a READ or a WRITE commands.
+ * @msleep_pre_rpl: Waiting time before checking reply in msec.
+ * @rpl_exp_len: Expected reply length in bytes.
+ * @flags: Validation flags for CDB commands.
+ * @err_msg: Error message to be sent to user space.
+ */
+struct ethtool_cmis_cdb_cmd_args {
+ struct ethtool_cmis_cdb_request req;
+ u16 max_duration;
+ u8 read_write_len_ext;
+ u8 msleep_pre_rpl;
+ u8 rpl_exp_len;
+ u8 flags;
+ char *err_msg;
+};
+
+/**
+ * struct ethtool_cmis_cdb_rpl_hdr - CDB commands reply header arguments
+ * @rpl_len: Reply length.
+ * @rpl_chk_code: Reply check code.
+ */
+struct ethtool_cmis_cdb_rpl_hdr {
+ u8 rpl_len;
+ u8 rpl_chk_code;
+};
+
+/**
+ * struct ethtool_cmis_cdb_rpl - CDB commands reply arguments
+ * @hdr: CDB commands reply header arguments.
+ * @payload: Payload for the CDB commands reply.
+ */
+struct ethtool_cmis_cdb_rpl {
+ struct ethtool_cmis_cdb_rpl_hdr hdr;
+ u8 payload[ETHTOOL_CMIS_CDB_LPL_MAX_PL_LENGTH];
+};
+
+u32 ethtool_cmis_get_max_lpl_size(u8 num_of_byte_octs);
+
+void ethtool_cmis_cdb_compose_args(struct ethtool_cmis_cdb_cmd_args *args,
+ enum ethtool_cmis_cdb_cmd_id cmd, u8 *lpl,
+ u8 lpl_len, u8 *epl, u16 epl_len,
+ u16 max_duration, u8 read_write_len_ext,
+ u16 msleep_pre_rpl, u8 rpl_exp_len,
+ u8 flags);
+
+void ethtool_cmis_cdb_check_completion_flag(u8 cmis_rev, u8 *flags);
+
+void ethtool_cmis_page_init(struct ethtool_module_eeprom *page_data,
+ u8 page, u32 offset, u32 length);
+
+struct ethtool_cmis_cdb *
+ethtool_cmis_cdb_init(struct net_device *dev,
+ const struct ethtool_module_fw_flash_params *params,
+ struct ethnl_module_fw_flash_ntf_params *ntf_params);
+void ethtool_cmis_cdb_fini(struct ethtool_cmis_cdb *cdb);
+
+int ethtool_cmis_wait_for_cond(struct net_device *dev, u8 flags, u8 flag,
+ u16 max_duration, u32 offset,
+ bool (*cond_success)(u8), bool (*cond_fail)(u8), u8 *state);
+
+int ethtool_cmis_cdb_execute_cmd(struct net_device *dev,
+ struct ethtool_cmis_cdb_cmd_args *args);
diff --git a/net/ethtool/cmis_cdb.c b/net/ethtool/cmis_cdb.c
new file mode 100644
index 000000000000..3057576bc81e
--- /dev/null
+++ b/net/ethtool/cmis_cdb.c
@@ -0,0 +1,666 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/ethtool.h>
+#include <linux/jiffies.h>
+
+#include "common.h"
+#include "module_fw.h"
+#include "cmis.h"
+
+/* For accessing the LPL field on page 9Fh, the allowable length extension is
+ * min(i, 15) byte octets where i specifies the allowable additional number of
+ * byte octets in a READ or a WRITE.
+ */
+u32 ethtool_cmis_get_max_lpl_size(u8 num_of_byte_octs)
+{
+ return 8 * (1 + min_t(u8, num_of_byte_octs, 15));
+}
+
+void ethtool_cmis_cdb_compose_args(struct ethtool_cmis_cdb_cmd_args *args,
+ enum ethtool_cmis_cdb_cmd_id cmd, u8 *lpl,
+ u8 lpl_len, u8 *epl, u16 epl_len,
+ u16 max_duration, u8 read_write_len_ext,
+ u16 msleep_pre_rpl, u8 rpl_exp_len, u8 flags)
+{
+ args->req.id = cpu_to_be16(cmd);
+ args->req.lpl_len = lpl_len;
+ if (lpl)
+ memcpy(args->req.payload, lpl, args->req.lpl_len);
+ if (epl) {
+ args->req.epl_len = cpu_to_be16(epl_len);
+ args->req.epl = epl;
+ }
+
+ args->max_duration = max_duration;
+ args->read_write_len_ext =
+ ethtool_cmis_get_max_lpl_size(read_write_len_ext);
+ args->msleep_pre_rpl = msleep_pre_rpl;
+ args->rpl_exp_len = rpl_exp_len;
+ args->flags = flags;
+ args->err_msg = NULL;
+}
+
+void ethtool_cmis_page_init(struct ethtool_module_eeprom *page_data,
+ u8 page, u32 offset, u32 length)
+{
+ page_data->page = page;
+ page_data->offset = offset;
+ page_data->length = length;
+ page_data->i2c_address = ETHTOOL_CMIS_CDB_PAGE_I2C_ADDR;
+}
+
+#define CMIS_REVISION_PAGE 0x00
+#define CMIS_REVISION_OFFSET 0x01
+
+struct cmis_rev_rpl {
+ u8 rev;
+};
+
+static u8 cmis_rev_rpl_major(struct cmis_rev_rpl *rpl)
+{
+ return rpl->rev >> 4;
+}
+
+static int cmis_rev_major_get(struct net_device *dev, u8 *rev_major)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct ethtool_module_eeprom page_data = {0};
+ struct netlink_ext_ack extack = {};
+ struct cmis_rev_rpl rpl = {};
+ int err;
+
+ ethtool_cmis_page_init(&page_data, CMIS_REVISION_PAGE,
+ CMIS_REVISION_OFFSET, sizeof(rpl));
+ page_data.data = (u8 *)&rpl;
+
+ err = ops->get_module_eeprom_by_page(dev, &page_data, &extack);
+ if (err < 0) {
+ if (extack._msg)
+ netdev_err(dev, "%s\n", extack._msg);
+ return err;
+ }
+
+ *rev_major = cmis_rev_rpl_major(&rpl);
+
+ return 0;
+}
+
+#define CMIS_CDB_ADVERTISEMENT_PAGE 0x01
+#define CMIS_CDB_ADVERTISEMENT_OFFSET 0xA3
+
+/* Based on section 8.4.11 "CDB Messaging Support Advertisement" in CMIS
+ * standard revision 5.2.
+ */
+struct cmis_cdb_advert_rpl {
+ u8 inst_supported;
+ u8 read_write_len_ext;
+ u8 resv1;
+ u8 resv2;
+};
+
+static u8 cmis_cdb_advert_rpl_inst_supported(struct cmis_cdb_advert_rpl *rpl)
+{
+ return rpl->inst_supported >> 6;
+}
+
+static int cmis_cdb_advertisement_get(struct ethtool_cmis_cdb *cdb,
+ struct net_device *dev,
+ struct ethnl_module_fw_flash_ntf_params *ntf_params)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct ethtool_module_eeprom page_data = {};
+ struct cmis_cdb_advert_rpl rpl = {};
+ struct netlink_ext_ack extack = {};
+ int err;
+
+ ethtool_cmis_page_init(&page_data, CMIS_CDB_ADVERTISEMENT_PAGE,
+ CMIS_CDB_ADVERTISEMENT_OFFSET, sizeof(rpl));
+ page_data.data = (u8 *)&rpl;
+
+ err = ops->get_module_eeprom_by_page(dev, &page_data, &extack);
+ if (err < 0) {
+ if (extack._msg)
+ netdev_err(dev, "%s\n", extack._msg);
+ return err;
+ }
+
+ if (!cmis_cdb_advert_rpl_inst_supported(&rpl)) {
+ ethnl_module_fw_flash_ntf_err(dev, ntf_params,
+ "CDB functionality is not supported",
+ NULL);
+ return -EOPNOTSUPP;
+ }
+
+ cdb->read_write_len_ext = rpl.read_write_len_ext;
+
+ return 0;
+}
+
+#define CMIS_PASSWORD_ENTRY_PAGE 0x00
+#define CMIS_PASSWORD_ENTRY_OFFSET 0x7A
+
+struct cmis_password_entry_pl {
+ __be32 password;
+};
+
+/* See section 9.3.1 "CMD 0000h: Query Status" in CMIS standard revision 5.2.
+ * struct cmis_cdb_query_status_pl and struct cmis_cdb_query_status_rpl are
+ * structured layouts of the flat arrays,
+ * struct ethtool_cmis_cdb_request::payload and
+ * struct ethtool_cmis_cdb_rpl::payload respectively.
+ */
+struct cmis_cdb_query_status_pl {
+ u16 response_delay;
+};
+
+struct cmis_cdb_query_status_rpl {
+ u8 length;
+ u8 status;
+};
+
+static int
+cmis_cdb_validate_password(struct ethtool_cmis_cdb *cdb,
+ struct net_device *dev,
+ const struct ethtool_module_fw_flash_params *params,
+ struct ethnl_module_fw_flash_ntf_params *ntf_params)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct cmis_cdb_query_status_pl qs_pl = {0};
+ struct ethtool_module_eeprom page_data = {};
+ struct ethtool_cmis_cdb_cmd_args args = {};
+ struct cmis_password_entry_pl pe_pl = {};
+ struct cmis_cdb_query_status_rpl *rpl;
+ struct netlink_ext_ack extack = {};
+ int err;
+
+ ethtool_cmis_page_init(&page_data, CMIS_PASSWORD_ENTRY_PAGE,
+ CMIS_PASSWORD_ENTRY_OFFSET, sizeof(pe_pl));
+ page_data.data = (u8 *)&pe_pl;
+
+ pe_pl = *((struct cmis_password_entry_pl *)page_data.data);
+ pe_pl.password = params->password;
+ err = ops->set_module_eeprom_by_page(dev, &page_data, &extack);
+ if (err < 0) {
+ if (extack._msg)
+ netdev_err(dev, "%s\n", extack._msg);
+ return err;
+ }
+
+ ethtool_cmis_cdb_compose_args(&args, ETHTOOL_CMIS_CDB_CMD_QUERY_STATUS,
+ (u8 *)&qs_pl, sizeof(qs_pl), NULL, 0, 0,
+ cdb->read_write_len_ext, 1000,
+ sizeof(*rpl),
+ CDB_F_COMPLETION_VALID | CDB_F_STATUS_VALID);
+
+ err = ethtool_cmis_cdb_execute_cmd(dev, &args);
+ if (err < 0) {
+ ethnl_module_fw_flash_ntf_err(dev, ntf_params,
+ "Query Status command failed",
+ args.err_msg);
+ return err;
+ }
+
+ rpl = (struct cmis_cdb_query_status_rpl *)args.req.payload;
+ if (!rpl->length || !rpl->status) {
+ ethnl_module_fw_flash_ntf_err(dev, ntf_params,
+ "Password was not accepted",
+ NULL);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/* Some CDB commands asserts the CDB completion flag only from CMIS
+ * revision 5. Therefore, check the relevant validity flag only when
+ * the revision supports it.
+ */
+void ethtool_cmis_cdb_check_completion_flag(u8 cmis_rev, u8 *flags)
+{
+ *flags |= cmis_rev >= 5 ? CDB_F_COMPLETION_VALID : 0;
+}
+
+#define CMIS_CDB_MODULE_FEATURES_RESV_DATA 34
+
+/* See section 9.4.1 "CMD 0040h: Module Features" in CMIS standard revision 5.2.
+ * struct cmis_cdb_module_features_rpl is structured layout of the flat
+ * array, ethtool_cmis_cdb_rpl::payload.
+ */
+struct cmis_cdb_module_features_rpl {
+ u8 resv1[CMIS_CDB_MODULE_FEATURES_RESV_DATA];
+ __be16 max_completion_time;
+};
+
+static u16
+cmis_cdb_module_features_completion_time(struct cmis_cdb_module_features_rpl *rpl)
+{
+ return be16_to_cpu(rpl->max_completion_time);
+}
+
+static int cmis_cdb_module_features_get(struct ethtool_cmis_cdb *cdb,
+ struct net_device *dev,
+ struct ethnl_module_fw_flash_ntf_params *ntf_params)
+{
+ struct ethtool_cmis_cdb_cmd_args args = {};
+ struct cmis_cdb_module_features_rpl *rpl;
+ u8 flags = CDB_F_STATUS_VALID;
+ int err;
+
+ ethtool_cmis_cdb_check_completion_flag(cdb->cmis_rev, &flags);
+ ethtool_cmis_cdb_compose_args(&args,
+ ETHTOOL_CMIS_CDB_CMD_MODULE_FEATURES,
+ NULL, 0, NULL, 0, 0,
+ cdb->read_write_len_ext, 1000,
+ sizeof(*rpl), flags);
+
+ err = ethtool_cmis_cdb_execute_cmd(dev, &args);
+ if (err < 0) {
+ ethnl_module_fw_flash_ntf_err(dev, ntf_params,
+ "Module Features command failed",
+ args.err_msg);
+ return err;
+ }
+
+ rpl = (struct cmis_cdb_module_features_rpl *)args.req.payload;
+ cdb->max_completion_time =
+ cmis_cdb_module_features_completion_time(rpl);
+
+ return 0;
+}
+
+struct ethtool_cmis_cdb *
+ethtool_cmis_cdb_init(struct net_device *dev,
+ const struct ethtool_module_fw_flash_params *params,
+ struct ethnl_module_fw_flash_ntf_params *ntf_params)
+{
+ struct ethtool_cmis_cdb *cdb;
+ int err;
+
+ cdb = kzalloc(sizeof(*cdb), GFP_KERNEL);
+ if (!cdb)
+ return ERR_PTR(-ENOMEM);
+
+ err = cmis_rev_major_get(dev, &cdb->cmis_rev);
+ if (err < 0)
+ goto err;
+
+ if (cdb->cmis_rev < 4) {
+ ethnl_module_fw_flash_ntf_err(dev, ntf_params,
+ "CMIS revision doesn't support module firmware flashing",
+ NULL);
+ err = -EOPNOTSUPP;
+ goto err;
+ }
+
+ err = cmis_cdb_advertisement_get(cdb, dev, ntf_params);
+ if (err < 0)
+ goto err;
+
+ if (params->password_valid) {
+ err = cmis_cdb_validate_password(cdb, dev, params, ntf_params);
+ if (err < 0)
+ goto err;
+ }
+
+ err = cmis_cdb_module_features_get(cdb, dev, ntf_params);
+ if (err < 0)
+ goto err;
+
+ return cdb;
+
+err:
+ ethtool_cmis_cdb_fini(cdb);
+ return ERR_PTR(err);
+}
+
+void ethtool_cmis_cdb_fini(struct ethtool_cmis_cdb *cdb)
+{
+ kfree(cdb);
+}
+
+static bool is_completed(u8 data)
+{
+ return !!(data & 0x40);
+}
+
+#define CMIS_CDB_STATUS_SUCCESS 0x01
+
+static bool status_success(u8 data)
+{
+ return data == CMIS_CDB_STATUS_SUCCESS;
+}
+
+#define CMIS_CDB_STATUS_FAIL 0x40
+
+static bool status_fail(u8 data)
+{
+ return data & CMIS_CDB_STATUS_FAIL;
+}
+
+struct cmis_wait_for_cond_rpl {
+ u8 state;
+};
+
+static int
+ethtool_cmis_module_poll(struct net_device *dev,
+ struct cmis_wait_for_cond_rpl *rpl, u32 offset,
+ bool (*cond_success)(u8), bool (*cond_fail)(u8))
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct ethtool_module_eeprom page_data = {0};
+ struct netlink_ext_ack extack = {};
+ int err;
+
+ ethtool_cmis_page_init(&page_data, 0, offset, sizeof(*rpl));
+ page_data.data = (u8 *)rpl;
+
+ err = ops->get_module_eeprom_by_page(dev, &page_data, &extack);
+ if (err < 0) {
+ if (extack._msg)
+ netdev_err_once(dev, "%s\n", extack._msg);
+ return -EBUSY;
+ }
+
+ if ((*cond_success)(rpl->state))
+ return 0;
+
+ if (*cond_fail && (*cond_fail)(rpl->state))
+ return -EIO;
+
+ return -EBUSY;
+}
+
+int ethtool_cmis_wait_for_cond(struct net_device *dev, u8 flags, u8 flag,
+ u16 max_duration, u32 offset,
+ bool (*cond_success)(u8), bool (*cond_fail)(u8),
+ u8 *state)
+{
+ struct cmis_wait_for_cond_rpl rpl = {};
+ unsigned long end;
+ int err;
+
+ if (!(flags & flag))
+ return 0;
+
+ if (max_duration == 0)
+ max_duration = U16_MAX;
+
+ end = jiffies + msecs_to_jiffies(max_duration);
+ do {
+ err = ethtool_cmis_module_poll(dev, &rpl, offset, cond_success,
+ cond_fail);
+ if (err != -EBUSY)
+ goto out;
+
+ msleep(20);
+ } while (time_before(jiffies, end));
+
+ err = ethtool_cmis_module_poll(dev, &rpl, offset, cond_success,
+ cond_fail);
+ if (err == -EBUSY)
+ err = -ETIMEDOUT;
+
+out:
+ *state = rpl.state;
+ return err;
+}
+
+#define CMIS_CDB_COMPLETION_FLAG_OFFSET 0x08
+
+static int cmis_cdb_wait_for_completion(struct net_device *dev,
+ struct ethtool_cmis_cdb_cmd_args *args)
+{
+ u8 flag;
+ int err;
+
+ /* Some vendors demand waiting time before checking completion flag
+ * in some CDB commands.
+ */
+ msleep(args->msleep_pre_rpl);
+
+ err = ethtool_cmis_wait_for_cond(dev, args->flags,
+ CDB_F_COMPLETION_VALID,
+ args->max_duration,
+ CMIS_CDB_COMPLETION_FLAG_OFFSET,
+ is_completed, NULL, &flag);
+ if (err < 0)
+ args->err_msg = "Completion Flag did not set on time";
+
+ return err;
+}
+
+#define CMIS_CDB_STATUS_OFFSET 0x25
+
+static void cmis_cdb_status_fail_msg_get(u8 status, char **err_msg)
+{
+ switch (status) {
+ case 0b10000001:
+ *err_msg = "CDB Status is in progress: Busy capturing command";
+ break;
+ case 0b10000010:
+ *err_msg =
+ "CDB Status is in progress: Busy checking/validating command";
+ break;
+ case 0b10000011:
+ *err_msg = "CDB Status is in progress: Busy executing";
+ break;
+ case 0b01000000:
+ *err_msg = "CDB status failed: no specific failure";
+ break;
+ case 0b01000010:
+ *err_msg =
+ "CDB status failed: Parameter range error or parameter not supported";
+ break;
+ case 0b01000101:
+ *err_msg = "CDB status failed: CdbChkCode error";
+ break;
+ case 0b01000110:
+ *err_msg = "CDB status failed: Password error";
+ break;
+ default:
+ *err_msg = "Unknown failure reason";
+ }
+};
+
+static int cmis_cdb_wait_for_status(struct net_device *dev,
+ struct ethtool_cmis_cdb_cmd_args *args)
+{
+ u8 status;
+ int err;
+
+ /* Some vendors demand waiting time before checking status in some
+ * CDB commands.
+ */
+ msleep(args->msleep_pre_rpl);
+
+ err = ethtool_cmis_wait_for_cond(dev, args->flags, CDB_F_STATUS_VALID,
+ args->max_duration,
+ CMIS_CDB_STATUS_OFFSET,
+ status_success, status_fail, &status);
+ if (err < 0 && !args->err_msg)
+ cmis_cdb_status_fail_msg_get(status, &args->err_msg);
+
+ return err;
+}
+
+#define CMIS_CDB_REPLY_OFFSET 0x86
+
+static int cmis_cdb_process_reply(struct net_device *dev,
+ struct ethtool_module_eeprom *page_data,
+ struct ethtool_cmis_cdb_cmd_args *args)
+{
+ u8 rpl_hdr_len = sizeof(struct ethtool_cmis_cdb_rpl_hdr);
+ u8 rpl_exp_len = args->rpl_exp_len + rpl_hdr_len;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct netlink_ext_ack extack = {};
+ struct ethtool_cmis_cdb_rpl *rpl;
+ int err;
+
+ if (!args->rpl_exp_len)
+ return 0;
+
+ ethtool_cmis_page_init(page_data, ETHTOOL_CMIS_CDB_CMD_PAGE,
+ CMIS_CDB_REPLY_OFFSET, rpl_exp_len);
+ page_data->data = kmalloc(page_data->length, GFP_KERNEL);
+ if (!page_data->data)
+ return -ENOMEM;
+
+ err = ops->get_module_eeprom_by_page(dev, page_data, &extack);
+ if (err < 0) {
+ if (extack._msg)
+ netdev_err(dev, "%s\n", extack._msg);
+ goto out;
+ }
+
+ rpl = (struct ethtool_cmis_cdb_rpl *)page_data->data;
+ if ((args->rpl_exp_len > rpl->hdr.rpl_len + rpl_hdr_len) ||
+ !rpl->hdr.rpl_chk_code) {
+ err = -EIO;
+ goto out;
+ }
+
+ args->req.lpl_len = rpl->hdr.rpl_len;
+ memcpy(args->req.payload, rpl->payload, args->req.lpl_len);
+
+out:
+ kfree(page_data->data);
+ return err;
+}
+
+static int
+__ethtool_cmis_cdb_execute_cmd(struct net_device *dev,
+ struct ethtool_module_eeprom *page_data,
+ u8 page, u32 offset, u32 length, void *data)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct netlink_ext_ack extack = {};
+ int err;
+
+ ethtool_cmis_page_init(page_data, page, offset, length);
+ page_data->data = kmemdup(data, page_data->length, GFP_KERNEL);
+ if (!page_data->data)
+ return -ENOMEM;
+
+ err = ops->set_module_eeprom_by_page(dev, page_data, &extack);
+ if (err < 0) {
+ if (extack._msg)
+ netdev_err(dev, "%s\n", extack._msg);
+ }
+
+ kfree(page_data->data);
+ return err;
+}
+
+#define CMIS_CDB_EPL_PAGE_START 0xA0
+#define CMIS_CDB_EPL_PAGE_END 0xAF
+#define CMIS_CDB_EPL_FW_BLOCK_OFFSET_START 128
+#define CMIS_CDB_EPL_FW_BLOCK_OFFSET_END 255
+
+static int
+ethtool_cmis_cdb_execute_epl_cmd(struct net_device *dev,
+ struct ethtool_cmis_cdb_cmd_args *args,
+ struct ethtool_module_eeprom *page_data)
+{
+ u16 epl_len = be16_to_cpu(args->req.epl_len);
+ u32 bytes_written = 0;
+ u8 page;
+ int err;
+
+ for (page = CMIS_CDB_EPL_PAGE_START;
+ page <= CMIS_CDB_EPL_PAGE_END && bytes_written < epl_len; page++) {
+ u16 offset = CMIS_CDB_EPL_FW_BLOCK_OFFSET_START;
+
+ while (offset <= CMIS_CDB_EPL_FW_BLOCK_OFFSET_END &&
+ bytes_written < epl_len) {
+ u32 bytes_left = epl_len - bytes_written;
+ u16 space_left, bytes_to_write;
+
+ space_left = CMIS_CDB_EPL_FW_BLOCK_OFFSET_END - offset + 1;
+ bytes_to_write = min_t(u16, bytes_left,
+ min_t(u16, space_left,
+ args->read_write_len_ext));
+
+ err = __ethtool_cmis_cdb_execute_cmd(dev, page_data,
+ page, offset,
+ bytes_to_write,
+ args->req.epl + bytes_written);
+ if (err < 0)
+ return err;
+
+ offset += bytes_to_write;
+ bytes_written += bytes_to_write;
+ }
+ }
+ return 0;
+}
+
+static u8 cmis_cdb_calc_checksum(const void *data, size_t size)
+{
+ const u8 *bytes = (const u8 *)data;
+ u8 checksum = 0;
+
+ for (size_t i = 0; i < size; i++)
+ checksum += bytes[i];
+
+ return ~checksum;
+}
+
+#define CMIS_CDB_CMD_ID_OFFSET 0x80
+
+int ethtool_cmis_cdb_execute_cmd(struct net_device *dev,
+ struct ethtool_cmis_cdb_cmd_args *args)
+{
+ struct ethtool_module_eeprom page_data = {};
+ u32 offset;
+ int err;
+
+ args->req.chk_code =
+ cmis_cdb_calc_checksum(&args->req,
+ offsetof(struct ethtool_cmis_cdb_request,
+ epl));
+
+ if (args->req.lpl_len > args->read_write_len_ext) {
+ args->err_msg = "LPL length is longer than CDB read write length extension allows";
+ return -EINVAL;
+ }
+
+ /* According to the CMIS standard, there are two options to trigger the
+ * CDB commands. The default option is triggering the command by writing
+ * the CMDID bytes. Therefore, the command will be split to 2 calls:
+ * First, with everything except the CMDID field and then the CMDID
+ * field.
+ */
+ offset = CMIS_CDB_CMD_ID_OFFSET +
+ offsetof(struct ethtool_cmis_cdb_request, body);
+ err = __ethtool_cmis_cdb_execute_cmd(dev, &page_data,
+ ETHTOOL_CMIS_CDB_CMD_PAGE, offset,
+ sizeof(args->req.body),
+ &args->req.body);
+ if (err < 0)
+ return err;
+
+ if (args->req.epl_len) {
+ err = ethtool_cmis_cdb_execute_epl_cmd(dev, args, &page_data);
+ if (err < 0)
+ return err;
+ }
+
+ offset = CMIS_CDB_CMD_ID_OFFSET +
+ offsetof(struct ethtool_cmis_cdb_request, id);
+ err = __ethtool_cmis_cdb_execute_cmd(dev, &page_data,
+ ETHTOOL_CMIS_CDB_CMD_PAGE, offset,
+ sizeof(args->req.id),
+ &args->req.id);
+ if (err < 0)
+ return err;
+
+ err = cmis_cdb_wait_for_completion(dev, args);
+ if (err < 0)
+ return err;
+
+ err = cmis_cdb_wait_for_status(dev, args);
+ if (err < 0)
+ return err;
+
+ return cmis_cdb_process_reply(dev, &page_data, args);
+}
diff --git a/net/ethtool/cmis_fw_update.c b/net/ethtool/cmis_fw_update.c
new file mode 100644
index 000000000000..df5f344209c4
--- /dev/null
+++ b/net/ethtool/cmis_fw_update.c
@@ -0,0 +1,485 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/ethtool.h>
+#include <linux/firmware.h>
+#include <net/netdev_lock.h>
+
+#include "common.h"
+#include "module_fw.h"
+#include "cmis.h"
+
+struct cmis_fw_update_fw_mng_features {
+ u8 start_cmd_payload_size;
+ u8 write_mechanism;
+ u16 max_duration_start;
+ u16 max_duration_write;
+ u16 max_duration_complete;
+};
+
+/* See section 9.4.2 "CMD 0041h: Firmware Management Features" in CMIS standard
+ * revision 5.2.
+ * struct cmis_cdb_fw_mng_features_rpl is a structured layout of the flat
+ * array, ethtool_cmis_cdb_rpl::payload.
+ */
+struct cmis_cdb_fw_mng_features_rpl {
+ u8 resv1;
+ u8 resv2;
+ u8 start_cmd_payload_size;
+ u8 resv3;
+ u8 read_write_len_ext;
+ u8 write_mechanism;
+ u8 resv4;
+ u8 resv5;
+ __be16 max_duration_start;
+ __be16 resv6;
+ __be16 max_duration_write;
+ __be16 max_duration_complete;
+ __be16 resv7;
+};
+
+enum cmis_cdb_fw_write_mechanism {
+ CMIS_CDB_FW_WRITE_MECHANISM_NONE = 0x00,
+ CMIS_CDB_FW_WRITE_MECHANISM_LPL = 0x01,
+ CMIS_CDB_FW_WRITE_MECHANISM_EPL = 0x10,
+ CMIS_CDB_FW_WRITE_MECHANISM_BOTH = 0x11,
+};
+
+static int
+cmis_fw_update_fw_mng_features_get(struct ethtool_cmis_cdb *cdb,
+ struct net_device *dev,
+ struct cmis_fw_update_fw_mng_features *fw_mng,
+ struct ethnl_module_fw_flash_ntf_params *ntf_params)
+{
+ struct ethtool_cmis_cdb_cmd_args args = {};
+ struct cmis_cdb_fw_mng_features_rpl *rpl;
+ u8 flags = CDB_F_STATUS_VALID;
+ int err;
+
+ ethtool_cmis_cdb_check_completion_flag(cdb->cmis_rev, &flags);
+ ethtool_cmis_cdb_compose_args(&args,
+ ETHTOOL_CMIS_CDB_CMD_FW_MANAGMENT_FEATURES,
+ NULL, 0, NULL, 0,
+ cdb->max_completion_time,
+ cdb->read_write_len_ext, 1000,
+ sizeof(*rpl), flags);
+
+ err = ethtool_cmis_cdb_execute_cmd(dev, &args);
+ if (err < 0) {
+ ethnl_module_fw_flash_ntf_err(dev, ntf_params,
+ "FW Management Features command failed",
+ args.err_msg);
+ return err;
+ }
+
+ rpl = (struct cmis_cdb_fw_mng_features_rpl *)args.req.payload;
+ if (rpl->write_mechanism == CMIS_CDB_FW_WRITE_MECHANISM_NONE) {
+ ethnl_module_fw_flash_ntf_err(dev, ntf_params,
+ "CDB write mechanism is not supported",
+ NULL);
+ return -EOPNOTSUPP;
+ }
+
+ /* Above, we used read_write_len_ext that we got from CDB
+ * advertisement. Update it with the value that we got from module
+ * features query, which is specific for Firmware Management Commands
+ * (IDs 0100h-01FFh).
+ */
+ cdb->read_write_len_ext = rpl->read_write_len_ext;
+ fw_mng->start_cmd_payload_size = rpl->start_cmd_payload_size;
+ fw_mng->write_mechanism =
+ rpl->write_mechanism == CMIS_CDB_FW_WRITE_MECHANISM_LPL ?
+ CMIS_CDB_FW_WRITE_MECHANISM_LPL :
+ CMIS_CDB_FW_WRITE_MECHANISM_EPL;
+ fw_mng->max_duration_start = be16_to_cpu(rpl->max_duration_start);
+ fw_mng->max_duration_write = be16_to_cpu(rpl->max_duration_write);
+ fw_mng->max_duration_complete = be16_to_cpu(rpl->max_duration_complete);
+
+ return 0;
+}
+
+/* See section 9.7.2 "CMD 0101h: Start Firmware Download" in CMIS standard
+ * revision 5.2.
+ * struct cmis_cdb_start_fw_download_pl is a structured layout of the
+ * flat array, ethtool_cmis_cdb_request::payload.
+ */
+struct cmis_cdb_start_fw_download_pl {
+ __struct_group(cmis_cdb_start_fw_download_pl_h, head, /* no attrs */,
+ __be32 image_size;
+ __be32 resv1;
+ );
+ u8 vendor_data[ETHTOOL_CMIS_CDB_LPL_MAX_PL_LENGTH -
+ sizeof(struct cmis_cdb_start_fw_download_pl_h)];
+};
+
+static int
+cmis_fw_update_start_download(struct ethtool_cmis_cdb *cdb,
+ struct ethtool_cmis_fw_update_params *fw_update,
+ struct cmis_fw_update_fw_mng_features *fw_mng)
+{
+ u8 vendor_data_size = fw_mng->start_cmd_payload_size;
+ struct cmis_cdb_start_fw_download_pl pl = {};
+ struct ethtool_cmis_cdb_cmd_args args = {};
+ u8 lpl_len;
+ int err;
+
+ pl.image_size = cpu_to_be32(fw_update->fw->size);
+ memcpy(pl.vendor_data, fw_update->fw->data, vendor_data_size);
+
+ lpl_len = offsetof(struct cmis_cdb_start_fw_download_pl,
+ vendor_data[vendor_data_size]);
+
+ ethtool_cmis_cdb_compose_args(&args,
+ ETHTOOL_CMIS_CDB_CMD_START_FW_DOWNLOAD,
+ (u8 *)&pl, lpl_len, NULL, 0,
+ fw_mng->max_duration_start,
+ cdb->read_write_len_ext, 1000, 0,
+ CDB_F_COMPLETION_VALID | CDB_F_STATUS_VALID);
+
+ err = ethtool_cmis_cdb_execute_cmd(fw_update->dev, &args);
+ if (err < 0)
+ ethnl_module_fw_flash_ntf_err(fw_update->dev,
+ &fw_update->ntf_params,
+ "Start FW download command failed",
+ args.err_msg);
+
+ return err;
+}
+
+/* See section 9.7.4 "CMD 0103h: Write Firmware Block LPL" in CMIS standard
+ * revision 5.2.
+ * struct cmis_cdb_write_fw_block_lpl_pl is a structured layout of the
+ * flat array, ethtool_cmis_cdb_request::payload.
+ */
+struct cmis_cdb_write_fw_block_lpl_pl {
+ __be32 block_address;
+ u8 fw_block[ETHTOOL_CMIS_CDB_LPL_MAX_PL_LENGTH - sizeof(__be32)];
+};
+
+static int
+cmis_fw_update_write_image_lpl(struct ethtool_cmis_cdb *cdb,
+ struct ethtool_cmis_fw_update_params *fw_update,
+ struct cmis_fw_update_fw_mng_features *fw_mng)
+{
+ u8 start = fw_mng->start_cmd_payload_size;
+ u32 offset, max_block_size, max_lpl_len;
+ u32 image_size = fw_update->fw->size;
+ int err;
+
+ max_lpl_len = min_t(u32,
+ ethtool_cmis_get_max_lpl_size(cdb->read_write_len_ext),
+ ETHTOOL_CMIS_CDB_LPL_MAX_PL_LENGTH);
+ max_block_size =
+ max_lpl_len - sizeof_field(struct cmis_cdb_write_fw_block_lpl_pl,
+ block_address);
+
+ for (offset = start; offset < image_size; offset += max_block_size) {
+ struct cmis_cdb_write_fw_block_lpl_pl pl = {
+ .block_address = cpu_to_be32(offset - start),
+ };
+ struct ethtool_cmis_cdb_cmd_args args = {};
+ u32 block_size, lpl_len;
+
+ ethnl_module_fw_flash_ntf_in_progress(fw_update->dev,
+ &fw_update->ntf_params,
+ offset - start,
+ image_size);
+ block_size = min_t(u32, max_block_size, image_size - offset);
+ memcpy(pl.fw_block, &fw_update->fw->data[offset], block_size);
+ lpl_len = block_size +
+ sizeof_field(struct cmis_cdb_write_fw_block_lpl_pl,
+ block_address);
+
+ ethtool_cmis_cdb_compose_args(&args,
+ ETHTOOL_CMIS_CDB_CMD_WRITE_FW_BLOCK_LPL,
+ (u8 *)&pl, lpl_len, NULL, 0,
+ fw_mng->max_duration_write,
+ cdb->read_write_len_ext, 1, 0,
+ CDB_F_COMPLETION_VALID | CDB_F_STATUS_VALID);
+
+ err = ethtool_cmis_cdb_execute_cmd(fw_update->dev, &args);
+ if (err < 0) {
+ ethnl_module_fw_flash_ntf_err(fw_update->dev,
+ &fw_update->ntf_params,
+ "Write FW block LPL command failed",
+ args.err_msg);
+ return err;
+ }
+ }
+
+ return 0;
+}
+
+struct cmis_cdb_write_fw_block_epl_pl {
+ u8 fw_block[ETHTOOL_CMIS_CDB_EPL_MAX_PL_LENGTH];
+};
+
+static int
+cmis_fw_update_write_image_epl(struct ethtool_cmis_cdb *cdb,
+ struct ethtool_cmis_fw_update_params *fw_update,
+ struct cmis_fw_update_fw_mng_features *fw_mng)
+{
+ u8 start = fw_mng->start_cmd_payload_size;
+ u32 image_size = fw_update->fw->size;
+ u32 offset, lpl_len;
+ int err;
+
+ lpl_len = sizeof_field(struct cmis_cdb_write_fw_block_lpl_pl,
+ block_address);
+
+ for (offset = start; offset < image_size;
+ offset += ETHTOOL_CMIS_CDB_EPL_MAX_PL_LENGTH) {
+ struct cmis_cdb_write_fw_block_lpl_pl lpl = {
+ .block_address = cpu_to_be32(offset - start),
+ };
+ struct cmis_cdb_write_fw_block_epl_pl *epl;
+ struct ethtool_cmis_cdb_cmd_args args = {};
+ u32 epl_len;
+
+ ethnl_module_fw_flash_ntf_in_progress(fw_update->dev,
+ &fw_update->ntf_params,
+ offset - start,
+ image_size);
+
+ epl_len = min_t(u32, ETHTOOL_CMIS_CDB_EPL_MAX_PL_LENGTH,
+ image_size - offset);
+ epl = kmalloc_array(epl_len, sizeof(u8), GFP_KERNEL);
+ if (!epl)
+ return -ENOMEM;
+
+ memcpy(epl->fw_block, &fw_update->fw->data[offset], epl_len);
+
+ ethtool_cmis_cdb_compose_args(&args,
+ ETHTOOL_CMIS_CDB_CMD_WRITE_FW_BLOCK_EPL,
+ (u8 *)&lpl, lpl_len, (u8 *)epl,
+ epl_len,
+ fw_mng->max_duration_write,
+ cdb->read_write_len_ext, 1, 0,
+ CDB_F_COMPLETION_VALID | CDB_F_STATUS_VALID);
+
+ err = ethtool_cmis_cdb_execute_cmd(fw_update->dev, &args);
+ kfree(epl);
+ if (err < 0) {
+ ethnl_module_fw_flash_ntf_err(fw_update->dev,
+ &fw_update->ntf_params,
+ "Write FW block EPL command failed",
+ args.err_msg);
+ return err;
+ }
+ }
+
+ return 0;
+}
+
+static int
+cmis_fw_update_complete_download(struct ethtool_cmis_cdb *cdb,
+ struct net_device *dev,
+ struct cmis_fw_update_fw_mng_features *fw_mng,
+ struct ethnl_module_fw_flash_ntf_params *ntf_params)
+{
+ struct ethtool_cmis_cdb_cmd_args args = {};
+ int err;
+
+ ethtool_cmis_cdb_compose_args(&args,
+ ETHTOOL_CMIS_CDB_CMD_COMPLETE_FW_DOWNLOAD,
+ NULL, 0, NULL, 0,
+ fw_mng->max_duration_complete,
+ cdb->read_write_len_ext, 1000, 0,
+ CDB_F_COMPLETION_VALID | CDB_F_STATUS_VALID);
+
+ err = ethtool_cmis_cdb_execute_cmd(dev, &args);
+ if (err < 0)
+ ethnl_module_fw_flash_ntf_err(dev, ntf_params,
+ "Complete FW download command failed",
+ args.err_msg);
+
+ return err;
+}
+
+static int
+cmis_fw_update_download_image(struct ethtool_cmis_cdb *cdb,
+ struct ethtool_cmis_fw_update_params *fw_update,
+ struct cmis_fw_update_fw_mng_features *fw_mng)
+{
+ int err;
+
+ err = cmis_fw_update_start_download(cdb, fw_update, fw_mng);
+ if (err < 0)
+ return err;
+
+ if (fw_mng->write_mechanism == CMIS_CDB_FW_WRITE_MECHANISM_LPL) {
+ err = cmis_fw_update_write_image_lpl(cdb, fw_update, fw_mng);
+ if (err < 0)
+ return err;
+ } else {
+ err = cmis_fw_update_write_image_epl(cdb, fw_update, fw_mng);
+ if (err < 0)
+ return err;
+ }
+
+ err = cmis_fw_update_complete_download(cdb, fw_update->dev, fw_mng,
+ &fw_update->ntf_params);
+ if (err < 0)
+ return err;
+
+ return 0;
+}
+
+enum {
+ CMIS_MODULE_LOW_PWR = 1,
+ CMIS_MODULE_READY = 3,
+};
+
+static bool module_is_ready(u8 data)
+{
+ u8 state = (data >> 1) & 7;
+
+ return state == CMIS_MODULE_READY || state == CMIS_MODULE_LOW_PWR;
+}
+
+#define CMIS_MODULE_READY_MAX_DURATION_MSEC 1000
+#define CMIS_MODULE_STATE_OFFSET 3
+
+static int
+cmis_fw_update_wait_for_module_state(struct net_device *dev, u8 flags)
+{
+ u8 state;
+
+ return ethtool_cmis_wait_for_cond(dev, flags, CDB_F_MODULE_STATE_VALID,
+ CMIS_MODULE_READY_MAX_DURATION_MSEC,
+ CMIS_MODULE_STATE_OFFSET,
+ module_is_ready, NULL, &state);
+}
+
+/* See section 9.7.10 "CMD 0109h: Run Firmware Image" in CMIS standard
+ * revision 5.2.
+ * struct cmis_cdb_run_fw_image_pl is a structured layout of the flat
+ * array, ethtool_cmis_cdb_request::payload.
+ */
+struct cmis_cdb_run_fw_image_pl {
+ u8 resv1;
+ u8 image_to_run;
+ u16 delay_to_reset;
+};
+
+static int
+cmis_fw_update_run_image(struct ethtool_cmis_cdb *cdb, struct net_device *dev,
+ struct ethnl_module_fw_flash_ntf_params *ntf_params)
+{
+ struct ethtool_cmis_cdb_cmd_args args = {};
+ struct cmis_cdb_run_fw_image_pl pl = {0};
+ int err;
+
+ ethtool_cmis_cdb_compose_args(&args, ETHTOOL_CMIS_CDB_CMD_RUN_FW_IMAGE,
+ (u8 *)&pl, sizeof(pl), NULL, 0,
+ cdb->max_completion_time,
+ cdb->read_write_len_ext, 1000, 0,
+ CDB_F_MODULE_STATE_VALID);
+
+ err = ethtool_cmis_cdb_execute_cmd(dev, &args);
+ if (err < 0) {
+ ethnl_module_fw_flash_ntf_err(dev, ntf_params,
+ "Run image command failed",
+ args.err_msg);
+ return err;
+ }
+
+ err = cmis_fw_update_wait_for_module_state(dev, args.flags);
+ if (err < 0)
+ ethnl_module_fw_flash_ntf_err(dev, ntf_params,
+ "Module is not ready on time after reset",
+ NULL);
+
+ return err;
+}
+
+static int
+cmis_fw_update_commit_image(struct ethtool_cmis_cdb *cdb,
+ struct net_device *dev,
+ struct ethnl_module_fw_flash_ntf_params *ntf_params)
+{
+ struct ethtool_cmis_cdb_cmd_args args = {};
+ int err;
+
+ ethtool_cmis_cdb_compose_args(&args,
+ ETHTOOL_CMIS_CDB_CMD_COMMIT_FW_IMAGE,
+ NULL, 0, NULL, 0,
+ cdb->max_completion_time,
+ cdb->read_write_len_ext, 1000, 0,
+ CDB_F_COMPLETION_VALID | CDB_F_STATUS_VALID);
+
+ err = ethtool_cmis_cdb_execute_cmd(dev, &args);
+ if (err < 0)
+ ethnl_module_fw_flash_ntf_err(dev, ntf_params,
+ "Commit image command failed",
+ args.err_msg);
+
+ return err;
+}
+
+static int cmis_fw_update_reset(struct net_device *dev)
+{
+ __u32 reset_data = ETH_RESET_PHY;
+ int ret;
+
+ netdev_lock_ops(dev);
+ ret = dev->ethtool_ops->reset(dev, &reset_data);
+ netdev_unlock_ops(dev);
+
+ return ret;
+}
+
+void
+ethtool_cmis_fw_update(struct ethtool_cmis_fw_update_params *fw_update)
+{
+ struct ethnl_module_fw_flash_ntf_params *ntf_params =
+ &fw_update->ntf_params;
+ struct cmis_fw_update_fw_mng_features fw_mng = {0};
+ struct net_device *dev = fw_update->dev;
+ struct ethtool_cmis_cdb *cdb;
+ int err;
+
+ cdb = ethtool_cmis_cdb_init(dev, &fw_update->params, ntf_params);
+ if (IS_ERR(cdb))
+ goto err_send_ntf;
+
+ ethnl_module_fw_flash_ntf_start(dev, ntf_params);
+
+ err = cmis_fw_update_fw_mng_features_get(cdb, dev, &fw_mng, ntf_params);
+ if (err < 0)
+ goto err_cdb_fini;
+
+ err = cmis_fw_update_download_image(cdb, fw_update, &fw_mng);
+ if (err < 0)
+ goto err_cdb_fini;
+
+ err = cmis_fw_update_run_image(cdb, dev, ntf_params);
+ if (err < 0)
+ goto err_cdb_fini;
+
+ /* The CDB command "Run Firmware Image" resets the firmware, so the new
+ * one might have different settings.
+ * Free the old CDB instance, and init a new one.
+ */
+ ethtool_cmis_cdb_fini(cdb);
+
+ cdb = ethtool_cmis_cdb_init(dev, &fw_update->params, ntf_params);
+ if (IS_ERR(cdb))
+ goto err_send_ntf;
+
+ err = cmis_fw_update_commit_image(cdb, dev, ntf_params);
+ if (err < 0)
+ goto err_cdb_fini;
+
+ err = cmis_fw_update_reset(dev);
+ if (err < 0)
+ goto err_cdb_fini;
+
+ ethnl_module_fw_flash_ntf_complete(dev, ntf_params);
+ ethtool_cmis_cdb_fini(cdb);
+ return;
+
+err_cdb_fini:
+ ethtool_cmis_cdb_fini(cdb);
+err_send_ntf:
+ ethnl_module_fw_flash_ntf_err(dev, ntf_params, NULL, NULL);
+}
diff --git a/net/ethtool/coalesce.c b/net/ethtool/coalesce.c
new file mode 100644
index 000000000000..3e18ca1ccc5e
--- /dev/null
+++ b/net/ethtool/coalesce.c
@@ -0,0 +1,649 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/dim.h>
+#include "netlink.h"
+#include "common.h"
+
+struct coalesce_req_info {
+ struct ethnl_req_info base;
+};
+
+struct coalesce_reply_data {
+ struct ethnl_reply_data base;
+ struct ethtool_coalesce coalesce;
+ struct kernel_ethtool_coalesce kernel_coalesce;
+ u32 supported_params;
+};
+
+#define COALESCE_REPDATA(__reply_base) \
+ container_of(__reply_base, struct coalesce_reply_data, base)
+
+#define __SUPPORTED_OFFSET ETHTOOL_A_COALESCE_RX_USECS
+static u32 attr_to_mask(unsigned int attr_type)
+{
+ return BIT(attr_type - __SUPPORTED_OFFSET);
+}
+
+/* build time check that indices in ethtool_ops::supported_coalesce_params
+ * match corresponding attribute types with an offset
+ */
+#define __CHECK_SUPPORTED_OFFSET(x) \
+ static_assert((ETHTOOL_ ## x) == \
+ BIT((ETHTOOL_A_ ## x) - __SUPPORTED_OFFSET))
+__CHECK_SUPPORTED_OFFSET(COALESCE_RX_USECS);
+__CHECK_SUPPORTED_OFFSET(COALESCE_RX_MAX_FRAMES);
+__CHECK_SUPPORTED_OFFSET(COALESCE_RX_USECS_IRQ);
+__CHECK_SUPPORTED_OFFSET(COALESCE_RX_MAX_FRAMES_IRQ);
+__CHECK_SUPPORTED_OFFSET(COALESCE_TX_USECS);
+__CHECK_SUPPORTED_OFFSET(COALESCE_TX_MAX_FRAMES);
+__CHECK_SUPPORTED_OFFSET(COALESCE_TX_USECS_IRQ);
+__CHECK_SUPPORTED_OFFSET(COALESCE_TX_MAX_FRAMES_IRQ);
+__CHECK_SUPPORTED_OFFSET(COALESCE_STATS_BLOCK_USECS);
+__CHECK_SUPPORTED_OFFSET(COALESCE_USE_ADAPTIVE_RX);
+__CHECK_SUPPORTED_OFFSET(COALESCE_USE_ADAPTIVE_TX);
+__CHECK_SUPPORTED_OFFSET(COALESCE_PKT_RATE_LOW);
+__CHECK_SUPPORTED_OFFSET(COALESCE_RX_USECS_LOW);
+__CHECK_SUPPORTED_OFFSET(COALESCE_RX_MAX_FRAMES_LOW);
+__CHECK_SUPPORTED_OFFSET(COALESCE_TX_USECS_LOW);
+__CHECK_SUPPORTED_OFFSET(COALESCE_TX_MAX_FRAMES_LOW);
+__CHECK_SUPPORTED_OFFSET(COALESCE_PKT_RATE_HIGH);
+__CHECK_SUPPORTED_OFFSET(COALESCE_RX_USECS_HIGH);
+__CHECK_SUPPORTED_OFFSET(COALESCE_RX_MAX_FRAMES_HIGH);
+__CHECK_SUPPORTED_OFFSET(COALESCE_TX_USECS_HIGH);
+__CHECK_SUPPORTED_OFFSET(COALESCE_TX_MAX_FRAMES_HIGH);
+__CHECK_SUPPORTED_OFFSET(COALESCE_RATE_SAMPLE_INTERVAL);
+
+const struct nla_policy ethnl_coalesce_get_policy[] = {
+ [ETHTOOL_A_COALESCE_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
+};
+
+static int coalesce_prepare_data(const struct ethnl_req_info *req_base,
+ struct ethnl_reply_data *reply_base,
+ const struct genl_info *info)
+{
+ struct coalesce_reply_data *data = COALESCE_REPDATA(reply_base);
+ struct net_device *dev = reply_base->dev;
+ int ret;
+
+ if (!dev->ethtool_ops->get_coalesce)
+ return -EOPNOTSUPP;
+ data->supported_params = dev->ethtool_ops->supported_coalesce_params;
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ return ret;
+ ret = dev->ethtool_ops->get_coalesce(dev, &data->coalesce,
+ &data->kernel_coalesce,
+ info->extack);
+ ethnl_ops_complete(dev);
+
+ return ret;
+}
+
+static int coalesce_reply_size(const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ int modersz = nla_total_size(0) + /* _PROFILE_IRQ_MODERATION, nest */
+ nla_total_size(sizeof(u32)) + /* _IRQ_MODERATION_USEC */
+ nla_total_size(sizeof(u32)) + /* _IRQ_MODERATION_PKTS */
+ nla_total_size(sizeof(u32)); /* _IRQ_MODERATION_COMPS */
+
+ int total_modersz = nla_total_size(0) + /* _{R,T}X_PROFILE, nest */
+ modersz * NET_DIM_PARAMS_NUM_PROFILES;
+
+ return nla_total_size(sizeof(u32)) + /* _RX_USECS */
+ nla_total_size(sizeof(u32)) + /* _RX_MAX_FRAMES */
+ nla_total_size(sizeof(u32)) + /* _RX_USECS_IRQ */
+ nla_total_size(sizeof(u32)) + /* _RX_MAX_FRAMES_IRQ */
+ nla_total_size(sizeof(u32)) + /* _TX_USECS */
+ nla_total_size(sizeof(u32)) + /* _TX_MAX_FRAMES */
+ nla_total_size(sizeof(u32)) + /* _TX_USECS_IRQ */
+ nla_total_size(sizeof(u32)) + /* _TX_MAX_FRAMES_IRQ */
+ nla_total_size(sizeof(u32)) + /* _STATS_BLOCK_USECS */
+ nla_total_size(sizeof(u8)) + /* _USE_ADAPTIVE_RX */
+ nla_total_size(sizeof(u8)) + /* _USE_ADAPTIVE_TX */
+ nla_total_size(sizeof(u32)) + /* _PKT_RATE_LOW */
+ nla_total_size(sizeof(u32)) + /* _RX_USECS_LOW */
+ nla_total_size(sizeof(u32)) + /* _RX_MAX_FRAMES_LOW */
+ nla_total_size(sizeof(u32)) + /* _TX_USECS_LOW */
+ nla_total_size(sizeof(u32)) + /* _TX_MAX_FRAMES_LOW */
+ nla_total_size(sizeof(u32)) + /* _PKT_RATE_HIGH */
+ nla_total_size(sizeof(u32)) + /* _RX_USECS_HIGH */
+ nla_total_size(sizeof(u32)) + /* _RX_MAX_FRAMES_HIGH */
+ nla_total_size(sizeof(u32)) + /* _TX_USECS_HIGH */
+ nla_total_size(sizeof(u32)) + /* _TX_MAX_FRAMES_HIGH */
+ nla_total_size(sizeof(u32)) + /* _RATE_SAMPLE_INTERVAL */
+ nla_total_size(sizeof(u8)) + /* _USE_CQE_MODE_TX */
+ nla_total_size(sizeof(u8)) + /* _USE_CQE_MODE_RX */
+ nla_total_size(sizeof(u32)) + /* _TX_AGGR_MAX_BYTES */
+ nla_total_size(sizeof(u32)) + /* _TX_AGGR_MAX_FRAMES */
+ nla_total_size(sizeof(u32)) + /* _TX_AGGR_TIME_USECS */
+ total_modersz * 2; /* _{R,T}X_PROFILE */
+}
+
+static bool coalesce_put_u32(struct sk_buff *skb, u16 attr_type, u32 val,
+ u32 supported_params)
+{
+ if (!val && !(supported_params & attr_to_mask(attr_type)))
+ return false;
+ return nla_put_u32(skb, attr_type, val);
+}
+
+static bool coalesce_put_bool(struct sk_buff *skb, u16 attr_type, u32 val,
+ u32 supported_params)
+{
+ if (!val && !(supported_params & attr_to_mask(attr_type)))
+ return false;
+ return nla_put_u8(skb, attr_type, !!val);
+}
+
+/**
+ * coalesce_put_profile - fill reply with a nla nest with four child nla nests.
+ * @skb: socket buffer the message is stored in
+ * @attr_type: nest attr type ETHTOOL_A_COALESCE_*X_PROFILE
+ * @profile: data passed to userspace
+ * @coal_flags: modifiable parameters supported by the driver
+ *
+ * Put a dim profile nest attribute. Refer to ETHTOOL_A_PROFILE_IRQ_MODERATION.
+ *
+ * Return: 0 on success or a negative error code.
+ */
+static int coalesce_put_profile(struct sk_buff *skb, u16 attr_type,
+ const struct dim_cq_moder *profile,
+ u8 coal_flags)
+{
+ struct nlattr *profile_attr, *moder_attr;
+ int i, ret;
+
+ if (!profile || !coal_flags)
+ return 0;
+
+ profile_attr = nla_nest_start(skb, attr_type);
+ if (!profile_attr)
+ return -EMSGSIZE;
+
+ for (i = 0; i < NET_DIM_PARAMS_NUM_PROFILES; i++) {
+ moder_attr = nla_nest_start(skb,
+ ETHTOOL_A_PROFILE_IRQ_MODERATION);
+ if (!moder_attr) {
+ ret = -EMSGSIZE;
+ goto cancel_profile;
+ }
+
+ if (coal_flags & DIM_COALESCE_USEC) {
+ ret = nla_put_u32(skb, ETHTOOL_A_IRQ_MODERATION_USEC,
+ profile[i].usec);
+ if (ret)
+ goto cancel_moder;
+ }
+
+ if (coal_flags & DIM_COALESCE_PKTS) {
+ ret = nla_put_u32(skb, ETHTOOL_A_IRQ_MODERATION_PKTS,
+ profile[i].pkts);
+ if (ret)
+ goto cancel_moder;
+ }
+
+ if (coal_flags & DIM_COALESCE_COMPS) {
+ ret = nla_put_u32(skb, ETHTOOL_A_IRQ_MODERATION_COMPS,
+ profile[i].comps);
+ if (ret)
+ goto cancel_moder;
+ }
+
+ nla_nest_end(skb, moder_attr);
+ }
+
+ nla_nest_end(skb, profile_attr);
+
+ return 0;
+
+cancel_moder:
+ nla_nest_cancel(skb, moder_attr);
+cancel_profile:
+ nla_nest_cancel(skb, profile_attr);
+ return ret;
+}
+
+static int coalesce_fill_reply(struct sk_buff *skb,
+ const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ const struct coalesce_reply_data *data = COALESCE_REPDATA(reply_base);
+ const struct kernel_ethtool_coalesce *kcoal = &data->kernel_coalesce;
+ const struct ethtool_coalesce *coal = &data->coalesce;
+ u32 supported = data->supported_params;
+ struct dim_irq_moder *moder;
+ int ret = 0;
+
+ if (coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_USECS,
+ coal->rx_coalesce_usecs, supported) ||
+ coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_MAX_FRAMES,
+ coal->rx_max_coalesced_frames, supported) ||
+ coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_USECS_IRQ,
+ coal->rx_coalesce_usecs_irq, supported) ||
+ coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_MAX_FRAMES_IRQ,
+ coal->rx_max_coalesced_frames_irq, supported) ||
+ coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_USECS,
+ coal->tx_coalesce_usecs, supported) ||
+ coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_MAX_FRAMES,
+ coal->tx_max_coalesced_frames, supported) ||
+ coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_USECS_IRQ,
+ coal->tx_coalesce_usecs_irq, supported) ||
+ coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_MAX_FRAMES_IRQ,
+ coal->tx_max_coalesced_frames_irq, supported) ||
+ coalesce_put_u32(skb, ETHTOOL_A_COALESCE_STATS_BLOCK_USECS,
+ coal->stats_block_coalesce_usecs, supported) ||
+ coalesce_put_bool(skb, ETHTOOL_A_COALESCE_USE_ADAPTIVE_RX,
+ coal->use_adaptive_rx_coalesce, supported) ||
+ coalesce_put_bool(skb, ETHTOOL_A_COALESCE_USE_ADAPTIVE_TX,
+ coal->use_adaptive_tx_coalesce, supported) ||
+ coalesce_put_u32(skb, ETHTOOL_A_COALESCE_PKT_RATE_LOW,
+ coal->pkt_rate_low, supported) ||
+ coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_USECS_LOW,
+ coal->rx_coalesce_usecs_low, supported) ||
+ coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_MAX_FRAMES_LOW,
+ coal->rx_max_coalesced_frames_low, supported) ||
+ coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_USECS_LOW,
+ coal->tx_coalesce_usecs_low, supported) ||
+ coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_MAX_FRAMES_LOW,
+ coal->tx_max_coalesced_frames_low, supported) ||
+ coalesce_put_u32(skb, ETHTOOL_A_COALESCE_PKT_RATE_HIGH,
+ coal->pkt_rate_high, supported) ||
+ coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_USECS_HIGH,
+ coal->rx_coalesce_usecs_high, supported) ||
+ coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_MAX_FRAMES_HIGH,
+ coal->rx_max_coalesced_frames_high, supported) ||
+ coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_USECS_HIGH,
+ coal->tx_coalesce_usecs_high, supported) ||
+ coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_MAX_FRAMES_HIGH,
+ coal->tx_max_coalesced_frames_high, supported) ||
+ coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL,
+ coal->rate_sample_interval, supported) ||
+ coalesce_put_bool(skb, ETHTOOL_A_COALESCE_USE_CQE_MODE_TX,
+ kcoal->use_cqe_mode_tx, supported) ||
+ coalesce_put_bool(skb, ETHTOOL_A_COALESCE_USE_CQE_MODE_RX,
+ kcoal->use_cqe_mode_rx, supported) ||
+ coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_AGGR_MAX_BYTES,
+ kcoal->tx_aggr_max_bytes, supported) ||
+ coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_AGGR_MAX_FRAMES,
+ kcoal->tx_aggr_max_frames, supported) ||
+ coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_AGGR_TIME_USECS,
+ kcoal->tx_aggr_time_usecs, supported))
+ return -EMSGSIZE;
+
+ if (!req_base->dev || !req_base->dev->irq_moder)
+ return 0;
+
+ moder = req_base->dev->irq_moder;
+ rcu_read_lock();
+ if (moder->profile_flags & DIM_PROFILE_RX) {
+ ret = coalesce_put_profile(skb, ETHTOOL_A_COALESCE_RX_PROFILE,
+ rcu_dereference(moder->rx_profile),
+ moder->coal_flags);
+ if (ret)
+ goto out;
+ }
+
+ if (moder->profile_flags & DIM_PROFILE_TX)
+ ret = coalesce_put_profile(skb, ETHTOOL_A_COALESCE_TX_PROFILE,
+ rcu_dereference(moder->tx_profile),
+ moder->coal_flags);
+
+out:
+ rcu_read_unlock();
+ return ret;
+}
+
+/* COALESCE_SET */
+
+static const struct nla_policy coalesce_irq_moderation_policy[] = {
+ [ETHTOOL_A_IRQ_MODERATION_USEC] = { .type = NLA_U32 },
+ [ETHTOOL_A_IRQ_MODERATION_PKTS] = { .type = NLA_U32 },
+ [ETHTOOL_A_IRQ_MODERATION_COMPS] = { .type = NLA_U32 },
+};
+
+static const struct nla_policy coalesce_profile_policy[] = {
+ [ETHTOOL_A_PROFILE_IRQ_MODERATION] =
+ NLA_POLICY_NESTED(coalesce_irq_moderation_policy),
+};
+
+const struct nla_policy ethnl_coalesce_set_policy[] = {
+ [ETHTOOL_A_COALESCE_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
+ [ETHTOOL_A_COALESCE_RX_USECS] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_RX_MAX_FRAMES] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_RX_USECS_IRQ] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_RX_MAX_FRAMES_IRQ] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_TX_USECS] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_TX_MAX_FRAMES] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_TX_USECS_IRQ] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_TX_MAX_FRAMES_IRQ] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_STATS_BLOCK_USECS] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_USE_ADAPTIVE_RX] = { .type = NLA_U8 },
+ [ETHTOOL_A_COALESCE_USE_ADAPTIVE_TX] = { .type = NLA_U8 },
+ [ETHTOOL_A_COALESCE_PKT_RATE_LOW] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_RX_USECS_LOW] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_RX_MAX_FRAMES_LOW] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_TX_USECS_LOW] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_TX_MAX_FRAMES_LOW] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_PKT_RATE_HIGH] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_RX_USECS_HIGH] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_RX_MAX_FRAMES_HIGH] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_TX_USECS_HIGH] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_TX_MAX_FRAMES_HIGH] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_USE_CQE_MODE_TX] = NLA_POLICY_MAX(NLA_U8, 1),
+ [ETHTOOL_A_COALESCE_USE_CQE_MODE_RX] = NLA_POLICY_MAX(NLA_U8, 1),
+ [ETHTOOL_A_COALESCE_TX_AGGR_MAX_BYTES] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_TX_AGGR_MAX_FRAMES] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_TX_AGGR_TIME_USECS] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_RX_PROFILE] =
+ NLA_POLICY_NESTED(coalesce_profile_policy),
+ [ETHTOOL_A_COALESCE_TX_PROFILE] =
+ NLA_POLICY_NESTED(coalesce_profile_policy),
+};
+
+static int
+ethnl_set_coalesce_validate(struct ethnl_req_info *req_info,
+ struct genl_info *info)
+{
+ const struct ethtool_ops *ops = req_info->dev->ethtool_ops;
+ struct dim_irq_moder *irq_moder = req_info->dev->irq_moder;
+ struct nlattr **tb = info->attrs;
+ u32 supported_params;
+ u16 a;
+
+ if (!ops->get_coalesce || !ops->set_coalesce)
+ return -EOPNOTSUPP;
+
+ /* make sure that only supported parameters are present */
+ supported_params = ops->supported_coalesce_params;
+ if (irq_moder && irq_moder->profile_flags & DIM_PROFILE_RX)
+ supported_params |= ETHTOOL_COALESCE_RX_PROFILE;
+
+ if (irq_moder && irq_moder->profile_flags & DIM_PROFILE_TX)
+ supported_params |= ETHTOOL_COALESCE_TX_PROFILE;
+
+ for (a = ETHTOOL_A_COALESCE_RX_USECS; a < __ETHTOOL_A_COALESCE_CNT; a++)
+ if (tb[a] && !(supported_params & attr_to_mask(a))) {
+ NL_SET_ERR_MSG_ATTR(info->extack, tb[a],
+ "cannot modify an unsupported parameter");
+ return -EINVAL;
+ }
+
+ return 1;
+}
+
+/**
+ * ethnl_update_irq_moder - update a specific field in the given profile
+ * @irq_moder: place that collects dim related information
+ * @irq_field: field in profile to modify
+ * @attr_type: attr type ETHTOOL_A_IRQ_MODERATION_*
+ * @tb: netlink attribute with new values or null
+ * @coal_bit: DIM_COALESCE_* bit from coal_flags
+ * @mod: pointer to bool for modification tracking
+ * @extack: netlink extended ack
+ *
+ * Return: 0 on success or a negative error code.
+ */
+static int ethnl_update_irq_moder(struct dim_irq_moder *irq_moder,
+ u16 *irq_field, u16 attr_type,
+ struct nlattr **tb,
+ u8 coal_bit, bool *mod,
+ struct netlink_ext_ack *extack)
+{
+ int ret = 0;
+ u32 val;
+
+ if (!tb[attr_type])
+ return 0;
+
+ if (irq_moder->coal_flags & coal_bit) {
+ val = nla_get_u32(tb[attr_type]);
+ if (*irq_field == val)
+ return 0;
+
+ *irq_field = val;
+ *mod = true;
+ } else {
+ NL_SET_BAD_ATTR(extack, tb[attr_type]);
+ ret = -EOPNOTSUPP;
+ }
+
+ return ret;
+}
+
+/**
+ * ethnl_update_profile - get a profile nest with child nests from userspace.
+ * @dev: netdevice to update the profile
+ * @dst: profile get from the driver and modified by ethnl_update_profile.
+ * @nests: nest attr ETHTOOL_A_COALESCE_*X_PROFILE to set profile.
+ * @mod: pointer to bool for modification tracking
+ * @extack: Netlink extended ack
+ *
+ * Layout of nests:
+ * Nested ETHTOOL_A_COALESCE_*X_PROFILE attr
+ * Nested ETHTOOL_A_PROFILE_IRQ_MODERATION attr
+ * ETHTOOL_A_IRQ_MODERATION_USEC attr
+ * ETHTOOL_A_IRQ_MODERATION_PKTS attr
+ * ETHTOOL_A_IRQ_MODERATION_COMPS attr
+ * ...
+ * Nested ETHTOOL_A_PROFILE_IRQ_MODERATION attr
+ * ETHTOOL_A_IRQ_MODERATION_USEC attr
+ * ETHTOOL_A_IRQ_MODERATION_PKTS attr
+ * ETHTOOL_A_IRQ_MODERATION_COMPS attr
+ *
+ * Return: 0 on success or a negative error code.
+ */
+static int ethnl_update_profile(struct net_device *dev,
+ struct dim_cq_moder __rcu **dst,
+ const struct nlattr *nests,
+ bool *mod,
+ struct netlink_ext_ack *extack)
+{
+ int len_irq_moder = ARRAY_SIZE(coalesce_irq_moderation_policy);
+ struct nlattr *tb[ARRAY_SIZE(coalesce_irq_moderation_policy)];
+ struct dim_irq_moder *irq_moder = dev->irq_moder;
+ struct dim_cq_moder *new_profile, *old_profile;
+ int ret, rem, i = 0, len;
+ struct nlattr *nest;
+
+ if (!nests)
+ return 0;
+
+ if (!*dst)
+ return -EOPNOTSUPP;
+
+ old_profile = rtnl_dereference(*dst);
+ len = NET_DIM_PARAMS_NUM_PROFILES * sizeof(*old_profile);
+ new_profile = kmemdup(old_profile, len, GFP_KERNEL);
+ if (!new_profile)
+ return -ENOMEM;
+
+ nla_for_each_nested_type(nest, ETHTOOL_A_PROFILE_IRQ_MODERATION,
+ nests, rem) {
+ ret = nla_parse_nested(tb, len_irq_moder - 1, nest,
+ coalesce_irq_moderation_policy,
+ extack);
+ if (ret)
+ goto err_out;
+
+ ret = ethnl_update_irq_moder(irq_moder, &new_profile[i].usec,
+ ETHTOOL_A_IRQ_MODERATION_USEC,
+ tb, DIM_COALESCE_USEC,
+ mod, extack);
+ if (ret)
+ goto err_out;
+
+ ret = ethnl_update_irq_moder(irq_moder, &new_profile[i].pkts,
+ ETHTOOL_A_IRQ_MODERATION_PKTS,
+ tb, DIM_COALESCE_PKTS,
+ mod, extack);
+ if (ret)
+ goto err_out;
+
+ ret = ethnl_update_irq_moder(irq_moder, &new_profile[i].comps,
+ ETHTOOL_A_IRQ_MODERATION_COMPS,
+ tb, DIM_COALESCE_COMPS,
+ mod, extack);
+ if (ret)
+ goto err_out;
+
+ i++;
+ }
+
+ /* After the profile is modified, dim itself is a dynamic
+ * mechanism and will quickly fit to the appropriate
+ * coalescing parameters according to the new profile.
+ */
+ rcu_assign_pointer(*dst, new_profile);
+ kfree_rcu(old_profile, rcu);
+
+ return 0;
+
+err_out:
+ kfree(new_profile);
+ return ret;
+}
+
+static int
+__ethnl_set_coalesce(struct ethnl_req_info *req_info, struct genl_info *info,
+ bool *dual_change)
+{
+ struct kernel_ethtool_coalesce kernel_coalesce = {};
+ struct net_device *dev = req_info->dev;
+ struct ethtool_coalesce coalesce = {};
+ bool mod_mode = false, mod = false;
+ struct nlattr **tb = info->attrs;
+ int ret;
+
+ ret = dev->ethtool_ops->get_coalesce(dev, &coalesce, &kernel_coalesce,
+ info->extack);
+ if (ret < 0)
+ return ret;
+
+ /* Update values */
+ ethnl_update_u32(&coalesce.rx_coalesce_usecs,
+ tb[ETHTOOL_A_COALESCE_RX_USECS], &mod);
+ ethnl_update_u32(&coalesce.rx_max_coalesced_frames,
+ tb[ETHTOOL_A_COALESCE_RX_MAX_FRAMES], &mod);
+ ethnl_update_u32(&coalesce.rx_coalesce_usecs_irq,
+ tb[ETHTOOL_A_COALESCE_RX_USECS_IRQ], &mod);
+ ethnl_update_u32(&coalesce.rx_max_coalesced_frames_irq,
+ tb[ETHTOOL_A_COALESCE_RX_MAX_FRAMES_IRQ], &mod);
+ ethnl_update_u32(&coalesce.tx_coalesce_usecs,
+ tb[ETHTOOL_A_COALESCE_TX_USECS], &mod);
+ ethnl_update_u32(&coalesce.tx_max_coalesced_frames,
+ tb[ETHTOOL_A_COALESCE_TX_MAX_FRAMES], &mod);
+ ethnl_update_u32(&coalesce.tx_coalesce_usecs_irq,
+ tb[ETHTOOL_A_COALESCE_TX_USECS_IRQ], &mod);
+ ethnl_update_u32(&coalesce.tx_max_coalesced_frames_irq,
+ tb[ETHTOOL_A_COALESCE_TX_MAX_FRAMES_IRQ], &mod);
+ ethnl_update_u32(&coalesce.stats_block_coalesce_usecs,
+ tb[ETHTOOL_A_COALESCE_STATS_BLOCK_USECS], &mod);
+ ethnl_update_u32(&coalesce.pkt_rate_low,
+ tb[ETHTOOL_A_COALESCE_PKT_RATE_LOW], &mod);
+ ethnl_update_u32(&coalesce.rx_coalesce_usecs_low,
+ tb[ETHTOOL_A_COALESCE_RX_USECS_LOW], &mod);
+ ethnl_update_u32(&coalesce.rx_max_coalesced_frames_low,
+ tb[ETHTOOL_A_COALESCE_RX_MAX_FRAMES_LOW], &mod);
+ ethnl_update_u32(&coalesce.tx_coalesce_usecs_low,
+ tb[ETHTOOL_A_COALESCE_TX_USECS_LOW], &mod);
+ ethnl_update_u32(&coalesce.tx_max_coalesced_frames_low,
+ tb[ETHTOOL_A_COALESCE_TX_MAX_FRAMES_LOW], &mod);
+ ethnl_update_u32(&coalesce.pkt_rate_high,
+ tb[ETHTOOL_A_COALESCE_PKT_RATE_HIGH], &mod);
+ ethnl_update_u32(&coalesce.rx_coalesce_usecs_high,
+ tb[ETHTOOL_A_COALESCE_RX_USECS_HIGH], &mod);
+ ethnl_update_u32(&coalesce.rx_max_coalesced_frames_high,
+ tb[ETHTOOL_A_COALESCE_RX_MAX_FRAMES_HIGH], &mod);
+ ethnl_update_u32(&coalesce.tx_coalesce_usecs_high,
+ tb[ETHTOOL_A_COALESCE_TX_USECS_HIGH], &mod);
+ ethnl_update_u32(&coalesce.tx_max_coalesced_frames_high,
+ tb[ETHTOOL_A_COALESCE_TX_MAX_FRAMES_HIGH], &mod);
+ ethnl_update_u32(&coalesce.rate_sample_interval,
+ tb[ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL], &mod);
+ ethnl_update_u32(&kernel_coalesce.tx_aggr_max_bytes,
+ tb[ETHTOOL_A_COALESCE_TX_AGGR_MAX_BYTES], &mod);
+ ethnl_update_u32(&kernel_coalesce.tx_aggr_max_frames,
+ tb[ETHTOOL_A_COALESCE_TX_AGGR_MAX_FRAMES], &mod);
+ ethnl_update_u32(&kernel_coalesce.tx_aggr_time_usecs,
+ tb[ETHTOOL_A_COALESCE_TX_AGGR_TIME_USECS], &mod);
+
+ if (dev->irq_moder && dev->irq_moder->profile_flags & DIM_PROFILE_RX) {
+ ret = ethnl_update_profile(dev, &dev->irq_moder->rx_profile,
+ tb[ETHTOOL_A_COALESCE_RX_PROFILE],
+ &mod, info->extack);
+ if (ret < 0)
+ return ret;
+ }
+
+ if (dev->irq_moder && dev->irq_moder->profile_flags & DIM_PROFILE_TX) {
+ ret = ethnl_update_profile(dev, &dev->irq_moder->tx_profile,
+ tb[ETHTOOL_A_COALESCE_TX_PROFILE],
+ &mod, info->extack);
+ if (ret < 0)
+ return ret;
+ }
+
+ /* Update operation modes */
+ ethnl_update_bool32(&coalesce.use_adaptive_rx_coalesce,
+ tb[ETHTOOL_A_COALESCE_USE_ADAPTIVE_RX], &mod_mode);
+ ethnl_update_bool32(&coalesce.use_adaptive_tx_coalesce,
+ tb[ETHTOOL_A_COALESCE_USE_ADAPTIVE_TX], &mod_mode);
+ ethnl_update_u8(&kernel_coalesce.use_cqe_mode_tx,
+ tb[ETHTOOL_A_COALESCE_USE_CQE_MODE_TX], &mod_mode);
+ ethnl_update_u8(&kernel_coalesce.use_cqe_mode_rx,
+ tb[ETHTOOL_A_COALESCE_USE_CQE_MODE_RX], &mod_mode);
+
+ *dual_change = mod && mod_mode;
+ if (!mod && !mod_mode)
+ return 0;
+
+ ret = dev->ethtool_ops->set_coalesce(dev, &coalesce, &kernel_coalesce,
+ info->extack);
+ return ret < 0 ? ret : 1;
+}
+
+static int
+ethnl_set_coalesce(struct ethnl_req_info *req_info, struct genl_info *info)
+{
+ bool dual_change;
+ int err, ret;
+
+ /* SET_COALESCE may change operation mode and parameters in one call.
+ * Changing operation mode may cause the driver to reset the parameter
+ * values, and therefore ignore user input (driver does not know which
+ * parameters come from user and which are echoed back from ->get).
+ * To not complicate the drivers if user tries to change both the mode
+ * and parameters at once - call the driver twice.
+ */
+ err = __ethnl_set_coalesce(req_info, info, &dual_change);
+ if (err < 0)
+ return err;
+ ret = err;
+
+ if (ret && dual_change) {
+ err = __ethnl_set_coalesce(req_info, info, &dual_change);
+ if (err < 0)
+ return err;
+ }
+ return ret;
+}
+
+const struct ethnl_request_ops ethnl_coalesce_request_ops = {
+ .request_cmd = ETHTOOL_MSG_COALESCE_GET,
+ .reply_cmd = ETHTOOL_MSG_COALESCE_GET_REPLY,
+ .hdr_attr = ETHTOOL_A_COALESCE_HEADER,
+ .req_info_size = sizeof(struct coalesce_req_info),
+ .reply_data_size = sizeof(struct coalesce_reply_data),
+
+ .prepare_data = coalesce_prepare_data,
+ .reply_size = coalesce_reply_size,
+ .fill_reply = coalesce_fill_reply,
+
+ .set_validate = ethnl_set_coalesce_validate,
+ .set = ethnl_set_coalesce,
+ .set_ntf_cmd = ETHTOOL_MSG_COALESCE_NTF,
+};
diff --git a/net/ethtool/common.c b/net/ethtool/common.c
new file mode 100644
index 000000000000..369c05cf8163
--- /dev/null
+++ b/net/ethtool/common.c
@@ -0,0 +1,1169 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/ethtool_netlink.h>
+#include <linux/net_tstamp.h>
+#include <linux/phy.h>
+#include <linux/rtnetlink.h>
+#include <linux/ptp_clock_kernel.h>
+#include <linux/phy_link_topology.h>
+#include <net/netdev_queues.h>
+
+#include "netlink.h"
+#include "common.h"
+#include "../core/dev.h"
+
+
+const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] = {
+ [NETIF_F_SG_BIT] = "tx-scatter-gather",
+ [NETIF_F_IP_CSUM_BIT] = "tx-checksum-ipv4",
+ [NETIF_F_HW_CSUM_BIT] = "tx-checksum-ip-generic",
+ [NETIF_F_IPV6_CSUM_BIT] = "tx-checksum-ipv6",
+ [NETIF_F_HIGHDMA_BIT] = "highdma",
+ [NETIF_F_FRAGLIST_BIT] = "tx-scatter-gather-fraglist",
+ [NETIF_F_HW_VLAN_CTAG_TX_BIT] = "tx-vlan-hw-insert",
+
+ [NETIF_F_HW_VLAN_CTAG_RX_BIT] = "rx-vlan-hw-parse",
+ [NETIF_F_HW_VLAN_CTAG_FILTER_BIT] = "rx-vlan-filter",
+ [NETIF_F_HW_VLAN_STAG_TX_BIT] = "tx-vlan-stag-hw-insert",
+ [NETIF_F_HW_VLAN_STAG_RX_BIT] = "rx-vlan-stag-hw-parse",
+ [NETIF_F_HW_VLAN_STAG_FILTER_BIT] = "rx-vlan-stag-filter",
+ [NETIF_F_VLAN_CHALLENGED_BIT] = "vlan-challenged",
+ [NETIF_F_GSO_BIT] = "tx-generic-segmentation",
+ [NETIF_F_GRO_BIT] = "rx-gro",
+ [NETIF_F_GRO_HW_BIT] = "rx-gro-hw",
+ [NETIF_F_LRO_BIT] = "rx-lro",
+
+ [NETIF_F_TSO_BIT] = "tx-tcp-segmentation",
+ [NETIF_F_GSO_ROBUST_BIT] = "tx-gso-robust",
+ [NETIF_F_TSO_ECN_BIT] = "tx-tcp-ecn-segmentation",
+ [NETIF_F_GSO_ACCECN_BIT] = "tx-tcp-accecn-segmentation",
+ [NETIF_F_TSO_MANGLEID_BIT] = "tx-tcp-mangleid-segmentation",
+ [NETIF_F_TSO6_BIT] = "tx-tcp6-segmentation",
+ [NETIF_F_FSO_BIT] = "tx-fcoe-segmentation",
+ [NETIF_F_GSO_GRE_BIT] = "tx-gre-segmentation",
+ [NETIF_F_GSO_GRE_CSUM_BIT] = "tx-gre-csum-segmentation",
+ [NETIF_F_GSO_IPXIP4_BIT] = "tx-ipxip4-segmentation",
+ [NETIF_F_GSO_IPXIP6_BIT] = "tx-ipxip6-segmentation",
+ [NETIF_F_GSO_UDP_TUNNEL_BIT] = "tx-udp_tnl-segmentation",
+ [NETIF_F_GSO_UDP_TUNNEL_CSUM_BIT] = "tx-udp_tnl-csum-segmentation",
+ [NETIF_F_GSO_PARTIAL_BIT] = "tx-gso-partial",
+ [NETIF_F_GSO_TUNNEL_REMCSUM_BIT] = "tx-tunnel-remcsum-segmentation",
+ [NETIF_F_GSO_SCTP_BIT] = "tx-sctp-segmentation",
+ [NETIF_F_GSO_ESP_BIT] = "tx-esp-segmentation",
+ [NETIF_F_GSO_UDP_L4_BIT] = "tx-udp-segmentation",
+ [NETIF_F_GSO_FRAGLIST_BIT] = "tx-gso-list",
+
+ [NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc",
+ [NETIF_F_SCTP_CRC_BIT] = "tx-checksum-sctp",
+ [NETIF_F_NTUPLE_BIT] = "rx-ntuple-filter",
+ [NETIF_F_RXHASH_BIT] = "rx-hashing",
+ [NETIF_F_RXCSUM_BIT] = "rx-checksum",
+ [NETIF_F_NOCACHE_COPY_BIT] = "tx-nocache-copy",
+ [NETIF_F_LOOPBACK_BIT] = "loopback",
+ [NETIF_F_RXFCS_BIT] = "rx-fcs",
+ [NETIF_F_RXALL_BIT] = "rx-all",
+ [NETIF_F_HW_L2FW_DOFFLOAD_BIT] = "l2-fwd-offload",
+ [NETIF_F_HW_TC_BIT] = "hw-tc-offload",
+ [NETIF_F_HW_ESP_BIT] = "esp-hw-offload",
+ [NETIF_F_HW_ESP_TX_CSUM_BIT] = "esp-tx-csum-hw-offload",
+ [NETIF_F_RX_UDP_TUNNEL_PORT_BIT] = "rx-udp_tunnel-port-offload",
+ [NETIF_F_HW_TLS_RECORD_BIT] = "tls-hw-record",
+ [NETIF_F_HW_TLS_TX_BIT] = "tls-hw-tx-offload",
+ [NETIF_F_HW_TLS_RX_BIT] = "tls-hw-rx-offload",
+ [NETIF_F_GRO_FRAGLIST_BIT] = "rx-gro-list",
+ [NETIF_F_HW_MACSEC_BIT] = "macsec-hw-offload",
+ [NETIF_F_GRO_UDP_FWD_BIT] = "rx-udp-gro-forwarding",
+ [NETIF_F_HW_HSR_TAG_INS_BIT] = "hsr-tag-ins-offload",
+ [NETIF_F_HW_HSR_TAG_RM_BIT] = "hsr-tag-rm-offload",
+ [NETIF_F_HW_HSR_FWD_BIT] = "hsr-fwd-offload",
+ [NETIF_F_HW_HSR_DUP_BIT] = "hsr-dup-offload",
+};
+
+const char
+rss_hash_func_strings[ETH_RSS_HASH_FUNCS_COUNT][ETH_GSTRING_LEN] = {
+ [ETH_RSS_HASH_TOP_BIT] = "toeplitz",
+ [ETH_RSS_HASH_XOR_BIT] = "xor",
+ [ETH_RSS_HASH_CRC32_BIT] = "crc32",
+};
+
+const char
+tunable_strings[__ETHTOOL_TUNABLE_COUNT][ETH_GSTRING_LEN] = {
+ [ETHTOOL_ID_UNSPEC] = "Unspec",
+ [ETHTOOL_RX_COPYBREAK] = "rx-copybreak",
+ [ETHTOOL_TX_COPYBREAK] = "tx-copybreak",
+ [ETHTOOL_PFC_PREVENTION_TOUT] = "pfc-prevention-tout",
+ [ETHTOOL_TX_COPYBREAK_BUF_SIZE] = "tx-copybreak-buf-size",
+};
+
+const char
+phy_tunable_strings[__ETHTOOL_PHY_TUNABLE_COUNT][ETH_GSTRING_LEN] = {
+ [ETHTOOL_ID_UNSPEC] = "Unspec",
+ [ETHTOOL_PHY_DOWNSHIFT] = "phy-downshift",
+ [ETHTOOL_PHY_FAST_LINK_DOWN] = "phy-fast-link-down",
+ [ETHTOOL_PHY_EDPD] = "phy-energy-detect-power-down",
+};
+
+#define __LINK_MODE_NAME(speed, type, duplex) \
+ #speed "base" #type "/" #duplex
+#define __DEFINE_LINK_MODE_NAME(speed, type, duplex) \
+ [ETHTOOL_LINK_MODE(speed, type, duplex)] = \
+ __LINK_MODE_NAME(speed, type, duplex)
+#define __DEFINE_SPECIAL_MODE_NAME(_mode, _name) \
+ [ETHTOOL_LINK_MODE_ ## _mode ## _BIT] = _name
+
+const char link_mode_names[][ETH_GSTRING_LEN] = {
+ __DEFINE_LINK_MODE_NAME(10, T, Half),
+ __DEFINE_LINK_MODE_NAME(10, T, Full),
+ __DEFINE_LINK_MODE_NAME(100, T, Half),
+ __DEFINE_LINK_MODE_NAME(100, T, Full),
+ __DEFINE_LINK_MODE_NAME(1000, T, Half),
+ __DEFINE_LINK_MODE_NAME(1000, T, Full),
+ __DEFINE_SPECIAL_MODE_NAME(Autoneg, "Autoneg"),
+ __DEFINE_SPECIAL_MODE_NAME(TP, "TP"),
+ __DEFINE_SPECIAL_MODE_NAME(AUI, "AUI"),
+ __DEFINE_SPECIAL_MODE_NAME(MII, "MII"),
+ __DEFINE_SPECIAL_MODE_NAME(FIBRE, "FIBRE"),
+ __DEFINE_SPECIAL_MODE_NAME(BNC, "BNC"),
+ __DEFINE_LINK_MODE_NAME(10000, T, Full),
+ __DEFINE_SPECIAL_MODE_NAME(Pause, "Pause"),
+ __DEFINE_SPECIAL_MODE_NAME(Asym_Pause, "Asym_Pause"),
+ __DEFINE_LINK_MODE_NAME(2500, X, Full),
+ __DEFINE_SPECIAL_MODE_NAME(Backplane, "Backplane"),
+ __DEFINE_LINK_MODE_NAME(1000, KX, Full),
+ __DEFINE_LINK_MODE_NAME(10000, KX4, Full),
+ __DEFINE_LINK_MODE_NAME(10000, KR, Full),
+ __DEFINE_SPECIAL_MODE_NAME(10000baseR_FEC, "10000baseR_FEC"),
+ __DEFINE_LINK_MODE_NAME(20000, MLD2, Full),
+ __DEFINE_LINK_MODE_NAME(20000, KR2, Full),
+ __DEFINE_LINK_MODE_NAME(40000, KR4, Full),
+ __DEFINE_LINK_MODE_NAME(40000, CR4, Full),
+ __DEFINE_LINK_MODE_NAME(40000, SR4, Full),
+ __DEFINE_LINK_MODE_NAME(40000, LR4, Full),
+ __DEFINE_LINK_MODE_NAME(56000, KR4, Full),
+ __DEFINE_LINK_MODE_NAME(56000, CR4, Full),
+ __DEFINE_LINK_MODE_NAME(56000, SR4, Full),
+ __DEFINE_LINK_MODE_NAME(56000, LR4, Full),
+ __DEFINE_LINK_MODE_NAME(25000, CR, Full),
+ __DEFINE_LINK_MODE_NAME(25000, KR, Full),
+ __DEFINE_LINK_MODE_NAME(25000, SR, Full),
+ __DEFINE_LINK_MODE_NAME(50000, CR2, Full),
+ __DEFINE_LINK_MODE_NAME(50000, KR2, Full),
+ __DEFINE_LINK_MODE_NAME(100000, KR4, Full),
+ __DEFINE_LINK_MODE_NAME(100000, SR4, Full),
+ __DEFINE_LINK_MODE_NAME(100000, CR4, Full),
+ __DEFINE_LINK_MODE_NAME(100000, LR4_ER4, Full),
+ __DEFINE_LINK_MODE_NAME(50000, SR2, Full),
+ __DEFINE_LINK_MODE_NAME(1000, X, Full),
+ __DEFINE_LINK_MODE_NAME(10000, CR, Full),
+ __DEFINE_LINK_MODE_NAME(10000, SR, Full),
+ __DEFINE_LINK_MODE_NAME(10000, LR, Full),
+ __DEFINE_LINK_MODE_NAME(10000, LRM, Full),
+ __DEFINE_LINK_MODE_NAME(10000, ER, Full),
+ __DEFINE_LINK_MODE_NAME(2500, T, Full),
+ __DEFINE_LINK_MODE_NAME(5000, T, Full),
+ __DEFINE_SPECIAL_MODE_NAME(FEC_NONE, "None"),
+ __DEFINE_SPECIAL_MODE_NAME(FEC_RS, "RS"),
+ __DEFINE_SPECIAL_MODE_NAME(FEC_BASER, "BASER"),
+ __DEFINE_LINK_MODE_NAME(50000, KR, Full),
+ __DEFINE_LINK_MODE_NAME(50000, SR, Full),
+ __DEFINE_LINK_MODE_NAME(50000, CR, Full),
+ __DEFINE_LINK_MODE_NAME(50000, LR_ER_FR, Full),
+ __DEFINE_LINK_MODE_NAME(50000, DR, Full),
+ __DEFINE_LINK_MODE_NAME(100000, KR2, Full),
+ __DEFINE_LINK_MODE_NAME(100000, SR2, Full),
+ __DEFINE_LINK_MODE_NAME(100000, CR2, Full),
+ __DEFINE_LINK_MODE_NAME(100000, LR2_ER2_FR2, Full),
+ __DEFINE_LINK_MODE_NAME(100000, DR2, Full),
+ __DEFINE_LINK_MODE_NAME(200000, KR4, Full),
+ __DEFINE_LINK_MODE_NAME(200000, SR4, Full),
+ __DEFINE_LINK_MODE_NAME(200000, LR4_ER4_FR4, Full),
+ __DEFINE_LINK_MODE_NAME(200000, DR4, Full),
+ __DEFINE_LINK_MODE_NAME(200000, CR4, Full),
+ __DEFINE_LINK_MODE_NAME(100, T1, Full),
+ __DEFINE_LINK_MODE_NAME(1000, T1, Full),
+ __DEFINE_LINK_MODE_NAME(400000, KR8, Full),
+ __DEFINE_LINK_MODE_NAME(400000, SR8, Full),
+ __DEFINE_LINK_MODE_NAME(400000, LR8_ER8_FR8, Full),
+ __DEFINE_LINK_MODE_NAME(400000, DR8, Full),
+ __DEFINE_LINK_MODE_NAME(400000, CR8, Full),
+ __DEFINE_SPECIAL_MODE_NAME(FEC_LLRS, "LLRS"),
+ __DEFINE_LINK_MODE_NAME(100000, KR, Full),
+ __DEFINE_LINK_MODE_NAME(100000, SR, Full),
+ __DEFINE_LINK_MODE_NAME(100000, LR_ER_FR, Full),
+ __DEFINE_LINK_MODE_NAME(100000, DR, Full),
+ __DEFINE_LINK_MODE_NAME(100000, CR, Full),
+ __DEFINE_LINK_MODE_NAME(200000, KR2, Full),
+ __DEFINE_LINK_MODE_NAME(200000, SR2, Full),
+ __DEFINE_LINK_MODE_NAME(200000, LR2_ER2_FR2, Full),
+ __DEFINE_LINK_MODE_NAME(200000, DR2, Full),
+ __DEFINE_LINK_MODE_NAME(200000, CR2, Full),
+ __DEFINE_LINK_MODE_NAME(400000, KR4, Full),
+ __DEFINE_LINK_MODE_NAME(400000, SR4, Full),
+ __DEFINE_LINK_MODE_NAME(400000, LR4_ER4_FR4, Full),
+ __DEFINE_LINK_MODE_NAME(400000, DR4, Full),
+ __DEFINE_LINK_MODE_NAME(400000, CR4, Full),
+ __DEFINE_LINK_MODE_NAME(100, FX, Half),
+ __DEFINE_LINK_MODE_NAME(100, FX, Full),
+ __DEFINE_LINK_MODE_NAME(10, T1L, Full),
+ __DEFINE_LINK_MODE_NAME(800000, CR8, Full),
+ __DEFINE_LINK_MODE_NAME(800000, KR8, Full),
+ __DEFINE_LINK_MODE_NAME(800000, DR8, Full),
+ __DEFINE_LINK_MODE_NAME(800000, DR8_2, Full),
+ __DEFINE_LINK_MODE_NAME(800000, SR8, Full),
+ __DEFINE_LINK_MODE_NAME(800000, VR8, Full),
+ __DEFINE_LINK_MODE_NAME(10, T1S, Full),
+ __DEFINE_LINK_MODE_NAME(10, T1S, Half),
+ __DEFINE_LINK_MODE_NAME(10, T1S_P2MP, Half),
+ __DEFINE_LINK_MODE_NAME(10, T1BRR, Full),
+ __DEFINE_LINK_MODE_NAME(200000, CR, Full),
+ __DEFINE_LINK_MODE_NAME(200000, KR, Full),
+ __DEFINE_LINK_MODE_NAME(200000, DR, Full),
+ __DEFINE_LINK_MODE_NAME(200000, DR_2, Full),
+ __DEFINE_LINK_MODE_NAME(200000, SR, Full),
+ __DEFINE_LINK_MODE_NAME(200000, VR, Full),
+ __DEFINE_LINK_MODE_NAME(400000, CR2, Full),
+ __DEFINE_LINK_MODE_NAME(400000, KR2, Full),
+ __DEFINE_LINK_MODE_NAME(400000, DR2, Full),
+ __DEFINE_LINK_MODE_NAME(400000, DR2_2, Full),
+ __DEFINE_LINK_MODE_NAME(400000, SR2, Full),
+ __DEFINE_LINK_MODE_NAME(400000, VR2, Full),
+ __DEFINE_LINK_MODE_NAME(800000, CR4, Full),
+ __DEFINE_LINK_MODE_NAME(800000, KR4, Full),
+ __DEFINE_LINK_MODE_NAME(800000, DR4, Full),
+ __DEFINE_LINK_MODE_NAME(800000, DR4_2, Full),
+ __DEFINE_LINK_MODE_NAME(800000, SR4, Full),
+ __DEFINE_LINK_MODE_NAME(800000, VR4, Full),
+ __DEFINE_LINK_MODE_NAME(1600000, CR8, Full),
+ __DEFINE_LINK_MODE_NAME(1600000, KR8, Full),
+ __DEFINE_LINK_MODE_NAME(1600000, DR8, Full),
+ __DEFINE_LINK_MODE_NAME(1600000, DR8_2, Full),
+};
+static_assert(ARRAY_SIZE(link_mode_names) == __ETHTOOL_LINK_MODE_MASK_NBITS);
+
+#define __LINK_MODE_LANES_CR 1
+#define __LINK_MODE_LANES_CR2 2
+#define __LINK_MODE_LANES_CR4 4
+#define __LINK_MODE_LANES_CR8 8
+#define __LINK_MODE_LANES_DR 1
+#define __LINK_MODE_LANES_DR_2 1
+#define __LINK_MODE_LANES_DR2 2
+#define __LINK_MODE_LANES_DR2_2 2
+#define __LINK_MODE_LANES_DR4 4
+#define __LINK_MODE_LANES_DR4_2 4
+#define __LINK_MODE_LANES_DR8 8
+#define __LINK_MODE_LANES_KR 1
+#define __LINK_MODE_LANES_KR2 2
+#define __LINK_MODE_LANES_KR4 4
+#define __LINK_MODE_LANES_KR8 8
+#define __LINK_MODE_LANES_SR 1
+#define __LINK_MODE_LANES_SR2 2
+#define __LINK_MODE_LANES_SR4 4
+#define __LINK_MODE_LANES_SR8 8
+#define __LINK_MODE_LANES_ER 1
+#define __LINK_MODE_LANES_KX 1
+#define __LINK_MODE_LANES_KX4 4
+#define __LINK_MODE_LANES_LR 1
+#define __LINK_MODE_LANES_LR4 4
+#define __LINK_MODE_LANES_LR4_ER4 4
+#define __LINK_MODE_LANES_LR_ER_FR 1
+#define __LINK_MODE_LANES_LR2_ER2_FR2 2
+#define __LINK_MODE_LANES_LR4_ER4_FR4 4
+#define __LINK_MODE_LANES_LR8_ER8_FR8 8
+#define __LINK_MODE_LANES_LRM 1
+#define __LINK_MODE_LANES_MLD2 2
+#define __LINK_MODE_LANES_T 1
+#define __LINK_MODE_LANES_T1 1
+#define __LINK_MODE_LANES_X 1
+#define __LINK_MODE_LANES_FX 1
+#define __LINK_MODE_LANES_T1L 1
+#define __LINK_MODE_LANES_T1S 1
+#define __LINK_MODE_LANES_T1S_P2MP 1
+#define __LINK_MODE_LANES_VR 1
+#define __LINK_MODE_LANES_VR2 2
+#define __LINK_MODE_LANES_VR4 4
+#define __LINK_MODE_LANES_VR8 8
+#define __LINK_MODE_LANES_DR8_2 8
+#define __LINK_MODE_LANES_T1BRR 1
+
+#define __DEFINE_LINK_MODE_PARAMS(_speed, _type, _duplex) \
+ [ETHTOOL_LINK_MODE(_speed, _type, _duplex)] = { \
+ .speed = SPEED_ ## _speed, \
+ .lanes = __LINK_MODE_LANES_ ## _type, \
+ .duplex = __DUPLEX_ ## _duplex \
+ }
+#define __DUPLEX_Half DUPLEX_HALF
+#define __DUPLEX_Full DUPLEX_FULL
+#define __DEFINE_SPECIAL_MODE_PARAMS(_mode) \
+ [ETHTOOL_LINK_MODE_ ## _mode ## _BIT] = { \
+ .speed = SPEED_UNKNOWN, \
+ .lanes = 0, \
+ .duplex = DUPLEX_UNKNOWN, \
+ }
+
+const struct link_mode_info link_mode_params[] = {
+ __DEFINE_LINK_MODE_PARAMS(10, T, Half),
+ __DEFINE_LINK_MODE_PARAMS(10, T, Full),
+ __DEFINE_LINK_MODE_PARAMS(100, T, Half),
+ __DEFINE_LINK_MODE_PARAMS(100, T, Full),
+ __DEFINE_LINK_MODE_PARAMS(1000, T, Half),
+ __DEFINE_LINK_MODE_PARAMS(1000, T, Full),
+ __DEFINE_SPECIAL_MODE_PARAMS(Autoneg),
+ __DEFINE_SPECIAL_MODE_PARAMS(TP),
+ __DEFINE_SPECIAL_MODE_PARAMS(AUI),
+ __DEFINE_SPECIAL_MODE_PARAMS(MII),
+ __DEFINE_SPECIAL_MODE_PARAMS(FIBRE),
+ __DEFINE_SPECIAL_MODE_PARAMS(BNC),
+ __DEFINE_LINK_MODE_PARAMS(10000, T, Full),
+ __DEFINE_SPECIAL_MODE_PARAMS(Pause),
+ __DEFINE_SPECIAL_MODE_PARAMS(Asym_Pause),
+ __DEFINE_LINK_MODE_PARAMS(2500, X, Full),
+ __DEFINE_SPECIAL_MODE_PARAMS(Backplane),
+ __DEFINE_LINK_MODE_PARAMS(1000, KX, Full),
+ __DEFINE_LINK_MODE_PARAMS(10000, KX4, Full),
+ __DEFINE_LINK_MODE_PARAMS(10000, KR, Full),
+ [ETHTOOL_LINK_MODE_10000baseR_FEC_BIT] = {
+ .speed = SPEED_10000,
+ .lanes = 1,
+ .duplex = DUPLEX_FULL,
+ },
+ __DEFINE_LINK_MODE_PARAMS(20000, MLD2, Full),
+ __DEFINE_LINK_MODE_PARAMS(20000, KR2, Full),
+ __DEFINE_LINK_MODE_PARAMS(40000, KR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(40000, CR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(40000, SR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(40000, LR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(56000, KR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(56000, CR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(56000, SR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(56000, LR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(25000, CR, Full),
+ __DEFINE_LINK_MODE_PARAMS(25000, KR, Full),
+ __DEFINE_LINK_MODE_PARAMS(25000, SR, Full),
+ __DEFINE_LINK_MODE_PARAMS(50000, CR2, Full),
+ __DEFINE_LINK_MODE_PARAMS(50000, KR2, Full),
+ __DEFINE_LINK_MODE_PARAMS(100000, KR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(100000, SR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(100000, CR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(100000, LR4_ER4, Full),
+ __DEFINE_LINK_MODE_PARAMS(50000, SR2, Full),
+ __DEFINE_LINK_MODE_PARAMS(1000, X, Full),
+ __DEFINE_LINK_MODE_PARAMS(10000, CR, Full),
+ __DEFINE_LINK_MODE_PARAMS(10000, SR, Full),
+ __DEFINE_LINK_MODE_PARAMS(10000, LR, Full),
+ __DEFINE_LINK_MODE_PARAMS(10000, LRM, Full),
+ __DEFINE_LINK_MODE_PARAMS(10000, ER, Full),
+ __DEFINE_LINK_MODE_PARAMS(2500, T, Full),
+ __DEFINE_LINK_MODE_PARAMS(5000, T, Full),
+ __DEFINE_SPECIAL_MODE_PARAMS(FEC_NONE),
+ __DEFINE_SPECIAL_MODE_PARAMS(FEC_RS),
+ __DEFINE_SPECIAL_MODE_PARAMS(FEC_BASER),
+ __DEFINE_LINK_MODE_PARAMS(50000, KR, Full),
+ __DEFINE_LINK_MODE_PARAMS(50000, SR, Full),
+ __DEFINE_LINK_MODE_PARAMS(50000, CR, Full),
+ __DEFINE_LINK_MODE_PARAMS(50000, LR_ER_FR, Full),
+ __DEFINE_LINK_MODE_PARAMS(50000, DR, Full),
+ __DEFINE_LINK_MODE_PARAMS(100000, KR2, Full),
+ __DEFINE_LINK_MODE_PARAMS(100000, SR2, Full),
+ __DEFINE_LINK_MODE_PARAMS(100000, CR2, Full),
+ __DEFINE_LINK_MODE_PARAMS(100000, LR2_ER2_FR2, Full),
+ __DEFINE_LINK_MODE_PARAMS(100000, DR2, Full),
+ __DEFINE_LINK_MODE_PARAMS(200000, KR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(200000, SR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(200000, LR4_ER4_FR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(200000, DR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(200000, CR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(100, T1, Full),
+ __DEFINE_LINK_MODE_PARAMS(1000, T1, Full),
+ __DEFINE_LINK_MODE_PARAMS(400000, KR8, Full),
+ __DEFINE_LINK_MODE_PARAMS(400000, SR8, Full),
+ __DEFINE_LINK_MODE_PARAMS(400000, LR8_ER8_FR8, Full),
+ __DEFINE_LINK_MODE_PARAMS(400000, DR8, Full),
+ __DEFINE_LINK_MODE_PARAMS(400000, CR8, Full),
+ __DEFINE_SPECIAL_MODE_PARAMS(FEC_LLRS),
+ __DEFINE_LINK_MODE_PARAMS(100000, KR, Full),
+ __DEFINE_LINK_MODE_PARAMS(100000, SR, Full),
+ __DEFINE_LINK_MODE_PARAMS(100000, LR_ER_FR, Full),
+ __DEFINE_LINK_MODE_PARAMS(100000, DR, Full),
+ __DEFINE_LINK_MODE_PARAMS(100000, CR, Full),
+ __DEFINE_LINK_MODE_PARAMS(200000, KR2, Full),
+ __DEFINE_LINK_MODE_PARAMS(200000, SR2, Full),
+ __DEFINE_LINK_MODE_PARAMS(200000, LR2_ER2_FR2, Full),
+ __DEFINE_LINK_MODE_PARAMS(200000, DR2, Full),
+ __DEFINE_LINK_MODE_PARAMS(200000, CR2, Full),
+ __DEFINE_LINK_MODE_PARAMS(400000, KR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(400000, SR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(400000, LR4_ER4_FR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(400000, DR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(400000, CR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(100, FX, Half),
+ __DEFINE_LINK_MODE_PARAMS(100, FX, Full),
+ __DEFINE_LINK_MODE_PARAMS(10, T1L, Full),
+ __DEFINE_LINK_MODE_PARAMS(800000, CR8, Full),
+ __DEFINE_LINK_MODE_PARAMS(800000, KR8, Full),
+ __DEFINE_LINK_MODE_PARAMS(800000, DR8, Full),
+ __DEFINE_LINK_MODE_PARAMS(800000, DR8_2, Full),
+ __DEFINE_LINK_MODE_PARAMS(800000, SR8, Full),
+ __DEFINE_LINK_MODE_PARAMS(800000, VR8, Full),
+ __DEFINE_LINK_MODE_PARAMS(10, T1S, Full),
+ __DEFINE_LINK_MODE_PARAMS(10, T1S, Half),
+ __DEFINE_LINK_MODE_PARAMS(10, T1S_P2MP, Half),
+ __DEFINE_LINK_MODE_PARAMS(10, T1BRR, Full),
+ __DEFINE_LINK_MODE_PARAMS(200000, CR, Full),
+ __DEFINE_LINK_MODE_PARAMS(200000, KR, Full),
+ __DEFINE_LINK_MODE_PARAMS(200000, DR, Full),
+ __DEFINE_LINK_MODE_PARAMS(200000, DR_2, Full),
+ __DEFINE_LINK_MODE_PARAMS(200000, SR, Full),
+ __DEFINE_LINK_MODE_PARAMS(200000, VR, Full),
+ __DEFINE_LINK_MODE_PARAMS(400000, CR2, Full),
+ __DEFINE_LINK_MODE_PARAMS(400000, KR2, Full),
+ __DEFINE_LINK_MODE_PARAMS(400000, DR2, Full),
+ __DEFINE_LINK_MODE_PARAMS(400000, DR2_2, Full),
+ __DEFINE_LINK_MODE_PARAMS(400000, SR2, Full),
+ __DEFINE_LINK_MODE_PARAMS(400000, VR2, Full),
+ __DEFINE_LINK_MODE_PARAMS(800000, CR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(800000, KR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(800000, DR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(800000, DR4_2, Full),
+ __DEFINE_LINK_MODE_PARAMS(800000, SR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(800000, VR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(1600000, CR8, Full),
+ __DEFINE_LINK_MODE_PARAMS(1600000, KR8, Full),
+ __DEFINE_LINK_MODE_PARAMS(1600000, DR8, Full),
+ __DEFINE_LINK_MODE_PARAMS(1600000, DR8_2, Full),
+};
+static_assert(ARRAY_SIZE(link_mode_params) == __ETHTOOL_LINK_MODE_MASK_NBITS);
+EXPORT_SYMBOL_GPL(link_mode_params);
+
+const char netif_msg_class_names[][ETH_GSTRING_LEN] = {
+ [NETIF_MSG_DRV_BIT] = "drv",
+ [NETIF_MSG_PROBE_BIT] = "probe",
+ [NETIF_MSG_LINK_BIT] = "link",
+ [NETIF_MSG_TIMER_BIT] = "timer",
+ [NETIF_MSG_IFDOWN_BIT] = "ifdown",
+ [NETIF_MSG_IFUP_BIT] = "ifup",
+ [NETIF_MSG_RX_ERR_BIT] = "rx_err",
+ [NETIF_MSG_TX_ERR_BIT] = "tx_err",
+ [NETIF_MSG_TX_QUEUED_BIT] = "tx_queued",
+ [NETIF_MSG_INTR_BIT] = "intr",
+ [NETIF_MSG_TX_DONE_BIT] = "tx_done",
+ [NETIF_MSG_RX_STATUS_BIT] = "rx_status",
+ [NETIF_MSG_PKTDATA_BIT] = "pktdata",
+ [NETIF_MSG_HW_BIT] = "hw",
+ [NETIF_MSG_WOL_BIT] = "wol",
+};
+static_assert(ARRAY_SIZE(netif_msg_class_names) == NETIF_MSG_CLASS_COUNT);
+
+const char wol_mode_names[][ETH_GSTRING_LEN] = {
+ [const_ilog2(WAKE_PHY)] = "phy",
+ [const_ilog2(WAKE_UCAST)] = "ucast",
+ [const_ilog2(WAKE_MCAST)] = "mcast",
+ [const_ilog2(WAKE_BCAST)] = "bcast",
+ [const_ilog2(WAKE_ARP)] = "arp",
+ [const_ilog2(WAKE_MAGIC)] = "magic",
+ [const_ilog2(WAKE_MAGICSECURE)] = "magicsecure",
+ [const_ilog2(WAKE_FILTER)] = "filter",
+};
+static_assert(ARRAY_SIZE(wol_mode_names) == WOL_MODE_COUNT);
+
+const char sof_timestamping_names[][ETH_GSTRING_LEN] = {
+ [const_ilog2(SOF_TIMESTAMPING_TX_HARDWARE)] = "hardware-transmit",
+ [const_ilog2(SOF_TIMESTAMPING_TX_SOFTWARE)] = "software-transmit",
+ [const_ilog2(SOF_TIMESTAMPING_RX_HARDWARE)] = "hardware-receive",
+ [const_ilog2(SOF_TIMESTAMPING_RX_SOFTWARE)] = "software-receive",
+ [const_ilog2(SOF_TIMESTAMPING_SOFTWARE)] = "software-system-clock",
+ [const_ilog2(SOF_TIMESTAMPING_SYS_HARDWARE)] = "hardware-legacy-clock",
+ [const_ilog2(SOF_TIMESTAMPING_RAW_HARDWARE)] = "hardware-raw-clock",
+ [const_ilog2(SOF_TIMESTAMPING_OPT_ID)] = "option-id",
+ [const_ilog2(SOF_TIMESTAMPING_TX_SCHED)] = "sched-transmit",
+ [const_ilog2(SOF_TIMESTAMPING_TX_ACK)] = "ack-transmit",
+ [const_ilog2(SOF_TIMESTAMPING_OPT_CMSG)] = "option-cmsg",
+ [const_ilog2(SOF_TIMESTAMPING_OPT_TSONLY)] = "option-tsonly",
+ [const_ilog2(SOF_TIMESTAMPING_OPT_STATS)] = "option-stats",
+ [const_ilog2(SOF_TIMESTAMPING_OPT_PKTINFO)] = "option-pktinfo",
+ [const_ilog2(SOF_TIMESTAMPING_OPT_TX_SWHW)] = "option-tx-swhw",
+ [const_ilog2(SOF_TIMESTAMPING_BIND_PHC)] = "bind-phc",
+ [const_ilog2(SOF_TIMESTAMPING_OPT_ID_TCP)] = "option-id-tcp",
+ [const_ilog2(SOF_TIMESTAMPING_OPT_RX_FILTER)] = "option-rx-filter",
+ [const_ilog2(SOF_TIMESTAMPING_TX_COMPLETION)] = "tx-completion",
+};
+static_assert(ARRAY_SIZE(sof_timestamping_names) == __SOF_TIMESTAMPING_CNT);
+
+const char ts_tx_type_names[][ETH_GSTRING_LEN] = {
+ [HWTSTAMP_TX_OFF] = "off",
+ [HWTSTAMP_TX_ON] = "on",
+ [HWTSTAMP_TX_ONESTEP_SYNC] = "onestep-sync",
+ [HWTSTAMP_TX_ONESTEP_P2P] = "onestep-p2p",
+};
+static_assert(ARRAY_SIZE(ts_tx_type_names) == __HWTSTAMP_TX_CNT);
+
+const char ts_rx_filter_names[][ETH_GSTRING_LEN] = {
+ [HWTSTAMP_FILTER_NONE] = "none",
+ [HWTSTAMP_FILTER_ALL] = "all",
+ [HWTSTAMP_FILTER_SOME] = "some",
+ [HWTSTAMP_FILTER_PTP_V1_L4_EVENT] = "ptpv1-l4-event",
+ [HWTSTAMP_FILTER_PTP_V1_L4_SYNC] = "ptpv1-l4-sync",
+ [HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ] = "ptpv1-l4-delay-req",
+ [HWTSTAMP_FILTER_PTP_V2_L4_EVENT] = "ptpv2-l4-event",
+ [HWTSTAMP_FILTER_PTP_V2_L4_SYNC] = "ptpv2-l4-sync",
+ [HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ] = "ptpv2-l4-delay-req",
+ [HWTSTAMP_FILTER_PTP_V2_L2_EVENT] = "ptpv2-l2-event",
+ [HWTSTAMP_FILTER_PTP_V2_L2_SYNC] = "ptpv2-l2-sync",
+ [HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ] = "ptpv2-l2-delay-req",
+ [HWTSTAMP_FILTER_PTP_V2_EVENT] = "ptpv2-event",
+ [HWTSTAMP_FILTER_PTP_V2_SYNC] = "ptpv2-sync",
+ [HWTSTAMP_FILTER_PTP_V2_DELAY_REQ] = "ptpv2-delay-req",
+ [HWTSTAMP_FILTER_NTP_ALL] = "ntp-all",
+};
+static_assert(ARRAY_SIZE(ts_rx_filter_names) == __HWTSTAMP_FILTER_CNT);
+
+const char ts_flags_names[][ETH_GSTRING_LEN] = {
+ [const_ilog2(HWTSTAMP_FLAG_BONDED_PHC_INDEX)] = "bonded-phc-index",
+};
+static_assert(ARRAY_SIZE(ts_flags_names) == __HWTSTAMP_FLAG_CNT);
+
+const char udp_tunnel_type_names[][ETH_GSTRING_LEN] = {
+ [ETHTOOL_UDP_TUNNEL_TYPE_VXLAN] = "vxlan",
+ [ETHTOOL_UDP_TUNNEL_TYPE_GENEVE] = "geneve",
+ [ETHTOOL_UDP_TUNNEL_TYPE_VXLAN_GPE] = "vxlan-gpe",
+};
+static_assert(ARRAY_SIZE(udp_tunnel_type_names) ==
+ __ETHTOOL_UDP_TUNNEL_TYPE_CNT);
+
+/* return false if legacy contained non-0 deprecated fields
+ * maxtxpkt/maxrxpkt. rest of ksettings always updated
+ */
+bool
+convert_legacy_settings_to_link_ksettings(
+ struct ethtool_link_ksettings *link_ksettings,
+ const struct ethtool_cmd *legacy_settings)
+{
+ bool retval = true;
+
+ memset(link_ksettings, 0, sizeof(*link_ksettings));
+
+ /* This is used to tell users that driver is still using these
+ * deprecated legacy fields, and they should not use
+ * %ETHTOOL_GLINKSETTINGS/%ETHTOOL_SLINKSETTINGS
+ */
+ if (legacy_settings->maxtxpkt ||
+ legacy_settings->maxrxpkt)
+ retval = false;
+
+ ethtool_convert_legacy_u32_to_link_mode(
+ link_ksettings->link_modes.supported,
+ legacy_settings->supported);
+ ethtool_convert_legacy_u32_to_link_mode(
+ link_ksettings->link_modes.advertising,
+ legacy_settings->advertising);
+ ethtool_convert_legacy_u32_to_link_mode(
+ link_ksettings->link_modes.lp_advertising,
+ legacy_settings->lp_advertising);
+ link_ksettings->base.speed
+ = ethtool_cmd_speed(legacy_settings);
+ link_ksettings->base.duplex
+ = legacy_settings->duplex;
+ link_ksettings->base.port
+ = legacy_settings->port;
+ link_ksettings->base.phy_address
+ = legacy_settings->phy_address;
+ link_ksettings->base.autoneg
+ = legacy_settings->autoneg;
+ link_ksettings->base.mdio_support
+ = legacy_settings->mdio_support;
+ link_ksettings->base.eth_tp_mdix
+ = legacy_settings->eth_tp_mdix;
+ link_ksettings->base.eth_tp_mdix_ctrl
+ = legacy_settings->eth_tp_mdix_ctrl;
+ return retval;
+}
+
+int __ethtool_get_link(struct net_device *dev)
+{
+ if (!dev->ethtool_ops->get_link)
+ return -EOPNOTSUPP;
+
+ return netif_running(dev) && dev->ethtool_ops->get_link(dev);
+}
+
+int ethtool_get_rx_ring_count(struct net_device *dev)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct ethtool_rxnfc rx_rings = {};
+ int ret;
+
+ if (ops->get_rx_ring_count)
+ return ops->get_rx_ring_count(dev);
+
+ if (!ops->get_rxnfc)
+ return -EOPNOTSUPP;
+
+ rx_rings.cmd = ETHTOOL_GRXRINGS;
+ ret = ops->get_rxnfc(dev, &rx_rings, NULL);
+ if (ret < 0)
+ return ret;
+
+ return rx_rings.data;
+}
+
+static int ethtool_get_rxnfc_rule_count(struct net_device *dev)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct ethtool_rxnfc info = {
+ .cmd = ETHTOOL_GRXCLSRLCNT,
+ };
+ int err;
+
+ err = ops->get_rxnfc(dev, &info, NULL);
+ if (err)
+ return err;
+
+ return info.rule_cnt;
+}
+
+/* Max offset for one RSS context */
+static u32 ethtool_get_rss_ctx_max_channel(struct ethtool_rxfh_context *ctx)
+{
+ u32 max_ring = 0;
+ u32 i, *tbl;
+
+ if (WARN_ON_ONCE(!ctx))
+ return 0;
+ tbl = ethtool_rxfh_context_indir(ctx);
+ for (i = 0; i < ctx->indir_size; i++)
+ max_ring = max(max_ring, tbl[i]);
+ return max_ring;
+}
+
+static int ethtool_get_max_rxnfc_channel(struct net_device *dev, u64 *max)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct ethtool_rxnfc *info;
+ int err, i, rule_cnt;
+ u64 max_ring = 0;
+
+ if (!ops->get_rxnfc)
+ return -EOPNOTSUPP;
+
+ rule_cnt = ethtool_get_rxnfc_rule_count(dev);
+ if (rule_cnt <= 0)
+ return -EINVAL;
+
+ info = kvzalloc(struct_size(info, rule_locs, rule_cnt), GFP_KERNEL);
+ if (!info)
+ return -ENOMEM;
+
+ info->cmd = ETHTOOL_GRXCLSRLALL;
+ info->rule_cnt = rule_cnt;
+ err = ops->get_rxnfc(dev, info, info->rule_locs);
+ if (err)
+ goto err_free_info;
+
+ for (i = 0; i < rule_cnt; i++) {
+ struct ethtool_rxnfc rule_info = {
+ .cmd = ETHTOOL_GRXCLSRULE,
+ .fs.location = info->rule_locs[i],
+ };
+
+ err = ops->get_rxnfc(dev, &rule_info, NULL);
+ if (err)
+ goto err_free_info;
+
+ if (rule_info.fs.ring_cookie != RX_CLS_FLOW_DISC &&
+ rule_info.fs.ring_cookie != RX_CLS_FLOW_WAKE &&
+ !ethtool_get_flow_spec_ring_vf(rule_info.fs.ring_cookie)) {
+ u64 ring = rule_info.fs.ring_cookie;
+
+ if (rule_info.flow_type & FLOW_RSS) {
+ struct ethtool_rxfh_context *ctx;
+
+ ctx = xa_load(&dev->ethtool->rss_ctx,
+ rule_info.rss_context);
+ ring += ethtool_get_rss_ctx_max_channel(ctx);
+ }
+ max_ring = max_t(u64, max_ring, ring);
+ }
+ }
+
+ kvfree(info);
+ *max = max_ring;
+ return 0;
+
+err_free_info:
+ kvfree(info);
+ return err;
+}
+
+/* Max offset across all of a device's RSS contexts */
+static u32 ethtool_get_max_rss_ctx_channel(struct net_device *dev)
+{
+ struct ethtool_rxfh_context *ctx;
+ unsigned long context;
+ u32 max_ring = 0;
+
+ mutex_lock(&dev->ethtool->rss_lock);
+ xa_for_each(&dev->ethtool->rss_ctx, context, ctx)
+ max_ring = max(max_ring, ethtool_get_rss_ctx_max_channel(ctx));
+ mutex_unlock(&dev->ethtool->rss_lock);
+
+ return max_ring;
+}
+
+static u32 ethtool_get_max_rxfh_channel(struct net_device *dev)
+{
+ struct ethtool_rxfh_param rxfh = {};
+ u32 dev_size, current_max = 0;
+ int ret;
+
+ /* While we do track whether RSS context has an indirection
+ * table explicitly set by the user, no driver looks at that bit.
+ * Assume drivers won't auto-regenerate the additional tables,
+ * to be safe.
+ */
+ current_max = ethtool_get_max_rss_ctx_channel(dev);
+
+ if (!netif_is_rxfh_configured(dev))
+ return current_max;
+
+ if (!dev->ethtool_ops->get_rxfh_indir_size ||
+ !dev->ethtool_ops->get_rxfh)
+ return current_max;
+ dev_size = dev->ethtool_ops->get_rxfh_indir_size(dev);
+ if (dev_size == 0)
+ return current_max;
+
+ rxfh.indir = kcalloc(dev_size, sizeof(rxfh.indir[0]), GFP_USER);
+ if (!rxfh.indir)
+ return U32_MAX;
+
+ mutex_lock(&dev->ethtool->rss_lock);
+ ret = dev->ethtool_ops->get_rxfh(dev, &rxfh);
+ mutex_unlock(&dev->ethtool->rss_lock);
+ if (ret) {
+ current_max = U32_MAX;
+ goto out_free;
+ }
+
+ while (dev_size--)
+ current_max = max(current_max, rxfh.indir[dev_size]);
+
+out_free:
+ kfree(rxfh.indir);
+ return current_max;
+}
+
+int ethtool_check_max_channel(struct net_device *dev,
+ struct ethtool_channels channels,
+ struct genl_info *info)
+{
+ u64 max_rxnfc_in_use;
+ u32 max_rxfh_in_use;
+ int max_mp_in_use;
+
+ /* ensure the new Rx count fits within the configured Rx flow
+ * indirection table/rxnfc settings
+ */
+ if (ethtool_get_max_rxnfc_channel(dev, &max_rxnfc_in_use))
+ max_rxnfc_in_use = 0;
+ max_rxfh_in_use = ethtool_get_max_rxfh_channel(dev);
+ if (channels.combined_count + channels.rx_count <= max_rxfh_in_use) {
+ if (info)
+ GENL_SET_ERR_MSG_FMT(info, "requested channel counts are too low for existing indirection table (%d)", max_rxfh_in_use);
+ return -EINVAL;
+ }
+ if (channels.combined_count + channels.rx_count <= max_rxnfc_in_use) {
+ if (info)
+ GENL_SET_ERR_MSG(info, "requested channel counts are too low for existing ntuple filter settings");
+ return -EINVAL;
+ }
+
+ max_mp_in_use = dev_get_min_mp_channel_count(dev);
+ if (channels.combined_count + channels.rx_count <= max_mp_in_use) {
+ if (info)
+ GENL_SET_ERR_MSG_FMT(info, "requested channel counts are too low for existing memory provider setting (%d)", max_mp_in_use);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+int ethtool_check_rss_ctx_busy(struct net_device *dev, u32 rss_context)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct ethtool_rxnfc *info;
+ int rc, i, rule_cnt;
+
+ if (!ops->get_rxnfc)
+ return 0;
+
+ rule_cnt = ethtool_get_rxnfc_rule_count(dev);
+ if (!rule_cnt)
+ return 0;
+
+ if (rule_cnt < 0)
+ return -EINVAL;
+
+ info = kvzalloc(struct_size(info, rule_locs, rule_cnt), GFP_KERNEL);
+ if (!info)
+ return -ENOMEM;
+
+ info->cmd = ETHTOOL_GRXCLSRLALL;
+ info->rule_cnt = rule_cnt;
+ rc = ops->get_rxnfc(dev, info, info->rule_locs);
+ if (rc)
+ goto out_free;
+
+ for (i = 0; i < rule_cnt; i++) {
+ struct ethtool_rxnfc rule_info = {
+ .cmd = ETHTOOL_GRXCLSRULE,
+ .fs.location = info->rule_locs[i],
+ };
+
+ rc = ops->get_rxnfc(dev, &rule_info, NULL);
+ if (rc)
+ goto out_free;
+
+ if (rule_info.fs.flow_type & FLOW_RSS &&
+ rule_info.rss_context == rss_context) {
+ rc = -EBUSY;
+ goto out_free;
+ }
+ }
+
+out_free:
+ kvfree(info);
+ return rc;
+}
+
+struct ethtool_rxfh_context *
+ethtool_rxfh_ctx_alloc(const struct ethtool_ops *ops,
+ u32 indir_size, u32 key_size)
+{
+ size_t indir_bytes, flex_len, key_off, size;
+ struct ethtool_rxfh_context *ctx;
+ u32 priv_bytes, indir_max;
+ u16 key_max;
+
+ key_max = max(key_size, ops->rxfh_key_space);
+ indir_max = max(indir_size, ops->rxfh_indir_space);
+
+ priv_bytes = ALIGN(ops->rxfh_priv_size, sizeof(u32));
+ indir_bytes = array_size(indir_max, sizeof(u32));
+
+ key_off = size_add(priv_bytes, indir_bytes);
+ flex_len = size_add(key_off, key_max);
+ size = struct_size_t(struct ethtool_rxfh_context, data, flex_len);
+
+ ctx = kzalloc(size, GFP_KERNEL_ACCOUNT);
+ if (!ctx)
+ return NULL;
+
+ ctx->indir_size = indir_size;
+ ctx->key_size = key_size;
+ ctx->key_off = key_off;
+ ctx->priv_size = ops->rxfh_priv_size;
+
+ ctx->hfunc = ETH_RSS_HASH_NO_CHANGE;
+ ctx->input_xfrm = RXH_XFRM_NO_CHANGE;
+
+ return ctx;
+}
+
+/* Check if fields configured for flow hash are symmetric - if src is included
+ * so is dst and vice versa.
+ */
+int ethtool_rxfh_config_is_sym(u64 rxfh)
+{
+ bool sym;
+
+ sym = rxfh == (rxfh & (RXH_IP_SRC | RXH_IP_DST |
+ RXH_L4_B_0_1 | RXH_L4_B_2_3));
+ sym &= !!(rxfh & RXH_IP_SRC) == !!(rxfh & RXH_IP_DST);
+ sym &= !!(rxfh & RXH_L4_B_0_1) == !!(rxfh & RXH_L4_B_2_3);
+
+ return sym;
+}
+
+int ethtool_check_ops(const struct ethtool_ops *ops)
+{
+ if (WARN_ON(ops->set_coalesce && !ops->supported_coalesce_params))
+ return -EINVAL;
+ if (WARN_ON(ops->rxfh_max_num_contexts == 1))
+ return -EINVAL;
+ if (WARN_ON(ops->supported_input_xfrm && !ops->get_rxfh_fields))
+ return -EINVAL;
+ if (WARN_ON(ops->supported_input_xfrm &&
+ ops->rxfh_per_ctx_fields != ops->rxfh_per_ctx_key))
+ return -EINVAL;
+
+ /* NOTE: sufficiently insane drivers may swap ethtool_ops at runtime,
+ * the fact that ops are checked at registration time does not
+ * mean the ops attached to a netdev later on are sane.
+ */
+ return 0;
+}
+
+void ethtool_ringparam_get_cfg(struct net_device *dev,
+ struct ethtool_ringparam *param,
+ struct kernel_ethtool_ringparam *kparam,
+ struct netlink_ext_ack *extack)
+{
+ memset(param, 0, sizeof(*param));
+ memset(kparam, 0, sizeof(*kparam));
+
+ param->cmd = ETHTOOL_GRINGPARAM;
+ dev->ethtool_ops->get_ringparam(dev, param, kparam, extack);
+
+ /* Driver gives us current state, we want to return current config */
+ kparam->tcp_data_split = dev->cfg->hds_config;
+ kparam->hds_thresh = dev->cfg->hds_thresh;
+}
+
+static void ethtool_init_tsinfo(struct kernel_ethtool_ts_info *info)
+{
+ memset(info, 0, sizeof(*info));
+ info->cmd = ETHTOOL_GET_TS_INFO;
+ info->phc_index = -1;
+}
+
+int ethtool_net_get_ts_info_by_phc(struct net_device *dev,
+ struct kernel_ethtool_ts_info *info,
+ struct hwtstamp_provider_desc *hwprov_desc)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ int err;
+
+ if (!ops->get_ts_info)
+ return -EOPNOTSUPP;
+
+ /* Does ptp comes from netdev */
+ ethtool_init_tsinfo(info);
+ info->phc_qualifier = hwprov_desc->qualifier;
+ err = ops->get_ts_info(dev, info);
+ if (err)
+ return err;
+
+ if (info->phc_index == hwprov_desc->index &&
+ net_support_hwtstamp_qualifier(dev, hwprov_desc->qualifier))
+ return 0;
+
+ return -ENODEV;
+}
+
+struct phy_device *
+ethtool_phy_get_ts_info_by_phc(struct net_device *dev,
+ struct kernel_ethtool_ts_info *info,
+ struct hwtstamp_provider_desc *hwprov_desc)
+{
+ int err;
+
+ /* Only precise qualifier is supported in phydev */
+ if (hwprov_desc->qualifier != HWTSTAMP_PROVIDER_QUALIFIER_PRECISE)
+ return ERR_PTR(-ENODEV);
+
+ /* Look in the phy topology */
+ if (dev->link_topo) {
+ struct phy_device_node *pdn;
+ unsigned long phy_index;
+
+ xa_for_each(&dev->link_topo->phys, phy_index, pdn) {
+ if (!phy_has_tsinfo(pdn->phy))
+ continue;
+
+ ethtool_init_tsinfo(info);
+ err = phy_ts_info(pdn->phy, info);
+ if (err)
+ return ERR_PTR(err);
+
+ if (info->phc_index == hwprov_desc->index)
+ return pdn->phy;
+ }
+ return ERR_PTR(-ENODEV);
+ }
+
+ /* Look on the dev->phydev */
+ if (phy_has_tsinfo(dev->phydev)) {
+ ethtool_init_tsinfo(info);
+ err = phy_ts_info(dev->phydev, info);
+ if (err)
+ return ERR_PTR(err);
+
+ if (info->phc_index == hwprov_desc->index)
+ return dev->phydev;
+ }
+
+ return ERR_PTR(-ENODEV);
+}
+
+int ethtool_get_ts_info_by_phc(struct net_device *dev,
+ struct kernel_ethtool_ts_info *info,
+ struct hwtstamp_provider_desc *hwprov_desc)
+{
+ int err;
+
+ err = ethtool_net_get_ts_info_by_phc(dev, info, hwprov_desc);
+ if (err == -ENODEV || err == -EOPNOTSUPP) {
+ struct phy_device *phy;
+
+ phy = ethtool_phy_get_ts_info_by_phc(dev, info, hwprov_desc);
+ if (IS_ERR(phy))
+ return PTR_ERR(phy);
+
+ /* Report the phc source only if we have a real
+ * phc source with an index.
+ */
+ if (info->phc_index >= 0) {
+ info->phc_source = HWTSTAMP_SOURCE_PHYLIB;
+ info->phc_phyindex = phy->phyindex;
+ }
+ err = 0;
+ } else if (!err && info->phc_index >= 0) {
+ info->phc_source = HWTSTAMP_SOURCE_NETDEV;
+ }
+
+ info->so_timestamping |= SOF_TIMESTAMPING_RX_SOFTWARE |
+ SOF_TIMESTAMPING_SOFTWARE;
+
+ return err;
+}
+
+int __ethtool_get_ts_info(struct net_device *dev,
+ struct kernel_ethtool_ts_info *info)
+{
+ struct hwtstamp_provider *hwprov;
+ int err = 0;
+
+ rcu_read_lock();
+ hwprov = rcu_dereference(dev->hwprov);
+ /* No provider specified, use default behavior */
+ if (!hwprov) {
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct phy_device *phydev = dev->phydev;
+
+ ethtool_init_tsinfo(info);
+ if (phy_is_default_hwtstamp(phydev) &&
+ phy_has_tsinfo(phydev)) {
+ err = phy_ts_info(phydev, info);
+ /* Report the phc source only if we have a real
+ * phc source with an index.
+ */
+ if (!err && info->phc_index >= 0) {
+ info->phc_source = HWTSTAMP_SOURCE_PHYLIB;
+ info->phc_phyindex = phydev->phyindex;
+ }
+ } else if (ops->get_ts_info) {
+ err = ops->get_ts_info(dev, info);
+ if (!err && info->phc_index >= 0)
+ info->phc_source = HWTSTAMP_SOURCE_NETDEV;
+ }
+
+ info->so_timestamping |= SOF_TIMESTAMPING_RX_SOFTWARE |
+ SOF_TIMESTAMPING_SOFTWARE;
+
+ rcu_read_unlock();
+ return err;
+ }
+
+ err = ethtool_get_ts_info_by_phc(dev, info, &hwprov->desc);
+ rcu_read_unlock();
+ return err;
+}
+
+bool net_support_hwtstamp_qualifier(struct net_device *dev,
+ enum hwtstamp_provider_qualifier qualifier)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+
+ if (!ops)
+ return false;
+
+ /* Return true with precise qualifier and with NIC without
+ * qualifier description to not break the old behavior.
+ */
+ if (!ops->supported_hwtstamp_qualifiers &&
+ qualifier == HWTSTAMP_PROVIDER_QUALIFIER_PRECISE)
+ return true;
+
+ if (ops->supported_hwtstamp_qualifiers & BIT(qualifier))
+ return true;
+
+ return false;
+}
+
+int ethtool_get_phc_vclocks(struct net_device *dev, int **vclock_index)
+{
+ struct kernel_ethtool_ts_info info = { };
+ int num = 0;
+
+ if (!__ethtool_get_ts_info(dev, &info))
+ num = ptp_get_vclocks_index(info.phc_index, vclock_index);
+
+ return num;
+}
+EXPORT_SYMBOL(ethtool_get_phc_vclocks);
+
+int ethtool_get_ts_info_by_layer(struct net_device *dev, struct kernel_ethtool_ts_info *info)
+{
+ return __ethtool_get_ts_info(dev, info);
+}
+EXPORT_SYMBOL(ethtool_get_ts_info_by_layer);
+
+const struct ethtool_phy_ops *ethtool_phy_ops;
+
+void ethtool_set_ethtool_phy_ops(const struct ethtool_phy_ops *ops)
+{
+ ASSERT_RTNL();
+ ethtool_phy_ops = ops;
+}
+EXPORT_SYMBOL_GPL(ethtool_set_ethtool_phy_ops);
+
+void
+ethtool_params_from_link_mode(struct ethtool_link_ksettings *link_ksettings,
+ enum ethtool_link_mode_bit_indices link_mode)
+{
+ const struct link_mode_info *link_info;
+
+ if (WARN_ON_ONCE(link_mode >= __ETHTOOL_LINK_MODE_MASK_NBITS))
+ return;
+
+ link_info = &link_mode_params[link_mode];
+ link_ksettings->base.speed = link_info->speed;
+ link_ksettings->lanes = link_info->lanes;
+ link_ksettings->base.duplex = link_info->duplex;
+}
+EXPORT_SYMBOL_GPL(ethtool_params_from_link_mode);
+
+/**
+ * ethtool_forced_speed_maps_init
+ * @maps: Pointer to an array of Ethtool forced speed map
+ * @size: Array size
+ *
+ * Initialize an array of Ethtool forced speed map to Ethtool link modes. This
+ * should be called during driver module init.
+ */
+void
+ethtool_forced_speed_maps_init(struct ethtool_forced_speed_map *maps, u32 size)
+{
+ for (u32 i = 0; i < size; i++) {
+ struct ethtool_forced_speed_map *map = &maps[i];
+
+ linkmode_set_bit_array(map->cap_arr, map->arr_size, map->caps);
+ map->cap_arr = NULL;
+ map->arr_size = 0;
+ }
+}
+EXPORT_SYMBOL_GPL(ethtool_forced_speed_maps_init);
+
+void ethtool_rxfh_context_lost(struct net_device *dev, u32 context_id)
+{
+ struct ethtool_rxfh_context *ctx;
+
+ WARN_ONCE(!rtnl_is_locked() &&
+ !lockdep_is_held_type(&dev->ethtool->rss_lock, -1),
+ "RSS context lock assertion failed\n");
+
+ netdev_err(dev, "device error, RSS context %d lost\n", context_id);
+ ctx = xa_erase(&dev->ethtool->rss_ctx, context_id);
+ kfree(ctx);
+ ethtool_rss_notify(dev, ETHTOOL_MSG_RSS_DELETE_NTF, context_id);
+}
+EXPORT_SYMBOL(ethtool_rxfh_context_lost);
diff --git a/net/ethtool/common.h b/net/ethtool/common.h
new file mode 100644
index 000000000000..1609cf4e53eb
--- /dev/null
+++ b/net/ethtool/common.h
@@ -0,0 +1,92 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef _ETHTOOL_COMMON_H
+#define _ETHTOOL_COMMON_H
+
+#include <linux/netdevice.h>
+#include <linux/ethtool.h>
+
+#define ETHTOOL_DEV_FEATURE_WORDS DIV_ROUND_UP(NETDEV_FEATURE_COUNT, 32)
+
+/* compose link mode index from speed, type and duplex */
+#define ETHTOOL_LINK_MODE(speed, type, duplex) \
+ ETHTOOL_LINK_MODE_ ## speed ## base ## type ## _ ## duplex ## _BIT
+
+#define __SOF_TIMESTAMPING_CNT (const_ilog2(SOF_TIMESTAMPING_LAST) + 1)
+#define __HWTSTAMP_FLAG_CNT (const_ilog2(HWTSTAMP_FLAG_LAST) + 1)
+
+struct genl_info;
+struct hwtstamp_provider_desc;
+
+extern const char
+netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN];
+extern const char
+rss_hash_func_strings[ETH_RSS_HASH_FUNCS_COUNT][ETH_GSTRING_LEN];
+extern const char
+tunable_strings[__ETHTOOL_TUNABLE_COUNT][ETH_GSTRING_LEN];
+extern const char
+phy_tunable_strings[__ETHTOOL_PHY_TUNABLE_COUNT][ETH_GSTRING_LEN];
+extern const char link_mode_names[][ETH_GSTRING_LEN];
+extern const char netif_msg_class_names[][ETH_GSTRING_LEN];
+extern const char wol_mode_names[][ETH_GSTRING_LEN];
+extern const char sof_timestamping_names[][ETH_GSTRING_LEN];
+extern const char ts_tx_type_names[][ETH_GSTRING_LEN];
+extern const char ts_rx_filter_names[][ETH_GSTRING_LEN];
+extern const char ts_flags_names[][ETH_GSTRING_LEN];
+extern const char udp_tunnel_type_names[][ETH_GSTRING_LEN];
+
+int __ethtool_get_link(struct net_device *dev);
+
+bool convert_legacy_settings_to_link_ksettings(
+ struct ethtool_link_ksettings *link_ksettings,
+ const struct ethtool_cmd *legacy_settings);
+int ethtool_check_max_channel(struct net_device *dev,
+ struct ethtool_channels channels,
+ struct genl_info *info);
+struct ethtool_rxfh_context *
+ethtool_rxfh_ctx_alloc(const struct ethtool_ops *ops,
+ u32 indir_size, u32 key_size);
+int ethtool_check_rss_ctx_busy(struct net_device *dev, u32 rss_context);
+int ethtool_rxfh_config_is_sym(u64 rxfh);
+
+void ethtool_ringparam_get_cfg(struct net_device *dev,
+ struct ethtool_ringparam *param,
+ struct kernel_ethtool_ringparam *kparam,
+ struct netlink_ext_ack *extack);
+
+int ethtool_get_rx_ring_count(struct net_device *dev);
+
+int __ethtool_get_ts_info(struct net_device *dev, struct kernel_ethtool_ts_info *info);
+int ethtool_get_ts_info_by_phc(struct net_device *dev,
+ struct kernel_ethtool_ts_info *info,
+ struct hwtstamp_provider_desc *hwprov_desc);
+int ethtool_net_get_ts_info_by_phc(struct net_device *dev,
+ struct kernel_ethtool_ts_info *info,
+ struct hwtstamp_provider_desc *hwprov_desc);
+struct phy_device *
+ethtool_phy_get_ts_info_by_phc(struct net_device *dev,
+ struct kernel_ethtool_ts_info *info,
+ struct hwtstamp_provider_desc *hwprov_desc);
+bool net_support_hwtstamp_qualifier(struct net_device *dev,
+ enum hwtstamp_provider_qualifier qualifier);
+
+extern const struct ethtool_phy_ops *ethtool_phy_ops;
+extern const struct ethtool_pse_ops *ethtool_pse_ops;
+
+int ethtool_get_module_info_call(struct net_device *dev,
+ struct ethtool_modinfo *modinfo);
+int ethtool_get_module_eeprom_call(struct net_device *dev,
+ struct ethtool_eeprom *ee, u8 *data);
+
+bool __ethtool_dev_mm_supported(struct net_device *dev);
+
+#if IS_ENABLED(CONFIG_ETHTOOL_NETLINK)
+void ethtool_rss_notify(struct net_device *dev, u32 type, u32 rss_context);
+#else
+static inline void
+ethtool_rss_notify(struct net_device *dev, u32 type, u32 rss_context)
+{
+}
+#endif
+
+#endif /* _ETHTOOL_COMMON_H */
diff --git a/net/ethtool/debug.c b/net/ethtool/debug.c
new file mode 100644
index 000000000000..0b2dea56d461
--- /dev/null
+++ b/net/ethtool/debug.c
@@ -0,0 +1,117 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include "netlink.h"
+#include "common.h"
+#include "bitset.h"
+
+struct debug_req_info {
+ struct ethnl_req_info base;
+};
+
+struct debug_reply_data {
+ struct ethnl_reply_data base;
+ u32 msg_mask;
+};
+
+#define DEBUG_REPDATA(__reply_base) \
+ container_of(__reply_base, struct debug_reply_data, base)
+
+const struct nla_policy ethnl_debug_get_policy[] = {
+ [ETHTOOL_A_DEBUG_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
+};
+
+static int debug_prepare_data(const struct ethnl_req_info *req_base,
+ struct ethnl_reply_data *reply_base,
+ const struct genl_info *info)
+{
+ struct debug_reply_data *data = DEBUG_REPDATA(reply_base);
+ struct net_device *dev = reply_base->dev;
+ int ret;
+
+ if (!dev->ethtool_ops->get_msglevel)
+ return -EOPNOTSUPP;
+
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ return ret;
+ data->msg_mask = dev->ethtool_ops->get_msglevel(dev);
+ ethnl_ops_complete(dev);
+
+ return 0;
+}
+
+static int debug_reply_size(const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ const struct debug_reply_data *data = DEBUG_REPDATA(reply_base);
+ bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;
+
+ return ethnl_bitset32_size(&data->msg_mask, NULL, NETIF_MSG_CLASS_COUNT,
+ netif_msg_class_names, compact);
+}
+
+static int debug_fill_reply(struct sk_buff *skb,
+ const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ const struct debug_reply_data *data = DEBUG_REPDATA(reply_base);
+ bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;
+
+ return ethnl_put_bitset32(skb, ETHTOOL_A_DEBUG_MSGMASK, &data->msg_mask,
+ NULL, NETIF_MSG_CLASS_COUNT,
+ netif_msg_class_names, compact);
+}
+
+/* DEBUG_SET */
+
+const struct nla_policy ethnl_debug_set_policy[] = {
+ [ETHTOOL_A_DEBUG_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
+ [ETHTOOL_A_DEBUG_MSGMASK] = { .type = NLA_NESTED },
+};
+
+static int
+ethnl_set_debug_validate(struct ethnl_req_info *req_info,
+ struct genl_info *info)
+{
+ const struct ethtool_ops *ops = req_info->dev->ethtool_ops;
+
+ return ops->get_msglevel && ops->set_msglevel ? 1 : -EOPNOTSUPP;
+}
+
+static int
+ethnl_set_debug(struct ethnl_req_info *req_info, struct genl_info *info)
+{
+ struct net_device *dev = req_info->dev;
+ struct nlattr **tb = info->attrs;
+ bool mod = false;
+ u32 msg_mask;
+ int ret;
+
+ msg_mask = dev->ethtool_ops->get_msglevel(dev);
+ ret = ethnl_update_bitset32(&msg_mask, NETIF_MSG_CLASS_COUNT,
+ tb[ETHTOOL_A_DEBUG_MSGMASK],
+ netif_msg_class_names, info->extack, &mod);
+ if (ret < 0 || !mod)
+ return ret;
+
+ dev->ethtool_ops->set_msglevel(dev, msg_mask);
+ return 1;
+}
+
+const struct ethnl_request_ops ethnl_debug_request_ops = {
+ .request_cmd = ETHTOOL_MSG_DEBUG_GET,
+ .reply_cmd = ETHTOOL_MSG_DEBUG_GET_REPLY,
+ .hdr_attr = ETHTOOL_A_DEBUG_HEADER,
+ .req_info_size = sizeof(struct debug_req_info),
+ .reply_data_size = sizeof(struct debug_reply_data),
+
+ .prepare_data = debug_prepare_data,
+ .reply_size = debug_reply_size,
+ .fill_reply = debug_fill_reply,
+
+ .set_validate = ethnl_set_debug_validate,
+ .set = ethnl_set_debug,
+ .set_ntf_cmd = ETHTOOL_MSG_DEBUG_NTF,
+};
diff --git a/net/ethtool/eee.c b/net/ethtool/eee.c
new file mode 100644
index 000000000000..bf398973eb8a
--- /dev/null
+++ b/net/ethtool/eee.c
@@ -0,0 +1,172 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include "netlink.h"
+#include "common.h"
+#include "bitset.h"
+
+struct eee_req_info {
+ struct ethnl_req_info base;
+};
+
+struct eee_reply_data {
+ struct ethnl_reply_data base;
+ struct ethtool_keee eee;
+};
+
+#define EEE_REPDATA(__reply_base) \
+ container_of(__reply_base, struct eee_reply_data, base)
+
+const struct nla_policy ethnl_eee_get_policy[] = {
+ [ETHTOOL_A_EEE_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
+};
+
+static int eee_prepare_data(const struct ethnl_req_info *req_base,
+ struct ethnl_reply_data *reply_base,
+ const struct genl_info *info)
+{
+ struct eee_reply_data *data = EEE_REPDATA(reply_base);
+ struct net_device *dev = reply_base->dev;
+ struct ethtool_keee *eee = &data->eee;
+ int ret;
+
+ if (!dev->ethtool_ops->get_eee)
+ return -EOPNOTSUPP;
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ return ret;
+ ret = dev->ethtool_ops->get_eee(dev, eee);
+ ethnl_ops_complete(dev);
+
+ return ret;
+}
+
+static int eee_reply_size(const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;
+ const struct eee_reply_data *data = EEE_REPDATA(reply_base);
+ const struct ethtool_keee *eee = &data->eee;
+ int len = 0;
+ int ret;
+
+ /* MODES_OURS */
+ ret = ethnl_bitset_size(eee->advertised, eee->supported,
+ __ETHTOOL_LINK_MODE_MASK_NBITS,
+ link_mode_names, compact);
+ if (ret < 0)
+ return ret;
+ len += ret;
+ /* MODES_PEERS */
+ ret = ethnl_bitset_size(eee->lp_advertised, NULL,
+ __ETHTOOL_LINK_MODE_MASK_NBITS,
+ link_mode_names, compact);
+ if (ret < 0)
+ return ret;
+ len += ret;
+
+ len += nla_total_size(sizeof(u8)) + /* _EEE_ACTIVE */
+ nla_total_size(sizeof(u8)) + /* _EEE_ENABLED */
+ nla_total_size(sizeof(u8)) + /* _EEE_TX_LPI_ENABLED */
+ nla_total_size(sizeof(u32)); /* _EEE_TX_LPI_TIMER */
+
+ return len;
+}
+
+static int eee_fill_reply(struct sk_buff *skb,
+ const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;
+ const struct eee_reply_data *data = EEE_REPDATA(reply_base);
+ const struct ethtool_keee *eee = &data->eee;
+ int ret;
+
+ ret = ethnl_put_bitset(skb, ETHTOOL_A_EEE_MODES_OURS,
+ eee->advertised, eee->supported,
+ __ETHTOOL_LINK_MODE_MASK_NBITS,
+ link_mode_names, compact);
+ if (ret < 0)
+ return ret;
+ ret = ethnl_put_bitset(skb, ETHTOOL_A_EEE_MODES_PEER,
+ eee->lp_advertised, NULL,
+ __ETHTOOL_LINK_MODE_MASK_NBITS,
+ link_mode_names, compact);
+ if (ret < 0)
+ return ret;
+
+ if (nla_put_u8(skb, ETHTOOL_A_EEE_ACTIVE, eee->eee_active) ||
+ nla_put_u8(skb, ETHTOOL_A_EEE_ENABLED, eee->eee_enabled) ||
+ nla_put_u8(skb, ETHTOOL_A_EEE_TX_LPI_ENABLED,
+ eee->tx_lpi_enabled) ||
+ nla_put_u32(skb, ETHTOOL_A_EEE_TX_LPI_TIMER, eee->tx_lpi_timer))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+/* EEE_SET */
+
+const struct nla_policy ethnl_eee_set_policy[] = {
+ [ETHTOOL_A_EEE_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
+ [ETHTOOL_A_EEE_MODES_OURS] = { .type = NLA_NESTED },
+ [ETHTOOL_A_EEE_ENABLED] = { .type = NLA_U8 },
+ [ETHTOOL_A_EEE_TX_LPI_ENABLED] = { .type = NLA_U8 },
+ [ETHTOOL_A_EEE_TX_LPI_TIMER] = { .type = NLA_U32 },
+};
+
+static int
+ethnl_set_eee_validate(struct ethnl_req_info *req_info, struct genl_info *info)
+{
+ const struct ethtool_ops *ops = req_info->dev->ethtool_ops;
+
+ return ops->get_eee && ops->set_eee ? 1 : -EOPNOTSUPP;
+}
+
+static int
+ethnl_set_eee(struct ethnl_req_info *req_info, struct genl_info *info)
+{
+ struct net_device *dev = req_info->dev;
+ struct nlattr **tb = info->attrs;
+ struct ethtool_keee eee = {};
+ bool mod = false;
+ int ret;
+
+ ret = dev->ethtool_ops->get_eee(dev, &eee);
+ if (ret < 0)
+ return ret;
+
+ ret = ethnl_update_bitset(eee.advertised,
+ __ETHTOOL_LINK_MODE_MASK_NBITS,
+ tb[ETHTOOL_A_EEE_MODES_OURS],
+ link_mode_names, info->extack, &mod);
+ if (ret < 0)
+ return ret;
+ ethnl_update_bool(&eee.eee_enabled, tb[ETHTOOL_A_EEE_ENABLED], &mod);
+ ethnl_update_bool(&eee.tx_lpi_enabled, tb[ETHTOOL_A_EEE_TX_LPI_ENABLED],
+ &mod);
+ ethnl_update_u32(&eee.tx_lpi_timer, tb[ETHTOOL_A_EEE_TX_LPI_TIMER],
+ &mod);
+ if (!mod)
+ return 0;
+
+ ret = dev->ethtool_ops->set_eee(dev, &eee);
+ return ret < 0 ? ret : 1;
+}
+
+const struct ethnl_request_ops ethnl_eee_request_ops = {
+ .request_cmd = ETHTOOL_MSG_EEE_GET,
+ .reply_cmd = ETHTOOL_MSG_EEE_GET_REPLY,
+ .hdr_attr = ETHTOOL_A_EEE_HEADER,
+ .req_info_size = sizeof(struct eee_req_info),
+ .reply_data_size = sizeof(struct eee_reply_data),
+
+ .prepare_data = eee_prepare_data,
+ .reply_size = eee_reply_size,
+ .fill_reply = eee_fill_reply,
+
+ .set_validate = ethnl_set_eee_validate,
+ .set = ethnl_set_eee,
+ .set_ntf_cmd = ETHTOOL_MSG_EEE_NTF,
+};
diff --git a/net/ethtool/eeprom.c b/net/ethtool/eeprom.c
new file mode 100644
index 000000000000..3b8209e930fd
--- /dev/null
+++ b/net/ethtool/eeprom.c
@@ -0,0 +1,246 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/ethtool.h>
+#include <linux/sfp.h>
+#include "netlink.h"
+#include "common.h"
+
+struct eeprom_req_info {
+ struct ethnl_req_info base;
+ u32 offset;
+ u32 length;
+ u8 page;
+ u8 bank;
+ u8 i2c_address;
+};
+
+struct eeprom_reply_data {
+ struct ethnl_reply_data base;
+ u32 length;
+ u8 *data;
+};
+
+#define MODULE_EEPROM_REQINFO(__req_base) \
+ container_of(__req_base, struct eeprom_req_info, base)
+
+#define MODULE_EEPROM_REPDATA(__reply_base) \
+ container_of(__reply_base, struct eeprom_reply_data, base)
+
+static int fallback_set_params(struct eeprom_req_info *request,
+ struct ethtool_modinfo *modinfo,
+ struct ethtool_eeprom *eeprom)
+{
+ u32 offset = request->offset;
+ u32 length = request->length;
+
+ if (request->page)
+ offset = request->page * ETH_MODULE_EEPROM_PAGE_LEN + offset;
+
+ if (modinfo->type == ETH_MODULE_SFF_8472 &&
+ request->i2c_address == 0x51)
+ offset += ETH_MODULE_EEPROM_PAGE_LEN * 2;
+
+ if (offset >= modinfo->eeprom_len)
+ return -EINVAL;
+
+ eeprom->cmd = ETHTOOL_GMODULEEEPROM;
+ eeprom->len = length;
+ eeprom->offset = offset;
+
+ return 0;
+}
+
+static int eeprom_fallback(struct eeprom_req_info *request,
+ struct eeprom_reply_data *reply)
+{
+ struct net_device *dev = reply->base.dev;
+ struct ethtool_modinfo modinfo = {0};
+ struct ethtool_eeprom eeprom = {0};
+ u8 *data;
+ int err;
+
+ modinfo.cmd = ETHTOOL_GMODULEINFO;
+ err = ethtool_get_module_info_call(dev, &modinfo);
+ if (err < 0)
+ return err;
+
+ err = fallback_set_params(request, &modinfo, &eeprom);
+ if (err < 0)
+ return err;
+
+ data = kmalloc(eeprom.len, GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+ err = ethtool_get_module_eeprom_call(dev, &eeprom, data);
+ if (err < 0)
+ goto err_out;
+
+ reply->data = data;
+ reply->length = eeprom.len;
+
+ return 0;
+
+err_out:
+ kfree(data);
+ return err;
+}
+
+static int get_module_eeprom_by_page(struct net_device *dev,
+ struct ethtool_module_eeprom *page_data,
+ struct netlink_ext_ack *extack)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+
+ if (dev->ethtool->module_fw_flash_in_progress) {
+ NL_SET_ERR_MSG(extack,
+ "Module firmware flashing is in progress");
+ return -EBUSY;
+ }
+
+ if (dev->sfp_bus)
+ return sfp_get_module_eeprom_by_page(dev->sfp_bus, page_data, extack);
+
+ if (ops->get_module_eeprom_by_page)
+ return ops->get_module_eeprom_by_page(dev, page_data, extack);
+
+ return -EOPNOTSUPP;
+}
+
+static int eeprom_prepare_data(const struct ethnl_req_info *req_base,
+ struct ethnl_reply_data *reply_base,
+ const struct genl_info *info)
+{
+ struct eeprom_reply_data *reply = MODULE_EEPROM_REPDATA(reply_base);
+ struct eeprom_req_info *request = MODULE_EEPROM_REQINFO(req_base);
+ struct ethtool_module_eeprom page_data = {0};
+ struct net_device *dev = reply_base->dev;
+ int ret;
+
+ page_data.offset = request->offset;
+ page_data.length = request->length;
+ page_data.i2c_address = request->i2c_address;
+ page_data.page = request->page;
+ page_data.bank = request->bank;
+ page_data.data = kmalloc(page_data.length, GFP_KERNEL);
+ if (!page_data.data)
+ return -ENOMEM;
+
+ ret = ethnl_ops_begin(dev);
+ if (ret)
+ goto err_free;
+
+ ret = get_module_eeprom_by_page(dev, &page_data, info->extack);
+ if (ret < 0)
+ goto err_ops;
+
+ reply->length = ret;
+ reply->data = page_data.data;
+
+ ethnl_ops_complete(dev);
+ return 0;
+
+err_ops:
+ ethnl_ops_complete(dev);
+err_free:
+ kfree(page_data.data);
+
+ if (ret == -EOPNOTSUPP)
+ return eeprom_fallback(request, reply);
+ return ret;
+}
+
+static int eeprom_parse_request(struct ethnl_req_info *req_info, struct nlattr **tb,
+ struct netlink_ext_ack *extack)
+{
+ struct eeprom_req_info *request = MODULE_EEPROM_REQINFO(req_info);
+
+ if (!tb[ETHTOOL_A_MODULE_EEPROM_OFFSET] ||
+ !tb[ETHTOOL_A_MODULE_EEPROM_LENGTH] ||
+ !tb[ETHTOOL_A_MODULE_EEPROM_PAGE] ||
+ !tb[ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS])
+ return -EINVAL;
+
+ request->i2c_address = nla_get_u8(tb[ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS]);
+ request->offset = nla_get_u32(tb[ETHTOOL_A_MODULE_EEPROM_OFFSET]);
+ request->length = nla_get_u32(tb[ETHTOOL_A_MODULE_EEPROM_LENGTH]);
+
+ /* The following set of conditions limit the API to only dump 1/2
+ * EEPROM page without crossing low page boundary located at offset 128.
+ * This means user may only request dumps of length limited to 128 from
+ * either low 128 bytes or high 128 bytes.
+ * For pages higher than 0 only high 128 bytes are accessible.
+ */
+ request->page = nla_get_u8(tb[ETHTOOL_A_MODULE_EEPROM_PAGE]);
+ if (request->page && request->offset < ETH_MODULE_EEPROM_PAGE_LEN) {
+ NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_MODULE_EEPROM_PAGE],
+ "reading from lower half page is allowed for page 0 only");
+ return -EINVAL;
+ }
+
+ if (request->offset < ETH_MODULE_EEPROM_PAGE_LEN &&
+ request->offset + request->length > ETH_MODULE_EEPROM_PAGE_LEN) {
+ NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_MODULE_EEPROM_LENGTH],
+ "reading cross half page boundary is illegal");
+ return -EINVAL;
+ } else if (request->offset + request->length > ETH_MODULE_EEPROM_PAGE_LEN * 2) {
+ NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_MODULE_EEPROM_LENGTH],
+ "reading cross page boundary is illegal");
+ return -EINVAL;
+ }
+
+ if (tb[ETHTOOL_A_MODULE_EEPROM_BANK])
+ request->bank = nla_get_u8(tb[ETHTOOL_A_MODULE_EEPROM_BANK]);
+
+ return 0;
+}
+
+static int eeprom_reply_size(const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ const struct eeprom_req_info *request = MODULE_EEPROM_REQINFO(req_base);
+
+ return nla_total_size(sizeof(u8) * request->length); /* _EEPROM_DATA */
+}
+
+static int eeprom_fill_reply(struct sk_buff *skb,
+ const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ struct eeprom_reply_data *reply = MODULE_EEPROM_REPDATA(reply_base);
+
+ return nla_put(skb, ETHTOOL_A_MODULE_EEPROM_DATA, reply->length, reply->data);
+}
+
+static void eeprom_cleanup_data(struct ethnl_reply_data *reply_base)
+{
+ struct eeprom_reply_data *reply = MODULE_EEPROM_REPDATA(reply_base);
+
+ kfree(reply->data);
+}
+
+const struct ethnl_request_ops ethnl_module_eeprom_request_ops = {
+ .request_cmd = ETHTOOL_MSG_MODULE_EEPROM_GET,
+ .reply_cmd = ETHTOOL_MSG_MODULE_EEPROM_GET_REPLY,
+ .hdr_attr = ETHTOOL_A_MODULE_EEPROM_HEADER,
+ .req_info_size = sizeof(struct eeprom_req_info),
+ .reply_data_size = sizeof(struct eeprom_reply_data),
+
+ .parse_request = eeprom_parse_request,
+ .prepare_data = eeprom_prepare_data,
+ .reply_size = eeprom_reply_size,
+ .fill_reply = eeprom_fill_reply,
+ .cleanup_data = eeprom_cleanup_data,
+};
+
+const struct nla_policy ethnl_module_eeprom_get_policy[] = {
+ [ETHTOOL_A_MODULE_EEPROM_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy),
+ [ETHTOOL_A_MODULE_EEPROM_OFFSET] =
+ NLA_POLICY_MAX(NLA_U32, ETH_MODULE_EEPROM_PAGE_LEN * 2 - 1),
+ [ETHTOOL_A_MODULE_EEPROM_LENGTH] =
+ NLA_POLICY_RANGE(NLA_U32, 1, ETH_MODULE_EEPROM_PAGE_LEN),
+ [ETHTOOL_A_MODULE_EEPROM_PAGE] = { .type = NLA_U8 },
+ [ETHTOOL_A_MODULE_EEPROM_BANK] = { .type = NLA_U8 },
+ [ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS] =
+ NLA_POLICY_RANGE(NLA_U8, 0, ETH_MODULE_MAX_I2C_ADDRESS),
+};
+
diff --git a/net/ethtool/features.c b/net/ethtool/features.c
new file mode 100644
index 000000000000..f2217983be2b
--- /dev/null
+++ b/net/ethtool/features.c
@@ -0,0 +1,297 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <net/netdev_lock.h>
+
+#include "netlink.h"
+#include "common.h"
+#include "bitset.h"
+
+struct features_req_info {
+ struct ethnl_req_info base;
+};
+
+struct features_reply_data {
+ struct ethnl_reply_data base;
+ u32 hw[ETHTOOL_DEV_FEATURE_WORDS];
+ u32 wanted[ETHTOOL_DEV_FEATURE_WORDS];
+ u32 active[ETHTOOL_DEV_FEATURE_WORDS];
+ u32 nochange[ETHTOOL_DEV_FEATURE_WORDS];
+ u32 all[ETHTOOL_DEV_FEATURE_WORDS];
+};
+
+#define FEATURES_REPDATA(__reply_base) \
+ container_of(__reply_base, struct features_reply_data, base)
+
+const struct nla_policy ethnl_features_get_policy[] = {
+ [ETHTOOL_A_FEATURES_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
+};
+
+static void ethnl_features_to_bitmap32(u32 *dest, netdev_features_t src)
+{
+ unsigned int i;
+
+ for (i = 0; i < ETHTOOL_DEV_FEATURE_WORDS; i++)
+ dest[i] = src >> (32 * i);
+}
+
+static int features_prepare_data(const struct ethnl_req_info *req_base,
+ struct ethnl_reply_data *reply_base,
+ const struct genl_info *info)
+{
+ struct features_reply_data *data = FEATURES_REPDATA(reply_base);
+ struct net_device *dev = reply_base->dev;
+ netdev_features_t all_features;
+
+ ethnl_features_to_bitmap32(data->hw, dev->hw_features);
+ ethnl_features_to_bitmap32(data->wanted, dev->wanted_features);
+ ethnl_features_to_bitmap32(data->active, dev->features);
+ ethnl_features_to_bitmap32(data->nochange, NETIF_F_NEVER_CHANGE);
+ all_features = GENMASK_ULL(NETDEV_FEATURE_COUNT - 1, 0);
+ ethnl_features_to_bitmap32(data->all, all_features);
+
+ return 0;
+}
+
+static int features_reply_size(const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ const struct features_reply_data *data = FEATURES_REPDATA(reply_base);
+ bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;
+ unsigned int len = 0;
+ int ret;
+
+ ret = ethnl_bitset32_size(data->hw, data->all, NETDEV_FEATURE_COUNT,
+ netdev_features_strings, compact);
+ if (ret < 0)
+ return ret;
+ len += ret;
+ ret = ethnl_bitset32_size(data->wanted, NULL, NETDEV_FEATURE_COUNT,
+ netdev_features_strings, compact);
+ if (ret < 0)
+ return ret;
+ len += ret;
+ ret = ethnl_bitset32_size(data->active, NULL, NETDEV_FEATURE_COUNT,
+ netdev_features_strings, compact);
+ if (ret < 0)
+ return ret;
+ len += ret;
+ ret = ethnl_bitset32_size(data->nochange, NULL, NETDEV_FEATURE_COUNT,
+ netdev_features_strings, compact);
+ if (ret < 0)
+ return ret;
+ len += ret;
+
+ return len;
+}
+
+static int features_fill_reply(struct sk_buff *skb,
+ const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ const struct features_reply_data *data = FEATURES_REPDATA(reply_base);
+ bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;
+ int ret;
+
+ ret = ethnl_put_bitset32(skb, ETHTOOL_A_FEATURES_HW, data->hw,
+ data->all, NETDEV_FEATURE_COUNT,
+ netdev_features_strings, compact);
+ if (ret < 0)
+ return ret;
+ ret = ethnl_put_bitset32(skb, ETHTOOL_A_FEATURES_WANTED, data->wanted,
+ NULL, NETDEV_FEATURE_COUNT,
+ netdev_features_strings, compact);
+ if (ret < 0)
+ return ret;
+ ret = ethnl_put_bitset32(skb, ETHTOOL_A_FEATURES_ACTIVE, data->active,
+ NULL, NETDEV_FEATURE_COUNT,
+ netdev_features_strings, compact);
+ if (ret < 0)
+ return ret;
+ return ethnl_put_bitset32(skb, ETHTOOL_A_FEATURES_NOCHANGE,
+ data->nochange, NULL, NETDEV_FEATURE_COUNT,
+ netdev_features_strings, compact);
+}
+
+const struct ethnl_request_ops ethnl_features_request_ops = {
+ .request_cmd = ETHTOOL_MSG_FEATURES_GET,
+ .reply_cmd = ETHTOOL_MSG_FEATURES_GET_REPLY,
+ .hdr_attr = ETHTOOL_A_FEATURES_HEADER,
+ .req_info_size = sizeof(struct features_req_info),
+ .reply_data_size = sizeof(struct features_reply_data),
+
+ .prepare_data = features_prepare_data,
+ .reply_size = features_reply_size,
+ .fill_reply = features_fill_reply,
+};
+
+/* FEATURES_SET */
+
+const struct nla_policy ethnl_features_set_policy[] = {
+ [ETHTOOL_A_FEATURES_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
+ [ETHTOOL_A_FEATURES_WANTED] = { .type = NLA_NESTED },
+};
+
+static void ethnl_features_to_bitmap(unsigned long *dest, netdev_features_t val)
+{
+ const unsigned int words = BITS_TO_LONGS(NETDEV_FEATURE_COUNT);
+ unsigned int i;
+
+ for (i = 0; i < words; i++)
+ dest[i] = (unsigned long)(val >> (i * BITS_PER_LONG));
+}
+
+static netdev_features_t ethnl_bitmap_to_features(unsigned long *src)
+{
+ const unsigned int nft_bits = sizeof(netdev_features_t) * BITS_PER_BYTE;
+ const unsigned int words = BITS_TO_LONGS(NETDEV_FEATURE_COUNT);
+ netdev_features_t ret = 0;
+ unsigned int i;
+
+ for (i = 0; i < words; i++)
+ ret |= (netdev_features_t)(src[i]) << (i * BITS_PER_LONG);
+ ret &= ~(netdev_features_t)0 >> (nft_bits - NETDEV_FEATURE_COUNT);
+ return ret;
+}
+
+static int features_send_reply(struct net_device *dev, struct genl_info *info,
+ const unsigned long *wanted,
+ const unsigned long *wanted_mask,
+ const unsigned long *active,
+ const unsigned long *active_mask, bool compact)
+{
+ struct sk_buff *rskb;
+ void *reply_payload;
+ int reply_len = 0;
+ int ret;
+
+ reply_len = ethnl_reply_header_size();
+ ret = ethnl_bitset_size(wanted, wanted_mask, NETDEV_FEATURE_COUNT,
+ netdev_features_strings, compact);
+ if (ret < 0)
+ goto err;
+ reply_len += ret;
+ ret = ethnl_bitset_size(active, active_mask, NETDEV_FEATURE_COUNT,
+ netdev_features_strings, compact);
+ if (ret < 0)
+ goto err;
+ reply_len += ret;
+
+ ret = -ENOMEM;
+ rskb = ethnl_reply_init(reply_len, dev, ETHTOOL_MSG_FEATURES_SET_REPLY,
+ ETHTOOL_A_FEATURES_HEADER, info,
+ &reply_payload);
+ if (!rskb)
+ goto err;
+
+ ret = ethnl_put_bitset(rskb, ETHTOOL_A_FEATURES_WANTED, wanted,
+ wanted_mask, NETDEV_FEATURE_COUNT,
+ netdev_features_strings, compact);
+ if (ret < 0)
+ goto nla_put_failure;
+ ret = ethnl_put_bitset(rskb, ETHTOOL_A_FEATURES_ACTIVE, active,
+ active_mask, NETDEV_FEATURE_COUNT,
+ netdev_features_strings, compact);
+ if (ret < 0)
+ goto nla_put_failure;
+
+ genlmsg_end(rskb, reply_payload);
+ ret = genlmsg_reply(rskb, info);
+ return ret;
+
+nla_put_failure:
+ nlmsg_free(rskb);
+ WARN_ONCE(1, "calculated message payload length (%d) not sufficient\n",
+ reply_len);
+err:
+ GENL_SET_ERR_MSG(info, "failed to send reply message");
+ return ret;
+}
+
+int ethnl_set_features(struct sk_buff *skb, struct genl_info *info)
+{
+ DECLARE_BITMAP(wanted_diff_mask, NETDEV_FEATURE_COUNT);
+ DECLARE_BITMAP(active_diff_mask, NETDEV_FEATURE_COUNT);
+ DECLARE_BITMAP(old_active, NETDEV_FEATURE_COUNT);
+ DECLARE_BITMAP(old_wanted, NETDEV_FEATURE_COUNT);
+ DECLARE_BITMAP(new_active, NETDEV_FEATURE_COUNT);
+ DECLARE_BITMAP(new_wanted, NETDEV_FEATURE_COUNT);
+ DECLARE_BITMAP(req_wanted, NETDEV_FEATURE_COUNT);
+ DECLARE_BITMAP(req_mask, NETDEV_FEATURE_COUNT);
+ struct ethnl_req_info req_info = {};
+ struct nlattr **tb = info->attrs;
+ struct net_device *dev;
+ bool mod;
+ int ret;
+
+ if (!tb[ETHTOOL_A_FEATURES_WANTED])
+ return -EINVAL;
+ ret = ethnl_parse_header_dev_get(&req_info,
+ tb[ETHTOOL_A_FEATURES_HEADER],
+ genl_info_net(info), info->extack,
+ true);
+ if (ret < 0)
+ return ret;
+ dev = req_info.dev;
+
+ rtnl_lock();
+ netdev_lock_ops(dev);
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ goto out_unlock;
+ ethnl_features_to_bitmap(old_active, dev->features);
+ ethnl_features_to_bitmap(old_wanted, dev->wanted_features);
+ ret = ethnl_parse_bitset(req_wanted, req_mask, NETDEV_FEATURE_COUNT,
+ tb[ETHTOOL_A_FEATURES_WANTED],
+ netdev_features_strings, info->extack);
+ if (ret < 0)
+ goto out_ops;
+ if (ethnl_bitmap_to_features(req_mask) & ~NETIF_F_ETHTOOL_BITS) {
+ GENL_SET_ERR_MSG(info, "attempt to change non-ethtool features");
+ ret = -EINVAL;
+ goto out_ops;
+ }
+
+ /* set req_wanted bits not in req_mask from old_wanted */
+ bitmap_and(req_wanted, req_wanted, req_mask, NETDEV_FEATURE_COUNT);
+ bitmap_andnot(new_wanted, old_wanted, req_mask, NETDEV_FEATURE_COUNT);
+ bitmap_or(req_wanted, new_wanted, req_wanted, NETDEV_FEATURE_COUNT);
+ if (!bitmap_equal(req_wanted, old_wanted, NETDEV_FEATURE_COUNT)) {
+ dev->wanted_features &= ~dev->hw_features;
+ dev->wanted_features |= ethnl_bitmap_to_features(req_wanted) & dev->hw_features;
+ __netdev_update_features(dev);
+ }
+ ethnl_features_to_bitmap(new_active, dev->features);
+ mod = !bitmap_equal(old_active, new_active, NETDEV_FEATURE_COUNT);
+
+ ret = 0;
+ if (!(req_info.flags & ETHTOOL_FLAG_OMIT_REPLY)) {
+ bool compact = req_info.flags & ETHTOOL_FLAG_COMPACT_BITSETS;
+
+ bitmap_xor(wanted_diff_mask, req_wanted, new_active,
+ NETDEV_FEATURE_COUNT);
+ bitmap_xor(active_diff_mask, old_active, new_active,
+ NETDEV_FEATURE_COUNT);
+ bitmap_and(wanted_diff_mask, wanted_diff_mask, req_mask,
+ NETDEV_FEATURE_COUNT);
+ bitmap_and(req_wanted, req_wanted, wanted_diff_mask,
+ NETDEV_FEATURE_COUNT);
+ bitmap_and(new_active, new_active, active_diff_mask,
+ NETDEV_FEATURE_COUNT);
+
+ ret = features_send_reply(dev, info, req_wanted,
+ wanted_diff_mask, new_active,
+ active_diff_mask, compact);
+ }
+ if (mod)
+ netdev_features_change(dev);
+
+out_ops:
+ ethnl_ops_complete(dev);
+out_unlock:
+ netdev_unlock_ops(dev);
+ rtnl_unlock();
+ ethnl_parse_header_dev_put(&req_info);
+ return ret;
+}
diff --git a/net/ethtool/fec.c b/net/ethtool/fec.c
new file mode 100644
index 000000000000..4669e74cbcaa
--- /dev/null
+++ b/net/ethtool/fec.c
@@ -0,0 +1,364 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include "netlink.h"
+#include "common.h"
+#include "bitset.h"
+
+struct fec_req_info {
+ struct ethnl_req_info base;
+};
+
+struct fec_reply_data {
+ struct ethnl_reply_data base;
+ __ETHTOOL_DECLARE_LINK_MODE_MASK(fec_link_modes);
+ u32 active_fec;
+ u8 fec_auto;
+ struct fec_stat_grp {
+ u64 stats[1 + ETHTOOL_MAX_LANES];
+ u8 cnt;
+ } corr, uncorr, corr_bits;
+ struct ethtool_fec_hist fec_stat_hist;
+};
+
+#define FEC_REPDATA(__reply_base) \
+ container_of(__reply_base, struct fec_reply_data, base)
+
+#define ETHTOOL_FEC_MASK ((ETHTOOL_FEC_LLRS << 1) - 1)
+
+const struct nla_policy ethnl_fec_get_policy[ETHTOOL_A_FEC_HEADER + 1] = {
+ [ETHTOOL_A_FEC_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy_stats),
+};
+
+static void
+ethtool_fec_to_link_modes(u32 fec, unsigned long *link_modes, u8 *fec_auto)
+{
+ if (fec_auto)
+ *fec_auto = !!(fec & ETHTOOL_FEC_AUTO);
+
+ if (fec & ETHTOOL_FEC_OFF)
+ __set_bit(ETHTOOL_LINK_MODE_FEC_NONE_BIT, link_modes);
+ if (fec & ETHTOOL_FEC_RS)
+ __set_bit(ETHTOOL_LINK_MODE_FEC_RS_BIT, link_modes);
+ if (fec & ETHTOOL_FEC_BASER)
+ __set_bit(ETHTOOL_LINK_MODE_FEC_BASER_BIT, link_modes);
+ if (fec & ETHTOOL_FEC_LLRS)
+ __set_bit(ETHTOOL_LINK_MODE_FEC_LLRS_BIT, link_modes);
+}
+
+static int
+ethtool_link_modes_to_fecparam(struct ethtool_fecparam *fec,
+ unsigned long *link_modes, u8 fec_auto)
+{
+ memset(fec, 0, sizeof(*fec));
+
+ if (fec_auto)
+ fec->fec |= ETHTOOL_FEC_AUTO;
+
+ if (__test_and_clear_bit(ETHTOOL_LINK_MODE_FEC_NONE_BIT, link_modes))
+ fec->fec |= ETHTOOL_FEC_OFF;
+ if (__test_and_clear_bit(ETHTOOL_LINK_MODE_FEC_RS_BIT, link_modes))
+ fec->fec |= ETHTOOL_FEC_RS;
+ if (__test_and_clear_bit(ETHTOOL_LINK_MODE_FEC_BASER_BIT, link_modes))
+ fec->fec |= ETHTOOL_FEC_BASER;
+ if (__test_and_clear_bit(ETHTOOL_LINK_MODE_FEC_LLRS_BIT, link_modes))
+ fec->fec |= ETHTOOL_FEC_LLRS;
+
+ if (!bitmap_empty(link_modes, __ETHTOOL_LINK_MODE_MASK_NBITS))
+ return -EINVAL;
+
+ return 0;
+}
+
+static void
+fec_stats_recalc(struct fec_stat_grp *grp, struct ethtool_fec_stat *stats)
+{
+ int i;
+
+ if (stats->lanes[0] == ETHTOOL_STAT_NOT_SET) {
+ grp->stats[0] = stats->total;
+ grp->cnt = stats->total != ETHTOOL_STAT_NOT_SET;
+ return;
+ }
+
+ grp->cnt = 1;
+ grp->stats[0] = 0;
+ for (i = 0; i < ETHTOOL_MAX_LANES; i++) {
+ if (stats->lanes[i] == ETHTOOL_STAT_NOT_SET)
+ break;
+
+ grp->stats[0] += stats->lanes[i];
+ grp->stats[grp->cnt++] = stats->lanes[i];
+ }
+}
+
+static int fec_prepare_data(const struct ethnl_req_info *req_base,
+ struct ethnl_reply_data *reply_base,
+ const struct genl_info *info)
+{
+ __ETHTOOL_DECLARE_LINK_MODE_MASK(active_fec_modes) = {};
+ struct fec_reply_data *data = FEC_REPDATA(reply_base);
+ struct net_device *dev = reply_base->dev;
+ struct ethtool_fecparam fec = {};
+ int ret;
+
+ if (!dev->ethtool_ops->get_fecparam)
+ return -EOPNOTSUPP;
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ return ret;
+ ret = dev->ethtool_ops->get_fecparam(dev, &fec);
+ if (ret)
+ goto out_complete;
+ if (req_base->flags & ETHTOOL_FLAG_STATS &&
+ dev->ethtool_ops->get_fec_stats) {
+ struct ethtool_fec_stats stats;
+
+ ethtool_stats_init((u64 *)&stats, sizeof(stats) / 8);
+ ethtool_stats_init((u64 *)data->fec_stat_hist.values,
+ sizeof(data->fec_stat_hist.values) / 8);
+ dev->ethtool_ops->get_fec_stats(dev, &stats,
+ &data->fec_stat_hist);
+
+ fec_stats_recalc(&data->corr, &stats.corrected_blocks);
+ fec_stats_recalc(&data->uncorr, &stats.uncorrectable_blocks);
+ fec_stats_recalc(&data->corr_bits, &stats.corrected_bits);
+ }
+
+ WARN_ON_ONCE(fec.reserved);
+
+ ethtool_fec_to_link_modes(fec.fec, data->fec_link_modes,
+ &data->fec_auto);
+
+ ethtool_fec_to_link_modes(fec.active_fec, active_fec_modes, NULL);
+ data->active_fec = find_first_bit(active_fec_modes,
+ __ETHTOOL_LINK_MODE_MASK_NBITS);
+ /* Don't report attr if no FEC mode set. Note that
+ * ethtool_fecparam_to_link_modes() ignores NONE and AUTO.
+ */
+ if (data->active_fec == __ETHTOOL_LINK_MODE_MASK_NBITS)
+ data->active_fec = 0;
+
+out_complete:
+ ethnl_ops_complete(dev);
+ return ret;
+}
+
+static int fec_reply_size(const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;
+ const struct fec_reply_data *data = FEC_REPDATA(reply_base);
+ int len = 0;
+ int ret;
+
+ ret = ethnl_bitset_size(data->fec_link_modes, NULL,
+ __ETHTOOL_LINK_MODE_MASK_NBITS,
+ link_mode_names, compact);
+ if (ret < 0)
+ return ret;
+ len += ret;
+
+ len += nla_total_size(sizeof(u8)) + /* _FEC_AUTO */
+ nla_total_size(sizeof(u32)); /* _FEC_ACTIVE */
+
+ if (req_base->flags & ETHTOOL_FLAG_STATS) {
+ len += 3 * nla_total_size_64bit(sizeof(u64) *
+ (1 + ETHTOOL_MAX_LANES));
+ /* add FEC bins information */
+ len += (nla_total_size(0) + /* _A_FEC_HIST */
+ nla_total_size(4) + /* _A_FEC_HIST_BIN_LOW */
+ nla_total_size(4) + /* _A_FEC_HIST_BIN_HI */
+ /* _A_FEC_HIST_BIN_VAL + per-lane values */
+ nla_total_size_64bit(sizeof(u64)) +
+ nla_total_size_64bit(sizeof(u64) * ETHTOOL_MAX_LANES)) *
+ ETHTOOL_FEC_HIST_MAX;
+ }
+
+ return len;
+}
+
+static int fec_put_hist(struct sk_buff *skb,
+ const struct ethtool_fec_hist *hist)
+{
+ const struct ethtool_fec_hist_range *ranges = hist->ranges;
+ const struct ethtool_fec_hist_value *values = hist->values;
+ struct nlattr *nest;
+ int i, j;
+ u64 sum;
+
+ if (!ranges)
+ return 0;
+
+ for (i = 0; i < ETHTOOL_FEC_HIST_MAX; i++) {
+ if (i && !ranges[i].low && !ranges[i].high)
+ break;
+
+ if (WARN_ON_ONCE(values[i].sum == ETHTOOL_STAT_NOT_SET &&
+ values[i].per_lane[0] == ETHTOOL_STAT_NOT_SET))
+ break;
+
+ nest = nla_nest_start(skb, ETHTOOL_A_FEC_STAT_HIST);
+ if (!nest)
+ return -EMSGSIZE;
+
+ if (nla_put_u32(skb, ETHTOOL_A_FEC_HIST_BIN_LOW,
+ ranges[i].low) ||
+ nla_put_u32(skb, ETHTOOL_A_FEC_HIST_BIN_HIGH,
+ ranges[i].high))
+ goto err_cancel_hist;
+ sum = 0;
+ for (j = 0; j < ETHTOOL_MAX_LANES; j++) {
+ if (values[i].per_lane[j] == ETHTOOL_STAT_NOT_SET)
+ break;
+ sum += values[i].per_lane[j];
+ }
+ if (nla_put_uint(skb, ETHTOOL_A_FEC_HIST_BIN_VAL,
+ values[i].sum == ETHTOOL_STAT_NOT_SET ?
+ sum : values[i].sum))
+ goto err_cancel_hist;
+ if (j && nla_put_64bit(skb, ETHTOOL_A_FEC_HIST_BIN_VAL_PER_LANE,
+ sizeof(u64) * j,
+ values[i].per_lane,
+ ETHTOOL_A_FEC_HIST_PAD))
+ goto err_cancel_hist;
+
+ nla_nest_end(skb, nest);
+ }
+
+ return 0;
+
+err_cancel_hist:
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+}
+
+static int fec_put_stats(struct sk_buff *skb, const struct fec_reply_data *data)
+{
+ struct nlattr *nest;
+
+ nest = nla_nest_start(skb, ETHTOOL_A_FEC_STATS);
+ if (!nest)
+ return -EMSGSIZE;
+
+ if (nla_put_64bit(skb, ETHTOOL_A_FEC_STAT_CORRECTED,
+ sizeof(u64) * data->corr.cnt,
+ data->corr.stats, ETHTOOL_A_FEC_STAT_PAD) ||
+ nla_put_64bit(skb, ETHTOOL_A_FEC_STAT_UNCORR,
+ sizeof(u64) * data->uncorr.cnt,
+ data->uncorr.stats, ETHTOOL_A_FEC_STAT_PAD) ||
+ nla_put_64bit(skb, ETHTOOL_A_FEC_STAT_CORR_BITS,
+ sizeof(u64) * data->corr_bits.cnt,
+ data->corr_bits.stats, ETHTOOL_A_FEC_STAT_PAD))
+ goto err_cancel;
+
+ if (fec_put_hist(skb, &data->fec_stat_hist))
+ goto err_cancel;
+
+ nla_nest_end(skb, nest);
+ return 0;
+
+err_cancel:
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+}
+
+static int fec_fill_reply(struct sk_buff *skb,
+ const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;
+ const struct fec_reply_data *data = FEC_REPDATA(reply_base);
+ int ret;
+
+ ret = ethnl_put_bitset(skb, ETHTOOL_A_FEC_MODES,
+ data->fec_link_modes, NULL,
+ __ETHTOOL_LINK_MODE_MASK_NBITS,
+ link_mode_names, compact);
+ if (ret < 0)
+ return ret;
+
+ if (nla_put_u8(skb, ETHTOOL_A_FEC_AUTO, data->fec_auto) ||
+ (data->active_fec &&
+ nla_put_u32(skb, ETHTOOL_A_FEC_ACTIVE, data->active_fec)))
+ return -EMSGSIZE;
+
+ if (req_base->flags & ETHTOOL_FLAG_STATS && fec_put_stats(skb, data))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+/* FEC_SET */
+
+const struct nla_policy ethnl_fec_set_policy[ETHTOOL_A_FEC_AUTO + 1] = {
+ [ETHTOOL_A_FEC_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy),
+ [ETHTOOL_A_FEC_MODES] = { .type = NLA_NESTED },
+ [ETHTOOL_A_FEC_AUTO] = NLA_POLICY_MAX(NLA_U8, 1),
+};
+
+static int
+ethnl_set_fec_validate(struct ethnl_req_info *req_info, struct genl_info *info)
+{
+ const struct ethtool_ops *ops = req_info->dev->ethtool_ops;
+
+ return ops->get_fecparam && ops->set_fecparam ? 1 : -EOPNOTSUPP;
+}
+
+static int
+ethnl_set_fec(struct ethnl_req_info *req_info, struct genl_info *info)
+{
+ __ETHTOOL_DECLARE_LINK_MODE_MASK(fec_link_modes) = {};
+ struct net_device *dev = req_info->dev;
+ struct nlattr **tb = info->attrs;
+ struct ethtool_fecparam fec = {};
+ bool mod = false;
+ u8 fec_auto;
+ int ret;
+
+ ret = dev->ethtool_ops->get_fecparam(dev, &fec);
+ if (ret < 0)
+ return ret;
+
+ ethtool_fec_to_link_modes(fec.fec, fec_link_modes, &fec_auto);
+
+ ret = ethnl_update_bitset(fec_link_modes,
+ __ETHTOOL_LINK_MODE_MASK_NBITS,
+ tb[ETHTOOL_A_FEC_MODES],
+ link_mode_names, info->extack, &mod);
+ if (ret < 0)
+ return ret;
+ ethnl_update_u8(&fec_auto, tb[ETHTOOL_A_FEC_AUTO], &mod);
+ if (!mod)
+ return 0;
+
+ ret = ethtool_link_modes_to_fecparam(&fec, fec_link_modes, fec_auto);
+ if (ret) {
+ NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_FEC_MODES],
+ "invalid FEC modes requested");
+ return ret;
+ }
+ if (!fec.fec) {
+ NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_FEC_MODES],
+ "no FEC modes set");
+ return -EINVAL;
+ }
+
+ ret = dev->ethtool_ops->set_fecparam(dev, &fec);
+ return ret < 0 ? ret : 1;
+}
+
+const struct ethnl_request_ops ethnl_fec_request_ops = {
+ .request_cmd = ETHTOOL_MSG_FEC_GET,
+ .reply_cmd = ETHTOOL_MSG_FEC_GET_REPLY,
+ .hdr_attr = ETHTOOL_A_FEC_HEADER,
+ .req_info_size = sizeof(struct fec_req_info),
+ .reply_data_size = sizeof(struct fec_reply_data),
+
+ .prepare_data = fec_prepare_data,
+ .reply_size = fec_reply_size,
+ .fill_reply = fec_fill_reply,
+
+ .set_validate = ethnl_set_fec_validate,
+ .set = ethnl_set_fec,
+ .set_ntf_cmd = ETHTOOL_MSG_FEC_NTF,
+};
diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c
new file mode 100644
index 000000000000..fa83ddade4f8
--- /dev/null
+++ b/net/ethtool/ioctl.c
@@ -0,0 +1,3853 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * net/core/ethtool.c - Ethtool ioctl handler
+ * Copyright (c) 2003 Matthew Wilcox <matthew@wil.cx>
+ *
+ * This file is where we call all the ethtool_ops commands to get
+ * the information ethtool needs.
+ */
+
+#include <linux/compat.h>
+#include <linux/etherdevice.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/ethtool.h>
+#include <linux/netdevice.h>
+#include <linux/net_tstamp.h>
+#include <linux/phy.h>
+#include <linux/bitops.h>
+#include <linux/uaccess.h>
+#include <linux/vmalloc.h>
+#include <linux/sfp.h>
+#include <linux/slab.h>
+#include <linux/rtnetlink.h>
+#include <linux/sched/signal.h>
+#include <linux/net.h>
+#include <linux/pm_runtime.h>
+#include <linux/utsname.h>
+#include <net/devlink.h>
+#include <net/ipv6.h>
+#include <net/xdp_sock_drv.h>
+#include <net/flow_offload.h>
+#include <net/netdev_lock.h>
+#include <linux/ethtool_netlink.h>
+#include "common.h"
+
+/* State held across locks and calls for commands which have devlink fallback */
+struct ethtool_devlink_compat {
+ struct devlink *devlink;
+ union {
+ struct ethtool_flash efl;
+ struct ethtool_drvinfo info;
+ };
+};
+
+static struct devlink *netdev_to_devlink_get(struct net_device *dev)
+{
+ if (!dev->devlink_port)
+ return NULL;
+ return devlink_try_get(dev->devlink_port->devlink);
+}
+
+/*
+ * Some useful ethtool_ops methods that're device independent.
+ * If we find that all drivers want to do the same thing here,
+ * we can turn these into dev_() function calls.
+ */
+
+u32 ethtool_op_get_link(struct net_device *dev)
+{
+ /* Synchronize carrier state with link watch, see also rtnl_getlink() */
+ __linkwatch_sync_dev(dev);
+
+ return netif_carrier_ok(dev) ? 1 : 0;
+}
+EXPORT_SYMBOL(ethtool_op_get_link);
+
+int ethtool_op_get_ts_info(struct net_device *dev,
+ struct kernel_ethtool_ts_info *info)
+{
+ info->so_timestamping =
+ SOF_TIMESTAMPING_TX_SOFTWARE |
+ SOF_TIMESTAMPING_RX_SOFTWARE |
+ SOF_TIMESTAMPING_SOFTWARE;
+ info->phc_index = -1;
+ return 0;
+}
+EXPORT_SYMBOL(ethtool_op_get_ts_info);
+
+/* Handlers for each ethtool command */
+
+static int ethtool_get_features(struct net_device *dev, void __user *useraddr)
+{
+ struct ethtool_gfeatures cmd = {
+ .cmd = ETHTOOL_GFEATURES,
+ .size = ETHTOOL_DEV_FEATURE_WORDS,
+ };
+ struct ethtool_get_features_block features[ETHTOOL_DEV_FEATURE_WORDS];
+ u32 __user *sizeaddr;
+ u32 copy_size;
+ int i;
+
+ /* in case feature bits run out again */
+ BUILD_BUG_ON(ETHTOOL_DEV_FEATURE_WORDS * sizeof(u32) > sizeof(netdev_features_t));
+
+ for (i = 0; i < ETHTOOL_DEV_FEATURE_WORDS; ++i) {
+ features[i].available = (u32)(dev->hw_features >> (32 * i));
+ features[i].requested = (u32)(dev->wanted_features >> (32 * i));
+ features[i].active = (u32)(dev->features >> (32 * i));
+ features[i].never_changed =
+ (u32)(NETIF_F_NEVER_CHANGE >> (32 * i));
+ }
+
+ sizeaddr = useraddr + offsetof(struct ethtool_gfeatures, size);
+ if (get_user(copy_size, sizeaddr))
+ return -EFAULT;
+
+ if (copy_size > ETHTOOL_DEV_FEATURE_WORDS)
+ copy_size = ETHTOOL_DEV_FEATURE_WORDS;
+
+ if (copy_to_user(useraddr, &cmd, sizeof(cmd)))
+ return -EFAULT;
+ useraddr += sizeof(cmd);
+ if (copy_to_user(useraddr, features,
+ array_size(copy_size, sizeof(*features))))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int ethtool_set_features(struct net_device *dev, void __user *useraddr)
+{
+ struct ethtool_sfeatures cmd;
+ struct ethtool_set_features_block features[ETHTOOL_DEV_FEATURE_WORDS];
+ netdev_features_t wanted = 0, valid = 0;
+ int i, ret = 0;
+
+ if (copy_from_user(&cmd, useraddr, sizeof(cmd)))
+ return -EFAULT;
+ useraddr += sizeof(cmd);
+
+ if (cmd.size != ETHTOOL_DEV_FEATURE_WORDS)
+ return -EINVAL;
+
+ if (copy_from_user(features, useraddr, sizeof(features)))
+ return -EFAULT;
+
+ for (i = 0; i < ETHTOOL_DEV_FEATURE_WORDS; ++i) {
+ valid |= (netdev_features_t)features[i].valid << (32 * i);
+ wanted |= (netdev_features_t)features[i].requested << (32 * i);
+ }
+
+ if (valid & ~NETIF_F_ETHTOOL_BITS)
+ return -EINVAL;
+
+ if (valid & ~dev->hw_features) {
+ valid &= dev->hw_features;
+ ret |= ETHTOOL_F_UNSUPPORTED;
+ }
+
+ dev->wanted_features &= ~valid;
+ dev->wanted_features |= wanted & valid;
+ __netdev_update_features(dev);
+
+ if ((dev->wanted_features ^ dev->features) & valid)
+ ret |= ETHTOOL_F_WISH;
+
+ return ret;
+}
+
+static int __ethtool_get_sset_count(struct net_device *dev, int sset)
+{
+ const struct ethtool_phy_ops *phy_ops = ethtool_phy_ops;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+
+ if (sset == ETH_SS_FEATURES)
+ return ARRAY_SIZE(netdev_features_strings);
+
+ if (sset == ETH_SS_RSS_HASH_FUNCS)
+ return ARRAY_SIZE(rss_hash_func_strings);
+
+ if (sset == ETH_SS_TUNABLES)
+ return ARRAY_SIZE(tunable_strings);
+
+ if (sset == ETH_SS_PHY_TUNABLES)
+ return ARRAY_SIZE(phy_tunable_strings);
+
+ if (sset == ETH_SS_PHY_STATS && dev->phydev &&
+ !ops->get_ethtool_phy_stats &&
+ phy_ops && phy_ops->get_sset_count)
+ return phy_ops->get_sset_count(dev->phydev);
+
+ if (sset == ETH_SS_LINK_MODES)
+ return __ETHTOOL_LINK_MODE_MASK_NBITS;
+
+ if (ops->get_sset_count && ops->get_strings)
+ return ops->get_sset_count(dev, sset);
+ else
+ return -EOPNOTSUPP;
+}
+
+static void __ethtool_get_strings(struct net_device *dev,
+ u32 stringset, u8 *data)
+{
+ const struct ethtool_phy_ops *phy_ops = ethtool_phy_ops;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+
+ if (stringset == ETH_SS_FEATURES)
+ memcpy(data, netdev_features_strings,
+ sizeof(netdev_features_strings));
+ else if (stringset == ETH_SS_RSS_HASH_FUNCS)
+ memcpy(data, rss_hash_func_strings,
+ sizeof(rss_hash_func_strings));
+ else if (stringset == ETH_SS_TUNABLES)
+ memcpy(data, tunable_strings, sizeof(tunable_strings));
+ else if (stringset == ETH_SS_PHY_TUNABLES)
+ memcpy(data, phy_tunable_strings, sizeof(phy_tunable_strings));
+ else if (stringset == ETH_SS_PHY_STATS && dev->phydev &&
+ !ops->get_ethtool_phy_stats && phy_ops &&
+ phy_ops->get_strings)
+ phy_ops->get_strings(dev->phydev, data);
+ else if (stringset == ETH_SS_LINK_MODES)
+ memcpy(data, link_mode_names,
+ __ETHTOOL_LINK_MODE_MASK_NBITS * ETH_GSTRING_LEN);
+ else
+ /* ops->get_strings is valid because checked earlier */
+ ops->get_strings(dev, stringset, data);
+}
+
+static netdev_features_t ethtool_get_feature_mask(u32 eth_cmd)
+{
+ /* feature masks of legacy discrete ethtool ops */
+
+ switch (eth_cmd) {
+ case ETHTOOL_GTXCSUM:
+ case ETHTOOL_STXCSUM:
+ return NETIF_F_CSUM_MASK | NETIF_F_FCOE_CRC |
+ NETIF_F_SCTP_CRC;
+ case ETHTOOL_GRXCSUM:
+ case ETHTOOL_SRXCSUM:
+ return NETIF_F_RXCSUM;
+ case ETHTOOL_GSG:
+ case ETHTOOL_SSG:
+ return NETIF_F_SG | NETIF_F_FRAGLIST;
+ case ETHTOOL_GTSO:
+ case ETHTOOL_STSO:
+ return NETIF_F_ALL_TSO;
+ case ETHTOOL_GGSO:
+ case ETHTOOL_SGSO:
+ return NETIF_F_GSO;
+ case ETHTOOL_GGRO:
+ case ETHTOOL_SGRO:
+ return NETIF_F_GRO;
+ default:
+ BUG();
+ }
+}
+
+static int ethtool_get_one_feature(struct net_device *dev,
+ char __user *useraddr, u32 ethcmd)
+{
+ netdev_features_t mask = ethtool_get_feature_mask(ethcmd);
+ struct ethtool_value edata = {
+ .cmd = ethcmd,
+ .data = !!(dev->features & mask),
+ };
+
+ if (copy_to_user(useraddr, &edata, sizeof(edata)))
+ return -EFAULT;
+ return 0;
+}
+
+static int ethtool_set_one_feature(struct net_device *dev,
+ void __user *useraddr, u32 ethcmd)
+{
+ struct ethtool_value edata;
+ netdev_features_t mask;
+
+ if (copy_from_user(&edata, useraddr, sizeof(edata)))
+ return -EFAULT;
+
+ mask = ethtool_get_feature_mask(ethcmd);
+ mask &= dev->hw_features;
+ if (!mask)
+ return -EOPNOTSUPP;
+
+ if (edata.data)
+ dev->wanted_features |= mask;
+ else
+ dev->wanted_features &= ~mask;
+
+ __netdev_update_features(dev);
+
+ return 0;
+}
+
+#define ETH_ALL_FLAGS (ETH_FLAG_LRO | ETH_FLAG_RXVLAN | ETH_FLAG_TXVLAN | \
+ ETH_FLAG_NTUPLE | ETH_FLAG_RXHASH)
+#define ETH_ALL_FEATURES (NETIF_F_LRO | NETIF_F_HW_VLAN_CTAG_RX | \
+ NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_NTUPLE | \
+ NETIF_F_RXHASH)
+
+static u32 __ethtool_get_flags(struct net_device *dev)
+{
+ u32 flags = 0;
+
+ if (dev->features & NETIF_F_LRO)
+ flags |= ETH_FLAG_LRO;
+ if (dev->features & NETIF_F_HW_VLAN_CTAG_RX)
+ flags |= ETH_FLAG_RXVLAN;
+ if (dev->features & NETIF_F_HW_VLAN_CTAG_TX)
+ flags |= ETH_FLAG_TXVLAN;
+ if (dev->features & NETIF_F_NTUPLE)
+ flags |= ETH_FLAG_NTUPLE;
+ if (dev->features & NETIF_F_RXHASH)
+ flags |= ETH_FLAG_RXHASH;
+
+ return flags;
+}
+
+static int __ethtool_set_flags(struct net_device *dev, u32 data)
+{
+ netdev_features_t features = 0, changed;
+
+ if (data & ~ETH_ALL_FLAGS)
+ return -EINVAL;
+
+ if (data & ETH_FLAG_LRO)
+ features |= NETIF_F_LRO;
+ if (data & ETH_FLAG_RXVLAN)
+ features |= NETIF_F_HW_VLAN_CTAG_RX;
+ if (data & ETH_FLAG_TXVLAN)
+ features |= NETIF_F_HW_VLAN_CTAG_TX;
+ if (data & ETH_FLAG_NTUPLE)
+ features |= NETIF_F_NTUPLE;
+ if (data & ETH_FLAG_RXHASH)
+ features |= NETIF_F_RXHASH;
+
+ /* allow changing only bits set in hw_features */
+ changed = (features ^ dev->features) & ETH_ALL_FEATURES;
+ if (changed & ~dev->hw_features)
+ return (changed & dev->hw_features) ? -EINVAL : -EOPNOTSUPP;
+
+ dev->wanted_features =
+ (dev->wanted_features & ~changed) | (features & changed);
+
+ __netdev_update_features(dev);
+
+ return 0;
+}
+
+/* Given two link masks, AND them together and save the result in dst. */
+void ethtool_intersect_link_masks(struct ethtool_link_ksettings *dst,
+ struct ethtool_link_ksettings *src)
+{
+ unsigned int size = BITS_TO_LONGS(__ETHTOOL_LINK_MODE_MASK_NBITS);
+ unsigned int idx = 0;
+
+ for (; idx < size; idx++) {
+ dst->link_modes.supported[idx] &=
+ src->link_modes.supported[idx];
+ dst->link_modes.advertising[idx] &=
+ src->link_modes.advertising[idx];
+ }
+}
+EXPORT_SYMBOL(ethtool_intersect_link_masks);
+
+void ethtool_convert_legacy_u32_to_link_mode(unsigned long *dst,
+ u32 legacy_u32)
+{
+ linkmode_zero(dst);
+ dst[0] = legacy_u32;
+}
+EXPORT_SYMBOL(ethtool_convert_legacy_u32_to_link_mode);
+
+/* return false if src had higher bits set. lower bits always updated. */
+bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32,
+ const unsigned long *src)
+{
+ *legacy_u32 = src[0];
+ return find_next_bit(src, __ETHTOOL_LINK_MODE_MASK_NBITS, 32) ==
+ __ETHTOOL_LINK_MODE_MASK_NBITS;
+}
+EXPORT_SYMBOL(ethtool_convert_link_mode_to_legacy_u32);
+
+/* return false if ksettings link modes had higher bits
+ * set. legacy_settings always updated (best effort)
+ */
+static bool
+convert_link_ksettings_to_legacy_settings(
+ struct ethtool_cmd *legacy_settings,
+ const struct ethtool_link_ksettings *link_ksettings)
+{
+ bool retval = true;
+
+ memset(legacy_settings, 0, sizeof(*legacy_settings));
+ /* this also clears the deprecated fields in legacy structure:
+ * __u8 transceiver;
+ * __u32 maxtxpkt;
+ * __u32 maxrxpkt;
+ */
+
+ retval &= ethtool_convert_link_mode_to_legacy_u32(
+ &legacy_settings->supported,
+ link_ksettings->link_modes.supported);
+ retval &= ethtool_convert_link_mode_to_legacy_u32(
+ &legacy_settings->advertising,
+ link_ksettings->link_modes.advertising);
+ retval &= ethtool_convert_link_mode_to_legacy_u32(
+ &legacy_settings->lp_advertising,
+ link_ksettings->link_modes.lp_advertising);
+ ethtool_cmd_speed_set(legacy_settings, link_ksettings->base.speed);
+ legacy_settings->duplex
+ = link_ksettings->base.duplex;
+ legacy_settings->port
+ = link_ksettings->base.port;
+ legacy_settings->phy_address
+ = link_ksettings->base.phy_address;
+ legacy_settings->autoneg
+ = link_ksettings->base.autoneg;
+ legacy_settings->mdio_support
+ = link_ksettings->base.mdio_support;
+ legacy_settings->eth_tp_mdix
+ = link_ksettings->base.eth_tp_mdix;
+ legacy_settings->eth_tp_mdix_ctrl
+ = link_ksettings->base.eth_tp_mdix_ctrl;
+ legacy_settings->transceiver
+ = link_ksettings->base.transceiver;
+ return retval;
+}
+
+/* number of 32-bit words to store the user's link mode bitmaps */
+#define __ETHTOOL_LINK_MODE_MASK_NU32 \
+ DIV_ROUND_UP(__ETHTOOL_LINK_MODE_MASK_NBITS, 32)
+
+/* layout of the struct passed from/to userland */
+struct ethtool_link_usettings {
+ struct ethtool_link_settings base;
+ struct {
+ __u32 supported[__ETHTOOL_LINK_MODE_MASK_NU32];
+ __u32 advertising[__ETHTOOL_LINK_MODE_MASK_NU32];
+ __u32 lp_advertising[__ETHTOOL_LINK_MODE_MASK_NU32];
+ } link_modes;
+};
+
+/* Internal kernel helper to query a device ethtool_link_settings. */
+int __ethtool_get_link_ksettings(struct net_device *dev,
+ struct ethtool_link_ksettings *link_ksettings)
+{
+ ASSERT_RTNL();
+
+ if (!dev->ethtool_ops->get_link_ksettings)
+ return -EOPNOTSUPP;
+
+ if (!netif_device_present(dev))
+ return -ENODEV;
+
+ memset(link_ksettings, 0, sizeof(*link_ksettings));
+ return dev->ethtool_ops->get_link_ksettings(dev, link_ksettings);
+}
+EXPORT_SYMBOL(__ethtool_get_link_ksettings);
+
+/* convert ethtool_link_usettings in user space to a kernel internal
+ * ethtool_link_ksettings. return 0 on success, errno on error.
+ */
+static int load_link_ksettings_from_user(struct ethtool_link_ksettings *to,
+ const void __user *from)
+{
+ struct ethtool_link_usettings link_usettings;
+
+ if (copy_from_user(&link_usettings, from, sizeof(link_usettings)))
+ return -EFAULT;
+
+ memcpy(&to->base, &link_usettings.base, sizeof(to->base));
+ bitmap_from_arr32(to->link_modes.supported,
+ link_usettings.link_modes.supported,
+ __ETHTOOL_LINK_MODE_MASK_NBITS);
+ bitmap_from_arr32(to->link_modes.advertising,
+ link_usettings.link_modes.advertising,
+ __ETHTOOL_LINK_MODE_MASK_NBITS);
+ bitmap_from_arr32(to->link_modes.lp_advertising,
+ link_usettings.link_modes.lp_advertising,
+ __ETHTOOL_LINK_MODE_MASK_NBITS);
+
+ return 0;
+}
+
+/* Check if the user is trying to change anything besides speed/duplex */
+bool ethtool_virtdev_validate_cmd(const struct ethtool_link_ksettings *cmd)
+{
+ struct ethtool_link_settings base2 = {};
+
+ base2.speed = cmd->base.speed;
+ base2.port = PORT_OTHER;
+ base2.duplex = cmd->base.duplex;
+ base2.cmd = cmd->base.cmd;
+ base2.link_mode_masks_nwords = cmd->base.link_mode_masks_nwords;
+
+ return !memcmp(&base2, &cmd->base, sizeof(base2)) &&
+ bitmap_empty(cmd->link_modes.supported,
+ __ETHTOOL_LINK_MODE_MASK_NBITS) &&
+ bitmap_empty(cmd->link_modes.lp_advertising,
+ __ETHTOOL_LINK_MODE_MASK_NBITS);
+}
+
+/* convert a kernel internal ethtool_link_ksettings to
+ * ethtool_link_usettings in user space. return 0 on success, errno on
+ * error.
+ */
+static int
+store_link_ksettings_for_user(void __user *to,
+ const struct ethtool_link_ksettings *from)
+{
+ struct ethtool_link_usettings link_usettings;
+
+ memcpy(&link_usettings, from, sizeof(link_usettings));
+ bitmap_to_arr32(link_usettings.link_modes.supported,
+ from->link_modes.supported,
+ __ETHTOOL_LINK_MODE_MASK_NBITS);
+ bitmap_to_arr32(link_usettings.link_modes.advertising,
+ from->link_modes.advertising,
+ __ETHTOOL_LINK_MODE_MASK_NBITS);
+ bitmap_to_arr32(link_usettings.link_modes.lp_advertising,
+ from->link_modes.lp_advertising,
+ __ETHTOOL_LINK_MODE_MASK_NBITS);
+
+ if (copy_to_user(to, &link_usettings, sizeof(link_usettings)))
+ return -EFAULT;
+
+ return 0;
+}
+
+/* Query device for its ethtool_link_settings. */
+static int ethtool_get_link_ksettings(struct net_device *dev,
+ void __user *useraddr)
+{
+ int err = 0;
+ struct ethtool_link_ksettings link_ksettings;
+
+ ASSERT_RTNL();
+ if (!dev->ethtool_ops->get_link_ksettings)
+ return -EOPNOTSUPP;
+
+ /* handle bitmap nbits handshake */
+ if (copy_from_user(&link_ksettings.base, useraddr,
+ sizeof(link_ksettings.base)))
+ return -EFAULT;
+
+ if (__ETHTOOL_LINK_MODE_MASK_NU32
+ != link_ksettings.base.link_mode_masks_nwords) {
+ /* wrong link mode nbits requested */
+ memset(&link_ksettings, 0, sizeof(link_ksettings));
+ link_ksettings.base.cmd = ETHTOOL_GLINKSETTINGS;
+ /* send back number of words required as negative val */
+ compiletime_assert(__ETHTOOL_LINK_MODE_MASK_NU32 <= S8_MAX,
+ "need too many bits for link modes!");
+ link_ksettings.base.link_mode_masks_nwords
+ = -((s8)__ETHTOOL_LINK_MODE_MASK_NU32);
+
+ /* copy the base fields back to user, not the link
+ * mode bitmaps
+ */
+ if (copy_to_user(useraddr, &link_ksettings.base,
+ sizeof(link_ksettings.base)))
+ return -EFAULT;
+
+ return 0;
+ }
+
+ /* handshake successful: user/kernel agree on
+ * link_mode_masks_nwords
+ */
+
+ memset(&link_ksettings, 0, sizeof(link_ksettings));
+ err = dev->ethtool_ops->get_link_ksettings(dev, &link_ksettings);
+ if (err < 0)
+ return err;
+
+ /* make sure we tell the right values to user */
+ link_ksettings.base.cmd = ETHTOOL_GLINKSETTINGS;
+ link_ksettings.base.link_mode_masks_nwords
+ = __ETHTOOL_LINK_MODE_MASK_NU32;
+ link_ksettings.base.master_slave_cfg = MASTER_SLAVE_CFG_UNSUPPORTED;
+ link_ksettings.base.master_slave_state = MASTER_SLAVE_STATE_UNSUPPORTED;
+ link_ksettings.base.rate_matching = RATE_MATCH_NONE;
+
+ return store_link_ksettings_for_user(useraddr, &link_ksettings);
+}
+
+/* Update device ethtool_link_settings. */
+static int ethtool_set_link_ksettings(struct net_device *dev,
+ void __user *useraddr)
+{
+ struct ethtool_link_ksettings link_ksettings = {};
+ int err;
+
+ ASSERT_RTNL();
+
+ if (!dev->ethtool_ops->set_link_ksettings)
+ return -EOPNOTSUPP;
+
+ /* make sure nbits field has expected value */
+ if (copy_from_user(&link_ksettings.base, useraddr,
+ sizeof(link_ksettings.base)))
+ return -EFAULT;
+
+ if (__ETHTOOL_LINK_MODE_MASK_NU32
+ != link_ksettings.base.link_mode_masks_nwords)
+ return -EINVAL;
+
+ /* copy the whole structure, now that we know it has expected
+ * format
+ */
+ err = load_link_ksettings_from_user(&link_ksettings, useraddr);
+ if (err)
+ return err;
+
+ /* re-check nwords field, just in case */
+ if (__ETHTOOL_LINK_MODE_MASK_NU32
+ != link_ksettings.base.link_mode_masks_nwords)
+ return -EINVAL;
+
+ if (link_ksettings.base.master_slave_cfg ||
+ link_ksettings.base.master_slave_state)
+ return -EINVAL;
+
+ err = dev->ethtool_ops->set_link_ksettings(dev, &link_ksettings);
+ if (err >= 0) {
+ ethtool_notify(dev, ETHTOOL_MSG_LINKINFO_NTF);
+ ethtool_notify(dev, ETHTOOL_MSG_LINKMODES_NTF);
+ }
+ return err;
+}
+
+int ethtool_virtdev_set_link_ksettings(struct net_device *dev,
+ const struct ethtool_link_ksettings *cmd,
+ u32 *dev_speed, u8 *dev_duplex)
+{
+ u32 speed;
+ u8 duplex;
+
+ speed = cmd->base.speed;
+ duplex = cmd->base.duplex;
+ /* don't allow custom speed and duplex */
+ if (!ethtool_validate_speed(speed) ||
+ !ethtool_validate_duplex(duplex) ||
+ !ethtool_virtdev_validate_cmd(cmd))
+ return -EINVAL;
+ *dev_speed = speed;
+ *dev_duplex = duplex;
+
+ return 0;
+}
+EXPORT_SYMBOL(ethtool_virtdev_set_link_ksettings);
+
+/* Query device for its ethtool_cmd settings.
+ *
+ * Backward compatibility note: for compatibility with legacy ethtool, this is
+ * now implemented via get_link_ksettings. When driver reports higher link mode
+ * bits, a kernel warning is logged once (with name of 1st driver/device) to
+ * recommend user to upgrade ethtool, but the command is successful (only the
+ * lower link mode bits reported back to user). Deprecated fields from
+ * ethtool_cmd (transceiver/maxrxpkt/maxtxpkt) are always set to zero.
+ */
+static int ethtool_get_settings(struct net_device *dev, void __user *useraddr)
+{
+ struct ethtool_link_ksettings link_ksettings;
+ struct ethtool_cmd cmd;
+ int err;
+
+ ASSERT_RTNL();
+ if (!dev->ethtool_ops->get_link_ksettings)
+ return -EOPNOTSUPP;
+
+ if (dev->ethtool->module_fw_flash_in_progress)
+ return -EBUSY;
+
+ memset(&link_ksettings, 0, sizeof(link_ksettings));
+ err = dev->ethtool_ops->get_link_ksettings(dev, &link_ksettings);
+ if (err < 0)
+ return err;
+ convert_link_ksettings_to_legacy_settings(&cmd, &link_ksettings);
+
+ /* send a sensible cmd tag back to user */
+ cmd.cmd = ETHTOOL_GSET;
+
+ if (copy_to_user(useraddr, &cmd, sizeof(cmd)))
+ return -EFAULT;
+
+ return 0;
+}
+
+/* Update device link settings with given ethtool_cmd.
+ *
+ * Backward compatibility note: for compatibility with legacy ethtool, this is
+ * now always implemented via set_link_settings. When user's request updates
+ * deprecated ethtool_cmd fields (transceiver/maxrxpkt/maxtxpkt), a kernel
+ * warning is logged once (with name of 1st driver/device) to recommend user to
+ * upgrade ethtool, and the request is rejected.
+ */
+static int ethtool_set_settings(struct net_device *dev, void __user *useraddr)
+{
+ struct ethtool_link_ksettings link_ksettings;
+ struct ethtool_cmd cmd;
+ int ret;
+
+ ASSERT_RTNL();
+
+ if (copy_from_user(&cmd, useraddr, sizeof(cmd)))
+ return -EFAULT;
+ if (!dev->ethtool_ops->set_link_ksettings)
+ return -EOPNOTSUPP;
+
+ if (!convert_legacy_settings_to_link_ksettings(&link_ksettings, &cmd))
+ return -EINVAL;
+ link_ksettings.base.link_mode_masks_nwords =
+ __ETHTOOL_LINK_MODE_MASK_NU32;
+ ret = dev->ethtool_ops->set_link_ksettings(dev, &link_ksettings);
+ if (ret >= 0) {
+ ethtool_notify(dev, ETHTOOL_MSG_LINKINFO_NTF);
+ ethtool_notify(dev, ETHTOOL_MSG_LINKMODES_NTF);
+ }
+ return ret;
+}
+
+static int
+ethtool_get_drvinfo(struct net_device *dev, struct ethtool_devlink_compat *rsp)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct device *parent = dev->dev.parent;
+
+ rsp->info.cmd = ETHTOOL_GDRVINFO;
+ strscpy(rsp->info.version, init_uts_ns.name.release,
+ sizeof(rsp->info.version));
+ if (ops->get_drvinfo) {
+ ops->get_drvinfo(dev, &rsp->info);
+ if (!rsp->info.bus_info[0] && parent)
+ strscpy(rsp->info.bus_info, dev_name(parent),
+ sizeof(rsp->info.bus_info));
+ if (!rsp->info.driver[0] && parent && parent->driver)
+ strscpy(rsp->info.driver, parent->driver->name,
+ sizeof(rsp->info.driver));
+ } else if (parent && parent->driver) {
+ strscpy(rsp->info.bus_info, dev_name(parent),
+ sizeof(rsp->info.bus_info));
+ strscpy(rsp->info.driver, parent->driver->name,
+ sizeof(rsp->info.driver));
+ } else if (dev->rtnl_link_ops) {
+ strscpy(rsp->info.driver, dev->rtnl_link_ops->kind,
+ sizeof(rsp->info.driver));
+ } else {
+ return -EOPNOTSUPP;
+ }
+
+ /*
+ * this method of obtaining string set info is deprecated;
+ * Use ETHTOOL_GSSET_INFO instead.
+ */
+ if (ops->get_sset_count) {
+ int rc;
+
+ rc = ops->get_sset_count(dev, ETH_SS_TEST);
+ if (rc >= 0)
+ rsp->info.testinfo_len = rc;
+ rc = ops->get_sset_count(dev, ETH_SS_STATS);
+ if (rc >= 0)
+ rsp->info.n_stats = rc;
+ rc = ops->get_sset_count(dev, ETH_SS_PRIV_FLAGS);
+ if (rc >= 0)
+ rsp->info.n_priv_flags = rc;
+ }
+ if (ops->get_regs_len) {
+ int ret = ops->get_regs_len(dev);
+
+ if (ret > 0)
+ rsp->info.regdump_len = ret;
+ }
+
+ if (ops->get_eeprom_len)
+ rsp->info.eedump_len = ops->get_eeprom_len(dev);
+
+ if (!rsp->info.fw_version[0])
+ rsp->devlink = netdev_to_devlink_get(dev);
+
+ return 0;
+}
+
+static noinline_for_stack int ethtool_get_sset_info(struct net_device *dev,
+ void __user *useraddr)
+{
+ struct ethtool_sset_info info;
+ u64 sset_mask;
+ int i, idx = 0, n_bits = 0, ret, rc;
+ u32 *info_buf = NULL;
+
+ if (copy_from_user(&info, useraddr, sizeof(info)))
+ return -EFAULT;
+
+ /* store copy of mask, because we zero struct later on */
+ sset_mask = info.sset_mask;
+ if (!sset_mask)
+ return 0;
+
+ /* calculate size of return buffer */
+ n_bits = hweight64(sset_mask);
+
+ memset(&info, 0, sizeof(info));
+ info.cmd = ETHTOOL_GSSET_INFO;
+
+ info_buf = kcalloc(n_bits, sizeof(u32), GFP_USER);
+ if (!info_buf)
+ return -ENOMEM;
+
+ /*
+ * fill return buffer based on input bitmask and successful
+ * get_sset_count return
+ */
+ for (i = 0; i < 64; i++) {
+ if (!(sset_mask & (1ULL << i)))
+ continue;
+
+ rc = __ethtool_get_sset_count(dev, i);
+ if (rc >= 0) {
+ info.sset_mask |= (1ULL << i);
+ info_buf[idx++] = rc;
+ }
+ }
+
+ ret = -EFAULT;
+ if (copy_to_user(useraddr, &info, sizeof(info)))
+ goto out;
+
+ useraddr += offsetof(struct ethtool_sset_info, data);
+ if (copy_to_user(useraddr, info_buf, array_size(idx, sizeof(u32))))
+ goto out;
+
+ ret = 0;
+
+out:
+ kfree(info_buf);
+ return ret;
+}
+
+static noinline_for_stack int
+ethtool_rxnfc_copy_from_compat(struct ethtool_rxnfc *rxnfc,
+ const struct compat_ethtool_rxnfc __user *useraddr,
+ size_t size)
+{
+ struct compat_ethtool_rxnfc crxnfc = {};
+
+ /* We expect there to be holes between fs.m_ext and
+ * fs.ring_cookie and at the end of fs, but nowhere else.
+ * On non-x86, no conversion should be needed.
+ */
+ BUILD_BUG_ON(!IS_ENABLED(CONFIG_X86_64) &&
+ sizeof(struct compat_ethtool_rxnfc) !=
+ sizeof(struct ethtool_rxnfc));
+ BUILD_BUG_ON(offsetof(struct compat_ethtool_rxnfc, fs.m_ext) +
+ sizeof(useraddr->fs.m_ext) !=
+ offsetof(struct ethtool_rxnfc, fs.m_ext) +
+ sizeof(rxnfc->fs.m_ext));
+ BUILD_BUG_ON(offsetof(struct compat_ethtool_rxnfc, fs.location) -
+ offsetof(struct compat_ethtool_rxnfc, fs.ring_cookie) !=
+ offsetof(struct ethtool_rxnfc, fs.location) -
+ offsetof(struct ethtool_rxnfc, fs.ring_cookie));
+
+ if (copy_from_user(&crxnfc, useraddr, min(size, sizeof(crxnfc))))
+ return -EFAULT;
+
+ *rxnfc = (struct ethtool_rxnfc) {
+ .cmd = crxnfc.cmd,
+ .flow_type = crxnfc.flow_type,
+ .data = crxnfc.data,
+ .fs = {
+ .flow_type = crxnfc.fs.flow_type,
+ .h_u = crxnfc.fs.h_u,
+ .h_ext = crxnfc.fs.h_ext,
+ .m_u = crxnfc.fs.m_u,
+ .m_ext = crxnfc.fs.m_ext,
+ .ring_cookie = crxnfc.fs.ring_cookie,
+ .location = crxnfc.fs.location,
+ },
+ .rule_cnt = crxnfc.rule_cnt,
+ };
+
+ return 0;
+}
+
+static int ethtool_rxnfc_copy_from_user(struct ethtool_rxnfc *rxnfc,
+ const void __user *useraddr,
+ size_t size)
+{
+ if (compat_need_64bit_alignment_fixup())
+ return ethtool_rxnfc_copy_from_compat(rxnfc, useraddr, size);
+
+ if (copy_from_user(rxnfc, useraddr, size))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int ethtool_rxnfc_copy_to_compat(void __user *useraddr,
+ const struct ethtool_rxnfc *rxnfc,
+ size_t size, const u32 *rule_buf)
+{
+ struct compat_ethtool_rxnfc crxnfc;
+
+ memset(&crxnfc, 0, sizeof(crxnfc));
+ crxnfc = (struct compat_ethtool_rxnfc) {
+ .cmd = rxnfc->cmd,
+ .flow_type = rxnfc->flow_type,
+ .data = rxnfc->data,
+ .fs = {
+ .flow_type = rxnfc->fs.flow_type,
+ .h_u = rxnfc->fs.h_u,
+ .h_ext = rxnfc->fs.h_ext,
+ .m_u = rxnfc->fs.m_u,
+ .m_ext = rxnfc->fs.m_ext,
+ .ring_cookie = rxnfc->fs.ring_cookie,
+ .location = rxnfc->fs.location,
+ },
+ .rule_cnt = rxnfc->rule_cnt,
+ };
+
+ if (copy_to_user(useraddr, &crxnfc, min(size, sizeof(crxnfc))))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int ethtool_rxnfc_copy_struct(u32 cmd, struct ethtool_rxnfc *info,
+ size_t *info_size, void __user *useraddr)
+{
+ /* struct ethtool_rxnfc was originally defined for
+ * ETHTOOL_{G,S}RXFH with only the cmd, flow_type and data
+ * members. User-space might still be using that
+ * definition.
+ */
+ if (cmd == ETHTOOL_GRXFH || cmd == ETHTOOL_SRXFH)
+ *info_size = (offsetof(struct ethtool_rxnfc, data) +
+ sizeof(info->data));
+
+ if (ethtool_rxnfc_copy_from_user(info, useraddr, *info_size))
+ return -EFAULT;
+
+ if ((cmd == ETHTOOL_GRXFH || cmd == ETHTOOL_SRXFH) && info->flow_type & FLOW_RSS) {
+ *info_size = sizeof(*info);
+ if (ethtool_rxnfc_copy_from_user(info, useraddr, *info_size))
+ return -EFAULT;
+ /* Since malicious users may modify the original data,
+ * we need to check whether FLOW_RSS is still requested.
+ */
+ if (!(info->flow_type & FLOW_RSS))
+ return -EINVAL;
+ }
+
+ if (info->cmd != cmd)
+ return -EINVAL;
+
+ return 0;
+}
+
+static int ethtool_rxnfc_copy_to_user(void __user *useraddr,
+ const struct ethtool_rxnfc *rxnfc,
+ size_t size, const u32 *rule_buf)
+{
+ int ret;
+
+ if (compat_need_64bit_alignment_fixup()) {
+ ret = ethtool_rxnfc_copy_to_compat(useraddr, rxnfc, size,
+ rule_buf);
+ useraddr += offsetof(struct compat_ethtool_rxnfc, rule_locs);
+ } else {
+ ret = copy_to_user(useraddr, rxnfc, size);
+ useraddr += offsetof(struct ethtool_rxnfc, rule_locs);
+ }
+
+ if (ret)
+ return -EFAULT;
+
+ if (rule_buf) {
+ if (copy_to_user(useraddr, rule_buf,
+ rxnfc->rule_cnt * sizeof(u32)))
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+static bool flow_type_hashable(u32 flow_type)
+{
+ switch (flow_type) {
+ case ETHER_FLOW:
+ case TCP_V4_FLOW:
+ case UDP_V4_FLOW:
+ case SCTP_V4_FLOW:
+ case AH_ESP_V4_FLOW:
+ case TCP_V6_FLOW:
+ case UDP_V6_FLOW:
+ case SCTP_V6_FLOW:
+ case AH_ESP_V6_FLOW:
+ case AH_V4_FLOW:
+ case ESP_V4_FLOW:
+ case AH_V6_FLOW:
+ case ESP_V6_FLOW:
+ case IPV4_FLOW:
+ case IPV6_FLOW:
+ case GTPU_V4_FLOW:
+ case GTPU_V6_FLOW:
+ case GTPC_V4_FLOW:
+ case GTPC_V6_FLOW:
+ case GTPC_TEID_V4_FLOW:
+ case GTPC_TEID_V6_FLOW:
+ case GTPU_EH_V4_FLOW:
+ case GTPU_EH_V6_FLOW:
+ case GTPU_UL_V4_FLOW:
+ case GTPU_UL_V6_FLOW:
+ case GTPU_DL_V4_FLOW:
+ case GTPU_DL_V6_FLOW:
+ return true;
+ }
+
+ return false;
+}
+
+static bool flow_type_v6(u32 flow_type)
+{
+ switch (flow_type) {
+ case TCP_V6_FLOW:
+ case UDP_V6_FLOW:
+ case SCTP_V6_FLOW:
+ case AH_ESP_V6_FLOW:
+ case AH_V6_FLOW:
+ case ESP_V6_FLOW:
+ case IPV6_FLOW:
+ case GTPU_V6_FLOW:
+ case GTPC_V6_FLOW:
+ case GTPC_TEID_V6_FLOW:
+ case GTPU_EH_V6_FLOW:
+ case GTPU_UL_V6_FLOW:
+ case GTPU_DL_V6_FLOW:
+ return true;
+ }
+
+ return false;
+}
+
+/* When adding a new type, update the assert and, if it's hashable, add it to
+ * the flow_type_hashable switch case.
+ */
+static_assert(GTPU_DL_V6_FLOW + 1 == __FLOW_TYPE_COUNT);
+
+static int ethtool_check_xfrm_rxfh(u32 input_xfrm, u64 rxfh)
+{
+ /* Sanity check: if symmetric-xor/symmetric-or-xor is set, then:
+ * 1 - no other fields besides IP src/dst and/or L4 src/dst are set
+ * 2 - If src is set, dst must also be set
+ */
+ if ((input_xfrm != RXH_XFRM_NO_CHANGE &&
+ input_xfrm & (RXH_XFRM_SYM_XOR | RXH_XFRM_SYM_OR_XOR)) &&
+ !ethtool_rxfh_config_is_sym(rxfh))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int ethtool_check_flow_types(struct net_device *dev, u32 input_xfrm)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ int err;
+ u32 i;
+
+ if (!input_xfrm || input_xfrm == RXH_XFRM_NO_CHANGE)
+ return 0;
+
+ for (i = 0; i < __FLOW_TYPE_COUNT; i++) {
+ struct ethtool_rxfh_fields fields = {
+ .flow_type = i,
+ };
+
+ if (!flow_type_hashable(i))
+ continue;
+
+ if (ops->get_rxfh_fields(dev, &fields))
+ continue;
+
+ err = ethtool_check_xfrm_rxfh(input_xfrm, fields.data);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+static noinline_for_stack int
+ethtool_set_rxfh_fields(struct net_device *dev, u32 cmd, void __user *useraddr)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct ethtool_rxfh_fields fields = {};
+ struct ethtool_rxnfc info;
+ size_t info_size = sizeof(info);
+ int rc;
+
+ if (!ops->set_rxfh_fields)
+ return -EOPNOTSUPP;
+
+ rc = ethtool_rxnfc_copy_struct(cmd, &info, &info_size, useraddr);
+ if (rc)
+ return rc;
+
+ if (info.data & RXH_IP6_FL && !flow_type_v6(info.flow_type))
+ return -EINVAL;
+
+ if (info.flow_type & FLOW_RSS && info.rss_context &&
+ !ops->rxfh_per_ctx_fields)
+ return -EINVAL;
+
+ mutex_lock(&dev->ethtool->rss_lock);
+ if (ops->get_rxfh) {
+ struct ethtool_rxfh_param rxfh = {};
+
+ rc = ops->get_rxfh(dev, &rxfh);
+ if (rc)
+ goto exit_unlock;
+
+ rc = ethtool_check_xfrm_rxfh(rxfh.input_xfrm, info.data);
+ if (rc)
+ goto exit_unlock;
+ }
+
+ fields.data = info.data;
+ fields.flow_type = info.flow_type & ~FLOW_RSS;
+ if (info.flow_type & FLOW_RSS)
+ fields.rss_context = info.rss_context;
+
+ rc = ops->set_rxfh_fields(dev, &fields, NULL);
+exit_unlock:
+ mutex_unlock(&dev->ethtool->rss_lock);
+ if (rc)
+ return rc;
+
+ ethtool_rss_notify(dev, ETHTOOL_MSG_RSS_NTF, fields.rss_context);
+ return 0;
+}
+
+static noinline_for_stack int
+ethtool_get_rxfh_fields(struct net_device *dev, u32 cmd, void __user *useraddr)
+{
+ struct ethtool_rxnfc info;
+ size_t info_size = sizeof(info);
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct ethtool_rxfh_fields fields = {};
+ int ret;
+
+ if (!ops->get_rxfh_fields)
+ return -EOPNOTSUPP;
+
+ ret = ethtool_rxnfc_copy_struct(cmd, &info, &info_size, useraddr);
+ if (ret)
+ return ret;
+
+ if (info.flow_type & FLOW_RSS && info.rss_context &&
+ !ops->rxfh_per_ctx_fields)
+ return -EINVAL;
+
+ fields.flow_type = info.flow_type & ~FLOW_RSS;
+ if (info.flow_type & FLOW_RSS)
+ fields.rss_context = info.rss_context;
+
+ mutex_lock(&dev->ethtool->rss_lock);
+ ret = ops->get_rxfh_fields(dev, &fields);
+ mutex_unlock(&dev->ethtool->rss_lock);
+ if (ret < 0)
+ return ret;
+
+ info.data = fields.data;
+
+ return ethtool_rxnfc_copy_to_user(useraddr, &info, info_size, NULL);
+}
+
+static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev,
+ u32 cmd, void __user *useraddr)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct ethtool_rxnfc info;
+ size_t info_size = sizeof(info);
+ int rc;
+
+ if (!ops->set_rxnfc)
+ return -EOPNOTSUPP;
+
+ rc = ethtool_rxnfc_copy_struct(cmd, &info, &info_size, useraddr);
+ if (rc)
+ return rc;
+
+ if (cmd == ETHTOOL_SRXCLSRLINS && info.fs.flow_type & FLOW_RSS) {
+ /* Nonzero ring with RSS only makes sense
+ * if NIC adds them together
+ */
+ if (!ops->cap_rss_rxnfc_adds &&
+ ethtool_get_flow_spec_ring(info.fs.ring_cookie))
+ return -EINVAL;
+
+ if (info.rss_context &&
+ !xa_load(&dev->ethtool->rss_ctx, info.rss_context))
+ return -EINVAL;
+ }
+
+ rc = ops->set_rxnfc(dev, &info);
+ if (rc)
+ return rc;
+
+ if (cmd == ETHTOOL_SRXCLSRLINS &&
+ ethtool_rxnfc_copy_to_user(useraddr, &info, info_size, NULL))
+ return -EFAULT;
+
+ return 0;
+}
+
+static noinline_for_stack int ethtool_get_rxrings(struct net_device *dev,
+ u32 cmd,
+ void __user *useraddr)
+{
+ struct ethtool_rxnfc info;
+ size_t info_size;
+ int ret;
+
+ info_size = sizeof(info);
+ ret = ethtool_rxnfc_copy_struct(cmd, &info, &info_size, useraddr);
+ if (ret)
+ return ret;
+
+ ret = ethtool_get_rx_ring_count(dev);
+ if (ret < 0)
+ return ret;
+
+ info.data = ret;
+
+ return ethtool_rxnfc_copy_to_user(useraddr, &info, info_size, NULL);
+}
+
+static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev,
+ u32 cmd, void __user *useraddr)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct ethtool_rxnfc info;
+ void *rule_buf = NULL;
+ size_t info_size;
+ int ret;
+
+ if (!ops->get_rxnfc)
+ return -EOPNOTSUPP;
+
+ info_size = sizeof(info);
+ ret = ethtool_rxnfc_copy_struct(cmd, &info, &info_size, useraddr);
+ if (ret)
+ return ret;
+
+ if (info.cmd == ETHTOOL_GRXCLSRLALL) {
+ if (info.rule_cnt > 0) {
+ if (info.rule_cnt <= KMALLOC_MAX_SIZE / sizeof(u32))
+ rule_buf = kcalloc(info.rule_cnt, sizeof(u32),
+ GFP_USER);
+ if (!rule_buf)
+ return -ENOMEM;
+ }
+ }
+
+ ret = ops->get_rxnfc(dev, &info, rule_buf);
+ if (ret < 0)
+ goto err_out;
+
+ ret = ethtool_rxnfc_copy_to_user(useraddr, &info, info_size, rule_buf);
+err_out:
+ kfree(rule_buf);
+
+ return ret;
+}
+
+static int ethtool_copy_validate_indir(u32 *indir, void __user *useraddr,
+ int num_rx_rings,
+ u32 size)
+{
+ int i;
+
+ if (copy_from_user(indir, useraddr, array_size(size, sizeof(indir[0]))))
+ return -EFAULT;
+
+ /* Validate ring indices */
+ for (i = 0; i < size; i++)
+ if (indir[i] >= num_rx_rings)
+ return -EINVAL;
+
+ return 0;
+}
+
+u8 netdev_rss_key[NETDEV_RSS_KEY_LEN] __read_mostly;
+
+void netdev_rss_key_fill(void *buffer, size_t len)
+{
+ BUG_ON(len > sizeof(netdev_rss_key));
+ net_get_random_once(netdev_rss_key, sizeof(netdev_rss_key));
+ memcpy(buffer, netdev_rss_key, len);
+}
+EXPORT_SYMBOL(netdev_rss_key_fill);
+
+static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev,
+ void __user *useraddr)
+{
+ struct ethtool_rxfh_param rxfh = {};
+ u32 user_size;
+ int ret;
+
+ if (!dev->ethtool_ops->get_rxfh_indir_size ||
+ !dev->ethtool_ops->get_rxfh)
+ return -EOPNOTSUPP;
+ rxfh.indir_size = dev->ethtool_ops->get_rxfh_indir_size(dev);
+ if (rxfh.indir_size == 0)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&user_size,
+ useraddr + offsetof(struct ethtool_rxfh_indir, size),
+ sizeof(user_size)))
+ return -EFAULT;
+
+ if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh_indir, size),
+ &rxfh.indir_size, sizeof(rxfh.indir_size)))
+ return -EFAULT;
+
+ /* If the user buffer size is 0, this is just a query for the
+ * device table size. Otherwise, if it's smaller than the
+ * device table size it's an error.
+ */
+ if (user_size < rxfh.indir_size)
+ return user_size == 0 ? 0 : -EINVAL;
+
+ rxfh.indir = kcalloc(rxfh.indir_size, sizeof(rxfh.indir[0]), GFP_USER);
+ if (!rxfh.indir)
+ return -ENOMEM;
+
+ mutex_lock(&dev->ethtool->rss_lock);
+ ret = dev->ethtool_ops->get_rxfh(dev, &rxfh);
+ mutex_unlock(&dev->ethtool->rss_lock);
+ if (ret)
+ goto out;
+ if (copy_to_user(useraddr +
+ offsetof(struct ethtool_rxfh_indir, ring_index[0]),
+ rxfh.indir, rxfh.indir_size * sizeof(*rxfh.indir)))
+ ret = -EFAULT;
+
+out:
+ kfree(rxfh.indir);
+ return ret;
+}
+
+static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev,
+ void __user *useraddr)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct ethtool_rxfh_param rxfh_dev = {};
+ struct netlink_ext_ack *extack = NULL;
+ int num_rx_rings;
+ u32 user_size, i;
+ int ret;
+ u32 ringidx_offset = offsetof(struct ethtool_rxfh_indir, ring_index[0]);
+
+ if (!ops->get_rxfh_indir_size || !ops->set_rxfh)
+ return -EOPNOTSUPP;
+
+ rxfh_dev.indir_size = ops->get_rxfh_indir_size(dev);
+ if (rxfh_dev.indir_size == 0)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&user_size,
+ useraddr + offsetof(struct ethtool_rxfh_indir, size),
+ sizeof(user_size)))
+ return -EFAULT;
+
+ if (user_size != 0 && user_size != rxfh_dev.indir_size)
+ return -EINVAL;
+
+ rxfh_dev.indir = kcalloc(rxfh_dev.indir_size,
+ sizeof(rxfh_dev.indir[0]), GFP_USER);
+ if (!rxfh_dev.indir)
+ return -ENOMEM;
+
+ num_rx_rings = ethtool_get_rx_ring_count(dev);
+ if (num_rx_rings < 0) {
+ ret = num_rx_rings;
+ goto out;
+ }
+
+ if (user_size == 0) {
+ u32 *indir = rxfh_dev.indir;
+
+ for (i = 0; i < rxfh_dev.indir_size; i++)
+ indir[i] = ethtool_rxfh_indir_default(i, num_rx_rings);
+ } else {
+ ret = ethtool_copy_validate_indir(rxfh_dev.indir,
+ useraddr + ringidx_offset,
+ num_rx_rings,
+ rxfh_dev.indir_size);
+ if (ret)
+ goto out;
+ }
+
+ rxfh_dev.hfunc = ETH_RSS_HASH_NO_CHANGE;
+
+ mutex_lock(&dev->ethtool->rss_lock);
+ ret = ops->set_rxfh(dev, &rxfh_dev, extack);
+ if (ret)
+ goto out_unlock;
+
+ /* indicate whether rxfh was set to default */
+ if (user_size == 0)
+ dev->priv_flags &= ~IFF_RXFH_CONFIGURED;
+ else
+ dev->priv_flags |= IFF_RXFH_CONFIGURED;
+
+out_unlock:
+ mutex_unlock(&dev->ethtool->rss_lock);
+out:
+ kfree(rxfh_dev.indir);
+ return ret;
+}
+
+static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev,
+ void __user *useraddr)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct ethtool_rxfh_param rxfh_dev = {};
+ u32 user_indir_size, user_key_size;
+ struct ethtool_rxfh_context *ctx;
+ struct ethtool_rxfh rxfh;
+ u32 indir_bytes;
+ u8 *rss_config;
+ u32 total_size;
+ int ret;
+
+ if (!ops->get_rxfh)
+ return -EOPNOTSUPP;
+
+ if (ops->get_rxfh_indir_size)
+ rxfh_dev.indir_size = ops->get_rxfh_indir_size(dev);
+ if (ops->get_rxfh_key_size)
+ rxfh_dev.key_size = ops->get_rxfh_key_size(dev);
+
+ if (copy_from_user(&rxfh, useraddr, sizeof(rxfh)))
+ return -EFAULT;
+ user_indir_size = rxfh.indir_size;
+ user_key_size = rxfh.key_size;
+
+ /* Check that reserved fields are 0 for now */
+ if (rxfh.rsvd8[0] || rxfh.rsvd8[1] || rxfh.rsvd32)
+ return -EINVAL;
+ /* Most drivers don't handle rss_context, check it's 0 as well */
+ if (rxfh.rss_context && !ops->create_rxfh_context)
+ return -EOPNOTSUPP;
+
+ rxfh.indir_size = rxfh_dev.indir_size;
+ rxfh.key_size = rxfh_dev.key_size;
+ if (copy_to_user(useraddr, &rxfh, sizeof(rxfh)))
+ return -EFAULT;
+
+ if ((user_indir_size && user_indir_size != rxfh_dev.indir_size) ||
+ (user_key_size && user_key_size != rxfh_dev.key_size))
+ return -EINVAL;
+
+ indir_bytes = user_indir_size * sizeof(rxfh_dev.indir[0]);
+ total_size = indir_bytes + user_key_size;
+ rss_config = kzalloc(total_size, GFP_USER);
+ if (!rss_config)
+ return -ENOMEM;
+
+ if (user_indir_size)
+ rxfh_dev.indir = (u32 *)rss_config;
+
+ if (user_key_size)
+ rxfh_dev.key = rss_config + indir_bytes;
+
+ mutex_lock(&dev->ethtool->rss_lock);
+ if (rxfh.rss_context) {
+ ctx = xa_load(&dev->ethtool->rss_ctx, rxfh.rss_context);
+ if (!ctx) {
+ ret = -ENOENT;
+ goto out;
+ }
+ if (rxfh_dev.indir)
+ memcpy(rxfh_dev.indir, ethtool_rxfh_context_indir(ctx),
+ indir_bytes);
+ if (!ops->rxfh_per_ctx_key) {
+ rxfh_dev.key_size = 0;
+ } else {
+ if (rxfh_dev.key)
+ memcpy(rxfh_dev.key,
+ ethtool_rxfh_context_key(ctx),
+ user_key_size);
+ rxfh_dev.hfunc = ctx->hfunc;
+ }
+ rxfh_dev.input_xfrm = ctx->input_xfrm;
+ ret = 0;
+ } else {
+ ret = dev->ethtool_ops->get_rxfh(dev, &rxfh_dev);
+ if (ret)
+ goto out;
+ }
+
+ if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh, hfunc),
+ &rxfh_dev.hfunc, sizeof(rxfh.hfunc))) {
+ ret = -EFAULT;
+ } else if (copy_to_user(useraddr +
+ offsetof(struct ethtool_rxfh, input_xfrm),
+ &rxfh_dev.input_xfrm,
+ sizeof(rxfh.input_xfrm))) {
+ ret = -EFAULT;
+ } else if (copy_to_user(useraddr +
+ offsetof(struct ethtool_rxfh, key_size),
+ &rxfh_dev.key_size,
+ sizeof(rxfh.key_size))) {
+ ret = -EFAULT;
+ } else if (copy_to_user(useraddr +
+ offsetof(struct ethtool_rxfh, rss_config[0]),
+ rss_config, total_size)) {
+ ret = -EFAULT;
+ }
+out:
+ mutex_unlock(&dev->ethtool->rss_lock);
+ kfree(rss_config);
+
+ return ret;
+}
+
+static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev,
+ void __user *useraddr)
+{
+ u32 rss_cfg_offset = offsetof(struct ethtool_rxfh, rss_config[0]);
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ u32 dev_indir_size = 0, dev_key_size = 0, i;
+ u32 user_indir_len = 0, indir_bytes = 0;
+ struct ethtool_rxfh_param rxfh_dev = {};
+ struct ethtool_rxfh_context *ctx = NULL;
+ struct netlink_ext_ack *extack = NULL;
+ struct ethtool_rxfh rxfh;
+ bool create = false;
+ int num_rx_rings;
+ u8 *rss_config;
+ int ntf = 0;
+ int ret;
+
+ if (!ops->set_rxfh)
+ return -EOPNOTSUPP;
+
+ if (ops->get_rxfh_indir_size)
+ dev_indir_size = ops->get_rxfh_indir_size(dev);
+ if (ops->get_rxfh_key_size)
+ dev_key_size = ops->get_rxfh_key_size(dev);
+
+ if (copy_from_user(&rxfh, useraddr, sizeof(rxfh)))
+ return -EFAULT;
+
+ /* Check that reserved fields are 0 for now */
+ if (rxfh.rsvd8[0] || rxfh.rsvd8[1] || rxfh.rsvd32)
+ return -EINVAL;
+ /* Most drivers don't handle rss_context, check it's 0 as well */
+ if (rxfh.rss_context && !ops->create_rxfh_context)
+ return -EOPNOTSUPP;
+ /* Check input data transformation capabilities */
+ if (rxfh.input_xfrm && rxfh.input_xfrm != RXH_XFRM_SYM_XOR &&
+ rxfh.input_xfrm != RXH_XFRM_SYM_OR_XOR &&
+ rxfh.input_xfrm != RXH_XFRM_NO_CHANGE)
+ return -EINVAL;
+ if (rxfh.input_xfrm != RXH_XFRM_NO_CHANGE &&
+ rxfh.input_xfrm & ~ops->supported_input_xfrm)
+ return -EOPNOTSUPP;
+ create = rxfh.rss_context == ETH_RXFH_CONTEXT_ALLOC;
+
+ if ((rxfh.indir_size &&
+ rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE &&
+ rxfh.indir_size != dev_indir_size) ||
+ (rxfh.key_size && rxfh.key_size != dev_key_size))
+ return -EINVAL;
+
+ /* Must request at least one change: indir size, hash key, function
+ * or input transformation.
+ * There's no need for any of it in case of context creation.
+ */
+ if (!create &&
+ (rxfh.indir_size == ETH_RXFH_INDIR_NO_CHANGE &&
+ rxfh.key_size == 0 && rxfh.hfunc == ETH_RSS_HASH_NO_CHANGE &&
+ rxfh.input_xfrm == RXH_XFRM_NO_CHANGE))
+ return -EINVAL;
+
+ indir_bytes = dev_indir_size * sizeof(rxfh_dev.indir[0]);
+
+ /* Check settings which may be global rather than per RSS-context */
+ if (rxfh.rss_context && !ops->rxfh_per_ctx_key)
+ if (rxfh.key_size ||
+ (rxfh.hfunc && rxfh.hfunc != ETH_RSS_HASH_NO_CHANGE) ||
+ (rxfh.input_xfrm && rxfh.input_xfrm != RXH_XFRM_NO_CHANGE))
+ return -EOPNOTSUPP;
+
+ rss_config = kzalloc(indir_bytes + dev_key_size, GFP_USER);
+ if (!rss_config)
+ return -ENOMEM;
+
+ num_rx_rings = ethtool_get_rx_ring_count(dev);
+ if (num_rx_rings < 0) {
+ ret = num_rx_rings;
+ goto out_free;
+ }
+
+ /* rxfh.indir_size == 0 means reset the indir table to default (master
+ * context) or delete the context (other RSS contexts).
+ * rxfh.indir_size == ETH_RXFH_INDIR_NO_CHANGE means leave it unchanged.
+ */
+ if (rxfh.indir_size &&
+ rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE) {
+ user_indir_len = indir_bytes;
+ rxfh_dev.indir = (u32 *)rss_config;
+ rxfh_dev.indir_size = dev_indir_size;
+ ret = ethtool_copy_validate_indir(rxfh_dev.indir,
+ useraddr + rss_cfg_offset,
+ num_rx_rings,
+ rxfh.indir_size);
+ if (ret)
+ goto out_free;
+ } else if (rxfh.indir_size == 0) {
+ if (rxfh.rss_context == 0) {
+ u32 *indir;
+
+ rxfh_dev.indir = (u32 *)rss_config;
+ rxfh_dev.indir_size = dev_indir_size;
+ indir = rxfh_dev.indir;
+ for (i = 0; i < dev_indir_size; i++)
+ indir[i] =
+ ethtool_rxfh_indir_default(i, num_rx_rings);
+ } else {
+ rxfh_dev.rss_delete = true;
+ }
+ }
+
+ if (rxfh.key_size) {
+ rxfh_dev.key_size = dev_key_size;
+ rxfh_dev.key = rss_config + indir_bytes;
+ if (copy_from_user(rxfh_dev.key,
+ useraddr + rss_cfg_offset + user_indir_len,
+ rxfh.key_size)) {
+ ret = -EFAULT;
+ goto out_free;
+ }
+ }
+
+ mutex_lock(&dev->ethtool->rss_lock);
+
+ ret = ethtool_check_flow_types(dev, rxfh.input_xfrm);
+ if (ret)
+ goto out_unlock;
+
+ if (rxfh.rss_context && rxfh_dev.rss_delete) {
+ ret = ethtool_check_rss_ctx_busy(dev, rxfh.rss_context);
+ if (ret)
+ goto out_unlock;
+ }
+
+ if (create) {
+ u32 limit, ctx_id;
+
+ if (rxfh_dev.rss_delete) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+ ctx = ethtool_rxfh_ctx_alloc(ops, dev_indir_size, dev_key_size);
+ if (!ctx) {
+ ret = -ENOMEM;
+ goto out_unlock;
+ }
+
+ limit = ops->rxfh_max_num_contexts ?: U32_MAX;
+ ret = xa_alloc(&dev->ethtool->rss_ctx, &ctx_id, ctx,
+ XA_LIMIT(1, limit - 1), GFP_KERNEL_ACCOUNT);
+ if (ret < 0) {
+ kfree(ctx);
+ goto out_unlock;
+ }
+ WARN_ON(!ctx_id); /* can't happen */
+ rxfh.rss_context = ctx_id;
+ } else if (rxfh.rss_context) {
+ ctx = xa_load(&dev->ethtool->rss_ctx, rxfh.rss_context);
+ if (!ctx) {
+ ret = -ENOENT;
+ goto out_unlock;
+ }
+ }
+ rxfh_dev.hfunc = rxfh.hfunc;
+ rxfh_dev.rss_context = rxfh.rss_context;
+ rxfh_dev.input_xfrm = rxfh.input_xfrm;
+
+ if (!rxfh.rss_context) {
+ ntf = ETHTOOL_MSG_RSS_NTF;
+ ret = ops->set_rxfh(dev, &rxfh_dev, extack);
+ } else if (create) {
+ ntf = ETHTOOL_MSG_RSS_CREATE_NTF;
+ ret = ops->create_rxfh_context(dev, ctx, &rxfh_dev, extack);
+ /* Make sure driver populates defaults */
+ WARN_ON_ONCE(!ret && !rxfh_dev.key && ops->rxfh_per_ctx_key &&
+ !memchr_inv(ethtool_rxfh_context_key(ctx), 0,
+ ctx->key_size));
+ } else if (rxfh_dev.rss_delete) {
+ ntf = ETHTOOL_MSG_RSS_DELETE_NTF;
+ ret = ops->remove_rxfh_context(dev, ctx, rxfh.rss_context,
+ extack);
+ } else {
+ ntf = ETHTOOL_MSG_RSS_NTF;
+ ret = ops->modify_rxfh_context(dev, ctx, &rxfh_dev, extack);
+ }
+ if (ret) {
+ ntf = 0;
+ if (create) {
+ /* failed to create, free our new tracking entry */
+ xa_erase(&dev->ethtool->rss_ctx, rxfh.rss_context);
+ kfree(ctx);
+ }
+ goto out_unlock;
+ }
+
+ if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh, rss_context),
+ &rxfh_dev.rss_context, sizeof(rxfh_dev.rss_context)))
+ ret = -EFAULT;
+
+ if (!rxfh_dev.rss_context) {
+ /* indicate whether rxfh was set to default */
+ if (rxfh.indir_size == 0)
+ dev->priv_flags &= ~IFF_RXFH_CONFIGURED;
+ else if (rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE)
+ dev->priv_flags |= IFF_RXFH_CONFIGURED;
+ }
+ /* Update rss_ctx tracking */
+ if (rxfh_dev.rss_delete) {
+ WARN_ON(xa_erase(&dev->ethtool->rss_ctx, rxfh.rss_context) != ctx);
+ kfree(ctx);
+ } else if (ctx) {
+ if (rxfh_dev.indir) {
+ for (i = 0; i < dev_indir_size; i++)
+ ethtool_rxfh_context_indir(ctx)[i] = rxfh_dev.indir[i];
+ ctx->indir_configured =
+ rxfh.indir_size &&
+ rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE;
+ }
+ if (rxfh_dev.key) {
+ memcpy(ethtool_rxfh_context_key(ctx), rxfh_dev.key,
+ dev_key_size);
+ ctx->key_configured = !!rxfh.key_size;
+ }
+ if (rxfh_dev.hfunc != ETH_RSS_HASH_NO_CHANGE)
+ ctx->hfunc = rxfh_dev.hfunc;
+ if (rxfh_dev.input_xfrm != RXH_XFRM_NO_CHANGE)
+ ctx->input_xfrm = rxfh_dev.input_xfrm;
+ }
+
+out_unlock:
+ mutex_unlock(&dev->ethtool->rss_lock);
+out_free:
+ kfree(rss_config);
+ if (ntf)
+ ethtool_rss_notify(dev, ntf, rxfh.rss_context);
+ return ret;
+}
+
+static int ethtool_get_regs(struct net_device *dev, char __user *useraddr)
+{
+ struct ethtool_regs regs;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ void *regbuf;
+ int reglen, ret;
+
+ if (!ops->get_regs || !ops->get_regs_len)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&regs, useraddr, sizeof(regs)))
+ return -EFAULT;
+
+ reglen = ops->get_regs_len(dev);
+ if (reglen <= 0)
+ return reglen;
+
+ if (regs.len > reglen)
+ regs.len = reglen;
+
+ regbuf = vzalloc(reglen);
+ if (!regbuf)
+ return -ENOMEM;
+
+ if (regs.len < reglen)
+ reglen = regs.len;
+
+ ops->get_regs(dev, &regs, regbuf);
+
+ ret = -EFAULT;
+ if (copy_to_user(useraddr, &regs, sizeof(regs)))
+ goto out;
+ useraddr += offsetof(struct ethtool_regs, data);
+ if (copy_to_user(useraddr, regbuf, reglen))
+ goto out;
+ ret = 0;
+
+ out:
+ vfree(regbuf);
+ return ret;
+}
+
+static int ethtool_reset(struct net_device *dev, char __user *useraddr)
+{
+ struct ethtool_value reset;
+ int ret;
+
+ if (!dev->ethtool_ops->reset)
+ return -EOPNOTSUPP;
+
+ if (dev->ethtool->module_fw_flash_in_progress)
+ return -EBUSY;
+
+ if (copy_from_user(&reset, useraddr, sizeof(reset)))
+ return -EFAULT;
+
+ ret = dev->ethtool_ops->reset(dev, &reset.data);
+ if (ret)
+ return ret;
+
+ if (copy_to_user(useraddr, &reset, sizeof(reset)))
+ return -EFAULT;
+ return 0;
+}
+
+static int ethtool_get_wol(struct net_device *dev, char __user *useraddr)
+{
+ struct ethtool_wolinfo wol;
+
+ if (!dev->ethtool_ops->get_wol)
+ return -EOPNOTSUPP;
+
+ memset(&wol, 0, sizeof(struct ethtool_wolinfo));
+ wol.cmd = ETHTOOL_GWOL;
+ dev->ethtool_ops->get_wol(dev, &wol);
+
+ if (copy_to_user(useraddr, &wol, sizeof(wol)))
+ return -EFAULT;
+ return 0;
+}
+
+static int ethtool_set_wol(struct net_device *dev, char __user *useraddr)
+{
+ struct ethtool_wolinfo wol, cur_wol;
+ int ret;
+
+ if (!dev->ethtool_ops->get_wol || !dev->ethtool_ops->set_wol)
+ return -EOPNOTSUPP;
+
+ memset(&cur_wol, 0, sizeof(struct ethtool_wolinfo));
+ cur_wol.cmd = ETHTOOL_GWOL;
+ dev->ethtool_ops->get_wol(dev, &cur_wol);
+
+ if (copy_from_user(&wol, useraddr, sizeof(wol)))
+ return -EFAULT;
+
+ if (wol.wolopts & ~cur_wol.supported)
+ return -EINVAL;
+
+ if (wol.wolopts == cur_wol.wolopts &&
+ !memcmp(wol.sopass, cur_wol.sopass, sizeof(wol.sopass)))
+ return 0;
+
+ ret = dev->ethtool_ops->set_wol(dev, &wol);
+ if (ret)
+ return ret;
+
+ dev->ethtool->wol_enabled = !!wol.wolopts;
+ ethtool_notify(dev, ETHTOOL_MSG_WOL_NTF);
+
+ return 0;
+}
+
+static void eee_to_keee(struct ethtool_keee *keee,
+ const struct ethtool_eee *eee)
+{
+ memset(keee, 0, sizeof(*keee));
+
+ keee->eee_enabled = eee->eee_enabled;
+ keee->tx_lpi_enabled = eee->tx_lpi_enabled;
+ keee->tx_lpi_timer = eee->tx_lpi_timer;
+
+ ethtool_convert_legacy_u32_to_link_mode(keee->advertised,
+ eee->advertised);
+}
+
+static void keee_to_eee(struct ethtool_eee *eee,
+ const struct ethtool_keee *keee)
+{
+ bool overflow;
+
+ memset(eee, 0, sizeof(*eee));
+
+ eee->eee_active = keee->eee_active;
+ eee->eee_enabled = keee->eee_enabled;
+ eee->tx_lpi_enabled = keee->tx_lpi_enabled;
+ eee->tx_lpi_timer = keee->tx_lpi_timer;
+
+ overflow = !ethtool_convert_link_mode_to_legacy_u32(&eee->supported,
+ keee->supported);
+ ethtool_convert_link_mode_to_legacy_u32(&eee->advertised,
+ keee->advertised);
+ ethtool_convert_link_mode_to_legacy_u32(&eee->lp_advertised,
+ keee->lp_advertised);
+ if (overflow)
+ pr_warn("Ethtool ioctl interface doesn't support passing EEE linkmodes beyond bit 32\n");
+}
+
+static int ethtool_get_eee(struct net_device *dev, char __user *useraddr)
+{
+ struct ethtool_keee keee;
+ struct ethtool_eee eee;
+ int rc;
+
+ if (!dev->ethtool_ops->get_eee)
+ return -EOPNOTSUPP;
+
+ memset(&keee, 0, sizeof(keee));
+ rc = dev->ethtool_ops->get_eee(dev, &keee);
+ if (rc)
+ return rc;
+
+ keee_to_eee(&eee, &keee);
+ if (copy_to_user(useraddr, &eee, sizeof(eee)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int ethtool_set_eee(struct net_device *dev, char __user *useraddr)
+{
+ struct ethtool_keee keee;
+ struct ethtool_eee eee;
+ int ret;
+
+ if (!dev->ethtool_ops->set_eee)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&eee, useraddr, sizeof(eee)))
+ return -EFAULT;
+
+ eee_to_keee(&keee, &eee);
+ ret = dev->ethtool_ops->set_eee(dev, &keee);
+ if (!ret)
+ ethtool_notify(dev, ETHTOOL_MSG_EEE_NTF);
+ return ret;
+}
+
+static int ethtool_nway_reset(struct net_device *dev)
+{
+ if (!dev->ethtool_ops->nway_reset)
+ return -EOPNOTSUPP;
+
+ return dev->ethtool_ops->nway_reset(dev);
+}
+
+static int ethtool_get_link(struct net_device *dev, char __user *useraddr)
+{
+ struct ethtool_value edata = { .cmd = ETHTOOL_GLINK };
+ int link = __ethtool_get_link(dev);
+
+ if (link < 0)
+ return link;
+
+ edata.data = link;
+ if (copy_to_user(useraddr, &edata, sizeof(edata)))
+ return -EFAULT;
+ return 0;
+}
+
+static int ethtool_get_any_eeprom(struct net_device *dev, void __user *useraddr,
+ int (*getter)(struct net_device *,
+ struct ethtool_eeprom *, u8 *),
+ u32 total_len)
+{
+ struct ethtool_eeprom eeprom;
+ void __user *userbuf = useraddr + sizeof(eeprom);
+ u32 bytes_remaining;
+ u8 *data;
+ int ret = 0;
+
+ if (copy_from_user(&eeprom, useraddr, sizeof(eeprom)))
+ return -EFAULT;
+
+ /* Check for wrap and zero */
+ if (eeprom.offset + eeprom.len <= eeprom.offset)
+ return -EINVAL;
+
+ /* Check for exceeding total eeprom len */
+ if (eeprom.offset + eeprom.len > total_len)
+ return -EINVAL;
+
+ data = kzalloc(PAGE_SIZE, GFP_USER);
+ if (!data)
+ return -ENOMEM;
+
+ bytes_remaining = eeprom.len;
+ while (bytes_remaining > 0) {
+ eeprom.len = min(bytes_remaining, (u32)PAGE_SIZE);
+
+ ret = getter(dev, &eeprom, data);
+ if (ret)
+ break;
+ if (!eeprom.len) {
+ ret = -EIO;
+ break;
+ }
+ if (copy_to_user(userbuf, data, eeprom.len)) {
+ ret = -EFAULT;
+ break;
+ }
+ userbuf += eeprom.len;
+ eeprom.offset += eeprom.len;
+ bytes_remaining -= eeprom.len;
+ }
+
+ eeprom.len = userbuf - (useraddr + sizeof(eeprom));
+ eeprom.offset -= eeprom.len;
+ if (copy_to_user(useraddr, &eeprom, sizeof(eeprom)))
+ ret = -EFAULT;
+
+ kfree(data);
+ return ret;
+}
+
+static int ethtool_get_eeprom(struct net_device *dev, void __user *useraddr)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+
+ if (!ops->get_eeprom || !ops->get_eeprom_len ||
+ !ops->get_eeprom_len(dev))
+ return -EOPNOTSUPP;
+
+ return ethtool_get_any_eeprom(dev, useraddr, ops->get_eeprom,
+ ops->get_eeprom_len(dev));
+}
+
+static int ethtool_set_eeprom(struct net_device *dev, void __user *useraddr)
+{
+ struct ethtool_eeprom eeprom;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ void __user *userbuf = useraddr + sizeof(eeprom);
+ u32 bytes_remaining;
+ u8 *data;
+ int ret = 0;
+
+ if (!ops->set_eeprom || !ops->get_eeprom_len ||
+ !ops->get_eeprom_len(dev))
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&eeprom, useraddr, sizeof(eeprom)))
+ return -EFAULT;
+
+ /* Check for wrap and zero */
+ if (eeprom.offset + eeprom.len <= eeprom.offset)
+ return -EINVAL;
+
+ /* Check for exceeding total eeprom len */
+ if (eeprom.offset + eeprom.len > ops->get_eeprom_len(dev))
+ return -EINVAL;
+
+ data = kzalloc(PAGE_SIZE, GFP_USER);
+ if (!data)
+ return -ENOMEM;
+
+ bytes_remaining = eeprom.len;
+ while (bytes_remaining > 0) {
+ eeprom.len = min(bytes_remaining, (u32)PAGE_SIZE);
+
+ if (copy_from_user(data, userbuf, eeprom.len)) {
+ ret = -EFAULT;
+ break;
+ }
+ ret = ops->set_eeprom(dev, &eeprom, data);
+ if (ret)
+ break;
+ userbuf += eeprom.len;
+ eeprom.offset += eeprom.len;
+ bytes_remaining -= eeprom.len;
+ }
+
+ kfree(data);
+ return ret;
+}
+
+static noinline_for_stack int ethtool_get_coalesce(struct net_device *dev,
+ void __user *useraddr)
+{
+ struct ethtool_coalesce coalesce = { .cmd = ETHTOOL_GCOALESCE };
+ struct kernel_ethtool_coalesce kernel_coalesce = {};
+ int ret;
+
+ if (!dev->ethtool_ops->get_coalesce)
+ return -EOPNOTSUPP;
+
+ ret = dev->ethtool_ops->get_coalesce(dev, &coalesce, &kernel_coalesce,
+ NULL);
+ if (ret)
+ return ret;
+
+ if (copy_to_user(useraddr, &coalesce, sizeof(coalesce)))
+ return -EFAULT;
+ return 0;
+}
+
+static bool
+ethtool_set_coalesce_supported(struct net_device *dev,
+ struct ethtool_coalesce *coalesce)
+{
+ u32 supported_params = dev->ethtool_ops->supported_coalesce_params;
+ u32 nonzero_params = 0;
+
+ if (coalesce->rx_coalesce_usecs)
+ nonzero_params |= ETHTOOL_COALESCE_RX_USECS;
+ if (coalesce->rx_max_coalesced_frames)
+ nonzero_params |= ETHTOOL_COALESCE_RX_MAX_FRAMES;
+ if (coalesce->rx_coalesce_usecs_irq)
+ nonzero_params |= ETHTOOL_COALESCE_RX_USECS_IRQ;
+ if (coalesce->rx_max_coalesced_frames_irq)
+ nonzero_params |= ETHTOOL_COALESCE_RX_MAX_FRAMES_IRQ;
+ if (coalesce->tx_coalesce_usecs)
+ nonzero_params |= ETHTOOL_COALESCE_TX_USECS;
+ if (coalesce->tx_max_coalesced_frames)
+ nonzero_params |= ETHTOOL_COALESCE_TX_MAX_FRAMES;
+ if (coalesce->tx_coalesce_usecs_irq)
+ nonzero_params |= ETHTOOL_COALESCE_TX_USECS_IRQ;
+ if (coalesce->tx_max_coalesced_frames_irq)
+ nonzero_params |= ETHTOOL_COALESCE_TX_MAX_FRAMES_IRQ;
+ if (coalesce->stats_block_coalesce_usecs)
+ nonzero_params |= ETHTOOL_COALESCE_STATS_BLOCK_USECS;
+ if (coalesce->use_adaptive_rx_coalesce)
+ nonzero_params |= ETHTOOL_COALESCE_USE_ADAPTIVE_RX;
+ if (coalesce->use_adaptive_tx_coalesce)
+ nonzero_params |= ETHTOOL_COALESCE_USE_ADAPTIVE_TX;
+ if (coalesce->pkt_rate_low)
+ nonzero_params |= ETHTOOL_COALESCE_PKT_RATE_LOW;
+ if (coalesce->rx_coalesce_usecs_low)
+ nonzero_params |= ETHTOOL_COALESCE_RX_USECS_LOW;
+ if (coalesce->rx_max_coalesced_frames_low)
+ nonzero_params |= ETHTOOL_COALESCE_RX_MAX_FRAMES_LOW;
+ if (coalesce->tx_coalesce_usecs_low)
+ nonzero_params |= ETHTOOL_COALESCE_TX_USECS_LOW;
+ if (coalesce->tx_max_coalesced_frames_low)
+ nonzero_params |= ETHTOOL_COALESCE_TX_MAX_FRAMES_LOW;
+ if (coalesce->pkt_rate_high)
+ nonzero_params |= ETHTOOL_COALESCE_PKT_RATE_HIGH;
+ if (coalesce->rx_coalesce_usecs_high)
+ nonzero_params |= ETHTOOL_COALESCE_RX_USECS_HIGH;
+ if (coalesce->rx_max_coalesced_frames_high)
+ nonzero_params |= ETHTOOL_COALESCE_RX_MAX_FRAMES_HIGH;
+ if (coalesce->tx_coalesce_usecs_high)
+ nonzero_params |= ETHTOOL_COALESCE_TX_USECS_HIGH;
+ if (coalesce->tx_max_coalesced_frames_high)
+ nonzero_params |= ETHTOOL_COALESCE_TX_MAX_FRAMES_HIGH;
+ if (coalesce->rate_sample_interval)
+ nonzero_params |= ETHTOOL_COALESCE_RATE_SAMPLE_INTERVAL;
+
+ return (supported_params & nonzero_params) == nonzero_params;
+}
+
+static noinline_for_stack int ethtool_set_coalesce(struct net_device *dev,
+ void __user *useraddr)
+{
+ struct kernel_ethtool_coalesce kernel_coalesce = {};
+ struct ethtool_coalesce coalesce;
+ int ret;
+
+ if (!dev->ethtool_ops->set_coalesce || !dev->ethtool_ops->get_coalesce)
+ return -EOPNOTSUPP;
+
+ ret = dev->ethtool_ops->get_coalesce(dev, &coalesce, &kernel_coalesce,
+ NULL);
+ if (ret)
+ return ret;
+
+ if (copy_from_user(&coalesce, useraddr, sizeof(coalesce)))
+ return -EFAULT;
+
+ if (!ethtool_set_coalesce_supported(dev, &coalesce))
+ return -EOPNOTSUPP;
+
+ ret = dev->ethtool_ops->set_coalesce(dev, &coalesce, &kernel_coalesce,
+ NULL);
+ if (!ret)
+ ethtool_notify(dev, ETHTOOL_MSG_COALESCE_NTF);
+ return ret;
+}
+
+static int ethtool_get_ringparam(struct net_device *dev, void __user *useraddr)
+{
+ struct ethtool_ringparam ringparam = { .cmd = ETHTOOL_GRINGPARAM };
+ struct kernel_ethtool_ringparam kernel_ringparam = {};
+
+ if (!dev->ethtool_ops->get_ringparam)
+ return -EOPNOTSUPP;
+
+ dev->ethtool_ops->get_ringparam(dev, &ringparam,
+ &kernel_ringparam, NULL);
+
+ if (copy_to_user(useraddr, &ringparam, sizeof(ringparam)))
+ return -EFAULT;
+ return 0;
+}
+
+static int ethtool_set_ringparam(struct net_device *dev, void __user *useraddr)
+{
+ struct kernel_ethtool_ringparam kernel_ringparam;
+ struct ethtool_ringparam ringparam, max;
+ int ret;
+
+ if (!dev->ethtool_ops->set_ringparam || !dev->ethtool_ops->get_ringparam)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&ringparam, useraddr, sizeof(ringparam)))
+ return -EFAULT;
+
+ ethtool_ringparam_get_cfg(dev, &max, &kernel_ringparam, NULL);
+
+ /* ensure new ring parameters are within the maximums */
+ if (ringparam.rx_pending > max.rx_max_pending ||
+ ringparam.rx_mini_pending > max.rx_mini_max_pending ||
+ ringparam.rx_jumbo_pending > max.rx_jumbo_max_pending ||
+ ringparam.tx_pending > max.tx_max_pending)
+ return -EINVAL;
+
+ ret = dev->ethtool_ops->set_ringparam(dev, &ringparam,
+ &kernel_ringparam, NULL);
+ if (!ret)
+ ethtool_notify(dev, ETHTOOL_MSG_RINGS_NTF);
+ return ret;
+}
+
+static noinline_for_stack int ethtool_get_channels(struct net_device *dev,
+ void __user *useraddr)
+{
+ struct ethtool_channels channels = { .cmd = ETHTOOL_GCHANNELS };
+
+ if (!dev->ethtool_ops->get_channels)
+ return -EOPNOTSUPP;
+
+ dev->ethtool_ops->get_channels(dev, &channels);
+
+ if (copy_to_user(useraddr, &channels, sizeof(channels)))
+ return -EFAULT;
+ return 0;
+}
+
+static noinline_for_stack int ethtool_set_channels(struct net_device *dev,
+ void __user *useraddr)
+{
+ struct ethtool_channels channels, curr = { .cmd = ETHTOOL_GCHANNELS };
+ u16 from_channel, to_channel;
+ unsigned int i;
+ int ret;
+
+ if (!dev->ethtool_ops->set_channels || !dev->ethtool_ops->get_channels)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&channels, useraddr, sizeof(channels)))
+ return -EFAULT;
+
+ dev->ethtool_ops->get_channels(dev, &curr);
+
+ if (channels.rx_count == curr.rx_count &&
+ channels.tx_count == curr.tx_count &&
+ channels.combined_count == curr.combined_count &&
+ channels.other_count == curr.other_count)
+ return 0;
+
+ /* ensure new counts are within the maximums */
+ if (channels.rx_count > curr.max_rx ||
+ channels.tx_count > curr.max_tx ||
+ channels.combined_count > curr.max_combined ||
+ channels.other_count > curr.max_other)
+ return -EINVAL;
+
+ /* ensure there is at least one RX and one TX channel */
+ if (!channels.combined_count &&
+ (!channels.rx_count || !channels.tx_count))
+ return -EINVAL;
+
+ ret = ethtool_check_max_channel(dev, channels, NULL);
+ if (ret)
+ return ret;
+
+ /* Disabling channels, query zero-copy AF_XDP sockets */
+ from_channel = channels.combined_count +
+ min(channels.rx_count, channels.tx_count);
+ to_channel = curr.combined_count + max(curr.rx_count, curr.tx_count);
+ for (i = from_channel; i < to_channel; i++)
+ if (xsk_get_pool_from_qid(dev, i))
+ return -EINVAL;
+
+ ret = dev->ethtool_ops->set_channels(dev, &channels);
+ if (!ret)
+ ethtool_notify(dev, ETHTOOL_MSG_CHANNELS_NTF);
+ return ret;
+}
+
+static int ethtool_get_pauseparam(struct net_device *dev, void __user *useraddr)
+{
+ struct ethtool_pauseparam pauseparam = { .cmd = ETHTOOL_GPAUSEPARAM };
+
+ if (!dev->ethtool_ops->get_pauseparam)
+ return -EOPNOTSUPP;
+
+ dev->ethtool_ops->get_pauseparam(dev, &pauseparam);
+
+ if (copy_to_user(useraddr, &pauseparam, sizeof(pauseparam)))
+ return -EFAULT;
+ return 0;
+}
+
+static int ethtool_set_pauseparam(struct net_device *dev, void __user *useraddr)
+{
+ struct ethtool_pauseparam pauseparam;
+ int ret;
+
+ if (!dev->ethtool_ops->set_pauseparam)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&pauseparam, useraddr, sizeof(pauseparam)))
+ return -EFAULT;
+
+ ret = dev->ethtool_ops->set_pauseparam(dev, &pauseparam);
+ if (!ret)
+ ethtool_notify(dev, ETHTOOL_MSG_PAUSE_NTF);
+ return ret;
+}
+
+static int ethtool_self_test(struct net_device *dev, char __user *useraddr)
+{
+ struct ethtool_test test;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ u64 *data;
+ int ret, test_len;
+
+ if (!ops->self_test || !ops->get_sset_count)
+ return -EOPNOTSUPP;
+
+ test_len = ops->get_sset_count(dev, ETH_SS_TEST);
+ if (test_len < 0)
+ return test_len;
+ WARN_ON(test_len == 0);
+
+ if (copy_from_user(&test, useraddr, sizeof(test)))
+ return -EFAULT;
+
+ test.len = test_len;
+ data = kcalloc(test_len, sizeof(u64), GFP_USER);
+ if (!data)
+ return -ENOMEM;
+
+ netif_testing_on(dev);
+ ops->self_test(dev, &test, data);
+ netif_testing_off(dev);
+
+ ret = -EFAULT;
+ if (copy_to_user(useraddr, &test, sizeof(test)))
+ goto out;
+ useraddr += sizeof(test);
+ if (copy_to_user(useraddr, data, array_size(test.len, sizeof(u64))))
+ goto out;
+ ret = 0;
+
+ out:
+ kfree(data);
+ return ret;
+}
+
+static int ethtool_get_strings(struct net_device *dev, void __user *useraddr)
+{
+ struct ethtool_gstrings gstrings;
+ u8 *data;
+ int ret;
+
+ if (copy_from_user(&gstrings, useraddr, sizeof(gstrings)))
+ return -EFAULT;
+
+ ret = __ethtool_get_sset_count(dev, gstrings.string_set);
+ if (ret < 0)
+ return ret;
+ if (ret > S32_MAX / ETH_GSTRING_LEN)
+ return -ENOMEM;
+ WARN_ON_ONCE(!ret);
+
+ gstrings.len = ret;
+
+ if (gstrings.len) {
+ data = vzalloc(array_size(gstrings.len, ETH_GSTRING_LEN));
+ if (!data)
+ return -ENOMEM;
+
+ __ethtool_get_strings(dev, gstrings.string_set, data);
+ } else {
+ data = NULL;
+ }
+
+ ret = -EFAULT;
+ if (copy_to_user(useraddr, &gstrings, sizeof(gstrings)))
+ goto out;
+ useraddr += sizeof(gstrings);
+ if (gstrings.len &&
+ copy_to_user(useraddr, data,
+ array_size(gstrings.len, ETH_GSTRING_LEN)))
+ goto out;
+ ret = 0;
+
+out:
+ vfree(data);
+ return ret;
+}
+
+__printf(2, 3) void ethtool_sprintf(u8 **data, const char *fmt, ...)
+{
+ va_list args;
+
+ va_start(args, fmt);
+ vsnprintf(*data, ETH_GSTRING_LEN, fmt, args);
+ va_end(args);
+
+ *data += ETH_GSTRING_LEN;
+}
+EXPORT_SYMBOL(ethtool_sprintf);
+
+void ethtool_puts(u8 **data, const char *str)
+{
+ strscpy(*data, str, ETH_GSTRING_LEN);
+ *data += ETH_GSTRING_LEN;
+}
+EXPORT_SYMBOL(ethtool_puts);
+
+static int ethtool_phys_id(struct net_device *dev, void __user *useraddr)
+{
+ struct ethtool_value id;
+ static bool busy;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ netdevice_tracker dev_tracker;
+ int rc;
+
+ if (!ops->set_phys_id)
+ return -EOPNOTSUPP;
+
+ if (busy)
+ return -EBUSY;
+
+ if (copy_from_user(&id, useraddr, sizeof(id)))
+ return -EFAULT;
+
+ rc = ops->set_phys_id(dev, ETHTOOL_ID_ACTIVE);
+ if (rc < 0)
+ return rc;
+
+ /* Drop the RTNL lock while waiting, but prevent reentry or
+ * removal of the device.
+ */
+ busy = true;
+ netdev_hold(dev, &dev_tracker, GFP_KERNEL);
+ netdev_unlock_ops(dev);
+ rtnl_unlock();
+
+ if (rc == 0) {
+ /* Driver will handle this itself */
+ schedule_timeout_interruptible(
+ id.data ? (id.data * HZ) : MAX_SCHEDULE_TIMEOUT);
+ } else {
+ /* Driver expects to be called at twice the frequency in rc */
+ int n = rc * 2, interval = HZ / n;
+ u64 count = mul_u32_u32(n, id.data);
+ u64 i = 0;
+
+ do {
+ rtnl_lock();
+ netdev_lock_ops(dev);
+ rc = ops->set_phys_id(dev,
+ (i++ & 1) ? ETHTOOL_ID_OFF : ETHTOOL_ID_ON);
+ netdev_unlock_ops(dev);
+ rtnl_unlock();
+ if (rc)
+ break;
+ schedule_timeout_interruptible(interval);
+ } while (!signal_pending(current) && (!id.data || i < count));
+ }
+
+ rtnl_lock();
+ netdev_lock_ops(dev);
+ netdev_put(dev, &dev_tracker);
+ busy = false;
+
+ (void) ops->set_phys_id(dev, ETHTOOL_ID_INACTIVE);
+ return rc;
+}
+
+static int ethtool_get_stats(struct net_device *dev, void __user *useraddr)
+{
+ struct ethtool_stats stats;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ u64 *data;
+ int ret, n_stats;
+
+ if (!ops->get_ethtool_stats || !ops->get_sset_count)
+ return -EOPNOTSUPP;
+
+ n_stats = ops->get_sset_count(dev, ETH_SS_STATS);
+ if (n_stats < 0)
+ return n_stats;
+ if (n_stats > S32_MAX / sizeof(u64))
+ return -ENOMEM;
+ WARN_ON_ONCE(!n_stats);
+ if (copy_from_user(&stats, useraddr, sizeof(stats)))
+ return -EFAULT;
+
+ stats.n_stats = n_stats;
+
+ if (n_stats) {
+ data = vzalloc(array_size(n_stats, sizeof(u64)));
+ if (!data)
+ return -ENOMEM;
+ ops->get_ethtool_stats(dev, &stats, data);
+ } else {
+ data = NULL;
+ }
+
+ ret = -EFAULT;
+ if (copy_to_user(useraddr, &stats, sizeof(stats)))
+ goto out;
+ useraddr += sizeof(stats);
+ if (n_stats && copy_to_user(useraddr, data, array_size(n_stats, sizeof(u64))))
+ goto out;
+ ret = 0;
+
+ out:
+ vfree(data);
+ return ret;
+}
+
+static int ethtool_vzalloc_stats_array(int n_stats, u64 **data)
+{
+ if (n_stats < 0)
+ return n_stats;
+ if (n_stats > S32_MAX / sizeof(u64))
+ return -ENOMEM;
+ if (WARN_ON_ONCE(!n_stats))
+ return -EOPNOTSUPP;
+
+ *data = vzalloc(array_size(n_stats, sizeof(u64)));
+ if (!*data)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static int ethtool_get_phy_stats_phydev(struct phy_device *phydev,
+ struct ethtool_stats *stats,
+ u64 **data)
+ {
+ const struct ethtool_phy_ops *phy_ops = ethtool_phy_ops;
+ int n_stats, ret;
+
+ if (!phy_ops || !phy_ops->get_sset_count || !phy_ops->get_stats)
+ return -EOPNOTSUPP;
+
+ n_stats = phy_ops->get_sset_count(phydev);
+
+ ret = ethtool_vzalloc_stats_array(n_stats, data);
+ if (ret)
+ return ret;
+
+ stats->n_stats = n_stats;
+ return phy_ops->get_stats(phydev, stats, *data);
+}
+
+static int ethtool_get_phy_stats_ethtool(struct net_device *dev,
+ struct ethtool_stats *stats,
+ u64 **data)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ int n_stats, ret;
+
+ if (!ops || !ops->get_sset_count || !ops->get_ethtool_phy_stats)
+ return -EOPNOTSUPP;
+
+ n_stats = ops->get_sset_count(dev, ETH_SS_PHY_STATS);
+
+ ret = ethtool_vzalloc_stats_array(n_stats, data);
+ if (ret)
+ return ret;
+
+ stats->n_stats = n_stats;
+ ops->get_ethtool_phy_stats(dev, stats, *data);
+
+ return 0;
+}
+
+static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr)
+{
+ struct phy_device *phydev = dev->phydev;
+ struct ethtool_stats stats;
+ u64 *data = NULL;
+ int ret = -EOPNOTSUPP;
+
+ if (copy_from_user(&stats, useraddr, sizeof(stats)))
+ return -EFAULT;
+
+ if (phydev)
+ ret = ethtool_get_phy_stats_phydev(phydev, &stats, &data);
+
+ if (ret == -EOPNOTSUPP)
+ ret = ethtool_get_phy_stats_ethtool(dev, &stats, &data);
+
+ if (ret)
+ goto out;
+
+ if (copy_to_user(useraddr, &stats, sizeof(stats))) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ useraddr += sizeof(stats);
+ if (copy_to_user(useraddr, data, array_size(stats.n_stats, sizeof(u64))))
+ ret = -EFAULT;
+
+ out:
+ vfree(data);
+ return ret;
+}
+
+static int ethtool_get_perm_addr(struct net_device *dev, void __user *useraddr)
+{
+ struct ethtool_perm_addr epaddr;
+
+ if (copy_from_user(&epaddr, useraddr, sizeof(epaddr)))
+ return -EFAULT;
+
+ if (epaddr.size < dev->addr_len)
+ return -ETOOSMALL;
+ epaddr.size = dev->addr_len;
+
+ if (copy_to_user(useraddr, &epaddr, sizeof(epaddr)))
+ return -EFAULT;
+ useraddr += sizeof(epaddr);
+ if (copy_to_user(useraddr, dev->perm_addr, epaddr.size))
+ return -EFAULT;
+ return 0;
+}
+
+static int ethtool_get_value(struct net_device *dev, char __user *useraddr,
+ u32 cmd, u32 (*actor)(struct net_device *))
+{
+ struct ethtool_value edata = { .cmd = cmd };
+
+ if (!actor)
+ return -EOPNOTSUPP;
+
+ edata.data = actor(dev);
+
+ if (copy_to_user(useraddr, &edata, sizeof(edata)))
+ return -EFAULT;
+ return 0;
+}
+
+static int ethtool_set_value_void(struct net_device *dev, char __user *useraddr,
+ void (*actor)(struct net_device *, u32))
+{
+ struct ethtool_value edata;
+
+ if (!actor)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&edata, useraddr, sizeof(edata)))
+ return -EFAULT;
+
+ actor(dev, edata.data);
+ return 0;
+}
+
+static int ethtool_set_value(struct net_device *dev, char __user *useraddr,
+ int (*actor)(struct net_device *, u32))
+{
+ struct ethtool_value edata;
+
+ if (!actor)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&edata, useraddr, sizeof(edata)))
+ return -EFAULT;
+
+ return actor(dev, edata.data);
+}
+
+static int
+ethtool_flash_device(struct net_device *dev, struct ethtool_devlink_compat *req)
+{
+ if (!dev->ethtool_ops->flash_device) {
+ req->devlink = netdev_to_devlink_get(dev);
+ return 0;
+ }
+
+ return dev->ethtool_ops->flash_device(dev, &req->efl);
+}
+
+static int ethtool_set_dump(struct net_device *dev,
+ void __user *useraddr)
+{
+ struct ethtool_dump dump;
+
+ if (!dev->ethtool_ops->set_dump)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&dump, useraddr, sizeof(dump)))
+ return -EFAULT;
+
+ return dev->ethtool_ops->set_dump(dev, &dump);
+}
+
+static int ethtool_get_dump_flag(struct net_device *dev,
+ void __user *useraddr)
+{
+ int ret;
+ struct ethtool_dump dump;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+
+ if (!ops->get_dump_flag)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&dump, useraddr, sizeof(dump)))
+ return -EFAULT;
+
+ ret = ops->get_dump_flag(dev, &dump);
+ if (ret)
+ return ret;
+
+ if (copy_to_user(useraddr, &dump, sizeof(dump)))
+ return -EFAULT;
+ return 0;
+}
+
+static int ethtool_get_dump_data(struct net_device *dev,
+ void __user *useraddr)
+{
+ int ret;
+ __u32 len;
+ struct ethtool_dump dump, tmp;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ void *data = NULL;
+
+ if (!ops->get_dump_data || !ops->get_dump_flag)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&dump, useraddr, sizeof(dump)))
+ return -EFAULT;
+
+ memset(&tmp, 0, sizeof(tmp));
+ tmp.cmd = ETHTOOL_GET_DUMP_FLAG;
+ ret = ops->get_dump_flag(dev, &tmp);
+ if (ret)
+ return ret;
+
+ len = min(tmp.len, dump.len);
+ if (!len)
+ return -EFAULT;
+
+ /* Don't ever let the driver think there's more space available
+ * than it requested with .get_dump_flag().
+ */
+ dump.len = len;
+
+ /* Always allocate enough space to hold the whole thing so that the
+ * driver does not need to check the length and bother with partial
+ * dumping.
+ */
+ data = vzalloc(tmp.len);
+ if (!data)
+ return -ENOMEM;
+ ret = ops->get_dump_data(dev, &dump, data);
+ if (ret)
+ goto out;
+
+ /* There are two sane possibilities:
+ * 1. The driver's .get_dump_data() does not touch dump.len.
+ * 2. Or it may set dump.len to how much it really writes, which
+ * should be tmp.len (or len if it can do a partial dump).
+ * In any case respond to userspace with the actual length of data
+ * it's receiving.
+ */
+ WARN_ON(dump.len != len && dump.len != tmp.len);
+ dump.len = len;
+
+ if (copy_to_user(useraddr, &dump, sizeof(dump))) {
+ ret = -EFAULT;
+ goto out;
+ }
+ useraddr += offsetof(struct ethtool_dump, data);
+ if (copy_to_user(useraddr, data, len))
+ ret = -EFAULT;
+out:
+ vfree(data);
+ return ret;
+}
+
+static int ethtool_get_ts_info(struct net_device *dev, void __user *useraddr)
+{
+ struct kernel_ethtool_ts_info kernel_info;
+ struct ethtool_ts_info info = {};
+ int err;
+
+ err = __ethtool_get_ts_info(dev, &kernel_info);
+ if (err)
+ return err;
+
+ info.cmd = kernel_info.cmd;
+ info.so_timestamping = kernel_info.so_timestamping;
+ info.phc_index = kernel_info.phc_index;
+ info.tx_types = kernel_info.tx_types;
+ info.rx_filters = kernel_info.rx_filters;
+
+ if (copy_to_user(useraddr, &info, sizeof(info)))
+ return -EFAULT;
+
+ return 0;
+}
+
+int ethtool_get_module_info_call(struct net_device *dev,
+ struct ethtool_modinfo *modinfo)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct phy_device *phydev = dev->phydev;
+
+ if (dev->ethtool->module_fw_flash_in_progress)
+ return -EBUSY;
+
+ if (dev->sfp_bus)
+ return sfp_get_module_info(dev->sfp_bus, modinfo);
+
+ if (phydev && phydev->drv && phydev->drv->module_info)
+ return phydev->drv->module_info(phydev, modinfo);
+
+ if (ops->get_module_info)
+ return ops->get_module_info(dev, modinfo);
+
+ return -EOPNOTSUPP;
+}
+
+static int ethtool_get_module_info(struct net_device *dev,
+ void __user *useraddr)
+{
+ int ret;
+ struct ethtool_modinfo modinfo;
+
+ if (copy_from_user(&modinfo, useraddr, sizeof(modinfo)))
+ return -EFAULT;
+
+ ret = ethtool_get_module_info_call(dev, &modinfo);
+ if (ret)
+ return ret;
+
+ if (copy_to_user(useraddr, &modinfo, sizeof(modinfo)))
+ return -EFAULT;
+
+ return 0;
+}
+
+int ethtool_get_module_eeprom_call(struct net_device *dev,
+ struct ethtool_eeprom *ee, u8 *data)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct phy_device *phydev = dev->phydev;
+
+ if (dev->ethtool->module_fw_flash_in_progress)
+ return -EBUSY;
+
+ if (dev->sfp_bus)
+ return sfp_get_module_eeprom(dev->sfp_bus, ee, data);
+
+ if (phydev && phydev->drv && phydev->drv->module_eeprom)
+ return phydev->drv->module_eeprom(phydev, ee, data);
+
+ if (ops->get_module_eeprom)
+ return ops->get_module_eeprom(dev, ee, data);
+
+ return -EOPNOTSUPP;
+}
+
+static int ethtool_get_module_eeprom(struct net_device *dev,
+ void __user *useraddr)
+{
+ int ret;
+ struct ethtool_modinfo modinfo;
+
+ ret = ethtool_get_module_info_call(dev, &modinfo);
+ if (ret)
+ return ret;
+
+ return ethtool_get_any_eeprom(dev, useraddr,
+ ethtool_get_module_eeprom_call,
+ modinfo.eeprom_len);
+}
+
+static int ethtool_tunable_valid(const struct ethtool_tunable *tuna)
+{
+ switch (tuna->id) {
+ case ETHTOOL_RX_COPYBREAK:
+ case ETHTOOL_TX_COPYBREAK:
+ case ETHTOOL_TX_COPYBREAK_BUF_SIZE:
+ if (tuna->len != sizeof(u32) ||
+ tuna->type_id != ETHTOOL_TUNABLE_U32)
+ return -EINVAL;
+ break;
+ case ETHTOOL_PFC_PREVENTION_TOUT:
+ if (tuna->len != sizeof(u16) ||
+ tuna->type_id != ETHTOOL_TUNABLE_U16)
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int ethtool_get_tunable(struct net_device *dev, void __user *useraddr)
+{
+ int ret;
+ struct ethtool_tunable tuna;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ void *data;
+
+ if (!ops->get_tunable)
+ return -EOPNOTSUPP;
+ if (copy_from_user(&tuna, useraddr, sizeof(tuna)))
+ return -EFAULT;
+ ret = ethtool_tunable_valid(&tuna);
+ if (ret)
+ return ret;
+ data = kzalloc(tuna.len, GFP_USER);
+ if (!data)
+ return -ENOMEM;
+ ret = ops->get_tunable(dev, &tuna, data);
+ if (ret)
+ goto out;
+ useraddr += sizeof(tuna);
+ ret = -EFAULT;
+ if (copy_to_user(useraddr, data, tuna.len))
+ goto out;
+ ret = 0;
+
+out:
+ kfree(data);
+ return ret;
+}
+
+static int ethtool_set_tunable(struct net_device *dev, void __user *useraddr)
+{
+ int ret;
+ struct ethtool_tunable tuna;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ void *data;
+
+ if (!ops->set_tunable)
+ return -EOPNOTSUPP;
+ if (copy_from_user(&tuna, useraddr, sizeof(tuna)))
+ return -EFAULT;
+ ret = ethtool_tunable_valid(&tuna);
+ if (ret)
+ return ret;
+ useraddr += sizeof(tuna);
+ data = memdup_user(useraddr, tuna.len);
+ if (IS_ERR(data))
+ return PTR_ERR(data);
+ ret = ops->set_tunable(dev, &tuna, data);
+
+ kfree(data);
+ return ret;
+}
+
+static noinline_for_stack int
+ethtool_get_per_queue_coalesce(struct net_device *dev,
+ void __user *useraddr,
+ struct ethtool_per_queue_op *per_queue_opt)
+{
+ u32 bit;
+ int ret;
+ DECLARE_BITMAP(queue_mask, MAX_NUM_QUEUE);
+
+ if (!dev->ethtool_ops->get_per_queue_coalesce)
+ return -EOPNOTSUPP;
+
+ useraddr += sizeof(*per_queue_opt);
+
+ bitmap_from_arr32(queue_mask, per_queue_opt->queue_mask,
+ MAX_NUM_QUEUE);
+
+ for_each_set_bit(bit, queue_mask, MAX_NUM_QUEUE) {
+ struct ethtool_coalesce coalesce = { .cmd = ETHTOOL_GCOALESCE };
+
+ ret = dev->ethtool_ops->get_per_queue_coalesce(dev, bit, &coalesce);
+ if (ret != 0)
+ return ret;
+ if (copy_to_user(useraddr, &coalesce, sizeof(coalesce)))
+ return -EFAULT;
+ useraddr += sizeof(coalesce);
+ }
+
+ return 0;
+}
+
+static noinline_for_stack int
+ethtool_set_per_queue_coalesce(struct net_device *dev,
+ void __user *useraddr,
+ struct ethtool_per_queue_op *per_queue_opt)
+{
+ u32 bit;
+ int i, ret = 0;
+ int n_queue;
+ struct ethtool_coalesce *backup = NULL, *tmp = NULL;
+ DECLARE_BITMAP(queue_mask, MAX_NUM_QUEUE);
+
+ if ((!dev->ethtool_ops->set_per_queue_coalesce) ||
+ (!dev->ethtool_ops->get_per_queue_coalesce))
+ return -EOPNOTSUPP;
+
+ useraddr += sizeof(*per_queue_opt);
+
+ bitmap_from_arr32(queue_mask, per_queue_opt->queue_mask, MAX_NUM_QUEUE);
+ n_queue = bitmap_weight(queue_mask, MAX_NUM_QUEUE);
+ tmp = backup = kmalloc_array(n_queue, sizeof(*backup), GFP_KERNEL);
+ if (!backup)
+ return -ENOMEM;
+
+ for_each_set_bit(bit, queue_mask, MAX_NUM_QUEUE) {
+ struct ethtool_coalesce coalesce;
+
+ ret = dev->ethtool_ops->get_per_queue_coalesce(dev, bit, tmp);
+ if (ret != 0)
+ goto roll_back;
+
+ tmp++;
+
+ if (copy_from_user(&coalesce, useraddr, sizeof(coalesce))) {
+ ret = -EFAULT;
+ goto roll_back;
+ }
+
+ if (!ethtool_set_coalesce_supported(dev, &coalesce)) {
+ ret = -EOPNOTSUPP;
+ goto roll_back;
+ }
+
+ ret = dev->ethtool_ops->set_per_queue_coalesce(dev, bit, &coalesce);
+ if (ret != 0)
+ goto roll_back;
+
+ useraddr += sizeof(coalesce);
+ }
+
+roll_back:
+ if (ret != 0) {
+ tmp = backup;
+ for_each_set_bit(i, queue_mask, bit) {
+ dev->ethtool_ops->set_per_queue_coalesce(dev, i, tmp);
+ tmp++;
+ }
+ }
+ kfree(backup);
+
+ return ret;
+}
+
+static int noinline_for_stack ethtool_set_per_queue(struct net_device *dev,
+ void __user *useraddr, u32 sub_cmd)
+{
+ struct ethtool_per_queue_op per_queue_opt;
+
+ if (copy_from_user(&per_queue_opt, useraddr, sizeof(per_queue_opt)))
+ return -EFAULT;
+
+ if (per_queue_opt.sub_command != sub_cmd)
+ return -EINVAL;
+
+ switch (per_queue_opt.sub_command) {
+ case ETHTOOL_GCOALESCE:
+ return ethtool_get_per_queue_coalesce(dev, useraddr, &per_queue_opt);
+ case ETHTOOL_SCOALESCE:
+ return ethtool_set_per_queue_coalesce(dev, useraddr, &per_queue_opt);
+ default:
+ return -EOPNOTSUPP;
+ }
+}
+
+static int ethtool_phy_tunable_valid(const struct ethtool_tunable *tuna)
+{
+ switch (tuna->id) {
+ case ETHTOOL_PHY_DOWNSHIFT:
+ case ETHTOOL_PHY_FAST_LINK_DOWN:
+ if (tuna->len != sizeof(u8) ||
+ tuna->type_id != ETHTOOL_TUNABLE_U8)
+ return -EINVAL;
+ break;
+ case ETHTOOL_PHY_EDPD:
+ if (tuna->len != sizeof(u16) ||
+ tuna->type_id != ETHTOOL_TUNABLE_U16)
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int get_phy_tunable(struct net_device *dev, void __user *useraddr)
+{
+ struct phy_device *phydev = dev->phydev;
+ struct ethtool_tunable tuna;
+ bool phy_drv_tunable;
+ void *data;
+ int ret;
+
+ phy_drv_tunable = phydev && phydev->drv && phydev->drv->get_tunable;
+ if (!phy_drv_tunable && !dev->ethtool_ops->get_phy_tunable)
+ return -EOPNOTSUPP;
+ if (copy_from_user(&tuna, useraddr, sizeof(tuna)))
+ return -EFAULT;
+ ret = ethtool_phy_tunable_valid(&tuna);
+ if (ret)
+ return ret;
+ data = kzalloc(tuna.len, GFP_USER);
+ if (!data)
+ return -ENOMEM;
+ if (phy_drv_tunable) {
+ mutex_lock(&phydev->lock);
+ ret = phydev->drv->get_tunable(phydev, &tuna, data);
+ mutex_unlock(&phydev->lock);
+ } else {
+ ret = dev->ethtool_ops->get_phy_tunable(dev, &tuna, data);
+ }
+ if (ret)
+ goto out;
+ useraddr += sizeof(tuna);
+ ret = -EFAULT;
+ if (copy_to_user(useraddr, data, tuna.len))
+ goto out;
+ ret = 0;
+
+out:
+ kfree(data);
+ return ret;
+}
+
+static int set_phy_tunable(struct net_device *dev, void __user *useraddr)
+{
+ struct phy_device *phydev = dev->phydev;
+ struct ethtool_tunable tuna;
+ bool phy_drv_tunable;
+ void *data;
+ int ret;
+
+ phy_drv_tunable = phydev && phydev->drv && phydev->drv->get_tunable;
+ if (!phy_drv_tunable && !dev->ethtool_ops->set_phy_tunable)
+ return -EOPNOTSUPP;
+ if (copy_from_user(&tuna, useraddr, sizeof(tuna)))
+ return -EFAULT;
+ ret = ethtool_phy_tunable_valid(&tuna);
+ if (ret)
+ return ret;
+ useraddr += sizeof(tuna);
+ data = memdup_user(useraddr, tuna.len);
+ if (IS_ERR(data))
+ return PTR_ERR(data);
+ if (phy_drv_tunable) {
+ mutex_lock(&phydev->lock);
+ ret = phydev->drv->set_tunable(phydev, &tuna, data);
+ mutex_unlock(&phydev->lock);
+ } else {
+ ret = dev->ethtool_ops->set_phy_tunable(dev, &tuna, data);
+ }
+
+ kfree(data);
+ return ret;
+}
+
+static int ethtool_get_fecparam(struct net_device *dev, void __user *useraddr)
+{
+ struct ethtool_fecparam fecparam = { .cmd = ETHTOOL_GFECPARAM };
+ int rc;
+
+ if (!dev->ethtool_ops->get_fecparam)
+ return -EOPNOTSUPP;
+
+ rc = dev->ethtool_ops->get_fecparam(dev, &fecparam);
+ if (rc)
+ return rc;
+
+ if (WARN_ON_ONCE(fecparam.reserved))
+ fecparam.reserved = 0;
+
+ if (copy_to_user(useraddr, &fecparam, sizeof(fecparam)))
+ return -EFAULT;
+ return 0;
+}
+
+static int ethtool_set_fecparam(struct net_device *dev, void __user *useraddr)
+{
+ struct ethtool_fecparam fecparam;
+
+ if (!dev->ethtool_ops->set_fecparam)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&fecparam, useraddr, sizeof(fecparam)))
+ return -EFAULT;
+
+ if (!fecparam.fec || fecparam.fec & ETHTOOL_FEC_NONE)
+ return -EINVAL;
+
+ fecparam.active_fec = 0;
+ fecparam.reserved = 0;
+
+ return dev->ethtool_ops->set_fecparam(dev, &fecparam);
+}
+
+/* The main entry point in this file. Called from net/core/dev_ioctl.c */
+
+static int
+__dev_ethtool(struct net *net, struct ifreq *ifr, void __user *useraddr,
+ u32 ethcmd, struct ethtool_devlink_compat *devlink_state)
+{
+ struct net_device *dev;
+ u32 sub_cmd;
+ int rc;
+ netdev_features_t old_features;
+
+ dev = __dev_get_by_name(net, ifr->ifr_name);
+ if (!dev)
+ return -ENODEV;
+
+ if (ethcmd == ETHTOOL_PERQUEUE) {
+ if (copy_from_user(&sub_cmd, useraddr + sizeof(ethcmd), sizeof(sub_cmd)))
+ return -EFAULT;
+ } else {
+ sub_cmd = ethcmd;
+ }
+ /* Allow some commands to be done by anyone */
+ switch (sub_cmd) {
+ case ETHTOOL_GSET:
+ case ETHTOOL_GDRVINFO:
+ case ETHTOOL_GMSGLVL:
+ case ETHTOOL_GLINK:
+ case ETHTOOL_GCOALESCE:
+ case ETHTOOL_GRINGPARAM:
+ case ETHTOOL_GPAUSEPARAM:
+ case ETHTOOL_GRXCSUM:
+ case ETHTOOL_GTXCSUM:
+ case ETHTOOL_GSG:
+ case ETHTOOL_GSSET_INFO:
+ case ETHTOOL_GSTRINGS:
+ case ETHTOOL_GSTATS:
+ case ETHTOOL_GPHYSTATS:
+ case ETHTOOL_GTSO:
+ case ETHTOOL_GPERMADDR:
+ case ETHTOOL_GUFO:
+ case ETHTOOL_GGSO:
+ case ETHTOOL_GGRO:
+ case ETHTOOL_GFLAGS:
+ case ETHTOOL_GPFLAGS:
+ case ETHTOOL_GRXFH:
+ case ETHTOOL_GRXRINGS:
+ case ETHTOOL_GRXCLSRLCNT:
+ case ETHTOOL_GRXCLSRULE:
+ case ETHTOOL_GRXCLSRLALL:
+ case ETHTOOL_GRXFHINDIR:
+ case ETHTOOL_GRSSH:
+ case ETHTOOL_GFEATURES:
+ case ETHTOOL_GCHANNELS:
+ case ETHTOOL_GET_TS_INFO:
+ case ETHTOOL_GEEE:
+ case ETHTOOL_GTUNABLE:
+ case ETHTOOL_PHY_GTUNABLE:
+ case ETHTOOL_GLINKSETTINGS:
+ case ETHTOOL_GFECPARAM:
+ break;
+ default:
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+ }
+
+ netdev_lock_ops(dev);
+ if (dev->dev.parent)
+ pm_runtime_get_sync(dev->dev.parent);
+
+ if (!netif_device_present(dev)) {
+ rc = -ENODEV;
+ goto out;
+ }
+
+ if (dev->ethtool_ops->begin) {
+ rc = dev->ethtool_ops->begin(dev);
+ if (rc < 0)
+ goto out;
+ }
+ old_features = dev->features;
+
+ switch (ethcmd) {
+ case ETHTOOL_GSET:
+ rc = ethtool_get_settings(dev, useraddr);
+ break;
+ case ETHTOOL_SSET:
+ rc = ethtool_set_settings(dev, useraddr);
+ break;
+ case ETHTOOL_GDRVINFO:
+ rc = ethtool_get_drvinfo(dev, devlink_state);
+ break;
+ case ETHTOOL_GREGS:
+ rc = ethtool_get_regs(dev, useraddr);
+ break;
+ case ETHTOOL_GWOL:
+ rc = ethtool_get_wol(dev, useraddr);
+ break;
+ case ETHTOOL_SWOL:
+ rc = ethtool_set_wol(dev, useraddr);
+ break;
+ case ETHTOOL_GMSGLVL:
+ rc = ethtool_get_value(dev, useraddr, ethcmd,
+ dev->ethtool_ops->get_msglevel);
+ break;
+ case ETHTOOL_SMSGLVL:
+ rc = ethtool_set_value_void(dev, useraddr,
+ dev->ethtool_ops->set_msglevel);
+ if (!rc)
+ ethtool_notify(dev, ETHTOOL_MSG_DEBUG_NTF);
+ break;
+ case ETHTOOL_GEEE:
+ rc = ethtool_get_eee(dev, useraddr);
+ break;
+ case ETHTOOL_SEEE:
+ rc = ethtool_set_eee(dev, useraddr);
+ break;
+ case ETHTOOL_NWAY_RST:
+ rc = ethtool_nway_reset(dev);
+ break;
+ case ETHTOOL_GLINK:
+ rc = ethtool_get_link(dev, useraddr);
+ break;
+ case ETHTOOL_GEEPROM:
+ rc = ethtool_get_eeprom(dev, useraddr);
+ break;
+ case ETHTOOL_SEEPROM:
+ rc = ethtool_set_eeprom(dev, useraddr);
+ break;
+ case ETHTOOL_GCOALESCE:
+ rc = ethtool_get_coalesce(dev, useraddr);
+ break;
+ case ETHTOOL_SCOALESCE:
+ rc = ethtool_set_coalesce(dev, useraddr);
+ break;
+ case ETHTOOL_GRINGPARAM:
+ rc = ethtool_get_ringparam(dev, useraddr);
+ break;
+ case ETHTOOL_SRINGPARAM:
+ rc = ethtool_set_ringparam(dev, useraddr);
+ break;
+ case ETHTOOL_GPAUSEPARAM:
+ rc = ethtool_get_pauseparam(dev, useraddr);
+ break;
+ case ETHTOOL_SPAUSEPARAM:
+ rc = ethtool_set_pauseparam(dev, useraddr);
+ break;
+ case ETHTOOL_TEST:
+ rc = ethtool_self_test(dev, useraddr);
+ break;
+ case ETHTOOL_GSTRINGS:
+ rc = ethtool_get_strings(dev, useraddr);
+ break;
+ case ETHTOOL_PHYS_ID:
+ rc = ethtool_phys_id(dev, useraddr);
+ break;
+ case ETHTOOL_GSTATS:
+ rc = ethtool_get_stats(dev, useraddr);
+ break;
+ case ETHTOOL_GPERMADDR:
+ rc = ethtool_get_perm_addr(dev, useraddr);
+ break;
+ case ETHTOOL_GFLAGS:
+ rc = ethtool_get_value(dev, useraddr, ethcmd,
+ __ethtool_get_flags);
+ break;
+ case ETHTOOL_SFLAGS:
+ rc = ethtool_set_value(dev, useraddr, __ethtool_set_flags);
+ break;
+ case ETHTOOL_GPFLAGS:
+ rc = ethtool_get_value(dev, useraddr, ethcmd,
+ dev->ethtool_ops->get_priv_flags);
+ if (!rc)
+ ethtool_notify(dev, ETHTOOL_MSG_PRIVFLAGS_NTF);
+ break;
+ case ETHTOOL_SPFLAGS:
+ rc = ethtool_set_value(dev, useraddr,
+ dev->ethtool_ops->set_priv_flags);
+ break;
+ case ETHTOOL_GRXFH:
+ rc = ethtool_get_rxfh_fields(dev, ethcmd, useraddr);
+ break;
+ case ETHTOOL_SRXFH:
+ rc = ethtool_set_rxfh_fields(dev, ethcmd, useraddr);
+ break;
+ case ETHTOOL_GRXRINGS:
+ rc = ethtool_get_rxrings(dev, ethcmd, useraddr);
+ break;
+ case ETHTOOL_GRXCLSRLCNT:
+ case ETHTOOL_GRXCLSRULE:
+ case ETHTOOL_GRXCLSRLALL:
+ rc = ethtool_get_rxnfc(dev, ethcmd, useraddr);
+ break;
+ case ETHTOOL_SRXCLSRLDEL:
+ case ETHTOOL_SRXCLSRLINS:
+ rc = ethtool_set_rxnfc(dev, ethcmd, useraddr);
+ break;
+ case ETHTOOL_FLASHDEV:
+ rc = ethtool_flash_device(dev, devlink_state);
+ break;
+ case ETHTOOL_RESET:
+ rc = ethtool_reset(dev, useraddr);
+ break;
+ case ETHTOOL_GSSET_INFO:
+ rc = ethtool_get_sset_info(dev, useraddr);
+ break;
+ case ETHTOOL_GRXFHINDIR:
+ rc = ethtool_get_rxfh_indir(dev, useraddr);
+ break;
+ case ETHTOOL_SRXFHINDIR:
+ rc = ethtool_set_rxfh_indir(dev, useraddr);
+ break;
+ case ETHTOOL_GRSSH:
+ rc = ethtool_get_rxfh(dev, useraddr);
+ break;
+ case ETHTOOL_SRSSH:
+ rc = ethtool_set_rxfh(dev, useraddr);
+ break;
+ case ETHTOOL_GFEATURES:
+ rc = ethtool_get_features(dev, useraddr);
+ break;
+ case ETHTOOL_SFEATURES:
+ rc = ethtool_set_features(dev, useraddr);
+ break;
+ case ETHTOOL_GTXCSUM:
+ case ETHTOOL_GRXCSUM:
+ case ETHTOOL_GSG:
+ case ETHTOOL_GTSO:
+ case ETHTOOL_GGSO:
+ case ETHTOOL_GGRO:
+ rc = ethtool_get_one_feature(dev, useraddr, ethcmd);
+ break;
+ case ETHTOOL_STXCSUM:
+ case ETHTOOL_SRXCSUM:
+ case ETHTOOL_SSG:
+ case ETHTOOL_STSO:
+ case ETHTOOL_SGSO:
+ case ETHTOOL_SGRO:
+ rc = ethtool_set_one_feature(dev, useraddr, ethcmd);
+ break;
+ case ETHTOOL_GCHANNELS:
+ rc = ethtool_get_channels(dev, useraddr);
+ break;
+ case ETHTOOL_SCHANNELS:
+ rc = ethtool_set_channels(dev, useraddr);
+ break;
+ case ETHTOOL_SET_DUMP:
+ rc = ethtool_set_dump(dev, useraddr);
+ break;
+ case ETHTOOL_GET_DUMP_FLAG:
+ rc = ethtool_get_dump_flag(dev, useraddr);
+ break;
+ case ETHTOOL_GET_DUMP_DATA:
+ rc = ethtool_get_dump_data(dev, useraddr);
+ break;
+ case ETHTOOL_GET_TS_INFO:
+ rc = ethtool_get_ts_info(dev, useraddr);
+ break;
+ case ETHTOOL_GMODULEINFO:
+ rc = ethtool_get_module_info(dev, useraddr);
+ break;
+ case ETHTOOL_GMODULEEEPROM:
+ rc = ethtool_get_module_eeprom(dev, useraddr);
+ break;
+ case ETHTOOL_GTUNABLE:
+ rc = ethtool_get_tunable(dev, useraddr);
+ break;
+ case ETHTOOL_STUNABLE:
+ rc = ethtool_set_tunable(dev, useraddr);
+ break;
+ case ETHTOOL_GPHYSTATS:
+ rc = ethtool_get_phy_stats(dev, useraddr);
+ break;
+ case ETHTOOL_PERQUEUE:
+ rc = ethtool_set_per_queue(dev, useraddr, sub_cmd);
+ break;
+ case ETHTOOL_GLINKSETTINGS:
+ rc = ethtool_get_link_ksettings(dev, useraddr);
+ break;
+ case ETHTOOL_SLINKSETTINGS:
+ rc = ethtool_set_link_ksettings(dev, useraddr);
+ break;
+ case ETHTOOL_PHY_GTUNABLE:
+ rc = get_phy_tunable(dev, useraddr);
+ break;
+ case ETHTOOL_PHY_STUNABLE:
+ rc = set_phy_tunable(dev, useraddr);
+ break;
+ case ETHTOOL_GFECPARAM:
+ rc = ethtool_get_fecparam(dev, useraddr);
+ break;
+ case ETHTOOL_SFECPARAM:
+ rc = ethtool_set_fecparam(dev, useraddr);
+ break;
+ default:
+ rc = -EOPNOTSUPP;
+ }
+
+ if (dev->ethtool_ops->complete)
+ dev->ethtool_ops->complete(dev);
+
+ if (old_features != dev->features)
+ netdev_features_change(dev);
+out:
+ if (dev->dev.parent)
+ pm_runtime_put(dev->dev.parent);
+ netdev_unlock_ops(dev);
+
+ return rc;
+}
+
+int dev_ethtool(struct net *net, struct ifreq *ifr, void __user *useraddr)
+{
+ struct ethtool_devlink_compat *state;
+ u32 ethcmd;
+ int rc;
+
+ if (copy_from_user(&ethcmd, useraddr, sizeof(ethcmd)))
+ return -EFAULT;
+
+ state = kzalloc(sizeof(*state), GFP_KERNEL);
+ if (!state)
+ return -ENOMEM;
+
+ switch (ethcmd) {
+ case ETHTOOL_FLASHDEV:
+ if (copy_from_user(&state->efl, useraddr, sizeof(state->efl))) {
+ rc = -EFAULT;
+ goto exit_free;
+ }
+ state->efl.data[ETHTOOL_FLASH_MAX_FILENAME - 1] = 0;
+ break;
+ }
+
+ rtnl_lock();
+ rc = __dev_ethtool(net, ifr, useraddr, ethcmd, state);
+ rtnl_unlock();
+ if (rc)
+ goto exit_free;
+
+ switch (ethcmd) {
+ case ETHTOOL_FLASHDEV:
+ if (state->devlink)
+ rc = devlink_compat_flash_update(state->devlink,
+ state->efl.data);
+ break;
+ case ETHTOOL_GDRVINFO:
+ if (state->devlink)
+ devlink_compat_running_version(state->devlink,
+ state->info.fw_version,
+ sizeof(state->info.fw_version));
+ if (copy_to_user(useraddr, &state->info, sizeof(state->info))) {
+ rc = -EFAULT;
+ goto exit_free;
+ }
+ break;
+ }
+
+exit_free:
+ if (state->devlink)
+ devlink_put(state->devlink);
+ kfree(state);
+ return rc;
+}
+
+struct ethtool_rx_flow_key {
+ struct flow_dissector_key_basic basic;
+ union {
+ struct flow_dissector_key_ipv4_addrs ipv4;
+ struct flow_dissector_key_ipv6_addrs ipv6;
+ };
+ struct flow_dissector_key_ports tp;
+ struct flow_dissector_key_ip ip;
+ struct flow_dissector_key_vlan vlan;
+ struct flow_dissector_key_eth_addrs eth_addrs;
+} __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */
+
+struct ethtool_rx_flow_match {
+ struct flow_dissector dissector;
+ struct ethtool_rx_flow_key key;
+ struct ethtool_rx_flow_key mask;
+};
+
+struct ethtool_rx_flow_rule *
+ethtool_rx_flow_rule_create(const struct ethtool_rx_flow_spec_input *input)
+{
+ const struct ethtool_rx_flow_spec *fs = input->fs;
+ struct ethtool_rx_flow_match *match;
+ struct ethtool_rx_flow_rule *flow;
+ struct flow_action_entry *act;
+
+ flow = kzalloc(sizeof(struct ethtool_rx_flow_rule) +
+ sizeof(struct ethtool_rx_flow_match), GFP_KERNEL);
+ if (!flow)
+ return ERR_PTR(-ENOMEM);
+
+ /* ethtool_rx supports only one single action per rule. */
+ flow->rule = flow_rule_alloc(1);
+ if (!flow->rule) {
+ kfree(flow);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ match = (struct ethtool_rx_flow_match *)flow->priv;
+ flow->rule->match.dissector = &match->dissector;
+ flow->rule->match.mask = &match->mask;
+ flow->rule->match.key = &match->key;
+
+ match->mask.basic.n_proto = htons(0xffff);
+
+ switch (fs->flow_type & ~(FLOW_EXT | FLOW_MAC_EXT | FLOW_RSS)) {
+ case ETHER_FLOW: {
+ const struct ethhdr *ether_spec, *ether_m_spec;
+
+ ether_spec = &fs->h_u.ether_spec;
+ ether_m_spec = &fs->m_u.ether_spec;
+
+ if (!is_zero_ether_addr(ether_m_spec->h_source)) {
+ ether_addr_copy(match->key.eth_addrs.src,
+ ether_spec->h_source);
+ ether_addr_copy(match->mask.eth_addrs.src,
+ ether_m_spec->h_source);
+ }
+ if (!is_zero_ether_addr(ether_m_spec->h_dest)) {
+ ether_addr_copy(match->key.eth_addrs.dst,
+ ether_spec->h_dest);
+ ether_addr_copy(match->mask.eth_addrs.dst,
+ ether_m_spec->h_dest);
+ }
+ if (ether_m_spec->h_proto) {
+ match->key.basic.n_proto = ether_spec->h_proto;
+ match->mask.basic.n_proto = ether_m_spec->h_proto;
+ }
+ }
+ break;
+ case TCP_V4_FLOW:
+ case UDP_V4_FLOW: {
+ const struct ethtool_tcpip4_spec *v4_spec, *v4_m_spec;
+
+ match->key.basic.n_proto = htons(ETH_P_IP);
+
+ v4_spec = &fs->h_u.tcp_ip4_spec;
+ v4_m_spec = &fs->m_u.tcp_ip4_spec;
+
+ if (v4_m_spec->ip4src) {
+ match->key.ipv4.src = v4_spec->ip4src;
+ match->mask.ipv4.src = v4_m_spec->ip4src;
+ }
+ if (v4_m_spec->ip4dst) {
+ match->key.ipv4.dst = v4_spec->ip4dst;
+ match->mask.ipv4.dst = v4_m_spec->ip4dst;
+ }
+ if (v4_m_spec->ip4src ||
+ v4_m_spec->ip4dst) {
+ match->dissector.used_keys |=
+ BIT_ULL(FLOW_DISSECTOR_KEY_IPV4_ADDRS);
+ match->dissector.offset[FLOW_DISSECTOR_KEY_IPV4_ADDRS] =
+ offsetof(struct ethtool_rx_flow_key, ipv4);
+ }
+ if (v4_m_spec->psrc) {
+ match->key.tp.src = v4_spec->psrc;
+ match->mask.tp.src = v4_m_spec->psrc;
+ }
+ if (v4_m_spec->pdst) {
+ match->key.tp.dst = v4_spec->pdst;
+ match->mask.tp.dst = v4_m_spec->pdst;
+ }
+ if (v4_m_spec->psrc ||
+ v4_m_spec->pdst) {
+ match->dissector.used_keys |=
+ BIT_ULL(FLOW_DISSECTOR_KEY_PORTS);
+ match->dissector.offset[FLOW_DISSECTOR_KEY_PORTS] =
+ offsetof(struct ethtool_rx_flow_key, tp);
+ }
+ if (v4_m_spec->tos) {
+ match->key.ip.tos = v4_spec->tos;
+ match->mask.ip.tos = v4_m_spec->tos;
+ match->dissector.used_keys |=
+ BIT(FLOW_DISSECTOR_KEY_IP);
+ match->dissector.offset[FLOW_DISSECTOR_KEY_IP] =
+ offsetof(struct ethtool_rx_flow_key, ip);
+ }
+ }
+ break;
+ case TCP_V6_FLOW:
+ case UDP_V6_FLOW: {
+ const struct ethtool_tcpip6_spec *v6_spec, *v6_m_spec;
+
+ match->key.basic.n_proto = htons(ETH_P_IPV6);
+
+ v6_spec = &fs->h_u.tcp_ip6_spec;
+ v6_m_spec = &fs->m_u.tcp_ip6_spec;
+ if (!ipv6_addr_any((struct in6_addr *)v6_m_spec->ip6src)) {
+ memcpy(&match->key.ipv6.src, v6_spec->ip6src,
+ sizeof(match->key.ipv6.src));
+ memcpy(&match->mask.ipv6.src, v6_m_spec->ip6src,
+ sizeof(match->mask.ipv6.src));
+ }
+ if (!ipv6_addr_any((struct in6_addr *)v6_m_spec->ip6dst)) {
+ memcpy(&match->key.ipv6.dst, v6_spec->ip6dst,
+ sizeof(match->key.ipv6.dst));
+ memcpy(&match->mask.ipv6.dst, v6_m_spec->ip6dst,
+ sizeof(match->mask.ipv6.dst));
+ }
+ if (!ipv6_addr_any((struct in6_addr *)v6_m_spec->ip6src) ||
+ !ipv6_addr_any((struct in6_addr *)v6_m_spec->ip6dst)) {
+ match->dissector.used_keys |=
+ BIT_ULL(FLOW_DISSECTOR_KEY_IPV6_ADDRS);
+ match->dissector.offset[FLOW_DISSECTOR_KEY_IPV6_ADDRS] =
+ offsetof(struct ethtool_rx_flow_key, ipv6);
+ }
+ if (v6_m_spec->psrc) {
+ match->key.tp.src = v6_spec->psrc;
+ match->mask.tp.src = v6_m_spec->psrc;
+ }
+ if (v6_m_spec->pdst) {
+ match->key.tp.dst = v6_spec->pdst;
+ match->mask.tp.dst = v6_m_spec->pdst;
+ }
+ if (v6_m_spec->psrc ||
+ v6_m_spec->pdst) {
+ match->dissector.used_keys |=
+ BIT_ULL(FLOW_DISSECTOR_KEY_PORTS);
+ match->dissector.offset[FLOW_DISSECTOR_KEY_PORTS] =
+ offsetof(struct ethtool_rx_flow_key, tp);
+ }
+ if (v6_m_spec->tclass) {
+ match->key.ip.tos = v6_spec->tclass;
+ match->mask.ip.tos = v6_m_spec->tclass;
+ match->dissector.used_keys |=
+ BIT_ULL(FLOW_DISSECTOR_KEY_IP);
+ match->dissector.offset[FLOW_DISSECTOR_KEY_IP] =
+ offsetof(struct ethtool_rx_flow_key, ip);
+ }
+ }
+ break;
+ default:
+ ethtool_rx_flow_rule_destroy(flow);
+ return ERR_PTR(-EINVAL);
+ }
+
+ switch (fs->flow_type & ~(FLOW_EXT | FLOW_MAC_EXT | FLOW_RSS)) {
+ case TCP_V4_FLOW:
+ case TCP_V6_FLOW:
+ match->key.basic.ip_proto = IPPROTO_TCP;
+ match->mask.basic.ip_proto = 0xff;
+ break;
+ case UDP_V4_FLOW:
+ case UDP_V6_FLOW:
+ match->key.basic.ip_proto = IPPROTO_UDP;
+ match->mask.basic.ip_proto = 0xff;
+ break;
+ }
+
+ match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_BASIC);
+ match->dissector.offset[FLOW_DISSECTOR_KEY_BASIC] =
+ offsetof(struct ethtool_rx_flow_key, basic);
+
+ if (fs->flow_type & FLOW_EXT) {
+ const struct ethtool_flow_ext *ext_h_spec = &fs->h_ext;
+ const struct ethtool_flow_ext *ext_m_spec = &fs->m_ext;
+
+ if (ext_m_spec->vlan_etype) {
+ match->key.vlan.vlan_tpid = ext_h_spec->vlan_etype;
+ match->mask.vlan.vlan_tpid = ext_m_spec->vlan_etype;
+ }
+
+ if (ext_m_spec->vlan_tci) {
+ match->key.vlan.vlan_id =
+ ntohs(ext_h_spec->vlan_tci) & 0x0fff;
+ match->mask.vlan.vlan_id =
+ ntohs(ext_m_spec->vlan_tci) & 0x0fff;
+
+ match->key.vlan.vlan_dei =
+ !!(ext_h_spec->vlan_tci & htons(0x1000));
+ match->mask.vlan.vlan_dei =
+ !!(ext_m_spec->vlan_tci & htons(0x1000));
+
+ match->key.vlan.vlan_priority =
+ (ntohs(ext_h_spec->vlan_tci) & 0xe000) >> 13;
+ match->mask.vlan.vlan_priority =
+ (ntohs(ext_m_spec->vlan_tci) & 0xe000) >> 13;
+ }
+
+ if (ext_m_spec->vlan_etype ||
+ ext_m_spec->vlan_tci) {
+ match->dissector.used_keys |=
+ BIT_ULL(FLOW_DISSECTOR_KEY_VLAN);
+ match->dissector.offset[FLOW_DISSECTOR_KEY_VLAN] =
+ offsetof(struct ethtool_rx_flow_key, vlan);
+ }
+ }
+ if (fs->flow_type & FLOW_MAC_EXT) {
+ const struct ethtool_flow_ext *ext_h_spec = &fs->h_ext;
+ const struct ethtool_flow_ext *ext_m_spec = &fs->m_ext;
+
+ memcpy(match->key.eth_addrs.dst, ext_h_spec->h_dest,
+ ETH_ALEN);
+ memcpy(match->mask.eth_addrs.dst, ext_m_spec->h_dest,
+ ETH_ALEN);
+
+ match->dissector.used_keys |=
+ BIT_ULL(FLOW_DISSECTOR_KEY_ETH_ADDRS);
+ match->dissector.offset[FLOW_DISSECTOR_KEY_ETH_ADDRS] =
+ offsetof(struct ethtool_rx_flow_key, eth_addrs);
+ }
+
+ act = &flow->rule->action.entries[0];
+ switch (fs->ring_cookie) {
+ case RX_CLS_FLOW_DISC:
+ act->id = FLOW_ACTION_DROP;
+ break;
+ case RX_CLS_FLOW_WAKE:
+ act->id = FLOW_ACTION_WAKE;
+ break;
+ default:
+ act->id = FLOW_ACTION_QUEUE;
+ if (fs->flow_type & FLOW_RSS)
+ act->queue.ctx = input->rss_ctx;
+
+ act->queue.vf = ethtool_get_flow_spec_ring_vf(fs->ring_cookie);
+ act->queue.index = ethtool_get_flow_spec_ring(fs->ring_cookie);
+ break;
+ }
+
+ return flow;
+}
+EXPORT_SYMBOL(ethtool_rx_flow_rule_create);
+
+void ethtool_rx_flow_rule_destroy(struct ethtool_rx_flow_rule *flow)
+{
+ kfree(flow->rule);
+ kfree(flow);
+}
+EXPORT_SYMBOL(ethtool_rx_flow_rule_destroy);
diff --git a/net/ethtool/linkinfo.c b/net/ethtool/linkinfo.c
new file mode 100644
index 000000000000..30b8ce275159
--- /dev/null
+++ b/net/ethtool/linkinfo.c
@@ -0,0 +1,145 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include "netlink.h"
+#include "common.h"
+
+struct linkinfo_req_info {
+ struct ethnl_req_info base;
+};
+
+struct linkinfo_reply_data {
+ struct ethnl_reply_data base;
+ struct ethtool_link_ksettings ksettings;
+ struct ethtool_link_settings *lsettings;
+};
+
+#define LINKINFO_REPDATA(__reply_base) \
+ container_of(__reply_base, struct linkinfo_reply_data, base)
+
+const struct nla_policy ethnl_linkinfo_get_policy[] = {
+ [ETHTOOL_A_LINKINFO_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
+};
+
+static int linkinfo_prepare_data(const struct ethnl_req_info *req_base,
+ struct ethnl_reply_data *reply_base,
+ const struct genl_info *info)
+{
+ struct linkinfo_reply_data *data = LINKINFO_REPDATA(reply_base);
+ struct net_device *dev = reply_base->dev;
+ int ret;
+
+ data->lsettings = &data->ksettings.base;
+
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ return ret;
+ ret = __ethtool_get_link_ksettings(dev, &data->ksettings);
+ if (ret < 0)
+ GENL_SET_ERR_MSG(info, "failed to retrieve link settings");
+ ethnl_ops_complete(dev);
+
+ return ret;
+}
+
+static int linkinfo_reply_size(const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ return nla_total_size(sizeof(u8)) /* LINKINFO_PORT */
+ + nla_total_size(sizeof(u8)) /* LINKINFO_PHYADDR */
+ + nla_total_size(sizeof(u8)) /* LINKINFO_TP_MDIX */
+ + nla_total_size(sizeof(u8)) /* LINKINFO_TP_MDIX_CTRL */
+ + nla_total_size(sizeof(u8)) /* LINKINFO_TRANSCEIVER */
+ + 0;
+}
+
+static int linkinfo_fill_reply(struct sk_buff *skb,
+ const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ const struct linkinfo_reply_data *data = LINKINFO_REPDATA(reply_base);
+
+ if (nla_put_u8(skb, ETHTOOL_A_LINKINFO_PORT, data->lsettings->port) ||
+ nla_put_u8(skb, ETHTOOL_A_LINKINFO_PHYADDR,
+ data->lsettings->phy_address) ||
+ nla_put_u8(skb, ETHTOOL_A_LINKINFO_TP_MDIX,
+ data->lsettings->eth_tp_mdix) ||
+ nla_put_u8(skb, ETHTOOL_A_LINKINFO_TP_MDIX_CTRL,
+ data->lsettings->eth_tp_mdix_ctrl) ||
+ nla_put_u8(skb, ETHTOOL_A_LINKINFO_TRANSCEIVER,
+ data->lsettings->transceiver))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+/* LINKINFO_SET */
+
+const struct nla_policy ethnl_linkinfo_set_policy[] = {
+ [ETHTOOL_A_LINKINFO_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
+ [ETHTOOL_A_LINKINFO_PORT] = { .type = NLA_U8 },
+ [ETHTOOL_A_LINKINFO_PHYADDR] = { .type = NLA_U8 },
+ [ETHTOOL_A_LINKINFO_TP_MDIX_CTRL] = { .type = NLA_U8 },
+};
+
+static int
+ethnl_set_linkinfo_validate(struct ethnl_req_info *req_info,
+ struct genl_info *info)
+{
+ const struct ethtool_ops *ops = req_info->dev->ethtool_ops;
+
+ if (!ops->get_link_ksettings || !ops->set_link_ksettings)
+ return -EOPNOTSUPP;
+ return 1;
+}
+
+static int
+ethnl_set_linkinfo(struct ethnl_req_info *req_info, struct genl_info *info)
+{
+ struct ethtool_link_ksettings ksettings = {};
+ struct ethtool_link_settings *lsettings;
+ struct net_device *dev = req_info->dev;
+ struct nlattr **tb = info->attrs;
+ bool mod = false;
+ int ret;
+
+ ret = __ethtool_get_link_ksettings(dev, &ksettings);
+ if (ret < 0) {
+ GENL_SET_ERR_MSG(info, "failed to retrieve link settings");
+ return ret;
+ }
+ lsettings = &ksettings.base;
+
+ ethnl_update_u8(&lsettings->port, tb[ETHTOOL_A_LINKINFO_PORT], &mod);
+ ethnl_update_u8(&lsettings->phy_address, tb[ETHTOOL_A_LINKINFO_PHYADDR],
+ &mod);
+ ethnl_update_u8(&lsettings->eth_tp_mdix_ctrl,
+ tb[ETHTOOL_A_LINKINFO_TP_MDIX_CTRL], &mod);
+ if (!mod)
+ return 0;
+
+ ret = dev->ethtool_ops->set_link_ksettings(dev, &ksettings);
+ if (ret < 0) {
+ GENL_SET_ERR_MSG(info, "link settings update failed");
+ return ret;
+ }
+
+ return 1;
+}
+
+const struct ethnl_request_ops ethnl_linkinfo_request_ops = {
+ .request_cmd = ETHTOOL_MSG_LINKINFO_GET,
+ .reply_cmd = ETHTOOL_MSG_LINKINFO_GET_REPLY,
+ .hdr_attr = ETHTOOL_A_LINKINFO_HEADER,
+ .req_info_size = sizeof(struct linkinfo_req_info),
+ .reply_data_size = sizeof(struct linkinfo_reply_data),
+
+ .prepare_data = linkinfo_prepare_data,
+ .reply_size = linkinfo_reply_size,
+ .fill_reply = linkinfo_fill_reply,
+
+ .set_validate = ethnl_set_linkinfo_validate,
+ .set = ethnl_set_linkinfo,
+ .set_ntf_cmd = ETHTOOL_MSG_LINKINFO_NTF,
+};
diff --git a/net/ethtool/linkmodes.c b/net/ethtool/linkmodes.c
new file mode 100644
index 000000000000..259cd9ef1f2a
--- /dev/null
+++ b/net/ethtool/linkmodes.c
@@ -0,0 +1,362 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include "netlink.h"
+#include "common.h"
+#include "bitset.h"
+
+/* LINKMODES_GET */
+
+struct linkmodes_req_info {
+ struct ethnl_req_info base;
+};
+
+struct linkmodes_reply_data {
+ struct ethnl_reply_data base;
+ struct ethtool_link_ksettings ksettings;
+ struct ethtool_link_settings *lsettings;
+ bool peer_empty;
+};
+
+#define LINKMODES_REPDATA(__reply_base) \
+ container_of(__reply_base, struct linkmodes_reply_data, base)
+
+const struct nla_policy ethnl_linkmodes_get_policy[] = {
+ [ETHTOOL_A_LINKMODES_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
+};
+
+static int linkmodes_prepare_data(const struct ethnl_req_info *req_base,
+ struct ethnl_reply_data *reply_base,
+ const struct genl_info *info)
+{
+ struct linkmodes_reply_data *data = LINKMODES_REPDATA(reply_base);
+ struct net_device *dev = reply_base->dev;
+ int ret;
+
+ data->lsettings = &data->ksettings.base;
+
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ return ret;
+
+ ret = __ethtool_get_link_ksettings(dev, &data->ksettings);
+ if (ret < 0) {
+ GENL_SET_ERR_MSG(info, "failed to retrieve link settings");
+ goto out;
+ }
+
+ if (!dev->ethtool_ops->cap_link_lanes_supported)
+ data->ksettings.lanes = 0;
+
+ data->peer_empty =
+ bitmap_empty(data->ksettings.link_modes.lp_advertising,
+ __ETHTOOL_LINK_MODE_MASK_NBITS);
+
+out:
+ ethnl_ops_complete(dev);
+ return ret;
+}
+
+static int linkmodes_reply_size(const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ const struct linkmodes_reply_data *data = LINKMODES_REPDATA(reply_base);
+ const struct ethtool_link_ksettings *ksettings = &data->ksettings;
+ const struct ethtool_link_settings *lsettings = &ksettings->base;
+ bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;
+ int len, ret;
+
+ len = nla_total_size(sizeof(u8)) /* LINKMODES_AUTONEG */
+ + nla_total_size(sizeof(u32)) /* LINKMODES_SPEED */
+ + nla_total_size(sizeof(u32)) /* LINKMODES_LANES */
+ + nla_total_size(sizeof(u8)) /* LINKMODES_DUPLEX */
+ + nla_total_size(sizeof(u8)) /* LINKMODES_RATE_MATCHING */
+ + 0;
+ ret = ethnl_bitset_size(ksettings->link_modes.advertising,
+ ksettings->link_modes.supported,
+ __ETHTOOL_LINK_MODE_MASK_NBITS,
+ link_mode_names, compact);
+ if (ret < 0)
+ return ret;
+ len += ret;
+ if (!data->peer_empty) {
+ ret = ethnl_bitset_size(ksettings->link_modes.lp_advertising,
+ NULL, __ETHTOOL_LINK_MODE_MASK_NBITS,
+ link_mode_names, compact);
+ if (ret < 0)
+ return ret;
+ len += ret;
+ }
+
+ if (lsettings->master_slave_cfg != MASTER_SLAVE_CFG_UNSUPPORTED)
+ len += nla_total_size(sizeof(u8));
+
+ if (lsettings->master_slave_state != MASTER_SLAVE_STATE_UNSUPPORTED)
+ len += nla_total_size(sizeof(u8));
+
+ return len;
+}
+
+static int linkmodes_fill_reply(struct sk_buff *skb,
+ const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ const struct linkmodes_reply_data *data = LINKMODES_REPDATA(reply_base);
+ const struct ethtool_link_ksettings *ksettings = &data->ksettings;
+ const struct ethtool_link_settings *lsettings = &ksettings->base;
+ bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;
+ int ret;
+
+ if (nla_put_u8(skb, ETHTOOL_A_LINKMODES_AUTONEG, lsettings->autoneg))
+ return -EMSGSIZE;
+
+ ret = ethnl_put_bitset(skb, ETHTOOL_A_LINKMODES_OURS,
+ ksettings->link_modes.advertising,
+ ksettings->link_modes.supported,
+ __ETHTOOL_LINK_MODE_MASK_NBITS, link_mode_names,
+ compact);
+ if (ret < 0)
+ return -EMSGSIZE;
+ if (!data->peer_empty) {
+ ret = ethnl_put_bitset(skb, ETHTOOL_A_LINKMODES_PEER,
+ ksettings->link_modes.lp_advertising,
+ NULL, __ETHTOOL_LINK_MODE_MASK_NBITS,
+ link_mode_names, compact);
+ if (ret < 0)
+ return -EMSGSIZE;
+ }
+
+ if (nla_put_u32(skb, ETHTOOL_A_LINKMODES_SPEED, lsettings->speed) ||
+ nla_put_u8(skb, ETHTOOL_A_LINKMODES_DUPLEX, lsettings->duplex))
+ return -EMSGSIZE;
+
+ if (ksettings->lanes &&
+ nla_put_u32(skb, ETHTOOL_A_LINKMODES_LANES, ksettings->lanes))
+ return -EMSGSIZE;
+
+ if (lsettings->master_slave_cfg != MASTER_SLAVE_CFG_UNSUPPORTED &&
+ nla_put_u8(skb, ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG,
+ lsettings->master_slave_cfg))
+ return -EMSGSIZE;
+
+ if (lsettings->master_slave_state != MASTER_SLAVE_STATE_UNSUPPORTED &&
+ nla_put_u8(skb, ETHTOOL_A_LINKMODES_MASTER_SLAVE_STATE,
+ lsettings->master_slave_state))
+ return -EMSGSIZE;
+
+ if (nla_put_u8(skb, ETHTOOL_A_LINKMODES_RATE_MATCHING,
+ lsettings->rate_matching))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+/* LINKMODES_SET */
+
+const struct nla_policy ethnl_linkmodes_set_policy[] = {
+ [ETHTOOL_A_LINKMODES_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
+ [ETHTOOL_A_LINKMODES_AUTONEG] = { .type = NLA_U8 },
+ [ETHTOOL_A_LINKMODES_OURS] = { .type = NLA_NESTED },
+ [ETHTOOL_A_LINKMODES_SPEED] = { .type = NLA_U32 },
+ [ETHTOOL_A_LINKMODES_DUPLEX] = { .type = NLA_U8 },
+ [ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG] = { .type = NLA_U8 },
+ [ETHTOOL_A_LINKMODES_LANES] = NLA_POLICY_RANGE(NLA_U32, 1, 8),
+};
+
+/* Set advertised link modes to all supported modes matching requested speed,
+ * lanes and duplex values. Called when autonegotiation is on, speed, lanes or
+ * duplex is requested but no link mode change. This is done in userspace with
+ * ioctl() interface, move it into kernel for netlink.
+ * Returns true if advertised modes bitmap was modified.
+ */
+static bool ethnl_auto_linkmodes(struct ethtool_link_ksettings *ksettings,
+ bool req_speed, bool req_lanes, bool req_duplex)
+{
+ unsigned long *advertising = ksettings->link_modes.advertising;
+ unsigned long *supported = ksettings->link_modes.supported;
+ DECLARE_BITMAP(old_adv, __ETHTOOL_LINK_MODE_MASK_NBITS);
+ unsigned int i;
+
+ bitmap_copy(old_adv, advertising, __ETHTOOL_LINK_MODE_MASK_NBITS);
+
+ for (i = 0; i < __ETHTOOL_LINK_MODE_MASK_NBITS; i++) {
+ const struct link_mode_info *info = &link_mode_params[i];
+
+ if (info->speed == SPEED_UNKNOWN)
+ continue;
+ if (test_bit(i, supported) &&
+ (!req_speed || info->speed == ksettings->base.speed) &&
+ (!req_lanes || info->lanes == ksettings->lanes) &&
+ (!req_duplex || info->duplex == ksettings->base.duplex))
+ set_bit(i, advertising);
+ else
+ clear_bit(i, advertising);
+ }
+
+ return !bitmap_equal(old_adv, advertising,
+ __ETHTOOL_LINK_MODE_MASK_NBITS);
+}
+
+static bool ethnl_validate_master_slave_cfg(u8 cfg)
+{
+ switch (cfg) {
+ case MASTER_SLAVE_CFG_MASTER_PREFERRED:
+ case MASTER_SLAVE_CFG_SLAVE_PREFERRED:
+ case MASTER_SLAVE_CFG_MASTER_FORCE:
+ case MASTER_SLAVE_CFG_SLAVE_FORCE:
+ return true;
+ }
+
+ return false;
+}
+
+static int ethnl_check_linkmodes(struct genl_info *info, struct nlattr **tb)
+{
+ const struct nlattr *master_slave_cfg, *lanes_cfg;
+
+ master_slave_cfg = tb[ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG];
+ if (master_slave_cfg &&
+ !ethnl_validate_master_slave_cfg(nla_get_u8(master_slave_cfg))) {
+ NL_SET_ERR_MSG_ATTR(info->extack, master_slave_cfg,
+ "master/slave value is invalid");
+ return -EOPNOTSUPP;
+ }
+
+ lanes_cfg = tb[ETHTOOL_A_LINKMODES_LANES];
+ if (lanes_cfg && !is_power_of_2(nla_get_u32(lanes_cfg))) {
+ NL_SET_ERR_MSG_ATTR(info->extack, lanes_cfg,
+ "lanes value is invalid");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int ethnl_update_linkmodes(struct genl_info *info, struct nlattr **tb,
+ struct ethtool_link_ksettings *ksettings,
+ bool *mod, const struct net_device *dev)
+{
+ struct ethtool_link_settings *lsettings = &ksettings->base;
+ bool req_speed, req_lanes, req_duplex;
+ const struct nlattr *master_slave_cfg, *lanes_cfg;
+ int ret;
+
+ master_slave_cfg = tb[ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG];
+ if (master_slave_cfg) {
+ if (lsettings->master_slave_cfg == MASTER_SLAVE_CFG_UNSUPPORTED) {
+ NL_SET_ERR_MSG_ATTR(info->extack, master_slave_cfg,
+ "master/slave configuration not supported by device");
+ return -EOPNOTSUPP;
+ }
+ }
+
+ *mod = false;
+ req_speed = tb[ETHTOOL_A_LINKMODES_SPEED];
+ req_lanes = tb[ETHTOOL_A_LINKMODES_LANES];
+ req_duplex = tb[ETHTOOL_A_LINKMODES_DUPLEX];
+
+ ethnl_update_u8(&lsettings->autoneg, tb[ETHTOOL_A_LINKMODES_AUTONEG],
+ mod);
+
+ lanes_cfg = tb[ETHTOOL_A_LINKMODES_LANES];
+ if (lanes_cfg) {
+ /* If autoneg is off and lanes parameter is not supported by the
+ * driver, return an error.
+ */
+ if (!lsettings->autoneg &&
+ !dev->ethtool_ops->cap_link_lanes_supported) {
+ NL_SET_ERR_MSG_ATTR(info->extack, lanes_cfg,
+ "lanes configuration not supported by device");
+ return -EOPNOTSUPP;
+ }
+ } else if (!lsettings->autoneg && ksettings->lanes) {
+ /* If autoneg is off and lanes parameter is not passed from user but
+ * it was defined previously then set the lanes parameter to 0.
+ */
+ ksettings->lanes = 0;
+ *mod = true;
+ }
+
+ ret = ethnl_update_bitset(ksettings->link_modes.advertising,
+ __ETHTOOL_LINK_MODE_MASK_NBITS,
+ tb[ETHTOOL_A_LINKMODES_OURS], link_mode_names,
+ info->extack, mod);
+ if (ret < 0)
+ return ret;
+ ethnl_update_u32(&lsettings->speed, tb[ETHTOOL_A_LINKMODES_SPEED],
+ mod);
+ ethnl_update_u32(&ksettings->lanes, lanes_cfg, mod);
+ ethnl_update_u8(&lsettings->duplex, tb[ETHTOOL_A_LINKMODES_DUPLEX],
+ mod);
+ ethnl_update_u8(&lsettings->master_slave_cfg, master_slave_cfg, mod);
+
+ if (!tb[ETHTOOL_A_LINKMODES_OURS] && lsettings->autoneg &&
+ (req_speed || req_lanes || req_duplex) &&
+ ethnl_auto_linkmodes(ksettings, req_speed, req_lanes, req_duplex))
+ *mod = true;
+
+ return 0;
+}
+
+static int
+ethnl_set_linkmodes_validate(struct ethnl_req_info *req_info,
+ struct genl_info *info)
+{
+ const struct ethtool_ops *ops = req_info->dev->ethtool_ops;
+ int ret;
+
+ ret = ethnl_check_linkmodes(info, info->attrs);
+ if (ret < 0)
+ return ret;
+
+ if (!ops->get_link_ksettings || !ops->set_link_ksettings)
+ return -EOPNOTSUPP;
+ return 1;
+}
+
+static int
+ethnl_set_linkmodes(struct ethnl_req_info *req_info, struct genl_info *info)
+{
+ struct ethtool_link_ksettings ksettings = {};
+ struct net_device *dev = req_info->dev;
+ struct nlattr **tb = info->attrs;
+ bool mod = false;
+ int ret;
+
+ ret = __ethtool_get_link_ksettings(dev, &ksettings);
+ if (ret < 0) {
+ GENL_SET_ERR_MSG(info, "failed to retrieve link settings");
+ return ret;
+ }
+
+ ret = ethnl_update_linkmodes(info, tb, &ksettings, &mod, dev);
+ if (ret < 0)
+ return ret;
+ if (!mod)
+ return 0;
+
+ ret = dev->ethtool_ops->set_link_ksettings(dev, &ksettings);
+ if (ret < 0) {
+ GENL_SET_ERR_MSG(info, "link settings update failed");
+ return ret;
+ }
+
+ return 1;
+}
+
+const struct ethnl_request_ops ethnl_linkmodes_request_ops = {
+ .request_cmd = ETHTOOL_MSG_LINKMODES_GET,
+ .reply_cmd = ETHTOOL_MSG_LINKMODES_GET_REPLY,
+ .hdr_attr = ETHTOOL_A_LINKMODES_HEADER,
+ .req_info_size = sizeof(struct linkmodes_req_info),
+ .reply_data_size = sizeof(struct linkmodes_reply_data),
+
+ .prepare_data = linkmodes_prepare_data,
+ .reply_size = linkmodes_reply_size,
+ .fill_reply = linkmodes_fill_reply,
+
+ .set_validate = ethnl_set_linkmodes_validate,
+ .set = ethnl_set_linkmodes,
+ .set_ntf_cmd = ETHTOOL_MSG_LINKMODES_NTF,
+};
diff --git a/net/ethtool/linkstate.c b/net/ethtool/linkstate.c
new file mode 100644
index 000000000000..05a5f72c99fa
--- /dev/null
+++ b/net/ethtool/linkstate.c
@@ -0,0 +1,227 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include "netlink.h"
+#include "common.h"
+#include <linux/phy.h>
+#include <linux/phylib_stubs.h>
+
+struct linkstate_req_info {
+ struct ethnl_req_info base;
+};
+
+struct linkstate_reply_data {
+ struct ethnl_reply_data base;
+ int link;
+ int sqi;
+ int sqi_max;
+ struct ethtool_link_ext_stats link_stats;
+ bool link_ext_state_provided;
+ struct ethtool_link_ext_state_info ethtool_link_ext_state_info;
+};
+
+#define LINKSTATE_REPDATA(__reply_base) \
+ container_of(__reply_base, struct linkstate_reply_data, base)
+
+const struct nla_policy ethnl_linkstate_get_policy[] = {
+ [ETHTOOL_A_LINKSTATE_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy_stats),
+};
+
+static int linkstate_get_sqi(struct phy_device *phydev)
+{
+ int ret;
+
+ if (!phydev)
+ return -EOPNOTSUPP;
+
+ mutex_lock(&phydev->lock);
+ if (!phydev->drv || !phydev->drv->get_sqi)
+ ret = -EOPNOTSUPP;
+ else if (!phydev->link)
+ ret = -ENETDOWN;
+ else
+ ret = phydev->drv->get_sqi(phydev);
+ mutex_unlock(&phydev->lock);
+
+ return ret;
+}
+
+static int linkstate_get_sqi_max(struct phy_device *phydev)
+{
+ int ret;
+
+ if (!phydev)
+ return -EOPNOTSUPP;
+
+ mutex_lock(&phydev->lock);
+ if (!phydev->drv || !phydev->drv->get_sqi_max)
+ ret = -EOPNOTSUPP;
+ else if (!phydev->link)
+ ret = -ENETDOWN;
+ else
+ ret = phydev->drv->get_sqi_max(phydev);
+ mutex_unlock(&phydev->lock);
+
+ return ret;
+};
+
+static bool linkstate_sqi_critical_error(int sqi)
+{
+ return sqi < 0 && sqi != -EOPNOTSUPP && sqi != -ENETDOWN;
+}
+
+static bool linkstate_sqi_valid(struct linkstate_reply_data *data)
+{
+ return data->sqi >= 0 && data->sqi_max >= 0 &&
+ data->sqi <= data->sqi_max;
+}
+
+static int linkstate_get_link_ext_state(struct net_device *dev,
+ struct linkstate_reply_data *data)
+{
+ int err;
+
+ if (!dev->ethtool_ops->get_link_ext_state)
+ return -EOPNOTSUPP;
+
+ err = dev->ethtool_ops->get_link_ext_state(dev, &data->ethtool_link_ext_state_info);
+ if (err)
+ return err;
+
+ data->link_ext_state_provided = true;
+
+ return 0;
+}
+
+static int linkstate_prepare_data(const struct ethnl_req_info *req_base,
+ struct ethnl_reply_data *reply_base,
+ const struct genl_info *info)
+{
+ struct linkstate_reply_data *data = LINKSTATE_REPDATA(reply_base);
+ struct net_device *dev = reply_base->dev;
+ struct nlattr **tb = info->attrs;
+ struct phy_device *phydev;
+ int ret;
+
+ phydev = ethnl_req_get_phydev(req_base, tb, ETHTOOL_A_LINKSTATE_HEADER,
+ info->extack);
+ if (IS_ERR(phydev)) {
+ ret = PTR_ERR(phydev);
+ goto out;
+ }
+
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ return ret;
+ data->link = __ethtool_get_link(dev);
+
+ ret = linkstate_get_sqi(phydev);
+ if (linkstate_sqi_critical_error(ret))
+ goto out;
+ data->sqi = ret;
+
+ ret = linkstate_get_sqi_max(phydev);
+ if (linkstate_sqi_critical_error(ret))
+ goto out;
+ data->sqi_max = ret;
+
+ if (dev->flags & IFF_UP) {
+ ret = linkstate_get_link_ext_state(dev, data);
+ if (ret < 0 && ret != -EOPNOTSUPP && ret != -ENODATA)
+ goto out;
+ }
+
+ ethtool_stats_init((u64 *)&data->link_stats,
+ sizeof(data->link_stats) / 8);
+
+ if (req_base->flags & ETHTOOL_FLAG_STATS) {
+ if (phydev)
+ phy_ethtool_get_link_ext_stats(phydev,
+ &data->link_stats);
+
+ if (dev->ethtool_ops->get_link_ext_stats)
+ dev->ethtool_ops->get_link_ext_stats(dev,
+ &data->link_stats);
+ }
+
+ ret = 0;
+out:
+ ethnl_ops_complete(dev);
+ return ret;
+}
+
+static int linkstate_reply_size(const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ struct linkstate_reply_data *data = LINKSTATE_REPDATA(reply_base);
+ int len;
+
+ len = nla_total_size(sizeof(u8)) /* LINKSTATE_LINK */
+ + 0;
+
+ if (linkstate_sqi_valid(data)) {
+ len += nla_total_size(sizeof(u32)); /* LINKSTATE_SQI */
+ len += nla_total_size(sizeof(u32)); /* LINKSTATE_SQI_MAX */
+ }
+
+ if (data->link_ext_state_provided)
+ len += nla_total_size(sizeof(u8)); /* LINKSTATE_EXT_STATE */
+
+ if (data->ethtool_link_ext_state_info.__link_ext_substate)
+ len += nla_total_size(sizeof(u8)); /* LINKSTATE_EXT_SUBSTATE */
+
+ if (data->link_stats.link_down_events != ETHTOOL_STAT_NOT_SET)
+ len += nla_total_size(sizeof(u32));
+
+ return len;
+}
+
+static int linkstate_fill_reply(struct sk_buff *skb,
+ const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ struct linkstate_reply_data *data = LINKSTATE_REPDATA(reply_base);
+
+ if (data->link >= 0 &&
+ nla_put_u8(skb, ETHTOOL_A_LINKSTATE_LINK, !!data->link))
+ return -EMSGSIZE;
+
+ if (linkstate_sqi_valid(data)) {
+ if (nla_put_u32(skb, ETHTOOL_A_LINKSTATE_SQI, data->sqi))
+ return -EMSGSIZE;
+
+ if (nla_put_u32(skb, ETHTOOL_A_LINKSTATE_SQI_MAX,
+ data->sqi_max))
+ return -EMSGSIZE;
+ }
+
+ if (data->link_ext_state_provided) {
+ if (nla_put_u8(skb, ETHTOOL_A_LINKSTATE_EXT_STATE,
+ data->ethtool_link_ext_state_info.link_ext_state))
+ return -EMSGSIZE;
+
+ if (data->ethtool_link_ext_state_info.__link_ext_substate &&
+ nla_put_u8(skb, ETHTOOL_A_LINKSTATE_EXT_SUBSTATE,
+ data->ethtool_link_ext_state_info.__link_ext_substate))
+ return -EMSGSIZE;
+ }
+
+ if (data->link_stats.link_down_events != ETHTOOL_STAT_NOT_SET)
+ if (nla_put_u32(skb, ETHTOOL_A_LINKSTATE_EXT_DOWN_CNT,
+ data->link_stats.link_down_events))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+const struct ethnl_request_ops ethnl_linkstate_request_ops = {
+ .request_cmd = ETHTOOL_MSG_LINKSTATE_GET,
+ .reply_cmd = ETHTOOL_MSG_LINKSTATE_GET_REPLY,
+ .hdr_attr = ETHTOOL_A_LINKSTATE_HEADER,
+ .req_info_size = sizeof(struct linkstate_req_info),
+ .reply_data_size = sizeof(struct linkstate_reply_data),
+
+ .prepare_data = linkstate_prepare_data,
+ .reply_size = linkstate_reply_size,
+ .fill_reply = linkstate_fill_reply,
+};
diff --git a/net/ethtool/mm.c b/net/ethtool/mm.c
new file mode 100644
index 000000000000..29bbbc149375
--- /dev/null
+++ b/net/ethtool/mm.c
@@ -0,0 +1,561 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2022-2025 NXP
+ * Copyright 2024 Furong Xu <0x1207@gmail.com>
+ */
+#include "common.h"
+#include "netlink.h"
+
+struct mm_req_info {
+ struct ethnl_req_info base;
+};
+
+struct mm_reply_data {
+ struct ethnl_reply_data base;
+ struct ethtool_mm_state state;
+ struct ethtool_mm_stats stats;
+};
+
+#define MM_REPDATA(__reply_base) \
+ container_of(__reply_base, struct mm_reply_data, base)
+
+#define ETHTOOL_MM_STAT_CNT \
+ (__ETHTOOL_A_MM_STAT_CNT - (ETHTOOL_A_MM_STAT_PAD + 1))
+
+const struct nla_policy ethnl_mm_get_policy[ETHTOOL_A_MM_HEADER + 1] = {
+ [ETHTOOL_A_MM_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy_stats),
+};
+
+static int mm_prepare_data(const struct ethnl_req_info *req_base,
+ struct ethnl_reply_data *reply_base,
+ const struct genl_info *info)
+{
+ struct mm_reply_data *data = MM_REPDATA(reply_base);
+ struct net_device *dev = reply_base->dev;
+ const struct ethtool_ops *ops;
+ int ret;
+
+ ops = dev->ethtool_ops;
+
+ if (!ops->get_mm)
+ return -EOPNOTSUPP;
+
+ ethtool_stats_init((u64 *)&data->stats,
+ sizeof(data->stats) / sizeof(u64));
+
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ return ret;
+
+ ret = ops->get_mm(dev, &data->state);
+ if (ret)
+ goto out_complete;
+
+ if (ops->get_mm_stats && (req_base->flags & ETHTOOL_FLAG_STATS))
+ ops->get_mm_stats(dev, &data->stats);
+
+out_complete:
+ ethnl_ops_complete(dev);
+
+ return ret;
+}
+
+static int mm_reply_size(const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ int len = 0;
+
+ len += nla_total_size(sizeof(u8)); /* _MM_PMAC_ENABLED */
+ len += nla_total_size(sizeof(u8)); /* _MM_TX_ENABLED */
+ len += nla_total_size(sizeof(u8)); /* _MM_TX_ACTIVE */
+ len += nla_total_size(sizeof(u8)); /* _MM_VERIFY_ENABLED */
+ len += nla_total_size(sizeof(u8)); /* _MM_VERIFY_STATUS */
+ len += nla_total_size(sizeof(u32)); /* _MM_VERIFY_TIME */
+ len += nla_total_size(sizeof(u32)); /* _MM_MAX_VERIFY_TIME */
+ len += nla_total_size(sizeof(u32)); /* _MM_TX_MIN_FRAG_SIZE */
+ len += nla_total_size(sizeof(u32)); /* _MM_RX_MIN_FRAG_SIZE */
+
+ if (req_base->flags & ETHTOOL_FLAG_STATS)
+ len += nla_total_size(0) + /* _MM_STATS */
+ nla_total_size_64bit(sizeof(u64)) * ETHTOOL_MM_STAT_CNT;
+
+ return len;
+}
+
+static int mm_put_stat(struct sk_buff *skb, u64 val, u16 attrtype)
+{
+ if (val == ETHTOOL_STAT_NOT_SET)
+ return 0;
+ if (nla_put_u64_64bit(skb, attrtype, val, ETHTOOL_A_MM_STAT_PAD))
+ return -EMSGSIZE;
+ return 0;
+}
+
+static int mm_put_stats(struct sk_buff *skb,
+ const struct ethtool_mm_stats *stats)
+{
+ struct nlattr *nest;
+
+ nest = nla_nest_start(skb, ETHTOOL_A_MM_STATS);
+ if (!nest)
+ return -EMSGSIZE;
+
+ if (mm_put_stat(skb, stats->MACMergeFrameAssErrorCount,
+ ETHTOOL_A_MM_STAT_REASSEMBLY_ERRORS) ||
+ mm_put_stat(skb, stats->MACMergeFrameSmdErrorCount,
+ ETHTOOL_A_MM_STAT_SMD_ERRORS) ||
+ mm_put_stat(skb, stats->MACMergeFrameAssOkCount,
+ ETHTOOL_A_MM_STAT_REASSEMBLY_OK) ||
+ mm_put_stat(skb, stats->MACMergeFragCountRx,
+ ETHTOOL_A_MM_STAT_RX_FRAG_COUNT) ||
+ mm_put_stat(skb, stats->MACMergeFragCountTx,
+ ETHTOOL_A_MM_STAT_TX_FRAG_COUNT) ||
+ mm_put_stat(skb, stats->MACMergeHoldCount,
+ ETHTOOL_A_MM_STAT_HOLD_COUNT))
+ goto err_cancel;
+
+ nla_nest_end(skb, nest);
+ return 0;
+
+err_cancel:
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+}
+
+static int mm_fill_reply(struct sk_buff *skb,
+ const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ const struct mm_reply_data *data = MM_REPDATA(reply_base);
+ const struct ethtool_mm_state *state = &data->state;
+
+ if (nla_put_u8(skb, ETHTOOL_A_MM_TX_ENABLED, state->tx_enabled) ||
+ nla_put_u8(skb, ETHTOOL_A_MM_TX_ACTIVE, state->tx_active) ||
+ nla_put_u8(skb, ETHTOOL_A_MM_PMAC_ENABLED, state->pmac_enabled) ||
+ nla_put_u8(skb, ETHTOOL_A_MM_VERIFY_ENABLED, state->verify_enabled) ||
+ nla_put_u8(skb, ETHTOOL_A_MM_VERIFY_STATUS, state->verify_status) ||
+ nla_put_u32(skb, ETHTOOL_A_MM_VERIFY_TIME, state->verify_time) ||
+ nla_put_u32(skb, ETHTOOL_A_MM_MAX_VERIFY_TIME, state->max_verify_time) ||
+ nla_put_u32(skb, ETHTOOL_A_MM_TX_MIN_FRAG_SIZE, state->tx_min_frag_size) ||
+ nla_put_u32(skb, ETHTOOL_A_MM_RX_MIN_FRAG_SIZE, state->rx_min_frag_size))
+ return -EMSGSIZE;
+
+ if (req_base->flags & ETHTOOL_FLAG_STATS &&
+ mm_put_stats(skb, &data->stats))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+const struct nla_policy ethnl_mm_set_policy[ETHTOOL_A_MM_MAX + 1] = {
+ [ETHTOOL_A_MM_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy),
+ [ETHTOOL_A_MM_VERIFY_ENABLED] = NLA_POLICY_MAX(NLA_U8, 1),
+ [ETHTOOL_A_MM_VERIFY_TIME] = NLA_POLICY_RANGE(NLA_U32, 1, 128),
+ [ETHTOOL_A_MM_TX_ENABLED] = NLA_POLICY_MAX(NLA_U8, 1),
+ [ETHTOOL_A_MM_PMAC_ENABLED] = NLA_POLICY_MAX(NLA_U8, 1),
+ [ETHTOOL_A_MM_TX_MIN_FRAG_SIZE] = NLA_POLICY_RANGE(NLA_U32, 60, 252),
+};
+
+static void mm_state_to_cfg(const struct ethtool_mm_state *state,
+ struct ethtool_mm_cfg *cfg)
+{
+ /* We could also compare state->verify_status against
+ * ETHTOOL_MM_VERIFY_STATUS_DISABLED, but state->verify_enabled
+ * is more like an administrative state which should be seen in
+ * ETHTOOL_MSG_MM_GET replies. For example, a port with verification
+ * disabled might be in the ETHTOOL_MM_VERIFY_STATUS_INITIAL
+ * if it's down.
+ */
+ cfg->verify_enabled = state->verify_enabled;
+ cfg->verify_time = state->verify_time;
+ cfg->tx_enabled = state->tx_enabled;
+ cfg->pmac_enabled = state->pmac_enabled;
+ cfg->tx_min_frag_size = state->tx_min_frag_size;
+}
+
+static int
+ethnl_set_mm_validate(struct ethnl_req_info *req_info, struct genl_info *info)
+{
+ const struct ethtool_ops *ops = req_info->dev->ethtool_ops;
+
+ return ops->get_mm && ops->set_mm ? 1 : -EOPNOTSUPP;
+}
+
+static int ethnl_set_mm(struct ethnl_req_info *req_info, struct genl_info *info)
+{
+ struct netlink_ext_ack *extack = info->extack;
+ struct net_device *dev = req_info->dev;
+ struct ethtool_mm_state state = {};
+ struct nlattr **tb = info->attrs;
+ struct ethtool_mm_cfg cfg = {};
+ bool mod = false;
+ int ret;
+
+ ret = dev->ethtool_ops->get_mm(dev, &state);
+ if (ret)
+ return ret;
+
+ mm_state_to_cfg(&state, &cfg);
+
+ ethnl_update_bool(&cfg.verify_enabled, tb[ETHTOOL_A_MM_VERIFY_ENABLED],
+ &mod);
+ ethnl_update_u32(&cfg.verify_time, tb[ETHTOOL_A_MM_VERIFY_TIME], &mod);
+ ethnl_update_bool(&cfg.tx_enabled, tb[ETHTOOL_A_MM_TX_ENABLED], &mod);
+ ethnl_update_bool(&cfg.pmac_enabled, tb[ETHTOOL_A_MM_PMAC_ENABLED],
+ &mod);
+ ethnl_update_u32(&cfg.tx_min_frag_size,
+ tb[ETHTOOL_A_MM_TX_MIN_FRAG_SIZE], &mod);
+
+ if (!mod)
+ return 0;
+
+ if (cfg.verify_time > state.max_verify_time) {
+ NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_MM_VERIFY_TIME],
+ "verifyTime exceeds device maximum");
+ return -ERANGE;
+ }
+
+ if (cfg.verify_enabled && !cfg.tx_enabled) {
+ NL_SET_ERR_MSG(extack, "Verification requires TX enabled");
+ return -EINVAL;
+ }
+
+ if (cfg.tx_enabled && !cfg.pmac_enabled) {
+ NL_SET_ERR_MSG(extack, "TX enabled requires pMAC enabled");
+ return -EINVAL;
+ }
+
+ ret = dev->ethtool_ops->set_mm(dev, &cfg, extack);
+ return ret < 0 ? ret : 1;
+}
+
+const struct ethnl_request_ops ethnl_mm_request_ops = {
+ .request_cmd = ETHTOOL_MSG_MM_GET,
+ .reply_cmd = ETHTOOL_MSG_MM_GET_REPLY,
+ .hdr_attr = ETHTOOL_A_MM_HEADER,
+ .req_info_size = sizeof(struct mm_req_info),
+ .reply_data_size = sizeof(struct mm_reply_data),
+
+ .prepare_data = mm_prepare_data,
+ .reply_size = mm_reply_size,
+ .fill_reply = mm_fill_reply,
+
+ .set_validate = ethnl_set_mm_validate,
+ .set = ethnl_set_mm,
+ .set_ntf_cmd = ETHTOOL_MSG_MM_NTF,
+};
+
+/* Returns whether a given device supports the MAC merge layer
+ * (has an eMAC and a pMAC). Must be called under rtnl_lock() and
+ * ethnl_ops_begin().
+ */
+bool __ethtool_dev_mm_supported(struct net_device *dev)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct ethtool_mm_state state = {};
+ int ret = -EOPNOTSUPP;
+
+ if (ops && ops->get_mm)
+ ret = ops->get_mm(dev, &state);
+
+ return !ret;
+}
+
+bool ethtool_dev_mm_supported(struct net_device *dev)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ bool supported;
+ int ret;
+
+ ASSERT_RTNL();
+
+ if (!ops)
+ return false;
+
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ return false;
+
+ supported = __ethtool_dev_mm_supported(dev);
+
+ ethnl_ops_complete(dev);
+
+ return supported;
+}
+EXPORT_SYMBOL_GPL(ethtool_dev_mm_supported);
+
+static void ethtool_mmsv_configure_tx(struct ethtool_mmsv *mmsv,
+ bool tx_active)
+{
+ if (mmsv->ops->configure_tx)
+ mmsv->ops->configure_tx(mmsv, tx_active);
+}
+
+static void ethtool_mmsv_configure_pmac(struct ethtool_mmsv *mmsv,
+ bool pmac_enabled)
+{
+ if (mmsv->ops->configure_pmac)
+ mmsv->ops->configure_pmac(mmsv, pmac_enabled);
+}
+
+static void ethtool_mmsv_send_mpacket(struct ethtool_mmsv *mmsv,
+ enum ethtool_mpacket mpacket)
+{
+ if (mmsv->ops->send_mpacket)
+ mmsv->ops->send_mpacket(mmsv, mpacket);
+}
+
+/**
+ * ethtool_mmsv_verify_timer - Timer for MAC Merge verification
+ * @t: timer_list struct containing private info
+ *
+ * Verify the MAC Merge capability in the local TX direction, by
+ * transmitting Verify mPackets up to 3 times. Wait until link
+ * partner responds with a Response mPacket, otherwise fail.
+ */
+static void ethtool_mmsv_verify_timer(struct timer_list *t)
+{
+ struct ethtool_mmsv *mmsv = timer_container_of(mmsv, t, verify_timer);
+ unsigned long flags;
+ bool rearm = false;
+
+ spin_lock_irqsave(&mmsv->lock, flags);
+
+ switch (mmsv->status) {
+ case ETHTOOL_MM_VERIFY_STATUS_INITIAL:
+ case ETHTOOL_MM_VERIFY_STATUS_VERIFYING:
+ if (mmsv->verify_retries != 0) {
+ ethtool_mmsv_send_mpacket(mmsv, ETHTOOL_MPACKET_VERIFY);
+ rearm = true;
+ } else {
+ mmsv->status = ETHTOOL_MM_VERIFY_STATUS_FAILED;
+ }
+
+ mmsv->verify_retries--;
+ break;
+
+ case ETHTOOL_MM_VERIFY_STATUS_SUCCEEDED:
+ ethtool_mmsv_configure_tx(mmsv, true);
+ break;
+
+ default:
+ break;
+ }
+
+ if (rearm) {
+ mod_timer(&mmsv->verify_timer,
+ jiffies + msecs_to_jiffies(mmsv->verify_time));
+ }
+
+ spin_unlock_irqrestore(&mmsv->lock, flags);
+}
+
+static void ethtool_mmsv_verify_timer_arm(struct ethtool_mmsv *mmsv)
+{
+ if (mmsv->pmac_enabled && mmsv->tx_enabled && mmsv->verify_enabled &&
+ mmsv->status != ETHTOOL_MM_VERIFY_STATUS_FAILED &&
+ mmsv->status != ETHTOOL_MM_VERIFY_STATUS_SUCCEEDED) {
+ timer_setup(&mmsv->verify_timer, ethtool_mmsv_verify_timer, 0);
+ mod_timer(&mmsv->verify_timer, jiffies);
+ }
+}
+
+static void ethtool_mmsv_apply(struct ethtool_mmsv *mmsv)
+{
+ /* If verification is disabled, configure FPE right away.
+ * Otherwise let the timer code do it.
+ */
+ if (!mmsv->verify_enabled) {
+ ethtool_mmsv_configure_pmac(mmsv, mmsv->pmac_enabled);
+ ethtool_mmsv_configure_tx(mmsv, mmsv->tx_enabled);
+ } else {
+ mmsv->status = ETHTOOL_MM_VERIFY_STATUS_INITIAL;
+ mmsv->verify_retries = ETHTOOL_MM_MAX_VERIFY_RETRIES;
+
+ if (netif_running(mmsv->dev))
+ ethtool_mmsv_verify_timer_arm(mmsv);
+ }
+}
+
+/**
+ * ethtool_mmsv_stop() - Stop MAC Merge Software Verification
+ * @mmsv: MAC Merge Software Verification state
+ *
+ * Drivers should call this method in a state where the hardware is
+ * about to lose state, like ndo_stop() or suspend(), and turning off
+ * MAC Merge features would be superfluous. Otherwise, prefer
+ * ethtool_mmsv_link_state_handle() with up=false.
+ */
+void ethtool_mmsv_stop(struct ethtool_mmsv *mmsv)
+{
+ timer_shutdown_sync(&mmsv->verify_timer);
+}
+EXPORT_SYMBOL_GPL(ethtool_mmsv_stop);
+
+/**
+ * ethtool_mmsv_link_state_handle() - Inform MAC Merge Software Verification
+ * of link state changes
+ * @mmsv: MAC Merge Software Verification state
+ * @up: True if device carrier is up and able to pass verification packets
+ *
+ * Calling context is expected to be from a task, interrupts enabled.
+ */
+void ethtool_mmsv_link_state_handle(struct ethtool_mmsv *mmsv, bool up)
+{
+ unsigned long flags;
+
+ ethtool_mmsv_stop(mmsv);
+
+ spin_lock_irqsave(&mmsv->lock, flags);
+
+ if (up && mmsv->pmac_enabled) {
+ /* VERIFY process requires pMAC enabled when NIC comes up */
+ ethtool_mmsv_configure_pmac(mmsv, true);
+
+ /* New link => maybe new partner => new verification process */
+ ethtool_mmsv_apply(mmsv);
+ } else {
+ /* Reset the reported verification state while the link is down */
+ if (mmsv->verify_enabled)
+ mmsv->status = ETHTOOL_MM_VERIFY_STATUS_INITIAL;
+
+ /* No link or pMAC not enabled */
+ ethtool_mmsv_configure_pmac(mmsv, false);
+ ethtool_mmsv_configure_tx(mmsv, false);
+ }
+
+ spin_unlock_irqrestore(&mmsv->lock, flags);
+}
+EXPORT_SYMBOL_GPL(ethtool_mmsv_link_state_handle);
+
+/**
+ * ethtool_mmsv_event_handle() - Inform MAC Merge Software Verification
+ * of interrupt-based events
+ * @mmsv: MAC Merge Software Verification state
+ * @event: Event which took place (packet transmission or reception)
+ *
+ * Calling context expects to have interrupts disabled.
+ */
+void ethtool_mmsv_event_handle(struct ethtool_mmsv *mmsv,
+ enum ethtool_mmsv_event event)
+{
+ /* This is interrupt context, just spin_lock() */
+ spin_lock(&mmsv->lock);
+
+ if (!mmsv->pmac_enabled)
+ goto unlock;
+
+ switch (event) {
+ case ETHTOOL_MMSV_LP_SENT_VERIFY_MPACKET:
+ /* Link partner has sent verify mPacket */
+ ethtool_mmsv_send_mpacket(mmsv, ETHTOOL_MPACKET_RESPONSE);
+ break;
+ case ETHTOOL_MMSV_LD_SENT_VERIFY_MPACKET:
+ /* Local device has sent verify mPacket */
+ if (mmsv->status != ETHTOOL_MM_VERIFY_STATUS_SUCCEEDED)
+ mmsv->status = ETHTOOL_MM_VERIFY_STATUS_VERIFYING;
+ break;
+ case ETHTOOL_MMSV_LP_SENT_RESPONSE_MPACKET:
+ /* Link partner has sent response mPacket */
+ if (mmsv->status == ETHTOOL_MM_VERIFY_STATUS_VERIFYING)
+ mmsv->status = ETHTOOL_MM_VERIFY_STATUS_SUCCEEDED;
+ break;
+ }
+
+unlock:
+ spin_unlock(&mmsv->lock);
+}
+EXPORT_SYMBOL_GPL(ethtool_mmsv_event_handle);
+
+static bool ethtool_mmsv_is_tx_active(struct ethtool_mmsv *mmsv)
+{
+ /* TX is active if administratively enabled, and verification either
+ * succeeded, or was administratively disabled.
+ */
+ return mmsv->tx_enabled &&
+ (mmsv->status == ETHTOOL_MM_VERIFY_STATUS_SUCCEEDED ||
+ mmsv->status == ETHTOOL_MM_VERIFY_STATUS_DISABLED);
+}
+
+/**
+ * ethtool_mmsv_get_mm() - get_mm() hook for MAC Merge Software Verification
+ * @mmsv: MAC Merge Software Verification state
+ * @state: see struct ethtool_mm_state
+ *
+ * Drivers are expected to call this from their ethtool_ops :: get_mm()
+ * method.
+ */
+void ethtool_mmsv_get_mm(struct ethtool_mmsv *mmsv,
+ struct ethtool_mm_state *state)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&mmsv->lock, flags);
+
+ state->max_verify_time = ETHTOOL_MM_MAX_VERIFY_TIME_MS;
+ state->verify_enabled = mmsv->verify_enabled;
+ state->pmac_enabled = mmsv->pmac_enabled;
+ state->verify_time = mmsv->verify_time;
+ state->tx_enabled = mmsv->tx_enabled;
+ state->verify_status = mmsv->status;
+ state->tx_active = ethtool_mmsv_is_tx_active(mmsv);
+
+ spin_unlock_irqrestore(&mmsv->lock, flags);
+}
+EXPORT_SYMBOL_GPL(ethtool_mmsv_get_mm);
+
+/**
+ * ethtool_mmsv_set_mm() - set_mm() hook for MAC Merge Software Verification
+ * @mmsv: MAC Merge Software Verification state
+ * @cfg: see struct ethtool_mm_cfg
+ *
+ * Drivers are expected to call this from their ethtool_ops :: set_mm()
+ * method.
+ */
+void ethtool_mmsv_set_mm(struct ethtool_mmsv *mmsv, struct ethtool_mm_cfg *cfg)
+{
+ unsigned long flags;
+
+ /* Wait for the verification that's currently in progress to finish */
+ ethtool_mmsv_stop(mmsv);
+
+ spin_lock_irqsave(&mmsv->lock, flags);
+
+ mmsv->verify_enabled = cfg->verify_enabled;
+ mmsv->pmac_enabled = cfg->pmac_enabled;
+ mmsv->verify_time = cfg->verify_time;
+ mmsv->tx_enabled = cfg->tx_enabled;
+
+ if (!cfg->verify_enabled)
+ mmsv->status = ETHTOOL_MM_VERIFY_STATUS_DISABLED;
+
+ ethtool_mmsv_apply(mmsv);
+
+ spin_unlock_irqrestore(&mmsv->lock, flags);
+}
+EXPORT_SYMBOL_GPL(ethtool_mmsv_set_mm);
+
+/**
+ * ethtool_mmsv_init() - Initialize MAC Merge Software Verification state
+ * @mmsv: MAC Merge Software Verification state
+ * @dev: Pointer to network interface
+ * @ops: Methods for implementing the generic functionality
+ *
+ * The MAC Merge Software Verification is a timer- and event-based state
+ * machine intended for network interfaces which lack a hardware-based
+ * TX verification process (as per IEEE 802.3 clause 99.4.3). The timer
+ * is managed by the core code, whereas events are supplied by the
+ * driver explicitly calling one of the other API functions.
+ */
+void ethtool_mmsv_init(struct ethtool_mmsv *mmsv, struct net_device *dev,
+ const struct ethtool_mmsv_ops *ops)
+{
+ mmsv->ops = ops;
+ mmsv->dev = dev;
+ mmsv->verify_retries = ETHTOOL_MM_MAX_VERIFY_RETRIES;
+ mmsv->verify_time = ETHTOOL_MM_MAX_VERIFY_TIME_MS;
+ mmsv->status = ETHTOOL_MM_VERIFY_STATUS_DISABLED;
+ timer_setup(&mmsv->verify_timer, ethtool_mmsv_verify_timer, 0);
+ spin_lock_init(&mmsv->lock);
+}
+EXPORT_SYMBOL_GPL(ethtool_mmsv_init);
diff --git a/net/ethtool/module.c b/net/ethtool/module.c
new file mode 100644
index 000000000000..4d4e0a82579a
--- /dev/null
+++ b/net/ethtool/module.c
@@ -0,0 +1,557 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/ethtool.h>
+#include <linux/firmware.h>
+#include <linux/sfp.h>
+#include <net/devlink.h>
+#include <net/netdev_lock.h>
+
+#include "netlink.h"
+#include "common.h"
+#include "bitset.h"
+#include "module_fw.h"
+
+struct module_req_info {
+ struct ethnl_req_info base;
+};
+
+struct module_reply_data {
+ struct ethnl_reply_data base;
+ struct ethtool_module_power_mode_params power;
+};
+
+#define MODULE_REPDATA(__reply_base) \
+ container_of(__reply_base, struct module_reply_data, base)
+
+/* MODULE_GET */
+
+const struct nla_policy ethnl_module_get_policy[ETHTOOL_A_MODULE_HEADER + 1] = {
+ [ETHTOOL_A_MODULE_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy),
+};
+
+static int module_get_power_mode(struct net_device *dev,
+ struct module_reply_data *data,
+ struct netlink_ext_ack *extack)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+
+ if (!ops->get_module_power_mode)
+ return 0;
+
+ if (dev->ethtool->module_fw_flash_in_progress) {
+ NL_SET_ERR_MSG(extack,
+ "Module firmware flashing is in progress");
+ return -EBUSY;
+ }
+
+ return ops->get_module_power_mode(dev, &data->power, extack);
+}
+
+static int module_prepare_data(const struct ethnl_req_info *req_base,
+ struct ethnl_reply_data *reply_base,
+ const struct genl_info *info)
+{
+ struct module_reply_data *data = MODULE_REPDATA(reply_base);
+ struct net_device *dev = reply_base->dev;
+ int ret;
+
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ return ret;
+
+ ret = module_get_power_mode(dev, data, info->extack);
+ if (ret < 0)
+ goto out_complete;
+
+out_complete:
+ ethnl_ops_complete(dev);
+ return ret;
+}
+
+static int module_reply_size(const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ struct module_reply_data *data = MODULE_REPDATA(reply_base);
+ int len = 0;
+
+ if (data->power.policy)
+ len += nla_total_size(sizeof(u8)); /* _MODULE_POWER_MODE_POLICY */
+
+ if (data->power.mode)
+ len += nla_total_size(sizeof(u8)); /* _MODULE_POWER_MODE */
+
+ return len;
+}
+
+static int module_fill_reply(struct sk_buff *skb,
+ const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ const struct module_reply_data *data = MODULE_REPDATA(reply_base);
+
+ if (data->power.policy &&
+ nla_put_u8(skb, ETHTOOL_A_MODULE_POWER_MODE_POLICY,
+ data->power.policy))
+ return -EMSGSIZE;
+
+ if (data->power.mode &&
+ nla_put_u8(skb, ETHTOOL_A_MODULE_POWER_MODE, data->power.mode))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+/* MODULE_SET */
+
+const struct nla_policy ethnl_module_set_policy[ETHTOOL_A_MODULE_POWER_MODE_POLICY + 1] = {
+ [ETHTOOL_A_MODULE_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy),
+ [ETHTOOL_A_MODULE_POWER_MODE_POLICY] =
+ NLA_POLICY_RANGE(NLA_U8, ETHTOOL_MODULE_POWER_MODE_POLICY_HIGH,
+ ETHTOOL_MODULE_POWER_MODE_POLICY_AUTO),
+};
+
+static int
+ethnl_set_module_validate(struct ethnl_req_info *req_info,
+ struct genl_info *info)
+{
+ const struct ethtool_ops *ops = req_info->dev->ethtool_ops;
+ struct nlattr **tb = info->attrs;
+
+ if (!tb[ETHTOOL_A_MODULE_POWER_MODE_POLICY])
+ return 0;
+
+ if (req_info->dev->ethtool->module_fw_flash_in_progress) {
+ NL_SET_ERR_MSG(info->extack,
+ "Module firmware flashing is in progress");
+ return -EBUSY;
+ }
+
+ if (!ops->get_module_power_mode || !ops->set_module_power_mode) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ tb[ETHTOOL_A_MODULE_POWER_MODE_POLICY],
+ "Setting power mode policy is not supported by this device");
+ return -EOPNOTSUPP;
+ }
+
+ return 1;
+}
+
+static int
+ethnl_set_module(struct ethnl_req_info *req_info, struct genl_info *info)
+{
+ struct ethtool_module_power_mode_params power = {};
+ struct ethtool_module_power_mode_params power_new;
+ const struct ethtool_ops *ops;
+ struct net_device *dev = req_info->dev;
+ struct nlattr **tb = info->attrs;
+ int ret;
+
+ ops = dev->ethtool_ops;
+
+ power_new.policy = nla_get_u8(tb[ETHTOOL_A_MODULE_POWER_MODE_POLICY]);
+ ret = ops->get_module_power_mode(dev, &power, info->extack);
+ if (ret < 0)
+ return ret;
+
+ if (power_new.policy == power.policy)
+ return 0;
+
+ ret = ops->set_module_power_mode(dev, &power_new, info->extack);
+ return ret < 0 ? ret : 1;
+}
+
+const struct ethnl_request_ops ethnl_module_request_ops = {
+ .request_cmd = ETHTOOL_MSG_MODULE_GET,
+ .reply_cmd = ETHTOOL_MSG_MODULE_GET_REPLY,
+ .hdr_attr = ETHTOOL_A_MODULE_HEADER,
+ .req_info_size = sizeof(struct module_req_info),
+ .reply_data_size = sizeof(struct module_reply_data),
+
+ .prepare_data = module_prepare_data,
+ .reply_size = module_reply_size,
+ .fill_reply = module_fill_reply,
+
+ .set_validate = ethnl_set_module_validate,
+ .set = ethnl_set_module,
+ .set_ntf_cmd = ETHTOOL_MSG_MODULE_NTF,
+};
+
+/* MODULE_FW_FLASH_ACT */
+
+const struct nla_policy
+ethnl_module_fw_flash_act_policy[ETHTOOL_A_MODULE_FW_FLASH_PASSWORD + 1] = {
+ [ETHTOOL_A_MODULE_FW_FLASH_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
+ [ETHTOOL_A_MODULE_FW_FLASH_FILE_NAME] = { .type = NLA_NUL_STRING },
+ [ETHTOOL_A_MODULE_FW_FLASH_PASSWORD] = { .type = NLA_U32 },
+};
+
+static LIST_HEAD(module_fw_flash_work_list);
+static DEFINE_SPINLOCK(module_fw_flash_work_list_lock);
+
+static int
+module_flash_fw_work_list_add(struct ethtool_module_fw_flash *module_fw,
+ struct genl_info *info)
+{
+ struct ethtool_module_fw_flash *work;
+
+ /* First, check if already registered. */
+ spin_lock(&module_fw_flash_work_list_lock);
+ list_for_each_entry(work, &module_fw_flash_work_list, list) {
+ if (work->fw_update.ntf_params.portid == info->snd_portid &&
+ work->fw_update.dev == module_fw->fw_update.dev) {
+ spin_unlock(&module_fw_flash_work_list_lock);
+ return -EALREADY;
+ }
+ }
+
+ list_add_tail(&module_fw->list, &module_fw_flash_work_list);
+ spin_unlock(&module_fw_flash_work_list_lock);
+
+ return 0;
+}
+
+static void module_flash_fw_work_list_del(struct list_head *list)
+{
+ spin_lock(&module_fw_flash_work_list_lock);
+ list_del(list);
+ spin_unlock(&module_fw_flash_work_list_lock);
+}
+
+static void module_flash_fw_work(struct work_struct *work)
+{
+ struct ethtool_module_fw_flash *module_fw;
+
+ module_fw = container_of(work, struct ethtool_module_fw_flash, work);
+
+ ethtool_cmis_fw_update(&module_fw->fw_update);
+
+ module_flash_fw_work_list_del(&module_fw->list);
+ module_fw->fw_update.dev->ethtool->module_fw_flash_in_progress = false;
+ netdev_put(module_fw->fw_update.dev, &module_fw->dev_tracker);
+ release_firmware(module_fw->fw_update.fw);
+ kfree(module_fw);
+}
+
+#define MODULE_EEPROM_PHYS_ID_PAGE 0
+#define MODULE_EEPROM_PHYS_ID_I2C_ADDR 0x50
+
+static int module_flash_fw_work_init(struct ethtool_module_fw_flash *module_fw,
+ struct net_device *dev,
+ struct netlink_ext_ack *extack)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct ethtool_module_eeprom page_data = {};
+ u8 phys_id;
+ int err;
+
+ /* Fetch the SFF-8024 Identifier Value. For all supported standards, it
+ * is located at I2C address 0x50, byte 0. See section 4.1 in SFF-8024,
+ * revision 4.9.
+ */
+ page_data.page = MODULE_EEPROM_PHYS_ID_PAGE;
+ page_data.offset = SFP_PHYS_ID;
+ page_data.length = sizeof(phys_id);
+ page_data.i2c_address = MODULE_EEPROM_PHYS_ID_I2C_ADDR;
+ page_data.data = &phys_id;
+
+ err = ops->get_module_eeprom_by_page(dev, &page_data, extack);
+ if (err < 0)
+ return err;
+
+ switch (phys_id) {
+ case SFF8024_ID_QSFP_DD:
+ case SFF8024_ID_OSFP:
+ case SFF8024_ID_DSFP:
+ case SFF8024_ID_QSFP_PLUS_CMIS:
+ case SFF8024_ID_SFP_DD_CMIS:
+ case SFF8024_ID_SFP_PLUS_CMIS:
+ INIT_WORK(&module_fw->work, module_flash_fw_work);
+ break;
+ default:
+ NL_SET_ERR_MSG(extack,
+ "Module type does not support firmware flashing");
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
+}
+
+void ethnl_module_fw_flash_sock_destroy(struct ethnl_sock_priv *sk_priv)
+{
+ struct ethtool_module_fw_flash *work;
+
+ spin_lock(&module_fw_flash_work_list_lock);
+ list_for_each_entry(work, &module_fw_flash_work_list, list) {
+ if (work->fw_update.dev == sk_priv->dev &&
+ work->fw_update.ntf_params.portid == sk_priv->portid) {
+ work->fw_update.ntf_params.closed_sock = true;
+ break;
+ }
+ }
+ spin_unlock(&module_fw_flash_work_list_lock);
+}
+
+static int
+module_flash_fw_schedule(struct net_device *dev, const char *file_name,
+ struct ethtool_module_fw_flash_params *params,
+ struct sk_buff *skb, struct genl_info *info)
+{
+ struct ethtool_cmis_fw_update_params *fw_update;
+ struct ethtool_module_fw_flash *module_fw;
+ int err;
+
+ module_fw = kzalloc(sizeof(*module_fw), GFP_KERNEL);
+ if (!module_fw)
+ return -ENOMEM;
+
+ fw_update = &module_fw->fw_update;
+ fw_update->params = *params;
+ err = request_firmware_direct(&fw_update->fw,
+ file_name, &dev->dev);
+ if (err) {
+ NL_SET_ERR_MSG(info->extack,
+ "Failed to request module firmware image");
+ goto err_free;
+ }
+
+ err = module_flash_fw_work_init(module_fw, dev, info->extack);
+ if (err < 0)
+ goto err_release_firmware;
+
+ dev->ethtool->module_fw_flash_in_progress = true;
+ netdev_hold(dev, &module_fw->dev_tracker, GFP_KERNEL);
+ fw_update->dev = dev;
+ fw_update->ntf_params.portid = info->snd_portid;
+ fw_update->ntf_params.seq = info->snd_seq;
+ fw_update->ntf_params.closed_sock = false;
+
+ err = ethnl_sock_priv_set(skb, dev, fw_update->ntf_params.portid,
+ ETHTOOL_SOCK_TYPE_MODULE_FW_FLASH);
+ if (err < 0)
+ goto err_release_firmware;
+
+ err = module_flash_fw_work_list_add(module_fw, info);
+ if (err < 0)
+ goto err_release_firmware;
+
+ schedule_work(&module_fw->work);
+
+ return 0;
+
+err_release_firmware:
+ release_firmware(fw_update->fw);
+err_free:
+ kfree(module_fw);
+ return err;
+}
+
+static int module_flash_fw(struct net_device *dev, struct nlattr **tb,
+ struct sk_buff *skb, struct genl_info *info)
+{
+ struct ethtool_module_fw_flash_params params = {};
+ const char *file_name;
+ struct nlattr *attr;
+
+ if (GENL_REQ_ATTR_CHECK(info, ETHTOOL_A_MODULE_FW_FLASH_FILE_NAME))
+ return -EINVAL;
+
+ file_name = nla_data(tb[ETHTOOL_A_MODULE_FW_FLASH_FILE_NAME]);
+
+ attr = tb[ETHTOOL_A_MODULE_FW_FLASH_PASSWORD];
+ if (attr) {
+ params.password = cpu_to_be32(nla_get_u32(attr));
+ params.password_valid = true;
+ }
+
+ return module_flash_fw_schedule(dev, file_name, &params, skb, info);
+}
+
+static int ethnl_module_fw_flash_validate(struct net_device *dev,
+ struct netlink_ext_ack *extack)
+{
+ struct devlink_port *devlink_port = dev->devlink_port;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+
+ if (!ops->set_module_eeprom_by_page ||
+ !ops->get_module_eeprom_by_page) {
+ NL_SET_ERR_MSG(extack,
+ "Flashing module firmware is not supported by this device");
+ return -EOPNOTSUPP;
+ }
+
+ if (!ops->reset) {
+ NL_SET_ERR_MSG(extack,
+ "Reset module is not supported by this device, so flashing is not permitted");
+ return -EOPNOTSUPP;
+ }
+
+ if (dev->ethtool->module_fw_flash_in_progress) {
+ NL_SET_ERR_MSG(extack, "Module firmware flashing already in progress");
+ return -EBUSY;
+ }
+
+ if (dev->flags & IFF_UP) {
+ NL_SET_ERR_MSG(extack, "Netdevice is up, so flashing is not permitted");
+ return -EBUSY;
+ }
+
+ if (devlink_port && devlink_port->attrs.split) {
+ NL_SET_ERR_MSG(extack, "Can't perform firmware flashing on a split port");
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
+}
+
+int ethnl_act_module_fw_flash(struct sk_buff *skb, struct genl_info *info)
+{
+ struct ethnl_req_info req_info = {};
+ struct nlattr **tb = info->attrs;
+ struct net_device *dev;
+ int ret;
+
+ ret = ethnl_parse_header_dev_get(&req_info,
+ tb[ETHTOOL_A_MODULE_FW_FLASH_HEADER],
+ genl_info_net(info), info->extack,
+ true);
+ if (ret < 0)
+ return ret;
+ dev = req_info.dev;
+
+ rtnl_lock();
+ netdev_lock_ops(dev);
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ goto out_unlock;
+
+ ret = ethnl_module_fw_flash_validate(dev, info->extack);
+ if (ret < 0)
+ goto out_unlock;
+
+ ret = module_flash_fw(dev, tb, skb, info);
+
+ ethnl_ops_complete(dev);
+
+out_unlock:
+ netdev_unlock_ops(dev);
+ rtnl_unlock();
+ ethnl_parse_header_dev_put(&req_info);
+ return ret;
+}
+
+/* MODULE_FW_FLASH_NTF */
+
+static int
+ethnl_module_fw_flash_ntf_put_err(struct sk_buff *skb, char *err_msg,
+ char *sub_err_msg)
+{
+ int err_msg_len, sub_err_msg_len, total_len;
+ struct nlattr *attr;
+
+ if (!err_msg)
+ return 0;
+
+ err_msg_len = strlen(err_msg);
+ total_len = err_msg_len + 2; /* For period and NUL. */
+
+ if (sub_err_msg) {
+ sub_err_msg_len = strlen(sub_err_msg);
+ total_len += sub_err_msg_len + 2; /* For ", ". */
+ }
+
+ attr = nla_reserve(skb, ETHTOOL_A_MODULE_FW_FLASH_STATUS_MSG,
+ total_len);
+ if (!attr)
+ return -ENOMEM;
+
+ if (sub_err_msg)
+ sprintf(nla_data(attr), "%s, %s.", err_msg, sub_err_msg);
+ else
+ sprintf(nla_data(attr), "%s.", err_msg);
+
+ return 0;
+}
+
+static void
+ethnl_module_fw_flash_ntf(struct net_device *dev,
+ enum ethtool_module_fw_flash_status status,
+ struct ethnl_module_fw_flash_ntf_params *ntf_params,
+ char *err_msg, char *sub_err_msg,
+ u64 done, u64 total)
+{
+ struct sk_buff *skb;
+ void *hdr;
+ int ret;
+
+ if (ntf_params->closed_sock)
+ return;
+
+ skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb)
+ return;
+
+ hdr = ethnl_unicast_put(skb, ntf_params->portid, ++ntf_params->seq,
+ ETHTOOL_MSG_MODULE_FW_FLASH_NTF);
+ if (!hdr)
+ goto err_skb;
+
+ ret = ethnl_fill_reply_header(skb, dev,
+ ETHTOOL_A_MODULE_FW_FLASH_HEADER);
+ if (ret < 0)
+ goto err_skb;
+
+ if (nla_put_u32(skb, ETHTOOL_A_MODULE_FW_FLASH_STATUS, status))
+ goto err_skb;
+
+ ret = ethnl_module_fw_flash_ntf_put_err(skb, err_msg, sub_err_msg);
+ if (ret < 0)
+ goto err_skb;
+
+ if (nla_put_uint(skb, ETHTOOL_A_MODULE_FW_FLASH_DONE, done))
+ goto err_skb;
+
+ if (nla_put_uint(skb, ETHTOOL_A_MODULE_FW_FLASH_TOTAL, total))
+ goto err_skb;
+
+ genlmsg_end(skb, hdr);
+ genlmsg_unicast(dev_net(dev), skb, ntf_params->portid);
+ return;
+
+err_skb:
+ nlmsg_free(skb);
+}
+
+void ethnl_module_fw_flash_ntf_err(struct net_device *dev,
+ struct ethnl_module_fw_flash_ntf_params *params,
+ char *err_msg, char *sub_err_msg)
+{
+ ethnl_module_fw_flash_ntf(dev, ETHTOOL_MODULE_FW_FLASH_STATUS_ERROR,
+ params, err_msg, sub_err_msg, 0, 0);
+}
+
+void
+ethnl_module_fw_flash_ntf_start(struct net_device *dev,
+ struct ethnl_module_fw_flash_ntf_params *params)
+{
+ ethnl_module_fw_flash_ntf(dev, ETHTOOL_MODULE_FW_FLASH_STATUS_STARTED,
+ params, NULL, NULL, 0, 0);
+}
+
+void
+ethnl_module_fw_flash_ntf_complete(struct net_device *dev,
+ struct ethnl_module_fw_flash_ntf_params *params)
+{
+ ethnl_module_fw_flash_ntf(dev, ETHTOOL_MODULE_FW_FLASH_STATUS_COMPLETED,
+ params, NULL, NULL, 0, 0);
+}
+
+void
+ethnl_module_fw_flash_ntf_in_progress(struct net_device *dev,
+ struct ethnl_module_fw_flash_ntf_params *params,
+ u64 done, u64 total)
+{
+ ethnl_module_fw_flash_ntf(dev,
+ ETHTOOL_MODULE_FW_FLASH_STATUS_IN_PROGRESS,
+ params, NULL, NULL, done, total);
+}
diff --git a/net/ethtool/module_fw.h b/net/ethtool/module_fw.h
new file mode 100644
index 000000000000..634543a12d0c
--- /dev/null
+++ b/net/ethtool/module_fw.h
@@ -0,0 +1,75 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#include <uapi/linux/ethtool.h>
+#include "netlink.h"
+
+/**
+ * struct ethnl_module_fw_flash_ntf_params - module firmware flashing
+ * notifications parameters
+ * @portid: Netlink portid of sender.
+ * @seq: Sequence number of sender.
+ * @closed_sock: Indicates whether the socket was closed from user space.
+ */
+struct ethnl_module_fw_flash_ntf_params {
+ u32 portid;
+ u32 seq;
+ bool closed_sock;
+};
+
+/**
+ * struct ethtool_module_fw_flash_params - module firmware flashing parameters
+ * @password: Module password. Only valid when @pass_valid is set.
+ * @password_valid: Whether the module password is valid or not.
+ */
+struct ethtool_module_fw_flash_params {
+ __be32 password;
+ u8 password_valid:1;
+};
+
+/**
+ * struct ethtool_cmis_fw_update_params - CMIS firmware update specific
+ * parameters
+ * @dev: Pointer to the net_device to be flashed.
+ * @params: Module firmware flashing parameters.
+ * @ntf_params: Module firmware flashing notification parameters.
+ * @fw: Firmware to flash.
+ */
+struct ethtool_cmis_fw_update_params {
+ struct net_device *dev;
+ struct ethtool_module_fw_flash_params params;
+ struct ethnl_module_fw_flash_ntf_params ntf_params;
+ const struct firmware *fw;
+};
+
+/**
+ * struct ethtool_module_fw_flash - module firmware flashing
+ * @list: List node for &module_fw_flash_work_list.
+ * @dev_tracker: Refcount tracker for @dev.
+ * @work: The flashing firmware work.
+ * @fw_update: CMIS firmware update specific parameters.
+ */
+struct ethtool_module_fw_flash {
+ struct list_head list;
+ netdevice_tracker dev_tracker;
+ struct work_struct work;
+ struct ethtool_cmis_fw_update_params fw_update;
+};
+
+void ethnl_module_fw_flash_sock_destroy(struct ethnl_sock_priv *sk_priv);
+
+void
+ethnl_module_fw_flash_ntf_err(struct net_device *dev,
+ struct ethnl_module_fw_flash_ntf_params *params,
+ char *err_msg, char *sub_err_msg);
+void
+ethnl_module_fw_flash_ntf_start(struct net_device *dev,
+ struct ethnl_module_fw_flash_ntf_params *params);
+void
+ethnl_module_fw_flash_ntf_complete(struct net_device *dev,
+ struct ethnl_module_fw_flash_ntf_params *params);
+void
+ethnl_module_fw_flash_ntf_in_progress(struct net_device *dev,
+ struct ethnl_module_fw_flash_ntf_params *params,
+ u64 done, u64 total);
+
+void ethtool_cmis_fw_update(struct ethtool_cmis_fw_update_params *params);
diff --git a/net/ethtool/mse.c b/net/ethtool/mse.c
new file mode 100644
index 000000000000..6aac004c3ffc
--- /dev/null
+++ b/net/ethtool/mse.c
@@ -0,0 +1,329 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/ethtool.h>
+#include <linux/phy.h>
+#include <linux/slab.h>
+
+#include "netlink.h"
+#include "common.h"
+
+/* Channels A-D only; WORST and LINK are exclusive alternatives */
+#define PHY_MSE_CHANNEL_COUNT 4
+
+struct mse_req_info {
+ struct ethnl_req_info base;
+};
+
+struct mse_snapshot_entry {
+ struct phy_mse_snapshot snapshot;
+ int channel;
+};
+
+struct mse_reply_data {
+ struct ethnl_reply_data base;
+ struct phy_mse_capability capability;
+ struct mse_snapshot_entry *snapshots;
+ unsigned int num_snapshots;
+};
+
+static struct mse_reply_data *
+mse_repdata(const struct ethnl_reply_data *reply_base)
+{
+ return container_of(reply_base, struct mse_reply_data, base);
+}
+
+const struct nla_policy ethnl_mse_get_policy[] = {
+ [ETHTOOL_A_MSE_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy_phy),
+};
+
+static int get_snapshot_if_supported(struct phy_device *phydev,
+ struct mse_reply_data *data,
+ unsigned int *idx, u32 cap_bit,
+ enum phy_mse_channel channel)
+{
+ int ret;
+
+ if (data->capability.supported_caps & cap_bit) {
+ ret = phydev->drv->get_mse_snapshot(phydev, channel,
+ &data->snapshots[*idx].snapshot);
+ if (ret)
+ return ret;
+ data->snapshots[*idx].channel = channel;
+ (*idx)++;
+ }
+
+ return 0;
+}
+
+static int mse_get_channels(struct phy_device *phydev,
+ struct mse_reply_data *data)
+{
+ unsigned int i = 0;
+ int ret;
+
+ if (!data->capability.supported_caps)
+ return 0;
+
+ data->snapshots = kcalloc(PHY_MSE_CHANNEL_COUNT,
+ sizeof(*data->snapshots), GFP_KERNEL);
+ if (!data->snapshots)
+ return -ENOMEM;
+
+ /* Priority 1: Individual channels */
+ ret = get_snapshot_if_supported(phydev, data, &i, PHY_MSE_CAP_CHANNEL_A,
+ PHY_MSE_CHANNEL_A);
+ if (ret)
+ return ret;
+ ret = get_snapshot_if_supported(phydev, data, &i, PHY_MSE_CAP_CHANNEL_B,
+ PHY_MSE_CHANNEL_B);
+ if (ret)
+ return ret;
+ ret = get_snapshot_if_supported(phydev, data, &i, PHY_MSE_CAP_CHANNEL_C,
+ PHY_MSE_CHANNEL_C);
+ if (ret)
+ return ret;
+ ret = get_snapshot_if_supported(phydev, data, &i, PHY_MSE_CAP_CHANNEL_D,
+ PHY_MSE_CHANNEL_D);
+ if (ret)
+ return ret;
+
+ /* If any individual channels were found, we are done. */
+ if (i > 0) {
+ data->num_snapshots = i;
+ return 0;
+ }
+
+ /* Priority 2: Worst channel, if no individual channels supported. */
+ ret = get_snapshot_if_supported(phydev, data, &i,
+ PHY_MSE_CAP_WORST_CHANNEL,
+ PHY_MSE_CHANNEL_WORST);
+ if (ret)
+ return ret;
+
+ /* If worst channel was found, we are done. */
+ if (i > 0) {
+ data->num_snapshots = i;
+ return 0;
+ }
+
+ /* Priority 3: Link-wide, if nothing else is supported. */
+ ret = get_snapshot_if_supported(phydev, data, &i, PHY_MSE_CAP_LINK,
+ PHY_MSE_CHANNEL_LINK);
+ if (ret)
+ return ret;
+
+ data->num_snapshots = i;
+ return 0;
+}
+
+static int mse_prepare_data(const struct ethnl_req_info *req_base,
+ struct ethnl_reply_data *reply_base,
+ const struct genl_info *info)
+{
+ struct mse_reply_data *data = mse_repdata(reply_base);
+ struct net_device *dev = reply_base->dev;
+ struct phy_device *phydev;
+ int ret;
+
+ phydev = ethnl_req_get_phydev(req_base, info->attrs,
+ ETHTOOL_A_MSE_HEADER, info->extack);
+ if (IS_ERR(phydev))
+ return PTR_ERR(phydev);
+ if (!phydev)
+ return -EOPNOTSUPP;
+
+ ret = ethnl_ops_begin(dev);
+ if (ret)
+ return ret;
+
+ mutex_lock(&phydev->lock);
+
+ if (!phydev->drv || !phydev->drv->get_mse_capability ||
+ !phydev->drv->get_mse_snapshot) {
+ ret = -EOPNOTSUPP;
+ goto out_unlock;
+ }
+ if (!phydev->link) {
+ ret = -ENETDOWN;
+ goto out_unlock;
+ }
+
+ ret = phydev->drv->get_mse_capability(phydev, &data->capability);
+ if (ret)
+ goto out_unlock;
+
+ ret = mse_get_channels(phydev, data);
+
+out_unlock:
+ mutex_unlock(&phydev->lock);
+ ethnl_ops_complete(dev);
+ if (ret)
+ kfree(data->snapshots);
+ return ret;
+}
+
+static void mse_cleanup_data(struct ethnl_reply_data *reply_base)
+{
+ struct mse_reply_data *data = mse_repdata(reply_base);
+
+ kfree(data->snapshots);
+}
+
+static int mse_reply_size(const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ const struct mse_reply_data *data = mse_repdata(reply_base);
+ size_t len = 0;
+ unsigned int i;
+
+ /* ETHTOOL_A_MSE_CAPABILITIES */
+ len += nla_total_size(0);
+ if (data->capability.supported_caps & PHY_MSE_CAP_AVG)
+ /* ETHTOOL_A_MSE_CAPABILITIES_MAX_AVERAGE_MSE */
+ len += nla_total_size(sizeof(u64));
+ if (data->capability.supported_caps & (PHY_MSE_CAP_PEAK |
+ PHY_MSE_CAP_WORST_PEAK))
+ /* ETHTOOL_A_MSE_CAPABILITIES_MAX_PEAK_MSE */
+ len += nla_total_size(sizeof(u64));
+ /* ETHTOOL_A_MSE_CAPABILITIES_REFRESH_RATE_PS */
+ len += nla_total_size(sizeof(u64));
+ /* ETHTOOL_A_MSE_CAPABILITIES_NUM_SYMBOLS */
+ len += nla_total_size(sizeof(u64));
+
+ for (i = 0; i < data->num_snapshots; i++) {
+ size_t snapshot_len = 0;
+
+ /* Per-channel nest (e.g., ETHTOOL_A_MSE_CHANNEL_A / _B / _C /
+ * _D / _WORST_CHANNEL / _LINK)
+ */
+ snapshot_len += nla_total_size(0);
+
+ if (data->capability.supported_caps & PHY_MSE_CAP_AVG)
+ snapshot_len += nla_total_size(sizeof(u64));
+ if (data->capability.supported_caps & PHY_MSE_CAP_PEAK)
+ snapshot_len += nla_total_size(sizeof(u64));
+ if (data->capability.supported_caps & PHY_MSE_CAP_WORST_PEAK)
+ snapshot_len += nla_total_size(sizeof(u64));
+
+ len += snapshot_len;
+ }
+
+ return len;
+}
+
+static int mse_channel_to_attr(int ch)
+{
+ switch (ch) {
+ case PHY_MSE_CHANNEL_A:
+ return ETHTOOL_A_MSE_CHANNEL_A;
+ case PHY_MSE_CHANNEL_B:
+ return ETHTOOL_A_MSE_CHANNEL_B;
+ case PHY_MSE_CHANNEL_C:
+ return ETHTOOL_A_MSE_CHANNEL_C;
+ case PHY_MSE_CHANNEL_D:
+ return ETHTOOL_A_MSE_CHANNEL_D;
+ case PHY_MSE_CHANNEL_WORST:
+ return ETHTOOL_A_MSE_WORST_CHANNEL;
+ case PHY_MSE_CHANNEL_LINK:
+ return ETHTOOL_A_MSE_LINK;
+ default:
+ return -EINVAL;
+ }
+}
+
+static int mse_fill_reply(struct sk_buff *skb,
+ const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ const struct mse_reply_data *data = mse_repdata(reply_base);
+ struct nlattr *nest;
+ unsigned int i;
+ int ret;
+
+ nest = nla_nest_start(skb, ETHTOOL_A_MSE_CAPABILITIES);
+ if (!nest)
+ return -EMSGSIZE;
+
+ if (data->capability.supported_caps & PHY_MSE_CAP_AVG) {
+ ret = nla_put_uint(skb,
+ ETHTOOL_A_MSE_CAPABILITIES_MAX_AVERAGE_MSE,
+ data->capability.max_average_mse);
+ if (ret < 0)
+ goto nla_put_nest_failure;
+ }
+
+ if (data->capability.supported_caps & (PHY_MSE_CAP_PEAK |
+ PHY_MSE_CAP_WORST_PEAK)) {
+ ret = nla_put_uint(skb, ETHTOOL_A_MSE_CAPABILITIES_MAX_PEAK_MSE,
+ data->capability.max_peak_mse);
+ if (ret < 0)
+ goto nla_put_nest_failure;
+ }
+
+ ret = nla_put_uint(skb, ETHTOOL_A_MSE_CAPABILITIES_REFRESH_RATE_PS,
+ data->capability.refresh_rate_ps);
+ if (ret < 0)
+ goto nla_put_nest_failure;
+
+ ret = nla_put_uint(skb, ETHTOOL_A_MSE_CAPABILITIES_NUM_SYMBOLS,
+ data->capability.num_symbols);
+ if (ret < 0)
+ goto nla_put_nest_failure;
+
+ nla_nest_end(skb, nest);
+
+ for (i = 0; i < data->num_snapshots; i++) {
+ const struct mse_snapshot_entry *s = &data->snapshots[i];
+ int chan_attr;
+
+ chan_attr = mse_channel_to_attr(s->channel);
+ if (chan_attr < 0)
+ return chan_attr;
+
+ nest = nla_nest_start(skb, chan_attr);
+ if (!nest)
+ return -EMSGSIZE;
+
+ if (data->capability.supported_caps & PHY_MSE_CAP_AVG) {
+ ret = nla_put_uint(skb,
+ ETHTOOL_A_MSE_SNAPSHOT_AVERAGE_MSE,
+ s->snapshot.average_mse);
+ if (ret)
+ goto nla_put_nest_failure;
+ }
+ if (data->capability.supported_caps & PHY_MSE_CAP_PEAK) {
+ ret = nla_put_uint(skb, ETHTOOL_A_MSE_SNAPSHOT_PEAK_MSE,
+ s->snapshot.peak_mse);
+ if (ret)
+ goto nla_put_nest_failure;
+ }
+ if (data->capability.supported_caps & PHY_MSE_CAP_WORST_PEAK) {
+ ret = nla_put_uint(skb,
+ ETHTOOL_A_MSE_SNAPSHOT_WORST_PEAK_MSE,
+ s->snapshot.worst_peak_mse);
+ if (ret)
+ goto nla_put_nest_failure;
+ }
+
+ nla_nest_end(skb, nest);
+ }
+
+ return 0;
+
+nla_put_nest_failure:
+ nla_nest_cancel(skb, nest);
+ return ret;
+}
+
+const struct ethnl_request_ops ethnl_mse_request_ops = {
+ .request_cmd = ETHTOOL_MSG_MSE_GET,
+ .reply_cmd = ETHTOOL_MSG_MSE_GET_REPLY,
+ .hdr_attr = ETHTOOL_A_MSE_HEADER,
+ .req_info_size = sizeof(struct mse_req_info),
+ .reply_data_size = sizeof(struct mse_reply_data),
+
+ .prepare_data = mse_prepare_data,
+ .cleanup_data = mse_cleanup_data,
+ .reply_size = mse_reply_size,
+ .fill_reply = mse_fill_reply,
+};
diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c
new file mode 100644
index 000000000000..6e5f0f4f815a
--- /dev/null
+++ b/net/ethtool/netlink.c
@@ -0,0 +1,1583 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <net/netdev_lock.h>
+#include <net/netdev_queues.h>
+#include <net/sock.h>
+#include <linux/ethtool_netlink.h>
+#include <linux/phy_link_topology.h>
+#include <linux/pm_runtime.h>
+#include "netlink.h"
+#include "module_fw.h"
+
+static struct genl_family ethtool_genl_family;
+
+static bool ethnl_ok __read_mostly;
+static u32 ethnl_bcast_seq;
+
+#define ETHTOOL_FLAGS_BASIC (ETHTOOL_FLAG_COMPACT_BITSETS | \
+ ETHTOOL_FLAG_OMIT_REPLY)
+#define ETHTOOL_FLAGS_STATS (ETHTOOL_FLAGS_BASIC | ETHTOOL_FLAG_STATS)
+
+const struct nla_policy ethnl_header_policy[] = {
+ [ETHTOOL_A_HEADER_DEV_INDEX] = { .type = NLA_U32 },
+ [ETHTOOL_A_HEADER_DEV_NAME] = { .type = NLA_NUL_STRING,
+ .len = ALTIFNAMSIZ - 1 },
+ [ETHTOOL_A_HEADER_FLAGS] = NLA_POLICY_MASK(NLA_U32,
+ ETHTOOL_FLAGS_BASIC),
+};
+
+const struct nla_policy ethnl_header_policy_stats[] = {
+ [ETHTOOL_A_HEADER_DEV_INDEX] = { .type = NLA_U32 },
+ [ETHTOOL_A_HEADER_DEV_NAME] = { .type = NLA_NUL_STRING,
+ .len = ALTIFNAMSIZ - 1 },
+ [ETHTOOL_A_HEADER_FLAGS] = NLA_POLICY_MASK(NLA_U32,
+ ETHTOOL_FLAGS_STATS),
+};
+
+const struct nla_policy ethnl_header_policy_phy[] = {
+ [ETHTOOL_A_HEADER_DEV_INDEX] = { .type = NLA_U32 },
+ [ETHTOOL_A_HEADER_DEV_NAME] = { .type = NLA_NUL_STRING,
+ .len = ALTIFNAMSIZ - 1 },
+ [ETHTOOL_A_HEADER_FLAGS] = NLA_POLICY_MASK(NLA_U32,
+ ETHTOOL_FLAGS_BASIC),
+ [ETHTOOL_A_HEADER_PHY_INDEX] = NLA_POLICY_MIN(NLA_U32, 1),
+};
+
+const struct nla_policy ethnl_header_policy_phy_stats[] = {
+ [ETHTOOL_A_HEADER_DEV_INDEX] = { .type = NLA_U32 },
+ [ETHTOOL_A_HEADER_DEV_NAME] = { .type = NLA_NUL_STRING,
+ .len = ALTIFNAMSIZ - 1 },
+ [ETHTOOL_A_HEADER_FLAGS] = NLA_POLICY_MASK(NLA_U32,
+ ETHTOOL_FLAGS_STATS),
+ [ETHTOOL_A_HEADER_PHY_INDEX] = NLA_POLICY_MIN(NLA_U32, 1),
+};
+
+int ethnl_sock_priv_set(struct sk_buff *skb, struct net_device *dev, u32 portid,
+ enum ethnl_sock_type type)
+{
+ struct ethnl_sock_priv *sk_priv;
+
+ sk_priv = genl_sk_priv_get(&ethtool_genl_family, NETLINK_CB(skb).sk);
+ if (IS_ERR(sk_priv))
+ return PTR_ERR(sk_priv);
+
+ sk_priv->dev = dev;
+ sk_priv->portid = portid;
+ sk_priv->type = type;
+
+ return 0;
+}
+
+static void ethnl_sock_priv_destroy(void *priv)
+{
+ struct ethnl_sock_priv *sk_priv = priv;
+
+ switch (sk_priv->type) {
+ case ETHTOOL_SOCK_TYPE_MODULE_FW_FLASH:
+ ethnl_module_fw_flash_sock_destroy(sk_priv);
+ break;
+ default:
+ break;
+ }
+}
+
+u32 ethnl_bcast_seq_next(void)
+{
+ ASSERT_RTNL();
+ return ++ethnl_bcast_seq;
+}
+
+int ethnl_ops_begin(struct net_device *dev)
+{
+ int ret;
+
+ if (!dev)
+ return -ENODEV;
+
+ if (dev->dev.parent)
+ pm_runtime_get_sync(dev->dev.parent);
+
+ netdev_ops_assert_locked(dev);
+
+ if (!netif_device_present(dev) ||
+ dev->reg_state >= NETREG_UNREGISTERING) {
+ ret = -ENODEV;
+ goto err;
+ }
+
+ if (dev->ethtool_ops->begin) {
+ ret = dev->ethtool_ops->begin(dev);
+ if (ret)
+ goto err;
+ }
+
+ return 0;
+err:
+ if (dev->dev.parent)
+ pm_runtime_put(dev->dev.parent);
+
+ return ret;
+}
+
+void ethnl_ops_complete(struct net_device *dev)
+{
+ if (dev->ethtool_ops->complete)
+ dev->ethtool_ops->complete(dev);
+
+ if (dev->dev.parent)
+ pm_runtime_put(dev->dev.parent);
+}
+
+/**
+ * ethnl_parse_header_dev_get() - parse request header
+ * @req_info: structure to put results into
+ * @header: nest attribute with request header
+ * @net: request netns
+ * @extack: netlink extack for error reporting
+ * @require_dev: fail if no device identified in header
+ *
+ * Parse request header in nested attribute @nest and puts results into
+ * the structure pointed to by @req_info. Extack from @info is used for error
+ * reporting. If req_info->dev is not null on return, reference to it has
+ * been taken. If error is returned, *req_info is null initialized and no
+ * reference is held.
+ *
+ * Return: 0 on success or negative error code
+ */
+int ethnl_parse_header_dev_get(struct ethnl_req_info *req_info,
+ const struct nlattr *header, struct net *net,
+ struct netlink_ext_ack *extack, bool require_dev)
+{
+ struct nlattr *tb[ARRAY_SIZE(ethnl_header_policy_phy)];
+ const struct nlattr *devname_attr;
+ struct net_device *dev = NULL;
+ u32 flags = 0;
+ int ret;
+
+ if (!header) {
+ if (!require_dev)
+ return 0;
+ NL_SET_ERR_MSG(extack, "request header missing");
+ return -EINVAL;
+ }
+ /* No validation here, command policy should have a nested policy set
+ * for the header, therefore validation should have already been done.
+ */
+ ret = nla_parse_nested(tb, ARRAY_SIZE(ethnl_header_policy_phy) - 1, header,
+ NULL, extack);
+ if (ret < 0)
+ return ret;
+ if (tb[ETHTOOL_A_HEADER_FLAGS])
+ flags = nla_get_u32(tb[ETHTOOL_A_HEADER_FLAGS]);
+
+ devname_attr = tb[ETHTOOL_A_HEADER_DEV_NAME];
+ if (tb[ETHTOOL_A_HEADER_DEV_INDEX]) {
+ u32 ifindex = nla_get_u32(tb[ETHTOOL_A_HEADER_DEV_INDEX]);
+
+ dev = netdev_get_by_index(net, ifindex, &req_info->dev_tracker,
+ GFP_KERNEL);
+ if (!dev) {
+ NL_SET_ERR_MSG_ATTR(extack,
+ tb[ETHTOOL_A_HEADER_DEV_INDEX],
+ "no device matches ifindex");
+ return -ENODEV;
+ }
+ /* if both ifindex and ifname are passed, they must match */
+ if (devname_attr &&
+ strncmp(dev->name, nla_data(devname_attr), IFNAMSIZ)) {
+ netdev_put(dev, &req_info->dev_tracker);
+ NL_SET_ERR_MSG_ATTR(extack, header,
+ "ifindex and name do not match");
+ return -ENODEV;
+ }
+ } else if (devname_attr) {
+ dev = netdev_get_by_name(net, nla_data(devname_attr),
+ &req_info->dev_tracker, GFP_KERNEL);
+ if (!dev) {
+ NL_SET_ERR_MSG_ATTR(extack, devname_attr,
+ "no device matches name");
+ return -ENODEV;
+ }
+ } else if (require_dev) {
+ NL_SET_ERR_MSG_ATTR(extack, header,
+ "neither ifindex nor name specified");
+ return -EINVAL;
+ }
+
+ if (tb[ETHTOOL_A_HEADER_PHY_INDEX]) {
+ if (dev) {
+ req_info->phy_index = nla_get_u32(tb[ETHTOOL_A_HEADER_PHY_INDEX]);
+ } else {
+ NL_SET_ERR_MSG_ATTR(extack, header,
+ "phy_index set without a netdev");
+ return -EINVAL;
+ }
+ }
+
+ req_info->dev = dev;
+ req_info->flags = flags;
+ return 0;
+}
+
+struct phy_device *ethnl_req_get_phydev(const struct ethnl_req_info *req_info,
+ struct nlattr **tb, unsigned int header,
+ struct netlink_ext_ack *extack)
+{
+ struct phy_device *phydev;
+
+ ASSERT_RTNL();
+
+ if (!req_info->dev)
+ return NULL;
+
+ if (!req_info->phy_index)
+ return req_info->dev->phydev;
+
+ phydev = phy_link_topo_get_phy(req_info->dev, req_info->phy_index);
+ if (!phydev && tb) {
+ NL_SET_ERR_MSG_ATTR(extack, tb[header],
+ "no phy matching phyindex");
+ return ERR_PTR(-ENODEV);
+ }
+
+ return phydev;
+}
+
+/**
+ * ethnl_fill_reply_header() - Put common header into a reply message
+ * @skb: skb with the message
+ * @dev: network device to describe in header
+ * @attrtype: attribute type to use for the nest
+ *
+ * Create a nested attribute with attributes describing given network device.
+ *
+ * Return: 0 on success, error value (-EMSGSIZE only) on error
+ */
+int ethnl_fill_reply_header(struct sk_buff *skb, struct net_device *dev,
+ u16 attrtype)
+{
+ struct nlattr *nest;
+
+ if (!dev)
+ return 0;
+ nest = nla_nest_start(skb, attrtype);
+ if (!nest)
+ return -EMSGSIZE;
+
+ if (nla_put_u32(skb, ETHTOOL_A_HEADER_DEV_INDEX, (u32)dev->ifindex) ||
+ nla_put_string(skb, ETHTOOL_A_HEADER_DEV_NAME, dev->name))
+ goto nla_put_failure;
+ /* If more attributes are put into reply header, ethnl_header_size()
+ * must be updated to account for them.
+ */
+
+ nla_nest_end(skb, nest);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+}
+
+/**
+ * ethnl_reply_init() - Create skb for a reply and fill device identification
+ * @payload: payload length (without netlink and genetlink header)
+ * @dev: device the reply is about (may be null)
+ * @cmd: ETHTOOL_MSG_* message type for reply
+ * @hdr_attrtype: attribute type for common header
+ * @info: genetlink info of the received packet we respond to
+ * @ehdrp: place to store payload pointer returned by genlmsg_new()
+ *
+ * Return: pointer to allocated skb on success, NULL on error
+ */
+struct sk_buff *ethnl_reply_init(size_t payload, struct net_device *dev, u8 cmd,
+ u16 hdr_attrtype, struct genl_info *info,
+ void **ehdrp)
+{
+ struct sk_buff *skb;
+
+ skb = genlmsg_new(payload, GFP_KERNEL);
+ if (!skb)
+ goto err;
+ *ehdrp = genlmsg_put_reply(skb, info, &ethtool_genl_family, 0, cmd);
+ if (!*ehdrp)
+ goto err_free;
+
+ if (dev) {
+ int ret;
+
+ ret = ethnl_fill_reply_header(skb, dev, hdr_attrtype);
+ if (ret < 0)
+ goto err_free;
+ }
+ return skb;
+
+err_free:
+ nlmsg_free(skb);
+err:
+ if (info)
+ GENL_SET_ERR_MSG(info, "failed to setup reply message");
+ return NULL;
+}
+
+void *ethnl_dump_put(struct sk_buff *skb, struct netlink_callback *cb, u8 cmd)
+{
+ return genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ &ethtool_genl_family, 0, cmd);
+}
+
+void *ethnl_bcastmsg_put(struct sk_buff *skb, u8 cmd)
+{
+ return genlmsg_put(skb, 0, ++ethnl_bcast_seq, &ethtool_genl_family, 0,
+ cmd);
+}
+
+void *ethnl_unicast_put(struct sk_buff *skb, u32 portid, u32 seq, u8 cmd)
+{
+ return genlmsg_put(skb, portid, seq, &ethtool_genl_family, 0, cmd);
+}
+
+int ethnl_multicast(struct sk_buff *skb, struct net_device *dev)
+{
+ return genlmsg_multicast_netns(&ethtool_genl_family, dev_net(dev), skb,
+ 0, ETHNL_MCGRP_MONITOR, GFP_KERNEL);
+}
+
+/* GET request helpers */
+
+/**
+ * struct ethnl_dump_ctx - context structure for generic dumpit() callback
+ * @ops: request ops of currently processed message type
+ * @req_info: parsed request header of processed request
+ * @reply_data: data needed to compose the reply
+ * @pos_ifindex: saved iteration position - ifindex
+ *
+ * These parameters are kept in struct netlink_callback as context preserved
+ * between iterations. They are initialized by ethnl_default_start() and used
+ * in ethnl_default_dumpit() and ethnl_default_done().
+ */
+struct ethnl_dump_ctx {
+ const struct ethnl_request_ops *ops;
+ struct ethnl_req_info *req_info;
+ struct ethnl_reply_data *reply_data;
+ unsigned long pos_ifindex;
+};
+
+/**
+ * struct ethnl_perphy_dump_ctx - context for dumpit() PHY-aware callbacks
+ * @ethnl_ctx: generic ethnl context
+ * @ifindex: For Filtered DUMP requests, the ifindex of the targeted netdev
+ * @pos_phyindex: iterator position for multi-msg DUMP
+ */
+struct ethnl_perphy_dump_ctx {
+ struct ethnl_dump_ctx ethnl_ctx;
+ unsigned int ifindex;
+ unsigned long pos_phyindex;
+};
+
+static const struct ethnl_request_ops *
+ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = {
+ [ETHTOOL_MSG_STRSET_GET] = &ethnl_strset_request_ops,
+ [ETHTOOL_MSG_LINKINFO_GET] = &ethnl_linkinfo_request_ops,
+ [ETHTOOL_MSG_LINKINFO_SET] = &ethnl_linkinfo_request_ops,
+ [ETHTOOL_MSG_LINKMODES_GET] = &ethnl_linkmodes_request_ops,
+ [ETHTOOL_MSG_LINKMODES_SET] = &ethnl_linkmodes_request_ops,
+ [ETHTOOL_MSG_LINKSTATE_GET] = &ethnl_linkstate_request_ops,
+ [ETHTOOL_MSG_DEBUG_GET] = &ethnl_debug_request_ops,
+ [ETHTOOL_MSG_DEBUG_SET] = &ethnl_debug_request_ops,
+ [ETHTOOL_MSG_WOL_GET] = &ethnl_wol_request_ops,
+ [ETHTOOL_MSG_WOL_SET] = &ethnl_wol_request_ops,
+ [ETHTOOL_MSG_FEATURES_GET] = &ethnl_features_request_ops,
+ [ETHTOOL_MSG_PRIVFLAGS_GET] = &ethnl_privflags_request_ops,
+ [ETHTOOL_MSG_PRIVFLAGS_SET] = &ethnl_privflags_request_ops,
+ [ETHTOOL_MSG_RINGS_GET] = &ethnl_rings_request_ops,
+ [ETHTOOL_MSG_RINGS_SET] = &ethnl_rings_request_ops,
+ [ETHTOOL_MSG_CHANNELS_GET] = &ethnl_channels_request_ops,
+ [ETHTOOL_MSG_CHANNELS_SET] = &ethnl_channels_request_ops,
+ [ETHTOOL_MSG_COALESCE_GET] = &ethnl_coalesce_request_ops,
+ [ETHTOOL_MSG_COALESCE_SET] = &ethnl_coalesce_request_ops,
+ [ETHTOOL_MSG_PAUSE_GET] = &ethnl_pause_request_ops,
+ [ETHTOOL_MSG_PAUSE_SET] = &ethnl_pause_request_ops,
+ [ETHTOOL_MSG_EEE_GET] = &ethnl_eee_request_ops,
+ [ETHTOOL_MSG_EEE_SET] = &ethnl_eee_request_ops,
+ [ETHTOOL_MSG_FEC_GET] = &ethnl_fec_request_ops,
+ [ETHTOOL_MSG_FEC_SET] = &ethnl_fec_request_ops,
+ [ETHTOOL_MSG_TSINFO_GET] = &ethnl_tsinfo_request_ops,
+ [ETHTOOL_MSG_MODULE_EEPROM_GET] = &ethnl_module_eeprom_request_ops,
+ [ETHTOOL_MSG_STATS_GET] = &ethnl_stats_request_ops,
+ [ETHTOOL_MSG_PHC_VCLOCKS_GET] = &ethnl_phc_vclocks_request_ops,
+ [ETHTOOL_MSG_MODULE_GET] = &ethnl_module_request_ops,
+ [ETHTOOL_MSG_MODULE_SET] = &ethnl_module_request_ops,
+ [ETHTOOL_MSG_PSE_GET] = &ethnl_pse_request_ops,
+ [ETHTOOL_MSG_PSE_SET] = &ethnl_pse_request_ops,
+ [ETHTOOL_MSG_RSS_GET] = &ethnl_rss_request_ops,
+ [ETHTOOL_MSG_RSS_SET] = &ethnl_rss_request_ops,
+ [ETHTOOL_MSG_PLCA_GET_CFG] = &ethnl_plca_cfg_request_ops,
+ [ETHTOOL_MSG_PLCA_SET_CFG] = &ethnl_plca_cfg_request_ops,
+ [ETHTOOL_MSG_PLCA_GET_STATUS] = &ethnl_plca_status_request_ops,
+ [ETHTOOL_MSG_MM_GET] = &ethnl_mm_request_ops,
+ [ETHTOOL_MSG_MM_SET] = &ethnl_mm_request_ops,
+ [ETHTOOL_MSG_TSCONFIG_GET] = &ethnl_tsconfig_request_ops,
+ [ETHTOOL_MSG_TSCONFIG_SET] = &ethnl_tsconfig_request_ops,
+ [ETHTOOL_MSG_PHY_GET] = &ethnl_phy_request_ops,
+ [ETHTOOL_MSG_MSE_GET] = &ethnl_mse_request_ops,
+};
+
+static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb)
+{
+ return (struct ethnl_dump_ctx *)cb->ctx;
+}
+
+static struct ethnl_perphy_dump_ctx *
+ethnl_perphy_dump_context(struct netlink_callback *cb)
+{
+ return (struct ethnl_perphy_dump_ctx *)cb->ctx;
+}
+
+/**
+ * ethnl_default_parse() - Parse request message
+ * @req_info: pointer to structure to put data into
+ * @info: genl_info from the request
+ * @request_ops: struct request_ops for request type
+ * @require_dev: fail if no device identified in header
+ *
+ * Parse universal request header and call request specific ->parse_request()
+ * callback (if defined) to parse the rest of the message.
+ *
+ * Return: 0 on success or negative error code
+ */
+static int ethnl_default_parse(struct ethnl_req_info *req_info,
+ const struct genl_info *info,
+ const struct ethnl_request_ops *request_ops,
+ bool require_dev)
+{
+ struct nlattr **tb = info->attrs;
+ int ret;
+
+ ret = ethnl_parse_header_dev_get(req_info, tb[request_ops->hdr_attr],
+ genl_info_net(info), info->extack,
+ require_dev);
+ if (ret < 0)
+ return ret;
+
+ if (request_ops->parse_request) {
+ ret = request_ops->parse_request(req_info, tb, info->extack);
+ if (ret < 0)
+ goto err_dev;
+ }
+
+ return 0;
+
+err_dev:
+ netdev_put(req_info->dev, &req_info->dev_tracker);
+ req_info->dev = NULL;
+ return ret;
+}
+
+/**
+ * ethnl_init_reply_data() - Initialize reply data for GET request
+ * @reply_data: pointer to embedded struct ethnl_reply_data
+ * @ops: instance of struct ethnl_request_ops describing the layout
+ * @dev: network device to initialize the reply for
+ *
+ * Fills the reply data part with zeros and sets the dev member. Must be called
+ * before calling the ->fill_reply() callback (for each iteration when handling
+ * dump requests).
+ */
+static void ethnl_init_reply_data(struct ethnl_reply_data *reply_data,
+ const struct ethnl_request_ops *ops,
+ struct net_device *dev)
+{
+ memset(reply_data, 0, ops->reply_data_size);
+ reply_data->dev = dev;
+}
+
+/* default ->doit() handler for GET type requests */
+static int ethnl_default_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct ethnl_reply_data *reply_data = NULL;
+ struct ethnl_req_info *req_info = NULL;
+ const u8 cmd = info->genlhdr->cmd;
+ const struct ethnl_request_ops *ops;
+ int hdr_len, reply_len;
+ struct sk_buff *rskb;
+ void *reply_payload;
+ int ret;
+
+ ops = ethnl_default_requests[cmd];
+ if (WARN_ONCE(!ops, "cmd %u has no ethnl_request_ops\n", cmd))
+ return -EOPNOTSUPP;
+ if (GENL_REQ_ATTR_CHECK(info, ops->hdr_attr))
+ return -EINVAL;
+
+ req_info = kzalloc(ops->req_info_size, GFP_KERNEL);
+ if (!req_info)
+ return -ENOMEM;
+ reply_data = kmalloc(ops->reply_data_size, GFP_KERNEL);
+ if (!reply_data) {
+ kfree(req_info);
+ return -ENOMEM;
+ }
+
+ ret = ethnl_default_parse(req_info, info, ops, !ops->allow_nodev_do);
+ if (ret < 0)
+ goto err_free;
+ ethnl_init_reply_data(reply_data, ops, req_info->dev);
+
+ rtnl_lock();
+ if (req_info->dev)
+ netdev_lock_ops(req_info->dev);
+ ret = ops->prepare_data(req_info, reply_data, info);
+ if (req_info->dev)
+ netdev_unlock_ops(req_info->dev);
+ rtnl_unlock();
+ if (ret < 0)
+ goto err_dev;
+ ret = ops->reply_size(req_info, reply_data);
+ if (ret < 0)
+ goto err_cleanup;
+ reply_len = ret;
+ ret = -ENOMEM;
+ rskb = ethnl_reply_init(reply_len + ethnl_reply_header_size(),
+ req_info->dev, ops->reply_cmd,
+ ops->hdr_attr, info, &reply_payload);
+ if (!rskb)
+ goto err_cleanup;
+ hdr_len = rskb->len;
+ ret = ops->fill_reply(rskb, req_info, reply_data);
+ if (ret < 0)
+ goto err_msg;
+ WARN_ONCE(rskb->len - hdr_len > reply_len,
+ "ethnl cmd %d: calculated reply length %d, but consumed %d\n",
+ cmd, reply_len, rskb->len - hdr_len);
+ if (ops->cleanup_data)
+ ops->cleanup_data(reply_data);
+
+ genlmsg_end(rskb, reply_payload);
+ netdev_put(req_info->dev, &req_info->dev_tracker);
+ kfree(reply_data);
+ kfree(req_info);
+ return genlmsg_reply(rskb, info);
+
+err_msg:
+ WARN_ONCE(ret == -EMSGSIZE, "calculated message payload length (%d) not sufficient\n", reply_len);
+ nlmsg_free(rskb);
+err_cleanup:
+ if (ops->cleanup_data)
+ ops->cleanup_data(reply_data);
+err_dev:
+ netdev_put(req_info->dev, &req_info->dev_tracker);
+err_free:
+ kfree(reply_data);
+ kfree(req_info);
+ return ret;
+}
+
+static int ethnl_default_dump_one(struct sk_buff *skb, struct net_device *dev,
+ const struct ethnl_dump_ctx *ctx,
+ const struct genl_info *info)
+{
+ void *ehdr;
+ int ret;
+
+ ehdr = genlmsg_put(skb, info->snd_portid, info->snd_seq,
+ &ethtool_genl_family, NLM_F_MULTI,
+ ctx->ops->reply_cmd);
+ if (!ehdr)
+ return -EMSGSIZE;
+
+ ethnl_init_reply_data(ctx->reply_data, ctx->ops, dev);
+ rtnl_lock();
+ netdev_lock_ops(dev);
+ ret = ctx->ops->prepare_data(ctx->req_info, ctx->reply_data, info);
+ netdev_unlock_ops(dev);
+ rtnl_unlock();
+ if (ret < 0)
+ goto out_cancel;
+ ret = ethnl_fill_reply_header(skb, dev, ctx->ops->hdr_attr);
+ if (ret < 0)
+ goto out;
+ ret = ctx->ops->fill_reply(skb, ctx->req_info, ctx->reply_data);
+
+out:
+ if (ctx->ops->cleanup_data)
+ ctx->ops->cleanup_data(ctx->reply_data);
+out_cancel:
+ ctx->reply_data->dev = NULL;
+ if (ret < 0)
+ genlmsg_cancel(skb, ehdr);
+ else
+ genlmsg_end(skb, ehdr);
+ return ret;
+}
+
+/* Default ->dumpit() handler for GET requests. */
+static int ethnl_default_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct ethnl_dump_ctx *ctx = ethnl_dump_context(cb);
+ struct net *net = sock_net(skb->sk);
+ netdevice_tracker dev_tracker;
+ struct net_device *dev;
+ int ret = 0;
+
+ rcu_read_lock();
+ for_each_netdev_dump(net, dev, ctx->pos_ifindex) {
+ netdev_hold(dev, &dev_tracker, GFP_ATOMIC);
+ rcu_read_unlock();
+
+ ret = ethnl_default_dump_one(skb, dev, ctx, genl_info_dump(cb));
+
+ rcu_read_lock();
+ netdev_put(dev, &dev_tracker);
+
+ if (ret < 0 && ret != -EOPNOTSUPP) {
+ if (likely(skb->len))
+ ret = skb->len;
+ break;
+ }
+ ret = 0;
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+
+/* generic ->start() handler for GET requests */
+static int ethnl_default_start(struct netlink_callback *cb)
+{
+ const struct genl_dumpit_info *info = genl_dumpit_info(cb);
+ struct ethnl_dump_ctx *ctx = ethnl_dump_context(cb);
+ struct ethnl_reply_data *reply_data;
+ const struct ethnl_request_ops *ops;
+ struct ethnl_req_info *req_info;
+ struct genlmsghdr *ghdr;
+ int ret;
+
+ BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx));
+
+ ghdr = nlmsg_data(cb->nlh);
+ ops = ethnl_default_requests[ghdr->cmd];
+ if (WARN_ONCE(!ops, "cmd %u has no ethnl_request_ops\n", ghdr->cmd))
+ return -EOPNOTSUPP;
+ req_info = kzalloc(ops->req_info_size, GFP_KERNEL);
+ if (!req_info)
+ return -ENOMEM;
+ reply_data = kmalloc(ops->reply_data_size, GFP_KERNEL);
+ if (!reply_data) {
+ ret = -ENOMEM;
+ goto free_req_info;
+ }
+
+ ret = ethnl_default_parse(req_info, &info->info, ops, false);
+ if (ret < 0)
+ goto free_reply_data;
+ if (req_info->dev) {
+ /* We ignore device specification in dump requests but as the
+ * same parser as for non-dump (doit) requests is used, it
+ * would take reference to the device if it finds one
+ */
+ netdev_put(req_info->dev, &req_info->dev_tracker);
+ req_info->dev = NULL;
+ }
+
+ ctx->ops = ops;
+ ctx->req_info = req_info;
+ ctx->reply_data = reply_data;
+ ctx->pos_ifindex = 0;
+
+ return 0;
+
+free_reply_data:
+ kfree(reply_data);
+free_req_info:
+ kfree(req_info);
+
+ return ret;
+}
+
+/* per-PHY ->start() handler for GET requests */
+static int ethnl_perphy_start(struct netlink_callback *cb)
+{
+ struct ethnl_perphy_dump_ctx *phy_ctx = ethnl_perphy_dump_context(cb);
+ const struct genl_dumpit_info *info = genl_dumpit_info(cb);
+ struct ethnl_dump_ctx *ctx = &phy_ctx->ethnl_ctx;
+ struct ethnl_reply_data *reply_data;
+ const struct ethnl_request_ops *ops;
+ struct ethnl_req_info *req_info;
+ struct genlmsghdr *ghdr;
+ int ret;
+
+ BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx));
+
+ ghdr = nlmsg_data(cb->nlh);
+ ops = ethnl_default_requests[ghdr->cmd];
+ if (WARN_ONCE(!ops, "cmd %u has no ethnl_request_ops\n", ghdr->cmd))
+ return -EOPNOTSUPP;
+ req_info = kzalloc(ops->req_info_size, GFP_KERNEL);
+ if (!req_info)
+ return -ENOMEM;
+ reply_data = kmalloc(ops->reply_data_size, GFP_KERNEL);
+ if (!reply_data) {
+ ret = -ENOMEM;
+ goto free_req_info;
+ }
+
+ /* Unlike per-dev dump, don't ignore dev. The dump handler
+ * will notice it and dump PHYs from given dev. We only keep track of
+ * the dev's ifindex, .dumpit() will grab and release the netdev itself.
+ */
+ ret = ethnl_default_parse(req_info, &info->info, ops, false);
+ if (ret < 0)
+ goto free_reply_data;
+ if (req_info->dev) {
+ phy_ctx->ifindex = req_info->dev->ifindex;
+ netdev_put(req_info->dev, &req_info->dev_tracker);
+ req_info->dev = NULL;
+ }
+
+ ctx->ops = ops;
+ ctx->req_info = req_info;
+ ctx->reply_data = reply_data;
+ ctx->pos_ifindex = 0;
+
+ return 0;
+
+free_reply_data:
+ kfree(reply_data);
+free_req_info:
+ kfree(req_info);
+
+ return ret;
+}
+
+static int ethnl_perphy_dump_one_dev(struct sk_buff *skb,
+ struct ethnl_perphy_dump_ctx *ctx,
+ const struct genl_info *info)
+{
+ struct ethnl_dump_ctx *ethnl_ctx = &ctx->ethnl_ctx;
+ struct net_device *dev = ethnl_ctx->req_info->dev;
+ struct phy_device_node *pdn;
+ int ret;
+
+ if (!dev->link_topo)
+ return 0;
+
+ xa_for_each_start(&dev->link_topo->phys, ctx->pos_phyindex, pdn,
+ ctx->pos_phyindex) {
+ ethnl_ctx->req_info->phy_index = ctx->pos_phyindex;
+
+ /* We can re-use the original dump_one as ->prepare_data in
+ * commands use ethnl_req_get_phydev(), which gets the PHY from
+ * the req_info->phy_index
+ */
+ ret = ethnl_default_dump_one(skb, dev, ethnl_ctx, info);
+ if (ret)
+ return ret;
+ }
+
+ ctx->pos_phyindex = 0;
+
+ return 0;
+}
+
+static int ethnl_perphy_dump_all_dev(struct sk_buff *skb,
+ struct ethnl_perphy_dump_ctx *ctx,
+ const struct genl_info *info)
+{
+ struct ethnl_dump_ctx *ethnl_ctx = &ctx->ethnl_ctx;
+ struct net *net = sock_net(skb->sk);
+ netdevice_tracker dev_tracker;
+ struct net_device *dev;
+ int ret = 0;
+
+ rcu_read_lock();
+ for_each_netdev_dump(net, dev, ethnl_ctx->pos_ifindex) {
+ netdev_hold(dev, &dev_tracker, GFP_ATOMIC);
+ rcu_read_unlock();
+
+ /* per-PHY commands use ethnl_req_get_phydev(), which needs the
+ * net_device in the req_info
+ */
+ ethnl_ctx->req_info->dev = dev;
+ ret = ethnl_perphy_dump_one_dev(skb, ctx, info);
+
+ rcu_read_lock();
+ netdev_put(dev, &dev_tracker);
+ ethnl_ctx->req_info->dev = NULL;
+
+ if (ret < 0 && ret != -EOPNOTSUPP) {
+ if (likely(skb->len))
+ ret = skb->len;
+ break;
+ }
+ ret = 0;
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+
+/* per-PHY ->dumpit() handler for GET requests. */
+static int ethnl_perphy_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct ethnl_perphy_dump_ctx *ctx = ethnl_perphy_dump_context(cb);
+ const struct genl_dumpit_info *info = genl_dumpit_info(cb);
+ struct ethnl_dump_ctx *ethnl_ctx = &ctx->ethnl_ctx;
+ int ret = 0;
+
+ if (ctx->ifindex) {
+ netdevice_tracker dev_tracker;
+ struct net_device *dev;
+
+ dev = netdev_get_by_index(genl_info_net(&info->info),
+ ctx->ifindex, &dev_tracker,
+ GFP_KERNEL);
+ if (!dev)
+ return -ENODEV;
+
+ ethnl_ctx->req_info->dev = dev;
+ ret = ethnl_perphy_dump_one_dev(skb, ctx, genl_info_dump(cb));
+
+ if (ret < 0 && ret != -EOPNOTSUPP && likely(skb->len))
+ ret = skb->len;
+
+ netdev_put(dev, &dev_tracker);
+ } else {
+ ret = ethnl_perphy_dump_all_dev(skb, ctx, genl_info_dump(cb));
+ }
+
+ return ret;
+}
+
+/* per-PHY ->done() handler for GET requests */
+static int ethnl_perphy_done(struct netlink_callback *cb)
+{
+ struct ethnl_perphy_dump_ctx *ctx = ethnl_perphy_dump_context(cb);
+ struct ethnl_dump_ctx *ethnl_ctx = &ctx->ethnl_ctx;
+
+ kfree(ethnl_ctx->reply_data);
+ kfree(ethnl_ctx->req_info);
+
+ return 0;
+}
+
+/* default ->done() handler for GET requests */
+static int ethnl_default_done(struct netlink_callback *cb)
+{
+ struct ethnl_dump_ctx *ctx = ethnl_dump_context(cb);
+
+ kfree(ctx->reply_data);
+ kfree(ctx->req_info);
+
+ return 0;
+}
+
+static int ethnl_default_set_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ const struct ethnl_request_ops *ops;
+ const u8 cmd = info->genlhdr->cmd;
+ struct ethnl_req_info *req_info;
+ struct net_device *dev;
+ int ret;
+
+ ops = ethnl_default_requests[cmd];
+ if (WARN_ONCE(!ops, "cmd %u has no ethnl_request_ops\n", cmd))
+ return -EOPNOTSUPP;
+ if (GENL_REQ_ATTR_CHECK(info, ops->hdr_attr))
+ return -EINVAL;
+
+ req_info = kzalloc(ops->req_info_size, GFP_KERNEL);
+ if (!req_info)
+ return -ENOMEM;
+
+ ret = ethnl_default_parse(req_info, info, ops, true);
+ if (ret < 0)
+ goto out_free_req;
+
+ if (ops->set_validate) {
+ ret = ops->set_validate(req_info, info);
+ /* 0 means nothing to do */
+ if (ret <= 0)
+ goto out_dev;
+ }
+
+ dev = req_info->dev;
+
+ rtnl_lock();
+ netdev_lock_ops(dev);
+ dev->cfg_pending = kmemdup(dev->cfg, sizeof(*dev->cfg),
+ GFP_KERNEL_ACCOUNT);
+ if (!dev->cfg_pending) {
+ ret = -ENOMEM;
+ goto out_tie_cfg;
+ }
+
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ goto out_free_cfg;
+
+ ret = ops->set(req_info, info);
+ if (ret < 0)
+ goto out_ops;
+
+ swap(dev->cfg, dev->cfg_pending);
+ if (!ret)
+ goto out_ops;
+ ethnl_notify(dev, ops->set_ntf_cmd, req_info);
+
+ ret = 0;
+out_ops:
+ ethnl_ops_complete(dev);
+out_free_cfg:
+ kfree(dev->cfg_pending);
+out_tie_cfg:
+ dev->cfg_pending = dev->cfg;
+ netdev_unlock_ops(dev);
+ rtnl_unlock();
+out_dev:
+ ethnl_parse_header_dev_put(req_info);
+out_free_req:
+ kfree(req_info);
+ return ret;
+}
+
+static const struct ethnl_request_ops *
+ethnl_default_notify_ops[ETHTOOL_MSG_KERNEL_MAX + 1] = {
+ [ETHTOOL_MSG_LINKINFO_NTF] = &ethnl_linkinfo_request_ops,
+ [ETHTOOL_MSG_LINKMODES_NTF] = &ethnl_linkmodes_request_ops,
+ [ETHTOOL_MSG_DEBUG_NTF] = &ethnl_debug_request_ops,
+ [ETHTOOL_MSG_WOL_NTF] = &ethnl_wol_request_ops,
+ [ETHTOOL_MSG_FEATURES_NTF] = &ethnl_features_request_ops,
+ [ETHTOOL_MSG_PRIVFLAGS_NTF] = &ethnl_privflags_request_ops,
+ [ETHTOOL_MSG_RINGS_NTF] = &ethnl_rings_request_ops,
+ [ETHTOOL_MSG_CHANNELS_NTF] = &ethnl_channels_request_ops,
+ [ETHTOOL_MSG_COALESCE_NTF] = &ethnl_coalesce_request_ops,
+ [ETHTOOL_MSG_PAUSE_NTF] = &ethnl_pause_request_ops,
+ [ETHTOOL_MSG_EEE_NTF] = &ethnl_eee_request_ops,
+ [ETHTOOL_MSG_FEC_NTF] = &ethnl_fec_request_ops,
+ [ETHTOOL_MSG_MODULE_NTF] = &ethnl_module_request_ops,
+ [ETHTOOL_MSG_PLCA_NTF] = &ethnl_plca_cfg_request_ops,
+ [ETHTOOL_MSG_MM_NTF] = &ethnl_mm_request_ops,
+ [ETHTOOL_MSG_RSS_NTF] = &ethnl_rss_request_ops,
+ [ETHTOOL_MSG_RSS_CREATE_NTF] = &ethnl_rss_request_ops,
+};
+
+/* default notification handler */
+static void ethnl_default_notify(struct net_device *dev, unsigned int cmd,
+ const struct ethnl_req_info *orig_req_info)
+{
+ struct ethnl_reply_data *reply_data;
+ const struct ethnl_request_ops *ops;
+ struct ethnl_req_info *req_info;
+ struct genl_info info;
+ struct sk_buff *skb;
+ void *reply_payload;
+ int reply_len;
+ int ret;
+
+ genl_info_init_ntf(&info, &ethtool_genl_family, cmd);
+
+ if (WARN_ONCE(cmd > ETHTOOL_MSG_KERNEL_MAX ||
+ !ethnl_default_notify_ops[cmd],
+ "unexpected notification type %u\n", cmd))
+ return;
+ ops = ethnl_default_notify_ops[cmd];
+ req_info = kzalloc(ops->req_info_size, GFP_KERNEL);
+ if (!req_info)
+ return;
+ reply_data = kmalloc(ops->reply_data_size, GFP_KERNEL);
+ if (!reply_data) {
+ kfree(req_info);
+ return;
+ }
+
+ req_info->dev = dev;
+ req_info->flags |= ETHTOOL_FLAG_COMPACT_BITSETS;
+ if (orig_req_info) {
+ req_info->phy_index = orig_req_info->phy_index;
+ memcpy(&req_info[1], &orig_req_info[1],
+ ops->req_info_size - sizeof(*req_info));
+ }
+
+ netdev_ops_assert_locked(dev);
+
+ ethnl_init_reply_data(reply_data, ops, dev);
+ ret = ops->prepare_data(req_info, reply_data, &info);
+ if (ret < 0)
+ goto err_rep;
+ ret = ops->reply_size(req_info, reply_data);
+ if (ret < 0)
+ goto err_cleanup;
+ reply_len = ret + ethnl_reply_header_size();
+ skb = genlmsg_new(reply_len, GFP_KERNEL);
+ if (!skb)
+ goto err_cleanup;
+ reply_payload = ethnl_bcastmsg_put(skb, cmd);
+ if (!reply_payload)
+ goto err_skb;
+ ret = ethnl_fill_reply_header(skb, dev, ops->hdr_attr);
+ if (ret < 0)
+ goto err_msg;
+ ret = ops->fill_reply(skb, req_info, reply_data);
+ if (ret < 0)
+ goto err_msg;
+ if (ops->cleanup_data)
+ ops->cleanup_data(reply_data);
+
+ genlmsg_end(skb, reply_payload);
+ kfree(reply_data);
+ kfree(req_info);
+ ethnl_multicast(skb, dev);
+ return;
+
+err_msg:
+ WARN_ONCE(ret == -EMSGSIZE,
+ "calculated message payload length (%d) not sufficient\n",
+ reply_len);
+err_skb:
+ nlmsg_free(skb);
+err_cleanup:
+ if (ops->cleanup_data)
+ ops->cleanup_data(reply_data);
+err_rep:
+ kfree(reply_data);
+ kfree(req_info);
+ return;
+}
+
+/* notifications */
+
+typedef void (*ethnl_notify_handler_t)(struct net_device *dev, unsigned int cmd,
+ const struct ethnl_req_info *req_info);
+
+static const ethnl_notify_handler_t ethnl_notify_handlers[] = {
+ [ETHTOOL_MSG_LINKINFO_NTF] = ethnl_default_notify,
+ [ETHTOOL_MSG_LINKMODES_NTF] = ethnl_default_notify,
+ [ETHTOOL_MSG_DEBUG_NTF] = ethnl_default_notify,
+ [ETHTOOL_MSG_WOL_NTF] = ethnl_default_notify,
+ [ETHTOOL_MSG_FEATURES_NTF] = ethnl_default_notify,
+ [ETHTOOL_MSG_PRIVFLAGS_NTF] = ethnl_default_notify,
+ [ETHTOOL_MSG_RINGS_NTF] = ethnl_default_notify,
+ [ETHTOOL_MSG_CHANNELS_NTF] = ethnl_default_notify,
+ [ETHTOOL_MSG_COALESCE_NTF] = ethnl_default_notify,
+ [ETHTOOL_MSG_PAUSE_NTF] = ethnl_default_notify,
+ [ETHTOOL_MSG_EEE_NTF] = ethnl_default_notify,
+ [ETHTOOL_MSG_FEC_NTF] = ethnl_default_notify,
+ [ETHTOOL_MSG_MODULE_NTF] = ethnl_default_notify,
+ [ETHTOOL_MSG_PLCA_NTF] = ethnl_default_notify,
+ [ETHTOOL_MSG_MM_NTF] = ethnl_default_notify,
+ [ETHTOOL_MSG_RSS_NTF] = ethnl_default_notify,
+ [ETHTOOL_MSG_RSS_CREATE_NTF] = ethnl_default_notify,
+};
+
+void ethnl_notify(struct net_device *dev, unsigned int cmd,
+ const struct ethnl_req_info *req_info)
+{
+ if (unlikely(!ethnl_ok))
+ return;
+ ASSERT_RTNL();
+
+ if (likely(cmd < ARRAY_SIZE(ethnl_notify_handlers) &&
+ ethnl_notify_handlers[cmd]))
+ ethnl_notify_handlers[cmd](dev, cmd, req_info);
+ else
+ WARN_ONCE(1, "notification %u not implemented (dev=%s)\n",
+ cmd, netdev_name(dev));
+}
+
+void ethtool_notify(struct net_device *dev, unsigned int cmd)
+{
+ ethnl_notify(dev, cmd, NULL);
+}
+EXPORT_SYMBOL(ethtool_notify);
+
+static void ethnl_notify_features(struct netdev_notifier_info *info)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(info);
+
+ ethtool_notify(dev, ETHTOOL_MSG_FEATURES_NTF);
+}
+
+static int ethnl_netdev_event(struct notifier_block *this, unsigned long event,
+ void *ptr)
+{
+ struct netdev_notifier_info *info = ptr;
+ struct netlink_ext_ack *extack;
+ struct net_device *dev;
+
+ dev = netdev_notifier_info_to_dev(info);
+ extack = netdev_notifier_info_to_extack(info);
+
+ switch (event) {
+ case NETDEV_FEAT_CHANGE:
+ ethnl_notify_features(ptr);
+ break;
+ case NETDEV_PRE_UP:
+ if (dev->ethtool->module_fw_flash_in_progress) {
+ NL_SET_ERR_MSG(extack, "Can't set port up while flashing module firmware");
+ return NOTIFY_BAD;
+ }
+ }
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block ethnl_netdev_notifier = {
+ .notifier_call = ethnl_netdev_event,
+};
+
+/* genetlink setup */
+
+static const struct genl_ops ethtool_genl_ops[] = {
+ {
+ .cmd = ETHTOOL_MSG_STRSET_GET,
+ .doit = ethnl_default_doit,
+ .start = ethnl_default_start,
+ .dumpit = ethnl_default_dumpit,
+ .done = ethnl_default_done,
+ .policy = ethnl_strset_get_policy,
+ .maxattr = ARRAY_SIZE(ethnl_strset_get_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_LINKINFO_GET,
+ .doit = ethnl_default_doit,
+ .start = ethnl_default_start,
+ .dumpit = ethnl_default_dumpit,
+ .done = ethnl_default_done,
+ .policy = ethnl_linkinfo_get_policy,
+ .maxattr = ARRAY_SIZE(ethnl_linkinfo_get_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_LINKINFO_SET,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = ethnl_default_set_doit,
+ .policy = ethnl_linkinfo_set_policy,
+ .maxattr = ARRAY_SIZE(ethnl_linkinfo_set_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_LINKMODES_GET,
+ .doit = ethnl_default_doit,
+ .start = ethnl_default_start,
+ .dumpit = ethnl_default_dumpit,
+ .done = ethnl_default_done,
+ .policy = ethnl_linkmodes_get_policy,
+ .maxattr = ARRAY_SIZE(ethnl_linkmodes_get_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_LINKMODES_SET,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = ethnl_default_set_doit,
+ .policy = ethnl_linkmodes_set_policy,
+ .maxattr = ARRAY_SIZE(ethnl_linkmodes_set_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_LINKSTATE_GET,
+ .doit = ethnl_default_doit,
+ .start = ethnl_default_start,
+ .dumpit = ethnl_default_dumpit,
+ .done = ethnl_default_done,
+ .policy = ethnl_linkstate_get_policy,
+ .maxattr = ARRAY_SIZE(ethnl_linkstate_get_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_DEBUG_GET,
+ .doit = ethnl_default_doit,
+ .start = ethnl_default_start,
+ .dumpit = ethnl_default_dumpit,
+ .done = ethnl_default_done,
+ .policy = ethnl_debug_get_policy,
+ .maxattr = ARRAY_SIZE(ethnl_debug_get_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_DEBUG_SET,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = ethnl_default_set_doit,
+ .policy = ethnl_debug_set_policy,
+ .maxattr = ARRAY_SIZE(ethnl_debug_set_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_WOL_GET,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = ethnl_default_doit,
+ .start = ethnl_default_start,
+ .dumpit = ethnl_default_dumpit,
+ .done = ethnl_default_done,
+ .policy = ethnl_wol_get_policy,
+ .maxattr = ARRAY_SIZE(ethnl_wol_get_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_WOL_SET,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = ethnl_default_set_doit,
+ .policy = ethnl_wol_set_policy,
+ .maxattr = ARRAY_SIZE(ethnl_wol_set_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_FEATURES_GET,
+ .doit = ethnl_default_doit,
+ .start = ethnl_default_start,
+ .dumpit = ethnl_default_dumpit,
+ .done = ethnl_default_done,
+ .policy = ethnl_features_get_policy,
+ .maxattr = ARRAY_SIZE(ethnl_features_get_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_FEATURES_SET,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = ethnl_set_features,
+ .policy = ethnl_features_set_policy,
+ .maxattr = ARRAY_SIZE(ethnl_features_set_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_PRIVFLAGS_GET,
+ .doit = ethnl_default_doit,
+ .start = ethnl_default_start,
+ .dumpit = ethnl_default_dumpit,
+ .done = ethnl_default_done,
+ .policy = ethnl_privflags_get_policy,
+ .maxattr = ARRAY_SIZE(ethnl_privflags_get_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_PRIVFLAGS_SET,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = ethnl_default_set_doit,
+ .policy = ethnl_privflags_set_policy,
+ .maxattr = ARRAY_SIZE(ethnl_privflags_set_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_RINGS_GET,
+ .doit = ethnl_default_doit,
+ .start = ethnl_default_start,
+ .dumpit = ethnl_default_dumpit,
+ .done = ethnl_default_done,
+ .policy = ethnl_rings_get_policy,
+ .maxattr = ARRAY_SIZE(ethnl_rings_get_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_RINGS_SET,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = ethnl_default_set_doit,
+ .policy = ethnl_rings_set_policy,
+ .maxattr = ARRAY_SIZE(ethnl_rings_set_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_CHANNELS_GET,
+ .doit = ethnl_default_doit,
+ .start = ethnl_default_start,
+ .dumpit = ethnl_default_dumpit,
+ .done = ethnl_default_done,
+ .policy = ethnl_channels_get_policy,
+ .maxattr = ARRAY_SIZE(ethnl_channels_get_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_CHANNELS_SET,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = ethnl_default_set_doit,
+ .policy = ethnl_channels_set_policy,
+ .maxattr = ARRAY_SIZE(ethnl_channels_set_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_COALESCE_GET,
+ .doit = ethnl_default_doit,
+ .start = ethnl_default_start,
+ .dumpit = ethnl_default_dumpit,
+ .done = ethnl_default_done,
+ .policy = ethnl_coalesce_get_policy,
+ .maxattr = ARRAY_SIZE(ethnl_coalesce_get_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_COALESCE_SET,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = ethnl_default_set_doit,
+ .policy = ethnl_coalesce_set_policy,
+ .maxattr = ARRAY_SIZE(ethnl_coalesce_set_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_PAUSE_GET,
+ .doit = ethnl_default_doit,
+ .start = ethnl_default_start,
+ .dumpit = ethnl_default_dumpit,
+ .done = ethnl_default_done,
+ .policy = ethnl_pause_get_policy,
+ .maxattr = ARRAY_SIZE(ethnl_pause_get_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_PAUSE_SET,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = ethnl_default_set_doit,
+ .policy = ethnl_pause_set_policy,
+ .maxattr = ARRAY_SIZE(ethnl_pause_set_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_EEE_GET,
+ .doit = ethnl_default_doit,
+ .start = ethnl_default_start,
+ .dumpit = ethnl_default_dumpit,
+ .done = ethnl_default_done,
+ .policy = ethnl_eee_get_policy,
+ .maxattr = ARRAY_SIZE(ethnl_eee_get_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_EEE_SET,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = ethnl_default_set_doit,
+ .policy = ethnl_eee_set_policy,
+ .maxattr = ARRAY_SIZE(ethnl_eee_set_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_TSINFO_GET,
+ .doit = ethnl_default_doit,
+ .start = ethnl_tsinfo_start,
+ .dumpit = ethnl_tsinfo_dumpit,
+ .done = ethnl_tsinfo_done,
+ .policy = ethnl_tsinfo_get_policy,
+ .maxattr = ARRAY_SIZE(ethnl_tsinfo_get_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_CABLE_TEST_ACT,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = ethnl_act_cable_test,
+ .policy = ethnl_cable_test_act_policy,
+ .maxattr = ARRAY_SIZE(ethnl_cable_test_act_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_CABLE_TEST_TDR_ACT,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = ethnl_act_cable_test_tdr,
+ .policy = ethnl_cable_test_tdr_act_policy,
+ .maxattr = ARRAY_SIZE(ethnl_cable_test_tdr_act_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_TUNNEL_INFO_GET,
+ .doit = ethnl_tunnel_info_doit,
+ .start = ethnl_tunnel_info_start,
+ .dumpit = ethnl_tunnel_info_dumpit,
+ .policy = ethnl_tunnel_info_get_policy,
+ .maxattr = ARRAY_SIZE(ethnl_tunnel_info_get_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_FEC_GET,
+ .doit = ethnl_default_doit,
+ .start = ethnl_default_start,
+ .dumpit = ethnl_default_dumpit,
+ .done = ethnl_default_done,
+ .policy = ethnl_fec_get_policy,
+ .maxattr = ARRAY_SIZE(ethnl_fec_get_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_FEC_SET,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = ethnl_default_set_doit,
+ .policy = ethnl_fec_set_policy,
+ .maxattr = ARRAY_SIZE(ethnl_fec_set_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_MODULE_EEPROM_GET,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = ethnl_default_doit,
+ .start = ethnl_default_start,
+ .dumpit = ethnl_default_dumpit,
+ .done = ethnl_default_done,
+ .policy = ethnl_module_eeprom_get_policy,
+ .maxattr = ARRAY_SIZE(ethnl_module_eeprom_get_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_STATS_GET,
+ .doit = ethnl_default_doit,
+ .start = ethnl_default_start,
+ .dumpit = ethnl_default_dumpit,
+ .done = ethnl_default_done,
+ .policy = ethnl_stats_get_policy,
+ .maxattr = ARRAY_SIZE(ethnl_stats_get_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_PHC_VCLOCKS_GET,
+ .doit = ethnl_default_doit,
+ .start = ethnl_default_start,
+ .dumpit = ethnl_default_dumpit,
+ .done = ethnl_default_done,
+ .policy = ethnl_phc_vclocks_get_policy,
+ .maxattr = ARRAY_SIZE(ethnl_phc_vclocks_get_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_MODULE_GET,
+ .doit = ethnl_default_doit,
+ .start = ethnl_default_start,
+ .dumpit = ethnl_default_dumpit,
+ .done = ethnl_default_done,
+ .policy = ethnl_module_get_policy,
+ .maxattr = ARRAY_SIZE(ethnl_module_get_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_MODULE_SET,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = ethnl_default_set_doit,
+ .policy = ethnl_module_set_policy,
+ .maxattr = ARRAY_SIZE(ethnl_module_set_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_PSE_GET,
+ .doit = ethnl_default_doit,
+ .start = ethnl_perphy_start,
+ .dumpit = ethnl_perphy_dumpit,
+ .done = ethnl_perphy_done,
+ .policy = ethnl_pse_get_policy,
+ .maxattr = ARRAY_SIZE(ethnl_pse_get_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_PSE_SET,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = ethnl_default_set_doit,
+ .policy = ethnl_pse_set_policy,
+ .maxattr = ARRAY_SIZE(ethnl_pse_set_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_RSS_GET,
+ .doit = ethnl_default_doit,
+ .start = ethnl_rss_dump_start,
+ .dumpit = ethnl_rss_dumpit,
+ .policy = ethnl_rss_get_policy,
+ .maxattr = ARRAY_SIZE(ethnl_rss_get_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_PLCA_GET_CFG,
+ .doit = ethnl_default_doit,
+ .start = ethnl_perphy_start,
+ .dumpit = ethnl_perphy_dumpit,
+ .done = ethnl_perphy_done,
+ .policy = ethnl_plca_get_cfg_policy,
+ .maxattr = ARRAY_SIZE(ethnl_plca_get_cfg_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_PLCA_SET_CFG,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = ethnl_default_set_doit,
+ .policy = ethnl_plca_set_cfg_policy,
+ .maxattr = ARRAY_SIZE(ethnl_plca_set_cfg_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_PLCA_GET_STATUS,
+ .doit = ethnl_default_doit,
+ .start = ethnl_perphy_start,
+ .dumpit = ethnl_perphy_dumpit,
+ .done = ethnl_perphy_done,
+ .policy = ethnl_plca_get_status_policy,
+ .maxattr = ARRAY_SIZE(ethnl_plca_get_status_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_MM_GET,
+ .doit = ethnl_default_doit,
+ .start = ethnl_default_start,
+ .dumpit = ethnl_default_dumpit,
+ .done = ethnl_default_done,
+ .policy = ethnl_mm_get_policy,
+ .maxattr = ARRAY_SIZE(ethnl_mm_get_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_MM_SET,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = ethnl_default_set_doit,
+ .policy = ethnl_mm_set_policy,
+ .maxattr = ARRAY_SIZE(ethnl_mm_set_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_MODULE_FW_FLASH_ACT,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = ethnl_act_module_fw_flash,
+ .policy = ethnl_module_fw_flash_act_policy,
+ .maxattr = ARRAY_SIZE(ethnl_module_fw_flash_act_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_PHY_GET,
+ .doit = ethnl_default_doit,
+ .start = ethnl_perphy_start,
+ .dumpit = ethnl_perphy_dumpit,
+ .done = ethnl_perphy_done,
+ .policy = ethnl_phy_get_policy,
+ .maxattr = ARRAY_SIZE(ethnl_phy_get_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_TSCONFIG_GET,
+ .doit = ethnl_default_doit,
+ .start = ethnl_default_start,
+ .dumpit = ethnl_default_dumpit,
+ .done = ethnl_default_done,
+ .policy = ethnl_tsconfig_get_policy,
+ .maxattr = ARRAY_SIZE(ethnl_tsconfig_get_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_TSCONFIG_SET,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = ethnl_default_set_doit,
+ .policy = ethnl_tsconfig_set_policy,
+ .maxattr = ARRAY_SIZE(ethnl_tsconfig_set_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_RSS_SET,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = ethnl_default_set_doit,
+ .policy = ethnl_rss_set_policy,
+ .maxattr = ARRAY_SIZE(ethnl_rss_set_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_RSS_CREATE_ACT,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = ethnl_rss_create_doit,
+ .policy = ethnl_rss_create_policy,
+ .maxattr = ARRAY_SIZE(ethnl_rss_create_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_RSS_DELETE_ACT,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = ethnl_rss_delete_doit,
+ .policy = ethnl_rss_delete_policy,
+ .maxattr = ARRAY_SIZE(ethnl_rss_delete_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_MSE_GET,
+ .doit = ethnl_default_doit,
+ .start = ethnl_perphy_start,
+ .dumpit = ethnl_perphy_dumpit,
+ .done = ethnl_perphy_done,
+ .policy = ethnl_mse_get_policy,
+ .maxattr = ARRAY_SIZE(ethnl_mse_get_policy) - 1,
+ },
+};
+
+static const struct genl_multicast_group ethtool_nl_mcgrps[] = {
+ [ETHNL_MCGRP_MONITOR] = { .name = ETHTOOL_MCGRP_MONITOR_NAME },
+};
+
+static struct genl_family ethtool_genl_family __ro_after_init = {
+ .name = ETHTOOL_GENL_NAME,
+ .version = ETHTOOL_GENL_VERSION,
+ .netnsok = true,
+ .parallel_ops = true,
+ .ops = ethtool_genl_ops,
+ .n_ops = ARRAY_SIZE(ethtool_genl_ops),
+ .resv_start_op = ETHTOOL_MSG_MODULE_GET + 1,
+ .mcgrps = ethtool_nl_mcgrps,
+ .n_mcgrps = ARRAY_SIZE(ethtool_nl_mcgrps),
+ .sock_priv_size = sizeof(struct ethnl_sock_priv),
+ .sock_priv_destroy = ethnl_sock_priv_destroy,
+};
+
+/* module setup */
+
+static int __init ethnl_init(void)
+{
+ int ret;
+
+ ret = genl_register_family(&ethtool_genl_family);
+ if (WARN(ret < 0, "ethtool: genetlink family registration failed"))
+ return ret;
+ ethnl_ok = true;
+
+ ret = register_netdevice_notifier(&ethnl_netdev_notifier);
+ WARN(ret < 0, "ethtool: net device notifier registration failed");
+ return ret;
+}
+
+subsys_initcall(ethnl_init);
diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h
new file mode 100644
index 000000000000..89010eaa67df
--- /dev/null
+++ b/net/ethtool/netlink.h
@@ -0,0 +1,525 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef _NET_ETHTOOL_NETLINK_H
+#define _NET_ETHTOOL_NETLINK_H
+
+#include <linux/ethtool_netlink.h>
+#include <linux/netdevice.h>
+#include <net/genetlink.h>
+#include <net/sock.h>
+
+struct ethnl_req_info;
+
+u32 ethnl_bcast_seq_next(void);
+int ethnl_parse_header_dev_get(struct ethnl_req_info *req_info,
+ const struct nlattr *nest, struct net *net,
+ struct netlink_ext_ack *extack,
+ bool require_dev);
+int ethnl_fill_reply_header(struct sk_buff *skb, struct net_device *dev,
+ u16 attrtype);
+struct sk_buff *ethnl_reply_init(size_t payload, struct net_device *dev, u8 cmd,
+ u16 hdr_attrtype, struct genl_info *info,
+ void **ehdrp);
+void *ethnl_dump_put(struct sk_buff *skb, struct netlink_callback *cb, u8 cmd);
+void *ethnl_bcastmsg_put(struct sk_buff *skb, u8 cmd);
+void *ethnl_unicast_put(struct sk_buff *skb, u32 portid, u32 seq, u8 cmd);
+int ethnl_multicast(struct sk_buff *skb, struct net_device *dev);
+void ethnl_notify(struct net_device *dev, unsigned int cmd,
+ const struct ethnl_req_info *req_info);
+
+/**
+ * ethnl_strz_size() - calculate attribute length for fixed size string
+ * @s: ETH_GSTRING_LEN sized string (may not be null terminated)
+ *
+ * Return: total length of an attribute with null terminated string from @s
+ */
+static inline int ethnl_strz_size(const char *s)
+{
+ return nla_total_size(strnlen(s, ETH_GSTRING_LEN) + 1);
+}
+
+/**
+ * ethnl_put_strz() - put string attribute with fixed size string
+ * @skb: skb with the message
+ * @attrtype: attribute type
+ * @s: ETH_GSTRING_LEN sized string (may not be null terminated)
+ *
+ * Puts an attribute with null terminated string from @s into the message.
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static inline int ethnl_put_strz(struct sk_buff *skb, u16 attrtype,
+ const char *s)
+{
+ unsigned int len = strnlen(s, ETH_GSTRING_LEN);
+ struct nlattr *attr;
+
+ attr = nla_reserve(skb, attrtype, len + 1);
+ if (!attr)
+ return -EMSGSIZE;
+
+ memcpy(nla_data(attr), s, len);
+ ((char *)nla_data(attr))[len] = '\0';
+ return 0;
+}
+
+/**
+ * ethnl_update_u32() - update u32 value from NLA_U32 attribute
+ * @dst: value to update
+ * @attr: netlink attribute with new value or null
+ * @mod: pointer to bool for modification tracking
+ *
+ * Copy the u32 value from NLA_U32 netlink attribute @attr into variable
+ * pointed to by @dst; do nothing if @attr is null. Bool pointed to by @mod
+ * is set to true if this function changed the value of *dst, otherwise it
+ * is left as is.
+ */
+static inline void ethnl_update_u32(u32 *dst, const struct nlattr *attr,
+ bool *mod)
+{
+ u32 val;
+
+ if (!attr)
+ return;
+ val = nla_get_u32(attr);
+ if (*dst == val)
+ return;
+
+ *dst = val;
+ *mod = true;
+}
+
+/**
+ * ethnl_update_u8() - update u8 value from NLA_U8 attribute
+ * @dst: value to update
+ * @attr: netlink attribute with new value or null
+ * @mod: pointer to bool for modification tracking
+ *
+ * Copy the u8 value from NLA_U8 netlink attribute @attr into variable
+ * pointed to by @dst; do nothing if @attr is null. Bool pointed to by @mod
+ * is set to true if this function changed the value of *dst, otherwise it
+ * is left as is.
+ */
+static inline void ethnl_update_u8(u8 *dst, const struct nlattr *attr,
+ bool *mod)
+{
+ u8 val;
+
+ if (!attr)
+ return;
+ val = nla_get_u8(attr);
+ if (*dst == val)
+ return;
+
+ *dst = val;
+ *mod = true;
+}
+
+/**
+ * ethnl_update_bool32() - update u32 used as bool from NLA_U8 attribute
+ * @dst: value to update
+ * @attr: netlink attribute with new value or null
+ * @mod: pointer to bool for modification tracking
+ *
+ * Use the u8 value from NLA_U8 netlink attribute @attr to set u32 variable
+ * pointed to by @dst to 0 (if zero) or 1 (if not); do nothing if @attr is
+ * null. Bool pointed to by @mod is set to true if this function changed the
+ * logical value of *dst, otherwise it is left as is.
+ */
+static inline void ethnl_update_bool32(u32 *dst, const struct nlattr *attr,
+ bool *mod)
+{
+ u8 val;
+
+ if (!attr)
+ return;
+ val = !!nla_get_u8(attr);
+ if (!!*dst == val)
+ return;
+
+ *dst = val;
+ *mod = true;
+}
+
+/**
+ * ethnl_update_bool() - updateb bool used as bool from NLA_U8 attribute
+ * @dst: value to update
+ * @attr: netlink attribute with new value or null
+ * @mod: pointer to bool for modification tracking
+ *
+ * Use the bool value from NLA_U8 netlink attribute @attr to set bool variable
+ * pointed to by @dst to 0 (if zero) or 1 (if not); do nothing if @attr is
+ * null. Bool pointed to by @mod is set to true if this function changed the
+ * logical value of *dst, otherwise it is left as is.
+ */
+static inline void ethnl_update_bool(bool *dst, const struct nlattr *attr,
+ bool *mod)
+{
+ u8 val;
+
+ if (!attr)
+ return;
+ val = !!nla_get_u8(attr);
+ if (!!*dst == val)
+ return;
+
+ *dst = val;
+ *mod = true;
+}
+
+/**
+ * ethnl_update_binary() - update binary data from NLA_BINARY attribute
+ * @dst: value to update
+ * @len: destination buffer length
+ * @attr: netlink attribute with new value or null
+ * @mod: pointer to bool for modification tracking
+ *
+ * Use the u8 value from NLA_U8 netlink attribute @attr to rewrite data block
+ * of length @len at @dst by attribute payload; do nothing if @attr is null.
+ * Bool pointed to by @mod is set to true if this function changed the logical
+ * value of *dst, otherwise it is left as is.
+ */
+static inline void ethnl_update_binary(void *dst, unsigned int len,
+ const struct nlattr *attr, bool *mod)
+{
+ if (!attr)
+ return;
+ if (nla_len(attr) < len)
+ len = nla_len(attr);
+ if (!memcmp(dst, nla_data(attr), len))
+ return;
+
+ memcpy(dst, nla_data(attr), len);
+ *mod = true;
+}
+
+/**
+ * ethnl_update_bitfield32() - update u32 value from NLA_BITFIELD32 attribute
+ * @dst: value to update
+ * @attr: netlink attribute with new value or null
+ * @mod: pointer to bool for modification tracking
+ *
+ * Update bits in u32 value which are set in attribute's mask to values from
+ * attribute's value. Do nothing if @attr is null or the value wouldn't change;
+ * otherwise, set bool pointed to by @mod to true.
+ */
+static inline void ethnl_update_bitfield32(u32 *dst, const struct nlattr *attr,
+ bool *mod)
+{
+ struct nla_bitfield32 change;
+ u32 newval;
+
+ if (!attr)
+ return;
+ change = nla_get_bitfield32(attr);
+ newval = (*dst & ~change.selector) | (change.value & change.selector);
+ if (*dst == newval)
+ return;
+
+ *dst = newval;
+ *mod = true;
+}
+
+/**
+ * ethnl_reply_header_size() - total size of reply header
+ *
+ * This is an upper estimate so that we do not need to hold RTNL lock longer
+ * than necessary (to prevent rename between size estimate and composing the
+ * message). Accounts only for device ifindex and name as those are the only
+ * attributes ethnl_fill_reply_header() puts into the reply header.
+ */
+static inline unsigned int ethnl_reply_header_size(void)
+{
+ return nla_total_size(nla_total_size(sizeof(u32)) +
+ nla_total_size(IFNAMSIZ));
+}
+
+/* GET request handling */
+
+/* Unified processing of GET requests uses two data structures: request info
+ * and reply data. Request info holds information parsed from client request
+ * and its stays constant through all request processing. Reply data holds data
+ * retrieved from ethtool_ops callbacks or other internal sources which is used
+ * to compose the reply. When processing a dump request, request info is filled
+ * only once (when the request message is parsed) but reply data is filled for
+ * each reply message.
+ *
+ * Both structures consist of part common for all request types (struct
+ * ethnl_req_info and struct ethnl_reply_data defined below) and optional
+ * parts specific for each request type. Common part always starts at offset 0.
+ */
+
+/**
+ * struct ethnl_req_info - base type of request information for GET requests
+ * @dev: network device the request is for (may be null)
+ * @dev_tracker: refcount tracker for @dev reference
+ * @flags: request flags common for all request types
+ * @phy_index: phy_device index connected to @dev this request is for. Can be
+ * 0 if the request doesn't target a phy, or if the @dev's attached
+ * phy is targeted.
+ *
+ * This is a common base for request specific structures holding data from
+ * parsed userspace request. These always embed struct ethnl_req_info at
+ * zero offset.
+ */
+struct ethnl_req_info {
+ struct net_device *dev;
+ netdevice_tracker dev_tracker;
+ u32 flags;
+ u32 phy_index;
+};
+
+static inline void ethnl_parse_header_dev_put(struct ethnl_req_info *req_info)
+{
+ netdev_put(req_info->dev, &req_info->dev_tracker);
+}
+
+/**
+ * ethnl_req_get_phydev() - Gets the phy_device targeted by this request,
+ * if any. Must be called under rntl_lock().
+ * @req_info: The ethnl request to get the phy from.
+ * @tb: The netlink attributes array, for error reporting.
+ * @header: The netlink header index, used for error reporting.
+ * @extack: The netlink extended ACK, for error reporting.
+ *
+ * The caller must hold RTNL, until it's done interacting with the returned
+ * phy_device.
+ *
+ * Return: A phy_device pointer corresponding either to the passed phy_index
+ * if one is provided. If not, the phy_device attached to the
+ * net_device targeted by this request is returned. If there's no
+ * targeted net_device, or no phy_device is attached, NULL is
+ * returned. If the provided phy_index is invalid, an error pointer
+ * is returned.
+ */
+struct phy_device *ethnl_req_get_phydev(const struct ethnl_req_info *req_info,
+ struct nlattr **tb, unsigned int header,
+ struct netlink_ext_ack *extack);
+
+/**
+ * struct ethnl_reply_data - base type of reply data for GET requests
+ * @dev: device for current reply message; in single shot requests it is
+ * equal to &ethnl_req_info.dev; in dumps it's different for each
+ * reply message
+ *
+ * This is a common base for request specific structures holding data for
+ * kernel reply message. These always embed struct ethnl_reply_data at zero
+ * offset.
+ */
+struct ethnl_reply_data {
+ struct net_device *dev;
+};
+
+int ethnl_ops_begin(struct net_device *dev);
+void ethnl_ops_complete(struct net_device *dev);
+
+enum ethnl_sock_type {
+ ETHTOOL_SOCK_TYPE_MODULE_FW_FLASH,
+};
+
+struct ethnl_sock_priv {
+ struct net_device *dev;
+ u32 portid;
+ enum ethnl_sock_type type;
+};
+
+int ethnl_sock_priv_set(struct sk_buff *skb, struct net_device *dev, u32 portid,
+ enum ethnl_sock_type type);
+
+/**
+ * struct ethnl_request_ops - unified handling of GET and SET requests
+ * @request_cmd: command id for request (GET)
+ * @reply_cmd: command id for reply (GET_REPLY)
+ * @hdr_attr: attribute type for request header
+ * @req_info_size: size of request info
+ * @reply_data_size: size of reply data
+ * @allow_nodev_do: allow non-dump request with no device identification
+ * @set_ntf_cmd: notification to generate on changes (SET)
+ * @parse_request:
+ * Parse request except common header (struct ethnl_req_info). Common
+ * header is already filled on entry, the rest up to @repdata_offset
+ * is zero initialized. This callback should only modify type specific
+ * request info by parsed attributes from request message.
+ * Called for both GET and SET. Information parsed for SET will
+ * be conveyed to the req_info used during NTF generation.
+ * @prepare_data:
+ * Retrieve and prepare data needed to compose a reply message. Calls to
+ * ethtool_ops handlers are limited to this callback. Common reply data
+ * (struct ethnl_reply_data) is filled on entry, type specific part after
+ * it is zero initialized. This callback should only modify the type
+ * specific part of reply data. Device identification from struct
+ * ethnl_reply_data is to be used as for dump requests, it iterates
+ * through network devices while dev member of struct ethnl_req_info
+ * points to the device from client request.
+ * @reply_size:
+ * Estimate reply message size. Returned value must be sufficient for
+ * message payload without common reply header. The callback may returned
+ * estimate higher than actual message size if exact calculation would
+ * not be worth the saved memory space.
+ * @fill_reply:
+ * Fill reply message payload (except for common header) from reply data.
+ * The callback must not generate more payload than previously called
+ * ->reply_size() estimated.
+ * @cleanup_data:
+ * Optional cleanup called when reply data is no longer needed. Can be
+ * used e.g. to free any additional data structures outside the main
+ * structure which were allocated by ->prepare_data(). When processing
+ * dump requests, ->cleanup() is called for each message.
+ * @set_validate:
+ * Check if set operation is supported for a given device, and perform
+ * extra input checks. Expected return values:
+ * - 0 if the operation is a noop for the device (rare)
+ * - 1 if operation should proceed to calling @set
+ * - negative errno on errors
+ * Called without any locks, just a reference on the netdev.
+ * @set:
+ * Execute the set operation. The implementation should return
+ * - 0 if no configuration has changed
+ * - 1 if configuration changed and notification should be generated
+ * - negative errno on errors
+ *
+ * Description of variable parts of GET request handling when using the
+ * unified infrastructure. When used, a pointer to an instance of this
+ * structure is to be added to &ethnl_default_requests array and generic
+ * handlers ethnl_default_doit(), ethnl_default_dumpit(),
+ * ethnl_default_start() and ethnl_default_done() used in @ethtool_genl_ops;
+ * ethnl_default_notify() can be used in @ethnl_notify_handlers to send
+ * notifications of the corresponding type.
+ */
+struct ethnl_request_ops {
+ u8 request_cmd;
+ u8 reply_cmd;
+ u16 hdr_attr;
+ unsigned int req_info_size;
+ unsigned int reply_data_size;
+ bool allow_nodev_do;
+ u8 set_ntf_cmd;
+
+ int (*parse_request)(struct ethnl_req_info *req_info,
+ struct nlattr **tb,
+ struct netlink_ext_ack *extack);
+ int (*prepare_data)(const struct ethnl_req_info *req_info,
+ struct ethnl_reply_data *reply_data,
+ const struct genl_info *info);
+ int (*reply_size)(const struct ethnl_req_info *req_info,
+ const struct ethnl_reply_data *reply_data);
+ int (*fill_reply)(struct sk_buff *skb,
+ const struct ethnl_req_info *req_info,
+ const struct ethnl_reply_data *reply_data);
+ void (*cleanup_data)(struct ethnl_reply_data *reply_data);
+
+ int (*set_validate)(struct ethnl_req_info *req_info,
+ struct genl_info *info);
+ int (*set)(struct ethnl_req_info *req_info,
+ struct genl_info *info);
+};
+
+/* request handlers */
+
+extern const struct ethnl_request_ops ethnl_strset_request_ops;
+extern const struct ethnl_request_ops ethnl_linkinfo_request_ops;
+extern const struct ethnl_request_ops ethnl_linkmodes_request_ops;
+extern const struct ethnl_request_ops ethnl_linkstate_request_ops;
+extern const struct ethnl_request_ops ethnl_debug_request_ops;
+extern const struct ethnl_request_ops ethnl_wol_request_ops;
+extern const struct ethnl_request_ops ethnl_features_request_ops;
+extern const struct ethnl_request_ops ethnl_privflags_request_ops;
+extern const struct ethnl_request_ops ethnl_rings_request_ops;
+extern const struct ethnl_request_ops ethnl_channels_request_ops;
+extern const struct ethnl_request_ops ethnl_coalesce_request_ops;
+extern const struct ethnl_request_ops ethnl_pause_request_ops;
+extern const struct ethnl_request_ops ethnl_eee_request_ops;
+extern const struct ethnl_request_ops ethnl_tsinfo_request_ops;
+extern const struct ethnl_request_ops ethnl_fec_request_ops;
+extern const struct ethnl_request_ops ethnl_module_eeprom_request_ops;
+extern const struct ethnl_request_ops ethnl_stats_request_ops;
+extern const struct ethnl_request_ops ethnl_phc_vclocks_request_ops;
+extern const struct ethnl_request_ops ethnl_module_request_ops;
+extern const struct ethnl_request_ops ethnl_pse_request_ops;
+extern const struct ethnl_request_ops ethnl_rss_request_ops;
+extern const struct ethnl_request_ops ethnl_plca_cfg_request_ops;
+extern const struct ethnl_request_ops ethnl_plca_status_request_ops;
+extern const struct ethnl_request_ops ethnl_mm_request_ops;
+extern const struct ethnl_request_ops ethnl_phy_request_ops;
+extern const struct ethnl_request_ops ethnl_tsconfig_request_ops;
+extern const struct ethnl_request_ops ethnl_mse_request_ops;
+
+extern const struct nla_policy ethnl_header_policy[ETHTOOL_A_HEADER_FLAGS + 1];
+extern const struct nla_policy ethnl_header_policy_stats[ETHTOOL_A_HEADER_FLAGS + 1];
+extern const struct nla_policy ethnl_header_policy_phy[ETHTOOL_A_HEADER_PHY_INDEX + 1];
+extern const struct nla_policy ethnl_header_policy_phy_stats[ETHTOOL_A_HEADER_PHY_INDEX + 1];
+extern const struct nla_policy ethnl_strset_get_policy[ETHTOOL_A_STRSET_COUNTS_ONLY + 1];
+extern const struct nla_policy ethnl_linkinfo_get_policy[ETHTOOL_A_LINKINFO_HEADER + 1];
+extern const struct nla_policy ethnl_linkinfo_set_policy[ETHTOOL_A_LINKINFO_TP_MDIX_CTRL + 1];
+extern const struct nla_policy ethnl_linkmodes_get_policy[ETHTOOL_A_LINKMODES_HEADER + 1];
+extern const struct nla_policy ethnl_linkmodes_set_policy[ETHTOOL_A_LINKMODES_LANES + 1];
+extern const struct nla_policy ethnl_linkstate_get_policy[ETHTOOL_A_LINKSTATE_HEADER + 1];
+extern const struct nla_policy ethnl_debug_get_policy[ETHTOOL_A_DEBUG_HEADER + 1];
+extern const struct nla_policy ethnl_debug_set_policy[ETHTOOL_A_DEBUG_MSGMASK + 1];
+extern const struct nla_policy ethnl_wol_get_policy[ETHTOOL_A_WOL_HEADER + 1];
+extern const struct nla_policy ethnl_wol_set_policy[ETHTOOL_A_WOL_SOPASS + 1];
+extern const struct nla_policy ethnl_features_get_policy[ETHTOOL_A_FEATURES_HEADER + 1];
+extern const struct nla_policy ethnl_features_set_policy[ETHTOOL_A_FEATURES_WANTED + 1];
+extern const struct nla_policy ethnl_privflags_get_policy[ETHTOOL_A_PRIVFLAGS_HEADER + 1];
+extern const struct nla_policy ethnl_privflags_set_policy[ETHTOOL_A_PRIVFLAGS_FLAGS + 1];
+extern const struct nla_policy ethnl_rings_get_policy[ETHTOOL_A_RINGS_HEADER + 1];
+extern const struct nla_policy ethnl_rings_set_policy[ETHTOOL_A_RINGS_HDS_THRESH_MAX + 1];
+extern const struct nla_policy ethnl_channels_get_policy[ETHTOOL_A_CHANNELS_HEADER + 1];
+extern const struct nla_policy ethnl_channels_set_policy[ETHTOOL_A_CHANNELS_COMBINED_COUNT + 1];
+extern const struct nla_policy ethnl_coalesce_get_policy[ETHTOOL_A_COALESCE_HEADER + 1];
+extern const struct nla_policy ethnl_coalesce_set_policy[ETHTOOL_A_COALESCE_MAX + 1];
+extern const struct nla_policy ethnl_pause_get_policy[ETHTOOL_A_PAUSE_STATS_SRC + 1];
+extern const struct nla_policy ethnl_pause_set_policy[ETHTOOL_A_PAUSE_STATS_SRC + 1];
+extern const struct nla_policy ethnl_eee_get_policy[ETHTOOL_A_EEE_HEADER + 1];
+extern const struct nla_policy ethnl_eee_set_policy[ETHTOOL_A_EEE_TX_LPI_TIMER + 1];
+extern const struct nla_policy ethnl_tsinfo_get_policy[ETHTOOL_A_TSINFO_MAX + 1];
+extern const struct nla_policy ethnl_cable_test_act_policy[ETHTOOL_A_CABLE_TEST_HEADER + 1];
+extern const struct nla_policy ethnl_cable_test_tdr_act_policy[ETHTOOL_A_CABLE_TEST_TDR_CFG + 1];
+extern const struct nla_policy ethnl_tunnel_info_get_policy[ETHTOOL_A_TUNNEL_INFO_HEADER + 1];
+extern const struct nla_policy ethnl_fec_get_policy[ETHTOOL_A_FEC_HEADER + 1];
+extern const struct nla_policy ethnl_fec_set_policy[ETHTOOL_A_FEC_AUTO + 1];
+extern const struct nla_policy ethnl_module_eeprom_get_policy[ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS + 1];
+extern const struct nla_policy ethnl_stats_get_policy[ETHTOOL_A_STATS_SRC + 1];
+extern const struct nla_policy ethnl_phc_vclocks_get_policy[ETHTOOL_A_PHC_VCLOCKS_HEADER + 1];
+extern const struct nla_policy ethnl_module_get_policy[ETHTOOL_A_MODULE_HEADER + 1];
+extern const struct nla_policy ethnl_module_set_policy[ETHTOOL_A_MODULE_POWER_MODE_POLICY + 1];
+extern const struct nla_policy ethnl_pse_get_policy[ETHTOOL_A_PSE_HEADER + 1];
+extern const struct nla_policy ethnl_pse_set_policy[ETHTOOL_A_PSE_MAX + 1];
+extern const struct nla_policy ethnl_rss_get_policy[ETHTOOL_A_RSS_START_CONTEXT + 1];
+extern const struct nla_policy ethnl_rss_set_policy[ETHTOOL_A_RSS_FLOW_HASH + 1];
+extern const struct nla_policy ethnl_rss_create_policy[ETHTOOL_A_RSS_INPUT_XFRM + 1];
+extern const struct nla_policy ethnl_rss_delete_policy[ETHTOOL_A_RSS_CONTEXT + 1];
+extern const struct nla_policy ethnl_plca_get_cfg_policy[ETHTOOL_A_PLCA_HEADER + 1];
+extern const struct nla_policy ethnl_plca_set_cfg_policy[ETHTOOL_A_PLCA_MAX + 1];
+extern const struct nla_policy ethnl_plca_get_status_policy[ETHTOOL_A_PLCA_HEADER + 1];
+extern const struct nla_policy ethnl_mm_get_policy[ETHTOOL_A_MM_HEADER + 1];
+extern const struct nla_policy ethnl_mm_set_policy[ETHTOOL_A_MM_MAX + 1];
+extern const struct nla_policy ethnl_module_fw_flash_act_policy[ETHTOOL_A_MODULE_FW_FLASH_PASSWORD + 1];
+extern const struct nla_policy ethnl_phy_get_policy[ETHTOOL_A_PHY_HEADER + 1];
+extern const struct nla_policy ethnl_tsconfig_get_policy[ETHTOOL_A_TSCONFIG_HEADER + 1];
+extern const struct nla_policy ethnl_tsconfig_set_policy[ETHTOOL_A_TSCONFIG_MAX + 1];
+extern const struct nla_policy ethnl_mse_get_policy[ETHTOOL_A_MSE_HEADER + 1];
+
+int ethnl_set_features(struct sk_buff *skb, struct genl_info *info);
+int ethnl_act_cable_test(struct sk_buff *skb, struct genl_info *info);
+int ethnl_act_cable_test_tdr(struct sk_buff *skb, struct genl_info *info);
+int ethnl_tunnel_info_doit(struct sk_buff *skb, struct genl_info *info);
+int ethnl_tunnel_info_start(struct netlink_callback *cb);
+int ethnl_tunnel_info_dumpit(struct sk_buff *skb, struct netlink_callback *cb);
+int ethnl_act_module_fw_flash(struct sk_buff *skb, struct genl_info *info);
+int ethnl_rss_dump_start(struct netlink_callback *cb);
+int ethnl_rss_dumpit(struct sk_buff *skb, struct netlink_callback *cb);
+int ethnl_tsinfo_start(struct netlink_callback *cb);
+int ethnl_tsinfo_dumpit(struct sk_buff *skb, struct netlink_callback *cb);
+int ethnl_tsinfo_done(struct netlink_callback *cb);
+int ethnl_rss_create_doit(struct sk_buff *skb, struct genl_info *info);
+int ethnl_rss_delete_doit(struct sk_buff *skb, struct genl_info *info);
+
+extern const char stats_std_names[__ETHTOOL_STATS_CNT][ETH_GSTRING_LEN];
+extern const char stats_eth_phy_names[__ETHTOOL_A_STATS_ETH_PHY_CNT][ETH_GSTRING_LEN];
+extern const char stats_eth_mac_names[__ETHTOOL_A_STATS_ETH_MAC_CNT][ETH_GSTRING_LEN];
+extern const char stats_eth_ctrl_names[__ETHTOOL_A_STATS_ETH_CTRL_CNT][ETH_GSTRING_LEN];
+extern const char stats_rmon_names[__ETHTOOL_A_STATS_RMON_CNT][ETH_GSTRING_LEN];
+extern const char stats_phy_names[__ETHTOOL_A_STATS_PHY_CNT][ETH_GSTRING_LEN];
+
+#endif /* _NET_ETHTOOL_NETLINK_H */
diff --git a/net/ethtool/pause.c b/net/ethtool/pause.c
new file mode 100644
index 000000000000..0f9af1e66548
--- /dev/null
+++ b/net/ethtool/pause.c
@@ -0,0 +1,219 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include "netlink.h"
+#include "common.h"
+
+struct pause_req_info {
+ struct ethnl_req_info base;
+ enum ethtool_mac_stats_src src;
+};
+
+#define PAUSE_REQINFO(__req_base) \
+ container_of(__req_base, struct pause_req_info, base)
+
+struct pause_reply_data {
+ struct ethnl_reply_data base;
+ struct ethtool_pauseparam pauseparam;
+ struct ethtool_pause_stats pausestat;
+};
+
+#define PAUSE_REPDATA(__reply_base) \
+ container_of(__reply_base, struct pause_reply_data, base)
+
+const struct nla_policy ethnl_pause_get_policy[] = {
+ [ETHTOOL_A_PAUSE_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy_stats),
+ [ETHTOOL_A_PAUSE_STATS_SRC] =
+ NLA_POLICY_MAX(NLA_U32, ETHTOOL_MAC_STATS_SRC_PMAC),
+};
+
+static int pause_parse_request(struct ethnl_req_info *req_base,
+ struct nlattr **tb,
+ struct netlink_ext_ack *extack)
+{
+ enum ethtool_mac_stats_src src = ETHTOOL_MAC_STATS_SRC_AGGREGATE;
+ struct pause_req_info *req_info = PAUSE_REQINFO(req_base);
+
+ if (tb[ETHTOOL_A_PAUSE_STATS_SRC]) {
+ if (!(req_base->flags & ETHTOOL_FLAG_STATS)) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "ETHTOOL_FLAG_STATS must be set when requesting a source of stats");
+ return -EINVAL;
+ }
+
+ src = nla_get_u32(tb[ETHTOOL_A_PAUSE_STATS_SRC]);
+ }
+
+ req_info->src = src;
+
+ return 0;
+}
+
+static int pause_prepare_data(const struct ethnl_req_info *req_base,
+ struct ethnl_reply_data *reply_base,
+ const struct genl_info *info)
+{
+ const struct pause_req_info *req_info = PAUSE_REQINFO(req_base);
+ struct pause_reply_data *data = PAUSE_REPDATA(reply_base);
+ enum ethtool_mac_stats_src src = req_info->src;
+ struct net_device *dev = reply_base->dev;
+ int ret;
+
+ if (!dev->ethtool_ops->get_pauseparam)
+ return -EOPNOTSUPP;
+
+ ethtool_stats_init((u64 *)&data->pausestat,
+ sizeof(data->pausestat) / 8);
+ data->pausestat.src = src;
+
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ return ret;
+
+ if ((src == ETHTOOL_MAC_STATS_SRC_EMAC ||
+ src == ETHTOOL_MAC_STATS_SRC_PMAC) &&
+ !__ethtool_dev_mm_supported(dev)) {
+ NL_SET_ERR_MSG_MOD(info->extack,
+ "Device does not support MAC merge layer");
+ ethnl_ops_complete(dev);
+ return -EOPNOTSUPP;
+ }
+
+ dev->ethtool_ops->get_pauseparam(dev, &data->pauseparam);
+ if (req_base->flags & ETHTOOL_FLAG_STATS &&
+ dev->ethtool_ops->get_pause_stats)
+ dev->ethtool_ops->get_pause_stats(dev, &data->pausestat);
+
+ ethnl_ops_complete(dev);
+
+ return 0;
+}
+
+static int pause_reply_size(const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ int n = nla_total_size(sizeof(u8)) + /* _PAUSE_AUTONEG */
+ nla_total_size(sizeof(u8)) + /* _PAUSE_RX */
+ nla_total_size(sizeof(u8)); /* _PAUSE_TX */
+
+ if (req_base->flags & ETHTOOL_FLAG_STATS)
+ n += nla_total_size(0) + /* _PAUSE_STATS */
+ nla_total_size(sizeof(u32)) + /* _PAUSE_STATS_SRC */
+ nla_total_size_64bit(sizeof(u64)) * ETHTOOL_PAUSE_STAT_CNT;
+ return n;
+}
+
+static int ethtool_put_stat(struct sk_buff *skb, u64 val, u16 attrtype,
+ u16 padtype)
+{
+ if (val == ETHTOOL_STAT_NOT_SET)
+ return 0;
+ if (nla_put_u64_64bit(skb, attrtype, val, padtype))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static int pause_put_stats(struct sk_buff *skb,
+ const struct ethtool_pause_stats *pause_stats)
+{
+ const u16 pad = ETHTOOL_A_PAUSE_STAT_PAD;
+ struct nlattr *nest;
+
+ if (nla_put_u32(skb, ETHTOOL_A_PAUSE_STATS_SRC, pause_stats->src))
+ return -EMSGSIZE;
+
+ nest = nla_nest_start(skb, ETHTOOL_A_PAUSE_STATS);
+ if (!nest)
+ return -EMSGSIZE;
+
+ if (ethtool_put_stat(skb, pause_stats->tx_pause_frames,
+ ETHTOOL_A_PAUSE_STAT_TX_FRAMES, pad) ||
+ ethtool_put_stat(skb, pause_stats->rx_pause_frames,
+ ETHTOOL_A_PAUSE_STAT_RX_FRAMES, pad))
+ goto err_cancel;
+
+ nla_nest_end(skb, nest);
+ return 0;
+
+err_cancel:
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+}
+
+static int pause_fill_reply(struct sk_buff *skb,
+ const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ const struct pause_reply_data *data = PAUSE_REPDATA(reply_base);
+ const struct ethtool_pauseparam *pauseparam = &data->pauseparam;
+
+ if (nla_put_u8(skb, ETHTOOL_A_PAUSE_AUTONEG, !!pauseparam->autoneg) ||
+ nla_put_u8(skb, ETHTOOL_A_PAUSE_RX, !!pauseparam->rx_pause) ||
+ nla_put_u8(skb, ETHTOOL_A_PAUSE_TX, !!pauseparam->tx_pause))
+ return -EMSGSIZE;
+
+ if (req_base->flags & ETHTOOL_FLAG_STATS &&
+ pause_put_stats(skb, &data->pausestat))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+/* PAUSE_SET */
+
+const struct nla_policy ethnl_pause_set_policy[] = {
+ [ETHTOOL_A_PAUSE_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
+ [ETHTOOL_A_PAUSE_AUTONEG] = { .type = NLA_U8 },
+ [ETHTOOL_A_PAUSE_RX] = { .type = NLA_U8 },
+ [ETHTOOL_A_PAUSE_TX] = { .type = NLA_U8 },
+ [ETHTOOL_A_PAUSE_STATS_SRC] = { .type = NLA_REJECT },
+};
+
+static int
+ethnl_set_pause_validate(struct ethnl_req_info *req_info,
+ struct genl_info *info)
+{
+ const struct ethtool_ops *ops = req_info->dev->ethtool_ops;
+
+ return ops->get_pauseparam && ops->set_pauseparam ? 1 : -EOPNOTSUPP;
+}
+
+static int
+ethnl_set_pause(struct ethnl_req_info *req_info, struct genl_info *info)
+{
+ struct net_device *dev = req_info->dev;
+ struct ethtool_pauseparam params = {};
+ struct nlattr **tb = info->attrs;
+ bool mod = false;
+ int ret;
+
+ dev->ethtool_ops->get_pauseparam(dev, &params);
+
+ ethnl_update_bool32(&params.autoneg, tb[ETHTOOL_A_PAUSE_AUTONEG], &mod);
+ ethnl_update_bool32(&params.rx_pause, tb[ETHTOOL_A_PAUSE_RX], &mod);
+ ethnl_update_bool32(&params.tx_pause, tb[ETHTOOL_A_PAUSE_TX], &mod);
+ if (!mod)
+ return 0;
+
+ ret = dev->ethtool_ops->set_pauseparam(dev, &params);
+ return ret < 0 ? ret : 1;
+}
+
+const struct ethnl_request_ops ethnl_pause_request_ops = {
+ .request_cmd = ETHTOOL_MSG_PAUSE_GET,
+ .reply_cmd = ETHTOOL_MSG_PAUSE_GET_REPLY,
+ .hdr_attr = ETHTOOL_A_PAUSE_HEADER,
+ .req_info_size = sizeof(struct pause_req_info),
+ .reply_data_size = sizeof(struct pause_reply_data),
+
+ .parse_request = pause_parse_request,
+ .prepare_data = pause_prepare_data,
+ .reply_size = pause_reply_size,
+ .fill_reply = pause_fill_reply,
+
+ .set_validate = ethnl_set_pause_validate,
+ .set = ethnl_set_pause,
+ .set_ntf_cmd = ETHTOOL_MSG_PAUSE_NTF,
+};
diff --git a/net/ethtool/phc_vclocks.c b/net/ethtool/phc_vclocks.c
new file mode 100644
index 000000000000..cadaabed60bd
--- /dev/null
+++ b/net/ethtool/phc_vclocks.c
@@ -0,0 +1,94 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2021 NXP
+ */
+#include "netlink.h"
+#include "common.h"
+
+struct phc_vclocks_req_info {
+ struct ethnl_req_info base;
+};
+
+struct phc_vclocks_reply_data {
+ struct ethnl_reply_data base;
+ int num;
+ int *index;
+};
+
+#define PHC_VCLOCKS_REPDATA(__reply_base) \
+ container_of(__reply_base, struct phc_vclocks_reply_data, base)
+
+const struct nla_policy ethnl_phc_vclocks_get_policy[] = {
+ [ETHTOOL_A_PHC_VCLOCKS_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy),
+};
+
+static int phc_vclocks_prepare_data(const struct ethnl_req_info *req_base,
+ struct ethnl_reply_data *reply_base,
+ const struct genl_info *info)
+{
+ struct phc_vclocks_reply_data *data = PHC_VCLOCKS_REPDATA(reply_base);
+ struct net_device *dev = reply_base->dev;
+ int ret;
+
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ return ret;
+ data->num = ethtool_get_phc_vclocks(dev, &data->index);
+ ethnl_ops_complete(dev);
+
+ return ret;
+}
+
+static int phc_vclocks_reply_size(const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ const struct phc_vclocks_reply_data *data =
+ PHC_VCLOCKS_REPDATA(reply_base);
+ int len = 0;
+
+ if (data->num > 0) {
+ len += nla_total_size(sizeof(u32));
+ len += nla_total_size(sizeof(s32) * data->num);
+ }
+
+ return len;
+}
+
+static int phc_vclocks_fill_reply(struct sk_buff *skb,
+ const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ const struct phc_vclocks_reply_data *data =
+ PHC_VCLOCKS_REPDATA(reply_base);
+
+ if (data->num <= 0)
+ return 0;
+
+ if (nla_put_u32(skb, ETHTOOL_A_PHC_VCLOCKS_NUM, data->num) ||
+ nla_put(skb, ETHTOOL_A_PHC_VCLOCKS_INDEX,
+ sizeof(s32) * data->num, data->index))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static void phc_vclocks_cleanup_data(struct ethnl_reply_data *reply_base)
+{
+ const struct phc_vclocks_reply_data *data =
+ PHC_VCLOCKS_REPDATA(reply_base);
+
+ kfree(data->index);
+}
+
+const struct ethnl_request_ops ethnl_phc_vclocks_request_ops = {
+ .request_cmd = ETHTOOL_MSG_PHC_VCLOCKS_GET,
+ .reply_cmd = ETHTOOL_MSG_PHC_VCLOCKS_GET_REPLY,
+ .hdr_attr = ETHTOOL_A_PHC_VCLOCKS_HEADER,
+ .req_info_size = sizeof(struct phc_vclocks_req_info),
+ .reply_data_size = sizeof(struct phc_vclocks_reply_data),
+
+ .prepare_data = phc_vclocks_prepare_data,
+ .reply_size = phc_vclocks_reply_size,
+ .fill_reply = phc_vclocks_fill_reply,
+ .cleanup_data = phc_vclocks_cleanup_data,
+};
diff --git a/net/ethtool/phy.c b/net/ethtool/phy.c
new file mode 100644
index 000000000000..68372bef4b2f
--- /dev/null
+++ b/net/ethtool/phy.c
@@ -0,0 +1,165 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Bootlin
+ *
+ */
+#include "common.h"
+#include "netlink.h"
+
+#include <linux/phy.h>
+#include <linux/phy_link_topology.h>
+#include <linux/sfp.h>
+#include <net/netdev_lock.h>
+
+struct phy_req_info {
+ struct ethnl_req_info base;
+};
+
+struct phy_reply_data {
+ struct ethnl_reply_data base;
+ u32 phyindex;
+ char *drvname;
+ char *name;
+ unsigned int upstream_type;
+ char *upstream_sfp_name;
+ unsigned int upstream_index;
+ char *downstream_sfp_name;
+};
+
+#define PHY_REPDATA(__reply_base) \
+ container_of(__reply_base, struct phy_reply_data, base)
+
+const struct nla_policy ethnl_phy_get_policy[ETHTOOL_A_PHY_HEADER + 1] = {
+ [ETHTOOL_A_PHY_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy),
+};
+
+static int phy_reply_size(const struct ethnl_req_info *req_info,
+ const struct ethnl_reply_data *reply_data)
+{
+ struct phy_reply_data *rep_data = PHY_REPDATA(reply_data);
+ size_t size = 0;
+
+ /* ETHTOOL_A_PHY_INDEX */
+ size += nla_total_size(sizeof(u32));
+
+ /* ETHTOOL_A_DRVNAME */
+ if (rep_data->drvname)
+ size += nla_total_size(strlen(rep_data->drvname) + 1);
+
+ /* ETHTOOL_A_NAME */
+ size += nla_total_size(strlen(rep_data->name) + 1);
+
+ /* ETHTOOL_A_PHY_UPSTREAM_TYPE */
+ size += nla_total_size(sizeof(u32));
+
+ /* ETHTOOL_A_PHY_UPSTREAM_SFP_NAME */
+ if (rep_data->upstream_sfp_name)
+ size += nla_total_size(strlen(rep_data->upstream_sfp_name) + 1);
+
+ /* ETHTOOL_A_PHY_UPSTREAM_INDEX */
+ if (rep_data->upstream_index)
+ size += nla_total_size(sizeof(u32));
+
+ /* ETHTOOL_A_PHY_DOWNSTREAM_SFP_NAME */
+ if (rep_data->downstream_sfp_name)
+ size += nla_total_size(strlen(rep_data->downstream_sfp_name) + 1);
+
+ return size;
+}
+
+static int phy_prepare_data(const struct ethnl_req_info *req_info,
+ struct ethnl_reply_data *reply_data,
+ const struct genl_info *info)
+{
+ struct phy_link_topology *topo = reply_data->dev->link_topo;
+ struct phy_reply_data *rep_data = PHY_REPDATA(reply_data);
+ struct nlattr **tb = info->attrs;
+ struct phy_device_node *pdn;
+ struct phy_device *phydev;
+
+ /* RTNL is held by the caller */
+ phydev = ethnl_req_get_phydev(req_info, tb, ETHTOOL_A_PHY_HEADER,
+ info->extack);
+ if (IS_ERR_OR_NULL(phydev))
+ return -EOPNOTSUPP;
+
+ pdn = xa_load(&topo->phys, phydev->phyindex);
+ if (!pdn)
+ return -EOPNOTSUPP;
+
+ rep_data->phyindex = phydev->phyindex;
+ rep_data->name = kstrdup(dev_name(&phydev->mdio.dev), GFP_KERNEL);
+ rep_data->drvname = kstrdup(phydev->drv->name, GFP_KERNEL);
+ rep_data->upstream_type = pdn->upstream_type;
+
+ if (pdn->upstream_type == PHY_UPSTREAM_PHY) {
+ struct phy_device *upstream = pdn->upstream.phydev;
+ rep_data->upstream_index = upstream->phyindex;
+ }
+
+ if (pdn->parent_sfp_bus)
+ rep_data->upstream_sfp_name = kstrdup(sfp_get_name(pdn->parent_sfp_bus),
+ GFP_KERNEL);
+
+ if (phydev->sfp_bus)
+ rep_data->downstream_sfp_name = kstrdup(sfp_get_name(phydev->sfp_bus),
+ GFP_KERNEL);
+
+ return 0;
+}
+
+static int phy_fill_reply(struct sk_buff *skb,
+ const struct ethnl_req_info *req_info,
+ const struct ethnl_reply_data *reply_data)
+{
+ struct phy_reply_data *rep_data = PHY_REPDATA(reply_data);
+
+ if (nla_put_u32(skb, ETHTOOL_A_PHY_INDEX, rep_data->phyindex) ||
+ nla_put_string(skb, ETHTOOL_A_PHY_NAME, rep_data->name) ||
+ nla_put_u32(skb, ETHTOOL_A_PHY_UPSTREAM_TYPE, rep_data->upstream_type))
+ return -EMSGSIZE;
+
+ if (rep_data->drvname &&
+ nla_put_string(skb, ETHTOOL_A_PHY_DRVNAME, rep_data->drvname))
+ return -EMSGSIZE;
+
+ if (rep_data->upstream_index &&
+ nla_put_u32(skb, ETHTOOL_A_PHY_UPSTREAM_INDEX,
+ rep_data->upstream_index))
+ return -EMSGSIZE;
+
+ if (rep_data->upstream_sfp_name &&
+ nla_put_string(skb, ETHTOOL_A_PHY_UPSTREAM_SFP_NAME,
+ rep_data->upstream_sfp_name))
+ return -EMSGSIZE;
+
+ if (rep_data->downstream_sfp_name &&
+ nla_put_string(skb, ETHTOOL_A_PHY_DOWNSTREAM_SFP_NAME,
+ rep_data->downstream_sfp_name))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static void phy_cleanup_data(struct ethnl_reply_data *reply_data)
+{
+ struct phy_reply_data *rep_data = PHY_REPDATA(reply_data);
+
+ kfree(rep_data->drvname);
+ kfree(rep_data->name);
+ kfree(rep_data->upstream_sfp_name);
+ kfree(rep_data->downstream_sfp_name);
+}
+
+const struct ethnl_request_ops ethnl_phy_request_ops = {
+ .request_cmd = ETHTOOL_MSG_PHY_GET,
+ .reply_cmd = ETHTOOL_MSG_PHY_GET_REPLY,
+ .hdr_attr = ETHTOOL_A_PHY_HEADER,
+ .req_info_size = sizeof(struct phy_req_info),
+ .reply_data_size = sizeof(struct phy_reply_data),
+
+ .prepare_data = phy_prepare_data,
+ .reply_size = phy_reply_size,
+ .fill_reply = phy_fill_reply,
+ .cleanup_data = phy_cleanup_data,
+};
diff --git a/net/ethtool/plca.c b/net/ethtool/plca.c
new file mode 100644
index 000000000000..e1f7820a6158
--- /dev/null
+++ b/net/ethtool/plca.c
@@ -0,0 +1,271 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/phy.h>
+#include <linux/ethtool_netlink.h>
+
+#include "netlink.h"
+#include "common.h"
+
+struct plca_req_info {
+ struct ethnl_req_info base;
+};
+
+struct plca_reply_data {
+ struct ethnl_reply_data base;
+ struct phy_plca_cfg plca_cfg;
+ struct phy_plca_status plca_st;
+};
+
+// Helpers ------------------------------------------------------------------ //
+
+#define PLCA_REPDATA(__reply_base) \
+ container_of(__reply_base, struct plca_reply_data, base)
+
+// PLCA get configuration message ------------------------------------------- //
+
+const struct nla_policy ethnl_plca_get_cfg_policy[] = {
+ [ETHTOOL_A_PLCA_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy_phy),
+};
+
+static void plca_update_sint(int *dst, struct nlattr **tb, u32 attrid,
+ bool *mod)
+{
+ const struct nlattr *attr = tb[attrid];
+
+ if (!attr ||
+ WARN_ON_ONCE(attrid >= ARRAY_SIZE(ethnl_plca_set_cfg_policy)))
+ return;
+
+ switch (ethnl_plca_set_cfg_policy[attrid].type) {
+ case NLA_U8:
+ *dst = nla_get_u8(attr);
+ break;
+ case NLA_U32:
+ *dst = nla_get_u32(attr);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ }
+
+ *mod = true;
+}
+
+static int plca_get_cfg_prepare_data(const struct ethnl_req_info *req_base,
+ struct ethnl_reply_data *reply_base,
+ const struct genl_info *info)
+{
+ struct plca_reply_data *data = PLCA_REPDATA(reply_base);
+ struct net_device *dev = reply_base->dev;
+ const struct ethtool_phy_ops *ops;
+ struct nlattr **tb = info->attrs;
+ struct phy_device *phydev;
+ int ret;
+
+ phydev = ethnl_req_get_phydev(req_base, tb, ETHTOOL_A_PLCA_HEADER,
+ info->extack);
+ // check that the PHY device is available and connected
+ if (IS_ERR_OR_NULL(phydev)) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ // note: rtnl_lock is held already by ethnl_default_doit
+ ops = ethtool_phy_ops;
+ if (!ops || !ops->get_plca_cfg) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ goto out;
+
+ memset(&data->plca_cfg, 0xff,
+ sizeof_field(struct plca_reply_data, plca_cfg));
+
+ ret = ops->get_plca_cfg(phydev, &data->plca_cfg);
+ ethnl_ops_complete(dev);
+
+out:
+ return ret;
+}
+
+static int plca_get_cfg_reply_size(const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ return nla_total_size(sizeof(u16)) + /* _VERSION */
+ nla_total_size(sizeof(u8)) + /* _ENABLED */
+ nla_total_size(sizeof(u32)) + /* _NODE_CNT */
+ nla_total_size(sizeof(u32)) + /* _NODE_ID */
+ nla_total_size(sizeof(u32)) + /* _TO_TIMER */
+ nla_total_size(sizeof(u32)) + /* _BURST_COUNT */
+ nla_total_size(sizeof(u32)); /* _BURST_TIMER */
+}
+
+static int plca_get_cfg_fill_reply(struct sk_buff *skb,
+ const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ const struct plca_reply_data *data = PLCA_REPDATA(reply_base);
+ const struct phy_plca_cfg *plca = &data->plca_cfg;
+
+ if ((plca->version >= 0 &&
+ nla_put_u16(skb, ETHTOOL_A_PLCA_VERSION, plca->version)) ||
+ (plca->enabled >= 0 &&
+ nla_put_u8(skb, ETHTOOL_A_PLCA_ENABLED, !!plca->enabled)) ||
+ (plca->node_id >= 0 &&
+ nla_put_u32(skb, ETHTOOL_A_PLCA_NODE_ID, plca->node_id)) ||
+ (plca->node_cnt >= 0 &&
+ nla_put_u32(skb, ETHTOOL_A_PLCA_NODE_CNT, plca->node_cnt)) ||
+ (plca->to_tmr >= 0 &&
+ nla_put_u32(skb, ETHTOOL_A_PLCA_TO_TMR, plca->to_tmr)) ||
+ (plca->burst_cnt >= 0 &&
+ nla_put_u32(skb, ETHTOOL_A_PLCA_BURST_CNT, plca->burst_cnt)) ||
+ (plca->burst_tmr >= 0 &&
+ nla_put_u32(skb, ETHTOOL_A_PLCA_BURST_TMR, plca->burst_tmr)))
+ return -EMSGSIZE;
+
+ return 0;
+};
+
+// PLCA set configuration message ------------------------------------------- //
+
+const struct nla_policy ethnl_plca_set_cfg_policy[] = {
+ [ETHTOOL_A_PLCA_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy_phy),
+ [ETHTOOL_A_PLCA_ENABLED] = NLA_POLICY_MAX(NLA_U8, 1),
+ [ETHTOOL_A_PLCA_NODE_ID] = NLA_POLICY_MAX(NLA_U32, 255),
+ [ETHTOOL_A_PLCA_NODE_CNT] = NLA_POLICY_RANGE(NLA_U32, 1, 255),
+ [ETHTOOL_A_PLCA_TO_TMR] = NLA_POLICY_MAX(NLA_U32, 255),
+ [ETHTOOL_A_PLCA_BURST_CNT] = NLA_POLICY_MAX(NLA_U32, 255),
+ [ETHTOOL_A_PLCA_BURST_TMR] = NLA_POLICY_MAX(NLA_U32, 255),
+};
+
+static int
+ethnl_set_plca(struct ethnl_req_info *req_info, struct genl_info *info)
+{
+ const struct ethtool_phy_ops *ops;
+ struct nlattr **tb = info->attrs;
+ struct phy_plca_cfg plca_cfg;
+ struct phy_device *phydev;
+ bool mod = false;
+ int ret;
+
+ phydev = ethnl_req_get_phydev(req_info, tb, ETHTOOL_A_PLCA_HEADER,
+ info->extack);
+ // check that the PHY device is available and connected
+ if (IS_ERR_OR_NULL(phydev))
+ return -EOPNOTSUPP;
+
+ ops = ethtool_phy_ops;
+ if (!ops || !ops->set_plca_cfg)
+ return -EOPNOTSUPP;
+
+ memset(&plca_cfg, 0xff, sizeof(plca_cfg));
+ plca_update_sint(&plca_cfg.enabled, tb, ETHTOOL_A_PLCA_ENABLED, &mod);
+ plca_update_sint(&plca_cfg.node_id, tb, ETHTOOL_A_PLCA_NODE_ID, &mod);
+ plca_update_sint(&plca_cfg.node_cnt, tb, ETHTOOL_A_PLCA_NODE_CNT, &mod);
+ plca_update_sint(&plca_cfg.to_tmr, tb, ETHTOOL_A_PLCA_TO_TMR, &mod);
+ plca_update_sint(&plca_cfg.burst_cnt, tb, ETHTOOL_A_PLCA_BURST_CNT,
+ &mod);
+ plca_update_sint(&plca_cfg.burst_tmr, tb, ETHTOOL_A_PLCA_BURST_TMR,
+ &mod);
+ if (!mod)
+ return 0;
+
+ ret = ops->set_plca_cfg(phydev, &plca_cfg, info->extack);
+ return ret < 0 ? ret : 1;
+}
+
+const struct ethnl_request_ops ethnl_plca_cfg_request_ops = {
+ .request_cmd = ETHTOOL_MSG_PLCA_GET_CFG,
+ .reply_cmd = ETHTOOL_MSG_PLCA_GET_CFG_REPLY,
+ .hdr_attr = ETHTOOL_A_PLCA_HEADER,
+ .req_info_size = sizeof(struct plca_req_info),
+ .reply_data_size = sizeof(struct plca_reply_data),
+
+ .prepare_data = plca_get_cfg_prepare_data,
+ .reply_size = plca_get_cfg_reply_size,
+ .fill_reply = plca_get_cfg_fill_reply,
+
+ .set = ethnl_set_plca,
+ .set_ntf_cmd = ETHTOOL_MSG_PLCA_NTF,
+};
+
+// PLCA get status message -------------------------------------------------- //
+
+const struct nla_policy ethnl_plca_get_status_policy[] = {
+ [ETHTOOL_A_PLCA_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy_phy),
+};
+
+static int plca_get_status_prepare_data(const struct ethnl_req_info *req_base,
+ struct ethnl_reply_data *reply_base,
+ const struct genl_info *info)
+{
+ struct plca_reply_data *data = PLCA_REPDATA(reply_base);
+ struct net_device *dev = reply_base->dev;
+ const struct ethtool_phy_ops *ops;
+ struct nlattr **tb = info->attrs;
+ struct phy_device *phydev;
+ int ret;
+
+ phydev = ethnl_req_get_phydev(req_base, tb, ETHTOOL_A_PLCA_HEADER,
+ info->extack);
+ // check that the PHY device is available and connected
+ if (IS_ERR_OR_NULL(phydev)) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ // note: rtnl_lock is held already by ethnl_default_doit
+ ops = ethtool_phy_ops;
+ if (!ops || !ops->get_plca_status) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ goto out;
+
+ memset(&data->plca_st, 0xff,
+ sizeof_field(struct plca_reply_data, plca_st));
+
+ ret = ops->get_plca_status(phydev, &data->plca_st);
+ ethnl_ops_complete(dev);
+out:
+ return ret;
+}
+
+static int plca_get_status_reply_size(const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ return nla_total_size(sizeof(u8)); /* _STATUS */
+}
+
+static int plca_get_status_fill_reply(struct sk_buff *skb,
+ const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ const struct plca_reply_data *data = PLCA_REPDATA(reply_base);
+ const u8 status = data->plca_st.pst;
+
+ if (nla_put_u8(skb, ETHTOOL_A_PLCA_STATUS, !!status))
+ return -EMSGSIZE;
+
+ return 0;
+};
+
+const struct ethnl_request_ops ethnl_plca_status_request_ops = {
+ .request_cmd = ETHTOOL_MSG_PLCA_GET_STATUS,
+ .reply_cmd = ETHTOOL_MSG_PLCA_GET_STATUS_REPLY,
+ .hdr_attr = ETHTOOL_A_PLCA_HEADER,
+ .req_info_size = sizeof(struct plca_req_info),
+ .reply_data_size = sizeof(struct plca_reply_data),
+
+ .prepare_data = plca_get_status_prepare_data,
+ .reply_size = plca_get_status_reply_size,
+ .fill_reply = plca_get_status_fill_reply,
+};
diff --git a/net/ethtool/privflags.c b/net/ethtool/privflags.c
new file mode 100644
index 000000000000..297be6a13ab9
--- /dev/null
+++ b/net/ethtool/privflags.c
@@ -0,0 +1,195 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include "netlink.h"
+#include "common.h"
+#include "bitset.h"
+
+struct privflags_req_info {
+ struct ethnl_req_info base;
+};
+
+struct privflags_reply_data {
+ struct ethnl_reply_data base;
+ const char (*priv_flag_names)[ETH_GSTRING_LEN];
+ unsigned int n_priv_flags;
+ u32 priv_flags;
+};
+
+#define PRIVFLAGS_REPDATA(__reply_base) \
+ container_of(__reply_base, struct privflags_reply_data, base)
+
+const struct nla_policy ethnl_privflags_get_policy[] = {
+ [ETHTOOL_A_PRIVFLAGS_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
+};
+
+static int ethnl_get_priv_flags_info(struct net_device *dev,
+ unsigned int *count,
+ const char (**names)[ETH_GSTRING_LEN])
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ int nflags;
+
+ nflags = ops->get_sset_count(dev, ETH_SS_PRIV_FLAGS);
+ if (nflags < 0)
+ return nflags;
+
+ if (names) {
+ *names = kcalloc(nflags, ETH_GSTRING_LEN, GFP_KERNEL);
+ if (!*names)
+ return -ENOMEM;
+ ops->get_strings(dev, ETH_SS_PRIV_FLAGS, (u8 *)*names);
+ }
+
+ /* We can pass more than 32 private flags to userspace via netlink but
+ * we cannot get more with ethtool_ops::get_priv_flags(). Note that we
+ * must not adjust nflags before allocating the space for flag names
+ * as the buffer must be large enough for all flags.
+ */
+ if (WARN_ONCE(nflags > 32,
+ "device %s reports more than 32 private flags (%d)\n",
+ netdev_name(dev), nflags))
+ nflags = 32;
+ *count = nflags;
+
+ return 0;
+}
+
+static int privflags_prepare_data(const struct ethnl_req_info *req_base,
+ struct ethnl_reply_data *reply_base,
+ const struct genl_info *info)
+{
+ struct privflags_reply_data *data = PRIVFLAGS_REPDATA(reply_base);
+ struct net_device *dev = reply_base->dev;
+ const char (*names)[ETH_GSTRING_LEN];
+ const struct ethtool_ops *ops;
+ unsigned int nflags;
+ int ret;
+
+ ops = dev->ethtool_ops;
+ if (!ops->get_priv_flags || !ops->get_sset_count || !ops->get_strings)
+ return -EOPNOTSUPP;
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ return ret;
+
+ ret = ethnl_get_priv_flags_info(dev, &nflags, &names);
+ if (ret < 0)
+ goto out_ops;
+ data->priv_flags = ops->get_priv_flags(dev);
+ data->priv_flag_names = names;
+ data->n_priv_flags = nflags;
+
+out_ops:
+ ethnl_ops_complete(dev);
+ return ret;
+}
+
+static int privflags_reply_size(const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ const struct privflags_reply_data *data = PRIVFLAGS_REPDATA(reply_base);
+ bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;
+ const u32 all_flags = ~(u32)0 >> (32 - data->n_priv_flags);
+
+ return ethnl_bitset32_size(&data->priv_flags, &all_flags,
+ data->n_priv_flags,
+ data->priv_flag_names, compact);
+}
+
+static int privflags_fill_reply(struct sk_buff *skb,
+ const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ const struct privflags_reply_data *data = PRIVFLAGS_REPDATA(reply_base);
+ bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;
+ const u32 all_flags = ~(u32)0 >> (32 - data->n_priv_flags);
+
+ return ethnl_put_bitset32(skb, ETHTOOL_A_PRIVFLAGS_FLAGS,
+ &data->priv_flags, &all_flags,
+ data->n_priv_flags, data->priv_flag_names,
+ compact);
+}
+
+static void privflags_cleanup_data(struct ethnl_reply_data *reply_data)
+{
+ struct privflags_reply_data *data = PRIVFLAGS_REPDATA(reply_data);
+
+ kfree(data->priv_flag_names);
+}
+
+/* PRIVFLAGS_SET */
+
+const struct nla_policy ethnl_privflags_set_policy[] = {
+ [ETHTOOL_A_PRIVFLAGS_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
+ [ETHTOOL_A_PRIVFLAGS_FLAGS] = { .type = NLA_NESTED },
+};
+
+static int
+ethnl_set_privflags_validate(struct ethnl_req_info *req_info,
+ struct genl_info *info)
+{
+ const struct ethtool_ops *ops = req_info->dev->ethtool_ops;
+
+ if (!info->attrs[ETHTOOL_A_PRIVFLAGS_FLAGS])
+ return -EINVAL;
+
+ if (!ops->get_priv_flags || !ops->set_priv_flags ||
+ !ops->get_sset_count || !ops->get_strings)
+ return -EOPNOTSUPP;
+ return 1;
+}
+
+static int
+ethnl_set_privflags(struct ethnl_req_info *req_info, struct genl_info *info)
+{
+ const char (*names)[ETH_GSTRING_LEN] = NULL;
+ struct net_device *dev = req_info->dev;
+ struct nlattr **tb = info->attrs;
+ unsigned int nflags;
+ bool mod = false;
+ bool compact;
+ u32 flags;
+ int ret;
+
+ ret = ethnl_bitset_is_compact(tb[ETHTOOL_A_PRIVFLAGS_FLAGS], &compact);
+ if (ret < 0)
+ return ret;
+
+ ret = ethnl_get_priv_flags_info(dev, &nflags, compact ? NULL : &names);
+ if (ret < 0)
+ return ret;
+ flags = dev->ethtool_ops->get_priv_flags(dev);
+
+ ret = ethnl_update_bitset32(&flags, nflags,
+ tb[ETHTOOL_A_PRIVFLAGS_FLAGS], names,
+ info->extack, &mod);
+ if (ret < 0 || !mod)
+ goto out_free;
+ ret = dev->ethtool_ops->set_priv_flags(dev, flags);
+ if (ret < 0)
+ goto out_free;
+ ret = 1;
+
+out_free:
+ kfree(names);
+ return ret;
+}
+
+const struct ethnl_request_ops ethnl_privflags_request_ops = {
+ .request_cmd = ETHTOOL_MSG_PRIVFLAGS_GET,
+ .reply_cmd = ETHTOOL_MSG_PRIVFLAGS_GET_REPLY,
+ .hdr_attr = ETHTOOL_A_PRIVFLAGS_HEADER,
+ .req_info_size = sizeof(struct privflags_req_info),
+ .reply_data_size = sizeof(struct privflags_reply_data),
+
+ .prepare_data = privflags_prepare_data,
+ .reply_size = privflags_reply_size,
+ .fill_reply = privflags_fill_reply,
+ .cleanup_data = privflags_cleanup_data,
+
+ .set_validate = ethnl_set_privflags_validate,
+ .set = ethnl_set_privflags,
+ .set_ntf_cmd = ETHTOOL_MSG_PRIVFLAGS_NTF,
+};
diff --git a/net/ethtool/pse-pd.c b/net/ethtool/pse-pd.c
new file mode 100644
index 000000000000..24def9c9dd54
--- /dev/null
+++ b/net/ethtool/pse-pd.c
@@ -0,0 +1,382 @@
+// SPDX-License-Identifier: GPL-2.0-only
+//
+// ethtool interface for Ethernet PSE (Power Sourcing Equipment)
+// and PD (Powered Device)
+//
+// Copyright (c) 2022 Pengutronix, Oleksij Rempel <kernel@pengutronix.de>
+//
+
+#include "common.h"
+#include "linux/pse-pd/pse.h"
+#include "netlink.h"
+#include <linux/ethtool_netlink.h>
+#include <linux/ethtool.h>
+#include <linux/export.h>
+#include <linux/phy.h>
+
+struct pse_req_info {
+ struct ethnl_req_info base;
+};
+
+struct pse_reply_data {
+ struct ethnl_reply_data base;
+ struct ethtool_pse_control_status status;
+};
+
+#define PSE_REPDATA(__reply_base) \
+ container_of(__reply_base, struct pse_reply_data, base)
+
+/* PSE_GET */
+
+const struct nla_policy ethnl_pse_get_policy[ETHTOOL_A_PSE_HEADER + 1] = {
+ [ETHTOOL_A_PSE_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy_phy),
+};
+
+static int pse_get_pse_attributes(struct phy_device *phydev,
+ struct netlink_ext_ack *extack,
+ struct pse_reply_data *data)
+{
+ if (!phydev) {
+ NL_SET_ERR_MSG(extack, "No PHY found");
+ return -EOPNOTSUPP;
+ }
+
+ if (!phydev->psec) {
+ NL_SET_ERR_MSG(extack, "No PSE is attached");
+ return -EOPNOTSUPP;
+ }
+
+ memset(&data->status, 0, sizeof(data->status));
+
+ return pse_ethtool_get_status(phydev->psec, extack, &data->status);
+}
+
+static int pse_prepare_data(const struct ethnl_req_info *req_base,
+ struct ethnl_reply_data *reply_base,
+ const struct genl_info *info)
+{
+ struct pse_reply_data *data = PSE_REPDATA(reply_base);
+ struct net_device *dev = reply_base->dev;
+ struct nlattr **tb = info->attrs;
+ struct phy_device *phydev;
+ int ret;
+
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ return ret;
+
+ phydev = ethnl_req_get_phydev(req_base, tb, ETHTOOL_A_PSE_HEADER,
+ info->extack);
+ if (IS_ERR(phydev))
+ return -ENODEV;
+
+ ret = pse_get_pse_attributes(phydev, info->extack, data);
+
+ ethnl_ops_complete(dev);
+
+ return ret;
+}
+
+static int pse_reply_size(const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ const struct pse_reply_data *data = PSE_REPDATA(reply_base);
+ const struct ethtool_pse_control_status *st = &data->status;
+ int len = 0;
+
+ if (st->pw_d_id)
+ len += nla_total_size(sizeof(u32)); /* _PSE_PW_D_ID */
+ if (st->podl_admin_state > 0)
+ len += nla_total_size(sizeof(u32)); /* _PODL_PSE_ADMIN_STATE */
+ if (st->podl_pw_status > 0)
+ len += nla_total_size(sizeof(u32)); /* _PODL_PSE_PW_D_STATUS */
+ if (st->c33_admin_state > 0)
+ len += nla_total_size(sizeof(u32)); /* _C33_PSE_ADMIN_STATE */
+ if (st->c33_pw_status > 0)
+ len += nla_total_size(sizeof(u32)); /* _C33_PSE_PW_D_STATUS */
+ if (st->c33_pw_class > 0)
+ len += nla_total_size(sizeof(u32)); /* _C33_PSE_PW_CLASS */
+ if (st->c33_actual_pw > 0)
+ len += nla_total_size(sizeof(u32)); /* _C33_PSE_ACTUAL_PW */
+ if (st->c33_ext_state_info.c33_pse_ext_state > 0) {
+ len += nla_total_size(sizeof(u32)); /* _C33_PSE_EXT_STATE */
+ if (st->c33_ext_state_info.__c33_pse_ext_substate > 0)
+ /* _C33_PSE_EXT_SUBSTATE */
+ len += nla_total_size(sizeof(u32));
+ }
+ if (st->c33_avail_pw_limit > 0)
+ /* _C33_AVAIL_PSE_PW_LIMIT */
+ len += nla_total_size(sizeof(u32));
+ if (st->c33_pw_limit_nb_ranges > 0)
+ /* _C33_PSE_PW_LIMIT_RANGES */
+ len += st->c33_pw_limit_nb_ranges *
+ (nla_total_size(0) +
+ nla_total_size(sizeof(u32)) * 2);
+ if (st->prio_max)
+ /* _PSE_PRIO_MAX + _PSE_PRIO */
+ len += nla_total_size(sizeof(u32)) * 2;
+
+ return len;
+}
+
+static int pse_put_pw_limit_ranges(struct sk_buff *skb,
+ const struct ethtool_pse_control_status *st)
+{
+ const struct ethtool_c33_pse_pw_limit_range *pw_limit_ranges;
+ int i;
+
+ pw_limit_ranges = st->c33_pw_limit_ranges;
+ for (i = 0; i < st->c33_pw_limit_nb_ranges; i++) {
+ struct nlattr *nest;
+
+ nest = nla_nest_start(skb, ETHTOOL_A_C33_PSE_PW_LIMIT_RANGES);
+ if (!nest)
+ return -EMSGSIZE;
+
+ if (nla_put_u32(skb, ETHTOOL_A_C33_PSE_PW_LIMIT_MIN,
+ pw_limit_ranges->min) ||
+ nla_put_u32(skb, ETHTOOL_A_C33_PSE_PW_LIMIT_MAX,
+ pw_limit_ranges->max)) {
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+ }
+ nla_nest_end(skb, nest);
+ pw_limit_ranges++;
+ }
+
+ return 0;
+}
+
+static int pse_fill_reply(struct sk_buff *skb,
+ const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ const struct pse_reply_data *data = PSE_REPDATA(reply_base);
+ const struct ethtool_pse_control_status *st = &data->status;
+
+ if (st->pw_d_id &&
+ nla_put_u32(skb, ETHTOOL_A_PSE_PW_D_ID,
+ st->pw_d_id))
+ return -EMSGSIZE;
+
+ if (st->podl_admin_state > 0 &&
+ nla_put_u32(skb, ETHTOOL_A_PODL_PSE_ADMIN_STATE,
+ st->podl_admin_state))
+ return -EMSGSIZE;
+
+ if (st->podl_pw_status > 0 &&
+ nla_put_u32(skb, ETHTOOL_A_PODL_PSE_PW_D_STATUS,
+ st->podl_pw_status))
+ return -EMSGSIZE;
+
+ if (st->c33_admin_state > 0 &&
+ nla_put_u32(skb, ETHTOOL_A_C33_PSE_ADMIN_STATE,
+ st->c33_admin_state))
+ return -EMSGSIZE;
+
+ if (st->c33_pw_status > 0 &&
+ nla_put_u32(skb, ETHTOOL_A_C33_PSE_PW_D_STATUS,
+ st->c33_pw_status))
+ return -EMSGSIZE;
+
+ if (st->c33_pw_class > 0 &&
+ nla_put_u32(skb, ETHTOOL_A_C33_PSE_PW_CLASS,
+ st->c33_pw_class))
+ return -EMSGSIZE;
+
+ if (st->c33_actual_pw > 0 &&
+ nla_put_u32(skb, ETHTOOL_A_C33_PSE_ACTUAL_PW,
+ st->c33_actual_pw))
+ return -EMSGSIZE;
+
+ if (st->c33_ext_state_info.c33_pse_ext_state > 0) {
+ if (nla_put_u32(skb, ETHTOOL_A_C33_PSE_EXT_STATE,
+ st->c33_ext_state_info.c33_pse_ext_state))
+ return -EMSGSIZE;
+
+ if (st->c33_ext_state_info.__c33_pse_ext_substate > 0 &&
+ nla_put_u32(skb, ETHTOOL_A_C33_PSE_EXT_SUBSTATE,
+ st->c33_ext_state_info.__c33_pse_ext_substate))
+ return -EMSGSIZE;
+ }
+
+ if (st->c33_avail_pw_limit > 0 &&
+ nla_put_u32(skb, ETHTOOL_A_C33_PSE_AVAIL_PW_LIMIT,
+ st->c33_avail_pw_limit))
+ return -EMSGSIZE;
+
+ if (st->c33_pw_limit_nb_ranges > 0 &&
+ pse_put_pw_limit_ranges(skb, st))
+ return -EMSGSIZE;
+
+ if (st->prio_max &&
+ (nla_put_u32(skb, ETHTOOL_A_PSE_PRIO_MAX, st->prio_max) ||
+ nla_put_u32(skb, ETHTOOL_A_PSE_PRIO, st->prio)))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static void pse_cleanup_data(struct ethnl_reply_data *reply_base)
+{
+ const struct pse_reply_data *data = PSE_REPDATA(reply_base);
+
+ kfree(data->status.c33_pw_limit_ranges);
+}
+
+/* PSE_SET */
+
+const struct nla_policy ethnl_pse_set_policy[ETHTOOL_A_PSE_MAX + 1] = {
+ [ETHTOOL_A_PSE_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy_phy),
+ [ETHTOOL_A_PODL_PSE_ADMIN_CONTROL] =
+ NLA_POLICY_RANGE(NLA_U32, ETHTOOL_PODL_PSE_ADMIN_STATE_DISABLED,
+ ETHTOOL_PODL_PSE_ADMIN_STATE_ENABLED),
+ [ETHTOOL_A_C33_PSE_ADMIN_CONTROL] =
+ NLA_POLICY_RANGE(NLA_U32, ETHTOOL_C33_PSE_ADMIN_STATE_DISABLED,
+ ETHTOOL_C33_PSE_ADMIN_STATE_ENABLED),
+ [ETHTOOL_A_C33_PSE_AVAIL_PW_LIMIT] = { .type = NLA_U32 },
+ [ETHTOOL_A_PSE_PRIO] = { .type = NLA_U32 },
+};
+
+static int
+ethnl_set_pse_validate(struct phy_device *phydev, struct genl_info *info)
+{
+ struct nlattr **tb = info->attrs;
+
+ if (IS_ERR_OR_NULL(phydev)) {
+ NL_SET_ERR_MSG(info->extack, "No PHY is attached");
+ return -EOPNOTSUPP;
+ }
+
+ if (!phydev->psec) {
+ NL_SET_ERR_MSG(info->extack, "No PSE is attached");
+ return -EOPNOTSUPP;
+ }
+
+ if (tb[ETHTOOL_A_PODL_PSE_ADMIN_CONTROL] &&
+ !pse_has_podl(phydev->psec)) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ tb[ETHTOOL_A_PODL_PSE_ADMIN_CONTROL],
+ "setting PoDL PSE admin control not supported");
+ return -EOPNOTSUPP;
+ }
+ if (tb[ETHTOOL_A_C33_PSE_ADMIN_CONTROL] &&
+ !pse_has_c33(phydev->psec)) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ tb[ETHTOOL_A_C33_PSE_ADMIN_CONTROL],
+ "setting C33 PSE admin control not supported");
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
+}
+
+static int
+ethnl_set_pse(struct ethnl_req_info *req_info, struct genl_info *info)
+{
+ struct nlattr **tb = info->attrs;
+ struct phy_device *phydev;
+ int ret;
+
+ phydev = ethnl_req_get_phydev(req_info, tb, ETHTOOL_A_PSE_HEADER,
+ info->extack);
+ ret = ethnl_set_pse_validate(phydev, info);
+ if (ret)
+ return ret;
+
+ if (tb[ETHTOOL_A_PSE_PRIO]) {
+ unsigned int prio;
+
+ prio = nla_get_u32(tb[ETHTOOL_A_PSE_PRIO]);
+ ret = pse_ethtool_set_prio(phydev->psec, info->extack, prio);
+ if (ret)
+ return ret;
+ }
+
+ if (tb[ETHTOOL_A_C33_PSE_AVAIL_PW_LIMIT]) {
+ unsigned int pw_limit;
+
+ pw_limit = nla_get_u32(tb[ETHTOOL_A_C33_PSE_AVAIL_PW_LIMIT]);
+ ret = pse_ethtool_set_pw_limit(phydev->psec, info->extack,
+ pw_limit);
+ if (ret)
+ return ret;
+ }
+
+ /* These values are already validated by the ethnl_pse_set_policy */
+ if (tb[ETHTOOL_A_PODL_PSE_ADMIN_CONTROL] ||
+ tb[ETHTOOL_A_C33_PSE_ADMIN_CONTROL]) {
+ struct pse_control_config config = {};
+
+ if (tb[ETHTOOL_A_PODL_PSE_ADMIN_CONTROL])
+ config.podl_admin_control = nla_get_u32(tb[ETHTOOL_A_PODL_PSE_ADMIN_CONTROL]);
+ if (tb[ETHTOOL_A_C33_PSE_ADMIN_CONTROL])
+ config.c33_admin_control = nla_get_u32(tb[ETHTOOL_A_C33_PSE_ADMIN_CONTROL]);
+
+ /* pse_ethtool_set_config() will do nothing if the config
+ * is zero
+ */
+ ret = pse_ethtool_set_config(phydev->psec, info->extack,
+ &config);
+ if (ret)
+ return ret;
+ }
+
+ /* Return errno or zero - PSE has no notification */
+ return ret;
+}
+
+const struct ethnl_request_ops ethnl_pse_request_ops = {
+ .request_cmd = ETHTOOL_MSG_PSE_GET,
+ .reply_cmd = ETHTOOL_MSG_PSE_GET_REPLY,
+ .hdr_attr = ETHTOOL_A_PSE_HEADER,
+ .req_info_size = sizeof(struct pse_req_info),
+ .reply_data_size = sizeof(struct pse_reply_data),
+
+ .prepare_data = pse_prepare_data,
+ .reply_size = pse_reply_size,
+ .fill_reply = pse_fill_reply,
+ .cleanup_data = pse_cleanup_data,
+
+ .set = ethnl_set_pse,
+ /* PSE has no notification */
+};
+
+void ethnl_pse_send_ntf(struct net_device *netdev, unsigned long notifs)
+{
+ void *reply_payload;
+ struct sk_buff *skb;
+ int reply_len;
+ int ret;
+
+ ASSERT_RTNL();
+
+ if (!netdev || !notifs)
+ return;
+
+ reply_len = ethnl_reply_header_size() +
+ nla_total_size(sizeof(u32)); /* _PSE_NTF_EVENTS */
+
+ skb = genlmsg_new(reply_len, GFP_KERNEL);
+ if (!skb)
+ return;
+
+ reply_payload = ethnl_bcastmsg_put(skb, ETHTOOL_MSG_PSE_NTF);
+ if (!reply_payload)
+ goto err_skb;
+
+ ret = ethnl_fill_reply_header(skb, netdev, ETHTOOL_A_PSE_NTF_HEADER);
+ if (ret < 0)
+ goto err_skb;
+
+ if (nla_put_uint(skb, ETHTOOL_A_PSE_NTF_EVENTS, notifs))
+ goto err_skb;
+
+ genlmsg_end(skb, reply_payload);
+ ethnl_multicast(skb, netdev);
+ return;
+
+err_skb:
+ nlmsg_free(skb);
+}
+EXPORT_SYMBOL_GPL(ethnl_pse_send_ntf);
diff --git a/net/ethtool/rings.c b/net/ethtool/rings.c
new file mode 100644
index 000000000000..aeedd5ec6b8c
--- /dev/null
+++ b/net/ethtool/rings.c
@@ -0,0 +1,322 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <net/netdev_queues.h>
+
+#include "netlink.h"
+#include "common.h"
+
+struct rings_req_info {
+ struct ethnl_req_info base;
+};
+
+struct rings_reply_data {
+ struct ethnl_reply_data base;
+ struct ethtool_ringparam ringparam;
+ struct kernel_ethtool_ringparam kernel_ringparam;
+ u32 supported_ring_params;
+};
+
+#define RINGS_REPDATA(__reply_base) \
+ container_of(__reply_base, struct rings_reply_data, base)
+
+const struct nla_policy ethnl_rings_get_policy[] = {
+ [ETHTOOL_A_RINGS_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
+};
+
+static int rings_prepare_data(const struct ethnl_req_info *req_base,
+ struct ethnl_reply_data *reply_base,
+ const struct genl_info *info)
+{
+ struct rings_reply_data *data = RINGS_REPDATA(reply_base);
+ struct net_device *dev = reply_base->dev;
+ int ret;
+
+ if (!dev->ethtool_ops->get_ringparam)
+ return -EOPNOTSUPP;
+
+ data->supported_ring_params = dev->ethtool_ops->supported_ring_params;
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ return ret;
+
+ data->kernel_ringparam.tcp_data_split = dev->cfg->hds_config;
+ data->kernel_ringparam.hds_thresh = dev->cfg->hds_thresh;
+
+ dev->ethtool_ops->get_ringparam(dev, &data->ringparam,
+ &data->kernel_ringparam, info->extack);
+ ethnl_ops_complete(dev);
+
+ return 0;
+}
+
+static int rings_reply_size(const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ return nla_total_size(sizeof(u32)) + /* _RINGS_RX_MAX */
+ nla_total_size(sizeof(u32)) + /* _RINGS_RX_MINI_MAX */
+ nla_total_size(sizeof(u32)) + /* _RINGS_RX_JUMBO_MAX */
+ nla_total_size(sizeof(u32)) + /* _RINGS_TX_MAX */
+ nla_total_size(sizeof(u32)) + /* _RINGS_RX */
+ nla_total_size(sizeof(u32)) + /* _RINGS_RX_MINI */
+ nla_total_size(sizeof(u32)) + /* _RINGS_RX_JUMBO */
+ nla_total_size(sizeof(u32)) + /* _RINGS_TX */
+ nla_total_size(sizeof(u32)) + /* _RINGS_RX_BUF_LEN */
+ nla_total_size(sizeof(u8)) + /* _RINGS_TCP_DATA_SPLIT */
+ nla_total_size(sizeof(u32) + /* _RINGS_CQE_SIZE */
+ nla_total_size(sizeof(u8)) + /* _RINGS_TX_PUSH */
+ nla_total_size(sizeof(u8))) + /* _RINGS_RX_PUSH */
+ nla_total_size(sizeof(u32)) + /* _RINGS_TX_PUSH_BUF_LEN */
+ nla_total_size(sizeof(u32)) + /* _RINGS_TX_PUSH_BUF_LEN_MAX */
+ nla_total_size(sizeof(u32)) + /* _RINGS_HDS_THRESH */
+ nla_total_size(sizeof(u32)); /* _RINGS_HDS_THRESH_MAX*/
+}
+
+static int rings_fill_reply(struct sk_buff *skb,
+ const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ const struct rings_reply_data *data = RINGS_REPDATA(reply_base);
+ const struct kernel_ethtool_ringparam *kr = &data->kernel_ringparam;
+ const struct ethtool_ringparam *ringparam = &data->ringparam;
+ u32 supported_ring_params = data->supported_ring_params;
+
+ WARN_ON(kr->tcp_data_split > ETHTOOL_TCP_DATA_SPLIT_ENABLED);
+
+ if ((ringparam->rx_max_pending &&
+ (nla_put_u32(skb, ETHTOOL_A_RINGS_RX_MAX,
+ ringparam->rx_max_pending) ||
+ nla_put_u32(skb, ETHTOOL_A_RINGS_RX,
+ ringparam->rx_pending))) ||
+ (ringparam->rx_mini_max_pending &&
+ (nla_put_u32(skb, ETHTOOL_A_RINGS_RX_MINI_MAX,
+ ringparam->rx_mini_max_pending) ||
+ nla_put_u32(skb, ETHTOOL_A_RINGS_RX_MINI,
+ ringparam->rx_mini_pending))) ||
+ (ringparam->rx_jumbo_max_pending &&
+ (nla_put_u32(skb, ETHTOOL_A_RINGS_RX_JUMBO_MAX,
+ ringparam->rx_jumbo_max_pending) ||
+ nla_put_u32(skb, ETHTOOL_A_RINGS_RX_JUMBO,
+ ringparam->rx_jumbo_pending))) ||
+ (ringparam->tx_max_pending &&
+ (nla_put_u32(skb, ETHTOOL_A_RINGS_TX_MAX,
+ ringparam->tx_max_pending) ||
+ nla_put_u32(skb, ETHTOOL_A_RINGS_TX,
+ ringparam->tx_pending))) ||
+ (kr->rx_buf_len &&
+ (nla_put_u32(skb, ETHTOOL_A_RINGS_RX_BUF_LEN, kr->rx_buf_len))) ||
+ (kr->tcp_data_split &&
+ (nla_put_u8(skb, ETHTOOL_A_RINGS_TCP_DATA_SPLIT,
+ kr->tcp_data_split))) ||
+ (kr->cqe_size &&
+ (nla_put_u32(skb, ETHTOOL_A_RINGS_CQE_SIZE, kr->cqe_size))) ||
+ nla_put_u8(skb, ETHTOOL_A_RINGS_TX_PUSH, !!kr->tx_push) ||
+ nla_put_u8(skb, ETHTOOL_A_RINGS_RX_PUSH, !!kr->rx_push) ||
+ ((supported_ring_params & ETHTOOL_RING_USE_TX_PUSH_BUF_LEN) &&
+ (nla_put_u32(skb, ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN_MAX,
+ kr->tx_push_buf_max_len) ||
+ nla_put_u32(skb, ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN,
+ kr->tx_push_buf_len))) ||
+ ((supported_ring_params & ETHTOOL_RING_USE_HDS_THRS) &&
+ (nla_put_u32(skb, ETHTOOL_A_RINGS_HDS_THRESH,
+ kr->hds_thresh) ||
+ nla_put_u32(skb, ETHTOOL_A_RINGS_HDS_THRESH_MAX,
+ kr->hds_thresh_max))))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+/* RINGS_SET */
+
+const struct nla_policy ethnl_rings_set_policy[] = {
+ [ETHTOOL_A_RINGS_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
+ [ETHTOOL_A_RINGS_RX] = { .type = NLA_U32 },
+ [ETHTOOL_A_RINGS_RX_MINI] = { .type = NLA_U32 },
+ [ETHTOOL_A_RINGS_RX_JUMBO] = { .type = NLA_U32 },
+ [ETHTOOL_A_RINGS_TX] = { .type = NLA_U32 },
+ [ETHTOOL_A_RINGS_RX_BUF_LEN] = NLA_POLICY_MIN(NLA_U32, 1),
+ [ETHTOOL_A_RINGS_TCP_DATA_SPLIT] =
+ NLA_POLICY_MAX(NLA_U8, ETHTOOL_TCP_DATA_SPLIT_ENABLED),
+ [ETHTOOL_A_RINGS_CQE_SIZE] = NLA_POLICY_MIN(NLA_U32, 1),
+ [ETHTOOL_A_RINGS_TX_PUSH] = NLA_POLICY_MAX(NLA_U8, 1),
+ [ETHTOOL_A_RINGS_RX_PUSH] = NLA_POLICY_MAX(NLA_U8, 1),
+ [ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN] = { .type = NLA_U32 },
+ [ETHTOOL_A_RINGS_HDS_THRESH] = { .type = NLA_U32 },
+};
+
+static int
+ethnl_set_rings_validate(struct ethnl_req_info *req_info,
+ struct genl_info *info)
+{
+ const struct ethtool_ops *ops = req_info->dev->ethtool_ops;
+ struct nlattr **tb = info->attrs;
+
+ if (tb[ETHTOOL_A_RINGS_RX_BUF_LEN] &&
+ !(ops->supported_ring_params & ETHTOOL_RING_USE_RX_BUF_LEN)) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ tb[ETHTOOL_A_RINGS_RX_BUF_LEN],
+ "setting rx buf len not supported");
+ return -EOPNOTSUPP;
+ }
+
+ if (tb[ETHTOOL_A_RINGS_TCP_DATA_SPLIT] &&
+ !(ops->supported_ring_params & ETHTOOL_RING_USE_TCP_DATA_SPLIT)) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ tb[ETHTOOL_A_RINGS_TCP_DATA_SPLIT],
+ "setting TCP data split is not supported");
+ return -EOPNOTSUPP;
+ }
+
+ if (tb[ETHTOOL_A_RINGS_HDS_THRESH] &&
+ !(ops->supported_ring_params & ETHTOOL_RING_USE_HDS_THRS)) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ tb[ETHTOOL_A_RINGS_HDS_THRESH],
+ "setting hds-thresh is not supported");
+ return -EOPNOTSUPP;
+ }
+
+ if (tb[ETHTOOL_A_RINGS_CQE_SIZE] &&
+ !(ops->supported_ring_params & ETHTOOL_RING_USE_CQE_SIZE)) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ tb[ETHTOOL_A_RINGS_CQE_SIZE],
+ "setting cqe size not supported");
+ return -EOPNOTSUPP;
+ }
+
+ if (tb[ETHTOOL_A_RINGS_TX_PUSH] &&
+ !(ops->supported_ring_params & ETHTOOL_RING_USE_TX_PUSH)) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ tb[ETHTOOL_A_RINGS_TX_PUSH],
+ "setting tx push not supported");
+ return -EOPNOTSUPP;
+ }
+
+ if (tb[ETHTOOL_A_RINGS_RX_PUSH] &&
+ !(ops->supported_ring_params & ETHTOOL_RING_USE_RX_PUSH)) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ tb[ETHTOOL_A_RINGS_RX_PUSH],
+ "setting rx push not supported");
+ return -EOPNOTSUPP;
+ }
+
+ if (tb[ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN] &&
+ !(ops->supported_ring_params & ETHTOOL_RING_USE_TX_PUSH_BUF_LEN)) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ tb[ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN],
+ "setting tx push buf len is not supported");
+ return -EOPNOTSUPP;
+ }
+
+ return ops->get_ringparam && ops->set_ringparam ? 1 : -EOPNOTSUPP;
+}
+
+static int
+ethnl_set_rings(struct ethnl_req_info *req_info, struct genl_info *info)
+{
+ struct kernel_ethtool_ringparam kernel_ringparam;
+ struct net_device *dev = req_info->dev;
+ struct ethtool_ringparam ringparam;
+ struct nlattr **tb = info->attrs;
+ const struct nlattr *err_attr;
+ bool mod = false;
+ int ret;
+
+ ethtool_ringparam_get_cfg(dev, &ringparam, &kernel_ringparam,
+ info->extack);
+
+ ethnl_update_u32(&ringparam.rx_pending, tb[ETHTOOL_A_RINGS_RX], &mod);
+ ethnl_update_u32(&ringparam.rx_mini_pending,
+ tb[ETHTOOL_A_RINGS_RX_MINI], &mod);
+ ethnl_update_u32(&ringparam.rx_jumbo_pending,
+ tb[ETHTOOL_A_RINGS_RX_JUMBO], &mod);
+ ethnl_update_u32(&ringparam.tx_pending, tb[ETHTOOL_A_RINGS_TX], &mod);
+ ethnl_update_u32(&kernel_ringparam.rx_buf_len,
+ tb[ETHTOOL_A_RINGS_RX_BUF_LEN], &mod);
+ ethnl_update_u8(&kernel_ringparam.tcp_data_split,
+ tb[ETHTOOL_A_RINGS_TCP_DATA_SPLIT], &mod);
+ ethnl_update_u32(&kernel_ringparam.cqe_size,
+ tb[ETHTOOL_A_RINGS_CQE_SIZE], &mod);
+ ethnl_update_u8(&kernel_ringparam.tx_push,
+ tb[ETHTOOL_A_RINGS_TX_PUSH], &mod);
+ ethnl_update_u8(&kernel_ringparam.rx_push,
+ tb[ETHTOOL_A_RINGS_RX_PUSH], &mod);
+ ethnl_update_u32(&kernel_ringparam.tx_push_buf_len,
+ tb[ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN], &mod);
+ ethnl_update_u32(&kernel_ringparam.hds_thresh,
+ tb[ETHTOOL_A_RINGS_HDS_THRESH], &mod);
+ if (!mod)
+ return 0;
+
+ if (kernel_ringparam.tcp_data_split == ETHTOOL_TCP_DATA_SPLIT_ENABLED &&
+ dev_xdp_sb_prog_count(dev)) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ tb[ETHTOOL_A_RINGS_TCP_DATA_SPLIT],
+ "tcp-data-split can not be enabled with single buffer XDP");
+ return -EINVAL;
+ }
+
+ if (dev_get_min_mp_channel_count(dev)) {
+ if (kernel_ringparam.tcp_data_split !=
+ ETHTOOL_TCP_DATA_SPLIT_ENABLED) {
+ NL_SET_ERR_MSG(info->extack,
+ "can't disable tcp-data-split while device has memory provider enabled");
+ return -EINVAL;
+ } else if (kernel_ringparam.hds_thresh) {
+ NL_SET_ERR_MSG(info->extack,
+ "can't set non-zero hds_thresh while device is memory provider enabled");
+ return -EINVAL;
+ }
+ }
+
+ /* ensure new ring parameters are within limits */
+ if (ringparam.rx_pending > ringparam.rx_max_pending)
+ err_attr = tb[ETHTOOL_A_RINGS_RX];
+ else if (ringparam.rx_mini_pending > ringparam.rx_mini_max_pending)
+ err_attr = tb[ETHTOOL_A_RINGS_RX_MINI];
+ else if (ringparam.rx_jumbo_pending > ringparam.rx_jumbo_max_pending)
+ err_attr = tb[ETHTOOL_A_RINGS_RX_JUMBO];
+ else if (ringparam.tx_pending > ringparam.tx_max_pending)
+ err_attr = tb[ETHTOOL_A_RINGS_TX];
+ else if (kernel_ringparam.hds_thresh > kernel_ringparam.hds_thresh_max)
+ err_attr = tb[ETHTOOL_A_RINGS_HDS_THRESH];
+ else
+ err_attr = NULL;
+ if (err_attr) {
+ NL_SET_ERR_MSG_ATTR(info->extack, err_attr,
+ "requested ring size exceeds maximum");
+ return -EINVAL;
+ }
+
+ if (kernel_ringparam.tx_push_buf_len > kernel_ringparam.tx_push_buf_max_len) {
+ NL_SET_ERR_MSG_ATTR_FMT(info->extack, tb[ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN],
+ "Requested TX push buffer exceeds the maximum of %u",
+ kernel_ringparam.tx_push_buf_max_len);
+
+ return -EINVAL;
+ }
+
+ dev->cfg_pending->hds_config = kernel_ringparam.tcp_data_split;
+ dev->cfg_pending->hds_thresh = kernel_ringparam.hds_thresh;
+
+ ret = dev->ethtool_ops->set_ringparam(dev, &ringparam,
+ &kernel_ringparam, info->extack);
+ return ret < 0 ? ret : 1;
+}
+
+const struct ethnl_request_ops ethnl_rings_request_ops = {
+ .request_cmd = ETHTOOL_MSG_RINGS_GET,
+ .reply_cmd = ETHTOOL_MSG_RINGS_GET_REPLY,
+ .hdr_attr = ETHTOOL_A_RINGS_HEADER,
+ .req_info_size = sizeof(struct rings_req_info),
+ .reply_data_size = sizeof(struct rings_reply_data),
+
+ .prepare_data = rings_prepare_data,
+ .reply_size = rings_reply_size,
+ .fill_reply = rings_fill_reply,
+
+ .set_validate = ethnl_set_rings_validate,
+ .set = ethnl_set_rings,
+ .set_ntf_cmd = ETHTOOL_MSG_RINGS_NTF,
+};
diff --git a/net/ethtool/rss.c b/net/ethtool/rss.c
new file mode 100644
index 000000000000..4dced53be4b3
--- /dev/null
+++ b/net/ethtool/rss.c
@@ -0,0 +1,1205 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <net/netdev_lock.h>
+
+#include "netlink.h"
+#include "common.h"
+
+struct rss_req_info {
+ struct ethnl_req_info base;
+ u32 rss_context;
+};
+
+struct rss_reply_data {
+ struct ethnl_reply_data base;
+ bool has_flow_hash;
+ bool no_key_fields;
+ u32 indir_size;
+ u32 hkey_size;
+ u32 hfunc;
+ u32 input_xfrm;
+ u32 *indir_table;
+ u8 *hkey;
+ int flow_hash[__ETHTOOL_A_FLOW_CNT];
+};
+
+static const u8 ethtool_rxfh_ft_nl2ioctl[] = {
+ [ETHTOOL_A_FLOW_ETHER] = ETHER_FLOW,
+ [ETHTOOL_A_FLOW_IP4] = IPV4_FLOW,
+ [ETHTOOL_A_FLOW_IP6] = IPV6_FLOW,
+ [ETHTOOL_A_FLOW_TCP4] = TCP_V4_FLOW,
+ [ETHTOOL_A_FLOW_UDP4] = UDP_V4_FLOW,
+ [ETHTOOL_A_FLOW_SCTP4] = SCTP_V4_FLOW,
+ [ETHTOOL_A_FLOW_AH_ESP4] = AH_ESP_V4_FLOW,
+ [ETHTOOL_A_FLOW_TCP6] = TCP_V6_FLOW,
+ [ETHTOOL_A_FLOW_UDP6] = UDP_V6_FLOW,
+ [ETHTOOL_A_FLOW_SCTP6] = SCTP_V6_FLOW,
+ [ETHTOOL_A_FLOW_AH_ESP6] = AH_ESP_V6_FLOW,
+ [ETHTOOL_A_FLOW_AH4] = AH_V4_FLOW,
+ [ETHTOOL_A_FLOW_ESP4] = ESP_V4_FLOW,
+ [ETHTOOL_A_FLOW_AH6] = AH_V6_FLOW,
+ [ETHTOOL_A_FLOW_ESP6] = ESP_V6_FLOW,
+ [ETHTOOL_A_FLOW_GTPU4] = GTPU_V4_FLOW,
+ [ETHTOOL_A_FLOW_GTPU6] = GTPU_V6_FLOW,
+ [ETHTOOL_A_FLOW_GTPC4] = GTPC_V4_FLOW,
+ [ETHTOOL_A_FLOW_GTPC6] = GTPC_V6_FLOW,
+ [ETHTOOL_A_FLOW_GTPC_TEID4] = GTPC_TEID_V4_FLOW,
+ [ETHTOOL_A_FLOW_GTPC_TEID6] = GTPC_TEID_V6_FLOW,
+ [ETHTOOL_A_FLOW_GTPU_EH4] = GTPU_EH_V4_FLOW,
+ [ETHTOOL_A_FLOW_GTPU_EH6] = GTPU_EH_V6_FLOW,
+ [ETHTOOL_A_FLOW_GTPU_UL4] = GTPU_UL_V4_FLOW,
+ [ETHTOOL_A_FLOW_GTPU_UL6] = GTPU_UL_V6_FLOW,
+ [ETHTOOL_A_FLOW_GTPU_DL4] = GTPU_DL_V4_FLOW,
+ [ETHTOOL_A_FLOW_GTPU_DL6] = GTPU_DL_V6_FLOW,
+};
+
+#define RSS_REQINFO(__req_base) \
+ container_of(__req_base, struct rss_req_info, base)
+
+#define RSS_REPDATA(__reply_base) \
+ container_of(__reply_base, struct rss_reply_data, base)
+
+const struct nla_policy ethnl_rss_get_policy[] = {
+ [ETHTOOL_A_RSS_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy),
+ [ETHTOOL_A_RSS_CONTEXT] = { .type = NLA_U32 },
+ [ETHTOOL_A_RSS_START_CONTEXT] = { .type = NLA_U32 },
+};
+
+static int
+rss_parse_request(struct ethnl_req_info *req_info, struct nlattr **tb,
+ struct netlink_ext_ack *extack)
+{
+ struct rss_req_info *request = RSS_REQINFO(req_info);
+
+ if (tb[ETHTOOL_A_RSS_CONTEXT])
+ request->rss_context = nla_get_u32(tb[ETHTOOL_A_RSS_CONTEXT]);
+ if (tb[ETHTOOL_A_RSS_START_CONTEXT]) {
+ NL_SET_BAD_ATTR(extack, tb[ETHTOOL_A_RSS_START_CONTEXT]);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static void
+rss_prepare_flow_hash(const struct rss_req_info *req, struct net_device *dev,
+ struct rss_reply_data *data, const struct genl_info *info)
+{
+ int i;
+
+ data->has_flow_hash = false;
+
+ if (!dev->ethtool_ops->get_rxfh_fields)
+ return;
+ if (req->rss_context && !dev->ethtool_ops->rxfh_per_ctx_fields)
+ return;
+
+ mutex_lock(&dev->ethtool->rss_lock);
+ for (i = 1; i < __ETHTOOL_A_FLOW_CNT; i++) {
+ struct ethtool_rxfh_fields fields = {
+ .flow_type = ethtool_rxfh_ft_nl2ioctl[i],
+ .rss_context = req->rss_context,
+ };
+
+ if (dev->ethtool_ops->get_rxfh_fields(dev, &fields)) {
+ data->flow_hash[i] = -1; /* Unsupported */
+ continue;
+ }
+
+ data->flow_hash[i] = fields.data;
+ data->has_flow_hash = true;
+ }
+ mutex_unlock(&dev->ethtool->rss_lock);
+}
+
+static int
+rss_get_data_alloc(struct net_device *dev, struct rss_reply_data *data)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ u32 total_size, indir_bytes;
+ u8 *rss_config;
+
+ data->indir_size = 0;
+ data->hkey_size = 0;
+ if (ops->get_rxfh_indir_size)
+ data->indir_size = ops->get_rxfh_indir_size(dev);
+ if (ops->get_rxfh_key_size)
+ data->hkey_size = ops->get_rxfh_key_size(dev);
+
+ indir_bytes = data->indir_size * sizeof(u32);
+ total_size = indir_bytes + data->hkey_size;
+ rss_config = kzalloc(total_size, GFP_KERNEL);
+ if (!rss_config)
+ return -ENOMEM;
+
+ if (data->indir_size)
+ data->indir_table = (u32 *)rss_config;
+ if (data->hkey_size)
+ data->hkey = rss_config + indir_bytes;
+
+ return 0;
+}
+
+static void rss_get_data_free(const struct rss_reply_data *data)
+{
+ kfree(data->indir_table);
+}
+
+static int
+rss_prepare_get(const struct rss_req_info *request, struct net_device *dev,
+ struct rss_reply_data *data, const struct genl_info *info)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct ethtool_rxfh_param rxfh = {};
+ int ret;
+
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ return ret;
+ mutex_lock(&dev->ethtool->rss_lock);
+
+ ret = rss_get_data_alloc(dev, data);
+ if (ret)
+ goto out_unlock;
+
+ rxfh.indir_size = data->indir_size;
+ rxfh.indir = data->indir_table;
+ rxfh.key_size = data->hkey_size;
+ rxfh.key = data->hkey;
+
+ ret = ops->get_rxfh(dev, &rxfh);
+ if (ret)
+ goto out_unlock;
+
+ data->hfunc = rxfh.hfunc;
+ data->input_xfrm = rxfh.input_xfrm;
+out_unlock:
+ mutex_unlock(&dev->ethtool->rss_lock);
+ ethnl_ops_complete(dev);
+ return ret;
+}
+
+static void
+__rss_prepare_ctx(struct net_device *dev, struct rss_reply_data *data,
+ struct ethtool_rxfh_context *ctx)
+{
+ if (WARN_ON_ONCE(data->indir_size != ctx->indir_size ||
+ data->hkey_size != ctx->key_size))
+ return;
+
+ data->no_key_fields = !dev->ethtool_ops->rxfh_per_ctx_key;
+
+ data->hfunc = ctx->hfunc;
+ data->input_xfrm = ctx->input_xfrm;
+ memcpy(data->indir_table, ethtool_rxfh_context_indir(ctx),
+ data->indir_size * sizeof(u32));
+ if (data->hkey_size)
+ memcpy(data->hkey, ethtool_rxfh_context_key(ctx),
+ data->hkey_size);
+}
+
+static int
+rss_prepare_ctx(const struct rss_req_info *request, struct net_device *dev,
+ struct rss_reply_data *data, const struct genl_info *info)
+{
+ struct ethtool_rxfh_context *ctx;
+ u32 total_size, indir_bytes;
+ u8 *rss_config;
+ int ret;
+
+ mutex_lock(&dev->ethtool->rss_lock);
+ ctx = xa_load(&dev->ethtool->rss_ctx, request->rss_context);
+ if (!ctx) {
+ ret = -ENOENT;
+ goto out_unlock;
+ }
+
+ data->indir_size = ctx->indir_size;
+ data->hkey_size = ctx->key_size;
+
+ indir_bytes = data->indir_size * sizeof(u32);
+ total_size = indir_bytes + data->hkey_size;
+ rss_config = kzalloc(total_size, GFP_KERNEL);
+ if (!rss_config) {
+ ret = -ENOMEM;
+ goto out_unlock;
+ }
+
+ data->indir_table = (u32 *)rss_config;
+ if (data->hkey_size)
+ data->hkey = rss_config + indir_bytes;
+
+ __rss_prepare_ctx(dev, data, ctx);
+
+ ret = 0;
+out_unlock:
+ mutex_unlock(&dev->ethtool->rss_lock);
+ return ret;
+}
+
+static int
+rss_prepare(const struct rss_req_info *request, struct net_device *dev,
+ struct rss_reply_data *data, const struct genl_info *info)
+{
+ rss_prepare_flow_hash(request, dev, data, info);
+
+ /* Coming from RSS_SET, driver may only have flow_hash_fields ops */
+ if (!dev->ethtool_ops->get_rxfh)
+ return 0;
+
+ if (request->rss_context)
+ return rss_prepare_ctx(request, dev, data, info);
+ return rss_prepare_get(request, dev, data, info);
+}
+
+static int
+rss_prepare_data(const struct ethnl_req_info *req_base,
+ struct ethnl_reply_data *reply_base,
+ const struct genl_info *info)
+{
+ struct rss_reply_data *data = RSS_REPDATA(reply_base);
+ struct rss_req_info *request = RSS_REQINFO(req_base);
+ struct net_device *dev = reply_base->dev;
+ const struct ethtool_ops *ops;
+
+ ops = dev->ethtool_ops;
+ if (!ops->get_rxfh)
+ return -EOPNOTSUPP;
+
+ /* Some drivers don't handle rss_context */
+ if (request->rss_context && !ops->create_rxfh_context)
+ return -EOPNOTSUPP;
+
+ return rss_prepare(request, dev, data, info);
+}
+
+static int
+rss_reply_size(const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ const struct rss_reply_data *data = RSS_REPDATA(reply_base);
+ int len;
+
+ len = nla_total_size(sizeof(u32)) + /* _RSS_CONTEXT */
+ nla_total_size(sizeof(u32)) + /* _RSS_HFUNC */
+ nla_total_size(sizeof(u32)) + /* _RSS_INPUT_XFRM */
+ nla_total_size(sizeof(u32) * data->indir_size) + /* _RSS_INDIR */
+ nla_total_size(data->hkey_size) + /* _RSS_HKEY */
+ nla_total_size(0) + /* _RSS_FLOW_HASH */
+ nla_total_size(sizeof(u32)) * ETHTOOL_A_FLOW_MAX +
+ 0;
+
+ return len;
+}
+
+static int
+rss_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ const struct rss_reply_data *data = RSS_REPDATA(reply_base);
+ struct rss_req_info *request = RSS_REQINFO(req_base);
+
+ if (request->rss_context &&
+ nla_put_u32(skb, ETHTOOL_A_RSS_CONTEXT, request->rss_context))
+ return -EMSGSIZE;
+
+ if ((data->indir_size &&
+ nla_put(skb, ETHTOOL_A_RSS_INDIR,
+ sizeof(u32) * data->indir_size, data->indir_table)))
+ return -EMSGSIZE;
+
+ if (!data->no_key_fields &&
+ ((data->hfunc &&
+ nla_put_u32(skb, ETHTOOL_A_RSS_HFUNC, data->hfunc)) ||
+ (data->input_xfrm &&
+ nla_put_u32(skb, ETHTOOL_A_RSS_INPUT_XFRM, data->input_xfrm)) ||
+ (data->hkey_size &&
+ nla_put(skb, ETHTOOL_A_RSS_HKEY, data->hkey_size, data->hkey))))
+ return -EMSGSIZE;
+
+ if (data->has_flow_hash) {
+ struct nlattr *nest;
+ int i;
+
+ nest = nla_nest_start(skb, ETHTOOL_A_RSS_FLOW_HASH);
+ if (!nest)
+ return -EMSGSIZE;
+
+ for (i = 1; i < __ETHTOOL_A_FLOW_CNT; i++) {
+ if (data->flow_hash[i] >= 0 &&
+ nla_put_uint(skb, i, data->flow_hash[i])) {
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+ }
+ }
+
+ nla_nest_end(skb, nest);
+ }
+
+ return 0;
+}
+
+static void rss_cleanup_data(struct ethnl_reply_data *reply_base)
+{
+ const struct rss_reply_data *data = RSS_REPDATA(reply_base);
+
+ rss_get_data_free(data);
+}
+
+struct rss_nl_dump_ctx {
+ unsigned long ifindex;
+ unsigned long ctx_idx;
+
+ /* User wants to only dump contexts from given ifindex */
+ unsigned int match_ifindex;
+ unsigned int start_ctx;
+};
+
+static struct rss_nl_dump_ctx *rss_dump_ctx(struct netlink_callback *cb)
+{
+ NL_ASSERT_CTX_FITS(struct rss_nl_dump_ctx);
+
+ return (struct rss_nl_dump_ctx *)cb->ctx;
+}
+
+int ethnl_rss_dump_start(struct netlink_callback *cb)
+{
+ const struct genl_info *info = genl_info_dump(cb);
+ struct rss_nl_dump_ctx *ctx = rss_dump_ctx(cb);
+ struct ethnl_req_info req_info = {};
+ struct nlattr **tb = info->attrs;
+ int ret;
+
+ /* Filtering by context not supported */
+ if (tb[ETHTOOL_A_RSS_CONTEXT]) {
+ NL_SET_BAD_ATTR(info->extack, tb[ETHTOOL_A_RSS_CONTEXT]);
+ return -EINVAL;
+ }
+ if (tb[ETHTOOL_A_RSS_START_CONTEXT]) {
+ ctx->start_ctx = nla_get_u32(tb[ETHTOOL_A_RSS_START_CONTEXT]);
+ ctx->ctx_idx = ctx->start_ctx;
+ }
+
+ ret = ethnl_parse_header_dev_get(&req_info,
+ tb[ETHTOOL_A_RSS_HEADER],
+ sock_net(cb->skb->sk), cb->extack,
+ false);
+ if (req_info.dev) {
+ ctx->match_ifindex = req_info.dev->ifindex;
+ ctx->ifindex = ctx->match_ifindex;
+ ethnl_parse_header_dev_put(&req_info);
+ req_info.dev = NULL;
+ }
+
+ return ret;
+}
+
+static int
+rss_dump_one_ctx(struct sk_buff *skb, struct netlink_callback *cb,
+ struct net_device *dev, u32 rss_context)
+{
+ const struct genl_info *info = genl_info_dump(cb);
+ struct rss_reply_data data = {};
+ struct rss_req_info req = {};
+ void *ehdr;
+ int ret;
+
+ req.rss_context = rss_context;
+
+ ehdr = ethnl_dump_put(skb, cb, ETHTOOL_MSG_RSS_GET_REPLY);
+ if (!ehdr)
+ return -EMSGSIZE;
+
+ ret = ethnl_fill_reply_header(skb, dev, ETHTOOL_A_RSS_HEADER);
+ if (ret < 0)
+ goto err_cancel;
+
+ ret = rss_prepare(&req, dev, &data, info);
+ if (ret)
+ goto err_cancel;
+
+ ret = rss_fill_reply(skb, &req.base, &data.base);
+ if (ret)
+ goto err_cleanup;
+ genlmsg_end(skb, ehdr);
+
+ rss_cleanup_data(&data.base);
+ return 0;
+
+err_cleanup:
+ rss_cleanup_data(&data.base);
+err_cancel:
+ genlmsg_cancel(skb, ehdr);
+ return ret;
+}
+
+static int
+rss_dump_one_dev(struct sk_buff *skb, struct netlink_callback *cb,
+ struct net_device *dev)
+{
+ struct rss_nl_dump_ctx *ctx = rss_dump_ctx(cb);
+ int ret;
+
+ if (!dev->ethtool_ops->get_rxfh)
+ return 0;
+
+ if (!ctx->ctx_idx) {
+ ret = rss_dump_one_ctx(skb, cb, dev, 0);
+ if (ret)
+ return ret;
+ ctx->ctx_idx++;
+ }
+
+ for (; xa_find(&dev->ethtool->rss_ctx, &ctx->ctx_idx,
+ ULONG_MAX, XA_PRESENT); ctx->ctx_idx++) {
+ ret = rss_dump_one_ctx(skb, cb, dev, ctx->ctx_idx);
+ if (ret)
+ return ret;
+ }
+ ctx->ctx_idx = ctx->start_ctx;
+
+ return 0;
+}
+
+int ethnl_rss_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct rss_nl_dump_ctx *ctx = rss_dump_ctx(cb);
+ struct net *net = sock_net(skb->sk);
+ struct net_device *dev;
+ int ret = 0;
+
+ rtnl_lock();
+ for_each_netdev_dump(net, dev, ctx->ifindex) {
+ if (ctx->match_ifindex && ctx->match_ifindex != ctx->ifindex)
+ break;
+
+ netdev_lock_ops(dev);
+ ret = rss_dump_one_dev(skb, cb, dev);
+ netdev_unlock_ops(dev);
+ if (ret)
+ break;
+ }
+ rtnl_unlock();
+
+ return ret;
+}
+
+/* RSS_NTF */
+
+static void ethnl_rss_delete_notify(struct net_device *dev, u32 rss_context)
+{
+ struct sk_buff *ntf;
+ size_t ntf_size;
+ void *hdr;
+
+ ntf_size = ethnl_reply_header_size() +
+ nla_total_size(sizeof(u32)); /* _RSS_CONTEXT */
+
+ ntf = genlmsg_new(ntf_size, GFP_KERNEL);
+ if (!ntf)
+ goto out_warn;
+
+ hdr = ethnl_bcastmsg_put(ntf, ETHTOOL_MSG_RSS_DELETE_NTF);
+ if (!hdr)
+ goto out_free_ntf;
+
+ if (ethnl_fill_reply_header(ntf, dev, ETHTOOL_A_RSS_HEADER) ||
+ nla_put_u32(ntf, ETHTOOL_A_RSS_CONTEXT, rss_context))
+ goto out_free_ntf;
+
+ genlmsg_end(ntf, hdr);
+ if (ethnl_multicast(ntf, dev))
+ goto out_warn;
+
+ return;
+
+out_free_ntf:
+ nlmsg_free(ntf);
+out_warn:
+ pr_warn_once("Failed to send a RSS delete notification");
+}
+
+void ethtool_rss_notify(struct net_device *dev, u32 type, u32 rss_context)
+{
+ struct rss_req_info req_info = {
+ .rss_context = rss_context,
+ };
+
+ if (type == ETHTOOL_MSG_RSS_DELETE_NTF)
+ ethnl_rss_delete_notify(dev, rss_context);
+ else
+ ethnl_notify(dev, type, &req_info.base);
+}
+
+/* RSS_SET */
+
+#define RFH_MASK (RXH_L2DA | RXH_VLAN | RXH_IP_SRC | RXH_IP_DST | \
+ RXH_L3_PROTO | RXH_L4_B_0_1 | RXH_L4_B_2_3 | \
+ RXH_GTP_TEID | RXH_DISCARD)
+#define RFH_MASKv6 (RFH_MASK | RXH_IP6_FL)
+
+static const struct nla_policy ethnl_rss_flows_policy[] = {
+ [ETHTOOL_A_FLOW_ETHER] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
+ [ETHTOOL_A_FLOW_IP4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
+ [ETHTOOL_A_FLOW_IP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6),
+ [ETHTOOL_A_FLOW_TCP4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
+ [ETHTOOL_A_FLOW_UDP4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
+ [ETHTOOL_A_FLOW_SCTP4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
+ [ETHTOOL_A_FLOW_AH_ESP4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
+ [ETHTOOL_A_FLOW_TCP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6),
+ [ETHTOOL_A_FLOW_UDP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6),
+ [ETHTOOL_A_FLOW_SCTP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6),
+ [ETHTOOL_A_FLOW_AH_ESP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6),
+ [ETHTOOL_A_FLOW_AH4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
+ [ETHTOOL_A_FLOW_ESP4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
+ [ETHTOOL_A_FLOW_AH6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6),
+ [ETHTOOL_A_FLOW_ESP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6),
+ [ETHTOOL_A_FLOW_GTPU4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
+ [ETHTOOL_A_FLOW_GTPU6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6),
+ [ETHTOOL_A_FLOW_GTPC4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
+ [ETHTOOL_A_FLOW_GTPC6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6),
+ [ETHTOOL_A_FLOW_GTPC_TEID4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
+ [ETHTOOL_A_FLOW_GTPC_TEID6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6),
+ [ETHTOOL_A_FLOW_GTPU_EH4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
+ [ETHTOOL_A_FLOW_GTPU_EH6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6),
+ [ETHTOOL_A_FLOW_GTPU_UL4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
+ [ETHTOOL_A_FLOW_GTPU_UL6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6),
+ [ETHTOOL_A_FLOW_GTPU_DL4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
+ [ETHTOOL_A_FLOW_GTPU_DL6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6),
+};
+
+const struct nla_policy ethnl_rss_set_policy[ETHTOOL_A_RSS_FLOW_HASH + 1] = {
+ [ETHTOOL_A_RSS_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy),
+ [ETHTOOL_A_RSS_CONTEXT] = { .type = NLA_U32, },
+ [ETHTOOL_A_RSS_HFUNC] = NLA_POLICY_MIN(NLA_U32, 1),
+ [ETHTOOL_A_RSS_INDIR] = { .type = NLA_BINARY, },
+ [ETHTOOL_A_RSS_HKEY] = NLA_POLICY_MIN(NLA_BINARY, 1),
+ [ETHTOOL_A_RSS_INPUT_XFRM] =
+ NLA_POLICY_MAX(NLA_U32, RXH_XFRM_SYM_OR_XOR),
+ [ETHTOOL_A_RSS_FLOW_HASH] = NLA_POLICY_NESTED(ethnl_rss_flows_policy),
+};
+
+static int
+ethnl_rss_set_validate(struct ethnl_req_info *req_info, struct genl_info *info)
+{
+ const struct ethtool_ops *ops = req_info->dev->ethtool_ops;
+ struct rss_req_info *request = RSS_REQINFO(req_info);
+ struct nlattr **tb = info->attrs;
+ struct nlattr *bad_attr = NULL;
+ u32 input_xfrm;
+
+ if (request->rss_context && !ops->create_rxfh_context)
+ bad_attr = bad_attr ?: tb[ETHTOOL_A_RSS_CONTEXT];
+
+ if (request->rss_context && !ops->rxfh_per_ctx_key) {
+ bad_attr = bad_attr ?: tb[ETHTOOL_A_RSS_HFUNC];
+ bad_attr = bad_attr ?: tb[ETHTOOL_A_RSS_HKEY];
+ bad_attr = bad_attr ?: tb[ETHTOOL_A_RSS_INPUT_XFRM];
+ }
+
+ input_xfrm = nla_get_u32_default(tb[ETHTOOL_A_RSS_INPUT_XFRM], 0);
+ if (input_xfrm & ~ops->supported_input_xfrm)
+ bad_attr = bad_attr ?: tb[ETHTOOL_A_RSS_INPUT_XFRM];
+
+ if (tb[ETHTOOL_A_RSS_FLOW_HASH] && !ops->set_rxfh_fields)
+ bad_attr = bad_attr ?: tb[ETHTOOL_A_RSS_FLOW_HASH];
+ if (request->rss_context &&
+ tb[ETHTOOL_A_RSS_FLOW_HASH] && !ops->rxfh_per_ctx_fields)
+ bad_attr = bad_attr ?: tb[ETHTOOL_A_RSS_FLOW_HASH];
+
+ if (bad_attr) {
+ NL_SET_BAD_ATTR(info->extack, bad_attr);
+ return -EOPNOTSUPP;
+ }
+
+ return 1;
+}
+
+static int
+rss_set_prep_indir(struct net_device *dev, struct genl_info *info,
+ struct rss_reply_data *data, struct ethtool_rxfh_param *rxfh,
+ bool *reset, bool *mod)
+{
+ struct netlink_ext_ack *extack = info->extack;
+ struct nlattr **tb = info->attrs;
+ size_t alloc_size;
+ int num_rx_rings;
+ u32 user_size;
+ int i, err;
+
+ if (!tb[ETHTOOL_A_RSS_INDIR])
+ return 0;
+ if (!data->indir_size)
+ return -EOPNOTSUPP;
+
+ err = ethtool_get_rx_ring_count(dev);
+ if (err < 0)
+ return err;
+ num_rx_rings = err;
+
+ if (nla_len(tb[ETHTOOL_A_RSS_INDIR]) % 4) {
+ NL_SET_BAD_ATTR(info->extack, tb[ETHTOOL_A_RSS_INDIR]);
+ return -EINVAL;
+ }
+ user_size = nla_len(tb[ETHTOOL_A_RSS_INDIR]) / 4;
+ if (!user_size) {
+ if (rxfh->rss_context) {
+ NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_RSS_INDIR],
+ "can't reset table for a context");
+ return -EINVAL;
+ }
+ *reset = true;
+ } else if (data->indir_size % user_size) {
+ NL_SET_ERR_MSG_ATTR_FMT(extack, tb[ETHTOOL_A_RSS_INDIR],
+ "size (%d) mismatch with device indir table (%d)",
+ user_size, data->indir_size);
+ return -EINVAL;
+ }
+
+ rxfh->indir_size = data->indir_size;
+ alloc_size = array_size(data->indir_size, sizeof(rxfh->indir[0]));
+ rxfh->indir = kzalloc(alloc_size, GFP_KERNEL);
+ if (!rxfh->indir)
+ return -ENOMEM;
+
+ nla_memcpy(rxfh->indir, tb[ETHTOOL_A_RSS_INDIR], alloc_size);
+ for (i = 0; i < user_size; i++) {
+ if (rxfh->indir[i] < num_rx_rings)
+ continue;
+
+ NL_SET_ERR_MSG_ATTR_FMT(extack, tb[ETHTOOL_A_RSS_INDIR],
+ "entry %d: queue out of range (%d)",
+ i, rxfh->indir[i]);
+ err = -EINVAL;
+ goto err_free;
+ }
+
+ if (user_size) {
+ /* Replicate the user-provided table to fill the device table */
+ for (i = user_size; i < data->indir_size; i++)
+ rxfh->indir[i] = rxfh->indir[i % user_size];
+ } else {
+ for (i = 0; i < data->indir_size; i++)
+ rxfh->indir[i] =
+ ethtool_rxfh_indir_default(i, num_rx_rings);
+ }
+
+ *mod |= memcmp(rxfh->indir, data->indir_table, data->indir_size);
+
+ return 0;
+
+err_free:
+ kfree(rxfh->indir);
+ rxfh->indir = NULL;
+ return err;
+}
+
+static int
+rss_set_prep_hkey(struct net_device *dev, struct genl_info *info,
+ struct rss_reply_data *data, struct ethtool_rxfh_param *rxfh,
+ bool *mod)
+{
+ struct nlattr **tb = info->attrs;
+
+ if (!tb[ETHTOOL_A_RSS_HKEY])
+ return 0;
+
+ if (nla_len(tb[ETHTOOL_A_RSS_HKEY]) != data->hkey_size) {
+ NL_SET_BAD_ATTR(info->extack, tb[ETHTOOL_A_RSS_HKEY]);
+ return -EINVAL;
+ }
+
+ rxfh->key_size = data->hkey_size;
+ rxfh->key = kmemdup(data->hkey, data->hkey_size, GFP_KERNEL);
+ if (!rxfh->key)
+ return -ENOMEM;
+
+ ethnl_update_binary(rxfh->key, rxfh->key_size, tb[ETHTOOL_A_RSS_HKEY],
+ mod);
+ return 0;
+}
+
+static int
+rss_check_rxfh_fields_sym(struct net_device *dev, struct genl_info *info,
+ struct rss_reply_data *data, bool xfrm_sym)
+{
+ struct nlattr **tb = info->attrs;
+ int i;
+
+ if (!xfrm_sym)
+ return 0;
+ if (!data->has_flow_hash) {
+ NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_RSS_INPUT_XFRM],
+ "hash field config not reported");
+ return -EINVAL;
+ }
+
+ for (i = 1; i < __ETHTOOL_A_FLOW_CNT; i++)
+ if (data->flow_hash[i] >= 0 &&
+ !ethtool_rxfh_config_is_sym(data->flow_hash[i])) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ tb[ETHTOOL_A_RSS_INPUT_XFRM],
+ "hash field config is not symmetric");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int
+ethnl_set_rss_fields(struct net_device *dev, struct genl_info *info,
+ u32 rss_context, struct rss_reply_data *data,
+ bool xfrm_sym, bool *mod)
+{
+ struct nlattr *flow_nest = info->attrs[ETHTOOL_A_RSS_FLOW_HASH];
+ struct nlattr *flows[ETHTOOL_A_FLOW_MAX + 1];
+ const struct ethtool_ops *ops;
+ int i, ret;
+
+ ops = dev->ethtool_ops;
+
+ ret = rss_check_rxfh_fields_sym(dev, info, data, xfrm_sym);
+ if (ret)
+ return ret;
+
+ if (!flow_nest)
+ return 0;
+
+ ret = nla_parse_nested(flows, ARRAY_SIZE(ethnl_rss_flows_policy) - 1,
+ flow_nest, ethnl_rss_flows_policy, info->extack);
+ if (ret < 0)
+ return ret;
+
+ for (i = 1; i < __ETHTOOL_A_FLOW_CNT; i++) {
+ struct ethtool_rxfh_fields fields = {
+ .flow_type = ethtool_rxfh_ft_nl2ioctl[i],
+ .rss_context = rss_context,
+ };
+
+ if (!flows[i])
+ continue;
+
+ fields.data = nla_get_u32(flows[i]);
+ if (data->has_flow_hash && data->flow_hash[i] == fields.data)
+ continue;
+
+ if (xfrm_sym && !ethtool_rxfh_config_is_sym(fields.data)) {
+ NL_SET_ERR_MSG_ATTR(info->extack, flows[i],
+ "conflict with xfrm-input");
+ return -EINVAL;
+ }
+
+ ret = ops->set_rxfh_fields(dev, &fields, info->extack);
+ if (ret)
+ return ret;
+
+ *mod = true;
+ }
+
+ return 0;
+}
+
+static void
+rss_set_ctx_update(struct ethtool_rxfh_context *ctx, struct nlattr **tb,
+ struct rss_reply_data *data, struct ethtool_rxfh_param *rxfh)
+{
+ int i;
+
+ if (rxfh->indir) {
+ for (i = 0; i < data->indir_size; i++)
+ ethtool_rxfh_context_indir(ctx)[i] = rxfh->indir[i];
+ ctx->indir_configured = !!nla_len(tb[ETHTOOL_A_RSS_INDIR]);
+ }
+ if (rxfh->key) {
+ memcpy(ethtool_rxfh_context_key(ctx), rxfh->key,
+ data->hkey_size);
+ ctx->key_configured = !!rxfh->key_size;
+ }
+ if (rxfh->hfunc != ETH_RSS_HASH_NO_CHANGE)
+ ctx->hfunc = rxfh->hfunc;
+ if (rxfh->input_xfrm != RXH_XFRM_NO_CHANGE)
+ ctx->input_xfrm = rxfh->input_xfrm;
+}
+
+static int
+ethnl_rss_set(struct ethnl_req_info *req_info, struct genl_info *info)
+{
+ bool indir_reset = false, indir_mod, xfrm_sym = false;
+ struct rss_req_info *request = RSS_REQINFO(req_info);
+ struct ethtool_rxfh_context *ctx = NULL;
+ struct net_device *dev = req_info->dev;
+ bool mod = false, fields_mod = false;
+ struct ethtool_rxfh_param rxfh = {};
+ struct nlattr **tb = info->attrs;
+ struct rss_reply_data data = {};
+ const struct ethtool_ops *ops;
+ int ret;
+
+ ops = dev->ethtool_ops;
+ data.base.dev = dev;
+
+ ret = rss_prepare(request, dev, &data, info);
+ if (ret)
+ return ret;
+
+ rxfh.rss_context = request->rss_context;
+
+ ret = rss_set_prep_indir(dev, info, &data, &rxfh, &indir_reset, &mod);
+ if (ret)
+ goto exit_clean_data;
+ indir_mod = !!tb[ETHTOOL_A_RSS_INDIR];
+
+ rxfh.hfunc = data.hfunc;
+ ethnl_update_u8(&rxfh.hfunc, tb[ETHTOOL_A_RSS_HFUNC], &mod);
+ if (rxfh.hfunc == data.hfunc)
+ rxfh.hfunc = ETH_RSS_HASH_NO_CHANGE;
+
+ ret = rss_set_prep_hkey(dev, info, &data, &rxfh, &mod);
+ if (ret)
+ goto exit_free_indir;
+
+ rxfh.input_xfrm = data.input_xfrm;
+ ethnl_update_u8(&rxfh.input_xfrm, tb[ETHTOOL_A_RSS_INPUT_XFRM], &mod);
+ /* For drivers which don't support input_xfrm it will be set to 0xff
+ * in the RSS context info. In all other case input_xfrm != 0 means
+ * symmetric hashing is requested.
+ */
+ if (!request->rss_context || ops->rxfh_per_ctx_key)
+ xfrm_sym = rxfh.input_xfrm || data.input_xfrm;
+ if (rxfh.input_xfrm == data.input_xfrm)
+ rxfh.input_xfrm = RXH_XFRM_NO_CHANGE;
+
+ mutex_lock(&dev->ethtool->rss_lock);
+ if (request->rss_context) {
+ ctx = xa_load(&dev->ethtool->rss_ctx, request->rss_context);
+ if (!ctx) {
+ ret = -ENOENT;
+ goto exit_unlock;
+ }
+ }
+
+ ret = ethnl_set_rss_fields(dev, info, request->rss_context,
+ &data, xfrm_sym, &fields_mod);
+ if (ret)
+ goto exit_unlock;
+
+ if (!mod)
+ ret = 0; /* nothing to tell the driver */
+ else if (!ops->set_rxfh)
+ ret = -EOPNOTSUPP;
+ else if (!rxfh.rss_context)
+ ret = ops->set_rxfh(dev, &rxfh, info->extack);
+ else
+ ret = ops->modify_rxfh_context(dev, ctx, &rxfh, info->extack);
+ if (ret)
+ goto exit_unlock;
+
+ if (ctx)
+ rss_set_ctx_update(ctx, tb, &data, &rxfh);
+ else if (indir_reset)
+ dev->priv_flags &= ~IFF_RXFH_CONFIGURED;
+ else if (indir_mod)
+ dev->priv_flags |= IFF_RXFH_CONFIGURED;
+
+exit_unlock:
+ mutex_unlock(&dev->ethtool->rss_lock);
+ kfree(rxfh.key);
+exit_free_indir:
+ kfree(rxfh.indir);
+exit_clean_data:
+ rss_cleanup_data(&data.base);
+
+ return ret ?: mod || fields_mod;
+}
+
+const struct ethnl_request_ops ethnl_rss_request_ops = {
+ .request_cmd = ETHTOOL_MSG_RSS_GET,
+ .reply_cmd = ETHTOOL_MSG_RSS_GET_REPLY,
+ .hdr_attr = ETHTOOL_A_RSS_HEADER,
+ .req_info_size = sizeof(struct rss_req_info),
+ .reply_data_size = sizeof(struct rss_reply_data),
+
+ .parse_request = rss_parse_request,
+ .prepare_data = rss_prepare_data,
+ .reply_size = rss_reply_size,
+ .fill_reply = rss_fill_reply,
+ .cleanup_data = rss_cleanup_data,
+
+ .set_validate = ethnl_rss_set_validate,
+ .set = ethnl_rss_set,
+ .set_ntf_cmd = ETHTOOL_MSG_RSS_NTF,
+};
+
+/* RSS_CREATE */
+
+const struct nla_policy ethnl_rss_create_policy[ETHTOOL_A_RSS_INPUT_XFRM + 1] = {
+ [ETHTOOL_A_RSS_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy),
+ [ETHTOOL_A_RSS_CONTEXT] = NLA_POLICY_MIN(NLA_U32, 1),
+ [ETHTOOL_A_RSS_HFUNC] = NLA_POLICY_MIN(NLA_U32, 1),
+ [ETHTOOL_A_RSS_INDIR] = NLA_POLICY_MIN(NLA_BINARY, 1),
+ [ETHTOOL_A_RSS_HKEY] = NLA_POLICY_MIN(NLA_BINARY, 1),
+ [ETHTOOL_A_RSS_INPUT_XFRM] =
+ NLA_POLICY_MAX(NLA_U32, RXH_XFRM_SYM_OR_XOR),
+};
+
+static int
+ethnl_rss_create_validate(struct net_device *dev, struct genl_info *info)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct nlattr **tb = info->attrs;
+ struct nlattr *bad_attr = NULL;
+ u32 rss_context, input_xfrm;
+
+ if (!ops->create_rxfh_context)
+ return -EOPNOTSUPP;
+
+ rss_context = nla_get_u32_default(tb[ETHTOOL_A_RSS_CONTEXT], 0);
+ if (ops->rxfh_max_num_contexts &&
+ ops->rxfh_max_num_contexts <= rss_context) {
+ NL_SET_BAD_ATTR(info->extack, tb[ETHTOOL_A_RSS_CONTEXT]);
+ return -ERANGE;
+ }
+
+ if (!ops->rxfh_per_ctx_key) {
+ bad_attr = bad_attr ?: tb[ETHTOOL_A_RSS_HFUNC];
+ bad_attr = bad_attr ?: tb[ETHTOOL_A_RSS_HKEY];
+ bad_attr = bad_attr ?: tb[ETHTOOL_A_RSS_INPUT_XFRM];
+ }
+
+ input_xfrm = nla_get_u32_default(tb[ETHTOOL_A_RSS_INPUT_XFRM], 0);
+ if (input_xfrm & ~ops->supported_input_xfrm)
+ bad_attr = bad_attr ?: tb[ETHTOOL_A_RSS_INPUT_XFRM];
+
+ if (bad_attr) {
+ NL_SET_BAD_ATTR(info->extack, bad_attr);
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
+}
+
+static void
+ethnl_rss_create_send_ntf(struct sk_buff *rsp, struct net_device *dev)
+{
+ struct nlmsghdr *nlh = (void *)rsp->data;
+ struct genlmsghdr *genl_hdr;
+
+ /* Convert the reply into a notification */
+ nlh->nlmsg_pid = 0;
+ nlh->nlmsg_seq = ethnl_bcast_seq_next();
+
+ genl_hdr = nlmsg_data(nlh);
+ genl_hdr->cmd = ETHTOOL_MSG_RSS_CREATE_NTF;
+
+ ethnl_multicast(rsp, dev);
+}
+
+int ethnl_rss_create_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ bool indir_dflt = false, mod = false, ntf_fail = false;
+ struct ethtool_rxfh_param rxfh = {};
+ struct ethtool_rxfh_context *ctx;
+ struct nlattr **tb = info->attrs;
+ struct rss_reply_data data = {};
+ const struct ethtool_ops *ops;
+ struct rss_req_info req = {};
+ struct net_device *dev;
+ struct sk_buff *rsp;
+ void *hdr;
+ u32 limit;
+ int ret;
+
+ rsp = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!rsp)
+ return -ENOMEM;
+
+ ret = ethnl_parse_header_dev_get(&req.base, tb[ETHTOOL_A_RSS_HEADER],
+ genl_info_net(info), info->extack,
+ true);
+ if (ret < 0)
+ goto exit_free_rsp;
+
+ dev = req.base.dev;
+ ops = dev->ethtool_ops;
+
+ req.rss_context = nla_get_u32_default(tb[ETHTOOL_A_RSS_CONTEXT], 0);
+
+ ret = ethnl_rss_create_validate(dev, info);
+ if (ret)
+ goto exit_free_dev;
+
+ rtnl_lock();
+ netdev_lock_ops(dev);
+
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ goto exit_dev_unlock;
+
+ ret = rss_get_data_alloc(dev, &data);
+ if (ret)
+ goto exit_ops;
+
+ ret = rss_set_prep_indir(dev, info, &data, &rxfh, &indir_dflt, &mod);
+ if (ret)
+ goto exit_clean_data;
+
+ ethnl_update_u8(&rxfh.hfunc, tb[ETHTOOL_A_RSS_HFUNC], &mod);
+
+ ret = rss_set_prep_hkey(dev, info, &data, &rxfh, &mod);
+ if (ret)
+ goto exit_free_indir;
+
+ rxfh.input_xfrm = RXH_XFRM_NO_CHANGE;
+ ethnl_update_u8(&rxfh.input_xfrm, tb[ETHTOOL_A_RSS_INPUT_XFRM], &mod);
+
+ ctx = ethtool_rxfh_ctx_alloc(ops, data.indir_size, data.hkey_size);
+ if (!ctx) {
+ ret = -ENOMEM;
+ goto exit_free_hkey;
+ }
+
+ mutex_lock(&dev->ethtool->rss_lock);
+ if (!req.rss_context) {
+ limit = ops->rxfh_max_num_contexts ?: U32_MAX;
+ ret = xa_alloc(&dev->ethtool->rss_ctx, &req.rss_context, ctx,
+ XA_LIMIT(1, limit - 1), GFP_KERNEL_ACCOUNT);
+ } else {
+ ret = xa_insert(&dev->ethtool->rss_ctx,
+ req.rss_context, ctx, GFP_KERNEL_ACCOUNT);
+ }
+ if (ret < 0) {
+ NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_RSS_CONTEXT],
+ "error allocating context ID");
+ goto err_unlock_free_ctx;
+ }
+ rxfh.rss_context = req.rss_context;
+
+ ret = ops->create_rxfh_context(dev, ctx, &rxfh, info->extack);
+ if (ret)
+ goto err_ctx_id_free;
+
+ /* Make sure driver populates defaults */
+ WARN_ON_ONCE(!rxfh.key && ops->rxfh_per_ctx_key &&
+ !memchr_inv(ethtool_rxfh_context_key(ctx), 0,
+ ctx->key_size));
+
+ /* Store the config from rxfh to Xarray.. */
+ rss_set_ctx_update(ctx, tb, &data, &rxfh);
+ /* .. copy from Xarray to data. */
+ __rss_prepare_ctx(dev, &data, ctx);
+
+ hdr = ethnl_unicast_put(rsp, info->snd_portid, info->snd_seq,
+ ETHTOOL_MSG_RSS_CREATE_ACT_REPLY);
+ ntf_fail = ethnl_fill_reply_header(rsp, dev, ETHTOOL_A_RSS_HEADER);
+ ntf_fail |= rss_fill_reply(rsp, &req.base, &data.base);
+ if (WARN_ON(!hdr || ntf_fail)) {
+ ret = -EMSGSIZE;
+ goto exit_unlock;
+ }
+
+ genlmsg_end(rsp, hdr);
+
+ /* Use the same skb for the response and the notification,
+ * genlmsg_reply() will copy the skb if it has elevated user count.
+ */
+ skb_get(rsp);
+ ret = genlmsg_reply(rsp, info);
+ ethnl_rss_create_send_ntf(rsp, dev);
+ rsp = NULL;
+
+exit_unlock:
+ mutex_unlock(&dev->ethtool->rss_lock);
+exit_free_hkey:
+ kfree(rxfh.key);
+exit_free_indir:
+ kfree(rxfh.indir);
+exit_clean_data:
+ rss_get_data_free(&data);
+exit_ops:
+ ethnl_ops_complete(dev);
+exit_dev_unlock:
+ netdev_unlock_ops(dev);
+ rtnl_unlock();
+exit_free_dev:
+ ethnl_parse_header_dev_put(&req.base);
+exit_free_rsp:
+ nlmsg_free(rsp);
+ return ret;
+
+err_ctx_id_free:
+ xa_erase(&dev->ethtool->rss_ctx, req.rss_context);
+err_unlock_free_ctx:
+ kfree(ctx);
+ goto exit_unlock;
+}
+
+/* RSS_DELETE */
+
+const struct nla_policy ethnl_rss_delete_policy[ETHTOOL_A_RSS_CONTEXT + 1] = {
+ [ETHTOOL_A_RSS_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy),
+ [ETHTOOL_A_RSS_CONTEXT] = NLA_POLICY_MIN(NLA_U32, 1),
+};
+
+int ethnl_rss_delete_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct ethtool_rxfh_context *ctx;
+ struct nlattr **tb = info->attrs;
+ struct ethnl_req_info req = {};
+ const struct ethtool_ops *ops;
+ struct net_device *dev;
+ u32 rss_context;
+ int ret;
+
+ if (GENL_REQ_ATTR_CHECK(info, ETHTOOL_A_RSS_CONTEXT))
+ return -EINVAL;
+ rss_context = nla_get_u32(tb[ETHTOOL_A_RSS_CONTEXT]);
+
+ ret = ethnl_parse_header_dev_get(&req, tb[ETHTOOL_A_RSS_HEADER],
+ genl_info_net(info), info->extack,
+ true);
+ if (ret < 0)
+ return ret;
+
+ dev = req.dev;
+ ops = dev->ethtool_ops;
+
+ if (!ops->create_rxfh_context)
+ goto exit_free_dev;
+
+ rtnl_lock();
+ netdev_lock_ops(dev);
+
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ goto exit_dev_unlock;
+
+ mutex_lock(&dev->ethtool->rss_lock);
+ ret = ethtool_check_rss_ctx_busy(dev, rss_context);
+ if (ret)
+ goto exit_unlock;
+
+ ctx = xa_load(&dev->ethtool->rss_ctx, rss_context);
+ if (!ctx) {
+ ret = -ENOENT;
+ goto exit_unlock;
+ }
+
+ ret = ops->remove_rxfh_context(dev, ctx, rss_context, info->extack);
+ if (ret)
+ goto exit_unlock;
+
+ WARN_ON(xa_erase(&dev->ethtool->rss_ctx, rss_context) != ctx);
+ kfree(ctx);
+
+ ethnl_rss_delete_notify(dev, rss_context);
+
+exit_unlock:
+ mutex_unlock(&dev->ethtool->rss_lock);
+ ethnl_ops_complete(dev);
+exit_dev_unlock:
+ netdev_unlock_ops(dev);
+ rtnl_unlock();
+exit_free_dev:
+ ethnl_parse_header_dev_put(&req);
+ return ret;
+}
diff --git a/net/ethtool/stats.c b/net/ethtool/stats.c
new file mode 100644
index 000000000000..3ca8eb2a3b31
--- /dev/null
+++ b/net/ethtool/stats.c
@@ -0,0 +1,623 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/phy.h>
+#include <linux/phylib_stubs.h>
+
+#include "netlink.h"
+#include "common.h"
+#include "bitset.h"
+
+struct stats_req_info {
+ struct ethnl_req_info base;
+ DECLARE_BITMAP(stat_mask, __ETHTOOL_STATS_CNT);
+ enum ethtool_mac_stats_src src;
+};
+
+#define STATS_REQINFO(__req_base) \
+ container_of(__req_base, struct stats_req_info, base)
+
+struct stats_reply_data {
+ struct ethnl_reply_data base;
+ struct_group(stats,
+ struct ethtool_eth_phy_stats phy_stats;
+ struct ethtool_eth_mac_stats mac_stats;
+ struct ethtool_eth_ctrl_stats ctrl_stats;
+ struct ethtool_rmon_stats rmon_stats;
+ struct ethtool_phy_stats phydev_stats;
+ );
+ const struct ethtool_rmon_hist_range *rmon_ranges;
+};
+
+#define STATS_REPDATA(__reply_base) \
+ container_of(__reply_base, struct stats_reply_data, base)
+
+const char stats_std_names[__ETHTOOL_STATS_CNT][ETH_GSTRING_LEN] = {
+ [ETHTOOL_STATS_ETH_PHY] = "eth-phy",
+ [ETHTOOL_STATS_ETH_MAC] = "eth-mac",
+ [ETHTOOL_STATS_ETH_CTRL] = "eth-ctrl",
+ [ETHTOOL_STATS_RMON] = "rmon",
+ [ETHTOOL_STATS_PHY] = "phydev",
+};
+
+const char stats_eth_phy_names[__ETHTOOL_A_STATS_ETH_PHY_CNT][ETH_GSTRING_LEN] = {
+ [ETHTOOL_A_STATS_ETH_PHY_5_SYM_ERR] = "SymbolErrorDuringCarrier",
+};
+
+const char stats_eth_mac_names[__ETHTOOL_A_STATS_ETH_MAC_CNT][ETH_GSTRING_LEN] = {
+ [ETHTOOL_A_STATS_ETH_MAC_2_TX_PKT] = "FramesTransmittedOK",
+ [ETHTOOL_A_STATS_ETH_MAC_3_SINGLE_COL] = "SingleCollisionFrames",
+ [ETHTOOL_A_STATS_ETH_MAC_4_MULTI_COL] = "MultipleCollisionFrames",
+ [ETHTOOL_A_STATS_ETH_MAC_5_RX_PKT] = "FramesReceivedOK",
+ [ETHTOOL_A_STATS_ETH_MAC_6_FCS_ERR] = "FrameCheckSequenceErrors",
+ [ETHTOOL_A_STATS_ETH_MAC_7_ALIGN_ERR] = "AlignmentErrors",
+ [ETHTOOL_A_STATS_ETH_MAC_8_TX_BYTES] = "OctetsTransmittedOK",
+ [ETHTOOL_A_STATS_ETH_MAC_9_TX_DEFER] = "FramesWithDeferredXmissions",
+ [ETHTOOL_A_STATS_ETH_MAC_10_LATE_COL] = "LateCollisions",
+ [ETHTOOL_A_STATS_ETH_MAC_11_XS_COL] = "FramesAbortedDueToXSColls",
+ [ETHTOOL_A_STATS_ETH_MAC_12_TX_INT_ERR] = "FramesLostDueToIntMACXmitError",
+ [ETHTOOL_A_STATS_ETH_MAC_13_CS_ERR] = "CarrierSenseErrors",
+ [ETHTOOL_A_STATS_ETH_MAC_14_RX_BYTES] = "OctetsReceivedOK",
+ [ETHTOOL_A_STATS_ETH_MAC_15_RX_INT_ERR] = "FramesLostDueToIntMACRcvError",
+ [ETHTOOL_A_STATS_ETH_MAC_18_TX_MCAST] = "MulticastFramesXmittedOK",
+ [ETHTOOL_A_STATS_ETH_MAC_19_TX_BCAST] = "BroadcastFramesXmittedOK",
+ [ETHTOOL_A_STATS_ETH_MAC_20_XS_DEFER] = "FramesWithExcessiveDeferral",
+ [ETHTOOL_A_STATS_ETH_MAC_21_RX_MCAST] = "MulticastFramesReceivedOK",
+ [ETHTOOL_A_STATS_ETH_MAC_22_RX_BCAST] = "BroadcastFramesReceivedOK",
+ [ETHTOOL_A_STATS_ETH_MAC_23_IR_LEN_ERR] = "InRangeLengthErrors",
+ [ETHTOOL_A_STATS_ETH_MAC_24_OOR_LEN] = "OutOfRangeLengthField",
+ [ETHTOOL_A_STATS_ETH_MAC_25_TOO_LONG_ERR] = "FrameTooLongErrors",
+};
+
+const char stats_eth_ctrl_names[__ETHTOOL_A_STATS_ETH_CTRL_CNT][ETH_GSTRING_LEN] = {
+ [ETHTOOL_A_STATS_ETH_CTRL_3_TX] = "MACControlFramesTransmitted",
+ [ETHTOOL_A_STATS_ETH_CTRL_4_RX] = "MACControlFramesReceived",
+ [ETHTOOL_A_STATS_ETH_CTRL_5_RX_UNSUP] = "UnsupportedOpcodesReceived",
+};
+
+const char stats_rmon_names[__ETHTOOL_A_STATS_RMON_CNT][ETH_GSTRING_LEN] = {
+ [ETHTOOL_A_STATS_RMON_UNDERSIZE] = "etherStatsUndersizePkts",
+ [ETHTOOL_A_STATS_RMON_OVERSIZE] = "etherStatsOversizePkts",
+ [ETHTOOL_A_STATS_RMON_FRAG] = "etherStatsFragments",
+ [ETHTOOL_A_STATS_RMON_JABBER] = "etherStatsJabbers",
+};
+
+const char stats_phy_names[__ETHTOOL_A_STATS_PHY_CNT][ETH_GSTRING_LEN] = {
+ [ETHTOOL_A_STATS_PHY_RX_PKTS] = "RxFrames",
+ [ETHTOOL_A_STATS_PHY_RX_BYTES] = "RxOctets",
+ [ETHTOOL_A_STATS_PHY_RX_ERRORS] = "RxErrors",
+ [ETHTOOL_A_STATS_PHY_TX_PKTS] = "TxFrames",
+ [ETHTOOL_A_STATS_PHY_TX_BYTES] = "TxOctets",
+ [ETHTOOL_A_STATS_PHY_TX_ERRORS] = "TxErrors",
+};
+
+const struct nla_policy ethnl_stats_get_policy[ETHTOOL_A_STATS_SRC + 1] = {
+ [ETHTOOL_A_STATS_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
+ [ETHTOOL_A_STATS_GROUPS] = { .type = NLA_NESTED },
+ [ETHTOOL_A_STATS_SRC] =
+ NLA_POLICY_MAX(NLA_U32, ETHTOOL_MAC_STATS_SRC_PMAC),
+};
+
+static int stats_parse_request(struct ethnl_req_info *req_base,
+ struct nlattr **tb,
+ struct netlink_ext_ack *extack)
+{
+ enum ethtool_mac_stats_src src = ETHTOOL_MAC_STATS_SRC_AGGREGATE;
+ struct stats_req_info *req_info = STATS_REQINFO(req_base);
+ bool mod = false;
+ int err;
+
+ err = ethnl_update_bitset(req_info->stat_mask, __ETHTOOL_STATS_CNT,
+ tb[ETHTOOL_A_STATS_GROUPS], stats_std_names,
+ extack, &mod);
+ if (err)
+ return err;
+
+ if (!mod) {
+ NL_SET_ERR_MSG(extack, "no stats requested");
+ return -EINVAL;
+ }
+
+ if (tb[ETHTOOL_A_STATS_SRC])
+ src = nla_get_u32(tb[ETHTOOL_A_STATS_SRC]);
+
+ req_info->src = src;
+
+ return 0;
+}
+
+static int stats_prepare_data(const struct ethnl_req_info *req_base,
+ struct ethnl_reply_data *reply_base,
+ const struct genl_info *info)
+{
+ const struct stats_req_info *req_info = STATS_REQINFO(req_base);
+ struct stats_reply_data *data = STATS_REPDATA(reply_base);
+ enum ethtool_mac_stats_src src = req_info->src;
+ struct net_device *dev = reply_base->dev;
+ struct nlattr **tb = info->attrs;
+ struct phy_device *phydev;
+ int ret;
+
+ phydev = ethnl_req_get_phydev(req_base, tb, ETHTOOL_A_STATS_HEADER,
+ info->extack);
+ if (IS_ERR(phydev))
+ return PTR_ERR(phydev);
+
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ return ret;
+
+ if ((src == ETHTOOL_MAC_STATS_SRC_EMAC ||
+ src == ETHTOOL_MAC_STATS_SRC_PMAC) &&
+ !__ethtool_dev_mm_supported(dev)) {
+ NL_SET_ERR_MSG_MOD(info->extack,
+ "Device does not support MAC merge layer");
+ ethnl_ops_complete(dev);
+ return -EOPNOTSUPP;
+ }
+
+ /* Mark all stats as unset (see ETHTOOL_STAT_NOT_SET) to prevent them
+ * from being reported to user space in case driver did not set them.
+ */
+ memset(&data->stats, 0xff, sizeof(data->stats));
+
+ data->phy_stats.src = src;
+ data->mac_stats.src = src;
+ data->ctrl_stats.src = src;
+ data->rmon_stats.src = src;
+
+ if ((test_bit(ETHTOOL_STATS_PHY, req_info->stat_mask) ||
+ test_bit(ETHTOOL_STATS_ETH_PHY, req_info->stat_mask)) &&
+ src == ETHTOOL_MAC_STATS_SRC_AGGREGATE) {
+ if (phydev)
+ phy_ethtool_get_phy_stats(phydev, &data->phy_stats,
+ &data->phydev_stats);
+ }
+
+ if (test_bit(ETHTOOL_STATS_ETH_PHY, req_info->stat_mask) &&
+ dev->ethtool_ops->get_eth_phy_stats)
+ dev->ethtool_ops->get_eth_phy_stats(dev, &data->phy_stats);
+ if (test_bit(ETHTOOL_STATS_ETH_MAC, req_info->stat_mask) &&
+ dev->ethtool_ops->get_eth_mac_stats)
+ dev->ethtool_ops->get_eth_mac_stats(dev, &data->mac_stats);
+ if (test_bit(ETHTOOL_STATS_ETH_CTRL, req_info->stat_mask) &&
+ dev->ethtool_ops->get_eth_ctrl_stats)
+ dev->ethtool_ops->get_eth_ctrl_stats(dev, &data->ctrl_stats);
+ if (test_bit(ETHTOOL_STATS_RMON, req_info->stat_mask) &&
+ dev->ethtool_ops->get_rmon_stats)
+ dev->ethtool_ops->get_rmon_stats(dev, &data->rmon_stats,
+ &data->rmon_ranges);
+
+ ethnl_ops_complete(dev);
+ return 0;
+}
+
+static int stats_reply_size(const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ const struct stats_req_info *req_info = STATS_REQINFO(req_base);
+ unsigned int n_grps = 0, n_stats = 0;
+ int len = 0;
+
+ len += nla_total_size(sizeof(u32)); /* _STATS_SRC */
+
+ if (test_bit(ETHTOOL_STATS_ETH_PHY, req_info->stat_mask)) {
+ n_stats += sizeof(struct ethtool_eth_phy_stats) / sizeof(u64);
+ n_grps++;
+ }
+ if (test_bit(ETHTOOL_STATS_ETH_MAC, req_info->stat_mask)) {
+ n_stats += sizeof(struct ethtool_eth_mac_stats) / sizeof(u64);
+ n_grps++;
+ }
+ if (test_bit(ETHTOOL_STATS_ETH_CTRL, req_info->stat_mask)) {
+ n_stats += sizeof(struct ethtool_eth_ctrl_stats) / sizeof(u64);
+ n_grps++;
+ }
+ if (test_bit(ETHTOOL_STATS_RMON, req_info->stat_mask)) {
+ n_stats += sizeof(struct ethtool_rmon_stats) / sizeof(u64);
+ n_grps++;
+ /* Above includes the space for _A_STATS_GRP_HIST_VALs */
+
+ len += (nla_total_size(0) + /* _A_STATS_GRP_HIST */
+ nla_total_size(4) + /* _A_STATS_GRP_HIST_BKT_LOW */
+ nla_total_size(4)) * /* _A_STATS_GRP_HIST_BKT_HI */
+ ETHTOOL_RMON_HIST_MAX * 2;
+ }
+ if (test_bit(ETHTOOL_STATS_PHY, req_info->stat_mask)) {
+ n_stats += sizeof(struct ethtool_phy_stats) / sizeof(u64);
+ n_grps++;
+ }
+
+ len += n_grps * (nla_total_size(0) + /* _A_STATS_GRP */
+ nla_total_size(4) + /* _A_STATS_GRP_ID */
+ nla_total_size(4)); /* _A_STATS_GRP_SS_ID */
+ len += n_stats * (nla_total_size(0) + /* _A_STATS_GRP_STAT */
+ nla_total_size_64bit(sizeof(u64)));
+
+ return len;
+}
+
+static int stat_put(struct sk_buff *skb, u16 attrtype, u64 val)
+{
+ struct nlattr *nest;
+ int ret;
+
+ if (val == ETHTOOL_STAT_NOT_SET)
+ return 0;
+
+ /* We want to start stats attr types from 0, so we don't have a type
+ * for pad inside ETHTOOL_A_STATS_GRP_STAT. Pad things on the outside
+ * of ETHTOOL_A_STATS_GRP_STAT. Since we're one nest away from the
+ * actual attr we're 4B off - nla_need_padding_for_64bit() & co.
+ * can't be used.
+ */
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+ if (!IS_ALIGNED((unsigned long)skb_tail_pointer(skb), 8))
+ if (!nla_reserve(skb, ETHTOOL_A_STATS_GRP_PAD, 0))
+ return -EMSGSIZE;
+#endif
+
+ nest = nla_nest_start(skb, ETHTOOL_A_STATS_GRP_STAT);
+ if (!nest)
+ return -EMSGSIZE;
+
+ ret = nla_put_u64_64bit(skb, attrtype, val, -1 /* not used */);
+ if (ret) {
+ nla_nest_cancel(skb, nest);
+ return ret;
+ }
+
+ nla_nest_end(skb, nest);
+ return 0;
+}
+
+static int stats_put_phy_stats(struct sk_buff *skb,
+ const struct stats_reply_data *data)
+{
+ if (stat_put(skb, ETHTOOL_A_STATS_ETH_PHY_5_SYM_ERR,
+ data->phy_stats.SymbolErrorDuringCarrier))
+ return -EMSGSIZE;
+ return 0;
+}
+
+static int stats_put_phydev_stats(struct sk_buff *skb,
+ const struct stats_reply_data *data)
+{
+ if (stat_put(skb, ETHTOOL_A_STATS_PHY_RX_PKTS,
+ data->phydev_stats.rx_packets) ||
+ stat_put(skb, ETHTOOL_A_STATS_PHY_RX_BYTES,
+ data->phydev_stats.rx_bytes) ||
+ stat_put(skb, ETHTOOL_A_STATS_PHY_RX_ERRORS,
+ data->phydev_stats.rx_errors) ||
+ stat_put(skb, ETHTOOL_A_STATS_PHY_TX_PKTS,
+ data->phydev_stats.tx_packets) ||
+ stat_put(skb, ETHTOOL_A_STATS_PHY_TX_BYTES,
+ data->phydev_stats.tx_bytes) ||
+ stat_put(skb, ETHTOOL_A_STATS_PHY_TX_ERRORS,
+ data->phydev_stats.tx_errors))
+ return -EMSGSIZE;
+ return 0;
+}
+
+static int stats_put_mac_stats(struct sk_buff *skb,
+ const struct stats_reply_data *data)
+{
+ if (stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_2_TX_PKT,
+ data->mac_stats.FramesTransmittedOK) ||
+ stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_3_SINGLE_COL,
+ data->mac_stats.SingleCollisionFrames) ||
+ stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_4_MULTI_COL,
+ data->mac_stats.MultipleCollisionFrames) ||
+ stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_5_RX_PKT,
+ data->mac_stats.FramesReceivedOK) ||
+ stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_6_FCS_ERR,
+ data->mac_stats.FrameCheckSequenceErrors) ||
+ stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_7_ALIGN_ERR,
+ data->mac_stats.AlignmentErrors) ||
+ stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_8_TX_BYTES,
+ data->mac_stats.OctetsTransmittedOK) ||
+ stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_9_TX_DEFER,
+ data->mac_stats.FramesWithDeferredXmissions) ||
+ stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_10_LATE_COL,
+ data->mac_stats.LateCollisions) ||
+ stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_11_XS_COL,
+ data->mac_stats.FramesAbortedDueToXSColls) ||
+ stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_12_TX_INT_ERR,
+ data->mac_stats.FramesLostDueToIntMACXmitError) ||
+ stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_13_CS_ERR,
+ data->mac_stats.CarrierSenseErrors) ||
+ stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_14_RX_BYTES,
+ data->mac_stats.OctetsReceivedOK) ||
+ stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_15_RX_INT_ERR,
+ data->mac_stats.FramesLostDueToIntMACRcvError) ||
+ stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_18_TX_MCAST,
+ data->mac_stats.MulticastFramesXmittedOK) ||
+ stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_19_TX_BCAST,
+ data->mac_stats.BroadcastFramesXmittedOK) ||
+ stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_20_XS_DEFER,
+ data->mac_stats.FramesWithExcessiveDeferral) ||
+ stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_21_RX_MCAST,
+ data->mac_stats.MulticastFramesReceivedOK) ||
+ stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_22_RX_BCAST,
+ data->mac_stats.BroadcastFramesReceivedOK) ||
+ stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_23_IR_LEN_ERR,
+ data->mac_stats.InRangeLengthErrors) ||
+ stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_24_OOR_LEN,
+ data->mac_stats.OutOfRangeLengthField) ||
+ stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_25_TOO_LONG_ERR,
+ data->mac_stats.FrameTooLongErrors))
+ return -EMSGSIZE;
+ return 0;
+}
+
+static int stats_put_ctrl_stats(struct sk_buff *skb,
+ const struct stats_reply_data *data)
+{
+ if (stat_put(skb, ETHTOOL_A_STATS_ETH_CTRL_3_TX,
+ data->ctrl_stats.MACControlFramesTransmitted) ||
+ stat_put(skb, ETHTOOL_A_STATS_ETH_CTRL_4_RX,
+ data->ctrl_stats.MACControlFramesReceived) ||
+ stat_put(skb, ETHTOOL_A_STATS_ETH_CTRL_5_RX_UNSUP,
+ data->ctrl_stats.UnsupportedOpcodesReceived))
+ return -EMSGSIZE;
+ return 0;
+}
+
+static int stats_put_rmon_hist(struct sk_buff *skb, u32 attr, const u64 *hist,
+ const struct ethtool_rmon_hist_range *ranges)
+{
+ struct nlattr *nest;
+ int i;
+
+ if (!ranges)
+ return 0;
+
+ for (i = 0; i < ETHTOOL_RMON_HIST_MAX; i++) {
+ if (!ranges[i].low && !ranges[i].high)
+ break;
+ if (hist[i] == ETHTOOL_STAT_NOT_SET)
+ continue;
+
+ nest = nla_nest_start(skb, attr);
+ if (!nest)
+ return -EMSGSIZE;
+
+ if (nla_put_u32(skb, ETHTOOL_A_STATS_GRP_HIST_BKT_LOW,
+ ranges[i].low) ||
+ nla_put_u32(skb, ETHTOOL_A_STATS_GRP_HIST_BKT_HI,
+ ranges[i].high) ||
+ nla_put_u64_64bit(skb, ETHTOOL_A_STATS_GRP_HIST_VAL,
+ hist[i], ETHTOOL_A_STATS_GRP_PAD))
+ goto err_cancel_hist;
+
+ nla_nest_end(skb, nest);
+ }
+
+ return 0;
+
+err_cancel_hist:
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+}
+
+static int stats_put_rmon_stats(struct sk_buff *skb,
+ const struct stats_reply_data *data)
+{
+ if (stats_put_rmon_hist(skb, ETHTOOL_A_STATS_GRP_HIST_RX,
+ data->rmon_stats.hist, data->rmon_ranges) ||
+ stats_put_rmon_hist(skb, ETHTOOL_A_STATS_GRP_HIST_TX,
+ data->rmon_stats.hist_tx, data->rmon_ranges))
+ return -EMSGSIZE;
+
+ if (stat_put(skb, ETHTOOL_A_STATS_RMON_UNDERSIZE,
+ data->rmon_stats.undersize_pkts) ||
+ stat_put(skb, ETHTOOL_A_STATS_RMON_OVERSIZE,
+ data->rmon_stats.oversize_pkts) ||
+ stat_put(skb, ETHTOOL_A_STATS_RMON_FRAG,
+ data->rmon_stats.fragments) ||
+ stat_put(skb, ETHTOOL_A_STATS_RMON_JABBER,
+ data->rmon_stats.jabbers))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static int stats_put_stats(struct sk_buff *skb,
+ const struct stats_reply_data *data,
+ u32 id, u32 ss_id,
+ int (*cb)(struct sk_buff *skb,
+ const struct stats_reply_data *data))
+{
+ struct nlattr *nest;
+
+ nest = nla_nest_start(skb, ETHTOOL_A_STATS_GRP);
+ if (!nest)
+ return -EMSGSIZE;
+
+ if (nla_put_u32(skb, ETHTOOL_A_STATS_GRP_ID, id) ||
+ nla_put_u32(skb, ETHTOOL_A_STATS_GRP_SS_ID, ss_id))
+ goto err_cancel;
+
+ if (cb(skb, data))
+ goto err_cancel;
+
+ nla_nest_end(skb, nest);
+ return 0;
+
+err_cancel:
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+}
+
+static int stats_fill_reply(struct sk_buff *skb,
+ const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ const struct stats_req_info *req_info = STATS_REQINFO(req_base);
+ const struct stats_reply_data *data = STATS_REPDATA(reply_base);
+ int ret = 0;
+
+ if (nla_put_u32(skb, ETHTOOL_A_STATS_SRC, req_info->src))
+ return -EMSGSIZE;
+
+ if (!ret && test_bit(ETHTOOL_STATS_ETH_PHY, req_info->stat_mask))
+ ret = stats_put_stats(skb, data, ETHTOOL_STATS_ETH_PHY,
+ ETH_SS_STATS_ETH_PHY,
+ stats_put_phy_stats);
+ if (!ret && test_bit(ETHTOOL_STATS_ETH_MAC, req_info->stat_mask))
+ ret = stats_put_stats(skb, data, ETHTOOL_STATS_ETH_MAC,
+ ETH_SS_STATS_ETH_MAC,
+ stats_put_mac_stats);
+ if (!ret && test_bit(ETHTOOL_STATS_ETH_CTRL, req_info->stat_mask))
+ ret = stats_put_stats(skb, data, ETHTOOL_STATS_ETH_CTRL,
+ ETH_SS_STATS_ETH_CTRL,
+ stats_put_ctrl_stats);
+ if (!ret && test_bit(ETHTOOL_STATS_RMON, req_info->stat_mask))
+ ret = stats_put_stats(skb, data, ETHTOOL_STATS_RMON,
+ ETH_SS_STATS_RMON, stats_put_rmon_stats);
+ if (!ret && test_bit(ETHTOOL_STATS_PHY, req_info->stat_mask))
+ ret = stats_put_stats(skb, data, ETHTOOL_STATS_PHY,
+ ETH_SS_STATS_PHY, stats_put_phydev_stats);
+
+ return ret;
+}
+
+const struct ethnl_request_ops ethnl_stats_request_ops = {
+ .request_cmd = ETHTOOL_MSG_STATS_GET,
+ .reply_cmd = ETHTOOL_MSG_STATS_GET_REPLY,
+ .hdr_attr = ETHTOOL_A_STATS_HEADER,
+ .req_info_size = sizeof(struct stats_req_info),
+ .reply_data_size = sizeof(struct stats_reply_data),
+
+ .parse_request = stats_parse_request,
+ .prepare_data = stats_prepare_data,
+ .reply_size = stats_reply_size,
+ .fill_reply = stats_fill_reply,
+};
+
+static u64 ethtool_stats_sum(u64 a, u64 b)
+{
+ if (a == ETHTOOL_STAT_NOT_SET)
+ return b;
+ if (b == ETHTOOL_STAT_NOT_SET)
+ return a;
+ return a + b;
+}
+
+/* Avoid modifying the aggregation procedure every time a new counter is added
+ * by treating the structures as an array of u64 statistics.
+ */
+static void ethtool_aggregate_stats(void *aggr_stats, const void *emac_stats,
+ const void *pmac_stats, size_t stats_size,
+ size_t stats_offset)
+{
+ size_t num_stats = stats_size / sizeof(u64);
+ const u64 *s1 = emac_stats + stats_offset;
+ const u64 *s2 = pmac_stats + stats_offset;
+ u64 *s = aggr_stats + stats_offset;
+ int i;
+
+ for (i = 0; i < num_stats; i++)
+ s[i] = ethtool_stats_sum(s1[i], s2[i]);
+}
+
+void ethtool_aggregate_mac_stats(struct net_device *dev,
+ struct ethtool_eth_mac_stats *mac_stats)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct ethtool_eth_mac_stats pmac, emac;
+
+ memset(&emac, 0xff, sizeof(emac));
+ memset(&pmac, 0xff, sizeof(pmac));
+ emac.src = ETHTOOL_MAC_STATS_SRC_EMAC;
+ pmac.src = ETHTOOL_MAC_STATS_SRC_PMAC;
+
+ ops->get_eth_mac_stats(dev, &emac);
+ ops->get_eth_mac_stats(dev, &pmac);
+
+ ethtool_aggregate_stats(mac_stats, &emac, &pmac,
+ sizeof(mac_stats->stats),
+ offsetof(struct ethtool_eth_mac_stats, stats));
+}
+EXPORT_SYMBOL(ethtool_aggregate_mac_stats);
+
+void ethtool_aggregate_phy_stats(struct net_device *dev,
+ struct ethtool_eth_phy_stats *phy_stats)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct ethtool_eth_phy_stats pmac, emac;
+
+ memset(&emac, 0xff, sizeof(emac));
+ memset(&pmac, 0xff, sizeof(pmac));
+ emac.src = ETHTOOL_MAC_STATS_SRC_EMAC;
+ pmac.src = ETHTOOL_MAC_STATS_SRC_PMAC;
+
+ ops->get_eth_phy_stats(dev, &emac);
+ ops->get_eth_phy_stats(dev, &pmac);
+
+ ethtool_aggregate_stats(phy_stats, &emac, &pmac,
+ sizeof(phy_stats->stats),
+ offsetof(struct ethtool_eth_phy_stats, stats));
+}
+EXPORT_SYMBOL(ethtool_aggregate_phy_stats);
+
+void ethtool_aggregate_ctrl_stats(struct net_device *dev,
+ struct ethtool_eth_ctrl_stats *ctrl_stats)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct ethtool_eth_ctrl_stats pmac, emac;
+
+ memset(&emac, 0xff, sizeof(emac));
+ memset(&pmac, 0xff, sizeof(pmac));
+ emac.src = ETHTOOL_MAC_STATS_SRC_EMAC;
+ pmac.src = ETHTOOL_MAC_STATS_SRC_PMAC;
+
+ ops->get_eth_ctrl_stats(dev, &emac);
+ ops->get_eth_ctrl_stats(dev, &pmac);
+
+ ethtool_aggregate_stats(ctrl_stats, &emac, &pmac,
+ sizeof(ctrl_stats->stats),
+ offsetof(struct ethtool_eth_ctrl_stats, stats));
+}
+EXPORT_SYMBOL(ethtool_aggregate_ctrl_stats);
+
+void ethtool_aggregate_pause_stats(struct net_device *dev,
+ struct ethtool_pause_stats *pause_stats)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct ethtool_pause_stats pmac, emac;
+
+ memset(&emac, 0xff, sizeof(emac));
+ memset(&pmac, 0xff, sizeof(pmac));
+ emac.src = ETHTOOL_MAC_STATS_SRC_EMAC;
+ pmac.src = ETHTOOL_MAC_STATS_SRC_PMAC;
+
+ ops->get_pause_stats(dev, &emac);
+ ops->get_pause_stats(dev, &pmac);
+
+ ethtool_aggregate_stats(pause_stats, &emac, &pmac,
+ sizeof(pause_stats->stats),
+ offsetof(struct ethtool_pause_stats, stats));
+}
+EXPORT_SYMBOL(ethtool_aggregate_pause_stats);
+
+void ethtool_aggregate_rmon_stats(struct net_device *dev,
+ struct ethtool_rmon_stats *rmon_stats)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ const struct ethtool_rmon_hist_range *dummy;
+ struct ethtool_rmon_stats pmac, emac;
+
+ memset(&emac, 0xff, sizeof(emac));
+ memset(&pmac, 0xff, sizeof(pmac));
+ emac.src = ETHTOOL_MAC_STATS_SRC_EMAC;
+ pmac.src = ETHTOOL_MAC_STATS_SRC_PMAC;
+
+ ops->get_rmon_stats(dev, &emac, &dummy);
+ ops->get_rmon_stats(dev, &pmac, &dummy);
+
+ ethtool_aggregate_stats(rmon_stats, &emac, &pmac,
+ sizeof(rmon_stats->stats),
+ offsetof(struct ethtool_rmon_stats, stats));
+}
+EXPORT_SYMBOL(ethtool_aggregate_rmon_stats);
diff --git a/net/ethtool/strset.c b/net/ethtool/strset.c
new file mode 100644
index 000000000000..f6a67109beda
--- /dev/null
+++ b/net/ethtool/strset.c
@@ -0,0 +1,499 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/ethtool.h>
+#include <linux/phy.h>
+#include "netlink.h"
+#include "common.h"
+
+struct strset_info {
+ bool per_dev;
+ bool free_strings;
+ unsigned int count;
+ const char (*strings)[ETH_GSTRING_LEN];
+};
+
+static const struct strset_info info_template[] = {
+ [ETH_SS_TEST] = {
+ .per_dev = true,
+ },
+ [ETH_SS_STATS] = {
+ .per_dev = true,
+ },
+ [ETH_SS_PRIV_FLAGS] = {
+ .per_dev = true,
+ },
+ [ETH_SS_FEATURES] = {
+ .per_dev = false,
+ .count = ARRAY_SIZE(netdev_features_strings),
+ .strings = netdev_features_strings,
+ },
+ [ETH_SS_RSS_HASH_FUNCS] = {
+ .per_dev = false,
+ .count = ARRAY_SIZE(rss_hash_func_strings),
+ .strings = rss_hash_func_strings,
+ },
+ [ETH_SS_TUNABLES] = {
+ .per_dev = false,
+ .count = ARRAY_SIZE(tunable_strings),
+ .strings = tunable_strings,
+ },
+ [ETH_SS_PHY_STATS] = {
+ .per_dev = true,
+ },
+ [ETH_SS_PHY_TUNABLES] = {
+ .per_dev = false,
+ .count = ARRAY_SIZE(phy_tunable_strings),
+ .strings = phy_tunable_strings,
+ },
+ [ETH_SS_LINK_MODES] = {
+ .per_dev = false,
+ .count = __ETHTOOL_LINK_MODE_MASK_NBITS,
+ .strings = link_mode_names,
+ },
+ [ETH_SS_MSG_CLASSES] = {
+ .per_dev = false,
+ .count = NETIF_MSG_CLASS_COUNT,
+ .strings = netif_msg_class_names,
+ },
+ [ETH_SS_WOL_MODES] = {
+ .per_dev = false,
+ .count = WOL_MODE_COUNT,
+ .strings = wol_mode_names,
+ },
+ [ETH_SS_SOF_TIMESTAMPING] = {
+ .per_dev = false,
+ .count = __SOF_TIMESTAMPING_CNT,
+ .strings = sof_timestamping_names,
+ },
+ [ETH_SS_TS_TX_TYPES] = {
+ .per_dev = false,
+ .count = __HWTSTAMP_TX_CNT,
+ .strings = ts_tx_type_names,
+ },
+ [ETH_SS_TS_RX_FILTERS] = {
+ .per_dev = false,
+ .count = __HWTSTAMP_FILTER_CNT,
+ .strings = ts_rx_filter_names,
+ },
+ [ETH_SS_TS_FLAGS] = {
+ .per_dev = false,
+ .count = __HWTSTAMP_FLAG_CNT,
+ .strings = ts_flags_names,
+ },
+ [ETH_SS_UDP_TUNNEL_TYPES] = {
+ .per_dev = false,
+ .count = __ETHTOOL_UDP_TUNNEL_TYPE_CNT,
+ .strings = udp_tunnel_type_names,
+ },
+ [ETH_SS_STATS_STD] = {
+ .per_dev = false,
+ .count = __ETHTOOL_STATS_CNT,
+ .strings = stats_std_names,
+ },
+ [ETH_SS_STATS_ETH_PHY] = {
+ .per_dev = false,
+ .count = __ETHTOOL_A_STATS_ETH_PHY_CNT,
+ .strings = stats_eth_phy_names,
+ },
+ [ETH_SS_STATS_ETH_MAC] = {
+ .per_dev = false,
+ .count = __ETHTOOL_A_STATS_ETH_MAC_CNT,
+ .strings = stats_eth_mac_names,
+ },
+ [ETH_SS_STATS_ETH_CTRL] = {
+ .per_dev = false,
+ .count = __ETHTOOL_A_STATS_ETH_CTRL_CNT,
+ .strings = stats_eth_ctrl_names,
+ },
+ [ETH_SS_STATS_RMON] = {
+ .per_dev = false,
+ .count = __ETHTOOL_A_STATS_RMON_CNT,
+ .strings = stats_rmon_names,
+ },
+ [ETH_SS_STATS_PHY] = {
+ .per_dev = false,
+ .count = __ETHTOOL_A_STATS_PHY_CNT,
+ .strings = stats_phy_names,
+ },
+};
+
+struct strset_req_info {
+ struct ethnl_req_info base;
+ u32 req_ids;
+ bool counts_only;
+};
+
+#define STRSET_REQINFO(__req_base) \
+ container_of(__req_base, struct strset_req_info, base)
+
+struct strset_reply_data {
+ struct ethnl_reply_data base;
+ struct strset_info sets[ETH_SS_COUNT];
+};
+
+#define STRSET_REPDATA(__reply_base) \
+ container_of(__reply_base, struct strset_reply_data, base)
+
+const struct nla_policy ethnl_strset_get_policy[] = {
+ [ETHTOOL_A_STRSET_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy_phy),
+ [ETHTOOL_A_STRSET_STRINGSETS] = { .type = NLA_NESTED },
+ [ETHTOOL_A_STRSET_COUNTS_ONLY] = { .type = NLA_FLAG },
+};
+
+static const struct nla_policy get_stringset_policy[] = {
+ [ETHTOOL_A_STRINGSET_ID] = { .type = NLA_U32 },
+};
+
+/**
+ * strset_include() - test if a string set should be included in reply
+ * @info: parsed client request
+ * @data: pointer to request data structure
+ * @id: id of string set to check (ETH_SS_* constants)
+ */
+static bool strset_include(const struct strset_req_info *info,
+ const struct strset_reply_data *data, u32 id)
+{
+ bool per_dev;
+
+ BUILD_BUG_ON(ETH_SS_COUNT >= BITS_PER_BYTE * sizeof(info->req_ids));
+
+ if (info->req_ids)
+ return info->req_ids & (1U << id);
+ per_dev = data->sets[id].per_dev;
+ if (!per_dev && !data->sets[id].strings)
+ return false;
+
+ return data->base.dev ? per_dev : !per_dev;
+}
+
+static int strset_get_id(const struct nlattr *nest, u32 *val,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[ARRAY_SIZE(get_stringset_policy)];
+ int ret;
+
+ ret = nla_parse_nested(tb, ARRAY_SIZE(get_stringset_policy) - 1, nest,
+ get_stringset_policy, extack);
+ if (ret < 0)
+ return ret;
+ if (NL_REQ_ATTR_CHECK(extack, nest, tb, ETHTOOL_A_STRINGSET_ID))
+ return -EINVAL;
+
+ *val = nla_get_u32(tb[ETHTOOL_A_STRINGSET_ID]);
+ return 0;
+}
+
+static const struct nla_policy strset_stringsets_policy[] = {
+ [ETHTOOL_A_STRINGSETS_STRINGSET] = { .type = NLA_NESTED },
+};
+
+static int strset_parse_request(struct ethnl_req_info *req_base,
+ struct nlattr **tb,
+ struct netlink_ext_ack *extack)
+{
+ struct strset_req_info *req_info = STRSET_REQINFO(req_base);
+ struct nlattr *nest = tb[ETHTOOL_A_STRSET_STRINGSETS];
+ struct nlattr *attr;
+ int rem, ret;
+
+ if (!nest)
+ return 0;
+ ret = nla_validate_nested(nest,
+ ARRAY_SIZE(strset_stringsets_policy) - 1,
+ strset_stringsets_policy, extack);
+ if (ret < 0)
+ return ret;
+
+ req_info->counts_only = tb[ETHTOOL_A_STRSET_COUNTS_ONLY];
+ nla_for_each_nested(attr, nest, rem) {
+ u32 id;
+
+ if (WARN_ONCE(nla_type(attr) != ETHTOOL_A_STRINGSETS_STRINGSET,
+ "unexpected attrtype %u in ETHTOOL_A_STRSET_STRINGSETS\n",
+ nla_type(attr)))
+ return -EINVAL;
+
+ ret = strset_get_id(attr, &id, extack);
+ if (ret < 0)
+ return ret;
+ if (id >= ETH_SS_COUNT) {
+ NL_SET_ERR_MSG_ATTR(extack, attr,
+ "unknown string set id");
+ return -EOPNOTSUPP;
+ }
+
+ req_info->req_ids |= (1U << id);
+ }
+
+ return 0;
+}
+
+static void strset_cleanup_data(struct ethnl_reply_data *reply_base)
+{
+ struct strset_reply_data *data = STRSET_REPDATA(reply_base);
+ unsigned int i;
+
+ for (i = 0; i < ETH_SS_COUNT; i++)
+ if (data->sets[i].free_strings) {
+ kfree(data->sets[i].strings);
+ data->sets[i].strings = NULL;
+ data->sets[i].free_strings = false;
+ }
+}
+
+static int strset_prepare_set(struct strset_info *info, struct net_device *dev,
+ struct phy_device *phydev, unsigned int id,
+ bool counts_only)
+{
+ const struct ethtool_phy_ops *phy_ops = ethtool_phy_ops;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ void *strings;
+ int count, ret;
+
+ if (id == ETH_SS_PHY_STATS && phydev &&
+ !ops->get_ethtool_phy_stats && phy_ops &&
+ phy_ops->get_sset_count)
+ ret = phy_ops->get_sset_count(phydev);
+ else if (ops->get_sset_count && ops->get_strings)
+ ret = ops->get_sset_count(dev, id);
+ else
+ ret = -EOPNOTSUPP;
+ if (ret <= 0) {
+ info->count = 0;
+ return 0;
+ }
+
+ count = ret;
+ if (!counts_only) {
+ strings = kcalloc(count, ETH_GSTRING_LEN, GFP_KERNEL);
+ if (!strings)
+ return -ENOMEM;
+ if (id == ETH_SS_PHY_STATS && phydev &&
+ !ops->get_ethtool_phy_stats && phy_ops &&
+ phy_ops->get_strings)
+ phy_ops->get_strings(phydev, strings);
+ else
+ ops->get_strings(dev, id, strings);
+ info->strings = strings;
+ info->free_strings = true;
+ }
+ info->count = count;
+
+ return 0;
+}
+
+static int strset_prepare_data(const struct ethnl_req_info *req_base,
+ struct ethnl_reply_data *reply_base,
+ const struct genl_info *info)
+{
+ const struct strset_req_info *req_info = STRSET_REQINFO(req_base);
+ struct strset_reply_data *data = STRSET_REPDATA(reply_base);
+ struct net_device *dev = reply_base->dev;
+ struct nlattr **tb = info->attrs;
+ struct phy_device *phydev;
+ unsigned int i;
+ int ret;
+
+ BUILD_BUG_ON(ARRAY_SIZE(info_template) != ETH_SS_COUNT);
+ memcpy(&data->sets, &info_template, sizeof(data->sets));
+
+ if (!dev) {
+ for (i = 0; i < ETH_SS_COUNT; i++) {
+ if ((req_info->req_ids & (1U << i)) &&
+ data->sets[i].per_dev) {
+ GENL_SET_ERR_MSG(info, "requested per device strings without dev");
+ return -EINVAL;
+ }
+ }
+ return 0;
+ }
+
+ phydev = ethnl_req_get_phydev(req_base, tb, ETHTOOL_A_HEADER_FLAGS,
+ info->extack);
+
+ /* phydev can be NULL, check for errors only */
+ if (IS_ERR(phydev))
+ return PTR_ERR(phydev);
+
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ goto err_strset;
+ for (i = 0; i < ETH_SS_COUNT; i++) {
+ if (!strset_include(req_info, data, i) ||
+ !data->sets[i].per_dev)
+ continue;
+
+ ret = strset_prepare_set(&data->sets[i], dev, phydev, i,
+ req_info->counts_only);
+ if (ret < 0)
+ goto err_ops;
+ }
+ ethnl_ops_complete(dev);
+
+ return 0;
+err_ops:
+ ethnl_ops_complete(dev);
+err_strset:
+ strset_cleanup_data(reply_base);
+ return ret;
+}
+
+/* calculate size of ETHTOOL_A_STRSET_STRINGSET nest for one string set */
+static int strset_set_size(const struct strset_info *info, bool counts_only)
+{
+ unsigned int len = 0;
+ unsigned int i;
+
+ if (info->count == 0)
+ return 0;
+ if (counts_only)
+ return nla_total_size(2 * nla_total_size(sizeof(u32)));
+
+ for (i = 0; i < info->count; i++) {
+ const char *str = info->strings[i];
+
+ /* ETHTOOL_A_STRING_INDEX, ETHTOOL_A_STRING_VALUE, nest */
+ len += nla_total_size(nla_total_size(sizeof(u32)) +
+ ethnl_strz_size(str));
+ }
+ /* ETHTOOL_A_STRINGSET_ID, ETHTOOL_A_STRINGSET_COUNT */
+ len = 2 * nla_total_size(sizeof(u32)) + nla_total_size(len);
+
+ return nla_total_size(len);
+}
+
+static int strset_reply_size(const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ const struct strset_req_info *req_info = STRSET_REQINFO(req_base);
+ const struct strset_reply_data *data = STRSET_REPDATA(reply_base);
+ unsigned int i;
+ int len = 0;
+ int ret;
+
+ len += nla_total_size(0); /* ETHTOOL_A_STRSET_STRINGSETS */
+
+ for (i = 0; i < ETH_SS_COUNT; i++) {
+ const struct strset_info *set_info = &data->sets[i];
+
+ if (!strset_include(req_info, data, i))
+ continue;
+
+ ret = strset_set_size(set_info, req_info->counts_only);
+ if (ret < 0)
+ return ret;
+ len += ret;
+ }
+
+ return len;
+}
+
+/* fill one string into reply */
+static int strset_fill_string(struct sk_buff *skb,
+ const struct strset_info *set_info, u32 idx)
+{
+ struct nlattr *string_attr;
+ const char *value;
+
+ value = set_info->strings[idx];
+
+ string_attr = nla_nest_start(skb, ETHTOOL_A_STRINGS_STRING);
+ if (!string_attr)
+ return -EMSGSIZE;
+ if (nla_put_u32(skb, ETHTOOL_A_STRING_INDEX, idx) ||
+ ethnl_put_strz(skb, ETHTOOL_A_STRING_VALUE, value))
+ goto nla_put_failure;
+ nla_nest_end(skb, string_attr);
+
+ return 0;
+nla_put_failure:
+ nla_nest_cancel(skb, string_attr);
+ return -EMSGSIZE;
+}
+
+/* fill one string set into reply */
+static int strset_fill_set(struct sk_buff *skb,
+ const struct strset_info *set_info, u32 id,
+ bool counts_only)
+{
+ struct nlattr *stringset_attr;
+ struct nlattr *strings_attr;
+ unsigned int i;
+
+ if (!set_info->per_dev && !set_info->strings)
+ return -EOPNOTSUPP;
+ if (set_info->count == 0)
+ return 0;
+ stringset_attr = nla_nest_start(skb, ETHTOOL_A_STRINGSETS_STRINGSET);
+ if (!stringset_attr)
+ return -EMSGSIZE;
+
+ if (nla_put_u32(skb, ETHTOOL_A_STRINGSET_ID, id) ||
+ nla_put_u32(skb, ETHTOOL_A_STRINGSET_COUNT, set_info->count))
+ goto nla_put_failure;
+
+ if (!counts_only) {
+ strings_attr = nla_nest_start(skb, ETHTOOL_A_STRINGSET_STRINGS);
+ if (!strings_attr)
+ goto nla_put_failure;
+ for (i = 0; i < set_info->count; i++) {
+ if (strset_fill_string(skb, set_info, i) < 0)
+ goto nla_put_failure;
+ }
+ nla_nest_end(skb, strings_attr);
+ }
+
+ nla_nest_end(skb, stringset_attr);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, stringset_attr);
+ return -EMSGSIZE;
+}
+
+static int strset_fill_reply(struct sk_buff *skb,
+ const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ const struct strset_req_info *req_info = STRSET_REQINFO(req_base);
+ const struct strset_reply_data *data = STRSET_REPDATA(reply_base);
+ struct nlattr *nest;
+ unsigned int i;
+ int ret;
+
+ nest = nla_nest_start(skb, ETHTOOL_A_STRSET_STRINGSETS);
+ if (!nest)
+ return -EMSGSIZE;
+
+ for (i = 0; i < ETH_SS_COUNT; i++) {
+ if (strset_include(req_info, data, i)) {
+ ret = strset_fill_set(skb, &data->sets[i], i,
+ req_info->counts_only);
+ if (ret < 0)
+ goto nla_put_failure;
+ }
+ }
+
+ nla_nest_end(skb, nest);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return ret;
+}
+
+const struct ethnl_request_ops ethnl_strset_request_ops = {
+ .request_cmd = ETHTOOL_MSG_STRSET_GET,
+ .reply_cmd = ETHTOOL_MSG_STRSET_GET_REPLY,
+ .hdr_attr = ETHTOOL_A_STRSET_HEADER,
+ .req_info_size = sizeof(struct strset_req_info),
+ .reply_data_size = sizeof(struct strset_reply_data),
+ .allow_nodev_do = true,
+
+ .parse_request = strset_parse_request,
+ .prepare_data = strset_prepare_data,
+ .reply_size = strset_reply_size,
+ .fill_reply = strset_fill_reply,
+ .cleanup_data = strset_cleanup_data,
+};
diff --git a/net/ethtool/ts.h b/net/ethtool/ts.h
new file mode 100644
index 000000000000..d901a879a671
--- /dev/null
+++ b/net/ethtool/ts.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef _NET_ETHTOOL_TS_H
+#define _NET_ETHTOOL_TS_H
+
+#include "netlink.h"
+
+static const struct nla_policy
+ethnl_ts_hwtst_prov_policy[ETHTOOL_A_TS_HWTSTAMP_PROVIDER_MAX + 1] = {
+ [ETHTOOL_A_TS_HWTSTAMP_PROVIDER_INDEX] = { .type = NLA_U32 },
+ [ETHTOOL_A_TS_HWTSTAMP_PROVIDER_QUALIFIER] =
+ NLA_POLICY_MAX(NLA_U32, HWTSTAMP_PROVIDER_QUALIFIER_CNT - 1)
+};
+
+int ts_parse_hwtst_provider(const struct nlattr *nest,
+ struct hwtstamp_provider_desc *hwprov_desc,
+ struct netlink_ext_ack *extack,
+ bool *mod);
+
+#endif /* _NET_ETHTOOL_TS_H */
diff --git a/net/ethtool/tsconfig.c b/net/ethtool/tsconfig.c
new file mode 100644
index 000000000000..169b413b31fc
--- /dev/null
+++ b/net/ethtool/tsconfig.c
@@ -0,0 +1,455 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/net_tstamp.h>
+#include <linux/ptp_clock_kernel.h>
+
+#include "netlink.h"
+#include "common.h"
+#include "bitset.h"
+#include "../core/dev.h"
+#include "ts.h"
+
+struct tsconfig_req_info {
+ struct ethnl_req_info base;
+};
+
+struct tsconfig_reply_data {
+ struct ethnl_reply_data base;
+ struct hwtstamp_provider_desc hwprov_desc;
+ struct {
+ u32 tx_type;
+ u32 rx_filter;
+ u32 flags;
+ } hwtst_config;
+};
+
+#define TSCONFIG_REPDATA(__reply_base) \
+ container_of(__reply_base, struct tsconfig_reply_data, base)
+
+const struct nla_policy ethnl_tsconfig_get_policy[ETHTOOL_A_TSCONFIG_HEADER + 1] = {
+ [ETHTOOL_A_TSCONFIG_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
+};
+
+static int tsconfig_prepare_data(const struct ethnl_req_info *req_base,
+ struct ethnl_reply_data *reply_base,
+ const struct genl_info *info)
+{
+ struct tsconfig_reply_data *data = TSCONFIG_REPDATA(reply_base);
+ struct hwtstamp_provider *hwprov = NULL;
+ struct net_device *dev = reply_base->dev;
+ struct kernel_hwtstamp_config cfg = {};
+ int ret;
+
+ if (!dev->netdev_ops->ndo_hwtstamp_get)
+ return -EOPNOTSUPP;
+
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ return ret;
+
+ ret = dev_get_hwtstamp_phylib(dev, &cfg);
+ if (ret)
+ goto out;
+
+ data->hwtst_config.tx_type = BIT(cfg.tx_type);
+ data->hwtst_config.rx_filter = BIT(cfg.rx_filter);
+ data->hwtst_config.flags = cfg.flags;
+
+ data->hwprov_desc.index = -1;
+ hwprov = rtnl_dereference(dev->hwprov);
+ if (hwprov) {
+ data->hwprov_desc.index = hwprov->desc.index;
+ data->hwprov_desc.qualifier = hwprov->desc.qualifier;
+ } else {
+ struct kernel_ethtool_ts_info ts_info = {};
+
+ ts_info.phc_index = -1;
+ ret = __ethtool_get_ts_info(dev, &ts_info);
+ if (ret)
+ goto out;
+
+ if (ts_info.phc_index == -1)
+ return -ENODEV;
+
+ data->hwprov_desc.index = ts_info.phc_index;
+ data->hwprov_desc.qualifier = ts_info.phc_qualifier;
+ }
+
+out:
+ ethnl_ops_complete(dev);
+ return ret;
+}
+
+static int tsconfig_reply_size(const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ const struct tsconfig_reply_data *data = TSCONFIG_REPDATA(reply_base);
+ bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;
+ int len = 0;
+ int ret;
+
+ BUILD_BUG_ON(__HWTSTAMP_TX_CNT > 32);
+ BUILD_BUG_ON(__HWTSTAMP_FILTER_CNT > 32);
+ BUILD_BUG_ON(__HWTSTAMP_FLAG_CNT > 32);
+
+ if (data->hwtst_config.flags) {
+ ret = ethnl_bitset32_size(&data->hwtst_config.flags,
+ NULL, __HWTSTAMP_FLAG_CNT,
+ ts_flags_names, compact);
+ if (ret < 0)
+ return ret;
+ len += ret; /* _TSCONFIG_HWTSTAMP_FLAGS */
+ }
+
+ if (data->hwtst_config.tx_type) {
+ ret = ethnl_bitset32_size(&data->hwtst_config.tx_type,
+ NULL, __HWTSTAMP_TX_CNT,
+ ts_tx_type_names, compact);
+ if (ret < 0)
+ return ret;
+ len += ret; /* _TSCONFIG_TX_TYPES */
+ }
+ if (data->hwtst_config.rx_filter) {
+ ret = ethnl_bitset32_size(&data->hwtst_config.rx_filter,
+ NULL, __HWTSTAMP_FILTER_CNT,
+ ts_rx_filter_names, compact);
+ if (ret < 0)
+ return ret;
+ len += ret; /* _TSCONFIG_RX_FILTERS */
+ }
+
+ if (data->hwprov_desc.index >= 0)
+ /* _TSCONFIG_HWTSTAMP_PROVIDER */
+ len += nla_total_size(0) +
+ 2 * nla_total_size(sizeof(u32));
+
+ return len;
+}
+
+static int tsconfig_fill_reply(struct sk_buff *skb,
+ const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ const struct tsconfig_reply_data *data = TSCONFIG_REPDATA(reply_base);
+ bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;
+ int ret;
+
+ if (data->hwtst_config.flags) {
+ ret = ethnl_put_bitset32(skb, ETHTOOL_A_TSCONFIG_HWTSTAMP_FLAGS,
+ &data->hwtst_config.flags, NULL,
+ __HWTSTAMP_FLAG_CNT,
+ ts_flags_names, compact);
+ if (ret < 0)
+ return ret;
+ }
+
+ if (data->hwtst_config.tx_type) {
+ ret = ethnl_put_bitset32(skb, ETHTOOL_A_TSCONFIG_TX_TYPES,
+ &data->hwtst_config.tx_type, NULL,
+ __HWTSTAMP_TX_CNT,
+ ts_tx_type_names, compact);
+ if (ret < 0)
+ return ret;
+ }
+
+ if (data->hwtst_config.rx_filter) {
+ ret = ethnl_put_bitset32(skb, ETHTOOL_A_TSCONFIG_RX_FILTERS,
+ &data->hwtst_config.rx_filter,
+ NULL, __HWTSTAMP_FILTER_CNT,
+ ts_rx_filter_names, compact);
+ if (ret < 0)
+ return ret;
+ }
+
+ if (data->hwprov_desc.index >= 0) {
+ struct nlattr *nest;
+
+ nest = nla_nest_start(skb, ETHTOOL_A_TSCONFIG_HWTSTAMP_PROVIDER);
+ if (!nest)
+ return -EMSGSIZE;
+
+ if (nla_put_u32(skb, ETHTOOL_A_TS_HWTSTAMP_PROVIDER_INDEX,
+ data->hwprov_desc.index) ||
+ nla_put_u32(skb,
+ ETHTOOL_A_TS_HWTSTAMP_PROVIDER_QUALIFIER,
+ data->hwprov_desc.qualifier)) {
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+ }
+
+ nla_nest_end(skb, nest);
+ }
+ return 0;
+}
+
+/* TSCONFIG_SET */
+const struct nla_policy ethnl_tsconfig_set_policy[ETHTOOL_A_TSCONFIG_MAX + 1] = {
+ [ETHTOOL_A_TSCONFIG_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy),
+ [ETHTOOL_A_TSCONFIG_HWTSTAMP_PROVIDER] =
+ NLA_POLICY_NESTED(ethnl_ts_hwtst_prov_policy),
+ [ETHTOOL_A_TSCONFIG_HWTSTAMP_FLAGS] = { .type = NLA_NESTED },
+ [ETHTOOL_A_TSCONFIG_RX_FILTERS] = { .type = NLA_NESTED },
+ [ETHTOOL_A_TSCONFIG_TX_TYPES] = { .type = NLA_NESTED },
+};
+
+static int tsconfig_send_reply(struct net_device *dev, struct genl_info *info)
+{
+ struct tsconfig_reply_data *reply_data;
+ struct tsconfig_req_info *req_info;
+ struct sk_buff *rskb;
+ void *reply_payload;
+ int reply_len = 0;
+ int ret;
+
+ req_info = kzalloc(sizeof(*req_info), GFP_KERNEL);
+ if (!req_info)
+ return -ENOMEM;
+ reply_data = kmalloc(sizeof(*reply_data), GFP_KERNEL);
+ if (!reply_data) {
+ kfree(req_info);
+ return -ENOMEM;
+ }
+
+ ASSERT_RTNL();
+ reply_data->base.dev = dev;
+ ret = tsconfig_prepare_data(&req_info->base, &reply_data->base, info);
+ if (ret < 0)
+ goto err_cleanup;
+
+ ret = tsconfig_reply_size(&req_info->base, &reply_data->base);
+ if (ret < 0)
+ goto err_cleanup;
+
+ reply_len = ret + ethnl_reply_header_size();
+ rskb = ethnl_reply_init(reply_len, dev, ETHTOOL_MSG_TSCONFIG_SET_REPLY,
+ ETHTOOL_A_TSCONFIG_HEADER, info, &reply_payload);
+ if (!rskb)
+ goto err_cleanup;
+
+ ret = tsconfig_fill_reply(rskb, &req_info->base, &reply_data->base);
+ if (ret < 0)
+ goto err_cleanup;
+
+ genlmsg_end(rskb, reply_payload);
+ ret = genlmsg_reply(rskb, info);
+
+err_cleanup:
+ kfree(reply_data);
+ kfree(req_info);
+ return ret;
+}
+
+static int ethnl_set_tsconfig_validate(struct ethnl_req_info *req_base,
+ struct genl_info *info)
+{
+ const struct net_device_ops *ops = req_base->dev->netdev_ops;
+
+ if (!ops->ndo_hwtstamp_set || !ops->ndo_hwtstamp_get)
+ return -EOPNOTSUPP;
+
+ return 1;
+}
+
+static struct hwtstamp_provider *
+tsconfig_set_hwprov_from_desc(struct net_device *dev,
+ struct genl_info *info,
+ struct hwtstamp_provider_desc *hwprov_desc)
+{
+ struct kernel_ethtool_ts_info ts_info;
+ struct hwtstamp_provider *hwprov;
+ struct nlattr **tb = info->attrs;
+ struct phy_device *phy = NULL;
+ enum hwtstamp_source source;
+ int ret;
+
+ ret = ethtool_net_get_ts_info_by_phc(dev, &ts_info, hwprov_desc);
+ if (!ret) {
+ /* Found */
+ source = HWTSTAMP_SOURCE_NETDEV;
+ } else {
+ phy = ethtool_phy_get_ts_info_by_phc(dev, &ts_info, hwprov_desc);
+ if (IS_ERR(phy)) {
+ if (PTR_ERR(phy) == -ENODEV)
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ tb[ETHTOOL_A_TSCONFIG_HWTSTAMP_PROVIDER],
+ "phc not in this net device topology");
+ return ERR_CAST(phy);
+ }
+
+ source = HWTSTAMP_SOURCE_PHYLIB;
+ }
+
+ hwprov = kzalloc(sizeof(*hwprov), GFP_KERNEL);
+ if (!hwprov)
+ return ERR_PTR(-ENOMEM);
+
+ hwprov->desc.index = hwprov_desc->index;
+ hwprov->desc.qualifier = hwprov_desc->qualifier;
+ hwprov->source = source;
+ hwprov->phydev = phy;
+
+ return hwprov;
+}
+
+static int ethnl_set_tsconfig(struct ethnl_req_info *req_base,
+ struct genl_info *info)
+{
+ struct kernel_hwtstamp_config hwtst_config = {0};
+ bool hwprov_mod = false, config_mod = false;
+ struct hwtstamp_provider *hwprov = NULL;
+ struct net_device *dev = req_base->dev;
+ struct nlattr **tb = info->attrs;
+ int ret;
+
+ BUILD_BUG_ON(__HWTSTAMP_TX_CNT >= 32);
+ BUILD_BUG_ON(__HWTSTAMP_FILTER_CNT >= 32);
+ BUILD_BUG_ON(__HWTSTAMP_FLAG_CNT > 32);
+
+ if (!netif_device_present(dev))
+ return -ENODEV;
+
+ if (tb[ETHTOOL_A_TSCONFIG_HWTSTAMP_PROVIDER]) {
+ struct hwtstamp_provider_desc __hwprov_desc = {.index = -1};
+ struct hwtstamp_provider *__hwprov;
+
+ __hwprov = rtnl_dereference(dev->hwprov);
+ if (__hwprov) {
+ __hwprov_desc.index = __hwprov->desc.index;
+ __hwprov_desc.qualifier = __hwprov->desc.qualifier;
+ }
+
+ ret = ts_parse_hwtst_provider(tb[ETHTOOL_A_TSCONFIG_HWTSTAMP_PROVIDER],
+ &__hwprov_desc, info->extack,
+ &hwprov_mod);
+ if (ret < 0)
+ return ret;
+
+ if (hwprov_mod) {
+ hwprov = tsconfig_set_hwprov_from_desc(dev, info,
+ &__hwprov_desc);
+ if (IS_ERR(hwprov))
+ return PTR_ERR(hwprov);
+ }
+ }
+
+ /* Get current hwtstamp config if we are not changing the
+ * hwtstamp source. It will be zeroed in the other case.
+ */
+ if (!hwprov_mod) {
+ ret = dev_get_hwtstamp_phylib(dev, &hwtst_config);
+ if (ret < 0 && ret != -EOPNOTSUPP)
+ goto err_free_hwprov;
+ }
+
+ /* Get the hwtstamp config from netlink */
+ if (tb[ETHTOOL_A_TSCONFIG_TX_TYPES]) {
+ u32 req_tx_type;
+
+ req_tx_type = BIT(hwtst_config.tx_type);
+ ret = ethnl_update_bitset32(&req_tx_type,
+ __HWTSTAMP_TX_CNT,
+ tb[ETHTOOL_A_TSCONFIG_TX_TYPES],
+ ts_tx_type_names, info->extack,
+ &config_mod);
+ if (ret < 0)
+ goto err_free_hwprov;
+
+ /* Select only one tx type at a time */
+ if (ffs(req_tx_type) != fls(req_tx_type)) {
+ ret = -EINVAL;
+ goto err_free_hwprov;
+ }
+
+ hwtst_config.tx_type = ffs(req_tx_type) - 1;
+ }
+
+ if (tb[ETHTOOL_A_TSCONFIG_RX_FILTERS]) {
+ u32 req_rx_filter;
+
+ req_rx_filter = BIT(hwtst_config.rx_filter);
+ ret = ethnl_update_bitset32(&req_rx_filter,
+ __HWTSTAMP_FILTER_CNT,
+ tb[ETHTOOL_A_TSCONFIG_RX_FILTERS],
+ ts_rx_filter_names, info->extack,
+ &config_mod);
+ if (ret < 0)
+ goto err_free_hwprov;
+
+ /* Select only one rx filter at a time */
+ if (ffs(req_rx_filter) != fls(req_rx_filter)) {
+ ret = -EINVAL;
+ goto err_free_hwprov;
+ }
+
+ hwtst_config.rx_filter = ffs(req_rx_filter) - 1;
+ }
+
+ if (tb[ETHTOOL_A_TSCONFIG_HWTSTAMP_FLAGS]) {
+ ret = ethnl_update_bitset32(&hwtst_config.flags,
+ __HWTSTAMP_FLAG_CNT,
+ tb[ETHTOOL_A_TSCONFIG_HWTSTAMP_FLAGS],
+ ts_flags_names, info->extack,
+ &config_mod);
+ if (ret < 0)
+ goto err_free_hwprov;
+ }
+
+ ret = net_hwtstamp_validate(&hwtst_config);
+ if (ret)
+ goto err_free_hwprov;
+
+ if (hwprov_mod) {
+ struct kernel_hwtstamp_config zero_config = {0};
+ struct hwtstamp_provider *__hwprov;
+
+ /* Disable current time stamping if we try to enable
+ * another one
+ */
+ ret = dev_set_hwtstamp_phylib(dev, &zero_config, info->extack);
+ if (ret < 0)
+ goto err_free_hwprov;
+
+ /* Change the selected hwtstamp source */
+ __hwprov = rcu_replace_pointer_rtnl(dev->hwprov, hwprov);
+ if (__hwprov)
+ kfree_rcu(__hwprov, rcu_head);
+ }
+
+ if (config_mod) {
+ ret = dev_set_hwtstamp_phylib(dev, &hwtst_config,
+ info->extack);
+ if (ret < 0)
+ return ret;
+ }
+
+ ret = tsconfig_send_reply(dev, info);
+ if (ret && ret != -EOPNOTSUPP) {
+ NL_SET_ERR_MSG(info->extack,
+ "error while reading the new configuration set");
+ return ret;
+ }
+
+ /* tsconfig has no notification */
+ return 0;
+
+err_free_hwprov:
+ kfree(hwprov);
+
+ return ret;
+}
+
+const struct ethnl_request_ops ethnl_tsconfig_request_ops = {
+ .request_cmd = ETHTOOL_MSG_TSCONFIG_GET,
+ .reply_cmd = ETHTOOL_MSG_TSCONFIG_GET_REPLY,
+ .hdr_attr = ETHTOOL_A_TSCONFIG_HEADER,
+ .req_info_size = sizeof(struct tsconfig_req_info),
+ .reply_data_size = sizeof(struct tsconfig_reply_data),
+
+ .prepare_data = tsconfig_prepare_data,
+ .reply_size = tsconfig_reply_size,
+ .fill_reply = tsconfig_fill_reply,
+
+ .set_validate = ethnl_set_tsconfig_validate,
+ .set = ethnl_set_tsconfig,
+};
diff --git a/net/ethtool/tsinfo.c b/net/ethtool/tsinfo.c
new file mode 100644
index 000000000000..8c654caa6805
--- /dev/null
+++ b/net/ethtool/tsinfo.c
@@ -0,0 +1,564 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/net_tstamp.h>
+#include <linux/phy.h>
+#include <linux/phy_link_topology.h>
+#include <linux/ptp_clock_kernel.h>
+#include <net/netdev_lock.h>
+
+#include "netlink.h"
+#include "common.h"
+#include "bitset.h"
+#include "ts.h"
+
+struct tsinfo_req_info {
+ struct ethnl_req_info base;
+ struct hwtstamp_provider_desc hwprov_desc;
+};
+
+struct tsinfo_reply_data {
+ struct ethnl_reply_data base;
+ struct kernel_ethtool_ts_info ts_info;
+ struct ethtool_ts_stats stats;
+};
+
+#define TSINFO_REQINFO(__req_base) \
+ container_of(__req_base, struct tsinfo_req_info, base)
+
+#define TSINFO_REPDATA(__reply_base) \
+ container_of(__reply_base, struct tsinfo_reply_data, base)
+
+#define ETHTOOL_TS_STAT_CNT \
+ (__ETHTOOL_A_TS_STAT_CNT - (ETHTOOL_A_TS_STAT_UNSPEC + 1))
+
+const struct nla_policy ethnl_tsinfo_get_policy[ETHTOOL_A_TSINFO_MAX + 1] = {
+ [ETHTOOL_A_TSINFO_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy_stats),
+ [ETHTOOL_A_TSINFO_HWTSTAMP_PROVIDER] =
+ NLA_POLICY_NESTED(ethnl_ts_hwtst_prov_policy),
+};
+
+int ts_parse_hwtst_provider(const struct nlattr *nest,
+ struct hwtstamp_provider_desc *hwprov_desc,
+ struct netlink_ext_ack *extack,
+ bool *mod)
+{
+ struct nlattr *tb[ARRAY_SIZE(ethnl_ts_hwtst_prov_policy)];
+ int ret;
+
+ ret = nla_parse_nested(tb,
+ ARRAY_SIZE(ethnl_ts_hwtst_prov_policy) - 1,
+ nest,
+ ethnl_ts_hwtst_prov_policy, extack);
+ if (ret < 0)
+ return ret;
+
+ if (NL_REQ_ATTR_CHECK(extack, nest, tb,
+ ETHTOOL_A_TS_HWTSTAMP_PROVIDER_INDEX) ||
+ NL_REQ_ATTR_CHECK(extack, nest, tb,
+ ETHTOOL_A_TS_HWTSTAMP_PROVIDER_QUALIFIER))
+ return -EINVAL;
+
+ ethnl_update_u32(&hwprov_desc->index,
+ tb[ETHTOOL_A_TS_HWTSTAMP_PROVIDER_INDEX],
+ mod);
+ ethnl_update_u32(&hwprov_desc->qualifier,
+ tb[ETHTOOL_A_TS_HWTSTAMP_PROVIDER_QUALIFIER],
+ mod);
+
+ return 0;
+}
+
+static int
+tsinfo_parse_request(struct ethnl_req_info *req_base, struct nlattr **tb,
+ struct netlink_ext_ack *extack)
+{
+ struct tsinfo_req_info *req = TSINFO_REQINFO(req_base);
+ bool mod = false;
+
+ req->hwprov_desc.index = -1;
+
+ if (!tb[ETHTOOL_A_TSINFO_HWTSTAMP_PROVIDER])
+ return 0;
+
+ return ts_parse_hwtst_provider(tb[ETHTOOL_A_TSINFO_HWTSTAMP_PROVIDER],
+ &req->hwprov_desc, extack, &mod);
+}
+
+static int tsinfo_prepare_data(const struct ethnl_req_info *req_base,
+ struct ethnl_reply_data *reply_base,
+ const struct genl_info *info)
+{
+ struct tsinfo_reply_data *data = TSINFO_REPDATA(reply_base);
+ struct tsinfo_req_info *req = TSINFO_REQINFO(req_base);
+ struct net_device *dev = reply_base->dev;
+ int ret;
+
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ return ret;
+
+ if (req->hwprov_desc.index != -1) {
+ ret = ethtool_get_ts_info_by_phc(dev, &data->ts_info,
+ &req->hwprov_desc);
+ ethnl_ops_complete(dev);
+ return ret;
+ }
+
+ if (req_base->flags & ETHTOOL_FLAG_STATS) {
+ ethtool_stats_init((u64 *)&data->stats,
+ sizeof(data->stats) / sizeof(u64));
+ if (dev->ethtool_ops->get_ts_stats)
+ dev->ethtool_ops->get_ts_stats(dev, &data->stats);
+ }
+
+ ret = __ethtool_get_ts_info(dev, &data->ts_info);
+ ethnl_ops_complete(dev);
+
+ return ret;
+}
+
+static int tsinfo_reply_size(const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ const struct tsinfo_reply_data *data = TSINFO_REPDATA(reply_base);
+ bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;
+ const struct kernel_ethtool_ts_info *ts_info = &data->ts_info;
+ int len = 0;
+ int ret;
+
+ BUILD_BUG_ON(__SOF_TIMESTAMPING_CNT > 32);
+ BUILD_BUG_ON(__HWTSTAMP_TX_CNT > 32);
+ BUILD_BUG_ON(__HWTSTAMP_FILTER_CNT > 32);
+
+ if (ts_info->so_timestamping) {
+ ret = ethnl_bitset32_size(&ts_info->so_timestamping, NULL,
+ __SOF_TIMESTAMPING_CNT,
+ sof_timestamping_names, compact);
+ if (ret < 0)
+ return ret;
+ len += ret; /* _TSINFO_TIMESTAMPING */
+ }
+ if (ts_info->tx_types) {
+ ret = ethnl_bitset32_size(&ts_info->tx_types, NULL,
+ __HWTSTAMP_TX_CNT,
+ ts_tx_type_names, compact);
+ if (ret < 0)
+ return ret;
+ len += ret; /* _TSINFO_TX_TYPES */
+ }
+ if (ts_info->rx_filters) {
+ ret = ethnl_bitset32_size(&ts_info->rx_filters, NULL,
+ __HWTSTAMP_FILTER_CNT,
+ ts_rx_filter_names, compact);
+ if (ret < 0)
+ return ret;
+ len += ret; /* _TSINFO_RX_FILTERS */
+ }
+ if (ts_info->phc_index >= 0) {
+ len += nla_total_size(sizeof(u32)); /* _TSINFO_PHC_INDEX */
+ /* _TSINFO_HWTSTAMP_PROVIDER */
+ len += nla_total_size(0) + 2 * nla_total_size(sizeof(u32));
+ }
+ if (ts_info->phc_source) {
+ len += nla_total_size(sizeof(u32)); /* _TSINFO_HWTSTAMP_SOURCE */
+ if (ts_info->phc_phyindex)
+ /* _TSINFO_HWTSTAMP_PHYINDEX */
+ len += nla_total_size(sizeof(u32));
+ }
+ if (req_base->flags & ETHTOOL_FLAG_STATS)
+ len += nla_total_size(0) + /* _TSINFO_STATS */
+ nla_total_size_64bit(sizeof(u64)) * ETHTOOL_TS_STAT_CNT;
+
+ return len;
+}
+
+static int tsinfo_put_stat(struct sk_buff *skb, u64 val, u16 attrtype)
+{
+ if (val == ETHTOOL_STAT_NOT_SET)
+ return 0;
+ if (nla_put_uint(skb, attrtype, val))
+ return -EMSGSIZE;
+ return 0;
+}
+
+static int tsinfo_put_stats(struct sk_buff *skb,
+ const struct ethtool_ts_stats *stats)
+{
+ struct nlattr *nest;
+
+ nest = nla_nest_start(skb, ETHTOOL_A_TSINFO_STATS);
+ if (!nest)
+ return -EMSGSIZE;
+
+ if (tsinfo_put_stat(skb, stats->tx_stats.pkts,
+ ETHTOOL_A_TS_STAT_TX_PKTS) ||
+ tsinfo_put_stat(skb, stats->tx_stats.onestep_pkts_unconfirmed,
+ ETHTOOL_A_TS_STAT_TX_ONESTEP_PKTS_UNCONFIRMED) ||
+ tsinfo_put_stat(skb, stats->tx_stats.lost,
+ ETHTOOL_A_TS_STAT_TX_LOST) ||
+ tsinfo_put_stat(skb, stats->tx_stats.err,
+ ETHTOOL_A_TS_STAT_TX_ERR))
+ goto err_cancel;
+
+ nla_nest_end(skb, nest);
+ return 0;
+
+err_cancel:
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+}
+
+static int tsinfo_fill_reply(struct sk_buff *skb,
+ const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ const struct tsinfo_reply_data *data = TSINFO_REPDATA(reply_base);
+ bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;
+ const struct kernel_ethtool_ts_info *ts_info = &data->ts_info;
+ int ret;
+
+ if (ts_info->so_timestamping) {
+ ret = ethnl_put_bitset32(skb, ETHTOOL_A_TSINFO_TIMESTAMPING,
+ &ts_info->so_timestamping, NULL,
+ __SOF_TIMESTAMPING_CNT,
+ sof_timestamping_names, compact);
+ if (ret < 0)
+ return ret;
+ }
+ if (ts_info->tx_types) {
+ ret = ethnl_put_bitset32(skb, ETHTOOL_A_TSINFO_TX_TYPES,
+ &ts_info->tx_types, NULL,
+ __HWTSTAMP_TX_CNT,
+ ts_tx_type_names, compact);
+ if (ret < 0)
+ return ret;
+ }
+ if (ts_info->rx_filters) {
+ ret = ethnl_put_bitset32(skb, ETHTOOL_A_TSINFO_RX_FILTERS,
+ &ts_info->rx_filters, NULL,
+ __HWTSTAMP_FILTER_CNT,
+ ts_rx_filter_names, compact);
+ if (ret < 0)
+ return ret;
+ }
+ if (ts_info->phc_index >= 0) {
+ struct nlattr *nest;
+
+ ret = nla_put_u32(skb, ETHTOOL_A_TSINFO_PHC_INDEX,
+ ts_info->phc_index);
+ if (ret)
+ return -EMSGSIZE;
+
+ nest = nla_nest_start(skb, ETHTOOL_A_TSINFO_HWTSTAMP_PROVIDER);
+ if (!nest)
+ return -EMSGSIZE;
+
+ if (nla_put_u32(skb, ETHTOOL_A_TS_HWTSTAMP_PROVIDER_INDEX,
+ ts_info->phc_index) ||
+ nla_put_u32(skb,
+ ETHTOOL_A_TS_HWTSTAMP_PROVIDER_QUALIFIER,
+ ts_info->phc_qualifier)) {
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+ }
+
+ nla_nest_end(skb, nest);
+ }
+ if (ts_info->phc_source) {
+ if (nla_put_u32(skb, ETHTOOL_A_TSINFO_HWTSTAMP_SOURCE,
+ ts_info->phc_source))
+ return -EMSGSIZE;
+
+ if (ts_info->phc_phyindex &&
+ nla_put_u32(skb, ETHTOOL_A_TSINFO_HWTSTAMP_PHYINDEX,
+ ts_info->phc_phyindex))
+ return -EMSGSIZE;
+ }
+ if (req_base->flags & ETHTOOL_FLAG_STATS &&
+ tsinfo_put_stats(skb, &data->stats))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+struct ethnl_tsinfo_dump_ctx {
+ struct tsinfo_req_info *req_info;
+ struct tsinfo_reply_data *reply_data;
+ unsigned long pos_ifindex;
+ bool netdev_dump_done;
+ unsigned long pos_phyindex;
+ enum hwtstamp_provider_qualifier pos_phcqualifier;
+};
+
+static void *ethnl_tsinfo_prepare_dump(struct sk_buff *skb,
+ struct net_device *dev,
+ struct tsinfo_reply_data *reply_data,
+ struct netlink_callback *cb)
+{
+ struct ethnl_tsinfo_dump_ctx *ctx = (void *)cb->ctx;
+ void *ehdr = NULL;
+
+ ehdr = ethnl_dump_put(skb, cb,
+ ETHTOOL_MSG_TSINFO_GET_REPLY);
+ if (!ehdr)
+ return ERR_PTR(-EMSGSIZE);
+
+ reply_data = ctx->reply_data;
+ memset(reply_data, 0, sizeof(*reply_data));
+ reply_data->base.dev = dev;
+ reply_data->ts_info.cmd = ETHTOOL_GET_TS_INFO;
+ reply_data->ts_info.phc_index = -1;
+
+ return ehdr;
+}
+
+static int ethnl_tsinfo_end_dump(struct sk_buff *skb,
+ struct net_device *dev,
+ struct tsinfo_req_info *req_info,
+ struct tsinfo_reply_data *reply_data,
+ void *ehdr)
+{
+ int ret;
+
+ reply_data->ts_info.so_timestamping |= SOF_TIMESTAMPING_RX_SOFTWARE |
+ SOF_TIMESTAMPING_SOFTWARE;
+
+ ret = ethnl_fill_reply_header(skb, dev, ETHTOOL_A_TSINFO_HEADER);
+ if (ret < 0)
+ return ret;
+
+ ret = tsinfo_fill_reply(skb, &req_info->base, &reply_data->base);
+ if (ret < 0)
+ return ret;
+
+ reply_data->base.dev = NULL;
+ genlmsg_end(skb, ehdr);
+
+ return ret;
+}
+
+static int ethnl_tsinfo_dump_one_phydev(struct sk_buff *skb,
+ struct net_device *dev,
+ struct phy_device *phydev,
+ struct netlink_callback *cb)
+{
+ struct ethnl_tsinfo_dump_ctx *ctx = (void *)cb->ctx;
+ struct tsinfo_reply_data *reply_data;
+ struct tsinfo_req_info *req_info;
+ void *ehdr = NULL;
+ int ret = 0;
+
+ if (!phy_has_tsinfo(phydev))
+ return -EOPNOTSUPP;
+
+ reply_data = ctx->reply_data;
+ req_info = ctx->req_info;
+ ehdr = ethnl_tsinfo_prepare_dump(skb, dev, reply_data, cb);
+ if (IS_ERR(ehdr))
+ return PTR_ERR(ehdr);
+
+ ret = phy_ts_info(phydev, &reply_data->ts_info);
+ if (ret < 0)
+ goto err;
+
+ if (reply_data->ts_info.phc_index >= 0) {
+ reply_data->ts_info.phc_source = HWTSTAMP_SOURCE_PHYLIB;
+ reply_data->ts_info.phc_phyindex = phydev->phyindex;
+ }
+
+ ret = ethnl_tsinfo_end_dump(skb, dev, req_info, reply_data, ehdr);
+ if (ret < 0)
+ goto err;
+
+ return ret;
+err:
+ genlmsg_cancel(skb, ehdr);
+ return ret;
+}
+
+static int ethnl_tsinfo_dump_one_netdev(struct sk_buff *skb,
+ struct net_device *dev,
+ struct netlink_callback *cb)
+{
+ struct ethnl_tsinfo_dump_ctx *ctx = (void *)cb->ctx;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct tsinfo_reply_data *reply_data;
+ struct tsinfo_req_info *req_info;
+ void *ehdr = NULL;
+ int ret = 0;
+
+ if (!ops->get_ts_info)
+ return -EOPNOTSUPP;
+
+ reply_data = ctx->reply_data;
+ req_info = ctx->req_info;
+ for (; ctx->pos_phcqualifier < HWTSTAMP_PROVIDER_QUALIFIER_CNT;
+ ctx->pos_phcqualifier++) {
+ if (!net_support_hwtstamp_qualifier(dev,
+ ctx->pos_phcqualifier))
+ continue;
+
+ ehdr = ethnl_tsinfo_prepare_dump(skb, dev, reply_data, cb);
+ if (IS_ERR(ehdr)) {
+ ret = PTR_ERR(ehdr);
+ goto err;
+ }
+
+ reply_data->ts_info.phc_qualifier = ctx->pos_phcqualifier;
+ ret = ops->get_ts_info(dev, &reply_data->ts_info);
+ if (ret < 0)
+ goto err;
+
+ if (reply_data->ts_info.phc_index >= 0)
+ reply_data->ts_info.phc_source = HWTSTAMP_SOURCE_NETDEV;
+ ret = ethnl_tsinfo_end_dump(skb, dev, req_info, reply_data,
+ ehdr);
+ if (ret < 0)
+ goto err;
+ }
+
+ return ret;
+
+err:
+ genlmsg_cancel(skb, ehdr);
+ return ret;
+}
+
+static int ethnl_tsinfo_dump_one_net_topo(struct sk_buff *skb,
+ struct net_device *dev,
+ struct netlink_callback *cb)
+{
+ struct ethnl_tsinfo_dump_ctx *ctx = (void *)cb->ctx;
+ struct phy_device_node *pdn;
+ int ret = 0;
+
+ if (!ctx->netdev_dump_done) {
+ ret = ethnl_tsinfo_dump_one_netdev(skb, dev, cb);
+ if (ret < 0 && ret != -EOPNOTSUPP)
+ return ret;
+ ctx->netdev_dump_done = true;
+ }
+
+ if (!dev->link_topo) {
+ if (phy_has_tsinfo(dev->phydev)) {
+ ret = ethnl_tsinfo_dump_one_phydev(skb, dev,
+ dev->phydev, cb);
+ if (ret < 0 && ret != -EOPNOTSUPP)
+ return ret;
+ }
+
+ return 0;
+ }
+
+ xa_for_each_start(&dev->link_topo->phys, ctx->pos_phyindex, pdn,
+ ctx->pos_phyindex) {
+ if (phy_has_tsinfo(pdn->phy)) {
+ ret = ethnl_tsinfo_dump_one_phydev(skb, dev,
+ pdn->phy, cb);
+ if (ret < 0 && ret != -EOPNOTSUPP)
+ return ret;
+ }
+ }
+
+ return ret;
+}
+
+int ethnl_tsinfo_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct ethnl_tsinfo_dump_ctx *ctx = (void *)cb->ctx;
+ struct net *net = sock_net(skb->sk);
+ struct net_device *dev;
+ int ret = 0;
+
+ rtnl_lock();
+ if (ctx->req_info->base.dev) {
+ dev = ctx->req_info->base.dev;
+ netdev_lock_ops(dev);
+ ret = ethnl_tsinfo_dump_one_net_topo(skb, dev, cb);
+ netdev_unlock_ops(dev);
+ } else {
+ for_each_netdev_dump(net, dev, ctx->pos_ifindex) {
+ netdev_lock_ops(dev);
+ ret = ethnl_tsinfo_dump_one_net_topo(skb, dev, cb);
+ netdev_unlock_ops(dev);
+ if (ret < 0 && ret != -EOPNOTSUPP)
+ break;
+ ctx->pos_phyindex = 0;
+ ctx->netdev_dump_done = false;
+ ctx->pos_phcqualifier = HWTSTAMP_PROVIDER_QUALIFIER_PRECISE;
+ }
+ }
+ rtnl_unlock();
+
+ return ret;
+}
+
+int ethnl_tsinfo_start(struct netlink_callback *cb)
+{
+ const struct genl_dumpit_info *info = genl_dumpit_info(cb);
+ struct ethnl_tsinfo_dump_ctx *ctx = (void *)cb->ctx;
+ struct nlattr **tb = info->info.attrs;
+ struct tsinfo_reply_data *reply_data;
+ struct tsinfo_req_info *req_info;
+ int ret;
+
+ BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx));
+
+ req_info = kzalloc(sizeof(*req_info), GFP_KERNEL);
+ if (!req_info)
+ return -ENOMEM;
+ reply_data = kzalloc(sizeof(*reply_data), GFP_KERNEL);
+ if (!reply_data) {
+ ret = -ENOMEM;
+ goto free_req_info;
+ }
+
+ ret = ethnl_parse_header_dev_get(&req_info->base,
+ tb[ETHTOOL_A_TSINFO_HEADER],
+ sock_net(cb->skb->sk), cb->extack,
+ false);
+ if (ret < 0)
+ goto free_reply_data;
+
+ ctx->req_info = req_info;
+ ctx->reply_data = reply_data;
+ ctx->pos_ifindex = 0;
+ ctx->pos_phyindex = 0;
+ ctx->netdev_dump_done = false;
+ ctx->pos_phcqualifier = HWTSTAMP_PROVIDER_QUALIFIER_PRECISE;
+
+ return 0;
+
+free_reply_data:
+ kfree(reply_data);
+free_req_info:
+ kfree(req_info);
+
+ return ret;
+}
+
+int ethnl_tsinfo_done(struct netlink_callback *cb)
+{
+ struct ethnl_tsinfo_dump_ctx *ctx = (void *)cb->ctx;
+ struct tsinfo_req_info *req_info = ctx->req_info;
+
+ ethnl_parse_header_dev_put(&req_info->base);
+ kfree(ctx->reply_data);
+ kfree(ctx->req_info);
+
+ return 0;
+}
+
+const struct ethnl_request_ops ethnl_tsinfo_request_ops = {
+ .request_cmd = ETHTOOL_MSG_TSINFO_GET,
+ .reply_cmd = ETHTOOL_MSG_TSINFO_GET_REPLY,
+ .hdr_attr = ETHTOOL_A_TSINFO_HEADER,
+ .req_info_size = sizeof(struct tsinfo_req_info),
+ .reply_data_size = sizeof(struct tsinfo_reply_data),
+
+ .parse_request = tsinfo_parse_request,
+ .prepare_data = tsinfo_prepare_data,
+ .reply_size = tsinfo_reply_size,
+ .fill_reply = tsinfo_fill_reply,
+};
diff --git a/net/ethtool/tunnels.c b/net/ethtool/tunnels.c
new file mode 100644
index 000000000000..b4ce47dd2aa6
--- /dev/null
+++ b/net/ethtool/tunnels.c
@@ -0,0 +1,281 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/ethtool_netlink.h>
+#include <net/udp_tunnel.h>
+#include <net/vxlan.h>
+
+#include "bitset.h"
+#include "common.h"
+#include "netlink.h"
+
+const struct nla_policy ethnl_tunnel_info_get_policy[] = {
+ [ETHTOOL_A_TUNNEL_INFO_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
+};
+
+static_assert(ETHTOOL_UDP_TUNNEL_TYPE_VXLAN == ilog2(UDP_TUNNEL_TYPE_VXLAN));
+static_assert(ETHTOOL_UDP_TUNNEL_TYPE_GENEVE == ilog2(UDP_TUNNEL_TYPE_GENEVE));
+static_assert(ETHTOOL_UDP_TUNNEL_TYPE_VXLAN_GPE ==
+ ilog2(UDP_TUNNEL_TYPE_VXLAN_GPE));
+
+static ssize_t ethnl_udp_table_reply_size(unsigned int types, bool compact)
+{
+ ssize_t size;
+
+ size = ethnl_bitset32_size(&types, NULL, __ETHTOOL_UDP_TUNNEL_TYPE_CNT,
+ udp_tunnel_type_names, compact);
+ if (size < 0)
+ return size;
+
+ return size +
+ nla_total_size(0) + /* _UDP_TABLE */
+ nla_total_size(sizeof(u32)); /* _UDP_TABLE_SIZE */
+}
+
+static ssize_t
+ethnl_tunnel_info_reply_size(const struct ethnl_req_info *req_base,
+ struct netlink_ext_ack *extack)
+{
+ bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;
+ const struct udp_tunnel_nic_info *info;
+ unsigned int i;
+ ssize_t ret;
+ size_t size;
+
+ info = req_base->dev->udp_tunnel_nic_info;
+ if (!info) {
+ NL_SET_ERR_MSG(extack,
+ "device does not report tunnel offload info");
+ return -EOPNOTSUPP;
+ }
+
+ size = nla_total_size(0); /* _INFO_UDP_PORTS */
+
+ for (i = 0; i < UDP_TUNNEL_NIC_MAX_TABLES; i++) {
+ if (!info->tables[i].n_entries)
+ break;
+
+ ret = ethnl_udp_table_reply_size(info->tables[i].tunnel_types,
+ compact);
+ if (ret < 0)
+ return ret;
+ size += ret;
+
+ size += udp_tunnel_nic_dump_size(req_base->dev, i);
+ }
+
+ if (info->flags & UDP_TUNNEL_NIC_INFO_STATIC_IANA_VXLAN) {
+ ret = ethnl_udp_table_reply_size(0, compact);
+ if (ret < 0)
+ return ret;
+ size += ret;
+
+ size += nla_total_size(0) + /* _TABLE_ENTRY */
+ nla_total_size(sizeof(__be16)) + /* _ENTRY_PORT */
+ nla_total_size(sizeof(u32)); /* _ENTRY_TYPE */
+ }
+
+ return size;
+}
+
+static int
+ethnl_tunnel_info_fill_reply(const struct ethnl_req_info *req_base,
+ struct sk_buff *skb)
+{
+ bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;
+ const struct udp_tunnel_nic_info *info;
+ struct nlattr *ports, *table, *entry;
+ unsigned int i;
+
+ info = req_base->dev->udp_tunnel_nic_info;
+ if (!info)
+ return -EOPNOTSUPP;
+
+ ports = nla_nest_start(skb, ETHTOOL_A_TUNNEL_INFO_UDP_PORTS);
+ if (!ports)
+ return -EMSGSIZE;
+
+ for (i = 0; i < UDP_TUNNEL_NIC_MAX_TABLES; i++) {
+ if (!info->tables[i].n_entries)
+ break;
+
+ table = nla_nest_start(skb, ETHTOOL_A_TUNNEL_UDP_TABLE);
+ if (!table)
+ goto err_cancel_ports;
+
+ if (nla_put_u32(skb, ETHTOOL_A_TUNNEL_UDP_TABLE_SIZE,
+ info->tables[i].n_entries))
+ goto err_cancel_table;
+
+ if (ethnl_put_bitset32(skb, ETHTOOL_A_TUNNEL_UDP_TABLE_TYPES,
+ &info->tables[i].tunnel_types, NULL,
+ __ETHTOOL_UDP_TUNNEL_TYPE_CNT,
+ udp_tunnel_type_names, compact))
+ goto err_cancel_table;
+
+ if (udp_tunnel_nic_dump_write(req_base->dev, i, skb))
+ goto err_cancel_table;
+
+ nla_nest_end(skb, table);
+ }
+
+ if (info->flags & UDP_TUNNEL_NIC_INFO_STATIC_IANA_VXLAN) {
+ u32 zero = 0;
+
+ table = nla_nest_start(skb, ETHTOOL_A_TUNNEL_UDP_TABLE);
+ if (!table)
+ goto err_cancel_ports;
+
+ if (nla_put_u32(skb, ETHTOOL_A_TUNNEL_UDP_TABLE_SIZE, 1))
+ goto err_cancel_table;
+
+ if (ethnl_put_bitset32(skb, ETHTOOL_A_TUNNEL_UDP_TABLE_TYPES,
+ &zero, NULL,
+ __ETHTOOL_UDP_TUNNEL_TYPE_CNT,
+ udp_tunnel_type_names, compact))
+ goto err_cancel_table;
+
+ entry = nla_nest_start(skb, ETHTOOL_A_TUNNEL_UDP_TABLE_ENTRY);
+ if (!entry)
+ goto err_cancel_entry;
+
+ if (nla_put_be16(skb, ETHTOOL_A_TUNNEL_UDP_ENTRY_PORT,
+ htons(IANA_VXLAN_UDP_PORT)) ||
+ nla_put_u32(skb, ETHTOOL_A_TUNNEL_UDP_ENTRY_TYPE,
+ ilog2(UDP_TUNNEL_TYPE_VXLAN)))
+ goto err_cancel_entry;
+
+ nla_nest_end(skb, entry);
+ nla_nest_end(skb, table);
+ }
+
+ nla_nest_end(skb, ports);
+
+ return 0;
+
+err_cancel_entry:
+ nla_nest_cancel(skb, entry);
+err_cancel_table:
+ nla_nest_cancel(skb, table);
+err_cancel_ports:
+ nla_nest_cancel(skb, ports);
+ return -EMSGSIZE;
+}
+
+int ethnl_tunnel_info_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct ethnl_req_info req_info = {};
+ struct nlattr **tb = info->attrs;
+ struct sk_buff *rskb;
+ void *reply_payload;
+ int reply_len;
+ int ret;
+
+ ret = ethnl_parse_header_dev_get(&req_info,
+ tb[ETHTOOL_A_TUNNEL_INFO_HEADER],
+ genl_info_net(info), info->extack,
+ true);
+ if (ret < 0)
+ return ret;
+
+ rtnl_lock();
+ ret = ethnl_tunnel_info_reply_size(&req_info, info->extack);
+ if (ret < 0)
+ goto err_unlock_rtnl;
+ reply_len = ret + ethnl_reply_header_size();
+
+ rskb = ethnl_reply_init(reply_len, req_info.dev,
+ ETHTOOL_MSG_TUNNEL_INFO_GET_REPLY,
+ ETHTOOL_A_TUNNEL_INFO_HEADER,
+ info, &reply_payload);
+ if (!rskb) {
+ ret = -ENOMEM;
+ goto err_unlock_rtnl;
+ }
+
+ ret = ethnl_tunnel_info_fill_reply(&req_info, rskb);
+ if (ret)
+ goto err_free_msg;
+ rtnl_unlock();
+ ethnl_parse_header_dev_put(&req_info);
+ genlmsg_end(rskb, reply_payload);
+
+ return genlmsg_reply(rskb, info);
+
+err_free_msg:
+ nlmsg_free(rskb);
+err_unlock_rtnl:
+ rtnl_unlock();
+ ethnl_parse_header_dev_put(&req_info);
+ return ret;
+}
+
+struct ethnl_tunnel_info_dump_ctx {
+ struct ethnl_req_info req_info;
+ unsigned long ifindex;
+};
+
+int ethnl_tunnel_info_start(struct netlink_callback *cb)
+{
+ const struct genl_dumpit_info *info = genl_dumpit_info(cb);
+ struct ethnl_tunnel_info_dump_ctx *ctx = (void *)cb->ctx;
+ struct nlattr **tb = info->info.attrs;
+ int ret;
+
+ BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx));
+
+ memset(ctx, 0, sizeof(*ctx));
+
+ ret = ethnl_parse_header_dev_get(&ctx->req_info,
+ tb[ETHTOOL_A_TUNNEL_INFO_HEADER],
+ sock_net(cb->skb->sk), cb->extack,
+ false);
+ if (ctx->req_info.dev) {
+ ethnl_parse_header_dev_put(&ctx->req_info);
+ ctx->req_info.dev = NULL;
+ }
+
+ return ret;
+}
+
+int ethnl_tunnel_info_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct ethnl_tunnel_info_dump_ctx *ctx = (void *)cb->ctx;
+ struct net *net = sock_net(skb->sk);
+ struct net_device *dev;
+ int ret = 0;
+ void *ehdr;
+
+ rtnl_lock();
+ for_each_netdev_dump(net, dev, ctx->ifindex) {
+ ehdr = ethnl_dump_put(skb, cb,
+ ETHTOOL_MSG_TUNNEL_INFO_GET_REPLY);
+ if (!ehdr) {
+ ret = -EMSGSIZE;
+ break;
+ }
+
+ ret = ethnl_fill_reply_header(skb, dev,
+ ETHTOOL_A_TUNNEL_INFO_HEADER);
+ if (ret < 0) {
+ genlmsg_cancel(skb, ehdr);
+ break;
+ }
+
+ ctx->req_info.dev = dev;
+ ret = ethnl_tunnel_info_fill_reply(&ctx->req_info, skb);
+ ctx->req_info.dev = NULL;
+ if (ret < 0) {
+ genlmsg_cancel(skb, ehdr);
+ if (ret == -EOPNOTSUPP)
+ continue;
+ break;
+ }
+ genlmsg_end(skb, ehdr);
+ }
+ rtnl_unlock();
+
+ if (ret == -EMSGSIZE && skb->len)
+ return skb->len;
+ return ret;
+}
diff --git a/net/ethtool/wol.c b/net/ethtool/wol.c
new file mode 100644
index 000000000000..a39d8000d808
--- /dev/null
+++ b/net/ethtool/wol.c
@@ -0,0 +1,158 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include "netlink.h"
+#include "common.h"
+#include "bitset.h"
+
+struct wol_req_info {
+ struct ethnl_req_info base;
+};
+
+struct wol_reply_data {
+ struct ethnl_reply_data base;
+ struct ethtool_wolinfo wol;
+ bool show_sopass;
+};
+
+#define WOL_REPDATA(__reply_base) \
+ container_of(__reply_base, struct wol_reply_data, base)
+
+const struct nla_policy ethnl_wol_get_policy[] = {
+ [ETHTOOL_A_WOL_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
+};
+
+static int wol_prepare_data(const struct ethnl_req_info *req_base,
+ struct ethnl_reply_data *reply_base,
+ const struct genl_info *info)
+{
+ struct wol_reply_data *data = WOL_REPDATA(reply_base);
+ struct net_device *dev = reply_base->dev;
+ int ret;
+
+ if (!dev->ethtool_ops->get_wol)
+ return -EOPNOTSUPP;
+
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ return ret;
+ dev->ethtool_ops->get_wol(dev, &data->wol);
+ ethnl_ops_complete(dev);
+ /* do not include password in notifications */
+ data->show_sopass = !genl_info_is_ntf(info) &&
+ (data->wol.supported & WAKE_MAGICSECURE);
+
+ return 0;
+}
+
+static int wol_reply_size(const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;
+ const struct wol_reply_data *data = WOL_REPDATA(reply_base);
+ int len;
+
+ len = ethnl_bitset32_size(&data->wol.wolopts, &data->wol.supported,
+ WOL_MODE_COUNT, wol_mode_names, compact);
+ if (len < 0)
+ return len;
+ if (data->show_sopass)
+ len += nla_total_size(sizeof(data->wol.sopass));
+
+ return len;
+}
+
+static int wol_fill_reply(struct sk_buff *skb,
+ const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;
+ const struct wol_reply_data *data = WOL_REPDATA(reply_base);
+ int ret;
+
+ ret = ethnl_put_bitset32(skb, ETHTOOL_A_WOL_MODES, &data->wol.wolopts,
+ &data->wol.supported, WOL_MODE_COUNT,
+ wol_mode_names, compact);
+ if (ret < 0)
+ return ret;
+ if (data->show_sopass &&
+ nla_put(skb, ETHTOOL_A_WOL_SOPASS, sizeof(data->wol.sopass),
+ data->wol.sopass))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+/* WOL_SET */
+
+const struct nla_policy ethnl_wol_set_policy[] = {
+ [ETHTOOL_A_WOL_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
+ [ETHTOOL_A_WOL_MODES] = { .type = NLA_NESTED },
+ [ETHTOOL_A_WOL_SOPASS] = { .type = NLA_BINARY,
+ .len = SOPASS_MAX },
+};
+
+static int
+ethnl_set_wol_validate(struct ethnl_req_info *req_info, struct genl_info *info)
+{
+ const struct ethtool_ops *ops = req_info->dev->ethtool_ops;
+
+ return ops->get_wol && ops->set_wol ? 1 : -EOPNOTSUPP;
+}
+
+static int
+ethnl_set_wol(struct ethnl_req_info *req_info, struct genl_info *info)
+{
+ struct ethtool_wolinfo wol = { .cmd = ETHTOOL_GWOL };
+ struct net_device *dev = req_info->dev;
+ struct nlattr **tb = info->attrs;
+ bool mod = false;
+ int ret;
+
+ dev->ethtool_ops->get_wol(dev, &wol);
+ ret = ethnl_update_bitset32(&wol.wolopts, WOL_MODE_COUNT,
+ tb[ETHTOOL_A_WOL_MODES], wol_mode_names,
+ info->extack, &mod);
+ if (ret < 0)
+ return ret;
+ if (wol.wolopts & ~wol.supported) {
+ NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_WOL_MODES],
+ "cannot enable unsupported WoL mode");
+ return -EINVAL;
+ }
+ if (tb[ETHTOOL_A_WOL_SOPASS]) {
+ if (!(wol.supported & WAKE_MAGICSECURE)) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ tb[ETHTOOL_A_WOL_SOPASS],
+ "magicsecure not supported, cannot set password");
+ return -EINVAL;
+ }
+ ethnl_update_binary(wol.sopass, sizeof(wol.sopass),
+ tb[ETHTOOL_A_WOL_SOPASS], &mod);
+ }
+
+ if (!mod)
+ return 0;
+ ret = dev->ethtool_ops->set_wol(dev, &wol);
+ if (ret)
+ return ret;
+ dev->ethtool->wol_enabled = !!wol.wolopts;
+ return 1;
+}
+
+const struct ethnl_request_ops ethnl_wol_request_ops = {
+ .request_cmd = ETHTOOL_MSG_WOL_GET,
+ .reply_cmd = ETHTOOL_MSG_WOL_GET_REPLY,
+ .hdr_attr = ETHTOOL_A_WOL_HEADER,
+ .req_info_size = sizeof(struct wol_req_info),
+ .reply_data_size = sizeof(struct wol_reply_data),
+
+ .prepare_data = wol_prepare_data,
+ .reply_size = wol_reply_size,
+ .fill_reply = wol_fill_reply,
+
+ .set_validate = ethnl_set_wol_validate,
+ .set = ethnl_set_wol,
+ .set_ntf_cmd = ETHTOOL_MSG_WOL_NTF,
+};
diff --git a/net/handshake/.kunitconfig b/net/handshake/.kunitconfig
new file mode 100644
index 000000000000..5c48cf4abca2
--- /dev/null
+++ b/net/handshake/.kunitconfig
@@ -0,0 +1,11 @@
+CONFIG_KUNIT=y
+CONFIG_UBSAN=y
+CONFIG_STACKTRACE=y
+CONFIG_NET=y
+CONFIG_NETWORK_FILESYSTEMS=y
+CONFIG_INET=y
+CONFIG_MULTIUSER=y
+CONFIG_NFS_FS=y
+CONFIG_SUNRPC=y
+CONFIG_NET_HANDSHAKE=y
+CONFIG_NET_HANDSHAKE_KUNIT_TEST=y
diff --git a/net/handshake/Makefile b/net/handshake/Makefile
new file mode 100644
index 000000000000..ef4d9a2112bd
--- /dev/null
+++ b/net/handshake/Makefile
@@ -0,0 +1,13 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# Makefile for the Generic HANDSHAKE service
+#
+# Author: Chuck Lever <chuck.lever@oracle.com>
+#
+# Copyright (c) 2023, Oracle and/or its affiliates.
+#
+
+obj-y += handshake.o
+handshake-y := alert.o genl.o netlink.o request.o tlshd.o trace.o
+
+obj-$(CONFIG_NET_HANDSHAKE_KUNIT_TEST) += handshake-test.o
diff --git a/net/handshake/alert.c b/net/handshake/alert.c
new file mode 100644
index 000000000000..329d91984683
--- /dev/null
+++ b/net/handshake/alert.c
@@ -0,0 +1,110 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Handle the TLS Alert protocol
+ *
+ * Author: Chuck Lever <chuck.lever@oracle.com>
+ *
+ * Copyright (c) 2023, Oracle and/or its affiliates.
+ */
+
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/inet.h>
+
+#include <net/sock.h>
+#include <net/handshake.h>
+#include <net/tls.h>
+#include <net/tls_prot.h>
+
+#include "handshake.h"
+
+#include <trace/events/handshake.h>
+
+/**
+ * tls_alert_send - send a TLS Alert on a kTLS socket
+ * @sock: open kTLS socket to send on
+ * @level: TLS Alert level
+ * @description: TLS Alert description
+ *
+ * Returns zero on success or a negative errno.
+ */
+int tls_alert_send(struct socket *sock, u8 level, u8 description)
+{
+ u8 record_type = TLS_RECORD_TYPE_ALERT;
+ u8 buf[CMSG_SPACE(sizeof(record_type))];
+ struct msghdr msg = { 0 };
+ struct cmsghdr *cmsg;
+ struct kvec iov;
+ u8 alert[2];
+ int ret;
+
+ trace_tls_alert_send(sock->sk, level, description);
+
+ alert[0] = level;
+ alert[1] = description;
+ iov.iov_base = alert;
+ iov.iov_len = sizeof(alert);
+
+ memset(buf, 0, sizeof(buf));
+ msg.msg_control = buf;
+ msg.msg_controllen = sizeof(buf);
+ msg.msg_flags = MSG_DONTWAIT;
+
+ cmsg = CMSG_FIRSTHDR(&msg);
+ cmsg->cmsg_level = SOL_TLS;
+ cmsg->cmsg_type = TLS_SET_RECORD_TYPE;
+ cmsg->cmsg_len = CMSG_LEN(sizeof(record_type));
+ memcpy(CMSG_DATA(cmsg), &record_type, sizeof(record_type));
+
+ iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &iov, 1, iov.iov_len);
+ ret = sock_sendmsg(sock, &msg);
+ return ret < 0 ? ret : 0;
+}
+
+/**
+ * tls_get_record_type - Look for TLS RECORD_TYPE information
+ * @sk: socket (for IP address information)
+ * @cmsg: incoming message to be parsed
+ *
+ * Returns zero or a TLS_RECORD_TYPE value.
+ */
+u8 tls_get_record_type(const struct sock *sk, const struct cmsghdr *cmsg)
+{
+ u8 record_type;
+
+ if (cmsg->cmsg_level != SOL_TLS)
+ return 0;
+ if (cmsg->cmsg_type != TLS_GET_RECORD_TYPE)
+ return 0;
+
+ record_type = *((u8 *)CMSG_DATA(cmsg));
+ trace_tls_contenttype(sk, record_type);
+ return record_type;
+}
+EXPORT_SYMBOL(tls_get_record_type);
+
+/**
+ * tls_alert_recv - Parse TLS Alert messages
+ * @sk: socket (for IP address information)
+ * @msg: incoming message to be parsed
+ * @level: OUT - TLS AlertLevel value
+ * @description: OUT - TLS AlertDescription value
+ *
+ */
+void tls_alert_recv(const struct sock *sk, const struct msghdr *msg,
+ u8 *level, u8 *description)
+{
+ const struct kvec *iov;
+ u8 *data;
+
+ iov = msg->msg_iter.kvec;
+ data = iov->iov_base;
+ *level = data[0];
+ *description = data[1];
+
+ trace_tls_alert_recv(sk, *level, *description);
+}
+EXPORT_SYMBOL(tls_alert_recv);
diff --git a/net/handshake/genl.c b/net/handshake/genl.c
new file mode 100644
index 000000000000..870612609491
--- /dev/null
+++ b/net/handshake/genl.c
@@ -0,0 +1,59 @@
+// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause)
+/* Do not edit directly, auto-generated from: */
+/* Documentation/netlink/specs/handshake.yaml */
+/* YNL-GEN kernel source */
+/* To regenerate run: tools/net/ynl/ynl-regen.sh */
+
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+#include "genl.h"
+
+#include <uapi/linux/handshake.h>
+
+/* HANDSHAKE_CMD_ACCEPT - do */
+static const struct nla_policy handshake_accept_nl_policy[HANDSHAKE_A_ACCEPT_HANDLER_CLASS + 1] = {
+ [HANDSHAKE_A_ACCEPT_HANDLER_CLASS] = NLA_POLICY_MAX(NLA_U32, 2),
+};
+
+/* HANDSHAKE_CMD_DONE - do */
+static const struct nla_policy handshake_done_nl_policy[HANDSHAKE_A_DONE_REMOTE_AUTH + 1] = {
+ [HANDSHAKE_A_DONE_STATUS] = { .type = NLA_U32, },
+ [HANDSHAKE_A_DONE_SOCKFD] = { .type = NLA_S32, },
+ [HANDSHAKE_A_DONE_REMOTE_AUTH] = { .type = NLA_U32, },
+};
+
+/* Ops table for handshake */
+static const struct genl_split_ops handshake_nl_ops[] = {
+ {
+ .cmd = HANDSHAKE_CMD_ACCEPT,
+ .doit = handshake_nl_accept_doit,
+ .policy = handshake_accept_nl_policy,
+ .maxattr = HANDSHAKE_A_ACCEPT_HANDLER_CLASS,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = HANDSHAKE_CMD_DONE,
+ .doit = handshake_nl_done_doit,
+ .policy = handshake_done_nl_policy,
+ .maxattr = HANDSHAKE_A_DONE_REMOTE_AUTH,
+ .flags = GENL_CMD_CAP_DO,
+ },
+};
+
+static const struct genl_multicast_group handshake_nl_mcgrps[] = {
+ [HANDSHAKE_NLGRP_NONE] = { "none", },
+ [HANDSHAKE_NLGRP_TLSHD] = { "tlshd", },
+};
+
+struct genl_family handshake_nl_family __ro_after_init = {
+ .name = HANDSHAKE_FAMILY_NAME,
+ .version = HANDSHAKE_FAMILY_VERSION,
+ .netnsok = true,
+ .parallel_ops = true,
+ .module = THIS_MODULE,
+ .split_ops = handshake_nl_ops,
+ .n_split_ops = ARRAY_SIZE(handshake_nl_ops),
+ .mcgrps = handshake_nl_mcgrps,
+ .n_mcgrps = ARRAY_SIZE(handshake_nl_mcgrps),
+};
diff --git a/net/handshake/genl.h b/net/handshake/genl.h
new file mode 100644
index 000000000000..8d3e18672daf
--- /dev/null
+++ b/net/handshake/genl.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */
+/* Do not edit directly, auto-generated from: */
+/* Documentation/netlink/specs/handshake.yaml */
+/* YNL-GEN kernel header */
+/* To regenerate run: tools/net/ynl/ynl-regen.sh */
+
+#ifndef _LINUX_HANDSHAKE_GEN_H
+#define _LINUX_HANDSHAKE_GEN_H
+
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+#include <uapi/linux/handshake.h>
+
+int handshake_nl_accept_doit(struct sk_buff *skb, struct genl_info *info);
+int handshake_nl_done_doit(struct sk_buff *skb, struct genl_info *info);
+
+enum {
+ HANDSHAKE_NLGRP_NONE,
+ HANDSHAKE_NLGRP_TLSHD,
+};
+
+extern struct genl_family handshake_nl_family;
+
+#endif /* _LINUX_HANDSHAKE_GEN_H */
diff --git a/net/handshake/handshake-test.c b/net/handshake/handshake-test.c
new file mode 100644
index 000000000000..55442b2f518a
--- /dev/null
+++ b/net/handshake/handshake-test.c
@@ -0,0 +1,540 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2023 Oracle and/or its affiliates.
+ *
+ * KUnit test of the handshake upcall mechanism.
+ */
+
+#include <kunit/test.h>
+#include <kunit/visibility.h>
+
+#include <linux/kernel.h>
+
+#include <net/sock.h>
+#include <net/genetlink.h>
+#include <net/netns/generic.h>
+
+#include <uapi/linux/handshake.h>
+#include "handshake.h"
+
+MODULE_IMPORT_NS("EXPORTED_FOR_KUNIT_TESTING");
+
+static int test_accept_func(struct handshake_req *req, struct genl_info *info,
+ int fd)
+{
+ return 0;
+}
+
+static void test_done_func(struct handshake_req *req, unsigned int status,
+ struct genl_info *info)
+{
+}
+
+struct handshake_req_alloc_test_param {
+ const char *desc;
+ struct handshake_proto *proto;
+ gfp_t gfp;
+ bool expect_success;
+};
+
+static struct handshake_proto handshake_req_alloc_proto_2 = {
+ .hp_handler_class = HANDSHAKE_HANDLER_CLASS_NONE,
+};
+
+static struct handshake_proto handshake_req_alloc_proto_3 = {
+ .hp_handler_class = HANDSHAKE_HANDLER_CLASS_MAX,
+};
+
+static struct handshake_proto handshake_req_alloc_proto_4 = {
+ .hp_handler_class = HANDSHAKE_HANDLER_CLASS_TLSHD,
+};
+
+static struct handshake_proto handshake_req_alloc_proto_5 = {
+ .hp_handler_class = HANDSHAKE_HANDLER_CLASS_TLSHD,
+ .hp_accept = test_accept_func,
+};
+
+static struct handshake_proto handshake_req_alloc_proto_6 = {
+ .hp_handler_class = HANDSHAKE_HANDLER_CLASS_TLSHD,
+ .hp_privsize = UINT_MAX,
+ .hp_accept = test_accept_func,
+ .hp_done = test_done_func,
+};
+
+static struct handshake_proto handshake_req_alloc_proto_good = {
+ .hp_handler_class = HANDSHAKE_HANDLER_CLASS_TLSHD,
+ .hp_accept = test_accept_func,
+ .hp_done = test_done_func,
+};
+
+static const
+struct handshake_req_alloc_test_param handshake_req_alloc_params[] = {
+ {
+ .desc = "handshake_req_alloc NULL proto",
+ .proto = NULL,
+ .gfp = GFP_KERNEL,
+ .expect_success = false,
+ },
+ {
+ .desc = "handshake_req_alloc CLASS_NONE",
+ .proto = &handshake_req_alloc_proto_2,
+ .gfp = GFP_KERNEL,
+ .expect_success = false,
+ },
+ {
+ .desc = "handshake_req_alloc CLASS_MAX",
+ .proto = &handshake_req_alloc_proto_3,
+ .gfp = GFP_KERNEL,
+ .expect_success = false,
+ },
+ {
+ .desc = "handshake_req_alloc no callbacks",
+ .proto = &handshake_req_alloc_proto_4,
+ .gfp = GFP_KERNEL,
+ .expect_success = false,
+ },
+ {
+ .desc = "handshake_req_alloc no done callback",
+ .proto = &handshake_req_alloc_proto_5,
+ .gfp = GFP_KERNEL,
+ .expect_success = false,
+ },
+ {
+ .desc = "handshake_req_alloc excessive privsize",
+ .proto = &handshake_req_alloc_proto_6,
+ .gfp = GFP_KERNEL | __GFP_NOWARN,
+ .expect_success = false,
+ },
+ {
+ .desc = "handshake_req_alloc all good",
+ .proto = &handshake_req_alloc_proto_good,
+ .gfp = GFP_KERNEL,
+ .expect_success = true,
+ },
+};
+
+static void
+handshake_req_alloc_get_desc(const struct handshake_req_alloc_test_param *param,
+ char *desc)
+{
+ strscpy(desc, param->desc, KUNIT_PARAM_DESC_SIZE);
+}
+
+/* Creates the function handshake_req_alloc_gen_params */
+KUNIT_ARRAY_PARAM(handshake_req_alloc, handshake_req_alloc_params,
+ handshake_req_alloc_get_desc);
+
+static void handshake_req_alloc_case(struct kunit *test)
+{
+ const struct handshake_req_alloc_test_param *param = test->param_value;
+ struct handshake_req *result;
+
+ /* Arrange */
+
+ /* Act */
+ result = handshake_req_alloc(param->proto, param->gfp);
+
+ /* Assert */
+ if (param->expect_success)
+ KUNIT_EXPECT_NOT_NULL(test, result);
+ else
+ KUNIT_EXPECT_NULL(test, result);
+
+ kfree(result);
+}
+
+static void handshake_req_submit_test1(struct kunit *test)
+{
+ struct socket *sock;
+ int err, result;
+
+ /* Arrange */
+ err = __sock_create(&init_net, PF_INET, SOCK_STREAM, IPPROTO_TCP,
+ &sock, 1);
+ KUNIT_ASSERT_EQ(test, err, 0);
+
+ /* Act */
+ result = handshake_req_submit(sock, NULL, GFP_KERNEL);
+
+ /* Assert */
+ KUNIT_EXPECT_EQ(test, result, -EINVAL);
+
+ sock_release(sock);
+}
+
+static void handshake_req_submit_test2(struct kunit *test)
+{
+ struct handshake_req *req;
+ int result;
+
+ /* Arrange */
+ req = handshake_req_alloc(&handshake_req_alloc_proto_good, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, req);
+
+ /* Act */
+ result = handshake_req_submit(NULL, req, GFP_KERNEL);
+
+ /* Assert */
+ KUNIT_EXPECT_EQ(test, result, -EINVAL);
+
+ /* handshake_req_submit() destroys @req on error */
+}
+
+static void handshake_req_submit_test3(struct kunit *test)
+{
+ struct handshake_req *req;
+ struct socket *sock;
+ int err, result;
+
+ /* Arrange */
+ req = handshake_req_alloc(&handshake_req_alloc_proto_good, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, req);
+
+ err = __sock_create(&init_net, PF_INET, SOCK_STREAM, IPPROTO_TCP,
+ &sock, 1);
+ KUNIT_ASSERT_EQ(test, err, 0);
+ sock->file = NULL;
+
+ /* Act */
+ result = handshake_req_submit(sock, req, GFP_KERNEL);
+
+ /* Assert */
+ KUNIT_EXPECT_EQ(test, result, -EINVAL);
+
+ /* handshake_req_submit() destroys @req on error */
+ sock_release(sock);
+}
+
+static void handshake_req_submit_test4(struct kunit *test)
+{
+ struct handshake_req *req, *result;
+ struct socket *sock;
+ struct file *filp;
+ int err;
+
+ /* Arrange */
+ req = handshake_req_alloc(&handshake_req_alloc_proto_good, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, req);
+
+ err = __sock_create(&init_net, PF_INET, SOCK_STREAM, IPPROTO_TCP,
+ &sock, 1);
+ KUNIT_ASSERT_EQ(test, err, 0);
+ filp = sock_alloc_file(sock, O_NONBLOCK, NULL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, filp);
+ KUNIT_ASSERT_NOT_NULL(test, sock->sk);
+ sock->file = filp;
+
+ err = handshake_req_submit(sock, req, GFP_KERNEL);
+ KUNIT_ASSERT_EQ(test, err, 0);
+
+ /* Act */
+ result = handshake_req_hash_lookup(sock->sk);
+
+ /* Assert */
+ KUNIT_EXPECT_NOT_NULL(test, result);
+ KUNIT_EXPECT_PTR_EQ(test, req, result);
+
+ handshake_req_cancel(sock->sk);
+ fput(filp);
+}
+
+static void handshake_req_submit_test5(struct kunit *test)
+{
+ struct handshake_req *req;
+ struct handshake_net *hn;
+ struct socket *sock;
+ struct file *filp;
+ struct net *net;
+ int saved, err;
+
+ /* Arrange */
+ req = handshake_req_alloc(&handshake_req_alloc_proto_good, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, req);
+
+ err = __sock_create(&init_net, PF_INET, SOCK_STREAM, IPPROTO_TCP,
+ &sock, 1);
+ KUNIT_ASSERT_EQ(test, err, 0);
+ filp = sock_alloc_file(sock, O_NONBLOCK, NULL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, filp);
+ KUNIT_ASSERT_NOT_NULL(test, sock->sk);
+ sock->file = filp;
+
+ net = sock_net(sock->sk);
+ hn = handshake_pernet(net);
+ KUNIT_ASSERT_NOT_NULL(test, hn);
+
+ saved = hn->hn_pending;
+ hn->hn_pending = hn->hn_pending_max + 1;
+
+ /* Act */
+ err = handshake_req_submit(sock, req, GFP_KERNEL);
+
+ /* Assert */
+ KUNIT_EXPECT_EQ(test, err, -EAGAIN);
+
+ fput(filp);
+ hn->hn_pending = saved;
+}
+
+static void handshake_req_submit_test6(struct kunit *test)
+{
+ struct handshake_req *req1, *req2;
+ struct socket *sock;
+ struct file *filp;
+ int err;
+
+ /* Arrange */
+ req1 = handshake_req_alloc(&handshake_req_alloc_proto_good, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, req1);
+ req2 = handshake_req_alloc(&handshake_req_alloc_proto_good, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, req2);
+
+ err = __sock_create(&init_net, PF_INET, SOCK_STREAM, IPPROTO_TCP,
+ &sock, 1);
+ KUNIT_ASSERT_EQ(test, err, 0);
+ filp = sock_alloc_file(sock, O_NONBLOCK, NULL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, filp);
+ KUNIT_ASSERT_NOT_NULL(test, sock->sk);
+ sock->file = filp;
+
+ /* Act */
+ err = handshake_req_submit(sock, req1, GFP_KERNEL);
+ KUNIT_ASSERT_EQ(test, err, 0);
+ err = handshake_req_submit(sock, req2, GFP_KERNEL);
+
+ /* Assert */
+ KUNIT_EXPECT_EQ(test, err, -EBUSY);
+
+ handshake_req_cancel(sock->sk);
+ fput(filp);
+}
+
+static void handshake_req_cancel_test1(struct kunit *test)
+{
+ struct handshake_req *req;
+ struct socket *sock;
+ struct file *filp;
+ bool result;
+ int err;
+
+ /* Arrange */
+ req = handshake_req_alloc(&handshake_req_alloc_proto_good, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, req);
+
+ err = __sock_create(&init_net, PF_INET, SOCK_STREAM, IPPROTO_TCP,
+ &sock, 1);
+ KUNIT_ASSERT_EQ(test, err, 0);
+
+ filp = sock_alloc_file(sock, O_NONBLOCK, NULL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, filp);
+ sock->file = filp;
+
+ err = handshake_req_submit(sock, req, GFP_KERNEL);
+ KUNIT_ASSERT_EQ(test, err, 0);
+
+ /* NB: handshake_req hasn't been accepted */
+
+ /* Act */
+ result = handshake_req_cancel(sock->sk);
+
+ /* Assert */
+ KUNIT_EXPECT_TRUE(test, result);
+
+ fput(filp);
+}
+
+static void handshake_req_cancel_test2(struct kunit *test)
+{
+ struct handshake_req *req, *next;
+ struct handshake_net *hn;
+ struct socket *sock;
+ struct file *filp;
+ struct net *net;
+ bool result;
+ int err;
+
+ /* Arrange */
+ req = handshake_req_alloc(&handshake_req_alloc_proto_good, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, req);
+
+ err = __sock_create(&init_net, PF_INET, SOCK_STREAM, IPPROTO_TCP,
+ &sock, 1);
+ KUNIT_ASSERT_EQ(test, err, 0);
+
+ filp = sock_alloc_file(sock, O_NONBLOCK, NULL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, filp);
+ sock->file = filp;
+
+ err = handshake_req_submit(sock, req, GFP_KERNEL);
+ KUNIT_ASSERT_EQ(test, err, 0);
+
+ net = sock_net(sock->sk);
+ hn = handshake_pernet(net);
+ KUNIT_ASSERT_NOT_NULL(test, hn);
+
+ /* Pretend to accept this request */
+ next = handshake_req_next(hn, HANDSHAKE_HANDLER_CLASS_TLSHD);
+ KUNIT_ASSERT_PTR_EQ(test, req, next);
+
+ /* Act */
+ result = handshake_req_cancel(sock->sk);
+
+ /* Assert */
+ KUNIT_EXPECT_TRUE(test, result);
+
+ fput(filp);
+}
+
+static void handshake_req_cancel_test3(struct kunit *test)
+{
+ struct handshake_req *req, *next;
+ struct handshake_net *hn;
+ struct socket *sock;
+ struct file *filp;
+ struct net *net;
+ bool result;
+ int err;
+
+ /* Arrange */
+ req = handshake_req_alloc(&handshake_req_alloc_proto_good, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, req);
+
+ err = __sock_create(&init_net, PF_INET, SOCK_STREAM, IPPROTO_TCP,
+ &sock, 1);
+ KUNIT_ASSERT_EQ(test, err, 0);
+
+ filp = sock_alloc_file(sock, O_NONBLOCK, NULL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, filp);
+ sock->file = filp;
+
+ err = handshake_req_submit(sock, req, GFP_KERNEL);
+ KUNIT_ASSERT_EQ(test, err, 0);
+
+ net = sock_net(sock->sk);
+ hn = handshake_pernet(net);
+ KUNIT_ASSERT_NOT_NULL(test, hn);
+
+ /* Pretend to accept this request */
+ next = handshake_req_next(hn, HANDSHAKE_HANDLER_CLASS_TLSHD);
+ KUNIT_ASSERT_PTR_EQ(test, req, next);
+
+ /* Pretend to complete this request */
+ handshake_complete(next, -ETIMEDOUT, NULL);
+
+ /* Act */
+ result = handshake_req_cancel(sock->sk);
+
+ /* Assert */
+ KUNIT_EXPECT_FALSE(test, result);
+
+ fput(filp);
+}
+
+static struct handshake_req *handshake_req_destroy_test;
+
+static void test_destroy_func(struct handshake_req *req)
+{
+ handshake_req_destroy_test = req;
+}
+
+static struct handshake_proto handshake_req_alloc_proto_destroy = {
+ .hp_handler_class = HANDSHAKE_HANDLER_CLASS_TLSHD,
+ .hp_accept = test_accept_func,
+ .hp_done = test_done_func,
+ .hp_destroy = test_destroy_func,
+};
+
+static void handshake_req_destroy_test1(struct kunit *test)
+{
+ struct handshake_req *req;
+ struct socket *sock;
+ struct file *filp;
+ int err;
+
+ /* Arrange */
+ handshake_req_destroy_test = NULL;
+
+ req = handshake_req_alloc(&handshake_req_alloc_proto_destroy, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, req);
+
+ err = __sock_create(&init_net, PF_INET, SOCK_STREAM, IPPROTO_TCP,
+ &sock, 1);
+ KUNIT_ASSERT_EQ(test, err, 0);
+
+ filp = sock_alloc_file(sock, O_NONBLOCK, NULL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, filp);
+ sock->file = filp;
+
+ err = handshake_req_submit(sock, req, GFP_KERNEL);
+ KUNIT_ASSERT_EQ(test, err, 0);
+
+ handshake_req_cancel(sock->sk);
+
+ /* Act */
+ /* Ensure the close/release/put process has run to
+ * completion before checking the result.
+ */
+ __fput_sync(filp);
+
+ /* Assert */
+ KUNIT_EXPECT_PTR_EQ(test, handshake_req_destroy_test, req);
+}
+
+static struct kunit_case handshake_api_test_cases[] = {
+ {
+ .name = "req_alloc API fuzzing",
+ .run_case = handshake_req_alloc_case,
+ .generate_params = handshake_req_alloc_gen_params,
+ },
+ {
+ .name = "req_submit NULL req arg",
+ .run_case = handshake_req_submit_test1,
+ },
+ {
+ .name = "req_submit NULL sock arg",
+ .run_case = handshake_req_submit_test2,
+ },
+ {
+ .name = "req_submit NULL sock->file",
+ .run_case = handshake_req_submit_test3,
+ },
+ {
+ .name = "req_lookup works",
+ .run_case = handshake_req_submit_test4,
+ },
+ {
+ .name = "req_submit max pending",
+ .run_case = handshake_req_submit_test5,
+ },
+ {
+ .name = "req_submit multiple",
+ .run_case = handshake_req_submit_test6,
+ },
+ {
+ .name = "req_cancel before accept",
+ .run_case = handshake_req_cancel_test1,
+ },
+ {
+ .name = "req_cancel after accept",
+ .run_case = handshake_req_cancel_test2,
+ },
+ {
+ .name = "req_cancel after done",
+ .run_case = handshake_req_cancel_test3,
+ },
+ {
+ .name = "req_destroy works",
+ .run_case = handshake_req_destroy_test1,
+ },
+ {}
+};
+
+static struct kunit_suite handshake_api_suite = {
+ .name = "Handshake API tests",
+ .test_cases = handshake_api_test_cases,
+};
+
+kunit_test_suites(&handshake_api_suite);
+
+MODULE_DESCRIPTION("Test handshake upcall API functions");
+MODULE_LICENSE("GPL");
diff --git a/net/handshake/handshake.h b/net/handshake/handshake.h
new file mode 100644
index 000000000000..a48163765a7a
--- /dev/null
+++ b/net/handshake/handshake.h
@@ -0,0 +1,93 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Generic netlink handshake service
+ *
+ * Author: Chuck Lever <chuck.lever@oracle.com>
+ *
+ * Copyright (c) 2023, Oracle and/or its affiliates.
+ */
+
+#ifndef _INTERNAL_HANDSHAKE_H
+#define _INTERNAL_HANDSHAKE_H
+
+/* Per-net namespace context */
+struct handshake_net {
+ spinlock_t hn_lock; /* protects next 3 fields */
+ int hn_pending;
+ int hn_pending_max;
+ struct list_head hn_requests;
+
+ unsigned long hn_flags;
+};
+
+enum hn_flags_bits {
+ HANDSHAKE_F_NET_DRAINING,
+};
+
+struct handshake_proto;
+
+/* One handshake request */
+struct handshake_req {
+ struct list_head hr_list;
+ struct rhash_head hr_rhash;
+ unsigned long hr_flags;
+ const struct handshake_proto *hr_proto;
+ struct sock *hr_sk;
+ void (*hr_odestruct)(struct sock *sk);
+
+ /* Always the last field */
+ char hr_priv[];
+};
+
+enum hr_flags_bits {
+ HANDSHAKE_F_REQ_COMPLETED,
+ HANDSHAKE_F_REQ_SESSION,
+};
+
+struct genl_info;
+
+/* Invariants for all handshake requests for one transport layer
+ * security protocol
+ */
+struct handshake_proto {
+ int hp_handler_class;
+ size_t hp_privsize;
+ unsigned long hp_flags;
+
+ int (*hp_accept)(struct handshake_req *req,
+ struct genl_info *info, int fd);
+ void (*hp_done)(struct handshake_req *req,
+ unsigned int status,
+ struct genl_info *info);
+ void (*hp_destroy)(struct handshake_req *req);
+};
+
+enum hp_flags_bits {
+ HANDSHAKE_F_PROTO_NOTIFY,
+};
+
+/* alert.c */
+int tls_alert_send(struct socket *sock, u8 level, u8 description);
+
+/* netlink.c */
+int handshake_genl_notify(struct net *net, const struct handshake_proto *proto,
+ gfp_t flags);
+struct nlmsghdr *handshake_genl_put(struct sk_buff *msg,
+ struct genl_info *info);
+struct handshake_net *handshake_pernet(struct net *net);
+
+/* request.c */
+struct handshake_req *handshake_req_alloc(const struct handshake_proto *proto,
+ gfp_t flags);
+int handshake_req_hash_init(void);
+void handshake_req_hash_destroy(void);
+void *handshake_req_private(struct handshake_req *req);
+struct handshake_req *handshake_req_hash_lookup(struct sock *sk);
+struct handshake_req *handshake_req_next(struct handshake_net *hn, int class);
+int handshake_req_submit(struct socket *sock, struct handshake_req *req,
+ gfp_t flags);
+void handshake_complete(struct handshake_req *req, unsigned int status,
+ struct genl_info *info);
+bool handshake_req_cancel(struct sock *sk);
+
+#endif /* _INTERNAL_HANDSHAKE_H */
diff --git a/net/handshake/netlink.c b/net/handshake/netlink.c
new file mode 100644
index 000000000000..1d33a4675a48
--- /dev/null
+++ b/net/handshake/netlink.c
@@ -0,0 +1,289 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Generic netlink handshake service
+ *
+ * Author: Chuck Lever <chuck.lever@oracle.com>
+ *
+ * Copyright (c) 2023, Oracle and/or its affiliates.
+ */
+
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/mm.h>
+
+#include <net/sock.h>
+#include <net/genetlink.h>
+#include <net/netns/generic.h>
+
+#include <kunit/visibility.h>
+
+#include <uapi/linux/handshake.h>
+#include "handshake.h"
+#include "genl.h"
+
+#include <trace/events/handshake.h>
+
+/**
+ * handshake_genl_notify - Notify handlers that a request is waiting
+ * @net: target network namespace
+ * @proto: handshake protocol
+ * @flags: memory allocation control flags
+ *
+ * Returns zero on success or a negative errno if notification failed.
+ */
+int handshake_genl_notify(struct net *net, const struct handshake_proto *proto,
+ gfp_t flags)
+{
+ struct sk_buff *msg;
+ void *hdr;
+
+ /* Disable notifications during unit testing */
+ if (!test_bit(HANDSHAKE_F_PROTO_NOTIFY, &proto->hp_flags))
+ return 0;
+
+ if (!genl_has_listeners(&handshake_nl_family, net,
+ proto->hp_handler_class))
+ return -ESRCH;
+
+ msg = genlmsg_new(GENLMSG_DEFAULT_SIZE, flags);
+ if (!msg)
+ return -ENOMEM;
+
+ hdr = genlmsg_put(msg, 0, 0, &handshake_nl_family, 0,
+ HANDSHAKE_CMD_READY);
+ if (!hdr)
+ goto out_free;
+
+ if (nla_put_u32(msg, HANDSHAKE_A_ACCEPT_HANDLER_CLASS,
+ proto->hp_handler_class) < 0) {
+ genlmsg_cancel(msg, hdr);
+ goto out_free;
+ }
+
+ genlmsg_end(msg, hdr);
+ return genlmsg_multicast_netns(&handshake_nl_family, net, msg,
+ 0, proto->hp_handler_class, flags);
+
+out_free:
+ nlmsg_free(msg);
+ return -EMSGSIZE;
+}
+
+/**
+ * handshake_genl_put - Create a generic netlink message header
+ * @msg: buffer in which to create the header
+ * @info: generic netlink message context
+ *
+ * Returns a ready-to-use header, or NULL.
+ */
+struct nlmsghdr *handshake_genl_put(struct sk_buff *msg,
+ struct genl_info *info)
+{
+ return genlmsg_put(msg, info->snd_portid, info->snd_seq,
+ &handshake_nl_family, 0, info->genlhdr->cmd);
+}
+EXPORT_SYMBOL(handshake_genl_put);
+
+int handshake_nl_accept_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net *net = sock_net(skb->sk);
+ struct handshake_net *hn = handshake_pernet(net);
+ struct handshake_req *req = NULL;
+ struct socket *sock;
+ int class, err;
+
+ err = -EOPNOTSUPP;
+ if (!hn)
+ goto out_status;
+
+ err = -EINVAL;
+ if (GENL_REQ_ATTR_CHECK(info, HANDSHAKE_A_ACCEPT_HANDLER_CLASS))
+ goto out_status;
+ class = nla_get_u32(info->attrs[HANDSHAKE_A_ACCEPT_HANDLER_CLASS]);
+
+ err = -EAGAIN;
+ req = handshake_req_next(hn, class);
+ if (req) {
+ sock = req->hr_sk->sk_socket;
+
+ FD_PREPARE(fdf, O_CLOEXEC, sock->file);
+ if (fdf.err) {
+ err = fdf.err;
+ goto out_complete;
+ }
+
+ get_file(sock->file); /* FD_PREPARE() consumes a reference. */
+ err = req->hr_proto->hp_accept(req, info, fd_prepare_fd(fdf));
+ if (err)
+ goto out_complete; /* Automatic cleanup handles fput */
+
+ trace_handshake_cmd_accept(net, req, req->hr_sk, fd_prepare_fd(fdf));
+ fd_publish(fdf);
+ return 0;
+ }
+
+out_complete:
+ handshake_complete(req, -EIO, NULL);
+out_status:
+ trace_handshake_cmd_accept_err(net, req, NULL, err);
+ return err;
+}
+
+int handshake_nl_done_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net *net = sock_net(skb->sk);
+ struct handshake_req *req;
+ struct socket *sock;
+ int fd, status, err;
+
+ if (GENL_REQ_ATTR_CHECK(info, HANDSHAKE_A_DONE_SOCKFD))
+ return -EINVAL;
+ fd = nla_get_s32(info->attrs[HANDSHAKE_A_DONE_SOCKFD]);
+
+ sock = sockfd_lookup(fd, &err);
+ if (!sock)
+ return err;
+
+ req = handshake_req_hash_lookup(sock->sk);
+ if (!req) {
+ err = -EBUSY;
+ trace_handshake_cmd_done_err(net, req, sock->sk, err);
+ sockfd_put(sock);
+ return err;
+ }
+
+ trace_handshake_cmd_done(net, req, sock->sk, fd);
+
+ status = -EIO;
+ if (info->attrs[HANDSHAKE_A_DONE_STATUS])
+ status = nla_get_u32(info->attrs[HANDSHAKE_A_DONE_STATUS]);
+
+ handshake_complete(req, status, info);
+ sockfd_put(sock);
+ return 0;
+}
+
+static unsigned int handshake_net_id;
+
+static int __net_init handshake_net_init(struct net *net)
+{
+ struct handshake_net *hn = net_generic(net, handshake_net_id);
+ unsigned long tmp;
+ struct sysinfo si;
+
+ /*
+ * Arbitrary limit to prevent handshakes that do not make
+ * progress from clogging up the system. The cap scales up
+ * with the amount of physical memory on the system.
+ */
+ si_meminfo(&si);
+ tmp = si.totalram / (25 * si.mem_unit);
+ hn->hn_pending_max = clamp(tmp, 3UL, 50UL);
+
+ spin_lock_init(&hn->hn_lock);
+ hn->hn_pending = 0;
+ hn->hn_flags = 0;
+ INIT_LIST_HEAD(&hn->hn_requests);
+ return 0;
+}
+
+static void __net_exit handshake_net_exit(struct net *net)
+{
+ struct handshake_net *hn = net_generic(net, handshake_net_id);
+ struct handshake_req *req;
+ LIST_HEAD(requests);
+
+ /*
+ * Drain the net's pending list. Requests that have been
+ * accepted and are in progress will be destroyed when
+ * the socket is closed.
+ */
+ spin_lock(&hn->hn_lock);
+ set_bit(HANDSHAKE_F_NET_DRAINING, &hn->hn_flags);
+ list_splice_init(&requests, &hn->hn_requests);
+ spin_unlock(&hn->hn_lock);
+
+ while (!list_empty(&requests)) {
+ req = list_first_entry(&requests, struct handshake_req, hr_list);
+ list_del(&req->hr_list);
+
+ /*
+ * Requests on this list have not yet been
+ * accepted, so they do not have an fd to put.
+ */
+
+ handshake_complete(req, -ETIMEDOUT, NULL);
+ }
+}
+
+static struct pernet_operations handshake_genl_net_ops = {
+ .init = handshake_net_init,
+ .exit = handshake_net_exit,
+ .id = &handshake_net_id,
+ .size = sizeof(struct handshake_net),
+};
+
+/**
+ * handshake_pernet - Get the handshake private per-net structure
+ * @net: network namespace
+ *
+ * Returns a pointer to the net's private per-net structure for the
+ * handshake module, or NULL if handshake_init() failed.
+ */
+struct handshake_net *handshake_pernet(struct net *net)
+{
+ return handshake_net_id ?
+ net_generic(net, handshake_net_id) : NULL;
+}
+EXPORT_SYMBOL_IF_KUNIT(handshake_pernet);
+
+static int __init handshake_init(void)
+{
+ int ret;
+
+ ret = handshake_req_hash_init();
+ if (ret) {
+ pr_warn("handshake: hash initialization failed (%d)\n", ret);
+ return ret;
+ }
+
+ ret = genl_register_family(&handshake_nl_family);
+ if (ret) {
+ pr_warn("handshake: netlink registration failed (%d)\n", ret);
+ handshake_req_hash_destroy();
+ return ret;
+ }
+
+ /*
+ * ORDER: register_pernet_subsys must be done last.
+ *
+ * If initialization does not make it past pernet_subsys
+ * registration, then handshake_net_id will remain 0. That
+ * shunts the handshake consumer API to return ENOTSUPP
+ * to prevent it from dereferencing something that hasn't
+ * been allocated.
+ */
+ ret = register_pernet_subsys(&handshake_genl_net_ops);
+ if (ret) {
+ pr_warn("handshake: pernet registration failed (%d)\n", ret);
+ genl_unregister_family(&handshake_nl_family);
+ handshake_req_hash_destroy();
+ }
+
+ return ret;
+}
+
+static void __exit handshake_exit(void)
+{
+ unregister_pernet_subsys(&handshake_genl_net_ops);
+ handshake_net_id = 0;
+
+ handshake_req_hash_destroy();
+ genl_unregister_family(&handshake_nl_family);
+}
+
+module_init(handshake_init);
+module_exit(handshake_exit);
diff --git a/net/handshake/request.c b/net/handshake/request.c
new file mode 100644
index 000000000000..274d2c89b6b2
--- /dev/null
+++ b/net/handshake/request.c
@@ -0,0 +1,343 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Handshake request lifetime events
+ *
+ * Author: Chuck Lever <chuck.lever@oracle.com>
+ *
+ * Copyright (c) 2023, Oracle and/or its affiliates.
+ */
+
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/inet.h>
+#include <linux/rhashtable.h>
+
+#include <net/sock.h>
+#include <net/genetlink.h>
+#include <net/netns/generic.h>
+
+#include <kunit/visibility.h>
+
+#include <uapi/linux/handshake.h>
+#include "handshake.h"
+
+#include <trace/events/handshake.h>
+
+/*
+ * We need both a handshake_req -> sock mapping, and a sock ->
+ * handshake_req mapping. Both are one-to-one.
+ *
+ * To avoid adding another pointer field to struct sock, net/handshake
+ * maintains a hash table, indexed by the memory address of @sock, to
+ * find the struct handshake_req outstanding for that socket. The
+ * reverse direction uses a simple pointer field in the handshake_req
+ * struct.
+ */
+
+static struct rhashtable handshake_rhashtbl ____cacheline_aligned_in_smp;
+
+static const struct rhashtable_params handshake_rhash_params = {
+ .key_len = sizeof_field(struct handshake_req, hr_sk),
+ .key_offset = offsetof(struct handshake_req, hr_sk),
+ .head_offset = offsetof(struct handshake_req, hr_rhash),
+ .automatic_shrinking = true,
+};
+
+int handshake_req_hash_init(void)
+{
+ return rhashtable_init(&handshake_rhashtbl, &handshake_rhash_params);
+}
+
+void handshake_req_hash_destroy(void)
+{
+ rhashtable_destroy(&handshake_rhashtbl);
+}
+
+struct handshake_req *handshake_req_hash_lookup(struct sock *sk)
+{
+ return rhashtable_lookup_fast(&handshake_rhashtbl, &sk,
+ handshake_rhash_params);
+}
+EXPORT_SYMBOL_IF_KUNIT(handshake_req_hash_lookup);
+
+static bool handshake_req_hash_add(struct handshake_req *req)
+{
+ int ret;
+
+ ret = rhashtable_lookup_insert_fast(&handshake_rhashtbl,
+ &req->hr_rhash,
+ handshake_rhash_params);
+ return ret == 0;
+}
+
+static void handshake_req_destroy(struct handshake_req *req)
+{
+ if (req->hr_proto->hp_destroy)
+ req->hr_proto->hp_destroy(req);
+ rhashtable_remove_fast(&handshake_rhashtbl, &req->hr_rhash,
+ handshake_rhash_params);
+ kfree(req);
+}
+
+static void handshake_sk_destruct(struct sock *sk)
+{
+ void (*sk_destruct)(struct sock *sk);
+ struct handshake_req *req;
+
+ req = handshake_req_hash_lookup(sk);
+ if (!req)
+ return;
+
+ trace_handshake_destruct(sock_net(sk), req, sk);
+ sk_destruct = req->hr_odestruct;
+ handshake_req_destroy(req);
+ if (sk_destruct)
+ sk_destruct(sk);
+}
+
+/**
+ * handshake_req_alloc - Allocate a handshake request
+ * @proto: security protocol
+ * @flags: memory allocation flags
+ *
+ * Returns an initialized handshake_req or NULL.
+ */
+struct handshake_req *handshake_req_alloc(const struct handshake_proto *proto,
+ gfp_t flags)
+{
+ struct handshake_req *req;
+
+ if (!proto)
+ return NULL;
+ if (proto->hp_handler_class <= HANDSHAKE_HANDLER_CLASS_NONE)
+ return NULL;
+ if (proto->hp_handler_class >= HANDSHAKE_HANDLER_CLASS_MAX)
+ return NULL;
+ if (!proto->hp_accept || !proto->hp_done)
+ return NULL;
+
+ req = kzalloc(struct_size(req, hr_priv, proto->hp_privsize), flags);
+ if (!req)
+ return NULL;
+
+ INIT_LIST_HEAD(&req->hr_list);
+ req->hr_proto = proto;
+ return req;
+}
+EXPORT_SYMBOL(handshake_req_alloc);
+
+/**
+ * handshake_req_private - Get per-handshake private data
+ * @req: handshake arguments
+ *
+ */
+void *handshake_req_private(struct handshake_req *req)
+{
+ return (void *)&req->hr_priv;
+}
+EXPORT_SYMBOL(handshake_req_private);
+
+static bool __add_pending_locked(struct handshake_net *hn,
+ struct handshake_req *req)
+{
+ if (WARN_ON_ONCE(!list_empty(&req->hr_list)))
+ return false;
+ hn->hn_pending++;
+ list_add_tail(&req->hr_list, &hn->hn_requests);
+ return true;
+}
+
+static void __remove_pending_locked(struct handshake_net *hn,
+ struct handshake_req *req)
+{
+ hn->hn_pending--;
+ list_del_init(&req->hr_list);
+}
+
+/*
+ * Returns %true if the request was found on @net's pending list,
+ * otherwise %false.
+ *
+ * If @req was on a pending list, it has not yet been accepted.
+ */
+static bool remove_pending(struct handshake_net *hn, struct handshake_req *req)
+{
+ bool ret = false;
+
+ spin_lock(&hn->hn_lock);
+ if (!list_empty(&req->hr_list)) {
+ __remove_pending_locked(hn, req);
+ ret = true;
+ }
+ spin_unlock(&hn->hn_lock);
+
+ return ret;
+}
+
+struct handshake_req *handshake_req_next(struct handshake_net *hn, int class)
+{
+ struct handshake_req *req, *pos;
+
+ req = NULL;
+ spin_lock(&hn->hn_lock);
+ list_for_each_entry(pos, &hn->hn_requests, hr_list) {
+ if (pos->hr_proto->hp_handler_class != class)
+ continue;
+ __remove_pending_locked(hn, pos);
+ req = pos;
+ break;
+ }
+ spin_unlock(&hn->hn_lock);
+
+ return req;
+}
+EXPORT_SYMBOL_IF_KUNIT(handshake_req_next);
+
+/**
+ * handshake_req_submit - Submit a handshake request
+ * @sock: open socket on which to perform the handshake
+ * @req: handshake arguments
+ * @flags: memory allocation flags
+ *
+ * Return values:
+ * %0: Request queued
+ * %-EINVAL: Invalid argument
+ * %-EBUSY: A handshake is already under way for this socket
+ * %-ESRCH: No handshake agent is available
+ * %-EAGAIN: Too many pending handshake requests
+ * %-ENOMEM: Failed to allocate memory
+ * %-EMSGSIZE: Failed to construct notification message
+ * %-EOPNOTSUPP: Handshake module not initialized
+ *
+ * A zero return value from handshake_req_submit() means that
+ * exactly one subsequent completion callback is guaranteed.
+ *
+ * A negative return value from handshake_req_submit() means that
+ * no completion callback will be done and that @req has been
+ * destroyed.
+ */
+int handshake_req_submit(struct socket *sock, struct handshake_req *req,
+ gfp_t flags)
+{
+ struct handshake_net *hn;
+ struct net *net;
+ int ret;
+
+ if (!sock || !req || !sock->file) {
+ kfree(req);
+ return -EINVAL;
+ }
+
+ req->hr_sk = sock->sk;
+ if (!req->hr_sk) {
+ kfree(req);
+ return -EINVAL;
+ }
+ req->hr_odestruct = req->hr_sk->sk_destruct;
+ req->hr_sk->sk_destruct = handshake_sk_destruct;
+
+ ret = -EOPNOTSUPP;
+ net = sock_net(req->hr_sk);
+ hn = handshake_pernet(net);
+ if (!hn)
+ goto out_err;
+
+ ret = -EAGAIN;
+ if (READ_ONCE(hn->hn_pending) >= hn->hn_pending_max)
+ goto out_err;
+
+ spin_lock(&hn->hn_lock);
+ ret = -EOPNOTSUPP;
+ if (test_bit(HANDSHAKE_F_NET_DRAINING, &hn->hn_flags))
+ goto out_unlock;
+ ret = -EBUSY;
+ if (!handshake_req_hash_add(req))
+ goto out_unlock;
+ if (!__add_pending_locked(hn, req))
+ goto out_unlock;
+ spin_unlock(&hn->hn_lock);
+
+ ret = handshake_genl_notify(net, req->hr_proto, flags);
+ if (ret) {
+ trace_handshake_notify_err(net, req, req->hr_sk, ret);
+ if (remove_pending(hn, req))
+ goto out_err;
+ }
+
+ /* Prevent socket release while a handshake request is pending */
+ sock_hold(req->hr_sk);
+
+ trace_handshake_submit(net, req, req->hr_sk);
+ return 0;
+
+out_unlock:
+ spin_unlock(&hn->hn_lock);
+out_err:
+ trace_handshake_submit_err(net, req, req->hr_sk, ret);
+ handshake_req_destroy(req);
+ return ret;
+}
+EXPORT_SYMBOL(handshake_req_submit);
+
+void handshake_complete(struct handshake_req *req, unsigned int status,
+ struct genl_info *info)
+{
+ struct sock *sk = req->hr_sk;
+ struct net *net = sock_net(sk);
+
+ if (!test_and_set_bit(HANDSHAKE_F_REQ_COMPLETED, &req->hr_flags)) {
+ trace_handshake_complete(net, req, sk, status);
+ req->hr_proto->hp_done(req, status, info);
+
+ /* Handshake request is no longer pending */
+ sock_put(sk);
+ }
+}
+EXPORT_SYMBOL_IF_KUNIT(handshake_complete);
+
+/**
+ * handshake_req_cancel - Cancel an in-progress handshake
+ * @sk: socket on which there is an ongoing handshake
+ *
+ * Request cancellation races with request completion. To determine
+ * who won, callers examine the return value from this function.
+ *
+ * Return values:
+ * %true - Uncompleted handshake request was canceled
+ * %false - Handshake request already completed or not found
+ */
+bool handshake_req_cancel(struct sock *sk)
+{
+ struct handshake_req *req;
+ struct handshake_net *hn;
+ struct net *net;
+
+ net = sock_net(sk);
+ req = handshake_req_hash_lookup(sk);
+ if (!req) {
+ trace_handshake_cancel_none(net, req, sk);
+ return false;
+ }
+
+ hn = handshake_pernet(net);
+ if (hn && remove_pending(hn, req)) {
+ /* Request hadn't been accepted */
+ goto out_true;
+ }
+ if (test_and_set_bit(HANDSHAKE_F_REQ_COMPLETED, &req->hr_flags)) {
+ /* Request already completed */
+ trace_handshake_cancel_busy(net, req, sk);
+ return false;
+ }
+
+out_true:
+ trace_handshake_cancel(net, req, sk);
+
+ /* Handshake request is no longer pending */
+ sock_put(sk);
+ return true;
+}
+EXPORT_SYMBOL(handshake_req_cancel);
diff --git a/net/handshake/tlshd.c b/net/handshake/tlshd.c
new file mode 100644
index 000000000000..8f9532a15f43
--- /dev/null
+++ b/net/handshake/tlshd.c
@@ -0,0 +1,455 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Establish a TLS session for a kernel socket consumer
+ * using the tlshd user space handler.
+ *
+ * Author: Chuck Lever <chuck.lever@oracle.com>
+ *
+ * Copyright (c) 2021-2023, Oracle and/or its affiliates.
+ */
+
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/key.h>
+
+#include <net/sock.h>
+#include <net/handshake.h>
+#include <net/genetlink.h>
+#include <net/tls_prot.h>
+
+#include <uapi/linux/keyctl.h>
+#include <uapi/linux/handshake.h>
+#include "handshake.h"
+
+struct tls_handshake_req {
+ void (*th_consumer_done)(void *data, int status,
+ key_serial_t peerid);
+ void *th_consumer_data;
+
+ int th_type;
+ unsigned int th_timeout_ms;
+ int th_auth_mode;
+ const char *th_peername;
+ key_serial_t th_keyring;
+ key_serial_t th_certificate;
+ key_serial_t th_privkey;
+
+ unsigned int th_num_peerids;
+ key_serial_t th_peerid[5];
+};
+
+static struct tls_handshake_req *
+tls_handshake_req_init(struct handshake_req *req,
+ const struct tls_handshake_args *args)
+{
+ struct tls_handshake_req *treq = handshake_req_private(req);
+
+ treq->th_timeout_ms = args->ta_timeout_ms;
+ treq->th_consumer_done = args->ta_done;
+ treq->th_consumer_data = args->ta_data;
+ treq->th_peername = args->ta_peername;
+ treq->th_keyring = args->ta_keyring;
+ treq->th_num_peerids = 0;
+ treq->th_certificate = TLS_NO_CERT;
+ treq->th_privkey = TLS_NO_PRIVKEY;
+ return treq;
+}
+
+static void tls_handshake_remote_peerids(struct tls_handshake_req *treq,
+ struct genl_info *info)
+{
+ struct nlattr *head = nlmsg_attrdata(info->nlhdr, GENL_HDRLEN);
+ int rem, len = nlmsg_attrlen(info->nlhdr, GENL_HDRLEN);
+ struct nlattr *nla;
+ unsigned int i;
+
+ i = 0;
+ nla_for_each_attr(nla, head, len, rem) {
+ if (nla_type(nla) == HANDSHAKE_A_DONE_REMOTE_AUTH)
+ i++;
+ }
+ if (!i)
+ return;
+ treq->th_num_peerids = min_t(unsigned int, i,
+ ARRAY_SIZE(treq->th_peerid));
+
+ i = 0;
+ nla_for_each_attr(nla, head, len, rem) {
+ if (nla_type(nla) == HANDSHAKE_A_DONE_REMOTE_AUTH)
+ treq->th_peerid[i++] = nla_get_u32(nla);
+ if (i >= treq->th_num_peerids)
+ break;
+ }
+}
+
+/**
+ * tls_handshake_done - callback to handle a CMD_DONE request
+ * @req: socket on which the handshake was performed
+ * @status: session status code
+ * @info: full results of session establishment
+ *
+ */
+static void tls_handshake_done(struct handshake_req *req,
+ unsigned int status, struct genl_info *info)
+{
+ struct tls_handshake_req *treq = handshake_req_private(req);
+
+ treq->th_peerid[0] = TLS_NO_PEERID;
+ if (info)
+ tls_handshake_remote_peerids(treq, info);
+
+ if (!status)
+ set_bit(HANDSHAKE_F_REQ_SESSION, &req->hr_flags);
+
+ treq->th_consumer_done(treq->th_consumer_data, -status,
+ treq->th_peerid[0]);
+}
+
+#if IS_ENABLED(CONFIG_KEYS)
+static int tls_handshake_private_keyring(struct tls_handshake_req *treq)
+{
+ key_ref_t process_keyring_ref, keyring_ref;
+ int ret;
+
+ if (treq->th_keyring == TLS_NO_KEYRING)
+ return 0;
+
+ process_keyring_ref = lookup_user_key(KEY_SPEC_PROCESS_KEYRING,
+ KEY_LOOKUP_CREATE,
+ KEY_NEED_WRITE);
+ if (IS_ERR(process_keyring_ref)) {
+ ret = PTR_ERR(process_keyring_ref);
+ goto out;
+ }
+
+ keyring_ref = lookup_user_key(treq->th_keyring, KEY_LOOKUP_CREATE,
+ KEY_NEED_LINK);
+ if (IS_ERR(keyring_ref)) {
+ ret = PTR_ERR(keyring_ref);
+ goto out_put_key;
+ }
+
+ ret = key_link(key_ref_to_ptr(process_keyring_ref),
+ key_ref_to_ptr(keyring_ref));
+
+ key_ref_put(keyring_ref);
+out_put_key:
+ key_ref_put(process_keyring_ref);
+out:
+ return ret;
+}
+#else
+static int tls_handshake_private_keyring(struct tls_handshake_req *treq)
+{
+ return 0;
+}
+#endif
+
+static int tls_handshake_put_peer_identity(struct sk_buff *msg,
+ struct tls_handshake_req *treq)
+{
+ unsigned int i;
+
+ for (i = 0; i < treq->th_num_peerids; i++)
+ if (nla_put_u32(msg, HANDSHAKE_A_ACCEPT_PEER_IDENTITY,
+ treq->th_peerid[i]) < 0)
+ return -EMSGSIZE;
+ return 0;
+}
+
+static int tls_handshake_put_certificate(struct sk_buff *msg,
+ struct tls_handshake_req *treq)
+{
+ struct nlattr *entry_attr;
+
+ if (treq->th_certificate == TLS_NO_CERT &&
+ treq->th_privkey == TLS_NO_PRIVKEY)
+ return 0;
+
+ entry_attr = nla_nest_start(msg, HANDSHAKE_A_ACCEPT_CERTIFICATE);
+ if (!entry_attr)
+ return -EMSGSIZE;
+
+ if (nla_put_s32(msg, HANDSHAKE_A_X509_CERT,
+ treq->th_certificate) ||
+ nla_put_s32(msg, HANDSHAKE_A_X509_PRIVKEY,
+ treq->th_privkey)) {
+ nla_nest_cancel(msg, entry_attr);
+ return -EMSGSIZE;
+ }
+
+ nla_nest_end(msg, entry_attr);
+ return 0;
+}
+
+/**
+ * tls_handshake_accept - callback to construct a CMD_ACCEPT response
+ * @req: handshake parameters to return
+ * @info: generic netlink message context
+ * @fd: file descriptor to be returned
+ *
+ * Returns zero on success, or a negative errno on failure.
+ */
+static int tls_handshake_accept(struct handshake_req *req,
+ struct genl_info *info, int fd)
+{
+ struct tls_handshake_req *treq = handshake_req_private(req);
+ struct nlmsghdr *hdr;
+ struct sk_buff *msg;
+ int ret;
+
+ ret = tls_handshake_private_keyring(treq);
+ if (ret < 0)
+ goto out;
+
+ ret = -ENOMEM;
+ msg = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ goto out;
+ hdr = handshake_genl_put(msg, info);
+ if (!hdr)
+ goto out_cancel;
+
+ ret = nla_put_s32(msg, HANDSHAKE_A_ACCEPT_SOCKFD, fd);
+ if (ret < 0)
+ goto out_cancel;
+ ret = nla_put_u32(msg, HANDSHAKE_A_ACCEPT_MESSAGE_TYPE, treq->th_type);
+ if (ret < 0)
+ goto out_cancel;
+ if (treq->th_peername) {
+ ret = nla_put_string(msg, HANDSHAKE_A_ACCEPT_PEERNAME,
+ treq->th_peername);
+ if (ret < 0)
+ goto out_cancel;
+ }
+ if (treq->th_timeout_ms) {
+ ret = nla_put_u32(msg, HANDSHAKE_A_ACCEPT_TIMEOUT, treq->th_timeout_ms);
+ if (ret < 0)
+ goto out_cancel;
+ }
+ if (treq->th_keyring) {
+ ret = nla_put_u32(msg, HANDSHAKE_A_ACCEPT_KEYRING,
+ treq->th_keyring);
+ if (ret < 0)
+ goto out_cancel;
+ }
+
+ ret = nla_put_u32(msg, HANDSHAKE_A_ACCEPT_AUTH_MODE,
+ treq->th_auth_mode);
+ if (ret < 0)
+ goto out_cancel;
+ switch (treq->th_auth_mode) {
+ case HANDSHAKE_AUTH_PSK:
+ ret = tls_handshake_put_peer_identity(msg, treq);
+ if (ret < 0)
+ goto out_cancel;
+ break;
+ case HANDSHAKE_AUTH_X509:
+ ret = tls_handshake_put_certificate(msg, treq);
+ if (ret < 0)
+ goto out_cancel;
+ break;
+ }
+
+ genlmsg_end(msg, hdr);
+ return genlmsg_reply(msg, info);
+
+out_cancel:
+ genlmsg_cancel(msg, hdr);
+ nlmsg_free(msg);
+out:
+ return ret;
+}
+
+static const struct handshake_proto tls_handshake_proto = {
+ .hp_handler_class = HANDSHAKE_HANDLER_CLASS_TLSHD,
+ .hp_privsize = sizeof(struct tls_handshake_req),
+ .hp_flags = BIT(HANDSHAKE_F_PROTO_NOTIFY),
+
+ .hp_accept = tls_handshake_accept,
+ .hp_done = tls_handshake_done,
+};
+
+/**
+ * tls_client_hello_anon - request an anonymous TLS handshake on a socket
+ * @args: socket and handshake parameters for this request
+ * @flags: memory allocation control flags
+ *
+ * Return values:
+ * %0: Handshake request enqueue; ->done will be called when complete
+ * %-ESRCH: No user agent is available
+ * %-ENOMEM: Memory allocation failed
+ */
+int tls_client_hello_anon(const struct tls_handshake_args *args, gfp_t flags)
+{
+ struct tls_handshake_req *treq;
+ struct handshake_req *req;
+
+ req = handshake_req_alloc(&tls_handshake_proto, flags);
+ if (!req)
+ return -ENOMEM;
+ treq = tls_handshake_req_init(req, args);
+ treq->th_type = HANDSHAKE_MSG_TYPE_CLIENTHELLO;
+ treq->th_auth_mode = HANDSHAKE_AUTH_UNAUTH;
+
+ return handshake_req_submit(args->ta_sock, req, flags);
+}
+EXPORT_SYMBOL(tls_client_hello_anon);
+
+/**
+ * tls_client_hello_x509 - request an x.509-based TLS handshake on a socket
+ * @args: socket and handshake parameters for this request
+ * @flags: memory allocation control flags
+ *
+ * Return values:
+ * %0: Handshake request enqueue; ->done will be called when complete
+ * %-ESRCH: No user agent is available
+ * %-ENOMEM: Memory allocation failed
+ */
+int tls_client_hello_x509(const struct tls_handshake_args *args, gfp_t flags)
+{
+ struct tls_handshake_req *treq;
+ struct handshake_req *req;
+
+ req = handshake_req_alloc(&tls_handshake_proto, flags);
+ if (!req)
+ return -ENOMEM;
+ treq = tls_handshake_req_init(req, args);
+ treq->th_type = HANDSHAKE_MSG_TYPE_CLIENTHELLO;
+ treq->th_auth_mode = HANDSHAKE_AUTH_X509;
+ treq->th_certificate = args->ta_my_cert;
+ treq->th_privkey = args->ta_my_privkey;
+
+ return handshake_req_submit(args->ta_sock, req, flags);
+}
+EXPORT_SYMBOL(tls_client_hello_x509);
+
+/**
+ * tls_client_hello_psk - request a PSK-based TLS handshake on a socket
+ * @args: socket and handshake parameters for this request
+ * @flags: memory allocation control flags
+ *
+ * Return values:
+ * %0: Handshake request enqueue; ->done will be called when complete
+ * %-EINVAL: Wrong number of local peer IDs
+ * %-ESRCH: No user agent is available
+ * %-ENOMEM: Memory allocation failed
+ */
+int tls_client_hello_psk(const struct tls_handshake_args *args, gfp_t flags)
+{
+ struct tls_handshake_req *treq;
+ struct handshake_req *req;
+ unsigned int i;
+
+ if (!args->ta_num_peerids ||
+ args->ta_num_peerids > ARRAY_SIZE(treq->th_peerid))
+ return -EINVAL;
+
+ req = handshake_req_alloc(&tls_handshake_proto, flags);
+ if (!req)
+ return -ENOMEM;
+ treq = tls_handshake_req_init(req, args);
+ treq->th_type = HANDSHAKE_MSG_TYPE_CLIENTHELLO;
+ treq->th_auth_mode = HANDSHAKE_AUTH_PSK;
+ treq->th_num_peerids = args->ta_num_peerids;
+ for (i = 0; i < args->ta_num_peerids; i++)
+ treq->th_peerid[i] = args->ta_my_peerids[i];
+
+ return handshake_req_submit(args->ta_sock, req, flags);
+}
+EXPORT_SYMBOL(tls_client_hello_psk);
+
+/**
+ * tls_server_hello_x509 - request a server TLS handshake on a socket
+ * @args: socket and handshake parameters for this request
+ * @flags: memory allocation control flags
+ *
+ * Return values:
+ * %0: Handshake request enqueue; ->done will be called when complete
+ * %-ESRCH: No user agent is available
+ * %-ENOMEM: Memory allocation failed
+ */
+int tls_server_hello_x509(const struct tls_handshake_args *args, gfp_t flags)
+{
+ struct tls_handshake_req *treq;
+ struct handshake_req *req;
+
+ req = handshake_req_alloc(&tls_handshake_proto, flags);
+ if (!req)
+ return -ENOMEM;
+ treq = tls_handshake_req_init(req, args);
+ treq->th_type = HANDSHAKE_MSG_TYPE_SERVERHELLO;
+ treq->th_auth_mode = HANDSHAKE_AUTH_X509;
+ treq->th_certificate = args->ta_my_cert;
+ treq->th_privkey = args->ta_my_privkey;
+
+ return handshake_req_submit(args->ta_sock, req, flags);
+}
+EXPORT_SYMBOL(tls_server_hello_x509);
+
+/**
+ * tls_server_hello_psk - request a server TLS handshake on a socket
+ * @args: socket and handshake parameters for this request
+ * @flags: memory allocation control flags
+ *
+ * Return values:
+ * %0: Handshake request enqueue; ->done will be called when complete
+ * %-ESRCH: No user agent is available
+ * %-ENOMEM: Memory allocation failed
+ */
+int tls_server_hello_psk(const struct tls_handshake_args *args, gfp_t flags)
+{
+ struct tls_handshake_req *treq;
+ struct handshake_req *req;
+
+ req = handshake_req_alloc(&tls_handshake_proto, flags);
+ if (!req)
+ return -ENOMEM;
+ treq = tls_handshake_req_init(req, args);
+ treq->th_type = HANDSHAKE_MSG_TYPE_SERVERHELLO;
+ treq->th_auth_mode = HANDSHAKE_AUTH_PSK;
+ treq->th_num_peerids = 1;
+ treq->th_peerid[0] = args->ta_my_peerids[0];
+
+ return handshake_req_submit(args->ta_sock, req, flags);
+}
+EXPORT_SYMBOL(tls_server_hello_psk);
+
+/**
+ * tls_handshake_cancel - cancel a pending handshake
+ * @sk: socket on which there is an ongoing handshake
+ *
+ * Request cancellation races with request completion. To determine
+ * who won, callers examine the return value from this function.
+ *
+ * Return values:
+ * %true - Uncompleted handshake request was canceled
+ * %false - Handshake request already completed or not found
+ */
+bool tls_handshake_cancel(struct sock *sk)
+{
+ return handshake_req_cancel(sk);
+}
+EXPORT_SYMBOL(tls_handshake_cancel);
+
+/**
+ * tls_handshake_close - send a Closure alert
+ * @sock: an open socket
+ *
+ */
+void tls_handshake_close(struct socket *sock)
+{
+ struct handshake_req *req;
+
+ req = handshake_req_hash_lookup(sock->sk);
+ if (!req)
+ return;
+ if (!test_and_clear_bit(HANDSHAKE_F_REQ_SESSION, &req->hr_flags))
+ return;
+ tls_alert_send(sock, TLS_ALERT_LEVEL_WARNING,
+ TLS_ALERT_DESC_CLOSE_NOTIFY);
+}
+EXPORT_SYMBOL(tls_handshake_close);
diff --git a/net/handshake/trace.c b/net/handshake/trace.c
new file mode 100644
index 000000000000..44432d0857b9
--- /dev/null
+++ b/net/handshake/trace.c
@@ -0,0 +1,22 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Trace points for transport security layer handshakes.
+ *
+ * Author: Chuck Lever <chuck.lever@oracle.com>
+ *
+ * Copyright (c) 2023, Oracle and/or its affiliates.
+ */
+
+#include <linux/types.h>
+#include <linux/ipv6.h>
+
+#include <net/sock.h>
+#include <net/inet_sock.h>
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+#include "handshake.h"
+
+#define CREATE_TRACE_POINTS
+
+#include <trace/events/handshake.h>
diff --git a/net/hsr/Kconfig b/net/hsr/Kconfig
new file mode 100644
index 000000000000..fcacdf4f0ffc
--- /dev/null
+++ b/net/hsr/Kconfig
@@ -0,0 +1,58 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# IEC 62439-3 High-availability Seamless Redundancy
+#
+
+config HSR
+ tristate "High-availability Seamless Redundancy (HSR & PRP)"
+ help
+ This enables IEC 62439 defined High-availability Seamless
+ Redundancy (HSR) and Parallel Redundancy Protocol (PRP).
+
+ If you say Y here, then your Linux box will be able to act as a
+ DANH ("Doubly attached node implementing HSR") or DANP ("Doubly
+ attached node implementing PRP"). For this to work, your Linux box
+ needs (at least) two physical Ethernet interfaces.
+
+ For DANH, it must be connected as a node in a ring network together
+ with other HSR capable nodes. All Ethernet frames sent over the HSR
+ device will be sent in both directions on the ring (over both slave
+ ports), giving a redundant, instant fail-over network. Each HSR node
+ in the ring acts like a bridge for HSR frames, but filters frames
+ that have been forwarded earlier.
+
+ For DANP, it must be connected as a node connecting to two
+ separate networks over the two slave interfaces. Like HSR, Ethernet
+ frames sent over the PRP device will be sent to both networks giving
+ a redundant, instant fail-over network. Unlike HSR, PRP networks
+ can have Singly Attached Nodes (SAN) such as PC, printer, bridges
+ etc and will be able to communicate with DANP nodes.
+
+ This code is a "best effort" to comply with the HSR standard as
+ described in IEC 62439-3:2010 (HSRv0) and IEC 62439-3:2012 (HSRv1),
+ and PRP standard described in IEC 62439-4:2012 (PRP), but no
+ compliancy tests have been made. Use iproute2 to select the protocol
+ you would like to use.
+
+ You need to perform any and all necessary tests yourself before
+ relying on this code in a safety critical system!
+
+ If unsure, say N.
+
+if HSR
+
+config PRP_DUP_DISCARD_KUNIT_TEST
+ tristate "PRP duplicate discard KUnit tests" if !KUNIT_ALL_TESTS
+ depends on KUNIT
+ default KUNIT_ALL_TESTS
+ help
+ Covers the PRP duplicate discard algorithm.
+ Only useful for kernel devs running KUnit test harness and are not
+ for inclusion into a production build.
+
+ For more information on KUnit and unit tests in general please refer
+ to the KUnit documentation in Documentation/dev-tools/kunit/.
+
+ If unsure, say N.
+
+endif
diff --git a/net/hsr/Makefile b/net/hsr/Makefile
new file mode 100644
index 000000000000..34e581db5c41
--- /dev/null
+++ b/net/hsr/Makefile
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# Makefile for HSR
+#
+
+obj-$(CONFIG_HSR) += hsr.o
+
+hsr-y := hsr_main.o hsr_framereg.o hsr_device.o \
+ hsr_netlink.o hsr_slave.o hsr_forward.o
+hsr-$(CONFIG_DEBUG_FS) += hsr_debugfs.o
+
+obj-$(CONFIG_PRP_DUP_DISCARD_KUNIT_TEST) += prp_dup_discard_test.o
diff --git a/net/hsr/hsr_debugfs.c b/net/hsr/hsr_debugfs.c
new file mode 100644
index 000000000000..5b2cfac3b2ba
--- /dev/null
+++ b/net/hsr/hsr_debugfs.c
@@ -0,0 +1,123 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * debugfs code for HSR & PRP
+ * Copyright (C) 2019 Texas Instruments Incorporated
+ *
+ * Author(s):
+ * Murali Karicheri <m-karicheri2@ti.com>
+ */
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/debugfs.h>
+#include "hsr_main.h"
+#include "hsr_framereg.h"
+
+static struct dentry *hsr_debugfs_root_dir;
+
+/* hsr_node_table_show - Formats and prints node_table entries */
+static int
+hsr_node_table_show(struct seq_file *sfp, void *data)
+{
+ struct hsr_priv *priv = (struct hsr_priv *)sfp->private;
+ struct hsr_node *node;
+
+ seq_printf(sfp, "Node Table entries for (%s) device\n",
+ (priv->prot_version == PRP_V1 ? "PRP" : "HSR"));
+ seq_puts(sfp, "MAC-Address-A, MAC-Address-B, time_in[A], ");
+ seq_puts(sfp, "time_in[B], Address-B port, ");
+ if (priv->prot_version == PRP_V1)
+ seq_puts(sfp, "SAN-A, SAN-B, DAN-P\n");
+ else
+ seq_puts(sfp, "DAN-H\n");
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(node, &priv->node_db, mac_list) {
+ /* skip self node */
+ if (hsr_addr_is_self(priv, node->macaddress_A))
+ continue;
+ seq_printf(sfp, "%pM ", &node->macaddress_A[0]);
+ seq_printf(sfp, "%pM ", &node->macaddress_B[0]);
+ seq_printf(sfp, "%10lx, ", node->time_in[HSR_PT_SLAVE_A]);
+ seq_printf(sfp, "%10lx, ", node->time_in[HSR_PT_SLAVE_B]);
+ seq_printf(sfp, "%14x, ", node->addr_B_port);
+
+ if (priv->prot_version == PRP_V1)
+ seq_printf(sfp, "%5x, %5x, %5x\n",
+ node->san_a, node->san_b,
+ (node->san_a == 0 && node->san_b == 0));
+ else
+ seq_printf(sfp, "%5x\n", 1);
+ }
+ rcu_read_unlock();
+ return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(hsr_node_table);
+
+void hsr_debugfs_rename(struct net_device *dev)
+{
+ struct hsr_priv *priv = netdev_priv(dev);
+ int err;
+
+ err = debugfs_change_name(priv->node_tbl_root, "%s", dev->name);
+ if (err)
+ netdev_warn(dev, "failed to rename\n");
+}
+
+/* hsr_debugfs_init - create hsr node_table file for dumping
+ * the node table
+ *
+ * Description:
+ * When debugfs is configured this routine sets up the node_table file per
+ * hsr device for dumping the node_table entries
+ */
+void hsr_debugfs_init(struct hsr_priv *priv, struct net_device *hsr_dev)
+{
+ struct dentry *de = NULL;
+
+ de = debugfs_create_dir(hsr_dev->name, hsr_debugfs_root_dir);
+ if (IS_ERR(de)) {
+ pr_err("Cannot create hsr debugfs directory\n");
+ return;
+ }
+
+ priv->node_tbl_root = de;
+
+ de = debugfs_create_file("node_table", S_IFREG | 0444,
+ priv->node_tbl_root, priv,
+ &hsr_node_table_fops);
+ if (IS_ERR(de)) {
+ pr_err("Cannot create hsr node_table file\n");
+ debugfs_remove(priv->node_tbl_root);
+ priv->node_tbl_root = NULL;
+ return;
+ }
+}
+
+/* hsr_debugfs_term - Tear down debugfs intrastructure
+ *
+ * Description:
+ * When Debugfs is configured this routine removes debugfs file system
+ * elements that are specific to hsr
+ */
+void
+hsr_debugfs_term(struct hsr_priv *priv)
+{
+ debugfs_remove_recursive(priv->node_tbl_root);
+ priv->node_tbl_root = NULL;
+}
+
+void hsr_debugfs_create_root(void)
+{
+ hsr_debugfs_root_dir = debugfs_create_dir("hsr", NULL);
+ if (IS_ERR(hsr_debugfs_root_dir)) {
+ pr_err("Cannot create hsr debugfs root directory\n");
+ hsr_debugfs_root_dir = NULL;
+ }
+}
+
+void hsr_debugfs_remove_root(void)
+{
+ /* debugfs_remove() internally checks NULL and ERROR */
+ debugfs_remove(hsr_debugfs_root_dir);
+}
diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c
new file mode 100644
index 000000000000..d1bfc49b5f01
--- /dev/null
+++ b/net/hsr/hsr_device.c
@@ -0,0 +1,825 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright 2011-2014 Autronica Fire and Security AS
+ *
+ * Author(s):
+ * 2011-2014 Arvid Brodin, arvid.brodin@alten.se
+ * This file contains device methods for creating, using and destroying
+ * virtual HSR or PRP devices.
+ */
+
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/etherdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/pkt_sched.h>
+#include "hsr_device.h"
+#include "hsr_slave.h"
+#include "hsr_framereg.h"
+#include "hsr_main.h"
+#include "hsr_forward.h"
+
+static bool is_admin_up(struct net_device *dev)
+{
+ return dev && (dev->flags & IFF_UP);
+}
+
+static bool is_slave_up(struct net_device *dev)
+{
+ return dev && is_admin_up(dev) && netif_oper_up(dev);
+}
+
+static void hsr_set_operstate(struct hsr_port *master, bool has_carrier)
+{
+ struct net_device *dev = master->dev;
+
+ if (!is_admin_up(dev)) {
+ netif_set_operstate(dev, IF_OPER_DOWN);
+ return;
+ }
+
+ if (has_carrier)
+ netif_set_operstate(dev, IF_OPER_UP);
+ else
+ netif_set_operstate(dev, IF_OPER_LOWERLAYERDOWN);
+}
+
+static bool hsr_check_carrier(struct hsr_port *master)
+{
+ struct hsr_port *port;
+
+ ASSERT_RTNL();
+
+ hsr_for_each_port_rtnl(master->hsr, port) {
+ if (port->type != HSR_PT_MASTER && is_slave_up(port->dev)) {
+ netif_carrier_on(master->dev);
+ return true;
+ }
+ }
+
+ netif_carrier_off(master->dev);
+
+ return false;
+}
+
+static void hsr_check_announce(struct net_device *hsr_dev)
+{
+ struct hsr_priv *hsr;
+
+ hsr = netdev_priv(hsr_dev);
+ if (netif_running(hsr_dev) && netif_oper_up(hsr_dev)) {
+ /* Enable announce timer and start sending supervisory frames */
+ if (!timer_pending(&hsr->announce_timer)) {
+ hsr->announce_count = 0;
+ mod_timer(&hsr->announce_timer, jiffies +
+ msecs_to_jiffies(HSR_ANNOUNCE_INTERVAL));
+ }
+
+ if (hsr->redbox && !timer_pending(&hsr->announce_proxy_timer))
+ mod_timer(&hsr->announce_proxy_timer, jiffies +
+ msecs_to_jiffies(HSR_ANNOUNCE_INTERVAL) / 2);
+ } else {
+ /* Deactivate the announce timer */
+ timer_delete(&hsr->announce_timer);
+ if (hsr->redbox)
+ timer_delete(&hsr->announce_proxy_timer);
+ }
+}
+
+void hsr_check_carrier_and_operstate(struct hsr_priv *hsr)
+{
+ struct hsr_port *master;
+ bool has_carrier;
+
+ master = hsr_port_get_hsr(hsr, HSR_PT_MASTER);
+ /* netif_stacked_transfer_operstate() cannot be used here since
+ * it doesn't set IF_OPER_LOWERLAYERDOWN (?)
+ */
+ has_carrier = hsr_check_carrier(master);
+ hsr_set_operstate(master, has_carrier);
+ hsr_check_announce(master->dev);
+}
+
+int hsr_get_max_mtu(struct hsr_priv *hsr)
+{
+ unsigned int mtu_max;
+ struct hsr_port *port;
+
+ mtu_max = ETH_DATA_LEN;
+ hsr_for_each_port_rtnl(hsr, port)
+ if (port->type != HSR_PT_MASTER)
+ mtu_max = min(port->dev->mtu, mtu_max);
+
+ if (mtu_max < HSR_HLEN)
+ return 0;
+ return mtu_max - HSR_HLEN;
+}
+
+static int hsr_dev_change_mtu(struct net_device *dev, int new_mtu)
+{
+ struct hsr_priv *hsr;
+
+ hsr = netdev_priv(dev);
+
+ if (new_mtu > hsr_get_max_mtu(hsr)) {
+ netdev_info(dev, "A HSR master's MTU cannot be greater than the smallest MTU of its slaves minus the HSR Tag length (%d octets).\n",
+ HSR_HLEN);
+ return -EINVAL;
+ }
+
+ WRITE_ONCE(dev->mtu, new_mtu);
+
+ return 0;
+}
+
+static int hsr_dev_open(struct net_device *dev)
+{
+ struct hsr_priv *hsr;
+ struct hsr_port *port;
+ const char *designation = NULL;
+
+ hsr = netdev_priv(dev);
+
+ hsr_for_each_port_rtnl(hsr, port) {
+ if (port->type == HSR_PT_MASTER)
+ continue;
+ switch (port->type) {
+ case HSR_PT_SLAVE_A:
+ designation = "Slave A";
+ break;
+ case HSR_PT_SLAVE_B:
+ designation = "Slave B";
+ break;
+ case HSR_PT_INTERLINK:
+ designation = "Interlink";
+ break;
+ default:
+ designation = "Unknown";
+ }
+ if (!is_slave_up(port->dev))
+ netdev_warn(dev, "%s (%s) is not up; please bring it up to get a fully working HSR network\n",
+ designation, port->dev->name);
+ }
+
+ if (!designation)
+ netdev_warn(dev, "No slave devices configured\n");
+
+ return 0;
+}
+
+static int hsr_dev_close(struct net_device *dev)
+{
+ struct hsr_port *port;
+ struct hsr_priv *hsr;
+
+ hsr = netdev_priv(dev);
+ hsr_for_each_port_rtnl(hsr, port) {
+ if (port->type == HSR_PT_MASTER)
+ continue;
+ switch (port->type) {
+ case HSR_PT_SLAVE_A:
+ case HSR_PT_SLAVE_B:
+ dev_uc_unsync(port->dev, dev);
+ dev_mc_unsync(port->dev, dev);
+ break;
+ default:
+ break;
+ }
+ }
+
+ return 0;
+}
+
+static netdev_features_t hsr_features_recompute(struct hsr_priv *hsr,
+ netdev_features_t features)
+{
+ netdev_features_t mask;
+ struct hsr_port *port;
+
+ mask = features;
+
+ /* Mask out all features that, if supported by one device, should be
+ * enabled for all devices (see NETIF_F_ONE_FOR_ALL).
+ *
+ * Anything that's off in mask will not be enabled - so only things
+ * that were in features originally, and also is in NETIF_F_ONE_FOR_ALL,
+ * may become enabled.
+ */
+ features &= ~NETIF_F_ONE_FOR_ALL;
+ hsr_for_each_port_rtnl(hsr, port)
+ features = netdev_increment_features(features,
+ port->dev->features,
+ mask);
+
+ return features;
+}
+
+static netdev_features_t hsr_fix_features(struct net_device *dev,
+ netdev_features_t features)
+{
+ struct hsr_priv *hsr = netdev_priv(dev);
+
+ return hsr_features_recompute(hsr, features);
+}
+
+static netdev_tx_t hsr_dev_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct hsr_priv *hsr = netdev_priv(dev);
+ struct hsr_port *master;
+
+ rcu_read_lock();
+ master = hsr_port_get_hsr(hsr, HSR_PT_MASTER);
+ if (master) {
+ skb->dev = master->dev;
+ skb_reset_mac_header(skb);
+ skb_reset_mac_len(skb);
+ spin_lock_bh(&hsr->seqnr_lock);
+ hsr_forward_skb(skb, master);
+ spin_unlock_bh(&hsr->seqnr_lock);
+ } else {
+ dev_core_stats_tx_dropped_inc(dev);
+ dev_kfree_skb_any(skb);
+ }
+ rcu_read_unlock();
+
+ return NETDEV_TX_OK;
+}
+
+static const struct header_ops hsr_header_ops = {
+ .create = eth_header,
+ .parse = eth_header_parse,
+};
+
+static struct sk_buff *hsr_init_skb(struct hsr_port *master, int extra)
+{
+ struct hsr_priv *hsr = master->hsr;
+ struct sk_buff *skb;
+ int hlen, tlen;
+ int len;
+
+ hlen = LL_RESERVED_SPACE(master->dev);
+ tlen = master->dev->needed_tailroom;
+ len = sizeof(struct hsr_sup_tag) + sizeof(struct hsr_sup_payload);
+ /* skb size is same for PRP/HSR frames, only difference
+ * being, for PRP it is a trailer and for HSR it is a
+ * header.
+ * RedBox might use @extra more bytes.
+ */
+ skb = dev_alloc_skb(len + extra + hlen + tlen);
+
+ if (!skb)
+ return skb;
+
+ skb_reserve(skb, hlen);
+ skb->dev = master->dev;
+ skb->priority = TC_PRIO_CONTROL;
+
+ skb_reset_network_header(skb);
+ skb_reset_transport_header(skb);
+ if (dev_hard_header(skb, skb->dev, ETH_P_PRP,
+ hsr->sup_multicast_addr,
+ skb->dev->dev_addr, skb->len) <= 0)
+ goto out;
+
+ skb_reset_mac_header(skb);
+ skb_reset_mac_len(skb);
+
+ return skb;
+out:
+ kfree_skb(skb);
+
+ return NULL;
+}
+
+static void send_hsr_supervision_frame(struct hsr_port *port,
+ unsigned long *interval,
+ const unsigned char *addr)
+{
+ struct hsr_priv *hsr = port->hsr;
+ __u8 type = HSR_TLV_LIFE_CHECK;
+ struct hsr_sup_payload *hsr_sp;
+ struct hsr_sup_tlv *hsr_stlv;
+ struct hsr_sup_tag *hsr_stag;
+ struct sk_buff *skb;
+ int extra = 0;
+
+ *interval = msecs_to_jiffies(HSR_LIFE_CHECK_INTERVAL);
+ if (hsr->announce_count < 3 && hsr->prot_version == 0) {
+ type = HSR_TLV_ANNOUNCE;
+ *interval = msecs_to_jiffies(HSR_ANNOUNCE_INTERVAL);
+ hsr->announce_count++;
+ }
+
+ if (hsr->redbox)
+ extra = sizeof(struct hsr_sup_tlv) +
+ sizeof(struct hsr_sup_payload);
+
+ skb = hsr_init_skb(port, extra);
+ if (!skb) {
+ netdev_warn_once(port->dev, "HSR: Could not send supervision frame\n");
+ return;
+ }
+
+ hsr_stag = skb_put(skb, sizeof(struct hsr_sup_tag));
+ skb_set_network_header(skb, ETH_HLEN + HSR_HLEN);
+ skb_reset_mac_len(skb);
+
+ set_hsr_stag_path(hsr_stag, (hsr->prot_version ? 0x0 : 0xf));
+ set_hsr_stag_HSR_ver(hsr_stag, hsr->prot_version);
+
+ /* From HSRv1 on we have separate supervision sequence numbers. */
+ spin_lock_bh(&hsr->seqnr_lock);
+ if (hsr->prot_version > 0) {
+ hsr_stag->sequence_nr = htons(hsr->sup_sequence_nr);
+ hsr->sup_sequence_nr++;
+ } else {
+ hsr_stag->sequence_nr = htons(hsr->sequence_nr);
+ hsr->sequence_nr++;
+ }
+
+ hsr_stag->tlv.HSR_TLV_type = type;
+ /* HSRv0 has 6 unused bytes after the MAC */
+ hsr_stag->tlv.HSR_TLV_length = hsr->prot_version ?
+ sizeof(struct hsr_sup_payload) : 12;
+
+ /* Payload: MacAddressA / SAN MAC from ProxyNodeTable */
+ hsr_sp = skb_put(skb, sizeof(struct hsr_sup_payload));
+ ether_addr_copy(hsr_sp->macaddress_A, addr);
+
+ if (hsr->redbox &&
+ hsr_is_node_in_db(&hsr->proxy_node_db, addr)) {
+ hsr_stlv = skb_put(skb, sizeof(struct hsr_sup_tlv));
+ hsr_stlv->HSR_TLV_type = PRP_TLV_REDBOX_MAC;
+ hsr_stlv->HSR_TLV_length = sizeof(struct hsr_sup_payload);
+
+ /* Payload: MacAddressRedBox */
+ hsr_sp = skb_put(skb, sizeof(struct hsr_sup_payload));
+ ether_addr_copy(hsr_sp->macaddress_A, hsr->macaddress_redbox);
+ }
+
+ if (skb_put_padto(skb, ETH_ZLEN)) {
+ spin_unlock_bh(&hsr->seqnr_lock);
+ return;
+ }
+
+ hsr_forward_skb(skb, port);
+ spin_unlock_bh(&hsr->seqnr_lock);
+ return;
+}
+
+static void send_prp_supervision_frame(struct hsr_port *master,
+ unsigned long *interval,
+ const unsigned char *addr)
+{
+ struct hsr_priv *hsr = master->hsr;
+ struct hsr_sup_payload *hsr_sp;
+ struct hsr_sup_tag *hsr_stag;
+ struct sk_buff *skb;
+
+ skb = hsr_init_skb(master, 0);
+ if (!skb) {
+ netdev_warn_once(master->dev, "PRP: Could not send supervision frame\n");
+ return;
+ }
+
+ *interval = msecs_to_jiffies(HSR_LIFE_CHECK_INTERVAL);
+ hsr_stag = skb_put(skb, sizeof(struct hsr_sup_tag));
+ set_hsr_stag_path(hsr_stag, (hsr->prot_version ? 0x0 : 0xf));
+ set_hsr_stag_HSR_ver(hsr_stag, (hsr->prot_version ? 1 : 0));
+
+ /* From HSRv1 on we have separate supervision sequence numbers. */
+ spin_lock_bh(&hsr->seqnr_lock);
+ hsr_stag->sequence_nr = htons(hsr->sup_sequence_nr);
+ hsr->sup_sequence_nr++;
+ hsr_stag->tlv.HSR_TLV_type = PRP_TLV_LIFE_CHECK_DD;
+ hsr_stag->tlv.HSR_TLV_length = sizeof(struct hsr_sup_payload);
+
+ /* Payload: MacAddressA */
+ hsr_sp = skb_put(skb, sizeof(struct hsr_sup_payload));
+ ether_addr_copy(hsr_sp->macaddress_A, master->dev->dev_addr);
+
+ if (skb_put_padto(skb, ETH_ZLEN)) {
+ spin_unlock_bh(&hsr->seqnr_lock);
+ return;
+ }
+
+ hsr_forward_skb(skb, master);
+ spin_unlock_bh(&hsr->seqnr_lock);
+}
+
+/* Announce (supervision frame) timer function
+ */
+static void hsr_announce(struct timer_list *t)
+{
+ struct hsr_priv *hsr;
+ struct hsr_port *master;
+ unsigned long interval;
+
+ hsr = timer_container_of(hsr, t, announce_timer);
+
+ rcu_read_lock();
+ master = hsr_port_get_hsr(hsr, HSR_PT_MASTER);
+ hsr->proto_ops->send_sv_frame(master, &interval, master->dev->dev_addr);
+
+ if (is_admin_up(master->dev))
+ mod_timer(&hsr->announce_timer, jiffies + interval);
+
+ rcu_read_unlock();
+}
+
+/* Announce (supervision frame) timer function for RedBox
+ */
+static void hsr_proxy_announce(struct timer_list *t)
+{
+ struct hsr_priv *hsr = timer_container_of(hsr, t,
+ announce_proxy_timer);
+ struct hsr_port *interlink;
+ unsigned long interval = 0;
+ struct hsr_node *node;
+
+ rcu_read_lock();
+ /* RedBOX sends supervisory frames to HSR network with MAC addresses
+ * of SAN nodes stored in ProxyNodeTable.
+ */
+ interlink = hsr_port_get_hsr(hsr, HSR_PT_INTERLINK);
+ if (!interlink)
+ goto done;
+
+ list_for_each_entry_rcu(node, &hsr->proxy_node_db, mac_list) {
+ if (hsr_addr_is_redbox(hsr, node->macaddress_A))
+ continue;
+ hsr->proto_ops->send_sv_frame(interlink, &interval,
+ node->macaddress_A);
+ }
+
+ if (is_admin_up(interlink->dev)) {
+ if (!interval)
+ interval = msecs_to_jiffies(HSR_ANNOUNCE_INTERVAL);
+
+ mod_timer(&hsr->announce_proxy_timer, jiffies + interval);
+ }
+
+done:
+ rcu_read_unlock();
+}
+
+void hsr_del_ports(struct hsr_priv *hsr)
+{
+ struct hsr_port *port;
+
+ port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_A);
+ if (port)
+ hsr_del_port(port);
+
+ port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_B);
+ if (port)
+ hsr_del_port(port);
+
+ port = hsr_port_get_hsr(hsr, HSR_PT_INTERLINK);
+ if (port)
+ hsr_del_port(port);
+
+ port = hsr_port_get_hsr(hsr, HSR_PT_MASTER);
+ if (port)
+ hsr_del_port(port);
+}
+
+static void hsr_set_rx_mode(struct net_device *dev)
+{
+ struct hsr_port *port;
+ struct hsr_priv *hsr;
+
+ hsr = netdev_priv(dev);
+
+ hsr_for_each_port_rtnl(hsr, port) {
+ if (port->type == HSR_PT_MASTER)
+ continue;
+ switch (port->type) {
+ case HSR_PT_SLAVE_A:
+ case HSR_PT_SLAVE_B:
+ dev_mc_sync_multiple(port->dev, dev);
+ dev_uc_sync_multiple(port->dev, dev);
+ break;
+ default:
+ break;
+ }
+ }
+}
+
+static void hsr_change_rx_flags(struct net_device *dev, int change)
+{
+ struct hsr_port *port;
+ struct hsr_priv *hsr;
+
+ hsr = netdev_priv(dev);
+
+ hsr_for_each_port_rtnl(hsr, port) {
+ if (port->type == HSR_PT_MASTER)
+ continue;
+ switch (port->type) {
+ case HSR_PT_SLAVE_A:
+ case HSR_PT_SLAVE_B:
+ if (change & IFF_ALLMULTI)
+ dev_set_allmulti(port->dev,
+ dev->flags &
+ IFF_ALLMULTI ? 1 : -1);
+ break;
+ default:
+ break;
+ }
+ }
+}
+
+static int hsr_ndo_vlan_rx_add_vid(struct net_device *dev,
+ __be16 proto, u16 vid)
+{
+ bool is_slave_a_added = false;
+ bool is_slave_b_added = false;
+ struct hsr_port *port;
+ struct hsr_priv *hsr;
+ int ret = 0;
+
+ hsr = netdev_priv(dev);
+
+ hsr_for_each_port_rtnl(hsr, port) {
+ if (port->type == HSR_PT_MASTER ||
+ port->type == HSR_PT_INTERLINK)
+ continue;
+
+ ret = vlan_vid_add(port->dev, proto, vid);
+ switch (port->type) {
+ case HSR_PT_SLAVE_A:
+ if (ret) {
+ /* clean up Slave-B */
+ netdev_err(dev, "add vid failed for Slave-A\n");
+ if (is_slave_b_added)
+ vlan_vid_del(port->dev, proto, vid);
+ return ret;
+ }
+
+ is_slave_a_added = true;
+ break;
+
+ case HSR_PT_SLAVE_B:
+ if (ret) {
+ /* clean up Slave-A */
+ netdev_err(dev, "add vid failed for Slave-B\n");
+ if (is_slave_a_added)
+ vlan_vid_del(port->dev, proto, vid);
+ return ret;
+ }
+
+ is_slave_b_added = true;
+ break;
+ default:
+ break;
+ }
+ }
+
+ return 0;
+}
+
+static int hsr_ndo_vlan_rx_kill_vid(struct net_device *dev,
+ __be16 proto, u16 vid)
+{
+ struct hsr_port *port;
+ struct hsr_priv *hsr;
+
+ hsr = netdev_priv(dev);
+
+ hsr_for_each_port_rtnl(hsr, port) {
+ switch (port->type) {
+ case HSR_PT_SLAVE_A:
+ case HSR_PT_SLAVE_B:
+ vlan_vid_del(port->dev, proto, vid);
+ break;
+ default:
+ break;
+ }
+ }
+
+ return 0;
+}
+
+static const struct net_device_ops hsr_device_ops = {
+ .ndo_change_mtu = hsr_dev_change_mtu,
+ .ndo_open = hsr_dev_open,
+ .ndo_stop = hsr_dev_close,
+ .ndo_start_xmit = hsr_dev_xmit,
+ .ndo_change_rx_flags = hsr_change_rx_flags,
+ .ndo_fix_features = hsr_fix_features,
+ .ndo_set_rx_mode = hsr_set_rx_mode,
+ .ndo_vlan_rx_add_vid = hsr_ndo_vlan_rx_add_vid,
+ .ndo_vlan_rx_kill_vid = hsr_ndo_vlan_rx_kill_vid,
+};
+
+static const struct device_type hsr_type = {
+ .name = "hsr",
+};
+
+static struct hsr_proto_ops hsr_ops = {
+ .send_sv_frame = send_hsr_supervision_frame,
+ .create_tagged_frame = hsr_create_tagged_frame,
+ .get_untagged_frame = hsr_get_untagged_frame,
+ .drop_frame = hsr_drop_frame,
+ .fill_frame_info = hsr_fill_frame_info,
+ .invalid_dan_ingress_frame = hsr_invalid_dan_ingress_frame,
+ .register_frame_out = hsr_register_frame_out,
+};
+
+static struct hsr_proto_ops prp_ops = {
+ .send_sv_frame = send_prp_supervision_frame,
+ .create_tagged_frame = prp_create_tagged_frame,
+ .get_untagged_frame = prp_get_untagged_frame,
+ .drop_frame = prp_drop_frame,
+ .fill_frame_info = prp_fill_frame_info,
+ .handle_san_frame = prp_handle_san_frame,
+ .update_san_info = prp_update_san_info,
+ .register_frame_out = prp_register_frame_out,
+};
+
+void hsr_dev_setup(struct net_device *dev)
+{
+ eth_hw_addr_random(dev);
+
+ ether_setup(dev);
+ dev->min_mtu = 0;
+ dev->header_ops = &hsr_header_ops;
+ dev->netdev_ops = &hsr_device_ops;
+ SET_NETDEV_DEVTYPE(dev, &hsr_type);
+ dev->priv_flags |= IFF_NO_QUEUE | IFF_DISABLE_NETPOLL;
+ /* Prevent recursive tx locking */
+ dev->lltx = true;
+ /* Not sure about this. Taken from bridge code. netdevice.h says
+ * it means "Does not change network namespaces".
+ */
+ dev->netns_immutable = true;
+
+ dev->needs_free_netdev = true;
+
+ dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA |
+ NETIF_F_GSO_MASK | NETIF_F_HW_CSUM |
+ NETIF_F_HW_VLAN_CTAG_TX |
+ NETIF_F_HW_VLAN_CTAG_FILTER;
+
+ dev->features = dev->hw_features;
+}
+
+/* Return true if dev is a HSR master; return false otherwise.
+ */
+bool is_hsr_master(struct net_device *dev)
+{
+ return (dev->netdev_ops->ndo_start_xmit == hsr_dev_xmit);
+}
+EXPORT_SYMBOL(is_hsr_master);
+
+struct net_device *hsr_get_port_ndev(struct net_device *ndev,
+ enum hsr_port_type pt)
+{
+ struct hsr_priv *hsr = netdev_priv(ndev);
+ struct hsr_port *port;
+
+ rcu_read_lock();
+ hsr_for_each_port(hsr, port)
+ if (port->type == pt) {
+ dev_hold(port->dev);
+ rcu_read_unlock();
+ return port->dev;
+ }
+ rcu_read_unlock();
+ return NULL;
+}
+EXPORT_SYMBOL(hsr_get_port_ndev);
+
+int hsr_get_port_type(struct net_device *hsr_dev, struct net_device *dev,
+ enum hsr_port_type *type)
+{
+ struct hsr_priv *hsr = netdev_priv(hsr_dev);
+ struct hsr_port *port;
+
+ rcu_read_lock();
+ hsr_for_each_port(hsr, port) {
+ if (port->dev == dev) {
+ *type = port->type;
+ rcu_read_unlock();
+ return 0;
+ }
+ }
+ rcu_read_unlock();
+
+ return -EINVAL;
+}
+EXPORT_SYMBOL(hsr_get_port_type);
+
+/* Default multicast address for HSR Supervision frames */
+static const unsigned char def_multicast_addr[ETH_ALEN] __aligned(2) = {
+ 0x01, 0x15, 0x4e, 0x00, 0x01, 0x00
+};
+
+int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2],
+ struct net_device *interlink, unsigned char multicast_spec,
+ u8 protocol_version, struct netlink_ext_ack *extack)
+{
+ bool unregister = false;
+ struct hsr_priv *hsr;
+ int res;
+
+ hsr = netdev_priv(hsr_dev);
+ INIT_LIST_HEAD(&hsr->ports);
+ INIT_LIST_HEAD(&hsr->node_db);
+ INIT_LIST_HEAD(&hsr->proxy_node_db);
+ spin_lock_init(&hsr->list_lock);
+
+ eth_hw_addr_set(hsr_dev, slave[0]->dev_addr);
+
+ /* initialize protocol specific functions */
+ if (protocol_version == PRP_V1) {
+ /* For PRP, lan_id has most significant 3 bits holding
+ * the net_id of PRP_LAN_ID
+ */
+ hsr->net_id = PRP_LAN_ID << 1;
+ hsr->proto_ops = &prp_ops;
+ } else {
+ hsr->proto_ops = &hsr_ops;
+ }
+
+ /* Make sure we recognize frames from ourselves in hsr_rcv() */
+ res = hsr_create_self_node(hsr, hsr_dev->dev_addr,
+ slave[1]->dev_addr);
+ if (res < 0)
+ return res;
+
+ spin_lock_init(&hsr->seqnr_lock);
+ /* Overflow soon to find bugs easier: */
+ hsr->sequence_nr = HSR_SEQNR_START;
+ hsr->sup_sequence_nr = HSR_SUP_SEQNR_START;
+
+ timer_setup(&hsr->announce_timer, hsr_announce, 0);
+ timer_setup(&hsr->prune_timer, hsr_prune_nodes, 0);
+ timer_setup(&hsr->prune_proxy_timer, hsr_prune_proxy_nodes, 0);
+ timer_setup(&hsr->announce_proxy_timer, hsr_proxy_announce, 0);
+
+ ether_addr_copy(hsr->sup_multicast_addr, def_multicast_addr);
+ hsr->sup_multicast_addr[ETH_ALEN - 1] = multicast_spec;
+
+ hsr->prot_version = protocol_version;
+
+ /* Make sure the 1st call to netif_carrier_on() gets through */
+ netif_carrier_off(hsr_dev);
+
+ res = hsr_add_port(hsr, hsr_dev, HSR_PT_MASTER, extack);
+ if (res)
+ goto err_add_master;
+
+ /* HSR forwarding offload supported in lower device? */
+ if ((slave[0]->features & NETIF_F_HW_HSR_FWD) &&
+ (slave[1]->features & NETIF_F_HW_HSR_FWD))
+ hsr->fwd_offloaded = true;
+
+ if ((slave[0]->features & NETIF_F_HW_VLAN_CTAG_FILTER) &&
+ (slave[1]->features & NETIF_F_HW_VLAN_CTAG_FILTER))
+ hsr_dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER;
+
+ res = register_netdevice(hsr_dev);
+ if (res)
+ goto err_unregister;
+
+ unregister = true;
+
+ res = hsr_add_port(hsr, slave[0], HSR_PT_SLAVE_A, extack);
+ if (res)
+ goto err_unregister;
+
+ res = hsr_add_port(hsr, slave[1], HSR_PT_SLAVE_B, extack);
+ if (res)
+ goto err_unregister;
+
+ if (protocol_version == PRP_V1) {
+ eth_hw_addr_set(slave[1], slave[0]->dev_addr);
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, slave[1]);
+ }
+
+ if (interlink) {
+ res = hsr_add_port(hsr, interlink, HSR_PT_INTERLINK, extack);
+ if (res)
+ goto err_unregister;
+
+ hsr->redbox = true;
+ ether_addr_copy(hsr->macaddress_redbox, interlink->dev_addr);
+ mod_timer(&hsr->prune_proxy_timer,
+ jiffies + msecs_to_jiffies(PRUNE_PROXY_PERIOD));
+ }
+
+ hsr_debugfs_init(hsr, hsr_dev);
+ mod_timer(&hsr->prune_timer, jiffies + msecs_to_jiffies(PRUNE_PERIOD));
+
+ return 0;
+
+err_unregister:
+ hsr_del_ports(hsr);
+err_add_master:
+ hsr_del_self_node(hsr);
+
+ if (unregister)
+ unregister_netdevice(hsr_dev);
+ return res;
+}
diff --git a/net/hsr/hsr_device.h b/net/hsr/hsr_device.h
new file mode 100644
index 000000000000..655284095b78
--- /dev/null
+++ b/net/hsr/hsr_device.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright 2011-2014 Autronica Fire and Security AS
+ *
+ * Author(s):
+ * 2011-2014 Arvid Brodin, arvid.brodin@alten.se
+ *
+ * include file for HSR and PRP.
+ */
+
+#ifndef __HSR_DEVICE_H
+#define __HSR_DEVICE_H
+
+#include <linux/netdevice.h>
+#include "hsr_main.h"
+
+void hsr_del_ports(struct hsr_priv *hsr);
+void hsr_dev_setup(struct net_device *dev);
+int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2],
+ struct net_device *interlink, unsigned char multicast_spec,
+ u8 protocol_version, struct netlink_ext_ack *extack);
+void hsr_check_carrier_and_operstate(struct hsr_priv *hsr);
+int hsr_get_max_mtu(struct hsr_priv *hsr);
+#endif /* __HSR_DEVICE_H */
diff --git a/net/hsr/hsr_forward.c b/net/hsr/hsr_forward.c
new file mode 100644
index 000000000000..339f0d220212
--- /dev/null
+++ b/net/hsr/hsr_forward.c
@@ -0,0 +1,760 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright 2011-2014 Autronica Fire and Security AS
+ *
+ * Author(s):
+ * 2011-2014 Arvid Brodin, arvid.brodin@alten.se
+ *
+ * Frame router for HSR and PRP.
+ */
+
+#include "hsr_forward.h"
+#include <linux/types.h>
+#include <linux/skbuff.h>
+#include <linux/etherdevice.h>
+#include <linux/if_vlan.h>
+#include "hsr_main.h"
+#include "hsr_framereg.h"
+
+struct hsr_node;
+
+/* The uses I can see for these HSR supervision frames are:
+ * 1) Use the frames that are sent after node initialization ("HSR_TLV.Type =
+ * 22") to reset any sequence_nr counters belonging to that node. Useful if
+ * the other node's counter has been reset for some reason.
+ * --
+ * Or not - resetting the counter and bridging the frame would create a
+ * loop, unfortunately.
+ *
+ * 2) Use the LifeCheck frames to detect ring breaks. I.e. if no LifeCheck
+ * frame is received from a particular node, we know something is wrong.
+ * We just register these (as with normal frames) and throw them away.
+ *
+ * 3) Allow different MAC addresses for the two slave interfaces, using the
+ * MacAddressA field.
+ */
+static bool is_supervision_frame(struct hsr_priv *hsr, struct sk_buff *skb)
+{
+ struct ethhdr *eth_hdr;
+ struct hsr_sup_tag *hsr_sup_tag;
+ struct hsrv1_ethhdr_sp *hsr_V1_hdr;
+ struct hsr_sup_tlv *hsr_sup_tlv;
+ u16 total_length = 0;
+
+ WARN_ON_ONCE(!skb_mac_header_was_set(skb));
+ eth_hdr = (struct ethhdr *)skb_mac_header(skb);
+
+ /* Correct addr? */
+ if (!ether_addr_equal(eth_hdr->h_dest,
+ hsr->sup_multicast_addr))
+ return false;
+
+ /* Correct ether type?. */
+ if (!(eth_hdr->h_proto == htons(ETH_P_PRP) ||
+ eth_hdr->h_proto == htons(ETH_P_HSR)))
+ return false;
+
+ /* Get the supervision header from correct location. */
+ if (eth_hdr->h_proto == htons(ETH_P_HSR)) { /* Okay HSRv1. */
+ total_length = sizeof(struct hsrv1_ethhdr_sp);
+ if (!pskb_may_pull(skb, total_length))
+ return false;
+
+ hsr_V1_hdr = (struct hsrv1_ethhdr_sp *)skb_mac_header(skb);
+ if (hsr_V1_hdr->hsr.encap_proto != htons(ETH_P_PRP))
+ return false;
+
+ hsr_sup_tag = &hsr_V1_hdr->hsr_sup;
+ } else {
+ total_length = sizeof(struct hsrv0_ethhdr_sp);
+ if (!pskb_may_pull(skb, total_length))
+ return false;
+
+ hsr_sup_tag =
+ &((struct hsrv0_ethhdr_sp *)skb_mac_header(skb))->hsr_sup;
+ }
+
+ if (hsr_sup_tag->tlv.HSR_TLV_type != HSR_TLV_ANNOUNCE &&
+ hsr_sup_tag->tlv.HSR_TLV_type != HSR_TLV_LIFE_CHECK &&
+ hsr_sup_tag->tlv.HSR_TLV_type != PRP_TLV_LIFE_CHECK_DD &&
+ hsr_sup_tag->tlv.HSR_TLV_type != PRP_TLV_LIFE_CHECK_DA)
+ return false;
+ if (hsr_sup_tag->tlv.HSR_TLV_length != 12 &&
+ hsr_sup_tag->tlv.HSR_TLV_length != sizeof(struct hsr_sup_payload))
+ return false;
+
+ /* Get next tlv */
+ total_length += hsr_sup_tag->tlv.HSR_TLV_length;
+ if (!pskb_may_pull(skb, total_length))
+ return false;
+ skb_pull(skb, total_length);
+ hsr_sup_tlv = (struct hsr_sup_tlv *)skb->data;
+ skb_push(skb, total_length);
+
+ /* if this is a redbox supervision frame we need to verify
+ * that more data is available
+ */
+ if (hsr_sup_tlv->HSR_TLV_type == PRP_TLV_REDBOX_MAC) {
+ /* tlv length must be a length of a mac address */
+ if (hsr_sup_tlv->HSR_TLV_length != sizeof(struct hsr_sup_payload))
+ return false;
+
+ /* make sure another tlv follows */
+ total_length += sizeof(struct hsr_sup_tlv) + hsr_sup_tlv->HSR_TLV_length;
+ if (!pskb_may_pull(skb, total_length))
+ return false;
+
+ /* get next tlv */
+ skb_pull(skb, total_length);
+ hsr_sup_tlv = (struct hsr_sup_tlv *)skb->data;
+ skb_push(skb, total_length);
+ }
+
+ /* end of tlvs must follow at the end */
+ if (hsr_sup_tlv->HSR_TLV_type == HSR_TLV_EOT &&
+ hsr_sup_tlv->HSR_TLV_length != 0)
+ return false;
+
+ return true;
+}
+
+static bool is_proxy_supervision_frame(struct hsr_priv *hsr,
+ struct sk_buff *skb)
+{
+ struct hsr_sup_payload *payload;
+ struct ethhdr *eth_hdr;
+ u16 total_length = 0;
+
+ eth_hdr = (struct ethhdr *)skb_mac_header(skb);
+
+ /* Get the HSR protocol revision. */
+ if (eth_hdr->h_proto == htons(ETH_P_HSR))
+ total_length = sizeof(struct hsrv1_ethhdr_sp);
+ else
+ total_length = sizeof(struct hsrv0_ethhdr_sp);
+
+ if (!pskb_may_pull(skb, total_length + sizeof(struct hsr_sup_payload)))
+ return false;
+
+ skb_pull(skb, total_length);
+ payload = (struct hsr_sup_payload *)skb->data;
+ skb_push(skb, total_length);
+
+ /* For RedBox (HSR-SAN) check if we have received the supervision
+ * frame with MAC addresses from own ProxyNodeTable.
+ */
+ return hsr_is_node_in_db(&hsr->proxy_node_db,
+ payload->macaddress_A);
+}
+
+static struct sk_buff *create_stripped_skb_hsr(struct sk_buff *skb_in,
+ struct hsr_frame_info *frame)
+{
+ struct sk_buff *skb;
+ int copylen;
+ unsigned char *dst, *src;
+
+ skb_pull(skb_in, HSR_HLEN);
+ skb = __pskb_copy(skb_in, skb_headroom(skb_in) - HSR_HLEN, GFP_ATOMIC);
+ skb_push(skb_in, HSR_HLEN);
+ if (!skb)
+ return NULL;
+
+ skb_reset_mac_header(skb);
+
+ if (skb->ip_summed == CHECKSUM_PARTIAL)
+ skb->csum_start -= HSR_HLEN;
+
+ copylen = 2 * ETH_ALEN;
+ if (frame->is_vlan)
+ copylen += VLAN_HLEN;
+ src = skb_mac_header(skb_in);
+ dst = skb_mac_header(skb);
+ memcpy(dst, src, copylen);
+
+ skb->protocol = eth_hdr(skb)->h_proto;
+ return skb;
+}
+
+struct sk_buff *hsr_get_untagged_frame(struct hsr_frame_info *frame,
+ struct hsr_port *port)
+{
+ if (!frame->skb_std) {
+ if (frame->skb_hsr)
+ frame->skb_std =
+ create_stripped_skb_hsr(frame->skb_hsr, frame);
+ else
+ netdev_warn_once(port->dev,
+ "Unexpected frame received in hsr_get_untagged_frame()\n");
+
+ if (!frame->skb_std)
+ return NULL;
+ }
+
+ return skb_clone(frame->skb_std, GFP_ATOMIC);
+}
+
+struct sk_buff *prp_get_untagged_frame(struct hsr_frame_info *frame,
+ struct hsr_port *port)
+{
+ if (!frame->skb_std) {
+ if (frame->skb_prp) {
+ /* trim the skb by len - HSR_HLEN to exclude RCT */
+ skb_trim(frame->skb_prp,
+ frame->skb_prp->len - HSR_HLEN);
+ frame->skb_std =
+ __pskb_copy(frame->skb_prp,
+ skb_headroom(frame->skb_prp),
+ GFP_ATOMIC);
+ } else {
+ /* Unexpected */
+ WARN_ONCE(1, "%s:%d: Unexpected frame received (port_src %s)\n",
+ __FILE__, __LINE__, port->dev->name);
+ return NULL;
+ }
+ }
+
+ return skb_clone(frame->skb_std, GFP_ATOMIC);
+}
+
+static void prp_set_lan_id(struct prp_rct *trailer,
+ struct hsr_port *port)
+{
+ int lane_id;
+
+ if (port->type == HSR_PT_SLAVE_A)
+ lane_id = 0;
+ else
+ lane_id = 1;
+
+ /* Add net_id in the upper 3 bits of lane_id */
+ lane_id |= port->hsr->net_id;
+ set_prp_lan_id(trailer, lane_id);
+}
+
+/* Tailroom for PRP rct should have been created before calling this */
+static struct sk_buff *prp_fill_rct(struct sk_buff *skb,
+ struct hsr_frame_info *frame,
+ struct hsr_port *port)
+{
+ struct prp_rct *trailer;
+ int min_size = ETH_ZLEN;
+ int lsdu_size;
+
+ if (!skb)
+ return skb;
+
+ if (frame->is_vlan)
+ min_size = VLAN_ETH_ZLEN;
+
+ if (skb_put_padto(skb, min_size))
+ return NULL;
+
+ trailer = (struct prp_rct *)skb_put(skb, HSR_HLEN);
+ lsdu_size = skb->len - 14;
+ if (frame->is_vlan)
+ lsdu_size -= 4;
+ prp_set_lan_id(trailer, port);
+ set_prp_LSDU_size(trailer, lsdu_size);
+ trailer->sequence_nr = htons(frame->sequence_nr);
+ trailer->PRP_suffix = htons(ETH_P_PRP);
+ skb->protocol = eth_hdr(skb)->h_proto;
+
+ return skb;
+}
+
+static void hsr_set_path_id(struct hsr_frame_info *frame,
+ struct hsr_ethhdr *hsr_ethhdr,
+ struct hsr_port *port)
+{
+ int path_id;
+
+ if (port->hsr->prot_version) {
+ if (port->type == HSR_PT_SLAVE_A)
+ path_id = 0;
+ else
+ path_id = 1;
+ } else {
+ if (frame->is_supervision)
+ path_id = 0xf;
+ else
+ path_id = 1;
+ }
+
+ set_hsr_tag_path(&hsr_ethhdr->hsr_tag, path_id);
+}
+
+static struct sk_buff *hsr_fill_tag(struct sk_buff *skb,
+ struct hsr_frame_info *frame,
+ struct hsr_port *port, u8 proto_version)
+{
+ struct hsr_ethhdr *hsr_ethhdr;
+ unsigned char *pc;
+ int lsdu_size;
+
+ /* pad to minimum packet size which is 60 + 6 (HSR tag) */
+ if (skb_put_padto(skb, ETH_ZLEN + HSR_HLEN))
+ return NULL;
+
+ lsdu_size = skb->len - 14;
+ if (frame->is_vlan)
+ lsdu_size -= 4;
+
+ pc = skb_mac_header(skb);
+ if (frame->is_vlan)
+ /* This 4-byte shift (size of a vlan tag) does not
+ * mean that the ethhdr starts there. But rather it
+ * provides the proper environment for accessing
+ * the fields, such as hsr_tag etc., just like
+ * when the vlan tag is not there. This is because
+ * the hsr tag is after the vlan tag.
+ */
+ hsr_ethhdr = (struct hsr_ethhdr *)(pc + VLAN_HLEN);
+ else
+ hsr_ethhdr = (struct hsr_ethhdr *)pc;
+
+ hsr_set_path_id(frame, hsr_ethhdr, port);
+ set_hsr_tag_LSDU_size(&hsr_ethhdr->hsr_tag, lsdu_size);
+ hsr_ethhdr->hsr_tag.sequence_nr = htons(frame->sequence_nr);
+ hsr_ethhdr->hsr_tag.encap_proto = hsr_ethhdr->ethhdr.h_proto;
+ hsr_ethhdr->ethhdr.h_proto = htons(proto_version ?
+ ETH_P_HSR : ETH_P_PRP);
+ skb->protocol = hsr_ethhdr->ethhdr.h_proto;
+
+ return skb;
+}
+
+/* If the original frame was an HSR tagged frame, just clone it to be sent
+ * unchanged. Otherwise, create a private frame especially tagged for 'port'.
+ */
+struct sk_buff *hsr_create_tagged_frame(struct hsr_frame_info *frame,
+ struct hsr_port *port)
+{
+ unsigned char *dst, *src;
+ struct sk_buff *skb;
+ int movelen;
+
+ if (frame->skb_hsr) {
+ struct hsr_ethhdr *hsr_ethhdr =
+ (struct hsr_ethhdr *)skb_mac_header(frame->skb_hsr);
+
+ /* set the lane id properly */
+ hsr_set_path_id(frame, hsr_ethhdr, port);
+ return skb_clone(frame->skb_hsr, GFP_ATOMIC);
+ } else if (port->dev->features & NETIF_F_HW_HSR_TAG_INS) {
+ return skb_clone(frame->skb_std, GFP_ATOMIC);
+ }
+
+ /* Create the new skb with enough headroom to fit the HSR tag */
+ skb = __pskb_copy(frame->skb_std,
+ skb_headroom(frame->skb_std) + HSR_HLEN, GFP_ATOMIC);
+ if (!skb)
+ return NULL;
+ skb_reset_mac_header(skb);
+
+ if (skb->ip_summed == CHECKSUM_PARTIAL)
+ skb->csum_start += HSR_HLEN;
+
+ movelen = ETH_HLEN;
+ if (frame->is_vlan)
+ movelen += VLAN_HLEN;
+
+ src = skb_mac_header(skb);
+ dst = skb_push(skb, HSR_HLEN);
+ memmove(dst, src, movelen);
+ skb_reset_mac_header(skb);
+
+ /* skb_put_padto free skb on error and hsr_fill_tag returns NULL in
+ * that case
+ */
+ return hsr_fill_tag(skb, frame, port, port->hsr->prot_version);
+}
+
+struct sk_buff *prp_create_tagged_frame(struct hsr_frame_info *frame,
+ struct hsr_port *port)
+{
+ struct sk_buff *skb;
+
+ if (frame->skb_prp) {
+ struct prp_rct *trailer = skb_get_PRP_rct(frame->skb_prp);
+
+ if (trailer) {
+ prp_set_lan_id(trailer, port);
+ } else {
+ WARN_ONCE(!trailer, "errored PRP skb");
+ return NULL;
+ }
+ return skb_clone(frame->skb_prp, GFP_ATOMIC);
+ } else if (port->dev->features & NETIF_F_HW_HSR_TAG_INS) {
+ return skb_clone(frame->skb_std, GFP_ATOMIC);
+ }
+
+ skb = skb_copy_expand(frame->skb_std, skb_headroom(frame->skb_std),
+ skb_tailroom(frame->skb_std) + HSR_HLEN,
+ GFP_ATOMIC);
+ return prp_fill_rct(skb, frame, port);
+}
+
+static void hsr_deliver_master(struct sk_buff *skb, struct net_device *dev,
+ struct hsr_node *node_src)
+{
+ bool was_multicast_frame;
+ int res, recv_len;
+
+ was_multicast_frame = (skb->pkt_type == PACKET_MULTICAST);
+ hsr_addr_subst_source(node_src, skb);
+ skb_pull(skb, ETH_HLEN);
+ recv_len = skb->len;
+ res = netif_rx(skb);
+ if (res == NET_RX_DROP) {
+ dev->stats.rx_dropped++;
+ } else {
+ dev->stats.rx_packets++;
+ dev->stats.rx_bytes += recv_len;
+ if (was_multicast_frame)
+ dev->stats.multicast++;
+ }
+}
+
+static int hsr_xmit(struct sk_buff *skb, struct hsr_port *port,
+ struct hsr_frame_info *frame)
+{
+ if (frame->port_rcv->type == HSR_PT_MASTER) {
+ hsr_addr_subst_dest(frame->node_src, skb, port);
+
+ /* Address substitution (IEC62439-3 pp 26, 50): replace mac
+ * address of outgoing frame with that of the outgoing slave's.
+ */
+ ether_addr_copy(eth_hdr(skb)->h_source, port->dev->dev_addr);
+ }
+
+ /* When HSR node is used as RedBox - the frame received from HSR ring
+ * requires source MAC address (SA) replacement to one which can be
+ * recognized by SAN devices (otherwise, frames are dropped by switch)
+ */
+ if (port->type == HSR_PT_INTERLINK)
+ ether_addr_copy(eth_hdr(skb)->h_source,
+ port->hsr->macaddress_redbox);
+
+ return dev_queue_xmit(skb);
+}
+
+bool prp_drop_frame(struct hsr_frame_info *frame, struct hsr_port *port)
+{
+ return ((frame->port_rcv->type == HSR_PT_SLAVE_A &&
+ port->type == HSR_PT_SLAVE_B) ||
+ (frame->port_rcv->type == HSR_PT_SLAVE_B &&
+ port->type == HSR_PT_SLAVE_A));
+}
+
+bool hsr_drop_frame(struct hsr_frame_info *frame, struct hsr_port *port)
+{
+ struct sk_buff *skb;
+
+ if (port->dev->features & NETIF_F_HW_HSR_FWD)
+ return prp_drop_frame(frame, port);
+
+ /* RedBox specific frames dropping policies
+ *
+ * Do not send HSR supervisory frames to SAN devices
+ */
+ if (frame->is_supervision && port->type == HSR_PT_INTERLINK)
+ return true;
+
+ /* Do not forward to other HSR port (A or B) unicast frames which
+ * are addressed to interlink port (and are in the ProxyNodeTable).
+ */
+ skb = frame->skb_hsr;
+ if (skb && prp_drop_frame(frame, port) &&
+ is_unicast_ether_addr(eth_hdr(skb)->h_dest) &&
+ hsr_is_node_in_db(&port->hsr->proxy_node_db,
+ eth_hdr(skb)->h_dest)) {
+ return true;
+ }
+
+ /* Do not forward to port C (Interlink) frames from nodes A and B
+ * if DA is in NodeTable.
+ */
+ if ((frame->port_rcv->type == HSR_PT_SLAVE_A ||
+ frame->port_rcv->type == HSR_PT_SLAVE_B) &&
+ port->type == HSR_PT_INTERLINK) {
+ skb = frame->skb_hsr;
+ if (skb && is_unicast_ether_addr(eth_hdr(skb)->h_dest) &&
+ hsr_is_node_in_db(&port->hsr->node_db,
+ eth_hdr(skb)->h_dest)) {
+ return true;
+ }
+ }
+
+ /* Do not forward to port A and B unicast frames received on the
+ * interlink port if it is addressed to one of nodes registered in
+ * the ProxyNodeTable.
+ */
+ if ((port->type == HSR_PT_SLAVE_A || port->type == HSR_PT_SLAVE_B) &&
+ frame->port_rcv->type == HSR_PT_INTERLINK) {
+ skb = frame->skb_std;
+ if (skb && is_unicast_ether_addr(eth_hdr(skb)->h_dest) &&
+ hsr_is_node_in_db(&port->hsr->proxy_node_db,
+ eth_hdr(skb)->h_dest)) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+/* Forward the frame through all devices except:
+ * - Back through the receiving device
+ * - If it's a HSR frame: through a device where it has passed before
+ * - if it's a PRP frame: through another PRP slave device (no bridge)
+ * - To the local HSR master only if the frame is directly addressed to it, or
+ * a non-supervision multicast or broadcast frame.
+ *
+ * HSR slave devices should insert a HSR tag into the frame, or forward the
+ * frame unchanged if it's already tagged. Interlink devices should strip HSR
+ * tags if they're of the non-HSR type (but only after duplicate discard). The
+ * master device always strips HSR tags.
+ */
+static void hsr_forward_do(struct hsr_frame_info *frame)
+{
+ struct hsr_port *port;
+ struct sk_buff *skb;
+ bool sent = false;
+
+ hsr_for_each_port(frame->port_rcv->hsr, port) {
+ struct hsr_priv *hsr = port->hsr;
+ /* Don't send frame back the way it came */
+ if (port == frame->port_rcv)
+ continue;
+
+ /* Don't deliver locally unless we should */
+ if (port->type == HSR_PT_MASTER && !frame->is_local_dest)
+ continue;
+
+ /* Deliver frames directly addressed to us to master only */
+ if (port->type != HSR_PT_MASTER && frame->is_local_exclusive)
+ continue;
+
+ /* If hardware duplicate generation is enabled, only send out
+ * one port.
+ */
+ if ((port->dev->features & NETIF_F_HW_HSR_DUP) && sent)
+ continue;
+
+ /* Don't send frame over port where it has been sent before.
+ * Also for SAN, this shouldn't be done.
+ */
+ if (!frame->is_from_san &&
+ hsr->proto_ops->register_frame_out &&
+ hsr->proto_ops->register_frame_out(port, frame))
+ continue;
+
+ if (frame->is_supervision && port->type == HSR_PT_MASTER &&
+ !frame->is_proxy_supervision) {
+ hsr_handle_sup_frame(frame);
+ continue;
+ }
+
+ /* Check if frame is to be dropped. Eg. for PRP no forward
+ * between ports, or sending HSR supervision to RedBox.
+ */
+ if (hsr->proto_ops->drop_frame &&
+ hsr->proto_ops->drop_frame(frame, port))
+ continue;
+
+ if (port->type == HSR_PT_SLAVE_A ||
+ port->type == HSR_PT_SLAVE_B)
+ skb = hsr->proto_ops->create_tagged_frame(frame, port);
+ else
+ skb = hsr->proto_ops->get_untagged_frame(frame, port);
+
+ if (!skb) {
+ frame->port_rcv->dev->stats.rx_dropped++;
+ continue;
+ }
+
+ skb->dev = port->dev;
+ if (port->type == HSR_PT_MASTER) {
+ hsr_deliver_master(skb, port->dev, frame->node_src);
+ } else {
+ if (!hsr_xmit(skb, port, frame))
+ if (port->type == HSR_PT_SLAVE_A ||
+ port->type == HSR_PT_SLAVE_B)
+ sent = true;
+ }
+ }
+}
+
+static void check_local_dest(struct hsr_priv *hsr, struct sk_buff *skb,
+ struct hsr_frame_info *frame)
+{
+ if (hsr_addr_is_self(hsr, eth_hdr(skb)->h_dest)) {
+ frame->is_local_exclusive = true;
+ skb->pkt_type = PACKET_HOST;
+ } else {
+ frame->is_local_exclusive = false;
+ }
+
+ if (skb->pkt_type == PACKET_HOST ||
+ skb->pkt_type == PACKET_MULTICAST ||
+ skb->pkt_type == PACKET_BROADCAST) {
+ frame->is_local_dest = true;
+ } else {
+ frame->is_local_dest = false;
+ }
+}
+
+static void handle_std_frame(struct sk_buff *skb,
+ struct hsr_frame_info *frame)
+{
+ struct hsr_port *port = frame->port_rcv;
+ struct hsr_priv *hsr = port->hsr;
+
+ frame->skb_hsr = NULL;
+ frame->skb_prp = NULL;
+ frame->skb_std = skb;
+
+ if (port->type != HSR_PT_MASTER)
+ frame->is_from_san = true;
+
+ if (port->type == HSR_PT_MASTER ||
+ port->type == HSR_PT_INTERLINK) {
+ /* Sequence nr for the master/interlink node */
+ lockdep_assert_held(&hsr->seqnr_lock);
+ frame->sequence_nr = hsr->sequence_nr;
+ hsr->sequence_nr++;
+ }
+}
+
+int hsr_fill_frame_info(__be16 proto, struct sk_buff *skb,
+ struct hsr_frame_info *frame)
+{
+ struct hsr_port *port = frame->port_rcv;
+ struct hsr_priv *hsr = port->hsr;
+
+ /* HSRv0 supervisory frames double as a tag so treat them as tagged. */
+ if ((!hsr->prot_version && proto == htons(ETH_P_PRP)) ||
+ proto == htons(ETH_P_HSR)) {
+ /* Check if skb contains hsr_ethhdr */
+ if (skb->mac_len < sizeof(struct hsr_ethhdr))
+ return -EINVAL;
+
+ /* HSR tagged frame :- Data or Supervision */
+ frame->skb_std = NULL;
+ frame->skb_prp = NULL;
+ frame->skb_hsr = skb;
+ frame->sequence_nr = hsr_get_skb_sequence_nr(skb);
+ return 0;
+ }
+
+ /* Standard frame or PRP from master port */
+ handle_std_frame(skb, frame);
+
+ return 0;
+}
+
+int prp_fill_frame_info(__be16 proto, struct sk_buff *skb,
+ struct hsr_frame_info *frame)
+{
+ /* Supervision frame */
+ struct prp_rct *rct = skb_get_PRP_rct(skb);
+
+ if (rct &&
+ prp_check_lsdu_size(skb, rct, frame->is_supervision)) {
+ frame->skb_hsr = NULL;
+ frame->skb_std = NULL;
+ frame->skb_prp = skb;
+ frame->sequence_nr = prp_get_skb_sequence_nr(rct);
+ return 0;
+ }
+ handle_std_frame(skb, frame);
+
+ return 0;
+}
+
+static int fill_frame_info(struct hsr_frame_info *frame,
+ struct sk_buff *skb, struct hsr_port *port)
+{
+ struct hsr_priv *hsr = port->hsr;
+ struct hsr_vlan_ethhdr *vlan_hdr;
+ struct list_head *n_db;
+ struct ethhdr *ethhdr;
+ __be16 proto;
+ int ret;
+
+ /* Check if skb contains ethhdr */
+ if (skb->mac_len < sizeof(struct ethhdr))
+ return -EINVAL;
+
+ memset(frame, 0, sizeof(*frame));
+ frame->is_supervision = is_supervision_frame(port->hsr, skb);
+ if (frame->is_supervision && hsr->redbox)
+ frame->is_proxy_supervision =
+ is_proxy_supervision_frame(port->hsr, skb);
+
+ n_db = &hsr->node_db;
+ if (port->type == HSR_PT_INTERLINK)
+ n_db = &hsr->proxy_node_db;
+
+ frame->node_src = hsr_get_node(port, n_db, skb,
+ frame->is_supervision, port->type);
+ if (!frame->node_src)
+ return -1; /* Unknown node and !is_supervision, or no mem */
+
+ ethhdr = (struct ethhdr *)skb_mac_header(skb);
+ frame->is_vlan = false;
+ proto = ethhdr->h_proto;
+
+ if (proto == htons(ETH_P_8021Q))
+ frame->is_vlan = true;
+
+ if (frame->is_vlan) {
+ /* Note: skb->mac_len might be wrong here. */
+ if (!pskb_may_pull(skb,
+ skb_mac_offset(skb) +
+ offsetofend(struct hsr_vlan_ethhdr, vlanhdr)))
+ return -EINVAL;
+ vlan_hdr = (struct hsr_vlan_ethhdr *)skb_mac_header(skb);
+ proto = vlan_hdr->vlanhdr.h_vlan_encapsulated_proto;
+ }
+
+ frame->is_from_san = false;
+ frame->port_rcv = port;
+ ret = hsr->proto_ops->fill_frame_info(proto, skb, frame);
+ if (ret)
+ return ret;
+
+ check_local_dest(port->hsr, skb, frame);
+
+ return 0;
+}
+
+/* Must be called holding rcu read lock (because of the port parameter) */
+void hsr_forward_skb(struct sk_buff *skb, struct hsr_port *port)
+{
+ struct hsr_frame_info frame;
+
+ rcu_read_lock();
+ if (fill_frame_info(&frame, skb, port) < 0)
+ goto out_drop;
+
+ hsr_register_frame_in(frame.node_src, port, frame.sequence_nr);
+ hsr_forward_do(&frame);
+ rcu_read_unlock();
+ /* Gets called for ingress frames as well as egress from master port.
+ * So check and increment stats for master port only here.
+ */
+ if (port->type == HSR_PT_MASTER || port->type == HSR_PT_INTERLINK) {
+ port->dev->stats.tx_packets++;
+ port->dev->stats.tx_bytes += skb->len;
+ }
+
+ kfree_skb(frame.skb_hsr);
+ kfree_skb(frame.skb_prp);
+ kfree_skb(frame.skb_std);
+ return;
+
+out_drop:
+ rcu_read_unlock();
+ port->dev->stats.tx_dropped++;
+ kfree_skb(skb);
+}
diff --git a/net/hsr/hsr_forward.h b/net/hsr/hsr_forward.h
new file mode 100644
index 000000000000..206636750b30
--- /dev/null
+++ b/net/hsr/hsr_forward.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright 2011-2014 Autronica Fire and Security AS
+ *
+ * Author(s):
+ * 2011-2014 Arvid Brodin, arvid.brodin@alten.se
+ *
+ * include file for HSR and PRP.
+ */
+
+#ifndef __HSR_FORWARD_H
+#define __HSR_FORWARD_H
+
+#include <linux/netdevice.h>
+#include "hsr_main.h"
+
+void hsr_forward_skb(struct sk_buff *skb, struct hsr_port *port);
+struct sk_buff *prp_create_tagged_frame(struct hsr_frame_info *frame,
+ struct hsr_port *port);
+struct sk_buff *hsr_create_tagged_frame(struct hsr_frame_info *frame,
+ struct hsr_port *port);
+struct sk_buff *hsr_get_untagged_frame(struct hsr_frame_info *frame,
+ struct hsr_port *port);
+struct sk_buff *prp_get_untagged_frame(struct hsr_frame_info *frame,
+ struct hsr_port *port);
+bool prp_drop_frame(struct hsr_frame_info *frame, struct hsr_port *port);
+bool hsr_drop_frame(struct hsr_frame_info *frame, struct hsr_port *port);
+int prp_fill_frame_info(__be16 proto, struct sk_buff *skb,
+ struct hsr_frame_info *frame);
+int hsr_fill_frame_info(__be16 proto, struct sk_buff *skb,
+ struct hsr_frame_info *frame);
+#endif /* __HSR_FORWARD_H */
diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c
new file mode 100644
index 000000000000..3a2a2fa7a0a3
--- /dev/null
+++ b/net/hsr/hsr_framereg.c
@@ -0,0 +1,794 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright 2011-2014 Autronica Fire and Security AS
+ *
+ * Author(s):
+ * 2011-2014 Arvid Brodin, arvid.brodin@alten.se
+ *
+ * The HSR spec says never to forward the same frame twice on the same
+ * interface. A frame is identified by its source MAC address and its HSR
+ * sequence number. This code keeps track of senders and their sequence numbers
+ * to allow filtering of duplicate frames, and to detect HSR ring errors.
+ * Same code handles filtering of duplicates for PRP as well.
+ */
+
+#include <linux/if_ether.h>
+#include <linux/etherdevice.h>
+#include <linux/slab.h>
+#include <linux/rculist.h>
+#include "hsr_main.h"
+#include "hsr_framereg.h"
+#include "hsr_netlink.h"
+
+/* seq_nr_after(a, b) - return true if a is after (higher in sequence than) b,
+ * false otherwise.
+ */
+static bool seq_nr_after(u16 a, u16 b)
+{
+ /* Remove inconsistency where
+ * seq_nr_after(a, b) == seq_nr_before(a, b)
+ */
+ if ((int)b - a == 32768)
+ return false;
+
+ return (((s16)(b - a)) < 0);
+}
+
+#define seq_nr_before(a, b) seq_nr_after((b), (a))
+#define seq_nr_before_or_eq(a, b) (!seq_nr_after((a), (b)))
+#define PRP_DROP_WINDOW_LEN 32768
+
+bool hsr_addr_is_redbox(struct hsr_priv *hsr, unsigned char *addr)
+{
+ if (!hsr->redbox || !is_valid_ether_addr(hsr->macaddress_redbox))
+ return false;
+
+ return ether_addr_equal(addr, hsr->macaddress_redbox);
+}
+
+bool hsr_addr_is_self(struct hsr_priv *hsr, unsigned char *addr)
+{
+ struct hsr_self_node *sn;
+ bool ret = false;
+
+ rcu_read_lock();
+ sn = rcu_dereference(hsr->self_node);
+ if (!sn) {
+ WARN_ONCE(1, "HSR: No self node\n");
+ goto out;
+ }
+
+ if (ether_addr_equal(addr, sn->macaddress_A) ||
+ ether_addr_equal(addr, sn->macaddress_B))
+ ret = true;
+out:
+ rcu_read_unlock();
+ return ret;
+}
+
+/* Search for mac entry. Caller must hold rcu read lock.
+ */
+static struct hsr_node *find_node_by_addr_A(struct list_head *node_db,
+ const unsigned char addr[ETH_ALEN])
+{
+ struct hsr_node *node;
+
+ list_for_each_entry_rcu(node, node_db, mac_list) {
+ if (ether_addr_equal(node->macaddress_A, addr))
+ return node;
+ }
+
+ return NULL;
+}
+
+/* Check if node for a given MAC address is already present in data base
+ */
+bool hsr_is_node_in_db(struct list_head *node_db,
+ const unsigned char addr[ETH_ALEN])
+{
+ return !!find_node_by_addr_A(node_db, addr);
+}
+
+/* Helper for device init; the self_node is used in hsr_rcv() to recognize
+ * frames from self that's been looped over the HSR ring.
+ */
+int hsr_create_self_node(struct hsr_priv *hsr,
+ const unsigned char addr_a[ETH_ALEN],
+ const unsigned char addr_b[ETH_ALEN])
+{
+ struct hsr_self_node *sn, *old;
+
+ sn = kmalloc(sizeof(*sn), GFP_KERNEL);
+ if (!sn)
+ return -ENOMEM;
+
+ ether_addr_copy(sn->macaddress_A, addr_a);
+ ether_addr_copy(sn->macaddress_B, addr_b);
+
+ spin_lock_bh(&hsr->list_lock);
+ old = rcu_replace_pointer(hsr->self_node, sn,
+ lockdep_is_held(&hsr->list_lock));
+ spin_unlock_bh(&hsr->list_lock);
+
+ if (old)
+ kfree_rcu(old, rcu_head);
+ return 0;
+}
+
+void hsr_del_self_node(struct hsr_priv *hsr)
+{
+ struct hsr_self_node *old;
+
+ spin_lock_bh(&hsr->list_lock);
+ old = rcu_replace_pointer(hsr->self_node, NULL,
+ lockdep_is_held(&hsr->list_lock));
+ spin_unlock_bh(&hsr->list_lock);
+ if (old)
+ kfree_rcu(old, rcu_head);
+}
+
+void hsr_del_nodes(struct list_head *node_db)
+{
+ struct hsr_node *node;
+ struct hsr_node *tmp;
+
+ list_for_each_entry_safe(node, tmp, node_db, mac_list)
+ kfree(node);
+}
+
+void prp_handle_san_frame(bool san, enum hsr_port_type port,
+ struct hsr_node *node)
+{
+ /* Mark if the SAN node is over LAN_A or LAN_B */
+ if (port == HSR_PT_SLAVE_A) {
+ node->san_a = true;
+ return;
+ }
+
+ if (port == HSR_PT_SLAVE_B)
+ node->san_b = true;
+}
+
+/* Allocate an hsr_node and add it to node_db. 'addr' is the node's address_A;
+ * seq_out is used to initialize filtering of outgoing duplicate frames
+ * originating from the newly added node.
+ */
+static struct hsr_node *hsr_add_node(struct hsr_priv *hsr,
+ struct list_head *node_db,
+ unsigned char addr[],
+ u16 seq_out, bool san,
+ enum hsr_port_type rx_port)
+{
+ struct hsr_node *new_node, *node;
+ unsigned long now;
+ int i;
+
+ new_node = kzalloc(sizeof(*new_node), GFP_ATOMIC);
+ if (!new_node)
+ return NULL;
+
+ ether_addr_copy(new_node->macaddress_A, addr);
+ spin_lock_init(&new_node->seq_out_lock);
+
+ /* We are only interested in time diffs here, so use current jiffies
+ * as initialization. (0 could trigger an spurious ring error warning).
+ */
+ now = jiffies;
+ for (i = 0; i < HSR_PT_PORTS; i++) {
+ new_node->time_in[i] = now;
+ new_node->time_out[i] = now;
+ }
+ for (i = 0; i < HSR_PT_PORTS; i++) {
+ new_node->seq_out[i] = seq_out;
+ new_node->seq_expected[i] = seq_out + 1;
+ new_node->seq_start[i] = seq_out + 1;
+ }
+
+ if (san && hsr->proto_ops->handle_san_frame)
+ hsr->proto_ops->handle_san_frame(san, rx_port, new_node);
+
+ spin_lock_bh(&hsr->list_lock);
+ list_for_each_entry_rcu(node, node_db, mac_list,
+ lockdep_is_held(&hsr->list_lock)) {
+ if (ether_addr_equal(node->macaddress_A, addr))
+ goto out;
+ if (ether_addr_equal(node->macaddress_B, addr))
+ goto out;
+ }
+ list_add_tail_rcu(&new_node->mac_list, node_db);
+ spin_unlock_bh(&hsr->list_lock);
+ return new_node;
+out:
+ spin_unlock_bh(&hsr->list_lock);
+ kfree(new_node);
+ return node;
+}
+
+void prp_update_san_info(struct hsr_node *node, bool is_sup)
+{
+ if (!is_sup)
+ return;
+
+ node->san_a = false;
+ node->san_b = false;
+}
+
+/* Get the hsr_node from which 'skb' was sent.
+ */
+struct hsr_node *hsr_get_node(struct hsr_port *port, struct list_head *node_db,
+ struct sk_buff *skb, bool is_sup,
+ enum hsr_port_type rx_port)
+{
+ struct hsr_priv *hsr = port->hsr;
+ struct hsr_node *node;
+ struct ethhdr *ethhdr;
+ struct prp_rct *rct;
+ bool san = false;
+ u16 seq_out;
+
+ if (!skb_mac_header_was_set(skb))
+ return NULL;
+
+ ethhdr = (struct ethhdr *)skb_mac_header(skb);
+
+ list_for_each_entry_rcu(node, node_db, mac_list) {
+ if (ether_addr_equal(node->macaddress_A, ethhdr->h_source)) {
+ if (hsr->proto_ops->update_san_info)
+ hsr->proto_ops->update_san_info(node, is_sup);
+ return node;
+ }
+ if (ether_addr_equal(node->macaddress_B, ethhdr->h_source)) {
+ if (hsr->proto_ops->update_san_info)
+ hsr->proto_ops->update_san_info(node, is_sup);
+ return node;
+ }
+ }
+
+ /* Check if required node is not in proxy nodes table */
+ list_for_each_entry_rcu(node, &hsr->proxy_node_db, mac_list) {
+ if (ether_addr_equal(node->macaddress_A, ethhdr->h_source)) {
+ if (hsr->proto_ops->update_san_info)
+ hsr->proto_ops->update_san_info(node, is_sup);
+ return node;
+ }
+ }
+
+ /* Everyone may create a node entry, connected node to a HSR/PRP
+ * device.
+ */
+ if (ethhdr->h_proto == htons(ETH_P_PRP) ||
+ ethhdr->h_proto == htons(ETH_P_HSR)) {
+ /* Check if skb contains hsr_ethhdr */
+ if (skb->mac_len < sizeof(struct hsr_ethhdr))
+ return NULL;
+
+ /* Use the existing sequence_nr from the tag as starting point
+ * for filtering duplicate frames.
+ */
+ seq_out = hsr_get_skb_sequence_nr(skb) - 1;
+ } else {
+ rct = skb_get_PRP_rct(skb);
+ if (rct && prp_check_lsdu_size(skb, rct, is_sup)) {
+ seq_out = prp_get_skb_sequence_nr(rct);
+ } else {
+ if (rx_port != HSR_PT_MASTER)
+ san = true;
+ seq_out = HSR_SEQNR_START;
+ }
+ }
+
+ return hsr_add_node(hsr, node_db, ethhdr->h_source, seq_out,
+ san, rx_port);
+}
+
+/* Use the Supervision frame's info about an eventual macaddress_B for merging
+ * nodes that has previously had their macaddress_B registered as a separate
+ * node.
+ */
+void hsr_handle_sup_frame(struct hsr_frame_info *frame)
+{
+ struct hsr_node *node_curr = frame->node_src;
+ struct hsr_port *port_rcv = frame->port_rcv;
+ struct hsr_priv *hsr = port_rcv->hsr;
+ struct hsr_sup_payload *hsr_sp;
+ struct hsr_sup_tlv *hsr_sup_tlv;
+ struct hsr_node *node_real;
+ struct sk_buff *skb = NULL;
+ struct list_head *node_db;
+ struct ethhdr *ethhdr;
+ int i;
+ unsigned int pull_size = 0;
+ unsigned int total_pull_size = 0;
+
+ /* Here either frame->skb_hsr or frame->skb_prp should be
+ * valid as supervision frame always will have protocol
+ * header info.
+ */
+ if (frame->skb_hsr)
+ skb = frame->skb_hsr;
+ else if (frame->skb_prp)
+ skb = frame->skb_prp;
+ else if (frame->skb_std)
+ skb = frame->skb_std;
+ if (!skb)
+ return;
+
+ /* Leave the ethernet header. */
+ pull_size = sizeof(struct ethhdr);
+ skb_pull(skb, pull_size);
+ total_pull_size += pull_size;
+
+ ethhdr = (struct ethhdr *)skb_mac_header(skb);
+
+ /* And leave the HSR tag. */
+ if (ethhdr->h_proto == htons(ETH_P_HSR)) {
+ pull_size = sizeof(struct hsr_tag);
+ skb_pull(skb, pull_size);
+ total_pull_size += pull_size;
+ }
+
+ /* And leave the HSR sup tag. */
+ pull_size = sizeof(struct hsr_sup_tag);
+ skb_pull(skb, pull_size);
+ total_pull_size += pull_size;
+
+ /* get HSR sup payload */
+ hsr_sp = (struct hsr_sup_payload *)skb->data;
+
+ /* Merge node_curr (registered on macaddress_B) into node_real */
+ node_db = &port_rcv->hsr->node_db;
+ node_real = find_node_by_addr_A(node_db, hsr_sp->macaddress_A);
+ if (!node_real)
+ /* No frame received from AddrA of this node yet */
+ node_real = hsr_add_node(hsr, node_db, hsr_sp->macaddress_A,
+ HSR_SEQNR_START - 1, true,
+ port_rcv->type);
+ if (!node_real)
+ goto done; /* No mem */
+ if (node_real == node_curr)
+ /* Node has already been merged */
+ goto done;
+
+ /* Leave the first HSR sup payload. */
+ pull_size = sizeof(struct hsr_sup_payload);
+ skb_pull(skb, pull_size);
+ total_pull_size += pull_size;
+
+ /* Get second supervision tlv */
+ hsr_sup_tlv = (struct hsr_sup_tlv *)skb->data;
+ /* And check if it is a redbox mac TLV */
+ if (hsr_sup_tlv->HSR_TLV_type == PRP_TLV_REDBOX_MAC) {
+ /* We could stop here after pushing hsr_sup_payload,
+ * or proceed and allow macaddress_B and for redboxes.
+ */
+ /* Sanity check length */
+ if (hsr_sup_tlv->HSR_TLV_length != 6)
+ goto done;
+
+ /* Leave the second HSR sup tlv. */
+ pull_size = sizeof(struct hsr_sup_tlv);
+ skb_pull(skb, pull_size);
+ total_pull_size += pull_size;
+
+ /* Get redbox mac address. */
+ hsr_sp = (struct hsr_sup_payload *)skb->data;
+
+ /* Check if redbox mac and node mac are equal. */
+ if (!ether_addr_equal(node_real->macaddress_A, hsr_sp->macaddress_A)) {
+ /* This is a redbox supervision frame for a VDAN! */
+ goto done;
+ }
+ }
+
+ ether_addr_copy(node_real->macaddress_B, ethhdr->h_source);
+ spin_lock_bh(&node_real->seq_out_lock);
+ for (i = 0; i < HSR_PT_PORTS; i++) {
+ if (!node_curr->time_in_stale[i] &&
+ time_after(node_curr->time_in[i], node_real->time_in[i])) {
+ node_real->time_in[i] = node_curr->time_in[i];
+ node_real->time_in_stale[i] =
+ node_curr->time_in_stale[i];
+ }
+ if (seq_nr_after(node_curr->seq_out[i], node_real->seq_out[i]))
+ node_real->seq_out[i] = node_curr->seq_out[i];
+ }
+ spin_unlock_bh(&node_real->seq_out_lock);
+ node_real->addr_B_port = port_rcv->type;
+
+ spin_lock_bh(&hsr->list_lock);
+ if (!node_curr->removed) {
+ list_del_rcu(&node_curr->mac_list);
+ node_curr->removed = true;
+ kfree_rcu(node_curr, rcu_head);
+ }
+ spin_unlock_bh(&hsr->list_lock);
+
+done:
+ /* Push back here */
+ skb_push(skb, total_pull_size);
+}
+
+/* 'skb' is a frame meant for this host, that is to be passed to upper layers.
+ *
+ * If the frame was sent by a node's B interface, replace the source
+ * address with that node's "official" address (macaddress_A) so that upper
+ * layers recognize where it came from.
+ */
+void hsr_addr_subst_source(struct hsr_node *node, struct sk_buff *skb)
+{
+ if (!skb_mac_header_was_set(skb)) {
+ WARN_ONCE(1, "%s: Mac header not set\n", __func__);
+ return;
+ }
+
+ memcpy(&eth_hdr(skb)->h_source, node->macaddress_A, ETH_ALEN);
+}
+
+/* 'skb' is a frame meant for another host.
+ * 'port' is the outgoing interface
+ *
+ * Substitute the target (dest) MAC address if necessary, so the it matches the
+ * recipient interface MAC address, regardless of whether that is the
+ * recipient's A or B interface.
+ * This is needed to keep the packets flowing through switches that learn on
+ * which "side" the different interfaces are.
+ */
+void hsr_addr_subst_dest(struct hsr_node *node_src, struct sk_buff *skb,
+ struct hsr_port *port)
+{
+ struct hsr_node *node_dst;
+
+ if (!skb_mac_header_was_set(skb)) {
+ WARN_ONCE(1, "%s: Mac header not set\n", __func__);
+ return;
+ }
+
+ if (!is_unicast_ether_addr(eth_hdr(skb)->h_dest))
+ return;
+
+ node_dst = find_node_by_addr_A(&port->hsr->node_db,
+ eth_hdr(skb)->h_dest);
+ if (!node_dst && port->hsr->redbox)
+ node_dst = find_node_by_addr_A(&port->hsr->proxy_node_db,
+ eth_hdr(skb)->h_dest);
+
+ if (!node_dst) {
+ if (port->hsr->prot_version != PRP_V1 && net_ratelimit())
+ netdev_err(skb->dev, "%s: Unknown node\n", __func__);
+ return;
+ }
+ if (port->type != node_dst->addr_B_port)
+ return;
+
+ if (is_valid_ether_addr(node_dst->macaddress_B))
+ ether_addr_copy(eth_hdr(skb)->h_dest, node_dst->macaddress_B);
+}
+
+void hsr_register_frame_in(struct hsr_node *node, struct hsr_port *port,
+ u16 sequence_nr)
+{
+ /* Don't register incoming frames without a valid sequence number. This
+ * ensures entries of restarted nodes gets pruned so that they can
+ * re-register and resume communications.
+ */
+ if (!(port->dev->features & NETIF_F_HW_HSR_TAG_RM) &&
+ seq_nr_before(sequence_nr, node->seq_out[port->type]))
+ return;
+
+ node->time_in[port->type] = jiffies;
+ node->time_in_stale[port->type] = false;
+}
+
+/* 'skb' is a HSR Ethernet frame (with a HSR tag inserted), with a valid
+ * ethhdr->h_source address and skb->mac_header set.
+ *
+ * Return:
+ * 1 if frame can be shown to have been sent recently on this interface,
+ * 0 otherwise, or
+ * negative error code on error
+ */
+int hsr_register_frame_out(struct hsr_port *port, struct hsr_frame_info *frame)
+{
+ struct hsr_node *node = frame->node_src;
+ u16 sequence_nr = frame->sequence_nr;
+
+ spin_lock_bh(&node->seq_out_lock);
+ if (seq_nr_before_or_eq(sequence_nr, node->seq_out[port->type]) &&
+ time_is_after_jiffies(node->time_out[port->type] +
+ msecs_to_jiffies(HSR_ENTRY_FORGET_TIME))) {
+ spin_unlock_bh(&node->seq_out_lock);
+ return 1;
+ }
+
+ node->time_out[port->type] = jiffies;
+ node->seq_out[port->type] = sequence_nr;
+ spin_unlock_bh(&node->seq_out_lock);
+ return 0;
+}
+
+/* Adaptation of the PRP duplicate discard algorithm described in wireshark
+ * wiki (https://wiki.wireshark.org/PRP)
+ *
+ * A drop window is maintained for both LANs with start sequence set to the
+ * first sequence accepted on the LAN that has not been seen on the other LAN,
+ * and expected sequence set to the latest received sequence number plus one.
+ *
+ * When a frame is received on either LAN it is compared against the received
+ * frames on the other LAN. If it is outside the drop window of the other LAN
+ * the frame is accepted and the drop window is updated.
+ * The drop window for the other LAN is reset.
+ *
+ * 'port' is the outgoing interface
+ * 'frame' is the frame to be sent
+ *
+ * Return:
+ * 1 if frame can be shown to have been sent recently on this interface,
+ * 0 otherwise
+ */
+int prp_register_frame_out(struct hsr_port *port, struct hsr_frame_info *frame)
+{
+ enum hsr_port_type other_port;
+ enum hsr_port_type rcv_port;
+ struct hsr_node *node;
+ u16 sequence_diff;
+ u16 sequence_exp;
+ u16 sequence_nr;
+
+ /* out-going frames are always in order
+ * and can be checked the same way as for HSR
+ */
+ if (frame->port_rcv->type == HSR_PT_MASTER)
+ return hsr_register_frame_out(port, frame);
+
+ /* for PRP we should only forward frames from the slave ports
+ * to the master port
+ */
+ if (port->type != HSR_PT_MASTER)
+ return 1;
+
+ node = frame->node_src;
+ sequence_nr = frame->sequence_nr;
+ sequence_exp = sequence_nr + 1;
+ rcv_port = frame->port_rcv->type;
+ other_port = rcv_port == HSR_PT_SLAVE_A ? HSR_PT_SLAVE_B :
+ HSR_PT_SLAVE_A;
+
+ spin_lock_bh(&node->seq_out_lock);
+ if (time_is_before_jiffies(node->time_out[port->type] +
+ msecs_to_jiffies(HSR_ENTRY_FORGET_TIME)) ||
+ (node->seq_start[rcv_port] == node->seq_expected[rcv_port] &&
+ node->seq_start[other_port] == node->seq_expected[other_port])) {
+ /* the node hasn't been sending for a while
+ * or both drop windows are empty, forward the frame
+ */
+ node->seq_start[rcv_port] = sequence_nr;
+ } else if (seq_nr_before(sequence_nr, node->seq_expected[other_port]) &&
+ seq_nr_before_or_eq(node->seq_start[other_port], sequence_nr)) {
+ /* drop the frame, update the drop window for the other port
+ * and reset our drop window
+ */
+ node->seq_start[other_port] = sequence_exp;
+ node->seq_expected[rcv_port] = sequence_exp;
+ node->seq_start[rcv_port] = node->seq_expected[rcv_port];
+ spin_unlock_bh(&node->seq_out_lock);
+ return 1;
+ }
+
+ /* update the drop window for the port where this frame was received
+ * and clear the drop window for the other port
+ */
+ node->seq_start[other_port] = node->seq_expected[other_port];
+ node->seq_expected[rcv_port] = sequence_exp;
+ sequence_diff = sequence_exp - node->seq_start[rcv_port];
+ if (sequence_diff > PRP_DROP_WINDOW_LEN)
+ node->seq_start[rcv_port] = sequence_exp - PRP_DROP_WINDOW_LEN;
+
+ node->time_out[port->type] = jiffies;
+ node->seq_out[port->type] = sequence_nr;
+ spin_unlock_bh(&node->seq_out_lock);
+ return 0;
+}
+
+#if IS_MODULE(CONFIG_PRP_DUP_DISCARD_KUNIT_TEST)
+EXPORT_SYMBOL(prp_register_frame_out);
+#endif
+
+static struct hsr_port *get_late_port(struct hsr_priv *hsr,
+ struct hsr_node *node)
+{
+ if (node->time_in_stale[HSR_PT_SLAVE_A])
+ return hsr_port_get_hsr(hsr, HSR_PT_SLAVE_A);
+ if (node->time_in_stale[HSR_PT_SLAVE_B])
+ return hsr_port_get_hsr(hsr, HSR_PT_SLAVE_B);
+
+ if (time_after(node->time_in[HSR_PT_SLAVE_B],
+ node->time_in[HSR_PT_SLAVE_A] +
+ msecs_to_jiffies(MAX_SLAVE_DIFF)))
+ return hsr_port_get_hsr(hsr, HSR_PT_SLAVE_A);
+ if (time_after(node->time_in[HSR_PT_SLAVE_A],
+ node->time_in[HSR_PT_SLAVE_B] +
+ msecs_to_jiffies(MAX_SLAVE_DIFF)))
+ return hsr_port_get_hsr(hsr, HSR_PT_SLAVE_B);
+
+ return NULL;
+}
+
+/* Remove stale sequence_nr records. Called by timer every
+ * HSR_LIFE_CHECK_INTERVAL (two seconds or so).
+ */
+void hsr_prune_nodes(struct timer_list *t)
+{
+ struct hsr_priv *hsr = timer_container_of(hsr, t, prune_timer);
+ struct hsr_node *node;
+ struct hsr_node *tmp;
+ struct hsr_port *port;
+ unsigned long timestamp;
+ unsigned long time_a, time_b;
+
+ spin_lock_bh(&hsr->list_lock);
+ list_for_each_entry_safe(node, tmp, &hsr->node_db, mac_list) {
+ /* Don't prune own node. Neither time_in[HSR_PT_SLAVE_A]
+ * nor time_in[HSR_PT_SLAVE_B], will ever be updated for
+ * the master port. Thus the master node will be repeatedly
+ * pruned leading to packet loss.
+ */
+ if (hsr_addr_is_self(hsr, node->macaddress_A))
+ continue;
+
+ /* Shorthand */
+ time_a = node->time_in[HSR_PT_SLAVE_A];
+ time_b = node->time_in[HSR_PT_SLAVE_B];
+
+ /* Check for timestamps old enough to risk wrap-around */
+ if (time_after(jiffies, time_a + MAX_JIFFY_OFFSET / 2))
+ node->time_in_stale[HSR_PT_SLAVE_A] = true;
+ if (time_after(jiffies, time_b + MAX_JIFFY_OFFSET / 2))
+ node->time_in_stale[HSR_PT_SLAVE_B] = true;
+
+ /* Get age of newest frame from node.
+ * At least one time_in is OK here; nodes get pruned long
+ * before both time_ins can get stale
+ */
+ timestamp = time_a;
+ if (node->time_in_stale[HSR_PT_SLAVE_A] ||
+ (!node->time_in_stale[HSR_PT_SLAVE_B] &&
+ time_after(time_b, time_a)))
+ timestamp = time_b;
+
+ /* Warn of ring error only as long as we get frames at all */
+ if (time_is_after_jiffies(timestamp +
+ msecs_to_jiffies(1.5 * MAX_SLAVE_DIFF))) {
+ rcu_read_lock();
+ port = get_late_port(hsr, node);
+ if (port)
+ hsr_nl_ringerror(hsr, node->macaddress_A, port);
+ rcu_read_unlock();
+ }
+
+ /* Prune old entries */
+ if (time_is_before_jiffies(timestamp +
+ msecs_to_jiffies(HSR_NODE_FORGET_TIME))) {
+ hsr_nl_nodedown(hsr, node->macaddress_A);
+ if (!node->removed) {
+ list_del_rcu(&node->mac_list);
+ node->removed = true;
+ /* Note that we need to free this entry later: */
+ kfree_rcu(node, rcu_head);
+ }
+ }
+ }
+ spin_unlock_bh(&hsr->list_lock);
+
+ /* Restart timer */
+ mod_timer(&hsr->prune_timer,
+ jiffies + msecs_to_jiffies(PRUNE_PERIOD));
+}
+
+void hsr_prune_proxy_nodes(struct timer_list *t)
+{
+ struct hsr_priv *hsr = timer_container_of(hsr, t, prune_proxy_timer);
+ unsigned long timestamp;
+ struct hsr_node *node;
+ struct hsr_node *tmp;
+
+ spin_lock_bh(&hsr->list_lock);
+ list_for_each_entry_safe(node, tmp, &hsr->proxy_node_db, mac_list) {
+ /* Don't prune RedBox node. */
+ if (hsr_addr_is_redbox(hsr, node->macaddress_A))
+ continue;
+
+ timestamp = node->time_in[HSR_PT_INTERLINK];
+
+ /* Prune old entries */
+ if (time_is_before_jiffies(timestamp +
+ msecs_to_jiffies(HSR_PROXY_NODE_FORGET_TIME))) {
+ hsr_nl_nodedown(hsr, node->macaddress_A);
+ if (!node->removed) {
+ list_del_rcu(&node->mac_list);
+ node->removed = true;
+ /* Note that we need to free this entry later: */
+ kfree_rcu(node, rcu_head);
+ }
+ }
+ }
+
+ spin_unlock_bh(&hsr->list_lock);
+
+ /* Restart timer */
+ mod_timer(&hsr->prune_proxy_timer,
+ jiffies + msecs_to_jiffies(PRUNE_PROXY_PERIOD));
+}
+
+void *hsr_get_next_node(struct hsr_priv *hsr, void *_pos,
+ unsigned char addr[ETH_ALEN])
+{
+ struct hsr_node *node;
+
+ if (!_pos) {
+ node = list_first_or_null_rcu(&hsr->node_db,
+ struct hsr_node, mac_list);
+ if (node)
+ ether_addr_copy(addr, node->macaddress_A);
+ return node;
+ }
+
+ node = _pos;
+ list_for_each_entry_continue_rcu(node, &hsr->node_db, mac_list) {
+ ether_addr_copy(addr, node->macaddress_A);
+ return node;
+ }
+
+ return NULL;
+}
+
+int hsr_get_node_data(struct hsr_priv *hsr,
+ const unsigned char *addr,
+ unsigned char addr_b[ETH_ALEN],
+ unsigned int *addr_b_ifindex,
+ int *if1_age,
+ u16 *if1_seq,
+ int *if2_age,
+ u16 *if2_seq)
+{
+ struct hsr_node *node;
+ struct hsr_port *port;
+ unsigned long tdiff;
+
+ node = find_node_by_addr_A(&hsr->node_db, addr);
+ if (!node)
+ return -ENOENT;
+
+ ether_addr_copy(addr_b, node->macaddress_B);
+
+ tdiff = jiffies - node->time_in[HSR_PT_SLAVE_A];
+ if (node->time_in_stale[HSR_PT_SLAVE_A])
+ *if1_age = INT_MAX;
+#if HZ <= MSEC_PER_SEC
+ else if (tdiff > msecs_to_jiffies(INT_MAX))
+ *if1_age = INT_MAX;
+#endif
+ else
+ *if1_age = jiffies_to_msecs(tdiff);
+
+ tdiff = jiffies - node->time_in[HSR_PT_SLAVE_B];
+ if (node->time_in_stale[HSR_PT_SLAVE_B])
+ *if2_age = INT_MAX;
+#if HZ <= MSEC_PER_SEC
+ else if (tdiff > msecs_to_jiffies(INT_MAX))
+ *if2_age = INT_MAX;
+#endif
+ else
+ *if2_age = jiffies_to_msecs(tdiff);
+
+ /* Present sequence numbers as if they were incoming on interface */
+ *if1_seq = node->seq_out[HSR_PT_SLAVE_B];
+ *if2_seq = node->seq_out[HSR_PT_SLAVE_A];
+
+ if (node->addr_B_port != HSR_PT_NONE) {
+ port = hsr_port_get_hsr(hsr, node->addr_B_port);
+ *addr_b_ifindex = port->dev->ifindex;
+ } else {
+ *addr_b_ifindex = -1;
+ }
+
+ return 0;
+}
diff --git a/net/hsr/hsr_framereg.h b/net/hsr/hsr_framereg.h
new file mode 100644
index 000000000000..b04948659d84
--- /dev/null
+++ b/net/hsr/hsr_framereg.h
@@ -0,0 +1,99 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright 2011-2014 Autronica Fire and Security AS
+ *
+ * Author(s):
+ * 2011-2014 Arvid Brodin, arvid.brodin@alten.se
+ *
+ * include file for HSR and PRP.
+ */
+
+#ifndef __HSR_FRAMEREG_H
+#define __HSR_FRAMEREG_H
+
+#include "hsr_main.h"
+
+struct hsr_node;
+
+struct hsr_frame_info {
+ struct sk_buff *skb_std;
+ struct sk_buff *skb_hsr;
+ struct sk_buff *skb_prp;
+ struct hsr_port *port_rcv;
+ struct hsr_node *node_src;
+ u16 sequence_nr;
+ bool is_supervision;
+ bool is_proxy_supervision;
+ bool is_vlan;
+ bool is_local_dest;
+ bool is_local_exclusive;
+ bool is_from_san;
+};
+
+void hsr_del_self_node(struct hsr_priv *hsr);
+void hsr_del_nodes(struct list_head *node_db);
+struct hsr_node *hsr_get_node(struct hsr_port *port, struct list_head *node_db,
+ struct sk_buff *skb, bool is_sup,
+ enum hsr_port_type rx_port);
+void hsr_handle_sup_frame(struct hsr_frame_info *frame);
+bool hsr_addr_is_self(struct hsr_priv *hsr, unsigned char *addr);
+bool hsr_addr_is_redbox(struct hsr_priv *hsr, unsigned char *addr);
+
+void hsr_addr_subst_source(struct hsr_node *node, struct sk_buff *skb);
+void hsr_addr_subst_dest(struct hsr_node *node_src, struct sk_buff *skb,
+ struct hsr_port *port);
+
+void hsr_register_frame_in(struct hsr_node *node, struct hsr_port *port,
+ u16 sequence_nr);
+int hsr_register_frame_out(struct hsr_port *port, struct hsr_frame_info *frame);
+
+void hsr_prune_nodes(struct timer_list *t);
+void hsr_prune_proxy_nodes(struct timer_list *t);
+
+int hsr_create_self_node(struct hsr_priv *hsr,
+ const unsigned char addr_a[ETH_ALEN],
+ const unsigned char addr_b[ETH_ALEN]);
+
+void *hsr_get_next_node(struct hsr_priv *hsr, void *_pos,
+ unsigned char addr[ETH_ALEN]);
+
+int hsr_get_node_data(struct hsr_priv *hsr,
+ const unsigned char *addr,
+ unsigned char addr_b[ETH_ALEN],
+ unsigned int *addr_b_ifindex,
+ int *if1_age,
+ u16 *if1_seq,
+ int *if2_age,
+ u16 *if2_seq);
+
+void prp_handle_san_frame(bool san, enum hsr_port_type port,
+ struct hsr_node *node);
+void prp_update_san_info(struct hsr_node *node, bool is_sup);
+
+bool hsr_is_node_in_db(struct list_head *node_db,
+ const unsigned char addr[ETH_ALEN]);
+
+int prp_register_frame_out(struct hsr_port *port, struct hsr_frame_info *frame);
+
+struct hsr_node {
+ struct list_head mac_list;
+ /* Protect R/W access to seq_out */
+ spinlock_t seq_out_lock;
+ unsigned char macaddress_A[ETH_ALEN];
+ unsigned char macaddress_B[ETH_ALEN];
+ /* Local slave through which AddrB frames are received from this node */
+ enum hsr_port_type addr_B_port;
+ unsigned long time_in[HSR_PT_PORTS];
+ bool time_in_stale[HSR_PT_PORTS];
+ unsigned long time_out[HSR_PT_PORTS];
+ /* if the node is a SAN */
+ bool san_a;
+ bool san_b;
+ u16 seq_out[HSR_PT_PORTS];
+ bool removed;
+ /* PRP specific duplicate handling */
+ u16 seq_expected[HSR_PT_PORTS];
+ u16 seq_start[HSR_PT_PORTS];
+ struct rcu_head rcu_head;
+};
+
+#endif /* __HSR_FRAMEREG_H */
diff --git a/net/hsr/hsr_main.c b/net/hsr/hsr_main.c
new file mode 100644
index 000000000000..bc94b07101d8
--- /dev/null
+++ b/net/hsr/hsr_main.c
@@ -0,0 +1,187 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright 2011-2014 Autronica Fire and Security AS
+ *
+ * Author(s):
+ * 2011-2014 Arvid Brodin, arvid.brodin@alten.se
+ *
+ * Event handling for HSR and PRP devices.
+ */
+
+#include <linux/netdevice.h>
+#include <net/rtnetlink.h>
+#include <linux/rculist.h>
+#include <linux/timer.h>
+#include <linux/etherdevice.h>
+#include "hsr_main.h"
+#include "hsr_device.h"
+#include "hsr_netlink.h"
+#include "hsr_framereg.h"
+#include "hsr_slave.h"
+
+static bool hsr_slave_empty(struct hsr_priv *hsr)
+{
+ struct hsr_port *port;
+
+ hsr_for_each_port_rtnl(hsr, port)
+ if (port->type != HSR_PT_MASTER)
+ return false;
+ return true;
+}
+
+static int hsr_netdev_notify(struct notifier_block *nb, unsigned long event,
+ void *ptr)
+{
+ struct hsr_port *port, *master;
+ struct net_device *dev;
+ struct hsr_priv *hsr;
+ LIST_HEAD(list_kill);
+ int mtu_max;
+ int res;
+
+ dev = netdev_notifier_info_to_dev(ptr);
+ port = hsr_port_get_rtnl(dev);
+ if (!port) {
+ if (!is_hsr_master(dev))
+ return NOTIFY_DONE; /* Not an HSR device */
+ hsr = netdev_priv(dev);
+ port = hsr_port_get_hsr(hsr, HSR_PT_MASTER);
+ if (!port) {
+ /* Resend of notification concerning removed device? */
+ return NOTIFY_DONE;
+ }
+ } else {
+ hsr = port->hsr;
+ }
+
+ switch (event) {
+ case NETDEV_UP: /* Administrative state DOWN */
+ case NETDEV_DOWN: /* Administrative state UP */
+ case NETDEV_CHANGE: /* Link (carrier) state changes */
+ hsr_check_carrier_and_operstate(hsr);
+ break;
+ case NETDEV_CHANGENAME:
+ if (is_hsr_master(dev))
+ hsr_debugfs_rename(dev);
+ break;
+ case NETDEV_CHANGEADDR:
+ if (port->type == HSR_PT_MASTER) {
+ /* This should not happen since there's no
+ * ndo_set_mac_address() for HSR devices - i.e. not
+ * supported.
+ */
+ break;
+ }
+
+ master = hsr_port_get_hsr(hsr, HSR_PT_MASTER);
+
+ if (port->type == HSR_PT_SLAVE_A) {
+ eth_hw_addr_set(master->dev, dev->dev_addr);
+ call_netdevice_notifiers(NETDEV_CHANGEADDR,
+ master->dev);
+
+ if (hsr->prot_version == PRP_V1) {
+ port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_B);
+ if (port) {
+ eth_hw_addr_set(port->dev, dev->dev_addr);
+ call_netdevice_notifiers(NETDEV_CHANGEADDR,
+ port->dev);
+ }
+ }
+ }
+
+ /* Make sure we recognize frames from ourselves in hsr_rcv() */
+ port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_B);
+ res = hsr_create_self_node(hsr,
+ master->dev->dev_addr,
+ port ?
+ port->dev->dev_addr :
+ master->dev->dev_addr);
+ if (res)
+ netdev_warn(master->dev,
+ "Could not update HSR node address.\n");
+ break;
+ case NETDEV_CHANGEMTU:
+ if (port->type == HSR_PT_MASTER)
+ break; /* Handled in ndo_change_mtu() */
+ mtu_max = hsr_get_max_mtu(port->hsr);
+ master = hsr_port_get_hsr(port->hsr, HSR_PT_MASTER);
+ WRITE_ONCE(master->dev->mtu, mtu_max);
+ break;
+ case NETDEV_UNREGISTER:
+ if (!is_hsr_master(dev)) {
+ master = hsr_port_get_hsr(port->hsr, HSR_PT_MASTER);
+ hsr_del_port(port);
+ if (hsr_slave_empty(master->hsr)) {
+ const struct rtnl_link_ops *ops;
+
+ ops = master->dev->rtnl_link_ops;
+ ops->dellink(master->dev, &list_kill);
+ unregister_netdevice_many(&list_kill);
+ }
+ }
+ break;
+ case NETDEV_PRE_TYPE_CHANGE:
+ /* HSR works only on Ethernet devices. Refuse slave to change
+ * its type.
+ */
+ return NOTIFY_BAD;
+ }
+
+ return NOTIFY_DONE;
+}
+
+struct hsr_port *hsr_port_get_hsr(struct hsr_priv *hsr, enum hsr_port_type pt)
+{
+ struct hsr_port *port;
+
+ hsr_for_each_port_rtnl(hsr, port)
+ if (port->type == pt)
+ return port;
+ return NULL;
+}
+
+int hsr_get_version(struct net_device *dev, enum hsr_version *ver)
+{
+ struct hsr_priv *hsr;
+
+ hsr = netdev_priv(dev);
+ *ver = hsr->prot_version;
+
+ return 0;
+}
+EXPORT_SYMBOL(hsr_get_version);
+
+static struct notifier_block hsr_nb = {
+ .notifier_call = hsr_netdev_notify, /* Slave event notifications */
+};
+
+static int __init hsr_init(void)
+{
+ int err;
+
+ BUILD_BUG_ON(sizeof(struct hsr_tag) != HSR_HLEN);
+
+ err = register_netdevice_notifier(&hsr_nb);
+ if (err)
+ return err;
+
+ err = hsr_netlink_init();
+ if (err) {
+ unregister_netdevice_notifier(&hsr_nb);
+ return err;
+ }
+
+ return 0;
+}
+
+static void __exit hsr_exit(void)
+{
+ hsr_netlink_exit();
+ hsr_debugfs_remove_root();
+ unregister_netdevice_notifier(&hsr_nb);
+}
+
+module_init(hsr_init);
+module_exit(hsr_exit);
+MODULE_DESCRIPTION("High-availability Seamless Redundancy (HSR) driver");
+MODULE_LICENSE("GPL");
diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h
new file mode 100644
index 000000000000..33b0d2460c9b
--- /dev/null
+++ b/net/hsr/hsr_main.h
@@ -0,0 +1,300 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright 2011-2014 Autronica Fire and Security AS
+ *
+ * Author(s):
+ * 2011-2014 Arvid Brodin, arvid.brodin@alten.se
+ *
+ * include file for HSR and PRP.
+ */
+
+#ifndef __HSR_PRIVATE_H
+#define __HSR_PRIVATE_H
+
+#include <linux/netdevice.h>
+#include <linux/list.h>
+#include <linux/if_vlan.h>
+#include <linux/if_hsr.h>
+
+/* Time constants as specified in the HSR specification (IEC-62439-3 2010)
+ * Table 8.
+ * All values in milliseconds.
+ */
+#define HSR_LIFE_CHECK_INTERVAL 2000 /* ms */
+#define HSR_NODE_FORGET_TIME 60000 /* ms */
+#define HSR_PROXY_NODE_FORGET_TIME 60000 /* ms */
+#define HSR_ANNOUNCE_INTERVAL 100 /* ms */
+#define HSR_ENTRY_FORGET_TIME 400 /* ms */
+
+/* By how much may slave1 and slave2 timestamps of latest received frame from
+ * each node differ before we notify of communication problem?
+ */
+#define MAX_SLAVE_DIFF 3000 /* ms */
+#define HSR_SEQNR_START (USHRT_MAX - 1024)
+#define HSR_SUP_SEQNR_START (HSR_SEQNR_START / 2)
+
+/* How often shall we check for broken ring and remove node entries older than
+ * HSR_NODE_FORGET_TIME?
+ */
+#define PRUNE_PERIOD 3000 /* ms */
+#define PRUNE_PROXY_PERIOD 3000 /* ms */
+#define HSR_TLV_EOT 0 /* End of TLVs */
+#define HSR_TLV_ANNOUNCE 22
+#define HSR_TLV_LIFE_CHECK 23
+/* PRP V1 life check for Duplicate discard */
+#define PRP_TLV_LIFE_CHECK_DD 20
+/* PRP V1 life check for Duplicate Accept */
+#define PRP_TLV_LIFE_CHECK_DA 21
+/* PRP V1 life redundancy box MAC address */
+#define PRP_TLV_REDBOX_MAC 30
+
+#define HSR_V1_SUP_LSDUSIZE 52
+
+/* The helper functions below assumes that 'path' occupies the 4 most
+ * significant bits of the 16-bit field shared by 'path' and 'LSDU_size' (or
+ * equivalently, the 4 most significant bits of HSR tag byte 14).
+ *
+ * This is unclear in the IEC specification; its definition of MAC addresses
+ * indicates the spec is written with the least significant bit first (to the
+ * left). This, however, would mean that the LSDU field would be split in two
+ * with the path field in-between, which seems strange. I'm guessing the MAC
+ * address definition is in error.
+ */
+
+static inline void set_hsr_tag_path(struct hsr_tag *ht, u16 path)
+{
+ ht->path_and_LSDU_size =
+ htons((ntohs(ht->path_and_LSDU_size) & 0x0FFF) | (path << 12));
+}
+
+static inline void set_hsr_tag_LSDU_size(struct hsr_tag *ht, u16 LSDU_size)
+{
+ ht->path_and_LSDU_size = htons((ntohs(ht->path_and_LSDU_size) &
+ 0xF000) | (LSDU_size & 0x0FFF));
+}
+
+struct hsr_ethhdr {
+ struct ethhdr ethhdr;
+ struct hsr_tag hsr_tag;
+} __packed;
+
+struct hsr_vlan_ethhdr {
+ struct vlan_ethhdr vlanhdr;
+ struct hsr_tag hsr_tag;
+} __packed;
+
+struct hsr_sup_tlv {
+ u8 HSR_TLV_type;
+ u8 HSR_TLV_length;
+} __packed;
+
+/* HSR/PRP Supervision Frame data types.
+ * Field names as defined in the IEC:2010 standard for HSR.
+ */
+struct hsr_sup_tag {
+ __be16 path_and_HSR_ver;
+ __be16 sequence_nr;
+ struct hsr_sup_tlv tlv;
+} __packed;
+
+struct hsr_sup_payload {
+ unsigned char macaddress_A[ETH_ALEN];
+} __packed;
+
+static inline void set_hsr_stag_path(struct hsr_sup_tag *hst, u16 path)
+{
+ set_hsr_tag_path((struct hsr_tag *)hst, path);
+}
+
+static inline void set_hsr_stag_HSR_ver(struct hsr_sup_tag *hst, u16 HSR_ver)
+{
+ set_hsr_tag_LSDU_size((struct hsr_tag *)hst, HSR_ver);
+}
+
+struct hsrv0_ethhdr_sp {
+ struct ethhdr ethhdr;
+ struct hsr_sup_tag hsr_sup;
+} __packed;
+
+struct hsrv1_ethhdr_sp {
+ struct ethhdr ethhdr;
+ struct hsr_tag hsr;
+ struct hsr_sup_tag hsr_sup;
+} __packed;
+
+/* PRP Redunancy Control Trailor (RCT).
+ * As defined in IEC-62439-4:2012, the PRP RCT is really { sequence Nr,
+ * Lan indentifier (LanId), LSDU_size and PRP_suffix = 0x88FB }.
+ *
+ * Field names as defined in the IEC:2012 standard for PRP.
+ */
+struct prp_rct {
+ __be16 sequence_nr;
+ __be16 lan_id_and_LSDU_size;
+ __be16 PRP_suffix;
+} __packed;
+
+static inline u16 get_prp_LSDU_size(struct prp_rct *rct)
+{
+ return ntohs(rct->lan_id_and_LSDU_size) & 0x0FFF;
+}
+
+static inline void set_prp_lan_id(struct prp_rct *rct, u16 lan_id)
+{
+ rct->lan_id_and_LSDU_size = htons((ntohs(rct->lan_id_and_LSDU_size) &
+ 0x0FFF) | (lan_id << 12));
+}
+static inline void set_prp_LSDU_size(struct prp_rct *rct, u16 LSDU_size)
+{
+ rct->lan_id_and_LSDU_size = htons((ntohs(rct->lan_id_and_LSDU_size) &
+ 0xF000) | (LSDU_size & 0x0FFF));
+}
+
+struct hsr_port {
+ struct list_head port_list;
+ struct net_device *dev;
+ struct hsr_priv *hsr;
+ enum hsr_port_type type;
+ struct rcu_head rcu;
+ unsigned char original_macaddress[ETH_ALEN];
+};
+
+struct hsr_frame_info;
+struct hsr_node;
+
+struct hsr_proto_ops {
+ /* format and send supervision frame */
+ void (*send_sv_frame)(struct hsr_port *port, unsigned long *interval,
+ const unsigned char addr[ETH_ALEN]);
+ void (*handle_san_frame)(bool san, enum hsr_port_type port,
+ struct hsr_node *node);
+ bool (*drop_frame)(struct hsr_frame_info *frame, struct hsr_port *port);
+ struct sk_buff * (*get_untagged_frame)(struct hsr_frame_info *frame,
+ struct hsr_port *port);
+ struct sk_buff * (*create_tagged_frame)(struct hsr_frame_info *frame,
+ struct hsr_port *port);
+ int (*fill_frame_info)(__be16 proto, struct sk_buff *skb,
+ struct hsr_frame_info *frame);
+ bool (*invalid_dan_ingress_frame)(__be16 protocol);
+ void (*update_san_info)(struct hsr_node *node, bool is_sup);
+ int (*register_frame_out)(struct hsr_port *port,
+ struct hsr_frame_info *frame);
+};
+
+struct hsr_self_node {
+ unsigned char macaddress_A[ETH_ALEN];
+ unsigned char macaddress_B[ETH_ALEN];
+ struct rcu_head rcu_head;
+};
+
+struct hsr_priv {
+ struct rcu_head rcu_head;
+ struct list_head ports;
+ struct list_head node_db; /* Known HSR nodes */
+ struct list_head proxy_node_db; /* RedBox HSR proxy nodes */
+ struct hsr_self_node __rcu *self_node; /* MACs of slaves */
+ struct timer_list announce_timer; /* Supervision frame dispatch */
+ struct timer_list announce_proxy_timer;
+ struct timer_list prune_timer;
+ struct timer_list prune_proxy_timer;
+ int announce_count;
+ u16 sequence_nr;
+ u16 sup_sequence_nr; /* For HSRv1 separate seq_nr for supervision */
+ enum hsr_version prot_version; /* Indicate if HSRv0, HSRv1 or PRPv1 */
+ spinlock_t seqnr_lock; /* locking for sequence_nr */
+ spinlock_t list_lock; /* locking for node list */
+ struct hsr_proto_ops *proto_ops;
+#define PRP_LAN_ID 0x5 /* 0x1010 for A and 0x1011 for B. Bit 0 is set
+ * based on SLAVE_A or SLAVE_B
+ */
+ u8 net_id; /* for PRP, it occupies most significant 3 bits
+ * of lan_id
+ */
+ bool fwd_offloaded; /* Forwarding offloaded to HW */
+ bool redbox; /* Device supports HSR RedBox */
+ unsigned char macaddress_redbox[ETH_ALEN];
+ unsigned char sup_multicast_addr[ETH_ALEN] __aligned(sizeof(u16));
+ /* Align to u16 boundary to avoid unaligned access
+ * in ether_addr_equal
+ */
+#ifdef CONFIG_DEBUG_FS
+ struct dentry *node_tbl_root;
+#endif
+};
+
+#define hsr_for_each_port(hsr, port) \
+ list_for_each_entry_rcu((port), &(hsr)->ports, port_list)
+
+#define hsr_for_each_port_rtnl(hsr, port) \
+ list_for_each_entry_rcu((port), &(hsr)->ports, port_list, lockdep_rtnl_is_held())
+
+struct hsr_port *hsr_port_get_hsr(struct hsr_priv *hsr, enum hsr_port_type pt);
+
+/* Caller must ensure skb is a valid HSR frame */
+static inline u16 hsr_get_skb_sequence_nr(struct sk_buff *skb)
+{
+ struct hsr_ethhdr *hsr_ethhdr;
+
+ hsr_ethhdr = (struct hsr_ethhdr *)skb_mac_header(skb);
+ return ntohs(hsr_ethhdr->hsr_tag.sequence_nr);
+}
+
+static inline struct prp_rct *skb_get_PRP_rct(struct sk_buff *skb)
+{
+ unsigned char *tail = skb_tail_pointer(skb) - HSR_HLEN;
+
+ struct prp_rct *rct = (struct prp_rct *)tail;
+
+ if (rct->PRP_suffix == htons(ETH_P_PRP))
+ return rct;
+
+ return NULL;
+}
+
+/* Assume caller has confirmed this skb is PRP suffixed */
+static inline u16 prp_get_skb_sequence_nr(struct prp_rct *rct)
+{
+ return ntohs(rct->sequence_nr);
+}
+
+/* assume there is a valid rct */
+static inline bool prp_check_lsdu_size(struct sk_buff *skb,
+ struct prp_rct *rct,
+ bool is_sup)
+{
+ struct ethhdr *ethhdr;
+ int expected_lsdu_size;
+
+ if (is_sup) {
+ expected_lsdu_size = HSR_V1_SUP_LSDUSIZE;
+ } else {
+ ethhdr = (struct ethhdr *)skb_mac_header(skb);
+ expected_lsdu_size = skb->len - 14;
+ if (ethhdr->h_proto == htons(ETH_P_8021Q))
+ expected_lsdu_size -= 4;
+ }
+
+ return (expected_lsdu_size == get_prp_LSDU_size(rct));
+}
+
+#if IS_ENABLED(CONFIG_DEBUG_FS)
+void hsr_debugfs_rename(struct net_device *dev);
+void hsr_debugfs_init(struct hsr_priv *priv, struct net_device *hsr_dev);
+void hsr_debugfs_term(struct hsr_priv *priv);
+void hsr_debugfs_create_root(void);
+void hsr_debugfs_remove_root(void);
+#else
+static inline void hsr_debugfs_rename(struct net_device *dev)
+{
+}
+static inline void hsr_debugfs_init(struct hsr_priv *priv,
+ struct net_device *hsr_dev)
+{}
+static inline void hsr_debugfs_term(struct hsr_priv *priv)
+{}
+static inline void hsr_debugfs_create_root(void)
+{}
+static inline void hsr_debugfs_remove_root(void)
+{}
+#endif
+
+#endif /* __HSR_PRIVATE_H */
diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c
new file mode 100644
index 000000000000..db0b0af7a692
--- /dev/null
+++ b/net/hsr/hsr_netlink.c
@@ -0,0 +1,594 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright 2011-2014 Autronica Fire and Security AS
+ *
+ * Author(s):
+ * 2011-2014 Arvid Brodin, arvid.brodin@alten.se
+ *
+ * Routines for handling Netlink messages for HSR and PRP.
+ */
+
+#include "hsr_netlink.h"
+#include <linux/kernel.h>
+#include <net/rtnetlink.h>
+#include <net/genetlink.h>
+#include "hsr_main.h"
+#include "hsr_device.h"
+#include "hsr_framereg.h"
+
+static const struct nla_policy hsr_policy[IFLA_HSR_MAX + 1] = {
+ [IFLA_HSR_SLAVE1] = { .type = NLA_U32 },
+ [IFLA_HSR_SLAVE2] = { .type = NLA_U32 },
+ [IFLA_HSR_MULTICAST_SPEC] = { .type = NLA_U8 },
+ [IFLA_HSR_VERSION] = { .type = NLA_U8 },
+ [IFLA_HSR_SUPERVISION_ADDR] = { .len = ETH_ALEN },
+ [IFLA_HSR_SEQ_NR] = { .type = NLA_U16 },
+ [IFLA_HSR_PROTOCOL] = { .type = NLA_U8 },
+ [IFLA_HSR_INTERLINK] = { .type = NLA_U32 },
+};
+
+/* Here, it seems a netdevice has already been allocated for us, and the
+ * hsr_dev_setup routine has been executed. Nice!
+ */
+static int hsr_newlink(struct net_device *dev,
+ struct rtnl_newlink_params *params,
+ struct netlink_ext_ack *extack)
+{
+ struct net *link_net = rtnl_newlink_link_net(params);
+ struct net_device *link[2], *interlink = NULL;
+ struct nlattr **data = params->data;
+ enum hsr_version proto_version;
+ unsigned char multicast_spec;
+ u8 proto = HSR_PROTOCOL_HSR;
+
+ if (!net_eq(link_net, dev_net(dev))) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "HSR slaves/interlink must be on the same net namespace than HSR link");
+ return -EINVAL;
+ }
+
+ if (!data) {
+ NL_SET_ERR_MSG_MOD(extack, "No slave devices specified");
+ return -EINVAL;
+ }
+ if (!data[IFLA_HSR_SLAVE1]) {
+ NL_SET_ERR_MSG_MOD(extack, "Slave1 device not specified");
+ return -EINVAL;
+ }
+ link[0] = __dev_get_by_index(link_net,
+ nla_get_u32(data[IFLA_HSR_SLAVE1]));
+ if (!link[0]) {
+ NL_SET_ERR_MSG_MOD(extack, "Slave1 does not exist");
+ return -EINVAL;
+ }
+ if (!data[IFLA_HSR_SLAVE2]) {
+ NL_SET_ERR_MSG_MOD(extack, "Slave2 device not specified");
+ return -EINVAL;
+ }
+ link[1] = __dev_get_by_index(link_net,
+ nla_get_u32(data[IFLA_HSR_SLAVE2]));
+ if (!link[1]) {
+ NL_SET_ERR_MSG_MOD(extack, "Slave2 does not exist");
+ return -EINVAL;
+ }
+
+ if (link[0] == link[1]) {
+ NL_SET_ERR_MSG_MOD(extack, "Slave1 and Slave2 are same");
+ return -EINVAL;
+ }
+
+ if (data[IFLA_HSR_INTERLINK])
+ interlink = __dev_get_by_index(link_net,
+ nla_get_u32(data[IFLA_HSR_INTERLINK]));
+
+ if (interlink && interlink == link[0]) {
+ NL_SET_ERR_MSG_MOD(extack, "Interlink and Slave1 are the same");
+ return -EINVAL;
+ }
+
+ if (interlink && interlink == link[1]) {
+ NL_SET_ERR_MSG_MOD(extack, "Interlink and Slave2 are the same");
+ return -EINVAL;
+ }
+
+ multicast_spec = nla_get_u8_default(data[IFLA_HSR_MULTICAST_SPEC], 0);
+
+ if (data[IFLA_HSR_PROTOCOL])
+ proto = nla_get_u8(data[IFLA_HSR_PROTOCOL]);
+
+ if (proto >= HSR_PROTOCOL_MAX) {
+ NL_SET_ERR_MSG_MOD(extack, "Unsupported protocol");
+ return -EINVAL;
+ }
+
+ if (!data[IFLA_HSR_VERSION]) {
+ proto_version = HSR_V0;
+ } else {
+ if (proto == HSR_PROTOCOL_PRP) {
+ NL_SET_ERR_MSG_MOD(extack, "PRP version unsupported");
+ return -EINVAL;
+ }
+
+ proto_version = nla_get_u8(data[IFLA_HSR_VERSION]);
+ if (proto_version > HSR_V1) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Only HSR version 0/1 supported");
+ return -EINVAL;
+ }
+ }
+
+ if (proto == HSR_PROTOCOL_PRP) {
+ proto_version = PRP_V1;
+ if (interlink) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Interlink only works with HSR");
+ return -EINVAL;
+ }
+ }
+
+ return hsr_dev_finalize(dev, link, interlink, multicast_spec,
+ proto_version, extack);
+}
+
+static void hsr_dellink(struct net_device *dev, struct list_head *head)
+{
+ struct hsr_priv *hsr = netdev_priv(dev);
+
+ timer_delete_sync(&hsr->prune_timer);
+ timer_delete_sync(&hsr->prune_proxy_timer);
+ timer_delete_sync(&hsr->announce_timer);
+ timer_delete_sync(&hsr->announce_proxy_timer);
+
+ hsr_debugfs_term(hsr);
+ hsr_del_ports(hsr);
+
+ hsr_del_self_node(hsr);
+ hsr_del_nodes(&hsr->node_db);
+ hsr_del_nodes(&hsr->proxy_node_db);
+
+ unregister_netdevice_queue(dev, head);
+}
+
+static int hsr_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{
+ struct hsr_priv *hsr = netdev_priv(dev);
+ u8 proto = HSR_PROTOCOL_HSR;
+ struct hsr_port *port;
+
+ port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_A);
+ if (port) {
+ if (nla_put_u32(skb, IFLA_HSR_SLAVE1, port->dev->ifindex))
+ goto nla_put_failure;
+ }
+
+ port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_B);
+ if (port) {
+ if (nla_put_u32(skb, IFLA_HSR_SLAVE2, port->dev->ifindex))
+ goto nla_put_failure;
+ }
+
+ port = hsr_port_get_hsr(hsr, HSR_PT_INTERLINK);
+ if (port) {
+ if (nla_put_u32(skb, IFLA_HSR_INTERLINK, port->dev->ifindex))
+ goto nla_put_failure;
+ }
+
+ if (nla_put(skb, IFLA_HSR_SUPERVISION_ADDR, ETH_ALEN,
+ hsr->sup_multicast_addr) ||
+ nla_put_u16(skb, IFLA_HSR_SEQ_NR, hsr->sequence_nr))
+ goto nla_put_failure;
+ if (hsr->prot_version == PRP_V1)
+ proto = HSR_PROTOCOL_PRP;
+ else if (nla_put_u8(skb, IFLA_HSR_VERSION, hsr->prot_version))
+ goto nla_put_failure;
+ if (nla_put_u8(skb, IFLA_HSR_PROTOCOL, proto))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+static struct rtnl_link_ops hsr_link_ops __read_mostly = {
+ .kind = "hsr",
+ .maxtype = IFLA_HSR_MAX,
+ .policy = hsr_policy,
+ .priv_size = sizeof(struct hsr_priv),
+ .setup = hsr_dev_setup,
+ .newlink = hsr_newlink,
+ .dellink = hsr_dellink,
+ .fill_info = hsr_fill_info,
+};
+
+/* attribute policy */
+static const struct nla_policy hsr_genl_policy[HSR_A_MAX + 1] = {
+ [HSR_A_NODE_ADDR] = { .len = ETH_ALEN },
+ [HSR_A_NODE_ADDR_B] = { .len = ETH_ALEN },
+ [HSR_A_IFINDEX] = { .type = NLA_U32 },
+ [HSR_A_IF1_AGE] = { .type = NLA_U32 },
+ [HSR_A_IF2_AGE] = { .type = NLA_U32 },
+ [HSR_A_IF1_SEQ] = { .type = NLA_U16 },
+ [HSR_A_IF2_SEQ] = { .type = NLA_U16 },
+};
+
+static struct genl_family hsr_genl_family;
+
+static const struct genl_multicast_group hsr_mcgrps[] = {
+ { .name = "hsr-network", },
+};
+
+/* This is called if for some node with MAC address addr, we only get frames
+ * over one of the slave interfaces. This would indicate an open network ring
+ * (i.e. a link has failed somewhere).
+ */
+void hsr_nl_ringerror(struct hsr_priv *hsr, unsigned char addr[ETH_ALEN],
+ struct hsr_port *port)
+{
+ struct sk_buff *skb;
+ void *msg_head;
+ struct hsr_port *master;
+ int res;
+
+ skb = genlmsg_new(NLMSG_GOODSIZE, GFP_ATOMIC);
+ if (!skb)
+ goto fail;
+
+ msg_head = genlmsg_put(skb, 0, 0, &hsr_genl_family, 0,
+ HSR_C_RING_ERROR);
+ if (!msg_head)
+ goto nla_put_failure;
+
+ res = nla_put(skb, HSR_A_NODE_ADDR, ETH_ALEN, addr);
+ if (res < 0)
+ goto nla_put_failure;
+
+ res = nla_put_u32(skb, HSR_A_IFINDEX, port->dev->ifindex);
+ if (res < 0)
+ goto nla_put_failure;
+
+ genlmsg_end(skb, msg_head);
+ genlmsg_multicast(&hsr_genl_family, skb, 0, 0, GFP_ATOMIC);
+
+ return;
+
+nla_put_failure:
+ kfree_skb(skb);
+
+fail:
+ rcu_read_lock();
+ master = hsr_port_get_hsr(hsr, HSR_PT_MASTER);
+ netdev_warn(master->dev, "Could not send HSR ring error message\n");
+ rcu_read_unlock();
+}
+
+/* This is called when we haven't heard from the node with MAC address addr for
+ * some time (just before the node is removed from the node table/list).
+ */
+void hsr_nl_nodedown(struct hsr_priv *hsr, unsigned char addr[ETH_ALEN])
+{
+ struct sk_buff *skb;
+ void *msg_head;
+ struct hsr_port *master;
+ int res;
+
+ skb = genlmsg_new(NLMSG_GOODSIZE, GFP_ATOMIC);
+ if (!skb)
+ goto fail;
+
+ msg_head = genlmsg_put(skb, 0, 0, &hsr_genl_family, 0, HSR_C_NODE_DOWN);
+ if (!msg_head)
+ goto nla_put_failure;
+
+ res = nla_put(skb, HSR_A_NODE_ADDR, ETH_ALEN, addr);
+ if (res < 0)
+ goto nla_put_failure;
+
+ genlmsg_end(skb, msg_head);
+ genlmsg_multicast(&hsr_genl_family, skb, 0, 0, GFP_ATOMIC);
+
+ return;
+
+nla_put_failure:
+ kfree_skb(skb);
+
+fail:
+ rcu_read_lock();
+ master = hsr_port_get_hsr(hsr, HSR_PT_MASTER);
+ netdev_warn(master->dev, "Could not send HSR node down\n");
+ rcu_read_unlock();
+}
+
+/* HSR_C_GET_NODE_STATUS lets userspace query the internal HSR node table
+ * about the status of a specific node in the network, defined by its MAC
+ * address.
+ *
+ * Input: hsr ifindex, node mac address
+ * Output: hsr ifindex, node mac address (copied from request),
+ * age of latest frame from node over slave 1, slave 2 [ms]
+ */
+static int hsr_get_node_status(struct sk_buff *skb_in, struct genl_info *info)
+{
+ /* For receiving */
+ struct nlattr *na;
+ struct net_device *hsr_dev;
+
+ /* For sending */
+ struct sk_buff *skb_out;
+ void *msg_head;
+ struct hsr_priv *hsr;
+ struct hsr_port *port;
+ unsigned char hsr_node_addr_b[ETH_ALEN];
+ int hsr_node_if1_age;
+ u16 hsr_node_if1_seq;
+ int hsr_node_if2_age;
+ u16 hsr_node_if2_seq;
+ int addr_b_ifindex;
+ int res;
+
+ if (!info)
+ goto invalid;
+
+ na = info->attrs[HSR_A_IFINDEX];
+ if (!na)
+ goto invalid;
+ na = info->attrs[HSR_A_NODE_ADDR];
+ if (!na)
+ goto invalid;
+
+ rcu_read_lock();
+ hsr_dev = dev_get_by_index_rcu(genl_info_net(info),
+ nla_get_u32(info->attrs[HSR_A_IFINDEX]));
+ if (!hsr_dev)
+ goto rcu_unlock;
+ if (!is_hsr_master(hsr_dev))
+ goto rcu_unlock;
+
+ /* Send reply */
+ skb_out = genlmsg_new(NLMSG_GOODSIZE, GFP_ATOMIC);
+ if (!skb_out) {
+ res = -ENOMEM;
+ goto fail;
+ }
+
+ msg_head = genlmsg_put(skb_out, NETLINK_CB(skb_in).portid,
+ info->snd_seq, &hsr_genl_family, 0,
+ HSR_C_SET_NODE_STATUS);
+ if (!msg_head) {
+ res = -ENOMEM;
+ goto nla_put_failure;
+ }
+
+ res = nla_put_u32(skb_out, HSR_A_IFINDEX, hsr_dev->ifindex);
+ if (res < 0)
+ goto nla_put_failure;
+
+ hsr = netdev_priv(hsr_dev);
+ res = hsr_get_node_data(hsr,
+ (unsigned char *)
+ nla_data(info->attrs[HSR_A_NODE_ADDR]),
+ hsr_node_addr_b,
+ &addr_b_ifindex,
+ &hsr_node_if1_age,
+ &hsr_node_if1_seq,
+ &hsr_node_if2_age,
+ &hsr_node_if2_seq);
+ if (res < 0)
+ goto nla_put_failure;
+
+ res = nla_put(skb_out, HSR_A_NODE_ADDR, ETH_ALEN,
+ nla_data(info->attrs[HSR_A_NODE_ADDR]));
+ if (res < 0)
+ goto nla_put_failure;
+
+ if (addr_b_ifindex > -1) {
+ res = nla_put(skb_out, HSR_A_NODE_ADDR_B, ETH_ALEN,
+ hsr_node_addr_b);
+ if (res < 0)
+ goto nla_put_failure;
+
+ res = nla_put_u32(skb_out, HSR_A_ADDR_B_IFINDEX,
+ addr_b_ifindex);
+ if (res < 0)
+ goto nla_put_failure;
+ }
+
+ res = nla_put_u32(skb_out, HSR_A_IF1_AGE, hsr_node_if1_age);
+ if (res < 0)
+ goto nla_put_failure;
+ res = nla_put_u16(skb_out, HSR_A_IF1_SEQ, hsr_node_if1_seq);
+ if (res < 0)
+ goto nla_put_failure;
+ port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_A);
+ if (port)
+ res = nla_put_u32(skb_out, HSR_A_IF1_IFINDEX,
+ port->dev->ifindex);
+ if (res < 0)
+ goto nla_put_failure;
+
+ res = nla_put_u32(skb_out, HSR_A_IF2_AGE, hsr_node_if2_age);
+ if (res < 0)
+ goto nla_put_failure;
+ res = nla_put_u16(skb_out, HSR_A_IF2_SEQ, hsr_node_if2_seq);
+ if (res < 0)
+ goto nla_put_failure;
+ port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_B);
+ if (port)
+ res = nla_put_u32(skb_out, HSR_A_IF2_IFINDEX,
+ port->dev->ifindex);
+ if (res < 0)
+ goto nla_put_failure;
+
+ rcu_read_unlock();
+
+ genlmsg_end(skb_out, msg_head);
+ genlmsg_unicast(genl_info_net(info), skb_out, info->snd_portid);
+
+ return 0;
+
+rcu_unlock:
+ rcu_read_unlock();
+invalid:
+ netlink_ack(skb_in, nlmsg_hdr(skb_in), -EINVAL, NULL);
+ return 0;
+
+nla_put_failure:
+ kfree_skb(skb_out);
+ /* Fall through */
+
+fail:
+ rcu_read_unlock();
+ return res;
+}
+
+/* Get a list of MacAddressA of all nodes known to this node (including self).
+ */
+static int hsr_get_node_list(struct sk_buff *skb_in, struct genl_info *info)
+{
+ unsigned char addr[ETH_ALEN];
+ struct net_device *hsr_dev;
+ struct sk_buff *skb_out;
+ struct hsr_priv *hsr;
+ bool restart = false;
+ struct nlattr *na;
+ void *pos = NULL;
+ void *msg_head;
+ int res;
+
+ if (!info)
+ goto invalid;
+
+ na = info->attrs[HSR_A_IFINDEX];
+ if (!na)
+ goto invalid;
+
+ rcu_read_lock();
+ hsr_dev = dev_get_by_index_rcu(genl_info_net(info),
+ nla_get_u32(info->attrs[HSR_A_IFINDEX]));
+ if (!hsr_dev)
+ goto rcu_unlock;
+ if (!is_hsr_master(hsr_dev))
+ goto rcu_unlock;
+
+restart:
+ /* Send reply */
+ skb_out = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_ATOMIC);
+ if (!skb_out) {
+ res = -ENOMEM;
+ goto fail;
+ }
+
+ msg_head = genlmsg_put(skb_out, NETLINK_CB(skb_in).portid,
+ info->snd_seq, &hsr_genl_family, 0,
+ HSR_C_SET_NODE_LIST);
+ if (!msg_head) {
+ res = -ENOMEM;
+ goto nla_put_failure;
+ }
+
+ if (!restart) {
+ res = nla_put_u32(skb_out, HSR_A_IFINDEX, hsr_dev->ifindex);
+ if (res < 0)
+ goto nla_put_failure;
+ }
+
+ hsr = netdev_priv(hsr_dev);
+
+ if (!pos)
+ pos = hsr_get_next_node(hsr, NULL, addr);
+ while (pos) {
+ res = nla_put(skb_out, HSR_A_NODE_ADDR, ETH_ALEN, addr);
+ if (res < 0) {
+ if (res == -EMSGSIZE) {
+ genlmsg_end(skb_out, msg_head);
+ genlmsg_unicast(genl_info_net(info), skb_out,
+ info->snd_portid);
+ restart = true;
+ goto restart;
+ }
+ goto nla_put_failure;
+ }
+ pos = hsr_get_next_node(hsr, pos, addr);
+ }
+ rcu_read_unlock();
+
+ genlmsg_end(skb_out, msg_head);
+ genlmsg_unicast(genl_info_net(info), skb_out, info->snd_portid);
+
+ return 0;
+
+rcu_unlock:
+ rcu_read_unlock();
+invalid:
+ netlink_ack(skb_in, nlmsg_hdr(skb_in), -EINVAL, NULL);
+ return 0;
+
+nla_put_failure:
+ nlmsg_free(skb_out);
+ /* Fall through */
+
+fail:
+ rcu_read_unlock();
+ return res;
+}
+
+static const struct genl_small_ops hsr_ops[] = {
+ {
+ .cmd = HSR_C_GET_NODE_STATUS,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .flags = 0,
+ .doit = hsr_get_node_status,
+ .dumpit = NULL,
+ },
+ {
+ .cmd = HSR_C_GET_NODE_LIST,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .flags = 0,
+ .doit = hsr_get_node_list,
+ .dumpit = NULL,
+ },
+};
+
+static struct genl_family hsr_genl_family __ro_after_init = {
+ .hdrsize = 0,
+ .name = "HSR",
+ .version = 1,
+ .maxattr = HSR_A_MAX,
+ .policy = hsr_genl_policy,
+ .netnsok = true,
+ .module = THIS_MODULE,
+ .small_ops = hsr_ops,
+ .n_small_ops = ARRAY_SIZE(hsr_ops),
+ .resv_start_op = HSR_C_SET_NODE_LIST + 1,
+ .mcgrps = hsr_mcgrps,
+ .n_mcgrps = ARRAY_SIZE(hsr_mcgrps),
+};
+
+int __init hsr_netlink_init(void)
+{
+ int rc;
+
+ rc = rtnl_link_register(&hsr_link_ops);
+ if (rc)
+ goto fail_rtnl_link_register;
+
+ rc = genl_register_family(&hsr_genl_family);
+ if (rc)
+ goto fail_genl_register_family;
+
+ hsr_debugfs_create_root();
+ return 0;
+
+fail_genl_register_family:
+ rtnl_link_unregister(&hsr_link_ops);
+fail_rtnl_link_register:
+
+ return rc;
+}
+
+void __exit hsr_netlink_exit(void)
+{
+ genl_unregister_family(&hsr_genl_family);
+ rtnl_link_unregister(&hsr_link_ops);
+}
+
+MODULE_ALIAS_RTNL_LINK("hsr");
diff --git a/net/hsr/hsr_netlink.h b/net/hsr/hsr_netlink.h
new file mode 100644
index 000000000000..8c99e64e1cea
--- /dev/null
+++ b/net/hsr/hsr_netlink.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright 2011-2014 Autronica Fire and Security AS
+ *
+ * Author(s):
+ * 2011-2014 Arvid Brodin, arvid.brodin@alten.se
+ *
+ * include file for HSR and PRP.
+ */
+
+#ifndef __HSR_NETLINK_H
+#define __HSR_NETLINK_H
+
+#include <linux/if_ether.h>
+#include <linux/module.h>
+#include <uapi/linux/hsr_netlink.h>
+
+struct hsr_priv;
+struct hsr_port;
+
+int __init hsr_netlink_init(void);
+void __exit hsr_netlink_exit(void);
+
+void hsr_nl_ringerror(struct hsr_priv *hsr, unsigned char addr[ETH_ALEN],
+ struct hsr_port *port);
+void hsr_nl_nodedown(struct hsr_priv *hsr, unsigned char addr[ETH_ALEN]);
+
+#endif /* __HSR_NETLINK_H */
diff --git a/net/hsr/hsr_slave.c b/net/hsr/hsr_slave.c
new file mode 100644
index 000000000000..afe06ba00ea4
--- /dev/null
+++ b/net/hsr/hsr_slave.c
@@ -0,0 +1,250 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright 2011-2014 Autronica Fire and Security AS
+ *
+ * Author(s):
+ * 2011-2014 Arvid Brodin, arvid.brodin@alten.se
+ *
+ * Frame handler other utility functions for HSR and PRP.
+ */
+
+#include "hsr_slave.h"
+#include <linux/etherdevice.h>
+#include <linux/if_arp.h>
+#include <linux/if_vlan.h>
+#include "hsr_main.h"
+#include "hsr_device.h"
+#include "hsr_forward.h"
+#include "hsr_framereg.h"
+
+bool hsr_invalid_dan_ingress_frame(__be16 protocol)
+{
+ return (protocol != htons(ETH_P_PRP) && protocol != htons(ETH_P_HSR));
+}
+
+static rx_handler_result_t hsr_handle_frame(struct sk_buff **pskb)
+{
+ struct sk_buff *skb = *pskb;
+ struct hsr_port *port;
+ struct hsr_priv *hsr;
+ __be16 protocol;
+
+ /* Packets from dev_loopback_xmit() do not have L2 header, bail out */
+ if (unlikely(skb->pkt_type == PACKET_LOOPBACK))
+ return RX_HANDLER_PASS;
+
+ if (!skb_mac_header_was_set(skb)) {
+ WARN_ONCE(1, "%s: skb invalid", __func__);
+ return RX_HANDLER_PASS;
+ }
+
+ port = hsr_port_get_rcu(skb->dev);
+ if (!port)
+ goto finish_pass;
+ hsr = port->hsr;
+
+ if (hsr_addr_is_self(port->hsr, eth_hdr(skb)->h_source)) {
+ /* Directly kill frames sent by ourselves */
+ kfree_skb(skb);
+ goto finish_consume;
+ }
+
+ /* For HSR, only tagged frames are expected (unless the device offloads
+ * HSR tag removal), but for PRP there could be non tagged frames as
+ * well from Single attached nodes (SANs).
+ */
+ protocol = eth_hdr(skb)->h_proto;
+
+ if (!(port->dev->features & NETIF_F_HW_HSR_TAG_RM) &&
+ port->type != HSR_PT_INTERLINK &&
+ hsr->proto_ops->invalid_dan_ingress_frame &&
+ hsr->proto_ops->invalid_dan_ingress_frame(protocol))
+ goto finish_pass;
+
+ skb_push(skb, ETH_HLEN);
+ skb_reset_mac_header(skb);
+ if ((!hsr->prot_version && protocol == htons(ETH_P_PRP)) ||
+ protocol == htons(ETH_P_HSR)) {
+ if (!pskb_may_pull(skb, ETH_HLEN + HSR_HLEN)) {
+ kfree_skb(skb);
+ goto finish_consume;
+ }
+
+ skb_set_network_header(skb, ETH_HLEN + HSR_HLEN);
+ }
+ skb_reset_mac_len(skb);
+
+ /* Only the frames received over the interlink port will assign a
+ * sequence number and require synchronisation vs other sender.
+ */
+ if (port->type == HSR_PT_INTERLINK) {
+ spin_lock_bh(&hsr->seqnr_lock);
+ hsr_forward_skb(skb, port);
+ spin_unlock_bh(&hsr->seqnr_lock);
+ } else {
+ hsr_forward_skb(skb, port);
+ }
+
+finish_consume:
+ return RX_HANDLER_CONSUMED;
+
+finish_pass:
+ return RX_HANDLER_PASS;
+}
+
+bool hsr_port_exists(const struct net_device *dev)
+{
+ return rcu_access_pointer(dev->rx_handler) == hsr_handle_frame;
+}
+
+static int hsr_check_dev_ok(struct net_device *dev,
+ struct netlink_ext_ack *extack)
+{
+ /* Don't allow HSR on non-ethernet like devices */
+ if ((dev->flags & IFF_LOOPBACK) || dev->type != ARPHRD_ETHER ||
+ dev->addr_len != ETH_ALEN) {
+ NL_SET_ERR_MSG_MOD(extack, "Cannot use loopback or non-ethernet device as HSR slave.");
+ return -EINVAL;
+ }
+
+ /* Don't allow enslaving hsr devices */
+ if (is_hsr_master(dev)) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Cannot create trees of HSR devices.");
+ return -EINVAL;
+ }
+
+ if (hsr_port_exists(dev)) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "This device is already a HSR slave.");
+ return -EINVAL;
+ }
+
+ if (is_vlan_dev(dev)) {
+ NL_SET_ERR_MSG_MOD(extack, "HSR on top of VLAN is not yet supported in this driver.");
+ return -EINVAL;
+ }
+
+ if (dev->priv_flags & IFF_DONT_BRIDGE) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "This device does not support bridging.");
+ return -EOPNOTSUPP;
+ }
+
+ /* HSR over bonded devices has not been tested, but I'm not sure it
+ * won't work...
+ */
+
+ return 0;
+}
+
+/* Setup device to be added to the HSR bridge. */
+static int hsr_portdev_setup(struct hsr_priv *hsr, struct net_device *dev,
+ struct hsr_port *port,
+ struct netlink_ext_ack *extack)
+
+{
+ struct netdev_lag_upper_info lag_upper_info;
+ struct net_device *hsr_dev;
+ struct hsr_port *master;
+ int res;
+
+ /* Don't use promiscuous mode for offload since L2 frame forward
+ * happens at the offloaded hardware.
+ */
+ if (!port->hsr->fwd_offloaded) {
+ res = dev_set_promiscuity(dev, 1);
+ if (res)
+ return res;
+ }
+
+ master = hsr_port_get_hsr(hsr, HSR_PT_MASTER);
+ hsr_dev = master->dev;
+
+ lag_upper_info.tx_type = NETDEV_LAG_TX_TYPE_BROADCAST;
+ lag_upper_info.hash_type = NETDEV_LAG_HASH_UNKNOWN;
+ res = netdev_master_upper_dev_link(dev, hsr_dev, NULL, &lag_upper_info, extack);
+ if (res)
+ goto fail_upper_dev_link;
+
+ res = netdev_rx_handler_register(dev, hsr_handle_frame, port);
+ if (res)
+ goto fail_rx_handler;
+ dev_disable_lro(dev);
+
+ return 0;
+
+fail_rx_handler:
+ netdev_upper_dev_unlink(dev, hsr_dev);
+fail_upper_dev_link:
+ if (!port->hsr->fwd_offloaded)
+ dev_set_promiscuity(dev, -1);
+
+ return res;
+}
+
+int hsr_add_port(struct hsr_priv *hsr, struct net_device *dev,
+ enum hsr_port_type type, struct netlink_ext_ack *extack)
+{
+ struct hsr_port *port, *master;
+ int res;
+
+ if (type != HSR_PT_MASTER) {
+ res = hsr_check_dev_ok(dev, extack);
+ if (res)
+ return res;
+ }
+
+ port = hsr_port_get_hsr(hsr, type);
+ if (port)
+ return -EBUSY; /* This port already exists */
+
+ port = kzalloc(sizeof(*port), GFP_KERNEL);
+ if (!port)
+ return -ENOMEM;
+
+ port->hsr = hsr;
+ port->dev = dev;
+ port->type = type;
+ ether_addr_copy(port->original_macaddress, dev->dev_addr);
+
+ list_add_tail_rcu(&port->port_list, &hsr->ports);
+
+ if (type != HSR_PT_MASTER) {
+ res = hsr_portdev_setup(hsr, dev, port, extack);
+ if (res)
+ goto fail_dev_setup;
+ }
+
+ master = hsr_port_get_hsr(hsr, HSR_PT_MASTER);
+ netdev_update_features(master->dev);
+ dev_set_mtu(master->dev, hsr_get_max_mtu(hsr));
+
+ return 0;
+
+fail_dev_setup:
+ list_del_rcu(&port->port_list);
+ kfree_rcu(port, rcu);
+ return res;
+}
+
+void hsr_del_port(struct hsr_port *port)
+{
+ struct hsr_priv *hsr;
+ struct hsr_port *master;
+
+ hsr = port->hsr;
+ master = hsr_port_get_hsr(hsr, HSR_PT_MASTER);
+ list_del_rcu(&port->port_list);
+
+ if (port != master) {
+ netdev_update_features(master->dev);
+ dev_set_mtu(master->dev, hsr_get_max_mtu(hsr));
+ netdev_rx_handler_unregister(port->dev);
+ if (!port->hsr->fwd_offloaded)
+ dev_set_promiscuity(port->dev, -1);
+ netdev_upper_dev_unlink(port->dev, master->dev);
+ eth_hw_addr_set(port->dev, port->original_macaddress);
+ }
+
+ kfree_rcu(port, rcu);
+}
diff --git a/net/hsr/hsr_slave.h b/net/hsr/hsr_slave.h
new file mode 100644
index 000000000000..edc4612bb009
--- /dev/null
+++ b/net/hsr/hsr_slave.h
@@ -0,0 +1,37 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright 2011-2014 Autronica Fire and Security AS
+ *
+ * 2011-2014 Arvid Brodin, arvid.brodin@alten.se
+ *
+ * include file for HSR and PRP.
+ */
+
+#ifndef __HSR_SLAVE_H
+#define __HSR_SLAVE_H
+
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+#include "hsr_main.h"
+
+int hsr_add_port(struct hsr_priv *hsr, struct net_device *dev,
+ enum hsr_port_type pt, struct netlink_ext_ack *extack);
+void hsr_del_port(struct hsr_port *port);
+bool hsr_port_exists(const struct net_device *dev);
+
+static inline struct hsr_port *hsr_port_get_rtnl(const struct net_device *dev)
+{
+ ASSERT_RTNL();
+ return hsr_port_exists(dev) ?
+ rtnl_dereference(dev->rx_handler_data) : NULL;
+}
+
+static inline struct hsr_port *hsr_port_get_rcu(const struct net_device *dev)
+{
+ return hsr_port_exists(dev) ?
+ rcu_dereference(dev->rx_handler_data) : NULL;
+}
+
+bool hsr_invalid_dan_ingress_frame(__be16 protocol);
+
+#endif /* __HSR_SLAVE_H */
diff --git a/net/hsr/prp_dup_discard_test.c b/net/hsr/prp_dup_discard_test.c
new file mode 100644
index 000000000000..e86b7b633ae8
--- /dev/null
+++ b/net/hsr/prp_dup_discard_test.c
@@ -0,0 +1,212 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <kunit/test.h>
+
+#include "hsr_main.h"
+#include "hsr_framereg.h"
+
+struct prp_test_data {
+ struct hsr_port port;
+ struct hsr_port port_rcv;
+ struct hsr_frame_info frame;
+ struct hsr_node node;
+};
+
+static struct prp_test_data *build_prp_test_data(struct kunit *test)
+{
+ struct prp_test_data *data = kunit_kzalloc(test,
+ sizeof(struct prp_test_data), GFP_USER);
+ KUNIT_EXPECT_NOT_ERR_OR_NULL(test, data);
+
+ data->frame.node_src = &data->node;
+ data->frame.port_rcv = &data->port_rcv;
+ data->port_rcv.type = HSR_PT_SLAVE_A;
+ data->node.seq_start[HSR_PT_SLAVE_A] = 1;
+ data->node.seq_expected[HSR_PT_SLAVE_A] = 1;
+ data->node.seq_start[HSR_PT_SLAVE_B] = 1;
+ data->node.seq_expected[HSR_PT_SLAVE_B] = 1;
+ data->node.seq_out[HSR_PT_MASTER] = 0;
+ data->node.time_out[HSR_PT_MASTER] = jiffies;
+ data->port.type = HSR_PT_MASTER;
+
+ return data;
+}
+
+static void check_prp_counters(struct kunit *test,
+ struct prp_test_data *data,
+ u16 seq_start_a, u16 seq_expected_a,
+ u16 seq_start_b, u16 seq_expected_b)
+{
+ KUNIT_EXPECT_EQ(test, data->node.seq_start[HSR_PT_SLAVE_A],
+ seq_start_a);
+ KUNIT_EXPECT_EQ(test, data->node.seq_start[HSR_PT_SLAVE_B],
+ seq_start_b);
+ KUNIT_EXPECT_EQ(test, data->node.seq_expected[HSR_PT_SLAVE_A],
+ seq_expected_a);
+ KUNIT_EXPECT_EQ(test, data->node.seq_expected[HSR_PT_SLAVE_B],
+ seq_expected_b);
+}
+
+static void prp_dup_discard_forward(struct kunit *test)
+{
+ /* Normal situation, both LANs in sync. Next frame is forwarded */
+ struct prp_test_data *data = build_prp_test_data(test);
+
+ data->frame.sequence_nr = 2;
+ KUNIT_EXPECT_EQ(test, 0,
+ prp_register_frame_out(&data->port, &data->frame));
+ KUNIT_EXPECT_EQ(test, data->frame.sequence_nr,
+ data->node.seq_out[HSR_PT_MASTER]);
+ KUNIT_EXPECT_EQ(test, jiffies, data->node.time_out[HSR_PT_MASTER]);
+ check_prp_counters(test, data, data->frame.sequence_nr,
+ data->frame.sequence_nr + 1, 1, 1);
+}
+
+static void prp_dup_discard_inside_dropwindow(struct kunit *test)
+{
+ /* Normal situation, other LAN ahead by one. Frame is dropped */
+ struct prp_test_data *data = build_prp_test_data(test);
+ unsigned long time = jiffies - 10;
+
+ data->frame.sequence_nr = 1;
+ data->node.seq_expected[HSR_PT_SLAVE_B] = 3;
+ data->node.seq_out[HSR_PT_MASTER] = 2;
+ data->node.time_out[HSR_PT_MASTER] = time;
+
+ KUNIT_EXPECT_EQ(test, 1,
+ prp_register_frame_out(&data->port, &data->frame));
+ KUNIT_EXPECT_EQ(test, 2, data->node.seq_out[HSR_PT_MASTER]);
+ KUNIT_EXPECT_EQ(test, time, data->node.time_out[HSR_PT_MASTER]);
+ check_prp_counters(test, data, 2, 2, 2, 3);
+}
+
+static void prp_dup_discard_node_timeout(struct kunit *test)
+{
+ /* Timeout situation, node hasn't sent anything for a while */
+ struct prp_test_data *data = build_prp_test_data(test);
+
+ data->frame.sequence_nr = 7;
+ data->node.seq_start[HSR_PT_SLAVE_A] = 1234;
+ data->node.seq_expected[HSR_PT_SLAVE_A] = 1235;
+ data->node.seq_start[HSR_PT_SLAVE_B] = 1234;
+ data->node.seq_expected[HSR_PT_SLAVE_B] = 1234;
+ data->node.seq_out[HSR_PT_MASTER] = 1234;
+ data->node.time_out[HSR_PT_MASTER] =
+ jiffies - msecs_to_jiffies(HSR_ENTRY_FORGET_TIME) - 1;
+
+ KUNIT_EXPECT_EQ(test, 0,
+ prp_register_frame_out(&data->port, &data->frame));
+ KUNIT_EXPECT_EQ(test, data->frame.sequence_nr,
+ data->node.seq_out[HSR_PT_MASTER]);
+ KUNIT_EXPECT_EQ(test, jiffies, data->node.time_out[HSR_PT_MASTER]);
+ check_prp_counters(test, data, data->frame.sequence_nr,
+ data->frame.sequence_nr + 1, 1234, 1234);
+}
+
+static void prp_dup_discard_out_of_sequence(struct kunit *test)
+{
+ /* One frame is received out of sequence on both LANs */
+ struct prp_test_data *data = build_prp_test_data(test);
+
+ data->node.seq_start[HSR_PT_SLAVE_A] = 10;
+ data->node.seq_expected[HSR_PT_SLAVE_A] = 10;
+ data->node.seq_start[HSR_PT_SLAVE_B] = 10;
+ data->node.seq_expected[HSR_PT_SLAVE_B] = 10;
+ data->node.seq_out[HSR_PT_MASTER] = 9;
+
+ /* 1st old frame, should be accepted */
+ data->frame.sequence_nr = 8;
+ KUNIT_EXPECT_EQ(test, 0,
+ prp_register_frame_out(&data->port, &data->frame));
+ KUNIT_EXPECT_EQ(test, data->frame.sequence_nr,
+ data->node.seq_out[HSR_PT_MASTER]);
+ check_prp_counters(test, data, data->frame.sequence_nr,
+ data->frame.sequence_nr + 1, 10, 10);
+
+ /* 2nd frame should be dropped */
+ data->frame.sequence_nr = 8;
+ data->port_rcv.type = HSR_PT_SLAVE_B;
+ KUNIT_EXPECT_EQ(test, 1,
+ prp_register_frame_out(&data->port, &data->frame));
+ check_prp_counters(test, data, data->frame.sequence_nr + 1,
+ data->frame.sequence_nr + 1,
+ data->frame.sequence_nr + 1,
+ data->frame.sequence_nr + 1);
+
+ /* Next frame, this is forwarded */
+ data->frame.sequence_nr = 10;
+ data->port_rcv.type = HSR_PT_SLAVE_A;
+ KUNIT_EXPECT_EQ(test, 0,
+ prp_register_frame_out(&data->port, &data->frame));
+ KUNIT_EXPECT_EQ(test, data->frame.sequence_nr,
+ data->node.seq_out[HSR_PT_MASTER]);
+ check_prp_counters(test, data, data->frame.sequence_nr,
+ data->frame.sequence_nr + 1, 9, 9);
+
+ /* and next one is dropped */
+ data->frame.sequence_nr = 10;
+ data->port_rcv.type = HSR_PT_SLAVE_B;
+ KUNIT_EXPECT_EQ(test, 1,
+ prp_register_frame_out(&data->port, &data->frame));
+ check_prp_counters(test, data, data->frame.sequence_nr + 1,
+ data->frame.sequence_nr + 1,
+ data->frame.sequence_nr + 1,
+ data->frame.sequence_nr + 1);
+}
+
+static void prp_dup_discard_lan_b_late(struct kunit *test)
+{
+ /* LAN B is behind */
+ struct prp_test_data *data = build_prp_test_data(test);
+
+ data->node.seq_start[HSR_PT_SLAVE_A] = 9;
+ data->node.seq_expected[HSR_PT_SLAVE_A] = 9;
+ data->node.seq_start[HSR_PT_SLAVE_B] = 9;
+ data->node.seq_expected[HSR_PT_SLAVE_B] = 9;
+ data->node.seq_out[HSR_PT_MASTER] = 8;
+
+ data->frame.sequence_nr = 9;
+ KUNIT_EXPECT_EQ(test, 0,
+ prp_register_frame_out(&data->port, &data->frame));
+ KUNIT_EXPECT_EQ(test, data->frame.sequence_nr,
+ data->node.seq_out[HSR_PT_MASTER]);
+ check_prp_counters(test, data, 9, 10, 9, 9);
+
+ data->frame.sequence_nr = 10;
+ KUNIT_EXPECT_EQ(test, 0,
+ prp_register_frame_out(&data->port, &data->frame));
+ KUNIT_EXPECT_EQ(test, data->frame.sequence_nr,
+ data->node.seq_out[HSR_PT_MASTER]);
+ check_prp_counters(test, data, 9, 11, 9, 9);
+
+ data->frame.sequence_nr = 9;
+ data->port_rcv.type = HSR_PT_SLAVE_B;
+ KUNIT_EXPECT_EQ(test, 1,
+ prp_register_frame_out(&data->port, &data->frame));
+ check_prp_counters(test, data, 10, 11, 10, 10);
+
+ data->frame.sequence_nr = 10;
+ data->port_rcv.type = HSR_PT_SLAVE_B;
+ KUNIT_EXPECT_EQ(test, 1,
+ prp_register_frame_out(&data->port, &data->frame));
+ check_prp_counters(test, data, 11, 11, 11, 11);
+}
+
+static struct kunit_case prp_dup_discard_test_cases[] = {
+ KUNIT_CASE(prp_dup_discard_forward),
+ KUNIT_CASE(prp_dup_discard_inside_dropwindow),
+ KUNIT_CASE(prp_dup_discard_node_timeout),
+ KUNIT_CASE(prp_dup_discard_out_of_sequence),
+ KUNIT_CASE(prp_dup_discard_lan_b_late),
+ {}
+};
+
+static struct kunit_suite prp_dup_discard_suite = {
+ .name = "prp_duplicate_discard",
+ .test_cases = prp_dup_discard_test_cases,
+};
+
+kunit_test_suite(prp_dup_discard_suite);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("KUnit tests for PRP duplicate discard");
+MODULE_AUTHOR("Jaakko Karrenpalo <jkarrenpalo@gmail.com>");
diff --git a/net/ieee80211/Kconfig b/net/ieee80211/Kconfig
deleted file mode 100644
index a64be6cdf078..000000000000
--- a/net/ieee80211/Kconfig
+++ /dev/null
@@ -1,72 +0,0 @@
-config IEEE80211
- tristate "Generic IEEE 802.11 Networking Stack"
- ---help---
- This option enables the hardware independent IEEE 802.11
- networking stack.
-
-config IEEE80211_DEBUG
- bool "Enable full debugging output"
- depends on IEEE80211
- ---help---
- This option will enable debug tracing output for the
- ieee80211 network stack.
-
- This will result in the kernel module being ~70k larger. You
- can control which debug output is sent to the kernel log by
- setting the value in
-
- /proc/net/ieee80211/debug_level
-
- For example:
-
- % echo 0x00000FFO > /proc/net/ieee80211/debug_level
-
- For a list of values you can assign to debug_level, you
- can look at the bit mask values in <net/ieee80211.h>
-
- If you are not trying to debug or develop the ieee80211
- subsystem, you most likely want to say N here.
-
-config IEEE80211_CRYPT_WEP
- tristate "IEEE 802.11 WEP encryption (802.1x)"
- depends on IEEE80211
- select CRYPTO
- select CRYPTO_ARC4
- select CRYPTO_ECB
- select CRC32
- ---help---
- Include software based cipher suites in support of IEEE
- 802.11's WEP. This is needed for WEP as well as 802.1x.
-
- This can be compiled as a modules and it will be called
- "ieee80211_crypt_wep".
-
-config IEEE80211_CRYPT_CCMP
- tristate "IEEE 802.11i CCMP support"
- depends on IEEE80211
- select CRYPTO
- select CRYPTO_AES
- ---help---
- Include software based cipher suites in support of IEEE 802.11i
- (aka TGi, WPA, WPA2, WPA-PSK, etc.) for use with CCMP enabled
- networks.
-
- This can be compiled as a modules and it will be called
- "ieee80211_crypt_ccmp".
-
-config IEEE80211_CRYPT_TKIP
- tristate "IEEE 802.11i TKIP encryption"
- depends on IEEE80211 && NET_RADIO
- select CRYPTO
- select CRYPTO_MICHAEL_MIC
- select CRYPTO_ECB
- select CRC32
- ---help---
- Include software based cipher suites in support of IEEE 802.11i
- (aka TGi, WPA, WPA2, WPA-PSK, etc.) for use with TKIP enabled
- networks.
-
- This can be compiled as a modules and it will be called
- "ieee80211_crypt_tkip".
-
-source "net/ieee80211/softmac/Kconfig"
diff --git a/net/ieee80211/Makefile b/net/ieee80211/Makefile
deleted file mode 100644
index 796a7c76ee48..000000000000
--- a/net/ieee80211/Makefile
+++ /dev/null
@@ -1,13 +0,0 @@
-obj-$(CONFIG_IEEE80211) += ieee80211.o
-obj-$(CONFIG_IEEE80211) += ieee80211_crypt.o
-obj-$(CONFIG_IEEE80211_CRYPT_WEP) += ieee80211_crypt_wep.o
-obj-$(CONFIG_IEEE80211_CRYPT_CCMP) += ieee80211_crypt_ccmp.o
-obj-$(CONFIG_IEEE80211_CRYPT_TKIP) += ieee80211_crypt_tkip.o
-ieee80211-objs := \
- ieee80211_module.o \
- ieee80211_tx.o \
- ieee80211_rx.o \
- ieee80211_wx.o \
- ieee80211_geo.o
-
-obj-$(CONFIG_IEEE80211_SOFTMAC) += softmac/
diff --git a/net/ieee80211/ieee80211_crypt.c b/net/ieee80211/ieee80211_crypt.c
deleted file mode 100644
index 5ed0a98b2d76..000000000000
--- a/net/ieee80211/ieee80211_crypt.c
+++ /dev/null
@@ -1,206 +0,0 @@
-/*
- * Host AP crypto routines
- *
- * Copyright (c) 2002-2003, Jouni Malinen <jkmaline@cc.hut.fi>
- * Portions Copyright (C) 2004, Intel Corporation <jketreno@linux.intel.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation. See README and COPYING for
- * more details.
- *
- */
-
-#include <linux/errno.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/string.h>
-#include <net/ieee80211.h>
-
-MODULE_AUTHOR("Jouni Malinen");
-MODULE_DESCRIPTION("HostAP crypto");
-MODULE_LICENSE("GPL");
-
-struct ieee80211_crypto_alg {
- struct list_head list;
- struct ieee80211_crypto_ops *ops;
-};
-
-static LIST_HEAD(ieee80211_crypto_algs);
-static DEFINE_SPINLOCK(ieee80211_crypto_lock);
-
-void ieee80211_crypt_deinit_entries(struct ieee80211_device *ieee, int force)
-{
- struct ieee80211_crypt_data *entry, *next;
- unsigned long flags;
-
- spin_lock_irqsave(&ieee->lock, flags);
- list_for_each_entry_safe(entry, next, &ieee->crypt_deinit_list, list) {
- if (atomic_read(&entry->refcnt) != 0 && !force)
- continue;
-
- list_del(&entry->list);
-
- if (entry->ops) {
- entry->ops->deinit(entry->priv);
- module_put(entry->ops->owner);
- }
- kfree(entry);
- }
- spin_unlock_irqrestore(&ieee->lock, flags);
-}
-
-/* After this, crypt_deinit_list won't accept new members */
-void ieee80211_crypt_quiescing(struct ieee80211_device *ieee)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&ieee->lock, flags);
- ieee->crypt_quiesced = 1;
- spin_unlock_irqrestore(&ieee->lock, flags);
-}
-
-void ieee80211_crypt_deinit_handler(unsigned long data)
-{
- struct ieee80211_device *ieee = (struct ieee80211_device *)data;
- unsigned long flags;
-
- ieee80211_crypt_deinit_entries(ieee, 0);
-
- spin_lock_irqsave(&ieee->lock, flags);
- if (!list_empty(&ieee->crypt_deinit_list) && !ieee->crypt_quiesced) {
- printk(KERN_DEBUG "%s: entries remaining in delayed crypt "
- "deletion list\n", ieee->dev->name);
- ieee->crypt_deinit_timer.expires = jiffies + HZ;
- add_timer(&ieee->crypt_deinit_timer);
- }
- spin_unlock_irqrestore(&ieee->lock, flags);
-}
-
-void ieee80211_crypt_delayed_deinit(struct ieee80211_device *ieee,
- struct ieee80211_crypt_data **crypt)
-{
- struct ieee80211_crypt_data *tmp;
- unsigned long flags;
-
- if (*crypt == NULL)
- return;
-
- tmp = *crypt;
- *crypt = NULL;
-
- /* must not run ops->deinit() while there may be pending encrypt or
- * decrypt operations. Use a list of delayed deinits to avoid needing
- * locking. */
-
- spin_lock_irqsave(&ieee->lock, flags);
- if (!ieee->crypt_quiesced) {
- list_add(&tmp->list, &ieee->crypt_deinit_list);
- if (!timer_pending(&ieee->crypt_deinit_timer)) {
- ieee->crypt_deinit_timer.expires = jiffies + HZ;
- add_timer(&ieee->crypt_deinit_timer);
- }
- }
- spin_unlock_irqrestore(&ieee->lock, flags);
-}
-
-int ieee80211_register_crypto_ops(struct ieee80211_crypto_ops *ops)
-{
- unsigned long flags;
- struct ieee80211_crypto_alg *alg;
-
- alg = kzalloc(sizeof(*alg), GFP_KERNEL);
- if (alg == NULL)
- return -ENOMEM;
-
- alg->ops = ops;
-
- spin_lock_irqsave(&ieee80211_crypto_lock, flags);
- list_add(&alg->list, &ieee80211_crypto_algs);
- spin_unlock_irqrestore(&ieee80211_crypto_lock, flags);
-
- printk(KERN_DEBUG "ieee80211_crypt: registered algorithm '%s'\n",
- ops->name);
-
- return 0;
-}
-
-int ieee80211_unregister_crypto_ops(struct ieee80211_crypto_ops *ops)
-{
- struct ieee80211_crypto_alg *alg;
- unsigned long flags;
-
- spin_lock_irqsave(&ieee80211_crypto_lock, flags);
- list_for_each_entry(alg, &ieee80211_crypto_algs, list) {
- if (alg->ops == ops)
- goto found;
- }
- spin_unlock_irqrestore(&ieee80211_crypto_lock, flags);
- return -EINVAL;
-
- found:
- printk(KERN_DEBUG "ieee80211_crypt: unregistered algorithm "
- "'%s'\n", ops->name);
- list_del(&alg->list);
- spin_unlock_irqrestore(&ieee80211_crypto_lock, flags);
- kfree(alg);
- return 0;
-}
-
-struct ieee80211_crypto_ops *ieee80211_get_crypto_ops(const char *name)
-{
- struct ieee80211_crypto_alg *alg;
- unsigned long flags;
-
- spin_lock_irqsave(&ieee80211_crypto_lock, flags);
- list_for_each_entry(alg, &ieee80211_crypto_algs, list) {
- if (strcmp(alg->ops->name, name) == 0)
- goto found;
- }
- spin_unlock_irqrestore(&ieee80211_crypto_lock, flags);
- return NULL;
-
- found:
- spin_unlock_irqrestore(&ieee80211_crypto_lock, flags);
- return alg->ops;
-}
-
-static void *ieee80211_crypt_null_init(int keyidx)
-{
- return (void *)1;
-}
-
-static void ieee80211_crypt_null_deinit(void *priv)
-{
-}
-
-static struct ieee80211_crypto_ops ieee80211_crypt_null = {
- .name = "NULL",
- .init = ieee80211_crypt_null_init,
- .deinit = ieee80211_crypt_null_deinit,
- .owner = THIS_MODULE,
-};
-
-static int __init ieee80211_crypto_init(void)
-{
- return ieee80211_register_crypto_ops(&ieee80211_crypt_null);
-}
-
-static void __exit ieee80211_crypto_deinit(void)
-{
- ieee80211_unregister_crypto_ops(&ieee80211_crypt_null);
- BUG_ON(!list_empty(&ieee80211_crypto_algs));
-}
-
-EXPORT_SYMBOL(ieee80211_crypt_deinit_entries);
-EXPORT_SYMBOL(ieee80211_crypt_deinit_handler);
-EXPORT_SYMBOL(ieee80211_crypt_delayed_deinit);
-EXPORT_SYMBOL(ieee80211_crypt_quiescing);
-
-EXPORT_SYMBOL(ieee80211_register_crypto_ops);
-EXPORT_SYMBOL(ieee80211_unregister_crypto_ops);
-EXPORT_SYMBOL(ieee80211_get_crypto_ops);
-
-module_init(ieee80211_crypto_init);
-module_exit(ieee80211_crypto_deinit);
diff --git a/net/ieee80211/ieee80211_crypt_ccmp.c b/net/ieee80211/ieee80211_crypt_ccmp.c
deleted file mode 100644
index 35aa3426c3fa..000000000000
--- a/net/ieee80211/ieee80211_crypt_ccmp.c
+++ /dev/null
@@ -1,486 +0,0 @@
-/*
- * Host AP crypt: host-based CCMP encryption implementation for Host AP driver
- *
- * Copyright (c) 2003-2004, Jouni Malinen <jkmaline@cc.hut.fi>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation. See README and COPYING for
- * more details.
- */
-
-#include <linux/err.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/random.h>
-#include <linux/skbuff.h>
-#include <linux/netdevice.h>
-#include <linux/if_ether.h>
-#include <linux/if_arp.h>
-#include <asm/string.h>
-#include <linux/wireless.h>
-
-#include <net/ieee80211.h>
-
-#include <linux/crypto.h>
-#include <asm/scatterlist.h>
-
-MODULE_AUTHOR("Jouni Malinen");
-MODULE_DESCRIPTION("Host AP crypt: CCMP");
-MODULE_LICENSE("GPL");
-
-#define AES_BLOCK_LEN 16
-#define CCMP_HDR_LEN 8
-#define CCMP_MIC_LEN 8
-#define CCMP_TK_LEN 16
-#define CCMP_PN_LEN 6
-
-struct ieee80211_ccmp_data {
- u8 key[CCMP_TK_LEN];
- int key_set;
-
- u8 tx_pn[CCMP_PN_LEN];
- u8 rx_pn[CCMP_PN_LEN];
-
- u32 dot11RSNAStatsCCMPFormatErrors;
- u32 dot11RSNAStatsCCMPReplays;
- u32 dot11RSNAStatsCCMPDecryptErrors;
-
- int key_idx;
-
- struct crypto_cipher *tfm;
-
- /* scratch buffers for virt_to_page() (crypto API) */
- u8 tx_b0[AES_BLOCK_LEN], tx_b[AES_BLOCK_LEN],
- tx_e[AES_BLOCK_LEN], tx_s0[AES_BLOCK_LEN];
- u8 rx_b0[AES_BLOCK_LEN], rx_b[AES_BLOCK_LEN], rx_a[AES_BLOCK_LEN];
-};
-
-static inline void ieee80211_ccmp_aes_encrypt(struct crypto_cipher *tfm,
- const u8 pt[16], u8 ct[16])
-{
- crypto_cipher_encrypt_one(tfm, ct, pt);
-}
-
-static void *ieee80211_ccmp_init(int key_idx)
-{
- struct ieee80211_ccmp_data *priv;
-
- priv = kzalloc(sizeof(*priv), GFP_ATOMIC);
- if (priv == NULL)
- goto fail;
- priv->key_idx = key_idx;
-
- priv->tfm = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC);
- if (IS_ERR(priv->tfm)) {
- printk(KERN_DEBUG "ieee80211_crypt_ccmp: could not allocate "
- "crypto API aes\n");
- priv->tfm = NULL;
- goto fail;
- }
-
- return priv;
-
- fail:
- if (priv) {
- if (priv->tfm)
- crypto_free_cipher(priv->tfm);
- kfree(priv);
- }
-
- return NULL;
-}
-
-static void ieee80211_ccmp_deinit(void *priv)
-{
- struct ieee80211_ccmp_data *_priv = priv;
- if (_priv && _priv->tfm)
- crypto_free_cipher(_priv->tfm);
- kfree(priv);
-}
-
-static inline void xor_block(u8 * b, u8 * a, size_t len)
-{
- int i;
- for (i = 0; i < len; i++)
- b[i] ^= a[i];
-}
-
-static void ccmp_init_blocks(struct crypto_cipher *tfm,
- struct ieee80211_hdr_4addr *hdr,
- u8 * pn, size_t dlen, u8 * b0, u8 * auth, u8 * s0)
-{
- u8 *pos, qc = 0;
- size_t aad_len;
- u16 fc;
- int a4_included, qc_included;
- u8 aad[2 * AES_BLOCK_LEN];
-
- fc = le16_to_cpu(hdr->frame_ctl);
- a4_included = ((fc & (IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS)) ==
- (IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS));
- qc_included = ((WLAN_FC_GET_TYPE(fc) == IEEE80211_FTYPE_DATA) &&
- (WLAN_FC_GET_STYPE(fc) & IEEE80211_STYPE_QOS_DATA));
- aad_len = 22;
- if (a4_included)
- aad_len += 6;
- if (qc_included) {
- pos = (u8 *) & hdr->addr4;
- if (a4_included)
- pos += 6;
- qc = *pos & 0x0f;
- aad_len += 2;
- }
-
- /* CCM Initial Block:
- * Flag (Include authentication header, M=3 (8-octet MIC),
- * L=1 (2-octet Dlen))
- * Nonce: 0x00 | A2 | PN
- * Dlen */
- b0[0] = 0x59;
- b0[1] = qc;
- memcpy(b0 + 2, hdr->addr2, ETH_ALEN);
- memcpy(b0 + 8, pn, CCMP_PN_LEN);
- b0[14] = (dlen >> 8) & 0xff;
- b0[15] = dlen & 0xff;
-
- /* AAD:
- * FC with bits 4..6 and 11..13 masked to zero; 14 is always one
- * A1 | A2 | A3
- * SC with bits 4..15 (seq#) masked to zero
- * A4 (if present)
- * QC (if present)
- */
- pos = (u8 *) hdr;
- aad[0] = 0; /* aad_len >> 8 */
- aad[1] = aad_len & 0xff;
- aad[2] = pos[0] & 0x8f;
- aad[3] = pos[1] & 0xc7;
- memcpy(aad + 4, hdr->addr1, 3 * ETH_ALEN);
- pos = (u8 *) & hdr->seq_ctl;
- aad[22] = pos[0] & 0x0f;
- aad[23] = 0; /* all bits masked */
- memset(aad + 24, 0, 8);
- if (a4_included)
- memcpy(aad + 24, hdr->addr4, ETH_ALEN);
- if (qc_included) {
- aad[a4_included ? 30 : 24] = qc;
- /* rest of QC masked */
- }
-
- /* Start with the first block and AAD */
- ieee80211_ccmp_aes_encrypt(tfm, b0, auth);
- xor_block(auth, aad, AES_BLOCK_LEN);
- ieee80211_ccmp_aes_encrypt(tfm, auth, auth);
- xor_block(auth, &aad[AES_BLOCK_LEN], AES_BLOCK_LEN);
- ieee80211_ccmp_aes_encrypt(tfm, auth, auth);
- b0[0] &= 0x07;
- b0[14] = b0[15] = 0;
- ieee80211_ccmp_aes_encrypt(tfm, b0, s0);
-}
-
-static int ieee80211_ccmp_hdr(struct sk_buff *skb, int hdr_len,
- u8 *aeskey, int keylen, void *priv)
-{
- struct ieee80211_ccmp_data *key = priv;
- int i;
- u8 *pos;
-
- if (skb_headroom(skb) < CCMP_HDR_LEN || skb->len < hdr_len)
- return -1;
-
- if (aeskey != NULL && keylen >= CCMP_TK_LEN)
- memcpy(aeskey, key->key, CCMP_TK_LEN);
-
- pos = skb_push(skb, CCMP_HDR_LEN);
- memmove(pos, pos + CCMP_HDR_LEN, hdr_len);
- pos += hdr_len;
-
- i = CCMP_PN_LEN - 1;
- while (i >= 0) {
- key->tx_pn[i]++;
- if (key->tx_pn[i] != 0)
- break;
- i--;
- }
-
- *pos++ = key->tx_pn[5];
- *pos++ = key->tx_pn[4];
- *pos++ = 0;
- *pos++ = (key->key_idx << 6) | (1 << 5) /* Ext IV included */ ;
- *pos++ = key->tx_pn[3];
- *pos++ = key->tx_pn[2];
- *pos++ = key->tx_pn[1];
- *pos++ = key->tx_pn[0];
-
- return CCMP_HDR_LEN;
-}
-
-static int ieee80211_ccmp_encrypt(struct sk_buff *skb, int hdr_len, void *priv)
-{
- struct ieee80211_ccmp_data *key = priv;
- int data_len, i, blocks, last, len;
- u8 *pos, *mic;
- struct ieee80211_hdr_4addr *hdr;
- u8 *b0 = key->tx_b0;
- u8 *b = key->tx_b;
- u8 *e = key->tx_e;
- u8 *s0 = key->tx_s0;
-
- if (skb_tailroom(skb) < CCMP_MIC_LEN || skb->len < hdr_len)
- return -1;
-
- data_len = skb->len - hdr_len;
- len = ieee80211_ccmp_hdr(skb, hdr_len, NULL, 0, priv);
- if (len < 0)
- return -1;
-
- pos = skb->data + hdr_len + CCMP_HDR_LEN;
- mic = skb_put(skb, CCMP_MIC_LEN);
- hdr = (struct ieee80211_hdr_4addr *)skb->data;
- ccmp_init_blocks(key->tfm, hdr, key->tx_pn, data_len, b0, b, s0);
-
- blocks = (data_len + AES_BLOCK_LEN - 1) / AES_BLOCK_LEN;
- last = data_len % AES_BLOCK_LEN;
-
- for (i = 1; i <= blocks; i++) {
- len = (i == blocks && last) ? last : AES_BLOCK_LEN;
- /* Authentication */
- xor_block(b, pos, len);
- ieee80211_ccmp_aes_encrypt(key->tfm, b, b);
- /* Encryption, with counter */
- b0[14] = (i >> 8) & 0xff;
- b0[15] = i & 0xff;
- ieee80211_ccmp_aes_encrypt(key->tfm, b0, e);
- xor_block(pos, e, len);
- pos += len;
- }
-
- for (i = 0; i < CCMP_MIC_LEN; i++)
- mic[i] = b[i] ^ s0[i];
-
- return 0;
-}
-
-/*
- * deal with seq counter wrapping correctly.
- * refer to timer_after() for jiffies wrapping handling
- */
-static inline int ccmp_replay_check(u8 *pn_n, u8 *pn_o)
-{
- u32 iv32_n, iv16_n;
- u32 iv32_o, iv16_o;
-
- iv32_n = (pn_n[0] << 24) | (pn_n[1] << 16) | (pn_n[2] << 8) | pn_n[3];
- iv16_n = (pn_n[4] << 8) | pn_n[5];
-
- iv32_o = (pn_o[0] << 24) | (pn_o[1] << 16) | (pn_o[2] << 8) | pn_o[3];
- iv16_o = (pn_o[4] << 8) | pn_o[5];
-
- if ((s32)iv32_n - (s32)iv32_o < 0 ||
- (iv32_n == iv32_o && iv16_n <= iv16_o))
- return 1;
- return 0;
-}
-
-static int ieee80211_ccmp_decrypt(struct sk_buff *skb, int hdr_len, void *priv)
-{
- struct ieee80211_ccmp_data *key = priv;
- u8 keyidx, *pos;
- struct ieee80211_hdr_4addr *hdr;
- u8 *b0 = key->rx_b0;
- u8 *b = key->rx_b;
- u8 *a = key->rx_a;
- u8 pn[6];
- int i, blocks, last, len;
- size_t data_len = skb->len - hdr_len - CCMP_HDR_LEN - CCMP_MIC_LEN;
- u8 *mic = skb->data + skb->len - CCMP_MIC_LEN;
-
- if (skb->len < hdr_len + CCMP_HDR_LEN + CCMP_MIC_LEN) {
- key->dot11RSNAStatsCCMPFormatErrors++;
- return -1;
- }
-
- hdr = (struct ieee80211_hdr_4addr *)skb->data;
- pos = skb->data + hdr_len;
- keyidx = pos[3];
- if (!(keyidx & (1 << 5))) {
- if (net_ratelimit()) {
- printk(KERN_DEBUG "CCMP: received packet without ExtIV"
- " flag from " MAC_FMT "\n", MAC_ARG(hdr->addr2));
- }
- key->dot11RSNAStatsCCMPFormatErrors++;
- return -2;
- }
- keyidx >>= 6;
- if (key->key_idx != keyidx) {
- printk(KERN_DEBUG "CCMP: RX tkey->key_idx=%d frame "
- "keyidx=%d priv=%p\n", key->key_idx, keyidx, priv);
- return -6;
- }
- if (!key->key_set) {
- if (net_ratelimit()) {
- printk(KERN_DEBUG "CCMP: received packet from " MAC_FMT
- " with keyid=%d that does not have a configured"
- " key\n", MAC_ARG(hdr->addr2), keyidx);
- }
- return -3;
- }
-
- pn[0] = pos[7];
- pn[1] = pos[6];
- pn[2] = pos[5];
- pn[3] = pos[4];
- pn[4] = pos[1];
- pn[5] = pos[0];
- pos += 8;
-
- if (ccmp_replay_check(pn, key->rx_pn)) {
- if (net_ratelimit()) {
- printk(KERN_DEBUG "CCMP: replay detected: STA=" MAC_FMT
- " previous PN %02x%02x%02x%02x%02x%02x "
- "received PN %02x%02x%02x%02x%02x%02x\n",
- MAC_ARG(hdr->addr2), MAC_ARG(key->rx_pn),
- MAC_ARG(pn));
- }
- key->dot11RSNAStatsCCMPReplays++;
- return -4;
- }
-
- ccmp_init_blocks(key->tfm, hdr, pn, data_len, b0, a, b);
- xor_block(mic, b, CCMP_MIC_LEN);
-
- blocks = (data_len + AES_BLOCK_LEN - 1) / AES_BLOCK_LEN;
- last = data_len % AES_BLOCK_LEN;
-
- for (i = 1; i <= blocks; i++) {
- len = (i == blocks && last) ? last : AES_BLOCK_LEN;
- /* Decrypt, with counter */
- b0[14] = (i >> 8) & 0xff;
- b0[15] = i & 0xff;
- ieee80211_ccmp_aes_encrypt(key->tfm, b0, b);
- xor_block(pos, b, len);
- /* Authentication */
- xor_block(a, pos, len);
- ieee80211_ccmp_aes_encrypt(key->tfm, a, a);
- pos += len;
- }
-
- if (memcmp(mic, a, CCMP_MIC_LEN) != 0) {
- if (net_ratelimit()) {
- printk(KERN_DEBUG "CCMP: decrypt failed: STA="
- MAC_FMT "\n", MAC_ARG(hdr->addr2));
- }
- key->dot11RSNAStatsCCMPDecryptErrors++;
- return -5;
- }
-
- memcpy(key->rx_pn, pn, CCMP_PN_LEN);
-
- /* Remove hdr and MIC */
- memmove(skb->data + CCMP_HDR_LEN, skb->data, hdr_len);
- skb_pull(skb, CCMP_HDR_LEN);
- skb_trim(skb, skb->len - CCMP_MIC_LEN);
-
- return keyidx;
-}
-
-static int ieee80211_ccmp_set_key(void *key, int len, u8 * seq, void *priv)
-{
- struct ieee80211_ccmp_data *data = priv;
- int keyidx;
- struct crypto_cipher *tfm = data->tfm;
-
- keyidx = data->key_idx;
- memset(data, 0, sizeof(*data));
- data->key_idx = keyidx;
- data->tfm = tfm;
- if (len == CCMP_TK_LEN) {
- memcpy(data->key, key, CCMP_TK_LEN);
- data->key_set = 1;
- if (seq) {
- data->rx_pn[0] = seq[5];
- data->rx_pn[1] = seq[4];
- data->rx_pn[2] = seq[3];
- data->rx_pn[3] = seq[2];
- data->rx_pn[4] = seq[1];
- data->rx_pn[5] = seq[0];
- }
- crypto_cipher_setkey(data->tfm, data->key, CCMP_TK_LEN);
- } else if (len == 0)
- data->key_set = 0;
- else
- return -1;
-
- return 0;
-}
-
-static int ieee80211_ccmp_get_key(void *key, int len, u8 * seq, void *priv)
-{
- struct ieee80211_ccmp_data *data = priv;
-
- if (len < CCMP_TK_LEN)
- return -1;
-
- if (!data->key_set)
- return 0;
- memcpy(key, data->key, CCMP_TK_LEN);
-
- if (seq) {
- seq[0] = data->tx_pn[5];
- seq[1] = data->tx_pn[4];
- seq[2] = data->tx_pn[3];
- seq[3] = data->tx_pn[2];
- seq[4] = data->tx_pn[1];
- seq[5] = data->tx_pn[0];
- }
-
- return CCMP_TK_LEN;
-}
-
-static char *ieee80211_ccmp_print_stats(char *p, void *priv)
-{
- struct ieee80211_ccmp_data *ccmp = priv;
- p += sprintf(p, "key[%d] alg=CCMP key_set=%d "
- "tx_pn=%02x%02x%02x%02x%02x%02x "
- "rx_pn=%02x%02x%02x%02x%02x%02x "
- "format_errors=%d replays=%d decrypt_errors=%d\n",
- ccmp->key_idx, ccmp->key_set,
- MAC_ARG(ccmp->tx_pn), MAC_ARG(ccmp->rx_pn),
- ccmp->dot11RSNAStatsCCMPFormatErrors,
- ccmp->dot11RSNAStatsCCMPReplays,
- ccmp->dot11RSNAStatsCCMPDecryptErrors);
-
- return p;
-}
-
-static struct ieee80211_crypto_ops ieee80211_crypt_ccmp = {
- .name = "CCMP",
- .init = ieee80211_ccmp_init,
- .deinit = ieee80211_ccmp_deinit,
- .build_iv = ieee80211_ccmp_hdr,
- .encrypt_mpdu = ieee80211_ccmp_encrypt,
- .decrypt_mpdu = ieee80211_ccmp_decrypt,
- .encrypt_msdu = NULL,
- .decrypt_msdu = NULL,
- .set_key = ieee80211_ccmp_set_key,
- .get_key = ieee80211_ccmp_get_key,
- .print_stats = ieee80211_ccmp_print_stats,
- .extra_mpdu_prefix_len = CCMP_HDR_LEN,
- .extra_mpdu_postfix_len = CCMP_MIC_LEN,
- .owner = THIS_MODULE,
-};
-
-static int __init ieee80211_crypto_ccmp_init(void)
-{
- return ieee80211_register_crypto_ops(&ieee80211_crypt_ccmp);
-}
-
-static void __exit ieee80211_crypto_ccmp_exit(void)
-{
- ieee80211_unregister_crypto_ops(&ieee80211_crypt_ccmp);
-}
-
-module_init(ieee80211_crypto_ccmp_init);
-module_exit(ieee80211_crypto_ccmp_exit);
diff --git a/net/ieee80211/ieee80211_crypt_tkip.c b/net/ieee80211/ieee80211_crypt_tkip.c
deleted file mode 100644
index 4200ec509866..000000000000
--- a/net/ieee80211/ieee80211_crypt_tkip.c
+++ /dev/null
@@ -1,791 +0,0 @@
-/*
- * Host AP crypt: host-based TKIP encryption implementation for Host AP driver
- *
- * Copyright (c) 2003-2004, Jouni Malinen <jkmaline@cc.hut.fi>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation. See README and COPYING for
- * more details.
- */
-
-#include <linux/err.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/random.h>
-#include <linux/skbuff.h>
-#include <linux/netdevice.h>
-#include <linux/if_ether.h>
-#include <linux/if_arp.h>
-#include <asm/string.h>
-
-#include <net/ieee80211.h>
-
-#include <linux/crypto.h>
-#include <asm/scatterlist.h>
-#include <linux/crc32.h>
-
-MODULE_AUTHOR("Jouni Malinen");
-MODULE_DESCRIPTION("Host AP crypt: TKIP");
-MODULE_LICENSE("GPL");
-
-struct ieee80211_tkip_data {
-#define TKIP_KEY_LEN 32
- u8 key[TKIP_KEY_LEN];
- int key_set;
-
- u32 tx_iv32;
- u16 tx_iv16;
- u16 tx_ttak[5];
- int tx_phase1_done;
-
- u32 rx_iv32;
- u16 rx_iv16;
- u16 rx_ttak[5];
- int rx_phase1_done;
- u32 rx_iv32_new;
- u16 rx_iv16_new;
-
- u32 dot11RSNAStatsTKIPReplays;
- u32 dot11RSNAStatsTKIPICVErrors;
- u32 dot11RSNAStatsTKIPLocalMICFailures;
-
- int key_idx;
-
- struct crypto_blkcipher *rx_tfm_arc4;
- struct crypto_hash *rx_tfm_michael;
- struct crypto_blkcipher *tx_tfm_arc4;
- struct crypto_hash *tx_tfm_michael;
-
- /* scratch buffers for virt_to_page() (crypto API) */
- u8 rx_hdr[16], tx_hdr[16];
-
- unsigned long flags;
-};
-
-static unsigned long ieee80211_tkip_set_flags(unsigned long flags, void *priv)
-{
- struct ieee80211_tkip_data *_priv = priv;
- unsigned long old_flags = _priv->flags;
- _priv->flags = flags;
- return old_flags;
-}
-
-static unsigned long ieee80211_tkip_get_flags(void *priv)
-{
- struct ieee80211_tkip_data *_priv = priv;
- return _priv->flags;
-}
-
-static void *ieee80211_tkip_init(int key_idx)
-{
- struct ieee80211_tkip_data *priv;
-
- priv = kzalloc(sizeof(*priv), GFP_ATOMIC);
- if (priv == NULL)
- goto fail;
-
- priv->key_idx = key_idx;
-
- priv->tx_tfm_arc4 = crypto_alloc_blkcipher("ecb(arc4)", 0,
- CRYPTO_ALG_ASYNC);
- if (IS_ERR(priv->tx_tfm_arc4)) {
- printk(KERN_DEBUG "ieee80211_crypt_tkip: could not allocate "
- "crypto API arc4\n");
- priv->tx_tfm_arc4 = NULL;
- goto fail;
- }
-
- priv->tx_tfm_michael = crypto_alloc_hash("michael_mic", 0,
- CRYPTO_ALG_ASYNC);
- if (IS_ERR(priv->tx_tfm_michael)) {
- printk(KERN_DEBUG "ieee80211_crypt_tkip: could not allocate "
- "crypto API michael_mic\n");
- priv->tx_tfm_michael = NULL;
- goto fail;
- }
-
- priv->rx_tfm_arc4 = crypto_alloc_blkcipher("ecb(arc4)", 0,
- CRYPTO_ALG_ASYNC);
- if (IS_ERR(priv->rx_tfm_arc4)) {
- printk(KERN_DEBUG "ieee80211_crypt_tkip: could not allocate "
- "crypto API arc4\n");
- priv->rx_tfm_arc4 = NULL;
- goto fail;
- }
-
- priv->rx_tfm_michael = crypto_alloc_hash("michael_mic", 0,
- CRYPTO_ALG_ASYNC);
- if (IS_ERR(priv->rx_tfm_michael)) {
- printk(KERN_DEBUG "ieee80211_crypt_tkip: could not allocate "
- "crypto API michael_mic\n");
- priv->rx_tfm_michael = NULL;
- goto fail;
- }
-
- return priv;
-
- fail:
- if (priv) {
- if (priv->tx_tfm_michael)
- crypto_free_hash(priv->tx_tfm_michael);
- if (priv->tx_tfm_arc4)
- crypto_free_blkcipher(priv->tx_tfm_arc4);
- if (priv->rx_tfm_michael)
- crypto_free_hash(priv->rx_tfm_michael);
- if (priv->rx_tfm_arc4)
- crypto_free_blkcipher(priv->rx_tfm_arc4);
- kfree(priv);
- }
-
- return NULL;
-}
-
-static void ieee80211_tkip_deinit(void *priv)
-{
- struct ieee80211_tkip_data *_priv = priv;
- if (_priv) {
- if (_priv->tx_tfm_michael)
- crypto_free_hash(_priv->tx_tfm_michael);
- if (_priv->tx_tfm_arc4)
- crypto_free_blkcipher(_priv->tx_tfm_arc4);
- if (_priv->rx_tfm_michael)
- crypto_free_hash(_priv->rx_tfm_michael);
- if (_priv->rx_tfm_arc4)
- crypto_free_blkcipher(_priv->rx_tfm_arc4);
- }
- kfree(priv);
-}
-
-static inline u16 RotR1(u16 val)
-{
- return (val >> 1) | (val << 15);
-}
-
-static inline u8 Lo8(u16 val)
-{
- return val & 0xff;
-}
-
-static inline u8 Hi8(u16 val)
-{
- return val >> 8;
-}
-
-static inline u16 Lo16(u32 val)
-{
- return val & 0xffff;
-}
-
-static inline u16 Hi16(u32 val)
-{
- return val >> 16;
-}
-
-static inline u16 Mk16(u8 hi, u8 lo)
-{
- return lo | (((u16) hi) << 8);
-}
-
-static inline u16 Mk16_le(u16 * v)
-{
- return le16_to_cpu(*v);
-}
-
-static const u16 Sbox[256] = {
- 0xC6A5, 0xF884, 0xEE99, 0xF68D, 0xFF0D, 0xD6BD, 0xDEB1, 0x9154,
- 0x6050, 0x0203, 0xCEA9, 0x567D, 0xE719, 0xB562, 0x4DE6, 0xEC9A,
- 0x8F45, 0x1F9D, 0x8940, 0xFA87, 0xEF15, 0xB2EB, 0x8EC9, 0xFB0B,
- 0x41EC, 0xB367, 0x5FFD, 0x45EA, 0x23BF, 0x53F7, 0xE496, 0x9B5B,
- 0x75C2, 0xE11C, 0x3DAE, 0x4C6A, 0x6C5A, 0x7E41, 0xF502, 0x834F,
- 0x685C, 0x51F4, 0xD134, 0xF908, 0xE293, 0xAB73, 0x6253, 0x2A3F,
- 0x080C, 0x9552, 0x4665, 0x9D5E, 0x3028, 0x37A1, 0x0A0F, 0x2FB5,
- 0x0E09, 0x2436, 0x1B9B, 0xDF3D, 0xCD26, 0x4E69, 0x7FCD, 0xEA9F,
- 0x121B, 0x1D9E, 0x5874, 0x342E, 0x362D, 0xDCB2, 0xB4EE, 0x5BFB,
- 0xA4F6, 0x764D, 0xB761, 0x7DCE, 0x527B, 0xDD3E, 0x5E71, 0x1397,
- 0xA6F5, 0xB968, 0x0000, 0xC12C, 0x4060, 0xE31F, 0x79C8, 0xB6ED,
- 0xD4BE, 0x8D46, 0x67D9, 0x724B, 0x94DE, 0x98D4, 0xB0E8, 0x854A,
- 0xBB6B, 0xC52A, 0x4FE5, 0xED16, 0x86C5, 0x9AD7, 0x6655, 0x1194,
- 0x8ACF, 0xE910, 0x0406, 0xFE81, 0xA0F0, 0x7844, 0x25BA, 0x4BE3,
- 0xA2F3, 0x5DFE, 0x80C0, 0x058A, 0x3FAD, 0x21BC, 0x7048, 0xF104,
- 0x63DF, 0x77C1, 0xAF75, 0x4263, 0x2030, 0xE51A, 0xFD0E, 0xBF6D,
- 0x814C, 0x1814, 0x2635, 0xC32F, 0xBEE1, 0x35A2, 0x88CC, 0x2E39,
- 0x9357, 0x55F2, 0xFC82, 0x7A47, 0xC8AC, 0xBAE7, 0x322B, 0xE695,
- 0xC0A0, 0x1998, 0x9ED1, 0xA37F, 0x4466, 0x547E, 0x3BAB, 0x0B83,
- 0x8CCA, 0xC729, 0x6BD3, 0x283C, 0xA779, 0xBCE2, 0x161D, 0xAD76,
- 0xDB3B, 0x6456, 0x744E, 0x141E, 0x92DB, 0x0C0A, 0x486C, 0xB8E4,
- 0x9F5D, 0xBD6E, 0x43EF, 0xC4A6, 0x39A8, 0x31A4, 0xD337, 0xF28B,
- 0xD532, 0x8B43, 0x6E59, 0xDAB7, 0x018C, 0xB164, 0x9CD2, 0x49E0,
- 0xD8B4, 0xACFA, 0xF307, 0xCF25, 0xCAAF, 0xF48E, 0x47E9, 0x1018,
- 0x6FD5, 0xF088, 0x4A6F, 0x5C72, 0x3824, 0x57F1, 0x73C7, 0x9751,
- 0xCB23, 0xA17C, 0xE89C, 0x3E21, 0x96DD, 0x61DC, 0x0D86, 0x0F85,
- 0xE090, 0x7C42, 0x71C4, 0xCCAA, 0x90D8, 0x0605, 0xF701, 0x1C12,
- 0xC2A3, 0x6A5F, 0xAEF9, 0x69D0, 0x1791, 0x9958, 0x3A27, 0x27B9,
- 0xD938, 0xEB13, 0x2BB3, 0x2233, 0xD2BB, 0xA970, 0x0789, 0x33A7,
- 0x2DB6, 0x3C22, 0x1592, 0xC920, 0x8749, 0xAAFF, 0x5078, 0xA57A,
- 0x038F, 0x59F8, 0x0980, 0x1A17, 0x65DA, 0xD731, 0x84C6, 0xD0B8,
- 0x82C3, 0x29B0, 0x5A77, 0x1E11, 0x7BCB, 0xA8FC, 0x6DD6, 0x2C3A,
-};
-
-static inline u16 _S_(u16 v)
-{
- u16 t = Sbox[Hi8(v)];
- return Sbox[Lo8(v)] ^ ((t << 8) | (t >> 8));
-}
-
-#define PHASE1_LOOP_COUNT 8
-
-static void tkip_mixing_phase1(u16 * TTAK, const u8 * TK, const u8 * TA,
- u32 IV32)
-{
- int i, j;
-
- /* Initialize the 80-bit TTAK from TSC (IV32) and TA[0..5] */
- TTAK[0] = Lo16(IV32);
- TTAK[1] = Hi16(IV32);
- TTAK[2] = Mk16(TA[1], TA[0]);
- TTAK[3] = Mk16(TA[3], TA[2]);
- TTAK[4] = Mk16(TA[5], TA[4]);
-
- for (i = 0; i < PHASE1_LOOP_COUNT; i++) {
- j = 2 * (i & 1);
- TTAK[0] += _S_(TTAK[4] ^ Mk16(TK[1 + j], TK[0 + j]));
- TTAK[1] += _S_(TTAK[0] ^ Mk16(TK[5 + j], TK[4 + j]));
- TTAK[2] += _S_(TTAK[1] ^ Mk16(TK[9 + j], TK[8 + j]));
- TTAK[3] += _S_(TTAK[2] ^ Mk16(TK[13 + j], TK[12 + j]));
- TTAK[4] += _S_(TTAK[3] ^ Mk16(TK[1 + j], TK[0 + j])) + i;
- }
-}
-
-static void tkip_mixing_phase2(u8 * WEPSeed, const u8 * TK, const u16 * TTAK,
- u16 IV16)
-{
- /* Make temporary area overlap WEP seed so that the final copy can be
- * avoided on little endian hosts. */
- u16 *PPK = (u16 *) & WEPSeed[4];
-
- /* Step 1 - make copy of TTAK and bring in TSC */
- PPK[0] = TTAK[0];
- PPK[1] = TTAK[1];
- PPK[2] = TTAK[2];
- PPK[3] = TTAK[3];
- PPK[4] = TTAK[4];
- PPK[5] = TTAK[4] + IV16;
-
- /* Step 2 - 96-bit bijective mixing using S-box */
- PPK[0] += _S_(PPK[5] ^ Mk16_le((u16 *) & TK[0]));
- PPK[1] += _S_(PPK[0] ^ Mk16_le((u16 *) & TK[2]));
- PPK[2] += _S_(PPK[1] ^ Mk16_le((u16 *) & TK[4]));
- PPK[3] += _S_(PPK[2] ^ Mk16_le((u16 *) & TK[6]));
- PPK[4] += _S_(PPK[3] ^ Mk16_le((u16 *) & TK[8]));
- PPK[5] += _S_(PPK[4] ^ Mk16_le((u16 *) & TK[10]));
-
- PPK[0] += RotR1(PPK[5] ^ Mk16_le((u16 *) & TK[12]));
- PPK[1] += RotR1(PPK[0] ^ Mk16_le((u16 *) & TK[14]));
- PPK[2] += RotR1(PPK[1]);
- PPK[3] += RotR1(PPK[2]);
- PPK[4] += RotR1(PPK[3]);
- PPK[5] += RotR1(PPK[4]);
-
- /* Step 3 - bring in last of TK bits, assign 24-bit WEP IV value
- * WEPSeed[0..2] is transmitted as WEP IV */
- WEPSeed[0] = Hi8(IV16);
- WEPSeed[1] = (Hi8(IV16) | 0x20) & 0x7F;
- WEPSeed[2] = Lo8(IV16);
- WEPSeed[3] = Lo8((PPK[5] ^ Mk16_le((u16 *) & TK[0])) >> 1);
-
-#ifdef __BIG_ENDIAN
- {
- int i;
- for (i = 0; i < 6; i++)
- PPK[i] = (PPK[i] << 8) | (PPK[i] >> 8);
- }
-#endif
-}
-
-static int ieee80211_tkip_hdr(struct sk_buff *skb, int hdr_len,
- u8 * rc4key, int keylen, void *priv)
-{
- struct ieee80211_tkip_data *tkey = priv;
- int len;
- u8 *pos;
- struct ieee80211_hdr_4addr *hdr;
-
- hdr = (struct ieee80211_hdr_4addr *)skb->data;
-
- if (skb_headroom(skb) < 8 || skb->len < hdr_len)
- return -1;
-
- if (rc4key == NULL || keylen < 16)
- return -1;
-
- if (!tkey->tx_phase1_done) {
- tkip_mixing_phase1(tkey->tx_ttak, tkey->key, hdr->addr2,
- tkey->tx_iv32);
- tkey->tx_phase1_done = 1;
- }
- tkip_mixing_phase2(rc4key, tkey->key, tkey->tx_ttak, tkey->tx_iv16);
-
- len = skb->len - hdr_len;
- pos = skb_push(skb, 8);
- memmove(pos, pos + 8, hdr_len);
- pos += hdr_len;
-
- *pos++ = *rc4key;
- *pos++ = *(rc4key + 1);
- *pos++ = *(rc4key + 2);
- *pos++ = (tkey->key_idx << 6) | (1 << 5) /* Ext IV included */ ;
- *pos++ = tkey->tx_iv32 & 0xff;
- *pos++ = (tkey->tx_iv32 >> 8) & 0xff;
- *pos++ = (tkey->tx_iv32 >> 16) & 0xff;
- *pos++ = (tkey->tx_iv32 >> 24) & 0xff;
-
- tkey->tx_iv16++;
- if (tkey->tx_iv16 == 0) {
- tkey->tx_phase1_done = 0;
- tkey->tx_iv32++;
- }
-
- return 8;
-}
-
-static int ieee80211_tkip_encrypt(struct sk_buff *skb, int hdr_len, void *priv)
-{
- struct ieee80211_tkip_data *tkey = priv;
- struct blkcipher_desc desc = { .tfm = tkey->tx_tfm_arc4 };
- int len;
- u8 rc4key[16], *pos, *icv;
- u32 crc;
- struct scatterlist sg;
-
- if (tkey->flags & IEEE80211_CRYPTO_TKIP_COUNTERMEASURES) {
- if (net_ratelimit()) {
- struct ieee80211_hdr_4addr *hdr =
- (struct ieee80211_hdr_4addr *)skb->data;
- printk(KERN_DEBUG ": TKIP countermeasures: dropped "
- "TX packet to " MAC_FMT "\n",
- MAC_ARG(hdr->addr1));
- }
- return -1;
- }
-
- if (skb_tailroom(skb) < 4 || skb->len < hdr_len)
- return -1;
-
- len = skb->len - hdr_len;
- pos = skb->data + hdr_len;
-
- if ((ieee80211_tkip_hdr(skb, hdr_len, rc4key, 16, priv)) < 0)
- return -1;
-
- icv = skb_put(skb, 4);
-
- crc = ~crc32_le(~0, pos, len);
- icv[0] = crc;
- icv[1] = crc >> 8;
- icv[2] = crc >> 16;
- icv[3] = crc >> 24;
-
- crypto_blkcipher_setkey(tkey->tx_tfm_arc4, rc4key, 16);
- sg.page = virt_to_page(pos);
- sg.offset = offset_in_page(pos);
- sg.length = len + 4;
- return crypto_blkcipher_encrypt(&desc, &sg, &sg, len + 4);
-}
-
-/*
- * deal with seq counter wrapping correctly.
- * refer to timer_after() for jiffies wrapping handling
- */
-static inline int tkip_replay_check(u32 iv32_n, u16 iv16_n,
- u32 iv32_o, u16 iv16_o)
-{
- if ((s32)iv32_n - (s32)iv32_o < 0 ||
- (iv32_n == iv32_o && iv16_n <= iv16_o))
- return 1;
- return 0;
-}
-
-static int ieee80211_tkip_decrypt(struct sk_buff *skb, int hdr_len, void *priv)
-{
- struct ieee80211_tkip_data *tkey = priv;
- struct blkcipher_desc desc = { .tfm = tkey->rx_tfm_arc4 };
- u8 rc4key[16];
- u8 keyidx, *pos;
- u32 iv32;
- u16 iv16;
- struct ieee80211_hdr_4addr *hdr;
- u8 icv[4];
- u32 crc;
- struct scatterlist sg;
- int plen;
-
- hdr = (struct ieee80211_hdr_4addr *)skb->data;
-
- if (tkey->flags & IEEE80211_CRYPTO_TKIP_COUNTERMEASURES) {
- if (net_ratelimit()) {
- printk(KERN_DEBUG ": TKIP countermeasures: dropped "
- "received packet from " MAC_FMT "\n",
- MAC_ARG(hdr->addr2));
- }
- return -1;
- }
-
- if (skb->len < hdr_len + 8 + 4)
- return -1;
-
- pos = skb->data + hdr_len;
- keyidx = pos[3];
- if (!(keyidx & (1 << 5))) {
- if (net_ratelimit()) {
- printk(KERN_DEBUG "TKIP: received packet without ExtIV"
- " flag from " MAC_FMT "\n", MAC_ARG(hdr->addr2));
- }
- return -2;
- }
- keyidx >>= 6;
- if (tkey->key_idx != keyidx) {
- printk(KERN_DEBUG "TKIP: RX tkey->key_idx=%d frame "
- "keyidx=%d priv=%p\n", tkey->key_idx, keyidx, priv);
- return -6;
- }
- if (!tkey->key_set) {
- if (net_ratelimit()) {
- printk(KERN_DEBUG "TKIP: received packet from " MAC_FMT
- " with keyid=%d that does not have a configured"
- " key\n", MAC_ARG(hdr->addr2), keyidx);
- }
- return -3;
- }
- iv16 = (pos[0] << 8) | pos[2];
- iv32 = pos[4] | (pos[5] << 8) | (pos[6] << 16) | (pos[7] << 24);
- pos += 8;
-
- if (tkip_replay_check(iv32, iv16, tkey->rx_iv32, tkey->rx_iv16)) {
- if (net_ratelimit()) {
- printk(KERN_DEBUG "TKIP: replay detected: STA=" MAC_FMT
- " previous TSC %08x%04x received TSC "
- "%08x%04x\n", MAC_ARG(hdr->addr2),
- tkey->rx_iv32, tkey->rx_iv16, iv32, iv16);
- }
- tkey->dot11RSNAStatsTKIPReplays++;
- return -4;
- }
-
- if (iv32 != tkey->rx_iv32 || !tkey->rx_phase1_done) {
- tkip_mixing_phase1(tkey->rx_ttak, tkey->key, hdr->addr2, iv32);
- tkey->rx_phase1_done = 1;
- }
- tkip_mixing_phase2(rc4key, tkey->key, tkey->rx_ttak, iv16);
-
- plen = skb->len - hdr_len - 12;
-
- crypto_blkcipher_setkey(tkey->rx_tfm_arc4, rc4key, 16);
- sg.page = virt_to_page(pos);
- sg.offset = offset_in_page(pos);
- sg.length = plen + 4;
- if (crypto_blkcipher_decrypt(&desc, &sg, &sg, plen + 4)) {
- if (net_ratelimit()) {
- printk(KERN_DEBUG ": TKIP: failed to decrypt "
- "received packet from " MAC_FMT "\n",
- MAC_ARG(hdr->addr2));
- }
- return -7;
- }
-
- crc = ~crc32_le(~0, pos, plen);
- icv[0] = crc;
- icv[1] = crc >> 8;
- icv[2] = crc >> 16;
- icv[3] = crc >> 24;
- if (memcmp(icv, pos + plen, 4) != 0) {
- if (iv32 != tkey->rx_iv32) {
- /* Previously cached Phase1 result was already lost, so
- * it needs to be recalculated for the next packet. */
- tkey->rx_phase1_done = 0;
- }
- if (net_ratelimit()) {
- printk(KERN_DEBUG "TKIP: ICV error detected: STA="
- MAC_FMT "\n", MAC_ARG(hdr->addr2));
- }
- tkey->dot11RSNAStatsTKIPICVErrors++;
- return -5;
- }
-
- /* Update real counters only after Michael MIC verification has
- * completed */
- tkey->rx_iv32_new = iv32;
- tkey->rx_iv16_new = iv16;
-
- /* Remove IV and ICV */
- memmove(skb->data + 8, skb->data, hdr_len);
- skb_pull(skb, 8);
- skb_trim(skb, skb->len - 4);
-
- return keyidx;
-}
-
-static int michael_mic(struct crypto_hash *tfm_michael, u8 * key, u8 * hdr,
- u8 * data, size_t data_len, u8 * mic)
-{
- struct hash_desc desc;
- struct scatterlist sg[2];
-
- if (tfm_michael == NULL) {
- printk(KERN_WARNING "michael_mic: tfm_michael == NULL\n");
- return -1;
- }
- sg[0].page = virt_to_page(hdr);
- sg[0].offset = offset_in_page(hdr);
- sg[0].length = 16;
-
- sg[1].page = virt_to_page(data);
- sg[1].offset = offset_in_page(data);
- sg[1].length = data_len;
-
- if (crypto_hash_setkey(tfm_michael, key, 8))
- return -1;
-
- desc.tfm = tfm_michael;
- desc.flags = 0;
- return crypto_hash_digest(&desc, sg, data_len + 16, mic);
-}
-
-static void michael_mic_hdr(struct sk_buff *skb, u8 * hdr)
-{
- struct ieee80211_hdr_4addr *hdr11;
- u16 stype;
-
- hdr11 = (struct ieee80211_hdr_4addr *)skb->data;
- stype = WLAN_FC_GET_STYPE(le16_to_cpu(hdr11->frame_ctl));
-
- switch (le16_to_cpu(hdr11->frame_ctl) &
- (IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS)) {
- case IEEE80211_FCTL_TODS:
- memcpy(hdr, hdr11->addr3, ETH_ALEN); /* DA */
- memcpy(hdr + ETH_ALEN, hdr11->addr2, ETH_ALEN); /* SA */
- break;
- case IEEE80211_FCTL_FROMDS:
- memcpy(hdr, hdr11->addr1, ETH_ALEN); /* DA */
- memcpy(hdr + ETH_ALEN, hdr11->addr3, ETH_ALEN); /* SA */
- break;
- case IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS:
- memcpy(hdr, hdr11->addr3, ETH_ALEN); /* DA */
- memcpy(hdr + ETH_ALEN, hdr11->addr4, ETH_ALEN); /* SA */
- break;
- case 0:
- memcpy(hdr, hdr11->addr1, ETH_ALEN); /* DA */
- memcpy(hdr + ETH_ALEN, hdr11->addr2, ETH_ALEN); /* SA */
- break;
- }
-
- if (stype & IEEE80211_STYPE_QOS_DATA) {
- const struct ieee80211_hdr_3addrqos *qoshdr =
- (struct ieee80211_hdr_3addrqos *)skb->data;
- hdr[12] = qoshdr->qos_ctl & cpu_to_le16(IEEE80211_QCTL_TID);
- } else
- hdr[12] = 0; /* priority */
-
- hdr[13] = hdr[14] = hdr[15] = 0; /* reserved */
-}
-
-static int ieee80211_michael_mic_add(struct sk_buff *skb, int hdr_len,
- void *priv)
-{
- struct ieee80211_tkip_data *tkey = priv;
- u8 *pos;
-
- if (skb_tailroom(skb) < 8 || skb->len < hdr_len) {
- printk(KERN_DEBUG "Invalid packet for Michael MIC add "
- "(tailroom=%d hdr_len=%d skb->len=%d)\n",
- skb_tailroom(skb), hdr_len, skb->len);
- return -1;
- }
-
- michael_mic_hdr(skb, tkey->tx_hdr);
- pos = skb_put(skb, 8);
- if (michael_mic(tkey->tx_tfm_michael, &tkey->key[16], tkey->tx_hdr,
- skb->data + hdr_len, skb->len - 8 - hdr_len, pos))
- return -1;
-
- return 0;
-}
-
-static void ieee80211_michael_mic_failure(struct net_device *dev,
- struct ieee80211_hdr_4addr *hdr,
- int keyidx)
-{
- union iwreq_data wrqu;
- struct iw_michaelmicfailure ev;
-
- /* TODO: needed parameters: count, keyid, key type, TSC */
- memset(&ev, 0, sizeof(ev));
- ev.flags = keyidx & IW_MICFAILURE_KEY_ID;
- if (hdr->addr1[0] & 0x01)
- ev.flags |= IW_MICFAILURE_GROUP;
- else
- ev.flags |= IW_MICFAILURE_PAIRWISE;
- ev.src_addr.sa_family = ARPHRD_ETHER;
- memcpy(ev.src_addr.sa_data, hdr->addr2, ETH_ALEN);
- memset(&wrqu, 0, sizeof(wrqu));
- wrqu.data.length = sizeof(ev);
- wireless_send_event(dev, IWEVMICHAELMICFAILURE, &wrqu, (char *)&ev);
-}
-
-static int ieee80211_michael_mic_verify(struct sk_buff *skb, int keyidx,
- int hdr_len, void *priv)
-{
- struct ieee80211_tkip_data *tkey = priv;
- u8 mic[8];
-
- if (!tkey->key_set)
- return -1;
-
- michael_mic_hdr(skb, tkey->rx_hdr);
- if (michael_mic(tkey->rx_tfm_michael, &tkey->key[24], tkey->rx_hdr,
- skb->data + hdr_len, skb->len - 8 - hdr_len, mic))
- return -1;
- if (memcmp(mic, skb->data + skb->len - 8, 8) != 0) {
- struct ieee80211_hdr_4addr *hdr;
- hdr = (struct ieee80211_hdr_4addr *)skb->data;
- printk(KERN_DEBUG "%s: Michael MIC verification failed for "
- "MSDU from " MAC_FMT " keyidx=%d\n",
- skb->dev ? skb->dev->name : "N/A", MAC_ARG(hdr->addr2),
- keyidx);
- if (skb->dev)
- ieee80211_michael_mic_failure(skb->dev, hdr, keyidx);
- tkey->dot11RSNAStatsTKIPLocalMICFailures++;
- return -1;
- }
-
- /* Update TSC counters for RX now that the packet verification has
- * completed. */
- tkey->rx_iv32 = tkey->rx_iv32_new;
- tkey->rx_iv16 = tkey->rx_iv16_new;
-
- skb_trim(skb, skb->len - 8);
-
- return 0;
-}
-
-static int ieee80211_tkip_set_key(void *key, int len, u8 * seq, void *priv)
-{
- struct ieee80211_tkip_data *tkey = priv;
- int keyidx;
- struct crypto_hash *tfm = tkey->tx_tfm_michael;
- struct crypto_blkcipher *tfm2 = tkey->tx_tfm_arc4;
- struct crypto_hash *tfm3 = tkey->rx_tfm_michael;
- struct crypto_blkcipher *tfm4 = tkey->rx_tfm_arc4;
-
- keyidx = tkey->key_idx;
- memset(tkey, 0, sizeof(*tkey));
- tkey->key_idx = keyidx;
- tkey->tx_tfm_michael = tfm;
- tkey->tx_tfm_arc4 = tfm2;
- tkey->rx_tfm_michael = tfm3;
- tkey->rx_tfm_arc4 = tfm4;
- if (len == TKIP_KEY_LEN) {
- memcpy(tkey->key, key, TKIP_KEY_LEN);
- tkey->key_set = 1;
- tkey->tx_iv16 = 1; /* TSC is initialized to 1 */
- if (seq) {
- tkey->rx_iv32 = (seq[5] << 24) | (seq[4] << 16) |
- (seq[3] << 8) | seq[2];
- tkey->rx_iv16 = (seq[1] << 8) | seq[0];
- }
- } else if (len == 0)
- tkey->key_set = 0;
- else
- return -1;
-
- return 0;
-}
-
-static int ieee80211_tkip_get_key(void *key, int len, u8 * seq, void *priv)
-{
- struct ieee80211_tkip_data *tkey = priv;
-
- if (len < TKIP_KEY_LEN)
- return -1;
-
- if (!tkey->key_set)
- return 0;
- memcpy(key, tkey->key, TKIP_KEY_LEN);
-
- if (seq) {
- /* Return the sequence number of the last transmitted frame. */
- u16 iv16 = tkey->tx_iv16;
- u32 iv32 = tkey->tx_iv32;
- if (iv16 == 0)
- iv32--;
- iv16--;
- seq[0] = tkey->tx_iv16;
- seq[1] = tkey->tx_iv16 >> 8;
- seq[2] = tkey->tx_iv32;
- seq[3] = tkey->tx_iv32 >> 8;
- seq[4] = tkey->tx_iv32 >> 16;
- seq[5] = tkey->tx_iv32 >> 24;
- }
-
- return TKIP_KEY_LEN;
-}
-
-static char *ieee80211_tkip_print_stats(char *p, void *priv)
-{
- struct ieee80211_tkip_data *tkip = priv;
- p += sprintf(p, "key[%d] alg=TKIP key_set=%d "
- "tx_pn=%02x%02x%02x%02x%02x%02x "
- "rx_pn=%02x%02x%02x%02x%02x%02x "
- "replays=%d icv_errors=%d local_mic_failures=%d\n",
- tkip->key_idx, tkip->key_set,
- (tkip->tx_iv32 >> 24) & 0xff,
- (tkip->tx_iv32 >> 16) & 0xff,
- (tkip->tx_iv32 >> 8) & 0xff,
- tkip->tx_iv32 & 0xff,
- (tkip->tx_iv16 >> 8) & 0xff,
- tkip->tx_iv16 & 0xff,
- (tkip->rx_iv32 >> 24) & 0xff,
- (tkip->rx_iv32 >> 16) & 0xff,
- (tkip->rx_iv32 >> 8) & 0xff,
- tkip->rx_iv32 & 0xff,
- (tkip->rx_iv16 >> 8) & 0xff,
- tkip->rx_iv16 & 0xff,
- tkip->dot11RSNAStatsTKIPReplays,
- tkip->dot11RSNAStatsTKIPICVErrors,
- tkip->dot11RSNAStatsTKIPLocalMICFailures);
- return p;
-}
-
-static struct ieee80211_crypto_ops ieee80211_crypt_tkip = {
- .name = "TKIP",
- .init = ieee80211_tkip_init,
- .deinit = ieee80211_tkip_deinit,
- .build_iv = ieee80211_tkip_hdr,
- .encrypt_mpdu = ieee80211_tkip_encrypt,
- .decrypt_mpdu = ieee80211_tkip_decrypt,
- .encrypt_msdu = ieee80211_michael_mic_add,
- .decrypt_msdu = ieee80211_michael_mic_verify,
- .set_key = ieee80211_tkip_set_key,
- .get_key = ieee80211_tkip_get_key,
- .print_stats = ieee80211_tkip_print_stats,
- .extra_mpdu_prefix_len = 4 + 4, /* IV + ExtIV */
- .extra_mpdu_postfix_len = 4, /* ICV */
- .extra_msdu_postfix_len = 8, /* MIC */
- .get_flags = ieee80211_tkip_get_flags,
- .set_flags = ieee80211_tkip_set_flags,
- .owner = THIS_MODULE,
-};
-
-static int __init ieee80211_crypto_tkip_init(void)
-{
- return ieee80211_register_crypto_ops(&ieee80211_crypt_tkip);
-}
-
-static void __exit ieee80211_crypto_tkip_exit(void)
-{
- ieee80211_unregister_crypto_ops(&ieee80211_crypt_tkip);
-}
-
-module_init(ieee80211_crypto_tkip_init);
-module_exit(ieee80211_crypto_tkip_exit);
diff --git a/net/ieee80211/ieee80211_crypt_wep.c b/net/ieee80211/ieee80211_crypt_wep.c
deleted file mode 100644
index 1b2efff11d39..000000000000
--- a/net/ieee80211/ieee80211_crypt_wep.c
+++ /dev/null
@@ -1,298 +0,0 @@
-/*
- * Host AP crypt: host-based WEP encryption implementation for Host AP driver
- *
- * Copyright (c) 2002-2004, Jouni Malinen <jkmaline@cc.hut.fi>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation. See README and COPYING for
- * more details.
- */
-
-#include <linux/err.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/random.h>
-#include <linux/skbuff.h>
-#include <asm/string.h>
-
-#include <net/ieee80211.h>
-
-#include <linux/crypto.h>
-#include <asm/scatterlist.h>
-#include <linux/crc32.h>
-
-MODULE_AUTHOR("Jouni Malinen");
-MODULE_DESCRIPTION("Host AP crypt: WEP");
-MODULE_LICENSE("GPL");
-
-struct prism2_wep_data {
- u32 iv;
-#define WEP_KEY_LEN 13
- u8 key[WEP_KEY_LEN + 1];
- u8 key_len;
- u8 key_idx;
- struct crypto_blkcipher *tx_tfm;
- struct crypto_blkcipher *rx_tfm;
-};
-
-static void *prism2_wep_init(int keyidx)
-{
- struct prism2_wep_data *priv;
-
- priv = kzalloc(sizeof(*priv), GFP_ATOMIC);
- if (priv == NULL)
- goto fail;
- priv->key_idx = keyidx;
-
- priv->tx_tfm = crypto_alloc_blkcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC);
- if (IS_ERR(priv->tx_tfm)) {
- printk(KERN_DEBUG "ieee80211_crypt_wep: could not allocate "
- "crypto API arc4\n");
- priv->tx_tfm = NULL;
- goto fail;
- }
-
- priv->rx_tfm = crypto_alloc_blkcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC);
- if (IS_ERR(priv->rx_tfm)) {
- printk(KERN_DEBUG "ieee80211_crypt_wep: could not allocate "
- "crypto API arc4\n");
- priv->rx_tfm = NULL;
- goto fail;
- }
- /* start WEP IV from a random value */
- get_random_bytes(&priv->iv, 4);
-
- return priv;
-
- fail:
- if (priv) {
- if (priv->tx_tfm)
- crypto_free_blkcipher(priv->tx_tfm);
- if (priv->rx_tfm)
- crypto_free_blkcipher(priv->rx_tfm);
- kfree(priv);
- }
- return NULL;
-}
-
-static void prism2_wep_deinit(void *priv)
-{
- struct prism2_wep_data *_priv = priv;
- if (_priv) {
- if (_priv->tx_tfm)
- crypto_free_blkcipher(_priv->tx_tfm);
- if (_priv->rx_tfm)
- crypto_free_blkcipher(_priv->rx_tfm);
- }
- kfree(priv);
-}
-
-/* Add WEP IV/key info to a frame that has at least 4 bytes of headroom */
-static int prism2_wep_build_iv(struct sk_buff *skb, int hdr_len,
- u8 *key, int keylen, void *priv)
-{
- struct prism2_wep_data *wep = priv;
- u32 klen, len;
- u8 *pos;
-
- if (skb_headroom(skb) < 4 || skb->len < hdr_len)
- return -1;
-
- len = skb->len - hdr_len;
- pos = skb_push(skb, 4);
- memmove(pos, pos + 4, hdr_len);
- pos += hdr_len;
-
- klen = 3 + wep->key_len;
-
- wep->iv++;
-
- /* Fluhrer, Mantin, and Shamir have reported weaknesses in the key
- * scheduling algorithm of RC4. At least IVs (KeyByte + 3, 0xff, N)
- * can be used to speedup attacks, so avoid using them. */
- if ((wep->iv & 0xff00) == 0xff00) {
- u8 B = (wep->iv >> 16) & 0xff;
- if (B >= 3 && B < klen)
- wep->iv += 0x0100;
- }
-
- /* Prepend 24-bit IV to RC4 key and TX frame */
- *pos++ = (wep->iv >> 16) & 0xff;
- *pos++ = (wep->iv >> 8) & 0xff;
- *pos++ = wep->iv & 0xff;
- *pos++ = wep->key_idx << 6;
-
- return 0;
-}
-
-/* Perform WEP encryption on given skb that has at least 4 bytes of headroom
- * for IV and 4 bytes of tailroom for ICV. Both IV and ICV will be transmitted,
- * so the payload length increases with 8 bytes.
- *
- * WEP frame payload: IV + TX key idx, RC4(data), ICV = RC4(CRC32(data))
- */
-static int prism2_wep_encrypt(struct sk_buff *skb, int hdr_len, void *priv)
-{
- struct prism2_wep_data *wep = priv;
- struct blkcipher_desc desc = { .tfm = wep->tx_tfm };
- u32 crc, klen, len;
- u8 *pos, *icv;
- struct scatterlist sg;
- u8 key[WEP_KEY_LEN + 3];
-
- /* other checks are in prism2_wep_build_iv */
- if (skb_tailroom(skb) < 4)
- return -1;
-
- /* add the IV to the frame */
- if (prism2_wep_build_iv(skb, hdr_len, NULL, 0, priv))
- return -1;
-
- /* Copy the IV into the first 3 bytes of the key */
- memcpy(key, skb->data + hdr_len, 3);
-
- /* Copy rest of the WEP key (the secret part) */
- memcpy(key + 3, wep->key, wep->key_len);
-
- len = skb->len - hdr_len - 4;
- pos = skb->data + hdr_len + 4;
- klen = 3 + wep->key_len;
-
- /* Append little-endian CRC32 over only the data and encrypt it to produce ICV */
- crc = ~crc32_le(~0, pos, len);
- icv = skb_put(skb, 4);
- icv[0] = crc;
- icv[1] = crc >> 8;
- icv[2] = crc >> 16;
- icv[3] = crc >> 24;
-
- crypto_blkcipher_setkey(wep->tx_tfm, key, klen);
- sg.page = virt_to_page(pos);
- sg.offset = offset_in_page(pos);
- sg.length = len + 4;
- return crypto_blkcipher_encrypt(&desc, &sg, &sg, len + 4);
-}
-
-/* Perform WEP decryption on given buffer. Buffer includes whole WEP part of
- * the frame: IV (4 bytes), encrypted payload (including SNAP header),
- * ICV (4 bytes). len includes both IV and ICV.
- *
- * Returns 0 if frame was decrypted successfully and ICV was correct and -1 on
- * failure. If frame is OK, IV and ICV will be removed.
- */
-static int prism2_wep_decrypt(struct sk_buff *skb, int hdr_len, void *priv)
-{
- struct prism2_wep_data *wep = priv;
- struct blkcipher_desc desc = { .tfm = wep->rx_tfm };
- u32 crc, klen, plen;
- u8 key[WEP_KEY_LEN + 3];
- u8 keyidx, *pos, icv[4];
- struct scatterlist sg;
-
- if (skb->len < hdr_len + 8)
- return -1;
-
- pos = skb->data + hdr_len;
- key[0] = *pos++;
- key[1] = *pos++;
- key[2] = *pos++;
- keyidx = *pos++ >> 6;
- if (keyidx != wep->key_idx)
- return -1;
-
- klen = 3 + wep->key_len;
-
- /* Copy rest of the WEP key (the secret part) */
- memcpy(key + 3, wep->key, wep->key_len);
-
- /* Apply RC4 to data and compute CRC32 over decrypted data */
- plen = skb->len - hdr_len - 8;
-
- crypto_blkcipher_setkey(wep->rx_tfm, key, klen);
- sg.page = virt_to_page(pos);
- sg.offset = offset_in_page(pos);
- sg.length = plen + 4;
- if (crypto_blkcipher_decrypt(&desc, &sg, &sg, plen + 4))
- return -7;
-
- crc = ~crc32_le(~0, pos, plen);
- icv[0] = crc;
- icv[1] = crc >> 8;
- icv[2] = crc >> 16;
- icv[3] = crc >> 24;
- if (memcmp(icv, pos + plen, 4) != 0) {
- /* ICV mismatch - drop frame */
- return -2;
- }
-
- /* Remove IV and ICV */
- memmove(skb->data + 4, skb->data, hdr_len);
- skb_pull(skb, 4);
- skb_trim(skb, skb->len - 4);
-
- return 0;
-}
-
-static int prism2_wep_set_key(void *key, int len, u8 * seq, void *priv)
-{
- struct prism2_wep_data *wep = priv;
-
- if (len < 0 || len > WEP_KEY_LEN)
- return -1;
-
- memcpy(wep->key, key, len);
- wep->key_len = len;
-
- return 0;
-}
-
-static int prism2_wep_get_key(void *key, int len, u8 * seq, void *priv)
-{
- struct prism2_wep_data *wep = priv;
-
- if (len < wep->key_len)
- return -1;
-
- memcpy(key, wep->key, wep->key_len);
-
- return wep->key_len;
-}
-
-static char *prism2_wep_print_stats(char *p, void *priv)
-{
- struct prism2_wep_data *wep = priv;
- p += sprintf(p, "key[%d] alg=WEP len=%d\n", wep->key_idx, wep->key_len);
- return p;
-}
-
-static struct ieee80211_crypto_ops ieee80211_crypt_wep = {
- .name = "WEP",
- .init = prism2_wep_init,
- .deinit = prism2_wep_deinit,
- .build_iv = prism2_wep_build_iv,
- .encrypt_mpdu = prism2_wep_encrypt,
- .decrypt_mpdu = prism2_wep_decrypt,
- .encrypt_msdu = NULL,
- .decrypt_msdu = NULL,
- .set_key = prism2_wep_set_key,
- .get_key = prism2_wep_get_key,
- .print_stats = prism2_wep_print_stats,
- .extra_mpdu_prefix_len = 4, /* IV */
- .extra_mpdu_postfix_len = 4, /* ICV */
- .owner = THIS_MODULE,
-};
-
-static int __init ieee80211_crypto_wep_init(void)
-{
- return ieee80211_register_crypto_ops(&ieee80211_crypt_wep);
-}
-
-static void __exit ieee80211_crypto_wep_exit(void)
-{
- ieee80211_unregister_crypto_ops(&ieee80211_crypt_wep);
-}
-
-module_init(ieee80211_crypto_wep_init);
-module_exit(ieee80211_crypto_wep_exit);
diff --git a/net/ieee80211/ieee80211_geo.c b/net/ieee80211/ieee80211_geo.c
deleted file mode 100644
index 305a09de85a5..000000000000
--- a/net/ieee80211/ieee80211_geo.c
+++ /dev/null
@@ -1,179 +0,0 @@
-/******************************************************************************
-
- Copyright(c) 2005 Intel Corporation. All rights reserved.
-
- This program is free software; you can redistribute it and/or modify it
- under the terms of version 2 of the GNU General Public License as
- published by the Free Software Foundation.
-
- This program is distributed in the hope that it will be useful, but WITHOUT
- ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- more details.
-
- You should have received a copy of the GNU General Public License along with
- this program; if not, write to the Free Software Foundation, Inc., 59
- Temple Place - Suite 330, Boston, MA 02111-1307, USA.
-
- The full GNU General Public License is included in this distribution in the
- file called LICENSE.
-
- Contact Information:
- James P. Ketrenos <ipw2100-admin@linux.intel.com>
- Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-
-******************************************************************************/
-#include <linux/compiler.h>
-#include <linux/errno.h>
-#include <linux/if_arp.h>
-#include <linux/in6.h>
-#include <linux/in.h>
-#include <linux/ip.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/netdevice.h>
-#include <linux/proc_fs.h>
-#include <linux/skbuff.h>
-#include <linux/slab.h>
-#include <linux/tcp.h>
-#include <linux/types.h>
-#include <linux/wireless.h>
-#include <linux/etherdevice.h>
-#include <asm/uaccess.h>
-
-#include <net/ieee80211.h>
-
-int ieee80211_is_valid_channel(struct ieee80211_device *ieee, u8 channel)
-{
- int i;
-
- /* Driver needs to initialize the geography map before using
- * these helper functions */
- if (ieee->geo.bg_channels == 0 && ieee->geo.a_channels == 0)
- return 0;
-
- if (ieee->freq_band & IEEE80211_24GHZ_BAND)
- for (i = 0; i < ieee->geo.bg_channels; i++)
- /* NOTE: If G mode is currently supported but
- * this is a B only channel, we don't see it
- * as valid. */
- if ((ieee->geo.bg[i].channel == channel) &&
- !(ieee->geo.bg[i].flags & IEEE80211_CH_INVALID) &&
- (!(ieee->mode & IEEE_G) ||
- !(ieee->geo.bg[i].flags & IEEE80211_CH_B_ONLY)))
- return IEEE80211_24GHZ_BAND;
-
- if (ieee->freq_band & IEEE80211_52GHZ_BAND)
- for (i = 0; i < ieee->geo.a_channels; i++)
- if ((ieee->geo.a[i].channel == channel) &&
- !(ieee->geo.a[i].flags & IEEE80211_CH_INVALID))
- return IEEE80211_52GHZ_BAND;
-
- return 0;
-}
-
-int ieee80211_channel_to_index(struct ieee80211_device *ieee, u8 channel)
-{
- int i;
-
- /* Driver needs to initialize the geography map before using
- * these helper functions */
- if (ieee->geo.bg_channels == 0 && ieee->geo.a_channels == 0)
- return -1;
-
- if (ieee->freq_band & IEEE80211_24GHZ_BAND)
- for (i = 0; i < ieee->geo.bg_channels; i++)
- if (ieee->geo.bg[i].channel == channel)
- return i;
-
- if (ieee->freq_band & IEEE80211_52GHZ_BAND)
- for (i = 0; i < ieee->geo.a_channels; i++)
- if (ieee->geo.a[i].channel == channel)
- return i;
-
- return -1;
-}
-
-u8 ieee80211_freq_to_channel(struct ieee80211_device * ieee, u32 freq)
-{
- int i;
-
- /* Driver needs to initialize the geography map before using
- * these helper functions */
- if (ieee->geo.bg_channels == 0 && ieee->geo.a_channels == 0)
- return 0;
-
- freq /= 100000;
-
- if (ieee->freq_band & IEEE80211_24GHZ_BAND)
- for (i = 0; i < ieee->geo.bg_channels; i++)
- if (ieee->geo.bg[i].freq == freq)
- return ieee->geo.bg[i].channel;
-
- if (ieee->freq_band & IEEE80211_52GHZ_BAND)
- for (i = 0; i < ieee->geo.a_channels; i++)
- if (ieee->geo.a[i].freq == freq)
- return ieee->geo.a[i].channel;
-
- return 0;
-}
-
-int ieee80211_set_geo(struct ieee80211_device *ieee,
- const struct ieee80211_geo *geo)
-{
- memcpy(ieee->geo.name, geo->name, 3);
- ieee->geo.name[3] = '\0';
- ieee->geo.bg_channels = geo->bg_channels;
- ieee->geo.a_channels = geo->a_channels;
- memcpy(ieee->geo.bg, geo->bg, geo->bg_channels *
- sizeof(struct ieee80211_channel));
- memcpy(ieee->geo.a, geo->a, ieee->geo.a_channels *
- sizeof(struct ieee80211_channel));
- return 0;
-}
-
-const struct ieee80211_geo *ieee80211_get_geo(struct ieee80211_device *ieee)
-{
- return &ieee->geo;
-}
-
-u8 ieee80211_get_channel_flags(struct ieee80211_device * ieee, u8 channel)
-{
- int index = ieee80211_channel_to_index(ieee, channel);
-
- if (index == -1)
- return IEEE80211_CH_INVALID;
-
- if (channel <= IEEE80211_24GHZ_CHANNELS)
- return ieee->geo.bg[index].flags;
-
- return ieee->geo.a[index].flags;
-}
-
-static const struct ieee80211_channel bad_channel = {
- .channel = 0,
- .flags = IEEE80211_CH_INVALID,
- .max_power = 0,
-};
-
-const struct ieee80211_channel *ieee80211_get_channel(struct ieee80211_device
- *ieee, u8 channel)
-{
- int index = ieee80211_channel_to_index(ieee, channel);
-
- if (index == -1)
- return &bad_channel;
-
- if (channel <= IEEE80211_24GHZ_CHANNELS)
- return &ieee->geo.bg[index];
-
- return &ieee->geo.a[index];
-}
-
-EXPORT_SYMBOL(ieee80211_get_channel);
-EXPORT_SYMBOL(ieee80211_get_channel_flags);
-EXPORT_SYMBOL(ieee80211_is_valid_channel);
-EXPORT_SYMBOL(ieee80211_freq_to_channel);
-EXPORT_SYMBOL(ieee80211_channel_to_index);
-EXPORT_SYMBOL(ieee80211_set_geo);
-EXPORT_SYMBOL(ieee80211_get_geo);
diff --git a/net/ieee80211/ieee80211_module.c b/net/ieee80211/ieee80211_module.c
deleted file mode 100644
index b1c6d1f717d9..000000000000
--- a/net/ieee80211/ieee80211_module.c
+++ /dev/null
@@ -1,337 +0,0 @@
-/*******************************************************************************
-
- Copyright(c) 2004-2005 Intel Corporation. All rights reserved.
-
- Portions of this file are based on the WEP enablement code provided by the
- Host AP project hostap-drivers v0.1.3
- Copyright (c) 2001-2002, SSH Communications Security Corp and Jouni Malinen
- <jkmaline@cc.hut.fi>
- Copyright (c) 2002-2003, Jouni Malinen <jkmaline@cc.hut.fi>
-
- This program is free software; you can redistribute it and/or modify it
- under the terms of version 2 of the GNU General Public License as
- published by the Free Software Foundation.
-
- This program is distributed in the hope that it will be useful, but WITHOUT
- ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- more details.
-
- You should have received a copy of the GNU General Public License along with
- this program; if not, write to the Free Software Foundation, Inc., 59
- Temple Place - Suite 330, Boston, MA 02111-1307, USA.
-
- The full GNU General Public License is included in this distribution in the
- file called LICENSE.
-
- Contact Information:
- James P. Ketrenos <ipw2100-admin@linux.intel.com>
- Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-
-*******************************************************************************/
-
-#include <linux/compiler.h>
-#include <linux/errno.h>
-#include <linux/if_arp.h>
-#include <linux/in6.h>
-#include <linux/in.h>
-#include <linux/ip.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/netdevice.h>
-#include <linux/proc_fs.h>
-#include <linux/skbuff.h>
-#include <linux/slab.h>
-#include <linux/tcp.h>
-#include <linux/types.h>
-#include <linux/wireless.h>
-#include <linux/etherdevice.h>
-#include <asm/uaccess.h>
-#include <net/arp.h>
-
-#include <net/ieee80211.h>
-
-#define DRV_DESCRIPTION "802.11 data/management/control stack"
-#define DRV_NAME "ieee80211"
-#define DRV_VERSION IEEE80211_VERSION
-#define DRV_COPYRIGHT "Copyright (C) 2004-2005 Intel Corporation <jketreno@linux.intel.com>"
-
-MODULE_VERSION(DRV_VERSION);
-MODULE_DESCRIPTION(DRV_DESCRIPTION);
-MODULE_AUTHOR(DRV_COPYRIGHT);
-MODULE_LICENSE("GPL");
-
-static int ieee80211_networks_allocate(struct ieee80211_device *ieee)
-{
- if (ieee->networks)
- return 0;
-
- ieee->networks =
- kzalloc(MAX_NETWORK_COUNT * sizeof(struct ieee80211_network),
- GFP_KERNEL);
- if (!ieee->networks) {
- printk(KERN_WARNING "%s: Out of memory allocating beacons\n",
- ieee->dev->name);
- return -ENOMEM;
- }
-
- return 0;
-}
-
-void ieee80211_network_reset(struct ieee80211_network *network)
-{
- if (!network)
- return;
-
- if (network->ibss_dfs) {
- kfree(network->ibss_dfs);
- network->ibss_dfs = NULL;
- }
-}
-
-static inline void ieee80211_networks_free(struct ieee80211_device *ieee)
-{
- int i;
-
- if (!ieee->networks)
- return;
-
- for (i = 0; i < MAX_NETWORK_COUNT; i++)
- if (ieee->networks[i].ibss_dfs)
- kfree(ieee->networks[i].ibss_dfs);
-
- kfree(ieee->networks);
- ieee->networks = NULL;
-}
-
-static void ieee80211_networks_initialize(struct ieee80211_device *ieee)
-{
- int i;
-
- INIT_LIST_HEAD(&ieee->network_free_list);
- INIT_LIST_HEAD(&ieee->network_list);
- for (i = 0; i < MAX_NETWORK_COUNT; i++)
- list_add_tail(&ieee->networks[i].list,
- &ieee->network_free_list);
-}
-
-static int ieee80211_change_mtu(struct net_device *dev, int new_mtu)
-{
- if ((new_mtu < 68) || (new_mtu > IEEE80211_DATA_LEN))
- return -EINVAL;
- dev->mtu = new_mtu;
- return 0;
-}
-
-static struct net_device_stats *ieee80211_generic_get_stats(
- struct net_device *dev)
-{
- struct ieee80211_device *ieee = netdev_priv(dev);
- return &ieee->stats;
-}
-
-struct net_device *alloc_ieee80211(int sizeof_priv)
-{
- struct ieee80211_device *ieee;
- struct net_device *dev;
- int err;
-
- IEEE80211_DEBUG_INFO("Initializing...\n");
-
- dev = alloc_etherdev(sizeof(struct ieee80211_device) + sizeof_priv);
- if (!dev) {
- IEEE80211_ERROR("Unable to network device.\n");
- goto failed;
- }
- ieee = netdev_priv(dev);
- dev->hard_start_xmit = ieee80211_xmit;
- dev->change_mtu = ieee80211_change_mtu;
-
- /* Drivers are free to override this if the generic implementation
- * does not meet their needs. */
- dev->get_stats = ieee80211_generic_get_stats;
-
- ieee->dev = dev;
-
- err = ieee80211_networks_allocate(ieee);
- if (err) {
- IEEE80211_ERROR("Unable to allocate beacon storage: %d\n", err);
- goto failed;
- }
- ieee80211_networks_initialize(ieee);
-
- /* Default fragmentation threshold is maximum payload size */
- ieee->fts = DEFAULT_FTS;
- ieee->rts = DEFAULT_FTS;
- ieee->scan_age = DEFAULT_MAX_SCAN_AGE;
- ieee->open_wep = 1;
-
- /* Default to enabling full open WEP with host based encrypt/decrypt */
- ieee->host_encrypt = 1;
- ieee->host_decrypt = 1;
- ieee->host_mc_decrypt = 1;
-
- /* Host fragementation in Open mode. Default is enabled.
- * Note: host fragmentation is always enabled if host encryption
- * is enabled. For cards can do hardware encryption, they must do
- * hardware fragmentation as well. So we don't need a variable
- * like host_enc_frag. */
- ieee->host_open_frag = 1;
- ieee->ieee802_1x = 1; /* Default to supporting 802.1x */
-
- INIT_LIST_HEAD(&ieee->crypt_deinit_list);
- init_timer(&ieee->crypt_deinit_timer);
- ieee->crypt_deinit_timer.data = (unsigned long)ieee;
- ieee->crypt_deinit_timer.function = ieee80211_crypt_deinit_handler;
- ieee->crypt_quiesced = 0;
-
- spin_lock_init(&ieee->lock);
-
- ieee->wpa_enabled = 0;
- ieee->drop_unencrypted = 0;
- ieee->privacy_invoked = 0;
-
- return dev;
-
- failed:
- if (dev)
- free_netdev(dev);
- return NULL;
-}
-
-void free_ieee80211(struct net_device *dev)
-{
- struct ieee80211_device *ieee = netdev_priv(dev);
-
- int i;
-
- ieee80211_crypt_quiescing(ieee);
- del_timer_sync(&ieee->crypt_deinit_timer);
- ieee80211_crypt_deinit_entries(ieee, 1);
-
- for (i = 0; i < WEP_KEYS; i++) {
- struct ieee80211_crypt_data *crypt = ieee->crypt[i];
- if (crypt) {
- if (crypt->ops) {
- crypt->ops->deinit(crypt->priv);
- module_put(crypt->ops->owner);
- }
- kfree(crypt);
- ieee->crypt[i] = NULL;
- }
- }
-
- ieee80211_networks_free(ieee);
- free_netdev(dev);
-}
-
-#ifdef CONFIG_IEEE80211_DEBUG
-
-static int debug = 0;
-u32 ieee80211_debug_level = 0;
-static struct proc_dir_entry *ieee80211_proc = NULL;
-
-static int show_debug_level(char *page, char **start, off_t offset,
- int count, int *eof, void *data)
-{
- return snprintf(page, count, "0x%08X\n", ieee80211_debug_level);
-}
-
-static int store_debug_level(struct file *file, const char __user * buffer,
- unsigned long count, void *data)
-{
- char buf[] = "0x00000000\n";
- unsigned long len = min((unsigned long)sizeof(buf) - 1, count);
- unsigned long val;
-
- if (copy_from_user(buf, buffer, len))
- return count;
- buf[len] = 0;
- if (sscanf(buf, "%li", &val) != 1)
- printk(KERN_INFO DRV_NAME
- ": %s is not in hex or decimal form.\n", buf);
- else
- ieee80211_debug_level = val;
-
- return strnlen(buf, len);
-}
-#endif /* CONFIG_IEEE80211_DEBUG */
-
-static int __init ieee80211_init(void)
-{
-#ifdef CONFIG_IEEE80211_DEBUG
- struct proc_dir_entry *e;
-
- ieee80211_debug_level = debug;
- ieee80211_proc = proc_mkdir(DRV_NAME, proc_net);
- if (ieee80211_proc == NULL) {
- IEEE80211_ERROR("Unable to create " DRV_NAME
- " proc directory\n");
- return -EIO;
- }
- e = create_proc_entry("debug_level", S_IFREG | S_IRUGO | S_IWUSR,
- ieee80211_proc);
- if (!e) {
- remove_proc_entry(DRV_NAME, proc_net);
- ieee80211_proc = NULL;
- return -EIO;
- }
- e->read_proc = show_debug_level;
- e->write_proc = store_debug_level;
- e->data = NULL;
-#endif /* CONFIG_IEEE80211_DEBUG */
-
- printk(KERN_INFO DRV_NAME ": " DRV_DESCRIPTION ", " DRV_VERSION "\n");
- printk(KERN_INFO DRV_NAME ": " DRV_COPYRIGHT "\n");
-
- return 0;
-}
-
-static void __exit ieee80211_exit(void)
-{
-#ifdef CONFIG_IEEE80211_DEBUG
- if (ieee80211_proc) {
- remove_proc_entry("debug_level", ieee80211_proc);
- remove_proc_entry(DRV_NAME, proc_net);
- ieee80211_proc = NULL;
- }
-#endif /* CONFIG_IEEE80211_DEBUG */
-}
-
-#ifdef CONFIG_IEEE80211_DEBUG
-#include <linux/moduleparam.h>
-module_param(debug, int, 0444);
-MODULE_PARM_DESC(debug, "debug output mask");
-#endif /* CONFIG_IEEE80211_DEBUG */
-
-module_exit(ieee80211_exit);
-module_init(ieee80211_init);
-
-const char *escape_essid(const char *essid, u8 essid_len)
-{
- static char escaped[IW_ESSID_MAX_SIZE * 2 + 1];
- const char *s = essid;
- char *d = escaped;
-
- if (ieee80211_is_empty_essid(essid, essid_len)) {
- memcpy(escaped, "<hidden>", sizeof("<hidden>"));
- return escaped;
- }
-
- essid_len = min(essid_len, (u8) IW_ESSID_MAX_SIZE);
- while (essid_len--) {
- if (*s == '\0') {
- *d++ = '\\';
- *d++ = '0';
- s++;
- } else {
- *d++ = *s++;
- }
- }
- *d = '\0';
- return escaped;
-}
-
-EXPORT_SYMBOL(alloc_ieee80211);
-EXPORT_SYMBOL(free_ieee80211);
-EXPORT_SYMBOL(escape_essid);
diff --git a/net/ieee80211/ieee80211_rx.c b/net/ieee80211/ieee80211_rx.c
deleted file mode 100644
index d97e5412e31b..000000000000
--- a/net/ieee80211/ieee80211_rx.c
+++ /dev/null
@@ -1,1803 +0,0 @@
-/*
- * Original code based Host AP (software wireless LAN access point) driver
- * for Intersil Prism2/2.5/3 - hostap.o module, common routines
- *
- * Copyright (c) 2001-2002, SSH Communications Security Corp and Jouni Malinen
- * <jkmaline@cc.hut.fi>
- * Copyright (c) 2002-2003, Jouni Malinen <jkmaline@cc.hut.fi>
- * Copyright (c) 2004-2005, Intel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation. See README and COPYING for
- * more details.
- */
-
-#include <linux/compiler.h>
-#include <linux/errno.h>
-#include <linux/if_arp.h>
-#include <linux/in6.h>
-#include <linux/in.h>
-#include <linux/ip.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/netdevice.h>
-#include <linux/proc_fs.h>
-#include <linux/skbuff.h>
-#include <linux/slab.h>
-#include <linux/tcp.h>
-#include <linux/types.h>
-#include <linux/wireless.h>
-#include <linux/etherdevice.h>
-#include <asm/uaccess.h>
-#include <linux/ctype.h>
-
-#include <net/ieee80211.h>
-
-static void ieee80211_monitor_rx(struct ieee80211_device *ieee,
- struct sk_buff *skb,
- struct ieee80211_rx_stats *rx_stats)
-{
- struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
- u16 fc = le16_to_cpu(hdr->frame_ctl);
-
- skb->dev = ieee->dev;
- skb->mac.raw = skb->data;
- skb_pull(skb, ieee80211_get_hdrlen(fc));
- skb->pkt_type = PACKET_OTHERHOST;
- skb->protocol = __constant_htons(ETH_P_80211_RAW);
- memset(skb->cb, 0, sizeof(skb->cb));
- netif_rx(skb);
-}
-
-/* Called only as a tasklet (software IRQ) */
-static struct ieee80211_frag_entry *ieee80211_frag_cache_find(struct
- ieee80211_device
- *ieee,
- unsigned int seq,
- unsigned int frag,
- u8 * src,
- u8 * dst)
-{
- struct ieee80211_frag_entry *entry;
- int i;
-
- for (i = 0; i < IEEE80211_FRAG_CACHE_LEN; i++) {
- entry = &ieee->frag_cache[i];
- if (entry->skb != NULL &&
- time_after(jiffies, entry->first_frag_time + 2 * HZ)) {
- IEEE80211_DEBUG_FRAG("expiring fragment cache entry "
- "seq=%u last_frag=%u\n",
- entry->seq, entry->last_frag);
- dev_kfree_skb_any(entry->skb);
- entry->skb = NULL;
- }
-
- if (entry->skb != NULL && entry->seq == seq &&
- (entry->last_frag + 1 == frag || frag == -1) &&
- !compare_ether_addr(entry->src_addr, src) &&
- !compare_ether_addr(entry->dst_addr, dst))
- return entry;
- }
-
- return NULL;
-}
-
-/* Called only as a tasklet (software IRQ) */
-static struct sk_buff *ieee80211_frag_cache_get(struct ieee80211_device *ieee,
- struct ieee80211_hdr_4addr *hdr)
-{
- struct sk_buff *skb = NULL;
- u16 sc;
- unsigned int frag, seq;
- struct ieee80211_frag_entry *entry;
-
- sc = le16_to_cpu(hdr->seq_ctl);
- frag = WLAN_GET_SEQ_FRAG(sc);
- seq = WLAN_GET_SEQ_SEQ(sc);
-
- if (frag == 0) {
- /* Reserve enough space to fit maximum frame length */
- skb = dev_alloc_skb(ieee->dev->mtu +
- sizeof(struct ieee80211_hdr_4addr) +
- 8 /* LLC */ +
- 2 /* alignment */ +
- 8 /* WEP */ + ETH_ALEN /* WDS */ );
- if (skb == NULL)
- return NULL;
-
- entry = &ieee->frag_cache[ieee->frag_next_idx];
- ieee->frag_next_idx++;
- if (ieee->frag_next_idx >= IEEE80211_FRAG_CACHE_LEN)
- ieee->frag_next_idx = 0;
-
- if (entry->skb != NULL)
- dev_kfree_skb_any(entry->skb);
-
- entry->first_frag_time = jiffies;
- entry->seq = seq;
- entry->last_frag = frag;
- entry->skb = skb;
- memcpy(entry->src_addr, hdr->addr2, ETH_ALEN);
- memcpy(entry->dst_addr, hdr->addr1, ETH_ALEN);
- } else {
- /* received a fragment of a frame for which the head fragment
- * should have already been received */
- entry = ieee80211_frag_cache_find(ieee, seq, frag, hdr->addr2,
- hdr->addr1);
- if (entry != NULL) {
- entry->last_frag = frag;
- skb = entry->skb;
- }
- }
-
- return skb;
-}
-
-/* Called only as a tasklet (software IRQ) */
-static int ieee80211_frag_cache_invalidate(struct ieee80211_device *ieee,
- struct ieee80211_hdr_4addr *hdr)
-{
- u16 sc;
- unsigned int seq;
- struct ieee80211_frag_entry *entry;
-
- sc = le16_to_cpu(hdr->seq_ctl);
- seq = WLAN_GET_SEQ_SEQ(sc);
-
- entry = ieee80211_frag_cache_find(ieee, seq, -1, hdr->addr2,
- hdr->addr1);
-
- if (entry == NULL) {
- IEEE80211_DEBUG_FRAG("could not invalidate fragment cache "
- "entry (seq=%u)\n", seq);
- return -1;
- }
-
- entry->skb = NULL;
- return 0;
-}
-
-#ifdef NOT_YET
-/* ieee80211_rx_frame_mgtmt
- *
- * Responsible for handling management control frames
- *
- * Called by ieee80211_rx */
-static int
-ieee80211_rx_frame_mgmt(struct ieee80211_device *ieee, struct sk_buff *skb,
- struct ieee80211_rx_stats *rx_stats, u16 type,
- u16 stype)
-{
- if (ieee->iw_mode == IW_MODE_MASTER) {
- printk(KERN_DEBUG "%s: Master mode not yet suppported.\n",
- ieee->dev->name);
- return 0;
-/*
- hostap_update_sta_ps(ieee, (struct hostap_ieee80211_hdr_4addr *)
- skb->data);*/
- }
-
- if (ieee->hostapd && type == WLAN_FC_TYPE_MGMT) {
- if (stype == WLAN_FC_STYPE_BEACON &&
- ieee->iw_mode == IW_MODE_MASTER) {
- struct sk_buff *skb2;
- /* Process beacon frames also in kernel driver to
- * update STA(AP) table statistics */
- skb2 = skb_clone(skb, GFP_ATOMIC);
- if (skb2)
- hostap_rx(skb2->dev, skb2, rx_stats);
- }
-
- /* send management frames to the user space daemon for
- * processing */
- ieee->apdevstats.rx_packets++;
- ieee->apdevstats.rx_bytes += skb->len;
- prism2_rx_80211(ieee->apdev, skb, rx_stats, PRISM2_RX_MGMT);
- return 0;
- }
-
- if (ieee->iw_mode == IW_MODE_MASTER) {
- if (type != WLAN_FC_TYPE_MGMT && type != WLAN_FC_TYPE_CTRL) {
- printk(KERN_DEBUG "%s: unknown management frame "
- "(type=0x%02x, stype=0x%02x) dropped\n",
- skb->dev->name, type, stype);
- return -1;
- }
-
- hostap_rx(skb->dev, skb, rx_stats);
- return 0;
- }
-
- printk(KERN_DEBUG "%s: hostap_rx_frame_mgmt: management frame "
- "received in non-Host AP mode\n", skb->dev->name);
- return -1;
-}
-#endif
-
-/* See IEEE 802.1H for LLC/SNAP encapsulation/decapsulation */
-/* Ethernet-II snap header (RFC1042 for most EtherTypes) */
-static unsigned char rfc1042_header[] = { 0xaa, 0xaa, 0x03, 0x00, 0x00, 0x00 };
-
-/* Bridge-Tunnel header (for EtherTypes ETH_P_AARP and ETH_P_IPX) */
-static unsigned char bridge_tunnel_header[] =
- { 0xaa, 0xaa, 0x03, 0x00, 0x00, 0xf8 };
-/* No encapsulation header if EtherType < 0x600 (=length) */
-
-/* Called by ieee80211_rx_frame_decrypt */
-static int ieee80211_is_eapol_frame(struct ieee80211_device *ieee,
- struct sk_buff *skb)
-{
- struct net_device *dev = ieee->dev;
- u16 fc, ethertype;
- struct ieee80211_hdr_3addr *hdr;
- u8 *pos;
-
- if (skb->len < 24)
- return 0;
-
- hdr = (struct ieee80211_hdr_3addr *)skb->data;
- fc = le16_to_cpu(hdr->frame_ctl);
-
- /* check that the frame is unicast frame to us */
- if ((fc & (IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS)) ==
- IEEE80211_FCTL_TODS &&
- !compare_ether_addr(hdr->addr1, dev->dev_addr) &&
- !compare_ether_addr(hdr->addr3, dev->dev_addr)) {
- /* ToDS frame with own addr BSSID and DA */
- } else if ((fc & (IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS)) ==
- IEEE80211_FCTL_FROMDS &&
- !compare_ether_addr(hdr->addr1, dev->dev_addr)) {
- /* FromDS frame with own addr as DA */
- } else
- return 0;
-
- if (skb->len < 24 + 8)
- return 0;
-
- /* check for port access entity Ethernet type */
- pos = skb->data + 24;
- ethertype = (pos[6] << 8) | pos[7];
- if (ethertype == ETH_P_PAE)
- return 1;
-
- return 0;
-}
-
-/* Called only as a tasklet (software IRQ), by ieee80211_rx */
-static int
-ieee80211_rx_frame_decrypt(struct ieee80211_device *ieee, struct sk_buff *skb,
- struct ieee80211_crypt_data *crypt)
-{
- struct ieee80211_hdr_3addr *hdr;
- int res, hdrlen;
-
- if (crypt == NULL || crypt->ops->decrypt_mpdu == NULL)
- return 0;
-
- hdr = (struct ieee80211_hdr_3addr *)skb->data;
- hdrlen = ieee80211_get_hdrlen(le16_to_cpu(hdr->frame_ctl));
-
- atomic_inc(&crypt->refcnt);
- res = crypt->ops->decrypt_mpdu(skb, hdrlen, crypt->priv);
- atomic_dec(&crypt->refcnt);
- if (res < 0) {
- IEEE80211_DEBUG_DROP("decryption failed (SA=" MAC_FMT
- ") res=%d\n", MAC_ARG(hdr->addr2), res);
- if (res == -2)
- IEEE80211_DEBUG_DROP("Decryption failed ICV "
- "mismatch (key %d)\n",
- skb->data[hdrlen + 3] >> 6);
- ieee->ieee_stats.rx_discards_undecryptable++;
- return -1;
- }
-
- return res;
-}
-
-/* Called only as a tasklet (software IRQ), by ieee80211_rx */
-static int
-ieee80211_rx_frame_decrypt_msdu(struct ieee80211_device *ieee,
- struct sk_buff *skb, int keyidx,
- struct ieee80211_crypt_data *crypt)
-{
- struct ieee80211_hdr_3addr *hdr;
- int res, hdrlen;
-
- if (crypt == NULL || crypt->ops->decrypt_msdu == NULL)
- return 0;
-
- hdr = (struct ieee80211_hdr_3addr *)skb->data;
- hdrlen = ieee80211_get_hdrlen(le16_to_cpu(hdr->frame_ctl));
-
- atomic_inc(&crypt->refcnt);
- res = crypt->ops->decrypt_msdu(skb, keyidx, hdrlen, crypt->priv);
- atomic_dec(&crypt->refcnt);
- if (res < 0) {
- printk(KERN_DEBUG "%s: MSDU decryption/MIC verification failed"
- " (SA=" MAC_FMT " keyidx=%d)\n",
- ieee->dev->name, MAC_ARG(hdr->addr2), keyidx);
- return -1;
- }
-
- return 0;
-}
-
-/* All received frames are sent to this function. @skb contains the frame in
- * IEEE 802.11 format, i.e., in the format it was sent over air.
- * This function is called only as a tasklet (software IRQ). */
-int ieee80211_rx(struct ieee80211_device *ieee, struct sk_buff *skb,
- struct ieee80211_rx_stats *rx_stats)
-{
- struct net_device *dev = ieee->dev;
- struct ieee80211_hdr_4addr *hdr;
- size_t hdrlen;
- u16 fc, type, stype, sc;
- struct net_device_stats *stats;
- unsigned int frag;
- u8 *payload;
- u16 ethertype;
-#ifdef NOT_YET
- struct net_device *wds = NULL;
- struct sk_buff *skb2 = NULL;
- struct net_device *wds = NULL;
- int frame_authorized = 0;
- int from_assoc_ap = 0;
- void *sta = NULL;
-#endif
- u8 dst[ETH_ALEN];
- u8 src[ETH_ALEN];
- struct ieee80211_crypt_data *crypt = NULL;
- int keyidx = 0;
- int can_be_decrypted = 0;
-
- hdr = (struct ieee80211_hdr_4addr *)skb->data;
- stats = &ieee->stats;
-
- if (skb->len < 10) {
- printk(KERN_INFO "%s: SKB length < 10\n", dev->name);
- goto rx_dropped;
- }
-
- fc = le16_to_cpu(hdr->frame_ctl);
- type = WLAN_FC_GET_TYPE(fc);
- stype = WLAN_FC_GET_STYPE(fc);
- sc = le16_to_cpu(hdr->seq_ctl);
- frag = WLAN_GET_SEQ_FRAG(sc);
- hdrlen = ieee80211_get_hdrlen(fc);
-
- /* Put this code here so that we avoid duplicating it in all
- * Rx paths. - Jean II */
-#ifdef CONFIG_WIRELESS_EXT
-#ifdef IW_WIRELESS_SPY /* defined in iw_handler.h */
- /* If spy monitoring on */
- if (ieee->spy_data.spy_number > 0) {
- struct iw_quality wstats;
-
- wstats.updated = 0;
- if (rx_stats->mask & IEEE80211_STATMASK_RSSI) {
- wstats.level = rx_stats->rssi;
- wstats.updated |= IW_QUAL_LEVEL_UPDATED;
- } else
- wstats.updated |= IW_QUAL_LEVEL_INVALID;
-
- if (rx_stats->mask & IEEE80211_STATMASK_NOISE) {
- wstats.noise = rx_stats->noise;
- wstats.updated |= IW_QUAL_NOISE_UPDATED;
- } else
- wstats.updated |= IW_QUAL_NOISE_INVALID;
-
- if (rx_stats->mask & IEEE80211_STATMASK_SIGNAL) {
- wstats.qual = rx_stats->signal;
- wstats.updated |= IW_QUAL_QUAL_UPDATED;
- } else
- wstats.updated |= IW_QUAL_QUAL_INVALID;
-
- /* Update spy records */
- wireless_spy_update(ieee->dev, hdr->addr2, &wstats);
- }
-#endif /* IW_WIRELESS_SPY */
-#endif /* CONFIG_WIRELESS_EXT */
-
-#ifdef NOT_YET
- hostap_update_rx_stats(local->ap, hdr, rx_stats);
-#endif
-
- if (ieee->iw_mode == IW_MODE_MONITOR) {
- stats->rx_packets++;
- stats->rx_bytes += skb->len;
- ieee80211_monitor_rx(ieee, skb, rx_stats);
- return 1;
- }
-
- can_be_decrypted = (is_multicast_ether_addr(hdr->addr1) ||
- is_broadcast_ether_addr(hdr->addr2)) ?
- ieee->host_mc_decrypt : ieee->host_decrypt;
-
- if (can_be_decrypted) {
- if (skb->len >= hdrlen + 3) {
- /* Top two-bits of byte 3 are the key index */
- keyidx = skb->data[hdrlen + 3] >> 6;
- }
-
- /* ieee->crypt[] is WEP_KEY (4) in length. Given that keyidx
- * is only allowed 2-bits of storage, no value of keyidx can
- * be provided via above code that would result in keyidx
- * being out of range */
- crypt = ieee->crypt[keyidx];
-
-#ifdef NOT_YET
- sta = NULL;
-
- /* Use station specific key to override default keys if the
- * receiver address is a unicast address ("individual RA"). If
- * bcrx_sta_key parameter is set, station specific key is used
- * even with broad/multicast targets (this is against IEEE
- * 802.11, but makes it easier to use different keys with
- * stations that do not support WEP key mapping). */
-
- if (!(hdr->addr1[0] & 0x01) || local->bcrx_sta_key)
- (void)hostap_handle_sta_crypto(local, hdr, &crypt,
- &sta);
-#endif
-
- /* allow NULL decrypt to indicate an station specific override
- * for default encryption */
- if (crypt && (crypt->ops == NULL ||
- crypt->ops->decrypt_mpdu == NULL))
- crypt = NULL;
-
- if (!crypt && (fc & IEEE80211_FCTL_PROTECTED)) {
- /* This seems to be triggered by some (multicast?)
- * frames from other than current BSS, so just drop the
- * frames silently instead of filling system log with
- * these reports. */
- IEEE80211_DEBUG_DROP("Decryption failed (not set)"
- " (SA=" MAC_FMT ")\n",
- MAC_ARG(hdr->addr2));
- ieee->ieee_stats.rx_discards_undecryptable++;
- goto rx_dropped;
- }
- }
-#ifdef NOT_YET
- if (type != WLAN_FC_TYPE_DATA) {
- if (type == WLAN_FC_TYPE_MGMT && stype == WLAN_FC_STYPE_AUTH &&
- fc & IEEE80211_FCTL_PROTECTED && ieee->host_decrypt &&
- (keyidx = hostap_rx_frame_decrypt(ieee, skb, crypt)) < 0) {
- printk(KERN_DEBUG "%s: failed to decrypt mgmt::auth "
- "from " MAC_FMT "\n", dev->name,
- MAC_ARG(hdr->addr2));
- /* TODO: could inform hostapd about this so that it
- * could send auth failure report */
- goto rx_dropped;
- }
-
- if (ieee80211_rx_frame_mgmt(ieee, skb, rx_stats, type, stype))
- goto rx_dropped;
- else
- goto rx_exit;
- }
-#endif
- /* drop duplicate 802.11 retransmissions (IEEE 802.11 Chap. 9.29) */
- if (sc == ieee->prev_seq_ctl)
- goto rx_dropped;
- else
- ieee->prev_seq_ctl = sc;
-
- /* Data frame - extract src/dst addresses */
- if (skb->len < IEEE80211_3ADDR_LEN)
- goto rx_dropped;
-
- switch (fc & (IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS)) {
- case IEEE80211_FCTL_FROMDS:
- memcpy(dst, hdr->addr1, ETH_ALEN);
- memcpy(src, hdr->addr3, ETH_ALEN);
- break;
- case IEEE80211_FCTL_TODS:
- memcpy(dst, hdr->addr3, ETH_ALEN);
- memcpy(src, hdr->addr2, ETH_ALEN);
- break;
- case IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS:
- if (skb->len < IEEE80211_4ADDR_LEN)
- goto rx_dropped;
- memcpy(dst, hdr->addr3, ETH_ALEN);
- memcpy(src, hdr->addr4, ETH_ALEN);
- break;
- case 0:
- memcpy(dst, hdr->addr1, ETH_ALEN);
- memcpy(src, hdr->addr2, ETH_ALEN);
- break;
- }
-
-#ifdef NOT_YET
- if (hostap_rx_frame_wds(ieee, hdr, fc, &wds))
- goto rx_dropped;
- if (wds) {
- skb->dev = dev = wds;
- stats = hostap_get_stats(dev);
- }
-
- if (ieee->iw_mode == IW_MODE_MASTER && !wds &&
- (fc & (IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS)) ==
- IEEE80211_FCTL_FROMDS && ieee->stadev
- && !compare_ether_addr(hdr->addr2, ieee->assoc_ap_addr)) {
- /* Frame from BSSID of the AP for which we are a client */
- skb->dev = dev = ieee->stadev;
- stats = hostap_get_stats(dev);
- from_assoc_ap = 1;
- }
-#endif
-
- dev->last_rx = jiffies;
-
-#ifdef NOT_YET
- if ((ieee->iw_mode == IW_MODE_MASTER ||
- ieee->iw_mode == IW_MODE_REPEAT) && !from_assoc_ap) {
- switch (hostap_handle_sta_rx(ieee, dev, skb, rx_stats,
- wds != NULL)) {
- case AP_RX_CONTINUE_NOT_AUTHORIZED:
- frame_authorized = 0;
- break;
- case AP_RX_CONTINUE:
- frame_authorized = 1;
- break;
- case AP_RX_DROP:
- goto rx_dropped;
- case AP_RX_EXIT:
- goto rx_exit;
- }
- }
-#endif
-
- /* Nullfunc frames may have PS-bit set, so they must be passed to
- * hostap_handle_sta_rx() before being dropped here. */
-
- stype &= ~IEEE80211_STYPE_QOS_DATA;
-
- if (stype != IEEE80211_STYPE_DATA &&
- stype != IEEE80211_STYPE_DATA_CFACK &&
- stype != IEEE80211_STYPE_DATA_CFPOLL &&
- stype != IEEE80211_STYPE_DATA_CFACKPOLL) {
- if (stype != IEEE80211_STYPE_NULLFUNC)
- IEEE80211_DEBUG_DROP("RX: dropped data frame "
- "with no data (type=0x%02x, "
- "subtype=0x%02x, len=%d)\n",
- type, stype, skb->len);
- goto rx_dropped;
- }
-
- /* skb: hdr + (possibly fragmented, possibly encrypted) payload */
-
- if ((fc & IEEE80211_FCTL_PROTECTED) && can_be_decrypted &&
- (keyidx = ieee80211_rx_frame_decrypt(ieee, skb, crypt)) < 0)
- goto rx_dropped;
-
- hdr = (struct ieee80211_hdr_4addr *)skb->data;
-
- /* skb: hdr + (possibly fragmented) plaintext payload */
- // PR: FIXME: hostap has additional conditions in the "if" below:
- // ieee->host_decrypt && (fc & IEEE80211_FCTL_PROTECTED) &&
- if ((frag != 0) || (fc & IEEE80211_FCTL_MOREFRAGS)) {
- int flen;
- struct sk_buff *frag_skb = ieee80211_frag_cache_get(ieee, hdr);
- IEEE80211_DEBUG_FRAG("Rx Fragment received (%u)\n", frag);
-
- if (!frag_skb) {
- IEEE80211_DEBUG(IEEE80211_DL_RX | IEEE80211_DL_FRAG,
- "Rx cannot get skb from fragment "
- "cache (morefrag=%d seq=%u frag=%u)\n",
- (fc & IEEE80211_FCTL_MOREFRAGS) != 0,
- WLAN_GET_SEQ_SEQ(sc), frag);
- goto rx_dropped;
- }
-
- flen = skb->len;
- if (frag != 0)
- flen -= hdrlen;
-
- if (frag_skb->tail + flen > frag_skb->end) {
- printk(KERN_WARNING "%s: host decrypted and "
- "reassembled frame did not fit skb\n",
- dev->name);
- ieee80211_frag_cache_invalidate(ieee, hdr);
- goto rx_dropped;
- }
-
- if (frag == 0) {
- /* copy first fragment (including full headers) into
- * beginning of the fragment cache skb */
- memcpy(skb_put(frag_skb, flen), skb->data, flen);
- } else {
- /* append frame payload to the end of the fragment
- * cache skb */
- memcpy(skb_put(frag_skb, flen), skb->data + hdrlen,
- flen);
- }
- dev_kfree_skb_any(skb);
- skb = NULL;
-
- if (fc & IEEE80211_FCTL_MOREFRAGS) {
- /* more fragments expected - leave the skb in fragment
- * cache for now; it will be delivered to upper layers
- * after all fragments have been received */
- goto rx_exit;
- }
-
- /* this was the last fragment and the frame will be
- * delivered, so remove skb from fragment cache */
- skb = frag_skb;
- hdr = (struct ieee80211_hdr_4addr *)skb->data;
- ieee80211_frag_cache_invalidate(ieee, hdr);
- }
-
- /* skb: hdr + (possible reassembled) full MSDU payload; possibly still
- * encrypted/authenticated */
- if ((fc & IEEE80211_FCTL_PROTECTED) && can_be_decrypted &&
- ieee80211_rx_frame_decrypt_msdu(ieee, skb, keyidx, crypt))
- goto rx_dropped;
-
- hdr = (struct ieee80211_hdr_4addr *)skb->data;
- if (crypt && !(fc & IEEE80211_FCTL_PROTECTED) && !ieee->open_wep) {
- if ( /*ieee->ieee802_1x && */
- ieee80211_is_eapol_frame(ieee, skb)) {
- /* pass unencrypted EAPOL frames even if encryption is
- * configured */
- } else {
- IEEE80211_DEBUG_DROP("encryption configured, but RX "
- "frame not encrypted (SA=" MAC_FMT
- ")\n", MAC_ARG(hdr->addr2));
- goto rx_dropped;
- }
- }
-
- if (crypt && !(fc & IEEE80211_FCTL_PROTECTED) && !ieee->open_wep &&
- !ieee80211_is_eapol_frame(ieee, skb)) {
- IEEE80211_DEBUG_DROP("dropped unencrypted RX data "
- "frame from " MAC_FMT
- " (drop_unencrypted=1)\n",
- MAC_ARG(hdr->addr2));
- goto rx_dropped;
- }
-
- /* If the frame was decrypted in hardware, we may need to strip off
- * any security data (IV, ICV, etc) that was left behind */
- if (!can_be_decrypted && (fc & IEEE80211_FCTL_PROTECTED) &&
- ieee->host_strip_iv_icv) {
- int trimlen = 0;
-
- /* Top two-bits of byte 3 are the key index */
- if (skb->len >= hdrlen + 3)
- keyidx = skb->data[hdrlen + 3] >> 6;
-
- /* To strip off any security data which appears before the
- * payload, we simply increase hdrlen (as the header gets
- * chopped off immediately below). For the security data which
- * appears after the payload, we use skb_trim. */
-
- switch (ieee->sec.encode_alg[keyidx]) {
- case SEC_ALG_WEP:
- /* 4 byte IV */
- hdrlen += 4;
- /* 4 byte ICV */
- trimlen = 4;
- break;
- case SEC_ALG_TKIP:
- /* 4 byte IV, 4 byte ExtIV */
- hdrlen += 8;
- /* 8 byte MIC, 4 byte ICV */
- trimlen = 12;
- break;
- case SEC_ALG_CCMP:
- /* 8 byte CCMP header */
- hdrlen += 8;
- /* 8 byte MIC */
- trimlen = 8;
- break;
- }
-
- if (skb->len < trimlen)
- goto rx_dropped;
-
- __skb_trim(skb, skb->len - trimlen);
-
- if (skb->len < hdrlen)
- goto rx_dropped;
- }
-
- /* skb: hdr + (possible reassembled) full plaintext payload */
-
- payload = skb->data + hdrlen;
- ethertype = (payload[6] << 8) | payload[7];
-
-#ifdef NOT_YET
- /* If IEEE 802.1X is used, check whether the port is authorized to send
- * the received frame. */
- if (ieee->ieee802_1x && ieee->iw_mode == IW_MODE_MASTER) {
- if (ethertype == ETH_P_PAE) {
- printk(KERN_DEBUG "%s: RX: IEEE 802.1X frame\n",
- dev->name);
- if (ieee->hostapd && ieee->apdev) {
- /* Send IEEE 802.1X frames to the user
- * space daemon for processing */
- prism2_rx_80211(ieee->apdev, skb, rx_stats,
- PRISM2_RX_MGMT);
- ieee->apdevstats.rx_packets++;
- ieee->apdevstats.rx_bytes += skb->len;
- goto rx_exit;
- }
- } else if (!frame_authorized) {
- printk(KERN_DEBUG "%s: dropped frame from "
- "unauthorized port (IEEE 802.1X): "
- "ethertype=0x%04x\n", dev->name, ethertype);
- goto rx_dropped;
- }
- }
-#endif
-
- /* convert hdr + possible LLC headers into Ethernet header */
- if (skb->len - hdrlen >= 8 &&
- ((memcmp(payload, rfc1042_header, SNAP_SIZE) == 0 &&
- ethertype != ETH_P_AARP && ethertype != ETH_P_IPX) ||
- memcmp(payload, bridge_tunnel_header, SNAP_SIZE) == 0)) {
- /* remove RFC1042 or Bridge-Tunnel encapsulation and
- * replace EtherType */
- skb_pull(skb, hdrlen + SNAP_SIZE);
- memcpy(skb_push(skb, ETH_ALEN), src, ETH_ALEN);
- memcpy(skb_push(skb, ETH_ALEN), dst, ETH_ALEN);
- } else {
- u16 len;
- /* Leave Ethernet header part of hdr and full payload */
- skb_pull(skb, hdrlen);
- len = htons(skb->len);
- memcpy(skb_push(skb, 2), &len, 2);
- memcpy(skb_push(skb, ETH_ALEN), src, ETH_ALEN);
- memcpy(skb_push(skb, ETH_ALEN), dst, ETH_ALEN);
- }
-
-#ifdef NOT_YET
- if (wds && ((fc & (IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS)) ==
- IEEE80211_FCTL_TODS) && skb->len >= ETH_HLEN + ETH_ALEN) {
- /* Non-standard frame: get addr4 from its bogus location after
- * the payload */
- memcpy(skb->data + ETH_ALEN,
- skb->data + skb->len - ETH_ALEN, ETH_ALEN);
- skb_trim(skb, skb->len - ETH_ALEN);
- }
-#endif
-
- stats->rx_packets++;
- stats->rx_bytes += skb->len;
-
-#ifdef NOT_YET
- if (ieee->iw_mode == IW_MODE_MASTER && !wds && ieee->ap->bridge_packets) {
- if (dst[0] & 0x01) {
- /* copy multicast frame both to the higher layers and
- * to the wireless media */
- ieee->ap->bridged_multicast++;
- skb2 = skb_clone(skb, GFP_ATOMIC);
- if (skb2 == NULL)
- printk(KERN_DEBUG "%s: skb_clone failed for "
- "multicast frame\n", dev->name);
- } else if (hostap_is_sta_assoc(ieee->ap, dst)) {
- /* send frame directly to the associated STA using
- * wireless media and not passing to higher layers */
- ieee->ap->bridged_unicast++;
- skb2 = skb;
- skb = NULL;
- }
- }
-
- if (skb2 != NULL) {
- /* send to wireless media */
- skb2->protocol = __constant_htons(ETH_P_802_3);
- skb2->mac.raw = skb2->nh.raw = skb2->data;
- /* skb2->nh.raw = skb2->data + ETH_HLEN; */
- skb2->dev = dev;
- dev_queue_xmit(skb2);
- }
-#endif
-
- if (skb) {
- skb->protocol = eth_type_trans(skb, dev);
- memset(skb->cb, 0, sizeof(skb->cb));
- skb->dev = dev;
- skb->ip_summed = CHECKSUM_NONE; /* 802.11 crc not sufficient */
- if (netif_rx(skb) == NET_RX_DROP) {
- /* netif_rx always succeeds, but it might drop
- * the packet. If it drops the packet, we log that
- * in our stats. */
- IEEE80211_DEBUG_DROP
- ("RX: netif_rx dropped the packet\n");
- stats->rx_dropped++;
- }
- }
-
- rx_exit:
-#ifdef NOT_YET
- if (sta)
- hostap_handle_sta_release(sta);
-#endif
- return 1;
-
- rx_dropped:
- stats->rx_dropped++;
-
- /* Returning 0 indicates to caller that we have not handled the SKB--
- * so it is still allocated and can be used again by underlying
- * hardware as a DMA target */
- return 0;
-}
-
-/* Filter out unrelated packets, call ieee80211_rx[_mgt]
- * This function takes over the skb, it should not be used again after calling
- * this function. */
-void ieee80211_rx_any(struct ieee80211_device *ieee,
- struct sk_buff *skb, struct ieee80211_rx_stats *stats)
-{
- struct ieee80211_hdr_4addr *hdr;
- int is_packet_for_us;
- u16 fc;
-
- if (ieee->iw_mode == IW_MODE_MONITOR) {
- if (!ieee80211_rx(ieee, skb, stats))
- dev_kfree_skb_irq(skb);
- return;
- }
-
- if (skb->len < sizeof(struct ieee80211_hdr))
- goto drop_free;
-
- hdr = (struct ieee80211_hdr_4addr *)skb->data;
- fc = le16_to_cpu(hdr->frame_ctl);
-
- if ((fc & IEEE80211_FCTL_VERS) != 0)
- goto drop_free;
-
- switch (fc & IEEE80211_FCTL_FTYPE) {
- case IEEE80211_FTYPE_MGMT:
- if (skb->len < sizeof(struct ieee80211_hdr_3addr))
- goto drop_free;
- ieee80211_rx_mgt(ieee, hdr, stats);
- dev_kfree_skb_irq(skb);
- return;
- case IEEE80211_FTYPE_DATA:
- break;
- case IEEE80211_FTYPE_CTL:
- return;
- default:
- return;
- }
-
- is_packet_for_us = 0;
- switch (ieee->iw_mode) {
- case IW_MODE_ADHOC:
- /* our BSS and not from/to DS */
- if (memcmp(hdr->addr3, ieee->bssid, ETH_ALEN) == 0)
- if ((fc & (IEEE80211_FCTL_TODS+IEEE80211_FCTL_FROMDS)) == 0) {
- /* promisc: get all */
- if (ieee->dev->flags & IFF_PROMISC)
- is_packet_for_us = 1;
- /* to us */
- else if (memcmp(hdr->addr1, ieee->dev->dev_addr, ETH_ALEN) == 0)
- is_packet_for_us = 1;
- /* mcast */
- else if (is_multicast_ether_addr(hdr->addr1))
- is_packet_for_us = 1;
- }
- break;
- case IW_MODE_INFRA:
- /* our BSS (== from our AP) and from DS */
- if (memcmp(hdr->addr2, ieee->bssid, ETH_ALEN) == 0)
- if ((fc & (IEEE80211_FCTL_TODS+IEEE80211_FCTL_FROMDS)) == IEEE80211_FCTL_FROMDS) {
- /* promisc: get all */
- if (ieee->dev->flags & IFF_PROMISC)
- is_packet_for_us = 1;
- /* to us */
- else if (memcmp(hdr->addr1, ieee->dev->dev_addr, ETH_ALEN) == 0)
- is_packet_for_us = 1;
- /* mcast */
- else if (is_multicast_ether_addr(hdr->addr1)) {
- /* not our own packet bcasted from AP */
- if (memcmp(hdr->addr3, ieee->dev->dev_addr, ETH_ALEN))
- is_packet_for_us = 1;
- }
- }
- break;
- default:
- /* ? */
- break;
- }
-
- if (is_packet_for_us)
- if (!ieee80211_rx(ieee, skb, stats))
- dev_kfree_skb_irq(skb);
- return;
-
-drop_free:
- dev_kfree_skb_irq(skb);
- ieee->stats.rx_dropped++;
- return;
-}
-
-#define MGMT_FRAME_FIXED_PART_LENGTH 0x24
-
-static u8 qos_oui[QOS_OUI_LEN] = { 0x00, 0x50, 0xF2 };
-
-/*
-* Make ther structure we read from the beacon packet has
-* the right values
-*/
-static int ieee80211_verify_qos_info(struct ieee80211_qos_information_element
- *info_element, int sub_type)
-{
-
- if (info_element->qui_subtype != sub_type)
- return -1;
- if (memcmp(info_element->qui, qos_oui, QOS_OUI_LEN))
- return -1;
- if (info_element->qui_type != QOS_OUI_TYPE)
- return -1;
- if (info_element->version != QOS_VERSION_1)
- return -1;
-
- return 0;
-}
-
-/*
- * Parse a QoS parameter element
- */
-static int ieee80211_read_qos_param_element(struct ieee80211_qos_parameter_info
- *element_param, struct ieee80211_info_element
- *info_element)
-{
- int ret = 0;
- u16 size = sizeof(struct ieee80211_qos_parameter_info) - 2;
-
- if ((info_element == NULL) || (element_param == NULL))
- return -1;
-
- if (info_element->id == QOS_ELEMENT_ID && info_element->len == size) {
- memcpy(element_param->info_element.qui, info_element->data,
- info_element->len);
- element_param->info_element.elementID = info_element->id;
- element_param->info_element.length = info_element->len;
- } else
- ret = -1;
- if (ret == 0)
- ret = ieee80211_verify_qos_info(&element_param->info_element,
- QOS_OUI_PARAM_SUB_TYPE);
- return ret;
-}
-
-/*
- * Parse a QoS information element
- */
-static int ieee80211_read_qos_info_element(struct
- ieee80211_qos_information_element
- *element_info, struct ieee80211_info_element
- *info_element)
-{
- int ret = 0;
- u16 size = sizeof(struct ieee80211_qos_information_element) - 2;
-
- if (element_info == NULL)
- return -1;
- if (info_element == NULL)
- return -1;
-
- if ((info_element->id == QOS_ELEMENT_ID) && (info_element->len == size)) {
- memcpy(element_info->qui, info_element->data,
- info_element->len);
- element_info->elementID = info_element->id;
- element_info->length = info_element->len;
- } else
- ret = -1;
-
- if (ret == 0)
- ret = ieee80211_verify_qos_info(element_info,
- QOS_OUI_INFO_SUB_TYPE);
- return ret;
-}
-
-/*
- * Write QoS parameters from the ac parameters.
- */
-static int ieee80211_qos_convert_ac_to_parameters(struct
- ieee80211_qos_parameter_info
- *param_elm, struct
- ieee80211_qos_parameters
- *qos_param)
-{
- int rc = 0;
- int i;
- struct ieee80211_qos_ac_parameter *ac_params;
- u32 txop;
- u8 cw_min;
- u8 cw_max;
-
- for (i = 0; i < QOS_QUEUE_NUM; i++) {
- ac_params = &(param_elm->ac_params_record[i]);
-
- qos_param->aifs[i] = (ac_params->aci_aifsn) & 0x0F;
- qos_param->aifs[i] -= (qos_param->aifs[i] < 2) ? 0 : 2;
-
- cw_min = ac_params->ecw_min_max & 0x0F;
- qos_param->cw_min[i] = (u16) ((1 << cw_min) - 1);
-
- cw_max = (ac_params->ecw_min_max & 0xF0) >> 4;
- qos_param->cw_max[i] = (u16) ((1 << cw_max) - 1);
-
- qos_param->flag[i] =
- (ac_params->aci_aifsn & 0x10) ? 0x01 : 0x00;
-
- txop = le16_to_cpu(ac_params->tx_op_limit) * 32;
- qos_param->tx_op_limit[i] = (u16) txop;
- }
- return rc;
-}
-
-/*
- * we have a generic data element which it may contain QoS information or
- * parameters element. check the information element length to decide
- * which type to read
- */
-static int ieee80211_parse_qos_info_param_IE(struct ieee80211_info_element
- *info_element,
- struct ieee80211_network *network)
-{
- int rc = 0;
- struct ieee80211_qos_parameters *qos_param = NULL;
- struct ieee80211_qos_information_element qos_info_element;
-
- rc = ieee80211_read_qos_info_element(&qos_info_element, info_element);
-
- if (rc == 0) {
- network->qos_data.param_count = qos_info_element.ac_info & 0x0F;
- network->flags |= NETWORK_HAS_QOS_INFORMATION;
- } else {
- struct ieee80211_qos_parameter_info param_element;
-
- rc = ieee80211_read_qos_param_element(&param_element,
- info_element);
- if (rc == 0) {
- qos_param = &(network->qos_data.parameters);
- ieee80211_qos_convert_ac_to_parameters(&param_element,
- qos_param);
- network->flags |= NETWORK_HAS_QOS_PARAMETERS;
- network->qos_data.param_count =
- param_element.info_element.ac_info & 0x0F;
- }
- }
-
- if (rc == 0) {
- IEEE80211_DEBUG_QOS("QoS is supported\n");
- network->qos_data.supported = 1;
- }
- return rc;
-}
-
-#ifdef CONFIG_IEEE80211_DEBUG
-#define MFIE_STRING(x) case MFIE_TYPE_ ##x: return #x
-
-static const char *get_info_element_string(u16 id)
-{
- switch (id) {
- MFIE_STRING(SSID);
- MFIE_STRING(RATES);
- MFIE_STRING(FH_SET);
- MFIE_STRING(DS_SET);
- MFIE_STRING(CF_SET);
- MFIE_STRING(TIM);
- MFIE_STRING(IBSS_SET);
- MFIE_STRING(COUNTRY);
- MFIE_STRING(HOP_PARAMS);
- MFIE_STRING(HOP_TABLE);
- MFIE_STRING(REQUEST);
- MFIE_STRING(CHALLENGE);
- MFIE_STRING(POWER_CONSTRAINT);
- MFIE_STRING(POWER_CAPABILITY);
- MFIE_STRING(TPC_REQUEST);
- MFIE_STRING(TPC_REPORT);
- MFIE_STRING(SUPP_CHANNELS);
- MFIE_STRING(CSA);
- MFIE_STRING(MEASURE_REQUEST);
- MFIE_STRING(MEASURE_REPORT);
- MFIE_STRING(QUIET);
- MFIE_STRING(IBSS_DFS);
- MFIE_STRING(ERP_INFO);
- MFIE_STRING(RSN);
- MFIE_STRING(RATES_EX);
- MFIE_STRING(GENERIC);
- MFIE_STRING(QOS_PARAMETER);
- default:
- return "UNKNOWN";
- }
-}
-#endif
-
-static int ieee80211_parse_info_param(struct ieee80211_info_element
- *info_element, u16 length,
- struct ieee80211_network *network)
-{
- u8 i;
-#ifdef CONFIG_IEEE80211_DEBUG
- char rates_str[64];
- char *p;
-#endif
-
- while (length >= sizeof(*info_element)) {
- if (sizeof(*info_element) + info_element->len > length) {
- IEEE80211_DEBUG_MGMT("Info elem: parse failed: "
- "info_element->len + 2 > left : "
- "info_element->len+2=%zd left=%d, id=%d.\n",
- info_element->len +
- sizeof(*info_element),
- length, info_element->id);
- /* We stop processing but don't return an error here
- * because some misbehaviour APs break this rule. ie.
- * Orinoco AP1000. */
- break;
- }
-
- switch (info_element->id) {
- case MFIE_TYPE_SSID:
- if (ieee80211_is_empty_essid(info_element->data,
- info_element->len)) {
- network->flags |= NETWORK_EMPTY_ESSID;
- break;
- }
-
- network->ssid_len = min(info_element->len,
- (u8) IW_ESSID_MAX_SIZE);
- memcpy(network->ssid, info_element->data,
- network->ssid_len);
- if (network->ssid_len < IW_ESSID_MAX_SIZE)
- memset(network->ssid + network->ssid_len, 0,
- IW_ESSID_MAX_SIZE - network->ssid_len);
-
- IEEE80211_DEBUG_MGMT("MFIE_TYPE_SSID: '%s' len=%d.\n",
- network->ssid, network->ssid_len);
- break;
-
- case MFIE_TYPE_RATES:
-#ifdef CONFIG_IEEE80211_DEBUG
- p = rates_str;
-#endif
- network->rates_len = min(info_element->len,
- MAX_RATES_LENGTH);
- for (i = 0; i < network->rates_len; i++) {
- network->rates[i] = info_element->data[i];
-#ifdef CONFIG_IEEE80211_DEBUG
- p += snprintf(p, sizeof(rates_str) -
- (p - rates_str), "%02X ",
- network->rates[i]);
-#endif
- if (ieee80211_is_ofdm_rate
- (info_element->data[i])) {
- network->flags |= NETWORK_HAS_OFDM;
- if (info_element->data[i] &
- IEEE80211_BASIC_RATE_MASK)
- network->flags &=
- ~NETWORK_HAS_CCK;
- }
- }
-
- IEEE80211_DEBUG_MGMT("MFIE_TYPE_RATES: '%s' (%d)\n",
- rates_str, network->rates_len);
- break;
-
- case MFIE_TYPE_RATES_EX:
-#ifdef CONFIG_IEEE80211_DEBUG
- p = rates_str;
-#endif
- network->rates_ex_len = min(info_element->len,
- MAX_RATES_EX_LENGTH);
- for (i = 0; i < network->rates_ex_len; i++) {
- network->rates_ex[i] = info_element->data[i];
-#ifdef CONFIG_IEEE80211_DEBUG
- p += snprintf(p, sizeof(rates_str) -
- (p - rates_str), "%02X ",
- network->rates[i]);
-#endif
- if (ieee80211_is_ofdm_rate
- (info_element->data[i])) {
- network->flags |= NETWORK_HAS_OFDM;
- if (info_element->data[i] &
- IEEE80211_BASIC_RATE_MASK)
- network->flags &=
- ~NETWORK_HAS_CCK;
- }
- }
-
- IEEE80211_DEBUG_MGMT("MFIE_TYPE_RATES_EX: '%s' (%d)\n",
- rates_str, network->rates_ex_len);
- break;
-
- case MFIE_TYPE_DS_SET:
- IEEE80211_DEBUG_MGMT("MFIE_TYPE_DS_SET: %d\n",
- info_element->data[0]);
- network->channel = info_element->data[0];
- break;
-
- case MFIE_TYPE_FH_SET:
- IEEE80211_DEBUG_MGMT("MFIE_TYPE_FH_SET: ignored\n");
- break;
-
- case MFIE_TYPE_CF_SET:
- IEEE80211_DEBUG_MGMT("MFIE_TYPE_CF_SET: ignored\n");
- break;
-
- case MFIE_TYPE_TIM:
- network->tim.tim_count = info_element->data[0];
- network->tim.tim_period = info_element->data[1];
- IEEE80211_DEBUG_MGMT("MFIE_TYPE_TIM: partially ignored\n");
- break;
-
- case MFIE_TYPE_ERP_INFO:
- network->erp_value = info_element->data[0];
- network->flags |= NETWORK_HAS_ERP_VALUE;
- IEEE80211_DEBUG_MGMT("MFIE_TYPE_ERP_SET: %d\n",
- network->erp_value);
- break;
-
- case MFIE_TYPE_IBSS_SET:
- network->atim_window = info_element->data[0];
- IEEE80211_DEBUG_MGMT("MFIE_TYPE_IBSS_SET: %d\n",
- network->atim_window);
- break;
-
- case MFIE_TYPE_CHALLENGE:
- IEEE80211_DEBUG_MGMT("MFIE_TYPE_CHALLENGE: ignored\n");
- break;
-
- case MFIE_TYPE_GENERIC:
- IEEE80211_DEBUG_MGMT("MFIE_TYPE_GENERIC: %d bytes\n",
- info_element->len);
- if (!ieee80211_parse_qos_info_param_IE(info_element,
- network))
- break;
-
- if (info_element->len >= 4 &&
- info_element->data[0] == 0x00 &&
- info_element->data[1] == 0x50 &&
- info_element->data[2] == 0xf2 &&
- info_element->data[3] == 0x01) {
- network->wpa_ie_len = min(info_element->len + 2,
- MAX_WPA_IE_LEN);
- memcpy(network->wpa_ie, info_element,
- network->wpa_ie_len);
- }
- break;
-
- case MFIE_TYPE_RSN:
- IEEE80211_DEBUG_MGMT("MFIE_TYPE_RSN: %d bytes\n",
- info_element->len);
- network->rsn_ie_len = min(info_element->len + 2,
- MAX_WPA_IE_LEN);
- memcpy(network->rsn_ie, info_element,
- network->rsn_ie_len);
- break;
-
- case MFIE_TYPE_QOS_PARAMETER:
- printk(KERN_ERR
- "QoS Error need to parse QOS_PARAMETER IE\n");
- break;
- /* 802.11h */
- case MFIE_TYPE_POWER_CONSTRAINT:
- network->power_constraint = info_element->data[0];
- network->flags |= NETWORK_HAS_POWER_CONSTRAINT;
- break;
-
- case MFIE_TYPE_CSA:
- network->power_constraint = info_element->data[0];
- network->flags |= NETWORK_HAS_CSA;
- break;
-
- case MFIE_TYPE_QUIET:
- network->quiet.count = info_element->data[0];
- network->quiet.period = info_element->data[1];
- network->quiet.duration = info_element->data[2];
- network->quiet.offset = info_element->data[3];
- network->flags |= NETWORK_HAS_QUIET;
- break;
-
- case MFIE_TYPE_IBSS_DFS:
- if (network->ibss_dfs)
- break;
- network->ibss_dfs = kmemdup(info_element->data,
- info_element->len,
- GFP_ATOMIC);
- if (!network->ibss_dfs)
- return 1;
- network->flags |= NETWORK_HAS_IBSS_DFS;
- break;
-
- case MFIE_TYPE_TPC_REPORT:
- network->tpc_report.transmit_power =
- info_element->data[0];
- network->tpc_report.link_margin = info_element->data[1];
- network->flags |= NETWORK_HAS_TPC_REPORT;
- break;
-
- default:
- IEEE80211_DEBUG_MGMT
- ("Unsupported info element: %s (%d)\n",
- get_info_element_string(info_element->id),
- info_element->id);
- break;
- }
-
- length -= sizeof(*info_element) + info_element->len;
- info_element =
- (struct ieee80211_info_element *)&info_element->
- data[info_element->len];
- }
-
- return 0;
-}
-
-static int ieee80211_handle_assoc_resp(struct ieee80211_device *ieee, struct ieee80211_assoc_response
- *frame, struct ieee80211_rx_stats *stats)
-{
- struct ieee80211_network network_resp = {
- .ibss_dfs = NULL,
- };
- struct ieee80211_network *network = &network_resp;
- struct net_device *dev = ieee->dev;
-
- network->flags = 0;
- network->qos_data.active = 0;
- network->qos_data.supported = 0;
- network->qos_data.param_count = 0;
- network->qos_data.old_param_count = 0;
-
- //network->atim_window = le16_to_cpu(frame->aid) & (0x3FFF);
- network->atim_window = le16_to_cpu(frame->aid);
- network->listen_interval = le16_to_cpu(frame->status);
- memcpy(network->bssid, frame->header.addr3, ETH_ALEN);
- network->capability = le16_to_cpu(frame->capability);
- network->last_scanned = jiffies;
- network->rates_len = network->rates_ex_len = 0;
- network->last_associate = 0;
- network->ssid_len = 0;
- network->erp_value =
- (network->capability & WLAN_CAPABILITY_IBSS) ? 0x3 : 0x0;
-
- if (stats->freq == IEEE80211_52GHZ_BAND) {
- /* for A band (No DS info) */
- network->channel = stats->received_channel;
- } else
- network->flags |= NETWORK_HAS_CCK;
-
- network->wpa_ie_len = 0;
- network->rsn_ie_len = 0;
-
- if (ieee80211_parse_info_param
- (frame->info_element, stats->len - sizeof(*frame), network))
- return 1;
-
- network->mode = 0;
- if (stats->freq == IEEE80211_52GHZ_BAND)
- network->mode = IEEE_A;
- else {
- if (network->flags & NETWORK_HAS_OFDM)
- network->mode |= IEEE_G;
- if (network->flags & NETWORK_HAS_CCK)
- network->mode |= IEEE_B;
- }
-
- if (ieee80211_is_empty_essid(network->ssid, network->ssid_len))
- network->flags |= NETWORK_EMPTY_ESSID;
-
- memcpy(&network->stats, stats, sizeof(network->stats));
-
- if (ieee->handle_assoc_response != NULL)
- ieee->handle_assoc_response(dev, frame, network);
-
- return 0;
-}
-
-/***************************************************/
-
-static int ieee80211_network_init(struct ieee80211_device *ieee, struct ieee80211_probe_response
- *beacon,
- struct ieee80211_network *network,
- struct ieee80211_rx_stats *stats)
-{
- network->qos_data.active = 0;
- network->qos_data.supported = 0;
- network->qos_data.param_count = 0;
- network->qos_data.old_param_count = 0;
-
- /* Pull out fixed field data */
- memcpy(network->bssid, beacon->header.addr3, ETH_ALEN);
- network->capability = le16_to_cpu(beacon->capability);
- network->last_scanned = jiffies;
- network->time_stamp[0] = le32_to_cpu(beacon->time_stamp[0]);
- network->time_stamp[1] = le32_to_cpu(beacon->time_stamp[1]);
- network->beacon_interval = le16_to_cpu(beacon->beacon_interval);
- /* Where to pull this? beacon->listen_interval; */
- network->listen_interval = 0x0A;
- network->rates_len = network->rates_ex_len = 0;
- network->last_associate = 0;
- network->ssid_len = 0;
- network->flags = 0;
- network->atim_window = 0;
- network->erp_value = (network->capability & WLAN_CAPABILITY_IBSS) ?
- 0x3 : 0x0;
-
- if (stats->freq == IEEE80211_52GHZ_BAND) {
- /* for A band (No DS info) */
- network->channel = stats->received_channel;
- } else
- network->flags |= NETWORK_HAS_CCK;
-
- network->wpa_ie_len = 0;
- network->rsn_ie_len = 0;
-
- if (ieee80211_parse_info_param
- (beacon->info_element, stats->len - sizeof(*beacon), network))
- return 1;
-
- network->mode = 0;
- if (stats->freq == IEEE80211_52GHZ_BAND)
- network->mode = IEEE_A;
- else {
- if (network->flags & NETWORK_HAS_OFDM)
- network->mode |= IEEE_G;
- if (network->flags & NETWORK_HAS_CCK)
- network->mode |= IEEE_B;
- }
-
- if (network->mode == 0) {
- IEEE80211_DEBUG_SCAN("Filtered out '%s (" MAC_FMT ")' "
- "network.\n",
- escape_essid(network->ssid,
- network->ssid_len),
- MAC_ARG(network->bssid));
- return 1;
- }
-
- if (ieee80211_is_empty_essid(network->ssid, network->ssid_len))
- network->flags |= NETWORK_EMPTY_ESSID;
-
- memcpy(&network->stats, stats, sizeof(network->stats));
-
- return 0;
-}
-
-static inline int is_same_network(struct ieee80211_network *src,
- struct ieee80211_network *dst)
-{
- /* A network is only a duplicate if the channel, BSSID, and ESSID
- * all match. We treat all <hidden> with the same BSSID and channel
- * as one network */
- return ((src->ssid_len == dst->ssid_len) &&
- (src->channel == dst->channel) &&
- !compare_ether_addr(src->bssid, dst->bssid) &&
- !memcmp(src->ssid, dst->ssid, src->ssid_len));
-}
-
-static void update_network(struct ieee80211_network *dst,
- struct ieee80211_network *src)
-{
- int qos_active;
- u8 old_param;
-
- ieee80211_network_reset(dst);
- dst->ibss_dfs = src->ibss_dfs;
-
- /* We only update the statistics if they were created by receiving
- * the network information on the actual channel the network is on.
- *
- * This keeps beacons received on neighbor channels from bringing
- * down the signal level of an AP. */
- if (dst->channel == src->stats.received_channel)
- memcpy(&dst->stats, &src->stats,
- sizeof(struct ieee80211_rx_stats));
- else
- IEEE80211_DEBUG_SCAN("Network " MAC_FMT " info received "
- "off channel (%d vs. %d)\n", MAC_ARG(src->bssid),
- dst->channel, src->stats.received_channel);
-
- dst->capability = src->capability;
- memcpy(dst->rates, src->rates, src->rates_len);
- dst->rates_len = src->rates_len;
- memcpy(dst->rates_ex, src->rates_ex, src->rates_ex_len);
- dst->rates_ex_len = src->rates_ex_len;
-
- dst->mode = src->mode;
- dst->flags = src->flags;
- dst->time_stamp[0] = src->time_stamp[0];
- dst->time_stamp[1] = src->time_stamp[1];
-
- dst->beacon_interval = src->beacon_interval;
- dst->listen_interval = src->listen_interval;
- dst->atim_window = src->atim_window;
- dst->erp_value = src->erp_value;
- dst->tim = src->tim;
-
- memcpy(dst->wpa_ie, src->wpa_ie, src->wpa_ie_len);
- dst->wpa_ie_len = src->wpa_ie_len;
- memcpy(dst->rsn_ie, src->rsn_ie, src->rsn_ie_len);
- dst->rsn_ie_len = src->rsn_ie_len;
-
- dst->last_scanned = jiffies;
- qos_active = src->qos_data.active;
- old_param = dst->qos_data.old_param_count;
- if (dst->flags & NETWORK_HAS_QOS_MASK)
- memcpy(&dst->qos_data, &src->qos_data,
- sizeof(struct ieee80211_qos_data));
- else {
- dst->qos_data.supported = src->qos_data.supported;
- dst->qos_data.param_count = src->qos_data.param_count;
- }
-
- if (dst->qos_data.supported == 1) {
- if (dst->ssid_len)
- IEEE80211_DEBUG_QOS
- ("QoS the network %s is QoS supported\n",
- dst->ssid);
- else
- IEEE80211_DEBUG_QOS
- ("QoS the network is QoS supported\n");
- }
- dst->qos_data.active = qos_active;
- dst->qos_data.old_param_count = old_param;
-
- /* dst->last_associate is not overwritten */
-}
-
-static inline int is_beacon(__le16 fc)
-{
- return (WLAN_FC_GET_STYPE(le16_to_cpu(fc)) == IEEE80211_STYPE_BEACON);
-}
-
-static void ieee80211_process_probe_response(struct ieee80211_device
- *ieee, struct
- ieee80211_probe_response
- *beacon, struct ieee80211_rx_stats
- *stats)
-{
- struct net_device *dev = ieee->dev;
- struct ieee80211_network network = {
- .ibss_dfs = NULL,
- };
- struct ieee80211_network *target;
- struct ieee80211_network *oldest = NULL;
-#ifdef CONFIG_IEEE80211_DEBUG
- struct ieee80211_info_element *info_element = beacon->info_element;
-#endif
- unsigned long flags;
-
- IEEE80211_DEBUG_SCAN("'%s' (" MAC_FMT
- "): %c%c%c%c %c%c%c%c-%c%c%c%c %c%c%c%c\n",
- escape_essid(info_element->data,
- info_element->len),
- MAC_ARG(beacon->header.addr3),
- (beacon->capability & (1 << 0xf)) ? '1' : '0',
- (beacon->capability & (1 << 0xe)) ? '1' : '0',
- (beacon->capability & (1 << 0xd)) ? '1' : '0',
- (beacon->capability & (1 << 0xc)) ? '1' : '0',
- (beacon->capability & (1 << 0xb)) ? '1' : '0',
- (beacon->capability & (1 << 0xa)) ? '1' : '0',
- (beacon->capability & (1 << 0x9)) ? '1' : '0',
- (beacon->capability & (1 << 0x8)) ? '1' : '0',
- (beacon->capability & (1 << 0x7)) ? '1' : '0',
- (beacon->capability & (1 << 0x6)) ? '1' : '0',
- (beacon->capability & (1 << 0x5)) ? '1' : '0',
- (beacon->capability & (1 << 0x4)) ? '1' : '0',
- (beacon->capability & (1 << 0x3)) ? '1' : '0',
- (beacon->capability & (1 << 0x2)) ? '1' : '0',
- (beacon->capability & (1 << 0x1)) ? '1' : '0',
- (beacon->capability & (1 << 0x0)) ? '1' : '0');
-
- if (ieee80211_network_init(ieee, beacon, &network, stats)) {
- IEEE80211_DEBUG_SCAN("Dropped '%s' (" MAC_FMT ") via %s.\n",
- escape_essid(info_element->data,
- info_element->len),
- MAC_ARG(beacon->header.addr3),
- is_beacon(beacon->header.frame_ctl) ?
- "BEACON" : "PROBE RESPONSE");
- return;
- }
-
- /* The network parsed correctly -- so now we scan our known networks
- * to see if we can find it in our list.
- *
- * NOTE: This search is definitely not optimized. Once its doing
- * the "right thing" we'll optimize it for efficiency if
- * necessary */
-
- /* Search for this entry in the list and update it if it is
- * already there. */
-
- spin_lock_irqsave(&ieee->lock, flags);
-
- list_for_each_entry(target, &ieee->network_list, list) {
- if (is_same_network(target, &network))
- break;
-
- if ((oldest == NULL) ||
- (target->last_scanned < oldest->last_scanned))
- oldest = target;
- }
-
- /* If we didn't find a match, then get a new network slot to initialize
- * with this beacon's information */
- if (&target->list == &ieee->network_list) {
- if (list_empty(&ieee->network_free_list)) {
- /* If there are no more slots, expire the oldest */
- list_del(&oldest->list);
- target = oldest;
- IEEE80211_DEBUG_SCAN("Expired '%s' (" MAC_FMT ") from "
- "network list.\n",
- escape_essid(target->ssid,
- target->ssid_len),
- MAC_ARG(target->bssid));
- ieee80211_network_reset(target);
- } else {
- /* Otherwise just pull from the free list */
- target = list_entry(ieee->network_free_list.next,
- struct ieee80211_network, list);
- list_del(ieee->network_free_list.next);
- }
-
-#ifdef CONFIG_IEEE80211_DEBUG
- IEEE80211_DEBUG_SCAN("Adding '%s' (" MAC_FMT ") via %s.\n",
- escape_essid(network.ssid,
- network.ssid_len),
- MAC_ARG(network.bssid),
- is_beacon(beacon->header.frame_ctl) ?
- "BEACON" : "PROBE RESPONSE");
-#endif
- memcpy(target, &network, sizeof(*target));
- network.ibss_dfs = NULL;
- list_add_tail(&target->list, &ieee->network_list);
- } else {
- IEEE80211_DEBUG_SCAN("Updating '%s' (" MAC_FMT ") via %s.\n",
- escape_essid(target->ssid,
- target->ssid_len),
- MAC_ARG(target->bssid),
- is_beacon(beacon->header.frame_ctl) ?
- "BEACON" : "PROBE RESPONSE");
- update_network(target, &network);
- network.ibss_dfs = NULL;
- }
-
- spin_unlock_irqrestore(&ieee->lock, flags);
-
- if (is_beacon(beacon->header.frame_ctl)) {
- if (ieee->handle_beacon != NULL)
- ieee->handle_beacon(dev, beacon, target);
- } else {
- if (ieee->handle_probe_response != NULL)
- ieee->handle_probe_response(dev, beacon, target);
- }
-}
-
-void ieee80211_rx_mgt(struct ieee80211_device *ieee,
- struct ieee80211_hdr_4addr *header,
- struct ieee80211_rx_stats *stats)
-{
- switch (WLAN_FC_GET_STYPE(le16_to_cpu(header->frame_ctl))) {
- case IEEE80211_STYPE_ASSOC_RESP:
- IEEE80211_DEBUG_MGMT("received ASSOCIATION RESPONSE (%d)\n",
- WLAN_FC_GET_STYPE(le16_to_cpu
- (header->frame_ctl)));
- ieee80211_handle_assoc_resp(ieee,
- (struct ieee80211_assoc_response *)
- header, stats);
- break;
-
- case IEEE80211_STYPE_REASSOC_RESP:
- IEEE80211_DEBUG_MGMT("received REASSOCIATION RESPONSE (%d)\n",
- WLAN_FC_GET_STYPE(le16_to_cpu
- (header->frame_ctl)));
- break;
-
- case IEEE80211_STYPE_PROBE_REQ:
- IEEE80211_DEBUG_MGMT("received auth (%d)\n",
- WLAN_FC_GET_STYPE(le16_to_cpu
- (header->frame_ctl)));
-
- if (ieee->handle_probe_request != NULL)
- ieee->handle_probe_request(ieee->dev,
- (struct
- ieee80211_probe_request *)
- header, stats);
- break;
-
- case IEEE80211_STYPE_PROBE_RESP:
- IEEE80211_DEBUG_MGMT("received PROBE RESPONSE (%d)\n",
- WLAN_FC_GET_STYPE(le16_to_cpu
- (header->frame_ctl)));
- IEEE80211_DEBUG_SCAN("Probe response\n");
- ieee80211_process_probe_response(ieee,
- (struct
- ieee80211_probe_response *)
- header, stats);
- break;
-
- case IEEE80211_STYPE_BEACON:
- IEEE80211_DEBUG_MGMT("received BEACON (%d)\n",
- WLAN_FC_GET_STYPE(le16_to_cpu
- (header->frame_ctl)));
- IEEE80211_DEBUG_SCAN("Beacon\n");
- ieee80211_process_probe_response(ieee,
- (struct
- ieee80211_probe_response *)
- header, stats);
- break;
- case IEEE80211_STYPE_AUTH:
-
- IEEE80211_DEBUG_MGMT("received auth (%d)\n",
- WLAN_FC_GET_STYPE(le16_to_cpu
- (header->frame_ctl)));
-
- if (ieee->handle_auth != NULL)
- ieee->handle_auth(ieee->dev,
- (struct ieee80211_auth *)header);
- break;
-
- case IEEE80211_STYPE_DISASSOC:
- if (ieee->handle_disassoc != NULL)
- ieee->handle_disassoc(ieee->dev,
- (struct ieee80211_disassoc *)
- header);
- break;
-
- case IEEE80211_STYPE_ACTION:
- IEEE80211_DEBUG_MGMT("ACTION\n");
- if (ieee->handle_action)
- ieee->handle_action(ieee->dev,
- (struct ieee80211_action *)
- header, stats);
- break;
-
- case IEEE80211_STYPE_REASSOC_REQ:
- IEEE80211_DEBUG_MGMT("received reassoc (%d)\n",
- WLAN_FC_GET_STYPE(le16_to_cpu
- (header->frame_ctl)));
-
- IEEE80211_DEBUG_MGMT("%s: IEEE80211_REASSOC_REQ received\n",
- ieee->dev->name);
- if (ieee->handle_reassoc_request != NULL)
- ieee->handle_reassoc_request(ieee->dev,
- (struct ieee80211_reassoc_request *)
- header);
- break;
-
- case IEEE80211_STYPE_ASSOC_REQ:
- IEEE80211_DEBUG_MGMT("received assoc (%d)\n",
- WLAN_FC_GET_STYPE(le16_to_cpu
- (header->frame_ctl)));
-
- IEEE80211_DEBUG_MGMT("%s: IEEE80211_ASSOC_REQ received\n",
- ieee->dev->name);
- if (ieee->handle_assoc_request != NULL)
- ieee->handle_assoc_request(ieee->dev);
- break;
-
- case IEEE80211_STYPE_DEAUTH:
- IEEE80211_DEBUG_MGMT("DEAUTH\n");
- if (ieee->handle_deauth != NULL)
- ieee->handle_deauth(ieee->dev,
- (struct ieee80211_deauth *)
- header);
- break;
- default:
- IEEE80211_DEBUG_MGMT("received UNKNOWN (%d)\n",
- WLAN_FC_GET_STYPE(le16_to_cpu
- (header->frame_ctl)));
- IEEE80211_DEBUG_MGMT("%s: Unknown management packet: %d\n",
- ieee->dev->name,
- WLAN_FC_GET_STYPE(le16_to_cpu
- (header->frame_ctl)));
- break;
- }
-}
-
-EXPORT_SYMBOL_GPL(ieee80211_rx_any);
-EXPORT_SYMBOL(ieee80211_rx_mgt);
-EXPORT_SYMBOL(ieee80211_rx);
diff --git a/net/ieee80211/ieee80211_tx.c b/net/ieee80211/ieee80211_tx.c
deleted file mode 100644
index 854fc13cd78d..000000000000
--- a/net/ieee80211/ieee80211_tx.c
+++ /dev/null
@@ -1,633 +0,0 @@
-/******************************************************************************
-
- Copyright(c) 2003 - 2005 Intel Corporation. All rights reserved.
-
- This program is free software; you can redistribute it and/or modify it
- under the terms of version 2 of the GNU General Public License as
- published by the Free Software Foundation.
-
- This program is distributed in the hope that it will be useful, but WITHOUT
- ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- more details.
-
- You should have received a copy of the GNU General Public License along with
- this program; if not, write to the Free Software Foundation, Inc., 59
- Temple Place - Suite 330, Boston, MA 02111-1307, USA.
-
- The full GNU General Public License is included in this distribution in the
- file called LICENSE.
-
- Contact Information:
- James P. Ketrenos <ipw2100-admin@linux.intel.com>
- Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-
-******************************************************************************/
-#include <linux/compiler.h>
-#include <linux/errno.h>
-#include <linux/if_arp.h>
-#include <linux/in6.h>
-#include <linux/in.h>
-#include <linux/ip.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/netdevice.h>
-#include <linux/proc_fs.h>
-#include <linux/skbuff.h>
-#include <linux/slab.h>
-#include <linux/tcp.h>
-#include <linux/types.h>
-#include <linux/wireless.h>
-#include <linux/etherdevice.h>
-#include <asm/uaccess.h>
-
-#include <net/ieee80211.h>
-
-/*
-
-802.11 Data Frame
-
- ,-------------------------------------------------------------------.
-Bytes | 2 | 2 | 6 | 6 | 6 | 2 | 0..2312 | 4 |
- |------|------|---------|---------|---------|------|---------|------|
-Desc. | ctrl | dura | DA/RA | TA | SA | Sequ | Frame | fcs |
- | | tion | (BSSID) | | | ence | data | |
- `--------------------------------------------------| |------'
-Total: 28 non-data bytes `----.----'
- |
- .- 'Frame data' expands, if WEP enabled, to <----------'
- |
- V
- ,-----------------------.
-Bytes | 4 | 0-2296 | 4 |
- |-----|-----------|-----|
-Desc. | IV | Encrypted | ICV |
- | | Packet | |
- `-----| |-----'
- `-----.-----'
- |
- .- 'Encrypted Packet' expands to
- |
- V
- ,---------------------------------------------------.
-Bytes | 1 | 1 | 1 | 3 | 2 | 0-2304 |
- |------|------|---------|----------|------|---------|
-Desc. | SNAP | SNAP | Control |Eth Tunnel| Type | IP |
- | DSAP | SSAP | | | | Packet |
- | 0xAA | 0xAA |0x03 (UI)|0x00-00-F8| | |
- `----------------------------------------------------
-Total: 8 non-data bytes
-
-802.3 Ethernet Data Frame
-
- ,-----------------------------------------.
-Bytes | 6 | 6 | 2 | Variable | 4 |
- |-------|-------|------|-----------|------|
-Desc. | Dest. | Source| Type | IP Packet | fcs |
- | MAC | MAC | | | |
- `-----------------------------------------'
-Total: 18 non-data bytes
-
-In the event that fragmentation is required, the incoming payload is split into
-N parts of size ieee->fts. The first fragment contains the SNAP header and the
-remaining packets are just data.
-
-If encryption is enabled, each fragment payload size is reduced by enough space
-to add the prefix and postfix (IV and ICV totalling 8 bytes in the case of WEP)
-So if you have 1500 bytes of payload with ieee->fts set to 500 without
-encryption it will take 3 frames. With WEP it will take 4 frames as the
-payload of each frame is reduced to 492 bytes.
-
-* SKB visualization
-*
-* ,- skb->data
-* |
-* | ETHERNET HEADER ,-<-- PAYLOAD
-* | | 14 bytes from skb->data
-* | 2 bytes for Type --> ,T. | (sizeof ethhdr)
-* | | | |
-* |,-Dest.--. ,--Src.---. | | |
-* | 6 bytes| | 6 bytes | | | |
-* v | | | | | |
-* 0 | v 1 | v | v 2
-* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5
-* ^ | ^ | ^ |
-* | | | | | |
-* | | | | `T' <---- 2 bytes for Type
-* | | | |
-* | | '---SNAP--' <-------- 6 bytes for SNAP
-* | |
-* `-IV--' <-------------------- 4 bytes for IV (WEP)
-*
-* SNAP HEADER
-*
-*/
-
-static u8 P802_1H_OUI[P80211_OUI_LEN] = { 0x00, 0x00, 0xf8 };
-static u8 RFC1042_OUI[P80211_OUI_LEN] = { 0x00, 0x00, 0x00 };
-
-static int ieee80211_copy_snap(u8 * data, u16 h_proto)
-{
- struct ieee80211_snap_hdr *snap;
- u8 *oui;
-
- snap = (struct ieee80211_snap_hdr *)data;
- snap->dsap = 0xaa;
- snap->ssap = 0xaa;
- snap->ctrl = 0x03;
-
- if (h_proto == 0x8137 || h_proto == 0x80f3)
- oui = P802_1H_OUI;
- else
- oui = RFC1042_OUI;
- snap->oui[0] = oui[0];
- snap->oui[1] = oui[1];
- snap->oui[2] = oui[2];
-
- *(u16 *) (data + SNAP_SIZE) = htons(h_proto);
-
- return SNAP_SIZE + sizeof(u16);
-}
-
-static int ieee80211_encrypt_fragment(struct ieee80211_device *ieee,
- struct sk_buff *frag, int hdr_len)
-{
- struct ieee80211_crypt_data *crypt = ieee->crypt[ieee->tx_keyidx];
- int res;
-
- if (crypt == NULL)
- return -1;
-
- /* To encrypt, frame format is:
- * IV (4 bytes), clear payload (including SNAP), ICV (4 bytes) */
- atomic_inc(&crypt->refcnt);
- res = 0;
- if (crypt->ops && crypt->ops->encrypt_mpdu)
- res = crypt->ops->encrypt_mpdu(frag, hdr_len, crypt->priv);
-
- atomic_dec(&crypt->refcnt);
- if (res < 0) {
- printk(KERN_INFO "%s: Encryption failed: len=%d.\n",
- ieee->dev->name, frag->len);
- ieee->ieee_stats.tx_discards++;
- return -1;
- }
-
- return 0;
-}
-
-void ieee80211_txb_free(struct ieee80211_txb *txb)
-{
- int i;
- if (unlikely(!txb))
- return;
- for (i = 0; i < txb->nr_frags; i++)
- if (txb->fragments[i])
- dev_kfree_skb_any(txb->fragments[i]);
- kfree(txb);
-}
-
-static struct ieee80211_txb *ieee80211_alloc_txb(int nr_frags, int txb_size,
- int headroom, gfp_t gfp_mask)
-{
- struct ieee80211_txb *txb;
- int i;
- txb = kmalloc(sizeof(struct ieee80211_txb) + (sizeof(u8 *) * nr_frags),
- gfp_mask);
- if (!txb)
- return NULL;
-
- memset(txb, 0, sizeof(struct ieee80211_txb));
- txb->nr_frags = nr_frags;
- txb->frag_size = txb_size;
-
- for (i = 0; i < nr_frags; i++) {
- txb->fragments[i] = __dev_alloc_skb(txb_size + headroom,
- gfp_mask);
- if (unlikely(!txb->fragments[i])) {
- i--;
- break;
- }
- skb_reserve(txb->fragments[i], headroom);
- }
- if (unlikely(i != nr_frags)) {
- while (i >= 0)
- dev_kfree_skb_any(txb->fragments[i--]);
- kfree(txb);
- return NULL;
- }
- return txb;
-}
-
-static int ieee80211_classify(struct sk_buff *skb)
-{
- struct ethhdr *eth;
- struct iphdr *ip;
-
- eth = (struct ethhdr *)skb->data;
- if (eth->h_proto != __constant_htons(ETH_P_IP))
- return 0;
-
- ip = skb->nh.iph;
- switch (ip->tos & 0xfc) {
- case 0x20:
- return 2;
- case 0x40:
- return 1;
- case 0x60:
- return 3;
- case 0x80:
- return 4;
- case 0xa0:
- return 5;
- case 0xc0:
- return 6;
- case 0xe0:
- return 7;
- default:
- return 0;
- }
-}
-
-/* Incoming skb is converted to a txb which consists of
- * a block of 802.11 fragment packets (stored as skbs) */
-int ieee80211_xmit(struct sk_buff *skb, struct net_device *dev)
-{
- struct ieee80211_device *ieee = netdev_priv(dev);
- struct ieee80211_txb *txb = NULL;
- struct ieee80211_hdr_3addrqos *frag_hdr;
- int i, bytes_per_frag, nr_frags, bytes_last_frag, frag_size,
- rts_required;
- unsigned long flags;
- struct net_device_stats *stats = &ieee->stats;
- int ether_type, encrypt, host_encrypt, host_encrypt_msdu, host_build_iv;
- int bytes, fc, hdr_len;
- struct sk_buff *skb_frag;
- struct ieee80211_hdr_3addrqos header = {/* Ensure zero initialized */
- .duration_id = 0,
- .seq_ctl = 0,
- .qos_ctl = 0
- };
- u8 dest[ETH_ALEN], src[ETH_ALEN];
- struct ieee80211_crypt_data *crypt;
- int priority = skb->priority;
- int snapped = 0;
-
- if (ieee->is_queue_full && (*ieee->is_queue_full) (dev, priority))
- return NETDEV_TX_BUSY;
-
- spin_lock_irqsave(&ieee->lock, flags);
-
- /* If there is no driver handler to take the TXB, dont' bother
- * creating it... */
- if (!ieee->hard_start_xmit) {
- printk(KERN_WARNING "%s: No xmit handler.\n", ieee->dev->name);
- goto success;
- }
-
- if (unlikely(skb->len < SNAP_SIZE + sizeof(u16))) {
- printk(KERN_WARNING "%s: skb too small (%d).\n",
- ieee->dev->name, skb->len);
- goto success;
- }
-
- ether_type = ntohs(((struct ethhdr *)skb->data)->h_proto);
-
- crypt = ieee->crypt[ieee->tx_keyidx];
-
- encrypt = !(ether_type == ETH_P_PAE && ieee->ieee802_1x) &&
- ieee->sec.encrypt;
-
- host_encrypt = ieee->host_encrypt && encrypt && crypt;
- host_encrypt_msdu = ieee->host_encrypt_msdu && encrypt && crypt;
- host_build_iv = ieee->host_build_iv && encrypt && crypt;
-
- if (!encrypt && ieee->ieee802_1x &&
- ieee->drop_unencrypted && ether_type != ETH_P_PAE) {
- stats->tx_dropped++;
- goto success;
- }
-
- /* Save source and destination addresses */
- memcpy(dest, skb->data, ETH_ALEN);
- memcpy(src, skb->data + ETH_ALEN, ETH_ALEN);
-
- if (host_encrypt || host_build_iv)
- fc = IEEE80211_FTYPE_DATA | IEEE80211_STYPE_DATA |
- IEEE80211_FCTL_PROTECTED;
- else
- fc = IEEE80211_FTYPE_DATA | IEEE80211_STYPE_DATA;
-
- if (ieee->iw_mode == IW_MODE_INFRA) {
- fc |= IEEE80211_FCTL_TODS;
- /* To DS: Addr1 = BSSID, Addr2 = SA, Addr3 = DA */
- memcpy(header.addr1, ieee->bssid, ETH_ALEN);
- memcpy(header.addr2, src, ETH_ALEN);
- memcpy(header.addr3, dest, ETH_ALEN);
- } else if (ieee->iw_mode == IW_MODE_ADHOC) {
- /* not From/To DS: Addr1 = DA, Addr2 = SA, Addr3 = BSSID */
- memcpy(header.addr1, dest, ETH_ALEN);
- memcpy(header.addr2, src, ETH_ALEN);
- memcpy(header.addr3, ieee->bssid, ETH_ALEN);
- }
- hdr_len = IEEE80211_3ADDR_LEN;
-
- if (ieee->is_qos_active && ieee->is_qos_active(dev, skb)) {
- fc |= IEEE80211_STYPE_QOS_DATA;
- hdr_len += 2;
-
- skb->priority = ieee80211_classify(skb);
- header.qos_ctl |= cpu_to_le16(skb->priority & IEEE80211_QCTL_TID);
- }
- header.frame_ctl = cpu_to_le16(fc);
-
- /* Advance the SKB to the start of the payload */
- skb_pull(skb, sizeof(struct ethhdr));
-
- /* Determine total amount of storage required for TXB packets */
- bytes = skb->len + SNAP_SIZE + sizeof(u16);
-
- /* Encrypt msdu first on the whole data packet. */
- if ((host_encrypt || host_encrypt_msdu) &&
- crypt && crypt->ops && crypt->ops->encrypt_msdu) {
- int res = 0;
- int len = bytes + hdr_len + crypt->ops->extra_msdu_prefix_len +
- crypt->ops->extra_msdu_postfix_len;
- struct sk_buff *skb_new = dev_alloc_skb(len);
-
- if (unlikely(!skb_new))
- goto failed;
-
- skb_reserve(skb_new, crypt->ops->extra_msdu_prefix_len);
- memcpy(skb_put(skb_new, hdr_len), &header, hdr_len);
- snapped = 1;
- ieee80211_copy_snap(skb_put(skb_new, SNAP_SIZE + sizeof(u16)),
- ether_type);
- memcpy(skb_put(skb_new, skb->len), skb->data, skb->len);
- res = crypt->ops->encrypt_msdu(skb_new, hdr_len, crypt->priv);
- if (res < 0) {
- IEEE80211_ERROR("msdu encryption failed\n");
- dev_kfree_skb_any(skb_new);
- goto failed;
- }
- dev_kfree_skb_any(skb);
- skb = skb_new;
- bytes += crypt->ops->extra_msdu_prefix_len +
- crypt->ops->extra_msdu_postfix_len;
- skb_pull(skb, hdr_len);
- }
-
- if (host_encrypt || ieee->host_open_frag) {
- /* Determine fragmentation size based on destination (multicast
- * and broadcast are not fragmented) */
- if (is_multicast_ether_addr(dest) ||
- is_broadcast_ether_addr(dest))
- frag_size = MAX_FRAG_THRESHOLD;
- else
- frag_size = ieee->fts;
-
- /* Determine amount of payload per fragment. Regardless of if
- * this stack is providing the full 802.11 header, one will
- * eventually be affixed to this fragment -- so we must account
- * for it when determining the amount of payload space. */
- bytes_per_frag = frag_size - hdr_len;
- if (ieee->config &
- (CFG_IEEE80211_COMPUTE_FCS | CFG_IEEE80211_RESERVE_FCS))
- bytes_per_frag -= IEEE80211_FCS_LEN;
-
- /* Each fragment may need to have room for encryptiong
- * pre/postfix */
- if (host_encrypt)
- bytes_per_frag -= crypt->ops->extra_mpdu_prefix_len +
- crypt->ops->extra_mpdu_postfix_len;
-
- /* Number of fragments is the total
- * bytes_per_frag / payload_per_fragment */
- nr_frags = bytes / bytes_per_frag;
- bytes_last_frag = bytes % bytes_per_frag;
- if (bytes_last_frag)
- nr_frags++;
- else
- bytes_last_frag = bytes_per_frag;
- } else {
- nr_frags = 1;
- bytes_per_frag = bytes_last_frag = bytes;
- frag_size = bytes + hdr_len;
- }
-
- rts_required = (frag_size > ieee->rts
- && ieee->config & CFG_IEEE80211_RTS);
- if (rts_required)
- nr_frags++;
-
- /* When we allocate the TXB we allocate enough space for the reserve
- * and full fragment bytes (bytes_per_frag doesn't include prefix,
- * postfix, header, FCS, etc.) */
- txb = ieee80211_alloc_txb(nr_frags, frag_size,
- ieee->tx_headroom, GFP_ATOMIC);
- if (unlikely(!txb)) {
- printk(KERN_WARNING "%s: Could not allocate TXB\n",
- ieee->dev->name);
- goto failed;
- }
- txb->encrypted = encrypt;
- if (host_encrypt)
- txb->payload_size = frag_size * (nr_frags - 1) +
- bytes_last_frag;
- else
- txb->payload_size = bytes;
-
- if (rts_required) {
- skb_frag = txb->fragments[0];
- frag_hdr =
- (struct ieee80211_hdr_3addrqos *)skb_put(skb_frag, hdr_len);
-
- /*
- * Set header frame_ctl to the RTS.
- */
- header.frame_ctl =
- cpu_to_le16(IEEE80211_FTYPE_CTL | IEEE80211_STYPE_RTS);
- memcpy(frag_hdr, &header, hdr_len);
-
- /*
- * Restore header frame_ctl to the original data setting.
- */
- header.frame_ctl = cpu_to_le16(fc);
-
- if (ieee->config &
- (CFG_IEEE80211_COMPUTE_FCS | CFG_IEEE80211_RESERVE_FCS))
- skb_put(skb_frag, 4);
-
- txb->rts_included = 1;
- i = 1;
- } else
- i = 0;
-
- for (; i < nr_frags; i++) {
- skb_frag = txb->fragments[i];
-
- if (host_encrypt || host_build_iv)
- skb_reserve(skb_frag,
- crypt->ops->extra_mpdu_prefix_len);
-
- frag_hdr =
- (struct ieee80211_hdr_3addrqos *)skb_put(skb_frag, hdr_len);
- memcpy(frag_hdr, &header, hdr_len);
-
- /* If this is not the last fragment, then add the MOREFRAGS
- * bit to the frame control */
- if (i != nr_frags - 1) {
- frag_hdr->frame_ctl =
- cpu_to_le16(fc | IEEE80211_FCTL_MOREFRAGS);
- bytes = bytes_per_frag;
- } else {
- /* The last fragment takes the remaining length */
- bytes = bytes_last_frag;
- }
-
- if (i == 0 && !snapped) {
- ieee80211_copy_snap(skb_put
- (skb_frag, SNAP_SIZE + sizeof(u16)),
- ether_type);
- bytes -= SNAP_SIZE + sizeof(u16);
- }
-
- memcpy(skb_put(skb_frag, bytes), skb->data, bytes);
-
- /* Advance the SKB... */
- skb_pull(skb, bytes);
-
- /* Encryption routine will move the header forward in order
- * to insert the IV between the header and the payload */
- if (host_encrypt)
- ieee80211_encrypt_fragment(ieee, skb_frag, hdr_len);
- else if (host_build_iv) {
- struct ieee80211_crypt_data *crypt;
-
- crypt = ieee->crypt[ieee->tx_keyidx];
- atomic_inc(&crypt->refcnt);
- if (crypt->ops->build_iv)
- crypt->ops->build_iv(skb_frag, hdr_len,
- ieee->sec.keys[ieee->sec.active_key],
- ieee->sec.key_sizes[ieee->sec.active_key],
- crypt->priv);
- atomic_dec(&crypt->refcnt);
- }
-
- if (ieee->config &
- (CFG_IEEE80211_COMPUTE_FCS | CFG_IEEE80211_RESERVE_FCS))
- skb_put(skb_frag, 4);
- }
-
- success:
- spin_unlock_irqrestore(&ieee->lock, flags);
-
- dev_kfree_skb_any(skb);
-
- if (txb) {
- int ret = (*ieee->hard_start_xmit) (txb, dev, priority);
- if (ret == 0) {
- stats->tx_packets++;
- stats->tx_bytes += txb->payload_size;
- return 0;
- }
-
- ieee80211_txb_free(txb);
- }
-
- return 0;
-
- failed:
- spin_unlock_irqrestore(&ieee->lock, flags);
- netif_stop_queue(dev);
- stats->tx_errors++;
- return 1;
-}
-
-/* Incoming 802.11 strucure is converted to a TXB
- * a block of 802.11 fragment packets (stored as skbs) */
-int ieee80211_tx_frame(struct ieee80211_device *ieee,
- struct ieee80211_hdr *frame, int hdr_len, int total_len,
- int encrypt_mpdu)
-{
- struct ieee80211_txb *txb = NULL;
- unsigned long flags;
- struct net_device_stats *stats = &ieee->stats;
- struct sk_buff *skb_frag;
- int priority = -1;
- int fraglen = total_len;
- int headroom = ieee->tx_headroom;
- struct ieee80211_crypt_data *crypt = ieee->crypt[ieee->tx_keyidx];
-
- spin_lock_irqsave(&ieee->lock, flags);
-
- if (encrypt_mpdu && (!ieee->sec.encrypt || !crypt))
- encrypt_mpdu = 0;
-
- /* If there is no driver handler to take the TXB, dont' bother
- * creating it... */
- if (!ieee->hard_start_xmit) {
- printk(KERN_WARNING "%s: No xmit handler.\n", ieee->dev->name);
- goto success;
- }
-
- if (unlikely(total_len < 24)) {
- printk(KERN_WARNING "%s: skb too small (%d).\n",
- ieee->dev->name, total_len);
- goto success;
- }
-
- if (encrypt_mpdu) {
- frame->frame_ctl |= cpu_to_le16(IEEE80211_FCTL_PROTECTED);
- fraglen += crypt->ops->extra_mpdu_prefix_len +
- crypt->ops->extra_mpdu_postfix_len;
- headroom += crypt->ops->extra_mpdu_prefix_len;
- }
-
- /* When we allocate the TXB we allocate enough space for the reserve
- * and full fragment bytes (bytes_per_frag doesn't include prefix,
- * postfix, header, FCS, etc.) */
- txb = ieee80211_alloc_txb(1, fraglen, headroom, GFP_ATOMIC);
- if (unlikely(!txb)) {
- printk(KERN_WARNING "%s: Could not allocate TXB\n",
- ieee->dev->name);
- goto failed;
- }
- txb->encrypted = 0;
- txb->payload_size = fraglen;
-
- skb_frag = txb->fragments[0];
-
- memcpy(skb_put(skb_frag, total_len), frame, total_len);
-
- if (ieee->config &
- (CFG_IEEE80211_COMPUTE_FCS | CFG_IEEE80211_RESERVE_FCS))
- skb_put(skb_frag, 4);
-
- /* To avoid overcomplicating things, we do the corner-case frame
- * encryption in software. The only real situation where encryption is
- * needed here is during software-based shared key authentication. */
- if (encrypt_mpdu)
- ieee80211_encrypt_fragment(ieee, skb_frag, hdr_len);
-
- success:
- spin_unlock_irqrestore(&ieee->lock, flags);
-
- if (txb) {
- if ((*ieee->hard_start_xmit) (txb, ieee->dev, priority) == 0) {
- stats->tx_packets++;
- stats->tx_bytes += txb->payload_size;
- return 0;
- }
- ieee80211_txb_free(txb);
- }
- return 0;
-
- failed:
- spin_unlock_irqrestore(&ieee->lock, flags);
- stats->tx_errors++;
- return 1;
-}
-
-EXPORT_SYMBOL(ieee80211_tx_frame);
-EXPORT_SYMBOL(ieee80211_txb_free);
diff --git a/net/ieee80211/ieee80211_wx.c b/net/ieee80211/ieee80211_wx.c
deleted file mode 100644
index 5cb9cfd35397..000000000000
--- a/net/ieee80211/ieee80211_wx.c
+++ /dev/null
@@ -1,841 +0,0 @@
-/******************************************************************************
-
- Copyright(c) 2004-2005 Intel Corporation. All rights reserved.
-
- Portions of this file are based on the WEP enablement code provided by the
- Host AP project hostap-drivers v0.1.3
- Copyright (c) 2001-2002, SSH Communications Security Corp and Jouni Malinen
- <jkmaline@cc.hut.fi>
- Copyright (c) 2002-2003, Jouni Malinen <jkmaline@cc.hut.fi>
-
- This program is free software; you can redistribute it and/or modify it
- under the terms of version 2 of the GNU General Public License as
- published by the Free Software Foundation.
-
- This program is distributed in the hope that it will be useful, but WITHOUT
- ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- more details.
-
- You should have received a copy of the GNU General Public License along with
- this program; if not, write to the Free Software Foundation, Inc., 59
- Temple Place - Suite 330, Boston, MA 02111-1307, USA.
-
- The full GNU General Public License is included in this distribution in the
- file called LICENSE.
-
- Contact Information:
- James P. Ketrenos <ipw2100-admin@linux.intel.com>
- Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-
-******************************************************************************/
-
-#include <linux/kmod.h>
-#include <linux/module.h>
-#include <linux/jiffies.h>
-
-#include <net/ieee80211.h>
-#include <linux/wireless.h>
-
-static const char *ieee80211_modes[] = {
- "?", "a", "b", "ab", "g", "ag", "bg", "abg"
-};
-
-#define MAX_CUSTOM_LEN 64
-static char *ieee80211_translate_scan(struct ieee80211_device *ieee,
- char *start, char *stop,
- struct ieee80211_network *network)
-{
- char custom[MAX_CUSTOM_LEN];
- char *p;
- struct iw_event iwe;
- int i, j;
- char *current_val; /* For rates */
- u8 rate;
-
- /* First entry *MUST* be the AP MAC address */
- iwe.cmd = SIOCGIWAP;
- iwe.u.ap_addr.sa_family = ARPHRD_ETHER;
- memcpy(iwe.u.ap_addr.sa_data, network->bssid, ETH_ALEN);
- start = iwe_stream_add_event(start, stop, &iwe, IW_EV_ADDR_LEN);
-
- /* Remaining entries will be displayed in the order we provide them */
-
- /* Add the ESSID */
- iwe.cmd = SIOCGIWESSID;
- iwe.u.data.flags = 1;
- if (network->flags & NETWORK_EMPTY_ESSID) {
- iwe.u.data.length = sizeof("<hidden>");
- start = iwe_stream_add_point(start, stop, &iwe, "<hidden>");
- } else {
- iwe.u.data.length = min(network->ssid_len, (u8) 32);
- start = iwe_stream_add_point(start, stop, &iwe, network->ssid);
- }
-
- /* Add the protocol name */
- iwe.cmd = SIOCGIWNAME;
- snprintf(iwe.u.name, IFNAMSIZ, "IEEE 802.11%s",
- ieee80211_modes[network->mode]);
- start = iwe_stream_add_event(start, stop, &iwe, IW_EV_CHAR_LEN);
-
- /* Add mode */
- iwe.cmd = SIOCGIWMODE;
- if (network->capability & (WLAN_CAPABILITY_ESS | WLAN_CAPABILITY_IBSS)) {
- if (network->capability & WLAN_CAPABILITY_ESS)
- iwe.u.mode = IW_MODE_MASTER;
- else
- iwe.u.mode = IW_MODE_ADHOC;
-
- start = iwe_stream_add_event(start, stop, &iwe, IW_EV_UINT_LEN);
- }
-
- /* Add frequency/channel */
- iwe.cmd = SIOCGIWFREQ;
-/* iwe.u.freq.m = ieee80211_frequency(network->channel, network->mode);
- iwe.u.freq.e = 3; */
- iwe.u.freq.m = network->channel;
- iwe.u.freq.e = 0;
- iwe.u.freq.i = 0;
- start = iwe_stream_add_event(start, stop, &iwe, IW_EV_FREQ_LEN);
-
- /* Add encryption capability */
- iwe.cmd = SIOCGIWENCODE;
- if (network->capability & WLAN_CAPABILITY_PRIVACY)
- iwe.u.data.flags = IW_ENCODE_ENABLED | IW_ENCODE_NOKEY;
- else
- iwe.u.data.flags = IW_ENCODE_DISABLED;
- iwe.u.data.length = 0;
- start = iwe_stream_add_point(start, stop, &iwe, network->ssid);
-
- /* Add basic and extended rates */
- /* Rate : stuffing multiple values in a single event require a bit
- * more of magic - Jean II */
- current_val = start + IW_EV_LCP_LEN;
- iwe.cmd = SIOCGIWRATE;
- /* Those two flags are ignored... */
- iwe.u.bitrate.fixed = iwe.u.bitrate.disabled = 0;
-
- for (i = 0, j = 0; i < network->rates_len;) {
- if (j < network->rates_ex_len &&
- ((network->rates_ex[j] & 0x7F) <
- (network->rates[i] & 0x7F)))
- rate = network->rates_ex[j++] & 0x7F;
- else
- rate = network->rates[i++] & 0x7F;
- /* Bit rate given in 500 kb/s units (+ 0x80) */
- iwe.u.bitrate.value = ((rate & 0x7f) * 500000);
- /* Add new value to event */
- current_val = iwe_stream_add_value(start, current_val, stop, &iwe, IW_EV_PARAM_LEN);
- }
- for (; j < network->rates_ex_len; j++) {
- rate = network->rates_ex[j] & 0x7F;
- /* Bit rate given in 500 kb/s units (+ 0x80) */
- iwe.u.bitrate.value = ((rate & 0x7f) * 500000);
- /* Add new value to event */
- current_val = iwe_stream_add_value(start, current_val, stop, &iwe, IW_EV_PARAM_LEN);
- }
- /* Check if we added any rate */
- if((current_val - start) > IW_EV_LCP_LEN)
- start = current_val;
-
- /* Add quality statistics */
- iwe.cmd = IWEVQUAL;
- iwe.u.qual.updated = IW_QUAL_QUAL_UPDATED | IW_QUAL_LEVEL_UPDATED |
- IW_QUAL_NOISE_UPDATED;
-
- if (!(network->stats.mask & IEEE80211_STATMASK_RSSI)) {
- iwe.u.qual.updated |= IW_QUAL_QUAL_INVALID |
- IW_QUAL_LEVEL_INVALID;
- iwe.u.qual.qual = 0;
- } else {
- if (ieee->perfect_rssi == ieee->worst_rssi)
- iwe.u.qual.qual = 100;
- else
- iwe.u.qual.qual =
- (100 *
- (ieee->perfect_rssi - ieee->worst_rssi) *
- (ieee->perfect_rssi - ieee->worst_rssi) -
- (ieee->perfect_rssi - network->stats.rssi) *
- (15 * (ieee->perfect_rssi - ieee->worst_rssi) +
- 62 * (ieee->perfect_rssi -
- network->stats.rssi))) /
- ((ieee->perfect_rssi -
- ieee->worst_rssi) * (ieee->perfect_rssi -
- ieee->worst_rssi));
- if (iwe.u.qual.qual > 100)
- iwe.u.qual.qual = 100;
- else if (iwe.u.qual.qual < 1)
- iwe.u.qual.qual = 0;
- }
-
- if (!(network->stats.mask & IEEE80211_STATMASK_NOISE)) {
- iwe.u.qual.updated |= IW_QUAL_NOISE_INVALID;
- iwe.u.qual.noise = 0;
- } else {
- iwe.u.qual.noise = network->stats.noise;
- }
-
- if (!(network->stats.mask & IEEE80211_STATMASK_SIGNAL)) {
- iwe.u.qual.updated |= IW_QUAL_LEVEL_INVALID;
- iwe.u.qual.level = 0;
- } else {
- iwe.u.qual.level = network->stats.signal;
- }
-
- start = iwe_stream_add_event(start, stop, &iwe, IW_EV_QUAL_LEN);
-
- iwe.cmd = IWEVCUSTOM;
- p = custom;
-
- iwe.u.data.length = p - custom;
- if (iwe.u.data.length)
- start = iwe_stream_add_point(start, stop, &iwe, custom);
-
- memset(&iwe, 0, sizeof(iwe));
- if (network->wpa_ie_len) {
- char buf[MAX_WPA_IE_LEN];
- memcpy(buf, network->wpa_ie, network->wpa_ie_len);
- iwe.cmd = IWEVGENIE;
- iwe.u.data.length = network->wpa_ie_len;
- start = iwe_stream_add_point(start, stop, &iwe, buf);
- }
-
- memset(&iwe, 0, sizeof(iwe));
- if (network->rsn_ie_len) {
- char buf[MAX_WPA_IE_LEN];
- memcpy(buf, network->rsn_ie, network->rsn_ie_len);
- iwe.cmd = IWEVGENIE;
- iwe.u.data.length = network->rsn_ie_len;
- start = iwe_stream_add_point(start, stop, &iwe, buf);
- }
-
- /* Add EXTRA: Age to display seconds since last beacon/probe response
- * for given network. */
- iwe.cmd = IWEVCUSTOM;
- p = custom;
- p += snprintf(p, MAX_CUSTOM_LEN - (p - custom),
- " Last beacon: %dms ago",
- jiffies_to_msecs(jiffies - network->last_scanned));
- iwe.u.data.length = p - custom;
- if (iwe.u.data.length)
- start = iwe_stream_add_point(start, stop, &iwe, custom);
-
- /* Add spectrum management information */
- iwe.cmd = -1;
- p = custom;
- p += snprintf(p, MAX_CUSTOM_LEN - (p - custom), " Channel flags: ");
-
- if (ieee80211_get_channel_flags(ieee, network->channel) &
- IEEE80211_CH_INVALID) {
- iwe.cmd = IWEVCUSTOM;
- p += snprintf(p, MAX_CUSTOM_LEN - (p - custom), "INVALID ");
- }
-
- if (ieee80211_get_channel_flags(ieee, network->channel) &
- IEEE80211_CH_RADAR_DETECT) {
- iwe.cmd = IWEVCUSTOM;
- p += snprintf(p, MAX_CUSTOM_LEN - (p - custom), "DFS ");
- }
-
- if (iwe.cmd == IWEVCUSTOM) {
- iwe.u.data.length = p - custom;
- start = iwe_stream_add_point(start, stop, &iwe, custom);
- }
-
- return start;
-}
-
-#define SCAN_ITEM_SIZE 128
-
-int ieee80211_wx_get_scan(struct ieee80211_device *ieee,
- struct iw_request_info *info,
- union iwreq_data *wrqu, char *extra)
-{
- struct ieee80211_network *network;
- unsigned long flags;
- int err = 0;
-
- char *ev = extra;
- char *stop = ev + wrqu->data.length;
- int i = 0;
-
- IEEE80211_DEBUG_WX("Getting scan\n");
-
- spin_lock_irqsave(&ieee->lock, flags);
-
- list_for_each_entry(network, &ieee->network_list, list) {
- i++;
- if (stop - ev < SCAN_ITEM_SIZE) {
- err = -E2BIG;
- break;
- }
-
- if (ieee->scan_age == 0 ||
- time_after(network->last_scanned + ieee->scan_age, jiffies))
- ev = ieee80211_translate_scan(ieee, ev, stop, network);
- else
- IEEE80211_DEBUG_SCAN("Not showing network '%s ("
- MAC_FMT ")' due to age (%dms).\n",
- escape_essid(network->ssid,
- network->ssid_len),
- MAC_ARG(network->bssid),
- jiffies_to_msecs(jiffies -
- network->
- last_scanned));
- }
-
- spin_unlock_irqrestore(&ieee->lock, flags);
-
- wrqu->data.length = ev - extra;
- wrqu->data.flags = 0;
-
- IEEE80211_DEBUG_WX("exit: %d networks returned.\n", i);
-
- return err;
-}
-
-int ieee80211_wx_set_encode(struct ieee80211_device *ieee,
- struct iw_request_info *info,
- union iwreq_data *wrqu, char *keybuf)
-{
- struct iw_point *erq = &(wrqu->encoding);
- struct net_device *dev = ieee->dev;
- struct ieee80211_security sec = {
- .flags = 0
- };
- int i, key, key_provided, len;
- struct ieee80211_crypt_data **crypt;
- int host_crypto = ieee->host_encrypt || ieee->host_decrypt || ieee->host_build_iv;
-
- IEEE80211_DEBUG_WX("SET_ENCODE\n");
-
- key = erq->flags & IW_ENCODE_INDEX;
- if (key) {
- if (key > WEP_KEYS)
- return -EINVAL;
- key--;
- key_provided = 1;
- } else {
- key_provided = 0;
- key = ieee->tx_keyidx;
- }
-
- IEEE80211_DEBUG_WX("Key: %d [%s]\n", key, key_provided ?
- "provided" : "default");
-
- crypt = &ieee->crypt[key];
-
- if (erq->flags & IW_ENCODE_DISABLED) {
- if (key_provided && *crypt) {
- IEEE80211_DEBUG_WX("Disabling encryption on key %d.\n",
- key);
- ieee80211_crypt_delayed_deinit(ieee, crypt);
- } else
- IEEE80211_DEBUG_WX("Disabling encryption.\n");
-
- /* Check all the keys to see if any are still configured,
- * and if no key index was provided, de-init them all */
- for (i = 0; i < WEP_KEYS; i++) {
- if (ieee->crypt[i] != NULL) {
- if (key_provided)
- break;
- ieee80211_crypt_delayed_deinit(ieee,
- &ieee->crypt[i]);
- }
- }
-
- if (i == WEP_KEYS) {
- sec.enabled = 0;
- sec.encrypt = 0;
- sec.level = SEC_LEVEL_0;
- sec.flags |= SEC_ENABLED | SEC_LEVEL | SEC_ENCRYPT;
- }
-
- goto done;
- }
-
- sec.enabled = 1;
- sec.encrypt = 1;
- sec.flags |= SEC_ENABLED | SEC_ENCRYPT;
-
- if (*crypt != NULL && (*crypt)->ops != NULL &&
- strcmp((*crypt)->ops->name, "WEP") != 0) {
- /* changing to use WEP; deinit previously used algorithm
- * on this key */
- ieee80211_crypt_delayed_deinit(ieee, crypt);
- }
-
- if (*crypt == NULL && host_crypto) {
- struct ieee80211_crypt_data *new_crypt;
-
- /* take WEP into use */
- new_crypt = kzalloc(sizeof(struct ieee80211_crypt_data),
- GFP_KERNEL);
- if (new_crypt == NULL)
- return -ENOMEM;
- new_crypt->ops = ieee80211_get_crypto_ops("WEP");
- if (!new_crypt->ops) {
- request_module("ieee80211_crypt_wep");
- new_crypt->ops = ieee80211_get_crypto_ops("WEP");
- }
-
- if (new_crypt->ops && try_module_get(new_crypt->ops->owner))
- new_crypt->priv = new_crypt->ops->init(key);
-
- if (!new_crypt->ops || !new_crypt->priv) {
- kfree(new_crypt);
- new_crypt = NULL;
-
- printk(KERN_WARNING "%s: could not initialize WEP: "
- "load module ieee80211_crypt_wep\n", dev->name);
- return -EOPNOTSUPP;
- }
- *crypt = new_crypt;
- }
-
- /* If a new key was provided, set it up */
- if (erq->length > 0) {
- len = erq->length <= 5 ? 5 : 13;
- memcpy(sec.keys[key], keybuf, erq->length);
- if (len > erq->length)
- memset(sec.keys[key] + erq->length, 0,
- len - erq->length);
- IEEE80211_DEBUG_WX("Setting key %d to '%s' (%d:%d bytes)\n",
- key, escape_essid(sec.keys[key], len),
- erq->length, len);
- sec.key_sizes[key] = len;
- if (*crypt)
- (*crypt)->ops->set_key(sec.keys[key], len, NULL,
- (*crypt)->priv);
- sec.flags |= (1 << key);
- /* This ensures a key will be activated if no key is
- * explicitely set */
- if (key == sec.active_key)
- sec.flags |= SEC_ACTIVE_KEY;
-
- } else {
- if (host_crypto) {
- len = (*crypt)->ops->get_key(sec.keys[key], WEP_KEY_LEN,
- NULL, (*crypt)->priv);
- if (len == 0) {
- /* Set a default key of all 0 */
- IEEE80211_DEBUG_WX("Setting key %d to all "
- "zero.\n", key);
- memset(sec.keys[key], 0, 13);
- (*crypt)->ops->set_key(sec.keys[key], 13, NULL,
- (*crypt)->priv);
- sec.key_sizes[key] = 13;
- sec.flags |= (1 << key);
- }
- }
- /* No key data - just set the default TX key index */
- if (key_provided) {
- IEEE80211_DEBUG_WX("Setting key %d to default Tx "
- "key.\n", key);
- ieee->tx_keyidx = key;
- sec.active_key = key;
- sec.flags |= SEC_ACTIVE_KEY;
- }
- }
- if (erq->flags & (IW_ENCODE_OPEN | IW_ENCODE_RESTRICTED)) {
- ieee->open_wep = !(erq->flags & IW_ENCODE_RESTRICTED);
- sec.auth_mode = ieee->open_wep ? WLAN_AUTH_OPEN :
- WLAN_AUTH_SHARED_KEY;
- sec.flags |= SEC_AUTH_MODE;
- IEEE80211_DEBUG_WX("Auth: %s\n",
- sec.auth_mode == WLAN_AUTH_OPEN ?
- "OPEN" : "SHARED KEY");
- }
-
- /* For now we just support WEP, so only set that security level...
- * TODO: When WPA is added this is one place that needs to change */
- sec.flags |= SEC_LEVEL;
- sec.level = SEC_LEVEL_1; /* 40 and 104 bit WEP */
- sec.encode_alg[key] = SEC_ALG_WEP;
-
- done:
- if (ieee->set_security)
- ieee->set_security(dev, &sec);
-
- /* Do not reset port if card is in Managed mode since resetting will
- * generate new IEEE 802.11 authentication which may end up in looping
- * with IEEE 802.1X. If your hardware requires a reset after WEP
- * configuration (for example... Prism2), implement the reset_port in
- * the callbacks structures used to initialize the 802.11 stack. */
- if (ieee->reset_on_keychange &&
- ieee->iw_mode != IW_MODE_INFRA &&
- ieee->reset_port && ieee->reset_port(dev)) {
- printk(KERN_DEBUG "%s: reset_port failed\n", dev->name);
- return -EINVAL;
- }
- return 0;
-}
-
-int ieee80211_wx_get_encode(struct ieee80211_device *ieee,
- struct iw_request_info *info,
- union iwreq_data *wrqu, char *keybuf)
-{
- struct iw_point *erq = &(wrqu->encoding);
- int len, key;
- struct ieee80211_crypt_data *crypt;
- struct ieee80211_security *sec = &ieee->sec;
-
- IEEE80211_DEBUG_WX("GET_ENCODE\n");
-
- key = erq->flags & IW_ENCODE_INDEX;
- if (key) {
- if (key > WEP_KEYS)
- return -EINVAL;
- key--;
- } else
- key = ieee->tx_keyidx;
-
- crypt = ieee->crypt[key];
- erq->flags = key + 1;
-
- if (!sec->enabled) {
- erq->length = 0;
- erq->flags |= IW_ENCODE_DISABLED;
- return 0;
- }
-
- len = sec->key_sizes[key];
- memcpy(keybuf, sec->keys[key], len);
-
- erq->length = len;
- erq->flags |= IW_ENCODE_ENABLED;
-
- if (ieee->open_wep)
- erq->flags |= IW_ENCODE_OPEN;
- else
- erq->flags |= IW_ENCODE_RESTRICTED;
-
- return 0;
-}
-
-int ieee80211_wx_set_encodeext(struct ieee80211_device *ieee,
- struct iw_request_info *info,
- union iwreq_data *wrqu, char *extra)
-{
- struct net_device *dev = ieee->dev;
- struct iw_point *encoding = &wrqu->encoding;
- struct iw_encode_ext *ext = (struct iw_encode_ext *)extra;
- int i, idx, ret = 0;
- int group_key = 0;
- const char *alg, *module;
- struct ieee80211_crypto_ops *ops;
- struct ieee80211_crypt_data **crypt;
-
- struct ieee80211_security sec = {
- .flags = 0,
- };
-
- idx = encoding->flags & IW_ENCODE_INDEX;
- if (idx) {
- if (idx < 1 || idx > WEP_KEYS)
- return -EINVAL;
- idx--;
- } else
- idx = ieee->tx_keyidx;
-
- if (ext->ext_flags & IW_ENCODE_EXT_GROUP_KEY) {
- crypt = &ieee->crypt[idx];
- group_key = 1;
- } else {
- /* some Cisco APs use idx>0 for unicast in dynamic WEP */
- if (idx != 0 && ext->alg != IW_ENCODE_ALG_WEP)
- return -EINVAL;
- if (ieee->iw_mode == IW_MODE_INFRA)
- crypt = &ieee->crypt[idx];
- else
- return -EINVAL;
- }
-
- sec.flags |= SEC_ENABLED | SEC_ENCRYPT;
- if ((encoding->flags & IW_ENCODE_DISABLED) ||
- ext->alg == IW_ENCODE_ALG_NONE) {
- if (*crypt)
- ieee80211_crypt_delayed_deinit(ieee, crypt);
-
- for (i = 0; i < WEP_KEYS; i++)
- if (ieee->crypt[i] != NULL)
- break;
-
- if (i == WEP_KEYS) {
- sec.enabled = 0;
- sec.encrypt = 0;
- sec.level = SEC_LEVEL_0;
- sec.flags |= SEC_LEVEL;
- }
- goto done;
- }
-
- sec.enabled = 1;
- sec.encrypt = 1;
-
- if (group_key ? !ieee->host_mc_decrypt :
- !(ieee->host_encrypt || ieee->host_decrypt ||
- ieee->host_encrypt_msdu))
- goto skip_host_crypt;
-
- switch (ext->alg) {
- case IW_ENCODE_ALG_WEP:
- alg = "WEP";
- module = "ieee80211_crypt_wep";
- break;
- case IW_ENCODE_ALG_TKIP:
- alg = "TKIP";
- module = "ieee80211_crypt_tkip";
- break;
- case IW_ENCODE_ALG_CCMP:
- alg = "CCMP";
- module = "ieee80211_crypt_ccmp";
- break;
- default:
- IEEE80211_DEBUG_WX("%s: unknown crypto alg %d\n",
- dev->name, ext->alg);
- ret = -EINVAL;
- goto done;
- }
-
- ops = ieee80211_get_crypto_ops(alg);
- if (ops == NULL) {
- request_module(module);
- ops = ieee80211_get_crypto_ops(alg);
- }
- if (ops == NULL) {
- IEEE80211_DEBUG_WX("%s: unknown crypto alg %d\n",
- dev->name, ext->alg);
- ret = -EINVAL;
- goto done;
- }
-
- if (*crypt == NULL || (*crypt)->ops != ops) {
- struct ieee80211_crypt_data *new_crypt;
-
- ieee80211_crypt_delayed_deinit(ieee, crypt);
-
- new_crypt = kzalloc(sizeof(*new_crypt), GFP_KERNEL);
- if (new_crypt == NULL) {
- ret = -ENOMEM;
- goto done;
- }
- new_crypt->ops = ops;
- if (new_crypt->ops && try_module_get(new_crypt->ops->owner))
- new_crypt->priv = new_crypt->ops->init(idx);
- if (new_crypt->priv == NULL) {
- kfree(new_crypt);
- ret = -EINVAL;
- goto done;
- }
- *crypt = new_crypt;
- }
-
- if (ext->key_len > 0 && (*crypt)->ops->set_key &&
- (*crypt)->ops->set_key(ext->key, ext->key_len, ext->rx_seq,
- (*crypt)->priv) < 0) {
- IEEE80211_DEBUG_WX("%s: key setting failed\n", dev->name);
- ret = -EINVAL;
- goto done;
- }
-
- skip_host_crypt:
- if (ext->ext_flags & IW_ENCODE_EXT_SET_TX_KEY) {
- ieee->tx_keyidx = idx;
- sec.active_key = idx;
- sec.flags |= SEC_ACTIVE_KEY;
- }
-
- if (ext->alg != IW_ENCODE_ALG_NONE) {
- memcpy(sec.keys[idx], ext->key, ext->key_len);
- sec.key_sizes[idx] = ext->key_len;
- sec.flags |= (1 << idx);
- if (ext->alg == IW_ENCODE_ALG_WEP) {
- sec.encode_alg[idx] = SEC_ALG_WEP;
- sec.flags |= SEC_LEVEL;
- sec.level = SEC_LEVEL_1;
- } else if (ext->alg == IW_ENCODE_ALG_TKIP) {
- sec.encode_alg[idx] = SEC_ALG_TKIP;
- sec.flags |= SEC_LEVEL;
- sec.level = SEC_LEVEL_2;
- } else if (ext->alg == IW_ENCODE_ALG_CCMP) {
- sec.encode_alg[idx] = SEC_ALG_CCMP;
- sec.flags |= SEC_LEVEL;
- sec.level = SEC_LEVEL_3;
- }
- /* Don't set sec level for group keys. */
- if (group_key)
- sec.flags &= ~SEC_LEVEL;
- }
- done:
- if (ieee->set_security)
- ieee->set_security(ieee->dev, &sec);
-
- /*
- * Do not reset port if card is in Managed mode since resetting will
- * generate new IEEE 802.11 authentication which may end up in looping
- * with IEEE 802.1X. If your hardware requires a reset after WEP
- * configuration (for example... Prism2), implement the reset_port in
- * the callbacks structures used to initialize the 802.11 stack.
- */
- if (ieee->reset_on_keychange &&
- ieee->iw_mode != IW_MODE_INFRA &&
- ieee->reset_port && ieee->reset_port(dev)) {
- IEEE80211_DEBUG_WX("%s: reset_port failed\n", dev->name);
- return -EINVAL;
- }
-
- return ret;
-}
-
-int ieee80211_wx_get_encodeext(struct ieee80211_device *ieee,
- struct iw_request_info *info,
- union iwreq_data *wrqu, char *extra)
-{
- struct iw_point *encoding = &wrqu->encoding;
- struct iw_encode_ext *ext = (struct iw_encode_ext *)extra;
- struct ieee80211_security *sec = &ieee->sec;
- int idx, max_key_len;
-
- max_key_len = encoding->length - sizeof(*ext);
- if (max_key_len < 0)
- return -EINVAL;
-
- idx = encoding->flags & IW_ENCODE_INDEX;
- if (idx) {
- if (idx < 1 || idx > WEP_KEYS)
- return -EINVAL;
- idx--;
- } else
- idx = ieee->tx_keyidx;
-
- if (!ext->ext_flags & IW_ENCODE_EXT_GROUP_KEY &&
- ext->alg != IW_ENCODE_ALG_WEP)
- if (idx != 0 || ieee->iw_mode != IW_MODE_INFRA)
- return -EINVAL;
-
- encoding->flags = idx + 1;
- memset(ext, 0, sizeof(*ext));
-
- if (!sec->enabled) {
- ext->alg = IW_ENCODE_ALG_NONE;
- ext->key_len = 0;
- encoding->flags |= IW_ENCODE_DISABLED;
- } else {
- if (sec->encode_alg[idx] == SEC_ALG_WEP)
- ext->alg = IW_ENCODE_ALG_WEP;
- else if (sec->encode_alg[idx] == SEC_ALG_TKIP)
- ext->alg = IW_ENCODE_ALG_TKIP;
- else if (sec->encode_alg[idx] == SEC_ALG_CCMP)
- ext->alg = IW_ENCODE_ALG_CCMP;
- else
- return -EINVAL;
-
- ext->key_len = sec->key_sizes[idx];
- memcpy(ext->key, sec->keys[idx], ext->key_len);
- encoding->flags |= IW_ENCODE_ENABLED;
- if (ext->key_len &&
- (ext->alg == IW_ENCODE_ALG_TKIP ||
- ext->alg == IW_ENCODE_ALG_CCMP))
- ext->ext_flags |= IW_ENCODE_EXT_TX_SEQ_VALID;
-
- }
-
- return 0;
-}
-
-int ieee80211_wx_set_auth(struct net_device *dev,
- struct iw_request_info *info,
- union iwreq_data *wrqu,
- char *extra)
-{
- struct ieee80211_device *ieee = netdev_priv(dev);
- unsigned long flags;
- int err = 0;
-
- spin_lock_irqsave(&ieee->lock, flags);
-
- switch (wrqu->param.flags & IW_AUTH_INDEX) {
- case IW_AUTH_WPA_VERSION:
- case IW_AUTH_CIPHER_PAIRWISE:
- case IW_AUTH_CIPHER_GROUP:
- case IW_AUTH_KEY_MGMT:
- /*
- * Host AP driver does not use these parameters and allows
- * wpa_supplicant to control them internally.
- */
- break;
- case IW_AUTH_TKIP_COUNTERMEASURES:
- break; /* FIXME */
- case IW_AUTH_DROP_UNENCRYPTED:
- ieee->drop_unencrypted = !!wrqu->param.value;
- break;
- case IW_AUTH_80211_AUTH_ALG:
- break; /* FIXME */
- case IW_AUTH_WPA_ENABLED:
- ieee->privacy_invoked = ieee->wpa_enabled = !!wrqu->param.value;
- break;
- case IW_AUTH_RX_UNENCRYPTED_EAPOL:
- ieee->ieee802_1x = !!wrqu->param.value;
- break;
- case IW_AUTH_PRIVACY_INVOKED:
- ieee->privacy_invoked = !!wrqu->param.value;
- break;
- default:
- err = -EOPNOTSUPP;
- break;
- }
- spin_unlock_irqrestore(&ieee->lock, flags);
- return err;
-}
-
-int ieee80211_wx_get_auth(struct net_device *dev,
- struct iw_request_info *info,
- union iwreq_data *wrqu,
- char *extra)
-{
- struct ieee80211_device *ieee = netdev_priv(dev);
- unsigned long flags;
- int err = 0;
-
- spin_lock_irqsave(&ieee->lock, flags);
-
- switch (wrqu->param.flags & IW_AUTH_INDEX) {
- case IW_AUTH_WPA_VERSION:
- case IW_AUTH_CIPHER_PAIRWISE:
- case IW_AUTH_CIPHER_GROUP:
- case IW_AUTH_KEY_MGMT:
- case IW_AUTH_TKIP_COUNTERMEASURES: /* FIXME */
- case IW_AUTH_80211_AUTH_ALG: /* FIXME */
- /*
- * Host AP driver does not use these parameters and allows
- * wpa_supplicant to control them internally.
- */
- err = -EOPNOTSUPP;
- break;
- case IW_AUTH_DROP_UNENCRYPTED:
- wrqu->param.value = ieee->drop_unencrypted;
- break;
- case IW_AUTH_WPA_ENABLED:
- wrqu->param.value = ieee->wpa_enabled;
- break;
- case IW_AUTH_RX_UNENCRYPTED_EAPOL:
- wrqu->param.value = ieee->ieee802_1x;
- break;
- default:
- err = -EOPNOTSUPP;
- break;
- }
- spin_unlock_irqrestore(&ieee->lock, flags);
- return err;
-}
-
-EXPORT_SYMBOL(ieee80211_wx_set_encodeext);
-EXPORT_SYMBOL(ieee80211_wx_get_encodeext);
-
-EXPORT_SYMBOL(ieee80211_wx_get_scan);
-EXPORT_SYMBOL(ieee80211_wx_set_encode);
-EXPORT_SYMBOL(ieee80211_wx_get_encode);
-
-EXPORT_SYMBOL_GPL(ieee80211_wx_set_auth);
-EXPORT_SYMBOL_GPL(ieee80211_wx_get_auth);
diff --git a/net/ieee80211/softmac/Kconfig b/net/ieee80211/softmac/Kconfig
deleted file mode 100644
index 2811651cb134..000000000000
--- a/net/ieee80211/softmac/Kconfig
+++ /dev/null
@@ -1,12 +0,0 @@
-config IEEE80211_SOFTMAC
- tristate "Software MAC add-on to the IEEE 802.11 networking stack"
- depends on IEEE80211 && EXPERIMENTAL
- select WIRELESS_EXT
- select IEEE80211_CRYPT_WEP
- ---help---
- This option enables the hardware independent software MAC addon
- for the IEEE 802.11 networking stack.
-
-config IEEE80211_SOFTMAC_DEBUG
- bool "Enable full debugging output"
- depends on IEEE80211_SOFTMAC
diff --git a/net/ieee80211/softmac/Makefile b/net/ieee80211/softmac/Makefile
deleted file mode 100644
index bfcb391bb2c7..000000000000
--- a/net/ieee80211/softmac/Makefile
+++ /dev/null
@@ -1,9 +0,0 @@
-obj-$(CONFIG_IEEE80211_SOFTMAC) += ieee80211softmac.o
-ieee80211softmac-objs := \
- ieee80211softmac_io.o \
- ieee80211softmac_auth.o \
- ieee80211softmac_module.o \
- ieee80211softmac_scan.o \
- ieee80211softmac_wx.o \
- ieee80211softmac_assoc.o \
- ieee80211softmac_event.o
diff --git a/net/ieee80211/softmac/ieee80211softmac_assoc.c b/net/ieee80211/softmac/ieee80211softmac_assoc.c
deleted file mode 100644
index cf51c87a971d..000000000000
--- a/net/ieee80211/softmac/ieee80211softmac_assoc.c
+++ /dev/null
@@ -1,472 +0,0 @@
-/*
- * This file contains the softmac's association logic.
- *
- * Copyright (c) 2005, 2006 Johannes Berg <johannes@sipsolutions.net>
- * Joseph Jezak <josejx@gentoo.org>
- * Larry Finger <Larry.Finger@lwfinger.net>
- * Danny van Dyk <kugelfang@gentoo.org>
- * Michael Buesch <mbuesch@freenet.de>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * The full GNU General Public License is included in this distribution in the
- * file called COPYING.
- */
-
-#include "ieee80211softmac_priv.h"
-
-/*
- * Overview
- *
- * Before you can associate, you have to authenticate.
- *
- */
-
-/* Sends out an association request to the desired AP */
-static void
-ieee80211softmac_assoc(struct ieee80211softmac_device *mac, struct ieee80211softmac_network *net)
-{
- unsigned long flags;
-
- /* Switch to correct channel for this network */
- mac->set_channel(mac->dev, net->channel);
-
- /* Send association request */
- ieee80211softmac_send_mgt_frame(mac, net, IEEE80211_STYPE_ASSOC_REQ, 0);
-
- dprintk(KERN_INFO PFX "sent association request!\n");
-
- spin_lock_irqsave(&mac->lock, flags);
- mac->associnfo.associated = 0; /* just to make sure */
-
- /* Set a timer for timeout */
- /* FIXME: make timeout configurable */
- if (likely(mac->running))
- schedule_delayed_work(&mac->associnfo.timeout, 5 * HZ);
- spin_unlock_irqrestore(&mac->lock, flags);
-}
-
-void
-ieee80211softmac_assoc_timeout(void *d)
-{
- struct ieee80211softmac_device *mac = (struct ieee80211softmac_device *)d;
- struct ieee80211softmac_network *n;
-
- mutex_lock(&mac->associnfo.mutex);
- /* we might race against ieee80211softmac_handle_assoc_response,
- * so make sure only one of us does something */
- if (!mac->associnfo.associating)
- goto out;
- mac->associnfo.associating = 0;
- mac->associnfo.bssvalid = 0;
- mac->associnfo.associated = 0;
-
- n = ieee80211softmac_get_network_by_bssid_locked(mac, mac->associnfo.bssid);
-
- dprintk(KERN_INFO PFX "assoc request timed out!\n");
- ieee80211softmac_call_events(mac, IEEE80211SOFTMAC_EVENT_ASSOCIATE_TIMEOUT, n);
-out:
- mutex_unlock(&mac->associnfo.mutex);
-}
-
-void
-ieee80211softmac_disassoc(struct ieee80211softmac_device *mac)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&mac->lock, flags);
- if (mac->associnfo.associating)
- cancel_delayed_work(&mac->associnfo.timeout);
-
- netif_carrier_off(mac->dev);
-
- mac->associnfo.associated = 0;
- mac->associnfo.bssvalid = 0;
- mac->associnfo.associating = 0;
- ieee80211softmac_init_bss(mac);
- ieee80211softmac_call_events_locked(mac, IEEE80211SOFTMAC_EVENT_DISASSOCIATED, NULL);
- spin_unlock_irqrestore(&mac->lock, flags);
-}
-
-/* Sends out a disassociation request to the desired AP */
-void
-ieee80211softmac_send_disassoc_req(struct ieee80211softmac_device *mac, u16 reason)
-{
- struct ieee80211softmac_network *found;
-
- if (mac->associnfo.bssvalid && mac->associnfo.associated) {
- found = ieee80211softmac_get_network_by_bssid(mac, mac->associnfo.bssid);
- if (found)
- ieee80211softmac_send_mgt_frame(mac, found, IEEE80211_STYPE_DISASSOC, reason);
- }
-
- ieee80211softmac_disassoc(mac);
-}
-
-static inline int
-we_support_all_basic_rates(struct ieee80211softmac_device *mac, u8 *from, u8 from_len)
-{
- int idx;
- u8 rate;
-
- for (idx = 0; idx < (from_len); idx++) {
- rate = (from)[idx];
- if (!(rate & IEEE80211_BASIC_RATE_MASK))
- continue;
- rate &= ~IEEE80211_BASIC_RATE_MASK;
- if (!ieee80211softmac_ratesinfo_rate_supported(&mac->ratesinfo, rate))
- return 0;
- }
- return 1;
-}
-
-static int
-network_matches_request(struct ieee80211softmac_device *mac, struct ieee80211_network *net)
-{
- /* we cannot associate to networks whose name we don't know */
- if (ieee80211_is_empty_essid(net->ssid, net->ssid_len))
- return 0;
- /* do not associate to a network whose BSSBasicRateSet we cannot support */
- if (!we_support_all_basic_rates(mac, net->rates, net->rates_len))
- return 0;
- /* do we really need to check the ex rates? */
- if (!we_support_all_basic_rates(mac, net->rates_ex, net->rates_ex_len))
- return 0;
-
- /* assume that users know what they're doing ...
- * (note we don't let them select a net we're incompatible with) */
- if (mac->associnfo.bssfixed) {
- return !memcmp(mac->associnfo.bssid, net->bssid, ETH_ALEN);
- }
-
- /* if 'ANY' network requested, take any that doesn't have privacy enabled */
- if (mac->associnfo.req_essid.len == 0
- && !(net->capability & WLAN_CAPABILITY_PRIVACY))
- return 1;
- if (net->ssid_len != mac->associnfo.req_essid.len)
- return 0;
- if (!memcmp(net->ssid, mac->associnfo.req_essid.data, mac->associnfo.req_essid.len))
- return 1;
- return 0;
-}
-
-static void
-ieee80211softmac_assoc_notify_scan(struct net_device *dev, int event_type, void *context)
-{
- struct ieee80211softmac_device *mac = ieee80211_priv(dev);
- ieee80211softmac_assoc_work((void*)mac);
-}
-
-static void
-ieee80211softmac_assoc_notify_auth(struct net_device *dev, int event_type, void *context)
-{
- struct ieee80211softmac_device *mac = ieee80211_priv(dev);
-
- switch (event_type) {
- case IEEE80211SOFTMAC_EVENT_AUTHENTICATED:
- ieee80211softmac_assoc_work((void*)mac);
- break;
- case IEEE80211SOFTMAC_EVENT_AUTH_FAILED:
- case IEEE80211SOFTMAC_EVENT_AUTH_TIMEOUT:
- ieee80211softmac_disassoc(mac);
- break;
- }
-}
-
-/* This function is called to handle userspace requests (asynchronously) */
-void
-ieee80211softmac_assoc_work(void *d)
-{
- struct ieee80211softmac_device *mac = (struct ieee80211softmac_device *)d;
- struct ieee80211softmac_network *found = NULL;
- struct ieee80211_network *net = NULL, *best = NULL;
- int bssvalid;
- unsigned long flags;
-
- mutex_lock(&mac->associnfo.mutex);
-
- if (!mac->associnfo.associating)
- goto out;
-
- /* ieee80211_disassoc might clear this */
- bssvalid = mac->associnfo.bssvalid;
-
- /* meh */
- if (mac->associnfo.associated)
- ieee80211softmac_send_disassoc_req(mac, WLAN_REASON_DISASSOC_STA_HAS_LEFT);
-
- /* try to find the requested network in our list, if we found one already */
- if (bssvalid || mac->associnfo.bssfixed)
- found = ieee80211softmac_get_network_by_bssid(mac, mac->associnfo.bssid);
-
- /* Search the ieee80211 networks for this network if we didn't find it by bssid,
- * but only if we've scanned at least once (to get a better list of networks to
- * select from). If we have not scanned before, the !found logic below will be
- * invoked and will scan. */
- if (!found && (mac->associnfo.scan_retry < IEEE80211SOFTMAC_ASSOC_SCAN_RETRY_LIMIT))
- {
- s8 rssi = -128; /* if I don't initialise, gcc emits an invalid warning
- because it cannot follow the best pointer logic. */
- spin_lock_irqsave(&mac->ieee->lock, flags);
- list_for_each_entry(net, &mac->ieee->network_list, list) {
- /* we're supposed to find the network with
- * the best signal here, as we're asked to join
- * any network with a specific ESSID, and many
- * different ones could have that.
- *
- * I'll for now just go with the reported rssi.
- *
- * We also should take into account the rateset
- * here to find the best BSSID to try.
- */
- if (network_matches_request(mac, net)) {
- if (!best) {
- best = net;
- rssi = best->stats.rssi;
- continue;
- }
- /* we already had a matching network, so
- * compare their properties to get the
- * better of the two ... (see above)
- */
- if (rssi < net->stats.rssi) {
- best = net;
- rssi = best->stats.rssi;
- }
- }
- }
- /* if we unlock here, we might get interrupted and the `best'
- * pointer could go stale */
- if (best) {
- found = ieee80211softmac_create_network(mac, best);
- /* if found is still NULL, then we got -ENOMEM somewhere */
- if (found)
- ieee80211softmac_add_network(mac, found);
- }
- spin_unlock_irqrestore(&mac->ieee->lock, flags);
- }
-
- if (!found) {
- if (mac->associnfo.scan_retry > 0) {
- mac->associnfo.scan_retry--;
-
- /* We know of no such network. Let's scan.
- * NB: this also happens if we had no memory to copy the network info...
- * Maybe we can hope to have more memory after scanning finishes ;)
- */
- dprintk(KERN_INFO PFX "Associate: Scanning for networks first.\n");
- ieee80211softmac_notify(mac->dev, IEEE80211SOFTMAC_EVENT_SCAN_FINISHED, ieee80211softmac_assoc_notify_scan, NULL);
- if (ieee80211softmac_start_scan(mac))
- dprintk(KERN_INFO PFX "Associate: failed to initiate scan. Is device up?\n");
- goto out;
- } else {
- mac->associnfo.associating = 0;
- mac->associnfo.associated = 0;
-
- dprintk(KERN_INFO PFX "Unable to find matching network after scan!\n");
- /* reset the retry counter for the next user request since we
- * break out and don't reschedule ourselves after this point. */
- mac->associnfo.scan_retry = IEEE80211SOFTMAC_ASSOC_SCAN_RETRY_LIMIT;
- ieee80211softmac_call_events(mac, IEEE80211SOFTMAC_EVENT_ASSOCIATE_NET_NOT_FOUND, NULL);
- goto out;
- }
- }
-
- /* reset the retry counter for the next user request since we
- * now found a net and will try to associate to it, but not
- * schedule this function again. */
- mac->associnfo.scan_retry = IEEE80211SOFTMAC_ASSOC_SCAN_RETRY_LIMIT;
- mac->associnfo.bssvalid = 1;
- memcpy(mac->associnfo.bssid, found->bssid, ETH_ALEN);
- /* copy the ESSID for displaying it */
- mac->associnfo.associate_essid.len = found->essid.len;
- memcpy(mac->associnfo.associate_essid.data, found->essid.data, IW_ESSID_MAX_SIZE + 1);
-
- /* we found a network! authenticate (if necessary) and associate to it. */
- if (found->authenticating) {
- dprintk(KERN_INFO PFX "Already requested authentication, waiting...\n");
- if(!mac->associnfo.assoc_wait) {
- mac->associnfo.assoc_wait = 1;
- ieee80211softmac_notify_internal(mac, IEEE80211SOFTMAC_EVENT_ANY, found, ieee80211softmac_assoc_notify_auth, NULL, GFP_KERNEL);
- }
- goto out;
- }
- if (!found->authenticated && !found->authenticating) {
- /* This relies on the fact that _auth_req only queues the work,
- * otherwise adding the notification would be racy. */
- if (!ieee80211softmac_auth_req(mac, found)) {
- if(!mac->associnfo.assoc_wait) {
- dprintk(KERN_INFO PFX "Cannot associate without being authenticated, requested authentication\n");
- mac->associnfo.assoc_wait = 1;
- ieee80211softmac_notify_internal(mac, IEEE80211SOFTMAC_EVENT_ANY, found, ieee80211softmac_assoc_notify_auth, NULL, GFP_KERNEL);
- }
- } else {
- printkl(KERN_WARNING PFX "Not authenticated, but requesting authentication failed. Giving up to associate\n");
- mac->associnfo.assoc_wait = 0;
- ieee80211softmac_call_events(mac, IEEE80211SOFTMAC_EVENT_ASSOCIATE_FAILED, found);
- }
- goto out;
- }
- /* finally! now we can start associating */
- mac->associnfo.assoc_wait = 0;
- ieee80211softmac_assoc(mac, found);
-
-out:
- mutex_unlock(&mac->associnfo.mutex);
-}
-
-/* call this to do whatever is necessary when we're associated */
-static void
-ieee80211softmac_associated(struct ieee80211softmac_device *mac,
- struct ieee80211_assoc_response * resp,
- struct ieee80211softmac_network *net)
-{
- u16 cap = le16_to_cpu(resp->capability);
- u8 erp_value = net->erp_value;
-
- mac->associnfo.associating = 0;
- mac->bssinfo.supported_rates = net->supported_rates;
- ieee80211softmac_recalc_txrates(mac);
-
- mac->associnfo.associated = 1;
-
- mac->associnfo.short_preamble_available =
- (cap & WLAN_CAPABILITY_SHORT_PREAMBLE) != 0;
- ieee80211softmac_process_erp(mac, erp_value);
-
- if (mac->set_bssid_filter)
- mac->set_bssid_filter(mac->dev, net->bssid);
- memcpy(mac->ieee->bssid, net->bssid, ETH_ALEN);
- netif_carrier_on(mac->dev);
-
- mac->association_id = le16_to_cpup(&resp->aid);
-}
-
-/* received frame handling functions */
-int
-ieee80211softmac_handle_assoc_response(struct net_device * dev,
- struct ieee80211_assoc_response * resp,
- struct ieee80211_network * _ieee80211_network)
-{
- /* NOTE: the network parameter has to be mostly ignored by
- * this code because it is the ieee80211's pointer
- * to the struct, not ours (we made a copy)
- */
- struct ieee80211softmac_device *mac = ieee80211_priv(dev);
- u16 status = le16_to_cpup(&resp->status);
- struct ieee80211softmac_network *network = NULL;
- unsigned long flags;
-
- if (unlikely(!mac->running))
- return -ENODEV;
-
- spin_lock_irqsave(&mac->lock, flags);
-
- if (!mac->associnfo.associating) {
- /* we race against the timeout function, so make sure
- * only one of us can do work */
- spin_unlock_irqrestore(&mac->lock, flags);
- return 0;
- }
- network = ieee80211softmac_get_network_by_bssid_locked(mac, resp->header.addr3);
-
- /* someone sending us things without us knowing him? Ignore. */
- if (!network) {
- dprintk(KERN_INFO PFX "Received unrequested assocation response from " MAC_FMT "\n", MAC_ARG(resp->header.addr3));
- spin_unlock_irqrestore(&mac->lock, flags);
- return 0;
- }
-
- /* now that we know it was for us, we can cancel the timeout */
- cancel_delayed_work(&mac->associnfo.timeout);
-
- /* if the association response included an ERP IE, update our saved
- * copy */
- if (_ieee80211_network->flags & NETWORK_HAS_ERP_VALUE)
- network->erp_value = _ieee80211_network->erp_value;
-
- switch (status) {
- case 0:
- dprintk(KERN_INFO PFX "associated!\n");
- ieee80211softmac_associated(mac, resp, network);
- ieee80211softmac_call_events_locked(mac, IEEE80211SOFTMAC_EVENT_ASSOCIATED, network);
- break;
- case WLAN_REASON_STA_REQ_ASSOC_WITHOUT_AUTH:
- if (!network->auth_desynced_once) {
- /* there seem to be a few rare cases where our view of
- * the world is obscured, or buggy APs that don't DEAUTH
- * us properly. So we handle that, but allow it only once.
- */
- printkl(KERN_INFO PFX "We were not authenticated during association, retrying...\n");
- network->authenticated = 0;
- /* we don't want to do this more than once ... */
- network->auth_desynced_once = 1;
- schedule_work(&mac->associnfo.work);
- break;
- }
- default:
- dprintk(KERN_INFO PFX "associating failed (reason: 0x%x)!\n", status);
- mac->associnfo.associating = 0;
- mac->associnfo.bssvalid = 0;
- mac->associnfo.associated = 0;
- ieee80211softmac_call_events_locked(mac, IEEE80211SOFTMAC_EVENT_ASSOCIATE_FAILED, network);
- }
-
- spin_unlock_irqrestore(&mac->lock, flags);
- return 0;
-}
-
-int
-ieee80211softmac_handle_disassoc(struct net_device * dev,
- struct ieee80211_disassoc *disassoc)
-{
- struct ieee80211softmac_device *mac = ieee80211_priv(dev);
-
- if (unlikely(!mac->running))
- return -ENODEV;
-
- if (memcmp(disassoc->header.addr2, mac->associnfo.bssid, ETH_ALEN))
- return 0;
-
- if (memcmp(disassoc->header.addr1, mac->dev->dev_addr, ETH_ALEN))
- return 0;
-
- dprintk(KERN_INFO PFX "got disassoc frame\n");
- ieee80211softmac_disassoc(mac);
-
- /* try to reassociate */
- schedule_work(&mac->associnfo.work);
-
- return 0;
-}
-
-int
-ieee80211softmac_handle_reassoc_req(struct net_device * dev,
- struct ieee80211_reassoc_request * resp)
-{
- struct ieee80211softmac_device *mac = ieee80211_priv(dev);
- struct ieee80211softmac_network *network;
-
- if (unlikely(!mac->running))
- return -ENODEV;
-
- network = ieee80211softmac_get_network_by_bssid(mac, resp->header.addr3);
- if (!network) {
- dprintkl(KERN_INFO PFX "reassoc request from unknown network\n");
- return 0;
- }
- schedule_work(&mac->associnfo.work);
-
- return 0;
-}
diff --git a/net/ieee80211/softmac/ieee80211softmac_auth.c b/net/ieee80211/softmac/ieee80211softmac_auth.c
deleted file mode 100644
index 0612015f1c78..000000000000
--- a/net/ieee80211/softmac/ieee80211softmac_auth.c
+++ /dev/null
@@ -1,403 +0,0 @@
-/*
- * This file contains the softmac's authentication logic.
- *
- * Copyright (c) 2005, 2006 Johannes Berg <johannes@sipsolutions.net>
- * Joseph Jezak <josejx@gentoo.org>
- * Larry Finger <Larry.Finger@lwfinger.net>
- * Danny van Dyk <kugelfang@gentoo.org>
- * Michael Buesch <mbuesch@freenet.de>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * The full GNU General Public License is included in this distribution in the
- * file called COPYING.
- */
-
-#include "ieee80211softmac_priv.h"
-
-static void ieee80211softmac_auth_queue(void *data);
-
-/* Queues an auth request to the desired AP */
-int
-ieee80211softmac_auth_req(struct ieee80211softmac_device *mac,
- struct ieee80211softmac_network *net)
-{
- struct ieee80211softmac_auth_queue_item *auth;
- unsigned long flags;
-
- if (net->authenticating || net->authenticated)
- return 0;
- net->authenticating = 1;
-
- /* Add the network if it's not already added */
- ieee80211softmac_add_network(mac, net);
-
- dprintk(KERN_NOTICE PFX "Queueing Authentication Request to "MAC_FMT"\n", MAC_ARG(net->bssid));
- /* Queue the auth request */
- auth = (struct ieee80211softmac_auth_queue_item *)
- kmalloc(sizeof(struct ieee80211softmac_auth_queue_item), GFP_KERNEL);
- if(auth == NULL)
- return -ENOMEM;
-
- auth->net = net;
- auth->mac = mac;
- auth->retry = IEEE80211SOFTMAC_AUTH_RETRY_LIMIT;
- auth->state = IEEE80211SOFTMAC_AUTH_OPEN_REQUEST;
- INIT_WORK(&auth->work, &ieee80211softmac_auth_queue, (void *)auth);
-
- /* Lock (for list) */
- spin_lock_irqsave(&mac->lock, flags);
-
- /* add to list */
- list_add_tail(&auth->list, &mac->auth_queue);
- schedule_work(&auth->work);
- spin_unlock_irqrestore(&mac->lock, flags);
-
- return 0;
-}
-
-
-/* Sends an auth request to the desired AP and handles timeouts */
-static void
-ieee80211softmac_auth_queue(void *data)
-{
- struct ieee80211softmac_device *mac;
- struct ieee80211softmac_auth_queue_item *auth;
- struct ieee80211softmac_network *net;
- unsigned long flags;
-
- auth = (struct ieee80211softmac_auth_queue_item *)data;
- net = auth->net;
- mac = auth->mac;
-
- if(auth->retry > 0) {
- /* Switch to correct channel for this network */
- mac->set_channel(mac->dev, net->channel);
-
- /* Lock and set flags */
- spin_lock_irqsave(&mac->lock, flags);
- if (unlikely(!mac->running)) {
- /* Prevent reschedule on workqueue flush */
- spin_unlock_irqrestore(&mac->lock, flags);
- return;
- }
- net->authenticated = 0;
- /* add a timeout call so we eventually give up waiting for an auth reply */
- schedule_delayed_work(&auth->work, IEEE80211SOFTMAC_AUTH_TIMEOUT);
- auth->retry--;
- spin_unlock_irqrestore(&mac->lock, flags);
- if (ieee80211softmac_send_mgt_frame(mac, auth->net, IEEE80211_STYPE_AUTH, auth->state))
- dprintk(KERN_NOTICE PFX "Sending Authentication Request to "MAC_FMT" failed (this shouldn't happen, wait for the timeout).\n", MAC_ARG(net->bssid));
- else
- dprintk(KERN_NOTICE PFX "Sent Authentication Request to "MAC_FMT".\n", MAC_ARG(net->bssid));
- return;
- }
-
- printkl(KERN_WARNING PFX "Authentication timed out with "MAC_FMT"\n", MAC_ARG(net->bssid));
- /* Remove this item from the queue */
- spin_lock_irqsave(&mac->lock, flags);
- net->authenticating = 0;
- ieee80211softmac_call_events_locked(mac, IEEE80211SOFTMAC_EVENT_AUTH_TIMEOUT, net);
- cancel_delayed_work(&auth->work); /* just to make sure... */
- list_del(&auth->list);
- spin_unlock_irqrestore(&mac->lock, flags);
- /* Free it */
- kfree(auth);
-}
-
-/* Sends a response to an auth challenge (for shared key auth). */
-static void
-ieee80211softmac_auth_challenge_response(void *_aq)
-{
- struct ieee80211softmac_auth_queue_item *aq = _aq;
-
- /* Send our response */
- ieee80211softmac_send_mgt_frame(aq->mac, aq->net, IEEE80211_STYPE_AUTH, aq->state);
-}
-
-/* Handle the auth response from the AP
- * This should be registered with ieee80211 as handle_auth
- */
-int
-ieee80211softmac_auth_resp(struct net_device *dev, struct ieee80211_auth *auth)
-{
-
- struct list_head *list_ptr;
- struct ieee80211softmac_device *mac = ieee80211_priv(dev);
- struct ieee80211softmac_auth_queue_item *aq = NULL;
- struct ieee80211softmac_network *net = NULL;
- unsigned long flags;
- u8 * data;
-
- if (unlikely(!mac->running))
- return -ENODEV;
-
- /* Find correct auth queue item */
- spin_lock_irqsave(&mac->lock, flags);
- list_for_each(list_ptr, &mac->auth_queue) {
- aq = list_entry(list_ptr, struct ieee80211softmac_auth_queue_item, list);
- net = aq->net;
- if (!memcmp(net->bssid, auth->header.addr2, ETH_ALEN))
- break;
- else
- aq = NULL;
- }
- spin_unlock_irqrestore(&mac->lock, flags);
-
- /* Make sure that we've got an auth queue item for this request */
- if(aq == NULL)
- {
- dprintkl(KERN_DEBUG PFX "Authentication response received from "MAC_FMT" but no queue item exists.\n", MAC_ARG(auth->header.addr2));
- /* Error #? */
- return -1;
- }
-
- /* Check for out of order authentication */
- if(!net->authenticating)
- {
- dprintkl(KERN_DEBUG PFX "Authentication response received from "MAC_FMT" but did not request authentication.\n",MAC_ARG(auth->header.addr2));
- return -1;
- }
-
- /* Parse the auth packet */
- switch(auth->algorithm) {
- case WLAN_AUTH_OPEN:
- /* Check the status code of the response */
-
- switch(auth->status) {
- case WLAN_STATUS_SUCCESS:
- /* Update the status to Authenticated */
- spin_lock_irqsave(&mac->lock, flags);
- net->authenticating = 0;
- net->authenticated = 1;
- spin_unlock_irqrestore(&mac->lock, flags);
-
- /* Send event */
- printkl(KERN_NOTICE PFX "Open Authentication completed with "MAC_FMT"\n", MAC_ARG(net->bssid));
- ieee80211softmac_call_events(mac, IEEE80211SOFTMAC_EVENT_AUTHENTICATED, net);
- break;
- default:
- /* Lock and reset flags */
- spin_lock_irqsave(&mac->lock, flags);
- net->authenticated = 0;
- net->authenticating = 0;
- spin_unlock_irqrestore(&mac->lock, flags);
-
- printkl(KERN_NOTICE PFX "Open Authentication with "MAC_FMT" failed, error code: %i\n",
- MAC_ARG(net->bssid), le16_to_cpup(&auth->status));
- /* Count the error? */
- break;
- }
- goto free_aq;
- break;
- case WLAN_AUTH_SHARED_KEY:
- /* Figure out where we are in the process */
- switch(auth->transaction) {
- case IEEE80211SOFTMAC_AUTH_SHARED_CHALLENGE:
- /* Check to make sure we have a challenge IE */
- data = (u8 *)auth->info_element;
- if (*data++ != MFIE_TYPE_CHALLENGE) {
- printkl(KERN_NOTICE PFX "Shared Key Authentication failed due to a missing challenge.\n");
- break;
- }
- /* Save the challenge */
- spin_lock_irqsave(&mac->lock, flags);
- net->challenge_len = *data++;
- if (net->challenge_len > WLAN_AUTH_CHALLENGE_LEN)
- net->challenge_len = WLAN_AUTH_CHALLENGE_LEN;
- kfree(net->challenge);
- net->challenge = kmemdup(data, net->challenge_len,
- GFP_ATOMIC);
- if (net->challenge == NULL) {
- printkl(KERN_NOTICE PFX "Shared Key "
- "Authentication failed due to "
- "memory shortage.\n");
- spin_unlock_irqrestore(&mac->lock, flags);
- break;
- }
- aq->state = IEEE80211SOFTMAC_AUTH_SHARED_RESPONSE;
-
- /* We reuse the work struct from the auth request here.
- * It is safe to do so as each one is per-request, and
- * at this point (dealing with authentication response)
- * we have obviously already sent the initial auth
- * request. */
- cancel_delayed_work(&aq->work);
- INIT_WORK(&aq->work, &ieee80211softmac_auth_challenge_response, (void *)aq);
- schedule_work(&aq->work);
- spin_unlock_irqrestore(&mac->lock, flags);
- return 0;
- case IEEE80211SOFTMAC_AUTH_SHARED_PASS:
- kfree(net->challenge);
- net->challenge = NULL;
- net->challenge_len = 0;
- /* Check the status code of the response */
- switch(auth->status) {
- case WLAN_STATUS_SUCCESS:
- /* Update the status to Authenticated */
- spin_lock_irqsave(&mac->lock, flags);
- net->authenticating = 0;
- net->authenticated = 1;
- spin_unlock_irqrestore(&mac->lock, flags);
- printkl(KERN_NOTICE PFX "Shared Key Authentication completed with "MAC_FMT"\n",
- MAC_ARG(net->bssid));
- ieee80211softmac_call_events(mac, IEEE80211SOFTMAC_EVENT_AUTHENTICATED, net);
- break;
- default:
- printkl(KERN_NOTICE PFX "Shared Key Authentication with "MAC_FMT" failed, error code: %i\n",
- MAC_ARG(net->bssid), le16_to_cpup(&auth->status));
- /* Lock and reset flags */
- spin_lock_irqsave(&mac->lock, flags);
- net->authenticating = 0;
- net->authenticated = 0;
- spin_unlock_irqrestore(&mac->lock, flags);
- /* Count the error? */
- break;
- }
- goto free_aq;
- break;
- default:
- printkl(KERN_WARNING PFX "Unhandled Authentication Step: %i\n", auth->transaction);
- break;
- }
- goto free_aq;
- break;
- default:
- /* ERROR */
- goto free_aq;
- break;
- }
- return 0;
-free_aq:
- /* Cancel the timeout */
- spin_lock_irqsave(&mac->lock, flags);
- cancel_delayed_work(&aq->work);
- /* Remove this item from the queue */
- list_del(&aq->list);
- spin_unlock_irqrestore(&mac->lock, flags);
-
- /* Free it */
- kfree(aq);
- return 0;
-}
-
-/*
- * Handle deauthorization
- */
-static void
-ieee80211softmac_deauth_from_net(struct ieee80211softmac_device *mac,
- struct ieee80211softmac_network *net)
-{
- struct ieee80211softmac_auth_queue_item *aq = NULL;
- struct list_head *list_ptr;
- unsigned long flags;
-
- /* deauthentication implies disassociation */
- ieee80211softmac_disassoc(mac);
-
- /* Lock and reset status flags */
- spin_lock_irqsave(&mac->lock, flags);
- net->authenticating = 0;
- net->authenticated = 0;
-
- /* Find correct auth queue item, if it exists */
- list_for_each(list_ptr, &mac->auth_queue) {
- aq = list_entry(list_ptr, struct ieee80211softmac_auth_queue_item, list);
- if (!memcmp(net->bssid, aq->net->bssid, ETH_ALEN))
- break;
- else
- aq = NULL;
- }
-
- /* Cancel pending work */
- if(aq != NULL)
- /* Not entirely safe? What about running work? */
- cancel_delayed_work(&aq->work);
-
- /* Free our network ref */
- ieee80211softmac_del_network_locked(mac, net);
- if(net->challenge != NULL)
- kfree(net->challenge);
- kfree(net);
-
- /* can't transmit data right now... */
- netif_carrier_off(mac->dev);
- spin_unlock_irqrestore(&mac->lock, flags);
-}
-
-/*
- * Sends a deauth request to the desired AP
- */
-int
-ieee80211softmac_deauth_req(struct ieee80211softmac_device *mac,
- struct ieee80211softmac_network *net, int reason)
-{
- int ret;
-
- /* Make sure the network is authenticated */
- if (!net->authenticated)
- {
- dprintkl(KERN_DEBUG PFX "Can't send deauthentication packet, network is not authenticated.\n");
- /* Error okay? */
- return -EPERM;
- }
-
- /* Send the de-auth packet */
- if((ret = ieee80211softmac_send_mgt_frame(mac, net, IEEE80211_STYPE_DEAUTH, reason)))
- return ret;
-
- ieee80211softmac_deauth_from_net(mac, net);
- return 0;
-}
-
-/*
- * This should be registered with ieee80211 as handle_deauth
- */
-int
-ieee80211softmac_deauth_resp(struct net_device *dev, struct ieee80211_deauth *deauth)
-{
-
- struct ieee80211softmac_network *net = NULL;
- struct ieee80211softmac_device *mac = ieee80211_priv(dev);
-
- if (unlikely(!mac->running))
- return -ENODEV;
-
- if (!deauth) {
- dprintk("deauth without deauth packet. eek!\n");
- return 0;
- }
-
- net = ieee80211softmac_get_network_by_bssid(mac, deauth->header.addr2);
-
- if (net == NULL) {
- dprintkl(KERN_DEBUG PFX "Received deauthentication packet from "MAC_FMT", but that network is unknown.\n",
- MAC_ARG(deauth->header.addr2));
- return 0;
- }
-
- /* Make sure the network is authenticated */
- if(!net->authenticated)
- {
- dprintkl(KERN_DEBUG PFX "Can't perform deauthentication, network is not authenticated.\n");
- /* Error okay? */
- return -EPERM;
- }
-
- ieee80211softmac_deauth_from_net(mac, net);
-
- /* let's try to re-associate */
- schedule_work(&mac->associnfo.work);
- return 0;
-}
diff --git a/net/ieee80211/softmac/ieee80211softmac_event.c b/net/ieee80211/softmac/ieee80211softmac_event.c
deleted file mode 100644
index f34fa2ef666b..000000000000
--- a/net/ieee80211/softmac/ieee80211softmac_event.c
+++ /dev/null
@@ -1,187 +0,0 @@
-/*
- * Event system
- * Also see comments in public header file and longer explanation below.
- *
- * Copyright (c) 2005, 2006 Johannes Berg <johannes@sipsolutions.net>
- * Joseph Jezak <josejx@gentoo.org>
- * Larry Finger <Larry.Finger@lwfinger.net>
- * Danny van Dyk <kugelfang@gentoo.org>
- * Michael Buesch <mbuesch@freenet.de>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * The full GNU General Public License is included in this distribution in the
- * file called COPYING.
- */
-
-#include "ieee80211softmac_priv.h"
-
-/*
- * Each event has associated to it
- * - an event type (see constants in public header)
- * - an event context (see below)
- * - the function to be called
- * - a context (extra parameter to call the function with)
- * - and the softmac struct
- *
- * The event context is private and can only be used from
- * within this module. Its meaning varies with the event
- * type:
- * SCAN_FINISHED,
- * DISASSOCIATED: NULL
- * ASSOCIATED,
- * ASSOCIATE_FAILED,
- * ASSOCIATE_TIMEOUT,
- * AUTHENTICATED,
- * AUTH_FAILED,
- * AUTH_TIMEOUT: a pointer to the network struct
- * ...
- * Code within this module can use the event context to be only
- * called when the event is true for that specific context
- * as per above table.
- * If the event context is NULL, then the notification is always called,
- * regardless of the event context. The event context is not passed to
- * the callback, it is assumed that the context suffices.
- *
- * You can also use the event context only by setting the event type
- * to -1 (private use only), in which case you'll be notified
- * whenever the event context matches.
- */
-
-static char *event_descriptions[IEEE80211SOFTMAC_EVENT_LAST+1] = {
- NULL, /* scan finished */
- NULL, /* associated */
- "associating failed",
- "associating timed out",
- "authenticated",
- "authenticating failed",
- "authenticating timed out",
- "associating failed because no suitable network was found",
- NULL, /* disassociated */
-};
-
-
-static void
-ieee80211softmac_notify_callback(void *d)
-{
- struct ieee80211softmac_event event = *(struct ieee80211softmac_event*) d;
- kfree(d);
-
- event.fun(event.mac->dev, event.event_type, event.context);
-}
-
-int
-ieee80211softmac_notify_internal(struct ieee80211softmac_device *mac,
- int event, void *event_context, notify_function_ptr fun, void *context, gfp_t gfp_mask)
-{
- struct ieee80211softmac_event *eventptr;
- unsigned long flags;
-
- if (event < -1 || event > IEEE80211SOFTMAC_EVENT_LAST)
- return -ENOSYS;
-
- if (!fun)
- return -EINVAL;
-
- eventptr = kmalloc(sizeof(struct ieee80211softmac_event), gfp_mask);
- if (!eventptr)
- return -ENOMEM;
-
- eventptr->event_type = event;
- INIT_WORK(&eventptr->work, ieee80211softmac_notify_callback, eventptr);
- eventptr->fun = fun;
- eventptr->context = context;
- eventptr->mac = mac;
- eventptr->event_context = event_context;
-
- spin_lock_irqsave(&mac->lock, flags);
- list_add(&eventptr->list, &mac->events);
- spin_unlock_irqrestore(&mac->lock, flags);
-
- return 0;
-}
-
-int
-ieee80211softmac_notify_gfp(struct net_device *dev,
- int event, notify_function_ptr fun, void *context, gfp_t gfp_mask)
-{
- struct ieee80211softmac_device *mac = ieee80211_priv(dev);
-
- if (event < 0 || event > IEEE80211SOFTMAC_EVENT_LAST)
- return -ENOSYS;
-
- return ieee80211softmac_notify_internal(mac, event, NULL, fun, context, gfp_mask);
-}
-EXPORT_SYMBOL_GPL(ieee80211softmac_notify_gfp);
-
-/* private -- calling all callbacks that were specified */
-void
-ieee80211softmac_call_events_locked(struct ieee80211softmac_device *mac, int event, void *event_ctx)
-{
- struct ieee80211softmac_event *eventptr, *tmp;
- struct ieee80211softmac_network *network;
-
- if (event >= 0) {
- union iwreq_data wrqu;
- int we_event;
- char *msg = NULL;
-
- memset(&wrqu, '\0', sizeof (union iwreq_data));
-
- switch(event) {
- case IEEE80211SOFTMAC_EVENT_ASSOCIATED:
- network = (struct ieee80211softmac_network *)event_ctx;
- memcpy(wrqu.ap_addr.sa_data, &network->bssid[0], ETH_ALEN);
- /* fall through */
- case IEEE80211SOFTMAC_EVENT_DISASSOCIATED:
- wrqu.ap_addr.sa_family = ARPHRD_ETHER;
- we_event = SIOCGIWAP;
- break;
- case IEEE80211SOFTMAC_EVENT_SCAN_FINISHED:
- we_event = SIOCGIWSCAN;
- break;
- default:
- msg = event_descriptions[event];
- if (!msg)
- msg = "SOFTMAC EVENT BUG";
- wrqu.data.length = strlen(msg);
- we_event = IWEVCUSTOM;
- break;
- }
- wireless_send_event(mac->dev, we_event, &wrqu, msg);
- }
-
- if (!list_empty(&mac->events))
- list_for_each_entry_safe(eventptr, tmp, &mac->events, list) {
- if ((eventptr->event_type == event || eventptr->event_type == -1)
- && (eventptr->event_context == NULL || eventptr->event_context == event_ctx)) {
- list_del(&eventptr->list);
- /* User may have subscribed to ANY event, so
- * we tell them which event triggered it. */
- eventptr->event_type = event;
- schedule_work(&eventptr->work);
- }
- }
-}
-
-void
-ieee80211softmac_call_events(struct ieee80211softmac_device *mac, int event, void *event_ctx)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&mac->lock, flags);
- ieee80211softmac_call_events_locked(mac, event, event_ctx);
-
- spin_unlock_irqrestore(&mac->lock, flags);
-}
diff --git a/net/ieee80211/softmac/ieee80211softmac_io.c b/net/ieee80211/softmac/ieee80211softmac_io.c
deleted file mode 100644
index b96931001b43..000000000000
--- a/net/ieee80211/softmac/ieee80211softmac_io.c
+++ /dev/null
@@ -1,488 +0,0 @@
-/*
- * Some parts based on code from net80211
- * Copyright (c) 2001 Atsushi Onoe
- * Copyright (c) 2002-2005 Sam Leffler, Errno Consulting
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. The name of the author may not be used to endorse or promote products
- * derived from this software without specific prior written permission.
- *
- * Alternatively, this software may be distributed under the terms of the
- * GNU General Public License ("GPL") version 2 as published by the Free
- * Software Foundation.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
- * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
- * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include "ieee80211softmac_priv.h"
-
-/* Helper functions for inserting data into the frames */
-
-/*
- * Adds an ESSID element to the frame
- *
- */
-static u8 *
-ieee80211softmac_add_essid(u8 *dst, struct ieee80211softmac_essid *essid)
-{
- if (essid) {
- *dst++ = MFIE_TYPE_SSID;
- *dst++ = essid->len;
- memcpy(dst, essid->data, essid->len);
- return dst+essid->len;
- } else {
- *dst++ = MFIE_TYPE_SSID;
- *dst++ = 0;
- return dst;
- }
-}
-
-/* Adds Supported Rates and if required Extended Rates Information Element
- * to the frame, ASSUMES WE HAVE A SORTED LIST OF RATES */
-static u8 *
-ieee80211softmac_frame_add_rates(u8 *dst, const struct ieee80211softmac_ratesinfo *r)
-{
- int cck_len, ofdm_len;
- *dst++ = MFIE_TYPE_RATES;
-
- for(cck_len=0; ieee80211_is_cck_rate(r->rates[cck_len]) && (cck_len < r->count);cck_len++);
-
- if(cck_len > IEEE80211SOFTMAC_MAX_RATES_LEN)
- cck_len = IEEE80211SOFTMAC_MAX_RATES_LEN;
- *dst++ = cck_len;
- memcpy(dst, r->rates, cck_len);
- dst += cck_len;
-
- if(cck_len < r->count){
- for (ofdm_len=0; ieee80211_is_ofdm_rate(r->rates[ofdm_len + cck_len]) && (ofdm_len + cck_len < r->count); ofdm_len++);
- if (ofdm_len > 0) {
- if (ofdm_len > IEEE80211SOFTMAC_MAX_EX_RATES_LEN)
- ofdm_len = IEEE80211SOFTMAC_MAX_EX_RATES_LEN;
- *dst++ = MFIE_TYPE_RATES_EX;
- *dst++ = ofdm_len;
- memcpy(dst, r->rates + cck_len, ofdm_len);
- dst += ofdm_len;
- }
- }
- return dst;
-}
-
-/* Allocate a management frame */
-static u8 *
-ieee80211softmac_alloc_mgt(u32 size)
-{
- u8 * data;
-
- /* Add the header and FCS to the size */
- size = size + IEEE80211_3ADDR_LEN;
- if(size > IEEE80211_DATA_LEN)
- return NULL;
- /* Allocate the frame */
- data = kzalloc(size, GFP_ATOMIC);
- return data;
-}
-
-/*
- * Add a 2 Address Header
- */
-static void
-ieee80211softmac_hdr_2addr(struct ieee80211softmac_device *mac,
- struct ieee80211_hdr_2addr *header, u32 type, u8 *dest)
-{
- /* Fill in the frame control flags */
- header->frame_ctl = cpu_to_le16(type);
- /* Control packets always have WEP turned off */
- if(type > IEEE80211_STYPE_CFENDACK && type < IEEE80211_STYPE_PSPOLL)
- header->frame_ctl |= mac->ieee->sec.level ? cpu_to_le16(IEEE80211_FCTL_PROTECTED) : 0;
-
- /* Fill in the duration */
- header->duration_id = 0;
- /* FIXME: How do I find this?
- * calculate. But most drivers just fill in 0 (except if it's a station id of course) */
-
- /* Fill in the Destination Address */
- if(dest == NULL)
- memset(header->addr1, 0xFF, ETH_ALEN);
- else
- memcpy(header->addr1, dest, ETH_ALEN);
- /* Fill in the Source Address */
- memcpy(header->addr2, mac->ieee->dev->dev_addr, ETH_ALEN);
-
-}
-
-
-/* Add a 3 Address Header */
-static void
-ieee80211softmac_hdr_3addr(struct ieee80211softmac_device *mac,
- struct ieee80211_hdr_3addr *header, u32 type, u8 *dest, u8 *bssid)
-{
- /* This is common with 2addr, so use that instead */
- ieee80211softmac_hdr_2addr(mac, (struct ieee80211_hdr_2addr *)header, type, dest);
-
- /* Fill in the BSS ID */
- if(bssid == NULL)
- memset(header->addr3, 0xFF, ETH_ALEN);
- else
- memcpy(header->addr3, bssid, ETH_ALEN);
-
- /* Fill in the sequence # */
- /* FIXME: I need to add this to the softmac struct
- * shouldn't the sequence number be in ieee80211? */
-}
-
-static u16
-ieee80211softmac_capabilities(struct ieee80211softmac_device *mac,
- struct ieee80211softmac_network *net)
-{
- u16 capability = 0;
-
- /* ESS and IBSS bits are set according to the current mode */
- switch (mac->ieee->iw_mode) {
- case IW_MODE_INFRA:
- capability = cpu_to_le16(WLAN_CAPABILITY_ESS);
- break;
- case IW_MODE_ADHOC:
- capability = cpu_to_le16(WLAN_CAPABILITY_IBSS);
- break;
- case IW_MODE_AUTO:
- capability = net->capabilities &
- (WLAN_CAPABILITY_ESS|WLAN_CAPABILITY_IBSS);
- break;
- default:
- /* bleh. we don't ever go to these modes */
- printk(KERN_ERR PFX "invalid iw_mode!\n");
- break;
- }
-
- /* CF Pollable / CF Poll Request */
- /* Needs to be implemented, for now, the 0's == not supported */
-
- /* Privacy Bit */
- capability |= mac->ieee->sec.level ?
- cpu_to_le16(WLAN_CAPABILITY_PRIVACY) : 0;
-
- /* Short Preamble */
- /* Always supported: we probably won't ever be powering devices which
- * dont support this... */
- capability |= WLAN_CAPABILITY_SHORT_PREAMBLE;
-
- /* PBCC */
- /* Not widely used */
-
- /* Channel Agility */
- /* Not widely used */
-
- /* Short Slot */
- /* Will be implemented later */
-
- /* DSSS-OFDM */
- /* Not widely used */
-
- return capability;
-}
-
-/*****************************************************************************
- * Create Management packets
- *****************************************************************************/
-
-/* Creates an association request packet */
-static u32
-ieee80211softmac_assoc_req(struct ieee80211_assoc_request **pkt,
- struct ieee80211softmac_device *mac, struct ieee80211softmac_network *net)
-{
- u8 *data;
- (*pkt) = (struct ieee80211_assoc_request *)ieee80211softmac_alloc_mgt(
- 2 + /* Capability Info */
- 2 + /* Listen Interval */
- /* SSID IE */
- 1 + 1 + IW_ESSID_MAX_SIZE +
- /* Rates IE */
- 1 + 1 + IEEE80211SOFTMAC_MAX_RATES_LEN +
- /* Extended Rates IE */
- 1 + 1 + IEEE80211SOFTMAC_MAX_EX_RATES_LEN +
- /* WPA IE if present */
- mac->wpa.IElen
- /* Other IE's? Optional?
- * Yeah, probably need an extra IE parameter -- lots of vendors like to
- * fill in their own IEs */
- );
- if (unlikely((*pkt) == NULL))
- return 0;
- ieee80211softmac_hdr_3addr(mac, &((*pkt)->header), IEEE80211_STYPE_ASSOC_REQ, net->bssid, net->bssid);
-
- /* Fill in the capabilities */
- (*pkt)->capability = ieee80211softmac_capabilities(mac, net);
-
- /* Fill in Listen Interval (?) */
- (*pkt)->listen_interval = cpu_to_le16(10);
-
- data = (u8 *)(*pkt)->info_element;
- /* Add SSID */
- data = ieee80211softmac_add_essid(data, &net->essid);
- /* Add Rates */
- data = ieee80211softmac_frame_add_rates(data, &mac->ratesinfo);
- /* Add WPA IE */
- if (mac->wpa.IElen && mac->wpa.IE) {
- memcpy(data, mac->wpa.IE, mac->wpa.IElen);
- data += mac->wpa.IElen;
- }
- /* Return the number of used bytes */
- return (data - (u8*)(*pkt));
-}
-
-/* Create a reassociation request packet */
-static u32
-ieee80211softmac_reassoc_req(struct ieee80211_reassoc_request **pkt,
- struct ieee80211softmac_device *mac, struct ieee80211softmac_network *net)
-{
- u8 *data;
- (*pkt) = (struct ieee80211_reassoc_request *)ieee80211softmac_alloc_mgt(
- 2 + /* Capability Info */
- 2 + /* Listen Interval */
- ETH_ALEN + /* AP MAC */
- /* SSID IE */
- 1 + 1 + IW_ESSID_MAX_SIZE +
- /* Rates IE */
- 1 + 1 + IEEE80211SOFTMAC_MAX_RATES_LEN +
- /* Extended Rates IE */
- 1 + 1 + IEEE80211SOFTMAC_MAX_EX_RATES_LEN
- /* Other IE's? */
- );
- if (unlikely((*pkt) == NULL))
- return 0;
- ieee80211softmac_hdr_3addr(mac, &((*pkt)->header), IEEE80211_STYPE_REASSOC_REQ, net->bssid, net->bssid);
-
- /* Fill in the capabilities */
- (*pkt)->capability = ieee80211softmac_capabilities(mac, net);
-
- /* Fill in Listen Interval (?) */
- (*pkt)->listen_interval = cpu_to_le16(10);
- /* Fill in the current AP MAC */
- memcpy((*pkt)->current_ap, mac->ieee->bssid, ETH_ALEN);
-
- data = (u8 *)(*pkt)->info_element;
- /* Add SSID */
- data = ieee80211softmac_add_essid(data, &net->essid);
- /* Add Rates */
- data = ieee80211softmac_frame_add_rates(data, &mac->ratesinfo);
- /* Return packet size */
- return (data - (u8 *)(*pkt));
-}
-
-/* Create an authentication packet */
-static u32
-ieee80211softmac_auth(struct ieee80211_auth **pkt,
- struct ieee80211softmac_device *mac, struct ieee80211softmac_network *net,
- u16 transaction, u16 status, int *encrypt_mpdu)
-{
- u8 *data;
- int auth_mode = mac->ieee->sec.auth_mode;
- int is_shared_response = (auth_mode == WLAN_AUTH_SHARED_KEY
- && transaction == IEEE80211SOFTMAC_AUTH_SHARED_RESPONSE);
-
- /* Allocate Packet */
- (*pkt) = (struct ieee80211_auth *)ieee80211softmac_alloc_mgt(
- 2 + /* Auth Algorithm */
- 2 + /* Auth Transaction Seq */
- 2 + /* Status Code */
- /* Challenge Text IE */
- (is_shared_response ? 1 + 1 + net->challenge_len : 0)
- );
- if (unlikely((*pkt) == NULL))
- return 0;
- ieee80211softmac_hdr_3addr(mac, &((*pkt)->header), IEEE80211_STYPE_AUTH, net->bssid, net->bssid);
-
- /* Algorithm */
- (*pkt)->algorithm = cpu_to_le16(auth_mode);
- /* Transaction */
- (*pkt)->transaction = cpu_to_le16(transaction);
- /* Status */
- (*pkt)->status = cpu_to_le16(status);
-
- data = (u8 *)(*pkt)->info_element;
- /* Challenge Text */
- if (is_shared_response) {
- *data = MFIE_TYPE_CHALLENGE;
- data++;
-
- /* Copy the challenge in */
- *data = net->challenge_len;
- data++;
- memcpy(data, net->challenge, net->challenge_len);
- data += net->challenge_len;
-
- /* Make sure this frame gets encrypted with the shared key */
- *encrypt_mpdu = 1;
- } else
- *encrypt_mpdu = 0;
-
- /* Return the packet size */
- return (data - (u8 *)(*pkt));
-}
-
-/* Create a disassocation or deauthentication packet */
-static u32
-ieee80211softmac_disassoc_deauth(struct ieee80211_disassoc **pkt,
- struct ieee80211softmac_device *mac, struct ieee80211softmac_network *net,
- u16 type, u16 reason)
-{
- /* Allocate Packet */
- (*pkt) = (struct ieee80211_disassoc *)ieee80211softmac_alloc_mgt(2);
- if (unlikely((*pkt) == NULL))
- return 0;
- ieee80211softmac_hdr_3addr(mac, &((*pkt)->header), type, net->bssid, net->bssid);
- /* Reason */
- (*pkt)->reason = cpu_to_le16(reason);
- /* Return the packet size */
- return (2 + IEEE80211_3ADDR_LEN);
-}
-
-/* Create a probe request packet */
-static u32
-ieee80211softmac_probe_req(struct ieee80211_probe_request **pkt,
- struct ieee80211softmac_device *mac, struct ieee80211softmac_essid *essid)
-{
- u8 *data;
- /* Allocate Packet */
- (*pkt) = (struct ieee80211_probe_request *)ieee80211softmac_alloc_mgt(
- /* SSID of requested network */
- 1 + 1 + IW_ESSID_MAX_SIZE +
- /* Rates IE */
- 1 + 1 + IEEE80211SOFTMAC_MAX_RATES_LEN +
- /* Extended Rates IE */
- 1 + 1 + IEEE80211SOFTMAC_MAX_EX_RATES_LEN
- );
- if (unlikely((*pkt) == NULL))
- return 0;
- ieee80211softmac_hdr_3addr(mac, &((*pkt)->header), IEEE80211_STYPE_PROBE_REQ, NULL, NULL);
-
- data = (u8 *)(*pkt)->info_element;
- /* Add ESSID (can be NULL) */
- data = ieee80211softmac_add_essid(data, essid);
- /* Add Rates */
- data = ieee80211softmac_frame_add_rates(data, &mac->ratesinfo);
- /* Return packet size */
- return (data - (u8 *)(*pkt));
-}
-
-/* Create a probe response packet */
-/* FIXME: Not complete */
-static u32
-ieee80211softmac_probe_resp(struct ieee80211_probe_response **pkt,
- struct ieee80211softmac_device *mac, struct ieee80211softmac_network *net)
-{
- u8 *data;
- /* Allocate Packet */
- (*pkt) = (struct ieee80211_probe_response *)ieee80211softmac_alloc_mgt(
- 8 + /* Timestamp */
- 2 + /* Beacon Interval */
- 2 + /* Capability Info */
- /* SSID IE */
- 1 + 1 + IW_ESSID_MAX_SIZE +
- 7 + /* FH Parameter Set */
- 2 + /* DS Parameter Set */
- 8 + /* CF Parameter Set */
- 4 /* IBSS Parameter Set */
- );
- if (unlikely((*pkt) == NULL))
- return 0;
- ieee80211softmac_hdr_3addr(mac, &((*pkt)->header), IEEE80211_STYPE_PROBE_RESP, net->bssid, net->bssid);
- data = (u8 *)(*pkt)->info_element;
-
- /* Return the packet size */
- return (data - (u8 *)(*pkt));
-}
-
-
-/* Sends a manangement packet
- * FIXME: document the use of the arg parameter
- * for _AUTH: (transaction #) | (status << 16)
- */
-int
-ieee80211softmac_send_mgt_frame(struct ieee80211softmac_device *mac,
- void *ptrarg, u32 type, u32 arg)
-{
- void *pkt = NULL;
- u32 pkt_size = 0;
- int encrypt_mpdu = 0;
-
- switch(type) {
- case IEEE80211_STYPE_ASSOC_REQ:
- pkt_size = ieee80211softmac_assoc_req((struct ieee80211_assoc_request **)(&pkt), mac, (struct ieee80211softmac_network *)ptrarg);
- break;
- case IEEE80211_STYPE_REASSOC_REQ:
- pkt_size = ieee80211softmac_reassoc_req((struct ieee80211_reassoc_request **)(&pkt), mac, (struct ieee80211softmac_network *)ptrarg);
- break;
- case IEEE80211_STYPE_AUTH:
- pkt_size = ieee80211softmac_auth((struct ieee80211_auth **)(&pkt), mac, (struct ieee80211softmac_network *)ptrarg, (u16)(arg & 0xFFFF), (u16) (arg >> 16), &encrypt_mpdu);
- break;
- case IEEE80211_STYPE_DISASSOC:
- case IEEE80211_STYPE_DEAUTH:
- pkt_size = ieee80211softmac_disassoc_deauth((struct ieee80211_disassoc **)(&pkt), mac, (struct ieee80211softmac_network *)ptrarg, type, (u16)(arg & 0xFFFF));
- break;
- case IEEE80211_STYPE_PROBE_REQ:
- pkt_size = ieee80211softmac_probe_req((struct ieee80211_probe_request **)(&pkt), mac, (struct ieee80211softmac_essid *)ptrarg);
- break;
- case IEEE80211_STYPE_PROBE_RESP:
- pkt_size = ieee80211softmac_probe_resp((struct ieee80211_probe_response **)(&pkt), mac, (struct ieee80211softmac_network *)ptrarg);
- break;
- default:
- printkl(KERN_DEBUG PFX "Unsupported Management Frame type: %i\n", type);
- return -EINVAL;
- };
-
- if(pkt_size == 0 || pkt == NULL) {
- printkl(KERN_DEBUG PFX "Error, packet is nonexistant or 0 length\n");
- return -ENOMEM;
- }
-
- /* Send the packet to the ieee80211 layer for tx */
- /* we defined softmac->mgmt_xmit for this. Should we keep it
- * as it is (that means we'd need to wrap this into a txb),
- * modify the prototype (so it matches this function),
- * or get rid of it alltogether?
- * Does this work for you now?
- */
- ieee80211_tx_frame(mac->ieee, (struct ieee80211_hdr *)pkt,
- IEEE80211_3ADDR_LEN, pkt_size, encrypt_mpdu);
-
- kfree(pkt);
- return 0;
-}
-
-/* Beacon handling */
-int ieee80211softmac_handle_beacon(struct net_device *dev,
- struct ieee80211_beacon *beacon,
- struct ieee80211_network *network)
-{
- struct ieee80211softmac_device *mac = ieee80211_priv(dev);
-
- /* This might race, but we don't really care and it's not worth
- * adding heavyweight locking in this fastpath.
- */
- if (mac->associnfo.associated) {
- if (memcmp(network->bssid, mac->associnfo.bssid, ETH_ALEN) == 0)
- ieee80211softmac_process_erp(mac, network->erp_value);
- }
-
- return 0;
-}
-
diff --git a/net/ieee80211/softmac/ieee80211softmac_module.c b/net/ieee80211/softmac/ieee80211softmac_module.c
deleted file mode 100644
index 33aff4f4a471..000000000000
--- a/net/ieee80211/softmac/ieee80211softmac_module.c
+++ /dev/null
@@ -1,576 +0,0 @@
-/*
- * Contains some basic softmac functions along with module registration code etc.
- *
- * Copyright (c) 2005, 2006 Johannes Berg <johannes@sipsolutions.net>
- * Joseph Jezak <josejx@gentoo.org>
- * Larry Finger <Larry.Finger@lwfinger.net>
- * Danny van Dyk <kugelfang@gentoo.org>
- * Michael Buesch <mbuesch@freenet.de>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * The full GNU General Public License is included in this distribution in the
- * file called COPYING.
- */
-
-#include "ieee80211softmac_priv.h"
-#include <linux/sort.h>
-#include <linux/etherdevice.h>
-
-struct net_device *alloc_ieee80211softmac(int sizeof_priv)
-{
- struct ieee80211softmac_device *softmac;
- struct net_device *dev;
-
- dev = alloc_ieee80211(sizeof(struct ieee80211softmac_device) + sizeof_priv);
- softmac = ieee80211_priv(dev);
- softmac->dev = dev;
- softmac->ieee = netdev_priv(dev);
- spin_lock_init(&softmac->lock);
-
- softmac->ieee->handle_auth = ieee80211softmac_auth_resp;
- softmac->ieee->handle_deauth = ieee80211softmac_deauth_resp;
- softmac->ieee->handle_assoc_response = ieee80211softmac_handle_assoc_response;
- softmac->ieee->handle_reassoc_request = ieee80211softmac_handle_reassoc_req;
- softmac->ieee->handle_disassoc = ieee80211softmac_handle_disassoc;
- softmac->ieee->handle_beacon = ieee80211softmac_handle_beacon;
- softmac->scaninfo = NULL;
-
- softmac->associnfo.scan_retry = IEEE80211SOFTMAC_ASSOC_SCAN_RETRY_LIMIT;
-
- /* TODO: initialise all the other callbacks in the ieee struct
- * (once they're written)
- */
-
- INIT_LIST_HEAD(&softmac->auth_queue);
- INIT_LIST_HEAD(&softmac->network_list);
- INIT_LIST_HEAD(&softmac->events);
-
- mutex_init(&softmac->associnfo.mutex);
- INIT_WORK(&softmac->associnfo.work, ieee80211softmac_assoc_work, softmac);
- INIT_WORK(&softmac->associnfo.timeout, ieee80211softmac_assoc_timeout, softmac);
- softmac->start_scan = ieee80211softmac_start_scan_implementation;
- softmac->wait_for_scan = ieee80211softmac_wait_for_scan_implementation;
- softmac->stop_scan = ieee80211softmac_stop_scan_implementation;
-
- /* to start with, we can't send anything ... */
- netif_carrier_off(dev);
-
- return dev;
-}
-EXPORT_SYMBOL_GPL(alloc_ieee80211softmac);
-
-/* Clears the pending work queue items, stops all scans, etc. */
-void
-ieee80211softmac_clear_pending_work(struct ieee80211softmac_device *sm)
-{
- unsigned long flags;
- struct ieee80211softmac_event *eventptr, *eventtmp;
- struct ieee80211softmac_auth_queue_item *authptr, *authtmp;
- struct ieee80211softmac_network *netptr, *nettmp;
-
- ieee80211softmac_stop_scan(sm);
- ieee80211softmac_wait_for_scan(sm);
-
- spin_lock_irqsave(&sm->lock, flags);
- sm->running = 0;
-
- /* Free all pending assoc work items */
- cancel_delayed_work(&sm->associnfo.work);
-
- /* Free all pending scan work items */
- if(sm->scaninfo != NULL)
- cancel_delayed_work(&sm->scaninfo->softmac_scan);
-
- /* Free all pending auth work items */
- list_for_each_entry(authptr, &sm->auth_queue, list)
- cancel_delayed_work(&authptr->work);
-
- /* delete all pending event calls and work items */
- list_for_each_entry_safe(eventptr, eventtmp, &sm->events, list)
- cancel_delayed_work(&eventptr->work);
-
- spin_unlock_irqrestore(&sm->lock, flags);
- flush_scheduled_work();
-
- /* now we should be save and no longer need locking... */
- spin_lock_irqsave(&sm->lock, flags);
- /* Free all pending auth work items */
- list_for_each_entry_safe(authptr, authtmp, &sm->auth_queue, list) {
- list_del(&authptr->list);
- kfree(authptr);
- }
-
- /* delete all pending event calls and work items */
- list_for_each_entry_safe(eventptr, eventtmp, &sm->events, list) {
- list_del(&eventptr->list);
- kfree(eventptr);
- }
-
- /* Free all networks */
- list_for_each_entry_safe(netptr, nettmp, &sm->network_list, list) {
- ieee80211softmac_del_network_locked(sm, netptr);
- if(netptr->challenge != NULL)
- kfree(netptr->challenge);
- kfree(netptr);
- }
-
- spin_unlock_irqrestore(&sm->lock, flags);
-}
-EXPORT_SYMBOL_GPL(ieee80211softmac_clear_pending_work);
-
-void free_ieee80211softmac(struct net_device *dev)
-{
- struct ieee80211softmac_device *sm = ieee80211_priv(dev);
- ieee80211softmac_clear_pending_work(sm);
- kfree(sm->scaninfo);
- kfree(sm->wpa.IE);
- free_ieee80211(dev);
-}
-EXPORT_SYMBOL_GPL(free_ieee80211softmac);
-
-static void ieee80211softmac_start_check_rates(struct ieee80211softmac_device *mac)
-{
- struct ieee80211softmac_ratesinfo *ri = &mac->ratesinfo;
- /* I took out the sorting check, we're seperating by modulation now. */
- if (ri->count)
- return;
- /* otherwise assume we hav'em all! */
- if (mac->ieee->modulation & IEEE80211_CCK_MODULATION) {
- ri->rates[ri->count++] = IEEE80211_CCK_RATE_1MB;
- ri->rates[ri->count++] = IEEE80211_CCK_RATE_2MB;
- ri->rates[ri->count++] = IEEE80211_CCK_RATE_5MB;
- ri->rates[ri->count++] = IEEE80211_CCK_RATE_11MB;
- }
- if (mac->ieee->modulation & IEEE80211_OFDM_MODULATION) {
- ri->rates[ri->count++] = IEEE80211_OFDM_RATE_6MB;
- ri->rates[ri->count++] = IEEE80211_OFDM_RATE_9MB;
- ri->rates[ri->count++] = IEEE80211_OFDM_RATE_12MB;
- ri->rates[ri->count++] = IEEE80211_OFDM_RATE_18MB;
- ri->rates[ri->count++] = IEEE80211_OFDM_RATE_24MB;
- ri->rates[ri->count++] = IEEE80211_OFDM_RATE_36MB;
- ri->rates[ri->count++] = IEEE80211_OFDM_RATE_48MB;
- ri->rates[ri->count++] = IEEE80211_OFDM_RATE_54MB;
- }
-}
-
-int ieee80211softmac_ratesinfo_rate_supported(struct ieee80211softmac_ratesinfo *ri, u8 rate)
-{
- int search;
- u8 search_rate;
-
- for (search = 0; search < ri->count; search++) {
- search_rate = ri->rates[search];
- search_rate &= ~IEEE80211_BASIC_RATE_MASK;
- if (rate == search_rate)
- return 1;
- }
-
- return 0;
-}
-
-u8 ieee80211softmac_highest_supported_rate(struct ieee80211softmac_device *mac,
- struct ieee80211softmac_ratesinfo *ri, int basic_only)
-{
- u8 user_rate = mac->txrates.user_rate;
- int i;
-
- if (ri->count == 0)
- return IEEE80211_CCK_RATE_1MB;
-
- for (i = ri->count - 1; i >= 0; i--) {
- u8 rate = ri->rates[i];
- if (basic_only && !(rate & IEEE80211_BASIC_RATE_MASK))
- continue;
- rate &= ~IEEE80211_BASIC_RATE_MASK;
- if (rate > user_rate)
- continue;
- if (ieee80211softmac_ratesinfo_rate_supported(&mac->ratesinfo, rate))
- return rate;
- }
-
- /* If we haven't found a suitable rate by now, just trust the user */
- return user_rate;
-}
-EXPORT_SYMBOL_GPL(ieee80211softmac_highest_supported_rate);
-
-void ieee80211softmac_process_erp(struct ieee80211softmac_device *mac,
- u8 erp_value)
-{
- int use_protection;
- int short_preamble;
- u32 changes = 0;
-
- /* Barker preamble mode */
- short_preamble = ((erp_value & WLAN_ERP_BARKER_PREAMBLE) == 0
- && mac->associnfo.short_preamble_available) ? 1 : 0;
-
- /* Protection needed? */
- use_protection = (erp_value & WLAN_ERP_USE_PROTECTION) != 0;
-
- if (mac->bssinfo.short_preamble != short_preamble) {
- changes |= IEEE80211SOFTMAC_BSSINFOCHG_SHORT_PREAMBLE;
- mac->bssinfo.short_preamble = short_preamble;
- }
-
- if (mac->bssinfo.use_protection != use_protection) {
- changes |= IEEE80211SOFTMAC_BSSINFOCHG_PROTECTION;
- mac->bssinfo.use_protection = use_protection;
- }
-
- if (mac->bssinfo_change && changes)
- mac->bssinfo_change(mac->dev, changes);
-}
-
-void ieee80211softmac_recalc_txrates(struct ieee80211softmac_device *mac)
-{
- struct ieee80211softmac_txrates *txrates = &mac->txrates;
- u32 change = 0;
-
- change |= IEEE80211SOFTMAC_TXRATECHG_DEFAULT;
- txrates->default_rate = ieee80211softmac_highest_supported_rate(mac, &mac->bssinfo.supported_rates, 0);
-
- change |= IEEE80211SOFTMAC_TXRATECHG_DEFAULT_FBACK;
- txrates->default_fallback = lower_rate(mac, txrates->default_rate);
-
- change |= IEEE80211SOFTMAC_TXRATECHG_MCAST;
- txrates->mcast_rate = ieee80211softmac_highest_supported_rate(mac, &mac->bssinfo.supported_rates, 1);
-
- if (mac->txrates_change)
- mac->txrates_change(mac->dev, change);
-
-}
-
-void ieee80211softmac_init_bss(struct ieee80211softmac_device *mac)
-{
- struct ieee80211_device *ieee = mac->ieee;
- u32 change = 0;
- struct ieee80211softmac_txrates *txrates = &mac->txrates;
- struct ieee80211softmac_bss_info *bssinfo = &mac->bssinfo;
-
- /* TODO: We need some kind of state machine to lower the default rates
- * if we loose too many packets.
- */
- /* Change the default txrate to the highest possible value.
- * The txrate machine will lower it, if it is too high.
- */
- /* FIXME: We don't correctly handle backing down to lower
- rates, so 801.11g devices start off at 11M for now. People
- can manually change it if they really need to, but 11M is
- more reliable. Note similar logic in
- ieee80211softmac_wx_set_rate() */
- if (ieee->modulation & IEEE80211_CCK_MODULATION) {
- txrates->user_rate = IEEE80211_CCK_RATE_11MB;
- } else if (ieee->modulation & IEEE80211_OFDM_MODULATION) {
- txrates->user_rate = IEEE80211_OFDM_RATE_54MB;
- } else
- assert(0);
-
- txrates->default_rate = IEEE80211_CCK_RATE_1MB;
- change |= IEEE80211SOFTMAC_TXRATECHG_DEFAULT;
-
- txrates->default_fallback = IEEE80211_CCK_RATE_1MB;
- change |= IEEE80211SOFTMAC_TXRATECHG_DEFAULT_FBACK;
-
- txrates->mcast_rate = IEEE80211_CCK_RATE_1MB;
- change |= IEEE80211SOFTMAC_TXRATECHG_MCAST;
-
- txrates->mgt_mcast_rate = IEEE80211_CCK_RATE_1MB;
- change |= IEEE80211SOFTMAC_TXRATECHG_MGT_MCAST;
-
- if (mac->txrates_change)
- mac->txrates_change(mac->dev, change);
-
- change = 0;
-
- bssinfo->supported_rates.count = 0;
- memset(bssinfo->supported_rates.rates, 0,
- sizeof(bssinfo->supported_rates.rates));
- change |= IEEE80211SOFTMAC_BSSINFOCHG_RATES;
-
- bssinfo->short_preamble = 0;
- change |= IEEE80211SOFTMAC_BSSINFOCHG_SHORT_PREAMBLE;
-
- bssinfo->use_protection = 0;
- change |= IEEE80211SOFTMAC_BSSINFOCHG_PROTECTION;
-
- if (mac->bssinfo_change)
- mac->bssinfo_change(mac->dev, change);
-
- mac->running = 1;
-}
-
-void ieee80211softmac_start(struct net_device *dev)
-{
- struct ieee80211softmac_device *mac = ieee80211_priv(dev);
-
- ieee80211softmac_start_check_rates(mac);
- ieee80211softmac_init_bss(mac);
-}
-EXPORT_SYMBOL_GPL(ieee80211softmac_start);
-
-void ieee80211softmac_stop(struct net_device *dev)
-{
- struct ieee80211softmac_device *mac = ieee80211_priv(dev);
-
- ieee80211softmac_clear_pending_work(mac);
-}
-EXPORT_SYMBOL_GPL(ieee80211softmac_stop);
-
-void ieee80211softmac_set_rates(struct net_device *dev, u8 count, u8 *rates)
-{
- struct ieee80211softmac_device *mac = ieee80211_priv(dev);
- unsigned long flags;
-
- spin_lock_irqsave(&mac->lock, flags);
- memcpy(mac->ratesinfo.rates, rates, count);
- mac->ratesinfo.count = count;
- spin_unlock_irqrestore(&mac->lock, flags);
-}
-EXPORT_SYMBOL_GPL(ieee80211softmac_set_rates);
-
-static u8 raise_rate(struct ieee80211softmac_device *mac, u8 rate)
-{
- int i;
- struct ieee80211softmac_ratesinfo *ri = &mac->ratesinfo;
-
- for (i=0; i<ri->count-1; i++) {
- if (ri->rates[i] == rate)
- return ri->rates[i+1];
- }
- /* I guess we can't go any higher... */
- return ri->rates[ri->count];
-}
-
-u8 ieee80211softmac_lower_rate_delta(struct ieee80211softmac_device *mac, u8 rate, int delta)
-{
- int i;
- struct ieee80211softmac_ratesinfo *ri = &mac->ratesinfo;
-
- for (i=delta; i<ri->count; i++) {
- if (ri->rates[i] == rate)
- return ri->rates[i-delta];
- }
- /* I guess we can't go any lower... */
- return ri->rates[0];
-}
-
-static void ieee80211softmac_add_txrates_badness(struct ieee80211softmac_device *mac,
- int amount)
-{
- u8 default_rate = mac->txrates.default_rate;
- u8 default_fallback = mac->txrates.default_fallback;
- u32 changes = 0;
-
- //TODO: This is highly experimental code.
- // Maybe the dynamic rate selection does not work
- // and it has to be removed again.
-
-printk("badness %d\n", mac->txrate_badness);
- mac->txrate_badness += amount;
- if (mac->txrate_badness <= -1000) {
- /* Very small badness. Try a faster bitrate. */
- default_rate = raise_rate(mac, default_rate);
- changes |= IEEE80211SOFTMAC_TXRATECHG_DEFAULT;
- default_fallback = get_fallback_rate(mac, default_rate);
- changes |= IEEE80211SOFTMAC_TXRATECHG_DEFAULT_FBACK;
- mac->txrate_badness = 0;
-printk("Bitrate raised to %u\n", default_rate);
- } else if (mac->txrate_badness >= 10000) {
- /* Very high badness. Try a slower bitrate. */
- default_rate = lower_rate(mac, default_rate);
- changes |= IEEE80211SOFTMAC_TXRATECHG_DEFAULT;
- default_fallback = get_fallback_rate(mac, default_rate);
- changes |= IEEE80211SOFTMAC_TXRATECHG_DEFAULT_FBACK;
- mac->txrate_badness = 0;
-printk("Bitrate lowered to %u\n", default_rate);
- }
-
- mac->txrates.default_rate = default_rate;
- mac->txrates.default_fallback = default_fallback;
-
- if (changes && mac->txrates_change)
- mac->txrates_change(mac->dev, changes);
-}
-
-void ieee80211softmac_fragment_lost(struct net_device *dev,
- u16 wl_seq)
-{
- struct ieee80211softmac_device *mac = ieee80211_priv(dev);
- unsigned long flags;
-
- spin_lock_irqsave(&mac->lock, flags);
- ieee80211softmac_add_txrates_badness(mac, 1000);
- //TODO
-
- spin_unlock_irqrestore(&mac->lock, flags);
-}
-EXPORT_SYMBOL_GPL(ieee80211softmac_fragment_lost);
-
-static int rate_cmp(const void *a_, const void *b_) {
- u8 *a, *b;
- a = (u8*)a_;
- b = (u8*)b_;
- return ((*a & ~IEEE80211_BASIC_RATE_MASK) - (*b & ~IEEE80211_BASIC_RATE_MASK));
-}
-
-/* Allocate a softmac network struct and fill it from a network */
-struct ieee80211softmac_network *
-ieee80211softmac_create_network(struct ieee80211softmac_device *mac,
- struct ieee80211_network *net)
-{
- struct ieee80211softmac_network *softnet;
- softnet = kzalloc(sizeof(struct ieee80211softmac_network), GFP_ATOMIC);
- if(softnet == NULL)
- return NULL;
- memcpy(softnet->bssid, net->bssid, ETH_ALEN);
- softnet->channel = net->channel;
- softnet->essid.len = net->ssid_len;
- memcpy(softnet->essid.data, net->ssid, softnet->essid.len);
-
- /* copy rates over */
- softnet->supported_rates.count = net->rates_len;
- memcpy(&softnet->supported_rates.rates[0], net->rates, net->rates_len);
- memcpy(&softnet->supported_rates.rates[softnet->supported_rates.count], net->rates_ex, net->rates_ex_len);
- softnet->supported_rates.count += net->rates_ex_len;
- sort(softnet->supported_rates.rates, softnet->supported_rates.count, sizeof(softnet->supported_rates.rates[0]), rate_cmp, NULL);
-
- /* we save the ERP value because it is needed at association time, and
- * many AP's do not include an ERP IE in the association response. */
- softnet->erp_value = net->erp_value;
-
- softnet->capabilities = net->capability;
- return softnet;
-}
-
-
-/* Add a network to the list, while locked */
-void
-ieee80211softmac_add_network_locked(struct ieee80211softmac_device *mac,
- struct ieee80211softmac_network *add_net)
-{
- struct list_head *list_ptr;
- struct ieee80211softmac_network *softmac_net = NULL;
-
- list_for_each(list_ptr, &mac->network_list) {
- softmac_net = list_entry(list_ptr, struct ieee80211softmac_network, list);
- if(!memcmp(softmac_net->bssid, add_net->bssid, ETH_ALEN))
- break;
- else
- softmac_net = NULL;
- }
- if(softmac_net == NULL)
- list_add(&(add_net->list), &mac->network_list);
-}
-
-/* Add a network to the list, with locking */
-void
-ieee80211softmac_add_network(struct ieee80211softmac_device *mac,
- struct ieee80211softmac_network *add_net)
-{
- unsigned long flags;
- spin_lock_irqsave(&mac->lock, flags);
- ieee80211softmac_add_network_locked(mac, add_net);
- spin_unlock_irqrestore(&mac->lock, flags);
-}
-
-
-/* Delete a network from the list, while locked*/
-void
-ieee80211softmac_del_network_locked(struct ieee80211softmac_device *mac,
- struct ieee80211softmac_network *del_net)
-{
- list_del(&(del_net->list));
-}
-
-/* Delete a network from the list with locking */
-void
-ieee80211softmac_del_network(struct ieee80211softmac_device *mac,
- struct ieee80211softmac_network *del_net)
-{
- unsigned long flags;
- spin_lock_irqsave(&mac->lock, flags);
- ieee80211softmac_del_network_locked(mac, del_net);
- spin_unlock_irqrestore(&mac->lock, flags);
-}
-
-/* Get a network from the list by MAC while locked */
-struct ieee80211softmac_network *
-ieee80211softmac_get_network_by_bssid_locked(struct ieee80211softmac_device *mac,
- u8 *bssid)
-{
- struct list_head *list_ptr;
- struct ieee80211softmac_network *softmac_net = NULL;
- list_for_each(list_ptr, &mac->network_list) {
- softmac_net = list_entry(list_ptr, struct ieee80211softmac_network, list);
- if(!memcmp(softmac_net->bssid, bssid, ETH_ALEN))
- break;
- else
- softmac_net = NULL;
- }
- return softmac_net;
-}
-
-/* Get a network from the list by BSSID with locking */
-struct ieee80211softmac_network *
-ieee80211softmac_get_network_by_bssid(struct ieee80211softmac_device *mac,
- u8 *bssid)
-{
- unsigned long flags;
- struct ieee80211softmac_network *softmac_net;
-
- spin_lock_irqsave(&mac->lock, flags);
- softmac_net = ieee80211softmac_get_network_by_bssid_locked(mac, bssid);
- spin_unlock_irqrestore(&mac->lock, flags);
- return softmac_net;
-}
-
-/* Get a network from the list by ESSID while locked */
-struct ieee80211softmac_network *
-ieee80211softmac_get_network_by_essid_locked(struct ieee80211softmac_device *mac,
- struct ieee80211softmac_essid *essid)
-{
- struct list_head *list_ptr;
- struct ieee80211softmac_network *softmac_net = NULL;
-
- list_for_each(list_ptr, &mac->network_list) {
- softmac_net = list_entry(list_ptr, struct ieee80211softmac_network, list);
- if (softmac_net->essid.len == essid->len &&
- !memcmp(softmac_net->essid.data, essid->data, essid->len))
- return softmac_net;
- }
- return NULL;
-}
-
-/* Get a network from the list by ESSID with locking */
-struct ieee80211softmac_network *
-ieee80211softmac_get_network_by_essid(struct ieee80211softmac_device *mac,
- struct ieee80211softmac_essid *essid)
-{
- unsigned long flags;
- struct ieee80211softmac_network *softmac_net = NULL;
-
- spin_lock_irqsave(&mac->lock, flags);
- softmac_net = ieee80211softmac_get_network_by_essid_locked(mac, essid);
- spin_unlock_irqrestore(&mac->lock, flags);
- return softmac_net;
-}
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Johannes Berg");
-MODULE_AUTHOR("Joseph Jezak");
-MODULE_AUTHOR("Larry Finger");
-MODULE_AUTHOR("Danny van Dyk");
-MODULE_AUTHOR("Michael Buesch");
-MODULE_DESCRIPTION("802.11 software MAC");
diff --git a/net/ieee80211/softmac/ieee80211softmac_priv.h b/net/ieee80211/softmac/ieee80211softmac_priv.h
deleted file mode 100644
index 0642e090b8a7..000000000000
--- a/net/ieee80211/softmac/ieee80211softmac_priv.h
+++ /dev/null
@@ -1,241 +0,0 @@
-/*
- * Internal softmac API definitions.
- *
- * Copyright (c) 2005, 2006 Johannes Berg <johannes@sipsolutions.net>
- * Joseph Jezak <josejx@gentoo.org>
- * Larry Finger <Larry.Finger@lwfinger.net>
- * Danny van Dyk <kugelfang@gentoo.org>
- * Michael Buesch <mbuesch@freenet.de>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * The full GNU General Public License is included in this distribution in the
- * file called COPYING.
- */
-
-#ifndef IEEE80211SOFTMAC_PRIV_H_
-#define IEEE80211SOFTMAC_PRIV_H_
-
-#include <net/ieee80211softmac.h>
-#include <net/ieee80211softmac_wx.h>
-#include <linux/kernel.h>
-#include <linux/stringify.h>
-
-
-#define PFX "SoftMAC: "
-
-#ifdef assert
-# undef assert
-#endif
-#ifdef CONFIG_IEEE80211_SOFTMAC_DEBUG
-#define assert(expr) \
- do { \
- if (unlikely(!(expr))) { \
- printkl(KERN_ERR PFX "ASSERTION FAILED (%s) at: %s:%d:%s()\n", #expr, \
- __FILE__, __LINE__, __FUNCTION__); \
- } \
- } while (0)
-#else
-#define assert(expr) do {} while (0)
-#endif
-
-/* rate limited printk(). */
-#ifdef printkl
-# undef printkl
-#endif
-#define printkl(f, x...) do { if (printk_ratelimit()) printk(f ,##x); } while (0)
-/* rate limited printk() for debugging */
-#ifdef dprintkl
-# undef dprintkl
-#endif
-#ifdef CONFIG_IEEE80211_SOFTMAC_DEBUG
-# define dprintkl printkl
-#else
-# define dprintkl(f, x...) do { /* nothing */ } while (0)
-#endif
-
-/* debugging printk() */
-#ifdef dprintk
-# undef dprintk
-#endif
-#ifdef CONFIG_IEEE80211_SOFTMAC_DEBUG
-# define dprintk(f, x...) do { printk(f ,##x); } while (0)
-#else
-# define dprintk(f, x...) do { /* nothing */ } while (0)
-#endif
-
-/* private definitions and prototypes */
-
-/*** prototypes from _scan.c */
-void ieee80211softmac_scan(void *sm);
-/* for internal use if scanning is needed */
-int ieee80211softmac_start_scan(struct ieee80211softmac_device *mac);
-void ieee80211softmac_stop_scan(struct ieee80211softmac_device *mac);
-void ieee80211softmac_wait_for_scan(struct ieee80211softmac_device *mac);
-
-/* for use by _module.c to assign to the callbacks */
-int ieee80211softmac_start_scan_implementation(struct net_device *dev);
-void ieee80211softmac_stop_scan_implementation(struct net_device *dev);
-void ieee80211softmac_wait_for_scan_implementation(struct net_device *dev);
-
-/*** Network prototypes from _module.c */
-struct ieee80211softmac_network * ieee80211softmac_create_network(
- struct ieee80211softmac_device *mac, struct ieee80211_network *net);
-void ieee80211softmac_add_network_locked(struct ieee80211softmac_device *mac,
- struct ieee80211softmac_network *net);
-void ieee80211softmac_add_network(struct ieee80211softmac_device *mac,
- struct ieee80211softmac_network *net);
-void ieee80211softmac_del_network_locked(struct ieee80211softmac_device *mac,
- struct ieee80211softmac_network *net);
-void ieee80211softmac_del_network(struct ieee80211softmac_device *mac,
- struct ieee80211softmac_network *net);
-struct ieee80211softmac_network * ieee80211softmac_get_network_by_bssid_locked(
- struct ieee80211softmac_device *mac, u8 *ea);
-struct ieee80211softmac_network * ieee80211softmac_get_network_by_bssid(
- struct ieee80211softmac_device *mac, u8 *ea);
-struct ieee80211softmac_network * ieee80211softmac_get_network_by_ssid_locked(
- struct ieee80211softmac_device *mac, u8 *ssid, u8 ssid_len);
-struct ieee80211softmac_network * ieee80211softmac_get_network_by_ssid(
- struct ieee80211softmac_device *mac, u8 *ssid, u8 ssid_len);
-struct ieee80211softmac_network *
-ieee80211softmac_get_network_by_essid_locked(struct ieee80211softmac_device *mac,
- struct ieee80211softmac_essid *essid);
-struct ieee80211softmac_network *
-ieee80211softmac_get_network_by_essid(struct ieee80211softmac_device *mac,
- struct ieee80211softmac_essid *essid);
-
-/* Rates related */
-void ieee80211softmac_process_erp(struct ieee80211softmac_device *mac,
- u8 erp_value);
-int ieee80211softmac_ratesinfo_rate_supported(struct ieee80211softmac_ratesinfo *ri, u8 rate);
-u8 ieee80211softmac_lower_rate_delta(struct ieee80211softmac_device *mac, u8 rate, int delta);
-void ieee80211softmac_init_bss(struct ieee80211softmac_device *mac);
-void ieee80211softmac_recalc_txrates(struct ieee80211softmac_device *mac);
-static inline u8 lower_rate(struct ieee80211softmac_device *mac, u8 rate) {
- return ieee80211softmac_lower_rate_delta(mac, rate, 1);
-}
-
-static inline u8 get_fallback_rate(struct ieee80211softmac_device *mac, u8 rate)
-{
- return ieee80211softmac_lower_rate_delta(mac, rate, 2);
-}
-
-
-/*** prototypes from _io.c */
-int ieee80211softmac_send_mgt_frame(struct ieee80211softmac_device *mac,
- void* ptrarg, u32 type, u32 arg);
-int ieee80211softmac_handle_beacon(struct net_device *dev,
- struct ieee80211_beacon *beacon,
- struct ieee80211_network *network);
-
-/*** prototypes from _auth.c */
-/* do these have to go into the public header? */
-int ieee80211softmac_auth_req(struct ieee80211softmac_device *mac, struct ieee80211softmac_network *net);
-int ieee80211softmac_deauth_req(struct ieee80211softmac_device *mac, struct ieee80211softmac_network *net, int reason);
-
-/* for use by _module.c to assign to the callbacks */
-int ieee80211softmac_auth_resp(struct net_device *dev, struct ieee80211_auth *auth);
-int ieee80211softmac_deauth_resp(struct net_device *dev, struct ieee80211_deauth *deauth);
-
-/*** prototypes from _assoc.c */
-void ieee80211softmac_assoc_work(void *d);
-int ieee80211softmac_handle_assoc_response(struct net_device * dev,
- struct ieee80211_assoc_response * resp,
- struct ieee80211_network * network);
-int ieee80211softmac_handle_disassoc(struct net_device * dev,
- struct ieee80211_disassoc * disassoc);
-int ieee80211softmac_handle_reassoc_req(struct net_device * dev,
- struct ieee80211_reassoc_request * reassoc);
-void ieee80211softmac_assoc_timeout(void *d);
-void ieee80211softmac_send_disassoc_req(struct ieee80211softmac_device *mac, u16 reason);
-void ieee80211softmac_disassoc(struct ieee80211softmac_device *mac);
-
-/* some helper functions */
-static inline int ieee80211softmac_scan_handlers_check_self(struct ieee80211softmac_device *sm)
-{
- return (sm->start_scan == ieee80211softmac_start_scan_implementation) &&
- (sm->stop_scan == ieee80211softmac_stop_scan_implementation) &&
- (sm->wait_for_scan == ieee80211softmac_wait_for_scan_implementation);
-}
-
-static inline int ieee80211softmac_scan_sanity_check(struct ieee80211softmac_device *sm)
-{
- return ((sm->start_scan != ieee80211softmac_start_scan_implementation) &&
- (sm->stop_scan != ieee80211softmac_stop_scan_implementation) &&
- (sm->wait_for_scan != ieee80211softmac_wait_for_scan_implementation)
- ) || ieee80211softmac_scan_handlers_check_self(sm);
-}
-
-#define IEEE80211SOFTMAC_PROBE_DELAY HZ/50
-#define IEEE80211SOFTMAC_WORKQUEUE_NAME_LEN (17 + IFNAMSIZ)
-
-struct ieee80211softmac_network {
- struct list_head list; /* List */
- /* Network information copied from ieee80211_network */
- u8 bssid[ETH_ALEN];
- u8 channel;
- struct ieee80211softmac_essid essid;
-
- struct ieee80211softmac_ratesinfo supported_rates;
-
- /* SoftMAC specific */
- u16 authenticating:1, /* Status Flags */
- authenticated:1,
- auth_desynced_once:1;
-
- u8 erp_value; /* Saved ERP value */
- u16 capabilities; /* Capabilities bitfield */
- u8 challenge_len; /* Auth Challenge length */
- char *challenge; /* Challenge Text */
-};
-
-/* structure used to keep track of networks we're auth'ing to */
-struct ieee80211softmac_auth_queue_item {
- struct list_head list; /* List head */
- struct ieee80211softmac_network *net; /* Network to auth */
- struct ieee80211softmac_device *mac; /* SoftMAC device */
- u8 retry; /* Retry limit */
- u8 state; /* Auth State */
- struct work_struct work; /* Work queue */
-};
-
-/* scanning information */
-struct ieee80211softmac_scaninfo {
- u8 current_channel_idx,
- number_channels;
- struct ieee80211_channel *channels;
- u8 started:1,
- stop:1;
- u8 skip_flags;
- struct completion finished;
- struct work_struct softmac_scan;
-};
-
-/* private event struct */
-struct ieee80211softmac_event {
- struct list_head list;
- int event_type;
- void *event_context;
- struct work_struct work;
- notify_function_ptr fun;
- void *context;
- struct ieee80211softmac_device *mac;
-};
-
-void ieee80211softmac_call_events(struct ieee80211softmac_device *mac, int event, void *event_context);
-void ieee80211softmac_call_events_locked(struct ieee80211softmac_device *mac, int event, void *event_context);
-int ieee80211softmac_notify_internal(struct ieee80211softmac_device *mac,
- int event, void *event_context, notify_function_ptr fun, void *context, gfp_t gfp_mask);
-
-#endif /* IEEE80211SOFTMAC_PRIV_H_ */
diff --git a/net/ieee80211/softmac/ieee80211softmac_scan.c b/net/ieee80211/softmac/ieee80211softmac_scan.c
deleted file mode 100644
index 5507feab32de..000000000000
--- a/net/ieee80211/softmac/ieee80211softmac_scan.c
+++ /dev/null
@@ -1,251 +0,0 @@
-/*
- * Scanning routines.
- *
- * These are not exported because they're assigned to the function pointers.
- *
- * Copyright (c) 2005, 2006 Johannes Berg <johannes@sipsolutions.net>
- * Joseph Jezak <josejx@gentoo.org>
- * Larry Finger <Larry.Finger@lwfinger.net>
- * Danny van Dyk <kugelfang@gentoo.org>
- * Michael Buesch <mbuesch@freenet.de>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * The full GNU General Public License is included in this distribution in the
- * file called COPYING.
- */
-
-#include <linux/completion.h>
-#include "ieee80211softmac_priv.h"
-
-/* internal, use to trigger scanning if needed.
- * Returns -EBUSY if already scanning,
- * result of start_scan otherwise */
-int
-ieee80211softmac_start_scan(struct ieee80211softmac_device *sm)
-{
- unsigned long flags;
- int ret;
-
- spin_lock_irqsave(&sm->lock, flags);
- if (sm->scanning)
- {
- spin_unlock_irqrestore(&sm->lock, flags);
- return -EINPROGRESS;
- }
- sm->scanning = 1;
- spin_unlock_irqrestore(&sm->lock, flags);
-
- ret = sm->start_scan(sm->dev);
- if (ret) {
- spin_lock_irqsave(&sm->lock, flags);
- sm->scanning = 0;
- spin_unlock_irqrestore(&sm->lock, flags);
- }
- return ret;
-}
-
-void
-ieee80211softmac_stop_scan(struct ieee80211softmac_device *sm)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&sm->lock, flags);
-
- if (!sm->scanning) {
- spin_unlock_irqrestore(&sm->lock, flags);
- return;
- }
-
- spin_unlock_irqrestore(&sm->lock, flags);
- sm->stop_scan(sm->dev);
-}
-
-void
-ieee80211softmac_wait_for_scan(struct ieee80211softmac_device *sm)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&sm->lock, flags);
-
- if (!sm->scanning) {
- spin_unlock_irqrestore(&sm->lock, flags);
- return;
- }
-
- spin_unlock_irqrestore(&sm->lock, flags);
- sm->wait_for_scan(sm->dev);
-}
-
-
-/* internal scanning implementation follows */
-void ieee80211softmac_scan(void *d)
-{
- int invalid_channel;
- u8 current_channel_idx;
- struct ieee80211softmac_device *sm = (struct ieee80211softmac_device *)d;
- struct ieee80211softmac_scaninfo *si = sm->scaninfo;
- unsigned long flags;
-
- while (!(si->stop) && (si->current_channel_idx < si->number_channels)) {
- current_channel_idx = si->current_channel_idx;
- si->current_channel_idx++; /* go to the next channel */
-
- invalid_channel = (si->skip_flags & si->channels[current_channel_idx].flags);
-
- if (!invalid_channel) {
- sm->set_channel(sm->dev, si->channels[current_channel_idx].channel);
- // FIXME make this user configurable (active/passive)
- if(ieee80211softmac_send_mgt_frame(sm, NULL, IEEE80211_STYPE_PROBE_REQ, 0))
- printkl(KERN_DEBUG PFX "Sending Probe Request Failed\n");
-
- /* also send directed management frame for the network we're looking for */
- // TODO: is this if correct, or should we do this only if scanning from assoc request?
- if (sm->associnfo.req_essid.len)
- ieee80211softmac_send_mgt_frame(sm, &sm->associnfo.req_essid, IEEE80211_STYPE_PROBE_REQ, 0);
-
- spin_lock_irqsave(&sm->lock, flags);
- if (unlikely(!sm->running)) {
- /* Prevent reschedule on workqueue flush */
- spin_unlock_irqrestore(&sm->lock, flags);
- break;
- }
- schedule_delayed_work(&si->softmac_scan, IEEE80211SOFTMAC_PROBE_DELAY);
- spin_unlock_irqrestore(&sm->lock, flags);
- return;
- } else {
- dprintk(PFX "Not probing Channel %d (not allowed here)\n", si->channels[current_channel_idx].channel);
- }
- }
-
- spin_lock_irqsave(&sm->lock, flags);
- cancel_delayed_work(&si->softmac_scan);
- si->started = 0;
- spin_unlock_irqrestore(&sm->lock, flags);
-
- dprintk(PFX "Scanning finished: scanned %d channels starting with channel %d\n",
- sm->scaninfo->number_channels, sm->scaninfo->channels[0].channel);
- ieee80211softmac_scan_finished(sm);
- complete_all(&sm->scaninfo->finished);
-}
-
-static inline struct ieee80211softmac_scaninfo *allocate_scaninfo(struct ieee80211softmac_device *mac)
-{
- /* ugh. can we call this without having the spinlock held? */
- struct ieee80211softmac_scaninfo *info = kmalloc(sizeof(struct ieee80211softmac_scaninfo), GFP_ATOMIC);
- if (unlikely(!info))
- return NULL;
- INIT_WORK(&info->softmac_scan, ieee80211softmac_scan, mac);
- init_completion(&info->finished);
- return info;
-}
-
-int ieee80211softmac_start_scan_implementation(struct net_device *dev)
-{
- struct ieee80211softmac_device *sm = ieee80211_priv(dev);
- unsigned long flags;
-
- if (!(dev->flags & IFF_UP))
- return -ENODEV;
-
- assert(ieee80211softmac_scan_handlers_check_self(sm));
- if (!ieee80211softmac_scan_handlers_check_self(sm))
- return -EINVAL;
-
- spin_lock_irqsave(&sm->lock, flags);
- /* it looks like we need to hold the lock here
- * to make sure we don't allocate two of these... */
- if (unlikely(!sm->scaninfo))
- sm->scaninfo = allocate_scaninfo(sm);
- if (unlikely(!sm->scaninfo)) {
- spin_unlock_irqrestore(&sm->lock, flags);
- return -ENOMEM;
- }
-
- sm->scaninfo->skip_flags = IEEE80211_CH_INVALID;
- if (0 /* not scanning in IEEE802.11b */)//TODO
- sm->scaninfo->skip_flags |= IEEE80211_CH_B_ONLY;
- if (0 /* IEEE802.11a */) {//TODO
- sm->scaninfo->channels = sm->ieee->geo.a;
- sm->scaninfo->number_channels = sm->ieee->geo.a_channels;
- } else {
- sm->scaninfo->channels = sm->ieee->geo.bg;
- sm->scaninfo->number_channels = sm->ieee->geo.bg_channels;
- }
- sm->scaninfo->current_channel_idx = 0;
- sm->scaninfo->started = 1;
- sm->scaninfo->stop = 0;
- INIT_COMPLETION(sm->scaninfo->finished);
- schedule_work(&sm->scaninfo->softmac_scan);
- spin_unlock_irqrestore(&sm->lock, flags);
- return 0;
-}
-
-void ieee80211softmac_stop_scan_implementation(struct net_device *dev)
-{
- struct ieee80211softmac_device *sm = ieee80211_priv(dev);
- unsigned long flags;
-
- assert(ieee80211softmac_scan_handlers_check_self(sm));
- if (!ieee80211softmac_scan_handlers_check_self(sm))
- return;
-
- spin_lock_irqsave(&sm->lock, flags);
- assert(sm->scaninfo != NULL);
- if (sm->scaninfo) {
- if (sm->scaninfo->started)
- sm->scaninfo->stop = 1;
- else
- complete_all(&sm->scaninfo->finished);
- }
- spin_unlock_irqrestore(&sm->lock, flags);
-}
-
-void ieee80211softmac_wait_for_scan_implementation(struct net_device *dev)
-{
- struct ieee80211softmac_device *sm = ieee80211_priv(dev);
- unsigned long flags;
-
- assert(ieee80211softmac_scan_handlers_check_self(sm));
- if (!ieee80211softmac_scan_handlers_check_self(sm))
- return;
-
- spin_lock_irqsave(&sm->lock, flags);
- if (!sm->scaninfo->started) {
- spin_unlock_irqrestore(&sm->lock, flags);
- return;
- }
- spin_unlock_irqrestore(&sm->lock, flags);
- wait_for_completion(&sm->scaninfo->finished);
-}
-
-/* this is what drivers (that do scanning) call when they're done */
-void ieee80211softmac_scan_finished(struct ieee80211softmac_device *sm)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&sm->lock, flags);
- sm->scanning = 0;
- spin_unlock_irqrestore(&sm->lock, flags);
-
- if (sm->associnfo.bssvalid) {
- struct ieee80211softmac_network *net;
-
- net = ieee80211softmac_get_network_by_bssid(sm, sm->associnfo.bssid);
- if (net)
- sm->set_channel(sm->dev, net->channel);
- }
- ieee80211softmac_call_events(sm, IEEE80211SOFTMAC_EVENT_SCAN_FINISHED, NULL);
-}
-EXPORT_SYMBOL_GPL(ieee80211softmac_scan_finished);
diff --git a/net/ieee80211/softmac/ieee80211softmac_wx.c b/net/ieee80211/softmac/ieee80211softmac_wx.c
deleted file mode 100644
index 23068a830f7d..000000000000
--- a/net/ieee80211/softmac/ieee80211softmac_wx.c
+++ /dev/null
@@ -1,514 +0,0 @@
-/*
- * This file contains our _wx handlers. Make sure you EXPORT_SYMBOL_GPL them
- *
- * Copyright (c) 2005, 2006 Johannes Berg <johannes@sipsolutions.net>
- * Joseph Jezak <josejx@gentoo.org>
- * Larry Finger <Larry.Finger@lwfinger.net>
- * Danny van Dyk <kugelfang@gentoo.org>
- * Michael Buesch <mbuesch@freenet.de>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * The full GNU General Public License is included in this distribution in the
- * file called COPYING.
- */
-
-#include "ieee80211softmac_priv.h"
-
-#include <net/iw_handler.h>
-/* for is_broadcast_ether_addr and is_zero_ether_addr */
-#include <linux/etherdevice.h>
-
-int
-ieee80211softmac_wx_trigger_scan(struct net_device *net_dev,
- struct iw_request_info *info,
- union iwreq_data *data,
- char *extra)
-{
- struct ieee80211softmac_device *sm = ieee80211_priv(net_dev);
- return ieee80211softmac_start_scan(sm);
-}
-EXPORT_SYMBOL_GPL(ieee80211softmac_wx_trigger_scan);
-
-
-/* if we're still scanning, return -EAGAIN so that userspace tools
- * can get the complete scan results, otherwise return 0. */
-int
-ieee80211softmac_wx_get_scan_results(struct net_device *net_dev,
- struct iw_request_info *info,
- union iwreq_data *data,
- char *extra)
-{
- unsigned long flags;
- struct ieee80211softmac_device *sm = ieee80211_priv(net_dev);
-
- spin_lock_irqsave(&sm->lock, flags);
- if (sm->scanning) {
- spin_unlock_irqrestore(&sm->lock, flags);
- return -EAGAIN;
- }
- spin_unlock_irqrestore(&sm->lock, flags);
- return ieee80211_wx_get_scan(sm->ieee, info, data, extra);
-}
-EXPORT_SYMBOL_GPL(ieee80211softmac_wx_get_scan_results);
-
-int
-ieee80211softmac_wx_set_essid(struct net_device *net_dev,
- struct iw_request_info *info,
- union iwreq_data *data,
- char *extra)
-{
- struct ieee80211softmac_device *sm = ieee80211_priv(net_dev);
- struct ieee80211softmac_network *n;
- struct ieee80211softmac_auth_queue_item *authptr;
- int length = 0;
-
- mutex_lock(&sm->associnfo.mutex);
-
- /* Check if we're already associating to this or another network
- * If it's another network, cancel and start over with our new network
- * If it's our network, ignore the change, we're already doing it!
- */
- if((sm->associnfo.associating || sm->associnfo.associated) &&
- (data->essid.flags && data->essid.length)) {
- /* Get the associating network */
- n = ieee80211softmac_get_network_by_bssid(sm, sm->associnfo.bssid);
- if(n && n->essid.len == data->essid.length &&
- !memcmp(n->essid.data, extra, n->essid.len)) {
- dprintk(KERN_INFO PFX "Already associating or associated to "MAC_FMT"\n",
- MAC_ARG(sm->associnfo.bssid));
- goto out;
- } else {
- dprintk(KERN_INFO PFX "Canceling existing associate request!\n");
- /* Cancel assoc work */
- cancel_delayed_work(&sm->associnfo.work);
- /* We don't have to do this, but it's a little cleaner */
- list_for_each_entry(authptr, &sm->auth_queue, list)
- cancel_delayed_work(&authptr->work);
- sm->associnfo.bssvalid = 0;
- sm->associnfo.bssfixed = 0;
- flush_scheduled_work();
- sm->associnfo.associating = 0;
- sm->associnfo.associated = 0;
- }
- }
-
-
- sm->associnfo.static_essid = 0;
- sm->associnfo.assoc_wait = 0;
-
- if (data->essid.flags && data->essid.length) {
- length = min((int)data->essid.length, IW_ESSID_MAX_SIZE);
- if (length) {
- memcpy(sm->associnfo.req_essid.data, extra, length);
- sm->associnfo.static_essid = 1;
- }
- }
-
- /* set our requested ESSID length.
- * If applicable, we have already copied the data in */
- sm->associnfo.req_essid.len = length;
-
- sm->associnfo.associating = 1;
- /* queue lower level code to do work (if necessary) */
- schedule_work(&sm->associnfo.work);
-out:
- mutex_unlock(&sm->associnfo.mutex);
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(ieee80211softmac_wx_set_essid);
-
-int
-ieee80211softmac_wx_get_essid(struct net_device *net_dev,
- struct iw_request_info *info,
- union iwreq_data *data,
- char *extra)
-{
- struct ieee80211softmac_device *sm = ieee80211_priv(net_dev);
-
- mutex_lock(&sm->associnfo.mutex);
- /* If all fails, return ANY (empty) */
- data->essid.length = 0;
- data->essid.flags = 0; /* active */
-
- /* If we have a statically configured ESSID then return it */
- if (sm->associnfo.static_essid) {
- data->essid.length = sm->associnfo.req_essid.len;
- data->essid.flags = 1; /* active */
- memcpy(extra, sm->associnfo.req_essid.data, sm->associnfo.req_essid.len);
- }
-
- /* If we're associating/associated, return that */
- if (sm->associnfo.associated || sm->associnfo.associating) {
- data->essid.length = sm->associnfo.associate_essid.len;
- data->essid.flags = 1; /* active */
- memcpy(extra, sm->associnfo.associate_essid.data, sm->associnfo.associate_essid.len);
- }
- mutex_unlock(&sm->associnfo.mutex);
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(ieee80211softmac_wx_get_essid);
-
-int
-ieee80211softmac_wx_set_rate(struct net_device *net_dev,
- struct iw_request_info *info,
- union iwreq_data *data,
- char *extra)
-{
- struct ieee80211softmac_device *mac = ieee80211_priv(net_dev);
- struct ieee80211_device *ieee = mac->ieee;
- unsigned long flags;
- s32 in_rate = data->bitrate.value;
- u8 rate;
- int is_ofdm = 0;
- int err = -EINVAL;
-
- if (in_rate == -1) {
- /* FIXME: We don't correctly handle backing down to lower
- rates, so 801.11g devices start off at 11M for now. People
- can manually change it if they really need to, but 11M is
- more reliable. Note similar logic in
- ieee80211softmac_wx_set_rate() */
- if (ieee->modulation & IEEE80211_CCK_MODULATION)
- in_rate = 11000000;
- else
- in_rate = 54000000;
- }
-
- switch (in_rate) {
- case 1000000:
- rate = IEEE80211_CCK_RATE_1MB;
- break;
- case 2000000:
- rate = IEEE80211_CCK_RATE_2MB;
- break;
- case 5500000:
- rate = IEEE80211_CCK_RATE_5MB;
- break;
- case 11000000:
- rate = IEEE80211_CCK_RATE_11MB;
- break;
- case 6000000:
- rate = IEEE80211_OFDM_RATE_6MB;
- is_ofdm = 1;
- break;
- case 9000000:
- rate = IEEE80211_OFDM_RATE_9MB;
- is_ofdm = 1;
- break;
- case 12000000:
- rate = IEEE80211_OFDM_RATE_12MB;
- is_ofdm = 1;
- break;
- case 18000000:
- rate = IEEE80211_OFDM_RATE_18MB;
- is_ofdm = 1;
- break;
- case 24000000:
- rate = IEEE80211_OFDM_RATE_24MB;
- is_ofdm = 1;
- break;
- case 36000000:
- rate = IEEE80211_OFDM_RATE_36MB;
- is_ofdm = 1;
- break;
- case 48000000:
- rate = IEEE80211_OFDM_RATE_48MB;
- is_ofdm = 1;
- break;
- case 54000000:
- rate = IEEE80211_OFDM_RATE_54MB;
- is_ofdm = 1;
- break;
- default:
- goto out;
- }
-
- spin_lock_irqsave(&mac->lock, flags);
-
- /* Check if correct modulation for this PHY. */
- if (is_ofdm && !(ieee->modulation & IEEE80211_OFDM_MODULATION))
- goto out_unlock;
-
- mac->txrates.user_rate = rate;
- ieee80211softmac_recalc_txrates(mac);
- err = 0;
-
-out_unlock:
- spin_unlock_irqrestore(&mac->lock, flags);
-out:
- return err;
-}
-EXPORT_SYMBOL_GPL(ieee80211softmac_wx_set_rate);
-
-int
-ieee80211softmac_wx_get_rate(struct net_device *net_dev,
- struct iw_request_info *info,
- union iwreq_data *data,
- char *extra)
-{
- struct ieee80211softmac_device *mac = ieee80211_priv(net_dev);
- unsigned long flags;
- int err = -EINVAL;
-
- spin_lock_irqsave(&mac->lock, flags);
- switch (mac->txrates.default_rate) {
- case IEEE80211_CCK_RATE_1MB:
- data->bitrate.value = 1000000;
- break;
- case IEEE80211_CCK_RATE_2MB:
- data->bitrate.value = 2000000;
- break;
- case IEEE80211_CCK_RATE_5MB:
- data->bitrate.value = 5500000;
- break;
- case IEEE80211_CCK_RATE_11MB:
- data->bitrate.value = 11000000;
- break;
- case IEEE80211_OFDM_RATE_6MB:
- data->bitrate.value = 6000000;
- break;
- case IEEE80211_OFDM_RATE_9MB:
- data->bitrate.value = 9000000;
- break;
- case IEEE80211_OFDM_RATE_12MB:
- data->bitrate.value = 12000000;
- break;
- case IEEE80211_OFDM_RATE_18MB:
- data->bitrate.value = 18000000;
- break;
- case IEEE80211_OFDM_RATE_24MB:
- data->bitrate.value = 24000000;
- break;
- case IEEE80211_OFDM_RATE_36MB:
- data->bitrate.value = 36000000;
- break;
- case IEEE80211_OFDM_RATE_48MB:
- data->bitrate.value = 48000000;
- break;
- case IEEE80211_OFDM_RATE_54MB:
- data->bitrate.value = 54000000;
- break;
- default:
- assert(0);
- goto out_unlock;
- }
- err = 0;
-out_unlock:
- spin_unlock_irqrestore(&mac->lock, flags);
-
- return err;
-}
-EXPORT_SYMBOL_GPL(ieee80211softmac_wx_get_rate);
-
-int
-ieee80211softmac_wx_get_wap(struct net_device *net_dev,
- struct iw_request_info *info,
- union iwreq_data *data,
- char *extra)
-{
- struct ieee80211softmac_device *mac = ieee80211_priv(net_dev);
- int err = 0;
-
- mutex_lock(&mac->associnfo.mutex);
- if (mac->associnfo.bssvalid)
- memcpy(data->ap_addr.sa_data, mac->associnfo.bssid, ETH_ALEN);
- else
- memset(data->ap_addr.sa_data, 0xff, ETH_ALEN);
- data->ap_addr.sa_family = ARPHRD_ETHER;
- mutex_unlock(&mac->associnfo.mutex);
-
- return err;
-}
-EXPORT_SYMBOL_GPL(ieee80211softmac_wx_get_wap);
-
-int
-ieee80211softmac_wx_set_wap(struct net_device *net_dev,
- struct iw_request_info *info,
- union iwreq_data *data,
- char *extra)
-{
- struct ieee80211softmac_device *mac = ieee80211_priv(net_dev);
-
- /* sanity check */
- if (data->ap_addr.sa_family != ARPHRD_ETHER) {
- return -EINVAL;
- }
-
- mutex_lock(&mac->associnfo.mutex);
- if (is_broadcast_ether_addr(data->ap_addr.sa_data)) {
- /* the bssid we have is not to be fixed any longer,
- * and we should reassociate to the best AP. */
- mac->associnfo.bssfixed = 0;
- /* force reassociation */
- mac->associnfo.bssvalid = 0;
- if (mac->associnfo.associated)
- schedule_work(&mac->associnfo.work);
- } else if (is_zero_ether_addr(data->ap_addr.sa_data)) {
- /* the bssid we have is no longer fixed */
- mac->associnfo.bssfixed = 0;
- } else {
- if (!memcmp(mac->associnfo.bssid, data->ap_addr.sa_data, ETH_ALEN)) {
- if (mac->associnfo.associating || mac->associnfo.associated) {
- /* bssid unchanged and associated or associating - just return */
- goto out;
- }
- } else {
- /* copy new value in data->ap_addr.sa_data to bssid */
- memcpy(mac->associnfo.bssid, data->ap_addr.sa_data, ETH_ALEN);
- }
- /* tell the other code that this bssid should be used no matter what */
- mac->associnfo.bssfixed = 1;
- /* queue associate if new bssid or (old one again and not associated) */
- schedule_work(&mac->associnfo.work);
- }
-
- out:
- mutex_unlock(&mac->associnfo.mutex);
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(ieee80211softmac_wx_set_wap);
-
-int
-ieee80211softmac_wx_set_genie(struct net_device *dev,
- struct iw_request_info *info,
- union iwreq_data *wrqu,
- char *extra)
-{
- struct ieee80211softmac_device *mac = ieee80211_priv(dev);
- unsigned long flags;
- int err = 0;
- char *buf;
- int i;
-
- mutex_lock(&mac->associnfo.mutex);
- spin_lock_irqsave(&mac->lock, flags);
- /* bleh. shouldn't be locked for that kmalloc... */
-
- if (wrqu->data.length) {
- if ((wrqu->data.length < 2) || (extra[1]+2 != wrqu->data.length)) {
- /* this is an IE, so the length must be
- * correct. Is it possible though that
- * more than one IE is passed in?
- */
- err = -EINVAL;
- goto out;
- }
- if (mac->wpa.IEbuflen <= wrqu->data.length) {
- buf = kmalloc(wrqu->data.length, GFP_ATOMIC);
- if (!buf) {
- err = -ENOMEM;
- goto out;
- }
- kfree(mac->wpa.IE);
- mac->wpa.IE = buf;
- mac->wpa.IEbuflen = wrqu->data.length;
- }
- memcpy(mac->wpa.IE, extra, wrqu->data.length);
- dprintk(KERN_INFO PFX "generic IE set to ");
- for (i=0;i<wrqu->data.length;i++)
- dprintk("%.2x", (u8)mac->wpa.IE[i]);
- dprintk("\n");
- mac->wpa.IElen = wrqu->data.length;
- } else {
- kfree(mac->wpa.IE);
- mac->wpa.IE = NULL;
- mac->wpa.IElen = 0;
- mac->wpa.IEbuflen = 0;
- }
-
- out:
- spin_unlock_irqrestore(&mac->lock, flags);
- mutex_unlock(&mac->associnfo.mutex);
-
- return err;
-}
-EXPORT_SYMBOL_GPL(ieee80211softmac_wx_set_genie);
-
-int
-ieee80211softmac_wx_get_genie(struct net_device *dev,
- struct iw_request_info *info,
- union iwreq_data *wrqu,
- char *extra)
-{
- struct ieee80211softmac_device *mac = ieee80211_priv(dev);
- unsigned long flags;
- int err = 0;
- int space = wrqu->data.length;
-
- mutex_lock(&mac->associnfo.mutex);
- spin_lock_irqsave(&mac->lock, flags);
-
- wrqu->data.length = 0;
-
- if (mac->wpa.IE && mac->wpa.IElen) {
- wrqu->data.length = mac->wpa.IElen;
- if (mac->wpa.IElen <= space)
- memcpy(extra, mac->wpa.IE, mac->wpa.IElen);
- else
- err = -E2BIG;
- }
- spin_unlock_irqrestore(&mac->lock, flags);
- mutex_lock(&mac->associnfo.mutex);
-
- return err;
-}
-EXPORT_SYMBOL_GPL(ieee80211softmac_wx_get_genie);
-
-int
-ieee80211softmac_wx_set_mlme(struct net_device *dev,
- struct iw_request_info *info,
- union iwreq_data *wrqu,
- char *extra)
-{
- struct ieee80211softmac_device *mac = ieee80211_priv(dev);
- struct iw_mlme *mlme = (struct iw_mlme *)extra;
- u16 reason = cpu_to_le16(mlme->reason_code);
- struct ieee80211softmac_network *net;
- int err = -EINVAL;
-
- mutex_lock(&mac->associnfo.mutex);
-
- if (memcmp(mac->associnfo.bssid, mlme->addr.sa_data, ETH_ALEN)) {
- printk(KERN_DEBUG PFX "wx_set_mlme: requested operation on net we don't use\n");
- goto out;
- }
-
- switch (mlme->cmd) {
- case IW_MLME_DEAUTH:
- net = ieee80211softmac_get_network_by_bssid_locked(mac, mlme->addr.sa_data);
- if (!net) {
- printk(KERN_DEBUG PFX "wx_set_mlme: we should know the net here...\n");
- goto out;
- }
- return ieee80211softmac_deauth_req(mac, net, reason);
- case IW_MLME_DISASSOC:
- ieee80211softmac_send_disassoc_req(mac, reason);
- mac->associnfo.associated = 0;
- mac->associnfo.associating = 0;
- err = 0;
- goto out;
- default:
- err = -EOPNOTSUPP;
- }
-
-out:
- mutex_unlock(&mac->associnfo.mutex);
-
- return err;
-}
-EXPORT_SYMBOL_GPL(ieee80211softmac_wx_set_mlme);
diff --git a/net/ieee802154/6lowpan/6lowpan_i.h b/net/ieee802154/6lowpan/6lowpan_i.h
new file mode 100644
index 000000000000..44a7e16bf3b5
--- /dev/null
+++ b/net/ieee802154/6lowpan/6lowpan_i.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __IEEE802154_6LOWPAN_I_H__
+#define __IEEE802154_6LOWPAN_I_H__
+
+#include <linux/list.h>
+
+#include <net/ieee802154_netdev.h>
+#include <net/inet_frag.h>
+#include <net/6lowpan.h>
+
+typedef unsigned __bitwise lowpan_rx_result;
+#define RX_CONTINUE ((__force lowpan_rx_result) 0u)
+#define RX_DROP_UNUSABLE ((__force lowpan_rx_result) 1u)
+#define RX_DROP ((__force lowpan_rx_result) 2u)
+#define RX_QUEUED ((__force lowpan_rx_result) 3u)
+
+#define LOWPAN_DISPATCH_FRAG1 0xc0
+#define LOWPAN_DISPATCH_FRAGN 0xe0
+
+struct frag_lowpan_compare_key {
+ u16 tag;
+ u16 d_size;
+ struct ieee802154_addr src;
+ struct ieee802154_addr dst;
+};
+
+/* Equivalent of ipv4 struct ipq
+ */
+struct lowpan_frag_queue {
+ struct inet_frag_queue q;
+};
+
+int lowpan_frag_rcv(struct sk_buff *skb, const u8 frag_type);
+void lowpan_net_frag_exit(void);
+int lowpan_net_frag_init(void);
+
+void lowpan_rx_init(void);
+void lowpan_rx_exit(void);
+
+int lowpan_header_create(struct sk_buff *skb, struct net_device *dev,
+ unsigned short type, const void *_daddr,
+ const void *_saddr, unsigned int len);
+netdev_tx_t lowpan_xmit(struct sk_buff *skb, struct net_device *dev);
+
+int lowpan_iphc_decompress(struct sk_buff *skb);
+lowpan_rx_result lowpan_rx_h_ipv6(struct sk_buff *skb);
+
+#endif /* __IEEE802154_6LOWPAN_I_H__ */
diff --git a/net/ieee802154/6lowpan/Kconfig b/net/ieee802154/6lowpan/Kconfig
new file mode 100644
index 000000000000..e808e4db2678
--- /dev/null
+++ b/net/ieee802154/6lowpan/Kconfig
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0-only
+config IEEE802154_6LOWPAN
+ tristate "6lowpan support over IEEE 802.15.4"
+ depends on 6LOWPAN
+ help
+ IPv6 compression over IEEE 802.15.4.
diff --git a/net/ieee802154/6lowpan/Makefile b/net/ieee802154/6lowpan/Makefile
new file mode 100644
index 000000000000..f11d6376a891
--- /dev/null
+++ b/net/ieee802154/6lowpan/Makefile
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
+obj-$(CONFIG_IEEE802154_6LOWPAN) += ieee802154_6lowpan.o
+
+ieee802154_6lowpan-y := core.o rx.o reassembly.o tx.o
diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c
new file mode 100644
index 000000000000..018929563c6b
--- /dev/null
+++ b/net/ieee802154/6lowpan/core.c
@@ -0,0 +1,289 @@
+/* Copyright 2011, Siemens AG
+ * written by Alexander Smirnov <alex.bluesman.smirnov@gmail.com>
+ */
+
+/* Based on patches from Jon Smirl <jonsmirl@gmail.com>
+ * Copyright (c) 2011 Jon Smirl <jonsmirl@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+/* Jon's code is based on 6lowpan implementation for Contiki which is:
+ * Copyright (c) 2008, Swedish Institute of Computer Science.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the Institute nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/ieee802154.h>
+#include <linux/if_arp.h>
+
+#include <net/ipv6.h>
+#include <net/netdev_lock.h>
+
+#include "6lowpan_i.h"
+
+static int open_count;
+
+static const struct header_ops lowpan_header_ops = {
+ .create = lowpan_header_create,
+};
+
+static int lowpan_dev_init(struct net_device *ldev)
+{
+ netdev_lockdep_set_classes(ldev);
+
+ return 0;
+}
+
+static int lowpan_open(struct net_device *dev)
+{
+ if (!open_count)
+ lowpan_rx_init();
+ open_count++;
+ return 0;
+}
+
+static int lowpan_stop(struct net_device *dev)
+{
+ open_count--;
+ if (!open_count)
+ lowpan_rx_exit();
+ return 0;
+}
+
+static int lowpan_neigh_construct(struct net_device *dev, struct neighbour *n)
+{
+ struct lowpan_802154_neigh *neigh = lowpan_802154_neigh(neighbour_priv(n));
+
+ /* default no short_addr is available for a neighbour */
+ neigh->short_addr = cpu_to_le16(IEEE802154_ADDR_SHORT_UNSPEC);
+ return 0;
+}
+
+static int lowpan_get_iflink(const struct net_device *dev)
+{
+ return READ_ONCE(lowpan_802154_dev(dev)->wdev->ifindex);
+}
+
+static const struct net_device_ops lowpan_netdev_ops = {
+ .ndo_init = lowpan_dev_init,
+ .ndo_start_xmit = lowpan_xmit,
+ .ndo_open = lowpan_open,
+ .ndo_stop = lowpan_stop,
+ .ndo_neigh_construct = lowpan_neigh_construct,
+ .ndo_get_iflink = lowpan_get_iflink,
+};
+
+static void lowpan_setup(struct net_device *ldev)
+{
+ memset(ldev->broadcast, 0xff, IEEE802154_ADDR_LEN);
+ /* We need an ipv6hdr as minimum len when calling xmit */
+ ldev->hard_header_len = sizeof(struct ipv6hdr);
+ ldev->flags = IFF_BROADCAST | IFF_MULTICAST;
+ ldev->priv_flags |= IFF_NO_QUEUE;
+
+ ldev->netdev_ops = &lowpan_netdev_ops;
+ ldev->header_ops = &lowpan_header_ops;
+ ldev->needs_free_netdev = true;
+ ldev->netns_immutable = true;
+}
+
+static int lowpan_validate(struct nlattr *tb[], struct nlattr *data[],
+ struct netlink_ext_ack *extack)
+{
+ if (tb[IFLA_ADDRESS]) {
+ if (nla_len(tb[IFLA_ADDRESS]) != IEEE802154_ADDR_LEN)
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static int lowpan_newlink(struct net_device *ldev,
+ struct rtnl_newlink_params *params,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr **tb = params->tb;
+ struct net_device *wdev;
+ int ret;
+
+ ASSERT_RTNL();
+
+ pr_debug("adding new link\n");
+
+ if (!tb[IFLA_LINK])
+ return -EINVAL;
+ if (params->link_net && !net_eq(params->link_net, dev_net(ldev)))
+ return -EINVAL;
+ /* find and hold wpan device */
+ wdev = dev_get_by_index(dev_net(ldev), nla_get_u32(tb[IFLA_LINK]));
+ if (!wdev)
+ return -ENODEV;
+ if (wdev->type != ARPHRD_IEEE802154) {
+ dev_put(wdev);
+ return -EINVAL;
+ }
+
+ if (wdev->ieee802154_ptr->lowpan_dev) {
+ dev_put(wdev);
+ return -EBUSY;
+ }
+
+ lowpan_802154_dev(ldev)->wdev = wdev;
+ /* Set the lowpan hardware address to the wpan hardware address. */
+ __dev_addr_set(ldev, wdev->dev_addr, IEEE802154_ADDR_LEN);
+ /* We need headroom for possible wpan_dev_hard_header call and tailroom
+ * for encryption/fcs handling. The lowpan interface will replace
+ * the IPv6 header with 6LoWPAN header. At worst case the 6LoWPAN
+ * header has LOWPAN_IPHC_MAX_HEADER_LEN more bytes than the IPv6
+ * header.
+ */
+ ldev->needed_headroom = LOWPAN_IPHC_MAX_HEADER_LEN +
+ wdev->needed_headroom;
+ ldev->needed_tailroom = wdev->needed_tailroom;
+
+ ldev->neigh_priv_len = sizeof(struct lowpan_802154_neigh);
+
+ ret = lowpan_register_netdevice(ldev, LOWPAN_LLTYPE_IEEE802154);
+ if (ret < 0) {
+ dev_put(wdev);
+ return ret;
+ }
+
+ wdev->ieee802154_ptr->lowpan_dev = ldev;
+ return 0;
+}
+
+static void lowpan_dellink(struct net_device *ldev, struct list_head *head)
+{
+ struct net_device *wdev = lowpan_802154_dev(ldev)->wdev;
+
+ ASSERT_RTNL();
+
+ wdev->ieee802154_ptr->lowpan_dev = NULL;
+ lowpan_unregister_netdevice(ldev);
+ dev_put(wdev);
+}
+
+static struct rtnl_link_ops lowpan_link_ops __read_mostly = {
+ .kind = "lowpan",
+ .priv_size = LOWPAN_PRIV_SIZE(sizeof(struct lowpan_802154_dev)),
+ .setup = lowpan_setup,
+ .newlink = lowpan_newlink,
+ .dellink = lowpan_dellink,
+ .validate = lowpan_validate,
+};
+
+static inline int __init lowpan_netlink_init(void)
+{
+ return rtnl_link_register(&lowpan_link_ops);
+}
+
+static inline void lowpan_netlink_fini(void)
+{
+ rtnl_link_unregister(&lowpan_link_ops);
+}
+
+static int lowpan_device_event(struct notifier_block *unused,
+ unsigned long event, void *ptr)
+{
+ struct net_device *ndev = netdev_notifier_info_to_dev(ptr);
+ struct wpan_dev *wpan_dev;
+
+ if (ndev->type != ARPHRD_IEEE802154)
+ return NOTIFY_DONE;
+ wpan_dev = ndev->ieee802154_ptr;
+ if (!wpan_dev)
+ return NOTIFY_DONE;
+
+ switch (event) {
+ case NETDEV_UNREGISTER:
+ /* Check if wpan interface is unregistered that we
+ * also delete possible lowpan interfaces which belongs
+ * to the wpan interface.
+ */
+ if (wpan_dev->lowpan_dev)
+ lowpan_dellink(wpan_dev->lowpan_dev, NULL);
+ break;
+ default:
+ return NOTIFY_DONE;
+ }
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block lowpan_dev_notifier = {
+ .notifier_call = lowpan_device_event,
+};
+
+static int __init lowpan_init_module(void)
+{
+ int err = 0;
+
+ err = lowpan_net_frag_init();
+ if (err < 0)
+ goto out;
+
+ err = lowpan_netlink_init();
+ if (err < 0)
+ goto out_frag;
+
+ err = register_netdevice_notifier(&lowpan_dev_notifier);
+ if (err < 0)
+ goto out_pack;
+
+ return 0;
+
+out_pack:
+ lowpan_netlink_fini();
+out_frag:
+ lowpan_net_frag_exit();
+out:
+ return err;
+}
+
+static void __exit lowpan_cleanup_module(void)
+{
+ lowpan_netlink_fini();
+
+ lowpan_net_frag_exit();
+
+ unregister_netdevice_notifier(&lowpan_dev_notifier);
+}
+
+module_init(lowpan_init_module);
+module_exit(lowpan_cleanup_module);
+MODULE_DESCRIPTION("IPv6 over Low power Wireless Personal Area Network IEEE 802.15.4 core");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_RTNL_LINK("lowpan");
diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c
new file mode 100644
index 000000000000..ddb6a5817d09
--- /dev/null
+++ b/net/ieee802154/6lowpan/reassembly.c
@@ -0,0 +1,558 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* 6LoWPAN fragment reassembly
+ *
+ * Authors:
+ * Alexander Aring <aar@pengutronix.de>
+ *
+ * Based on: net/ipv6/reassembly.c
+ */
+
+#define pr_fmt(fmt) "6LoWPAN: " fmt
+
+#include <linux/net.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/random.h>
+#include <linux/jhash.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+
+#include <net/ieee802154_netdev.h>
+#include <net/6lowpan.h>
+#include <net/ipv6_frag.h>
+#include <net/inet_frag.h>
+#include <net/ip.h>
+
+#include "6lowpan_i.h"
+
+static const char lowpan_frags_cache_name[] = "lowpan-frags";
+
+static struct inet_frags lowpan_frags;
+
+static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *skb,
+ struct sk_buff *prev, struct net_device *ldev,
+ int *refs);
+
+static void lowpan_frag_init(struct inet_frag_queue *q, const void *a)
+{
+ const struct frag_lowpan_compare_key *key = a;
+
+ BUILD_BUG_ON(sizeof(*key) > sizeof(q->key));
+ memcpy(&q->key, key, sizeof(*key));
+}
+
+static void lowpan_frag_expire(struct timer_list *t)
+{
+ struct inet_frag_queue *frag = timer_container_of(frag, t, timer);
+ struct frag_queue *fq;
+ int refs = 1;
+
+ fq = container_of(frag, struct frag_queue, q);
+
+ spin_lock(&fq->q.lock);
+
+ if (fq->q.flags & INET_FRAG_COMPLETE)
+ goto out;
+
+ inet_frag_kill(&fq->q, &refs);
+out:
+ spin_unlock(&fq->q.lock);
+ inet_frag_putn(&fq->q, refs);
+}
+
+static inline struct lowpan_frag_queue *
+fq_find(struct net *net, const struct lowpan_802154_cb *cb,
+ const struct ieee802154_addr *src,
+ const struct ieee802154_addr *dst)
+{
+ struct netns_ieee802154_lowpan *ieee802154_lowpan =
+ net_ieee802154_lowpan(net);
+ struct frag_lowpan_compare_key key = {};
+ struct inet_frag_queue *q;
+
+ key.tag = cb->d_tag;
+ key.d_size = cb->d_size;
+ key.src = *src;
+ key.dst = *dst;
+
+ q = inet_frag_find(ieee802154_lowpan->fqdir, &key);
+ if (!q)
+ return NULL;
+
+ return container_of(q, struct lowpan_frag_queue, q);
+}
+
+static int lowpan_frag_queue(struct lowpan_frag_queue *fq,
+ struct sk_buff *skb, u8 frag_type,
+ int *refs)
+{
+ struct sk_buff *prev_tail;
+ struct net_device *ldev;
+ int end, offset, err;
+
+ /* inet_frag_queue_* functions use skb->cb; see struct ipfrag_skb_cb
+ * in inet_fragment.c
+ */
+ BUILD_BUG_ON(sizeof(struct lowpan_802154_cb) > sizeof(struct inet_skb_parm));
+ BUILD_BUG_ON(sizeof(struct lowpan_802154_cb) > sizeof(struct inet6_skb_parm));
+
+ if (fq->q.flags & INET_FRAG_COMPLETE)
+ goto err;
+
+ offset = lowpan_802154_cb(skb)->d_offset << 3;
+ end = lowpan_802154_cb(skb)->d_size;
+
+ /* Is this the final fragment? */
+ if (offset + skb->len == end) {
+ /* If we already have some bits beyond end
+ * or have different end, the segment is corrupted.
+ */
+ if (end < fq->q.len ||
+ ((fq->q.flags & INET_FRAG_LAST_IN) && end != fq->q.len))
+ goto err;
+ fq->q.flags |= INET_FRAG_LAST_IN;
+ fq->q.len = end;
+ } else {
+ if (end > fq->q.len) {
+ /* Some bits beyond end -> corruption. */
+ if (fq->q.flags & INET_FRAG_LAST_IN)
+ goto err;
+ fq->q.len = end;
+ }
+ }
+
+ ldev = skb->dev;
+ if (ldev)
+ skb->dev = NULL;
+ barrier();
+
+ prev_tail = fq->q.fragments_tail;
+ err = inet_frag_queue_insert(&fq->q, skb, offset, end);
+ if (err)
+ goto err;
+
+ fq->q.stamp = skb->tstamp;
+ fq->q.tstamp_type = skb->tstamp_type;
+ if (frag_type == LOWPAN_DISPATCH_FRAG1)
+ fq->q.flags |= INET_FRAG_FIRST_IN;
+
+ fq->q.meat += skb->len;
+ add_frag_mem_limit(fq->q.fqdir, skb->truesize);
+
+ if (fq->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
+ fq->q.meat == fq->q.len) {
+ int res;
+ unsigned long orefdst = skb->_skb_refdst;
+
+ skb->_skb_refdst = 0UL;
+ res = lowpan_frag_reasm(fq, skb, prev_tail, ldev, refs);
+ skb->_skb_refdst = orefdst;
+ return res;
+ }
+ skb_dst_drop(skb);
+
+ return -1;
+err:
+ kfree_skb(skb);
+ return -1;
+}
+
+/* Check if this packet is complete.
+ *
+ * It is called with locked fq, and caller must check that
+ * queue is eligible for reassembly i.e. it is not COMPLETE,
+ * the last and the first frames arrived and all the bits are here.
+ */
+static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *skb,
+ struct sk_buff *prev_tail, struct net_device *ldev,
+ int *refs)
+{
+ void *reasm_data;
+
+ inet_frag_kill(&fq->q, refs);
+
+ reasm_data = inet_frag_reasm_prepare(&fq->q, skb, prev_tail);
+ if (!reasm_data)
+ goto out_oom;
+ inet_frag_reasm_finish(&fq->q, skb, reasm_data, false);
+
+ skb->dev = ldev;
+ skb->tstamp = fq->q.stamp;
+ fq->q.rb_fragments = RB_ROOT;
+ fq->q.fragments_tail = NULL;
+ fq->q.last_run_head = NULL;
+
+ return 1;
+out_oom:
+ net_dbg_ratelimited("lowpan_frag_reasm: no memory for reassembly\n");
+ return -1;
+}
+
+static int lowpan_frag_rx_handlers_result(struct sk_buff *skb,
+ lowpan_rx_result res)
+{
+ switch (res) {
+ case RX_QUEUED:
+ return NET_RX_SUCCESS;
+ case RX_CONTINUE:
+ /* nobody cared about this packet */
+ net_warn_ratelimited("%s: received unknown dispatch\n",
+ __func__);
+
+ fallthrough;
+ default:
+ /* all others failure */
+ return NET_RX_DROP;
+ }
+}
+
+static lowpan_rx_result lowpan_frag_rx_h_iphc(struct sk_buff *skb)
+{
+ int ret;
+
+ if (!lowpan_is_iphc(*skb_network_header(skb)))
+ return RX_CONTINUE;
+
+ ret = lowpan_iphc_decompress(skb);
+ if (ret < 0)
+ return RX_DROP;
+
+ return RX_QUEUED;
+}
+
+static int lowpan_invoke_frag_rx_handlers(struct sk_buff *skb)
+{
+ lowpan_rx_result res;
+
+#define CALL_RXH(rxh) \
+ do { \
+ res = rxh(skb); \
+ if (res != RX_CONTINUE) \
+ goto rxh_next; \
+ } while (0)
+
+ /* likely at first */
+ CALL_RXH(lowpan_frag_rx_h_iphc);
+ CALL_RXH(lowpan_rx_h_ipv6);
+
+rxh_next:
+ return lowpan_frag_rx_handlers_result(skb, res);
+#undef CALL_RXH
+}
+
+#define LOWPAN_FRAG_DGRAM_SIZE_HIGH_MASK 0x07
+#define LOWPAN_FRAG_DGRAM_SIZE_HIGH_SHIFT 8
+
+static int lowpan_get_cb(struct sk_buff *skb, u8 frag_type,
+ struct lowpan_802154_cb *cb)
+{
+ bool fail;
+ u8 high = 0, low = 0;
+ __be16 d_tag = 0;
+
+ fail = lowpan_fetch_skb(skb, &high, 1);
+ fail |= lowpan_fetch_skb(skb, &low, 1);
+ /* remove the dispatch value and use first three bits as high value
+ * for the datagram size
+ */
+ cb->d_size = (high & LOWPAN_FRAG_DGRAM_SIZE_HIGH_MASK) <<
+ LOWPAN_FRAG_DGRAM_SIZE_HIGH_SHIFT | low;
+ fail |= lowpan_fetch_skb(skb, &d_tag, 2);
+ cb->d_tag = ntohs(d_tag);
+
+ if (frag_type == LOWPAN_DISPATCH_FRAGN) {
+ fail |= lowpan_fetch_skb(skb, &cb->d_offset, 1);
+ } else {
+ skb_reset_network_header(skb);
+ cb->d_offset = 0;
+ /* check if datagram_size has ipv6hdr on FRAG1 */
+ fail |= cb->d_size < sizeof(struct ipv6hdr);
+ /* check if we can dereference the dispatch value */
+ fail |= !skb->len;
+ }
+
+ if (unlikely(fail))
+ return -EIO;
+
+ return 0;
+}
+
+int lowpan_frag_rcv(struct sk_buff *skb, u8 frag_type)
+{
+ struct lowpan_frag_queue *fq;
+ struct net *net = dev_net(skb->dev);
+ struct lowpan_802154_cb *cb = lowpan_802154_cb(skb);
+ struct ieee802154_hdr hdr = {};
+ int err;
+
+ if (ieee802154_hdr_peek_addrs(skb, &hdr) < 0)
+ goto err;
+
+ err = lowpan_get_cb(skb, frag_type, cb);
+ if (err < 0)
+ goto err;
+
+ if (frag_type == LOWPAN_DISPATCH_FRAG1) {
+ err = lowpan_invoke_frag_rx_handlers(skb);
+ if (err == NET_RX_DROP)
+ goto err;
+ }
+
+ if (cb->d_size > IPV6_MIN_MTU) {
+ net_warn_ratelimited("lowpan_frag_rcv: datagram size exceeds MTU\n");
+ goto err;
+ }
+
+ rcu_read_lock();
+ fq = fq_find(net, cb, &hdr.source, &hdr.dest);
+ if (fq != NULL) {
+ int ret, refs = 0;
+
+ spin_lock(&fq->q.lock);
+ ret = lowpan_frag_queue(fq, skb, frag_type, &refs);
+ spin_unlock(&fq->q.lock);
+
+ rcu_read_unlock();
+ inet_frag_putn(&fq->q, refs);
+ return ret;
+ }
+ rcu_read_unlock();
+
+err:
+ kfree_skb(skb);
+ return -1;
+}
+
+#ifdef CONFIG_SYSCTL
+
+static struct ctl_table lowpan_frags_ns_ctl_table[] = {
+ {
+ .procname = "6lowpanfrag_high_thresh",
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ },
+ {
+ .procname = "6lowpanfrag_low_thresh",
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ },
+ {
+ .procname = "6lowpanfrag_time",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+};
+
+/* secret interval has been deprecated */
+static int lowpan_frags_secret_interval_unused;
+static struct ctl_table lowpan_frags_ctl_table[] = {
+ {
+ .procname = "6lowpanfrag_secret_interval",
+ .data = &lowpan_frags_secret_interval_unused,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+};
+
+static int __net_init lowpan_frags_ns_sysctl_register(struct net *net)
+{
+ struct ctl_table *table;
+ struct ctl_table_header *hdr;
+ struct netns_ieee802154_lowpan *ieee802154_lowpan =
+ net_ieee802154_lowpan(net);
+ size_t table_size = ARRAY_SIZE(lowpan_frags_ns_ctl_table);
+
+ table = lowpan_frags_ns_ctl_table;
+ if (!net_eq(net, &init_net)) {
+ table = kmemdup(table, sizeof(lowpan_frags_ns_ctl_table),
+ GFP_KERNEL);
+ if (table == NULL)
+ goto err_alloc;
+
+ /* Don't export sysctls to unprivileged users */
+ if (net->user_ns != &init_user_ns)
+ table_size = 0;
+ }
+
+ table[0].data = &ieee802154_lowpan->fqdir->high_thresh;
+ table[0].extra1 = &ieee802154_lowpan->fqdir->low_thresh;
+ table[1].data = &ieee802154_lowpan->fqdir->low_thresh;
+ table[1].extra2 = &ieee802154_lowpan->fqdir->high_thresh;
+ table[2].data = &ieee802154_lowpan->fqdir->timeout;
+
+ hdr = register_net_sysctl_sz(net, "net/ieee802154/6lowpan", table,
+ table_size);
+ if (hdr == NULL)
+ goto err_reg;
+
+ ieee802154_lowpan->sysctl.frags_hdr = hdr;
+ return 0;
+
+err_reg:
+ if (!net_eq(net, &init_net))
+ kfree(table);
+err_alloc:
+ return -ENOMEM;
+}
+
+static void __net_exit lowpan_frags_ns_sysctl_unregister(struct net *net)
+{
+ const struct ctl_table *table;
+ struct netns_ieee802154_lowpan *ieee802154_lowpan =
+ net_ieee802154_lowpan(net);
+
+ table = ieee802154_lowpan->sysctl.frags_hdr->ctl_table_arg;
+ unregister_net_sysctl_table(ieee802154_lowpan->sysctl.frags_hdr);
+ if (!net_eq(net, &init_net))
+ kfree(table);
+}
+
+static struct ctl_table_header *lowpan_ctl_header;
+
+static int __init lowpan_frags_sysctl_register(void)
+{
+ lowpan_ctl_header = register_net_sysctl(&init_net,
+ "net/ieee802154/6lowpan",
+ lowpan_frags_ctl_table);
+ return lowpan_ctl_header == NULL ? -ENOMEM : 0;
+}
+
+static void lowpan_frags_sysctl_unregister(void)
+{
+ unregister_net_sysctl_table(lowpan_ctl_header);
+}
+#else
+static inline int lowpan_frags_ns_sysctl_register(struct net *net)
+{
+ return 0;
+}
+
+static inline void lowpan_frags_ns_sysctl_unregister(struct net *net)
+{
+}
+
+static inline int __init lowpan_frags_sysctl_register(void)
+{
+ return 0;
+}
+
+static inline void lowpan_frags_sysctl_unregister(void)
+{
+}
+#endif
+
+static int __net_init lowpan_frags_init_net(struct net *net)
+{
+ struct netns_ieee802154_lowpan *ieee802154_lowpan =
+ net_ieee802154_lowpan(net);
+ int res;
+
+
+ res = fqdir_init(&ieee802154_lowpan->fqdir, &lowpan_frags, net);
+ if (res < 0)
+ return res;
+
+ ieee802154_lowpan->fqdir->high_thresh = IPV6_FRAG_HIGH_THRESH;
+ ieee802154_lowpan->fqdir->low_thresh = IPV6_FRAG_LOW_THRESH;
+ ieee802154_lowpan->fqdir->timeout = IPV6_FRAG_TIMEOUT;
+
+ res = lowpan_frags_ns_sysctl_register(net);
+ if (res < 0)
+ fqdir_exit(ieee802154_lowpan->fqdir);
+ return res;
+}
+
+static void __net_exit lowpan_frags_pre_exit_net(struct net *net)
+{
+ struct netns_ieee802154_lowpan *ieee802154_lowpan =
+ net_ieee802154_lowpan(net);
+
+ fqdir_pre_exit(ieee802154_lowpan->fqdir);
+}
+
+static void __net_exit lowpan_frags_exit_net(struct net *net)
+{
+ struct netns_ieee802154_lowpan *ieee802154_lowpan =
+ net_ieee802154_lowpan(net);
+
+ lowpan_frags_ns_sysctl_unregister(net);
+ fqdir_exit(ieee802154_lowpan->fqdir);
+}
+
+static struct pernet_operations lowpan_frags_ops = {
+ .init = lowpan_frags_init_net,
+ .pre_exit = lowpan_frags_pre_exit_net,
+ .exit = lowpan_frags_exit_net,
+};
+
+static u32 lowpan_key_hashfn(const void *data, u32 len, u32 seed)
+{
+ return jhash2(data,
+ sizeof(struct frag_lowpan_compare_key) / sizeof(u32), seed);
+}
+
+static u32 lowpan_obj_hashfn(const void *data, u32 len, u32 seed)
+{
+ const struct inet_frag_queue *fq = data;
+
+ return jhash2((const u32 *)&fq->key,
+ sizeof(struct frag_lowpan_compare_key) / sizeof(u32), seed);
+}
+
+static int lowpan_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr)
+{
+ const struct frag_lowpan_compare_key *key = arg->key;
+ const struct inet_frag_queue *fq = ptr;
+
+ return !!memcmp(&fq->key, key, sizeof(*key));
+}
+
+static const struct rhashtable_params lowpan_rhash_params = {
+ .head_offset = offsetof(struct inet_frag_queue, node),
+ .hashfn = lowpan_key_hashfn,
+ .obj_hashfn = lowpan_obj_hashfn,
+ .obj_cmpfn = lowpan_obj_cmpfn,
+ .automatic_shrinking = true,
+};
+
+int __init lowpan_net_frag_init(void)
+{
+ int ret;
+
+ lowpan_frags.constructor = lowpan_frag_init;
+ lowpan_frags.destructor = NULL;
+ lowpan_frags.qsize = sizeof(struct frag_queue);
+ lowpan_frags.frag_expire = lowpan_frag_expire;
+ lowpan_frags.frags_cache_name = lowpan_frags_cache_name;
+ lowpan_frags.rhash_params = lowpan_rhash_params;
+ ret = inet_frags_init(&lowpan_frags);
+ if (ret)
+ goto out;
+
+ ret = lowpan_frags_sysctl_register();
+ if (ret)
+ goto err_sysctl;
+
+ ret = register_pernet_subsys(&lowpan_frags_ops);
+ if (ret)
+ goto err_pernet;
+out:
+ return ret;
+err_pernet:
+ lowpan_frags_sysctl_unregister();
+err_sysctl:
+ inet_frags_fini(&lowpan_frags);
+ return ret;
+}
+
+void lowpan_net_frag_exit(void)
+{
+ lowpan_frags_sysctl_unregister();
+ unregister_pernet_subsys(&lowpan_frags_ops);
+ inet_frags_fini(&lowpan_frags);
+}
diff --git a/net/ieee802154/6lowpan/rx.c b/net/ieee802154/6lowpan/rx.c
new file mode 100644
index 000000000000..517e6493f5d1
--- /dev/null
+++ b/net/ieee802154/6lowpan/rx.c
@@ -0,0 +1,323 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/if_arp.h>
+
+#include <net/6lowpan.h>
+#include <net/mac802154.h>
+#include <net/ieee802154_netdev.h>
+
+#include "6lowpan_i.h"
+
+#define LOWPAN_DISPATCH_FIRST 0xc0
+#define LOWPAN_DISPATCH_FRAG_MASK 0xf8
+
+#define LOWPAN_DISPATCH_NALP 0x00
+#define LOWPAN_DISPATCH_ESC 0x40
+#define LOWPAN_DISPATCH_HC1 0x42
+#define LOWPAN_DISPATCH_DFF 0x43
+#define LOWPAN_DISPATCH_BC0 0x50
+#define LOWPAN_DISPATCH_MESH 0x80
+
+static int lowpan_give_skb_to_device(struct sk_buff *skb)
+{
+ skb->protocol = htons(ETH_P_IPV6);
+ skb->dev->stats.rx_packets++;
+ skb->dev->stats.rx_bytes += skb->len;
+
+ return netif_rx(skb);
+}
+
+static int lowpan_rx_handlers_result(struct sk_buff *skb, lowpan_rx_result res)
+{
+ switch (res) {
+ case RX_CONTINUE:
+ /* nobody cared about this packet */
+ net_warn_ratelimited("%s: received unknown dispatch\n",
+ __func__);
+
+ fallthrough;
+ case RX_DROP_UNUSABLE:
+ kfree_skb(skb);
+
+ fallthrough;
+ case RX_DROP:
+ return NET_RX_DROP;
+ case RX_QUEUED:
+ return lowpan_give_skb_to_device(skb);
+ default:
+ break;
+ }
+
+ return NET_RX_DROP;
+}
+
+static inline bool lowpan_is_frag1(u8 dispatch)
+{
+ return (dispatch & LOWPAN_DISPATCH_FRAG_MASK) == LOWPAN_DISPATCH_FRAG1;
+}
+
+static inline bool lowpan_is_fragn(u8 dispatch)
+{
+ return (dispatch & LOWPAN_DISPATCH_FRAG_MASK) == LOWPAN_DISPATCH_FRAGN;
+}
+
+static lowpan_rx_result lowpan_rx_h_frag(struct sk_buff *skb)
+{
+ int ret;
+
+ if (!(lowpan_is_frag1(*skb_network_header(skb)) ||
+ lowpan_is_fragn(*skb_network_header(skb))))
+ return RX_CONTINUE;
+
+ ret = lowpan_frag_rcv(skb, *skb_network_header(skb) &
+ LOWPAN_DISPATCH_FRAG_MASK);
+ if (ret == 1)
+ return RX_QUEUED;
+
+ /* Packet is freed by lowpan_frag_rcv on error or put into the frag
+ * bucket.
+ */
+ return RX_DROP;
+}
+
+int lowpan_iphc_decompress(struct sk_buff *skb)
+{
+ struct ieee802154_hdr hdr;
+
+ if (ieee802154_hdr_peek_addrs(skb, &hdr) < 0)
+ return -EINVAL;
+
+ return lowpan_header_decompress(skb, skb->dev, &hdr.dest, &hdr.source);
+}
+
+static lowpan_rx_result lowpan_rx_h_iphc(struct sk_buff *skb)
+{
+ int ret;
+
+ if (!lowpan_is_iphc(*skb_network_header(skb)))
+ return RX_CONTINUE;
+
+ /* Setting datagram_offset to zero indicates non frag handling
+ * while doing lowpan_header_decompress.
+ */
+ lowpan_802154_cb(skb)->d_size = 0;
+
+ ret = lowpan_iphc_decompress(skb);
+ if (ret < 0)
+ return RX_DROP_UNUSABLE;
+
+ return RX_QUEUED;
+}
+
+lowpan_rx_result lowpan_rx_h_ipv6(struct sk_buff *skb)
+{
+ if (!lowpan_is_ipv6(*skb_network_header(skb)))
+ return RX_CONTINUE;
+
+ /* Pull off the 1-byte of 6lowpan header. */
+ skb_pull(skb, 1);
+ return RX_QUEUED;
+}
+
+static inline bool lowpan_is_esc(u8 dispatch)
+{
+ return dispatch == LOWPAN_DISPATCH_ESC;
+}
+
+static lowpan_rx_result lowpan_rx_h_esc(struct sk_buff *skb)
+{
+ if (!lowpan_is_esc(*skb_network_header(skb)))
+ return RX_CONTINUE;
+
+ net_warn_ratelimited("%s: %s\n", skb->dev->name,
+ "6LoWPAN ESC not supported\n");
+
+ return RX_DROP_UNUSABLE;
+}
+
+static inline bool lowpan_is_hc1(u8 dispatch)
+{
+ return dispatch == LOWPAN_DISPATCH_HC1;
+}
+
+static lowpan_rx_result lowpan_rx_h_hc1(struct sk_buff *skb)
+{
+ if (!lowpan_is_hc1(*skb_network_header(skb)))
+ return RX_CONTINUE;
+
+ net_warn_ratelimited("%s: %s\n", skb->dev->name,
+ "6LoWPAN HC1 not supported\n");
+
+ return RX_DROP_UNUSABLE;
+}
+
+static inline bool lowpan_is_dff(u8 dispatch)
+{
+ return dispatch == LOWPAN_DISPATCH_DFF;
+}
+
+static lowpan_rx_result lowpan_rx_h_dff(struct sk_buff *skb)
+{
+ if (!lowpan_is_dff(*skb_network_header(skb)))
+ return RX_CONTINUE;
+
+ net_warn_ratelimited("%s: %s\n", skb->dev->name,
+ "6LoWPAN DFF not supported\n");
+
+ return RX_DROP_UNUSABLE;
+}
+
+static inline bool lowpan_is_bc0(u8 dispatch)
+{
+ return dispatch == LOWPAN_DISPATCH_BC0;
+}
+
+static lowpan_rx_result lowpan_rx_h_bc0(struct sk_buff *skb)
+{
+ if (!lowpan_is_bc0(*skb_network_header(skb)))
+ return RX_CONTINUE;
+
+ net_warn_ratelimited("%s: %s\n", skb->dev->name,
+ "6LoWPAN BC0 not supported\n");
+
+ return RX_DROP_UNUSABLE;
+}
+
+static inline bool lowpan_is_mesh(u8 dispatch)
+{
+ return (dispatch & LOWPAN_DISPATCH_FIRST) == LOWPAN_DISPATCH_MESH;
+}
+
+static lowpan_rx_result lowpan_rx_h_mesh(struct sk_buff *skb)
+{
+ if (!lowpan_is_mesh(*skb_network_header(skb)))
+ return RX_CONTINUE;
+
+ net_warn_ratelimited("%s: %s\n", skb->dev->name,
+ "6LoWPAN MESH not supported\n");
+
+ return RX_DROP_UNUSABLE;
+}
+
+static int lowpan_invoke_rx_handlers(struct sk_buff *skb)
+{
+ lowpan_rx_result res;
+
+#define CALL_RXH(rxh) \
+ do { \
+ res = rxh(skb); \
+ if (res != RX_CONTINUE) \
+ goto rxh_next; \
+ } while (0)
+
+ /* likely at first */
+ CALL_RXH(lowpan_rx_h_iphc);
+ CALL_RXH(lowpan_rx_h_frag);
+ CALL_RXH(lowpan_rx_h_ipv6);
+ CALL_RXH(lowpan_rx_h_esc);
+ CALL_RXH(lowpan_rx_h_hc1);
+ CALL_RXH(lowpan_rx_h_dff);
+ CALL_RXH(lowpan_rx_h_bc0);
+ CALL_RXH(lowpan_rx_h_mesh);
+
+rxh_next:
+ return lowpan_rx_handlers_result(skb, res);
+#undef CALL_RXH
+}
+
+static inline bool lowpan_is_nalp(u8 dispatch)
+{
+ return (dispatch & LOWPAN_DISPATCH_FIRST) == LOWPAN_DISPATCH_NALP;
+}
+
+/* Lookup for reserved dispatch values at:
+ * https://www.iana.org/assignments/_6lowpan-parameters/_6lowpan-parameters.xhtml#_6lowpan-parameters-1
+ *
+ * Last Updated: 2015-01-22
+ */
+static inline bool lowpan_is_reserved(u8 dispatch)
+{
+ return ((dispatch >= 0x44 && dispatch <= 0x4F) ||
+ (dispatch >= 0x51 && dispatch <= 0x5F) ||
+ (dispatch >= 0xc8 && dispatch <= 0xdf) ||
+ dispatch >= 0xe8);
+}
+
+/* lowpan_rx_h_check checks on generic 6LoWPAN requirements
+ * in MAC and 6LoWPAN header.
+ *
+ * Don't manipulate the skb here, it could be shared buffer.
+ */
+static inline bool lowpan_rx_h_check(struct sk_buff *skb)
+{
+ __le16 fc = ieee802154_get_fc_from_skb(skb);
+
+ /* check on ieee802154 conform 6LoWPAN header */
+ if (!ieee802154_is_data(fc) ||
+ !ieee802154_skb_is_intra_pan_addressing(fc, skb))
+ return false;
+
+ /* check if we can dereference the dispatch */
+ if (unlikely(!skb->len))
+ return false;
+
+ if (lowpan_is_nalp(*skb_network_header(skb)) ||
+ lowpan_is_reserved(*skb_network_header(skb)))
+ return false;
+
+ return true;
+}
+
+static int lowpan_rcv(struct sk_buff *skb, struct net_device *wdev,
+ struct packet_type *pt, struct net_device *orig_wdev)
+{
+ struct net_device *ldev;
+
+ if (wdev->type != ARPHRD_IEEE802154 ||
+ skb->pkt_type == PACKET_OTHERHOST ||
+ !lowpan_rx_h_check(skb))
+ goto drop;
+
+ ldev = wdev->ieee802154_ptr->lowpan_dev;
+ if (!ldev || !netif_running(ldev))
+ goto drop;
+
+ /* Replacing skb->dev and followed rx handlers will manipulate skb. */
+ skb = skb_share_check(skb, GFP_ATOMIC);
+ if (!skb)
+ goto out;
+ skb->dev = ldev;
+
+ /* When receive frag1 it's likely that we manipulate the buffer.
+ * When recevie iphc we manipulate the data buffer. So we need
+ * to unshare the buffer.
+ */
+ if (lowpan_is_frag1(*skb_network_header(skb)) ||
+ lowpan_is_iphc(*skb_network_header(skb))) {
+ skb = skb_unshare(skb, GFP_ATOMIC);
+ if (!skb)
+ goto out;
+ }
+
+ return lowpan_invoke_rx_handlers(skb);
+
+drop:
+ kfree_skb(skb);
+out:
+ return NET_RX_DROP;
+}
+
+static struct packet_type lowpan_packet_type = {
+ .type = htons(ETH_P_IEEE802154),
+ .func = lowpan_rcv,
+};
+
+void lowpan_rx_init(void)
+{
+ dev_add_pack(&lowpan_packet_type);
+}
+
+void lowpan_rx_exit(void)
+{
+ dev_remove_pack(&lowpan_packet_type);
+}
diff --git a/net/ieee802154/6lowpan/tx.c b/net/ieee802154/6lowpan/tx.c
new file mode 100644
index 000000000000..0c07662b44c0
--- /dev/null
+++ b/net/ieee802154/6lowpan/tx.c
@@ -0,0 +1,309 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <net/6lowpan.h>
+#include <net/ndisc.h>
+#include <net/ieee802154_netdev.h>
+#include <net/mac802154.h>
+
+#include "6lowpan_i.h"
+
+#define LOWPAN_FRAG1_HEAD_SIZE 0x4
+#define LOWPAN_FRAGN_HEAD_SIZE 0x5
+
+struct lowpan_addr_info {
+ struct ieee802154_addr daddr;
+ struct ieee802154_addr saddr;
+};
+
+static inline struct
+lowpan_addr_info *lowpan_skb_priv(const struct sk_buff *skb)
+{
+ WARN_ON_ONCE(skb_headroom(skb) < sizeof(struct lowpan_addr_info));
+ return (struct lowpan_addr_info *)(skb->data -
+ sizeof(struct lowpan_addr_info));
+}
+
+/* This callback will be called from AF_PACKET and IPv6 stack, the AF_PACKET
+ * sockets gives an 8 byte array for addresses only!
+ *
+ * TODO I think AF_PACKET DGRAM (sending/receiving) RAW (sending) makes no
+ * sense here. We should disable it, the right use-case would be AF_INET6
+ * RAW/DGRAM sockets.
+ */
+int lowpan_header_create(struct sk_buff *skb, struct net_device *ldev,
+ unsigned short type, const void *daddr,
+ const void *saddr, unsigned int len)
+{
+ struct wpan_dev *wpan_dev = lowpan_802154_dev(ldev)->wdev->ieee802154_ptr;
+ struct lowpan_addr_info *info = lowpan_skb_priv(skb);
+ struct lowpan_802154_neigh *llneigh = NULL;
+ const struct ipv6hdr *hdr = ipv6_hdr(skb);
+ struct neighbour *n;
+
+ if (!daddr)
+ return -EINVAL;
+
+ /* TODO:
+ * if this package isn't ipv6 one, where should it be routed?
+ */
+ if (type != ETH_P_IPV6)
+ return 0;
+
+ /* intra-pan communication */
+ info->saddr.pan_id = wpan_dev->pan_id;
+ info->daddr.pan_id = info->saddr.pan_id;
+
+ if (!memcmp(daddr, ldev->broadcast, EUI64_ADDR_LEN)) {
+ info->daddr.short_addr = cpu_to_le16(IEEE802154_ADDR_BROADCAST);
+ info->daddr.mode = IEEE802154_ADDR_SHORT;
+ } else {
+ __le16 short_addr = cpu_to_le16(IEEE802154_ADDR_SHORT_UNSPEC);
+
+ n = neigh_lookup(&nd_tbl, &hdr->daddr, ldev);
+ if (n) {
+ llneigh = lowpan_802154_neigh(neighbour_priv(n));
+ read_lock_bh(&n->lock);
+ short_addr = llneigh->short_addr;
+ read_unlock_bh(&n->lock);
+ }
+
+ if (llneigh &&
+ lowpan_802154_is_valid_src_short_addr(short_addr)) {
+ info->daddr.short_addr = short_addr;
+ info->daddr.mode = IEEE802154_ADDR_SHORT;
+ } else {
+ info->daddr.mode = IEEE802154_ADDR_LONG;
+ ieee802154_be64_to_le64(&info->daddr.extended_addr,
+ daddr);
+ }
+
+ if (n)
+ neigh_release(n);
+ }
+
+ if (!saddr) {
+ if (lowpan_802154_is_valid_src_short_addr(wpan_dev->short_addr)) {
+ info->saddr.mode = IEEE802154_ADDR_SHORT;
+ info->saddr.short_addr = wpan_dev->short_addr;
+ } else {
+ info->saddr.mode = IEEE802154_ADDR_LONG;
+ info->saddr.extended_addr = wpan_dev->extended_addr;
+ }
+ } else {
+ info->saddr.mode = IEEE802154_ADDR_LONG;
+ ieee802154_be64_to_le64(&info->saddr.extended_addr, saddr);
+ }
+
+ return 0;
+}
+
+static struct sk_buff*
+lowpan_alloc_frag(struct sk_buff *skb, int size,
+ const struct ieee802154_hdr *master_hdr, bool frag1)
+{
+ struct net_device *wdev = lowpan_802154_dev(skb->dev)->wdev;
+ struct sk_buff *frag;
+ int rc;
+
+ frag = alloc_skb(wdev->needed_headroom + wdev->needed_tailroom + size,
+ GFP_ATOMIC);
+
+ if (likely(frag)) {
+ frag->dev = wdev;
+ frag->priority = skb->priority;
+ skb_reserve(frag, wdev->needed_headroom);
+ skb_reset_network_header(frag);
+ *mac_cb(frag) = *mac_cb(skb);
+
+ if (frag1) {
+ skb_put_data(frag, skb_mac_header(skb), skb->mac_len);
+ } else {
+ rc = wpan_dev_hard_header(frag, wdev,
+ &master_hdr->dest,
+ &master_hdr->source, size);
+ if (rc < 0) {
+ kfree_skb(frag);
+ return ERR_PTR(rc);
+ }
+ }
+ } else {
+ frag = ERR_PTR(-ENOMEM);
+ }
+
+ return frag;
+}
+
+static int
+lowpan_xmit_fragment(struct sk_buff *skb, const struct ieee802154_hdr *wpan_hdr,
+ u8 *frag_hdr, int frag_hdrlen,
+ int offset, int len, bool frag1)
+{
+ struct sk_buff *frag;
+
+ raw_dump_inline(__func__, " fragment header", frag_hdr, frag_hdrlen);
+
+ frag = lowpan_alloc_frag(skb, frag_hdrlen + len, wpan_hdr, frag1);
+ if (IS_ERR(frag))
+ return PTR_ERR(frag);
+
+ skb_put_data(frag, frag_hdr, frag_hdrlen);
+ skb_put_data(frag, skb_network_header(skb) + offset, len);
+
+ raw_dump_table(__func__, " fragment dump", frag->data, frag->len);
+
+ return dev_queue_xmit(frag);
+}
+
+static int
+lowpan_xmit_fragmented(struct sk_buff *skb, struct net_device *ldev,
+ const struct ieee802154_hdr *wpan_hdr, u16 dgram_size,
+ u16 dgram_offset)
+{
+ __be16 frag_tag;
+ u8 frag_hdr[5];
+ int frag_cap, frag_len, payload_cap, rc;
+ int skb_unprocessed, skb_offset;
+
+ frag_tag = htons(lowpan_802154_dev(ldev)->fragment_tag);
+ lowpan_802154_dev(ldev)->fragment_tag++;
+
+ frag_hdr[0] = LOWPAN_DISPATCH_FRAG1 | ((dgram_size >> 8) & 0x07);
+ frag_hdr[1] = dgram_size & 0xff;
+ memcpy(frag_hdr + 2, &frag_tag, sizeof(frag_tag));
+
+ payload_cap = ieee802154_max_payload(wpan_hdr);
+
+ frag_len = round_down(payload_cap - LOWPAN_FRAG1_HEAD_SIZE -
+ skb_network_header_len(skb), 8);
+
+ skb_offset = skb_network_header_len(skb);
+ skb_unprocessed = skb->len - skb->mac_len - skb_offset;
+
+ rc = lowpan_xmit_fragment(skb, wpan_hdr, frag_hdr,
+ LOWPAN_FRAG1_HEAD_SIZE, 0,
+ frag_len + skb_network_header_len(skb),
+ true);
+ if (rc) {
+ pr_debug("%s unable to send FRAG1 packet (tag: %d)",
+ __func__, ntohs(frag_tag));
+ goto err;
+ }
+
+ frag_hdr[0] &= ~LOWPAN_DISPATCH_FRAG1;
+ frag_hdr[0] |= LOWPAN_DISPATCH_FRAGN;
+ frag_cap = round_down(payload_cap - LOWPAN_FRAGN_HEAD_SIZE, 8);
+
+ do {
+ dgram_offset += frag_len;
+ skb_offset += frag_len;
+ skb_unprocessed -= frag_len;
+ frag_len = min(frag_cap, skb_unprocessed);
+
+ frag_hdr[4] = dgram_offset >> 3;
+
+ rc = lowpan_xmit_fragment(skb, wpan_hdr, frag_hdr,
+ LOWPAN_FRAGN_HEAD_SIZE, skb_offset,
+ frag_len, false);
+ if (rc) {
+ pr_debug("%s unable to send a FRAGN packet. (tag: %d, offset: %d)\n",
+ __func__, ntohs(frag_tag), skb_offset);
+ goto err;
+ }
+ } while (skb_unprocessed > frag_cap);
+
+ ldev->stats.tx_packets++;
+ ldev->stats.tx_bytes += dgram_size;
+ consume_skb(skb);
+ return NET_XMIT_SUCCESS;
+
+err:
+ kfree_skb(skb);
+ return rc;
+}
+
+static int lowpan_header(struct sk_buff *skb, struct net_device *ldev,
+ u16 *dgram_size, u16 *dgram_offset)
+{
+ struct wpan_dev *wpan_dev = lowpan_802154_dev(ldev)->wdev->ieee802154_ptr;
+ struct ieee802154_mac_cb *cb = mac_cb_init(skb);
+ struct lowpan_addr_info info;
+
+ memcpy(&info, lowpan_skb_priv(skb), sizeof(info));
+
+ *dgram_size = skb->len;
+ lowpan_header_compress(skb, ldev, &info.daddr, &info.saddr);
+ /* dgram_offset = (saved bytes after compression) + lowpan header len */
+ *dgram_offset = (*dgram_size - skb->len) + skb_network_header_len(skb);
+
+ cb->type = IEEE802154_FC_TYPE_DATA;
+
+ if (info.daddr.mode == IEEE802154_ADDR_SHORT &&
+ ieee802154_is_broadcast_short_addr(info.daddr.short_addr))
+ cb->ackreq = false;
+ else
+ cb->ackreq = wpan_dev->ackreq;
+
+ return wpan_dev_hard_header(skb, lowpan_802154_dev(ldev)->wdev,
+ &info.daddr, &info.saddr, 0);
+}
+
+netdev_tx_t lowpan_xmit(struct sk_buff *skb, struct net_device *ldev)
+{
+ struct ieee802154_hdr wpan_hdr;
+ int max_single, ret;
+ u16 dgram_size, dgram_offset;
+
+ pr_debug("package xmit\n");
+
+ WARN_ON_ONCE(skb->len > IPV6_MIN_MTU);
+
+ /* We must take a copy of the skb before we modify/replace the ipv6
+ * header as the header could be used elsewhere
+ */
+ if (unlikely(skb_headroom(skb) < ldev->needed_headroom ||
+ skb_tailroom(skb) < ldev->needed_tailroom)) {
+ struct sk_buff *nskb;
+
+ nskb = skb_copy_expand(skb, ldev->needed_headroom,
+ ldev->needed_tailroom, GFP_ATOMIC);
+ if (likely(nskb)) {
+ consume_skb(skb);
+ skb = nskb;
+ } else {
+ kfree_skb(skb);
+ return NET_XMIT_DROP;
+ }
+ } else {
+ skb = skb_unshare(skb, GFP_ATOMIC);
+ if (!skb)
+ return NET_XMIT_DROP;
+ }
+
+ ret = lowpan_header(skb, ldev, &dgram_size, &dgram_offset);
+ if (ret < 0) {
+ kfree_skb(skb);
+ return NET_XMIT_DROP;
+ }
+
+ if (ieee802154_hdr_peek(skb, &wpan_hdr) < 0) {
+ kfree_skb(skb);
+ return NET_XMIT_DROP;
+ }
+
+ max_single = ieee802154_max_payload(&wpan_hdr);
+
+ if (skb_tail_pointer(skb) - skb_network_header(skb) <= max_single) {
+ skb->dev = lowpan_802154_dev(ldev)->wdev;
+ ldev->stats.tx_packets++;
+ ldev->stats.tx_bytes += dgram_size;
+ return dev_queue_xmit(skb);
+ } else {
+ netdev_tx_t rc;
+
+ pr_debug("frame is too big, fragmentation is needed\n");
+ rc = lowpan_xmit_fragmented(skb, ldev, &wpan_hdr, dgram_size,
+ dgram_offset);
+
+ return rc < 0 ? NET_XMIT_DROP : rc;
+ }
+}
diff --git a/net/ieee802154/Kconfig b/net/ieee802154/Kconfig
new file mode 100644
index 000000000000..bcb05ba97686
--- /dev/null
+++ b/net/ieee802154/Kconfig
@@ -0,0 +1,31 @@
+# SPDX-License-Identifier: GPL-2.0-only
+menuconfig IEEE802154
+ tristate "IEEE Std 802.15.4 Low-Rate Wireless Personal Area Networks support"
+ help
+ IEEE Std 802.15.4 defines a low data rate, low power and low
+ complexity short range wireless personal area networks. It was
+ designed to organise networks of sensors, switches, etc automation
+ devices. Maximum allowed data rate is 250 kb/s and typical personal
+ operating space around 10m.
+
+ Say Y here to compile LR-WPAN support into the kernel or say M to
+ compile it as modules.
+
+if IEEE802154
+
+config IEEE802154_NL802154_EXPERIMENTAL
+ bool "IEEE 802.15.4 experimental netlink support"
+ help
+ Adds experimental netlink support for nl802154.
+
+config IEEE802154_SOCKET
+ tristate "IEEE 802.15.4 socket interface"
+ default y
+ help
+ Socket interface for IEEE 802.15.4. Contains DGRAM sockets interface
+ for 802.15.4 dataframes. Also RAW socket interface to build MAC
+ header from userspace.
+
+source "net/ieee802154/6lowpan/Kconfig"
+
+endif
diff --git a/net/ieee802154/Makefile b/net/ieee802154/Makefile
new file mode 100644
index 000000000000..7bce67673e83
--- /dev/null
+++ b/net/ieee802154/Makefile
@@ -0,0 +1,10 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_IEEE802154) += ieee802154.o
+obj-$(CONFIG_IEEE802154_SOCKET) += ieee802154_socket.o
+obj-y += 6lowpan/
+
+ieee802154-y := netlink.o nl-mac.o nl-phy.o nl_policy.o core.o \
+ header_ops.o sysfs.o nl802154.o trace.o pan.o
+ieee802154_socket-y := socket.o
+
+CFLAGS_trace.o := -I$(src)
diff --git a/net/ieee802154/core.c b/net/ieee802154/core.c
new file mode 100644
index 000000000000..89b671b12600
--- /dev/null
+++ b/net/ieee802154/core.c
@@ -0,0 +1,415 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2007, 2008, 2009 Siemens AG
+ */
+
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/device.h>
+
+#include <net/cfg802154.h>
+#include <net/rtnetlink.h>
+
+#include "ieee802154.h"
+#include "nl802154.h"
+#include "sysfs.h"
+#include "core.h"
+
+/* name for sysfs, %d is appended */
+#define PHY_NAME "phy"
+
+/* RCU-protected (and RTNL for writers) */
+LIST_HEAD(cfg802154_rdev_list);
+int cfg802154_rdev_list_generation;
+
+struct wpan_phy *wpan_phy_find(const char *str)
+{
+ struct device *dev;
+
+ if (WARN_ON(!str))
+ return NULL;
+
+ dev = class_find_device_by_name(&wpan_phy_class, str);
+ if (!dev)
+ return NULL;
+
+ return container_of(dev, struct wpan_phy, dev);
+}
+EXPORT_SYMBOL(wpan_phy_find);
+
+struct wpan_phy_iter_data {
+ int (*fn)(struct wpan_phy *phy, void *data);
+ void *data;
+};
+
+static int wpan_phy_iter(struct device *dev, void *_data)
+{
+ struct wpan_phy_iter_data *wpid = _data;
+ struct wpan_phy *phy = container_of(dev, struct wpan_phy, dev);
+
+ return wpid->fn(phy, wpid->data);
+}
+
+int wpan_phy_for_each(int (*fn)(struct wpan_phy *phy, void *data),
+ void *data)
+{
+ struct wpan_phy_iter_data wpid = {
+ .fn = fn,
+ .data = data,
+ };
+
+ return class_for_each_device(&wpan_phy_class, NULL,
+ &wpid, wpan_phy_iter);
+}
+EXPORT_SYMBOL(wpan_phy_for_each);
+
+struct cfg802154_registered_device *
+cfg802154_rdev_by_wpan_phy_idx(int wpan_phy_idx)
+{
+ struct cfg802154_registered_device *result = NULL, *rdev;
+
+ ASSERT_RTNL();
+
+ list_for_each_entry(rdev, &cfg802154_rdev_list, list) {
+ if (rdev->wpan_phy_idx == wpan_phy_idx) {
+ result = rdev;
+ break;
+ }
+ }
+
+ return result;
+}
+
+struct wpan_phy *wpan_phy_idx_to_wpan_phy(int wpan_phy_idx)
+{
+ struct cfg802154_registered_device *rdev;
+
+ ASSERT_RTNL();
+
+ rdev = cfg802154_rdev_by_wpan_phy_idx(wpan_phy_idx);
+ if (!rdev)
+ return NULL;
+ return &rdev->wpan_phy;
+}
+
+struct wpan_phy *
+wpan_phy_new(const struct cfg802154_ops *ops, size_t priv_size)
+{
+ static atomic_t wpan_phy_counter = ATOMIC_INIT(0);
+ struct cfg802154_registered_device *rdev;
+ size_t alloc_size;
+
+ alloc_size = sizeof(*rdev) + priv_size;
+ rdev = kzalloc(alloc_size, GFP_KERNEL);
+ if (!rdev)
+ return NULL;
+
+ rdev->ops = ops;
+
+ rdev->wpan_phy_idx = atomic_inc_return(&wpan_phy_counter);
+
+ if (unlikely(rdev->wpan_phy_idx < 0)) {
+ /* ugh, wrapped! */
+ atomic_dec(&wpan_phy_counter);
+ kfree(rdev);
+ return NULL;
+ }
+
+ /* atomic_inc_return makes it start at 1, make it start at 0 */
+ rdev->wpan_phy_idx--;
+
+ INIT_LIST_HEAD(&rdev->wpan_dev_list);
+ device_initialize(&rdev->wpan_phy.dev);
+ dev_set_name(&rdev->wpan_phy.dev, PHY_NAME "%d", rdev->wpan_phy_idx);
+
+ rdev->wpan_phy.dev.class = &wpan_phy_class;
+ rdev->wpan_phy.dev.platform_data = rdev;
+
+ wpan_phy_net_set(&rdev->wpan_phy, &init_net);
+
+ init_waitqueue_head(&rdev->dev_wait);
+ init_waitqueue_head(&rdev->wpan_phy.sync_txq);
+
+ spin_lock_init(&rdev->wpan_phy.queue_lock);
+
+ return &rdev->wpan_phy;
+}
+EXPORT_SYMBOL(wpan_phy_new);
+
+int wpan_phy_register(struct wpan_phy *phy)
+{
+ struct cfg802154_registered_device *rdev = wpan_phy_to_rdev(phy);
+ int ret;
+
+ rtnl_lock();
+ ret = device_add(&phy->dev);
+ if (ret) {
+ rtnl_unlock();
+ return ret;
+ }
+
+ list_add_rcu(&rdev->list, &cfg802154_rdev_list);
+ cfg802154_rdev_list_generation++;
+
+ /* TODO phy registered lock */
+ rtnl_unlock();
+
+ /* TODO nl802154 phy notify */
+
+ return 0;
+}
+EXPORT_SYMBOL(wpan_phy_register);
+
+void wpan_phy_unregister(struct wpan_phy *phy)
+{
+ struct cfg802154_registered_device *rdev = wpan_phy_to_rdev(phy);
+
+ wait_event(rdev->dev_wait, ({
+ int __count;
+ rtnl_lock();
+ __count = rdev->opencount;
+ rtnl_unlock();
+ __count == 0; }));
+
+ rtnl_lock();
+ /* TODO nl802154 phy notify */
+ /* TODO phy registered lock */
+
+ WARN_ON(!list_empty(&rdev->wpan_dev_list));
+
+ /* First remove the hardware from everywhere, this makes
+ * it impossible to find from userspace.
+ */
+ list_del_rcu(&rdev->list);
+ synchronize_rcu();
+
+ cfg802154_rdev_list_generation++;
+
+ device_del(&phy->dev);
+
+ rtnl_unlock();
+}
+EXPORT_SYMBOL(wpan_phy_unregister);
+
+void wpan_phy_free(struct wpan_phy *phy)
+{
+ put_device(&phy->dev);
+}
+EXPORT_SYMBOL(wpan_phy_free);
+
+static void cfg802154_free_peer_structures(struct wpan_dev *wpan_dev)
+{
+ struct ieee802154_pan_device *child, *tmp;
+
+ mutex_lock(&wpan_dev->association_lock);
+
+ kfree(wpan_dev->parent);
+ wpan_dev->parent = NULL;
+
+ list_for_each_entry_safe(child, tmp, &wpan_dev->children, node) {
+ list_del(&child->node);
+ kfree(child);
+ }
+
+ wpan_dev->nchildren = 0;
+
+ mutex_unlock(&wpan_dev->association_lock);
+}
+
+int cfg802154_switch_netns(struct cfg802154_registered_device *rdev,
+ struct net *net)
+{
+ struct wpan_dev *wpan_dev;
+ int err = 0;
+
+ list_for_each_entry(wpan_dev, &rdev->wpan_dev_list, list) {
+ if (!wpan_dev->netdev)
+ continue;
+ wpan_dev->netdev->netns_immutable = false;
+ err = dev_change_net_namespace(wpan_dev->netdev, net, "wpan%d");
+ if (err)
+ break;
+ wpan_dev->netdev->netns_immutable = true;
+ }
+
+ if (err) {
+ /* failed -- clean up to old netns */
+ net = wpan_phy_net(&rdev->wpan_phy);
+
+ list_for_each_entry_continue_reverse(wpan_dev,
+ &rdev->wpan_dev_list,
+ list) {
+ if (!wpan_dev->netdev)
+ continue;
+ wpan_dev->netdev->netns_immutable = false;
+ err = dev_change_net_namespace(wpan_dev->netdev, net,
+ "wpan%d");
+ WARN_ON(err);
+ wpan_dev->netdev->netns_immutable = true;
+ }
+
+ return err;
+ }
+
+ wpan_phy_net_set(&rdev->wpan_phy, net);
+
+ err = device_rename(&rdev->wpan_phy.dev, dev_name(&rdev->wpan_phy.dev));
+ WARN_ON(err);
+
+ return 0;
+}
+
+void cfg802154_dev_free(struct cfg802154_registered_device *rdev)
+{
+ kfree(rdev);
+}
+
+static void
+cfg802154_update_iface_num(struct cfg802154_registered_device *rdev,
+ int iftype, int num)
+{
+ ASSERT_RTNL();
+
+ rdev->num_running_ifaces += num;
+}
+
+static int cfg802154_netdev_notifier_call(struct notifier_block *nb,
+ unsigned long state, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct wpan_dev *wpan_dev = dev->ieee802154_ptr;
+ struct cfg802154_registered_device *rdev;
+
+ if (!wpan_dev)
+ return NOTIFY_DONE;
+
+ rdev = wpan_phy_to_rdev(wpan_dev->wpan_phy);
+
+ /* TODO WARN_ON unspec type */
+
+ switch (state) {
+ /* TODO NETDEV_DEVTYPE */
+ case NETDEV_REGISTER:
+ dev->netns_immutable = true;
+ wpan_dev->identifier = ++rdev->wpan_dev_id;
+ list_add_rcu(&wpan_dev->list, &rdev->wpan_dev_list);
+ rdev->devlist_generation++;
+ mutex_init(&wpan_dev->association_lock);
+ INIT_LIST_HEAD(&wpan_dev->children);
+ wpan_dev->max_associations = SZ_16K;
+
+ wpan_dev->netdev = dev;
+ break;
+ case NETDEV_DOWN:
+ cfg802154_update_iface_num(rdev, wpan_dev->iftype, -1);
+
+ rdev->opencount--;
+ wake_up(&rdev->dev_wait);
+ break;
+ case NETDEV_UP:
+ cfg802154_update_iface_num(rdev, wpan_dev->iftype, 1);
+
+ rdev->opencount++;
+ break;
+ case NETDEV_UNREGISTER:
+ cfg802154_free_peer_structures(wpan_dev);
+
+ /* It is possible to get NETDEV_UNREGISTER
+ * multiple times. To detect that, check
+ * that the interface is still on the list
+ * of registered interfaces, and only then
+ * remove and clean it up.
+ */
+ if (!list_empty(&wpan_dev->list)) {
+ list_del_rcu(&wpan_dev->list);
+ rdev->devlist_generation++;
+ }
+ /* synchronize (so that we won't find this netdev
+ * from other code any more) and then clear the list
+ * head so that the above code can safely check for
+ * !list_empty() to avoid double-cleanup.
+ */
+ synchronize_rcu();
+ INIT_LIST_HEAD(&wpan_dev->list);
+ break;
+ default:
+ return NOTIFY_DONE;
+ }
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block cfg802154_netdev_notifier = {
+ .notifier_call = cfg802154_netdev_notifier_call,
+};
+
+static void __net_exit cfg802154_pernet_exit(struct net *net)
+{
+ struct cfg802154_registered_device *rdev;
+
+ rtnl_lock();
+ list_for_each_entry(rdev, &cfg802154_rdev_list, list) {
+ if (net_eq(wpan_phy_net(&rdev->wpan_phy), net))
+ WARN_ON(cfg802154_switch_netns(rdev, &init_net));
+ }
+ rtnl_unlock();
+}
+
+static struct pernet_operations cfg802154_pernet_ops = {
+ .exit = cfg802154_pernet_exit,
+};
+
+static int __init wpan_phy_class_init(void)
+{
+ int rc;
+
+ rc = register_pernet_device(&cfg802154_pernet_ops);
+ if (rc)
+ goto err;
+
+ rc = wpan_phy_sysfs_init();
+ if (rc)
+ goto err_sysfs;
+
+ rc = register_netdevice_notifier(&cfg802154_netdev_notifier);
+ if (rc)
+ goto err_nl;
+
+ rc = ieee802154_nl_init();
+ if (rc)
+ goto err_notifier;
+
+ rc = nl802154_init();
+ if (rc)
+ goto err_ieee802154_nl;
+
+ return 0;
+
+err_ieee802154_nl:
+ ieee802154_nl_exit();
+
+err_notifier:
+ unregister_netdevice_notifier(&cfg802154_netdev_notifier);
+err_nl:
+ wpan_phy_sysfs_exit();
+err_sysfs:
+ unregister_pernet_device(&cfg802154_pernet_ops);
+err:
+ return rc;
+}
+subsys_initcall(wpan_phy_class_init);
+
+static void __exit wpan_phy_class_exit(void)
+{
+ nl802154_exit();
+ ieee802154_nl_exit();
+ unregister_netdevice_notifier(&cfg802154_netdev_notifier);
+ wpan_phy_sysfs_exit();
+ unregister_pernet_device(&cfg802154_pernet_ops);
+}
+module_exit(wpan_phy_class_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("IEEE 802.15.4 configuration interface");
+MODULE_AUTHOR("Dmitry Eremin-Solenikov");
diff --git a/net/ieee802154/core.h b/net/ieee802154/core.h
new file mode 100644
index 000000000000..1c19f575d574
--- /dev/null
+++ b/net/ieee802154/core.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __IEEE802154_CORE_H
+#define __IEEE802154_CORE_H
+
+#include <net/cfg802154.h>
+
+struct cfg802154_registered_device {
+ const struct cfg802154_ops *ops;
+ struct list_head list;
+
+ /* wpan_phy index, internal only */
+ int wpan_phy_idx;
+
+ /* also protected by devlist_mtx */
+ int opencount;
+ wait_queue_head_t dev_wait;
+
+ /* protected by RTNL only */
+ int num_running_ifaces;
+
+ /* associated wpan interfaces, protected by rtnl or RCU */
+ struct list_head wpan_dev_list;
+ int devlist_generation, wpan_dev_id;
+
+ /* must be last because of the way we do wpan_phy_priv(),
+ * and it should at least be aligned to NETDEV_ALIGN
+ */
+ struct wpan_phy wpan_phy __aligned(NETDEV_ALIGN);
+};
+
+static inline struct cfg802154_registered_device *
+wpan_phy_to_rdev(struct wpan_phy *wpan_phy)
+{
+ BUG_ON(!wpan_phy);
+ return container_of(wpan_phy, struct cfg802154_registered_device,
+ wpan_phy);
+}
+
+extern struct list_head cfg802154_rdev_list;
+extern int cfg802154_rdev_list_generation;
+
+int cfg802154_switch_netns(struct cfg802154_registered_device *rdev,
+ struct net *net);
+/* free object */
+void cfg802154_dev_free(struct cfg802154_registered_device *rdev);
+struct cfg802154_registered_device *
+cfg802154_rdev_by_wpan_phy_idx(int wpan_phy_idx);
+struct wpan_phy *wpan_phy_idx_to_wpan_phy(int wpan_phy_idx);
+
+#endif /* __IEEE802154_CORE_H */
diff --git a/net/ieee802154/header_ops.c b/net/ieee802154/header_ops.c
new file mode 100644
index 000000000000..41a556be1017
--- /dev/null
+++ b/net/ieee802154/header_ops.c
@@ -0,0 +1,378 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2014 Fraunhofer ITWM
+ *
+ * Written by:
+ * Phoebe Buckheister <phoebe.buckheister@itwm.fraunhofer.de>
+ */
+
+#include <linux/ieee802154.h>
+
+#include <net/mac802154.h>
+#include <net/ieee802154_netdev.h>
+
+static int
+ieee802154_hdr_push_addr(u8 *buf, const struct ieee802154_addr *addr,
+ bool omit_pan)
+{
+ int pos = 0;
+
+ if (addr->mode == IEEE802154_ADDR_NONE)
+ return 0;
+
+ if (!omit_pan) {
+ memcpy(buf + pos, &addr->pan_id, 2);
+ pos += 2;
+ }
+
+ switch (addr->mode) {
+ case IEEE802154_ADDR_SHORT:
+ memcpy(buf + pos, &addr->short_addr, 2);
+ pos += 2;
+ break;
+
+ case IEEE802154_ADDR_LONG:
+ memcpy(buf + pos, &addr->extended_addr, IEEE802154_ADDR_LEN);
+ pos += IEEE802154_ADDR_LEN;
+ break;
+
+ default:
+ return -EINVAL;
+ }
+
+ return pos;
+}
+
+static int
+ieee802154_hdr_push_sechdr(u8 *buf, const struct ieee802154_sechdr *hdr)
+{
+ int pos = 5;
+
+ memcpy(buf, hdr, 1);
+ memcpy(buf + 1, &hdr->frame_counter, 4);
+
+ switch (hdr->key_id_mode) {
+ case IEEE802154_SCF_KEY_IMPLICIT:
+ return pos;
+
+ case IEEE802154_SCF_KEY_INDEX:
+ break;
+
+ case IEEE802154_SCF_KEY_SHORT_INDEX:
+ memcpy(buf + pos, &hdr->short_src, 4);
+ pos += 4;
+ break;
+
+ case IEEE802154_SCF_KEY_HW_INDEX:
+ memcpy(buf + pos, &hdr->extended_src, IEEE802154_ADDR_LEN);
+ pos += IEEE802154_ADDR_LEN;
+ break;
+ }
+
+ buf[pos++] = hdr->key_id;
+
+ return pos;
+}
+
+int
+ieee802154_hdr_push(struct sk_buff *skb, struct ieee802154_hdr *hdr)
+{
+ u8 buf[IEEE802154_MAX_HEADER_LEN];
+ int pos = 2;
+ int rc;
+ struct ieee802154_hdr_fc *fc = &hdr->fc;
+
+ buf[pos++] = hdr->seq;
+
+ fc->dest_addr_mode = hdr->dest.mode;
+
+ rc = ieee802154_hdr_push_addr(buf + pos, &hdr->dest, false);
+ if (rc < 0)
+ return -EINVAL;
+ pos += rc;
+
+ fc->source_addr_mode = hdr->source.mode;
+
+ if (hdr->source.pan_id == hdr->dest.pan_id &&
+ hdr->dest.mode != IEEE802154_ADDR_NONE)
+ fc->intra_pan = true;
+
+ rc = ieee802154_hdr_push_addr(buf + pos, &hdr->source, fc->intra_pan);
+ if (rc < 0)
+ return -EINVAL;
+ pos += rc;
+
+ if (fc->security_enabled) {
+ fc->version = 1;
+
+ rc = ieee802154_hdr_push_sechdr(buf + pos, &hdr->sec);
+ if (rc < 0)
+ return -EINVAL;
+
+ pos += rc;
+ }
+
+ memcpy(buf, fc, 2);
+
+ memcpy(skb_push(skb, pos), buf, pos);
+
+ return pos;
+}
+EXPORT_SYMBOL_GPL(ieee802154_hdr_push);
+
+int ieee802154_mac_cmd_push(struct sk_buff *skb, void *f,
+ const void *pl, unsigned int pl_len)
+{
+ struct ieee802154_mac_cmd_frame *frame = f;
+ struct ieee802154_mac_cmd_pl *mac_pl = &frame->mac_pl;
+ struct ieee802154_hdr *mhr = &frame->mhr;
+ int ret;
+
+ skb_reserve(skb, sizeof(*mhr));
+ ret = ieee802154_hdr_push(skb, mhr);
+ if (ret < 0)
+ return ret;
+
+ skb_reset_mac_header(skb);
+ skb->mac_len = ret;
+
+ skb_put_data(skb, mac_pl, sizeof(*mac_pl));
+ skb_put_data(skb, pl, pl_len);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ieee802154_mac_cmd_push);
+
+int ieee802154_beacon_push(struct sk_buff *skb,
+ struct ieee802154_beacon_frame *beacon)
+{
+ struct ieee802154_beacon_hdr *mac_pl = &beacon->mac_pl;
+ struct ieee802154_hdr *mhr = &beacon->mhr;
+ int ret;
+
+ skb_reserve(skb, sizeof(*mhr));
+ ret = ieee802154_hdr_push(skb, mhr);
+ if (ret < 0)
+ return ret;
+
+ skb_reset_mac_header(skb);
+ skb->mac_len = ret;
+
+ skb_put_data(skb, mac_pl, sizeof(*mac_pl));
+
+ if (mac_pl->pend_short_addr_count || mac_pl->pend_ext_addr_count)
+ return -EOPNOTSUPP;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ieee802154_beacon_push);
+
+static int
+ieee802154_hdr_get_addr(const u8 *buf, int mode, bool omit_pan,
+ struct ieee802154_addr *addr)
+{
+ int pos = 0;
+
+ addr->mode = mode;
+
+ if (mode == IEEE802154_ADDR_NONE)
+ return 0;
+
+ if (!omit_pan) {
+ memcpy(&addr->pan_id, buf + pos, 2);
+ pos += 2;
+ }
+
+ if (mode == IEEE802154_ADDR_SHORT) {
+ memcpy(&addr->short_addr, buf + pos, 2);
+ return pos + 2;
+ } else {
+ memcpy(&addr->extended_addr, buf + pos, IEEE802154_ADDR_LEN);
+ return pos + IEEE802154_ADDR_LEN;
+ }
+}
+
+static int ieee802154_hdr_addr_len(int mode, bool omit_pan)
+{
+ int pan_len = omit_pan ? 0 : 2;
+
+ switch (mode) {
+ case IEEE802154_ADDR_NONE: return 0;
+ case IEEE802154_ADDR_SHORT: return 2 + pan_len;
+ case IEEE802154_ADDR_LONG: return IEEE802154_ADDR_LEN + pan_len;
+ default: return -EINVAL;
+ }
+}
+
+static int
+ieee802154_hdr_get_sechdr(const u8 *buf, struct ieee802154_sechdr *hdr)
+{
+ int pos = 5;
+
+ memcpy(hdr, buf, 1);
+ memcpy(&hdr->frame_counter, buf + 1, 4);
+
+ switch (hdr->key_id_mode) {
+ case IEEE802154_SCF_KEY_IMPLICIT:
+ return pos;
+
+ case IEEE802154_SCF_KEY_INDEX:
+ break;
+
+ case IEEE802154_SCF_KEY_SHORT_INDEX:
+ memcpy(&hdr->short_src, buf + pos, 4);
+ pos += 4;
+ break;
+
+ case IEEE802154_SCF_KEY_HW_INDEX:
+ memcpy(&hdr->extended_src, buf + pos, IEEE802154_ADDR_LEN);
+ pos += IEEE802154_ADDR_LEN;
+ break;
+ }
+
+ hdr->key_id = buf[pos++];
+
+ return pos;
+}
+
+static int ieee802154_sechdr_lengths[4] = {
+ [IEEE802154_SCF_KEY_IMPLICIT] = 5,
+ [IEEE802154_SCF_KEY_INDEX] = 6,
+ [IEEE802154_SCF_KEY_SHORT_INDEX] = 10,
+ [IEEE802154_SCF_KEY_HW_INDEX] = 14,
+};
+
+static int ieee802154_hdr_sechdr_len(u8 sc)
+{
+ return ieee802154_sechdr_lengths[IEEE802154_SCF_KEY_ID_MODE(sc)];
+}
+
+static int ieee802154_hdr_minlen(const struct ieee802154_hdr *hdr)
+{
+ int dlen, slen;
+
+ dlen = ieee802154_hdr_addr_len(hdr->fc.dest_addr_mode, false);
+ slen = ieee802154_hdr_addr_len(hdr->fc.source_addr_mode,
+ hdr->fc.intra_pan);
+
+ if (slen < 0 || dlen < 0)
+ return -EINVAL;
+
+ return 3 + dlen + slen + hdr->fc.security_enabled;
+}
+
+static int
+ieee802154_hdr_get_addrs(const u8 *buf, struct ieee802154_hdr *hdr)
+{
+ int pos = 0;
+
+ pos += ieee802154_hdr_get_addr(buf + pos, hdr->fc.dest_addr_mode,
+ false, &hdr->dest);
+ pos += ieee802154_hdr_get_addr(buf + pos, hdr->fc.source_addr_mode,
+ hdr->fc.intra_pan, &hdr->source);
+
+ if (hdr->fc.intra_pan)
+ hdr->source.pan_id = hdr->dest.pan_id;
+
+ return pos;
+}
+
+int
+ieee802154_hdr_pull(struct sk_buff *skb, struct ieee802154_hdr *hdr)
+{
+ int pos = 3, rc;
+
+ if (!pskb_may_pull(skb, 3))
+ return -EINVAL;
+
+ memcpy(hdr, skb->data, 3);
+
+ rc = ieee802154_hdr_minlen(hdr);
+ if (rc < 0 || !pskb_may_pull(skb, rc))
+ return -EINVAL;
+
+ pos += ieee802154_hdr_get_addrs(skb->data + pos, hdr);
+
+ if (hdr->fc.security_enabled) {
+ int want = pos + ieee802154_hdr_sechdr_len(skb->data[pos]);
+
+ if (!pskb_may_pull(skb, want))
+ return -EINVAL;
+
+ pos += ieee802154_hdr_get_sechdr(skb->data + pos, &hdr->sec);
+ }
+
+ skb_pull(skb, pos);
+ return pos;
+}
+EXPORT_SYMBOL_GPL(ieee802154_hdr_pull);
+
+int ieee802154_mac_cmd_pl_pull(struct sk_buff *skb,
+ struct ieee802154_mac_cmd_pl *mac_pl)
+{
+ if (!pskb_may_pull(skb, sizeof(*mac_pl)))
+ return -EINVAL;
+
+ memcpy(mac_pl, skb->data, sizeof(*mac_pl));
+ skb_pull(skb, sizeof(*mac_pl));
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ieee802154_mac_cmd_pl_pull);
+
+int
+ieee802154_hdr_peek_addrs(const struct sk_buff *skb, struct ieee802154_hdr *hdr)
+{
+ const u8 *buf = skb_mac_header(skb);
+ int pos = 3, rc;
+
+ if (buf + 3 > skb_tail_pointer(skb))
+ return -EINVAL;
+
+ memcpy(hdr, buf, 3);
+
+ rc = ieee802154_hdr_minlen(hdr);
+ if (rc < 0 || buf + rc > skb_tail_pointer(skb))
+ return -EINVAL;
+
+ pos += ieee802154_hdr_get_addrs(buf + pos, hdr);
+ return pos;
+}
+EXPORT_SYMBOL_GPL(ieee802154_hdr_peek_addrs);
+
+int
+ieee802154_hdr_peek(const struct sk_buff *skb, struct ieee802154_hdr *hdr)
+{
+ const u8 *buf = skb_mac_header(skb);
+ int pos;
+
+ pos = ieee802154_hdr_peek_addrs(skb, hdr);
+ if (pos < 0)
+ return -EINVAL;
+
+ if (hdr->fc.security_enabled) {
+ u8 key_id_mode = IEEE802154_SCF_KEY_ID_MODE(*(buf + pos));
+ int want = pos + ieee802154_sechdr_lengths[key_id_mode];
+
+ if (buf + want > skb_tail_pointer(skb))
+ return -EINVAL;
+
+ pos += ieee802154_hdr_get_sechdr(buf + pos, &hdr->sec);
+ }
+
+ return pos;
+}
+EXPORT_SYMBOL_GPL(ieee802154_hdr_peek);
+
+int ieee802154_max_payload(const struct ieee802154_hdr *hdr)
+{
+ int hlen = ieee802154_hdr_minlen(hdr);
+
+ if (hdr->fc.security_enabled) {
+ hlen += ieee802154_sechdr_lengths[hdr->sec.key_id_mode] - 1;
+ hlen += ieee802154_sechdr_authtag_len(&hdr->sec);
+ }
+
+ return IEEE802154_MTU - hlen - IEEE802154_MFR_SIZE;
+}
+EXPORT_SYMBOL_GPL(ieee802154_max_payload);
diff --git a/net/ieee802154/ieee802154.h b/net/ieee802154/ieee802154.h
new file mode 100644
index 000000000000..c5d91f78301a
--- /dev/null
+++ b/net/ieee802154/ieee802154.h
@@ -0,0 +1,75 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2007, 2008, 2009 Siemens AG
+ */
+#ifndef IEEE_802154_LOCAL_H
+#define IEEE_802154_LOCAL_H
+
+int __init ieee802154_nl_init(void);
+void ieee802154_nl_exit(void);
+
+#define IEEE802154_OP(_cmd, _func) \
+ { \
+ .cmd = _cmd, \
+ .doit = _func, \
+ .dumpit = NULL, \
+ .flags = GENL_ADMIN_PERM, \
+ }
+
+#define IEEE802154_DUMP(_cmd, _func, _dump) \
+ { \
+ .cmd = _cmd, \
+ .doit = _func, \
+ .dumpit = _dump, \
+ }
+
+struct genl_info;
+
+struct sk_buff *ieee802154_nl_create(int flags, u8 req);
+int ieee802154_nl_mcast(struct sk_buff *msg, unsigned int group);
+struct sk_buff *ieee802154_nl_new_reply(struct genl_info *info,
+ int flags, u8 req);
+int ieee802154_nl_reply(struct sk_buff *msg, struct genl_info *info);
+
+extern struct genl_family nl802154_family;
+
+/* genetlink ops/groups */
+int ieee802154_list_phy(struct sk_buff *skb, struct genl_info *info);
+int ieee802154_dump_phy(struct sk_buff *skb, struct netlink_callback *cb);
+int ieee802154_add_iface(struct sk_buff *skb, struct genl_info *info);
+int ieee802154_del_iface(struct sk_buff *skb, struct genl_info *info);
+
+enum ieee802154_mcgrp_ids {
+ IEEE802154_COORD_MCGRP,
+ IEEE802154_BEACON_MCGRP,
+};
+
+int ieee802154_associate_req(struct sk_buff *skb, struct genl_info *info);
+int ieee802154_associate_resp(struct sk_buff *skb, struct genl_info *info);
+int ieee802154_disassociate_req(struct sk_buff *skb, struct genl_info *info);
+int ieee802154_scan_req(struct sk_buff *skb, struct genl_info *info);
+int ieee802154_start_req(struct sk_buff *skb, struct genl_info *info);
+int ieee802154_list_iface(struct sk_buff *skb, struct genl_info *info);
+int ieee802154_dump_iface(struct sk_buff *skb, struct netlink_callback *cb);
+int ieee802154_set_macparams(struct sk_buff *skb, struct genl_info *info);
+
+int ieee802154_llsec_getparams(struct sk_buff *skb, struct genl_info *info);
+int ieee802154_llsec_setparams(struct sk_buff *skb, struct genl_info *info);
+int ieee802154_llsec_add_key(struct sk_buff *skb, struct genl_info *info);
+int ieee802154_llsec_del_key(struct sk_buff *skb, struct genl_info *info);
+int ieee802154_llsec_dump_keys(struct sk_buff *skb,
+ struct netlink_callback *cb);
+int ieee802154_llsec_add_dev(struct sk_buff *skb, struct genl_info *info);
+int ieee802154_llsec_del_dev(struct sk_buff *skb, struct genl_info *info);
+int ieee802154_llsec_dump_devs(struct sk_buff *skb,
+ struct netlink_callback *cb);
+int ieee802154_llsec_add_devkey(struct sk_buff *skb, struct genl_info *info);
+int ieee802154_llsec_del_devkey(struct sk_buff *skb, struct genl_info *info);
+int ieee802154_llsec_dump_devkeys(struct sk_buff *skb,
+ struct netlink_callback *cb);
+int ieee802154_llsec_add_seclevel(struct sk_buff *skb, struct genl_info *info);
+int ieee802154_llsec_del_seclevel(struct sk_buff *skb, struct genl_info *info);
+int ieee802154_llsec_dump_seclevels(struct sk_buff *skb,
+ struct netlink_callback *cb);
+
+#endif
diff --git a/net/ieee802154/netlink.c b/net/ieee802154/netlink.c
new file mode 100644
index 000000000000..7d2de4ee6992
--- /dev/null
+++ b/net/ieee802154/netlink.c
@@ -0,0 +1,148 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Netlink interface for IEEE 802.15.4 stack
+ *
+ * Copyright 2007, 2008 Siemens AG
+ *
+ * Written by:
+ * Sergey Lapin <slapin@ossfans.org>
+ * Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
+ * Maxim Osipov <maxim.osipov@siemens.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/gfp.h>
+#include <net/genetlink.h>
+#include <linux/nl802154.h>
+
+#include "ieee802154.h"
+
+static unsigned int ieee802154_seq_num;
+static DEFINE_SPINLOCK(ieee802154_seq_lock);
+
+/* Requests to userspace */
+struct sk_buff *ieee802154_nl_create(int flags, u8 req)
+{
+ void *hdr;
+ struct sk_buff *msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
+ unsigned long f;
+
+ if (!msg)
+ return NULL;
+
+ spin_lock_irqsave(&ieee802154_seq_lock, f);
+ hdr = genlmsg_put(msg, 0, ieee802154_seq_num++,
+ &nl802154_family, flags, req);
+ spin_unlock_irqrestore(&ieee802154_seq_lock, f);
+ if (!hdr) {
+ nlmsg_free(msg);
+ return NULL;
+ }
+
+ return msg;
+}
+
+int ieee802154_nl_mcast(struct sk_buff *msg, unsigned int group)
+{
+ struct nlmsghdr *nlh = nlmsg_hdr(msg);
+ void *hdr = genlmsg_data(nlmsg_data(nlh));
+
+ genlmsg_end(msg, hdr);
+
+ return genlmsg_multicast(&nl802154_family, msg, 0, group, GFP_ATOMIC);
+}
+
+struct sk_buff *ieee802154_nl_new_reply(struct genl_info *info,
+ int flags, u8 req)
+{
+ void *hdr;
+ struct sk_buff *msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
+
+ if (!msg)
+ return NULL;
+
+ hdr = genlmsg_put_reply(msg, info,
+ &nl802154_family, flags, req);
+ if (!hdr) {
+ nlmsg_free(msg);
+ return NULL;
+ }
+
+ return msg;
+}
+
+int ieee802154_nl_reply(struct sk_buff *msg, struct genl_info *info)
+{
+ struct nlmsghdr *nlh = nlmsg_hdr(msg);
+ void *hdr = genlmsg_data(nlmsg_data(nlh));
+
+ genlmsg_end(msg, hdr);
+
+ return genlmsg_reply(msg, info);
+}
+
+static const struct genl_small_ops ieee802154_ops[] = {
+ /* see nl-phy.c */
+ IEEE802154_DUMP(IEEE802154_LIST_PHY, ieee802154_list_phy,
+ ieee802154_dump_phy),
+ IEEE802154_OP(IEEE802154_ADD_IFACE, ieee802154_add_iface),
+ IEEE802154_OP(IEEE802154_DEL_IFACE, ieee802154_del_iface),
+ /* see nl-mac.c */
+ IEEE802154_OP(IEEE802154_ASSOCIATE_REQ, ieee802154_associate_req),
+ IEEE802154_OP(IEEE802154_ASSOCIATE_RESP, ieee802154_associate_resp),
+ IEEE802154_OP(IEEE802154_DISASSOCIATE_REQ, ieee802154_disassociate_req),
+ IEEE802154_OP(IEEE802154_SCAN_REQ, ieee802154_scan_req),
+ IEEE802154_OP(IEEE802154_START_REQ, ieee802154_start_req),
+ IEEE802154_DUMP(IEEE802154_LIST_IFACE, ieee802154_list_iface,
+ ieee802154_dump_iface),
+ IEEE802154_OP(IEEE802154_SET_MACPARAMS, ieee802154_set_macparams),
+ IEEE802154_OP(IEEE802154_LLSEC_GETPARAMS, ieee802154_llsec_getparams),
+ IEEE802154_OP(IEEE802154_LLSEC_SETPARAMS, ieee802154_llsec_setparams),
+ IEEE802154_DUMP(IEEE802154_LLSEC_LIST_KEY, NULL,
+ ieee802154_llsec_dump_keys),
+ IEEE802154_OP(IEEE802154_LLSEC_ADD_KEY, ieee802154_llsec_add_key),
+ IEEE802154_OP(IEEE802154_LLSEC_DEL_KEY, ieee802154_llsec_del_key),
+ IEEE802154_DUMP(IEEE802154_LLSEC_LIST_DEV, NULL,
+ ieee802154_llsec_dump_devs),
+ IEEE802154_OP(IEEE802154_LLSEC_ADD_DEV, ieee802154_llsec_add_dev),
+ IEEE802154_OP(IEEE802154_LLSEC_DEL_DEV, ieee802154_llsec_del_dev),
+ IEEE802154_DUMP(IEEE802154_LLSEC_LIST_DEVKEY, NULL,
+ ieee802154_llsec_dump_devkeys),
+ IEEE802154_OP(IEEE802154_LLSEC_ADD_DEVKEY, ieee802154_llsec_add_devkey),
+ IEEE802154_OP(IEEE802154_LLSEC_DEL_DEVKEY, ieee802154_llsec_del_devkey),
+ IEEE802154_DUMP(IEEE802154_LLSEC_LIST_SECLEVEL, NULL,
+ ieee802154_llsec_dump_seclevels),
+ IEEE802154_OP(IEEE802154_LLSEC_ADD_SECLEVEL,
+ ieee802154_llsec_add_seclevel),
+ IEEE802154_OP(IEEE802154_LLSEC_DEL_SECLEVEL,
+ ieee802154_llsec_del_seclevel),
+};
+
+static const struct genl_multicast_group ieee802154_mcgrps[] = {
+ [IEEE802154_COORD_MCGRP] = { .name = IEEE802154_MCAST_COORD_NAME, },
+ [IEEE802154_BEACON_MCGRP] = { .name = IEEE802154_MCAST_BEACON_NAME, },
+};
+
+struct genl_family nl802154_family __ro_after_init = {
+ .hdrsize = 0,
+ .name = IEEE802154_NL_NAME,
+ .version = 1,
+ .maxattr = IEEE802154_ATTR_MAX,
+ .policy = ieee802154_policy,
+ .module = THIS_MODULE,
+ .small_ops = ieee802154_ops,
+ .n_small_ops = ARRAY_SIZE(ieee802154_ops),
+ .resv_start_op = IEEE802154_LLSEC_DEL_SECLEVEL + 1,
+ .mcgrps = ieee802154_mcgrps,
+ .n_mcgrps = ARRAY_SIZE(ieee802154_mcgrps),
+};
+
+int __init ieee802154_nl_init(void)
+{
+ return genl_register_family(&nl802154_family);
+}
+
+void ieee802154_nl_exit(void)
+{
+ genl_unregister_family(&nl802154_family);
+}
diff --git a/net/ieee802154/nl-mac.c b/net/ieee802154/nl-mac.c
new file mode 100644
index 000000000000..74ef0a310afb
--- /dev/null
+++ b/net/ieee802154/nl-mac.c
@@ -0,0 +1,1335 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Netlink interface for IEEE 802.15.4 stack
+ *
+ * Copyright 2007, 2008 Siemens AG
+ *
+ * Written by:
+ * Sergey Lapin <slapin@ossfans.org>
+ * Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
+ * Maxim Osipov <maxim.osipov@siemens.com>
+ */
+
+#include <linux/gfp.h>
+#include <linux/kernel.h>
+#include <linux/if_arp.h>
+#include <linux/netdevice.h>
+#include <linux/ieee802154.h>
+#include <net/netlink.h>
+#include <net/genetlink.h>
+#include <net/sock.h>
+#include <linux/nl802154.h>
+#include <linux/export.h>
+#include <net/af_ieee802154.h>
+#include <net/ieee802154_netdev.h>
+#include <net/cfg802154.h>
+
+#include "ieee802154.h"
+
+static int nla_put_hwaddr(struct sk_buff *msg, int type, __le64 hwaddr,
+ int padattr)
+{
+ return nla_put_u64_64bit(msg, type, swab64((__force u64)hwaddr),
+ padattr);
+}
+
+static __le64 nla_get_hwaddr(const struct nlattr *nla)
+{
+ return ieee802154_devaddr_from_raw(nla_data(nla));
+}
+
+static int nla_put_shortaddr(struct sk_buff *msg, int type, __le16 addr)
+{
+ return nla_put_u16(msg, type, le16_to_cpu(addr));
+}
+
+static __le16 nla_get_shortaddr(const struct nlattr *nla)
+{
+ return cpu_to_le16(nla_get_u16(nla));
+}
+
+static int ieee802154_nl_start_confirm(struct net_device *dev, u8 status)
+{
+ struct sk_buff *msg;
+
+ pr_debug("%s\n", __func__);
+
+ msg = ieee802154_nl_create(0, IEEE802154_START_CONF);
+ if (!msg)
+ return -ENOBUFS;
+
+ if (nla_put_string(msg, IEEE802154_ATTR_DEV_NAME, dev->name) ||
+ nla_put_u32(msg, IEEE802154_ATTR_DEV_INDEX, dev->ifindex) ||
+ nla_put(msg, IEEE802154_ATTR_HW_ADDR, IEEE802154_ADDR_LEN,
+ dev->dev_addr) ||
+ nla_put_u8(msg, IEEE802154_ATTR_STATUS, status))
+ goto nla_put_failure;
+ return ieee802154_nl_mcast(msg, IEEE802154_COORD_MCGRP);
+
+nla_put_failure:
+ nlmsg_free(msg);
+ return -ENOBUFS;
+}
+
+static int ieee802154_nl_fill_iface(struct sk_buff *msg, u32 portid,
+ u32 seq, int flags, struct net_device *dev)
+{
+ void *hdr;
+ struct wpan_phy *phy;
+ struct ieee802154_mlme_ops *ops;
+ __le16 short_addr, pan_id;
+
+ pr_debug("%s\n", __func__);
+
+ hdr = genlmsg_put(msg, 0, seq, &nl802154_family, flags,
+ IEEE802154_LIST_IFACE);
+ if (!hdr)
+ goto out;
+
+ ops = ieee802154_mlme_ops(dev);
+ phy = dev->ieee802154_ptr->wpan_phy;
+ BUG_ON(!phy);
+ get_device(&phy->dev);
+
+ rtnl_lock();
+ short_addr = dev->ieee802154_ptr->short_addr;
+ pan_id = dev->ieee802154_ptr->pan_id;
+ rtnl_unlock();
+
+ if (nla_put_string(msg, IEEE802154_ATTR_DEV_NAME, dev->name) ||
+ nla_put_string(msg, IEEE802154_ATTR_PHY_NAME, wpan_phy_name(phy)) ||
+ nla_put_u32(msg, IEEE802154_ATTR_DEV_INDEX, dev->ifindex) ||
+ nla_put(msg, IEEE802154_ATTR_HW_ADDR, IEEE802154_ADDR_LEN,
+ dev->dev_addr) ||
+ nla_put_shortaddr(msg, IEEE802154_ATTR_SHORT_ADDR, short_addr) ||
+ nla_put_shortaddr(msg, IEEE802154_ATTR_PAN_ID, pan_id))
+ goto nla_put_failure;
+
+ if (ops->get_mac_params) {
+ struct ieee802154_mac_params params;
+
+ rtnl_lock();
+ ops->get_mac_params(dev, &params);
+ rtnl_unlock();
+
+ if (nla_put_s8(msg, IEEE802154_ATTR_TXPOWER,
+ params.transmit_power / 100) ||
+ nla_put_u8(msg, IEEE802154_ATTR_LBT_ENABLED, params.lbt) ||
+ nla_put_u8(msg, IEEE802154_ATTR_CCA_MODE,
+ params.cca.mode) ||
+ nla_put_s32(msg, IEEE802154_ATTR_CCA_ED_LEVEL,
+ params.cca_ed_level / 100) ||
+ nla_put_u8(msg, IEEE802154_ATTR_CSMA_RETRIES,
+ params.csma_retries) ||
+ nla_put_u8(msg, IEEE802154_ATTR_CSMA_MIN_BE,
+ params.min_be) ||
+ nla_put_u8(msg, IEEE802154_ATTR_CSMA_MAX_BE,
+ params.max_be) ||
+ nla_put_s8(msg, IEEE802154_ATTR_FRAME_RETRIES,
+ params.frame_retries))
+ goto nla_put_failure;
+ }
+
+ wpan_phy_put(phy);
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ wpan_phy_put(phy);
+ genlmsg_cancel(msg, hdr);
+out:
+ return -EMSGSIZE;
+}
+
+/* Requests from userspace */
+static struct net_device *ieee802154_nl_get_dev(struct genl_info *info)
+{
+ struct net_device *dev;
+
+ if (info->attrs[IEEE802154_ATTR_DEV_NAME]) {
+ char name[IFNAMSIZ + 1];
+
+ nla_strscpy(name, info->attrs[IEEE802154_ATTR_DEV_NAME],
+ sizeof(name));
+ dev = dev_get_by_name(&init_net, name);
+ } else if (info->attrs[IEEE802154_ATTR_DEV_INDEX]) {
+ dev = dev_get_by_index(&init_net,
+ nla_get_u32(info->attrs[IEEE802154_ATTR_DEV_INDEX]));
+ } else {
+ return NULL;
+ }
+
+ if (!dev)
+ return NULL;
+
+ if (dev->type != ARPHRD_IEEE802154) {
+ dev_put(dev);
+ return NULL;
+ }
+
+ return dev;
+}
+
+int ieee802154_associate_req(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net_device *dev;
+ struct ieee802154_addr addr;
+ u8 page;
+ int ret = -EOPNOTSUPP;
+
+ if (!info->attrs[IEEE802154_ATTR_CHANNEL] ||
+ !info->attrs[IEEE802154_ATTR_COORD_PAN_ID] ||
+ (!info->attrs[IEEE802154_ATTR_COORD_HW_ADDR] &&
+ !info->attrs[IEEE802154_ATTR_COORD_SHORT_ADDR]) ||
+ !info->attrs[IEEE802154_ATTR_CAPABILITY])
+ return -EINVAL;
+
+ dev = ieee802154_nl_get_dev(info);
+ if (!dev)
+ return -ENODEV;
+ if (!ieee802154_mlme_ops(dev)->assoc_req)
+ goto out;
+
+ if (info->attrs[IEEE802154_ATTR_COORD_HW_ADDR]) {
+ addr.mode = IEEE802154_ADDR_LONG;
+ addr.extended_addr = nla_get_hwaddr(
+ info->attrs[IEEE802154_ATTR_COORD_HW_ADDR]);
+ } else {
+ addr.mode = IEEE802154_ADDR_SHORT;
+ addr.short_addr = nla_get_shortaddr(
+ info->attrs[IEEE802154_ATTR_COORD_SHORT_ADDR]);
+ }
+ addr.pan_id = nla_get_shortaddr(
+ info->attrs[IEEE802154_ATTR_COORD_PAN_ID]);
+
+ page = nla_get_u8_default(info->attrs[IEEE802154_ATTR_PAGE], 0);
+
+ ret = ieee802154_mlme_ops(dev)->assoc_req(dev, &addr,
+ nla_get_u8(info->attrs[IEEE802154_ATTR_CHANNEL]),
+ page,
+ nla_get_u8(info->attrs[IEEE802154_ATTR_CAPABILITY]));
+
+out:
+ dev_put(dev);
+ return ret;
+}
+
+int ieee802154_associate_resp(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net_device *dev;
+ struct ieee802154_addr addr;
+ int ret = -EOPNOTSUPP;
+
+ if (!info->attrs[IEEE802154_ATTR_STATUS] ||
+ !info->attrs[IEEE802154_ATTR_DEST_HW_ADDR] ||
+ !info->attrs[IEEE802154_ATTR_DEST_SHORT_ADDR])
+ return -EINVAL;
+
+ dev = ieee802154_nl_get_dev(info);
+ if (!dev)
+ return -ENODEV;
+ if (!ieee802154_mlme_ops(dev)->assoc_resp)
+ goto out;
+
+ addr.mode = IEEE802154_ADDR_LONG;
+ addr.extended_addr = nla_get_hwaddr(
+ info->attrs[IEEE802154_ATTR_DEST_HW_ADDR]);
+ rtnl_lock();
+ addr.pan_id = dev->ieee802154_ptr->pan_id;
+ rtnl_unlock();
+
+ ret = ieee802154_mlme_ops(dev)->assoc_resp(dev, &addr,
+ nla_get_shortaddr(info->attrs[IEEE802154_ATTR_DEST_SHORT_ADDR]),
+ nla_get_u8(info->attrs[IEEE802154_ATTR_STATUS]));
+
+out:
+ dev_put(dev);
+ return ret;
+}
+
+int ieee802154_disassociate_req(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net_device *dev;
+ struct ieee802154_addr addr;
+ int ret = -EOPNOTSUPP;
+
+ if ((!info->attrs[IEEE802154_ATTR_DEST_HW_ADDR] &&
+ !info->attrs[IEEE802154_ATTR_DEST_SHORT_ADDR]) ||
+ !info->attrs[IEEE802154_ATTR_REASON])
+ return -EINVAL;
+
+ dev = ieee802154_nl_get_dev(info);
+ if (!dev)
+ return -ENODEV;
+ if (!ieee802154_mlme_ops(dev)->disassoc_req)
+ goto out;
+
+ if (info->attrs[IEEE802154_ATTR_DEST_HW_ADDR]) {
+ addr.mode = IEEE802154_ADDR_LONG;
+ addr.extended_addr = nla_get_hwaddr(
+ info->attrs[IEEE802154_ATTR_DEST_HW_ADDR]);
+ } else {
+ addr.mode = IEEE802154_ADDR_SHORT;
+ addr.short_addr = nla_get_shortaddr(
+ info->attrs[IEEE802154_ATTR_DEST_SHORT_ADDR]);
+ }
+ rtnl_lock();
+ addr.pan_id = dev->ieee802154_ptr->pan_id;
+ rtnl_unlock();
+
+ ret = ieee802154_mlme_ops(dev)->disassoc_req(dev, &addr,
+ nla_get_u8(info->attrs[IEEE802154_ATTR_REASON]));
+
+out:
+ dev_put(dev);
+ return ret;
+}
+
+/* PANid, channel, beacon_order = 15, superframe_order = 15,
+ * PAN_coordinator, battery_life_extension = 0,
+ * coord_realignment = 0, security_enable = 0
+*/
+int ieee802154_start_req(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net_device *dev;
+ struct ieee802154_addr addr;
+
+ u8 channel, bcn_ord, sf_ord;
+ u8 page;
+ int pan_coord, blx, coord_realign;
+ int ret = -EBUSY;
+
+ if (!info->attrs[IEEE802154_ATTR_COORD_PAN_ID] ||
+ !info->attrs[IEEE802154_ATTR_COORD_SHORT_ADDR] ||
+ !info->attrs[IEEE802154_ATTR_CHANNEL] ||
+ !info->attrs[IEEE802154_ATTR_BCN_ORD] ||
+ !info->attrs[IEEE802154_ATTR_SF_ORD] ||
+ !info->attrs[IEEE802154_ATTR_PAN_COORD] ||
+ !info->attrs[IEEE802154_ATTR_BAT_EXT] ||
+ !info->attrs[IEEE802154_ATTR_COORD_REALIGN]
+ )
+ return -EINVAL;
+
+ dev = ieee802154_nl_get_dev(info);
+ if (!dev)
+ return -ENODEV;
+
+ if (netif_running(dev))
+ goto out;
+
+ if (!ieee802154_mlme_ops(dev)->start_req) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ addr.mode = IEEE802154_ADDR_SHORT;
+ addr.short_addr = nla_get_shortaddr(
+ info->attrs[IEEE802154_ATTR_COORD_SHORT_ADDR]);
+ addr.pan_id = nla_get_shortaddr(
+ info->attrs[IEEE802154_ATTR_COORD_PAN_ID]);
+
+ channel = nla_get_u8(info->attrs[IEEE802154_ATTR_CHANNEL]);
+ bcn_ord = nla_get_u8(info->attrs[IEEE802154_ATTR_BCN_ORD]);
+ sf_ord = nla_get_u8(info->attrs[IEEE802154_ATTR_SF_ORD]);
+ pan_coord = nla_get_u8(info->attrs[IEEE802154_ATTR_PAN_COORD]);
+ blx = nla_get_u8(info->attrs[IEEE802154_ATTR_BAT_EXT]);
+ coord_realign = nla_get_u8(info->attrs[IEEE802154_ATTR_COORD_REALIGN]);
+
+ page = nla_get_u8_default(info->attrs[IEEE802154_ATTR_PAGE], 0);
+
+ if (addr.short_addr == cpu_to_le16(IEEE802154_ADDR_BROADCAST)) {
+ ieee802154_nl_start_confirm(dev, IEEE802154_NO_SHORT_ADDRESS);
+ dev_put(dev);
+ return -EINVAL;
+ }
+
+ rtnl_lock();
+ ret = ieee802154_mlme_ops(dev)->start_req(dev, &addr, channel, page,
+ bcn_ord, sf_ord, pan_coord, blx, coord_realign);
+ rtnl_unlock();
+
+ /* FIXME: add validation for unused parameters to be sane
+ * for SoftMAC
+ */
+ ieee802154_nl_start_confirm(dev, IEEE802154_SUCCESS);
+
+out:
+ dev_put(dev);
+ return ret;
+}
+
+int ieee802154_scan_req(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net_device *dev;
+ int ret = -EOPNOTSUPP;
+ u8 type;
+ u32 channels;
+ u8 duration;
+ u8 page;
+
+ if (!info->attrs[IEEE802154_ATTR_SCAN_TYPE] ||
+ !info->attrs[IEEE802154_ATTR_CHANNELS] ||
+ !info->attrs[IEEE802154_ATTR_DURATION])
+ return -EINVAL;
+
+ dev = ieee802154_nl_get_dev(info);
+ if (!dev)
+ return -ENODEV;
+ if (!ieee802154_mlme_ops(dev)->scan_req)
+ goto out;
+
+ type = nla_get_u8(info->attrs[IEEE802154_ATTR_SCAN_TYPE]);
+ channels = nla_get_u32(info->attrs[IEEE802154_ATTR_CHANNELS]);
+ duration = nla_get_u8(info->attrs[IEEE802154_ATTR_DURATION]);
+
+ page = nla_get_u8_default(info->attrs[IEEE802154_ATTR_PAGE], 0);
+
+ ret = ieee802154_mlme_ops(dev)->scan_req(dev, type, channels,
+ page, duration);
+
+out:
+ dev_put(dev);
+ return ret;
+}
+
+int ieee802154_list_iface(struct sk_buff *skb, struct genl_info *info)
+{
+ /* Request for interface name, index, type, IEEE address,
+ * PAN Id, short address
+ */
+ struct sk_buff *msg;
+ struct net_device *dev = NULL;
+ int rc = -ENOBUFS;
+
+ pr_debug("%s\n", __func__);
+
+ dev = ieee802154_nl_get_dev(info);
+ if (!dev)
+ return -ENODEV;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ goto out_dev;
+
+ rc = ieee802154_nl_fill_iface(msg, info->snd_portid, info->snd_seq,
+ 0, dev);
+ if (rc < 0)
+ goto out_free;
+
+ dev_put(dev);
+
+ return genlmsg_reply(msg, info);
+out_free:
+ nlmsg_free(msg);
+out_dev:
+ dev_put(dev);
+ return rc;
+}
+
+int ieee802154_dump_iface(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ struct net_device *dev;
+ int idx;
+ int s_idx = cb->args[0];
+
+ pr_debug("%s\n", __func__);
+
+ idx = 0;
+ for_each_netdev(net, dev) {
+ if (idx < s_idx || dev->type != ARPHRD_IEEE802154)
+ goto cont;
+
+ if (ieee802154_nl_fill_iface(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NLM_F_MULTI, dev) < 0)
+ break;
+cont:
+ idx++;
+ }
+ cb->args[0] = idx;
+
+ return skb->len;
+}
+
+int ieee802154_set_macparams(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net_device *dev = NULL;
+ struct ieee802154_mlme_ops *ops;
+ struct ieee802154_mac_params params;
+ struct wpan_phy *phy;
+ int rc = -EINVAL;
+
+ pr_debug("%s\n", __func__);
+
+ dev = ieee802154_nl_get_dev(info);
+ if (!dev)
+ return -ENODEV;
+
+ ops = ieee802154_mlme_ops(dev);
+
+ if (!ops->get_mac_params || !ops->set_mac_params) {
+ rc = -EOPNOTSUPP;
+ goto out;
+ }
+
+ if (netif_running(dev)) {
+ rc = -EBUSY;
+ goto out;
+ }
+
+ if (!info->attrs[IEEE802154_ATTR_LBT_ENABLED] &&
+ !info->attrs[IEEE802154_ATTR_CCA_MODE] &&
+ !info->attrs[IEEE802154_ATTR_CCA_ED_LEVEL] &&
+ !info->attrs[IEEE802154_ATTR_CSMA_RETRIES] &&
+ !info->attrs[IEEE802154_ATTR_CSMA_MIN_BE] &&
+ !info->attrs[IEEE802154_ATTR_CSMA_MAX_BE] &&
+ !info->attrs[IEEE802154_ATTR_FRAME_RETRIES])
+ goto out;
+
+ phy = dev->ieee802154_ptr->wpan_phy;
+ get_device(&phy->dev);
+
+ rtnl_lock();
+ ops->get_mac_params(dev, &params);
+
+ if (info->attrs[IEEE802154_ATTR_TXPOWER])
+ params.transmit_power = nla_get_s8(info->attrs[IEEE802154_ATTR_TXPOWER]) * 100;
+
+ if (info->attrs[IEEE802154_ATTR_LBT_ENABLED])
+ params.lbt = nla_get_u8(info->attrs[IEEE802154_ATTR_LBT_ENABLED]);
+
+ if (info->attrs[IEEE802154_ATTR_CCA_MODE])
+ params.cca.mode = nla_get_u8(info->attrs[IEEE802154_ATTR_CCA_MODE]);
+
+ if (info->attrs[IEEE802154_ATTR_CCA_ED_LEVEL])
+ params.cca_ed_level = nla_get_s32(info->attrs[IEEE802154_ATTR_CCA_ED_LEVEL]) * 100;
+
+ if (info->attrs[IEEE802154_ATTR_CSMA_RETRIES])
+ params.csma_retries = nla_get_u8(info->attrs[IEEE802154_ATTR_CSMA_RETRIES]);
+
+ if (info->attrs[IEEE802154_ATTR_CSMA_MIN_BE])
+ params.min_be = nla_get_u8(info->attrs[IEEE802154_ATTR_CSMA_MIN_BE]);
+
+ if (info->attrs[IEEE802154_ATTR_CSMA_MAX_BE])
+ params.max_be = nla_get_u8(info->attrs[IEEE802154_ATTR_CSMA_MAX_BE]);
+
+ if (info->attrs[IEEE802154_ATTR_FRAME_RETRIES])
+ params.frame_retries = nla_get_s8(info->attrs[IEEE802154_ATTR_FRAME_RETRIES]);
+
+ rc = ops->set_mac_params(dev, &params);
+ rtnl_unlock();
+
+ wpan_phy_put(phy);
+ dev_put(dev);
+
+ return 0;
+
+out:
+ dev_put(dev);
+ return rc;
+}
+
+static int
+ieee802154_llsec_parse_key_id(struct genl_info *info,
+ struct ieee802154_llsec_key_id *desc)
+{
+ memset(desc, 0, sizeof(*desc));
+
+ if (!info->attrs[IEEE802154_ATTR_LLSEC_KEY_MODE])
+ return -EINVAL;
+
+ desc->mode = nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_KEY_MODE]);
+
+ if (desc->mode == IEEE802154_SCF_KEY_IMPLICIT) {
+ if (!info->attrs[IEEE802154_ATTR_PAN_ID])
+ return -EINVAL;
+
+ desc->device_addr.pan_id = nla_get_shortaddr(info->attrs[IEEE802154_ATTR_PAN_ID]);
+
+ if (info->attrs[IEEE802154_ATTR_SHORT_ADDR]) {
+ desc->device_addr.mode = IEEE802154_ADDR_SHORT;
+ desc->device_addr.short_addr = nla_get_shortaddr(info->attrs[IEEE802154_ATTR_SHORT_ADDR]);
+ } else {
+ if (!info->attrs[IEEE802154_ATTR_HW_ADDR])
+ return -EINVAL;
+
+ desc->device_addr.mode = IEEE802154_ADDR_LONG;
+ desc->device_addr.extended_addr = nla_get_hwaddr(info->attrs[IEEE802154_ATTR_HW_ADDR]);
+ }
+ }
+
+ if (desc->mode != IEEE802154_SCF_KEY_IMPLICIT &&
+ !info->attrs[IEEE802154_ATTR_LLSEC_KEY_ID])
+ return -EINVAL;
+
+ if (desc->mode == IEEE802154_SCF_KEY_SHORT_INDEX &&
+ !info->attrs[IEEE802154_ATTR_LLSEC_KEY_SOURCE_SHORT])
+ return -EINVAL;
+
+ if (desc->mode == IEEE802154_SCF_KEY_HW_INDEX &&
+ !info->attrs[IEEE802154_ATTR_LLSEC_KEY_SOURCE_EXTENDED])
+ return -EINVAL;
+
+ if (desc->mode != IEEE802154_SCF_KEY_IMPLICIT)
+ desc->id = nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_KEY_ID]);
+
+ switch (desc->mode) {
+ case IEEE802154_SCF_KEY_SHORT_INDEX:
+ {
+ u32 source = nla_get_u32(info->attrs[IEEE802154_ATTR_LLSEC_KEY_SOURCE_SHORT]);
+
+ desc->short_source = cpu_to_le32(source);
+ break;
+ }
+ case IEEE802154_SCF_KEY_HW_INDEX:
+ desc->extended_source = nla_get_hwaddr(info->attrs[IEEE802154_ATTR_LLSEC_KEY_SOURCE_EXTENDED]);
+ break;
+ }
+
+ return 0;
+}
+
+static int
+ieee802154_llsec_fill_key_id(struct sk_buff *msg,
+ const struct ieee802154_llsec_key_id *desc)
+{
+ if (nla_put_u8(msg, IEEE802154_ATTR_LLSEC_KEY_MODE, desc->mode))
+ return -EMSGSIZE;
+
+ if (desc->mode == IEEE802154_SCF_KEY_IMPLICIT) {
+ if (nla_put_shortaddr(msg, IEEE802154_ATTR_PAN_ID,
+ desc->device_addr.pan_id))
+ return -EMSGSIZE;
+
+ if (desc->device_addr.mode == IEEE802154_ADDR_SHORT &&
+ nla_put_shortaddr(msg, IEEE802154_ATTR_SHORT_ADDR,
+ desc->device_addr.short_addr))
+ return -EMSGSIZE;
+
+ if (desc->device_addr.mode == IEEE802154_ADDR_LONG &&
+ nla_put_hwaddr(msg, IEEE802154_ATTR_HW_ADDR,
+ desc->device_addr.extended_addr,
+ IEEE802154_ATTR_PAD))
+ return -EMSGSIZE;
+ }
+
+ if (desc->mode != IEEE802154_SCF_KEY_IMPLICIT &&
+ nla_put_u8(msg, IEEE802154_ATTR_LLSEC_KEY_ID, desc->id))
+ return -EMSGSIZE;
+
+ if (desc->mode == IEEE802154_SCF_KEY_SHORT_INDEX &&
+ nla_put_u32(msg, IEEE802154_ATTR_LLSEC_KEY_SOURCE_SHORT,
+ le32_to_cpu(desc->short_source)))
+ return -EMSGSIZE;
+
+ if (desc->mode == IEEE802154_SCF_KEY_HW_INDEX &&
+ nla_put_hwaddr(msg, IEEE802154_ATTR_LLSEC_KEY_SOURCE_EXTENDED,
+ desc->extended_source, IEEE802154_ATTR_PAD))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+int ieee802154_llsec_getparams(struct sk_buff *skb, struct genl_info *info)
+{
+ struct sk_buff *msg;
+ struct net_device *dev = NULL;
+ int rc = -ENOBUFS;
+ struct ieee802154_mlme_ops *ops;
+ void *hdr;
+ struct ieee802154_llsec_params params;
+
+ pr_debug("%s\n", __func__);
+
+ dev = ieee802154_nl_get_dev(info);
+ if (!dev)
+ return -ENODEV;
+
+ ops = ieee802154_mlme_ops(dev);
+ if (!ops->llsec) {
+ rc = -EOPNOTSUPP;
+ goto out_dev;
+ }
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ goto out_dev;
+
+ hdr = genlmsg_put(msg, 0, info->snd_seq, &nl802154_family, 0,
+ IEEE802154_LLSEC_GETPARAMS);
+ if (!hdr)
+ goto out_free;
+
+ rc = ops->llsec->get_params(dev, &params);
+ if (rc < 0)
+ goto out_free;
+
+ if (nla_put_string(msg, IEEE802154_ATTR_DEV_NAME, dev->name) ||
+ nla_put_u32(msg, IEEE802154_ATTR_DEV_INDEX, dev->ifindex) ||
+ nla_put_u8(msg, IEEE802154_ATTR_LLSEC_ENABLED, params.enabled) ||
+ nla_put_u8(msg, IEEE802154_ATTR_LLSEC_SECLEVEL, params.out_level) ||
+ nla_put_u32(msg, IEEE802154_ATTR_LLSEC_FRAME_COUNTER,
+ be32_to_cpu(params.frame_counter)) ||
+ ieee802154_llsec_fill_key_id(msg, &params.out_key)) {
+ rc = -ENOBUFS;
+ goto out_free;
+ }
+
+ dev_put(dev);
+
+ return ieee802154_nl_reply(msg, info);
+out_free:
+ nlmsg_free(msg);
+out_dev:
+ dev_put(dev);
+ return rc;
+}
+
+int ieee802154_llsec_setparams(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net_device *dev = NULL;
+ int rc = -EINVAL;
+ struct ieee802154_mlme_ops *ops;
+ struct ieee802154_llsec_params params;
+ int changed = 0;
+
+ pr_debug("%s\n", __func__);
+
+ dev = ieee802154_nl_get_dev(info);
+ if (!dev)
+ return -ENODEV;
+
+ if (!info->attrs[IEEE802154_ATTR_LLSEC_ENABLED] &&
+ !info->attrs[IEEE802154_ATTR_LLSEC_KEY_MODE] &&
+ !info->attrs[IEEE802154_ATTR_LLSEC_SECLEVEL])
+ goto out;
+
+ ops = ieee802154_mlme_ops(dev);
+ if (!ops->llsec) {
+ rc = -EOPNOTSUPP;
+ goto out;
+ }
+
+ if (info->attrs[IEEE802154_ATTR_LLSEC_SECLEVEL] &&
+ nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_SECLEVEL]) > 7)
+ goto out;
+
+ if (info->attrs[IEEE802154_ATTR_LLSEC_ENABLED]) {
+ params.enabled = nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_ENABLED]);
+ changed |= IEEE802154_LLSEC_PARAM_ENABLED;
+ }
+
+ if (info->attrs[IEEE802154_ATTR_LLSEC_KEY_MODE]) {
+ if (ieee802154_llsec_parse_key_id(info, &params.out_key))
+ goto out;
+
+ changed |= IEEE802154_LLSEC_PARAM_OUT_KEY;
+ }
+
+ if (info->attrs[IEEE802154_ATTR_LLSEC_SECLEVEL]) {
+ params.out_level = nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_SECLEVEL]);
+ changed |= IEEE802154_LLSEC_PARAM_OUT_LEVEL;
+ }
+
+ if (info->attrs[IEEE802154_ATTR_LLSEC_FRAME_COUNTER]) {
+ u32 fc = nla_get_u32(info->attrs[IEEE802154_ATTR_LLSEC_FRAME_COUNTER]);
+
+ params.frame_counter = cpu_to_be32(fc);
+ changed |= IEEE802154_LLSEC_PARAM_FRAME_COUNTER;
+ }
+
+ rc = ops->llsec->set_params(dev, &params, changed);
+
+ dev_put(dev);
+
+ return rc;
+out:
+ dev_put(dev);
+ return rc;
+}
+
+struct llsec_dump_data {
+ struct sk_buff *skb;
+ int s_idx, s_idx2;
+ int portid;
+ int nlmsg_seq;
+ struct net_device *dev;
+ struct ieee802154_mlme_ops *ops;
+ struct ieee802154_llsec_table *table;
+};
+
+static int
+ieee802154_llsec_dump_table(struct sk_buff *skb, struct netlink_callback *cb,
+ int (*step)(struct llsec_dump_data *))
+{
+ struct net *net = sock_net(skb->sk);
+ struct net_device *dev;
+ struct llsec_dump_data data;
+ int idx = 0;
+ int first_dev = cb->args[0];
+ int rc;
+
+ for_each_netdev(net, dev) {
+ if (idx < first_dev || dev->type != ARPHRD_IEEE802154)
+ goto skip;
+
+ data.ops = ieee802154_mlme_ops(dev);
+ if (!data.ops->llsec)
+ goto skip;
+
+ data.skb = skb;
+ data.s_idx = cb->args[1];
+ data.s_idx2 = cb->args[2];
+ data.dev = dev;
+ data.portid = NETLINK_CB(cb->skb).portid;
+ data.nlmsg_seq = cb->nlh->nlmsg_seq;
+
+ data.ops->llsec->lock_table(dev);
+ data.ops->llsec->get_table(data.dev, &data.table);
+ rc = step(&data);
+ data.ops->llsec->unlock_table(dev);
+
+ if (rc < 0)
+ break;
+
+skip:
+ idx++;
+ }
+ cb->args[0] = idx;
+
+ return skb->len;
+}
+
+static int
+ieee802154_nl_llsec_change(struct sk_buff *skb, struct genl_info *info,
+ int (*fn)(struct net_device*, struct genl_info*))
+{
+ struct net_device *dev = NULL;
+ int rc = -EINVAL;
+
+ dev = ieee802154_nl_get_dev(info);
+ if (!dev)
+ return -ENODEV;
+
+ if (!ieee802154_mlme_ops(dev)->llsec)
+ rc = -EOPNOTSUPP;
+ else
+ rc = fn(dev, info);
+
+ dev_put(dev);
+ return rc;
+}
+
+static int
+ieee802154_llsec_parse_key(struct genl_info *info,
+ struct ieee802154_llsec_key *key)
+{
+ u8 frames;
+ u32 commands[256 / 32];
+
+ memset(key, 0, sizeof(*key));
+
+ if (!info->attrs[IEEE802154_ATTR_LLSEC_KEY_USAGE_FRAME_TYPES] ||
+ !info->attrs[IEEE802154_ATTR_LLSEC_KEY_BYTES])
+ return -EINVAL;
+
+ frames = nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_KEY_USAGE_FRAME_TYPES]);
+ if ((frames & BIT(IEEE802154_FC_TYPE_MAC_CMD)) &&
+ !info->attrs[IEEE802154_ATTR_LLSEC_KEY_USAGE_COMMANDS])
+ return -EINVAL;
+
+ if (info->attrs[IEEE802154_ATTR_LLSEC_KEY_USAGE_COMMANDS]) {
+ nla_memcpy(commands,
+ info->attrs[IEEE802154_ATTR_LLSEC_KEY_USAGE_COMMANDS],
+ 256 / 8);
+
+ if (commands[0] || commands[1] || commands[2] || commands[3] ||
+ commands[4] || commands[5] || commands[6] ||
+ commands[7] >= BIT(IEEE802154_CMD_GTS_REQ + 1))
+ return -EINVAL;
+
+ key->cmd_frame_ids = commands[7];
+ }
+
+ key->frame_types = frames;
+
+ nla_memcpy(key->key, info->attrs[IEEE802154_ATTR_LLSEC_KEY_BYTES],
+ IEEE802154_LLSEC_KEY_SIZE);
+
+ return 0;
+}
+
+static int llsec_add_key(struct net_device *dev, struct genl_info *info)
+{
+ struct ieee802154_mlme_ops *ops = ieee802154_mlme_ops(dev);
+ struct ieee802154_llsec_key key;
+ struct ieee802154_llsec_key_id id;
+
+ if (ieee802154_llsec_parse_key(info, &key) ||
+ ieee802154_llsec_parse_key_id(info, &id))
+ return -EINVAL;
+
+ return ops->llsec->add_key(dev, &id, &key);
+}
+
+int ieee802154_llsec_add_key(struct sk_buff *skb, struct genl_info *info)
+{
+ if ((info->nlhdr->nlmsg_flags & (NLM_F_CREATE | NLM_F_EXCL)) !=
+ (NLM_F_CREATE | NLM_F_EXCL))
+ return -EINVAL;
+
+ return ieee802154_nl_llsec_change(skb, info, llsec_add_key);
+}
+
+static int llsec_remove_key(struct net_device *dev, struct genl_info *info)
+{
+ struct ieee802154_mlme_ops *ops = ieee802154_mlme_ops(dev);
+ struct ieee802154_llsec_key_id id;
+
+ if (ieee802154_llsec_parse_key_id(info, &id))
+ return -EINVAL;
+
+ return ops->llsec->del_key(dev, &id);
+}
+
+int ieee802154_llsec_del_key(struct sk_buff *skb, struct genl_info *info)
+{
+ return ieee802154_nl_llsec_change(skb, info, llsec_remove_key);
+}
+
+static int
+ieee802154_nl_fill_key(struct sk_buff *msg, u32 portid, u32 seq,
+ const struct ieee802154_llsec_key_entry *key,
+ const struct net_device *dev)
+{
+ void *hdr;
+ u32 commands[256 / 32];
+
+ hdr = genlmsg_put(msg, 0, seq, &nl802154_family, NLM_F_MULTI,
+ IEEE802154_LLSEC_LIST_KEY);
+ if (!hdr)
+ goto out;
+
+ if (nla_put_string(msg, IEEE802154_ATTR_DEV_NAME, dev->name) ||
+ nla_put_u32(msg, IEEE802154_ATTR_DEV_INDEX, dev->ifindex) ||
+ ieee802154_llsec_fill_key_id(msg, &key->id) ||
+ nla_put_u8(msg, IEEE802154_ATTR_LLSEC_KEY_USAGE_FRAME_TYPES,
+ key->key->frame_types))
+ goto nla_put_failure;
+
+ if (key->key->frame_types & BIT(IEEE802154_FC_TYPE_MAC_CMD)) {
+ memset(commands, 0, sizeof(commands));
+ commands[7] = key->key->cmd_frame_ids;
+ if (nla_put(msg, IEEE802154_ATTR_LLSEC_KEY_USAGE_COMMANDS,
+ sizeof(commands), commands))
+ goto nla_put_failure;
+ }
+
+ if (nla_put(msg, IEEE802154_ATTR_LLSEC_KEY_BYTES,
+ IEEE802154_LLSEC_KEY_SIZE, key->key->key))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+out:
+ return -EMSGSIZE;
+}
+
+static int llsec_iter_keys(struct llsec_dump_data *data)
+{
+ struct ieee802154_llsec_key_entry *pos;
+ int rc = 0, idx = 0;
+
+ list_for_each_entry(pos, &data->table->keys, list) {
+ if (idx++ < data->s_idx)
+ continue;
+
+ if (ieee802154_nl_fill_key(data->skb, data->portid,
+ data->nlmsg_seq, pos, data->dev)) {
+ rc = -EMSGSIZE;
+ break;
+ }
+
+ data->s_idx++;
+ }
+
+ return rc;
+}
+
+int ieee802154_llsec_dump_keys(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ return ieee802154_llsec_dump_table(skb, cb, llsec_iter_keys);
+}
+
+static int
+llsec_parse_dev(struct genl_info *info,
+ struct ieee802154_llsec_device *dev)
+{
+ memset(dev, 0, sizeof(*dev));
+
+ if (!info->attrs[IEEE802154_ATTR_LLSEC_FRAME_COUNTER] ||
+ !info->attrs[IEEE802154_ATTR_HW_ADDR] ||
+ !info->attrs[IEEE802154_ATTR_LLSEC_DEV_OVERRIDE] ||
+ !info->attrs[IEEE802154_ATTR_LLSEC_DEV_KEY_MODE] ||
+ (!!info->attrs[IEEE802154_ATTR_PAN_ID] !=
+ !!info->attrs[IEEE802154_ATTR_SHORT_ADDR]))
+ return -EINVAL;
+
+ if (info->attrs[IEEE802154_ATTR_PAN_ID]) {
+ dev->pan_id = nla_get_shortaddr(info->attrs[IEEE802154_ATTR_PAN_ID]);
+ dev->short_addr = nla_get_shortaddr(info->attrs[IEEE802154_ATTR_SHORT_ADDR]);
+ } else {
+ dev->short_addr = cpu_to_le16(IEEE802154_ADDR_UNDEF);
+ }
+
+ dev->hwaddr = nla_get_hwaddr(info->attrs[IEEE802154_ATTR_HW_ADDR]);
+ dev->frame_counter = nla_get_u32(info->attrs[IEEE802154_ATTR_LLSEC_FRAME_COUNTER]);
+ dev->seclevel_exempt = !!nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_DEV_OVERRIDE]);
+ dev->key_mode = nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_DEV_KEY_MODE]);
+
+ if (dev->key_mode >= __IEEE802154_LLSEC_DEVKEY_MAX)
+ return -EINVAL;
+
+ return 0;
+}
+
+static int llsec_add_dev(struct net_device *dev, struct genl_info *info)
+{
+ struct ieee802154_mlme_ops *ops = ieee802154_mlme_ops(dev);
+ struct ieee802154_llsec_device desc;
+
+ if (llsec_parse_dev(info, &desc))
+ return -EINVAL;
+
+ return ops->llsec->add_dev(dev, &desc);
+}
+
+int ieee802154_llsec_add_dev(struct sk_buff *skb, struct genl_info *info)
+{
+ if ((info->nlhdr->nlmsg_flags & (NLM_F_CREATE | NLM_F_EXCL)) !=
+ (NLM_F_CREATE | NLM_F_EXCL))
+ return -EINVAL;
+
+ return ieee802154_nl_llsec_change(skb, info, llsec_add_dev);
+}
+
+static int llsec_del_dev(struct net_device *dev, struct genl_info *info)
+{
+ struct ieee802154_mlme_ops *ops = ieee802154_mlme_ops(dev);
+ __le64 devaddr;
+
+ if (!info->attrs[IEEE802154_ATTR_HW_ADDR])
+ return -EINVAL;
+
+ devaddr = nla_get_hwaddr(info->attrs[IEEE802154_ATTR_HW_ADDR]);
+
+ return ops->llsec->del_dev(dev, devaddr);
+}
+
+int ieee802154_llsec_del_dev(struct sk_buff *skb, struct genl_info *info)
+{
+ return ieee802154_nl_llsec_change(skb, info, llsec_del_dev);
+}
+
+static int
+ieee802154_nl_fill_dev(struct sk_buff *msg, u32 portid, u32 seq,
+ const struct ieee802154_llsec_device *desc,
+ const struct net_device *dev)
+{
+ void *hdr;
+
+ hdr = genlmsg_put(msg, 0, seq, &nl802154_family, NLM_F_MULTI,
+ IEEE802154_LLSEC_LIST_DEV);
+ if (!hdr)
+ goto out;
+
+ if (nla_put_string(msg, IEEE802154_ATTR_DEV_NAME, dev->name) ||
+ nla_put_u32(msg, IEEE802154_ATTR_DEV_INDEX, dev->ifindex) ||
+ nla_put_shortaddr(msg, IEEE802154_ATTR_PAN_ID, desc->pan_id) ||
+ nla_put_shortaddr(msg, IEEE802154_ATTR_SHORT_ADDR,
+ desc->short_addr) ||
+ nla_put_hwaddr(msg, IEEE802154_ATTR_HW_ADDR, desc->hwaddr,
+ IEEE802154_ATTR_PAD) ||
+ nla_put_u32(msg, IEEE802154_ATTR_LLSEC_FRAME_COUNTER,
+ desc->frame_counter) ||
+ nla_put_u8(msg, IEEE802154_ATTR_LLSEC_DEV_OVERRIDE,
+ desc->seclevel_exempt) ||
+ nla_put_u8(msg, IEEE802154_ATTR_LLSEC_DEV_KEY_MODE, desc->key_mode))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+out:
+ return -EMSGSIZE;
+}
+
+static int llsec_iter_devs(struct llsec_dump_data *data)
+{
+ struct ieee802154_llsec_device *pos;
+ int rc = 0, idx = 0;
+
+ list_for_each_entry(pos, &data->table->devices, list) {
+ if (idx++ < data->s_idx)
+ continue;
+
+ if (ieee802154_nl_fill_dev(data->skb, data->portid,
+ data->nlmsg_seq, pos, data->dev)) {
+ rc = -EMSGSIZE;
+ break;
+ }
+
+ data->s_idx++;
+ }
+
+ return rc;
+}
+
+int ieee802154_llsec_dump_devs(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ return ieee802154_llsec_dump_table(skb, cb, llsec_iter_devs);
+}
+
+static int llsec_add_devkey(struct net_device *dev, struct genl_info *info)
+{
+ struct ieee802154_mlme_ops *ops = ieee802154_mlme_ops(dev);
+ struct ieee802154_llsec_device_key key;
+ __le64 devaddr;
+
+ if (!info->attrs[IEEE802154_ATTR_LLSEC_FRAME_COUNTER] ||
+ !info->attrs[IEEE802154_ATTR_HW_ADDR] ||
+ ieee802154_llsec_parse_key_id(info, &key.key_id))
+ return -EINVAL;
+
+ devaddr = nla_get_hwaddr(info->attrs[IEEE802154_ATTR_HW_ADDR]);
+ key.frame_counter = nla_get_u32(info->attrs[IEEE802154_ATTR_LLSEC_FRAME_COUNTER]);
+
+ return ops->llsec->add_devkey(dev, devaddr, &key);
+}
+
+int ieee802154_llsec_add_devkey(struct sk_buff *skb, struct genl_info *info)
+{
+ if ((info->nlhdr->nlmsg_flags & (NLM_F_CREATE | NLM_F_EXCL)) !=
+ (NLM_F_CREATE | NLM_F_EXCL))
+ return -EINVAL;
+
+ return ieee802154_nl_llsec_change(skb, info, llsec_add_devkey);
+}
+
+static int llsec_del_devkey(struct net_device *dev, struct genl_info *info)
+{
+ struct ieee802154_mlme_ops *ops = ieee802154_mlme_ops(dev);
+ struct ieee802154_llsec_device_key key;
+ __le64 devaddr;
+
+ if (!info->attrs[IEEE802154_ATTR_HW_ADDR] ||
+ ieee802154_llsec_parse_key_id(info, &key.key_id))
+ return -EINVAL;
+
+ devaddr = nla_get_hwaddr(info->attrs[IEEE802154_ATTR_HW_ADDR]);
+
+ return ops->llsec->del_devkey(dev, devaddr, &key);
+}
+
+int ieee802154_llsec_del_devkey(struct sk_buff *skb, struct genl_info *info)
+{
+ return ieee802154_nl_llsec_change(skb, info, llsec_del_devkey);
+}
+
+static int
+ieee802154_nl_fill_devkey(struct sk_buff *msg, u32 portid, u32 seq,
+ __le64 devaddr,
+ const struct ieee802154_llsec_device_key *devkey,
+ const struct net_device *dev)
+{
+ void *hdr;
+
+ hdr = genlmsg_put(msg, 0, seq, &nl802154_family, NLM_F_MULTI,
+ IEEE802154_LLSEC_LIST_DEVKEY);
+ if (!hdr)
+ goto out;
+
+ if (nla_put_string(msg, IEEE802154_ATTR_DEV_NAME, dev->name) ||
+ nla_put_u32(msg, IEEE802154_ATTR_DEV_INDEX, dev->ifindex) ||
+ nla_put_hwaddr(msg, IEEE802154_ATTR_HW_ADDR, devaddr,
+ IEEE802154_ATTR_PAD) ||
+ nla_put_u32(msg, IEEE802154_ATTR_LLSEC_FRAME_COUNTER,
+ devkey->frame_counter) ||
+ ieee802154_llsec_fill_key_id(msg, &devkey->key_id))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+out:
+ return -EMSGSIZE;
+}
+
+static int llsec_iter_devkeys(struct llsec_dump_data *data)
+{
+ struct ieee802154_llsec_device *dpos;
+ struct ieee802154_llsec_device_key *kpos;
+ int idx = 0, idx2;
+
+ list_for_each_entry(dpos, &data->table->devices, list) {
+ if (idx++ < data->s_idx)
+ continue;
+
+ idx2 = 0;
+
+ list_for_each_entry(kpos, &dpos->keys, list) {
+ if (idx2++ < data->s_idx2)
+ continue;
+
+ if (ieee802154_nl_fill_devkey(data->skb, data->portid,
+ data->nlmsg_seq,
+ dpos->hwaddr, kpos,
+ data->dev)) {
+ return -EMSGSIZE;
+ }
+
+ data->s_idx2++;
+ }
+
+ data->s_idx++;
+ }
+
+ return 0;
+}
+
+int ieee802154_llsec_dump_devkeys(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ return ieee802154_llsec_dump_table(skb, cb, llsec_iter_devkeys);
+}
+
+static int
+llsec_parse_seclevel(struct genl_info *info,
+ struct ieee802154_llsec_seclevel *sl)
+{
+ memset(sl, 0, sizeof(*sl));
+
+ if (!info->attrs[IEEE802154_ATTR_LLSEC_FRAME_TYPE] ||
+ !info->attrs[IEEE802154_ATTR_LLSEC_SECLEVELS] ||
+ !info->attrs[IEEE802154_ATTR_LLSEC_DEV_OVERRIDE])
+ return -EINVAL;
+
+ sl->frame_type = nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_FRAME_TYPE]);
+ if (sl->frame_type == IEEE802154_FC_TYPE_MAC_CMD) {
+ if (!info->attrs[IEEE802154_ATTR_LLSEC_CMD_FRAME_ID])
+ return -EINVAL;
+
+ sl->cmd_frame_id = nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_CMD_FRAME_ID]);
+ }
+
+ sl->sec_levels = nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_SECLEVELS]);
+ sl->device_override = nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_DEV_OVERRIDE]);
+
+ return 0;
+}
+
+static int llsec_add_seclevel(struct net_device *dev, struct genl_info *info)
+{
+ struct ieee802154_mlme_ops *ops = ieee802154_mlme_ops(dev);
+ struct ieee802154_llsec_seclevel sl;
+
+ if (llsec_parse_seclevel(info, &sl))
+ return -EINVAL;
+
+ return ops->llsec->add_seclevel(dev, &sl);
+}
+
+int ieee802154_llsec_add_seclevel(struct sk_buff *skb, struct genl_info *info)
+{
+ if ((info->nlhdr->nlmsg_flags & (NLM_F_CREATE | NLM_F_EXCL)) !=
+ (NLM_F_CREATE | NLM_F_EXCL))
+ return -EINVAL;
+
+ return ieee802154_nl_llsec_change(skb, info, llsec_add_seclevel);
+}
+
+static int llsec_del_seclevel(struct net_device *dev, struct genl_info *info)
+{
+ struct ieee802154_mlme_ops *ops = ieee802154_mlme_ops(dev);
+ struct ieee802154_llsec_seclevel sl;
+
+ if (llsec_parse_seclevel(info, &sl))
+ return -EINVAL;
+
+ return ops->llsec->del_seclevel(dev, &sl);
+}
+
+int ieee802154_llsec_del_seclevel(struct sk_buff *skb, struct genl_info *info)
+{
+ return ieee802154_nl_llsec_change(skb, info, llsec_del_seclevel);
+}
+
+static int
+ieee802154_nl_fill_seclevel(struct sk_buff *msg, u32 portid, u32 seq,
+ const struct ieee802154_llsec_seclevel *sl,
+ const struct net_device *dev)
+{
+ void *hdr;
+
+ hdr = genlmsg_put(msg, 0, seq, &nl802154_family, NLM_F_MULTI,
+ IEEE802154_LLSEC_LIST_SECLEVEL);
+ if (!hdr)
+ goto out;
+
+ if (nla_put_string(msg, IEEE802154_ATTR_DEV_NAME, dev->name) ||
+ nla_put_u32(msg, IEEE802154_ATTR_DEV_INDEX, dev->ifindex) ||
+ nla_put_u8(msg, IEEE802154_ATTR_LLSEC_FRAME_TYPE, sl->frame_type) ||
+ nla_put_u8(msg, IEEE802154_ATTR_LLSEC_SECLEVELS, sl->sec_levels) ||
+ nla_put_u8(msg, IEEE802154_ATTR_LLSEC_DEV_OVERRIDE,
+ sl->device_override))
+ goto nla_put_failure;
+
+ if (sl->frame_type == IEEE802154_FC_TYPE_MAC_CMD &&
+ nla_put_u8(msg, IEEE802154_ATTR_LLSEC_CMD_FRAME_ID,
+ sl->cmd_frame_id))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+out:
+ return -EMSGSIZE;
+}
+
+static int llsec_iter_seclevels(struct llsec_dump_data *data)
+{
+ struct ieee802154_llsec_seclevel *pos;
+ int rc = 0, idx = 0;
+
+ list_for_each_entry(pos, &data->table->security_levels, list) {
+ if (idx++ < data->s_idx)
+ continue;
+
+ if (ieee802154_nl_fill_seclevel(data->skb, data->portid,
+ data->nlmsg_seq, pos,
+ data->dev)) {
+ rc = -EMSGSIZE;
+ break;
+ }
+
+ data->s_idx++;
+ }
+
+ return rc;
+}
+
+int ieee802154_llsec_dump_seclevels(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ return ieee802154_llsec_dump_table(skb, cb, llsec_iter_seclevels);
+}
diff --git a/net/ieee802154/nl-phy.c b/net/ieee802154/nl-phy.c
new file mode 100644
index 000000000000..4c07a475c567
--- /dev/null
+++ b/net/ieee802154/nl-phy.c
@@ -0,0 +1,346 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Netlink interface for IEEE 802.15.4 stack
+ *
+ * Copyright 2007, 2008 Siemens AG
+ *
+ * Written by:
+ * Sergey Lapin <slapin@ossfans.org>
+ * Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
+ * Maxim Osipov <maxim.osipov@siemens.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/if_arp.h>
+#include <net/netlink.h>
+#include <net/genetlink.h>
+#include <net/cfg802154.h>
+#include <net/af_ieee802154.h>
+#include <net/ieee802154_netdev.h>
+#include <net/rtnetlink.h> /* for rtnl_{un,}lock */
+#include <linux/nl802154.h>
+
+#include "ieee802154.h"
+#include "rdev-ops.h"
+#include "core.h"
+
+static int ieee802154_nl_fill_phy(struct sk_buff *msg, u32 portid,
+ u32 seq, int flags, struct wpan_phy *phy)
+{
+ void *hdr;
+ int i, pages = 0;
+ u32 *buf = kcalloc(IEEE802154_MAX_PAGE + 1, sizeof(u32), GFP_KERNEL);
+
+ pr_debug("%s\n", __func__);
+
+ if (!buf)
+ return -EMSGSIZE;
+
+ hdr = genlmsg_put(msg, 0, seq, &nl802154_family, flags,
+ IEEE802154_LIST_PHY);
+ if (!hdr)
+ goto out;
+
+ rtnl_lock();
+ if (nla_put_string(msg, IEEE802154_ATTR_PHY_NAME, wpan_phy_name(phy)) ||
+ nla_put_u8(msg, IEEE802154_ATTR_PAGE, phy->current_page) ||
+ nla_put_u8(msg, IEEE802154_ATTR_CHANNEL, phy->current_channel))
+ goto nla_put_failure;
+ for (i = 0; i <= IEEE802154_MAX_PAGE; i++) {
+ if (phy->supported.channels[i])
+ buf[pages++] = phy->supported.channels[i] | (i << 27);
+ }
+ if (pages &&
+ nla_put(msg, IEEE802154_ATTR_CHANNEL_PAGE_LIST,
+ pages * sizeof(uint32_t), buf))
+ goto nla_put_failure;
+ rtnl_unlock();
+ kfree(buf);
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ rtnl_unlock();
+ genlmsg_cancel(msg, hdr);
+out:
+ kfree(buf);
+ return -EMSGSIZE;
+}
+
+int ieee802154_list_phy(struct sk_buff *skb, struct genl_info *info)
+{
+ /* Request for interface name, index, type, IEEE address,
+ * PAN Id, short address
+ */
+ struct sk_buff *msg;
+ struct wpan_phy *phy;
+ const char *name;
+ int rc = -ENOBUFS;
+
+ pr_debug("%s\n", __func__);
+
+ if (!info->attrs[IEEE802154_ATTR_PHY_NAME])
+ return -EINVAL;
+
+ name = nla_data(info->attrs[IEEE802154_ATTR_PHY_NAME]);
+ if (name[nla_len(info->attrs[IEEE802154_ATTR_PHY_NAME]) - 1] != '\0')
+ return -EINVAL; /* phy name should be null-terminated */
+
+ phy = wpan_phy_find(name);
+ if (!phy)
+ return -ENODEV;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ goto out_dev;
+
+ rc = ieee802154_nl_fill_phy(msg, info->snd_portid, info->snd_seq,
+ 0, phy);
+ if (rc < 0)
+ goto out_free;
+
+ wpan_phy_put(phy);
+
+ return genlmsg_reply(msg, info);
+out_free:
+ nlmsg_free(msg);
+out_dev:
+ wpan_phy_put(phy);
+ return rc;
+}
+
+struct dump_phy_data {
+ struct sk_buff *skb;
+ struct netlink_callback *cb;
+ int idx, s_idx;
+};
+
+static int ieee802154_dump_phy_iter(struct wpan_phy *phy, void *_data)
+{
+ int rc;
+ struct dump_phy_data *data = _data;
+
+ pr_debug("%s\n", __func__);
+
+ if (data->idx++ < data->s_idx)
+ return 0;
+
+ rc = ieee802154_nl_fill_phy(data->skb,
+ NETLINK_CB(data->cb->skb).portid,
+ data->cb->nlh->nlmsg_seq,
+ NLM_F_MULTI,
+ phy);
+
+ if (rc < 0) {
+ data->idx--;
+ return rc;
+ }
+
+ return 0;
+}
+
+int ieee802154_dump_phy(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct dump_phy_data data = {
+ .cb = cb,
+ .skb = skb,
+ .s_idx = cb->args[0],
+ .idx = 0,
+ };
+
+ pr_debug("%s\n", __func__);
+
+ wpan_phy_for_each(ieee802154_dump_phy_iter, &data);
+
+ cb->args[0] = data.idx;
+
+ return skb->len;
+}
+
+int ieee802154_add_iface(struct sk_buff *skb, struct genl_info *info)
+{
+ struct sk_buff *msg;
+ struct wpan_phy *phy;
+ const char *name;
+ const char *devname;
+ int rc = -ENOBUFS;
+ struct net_device *dev;
+ int type = __IEEE802154_DEV_INVALID;
+ unsigned char name_assign_type;
+
+ pr_debug("%s\n", __func__);
+
+ if (!info->attrs[IEEE802154_ATTR_PHY_NAME])
+ return -EINVAL;
+
+ name = nla_data(info->attrs[IEEE802154_ATTR_PHY_NAME]);
+ if (name[nla_len(info->attrs[IEEE802154_ATTR_PHY_NAME]) - 1] != '\0')
+ return -EINVAL; /* phy name should be null-terminated */
+
+ if (info->attrs[IEEE802154_ATTR_DEV_NAME]) {
+ devname = nla_data(info->attrs[IEEE802154_ATTR_DEV_NAME]);
+ if (devname[nla_len(info->attrs[IEEE802154_ATTR_DEV_NAME]) - 1]
+ != '\0')
+ return -EINVAL; /* phy name should be null-terminated */
+ name_assign_type = NET_NAME_USER;
+ } else {
+ devname = "wpan%d";
+ name_assign_type = NET_NAME_ENUM;
+ }
+
+ if (strlen(devname) >= IFNAMSIZ)
+ return -ENAMETOOLONG;
+
+ phy = wpan_phy_find(name);
+ if (!phy)
+ return -ENODEV;
+
+ msg = ieee802154_nl_new_reply(info, 0, IEEE802154_ADD_IFACE);
+ if (!msg)
+ goto out_dev;
+
+ if (info->attrs[IEEE802154_ATTR_HW_ADDR] &&
+ nla_len(info->attrs[IEEE802154_ATTR_HW_ADDR]) !=
+ IEEE802154_ADDR_LEN) {
+ rc = -EINVAL;
+ goto nla_put_failure;
+ }
+
+ if (info->attrs[IEEE802154_ATTR_DEV_TYPE]) {
+ type = nla_get_u8(info->attrs[IEEE802154_ATTR_DEV_TYPE]);
+ if (type >= __IEEE802154_DEV_MAX) {
+ rc = -EINVAL;
+ goto nla_put_failure;
+ }
+ }
+
+ dev = rdev_add_virtual_intf_deprecated(wpan_phy_to_rdev(phy), devname,
+ name_assign_type, type);
+ if (IS_ERR(dev)) {
+ rc = PTR_ERR(dev);
+ goto nla_put_failure;
+ }
+ dev_hold(dev);
+
+ if (info->attrs[IEEE802154_ATTR_HW_ADDR]) {
+ struct sockaddr_storage addr;
+
+ addr.ss_family = ARPHRD_IEEE802154;
+ nla_memcpy(&addr.__data, info->attrs[IEEE802154_ATTR_HW_ADDR],
+ IEEE802154_ADDR_LEN);
+
+ /* strangely enough, some callbacks (inetdev_event) from
+ * dev_set_mac_address require RTNL_LOCK
+ */
+ rtnl_lock();
+ rc = dev_set_mac_address(dev, &addr, NULL);
+ rtnl_unlock();
+ if (rc)
+ goto dev_unregister;
+ }
+
+ if (nla_put_string(msg, IEEE802154_ATTR_PHY_NAME, wpan_phy_name(phy)) ||
+ nla_put_string(msg, IEEE802154_ATTR_DEV_NAME, dev->name)) {
+ rc = -EMSGSIZE;
+ goto nla_put_failure;
+ }
+ dev_put(dev);
+
+ wpan_phy_put(phy);
+
+ return ieee802154_nl_reply(msg, info);
+
+dev_unregister:
+ rtnl_lock(); /* del_iface must be called with RTNL lock */
+ rdev_del_virtual_intf_deprecated(wpan_phy_to_rdev(phy), dev);
+ dev_put(dev);
+ rtnl_unlock();
+nla_put_failure:
+ nlmsg_free(msg);
+out_dev:
+ wpan_phy_put(phy);
+ return rc;
+}
+
+int ieee802154_del_iface(struct sk_buff *skb, struct genl_info *info)
+{
+ struct sk_buff *msg;
+ struct wpan_phy *phy;
+ const char *name;
+ int rc;
+ struct net_device *dev;
+
+ pr_debug("%s\n", __func__);
+
+ if (!info->attrs[IEEE802154_ATTR_DEV_NAME])
+ return -EINVAL;
+
+ name = nla_data(info->attrs[IEEE802154_ATTR_DEV_NAME]);
+ if (name[nla_len(info->attrs[IEEE802154_ATTR_DEV_NAME]) - 1] != '\0')
+ return -EINVAL; /* name should be null-terminated */
+
+ rc = -ENODEV;
+ dev = dev_get_by_name(genl_info_net(info), name);
+ if (!dev)
+ return rc;
+ if (dev->type != ARPHRD_IEEE802154)
+ goto out;
+
+ phy = dev->ieee802154_ptr->wpan_phy;
+ BUG_ON(!phy);
+ get_device(&phy->dev);
+
+ rc = -EINVAL;
+ /* phy name is optional, but should be checked if it's given */
+ if (info->attrs[IEEE802154_ATTR_PHY_NAME]) {
+ struct wpan_phy *phy2;
+
+ const char *pname =
+ nla_data(info->attrs[IEEE802154_ATTR_PHY_NAME]);
+ if (pname[nla_len(info->attrs[IEEE802154_ATTR_PHY_NAME]) - 1]
+ != '\0')
+ /* name should be null-terminated */
+ goto out_dev;
+
+ phy2 = wpan_phy_find(pname);
+ if (!phy2)
+ goto out_dev;
+
+ if (phy != phy2) {
+ wpan_phy_put(phy2);
+ goto out_dev;
+ }
+ }
+
+ rc = -ENOBUFS;
+
+ msg = ieee802154_nl_new_reply(info, 0, IEEE802154_DEL_IFACE);
+ if (!msg)
+ goto out_dev;
+
+ rtnl_lock();
+ rdev_del_virtual_intf_deprecated(wpan_phy_to_rdev(phy), dev);
+
+ /* We don't have device anymore */
+ dev_put(dev);
+ dev = NULL;
+
+ rtnl_unlock();
+
+ if (nla_put_string(msg, IEEE802154_ATTR_PHY_NAME, wpan_phy_name(phy)) ||
+ nla_put_string(msg, IEEE802154_ATTR_DEV_NAME, name))
+ goto nla_put_failure;
+ wpan_phy_put(phy);
+
+ return ieee802154_nl_reply(msg, info);
+
+nla_put_failure:
+ nlmsg_free(msg);
+out_dev:
+ wpan_phy_put(phy);
+out:
+ dev_put(dev);
+
+ return rc;
+}
diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c
new file mode 100644
index 000000000000..5a024ca60d35
--- /dev/null
+++ b/net/ieee802154/nl802154.c
@@ -0,0 +1,3108 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *
+ * Authors:
+ * Alexander Aring <aar@pengutronix.de>
+ *
+ * Based on: net/wireless/nl80211.c
+ */
+
+#include <linux/rtnetlink.h>
+
+#include <net/cfg802154.h>
+#include <net/genetlink.h>
+#include <net/mac802154.h>
+#include <net/netlink.h>
+#include <net/nl802154.h>
+#include <net/sock.h>
+
+#include "nl802154.h"
+#include "rdev-ops.h"
+#include "core.h"
+
+/* the netlink family */
+static struct genl_family nl802154_fam;
+
+/* multicast groups */
+enum nl802154_multicast_groups {
+ NL802154_MCGRP_CONFIG,
+ NL802154_MCGRP_SCAN,
+};
+
+static const struct genl_multicast_group nl802154_mcgrps[] = {
+ [NL802154_MCGRP_CONFIG] = { .name = "config", },
+ [NL802154_MCGRP_SCAN] = { .name = "scan", },
+};
+
+/* returns ERR_PTR values */
+static struct wpan_dev *
+__cfg802154_wpan_dev_from_attrs(struct net *netns, struct nlattr **attrs)
+{
+ struct cfg802154_registered_device *rdev;
+ struct wpan_dev *result = NULL;
+ bool have_ifidx = attrs[NL802154_ATTR_IFINDEX];
+ bool have_wpan_dev_id = attrs[NL802154_ATTR_WPAN_DEV];
+ u64 wpan_dev_id;
+ int wpan_phy_idx = -1;
+ int ifidx = -1;
+
+ ASSERT_RTNL();
+
+ if (!have_ifidx && !have_wpan_dev_id)
+ return ERR_PTR(-EINVAL);
+
+ if (have_ifidx)
+ ifidx = nla_get_u32(attrs[NL802154_ATTR_IFINDEX]);
+ if (have_wpan_dev_id) {
+ wpan_dev_id = nla_get_u64(attrs[NL802154_ATTR_WPAN_DEV]);
+ wpan_phy_idx = wpan_dev_id >> 32;
+ }
+
+ list_for_each_entry(rdev, &cfg802154_rdev_list, list) {
+ struct wpan_dev *wpan_dev;
+
+ if (wpan_phy_net(&rdev->wpan_phy) != netns)
+ continue;
+
+ if (have_wpan_dev_id && rdev->wpan_phy_idx != wpan_phy_idx)
+ continue;
+
+ list_for_each_entry(wpan_dev, &rdev->wpan_dev_list, list) {
+ if (have_ifidx && wpan_dev->netdev &&
+ wpan_dev->netdev->ifindex == ifidx) {
+ result = wpan_dev;
+ break;
+ }
+ if (have_wpan_dev_id &&
+ wpan_dev->identifier == (u32)wpan_dev_id) {
+ result = wpan_dev;
+ break;
+ }
+ }
+
+ if (result)
+ break;
+ }
+
+ if (result)
+ return result;
+
+ return ERR_PTR(-ENODEV);
+}
+
+static struct cfg802154_registered_device *
+__cfg802154_rdev_from_attrs(struct net *netns, struct nlattr **attrs)
+{
+ struct cfg802154_registered_device *rdev = NULL, *tmp;
+ struct net_device *netdev;
+
+ ASSERT_RTNL();
+
+ if (!attrs[NL802154_ATTR_WPAN_PHY] &&
+ !attrs[NL802154_ATTR_IFINDEX] &&
+ !attrs[NL802154_ATTR_WPAN_DEV])
+ return ERR_PTR(-EINVAL);
+
+ if (attrs[NL802154_ATTR_WPAN_PHY])
+ rdev = cfg802154_rdev_by_wpan_phy_idx(
+ nla_get_u32(attrs[NL802154_ATTR_WPAN_PHY]));
+
+ if (attrs[NL802154_ATTR_WPAN_DEV]) {
+ u64 wpan_dev_id = nla_get_u64(attrs[NL802154_ATTR_WPAN_DEV]);
+ struct wpan_dev *wpan_dev;
+ bool found = false;
+
+ tmp = cfg802154_rdev_by_wpan_phy_idx(wpan_dev_id >> 32);
+ if (tmp) {
+ /* make sure wpan_dev exists */
+ list_for_each_entry(wpan_dev, &tmp->wpan_dev_list, list) {
+ if (wpan_dev->identifier != (u32)wpan_dev_id)
+ continue;
+ found = true;
+ break;
+ }
+
+ if (!found)
+ tmp = NULL;
+
+ if (rdev && tmp != rdev)
+ return ERR_PTR(-EINVAL);
+ rdev = tmp;
+ }
+ }
+
+ if (attrs[NL802154_ATTR_IFINDEX]) {
+ int ifindex = nla_get_u32(attrs[NL802154_ATTR_IFINDEX]);
+
+ netdev = __dev_get_by_index(netns, ifindex);
+ if (netdev) {
+ if (netdev->ieee802154_ptr)
+ tmp = wpan_phy_to_rdev(
+ netdev->ieee802154_ptr->wpan_phy);
+ else
+ tmp = NULL;
+
+ /* not wireless device -- return error */
+ if (!tmp)
+ return ERR_PTR(-EINVAL);
+
+ /* mismatch -- return error */
+ if (rdev && tmp != rdev)
+ return ERR_PTR(-EINVAL);
+
+ rdev = tmp;
+ }
+ }
+
+ if (!rdev)
+ return ERR_PTR(-ENODEV);
+
+ if (netns != wpan_phy_net(&rdev->wpan_phy))
+ return ERR_PTR(-ENODEV);
+
+ return rdev;
+}
+
+/* This function returns a pointer to the driver
+ * that the genl_info item that is passed refers to.
+ *
+ * The result of this can be a PTR_ERR and hence must
+ * be checked with IS_ERR() for errors.
+ */
+static struct cfg802154_registered_device *
+cfg802154_get_dev_from_info(struct net *netns, struct genl_info *info)
+{
+ return __cfg802154_rdev_from_attrs(netns, info->attrs);
+}
+
+/* policy for the attributes */
+static const struct nla_policy nl802154_policy[NL802154_ATTR_MAX+1] = {
+ [NL802154_ATTR_WPAN_PHY] = { .type = NLA_U32 },
+ [NL802154_ATTR_WPAN_PHY_NAME] = { .type = NLA_NUL_STRING,
+ .len = 20-1 },
+
+ [NL802154_ATTR_IFINDEX] = { .type = NLA_U32 },
+ [NL802154_ATTR_IFTYPE] = { .type = NLA_U32 },
+ [NL802154_ATTR_IFNAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ-1 },
+
+ [NL802154_ATTR_WPAN_DEV] = { .type = NLA_U64 },
+
+ [NL802154_ATTR_PAGE] = NLA_POLICY_MAX(NLA_U8, IEEE802154_MAX_PAGE),
+ [NL802154_ATTR_CHANNEL] = NLA_POLICY_MAX(NLA_U8, IEEE802154_MAX_CHANNEL),
+
+ [NL802154_ATTR_TX_POWER] = { .type = NLA_S32, },
+
+ [NL802154_ATTR_CCA_MODE] = { .type = NLA_U32, },
+ [NL802154_ATTR_CCA_OPT] = { .type = NLA_U32, },
+ [NL802154_ATTR_CCA_ED_LEVEL] = { .type = NLA_S32, },
+
+ [NL802154_ATTR_SUPPORTED_CHANNEL] = { .type = NLA_U32, },
+
+ [NL802154_ATTR_PAN_ID] = { .type = NLA_U16, },
+ [NL802154_ATTR_EXTENDED_ADDR] = { .type = NLA_U64 },
+ [NL802154_ATTR_SHORT_ADDR] = { .type = NLA_U16, },
+
+ [NL802154_ATTR_MIN_BE] = { .type = NLA_U8, },
+ [NL802154_ATTR_MAX_BE] = { .type = NLA_U8, },
+ [NL802154_ATTR_MAX_CSMA_BACKOFFS] = { .type = NLA_U8, },
+
+ [NL802154_ATTR_MAX_FRAME_RETRIES] = { .type = NLA_S8, },
+
+ [NL802154_ATTR_LBT_MODE] = { .type = NLA_U8, },
+
+ [NL802154_ATTR_WPAN_PHY_CAPS] = { .type = NLA_NESTED },
+
+ [NL802154_ATTR_SUPPORTED_COMMANDS] = { .type = NLA_NESTED },
+
+ [NL802154_ATTR_ACKREQ_DEFAULT] = { .type = NLA_U8 },
+
+ [NL802154_ATTR_PID] = { .type = NLA_U32 },
+ [NL802154_ATTR_NETNS_FD] = { .type = NLA_U32 },
+
+ [NL802154_ATTR_COORDINATOR] = { .type = NLA_NESTED },
+
+ [NL802154_ATTR_SCAN_TYPE] =
+ NLA_POLICY_RANGE(NLA_U8, NL802154_SCAN_ED, NL802154_SCAN_RIT_PASSIVE),
+ [NL802154_ATTR_SCAN_CHANNELS] =
+ NLA_POLICY_MASK(NLA_U32, GENMASK(IEEE802154_MAX_CHANNEL, 0)),
+ [NL802154_ATTR_SCAN_PREAMBLE_CODES] = { .type = NLA_REJECT },
+ [NL802154_ATTR_SCAN_MEAN_PRF] = { .type = NLA_REJECT },
+ [NL802154_ATTR_SCAN_DURATION] =
+ NLA_POLICY_MAX(NLA_U8, IEEE802154_MAX_SCAN_DURATION),
+ [NL802154_ATTR_SCAN_DONE_REASON] =
+ NLA_POLICY_RANGE(NLA_U8, NL802154_SCAN_DONE_REASON_FINISHED,
+ NL802154_SCAN_DONE_REASON_ABORTED),
+ [NL802154_ATTR_BEACON_INTERVAL] =
+ NLA_POLICY_MAX(NLA_U8, IEEE802154_ACTIVE_SCAN_DURATION),
+ [NL802154_ATTR_MAX_ASSOCIATIONS] = { .type = NLA_U32 },
+ [NL802154_ATTR_PEER] = { .type = NLA_NESTED },
+
+#ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL
+ [NL802154_ATTR_SEC_ENABLED] = { .type = NLA_U8, },
+ [NL802154_ATTR_SEC_OUT_LEVEL] = { .type = NLA_U32, },
+ [NL802154_ATTR_SEC_OUT_KEY_ID] = { .type = NLA_NESTED, },
+ [NL802154_ATTR_SEC_FRAME_COUNTER] = { .type = NLA_U32 },
+
+ [NL802154_ATTR_SEC_LEVEL] = { .type = NLA_NESTED },
+ [NL802154_ATTR_SEC_DEVICE] = { .type = NLA_NESTED },
+ [NL802154_ATTR_SEC_DEVKEY] = { .type = NLA_NESTED },
+ [NL802154_ATTR_SEC_KEY] = { .type = NLA_NESTED },
+#endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */
+};
+
+static int
+nl802154_prepare_wpan_dev_dump(struct sk_buff *skb,
+ struct netlink_callback *cb,
+ struct cfg802154_registered_device **rdev,
+ struct wpan_dev **wpan_dev)
+{
+ const struct genl_dumpit_info *info = genl_dumpit_info(cb);
+ int err;
+
+ rtnl_lock();
+
+ if (!cb->args[0]) {
+ *wpan_dev = __cfg802154_wpan_dev_from_attrs(sock_net(skb->sk),
+ info->info.attrs);
+ if (IS_ERR(*wpan_dev)) {
+ err = PTR_ERR(*wpan_dev);
+ goto out_unlock;
+ }
+ *rdev = wpan_phy_to_rdev((*wpan_dev)->wpan_phy);
+ /* 0 is the first index - add 1 to parse only once */
+ cb->args[0] = (*rdev)->wpan_phy_idx + 1;
+ cb->args[1] = (*wpan_dev)->identifier;
+ } else {
+ /* subtract the 1 again here */
+ struct wpan_phy *wpan_phy = wpan_phy_idx_to_wpan_phy(cb->args[0] - 1);
+ struct wpan_dev *tmp;
+
+ if (!wpan_phy) {
+ err = -ENODEV;
+ goto out_unlock;
+ }
+ *rdev = wpan_phy_to_rdev(wpan_phy);
+ *wpan_dev = NULL;
+
+ list_for_each_entry(tmp, &(*rdev)->wpan_dev_list, list) {
+ if (tmp->identifier == cb->args[1]) {
+ *wpan_dev = tmp;
+ break;
+ }
+ }
+
+ if (!*wpan_dev) {
+ err = -ENODEV;
+ goto out_unlock;
+ }
+ }
+
+ return 0;
+ out_unlock:
+ rtnl_unlock();
+ return err;
+}
+
+static void
+nl802154_finish_wpan_dev_dump(struct cfg802154_registered_device *rdev)
+{
+ rtnl_unlock();
+}
+
+/* message building helper */
+static inline void *nl802154hdr_put(struct sk_buff *skb, u32 portid, u32 seq,
+ int flags, u8 cmd)
+{
+ /* since there is no private header just add the generic one */
+ return genlmsg_put(skb, portid, seq, &nl802154_fam, flags, cmd);
+}
+
+static int
+nl802154_put_flags(struct sk_buff *msg, int attr, u32 mask)
+{
+ struct nlattr *nl_flags = nla_nest_start_noflag(msg, attr);
+ int i;
+
+ if (!nl_flags)
+ return -ENOBUFS;
+
+ i = 0;
+ while (mask) {
+ if ((mask & 1) && nla_put_flag(msg, i))
+ return -ENOBUFS;
+
+ mask >>= 1;
+ i++;
+ }
+
+ nla_nest_end(msg, nl_flags);
+ return 0;
+}
+
+static int
+nl802154_send_wpan_phy_channels(struct cfg802154_registered_device *rdev,
+ struct sk_buff *msg)
+{
+ struct nlattr *nl_page;
+ unsigned long page;
+
+ nl_page = nla_nest_start_noflag(msg, NL802154_ATTR_CHANNELS_SUPPORTED);
+ if (!nl_page)
+ return -ENOBUFS;
+
+ for (page = 0; page <= IEEE802154_MAX_PAGE; page++) {
+ if (nla_put_u32(msg, NL802154_ATTR_SUPPORTED_CHANNEL,
+ rdev->wpan_phy.supported.channels[page]))
+ return -ENOBUFS;
+ }
+ nla_nest_end(msg, nl_page);
+
+ return 0;
+}
+
+static int
+nl802154_put_capabilities(struct sk_buff *msg,
+ struct cfg802154_registered_device *rdev)
+{
+ const struct wpan_phy_supported *caps = &rdev->wpan_phy.supported;
+ struct nlattr *nl_caps, *nl_channels;
+ int i;
+
+ nl_caps = nla_nest_start_noflag(msg, NL802154_ATTR_WPAN_PHY_CAPS);
+ if (!nl_caps)
+ return -ENOBUFS;
+
+ nl_channels = nla_nest_start_noflag(msg, NL802154_CAP_ATTR_CHANNELS);
+ if (!nl_channels)
+ return -ENOBUFS;
+
+ for (i = 0; i <= IEEE802154_MAX_PAGE; i++) {
+ if (caps->channels[i]) {
+ if (nl802154_put_flags(msg, i, caps->channels[i]))
+ return -ENOBUFS;
+ }
+ }
+
+ nla_nest_end(msg, nl_channels);
+
+ if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_CCA_ED_LEVEL) {
+ struct nlattr *nl_ed_lvls;
+
+ nl_ed_lvls = nla_nest_start_noflag(msg,
+ NL802154_CAP_ATTR_CCA_ED_LEVELS);
+ if (!nl_ed_lvls)
+ return -ENOBUFS;
+
+ for (i = 0; i < caps->cca_ed_levels_size; i++) {
+ if (nla_put_s32(msg, i, caps->cca_ed_levels[i]))
+ return -ENOBUFS;
+ }
+
+ nla_nest_end(msg, nl_ed_lvls);
+ }
+
+ if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_TXPOWER) {
+ struct nlattr *nl_tx_pwrs;
+
+ nl_tx_pwrs = nla_nest_start_noflag(msg,
+ NL802154_CAP_ATTR_TX_POWERS);
+ if (!nl_tx_pwrs)
+ return -ENOBUFS;
+
+ for (i = 0; i < caps->tx_powers_size; i++) {
+ if (nla_put_s32(msg, i, caps->tx_powers[i]))
+ return -ENOBUFS;
+ }
+
+ nla_nest_end(msg, nl_tx_pwrs);
+ }
+
+ if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_CCA_MODE) {
+ if (nl802154_put_flags(msg, NL802154_CAP_ATTR_CCA_MODES,
+ caps->cca_modes) ||
+ nl802154_put_flags(msg, NL802154_CAP_ATTR_CCA_OPTS,
+ caps->cca_opts))
+ return -ENOBUFS;
+ }
+
+ if (nla_put_u8(msg, NL802154_CAP_ATTR_MIN_MINBE, caps->min_minbe) ||
+ nla_put_u8(msg, NL802154_CAP_ATTR_MAX_MINBE, caps->max_minbe) ||
+ nla_put_u8(msg, NL802154_CAP_ATTR_MIN_MAXBE, caps->min_maxbe) ||
+ nla_put_u8(msg, NL802154_CAP_ATTR_MAX_MAXBE, caps->max_maxbe) ||
+ nla_put_u8(msg, NL802154_CAP_ATTR_MIN_CSMA_BACKOFFS,
+ caps->min_csma_backoffs) ||
+ nla_put_u8(msg, NL802154_CAP_ATTR_MAX_CSMA_BACKOFFS,
+ caps->max_csma_backoffs) ||
+ nla_put_s8(msg, NL802154_CAP_ATTR_MIN_FRAME_RETRIES,
+ caps->min_frame_retries) ||
+ nla_put_s8(msg, NL802154_CAP_ATTR_MAX_FRAME_RETRIES,
+ caps->max_frame_retries) ||
+ nl802154_put_flags(msg, NL802154_CAP_ATTR_IFTYPES,
+ caps->iftypes) ||
+ nla_put_u32(msg, NL802154_CAP_ATTR_LBT, caps->lbt))
+ return -ENOBUFS;
+
+ nla_nest_end(msg, nl_caps);
+
+ return 0;
+}
+
+static int nl802154_send_wpan_phy(struct cfg802154_registered_device *rdev,
+ enum nl802154_commands cmd,
+ struct sk_buff *msg, u32 portid, u32 seq,
+ int flags)
+{
+ struct nlattr *nl_cmds;
+ void *hdr;
+ int i;
+
+ hdr = nl802154hdr_put(msg, portid, seq, flags, cmd);
+ if (!hdr)
+ return -ENOBUFS;
+
+ if (nla_put_u32(msg, NL802154_ATTR_WPAN_PHY, rdev->wpan_phy_idx) ||
+ nla_put_string(msg, NL802154_ATTR_WPAN_PHY_NAME,
+ wpan_phy_name(&rdev->wpan_phy)) ||
+ nla_put_u32(msg, NL802154_ATTR_GENERATION,
+ cfg802154_rdev_list_generation))
+ goto nla_put_failure;
+
+ if (cmd != NL802154_CMD_NEW_WPAN_PHY)
+ goto finish;
+
+ /* DUMP PHY PIB */
+
+ /* current channel settings */
+ if (nla_put_u8(msg, NL802154_ATTR_PAGE,
+ rdev->wpan_phy.current_page) ||
+ nla_put_u8(msg, NL802154_ATTR_CHANNEL,
+ rdev->wpan_phy.current_channel))
+ goto nla_put_failure;
+
+ /* TODO remove this behaviour, we still keep support it for a while
+ * so users can change the behaviour to the new one.
+ */
+ if (nl802154_send_wpan_phy_channels(rdev, msg))
+ goto nla_put_failure;
+
+ /* cca mode */
+ if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_CCA_MODE) {
+ if (nla_put_u32(msg, NL802154_ATTR_CCA_MODE,
+ rdev->wpan_phy.cca.mode))
+ goto nla_put_failure;
+
+ if (rdev->wpan_phy.cca.mode == NL802154_CCA_ENERGY_CARRIER) {
+ if (nla_put_u32(msg, NL802154_ATTR_CCA_OPT,
+ rdev->wpan_phy.cca.opt))
+ goto nla_put_failure;
+ }
+ }
+
+ if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_TXPOWER) {
+ if (nla_put_s32(msg, NL802154_ATTR_TX_POWER,
+ rdev->wpan_phy.transmit_power))
+ goto nla_put_failure;
+ }
+
+ if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_CCA_ED_LEVEL) {
+ if (nla_put_s32(msg, NL802154_ATTR_CCA_ED_LEVEL,
+ rdev->wpan_phy.cca_ed_level))
+ goto nla_put_failure;
+ }
+
+ if (nl802154_put_capabilities(msg, rdev))
+ goto nla_put_failure;
+
+ nl_cmds = nla_nest_start_noflag(msg, NL802154_ATTR_SUPPORTED_COMMANDS);
+ if (!nl_cmds)
+ goto nla_put_failure;
+
+ i = 0;
+#define CMD(op, n) \
+ do { \
+ if (rdev->ops->op) { \
+ i++; \
+ if (nla_put_u32(msg, i, NL802154_CMD_ ## n)) \
+ goto nla_put_failure; \
+ } \
+ } while (0)
+
+ CMD(add_virtual_intf, NEW_INTERFACE);
+ CMD(del_virtual_intf, DEL_INTERFACE);
+ CMD(set_channel, SET_CHANNEL);
+ CMD(set_pan_id, SET_PAN_ID);
+ CMD(set_short_addr, SET_SHORT_ADDR);
+ CMD(set_backoff_exponent, SET_BACKOFF_EXPONENT);
+ CMD(set_max_csma_backoffs, SET_MAX_CSMA_BACKOFFS);
+ CMD(set_max_frame_retries, SET_MAX_FRAME_RETRIES);
+ CMD(set_lbt_mode, SET_LBT_MODE);
+ CMD(set_ackreq_default, SET_ACKREQ_DEFAULT);
+
+ if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_TXPOWER)
+ CMD(set_tx_power, SET_TX_POWER);
+
+ if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_CCA_ED_LEVEL)
+ CMD(set_cca_ed_level, SET_CCA_ED_LEVEL);
+
+ if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_CCA_MODE)
+ CMD(set_cca_mode, SET_CCA_MODE);
+
+#undef CMD
+ nla_nest_end(msg, nl_cmds);
+
+finish:
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+struct nl802154_dump_wpan_phy_state {
+ s64 filter_wpan_phy;
+ long start;
+
+};
+
+static int nl802154_dump_wpan_phy_parse(struct sk_buff *skb,
+ struct netlink_callback *cb,
+ struct nl802154_dump_wpan_phy_state *state)
+{
+ const struct genl_dumpit_info *info = genl_dumpit_info(cb);
+ struct nlattr **tb = info->info.attrs;
+
+ if (tb[NL802154_ATTR_WPAN_PHY])
+ state->filter_wpan_phy = nla_get_u32(tb[NL802154_ATTR_WPAN_PHY]);
+ if (tb[NL802154_ATTR_WPAN_DEV])
+ state->filter_wpan_phy = nla_get_u64(tb[NL802154_ATTR_WPAN_DEV]) >> 32;
+ if (tb[NL802154_ATTR_IFINDEX]) {
+ struct net_device *netdev;
+ struct cfg802154_registered_device *rdev;
+ int ifidx = nla_get_u32(tb[NL802154_ATTR_IFINDEX]);
+
+ netdev = __dev_get_by_index(&init_net, ifidx);
+ if (!netdev)
+ return -ENODEV;
+ if (netdev->ieee802154_ptr) {
+ rdev = wpan_phy_to_rdev(
+ netdev->ieee802154_ptr->wpan_phy);
+ state->filter_wpan_phy = rdev->wpan_phy_idx;
+ }
+ }
+
+ return 0;
+}
+
+static int
+nl802154_dump_wpan_phy(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ int idx = 0, ret;
+ struct nl802154_dump_wpan_phy_state *state = (void *)cb->args[0];
+ struct cfg802154_registered_device *rdev;
+
+ rtnl_lock();
+ if (!state) {
+ state = kzalloc(sizeof(*state), GFP_KERNEL);
+ if (!state) {
+ rtnl_unlock();
+ return -ENOMEM;
+ }
+ state->filter_wpan_phy = -1;
+ ret = nl802154_dump_wpan_phy_parse(skb, cb, state);
+ if (ret) {
+ kfree(state);
+ rtnl_unlock();
+ return ret;
+ }
+ cb->args[0] = (long)state;
+ }
+
+ list_for_each_entry(rdev, &cfg802154_rdev_list, list) {
+ if (!net_eq(wpan_phy_net(&rdev->wpan_phy), sock_net(skb->sk)))
+ continue;
+ if (++idx <= state->start)
+ continue;
+ if (state->filter_wpan_phy != -1 &&
+ state->filter_wpan_phy != rdev->wpan_phy_idx)
+ continue;
+ /* attempt to fit multiple wpan_phy data chunks into the skb */
+ ret = nl802154_send_wpan_phy(rdev,
+ NL802154_CMD_NEW_WPAN_PHY,
+ skb,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI);
+ if (ret < 0) {
+ if ((ret == -ENOBUFS || ret == -EMSGSIZE) &&
+ !skb->len && cb->min_dump_alloc < 4096) {
+ cb->min_dump_alloc = 4096;
+ rtnl_unlock();
+ return 1;
+ }
+ idx--;
+ break;
+ }
+ break;
+ }
+ rtnl_unlock();
+
+ state->start = idx;
+
+ return skb->len;
+}
+
+static int nl802154_dump_wpan_phy_done(struct netlink_callback *cb)
+{
+ kfree((void *)cb->args[0]);
+ return 0;
+}
+
+static int nl802154_get_wpan_phy(struct sk_buff *skb, struct genl_info *info)
+{
+ struct sk_buff *msg;
+ struct cfg802154_registered_device *rdev = info->user_ptr[0];
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ if (nl802154_send_wpan_phy(rdev, NL802154_CMD_NEW_WPAN_PHY, msg,
+ info->snd_portid, info->snd_seq, 0) < 0) {
+ nlmsg_free(msg);
+ return -ENOBUFS;
+ }
+
+ return genlmsg_reply(msg, info);
+}
+
+static inline u64 wpan_dev_id(struct wpan_dev *wpan_dev)
+{
+ return (u64)wpan_dev->identifier |
+ ((u64)wpan_phy_to_rdev(wpan_dev->wpan_phy)->wpan_phy_idx << 32);
+}
+
+#ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL
+#include <net/ieee802154_netdev.h>
+
+static int
+ieee802154_llsec_send_key_id(struct sk_buff *msg,
+ const struct ieee802154_llsec_key_id *desc)
+{
+ struct nlattr *nl_dev_addr;
+
+ if (nla_put_u32(msg, NL802154_KEY_ID_ATTR_MODE, desc->mode))
+ return -ENOBUFS;
+
+ switch (desc->mode) {
+ case NL802154_KEY_ID_MODE_IMPLICIT:
+ nl_dev_addr = nla_nest_start_noflag(msg,
+ NL802154_KEY_ID_ATTR_IMPLICIT);
+ if (!nl_dev_addr)
+ return -ENOBUFS;
+
+ if (nla_put_le16(msg, NL802154_DEV_ADDR_ATTR_PAN_ID,
+ desc->device_addr.pan_id) ||
+ nla_put_u32(msg, NL802154_DEV_ADDR_ATTR_MODE,
+ desc->device_addr.mode))
+ return -ENOBUFS;
+
+ switch (desc->device_addr.mode) {
+ case NL802154_DEV_ADDR_SHORT:
+ if (nla_put_le16(msg, NL802154_DEV_ADDR_ATTR_SHORT,
+ desc->device_addr.short_addr))
+ return -ENOBUFS;
+ break;
+ case NL802154_DEV_ADDR_EXTENDED:
+ if (nla_put_le64(msg, NL802154_DEV_ADDR_ATTR_EXTENDED,
+ desc->device_addr.extended_addr,
+ NL802154_DEV_ADDR_ATTR_PAD))
+ return -ENOBUFS;
+ break;
+ default:
+ /* userspace should handle unknown */
+ break;
+ }
+
+ nla_nest_end(msg, nl_dev_addr);
+ break;
+ case NL802154_KEY_ID_MODE_INDEX:
+ break;
+ case NL802154_KEY_ID_MODE_INDEX_SHORT:
+ /* TODO renmae short_source? */
+ if (nla_put_le32(msg, NL802154_KEY_ID_ATTR_SOURCE_SHORT,
+ desc->short_source))
+ return -ENOBUFS;
+ break;
+ case NL802154_KEY_ID_MODE_INDEX_EXTENDED:
+ if (nla_put_le64(msg, NL802154_KEY_ID_ATTR_SOURCE_EXTENDED,
+ desc->extended_source,
+ NL802154_KEY_ID_ATTR_PAD))
+ return -ENOBUFS;
+ break;
+ default:
+ /* userspace should handle unknown */
+ break;
+ }
+
+ /* TODO key_id to key_idx ? Check naming */
+ if (desc->mode != NL802154_KEY_ID_MODE_IMPLICIT) {
+ if (nla_put_u8(msg, NL802154_KEY_ID_ATTR_INDEX, desc->id))
+ return -ENOBUFS;
+ }
+
+ return 0;
+}
+
+static int nl802154_get_llsec_params(struct sk_buff *msg,
+ struct cfg802154_registered_device *rdev,
+ struct wpan_dev *wpan_dev)
+{
+ struct nlattr *nl_key_id;
+ struct ieee802154_llsec_params params;
+ int ret;
+
+ ret = rdev_get_llsec_params(rdev, wpan_dev, &params);
+ if (ret < 0)
+ return ret;
+
+ if (nla_put_u8(msg, NL802154_ATTR_SEC_ENABLED, params.enabled) ||
+ nla_put_u32(msg, NL802154_ATTR_SEC_OUT_LEVEL, params.out_level) ||
+ nla_put_be32(msg, NL802154_ATTR_SEC_FRAME_COUNTER,
+ params.frame_counter))
+ return -ENOBUFS;
+
+ nl_key_id = nla_nest_start_noflag(msg, NL802154_ATTR_SEC_OUT_KEY_ID);
+ if (!nl_key_id)
+ return -ENOBUFS;
+
+ ret = ieee802154_llsec_send_key_id(msg, &params.out_key);
+ if (ret < 0)
+ return ret;
+
+ nla_nest_end(msg, nl_key_id);
+
+ return 0;
+}
+#endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */
+
+static int
+nl802154_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flags,
+ struct cfg802154_registered_device *rdev,
+ struct wpan_dev *wpan_dev)
+{
+ struct net_device *dev = wpan_dev->netdev;
+ void *hdr;
+
+ hdr = nl802154hdr_put(msg, portid, seq, flags,
+ NL802154_CMD_NEW_INTERFACE);
+ if (!hdr)
+ return -1;
+
+ if (dev &&
+ (nla_put_u32(msg, NL802154_ATTR_IFINDEX, dev->ifindex) ||
+ nla_put_string(msg, NL802154_ATTR_IFNAME, dev->name)))
+ goto nla_put_failure;
+
+ if (nla_put_u32(msg, NL802154_ATTR_WPAN_PHY, rdev->wpan_phy_idx) ||
+ nla_put_u32(msg, NL802154_ATTR_IFTYPE, wpan_dev->iftype) ||
+ nla_put_u64_64bit(msg, NL802154_ATTR_WPAN_DEV,
+ wpan_dev_id(wpan_dev), NL802154_ATTR_PAD) ||
+ nla_put_u32(msg, NL802154_ATTR_GENERATION,
+ rdev->devlist_generation ^
+ (cfg802154_rdev_list_generation << 2)))
+ goto nla_put_failure;
+
+ /* address settings */
+ if (nla_put_le64(msg, NL802154_ATTR_EXTENDED_ADDR,
+ wpan_dev->extended_addr,
+ NL802154_ATTR_PAD) ||
+ nla_put_le16(msg, NL802154_ATTR_SHORT_ADDR,
+ wpan_dev->short_addr) ||
+ nla_put_le16(msg, NL802154_ATTR_PAN_ID, wpan_dev->pan_id))
+ goto nla_put_failure;
+
+ /* ARET handling */
+ if (nla_put_s8(msg, NL802154_ATTR_MAX_FRAME_RETRIES,
+ wpan_dev->frame_retries) ||
+ nla_put_u8(msg, NL802154_ATTR_MAX_BE, wpan_dev->max_be) ||
+ nla_put_u8(msg, NL802154_ATTR_MAX_CSMA_BACKOFFS,
+ wpan_dev->csma_retries) ||
+ nla_put_u8(msg, NL802154_ATTR_MIN_BE, wpan_dev->min_be))
+ goto nla_put_failure;
+
+ /* listen before transmit */
+ if (nla_put_u8(msg, NL802154_ATTR_LBT_MODE, wpan_dev->lbt))
+ goto nla_put_failure;
+
+ /* ackreq default behaviour */
+ if (nla_put_u8(msg, NL802154_ATTR_ACKREQ_DEFAULT, wpan_dev->ackreq))
+ goto nla_put_failure;
+
+#ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL
+ if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR)
+ goto out;
+
+ if (nl802154_get_llsec_params(msg, rdev, wpan_dev) < 0)
+ goto nla_put_failure;
+
+out:
+#endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static int
+nl802154_dump_interface(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ int wp_idx = 0;
+ int if_idx = 0;
+ int wp_start = cb->args[0];
+ int if_start = cb->args[1];
+ struct cfg802154_registered_device *rdev;
+ struct wpan_dev *wpan_dev;
+
+ rtnl_lock();
+ list_for_each_entry(rdev, &cfg802154_rdev_list, list) {
+ if (!net_eq(wpan_phy_net(&rdev->wpan_phy), sock_net(skb->sk)))
+ continue;
+ if (wp_idx < wp_start) {
+ wp_idx++;
+ continue;
+ }
+ if_idx = 0;
+
+ list_for_each_entry(wpan_dev, &rdev->wpan_dev_list, list) {
+ if (if_idx < if_start) {
+ if_idx++;
+ continue;
+ }
+ if (nl802154_send_iface(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ rdev, wpan_dev) < 0) {
+ goto out;
+ }
+ if_idx++;
+ }
+
+ wp_idx++;
+ }
+out:
+ rtnl_unlock();
+
+ cb->args[0] = wp_idx;
+ cb->args[1] = if_idx;
+
+ return skb->len;
+}
+
+static int nl802154_get_interface(struct sk_buff *skb, struct genl_info *info)
+{
+ struct sk_buff *msg;
+ struct cfg802154_registered_device *rdev = info->user_ptr[0];
+ struct wpan_dev *wdev = info->user_ptr[1];
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ if (nl802154_send_iface(msg, info->snd_portid, info->snd_seq, 0,
+ rdev, wdev) < 0) {
+ nlmsg_free(msg);
+ return -ENOBUFS;
+ }
+
+ return genlmsg_reply(msg, info);
+}
+
+static int nl802154_new_interface(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg802154_registered_device *rdev = info->user_ptr[0];
+ enum nl802154_iftype type = NL802154_IFTYPE_UNSPEC;
+ __le64 extended_addr = cpu_to_le64(0x0000000000000000ULL);
+
+ /* TODO avoid failing a new interface
+ * creation due to pending removal?
+ */
+
+ if (!info->attrs[NL802154_ATTR_IFNAME])
+ return -EINVAL;
+
+ if (info->attrs[NL802154_ATTR_IFTYPE]) {
+ type = nla_get_u32(info->attrs[NL802154_ATTR_IFTYPE]);
+ if (type > NL802154_IFTYPE_MAX ||
+ !(rdev->wpan_phy.supported.iftypes & BIT(type)))
+ return -EINVAL;
+ }
+
+ if (info->attrs[NL802154_ATTR_EXTENDED_ADDR])
+ extended_addr = nla_get_le64(info->attrs[NL802154_ATTR_EXTENDED_ADDR]);
+
+ if (!rdev->ops->add_virtual_intf)
+ return -EOPNOTSUPP;
+
+ return rdev_add_virtual_intf(rdev,
+ nla_data(info->attrs[NL802154_ATTR_IFNAME]),
+ NET_NAME_USER, type, extended_addr);
+}
+
+static int nl802154_del_interface(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg802154_registered_device *rdev = info->user_ptr[0];
+ struct wpan_dev *wpan_dev = info->user_ptr[1];
+
+ if (!rdev->ops->del_virtual_intf)
+ return -EOPNOTSUPP;
+
+ /* If we remove a wpan device without a netdev then clear
+ * user_ptr[1] so that nl802154_post_doit won't dereference it
+ * to check if it needs to do dev_put(). Otherwise it crashes
+ * since the wpan_dev has been freed, unlike with a netdev where
+ * we need the dev_put() for the netdev to really be freed.
+ */
+ if (!wpan_dev->netdev)
+ info->user_ptr[1] = NULL;
+
+ return rdev_del_virtual_intf(rdev, wpan_dev);
+}
+
+static int nl802154_set_channel(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg802154_registered_device *rdev = info->user_ptr[0];
+ u8 channel, page;
+
+ if (!info->attrs[NL802154_ATTR_PAGE] ||
+ !info->attrs[NL802154_ATTR_CHANNEL])
+ return -EINVAL;
+
+ page = nla_get_u8(info->attrs[NL802154_ATTR_PAGE]);
+ channel = nla_get_u8(info->attrs[NL802154_ATTR_CHANNEL]);
+
+ /* check 802.15.4 constraints */
+ if (!ieee802154_chan_is_valid(&rdev->wpan_phy, page, channel))
+ return -EINVAL;
+
+ return rdev_set_channel(rdev, page, channel);
+}
+
+static int nl802154_set_cca_mode(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg802154_registered_device *rdev = info->user_ptr[0];
+ struct wpan_phy_cca cca;
+
+ if (!(rdev->wpan_phy.flags & WPAN_PHY_FLAG_CCA_MODE))
+ return -EOPNOTSUPP;
+
+ if (!info->attrs[NL802154_ATTR_CCA_MODE])
+ return -EINVAL;
+
+ cca.mode = nla_get_u32(info->attrs[NL802154_ATTR_CCA_MODE]);
+ /* checking 802.15.4 constraints */
+ if (cca.mode < NL802154_CCA_ENERGY ||
+ cca.mode > NL802154_CCA_ATTR_MAX ||
+ !(rdev->wpan_phy.supported.cca_modes & BIT(cca.mode)))
+ return -EINVAL;
+
+ if (cca.mode == NL802154_CCA_ENERGY_CARRIER) {
+ if (!info->attrs[NL802154_ATTR_CCA_OPT])
+ return -EINVAL;
+
+ cca.opt = nla_get_u32(info->attrs[NL802154_ATTR_CCA_OPT]);
+ if (cca.opt > NL802154_CCA_OPT_ATTR_MAX ||
+ !(rdev->wpan_phy.supported.cca_opts & BIT(cca.opt)))
+ return -EINVAL;
+ }
+
+ return rdev_set_cca_mode(rdev, &cca);
+}
+
+static int nl802154_set_cca_ed_level(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg802154_registered_device *rdev = info->user_ptr[0];
+ s32 ed_level;
+ int i;
+
+ if (!(rdev->wpan_phy.flags & WPAN_PHY_FLAG_CCA_ED_LEVEL))
+ return -EOPNOTSUPP;
+
+ if (!info->attrs[NL802154_ATTR_CCA_ED_LEVEL])
+ return -EINVAL;
+
+ ed_level = nla_get_s32(info->attrs[NL802154_ATTR_CCA_ED_LEVEL]);
+
+ for (i = 0; i < rdev->wpan_phy.supported.cca_ed_levels_size; i++) {
+ if (ed_level == rdev->wpan_phy.supported.cca_ed_levels[i])
+ return rdev_set_cca_ed_level(rdev, ed_level);
+ }
+
+ return -EINVAL;
+}
+
+static int nl802154_set_tx_power(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg802154_registered_device *rdev = info->user_ptr[0];
+ s32 power;
+ int i;
+
+ if (!(rdev->wpan_phy.flags & WPAN_PHY_FLAG_TXPOWER))
+ return -EOPNOTSUPP;
+
+ if (!info->attrs[NL802154_ATTR_TX_POWER])
+ return -EINVAL;
+
+ power = nla_get_s32(info->attrs[NL802154_ATTR_TX_POWER]);
+
+ for (i = 0; i < rdev->wpan_phy.supported.tx_powers_size; i++) {
+ if (power == rdev->wpan_phy.supported.tx_powers[i])
+ return rdev_set_tx_power(rdev, power);
+ }
+
+ return -EINVAL;
+}
+
+static int nl802154_set_pan_id(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg802154_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wpan_dev *wpan_dev = dev->ieee802154_ptr;
+ __le16 pan_id;
+
+ /* conflict here while tx/rx calls */
+ if (netif_running(dev))
+ return -EBUSY;
+
+ if (wpan_dev->lowpan_dev) {
+ if (netif_running(wpan_dev->lowpan_dev))
+ return -EBUSY;
+ }
+
+ /* don't change address fields on monitor */
+ if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR ||
+ !info->attrs[NL802154_ATTR_PAN_ID])
+ return -EINVAL;
+
+ pan_id = nla_get_le16(info->attrs[NL802154_ATTR_PAN_ID]);
+
+ /* Only allow changing the PAN ID when the device has no more
+ * associations ongoing to avoid confusing peers.
+ */
+ if (cfg802154_device_is_associated(wpan_dev)) {
+ NL_SET_ERR_MSG(info->extack,
+ "Existing associations, changing PAN ID forbidden");
+ return -EINVAL;
+ }
+
+ return rdev_set_pan_id(rdev, wpan_dev, pan_id);
+}
+
+static int nl802154_set_short_addr(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg802154_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wpan_dev *wpan_dev = dev->ieee802154_ptr;
+ __le16 short_addr;
+
+ /* conflict here while tx/rx calls */
+ if (netif_running(dev))
+ return -EBUSY;
+
+ if (wpan_dev->lowpan_dev) {
+ if (netif_running(wpan_dev->lowpan_dev))
+ return -EBUSY;
+ }
+
+ /* don't change address fields on monitor */
+ if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR ||
+ !info->attrs[NL802154_ATTR_SHORT_ADDR])
+ return -EINVAL;
+
+ short_addr = nla_get_le16(info->attrs[NL802154_ATTR_SHORT_ADDR]);
+
+ /* The short address only has a meaning when part of a PAN, after a
+ * proper association procedure. However, we want to still offer the
+ * possibility to create static networks so changing the short address
+ * is only allowed when not already associated to other devices with
+ * the official handshake.
+ */
+ if (cfg802154_device_is_associated(wpan_dev)) {
+ NL_SET_ERR_MSG(info->extack,
+ "Existing associations, changing short address forbidden");
+ return -EINVAL;
+ }
+
+ return rdev_set_short_addr(rdev, wpan_dev, short_addr);
+}
+
+static int
+nl802154_set_backoff_exponent(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg802154_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wpan_dev *wpan_dev = dev->ieee802154_ptr;
+ u8 min_be, max_be;
+
+ /* should be set on netif open inside phy settings */
+ if (netif_running(dev))
+ return -EBUSY;
+
+ if (!info->attrs[NL802154_ATTR_MIN_BE] ||
+ !info->attrs[NL802154_ATTR_MAX_BE])
+ return -EINVAL;
+
+ min_be = nla_get_u8(info->attrs[NL802154_ATTR_MIN_BE]);
+ max_be = nla_get_u8(info->attrs[NL802154_ATTR_MAX_BE]);
+
+ /* check 802.15.4 constraints */
+ if (min_be < rdev->wpan_phy.supported.min_minbe ||
+ min_be > rdev->wpan_phy.supported.max_minbe ||
+ max_be < rdev->wpan_phy.supported.min_maxbe ||
+ max_be > rdev->wpan_phy.supported.max_maxbe ||
+ min_be > max_be)
+ return -EINVAL;
+
+ return rdev_set_backoff_exponent(rdev, wpan_dev, min_be, max_be);
+}
+
+static int
+nl802154_set_max_csma_backoffs(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg802154_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wpan_dev *wpan_dev = dev->ieee802154_ptr;
+ u8 max_csma_backoffs;
+
+ /* conflict here while other running iface settings */
+ if (netif_running(dev))
+ return -EBUSY;
+
+ if (!info->attrs[NL802154_ATTR_MAX_CSMA_BACKOFFS])
+ return -EINVAL;
+
+ max_csma_backoffs = nla_get_u8(
+ info->attrs[NL802154_ATTR_MAX_CSMA_BACKOFFS]);
+
+ /* check 802.15.4 constraints */
+ if (max_csma_backoffs < rdev->wpan_phy.supported.min_csma_backoffs ||
+ max_csma_backoffs > rdev->wpan_phy.supported.max_csma_backoffs)
+ return -EINVAL;
+
+ return rdev_set_max_csma_backoffs(rdev, wpan_dev, max_csma_backoffs);
+}
+
+static int
+nl802154_set_max_frame_retries(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg802154_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wpan_dev *wpan_dev = dev->ieee802154_ptr;
+ s8 max_frame_retries;
+
+ if (netif_running(dev))
+ return -EBUSY;
+
+ if (!info->attrs[NL802154_ATTR_MAX_FRAME_RETRIES])
+ return -EINVAL;
+
+ max_frame_retries = nla_get_s8(
+ info->attrs[NL802154_ATTR_MAX_FRAME_RETRIES]);
+
+ /* check 802.15.4 constraints */
+ if (max_frame_retries < rdev->wpan_phy.supported.min_frame_retries ||
+ max_frame_retries > rdev->wpan_phy.supported.max_frame_retries)
+ return -EINVAL;
+
+ return rdev_set_max_frame_retries(rdev, wpan_dev, max_frame_retries);
+}
+
+static int nl802154_set_lbt_mode(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg802154_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wpan_dev *wpan_dev = dev->ieee802154_ptr;
+ int mode;
+
+ if (netif_running(dev))
+ return -EBUSY;
+
+ if (!info->attrs[NL802154_ATTR_LBT_MODE])
+ return -EINVAL;
+
+ mode = nla_get_u8(info->attrs[NL802154_ATTR_LBT_MODE]);
+
+ if (mode != 0 && mode != 1)
+ return -EINVAL;
+
+ if (!wpan_phy_supported_bool(mode, rdev->wpan_phy.supported.lbt))
+ return -EINVAL;
+
+ return rdev_set_lbt_mode(rdev, wpan_dev, mode);
+}
+
+static int
+nl802154_set_ackreq_default(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg802154_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wpan_dev *wpan_dev = dev->ieee802154_ptr;
+ int ackreq;
+
+ if (netif_running(dev))
+ return -EBUSY;
+
+ if (!info->attrs[NL802154_ATTR_ACKREQ_DEFAULT])
+ return -EINVAL;
+
+ ackreq = nla_get_u8(info->attrs[NL802154_ATTR_ACKREQ_DEFAULT]);
+
+ if (ackreq != 0 && ackreq != 1)
+ return -EINVAL;
+
+ return rdev_set_ackreq_default(rdev, wpan_dev, ackreq);
+}
+
+static int nl802154_wpan_phy_netns(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg802154_registered_device *rdev = info->user_ptr[0];
+ struct net *net;
+ int err;
+
+ if (info->attrs[NL802154_ATTR_PID]) {
+ u32 pid = nla_get_u32(info->attrs[NL802154_ATTR_PID]);
+
+ net = get_net_ns_by_pid(pid);
+ } else if (info->attrs[NL802154_ATTR_NETNS_FD]) {
+ u32 fd = nla_get_u32(info->attrs[NL802154_ATTR_NETNS_FD]);
+
+ net = get_net_ns_by_fd(fd);
+ } else {
+ return -EINVAL;
+ }
+
+ if (IS_ERR(net))
+ return PTR_ERR(net);
+
+ err = 0;
+
+ /* check if anything to do */
+ if (!net_eq(wpan_phy_net(&rdev->wpan_phy), net))
+ err = cfg802154_switch_netns(rdev, net);
+
+ put_net(net);
+ return err;
+}
+
+static int nl802154_prep_scan_event_msg(struct sk_buff *msg,
+ struct cfg802154_registered_device *rdev,
+ struct wpan_dev *wpan_dev,
+ u32 portid, u32 seq, int flags, u8 cmd,
+ struct ieee802154_coord_desc *desc)
+{
+ struct nlattr *nla;
+ void *hdr;
+
+ hdr = nl802154hdr_put(msg, portid, seq, flags, cmd);
+ if (!hdr)
+ return -ENOBUFS;
+
+ if (nla_put_u32(msg, NL802154_ATTR_WPAN_PHY, rdev->wpan_phy_idx))
+ goto nla_put_failure;
+
+ if (wpan_dev->netdev &&
+ nla_put_u32(msg, NL802154_ATTR_IFINDEX, wpan_dev->netdev->ifindex))
+ goto nla_put_failure;
+
+ if (nla_put_u64_64bit(msg, NL802154_ATTR_WPAN_DEV,
+ wpan_dev_id(wpan_dev), NL802154_ATTR_PAD))
+ goto nla_put_failure;
+
+ nla = nla_nest_start_noflag(msg, NL802154_ATTR_COORDINATOR);
+ if (!nla)
+ goto nla_put_failure;
+
+ if (nla_put(msg, NL802154_COORD_PANID, IEEE802154_PAN_ID_LEN,
+ &desc->addr.pan_id))
+ goto nla_put_failure;
+
+ if (desc->addr.mode == IEEE802154_ADDR_SHORT) {
+ if (nla_put(msg, NL802154_COORD_ADDR,
+ IEEE802154_SHORT_ADDR_LEN,
+ &desc->addr.short_addr))
+ goto nla_put_failure;
+ } else {
+ if (nla_put(msg, NL802154_COORD_ADDR,
+ IEEE802154_EXTENDED_ADDR_LEN,
+ &desc->addr.extended_addr))
+ goto nla_put_failure;
+ }
+
+ if (nla_put_u8(msg, NL802154_COORD_CHANNEL, desc->channel))
+ goto nla_put_failure;
+
+ if (nla_put_u8(msg, NL802154_COORD_PAGE, desc->page))
+ goto nla_put_failure;
+
+ if (nla_put_u16(msg, NL802154_COORD_SUPERFRAME_SPEC,
+ desc->superframe_spec))
+ goto nla_put_failure;
+
+ if (nla_put_u8(msg, NL802154_COORD_LINK_QUALITY, desc->link_quality))
+ goto nla_put_failure;
+
+ if (desc->gts_permit && nla_put_flag(msg, NL802154_COORD_GTS_PERMIT))
+ goto nla_put_failure;
+
+ /* TODO: NL802154_COORD_PAYLOAD_DATA if any */
+
+ nla_nest_end(msg, nla);
+
+ genlmsg_end(msg, hdr);
+
+ return 0;
+
+ nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+
+ return -EMSGSIZE;
+}
+
+int nl802154_scan_event(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
+ struct ieee802154_coord_desc *desc)
+{
+ struct cfg802154_registered_device *rdev = wpan_phy_to_rdev(wpan_phy);
+ struct sk_buff *msg;
+ int ret;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
+ if (!msg)
+ return -ENOMEM;
+
+ ret = nl802154_prep_scan_event_msg(msg, rdev, wpan_dev, 0, 0, 0,
+ NL802154_CMD_SCAN_EVENT,
+ desc);
+ if (ret < 0) {
+ nlmsg_free(msg);
+ return ret;
+ }
+
+ return genlmsg_multicast_netns(&nl802154_fam, wpan_phy_net(wpan_phy),
+ msg, 0, NL802154_MCGRP_SCAN, GFP_ATOMIC);
+}
+EXPORT_SYMBOL_GPL(nl802154_scan_event);
+
+static int nl802154_trigger_scan(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg802154_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wpan_dev *wpan_dev = dev->ieee802154_ptr;
+ struct wpan_phy *wpan_phy = &rdev->wpan_phy;
+ struct cfg802154_scan_request *request;
+ u8 type;
+ int err;
+
+ if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR) {
+ NL_SET_ERR_MSG(info->extack, "Monitors are not allowed to perform scans");
+ return -EOPNOTSUPP;
+ }
+
+ if (!info->attrs[NL802154_ATTR_SCAN_TYPE]) {
+ NL_SET_ERR_MSG(info->extack, "Malformed request, missing scan type");
+ return -EINVAL;
+ }
+
+ if (wpan_phy->flags & WPAN_PHY_FLAG_DATAGRAMS_ONLY) {
+ NL_SET_ERR_MSG(info->extack, "PHY only supports datagrams");
+ return -EOPNOTSUPP;
+ }
+
+ request = kzalloc(sizeof(*request), GFP_KERNEL);
+ if (!request)
+ return -ENOMEM;
+
+ request->wpan_dev = wpan_dev;
+ request->wpan_phy = wpan_phy;
+
+ type = nla_get_u8(info->attrs[NL802154_ATTR_SCAN_TYPE]);
+ switch (type) {
+ case NL802154_SCAN_ACTIVE:
+ case NL802154_SCAN_PASSIVE:
+ request->type = type;
+ break;
+ default:
+ NL_SET_ERR_MSG_FMT(info->extack, "Unsupported scan type: %d", type);
+ err = -EINVAL;
+ goto free_request;
+ }
+
+ /* Use current page by default */
+ request->page = nla_get_u8_default(info->attrs[NL802154_ATTR_PAGE],
+ wpan_phy->current_page);
+
+ /* Scan all supported channels by default */
+ request->channels =
+ nla_get_u32_default(info->attrs[NL802154_ATTR_SCAN_CHANNELS],
+ wpan_phy->supported.channels[request->page]);
+
+ /* Use maximum duration order by default */
+ request->duration =
+ nla_get_u8_default(info->attrs[NL802154_ATTR_SCAN_DURATION],
+ IEEE802154_MAX_SCAN_DURATION);
+
+ err = rdev_trigger_scan(rdev, request);
+ if (err) {
+ pr_err("Failure starting scanning (%d)\n", err);
+ goto free_request;
+ }
+
+ return 0;
+
+free_request:
+ kfree(request);
+
+ return err;
+}
+
+static int nl802154_prep_scan_msg(struct sk_buff *msg,
+ struct cfg802154_registered_device *rdev,
+ struct wpan_dev *wpan_dev, u32 portid,
+ u32 seq, int flags, u8 cmd, u8 arg)
+{
+ void *hdr;
+
+ hdr = nl802154hdr_put(msg, portid, seq, flags, cmd);
+ if (!hdr)
+ return -ENOBUFS;
+
+ if (nla_put_u32(msg, NL802154_ATTR_WPAN_PHY, rdev->wpan_phy_idx))
+ goto nla_put_failure;
+
+ if (wpan_dev->netdev &&
+ nla_put_u32(msg, NL802154_ATTR_IFINDEX, wpan_dev->netdev->ifindex))
+ goto nla_put_failure;
+
+ if (nla_put_u64_64bit(msg, NL802154_ATTR_WPAN_DEV,
+ wpan_dev_id(wpan_dev), NL802154_ATTR_PAD))
+ goto nla_put_failure;
+
+ if (cmd == NL802154_CMD_SCAN_DONE &&
+ nla_put_u8(msg, NL802154_ATTR_SCAN_DONE_REASON, arg))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+
+ return -EMSGSIZE;
+}
+
+static int nl802154_send_scan_msg(struct cfg802154_registered_device *rdev,
+ struct wpan_dev *wpan_dev, u8 cmd, u8 arg)
+{
+ struct sk_buff *msg;
+ int ret;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ ret = nl802154_prep_scan_msg(msg, rdev, wpan_dev, 0, 0, 0, cmd, arg);
+ if (ret < 0) {
+ nlmsg_free(msg);
+ return ret;
+ }
+
+ return genlmsg_multicast_netns(&nl802154_fam,
+ wpan_phy_net(&rdev->wpan_phy), msg, 0,
+ NL802154_MCGRP_SCAN, GFP_KERNEL);
+}
+
+int nl802154_scan_started(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev)
+{
+ struct cfg802154_registered_device *rdev = wpan_phy_to_rdev(wpan_phy);
+ int err;
+
+ /* Ignore errors when there are no listeners */
+ err = nl802154_send_scan_msg(rdev, wpan_dev, NL802154_CMD_TRIGGER_SCAN, 0);
+ if (err == -ESRCH)
+ err = 0;
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(nl802154_scan_started);
+
+int nl802154_scan_done(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
+ enum nl802154_scan_done_reasons reason)
+{
+ struct cfg802154_registered_device *rdev = wpan_phy_to_rdev(wpan_phy);
+ int err;
+
+ /* Ignore errors when there are no listeners */
+ err = nl802154_send_scan_msg(rdev, wpan_dev, NL802154_CMD_SCAN_DONE, reason);
+ if (err == -ESRCH)
+ err = 0;
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(nl802154_scan_done);
+
+static int nl802154_abort_scan(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg802154_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wpan_dev *wpan_dev = dev->ieee802154_ptr;
+
+ /* Resources are released in the notification helper above */
+ return rdev_abort_scan(rdev, wpan_dev);
+}
+
+static int
+nl802154_send_beacons(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg802154_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wpan_dev *wpan_dev = dev->ieee802154_ptr;
+ struct wpan_phy *wpan_phy = &rdev->wpan_phy;
+ struct cfg802154_beacon_request *request;
+ int err;
+
+ if (wpan_dev->iftype != NL802154_IFTYPE_COORD) {
+ NL_SET_ERR_MSG(info->extack, "Only coordinators can send beacons");
+ return -EOPNOTSUPP;
+ }
+
+ if (wpan_dev->pan_id == cpu_to_le16(IEEE802154_PANID_BROADCAST)) {
+ NL_SET_ERR_MSG(info->extack, "Device is not part of any PAN");
+ return -EPERM;
+ }
+
+ if (wpan_phy->flags & WPAN_PHY_FLAG_DATAGRAMS_ONLY) {
+ NL_SET_ERR_MSG(info->extack, "PHY only supports datagrams");
+ return -EOPNOTSUPP;
+ }
+
+ request = kzalloc(sizeof(*request), GFP_KERNEL);
+ if (!request)
+ return -ENOMEM;
+
+ request->wpan_dev = wpan_dev;
+ request->wpan_phy = wpan_phy;
+
+ /* Use maximum duration order by default */
+ request->interval = nla_get_u8_default(info->attrs[NL802154_ATTR_BEACON_INTERVAL],
+ IEEE802154_MAX_SCAN_DURATION);
+
+ err = rdev_send_beacons(rdev, request);
+ if (err) {
+ pr_err("Failure starting sending beacons (%d)\n", err);
+ goto free_request;
+ }
+
+ return 0;
+
+free_request:
+ kfree(request);
+
+ return err;
+}
+
+void nl802154_beaconing_done(struct wpan_dev *wpan_dev)
+{
+ /* NOP */
+}
+EXPORT_SYMBOL_GPL(nl802154_beaconing_done);
+
+static int
+nl802154_stop_beacons(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg802154_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wpan_dev *wpan_dev = dev->ieee802154_ptr;
+
+ /* Resources are released in the notification helper above */
+ return rdev_stop_beacons(rdev, wpan_dev);
+}
+
+static int nl802154_associate(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg802154_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wpan_dev *wpan_dev;
+ struct wpan_phy *wpan_phy;
+ struct ieee802154_addr coord;
+ int err;
+
+ wpan_dev = dev->ieee802154_ptr;
+ wpan_phy = &rdev->wpan_phy;
+
+ if (wpan_phy->flags & WPAN_PHY_FLAG_DATAGRAMS_ONLY) {
+ NL_SET_ERR_MSG(info->extack, "PHY only supports datagrams");
+ return -EOPNOTSUPP;
+ }
+
+ if (!info->attrs[NL802154_ATTR_PAN_ID] ||
+ !info->attrs[NL802154_ATTR_EXTENDED_ADDR])
+ return -EINVAL;
+
+ coord.pan_id = nla_get_le16(info->attrs[NL802154_ATTR_PAN_ID]);
+ coord.mode = IEEE802154_ADDR_LONG;
+ coord.extended_addr = nla_get_le64(info->attrs[NL802154_ATTR_EXTENDED_ADDR]);
+
+ mutex_lock(&wpan_dev->association_lock);
+ err = rdev_associate(rdev, wpan_dev, &coord);
+ mutex_unlock(&wpan_dev->association_lock);
+ if (err)
+ pr_err("Association with PAN ID 0x%x failed (%d)\n",
+ le16_to_cpu(coord.pan_id), err);
+
+ return err;
+}
+
+static int nl802154_disassociate(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg802154_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wpan_dev *wpan_dev = dev->ieee802154_ptr;
+ struct wpan_phy *wpan_phy = &rdev->wpan_phy;
+ struct ieee802154_addr target;
+
+ if (wpan_phy->flags & WPAN_PHY_FLAG_DATAGRAMS_ONLY) {
+ NL_SET_ERR_MSG(info->extack, "PHY only supports datagrams");
+ return -EOPNOTSUPP;
+ }
+
+ target.pan_id = wpan_dev->pan_id;
+
+ if (info->attrs[NL802154_ATTR_EXTENDED_ADDR]) {
+ target.mode = IEEE802154_ADDR_LONG;
+ target.extended_addr = nla_get_le64(info->attrs[NL802154_ATTR_EXTENDED_ADDR]);
+ } else if (info->attrs[NL802154_ATTR_SHORT_ADDR]) {
+ target.mode = IEEE802154_ADDR_SHORT;
+ target.short_addr = nla_get_le16(info->attrs[NL802154_ATTR_SHORT_ADDR]);
+ } else {
+ NL_SET_ERR_MSG(info->extack, "Device address is missing");
+ return -EINVAL;
+ }
+
+ mutex_lock(&wpan_dev->association_lock);
+ rdev_disassociate(rdev, wpan_dev, &target);
+ mutex_unlock(&wpan_dev->association_lock);
+
+ return 0;
+}
+
+static int nl802154_set_max_associations(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net_device *dev = info->user_ptr[1];
+ struct wpan_dev *wpan_dev = dev->ieee802154_ptr;
+ unsigned int max_assoc;
+
+ if (!info->attrs[NL802154_ATTR_MAX_ASSOCIATIONS]) {
+ NL_SET_ERR_MSG(info->extack, "No maximum number of association given");
+ return -EINVAL;
+ }
+
+ max_assoc = nla_get_u32(info->attrs[NL802154_ATTR_MAX_ASSOCIATIONS]);
+
+ mutex_lock(&wpan_dev->association_lock);
+ cfg802154_set_max_associations(wpan_dev, max_assoc);
+ mutex_unlock(&wpan_dev->association_lock);
+
+ return 0;
+}
+
+static int nl802154_send_peer_info(struct sk_buff *msg,
+ struct netlink_callback *cb,
+ u32 seq, int flags,
+ struct cfg802154_registered_device *rdev,
+ struct wpan_dev *wpan_dev,
+ struct ieee802154_pan_device *peer,
+ enum nl802154_peer_type type)
+{
+ struct nlattr *nla;
+ void *hdr;
+
+ ASSERT_RTNL();
+
+ hdr = nl802154hdr_put(msg, NETLINK_CB(cb->skb).portid, seq, flags,
+ NL802154_CMD_LIST_ASSOCIATIONS);
+ if (!hdr)
+ return -ENOBUFS;
+
+ genl_dump_check_consistent(cb, hdr);
+
+ nla = nla_nest_start_noflag(msg, NL802154_ATTR_PEER);
+ if (!nla)
+ goto nla_put_failure;
+
+ if (nla_put_u8(msg, NL802154_DEV_ADDR_ATTR_PEER_TYPE, type))
+ goto nla_put_failure;
+
+ if (nla_put_u8(msg, NL802154_DEV_ADDR_ATTR_MODE, peer->mode))
+ goto nla_put_failure;
+
+ if (nla_put(msg, NL802154_DEV_ADDR_ATTR_SHORT,
+ IEEE802154_SHORT_ADDR_LEN, &peer->short_addr))
+ goto nla_put_failure;
+
+ if (nla_put(msg, NL802154_DEV_ADDR_ATTR_EXTENDED,
+ IEEE802154_EXTENDED_ADDR_LEN, &peer->extended_addr))
+ goto nla_put_failure;
+
+ nla_nest_end(msg, nla);
+
+ genlmsg_end(msg, hdr);
+
+ return 0;
+
+ nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static int nl802154_list_associations(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct cfg802154_registered_device *rdev;
+ struct ieee802154_pan_device *child;
+ struct wpan_dev *wpan_dev;
+ int err;
+
+ err = nl802154_prepare_wpan_dev_dump(skb, cb, &rdev, &wpan_dev);
+ if (err)
+ return err;
+
+ mutex_lock(&wpan_dev->association_lock);
+
+ if (cb->args[2])
+ goto out;
+
+ if (wpan_dev->parent) {
+ err = nl802154_send_peer_info(skb, cb, cb->nlh->nlmsg_seq,
+ NLM_F_MULTI, rdev, wpan_dev,
+ wpan_dev->parent,
+ NL802154_PEER_TYPE_PARENT);
+ if (err < 0)
+ goto out_err;
+ }
+
+ list_for_each_entry(child, &wpan_dev->children, node) {
+ err = nl802154_send_peer_info(skb, cb, cb->nlh->nlmsg_seq,
+ NLM_F_MULTI, rdev, wpan_dev,
+ child,
+ NL802154_PEER_TYPE_CHILD);
+ if (err < 0)
+ goto out_err;
+ }
+
+ cb->args[2] = 1;
+out:
+ err = skb->len;
+out_err:
+ mutex_unlock(&wpan_dev->association_lock);
+
+ nl802154_finish_wpan_dev_dump(rdev);
+
+ return err;
+}
+
+#ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL
+static const struct nla_policy nl802154_dev_addr_policy[NL802154_DEV_ADDR_ATTR_MAX + 1] = {
+ [NL802154_DEV_ADDR_ATTR_PAN_ID] = { .type = NLA_U16 },
+ [NL802154_DEV_ADDR_ATTR_MODE] = { .type = NLA_U32 },
+ [NL802154_DEV_ADDR_ATTR_SHORT] = { .type = NLA_U16 },
+ [NL802154_DEV_ADDR_ATTR_EXTENDED] = { .type = NLA_U64 },
+};
+
+static int
+ieee802154_llsec_parse_dev_addr(struct nlattr *nla,
+ struct ieee802154_addr *addr)
+{
+ struct nlattr *attrs[NL802154_DEV_ADDR_ATTR_MAX + 1];
+
+ if (!nla || nla_parse_nested_deprecated(attrs, NL802154_DEV_ADDR_ATTR_MAX, nla, nl802154_dev_addr_policy, NULL))
+ return -EINVAL;
+
+ if (!attrs[NL802154_DEV_ADDR_ATTR_PAN_ID] || !attrs[NL802154_DEV_ADDR_ATTR_MODE])
+ return -EINVAL;
+
+ addr->pan_id = nla_get_le16(attrs[NL802154_DEV_ADDR_ATTR_PAN_ID]);
+ addr->mode = nla_get_u32(attrs[NL802154_DEV_ADDR_ATTR_MODE]);
+ switch (addr->mode) {
+ case NL802154_DEV_ADDR_SHORT:
+ if (!attrs[NL802154_DEV_ADDR_ATTR_SHORT])
+ return -EINVAL;
+ addr->short_addr = nla_get_le16(attrs[NL802154_DEV_ADDR_ATTR_SHORT]);
+ break;
+ case NL802154_DEV_ADDR_EXTENDED:
+ if (!attrs[NL802154_DEV_ADDR_ATTR_EXTENDED])
+ return -EINVAL;
+ addr->extended_addr = nla_get_le64(attrs[NL802154_DEV_ADDR_ATTR_EXTENDED]);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static const struct nla_policy nl802154_key_id_policy[NL802154_KEY_ID_ATTR_MAX + 1] = {
+ [NL802154_KEY_ID_ATTR_MODE] = { .type = NLA_U32 },
+ [NL802154_KEY_ID_ATTR_INDEX] = { .type = NLA_U8 },
+ [NL802154_KEY_ID_ATTR_IMPLICIT] = { .type = NLA_NESTED },
+ [NL802154_KEY_ID_ATTR_SOURCE_SHORT] = { .type = NLA_U32 },
+ [NL802154_KEY_ID_ATTR_SOURCE_EXTENDED] = { .type = NLA_U64 },
+};
+
+static int
+ieee802154_llsec_parse_key_id(struct nlattr *nla,
+ struct ieee802154_llsec_key_id *desc)
+{
+ struct nlattr *attrs[NL802154_KEY_ID_ATTR_MAX + 1];
+
+ if (!nla || nla_parse_nested_deprecated(attrs, NL802154_KEY_ID_ATTR_MAX, nla, nl802154_key_id_policy, NULL))
+ return -EINVAL;
+
+ if (!attrs[NL802154_KEY_ID_ATTR_MODE])
+ return -EINVAL;
+
+ desc->mode = nla_get_u32(attrs[NL802154_KEY_ID_ATTR_MODE]);
+ switch (desc->mode) {
+ case NL802154_KEY_ID_MODE_IMPLICIT:
+ if (!attrs[NL802154_KEY_ID_ATTR_IMPLICIT])
+ return -EINVAL;
+
+ if (ieee802154_llsec_parse_dev_addr(attrs[NL802154_KEY_ID_ATTR_IMPLICIT],
+ &desc->device_addr) < 0)
+ return -EINVAL;
+ break;
+ case NL802154_KEY_ID_MODE_INDEX:
+ break;
+ case NL802154_KEY_ID_MODE_INDEX_SHORT:
+ if (!attrs[NL802154_KEY_ID_ATTR_SOURCE_SHORT])
+ return -EINVAL;
+
+ desc->short_source = nla_get_le32(attrs[NL802154_KEY_ID_ATTR_SOURCE_SHORT]);
+ break;
+ case NL802154_KEY_ID_MODE_INDEX_EXTENDED:
+ if (!attrs[NL802154_KEY_ID_ATTR_SOURCE_EXTENDED])
+ return -EINVAL;
+
+ desc->extended_source = nla_get_le64(attrs[NL802154_KEY_ID_ATTR_SOURCE_EXTENDED]);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (desc->mode != NL802154_KEY_ID_MODE_IMPLICIT) {
+ if (!attrs[NL802154_KEY_ID_ATTR_INDEX])
+ return -EINVAL;
+
+ /* TODO change id to idx */
+ desc->id = nla_get_u8(attrs[NL802154_KEY_ID_ATTR_INDEX]);
+ }
+
+ return 0;
+}
+
+static int nl802154_set_llsec_params(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct cfg802154_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wpan_dev *wpan_dev = dev->ieee802154_ptr;
+ struct ieee802154_llsec_params params;
+ u32 changed = 0;
+ int ret;
+
+ if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR)
+ return -EOPNOTSUPP;
+
+ if (info->attrs[NL802154_ATTR_SEC_ENABLED]) {
+ u8 enabled;
+
+ enabled = nla_get_u8(info->attrs[NL802154_ATTR_SEC_ENABLED]);
+ if (enabled != 0 && enabled != 1)
+ return -EINVAL;
+
+ params.enabled = nla_get_u8(info->attrs[NL802154_ATTR_SEC_ENABLED]);
+ changed |= IEEE802154_LLSEC_PARAM_ENABLED;
+ }
+
+ if (info->attrs[NL802154_ATTR_SEC_OUT_KEY_ID]) {
+ ret = ieee802154_llsec_parse_key_id(info->attrs[NL802154_ATTR_SEC_OUT_KEY_ID],
+ &params.out_key);
+ if (ret < 0)
+ return ret;
+
+ changed |= IEEE802154_LLSEC_PARAM_OUT_KEY;
+ }
+
+ if (info->attrs[NL802154_ATTR_SEC_OUT_LEVEL]) {
+ params.out_level = nla_get_u32(info->attrs[NL802154_ATTR_SEC_OUT_LEVEL]);
+ if (params.out_level > NL802154_SECLEVEL_MAX)
+ return -EINVAL;
+
+ changed |= IEEE802154_LLSEC_PARAM_OUT_LEVEL;
+ }
+
+ if (info->attrs[NL802154_ATTR_SEC_FRAME_COUNTER]) {
+ params.frame_counter = nla_get_be32(info->attrs[NL802154_ATTR_SEC_FRAME_COUNTER]);
+ changed |= IEEE802154_LLSEC_PARAM_FRAME_COUNTER;
+ }
+
+ return rdev_set_llsec_params(rdev, wpan_dev, &params, changed);
+}
+
+static int nl802154_send_key(struct sk_buff *msg, u32 cmd, u32 portid,
+ u32 seq, int flags,
+ struct cfg802154_registered_device *rdev,
+ struct net_device *dev,
+ const struct ieee802154_llsec_key_entry *key)
+{
+ void *hdr;
+ u32 commands[NL802154_CMD_FRAME_NR_IDS / 32];
+ struct nlattr *nl_key, *nl_key_id;
+
+ hdr = nl802154hdr_put(msg, portid, seq, flags, cmd);
+ if (!hdr)
+ return -ENOBUFS;
+
+ if (nla_put_u32(msg, NL802154_ATTR_IFINDEX, dev->ifindex))
+ goto nla_put_failure;
+
+ nl_key = nla_nest_start_noflag(msg, NL802154_ATTR_SEC_KEY);
+ if (!nl_key)
+ goto nla_put_failure;
+
+ nl_key_id = nla_nest_start_noflag(msg, NL802154_KEY_ATTR_ID);
+ if (!nl_key_id)
+ goto nla_put_failure;
+
+ if (ieee802154_llsec_send_key_id(msg, &key->id) < 0)
+ goto nla_put_failure;
+
+ nla_nest_end(msg, nl_key_id);
+
+ if (nla_put_u8(msg, NL802154_KEY_ATTR_USAGE_FRAMES,
+ key->key->frame_types))
+ goto nla_put_failure;
+
+ if (key->key->frame_types & BIT(NL802154_FRAME_CMD)) {
+ /* TODO for each nested */
+ memset(commands, 0, sizeof(commands));
+ commands[7] = key->key->cmd_frame_ids;
+ if (nla_put(msg, NL802154_KEY_ATTR_USAGE_CMDS,
+ sizeof(commands), commands))
+ goto nla_put_failure;
+ }
+
+ if (nla_put(msg, NL802154_KEY_ATTR_BYTES, NL802154_KEY_SIZE,
+ key->key->key))
+ goto nla_put_failure;
+
+ nla_nest_end(msg, nl_key);
+ genlmsg_end(msg, hdr);
+
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static int
+nl802154_dump_llsec_key(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct cfg802154_registered_device *rdev = NULL;
+ struct ieee802154_llsec_key_entry *key;
+ struct ieee802154_llsec_table *table;
+ struct wpan_dev *wpan_dev;
+ int err;
+
+ err = nl802154_prepare_wpan_dev_dump(skb, cb, &rdev, &wpan_dev);
+ if (err)
+ return err;
+
+ if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR) {
+ err = skb->len;
+ goto out_err;
+ }
+
+ if (!wpan_dev->netdev) {
+ err = -EINVAL;
+ goto out_err;
+ }
+
+ rdev_lock_llsec_table(rdev, wpan_dev);
+ rdev_get_llsec_table(rdev, wpan_dev, &table);
+
+ /* TODO make it like station dump */
+ if (cb->args[2])
+ goto out;
+
+ list_for_each_entry(key, &table->keys, list) {
+ if (nl802154_send_key(skb, NL802154_CMD_NEW_SEC_KEY,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ rdev, wpan_dev->netdev, key) < 0) {
+ /* TODO */
+ err = -EIO;
+ rdev_unlock_llsec_table(rdev, wpan_dev);
+ goto out_err;
+ }
+ }
+
+ cb->args[2] = 1;
+
+out:
+ rdev_unlock_llsec_table(rdev, wpan_dev);
+ err = skb->len;
+out_err:
+ nl802154_finish_wpan_dev_dump(rdev);
+
+ return err;
+}
+
+static const struct nla_policy nl802154_key_policy[NL802154_KEY_ATTR_MAX + 1] = {
+ [NL802154_KEY_ATTR_ID] = { NLA_NESTED },
+ /* TODO handle it as for_each_nested and NLA_FLAG? */
+ [NL802154_KEY_ATTR_USAGE_FRAMES] = { NLA_U8 },
+ /* TODO handle it as for_each_nested, not static array? */
+ [NL802154_KEY_ATTR_USAGE_CMDS] = { .len = NL802154_CMD_FRAME_NR_IDS / 8 },
+ [NL802154_KEY_ATTR_BYTES] = { .len = NL802154_KEY_SIZE },
+};
+
+static int nl802154_add_llsec_key(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg802154_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wpan_dev *wpan_dev = dev->ieee802154_ptr;
+ struct nlattr *attrs[NL802154_KEY_ATTR_MAX + 1];
+ struct ieee802154_llsec_key key = { };
+ struct ieee802154_llsec_key_id id = { };
+ u32 commands[NL802154_CMD_FRAME_NR_IDS / 32] = { };
+
+ if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR)
+ return -EOPNOTSUPP;
+
+ if (!info->attrs[NL802154_ATTR_SEC_KEY] ||
+ nla_parse_nested_deprecated(attrs, NL802154_KEY_ATTR_MAX, info->attrs[NL802154_ATTR_SEC_KEY], nl802154_key_policy, info->extack))
+ return -EINVAL;
+
+ if (!attrs[NL802154_KEY_ATTR_USAGE_FRAMES] ||
+ !attrs[NL802154_KEY_ATTR_BYTES])
+ return -EINVAL;
+
+ if (ieee802154_llsec_parse_key_id(attrs[NL802154_KEY_ATTR_ID], &id) < 0)
+ return -ENOBUFS;
+
+ key.frame_types = nla_get_u8(attrs[NL802154_KEY_ATTR_USAGE_FRAMES]);
+ if (key.frame_types > BIT(NL802154_FRAME_MAX) ||
+ ((key.frame_types & BIT(NL802154_FRAME_CMD)) &&
+ !attrs[NL802154_KEY_ATTR_USAGE_CMDS]))
+ return -EINVAL;
+
+ if (attrs[NL802154_KEY_ATTR_USAGE_CMDS]) {
+ /* TODO for each nested */
+ nla_memcpy(commands, attrs[NL802154_KEY_ATTR_USAGE_CMDS],
+ NL802154_CMD_FRAME_NR_IDS / 8);
+
+ /* TODO understand the -EINVAL logic here? last condition */
+ if (commands[0] || commands[1] || commands[2] || commands[3] ||
+ commands[4] || commands[5] || commands[6] ||
+ commands[7] > BIT(NL802154_CMD_FRAME_MAX))
+ return -EINVAL;
+
+ key.cmd_frame_ids = commands[7];
+ } else {
+ key.cmd_frame_ids = 0;
+ }
+
+ nla_memcpy(key.key, attrs[NL802154_KEY_ATTR_BYTES], NL802154_KEY_SIZE);
+
+ if (ieee802154_llsec_parse_key_id(attrs[NL802154_KEY_ATTR_ID], &id) < 0)
+ return -ENOBUFS;
+
+ return rdev_add_llsec_key(rdev, wpan_dev, &id, &key);
+}
+
+static int nl802154_del_llsec_key(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg802154_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wpan_dev *wpan_dev = dev->ieee802154_ptr;
+ struct nlattr *attrs[NL802154_KEY_ATTR_MAX + 1];
+ struct ieee802154_llsec_key_id id;
+
+ if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR)
+ return -EOPNOTSUPP;
+
+ if (!info->attrs[NL802154_ATTR_SEC_KEY] ||
+ nla_parse_nested_deprecated(attrs, NL802154_KEY_ATTR_MAX, info->attrs[NL802154_ATTR_SEC_KEY], nl802154_key_policy, info->extack))
+ return -EINVAL;
+
+ if (ieee802154_llsec_parse_key_id(attrs[NL802154_KEY_ATTR_ID], &id) < 0)
+ return -ENOBUFS;
+
+ return rdev_del_llsec_key(rdev, wpan_dev, &id);
+}
+
+static int nl802154_send_device(struct sk_buff *msg, u32 cmd, u32 portid,
+ u32 seq, int flags,
+ struct cfg802154_registered_device *rdev,
+ struct net_device *dev,
+ const struct ieee802154_llsec_device *dev_desc)
+{
+ void *hdr;
+ struct nlattr *nl_device;
+
+ hdr = nl802154hdr_put(msg, portid, seq, flags, cmd);
+ if (!hdr)
+ return -ENOBUFS;
+
+ if (nla_put_u32(msg, NL802154_ATTR_IFINDEX, dev->ifindex))
+ goto nla_put_failure;
+
+ nl_device = nla_nest_start_noflag(msg, NL802154_ATTR_SEC_DEVICE);
+ if (!nl_device)
+ goto nla_put_failure;
+
+ if (nla_put_u32(msg, NL802154_DEV_ATTR_FRAME_COUNTER,
+ dev_desc->frame_counter) ||
+ nla_put_le16(msg, NL802154_DEV_ATTR_PAN_ID, dev_desc->pan_id) ||
+ nla_put_le16(msg, NL802154_DEV_ATTR_SHORT_ADDR,
+ dev_desc->short_addr) ||
+ nla_put_le64(msg, NL802154_DEV_ATTR_EXTENDED_ADDR,
+ dev_desc->hwaddr, NL802154_DEV_ATTR_PAD) ||
+ nla_put_u8(msg, NL802154_DEV_ATTR_SECLEVEL_EXEMPT,
+ dev_desc->seclevel_exempt) ||
+ nla_put_u32(msg, NL802154_DEV_ATTR_KEY_MODE, dev_desc->key_mode))
+ goto nla_put_failure;
+
+ nla_nest_end(msg, nl_device);
+ genlmsg_end(msg, hdr);
+
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static int
+nl802154_dump_llsec_dev(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct cfg802154_registered_device *rdev = NULL;
+ struct ieee802154_llsec_device *dev;
+ struct ieee802154_llsec_table *table;
+ struct wpan_dev *wpan_dev;
+ int err;
+
+ err = nl802154_prepare_wpan_dev_dump(skb, cb, &rdev, &wpan_dev);
+ if (err)
+ return err;
+
+ if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR) {
+ err = skb->len;
+ goto out_err;
+ }
+
+ if (!wpan_dev->netdev) {
+ err = -EINVAL;
+ goto out_err;
+ }
+
+ rdev_lock_llsec_table(rdev, wpan_dev);
+ rdev_get_llsec_table(rdev, wpan_dev, &table);
+
+ /* TODO make it like station dump */
+ if (cb->args[2])
+ goto out;
+
+ list_for_each_entry(dev, &table->devices, list) {
+ if (nl802154_send_device(skb, NL802154_CMD_NEW_SEC_LEVEL,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ rdev, wpan_dev->netdev, dev) < 0) {
+ /* TODO */
+ err = -EIO;
+ rdev_unlock_llsec_table(rdev, wpan_dev);
+ goto out_err;
+ }
+ }
+
+ cb->args[2] = 1;
+
+out:
+ rdev_unlock_llsec_table(rdev, wpan_dev);
+ err = skb->len;
+out_err:
+ nl802154_finish_wpan_dev_dump(rdev);
+
+ return err;
+}
+
+static const struct nla_policy nl802154_dev_policy[NL802154_DEV_ATTR_MAX + 1] = {
+ [NL802154_DEV_ATTR_FRAME_COUNTER] = { NLA_U32 },
+ [NL802154_DEV_ATTR_PAN_ID] = { .type = NLA_U16 },
+ [NL802154_DEV_ATTR_SHORT_ADDR] = { .type = NLA_U16 },
+ [NL802154_DEV_ATTR_EXTENDED_ADDR] = { .type = NLA_U64 },
+ [NL802154_DEV_ATTR_SECLEVEL_EXEMPT] = { NLA_U8 },
+ [NL802154_DEV_ATTR_KEY_MODE] = { NLA_U32 },
+};
+
+static int
+ieee802154_llsec_parse_device(struct nlattr *nla,
+ struct ieee802154_llsec_device *dev)
+{
+ struct nlattr *attrs[NL802154_DEV_ATTR_MAX + 1];
+
+ if (!nla || nla_parse_nested_deprecated(attrs, NL802154_DEV_ATTR_MAX, nla, nl802154_dev_policy, NULL))
+ return -EINVAL;
+
+ memset(dev, 0, sizeof(*dev));
+
+ if (!attrs[NL802154_DEV_ATTR_FRAME_COUNTER] ||
+ !attrs[NL802154_DEV_ATTR_PAN_ID] ||
+ !attrs[NL802154_DEV_ATTR_SHORT_ADDR] ||
+ !attrs[NL802154_DEV_ATTR_EXTENDED_ADDR] ||
+ !attrs[NL802154_DEV_ATTR_SECLEVEL_EXEMPT] ||
+ !attrs[NL802154_DEV_ATTR_KEY_MODE])
+ return -EINVAL;
+
+ /* TODO be32 */
+ dev->frame_counter = nla_get_u32(attrs[NL802154_DEV_ATTR_FRAME_COUNTER]);
+ dev->pan_id = nla_get_le16(attrs[NL802154_DEV_ATTR_PAN_ID]);
+ dev->short_addr = nla_get_le16(attrs[NL802154_DEV_ATTR_SHORT_ADDR]);
+ /* TODO rename hwaddr to extended_addr */
+ dev->hwaddr = nla_get_le64(attrs[NL802154_DEV_ATTR_EXTENDED_ADDR]);
+ dev->seclevel_exempt = nla_get_u8(attrs[NL802154_DEV_ATTR_SECLEVEL_EXEMPT]);
+ dev->key_mode = nla_get_u32(attrs[NL802154_DEV_ATTR_KEY_MODE]);
+
+ if (dev->key_mode > NL802154_DEVKEY_MAX ||
+ (dev->seclevel_exempt != 0 && dev->seclevel_exempt != 1))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int nl802154_add_llsec_dev(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg802154_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wpan_dev *wpan_dev = dev->ieee802154_ptr;
+ struct ieee802154_llsec_device dev_desc;
+
+ if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR)
+ return -EOPNOTSUPP;
+
+ if (ieee802154_llsec_parse_device(info->attrs[NL802154_ATTR_SEC_DEVICE],
+ &dev_desc) < 0)
+ return -EINVAL;
+
+ return rdev_add_device(rdev, wpan_dev, &dev_desc);
+}
+
+static int nl802154_del_llsec_dev(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg802154_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wpan_dev *wpan_dev = dev->ieee802154_ptr;
+ struct nlattr *attrs[NL802154_DEV_ATTR_MAX + 1];
+ __le64 extended_addr;
+
+ if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR)
+ return -EOPNOTSUPP;
+
+ if (!info->attrs[NL802154_ATTR_SEC_DEVICE] ||
+ nla_parse_nested_deprecated(attrs, NL802154_DEV_ATTR_MAX, info->attrs[NL802154_ATTR_SEC_DEVICE], nl802154_dev_policy, info->extack))
+ return -EINVAL;
+
+ if (!attrs[NL802154_DEV_ATTR_EXTENDED_ADDR])
+ return -EINVAL;
+
+ extended_addr = nla_get_le64(attrs[NL802154_DEV_ATTR_EXTENDED_ADDR]);
+ return rdev_del_device(rdev, wpan_dev, extended_addr);
+}
+
+static int nl802154_send_devkey(struct sk_buff *msg, u32 cmd, u32 portid,
+ u32 seq, int flags,
+ struct cfg802154_registered_device *rdev,
+ struct net_device *dev, __le64 extended_addr,
+ const struct ieee802154_llsec_device_key *devkey)
+{
+ void *hdr;
+ struct nlattr *nl_devkey, *nl_key_id;
+
+ hdr = nl802154hdr_put(msg, portid, seq, flags, cmd);
+ if (!hdr)
+ return -ENOBUFS;
+
+ if (nla_put_u32(msg, NL802154_ATTR_IFINDEX, dev->ifindex))
+ goto nla_put_failure;
+
+ nl_devkey = nla_nest_start_noflag(msg, NL802154_ATTR_SEC_DEVKEY);
+ if (!nl_devkey)
+ goto nla_put_failure;
+
+ if (nla_put_le64(msg, NL802154_DEVKEY_ATTR_EXTENDED_ADDR,
+ extended_addr, NL802154_DEVKEY_ATTR_PAD) ||
+ nla_put_u32(msg, NL802154_DEVKEY_ATTR_FRAME_COUNTER,
+ devkey->frame_counter))
+ goto nla_put_failure;
+
+ nl_key_id = nla_nest_start_noflag(msg, NL802154_DEVKEY_ATTR_ID);
+ if (!nl_key_id)
+ goto nla_put_failure;
+
+ if (ieee802154_llsec_send_key_id(msg, &devkey->key_id) < 0)
+ goto nla_put_failure;
+
+ nla_nest_end(msg, nl_key_id);
+ nla_nest_end(msg, nl_devkey);
+ genlmsg_end(msg, hdr);
+
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static int
+nl802154_dump_llsec_devkey(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct cfg802154_registered_device *rdev = NULL;
+ struct ieee802154_llsec_device_key *kpos;
+ struct ieee802154_llsec_device *dpos;
+ struct ieee802154_llsec_table *table;
+ struct wpan_dev *wpan_dev;
+ int err;
+
+ err = nl802154_prepare_wpan_dev_dump(skb, cb, &rdev, &wpan_dev);
+ if (err)
+ return err;
+
+ if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR) {
+ err = skb->len;
+ goto out_err;
+ }
+
+ if (!wpan_dev->netdev) {
+ err = -EINVAL;
+ goto out_err;
+ }
+
+ rdev_lock_llsec_table(rdev, wpan_dev);
+ rdev_get_llsec_table(rdev, wpan_dev, &table);
+
+ /* TODO make it like station dump */
+ if (cb->args[2])
+ goto out;
+
+ /* TODO look if remove devkey and do some nested attribute */
+ list_for_each_entry(dpos, &table->devices, list) {
+ list_for_each_entry(kpos, &dpos->keys, list) {
+ if (nl802154_send_devkey(skb,
+ NL802154_CMD_NEW_SEC_LEVEL,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NLM_F_MULTI, rdev,
+ wpan_dev->netdev,
+ dpos->hwaddr,
+ kpos) < 0) {
+ /* TODO */
+ err = -EIO;
+ rdev_unlock_llsec_table(rdev, wpan_dev);
+ goto out_err;
+ }
+ }
+ }
+
+ cb->args[2] = 1;
+
+out:
+ rdev_unlock_llsec_table(rdev, wpan_dev);
+ err = skb->len;
+out_err:
+ nl802154_finish_wpan_dev_dump(rdev);
+
+ return err;
+}
+
+static const struct nla_policy nl802154_devkey_policy[NL802154_DEVKEY_ATTR_MAX + 1] = {
+ [NL802154_DEVKEY_ATTR_FRAME_COUNTER] = { NLA_U32 },
+ [NL802154_DEVKEY_ATTR_EXTENDED_ADDR] = { NLA_U64 },
+ [NL802154_DEVKEY_ATTR_ID] = { NLA_NESTED },
+};
+
+static int nl802154_add_llsec_devkey(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg802154_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wpan_dev *wpan_dev = dev->ieee802154_ptr;
+ struct nlattr *attrs[NL802154_DEVKEY_ATTR_MAX + 1];
+ struct ieee802154_llsec_device_key key;
+ __le64 extended_addr;
+
+ if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR)
+ return -EOPNOTSUPP;
+
+ if (!info->attrs[NL802154_ATTR_SEC_DEVKEY] ||
+ nla_parse_nested_deprecated(attrs, NL802154_DEVKEY_ATTR_MAX, info->attrs[NL802154_ATTR_SEC_DEVKEY], nl802154_devkey_policy, info->extack) < 0)
+ return -EINVAL;
+
+ if (!attrs[NL802154_DEVKEY_ATTR_FRAME_COUNTER] ||
+ !attrs[NL802154_DEVKEY_ATTR_EXTENDED_ADDR])
+ return -EINVAL;
+
+ /* TODO change key.id ? */
+ if (ieee802154_llsec_parse_key_id(attrs[NL802154_DEVKEY_ATTR_ID],
+ &key.key_id) < 0)
+ return -ENOBUFS;
+
+ /* TODO be32 */
+ key.frame_counter = nla_get_u32(attrs[NL802154_DEVKEY_ATTR_FRAME_COUNTER]);
+ /* TODO change naming hwaddr -> extended_addr
+ * check unique identifier short+pan OR extended_addr
+ */
+ extended_addr = nla_get_le64(attrs[NL802154_DEVKEY_ATTR_EXTENDED_ADDR]);
+ return rdev_add_devkey(rdev, wpan_dev, extended_addr, &key);
+}
+
+static int nl802154_del_llsec_devkey(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg802154_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wpan_dev *wpan_dev = dev->ieee802154_ptr;
+ struct nlattr *attrs[NL802154_DEVKEY_ATTR_MAX + 1];
+ struct ieee802154_llsec_device_key key;
+ __le64 extended_addr;
+
+ if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR)
+ return -EOPNOTSUPP;
+
+ if (!info->attrs[NL802154_ATTR_SEC_DEVKEY] ||
+ nla_parse_nested_deprecated(attrs, NL802154_DEVKEY_ATTR_MAX, info->attrs[NL802154_ATTR_SEC_DEVKEY], nl802154_devkey_policy, info->extack))
+ return -EINVAL;
+
+ if (!attrs[NL802154_DEVKEY_ATTR_EXTENDED_ADDR])
+ return -EINVAL;
+
+ /* TODO change key.id ? */
+ if (ieee802154_llsec_parse_key_id(attrs[NL802154_DEVKEY_ATTR_ID],
+ &key.key_id) < 0)
+ return -ENOBUFS;
+
+ /* TODO change naming hwaddr -> extended_addr
+ * check unique identifier short+pan OR extended_addr
+ */
+ extended_addr = nla_get_le64(attrs[NL802154_DEVKEY_ATTR_EXTENDED_ADDR]);
+ return rdev_del_devkey(rdev, wpan_dev, extended_addr, &key);
+}
+
+static int nl802154_send_seclevel(struct sk_buff *msg, u32 cmd, u32 portid,
+ u32 seq, int flags,
+ struct cfg802154_registered_device *rdev,
+ struct net_device *dev,
+ const struct ieee802154_llsec_seclevel *sl)
+{
+ void *hdr;
+ struct nlattr *nl_seclevel;
+
+ hdr = nl802154hdr_put(msg, portid, seq, flags, cmd);
+ if (!hdr)
+ return -ENOBUFS;
+
+ if (nla_put_u32(msg, NL802154_ATTR_IFINDEX, dev->ifindex))
+ goto nla_put_failure;
+
+ nl_seclevel = nla_nest_start_noflag(msg, NL802154_ATTR_SEC_LEVEL);
+ if (!nl_seclevel)
+ goto nla_put_failure;
+
+ if (nla_put_u32(msg, NL802154_SECLEVEL_ATTR_FRAME, sl->frame_type) ||
+ nla_put_u32(msg, NL802154_SECLEVEL_ATTR_LEVELS, sl->sec_levels) ||
+ nla_put_u8(msg, NL802154_SECLEVEL_ATTR_DEV_OVERRIDE,
+ sl->device_override))
+ goto nla_put_failure;
+
+ if (sl->frame_type == NL802154_FRAME_CMD) {
+ if (nla_put_u32(msg, NL802154_SECLEVEL_ATTR_CMD_FRAME,
+ sl->cmd_frame_id))
+ goto nla_put_failure;
+ }
+
+ nla_nest_end(msg, nl_seclevel);
+ genlmsg_end(msg, hdr);
+
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static int
+nl802154_dump_llsec_seclevel(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct cfg802154_registered_device *rdev = NULL;
+ struct ieee802154_llsec_seclevel *sl;
+ struct ieee802154_llsec_table *table;
+ struct wpan_dev *wpan_dev;
+ int err;
+
+ err = nl802154_prepare_wpan_dev_dump(skb, cb, &rdev, &wpan_dev);
+ if (err)
+ return err;
+
+ if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR) {
+ err = skb->len;
+ goto out_err;
+ }
+
+ if (!wpan_dev->netdev) {
+ err = -EINVAL;
+ goto out_err;
+ }
+
+ rdev_lock_llsec_table(rdev, wpan_dev);
+ rdev_get_llsec_table(rdev, wpan_dev, &table);
+
+ /* TODO make it like station dump */
+ if (cb->args[2])
+ goto out;
+
+ list_for_each_entry(sl, &table->security_levels, list) {
+ if (nl802154_send_seclevel(skb, NL802154_CMD_NEW_SEC_LEVEL,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ rdev, wpan_dev->netdev, sl) < 0) {
+ /* TODO */
+ err = -EIO;
+ rdev_unlock_llsec_table(rdev, wpan_dev);
+ goto out_err;
+ }
+ }
+
+ cb->args[2] = 1;
+
+out:
+ rdev_unlock_llsec_table(rdev, wpan_dev);
+ err = skb->len;
+out_err:
+ nl802154_finish_wpan_dev_dump(rdev);
+
+ return err;
+}
+
+static const struct nla_policy nl802154_seclevel_policy[NL802154_SECLEVEL_ATTR_MAX + 1] = {
+ [NL802154_SECLEVEL_ATTR_LEVELS] = { .type = NLA_U8 },
+ [NL802154_SECLEVEL_ATTR_FRAME] = { .type = NLA_U32 },
+ [NL802154_SECLEVEL_ATTR_CMD_FRAME] = { .type = NLA_U32 },
+ [NL802154_SECLEVEL_ATTR_DEV_OVERRIDE] = { .type = NLA_U8 },
+};
+
+static int
+llsec_parse_seclevel(struct nlattr *nla, struct ieee802154_llsec_seclevel *sl)
+{
+ struct nlattr *attrs[NL802154_SECLEVEL_ATTR_MAX + 1];
+
+ if (!nla || nla_parse_nested_deprecated(attrs, NL802154_SECLEVEL_ATTR_MAX, nla, nl802154_seclevel_policy, NULL))
+ return -EINVAL;
+
+ memset(sl, 0, sizeof(*sl));
+
+ if (!attrs[NL802154_SECLEVEL_ATTR_LEVELS] ||
+ !attrs[NL802154_SECLEVEL_ATTR_FRAME] ||
+ !attrs[NL802154_SECLEVEL_ATTR_DEV_OVERRIDE])
+ return -EINVAL;
+
+ sl->sec_levels = nla_get_u8(attrs[NL802154_SECLEVEL_ATTR_LEVELS]);
+ sl->frame_type = nla_get_u32(attrs[NL802154_SECLEVEL_ATTR_FRAME]);
+ sl->device_override = nla_get_u8(attrs[NL802154_SECLEVEL_ATTR_DEV_OVERRIDE]);
+ if (sl->frame_type > NL802154_FRAME_MAX ||
+ (sl->device_override != 0 && sl->device_override != 1))
+ return -EINVAL;
+
+ if (sl->frame_type == NL802154_FRAME_CMD) {
+ if (!attrs[NL802154_SECLEVEL_ATTR_CMD_FRAME])
+ return -EINVAL;
+
+ sl->cmd_frame_id = nla_get_u32(attrs[NL802154_SECLEVEL_ATTR_CMD_FRAME]);
+ if (sl->cmd_frame_id > NL802154_CMD_FRAME_MAX)
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int nl802154_add_llsec_seclevel(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct cfg802154_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wpan_dev *wpan_dev = dev->ieee802154_ptr;
+ struct ieee802154_llsec_seclevel sl;
+
+ if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR)
+ return -EOPNOTSUPP;
+
+ if (llsec_parse_seclevel(info->attrs[NL802154_ATTR_SEC_LEVEL],
+ &sl) < 0)
+ return -EINVAL;
+
+ return rdev_add_seclevel(rdev, wpan_dev, &sl);
+}
+
+static int nl802154_del_llsec_seclevel(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct cfg802154_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wpan_dev *wpan_dev = dev->ieee802154_ptr;
+ struct ieee802154_llsec_seclevel sl;
+
+ if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR)
+ return -EOPNOTSUPP;
+
+ if (llsec_parse_seclevel(info->attrs[NL802154_ATTR_SEC_LEVEL],
+ &sl) < 0)
+ return -EINVAL;
+
+ return rdev_del_seclevel(rdev, wpan_dev, &sl);
+}
+#endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */
+
+#define NL802154_FLAG_NEED_WPAN_PHY 0x01
+#define NL802154_FLAG_NEED_NETDEV 0x02
+#define NL802154_FLAG_NEED_RTNL 0x04
+#define NL802154_FLAG_CHECK_NETDEV_UP 0x08
+#define NL802154_FLAG_NEED_WPAN_DEV 0x10
+
+static int nl802154_pre_doit(const struct genl_split_ops *ops,
+ struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct cfg802154_registered_device *rdev;
+ struct wpan_dev *wpan_dev;
+ struct net_device *dev;
+ bool rtnl = ops->internal_flags & NL802154_FLAG_NEED_RTNL;
+
+ if (rtnl)
+ rtnl_lock();
+
+ if (ops->internal_flags & NL802154_FLAG_NEED_WPAN_PHY) {
+ rdev = cfg802154_get_dev_from_info(genl_info_net(info), info);
+ if (IS_ERR(rdev)) {
+ if (rtnl)
+ rtnl_unlock();
+ return PTR_ERR(rdev);
+ }
+ info->user_ptr[0] = rdev;
+ } else if (ops->internal_flags & NL802154_FLAG_NEED_NETDEV ||
+ ops->internal_flags & NL802154_FLAG_NEED_WPAN_DEV) {
+ ASSERT_RTNL();
+ wpan_dev = __cfg802154_wpan_dev_from_attrs(genl_info_net(info),
+ info->attrs);
+ if (IS_ERR(wpan_dev)) {
+ if (rtnl)
+ rtnl_unlock();
+ return PTR_ERR(wpan_dev);
+ }
+
+ dev = wpan_dev->netdev;
+ rdev = wpan_phy_to_rdev(wpan_dev->wpan_phy);
+
+ if (ops->internal_flags & NL802154_FLAG_NEED_NETDEV) {
+ if (!dev) {
+ if (rtnl)
+ rtnl_unlock();
+ return -EINVAL;
+ }
+
+ info->user_ptr[1] = dev;
+ } else {
+ info->user_ptr[1] = wpan_dev;
+ }
+
+ if (dev) {
+ if (ops->internal_flags & NL802154_FLAG_CHECK_NETDEV_UP &&
+ !netif_running(dev)) {
+ if (rtnl)
+ rtnl_unlock();
+ return -ENETDOWN;
+ }
+
+ dev_hold(dev);
+ }
+
+ info->user_ptr[0] = rdev;
+ }
+
+ return 0;
+}
+
+static void nl802154_post_doit(const struct genl_split_ops *ops,
+ struct sk_buff *skb,
+ struct genl_info *info)
+{
+ if (info->user_ptr[1]) {
+ if (ops->internal_flags & NL802154_FLAG_NEED_WPAN_DEV) {
+ struct wpan_dev *wpan_dev = info->user_ptr[1];
+
+ dev_put(wpan_dev->netdev);
+ } else {
+ dev_put(info->user_ptr[1]);
+ }
+ }
+
+ if (ops->internal_flags & NL802154_FLAG_NEED_RTNL)
+ rtnl_unlock();
+}
+
+static const struct genl_ops nl802154_ops[] = {
+ {
+ .cmd = NL802154_CMD_GET_WPAN_PHY,
+ .validate = GENL_DONT_VALIDATE_STRICT |
+ GENL_DONT_VALIDATE_DUMP_STRICT,
+ .doit = nl802154_get_wpan_phy,
+ .dumpit = nl802154_dump_wpan_phy,
+ .done = nl802154_dump_wpan_phy_done,
+ /* can be retrieved by unprivileged users */
+ .internal_flags = NL802154_FLAG_NEED_WPAN_PHY |
+ NL802154_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL802154_CMD_GET_INTERFACE,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = nl802154_get_interface,
+ .dumpit = nl802154_dump_interface,
+ /* can be retrieved by unprivileged users */
+ .internal_flags = NL802154_FLAG_NEED_WPAN_DEV |
+ NL802154_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL802154_CMD_NEW_INTERFACE,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = nl802154_new_interface,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL802154_FLAG_NEED_WPAN_PHY |
+ NL802154_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL802154_CMD_DEL_INTERFACE,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = nl802154_del_interface,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL802154_FLAG_NEED_WPAN_DEV |
+ NL802154_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL802154_CMD_SET_CHANNEL,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = nl802154_set_channel,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL802154_FLAG_NEED_WPAN_PHY |
+ NL802154_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL802154_CMD_SET_CCA_MODE,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = nl802154_set_cca_mode,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL802154_FLAG_NEED_WPAN_PHY |
+ NL802154_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL802154_CMD_SET_CCA_ED_LEVEL,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = nl802154_set_cca_ed_level,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL802154_FLAG_NEED_WPAN_PHY |
+ NL802154_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL802154_CMD_SET_TX_POWER,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = nl802154_set_tx_power,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL802154_FLAG_NEED_WPAN_PHY |
+ NL802154_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL802154_CMD_SET_WPAN_PHY_NETNS,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = nl802154_wpan_phy_netns,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL802154_FLAG_NEED_WPAN_PHY |
+ NL802154_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL802154_CMD_SET_PAN_ID,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = nl802154_set_pan_id,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL802154_FLAG_NEED_NETDEV |
+ NL802154_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL802154_CMD_SET_SHORT_ADDR,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = nl802154_set_short_addr,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL802154_FLAG_NEED_NETDEV |
+ NL802154_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL802154_CMD_SET_BACKOFF_EXPONENT,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = nl802154_set_backoff_exponent,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL802154_FLAG_NEED_NETDEV |
+ NL802154_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL802154_CMD_SET_MAX_CSMA_BACKOFFS,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = nl802154_set_max_csma_backoffs,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL802154_FLAG_NEED_NETDEV |
+ NL802154_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL802154_CMD_SET_MAX_FRAME_RETRIES,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = nl802154_set_max_frame_retries,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL802154_FLAG_NEED_NETDEV |
+ NL802154_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL802154_CMD_SET_LBT_MODE,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = nl802154_set_lbt_mode,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL802154_FLAG_NEED_NETDEV |
+ NL802154_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL802154_CMD_SET_ACKREQ_DEFAULT,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = nl802154_set_ackreq_default,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL802154_FLAG_NEED_NETDEV |
+ NL802154_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL802154_CMD_TRIGGER_SCAN,
+ .doit = nl802154_trigger_scan,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL802154_FLAG_NEED_NETDEV |
+ NL802154_FLAG_CHECK_NETDEV_UP |
+ NL802154_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL802154_CMD_ABORT_SCAN,
+ .doit = nl802154_abort_scan,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL802154_FLAG_NEED_NETDEV |
+ NL802154_FLAG_CHECK_NETDEV_UP |
+ NL802154_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL802154_CMD_SEND_BEACONS,
+ .doit = nl802154_send_beacons,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL802154_FLAG_NEED_NETDEV |
+ NL802154_FLAG_CHECK_NETDEV_UP |
+ NL802154_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL802154_CMD_STOP_BEACONS,
+ .doit = nl802154_stop_beacons,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL802154_FLAG_NEED_NETDEV |
+ NL802154_FLAG_CHECK_NETDEV_UP |
+ NL802154_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL802154_CMD_ASSOCIATE,
+ .doit = nl802154_associate,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL802154_FLAG_NEED_NETDEV |
+ NL802154_FLAG_CHECK_NETDEV_UP |
+ NL802154_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL802154_CMD_DISASSOCIATE,
+ .doit = nl802154_disassociate,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL802154_FLAG_NEED_NETDEV |
+ NL802154_FLAG_CHECK_NETDEV_UP |
+ NL802154_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL802154_CMD_SET_MAX_ASSOCIATIONS,
+ .doit = nl802154_set_max_associations,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL802154_FLAG_NEED_NETDEV |
+ NL802154_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL802154_CMD_LIST_ASSOCIATIONS,
+ .dumpit = nl802154_list_associations,
+ /* can be retrieved by unprivileged users */
+ },
+#ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL
+ {
+ .cmd = NL802154_CMD_SET_SEC_PARAMS,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = nl802154_set_llsec_params,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL802154_FLAG_NEED_NETDEV |
+ NL802154_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL802154_CMD_GET_SEC_KEY,
+ .validate = GENL_DONT_VALIDATE_STRICT |
+ GENL_DONT_VALIDATE_DUMP_STRICT,
+ /* TODO .doit by matching key id? */
+ .dumpit = nl802154_dump_llsec_key,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL802154_FLAG_NEED_NETDEV |
+ NL802154_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL802154_CMD_NEW_SEC_KEY,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = nl802154_add_llsec_key,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL802154_FLAG_NEED_NETDEV |
+ NL802154_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL802154_CMD_DEL_SEC_KEY,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = nl802154_del_llsec_key,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL802154_FLAG_NEED_NETDEV |
+ NL802154_FLAG_NEED_RTNL,
+ },
+ /* TODO unique identifier must short+pan OR extended_addr */
+ {
+ .cmd = NL802154_CMD_GET_SEC_DEV,
+ .validate = GENL_DONT_VALIDATE_STRICT |
+ GENL_DONT_VALIDATE_DUMP_STRICT,
+ /* TODO .doit by matching extended_addr? */
+ .dumpit = nl802154_dump_llsec_dev,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL802154_FLAG_NEED_NETDEV |
+ NL802154_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL802154_CMD_NEW_SEC_DEV,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = nl802154_add_llsec_dev,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL802154_FLAG_NEED_NETDEV |
+ NL802154_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL802154_CMD_DEL_SEC_DEV,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = nl802154_del_llsec_dev,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL802154_FLAG_NEED_NETDEV |
+ NL802154_FLAG_NEED_RTNL,
+ },
+ /* TODO remove complete devkey, put it as nested? */
+ {
+ .cmd = NL802154_CMD_GET_SEC_DEVKEY,
+ .validate = GENL_DONT_VALIDATE_STRICT |
+ GENL_DONT_VALIDATE_DUMP_STRICT,
+ /* TODO doit by matching ??? */
+ .dumpit = nl802154_dump_llsec_devkey,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL802154_FLAG_NEED_NETDEV |
+ NL802154_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL802154_CMD_NEW_SEC_DEVKEY,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = nl802154_add_llsec_devkey,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL802154_FLAG_NEED_NETDEV |
+ NL802154_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL802154_CMD_DEL_SEC_DEVKEY,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = nl802154_del_llsec_devkey,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL802154_FLAG_NEED_NETDEV |
+ NL802154_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL802154_CMD_GET_SEC_LEVEL,
+ .validate = GENL_DONT_VALIDATE_STRICT |
+ GENL_DONT_VALIDATE_DUMP_STRICT,
+ /* TODO .doit by matching frame_type? */
+ .dumpit = nl802154_dump_llsec_seclevel,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL802154_FLAG_NEED_NETDEV |
+ NL802154_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL802154_CMD_NEW_SEC_LEVEL,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = nl802154_add_llsec_seclevel,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL802154_FLAG_NEED_NETDEV |
+ NL802154_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL802154_CMD_DEL_SEC_LEVEL,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ /* TODO match frame_type only? */
+ .doit = nl802154_del_llsec_seclevel,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL802154_FLAG_NEED_NETDEV |
+ NL802154_FLAG_NEED_RTNL,
+ },
+#endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */
+};
+
+static struct genl_family nl802154_fam __ro_after_init = {
+ .name = NL802154_GENL_NAME, /* have users key off the name instead */
+ .hdrsize = 0, /* no private header */
+ .version = 1, /* no particular meaning now */
+ .maxattr = NL802154_ATTR_MAX,
+ .policy = nl802154_policy,
+ .netnsok = true,
+ .pre_doit = nl802154_pre_doit,
+ .post_doit = nl802154_post_doit,
+ .module = THIS_MODULE,
+ .ops = nl802154_ops,
+ .n_ops = ARRAY_SIZE(nl802154_ops),
+ .resv_start_op = NL802154_CMD_DEL_SEC_LEVEL + 1,
+ .mcgrps = nl802154_mcgrps,
+ .n_mcgrps = ARRAY_SIZE(nl802154_mcgrps),
+};
+
+/* initialisation/exit functions */
+int __init nl802154_init(void)
+{
+ return genl_register_family(&nl802154_fam);
+}
+
+void nl802154_exit(void)
+{
+ genl_unregister_family(&nl802154_fam);
+}
diff --git a/net/ieee802154/nl802154.h b/net/ieee802154/nl802154.h
new file mode 100644
index 000000000000..d69d950f9a6a
--- /dev/null
+++ b/net/ieee802154/nl802154.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __IEEE802154_NL802154_H
+#define __IEEE802154_NL802154_H
+
+int nl802154_init(void);
+void nl802154_exit(void);
+int nl802154_scan_event(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
+ struct ieee802154_coord_desc *desc);
+int nl802154_scan_started(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev);
+int nl802154_scan_done(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
+ enum nl802154_scan_done_reasons reason);
+void nl802154_beaconing_done(struct wpan_dev *wpan_dev);
+
+#endif /* __IEEE802154_NL802154_H */
diff --git a/net/ieee802154/nl_policy.c b/net/ieee802154/nl_policy.c
new file mode 100644
index 000000000000..0672b2f01586
--- /dev/null
+++ b/net/ieee802154/nl_policy.c
@@ -0,0 +1,74 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * nl802154.h
+ *
+ * Copyright (C) 2007, 2008 Siemens AG
+ */
+
+#include <linux/kernel.h>
+#include <net/netlink.h>
+#include <linux/nl802154.h>
+
+#define NLA_HW_ADDR NLA_U64
+
+const struct nla_policy ieee802154_policy[IEEE802154_ATTR_MAX + 1] = {
+ [IEEE802154_ATTR_DEV_NAME] = { .type = NLA_STRING, },
+ [IEEE802154_ATTR_DEV_INDEX] = { .type = NLA_U32, },
+ [IEEE802154_ATTR_PHY_NAME] = { .type = NLA_STRING, },
+
+ [IEEE802154_ATTR_STATUS] = { .type = NLA_U8, },
+ [IEEE802154_ATTR_SHORT_ADDR] = { .type = NLA_U16, },
+ [IEEE802154_ATTR_HW_ADDR] = { .type = NLA_HW_ADDR, },
+ [IEEE802154_ATTR_PAN_ID] = { .type = NLA_U16, },
+ [IEEE802154_ATTR_CHANNEL] = { .type = NLA_U8, },
+ [IEEE802154_ATTR_BCN_ORD] = { .type = NLA_U8, },
+ [IEEE802154_ATTR_SF_ORD] = { .type = NLA_U8, },
+ [IEEE802154_ATTR_PAN_COORD] = { .type = NLA_U8, },
+ [IEEE802154_ATTR_BAT_EXT] = { .type = NLA_U8, },
+ [IEEE802154_ATTR_COORD_REALIGN] = { .type = NLA_U8, },
+ [IEEE802154_ATTR_PAGE] = { .type = NLA_U8, },
+ [IEEE802154_ATTR_DEV_TYPE] = { .type = NLA_U8, },
+ [IEEE802154_ATTR_COORD_SHORT_ADDR] = { .type = NLA_U16, },
+ [IEEE802154_ATTR_COORD_HW_ADDR] = { .type = NLA_HW_ADDR, },
+ [IEEE802154_ATTR_COORD_PAN_ID] = { .type = NLA_U16, },
+ [IEEE802154_ATTR_SRC_SHORT_ADDR] = { .type = NLA_U16, },
+ [IEEE802154_ATTR_SRC_HW_ADDR] = { .type = NLA_HW_ADDR, },
+ [IEEE802154_ATTR_SRC_PAN_ID] = { .type = NLA_U16, },
+ [IEEE802154_ATTR_DEST_SHORT_ADDR] = { .type = NLA_U16, },
+ [IEEE802154_ATTR_DEST_HW_ADDR] = { .type = NLA_HW_ADDR, },
+ [IEEE802154_ATTR_DEST_PAN_ID] = { .type = NLA_U16, },
+
+ [IEEE802154_ATTR_CAPABILITY] = { .type = NLA_U8, },
+ [IEEE802154_ATTR_REASON] = { .type = NLA_U8, },
+ [IEEE802154_ATTR_SCAN_TYPE] = { .type = NLA_U8, },
+ [IEEE802154_ATTR_CHANNELS] = { .type = NLA_U32, },
+ [IEEE802154_ATTR_DURATION] = { .type = NLA_U8, },
+ [IEEE802154_ATTR_ED_LIST] = { .len = 27 },
+ [IEEE802154_ATTR_CHANNEL_PAGE_LIST] = { .len = 32 * 4, },
+
+ [IEEE802154_ATTR_TXPOWER] = { .type = NLA_S8, },
+ [IEEE802154_ATTR_LBT_ENABLED] = { .type = NLA_U8, },
+ [IEEE802154_ATTR_CCA_MODE] = { .type = NLA_U8, },
+ [IEEE802154_ATTR_CCA_ED_LEVEL] = { .type = NLA_S32, },
+ [IEEE802154_ATTR_CSMA_RETRIES] = { .type = NLA_U8, },
+ [IEEE802154_ATTR_CSMA_MIN_BE] = { .type = NLA_U8, },
+ [IEEE802154_ATTR_CSMA_MAX_BE] = { .type = NLA_U8, },
+
+ [IEEE802154_ATTR_FRAME_RETRIES] = { .type = NLA_S8, },
+
+ [IEEE802154_ATTR_LLSEC_ENABLED] = { .type = NLA_U8, },
+ [IEEE802154_ATTR_LLSEC_SECLEVEL] = { .type = NLA_U8, },
+ [IEEE802154_ATTR_LLSEC_KEY_MODE] = { .type = NLA_U8, },
+ [IEEE802154_ATTR_LLSEC_KEY_SOURCE_SHORT] = { .type = NLA_U32, },
+ [IEEE802154_ATTR_LLSEC_KEY_SOURCE_EXTENDED] = { .type = NLA_HW_ADDR, },
+ [IEEE802154_ATTR_LLSEC_KEY_ID] = { .type = NLA_U8, },
+ [IEEE802154_ATTR_LLSEC_FRAME_COUNTER] = { .type = NLA_U32 },
+ [IEEE802154_ATTR_LLSEC_KEY_BYTES] = { .len = 16, },
+ [IEEE802154_ATTR_LLSEC_KEY_USAGE_FRAME_TYPES] = { .type = NLA_U8, },
+ [IEEE802154_ATTR_LLSEC_KEY_USAGE_COMMANDS] = { .len = 258 / 8 },
+ [IEEE802154_ATTR_LLSEC_FRAME_TYPE] = { .type = NLA_U8, },
+ [IEEE802154_ATTR_LLSEC_CMD_FRAME_ID] = { .type = NLA_U8, },
+ [IEEE802154_ATTR_LLSEC_SECLEVELS] = { .type = NLA_U8, },
+ [IEEE802154_ATTR_LLSEC_DEV_OVERRIDE] = { .type = NLA_U8, },
+ [IEEE802154_ATTR_LLSEC_DEV_KEY_MODE] = { .type = NLA_U8, },
+};
diff --git a/net/ieee802154/pan.c b/net/ieee802154/pan.c
new file mode 100644
index 000000000000..249df7364b3e
--- /dev/null
+++ b/net/ieee802154/pan.c
@@ -0,0 +1,109 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * IEEE 802.15.4 PAN management
+ *
+ * Copyright (C) 2023 Qorvo US, Inc
+ * Authors:
+ * - David Girault <david.girault@qorvo.com>
+ * - Miquel Raynal <miquel.raynal@bootlin.com>
+ */
+
+#include <linux/kernel.h>
+#include <net/cfg802154.h>
+#include <net/af_ieee802154.h>
+
+/* Checks whether a device address matches one from the PAN list.
+ * This helper is meant to be used only during PAN management, when we expect
+ * extended addresses to be used.
+ */
+static bool cfg802154_pan_device_is_matching(struct ieee802154_pan_device *pan_dev,
+ struct ieee802154_addr *ext_dev)
+{
+ if (!pan_dev || !ext_dev)
+ return false;
+
+ if (ext_dev->mode == IEEE802154_ADDR_SHORT)
+ return false;
+
+ return pan_dev->extended_addr == ext_dev->extended_addr;
+}
+
+bool cfg802154_device_is_associated(struct wpan_dev *wpan_dev)
+{
+ bool is_assoc;
+
+ mutex_lock(&wpan_dev->association_lock);
+ is_assoc = !list_empty(&wpan_dev->children) || wpan_dev->parent;
+ mutex_unlock(&wpan_dev->association_lock);
+
+ return is_assoc;
+}
+
+bool cfg802154_device_is_parent(struct wpan_dev *wpan_dev,
+ struct ieee802154_addr *target)
+{
+ lockdep_assert_held(&wpan_dev->association_lock);
+
+ return cfg802154_pan_device_is_matching(wpan_dev->parent, target);
+}
+EXPORT_SYMBOL_GPL(cfg802154_device_is_parent);
+
+struct ieee802154_pan_device *
+cfg802154_device_is_child(struct wpan_dev *wpan_dev,
+ struct ieee802154_addr *target)
+{
+ struct ieee802154_pan_device *child;
+
+ lockdep_assert_held(&wpan_dev->association_lock);
+
+ list_for_each_entry(child, &wpan_dev->children, node)
+ if (cfg802154_pan_device_is_matching(child, target))
+ return child;
+
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(cfg802154_device_is_child);
+
+__le16 cfg802154_get_free_short_addr(struct wpan_dev *wpan_dev)
+{
+ struct ieee802154_pan_device *child;
+ __le16 addr;
+
+ lockdep_assert_held(&wpan_dev->association_lock);
+
+ do {
+ get_random_bytes(&addr, 2);
+ if (addr == cpu_to_le16(IEEE802154_ADDR_SHORT_BROADCAST) ||
+ addr == cpu_to_le16(IEEE802154_ADDR_SHORT_UNSPEC))
+ continue;
+
+ if (wpan_dev->short_addr == addr)
+ continue;
+
+ if (wpan_dev->parent && wpan_dev->parent->short_addr == addr)
+ continue;
+
+ list_for_each_entry(child, &wpan_dev->children, node)
+ if (child->short_addr == addr)
+ continue;
+
+ break;
+ } while (1);
+
+ return addr;
+}
+EXPORT_SYMBOL_GPL(cfg802154_get_free_short_addr);
+
+unsigned int cfg802154_set_max_associations(struct wpan_dev *wpan_dev,
+ unsigned int max)
+{
+ unsigned int old_max;
+
+ lockdep_assert_held(&wpan_dev->association_lock);
+
+ old_max = wpan_dev->max_associations;
+ wpan_dev->max_associations = max;
+
+ return old_max;
+}
+EXPORT_SYMBOL_GPL(cfg802154_set_max_associations);
diff --git a/net/ieee802154/rdev-ops.h b/net/ieee802154/rdev-ops.h
new file mode 100644
index 000000000000..64071ef6f57b
--- /dev/null
+++ b/net/ieee802154/rdev-ops.h
@@ -0,0 +1,407 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __CFG802154_RDEV_OPS
+#define __CFG802154_RDEV_OPS
+
+#include <net/cfg802154.h>
+
+#include "core.h"
+#include "trace.h"
+
+static inline struct net_device *
+rdev_add_virtual_intf_deprecated(struct cfg802154_registered_device *rdev,
+ const char *name,
+ unsigned char name_assign_type,
+ int type)
+{
+ return rdev->ops->add_virtual_intf_deprecated(&rdev->wpan_phy, name,
+ name_assign_type, type);
+}
+
+static inline void
+rdev_del_virtual_intf_deprecated(struct cfg802154_registered_device *rdev,
+ struct net_device *dev)
+{
+ rdev->ops->del_virtual_intf_deprecated(&rdev->wpan_phy, dev);
+}
+
+static inline int
+rdev_suspend(struct cfg802154_registered_device *rdev)
+{
+ int ret;
+ trace_802154_rdev_suspend(&rdev->wpan_phy);
+ ret = rdev->ops->suspend(&rdev->wpan_phy);
+ trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
+ return ret;
+}
+
+static inline int
+rdev_resume(struct cfg802154_registered_device *rdev)
+{
+ int ret;
+ trace_802154_rdev_resume(&rdev->wpan_phy);
+ ret = rdev->ops->resume(&rdev->wpan_phy);
+ trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
+ return ret;
+}
+
+static inline int
+rdev_add_virtual_intf(struct cfg802154_registered_device *rdev, char *name,
+ unsigned char name_assign_type,
+ enum nl802154_iftype type, __le64 extended_addr)
+{
+ int ret;
+
+ trace_802154_rdev_add_virtual_intf(&rdev->wpan_phy, name, type,
+ extended_addr);
+ ret = rdev->ops->add_virtual_intf(&rdev->wpan_phy, name,
+ name_assign_type, type,
+ extended_addr);
+ trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
+ return ret;
+}
+
+static inline int
+rdev_del_virtual_intf(struct cfg802154_registered_device *rdev,
+ struct wpan_dev *wpan_dev)
+{
+ int ret;
+
+ trace_802154_rdev_del_virtual_intf(&rdev->wpan_phy, wpan_dev);
+ ret = rdev->ops->del_virtual_intf(&rdev->wpan_phy, wpan_dev);
+ trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
+ return ret;
+}
+
+static inline int
+rdev_set_channel(struct cfg802154_registered_device *rdev, u8 page, u8 channel)
+{
+ int ret;
+
+ trace_802154_rdev_set_channel(&rdev->wpan_phy, page, channel);
+ ret = rdev->ops->set_channel(&rdev->wpan_phy, page, channel);
+ trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
+ return ret;
+}
+
+static inline int
+rdev_set_cca_mode(struct cfg802154_registered_device *rdev,
+ const struct wpan_phy_cca *cca)
+{
+ int ret;
+
+ trace_802154_rdev_set_cca_mode(&rdev->wpan_phy, cca);
+ ret = rdev->ops->set_cca_mode(&rdev->wpan_phy, cca);
+ trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
+ return ret;
+}
+
+static inline int
+rdev_set_cca_ed_level(struct cfg802154_registered_device *rdev, s32 ed_level)
+{
+ int ret;
+
+ trace_802154_rdev_set_cca_ed_level(&rdev->wpan_phy, ed_level);
+ ret = rdev->ops->set_cca_ed_level(&rdev->wpan_phy, ed_level);
+ trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
+ return ret;
+}
+
+static inline int
+rdev_set_tx_power(struct cfg802154_registered_device *rdev,
+ s32 power)
+{
+ int ret;
+
+ trace_802154_rdev_set_tx_power(&rdev->wpan_phy, power);
+ ret = rdev->ops->set_tx_power(&rdev->wpan_phy, power);
+ trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
+ return ret;
+}
+
+static inline int
+rdev_set_pan_id(struct cfg802154_registered_device *rdev,
+ struct wpan_dev *wpan_dev, __le16 pan_id)
+{
+ int ret;
+
+ trace_802154_rdev_set_pan_id(&rdev->wpan_phy, wpan_dev, pan_id);
+ ret = rdev->ops->set_pan_id(&rdev->wpan_phy, wpan_dev, pan_id);
+ trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
+ return ret;
+}
+
+static inline int
+rdev_set_short_addr(struct cfg802154_registered_device *rdev,
+ struct wpan_dev *wpan_dev, __le16 short_addr)
+{
+ int ret;
+
+ trace_802154_rdev_set_short_addr(&rdev->wpan_phy, wpan_dev, short_addr);
+ ret = rdev->ops->set_short_addr(&rdev->wpan_phy, wpan_dev, short_addr);
+ trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
+ return ret;
+}
+
+static inline int
+rdev_set_backoff_exponent(struct cfg802154_registered_device *rdev,
+ struct wpan_dev *wpan_dev, u8 min_be, u8 max_be)
+{
+ int ret;
+
+ trace_802154_rdev_set_backoff_exponent(&rdev->wpan_phy, wpan_dev,
+ min_be, max_be);
+ ret = rdev->ops->set_backoff_exponent(&rdev->wpan_phy, wpan_dev,
+ min_be, max_be);
+ trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
+ return ret;
+}
+
+static inline int
+rdev_set_max_csma_backoffs(struct cfg802154_registered_device *rdev,
+ struct wpan_dev *wpan_dev, u8 max_csma_backoffs)
+{
+ int ret;
+
+ trace_802154_rdev_set_csma_backoffs(&rdev->wpan_phy, wpan_dev,
+ max_csma_backoffs);
+ ret = rdev->ops->set_max_csma_backoffs(&rdev->wpan_phy, wpan_dev,
+ max_csma_backoffs);
+ trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
+ return ret;
+}
+
+static inline int
+rdev_set_max_frame_retries(struct cfg802154_registered_device *rdev,
+ struct wpan_dev *wpan_dev, s8 max_frame_retries)
+{
+ int ret;
+
+ trace_802154_rdev_set_max_frame_retries(&rdev->wpan_phy, wpan_dev,
+ max_frame_retries);
+ ret = rdev->ops->set_max_frame_retries(&rdev->wpan_phy, wpan_dev,
+ max_frame_retries);
+ trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
+ return ret;
+}
+
+static inline int
+rdev_set_lbt_mode(struct cfg802154_registered_device *rdev,
+ struct wpan_dev *wpan_dev, bool mode)
+{
+ int ret;
+
+ trace_802154_rdev_set_lbt_mode(&rdev->wpan_phy, wpan_dev, mode);
+ ret = rdev->ops->set_lbt_mode(&rdev->wpan_phy, wpan_dev, mode);
+ trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
+ return ret;
+}
+
+static inline int
+rdev_set_ackreq_default(struct cfg802154_registered_device *rdev,
+ struct wpan_dev *wpan_dev, bool ackreq)
+{
+ int ret;
+
+ trace_802154_rdev_set_ackreq_default(&rdev->wpan_phy, wpan_dev,
+ ackreq);
+ ret = rdev->ops->set_ackreq_default(&rdev->wpan_phy, wpan_dev, ackreq);
+ trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
+ return ret;
+}
+
+static inline int rdev_trigger_scan(struct cfg802154_registered_device *rdev,
+ struct cfg802154_scan_request *request)
+{
+ int ret;
+
+ if (!rdev->ops->trigger_scan)
+ return -EOPNOTSUPP;
+
+ trace_802154_rdev_trigger_scan(&rdev->wpan_phy, request);
+ ret = rdev->ops->trigger_scan(&rdev->wpan_phy, request);
+ trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
+ return ret;
+}
+
+static inline int rdev_abort_scan(struct cfg802154_registered_device *rdev,
+ struct wpan_dev *wpan_dev)
+{
+ int ret;
+
+ if (!rdev->ops->abort_scan)
+ return -EOPNOTSUPP;
+
+ trace_802154_rdev_abort_scan(&rdev->wpan_phy, wpan_dev);
+ ret = rdev->ops->abort_scan(&rdev->wpan_phy, wpan_dev);
+ trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
+ return ret;
+}
+
+static inline int rdev_send_beacons(struct cfg802154_registered_device *rdev,
+ struct cfg802154_beacon_request *request)
+{
+ int ret;
+
+ if (!rdev->ops->send_beacons)
+ return -EOPNOTSUPP;
+
+ trace_802154_rdev_send_beacons(&rdev->wpan_phy, request);
+ ret = rdev->ops->send_beacons(&rdev->wpan_phy, request);
+ trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
+ return ret;
+}
+
+static inline int rdev_stop_beacons(struct cfg802154_registered_device *rdev,
+ struct wpan_dev *wpan_dev)
+{
+ int ret;
+
+ if (!rdev->ops->stop_beacons)
+ return -EOPNOTSUPP;
+
+ trace_802154_rdev_stop_beacons(&rdev->wpan_phy, wpan_dev);
+ ret = rdev->ops->stop_beacons(&rdev->wpan_phy, wpan_dev);
+ trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
+ return ret;
+}
+
+static inline int rdev_associate(struct cfg802154_registered_device *rdev,
+ struct wpan_dev *wpan_dev,
+ struct ieee802154_addr *coord)
+{
+ int ret;
+
+ if (!rdev->ops->associate)
+ return -EOPNOTSUPP;
+
+ trace_802154_rdev_associate(&rdev->wpan_phy, wpan_dev, coord);
+ ret = rdev->ops->associate(&rdev->wpan_phy, wpan_dev, coord);
+ trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
+ return ret;
+}
+
+static inline int rdev_disassociate(struct cfg802154_registered_device *rdev,
+ struct wpan_dev *wpan_dev,
+ struct ieee802154_addr *target)
+{
+ int ret;
+
+ if (!rdev->ops->disassociate)
+ return -EOPNOTSUPP;
+
+ trace_802154_rdev_disassociate(&rdev->wpan_phy, wpan_dev, target);
+ ret = rdev->ops->disassociate(&rdev->wpan_phy, wpan_dev, target);
+ trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
+ return ret;
+}
+
+#ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL
+/* TODO this is already a nl802154, so move into ieee802154 */
+static inline void
+rdev_get_llsec_table(struct cfg802154_registered_device *rdev,
+ struct wpan_dev *wpan_dev,
+ struct ieee802154_llsec_table **table)
+{
+ rdev->ops->get_llsec_table(&rdev->wpan_phy, wpan_dev, table);
+}
+
+static inline void
+rdev_lock_llsec_table(struct cfg802154_registered_device *rdev,
+ struct wpan_dev *wpan_dev)
+{
+ rdev->ops->lock_llsec_table(&rdev->wpan_phy, wpan_dev);
+}
+
+static inline void
+rdev_unlock_llsec_table(struct cfg802154_registered_device *rdev,
+ struct wpan_dev *wpan_dev)
+{
+ rdev->ops->unlock_llsec_table(&rdev->wpan_phy, wpan_dev);
+}
+
+static inline int
+rdev_get_llsec_params(struct cfg802154_registered_device *rdev,
+ struct wpan_dev *wpan_dev,
+ struct ieee802154_llsec_params *params)
+{
+ return rdev->ops->get_llsec_params(&rdev->wpan_phy, wpan_dev, params);
+}
+
+static inline int
+rdev_set_llsec_params(struct cfg802154_registered_device *rdev,
+ struct wpan_dev *wpan_dev,
+ const struct ieee802154_llsec_params *params,
+ u32 changed)
+{
+ return rdev->ops->set_llsec_params(&rdev->wpan_phy, wpan_dev, params,
+ changed);
+}
+
+static inline int
+rdev_add_llsec_key(struct cfg802154_registered_device *rdev,
+ struct wpan_dev *wpan_dev,
+ const struct ieee802154_llsec_key_id *id,
+ const struct ieee802154_llsec_key *key)
+{
+ return rdev->ops->add_llsec_key(&rdev->wpan_phy, wpan_dev, id, key);
+}
+
+static inline int
+rdev_del_llsec_key(struct cfg802154_registered_device *rdev,
+ struct wpan_dev *wpan_dev,
+ const struct ieee802154_llsec_key_id *id)
+{
+ return rdev->ops->del_llsec_key(&rdev->wpan_phy, wpan_dev, id);
+}
+
+static inline int
+rdev_add_seclevel(struct cfg802154_registered_device *rdev,
+ struct wpan_dev *wpan_dev,
+ const struct ieee802154_llsec_seclevel *sl)
+{
+ return rdev->ops->add_seclevel(&rdev->wpan_phy, wpan_dev, sl);
+}
+
+static inline int
+rdev_del_seclevel(struct cfg802154_registered_device *rdev,
+ struct wpan_dev *wpan_dev,
+ const struct ieee802154_llsec_seclevel *sl)
+{
+ return rdev->ops->del_seclevel(&rdev->wpan_phy, wpan_dev, sl);
+}
+
+static inline int
+rdev_add_device(struct cfg802154_registered_device *rdev,
+ struct wpan_dev *wpan_dev,
+ const struct ieee802154_llsec_device *dev_desc)
+{
+ return rdev->ops->add_device(&rdev->wpan_phy, wpan_dev, dev_desc);
+}
+
+static inline int
+rdev_del_device(struct cfg802154_registered_device *rdev,
+ struct wpan_dev *wpan_dev, __le64 extended_addr)
+{
+ return rdev->ops->del_device(&rdev->wpan_phy, wpan_dev, extended_addr);
+}
+
+static inline int
+rdev_add_devkey(struct cfg802154_registered_device *rdev,
+ struct wpan_dev *wpan_dev, __le64 extended_addr,
+ const struct ieee802154_llsec_device_key *devkey)
+{
+ return rdev->ops->add_devkey(&rdev->wpan_phy, wpan_dev, extended_addr,
+ devkey);
+}
+
+static inline int
+rdev_del_devkey(struct cfg802154_registered_device *rdev,
+ struct wpan_dev *wpan_dev, __le64 extended_addr,
+ const struct ieee802154_llsec_device_key *devkey)
+{
+ return rdev->ops->del_devkey(&rdev->wpan_phy, wpan_dev, extended_addr,
+ devkey);
+}
+#endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */
+
+#endif /* __CFG802154_RDEV_OPS */
diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c
new file mode 100644
index 000000000000..e542fbe113e7
--- /dev/null
+++ b/net/ieee802154/socket.c
@@ -0,0 +1,1143 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * IEEE802154.4 socket interface
+ *
+ * Copyright 2007, 2008 Siemens AG
+ *
+ * Written by:
+ * Sergey Lapin <slapin@ossfans.org>
+ * Maxim Gorbachyov <maxim.gorbachev@siemens.com>
+ */
+
+#include <linux/net.h>
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/if_arp.h>
+#include <linux/if.h>
+#include <linux/termios.h> /* For TIOCOUTQ/INQ */
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/socket.h>
+#include <net/datalink.h>
+#include <net/psnap.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <net/route.h>
+
+#include <net/af_ieee802154.h>
+#include <net/ieee802154_netdev.h>
+
+/* Utility function for families */
+static struct net_device*
+ieee802154_get_dev(struct net *net, const struct ieee802154_addr *addr)
+{
+ struct net_device *dev = NULL;
+ struct net_device *tmp;
+ __le16 pan_id, short_addr;
+ u8 hwaddr[IEEE802154_ADDR_LEN];
+
+ switch (addr->mode) {
+ case IEEE802154_ADDR_LONG:
+ ieee802154_devaddr_to_raw(hwaddr, addr->extended_addr);
+ rcu_read_lock();
+ dev = dev_getbyhwaddr_rcu(net, ARPHRD_IEEE802154, hwaddr);
+ dev_hold(dev);
+ rcu_read_unlock();
+ break;
+ case IEEE802154_ADDR_SHORT:
+ if (addr->pan_id == cpu_to_le16(IEEE802154_PANID_BROADCAST) ||
+ addr->short_addr == cpu_to_le16(IEEE802154_ADDR_UNDEF) ||
+ addr->short_addr == cpu_to_le16(IEEE802154_ADDR_BROADCAST))
+ break;
+
+ rtnl_lock();
+
+ for_each_netdev(net, tmp) {
+ if (tmp->type != ARPHRD_IEEE802154)
+ continue;
+
+ pan_id = tmp->ieee802154_ptr->pan_id;
+ short_addr = tmp->ieee802154_ptr->short_addr;
+ if (pan_id == addr->pan_id &&
+ short_addr == addr->short_addr) {
+ dev = tmp;
+ dev_hold(dev);
+ break;
+ }
+ }
+
+ rtnl_unlock();
+ break;
+ default:
+ pr_warn("Unsupported ieee802154 address type: %d\n",
+ addr->mode);
+ break;
+ }
+
+ return dev;
+}
+
+static int ieee802154_sock_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+
+ if (sk) {
+ sock->sk = NULL;
+ sk->sk_prot->close(sk, 0);
+ }
+ return 0;
+}
+
+static int ieee802154_sock_sendmsg(struct socket *sock, struct msghdr *msg,
+ size_t len)
+{
+ struct sock *sk = sock->sk;
+
+ return sk->sk_prot->sendmsg(sk, msg, len);
+}
+
+static int ieee802154_sock_bind(struct socket *sock, struct sockaddr_unsized *uaddr,
+ int addr_len)
+{
+ struct sock *sk = sock->sk;
+
+ if (sk->sk_prot->bind)
+ return sk->sk_prot->bind(sk, uaddr, addr_len);
+
+ return sock_no_bind(sock, uaddr, addr_len);
+}
+
+static int ieee802154_sock_connect(struct socket *sock, struct sockaddr_unsized *uaddr,
+ int addr_len, int flags)
+{
+ struct sock *sk = sock->sk;
+
+ if (addr_len < sizeof(uaddr->sa_family))
+ return -EINVAL;
+
+ if (uaddr->sa_family == AF_UNSPEC)
+ return sk->sk_prot->disconnect(sk, flags);
+
+ return sk->sk_prot->connect(sk, uaddr, addr_len);
+}
+
+static int ieee802154_dev_ioctl(struct sock *sk, struct ifreq __user *arg,
+ unsigned int cmd)
+{
+ struct ifreq ifr;
+ int ret = -ENOIOCTLCMD;
+ struct net_device *dev;
+
+ if (get_user_ifreq(&ifr, NULL, arg))
+ return -EFAULT;
+
+ ifr.ifr_name[IFNAMSIZ-1] = 0;
+
+ dev_load(sock_net(sk), ifr.ifr_name);
+ dev = dev_get_by_name(sock_net(sk), ifr.ifr_name);
+
+ if (!dev)
+ return -ENODEV;
+
+ if (dev->type == ARPHRD_IEEE802154 && dev->netdev_ops->ndo_do_ioctl)
+ ret = dev->netdev_ops->ndo_do_ioctl(dev, &ifr, cmd);
+
+ if (!ret && put_user_ifreq(&ifr, arg))
+ ret = -EFAULT;
+ dev_put(dev);
+
+ return ret;
+}
+
+static int ieee802154_sock_ioctl(struct socket *sock, unsigned int cmd,
+ unsigned long arg)
+{
+ struct sock *sk = sock->sk;
+
+ switch (cmd) {
+ case SIOCGIFADDR:
+ case SIOCSIFADDR:
+ return ieee802154_dev_ioctl(sk, (struct ifreq __user *)arg,
+ cmd);
+ default:
+ if (!sk->sk_prot->ioctl)
+ return -ENOIOCTLCMD;
+ return sk_ioctl(sk, cmd, (void __user *)arg);
+ }
+}
+
+/* RAW Sockets (802.15.4 created in userspace) */
+static HLIST_HEAD(raw_head);
+static DEFINE_RWLOCK(raw_lock);
+
+static int raw_hash(struct sock *sk)
+{
+ write_lock_bh(&raw_lock);
+ sk_add_node(sk, &raw_head);
+ write_unlock_bh(&raw_lock);
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+
+ return 0;
+}
+
+static void raw_unhash(struct sock *sk)
+{
+ write_lock_bh(&raw_lock);
+ if (sk_del_node_init(sk))
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+ write_unlock_bh(&raw_lock);
+}
+
+static void raw_close(struct sock *sk, long timeout)
+{
+ sk_common_release(sk);
+}
+
+static int raw_bind(struct sock *sk, struct sockaddr_unsized *_uaddr, int len)
+{
+ struct ieee802154_addr addr;
+ struct sockaddr_ieee802154 *uaddr = (struct sockaddr_ieee802154 *)_uaddr;
+ int err = 0;
+ struct net_device *dev = NULL;
+
+ err = ieee802154_sockaddr_check_size(uaddr, len);
+ if (err < 0)
+ return err;
+
+ uaddr = (struct sockaddr_ieee802154 *)_uaddr;
+ if (uaddr->family != AF_IEEE802154)
+ return -EINVAL;
+
+ lock_sock(sk);
+
+ ieee802154_addr_from_sa(&addr, &uaddr->addr);
+ dev = ieee802154_get_dev(sock_net(sk), &addr);
+ if (!dev) {
+ err = -ENODEV;
+ goto out;
+ }
+
+ sk->sk_bound_dev_if = dev->ifindex;
+ sk_dst_reset(sk);
+
+ dev_put(dev);
+out:
+ release_sock(sk);
+
+ return err;
+}
+
+static int raw_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
+ int addr_len)
+{
+ return -ENOTSUPP;
+}
+
+static int raw_disconnect(struct sock *sk, int flags)
+{
+ return 0;
+}
+
+static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
+{
+ struct net_device *dev;
+ unsigned int mtu;
+ struct sk_buff *skb;
+ int hlen, tlen;
+ int err;
+
+ if (msg->msg_flags & MSG_OOB) {
+ pr_debug("msg->msg_flags = 0x%x\n", msg->msg_flags);
+ return -EOPNOTSUPP;
+ }
+
+ lock_sock(sk);
+ if (!sk->sk_bound_dev_if)
+ dev = dev_getfirstbyhwtype(sock_net(sk), ARPHRD_IEEE802154);
+ else
+ dev = dev_get_by_index(sock_net(sk), sk->sk_bound_dev_if);
+ release_sock(sk);
+
+ if (!dev) {
+ pr_debug("no dev\n");
+ err = -ENXIO;
+ goto out;
+ }
+
+ mtu = IEEE802154_MTU;
+ pr_debug("name = %s, mtu = %u\n", dev->name, mtu);
+
+ if (size > mtu) {
+ pr_debug("size = %zu, mtu = %u\n", size, mtu);
+ err = -EMSGSIZE;
+ goto out_dev;
+ }
+ if (!size) {
+ err = 0;
+ goto out_dev;
+ }
+
+ hlen = LL_RESERVED_SPACE(dev);
+ tlen = dev->needed_tailroom;
+ skb = sock_alloc_send_skb(sk, hlen + tlen + size,
+ msg->msg_flags & MSG_DONTWAIT, &err);
+ if (!skb)
+ goto out_dev;
+
+ skb_reserve(skb, hlen);
+
+ skb_reset_mac_header(skb);
+ skb_reset_network_header(skb);
+
+ err = memcpy_from_msg(skb_put(skb, size), msg, size);
+ if (err < 0)
+ goto out_skb;
+
+ skb->dev = dev;
+ skb->protocol = htons(ETH_P_IEEE802154);
+
+ err = dev_queue_xmit(skb);
+ if (err > 0)
+ err = net_xmit_errno(err);
+
+ dev_put(dev);
+
+ return err ?: size;
+
+out_skb:
+ kfree_skb(skb);
+out_dev:
+ dev_put(dev);
+out:
+ return err;
+}
+
+static int raw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+ int flags, int *addr_len)
+{
+ size_t copied = 0;
+ int err = -EOPNOTSUPP;
+ struct sk_buff *skb;
+
+ skb = skb_recv_datagram(sk, flags, &err);
+ if (!skb)
+ goto out;
+
+ copied = skb->len;
+ if (len < copied) {
+ msg->msg_flags |= MSG_TRUNC;
+ copied = len;
+ }
+
+ err = skb_copy_datagram_msg(skb, 0, msg, copied);
+ if (err)
+ goto done;
+
+ sock_recv_cmsgs(msg, sk, skb);
+
+ if (flags & MSG_TRUNC)
+ copied = skb->len;
+done:
+ skb_free_datagram(sk, skb);
+out:
+ if (err)
+ return err;
+ return copied;
+}
+
+static int raw_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+ skb = skb_share_check(skb, GFP_ATOMIC);
+ if (!skb)
+ return NET_RX_DROP;
+
+ if (sock_queue_rcv_skb(sk, skb) < 0) {
+ kfree_skb(skb);
+ return NET_RX_DROP;
+ }
+
+ return NET_RX_SUCCESS;
+}
+
+static void ieee802154_raw_deliver(struct net_device *dev, struct sk_buff *skb)
+{
+ struct sock *sk;
+
+ read_lock(&raw_lock);
+ sk_for_each(sk, &raw_head) {
+ bh_lock_sock(sk);
+ if (!sk->sk_bound_dev_if ||
+ sk->sk_bound_dev_if == dev->ifindex) {
+ struct sk_buff *clone;
+
+ clone = skb_clone(skb, GFP_ATOMIC);
+ if (clone)
+ raw_rcv_skb(sk, clone);
+ }
+ bh_unlock_sock(sk);
+ }
+ read_unlock(&raw_lock);
+}
+
+static int raw_getsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ return -EOPNOTSUPP;
+}
+
+static int raw_setsockopt(struct sock *sk, int level, int optname,
+ sockptr_t optval, unsigned int optlen)
+{
+ return -EOPNOTSUPP;
+}
+
+static struct proto ieee802154_raw_prot = {
+ .name = "IEEE-802.15.4-RAW",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof(struct sock),
+ .close = raw_close,
+ .bind = raw_bind,
+ .sendmsg = raw_sendmsg,
+ .recvmsg = raw_recvmsg,
+ .hash = raw_hash,
+ .unhash = raw_unhash,
+ .connect = raw_connect,
+ .disconnect = raw_disconnect,
+ .getsockopt = raw_getsockopt,
+ .setsockopt = raw_setsockopt,
+};
+
+static const struct proto_ops ieee802154_raw_ops = {
+ .family = PF_IEEE802154,
+ .owner = THIS_MODULE,
+ .release = ieee802154_sock_release,
+ .bind = ieee802154_sock_bind,
+ .connect = ieee802154_sock_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .getname = sock_no_getname,
+ .poll = datagram_poll,
+ .ioctl = ieee802154_sock_ioctl,
+ .gettstamp = sock_gettstamp,
+ .listen = sock_no_listen,
+ .shutdown = sock_no_shutdown,
+ .setsockopt = sock_common_setsockopt,
+ .getsockopt = sock_common_getsockopt,
+ .sendmsg = ieee802154_sock_sendmsg,
+ .recvmsg = sock_common_recvmsg,
+ .mmap = sock_no_mmap,
+};
+
+/* DGRAM Sockets (802.15.4 dataframes) */
+static HLIST_HEAD(dgram_head);
+static DEFINE_RWLOCK(dgram_lock);
+
+struct dgram_sock {
+ struct sock sk;
+
+ struct ieee802154_addr src_addr;
+ struct ieee802154_addr dst_addr;
+
+ unsigned int bound:1;
+ unsigned int connected:1;
+ unsigned int want_ack:1;
+ unsigned int want_lqi:1;
+ unsigned int secen:1;
+ unsigned int secen_override:1;
+ unsigned int seclevel:3;
+ unsigned int seclevel_override:1;
+};
+
+static inline struct dgram_sock *dgram_sk(const struct sock *sk)
+{
+ return container_of(sk, struct dgram_sock, sk);
+}
+
+static int dgram_hash(struct sock *sk)
+{
+ write_lock_bh(&dgram_lock);
+ sk_add_node(sk, &dgram_head);
+ write_unlock_bh(&dgram_lock);
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+
+ return 0;
+}
+
+static void dgram_unhash(struct sock *sk)
+{
+ write_lock_bh(&dgram_lock);
+ if (sk_del_node_init(sk))
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+ write_unlock_bh(&dgram_lock);
+}
+
+static int dgram_init(struct sock *sk)
+{
+ struct dgram_sock *ro = dgram_sk(sk);
+
+ ro->want_ack = 1;
+ ro->want_lqi = 0;
+ return 0;
+}
+
+static void dgram_close(struct sock *sk, long timeout)
+{
+ sk_common_release(sk);
+}
+
+static int dgram_bind(struct sock *sk, struct sockaddr_unsized *uaddr, int len)
+{
+ struct sockaddr_ieee802154 *addr = (struct sockaddr_ieee802154 *)uaddr;
+ struct ieee802154_addr haddr;
+ struct dgram_sock *ro = dgram_sk(sk);
+ int err = -EINVAL;
+ struct net_device *dev;
+
+ lock_sock(sk);
+
+ ro->bound = 0;
+
+ err = ieee802154_sockaddr_check_size(addr, len);
+ if (err < 0)
+ goto out;
+
+ if (addr->family != AF_IEEE802154) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ ieee802154_addr_from_sa(&haddr, &addr->addr);
+ dev = ieee802154_get_dev(sock_net(sk), &haddr);
+ if (!dev) {
+ err = -ENODEV;
+ goto out;
+ }
+
+ if (dev->type != ARPHRD_IEEE802154) {
+ err = -ENODEV;
+ goto out_put;
+ }
+
+ ro->src_addr = haddr;
+
+ ro->bound = 1;
+ err = 0;
+out_put:
+ dev_put(dev);
+out:
+ release_sock(sk);
+
+ return err;
+}
+
+static int dgram_ioctl(struct sock *sk, int cmd, int *karg)
+{
+ switch (cmd) {
+ case SIOCOUTQ:
+ {
+ *karg = sk_wmem_alloc_get(sk);
+
+ return 0;
+ }
+
+ case SIOCINQ:
+ {
+ struct sk_buff *skb;
+
+ *karg = 0;
+ spin_lock_bh(&sk->sk_receive_queue.lock);
+ skb = skb_peek(&sk->sk_receive_queue);
+ if (skb) {
+ /* We will only return the amount
+ * of this packet since that is all
+ * that will be read.
+ */
+ *karg = skb->len - ieee802154_hdr_length(skb);
+ }
+ spin_unlock_bh(&sk->sk_receive_queue.lock);
+ return 0;
+ }
+ }
+
+ return -ENOIOCTLCMD;
+}
+
+/* FIXME: autobind */
+static int dgram_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
+ int len)
+{
+ struct sockaddr_ieee802154 *addr = (struct sockaddr_ieee802154 *)uaddr;
+ struct dgram_sock *ro = dgram_sk(sk);
+ int err = 0;
+
+ err = ieee802154_sockaddr_check_size(addr, len);
+ if (err < 0)
+ return err;
+
+ if (addr->family != AF_IEEE802154)
+ return -EINVAL;
+
+ lock_sock(sk);
+
+ if (!ro->bound) {
+ err = -ENETUNREACH;
+ goto out;
+ }
+
+ ieee802154_addr_from_sa(&ro->dst_addr, &addr->addr);
+ ro->connected = 1;
+
+out:
+ release_sock(sk);
+ return err;
+}
+
+static int dgram_disconnect(struct sock *sk, int flags)
+{
+ struct dgram_sock *ro = dgram_sk(sk);
+
+ lock_sock(sk);
+ ro->connected = 0;
+ release_sock(sk);
+
+ return 0;
+}
+
+static int dgram_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
+{
+ struct net_device *dev;
+ unsigned int mtu;
+ struct sk_buff *skb;
+ struct ieee802154_mac_cb *cb;
+ struct dgram_sock *ro = dgram_sk(sk);
+ struct ieee802154_addr dst_addr;
+ DECLARE_SOCKADDR(struct sockaddr_ieee802154*, daddr, msg->msg_name);
+ int hlen, tlen;
+ int err;
+
+ if (msg->msg_flags & MSG_OOB) {
+ pr_debug("msg->msg_flags = 0x%x\n", msg->msg_flags);
+ return -EOPNOTSUPP;
+ }
+
+ if (msg->msg_name) {
+ if (ro->connected)
+ return -EISCONN;
+ if (msg->msg_namelen < IEEE802154_MIN_NAMELEN)
+ return -EINVAL;
+ err = ieee802154_sockaddr_check_size(daddr, msg->msg_namelen);
+ if (err < 0)
+ return err;
+ ieee802154_addr_from_sa(&dst_addr, &daddr->addr);
+ } else {
+ if (!ro->connected)
+ return -EDESTADDRREQ;
+ dst_addr = ro->dst_addr;
+ }
+
+ if (!ro->bound)
+ dev = dev_getfirstbyhwtype(sock_net(sk), ARPHRD_IEEE802154);
+ else
+ dev = ieee802154_get_dev(sock_net(sk), &ro->src_addr);
+
+ if (!dev) {
+ pr_debug("no dev\n");
+ err = -ENXIO;
+ goto out;
+ }
+ mtu = IEEE802154_MTU;
+ pr_debug("name = %s, mtu = %u\n", dev->name, mtu);
+
+ if (size > mtu) {
+ pr_debug("size = %zu, mtu = %u\n", size, mtu);
+ err = -EMSGSIZE;
+ goto out_dev;
+ }
+
+ hlen = LL_RESERVED_SPACE(dev);
+ tlen = dev->needed_tailroom;
+ skb = sock_alloc_send_skb(sk, hlen + tlen + size,
+ msg->msg_flags & MSG_DONTWAIT,
+ &err);
+ if (!skb)
+ goto out_dev;
+
+ skb_reserve(skb, hlen);
+
+ skb_reset_network_header(skb);
+
+ cb = mac_cb_init(skb);
+ cb->type = IEEE802154_FC_TYPE_DATA;
+ cb->ackreq = ro->want_ack;
+ cb->secen = ro->secen;
+ cb->secen_override = ro->secen_override;
+ cb->seclevel = ro->seclevel;
+ cb->seclevel_override = ro->seclevel_override;
+
+ err = wpan_dev_hard_header(skb, dev, &dst_addr,
+ ro->bound ? &ro->src_addr : NULL, size);
+ if (err < 0)
+ goto out_skb;
+
+ err = memcpy_from_msg(skb_put(skb, size), msg, size);
+ if (err < 0)
+ goto out_skb;
+
+ skb->dev = dev;
+ skb->protocol = htons(ETH_P_IEEE802154);
+
+ err = dev_queue_xmit(skb);
+ if (err > 0)
+ err = net_xmit_errno(err);
+
+ dev_put(dev);
+
+ return err ?: size;
+
+out_skb:
+ kfree_skb(skb);
+out_dev:
+ dev_put(dev);
+out:
+ return err;
+}
+
+static int dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+ int flags, int *addr_len)
+{
+ size_t copied = 0;
+ int err = -EOPNOTSUPP;
+ struct sk_buff *skb;
+ struct dgram_sock *ro = dgram_sk(sk);
+ DECLARE_SOCKADDR(struct sockaddr_ieee802154 *, saddr, msg->msg_name);
+
+ skb = skb_recv_datagram(sk, flags, &err);
+ if (!skb)
+ goto out;
+
+ copied = skb->len;
+ if (len < copied) {
+ msg->msg_flags |= MSG_TRUNC;
+ copied = len;
+ }
+
+ /* FIXME: skip headers if necessary ?! */
+ err = skb_copy_datagram_msg(skb, 0, msg, copied);
+ if (err)
+ goto done;
+
+ sock_recv_cmsgs(msg, sk, skb);
+
+ if (saddr) {
+ /* Clear the implicit padding in struct sockaddr_ieee802154
+ * (16 bits between 'family' and 'addr') and in struct
+ * ieee802154_addr_sa (16 bits at the end of the structure).
+ */
+ memset(saddr, 0, sizeof(*saddr));
+
+ saddr->family = AF_IEEE802154;
+ ieee802154_addr_to_sa(&saddr->addr, &mac_cb(skb)->source);
+ *addr_len = sizeof(*saddr);
+ }
+
+ if (ro->want_lqi) {
+ err = put_cmsg(msg, SOL_IEEE802154, WPAN_WANTLQI,
+ sizeof(uint8_t), &(mac_cb(skb)->lqi));
+ if (err)
+ goto done;
+ }
+
+ if (flags & MSG_TRUNC)
+ copied = skb->len;
+done:
+ skb_free_datagram(sk, skb);
+out:
+ if (err)
+ return err;
+ return copied;
+}
+
+static int dgram_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+ skb = skb_share_check(skb, GFP_ATOMIC);
+ if (!skb)
+ return NET_RX_DROP;
+
+ if (sock_queue_rcv_skb(sk, skb) < 0) {
+ kfree_skb(skb);
+ return NET_RX_DROP;
+ }
+
+ return NET_RX_SUCCESS;
+}
+
+static inline bool
+ieee802154_match_sock(__le64 hw_addr, __le16 pan_id, __le16 short_addr,
+ struct dgram_sock *ro)
+{
+ if (!ro->bound)
+ return true;
+
+ if (ro->src_addr.mode == IEEE802154_ADDR_LONG &&
+ hw_addr == ro->src_addr.extended_addr)
+ return true;
+
+ if (ro->src_addr.mode == IEEE802154_ADDR_SHORT &&
+ pan_id == ro->src_addr.pan_id &&
+ short_addr == ro->src_addr.short_addr)
+ return true;
+
+ return false;
+}
+
+static int ieee802154_dgram_deliver(struct net_device *dev, struct sk_buff *skb)
+{
+ struct sock *sk, *prev = NULL;
+ int ret = NET_RX_SUCCESS;
+ __le16 pan_id, short_addr;
+ __le64 hw_addr;
+
+ /* Data frame processing */
+ BUG_ON(dev->type != ARPHRD_IEEE802154);
+
+ pan_id = dev->ieee802154_ptr->pan_id;
+ short_addr = dev->ieee802154_ptr->short_addr;
+ hw_addr = dev->ieee802154_ptr->extended_addr;
+
+ read_lock(&dgram_lock);
+ sk_for_each(sk, &dgram_head) {
+ if (ieee802154_match_sock(hw_addr, pan_id, short_addr,
+ dgram_sk(sk))) {
+ if (prev) {
+ struct sk_buff *clone;
+
+ clone = skb_clone(skb, GFP_ATOMIC);
+ if (clone)
+ dgram_rcv_skb(prev, clone);
+ }
+
+ prev = sk;
+ }
+ }
+
+ if (prev) {
+ dgram_rcv_skb(prev, skb);
+ } else {
+ kfree_skb(skb);
+ ret = NET_RX_DROP;
+ }
+ read_unlock(&dgram_lock);
+
+ return ret;
+}
+
+static int dgram_getsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ struct dgram_sock *ro = dgram_sk(sk);
+
+ int val, len;
+
+ if (level != SOL_IEEE802154)
+ return -EOPNOTSUPP;
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ len = min_t(unsigned int, len, sizeof(int));
+
+ switch (optname) {
+ case WPAN_WANTACK:
+ val = ro->want_ack;
+ break;
+ case WPAN_WANTLQI:
+ val = ro->want_lqi;
+ break;
+ case WPAN_SECURITY:
+ if (!ro->secen_override)
+ val = WPAN_SECURITY_DEFAULT;
+ else if (ro->secen)
+ val = WPAN_SECURITY_ON;
+ else
+ val = WPAN_SECURITY_OFF;
+ break;
+ case WPAN_SECURITY_LEVEL:
+ if (!ro->seclevel_override)
+ val = WPAN_SECURITY_LEVEL_DEFAULT;
+ else
+ val = ro->seclevel;
+ break;
+ default:
+ return -ENOPROTOOPT;
+ }
+
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, &val, len))
+ return -EFAULT;
+ return 0;
+}
+
+static int dgram_setsockopt(struct sock *sk, int level, int optname,
+ sockptr_t optval, unsigned int optlen)
+{
+ struct dgram_sock *ro = dgram_sk(sk);
+ struct net *net = sock_net(sk);
+ int val;
+ int err = 0;
+
+ if (optlen < sizeof(int))
+ return -EINVAL;
+
+ if (copy_from_sockptr(&val, optval, sizeof(int)))
+ return -EFAULT;
+
+ lock_sock(sk);
+
+ switch (optname) {
+ case WPAN_WANTACK:
+ ro->want_ack = !!val;
+ break;
+ case WPAN_WANTLQI:
+ ro->want_lqi = !!val;
+ break;
+ case WPAN_SECURITY:
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN) &&
+ !ns_capable(net->user_ns, CAP_NET_RAW)) {
+ err = -EPERM;
+ break;
+ }
+
+ switch (val) {
+ case WPAN_SECURITY_DEFAULT:
+ ro->secen_override = 0;
+ break;
+ case WPAN_SECURITY_ON:
+ ro->secen_override = 1;
+ ro->secen = 1;
+ break;
+ case WPAN_SECURITY_OFF:
+ ro->secen_override = 1;
+ ro->secen = 0;
+ break;
+ default:
+ err = -EINVAL;
+ break;
+ }
+ break;
+ case WPAN_SECURITY_LEVEL:
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN) &&
+ !ns_capable(net->user_ns, CAP_NET_RAW)) {
+ err = -EPERM;
+ break;
+ }
+
+ if (val < WPAN_SECURITY_LEVEL_DEFAULT ||
+ val > IEEE802154_SCF_SECLEVEL_ENC_MIC128) {
+ err = -EINVAL;
+ } else if (val == WPAN_SECURITY_LEVEL_DEFAULT) {
+ ro->seclevel_override = 0;
+ } else {
+ ro->seclevel_override = 1;
+ ro->seclevel = val;
+ }
+ break;
+ default:
+ err = -ENOPROTOOPT;
+ break;
+ }
+
+ release_sock(sk);
+ return err;
+}
+
+static struct proto ieee802154_dgram_prot = {
+ .name = "IEEE-802.15.4-MAC",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof(struct dgram_sock),
+ .init = dgram_init,
+ .close = dgram_close,
+ .bind = dgram_bind,
+ .sendmsg = dgram_sendmsg,
+ .recvmsg = dgram_recvmsg,
+ .hash = dgram_hash,
+ .unhash = dgram_unhash,
+ .connect = dgram_connect,
+ .disconnect = dgram_disconnect,
+ .ioctl = dgram_ioctl,
+ .getsockopt = dgram_getsockopt,
+ .setsockopt = dgram_setsockopt,
+};
+
+static const struct proto_ops ieee802154_dgram_ops = {
+ .family = PF_IEEE802154,
+ .owner = THIS_MODULE,
+ .release = ieee802154_sock_release,
+ .bind = ieee802154_sock_bind,
+ .connect = ieee802154_sock_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .getname = sock_no_getname,
+ .poll = datagram_poll,
+ .ioctl = ieee802154_sock_ioctl,
+ .gettstamp = sock_gettstamp,
+ .listen = sock_no_listen,
+ .shutdown = sock_no_shutdown,
+ .setsockopt = sock_common_setsockopt,
+ .getsockopt = sock_common_getsockopt,
+ .sendmsg = ieee802154_sock_sendmsg,
+ .recvmsg = sock_common_recvmsg,
+ .mmap = sock_no_mmap,
+};
+
+static void ieee802154_sock_destruct(struct sock *sk)
+{
+ skb_queue_purge(&sk->sk_receive_queue);
+}
+
+/* Create a socket. Initialise the socket, blank the addresses
+ * set the state.
+ */
+static int ieee802154_create(struct net *net, struct socket *sock,
+ int protocol, int kern)
+{
+ struct sock *sk;
+ int rc;
+ struct proto *proto;
+ const struct proto_ops *ops;
+
+ if (!net_eq(net, &init_net))
+ return -EAFNOSUPPORT;
+
+ switch (sock->type) {
+ case SOCK_RAW:
+ rc = -EPERM;
+ if (!capable(CAP_NET_RAW))
+ goto out;
+ proto = &ieee802154_raw_prot;
+ ops = &ieee802154_raw_ops;
+ break;
+ case SOCK_DGRAM:
+ proto = &ieee802154_dgram_prot;
+ ops = &ieee802154_dgram_ops;
+ break;
+ default:
+ rc = -ESOCKTNOSUPPORT;
+ goto out;
+ }
+
+ rc = -ENOMEM;
+ sk = sk_alloc(net, PF_IEEE802154, GFP_KERNEL, proto, kern);
+ if (!sk)
+ goto out;
+ rc = 0;
+
+ sock->ops = ops;
+
+ sock_init_data(sock, sk);
+ sk->sk_destruct = ieee802154_sock_destruct;
+ sk->sk_family = PF_IEEE802154;
+
+ /* Checksums on by default */
+ sock_set_flag(sk, SOCK_ZAPPED);
+
+ if (sk->sk_prot->hash) {
+ rc = sk->sk_prot->hash(sk);
+ if (rc)
+ goto out_sk_release;
+ }
+
+ if (sk->sk_prot->init) {
+ rc = sk->sk_prot->init(sk);
+ if (rc)
+ goto out_sk_release;
+ }
+out:
+ return rc;
+out_sk_release:
+ sk_common_release(sk);
+ sock->sk = NULL;
+ goto out;
+}
+
+static const struct net_proto_family ieee802154_family_ops = {
+ .family = PF_IEEE802154,
+ .create = ieee802154_create,
+ .owner = THIS_MODULE,
+};
+
+static int ieee802154_rcv(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *pt, struct net_device *orig_dev)
+{
+ if (!netif_running(dev))
+ goto drop;
+ pr_debug("got frame, type %d, dev %p\n", dev->type, dev);
+#ifdef DEBUG
+ print_hex_dump_bytes("ieee802154_rcv ",
+ DUMP_PREFIX_NONE, skb->data, skb->len);
+#endif
+
+ if (!net_eq(dev_net(dev), &init_net))
+ goto drop;
+
+ ieee802154_raw_deliver(dev, skb);
+
+ if (dev->type != ARPHRD_IEEE802154)
+ goto drop;
+
+ if (skb->pkt_type != PACKET_OTHERHOST)
+ return ieee802154_dgram_deliver(dev, skb);
+
+drop:
+ kfree_skb(skb);
+ return NET_RX_DROP;
+}
+
+static struct packet_type ieee802154_packet_type = {
+ .type = htons(ETH_P_IEEE802154),
+ .func = ieee802154_rcv,
+};
+
+static int __init af_ieee802154_init(void)
+{
+ int rc;
+
+ rc = proto_register(&ieee802154_raw_prot, 1);
+ if (rc)
+ goto out;
+
+ rc = proto_register(&ieee802154_dgram_prot, 1);
+ if (rc)
+ goto err_dgram;
+
+ /* Tell SOCKET that we are alive */
+ rc = sock_register(&ieee802154_family_ops);
+ if (rc)
+ goto err_sock;
+ dev_add_pack(&ieee802154_packet_type);
+
+ rc = 0;
+ goto out;
+
+err_sock:
+ proto_unregister(&ieee802154_dgram_prot);
+err_dgram:
+ proto_unregister(&ieee802154_raw_prot);
+out:
+ return rc;
+}
+
+static void __exit af_ieee802154_remove(void)
+{
+ dev_remove_pack(&ieee802154_packet_type);
+ sock_unregister(PF_IEEE802154);
+ proto_unregister(&ieee802154_dgram_prot);
+ proto_unregister(&ieee802154_raw_prot);
+}
+
+module_init(af_ieee802154_init);
+module_exit(af_ieee802154_remove);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("IEEE 802.15.4 socket interface");
+MODULE_ALIAS_NETPROTO(PF_IEEE802154);
diff --git a/net/ieee802154/sysfs.c b/net/ieee802154/sysfs.c
new file mode 100644
index 000000000000..6708160ebf9f
--- /dev/null
+++ b/net/ieee802154/sysfs.c
@@ -0,0 +1,111 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *
+ * Authors:
+ * Alexander Aring <aar@pengutronix.de>
+ *
+ * Based on: net/wireless/sysfs.c
+ */
+
+#include <linux/device.h>
+#include <linux/rtnetlink.h>
+
+#include <net/cfg802154.h>
+
+#include "core.h"
+#include "sysfs.h"
+#include "rdev-ops.h"
+
+static inline struct cfg802154_registered_device *
+dev_to_rdev(struct device *dev)
+{
+ return container_of(dev, struct cfg802154_registered_device,
+ wpan_phy.dev);
+}
+
+#define SHOW_FMT(name, fmt, member) \
+static ssize_t name ## _show(struct device *dev, \
+ struct device_attribute *attr, \
+ char *buf) \
+{ \
+ return sprintf(buf, fmt "\n", dev_to_rdev(dev)->member); \
+} \
+static DEVICE_ATTR_RO(name)
+
+SHOW_FMT(index, "%d", wpan_phy_idx);
+
+static ssize_t name_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct wpan_phy *wpan_phy = &dev_to_rdev(dev)->wpan_phy;
+
+ return sprintf(buf, "%s\n", dev_name(&wpan_phy->dev));
+}
+static DEVICE_ATTR_RO(name);
+
+static void wpan_phy_release(struct device *dev)
+{
+ struct cfg802154_registered_device *rdev = dev_to_rdev(dev);
+
+ cfg802154_dev_free(rdev);
+}
+
+static struct attribute *pmib_attrs[] = {
+ &dev_attr_index.attr,
+ &dev_attr_name.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(pmib);
+
+#ifdef CONFIG_PM_SLEEP
+static int wpan_phy_suspend(struct device *dev)
+{
+ struct cfg802154_registered_device *rdev = dev_to_rdev(dev);
+ int ret = 0;
+
+ if (rdev->ops->suspend) {
+ rtnl_lock();
+ ret = rdev_suspend(rdev);
+ rtnl_unlock();
+ }
+
+ return ret;
+}
+
+static int wpan_phy_resume(struct device *dev)
+{
+ struct cfg802154_registered_device *rdev = dev_to_rdev(dev);
+ int ret = 0;
+
+ if (rdev->ops->resume) {
+ rtnl_lock();
+ ret = rdev_resume(rdev);
+ rtnl_unlock();
+ }
+
+ return ret;
+}
+
+static SIMPLE_DEV_PM_OPS(wpan_phy_pm_ops, wpan_phy_suspend, wpan_phy_resume);
+#define WPAN_PHY_PM_OPS (&wpan_phy_pm_ops)
+#else
+#define WPAN_PHY_PM_OPS NULL
+#endif
+
+const struct class wpan_phy_class = {
+ .name = "ieee802154",
+ .dev_release = wpan_phy_release,
+ .dev_groups = pmib_groups,
+ .pm = WPAN_PHY_PM_OPS,
+};
+
+int wpan_phy_sysfs_init(void)
+{
+ return class_register(&wpan_phy_class);
+}
+
+void wpan_phy_sysfs_exit(void)
+{
+ class_unregister(&wpan_phy_class);
+}
diff --git a/net/ieee802154/sysfs.h b/net/ieee802154/sysfs.h
new file mode 100644
index 000000000000..69961e166257
--- /dev/null
+++ b/net/ieee802154/sysfs.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __IEEE802154_SYSFS_H
+#define __IEEE802154_SYSFS_H
+
+int wpan_phy_sysfs_init(void);
+void wpan_phy_sysfs_exit(void);
+
+extern const struct class wpan_phy_class;
+
+#endif /* __IEEE802154_SYSFS_H */
diff --git a/net/ieee802154/trace.c b/net/ieee802154/trace.c
new file mode 100644
index 000000000000..95f997fad755
--- /dev/null
+++ b/net/ieee802154/trace.c
@@ -0,0 +1,7 @@
+#include <linux/module.h>
+
+#ifndef __CHECKER__
+#define CREATE_TRACE_POINTS
+#include "trace.h"
+
+#endif
diff --git a/net/ieee802154/trace.h b/net/ieee802154/trace.h
new file mode 100644
index 000000000000..591ce0a16fc0
--- /dev/null
+++ b/net/ieee802154/trace.h
@@ -0,0 +1,418 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Based on net/wireless/trace.h */
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM cfg802154
+
+#if !defined(__RDEV_CFG802154_OPS_TRACE) || defined(TRACE_HEADER_MULTI_READ)
+#define __RDEV_CFG802154_OPS_TRACE
+
+#include <linux/tracepoint.h>
+
+#include <net/cfg802154.h>
+
+#define MAXNAME 32
+#define WPAN_PHY_ENTRY __array(char, wpan_phy_name, MAXNAME)
+#define WPAN_PHY_ASSIGN strscpy(__entry->wpan_phy_name, \
+ wpan_phy_name(wpan_phy), \
+ MAXNAME)
+#define WPAN_PHY_PR_FMT "%s"
+#define WPAN_PHY_PR_ARG __entry->wpan_phy_name
+
+#define WPAN_DEV_ENTRY __field(u32, identifier)
+#define WPAN_DEV_ASSIGN (__entry->identifier) = (!IS_ERR_OR_NULL(wpan_dev) \
+ ? wpan_dev->identifier : 0)
+#define WPAN_DEV_PR_FMT "wpan_dev(%u)"
+#define WPAN_DEV_PR_ARG (__entry->identifier)
+
+#define WPAN_CCA_ENTRY __field(enum nl802154_cca_modes, cca_mode) \
+ __field(enum nl802154_cca_opts, cca_opt)
+#define WPAN_CCA_ASSIGN \
+ do { \
+ (__entry->cca_mode) = cca->mode; \
+ (__entry->cca_opt) = cca->opt; \
+ } while (0)
+#define WPAN_CCA_PR_FMT "cca_mode: %d, cca_opt: %d"
+#define WPAN_CCA_PR_ARG __entry->cca_mode, __entry->cca_opt
+
+#define BOOL_TO_STR(bo) (bo) ? "true" : "false"
+
+/*************************************************************
+ * rdev->ops traces *
+ *************************************************************/
+
+DECLARE_EVENT_CLASS(wpan_phy_only_evt,
+ TP_PROTO(struct wpan_phy *wpan_phy),
+ TP_ARGS(wpan_phy),
+ TP_STRUCT__entry(
+ WPAN_PHY_ENTRY
+ ),
+ TP_fast_assign(
+ WPAN_PHY_ASSIGN;
+ ),
+ TP_printk(WPAN_PHY_PR_FMT, WPAN_PHY_PR_ARG)
+);
+
+DEFINE_EVENT(wpan_phy_only_evt, 802154_rdev_suspend,
+ TP_PROTO(struct wpan_phy *wpan_phy),
+ TP_ARGS(wpan_phy)
+);
+
+DEFINE_EVENT(wpan_phy_only_evt, 802154_rdev_resume,
+ TP_PROTO(struct wpan_phy *wpan_phy),
+ TP_ARGS(wpan_phy)
+);
+
+TRACE_EVENT(802154_rdev_add_virtual_intf,
+ TP_PROTO(struct wpan_phy *wpan_phy, char *name,
+ enum nl802154_iftype type, __le64 extended_addr),
+ TP_ARGS(wpan_phy, name, type, extended_addr),
+ TP_STRUCT__entry(
+ WPAN_PHY_ENTRY
+ __string(vir_intf_name, name ? name : "<noname>")
+ __field(enum nl802154_iftype, type)
+ __field(__le64, extended_addr)
+ ),
+ TP_fast_assign(
+ WPAN_PHY_ASSIGN;
+ __assign_str(vir_intf_name);
+ __entry->type = type;
+ __entry->extended_addr = extended_addr;
+ ),
+ TP_printk(WPAN_PHY_PR_FMT ", virtual intf name: %s, type: %d, extended addr: 0x%llx",
+ WPAN_PHY_PR_ARG, __get_str(vir_intf_name), __entry->type,
+ __le64_to_cpu(__entry->extended_addr))
+);
+
+TRACE_EVENT(802154_rdev_del_virtual_intf,
+ TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev),
+ TP_ARGS(wpan_phy, wpan_dev),
+ TP_STRUCT__entry(
+ WPAN_PHY_ENTRY
+ WPAN_DEV_ENTRY
+ ),
+ TP_fast_assign(
+ WPAN_PHY_ASSIGN;
+ WPAN_DEV_ASSIGN;
+ ),
+ TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT, WPAN_PHY_PR_ARG,
+ WPAN_DEV_PR_ARG)
+);
+
+TRACE_EVENT(802154_rdev_set_channel,
+ TP_PROTO(struct wpan_phy *wpan_phy, u8 page, u8 channel),
+ TP_ARGS(wpan_phy, page, channel),
+ TP_STRUCT__entry(
+ WPAN_PHY_ENTRY
+ __field(u8, page)
+ __field(u8, channel)
+ ),
+ TP_fast_assign(
+ WPAN_PHY_ASSIGN;
+ __entry->page = page;
+ __entry->channel = channel;
+ ),
+ TP_printk(WPAN_PHY_PR_FMT ", page: %d, channel: %d", WPAN_PHY_PR_ARG,
+ __entry->page, __entry->channel)
+);
+
+TRACE_EVENT(802154_rdev_set_tx_power,
+ TP_PROTO(struct wpan_phy *wpan_phy, s32 power),
+ TP_ARGS(wpan_phy, power),
+ TP_STRUCT__entry(
+ WPAN_PHY_ENTRY
+ __field(s32, power)
+ ),
+ TP_fast_assign(
+ WPAN_PHY_ASSIGN;
+ __entry->power = power;
+ ),
+ TP_printk(WPAN_PHY_PR_FMT ", mbm: %d", WPAN_PHY_PR_ARG,
+ __entry->power)
+);
+
+TRACE_EVENT(802154_rdev_set_cca_mode,
+ TP_PROTO(struct wpan_phy *wpan_phy, const struct wpan_phy_cca *cca),
+ TP_ARGS(wpan_phy, cca),
+ TP_STRUCT__entry(
+ WPAN_PHY_ENTRY
+ WPAN_CCA_ENTRY
+ ),
+ TP_fast_assign(
+ WPAN_PHY_ASSIGN;
+ WPAN_CCA_ASSIGN;
+ ),
+ TP_printk(WPAN_PHY_PR_FMT ", " WPAN_CCA_PR_FMT, WPAN_PHY_PR_ARG,
+ WPAN_CCA_PR_ARG)
+);
+
+TRACE_EVENT(802154_rdev_set_cca_ed_level,
+ TP_PROTO(struct wpan_phy *wpan_phy, s32 ed_level),
+ TP_ARGS(wpan_phy, ed_level),
+ TP_STRUCT__entry(
+ WPAN_PHY_ENTRY
+ __field(s32, ed_level)
+ ),
+ TP_fast_assign(
+ WPAN_PHY_ASSIGN;
+ __entry->ed_level = ed_level;
+ ),
+ TP_printk(WPAN_PHY_PR_FMT ", ed level: %d", WPAN_PHY_PR_ARG,
+ __entry->ed_level)
+);
+
+DECLARE_EVENT_CLASS(802154_le16_template,
+ TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
+ __le16 le16arg),
+ TP_ARGS(wpan_phy, wpan_dev, le16arg),
+ TP_STRUCT__entry(
+ WPAN_PHY_ENTRY
+ WPAN_DEV_ENTRY
+ __field(__le16, le16arg)
+ ),
+ TP_fast_assign(
+ WPAN_PHY_ASSIGN;
+ WPAN_DEV_ASSIGN;
+ __entry->le16arg = le16arg;
+ ),
+ TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT ", pan id: 0x%04x",
+ WPAN_PHY_PR_ARG, WPAN_DEV_PR_ARG,
+ __le16_to_cpu(__entry->le16arg))
+);
+
+DEFINE_EVENT(802154_le16_template, 802154_rdev_set_pan_id,
+ TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
+ __le16 le16arg),
+ TP_ARGS(wpan_phy, wpan_dev, le16arg)
+);
+
+DEFINE_EVENT_PRINT(802154_le16_template, 802154_rdev_set_short_addr,
+ TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
+ __le16 le16arg),
+ TP_ARGS(wpan_phy, wpan_dev, le16arg),
+ TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT ", short addr: 0x%04x",
+ WPAN_PHY_PR_ARG, WPAN_DEV_PR_ARG,
+ __le16_to_cpu(__entry->le16arg))
+);
+
+TRACE_EVENT(802154_rdev_set_backoff_exponent,
+ TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
+ u8 min_be, u8 max_be),
+ TP_ARGS(wpan_phy, wpan_dev, min_be, max_be),
+ TP_STRUCT__entry(
+ WPAN_PHY_ENTRY
+ WPAN_DEV_ENTRY
+ __field(u8, min_be)
+ __field(u8, max_be)
+ ),
+ TP_fast_assign(
+ WPAN_PHY_ASSIGN;
+ WPAN_DEV_ASSIGN;
+ __entry->min_be = min_be;
+ __entry->max_be = max_be;
+ ),
+
+ TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT
+ ", min be: %d, max be: %d", WPAN_PHY_PR_ARG,
+ WPAN_DEV_PR_ARG, __entry->min_be, __entry->max_be)
+);
+
+TRACE_EVENT(802154_rdev_set_csma_backoffs,
+ TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
+ u8 max_csma_backoffs),
+ TP_ARGS(wpan_phy, wpan_dev, max_csma_backoffs),
+ TP_STRUCT__entry(
+ WPAN_PHY_ENTRY
+ WPAN_DEV_ENTRY
+ __field(u8, max_csma_backoffs)
+ ),
+ TP_fast_assign(
+ WPAN_PHY_ASSIGN;
+ WPAN_DEV_ASSIGN;
+ __entry->max_csma_backoffs = max_csma_backoffs;
+ ),
+
+ TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT
+ ", max csma backoffs: %d", WPAN_PHY_PR_ARG,
+ WPAN_DEV_PR_ARG, __entry->max_csma_backoffs)
+);
+
+TRACE_EVENT(802154_rdev_set_max_frame_retries,
+ TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
+ s8 max_frame_retries),
+ TP_ARGS(wpan_phy, wpan_dev, max_frame_retries),
+ TP_STRUCT__entry(
+ WPAN_PHY_ENTRY
+ WPAN_DEV_ENTRY
+ __field(s8, max_frame_retries)
+ ),
+ TP_fast_assign(
+ WPAN_PHY_ASSIGN;
+ WPAN_DEV_ASSIGN;
+ __entry->max_frame_retries = max_frame_retries;
+ ),
+
+ TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT
+ ", max frame retries: %d", WPAN_PHY_PR_ARG,
+ WPAN_DEV_PR_ARG, __entry->max_frame_retries)
+);
+
+TRACE_EVENT(802154_rdev_set_lbt_mode,
+ TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
+ bool mode),
+ TP_ARGS(wpan_phy, wpan_dev, mode),
+ TP_STRUCT__entry(
+ WPAN_PHY_ENTRY
+ WPAN_DEV_ENTRY
+ __field(bool, mode)
+ ),
+ TP_fast_assign(
+ WPAN_PHY_ASSIGN;
+ WPAN_DEV_ASSIGN;
+ __entry->mode = mode;
+ ),
+ TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT
+ ", lbt mode: %s", WPAN_PHY_PR_ARG,
+ WPAN_DEV_PR_ARG, BOOL_TO_STR(__entry->mode))
+);
+
+TRACE_EVENT(802154_rdev_set_ackreq_default,
+ TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
+ bool ackreq),
+ TP_ARGS(wpan_phy, wpan_dev, ackreq),
+ TP_STRUCT__entry(
+ WPAN_PHY_ENTRY
+ WPAN_DEV_ENTRY
+ __field(bool, ackreq)
+ ),
+ TP_fast_assign(
+ WPAN_PHY_ASSIGN;
+ WPAN_DEV_ASSIGN;
+ __entry->ackreq = ackreq;
+ ),
+ TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT
+ ", ackreq default: %s", WPAN_PHY_PR_ARG,
+ WPAN_DEV_PR_ARG, BOOL_TO_STR(__entry->ackreq))
+);
+
+TRACE_EVENT(802154_rdev_trigger_scan,
+ TP_PROTO(struct wpan_phy *wpan_phy,
+ struct cfg802154_scan_request *request),
+ TP_ARGS(wpan_phy, request),
+ TP_STRUCT__entry(
+ WPAN_PHY_ENTRY
+ __field(u8, page)
+ __field(u32, channels)
+ __field(u8, duration)
+ ),
+ TP_fast_assign(
+ WPAN_PHY_ASSIGN;
+ __entry->page = request->page;
+ __entry->channels = request->channels;
+ __entry->duration = request->duration;
+ ),
+ TP_printk(WPAN_PHY_PR_FMT ", scan, page: %d, channels: %x, duration %d",
+ WPAN_PHY_PR_ARG, __entry->page, __entry->channels, __entry->duration)
+);
+
+TRACE_EVENT(802154_rdev_send_beacons,
+ TP_PROTO(struct wpan_phy *wpan_phy,
+ struct cfg802154_beacon_request *request),
+ TP_ARGS(wpan_phy, request),
+ TP_STRUCT__entry(
+ WPAN_PHY_ENTRY
+ __field(u8, interval)
+ ),
+ TP_fast_assign(
+ WPAN_PHY_ASSIGN;
+ __entry->interval = request->interval;
+ ),
+ TP_printk(WPAN_PHY_PR_FMT ", sending beacons (interval order: %d)",
+ WPAN_PHY_PR_ARG, __entry->interval)
+);
+
+DECLARE_EVENT_CLASS(802154_wdev_template,
+ TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev),
+ TP_ARGS(wpan_phy, wpan_dev),
+ TP_STRUCT__entry(
+ WPAN_PHY_ENTRY
+ WPAN_DEV_ENTRY
+ ),
+ TP_fast_assign(
+ WPAN_PHY_ASSIGN;
+ WPAN_DEV_ASSIGN;
+ ),
+ TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT,
+ WPAN_PHY_PR_ARG, WPAN_DEV_PR_ARG)
+);
+
+DEFINE_EVENT(802154_wdev_template, 802154_rdev_abort_scan,
+ TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev),
+ TP_ARGS(wpan_phy, wpan_dev)
+);
+
+DEFINE_EVENT(802154_wdev_template, 802154_rdev_stop_beacons,
+ TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev),
+ TP_ARGS(wpan_phy, wpan_dev)
+);
+
+TRACE_EVENT(802154_rdev_associate,
+ TP_PROTO(struct wpan_phy *wpan_phy,
+ struct wpan_dev *wpan_dev,
+ struct ieee802154_addr *coord),
+ TP_ARGS(wpan_phy, wpan_dev, coord),
+ TP_STRUCT__entry(
+ WPAN_PHY_ENTRY
+ WPAN_DEV_ENTRY
+ __field(__le64, addr)
+ ),
+ TP_fast_assign(
+ WPAN_PHY_ASSIGN;
+ WPAN_DEV_ASSIGN;
+ __entry->addr = coord->extended_addr;
+ ),
+ TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT ", associating with: 0x%llx",
+ WPAN_PHY_PR_ARG, WPAN_DEV_PR_ARG, __entry->addr)
+);
+
+TRACE_EVENT(802154_rdev_disassociate,
+ TP_PROTO(struct wpan_phy *wpan_phy,
+ struct wpan_dev *wpan_dev,
+ struct ieee802154_addr *target),
+ TP_ARGS(wpan_phy, wpan_dev, target),
+ TP_STRUCT__entry(
+ WPAN_PHY_ENTRY
+ WPAN_DEV_ENTRY
+ __field(__le64, addr)
+ ),
+ TP_fast_assign(
+ WPAN_PHY_ASSIGN;
+ WPAN_DEV_ASSIGN;
+ __entry->addr = target->extended_addr;
+ ),
+ TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT ", disassociating with: 0x%llx",
+ WPAN_PHY_PR_ARG, WPAN_DEV_PR_ARG, __entry->addr)
+);
+
+TRACE_EVENT(802154_rdev_return_int,
+ TP_PROTO(struct wpan_phy *wpan_phy, int ret),
+ TP_ARGS(wpan_phy, ret),
+ TP_STRUCT__entry(
+ WPAN_PHY_ENTRY
+ __field(int, ret)
+ ),
+ TP_fast_assign(
+ WPAN_PHY_ASSIGN;
+ __entry->ret = ret;
+ ),
+ TP_printk(WPAN_PHY_PR_FMT ", returned: %d", WPAN_PHY_PR_ARG,
+ __entry->ret)
+);
+
+#endif /* !__RDEV_CFG802154_OPS_TRACE || TRACE_HEADER_MULTI_READ */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace
+#include <trace/define_trace.h>
diff --git a/net/ife/Kconfig b/net/ife/Kconfig
new file mode 100644
index 000000000000..de36a5b91e50
--- /dev/null
+++ b/net/ife/Kconfig
@@ -0,0 +1,16 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# IFE subsystem configuration
+#
+
+menuconfig NET_IFE
+ tristate "Inter-FE based on IETF ForCES InterFE LFB"
+ default n
+ help
+ Say Y here to add support of IFE encapsulation protocol
+ For details refer to netdev01 paper:
+ "Distributing Linux Traffic Control Classifier-Action Subsystem"
+ Authors: Jamal Hadi Salim and Damascene M. Joachimpillai
+
+ To compile this support as a module, choose M here: the module will
+ be called ife.
diff --git a/net/ife/Makefile b/net/ife/Makefile
new file mode 100644
index 000000000000..1258fcb07f67
--- /dev/null
+++ b/net/ife/Makefile
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# Makefile for the IFE encapsulation protocol
+#
+
+obj-$(CONFIG_NET_IFE) += ife.o
diff --git a/net/ife/ife.c b/net/ife/ife.c
new file mode 100644
index 000000000000..be05b690b9ef
--- /dev/null
+++ b/net/ife/ife.c
@@ -0,0 +1,177 @@
+/*
+ * net/ife/ife.c - Inter-FE protocol based on ForCES WG InterFE LFB
+ * Copyright (c) 2015 Jamal Hadi Salim <jhs@mojatatu.com>
+ * Copyright (c) 2017 Yotam Gigi <yotamg@mellanox.com>
+ *
+ * Refer to: draft-ietf-forces-interfelfb-03 and netdev01 paper:
+ * "Distributing Linux Traffic Control Classifier-Action Subsystem"
+ * Authors: Jamal Hadi Salim and Damascene M. Joachimpillai
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <net/net_namespace.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <linux/etherdevice.h>
+#include <net/ife.h>
+
+struct ifeheadr {
+ __be16 metalen;
+ u8 tlv_data[];
+};
+
+void *ife_encode(struct sk_buff *skb, u16 metalen)
+{
+ /* OUTERHDR:TOTMETALEN:{TLVHDR:Metadatum:TLVHDR..}:ORIGDATA
+ * where ORIGDATA = original ethernet header ...
+ */
+ int hdrm = metalen + IFE_METAHDRLEN;
+ int total_push = hdrm + skb->dev->hard_header_len;
+ struct ifeheadr *ifehdr;
+ struct ethhdr *iethh; /* inner ether header */
+ int skboff = 0;
+ int err;
+
+ err = skb_cow_head(skb, total_push);
+ if (unlikely(err))
+ return NULL;
+
+ iethh = (struct ethhdr *) skb->data;
+
+ __skb_push(skb, total_push);
+ memcpy(skb->data, iethh, skb->dev->hard_header_len);
+ skb_reset_mac_header(skb);
+ skboff += skb->dev->hard_header_len;
+
+ /* total metadata length */
+ ifehdr = (struct ifeheadr *) (skb->data + skboff);
+ metalen += IFE_METAHDRLEN;
+ ifehdr->metalen = htons(metalen);
+
+ return ifehdr->tlv_data;
+}
+EXPORT_SYMBOL_GPL(ife_encode);
+
+void *ife_decode(struct sk_buff *skb, u16 *metalen)
+{
+ struct ifeheadr *ifehdr;
+ int total_pull;
+ u16 ifehdrln;
+
+ if (!pskb_may_pull(skb, skb->dev->hard_header_len + IFE_METAHDRLEN))
+ return NULL;
+
+ ifehdr = (struct ifeheadr *) (skb->data + skb->dev->hard_header_len);
+ ifehdrln = ntohs(ifehdr->metalen);
+ total_pull = skb->dev->hard_header_len + ifehdrln;
+
+ if (unlikely(ifehdrln < 2))
+ return NULL;
+
+ if (unlikely(!pskb_may_pull(skb, total_pull)))
+ return NULL;
+
+ ifehdr = (struct ifeheadr *)(skb->data + skb->dev->hard_header_len);
+ skb_set_mac_header(skb, total_pull);
+ __skb_pull(skb, total_pull);
+ *metalen = ifehdrln - IFE_METAHDRLEN;
+
+ return &ifehdr->tlv_data;
+}
+EXPORT_SYMBOL_GPL(ife_decode);
+
+struct meta_tlvhdr {
+ __be16 type;
+ __be16 len;
+};
+
+static bool __ife_tlv_meta_valid(const unsigned char *skbdata,
+ const unsigned char *ifehdr_end)
+{
+ const struct meta_tlvhdr *tlv;
+ u16 tlvlen;
+
+ if (unlikely(skbdata + sizeof(*tlv) > ifehdr_end))
+ return false;
+
+ tlv = (const struct meta_tlvhdr *)skbdata;
+ tlvlen = ntohs(tlv->len);
+
+ /* tlv length field is inc header, check on minimum */
+ if (tlvlen < NLA_HDRLEN)
+ return false;
+
+ /* overflow by NLA_ALIGN check */
+ if (NLA_ALIGN(tlvlen) < tlvlen)
+ return false;
+
+ if (unlikely(skbdata + NLA_ALIGN(tlvlen) > ifehdr_end))
+ return false;
+
+ return true;
+}
+
+/* Caller takes care of presenting data in network order
+ */
+void *ife_tlv_meta_decode(void *skbdata, const void *ifehdr_end, u16 *attrtype,
+ u16 *dlen, u16 *totlen)
+{
+ struct meta_tlvhdr *tlv;
+
+ if (!__ife_tlv_meta_valid(skbdata, ifehdr_end))
+ return NULL;
+
+ tlv = (struct meta_tlvhdr *)skbdata;
+ *dlen = ntohs(tlv->len) - NLA_HDRLEN;
+ *attrtype = ntohs(tlv->type);
+
+ if (totlen)
+ *totlen = nla_total_size(*dlen);
+
+ return skbdata + sizeof(struct meta_tlvhdr);
+}
+EXPORT_SYMBOL_GPL(ife_tlv_meta_decode);
+
+void *ife_tlv_meta_next(void *skbdata)
+{
+ struct meta_tlvhdr *tlv = (struct meta_tlvhdr *) skbdata;
+ u16 tlvlen = ntohs(tlv->len);
+
+ tlvlen = NLA_ALIGN(tlvlen);
+
+ return skbdata + tlvlen;
+}
+EXPORT_SYMBOL_GPL(ife_tlv_meta_next);
+
+/* Caller takes care of presenting data in network order
+ */
+int ife_tlv_meta_encode(void *skbdata, u16 attrtype, u16 dlen, const void *dval)
+{
+ __be32 *tlv = (__be32 *) (skbdata);
+ u16 totlen = nla_total_size(dlen); /*alignment + hdr */
+ char *dptr = (char *) tlv + NLA_HDRLEN;
+ u32 htlv = attrtype << 16 | (dlen + NLA_HDRLEN);
+
+ *tlv = htonl(htlv);
+ memset(dptr, 0, totlen - NLA_HDRLEN);
+ memcpy(dptr, dval, dlen);
+
+ return totlen;
+}
+EXPORT_SYMBOL_GPL(ife_tlv_meta_encode);
+
+MODULE_AUTHOR("Jamal Hadi Salim <jhs@mojatatu.com>");
+MODULE_AUTHOR("Yotam Gigi <yotam.gi@gmail.com>");
+MODULE_DESCRIPTION("Inter-FE LFB action");
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 39e0cb763588..b71c22475c51 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# IP configuration
#
@@ -9,14 +10,11 @@ config IP_MULTICAST
intend to participate in the MBONE, a high bandwidth network on top
of the Internet which carries audio and video broadcasts. More
information about the MBONE is on the WWW at
- <http://www-itg.lbl.gov/mbone/>. Information about the multicast
- capabilities of the various network cards is contained in
- <file:Documentation/networking/multicast.txt>. For most people, it's
- safe to say N.
+ <https://www.savetz.com/mbone/>. For most people, it's safe to say N.
config IP_ADVANCED_ROUTER
bool "IP: advanced router"
- ---help---
+ help
If you intend to run your Linux box mostly as a router, i.e. as a
computer that forwards and redistributes network packets, say Y; you
will then be presented with several options that allow more precise
@@ -35,7 +33,7 @@ config IP_ADVANCED_ROUTER
at boot time after the /proc file system has been mounted.
- If you turn on IP forwarding, you will also get the rp_filter, which
+ If you turn on IP forwarding, you should consider the rp_filter, which
automatically rejects incoming packets if the routing table entry
for their source address doesn't match the network interface they're
arriving on. This has security advantages because it prevents the
@@ -43,64 +41,39 @@ config IP_ADVANCED_ROUTER
asymmetric routing (packets from you to a host take a different path
than packets from that host to you) or if you operate a non-routing
host which has several IP addresses on different interfaces. To turn
- rp_filter off use:
+ rp_filter on use:
+
+ echo 1 > /proc/sys/net/ipv4/conf/<device>/rp_filter
+ or
+ echo 1 > /proc/sys/net/ipv4/conf/all/rp_filter
- echo 0 > /proc/sys/net/ipv4/conf/<device>/rp_filter
- or
- echo 0 > /proc/sys/net/ipv4/conf/all/rp_filter
+ Note that some distributions enable it in startup scripts.
+ For details about rp_filter strict and loose mode read
+ <file:Documentation/networking/ip-sysctl.rst>.
If unsure, say N here.
-choice
- prompt "Choose IP: FIB lookup algorithm (choose FIB_HASH if unsure)"
+config IP_FIB_TRIE_STATS
+ bool "FIB TRIE statistics"
depends on IP_ADVANCED_ROUTER
- default ASK_IP_FIB_HASH
-
-config ASK_IP_FIB_HASH
- bool "FIB_HASH"
- ---help---
- Current FIB is very proven and good enough for most users.
-
-config IP_FIB_TRIE
- bool "FIB_TRIE"
- ---help---
- Use new experimental LC-trie as FIB lookup algorithm.
- This improves lookup performance if you have a large
- number of routes.
-
- LC-trie is a longest matching prefix lookup algorithm which
- performs better than FIB_HASH for large routing tables.
- But, it consumes more memory and is more complex.
-
- LC-trie is described in:
-
- IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
- IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
- An experimental study of compression methods for dynamic tries
- Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
- http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
-
-endchoice
-
-config IP_FIB_HASH
- def_bool ASK_IP_FIB_HASH || !IP_ADVANCED_ROUTER
+ help
+ Keep track of statistics on structure of FIB TRIE table.
+ Useful for testing and measuring TRIE performance.
config IP_MULTIPLE_TABLES
bool "IP: policy routing"
depends on IP_ADVANCED_ROUTER
select FIB_RULES
- ---help---
+ help
Normally, a router decides what to do with a received packet based
solely on the packet's final destination address. If you say Y here,
the Linux router will also be able to take the packet's source
address into account. Furthermore, the TOS (Type-Of-Service) field
of the packet can be used for routing decisions as well.
- If you are interested in this, please see the preliminary
- documentation at <http://www.compendium.com.ar/policy-routing.txt>
- and <ftp://post.tepkom.ru/pub/vol2/Linux/docs/advanced-routing.tex>.
- You will need supporting software from
- <ftp://ftp.tux.org/pub/net/ip-routing/>.
+ If you need more information, see the Linux Advanced
+ Routing and Traffic Control documentation at
+ <https://lartc.org/howto/lartc.rpdb.html>
If unsure, say N.
@@ -116,48 +89,6 @@ config IP_ROUTE_MULTIPATH
equal "cost" and chooses one of them in a non-deterministic fashion
if a matching packet arrives.
-config IP_ROUTE_MULTIPATH_CACHED
- bool "IP: equal cost multipath with caching support (EXPERIMENTAL)"
- depends on IP_ROUTE_MULTIPATH
- help
- Normally, equal cost multipath routing is not supported by the
- routing cache. If you say Y here, alternative routes are cached
- and on cache lookup a route is chosen in a configurable fashion.
-
- If unsure, say N.
-
-config IP_ROUTE_MULTIPATH_RR
- tristate "MULTIPATH: round robin algorithm"
- depends on IP_ROUTE_MULTIPATH_CACHED
- help
- Mulitpath routes are chosen according to Round Robin
-
-config IP_ROUTE_MULTIPATH_RANDOM
- tristate "MULTIPATH: random algorithm"
- depends on IP_ROUTE_MULTIPATH_CACHED
- help
- Multipath routes are chosen in a random fashion. Actually,
- there is no weight for a route. The advantage of this policy
- is that it is implemented stateless and therefore introduces only
- a very small delay.
-
-config IP_ROUTE_MULTIPATH_WRANDOM
- tristate "MULTIPATH: weighted random algorithm"
- depends on IP_ROUTE_MULTIPATH_CACHED
- help
- Multipath routes are chosen in a weighted random fashion.
- The per route weights are the weights visible via ip route 2. As the
- corresponding state management introduces some overhead routing delay
- is increased.
-
-config IP_ROUTE_MULTIPATH_DRR
- tristate "MULTIPATH: interface round robin algorithm"
- depends on IP_ROUTE_MULTIPATH_CACHED
- help
- Connections are distributed in a round robin fashion over the
- available interfaces. This policy makes sense if the connections
- should be primarily distributed on interfaces and not on routes.
-
config IP_ROUTE_VERBOSE
bool "IP: verbose route monitoring"
depends on IP_ADVANCED_ROUTER
@@ -169,6 +100,9 @@ config IP_ROUTE_VERBOSE
handled by the klogd daemon which is responsible for kernel messages
("man klogd").
+config IP_ROUTE_CLASSID
+ bool
+
config IP_PNP
bool "IP: kernel level autoconfiguration"
help
@@ -183,7 +117,7 @@ config IP_PNP
config IP_PNP_DHCP
bool "IP: DHCP support"
depends on IP_PNP
- ---help---
+ help
If you want your Linux box to mount its whole root file system (the
one containing the directory /) from some other computer over the
net via NFS and you want the IP address of your computer to be
@@ -195,12 +129,12 @@ config IP_PNP_DHCP
If unsure, say Y. Note that if you want to use DHCP, a DHCP server
must be operating on your network. Read
- <file:Documentation/nfsroot.txt> for details.
+ <file:Documentation/admin-guide/nfs/nfsroot.rst> for details.
config IP_PNP_BOOTP
bool "IP: BOOTP support"
depends on IP_PNP
- ---help---
+ help
If you want your Linux box to mount its whole root file system (the
one containing the directory /) from some other computer over the
net via NFS and you want the IP address of your computer to be
@@ -210,7 +144,7 @@ config IP_PNP_BOOTP
does BOOTP itself, providing all necessary information on the kernel
command line, you can say N here. If unsure, say Y. Note that if you
want to use BOOTP, a BOOTP server must be operating on your network.
- Read <file:Documentation/nfsroot.txt> for details.
+ Read <file:Documentation/admin-guide/nfs/nfsroot.rst> for details.
config IP_PNP_RARP
bool "IP: RARP support"
@@ -222,15 +156,14 @@ config IP_PNP_RARP
discovered automatically at boot time using the RARP protocol (an
older protocol which is being obsoleted by BOOTP and DHCP), say Y
here. Note that if you want to use RARP, a RARP server must be
- operating on your network. Read <file:Documentation/nfsroot.txt> for
- details.
+ operating on your network. Read
+ <file:Documentation/admin-guide/nfs/nfsroot.rst> for details.
-# not yet ready..
-# bool ' IP: ARP support' CONFIG_IP_PNP_ARP
config NET_IPIP
tristate "IP: tunneling"
select INET_TUNNEL
- ---help---
+ select NET_IP_TUNNEL
+ help
Tunneling means encapsulating data of one protocol type within
another protocol and sending it over a channel that understands the
encapsulating protocol. This particular tunneling driver implements
@@ -244,8 +177,22 @@ config NET_IPIP
be inserted in and removed from the running kernel whenever you
want). Most people won't need this and can say N.
+config NET_IPGRE_DEMUX
+ tristate "IP: GRE demultiplexer"
+ help
+ This is helper module to demultiplex GRE packets on GRE version field criteria.
+ Required by ip_gre and pptp modules.
+
+config NET_IP_TUNNEL
+ tristate
+ select DST_CACHE
+ select GRO_CELLS
+ default n
+
config NET_IPGRE
tristate "IP: GRE tunnels over IP"
+ depends on (IPV6 || IPV6=n) && NET_IPGRE_DEMUX
+ select NET_IP_TUNNEL
help
Tunneling means encapsulating data of one protocol type within
another protocol and sending it over a channel that understands the
@@ -266,18 +213,35 @@ config NET_IPGRE_BROADCAST
Network), but can be distributed all over the Internet. If you want
to do that, say Y here and to "IP multicast routing" below.
+config IP_MROUTE_COMMON
+ bool
+ depends on IP_MROUTE || IPV6_MROUTE
+
config IP_MROUTE
bool "IP: multicast routing"
depends on IP_MULTICAST
+ select IP_MROUTE_COMMON
help
This is used if you want your machine to act as a router for IP
packets that have several destination addresses. It is needed on the
MBONE, a high bandwidth network on top of the Internet which carries
audio and video broadcasts. In order to do that, you would most
- likely run the program mrouted. Information about the multicast
- capabilities of the various network cards is contained in
- <file:Documentation/networking/multicast.txt>. If you haven't heard
- about it, you don't need it.
+ likely run the program mrouted. If you haven't heard about it, you
+ don't need it.
+
+config IP_MROUTE_MULTIPLE_TABLES
+ bool "IP: multicast policy routing"
+ depends on IP_MROUTE && IP_ADVANCED_ROUTER
+ select FIB_RULES
+ help
+ Normally, a multicast router runs a userspace daemon and decides
+ what to do with a multicast packet based on the source and
+ destination addresses. If you say Y here, the multicast router
+ will also be able to take interfaces and packet marks into
+ account and run multiple instances of userspace daemons
+ simultaneously, each one handling a single table.
+
+ If unsure, say N.
config IP_PIMSM_V1
bool "IP: PIM-SM version 1 support"
@@ -301,34 +265,9 @@ config IP_PIMSM_V2
gated-5). This routing protocol is not used widely, so say N unless
you want to play with it.
-config ARPD
- bool "IP: ARP daemon support (EXPERIMENTAL)"
- depends on EXPERIMENTAL
- ---help---
- Normally, the kernel maintains an internal cache which maps IP
- addresses to hardware addresses on the local network, so that
- Ethernet/Token Ring/ etc. frames are sent to the proper address on
- the physical networking layer. For small networks having a few
- hundred directly connected hosts or less, keeping this address
- resolution (ARP) cache inside the kernel works well. However,
- maintaining an internal ARP cache does not work well for very large
- switched networks, and will use a lot of kernel memory if TCP/IP
- connections are made to many machines on the network.
-
- If you say Y here, the kernel's internal ARP cache will never grow
- to more than 256 entries (the oldest entries are expired in a LIFO
- manner) and communication will be attempted with the user space ARP
- daemon arpd. Arpd then answers the address resolution request either
- from its own cache or by asking the net.
-
- This code is experimental and also obsolete. If you want to use it,
- you need to find a version of the daemon arpd on the net somewhere,
- and you should also say Y to "Kernel/User network link driver",
- below. If unsure, say N.
-
config SYN_COOKIES
- bool "IP: TCP syncookie support (disabled per default)"
- ---help---
+ bool "IP: TCP syncookie support"
+ help
Normal TCP/IP networking is open to an attack known as "SYN
flooding". This denial-of-service attack prevents legitimate remote
users from being able to connect to your computer during an ongoing
@@ -341,7 +280,7 @@ config SYN_COOKIES
continue to connect, even when your machine is under attack. There
is no need for the legitimate users to change their TCP/IP software;
SYN cookies work transparently to them. For technical information
- about SYN cookies, check out <http://cr.yp.to/syncookies.html>.
+ about SYN cookies, check out <https://cr.yp.to/syncookies.html>.
If you are SYN flooded, the source address reported by the kernel is
likely to have been forged by the attacker; it is only reported as
@@ -352,54 +291,127 @@ config SYN_COOKIES
server is really overloaded. If this happens frequently better turn
them off.
- If you say Y here, note that SYN cookies aren't enabled by default;
- you can enable them by saying Y to "/proc file system support" and
+ If you say Y here, you can disable SYN cookies at run time by
+ saying Y to "/proc file system support" and
"Sysctl support" below and executing the command
- echo 1 >/proc/sys/net/ipv4/tcp_syncookies
+ echo 0 > /proc/sys/net/ipv4/tcp_syncookies
- at boot time after the /proc file system has been mounted.
+ after the /proc file system has been mounted.
If unsure, say N.
+config NET_IPVTI
+ tristate "Virtual (secure) IP: tunneling"
+ depends on IPV6 || IPV6=n
+ select INET_TUNNEL
+ select NET_IP_TUNNEL
+ select XFRM
+ help
+ Tunneling means encapsulating data of one protocol type within
+ another protocol and sending it over a channel that understands the
+ encapsulating protocol. This can be used with xfrm mode tunnel to give
+ the notion of a secure tunnel for IPSEC and then use routing protocol
+ on top.
+
+config NET_UDP_TUNNEL
+ tristate
+ select NET_IP_TUNNEL
+ default n
+
+config NET_FOU
+ tristate "IP: Foo (IP protocols) over UDP"
+ select NET_UDP_TUNNEL
+ help
+ Foo over UDP allows any IP protocol to be directly encapsulated
+ over UDP include tunnels (IPIP, GRE, SIT). By encapsulating in UDP
+ network mechanisms and optimizations for UDP (such as ECMP
+ and RSS) can be leveraged to provide better service.
+
+config NET_FOU_IP_TUNNELS
+ bool "IP: FOU encapsulation of IP tunnels"
+ depends on NET_IPIP || NET_IPGRE || IPV6_SIT
+ select NET_FOU
+ help
+ Allow configuration of FOU or GUE encapsulation for IP tunnels.
+ When this option is enabled IP tunnels can be configured to use
+ FOU or GUE encapsulation.
+
config INET_AH
tristate "IP: AH transformation"
- select XFRM
- select CRYPTO
- select CRYPTO_HMAC
- select CRYPTO_MD5
- select CRYPTO_SHA1
- ---help---
- Support for IPsec AH.
+ select XFRM_AH
+ help
+ Support for IPsec AH (Authentication Header).
+
+ AH can be used with various authentication algorithms. Besides
+ enabling AH support itself, this option enables the generic
+ implementations of the algorithms that RFC 8221 lists as MUST be
+ implemented. If you need any other algorithms, you'll need to enable
+ them in the crypto API. You should also enable accelerated
+ implementations of any needed algorithms when available.
If unsure, say Y.
config INET_ESP
tristate "IP: ESP transformation"
- select XFRM
- select CRYPTO
- select CRYPTO_HMAC
- select CRYPTO_MD5
- select CRYPTO_CBC
- select CRYPTO_SHA1
- select CRYPTO_DES
- ---help---
- Support for IPsec ESP.
+ select XFRM_ESP
+ help
+ Support for IPsec ESP (Encapsulating Security Payload).
+
+ ESP can be used with various encryption and authentication algorithms.
+ Besides enabling ESP support itself, this option enables the generic
+ implementations of the algorithms that RFC 8221 lists as MUST be
+ implemented. If you need any other algorithms, you'll need to enable
+ them in the crypto API. You should also enable accelerated
+ implementations of any needed algorithms when available.
If unsure, say Y.
+config INET_ESP_OFFLOAD
+ tristate "IP: ESP transformation offload"
+ depends on INET_ESP
+ select XFRM_OFFLOAD
+ default n
+ help
+ Support for ESP transformation offload. This makes sense
+ only if this system really does IPsec and want to do it
+ with high throughput. A typical desktop system does not
+ need it, even if it does IPsec.
+
+ If unsure, say N.
+
+config INET_ESPINTCP
+ bool "IP: ESP in TCP encapsulation (RFC 8229)"
+ depends on XFRM && INET_ESP
+ select STREAM_PARSER
+ select NET_SOCK_MSG
+ select XFRM_ESPINTCP
+ help
+ Support for RFC 8229 encapsulation of ESP and IKE over
+ TCP/IPv4 sockets.
+
+ If unsure, say N.
+
config INET_IPCOMP
tristate "IP: IPComp transformation"
- select XFRM
select INET_XFRM_TUNNEL
- select CRYPTO
- select CRYPTO_DEFLATE
- ---help---
+ select XFRM_IPCOMP
+ help
Support for IP Payload Compression Protocol (IPComp) (RFC3173),
typically needed for IPsec.
-
+
If unsure, say Y.
+config INET_TABLE_PERTURB_ORDER
+ int "INET: Source port perturbation table size (as power of 2)" if EXPERT
+ default 16
+ help
+ Source port perturbation table size (as power of 2) for
+ RFC 6056 3.3.4. Algorithm 4: Double-Hash Port Selection Algorithm.
+
+ The default is almost always what you want.
+ Only change this if you know what you are doing.
+
config INET_XFRM_TUNNEL
tristate
select INET_TUNNEL
@@ -409,50 +421,54 @@ config INET_TUNNEL
tristate
default n
-config INET_XFRM_MODE_TRANSPORT
- tristate "IP: IPsec transport mode"
+config INET_DIAG
+ tristate "INET: socket monitoring interface"
default y
- select XFRM
- ---help---
- Support for IPsec transport mode.
-
- If unsure, say Y.
+ help
+ Support for INET (TCP, UDP, etc) socket monitoring interface used by
+ native Linux tools such as ss. ss is included in iproute2, currently
+ downloadable at:
-config INET_XFRM_MODE_TUNNEL
- tristate "IP: IPsec tunnel mode"
- default y
- select XFRM
- ---help---
- Support for IPsec tunnel mode.
+ http://www.linuxfoundation.org/collaborate/workgroups/networking/iproute2
If unsure, say Y.
-config INET_XFRM_MODE_BEET
- tristate "IP: IPsec BEET mode"
- default y
- select XFRM
- ---help---
- Support for IPsec BEET mode.
+config INET_TCP_DIAG
+ depends on INET_DIAG
+ def_tristate INET_DIAG
+config INET_UDP_DIAG
+ tristate "UDP: socket monitoring interface"
+ depends on INET_DIAG && (IPV6 || IPV6=n)
+ default n
+ help
+ Support for UDP socket monitoring interface used by the ss tool.
If unsure, say Y.
-config INET_DIAG
- tristate "INET: socket monitoring interface"
- default y
- ---help---
- Support for INET (TCP, DCCP, etc) socket monitoring interface used by
- native Linux tools such as ss. ss is included in iproute2, currently
- downloadable at <http://developer.osdl.org/dev/iproute2>.
-
+config INET_RAW_DIAG
+ tristate "RAW: socket monitoring interface"
+ depends on INET_DIAG && (IPV6 || IPV6=n)
+ default n
+ help
+ Support for RAW socket monitoring interface used by the ss tool.
If unsure, say Y.
-config INET_TCP_DIAG
+config INET_DIAG_DESTROY
+ bool "INET: allow privileged process to administratively close sockets"
depends on INET_DIAG
- def_tristate INET_DIAG
+ default n
+ help
+ Provides a SOCK_DESTROY operation that allows privileged processes
+ (e.g., a connection manager or a network administration tool such as
+ ss) to close sockets opened by other processes. Closing a socket in
+ this way interrupts any blocking read/write/connect operations on
+ the socket and causes future socket calls to behave as if the socket
+ had been disconnected.
+ If unsure, say N.
menuconfig TCP_CONG_ADVANCED
bool "TCP: advanced congestion control"
- ---help---
+ help
Support for selection of various TCP congestion control
modules.
@@ -466,113 +482,202 @@ if TCP_CONG_ADVANCED
config TCP_CONG_BIC
tristate "Binary Increase Congestion (BIC) control"
default m
- ---help---
- BIC-TCP is a sender-side only change that ensures a linear RTT
- fairness under large windows while offering both scalability and
- bounded TCP-friendliness. The protocol combines two schemes
- called additive increase and binary search increase. When the
- congestion window is large, additive increase with a large
- increment ensures linear RTT fairness as well as good
- scalability. Under small congestion windows, binary search
- increase provides TCP friendliness.
- See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/
+ help
+ BIC-TCP is a sender-side only change that ensures a linear RTT
+ fairness under large windows while offering both scalability and
+ bounded TCP-friendliness. The protocol combines two schemes
+ called additive increase and binary search increase. When the
+ congestion window is large, additive increase with a large
+ increment ensures linear RTT fairness as well as good
+ scalability. Under small congestion windows, binary search
+ increase provides TCP friendliness.
+ See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/
config TCP_CONG_CUBIC
tristate "CUBIC TCP"
default y
- ---help---
- This is version 2.0 of BIC-TCP which uses a cubic growth function
- among other techniques.
- See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/cubic-paper.pdf
+ help
+ This is version 2.0 of BIC-TCP which uses a cubic growth function
+ among other techniques.
+ See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/cubic-paper.pdf
config TCP_CONG_WESTWOOD
tristate "TCP Westwood+"
default m
- ---help---
- TCP Westwood+ is a sender-side only modification of the TCP Reno
- protocol stack that optimizes the performance of TCP congestion
- control. It is based on end-to-end bandwidth estimation to set
- congestion window and slow start threshold after a congestion
- episode. Using this estimation, TCP Westwood+ adaptively sets a
- slow start threshold and a congestion window which takes into
- account the bandwidth used at the time congestion is experienced.
- TCP Westwood+ significantly increases fairness wrt TCP Reno in
- wired networks and throughput over wireless links.
+ help
+ TCP Westwood+ is a sender-side only modification of the TCP Reno
+ protocol stack that optimizes the performance of TCP congestion
+ control. It is based on end-to-end bandwidth estimation to set
+ congestion window and slow start threshold after a congestion
+ episode. Using this estimation, TCP Westwood+ adaptively sets a
+ slow start threshold and a congestion window which takes into
+ account the bandwidth used at the time congestion is experienced.
+ TCP Westwood+ significantly increases fairness wrt TCP Reno in
+ wired networks and throughput over wireless links.
config TCP_CONG_HTCP
- tristate "H-TCP"
- default m
- ---help---
- H-TCP is a send-side only modifications of the TCP Reno
- protocol stack that optimizes the performance of TCP
- congestion control for high speed network links. It uses a
- modeswitch to change the alpha and beta parameters of TCP Reno
- based on network conditions and in a way so as to be fair with
- other Reno and H-TCP flows.
+ tristate "H-TCP"
+ default m
+ help
+ H-TCP is a send-side only modifications of the TCP Reno
+ protocol stack that optimizes the performance of TCP
+ congestion control for high speed network links. It uses a
+ modeswitch to change the alpha and beta parameters of TCP Reno
+ based on network conditions and in a way so as to be fair with
+ other Reno and H-TCP flows.
config TCP_CONG_HSTCP
tristate "High Speed TCP"
- depends on EXPERIMENTAL
default n
- ---help---
- Sally Floyd's High Speed TCP (RFC 3649) congestion control.
- A modification to TCP's congestion control mechanism for use
- with large congestion windows. A table indicates how much to
- increase the congestion window by when an ACK is received.
- For more detail see http://www.icir.org/floyd/hstcp.html
+ help
+ Sally Floyd's High Speed TCP (RFC 3649) congestion control.
+ A modification to TCP's congestion control mechanism for use
+ with large congestion windows. A table indicates how much to
+ increase the congestion window by when an ACK is received.
+ For more detail see https://www.icir.org/floyd/hstcp.html
config TCP_CONG_HYBLA
tristate "TCP-Hybla congestion control algorithm"
- depends on EXPERIMENTAL
default n
- ---help---
- TCP-Hybla is a sender-side only change that eliminates penalization of
- long-RTT, large-bandwidth connections, like when satellite legs are
- involved, especially when sharing a common bottleneck with normal
- terrestrial connections.
+ help
+ TCP-Hybla is a sender-side only change that eliminates penalization of
+ long-RTT, large-bandwidth connections, like when satellite legs are
+ involved, especially when sharing a common bottleneck with normal
+ terrestrial connections.
config TCP_CONG_VEGAS
tristate "TCP Vegas"
- depends on EXPERIMENTAL
default n
- ---help---
- TCP Vegas is a sender-side only change to TCP that anticipates
- the onset of congestion by estimating the bandwidth. TCP Vegas
- adjusts the sending rate by modifying the congestion
- window. TCP Vegas should provide less packet loss, but it is
- not as aggressive as TCP Reno.
+ help
+ TCP Vegas is a sender-side only change to TCP that anticipates
+ the onset of congestion by estimating the bandwidth. TCP Vegas
+ adjusts the sending rate by modifying the congestion
+ window. TCP Vegas should provide less packet loss, but it is
+ not as aggressive as TCP Reno.
+
+config TCP_CONG_NV
+ tristate "TCP NV"
+ default n
+ help
+ TCP NV is a follow up to TCP Vegas. It has been modified to deal with
+ 10G networks, measurement noise introduced by LRO, GRO and interrupt
+ coalescence. In addition, it will decrease its cwnd multiplicatively
+ instead of linearly.
+
+ Note that in general congestion avoidance (cwnd decreased when # packets
+ queued grows) cannot coexist with congestion control (cwnd decreased only
+ when there is packet loss) due to fairness issues. One scenario when they
+ can coexist safely is when the CA flows have RTTs << CC flows RTTs.
+
+ For further details see http://www.brakmo.org/networking/tcp-nv/
config TCP_CONG_SCALABLE
tristate "Scalable TCP"
- depends on EXPERIMENTAL
default n
- ---help---
- Scalable TCP is a sender-side only change to TCP which uses a
- MIMD congestion control algorithm which has some nice scaling
- properties, though is known to have fairness issues.
- See http://www-lce.eng.cam.ac.uk/~ctk21/scalable/
+ help
+ Scalable TCP is a sender-side only change to TCP which uses a
+ MIMD congestion control algorithm which has some nice scaling
+ properties, though is known to have fairness issues.
+ See http://www.deneholme.net/tom/scalable/
config TCP_CONG_LP
tristate "TCP Low Priority"
- depends on EXPERIMENTAL
default n
- ---help---
- TCP Low Priority (TCP-LP), a distributed algorithm whose goal is
- to utilize only the excess network bandwidth as compared to the
- ``fair share`` of bandwidth as targeted by TCP.
- See http://www-ece.rice.edu/networks/TCP-LP/
+ help
+ TCP Low Priority (TCP-LP), a distributed algorithm whose goal is
+ to utilize only the excess network bandwidth as compared to the
+ ``fair share`` of bandwidth as targeted by TCP.
+ See http://www-ece.rice.edu/networks/TCP-LP/
config TCP_CONG_VENO
tristate "TCP Veno"
- depends on EXPERIMENTAL
default n
- ---help---
- TCP Veno is a sender-side only enhancement of TCP to obtain better
- throughput over wireless networks. TCP Veno makes use of state
- distinguishing to circumvent the difficult judgment of the packet loss
- type. TCP Veno cuts down less congestion window in response to random
- loss packets.
- See http://www.ntu.edu.sg/home5/ZHOU0022/papers/CPFu03a.pdf
+ help
+ TCP Veno is a sender-side only enhancement of TCP to obtain better
+ throughput over wireless networks. TCP Veno makes use of state
+ distinguishing to circumvent the difficult judgment of the packet loss
+ type. TCP Veno cuts down less congestion window in response to random
+ loss packets.
+ See <http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=1177186>
+
+config TCP_CONG_YEAH
+ tristate "YeAH TCP"
+ select TCP_CONG_VEGAS
+ default n
+ help
+ YeAH-TCP is a sender-side high-speed enabled TCP congestion control
+ algorithm, which uses a mixed loss/delay approach to compute the
+ congestion window. It's design goals target high efficiency,
+ internal, RTT and Reno fairness, resilience to link loss while
+ keeping network elements load as low as possible.
+
+ For further details look here:
+ http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf
+
+config TCP_CONG_ILLINOIS
+ tristate "TCP Illinois"
+ default n
+ help
+ TCP-Illinois is a sender-side modification of TCP Reno for
+ high speed long delay links. It uses round-trip-time to
+ adjust the alpha and beta parameters to achieve a higher average
+ throughput and maintain fairness.
+
+ For further details see:
+ http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html
+
+config TCP_CONG_DCTCP
+ tristate "DataCenter TCP (DCTCP)"
+ default n
+ help
+ DCTCP leverages Explicit Congestion Notification (ECN) in the network to
+ provide multi-bit feedback to the end hosts. It is designed to provide:
+
+ - High burst tolerance (incast due to partition/aggregate),
+ - Low latency (short flows, queries),
+ - High throughput (continuous data updates, large file transfers) with
+ commodity, shallow-buffered switches.
+
+ All switches in the data center network running DCTCP must support
+ ECN marking and be configured for marking when reaching defined switch
+ buffer thresholds. The default ECN marking threshold heuristic for
+ DCTCP on switches is 20 packets (30KB) at 1Gbps, and 65 packets
+ (~100KB) at 10Gbps, but might need further careful tweaking.
+
+ For further details see:
+ http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf
+
+config TCP_CONG_CDG
+ tristate "CAIA Delay-Gradient (CDG)"
+ default n
+ help
+ CAIA Delay-Gradient (CDG) is a TCP congestion control that modifies
+ the TCP sender in order to:
+
+ o Use the delay gradient as a congestion signal.
+ o Back off with an average probability that is independent of the RTT.
+ o Coexist with flows that use loss-based congestion control.
+ o Tolerate packet loss unrelated to congestion.
+
+ For further details see:
+ D.A. Hayes and G. Armitage. "Revisiting TCP congestion control using
+ delay gradients." In Networking 2011. Preprint:
+ http://caia.swin.edu.au/cv/dahayes/content/networking2011-cdg-preprint.pdf
+
+config TCP_CONG_BBR
+ tristate "BBR TCP"
+ default n
+ help
+
+ BBR (Bottleneck Bandwidth and RTT) TCP congestion control aims to
+ maximize network utilization and minimize queues. It builds an explicit
+ model of the bottleneck delivery rate and path round-trip propagation
+ delay. It tolerates packet loss and delay unrelated to congestion. It
+ can operate over LAN, WAN, cellular, wifi, or cable modem links. It can
+ coexist with flows that use loss-based congestion control, and can
+ operate with shallow buffers, deep buffers, bufferbloat, policers, or
+ AQM schemes that do not provide a delay signal. It requires the fq
+ ("Fair Queue") pacing packet scheduler.
choice
prompt "Default TCP congestion control"
@@ -590,15 +695,29 @@ choice
config DEFAULT_HTCP
bool "Htcp" if TCP_CONG_HTCP=y
+ config DEFAULT_HYBLA
+ bool "Hybla" if TCP_CONG_HYBLA=y
+
config DEFAULT_VEGAS
bool "Vegas" if TCP_CONG_VEGAS=y
+ config DEFAULT_VENO
+ bool "Veno" if TCP_CONG_VENO=y
+
config DEFAULT_WESTWOOD
bool "Westwood" if TCP_CONG_WESTWOOD=y
+ config DEFAULT_DCTCP
+ bool "DCTCP" if TCP_CONG_DCTCP=y
+
+ config DEFAULT_CDG
+ bool "CDG" if TCP_CONG_CDG=y
+
+ config DEFAULT_BBR
+ bool "BBR" if TCP_CONG_BBR=y
+
config DEFAULT_RENO
bool "Reno"
-
endchoice
endif
@@ -613,26 +732,38 @@ config DEFAULT_TCP_CONG
default "bic" if DEFAULT_BIC
default "cubic" if DEFAULT_CUBIC
default "htcp" if DEFAULT_HTCP
+ default "hybla" if DEFAULT_HYBLA
default "vegas" if DEFAULT_VEGAS
default "westwood" if DEFAULT_WESTWOOD
+ default "veno" if DEFAULT_VENO
default "reno" if DEFAULT_RENO
+ default "dctcp" if DEFAULT_DCTCP
+ default "cdg" if DEFAULT_CDG
+ default "bbr" if DEFAULT_BBR
default "cubic"
-config TCP_MD5SIG
- bool "TCP: MD5 Signature Option support (RFC2385) (EXPERIMENTAL)"
- depends on EXPERIMENTAL
+config TCP_SIGPOOL
+ tristate
+
+config TCP_AO
+ bool "TCP: Authentication Option (RFC5925)"
select CRYPTO
- select CRYPTO_MD5
- ---help---
- RFC2385 specifices a method of giving MD5 protection to TCP sessions.
- Its main (only?) use is to protect BGP sessions between core routers
- on the Internet.
+ select TCP_SIGPOOL
+ depends on 64BIT && IPV6 != m # seq-number extension needs WRITE_ONCE(u64)
+ help
+ TCP-AO specifies the use of stronger Message Authentication Codes (MACs),
+ protects against replays for long-lived TCP connections, and
+ provides more details on the association of security with TCP
+ connections than TCP MD5 (See RFC5925)
If unsure, say N.
-config TCP_MD5SIG_DEBUG
- bool "TCP: MD5 Signature Option debugging"
- depends on TCP_MD5SIG
-
-source "net/ipv4/ipvs/Kconfig"
+config TCP_MD5SIG
+ bool "TCP: MD5 Signature Option support (RFC2385)"
+ select CRYPTO_LIB_MD5
+ help
+ RFC2385 specifies a method of giving MD5 protection to TCP sessions.
+ Its main (only?) use is to protect BGP sessions between core routers
+ on the Internet.
+ If unsure, say N.
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 7a068626feea..ec36d2ec059e 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
#
# Makefile for the Linux TCP/IP (INET) layer.
#
@@ -7,49 +8,67 @@ obj-y := route.o inetpeer.o protocol.o \
ip_output.o ip_sockglue.o inet_hashtables.o \
inet_timewait_sock.o inet_connection_sock.o \
tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
- tcp_minisocks.o tcp_cong.o \
- datagram.o raw.o udp.o udplite.o \
- arp.o icmp.o devinet.o af_inet.o igmp.o \
- sysctl_net_ipv4.o fib_frontend.o fib_semantics.o
+ tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \
+ tcp_rate.o tcp_recovery.o tcp_ulp.o \
+ tcp_offload.o tcp_plb.o datagram.o raw.o udp.o udplite.o \
+ udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \
+ fib_frontend.o fib_semantics.o fib_trie.o fib_notifier.o \
+ inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o \
+ metrics.o netlink.o nexthop.o udp_tunnel_stub.o
-obj-$(CONFIG_IP_FIB_HASH) += fib_hash.o
-obj-$(CONFIG_IP_FIB_TRIE) += fib_trie.o
+obj-$(CONFIG_NET_IP_TUNNEL) += ip_tunnel.o
+obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o
obj-$(CONFIG_PROC_FS) += proc.o
obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o
obj-$(CONFIG_IP_MROUTE) += ipmr.o
+obj-$(CONFIG_IP_MROUTE_COMMON) += ipmr_base.o
obj-$(CONFIG_NET_IPIP) += ipip.o
+gre-y := gre_demux.o
+fou-y := fou_core.o fou_nl.o fou_bpf.o
+obj-$(CONFIG_NET_FOU) += fou.o
+obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o
obj-$(CONFIG_NET_IPGRE) += ip_gre.o
+udp_tunnel-y := udp_tunnel_core.o udp_tunnel_nic.o
+obj-$(CONFIG_NET_UDP_TUNNEL) += udp_tunnel.o
+obj-$(CONFIG_NET_IPVTI) += ip_vti.o
obj-$(CONFIG_SYN_COOKIES) += syncookies.o
obj-$(CONFIG_INET_AH) += ah4.o
obj-$(CONFIG_INET_ESP) += esp4.o
+obj-$(CONFIG_INET_ESP_OFFLOAD) += esp4_offload.o
obj-$(CONFIG_INET_IPCOMP) += ipcomp.o
obj-$(CONFIG_INET_XFRM_TUNNEL) += xfrm4_tunnel.o
-obj-$(CONFIG_INET_XFRM_MODE_BEET) += xfrm4_mode_beet.o
obj-$(CONFIG_INET_TUNNEL) += tunnel4.o
-obj-$(CONFIG_INET_XFRM_MODE_TRANSPORT) += xfrm4_mode_transport.o
-obj-$(CONFIG_INET_XFRM_MODE_TUNNEL) += xfrm4_mode_tunnel.o
obj-$(CONFIG_IP_PNP) += ipconfig.o
-obj-$(CONFIG_IP_ROUTE_MULTIPATH_RR) += multipath_rr.o
-obj-$(CONFIG_IP_ROUTE_MULTIPATH_RANDOM) += multipath_random.o
-obj-$(CONFIG_IP_ROUTE_MULTIPATH_WRANDOM) += multipath_wrandom.o
-obj-$(CONFIG_IP_ROUTE_MULTIPATH_DRR) += multipath_drr.o
obj-$(CONFIG_NETFILTER) += netfilter.o netfilter/
-obj-$(CONFIG_IP_VS) += ipvs/
-obj-$(CONFIG_INET_DIAG) += inet_diag.o
-obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o
+obj-$(CONFIG_INET_DIAG) += inet_diag.o
obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
-obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o
+obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o
+obj-$(CONFIG_INET_RAW_DIAG) += raw_diag.o
+obj-$(CONFIG_TCP_CONG_BBR) += tcp_bbr.o
obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
+obj-$(CONFIG_TCP_CONG_CDG) += tcp_cdg.o
obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o
+obj-$(CONFIG_TCP_CONG_DCTCP) += tcp_dctcp.o
obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o
obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o
obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o
obj-$(CONFIG_TCP_CONG_HTCP) += tcp_htcp.o
obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o
+obj-$(CONFIG_TCP_CONG_NV) += tcp_nv.o
obj-$(CONFIG_TCP_CONG_VENO) += tcp_veno.o
obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o
obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o
+obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o
+obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o
+obj-$(CONFIG_TCP_SIGPOOL) += tcp_sigpool.o
+obj-$(CONFIG_NET_SOCK_MSG) += tcp_bpf.o
+obj-$(CONFIG_BPF_SYSCALL) += udp_bpf.o
obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
- xfrm4_output.o
+ xfrm4_output.o xfrm4_protocol.o
+obj-$(CONFIG_TCP_AO) += tcp_ao.o
+
+ifeq ($(CONFIG_BPF_JIT),y)
+obj-$(CONFIG_BPF_SYSCALL) += bpf_tcp_ca.o
+endif
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 1144900d37f6..08d811f11896 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -5,8 +6,6 @@
*
* PF_INET protocol family socket handler.
*
- * Version: $Id: af_inet.c,v 1.137 2002/02/01 22:01:03 davem Exp $
- *
* Authors: Ross Biro
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
* Florian La Roche, <flla@stud.uni-sb.de>
@@ -60,20 +59,17 @@
* Some other random speedups.
* Cyrus Durgin : Cleaned up file for kmod hacks.
* Andi Kleen : Fix inet_stream_connect TCP race.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
+#define pr_fmt(fmt) "IPv4: " fmt
+
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/kmod.h>
#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/string.h>
@@ -87,38 +83,46 @@
#include <linux/init.h>
#include <linux/poll.h>
#include <linux/netfilter_ipv4.h>
+#include <linux/random.h>
+#include <linux/slab.h>
-#include <asm/uaccess.h>
-#include <asm/system.h>
+#include <linux/uaccess.h>
-#include <linux/smp_lock.h>
#include <linux/inet.h>
#include <linux/igmp.h>
#include <linux/inetdevice.h>
#include <linux/netdevice.h>
+#include <net/checksum.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/arp.h>
#include <net/route.h>
#include <net/ip_fib.h>
#include <net/inet_connection_sock.h>
+#include <net/gro.h>
+#include <net/gso.h>
#include <net/tcp.h>
+#include <net/psp.h>
#include <net/udp.h>
#include <net/udplite.h>
+#include <net/ping.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/raw.h>
#include <net/icmp.h>
-#include <net/ipip.h>
#include <net/inet_common.h>
+#include <net/ip_tunnels.h>
#include <net/xfrm.h>
+#include <net/net_namespace.h>
+#include <net/secure_seq.h>
#ifdef CONFIG_IP_MROUTE
#include <linux/mroute.h>
#endif
+#include <net/l3mdev.h>
+#include <net/compat.h>
+#include <net/rps.h>
-DEFINE_SNMP_STAT(struct linux_mib, net_statistics) __read_mostly;
-
-extern void ip_mc_drop_socket(struct sock *sk);
+#include <trace/events/sock.h>
/* The inetsw table contains everything that inet_create needs to
* build a new socket.
@@ -135,25 +139,29 @@ void inet_sock_destruct(struct sock *sk)
__skb_queue_purge(&sk->sk_receive_queue);
__skb_queue_purge(&sk->sk_error_queue);
+ sk_mem_reclaim_final(sk);
+
if (sk->sk_type == SOCK_STREAM && sk->sk_state != TCP_CLOSE) {
- printk("Attempt to release TCP socket in state %d %p\n",
+ pr_err("Attempt to release TCP socket in state %d %p\n",
sk->sk_state, sk);
return;
}
if (!sock_flag(sk, SOCK_DEAD)) {
- printk("Attempt to release alive inet socket %p\n", sk);
+ pr_err("Attempt to release alive inet socket %p\n", sk);
return;
}
- BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc));
- BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));
- BUG_TRAP(!sk->sk_wmem_queued);
- BUG_TRAP(!sk->sk_forward_alloc);
+ WARN_ON_ONCE(atomic_read(&sk->sk_rmem_alloc));
+ WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
+ WARN_ON_ONCE(sk->sk_wmem_queued);
+ WARN_ON_ONCE(sk->sk_forward_alloc);
- kfree(inet->opt);
- dst_release(sk->sk_dst_cache);
- sk_refcnt_debug_dec(sk);
+ kfree(rcu_dereference_protected(inet->inet_opt, 1));
+ dst_release(rcu_dereference_protected(sk->sk_dst_cache, 1));
+ dst_release(rcu_dereference_protected(sk->sk_rx_dst, 1));
+ psp_sk_assoc_free(sk);
}
+EXPORT_SYMBOL(inet_sock_destruct);
/*
* The routines beyond this point handle the behaviour of an AF_INET
@@ -171,78 +179,101 @@ static int inet_autobind(struct sock *sk)
/* We may need to bind the socket. */
lock_sock(sk);
inet = inet_sk(sk);
- if (!inet->num) {
+ if (!inet->inet_num) {
if (sk->sk_prot->get_port(sk, 0)) {
release_sock(sk);
return -EAGAIN;
}
- inet->sport = htons(inet->num);
+ inet->inet_sport = htons(inet->inet_num);
}
release_sock(sk);
return 0;
}
+int __inet_listen_sk(struct sock *sk, int backlog)
+{
+ unsigned char old_state = sk->sk_state;
+ int err, tcp_fastopen;
+
+ if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN)))
+ return -EINVAL;
+
+ WRITE_ONCE(sk->sk_max_ack_backlog, backlog);
+ /* Really, if the socket is already in listen state
+ * we can only allow the backlog to be adjusted.
+ */
+ if (old_state != TCP_LISTEN) {
+ /* Enable TFO w/o requiring TCP_FASTOPEN socket option.
+ * Note that only TCP sockets (SOCK_STREAM) will reach here.
+ * Also fastopen backlog may already been set via the option
+ * because the socket was in TCP_LISTEN state previously but
+ * was shutdown() rather than close().
+ */
+ tcp_fastopen = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen);
+ if ((tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) &&
+ (tcp_fastopen & TFO_SERVER_ENABLE) &&
+ !inet_csk(sk)->icsk_accept_queue.fastopenq.max_qlen) {
+ fastopen_queue_tune(sk, backlog);
+ tcp_fastopen_init_key_once(sock_net(sk));
+ }
+
+ err = inet_csk_listen_start(sk);
+ if (err)
+ return err;
+
+ tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_LISTEN_CB, 0, NULL);
+ }
+ return 0;
+}
+
/*
* Move a socket into listening state.
*/
int inet_listen(struct socket *sock, int backlog)
{
struct sock *sk = sock->sk;
- unsigned char old_state;
- int err;
+ int err = -EINVAL;
lock_sock(sk);
- err = -EINVAL;
if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM)
goto out;
- old_state = sk->sk_state;
- if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN)))
- goto out;
-
- /* Really, if the socket is already in listen state
- * we can only allow the backlog to be adjusted.
- */
- if (old_state != TCP_LISTEN) {
- err = inet_csk_listen_start(sk, backlog);
- if (err)
- goto out;
- }
- sk->sk_max_ack_backlog = backlog;
- err = 0;
+ err = __inet_listen_sk(sk, backlog);
out:
release_sock(sk);
return err;
}
+EXPORT_SYMBOL(inet_listen);
/*
* Create an inet socket.
*/
-static int inet_create(struct socket *sock, int protocol)
+static int inet_create(struct net *net, struct socket *sock, int protocol,
+ int kern)
{
struct sock *sk;
- struct list_head *p;
struct inet_protosw *answer;
struct inet_sock *inet;
struct proto *answer_prot;
unsigned char answer_flags;
- char answer_no_check;
int try_loading_module = 0;
int err;
+ if (protocol < 0 || protocol >= IPPROTO_MAX)
+ return -EINVAL;
+
sock->state = SS_UNCONNECTED;
/* Look for the requested type/protocol pair. */
- answer = NULL;
lookup_protocol:
err = -ESOCKTNOSUPPORT;
rcu_read_lock();
- list_for_each_rcu(p, &inetsw[sock->type]) {
- answer = list_entry(p, struct inet_protosw, list);
+ list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {
+ err = 0;
/* Check the non-wild match. */
if (protocol == answer->protocol) {
if (protocol != IPPROTO_IP)
@@ -257,10 +288,9 @@ lookup_protocol:
break;
}
err = -EPROTONOSUPPORT;
- answer = NULL;
}
- if (unlikely(answer == NULL)) {
+ if (unlikely(err)) {
if (try_loading_module < 2) {
rcu_read_unlock();
/*
@@ -283,79 +313,95 @@ lookup_protocol:
}
err = -EPERM;
- if (answer->capability > 0 && !capable(answer->capability))
+ if (sock->type == SOCK_RAW && !kern &&
+ !ns_capable(net->user_ns, CAP_NET_RAW))
goto out_rcu_unlock;
sock->ops = answer->ops;
answer_prot = answer->prot;
- answer_no_check = answer->no_check;
answer_flags = answer->flags;
rcu_read_unlock();
- BUG_TRAP(answer_prot->slab != NULL);
+ WARN_ON(!answer_prot->slab);
- err = -ENOBUFS;
- sk = sk_alloc(PF_INET, GFP_KERNEL, answer_prot, 1);
- if (sk == NULL)
+ err = -ENOMEM;
+ sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, kern);
+ if (!sk)
goto out;
err = 0;
- sk->sk_no_check = answer_no_check;
if (INET_PROTOSW_REUSE & answer_flags)
- sk->sk_reuse = 1;
+ sk->sk_reuse = SK_CAN_REUSE;
+
+ if (INET_PROTOSW_ICSK & answer_flags)
+ inet_init_csk_locks(sk);
inet = inet_sk(sk);
- inet->is_icsk = INET_PROTOSW_ICSK & answer_flags;
+ inet_assign_bit(IS_ICSK, sk, INET_PROTOSW_ICSK & answer_flags);
+
+ inet_clear_bit(NODEFRAG, sk);
if (SOCK_RAW == sock->type) {
- inet->num = protocol;
+ inet->inet_num = protocol;
if (IPPROTO_RAW == protocol)
- inet->hdrincl = 1;
+ inet_set_bit(HDRINCL, sk);
}
- if (ipv4_config.no_pmtu_disc)
+ if (READ_ONCE(net->ipv4.sysctl_ip_no_pmtu_disc))
inet->pmtudisc = IP_PMTUDISC_DONT;
else
inet->pmtudisc = IP_PMTUDISC_WANT;
- inet->id = 0;
+ atomic_set(&inet->inet_id, 0);
sock_init_data(sock, sk);
sk->sk_destruct = inet_sock_destruct;
- sk->sk_family = PF_INET;
sk->sk_protocol = protocol;
sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;
+ sk->sk_txrehash = READ_ONCE(net->core.sysctl_txrehash);
inet->uc_ttl = -1;
- inet->mc_loop = 1;
+ inet_set_bit(MC_LOOP, sk);
inet->mc_ttl = 1;
+ inet_set_bit(MC_ALL, sk);
inet->mc_index = 0;
inet->mc_list = NULL;
+ inet->rcv_tos = 0;
- sk_refcnt_debug_inc(sk);
-
- if (inet->num) {
+ if (inet->inet_num) {
/* It assumes that any protocol which allows
* the user to assign a number at socket
* creation time automatically
* shares.
*/
- inet->sport = htons(inet->num);
+ inet->inet_sport = htons(inet->inet_num);
/* Add to protocol hash chains. */
- sk->sk_prot->hash(sk);
+ err = sk->sk_prot->hash(sk);
+ if (err)
+ goto out_sk_release;
}
if (sk->sk_prot->init) {
err = sk->sk_prot->init(sk);
if (err)
- sk_common_release(sk);
+ goto out_sk_release;
+ }
+
+ if (!kern) {
+ err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk);
+ if (err)
+ goto out_sk_release;
}
out:
return err;
out_rcu_unlock:
rcu_read_unlock();
goto out;
+out_sk_release:
+ sk_common_release(sk);
+ sock->sk = NULL;
+ goto out;
}
@@ -371,6 +417,9 @@ int inet_release(struct socket *sock)
if (sk) {
long timeout;
+ if (!sk->sk_kern_sock)
+ BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk);
+
/* Applications forget to leave groups before exiting */
ip_mc_drop_socket(sk);
@@ -385,34 +434,65 @@ int inet_release(struct socket *sock)
if (sock_flag(sk, SOCK_LINGER) &&
!(current->flags & PF_EXITING))
timeout = sk->sk_lingertime;
- sock->sk = NULL;
sk->sk_prot->close(sk, timeout);
+ sock->sk = NULL;
}
return 0;
}
+EXPORT_SYMBOL(inet_release);
+
+int inet_bind_sk(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len)
+{
+ u32 flags = BIND_WITH_LOCK;
+ int err;
+
+ /* If the socket has its own bind function then use it. (RAW) */
+ if (sk->sk_prot->bind) {
+ return sk->sk_prot->bind(sk, uaddr, addr_len);
+ }
+ if (addr_len < sizeof(struct sockaddr_in))
+ return -EINVAL;
+
+ /* BPF prog is run before any checks are done so that if the prog
+ * changes context in a wrong way it will be caught.
+ */
+ err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, &addr_len,
+ CGROUP_INET4_BIND, &flags);
+ if (err)
+ return err;
+
+ return __inet_bind(sk, uaddr, addr_len, flags);
+}
-/* It is off by default, see below. */
-int sysctl_ip_nonlocal_bind __read_mostly;
+int inet_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len)
+{
+ return inet_bind_sk(sock->sk, uaddr, addr_len);
+}
+EXPORT_SYMBOL(inet_bind);
-int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+int __inet_bind(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len,
+ u32 flags)
{
struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
- struct sock *sk = sock->sk;
struct inet_sock *inet = inet_sk(sk);
+ struct net *net = sock_net(sk);
unsigned short snum;
int chk_addr_ret;
+ u32 tb_id = RT_TABLE_LOCAL;
int err;
- /* If the socket has its own bind function then use it. (RAW) */
- if (sk->sk_prot->bind) {
- err = sk->sk_prot->bind(sk, uaddr, addr_len);
- goto out;
+ if (addr->sin_family != AF_INET) {
+ /* Compatibility games : accept AF_UNSPEC (mapped to AF_INET)
+ * only if s_addr is INADDR_ANY.
+ */
+ err = -EAFNOSUPPORT;
+ if (addr->sin_family != AF_UNSPEC ||
+ addr->sin_addr.s_addr != htonl(INADDR_ANY))
+ goto out;
}
- err = -EINVAL;
- if (addr_len < sizeof(struct sockaddr_in))
- goto out;
- chk_addr_ret = inet_addr_type(addr->sin_addr.s_addr);
+ tb_id = l3mdev_fib_table_by_index(net, sk->sk_bound_dev_if) ? : tb_id;
+ chk_addr_ret = inet_addr_type_table(net, addr->sin_addr.s_addr, tb_id);
/* Not specified by any standard per-se, however it breaks too
* many applications when removed. It is unfortunate since
@@ -422,17 +502,15 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
* is temporarily down)
*/
err = -EADDRNOTAVAIL;
- if (!sysctl_ip_nonlocal_bind &&
- !inet->freebind &&
- addr->sin_addr.s_addr != INADDR_ANY &&
- chk_addr_ret != RTN_LOCAL &&
- chk_addr_ret != RTN_MULTICAST &&
- chk_addr_ret != RTN_BROADCAST)
+ if (!inet_addr_valid_or_nonlocal(net, inet, addr->sin_addr.s_addr,
+ chk_addr_ret))
goto out;
snum = ntohs(addr->sin_port);
err = -EACCES;
- if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))
+ if (!(flags & BIND_NO_CAP_NET_BIND_SERVICE) &&
+ snum && inet_port_requires_bind_service(net, snum) &&
+ !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
goto out;
/* We keep a pair of addresses. rcv_saddr is the one
@@ -442,57 +520,87 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
* would be illegal to use them (multicast/broadcast) in
* which case the sending device address is used.
*/
- lock_sock(sk);
+ if (flags & BIND_WITH_LOCK)
+ lock_sock(sk);
/* Check these errors (active socket, double bind). */
err = -EINVAL;
- if (sk->sk_state != TCP_CLOSE || inet->num)
+ if (sk->sk_state != TCP_CLOSE || inet->inet_num)
goto out_release_sock;
- inet->rcv_saddr = inet->saddr = addr->sin_addr.s_addr;
+ inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr;
if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
- inet->saddr = 0; /* Use device */
+ inet->inet_saddr = 0; /* Use device */
/* Make sure we are allowed to bind here. */
- if (sk->sk_prot->get_port(sk, snum)) {
- inet->saddr = inet->rcv_saddr = 0;
- err = -EADDRINUSE;
- goto out_release_sock;
+ if (snum || !(inet_test_bit(BIND_ADDRESS_NO_PORT, sk) ||
+ (flags & BIND_FORCE_ADDRESS_NO_PORT))) {
+ err = sk->sk_prot->get_port(sk, snum);
+ if (err) {
+ inet->inet_saddr = inet->inet_rcv_saddr = 0;
+ goto out_release_sock;
+ }
+ if (!(flags & BIND_FROM_BPF)) {
+ err = BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk);
+ if (err) {
+ inet->inet_saddr = inet->inet_rcv_saddr = 0;
+ if (sk->sk_prot->put_port)
+ sk->sk_prot->put_port(sk);
+ goto out_release_sock;
+ }
+ }
}
- if (inet->rcv_saddr)
+ if (inet->inet_rcv_saddr)
sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
if (snum)
sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
- inet->sport = htons(inet->num);
- inet->daddr = 0;
- inet->dport = 0;
+ inet->inet_sport = htons(inet->inet_num);
+ inet->inet_daddr = 0;
+ inet->inet_dport = 0;
sk_dst_reset(sk);
err = 0;
out_release_sock:
- release_sock(sk);
+ if (flags & BIND_WITH_LOCK)
+ release_sock(sk);
out:
return err;
}
-int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr,
+int inet_dgram_connect(struct socket *sock, struct sockaddr_unsized *uaddr,
int addr_len, int flags)
{
struct sock *sk = sock->sk;
+ const struct proto *prot;
+ int err;
+
+ if (addr_len < sizeof(uaddr->sa_family))
+ return -EINVAL;
+
+ /* IPV6_ADDRFORM can change sk->sk_prot under us. */
+ prot = READ_ONCE(sk->sk_prot);
if (uaddr->sa_family == AF_UNSPEC)
- return sk->sk_prot->disconnect(sk, flags);
+ return prot->disconnect(sk, flags);
- if (!inet_sk(sk)->num && inet_autobind(sk))
+ if (BPF_CGROUP_PRE_CONNECT_ENABLED(sk)) {
+ err = prot->pre_connect(sk, uaddr, addr_len);
+ if (err)
+ return err;
+ }
+
+ if (data_race(!inet_sk(sk)->inet_num) && inet_autobind(sk))
return -EAGAIN;
- return sk->sk_prot->connect(sk, (struct sockaddr *)uaddr, addr_len);
+ return prot->connect(sk, uaddr, addr_len);
}
+EXPORT_SYMBOL(inet_dgram_connect);
-static long inet_wait_for_connect(struct sock *sk, long timeo)
+static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias)
{
- DEFINE_WAIT(wait);
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
- prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+ add_wait_queue(sk_sleep(sk), &wait);
+ sk->sk_write_pending += writebias;
/* Basic assumption: if someone sets sk->sk_err, he _must_
* change state of the socket from TCP_SYN_*.
@@ -501,13 +609,13 @@ static long inet_wait_for_connect(struct sock *sk, long timeo)
*/
while ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
release_sock(sk);
- timeo = schedule_timeout(timeo);
+ timeo = wait_woken(&wait, TASK_INTERRUPTIBLE, timeo);
lock_sock(sk);
if (signal_pending(current) || !timeo)
break;
- prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
}
- finish_wait(sk->sk_sleep, &wait);
+ remove_wait_queue(sk_sleep(sk), &wait);
+ sk->sk_write_pending -= writebias;
return timeo;
}
@@ -515,19 +623,32 @@ static long inet_wait_for_connect(struct sock *sk, long timeo)
* Connect to a remote host. There is regrettably still a little
* TCP 'magic' in here.
*/
-int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
- int addr_len, int flags)
+int __inet_stream_connect(struct socket *sock, struct sockaddr_unsized *uaddr,
+ int addr_len, int flags, int is_sendmsg)
{
struct sock *sk = sock->sk;
int err;
long timeo;
- lock_sock(sk);
-
- if (uaddr->sa_family == AF_UNSPEC) {
- err = sk->sk_prot->disconnect(sk, flags);
- sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
- goto out;
+ /*
+ * uaddr can be NULL and addr_len can be 0 if:
+ * sk is a TCP fastopen active socket and
+ * TCP_FASTOPEN_CONNECT sockopt is set and
+ * we already have a valid cookie for this socket.
+ * In this case, user can call write() after connect().
+ * write() will invoke tcp_sendmsg_fastopen() which calls
+ * __inet_stream_connect().
+ */
+ if (uaddr) {
+ if (addr_len < sizeof(uaddr->sa_family))
+ return -EINVAL;
+
+ if (uaddr->sa_family == AF_UNSPEC) {
+ sk->sk_disconnects++;
+ err = sk->sk_prot->disconnect(sk, flags);
+ sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
+ goto out;
+ }
}
switch (sock->state) {
@@ -538,7 +659,10 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
err = -EISCONN;
goto out;
case SS_CONNECTING:
- err = -EALREADY;
+ if (inet_test_bit(DEFER_CONNECT, sk))
+ err = is_sendmsg ? -EINPROGRESS : -EISCONN;
+ else
+ err = -EALREADY;
/* Fall out of switch with err, set for this state */
break;
case SS_UNCONNECTED:
@@ -546,11 +670,20 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
if (sk->sk_state != TCP_CLOSE)
goto out;
+ if (BPF_CGROUP_PRE_CONNECT_ENABLED(sk)) {
+ err = sk->sk_prot->pre_connect(sk, uaddr, addr_len);
+ if (err)
+ goto out;
+ }
+
err = sk->sk_prot->connect(sk, uaddr, addr_len);
if (err < 0)
goto out;
- sock->state = SS_CONNECTING;
+ sock->state = SS_CONNECTING;
+
+ if (!err && inet_test_bit(DEFER_CONNECT, sk))
+ goto out;
/* Just entered SS_CONNECTING state; the only
* difference is that return value in non-blocking
@@ -563,13 +696,23 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
+ int writebias = (sk->sk_protocol == IPPROTO_TCP) &&
+ tcp_sk(sk)->fastopen_req &&
+ tcp_sk(sk)->fastopen_req->data ? 1 : 0;
+ int dis = sk->sk_disconnects;
+
/* Error code is set above */
- if (!timeo || !inet_wait_for_connect(sk, timeo))
+ if (!timeo || !inet_wait_for_connect(sk, timeo, writebias))
goto out;
err = sock_intr_errno(timeo);
if (signal_pending(current))
goto out;
+
+ if (dis != sk->sk_disconnects) {
+ err = -EPIPE;
+ goto out;
+ }
}
/* Connection was closed by RST, timeout, ICMP error
@@ -586,101 +729,172 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
sock->state = SS_CONNECTED;
err = 0;
out:
- release_sock(sk);
return err;
sock_error:
err = sock_error(sk) ? : -ECONNABORTED;
sock->state = SS_UNCONNECTED;
+ sk->sk_disconnects++;
if (sk->sk_prot->disconnect(sk, flags))
sock->state = SS_DISCONNECTING;
goto out;
}
+EXPORT_SYMBOL(__inet_stream_connect);
+
+int inet_stream_connect(struct socket *sock, struct sockaddr_unsized *uaddr,
+ int addr_len, int flags)
+{
+ int err;
+
+ lock_sock(sock->sk);
+ err = __inet_stream_connect(sock, uaddr, addr_len, flags, 0);
+ release_sock(sock->sk);
+ return err;
+}
+EXPORT_SYMBOL(inet_stream_connect);
+
+void __inet_accept(struct socket *sock, struct socket *newsock, struct sock *newsk)
+{
+ if (mem_cgroup_sockets_enabled) {
+ mem_cgroup_sk_alloc(newsk);
+ __sk_charge(newsk, GFP_KERNEL);
+ }
+
+ sock_rps_record_flow(newsk);
+ WARN_ON(!((1 << newsk->sk_state) &
+ (TCPF_ESTABLISHED | TCPF_SYN_RECV |
+ TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 |
+ TCPF_CLOSING | TCPF_CLOSE_WAIT |
+ TCPF_CLOSE)));
+
+ if (test_bit(SOCK_SUPPORT_ZC, &sock->flags))
+ set_bit(SOCK_SUPPORT_ZC, &newsock->flags);
+ sock_graft(newsk, newsock);
+
+ newsock->state = SS_CONNECTED;
+}
+EXPORT_SYMBOL_GPL(__inet_accept);
/*
* Accept a pending connection. The TCP layer now gives BSD semantics.
*/
-int inet_accept(struct socket *sock, struct socket *newsock, int flags)
+int inet_accept(struct socket *sock, struct socket *newsock,
+ struct proto_accept_arg *arg)
{
- struct sock *sk1 = sock->sk;
- int err = -EINVAL;
- struct sock *sk2 = sk1->sk_prot->accept(sk1, flags, &err);
+ struct sock *sk1 = sock->sk, *sk2;
+ /* IPV6_ADDRFORM can change sk->sk_prot under us. */
+ arg->err = -EINVAL;
+ sk2 = READ_ONCE(sk1->sk_prot)->accept(sk1, arg);
if (!sk2)
- goto do_err;
+ return arg->err;
lock_sock(sk2);
-
- BUG_TRAP((1 << sk2->sk_state) &
- (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_CLOSE));
-
- sock_graft(sk2, newsock);
-
- newsock->state = SS_CONNECTED;
- err = 0;
+ __inet_accept(sock, newsock, sk2);
release_sock(sk2);
-do_err:
- return err;
+ return 0;
}
-
+EXPORT_SYMBOL(inet_accept);
/*
* This does both peername and sockname.
*/
int inet_getname(struct socket *sock, struct sockaddr *uaddr,
- int *uaddr_len, int peer)
+ int peer)
{
struct sock *sk = sock->sk;
struct inet_sock *inet = inet_sk(sk);
- struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
+ DECLARE_SOCKADDR(struct sockaddr_in *, sin, uaddr);
+ int sin_addr_len = sizeof(*sin);
sin->sin_family = AF_INET;
+ lock_sock(sk);
if (peer) {
- if (!inet->dport ||
+ if (!inet->inet_dport ||
(((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) &&
- peer == 1))
+ peer == 1)) {
+ release_sock(sk);
return -ENOTCONN;
- sin->sin_port = inet->dport;
- sin->sin_addr.s_addr = inet->daddr;
+ }
+ sin->sin_port = inet->inet_dport;
+ sin->sin_addr.s_addr = inet->inet_daddr;
+ BPF_CGROUP_RUN_SA_PROG(sk, sin, &sin_addr_len,
+ CGROUP_INET4_GETPEERNAME);
} else {
- __be32 addr = inet->rcv_saddr;
+ __be32 addr = inet->inet_rcv_saddr;
if (!addr)
- addr = inet->saddr;
- sin->sin_port = inet->sport;
+ addr = inet->inet_saddr;
+ sin->sin_port = inet->inet_sport;
sin->sin_addr.s_addr = addr;
+ BPF_CGROUP_RUN_SA_PROG(sk, sin, &sin_addr_len,
+ CGROUP_INET4_GETSOCKNAME);
}
+ release_sock(sk);
memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
- *uaddr_len = sizeof(*sin);
- return 0;
+ return sin_addr_len;
}
+EXPORT_SYMBOL(inet_getname);
-int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
- size_t size)
+int inet_send_prepare(struct sock *sk)
{
- struct sock *sk = sock->sk;
+ sock_rps_record_flow(sk);
/* We may need to bind the socket. */
- if (!inet_sk(sk)->num && inet_autobind(sk))
+ if (data_race(!inet_sk(sk)->inet_num) && !sk->sk_prot->no_autobind &&
+ inet_autobind(sk))
return -EAGAIN;
- return sk->sk_prot->sendmsg(iocb, sk, msg, size);
+ return 0;
}
+EXPORT_SYMBOL_GPL(inet_send_prepare);
-
-static ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
+int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
{
struct sock *sk = sock->sk;
- /* We may need to bind the socket. */
- if (!inet_sk(sk)->num && inet_autobind(sk))
+ if (unlikely(inet_send_prepare(sk)))
return -EAGAIN;
- if (sk->sk_prot->sendpage)
- return sk->sk_prot->sendpage(sk, page, offset, size, flags);
- return sock_no_sendpage(sock, page, offset, size, flags);
+ return INDIRECT_CALL_2(sk->sk_prot->sendmsg, tcp_sendmsg, udp_sendmsg,
+ sk, msg, size);
}
+EXPORT_SYMBOL(inet_sendmsg);
+void inet_splice_eof(struct socket *sock)
+{
+ const struct proto *prot;
+ struct sock *sk = sock->sk;
+
+ if (unlikely(inet_send_prepare(sk)))
+ return;
+
+ /* IPV6_ADDRFORM can change sk->sk_prot under us. */
+ prot = READ_ONCE(sk->sk_prot);
+ if (prot->splice_eof)
+ prot->splice_eof(sock);
+}
+EXPORT_SYMBOL_GPL(inet_splice_eof);
+
+INDIRECT_CALLABLE_DECLARE(int udp_recvmsg(struct sock *, struct msghdr *,
+ size_t, int, int *));
+int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
+ int flags)
+{
+ struct sock *sk = sock->sk;
+ int addr_len = 0;
+ int err;
+
+ if (likely(!(flags & MSG_ERRQUEUE)))
+ sock_rps_record_flow(sk);
+
+ err = INDIRECT_CALL_2(sk->sk_prot->recvmsg, tcp_recvmsg, udp_recvmsg,
+ sk, msg, size, flags, &addr_len);
+ if (err >= 0)
+ msg->msg_namelen = addr_len;
+ return err;
+}
+EXPORT_SYMBOL(inet_recvmsg);
int inet_shutdown(struct socket *sock, int how)
{
@@ -709,9 +923,10 @@ int inet_shutdown(struct socket *sock, int how)
case TCP_CLOSE:
err = -ENOTCONN;
/* Hack to wake up other listeners, who can poll for
- POLLHUP, even on eg. unconnected UDP sockets -- RR */
+ EPOLLHUP, even on eg. unconnected UDP sockets -- RR */
+ fallthrough;
default:
- sk->sk_shutdown |= how;
+ WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | how);
if (sk->sk_prot->shutdown)
sk->sk_prot->shutdown(sk, how);
break;
@@ -723,7 +938,7 @@ int inet_shutdown(struct socket *sock, int how)
case TCP_LISTEN:
if (!(how & RCV_SHUTDOWN))
break;
- /* Fall through */
+ fallthrough;
case TCP_SYN_SENT:
err = sk->sk_prot->disconnect(sk, O_NONBLOCK);
sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
@@ -735,6 +950,7 @@ int inet_shutdown(struct socket *sock, int how)
release_sock(sk);
return err;
}
+EXPORT_SYMBOL(inet_shutdown);
/*
* ioctl() calls you can issue on an INET socket. Most of these are
@@ -750,43 +966,96 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
struct sock *sk = sock->sk;
int err = 0;
+ struct net *net = sock_net(sk);
+ void __user *p = (void __user *)arg;
+ struct ifreq ifr;
+ struct rtentry rt;
switch (cmd) {
- case SIOCGSTAMP:
- err = sock_get_timestamp(sk, (struct timeval __user *)arg);
- break;
- case SIOCADDRT:
- case SIOCDELRT:
- case SIOCRTMSG:
- err = ip_rt_ioctl(cmd, (void __user *)arg);
- break;
- case SIOCDARP:
- case SIOCGARP:
- case SIOCSARP:
- err = arp_ioctl(cmd, (void __user *)arg);
- break;
- case SIOCGIFADDR:
- case SIOCSIFADDR:
- case SIOCGIFBRDADDR:
- case SIOCSIFBRDADDR:
- case SIOCGIFNETMASK:
- case SIOCSIFNETMASK:
- case SIOCGIFDSTADDR:
- case SIOCSIFDSTADDR:
- case SIOCSIFPFLAGS:
- case SIOCGIFPFLAGS:
- case SIOCSIFFLAGS:
- err = devinet_ioctl(cmd, (void __user *)arg);
- break;
- default:
- if (sk->sk_prot->ioctl)
- err = sk->sk_prot->ioctl(sk, cmd, arg);
- else
- err = -ENOIOCTLCMD;
- break;
+ case SIOCADDRT:
+ case SIOCDELRT:
+ if (copy_from_user(&rt, p, sizeof(struct rtentry)))
+ return -EFAULT;
+ err = ip_rt_ioctl(net, cmd, &rt);
+ break;
+ case SIOCRTMSG:
+ err = -EINVAL;
+ break;
+ case SIOCDARP:
+ case SIOCGARP:
+ case SIOCSARP:
+ err = arp_ioctl(net, cmd, (void __user *)arg);
+ break;
+ case SIOCGIFADDR:
+ case SIOCGIFBRDADDR:
+ case SIOCGIFNETMASK:
+ case SIOCGIFDSTADDR:
+ case SIOCGIFPFLAGS:
+ if (get_user_ifreq(&ifr, NULL, p))
+ return -EFAULT;
+ err = devinet_ioctl(net, cmd, &ifr);
+ if (!err && put_user_ifreq(&ifr, p))
+ err = -EFAULT;
+ break;
+
+ case SIOCSIFADDR:
+ case SIOCSIFBRDADDR:
+ case SIOCSIFNETMASK:
+ case SIOCSIFDSTADDR:
+ case SIOCSIFPFLAGS:
+ case SIOCSIFFLAGS:
+ if (get_user_ifreq(&ifr, NULL, p))
+ return -EFAULT;
+ err = devinet_ioctl(net, cmd, &ifr);
+ break;
+ default:
+ if (sk->sk_prot->ioctl)
+ err = sk_ioctl(sk, cmd, (void __user *)arg);
+ else
+ err = -ENOIOCTLCMD;
+ break;
}
return err;
}
+EXPORT_SYMBOL(inet_ioctl);
+
+#ifdef CONFIG_COMPAT
+static int inet_compat_routing_ioctl(struct sock *sk, unsigned int cmd,
+ struct compat_rtentry __user *ur)
+{
+ compat_uptr_t rtdev;
+ struct rtentry rt;
+
+ if (copy_from_user(&rt.rt_dst, &ur->rt_dst,
+ 3 * sizeof(struct sockaddr)) ||
+ get_user(rt.rt_flags, &ur->rt_flags) ||
+ get_user(rt.rt_metric, &ur->rt_metric) ||
+ get_user(rt.rt_mtu, &ur->rt_mtu) ||
+ get_user(rt.rt_window, &ur->rt_window) ||
+ get_user(rt.rt_irtt, &ur->rt_irtt) ||
+ get_user(rtdev, &ur->rt_dev))
+ return -EFAULT;
+
+ rt.rt_dev = compat_ptr(rtdev);
+ return ip_rt_ioctl(sock_net(sk), cmd, &rt);
+}
+
+static int inet_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+ void __user *argp = compat_ptr(arg);
+ struct sock *sk = sock->sk;
+
+ switch (cmd) {
+ case SIOCADDRT:
+ case SIOCDELRT:
+ return inet_compat_routing_ioctl(sk, cmd, argp);
+ default:
+ if (!sk->sk_prot->compat_ioctl)
+ return -ENOIOCTLCMD;
+ return sk->sk_prot->compat_ioctl(sk, cmd, arg);
+ }
+}
+#endif /* CONFIG_COMPAT */
const struct proto_ops inet_stream_ops = {
.family = PF_INET,
@@ -799,19 +1068,29 @@ const struct proto_ops inet_stream_ops = {
.getname = inet_getname,
.poll = tcp_poll,
.ioctl = inet_ioctl,
+ .gettstamp = sock_gettstamp,
.listen = inet_listen,
.shutdown = inet_shutdown,
.setsockopt = sock_common_setsockopt,
.getsockopt = sock_common_getsockopt,
.sendmsg = inet_sendmsg,
- .recvmsg = sock_common_recvmsg,
- .mmap = sock_no_mmap,
- .sendpage = tcp_sendpage,
+ .recvmsg = inet_recvmsg,
+#ifdef CONFIG_MMU
+ .mmap = tcp_mmap,
+#endif
+ .splice_eof = inet_splice_eof,
+ .splice_read = tcp_splice_read,
+ .set_peek_off = sk_set_peek_off,
+ .read_sock = tcp_read_sock,
+ .read_skb = tcp_read_skb,
+ .sendmsg_locked = tcp_sendmsg_locked,
+ .peek_len = tcp_peek_len,
#ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_sock_common_setsockopt,
- .compat_getsockopt = compat_sock_common_getsockopt,
+ .compat_ioctl = inet_compat_ioctl,
#endif
+ .set_rcvlowat = tcp_set_rcvlowat,
};
+EXPORT_SYMBOL(inet_stream_ops);
const struct proto_ops inet_dgram_ops = {
.family = PF_INET,
@@ -824,19 +1103,22 @@ const struct proto_ops inet_dgram_ops = {
.getname = inet_getname,
.poll = udp_poll,
.ioctl = inet_ioctl,
+ .gettstamp = sock_gettstamp,
.listen = sock_no_listen,
.shutdown = inet_shutdown,
.setsockopt = sock_common_setsockopt,
.getsockopt = sock_common_getsockopt,
.sendmsg = inet_sendmsg,
- .recvmsg = sock_common_recvmsg,
+ .read_skb = udp_read_skb,
+ .recvmsg = inet_recvmsg,
.mmap = sock_no_mmap,
- .sendpage = inet_sendpage,
+ .splice_eof = inet_splice_eof,
+ .set_peek_off = udp_set_peek_off,
#ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_sock_common_setsockopt,
- .compat_getsockopt = compat_sock_common_getsockopt,
+ .compat_ioctl = inet_compat_ioctl,
#endif
};
+EXPORT_SYMBOL(inet_dgram_ops);
/*
* For SOCK_RAW sockets; should be the same as inet_dgram_ops but without
@@ -853,21 +1135,21 @@ static const struct proto_ops inet_sockraw_ops = {
.getname = inet_getname,
.poll = datagram_poll,
.ioctl = inet_ioctl,
+ .gettstamp = sock_gettstamp,
.listen = sock_no_listen,
.shutdown = inet_shutdown,
.setsockopt = sock_common_setsockopt,
.getsockopt = sock_common_getsockopt,
.sendmsg = inet_sendmsg,
- .recvmsg = sock_common_recvmsg,
+ .recvmsg = inet_recvmsg,
.mmap = sock_no_mmap,
- .sendpage = inet_sendpage,
+ .splice_eof = inet_splice_eof,
#ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_sock_common_setsockopt,
- .compat_getsockopt = compat_sock_common_getsockopt,
+ .compat_ioctl = inet_compat_ioctl,
#endif
};
-static struct net_proto_family inet_family_ops = {
+static const struct net_proto_family inet_family_ops = {
.family = PF_INET,
.create = inet_create,
.owner = THIS_MODULE,
@@ -878,40 +1160,41 @@ static struct net_proto_family inet_family_ops = {
*/
static struct inet_protosw inetsw_array[] =
{
- {
- .type = SOCK_STREAM,
- .protocol = IPPROTO_TCP,
- .prot = &tcp_prot,
- .ops = &inet_stream_ops,
- .capability = -1,
- .no_check = 0,
- .flags = INET_PROTOSW_PERMANENT |
+ {
+ .type = SOCK_STREAM,
+ .protocol = IPPROTO_TCP,
+ .prot = &tcp_prot,
+ .ops = &inet_stream_ops,
+ .flags = INET_PROTOSW_PERMANENT |
INET_PROTOSW_ICSK,
- },
-
- {
- .type = SOCK_DGRAM,
- .protocol = IPPROTO_UDP,
- .prot = &udp_prot,
- .ops = &inet_dgram_ops,
- .capability = -1,
- .no_check = UDP_CSUM_DEFAULT,
- .flags = INET_PROTOSW_PERMANENT,
+ },
+
+ {
+ .type = SOCK_DGRAM,
+ .protocol = IPPROTO_UDP,
+ .prot = &udp_prot,
+ .ops = &inet_dgram_ops,
+ .flags = INET_PROTOSW_PERMANENT,
},
-
{
- .type = SOCK_RAW,
- .protocol = IPPROTO_IP, /* wild card */
- .prot = &raw_prot,
- .ops = &inet_sockraw_ops,
- .capability = CAP_NET_RAW,
- .no_check = UDP_CSUM_DEFAULT,
- .flags = INET_PROTOSW_REUSE,
+ .type = SOCK_DGRAM,
+ .protocol = IPPROTO_ICMP,
+ .prot = &ping_prot,
+ .ops = &inet_sockraw_ops,
+ .flags = INET_PROTOSW_REUSE,
+ },
+
+ {
+ .type = SOCK_RAW,
+ .protocol = IPPROTO_IP, /* wild card */
+ .prot = &raw_prot,
+ .ops = &inet_sockraw_ops,
+ .flags = INET_PROTOSW_REUSE,
}
};
-#define INETSW_ARRAY_LEN (sizeof(inetsw_array) / sizeof(struct inet_protosw))
+#define INETSW_ARRAY_LEN ARRAY_SIZE(inetsw_array)
void inet_register_protosw(struct inet_protosw *p)
{
@@ -926,54 +1209,44 @@ void inet_register_protosw(struct inet_protosw *p)
goto out_illegal;
/* If we are trying to override a permanent protocol, bail. */
- answer = NULL;
last_perm = &inetsw[p->type];
list_for_each(lh, &inetsw[p->type]) {
answer = list_entry(lh, struct inet_protosw, list);
-
/* Check only the non-wild match. */
- if (INET_PROTOSW_PERMANENT & answer->flags) {
- if (protocol == answer->protocol)
- break;
- last_perm = lh;
- }
-
- answer = NULL;
+ if ((INET_PROTOSW_PERMANENT & answer->flags) == 0)
+ break;
+ if (protocol == answer->protocol)
+ goto out_permanent;
+ last_perm = lh;
}
- if (answer)
- goto out_permanent;
/* Add the new entry after the last permanent entry if any, so that
* the new entry does not override a permanent entry when matched with
* a wild-card protocol. But it is allowed to override any existing
- * non-permanent entry. This means that when we remove this entry, the
+ * non-permanent entry. This means that when we remove this entry, the
* system automatically returns to the old behavior.
*/
list_add_rcu(&p->list, last_perm);
out:
spin_unlock_bh(&inetsw_lock);
- synchronize_net();
-
return;
out_permanent:
- printk(KERN_ERR "Attempt to override permanent protocol %d.\n",
- protocol);
+ pr_err("Attempt to override permanent protocol %d\n", protocol);
goto out;
out_illegal:
- printk(KERN_ERR
- "Ignoring attempt to register invalid socket type %d.\n",
+ pr_err("Ignoring attempt to register invalid socket type %d\n",
p->type);
goto out;
}
+EXPORT_SYMBOL(inet_register_protosw);
void inet_unregister_protosw(struct inet_protosw *p)
{
if (INET_PROTOSW_PERMANENT & p->flags) {
- printk(KERN_ERR
- "Attempt to unregister permanent protocol %d.\n",
+ pr_err("Attempt to unregister permanent protocol %d\n",
p->protocol);
} else {
spin_lock_bh(&inetsw_lock);
@@ -983,50 +1256,51 @@ void inet_unregister_protosw(struct inet_protosw *p)
synchronize_net();
}
}
-
-/*
- * Shall we try to damage output packets if routing dev changes?
- */
-
-int sysctl_ip_dynaddr __read_mostly;
+EXPORT_SYMBOL(inet_unregister_protosw);
static int inet_sk_reselect_saddr(struct sock *sk)
{
struct inet_sock *inet = inet_sk(sk);
- int err;
+ __be32 old_saddr = inet->inet_saddr;
+ __be32 daddr = inet->inet_daddr;
+ struct flowi4 *fl4;
struct rtable *rt;
- __be32 old_saddr = inet->saddr;
__be32 new_saddr;
- __be32 daddr = inet->daddr;
+ struct ip_options_rcu *inet_opt;
+ int err;
- if (inet->opt && inet->opt->srr)
- daddr = inet->opt->faddr;
+ inet_opt = rcu_dereference_protected(inet->inet_opt,
+ lockdep_sock_is_held(sk));
+ if (inet_opt && inet_opt->opt.srr)
+ daddr = inet_opt->opt.faddr;
/* Query new route. */
- err = ip_route_connect(&rt, daddr, 0,
- RT_CONN_FLAGS(sk),
- sk->sk_bound_dev_if,
- sk->sk_protocol,
- inet->sport, inet->dport, sk);
- if (err)
- return err;
-
- sk_setup_caps(sk, &rt->u.dst);
+ fl4 = &inet->cork.fl.u.ip4;
+ rt = ip_route_connect(fl4, daddr, 0, sk->sk_bound_dev_if,
+ sk->sk_protocol, inet->inet_sport,
+ inet->inet_dport, sk);
+ if (IS_ERR(rt))
+ return PTR_ERR(rt);
- new_saddr = rt->rt_src;
+ new_saddr = fl4->saddr;
- if (new_saddr == old_saddr)
+ if (new_saddr == old_saddr) {
+ sk_setup_caps(sk, &rt->dst);
return 0;
+ }
- if (sysctl_ip_dynaddr > 1) {
- printk(KERN_INFO "%s(): shifting inet->"
- "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n",
- __FUNCTION__,
- NIPQUAD(old_saddr),
- NIPQUAD(new_saddr));
+ err = inet_bhash2_update_saddr(sk, &new_saddr, AF_INET);
+ if (err) {
+ ip_rt_put(rt);
+ return err;
}
- inet->saddr = inet->rcv_saddr = new_saddr;
+ sk_setup_caps(sk, &rt->dst);
+
+ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_ip_dynaddr) > 1) {
+ pr_info("%s(): shifting inet->saddr from %pI4 to %pI4\n",
+ __func__, &old_saddr, &new_saddr);
+ }
/*
* XXX The only one ugly spot where we need to
@@ -1036,15 +1310,14 @@ static int inet_sk_reselect_saddr(struct sock *sk)
* Besides that, it does not check for connection
* uniqueness. Wait for troubles.
*/
- __sk_prot_rehash(sk);
- return 0;
+ return __sk_prot_rehash(sk);
}
int inet_sk_rebuild_header(struct sock *sk)
{
+ struct rtable *rt = dst_rtable(__sk_dst_check(sk, 0));
struct inet_sock *inet = inet_sk(sk);
- struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
- __be32 daddr;
+ struct flowi4 *fl4;
int err;
/* Route is OK, nothing to do. */
@@ -1052,213 +1325,578 @@ int inet_sk_rebuild_header(struct sock *sk)
return 0;
/* Reroute. */
- daddr = inet->daddr;
- if (inet->opt && inet->opt->srr)
- daddr = inet->opt->faddr;
-{
- struct flowi fl = {
- .oif = sk->sk_bound_dev_if,
- .nl_u = {
- .ip4_u = {
- .daddr = daddr,
- .saddr = inet->saddr,
- .tos = RT_CONN_FLAGS(sk),
- },
- },
- .proto = sk->sk_protocol,
- .uli_u = {
- .ports = {
- .sport = inet->sport,
- .dport = inet->dport,
- },
- },
- };
-
- security_sk_classify_flow(sk, &fl);
- err = ip_route_output_flow(&rt, &fl, sk, 0);
-}
- if (!err)
- sk_setup_caps(sk, &rt->u.dst);
- else {
+ fl4 = &inet->cork.fl.u.ip4;
+ inet_sk_init_flowi4(inet, fl4);
+ rt = ip_route_output_flow(sock_net(sk), fl4, sk);
+ if (!IS_ERR(rt)) {
+ err = 0;
+ sk_setup_caps(sk, &rt->dst);
+ } else {
+ err = PTR_ERR(rt);
+
/* Routing failed... */
sk->sk_route_caps = 0;
- /*
- * Other protocols have to map its equivalent state to TCP_SYN_SENT.
- * DCCP maps its DCCP_REQUESTING state to TCP_SYN_SENT. -acme
- */
- if (!sysctl_ip_dynaddr ||
+
+ if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_ip_dynaddr) ||
sk->sk_state != TCP_SYN_SENT ||
(sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||
(err = inet_sk_reselect_saddr(sk)) != 0)
- sk->sk_err_soft = -err;
+ WRITE_ONCE(sk->sk_err_soft, -err);
}
return err;
}
-
EXPORT_SYMBOL(inet_sk_rebuild_header);
-static int inet_gso_send_check(struct sk_buff *skb)
+void inet_sk_set_state(struct sock *sk, int state)
+{
+ trace_inet_sock_set_state(sk, sk->sk_state, state);
+ sk->sk_state = state;
+}
+EXPORT_SYMBOL(inet_sk_set_state);
+
+void inet_sk_state_store(struct sock *sk, int newstate)
+{
+ trace_inet_sock_set_state(sk, sk->sk_state, newstate);
+ smp_store_release(&sk->sk_state, newstate);
+}
+
+struct sk_buff *inet_gso_segment(struct sk_buff *skb,
+ netdev_features_t features)
{
+ bool udpfrag = false, fixedid = false, gso_partial, encap;
+ struct sk_buff *segs = ERR_PTR(-EINVAL);
+ const struct net_offload *ops;
+ unsigned int offset = 0;
struct iphdr *iph;
- struct net_protocol *ops;
- int proto;
+ int proto, tot_len;
+ int nhoff;
int ihl;
- int err = -EINVAL;
+ int id;
+ skb_reset_network_header(skb);
+ nhoff = skb_network_header(skb) - skb_mac_header(skb);
if (unlikely(!pskb_may_pull(skb, sizeof(*iph))))
goto out;
- iph = skb->nh.iph;
+ iph = ip_hdr(skb);
ihl = iph->ihl * 4;
if (ihl < sizeof(*iph))
goto out;
+ id = ntohs(iph->id);
+ proto = iph->protocol;
+
+ /* Warning: after this point, iph might be no longer valid */
if (unlikely(!pskb_may_pull(skb, ihl)))
goto out;
+ __skb_pull(skb, ihl);
- skb->h.raw = __skb_pull(skb, ihl);
- iph = skb->nh.iph;
- proto = iph->protocol & (MAX_INET_PROTOS - 1);
- err = -EPROTONOSUPPORT;
+ encap = SKB_GSO_CB(skb)->encap_level > 0;
+ if (encap)
+ features &= skb->dev->hw_enc_features;
+ SKB_GSO_CB(skb)->encap_level += ihl;
- rcu_read_lock();
- ops = rcu_dereference(inet_protos[proto]);
- if (likely(ops && ops->gso_send_check))
- err = ops->gso_send_check(skb);
- rcu_read_unlock();
+ skb_reset_transport_header(skb);
+
+ segs = ERR_PTR(-EPROTONOSUPPORT);
+
+ fixedid = !!(skb_shinfo(skb)->gso_type & (SKB_GSO_TCP_FIXEDID << encap));
+
+ if (!skb->encapsulation || encap)
+ udpfrag = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP);
+
+ ops = rcu_dereference(inet_offloads[proto]);
+ if (likely(ops && ops->callbacks.gso_segment)) {
+ segs = ops->callbacks.gso_segment(skb, features);
+ if (!segs)
+ skb->network_header = skb_mac_header(skb) + nhoff - skb->head;
+ }
+
+ if (IS_ERR_OR_NULL(segs))
+ goto out;
+
+ gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL);
+
+ skb = segs;
+ do {
+ iph = (struct iphdr *)(skb_mac_header(skb) + nhoff);
+ if (udpfrag) {
+ iph->frag_off = htons(offset >> 3);
+ if (skb->next)
+ iph->frag_off |= htons(IP_MF);
+ offset += skb->len - nhoff - ihl;
+ tot_len = skb->len - nhoff;
+ } else if (skb_is_gso(skb)) {
+ if (!fixedid) {
+ iph->id = htons(id);
+ id += skb_shinfo(skb)->gso_segs;
+ }
+
+ if (gso_partial)
+ tot_len = skb_shinfo(skb)->gso_size +
+ SKB_GSO_CB(skb)->data_offset +
+ skb->head - (unsigned char *)iph;
+ else
+ tot_len = skb->len - nhoff;
+ } else {
+ if (!fixedid)
+ iph->id = htons(id++);
+ tot_len = skb->len - nhoff;
+ }
+ iph->tot_len = htons(tot_len);
+ ip_send_check(iph);
+ if (encap)
+ skb_reset_inner_headers(skb);
+ skb->network_header = (u8 *)iph - skb->head;
+ skb_reset_mac_len(skb);
+ } while ((skb = skb->next));
out:
- return err;
+ return segs;
}
-static struct sk_buff *inet_gso_segment(struct sk_buff *skb, int features)
+static struct sk_buff *ipip_gso_segment(struct sk_buff *skb,
+ netdev_features_t features)
{
- struct sk_buff *segs = ERR_PTR(-EINVAL);
- struct iphdr *iph;
- struct net_protocol *ops;
+ if (!(skb_shinfo(skb)->gso_type & SKB_GSO_IPXIP4))
+ return ERR_PTR(-EINVAL);
+
+ return inet_gso_segment(skb, features);
+}
+
+struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb)
+{
+ const struct net_offload *ops;
+ struct sk_buff *pp = NULL;
+ const struct iphdr *iph;
+ struct sk_buff *p;
+ unsigned int hlen;
+ unsigned int off;
+ int flush = 1;
int proto;
- int ihl;
- int id;
- if (unlikely(skb_shinfo(skb)->gso_type &
- ~(SKB_GSO_TCPV4 |
- SKB_GSO_UDP |
- SKB_GSO_DODGY |
- SKB_GSO_TCP_ECN |
- 0)))
+ off = skb_gro_offset(skb);
+ hlen = off + sizeof(*iph);
+ iph = skb_gro_header(skb, hlen, off);
+ if (unlikely(!iph))
goto out;
- if (unlikely(!pskb_may_pull(skb, sizeof(*iph))))
+ proto = iph->protocol;
+
+ ops = rcu_dereference(inet_offloads[proto]);
+ if (!ops || !ops->callbacks.gro_receive)
goto out;
- iph = skb->nh.iph;
- ihl = iph->ihl * 4;
- if (ihl < sizeof(*iph))
+ if (*(u8 *)iph != 0x45)
goto out;
- if (unlikely(!pskb_may_pull(skb, ihl)))
+ if (ip_is_fragment(iph))
goto out;
- skb->h.raw = __skb_pull(skb, ihl);
- iph = skb->nh.iph;
- id = ntohs(iph->id);
- proto = iph->protocol & (MAX_INET_PROTOS - 1);
- segs = ERR_PTR(-EPROTONOSUPPORT);
+ if (unlikely(ip_fast_csum((u8 *)iph, 5)))
+ goto out;
- rcu_read_lock();
- ops = rcu_dereference(inet_protos[proto]);
- if (likely(ops && ops->gso_segment))
- segs = ops->gso_segment(skb, features);
- rcu_read_unlock();
+ NAPI_GRO_CB(skb)->proto = proto;
+ flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (ntohl(*(__be32 *)&iph->id) & ~IP_DF));
+
+ list_for_each_entry(p, head, list) {
+ struct iphdr *iph2;
- if (!segs || unlikely(IS_ERR(segs)))
+ if (!NAPI_GRO_CB(p)->same_flow)
+ continue;
+
+ iph2 = (struct iphdr *)(p->data + off);
+ /* The above works because, with the exception of the top
+ * (inner most) layer, we only aggregate pkts with the same
+ * hdr length so all the hdrs we'll need to verify will start
+ * at the same offset.
+ */
+ if ((iph->protocol ^ iph2->protocol) |
+ ((__force u32)iph->saddr ^ (__force u32)iph2->saddr) |
+ ((__force u32)iph->daddr ^ (__force u32)iph2->daddr)) {
+ NAPI_GRO_CB(p)->same_flow = 0;
+ continue;
+ }
+ }
+
+ NAPI_GRO_CB(skb)->flush |= flush;
+ NAPI_GRO_CB(skb)->network_offsets[NAPI_GRO_CB(skb)->encap_mark] = off;
+
+ /* Note : No need to call skb_gro_postpull_rcsum() here,
+ * as we already checked checksum over ipv4 header was 0
+ */
+ skb_gro_pull(skb, sizeof(*iph));
+ skb_set_transport_header(skb, skb_gro_offset(skb));
+
+ pp = indirect_call_gro_receive(tcp4_gro_receive, udp4_gro_receive,
+ ops->callbacks.gro_receive, head, skb);
+
+out:
+ skb_gro_flush_final(skb, pp, flush);
+
+ return pp;
+}
+
+static struct sk_buff *ipip_gro_receive(struct list_head *head,
+ struct sk_buff *skb)
+{
+ if (NAPI_GRO_CB(skb)->encap_mark) {
+ NAPI_GRO_CB(skb)->flush = 1;
+ return NULL;
+ }
+
+ NAPI_GRO_CB(skb)->encap_mark = 1;
+
+ return inet_gro_receive(head, skb);
+}
+
+#define SECONDS_PER_DAY 86400
+
+/* inet_current_timestamp - Return IP network timestamp
+ *
+ * Return milliseconds since midnight in network byte order.
+ */
+__be32 inet_current_timestamp(void)
+{
+ u32 secs;
+ u32 msecs;
+ struct timespec64 ts;
+
+ ktime_get_real_ts64(&ts);
+
+ /* Get secs since midnight. */
+ (void)div_u64_rem(ts.tv_sec, SECONDS_PER_DAY, &secs);
+ /* Convert to msecs. */
+ msecs = secs * MSEC_PER_SEC;
+ /* Convert nsec to msec. */
+ msecs += (u32)ts.tv_nsec / NSEC_PER_MSEC;
+
+ /* Convert to network byte order. */
+ return htonl(msecs);
+}
+EXPORT_SYMBOL(inet_current_timestamp);
+
+int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
+{
+ unsigned int family = READ_ONCE(sk->sk_family);
+
+ if (family == AF_INET)
+ return ip_recv_error(sk, msg, len, addr_len);
+#if IS_ENABLED(CONFIG_IPV6)
+ if (family == AF_INET6)
+ return pingv6_ops.ipv6_recv_error(sk, msg, len, addr_len);
+#endif
+ return -EINVAL;
+}
+EXPORT_SYMBOL(inet_recv_error);
+
+int inet_gro_complete(struct sk_buff *skb, int nhoff)
+{
+ struct iphdr *iph = (struct iphdr *)(skb->data + nhoff);
+ const struct net_offload *ops;
+ __be16 totlen = iph->tot_len;
+ int proto = iph->protocol;
+ int err = -ENOSYS;
+
+ if (skb->encapsulation) {
+ skb_set_inner_protocol(skb, cpu_to_be16(ETH_P_IP));
+ skb_set_inner_network_header(skb, nhoff);
+ }
+
+ iph_set_totlen(iph, skb->len - nhoff);
+ csum_replace2(&iph->check, totlen, iph->tot_len);
+
+ ops = rcu_dereference(inet_offloads[proto]);
+ if (WARN_ON(!ops || !ops->callbacks.gro_complete))
goto out;
- skb = segs;
- do {
- iph = skb->nh.iph;
- iph->id = htons(id++);
- iph->tot_len = htons(skb->len - skb->mac_len);
- iph->check = 0;
- iph->check = ip_fast_csum(skb->nh.raw, iph->ihl);
- } while ((skb = skb->next));
+ /* Only need to add sizeof(*iph) to get to the next hdr below
+ * because any hdr with option will have been flushed in
+ * inet_gro_receive().
+ */
+ err = INDIRECT_CALL_2(ops->callbacks.gro_complete,
+ tcp4_gro_complete, udp4_gro_complete,
+ skb, nhoff + sizeof(*iph));
out:
- return segs;
+ return err;
+}
+
+static int ipip_gro_complete(struct sk_buff *skb, int nhoff)
+{
+ skb->encapsulation = 1;
+ skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP4;
+ return inet_gro_complete(skb, nhoff);
+}
+
+int inet_ctl_sock_create(struct sock **sk, unsigned short family,
+ unsigned short type, unsigned char protocol,
+ struct net *net)
+{
+ struct socket *sock;
+ int rc = sock_create_kern(net, family, type, protocol, &sock);
+
+ if (rc == 0) {
+ *sk = sock->sk;
+ (*sk)->sk_allocation = GFP_ATOMIC;
+ (*sk)->sk_use_task_frag = false;
+ /*
+ * Unhash it so that IP input processing does not even see it,
+ * we do not wish this socket to see incoming packets.
+ */
+ (*sk)->sk_prot->unhash(*sk);
+ }
+ return rc;
+}
+EXPORT_SYMBOL_GPL(inet_ctl_sock_create);
+
+unsigned long snmp_fold_field(void __percpu *mib, int offt)
+{
+ unsigned long res = 0;
+ int i;
+
+ for_each_possible_cpu(i)
+ res += snmp_get_cpu_field(mib, i, offt);
+ return res;
+}
+EXPORT_SYMBOL_GPL(snmp_fold_field);
+
+#if BITS_PER_LONG==32
+
+u64 snmp_get_cpu_field64(void __percpu *mib, int cpu, int offt,
+ size_t syncp_offset)
+{
+ void *bhptr;
+ struct u64_stats_sync *syncp;
+ u64 v;
+ unsigned int start;
+
+ bhptr = per_cpu_ptr(mib, cpu);
+ syncp = (struct u64_stats_sync *)(bhptr + syncp_offset);
+ do {
+ start = u64_stats_fetch_begin(syncp);
+ v = *(((u64 *)bhptr) + offt);
+ } while (u64_stats_fetch_retry(syncp, start));
+
+ return v;
}
+EXPORT_SYMBOL_GPL(snmp_get_cpu_field64);
+
+u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_offset)
+{
+ u64 res = 0;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ res += snmp_get_cpu_field64(mib, cpu, offt, syncp_offset);
+ }
+ return res;
+}
+EXPORT_SYMBOL_GPL(snmp_fold_field64);
+#endif
#ifdef CONFIG_IP_MULTICAST
-static struct net_protocol igmp_protocol = {
+static const struct net_protocol igmp_protocol = {
.handler = igmp_rcv,
};
#endif
-static struct net_protocol tcp_protocol = {
- .handler = tcp_v4_rcv,
- .err_handler = tcp_v4_err,
- .gso_send_check = tcp_v4_gso_send_check,
- .gso_segment = tcp_tso_segment,
+static const struct net_protocol icmp_protocol = {
+ .handler = icmp_rcv,
+ .err_handler = icmp_err,
.no_policy = 1,
};
-static struct net_protocol udp_protocol = {
- .handler = udp_rcv,
- .err_handler = udp_err,
- .no_policy = 1,
-};
+static __net_init int ipv4_mib_init_net(struct net *net)
+{
+ int i;
+
+ net->mib.tcp_statistics = alloc_percpu(struct tcp_mib);
+ if (!net->mib.tcp_statistics)
+ goto err_tcp_mib;
+ net->mib.ip_statistics = alloc_percpu(struct ipstats_mib);
+ if (!net->mib.ip_statistics)
+ goto err_ip_mib;
+
+ for_each_possible_cpu(i) {
+ struct ipstats_mib *af_inet_stats;
+ af_inet_stats = per_cpu_ptr(net->mib.ip_statistics, i);
+ u64_stats_init(&af_inet_stats->syncp);
+ }
-static struct net_protocol icmp_protocol = {
- .handler = icmp_rcv,
+ net->mib.net_statistics = alloc_percpu(struct linux_mib);
+ if (!net->mib.net_statistics)
+ goto err_net_mib;
+ net->mib.udp_statistics = alloc_percpu(struct udp_mib);
+ if (!net->mib.udp_statistics)
+ goto err_udp_mib;
+ net->mib.udplite_statistics = alloc_percpu(struct udp_mib);
+ if (!net->mib.udplite_statistics)
+ goto err_udplite_mib;
+ net->mib.icmp_statistics = alloc_percpu(struct icmp_mib);
+ if (!net->mib.icmp_statistics)
+ goto err_icmp_mib;
+ net->mib.icmpmsg_statistics = kzalloc(sizeof(struct icmpmsg_mib),
+ GFP_KERNEL);
+ if (!net->mib.icmpmsg_statistics)
+ goto err_icmpmsg_mib;
+
+ tcp_mib_init(net);
+ return 0;
+
+err_icmpmsg_mib:
+ free_percpu(net->mib.icmp_statistics);
+err_icmp_mib:
+ free_percpu(net->mib.udplite_statistics);
+err_udplite_mib:
+ free_percpu(net->mib.udp_statistics);
+err_udp_mib:
+ free_percpu(net->mib.net_statistics);
+err_net_mib:
+ free_percpu(net->mib.ip_statistics);
+err_ip_mib:
+ free_percpu(net->mib.tcp_statistics);
+err_tcp_mib:
+ return -ENOMEM;
+}
+
+static __net_exit void ipv4_mib_exit_net(struct net *net)
+{
+ kfree(net->mib.icmpmsg_statistics);
+ free_percpu(net->mib.icmp_statistics);
+ free_percpu(net->mib.udplite_statistics);
+ free_percpu(net->mib.udp_statistics);
+ free_percpu(net->mib.net_statistics);
+ free_percpu(net->mib.ip_statistics);
+ free_percpu(net->mib.tcp_statistics);
+#ifdef CONFIG_MPTCP
+ /* allocated on demand, see mptcp_init_sock() */
+ free_percpu(net->mib.mptcp_statistics);
+#endif
+}
+
+static __net_initdata struct pernet_operations ipv4_mib_ops = {
+ .init = ipv4_mib_init_net,
+ .exit = ipv4_mib_exit_net,
};
static int __init init_ipv4_mibs(void)
{
- net_statistics[0] = alloc_percpu(struct linux_mib);
- net_statistics[1] = alloc_percpu(struct linux_mib);
- ip_statistics[0] = alloc_percpu(struct ipstats_mib);
- ip_statistics[1] = alloc_percpu(struct ipstats_mib);
- icmp_statistics[0] = alloc_percpu(struct icmp_mib);
- icmp_statistics[1] = alloc_percpu(struct icmp_mib);
- tcp_statistics[0] = alloc_percpu(struct tcp_mib);
- tcp_statistics[1] = alloc_percpu(struct tcp_mib);
- udp_statistics[0] = alloc_percpu(struct udp_mib);
- udp_statistics[1] = alloc_percpu(struct udp_mib);
- udplite_statistics[0] = alloc_percpu(struct udp_mib);
- udplite_statistics[1] = alloc_percpu(struct udp_mib);
- if (!
- (net_statistics[0] && net_statistics[1] && ip_statistics[0]
- && ip_statistics[1] && tcp_statistics[0] && tcp_statistics[1]
- && udp_statistics[0] && udp_statistics[1]
- && udplite_statistics[0] && udplite_statistics[1] ) )
- return -ENOMEM;
-
- (void) tcp_mib_init();
+ return register_pernet_subsys(&ipv4_mib_ops);
+}
+
+static __net_init int inet_init_net(struct net *net)
+{
+ /*
+ * Set defaults for local port range
+ */
+ net->ipv4.ip_local_ports.range = 60999u << 16 | 32768u;
+
+ seqlock_init(&net->ipv4.ping_group_range.lock);
+ /*
+ * Sane defaults - nobody may create ping sockets.
+ * Boot scripts should set this to distro-specific group.
+ */
+ net->ipv4.ping_group_range.range[0] = make_kgid(&init_user_ns, 1);
+ net->ipv4.ping_group_range.range[1] = make_kgid(&init_user_ns, 0);
+
+ /* Default values for sysctl-controlled parameters.
+ * We set them here, in case sysctl is not compiled.
+ */
+ net->ipv4.sysctl_ip_default_ttl = IPDEFTTL;
+ net->ipv4.sysctl_ip_fwd_update_priority = 1;
+ net->ipv4.sysctl_ip_dynaddr = 0;
+ net->ipv4.sysctl_ip_early_demux = 1;
+ net->ipv4.sysctl_udp_early_demux = 1;
+ net->ipv4.sysctl_tcp_early_demux = 1;
+ net->ipv4.sysctl_nexthop_compat_mode = 1;
+#ifdef CONFIG_SYSCTL
+ net->ipv4.sysctl_ip_prot_sock = PROT_SOCK;
+#endif
+
+ /* Some igmp sysctl, whose values are always used */
+ net->ipv4.sysctl_igmp_max_memberships = 20;
+ net->ipv4.sysctl_igmp_max_msf = 10;
+ /* IGMP reports for link-local multicast groups are enabled by default */
+ net->ipv4.sysctl_igmp_llm_reports = 1;
+ net->ipv4.sysctl_igmp_qrv = 2;
+
+ net->ipv4.sysctl_fib_notify_on_flag_change = 0;
return 0;
}
+static __net_initdata struct pernet_operations af_inet_ops = {
+ .init = inet_init_net,
+};
+
+static int __init init_inet_pernet_ops(void)
+{
+ return register_pernet_subsys(&af_inet_ops);
+}
+
static int ipv4_proc_init(void);
/*
* IP protocol layer initialiser
*/
-static struct packet_type ip_packet_type = {
- .type = __constant_htons(ETH_P_IP),
+
+static const struct net_offload ipip_offload = {
+ .callbacks = {
+ .gso_segment = ipip_gso_segment,
+ .gro_receive = ipip_gro_receive,
+ .gro_complete = ipip_gro_complete,
+ },
+};
+
+static int __init ipip_offload_init(void)
+{
+ return inet_add_offload(&ipip_offload, IPPROTO_IPIP);
+}
+
+static int __init ipv4_offload_init(void)
+{
+ /*
+ * Add offloads
+ */
+ if (udpv4_offload_init() < 0)
+ pr_crit("%s: Cannot add UDP protocol offload\n", __func__);
+ if (tcpv4_offload_init() < 0)
+ pr_crit("%s: Cannot add TCP protocol offload\n", __func__);
+ if (ipip_offload_init() < 0)
+ pr_crit("%s: Cannot add IPIP protocol offload\n", __func__);
+
+ net_hotdata.ip_packet_offload = (struct packet_offload) {
+ .type = cpu_to_be16(ETH_P_IP),
+ .callbacks = {
+ .gso_segment = inet_gso_segment,
+ .gro_receive = inet_gro_receive,
+ .gro_complete = inet_gro_complete,
+ },
+ };
+ dev_add_offload(&net_hotdata.ip_packet_offload);
+ return 0;
+}
+
+fs_initcall(ipv4_offload_init);
+
+static struct packet_type ip_packet_type __read_mostly = {
+ .type = cpu_to_be16(ETH_P_IP),
.func = ip_rcv,
- .gso_send_check = inet_gso_send_check,
- .gso_segment = inet_gso_segment,
+ .list_func = ip_list_rcv,
};
static int __init inet_init(void)
{
- struct sk_buff *dummy_skb;
struct inet_protosw *q;
struct list_head *r;
- int rc = -EINVAL;
+ int rc;
+
+ sock_skb_cb_check_size(sizeof(struct inet_skb_parm));
- BUILD_BUG_ON(sizeof(struct inet_skb_parm) > sizeof(dummy_skb->cb));
+ raw_hashinfo_init(&raw_v4_hashinfo);
rc = proto_register(&tcp_prot, 1);
if (rc)
@@ -1272,25 +1910,46 @@ static int __init inet_init(void)
if (rc)
goto out_unregister_udp_proto;
+ rc = proto_register(&ping_prot, 1);
+ if (rc)
+ goto out_unregister_raw_proto;
+
/*
- * Tell SOCKET that we are alive...
+ * Tell SOCKET that we are alive...
*/
- (void)sock_register(&inet_family_ops);
+ (void)sock_register(&inet_family_ops);
+
+#ifdef CONFIG_SYSCTL
+ ip_static_sysctl_init();
+#endif
/*
* Add all the base protocols.
*/
if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0)
- printk(KERN_CRIT "inet_init: Cannot add ICMP protocol\n");
- if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)
- printk(KERN_CRIT "inet_init: Cannot add UDP protocol\n");
- if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)
- printk(KERN_CRIT "inet_init: Cannot add TCP protocol\n");
+ pr_crit("%s: Cannot add ICMP protocol\n", __func__);
+
+ net_hotdata.udp_protocol = (struct net_protocol) {
+ .handler = udp_rcv,
+ .err_handler = udp_err,
+ .no_policy = 1,
+ };
+ if (inet_add_protocol(&net_hotdata.udp_protocol, IPPROTO_UDP) < 0)
+ pr_crit("%s: Cannot add UDP protocol\n", __func__);
+
+ net_hotdata.tcp_protocol = (struct net_protocol) {
+ .handler = tcp_v4_rcv,
+ .err_handler = tcp_v4_err,
+ .no_policy = 1,
+ .icmp_strict_tag_validation = 1,
+ };
+ if (inet_add_protocol(&net_hotdata.tcp_protocol, IPPROTO_TCP) < 0)
+ pr_crit("%s: Cannot add TCP protocol\n", __func__);
#ifdef CONFIG_IP_MULTICAST
if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0)
- printk(KERN_CRIT "inet_init: Cannot add IGMP protocol\n");
+ pr_crit("%s: Cannot add IGMP protocol\n", __func__);
#endif
/* Register the socket-side information for inet_create. */
@@ -1306,48 +1965,60 @@ static int __init inet_init(void)
arp_init();
- /*
- * Set the IP module up
- */
+ /*
+ * Set the IP module up
+ */
ip_init();
- tcp_v4_init(&inet_family_ops);
+ /* Initialise per-cpu ipv4 mibs */
+ if (init_ipv4_mibs())
+ panic("%s: Cannot init ipv4 mibs\n", __func__);
/* Setup TCP slab cache for open requests. */
tcp_init();
+ /* Setup UDP memory threshold */
+ udp_init();
+
/* Add UDP-Lite (RFC 3828) */
udplite4_register();
+ raw_init();
+
+ ping_init();
+
/*
* Set the ICMP layer up
*/
- icmp_init(&inet_family_ops);
+ if (icmp_init() < 0)
+ panic("Failed to create the ICMP control socket.\n");
/*
* Initialise the multicast router
*/
#if defined(CONFIG_IP_MROUTE)
- ip_mr_init();
+ if (ip_mr_init())
+ pr_crit("%s: Cannot init ipv4 mroute\n", __func__);
#endif
- /*
- * Initialise per-cpu ipv4 mibs
- */
- if(init_ipv4_mibs())
- printk(KERN_CRIT "inet_init: Cannot init ipv4 mibs\n"); ;
-
+ if (init_inet_pernet_ops())
+ pr_crit("%s: Cannot init ipv4 inet pernet ops\n", __func__);
+
ipv4_proc_init();
ipfrag_init();
dev_add_pack(&ip_packet_type);
+ ip_tunnel_core_init();
+
rc = 0;
out:
return rc;
+out_unregister_raw_proto:
+ proto_unregister(&raw_prot);
out_unregister_udp_proto:
proto_unregister(&udp_prot);
out_unregister_tcp_proto:
@@ -1370,15 +2041,15 @@ static int __init ipv4_proc_init(void)
goto out_tcp;
if (udp4_proc_init())
goto out_udp;
- if (fib_proc_init())
- goto out_fib;
+ if (ping_proc_init())
+ goto out_ping;
if (ip_misc_proc_init())
goto out_misc;
out:
return rc;
out_misc:
- fib_proc_exit();
-out_fib:
+ ping_proc_exit();
+out_ping:
udp4_proc_exit();
out_udp:
tcp4_proc_exit();
@@ -1395,23 +2066,3 @@ static int __init ipv4_proc_init(void)
return 0;
}
#endif /* CONFIG_PROC_FS */
-
-MODULE_ALIAS_NETPROTO(PF_INET);
-
-EXPORT_SYMBOL(inet_accept);
-EXPORT_SYMBOL(inet_bind);
-EXPORT_SYMBOL(inet_dgram_connect);
-EXPORT_SYMBOL(inet_dgram_ops);
-EXPORT_SYMBOL(inet_getname);
-EXPORT_SYMBOL(inet_ioctl);
-EXPORT_SYMBOL(inet_listen);
-EXPORT_SYMBOL(inet_register_protosw);
-EXPORT_SYMBOL(inet_release);
-EXPORT_SYMBOL(inet_sendmsg);
-EXPORT_SYMBOL(inet_shutdown);
-EXPORT_SYMBOL(inet_sock_destruct);
-EXPORT_SYMBOL(inet_stream_connect);
-EXPORT_SYMBOL(inet_stream_ops);
-EXPORT_SYMBOL(inet_unregister_protosw);
-EXPORT_SYMBOL(net_statistics);
-EXPORT_SYMBOL(sysctl_ip_nonlocal_bind);
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 67a5509e26fc..64aec3dff8ec 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -1,22 +1,82 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#define pr_fmt(fmt) "IPsec: " fmt
+
+#include <crypto/hash.h>
+#include <crypto/utils.h>
#include <linux/err.h>
#include <linux/module.h>
+#include <linux/slab.h>
#include <net/ip.h>
#include <net/xfrm.h>
#include <net/ah.h>
#include <linux/crypto.h>
#include <linux/pfkeyv2.h>
+#include <linux/scatterlist.h>
#include <net/icmp.h>
#include <net/protocol.h>
-#include <asm/scatterlist.h>
+struct ah_skb_cb {
+ struct xfrm_skb_cb xfrm;
+ void *tmp;
+};
+
+#define AH_SKB_CB(__skb) ((struct ah_skb_cb *)&((__skb)->cb[0]))
+
+static void *ah_alloc_tmp(struct crypto_ahash *ahash, int nfrags,
+ unsigned int size)
+{
+ unsigned int len;
+
+ len = size + crypto_ahash_digestsize(ahash);
+
+ len = ALIGN(len, crypto_tfm_ctx_alignment());
+
+ len += sizeof(struct ahash_request) + crypto_ahash_reqsize(ahash);
+ len = ALIGN(len, __alignof__(struct scatterlist));
+
+ len += sizeof(struct scatterlist) * nfrags;
+
+ return kmalloc(len, GFP_ATOMIC);
+}
+
+static inline u8 *ah_tmp_auth(void *tmp, unsigned int offset)
+{
+ return tmp + offset;
+}
+
+static inline u8 *ah_tmp_icv(void *tmp, unsigned int offset)
+{
+ return tmp + offset;
+}
+
+static inline struct ahash_request *ah_tmp_req(struct crypto_ahash *ahash,
+ u8 *icv)
+{
+ struct ahash_request *req;
+
+ req = (void *)PTR_ALIGN(icv + crypto_ahash_digestsize(ahash),
+ crypto_tfm_ctx_alignment());
+
+ ahash_request_set_tfm(req, ahash);
+
+ return req;
+}
+
+static inline struct scatterlist *ah_req_sg(struct crypto_ahash *ahash,
+ struct ahash_request *req)
+{
+ return (void *)ALIGN((unsigned long)(req + 1) +
+ crypto_ahash_reqsize(ahash),
+ __alignof__(struct scatterlist));
+}
/* Clear mutable options and find final destination to substitute
* into IP header for icv calculation. Options are already checked
* for validity, so paranoia is not required. */
-static int ip_clear_mutable_options(struct iphdr *iph, __be32 *daddr)
+static int ip_clear_mutable_options(const struct iphdr *iph, __be32 *daddr)
{
- unsigned char * optptr = (unsigned char*)(iph+1);
+ unsigned char *optptr = (unsigned char *)(iph+1);
int l = iph->ihl*4 - sizeof(struct iphdr);
int optlen;
@@ -44,9 +104,9 @@ static int ip_clear_mutable_options(struct iphdr *iph, __be32 *daddr)
if (optlen < 6)
return -EINVAL;
memcpy(daddr, optptr+optlen-4, 4);
- /* Fall through */
+ fallthrough;
default:
- memset(optptr+2, 0, optlen-2);
+ memset(optptr, 0, optlen);
}
l -= optlen;
optptr += optlen;
@@ -54,19 +114,79 @@ static int ip_clear_mutable_options(struct iphdr *iph, __be32 *daddr)
return 0;
}
+static void ah_output_done(void *data, int err)
+{
+ u8 *icv;
+ struct iphdr *iph;
+ struct sk_buff *skb = data;
+ struct xfrm_state *x = skb_dst(skb)->xfrm;
+ struct ah_data *ahp = x->data;
+ struct iphdr *top_iph = ip_hdr(skb);
+ struct ip_auth_hdr *ah = ip_auth_hdr(skb);
+ int ihl = ip_hdrlen(skb);
+
+ iph = AH_SKB_CB(skb)->tmp;
+ icv = ah_tmp_icv(iph, ihl);
+ memcpy(ah->auth_data, icv, ahp->icv_trunc_len);
+
+ top_iph->tos = iph->tos;
+ top_iph->ttl = iph->ttl;
+ top_iph->frag_off = iph->frag_off;
+ if (top_iph->ihl != 5) {
+ top_iph->daddr = iph->daddr;
+ memcpy(top_iph+1, iph+1, top_iph->ihl*4 - sizeof(struct iphdr));
+ }
+
+ kfree(AH_SKB_CB(skb)->tmp);
+ xfrm_output_resume(skb->sk, skb, err);
+}
+
static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
{
int err;
+ int nfrags;
+ int ihl;
+ u8 *icv;
+ struct sk_buff *trailer;
+ struct crypto_ahash *ahash;
+ struct ahash_request *req;
+ struct scatterlist *sg;
struct iphdr *iph, *top_iph;
struct ip_auth_hdr *ah;
struct ah_data *ahp;
- union {
- struct iphdr iph;
- char buf[60];
- } tmp_iph;
+ int seqhi_len = 0;
+ __be32 *seqhi;
+ int sglists = 0;
+ struct scatterlist *seqhisg;
+
+ ahp = x->data;
+ ahash = ahp->ahash;
+
+ if ((err = skb_cow_data(skb, 0, &trailer)) < 0)
+ goto out;
+ nfrags = err;
+
+ skb_push(skb, -skb_network_offset(skb));
+ ah = ip_auth_hdr(skb);
+ ihl = ip_hdrlen(skb);
+
+ if (x->props.flags & XFRM_STATE_ESN) {
+ sglists = 1;
+ seqhi_len = sizeof(*seqhi);
+ }
+ err = -ENOMEM;
+ iph = ah_alloc_tmp(ahash, nfrags + sglists, ihl + seqhi_len);
+ if (!iph)
+ goto out;
+ seqhi = (__be32 *)((char *)iph + ihl);
+ icv = ah_tmp_icv(seqhi, seqhi_len);
+ req = ah_tmp_req(ahash, icv);
+ sg = ah_req_sg(ahash, req);
+ seqhisg = sg + nfrags;
+
+ memset(ah->auth_data, 0, ahp->icv_trunc_len);
- top_iph = skb->nh.iph;
- iph = &tmp_iph.iph;
+ top_iph = ip_hdr(skb);
iph->tos = top_iph->tos;
iph->ttl = top_iph->ttl;
@@ -77,31 +197,53 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
memcpy(iph+1, top_iph+1, top_iph->ihl*4 - sizeof(struct iphdr));
err = ip_clear_mutable_options(top_iph, &top_iph->daddr);
if (err)
- goto error;
+ goto out_free;
}
- ah = (struct ip_auth_hdr *)((char *)top_iph+top_iph->ihl*4);
- ah->nexthdr = top_iph->protocol;
+ ah->nexthdr = *skb_mac_header(skb);
+ *skb_mac_header(skb) = IPPROTO_AH;
top_iph->tos = 0;
top_iph->tot_len = htons(skb->len);
top_iph->frag_off = 0;
top_iph->ttl = 0;
- top_iph->protocol = IPPROTO_AH;
top_iph->check = 0;
- ahp = x->data;
- ah->hdrlen = (XFRM_ALIGN8(sizeof(struct ip_auth_hdr) +
- ahp->icv_trunc_len) >> 2) - 2;
+ if (x->props.flags & XFRM_STATE_ALIGN4)
+ ah->hdrlen = (XFRM_ALIGN4(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2;
+ else
+ ah->hdrlen = (XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2;
ah->reserved = 0;
ah->spi = x->id.spi;
- ah->seq_no = htonl(++x->replay.oseq);
- xfrm_aevent_doreplay(x);
- err = ah_mac_digest(ahp, skb, ah->auth_data);
- if (err)
- goto error;
- memcpy(ah->auth_data, ahp->work_icv, ahp->icv_trunc_len);
+ ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
+
+ sg_init_table(sg, nfrags + sglists);
+ err = skb_to_sgvec_nomark(skb, sg, 0, skb->len);
+ if (unlikely(err < 0))
+ goto out_free;
+
+ if (x->props.flags & XFRM_STATE_ESN) {
+ /* Attach seqhi sg right after packet payload */
+ *seqhi = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
+ sg_set_buf(seqhisg, seqhi, seqhi_len);
+ }
+ ahash_request_set_crypt(req, sg, icv, skb->len + seqhi_len);
+ ahash_request_set_callback(req, 0, ah_output_done, skb);
+
+ AH_SKB_CB(skb)->tmp = iph;
+
+ err = crypto_ahash_digest(req);
+ if (err) {
+ if (err == -EINPROGRESS)
+ goto out;
+
+ if (err == -ENOSPC)
+ err = NET_XMIT_DROP;
+ goto out_free;
+ }
+
+ memcpy(ah->auth_data, icv, ahp->icv_trunc_len);
top_iph->tos = iph->tos;
top_iph->ttl = iph->ttl;
@@ -111,51 +253,132 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
memcpy(top_iph+1, iph+1, top_iph->ihl*4 - sizeof(struct iphdr));
}
- ip_send_check(top_iph);
+out_free:
+ kfree(iph);
+out:
+ return err;
+}
- err = 0;
+static void ah_input_done(void *data, int err)
+{
+ u8 *auth_data;
+ u8 *icv;
+ struct iphdr *work_iph;
+ struct sk_buff *skb = data;
+ struct xfrm_state *x = xfrm_input_state(skb);
+ struct ah_data *ahp = x->data;
+ struct ip_auth_hdr *ah = ip_auth_hdr(skb);
+ int ihl = ip_hdrlen(skb);
+ int ah_hlen = (ah->hdrlen + 2) << 2;
-error:
- return err;
+ if (err)
+ goto out;
+
+ work_iph = AH_SKB_CB(skb)->tmp;
+ auth_data = ah_tmp_auth(work_iph, ihl);
+ icv = ah_tmp_icv(auth_data, ahp->icv_trunc_len);
+
+ err = crypto_memneq(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG : 0;
+ if (err)
+ goto out;
+
+ err = ah->nexthdr;
+
+ skb->network_header += ah_hlen;
+ memcpy(skb_network_header(skb), work_iph, ihl);
+ __skb_pull(skb, ah_hlen + ihl);
+
+ if (x->props.mode == XFRM_MODE_TUNNEL)
+ skb_reset_transport_header(skb);
+ else
+ skb_set_transport_header(skb, -ihl);
+out:
+ kfree(AH_SKB_CB(skb)->tmp);
+ xfrm_input_resume(skb, err);
}
static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
{
int ah_hlen;
int ihl;
- int err = -EINVAL;
- struct iphdr *iph;
+ int nexthdr;
+ int nfrags;
+ u8 *auth_data;
+ u8 *icv;
+ struct sk_buff *trailer;
+ struct crypto_ahash *ahash;
+ struct ahash_request *req;
+ struct scatterlist *sg;
+ struct iphdr *iph, *work_iph;
struct ip_auth_hdr *ah;
struct ah_data *ahp;
- char work_buf[60];
+ int err = -ENOMEM;
+ int seqhi_len = 0;
+ __be32 *seqhi;
+ int sglists = 0;
+ struct scatterlist *seqhisg;
- if (!pskb_may_pull(skb, sizeof(struct ip_auth_hdr)))
+ if (!pskb_may_pull(skb, sizeof(*ah)))
goto out;
- ah = (struct ip_auth_hdr*)skb->data;
+ ah = (struct ip_auth_hdr *)skb->data;
ahp = x->data;
+ ahash = ahp->ahash;
+
+ nexthdr = ah->nexthdr;
ah_hlen = (ah->hdrlen + 2) << 2;
-
- if (ah_hlen != XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + ahp->icv_full_len) &&
- ah_hlen != XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + ahp->icv_trunc_len))
- goto out;
+
+ if (x->props.flags & XFRM_STATE_ALIGN4) {
+ if (ah_hlen != XFRM_ALIGN4(sizeof(*ah) + ahp->icv_full_len) &&
+ ah_hlen != XFRM_ALIGN4(sizeof(*ah) + ahp->icv_trunc_len))
+ goto out;
+ } else {
+ if (ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_full_len) &&
+ ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len))
+ goto out;
+ }
if (!pskb_may_pull(skb, ah_hlen))
goto out;
/* We are going to _remove_ AH header to keep sockets happy,
* so... Later this can change. */
- if (skb_cloned(skb) &&
- pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
+ if (skb_unclone(skb, GFP_ATOMIC))
goto out;
skb->ip_summed = CHECKSUM_NONE;
- ah = (struct ip_auth_hdr*)skb->data;
- iph = skb->nh.iph;
- ihl = skb->data - skb->nh.raw;
- memcpy(work_buf, iph, ihl);
+ if ((err = skb_cow_data(skb, 0, &trailer)) < 0)
+ goto out;
+ nfrags = err;
+
+ ah = (struct ip_auth_hdr *)skb->data;
+ iph = ip_hdr(skb);
+ ihl = ip_hdrlen(skb);
+
+ if (x->props.flags & XFRM_STATE_ESN) {
+ sglists = 1;
+ seqhi_len = sizeof(*seqhi);
+ }
+
+ work_iph = ah_alloc_tmp(ahash, nfrags + sglists, ihl +
+ ahp->icv_trunc_len + seqhi_len);
+ if (!work_iph) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ seqhi = (__be32 *)((char *)work_iph + ihl);
+ auth_data = ah_tmp_auth(seqhi, seqhi_len);
+ icv = ah_tmp_icv(auth_data, ahp->icv_trunc_len);
+ req = ah_tmp_req(ahash, icv);
+ sg = ah_req_sg(ahash, req);
+ seqhisg = sg + nfrags;
+
+ memcpy(work_iph, iph, ihl);
+ memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len);
+ memset(ah->auth_data, 0, ahp->icv_trunc_len);
iph->ttl = 0;
iph->tos = 0;
@@ -163,108 +386,145 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
iph->check = 0;
if (ihl > sizeof(*iph)) {
__be32 dummy;
- if (ip_clear_mutable_options(iph, &dummy))
- goto out;
- }
- {
- u8 auth_data[MAX_AH_AUTH_LEN];
-
- memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len);
- skb_push(skb, ihl);
- err = ah_mac_digest(ahp, skb, ah->auth_data);
+ err = ip_clear_mutable_options(iph, &dummy);
if (err)
+ goto out_free;
+ }
+
+ skb_push(skb, ihl);
+
+ sg_init_table(sg, nfrags + sglists);
+ err = skb_to_sgvec_nomark(skb, sg, 0, skb->len);
+ if (unlikely(err < 0))
+ goto out_free;
+
+ if (x->props.flags & XFRM_STATE_ESN) {
+ /* Attach seqhi sg right after packet payload */
+ *seqhi = XFRM_SKB_CB(skb)->seq.input.hi;
+ sg_set_buf(seqhisg, seqhi, seqhi_len);
+ }
+ ahash_request_set_crypt(req, sg, icv, skb->len + seqhi_len);
+ ahash_request_set_callback(req, 0, ah_input_done, skb);
+
+ AH_SKB_CB(skb)->tmp = work_iph;
+
+ err = crypto_ahash_digest(req);
+ if (err) {
+ if (err == -EINPROGRESS)
goto out;
- err = -EINVAL;
- if (memcmp(ahp->work_icv, auth_data, ahp->icv_trunc_len)) {
- x->stats.integrity_failed++;
- goto out;
- }
+
+ goto out_free;
}
- ((struct iphdr*)work_buf)->protocol = ah->nexthdr;
- skb->h.raw = memcpy(skb->nh.raw += ah_hlen, work_buf, ihl);
+
+ err = crypto_memneq(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG : 0;
+ if (err)
+ goto out_free;
+
+ skb->network_header += ah_hlen;
+ memcpy(skb_network_header(skb), work_iph, ihl);
__skb_pull(skb, ah_hlen + ihl);
+ if (x->props.mode == XFRM_MODE_TUNNEL)
+ skb_reset_transport_header(skb);
+ else
+ skb_set_transport_header(skb, -ihl);
- return 0;
+ err = nexthdr;
+out_free:
+ kfree (work_iph);
out:
return err;
}
-static void ah4_err(struct sk_buff *skb, u32 info)
+static int ah4_err(struct sk_buff *skb, u32 info)
{
- struct iphdr *iph = (struct iphdr*)skb->data;
- struct ip_auth_hdr *ah = (struct ip_auth_hdr*)(skb->data+(iph->ihl<<2));
+ struct net *net = dev_net(skb->dev);
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
+ struct ip_auth_hdr *ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2));
struct xfrm_state *x;
- if (skb->h.icmph->type != ICMP_DEST_UNREACH ||
- skb->h.icmph->code != ICMP_FRAG_NEEDED)
- return;
+ switch (icmp_hdr(skb)->type) {
+ case ICMP_DEST_UNREACH:
+ if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
+ return 0;
+ break;
+ case ICMP_REDIRECT:
+ break;
+ default:
+ return 0;
+ }
- x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, ah->spi, IPPROTO_AH, AF_INET);
+ x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
+ ah->spi, IPPROTO_AH, AF_INET);
if (!x)
- return;
- printk(KERN_DEBUG "pmtu discovery on SA AH/%08x/%08x\n",
- ntohl(ah->spi), ntohl(iph->daddr));
+ return 0;
+
+ if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
+ ipv4_update_pmtu(skb, net, info, 0, IPPROTO_AH);
+ else
+ ipv4_redirect(skb, net, 0, IPPROTO_AH);
xfrm_state_put(x);
+
+ return 0;
}
-static int ah_init_state(struct xfrm_state *x)
+static int ah_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack)
{
struct ah_data *ahp = NULL;
struct xfrm_algo_desc *aalg_desc;
- struct crypto_hash *tfm;
-
- if (!x->aalg)
- goto error;
+ struct crypto_ahash *ahash;
- /* null auth can use a zero length key */
- if (x->aalg->alg_key_len > 512)
+ if (!x->aalg) {
+ NL_SET_ERR_MSG(extack, "AH requires a state with an AUTH algorithm");
goto error;
+ }
- if (x->encap)
+ if (x->encap) {
+ NL_SET_ERR_MSG(extack, "AH is not compatible with encapsulation");
goto error;
+ }
ahp = kzalloc(sizeof(*ahp), GFP_KERNEL);
- if (ahp == NULL)
+ if (!ahp)
return -ENOMEM;
- ahp->key = x->aalg->alg_key;
- ahp->key_len = (x->aalg->alg_key_len+7)/8;
- tfm = crypto_alloc_hash(x->aalg->alg_name, 0, CRYPTO_ALG_ASYNC);
- if (IS_ERR(tfm))
+ ahash = crypto_alloc_ahash(x->aalg->alg_name, 0, 0);
+ if (IS_ERR(ahash)) {
+ NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
goto error;
+ }
- ahp->tfm = tfm;
- if (crypto_hash_setkey(tfm, ahp->key, ahp->key_len))
+ ahp->ahash = ahash;
+ if (crypto_ahash_setkey(ahash, x->aalg->alg_key,
+ (x->aalg->alg_key_len + 7) / 8)) {
+ NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
goto error;
-
+ }
+
/*
* Lookup the algorithm description maintained by xfrm_algo,
* verify crypto transform properties, and store information
* we need for AH processing. This lookup cannot fail here
- * after a successful crypto_alloc_hash().
+ * after a successful crypto_alloc_ahash().
*/
aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name, 0);
BUG_ON(!aalg_desc);
if (aalg_desc->uinfo.auth.icv_fullbits/8 !=
- crypto_hash_digestsize(tfm)) {
- printk(KERN_INFO "AH: %s digestsize %u != %hu\n",
- x->aalg->alg_name, crypto_hash_digestsize(tfm),
- aalg_desc->uinfo.auth.icv_fullbits/8);
+ crypto_ahash_digestsize(ahash)) {
+ NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
goto error;
}
-
+
ahp->icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8;
- ahp->icv_trunc_len = aalg_desc->uinfo.auth.icv_truncbits/8;
-
- BUG_ON(ahp->icv_trunc_len > MAX_AH_AUTH_LEN);
-
- ahp->work_icv = kmalloc(ahp->icv_full_len, GFP_KERNEL);
- if (!ahp->work_icv)
- goto error;
-
- x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + ahp->icv_trunc_len);
+ ahp->icv_trunc_len = x->aalg->alg_trunc_len/8;
+
+ if (x->props.flags & XFRM_STATE_ALIGN4)
+ x->props.header_len = XFRM_ALIGN4(sizeof(struct ip_auth_hdr) +
+ ahp->icv_trunc_len);
+ else
+ x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) +
+ ahp->icv_trunc_len);
if (x->props.mode == XFRM_MODE_TUNNEL)
x->props.header_len += sizeof(struct iphdr);
x->data = ahp;
@@ -273,8 +533,7 @@ static int ah_init_state(struct xfrm_state *x)
error:
if (ahp) {
- kfree(ahp->work_icv);
- crypto_free_hash(ahp->tfm);
+ crypto_free_ahash(ahp->ahash);
kfree(ahp);
}
return -EINVAL;
@@ -287,39 +546,42 @@ static void ah_destroy(struct xfrm_state *x)
if (!ahp)
return;
- kfree(ahp->work_icv);
- ahp->work_icv = NULL;
- crypto_free_hash(ahp->tfm);
- ahp->tfm = NULL;
+ crypto_free_ahash(ahp->ahash);
kfree(ahp);
}
+static int ah4_rcv_cb(struct sk_buff *skb, int err)
+{
+ return 0;
+}
-static struct xfrm_type ah_type =
+static const struct xfrm_type ah_type =
{
- .description = "AH4",
.owner = THIS_MODULE,
.proto = IPPROTO_AH,
+ .flags = XFRM_TYPE_REPLAY_PROT,
.init_state = ah_init_state,
.destructor = ah_destroy,
.input = ah_input,
.output = ah_output
};
-static struct net_protocol ah4_protocol = {
+static struct xfrm4_protocol ah4_protocol = {
.handler = xfrm4_rcv,
+ .input_handler = xfrm_input,
+ .cb_handler = ah4_rcv_cb,
.err_handler = ah4_err,
- .no_policy = 1,
+ .priority = 0,
};
static int __init ah4_init(void)
{
if (xfrm_register_type(&ah_type, AF_INET) < 0) {
- printk(KERN_INFO "ip ah init: can't add xfrm type\n");
+ pr_info("%s: can't add xfrm type\n", __func__);
return -EAGAIN;
}
- if (inet_add_protocol(&ah4_protocol, IPPROTO_AH) < 0) {
- printk(KERN_INFO "ip ah init: can't add protocol\n");
+ if (xfrm4_protocol_register(&ah4_protocol, IPPROTO_AH) < 0) {
+ pr_info("%s: can't add protocol\n", __func__);
xfrm_unregister_type(&ah_type, AF_INET);
return -EAGAIN;
}
@@ -328,12 +590,13 @@ static int __init ah4_init(void)
static void __exit ah4_fini(void)
{
- if (inet_del_protocol(&ah4_protocol, IPPROTO_AH) < 0)
- printk(KERN_INFO "ip ah close: can't remove protocol\n");
- if (xfrm_unregister_type(&ah_type, AF_INET) < 0)
- printk(KERN_INFO "ip ah close: can't remove xfrm type\n");
+ if (xfrm4_protocol_deregister(&ah4_protocol, IPPROTO_AH) < 0)
+ pr_info("%s: can't remove protocol\n", __func__);
+ xfrm_unregister_type(&ah_type, AF_INET);
}
module_init(ah4_init);
module_exit(ah4_fini);
+MODULE_DESCRIPTION("IPv4 AH transformation library");
MODULE_LICENSE("GPL");
+MODULE_ALIAS_XFRM_TYPE(AF_INET, XFRM_PROTO_AH);
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 3981e8be9ab8..7f3863daaa40 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -1,7 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* linux/net/ipv4/arp.c
*
- * Version: $Id: arp.c,v 1.99 2001/08/30 22:55:42 davem Exp $
- *
* Copyright (C) 1994 by Florian La Roche
*
* This module implements the Address Resolution Protocol ARP (RFC 826),
@@ -9,15 +8,10 @@
* high-level addresses) into a low-level hardware address (like an Ethernet
* address).
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Fixes:
- * Alan Cox : Removed the Ethernet assumptions in
+ * Alan Cox : Removed the Ethernet assumptions in
* Florian's code
- * Alan Cox : Fixed some small errors in the ARP
+ * Alan Cox : Fixed some small errors in the ARP
* logic
* Alan Cox : Allow >4K in /proc
* Alan Cox : Make ARP add its own protocol entry
@@ -39,25 +33,25 @@
* Jonathan Naylor : Only lookup the hardware address for
* the correct hardware type.
* Germano Caronni : Assorted subtle races.
- * Craig Schlenter : Don't modify permanent entry
+ * Craig Schlenter : Don't modify permanent entry
* during arp_rcv.
* Russ Nelson : Tidied up a few bits.
* Alexey Kuznetsov: Major changes to caching and behaviour,
- * eg intelligent arp probing and
+ * eg intelligent arp probing and
* generation
* of host down events.
* Alan Cox : Missing unlock in device events.
* Eckes : ARP ioctl control errors.
* Alexey Kuznetsov: Arp free fix.
* Manuel Rodriguez: Gratuitous ARP.
- * Jonathan Layes : Added arpd support through kerneld
+ * Jonathan Layes : Added arpd support through kerneld
* message queue (960314)
* Mike Shaver : /proc/sys/net/ipv4/arp_* support
* Mike McLagan : Routing by source
* Stuart Cheshire : Metricom and grat arp fixes
* *** FOR 2.1 clean this up ***
* Lawrence V. Stefani: (08/12/96) Added FDDI support.
- * Alan Cox : Took the AP1000 nasty FDDI hack and
+ * Alan Cox : Took the AP1000 nasty FDDI hack and
* folded into the mainstream FDDI code.
* Ack spit, Linus how did you allow that
* one in...
@@ -72,13 +66,15 @@
* bonding can change the skb before
* sending (e.g. insert 8021q tag).
* Harald Welte : convert to make use of jenkins hash
+ * Jesper D. Brouer: Proxy ARP PVLAN RFC 3069 support.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/module.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/kernel.h>
-#include <linux/sched.h>
#include <linux/capability.h>
#include <linux/socket.h>
#include <linux/sockios.h>
@@ -91,7 +87,6 @@
#include <linux/etherdevice.h>
#include <linux/fddidevice.h>
#include <linux/if_arp.h>
-#include <linux/trdevice.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
@@ -99,11 +94,12 @@
#include <linux/init.h>
#include <linux/net.h>
#include <linux/rcupdate.h>
-#include <linux/jhash.h>
+#include <linux/slab.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
+#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/route.h>
@@ -111,97 +107,82 @@
#include <net/tcp.h>
#include <net/sock.h>
#include <net/arp.h>
-#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
#include <net/ax25.h>
-#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE)
#include <net/netrom.h>
-#endif
-#endif
-#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
-#include <net/atmclip.h>
-struct neigh_table *clip_tbl_hook;
-#endif
+#include <net/dst_metadata.h>
+#include <net/ip_tunnels.h>
-#include <asm/system.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/netfilter_arp.h>
/*
* Interface to generic neighbour cache.
*/
-static u32 arp_hash(const void *pkey, const struct net_device *dev);
+static u32 arp_hash(const void *pkey, const struct net_device *dev, __u32 *hash_rnd);
+static bool arp_key_eq(const struct neighbour *n, const void *pkey);
static int arp_constructor(struct neighbour *neigh);
static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb);
static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb);
static void parp_redo(struct sk_buff *skb);
+static int arp_is_multicast(const void *pkey);
-static struct neigh_ops arp_generic_ops = {
+static const struct neigh_ops arp_generic_ops = {
.family = AF_INET,
.solicit = arp_solicit,
.error_report = arp_error_report,
.output = neigh_resolve_output,
.connected_output = neigh_connected_output,
- .hh_output = dev_queue_xmit,
- .queue_xmit = dev_queue_xmit,
};
-static struct neigh_ops arp_hh_ops = {
+static const struct neigh_ops arp_hh_ops = {
.family = AF_INET,
.solicit = arp_solicit,
.error_report = arp_error_report,
.output = neigh_resolve_output,
.connected_output = neigh_resolve_output,
- .hh_output = dev_queue_xmit,
- .queue_xmit = dev_queue_xmit,
};
-static struct neigh_ops arp_direct_ops = {
+static const struct neigh_ops arp_direct_ops = {
.family = AF_INET,
- .output = dev_queue_xmit,
- .connected_output = dev_queue_xmit,
- .hh_output = dev_queue_xmit,
- .queue_xmit = dev_queue_xmit,
-};
-
-struct neigh_ops arp_broken_ops = {
- .family = AF_INET,
- .solicit = arp_solicit,
- .error_report = arp_error_report,
- .output = neigh_compat_output,
- .connected_output = neigh_compat_output,
- .hh_output = dev_queue_xmit,
- .queue_xmit = dev_queue_xmit,
+ .output = neigh_direct_output,
+ .connected_output = neigh_direct_output,
};
struct neigh_table arp_tbl = {
- .family = AF_INET,
- .entry_size = sizeof(struct neighbour) + 4,
- .key_len = 4,
- .hash = arp_hash,
- .constructor = arp_constructor,
- .proxy_redo = parp_redo,
- .id = "arp_cache",
- .parms = {
- .tbl = &arp_tbl,
- .base_reachable_time = 30 * HZ,
- .retrans_time = 1 * HZ,
- .gc_staletime = 60 * HZ,
- .reachable_time = 30 * HZ,
- .delay_probe_time = 5 * HZ,
- .queue_len = 3,
- .ucast_probes = 3,
- .mcast_probes = 3,
- .anycast_delay = 1 * HZ,
- .proxy_delay = (8 * HZ) / 10,
- .proxy_qlen = 64,
- .locktime = 1 * HZ,
+ .family = AF_INET,
+ .key_len = 4,
+ .protocol = cpu_to_be16(ETH_P_IP),
+ .hash = arp_hash,
+ .key_eq = arp_key_eq,
+ .constructor = arp_constructor,
+ .proxy_redo = parp_redo,
+ .is_multicast = arp_is_multicast,
+ .id = "arp_cache",
+ .parms = {
+ .tbl = &arp_tbl,
+ .reachable_time = 30 * HZ,
+ .data = {
+ [NEIGH_VAR_MCAST_PROBES] = 3,
+ [NEIGH_VAR_UCAST_PROBES] = 3,
+ [NEIGH_VAR_RETRANS_TIME] = 1 * HZ,
+ [NEIGH_VAR_BASE_REACHABLE_TIME] = 30 * HZ,
+ [NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ,
+ [NEIGH_VAR_INTERVAL_PROBE_TIME_MS] = 5 * HZ,
+ [NEIGH_VAR_GC_STALETIME] = 60 * HZ,
+ [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_DEFAULT,
+ [NEIGH_VAR_PROXY_QLEN] = 64,
+ [NEIGH_VAR_ANYCAST_DELAY] = 1 * HZ,
+ [NEIGH_VAR_PROXY_DELAY] = (8 * HZ) / 10,
+ [NEIGH_VAR_LOCKTIME] = 1 * HZ,
+ },
},
- .gc_interval = 30 * HZ,
- .gc_thresh1 = 128,
- .gc_thresh2 = 512,
- .gc_thresh3 = 1024,
+ .gc_interval = 30 * HZ,
+ .gc_thresh1 = 128,
+ .gc_thresh2 = 512,
+ .gc_thresh3 = 1024,
};
+EXPORT_SYMBOL(arp_tbl);
int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir)
{
@@ -210,12 +191,12 @@ int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir)
case ARPHRD_FDDI:
case ARPHRD_IEEE802:
ip_eth_mc_map(addr, haddr);
- return 0;
- case ARPHRD_IEEE802_TR:
- ip_tr_mc_map(addr, haddr);
return 0;
case ARPHRD_INFINIBAND:
- ip_ib_mc_map(addr, haddr);
+ ip_ib_mc_map(addr, dev->broadcast, haddr);
+ return 0;
+ case ARPHRD_IPGRE:
+ ip_ipgre_mc_map(addr, dev->broadcast, haddr);
return 0;
default:
if (dir) {
@@ -227,36 +208,48 @@ int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir)
}
-static u32 arp_hash(const void *pkey, const struct net_device *dev)
+static u32 arp_hash(const void *pkey,
+ const struct net_device *dev,
+ __u32 *hash_rnd)
+{
+ return arp_hashfn(pkey, dev, hash_rnd);
+}
+
+static bool arp_key_eq(const struct neighbour *neigh, const void *pkey)
{
- return jhash_2words(*(u32 *)pkey, dev->ifindex, arp_tbl.hash_rnd);
+ return neigh_key_eq32(neigh, pkey);
}
static int arp_constructor(struct neighbour *neigh)
{
- __be32 addr = *(__be32*)neigh->primary_key;
+ __be32 addr;
struct net_device *dev = neigh->dev;
struct in_device *in_dev;
struct neigh_parms *parms;
+ u32 inaddr_any = INADDR_ANY;
- neigh->type = inet_addr_type(addr);
+ if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
+ memcpy(neigh->primary_key, &inaddr_any, arp_tbl.key_len);
+ addr = *(__be32 *)neigh->primary_key;
rcu_read_lock();
in_dev = __in_dev_get_rcu(dev);
- if (in_dev == NULL) {
+ if (!in_dev) {
rcu_read_unlock();
return -EINVAL;
}
+ neigh->type = inet_addr_type_dev_table(dev_net(dev), dev, addr);
+
parms = in_dev->arp_parms;
__neigh_parms_put(neigh->parms);
neigh->parms = neigh_parms_clone(parms);
rcu_read_unlock();
- if (dev->hard_header == NULL) {
+ if (!dev->header_ops) {
neigh->nud_state = NUD_NOARP;
neigh->ops = &arp_direct_ops;
- neigh->output = neigh->ops->queue_xmit;
+ neigh->output = neigh_direct_output;
} else {
/* Good devices (checked by reading texts, but only Ethernet is
tested)
@@ -273,48 +266,24 @@ static int arp_constructor(struct neighbour *neigh)
in old paradigm.
*/
-#if 1
- /* So... these "amateur" devices are hopeless.
- The only thing, that I can say now:
- It is very sad that we need to keep ugly obsolete
- code to make them happy.
-
- They should be moved to more reasonable state, now
- they use rebuild_header INSTEAD OF hard_start_xmit!!!
- Besides that, they are sort of out of date
- (a lot of redundant clones/copies, useless in 2.1),
- I wonder why people believe that they work.
- */
- switch (dev->type) {
- default:
- break;
- case ARPHRD_ROSE:
-#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
- case ARPHRD_AX25:
-#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE)
- case ARPHRD_NETROM:
-#endif
- neigh->ops = &arp_broken_ops;
- neigh->output = neigh->ops->output;
- return 0;
-#endif
- ;}
-#endif
if (neigh->type == RTN_MULTICAST) {
neigh->nud_state = NUD_NOARP;
arp_mc_map(addr, neigh->ha, dev, 1);
- } else if (dev->flags&(IFF_NOARP|IFF_LOOPBACK)) {
+ } else if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) {
neigh->nud_state = NUD_NOARP;
memcpy(neigh->ha, dev->dev_addr, dev->addr_len);
- } else if (neigh->type == RTN_BROADCAST || dev->flags&IFF_POINTOPOINT) {
+ } else if (neigh->type == RTN_BROADCAST ||
+ (dev->flags & IFF_POINTOPOINT)) {
neigh->nud_state = NUD_NOARP;
memcpy(neigh->ha, dev->broadcast, dev->addr_len);
}
- if (dev->hard_header_cache)
+
+ if (dev->header_ops->cache)
neigh->ops = &arp_hh_ops;
else
neigh->ops = &arp_generic_ops;
- if (neigh->nud_state&NUD_VALID)
+
+ if (neigh->nud_state & NUD_VALID)
neigh->output = neigh->ops->connected_output;
else
neigh->output = neigh->ops->output;
@@ -325,32 +294,71 @@ static int arp_constructor(struct neighbour *neigh)
static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb)
{
dst_link_failure(skb);
- kfree_skb(skb);
+ kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_FAILED);
}
+/* Create and send an arp packet. */
+static void arp_send_dst(int type, int ptype, __be32 dest_ip,
+ struct net_device *dev, __be32 src_ip,
+ const unsigned char *dest_hw,
+ const unsigned char *src_hw,
+ const unsigned char *target_hw,
+ struct dst_entry *dst)
+{
+ struct sk_buff *skb;
+
+ /* arp on this interface. */
+ if (dev->flags & IFF_NOARP)
+ return;
+
+ skb = arp_create(type, ptype, dest_ip, dev, src_ip,
+ dest_hw, src_hw, target_hw);
+ if (!skb)
+ return;
+
+ skb_dst_set(skb, dst_clone(dst));
+ arp_xmit(skb);
+}
+
+void arp_send(int type, int ptype, __be32 dest_ip,
+ struct net_device *dev, __be32 src_ip,
+ const unsigned char *dest_hw, const unsigned char *src_hw,
+ const unsigned char *target_hw)
+{
+ arp_send_dst(type, ptype, dest_ip, dev, src_ip, dest_hw, src_hw,
+ target_hw, NULL);
+}
+EXPORT_SYMBOL(arp_send);
+
static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
{
__be32 saddr = 0;
- u8 *dst_ha = NULL;
+ u8 dst_ha[MAX_ADDR_LEN], *dst_hw = NULL;
struct net_device *dev = neigh->dev;
- __be32 target = *(__be32*)neigh->primary_key;
+ __be32 target = *(__be32 *)neigh->primary_key;
int probes = atomic_read(&neigh->probes);
- struct in_device *in_dev = in_dev_get(dev);
+ struct in_device *in_dev;
+ struct dst_entry *dst = NULL;
- if (!in_dev)
+ rcu_read_lock();
+ in_dev = __in_dev_get_rcu(dev);
+ if (!in_dev) {
+ rcu_read_unlock();
return;
-
+ }
switch (IN_DEV_ARP_ANNOUNCE(in_dev)) {
default:
case 0: /* By default announce any local IP */
- if (skb && inet_addr_type(skb->nh.iph->saddr) == RTN_LOCAL)
- saddr = skb->nh.iph->saddr;
+ if (skb && inet_addr_type_dev_table(dev_net(dev), dev,
+ ip_hdr(skb)->saddr) == RTN_LOCAL)
+ saddr = ip_hdr(skb)->saddr;
break;
case 1: /* Restrict announcements of saddr in same subnet */
if (!skb)
break;
- saddr = skb->nh.iph->saddr;
- if (inet_addr_type(saddr) == RTN_LOCAL) {
+ saddr = ip_hdr(skb)->saddr;
+ if (inet_addr_type_dev_table(dev_net(dev), dev,
+ saddr) == RTN_LOCAL) {
/* saddr should be known to target */
if (inet_addr_onlink(in_dev, target, saddr))
break;
@@ -360,33 +368,34 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
case 2: /* Avoid secondary IPs, get a primary/preferred one */
break;
}
+ rcu_read_unlock();
- if (in_dev)
- in_dev_put(in_dev);
if (!saddr)
saddr = inet_select_addr(dev, target, RT_SCOPE_LINK);
- if ((probes -= neigh->parms->ucast_probes) < 0) {
- if (!(neigh->nud_state&NUD_VALID))
- printk(KERN_DEBUG "trying to ucast probe in NUD_INVALID\n");
- dst_ha = neigh->ha;
- read_lock_bh(&neigh->lock);
- } else if ((probes -= neigh->parms->app_probes) < 0) {
-#ifdef CONFIG_ARPD
- neigh_app_ns(neigh);
-#endif
- return;
+ probes -= NEIGH_VAR(neigh->parms, UCAST_PROBES);
+ if (probes < 0) {
+ if (!(READ_ONCE(neigh->nud_state) & NUD_VALID))
+ pr_debug("trying to ucast probe in NUD_INVALID\n");
+ neigh_ha_snapshot(dst_ha, neigh, dev);
+ dst_hw = dst_ha;
+ } else {
+ probes -= NEIGH_VAR(neigh->parms, APP_PROBES);
+ if (probes < 0) {
+ neigh_app_ns(neigh);
+ return;
+ }
}
- arp_send(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr,
- dst_ha, dev->dev_addr, NULL);
- if (dst_ha)
- read_unlock_bh(&neigh->lock);
+ if (skb && !(dev->priv_flags & IFF_XMIT_DST_RELEASE))
+ dst = skb_dst(skb);
+ arp_send_dst(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr,
+ dst_hw, dev->dev_addr, NULL, dst);
}
-static int arp_ignore(struct in_device *in_dev, struct net_device *dev,
- __be32 sip, __be32 tip)
+static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip)
{
+ struct net *net = dev_net(in_dev->dev);
int scope;
switch (IN_DEV_ARP_IGNORE(in_dev)) {
@@ -405,7 +414,7 @@ static int arp_ignore(struct in_device *in_dev, struct net_device *dev,
case 3: /* Do not reply for scope host addresses */
sip = 0;
scope = RT_SCOPE_LINK;
- dev = NULL;
+ in_dev = NULL;
break;
case 4: /* Reserved */
case 5:
@@ -417,139 +426,112 @@ static int arp_ignore(struct in_device *in_dev, struct net_device *dev,
default:
return 0;
}
- return !inet_confirm_addr(dev, sip, tip, scope);
+ return !inet_confirm_addr(net, in_dev, sip, tip, scope);
}
-static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev)
+static int arp_accept(struct in_device *in_dev, __be32 sip)
{
- struct flowi fl = { .nl_u = { .ip4_u = { .daddr = sip,
- .saddr = tip } } };
- struct rtable *rt;
- int flag = 0;
- /*unsigned long now; */
-
- if (ip_route_output_key(&rt, &fl) < 0)
- return 1;
- if (rt->u.dst.dev != dev) {
- NET_INC_STATS_BH(LINUX_MIB_ARPFILTER);
- flag = 1;
- }
- ip_rt_put(rt);
- return flag;
-}
-
-/* OBSOLETE FUNCTIONS */
+ struct net *net = dev_net(in_dev->dev);
+ int scope = RT_SCOPE_LINK;
-/*
- * Find an arp mapping in the cache. If not found, post a request.
- *
- * It is very UGLY routine: it DOES NOT use skb->dst->neighbour,
- * even if it exists. It is supposed that skb->dev was mangled
- * by a virtual device (eql, shaper). Nobody but broken devices
- * is allowed to use this function, it is scheduled to be removed. --ANK
- */
-
-static int arp_set_predefined(int addr_hint, unsigned char * haddr, __be32 paddr, struct net_device * dev)
-{
- switch (addr_hint) {
- case RTN_LOCAL:
- printk(KERN_DEBUG "ARP: arp called for own IP address\n");
- memcpy(haddr, dev->dev_addr, dev->addr_len);
- return 1;
- case RTN_MULTICAST:
- arp_mc_map(paddr, haddr, dev, 1);
- return 1;
- case RTN_BROADCAST:
- memcpy(haddr, dev->broadcast, dev->addr_len);
+ switch (IN_DEV_ARP_ACCEPT(in_dev)) {
+ case 0: /* Don't create new entries from garp */
+ return 0;
+ case 1: /* Create new entries from garp */
return 1;
+ case 2: /* Create a neighbor in the arp table only if sip
+ * is in the same subnet as an address configured
+ * on the interface that received the garp message
+ */
+ return !!inet_confirm_addr(net, in_dev, sip, 0, scope);
+ default:
+ return 0;
}
- return 0;
}
-
-int arp_find(unsigned char *haddr, struct sk_buff *skb)
+static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev)
{
- struct net_device *dev = skb->dev;
- __be32 paddr;
- struct neighbour *n;
+ struct rtable *rt;
+ int flag = 0;
+ /*unsigned long now; */
+ struct net *net = dev_net(dev);
- if (!skb->dst) {
- printk(KERN_DEBUG "arp_find is called with dst==NULL\n");
- kfree_skb(skb);
+ rt = ip_route_output(net, sip, tip, 0, l3mdev_master_ifindex_rcu(dev),
+ RT_SCOPE_UNIVERSE);
+ if (IS_ERR(rt))
return 1;
+ if (rt->dst.dev != dev) {
+ __NET_INC_STATS(net, LINUX_MIB_ARPFILTER);
+ flag = 1;
}
-
- paddr = ((struct rtable*)skb->dst)->rt_gateway;
-
- if (arp_set_predefined(inet_addr_type(paddr), haddr, paddr, dev))
- return 0;
-
- n = __neigh_lookup(&arp_tbl, &paddr, dev, 1);
-
- if (n) {
- n->used = jiffies;
- if (n->nud_state&NUD_VALID || neigh_event_send(n, skb) == 0) {
- read_lock_bh(&n->lock);
- memcpy(haddr, n->ha, dev->addr_len);
- read_unlock_bh(&n->lock);
- neigh_release(n);
- return 0;
- }
- neigh_release(n);
- } else
- kfree_skb(skb);
- return 1;
-}
-
-/* END OF OBSOLETE FUNCTIONS */
-
-int arp_bind_neighbour(struct dst_entry *dst)
-{
- struct net_device *dev = dst->dev;
- struct neighbour *n = dst->neighbour;
-
- if (dev == NULL)
- return -EINVAL;
- if (n == NULL) {
- __be32 nexthop = ((struct rtable*)dst)->rt_gateway;
- if (dev->flags&(IFF_LOOPBACK|IFF_POINTOPOINT))
- nexthop = 0;
- n = __neigh_lookup_errno(
-#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
- dev->type == ARPHRD_ATM ? clip_tbl_hook :
-#endif
- &arp_tbl, &nexthop, dev);
- if (IS_ERR(n))
- return PTR_ERR(n);
- dst->neighbour = n;
- }
- return 0;
+ ip_rt_put(rt);
+ return flag;
}
/*
* Check if we can use proxy ARP for this path
*/
-
-static inline int arp_fwd_proxy(struct in_device *in_dev, struct rtable *rt)
+static inline int arp_fwd_proxy(struct in_device *in_dev,
+ struct net_device *dev, struct rtable *rt)
{
struct in_device *out_dev;
int imi, omi = -1;
- if (!IN_DEV_PROXY_ARP(in_dev))
+ if (rt->dst.dev == dev)
return 0;
- if ((imi = IN_DEV_MEDIUM_ID(in_dev)) == 0)
+ if (!IN_DEV_PROXY_ARP(in_dev))
+ return 0;
+ imi = IN_DEV_MEDIUM_ID(in_dev);
+ if (imi == 0)
return 1;
if (imi == -1)
return 0;
/* place to check for proxy_arp for routes */
- if ((out_dev = in_dev_get(rt->u.dst.dev)) != NULL) {
+ out_dev = __in_dev_get_rcu(rt->dst.dev);
+ if (out_dev)
omi = IN_DEV_MEDIUM_ID(out_dev);
- in_dev_put(out_dev);
- }
- return (omi != imi && omi != -1);
+
+ return omi != imi && omi != -1;
+}
+
+/*
+ * Check for RFC3069 proxy arp private VLAN (allow to send back to same dev)
+ *
+ * RFC3069 supports proxy arp replies back to the same interface. This
+ * is done to support (ethernet) switch features, like RFC 3069, where
+ * the individual ports are not allowed to communicate with each
+ * other, BUT they are allowed to talk to the upstream router. As
+ * described in RFC 3069, it is possible to allow these hosts to
+ * communicate through the upstream router, by proxy_arp'ing.
+ *
+ * RFC 3069: "VLAN Aggregation for Efficient IP Address Allocation"
+ *
+ * This technology is known by different names:
+ * In RFC 3069 it is called VLAN Aggregation.
+ * Cisco and Allied Telesyn call it Private VLAN.
+ * Hewlett-Packard call it Source-Port filtering or port-isolation.
+ * Ericsson call it MAC-Forced Forwarding (RFC Draft).
+ *
+ */
+static inline int arp_fwd_pvlan(struct in_device *in_dev,
+ struct net_device *dev, struct rtable *rt,
+ __be32 sip, __be32 tip)
+{
+ /* Private VLAN is only concerned about the same ethernet segment */
+ if (rt->dst.dev != dev)
+ return 0;
+
+ /* Don't reply on self probes (often done by windowz boxes)*/
+ if (sip == tip)
+ return 0;
+
+ if (IN_DEV_PROXY_ARP_PVLAN(in_dev))
+ return 1;
+ else
+ return 0;
}
/*
@@ -557,42 +539,43 @@ static inline int arp_fwd_proxy(struct in_device *in_dev, struct rtable *rt)
*/
/*
- * Create an arp packet. If (dest_hw == NULL), we create a broadcast
+ * Create an arp packet. If dest_hw is not set, we create a broadcast
* message.
*/
struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,
struct net_device *dev, __be32 src_ip,
- unsigned char *dest_hw, unsigned char *src_hw,
- unsigned char *target_hw)
+ const unsigned char *dest_hw,
+ const unsigned char *src_hw,
+ const unsigned char *target_hw)
{
struct sk_buff *skb;
struct arphdr *arp;
unsigned char *arp_ptr;
+ int hlen = LL_RESERVED_SPACE(dev);
+ int tlen = dev->needed_tailroom;
/*
* Allocate a buffer
*/
-
- skb = alloc_skb(sizeof(struct arphdr)+ 2*(dev->addr_len+4)
- + LL_RESERVED_SPACE(dev), GFP_ATOMIC);
- if (skb == NULL)
+
+ skb = alloc_skb(arp_hdr_len(dev) + hlen + tlen, GFP_ATOMIC);
+ if (!skb)
return NULL;
- skb_reserve(skb, LL_RESERVED_SPACE(dev));
- skb->nh.raw = skb->data;
- arp = (struct arphdr *) skb_put(skb,sizeof(struct arphdr) + 2*(dev->addr_len+4));
+ skb_reserve(skb, hlen);
+ skb_reset_network_header(skb);
+ arp = skb_put(skb, arp_hdr_len(dev));
skb->dev = dev;
skb->protocol = htons(ETH_P_ARP);
- if (src_hw == NULL)
+ if (!src_hw)
src_hw = dev->dev_addr;
- if (dest_hw == NULL)
+ if (!dest_hw)
dest_hw = dev->broadcast;
/*
* Fill the device header for the ARP frame
*/
- if (dev->hard_header &&
- dev->hard_header(skb,dev,ptype,dest_hw,src_hw,skb->len) < 0)
+ if (dev_hard_header(skb, dev, ptype, dest_hw, src_hw, skb->len) < 0)
goto out;
/*
@@ -611,13 +594,13 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,
arp->ar_pro = htons(ETH_P_IP);
break;
-#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
+#if IS_ENABLED(CONFIG_AX25)
case ARPHRD_AX25:
arp->ar_hrd = htons(ARPHRD_AX25);
arp->ar_pro = htons(AX25_P_IP);
break;
-#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE)
+#if IS_ENABLED(CONFIG_NETROM)
case ARPHRD_NETROM:
arp->ar_hrd = htons(ARPHRD_NETROM);
arp->ar_pro = htons(AX25_P_IP);
@@ -625,35 +608,37 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,
#endif
#endif
-#ifdef CONFIG_FDDI
+#if IS_ENABLED(CONFIG_FDDI)
case ARPHRD_FDDI:
arp->ar_hrd = htons(ARPHRD_ETHER);
arp->ar_pro = htons(ETH_P_IP);
break;
#endif
-#ifdef CONFIG_TR
- case ARPHRD_IEEE802_TR:
- arp->ar_hrd = htons(ARPHRD_IEEE802);
- arp->ar_pro = htons(ETH_P_IP);
- break;
-#endif
}
arp->ar_hln = dev->addr_len;
arp->ar_pln = 4;
arp->ar_op = htons(type);
- arp_ptr=(unsigned char *)(arp+1);
+ arp_ptr = (unsigned char *)(arp + 1);
memcpy(arp_ptr, src_hw, dev->addr_len);
- arp_ptr+=dev->addr_len;
- memcpy(arp_ptr, &src_ip,4);
- arp_ptr+=4;
- if (target_hw != NULL)
- memcpy(arp_ptr, target_hw, dev->addr_len);
- else
- memset(arp_ptr, 0, dev->addr_len);
- arp_ptr+=dev->addr_len;
+ arp_ptr += dev->addr_len;
+ memcpy(arp_ptr, &src_ip, 4);
+ arp_ptr += 4;
+
+ switch (dev->type) {
+#if IS_ENABLED(CONFIG_FIREWIRE_NET)
+ case ARPHRD_IEEE1394:
+ break;
+#endif
+ default:
+ if (target_hw)
+ memcpy(arp_ptr, target_hw, dev->addr_len);
+ else
+ memset(arp_ptr, 0, dev->addr_len);
+ arp_ptr += dev->addr_len;
+ }
memcpy(arp_ptr, &dest_ip, 4);
return skb;
@@ -662,90 +647,93 @@ out:
kfree_skb(skb);
return NULL;
}
+EXPORT_SYMBOL(arp_create);
+
+static int arp_xmit_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ return dev_queue_xmit(skb);
+}
/*
* Send an arp packet.
*/
void arp_xmit(struct sk_buff *skb)
{
+ rcu_read_lock();
/* Send it off, maybe filter it using firewalling first. */
- NF_HOOK(NF_ARP, NF_ARP_OUT, skb, NULL, skb->dev, dev_queue_xmit);
+ NF_HOOK(NFPROTO_ARP, NF_ARP_OUT,
+ dev_net_rcu(skb->dev), NULL, skb, NULL, skb->dev,
+ arp_xmit_finish);
+ rcu_read_unlock();
}
+EXPORT_SYMBOL(arp_xmit);
-/*
- * Create and send an arp packet.
- */
-void arp_send(int type, int ptype, __be32 dest_ip,
- struct net_device *dev, __be32 src_ip,
- unsigned char *dest_hw, unsigned char *src_hw,
- unsigned char *target_hw)
+static bool arp_is_garp(struct net *net, struct net_device *dev,
+ int *addr_type, __be16 ar_op,
+ __be32 sip, __be32 tip,
+ unsigned char *sha, unsigned char *tha)
{
- struct sk_buff *skb;
+ bool is_garp = tip == sip;
- /*
- * No arp on this interface.
+ /* Gratuitous ARP _replies_ also require target hwaddr to be
+ * the same as source.
*/
-
- if (dev->flags&IFF_NOARP)
- return;
-
- skb = arp_create(type, ptype, dest_ip, dev, src_ip,
- dest_hw, src_hw, target_hw);
- if (skb == NULL) {
- return;
+ if (is_garp && ar_op == htons(ARPOP_REPLY))
+ is_garp =
+ /* IPv4 over IEEE 1394 doesn't provide target
+ * hardware address field in its ARP payload.
+ */
+ tha &&
+ !memcmp(tha, sha, dev->addr_len);
+
+ if (is_garp) {
+ *addr_type = inet_addr_type_dev_table(net, dev, sip);
+ if (*addr_type != RTN_UNICAST)
+ is_garp = false;
}
-
- arp_xmit(skb);
+ return is_garp;
}
/*
* Process an arp request.
*/
-static int arp_process(struct sk_buff *skb)
+static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct net_device *dev = skb->dev;
- struct in_device *in_dev = in_dev_get(dev);
+ struct in_device *in_dev = __in_dev_get_rcu(dev);
struct arphdr *arp;
unsigned char *arp_ptr;
struct rtable *rt;
- unsigned char *sha, *tha;
+ unsigned char *sha;
+ unsigned char *tha = NULL;
__be32 sip, tip;
u16 dev_type = dev->type;
int addr_type;
struct neighbour *n;
+ struct dst_entry *reply_dst = NULL;
+ bool is_garp = false;
/* arp_rcv below verifies the ARP header and verifies the device
* is ARP'able.
*/
- if (in_dev == NULL)
- goto out;
+ if (!in_dev)
+ goto out_free_skb;
- arp = skb->nh.arph;
+ arp = arp_hdr(skb);
switch (dev_type) {
- default:
+ default:
if (arp->ar_pro != htons(ETH_P_IP) ||
htons(dev_type) != arp->ar_hrd)
- goto out;
+ goto out_free_skb;
break;
-#ifdef CONFIG_NET_ETHERNET
case ARPHRD_ETHER:
-#endif
-#ifdef CONFIG_TR
- case ARPHRD_IEEE802_TR:
-#endif
-#ifdef CONFIG_FDDI
case ARPHRD_FDDI:
-#endif
-#ifdef CONFIG_NET_FC
case ARPHRD_IEEE802:
-#endif
-#if defined(CONFIG_NET_ETHERNET) || defined(CONFIG_TR) || \
- defined(CONFIG_FDDI) || defined(CONFIG_NET_FC)
/*
- * ETHERNET, Token Ring and Fibre Channel (which are IEEE 802
+ * ETHERNET, and Fibre Channel (which are IEEE 802
* devices, according to RFC 2625) devices will accept ARP
* hardware types of either 1 (Ethernet) or 6 (IEEE 802.2).
* This is the case also of FDDI, where the RFC 1390 says that
@@ -756,48 +744,59 @@ static int arp_process(struct sk_buff *skb)
if ((arp->ar_hrd != htons(ARPHRD_ETHER) &&
arp->ar_hrd != htons(ARPHRD_IEEE802)) ||
arp->ar_pro != htons(ETH_P_IP))
- goto out;
+ goto out_free_skb;
break;
-#endif
-#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
case ARPHRD_AX25:
if (arp->ar_pro != htons(AX25_P_IP) ||
arp->ar_hrd != htons(ARPHRD_AX25))
- goto out;
+ goto out_free_skb;
break;
-#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE)
case ARPHRD_NETROM:
if (arp->ar_pro != htons(AX25_P_IP) ||
arp->ar_hrd != htons(ARPHRD_NETROM))
- goto out;
+ goto out_free_skb;
break;
-#endif
-#endif
}
/* Understand only these message types */
if (arp->ar_op != htons(ARPOP_REPLY) &&
arp->ar_op != htons(ARPOP_REQUEST))
- goto out;
+ goto out_free_skb;
/*
* Extract fields
*/
- arp_ptr= (unsigned char *)(arp+1);
+ arp_ptr = (unsigned char *)(arp + 1);
sha = arp_ptr;
arp_ptr += dev->addr_len;
memcpy(&sip, arp_ptr, 4);
arp_ptr += 4;
- tha = arp_ptr;
- arp_ptr += dev->addr_len;
+ switch (dev_type) {
+#if IS_ENABLED(CONFIG_FIREWIRE_NET)
+ case ARPHRD_IEEE1394:
+ break;
+#endif
+ default:
+ tha = arp_ptr;
+ arp_ptr += dev->addr_len;
+ }
memcpy(&tip, arp_ptr, 4);
-/*
+/*
* Check for bad requests for 127.x.x.x and requests for multicast
* addresses. If this is one such, delete it.
*/
- if (LOOPBACK(tip) || MULTICAST(tip))
- goto out;
+ if (ipv4_is_multicast(tip) ||
+ (!IN_DEV_ROUTE_LOCALNET(in_dev) && ipv4_is_loopback(tip)))
+ goto out_free_skb;
+
+ /*
+ * For some 802.11 wireless deployments (and possibly other networks),
+ * there will be an ARP proxy and gratuitous ARP frames are attacks
+ * and thus should not be accepted.
+ */
+ if (sip == tip && IN_DEV_ORCONF(in_dev, DROP_GRATUITOUS_ARP))
+ goto out_free_skb;
/*
* Special case: We must set Frame Relay source Q.922 address
@@ -809,67 +808,80 @@ static int arp_process(struct sk_buff *skb)
* Process entry. The idea here is we want to send a reply if it is a
* request for us or if it is a request for someone else that we hold
* a proxy for. We want to add an entry to our cache if it is a reply
- * to us or if it is a request for our address.
- * (The assumption for this last is that if someone is requesting our
- * address, they are probably intending to talk to us, so it saves time
- * if we cache their address. Their address is also probably not in
+ * to us or if it is a request for our address.
+ * (The assumption for this last is that if someone is requesting our
+ * address, they are probably intending to talk to us, so it saves time
+ * if we cache their address. Their address is also probably not in
* our cache, since ours is not in their cache.)
- *
+ *
* Putting this another way, we only care about replies if they are to
* us, in which case we add them to the cache. For requests, we care
* about those for us and those for our proxies. We reply to both,
- * and in the case of requests for us we add the requester to the arp
+ * and in the case of requests for us we add the requester to the arp
* cache.
*/
+ if (arp->ar_op == htons(ARPOP_REQUEST) && skb_metadata_dst(skb))
+ reply_dst = (struct dst_entry *)
+ iptunnel_metadata_reply(skb_metadata_dst(skb),
+ GFP_ATOMIC);
+
/* Special case: IPv4 duplicate address detection packet (RFC2131) */
if (sip == 0) {
if (arp->ar_op == htons(ARPOP_REQUEST) &&
- inet_addr_type(tip) == RTN_LOCAL &&
- !arp_ignore(in_dev,dev,sip,tip))
- arp_send(ARPOP_REPLY,ETH_P_ARP,tip,dev,tip,sha,dev->dev_addr,dev->dev_addr);
- goto out;
+ inet_addr_type_dev_table(net, dev, tip) == RTN_LOCAL &&
+ !arp_ignore(in_dev, sip, tip))
+ arp_send_dst(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip,
+ sha, dev->dev_addr, sha, reply_dst);
+ goto out_consume_skb;
}
if (arp->ar_op == htons(ARPOP_REQUEST) &&
- ip_route_input(skb, tip, sip, 0, dev) == 0) {
+ ip_route_input_noref(skb, tip, sip, 0, dev) == 0) {
- rt = (struct rtable*)skb->dst;
+ rt = skb_rtable(skb);
addr_type = rt->rt_type;
if (addr_type == RTN_LOCAL) {
- n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
- if (n) {
- int dont_send = 0;
-
- if (!dont_send)
- dont_send |= arp_ignore(in_dev,dev,sip,tip);
- if (!dont_send && IN_DEV_ARPFILTER(in_dev))
- dont_send |= arp_filter(sip,tip,dev);
- if (!dont_send)
- arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha);
-
- neigh_release(n);
+ int dont_send;
+
+ dont_send = arp_ignore(in_dev, sip, tip);
+ if (!dont_send && IN_DEV_ARPFILTER(in_dev))
+ dont_send = arp_filter(sip, tip, dev);
+ if (!dont_send) {
+ n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
+ if (n) {
+ arp_send_dst(ARPOP_REPLY, ETH_P_ARP,
+ sip, dev, tip, sha,
+ dev->dev_addr, sha,
+ reply_dst);
+ neigh_release(n);
+ }
}
- goto out;
+ goto out_consume_skb;
} else if (IN_DEV_FORWARD(in_dev)) {
- if ((rt->rt_flags&RTCF_DNAT) ||
- (addr_type == RTN_UNICAST && rt->u.dst.dev != dev &&
- (arp_fwd_proxy(in_dev, rt) || pneigh_lookup(&arp_tbl, &tip, dev, 0)))) {
+ if (addr_type == RTN_UNICAST &&
+ (arp_fwd_proxy(in_dev, dev, rt) ||
+ arp_fwd_pvlan(in_dev, dev, rt, sip, tip) ||
+ (rt->dst.dev != dev &&
+ pneigh_lookup(&arp_tbl, net, &tip, dev)))) {
n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
if (n)
neigh_release(n);
- if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED ||
+ if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED ||
skb->pkt_type == PACKET_HOST ||
- in_dev->arp_parms->proxy_delay == 0) {
- arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha);
+ NEIGH_VAR(in_dev->arp_parms, PROXY_DELAY) == 0) {
+ arp_send_dst(ARPOP_REPLY, ETH_P_ARP,
+ sip, dev, tip, sha,
+ dev->dev_addr, sha,
+ reply_dst);
} else {
- pneigh_enqueue(&arp_tbl, in_dev->arp_parms, skb);
- in_dev_put(in_dev);
- return 0;
+ pneigh_enqueue(&arp_tbl,
+ in_dev->arp_parms, skb);
+ goto out_free_dst;
}
- goto out;
+ goto out_consume_skb;
}
}
}
@@ -878,15 +890,26 @@ static int arp_process(struct sk_buff *skb)
n = __neigh_lookup(&arp_tbl, &sip, dev, 0);
- if (ipv4_devconf.arp_accept) {
+ addr_type = -1;
+ if (n || arp_accept(in_dev, sip)) {
+ is_garp = arp_is_garp(net, dev, &addr_type, arp->ar_op,
+ sip, tip, sha, tha);
+ }
+
+ if (arp_accept(in_dev, sip)) {
/* Unsolicited ARP is not accepted by default.
It is possible, that this option should be enabled for some
devices (strip is candidate)
*/
- if (n == NULL &&
- arp->ar_op == htons(ARPOP_REPLY) &&
- inet_addr_type(sip) == RTN_UNICAST)
- n = __neigh_lookup(&arp_tbl, &sip, dev, -1);
+ if (!n &&
+ (is_garp ||
+ (arp->ar_op == htons(ARPOP_REPLY) &&
+ (addr_type == RTN_UNICAST ||
+ (addr_type < 0 &&
+ /* postpone calculation to as late as possible */
+ inet_addr_type_dev_table(net, dev, sip) ==
+ RTN_UNICAST)))))
+ n = __neigh_lookup(&arp_tbl, &sip, dev, 1);
}
if (n) {
@@ -898,7 +921,10 @@ static int arp_process(struct sk_buff *skb)
agents are active. Taking the first reply prevents
arp trashing and chooses the fastest router.
*/
- override = time_after(jiffies, n->updated + n->parms->locktime);
+ override = time_after(jiffies,
+ n->updated +
+ NEIGH_VAR(n->parms, LOCKTIME)) ||
+ is_garp;
/* Broadcast replies and request packets
do not assert neighbour reachability.
@@ -906,22 +932,32 @@ static int arp_process(struct sk_buff *skb)
if (arp->ar_op != htons(ARPOP_REPLY) ||
skb->pkt_type != PACKET_HOST)
state = NUD_STALE;
- neigh_update(n, sha, state, override ? NEIGH_UPDATE_F_OVERRIDE : 0);
+ neigh_update(n, sha, state,
+ override ? NEIGH_UPDATE_F_OVERRIDE : 0, 0);
neigh_release(n);
}
-out:
- if (in_dev)
- in_dev_put(in_dev);
+out_consume_skb:
+ consume_skb(skb);
+
+out_free_dst:
+ dst_release(reply_dst);
+ return NET_RX_SUCCESS;
+
+out_free_skb:
kfree_skb(skb);
- return 0;
+ return NET_RX_DROP;
}
static void parp_redo(struct sk_buff *skb)
{
- arp_process(skb);
+ arp_process(dev_net(skb->dev), NULL, skb);
}
+static int arp_is_multicast(const void *pkey)
+{
+ return ipv4_is_multicast(*((__be32 *)pkey));
+}
/*
* Receive an arp request from the device layer.
@@ -930,89 +966,151 @@ static void parp_redo(struct sk_buff *skb)
static int arp_rcv(struct sk_buff *skb, struct net_device *dev,
struct packet_type *pt, struct net_device *orig_dev)
{
- struct arphdr *arp;
+ enum skb_drop_reason drop_reason;
+ const struct arphdr *arp;
+
+ /* do not tweak dropwatch on an ARP we will ignore */
+ if (dev->flags & IFF_NOARP ||
+ skb->pkt_type == PACKET_OTHERHOST ||
+ skb->pkt_type == PACKET_LOOPBACK)
+ goto consumeskb;
+
+ skb = skb_share_check(skb, GFP_ATOMIC);
+ if (!skb)
+ goto out_of_mem;
/* ARP header, plus 2 device addresses, plus 2 IP addresses. */
- if (!pskb_may_pull(skb, (sizeof(struct arphdr) +
- (2 * dev->addr_len) +
- (2 * sizeof(u32)))))
+ drop_reason = pskb_may_pull_reason(skb, arp_hdr_len(dev));
+ if (drop_reason != SKB_NOT_DROPPED_YET)
goto freeskb;
- arp = skb->nh.arph;
- if (arp->ar_hln != dev->addr_len ||
- dev->flags & IFF_NOARP ||
- skb->pkt_type == PACKET_OTHERHOST ||
- skb->pkt_type == PACKET_LOOPBACK ||
- arp->ar_pln != 4)
+ arp = arp_hdr(skb);
+ if (arp->ar_hln != dev->addr_len || arp->ar_pln != 4) {
+ drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
goto freeskb;
-
- if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
- goto out_of_mem;
+ }
memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb));
- return NF_HOOK(NF_ARP, NF_ARP_IN, skb, dev, NULL, arp_process);
+ return NF_HOOK(NFPROTO_ARP, NF_ARP_IN,
+ dev_net(dev), NULL, skb, dev, NULL,
+ arp_process);
+consumeskb:
+ consume_skb(skb);
+ return NET_RX_SUCCESS;
freeskb:
- kfree_skb(skb);
+ kfree_skb_reason(skb, drop_reason);
out_of_mem:
- return 0;
+ return NET_RX_DROP;
}
/*
* User level interface (ioctl)
*/
+static struct net_device *arp_req_dev_by_name(struct net *net, struct arpreq *r,
+ bool getarp)
+{
+ struct net_device *dev;
+
+ if (getarp)
+ dev = dev_get_by_name_rcu(net, r->arp_dev);
+ else
+ dev = __dev_get_by_name(net, r->arp_dev);
+ if (!dev)
+ return ERR_PTR(-ENODEV);
+
+ /* Mmmm... It is wrong... ARPHRD_NETROM == 0 */
+ if (!r->arp_ha.sa_family)
+ r->arp_ha.sa_family = dev->type;
+
+ if ((r->arp_flags & ATF_COM) && r->arp_ha.sa_family != dev->type)
+ return ERR_PTR(-EINVAL);
+
+ return dev;
+}
+
+static struct net_device *arp_req_dev(struct net *net, struct arpreq *r)
+{
+ struct net_device *dev;
+ struct rtable *rt;
+ __be32 ip;
+
+ if (r->arp_dev[0])
+ return arp_req_dev_by_name(net, r, false);
+
+ if (r->arp_flags & ATF_PUBL)
+ return NULL;
+
+ ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
+
+ rt = ip_route_output(net, ip, 0, 0, 0, RT_SCOPE_LINK);
+ if (IS_ERR(rt))
+ return ERR_CAST(rt);
+
+ dev = rt->dst.dev;
+ ip_rt_put(rt);
+
+ if (!dev)
+ return ERR_PTR(-EINVAL);
+
+ return dev;
+}
+
/*
* Set (create) an ARP cache entry.
*/
-static int arp_req_set(struct arpreq *r, struct net_device * dev)
+static int arp_req_set_proxy(struct net *net, struct net_device *dev, int on)
{
- __be32 ip = ((struct sockaddr_in *) &r->arp_pa)->sin_addr.s_addr;
- struct neighbour *neigh;
- int err;
-
- if (r->arp_flags&ATF_PUBL) {
- __be32 mask = ((struct sockaddr_in *) &r->arp_netmask)->sin_addr.s_addr;
- if (mask && mask != htonl(0xFFFFFFFF))
- return -EINVAL;
- if (!dev && (r->arp_flags & ATF_COM)) {
- dev = dev_getbyhwaddr(r->arp_ha.sa_family, r->arp_ha.sa_data);
- if (!dev)
- return -ENODEV;
- }
- if (mask) {
- if (pneigh_lookup(&arp_tbl, &ip, dev, 1) == NULL)
- return -ENOBUFS;
- return 0;
- }
- if (dev == NULL) {
- ipv4_devconf.proxy_arp = 1;
- return 0;
- }
- if (__in_dev_get_rtnl(dev)) {
- __in_dev_get_rtnl(dev)->cnf.proxy_arp = 1;
- return 0;
- }
- return -ENXIO;
+ if (!dev) {
+ IPV4_DEVCONF_ALL(net, PROXY_ARP) = on;
+ return 0;
+ }
+ if (__in_dev_get_rtnl_net(dev)) {
+ IN_DEV_CONF_SET(__in_dev_get_rtnl_net(dev), PROXY_ARP, on);
+ return 0;
}
+ return -ENXIO;
+}
+
+static int arp_req_set_public(struct net *net, struct arpreq *r,
+ struct net_device *dev)
+{
+ __be32 mask = ((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr;
- if (r->arp_flags & ATF_PERM)
- r->arp_flags |= ATF_COM;
- if (dev == NULL) {
- struct flowi fl = { .nl_u = { .ip4_u = { .daddr = ip,
- .tos = RTO_ONLINK } } };
- struct rtable * rt;
- if ((err = ip_route_output_key(&rt, &fl)) != 0)
- return err;
- dev = rt->u.dst.dev;
- ip_rt_put(rt);
+ if (!dev && (r->arp_flags & ATF_COM)) {
+ dev = dev_getbyhwaddr(net, r->arp_ha.sa_family,
+ r->arp_ha.sa_data);
if (!dev)
- return -EINVAL;
+ return -ENODEV;
+ }
+ if (mask) {
+ __be32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
+
+ return pneigh_create(&arp_tbl, net, &ip, dev, 0, 0, false);
}
+
+ return arp_req_set_proxy(net, dev, 1);
+}
+
+static int arp_req_set(struct net *net, struct arpreq *r)
+{
+ struct neighbour *neigh;
+ struct net_device *dev;
+ __be32 ip;
+ int err;
+
+ dev = arp_req_dev(net, r);
+ if (IS_ERR(dev))
+ return PTR_ERR(dev);
+
+ if (r->arp_flags & ATF_PUBL)
+ return arp_req_set_public(net, r, dev);
+
switch (dev->type) {
-#ifdef CONFIG_FDDI
+#if IS_ENABLED(CONFIG_FDDI)
case ARPHRD_FDDI:
/*
* According to RFC 1390, FDDI devices should accept ARP
@@ -1032,179 +1130,221 @@ static int arp_req_set(struct arpreq *r, struct net_device * dev)
break;
}
+ ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
+
neigh = __neigh_lookup_errno(&arp_tbl, &ip, dev);
err = PTR_ERR(neigh);
if (!IS_ERR(neigh)) {
- unsigned state = NUD_STALE;
- if (r->arp_flags & ATF_PERM)
+ unsigned int state = NUD_STALE;
+
+ if (r->arp_flags & ATF_PERM) {
+ r->arp_flags |= ATF_COM;
state = NUD_PERMANENT;
- err = neigh_update(neigh, (r->arp_flags&ATF_COM) ?
- r->arp_ha.sa_data : NULL, state,
- NEIGH_UPDATE_F_OVERRIDE|
- NEIGH_UPDATE_F_ADMIN);
+ }
+
+ err = neigh_update(neigh, (r->arp_flags & ATF_COM) ?
+ r->arp_ha.sa_data : NULL, state,
+ NEIGH_UPDATE_F_OVERRIDE |
+ NEIGH_UPDATE_F_ADMIN, 0);
neigh_release(neigh);
}
return err;
}
-static unsigned arp_state_to_flags(struct neighbour *neigh)
+static unsigned int arp_state_to_flags(struct neighbour *neigh)
{
- unsigned flags = 0;
if (neigh->nud_state&NUD_PERMANENT)
- flags = ATF_PERM|ATF_COM;
+ return ATF_PERM | ATF_COM;
else if (neigh->nud_state&NUD_VALID)
- flags = ATF_COM;
- return flags;
+ return ATF_COM;
+ else
+ return 0;
}
/*
* Get an ARP cache entry.
*/
-static int arp_req_get(struct arpreq *r, struct net_device *dev)
+static int arp_req_get(struct net *net, struct arpreq *r)
{
__be32 ip = ((struct sockaddr_in *) &r->arp_pa)->sin_addr.s_addr;
struct neighbour *neigh;
- int err = -ENXIO;
+ struct net_device *dev;
+
+ if (!r->arp_dev[0])
+ return -ENODEV;
+
+ dev = arp_req_dev_by_name(net, r, true);
+ if (IS_ERR(dev))
+ return PTR_ERR(dev);
neigh = neigh_lookup(&arp_tbl, &ip, dev);
- if (neigh) {
- read_lock_bh(&neigh->lock);
- memcpy(r->arp_ha.sa_data, neigh->ha, dev->addr_len);
- r->arp_flags = arp_state_to_flags(neigh);
- read_unlock_bh(&neigh->lock);
- r->arp_ha.sa_family = dev->type;
- strlcpy(r->arp_dev, dev->name, sizeof(r->arp_dev));
+ if (!neigh)
+ return -ENXIO;
+
+ if (READ_ONCE(neigh->nud_state) & NUD_NOARP) {
neigh_release(neigh);
- err = 0;
+ return -ENXIO;
}
- return err;
+
+ read_lock_bh(&neigh->lock);
+ memcpy(r->arp_ha.sa_data, neigh->ha,
+ min(dev->addr_len, sizeof(r->arp_ha.sa_data)));
+ r->arp_flags = arp_state_to_flags(neigh);
+ read_unlock_bh(&neigh->lock);
+
+ neigh_release(neigh);
+
+ r->arp_ha.sa_family = dev->type;
+ netdev_copy_name(dev, r->arp_dev);
+
+ return 0;
}
-static int arp_req_delete(struct arpreq *r, struct net_device * dev)
+int arp_invalidate(struct net_device *dev, __be32 ip, bool force)
{
- int err;
- __be32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
- struct neighbour *neigh;
+ struct neighbour *neigh = neigh_lookup(&arp_tbl, &ip, dev);
+ int err = -ENXIO;
+ struct neigh_table *tbl = &arp_tbl;
- if (r->arp_flags & ATF_PUBL) {
- __be32 mask =
- ((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr;
- if (mask == htonl(0xFFFFFFFF))
- return pneigh_delete(&arp_tbl, &ip, dev);
- if (mask == 0) {
- if (dev == NULL) {
- ipv4_devconf.proxy_arp = 0;
- return 0;
- }
- if (__in_dev_get_rtnl(dev)) {
- __in_dev_get_rtnl(dev)->cnf.proxy_arp = 0;
- return 0;
- }
- return -ENXIO;
+ if (neigh) {
+ if ((READ_ONCE(neigh->nud_state) & NUD_VALID) && !force) {
+ neigh_release(neigh);
+ return 0;
}
- return -EINVAL;
- }
- if (dev == NULL) {
- struct flowi fl = { .nl_u = { .ip4_u = { .daddr = ip,
- .tos = RTO_ONLINK } } };
- struct rtable * rt;
- if ((err = ip_route_output_key(&rt, &fl)) != 0)
- return err;
- dev = rt->u.dst.dev;
- ip_rt_put(rt);
- if (!dev)
- return -EINVAL;
- }
- err = -ENXIO;
- neigh = neigh_lookup(&arp_tbl, &ip, dev);
- if (neigh) {
- if (neigh->nud_state&~NUD_NOARP)
- err = neigh_update(neigh, NULL, NUD_FAILED,
+ if (READ_ONCE(neigh->nud_state) & ~NUD_NOARP)
+ err = neigh_update(neigh, NULL, NUD_FAILED,
NEIGH_UPDATE_F_OVERRIDE|
- NEIGH_UPDATE_F_ADMIN);
+ NEIGH_UPDATE_F_ADMIN, 0);
+ spin_lock_bh(&tbl->lock);
neigh_release(neigh);
+ neigh_remove_one(neigh);
+ spin_unlock_bh(&tbl->lock);
}
+
return err;
}
+static int arp_req_delete_public(struct net *net, struct arpreq *r,
+ struct net_device *dev)
+{
+ __be32 mask = ((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr;
+
+ if (mask) {
+ __be32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
+
+ return pneigh_delete(&arp_tbl, net, &ip, dev);
+ }
+
+ return arp_req_set_proxy(net, dev, 0);
+}
+
+static int arp_req_delete(struct net *net, struct arpreq *r)
+{
+ struct net_device *dev;
+ __be32 ip;
+
+ dev = arp_req_dev(net, r);
+ if (IS_ERR(dev))
+ return PTR_ERR(dev);
+
+ if (r->arp_flags & ATF_PUBL)
+ return arp_req_delete_public(net, r, dev);
+
+ ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
+
+ return arp_invalidate(dev, ip, true);
+}
+
/*
* Handle an ARP layer I/O control request.
*/
-int arp_ioctl(unsigned int cmd, void __user *arg)
+int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg)
{
- int err;
struct arpreq r;
- struct net_device *dev = NULL;
+ __be32 *netmask;
+ int err;
switch (cmd) {
- case SIOCDARP:
- case SIOCSARP:
- if (!capable(CAP_NET_ADMIN))
- return -EPERM;
- case SIOCGARP:
- err = copy_from_user(&r, arg, sizeof(struct arpreq));
- if (err)
- return -EFAULT;
- break;
- default:
- return -EINVAL;
+ case SIOCDARP:
+ case SIOCSARP:
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+ fallthrough;
+ case SIOCGARP:
+ err = copy_from_user(&r, arg, sizeof(struct arpreq));
+ if (err)
+ return -EFAULT;
+ break;
+ default:
+ return -EINVAL;
}
if (r.arp_pa.sa_family != AF_INET)
return -EPFNOSUPPORT;
if (!(r.arp_flags & ATF_PUBL) &&
- (r.arp_flags & (ATF_NETMASK|ATF_DONTPUB)))
+ (r.arp_flags & (ATF_NETMASK | ATF_DONTPUB)))
return -EINVAL;
+
+ netmask = &((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr;
if (!(r.arp_flags & ATF_NETMASK))
- ((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr =
- htonl(0xFFFFFFFFUL);
- rtnl_lock();
- if (r.arp_dev[0]) {
- err = -ENODEV;
- if ((dev = __dev_get_by_name(r.arp_dev)) == NULL)
- goto out;
-
- /* Mmmm... It is wrong... ARPHRD_NETROM==0 */
- if (!r.arp_ha.sa_family)
- r.arp_ha.sa_family = dev->type;
- err = -EINVAL;
- if ((r.arp_flags & ATF_COM) && r.arp_ha.sa_family != dev->type)
- goto out;
- } else if (cmd == SIOCGARP) {
- err = -ENODEV;
- goto out;
- }
+ *netmask = htonl(0xFFFFFFFFUL);
+ else if (*netmask && *netmask != htonl(0xFFFFFFFFUL))
+ return -EINVAL;
- switch(cmd) {
+ switch (cmd) {
case SIOCDARP:
- err = arp_req_delete(&r, dev);
+ rtnl_net_lock(net);
+ err = arp_req_delete(net, &r);
+ rtnl_net_unlock(net);
break;
case SIOCSARP:
- err = arp_req_set(&r, dev);
+ rtnl_net_lock(net);
+ err = arp_req_set(net, &r);
+ rtnl_net_unlock(net);
break;
case SIOCGARP:
- err = arp_req_get(&r, dev);
+ rcu_read_lock();
+ err = arp_req_get(net, &r);
+ rcu_read_unlock();
+
if (!err && copy_to_user(arg, &r, sizeof(r)))
err = -EFAULT;
break;
}
-out:
- rtnl_unlock();
+
return err;
}
-static int arp_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
+static int arp_netdev_event(struct notifier_block *this, unsigned long event,
+ void *ptr)
{
- struct net_device *dev = ptr;
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct netdev_notifier_change_info *change_info;
+ struct in_device *in_dev;
+ bool evict_nocarrier;
switch (event) {
case NETDEV_CHANGEADDR:
neigh_changeaddr(&arp_tbl, dev);
- rt_cache_flush(0);
+ rt_cache_flush(dev_net(dev));
+ break;
+ case NETDEV_CHANGE:
+ change_info = ptr;
+ if (change_info->flags_changed & IFF_NOARP)
+ neigh_changeaddr(&arp_tbl, dev);
+
+ in_dev = __in_dev_get_rtnl(dev);
+ if (!in_dev)
+ evict_nocarrier = true;
+ else
+ evict_nocarrier = IN_DEV_ARP_EVICT_NOCARRIER(in_dev);
+
+ if (evict_nocarrier && !netif_carrier_ok(dev))
+ neigh_carrier_down(&arp_tbl, dev);
break;
default:
break;
@@ -1231,34 +1371,18 @@ void arp_ifdown(struct net_device *dev)
* Called once on startup.
*/
-static struct packet_type arp_packet_type = {
- .type = __constant_htons(ETH_P_ARP),
+static struct packet_type arp_packet_type __read_mostly = {
+ .type = cpu_to_be16(ETH_P_ARP),
.func = arp_rcv,
};
-static int arp_proc_init(void);
-
-void __init arp_init(void)
-{
- neigh_table_init(&arp_tbl);
-
- dev_add_pack(&arp_packet_type);
- arp_proc_init();
-#ifdef CONFIG_SYSCTL
- neigh_sysctl_register(NULL, &arp_tbl.parms, NET_IPV4,
- NET_IPV4_NEIGH, "ipv4", NULL, NULL);
-#endif
- register_netdevice_notifier(&arp_netdev_notifier);
-}
-
#ifdef CONFIG_PROC_FS
-#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
+#if IS_ENABLED(CONFIG_AX25)
-/* ------------------------------------------------------------------------ */
/*
* ax25 -> ASCII conversion
*/
-static char *ax2asc2(ax25_address *a, char *buf)
+static void ax2asc2(ax25_address *a, char *buf)
{
char c, *s;
int n;
@@ -1266,24 +1390,24 @@ static char *ax2asc2(ax25_address *a, char *buf)
for (n = 0, s = buf; n < 6; n++) {
c = (a->ax25_call[n] >> 1) & 0x7F;
- if (c != ' ') *s++ = c;
+ if (c != ' ')
+ *s++ = c;
}
-
- *s++ = '-';
- if ((n = ((a->ax25_call[6] >> 1) & 0x0F)) > 9) {
+ *s++ = '-';
+ n = (a->ax25_call[6] >> 1) & 0x0F;
+ if (n > 9) {
*s++ = '1';
n -= 10;
}
-
+
*s++ = n + '0';
*s++ = '\0';
- if (*buf == '\0' || *buf == '-')
- return "*";
-
- return buf;
-
+ if (*buf == '\0' || *buf == '-') {
+ buf[0] = '*';
+ buf[1] = '\0';
+ }
}
#endif /* CONFIG_AX25 */
@@ -1293,7 +1417,6 @@ static void arp_format_neigh_entry(struct seq_file *seq,
struct neighbour *n)
{
char hbuffer[HBUFFERLEN];
- const char hexbuf[] = "0123456789ABCDEF";
int k, j;
char tbuf[16];
struct net_device *dev = n->dev;
@@ -1301,22 +1424,24 @@ static void arp_format_neigh_entry(struct seq_file *seq,
read_lock(&n->lock);
/* Convert hardware address to XX:XX:XX:XX ... form. */
-#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
+#if IS_ENABLED(CONFIG_AX25)
if (hatype == ARPHRD_AX25 || hatype == ARPHRD_NETROM)
ax2asc2((ax25_address *)n->ha, hbuffer);
else {
#endif
for (k = 0, j = 0; k < HBUFFERLEN - 3 && j < dev->addr_len; j++) {
- hbuffer[k++] = hexbuf[(n->ha[j] >> 4) & 15];
- hbuffer[k++] = hexbuf[n->ha[j] & 15];
+ hbuffer[k++] = hex_asc_hi(n->ha[j]);
+ hbuffer[k++] = hex_asc_lo(n->ha[j]);
hbuffer[k++] = ':';
}
- hbuffer[--k] = 0;
-#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
+ if (k != 0)
+ --k;
+ hbuffer[k] = 0;
+#if IS_ENABLED(CONFIG_AX25)
}
#endif
- sprintf(tbuf, "%u.%u.%u.%u", NIPQUAD(*(u32*)n->primary_key));
- seq_printf(seq, "%-16s 0x%-10x0x%-10x%s * %s\n",
+ sprintf(tbuf, "%pI4", n->primary_key);
+ seq_printf(seq, "%-16s 0x%-10x0x%-10x%-17s * %s\n",
tbuf, hatype, arp_state_to_flags(n), hbuffer, dev->name);
read_unlock(&n->lock);
}
@@ -1328,7 +1453,7 @@ static void arp_format_pneigh_entry(struct seq_file *seq,
int hatype = dev ? dev->type : 0;
char tbuf[16];
- sprintf(tbuf, "%u.%u.%u.%u", NIPQUAD(*(u32*)n->key));
+ sprintf(tbuf, "%pI4", n->key);
seq_printf(seq, "%-16s 0x%-10x0x%-10x%s * %s\n",
tbuf, hatype, ATF_PUBL | ATF_PERM, "00:00:00:00:00:00",
dev ? dev->name : "*");
@@ -1359,68 +1484,40 @@ static void *arp_seq_start(struct seq_file *seq, loff_t *pos)
return neigh_seq_start(seq, pos, &arp_tbl, NEIGH_SEQ_SKIP_NOARP);
}
-/* ------------------------------------------------------------------------ */
-
-static struct seq_operations arp_seq_ops = {
- .start = arp_seq_start,
- .next = neigh_seq_next,
- .stop = neigh_seq_stop,
- .show = arp_seq_show,
-};
-
-static int arp_seq_open(struct inode *inode, struct file *file)
-{
- struct seq_file *seq;
- int rc = -ENOMEM;
- struct neigh_seq_state *s = kzalloc(sizeof(*s), GFP_KERNEL);
-
- if (!s)
- goto out;
-
- rc = seq_open(file, &arp_seq_ops);
- if (rc)
- goto out_kfree;
-
- seq = file->private_data;
- seq->private = s;
-out:
- return rc;
-out_kfree:
- kfree(s);
- goto out;
-}
-
-static struct file_operations arp_seq_fops = {
- .owner = THIS_MODULE,
- .open = arp_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release_private,
+static const struct seq_operations arp_seq_ops = {
+ .start = arp_seq_start,
+ .next = neigh_seq_next,
+ .stop = neigh_seq_stop,
+ .show = arp_seq_show,
};
+#endif /* CONFIG_PROC_FS */
-static int __init arp_proc_init(void)
+static int __net_init arp_net_init(struct net *net)
{
- if (!proc_net_fops_create("arp", S_IRUGO, &arp_seq_fops))
+ if (!proc_create_net("arp", 0444, net->proc_net, &arp_seq_ops,
+ sizeof(struct neigh_seq_state)))
return -ENOMEM;
return 0;
}
-#else /* CONFIG_PROC_FS */
-
-static int __init arp_proc_init(void)
+static void __net_exit arp_net_exit(struct net *net)
{
- return 0;
+ remove_proc_entry("arp", net->proc_net);
}
-#endif /* CONFIG_PROC_FS */
+static struct pernet_operations arp_net_ops = {
+ .init = arp_net_init,
+ .exit = arp_net_exit,
+};
-EXPORT_SYMBOL(arp_broken_ops);
-EXPORT_SYMBOL(arp_find);
-EXPORT_SYMBOL(arp_create);
-EXPORT_SYMBOL(arp_xmit);
-EXPORT_SYMBOL(arp_send);
-EXPORT_SYMBOL(arp_tbl);
+void __init arp_init(void)
+{
+ neigh_table_init(NEIGH_ARP_TABLE, &arp_tbl);
-#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
-EXPORT_SYMBOL(clip_tbl_hook);
+ dev_add_pack(&arp_packet_type);
+ register_pernet_subsys(&arp_net_ops);
+#ifdef CONFIG_SYSCTL
+ neigh_sysctl_register(NULL, &arp_tbl.parms, NULL);
#endif
+ register_netdevice_notifier(&arp_netdev_notifier);
+}
diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
new file mode 100644
index 000000000000..e01492234b0b
--- /dev/null
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -0,0 +1,349 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019 Facebook */
+
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/bpf_verifier.h>
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/btf_ids.h>
+#include <linux/filter.h>
+#include <net/tcp.h>
+#include <net/bpf_sk_storage.h>
+
+/* "extern" is to avoid sparse warning. It is only used in bpf_struct_ops.c. */
+static struct bpf_struct_ops bpf_tcp_congestion_ops;
+
+static const struct btf_type *tcp_sock_type;
+static u32 tcp_sock_id, sock_id;
+static const struct btf_type *tcp_congestion_ops_type;
+
+static int bpf_tcp_ca_init(struct btf *btf)
+{
+ s32 type_id;
+
+ type_id = btf_find_by_name_kind(btf, "sock", BTF_KIND_STRUCT);
+ if (type_id < 0)
+ return -EINVAL;
+ sock_id = type_id;
+
+ type_id = btf_find_by_name_kind(btf, "tcp_sock", BTF_KIND_STRUCT);
+ if (type_id < 0)
+ return -EINVAL;
+ tcp_sock_id = type_id;
+ tcp_sock_type = btf_type_by_id(btf, tcp_sock_id);
+
+ type_id = btf_find_by_name_kind(btf, "tcp_congestion_ops", BTF_KIND_STRUCT);
+ if (type_id < 0)
+ return -EINVAL;
+ tcp_congestion_ops_type = btf_type_by_id(btf, type_id);
+
+ return 0;
+}
+
+static bool bpf_tcp_ca_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ if (!bpf_tracing_btf_ctx_access(off, size, type, prog, info))
+ return false;
+
+ if (base_type(info->reg_type) == PTR_TO_BTF_ID &&
+ !bpf_type_has_unsafe_modifiers(info->reg_type) &&
+ info->btf_id == sock_id)
+ /* promote it to tcp_sock */
+ info->btf_id = tcp_sock_id;
+
+ return true;
+}
+
+static int bpf_tcp_ca_btf_struct_access(struct bpf_verifier_log *log,
+ const struct bpf_reg_state *reg,
+ int off, int size)
+{
+ const struct btf_type *t;
+ size_t end;
+
+ t = btf_type_by_id(reg->btf, reg->btf_id);
+ if (t != tcp_sock_type) {
+ bpf_log(log, "only read is supported\n");
+ return -EACCES;
+ }
+
+ switch (off) {
+ case offsetof(struct sock, sk_pacing_rate):
+ end = offsetofend(struct sock, sk_pacing_rate);
+ break;
+ case offsetof(struct sock, sk_pacing_status):
+ end = offsetofend(struct sock, sk_pacing_status);
+ break;
+ case bpf_ctx_range(struct inet_connection_sock, icsk_ca_priv):
+ end = offsetofend(struct inet_connection_sock, icsk_ca_priv);
+ break;
+ case offsetof(struct inet_connection_sock, icsk_ack.pending):
+ end = offsetofend(struct inet_connection_sock,
+ icsk_ack.pending);
+ break;
+ case offsetof(struct tcp_sock, snd_cwnd):
+ end = offsetofend(struct tcp_sock, snd_cwnd);
+ break;
+ case offsetof(struct tcp_sock, snd_cwnd_cnt):
+ end = offsetofend(struct tcp_sock, snd_cwnd_cnt);
+ break;
+ case offsetof(struct tcp_sock, snd_cwnd_stamp):
+ end = offsetofend(struct tcp_sock, snd_cwnd_stamp);
+ break;
+ case offsetof(struct tcp_sock, snd_ssthresh):
+ end = offsetofend(struct tcp_sock, snd_ssthresh);
+ break;
+ case offsetof(struct tcp_sock, ecn_flags):
+ end = offsetofend(struct tcp_sock, ecn_flags);
+ break;
+ case offsetof(struct tcp_sock, app_limited):
+ end = offsetofend(struct tcp_sock, app_limited);
+ break;
+ default:
+ bpf_log(log, "no write support to tcp_sock at off %d\n", off);
+ return -EACCES;
+ }
+
+ if (off + size > end) {
+ bpf_log(log,
+ "write access at off %d with size %d beyond the member of tcp_sock ended at %zu\n",
+ off, size, end);
+ return -EACCES;
+ }
+
+ return 0;
+}
+
+BPF_CALL_2(bpf_tcp_send_ack, struct tcp_sock *, tp, u32, rcv_nxt)
+{
+ /* bpf_tcp_ca prog cannot have NULL tp */
+ __tcp_send_ack((struct sock *)tp, rcv_nxt, 0);
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_tcp_send_ack_proto = {
+ .func = bpf_tcp_send_ack,
+ .gpl_only = false,
+ /* In case we want to report error later */
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg1_btf_id = &tcp_sock_id,
+ .arg2_type = ARG_ANYTHING,
+};
+
+static u32 prog_ops_moff(const struct bpf_prog *prog)
+{
+ const struct btf_member *m;
+ const struct btf_type *t;
+ u32 midx;
+
+ midx = prog->expected_attach_type;
+ t = tcp_congestion_ops_type;
+ m = &btf_type_member(t)[midx];
+
+ return __btf_member_bit_offset(t, m) / 8;
+}
+
+static const struct bpf_func_proto *
+bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id,
+ const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ case BPF_FUNC_tcp_send_ack:
+ return &bpf_tcp_send_ack_proto;
+ case BPF_FUNC_sk_storage_get:
+ return &bpf_sk_storage_get_proto;
+ case BPF_FUNC_sk_storage_delete:
+ return &bpf_sk_storage_delete_proto;
+ case BPF_FUNC_setsockopt:
+ /* Does not allow release() to call setsockopt.
+ * release() is called when the current bpf-tcp-cc
+ * is retiring. It is not allowed to call
+ * setsockopt() to make further changes which
+ * may potentially allocate new resources.
+ */
+ if (prog_ops_moff(prog) !=
+ offsetof(struct tcp_congestion_ops, release))
+ return &bpf_sk_setsockopt_proto;
+ return NULL;
+ case BPF_FUNC_getsockopt:
+ /* Since get/setsockopt is usually expected to
+ * be available together, disable getsockopt for
+ * release also to avoid usage surprise.
+ * The bpf-tcp-cc already has a more powerful way
+ * to read tcp_sock from the PTR_TO_BTF_ID.
+ */
+ if (prog_ops_moff(prog) !=
+ offsetof(struct tcp_congestion_ops, release))
+ return &bpf_sk_getsockopt_proto;
+ return NULL;
+ case BPF_FUNC_ktime_get_coarse_ns:
+ return &bpf_ktime_get_coarse_ns_proto;
+ default:
+ return bpf_base_func_proto(func_id, prog);
+ }
+}
+
+BTF_KFUNCS_START(bpf_tcp_ca_check_kfunc_ids)
+BTF_ID_FLAGS(func, tcp_reno_ssthresh)
+BTF_ID_FLAGS(func, tcp_reno_cong_avoid)
+BTF_ID_FLAGS(func, tcp_reno_undo_cwnd)
+BTF_ID_FLAGS(func, tcp_slow_start)
+BTF_ID_FLAGS(func, tcp_cong_avoid_ai)
+BTF_KFUNCS_END(bpf_tcp_ca_check_kfunc_ids)
+
+static const struct btf_kfunc_id_set bpf_tcp_ca_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &bpf_tcp_ca_check_kfunc_ids,
+};
+
+static const struct bpf_verifier_ops bpf_tcp_ca_verifier_ops = {
+ .get_func_proto = bpf_tcp_ca_get_func_proto,
+ .is_valid_access = bpf_tcp_ca_is_valid_access,
+ .btf_struct_access = bpf_tcp_ca_btf_struct_access,
+};
+
+static int bpf_tcp_ca_init_member(const struct btf_type *t,
+ const struct btf_member *member,
+ void *kdata, const void *udata)
+{
+ const struct tcp_congestion_ops *utcp_ca;
+ struct tcp_congestion_ops *tcp_ca;
+ u32 moff;
+
+ utcp_ca = (const struct tcp_congestion_ops *)udata;
+ tcp_ca = (struct tcp_congestion_ops *)kdata;
+
+ moff = __btf_member_bit_offset(t, member) / 8;
+ switch (moff) {
+ case offsetof(struct tcp_congestion_ops, flags):
+ if (utcp_ca->flags & ~TCP_CONG_MASK)
+ return -EINVAL;
+ tcp_ca->flags = utcp_ca->flags;
+ return 1;
+ case offsetof(struct tcp_congestion_ops, name):
+ if (bpf_obj_name_cpy(tcp_ca->name, utcp_ca->name,
+ sizeof(tcp_ca->name)) <= 0)
+ return -EINVAL;
+ return 1;
+ }
+
+ return 0;
+}
+
+static int bpf_tcp_ca_reg(void *kdata, struct bpf_link *link)
+{
+ return tcp_register_congestion_control(kdata);
+}
+
+static void bpf_tcp_ca_unreg(void *kdata, struct bpf_link *link)
+{
+ tcp_unregister_congestion_control(kdata);
+}
+
+static int bpf_tcp_ca_update(void *kdata, void *old_kdata, struct bpf_link *link)
+{
+ return tcp_update_congestion_control(kdata, old_kdata);
+}
+
+static int bpf_tcp_ca_validate(void *kdata)
+{
+ return tcp_validate_congestion_control(kdata);
+}
+
+static u32 bpf_tcp_ca_ssthresh(struct sock *sk)
+{
+ return 0;
+}
+
+static void bpf_tcp_ca_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+{
+}
+
+static void bpf_tcp_ca_set_state(struct sock *sk, u8 new_state)
+{
+}
+
+static void bpf_tcp_ca_cwnd_event(struct sock *sk, enum tcp_ca_event ev)
+{
+}
+
+static void bpf_tcp_ca_in_ack_event(struct sock *sk, u32 flags)
+{
+}
+
+static void bpf_tcp_ca_pkts_acked(struct sock *sk, const struct ack_sample *sample)
+{
+}
+
+static u32 bpf_tcp_ca_min_tso_segs(struct sock *sk)
+{
+ return 0;
+}
+
+static void bpf_tcp_ca_cong_control(struct sock *sk, u32 ack, int flag,
+ const struct rate_sample *rs)
+{
+}
+
+static u32 bpf_tcp_ca_undo_cwnd(struct sock *sk)
+{
+ return 0;
+}
+
+static u32 bpf_tcp_ca_sndbuf_expand(struct sock *sk)
+{
+ return 0;
+}
+
+static void __bpf_tcp_ca_init(struct sock *sk)
+{
+}
+
+static void __bpf_tcp_ca_release(struct sock *sk)
+{
+}
+
+static struct tcp_congestion_ops __bpf_ops_tcp_congestion_ops = {
+ .ssthresh = bpf_tcp_ca_ssthresh,
+ .cong_avoid = bpf_tcp_ca_cong_avoid,
+ .set_state = bpf_tcp_ca_set_state,
+ .cwnd_event = bpf_tcp_ca_cwnd_event,
+ .in_ack_event = bpf_tcp_ca_in_ack_event,
+ .pkts_acked = bpf_tcp_ca_pkts_acked,
+ .min_tso_segs = bpf_tcp_ca_min_tso_segs,
+ .cong_control = bpf_tcp_ca_cong_control,
+ .undo_cwnd = bpf_tcp_ca_undo_cwnd,
+ .sndbuf_expand = bpf_tcp_ca_sndbuf_expand,
+
+ .init = __bpf_tcp_ca_init,
+ .release = __bpf_tcp_ca_release,
+};
+
+static struct bpf_struct_ops bpf_tcp_congestion_ops = {
+ .verifier_ops = &bpf_tcp_ca_verifier_ops,
+ .reg = bpf_tcp_ca_reg,
+ .unreg = bpf_tcp_ca_unreg,
+ .update = bpf_tcp_ca_update,
+ .init_member = bpf_tcp_ca_init_member,
+ .init = bpf_tcp_ca_init,
+ .validate = bpf_tcp_ca_validate,
+ .name = "tcp_congestion_ops",
+ .cfi_stubs = &__bpf_ops_tcp_congestion_ops,
+ .owner = THIS_MODULE,
+};
+
+static int __init bpf_tcp_ca_kfunc_init(void)
+{
+ int ret;
+
+ ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &bpf_tcp_ca_kfunc_set);
+ ret = ret ?: register_bpf_struct_ops(&bpf_tcp_congestion_ops, tcp_congestion_ops);
+
+ return ret;
+}
+late_initcall(bpf_tcp_ca_kfunc_init);
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index f3957cfaed42..709021197e1c 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -1,34 +1,24 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* CIPSO - Commercial IP Security Option
*
* This is an implementation of the CIPSO 2.2 protocol as specified in
* draft-ietf-cipso-ipsecurity-01.txt with additional tag types as found in
- * FIPS-188, copies of both documents can be found in the Documentation
- * directory. While CIPSO never became a full IETF RFC standard many vendors
+ * FIPS-188. While CIPSO never became a full IETF RFC standard many vendors
* have chosen to adopt the protocol and over the years it has become a
* de-facto standard for labeled networking.
*
- * Author: Paul Moore <paul.moore@hp.com>
+ * The CIPSO draft specification can be found in the kernel's Documentation
+ * directory as well as the following URL:
+ * https://tools.ietf.org/id/draft-ietf-cipso-ipsecurity-01.txt
+ * The FIPS-188 specification can be found at the following URL:
+ * https://www.itl.nist.gov/fipspubs/fip188.htm
*
+ * Author: Paul Moore <paul.moore@hp.com>
*/
/*
- * (c) Copyright Hewlett-Packard Development Company, L.P., 2006
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
- * the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
+ * (c) Copyright Hewlett-Packard Development Company, L.P., 2006, 2008
*/
#include <linux/init.h>
@@ -38,31 +28,24 @@
#include <linux/spinlock.h>
#include <linux/string.h>
#include <linux/jhash.h>
+#include <linux/audit.h>
+#include <linux/slab.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/tcp.h>
#include <net/netlabel.h>
#include <net/cipso_ipv4.h>
-#include <asm/atomic.h>
-#include <asm/bug.h>
-
-struct cipso_v4_domhsh_entry {
- char *domain;
- u32 valid;
- struct list_head list;
- struct rcu_head rcu;
-};
+#include <linux/atomic.h>
+#include <linux/bug.h>
+#include <linux/unaligned.h>
/* List of available DOI definitions */
-/* XXX - Updates should be minimal so having a single lock for the
- * cipso_v4_doi_list and the cipso_v4_doi_list->dom_list should be
- * okay. */
/* XXX - This currently assumes a minimal number of different DOIs in use,
* if in practice there are a lot of different DOIs this list should
* probably be turned into a hash table or something similar so we
* can do quick lookups. */
static DEFINE_SPINLOCK(cipso_v4_doi_list_lock);
-static struct list_head cipso_v4_doi_list = LIST_HEAD_INIT(cipso_v4_doi_list);
+static LIST_HEAD(cipso_v4_doi_list);
/* Label mapping cache */
int cipso_v4_cache_enabled = 1;
@@ -75,6 +58,7 @@ struct cipso_v4_map_cache_bkt {
u32 size;
struct list_head list;
};
+
struct cipso_v4_map_cache_entry {
u32 hash;
unsigned char *key;
@@ -85,104 +69,56 @@ struct cipso_v4_map_cache_entry {
u32 activity;
struct list_head list;
};
-static struct cipso_v4_map_cache_bkt *cipso_v4_cache = NULL;
+
+static struct cipso_v4_map_cache_bkt *cipso_v4_cache;
/* Restricted bitmap (tag #1) flags */
-int cipso_v4_rbm_optfmt = 0;
+int cipso_v4_rbm_optfmt;
int cipso_v4_rbm_strictvalid = 1;
/*
- * Helper Functions
+ * Protocol Constants
*/
-/**
- * cipso_v4_bitmap_walk - Walk a bitmap looking for a bit
- * @bitmap: the bitmap
- * @bitmap_len: length in bits
- * @offset: starting offset
- * @state: if non-zero, look for a set (1) bit else look for a cleared (0) bit
- *
- * Description:
- * Starting at @offset, walk the bitmap from left to right until either the
- * desired bit is found or we reach the end. Return the bit offset, -1 if
- * not found, or -2 if error.
- */
-static int cipso_v4_bitmap_walk(const unsigned char *bitmap,
- u32 bitmap_len,
- u32 offset,
- u8 state)
-{
- u32 bit_spot;
- u32 byte_offset;
- unsigned char bitmask;
- unsigned char byte;
-
- /* gcc always rounds to zero when doing integer division */
- byte_offset = offset / 8;
- byte = bitmap[byte_offset];
- bit_spot = offset;
- bitmask = 0x80 >> (offset % 8);
-
- while (bit_spot < bitmap_len) {
- if ((state && (byte & bitmask) == bitmask) ||
- (state == 0 && (byte & bitmask) == 0))
- return bit_spot;
-
- bit_spot++;
- bitmask >>= 1;
- if (bitmask == 0) {
- byte = bitmap[++byte_offset];
- bitmask = 0x80;
- }
- }
+/* Maximum size of the CIPSO IP option, derived from the fact that the maximum
+ * IPv4 header size is 60 bytes and the base IPv4 header is 20 bytes long. */
+#define CIPSO_V4_OPT_LEN_MAX 40
- return -1;
-}
+/* Length of the base CIPSO option, this includes the option type (1 byte), the
+ * option length (1 byte), and the DOI (4 bytes). */
+#define CIPSO_V4_HDR_LEN 6
-/**
- * cipso_v4_bitmap_setbit - Sets a single bit in a bitmap
- * @bitmap: the bitmap
- * @bit: the bit
- * @state: if non-zero, set the bit (1) else clear the bit (0)
- *
- * Description:
- * Set a single bit in the bitmask. Returns zero on success, negative values
- * on error.
- */
-static void cipso_v4_bitmap_setbit(unsigned char *bitmap,
- u32 bit,
- u8 state)
-{
- u32 byte_spot;
- u8 bitmask;
-
- /* gcc always rounds to zero when doing integer division */
- byte_spot = bit / 8;
- bitmask = 0x80 >> (bit % 8);
- if (state)
- bitmap[byte_spot] |= bitmask;
- else
- bitmap[byte_spot] &= ~bitmask;
-}
+/* Base length of the restrictive category bitmap tag (tag #1). */
+#define CIPSO_V4_TAG_RBM_BLEN 4
-/**
- * cipso_v4_doi_domhsh_free - Frees a domain list entry
- * @entry: the entry's RCU field
+/* Base length of the enumerated category tag (tag #2). */
+#define CIPSO_V4_TAG_ENUM_BLEN 4
+
+/* Base length of the ranged categories bitmap tag (tag #5). */
+#define CIPSO_V4_TAG_RNG_BLEN 4
+/* The maximum number of category ranges permitted in the ranged category tag
+ * (tag #5). You may note that the IETF draft states that the maximum number
+ * of category ranges is 7, but if the low end of the last category range is
+ * zero then it is possible to fit 8 category ranges because the zero should
+ * be omitted. */
+#define CIPSO_V4_TAG_RNG_CAT_MAX 8
+
+/* Base length of the local tag (non-standard tag).
+ * Tag definition (may change between kernel versions)
*
- * Description:
- * This function is designed to be used as a callback to the call_rcu()
- * function so that the memory allocated to a domain list entry can be released
- * safely.
+ * 0 8 16 24 32
+ * +----------+----------+----------+----------+
+ * | 10000000 | 00000110 | 32-bit secid value |
+ * +----------+----------+----------+----------+
+ * | in (host byte order)|
+ * +----------+----------+
*
*/
-static void cipso_v4_doi_domhsh_free(struct rcu_head *entry)
-{
- struct cipso_v4_domhsh_entry *ptr;
+#define CIPSO_V4_TAG_LOC_BLEN 6
- ptr = container_of(entry, struct cipso_v4_domhsh_entry, rcu);
- kfree(ptr->domain);
- kfree(ptr);
-}
+/*
+ * Helper Functions
+ */
/**
* cipso_v4_cache_entry_free - Frees a cache entry
@@ -228,14 +164,14 @@ static u32 cipso_v4_map_cache_hash(const unsigned char *key, u32 key_len)
* success, negative values on error.
*
*/
-static int cipso_v4_cache_init(void)
+static int __init cipso_v4_cache_init(void)
{
u32 iter;
cipso_v4_cache = kcalloc(CIPSO_V4_CACHE_BUCKETS,
sizeof(struct cipso_v4_map_cache_bkt),
GFP_KERNEL);
- if (cipso_v4_cache == NULL)
+ if (!cipso_v4_cache)
return -ENOMEM;
for (iter = 0; iter < CIPSO_V4_CACHE_BUCKETS; iter++) {
@@ -251,8 +187,7 @@ static int cipso_v4_cache_init(void)
* cipso_v4_cache_invalidate - Invalidates the current CIPSO cache
*
* Description:
- * Invalidates and frees any entries in the CIPSO cache. Returns zero on
- * success and negative values on failure.
+ * Invalidates and frees any entries in the CIPSO cache.
*
*/
void cipso_v4_cache_invalidate(void)
@@ -271,8 +206,6 @@ void cipso_v4_cache_invalidate(void)
cipso_v4_cache[iter].size = 0;
spin_unlock_bh(&cipso_v4_cache[iter].lock);
}
-
- return;
}
/**
@@ -306,21 +239,22 @@ static int cipso_v4_cache_check(const unsigned char *key,
struct cipso_v4_map_cache_entry *prev_entry = NULL;
u32 hash;
- if (!cipso_v4_cache_enabled)
+ if (!READ_ONCE(cipso_v4_cache_enabled))
return -ENOENT;
hash = cipso_v4_map_cache_hash(key, key_len);
- bkt = hash & (CIPSO_V4_CACHE_BUCKETBITS - 1);
+ bkt = hash & (CIPSO_V4_CACHE_BUCKETS - 1);
spin_lock_bh(&cipso_v4_cache[bkt].lock);
list_for_each_entry(entry, &cipso_v4_cache[bkt].list, list) {
if (entry->hash == hash &&
entry->key_len == key_len &&
memcmp(entry->key, key, key_len) == 0) {
entry->activity += 1;
- atomic_inc(&entry->lsm_data->refcount);
+ refcount_inc(&entry->lsm_data->refcount);
secattr->cache = entry->lsm_data;
secattr->flags |= NETLBL_SECATTR_CACHE;
- if (prev_entry == NULL) {
+ secattr->type = NETLBL_NLTYPE_CIPSOV4;
+ if (!prev_entry) {
spin_unlock_bh(&cipso_v4_cache[bkt].lock);
return 0;
}
@@ -348,7 +282,7 @@ static int cipso_v4_cache_check(const unsigned char *key,
/**
* cipso_v4_cache_add - Add an entry to the CIPSO cache
- * @skb: the packet
+ * @cipso_ptr: pointer to CIPSO IP option
* @secattr: the packet's security attributes
*
* Description:
@@ -359,38 +293,37 @@ static int cipso_v4_cache_check(const unsigned char *key,
* negative values on failure.
*
*/
-int cipso_v4_cache_add(const struct sk_buff *skb,
+int cipso_v4_cache_add(const unsigned char *cipso_ptr,
const struct netlbl_lsm_secattr *secattr)
{
+ int bkt_size = READ_ONCE(cipso_v4_cache_bucketsize);
int ret_val = -EPERM;
u32 bkt;
struct cipso_v4_map_cache_entry *entry = NULL;
struct cipso_v4_map_cache_entry *old_entry = NULL;
- unsigned char *cipso_ptr;
u32 cipso_ptr_len;
- if (!cipso_v4_cache_enabled || cipso_v4_cache_bucketsize <= 0)
+ if (!READ_ONCE(cipso_v4_cache_enabled) || bkt_size <= 0)
return 0;
- cipso_ptr = CIPSO_V4_OPTPTR(skb);
cipso_ptr_len = cipso_ptr[1];
entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
- if (entry == NULL)
+ if (!entry)
return -ENOMEM;
entry->key = kmemdup(cipso_ptr, cipso_ptr_len, GFP_ATOMIC);
- if (entry->key == NULL) {
+ if (!entry->key) {
ret_val = -ENOMEM;
goto cache_add_failure;
}
entry->key_len = cipso_ptr_len;
entry->hash = cipso_v4_map_cache_hash(cipso_ptr, cipso_ptr_len);
- atomic_inc(&secattr->cache->refcount);
+ refcount_inc(&secattr->cache->refcount);
entry->lsm_data = secattr->cache;
- bkt = entry->hash & (CIPSO_V4_CACHE_BUCKETBITS - 1);
+ bkt = entry->hash & (CIPSO_V4_CACHE_BUCKETS - 1);
spin_lock_bh(&cipso_v4_cache[bkt].lock);
- if (cipso_v4_cache[bkt].size < cipso_v4_cache_bucketsize) {
+ if (cipso_v4_cache[bkt].size < bkt_size) {
list_add(&entry->list, &cipso_v4_cache[bkt].list);
cipso_v4_cache[bkt].size += 1;
} else {
@@ -420,7 +353,7 @@ cache_add_failure:
*
* Description:
* Search the DOI definition list for a DOI definition with a DOI value that
- * matches @doi. The caller is responsibile for calling rcu_read_[un]lock().
+ * matches @doi. The caller is responsible for calling rcu_read_[un]lock().
* Returns a pointer to the DOI definition on success and NULL on failure.
*/
static struct cipso_v4_doi *cipso_v4_doi_search(u32 doi)
@@ -428,7 +361,7 @@ static struct cipso_v4_doi *cipso_v4_doi_search(u32 doi)
struct cipso_v4_doi *iter;
list_for_each_entry_rcu(iter, &cipso_v4_doi_list, list)
- if (iter->doi == doi && iter->valid)
+ if (iter->doi == doi && refcount_read(&iter->refcount))
return iter;
return NULL;
}
@@ -436,6 +369,7 @@ static struct cipso_v4_doi *cipso_v4_doi_search(u32 doi)
/**
* cipso_v4_doi_add - Add a new DOI to the CIPSO protocol engine
* @doi_def: the DOI structure
+ * @audit_info: NetLabel audit information
*
* Description:
* The caller defines a new DOI for use by the CIPSO engine and calls this
@@ -445,108 +379,210 @@ static struct cipso_v4_doi *cipso_v4_doi_search(u32 doi)
* zero on success and non-zero on failure.
*
*/
-int cipso_v4_doi_add(struct cipso_v4_doi *doi_def)
+int cipso_v4_doi_add(struct cipso_v4_doi *doi_def,
+ struct netlbl_audit *audit_info)
{
+ int ret_val = -EINVAL;
u32 iter;
+ u32 doi;
+ u32 doi_type;
+ struct audit_buffer *audit_buf;
- if (doi_def == NULL || doi_def->doi == CIPSO_V4_DOI_UNKNOWN)
- return -EINVAL;
+ doi = doi_def->doi;
+ doi_type = doi_def->type;
+
+ if (doi_def->doi == CIPSO_V4_DOI_UNKNOWN)
+ goto doi_add_return;
for (iter = 0; iter < CIPSO_V4_TAG_MAXCNT; iter++) {
switch (doi_def->tags[iter]) {
case CIPSO_V4_TAG_RBITMAP:
break;
+ case CIPSO_V4_TAG_RANGE:
+ case CIPSO_V4_TAG_ENUM:
+ if (doi_def->type != CIPSO_V4_MAP_PASS)
+ goto doi_add_return;
+ break;
+ case CIPSO_V4_TAG_LOCAL:
+ if (doi_def->type != CIPSO_V4_MAP_LOCAL)
+ goto doi_add_return;
+ break;
case CIPSO_V4_TAG_INVALID:
if (iter == 0)
- return -EINVAL;
+ goto doi_add_return;
break;
default:
- return -EINVAL;
+ goto doi_add_return;
}
}
- doi_def->valid = 1;
- INIT_RCU_HEAD(&doi_def->rcu);
- INIT_LIST_HEAD(&doi_def->dom_list);
+ refcount_set(&doi_def->refcount, 1);
- rcu_read_lock();
- if (cipso_v4_doi_search(doi_def->doi) != NULL)
- goto doi_add_failure_rlock;
spin_lock(&cipso_v4_doi_list_lock);
- if (cipso_v4_doi_search(doi_def->doi) != NULL)
- goto doi_add_failure_slock;
+ if (cipso_v4_doi_search(doi_def->doi)) {
+ spin_unlock(&cipso_v4_doi_list_lock);
+ ret_val = -EEXIST;
+ goto doi_add_return;
+ }
list_add_tail_rcu(&doi_def->list, &cipso_v4_doi_list);
spin_unlock(&cipso_v4_doi_list_lock);
- rcu_read_unlock();
+ ret_val = 0;
+
+doi_add_return:
+ audit_buf = netlbl_audit_start(AUDIT_MAC_CIPSOV4_ADD, audit_info);
+ if (audit_buf) {
+ const char *type_str;
+ switch (doi_type) {
+ case CIPSO_V4_MAP_TRANS:
+ type_str = "trans";
+ break;
+ case CIPSO_V4_MAP_PASS:
+ type_str = "pass";
+ break;
+ case CIPSO_V4_MAP_LOCAL:
+ type_str = "local";
+ break;
+ default:
+ type_str = "(unknown)";
+ }
+ audit_log_format(audit_buf,
+ " cipso_doi=%u cipso_type=%s res=%u",
+ doi, type_str, ret_val == 0 ? 1 : 0);
+ audit_log_end(audit_buf);
+ }
- return 0;
+ return ret_val;
+}
-doi_add_failure_slock:
- spin_unlock(&cipso_v4_doi_list_lock);
-doi_add_failure_rlock:
- rcu_read_unlock();
- return -EEXIST;
+/**
+ * cipso_v4_doi_free - Frees a DOI definition
+ * @doi_def: the DOI definition
+ *
+ * Description:
+ * This function frees all of the memory associated with a DOI definition.
+ *
+ */
+void cipso_v4_doi_free(struct cipso_v4_doi *doi_def)
+{
+ if (!doi_def)
+ return;
+
+ switch (doi_def->type) {
+ case CIPSO_V4_MAP_TRANS:
+ kfree(doi_def->map.std->lvl.cipso);
+ kfree(doi_def->map.std->lvl.local);
+ kfree(doi_def->map.std->cat.cipso);
+ kfree(doi_def->map.std->cat.local);
+ kfree(doi_def->map.std);
+ break;
+ }
+ kfree(doi_def);
+}
+
+/**
+ * cipso_v4_doi_free_rcu - Frees a DOI definition via the RCU pointer
+ * @entry: the entry's RCU field
+ *
+ * Description:
+ * This function is designed to be used as a callback to the call_rcu()
+ * function so that the memory allocated to the DOI definition can be released
+ * safely.
+ *
+ */
+static void cipso_v4_doi_free_rcu(struct rcu_head *entry)
+{
+ struct cipso_v4_doi *doi_def;
+
+ doi_def = container_of(entry, struct cipso_v4_doi, rcu);
+ cipso_v4_doi_free(doi_def);
}
/**
* cipso_v4_doi_remove - Remove an existing DOI from the CIPSO protocol engine
* @doi: the DOI value
- * @audit_secid: the LSM secid to use in the audit message
- * @callback: the DOI cleanup/free callback
+ * @audit_info: NetLabel audit information
*
* Description:
- * Removes a DOI definition from the CIPSO engine, @callback is called to
- * free any memory. The NetLabel routines will be called to release their own
- * LSM domain mappings as well as our own domain list. Returns zero on
- * success and negative values on failure.
+ * Removes a DOI definition from the CIPSO engine. The NetLabel routines will
+ * be called to release their own LSM domain mappings as well as our own
+ * domain list. Returns zero on success and negative values on failure.
*
*/
-int cipso_v4_doi_remove(u32 doi,
- struct netlbl_audit *audit_info,
- void (*callback) (struct rcu_head * head))
+int cipso_v4_doi_remove(u32 doi, struct netlbl_audit *audit_info)
{
+ int ret_val;
struct cipso_v4_doi *doi_def;
- struct cipso_v4_domhsh_entry *dom_iter;
+ struct audit_buffer *audit_buf;
- rcu_read_lock();
- if (cipso_v4_doi_search(doi) != NULL) {
- spin_lock(&cipso_v4_doi_list_lock);
- doi_def = cipso_v4_doi_search(doi);
- if (doi_def == NULL) {
- spin_unlock(&cipso_v4_doi_list_lock);
- rcu_read_unlock();
- return -ENOENT;
- }
- doi_def->valid = 0;
- list_del_rcu(&doi_def->list);
+ spin_lock(&cipso_v4_doi_list_lock);
+ doi_def = cipso_v4_doi_search(doi);
+ if (!doi_def) {
spin_unlock(&cipso_v4_doi_list_lock);
- list_for_each_entry_rcu(dom_iter, &doi_def->dom_list, list)
- if (dom_iter->valid)
- netlbl_domhsh_remove(dom_iter->domain,
- audit_info);
- cipso_v4_cache_invalidate();
- rcu_read_unlock();
-
- call_rcu(&doi_def->rcu, callback);
- return 0;
+ ret_val = -ENOENT;
+ goto doi_remove_return;
}
- rcu_read_unlock();
+ list_del_rcu(&doi_def->list);
+ spin_unlock(&cipso_v4_doi_list_lock);
- return -ENOENT;
+ cipso_v4_doi_putdef(doi_def);
+ ret_val = 0;
+
+doi_remove_return:
+ audit_buf = netlbl_audit_start(AUDIT_MAC_CIPSOV4_DEL, audit_info);
+ if (audit_buf) {
+ audit_log_format(audit_buf,
+ " cipso_doi=%u res=%u",
+ doi, ret_val == 0 ? 1 : 0);
+ audit_log_end(audit_buf);
+ }
+
+ return ret_val;
}
/**
- * cipso_v4_doi_getdef - Returns a pointer to a valid DOI definition
+ * cipso_v4_doi_getdef - Returns a reference to a valid DOI definition
* @doi: the DOI value
*
* Description:
* Searches for a valid DOI definition and if one is found it is returned to
* the caller. Otherwise NULL is returned. The caller must ensure that
- * rcu_read_lock() is held while accessing the returned definition.
+ * rcu_read_lock() is held while accessing the returned definition and the DOI
+ * definition reference count is decremented when the caller is done.
*
*/
struct cipso_v4_doi *cipso_v4_doi_getdef(u32 doi)
{
- return cipso_v4_doi_search(doi);
+ struct cipso_v4_doi *doi_def;
+
+ rcu_read_lock();
+ doi_def = cipso_v4_doi_search(doi);
+ if (!doi_def)
+ goto doi_getdef_return;
+ if (!refcount_inc_not_zero(&doi_def->refcount))
+ doi_def = NULL;
+
+doi_getdef_return:
+ rcu_read_unlock();
+ return doi_def;
+}
+
+/**
+ * cipso_v4_doi_putdef - Releases a reference for the given DOI definition
+ * @doi_def: the DOI definition
+ *
+ * Description:
+ * Releases a DOI definition reference obtained from cipso_v4_doi_getdef().
+ *
+ */
+void cipso_v4_doi_putdef(struct cipso_v4_doi *doi_def)
+{
+ if (!doi_def)
+ return;
+
+ if (!refcount_dec_and_test(&doi_def->refcount))
+ return;
+
+ cipso_v4_cache_invalidate();
+ call_rcu(&doi_def->rcu, cipso_v4_doi_free_rcu);
}
/**
@@ -572,7 +608,7 @@ int cipso_v4_doi_walk(u32 *skip_cnt,
rcu_read_lock();
list_for_each_entry_rcu(iter_doi, &cipso_v4_doi_list, list)
- if (iter_doi->valid) {
+ if (refcount_read(&iter_doi->refcount) > 0) {
if (doi_cnt++ < *skip_cnt)
continue;
ret_val = callback(iter_doi, cb_arg);
@@ -588,92 +624,6 @@ doi_walk_return:
return ret_val;
}
-/**
- * cipso_v4_doi_domhsh_add - Adds a domain entry to a DOI definition
- * @doi_def: the DOI definition
- * @domain: the domain to add
- *
- * Description:
- * Adds the @domain to the the DOI specified by @doi_def, this function
- * should only be called by external functions (i.e. NetLabel). This function
- * does allocate memory. Returns zero on success, negative values on failure.
- *
- */
-int cipso_v4_doi_domhsh_add(struct cipso_v4_doi *doi_def, const char *domain)
-{
- struct cipso_v4_domhsh_entry *iter;
- struct cipso_v4_domhsh_entry *new_dom;
-
- new_dom = kzalloc(sizeof(*new_dom), GFP_KERNEL);
- if (new_dom == NULL)
- return -ENOMEM;
- if (domain) {
- new_dom->domain = kstrdup(domain, GFP_KERNEL);
- if (new_dom->domain == NULL) {
- kfree(new_dom);
- return -ENOMEM;
- }
- }
- new_dom->valid = 1;
- INIT_RCU_HEAD(&new_dom->rcu);
-
- rcu_read_lock();
- spin_lock(&cipso_v4_doi_list_lock);
- list_for_each_entry_rcu(iter, &doi_def->dom_list, list)
- if (iter->valid &&
- ((domain != NULL && iter->domain != NULL &&
- strcmp(iter->domain, domain) == 0) ||
- (domain == NULL && iter->domain == NULL))) {
- spin_unlock(&cipso_v4_doi_list_lock);
- rcu_read_unlock();
- kfree(new_dom->domain);
- kfree(new_dom);
- return -EEXIST;
- }
- list_add_tail_rcu(&new_dom->list, &doi_def->dom_list);
- spin_unlock(&cipso_v4_doi_list_lock);
- rcu_read_unlock();
-
- return 0;
-}
-
-/**
- * cipso_v4_doi_domhsh_remove - Removes a domain entry from a DOI definition
- * @doi_def: the DOI definition
- * @domain: the domain to remove
- *
- * Description:
- * Removes the @domain from the DOI specified by @doi_def, this function
- * should only be called by external functions (i.e. NetLabel). Returns zero
- * on success and negative values on error.
- *
- */
-int cipso_v4_doi_domhsh_remove(struct cipso_v4_doi *doi_def,
- const char *domain)
-{
- struct cipso_v4_domhsh_entry *iter;
-
- rcu_read_lock();
- spin_lock(&cipso_v4_doi_list_lock);
- list_for_each_entry_rcu(iter, &doi_def->dom_list, list)
- if (iter->valid &&
- ((domain != NULL && iter->domain != NULL &&
- strcmp(iter->domain, domain) == 0) ||
- (domain == NULL && iter->domain == NULL))) {
- iter->valid = 0;
- list_del_rcu(&iter->list);
- spin_unlock(&cipso_v4_doi_list_lock);
- rcu_read_unlock();
- call_rcu(&iter->rcu, cipso_v4_doi_domhsh_free);
-
- return 0;
- }
- spin_unlock(&cipso_v4_doi_list_lock);
- rcu_read_unlock();
-
- return -ENOENT;
-}
-
/*
* Label Mapping Functions
*/
@@ -694,8 +644,9 @@ static int cipso_v4_map_lvl_valid(const struct cipso_v4_doi *doi_def, u8 level)
switch (doi_def->type) {
case CIPSO_V4_MAP_PASS:
return 0;
- case CIPSO_V4_MAP_STD:
- if (doi_def->map.std->lvl.cipso[level] < CIPSO_V4_INV_LVL)
+ case CIPSO_V4_MAP_TRANS:
+ if ((level < doi_def->map.std->lvl.cipso_size) &&
+ (doi_def->map.std->lvl.cipso[level] < CIPSO_V4_INV_LVL))
return 0;
break;
}
@@ -723,12 +674,13 @@ static int cipso_v4_map_lvl_hton(const struct cipso_v4_doi *doi_def,
case CIPSO_V4_MAP_PASS:
*net_lvl = host_lvl;
return 0;
- case CIPSO_V4_MAP_STD:
- if (host_lvl < doi_def->map.std->lvl.local_size) {
+ case CIPSO_V4_MAP_TRANS:
+ if (host_lvl < doi_def->map.std->lvl.local_size &&
+ doi_def->map.std->lvl.local[host_lvl] < CIPSO_V4_INV_LVL) {
*net_lvl = doi_def->map.std->lvl.local[host_lvl];
return 0;
}
- break;
+ return -EPERM;
}
return -EINVAL;
@@ -756,14 +708,14 @@ static int cipso_v4_map_lvl_ntoh(const struct cipso_v4_doi *doi_def,
case CIPSO_V4_MAP_PASS:
*host_lvl = net_lvl;
return 0;
- case CIPSO_V4_MAP_STD:
+ case CIPSO_V4_MAP_TRANS:
map_tbl = doi_def->map.std;
if (net_lvl < map_tbl->lvl.cipso_size &&
map_tbl->lvl.cipso[net_lvl] < CIPSO_V4_INV_LVL) {
*host_lvl = doi_def->map.std->lvl.cipso[net_lvl];
return 0;
}
- break;
+ return -EPERM;
}
return -EINVAL;
@@ -793,14 +745,14 @@ static int cipso_v4_map_cat_rbm_valid(const struct cipso_v4_doi *doi_def,
switch (doi_def->type) {
case CIPSO_V4_MAP_PASS:
return 0;
- case CIPSO_V4_MAP_STD:
+ case CIPSO_V4_MAP_TRANS:
cipso_cat_size = doi_def->map.std->cat.cipso_size;
cipso_array = doi_def->map.std->cat.cipso;
for (;;) {
- cat = cipso_v4_bitmap_walk(bitmap,
- bitmap_len_bits,
- cat + 1,
- 1);
+ cat = netlbl_bitmap_walk(bitmap,
+ bitmap_len_bits,
+ cat + 1,
+ 1);
if (cat < 0)
break;
if (cat >= cipso_cat_size ||
@@ -819,8 +771,7 @@ static int cipso_v4_map_cat_rbm_valid(const struct cipso_v4_doi *doi_def,
/**
* cipso_v4_map_cat_rbm_hton - Perform a category mapping from host to network
* @doi_def: the DOI definition
- * @host_cat: the category bitmap in host format
- * @host_cat_len: the length of the host's category bitmap in bytes
+ * @secattr: the security attributes
* @net_cat: the zero'd out category bitmap in network/CIPSO format
* @net_cat_len: the length of the CIPSO bitmap in bytes
*
@@ -831,61 +782,51 @@ static int cipso_v4_map_cat_rbm_valid(const struct cipso_v4_doi *doi_def,
*
*/
static int cipso_v4_map_cat_rbm_hton(const struct cipso_v4_doi *doi_def,
- const unsigned char *host_cat,
- u32 host_cat_len,
+ const struct netlbl_lsm_secattr *secattr,
unsigned char *net_cat,
u32 net_cat_len)
{
int host_spot = -1;
- u32 net_spot;
+ u32 net_spot = CIPSO_V4_INV_CAT;
u32 net_spot_max = 0;
- u32 host_clen_bits = host_cat_len * 8;
u32 net_clen_bits = net_cat_len * 8;
- u32 host_cat_size;
- u32 *host_cat_array;
+ u32 host_cat_size = 0;
+ u32 *host_cat_array = NULL;
- switch (doi_def->type) {
- case CIPSO_V4_MAP_PASS:
- net_spot_max = host_cat_len;
- while (net_spot_max > 0 && host_cat[net_spot_max - 1] == 0)
- net_spot_max--;
- if (net_spot_max > net_cat_len)
- return -EINVAL;
- memcpy(net_cat, host_cat, net_spot_max);
- return net_spot_max;
- case CIPSO_V4_MAP_STD:
+ if (doi_def->type == CIPSO_V4_MAP_TRANS) {
host_cat_size = doi_def->map.std->cat.local_size;
host_cat_array = doi_def->map.std->cat.local;
- for (;;) {
- host_spot = cipso_v4_bitmap_walk(host_cat,
- host_clen_bits,
- host_spot + 1,
- 1);
- if (host_spot < 0)
- break;
+ }
+
+ for (;;) {
+ host_spot = netlbl_catmap_walk(secattr->attr.mls.cat,
+ host_spot + 1);
+ if (host_spot < 0)
+ break;
+
+ switch (doi_def->type) {
+ case CIPSO_V4_MAP_PASS:
+ net_spot = host_spot;
+ break;
+ case CIPSO_V4_MAP_TRANS:
if (host_spot >= host_cat_size)
return -EPERM;
-
net_spot = host_cat_array[host_spot];
if (net_spot >= CIPSO_V4_INV_CAT)
return -EPERM;
- if (net_spot >= net_clen_bits)
- return -ENOSPC;
- cipso_v4_bitmap_setbit(net_cat, net_spot, 1);
-
- if (net_spot > net_spot_max)
- net_spot_max = net_spot;
+ break;
}
+ if (net_spot >= net_clen_bits)
+ return -ENOSPC;
+ netlbl_bitmap_setbit(net_cat, net_spot, 1);
- if (host_spot == -2)
- return -EFAULT;
-
- if (++net_spot_max % 8)
- return net_spot_max / 8 + 1;
- return net_spot_max / 8;
+ if (net_spot > net_spot_max)
+ net_spot_max = net_spot;
}
- return -EINVAL;
+ if (++net_spot_max % 8)
+ return net_spot_max / 8 + 1;
+ return net_spot_max / 8;
}
/**
@@ -893,78 +834,307 @@ static int cipso_v4_map_cat_rbm_hton(const struct cipso_v4_doi *doi_def,
* @doi_def: the DOI definition
* @net_cat: the category bitmap in network/CIPSO format
* @net_cat_len: the length of the CIPSO bitmap in bytes
- * @host_cat: the zero'd out category bitmap in host format
- * @host_cat_len: the length of the host's category bitmap in bytes
+ * @secattr: the security attributes
*
* Description:
* Perform a label mapping to translate a CIPSO bitmap to the correct local
- * MLS category bitmap using the given DOI definition. Returns the minimum
- * size in bytes of the host bitmap on success, negative values otherwise.
+ * MLS category bitmap using the given DOI definition. Returns zero on
+ * success, negative values on failure.
*
*/
static int cipso_v4_map_cat_rbm_ntoh(const struct cipso_v4_doi *doi_def,
const unsigned char *net_cat,
u32 net_cat_len,
- unsigned char *host_cat,
- u32 host_cat_len)
+ struct netlbl_lsm_secattr *secattr)
{
- u32 host_spot;
- u32 host_spot_max = 0;
+ int ret_val;
int net_spot = -1;
+ u32 host_spot = CIPSO_V4_INV_CAT;
u32 net_clen_bits = net_cat_len * 8;
- u32 host_clen_bits = host_cat_len * 8;
- u32 net_cat_size;
- u32 *net_cat_array;
+ u32 net_cat_size = 0;
+ u32 *net_cat_array = NULL;
- switch (doi_def->type) {
- case CIPSO_V4_MAP_PASS:
- if (net_cat_len > host_cat_len)
- return -EINVAL;
- memcpy(host_cat, net_cat, net_cat_len);
- return net_cat_len;
- case CIPSO_V4_MAP_STD:
+ if (doi_def->type == CIPSO_V4_MAP_TRANS) {
net_cat_size = doi_def->map.std->cat.cipso_size;
net_cat_array = doi_def->map.std->cat.cipso;
- for (;;) {
- net_spot = cipso_v4_bitmap_walk(net_cat,
- net_clen_bits,
- net_spot + 1,
- 1);
- if (net_spot < 0)
- break;
- if (net_spot >= net_cat_size ||
- net_cat_array[net_spot] >= CIPSO_V4_INV_CAT)
- return -EPERM;
+ }
+
+ for (;;) {
+ net_spot = netlbl_bitmap_walk(net_cat,
+ net_clen_bits,
+ net_spot + 1,
+ 1);
+ if (net_spot < 0)
+ return 0;
+ switch (doi_def->type) {
+ case CIPSO_V4_MAP_PASS:
+ host_spot = net_spot;
+ break;
+ case CIPSO_V4_MAP_TRANS:
+ if (net_spot >= net_cat_size)
+ return -EPERM;
host_spot = net_cat_array[net_spot];
if (host_spot >= CIPSO_V4_INV_CAT)
return -EPERM;
- if (host_spot >= host_clen_bits)
- return -ENOSPC;
- cipso_v4_bitmap_setbit(host_cat, host_spot, 1);
-
- if (host_spot > host_spot_max)
- host_spot_max = host_spot;
+ break;
}
+ ret_val = netlbl_catmap_setbit(&secattr->attr.mls.cat,
+ host_spot,
+ GFP_ATOMIC);
+ if (ret_val != 0)
+ return ret_val;
+ }
+
+ return -EINVAL;
+}
- if (net_spot == -2)
+/**
+ * cipso_v4_map_cat_enum_valid - Checks to see if the categories are valid
+ * @doi_def: the DOI definition
+ * @enumcat: category list
+ * @enumcat_len: length of the category list in bytes
+ *
+ * Description:
+ * Checks the given categories against the given DOI definition and returns a
+ * negative value if any of the categories do not have a valid mapping and a
+ * zero value if all of the categories are valid.
+ *
+ */
+static int cipso_v4_map_cat_enum_valid(const struct cipso_v4_doi *doi_def,
+ const unsigned char *enumcat,
+ u32 enumcat_len)
+{
+ u16 cat;
+ int cat_prev = -1;
+ u32 iter;
+
+ if (doi_def->type != CIPSO_V4_MAP_PASS || enumcat_len & 0x01)
+ return -EFAULT;
+
+ for (iter = 0; iter < enumcat_len; iter += 2) {
+ cat = get_unaligned_be16(&enumcat[iter]);
+ if (cat <= cat_prev)
return -EFAULT;
+ cat_prev = cat;
+ }
- if (++host_spot_max % 8)
- return host_spot_max / 8 + 1;
- return host_spot_max / 8;
+ return 0;
+}
+
+/**
+ * cipso_v4_map_cat_enum_hton - Perform a category mapping from host to network
+ * @doi_def: the DOI definition
+ * @secattr: the security attributes
+ * @net_cat: the zero'd out category list in network/CIPSO format
+ * @net_cat_len: the length of the CIPSO category list in bytes
+ *
+ * Description:
+ * Perform a label mapping to translate a local MLS category bitmap to the
+ * correct CIPSO category list using the given DOI definition. Returns the
+ * size in bytes of the network category bitmap on success, negative values
+ * otherwise.
+ *
+ */
+static int cipso_v4_map_cat_enum_hton(const struct cipso_v4_doi *doi_def,
+ const struct netlbl_lsm_secattr *secattr,
+ unsigned char *net_cat,
+ u32 net_cat_len)
+{
+ int cat = -1;
+ u32 cat_iter = 0;
+
+ for (;;) {
+ cat = netlbl_catmap_walk(secattr->attr.mls.cat, cat + 1);
+ if (cat < 0)
+ break;
+ if ((cat_iter + 2) > net_cat_len)
+ return -ENOSPC;
+
+ *((__be16 *)&net_cat[cat_iter]) = htons(cat);
+ cat_iter += 2;
}
- return -EINVAL;
+ return cat_iter;
+}
+
+/**
+ * cipso_v4_map_cat_enum_ntoh - Perform a category mapping from network to host
+ * @doi_def: the DOI definition
+ * @net_cat: the category list in network/CIPSO format
+ * @net_cat_len: the length of the CIPSO bitmap in bytes
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Perform a label mapping to translate a CIPSO category list to the correct
+ * local MLS category bitmap using the given DOI definition. Returns zero on
+ * success, negative values on failure.
+ *
+ */
+static int cipso_v4_map_cat_enum_ntoh(const struct cipso_v4_doi *doi_def,
+ const unsigned char *net_cat,
+ u32 net_cat_len,
+ struct netlbl_lsm_secattr *secattr)
+{
+ int ret_val;
+ u32 iter;
+
+ for (iter = 0; iter < net_cat_len; iter += 2) {
+ ret_val = netlbl_catmap_setbit(&secattr->attr.mls.cat,
+ get_unaligned_be16(&net_cat[iter]),
+ GFP_ATOMIC);
+ if (ret_val != 0)
+ return ret_val;
+ }
+
+ return 0;
+}
+
+/**
+ * cipso_v4_map_cat_rng_valid - Checks to see if the categories are valid
+ * @doi_def: the DOI definition
+ * @rngcat: category list
+ * @rngcat_len: length of the category list in bytes
+ *
+ * Description:
+ * Checks the given categories against the given DOI definition and returns a
+ * negative value if any of the categories do not have a valid mapping and a
+ * zero value if all of the categories are valid.
+ *
+ */
+static int cipso_v4_map_cat_rng_valid(const struct cipso_v4_doi *doi_def,
+ const unsigned char *rngcat,
+ u32 rngcat_len)
+{
+ u16 cat_high;
+ u16 cat_low;
+ u32 cat_prev = CIPSO_V4_MAX_REM_CATS + 1;
+ u32 iter;
+
+ if (doi_def->type != CIPSO_V4_MAP_PASS || rngcat_len & 0x01)
+ return -EFAULT;
+
+ for (iter = 0; iter < rngcat_len; iter += 4) {
+ cat_high = get_unaligned_be16(&rngcat[iter]);
+ if ((iter + 4) <= rngcat_len)
+ cat_low = get_unaligned_be16(&rngcat[iter + 2]);
+ else
+ cat_low = 0;
+
+ if (cat_high > cat_prev)
+ return -EFAULT;
+
+ cat_prev = cat_low;
+ }
+
+ return 0;
+}
+
+/**
+ * cipso_v4_map_cat_rng_hton - Perform a category mapping from host to network
+ * @doi_def: the DOI definition
+ * @secattr: the security attributes
+ * @net_cat: the zero'd out category list in network/CIPSO format
+ * @net_cat_len: the length of the CIPSO category list in bytes
+ *
+ * Description:
+ * Perform a label mapping to translate a local MLS category bitmap to the
+ * correct CIPSO category list using the given DOI definition. Returns the
+ * size in bytes of the network category bitmap on success, negative values
+ * otherwise.
+ *
+ */
+static int cipso_v4_map_cat_rng_hton(const struct cipso_v4_doi *doi_def,
+ const struct netlbl_lsm_secattr *secattr,
+ unsigned char *net_cat,
+ u32 net_cat_len)
+{
+ int iter = -1;
+ u16 array[CIPSO_V4_TAG_RNG_CAT_MAX * 2];
+ u32 array_cnt = 0;
+ u32 cat_size = 0;
+
+ /* make sure we don't overflow the 'array[]' variable */
+ if (net_cat_len >
+ (CIPSO_V4_OPT_LEN_MAX - CIPSO_V4_HDR_LEN - CIPSO_V4_TAG_RNG_BLEN))
+ return -ENOSPC;
+
+ for (;;) {
+ iter = netlbl_catmap_walk(secattr->attr.mls.cat, iter + 1);
+ if (iter < 0)
+ break;
+ cat_size += (iter == 0 ? 0 : sizeof(u16));
+ if (cat_size > net_cat_len)
+ return -ENOSPC;
+ array[array_cnt++] = iter;
+
+ iter = netlbl_catmap_walkrng(secattr->attr.mls.cat, iter);
+ if (iter < 0)
+ return -EFAULT;
+ cat_size += sizeof(u16);
+ if (cat_size > net_cat_len)
+ return -ENOSPC;
+ array[array_cnt++] = iter;
+ }
+
+ for (iter = 0; array_cnt > 0;) {
+ *((__be16 *)&net_cat[iter]) = htons(array[--array_cnt]);
+ iter += 2;
+ array_cnt--;
+ if (array[array_cnt] != 0) {
+ *((__be16 *)&net_cat[iter]) = htons(array[array_cnt]);
+ iter += 2;
+ }
+ }
+
+ return cat_size;
+}
+
+/**
+ * cipso_v4_map_cat_rng_ntoh - Perform a category mapping from network to host
+ * @doi_def: the DOI definition
+ * @net_cat: the category list in network/CIPSO format
+ * @net_cat_len: the length of the CIPSO bitmap in bytes
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Perform a label mapping to translate a CIPSO category list to the correct
+ * local MLS category bitmap using the given DOI definition. Returns zero on
+ * success, negative values on failure.
+ *
+ */
+static int cipso_v4_map_cat_rng_ntoh(const struct cipso_v4_doi *doi_def,
+ const unsigned char *net_cat,
+ u32 net_cat_len,
+ struct netlbl_lsm_secattr *secattr)
+{
+ int ret_val;
+ u32 net_iter;
+ u16 cat_low;
+ u16 cat_high;
+
+ for (net_iter = 0; net_iter < net_cat_len; net_iter += 4) {
+ cat_high = get_unaligned_be16(&net_cat[net_iter]);
+ if ((net_iter + 4) <= net_cat_len)
+ cat_low = get_unaligned_be16(&net_cat[net_iter + 2]);
+ else
+ cat_low = 0;
+
+ ret_val = netlbl_catmap_setrng(&secattr->attr.mls.cat,
+ cat_low,
+ cat_high,
+ GFP_ATOMIC);
+ if (ret_val != 0)
+ return ret_val;
+ }
+
+ return 0;
}
/*
* Protocol Handling Functions
*/
-#define CIPSO_V4_OPT_LEN_MAX 40
-#define CIPSO_V4_HDR_LEN 6
-
/**
* cipso_v4_gentag_hdr - Generate a CIPSO option header
* @doi_def: the DOI definition
@@ -981,7 +1151,7 @@ static void cipso_v4_gentag_hdr(const struct cipso_v4_doi *doi_def,
{
buf[0] = IPOPT_CIPSO;
buf[1] = CIPSO_V4_HDR_LEN + len;
- *(__be32 *)&buf[2] = htonl(doi_def->doi);
+ put_unaligned_be32(doi_def->doi, &buf[2]);
}
/**
@@ -1010,30 +1180,32 @@ static int cipso_v4_gentag_rbm(const struct cipso_v4_doi *doi_def,
if ((secattr->flags & NETLBL_SECATTR_MLS_LVL) == 0)
return -EPERM;
- ret_val = cipso_v4_map_lvl_hton(doi_def, secattr->mls_lvl, &level);
+ ret_val = cipso_v4_map_lvl_hton(doi_def,
+ secattr->attr.mls.lvl,
+ &level);
if (ret_val != 0)
return ret_val;
if (secattr->flags & NETLBL_SECATTR_MLS_CAT) {
ret_val = cipso_v4_map_cat_rbm_hton(doi_def,
- secattr->mls_cat,
- secattr->mls_cat_len,
+ secattr,
&buffer[4],
buffer_len - 4);
if (ret_val < 0)
return ret_val;
/* This will send packets using the "optimized" format when
- * possibile as specified in section 3.4.2.6 of the
+ * possible as specified in section 3.4.2.6 of the
* CIPSO draft. */
- if (cipso_v4_rbm_optfmt && ret_val > 0 && ret_val <= 10)
+ if (READ_ONCE(cipso_v4_rbm_optfmt) && ret_val > 0 &&
+ ret_val <= 10)
tag_len = 14;
else
tag_len = 4 + ret_val;
} else
tag_len = 4;
- buffer[0] = 0x01;
+ buffer[0] = CIPSO_V4_TAG_RBITMAP;
buffer[1] = tag_len;
buffer[3] = level;
@@ -1063,42 +1235,297 @@ static int cipso_v4_parsetag_rbm(const struct cipso_v4_doi *doi_def,
ret_val = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level);
if (ret_val != 0)
return ret_val;
- secattr->mls_lvl = level;
+ secattr->attr.mls.lvl = level;
secattr->flags |= NETLBL_SECATTR_MLS_LVL;
if (tag_len > 4) {
- switch (doi_def->type) {
- case CIPSO_V4_MAP_PASS:
- secattr->mls_cat_len = tag_len - 4;
- break;
- case CIPSO_V4_MAP_STD:
- secattr->mls_cat_len =
- doi_def->map.std->cat.local_size;
- break;
- }
- secattr->mls_cat = kzalloc(secattr->mls_cat_len, GFP_ATOMIC);
- if (secattr->mls_cat == NULL)
- return -ENOMEM;
-
ret_val = cipso_v4_map_cat_rbm_ntoh(doi_def,
&tag[4],
tag_len - 4,
- secattr->mls_cat,
- secattr->mls_cat_len);
- if (ret_val < 0) {
- kfree(secattr->mls_cat);
+ secattr);
+ if (ret_val != 0) {
+ netlbl_catmap_free(secattr->attr.mls.cat);
return ret_val;
- } else if (ret_val > 0) {
- secattr->mls_cat_len = ret_val;
+ }
+
+ if (secattr->attr.mls.cat)
secattr->flags |= NETLBL_SECATTR_MLS_CAT;
+ }
+
+ return 0;
+}
+
+/**
+ * cipso_v4_gentag_enum - Generate a CIPSO enumerated tag (type #2)
+ * @doi_def: the DOI definition
+ * @secattr: the security attributes
+ * @buffer: the option buffer
+ * @buffer_len: length of buffer in bytes
+ *
+ * Description:
+ * Generate a CIPSO option using the enumerated tag, tag type #2. Returns the
+ * size of the tag on success, negative values on failure.
+ *
+ */
+static int cipso_v4_gentag_enum(const struct cipso_v4_doi *doi_def,
+ const struct netlbl_lsm_secattr *secattr,
+ unsigned char *buffer,
+ u32 buffer_len)
+{
+ int ret_val;
+ u32 tag_len;
+ u32 level;
+
+ if (!(secattr->flags & NETLBL_SECATTR_MLS_LVL))
+ return -EPERM;
+
+ ret_val = cipso_v4_map_lvl_hton(doi_def,
+ secattr->attr.mls.lvl,
+ &level);
+ if (ret_val != 0)
+ return ret_val;
+
+ if (secattr->flags & NETLBL_SECATTR_MLS_CAT) {
+ ret_val = cipso_v4_map_cat_enum_hton(doi_def,
+ secattr,
+ &buffer[4],
+ buffer_len - 4);
+ if (ret_val < 0)
+ return ret_val;
+
+ tag_len = 4 + ret_val;
+ } else
+ tag_len = 4;
+
+ buffer[0] = CIPSO_V4_TAG_ENUM;
+ buffer[1] = tag_len;
+ buffer[3] = level;
+
+ return tag_len;
+}
+
+/**
+ * cipso_v4_parsetag_enum - Parse a CIPSO enumerated tag
+ * @doi_def: the DOI definition
+ * @tag: the CIPSO tag
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Parse a CIPSO enumerated tag (tag type #2) and return the security
+ * attributes in @secattr. Return zero on success, negatives values on
+ * failure.
+ *
+ */
+static int cipso_v4_parsetag_enum(const struct cipso_v4_doi *doi_def,
+ const unsigned char *tag,
+ struct netlbl_lsm_secattr *secattr)
+{
+ int ret_val;
+ u8 tag_len = tag[1];
+ u32 level;
+
+ ret_val = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level);
+ if (ret_val != 0)
+ return ret_val;
+ secattr->attr.mls.lvl = level;
+ secattr->flags |= NETLBL_SECATTR_MLS_LVL;
+
+ if (tag_len > 4) {
+ ret_val = cipso_v4_map_cat_enum_ntoh(doi_def,
+ &tag[4],
+ tag_len - 4,
+ secattr);
+ if (ret_val != 0) {
+ netlbl_catmap_free(secattr->attr.mls.cat);
+ return ret_val;
+ }
+
+ secattr->flags |= NETLBL_SECATTR_MLS_CAT;
+ }
+
+ return 0;
+}
+
+/**
+ * cipso_v4_gentag_rng - Generate a CIPSO ranged tag (type #5)
+ * @doi_def: the DOI definition
+ * @secattr: the security attributes
+ * @buffer: the option buffer
+ * @buffer_len: length of buffer in bytes
+ *
+ * Description:
+ * Generate a CIPSO option using the ranged tag, tag type #5. Returns the
+ * size of the tag on success, negative values on failure.
+ *
+ */
+static int cipso_v4_gentag_rng(const struct cipso_v4_doi *doi_def,
+ const struct netlbl_lsm_secattr *secattr,
+ unsigned char *buffer,
+ u32 buffer_len)
+{
+ int ret_val;
+ u32 tag_len;
+ u32 level;
+
+ if (!(secattr->flags & NETLBL_SECATTR_MLS_LVL))
+ return -EPERM;
+
+ ret_val = cipso_v4_map_lvl_hton(doi_def,
+ secattr->attr.mls.lvl,
+ &level);
+ if (ret_val != 0)
+ return ret_val;
+
+ if (secattr->flags & NETLBL_SECATTR_MLS_CAT) {
+ ret_val = cipso_v4_map_cat_rng_hton(doi_def,
+ secattr,
+ &buffer[4],
+ buffer_len - 4);
+ if (ret_val < 0)
+ return ret_val;
+
+ tag_len = 4 + ret_val;
+ } else
+ tag_len = 4;
+
+ buffer[0] = CIPSO_V4_TAG_RANGE;
+ buffer[1] = tag_len;
+ buffer[3] = level;
+
+ return tag_len;
+}
+
+/**
+ * cipso_v4_parsetag_rng - Parse a CIPSO ranged tag
+ * @doi_def: the DOI definition
+ * @tag: the CIPSO tag
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Parse a CIPSO ranged tag (tag type #5) and return the security attributes
+ * in @secattr. Return zero on success, negatives values on failure.
+ *
+ */
+static int cipso_v4_parsetag_rng(const struct cipso_v4_doi *doi_def,
+ const unsigned char *tag,
+ struct netlbl_lsm_secattr *secattr)
+{
+ int ret_val;
+ u8 tag_len = tag[1];
+ u32 level;
+
+ ret_val = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level);
+ if (ret_val != 0)
+ return ret_val;
+ secattr->attr.mls.lvl = level;
+ secattr->flags |= NETLBL_SECATTR_MLS_LVL;
+
+ if (tag_len > 4) {
+ ret_val = cipso_v4_map_cat_rng_ntoh(doi_def,
+ &tag[4],
+ tag_len - 4,
+ secattr);
+ if (ret_val != 0) {
+ netlbl_catmap_free(secattr->attr.mls.cat);
+ return ret_val;
}
+
+ if (secattr->attr.mls.cat)
+ secattr->flags |= NETLBL_SECATTR_MLS_CAT;
}
return 0;
}
/**
+ * cipso_v4_gentag_loc - Generate a CIPSO local tag (non-standard)
+ * @doi_def: the DOI definition
+ * @secattr: the security attributes
+ * @buffer: the option buffer
+ * @buffer_len: length of buffer in bytes
+ *
+ * Description:
+ * Generate a CIPSO option using the local tag. Returns the size of the tag
+ * on success, negative values on failure.
+ *
+ */
+static int cipso_v4_gentag_loc(const struct cipso_v4_doi *doi_def,
+ const struct netlbl_lsm_secattr *secattr,
+ unsigned char *buffer,
+ u32 buffer_len)
+{
+ if (!(secattr->flags & NETLBL_SECATTR_SECID))
+ return -EPERM;
+
+ buffer[0] = CIPSO_V4_TAG_LOCAL;
+ buffer[1] = CIPSO_V4_TAG_LOC_BLEN;
+ *(u32 *)&buffer[2] = secattr->attr.secid;
+
+ return CIPSO_V4_TAG_LOC_BLEN;
+}
+
+/**
+ * cipso_v4_parsetag_loc - Parse a CIPSO local tag
+ * @doi_def: the DOI definition
+ * @tag: the CIPSO tag
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Parse a CIPSO local tag and return the security attributes in @secattr.
+ * Return zero on success, negatives values on failure.
+ *
+ */
+static int cipso_v4_parsetag_loc(const struct cipso_v4_doi *doi_def,
+ const unsigned char *tag,
+ struct netlbl_lsm_secattr *secattr)
+{
+ secattr->attr.secid = *(u32 *)&tag[2];
+ secattr->flags |= NETLBL_SECATTR_SECID;
+
+ return 0;
+}
+
+/**
+ * cipso_v4_optptr - Find the CIPSO option in the packet
+ * @skb: the packet
+ *
+ * Description:
+ * Parse the packet's IP header looking for a CIPSO option. Returns a pointer
+ * to the start of the CIPSO option on success, NULL if one is not found.
+ *
+ */
+unsigned char *cipso_v4_optptr(const struct sk_buff *skb)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+ unsigned char *optptr = (unsigned char *)&(ip_hdr(skb)[1]);
+ int optlen;
+ int taglen;
+
+ for (optlen = iph->ihl*4 - sizeof(struct iphdr); optlen > 1; ) {
+ switch (optptr[0]) {
+ case IPOPT_END:
+ return NULL;
+ case IPOPT_NOOP:
+ taglen = 1;
+ break;
+ default:
+ taglen = optptr[1];
+ }
+ if (!taglen || taglen > optlen)
+ return NULL;
+ if (optptr[0] == IPOPT_CIPSO)
+ return optptr;
+
+ optlen -= taglen;
+ optptr += taglen;
+ }
+
+ return NULL;
+}
+
+/**
* cipso_v4_validate - Validate a CIPSO option
+ * @skb: the packet
* @option: the start of the option, on error it is set to point to the error
*
* Description:
@@ -1116,7 +1543,7 @@ static int cipso_v4_parsetag_rbm(const struct cipso_v4_doi *doi_def,
* that is unrecognized."
*
*/
-int cipso_v4_validate(unsigned char **option)
+int cipso_v4_validate(const struct sk_buff *skb, unsigned char **option)
{
unsigned char *opt = *option;
unsigned char *tag;
@@ -1135,13 +1562,13 @@ int cipso_v4_validate(unsigned char **option)
}
rcu_read_lock();
- doi_def = cipso_v4_doi_search(ntohl(*((__be32 *)&opt[2])));
- if (doi_def == NULL) {
+ doi_def = cipso_v4_doi_search(get_unaligned_be32(&opt[2]));
+ if (!doi_def) {
err_offset = 2;
goto validate_return_locked;
}
- opt_iter = 6;
+ opt_iter = CIPSO_V4_HDR_LEN;
tag = opt + opt_iter;
while (opt_iter < opt_len) {
for (tag_iter = 0; doi_def->tags[tag_iter] != tag[0];)
@@ -1151,6 +1578,10 @@ int cipso_v4_validate(unsigned char **option)
goto validate_return_locked;
}
+ if (opt_iter + 1 == opt_len) {
+ err_offset = opt_iter;
+ goto validate_return_locked;
+ }
tag_len = tag[1];
if (tag_len > (opt_len - opt_iter)) {
err_offset = opt_iter + 1;
@@ -1159,7 +1590,7 @@ int cipso_v4_validate(unsigned char **option)
switch (tag[0]) {
case CIPSO_V4_TAG_RBITMAP:
- if (tag_len < 4) {
+ if (tag_len < CIPSO_V4_TAG_RBM_BLEN) {
err_offset = opt_iter + 1;
goto validate_return_locked;
}
@@ -1171,13 +1602,13 @@ int cipso_v4_validate(unsigned char **option)
* all the CIPSO validations here but it doesn't
* really specify _exactly_ what we need to validate
* ... so, just make it a sysctl tunable. */
- if (cipso_v4_rbm_strictvalid) {
+ if (READ_ONCE(cipso_v4_rbm_strictvalid)) {
if (cipso_v4_map_lvl_valid(doi_def,
tag[3]) < 0) {
err_offset = opt_iter + 3;
goto validate_return_locked;
}
- if (tag_len > 4 &&
+ if (tag_len > CIPSO_V4_TAG_RBM_BLEN &&
cipso_v4_map_cat_rbm_valid(doi_def,
&tag[4],
tag_len - 4) < 0) {
@@ -1186,6 +1617,59 @@ int cipso_v4_validate(unsigned char **option)
}
}
break;
+ case CIPSO_V4_TAG_ENUM:
+ if (tag_len < CIPSO_V4_TAG_ENUM_BLEN) {
+ err_offset = opt_iter + 1;
+ goto validate_return_locked;
+ }
+
+ if (cipso_v4_map_lvl_valid(doi_def,
+ tag[3]) < 0) {
+ err_offset = opt_iter + 3;
+ goto validate_return_locked;
+ }
+ if (tag_len > CIPSO_V4_TAG_ENUM_BLEN &&
+ cipso_v4_map_cat_enum_valid(doi_def,
+ &tag[4],
+ tag_len - 4) < 0) {
+ err_offset = opt_iter + 4;
+ goto validate_return_locked;
+ }
+ break;
+ case CIPSO_V4_TAG_RANGE:
+ if (tag_len < CIPSO_V4_TAG_RNG_BLEN) {
+ err_offset = opt_iter + 1;
+ goto validate_return_locked;
+ }
+
+ if (cipso_v4_map_lvl_valid(doi_def,
+ tag[3]) < 0) {
+ err_offset = opt_iter + 3;
+ goto validate_return_locked;
+ }
+ if (tag_len > CIPSO_V4_TAG_RNG_BLEN &&
+ cipso_v4_map_cat_rng_valid(doi_def,
+ &tag[4],
+ tag_len - 4) < 0) {
+ err_offset = opt_iter + 4;
+ goto validate_return_locked;
+ }
+ break;
+ case CIPSO_V4_TAG_LOCAL:
+ /* This is a non-standard tag that we only allow for
+ * local connections, so if the incoming interface is
+ * not the loopback device drop the packet. Further,
+ * there is no legitimate reason for setting this from
+ * userspace so reject it if skb is NULL. */
+ if (!skb || !(skb->dev->flags & IFF_LOOPBACK)) {
+ err_offset = opt_iter;
+ goto validate_return_locked;
+ }
+ if (tag_len != CIPSO_V4_TAG_LOC_BLEN) {
+ err_offset = opt_iter + 1;
+ goto validate_return_locked;
+ }
+ break;
default:
err_offset = opt_iter;
goto validate_return_locked;
@@ -1203,7 +1687,7 @@ validate_return:
}
/**
- * cipso_v4_error - Send the correct reponse for a bad packet
+ * cipso_v4_error - Send the correct response for a bad packet
* @skb: the packet
* @error: the error code
* @gateway: CIPSO gateway flag
@@ -1231,60 +1715,54 @@ validate_return:
*/
void cipso_v4_error(struct sk_buff *skb, int error, u32 gateway)
{
- if (skb->nh.iph->protocol == IPPROTO_ICMP || error != -EACCES)
+ struct inet_skb_parm parm;
+ int res;
+
+ if (ip_hdr(skb)->protocol == IPPROTO_ICMP || error != -EACCES)
+ return;
+
+ /*
+ * We might be called above the IP layer,
+ * so we can not use icmp_send and IPCB here.
+ */
+
+ memset(&parm, 0, sizeof(parm));
+ parm.opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
+ rcu_read_lock();
+ res = __ip_options_compile(dev_net(skb->dev), &parm.opt, skb, NULL);
+ rcu_read_unlock();
+
+ if (res)
return;
if (gateway)
- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_NET_ANO, 0);
+ __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_NET_ANO, 0, &parm);
else
- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_ANO, 0);
+ __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_ANO, 0, &parm);
}
/**
- * cipso_v4_socket_setattr - Add a CIPSO option to a socket
- * @sock: the socket
+ * cipso_v4_genopt - Generate a CIPSO option
+ * @buf: the option buffer
+ * @buf_len: the size of opt_buf
* @doi_def: the CIPSO DOI to use
- * @secattr: the specific security attributes of the socket
+ * @secattr: the security attributes
*
* Description:
- * Set the CIPSO option on the given socket using the DOI definition and
- * security attributes passed to the function. This function requires
- * exclusive access to @sock->sk, which means it either needs to be in the
- * process of being created or locked via lock_sock(sock->sk). Returns zero on
- * success and negative values on failure.
+ * Generate a CIPSO option using the DOI definition and security attributes
+ * passed to the function. Returns the length of the option on success and
+ * negative values on failure.
*
*/
-int cipso_v4_socket_setattr(const struct socket *sock,
- const struct cipso_v4_doi *doi_def,
- const struct netlbl_lsm_secattr *secattr)
+static int cipso_v4_genopt(unsigned char *buf, u32 buf_len,
+ const struct cipso_v4_doi *doi_def,
+ const struct netlbl_lsm_secattr *secattr)
{
- int ret_val = -EPERM;
+ int ret_val;
u32 iter;
- unsigned char *buf;
- u32 buf_len = 0;
- u32 opt_len;
- struct ip_options *opt = NULL;
- struct sock *sk;
- struct inet_sock *sk_inet;
- struct inet_connection_sock *sk_conn;
-
- /* In the case of sock_create_lite(), the sock->sk field is not
- * defined yet but it is not a problem as the only users of these
- * "lite" PF_INET sockets are functions which do an accept() call
- * afterwards so we will label the socket as part of the accept(). */
- sk = sock->sk;
- if (sk == NULL)
- return 0;
- /* We allocate the maximum CIPSO option size here so we are probably
- * being a little wasteful, but it makes our life _much_ easier later
- * on and after all we are only talking about 40 bytes. */
- buf_len = CIPSO_V4_OPT_LEN_MAX;
- buf = kmalloc(buf_len, GFP_ATOMIC);
- if (buf == NULL) {
- ret_val = -ENOMEM;
- goto socket_setattr_failure;
- }
+ if (buf_len <= CIPSO_V4_HDR_LEN)
+ return -ENOSPC;
/* XXX - This code assumes only one tag per CIPSO option which isn't
* really a good assumption to make but since we only support the MAC
@@ -1299,9 +1777,26 @@ int cipso_v4_socket_setattr(const struct socket *sock,
&buf[CIPSO_V4_HDR_LEN],
buf_len - CIPSO_V4_HDR_LEN);
break;
+ case CIPSO_V4_TAG_ENUM:
+ ret_val = cipso_v4_gentag_enum(doi_def,
+ secattr,
+ &buf[CIPSO_V4_HDR_LEN],
+ buf_len - CIPSO_V4_HDR_LEN);
+ break;
+ case CIPSO_V4_TAG_RANGE:
+ ret_val = cipso_v4_gentag_rng(doi_def,
+ secattr,
+ &buf[CIPSO_V4_HDR_LEN],
+ buf_len - CIPSO_V4_HDR_LEN);
+ break;
+ case CIPSO_V4_TAG_LOCAL:
+ ret_val = cipso_v4_gentag_loc(doi_def,
+ secattr,
+ &buf[CIPSO_V4_HDR_LEN],
+ buf_len - CIPSO_V4_HDR_LEN);
+ break;
default:
- ret_val = -EPERM;
- goto socket_setattr_failure;
+ return -EPERM;
}
iter++;
@@ -1309,9 +1804,83 @@ int cipso_v4_socket_setattr(const struct socket *sock,
iter < CIPSO_V4_TAG_MAXCNT &&
doi_def->tags[iter] != CIPSO_V4_TAG_INVALID);
if (ret_val < 0)
- goto socket_setattr_failure;
+ return ret_val;
cipso_v4_gentag_hdr(doi_def, buf, ret_val);
- buf_len = CIPSO_V4_HDR_LEN + ret_val;
+ return CIPSO_V4_HDR_LEN + ret_val;
+}
+
+static int cipso_v4_get_actual_opt_len(const unsigned char *data, int len)
+{
+ int iter = 0, optlen = 0;
+
+ /* determining the new total option length is tricky because of
+ * the padding necessary, the only thing i can think to do at
+ * this point is walk the options one-by-one, skipping the
+ * padding at the end to determine the actual option size and
+ * from there we can determine the new total option length
+ */
+ while (iter < len) {
+ if (data[iter] == IPOPT_END) {
+ break;
+ } else if (data[iter] == IPOPT_NOP) {
+ iter++;
+ } else {
+ iter += data[iter + 1];
+ optlen = iter;
+ }
+ }
+ return optlen;
+}
+
+/**
+ * cipso_v4_sock_setattr - Add a CIPSO option to a socket
+ * @sk: the socket
+ * @doi_def: the CIPSO DOI to use
+ * @secattr: the specific security attributes of the socket
+ * @sk_locked: true if caller holds the socket lock
+ *
+ * Description:
+ * Set the CIPSO option on the given socket using the DOI definition and
+ * security attributes passed to the function. This function requires
+ * exclusive access to @sk, which means it either needs to be in the
+ * process of being created or locked. Returns zero on success and negative
+ * values on failure.
+ *
+ */
+int cipso_v4_sock_setattr(struct sock *sk,
+ const struct cipso_v4_doi *doi_def,
+ const struct netlbl_lsm_secattr *secattr,
+ bool sk_locked)
+{
+ int ret_val = -EPERM;
+ unsigned char *buf = NULL;
+ u32 buf_len;
+ u32 opt_len;
+ struct ip_options_rcu *old, *opt = NULL;
+ struct inet_sock *sk_inet;
+ struct inet_connection_sock *sk_conn;
+
+ /* In the case of sock_create_lite(), the sock->sk field is not
+ * defined yet but it is not a problem as the only users of these
+ * "lite" PF_INET sockets are functions which do an accept() call
+ * afterwards so we will label the socket as part of the accept(). */
+ if (!sk)
+ return 0;
+
+ /* We allocate the maximum CIPSO option size here so we are probably
+ * being a little wasteful, but it makes our life _much_ easier later
+ * on and after all we are only talking about 40 bytes. */
+ buf_len = CIPSO_V4_OPT_LEN_MAX;
+ buf = kmalloc(buf_len, GFP_ATOMIC);
+ if (!buf) {
+ ret_val = -ENOMEM;
+ goto socket_setattr_failure;
+ }
+
+ ret_val = cipso_v4_genopt(buf, buf_len, doi_def, secattr);
+ if (ret_val < 0)
+ goto socket_setattr_failure;
+ buf_len = ret_val;
/* We can't use ip_options_get() directly because it makes a call to
* ip_options_get_alloc() which allocates memory with GFP_KERNEL and
@@ -1319,27 +1888,29 @@ int cipso_v4_socket_setattr(const struct socket *sock,
* set the IPOPT_CIPSO option. */
opt_len = (buf_len + 3) & ~3;
opt = kzalloc(sizeof(*opt) + opt_len, GFP_ATOMIC);
- if (opt == NULL) {
+ if (!opt) {
ret_val = -ENOMEM;
goto socket_setattr_failure;
}
- memcpy(opt->__data, buf, buf_len);
- opt->optlen = opt_len;
- opt->is_data = 1;
- opt->cipso = sizeof(struct iphdr);
+ memcpy(opt->opt.__data, buf, buf_len);
+ opt->opt.optlen = opt_len;
+ opt->opt.cipso = sizeof(struct iphdr);
kfree(buf);
buf = NULL;
sk_inet = inet_sk(sk);
- if (sk_inet->is_icsk) {
+
+ old = rcu_dereference_protected(sk_inet->inet_opt, sk_locked);
+ if (inet_test_bit(IS_ICSK, sk)) {
sk_conn = inet_csk(sk);
- if (sk_inet->opt)
- sk_conn->icsk_ext_hdr_len -= sk_inet->opt->optlen;
- sk_conn->icsk_ext_hdr_len += opt->optlen;
+ if (old)
+ sk_conn->icsk_ext_hdr_len -= old->opt.optlen;
+ sk_conn->icsk_ext_hdr_len += opt->opt.optlen;
sk_conn->icsk_sync_mss(sk, sk_conn->icsk_pmtu_cookie);
}
- opt = xchg(&sk_inet->opt, opt);
- kfree(opt);
+ rcu_assign_pointer(sk_inet->inet_opt, opt);
+ if (old)
+ kfree_rcu(old, rcu);
return 0;
@@ -1350,122 +1921,381 @@ socket_setattr_failure:
}
/**
- * cipso_v4_sock_getattr - Get the security attributes from a sock
- * @sk: the sock
+ * cipso_v4_req_setattr - Add a CIPSO option to a connection request socket
+ * @req: the connection request socket
+ * @doi_def: the CIPSO DOI to use
+ * @secattr: the specific security attributes of the socket
+ *
+ * Description:
+ * Set the CIPSO option on the given socket using the DOI definition and
+ * security attributes passed to the function. Returns zero on success and
+ * negative values on failure.
+ *
+ */
+int cipso_v4_req_setattr(struct request_sock *req,
+ const struct cipso_v4_doi *doi_def,
+ const struct netlbl_lsm_secattr *secattr)
+{
+ int ret_val = -EPERM;
+ unsigned char *buf = NULL;
+ u32 buf_len;
+ u32 opt_len;
+ struct ip_options_rcu *opt = NULL;
+ struct inet_request_sock *req_inet;
+
+ /* We allocate the maximum CIPSO option size here so we are probably
+ * being a little wasteful, but it makes our life _much_ easier later
+ * on and after all we are only talking about 40 bytes. */
+ buf_len = CIPSO_V4_OPT_LEN_MAX;
+ buf = kmalloc(buf_len, GFP_ATOMIC);
+ if (!buf) {
+ ret_val = -ENOMEM;
+ goto req_setattr_failure;
+ }
+
+ ret_val = cipso_v4_genopt(buf, buf_len, doi_def, secattr);
+ if (ret_val < 0)
+ goto req_setattr_failure;
+ buf_len = ret_val;
+
+ /* We can't use ip_options_get() directly because it makes a call to
+ * ip_options_get_alloc() which allocates memory with GFP_KERNEL and
+ * we won't always have CAP_NET_RAW even though we _always_ want to
+ * set the IPOPT_CIPSO option. */
+ opt_len = (buf_len + 3) & ~3;
+ opt = kzalloc(sizeof(*opt) + opt_len, GFP_ATOMIC);
+ if (!opt) {
+ ret_val = -ENOMEM;
+ goto req_setattr_failure;
+ }
+ memcpy(opt->opt.__data, buf, buf_len);
+ opt->opt.optlen = opt_len;
+ opt->opt.cipso = sizeof(struct iphdr);
+ kfree(buf);
+ buf = NULL;
+
+ req_inet = inet_rsk(req);
+ opt = unrcu_pointer(xchg(&req_inet->ireq_opt, RCU_INITIALIZER(opt)));
+ if (opt)
+ kfree_rcu(opt, rcu);
+
+ return 0;
+
+req_setattr_failure:
+ kfree(buf);
+ kfree(opt);
+ return ret_val;
+}
+
+/**
+ * cipso_v4_delopt - Delete the CIPSO option from a set of IP options
+ * @opt_ptr: IP option pointer
+ *
+ * Description:
+ * Deletes the CIPSO IP option from a set of IP options and makes the necessary
+ * adjustments to the IP option structure. Returns zero on success, negative
+ * values on failure.
+ *
+ */
+static int cipso_v4_delopt(struct ip_options_rcu __rcu **opt_ptr)
+{
+ struct ip_options_rcu *opt = rcu_dereference_protected(*opt_ptr, 1);
+ int hdr_delta = 0;
+
+ if (!opt || opt->opt.cipso == 0)
+ return 0;
+ if (opt->opt.srr || opt->opt.rr || opt->opt.ts || opt->opt.router_alert) {
+ u8 cipso_len;
+ u8 cipso_off;
+ unsigned char *cipso_ptr;
+ int optlen_new;
+
+ cipso_off = opt->opt.cipso - sizeof(struct iphdr);
+ cipso_ptr = &opt->opt.__data[cipso_off];
+ cipso_len = cipso_ptr[1];
+
+ if (opt->opt.srr > opt->opt.cipso)
+ opt->opt.srr -= cipso_len;
+ if (opt->opt.rr > opt->opt.cipso)
+ opt->opt.rr -= cipso_len;
+ if (opt->opt.ts > opt->opt.cipso)
+ opt->opt.ts -= cipso_len;
+ if (opt->opt.router_alert > opt->opt.cipso)
+ opt->opt.router_alert -= cipso_len;
+ opt->opt.cipso = 0;
+
+ memmove(cipso_ptr, cipso_ptr + cipso_len,
+ opt->opt.optlen - cipso_off - cipso_len);
+
+ optlen_new = cipso_v4_get_actual_opt_len(opt->opt.__data,
+ opt->opt.optlen);
+ hdr_delta = opt->opt.optlen;
+ opt->opt.optlen = (optlen_new + 3) & ~3;
+ hdr_delta -= opt->opt.optlen;
+ } else {
+ /* only the cipso option was present on the socket so we can
+ * remove the entire option struct */
+ *opt_ptr = NULL;
+ hdr_delta = opt->opt.optlen;
+ kfree_rcu(opt, rcu);
+ }
+
+ return hdr_delta;
+}
+
+/**
+ * cipso_v4_sock_delattr - Delete the CIPSO option from a socket
+ * @sk: the socket
+ *
+ * Description:
+ * Removes the CIPSO option from a socket, if present.
+ *
+ */
+void cipso_v4_sock_delattr(struct sock *sk)
+{
+ struct inet_sock *sk_inet;
+ int hdr_delta;
+
+ sk_inet = inet_sk(sk);
+
+ hdr_delta = cipso_v4_delopt(&sk_inet->inet_opt);
+ if (inet_test_bit(IS_ICSK, sk) && hdr_delta > 0) {
+ struct inet_connection_sock *sk_conn = inet_csk(sk);
+ sk_conn->icsk_ext_hdr_len -= hdr_delta;
+ sk_conn->icsk_sync_mss(sk, sk_conn->icsk_pmtu_cookie);
+ }
+}
+
+/**
+ * cipso_v4_req_delattr - Delete the CIPSO option from a request socket
+ * @req: the request socket
+ *
+ * Description:
+ * Removes the CIPSO option from a request socket, if present.
+ *
+ */
+void cipso_v4_req_delattr(struct request_sock *req)
+{
+ cipso_v4_delopt(&inet_rsk(req)->ireq_opt);
+}
+
+/**
+ * cipso_v4_getattr - Helper function for the cipso_v4_*_getattr functions
+ * @cipso: the CIPSO v4 option
* @secattr: the security attributes
*
* Description:
- * Query @sk to see if there is a CIPSO option attached to the sock and if
- * there is return the CIPSO security attributes in @secattr. This function
- * requires that @sk be locked, or privately held, but it does not do any
- * locking itself. Returns zero on success and negative values on failure.
+ * Inspect @cipso and return the security attributes in @secattr. Returns zero
+ * on success and negative values on failure.
*
*/
-int cipso_v4_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr)
+int cipso_v4_getattr(const unsigned char *cipso,
+ struct netlbl_lsm_secattr *secattr)
{
int ret_val = -ENOMSG;
- struct inet_sock *sk_inet;
- unsigned char *cipso_ptr;
u32 doi;
struct cipso_v4_doi *doi_def;
- sk_inet = inet_sk(sk);
- if (sk_inet->opt == NULL || sk_inet->opt->cipso == 0)
- return -ENOMSG;
- cipso_ptr = sk_inet->opt->__data + sk_inet->opt->cipso -
- sizeof(struct iphdr);
- ret_val = cipso_v4_cache_check(cipso_ptr, cipso_ptr[1], secattr);
- if (ret_val == 0)
- return ret_val;
+ if (cipso_v4_cache_check(cipso, cipso[1], secattr) == 0)
+ return 0;
- doi = ntohl(*(__be32 *)&cipso_ptr[2]);
+ doi = get_unaligned_be32(&cipso[2]);
rcu_read_lock();
doi_def = cipso_v4_doi_search(doi);
- if (doi_def == NULL) {
- rcu_read_unlock();
- return -ENOMSG;
- }
-
+ if (!doi_def)
+ goto getattr_return;
/* XXX - This code assumes only one tag per CIPSO option which isn't
* really a good assumption to make but since we only support the MAC
* tags right now it is a safe assumption. */
- switch (cipso_ptr[6]) {
+ switch (cipso[6]) {
case CIPSO_V4_TAG_RBITMAP:
- ret_val = cipso_v4_parsetag_rbm(doi_def,
- &cipso_ptr[6],
- secattr);
+ ret_val = cipso_v4_parsetag_rbm(doi_def, &cipso[6], secattr);
+ break;
+ case CIPSO_V4_TAG_ENUM:
+ ret_val = cipso_v4_parsetag_enum(doi_def, &cipso[6], secattr);
+ break;
+ case CIPSO_V4_TAG_RANGE:
+ ret_val = cipso_v4_parsetag_rng(doi_def, &cipso[6], secattr);
+ break;
+ case CIPSO_V4_TAG_LOCAL:
+ ret_val = cipso_v4_parsetag_loc(doi_def, &cipso[6], secattr);
break;
}
- rcu_read_unlock();
+ if (ret_val == 0)
+ secattr->type = NETLBL_NLTYPE_CIPSOV4;
+getattr_return:
+ rcu_read_unlock();
return ret_val;
}
/**
- * cipso_v4_socket_getattr - Get the security attributes from a socket
- * @sock: the socket
+ * cipso_v4_sock_getattr - Get the security attributes from a sock
+ * @sk: the sock
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Query @sk to see if there is a CIPSO option attached to the sock and if
+ * there is return the CIPSO security attributes in @secattr. This function
+ * requires that @sk be locked, or privately held, but it does not do any
+ * locking itself. Returns zero on success and negative values on failure.
+ *
+ */
+int cipso_v4_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr)
+{
+ struct ip_options_rcu *opt;
+ int res = -ENOMSG;
+
+ rcu_read_lock();
+ opt = rcu_dereference(inet_sk(sk)->inet_opt);
+ if (opt && opt->opt.cipso)
+ res = cipso_v4_getattr(opt->opt.__data +
+ opt->opt.cipso -
+ sizeof(struct iphdr),
+ secattr);
+ rcu_read_unlock();
+ return res;
+}
+
+/**
+ * cipso_v4_skbuff_setattr - Set the CIPSO option on a packet
+ * @skb: the packet
+ * @doi_def: the DOI structure
* @secattr: the security attributes
*
* Description:
- * Query @sock to see if there is a CIPSO option attached to the socket and if
- * there is return the CIPSO security attributes in @secattr. Returns zero on
- * success and negative values on failure.
+ * Set the CIPSO option on the given packet based on the security attributes.
+ * Returns a pointer to the IP header on success and NULL on failure.
*
*/
-int cipso_v4_socket_getattr(const struct socket *sock,
- struct netlbl_lsm_secattr *secattr)
+int cipso_v4_skbuff_setattr(struct sk_buff *skb,
+ const struct cipso_v4_doi *doi_def,
+ const struct netlbl_lsm_secattr *secattr)
{
int ret_val;
+ struct iphdr *iph;
+ struct ip_options *opt = &IPCB(skb)->opt;
+ unsigned char buf[CIPSO_V4_OPT_LEN_MAX];
+ u32 buf_len = CIPSO_V4_OPT_LEN_MAX;
+ u32 opt_len;
+ int len_delta;
- lock_sock(sock->sk);
- ret_val = cipso_v4_sock_getattr(sock->sk, secattr);
- release_sock(sock->sk);
+ ret_val = cipso_v4_genopt(buf, buf_len, doi_def, secattr);
+ if (ret_val < 0)
+ return ret_val;
+ buf_len = ret_val;
+ opt_len = (buf_len + 3) & ~3;
- return ret_val;
+ /* we overwrite any existing options to ensure that we have enough
+ * room for the CIPSO option, the reason is that we _need_ to guarantee
+ * that the security label is applied to the packet - we do the same
+ * thing when using the socket options and it hasn't caused a problem,
+ * if we need to we can always revisit this choice later */
+
+ len_delta = opt_len - opt->optlen;
+ /* if we don't ensure enough headroom we could panic on the skb_push()
+ * call below so make sure we have enough, we are also "mangling" the
+ * packet so we should probably do a copy-on-write call anyway */
+ ret_val = skb_cow(skb, skb_headroom(skb) + len_delta);
+ if (ret_val < 0)
+ return ret_val;
+
+ if (len_delta > 0) {
+ /* we assume that the header + opt->optlen have already been
+ * "pushed" in ip_options_build() or similar */
+ iph = ip_hdr(skb);
+ skb_push(skb, len_delta);
+ memmove((char *)iph - len_delta, iph, iph->ihl << 2);
+ skb_reset_network_header(skb);
+ iph = ip_hdr(skb);
+ } else if (len_delta < 0) {
+ iph = ip_hdr(skb);
+ memset(iph + 1, IPOPT_NOP, opt->optlen);
+ } else
+ iph = ip_hdr(skb);
+
+ if (opt->optlen > 0)
+ memset(opt, 0, sizeof(*opt));
+ opt->optlen = opt_len;
+ opt->cipso = sizeof(struct iphdr);
+ opt->is_changed = 1;
+
+ /* we have to do the following because we are being called from a
+ * netfilter hook which means the packet already has had the header
+ * fields populated and the checksum calculated - yes this means we
+ * are doing more work than needed but we do it to keep the core
+ * stack clean and tidy */
+ memcpy(iph + 1, buf, buf_len);
+ if (opt_len > buf_len)
+ memset((char *)(iph + 1) + buf_len, 0, opt_len - buf_len);
+ if (len_delta != 0) {
+ iph->ihl = 5 + (opt_len >> 2);
+ iph_set_totlen(iph, skb->len);
+ }
+ ip_send_check(iph);
+
+ return 0;
}
/**
- * cipso_v4_skbuff_getattr - Get the security attributes from the CIPSO option
+ * cipso_v4_skbuff_delattr - Delete any CIPSO options from a packet
* @skb: the packet
- * @secattr: the security attributes
*
* Description:
- * Parse the given packet's CIPSO option and return the security attributes.
- * Returns zero on success and negative values on failure.
+ * Removes any and all CIPSO options from the given packet. Returns zero on
+ * success, negative values on failure.
*
*/
-int cipso_v4_skbuff_getattr(const struct sk_buff *skb,
- struct netlbl_lsm_secattr *secattr)
+int cipso_v4_skbuff_delattr(struct sk_buff *skb)
{
- int ret_val = -ENOMSG;
+ int ret_val, cipso_len, hdr_len_actual, new_hdr_len_actual, new_hdr_len,
+ hdr_len_delta;
+ struct iphdr *iph;
+ struct ip_options *opt = &IPCB(skb)->opt;
unsigned char *cipso_ptr;
- u32 doi;
- struct cipso_v4_doi *doi_def;
- cipso_ptr = CIPSO_V4_OPTPTR(skb);
- if (cipso_v4_cache_check(cipso_ptr, cipso_ptr[1], secattr) == 0)
+ if (opt->cipso == 0)
return 0;
- doi = ntohl(*(__be32 *)&cipso_ptr[2]);
- rcu_read_lock();
- doi_def = cipso_v4_doi_search(doi);
- if (doi_def == NULL)
- goto skbuff_getattr_return;
+ /* since we are changing the packet we should make a copy */
+ ret_val = skb_cow(skb, skb_headroom(skb));
+ if (ret_val < 0)
+ return ret_val;
- /* XXX - This code assumes only one tag per CIPSO option which isn't
- * really a good assumption to make but since we only support the MAC
- * tags right now it is a safe assumption. */
- switch (cipso_ptr[6]) {
- case CIPSO_V4_TAG_RBITMAP:
- ret_val = cipso_v4_parsetag_rbm(doi_def,
- &cipso_ptr[6],
- secattr);
- break;
+ iph = ip_hdr(skb);
+ cipso_ptr = (unsigned char *)iph + opt->cipso;
+ cipso_len = cipso_ptr[1];
+
+ hdr_len_actual = sizeof(struct iphdr) +
+ cipso_v4_get_actual_opt_len((unsigned char *)(iph + 1),
+ opt->optlen);
+ new_hdr_len_actual = hdr_len_actual - cipso_len;
+ new_hdr_len = (new_hdr_len_actual + 3) & ~3;
+ hdr_len_delta = (iph->ihl << 2) - new_hdr_len;
+
+ /* 1. shift any options after CIPSO to the left */
+ memmove(cipso_ptr, cipso_ptr + cipso_len,
+ new_hdr_len_actual - opt->cipso);
+ /* 2. move the whole IP header to its new place */
+ memmove((unsigned char *)iph + hdr_len_delta, iph, new_hdr_len_actual);
+ /* 3. adjust the skb layout */
+ skb_pull(skb, hdr_len_delta);
+ skb_reset_network_header(skb);
+ iph = ip_hdr(skb);
+ /* 4. re-fill new padding with IPOPT_END (may now be longer) */
+ memset((unsigned char *)iph + new_hdr_len_actual, IPOPT_END,
+ new_hdr_len - new_hdr_len_actual);
+
+ opt->optlen -= hdr_len_delta;
+ opt->cipso = 0;
+ opt->is_changed = 1;
+ if (hdr_len_delta != 0) {
+ iph->ihl = new_hdr_len >> 2;
+ iph_set_totlen(iph, skb->len);
}
+ ip_send_check(iph);
-skbuff_getattr_return:
- rcu_read_unlock();
- return ret_val;
+ return 0;
}
/*
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index 7b068a891953..1614593b6d72 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -1,73 +1,124 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* common UDP/RAW code
* Linux INET implementation
*
* Authors:
* Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/types.h>
#include <linux/module.h>
-#include <linux/ip.h>
#include <linux/in.h>
#include <net/ip.h>
#include <net/sock.h>
#include <net/route.h>
#include <net/tcp_states.h>
+#include <net/sock_reuseport.h>
-int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+int __ip4_datagram_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len)
{
struct inet_sock *inet = inet_sk(sk);
struct sockaddr_in *usin = (struct sockaddr_in *) uaddr;
+ struct flowi4 *fl4;
struct rtable *rt;
__be32 saddr;
int oif;
int err;
-
- if (addr_len < sizeof(*usin))
- return -EINVAL;
- if (usin->sin_family != AF_INET)
- return -EAFNOSUPPORT;
+ if (addr_len < sizeof(*usin))
+ return -EINVAL;
+
+ if (usin->sin_family != AF_INET)
+ return -EAFNOSUPPORT;
sk_dst_reset(sk);
oif = sk->sk_bound_dev_if;
- saddr = inet->saddr;
- if (MULTICAST(usin->sin_addr.s_addr)) {
- if (!oif)
- oif = inet->mc_index;
+ saddr = inet->inet_saddr;
+ if (ipv4_is_multicast(usin->sin_addr.s_addr)) {
+ if (!oif || netif_index_is_l3_master(sock_net(sk), oif))
+ oif = READ_ONCE(inet->mc_index);
if (!saddr)
- saddr = inet->mc_addr;
+ saddr = READ_ONCE(inet->mc_addr);
+ } else if (!oif) {
+ oif = READ_ONCE(inet->uc_index);
+ }
+ fl4 = &inet->cork.fl.u.ip4;
+ rt = ip_route_connect(fl4, usin->sin_addr.s_addr, saddr, oif,
+ sk->sk_protocol, inet->inet_sport,
+ usin->sin_port, sk);
+ if (IS_ERR(rt)) {
+ err = PTR_ERR(rt);
+ if (err == -ENETUNREACH)
+ IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
+ goto out;
}
- err = ip_route_connect(&rt, usin->sin_addr.s_addr, saddr,
- RT_CONN_FLAGS(sk), oif,
- sk->sk_protocol,
- inet->sport, usin->sin_port, sk);
- if (err)
- return err;
+
if ((rt->rt_flags & RTCF_BROADCAST) && !sock_flag(sk, SOCK_BROADCAST)) {
ip_rt_put(rt);
- return -EACCES;
+ err = -EACCES;
+ goto out;
}
- if (!inet->saddr)
- inet->saddr = rt->rt_src; /* Update source address */
- if (!inet->rcv_saddr)
- inet->rcv_saddr = rt->rt_src;
- inet->daddr = rt->rt_dst;
- inet->dport = usin->sin_port;
+
+ /* Update addresses before rehashing */
+ inet->inet_daddr = fl4->daddr;
+ inet->inet_dport = usin->sin_port;
+ if (!inet->inet_saddr)
+ inet->inet_saddr = fl4->saddr;
+ if (!inet->inet_rcv_saddr) {
+ inet->inet_rcv_saddr = fl4->saddr;
+ if (sk->sk_prot->rehash)
+ sk->sk_prot->rehash(sk);
+ }
+ reuseport_has_conns_set(sk);
sk->sk_state = TCP_ESTABLISHED;
- inet->id = jiffies;
+ sk_set_txhash(sk);
+ atomic_set(&inet->inet_id, get_random_u16());
- sk_dst_set(sk, &rt->u.dst);
- return(0);
+ sk_dst_set(sk, &rt->dst);
+ err = 0;
+out:
+ return err;
}
+EXPORT_SYMBOL(__ip4_datagram_connect);
+
+int ip4_datagram_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len)
+{
+ int res;
+ lock_sock(sk);
+ res = __ip4_datagram_connect(sk, uaddr, addr_len);
+ release_sock(sk);
+ return res;
+}
EXPORT_SYMBOL(ip4_datagram_connect);
+/* Because UDP xmit path can manipulate sk_dst_cache without holding
+ * socket lock, we need to use sk_dst_set() here,
+ * even if we own the socket lock.
+ */
+void ip4_datagram_release_cb(struct sock *sk)
+{
+ const struct inet_sock *inet = inet_sk(sk);
+ struct dst_entry *dst;
+ struct flowi4 fl4;
+ struct rtable *rt;
+
+ rcu_read_lock();
+
+ dst = __sk_dst_get(sk);
+ if (!dst || !READ_ONCE(dst->obsolete) || dst->ops->check(dst, 0)) {
+ rcu_read_unlock();
+ return;
+ }
+
+ inet_sk_init_flowi4(inet, &fl4);
+ rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
+ dst = !IS_ERR(rt) ? &rt->dst : NULL;
+ sk_dst_set(sk, dst);
+
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(ip4_datagram_release_cb);
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 2fd899160f85..942a887bf089 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1,13 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* NET3 IP device support routines.
*
- * Version: $Id: devinet.c,v 1.44 2001/10/31 21:55:54 davem Exp $
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Derived from the IP parts of dev.c 1.0.19
* Authors: Ross Biro
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -28,14 +22,13 @@
*/
-#include <asm/uaccess.h>
-#include <asm/system.h>
+#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
@@ -49,65 +42,181 @@
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/skbuff.h>
-#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/notifier.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
+#include "igmp_internal.h"
+#include <linux/slab.h>
+#include <linux/hash.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
#include <linux/kmod.h>
+#include <linux/netconf.h>
#include <net/arp.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/ip_fib.h>
-#include <net/netlink.h>
-
-struct ipv4_devconf ipv4_devconf = {
- .accept_redirects = 1,
- .send_redirects = 1,
- .secure_redirects = 1,
- .shared_media = 1,
+#include <net/rtnetlink.h>
+#include <net/net_namespace.h>
+#include <net/addrconf.h>
+
+#define IPV6ONLY_FLAGS \
+ (IFA_F_NODAD | IFA_F_OPTIMISTIC | IFA_F_DADFAILED | \
+ IFA_F_HOMEADDRESS | IFA_F_TENTATIVE | \
+ IFA_F_MANAGETEMPADDR | IFA_F_STABLE_PRIVACY)
+
+static struct ipv4_devconf ipv4_devconf = {
+ .data = {
+ [IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1,
+ [IPV4_DEVCONF_SEND_REDIRECTS - 1] = 1,
+ [IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1,
+ [IPV4_DEVCONF_SHARED_MEDIA - 1] = 1,
+ [IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL - 1] = 10000 /*ms*/,
+ [IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL - 1] = 1000 /*ms*/,
+ [IPV4_DEVCONF_ARP_EVICT_NOCARRIER - 1] = 1,
+ },
};
static struct ipv4_devconf ipv4_devconf_dflt = {
- .accept_redirects = 1,
- .send_redirects = 1,
- .secure_redirects = 1,
- .shared_media = 1,
- .accept_source_route = 1,
+ .data = {
+ [IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1,
+ [IPV4_DEVCONF_SEND_REDIRECTS - 1] = 1,
+ [IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1,
+ [IPV4_DEVCONF_SHARED_MEDIA - 1] = 1,
+ [IPV4_DEVCONF_ACCEPT_SOURCE_ROUTE - 1] = 1,
+ [IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL - 1] = 10000 /*ms*/,
+ [IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL - 1] = 1000 /*ms*/,
+ [IPV4_DEVCONF_ARP_EVICT_NOCARRIER - 1] = 1,
+ },
};
-static struct nla_policy ifa_ipv4_policy[IFA_MAX+1] __read_mostly = {
+#define IPV4_DEVCONF_DFLT(net, attr) \
+ IPV4_DEVCONF((*net->ipv4.devconf_dflt), attr)
+
+static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = {
[IFA_LOCAL] = { .type = NLA_U32 },
[IFA_ADDRESS] = { .type = NLA_U32 },
[IFA_BROADCAST] = { .type = NLA_U32 },
- [IFA_ANYCAST] = { .type = NLA_U32 },
[IFA_LABEL] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 },
+ [IFA_CACHEINFO] = { .len = sizeof(struct ifa_cacheinfo) },
+ [IFA_FLAGS] = { .type = NLA_U32 },
+ [IFA_RT_PRIORITY] = { .type = NLA_U32 },
+ [IFA_TARGET_NETNSID] = { .type = NLA_S32 },
+ [IFA_PROTO] = { .type = NLA_U8 },
};
+#define IN4_ADDR_HSIZE_SHIFT 8
+#define IN4_ADDR_HSIZE (1U << IN4_ADDR_HSIZE_SHIFT)
+
+static u32 inet_addr_hash(const struct net *net, __be32 addr)
+{
+ u32 val = __ipv4_addr_hash(addr, net_hash_mix(net));
+
+ return hash_32(val, IN4_ADDR_HSIZE_SHIFT);
+}
+
+static void inet_hash_insert(struct net *net, struct in_ifaddr *ifa)
+{
+ u32 hash = inet_addr_hash(net, ifa->ifa_local);
+
+ ASSERT_RTNL();
+ hlist_add_head_rcu(&ifa->addr_lst, &net->ipv4.inet_addr_lst[hash]);
+}
+
+static void inet_hash_remove(struct in_ifaddr *ifa)
+{
+ ASSERT_RTNL();
+ hlist_del_init_rcu(&ifa->addr_lst);
+}
+
+/**
+ * __ip_dev_find - find the first device with a given source address.
+ * @net: the net namespace
+ * @addr: the source address
+ * @devref: if true, take a reference on the found device
+ *
+ * If a caller uses devref=false, it should be protected by RCU, or RTNL
+ */
+struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref)
+{
+ struct net_device *result = NULL;
+ struct in_ifaddr *ifa;
+
+ rcu_read_lock();
+ ifa = inet_lookup_ifaddr_rcu(net, addr);
+ if (!ifa) {
+ struct flowi4 fl4 = { .daddr = addr };
+ struct fib_result res = { 0 };
+ struct fib_table *local;
+
+ /* Fallback to FIB local table so that communication
+ * over loopback subnets work.
+ */
+ local = fib_get_table(net, RT_TABLE_LOCAL);
+ if (local &&
+ !fib_table_lookup(local, &fl4, &res, FIB_LOOKUP_NOREF) &&
+ res.type == RTN_LOCAL)
+ result = FIB_RES_DEV(res);
+ } else {
+ result = ifa->ifa_dev->dev;
+ }
+ if (result && devref)
+ dev_hold(result);
+ rcu_read_unlock();
+ return result;
+}
+EXPORT_SYMBOL(__ip_dev_find);
+
+/* called under RCU lock */
+struct in_ifaddr *inet_lookup_ifaddr_rcu(struct net *net, __be32 addr)
+{
+ u32 hash = inet_addr_hash(net, addr);
+ struct in_ifaddr *ifa;
+
+ hlist_for_each_entry_rcu(ifa, &net->ipv4.inet_addr_lst[hash], addr_lst)
+ if (ifa->ifa_local == addr)
+ return ifa;
+
+ return NULL;
+}
+
static void rtmsg_ifa(int event, struct in_ifaddr *, struct nlmsghdr *, u32);
static BLOCKING_NOTIFIER_HEAD(inetaddr_chain);
-static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
+static BLOCKING_NOTIFIER_HEAD(inetaddr_validator_chain);
+static void inet_del_ifa(struct in_device *in_dev,
+ struct in_ifaddr __rcu **ifap,
int destroy);
#ifdef CONFIG_SYSCTL
-static void devinet_sysctl_register(struct in_device *in_dev,
- struct ipv4_devconf *p);
-static void devinet_sysctl_unregister(struct ipv4_devconf *p);
+static int devinet_sysctl_register(struct in_device *idev);
+static void devinet_sysctl_unregister(struct in_device *idev);
+#else
+static int devinet_sysctl_register(struct in_device *idev)
+{
+ return 0;
+}
+static void devinet_sysctl_unregister(struct in_device *idev)
+{
+}
#endif
/* Locks all the inet devices. */
-static struct in_ifaddr *inet_alloc_ifa(void)
+static struct in_ifaddr *inet_alloc_ifa(struct in_device *in_dev)
{
- struct in_ifaddr *ifa = kzalloc(sizeof(*ifa), GFP_KERNEL);
+ struct in_ifaddr *ifa;
- if (ifa) {
- INIT_RCU_HEAD(&ifa->rcu_head);
- }
+ ifa = kzalloc(sizeof(*ifa), GFP_KERNEL_ACCOUNT);
+ if (!ifa)
+ return NULL;
+
+ in_dev_hold(in_dev);
+ ifa->ifa_dev = in_dev;
+
+ INIT_HLIST_NODE(&ifa->addr_lst);
return ifa;
}
@@ -115,152 +224,177 @@ static struct in_ifaddr *inet_alloc_ifa(void)
static void inet_rcu_free_ifa(struct rcu_head *head)
{
struct in_ifaddr *ifa = container_of(head, struct in_ifaddr, rcu_head);
- if (ifa->ifa_dev)
- in_dev_put(ifa->ifa_dev);
+
+ in_dev_put(ifa->ifa_dev);
kfree(ifa);
}
-static inline void inet_free_ifa(struct in_ifaddr *ifa)
+static void inet_free_ifa(struct in_ifaddr *ifa)
{
- call_rcu(&ifa->rcu_head, inet_rcu_free_ifa);
+ /* Our reference to ifa->ifa_dev must be freed ASAP
+ * to release the reference to the netdev the same way.
+ * in_dev_put() -> in_dev_finish_destroy() -> netdev_put()
+ */
+ call_rcu_hurry(&ifa->rcu_head, inet_rcu_free_ifa);
+}
+
+static void in_dev_free_rcu(struct rcu_head *head)
+{
+ struct in_device *idev = container_of(head, struct in_device, rcu_head);
+
+ kfree(rcu_dereference_protected(idev->mc_hash, 1));
+ kfree(idev);
}
void in_dev_finish_destroy(struct in_device *idev)
{
struct net_device *dev = idev->dev;
- BUG_TRAP(!idev->ifa_list);
- BUG_TRAP(!idev->mc_list);
+ WARN_ON(idev->ifa_list);
+ WARN_ON(idev->mc_list);
#ifdef NET_REFCNT_DEBUG
- printk(KERN_DEBUG "in_dev_finish_destroy: %p=%s\n",
- idev, dev ? dev->name : "NIL");
+ pr_debug("%s: %p=%s\n", __func__, idev, dev ? dev->name : "NIL");
#endif
- dev_put(dev);
+ netdev_put(dev, &idev->dev_tracker);
if (!idev->dead)
- printk("Freeing alive in_device %p\n", idev);
- else {
- kfree(idev);
- }
+ pr_err("Freeing alive in_device %p\n", idev);
+ else
+ call_rcu(&idev->rcu_head, in_dev_free_rcu);
}
+EXPORT_SYMBOL(in_dev_finish_destroy);
-struct in_device *inetdev_init(struct net_device *dev)
+static struct in_device *inetdev_init(struct net_device *dev)
{
struct in_device *in_dev;
+ int err = -ENOMEM;
ASSERT_RTNL();
in_dev = kzalloc(sizeof(*in_dev), GFP_KERNEL);
if (!in_dev)
goto out;
- INIT_RCU_HEAD(&in_dev->rcu_head);
- memcpy(&in_dev->cnf, &ipv4_devconf_dflt, sizeof(in_dev->cnf));
+ memcpy(&in_dev->cnf, dev_net(dev)->ipv4.devconf_dflt,
+ sizeof(in_dev->cnf));
in_dev->cnf.sysctl = NULL;
in_dev->dev = dev;
- if ((in_dev->arp_parms = neigh_parms_alloc(dev, &arp_tbl)) == NULL)
+ in_dev->arp_parms = neigh_parms_alloc(dev, &arp_tbl);
+ if (!in_dev->arp_parms)
goto out_kfree;
+ if (IPV4_DEVCONF(in_dev->cnf, FORWARDING))
+ netif_disable_lro(dev);
/* Reference in_dev->dev */
- dev_hold(dev);
-#ifdef CONFIG_SYSCTL
- neigh_sysctl_register(dev, in_dev->arp_parms, NET_IPV4,
- NET_IPV4_NEIGH, "ipv4", NULL, NULL);
-#endif
+ netdev_hold(dev, &in_dev->dev_tracker, GFP_KERNEL);
+ /* Account for reference dev->ip_ptr (below) */
+ refcount_set(&in_dev->refcnt, 1);
+
+ if (dev != blackhole_netdev) {
+ err = devinet_sysctl_register(in_dev);
+ if (err) {
+ in_dev->dead = 1;
+ neigh_parms_release(&arp_tbl, in_dev->arp_parms);
+ in_dev_put(in_dev);
+ in_dev = NULL;
+ goto out;
+ }
+ ip_mc_init_dev(in_dev);
+ if (dev->flags & IFF_UP)
+ ip_mc_up(in_dev);
+ }
- /* Account for reference dev->ip_ptr */
- in_dev_hold(in_dev);
+ /* we can receive as soon as ip_ptr is set -- do this last */
rcu_assign_pointer(dev->ip_ptr, in_dev);
-
-#ifdef CONFIG_SYSCTL
- devinet_sysctl_register(in_dev, &in_dev->cnf);
-#endif
- ip_mc_init_dev(in_dev);
- if (dev->flags & IFF_UP)
- ip_mc_up(in_dev);
out:
- return in_dev;
+ return in_dev ?: ERR_PTR(err);
out_kfree:
kfree(in_dev);
in_dev = NULL;
goto out;
}
-static void in_dev_rcu_put(struct rcu_head *head)
-{
- struct in_device *idev = container_of(head, struct in_device, rcu_head);
- in_dev_put(idev);
-}
-
static void inetdev_destroy(struct in_device *in_dev)
{
- struct in_ifaddr *ifa;
struct net_device *dev;
+ struct in_ifaddr *ifa;
ASSERT_RTNL();
dev = in_dev->dev;
- if (dev == &loopback_dev)
- return;
in_dev->dead = 1;
ip_mc_destroy_dev(in_dev);
- while ((ifa = in_dev->ifa_list) != NULL) {
+ while ((ifa = rtnl_dereference(in_dev->ifa_list)) != NULL) {
inet_del_ifa(in_dev, &in_dev->ifa_list, 0);
inet_free_ifa(ifa);
}
-#ifdef CONFIG_SYSCTL
- devinet_sysctl_unregister(&in_dev->cnf);
-#endif
-
- dev->ip_ptr = NULL;
+ RCU_INIT_POINTER(dev->ip_ptr, NULL);
-#ifdef CONFIG_SYSCTL
- neigh_sysctl_unregister(in_dev->arp_parms);
-#endif
+ devinet_sysctl_unregister(in_dev);
neigh_parms_release(&arp_tbl, in_dev->arp_parms);
arp_ifdown(dev);
- call_rcu(&in_dev->rcu_head, in_dev_rcu_put);
+ in_dev_put(in_dev);
+}
+
+static int __init inet_blackhole_dev_init(void)
+{
+ struct in_device *in_dev;
+
+ rtnl_lock();
+ in_dev = inetdev_init(blackhole_netdev);
+ rtnl_unlock();
+
+ return PTR_ERR_OR_ZERO(in_dev);
}
+late_initcall(inet_blackhole_dev_init);
int inet_addr_onlink(struct in_device *in_dev, __be32 a, __be32 b)
{
+ const struct in_ifaddr *ifa;
+
rcu_read_lock();
- for_primary_ifa(in_dev) {
+ in_dev_for_each_ifa_rcu(ifa, in_dev) {
if (inet_ifa_match(a, ifa)) {
if (!b || inet_ifa_match(b, ifa)) {
rcu_read_unlock();
return 1;
}
}
- } endfor_ifa(in_dev);
+ }
rcu_read_unlock();
return 0;
}
-static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
- int destroy, struct nlmsghdr *nlh, u32 pid)
+static void __inet_del_ifa(struct in_device *in_dev,
+ struct in_ifaddr __rcu **ifap,
+ int destroy, struct nlmsghdr *nlh, u32 portid)
{
struct in_ifaddr *promote = NULL;
- struct in_ifaddr *ifa, *ifa1 = *ifap;
- struct in_ifaddr *last_prim = in_dev->ifa_list;
+ struct in_ifaddr *ifa, *ifa1;
+ struct in_ifaddr __rcu **last_prim;
struct in_ifaddr *prev_prom = NULL;
int do_promote = IN_DEV_PROMOTE_SECONDARIES(in_dev);
ASSERT_RTNL();
- /* 1. Deleting primary ifaddr forces deletion all secondaries
+ ifa1 = rtnl_dereference(*ifap);
+ last_prim = ifap;
+ if (in_dev->dead)
+ goto no_promotions;
+
+ /* 1. Deleting primary ifaddr forces deletion all secondaries
* unless alias promotion is set
**/
if (!(ifa1->ifa_flags & IFA_F_SECONDARY)) {
- struct in_ifaddr **ifap1 = &ifa1->ifa_next;
+ struct in_ifaddr __rcu **ifap1 = &ifa1->ifa_next;
- while ((ifa = *ifap1) != NULL) {
- if (!(ifa->ifa_flags & IFA_F_SECONDARY) &&
+ while ((ifa = rtnl_dereference(*ifap1)) != NULL) {
+ if (!(ifa->ifa_flags & IFA_F_SECONDARY) &&
ifa1->ifa_scope <= ifa->ifa_scope)
- last_prim = ifa;
+ last_prim = &ifa->ifa_next;
if (!(ifa->ifa_flags & IFA_F_SECONDARY) ||
ifa1->ifa_mask != ifa->ifa_mask ||
@@ -271,9 +405,10 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
}
if (!do_promote) {
+ inet_hash_remove(ifa);
*ifap1 = ifa->ifa_next;
- rtmsg_ifa(RTM_DELADDR, ifa, nlh, pid);
+ rtmsg_ifa(RTM_DELADDR, ifa, nlh, portid);
blocking_notifier_call_chain(&inetaddr_chain,
NETDEV_DOWN, ifa);
inet_free_ifa(ifa);
@@ -284,9 +419,22 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
}
}
+ /* On promotion all secondaries from subnet are changing
+ * the primary IP, we must remove all their routes silently
+ * and later to add them back with new prefsrc. Do this
+ * while all addresses are on the device list.
+ */
+ for (ifa = promote; ifa; ifa = rtnl_dereference(ifa->ifa_next)) {
+ if (ifa1->ifa_mask == ifa->ifa_mask &&
+ inet_ifa_match(ifa1->ifa_address, ifa))
+ fib_del_ifaddr(ifa, ifa1);
+ }
+
+no_promotions:
/* 2. Unlink it */
*ifap = ifa1->ifa_next;
+ inet_hash_remove(ifa1);
/* 3. Announce address deletion */
@@ -298,22 +446,29 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
is valid, it will try to restore deleted routes... Grr.
So that, this order is correct.
*/
- rtmsg_ifa(RTM_DELADDR, ifa1, nlh, pid);
+ rtmsg_ifa(RTM_DELADDR, ifa1, nlh, portid);
blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1);
if (promote) {
+ struct in_ifaddr *next_sec;
+ next_sec = rtnl_dereference(promote->ifa_next);
if (prev_prom) {
- prev_prom->ifa_next = promote->ifa_next;
- promote->ifa_next = last_prim->ifa_next;
- last_prim->ifa_next = promote;
+ struct in_ifaddr *last_sec;
+
+ rcu_assign_pointer(prev_prom->ifa_next, next_sec);
+
+ last_sec = rtnl_dereference(*last_prim);
+ rcu_assign_pointer(promote->ifa_next, last_sec);
+ rcu_assign_pointer(*last_prim, promote);
}
promote->ifa_flags &= ~IFA_F_SECONDARY;
- rtmsg_ifa(RTM_NEWADDR, promote, nlh, pid);
+ rtmsg_ifa(RTM_NEWADDR, promote, nlh, portid);
blocking_notifier_call_chain(&inetaddr_chain,
NETDEV_UP, promote);
- for (ifa = promote->ifa_next; ifa; ifa = ifa->ifa_next) {
+ for (ifa = next_sec; ifa;
+ ifa = rtnl_dereference(ifa->ifa_next)) {
if (ifa1->ifa_mask != ifa->ifa_mask ||
!inet_ifa_match(ifa1->ifa_address, ifa))
continue;
@@ -321,38 +476,39 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
}
}
- if (destroy) {
+ if (destroy)
inet_free_ifa(ifa1);
-
- if (!in_dev->ifa_list)
- inetdev_destroy(in_dev);
- }
}
-static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
+static void inet_del_ifa(struct in_device *in_dev,
+ struct in_ifaddr __rcu **ifap,
int destroy)
{
__inet_del_ifa(in_dev, ifap, destroy, NULL, 0);
}
static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
- u32 pid)
+ u32 portid, struct netlink_ext_ack *extack)
{
+ struct in_ifaddr __rcu **last_primary, **ifap;
struct in_device *in_dev = ifa->ifa_dev;
- struct in_ifaddr *ifa1, **ifap, **last_primary;
+ struct net *net = dev_net(in_dev->dev);
+ struct in_validator_info ivi;
+ struct in_ifaddr *ifa1;
+ int ret;
ASSERT_RTNL();
- if (!ifa->ifa_local) {
- inet_free_ifa(ifa);
- return 0;
- }
-
ifa->ifa_flags &= ~IFA_F_SECONDARY;
last_primary = &in_dev->ifa_list;
- for (ifap = &in_dev->ifa_list; (ifa1 = *ifap) != NULL;
- ifap = &ifa1->ifa_next) {
+ /* Don't set IPv6 only flags to IPv4 addresses */
+ ifa->ifa_flags &= ~IPV6ONLY_FLAGS;
+
+ ifap = &in_dev->ifa_list;
+ ifa1 = rtnl_dereference(*ifap);
+
+ while (ifa1) {
if (!(ifa1->ifa_flags & IFA_F_SECONDARY) &&
ifa->ifa_scope <= ifa1->ifa_scope)
last_primary = &ifa1->ifa_next;
@@ -363,25 +519,50 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
return -EEXIST;
}
if (ifa1->ifa_scope != ifa->ifa_scope) {
+ NL_SET_ERR_MSG(extack, "ipv4: Invalid scope value");
inet_free_ifa(ifa);
return -EINVAL;
}
ifa->ifa_flags |= IFA_F_SECONDARY;
}
+
+ ifap = &ifa1->ifa_next;
+ ifa1 = rtnl_dereference(*ifap);
}
- if (!(ifa->ifa_flags & IFA_F_SECONDARY)) {
- net_srandom(ifa->ifa_local);
- ifap = last_primary;
+ /* Allow any devices that wish to register ifaddr validtors to weigh
+ * in now, before changes are committed. The rntl lock is serializing
+ * access here, so the state should not change between a validator call
+ * and a final notify on commit. This isn't invoked on promotion under
+ * the assumption that validators are checking the address itself, and
+ * not the flags.
+ */
+ ivi.ivi_addr = ifa->ifa_address;
+ ivi.ivi_dev = ifa->ifa_dev;
+ ivi.extack = extack;
+ ret = blocking_notifier_call_chain(&inetaddr_validator_chain,
+ NETDEV_UP, &ivi);
+ ret = notifier_to_errno(ret);
+ if (ret) {
+ inet_free_ifa(ifa);
+ return ret;
}
- ifa->ifa_next = *ifap;
- *ifap = ifa;
+ if (!(ifa->ifa_flags & IFA_F_SECONDARY))
+ ifap = last_primary;
+
+ rcu_assign_pointer(ifa->ifa_next, *ifap);
+ rcu_assign_pointer(*ifap, ifa);
+
+ inet_hash_insert(dev_net(in_dev->dev), ifa);
+
+ cancel_delayed_work(&net->ipv4.addr_chk_work);
+ queue_delayed_work(system_power_efficient_wq, &net->ipv4.addr_chk_work, 0);
/* Send message first, then call notifier.
Notifier will trigger FIB update, so that
listeners of netlink will know about new ifaddr */
- rtmsg_ifa(RTM_NEWADDR, ifa, nlh, pid);
+ rtmsg_ifa(RTM_NEWADDR, ifa, nlh, portid);
blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa);
return 0;
@@ -389,85 +570,117 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
static int inet_insert_ifa(struct in_ifaddr *ifa)
{
- return __inet_insert_ifa(ifa, NULL, 0);
+ if (!ifa->ifa_local) {
+ inet_free_ifa(ifa);
+ return 0;
+ }
+
+ return __inet_insert_ifa(ifa, NULL, 0, NULL);
}
static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa)
{
- struct in_device *in_dev = __in_dev_get_rtnl(dev);
+ struct in_device *in_dev = __in_dev_get_rtnl_net(dev);
- ASSERT_RTNL();
+ ipv4_devconf_setall(in_dev);
+ neigh_parms_data_state_setall(in_dev->arp_parms);
- if (!in_dev) {
- in_dev = inetdev_init(dev);
- if (!in_dev) {
- inet_free_ifa(ifa);
- return -ENOBUFS;
- }
- }
- if (ifa->ifa_dev != in_dev) {
- BUG_TRAP(!ifa->ifa_dev);
- in_dev_hold(in_dev);
- ifa->ifa_dev = in_dev;
- }
- if (LOOPBACK(ifa->ifa_local))
+ if (ipv4_is_loopback(ifa->ifa_local))
ifa->ifa_scope = RT_SCOPE_HOST;
return inet_insert_ifa(ifa);
}
-struct in_device *inetdev_by_index(int ifindex)
+/* Caller must hold RCU or RTNL :
+ * We dont take a reference on found in_device
+ */
+struct in_device *inetdev_by_index(struct net *net, int ifindex)
{
struct net_device *dev;
struct in_device *in_dev = NULL;
- read_lock(&dev_base_lock);
- dev = __dev_get_by_index(ifindex);
+
+ rcu_read_lock();
+ dev = dev_get_by_index_rcu(net, ifindex);
if (dev)
- in_dev = in_dev_get(dev);
- read_unlock(&dev_base_lock);
+ in_dev = rcu_dereference_rtnl(dev->ip_ptr);
+ rcu_read_unlock();
return in_dev;
}
+EXPORT_SYMBOL(inetdev_by_index);
/* Called only from RTNL semaphored context. No locks. */
struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix,
__be32 mask)
{
+ struct in_ifaddr *ifa;
+
ASSERT_RTNL();
- for_primary_ifa(in_dev) {
+ in_dev_for_each_ifa_rtnl(ifa, in_dev) {
if (ifa->ifa_mask == mask && inet_ifa_match(prefix, ifa))
return ifa;
- } endfor_ifa(in_dev);
+ }
return NULL;
}
-static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+static int ip_mc_autojoin_config(struct net *net, bool join,
+ const struct in_ifaddr *ifa)
+{
+#if defined(CONFIG_IP_MULTICAST)
+ struct ip_mreqn mreq = {
+ .imr_multiaddr.s_addr = ifa->ifa_address,
+ .imr_ifindex = ifa->ifa_dev->dev->ifindex,
+ };
+ struct sock *sk = net->ipv4.mc_autojoin_sk;
+ int ret;
+
+ ASSERT_RTNL_NET(net);
+
+ lock_sock(sk);
+ if (join)
+ ret = ip_mc_join_group(sk, &mreq);
+ else
+ ret = ip_mc_leave_group(sk, &mreq);
+ release_sock(sk);
+
+ return ret;
+#else
+ return -EOPNOTSUPP;
+#endif
+}
+
+static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
{
+ struct net *net = sock_net(skb->sk);
+ struct in_ifaddr __rcu **ifap;
struct nlattr *tb[IFA_MAX+1];
struct in_device *in_dev;
struct ifaddrmsg *ifm;
- struct in_ifaddr *ifa, **ifap;
- int err = -EINVAL;
-
- ASSERT_RTNL();
+ struct in_ifaddr *ifa;
+ int err;
- err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv4_policy);
+ err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX,
+ ifa_ipv4_policy, extack);
if (err < 0)
- goto errout;
+ goto out;
ifm = nlmsg_data(nlh);
- in_dev = inetdev_by_index(ifm->ifa_index);
- if (in_dev == NULL) {
+
+ rtnl_net_lock(net);
+
+ in_dev = inetdev_by_index(net, ifm->ifa_index);
+ if (!in_dev) {
+ NL_SET_ERR_MSG(extack, "ipv4: Device not found");
err = -ENODEV;
- goto errout;
+ goto unlock;
}
- __in_dev_put(in_dev);
-
- for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL;
+ for (ifap = &in_dev->ifa_list;
+ (ifa = rtnl_net_dereference(net, *ifap)) != NULL;
ifap = &ifa->ifa_next) {
if (tb[IFA_LOCAL] &&
- ifa->ifa_local != nla_get_be32(tb[IFA_LOCAL]))
+ ifa->ifa_local != nla_get_in_addr(tb[IFA_LOCAL]))
continue;
if (tb[IFA_LABEL] && nla_strcmp(tb[IFA_LABEL], ifa->ifa_label))
@@ -475,162 +688,401 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg
if (tb[IFA_ADDRESS] &&
(ifm->ifa_prefixlen != ifa->ifa_prefixlen ||
- !inet_ifa_match(nla_get_be32(tb[IFA_ADDRESS]), ifa)))
+ !inet_ifa_match(nla_get_in_addr(tb[IFA_ADDRESS]), ifa)))
continue;
- __inet_del_ifa(in_dev, ifap, 1, nlh, NETLINK_CB(skb).pid);
- return 0;
+ if (ipv4_is_multicast(ifa->ifa_address))
+ ip_mc_autojoin_config(net, false, ifa);
+
+ __inet_del_ifa(in_dev, ifap, 1, nlh, NETLINK_CB(skb).portid);
+ goto unlock;
}
+ NL_SET_ERR_MSG(extack, "ipv4: Address not found");
err = -EADDRNOTAVAIL;
-errout:
+unlock:
+ rtnl_net_unlock(net);
+out:
return err;
}
-static struct in_ifaddr *rtm_to_ifaddr(struct nlmsghdr *nlh)
+static void check_lifetime(struct work_struct *work)
{
- struct nlattr *tb[IFA_MAX+1];
+ unsigned long now, next, next_sec, next_sched;
struct in_ifaddr *ifa;
- struct ifaddrmsg *ifm;
- struct net_device *dev;
- struct in_device *in_dev;
- int err = -EINVAL;
+ struct hlist_node *n;
+ struct net *net;
+ int i;
+
+ net = container_of(to_delayed_work(work), struct net, ipv4.addr_chk_work);
+ now = jiffies;
+ next = round_jiffies_up(now + ADDR_CHECK_FREQUENCY);
+
+ for (i = 0; i < IN4_ADDR_HSIZE; i++) {
+ struct hlist_head *head = &net->ipv4.inet_addr_lst[i];
+ bool change_needed = false;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(ifa, head, addr_lst) {
+ unsigned long age, tstamp;
+ u32 preferred_lft;
+ u32 valid_lft;
+ u32 flags;
+
+ flags = READ_ONCE(ifa->ifa_flags);
+ if (flags & IFA_F_PERMANENT)
+ continue;
+
+ preferred_lft = READ_ONCE(ifa->ifa_preferred_lft);
+ valid_lft = READ_ONCE(ifa->ifa_valid_lft);
+ tstamp = READ_ONCE(ifa->ifa_tstamp);
+ /* We try to batch several events at once. */
+ age = (now - tstamp +
+ ADDRCONF_TIMER_FUZZ_MINUS) / HZ;
+
+ if (valid_lft != INFINITY_LIFE_TIME &&
+ age >= valid_lft) {
+ change_needed = true;
+ } else if (preferred_lft ==
+ INFINITY_LIFE_TIME) {
+ continue;
+ } else if (age >= preferred_lft) {
+ if (time_before(tstamp + valid_lft * HZ, next))
+ next = tstamp + valid_lft * HZ;
+
+ if (!(flags & IFA_F_DEPRECATED))
+ change_needed = true;
+ } else if (time_before(tstamp + preferred_lft * HZ,
+ next)) {
+ next = tstamp + preferred_lft * HZ;
+ }
+ }
+ rcu_read_unlock();
+ if (!change_needed)
+ continue;
+
+ rtnl_net_lock(net);
+ hlist_for_each_entry_safe(ifa, n, head, addr_lst) {
+ unsigned long age;
+
+ if (ifa->ifa_flags & IFA_F_PERMANENT)
+ continue;
+
+ /* We try to batch several events at once. */
+ age = (now - ifa->ifa_tstamp +
+ ADDRCONF_TIMER_FUZZ_MINUS) / HZ;
+
+ if (ifa->ifa_valid_lft != INFINITY_LIFE_TIME &&
+ age >= ifa->ifa_valid_lft) {
+ struct in_ifaddr __rcu **ifap;
+ struct in_ifaddr *tmp;
+
+ ifap = &ifa->ifa_dev->ifa_list;
+ tmp = rtnl_net_dereference(net, *ifap);
+ while (tmp) {
+ if (tmp == ifa) {
+ inet_del_ifa(ifa->ifa_dev,
+ ifap, 1);
+ break;
+ }
+ ifap = &tmp->ifa_next;
+ tmp = rtnl_net_dereference(net, *ifap);
+ }
+ } else if (ifa->ifa_preferred_lft !=
+ INFINITY_LIFE_TIME &&
+ age >= ifa->ifa_preferred_lft &&
+ !(ifa->ifa_flags & IFA_F_DEPRECATED)) {
+ ifa->ifa_flags |= IFA_F_DEPRECATED;
+ rtmsg_ifa(RTM_NEWADDR, ifa, NULL, 0);
+ }
+ }
+ rtnl_net_unlock(net);
+ }
+
+ next_sec = round_jiffies_up(next);
+ next_sched = next;
+
+ /* If rounded timeout is accurate enough, accept it. */
+ if (time_before(next_sec, next + ADDRCONF_TIMER_FUZZ))
+ next_sched = next_sec;
+
+ now = jiffies;
+ /* And minimum interval is ADDRCONF_TIMER_FUZZ_MAX. */
+ if (time_before(next_sched, now + ADDRCONF_TIMER_FUZZ_MAX))
+ next_sched = now + ADDRCONF_TIMER_FUZZ_MAX;
+
+ queue_delayed_work(system_power_efficient_wq, &net->ipv4.addr_chk_work,
+ next_sched - now);
+}
- err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv4_policy);
+static void set_ifa_lifetime(struct in_ifaddr *ifa, __u32 valid_lft,
+ __u32 prefered_lft)
+{
+ unsigned long timeout;
+ u32 flags;
+
+ flags = ifa->ifa_flags & ~(IFA_F_PERMANENT | IFA_F_DEPRECATED);
+
+ timeout = addrconf_timeout_fixup(valid_lft, HZ);
+ if (addrconf_finite_timeout(timeout))
+ WRITE_ONCE(ifa->ifa_valid_lft, timeout);
+ else
+ flags |= IFA_F_PERMANENT;
+
+ timeout = addrconf_timeout_fixup(prefered_lft, HZ);
+ if (addrconf_finite_timeout(timeout)) {
+ if (timeout == 0)
+ flags |= IFA_F_DEPRECATED;
+ WRITE_ONCE(ifa->ifa_preferred_lft, timeout);
+ }
+ WRITE_ONCE(ifa->ifa_flags, flags);
+ WRITE_ONCE(ifa->ifa_tstamp, jiffies);
+ if (!ifa->ifa_cstamp)
+ WRITE_ONCE(ifa->ifa_cstamp, ifa->ifa_tstamp);
+}
+
+static int inet_validate_rtm(struct nlmsghdr *nlh, struct nlattr **tb,
+ struct netlink_ext_ack *extack,
+ __u32 *valid_lft, __u32 *prefered_lft)
+{
+ struct ifaddrmsg *ifm = nlmsg_data(nlh);
+ int err;
+
+ err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX,
+ ifa_ipv4_policy, extack);
if (err < 0)
- goto errout;
+ return err;
- ifm = nlmsg_data(nlh);
- if (ifm->ifa_prefixlen > 32 || tb[IFA_LOCAL] == NULL)
- goto errout;
+ if (ifm->ifa_prefixlen > 32) {
+ NL_SET_ERR_MSG(extack, "ipv4: Invalid prefix length");
+ return -EINVAL;
+ }
- dev = __dev_get_by_index(ifm->ifa_index);
- if (dev == NULL) {
- err = -ENODEV;
- goto errout;
+ if (!tb[IFA_LOCAL]) {
+ NL_SET_ERR_MSG(extack, "ipv4: Local address is not supplied");
+ return -EINVAL;
}
- in_dev = __in_dev_get_rtnl(dev);
- if (in_dev == NULL) {
- in_dev = inetdev_init(dev);
- if (in_dev == NULL) {
- err = -ENOBUFS;
- goto errout;
+ if (tb[IFA_CACHEINFO]) {
+ struct ifa_cacheinfo *ci;
+
+ ci = nla_data(tb[IFA_CACHEINFO]);
+ if (!ci->ifa_valid || ci->ifa_prefered > ci->ifa_valid) {
+ NL_SET_ERR_MSG(extack, "ipv4: address lifetime invalid");
+ return -EINVAL;
}
+
+ *valid_lft = ci->ifa_valid;
+ *prefered_lft = ci->ifa_prefered;
}
- ifa = inet_alloc_ifa();
- if (ifa == NULL) {
+ return 0;
+}
+
+static struct in_ifaddr *inet_rtm_to_ifa(struct net *net, struct nlmsghdr *nlh,
+ struct nlattr **tb,
+ struct netlink_ext_ack *extack)
+{
+ struct ifaddrmsg *ifm = nlmsg_data(nlh);
+ struct in_device *in_dev;
+ struct net_device *dev;
+ struct in_ifaddr *ifa;
+ int err;
+
+ dev = __dev_get_by_index(net, ifm->ifa_index);
+ err = -ENODEV;
+ if (!dev) {
+ NL_SET_ERR_MSG(extack, "ipv4: Device not found");
+ goto errout;
+ }
+
+ in_dev = __in_dev_get_rtnl_net(dev);
+ err = -ENOBUFS;
+ if (!in_dev)
+ goto errout;
+
+ ifa = inet_alloc_ifa(in_dev);
+ if (!ifa)
/*
* A potential indev allocation can be left alive, it stays
* assigned to its device and is destroy with it.
*/
- err = -ENOBUFS;
goto errout;
- }
- in_dev_hold(in_dev);
+ ipv4_devconf_setall(in_dev);
+ neigh_parms_data_state_setall(in_dev->arp_parms);
- if (tb[IFA_ADDRESS] == NULL)
+ if (!tb[IFA_ADDRESS])
tb[IFA_ADDRESS] = tb[IFA_LOCAL];
ifa->ifa_prefixlen = ifm->ifa_prefixlen;
ifa->ifa_mask = inet_make_mask(ifm->ifa_prefixlen);
- ifa->ifa_flags = ifm->ifa_flags;
+ ifa->ifa_flags = nla_get_u32_default(tb[IFA_FLAGS], ifm->ifa_flags);
ifa->ifa_scope = ifm->ifa_scope;
- ifa->ifa_dev = in_dev;
-
- ifa->ifa_local = nla_get_be32(tb[IFA_LOCAL]);
- ifa->ifa_address = nla_get_be32(tb[IFA_ADDRESS]);
+ ifa->ifa_local = nla_get_in_addr(tb[IFA_LOCAL]);
+ ifa->ifa_address = nla_get_in_addr(tb[IFA_ADDRESS]);
if (tb[IFA_BROADCAST])
- ifa->ifa_broadcast = nla_get_be32(tb[IFA_BROADCAST]);
-
- if (tb[IFA_ANYCAST])
- ifa->ifa_anycast = nla_get_be32(tb[IFA_ANYCAST]);
+ ifa->ifa_broadcast = nla_get_in_addr(tb[IFA_BROADCAST]);
if (tb[IFA_LABEL])
- nla_strlcpy(ifa->ifa_label, tb[IFA_LABEL], IFNAMSIZ);
+ nla_strscpy(ifa->ifa_label, tb[IFA_LABEL], IFNAMSIZ);
else
memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
+ if (tb[IFA_RT_PRIORITY])
+ ifa->ifa_rt_priority = nla_get_u32(tb[IFA_RT_PRIORITY]);
+
+ if (tb[IFA_PROTO])
+ ifa->ifa_proto = nla_get_u8(tb[IFA_PROTO]);
+
return ifa;
errout:
return ERR_PTR(err);
}
-static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+static struct in_ifaddr *find_matching_ifa(struct net *net, struct in_ifaddr *ifa)
+{
+ struct in_device *in_dev = ifa->ifa_dev;
+ struct in_ifaddr *ifa1;
+
+ in_dev_for_each_ifa_rtnl_net(net, ifa1, in_dev) {
+ if (ifa1->ifa_mask == ifa->ifa_mask &&
+ inet_ifa_match(ifa1->ifa_address, ifa) &&
+ ifa1->ifa_local == ifa->ifa_local)
+ return ifa1;
+ }
+
+ return NULL;
+}
+
+static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
{
+ __u32 prefered_lft = INFINITY_LIFE_TIME;
+ __u32 valid_lft = INFINITY_LIFE_TIME;
+ struct net *net = sock_net(skb->sk);
+ struct in_ifaddr *ifa_existing;
+ struct nlattr *tb[IFA_MAX + 1];
struct in_ifaddr *ifa;
+ int ret;
- ASSERT_RTNL();
+ ret = inet_validate_rtm(nlh, tb, extack, &valid_lft, &prefered_lft);
+ if (ret < 0)
+ return ret;
- ifa = rtm_to_ifaddr(nlh);
- if (IS_ERR(ifa))
- return PTR_ERR(ifa);
+ if (!nla_get_in_addr(tb[IFA_LOCAL]))
+ return 0;
+
+ rtnl_net_lock(net);
+
+ ifa = inet_rtm_to_ifa(net, nlh, tb, extack);
+ if (IS_ERR(ifa)) {
+ ret = PTR_ERR(ifa);
+ goto unlock;
+ }
+
+ ifa_existing = find_matching_ifa(net, ifa);
+ if (!ifa_existing) {
+ /* It would be best to check for !NLM_F_CREATE here but
+ * userspace already relies on not having to provide this.
+ */
+ set_ifa_lifetime(ifa, valid_lft, prefered_lft);
+ if (ifa->ifa_flags & IFA_F_MCAUTOJOIN) {
+ ret = ip_mc_autojoin_config(net, true, ifa);
+ if (ret < 0) {
+ NL_SET_ERR_MSG(extack, "ipv4: Multicast auto join failed");
+ inet_free_ifa(ifa);
+ goto unlock;
+ }
+ }
+
+ ret = __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid, extack);
+ } else {
+ u32 new_metric = ifa->ifa_rt_priority;
+ u8 new_proto = ifa->ifa_proto;
- return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).pid);
+ inet_free_ifa(ifa);
+
+ if (nlh->nlmsg_flags & NLM_F_EXCL ||
+ !(nlh->nlmsg_flags & NLM_F_REPLACE)) {
+ NL_SET_ERR_MSG(extack, "ipv4: Address already assigned");
+ ret = -EEXIST;
+ goto unlock;
+ }
+ ifa = ifa_existing;
+
+ if (ifa->ifa_rt_priority != new_metric) {
+ fib_modify_prefix_metric(ifa, new_metric);
+ ifa->ifa_rt_priority = new_metric;
+ }
+
+ ifa->ifa_proto = new_proto;
+
+ set_ifa_lifetime(ifa, valid_lft, prefered_lft);
+ cancel_delayed_work(&net->ipv4.addr_chk_work);
+ queue_delayed_work(system_power_efficient_wq,
+ &net->ipv4.addr_chk_work, 0);
+ rtmsg_ifa(RTM_NEWADDR, ifa, nlh, NETLINK_CB(skb).portid);
+ }
+
+unlock:
+ rtnl_net_unlock(net);
+
+ return ret;
}
/*
* Determine a default network mask, based on the IP address.
*/
-static __inline__ int inet_abc_len(__be32 addr)
+static int inet_abc_len(__be32 addr)
{
int rc = -1; /* Something else, probably a multicast. */
- if (ZERONET(addr))
- rc = 0;
+ if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr))
+ rc = 0;
else {
__u32 haddr = ntohl(addr);
-
if (IN_CLASSA(haddr))
rc = 8;
else if (IN_CLASSB(haddr))
rc = 16;
else if (IN_CLASSC(haddr))
rc = 24;
+ else if (IN_CLASSE(haddr))
+ rc = 32;
}
- return rc;
+ return rc;
}
-int devinet_ioctl(unsigned int cmd, void __user *arg)
+int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr)
{
- struct ifreq ifr;
struct sockaddr_in sin_orig;
- struct sockaddr_in *sin = (struct sockaddr_in *)&ifr.ifr_addr;
+ struct sockaddr_in *sin = (struct sockaddr_in *)&ifr->ifr_addr;
+ struct in_ifaddr __rcu **ifap = NULL;
struct in_device *in_dev;
- struct in_ifaddr **ifap = NULL;
struct in_ifaddr *ifa = NULL;
struct net_device *dev;
char *colon;
int ret = -EFAULT;
int tryaddrmatch = 0;
- /*
- * Fetch the caller's info block into kernel space
- */
-
- if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
- goto out;
- ifr.ifr_name[IFNAMSIZ - 1] = 0;
+ ifr->ifr_name[IFNAMSIZ - 1] = 0;
/* save original address for comparison */
memcpy(&sin_orig, sin, sizeof(*sin));
- colon = strchr(ifr.ifr_name, ':');
+ colon = strchr(ifr->ifr_name, ':');
if (colon)
*colon = 0;
-#ifdef CONFIG_KMOD
- dev_load(ifr.ifr_name);
-#endif
+ dev_load(net, ifr->ifr_name);
- switch(cmd) {
+ switch (cmd) {
case SIOCGIFADDR: /* Get interface address */
case SIOCGIFBRDADDR: /* Get the broadcast address */
case SIOCGIFDSTADDR: /* Get the destination address */
@@ -645,16 +1097,16 @@ int devinet_ioctl(unsigned int cmd, void __user *arg)
break;
case SIOCSIFFLAGS:
- ret = -EACCES;
- if (!capable(CAP_NET_ADMIN))
+ ret = -EPERM;
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
goto out;
break;
case SIOCSIFADDR: /* Set interface address (and family) */
case SIOCSIFBRDADDR: /* Set the broadcast address */
case SIOCSIFDSTADDR: /* Set the destination address */
case SIOCSIFNETMASK: /* Set the netmask for the interface */
- ret = -EACCES;
- if (!capable(CAP_NET_ADMIN))
+ ret = -EPERM;
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
goto out;
ret = -EINVAL;
if (sin->sin_family != AF_INET)
@@ -665,27 +1117,31 @@ int devinet_ioctl(unsigned int cmd, void __user *arg)
goto out;
}
- rtnl_lock();
+ rtnl_net_lock(net);
ret = -ENODEV;
- if ((dev = __dev_get_by_name(ifr.ifr_name)) == NULL)
+ dev = __dev_get_by_name(net, ifr->ifr_name);
+ if (!dev)
goto done;
if (colon)
*colon = ':';
- if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) {
+ in_dev = __in_dev_get_rtnl_net(dev);
+ if (in_dev) {
if (tryaddrmatch) {
/* Matthias Andree */
/* compare label and address (4.4BSD style) */
/* note: we only do this for a limited set of ioctls
and only if the original address family was AF_INET.
This is checked above. */
- for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL;
+
+ for (ifap = &in_dev->ifa_list;
+ (ifa = rtnl_net_dereference(net, *ifap)) != NULL;
ifap = &ifa->ifa_next) {
- if (!strcmp(ifr.ifr_name, ifa->ifa_label) &&
+ if (!strcmp(ifr->ifr_name, ifa->ifa_label) &&
sin_orig.sin_addr.s_addr ==
- ifa->ifa_address) {
+ ifa->ifa_local) {
break; /* found */
}
}
@@ -694,9 +1150,10 @@ int devinet_ioctl(unsigned int cmd, void __user *arg)
4.3BSD-style and passed in junk so we fall back to
comparing just the label */
if (!ifa) {
- for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL;
+ for (ifap = &in_dev->ifa_list;
+ (ifa = rtnl_net_dereference(net, *ifap)) != NULL;
ifap = &ifa->ifa_next)
- if (!strcmp(ifr.ifr_name, ifa->ifa_label))
+ if (!strcmp(ifr->ifr_name, ifa->ifa_label))
break;
}
}
@@ -705,22 +1162,26 @@ int devinet_ioctl(unsigned int cmd, void __user *arg)
if (!ifa && cmd != SIOCSIFADDR && cmd != SIOCSIFFLAGS)
goto done;
- switch(cmd) {
+ switch (cmd) {
case SIOCGIFADDR: /* Get interface address */
+ ret = 0;
sin->sin_addr.s_addr = ifa->ifa_local;
- goto rarok;
+ break;
case SIOCGIFBRDADDR: /* Get the broadcast address */
+ ret = 0;
sin->sin_addr.s_addr = ifa->ifa_broadcast;
- goto rarok;
+ break;
case SIOCGIFDSTADDR: /* Get the destination address */
+ ret = 0;
sin->sin_addr.s_addr = ifa->ifa_address;
- goto rarok;
+ break;
case SIOCGIFNETMASK: /* Get the netmask for the interface */
+ ret = 0;
sin->sin_addr.s_addr = ifa->ifa_mask;
- goto rarok;
+ break;
case SIOCSIFFLAGS:
if (colon) {
@@ -728,11 +1189,14 @@ int devinet_ioctl(unsigned int cmd, void __user *arg)
if (!ifa)
break;
ret = 0;
- if (!(ifr.ifr_flags & IFF_UP))
+ if (!(ifr->ifr_flags & IFF_UP))
inet_del_ifa(in_dev, ifap, 1);
break;
}
- ret = dev_change_flags(dev, ifr.ifr_flags);
+
+ /* NETDEV_UP/DOWN/CHANGE could touch a peer dev */
+ ASSERT_RTNL();
+ ret = dev_change_flags(dev, ifr->ifr_flags, NULL);
break;
case SIOCSIFADDR: /* Set interface address (and family) */
@@ -742,10 +1206,14 @@ int devinet_ioctl(unsigned int cmd, void __user *arg)
if (!ifa) {
ret = -ENOBUFS;
- if ((ifa = inet_alloc_ifa()) == NULL)
+ if (!in_dev)
+ break;
+ ifa = inet_alloc_ifa(in_dev);
+ if (!ifa)
break;
+
if (colon)
- memcpy(ifa->ifa_label, ifr.ifr_name, IFNAMSIZ);
+ memcpy(ifa->ifa_label, ifr->ifr_name, IFNAMSIZ);
else
memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
} else {
@@ -754,7 +1222,7 @@ int devinet_ioctl(unsigned int cmd, void __user *arg)
break;
inet_del_ifa(in_dev, ifap, 0);
ifa->ifa_broadcast = 0;
- ifa->ifa_anycast = 0;
+ ifa->ifa_scope = 0;
}
ifa->ifa_address = ifa->ifa_local = sin->sin_addr.s_addr;
@@ -770,6 +1238,7 @@ int devinet_ioctl(unsigned int cmd, void __user *arg)
ifa->ifa_prefixlen = 32;
ifa->ifa_mask = inet_make_mask(32);
}
+ set_ifa_lifetime(ifa, INFINITY_LIFE_TIME, INFINITY_LIFE_TIME);
ret = inet_set_ifa(dev, ifa);
break;
@@ -828,66 +1297,87 @@ int devinet_ioctl(unsigned int cmd, void __user *arg)
break;
}
done:
- rtnl_unlock();
+ rtnl_net_unlock(net);
out:
return ret;
-rarok:
- rtnl_unlock();
- ret = copy_to_user(arg, &ifr, sizeof(struct ifreq)) ? -EFAULT : 0;
- goto out;
}
-static int inet_gifconf(struct net_device *dev, char __user *buf, int len)
+int inet_gifconf(struct net_device *dev, char __user *buf, int len, int size)
{
- struct in_device *in_dev = __in_dev_get_rtnl(dev);
- struct in_ifaddr *ifa;
+ struct in_device *in_dev = __in_dev_get_rtnl_net(dev);
+ const struct in_ifaddr *ifa;
struct ifreq ifr;
int done = 0;
- if (!in_dev || (ifa = in_dev->ifa_list) == NULL)
+ if (WARN_ON(size > sizeof(struct ifreq)))
+ goto out;
+
+ if (!in_dev)
goto out;
- for (; ifa; ifa = ifa->ifa_next) {
+ in_dev_for_each_ifa_rtnl_net(dev_net(dev), ifa, in_dev) {
if (!buf) {
- done += sizeof(ifr);
+ done += size;
continue;
}
- if (len < (int) sizeof(ifr))
+ if (len < size)
break;
memset(&ifr, 0, sizeof(struct ifreq));
- if (ifa->ifa_label)
- strcpy(ifr.ifr_name, ifa->ifa_label);
- else
- strcpy(ifr.ifr_name, dev->name);
+ strcpy(ifr.ifr_name, ifa->ifa_label);
(*(struct sockaddr_in *)&ifr.ifr_addr).sin_family = AF_INET;
(*(struct sockaddr_in *)&ifr.ifr_addr).sin_addr.s_addr =
ifa->ifa_local;
- if (copy_to_user(buf, &ifr, sizeof(struct ifreq))) {
+ if (copy_to_user(buf + done, &ifr, size)) {
done = -EFAULT;
break;
}
- buf += sizeof(struct ifreq);
- len -= sizeof(struct ifreq);
- done += sizeof(struct ifreq);
+ len -= size;
+ done += size;
}
out:
return done;
}
+static __be32 in_dev_select_addr(const struct in_device *in_dev,
+ int scope)
+{
+ const struct in_ifaddr *ifa;
+
+ in_dev_for_each_ifa_rcu(ifa, in_dev) {
+ if (READ_ONCE(ifa->ifa_flags) & IFA_F_SECONDARY)
+ continue;
+ if (ifa->ifa_scope != RT_SCOPE_LINK &&
+ ifa->ifa_scope <= scope)
+ return ifa->ifa_local;
+ }
+
+ return 0;
+}
+
__be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope)
{
+ const struct in_ifaddr *ifa;
__be32 addr = 0;
+ unsigned char localnet_scope = RT_SCOPE_HOST;
struct in_device *in_dev;
+ struct net *net;
+ int master_idx;
rcu_read_lock();
+ net = dev_net_rcu(dev);
in_dev = __in_dev_get_rcu(dev);
if (!in_dev)
goto no_in_dev;
- for_primary_ifa(in_dev) {
- if (ifa->ifa_scope > scope)
+ if (unlikely(IN_DEV_ROUTE_LOCALNET(in_dev)))
+ localnet_scope = RT_SCOPE_LINK;
+
+ in_dev_for_each_ifa_rcu(ifa, in_dev) {
+ if (READ_ONCE(ifa->ifa_flags) & IFA_F_SECONDARY)
+ continue;
+ if (min(ifa->ifa_scope, localnet_scope) > scope)
continue;
if (!dst || inet_ifa_match(dst, ifa)) {
addr = ifa->ifa_local;
@@ -895,48 +1385,65 @@ __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope)
}
if (!addr)
addr = ifa->ifa_local;
- } endfor_ifa(in_dev);
-no_in_dev:
- rcu_read_unlock();
+ }
if (addr)
- goto out;
+ goto out_unlock;
+no_in_dev:
+ master_idx = l3mdev_master_ifindex_rcu(dev);
+
+ /* For VRFs, the VRF device takes the place of the loopback device,
+ * with addresses on it being preferred. Note in such cases the
+ * loopback device will be among the devices that fail the master_idx
+ * equality check in the loop below.
+ */
+ if (master_idx &&
+ (dev = dev_get_by_index_rcu(net, master_idx)) &&
+ (in_dev = __in_dev_get_rcu(dev))) {
+ addr = in_dev_select_addr(in_dev, scope);
+ if (addr)
+ goto out_unlock;
+ }
/* Not loopback addresses on loopback should be preferred
- in this case. It is importnat that lo is the first interface
+ in this case. It is important that lo is the first interface
in dev_base list.
*/
- read_lock(&dev_base_lock);
- rcu_read_lock();
- for (dev = dev_base; dev; dev = dev->next) {
- if ((in_dev = __in_dev_get_rcu(dev)) == NULL)
+ for_each_netdev_rcu(net, dev) {
+ if (l3mdev_master_ifindex_rcu(dev) != master_idx)
continue;
- for_primary_ifa(in_dev) {
- if (ifa->ifa_scope != RT_SCOPE_LINK &&
- ifa->ifa_scope <= scope) {
- addr = ifa->ifa_local;
- goto out_unlock_both;
- }
- } endfor_ifa(in_dev);
+ in_dev = __in_dev_get_rcu(dev);
+ if (!in_dev)
+ continue;
+
+ addr = in_dev_select_addr(in_dev, scope);
+ if (addr)
+ goto out_unlock;
}
-out_unlock_both:
- read_unlock(&dev_base_lock);
+out_unlock:
rcu_read_unlock();
-out:
return addr;
}
+EXPORT_SYMBOL(inet_select_addr);
static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst,
__be32 local, int scope)
{
- int same = 0;
+ unsigned char localnet_scope = RT_SCOPE_HOST;
+ const struct in_ifaddr *ifa;
__be32 addr = 0;
+ int same = 0;
+
+ if (unlikely(IN_DEV_ROUTE_LOCALNET(in_dev)))
+ localnet_scope = RT_SCOPE_LINK;
+
+ in_dev_for_each_ifa_rcu(ifa, in_dev) {
+ unsigned char min_scope = min(ifa->ifa_scope, localnet_scope);
- for_ifa(in_dev) {
if (!addr &&
(local == ifa->ifa_local || !local) &&
- ifa->ifa_scope <= scope) {
+ min_scope <= scope) {
addr = ifa->ifa_local;
if (same)
break;
@@ -951,7 +1458,7 @@ static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst,
if (inet_ifa_match(addr, ifa))
break;
/* No, then can we use new local src? */
- if (ifa->ifa_scope <= scope) {
+ if (min_scope <= scope) {
addr = ifa->ifa_local;
break;
}
@@ -959,46 +1466,42 @@ static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst,
same = 0;
}
}
- } endfor_ifa(in_dev);
+ }
- return same? addr : 0;
+ return same ? addr : 0;
}
/*
* Confirm that local IP address exists using wildcards:
- * - dev: only on this interface, 0=any interface
+ * - net: netns to check, cannot be NULL
+ * - in_dev: only on this interface, NULL=any interface
* - dst: only in the same subnet as dst, 0=any dst
* - local: address, 0=autoselect the local address
* - scope: maximum allowed scope value for the local address
*/
-__be32 inet_confirm_addr(const struct net_device *dev, __be32 dst, __be32 local, int scope)
+__be32 inet_confirm_addr(struct net *net, struct in_device *in_dev,
+ __be32 dst, __be32 local, int scope)
{
__be32 addr = 0;
- struct in_device *in_dev;
-
- if (dev) {
- rcu_read_lock();
- if ((in_dev = __in_dev_get_rcu(dev)))
- addr = confirm_addr_indev(in_dev, dst, local, scope);
- rcu_read_unlock();
+ struct net_device *dev;
- return addr;
- }
+ if (in_dev)
+ return confirm_addr_indev(in_dev, dst, local, scope);
- read_lock(&dev_base_lock);
rcu_read_lock();
- for (dev = dev_base; dev; dev = dev->next) {
- if ((in_dev = __in_dev_get_rcu(dev))) {
+ for_each_netdev_rcu(net, dev) {
+ in_dev = __in_dev_get_rcu(dev);
+ if (in_dev) {
addr = confirm_addr_indev(in_dev, dst, local, scope);
if (addr)
break;
}
}
rcu_read_unlock();
- read_unlock(&dev_base_lock);
return addr;
}
+EXPORT_SYMBOL(inet_confirm_addr);
/*
* Device notifier
@@ -1008,92 +1511,146 @@ int register_inetaddr_notifier(struct notifier_block *nb)
{
return blocking_notifier_chain_register(&inetaddr_chain, nb);
}
+EXPORT_SYMBOL(register_inetaddr_notifier);
int unregister_inetaddr_notifier(struct notifier_block *nb)
{
return blocking_notifier_chain_unregister(&inetaddr_chain, nb);
}
+EXPORT_SYMBOL(unregister_inetaddr_notifier);
+
+int register_inetaddr_validator_notifier(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_register(&inetaddr_validator_chain, nb);
+}
+EXPORT_SYMBOL(register_inetaddr_validator_notifier);
-/* Rename ifa_labels for a device name change. Make some effort to preserve existing
- * alias numbering and to create unique labels if possible.
+int unregister_inetaddr_validator_notifier(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_unregister(&inetaddr_validator_chain,
+ nb);
+}
+EXPORT_SYMBOL(unregister_inetaddr_validator_notifier);
+
+/* Rename ifa_labels for a device name change. Make some effort to preserve
+ * existing alias numbering and to create unique labels if possible.
*/
static void inetdev_changename(struct net_device *dev, struct in_device *in_dev)
-{
+{
struct in_ifaddr *ifa;
int named = 0;
- for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
- char old[IFNAMSIZ], *dot;
+ in_dev_for_each_ifa_rtnl(ifa, in_dev) {
+ char old[IFNAMSIZ], *dot;
memcpy(old, ifa->ifa_label, IFNAMSIZ);
- memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
+ memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
if (named++ == 0)
- continue;
- dot = strchr(ifa->ifa_label, ':');
- if (dot == NULL) {
- sprintf(old, ":%d", named);
+ goto skip;
+ dot = strchr(old, ':');
+ if (!dot) {
+ sprintf(old, ":%d", named);
dot = old;
}
- if (strlen(dot) + strlen(dev->name) < IFNAMSIZ) {
- strcat(ifa->ifa_label, dot);
- } else {
- strcpy(ifa->ifa_label + (IFNAMSIZ - strlen(dot) - 1), dot);
- }
- }
-}
+ if (strlen(dot) + strlen(dev->name) < IFNAMSIZ)
+ strcat(ifa->ifa_label, dot);
+ else
+ strcpy(ifa->ifa_label + (IFNAMSIZ - strlen(dot) - 1), dot);
+skip:
+ rtmsg_ifa(RTM_NEWADDR, ifa, NULL, 0);
+ }
+}
+
+static void inetdev_send_gratuitous_arp(struct net_device *dev,
+ struct in_device *in_dev)
+
+{
+ const struct in_ifaddr *ifa;
+
+ in_dev_for_each_ifa_rtnl(ifa, in_dev) {
+ arp_send(ARPOP_REQUEST, ETH_P_ARP,
+ ifa->ifa_local, dev,
+ ifa->ifa_local, NULL,
+ dev->dev_addr, NULL);
+ }
+}
/* Called only under RTNL semaphore */
static int inetdev_event(struct notifier_block *this, unsigned long event,
void *ptr)
{
- struct net_device *dev = ptr;
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
struct in_device *in_dev = __in_dev_get_rtnl(dev);
ASSERT_RTNL();
if (!in_dev) {
- if (event == NETDEV_REGISTER && dev == &loopback_dev) {
+ if (event == NETDEV_REGISTER) {
in_dev = inetdev_init(dev);
- if (!in_dev)
- panic("devinet: Failed to create loopback\n");
- in_dev->cnf.no_xfrm = 1;
- in_dev->cnf.no_policy = 1;
+ if (IS_ERR(in_dev))
+ return notifier_from_errno(PTR_ERR(in_dev));
+ if (dev->flags & IFF_LOOPBACK) {
+ IN_DEV_CONF_SET(in_dev, NOXFRM, 1);
+ IN_DEV_CONF_SET(in_dev, NOPOLICY, 1);
+ }
+ } else if (event == NETDEV_CHANGEMTU) {
+ /* Re-enabling IP */
+ if (inetdev_valid_mtu(dev->mtu))
+ in_dev = inetdev_init(dev);
}
goto out;
}
switch (event) {
case NETDEV_REGISTER:
- printk(KERN_DEBUG "inetdev_event: bug\n");
- dev->ip_ptr = NULL;
+ pr_debug("%s: bug\n", __func__);
+ RCU_INIT_POINTER(dev->ip_ptr, NULL);
break;
case NETDEV_UP:
- if (dev->mtu < 68)
+ if (!inetdev_valid_mtu(dev->mtu))
break;
- if (dev == &loopback_dev) {
- struct in_ifaddr *ifa;
- if ((ifa = inet_alloc_ifa()) != NULL) {
+ if (dev->flags & IFF_LOOPBACK) {
+ struct in_ifaddr *ifa = inet_alloc_ifa(in_dev);
+
+ if (ifa) {
ifa->ifa_local =
ifa->ifa_address = htonl(INADDR_LOOPBACK);
ifa->ifa_prefixlen = 8;
ifa->ifa_mask = inet_make_mask(8);
- in_dev_hold(in_dev);
- ifa->ifa_dev = in_dev;
ifa->ifa_scope = RT_SCOPE_HOST;
memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
+ set_ifa_lifetime(ifa, INFINITY_LIFE_TIME,
+ INFINITY_LIFE_TIME);
+ ipv4_devconf_setall(in_dev);
+ neigh_parms_data_state_setall(in_dev->arp_parms);
inet_insert_ifa(ifa);
}
}
ip_mc_up(in_dev);
+ fallthrough;
+ case NETDEV_CHANGEADDR:
+ if (!IN_DEV_ARP_NOTIFY(in_dev))
+ break;
+ fallthrough;
+ case NETDEV_NOTIFY_PEERS:
+ /* Send gratuitous ARP to notify of link change */
+ inetdev_send_gratuitous_arp(dev, in_dev);
break;
case NETDEV_DOWN:
ip_mc_down(in_dev);
break;
+ case NETDEV_PRE_TYPE_CHANGE:
+ ip_mc_unmap(in_dev);
+ break;
+ case NETDEV_POST_TYPE_CHANGE:
+ ip_mc_remap(in_dev);
+ break;
case NETDEV_CHANGEMTU:
- if (dev->mtu >= 68)
+ if (inetdev_valid_mtu(dev->mtu))
break;
- /* MTU falled under 68, disable IP */
+ /* disable IP when MTU is not enough */
+ fallthrough;
case NETDEV_UNREGISTER:
inetdev_destroy(in_dev);
break;
@@ -1103,13 +1660,8 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
*/
inetdev_changename(dev, in_dev);
-#ifdef CONFIG_SYSCTL
- devinet_sysctl_unregister(&in_dev->cnf);
- neigh_sysctl_unregister(in_dev->arp_parms);
- neigh_sysctl_register(dev, in_dev->arp_parms, NET_IPV4,
- NET_IPV4_NEIGH, "ipv4", NULL, NULL);
- devinet_sysctl_register(in_dev, &in_dev->cnf);
-#endif
+ devinet_sysctl_unregister(in_dev);
+ devinet_sysctl_register(in_dev);
break;
}
out:
@@ -1117,532 +1669,1240 @@ out:
}
static struct notifier_block ip_netdev_notifier = {
- .notifier_call =inetdev_event,
+ .notifier_call = inetdev_event,
};
-static inline size_t inet_nlmsg_size(void)
+static size_t inet_nlmsg_size(void)
{
return NLMSG_ALIGN(sizeof(struct ifaddrmsg))
+ nla_total_size(4) /* IFA_ADDRESS */
+ nla_total_size(4) /* IFA_LOCAL */
+ nla_total_size(4) /* IFA_BROADCAST */
- + nla_total_size(4) /* IFA_ANYCAST */
- + nla_total_size(IFNAMSIZ); /* IFA_LABEL */
+ + nla_total_size(IFNAMSIZ) /* IFA_LABEL */
+ + nla_total_size(4) /* IFA_FLAGS */
+ + nla_total_size(1) /* IFA_PROTO */
+ + nla_total_size(4) /* IFA_RT_PRIORITY */
+ + nla_total_size(sizeof(struct ifa_cacheinfo)); /* IFA_CACHEINFO */
+}
+
+static inline u32 cstamp_delta(unsigned long cstamp)
+{
+ return (cstamp - INITIAL_JIFFIES) * 100UL / HZ;
+}
+
+static int put_cacheinfo(struct sk_buff *skb, unsigned long cstamp,
+ unsigned long tstamp, u32 preferred, u32 valid)
+{
+ struct ifa_cacheinfo ci;
+
+ ci.cstamp = cstamp_delta(cstamp);
+ ci.tstamp = cstamp_delta(tstamp);
+ ci.ifa_prefered = preferred;
+ ci.ifa_valid = valid;
+
+ return nla_put(skb, IFA_CACHEINFO, sizeof(ci), &ci);
}
-static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
- u32 pid, u32 seq, int event, unsigned int flags)
+static int inet_fill_ifaddr(struct sk_buff *skb, const struct in_ifaddr *ifa,
+ struct inet_fill_args *args)
{
struct ifaddrmsg *ifm;
struct nlmsghdr *nlh;
+ unsigned long tstamp;
+ u32 preferred, valid;
+ u32 flags;
- nlh = nlmsg_put(skb, pid, seq, event, sizeof(*ifm), flags);
- if (nlh == NULL)
- return -ENOBUFS;
+ nlh = nlmsg_put(skb, args->portid, args->seq, args->event, sizeof(*ifm),
+ args->flags);
+ if (!nlh)
+ return -EMSGSIZE;
ifm = nlmsg_data(nlh);
ifm->ifa_family = AF_INET;
ifm->ifa_prefixlen = ifa->ifa_prefixlen;
- ifm->ifa_flags = ifa->ifa_flags|IFA_F_PERMANENT;
+
+ flags = READ_ONCE(ifa->ifa_flags);
+ /* Warning : ifm->ifa_flags is an __u8, it holds only 8 bits.
+ * The 32bit value is given in IFA_FLAGS attribute.
+ */
+ ifm->ifa_flags = (__u8)flags;
+
ifm->ifa_scope = ifa->ifa_scope;
ifm->ifa_index = ifa->ifa_dev->dev->ifindex;
- if (ifa->ifa_address)
- NLA_PUT_BE32(skb, IFA_ADDRESS, ifa->ifa_address);
+ if (args->netnsid >= 0 &&
+ nla_put_s32(skb, IFA_TARGET_NETNSID, args->netnsid))
+ goto nla_put_failure;
- if (ifa->ifa_local)
- NLA_PUT_BE32(skb, IFA_LOCAL, ifa->ifa_local);
+ tstamp = READ_ONCE(ifa->ifa_tstamp);
+ if (!(flags & IFA_F_PERMANENT)) {
+ preferred = READ_ONCE(ifa->ifa_preferred_lft);
+ valid = READ_ONCE(ifa->ifa_valid_lft);
+ if (preferred != INFINITY_LIFE_TIME) {
+ long tval = (jiffies - tstamp) / HZ;
- if (ifa->ifa_broadcast)
- NLA_PUT_BE32(skb, IFA_BROADCAST, ifa->ifa_broadcast);
+ if (preferred > tval)
+ preferred -= tval;
+ else
+ preferred = 0;
+ if (valid != INFINITY_LIFE_TIME) {
+ if (valid > tval)
+ valid -= tval;
+ else
+ valid = 0;
+ }
+ }
+ } else {
+ preferred = INFINITY_LIFE_TIME;
+ valid = INFINITY_LIFE_TIME;
+ }
+ if ((ifa->ifa_address &&
+ nla_put_in_addr(skb, IFA_ADDRESS, ifa->ifa_address)) ||
+ (ifa->ifa_local &&
+ nla_put_in_addr(skb, IFA_LOCAL, ifa->ifa_local)) ||
+ (ifa->ifa_broadcast &&
+ nla_put_in_addr(skb, IFA_BROADCAST, ifa->ifa_broadcast)) ||
+ (ifa->ifa_label[0] &&
+ nla_put_string(skb, IFA_LABEL, ifa->ifa_label)) ||
+ (ifa->ifa_proto &&
+ nla_put_u8(skb, IFA_PROTO, ifa->ifa_proto)) ||
+ nla_put_u32(skb, IFA_FLAGS, flags) ||
+ (ifa->ifa_rt_priority &&
+ nla_put_u32(skb, IFA_RT_PRIORITY, ifa->ifa_rt_priority)) ||
+ put_cacheinfo(skb, READ_ONCE(ifa->ifa_cstamp), tstamp,
+ preferred, valid))
+ goto nla_put_failure;
+
+ nlmsg_end(skb, nlh);
+ return 0;
- if (ifa->ifa_anycast)
- NLA_PUT_BE32(skb, IFA_ANYCAST, ifa->ifa_anycast);
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
- if (ifa->ifa_label[0])
- NLA_PUT_STRING(skb, IFA_LABEL, ifa->ifa_label);
+static int inet_valid_dump_ifaddr_req(const struct nlmsghdr *nlh,
+ struct inet_fill_args *fillargs,
+ struct net **tgt_net, struct sock *sk,
+ struct netlink_callback *cb)
+{
+ struct netlink_ext_ack *extack = cb->extack;
+ struct nlattr *tb[IFA_MAX+1];
+ struct ifaddrmsg *ifm;
+ int err, i;
- return nlmsg_end(skb, nlh);
+ ifm = nlmsg_payload(nlh, sizeof(*ifm));
+ if (!ifm) {
+ NL_SET_ERR_MSG(extack, "ipv4: Invalid header for address dump request");
+ return -EINVAL;
+ }
-nla_put_failure:
- return nlmsg_cancel(skb, nlh);
+ if (ifm->ifa_prefixlen || ifm->ifa_flags || ifm->ifa_scope) {
+ NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for address dump request");
+ return -EINVAL;
+ }
+
+ fillargs->ifindex = ifm->ifa_index;
+ if (fillargs->ifindex) {
+ cb->answer_flags |= NLM_F_DUMP_FILTERED;
+ fillargs->flags |= NLM_F_DUMP_FILTERED;
+ }
+
+ err = nlmsg_parse_deprecated_strict(nlh, sizeof(*ifm), tb, IFA_MAX,
+ ifa_ipv4_policy, extack);
+ if (err < 0)
+ return err;
+
+ for (i = 0; i <= IFA_MAX; ++i) {
+ if (!tb[i])
+ continue;
+
+ if (i == IFA_TARGET_NETNSID) {
+ struct net *net;
+
+ fillargs->netnsid = nla_get_s32(tb[i]);
+
+ net = rtnl_get_net_ns_capable(sk, fillargs->netnsid);
+ if (IS_ERR(net)) {
+ fillargs->netnsid = -1;
+ NL_SET_ERR_MSG(extack, "ipv4: Invalid target network namespace id");
+ return PTR_ERR(net);
+ }
+ *tgt_net = net;
+ } else {
+ NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in dump request");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
}
-static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
+static int in_dev_dump_ifmcaddr(struct in_device *in_dev, struct sk_buff *skb,
+ struct netlink_callback *cb, int *s_ip_idx,
+ struct inet_fill_args *fillargs)
+{
+ struct ip_mc_list *im;
+ int ip_idx = 0;
+ int err;
+
+ for (im = rcu_dereference(in_dev->mc_list);
+ im;
+ im = rcu_dereference(im->next_rcu)) {
+ if (ip_idx < *s_ip_idx) {
+ ip_idx++;
+ continue;
+ }
+ err = inet_fill_ifmcaddr(skb, in_dev->dev, im, fillargs);
+ if (err < 0)
+ goto done;
+
+ nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+ ip_idx++;
+ }
+ err = 0;
+ ip_idx = 0;
+done:
+ *s_ip_idx = ip_idx;
+ return err;
+}
+
+static int in_dev_dump_ifaddr(struct in_device *in_dev, struct sk_buff *skb,
+ struct netlink_callback *cb, int *s_ip_idx,
+ struct inet_fill_args *fillargs)
{
- int idx, ip_idx;
- struct net_device *dev;
- struct in_device *in_dev;
struct in_ifaddr *ifa;
- int s_ip_idx, s_idx = cb->args[0];
+ int ip_idx = 0;
+ int err;
- s_ip_idx = ip_idx = cb->args[1];
- read_lock(&dev_base_lock);
- for (dev = dev_base, idx = 0; dev; dev = dev->next, idx++) {
- if (idx < s_idx)
- continue;
- if (idx > s_idx)
- s_ip_idx = 0;
- rcu_read_lock();
- if ((in_dev = __in_dev_get_rcu(dev)) == NULL) {
- rcu_read_unlock();
+ in_dev_for_each_ifa_rcu(ifa, in_dev) {
+ if (ip_idx < *s_ip_idx) {
+ ip_idx++;
continue;
}
+ err = inet_fill_ifaddr(skb, ifa, fillargs);
+ if (err < 0)
+ goto done;
- for (ifa = in_dev->ifa_list, ip_idx = 0; ifa;
- ifa = ifa->ifa_next, ip_idx++) {
- if (ip_idx < s_ip_idx)
- continue;
- if (inet_fill_ifaddr(skb, ifa, NETLINK_CB(cb->skb).pid,
- cb->nlh->nlmsg_seq,
- RTM_NEWADDR, NLM_F_MULTI) <= 0) {
- rcu_read_unlock();
+ nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+ ip_idx++;
+ }
+ err = 0;
+ ip_idx = 0;
+done:
+ *s_ip_idx = ip_idx;
+
+ return err;
+}
+
+static int in_dev_dump_addr(struct in_device *in_dev, struct sk_buff *skb,
+ struct netlink_callback *cb, int *s_ip_idx,
+ struct inet_fill_args *fillargs)
+{
+ switch (fillargs->event) {
+ case RTM_NEWADDR:
+ return in_dev_dump_ifaddr(in_dev, skb, cb, s_ip_idx, fillargs);
+ case RTM_GETMULTICAST:
+ return in_dev_dump_ifmcaddr(in_dev, skb, cb, s_ip_idx,
+ fillargs);
+ default:
+ return -EINVAL;
+ }
+}
+
+/* Combine dev_addr_genid and dev_base_seq to detect changes.
+ */
+static u32 inet_base_seq(const struct net *net)
+{
+ u32 res = atomic_read(&net->ipv4.dev_addr_genid) +
+ READ_ONCE(net->dev_base_seq);
+
+ /* Must not return 0 (see nl_dump_check_consistent()).
+ * Chose a value far away from 0.
+ */
+ if (!res)
+ res = 0x80000000;
+ return res;
+}
+
+static int inet_dump_addr(struct sk_buff *skb, struct netlink_callback *cb,
+ int event)
+{
+ const struct nlmsghdr *nlh = cb->nlh;
+ struct inet_fill_args fillargs = {
+ .portid = NETLINK_CB(cb->skb).portid,
+ .seq = nlh->nlmsg_seq,
+ .event = event,
+ .flags = NLM_F_MULTI,
+ .netnsid = -1,
+ };
+ struct net *net = sock_net(skb->sk);
+ struct net *tgt_net = net;
+ struct {
+ unsigned long ifindex;
+ int ip_idx;
+ } *ctx = (void *)cb->ctx;
+ struct in_device *in_dev;
+ struct net_device *dev;
+ int err = 0;
+
+ rcu_read_lock();
+ if (cb->strict_check) {
+ err = inet_valid_dump_ifaddr_req(nlh, &fillargs, &tgt_net,
+ skb->sk, cb);
+ if (err < 0)
+ goto done;
+
+ if (fillargs.ifindex) {
+ dev = dev_get_by_index_rcu(tgt_net, fillargs.ifindex);
+ if (!dev) {
+ err = -ENODEV;
goto done;
}
+ in_dev = __in_dev_get_rcu(dev);
+ if (!in_dev)
+ goto done;
+ err = in_dev_dump_addr(in_dev, skb, cb, &ctx->ip_idx,
+ &fillargs);
+ goto done;
}
- rcu_read_unlock();
}
+ cb->seq = inet_base_seq(tgt_net);
+
+ for_each_netdev_dump(tgt_net, dev, ctx->ifindex) {
+ in_dev = __in_dev_get_rcu(dev);
+ if (!in_dev)
+ continue;
+ err = in_dev_dump_addr(in_dev, skb, cb, &ctx->ip_idx,
+ &fillargs);
+ if (err < 0)
+ goto done;
+ }
done:
- read_unlock(&dev_base_lock);
- cb->args[0] = idx;
- cb->args[1] = ip_idx;
+ if (fillargs.netnsid >= 0)
+ put_net(tgt_net);
+ rcu_read_unlock();
+ return err;
+}
- return skb->len;
+static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ return inet_dump_addr(skb, cb, RTM_NEWADDR);
}
-static void rtmsg_ifa(int event, struct in_ifaddr* ifa, struct nlmsghdr *nlh,
- u32 pid)
+static int inet_dump_ifmcaddr(struct sk_buff *skb, struct netlink_callback *cb)
{
+ return inet_dump_addr(skb, cb, RTM_GETMULTICAST);
+}
+
+static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh,
+ u32 portid)
+{
+ struct inet_fill_args fillargs = {
+ .portid = portid,
+ .seq = nlh ? nlh->nlmsg_seq : 0,
+ .event = event,
+ .flags = 0,
+ .netnsid = -1,
+ };
struct sk_buff *skb;
- u32 seq = nlh ? nlh->nlmsg_seq : 0;
int err = -ENOBUFS;
+ struct net *net;
+ net = dev_net(ifa->ifa_dev->dev);
skb = nlmsg_new(inet_nlmsg_size(), GFP_KERNEL);
- if (skb == NULL)
+ if (!skb)
goto errout;
- err = inet_fill_ifaddr(skb, ifa, pid, seq, event, 0);
- /* failure implies BUG in inet_nlmsg_size() */
- BUG_ON(err < 0);
-
- err = rtnl_notify(skb, pid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL);
+ err = inet_fill_ifaddr(skb, ifa, &fillargs);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in inet_nlmsg_size() */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto errout;
+ }
+ rtnl_notify(skb, net, portid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL);
+ return;
errout:
+ rtnl_set_sk_err(net, RTNLGRP_IPV4_IFADDR, err);
+}
+
+static size_t inet_get_link_af_size(const struct net_device *dev,
+ u32 ext_filter_mask)
+{
+ struct in_device *in_dev = rcu_dereference_rtnl(dev->ip_ptr);
+
+ if (!in_dev)
+ return 0;
+
+ return nla_total_size(IPV4_DEVCONF_MAX * 4); /* IFLA_INET_CONF */
+}
+
+static int inet_fill_link_af(struct sk_buff *skb, const struct net_device *dev,
+ u32 ext_filter_mask)
+{
+ struct in_device *in_dev = rcu_dereference_rtnl(dev->ip_ptr);
+ struct nlattr *nla;
+ int i;
+
+ if (!in_dev)
+ return -ENODATA;
+
+ nla = nla_reserve(skb, IFLA_INET_CONF, IPV4_DEVCONF_MAX * 4);
+ if (!nla)
+ return -EMSGSIZE;
+
+ for (i = 0; i < IPV4_DEVCONF_MAX; i++)
+ ((u32 *) nla_data(nla))[i] = READ_ONCE(in_dev->cnf.data[i]);
+
+ return 0;
+}
+
+static const struct nla_policy inet_af_policy[IFLA_INET_MAX+1] = {
+ [IFLA_INET_CONF] = { .type = NLA_NESTED },
+};
+
+static int inet_validate_link_af(const struct net_device *dev,
+ const struct nlattr *nla,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *a, *tb[IFLA_INET_MAX+1];
+ int err, rem;
+
+ if (dev && !__in_dev_get_rtnl(dev))
+ return -EAFNOSUPPORT;
+
+ err = nla_parse_nested_deprecated(tb, IFLA_INET_MAX, nla,
+ inet_af_policy, extack);
if (err < 0)
- rtnl_set_sk_err(RTNLGRP_IPV4_IFADDR, err);
-}
-
-static struct rtnetlink_link inet_rtnetlink_table[RTM_NR_MSGTYPES] = {
- [RTM_NEWADDR - RTM_BASE] = { .doit = inet_rtm_newaddr, },
- [RTM_DELADDR - RTM_BASE] = { .doit = inet_rtm_deladdr, },
- [RTM_GETADDR - RTM_BASE] = { .dumpit = inet_dump_ifaddr, },
- [RTM_NEWROUTE - RTM_BASE] = { .doit = inet_rtm_newroute, },
- [RTM_DELROUTE - RTM_BASE] = { .doit = inet_rtm_delroute, },
- [RTM_GETROUTE - RTM_BASE] = { .doit = inet_rtm_getroute,
- .dumpit = inet_dump_fib, },
-#ifdef CONFIG_IP_MULTIPLE_TABLES
- [RTM_GETRULE - RTM_BASE] = { .dumpit = fib4_rules_dump, },
-#endif
+ return err;
+
+ if (tb[IFLA_INET_CONF]) {
+ nla_for_each_nested(a, tb[IFLA_INET_CONF], rem) {
+ int cfgid = nla_type(a);
+
+ if (nla_len(a) < 4)
+ return -EINVAL;
+
+ if (cfgid <= 0 || cfgid > IPV4_DEVCONF_MAX)
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static int inet_set_link_af(struct net_device *dev, const struct nlattr *nla,
+ struct netlink_ext_ack *extack)
+{
+ struct in_device *in_dev = __in_dev_get_rtnl(dev);
+ struct nlattr *a, *tb[IFLA_INET_MAX+1];
+ int rem;
+
+ if (!in_dev)
+ return -EAFNOSUPPORT;
+
+ if (nla_parse_nested_deprecated(tb, IFLA_INET_MAX, nla, NULL, NULL) < 0)
+ return -EINVAL;
+
+ if (tb[IFLA_INET_CONF]) {
+ nla_for_each_nested(a, tb[IFLA_INET_CONF], rem)
+ ipv4_devconf_set(in_dev, nla_type(a), nla_get_u32(a));
+ }
+
+ return 0;
+}
+
+static int inet_netconf_msgsize_devconf(int type)
+{
+ int size = NLMSG_ALIGN(sizeof(struct netconfmsg))
+ + nla_total_size(4); /* NETCONFA_IFINDEX */
+ bool all = false;
+
+ if (type == NETCONFA_ALL)
+ all = true;
+
+ if (all || type == NETCONFA_FORWARDING)
+ size += nla_total_size(4);
+ if (all || type == NETCONFA_RP_FILTER)
+ size += nla_total_size(4);
+ if (all || type == NETCONFA_MC_FORWARDING)
+ size += nla_total_size(4);
+ if (all || type == NETCONFA_BC_FORWARDING)
+ size += nla_total_size(4);
+ if (all || type == NETCONFA_PROXY_NEIGH)
+ size += nla_total_size(4);
+ if (all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN)
+ size += nla_total_size(4);
+
+ return size;
+}
+
+static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex,
+ const struct ipv4_devconf *devconf,
+ u32 portid, u32 seq, int event,
+ unsigned int flags, int type)
+{
+ struct nlmsghdr *nlh;
+ struct netconfmsg *ncm;
+ bool all = false;
+
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct netconfmsg),
+ flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ if (type == NETCONFA_ALL)
+ all = true;
+
+ ncm = nlmsg_data(nlh);
+ ncm->ncm_family = AF_INET;
+
+ if (nla_put_s32(skb, NETCONFA_IFINDEX, ifindex) < 0)
+ goto nla_put_failure;
+
+ if (!devconf)
+ goto out;
+
+ if ((all || type == NETCONFA_FORWARDING) &&
+ nla_put_s32(skb, NETCONFA_FORWARDING,
+ IPV4_DEVCONF_RO(*devconf, FORWARDING)) < 0)
+ goto nla_put_failure;
+ if ((all || type == NETCONFA_RP_FILTER) &&
+ nla_put_s32(skb, NETCONFA_RP_FILTER,
+ IPV4_DEVCONF_RO(*devconf, RP_FILTER)) < 0)
+ goto nla_put_failure;
+ if ((all || type == NETCONFA_MC_FORWARDING) &&
+ nla_put_s32(skb, NETCONFA_MC_FORWARDING,
+ IPV4_DEVCONF_RO(*devconf, MC_FORWARDING)) < 0)
+ goto nla_put_failure;
+ if ((all || type == NETCONFA_BC_FORWARDING) &&
+ nla_put_s32(skb, NETCONFA_BC_FORWARDING,
+ IPV4_DEVCONF_RO(*devconf, BC_FORWARDING)) < 0)
+ goto nla_put_failure;
+ if ((all || type == NETCONFA_PROXY_NEIGH) &&
+ nla_put_s32(skb, NETCONFA_PROXY_NEIGH,
+ IPV4_DEVCONF_RO(*devconf, PROXY_ARP)) < 0)
+ goto nla_put_failure;
+ if ((all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) &&
+ nla_put_s32(skb, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
+ IPV4_DEVCONF_RO(*devconf,
+ IGNORE_ROUTES_WITH_LINKDOWN)) < 0)
+ goto nla_put_failure;
+
+out:
+ nlmsg_end(skb, nlh);
+ return 0;
+
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+void inet_netconf_notify_devconf(struct net *net, int event, int type,
+ int ifindex, struct ipv4_devconf *devconf)
+{
+ struct sk_buff *skb;
+ int err = -ENOBUFS;
+
+ skb = nlmsg_new(inet_netconf_msgsize_devconf(type), GFP_KERNEL);
+ if (!skb)
+ goto errout;
+
+ err = inet_netconf_fill_devconf(skb, ifindex, devconf, 0, 0,
+ event, 0, type);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in inet_netconf_msgsize_devconf() */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto errout;
+ }
+ rtnl_notify(skb, net, 0, RTNLGRP_IPV4_NETCONF, NULL, GFP_KERNEL);
+ return;
+errout:
+ rtnl_set_sk_err(net, RTNLGRP_IPV4_NETCONF, err);
+}
+
+static const struct nla_policy devconf_ipv4_policy[NETCONFA_MAX+1] = {
+ [NETCONFA_IFINDEX] = { .len = sizeof(int) },
+ [NETCONFA_FORWARDING] = { .len = sizeof(int) },
+ [NETCONFA_RP_FILTER] = { .len = sizeof(int) },
+ [NETCONFA_PROXY_NEIGH] = { .len = sizeof(int) },
+ [NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN] = { .len = sizeof(int) },
};
-#ifdef CONFIG_SYSCTL
+static int inet_netconf_valid_get_req(struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ struct nlattr **tb,
+ struct netlink_ext_ack *extack)
+{
+ int i, err;
+
+ if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(struct netconfmsg))) {
+ NL_SET_ERR_MSG(extack, "ipv4: Invalid header for netconf get request");
+ return -EINVAL;
+ }
+
+ if (!netlink_strict_get_check(skb))
+ return nlmsg_parse_deprecated(nlh, sizeof(struct netconfmsg),
+ tb, NETCONFA_MAX,
+ devconf_ipv4_policy, extack);
+
+ err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct netconfmsg),
+ tb, NETCONFA_MAX,
+ devconf_ipv4_policy, extack);
+ if (err)
+ return err;
+
+ for (i = 0; i <= NETCONFA_MAX; i++) {
+ if (!tb[i])
+ continue;
-void inet_forward_change(void)
+ switch (i) {
+ case NETCONFA_IFINDEX:
+ break;
+ default:
+ NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in netconf get request");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static int inet_netconf_get_devconf(struct sk_buff *in_skb,
+ struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(in_skb->sk);
+ struct nlattr *tb[NETCONFA_MAX + 1];
+ const struct ipv4_devconf *devconf;
+ struct in_device *in_dev = NULL;
+ struct net_device *dev = NULL;
+ struct sk_buff *skb;
+ int ifindex;
+ int err;
+
+ err = inet_netconf_valid_get_req(in_skb, nlh, tb, extack);
+ if (err)
+ return err;
+
+ if (!tb[NETCONFA_IFINDEX])
+ return -EINVAL;
+
+ ifindex = nla_get_s32(tb[NETCONFA_IFINDEX]);
+ switch (ifindex) {
+ case NETCONFA_IFINDEX_ALL:
+ devconf = net->ipv4.devconf_all;
+ break;
+ case NETCONFA_IFINDEX_DEFAULT:
+ devconf = net->ipv4.devconf_dflt;
+ break;
+ default:
+ err = -ENODEV;
+ dev = dev_get_by_index(net, ifindex);
+ if (dev)
+ in_dev = in_dev_get(dev);
+ if (!in_dev)
+ goto errout;
+ devconf = &in_dev->cnf;
+ break;
+ }
+
+ err = -ENOBUFS;
+ skb = nlmsg_new(inet_netconf_msgsize_devconf(NETCONFA_ALL), GFP_KERNEL);
+ if (!skb)
+ goto errout;
+
+ err = inet_netconf_fill_devconf(skb, ifindex, devconf,
+ NETLINK_CB(in_skb).portid,
+ nlh->nlmsg_seq, RTM_NEWNETCONF, 0,
+ NETCONFA_ALL);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in inet_netconf_msgsize_devconf() */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto errout;
+ }
+ err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
+errout:
+ if (in_dev)
+ in_dev_put(in_dev);
+ dev_put(dev);
+ return err;
+}
+
+static int inet_netconf_dump_devconf(struct sk_buff *skb,
+ struct netlink_callback *cb)
{
+ const struct nlmsghdr *nlh = cb->nlh;
+ struct net *net = sock_net(skb->sk);
+ struct {
+ unsigned long ifindex;
+ unsigned int all_default;
+ } *ctx = (void *)cb->ctx;
+ const struct in_device *in_dev;
struct net_device *dev;
- int on = ipv4_devconf.forwarding;
+ int err = 0;
+
+ if (cb->strict_check) {
+ struct netlink_ext_ack *extack = cb->extack;
+ struct netconfmsg *ncm;
- ipv4_devconf.accept_redirects = !on;
- ipv4_devconf_dflt.forwarding = on;
+ if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ncm))) {
+ NL_SET_ERR_MSG(extack, "ipv4: Invalid header for netconf dump request");
+ return -EINVAL;
+ }
- read_lock(&dev_base_lock);
- for (dev = dev_base; dev; dev = dev->next) {
+ if (nlmsg_attrlen(nlh, sizeof(*ncm))) {
+ NL_SET_ERR_MSG(extack, "ipv4: Invalid data after header in netconf dump request");
+ return -EINVAL;
+ }
+ }
+
+ rcu_read_lock();
+ for_each_netdev_dump(net, dev, ctx->ifindex) {
+ in_dev = __in_dev_get_rcu(dev);
+ if (!in_dev)
+ continue;
+ err = inet_netconf_fill_devconf(skb, dev->ifindex,
+ &in_dev->cnf,
+ NETLINK_CB(cb->skb).portid,
+ nlh->nlmsg_seq,
+ RTM_NEWNETCONF, NLM_F_MULTI,
+ NETCONFA_ALL);
+ if (err < 0)
+ goto done;
+ }
+ if (ctx->all_default == 0) {
+ err = inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_ALL,
+ net->ipv4.devconf_all,
+ NETLINK_CB(cb->skb).portid,
+ nlh->nlmsg_seq,
+ RTM_NEWNETCONF, NLM_F_MULTI,
+ NETCONFA_ALL);
+ if (err < 0)
+ goto done;
+ ctx->all_default++;
+ }
+ if (ctx->all_default == 1) {
+ err = inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_DEFAULT,
+ net->ipv4.devconf_dflt,
+ NETLINK_CB(cb->skb).portid,
+ nlh->nlmsg_seq,
+ RTM_NEWNETCONF, NLM_F_MULTI,
+ NETCONFA_ALL);
+ if (err < 0)
+ goto done;
+ ctx->all_default++;
+ }
+done:
+ rcu_read_unlock();
+ return err;
+}
+
+#ifdef CONFIG_SYSCTL
+
+static void devinet_copy_dflt_conf(struct net *net, int i)
+{
+ struct net_device *dev;
+
+ rcu_read_lock();
+ for_each_netdev_rcu(net, dev) {
struct in_device *in_dev;
- rcu_read_lock();
+
in_dev = __in_dev_get_rcu(dev);
- if (in_dev)
- in_dev->cnf.forwarding = on;
- rcu_read_unlock();
+ if (in_dev && !test_bit(i, in_dev->cnf.state))
+ in_dev->cnf.data[i] = net->ipv4.devconf_dflt->data[i];
}
- read_unlock(&dev_base_lock);
+ rcu_read_unlock();
+}
- rt_cache_flush(0);
+/* called with RTNL locked */
+static void inet_forward_change(struct net *net)
+{
+ struct net_device *dev;
+ int on = IPV4_DEVCONF_ALL(net, FORWARDING);
+
+ IPV4_DEVCONF_ALL(net, ACCEPT_REDIRECTS) = !on;
+ IPV4_DEVCONF_DFLT(net, FORWARDING) = on;
+ inet_netconf_notify_devconf(net, RTM_NEWNETCONF,
+ NETCONFA_FORWARDING,
+ NETCONFA_IFINDEX_ALL,
+ net->ipv4.devconf_all);
+ inet_netconf_notify_devconf(net, RTM_NEWNETCONF,
+ NETCONFA_FORWARDING,
+ NETCONFA_IFINDEX_DEFAULT,
+ net->ipv4.devconf_dflt);
+
+ for_each_netdev(net, dev) {
+ struct in_device *in_dev;
+
+ if (on)
+ dev_disable_lro(dev);
+
+ in_dev = __in_dev_get_rtnl_net(dev);
+ if (in_dev) {
+ IN_DEV_CONF_SET(in_dev, FORWARDING, on);
+ inet_netconf_notify_devconf(net, RTM_NEWNETCONF,
+ NETCONFA_FORWARDING,
+ dev->ifindex, &in_dev->cnf);
+ }
+ }
}
-static int devinet_sysctl_forward(ctl_table *ctl, int write,
- struct file* filp, void __user *buffer,
- size_t *lenp, loff_t *ppos)
+static int devinet_conf_ifindex(struct net *net, struct ipv4_devconf *cnf)
{
- int *valp = ctl->data;
- int val = *valp;
- int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
+ if (cnf == net->ipv4.devconf_dflt)
+ return NETCONFA_IFINDEX_DEFAULT;
+ else if (cnf == net->ipv4.devconf_all)
+ return NETCONFA_IFINDEX_ALL;
+ else {
+ struct in_device *idev
+ = container_of(cnf, struct in_device, cnf);
+ return idev->dev->ifindex;
+ }
+}
- if (write && *valp != val) {
- if (valp == &ipv4_devconf.forwarding)
- inet_forward_change();
- else if (valp != &ipv4_devconf_dflt.forwarding)
- rt_cache_flush(0);
+static int devinet_conf_proc(const struct ctl_table *ctl, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ int old_value = *(int *)ctl->data;
+ int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
+ int new_value = *(int *)ctl->data;
+
+ if (write) {
+ struct ipv4_devconf *cnf = ctl->extra1;
+ struct net *net = ctl->extra2;
+ int i = (int *)ctl->data - cnf->data;
+ int ifindex;
+
+ set_bit(i, cnf->state);
+
+ if (cnf == net->ipv4.devconf_dflt)
+ devinet_copy_dflt_conf(net, i);
+ if (i == IPV4_DEVCONF_ACCEPT_LOCAL - 1 ||
+ i == IPV4_DEVCONF_ROUTE_LOCALNET - 1)
+ if ((new_value == 0) && (old_value != 0))
+ rt_cache_flush(net);
+
+ if (i == IPV4_DEVCONF_BC_FORWARDING - 1 &&
+ new_value != old_value)
+ rt_cache_flush(net);
+
+ if (i == IPV4_DEVCONF_RP_FILTER - 1 &&
+ new_value != old_value) {
+ ifindex = devinet_conf_ifindex(net, cnf);
+ inet_netconf_notify_devconf(net, RTM_NEWNETCONF,
+ NETCONFA_RP_FILTER,
+ ifindex, cnf);
+ }
+ if (i == IPV4_DEVCONF_PROXY_ARP - 1 &&
+ new_value != old_value) {
+ ifindex = devinet_conf_ifindex(net, cnf);
+ inet_netconf_notify_devconf(net, RTM_NEWNETCONF,
+ NETCONFA_PROXY_NEIGH,
+ ifindex, cnf);
+ }
+ if (i == IPV4_DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN - 1 &&
+ new_value != old_value) {
+ ifindex = devinet_conf_ifindex(net, cnf);
+ inet_netconf_notify_devconf(net, RTM_NEWNETCONF,
+ NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
+ ifindex, cnf);
+ }
}
return ret;
}
-int ipv4_doint_and_flush(ctl_table *ctl, int write,
- struct file* filp, void __user *buffer,
- size_t *lenp, loff_t *ppos)
+static int devinet_sysctl_forward(const struct ctl_table *ctl, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int *valp = ctl->data;
int val = *valp;
- int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
+ loff_t pos = *ppos;
+ struct net *net = ctl->extra2;
+ int ret;
- if (write && *valp != val)
- rt_cache_flush(0);
+ if (write && !ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
+
+ if (write && *valp != val) {
+ if (valp != &IPV4_DEVCONF_DFLT(net, FORWARDING)) {
+ if (!rtnl_net_trylock(net)) {
+ /* Restore the original values before restarting */
+ *valp = val;
+ *ppos = pos;
+ return restart_syscall();
+ }
+ if (valp == &IPV4_DEVCONF_ALL(net, FORWARDING)) {
+ inet_forward_change(net);
+ } else {
+ struct ipv4_devconf *cnf = ctl->extra1;
+ struct in_device *idev =
+ container_of(cnf, struct in_device, cnf);
+ if (*valp)
+ dev_disable_lro(idev->dev);
+ inet_netconf_notify_devconf(net, RTM_NEWNETCONF,
+ NETCONFA_FORWARDING,
+ idev->dev->ifindex,
+ cnf);
+ }
+ rtnl_net_unlock(net);
+ rt_cache_flush(net);
+ } else
+ inet_netconf_notify_devconf(net, RTM_NEWNETCONF,
+ NETCONFA_FORWARDING,
+ NETCONFA_IFINDEX_DEFAULT,
+ net->ipv4.devconf_dflt);
+ }
return ret;
}
-int ipv4_doint_and_flush_strategy(ctl_table *table, int __user *name, int nlen,
- void __user *oldval, size_t __user *oldlenp,
- void __user *newval, size_t newlen,
- void **context)
+static int ipv4_doint_and_flush(const struct ctl_table *ctl, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
- int *valp = table->data;
- int new;
-
- if (!newval || !newlen)
- return 0;
-
- if (newlen != sizeof(int))
- return -EINVAL;
+ int *valp = ctl->data;
+ int val = *valp;
+ int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
+ struct net *net = ctl->extra2;
- if (get_user(new, (int __user *)newval))
- return -EFAULT;
+ if (write && *valp != val)
+ rt_cache_flush(net);
- if (new == *valp)
- return 0;
+ return ret;
+}
- if (oldval && oldlenp) {
- size_t len;
+#define DEVINET_SYSCTL_ENTRY(attr, name, mval, proc) \
+ { \
+ .procname = name, \
+ .data = ipv4_devconf.data + \
+ IPV4_DEVCONF_ ## attr - 1, \
+ .maxlen = sizeof(int), \
+ .mode = mval, \
+ .proc_handler = proc, \
+ .extra1 = &ipv4_devconf, \
+ }
- if (get_user(len, oldlenp))
- return -EFAULT;
+#define DEVINET_SYSCTL_RW_ENTRY(attr, name) \
+ DEVINET_SYSCTL_ENTRY(attr, name, 0644, devinet_conf_proc)
- if (len) {
- if (len > table->maxlen)
- len = table->maxlen;
- if (copy_to_user(oldval, valp, len))
- return -EFAULT;
- if (put_user(len, oldlenp))
- return -EFAULT;
- }
- }
+#define DEVINET_SYSCTL_RO_ENTRY(attr, name) \
+ DEVINET_SYSCTL_ENTRY(attr, name, 0444, devinet_conf_proc)
- *valp = new;
- rt_cache_flush(0);
- return 1;
-}
+#define DEVINET_SYSCTL_COMPLEX_ENTRY(attr, name, proc) \
+ DEVINET_SYSCTL_ENTRY(attr, name, 0644, proc)
+#define DEVINET_SYSCTL_FLUSHING_ENTRY(attr, name) \
+ DEVINET_SYSCTL_COMPLEX_ENTRY(attr, name, ipv4_doint_and_flush)
static struct devinet_sysctl_table {
struct ctl_table_header *sysctl_header;
- ctl_table devinet_vars[__NET_IPV4_CONF_MAX];
- ctl_table devinet_dev[2];
- ctl_table devinet_conf_dir[2];
- ctl_table devinet_proto_dir[2];
- ctl_table devinet_root_dir[2];
+ struct ctl_table devinet_vars[IPV4_DEVCONF_MAX];
} devinet_sysctl = {
.devinet_vars = {
- {
- .ctl_name = NET_IPV4_CONF_FORWARDING,
- .procname = "forwarding",
- .data = &ipv4_devconf.forwarding,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &devinet_sysctl_forward,
- },
- {
- .ctl_name = NET_IPV4_CONF_MC_FORWARDING,
- .procname = "mc_forwarding",
- .data = &ipv4_devconf.mc_forwarding,
- .maxlen = sizeof(int),
- .mode = 0444,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_IPV4_CONF_ACCEPT_REDIRECTS,
- .procname = "accept_redirects",
- .data = &ipv4_devconf.accept_redirects,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_IPV4_CONF_SECURE_REDIRECTS,
- .procname = "secure_redirects",
- .data = &ipv4_devconf.secure_redirects,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_IPV4_CONF_SHARED_MEDIA,
- .procname = "shared_media",
- .data = &ipv4_devconf.shared_media,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_IPV4_CONF_RP_FILTER,
- .procname = "rp_filter",
- .data = &ipv4_devconf.rp_filter,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_IPV4_CONF_SEND_REDIRECTS,
- .procname = "send_redirects",
- .data = &ipv4_devconf.send_redirects,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_IPV4_CONF_ACCEPT_SOURCE_ROUTE,
- .procname = "accept_source_route",
- .data = &ipv4_devconf.accept_source_route,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_IPV4_CONF_PROXY_ARP,
- .procname = "proxy_arp",
- .data = &ipv4_devconf.proxy_arp,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_IPV4_CONF_MEDIUM_ID,
- .procname = "medium_id",
- .data = &ipv4_devconf.medium_id,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_IPV4_CONF_BOOTP_RELAY,
- .procname = "bootp_relay",
- .data = &ipv4_devconf.bootp_relay,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_IPV4_CONF_LOG_MARTIANS,
- .procname = "log_martians",
- .data = &ipv4_devconf.log_martians,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_IPV4_CONF_TAG,
- .procname = "tag",
- .data = &ipv4_devconf.tag,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_IPV4_CONF_ARPFILTER,
- .procname = "arp_filter",
- .data = &ipv4_devconf.arp_filter,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_IPV4_CONF_ARP_ANNOUNCE,
- .procname = "arp_announce",
- .data = &ipv4_devconf.arp_announce,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_IPV4_CONF_ARP_IGNORE,
- .procname = "arp_ignore",
- .data = &ipv4_devconf.arp_ignore,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_IPV4_CONF_ARP_ACCEPT,
- .procname = "arp_accept",
- .data = &ipv4_devconf.arp_accept,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_IPV4_CONF_NOXFRM,
- .procname = "disable_xfrm",
- .data = &ipv4_devconf.no_xfrm,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &ipv4_doint_and_flush,
- .strategy = &ipv4_doint_and_flush_strategy,
- },
- {
- .ctl_name = NET_IPV4_CONF_NOPOLICY,
- .procname = "disable_policy",
- .data = &ipv4_devconf.no_policy,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &ipv4_doint_and_flush,
- .strategy = &ipv4_doint_and_flush_strategy,
- },
- {
- .ctl_name = NET_IPV4_CONF_FORCE_IGMP_VERSION,
- .procname = "force_igmp_version",
- .data = &ipv4_devconf.force_igmp_version,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &ipv4_doint_and_flush,
- .strategy = &ipv4_doint_and_flush_strategy,
- },
- {
- .ctl_name = NET_IPV4_CONF_PROMOTE_SECONDARIES,
- .procname = "promote_secondaries",
- .data = &ipv4_devconf.promote_secondaries,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &ipv4_doint_and_flush,
- .strategy = &ipv4_doint_and_flush_strategy,
- },
- },
- .devinet_dev = {
- {
- .ctl_name = NET_PROTO_CONF_ALL,
- .procname = "all",
- .mode = 0555,
- .child = devinet_sysctl.devinet_vars,
- },
- },
- .devinet_conf_dir = {
- {
- .ctl_name = NET_IPV4_CONF,
- .procname = "conf",
- .mode = 0555,
- .child = devinet_sysctl.devinet_dev,
- },
- },
- .devinet_proto_dir = {
- {
- .ctl_name = NET_IPV4,
- .procname = "ipv4",
- .mode = 0555,
- .child = devinet_sysctl.devinet_conf_dir,
- },
- },
- .devinet_root_dir = {
- {
- .ctl_name = CTL_NET,
- .procname = "net",
- .mode = 0555,
- .child = devinet_sysctl.devinet_proto_dir,
- },
+ DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding",
+ devinet_sysctl_forward),
+ DEVINET_SYSCTL_RO_ENTRY(MC_FORWARDING, "mc_forwarding"),
+ DEVINET_SYSCTL_RW_ENTRY(BC_FORWARDING, "bc_forwarding"),
+
+ DEVINET_SYSCTL_RW_ENTRY(ACCEPT_REDIRECTS, "accept_redirects"),
+ DEVINET_SYSCTL_RW_ENTRY(SECURE_REDIRECTS, "secure_redirects"),
+ DEVINET_SYSCTL_RW_ENTRY(SHARED_MEDIA, "shared_media"),
+ DEVINET_SYSCTL_RW_ENTRY(RP_FILTER, "rp_filter"),
+ DEVINET_SYSCTL_RW_ENTRY(SEND_REDIRECTS, "send_redirects"),
+ DEVINET_SYSCTL_RW_ENTRY(ACCEPT_SOURCE_ROUTE,
+ "accept_source_route"),
+ DEVINET_SYSCTL_RW_ENTRY(ACCEPT_LOCAL, "accept_local"),
+ DEVINET_SYSCTL_RW_ENTRY(SRC_VMARK, "src_valid_mark"),
+ DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP, "proxy_arp"),
+ DEVINET_SYSCTL_RW_ENTRY(MEDIUM_ID, "medium_id"),
+ DEVINET_SYSCTL_RW_ENTRY(BOOTP_RELAY, "bootp_relay"),
+ DEVINET_SYSCTL_RW_ENTRY(LOG_MARTIANS, "log_martians"),
+ DEVINET_SYSCTL_RW_ENTRY(TAG, "tag"),
+ DEVINET_SYSCTL_RW_ENTRY(ARPFILTER, "arp_filter"),
+ DEVINET_SYSCTL_RW_ENTRY(ARP_ANNOUNCE, "arp_announce"),
+ DEVINET_SYSCTL_RW_ENTRY(ARP_IGNORE, "arp_ignore"),
+ DEVINET_SYSCTL_RW_ENTRY(ARP_ACCEPT, "arp_accept"),
+ DEVINET_SYSCTL_RW_ENTRY(ARP_NOTIFY, "arp_notify"),
+ DEVINET_SYSCTL_RW_ENTRY(ARP_EVICT_NOCARRIER,
+ "arp_evict_nocarrier"),
+ DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP_PVLAN, "proxy_arp_pvlan"),
+ DEVINET_SYSCTL_RW_ENTRY(FORCE_IGMP_VERSION,
+ "force_igmp_version"),
+ DEVINET_SYSCTL_RW_ENTRY(IGMPV2_UNSOLICITED_REPORT_INTERVAL,
+ "igmpv2_unsolicited_report_interval"),
+ DEVINET_SYSCTL_RW_ENTRY(IGMPV3_UNSOLICITED_REPORT_INTERVAL,
+ "igmpv3_unsolicited_report_interval"),
+ DEVINET_SYSCTL_RW_ENTRY(IGNORE_ROUTES_WITH_LINKDOWN,
+ "ignore_routes_with_linkdown"),
+ DEVINET_SYSCTL_RW_ENTRY(DROP_GRATUITOUS_ARP,
+ "drop_gratuitous_arp"),
+
+ DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"),
+ DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"),
+ DEVINET_SYSCTL_FLUSHING_ENTRY(PROMOTE_SECONDARIES,
+ "promote_secondaries"),
+ DEVINET_SYSCTL_FLUSHING_ENTRY(ROUTE_LOCALNET,
+ "route_localnet"),
+ DEVINET_SYSCTL_FLUSHING_ENTRY(DROP_UNICAST_IN_L2_MULTICAST,
+ "drop_unicast_in_l2_multicast"),
},
};
-static void devinet_sysctl_register(struct in_device *in_dev,
- struct ipv4_devconf *p)
+static int __devinet_sysctl_register(struct net *net, char *dev_name,
+ int ifindex, struct ipv4_devconf *p)
{
int i;
- struct net_device *dev = in_dev ? in_dev->dev : NULL;
- struct devinet_sysctl_table *t = kmemdup(&devinet_sysctl, sizeof(*t),
- GFP_KERNEL);
- char *dev_name = NULL;
+ struct devinet_sysctl_table *t;
+ char path[sizeof("net/ipv4/conf/") + IFNAMSIZ];
+ t = kmemdup(&devinet_sysctl, sizeof(*t), GFP_KERNEL_ACCOUNT);
if (!t)
- return;
- for (i = 0; i < ARRAY_SIZE(t->devinet_vars) - 1; i++) {
+ goto out;
+
+ for (i = 0; i < ARRAY_SIZE(t->devinet_vars); i++) {
t->devinet_vars[i].data += (char *)p - (char *)&ipv4_devconf;
- t->devinet_vars[i].de = NULL;
+ t->devinet_vars[i].extra1 = p;
+ t->devinet_vars[i].extra2 = net;
}
- if (dev) {
- dev_name = dev->name;
- t->devinet_dev[0].ctl_name = dev->ifindex;
- } else {
- dev_name = "default";
- t->devinet_dev[0].ctl_name = NET_PROTO_CONF_DEFAULT;
- }
-
- /*
- * Make a copy of dev_name, because '.procname' is regarded as const
- * by sysctl and we wouldn't want anyone to change it under our feet
- * (see SIOCSIFNAME).
- */
- dev_name = kstrdup(dev_name, GFP_KERNEL);
- if (!dev_name)
- goto free;
-
- t->devinet_dev[0].procname = dev_name;
- t->devinet_dev[0].child = t->devinet_vars;
- t->devinet_dev[0].de = NULL;
- t->devinet_conf_dir[0].child = t->devinet_dev;
- t->devinet_conf_dir[0].de = NULL;
- t->devinet_proto_dir[0].child = t->devinet_conf_dir;
- t->devinet_proto_dir[0].de = NULL;
- t->devinet_root_dir[0].child = t->devinet_proto_dir;
- t->devinet_root_dir[0].de = NULL;
-
- t->sysctl_header = register_sysctl_table(t->devinet_root_dir, 0);
+ snprintf(path, sizeof(path), "net/ipv4/conf/%s", dev_name);
+
+ t->sysctl_header = register_net_sysctl(net, path, t->devinet_vars);
if (!t->sysctl_header)
- goto free_procname;
+ goto free;
p->sysctl = t;
- return;
- /* error path */
- free_procname:
- kfree(dev_name);
- free:
+ inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_ALL,
+ ifindex, p);
+ return 0;
+
+free:
kfree(t);
- return;
+out:
+ return -ENOMEM;
}
-static void devinet_sysctl_unregister(struct ipv4_devconf *p)
+static void __devinet_sysctl_unregister(struct net *net,
+ struct ipv4_devconf *cnf, int ifindex)
{
- if (p->sysctl) {
- struct devinet_sysctl_table *t = p->sysctl;
- p->sysctl = NULL;
- unregister_sysctl_table(t->sysctl_header);
- kfree(t->devinet_dev[0].procname);
+ struct devinet_sysctl_table *t = cnf->sysctl;
+
+ if (t) {
+ cnf->sysctl = NULL;
+ unregister_net_sysctl_table(t->sysctl_header);
kfree(t);
}
+
+ inet_netconf_notify_devconf(net, RTM_DELNETCONF, 0, ifindex, NULL);
+}
+
+static int devinet_sysctl_register(struct in_device *idev)
+{
+ int err;
+
+ if (!sysctl_dev_name_is_allowed(idev->dev->name))
+ return -EINVAL;
+
+ err = neigh_sysctl_register(idev->dev, idev->arp_parms, NULL);
+ if (err)
+ return err;
+ err = __devinet_sysctl_register(dev_net(idev->dev), idev->dev->name,
+ idev->dev->ifindex, &idev->cnf);
+ if (err)
+ neigh_sysctl_unregister(idev->arp_parms);
+ return err;
+}
+
+static void devinet_sysctl_unregister(struct in_device *idev)
+{
+ struct net *net = dev_net(idev->dev);
+
+ __devinet_sysctl_unregister(net, &idev->cnf, idev->dev->ifindex);
+ neigh_sysctl_unregister(idev->arp_parms);
}
+
+static struct ctl_table ctl_forward_entry[] = {
+ {
+ .procname = "ip_forward",
+ .data = &ipv4_devconf.data[
+ IPV4_DEVCONF_FORWARDING - 1],
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = devinet_sysctl_forward,
+ .extra1 = &ipv4_devconf,
+ .extra2 = &init_net,
+ },
+};
#endif
-void __init devinet_init(void)
+static __net_init int devinet_init_net(struct net *net)
{
- register_gifconf(PF_INET, inet_gifconf);
- register_netdevice_notifier(&ip_netdev_notifier);
- rtnetlink_links[PF_INET] = inet_rtnetlink_table;
#ifdef CONFIG_SYSCTL
- devinet_sysctl.sysctl_header =
- register_sysctl_table(devinet_sysctl.devinet_root_dir, 0);
- devinet_sysctl_register(NULL, &ipv4_devconf_dflt);
+ struct ctl_table_header *forw_hdr;
+ struct ctl_table *tbl;
#endif
+ struct ipv4_devconf *all, *dflt;
+ int err;
+ int i;
+
+ err = -ENOMEM;
+ net->ipv4.inet_addr_lst = kmalloc_array(IN4_ADDR_HSIZE,
+ sizeof(struct hlist_head),
+ GFP_KERNEL);
+ if (!net->ipv4.inet_addr_lst)
+ goto err_alloc_hash;
+
+ all = kmemdup(&ipv4_devconf, sizeof(ipv4_devconf), GFP_KERNEL);
+ if (!all)
+ goto err_alloc_all;
+
+ dflt = kmemdup(&ipv4_devconf_dflt, sizeof(ipv4_devconf_dflt), GFP_KERNEL);
+ if (!dflt)
+ goto err_alloc_dflt;
+
+#ifdef CONFIG_SYSCTL
+ tbl = kmemdup(ctl_forward_entry, sizeof(ctl_forward_entry), GFP_KERNEL);
+ if (!tbl)
+ goto err_alloc_ctl;
+
+ tbl[0].data = &all->data[IPV4_DEVCONF_FORWARDING - 1];
+ tbl[0].extra1 = all;
+ tbl[0].extra2 = net;
+#endif
+
+ if (!net_eq(net, &init_net)) {
+ switch (net_inherit_devconf()) {
+ case 3:
+ /* copy from the current netns */
+ memcpy(all, current->nsproxy->net_ns->ipv4.devconf_all,
+ sizeof(ipv4_devconf));
+ memcpy(dflt,
+ current->nsproxy->net_ns->ipv4.devconf_dflt,
+ sizeof(ipv4_devconf_dflt));
+ break;
+ case 0:
+ case 1:
+ /* copy from init_net */
+ memcpy(all, init_net.ipv4.devconf_all,
+ sizeof(ipv4_devconf));
+ memcpy(dflt, init_net.ipv4.devconf_dflt,
+ sizeof(ipv4_devconf_dflt));
+ break;
+ case 2:
+ /* use compiled values */
+ break;
+ }
+ }
+
+#ifdef CONFIG_SYSCTL
+ err = __devinet_sysctl_register(net, "all", NETCONFA_IFINDEX_ALL, all);
+ if (err < 0)
+ goto err_reg_all;
+
+ err = __devinet_sysctl_register(net, "default",
+ NETCONFA_IFINDEX_DEFAULT, dflt);
+ if (err < 0)
+ goto err_reg_dflt;
+
+ err = -ENOMEM;
+ forw_hdr = register_net_sysctl_sz(net, "net/ipv4", tbl,
+ ARRAY_SIZE(ctl_forward_entry));
+ if (!forw_hdr)
+ goto err_reg_ctl;
+ net->ipv4.forw_hdr = forw_hdr;
+#endif
+
+ for (i = 0; i < IN4_ADDR_HSIZE; i++)
+ INIT_HLIST_HEAD(&net->ipv4.inet_addr_lst[i]);
+
+ INIT_DEFERRABLE_WORK(&net->ipv4.addr_chk_work, check_lifetime);
+
+ net->ipv4.devconf_all = all;
+ net->ipv4.devconf_dflt = dflt;
+ return 0;
+
+#ifdef CONFIG_SYSCTL
+err_reg_ctl:
+ __devinet_sysctl_unregister(net, dflt, NETCONFA_IFINDEX_DEFAULT);
+err_reg_dflt:
+ __devinet_sysctl_unregister(net, all, NETCONFA_IFINDEX_ALL);
+err_reg_all:
+ kfree(tbl);
+err_alloc_ctl:
+#endif
+ kfree(dflt);
+err_alloc_dflt:
+ kfree(all);
+err_alloc_all:
+ kfree(net->ipv4.inet_addr_lst);
+err_alloc_hash:
+ return err;
}
-EXPORT_SYMBOL(in_dev_finish_destroy);
-EXPORT_SYMBOL(inet_select_addr);
-EXPORT_SYMBOL(inetdev_by_index);
-EXPORT_SYMBOL(register_inetaddr_notifier);
-EXPORT_SYMBOL(unregister_inetaddr_notifier);
+static __net_exit void devinet_exit_net(struct net *net)
+{
+#ifdef CONFIG_SYSCTL
+ const struct ctl_table *tbl;
+#endif
+
+ cancel_delayed_work_sync(&net->ipv4.addr_chk_work);
+
+#ifdef CONFIG_SYSCTL
+ tbl = net->ipv4.forw_hdr->ctl_table_arg;
+ unregister_net_sysctl_table(net->ipv4.forw_hdr);
+ __devinet_sysctl_unregister(net, net->ipv4.devconf_dflt,
+ NETCONFA_IFINDEX_DEFAULT);
+ __devinet_sysctl_unregister(net, net->ipv4.devconf_all,
+ NETCONFA_IFINDEX_ALL);
+ kfree(tbl);
+#endif
+ kfree(net->ipv4.devconf_dflt);
+ kfree(net->ipv4.devconf_all);
+ kfree(net->ipv4.inet_addr_lst);
+}
+
+static __net_initdata struct pernet_operations devinet_ops = {
+ .init = devinet_init_net,
+ .exit = devinet_exit_net,
+};
+
+static struct rtnl_af_ops inet_af_ops __read_mostly = {
+ .family = AF_INET,
+ .fill_link_af = inet_fill_link_af,
+ .get_link_af_size = inet_get_link_af_size,
+ .validate_link_af = inet_validate_link_af,
+ .set_link_af = inet_set_link_af,
+};
+
+static const struct rtnl_msg_handler devinet_rtnl_msg_handlers[] __initconst = {
+ {.protocol = PF_INET, .msgtype = RTM_NEWADDR, .doit = inet_rtm_newaddr,
+ .flags = RTNL_FLAG_DOIT_PERNET},
+ {.protocol = PF_INET, .msgtype = RTM_DELADDR, .doit = inet_rtm_deladdr,
+ .flags = RTNL_FLAG_DOIT_PERNET},
+ {.protocol = PF_INET, .msgtype = RTM_GETADDR, .dumpit = inet_dump_ifaddr,
+ .flags = RTNL_FLAG_DUMP_UNLOCKED | RTNL_FLAG_DUMP_SPLIT_NLM_DONE},
+ {.protocol = PF_INET, .msgtype = RTM_GETNETCONF,
+ .doit = inet_netconf_get_devconf, .dumpit = inet_netconf_dump_devconf,
+ .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED},
+ {.owner = THIS_MODULE, .protocol = PF_INET, .msgtype = RTM_GETMULTICAST,
+ .dumpit = inet_dump_ifmcaddr, .flags = RTNL_FLAG_DUMP_UNLOCKED},
+};
+
+void __init devinet_init(void)
+{
+ register_pernet_subsys(&devinet_ops);
+ register_netdevice_notifier(&ip_netdev_notifier);
+
+ if (rtnl_af_register(&inet_af_ops))
+ panic("Unable to register inet_af_ops\n");
+
+ rtnl_register_many(devinet_rtnl_msg_handlers);
+}
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index f2c6776ea0e6..2c922afadb8f 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -1,242 +1,761 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#define pr_fmt(fmt) "IPsec: " fmt
+
+#include <crypto/aead.h>
+#include <crypto/authenc.h>
#include <linux/err.h>
#include <linux/module.h>
#include <net/ip.h>
#include <net/xfrm.h>
#include <net/esp.h>
-#include <asm/scatterlist.h>
-#include <linux/crypto.h>
+#include <linux/scatterlist.h>
#include <linux/kernel.h>
#include <linux/pfkeyv2.h>
-#include <linux/random.h>
+#include <linux/rtnetlink.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/in6.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/udp.h>
+#include <net/tcp.h>
+#include <net/espintcp.h>
+#include <linux/skbuff_ref.h>
-static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
+#include <linux/highmem.h>
+
+struct esp_skb_cb {
+ struct xfrm_skb_cb xfrm;
+ void *tmp;
+};
+
+struct esp_output_extra {
+ __be32 seqhi;
+ u32 esphoff;
+};
+
+#define ESP_SKB_CB(__skb) ((struct esp_skb_cb *)&((__skb)->cb[0]))
+
+/*
+ * Allocate an AEAD request structure with extra space for SG and IV.
+ *
+ * For alignment considerations the IV is placed at the front, followed
+ * by the request and finally the SG list.
+ *
+ * TODO: Use spare space in skb for this where possible.
+ */
+static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags, int extralen)
+{
+ unsigned int len;
+
+ len = extralen;
+
+ len += crypto_aead_ivsize(aead);
+
+ if (len) {
+ len += crypto_aead_alignmask(aead) &
+ ~(crypto_tfm_ctx_alignment() - 1);
+ len = ALIGN(len, crypto_tfm_ctx_alignment());
+ }
+
+ len += sizeof(struct aead_request) + crypto_aead_reqsize(aead);
+ len = ALIGN(len, __alignof__(struct scatterlist));
+
+ len += sizeof(struct scatterlist) * nfrags;
+
+ return kmalloc(len, GFP_ATOMIC);
+}
+
+static inline void *esp_tmp_extra(void *tmp)
+{
+ return PTR_ALIGN(tmp, __alignof__(struct esp_output_extra));
+}
+
+static inline u8 *esp_tmp_iv(struct crypto_aead *aead, void *tmp, int extralen)
{
+ return crypto_aead_ivsize(aead) ?
+ PTR_ALIGN((u8 *)tmp + extralen,
+ crypto_aead_alignmask(aead) + 1) : tmp + extralen;
+}
+
+static inline struct aead_request *esp_tmp_req(struct crypto_aead *aead, u8 *iv)
+{
+ struct aead_request *req;
+
+ req = (void *)PTR_ALIGN(iv + crypto_aead_ivsize(aead),
+ crypto_tfm_ctx_alignment());
+ aead_request_set_tfm(req, aead);
+ return req;
+}
+
+static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead,
+ struct aead_request *req)
+{
+ return (void *)ALIGN((unsigned long)(req + 1) +
+ crypto_aead_reqsize(aead),
+ __alignof__(struct scatterlist));
+}
+
+static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb)
+{
+ struct crypto_aead *aead = x->data;
+ int extralen = 0;
+ u8 *iv;
+ struct aead_request *req;
+ struct scatterlist *sg;
+
+ if (x->props.flags & XFRM_STATE_ESN)
+ extralen += sizeof(struct esp_output_extra);
+
+ iv = esp_tmp_iv(aead, tmp, extralen);
+ req = esp_tmp_req(aead, iv);
+
+ /* Unref skb_frag_pages in the src scatterlist if necessary.
+ * Skip the first sg which comes from skb->data.
+ */
+ if (req->src != req->dst)
+ for (sg = sg_next(req->src); sg; sg = sg_next(sg))
+ skb_page_unref(page_to_netmem(sg_page(sg)),
+ skb->pp_recycle);
+}
+
+#ifdef CONFIG_INET_ESPINTCP
+static struct sock *esp_find_tcp_sk(struct xfrm_state *x)
+{
+ struct xfrm_encap_tmpl *encap = x->encap;
+ struct net *net = xs_net(x);
+ __be16 sport, dport;
+ struct sock *sk;
+
+ spin_lock_bh(&x->lock);
+ sport = encap->encap_sport;
+ dport = encap->encap_dport;
+ spin_unlock_bh(&x->lock);
+
+ sk = inet_lookup_established(net, x->id.daddr.a4, dport,
+ x->props.saddr.a4, sport, 0);
+ if (!sk)
+ return ERR_PTR(-ENOENT);
+
+ if (!tcp_is_ulp_esp(sk)) {
+ sock_put(sk);
+ return ERR_PTR(-EINVAL);
+ }
+
+ return sk;
+}
+
+static int esp_output_tcp_finish(struct xfrm_state *x, struct sk_buff *skb)
+{
+ struct sock *sk;
int err;
- struct iphdr *top_iph;
- struct ip_esp_hdr *esph;
- struct crypto_blkcipher *tfm;
- struct blkcipher_desc desc;
- struct esp_data *esp;
- struct sk_buff *trailer;
- int blksize;
- int clen;
- int alen;
- int nfrags;
- /* Strip IP+ESP header. */
- __skb_pull(skb, skb->h.raw - skb->data);
- /* Now skb is pure payload to encrypt */
+ rcu_read_lock();
- err = -ENOMEM;
+ sk = esp_find_tcp_sk(x);
+ err = PTR_ERR_OR_ZERO(sk);
+ if (err) {
+ kfree_skb(skb);
+ goto out;
+ }
- /* Round to block size */
- clen = skb->len;
+ bh_lock_sock(sk);
+ if (sock_owned_by_user(sk))
+ err = espintcp_queue_out(sk, skb);
+ else
+ err = espintcp_push_skb(sk, skb);
+ bh_unlock_sock(sk);
- esp = x->data;
- alen = esp->auth.icv_trunc_len;
- tfm = esp->conf.tfm;
- desc.tfm = tfm;
- desc.flags = 0;
- blksize = ALIGN(crypto_blkcipher_blocksize(tfm), 4);
- clen = ALIGN(clen + 2, blksize);
- if (esp->conf.padlen)
- clen = ALIGN(clen, esp->conf.padlen);
+ sock_put(sk);
- if ((nfrags = skb_cow_data(skb, clen-skb->len+alen, &trailer)) < 0)
- goto error;
+out:
+ rcu_read_unlock();
+ return err;
+}
- /* Fill padding... */
- do {
- int i;
- for (i=0; i<clen-skb->len - 2; i++)
- *(u8*)(trailer->tail + i) = i+1;
- } while (0);
- *(u8*)(trailer->tail + clen-skb->len - 2) = (clen - skb->len)-2;
- pskb_put(skb, trailer, clen - skb->len);
-
- __skb_push(skb, skb->data - skb->nh.raw);
- top_iph = skb->nh.iph;
- esph = (struct ip_esp_hdr *)(skb->nh.raw + top_iph->ihl*4);
- top_iph->tot_len = htons(skb->len + alen);
- *(u8*)(trailer->tail - 1) = top_iph->protocol;
-
- /* this is non-NULL only with UDP Encapsulation */
- if (x->encap) {
- struct xfrm_encap_tmpl *encap = x->encap;
- struct udphdr *uh;
- __be32 *udpdata32;
+static int esp_output_tcp_encap_cb(struct net *net, struct sock *sk,
+ struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ struct xfrm_state *x = dst->xfrm;
- uh = (struct udphdr *)esph;
- uh->source = encap->encap_sport;
- uh->dest = encap->encap_dport;
- uh->len = htons(skb->len + alen - top_iph->ihl*4);
- uh->check = 0;
+ return esp_output_tcp_finish(x, skb);
+}
- switch (encap->encap_type) {
- default:
- case UDP_ENCAP_ESPINUDP:
- esph = (struct ip_esp_hdr *)(uh + 1);
- break;
- case UDP_ENCAP_ESPINUDP_NON_IKE:
- udpdata32 = (__be32 *)(uh + 1);
- udpdata32[0] = udpdata32[1] = 0;
- esph = (struct ip_esp_hdr *)(udpdata32 + 2);
- break;
+static int esp_output_tail_tcp(struct xfrm_state *x, struct sk_buff *skb)
+{
+ int err;
+
+ local_bh_disable();
+ err = xfrm_trans_queue_net(xs_net(x), skb, esp_output_tcp_encap_cb);
+ local_bh_enable();
+
+ /* EINPROGRESS just happens to do the right thing. It
+ * actually means that the skb has been consumed and
+ * isn't coming back.
+ */
+ return err ?: -EINPROGRESS;
+}
+#else
+static int esp_output_tail_tcp(struct xfrm_state *x, struct sk_buff *skb)
+{
+ WARN_ON(1);
+ return -EOPNOTSUPP;
+}
+#endif
+
+static void esp_output_done(void *data, int err)
+{
+ struct sk_buff *skb = data;
+ struct xfrm_offload *xo = xfrm_offload(skb);
+ void *tmp;
+ struct xfrm_state *x;
+
+ if (xo && (xo->flags & XFRM_DEV_RESUME)) {
+ struct sec_path *sp = skb_sec_path(skb);
+
+ x = sp->xvec[sp->len - 1];
+ } else {
+ x = skb_dst(skb)->xfrm;
+ }
+
+ tmp = ESP_SKB_CB(skb)->tmp;
+ esp_ssg_unref(x, tmp, skb);
+ kfree(tmp);
+
+ if (xo && (xo->flags & XFRM_DEV_RESUME)) {
+ if (err) {
+ XFRM_INC_STATS(xs_net(x), LINUX_MIB_XFRMOUTSTATEPROTOERROR);
+ kfree_skb(skb);
+ return;
}
- top_iph->protocol = IPPROTO_UDP;
- } else
- top_iph->protocol = IPPROTO_ESP;
+ skb_push(skb, skb->data - skb_mac_header(skb));
+ secpath_reset(skb);
+ xfrm_dev_resume(skb);
+ } else {
+ if (!err &&
+ x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP)
+ esp_output_tail_tcp(x, skb);
+ else
+ xfrm_output_resume(skb_to_full_sk(skb), skb, err);
+ }
+}
+
+/* Move ESP header back into place. */
+static void esp_restore_header(struct sk_buff *skb, unsigned int offset)
+{
+ struct ip_esp_hdr *esph = (void *)(skb->data + offset);
+ void *tmp = ESP_SKB_CB(skb)->tmp;
+ __be32 *seqhi = esp_tmp_extra(tmp);
+
+ esph->seq_no = esph->spi;
+ esph->spi = *seqhi;
+}
+
+static void esp_output_restore_header(struct sk_buff *skb)
+{
+ void *tmp = ESP_SKB_CB(skb)->tmp;
+ struct esp_output_extra *extra = esp_tmp_extra(tmp);
+
+ esp_restore_header(skb, skb_transport_offset(skb) + extra->esphoff -
+ sizeof(__be32));
+}
+
+static struct ip_esp_hdr *esp_output_set_extra(struct sk_buff *skb,
+ struct xfrm_state *x,
+ struct ip_esp_hdr *esph,
+ struct esp_output_extra *extra)
+{
+ /* For ESN we move the header forward by 4 bytes to
+ * accommodate the high bits. We will move it back after
+ * encryption.
+ */
+ if ((x->props.flags & XFRM_STATE_ESN)) {
+ __u32 seqhi;
+ struct xfrm_offload *xo = xfrm_offload(skb);
+
+ if (xo)
+ seqhi = xo->seq.hi;
+ else
+ seqhi = XFRM_SKB_CB(skb)->seq.output.hi;
+
+ extra->esphoff = (unsigned char *)esph -
+ skb_transport_header(skb);
+ esph = (struct ip_esp_hdr *)((unsigned char *)esph - 4);
+ extra->seqhi = esph->spi;
+ esph->seq_no = htonl(seqhi);
+ }
esph->spi = x->id.spi;
- esph->seq_no = htonl(++x->replay.oseq);
- xfrm_aevent_doreplay(x);
- if (esp->conf.ivlen) {
- if (unlikely(!esp->conf.ivinitted)) {
- get_random_bytes(esp->conf.ivec, esp->conf.ivlen);
- esp->conf.ivinitted = 1;
- }
- crypto_blkcipher_set_iv(tfm, esp->conf.ivec, esp->conf.ivlen);
+ return esph;
+}
+
+static void esp_output_done_esn(void *data, int err)
+{
+ struct sk_buff *skb = data;
+
+ esp_output_restore_header(skb);
+ esp_output_done(data, err);
+}
+
+static struct ip_esp_hdr *esp_output_udp_encap(struct sk_buff *skb,
+ int encap_type,
+ struct esp_info *esp,
+ __be16 sport,
+ __be16 dport)
+{
+ struct udphdr *uh;
+ unsigned int len;
+ struct xfrm_offload *xo = xfrm_offload(skb);
+
+ len = skb->len + esp->tailen - skb_transport_offset(skb);
+ if (len + sizeof(struct iphdr) > IP_MAX_MTU)
+ return ERR_PTR(-EMSGSIZE);
+
+ uh = (struct udphdr *)esp->esph;
+ uh->source = sport;
+ uh->dest = dport;
+ uh->len = htons(len);
+ uh->check = 0;
+
+ /* For IPv4 ESP with UDP encapsulation, if xo is not null, the skb is in the crypto offload
+ * data path, which means that esp_output_udp_encap is called outside of the XFRM stack.
+ * In this case, the mac header doesn't point to the IPv4 protocol field, so don't set it.
+ */
+ if (!xo || encap_type != UDP_ENCAP_ESPINUDP)
+ *skb_mac_header(skb) = IPPROTO_UDP;
+
+ return (struct ip_esp_hdr *)(uh + 1);
+}
+
+#ifdef CONFIG_INET_ESPINTCP
+static struct ip_esp_hdr *esp_output_tcp_encap(struct xfrm_state *x,
+ struct sk_buff *skb,
+ struct esp_info *esp)
+{
+ __be16 *lenp = (void *)esp->esph;
+ struct ip_esp_hdr *esph;
+ unsigned int len;
+ struct sock *sk;
+
+ len = skb->len + esp->tailen - skb_transport_offset(skb);
+ if (len > IP_MAX_MTU)
+ return ERR_PTR(-EMSGSIZE);
+
+ rcu_read_lock();
+ sk = esp_find_tcp_sk(x);
+ rcu_read_unlock();
+
+ if (IS_ERR(sk))
+ return ERR_CAST(sk);
+
+ sock_put(sk);
+
+ *lenp = htons(len);
+ esph = (struct ip_esp_hdr *)(lenp + 1);
+
+ return esph;
+}
+#else
+static struct ip_esp_hdr *esp_output_tcp_encap(struct xfrm_state *x,
+ struct sk_buff *skb,
+ struct esp_info *esp)
+{
+ return ERR_PTR(-EOPNOTSUPP);
+}
+#endif
+
+static int esp_output_encap(struct xfrm_state *x, struct sk_buff *skb,
+ struct esp_info *esp)
+{
+ struct xfrm_encap_tmpl *encap = x->encap;
+ struct ip_esp_hdr *esph;
+ __be16 sport, dport;
+ int encap_type;
+
+ spin_lock_bh(&x->lock);
+ sport = encap->encap_sport;
+ dport = encap->encap_dport;
+ encap_type = encap->encap_type;
+ spin_unlock_bh(&x->lock);
+
+ switch (encap_type) {
+ default:
+ case UDP_ENCAP_ESPINUDP:
+ esph = esp_output_udp_encap(skb, encap_type, esp, sport, dport);
+ break;
+ case TCP_ENCAP_ESPINTCP:
+ esph = esp_output_tcp_encap(x, skb, esp);
+ break;
}
- do {
- struct scatterlist *sg = &esp->sgbuf[0];
+ if (IS_ERR(esph))
+ return PTR_ERR(esph);
+
+ esp->esph = esph;
+
+ return 0;
+}
+
+int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp)
+{
+ u8 *tail;
+ int nfrags;
+ int esph_offset;
+ struct page *page;
+ struct sk_buff *trailer;
+ int tailen = esp->tailen;
+
+ /* this is non-NULL only with TCP/UDP Encapsulation */
+ if (x->encap) {
+ int err = esp_output_encap(x, skb, esp);
+
+ if (err < 0)
+ return err;
+ }
+
+ if (ALIGN(tailen, L1_CACHE_BYTES) > PAGE_SIZE ||
+ ALIGN(skb->data_len, L1_CACHE_BYTES) > PAGE_SIZE)
+ goto cow;
+
+ if (!skb_cloned(skb)) {
+ if (tailen <= skb_tailroom(skb)) {
+ nfrags = 1;
+ trailer = skb;
+ tail = skb_tail_pointer(trailer);
+
+ goto skip_cow;
+ } else if ((skb_shinfo(skb)->nr_frags < MAX_SKB_FRAGS)
+ && !skb_has_frag_list(skb)) {
+ int allocsize;
+ struct sock *sk = skb->sk;
+ struct page_frag *pfrag = &x->xfrag;
+
+ esp->inplace = false;
+
+ allocsize = ALIGN(tailen, L1_CACHE_BYTES);
+
+ spin_lock_bh(&x->lock);
- if (unlikely(nfrags > ESP_NUM_FAST_SG)) {
- sg = kmalloc(sizeof(struct scatterlist)*nfrags, GFP_ATOMIC);
- if (!sg)
- goto error;
+ if (unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) {
+ spin_unlock_bh(&x->lock);
+ goto cow;
+ }
+
+ page = pfrag->page;
+ get_page(page);
+
+ tail = page_address(page) + pfrag->offset;
+
+ esp_output_fill_trailer(tail, esp->tfclen, esp->plen, esp->proto);
+
+ nfrags = skb_shinfo(skb)->nr_frags;
+
+ __skb_fill_page_desc(skb, nfrags, page, pfrag->offset,
+ tailen);
+ skb_shinfo(skb)->nr_frags = ++nfrags;
+
+ pfrag->offset = pfrag->offset + allocsize;
+
+ spin_unlock_bh(&x->lock);
+
+ nfrags++;
+
+ skb_len_add(skb, tailen);
+ if (sk && sk_fullsock(sk))
+ refcount_add(tailen, &sk->sk_wmem_alloc);
+
+ goto out;
}
- skb_to_sgvec(skb, sg, esph->enc_data+esp->conf.ivlen-skb->data, clen);
- err = crypto_blkcipher_encrypt(&desc, sg, sg, clen);
- if (unlikely(sg != &esp->sgbuf[0]))
- kfree(sg);
- } while (0);
+ }
- if (unlikely(err))
+cow:
+ esph_offset = (unsigned char *)esp->esph - skb_transport_header(skb);
+
+ nfrags = skb_cow_data(skb, tailen, &trailer);
+ if (nfrags < 0)
+ goto out;
+ tail = skb_tail_pointer(trailer);
+ esp->esph = (struct ip_esp_hdr *)(skb_transport_header(skb) + esph_offset);
+
+skip_cow:
+ esp_output_fill_trailer(tail, esp->tfclen, esp->plen, esp->proto);
+ pskb_put(skb, trailer, tailen);
+
+out:
+ return nfrags;
+}
+EXPORT_SYMBOL_GPL(esp_output_head);
+
+int esp_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp)
+{
+ u8 *iv;
+ int alen;
+ void *tmp;
+ int ivlen;
+ int assoclen;
+ int extralen;
+ struct page *page;
+ struct ip_esp_hdr *esph;
+ struct crypto_aead *aead;
+ struct aead_request *req;
+ struct scatterlist *sg, *dsg;
+ struct esp_output_extra *extra;
+ int err = -ENOMEM;
+
+ assoclen = sizeof(struct ip_esp_hdr);
+ extralen = 0;
+
+ if (x->props.flags & XFRM_STATE_ESN) {
+ extralen += sizeof(*extra);
+ assoclen += sizeof(__be32);
+ }
+
+ aead = x->data;
+ alen = crypto_aead_authsize(aead);
+ ivlen = crypto_aead_ivsize(aead);
+
+ tmp = esp_alloc_tmp(aead, esp->nfrags + 2, extralen);
+ if (!tmp)
goto error;
- if (esp->conf.ivlen) {
- memcpy(esph->enc_data, esp->conf.ivec, esp->conf.ivlen);
- crypto_blkcipher_get_iv(tfm, esp->conf.ivec, esp->conf.ivlen);
+ extra = esp_tmp_extra(tmp);
+ iv = esp_tmp_iv(aead, tmp, extralen);
+ req = esp_tmp_req(aead, iv);
+ sg = esp_req_sg(aead, req);
+
+ if (esp->inplace)
+ dsg = sg;
+ else
+ dsg = &sg[esp->nfrags];
+
+ esph = esp_output_set_extra(skb, x, esp->esph, extra);
+ esp->esph = esph;
+
+ sg_init_table(sg, esp->nfrags);
+ err = skb_to_sgvec(skb, sg,
+ (unsigned char *)esph - skb->data,
+ assoclen + ivlen + esp->clen + alen);
+ if (unlikely(err < 0))
+ goto error_free;
+
+ if (!esp->inplace) {
+ int allocsize;
+ struct page_frag *pfrag = &x->xfrag;
+
+ allocsize = ALIGN(skb->data_len, L1_CACHE_BYTES);
+
+ spin_lock_bh(&x->lock);
+ if (unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) {
+ spin_unlock_bh(&x->lock);
+ goto error_free;
+ }
+
+ skb_shinfo(skb)->nr_frags = 1;
+
+ page = pfrag->page;
+ get_page(page);
+ /* replace page frags in skb with new page */
+ __skb_fill_page_desc(skb, 0, page, pfrag->offset, skb->data_len);
+ pfrag->offset = pfrag->offset + allocsize;
+ spin_unlock_bh(&x->lock);
+
+ sg_init_table(dsg, skb_shinfo(skb)->nr_frags + 1);
+ err = skb_to_sgvec(skb, dsg,
+ (unsigned char *)esph - skb->data,
+ assoclen + ivlen + esp->clen + alen);
+ if (unlikely(err < 0))
+ goto error_free;
}
- if (esp->auth.icv_full_len) {
- err = esp_mac_digest(esp, skb, (u8 *)esph - skb->data,
- sizeof(*esph) + esp->conf.ivlen + clen);
- memcpy(pskb_put(skb, trailer, alen), esp->auth.work_icv, alen);
+ if ((x->props.flags & XFRM_STATE_ESN))
+ aead_request_set_callback(req, 0, esp_output_done_esn, skb);
+ else
+ aead_request_set_callback(req, 0, esp_output_done, skb);
+
+ aead_request_set_crypt(req, sg, dsg, ivlen + esp->clen, iv);
+ aead_request_set_ad(req, assoclen);
+
+ memset(iv, 0, ivlen);
+ memcpy(iv + ivlen - min(ivlen, 8), (u8 *)&esp->seqno + 8 - min(ivlen, 8),
+ min(ivlen, 8));
+
+ ESP_SKB_CB(skb)->tmp = tmp;
+ err = crypto_aead_encrypt(req);
+
+ switch (err) {
+ case -EINPROGRESS:
+ goto error;
+
+ case -ENOSPC:
+ err = NET_XMIT_DROP;
+ break;
+
+ case 0:
+ if ((x->props.flags & XFRM_STATE_ESN))
+ esp_output_restore_header(skb);
}
- ip_send_check(top_iph);
+ if (sg != dsg)
+ esp_ssg_unref(x, tmp, skb);
+
+ if (!err && x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP)
+ err = esp_output_tail_tcp(x, skb);
+error_free:
+ kfree(tmp);
error:
return err;
}
+EXPORT_SYMBOL_GPL(esp_output_tail);
-/*
- * Note: detecting truncated vs. non-truncated authentication data is very
- * expensive, so we only support truncated data, which is the recommended
- * and common case.
- */
-static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
+static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
{
- struct iphdr *iph;
+ int alen;
+ int blksize;
struct ip_esp_hdr *esph;
- struct esp_data *esp = x->data;
- struct crypto_blkcipher *tfm = esp->conf.tfm;
- struct blkcipher_desc desc = { .tfm = tfm };
- struct sk_buff *trailer;
- int blksize = ALIGN(crypto_blkcipher_blocksize(tfm), 4);
- int alen = esp->auth.icv_trunc_len;
- int elen = skb->len - sizeof(struct ip_esp_hdr) - esp->conf.ivlen - alen;
- int nfrags;
- int ihl;
- u8 nexthdr[2];
- struct scatterlist *sg;
- int padlen;
- int err;
+ struct crypto_aead *aead;
+ struct esp_info esp;
- if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr)))
- goto out;
+ esp.inplace = true;
- if (elen <= 0 || (elen & (blksize-1)))
- goto out;
+ esp.proto = *skb_mac_header(skb);
+ *skb_mac_header(skb) = IPPROTO_ESP;
- /* If integrity check is required, do this. */
- if (esp->auth.icv_full_len) {
- u8 sum[alen];
+ /* skb is pure payload to encrypt */
- err = esp_mac_digest(esp, skb, 0, skb->len - alen);
- if (err)
- goto out;
+ aead = x->data;
+ alen = crypto_aead_authsize(aead);
- if (skb_copy_bits(skb, skb->len - alen, sum, alen))
- BUG();
+ esp.tfclen = 0;
+ if (x->tfcpad) {
+ struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb);
+ u32 padto;
- if (unlikely(memcmp(esp->auth.work_icv, sum, alen))) {
- x->stats.integrity_failed++;
- goto out;
- }
+ padto = min(x->tfcpad, xfrm_state_mtu(x, dst->child_mtu_cached));
+ if (skb->len < padto)
+ esp.tfclen = padto - skb->len;
}
+ blksize = ALIGN(crypto_aead_blocksize(aead), 4);
+ esp.clen = ALIGN(skb->len + 2 + esp.tfclen, blksize);
+ esp.plen = esp.clen - skb->len - esp.tfclen;
+ esp.tailen = esp.tfclen + esp.plen + alen;
- if ((nfrags = skb_cow_data(skb, 0, &trailer)) < 0)
- goto out;
+ esp.esph = ip_esp_hdr(skb);
- skb->ip_summed = CHECKSUM_NONE;
+ esp.nfrags = esp_output_head(x, skb, &esp);
+ if (esp.nfrags < 0)
+ return esp.nfrags;
- esph = (struct ip_esp_hdr*)skb->data;
+ esph = esp.esph;
+ esph->spi = x->id.spi;
- /* Get ivec. This can be wrong, check against another impls. */
- if (esp->conf.ivlen)
- crypto_blkcipher_set_iv(tfm, esph->enc_data, esp->conf.ivlen);
+ esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
+ esp.seqno = cpu_to_be64(XFRM_SKB_CB(skb)->seq.output.low +
+ ((u64)XFRM_SKB_CB(skb)->seq.output.hi << 32));
- sg = &esp->sgbuf[0];
+ skb_push(skb, -skb_network_offset(skb));
- if (unlikely(nfrags > ESP_NUM_FAST_SG)) {
- sg = kmalloc(sizeof(struct scatterlist)*nfrags, GFP_ATOMIC);
- if (!sg)
- goto out;
- }
- skb_to_sgvec(skb, sg, sizeof(struct ip_esp_hdr) + esp->conf.ivlen, elen);
- err = crypto_blkcipher_decrypt(&desc, sg, sg, elen);
- if (unlikely(sg != &esp->sgbuf[0]))
- kfree(sg);
- if (unlikely(err))
- return err;
+ return esp_output_tail(x, skb, &esp);
+}
- if (skb_copy_bits(skb, skb->len-alen-2, nexthdr, 2))
+static inline int esp_remove_trailer(struct sk_buff *skb)
+{
+ struct xfrm_state *x = xfrm_input_state(skb);
+ struct crypto_aead *aead = x->data;
+ int alen, hlen, elen;
+ int padlen, trimlen;
+ __wsum csumdiff;
+ u8 nexthdr[2];
+ int ret;
+
+ alen = crypto_aead_authsize(aead);
+ hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead);
+ elen = skb->len - hlen;
+
+ if (skb_copy_bits(skb, skb->len - alen - 2, nexthdr, 2))
BUG();
+ ret = -EINVAL;
padlen = nexthdr[0];
- if (padlen+2 >= elen)
+ if (padlen + 2 + alen >= elen) {
+ net_dbg_ratelimited("ipsec esp packet is garbage padlen=%d, elen=%d\n",
+ padlen + 2, elen - alen);
goto out;
+ }
+
+ trimlen = alen + padlen + 2;
+ if (skb->ip_summed == CHECKSUM_COMPLETE) {
+ csumdiff = skb_checksum(skb, skb->len - trimlen, trimlen, 0);
+ skb->csum = csum_block_sub(skb->csum, csumdiff,
+ skb->len - trimlen);
+ }
+ ret = pskb_trim(skb, skb->len - trimlen);
+ if (unlikely(ret))
+ return ret;
+
+ ret = nexthdr[1];
- /* ... check padding bits here. Silly. :-) */
+out:
+ return ret;
+}
- iph = skb->nh.iph;
+int esp_input_done2(struct sk_buff *skb, int err)
+{
+ const struct iphdr *iph;
+ struct xfrm_state *x = xfrm_input_state(skb);
+ struct xfrm_offload *xo = xfrm_offload(skb);
+ struct crypto_aead *aead = x->data;
+ int hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead);
+ int ihl;
+
+ if (!xo || !(xo->flags & CRYPTO_DONE))
+ kfree(ESP_SKB_CB(skb)->tmp);
+
+ if (unlikely(err))
+ goto out;
+
+ err = esp_remove_trailer(skb);
+ if (unlikely(err < 0))
+ goto out;
+
+ iph = ip_hdr(skb);
ihl = iph->ihl * 4;
if (x->encap) {
struct xfrm_encap_tmpl *encap = x->encap;
- struct udphdr *uh = (void *)(skb->nh.raw + ihl);
+ struct tcphdr *th = (void *)(skb_network_header(skb) + ihl);
+ struct udphdr *uh = (void *)(skb_network_header(skb) + ihl);
+ __be16 source;
+
+ switch (x->encap->encap_type) {
+ case TCP_ENCAP_ESPINTCP:
+ source = th->source;
+ break;
+ case UDP_ENCAP_ESPINUDP:
+ source = uh->source;
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ err = -EINVAL;
+ goto out;
+ }
/*
* 1) if the NAT-T peer's IP or port changed then
- * advertize the change to the keying daemon.
+ * advertise the change to the keying daemon.
* This is an inbound SA, so just compare
* SRC ports.
*/
if (iph->saddr != x->props.saddr.a4 ||
- uh->source != encap->encap_sport) {
+ source != encap->encap_sport) {
xfrm_address_t ipaddr;
ipaddr.a4 = iph->saddr;
- km_new_mapping(x, &ipaddr, uh->source);
-
+ km_new_mapping(x, &ipaddr, source);
+
/* XXX: perhaps add an extra
* policy check here, to see
* if we should allow or
@@ -245,7 +764,7 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
* address/port.
*/
}
-
+
/*
* 2) ignore UDP/TCP checksums in case
* of NAT-T in Transport Mode, or
@@ -253,206 +772,423 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
* as per draft-ietf-ipsec-udp-encaps-06,
* section 3.1.2
*/
- if (x->props.mode == XFRM_MODE_TRANSPORT ||
- x->props.mode == XFRM_MODE_BEET)
+ if (x->props.mode == XFRM_MODE_TRANSPORT)
skb->ip_summed = CHECKSUM_UNNECESSARY;
}
- iph->protocol = nexthdr[1];
- pskb_trim(skb, skb->len - alen - padlen - 2);
- skb->h.raw = __skb_pull(skb, sizeof(*esph) + esp->conf.ivlen) - ihl;
+ skb_pull_rcsum(skb, hlen);
+ if (x->props.mode == XFRM_MODE_TUNNEL ||
+ x->props.mode == XFRM_MODE_IPTFS)
+ skb_reset_transport_header(skb);
+ else
+ skb_set_transport_header(skb, -ihl);
- return 0;
+ /* RFC4303: Drop dummy packets without any error */
+ if (err == IPPROTO_NONE)
+ err = -EINVAL;
out:
- return -EINVAL;
+ return err;
}
+EXPORT_SYMBOL_GPL(esp_input_done2);
-static u32 esp4_get_max_size(struct xfrm_state *x, int mtu)
+static void esp_input_done(void *data, int err)
{
- struct esp_data *esp = x->data;
- u32 blksize = ALIGN(crypto_blkcipher_blocksize(esp->conf.tfm), 4);
- int enclen = 0;
+ struct sk_buff *skb = data;
- switch (x->props.mode) {
- case XFRM_MODE_TUNNEL:
- mtu = ALIGN(mtu +2, blksize);
- break;
- default:
- case XFRM_MODE_TRANSPORT:
- /* The worst case */
- mtu = ALIGN(mtu + 2, 4) + blksize - 4;
- break;
- case XFRM_MODE_BEET:
- /* The worst case. */
- enclen = IPV4_BEET_PHMAXLEN;
- mtu = ALIGN(mtu + enclen + 2, blksize);
- break;
+ xfrm_input_resume(skb, esp_input_done2(skb, err));
+}
+
+static void esp_input_restore_header(struct sk_buff *skb)
+{
+ esp_restore_header(skb, 0);
+ __skb_pull(skb, 4);
+}
+
+static void esp_input_set_header(struct sk_buff *skb, __be32 *seqhi)
+{
+ struct xfrm_state *x = xfrm_input_state(skb);
+ struct ip_esp_hdr *esph;
+
+ /* For ESN we move the header forward by 4 bytes to
+ * accommodate the high bits. We will move it back after
+ * decryption.
+ */
+ if ((x->props.flags & XFRM_STATE_ESN)) {
+ esph = skb_push(skb, 4);
+ *seqhi = esph->spi;
+ esph->spi = esph->seq_no;
+ esph->seq_no = XFRM_SKB_CB(skb)->seq.input.hi;
}
+}
- if (esp->conf.padlen)
- mtu = ALIGN(mtu, esp->conf.padlen);
+static void esp_input_done_esn(void *data, int err)
+{
+ struct sk_buff *skb = data;
- return mtu + x->props.header_len + esp->auth.icv_trunc_len - enclen;
+ esp_input_restore_header(skb);
+ esp_input_done(data, err);
}
-static void esp4_err(struct sk_buff *skb, u32 info)
+/*
+ * Note: detecting truncated vs. non-truncated authentication data is very
+ * expensive, so we only support truncated data, which is the recommended
+ * and common case.
+ */
+static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
{
- struct iphdr *iph = (struct iphdr*)skb->data;
- struct ip_esp_hdr *esph = (struct ip_esp_hdr*)(skb->data+(iph->ihl<<2));
+ struct crypto_aead *aead = x->data;
+ struct aead_request *req;
+ struct sk_buff *trailer;
+ int ivlen = crypto_aead_ivsize(aead);
+ int elen = skb->len - sizeof(struct ip_esp_hdr) - ivlen;
+ int nfrags;
+ int assoclen;
+ int seqhilen;
+ __be32 *seqhi;
+ void *tmp;
+ u8 *iv;
+ struct scatterlist *sg;
+ int err = -EINVAL;
+
+ if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr) + ivlen))
+ goto out;
+
+ if (elen <= 0)
+ goto out;
+
+ assoclen = sizeof(struct ip_esp_hdr);
+ seqhilen = 0;
+
+ if (x->props.flags & XFRM_STATE_ESN) {
+ seqhilen += sizeof(__be32);
+ assoclen += seqhilen;
+ }
+
+ if (!skb_cloned(skb)) {
+ if (!skb_is_nonlinear(skb)) {
+ nfrags = 1;
+
+ goto skip_cow;
+ } else if (!skb_has_frag_list(skb)) {
+ nfrags = skb_shinfo(skb)->nr_frags;
+ nfrags++;
+
+ goto skip_cow;
+ }
+ }
+
+ err = skb_cow_data(skb, 0, &trailer);
+ if (err < 0)
+ goto out;
+
+ nfrags = err;
+
+skip_cow:
+ err = -ENOMEM;
+ tmp = esp_alloc_tmp(aead, nfrags, seqhilen);
+ if (!tmp)
+ goto out;
+
+ ESP_SKB_CB(skb)->tmp = tmp;
+ seqhi = esp_tmp_extra(tmp);
+ iv = esp_tmp_iv(aead, tmp, seqhilen);
+ req = esp_tmp_req(aead, iv);
+ sg = esp_req_sg(aead, req);
+
+ esp_input_set_header(skb, seqhi);
+
+ sg_init_table(sg, nfrags);
+ err = skb_to_sgvec(skb, sg, 0, skb->len);
+ if (unlikely(err < 0)) {
+ kfree(tmp);
+ goto out;
+ }
+
+ skb->ip_summed = CHECKSUM_NONE;
+
+ if ((x->props.flags & XFRM_STATE_ESN))
+ aead_request_set_callback(req, 0, esp_input_done_esn, skb);
+ else
+ aead_request_set_callback(req, 0, esp_input_done, skb);
+
+ aead_request_set_crypt(req, sg, sg, elen + ivlen, iv);
+ aead_request_set_ad(req, assoclen);
+
+ err = crypto_aead_decrypt(req);
+ if (err == -EINPROGRESS)
+ goto out;
+
+ if ((x->props.flags & XFRM_STATE_ESN))
+ esp_input_restore_header(skb);
+
+ err = esp_input_done2(skb, err);
+
+out:
+ return err;
+}
+
+static int esp4_err(struct sk_buff *skb, u32 info)
+{
+ struct net *net = dev_net(skb->dev);
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
+ struct ip_esp_hdr *esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2));
struct xfrm_state *x;
- if (skb->h.icmph->type != ICMP_DEST_UNREACH ||
- skb->h.icmph->code != ICMP_FRAG_NEEDED)
- return;
+ switch (icmp_hdr(skb)->type) {
+ case ICMP_DEST_UNREACH:
+ if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
+ return 0;
+ break;
+ case ICMP_REDIRECT:
+ break;
+ default:
+ return 0;
+ }
- x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET);
+ x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
+ esph->spi, IPPROTO_ESP, AF_INET);
if (!x)
- return;
- NETDEBUG(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n",
- ntohl(esph->spi), ntohl(iph->daddr));
+ return 0;
+
+ if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
+ ipv4_update_pmtu(skb, net, info, 0, IPPROTO_ESP);
+ else
+ ipv4_redirect(skb, net, 0, IPPROTO_ESP);
xfrm_state_put(x);
+
+ return 0;
}
static void esp_destroy(struct xfrm_state *x)
{
- struct esp_data *esp = x->data;
+ struct crypto_aead *aead = x->data;
- if (!esp)
+ if (!aead)
return;
- crypto_free_blkcipher(esp->conf.tfm);
- esp->conf.tfm = NULL;
- kfree(esp->conf.ivec);
- esp->conf.ivec = NULL;
- crypto_free_hash(esp->auth.tfm);
- esp->auth.tfm = NULL;
- kfree(esp->auth.work_icv);
- esp->auth.work_icv = NULL;
- kfree(esp);
+ crypto_free_aead(aead);
}
-static int esp_init_state(struct xfrm_state *x)
+static int esp_init_aead(struct xfrm_state *x, struct netlink_ext_ack *extack)
{
- struct esp_data *esp = NULL;
- struct crypto_blkcipher *tfm;
+ char aead_name[CRYPTO_MAX_ALG_NAME];
+ struct crypto_aead *aead;
+ int err;
- /* null auth and encryption can have zero length keys */
- if (x->aalg) {
- if (x->aalg->alg_key_len > 512)
+ if (snprintf(aead_name, CRYPTO_MAX_ALG_NAME, "%s(%s)",
+ x->geniv, x->aead->alg_name) >= CRYPTO_MAX_ALG_NAME) {
+ NL_SET_ERR_MSG(extack, "Algorithm name is too long");
+ return -ENAMETOOLONG;
+ }
+
+ aead = crypto_alloc_aead(aead_name, 0, 0);
+ err = PTR_ERR(aead);
+ if (IS_ERR(aead))
+ goto error;
+
+ x->data = aead;
+
+ err = crypto_aead_setkey(aead, x->aead->alg_key,
+ (x->aead->alg_key_len + 7) / 8);
+ if (err)
+ goto error;
+
+ err = crypto_aead_setauthsize(aead, x->aead->alg_icv_len / 8);
+ if (err)
+ goto error;
+
+ return 0;
+
+error:
+ NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
+ return err;
+}
+
+static int esp_init_authenc(struct xfrm_state *x,
+ struct netlink_ext_ack *extack)
+{
+ struct crypto_aead *aead;
+ struct crypto_authenc_key_param *param;
+ struct rtattr *rta;
+ char *key;
+ char *p;
+ char authenc_name[CRYPTO_MAX_ALG_NAME];
+ unsigned int keylen;
+ int err;
+
+ err = -ENAMETOOLONG;
+
+ if ((x->props.flags & XFRM_STATE_ESN)) {
+ if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME,
+ "%s%sauthencesn(%s,%s)%s",
+ x->geniv ?: "", x->geniv ? "(" : "",
+ x->aalg ? x->aalg->alg_name : "digest_null",
+ x->ealg->alg_name,
+ x->geniv ? ")" : "") >= CRYPTO_MAX_ALG_NAME) {
+ NL_SET_ERR_MSG(extack, "Algorithm name is too long");
goto error;
+ }
+ } else {
+ if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME,
+ "%s%sauthenc(%s,%s)%s",
+ x->geniv ?: "", x->geniv ? "(" : "",
+ x->aalg ? x->aalg->alg_name : "digest_null",
+ x->ealg->alg_name,
+ x->geniv ? ")" : "") >= CRYPTO_MAX_ALG_NAME) {
+ NL_SET_ERR_MSG(extack, "Algorithm name is too long");
+ goto error;
+ }
}
- if (x->ealg == NULL)
+
+ aead = crypto_alloc_aead(authenc_name, 0, 0);
+ err = PTR_ERR(aead);
+ if (IS_ERR(aead)) {
+ NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
+ goto error;
+ }
+
+ x->data = aead;
+
+ keylen = (x->aalg ? (x->aalg->alg_key_len + 7) / 8 : 0) +
+ (x->ealg->alg_key_len + 7) / 8 + RTA_SPACE(sizeof(*param));
+ err = -ENOMEM;
+ key = kmalloc(keylen, GFP_KERNEL);
+ if (!key)
goto error;
- esp = kzalloc(sizeof(*esp), GFP_KERNEL);
- if (esp == NULL)
- return -ENOMEM;
+ p = key;
+ rta = (void *)p;
+ rta->rta_type = CRYPTO_AUTHENC_KEYA_PARAM;
+ rta->rta_len = RTA_LENGTH(sizeof(*param));
+ param = RTA_DATA(rta);
+ p += RTA_SPACE(sizeof(*param));
if (x->aalg) {
struct xfrm_algo_desc *aalg_desc;
- struct crypto_hash *hash;
- esp->auth.key = x->aalg->alg_key;
- esp->auth.key_len = (x->aalg->alg_key_len+7)/8;
- hash = crypto_alloc_hash(x->aalg->alg_name, 0,
- CRYPTO_ALG_ASYNC);
- if (IS_ERR(hash))
- goto error;
-
- esp->auth.tfm = hash;
- if (crypto_hash_setkey(hash, esp->auth.key, esp->auth.key_len))
- goto error;
+ memcpy(p, x->aalg->alg_key, (x->aalg->alg_key_len + 7) / 8);
+ p += (x->aalg->alg_key_len + 7) / 8;
aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name, 0);
BUG_ON(!aalg_desc);
- if (aalg_desc->uinfo.auth.icv_fullbits/8 !=
- crypto_hash_digestsize(hash)) {
- NETDEBUG(KERN_INFO "ESP: %s digestsize %u != %hu\n",
- x->aalg->alg_name,
- crypto_hash_digestsize(hash),
- aalg_desc->uinfo.auth.icv_fullbits/8);
- goto error;
+ err = -EINVAL;
+ if (aalg_desc->uinfo.auth.icv_fullbits / 8 !=
+ crypto_aead_authsize(aead)) {
+ NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
+ goto free_key;
}
- esp->auth.icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8;
- esp->auth.icv_trunc_len = aalg_desc->uinfo.auth.icv_truncbits/8;
-
- esp->auth.work_icv = kmalloc(esp->auth.icv_full_len, GFP_KERNEL);
- if (!esp->auth.work_icv)
- goto error;
+ err = crypto_aead_setauthsize(
+ aead, x->aalg->alg_trunc_len / 8);
+ if (err) {
+ NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
+ goto free_key;
+ }
}
- esp->conf.key = x->ealg->alg_key;
- esp->conf.key_len = (x->ealg->alg_key_len+7)/8;
- tfm = crypto_alloc_blkcipher(x->ealg->alg_name, 0, CRYPTO_ALG_ASYNC);
- if (IS_ERR(tfm))
- goto error;
- esp->conf.tfm = tfm;
- esp->conf.ivlen = crypto_blkcipher_ivsize(tfm);
- esp->conf.padlen = 0;
- if (esp->conf.ivlen) {
- esp->conf.ivec = kmalloc(esp->conf.ivlen, GFP_KERNEL);
- if (unlikely(esp->conf.ivec == NULL))
- goto error;
- esp->conf.ivinitted = 0;
+
+ param->enckeylen = cpu_to_be32((x->ealg->alg_key_len + 7) / 8);
+ memcpy(p, x->ealg->alg_key, (x->ealg->alg_key_len + 7) / 8);
+
+ err = crypto_aead_setkey(aead, key, keylen);
+
+free_key:
+ kfree_sensitive(key);
+
+error:
+ return err;
+}
+
+static int esp_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack)
+{
+ struct crypto_aead *aead;
+ u32 align;
+ int err;
+
+ x->data = NULL;
+
+ if (x->aead) {
+ err = esp_init_aead(x, extack);
+ } else if (x->ealg) {
+ err = esp_init_authenc(x, extack);
+ } else {
+ NL_SET_ERR_MSG(extack, "ESP: AEAD or CRYPT must be provided");
+ err = -EINVAL;
}
- if (crypto_blkcipher_setkey(tfm, esp->conf.key, esp->conf.key_len))
+
+ if (err)
goto error;
- x->props.header_len = sizeof(struct ip_esp_hdr) + esp->conf.ivlen;
+
+ aead = x->data;
+
+ x->props.header_len = sizeof(struct ip_esp_hdr) +
+ crypto_aead_ivsize(aead);
if (x->props.mode == XFRM_MODE_TUNNEL)
x->props.header_len += sizeof(struct iphdr);
+ else if (x->props.mode == XFRM_MODE_BEET && x->sel.family != AF_INET6)
+ x->props.header_len += IPV4_BEET_PHMAXLEN;
if (x->encap) {
struct xfrm_encap_tmpl *encap = x->encap;
switch (encap->encap_type) {
default:
+ NL_SET_ERR_MSG(extack, "Unsupported encapsulation type for ESP");
+ err = -EINVAL;
goto error;
case UDP_ENCAP_ESPINUDP:
x->props.header_len += sizeof(struct udphdr);
break;
- case UDP_ENCAP_ESPINUDP_NON_IKE:
- x->props.header_len += sizeof(struct udphdr) + 2 * sizeof(u32);
+#ifdef CONFIG_INET_ESPINTCP
+ case TCP_ENCAP_ESPINTCP:
+ /* only the length field, TCP encap is done by
+ * the socket
+ */
+ x->props.header_len += 2;
break;
+#endif
}
}
- x->data = esp;
- x->props.trailer_len = esp4_get_max_size(x, 0) - x->props.header_len;
- return 0;
+
+ align = ALIGN(crypto_aead_blocksize(aead), 4);
+ x->props.trailer_len = align + 1 + crypto_aead_authsize(aead);
error:
- x->data = esp;
- esp_destroy(x);
- x->data = NULL;
- return -EINVAL;
+ return err;
+}
+
+static int esp4_rcv_cb(struct sk_buff *skb, int err)
+{
+ return 0;
}
-static struct xfrm_type esp_type =
+static const struct xfrm_type esp_type =
{
- .description = "ESP4",
.owner = THIS_MODULE,
.proto = IPPROTO_ESP,
+ .flags = XFRM_TYPE_REPLAY_PROT,
.init_state = esp_init_state,
.destructor = esp_destroy,
- .get_max_size = esp4_get_max_size,
.input = esp_input,
- .output = esp_output
+ .output = esp_output,
};
-static struct net_protocol esp4_protocol = {
+static struct xfrm4_protocol esp4_protocol = {
.handler = xfrm4_rcv,
+ .input_handler = xfrm_input,
+ .cb_handler = esp4_rcv_cb,
.err_handler = esp4_err,
- .no_policy = 1,
+ .priority = 0,
};
static int __init esp4_init(void)
{
if (xfrm_register_type(&esp_type, AF_INET) < 0) {
- printk(KERN_INFO "ip esp init: can't add xfrm type\n");
+ pr_info("%s: can't add xfrm type\n", __func__);
return -EAGAIN;
}
- if (inet_add_protocol(&esp4_protocol, IPPROTO_ESP) < 0) {
- printk(KERN_INFO "ip esp init: can't add protocol\n");
+ if (xfrm4_protocol_register(&esp4_protocol, IPPROTO_ESP) < 0) {
+ pr_info("%s: can't add protocol\n", __func__);
xfrm_unregister_type(&esp_type, AF_INET);
return -EAGAIN;
}
@@ -461,12 +1197,13 @@ static int __init esp4_init(void)
static void __exit esp4_fini(void)
{
- if (inet_del_protocol(&esp4_protocol, IPPROTO_ESP) < 0)
- printk(KERN_INFO "ip esp close: can't remove protocol\n");
- if (xfrm_unregister_type(&esp_type, AF_INET) < 0)
- printk(KERN_INFO "ip esp close: can't remove xfrm type\n");
+ if (xfrm4_protocol_deregister(&esp4_protocol, IPPROTO_ESP) < 0)
+ pr_info("%s: can't remove protocol\n", __func__);
+ xfrm_unregister_type(&esp_type, AF_INET);
}
module_init(esp4_init);
module_exit(esp4_fini);
+MODULE_DESCRIPTION("IPv4 ESP transformation library");
MODULE_LICENSE("GPL");
+MODULE_ALIAS_XFRM_TYPE(AF_INET, XFRM_PROTO_ESP);
diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c
new file mode 100644
index 000000000000..05828d4cb6cd
--- /dev/null
+++ b/net/ipv4/esp4_offload.c
@@ -0,0 +1,414 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * IPV4 GSO/GRO offload support
+ * Linux INET implementation
+ *
+ * Copyright (C) 2016 secunet Security Networks AG
+ * Author: Steffen Klassert <steffen.klassert@secunet.com>
+ *
+ * ESP GRO support
+ */
+
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <net/protocol.h>
+#include <crypto/aead.h>
+#include <crypto/authenc.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <net/gro.h>
+#include <net/gso.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+#include <net/esp.h>
+#include <linux/scatterlist.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <net/udp.h>
+
+static struct sk_buff *esp4_gro_receive(struct list_head *head,
+ struct sk_buff *skb)
+{
+ int offset = skb_gro_offset(skb);
+ struct xfrm_offload *xo;
+ struct xfrm_state *x;
+ int encap_type = 0;
+ __be32 seq;
+ __be32 spi;
+
+ if (!pskb_pull(skb, offset))
+ return NULL;
+
+ if (xfrm_parse_spi(skb, IPPROTO_ESP, &spi, &seq) != 0)
+ goto out;
+
+ xo = xfrm_offload(skb);
+ if (!xo || !(xo->flags & CRYPTO_DONE)) {
+ struct sec_path *sp = secpath_set(skb);
+
+ if (!sp)
+ goto out;
+
+ if (sp->len == XFRM_MAX_DEPTH)
+ goto out_reset;
+
+ x = xfrm_input_state_lookup(dev_net(skb->dev), skb->mark,
+ (xfrm_address_t *)&ip_hdr(skb)->daddr,
+ spi, IPPROTO_ESP, AF_INET);
+
+ if (unlikely(x && x->dir && x->dir != XFRM_SA_DIR_IN)) {
+ /* non-offload path will record the error and audit log */
+ xfrm_state_put(x);
+ x = NULL;
+ }
+
+ if (!x)
+ goto out_reset;
+
+ skb->mark = xfrm_smark_get(skb->mark, x);
+
+ sp->xvec[sp->len++] = x;
+ sp->olen++;
+
+ xo = xfrm_offload(skb);
+ if (!xo)
+ goto out_reset;
+ }
+
+ xo->flags |= XFRM_GRO;
+
+ if (NAPI_GRO_CB(skb)->proto == IPPROTO_UDP)
+ encap_type = UDP_ENCAP_ESPINUDP;
+
+ XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL;
+ XFRM_SPI_SKB_CB(skb)->family = AF_INET;
+ XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr);
+ XFRM_SPI_SKB_CB(skb)->seq = seq;
+
+ /* We don't need to handle errors from xfrm_input, it does all
+ * the error handling and frees the resources on error. */
+ xfrm_input(skb, IPPROTO_ESP, spi, encap_type);
+
+ return ERR_PTR(-EINPROGRESS);
+out_reset:
+ secpath_reset(skb);
+out:
+ skb_push(skb, offset);
+ NAPI_GRO_CB(skb)->same_flow = 0;
+ NAPI_GRO_CB(skb)->flush = 1;
+
+ return NULL;
+}
+
+static void esp4_gso_encap(struct xfrm_state *x, struct sk_buff *skb)
+{
+ struct ip_esp_hdr *esph;
+ struct iphdr *iph = ip_hdr(skb);
+ struct xfrm_offload *xo = xfrm_offload(skb);
+ int proto = iph->protocol;
+
+ skb_push(skb, -skb_network_offset(skb));
+ esph = ip_esp_hdr(skb);
+ *skb_mac_header(skb) = IPPROTO_ESP;
+
+ esph->spi = x->id.spi;
+ esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
+
+ xo->proto = proto;
+}
+
+static struct sk_buff *xfrm4_tunnel_gso_segment(struct xfrm_state *x,
+ struct sk_buff *skb,
+ netdev_features_t features)
+{
+ const struct xfrm_mode *inner_mode = xfrm_ip2inner_mode(x,
+ XFRM_MODE_SKB_CB(skb)->protocol);
+ __be16 type = inner_mode->family == AF_INET6 ? htons(ETH_P_IPV6)
+ : htons(ETH_P_IP);
+
+ return skb_eth_gso_segment(skb, features, type);
+}
+
+static struct sk_buff *xfrm4_transport_gso_segment(struct xfrm_state *x,
+ struct sk_buff *skb,
+ netdev_features_t features)
+{
+ const struct net_offload *ops;
+ struct sk_buff *segs = ERR_PTR(-EINVAL);
+ struct xfrm_offload *xo = xfrm_offload(skb);
+
+ skb->transport_header += x->props.header_len;
+ ops = rcu_dereference(inet_offloads[xo->proto]);
+ if (likely(ops && ops->callbacks.gso_segment))
+ segs = ops->callbacks.gso_segment(skb, features);
+
+ return segs;
+}
+
+static struct sk_buff *xfrm4_beet_gso_segment(struct xfrm_state *x,
+ struct sk_buff *skb,
+ netdev_features_t features)
+{
+ struct xfrm_offload *xo = xfrm_offload(skb);
+ struct sk_buff *segs = ERR_PTR(-EINVAL);
+ const struct net_offload *ops;
+ u8 proto = xo->proto;
+
+ skb->transport_header += x->props.header_len;
+
+ if (x->sel.family != AF_INET6) {
+ if (proto == IPPROTO_BEETPH) {
+ struct ip_beet_phdr *ph =
+ (struct ip_beet_phdr *)skb->data;
+
+ skb->transport_header += ph->hdrlen * 8;
+ proto = ph->nexthdr;
+ } else {
+ skb->transport_header -= IPV4_BEET_PHMAXLEN;
+ }
+ } else {
+ __be16 frag;
+
+ skb->transport_header +=
+ ipv6_skip_exthdr(skb, 0, &proto, &frag);
+ if (proto == IPPROTO_TCP)
+ skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4;
+ }
+
+ if (proto == IPPROTO_IPV6)
+ skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP4;
+
+ __skb_pull(skb, skb_transport_offset(skb));
+ ops = rcu_dereference(inet_offloads[proto]);
+ if (likely(ops && ops->callbacks.gso_segment))
+ segs = ops->callbacks.gso_segment(skb, features);
+
+ return segs;
+}
+
+static struct sk_buff *xfrm4_outer_mode_gso_segment(struct xfrm_state *x,
+ struct sk_buff *skb,
+ netdev_features_t features)
+{
+ switch (x->outer_mode.encap) {
+ case XFRM_MODE_TUNNEL:
+ return xfrm4_tunnel_gso_segment(x, skb, features);
+ case XFRM_MODE_TRANSPORT:
+ return xfrm4_transport_gso_segment(x, skb, features);
+ case XFRM_MODE_BEET:
+ return xfrm4_beet_gso_segment(x, skb, features);
+ }
+
+ return ERR_PTR(-EOPNOTSUPP);
+}
+
+static struct sk_buff *esp4_gso_segment(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ struct xfrm_state *x;
+ struct ip_esp_hdr *esph;
+ struct crypto_aead *aead;
+ netdev_features_t esp_features = features;
+ struct xfrm_offload *xo = xfrm_offload(skb);
+ struct sec_path *sp;
+
+ if (!xo)
+ return ERR_PTR(-EINVAL);
+
+ if (!(skb_shinfo(skb)->gso_type & SKB_GSO_ESP))
+ return ERR_PTR(-EINVAL);
+
+ sp = skb_sec_path(skb);
+ x = sp->xvec[sp->len - 1];
+ aead = x->data;
+ esph = ip_esp_hdr(skb);
+
+ if (esph->spi != x->id.spi)
+ return ERR_PTR(-EINVAL);
+
+ if (!pskb_may_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead)))
+ return ERR_PTR(-EINVAL);
+
+ __skb_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead));
+
+ skb->encap_hdr_csum = 1;
+
+ if ((!(skb->dev->gso_partial_features & NETIF_F_HW_ESP) &&
+ !(features & NETIF_F_HW_ESP)) || x->xso.dev != skb->dev)
+ esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK |
+ NETIF_F_SCTP_CRC);
+ else if (!(features & NETIF_F_HW_ESP_TX_CSUM) &&
+ !(skb->dev->gso_partial_features & NETIF_F_HW_ESP_TX_CSUM))
+ esp_features = features & ~(NETIF_F_CSUM_MASK |
+ NETIF_F_SCTP_CRC);
+
+ xo->flags |= XFRM_GSO_SEGMENT;
+
+ return xfrm4_outer_mode_gso_segment(x, skb, esp_features);
+}
+
+static int esp_input_tail(struct xfrm_state *x, struct sk_buff *skb)
+{
+ struct crypto_aead *aead = x->data;
+ struct xfrm_offload *xo = xfrm_offload(skb);
+
+ if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead)))
+ return -EINVAL;
+
+ if (!(xo->flags & CRYPTO_DONE))
+ skb->ip_summed = CHECKSUM_NONE;
+
+ return esp_input_done2(skb, 0);
+}
+
+static int esp_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features)
+{
+ int err;
+ int alen;
+ int blksize;
+ struct xfrm_offload *xo;
+ struct ip_esp_hdr *esph;
+ struct crypto_aead *aead;
+ struct esp_info esp;
+ bool hw_offload = true;
+ __u32 seq;
+ int encap_type = 0;
+
+ esp.inplace = true;
+
+ xo = xfrm_offload(skb);
+
+ if (!xo)
+ return -EINVAL;
+
+ if ((!(features & NETIF_F_HW_ESP) &&
+ !(skb->dev->gso_partial_features & NETIF_F_HW_ESP)) ||
+ x->xso.dev != skb->dev) {
+ xo->flags |= CRYPTO_FALLBACK;
+ hw_offload = false;
+ }
+
+ esp.proto = xo->proto;
+
+ /* skb is pure payload to encrypt */
+
+ aead = x->data;
+ alen = crypto_aead_authsize(aead);
+
+ esp.tfclen = 0;
+ /* XXX: Add support for tfc padding here. */
+
+ blksize = ALIGN(crypto_aead_blocksize(aead), 4);
+ esp.clen = ALIGN(skb->len + 2 + esp.tfclen, blksize);
+ esp.plen = esp.clen - skb->len - esp.tfclen;
+ esp.tailen = esp.tfclen + esp.plen + alen;
+
+ esp.esph = ip_esp_hdr(skb);
+
+ if (x->encap)
+ encap_type = x->encap->encap_type;
+
+ if (!hw_offload || !skb_is_gso(skb) || (hw_offload && encap_type == UDP_ENCAP_ESPINUDP)) {
+ esp.nfrags = esp_output_head(x, skb, &esp);
+ if (esp.nfrags < 0)
+ return esp.nfrags;
+ }
+
+ seq = xo->seq.low;
+
+ esph = esp.esph;
+ esph->spi = x->id.spi;
+
+ skb_push(skb, -skb_network_offset(skb));
+
+ if (xo->flags & XFRM_GSO_SEGMENT) {
+ esph->seq_no = htonl(seq);
+
+ if (!skb_is_gso(skb))
+ xo->seq.low++;
+ else
+ xo->seq.low += skb_shinfo(skb)->gso_segs;
+ }
+
+ if (xo->seq.low < seq)
+ xo->seq.hi++;
+
+ esp.seqno = cpu_to_be64(seq + ((u64)xo->seq.hi << 32));
+
+ if (hw_offload && encap_type == UDP_ENCAP_ESPINUDP) {
+ /* In the XFRM stack, the encapsulation protocol is set to iphdr->protocol by
+ * setting *skb_mac_header(skb) (see esp_output_udp_encap()) where skb->mac_header
+ * points to iphdr->protocol (see xfrm4_tunnel_encap_add()).
+ * However, in esp_xmit(), skb->mac_header doesn't point to iphdr->protocol.
+ * Therefore, the protocol field needs to be corrected.
+ */
+ ip_hdr(skb)->protocol = IPPROTO_UDP;
+
+ esph->seq_no = htonl(seq);
+ }
+
+ ip_hdr(skb)->tot_len = htons(skb->len);
+ ip_send_check(ip_hdr(skb));
+
+ if (hw_offload) {
+ if (!skb_ext_add(skb, SKB_EXT_SEC_PATH))
+ return -ENOMEM;
+
+ xo = xfrm_offload(skb);
+ if (!xo)
+ return -EINVAL;
+
+ xo->flags |= XFRM_XMIT;
+ return 0;
+ }
+
+ err = esp_output_tail(x, skb, &esp);
+ if (err)
+ return err;
+
+ secpath_reset(skb);
+
+ if (skb_needs_linearize(skb, skb->dev->features) &&
+ __skb_linearize(skb))
+ return -ENOMEM;
+ return 0;
+}
+
+static const struct net_offload esp4_offload = {
+ .callbacks = {
+ .gro_receive = esp4_gro_receive,
+ .gso_segment = esp4_gso_segment,
+ },
+};
+
+static const struct xfrm_type_offload esp_type_offload = {
+ .owner = THIS_MODULE,
+ .proto = IPPROTO_ESP,
+ .input_tail = esp_input_tail,
+ .xmit = esp_xmit,
+ .encap = esp4_gso_encap,
+};
+
+static int __init esp4_offload_init(void)
+{
+ if (xfrm_register_type_offload(&esp_type_offload, AF_INET) < 0) {
+ pr_info("%s: can't add xfrm type offload\n", __func__);
+ return -EAGAIN;
+ }
+
+ return inet_add_offload(&esp4_offload, IPPROTO_ESP);
+}
+
+static void __exit esp4_offload_exit(void)
+{
+ xfrm_unregister_type_offload(&esp_type_offload, AF_INET);
+ inet_del_offload(&esp4_offload, IPPROTO_ESP);
+}
+
+module_init(esp4_offload_init);
+module_exit(esp4_offload_exit);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Steffen Klassert <steffen.klassert@secunet.com>");
+MODULE_ALIAS_XFRM_OFFLOAD_TYPE(AF_INET, XFRM_PROTO_ESP);
+MODULE_DESCRIPTION("IPV4 GSO/GRO offload support");
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index d47b72af89ed..1dab44e13d3b 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -5,24 +6,15 @@
*
* IPv4 Forwarding Information Base: FIB frontend.
*
- * Version: $Id: fib_frontend.c,v 1.26 2001/10/31 21:55:54 davem Exp $
- *
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/module.h>
-#include <asm/uaccess.h>
-#include <asm/system.h>
+#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/capability.h>
#include <linux/types.h>
#include <linux/kernel.h>
-#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
@@ -35,223 +27,438 @@
#include <linux/if_addr.h>
#include <linux/if_arp.h>
#include <linux/skbuff.h>
-#include <linux/netlink.h>
+#include <linux/cache.h>
#include <linux/init.h>
#include <linux/list.h>
+#include <linux/slab.h>
+#include <net/flow.h>
+#include <net/inet_dscp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/sock.h>
-#include <net/icmp.h>
#include <net/arp.h>
#include <net/ip_fib.h>
-
-#define FFprint(a...) printk(KERN_DEBUG a)
+#include <net/nexthop.h>
+#include <net/rtnetlink.h>
+#include <net/xfrm.h>
+#include <net/l3mdev.h>
+#include <net/lwtunnel.h>
+#include <trace/events/fib.h>
#ifndef CONFIG_IP_MULTIPLE_TABLES
-struct fib_table *ip_fib_local_table;
-struct fib_table *ip_fib_main_table;
+static int __net_init fib4_rules_init(struct net *net)
+{
+ struct fib_table *local_table, *main_table;
-#define FIB_TABLE_HASHSZ 1
-static struct hlist_head fib_table_hash[FIB_TABLE_HASHSZ];
+ main_table = fib_trie_table(RT_TABLE_MAIN, NULL);
+ if (!main_table)
+ return -ENOMEM;
-#else
+ local_table = fib_trie_table(RT_TABLE_LOCAL, main_table);
+ if (!local_table)
+ goto fail;
+
+ hlist_add_head_rcu(&local_table->tb_hlist,
+ &net->ipv4.fib_table_hash[TABLE_LOCAL_INDEX]);
+ hlist_add_head_rcu(&main_table->tb_hlist,
+ &net->ipv4.fib_table_hash[TABLE_MAIN_INDEX]);
+ return 0;
-#define FIB_TABLE_HASHSZ 256
-static struct hlist_head fib_table_hash[FIB_TABLE_HASHSZ];
+fail:
+ fib_free_table(main_table);
+ return -ENOMEM;
+}
+#else
-struct fib_table *fib_new_table(u32 id)
+struct fib_table *fib_new_table(struct net *net, u32 id)
{
- struct fib_table *tb;
+ struct fib_table *tb, *alias = NULL;
unsigned int h;
if (id == 0)
id = RT_TABLE_MAIN;
- tb = fib_get_table(id);
+ tb = fib_get_table(net, id);
if (tb)
return tb;
- tb = fib_hash_init(id);
+
+ if (id == RT_TABLE_LOCAL && !net->ipv4.fib_has_custom_rules)
+ alias = fib_new_table(net, RT_TABLE_MAIN);
+
+ tb = fib_trie_table(id, alias);
if (!tb)
return NULL;
+
+ switch (id) {
+ case RT_TABLE_MAIN:
+ rcu_assign_pointer(net->ipv4.fib_main, tb);
+ break;
+ case RT_TABLE_DEFAULT:
+ rcu_assign_pointer(net->ipv4.fib_default, tb);
+ break;
+ default:
+ break;
+ }
+
h = id & (FIB_TABLE_HASHSZ - 1);
- hlist_add_head_rcu(&tb->tb_hlist, &fib_table_hash[h]);
+ hlist_add_head_rcu(&tb->tb_hlist, &net->ipv4.fib_table_hash[h]);
return tb;
}
+EXPORT_SYMBOL_GPL(fib_new_table);
-struct fib_table *fib_get_table(u32 id)
+/* caller must hold either rtnl or rcu read lock */
+struct fib_table *fib_get_table(struct net *net, u32 id)
{
struct fib_table *tb;
- struct hlist_node *node;
+ struct hlist_head *head;
unsigned int h;
if (id == 0)
id = RT_TABLE_MAIN;
h = id & (FIB_TABLE_HASHSZ - 1);
- rcu_read_lock();
- hlist_for_each_entry_rcu(tb, node, &fib_table_hash[h], tb_hlist) {
- if (tb->tb_id == id) {
- rcu_read_unlock();
+
+ head = &net->ipv4.fib_table_hash[h];
+ hlist_for_each_entry_rcu(tb, head, tb_hlist,
+ lockdep_rtnl_is_held()) {
+ if (tb->tb_id == id)
return tb;
- }
}
- rcu_read_unlock();
return NULL;
}
#endif /* CONFIG_IP_MULTIPLE_TABLES */
-static void fib_flush(void)
+static void fib_replace_table(struct net *net, struct fib_table *old,
+ struct fib_table *new)
{
- int flushed = 0;
- struct fib_table *tb;
- struct hlist_node *node;
- unsigned int h;
-
- for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
- hlist_for_each_entry(tb, node, &fib_table_hash[h], tb_hlist)
- flushed += tb->tb_flush(tb);
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+ switch (new->tb_id) {
+ case RT_TABLE_MAIN:
+ rcu_assign_pointer(net->ipv4.fib_main, new);
+ break;
+ case RT_TABLE_DEFAULT:
+ rcu_assign_pointer(net->ipv4.fib_default, new);
+ break;
+ default:
+ break;
}
- if (flushed)
- rt_cache_flush(-1);
+#endif
+ /* replace the old table in the hlist */
+ hlist_replace_rcu(&old->tb_hlist, &new->tb_hlist);
}
-/*
- * Find the first device with a given source address.
- */
+int fib_unmerge(struct net *net)
+{
+ struct fib_table *old, *new, *main_table;
+
+ /* attempt to fetch local table if it has been allocated */
+ old = fib_get_table(net, RT_TABLE_LOCAL);
+ if (!old)
+ return 0;
+
+ new = fib_trie_unmerge(old);
+ if (!new)
+ return -ENOMEM;
+
+ /* table is already unmerged */
+ if (new == old)
+ return 0;
+
+ /* replace merged table with clean table */
+ fib_replace_table(net, old, new);
+ fib_free_table(old);
+
+ /* attempt to fetch main table if it has been allocated */
+ main_table = fib_get_table(net, RT_TABLE_MAIN);
+ if (!main_table)
+ return 0;
+
+ /* flush local entries from main table */
+ fib_table_flush_external(main_table);
-struct net_device * ip_dev_find(__be32 addr)
+ return 0;
+}
+
+void fib_flush(struct net *net)
{
- struct flowi fl = { .nl_u = { .ip4_u = { .daddr = addr } } };
- struct fib_result res;
- struct net_device *dev = NULL;
+ int flushed = 0;
+ unsigned int h;
-#ifdef CONFIG_IP_MULTIPLE_TABLES
- res.r = NULL;
-#endif
+ for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
+ struct hlist_head *head = &net->ipv4.fib_table_hash[h];
+ struct hlist_node *tmp;
+ struct fib_table *tb;
- if (!ip_fib_local_table ||
- ip_fib_local_table->tb_lookup(ip_fib_local_table, &fl, &res))
- return NULL;
- if (res.type != RTN_LOCAL)
- goto out;
- dev = FIB_RES_DEV(res);
+ hlist_for_each_entry_safe(tb, tmp, head, tb_hlist)
+ flushed += fib_table_flush(net, tb, false);
+ }
- if (dev)
- dev_hold(dev);
-out:
- fib_res_put(&res);
- return dev;
+ if (flushed)
+ rt_cache_flush(net);
}
-unsigned inet_addr_type(__be32 addr)
+/*
+ * Find address type as if only "dev" was present in the system. If
+ * on_dev is NULL then all interfaces are taken into consideration.
+ */
+static inline unsigned int __inet_dev_addr_type(struct net *net,
+ const struct net_device *dev,
+ __be32 addr, u32 tb_id)
{
- struct flowi fl = { .nl_u = { .ip4_u = { .daddr = addr } } };
+ struct flowi4 fl4 = { .daddr = addr };
struct fib_result res;
- unsigned ret = RTN_BROADCAST;
+ unsigned int ret = RTN_BROADCAST;
+ struct fib_table *table;
- if (ZERONET(addr) || BADCLASS(addr))
+ if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr))
return RTN_BROADCAST;
- if (MULTICAST(addr))
+ if (ipv4_is_multicast(addr))
return RTN_MULTICAST;
-#ifdef CONFIG_IP_MULTIPLE_TABLES
- res.r = NULL;
-#endif
-
- if (ip_fib_local_table) {
+ rcu_read_lock();
+
+ table = fib_get_table(net, tb_id);
+ if (table) {
ret = RTN_UNICAST;
- if (!ip_fib_local_table->tb_lookup(ip_fib_local_table,
- &fl, &res)) {
- ret = res.type;
- fib_res_put(&res);
+ if (!fib_table_lookup(table, &fl4, &res, FIB_LOOKUP_NOREF)) {
+ struct fib_nh_common *nhc = fib_info_nhc(res.fi, 0);
+
+ if (!dev || dev == nhc->nhc_dev)
+ ret = res.type;
}
}
+
+ rcu_read_unlock();
return ret;
}
-/* Given (packet source, input interface) and optional (dst, oif, tos):
- - (main) check, that source is valid i.e. not broadcast or our local
- address.
- - figure out what "logical" interface this packet arrived
- and calculate "specific destination" address.
- - check, that packet arrived from expected physical interface.
+unsigned int inet_addr_type_table(struct net *net, __be32 addr, u32 tb_id)
+{
+ return __inet_dev_addr_type(net, NULL, addr, tb_id);
+}
+EXPORT_SYMBOL(inet_addr_type_table);
+
+unsigned int inet_addr_type(struct net *net, __be32 addr)
+{
+ return __inet_dev_addr_type(net, NULL, addr, RT_TABLE_LOCAL);
+}
+EXPORT_SYMBOL(inet_addr_type);
+
+unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
+ __be32 addr)
+{
+ u32 rt_table = l3mdev_fib_table(dev) ? : RT_TABLE_LOCAL;
+
+ return __inet_dev_addr_type(net, dev, addr, rt_table);
+}
+EXPORT_SYMBOL(inet_dev_addr_type);
+
+/* inet_addr_type with dev == NULL but using the table from a dev
+ * if one is associated
*/
+unsigned int inet_addr_type_dev_table(struct net *net,
+ const struct net_device *dev,
+ __be32 addr)
+{
+ u32 rt_table = l3mdev_fib_table(dev) ? : RT_TABLE_LOCAL;
-int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
- struct net_device *dev, __be32 *spec_dst, u32 *itag)
+ return __inet_dev_addr_type(net, NULL, addr, rt_table);
+}
+EXPORT_SYMBOL(inet_addr_type_dev_table);
+
+__be32 fib_compute_spec_dst(struct sk_buff *skb)
{
+ struct net_device *dev = skb->dev;
struct in_device *in_dev;
- struct flowi fl = { .nl_u = { .ip4_u =
- { .daddr = src,
- .saddr = dst,
- .tos = tos } },
- .iif = oif };
struct fib_result res;
- int no_addr, rpf;
- int ret;
+ struct rtable *rt;
+ struct net *net;
+ int scope;
+
+ rt = skb_rtable(skb);
+ if ((rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL)) ==
+ RTCF_LOCAL)
+ return ip_hdr(skb)->daddr;
- no_addr = rpf = 0;
- rcu_read_lock();
in_dev = __in_dev_get_rcu(dev);
- if (in_dev) {
- no_addr = in_dev->ifa_list == NULL;
- rpf = IN_DEV_RPFILTER(in_dev);
+
+ net = dev_net(dev);
+
+ scope = RT_SCOPE_UNIVERSE;
+ if (!ipv4_is_zeronet(ip_hdr(skb)->saddr)) {
+ bool vmark = in_dev && IN_DEV_SRC_VMARK(in_dev);
+ struct flowi4 fl4 = {
+ .flowi4_iif = LOOPBACK_IFINDEX,
+ .flowi4_l3mdev = l3mdev_master_ifindex_rcu(dev),
+ .daddr = ip_hdr(skb)->saddr,
+ .flowi4_dscp = ip4h_dscp(ip_hdr(skb)),
+ .flowi4_scope = scope,
+ .flowi4_mark = vmark ? skb->mark : 0,
+ };
+ if (!fib_lookup(net, &fl4, &res, 0))
+ return fib_result_prefsrc(net, &res);
+ } else {
+ scope = RT_SCOPE_LINK;
}
- rcu_read_unlock();
- if (in_dev == NULL)
- goto e_inval;
+ return inet_select_addr(dev, ip_hdr(skb)->saddr, scope);
+}
- if (fib_lookup(&fl, &res))
- goto last_resort;
- if (res.type != RTN_UNICAST)
- goto e_inval_res;
- *spec_dst = FIB_RES_PREFSRC(res);
- fib_combine_itag(itag, &res);
+bool fib_info_nh_uses_dev(struct fib_info *fi, const struct net_device *dev)
+{
+ bool dev_match = false;
#ifdef CONFIG_IP_ROUTE_MULTIPATH
- if (FIB_RES_DEV(res) == dev || res.fi->fib_nhs > 1)
+ if (unlikely(fi->nh)) {
+ dev_match = nexthop_uses_dev(fi->nh, dev);
+ } else {
+ int ret;
+
+ for (ret = 0; ret < fib_info_num_path(fi); ret++) {
+ const struct fib_nh_common *nhc = fib_info_nhc(fi, ret);
+
+ if (nhc_l3mdev_matches_dev(nhc, dev)) {
+ dev_match = true;
+ break;
+ }
+ }
+ }
#else
- if (FIB_RES_DEV(res) == dev)
+ if (fib_info_nhc(fi, 0)->nhc_dev == dev)
+ dev_match = true;
#endif
- {
- ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
- fib_res_put(&res);
+
+ return dev_match;
+}
+EXPORT_SYMBOL_GPL(fib_info_nh_uses_dev);
+
+/* Given (packet source, input interface) and optional (dst, oif, tos):
+ * - (main) check, that source is valid i.e. not broadcast or our local
+ * address.
+ * - figure out what "logical" interface this packet arrived
+ * and calculate "specific destination" address.
+ * - check, that packet arrived from expected physical interface.
+ * called with rcu_read_lock()
+ */
+static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
+ dscp_t dscp, int oif, struct net_device *dev,
+ int rpf, struct in_device *idev, u32 *itag)
+{
+ struct net *net = dev_net(dev);
+ enum skb_drop_reason reason;
+ struct flow_keys flkeys;
+ int ret, no_addr;
+ struct fib_result res;
+ struct flowi4 fl4;
+ bool dev_match;
+
+ fl4.flowi4_oif = 0;
+ fl4.flowi4_l3mdev = l3mdev_master_ifindex_rcu(dev);
+ fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX;
+ fl4.daddr = src;
+ fl4.saddr = dst;
+ fl4.flowi4_dscp = dscp;
+ fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
+ fl4.flowi4_tun_key.tun_id = 0;
+ fl4.flowi4_flags = 0;
+ fl4.flowi4_uid = sock_net_uid(net, NULL);
+ fl4.flowi4_multipath_hash = 0;
+
+ no_addr = idev->ifa_list == NULL;
+
+ fl4.flowi4_mark = IN_DEV_SRC_VMARK(idev) ? skb->mark : 0;
+ if (!fib4_rules_early_flow_dissect(net, skb, &fl4, &flkeys)) {
+ fl4.flowi4_proto = 0;
+ fl4.fl4_sport = 0;
+ fl4.fl4_dport = 0;
+ } else {
+ swap(fl4.fl4_sport, fl4.fl4_dport);
+ }
+
+ if (fib_lookup(net, &fl4, &res, 0))
+ goto last_resort;
+ if (res.type != RTN_UNICAST) {
+ if (res.type != RTN_LOCAL) {
+ reason = SKB_DROP_REASON_IP_INVALID_SOURCE;
+ goto e_inval;
+ } else if (!IN_DEV_ACCEPT_LOCAL(idev)) {
+ reason = SKB_DROP_REASON_IP_LOCAL_SOURCE;
+ goto e_inval;
+ }
+ }
+ fib_combine_itag(itag, &res);
+
+ dev_match = fib_info_nh_uses_dev(res.fi, dev);
+ /* This is not common, loopback packets retain skb_dst so normally they
+ * would not even hit this slow path.
+ */
+ dev_match = dev_match || (res.type == RTN_LOCAL &&
+ dev == net->loopback_dev);
+ if (dev_match) {
+ ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_HOST;
return ret;
}
- fib_res_put(&res);
if (no_addr)
goto last_resort;
- if (rpf)
- goto e_inval;
- fl.oif = dev->ifindex;
+ if (rpf == 1)
+ goto e_rpf;
+ fl4.flowi4_oif = dev->ifindex;
ret = 0;
- if (fib_lookup(&fl, &res) == 0) {
- if (res.type == RTN_UNICAST) {
- *spec_dst = FIB_RES_PREFSRC(res);
- ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
- }
- fib_res_put(&res);
+ if (fib_lookup(net, &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE) == 0) {
+ if (res.type == RTN_UNICAST)
+ ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_HOST;
}
return ret;
last_resort:
if (rpf)
- goto e_inval;
- *spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
+ goto e_rpf;
*itag = 0;
return 0;
-e_inval_res:
- fib_res_put(&res);
e_inval:
- return -EINVAL;
+ return -reason;
+e_rpf:
+ return -SKB_DROP_REASON_IP_RPFILTER;
}
-#ifndef CONFIG_IP_NOSIOCRT
+/* Ignore rp_filter for packets protected by IPsec. */
+int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
+ dscp_t dscp, int oif, struct net_device *dev,
+ struct in_device *idev, u32 *itag)
+{
+ int r = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev);
+ struct net *net = dev_net(dev);
+
+ if (!r && !fib_num_tclassid_users(net) &&
+ (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) {
+ if (IN_DEV_ACCEPT_LOCAL(idev))
+ goto ok;
+ /* with custom local routes in place, checking local addresses
+ * only will be too optimistic, with custom rules, checking
+ * local addresses only can be too strict, e.g. due to vrf
+ */
+ if (net->ipv4.fib_has_custom_local_routes ||
+ fib4_has_custom_rules(net))
+ goto full_check;
+ /* Within the same container, it is regarded as a martian source,
+ * and the same host but different containers are not.
+ */
+ if (inet_lookup_ifaddr_rcu(net, src))
+ return -SKB_DROP_REASON_IP_LOCAL_SOURCE;
+
+ok:
+ *itag = 0;
+ return 0;
+ }
+
+full_check:
+ return __fib_validate_source(skb, src, dst, dscp, oif, dev, r, idev,
+ itag);
+}
static inline __be32 sk_extract_addr(struct sockaddr *addr)
{
@@ -270,13 +477,14 @@ static int put_rtax(struct nlattr *mx, int len, int type, u32 value)
return len + nla_total_size(4);
}
-static int rtentry_to_fib_config(int cmd, struct rtentry *rt,
+static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt,
struct fib_config *cfg)
{
__be32 addr;
int plen;
memset(cfg, 0, sizeof(*cfg));
+ cfg->fc_nlinfo.nl_net = net;
if (rt->rt_dst.sa_family != AF_INET)
return -EAFNOSUPPORT;
@@ -337,20 +545,27 @@ static int rtentry_to_fib_config(int cmd, struct rtentry *rt,
colon = strchr(devname, ':');
if (colon)
*colon = 0;
- dev = __dev_get_by_name(devname);
+ dev = __dev_get_by_name(net, devname);
if (!dev)
return -ENODEV;
cfg->fc_oif = dev->ifindex;
+ cfg->fc_table = l3mdev_fib_table(dev);
if (colon) {
- struct in_ifaddr *ifa;
- struct in_device *in_dev = __in_dev_get_rtnl(dev);
+ const struct in_ifaddr *ifa;
+ struct in_device *in_dev;
+
+ in_dev = __in_dev_get_rtnl_net(dev);
if (!in_dev)
return -ENODEV;
+
*colon = ':';
- for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next)
+
+ in_dev_for_each_ifa_rtnl_net(net, ifa, in_dev) {
if (strcmp(ifa->ifa_label, devname) == 0)
break;
- if (ifa == NULL)
+ }
+
+ if (!ifa)
return -ENODEV;
cfg->fc_prefsrc = ifa->ifa_local;
}
@@ -358,16 +573,23 @@ static int rtentry_to_fib_config(int cmd, struct rtentry *rt,
addr = sk_extract_addr(&rt->rt_gateway);
if (rt->rt_gateway.sa_family == AF_INET && addr) {
- cfg->fc_gw = addr;
+ unsigned int addr_type;
+
+ cfg->fc_gw4 = addr;
+ cfg->fc_gw_family = AF_INET;
+ addr_type = inet_addr_type_table(net, addr, cfg->fc_table);
if (rt->rt_flags & RTF_GATEWAY &&
- inet_addr_type(addr) == RTN_UNICAST)
+ addr_type == RTN_UNICAST)
cfg->fc_scope = RT_SCOPE_UNIVERSE;
}
+ if (!cfg->fc_table)
+ cfg->fc_table = RT_TABLE_MAIN;
+
if (cmd == SIOCDELRT)
return 0;
- if (rt->rt_flags & RTF_GATEWAY && !cfg->fc_gw)
+ if (rt->rt_flags & RTF_GATEWAY && !cfg->fc_gw_family)
return -EINVAL;
if (cfg->fc_scope == RT_SCOPE_NOWHERE)
@@ -377,8 +599,8 @@ static int rtentry_to_fib_config(int cmd, struct rtentry *rt,
struct nlattr *mx;
int len = 0;
- mx = kzalloc(3 * nla_total_size(4), GFP_KERNEL);
- if (mx == NULL)
+ mx = kcalloc(3, nla_total_size(4), GFP_KERNEL);
+ if (!mx)
return -ENOMEM;
if (rt->rt_flags & RTF_MTU)
@@ -398,39 +620,37 @@ static int rtentry_to_fib_config(int cmd, struct rtentry *rt,
}
/*
- * Handle IP routing ioctl calls. These are used to manipulate the routing tables
+ * Handle IP routing ioctl calls.
+ * These are used to manipulate the routing tables
*/
-
-int ip_rt_ioctl(unsigned int cmd, void __user *arg)
+int ip_rt_ioctl(struct net *net, unsigned int cmd, struct rtentry *rt)
{
struct fib_config cfg;
- struct rtentry rt;
int err;
switch (cmd) {
case SIOCADDRT: /* Add a route */
case SIOCDELRT: /* Delete a route */
- if (!capable(CAP_NET_ADMIN))
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
return -EPERM;
- if (copy_from_user(&rt, arg, sizeof(rt)))
- return -EFAULT;
-
- rtnl_lock();
- err = rtentry_to_fib_config(cmd, &rt, &cfg);
+ rtnl_net_lock(net);
+ err = rtentry_to_fib_config(net, cmd, rt, &cfg);
if (err == 0) {
struct fib_table *tb;
if (cmd == SIOCDELRT) {
- tb = fib_get_table(cfg.fc_table);
+ tb = fib_get_table(net, cfg.fc_table);
if (tb)
- err = tb->tb_delete(tb, &cfg);
+ err = fib_table_delete(net, tb, &cfg,
+ NULL);
else
err = -ESRCH;
} else {
- tb = fib_new_table(cfg.fc_table);
+ tb = fib_new_table(net, cfg.fc_table);
if (tb)
- err = tb->tb_insert(tb, &cfg);
+ err = fib_table_insert(net, tb,
+ &cfg, NULL);
else
err = -ENOBUFS;
}
@@ -438,22 +658,14 @@ int ip_rt_ioctl(unsigned int cmd, void __user *arg)
/* allocated by rtentry_to_fib_config() */
kfree(cfg.fc_mx);
}
- rtnl_unlock();
+ rtnl_net_unlock(net);
return err;
}
return -EINVAL;
}
-#else
-
-int ip_rt_ioctl(unsigned int cmd, void *arg)
-{
- return -EINVAL;
-}
-
-#endif
-
-struct nla_policy rtm_ipv4_policy[RTA_MAX+1] __read_mostly = {
+const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
+ [RTA_UNSPEC] = { .strict_start_type = RTA_DPORT + 1 },
[RTA_DST] = { .type = NLA_U32 },
[RTA_SRC] = { .type = NLA_U32 },
[RTA_IIF] = { .type = NLA_U32 },
@@ -463,27 +675,89 @@ struct nla_policy rtm_ipv4_policy[RTA_MAX+1] __read_mostly = {
[RTA_PREFSRC] = { .type = NLA_U32 },
[RTA_METRICS] = { .type = NLA_NESTED },
[RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
- [RTA_PROTOINFO] = { .type = NLA_U32 },
[RTA_FLOW] = { .type = NLA_U32 },
- [RTA_MP_ALGO] = { .type = NLA_U32 },
+ [RTA_ENCAP_TYPE] = { .type = NLA_U16 },
+ [RTA_ENCAP] = { .type = NLA_NESTED },
+ [RTA_UID] = { .type = NLA_U32 },
+ [RTA_MARK] = { .type = NLA_U32 },
+ [RTA_TABLE] = { .type = NLA_U32 },
+ [RTA_IP_PROTO] = { .type = NLA_U8 },
+ [RTA_SPORT] = { .type = NLA_U16 },
+ [RTA_DPORT] = { .type = NLA_U16 },
+ [RTA_NH_ID] = { .type = NLA_U32 },
};
-static int rtm_to_fib_config(struct sk_buff *skb, struct nlmsghdr *nlh,
- struct fib_config *cfg)
+int fib_gw_from_via(struct fib_config *cfg, struct nlattr *nla,
+ struct netlink_ext_ack *extack)
+{
+ struct rtvia *via;
+ int alen;
+
+ if (nla_len(nla) < offsetof(struct rtvia, rtvia_addr)) {
+ NL_SET_ERR_MSG(extack, "Invalid attribute length for RTA_VIA");
+ return -EINVAL;
+ }
+
+ via = nla_data(nla);
+ alen = nla_len(nla) - offsetof(struct rtvia, rtvia_addr);
+
+ switch (via->rtvia_family) {
+ case AF_INET:
+ if (alen != sizeof(__be32)) {
+ NL_SET_ERR_MSG(extack, "Invalid IPv4 address in RTA_VIA");
+ return -EINVAL;
+ }
+ cfg->fc_gw_family = AF_INET;
+ cfg->fc_gw4 = *((__be32 *)via->rtvia_addr);
+ break;
+ case AF_INET6:
+#if IS_ENABLED(CONFIG_IPV6)
+ if (alen != sizeof(struct in6_addr)) {
+ NL_SET_ERR_MSG(extack, "Invalid IPv6 address in RTA_VIA");
+ return -EINVAL;
+ }
+ cfg->fc_gw_family = AF_INET6;
+ cfg->fc_gw6 = *((struct in6_addr *)via->rtvia_addr);
+#else
+ NL_SET_ERR_MSG(extack, "IPv6 support not enabled in kernel");
+ return -EINVAL;
+#endif
+ break;
+ default:
+ NL_SET_ERR_MSG(extack, "Unsupported address family in RTA_VIA");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
+ struct nlmsghdr *nlh, struct fib_config *cfg,
+ struct netlink_ext_ack *extack)
{
+ bool has_gw = false, has_via = false;
struct nlattr *attr;
int err, remaining;
struct rtmsg *rtm;
- err = nlmsg_validate(nlh, sizeof(*rtm), RTA_MAX, rtm_ipv4_policy);
+ err = nlmsg_validate_deprecated(nlh, sizeof(*rtm), RTA_MAX,
+ rtm_ipv4_policy, extack);
if (err < 0)
goto errout;
memset(cfg, 0, sizeof(*cfg));
rtm = nlmsg_data(nlh);
+
+ if (!inet_validate_dscp(rtm->rtm_tos)) {
+ NL_SET_ERR_MSG(extack,
+ "Invalid dsfield (tos): ECN bits must be 0");
+ err = -EINVAL;
+ goto errout;
+ }
+ cfg->fc_dscp = inet_dsfield_to_dscp(rtm->rtm_tos);
+
cfg->fc_dst_len = rtm->rtm_dst_len;
- cfg->fc_tos = rtm->rtm_tos;
cfg->fc_table = rtm->rtm_table;
cfg->fc_protocol = rtm->rtm_protocol;
cfg->fc_scope = rtm->rtm_scope;
@@ -491,11 +765,18 @@ static int rtm_to_fib_config(struct sk_buff *skb, struct nlmsghdr *nlh,
cfg->fc_flags = rtm->rtm_flags;
cfg->fc_nlflags = nlh->nlmsg_flags;
- cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
+ cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
cfg->fc_nlinfo.nlh = nlh;
+ cfg->fc_nlinfo.nl_net = net;
+
+ if (cfg->fc_type > RTN_MAX) {
+ NL_SET_ERR_MSG(extack, "Invalid route type");
+ err = -EINVAL;
+ goto errout;
+ }
nlmsg_for_each_attr(attr, nlh, sizeof(struct rtmsg), remaining) {
- switch (attr->nla_type) {
+ switch (nla_type(attr)) {
case RTA_DST:
cfg->fc_dst = nla_get_be32(attr);
break;
@@ -503,7 +784,16 @@ static int rtm_to_fib_config(struct sk_buff *skb, struct nlmsghdr *nlh,
cfg->fc_oif = nla_get_u32(attr);
break;
case RTA_GATEWAY:
- cfg->fc_gw = nla_get_be32(attr);
+ has_gw = true;
+ cfg->fc_gw4 = nla_get_be32(attr);
+ if (cfg->fc_gw4)
+ cfg->fc_gw_family = AF_INET;
+ break;
+ case RTA_VIA:
+ has_via = true;
+ err = fib_gw_from_via(cfg, attr, extack);
+ if (err)
+ goto errout;
break;
case RTA_PRIORITY:
cfg->fc_priority = nla_get_u32(attr);
@@ -516,92 +806,274 @@ static int rtm_to_fib_config(struct sk_buff *skb, struct nlmsghdr *nlh,
cfg->fc_mx_len = nla_len(attr);
break;
case RTA_MULTIPATH:
+ err = lwtunnel_valid_encap_type_attr(nla_data(attr),
+ nla_len(attr),
+ extack);
+ if (err < 0)
+ goto errout;
cfg->fc_mp = nla_data(attr);
cfg->fc_mp_len = nla_len(attr);
break;
case RTA_FLOW:
cfg->fc_flow = nla_get_u32(attr);
break;
- case RTA_MP_ALGO:
- cfg->fc_mp_alg = nla_get_u32(attr);
- break;
case RTA_TABLE:
cfg->fc_table = nla_get_u32(attr);
break;
+ case RTA_ENCAP:
+ cfg->fc_encap = attr;
+ break;
+ case RTA_ENCAP_TYPE:
+ cfg->fc_encap_type = nla_get_u16(attr);
+ err = lwtunnel_valid_encap_type(cfg->fc_encap_type,
+ extack);
+ if (err < 0)
+ goto errout;
+ break;
+ case RTA_NH_ID:
+ cfg->fc_nh_id = nla_get_u32(attr);
+ break;
+ }
+ }
+
+ if (cfg->fc_dst_len > 32) {
+ NL_SET_ERR_MSG(extack, "Invalid prefix length");
+ err = -EINVAL;
+ goto errout;
+ }
+
+ if (cfg->fc_dst_len < 32 && (ntohl(cfg->fc_dst) << cfg->fc_dst_len)) {
+ NL_SET_ERR_MSG(extack, "Invalid prefix for given prefix length");
+ err = -EINVAL;
+ goto errout;
+ }
+
+ if (cfg->fc_nh_id) {
+ if (cfg->fc_oif || cfg->fc_gw_family ||
+ cfg->fc_encap || cfg->fc_mp) {
+ NL_SET_ERR_MSG(extack,
+ "Nexthop specification and nexthop id are mutually exclusive");
+ err = -EINVAL;
+ goto errout;
}
}
+ if (has_gw && has_via) {
+ NL_SET_ERR_MSG(extack,
+ "Nexthop configuration can not contain both GATEWAY and VIA");
+ err = -EINVAL;
+ goto errout;
+ }
+
+ if (!cfg->fc_table)
+ cfg->fc_table = RT_TABLE_MAIN;
+
return 0;
errout:
return err;
}
-int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
+static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
{
+ struct net *net = sock_net(skb->sk);
struct fib_config cfg;
struct fib_table *tb;
int err;
- err = rtm_to_fib_config(skb, nlh, &cfg);
+ err = rtm_to_fib_config(net, skb, nlh, &cfg, extack);
if (err < 0)
goto errout;
- tb = fib_get_table(cfg.fc_table);
- if (tb == NULL) {
+ rtnl_net_lock(net);
+
+ if (cfg.fc_nh_id && !nexthop_find_by_id(net, cfg.fc_nh_id)) {
+ NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
+ err = -EINVAL;
+ goto unlock;
+ }
+
+ tb = fib_get_table(net, cfg.fc_table);
+ if (!tb) {
+ NL_SET_ERR_MSG(extack, "FIB table does not exist");
err = -ESRCH;
- goto errout;
+ goto unlock;
}
- err = tb->tb_delete(tb, &cfg);
+ err = fib_table_delete(net, tb, &cfg, extack);
+unlock:
+ rtnl_net_unlock(net);
errout:
return err;
}
-int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
+static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
{
+ struct net *net = sock_net(skb->sk);
struct fib_config cfg;
struct fib_table *tb;
int err;
- err = rtm_to_fib_config(skb, nlh, &cfg);
+ err = rtm_to_fib_config(net, skb, nlh, &cfg, extack);
if (err < 0)
goto errout;
- tb = fib_new_table(cfg.fc_table);
- if (tb == NULL) {
+ rtnl_net_lock(net);
+
+ tb = fib_new_table(net, cfg.fc_table);
+ if (!tb) {
err = -ENOBUFS;
- goto errout;
+ goto unlock;
}
- err = tb->tb_insert(tb, &cfg);
+ err = fib_table_insert(net, tb, &cfg, extack);
+ if (!err && cfg.fc_type == RTN_LOCAL)
+ net->ipv4.fib_has_custom_local_routes = true;
+
+unlock:
+ rtnl_net_unlock(net);
errout:
return err;
}
-int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
+int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
+ struct fib_dump_filter *filter,
+ struct netlink_callback *cb)
+{
+ struct netlink_ext_ack *extack = cb->extack;
+ struct nlattr *tb[RTA_MAX + 1];
+ struct rtmsg *rtm;
+ int err, i;
+
+ if (filter->rtnl_held)
+ ASSERT_RTNL();
+
+ rtm = nlmsg_payload(nlh, sizeof(*rtm));
+ if (!rtm) {
+ NL_SET_ERR_MSG(extack, "Invalid header for FIB dump request");
+ return -EINVAL;
+ }
+
+ if (rtm->rtm_dst_len || rtm->rtm_src_len || rtm->rtm_tos ||
+ rtm->rtm_scope) {
+ NL_SET_ERR_MSG(extack, "Invalid values in header for FIB dump request");
+ return -EINVAL;
+ }
+
+ if (rtm->rtm_flags & ~(RTM_F_CLONED | RTM_F_PREFIX)) {
+ NL_SET_ERR_MSG(extack, "Invalid flags for FIB dump request");
+ return -EINVAL;
+ }
+ if (rtm->rtm_flags & RTM_F_CLONED)
+ filter->dump_routes = false;
+ else
+ filter->dump_exceptions = false;
+
+ filter->flags = rtm->rtm_flags;
+ filter->protocol = rtm->rtm_protocol;
+ filter->rt_type = rtm->rtm_type;
+ filter->table_id = rtm->rtm_table;
+
+ err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
+ rtm_ipv4_policy, extack);
+ if (err < 0)
+ return err;
+
+ for (i = 0; i <= RTA_MAX; ++i) {
+ int ifindex;
+
+ if (!tb[i])
+ continue;
+
+ switch (i) {
+ case RTA_TABLE:
+ filter->table_id = nla_get_u32(tb[i]);
+ break;
+ case RTA_OIF:
+ ifindex = nla_get_u32(tb[i]);
+ if (filter->rtnl_held)
+ filter->dev = __dev_get_by_index(net, ifindex);
+ else
+ filter->dev = dev_get_by_index_rcu(net, ifindex);
+ if (!filter->dev)
+ return -ENODEV;
+ break;
+ default:
+ NL_SET_ERR_MSG(extack, "Unsupported attribute in dump request");
+ return -EINVAL;
+ }
+ }
+
+ if (filter->flags || filter->protocol || filter->rt_type ||
+ filter->table_id || filter->dev) {
+ filter->filter_set = 1;
+ cb->answer_flags = NLM_F_DUMP_FILTERED;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ip_valid_fib_dump_req);
+
+static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
{
+ struct fib_dump_filter filter = {
+ .dump_routes = true,
+ .dump_exceptions = true,
+ .rtnl_held = false,
+ };
+ const struct nlmsghdr *nlh = cb->nlh;
+ struct net *net = sock_net(skb->sk);
unsigned int h, s_h;
unsigned int e = 0, s_e;
struct fib_table *tb;
- struct hlist_node *node;
- int dumped = 0;
+ struct hlist_head *head;
+ int dumped = 0, err = 0;
+
+ rcu_read_lock();
+ if (cb->strict_check) {
+ err = ip_valid_fib_dump_req(net, nlh, &filter, cb);
+ if (err < 0)
+ goto unlock;
+ } else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) {
+ struct rtmsg *rtm = nlmsg_data(nlh);
+
+ filter.flags = rtm->rtm_flags & (RTM_F_PREFIX | RTM_F_CLONED);
+ }
+
+ /* ipv4 does not use prefix flag */
+ if (filter.flags & RTM_F_PREFIX)
+ goto unlock;
- if (nlmsg_len(cb->nlh) >= sizeof(struct rtmsg) &&
- ((struct rtmsg *) nlmsg_data(cb->nlh))->rtm_flags & RTM_F_CLONED)
- return ip_rt_dump(skb, cb);
+ if (filter.table_id) {
+ tb = fib_get_table(net, filter.table_id);
+ if (!tb) {
+ if (rtnl_msg_family(cb->nlh) != PF_INET)
+ goto unlock;
+
+ NL_SET_ERR_MSG(cb->extack, "ipv4: FIB table does not exist");
+ err = -ENOENT;
+ goto unlock;
+ }
+ err = fib_table_dump(tb, skb, cb, &filter);
+ goto unlock;
+ }
s_h = cb->args[0];
s_e = cb->args[1];
+ err = 0;
for (h = s_h; h < FIB_TABLE_HASHSZ; h++, s_e = 0) {
e = 0;
- hlist_for_each_entry(tb, node, &fib_table_hash[h], tb_hlist) {
+ head = &net->ipv4.fib_table_hash[h];
+ hlist_for_each_entry_rcu(tb, head, tb_hlist) {
if (e < s_e)
goto next;
if (dumped)
memset(&cb->args[2], 0, sizeof(cb->args) -
- 2 * sizeof(cb->args[0]));
- if (tb->tb_dump(tb, skb, cb) < 0)
+ 2 * sizeof(cb->args[0]));
+ err = fib_table_dump(tb, skb, cb, &filter);
+ if (err < 0)
goto out;
dumped = 1;
next:
@@ -609,38 +1081,46 @@ next:
}
}
out:
+
cb->args[1] = e;
cb->args[0] = h;
- return skb->len;
+unlock:
+ rcu_read_unlock();
+ return err;
}
/* Prepare and feed intra-kernel routing request.
- Really, it should be netlink message, but :-( netlink
- can be not configured, so that we feed it directly
- to fib engine. It is legal, because all events occur
- only when netlink is already locked.
+ * Really, it should be netlink message, but :-( netlink
+ * can be not configured, so that we feed it directly
+ * to fib engine. It is legal, because all events occur
+ * only when netlink is already locked.
*/
-
-static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa)
+static void fib_magic(int cmd, int type, __be32 dst, int dst_len,
+ struct in_ifaddr *ifa, u32 rt_priority)
{
+ struct net *net = dev_net(ifa->ifa_dev->dev);
+ u32 tb_id = l3mdev_fib_table(ifa->ifa_dev->dev);
struct fib_table *tb;
struct fib_config cfg = {
.fc_protocol = RTPROT_KERNEL,
.fc_type = type,
.fc_dst = dst,
.fc_dst_len = dst_len,
+ .fc_priority = rt_priority,
.fc_prefsrc = ifa->ifa_local,
.fc_oif = ifa->ifa_dev->dev->ifindex,
.fc_nlflags = NLM_F_CREATE | NLM_F_APPEND,
+ .fc_nlinfo = {
+ .nl_net = net,
+ },
};
- if (type == RTN_UNICAST)
- tb = fib_new_table(RT_TABLE_MAIN);
- else
- tb = fib_new_table(RT_TABLE_LOCAL);
+ if (!tb_id)
+ tb_id = (type == RTN_UNICAST) ? RT_TABLE_MAIN : RT_TABLE_LOCAL;
- if (tb == NULL)
+ tb = fib_new_table(net, tb_id);
+ if (!tb)
return;
cfg.fc_table = tb->tb_id;
@@ -651,9 +1131,9 @@ static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifad
cfg.fc_scope = RT_SCOPE_HOST;
if (cmd == RTM_NEWROUTE)
- tb->tb_insert(tb, &cfg);
+ fib_table_insert(net, tb, &cfg, NULL);
else
- tb->tb_delete(tb, &cfg);
+ fib_table_delete(net, tb, &cfg, NULL);
}
void fib_add_ifaddr(struct in_ifaddr *ifa)
@@ -663,70 +1143,166 @@ void fib_add_ifaddr(struct in_ifaddr *ifa)
struct in_ifaddr *prim = ifa;
__be32 mask = ifa->ifa_mask;
__be32 addr = ifa->ifa_local;
- __be32 prefix = ifa->ifa_address&mask;
+ __be32 prefix = ifa->ifa_address & mask;
- if (ifa->ifa_flags&IFA_F_SECONDARY) {
+ if (ifa->ifa_flags & IFA_F_SECONDARY) {
prim = inet_ifa_byprefix(in_dev, prefix, mask);
- if (prim == NULL) {
- printk(KERN_DEBUG "fib_add_ifaddr: bug: prim == NULL\n");
+ if (!prim) {
+ pr_warn("%s: bug: prim == NULL\n", __func__);
return;
}
}
- fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim);
+ fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim, 0);
- if (!(dev->flags&IFF_UP))
+ if (!(dev->flags & IFF_UP))
return;
/* Add broadcast address, if it is explicitly assigned. */
- if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF))
- fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
+ if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF)) {
+ fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32,
+ prim, 0);
+ arp_invalidate(dev, ifa->ifa_broadcast, false);
+ }
- if (!ZERONET(prefix) && !(ifa->ifa_flags&IFA_F_SECONDARY) &&
+ if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) &&
(prefix != addr || ifa->ifa_prefixlen < 32)) {
- fib_magic(RTM_NEWROUTE, dev->flags&IFF_LOOPBACK ? RTN_LOCAL :
- RTN_UNICAST, prefix, ifa->ifa_prefixlen, prim);
+ if (!(ifa->ifa_flags & IFA_F_NOPREFIXROUTE))
+ fib_magic(RTM_NEWROUTE,
+ dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
+ prefix, ifa->ifa_prefixlen, prim,
+ ifa->ifa_rt_priority);
- /* Add network specific broadcasts, when it takes a sense */
+ /* Add the network broadcast address, when it makes sense */
if (ifa->ifa_prefixlen < 31) {
- fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32, prim);
- fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix|~mask, 32, prim);
+ fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix | ~mask,
+ 32, prim, 0);
+ arp_invalidate(dev, prefix | ~mask, false);
}
}
}
-static void fib_del_ifaddr(struct in_ifaddr *ifa)
+void fib_modify_prefix_metric(struct in_ifaddr *ifa, u32 new_metric)
+{
+ __be32 prefix = ifa->ifa_address & ifa->ifa_mask;
+ struct in_device *in_dev = ifa->ifa_dev;
+ struct net_device *dev = in_dev->dev;
+
+ if (!(dev->flags & IFF_UP) ||
+ ifa->ifa_flags & (IFA_F_SECONDARY | IFA_F_NOPREFIXROUTE) ||
+ ipv4_is_zeronet(prefix) ||
+ (prefix == ifa->ifa_local && ifa->ifa_prefixlen == 32))
+ return;
+
+ /* add the new */
+ fib_magic(RTM_NEWROUTE,
+ dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
+ prefix, ifa->ifa_prefixlen, ifa, new_metric);
+
+ /* delete the old */
+ fib_magic(RTM_DELROUTE,
+ dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
+ prefix, ifa->ifa_prefixlen, ifa, ifa->ifa_rt_priority);
+}
+
+/* Delete primary or secondary address.
+ * Optionally, on secondary address promotion consider the addresses
+ * from subnet iprim as deleted, even if they are in device list.
+ * In this case the secondary ifa can be in device list.
+ */
+void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
{
struct in_device *in_dev = ifa->ifa_dev;
struct net_device *dev = in_dev->dev;
struct in_ifaddr *ifa1;
- struct in_ifaddr *prim = ifa;
- __be32 brd = ifa->ifa_address|~ifa->ifa_mask;
- __be32 any = ifa->ifa_address&ifa->ifa_mask;
+ struct in_ifaddr *prim = ifa, *prim1 = NULL;
+ __be32 brd = ifa->ifa_address | ~ifa->ifa_mask;
+ __be32 any = ifa->ifa_address & ifa->ifa_mask;
#define LOCAL_OK 1
#define BRD_OK 2
#define BRD0_OK 4
#define BRD1_OK 8
- unsigned ok = 0;
+ unsigned int ok = 0;
+ int subnet = 0; /* Primary network */
+ int gone = 1; /* Address is missing */
+ int same_prefsrc = 0; /* Another primary with same IP */
- if (!(ifa->ifa_flags&IFA_F_SECONDARY))
- fib_magic(RTM_DELROUTE, dev->flags&IFF_LOOPBACK ? RTN_LOCAL :
- RTN_UNICAST, any, ifa->ifa_prefixlen, prim);
- else {
+ if (ifa->ifa_flags & IFA_F_SECONDARY) {
prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask);
- if (prim == NULL) {
- printk(KERN_DEBUG "fib_del_ifaddr: bug: prim == NULL\n");
+ if (!prim) {
+ /* if the device has been deleted, we don't perform
+ * address promotion
+ */
+ if (!in_dev->dead)
+ pr_warn("%s: bug: prim == NULL\n", __func__);
return;
}
+ if (iprim && iprim != prim) {
+ pr_warn("%s: bug: iprim != prim\n", __func__);
+ return;
+ }
+ } else if (!ipv4_is_zeronet(any) &&
+ (any != ifa->ifa_local || ifa->ifa_prefixlen < 32)) {
+ if (!(ifa->ifa_flags & IFA_F_NOPREFIXROUTE))
+ fib_magic(RTM_DELROUTE,
+ dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
+ any, ifa->ifa_prefixlen, prim, 0);
+ subnet = 1;
}
- /* Deletion is more complicated than add.
- We should take care of not to delete too much :-)
+ if (in_dev->dead)
+ goto no_promotions;
- Scan address list to be sure that addresses are really gone.
+ /* Deletion is more complicated than add.
+ * We should take care of not to delete too much :-)
+ *
+ * Scan address list to be sure that addresses are really gone.
*/
-
- for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) {
+ rcu_read_lock();
+ in_dev_for_each_ifa_rcu(ifa1, in_dev) {
+ if (ifa1 == ifa) {
+ /* promotion, keep the IP */
+ gone = 0;
+ continue;
+ }
+ /* Ignore IFAs from our subnet */
+ if (iprim && ifa1->ifa_mask == iprim->ifa_mask &&
+ inet_ifa_match(ifa1->ifa_address, iprim))
+ continue;
+
+ /* Ignore ifa1 if it uses different primary IP (prefsrc) */
+ if (ifa1->ifa_flags & IFA_F_SECONDARY) {
+ /* Another address from our subnet? */
+ if (ifa1->ifa_mask == prim->ifa_mask &&
+ inet_ifa_match(ifa1->ifa_address, prim))
+ prim1 = prim;
+ else {
+ /* We reached the secondaries, so
+ * same_prefsrc should be determined.
+ */
+ if (!same_prefsrc)
+ continue;
+ /* Search new prim1 if ifa1 is not
+ * using the current prim1
+ */
+ if (!prim1 ||
+ ifa1->ifa_mask != prim1->ifa_mask ||
+ !inet_ifa_match(ifa1->ifa_address, prim1))
+ prim1 = inet_ifa_byprefix(in_dev,
+ ifa1->ifa_address,
+ ifa1->ifa_mask);
+ if (!prim1)
+ continue;
+ if (prim1->ifa_local != prim->ifa_local)
+ continue;
+ }
+ } else {
+ if (prim->ifa_local != ifa1->ifa_local)
+ continue;
+ prim1 = ifa1;
+ if (prim != prim1)
+ same_prefsrc = 1;
+ }
if (ifa->ifa_local == ifa1->ifa_local)
ok |= LOCAL_OK;
if (ifa->ifa_broadcast == ifa1->ifa_broadcast)
@@ -735,27 +1311,53 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa)
ok |= BRD1_OK;
if (any == ifa1->ifa_broadcast)
ok |= BRD0_OK;
+ /* primary has network specific broadcasts */
+ if (prim1 == ifa1 && ifa1->ifa_prefixlen < 31) {
+ __be32 brd1 = ifa1->ifa_address | ~ifa1->ifa_mask;
+ __be32 any1 = ifa1->ifa_address & ifa1->ifa_mask;
+
+ if (!ipv4_is_zeronet(any1)) {
+ if (ifa->ifa_broadcast == brd1 ||
+ ifa->ifa_broadcast == any1)
+ ok |= BRD_OK;
+ if (brd == brd1 || brd == any1)
+ ok |= BRD1_OK;
+ if (any == brd1 || any == any1)
+ ok |= BRD0_OK;
+ }
+ }
}
+ rcu_read_unlock();
+
+no_promotions:
+ if (!(ok & BRD_OK))
+ fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32,
+ prim, 0);
+ if (subnet && ifa->ifa_prefixlen < 31) {
+ if (!(ok & BRD1_OK))
+ fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32,
+ prim, 0);
+ if (!(ok & BRD0_OK))
+ fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32,
+ prim, 0);
+ }
+ if (!(ok & LOCAL_OK)) {
+ unsigned int addr_type;
- if (!(ok&BRD_OK))
- fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
- if (!(ok&BRD1_OK))
- fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim);
- if (!(ok&BRD0_OK))
- fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim);
- if (!(ok&LOCAL_OK)) {
- fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim);
+ fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim, 0);
/* Check, that this local address finally disappeared. */
- if (inet_addr_type(ifa->ifa_local) != RTN_LOCAL) {
+ addr_type = inet_addr_type_dev_table(dev_net(dev), dev,
+ ifa->ifa_local);
+ if (gone && addr_type != RTN_LOCAL) {
/* And the last, but not the least thing.
- We must flush stray FIB entries.
-
- First of all, we scan fib_info list searching
- for stray nexthop entries, then ignite fib_flush.
- */
- if (fib_sync_down(ifa->ifa_local, NULL, 0))
- fib_flush();
+ * We must flush stray FIB entries.
+ *
+ * First of all, we scan fib_info list searching
+ * for stray nexthop entries, then ignite fib_flush.
+ */
+ if (fib_sync_down_addr(dev, ifa->ifa_local))
+ fib_flush(dev_net(dev));
}
}
#undef LOCAL_OK
@@ -764,19 +1366,28 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa)
#undef BRD1_OK
}
-static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb )
+static void nl_fib_lookup(struct net *net, struct fib_result_nl *frn)
{
-
+
struct fib_result res;
- struct flowi fl = { .mark = frn->fl_mark,
- .nl_u = { .ip4_u = { .daddr = frn->fl_addr,
- .tos = frn->fl_tos,
- .scope = frn->fl_scope } } };
+ struct flowi4 fl4 = {
+ .flowi4_mark = frn->fl_mark,
+ .daddr = frn->fl_addr,
+ .flowi4_dscp = inet_dsfield_to_dscp(frn->fl_tos),
+ .flowi4_scope = frn->fl_scope,
+ };
+ struct fib_table *tb;
+
+ rcu_read_lock();
+
+ tb = fib_get_table(net, frn->tb_id_in);
+
+ frn->err = -ENOENT;
if (tb) {
local_bh_disable();
frn->tb_id = tb->tb_id;
- frn->err = tb->tb_lookup(tb, &fl, &res);
+ frn->err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
if (!frn->err) {
frn->prefixlen = res.prefixlen;
@@ -786,69 +1397,93 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb )
}
local_bh_enable();
}
+
+ rcu_read_unlock();
}
-static void nl_fib_input(struct sock *sk, int len)
+static void nl_fib_input(struct sk_buff *skb)
{
- struct sk_buff *skb = NULL;
- struct nlmsghdr *nlh = NULL;
+ struct net *net;
struct fib_result_nl *frn;
- u32 pid;
- struct fib_table *tb;
-
- skb = skb_dequeue(&sk->sk_receive_queue);
- nlh = (struct nlmsghdr *)skb->data;
- if (skb->len < NLMSG_SPACE(0) || skb->len < nlh->nlmsg_len ||
- nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*frn))) {
- kfree_skb(skb);
+ struct nlmsghdr *nlh;
+ u32 portid;
+
+ net = sock_net(skb->sk);
+ nlh = nlmsg_hdr(skb);
+ if (skb->len < nlmsg_total_size(sizeof(*frn)) ||
+ skb->len < nlh->nlmsg_len ||
+ nlmsg_len(nlh) < sizeof(*frn))
return;
- }
-
- frn = (struct fib_result_nl *) NLMSG_DATA(nlh);
- tb = fib_get_table(frn->tb_id_in);
- nl_fib_lookup(frn, tb);
-
- pid = nlh->nlmsg_pid; /*pid of sending process */
- NETLINK_CB(skb).pid = 0; /* from kernel */
+ skb = netlink_skb_clone(skb, GFP_KERNEL);
+ if (!skb)
+ return;
+ nlh = nlmsg_hdr(skb);
+
+ frn = nlmsg_data(nlh);
+ nl_fib_lookup(net, frn);
+
+ portid = NETLINK_CB(skb).portid; /* netlink portid */
+ NETLINK_CB(skb).portid = 0; /* from kernel */
NETLINK_CB(skb).dst_group = 0; /* unicast */
- netlink_unicast(sk, skb, pid, MSG_DONTWAIT);
-}
+ nlmsg_unicast(net->ipv4.fibnl, skb, portid);
+}
+
+static int __net_init nl_fib_lookup_init(struct net *net)
+{
+ struct sock *sk;
+ struct netlink_kernel_cfg cfg = {
+ .input = nl_fib_input,
+ };
+
+ sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, &cfg);
+ if (!sk)
+ return -EAFNOSUPPORT;
+ net->ipv4.fibnl = sk;
+ return 0;
+}
-static void nl_fib_lookup_init(void)
+static void nl_fib_lookup_exit(struct net *net)
{
- netlink_kernel_create(NETLINK_FIB_LOOKUP, 0, nl_fib_input, THIS_MODULE);
+ netlink_kernel_release(net->ipv4.fibnl);
+ net->ipv4.fibnl = NULL;
}
-static void fib_disable_ip(struct net_device *dev, int force)
+static void fib_disable_ip(struct net_device *dev, unsigned long event,
+ bool force)
{
- if (fib_sync_down(0, dev, force))
- fib_flush();
- rt_cache_flush(0);
+ if (fib_sync_down_dev(dev, event, force))
+ fib_flush(dev_net(dev));
+ else
+ rt_cache_flush(dev_net(dev));
arp_ifdown(dev);
}
static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr)
{
- struct in_ifaddr *ifa = (struct in_ifaddr*)ptr;
+ struct in_ifaddr *ifa = ptr;
+ struct net_device *dev = ifa->ifa_dev->dev;
+ struct net *net = dev_net(dev);
switch (event) {
case NETDEV_UP:
fib_add_ifaddr(ifa);
#ifdef CONFIG_IP_ROUTE_MULTIPATH
- fib_sync_up(ifa->ifa_dev->dev);
+ fib_sync_up(dev, RTNH_F_DEAD);
#endif
- rt_cache_flush(-1);
+ atomic_inc(&net->ipv4.dev_addr_genid);
+ rt_cache_flush(net);
break;
case NETDEV_DOWN:
- fib_del_ifaddr(ifa);
- if (ifa->ifa_dev->ifa_list == NULL) {
+ fib_del_ifaddr(ifa, NULL);
+ atomic_inc(&net->ipv4.dev_addr_genid);
+ if (!ifa->ifa_dev->ifa_list) {
/* Last address was deleted from this interface.
- Disable IP.
+ * Disable IP.
*/
- fib_disable_ip(ifa->ifa_dev->dev, 1);
+ fib_disable_ip(dev, event, true);
} else {
- rt_cache_flush(-1);
+ rt_cache_flush(net);
}
break;
}
@@ -857,65 +1492,222 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
{
- struct net_device *dev = ptr;
- struct in_device *in_dev = __in_dev_get_rtnl(dev);
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct netdev_notifier_changeupper_info *upper_info = ptr;
+ struct netdev_notifier_info_ext *info_ext = ptr;
+ struct in_device *in_dev;
+ struct net *net = dev_net(dev);
+ struct in_ifaddr *ifa;
+ unsigned int flags;
if (event == NETDEV_UNREGISTER) {
- fib_disable_ip(dev, 2);
+ fib_disable_ip(dev, event, true);
+ rt_flush_dev(dev);
return NOTIFY_DONE;
}
+ in_dev = __in_dev_get_rtnl(dev);
if (!in_dev)
return NOTIFY_DONE;
switch (event) {
case NETDEV_UP:
- for_ifa(in_dev) {
+ in_dev_for_each_ifa_rtnl(ifa, in_dev) {
fib_add_ifaddr(ifa);
- } endfor_ifa(in_dev);
+ }
#ifdef CONFIG_IP_ROUTE_MULTIPATH
- fib_sync_up(dev);
+ fib_sync_up(dev, RTNH_F_DEAD);
#endif
- rt_cache_flush(-1);
+ atomic_inc(&net->ipv4.dev_addr_genid);
+ rt_cache_flush(net);
break;
case NETDEV_DOWN:
- fib_disable_ip(dev, 0);
+ fib_disable_ip(dev, event, false);
break;
- case NETDEV_CHANGEMTU:
case NETDEV_CHANGE:
- rt_cache_flush(0);
+ flags = netif_get_flags(dev);
+ if (flags & (IFF_RUNNING | IFF_LOWER_UP))
+ fib_sync_up(dev, RTNH_F_LINKDOWN);
+ else
+ fib_sync_down_dev(dev, event, false);
+ rt_cache_flush(net);
+ break;
+ case NETDEV_CHANGEMTU:
+ fib_sync_mtu(dev, info_ext->ext.mtu);
+ rt_cache_flush(net);
+ break;
+ case NETDEV_CHANGEUPPER:
+ upper_info = ptr;
+ /* flush all routes if dev is linked to or unlinked from
+ * an L3 master device (e.g., VRF)
+ */
+ if (upper_info->upper_dev &&
+ netif_is_l3_master(upper_info->upper_dev))
+ fib_disable_ip(dev, NETDEV_DOWN, true);
break;
}
return NOTIFY_DONE;
}
static struct notifier_block fib_inetaddr_notifier = {
- .notifier_call =fib_inetaddr_event,
+ .notifier_call = fib_inetaddr_event,
};
static struct notifier_block fib_netdev_notifier = {
- .notifier_call =fib_netdev_event,
+ .notifier_call = fib_netdev_event,
};
-void __init ip_fib_init(void)
+static int __net_init ip_fib_net_init(struct net *net)
{
- unsigned int i;
+ int err;
+ size_t size = sizeof(struct hlist_head) * FIB_TABLE_HASHSZ;
- for (i = 0; i < FIB_TABLE_HASHSZ; i++)
- INIT_HLIST_HEAD(&fib_table_hash[i]);
-#ifndef CONFIG_IP_MULTIPLE_TABLES
- ip_fib_local_table = fib_hash_init(RT_TABLE_LOCAL);
- hlist_add_head_rcu(&ip_fib_local_table->tb_hlist, &fib_table_hash[0]);
- ip_fib_main_table = fib_hash_init(RT_TABLE_MAIN);
- hlist_add_head_rcu(&ip_fib_main_table->tb_hlist, &fib_table_hash[0]);
-#else
- fib4_rules_init();
+ err = fib4_notifier_init(net);
+ if (err)
+ return err;
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ /* Default to 3-tuple */
+ net->ipv4.sysctl_fib_multipath_hash_fields =
+ FIB_MULTIPATH_HASH_FIELD_DEFAULT_MASK;
+#endif
+
+ /* Avoid false sharing : Use at least a full cache line */
+ size = max_t(size_t, size, L1_CACHE_BYTES);
+
+ net->ipv4.fib_table_hash = kzalloc(size, GFP_KERNEL);
+ if (!net->ipv4.fib_table_hash) {
+ err = -ENOMEM;
+ goto err_table_hash_alloc;
+ }
+
+ err = fib4_rules_init(net);
+ if (err < 0)
+ goto err_rules_init;
+ return 0;
+
+err_rules_init:
+ kfree(net->ipv4.fib_table_hash);
+err_table_hash_alloc:
+ fib4_notifier_exit(net);
+ return err;
+}
+
+static void ip_fib_net_exit(struct net *net)
+{
+ int i;
+
+ ASSERT_RTNL_NET(net);
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+ RCU_INIT_POINTER(net->ipv4.fib_main, NULL);
+ RCU_INIT_POINTER(net->ipv4.fib_default, NULL);
+#endif
+ /* Destroy the tables in reverse order to guarantee that the
+ * local table, ID 255, is destroyed before the main table, ID
+ * 254. This is necessary as the local table may contain
+ * references to data contained in the main table.
+ */
+ for (i = FIB_TABLE_HASHSZ - 1; i >= 0; i--) {
+ struct hlist_head *head = &net->ipv4.fib_table_hash[i];
+ struct hlist_node *tmp;
+ struct fib_table *tb;
+
+ hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) {
+ hlist_del(&tb->tb_hlist);
+ fib_table_flush(net, tb, true);
+ fib_free_table(tb);
+ }
+ }
+
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+ fib4_rules_exit(net);
#endif
+ kfree(net->ipv4.fib_table_hash);
+ fib4_notifier_exit(net);
+}
+
+static int __net_init fib_net_init(struct net *net)
+{
+ int error;
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ atomic_set(&net->ipv4.fib_num_tclassid_users, 0);
+#endif
+ error = ip_fib_net_init(net);
+ if (error < 0)
+ goto out;
+
+ error = fib4_semantics_init(net);
+ if (error)
+ goto out_semantics;
+
+ error = nl_fib_lookup_init(net);
+ if (error < 0)
+ goto out_nlfl;
+
+ error = fib_proc_init(net);
+ if (error < 0)
+ goto out_proc;
+out:
+ return error;
+
+out_proc:
+ nl_fib_lookup_exit(net);
+out_nlfl:
+ fib4_semantics_exit(net);
+out_semantics:
+ rtnl_net_lock(net);
+ ip_fib_net_exit(net);
+ rtnl_net_unlock(net);
+ goto out;
+}
+
+static void __net_exit fib_net_exit(struct net *net)
+{
+ fib_proc_exit(net);
+ nl_fib_lookup_exit(net);
+}
+
+static void __net_exit fib_net_exit_batch(struct list_head *net_list)
+{
+ struct net *net;
+
+ rtnl_lock();
+ list_for_each_entry(net, net_list, exit_list) {
+ __rtnl_net_lock(net);
+ ip_fib_net_exit(net);
+ __rtnl_net_unlock(net);
+ }
+ rtnl_unlock();
+
+ list_for_each_entry(net, net_list, exit_list)
+ fib4_semantics_exit(net);
+}
+
+static struct pernet_operations fib_net_ops = {
+ .init = fib_net_init,
+ .exit = fib_net_exit,
+ .exit_batch = fib_net_exit_batch,
+};
+
+static const struct rtnl_msg_handler fib_rtnl_msg_handlers[] __initconst = {
+ {.protocol = PF_INET, .msgtype = RTM_NEWROUTE,
+ .doit = inet_rtm_newroute, .flags = RTNL_FLAG_DOIT_PERNET},
+ {.protocol = PF_INET, .msgtype = RTM_DELROUTE,
+ .doit = inet_rtm_delroute, .flags = RTNL_FLAG_DOIT_PERNET},
+ {.protocol = PF_INET, .msgtype = RTM_GETROUTE, .dumpit = inet_dump_fib,
+ .flags = RTNL_FLAG_DUMP_UNLOCKED | RTNL_FLAG_DUMP_SPLIT_NLM_DONE},
+};
+
+void __init ip_fib_init(void)
+{
+ fib_trie_init();
+
+ register_pernet_subsys(&fib_net_ops);
+
register_netdevice_notifier(&fib_netdev_notifier);
register_inetaddr_notifier(&fib_inetaddr_notifier);
- nl_fib_lookup_init();
-}
-EXPORT_SYMBOL(inet_addr_type);
-EXPORT_SYMBOL(ip_dev_find);
+ rtnl_register_many(fib_rtnl_msg_handlers);
+}
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
deleted file mode 100644
index 107bb6cbb0b3..000000000000
--- a/net/ipv4/fib_hash.c
+++ /dev/null
@@ -1,1079 +0,0 @@
-/*
- * INET An implementation of the TCP/IP protocol suite for the LINUX
- * operating system. INET is implemented using the BSD Socket
- * interface as the means of communication with the user level.
- *
- * IPv4 FIB: lookup engine and maintenance routines.
- *
- * Version: $Id: fib_hash.c,v 1.13 2001/10/31 21:55:54 davem Exp $
- *
- * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <asm/uaccess.h>
-#include <asm/system.h>
-#include <linux/bitops.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/string.h>
-#include <linux/socket.h>
-#include <linux/sockios.h>
-#include <linux/errno.h>
-#include <linux/in.h>
-#include <linux/inet.h>
-#include <linux/inetdevice.h>
-#include <linux/netdevice.h>
-#include <linux/if_arp.h>
-#include <linux/proc_fs.h>
-#include <linux/skbuff.h>
-#include <linux/netlink.h>
-#include <linux/init.h>
-
-#include <net/ip.h>
-#include <net/protocol.h>
-#include <net/route.h>
-#include <net/tcp.h>
-#include <net/sock.h>
-#include <net/ip_fib.h>
-
-#include "fib_lookup.h"
-
-static kmem_cache_t *fn_hash_kmem __read_mostly;
-static kmem_cache_t *fn_alias_kmem __read_mostly;
-
-struct fib_node {
- struct hlist_node fn_hash;
- struct list_head fn_alias;
- __be32 fn_key;
-};
-
-struct fn_zone {
- struct fn_zone *fz_next; /* Next not empty zone */
- struct hlist_head *fz_hash; /* Hash table pointer */
- int fz_nent; /* Number of entries */
-
- int fz_divisor; /* Hash divisor */
- u32 fz_hashmask; /* (fz_divisor - 1) */
-#define FZ_HASHMASK(fz) ((fz)->fz_hashmask)
-
- int fz_order; /* Zone order */
- __be32 fz_mask;
-#define FZ_MASK(fz) ((fz)->fz_mask)
-};
-
-/* NOTE. On fast computers evaluation of fz_hashmask and fz_mask
- * can be cheaper than memory lookup, so that FZ_* macros are used.
- */
-
-struct fn_hash {
- struct fn_zone *fn_zones[33];
- struct fn_zone *fn_zone_list;
-};
-
-static inline u32 fn_hash(__be32 key, struct fn_zone *fz)
-{
- u32 h = ntohl(key)>>(32 - fz->fz_order);
- h ^= (h>>20);
- h ^= (h>>10);
- h ^= (h>>5);
- h &= FZ_HASHMASK(fz);
- return h;
-}
-
-static inline __be32 fz_key(__be32 dst, struct fn_zone *fz)
-{
- return dst & FZ_MASK(fz);
-}
-
-static DEFINE_RWLOCK(fib_hash_lock);
-static unsigned int fib_hash_genid;
-
-#define FZ_MAX_DIVISOR ((PAGE_SIZE<<MAX_ORDER) / sizeof(struct hlist_head))
-
-static struct hlist_head *fz_hash_alloc(int divisor)
-{
- unsigned long size = divisor * sizeof(struct hlist_head);
-
- if (size <= PAGE_SIZE) {
- return kmalloc(size, GFP_KERNEL);
- } else {
- return (struct hlist_head *)
- __get_free_pages(GFP_KERNEL, get_order(size));
- }
-}
-
-/* The fib hash lock must be held when this is called. */
-static inline void fn_rebuild_zone(struct fn_zone *fz,
- struct hlist_head *old_ht,
- int old_divisor)
-{
- int i;
-
- for (i = 0; i < old_divisor; i++) {
- struct hlist_node *node, *n;
- struct fib_node *f;
-
- hlist_for_each_entry_safe(f, node, n, &old_ht[i], fn_hash) {
- struct hlist_head *new_head;
-
- hlist_del(&f->fn_hash);
-
- new_head = &fz->fz_hash[fn_hash(f->fn_key, fz)];
- hlist_add_head(&f->fn_hash, new_head);
- }
- }
-}
-
-static void fz_hash_free(struct hlist_head *hash, int divisor)
-{
- unsigned long size = divisor * sizeof(struct hlist_head);
-
- if (size <= PAGE_SIZE)
- kfree(hash);
- else
- free_pages((unsigned long)hash, get_order(size));
-}
-
-static void fn_rehash_zone(struct fn_zone *fz)
-{
- struct hlist_head *ht, *old_ht;
- int old_divisor, new_divisor;
- u32 new_hashmask;
-
- old_divisor = fz->fz_divisor;
-
- switch (old_divisor) {
- case 16:
- new_divisor = 256;
- break;
- case 256:
- new_divisor = 1024;
- break;
- default:
- if ((old_divisor << 1) > FZ_MAX_DIVISOR) {
- printk(KERN_CRIT "route.c: bad divisor %d!\n", old_divisor);
- return;
- }
- new_divisor = (old_divisor << 1);
- break;
- }
-
- new_hashmask = (new_divisor - 1);
-
-#if RT_CACHE_DEBUG >= 2
- printk("fn_rehash_zone: hash for zone %d grows from %d\n", fz->fz_order, old_divisor);
-#endif
-
- ht = fz_hash_alloc(new_divisor);
-
- if (ht) {
- memset(ht, 0, new_divisor * sizeof(struct hlist_head));
-
- write_lock_bh(&fib_hash_lock);
- old_ht = fz->fz_hash;
- fz->fz_hash = ht;
- fz->fz_hashmask = new_hashmask;
- fz->fz_divisor = new_divisor;
- fn_rebuild_zone(fz, old_ht, old_divisor);
- fib_hash_genid++;
- write_unlock_bh(&fib_hash_lock);
-
- fz_hash_free(old_ht, old_divisor);
- }
-}
-
-static inline void fn_free_node(struct fib_node * f)
-{
- kmem_cache_free(fn_hash_kmem, f);
-}
-
-static inline void fn_free_alias(struct fib_alias *fa)
-{
- fib_release_info(fa->fa_info);
- kmem_cache_free(fn_alias_kmem, fa);
-}
-
-static struct fn_zone *
-fn_new_zone(struct fn_hash *table, int z)
-{
- int i;
- struct fn_zone *fz = kzalloc(sizeof(struct fn_zone), GFP_KERNEL);
- if (!fz)
- return NULL;
-
- if (z) {
- fz->fz_divisor = 16;
- } else {
- fz->fz_divisor = 1;
- }
- fz->fz_hashmask = (fz->fz_divisor - 1);
- fz->fz_hash = fz_hash_alloc(fz->fz_divisor);
- if (!fz->fz_hash) {
- kfree(fz);
- return NULL;
- }
- memset(fz->fz_hash, 0, fz->fz_divisor * sizeof(struct hlist_head *));
- fz->fz_order = z;
- fz->fz_mask = inet_make_mask(z);
-
- /* Find the first not empty zone with more specific mask */
- for (i=z+1; i<=32; i++)
- if (table->fn_zones[i])
- break;
- write_lock_bh(&fib_hash_lock);
- if (i>32) {
- /* No more specific masks, we are the first. */
- fz->fz_next = table->fn_zone_list;
- table->fn_zone_list = fz;
- } else {
- fz->fz_next = table->fn_zones[i]->fz_next;
- table->fn_zones[i]->fz_next = fz;
- }
- table->fn_zones[z] = fz;
- fib_hash_genid++;
- write_unlock_bh(&fib_hash_lock);
- return fz;
-}
-
-static int
-fn_hash_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
-{
- int err;
- struct fn_zone *fz;
- struct fn_hash *t = (struct fn_hash*)tb->tb_data;
-
- read_lock(&fib_hash_lock);
- for (fz = t->fn_zone_list; fz; fz = fz->fz_next) {
- struct hlist_head *head;
- struct hlist_node *node;
- struct fib_node *f;
- __be32 k = fz_key(flp->fl4_dst, fz);
-
- head = &fz->fz_hash[fn_hash(k, fz)];
- hlist_for_each_entry(f, node, head, fn_hash) {
- if (f->fn_key != k)
- continue;
-
- err = fib_semantic_match(&f->fn_alias,
- flp, res,
- f->fn_key, fz->fz_mask,
- fz->fz_order);
- if (err <= 0)
- goto out;
- }
- }
- err = 1;
-out:
- read_unlock(&fib_hash_lock);
- return err;
-}
-
-static int fn_hash_last_dflt=-1;
-
-static void
-fn_hash_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
-{
- int order, last_idx;
- struct hlist_node *node;
- struct fib_node *f;
- struct fib_info *fi = NULL;
- struct fib_info *last_resort;
- struct fn_hash *t = (struct fn_hash*)tb->tb_data;
- struct fn_zone *fz = t->fn_zones[0];
-
- if (fz == NULL)
- return;
-
- last_idx = -1;
- last_resort = NULL;
- order = -1;
-
- read_lock(&fib_hash_lock);
- hlist_for_each_entry(f, node, &fz->fz_hash[0], fn_hash) {
- struct fib_alias *fa;
-
- list_for_each_entry(fa, &f->fn_alias, fa_list) {
- struct fib_info *next_fi = fa->fa_info;
-
- if (fa->fa_scope != res->scope ||
- fa->fa_type != RTN_UNICAST)
- continue;
-
- if (next_fi->fib_priority > res->fi->fib_priority)
- break;
- if (!next_fi->fib_nh[0].nh_gw ||
- next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
- continue;
- fa->fa_state |= FA_S_ACCESSED;
-
- if (fi == NULL) {
- if (next_fi != res->fi)
- break;
- } else if (!fib_detect_death(fi, order, &last_resort,
- &last_idx, &fn_hash_last_dflt)) {
- if (res->fi)
- fib_info_put(res->fi);
- res->fi = fi;
- atomic_inc(&fi->fib_clntref);
- fn_hash_last_dflt = order;
- goto out;
- }
- fi = next_fi;
- order++;
- }
- }
-
- if (order <= 0 || fi == NULL) {
- fn_hash_last_dflt = -1;
- goto out;
- }
-
- if (!fib_detect_death(fi, order, &last_resort, &last_idx, &fn_hash_last_dflt)) {
- if (res->fi)
- fib_info_put(res->fi);
- res->fi = fi;
- atomic_inc(&fi->fib_clntref);
- fn_hash_last_dflt = order;
- goto out;
- }
-
- if (last_idx >= 0) {
- if (res->fi)
- fib_info_put(res->fi);
- res->fi = last_resort;
- if (last_resort)
- atomic_inc(&last_resort->fib_clntref);
- }
- fn_hash_last_dflt = last_idx;
-out:
- read_unlock(&fib_hash_lock);
-}
-
-/* Insert node F to FZ. */
-static inline void fib_insert_node(struct fn_zone *fz, struct fib_node *f)
-{
- struct hlist_head *head = &fz->fz_hash[fn_hash(f->fn_key, fz)];
-
- hlist_add_head(&f->fn_hash, head);
-}
-
-/* Return the node in FZ matching KEY. */
-static struct fib_node *fib_find_node(struct fn_zone *fz, __be32 key)
-{
- struct hlist_head *head = &fz->fz_hash[fn_hash(key, fz)];
- struct hlist_node *node;
- struct fib_node *f;
-
- hlist_for_each_entry(f, node, head, fn_hash) {
- if (f->fn_key == key)
- return f;
- }
-
- return NULL;
-}
-
-static int fn_hash_insert(struct fib_table *tb, struct fib_config *cfg)
-{
- struct fn_hash *table = (struct fn_hash *) tb->tb_data;
- struct fib_node *new_f, *f;
- struct fib_alias *fa, *new_fa;
- struct fn_zone *fz;
- struct fib_info *fi;
- u8 tos = cfg->fc_tos;
- __be32 key;
- int err;
-
- if (cfg->fc_dst_len > 32)
- return -EINVAL;
-
- fz = table->fn_zones[cfg->fc_dst_len];
- if (!fz && !(fz = fn_new_zone(table, cfg->fc_dst_len)))
- return -ENOBUFS;
-
- key = 0;
- if (cfg->fc_dst) {
- if (cfg->fc_dst & ~FZ_MASK(fz))
- return -EINVAL;
- key = fz_key(cfg->fc_dst, fz);
- }
-
- fi = fib_create_info(cfg);
- if (IS_ERR(fi))
- return PTR_ERR(fi);
-
- if (fz->fz_nent > (fz->fz_divisor<<1) &&
- fz->fz_divisor < FZ_MAX_DIVISOR &&
- (cfg->fc_dst_len == 32 ||
- (1 << cfg->fc_dst_len) > fz->fz_divisor))
- fn_rehash_zone(fz);
-
- f = fib_find_node(fz, key);
-
- if (!f)
- fa = NULL;
- else
- fa = fib_find_alias(&f->fn_alias, tos, fi->fib_priority);
-
- /* Now fa, if non-NULL, points to the first fib alias
- * with the same keys [prefix,tos,priority], if such key already
- * exists or to the node before which we will insert new one.
- *
- * If fa is NULL, we will need to allocate a new one and
- * insert to the head of f.
- *
- * If f is NULL, no fib node matched the destination key
- * and we need to allocate a new one of those as well.
- */
-
- if (fa && fa->fa_tos == tos &&
- fa->fa_info->fib_priority == fi->fib_priority) {
- struct fib_alias *fa_orig;
-
- err = -EEXIST;
- if (cfg->fc_nlflags & NLM_F_EXCL)
- goto out;
-
- if (cfg->fc_nlflags & NLM_F_REPLACE) {
- struct fib_info *fi_drop;
- u8 state;
-
- write_lock_bh(&fib_hash_lock);
- fi_drop = fa->fa_info;
- fa->fa_info = fi;
- fa->fa_type = cfg->fc_type;
- fa->fa_scope = cfg->fc_scope;
- state = fa->fa_state;
- fa->fa_state &= ~FA_S_ACCESSED;
- fib_hash_genid++;
- write_unlock_bh(&fib_hash_lock);
-
- fib_release_info(fi_drop);
- if (state & FA_S_ACCESSED)
- rt_cache_flush(-1);
- return 0;
- }
-
- /* Error if we find a perfect match which
- * uses the same scope, type, and nexthop
- * information.
- */
- fa_orig = fa;
- fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
- list_for_each_entry_continue(fa, &f->fn_alias, fa_list) {
- if (fa->fa_tos != tos)
- break;
- if (fa->fa_info->fib_priority != fi->fib_priority)
- break;
- if (fa->fa_type == cfg->fc_type &&
- fa->fa_scope == cfg->fc_scope &&
- fa->fa_info == fi)
- goto out;
- }
- if (!(cfg->fc_nlflags & NLM_F_APPEND))
- fa = fa_orig;
- }
-
- err = -ENOENT;
- if (!(cfg->fc_nlflags & NLM_F_CREATE))
- goto out;
-
- err = -ENOBUFS;
- new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL);
- if (new_fa == NULL)
- goto out;
-
- new_f = NULL;
- if (!f) {
- new_f = kmem_cache_alloc(fn_hash_kmem, SLAB_KERNEL);
- if (new_f == NULL)
- goto out_free_new_fa;
-
- INIT_HLIST_NODE(&new_f->fn_hash);
- INIT_LIST_HEAD(&new_f->fn_alias);
- new_f->fn_key = key;
- f = new_f;
- }
-
- new_fa->fa_info = fi;
- new_fa->fa_tos = tos;
- new_fa->fa_type = cfg->fc_type;
- new_fa->fa_scope = cfg->fc_scope;
- new_fa->fa_state = 0;
-
- /*
- * Insert new entry to the list.
- */
-
- write_lock_bh(&fib_hash_lock);
- if (new_f)
- fib_insert_node(fz, new_f);
- list_add_tail(&new_fa->fa_list,
- (fa ? &fa->fa_list : &f->fn_alias));
- fib_hash_genid++;
- write_unlock_bh(&fib_hash_lock);
-
- if (new_f)
- fz->fz_nent++;
- rt_cache_flush(-1);
-
- rtmsg_fib(RTM_NEWROUTE, key, new_fa, cfg->fc_dst_len, tb->tb_id,
- &cfg->fc_nlinfo);
- return 0;
-
-out_free_new_fa:
- kmem_cache_free(fn_alias_kmem, new_fa);
-out:
- fib_release_info(fi);
- return err;
-}
-
-
-static int fn_hash_delete(struct fib_table *tb, struct fib_config *cfg)
-{
- struct fn_hash *table = (struct fn_hash*)tb->tb_data;
- struct fib_node *f;
- struct fib_alias *fa, *fa_to_delete;
- struct fn_zone *fz;
- __be32 key;
-
- if (cfg->fc_dst_len > 32)
- return -EINVAL;
-
- if ((fz = table->fn_zones[cfg->fc_dst_len]) == NULL)
- return -ESRCH;
-
- key = 0;
- if (cfg->fc_dst) {
- if (cfg->fc_dst & ~FZ_MASK(fz))
- return -EINVAL;
- key = fz_key(cfg->fc_dst, fz);
- }
-
- f = fib_find_node(fz, key);
-
- if (!f)
- fa = NULL;
- else
- fa = fib_find_alias(&f->fn_alias, cfg->fc_tos, 0);
- if (!fa)
- return -ESRCH;
-
- fa_to_delete = NULL;
- fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
- list_for_each_entry_continue(fa, &f->fn_alias, fa_list) {
- struct fib_info *fi = fa->fa_info;
-
- if (fa->fa_tos != cfg->fc_tos)
- break;
-
- if ((!cfg->fc_type ||
- fa->fa_type == cfg->fc_type) &&
- (cfg->fc_scope == RT_SCOPE_NOWHERE ||
- fa->fa_scope == cfg->fc_scope) &&
- (!cfg->fc_protocol ||
- fi->fib_protocol == cfg->fc_protocol) &&
- fib_nh_match(cfg, fi) == 0) {
- fa_to_delete = fa;
- break;
- }
- }
-
- if (fa_to_delete) {
- int kill_fn;
-
- fa = fa_to_delete;
- rtmsg_fib(RTM_DELROUTE, key, fa, cfg->fc_dst_len,
- tb->tb_id, &cfg->fc_nlinfo);
-
- kill_fn = 0;
- write_lock_bh(&fib_hash_lock);
- list_del(&fa->fa_list);
- if (list_empty(&f->fn_alias)) {
- hlist_del(&f->fn_hash);
- kill_fn = 1;
- }
- fib_hash_genid++;
- write_unlock_bh(&fib_hash_lock);
-
- if (fa->fa_state & FA_S_ACCESSED)
- rt_cache_flush(-1);
- fn_free_alias(fa);
- if (kill_fn) {
- fn_free_node(f);
- fz->fz_nent--;
- }
-
- return 0;
- }
- return -ESRCH;
-}
-
-static int fn_flush_list(struct fn_zone *fz, int idx)
-{
- struct hlist_head *head = &fz->fz_hash[idx];
- struct hlist_node *node, *n;
- struct fib_node *f;
- int found = 0;
-
- hlist_for_each_entry_safe(f, node, n, head, fn_hash) {
- struct fib_alias *fa, *fa_node;
- int kill_f;
-
- kill_f = 0;
- list_for_each_entry_safe(fa, fa_node, &f->fn_alias, fa_list) {
- struct fib_info *fi = fa->fa_info;
-
- if (fi && (fi->fib_flags&RTNH_F_DEAD)) {
- write_lock_bh(&fib_hash_lock);
- list_del(&fa->fa_list);
- if (list_empty(&f->fn_alias)) {
- hlist_del(&f->fn_hash);
- kill_f = 1;
- }
- fib_hash_genid++;
- write_unlock_bh(&fib_hash_lock);
-
- fn_free_alias(fa);
- found++;
- }
- }
- if (kill_f) {
- fn_free_node(f);
- fz->fz_nent--;
- }
- }
- return found;
-}
-
-static int fn_hash_flush(struct fib_table *tb)
-{
- struct fn_hash *table = (struct fn_hash *) tb->tb_data;
- struct fn_zone *fz;
- int found = 0;
-
- for (fz = table->fn_zone_list; fz; fz = fz->fz_next) {
- int i;
-
- for (i = fz->fz_divisor - 1; i >= 0; i--)
- found += fn_flush_list(fz, i);
- }
- return found;
-}
-
-
-static inline int
-fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb,
- struct fib_table *tb,
- struct fn_zone *fz,
- struct hlist_head *head)
-{
- struct hlist_node *node;
- struct fib_node *f;
- int i, s_i;
-
- s_i = cb->args[4];
- i = 0;
- hlist_for_each_entry(f, node, head, fn_hash) {
- struct fib_alias *fa;
-
- list_for_each_entry(fa, &f->fn_alias, fa_list) {
- if (i < s_i)
- goto next;
-
- if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
- cb->nlh->nlmsg_seq,
- RTM_NEWROUTE,
- tb->tb_id,
- fa->fa_type,
- fa->fa_scope,
- f->fn_key,
- fz->fz_order,
- fa->fa_tos,
- fa->fa_info,
- NLM_F_MULTI) < 0) {
- cb->args[4] = i;
- return -1;
- }
- next:
- i++;
- }
- }
- cb->args[4] = i;
- return skb->len;
-}
-
-static inline int
-fn_hash_dump_zone(struct sk_buff *skb, struct netlink_callback *cb,
- struct fib_table *tb,
- struct fn_zone *fz)
-{
- int h, s_h;
-
- s_h = cb->args[3];
- for (h=0; h < fz->fz_divisor; h++) {
- if (h < s_h) continue;
- if (h > s_h)
- memset(&cb->args[4], 0,
- sizeof(cb->args) - 4*sizeof(cb->args[0]));
- if (fz->fz_hash == NULL ||
- hlist_empty(&fz->fz_hash[h]))
- continue;
- if (fn_hash_dump_bucket(skb, cb, tb, fz, &fz->fz_hash[h])<0) {
- cb->args[3] = h;
- return -1;
- }
- }
- cb->args[3] = h;
- return skb->len;
-}
-
-static int fn_hash_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb)
-{
- int m, s_m;
- struct fn_zone *fz;
- struct fn_hash *table = (struct fn_hash*)tb->tb_data;
-
- s_m = cb->args[2];
- read_lock(&fib_hash_lock);
- for (fz = table->fn_zone_list, m=0; fz; fz = fz->fz_next, m++) {
- if (m < s_m) continue;
- if (m > s_m)
- memset(&cb->args[3], 0,
- sizeof(cb->args) - 3*sizeof(cb->args[0]));
- if (fn_hash_dump_zone(skb, cb, tb, fz) < 0) {
- cb->args[2] = m;
- read_unlock(&fib_hash_lock);
- return -1;
- }
- }
- read_unlock(&fib_hash_lock);
- cb->args[2] = m;
- return skb->len;
-}
-
-#ifdef CONFIG_IP_MULTIPLE_TABLES
-struct fib_table * fib_hash_init(u32 id)
-#else
-struct fib_table * __init fib_hash_init(u32 id)
-#endif
-{
- struct fib_table *tb;
-
- if (fn_hash_kmem == NULL)
- fn_hash_kmem = kmem_cache_create("ip_fib_hash",
- sizeof(struct fib_node),
- 0, SLAB_HWCACHE_ALIGN,
- NULL, NULL);
-
- if (fn_alias_kmem == NULL)
- fn_alias_kmem = kmem_cache_create("ip_fib_alias",
- sizeof(struct fib_alias),
- 0, SLAB_HWCACHE_ALIGN,
- NULL, NULL);
-
- tb = kmalloc(sizeof(struct fib_table) + sizeof(struct fn_hash),
- GFP_KERNEL);
- if (tb == NULL)
- return NULL;
-
- tb->tb_id = id;
- tb->tb_lookup = fn_hash_lookup;
- tb->tb_insert = fn_hash_insert;
- tb->tb_delete = fn_hash_delete;
- tb->tb_flush = fn_hash_flush;
- tb->tb_select_default = fn_hash_select_default;
- tb->tb_dump = fn_hash_dump;
- memset(tb->tb_data, 0, sizeof(struct fn_hash));
- return tb;
-}
-
-/* ------------------------------------------------------------------------ */
-#ifdef CONFIG_PROC_FS
-
-struct fib_iter_state {
- struct fn_zone *zone;
- int bucket;
- struct hlist_head *hash_head;
- struct fib_node *fn;
- struct fib_alias *fa;
- loff_t pos;
- unsigned int genid;
- int valid;
-};
-
-static struct fib_alias *fib_get_first(struct seq_file *seq)
-{
- struct fib_iter_state *iter = seq->private;
- struct fn_hash *table = (struct fn_hash *) ip_fib_main_table->tb_data;
-
- iter->bucket = 0;
- iter->hash_head = NULL;
- iter->fn = NULL;
- iter->fa = NULL;
- iter->pos = 0;
- iter->genid = fib_hash_genid;
- iter->valid = 1;
-
- for (iter->zone = table->fn_zone_list; iter->zone;
- iter->zone = iter->zone->fz_next) {
- int maxslot;
-
- if (!iter->zone->fz_nent)
- continue;
-
- iter->hash_head = iter->zone->fz_hash;
- maxslot = iter->zone->fz_divisor;
-
- for (iter->bucket = 0; iter->bucket < maxslot;
- ++iter->bucket, ++iter->hash_head) {
- struct hlist_node *node;
- struct fib_node *fn;
-
- hlist_for_each_entry(fn,node,iter->hash_head,fn_hash) {
- struct fib_alias *fa;
-
- list_for_each_entry(fa,&fn->fn_alias,fa_list) {
- iter->fn = fn;
- iter->fa = fa;
- goto out;
- }
- }
- }
- }
-out:
- return iter->fa;
-}
-
-static struct fib_alias *fib_get_next(struct seq_file *seq)
-{
- struct fib_iter_state *iter = seq->private;
- struct fib_node *fn;
- struct fib_alias *fa;
-
- /* Advance FA, if any. */
- fn = iter->fn;
- fa = iter->fa;
- if (fa) {
- BUG_ON(!fn);
- list_for_each_entry_continue(fa, &fn->fn_alias, fa_list) {
- iter->fa = fa;
- goto out;
- }
- }
-
- fa = iter->fa = NULL;
-
- /* Advance FN. */
- if (fn) {
- struct hlist_node *node = &fn->fn_hash;
- hlist_for_each_entry_continue(fn, node, fn_hash) {
- iter->fn = fn;
-
- list_for_each_entry(fa, &fn->fn_alias, fa_list) {
- iter->fa = fa;
- goto out;
- }
- }
- }
-
- fn = iter->fn = NULL;
-
- /* Advance hash chain. */
- if (!iter->zone)
- goto out;
-
- for (;;) {
- struct hlist_node *node;
- int maxslot;
-
- maxslot = iter->zone->fz_divisor;
-
- while (++iter->bucket < maxslot) {
- iter->hash_head++;
-
- hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) {
- list_for_each_entry(fa, &fn->fn_alias, fa_list) {
- iter->fn = fn;
- iter->fa = fa;
- goto out;
- }
- }
- }
-
- iter->zone = iter->zone->fz_next;
-
- if (!iter->zone)
- goto out;
-
- iter->bucket = 0;
- iter->hash_head = iter->zone->fz_hash;
-
- hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) {
- list_for_each_entry(fa, &fn->fn_alias, fa_list) {
- iter->fn = fn;
- iter->fa = fa;
- goto out;
- }
- }
- }
-out:
- iter->pos++;
- return fa;
-}
-
-static struct fib_alias *fib_get_idx(struct seq_file *seq, loff_t pos)
-{
- struct fib_iter_state *iter = seq->private;
- struct fib_alias *fa;
-
- if (iter->valid && pos >= iter->pos && iter->genid == fib_hash_genid) {
- fa = iter->fa;
- pos -= iter->pos;
- } else
- fa = fib_get_first(seq);
-
- if (fa)
- while (pos && (fa = fib_get_next(seq)))
- --pos;
- return pos ? NULL : fa;
-}
-
-static void *fib_seq_start(struct seq_file *seq, loff_t *pos)
-{
- void *v = NULL;
-
- read_lock(&fib_hash_lock);
- if (ip_fib_main_table)
- v = *pos ? fib_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
- return v;
-}
-
-static void *fib_seq_next(struct seq_file *seq, void *v, loff_t *pos)
-{
- ++*pos;
- return v == SEQ_START_TOKEN ? fib_get_first(seq) : fib_get_next(seq);
-}
-
-static void fib_seq_stop(struct seq_file *seq, void *v)
-{
- read_unlock(&fib_hash_lock);
-}
-
-static unsigned fib_flag_trans(int type, __be32 mask, struct fib_info *fi)
-{
- static const unsigned type2flags[RTN_MAX + 1] = {
- [7] = RTF_REJECT, [8] = RTF_REJECT,
- };
- unsigned flags = type2flags[type];
-
- if (fi && fi->fib_nh->nh_gw)
- flags |= RTF_GATEWAY;
- if (mask == htonl(0xFFFFFFFF))
- flags |= RTF_HOST;
- flags |= RTF_UP;
- return flags;
-}
-
-/*
- * This outputs /proc/net/route.
- *
- * It always works in backward compatibility mode.
- * The format of the file is not supposed to be changed.
- */
-static int fib_seq_show(struct seq_file *seq, void *v)
-{
- struct fib_iter_state *iter;
- char bf[128];
- __be32 prefix, mask;
- unsigned flags;
- struct fib_node *f;
- struct fib_alias *fa;
- struct fib_info *fi;
-
- if (v == SEQ_START_TOKEN) {
- seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
- "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
- "\tWindow\tIRTT");
- goto out;
- }
-
- iter = seq->private;
- f = iter->fn;
- fa = iter->fa;
- fi = fa->fa_info;
- prefix = f->fn_key;
- mask = FZ_MASK(iter->zone);
- flags = fib_flag_trans(fa->fa_type, mask, fi);
- if (fi)
- snprintf(bf, sizeof(bf),
- "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
- fi->fib_dev ? fi->fib_dev->name : "*", prefix,
- fi->fib_nh->nh_gw, flags, 0, 0, fi->fib_priority,
- mask, (fi->fib_advmss ? fi->fib_advmss + 40 : 0),
- fi->fib_window,
- fi->fib_rtt >> 3);
- else
- snprintf(bf, sizeof(bf),
- "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
- prefix, 0, flags, 0, 0, 0, mask, 0, 0, 0);
- seq_printf(seq, "%-127s\n", bf);
-out:
- return 0;
-}
-
-static struct seq_operations fib_seq_ops = {
- .start = fib_seq_start,
- .next = fib_seq_next,
- .stop = fib_seq_stop,
- .show = fib_seq_show,
-};
-
-static int fib_seq_open(struct inode *inode, struct file *file)
-{
- struct seq_file *seq;
- int rc = -ENOMEM;
- struct fib_iter_state *s = kzalloc(sizeof(*s), GFP_KERNEL);
-
- if (!s)
- goto out;
-
- rc = seq_open(file, &fib_seq_ops);
- if (rc)
- goto out_kfree;
-
- seq = file->private_data;
- seq->private = s;
-out:
- return rc;
-out_kfree:
- kfree(s);
- goto out;
-}
-
-static struct file_operations fib_seq_fops = {
- .owner = THIS_MODULE,
- .open = fib_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release_private,
-};
-
-int __init fib_proc_init(void)
-{
- if (!proc_net_fops_create("route", S_IRUGO, &fib_seq_fops))
- return -ENOMEM;
- return 0;
-}
-
-void __init fib_proc_exit(void)
-{
- proc_net_remove("route");
-}
-#endif /* CONFIG_PROC_FS */
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index 0e8b70bad4e1..f9b9e26c32c1 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -1,40 +1,63 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _FIB_LOOKUP_H
#define _FIB_LOOKUP_H
#include <linux/types.h>
#include <linux/list.h>
+#include <net/inet_dscp.h>
#include <net/ip_fib.h>
+#include <net/nexthop.h>
struct fib_alias {
- struct list_head fa_list;
- struct rcu_head rcu;
+ struct hlist_node fa_list;
struct fib_info *fa_info;
- u8 fa_tos;
+ dscp_t fa_dscp;
u8 fa_type;
- u8 fa_scope;
u8 fa_state;
+ u8 fa_slen;
+ u32 tb_id;
+ s16 fa_default;
+ u8 offload;
+ u8 trap;
+ u8 offload_failed;
+ struct rcu_head rcu;
};
#define FA_S_ACCESSED 0x01
+/* Don't write on fa_state unless needed, to keep it shared on all cpus */
+static inline void fib_alias_accessed(struct fib_alias *fa)
+{
+ if (!(fa->fa_state & FA_S_ACCESSED))
+ fa->fa_state |= FA_S_ACCESSED;
+}
+
/* Exported by fib_semantics.c */
-extern int fib_semantic_match(struct list_head *head,
- const struct flowi *flp,
- struct fib_result *res, __be32 zone, __be32 mask,
- int prefixlen);
-extern void fib_release_info(struct fib_info *);
-extern struct fib_info *fib_create_info(struct fib_config *cfg);
-extern int fib_nh_match(struct fib_config *cfg, struct fib_info *fi);
-extern int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
- u32 tb_id, u8 type, u8 scope, __be32 dst,
- int dst_len, u8 tos, struct fib_info *fi,
- unsigned int);
-extern void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
- int dst_len, u32 tb_id, struct nl_info *info);
-extern struct fib_alias *fib_find_alias(struct list_head *fah,
- u8 tos, u32 prio);
-extern int fib_detect_death(struct fib_info *fi, int order,
- struct fib_info **last_resort,
- int *last_idx, int *dflt);
+void fib_release_info(struct fib_info *);
+struct fib_info *fib_create_info(struct fib_config *cfg,
+ struct netlink_ext_ack *extack);
+int fib_nh_match(struct net *net, struct fib_config *cfg, struct fib_info *fi,
+ struct netlink_ext_ack *extack);
+bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi);
+int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
+ const struct fib_rt_info *fri, unsigned int flags);
+void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, int dst_len,
+ u32 tb_id, const struct nl_info *info, unsigned int nlm_flags);
+size_t fib_nlmsg_size(struct fib_info *fi);
+
+static inline void fib_result_assign(struct fib_result *res,
+ struct fib_info *fi)
+{
+ /* we used to play games with refcounts, but we now use RCU */
+ res->fi = fi;
+ res->nhc = fib_info_nhc(fi, 0);
+}
+
+struct fib_prop {
+ int error;
+ u8 scope;
+};
+
+extern const struct fib_prop fib_props[RTN_MAX + 1];
#endif /* _FIB_LOOKUP_H */
diff --git a/net/ipv4/fib_notifier.c b/net/ipv4/fib_notifier.c
new file mode 100644
index 000000000000..b1551c26554b
--- /dev/null
+++ b/net/ipv4/fib_notifier.c
@@ -0,0 +1,72 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/rtnetlink.h>
+#include <linux/notifier.h>
+#include <linux/socket.h>
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <net/net_namespace.h>
+#include <net/fib_notifier.h>
+#include <net/ip_fib.h>
+
+int call_fib4_notifier(struct notifier_block *nb,
+ enum fib_event_type event_type,
+ struct fib_notifier_info *info)
+{
+ info->family = AF_INET;
+ return call_fib_notifier(nb, event_type, info);
+}
+
+int call_fib4_notifiers(struct net *net, enum fib_event_type event_type,
+ struct fib_notifier_info *info)
+{
+ ASSERT_RTNL();
+
+ info->family = AF_INET;
+ /* Paired with READ_ONCE() in fib4_seq_read() */
+ WRITE_ONCE(net->ipv4.fib_seq, net->ipv4.fib_seq + 1);
+ return call_fib_notifiers(net, event_type, info);
+}
+
+static unsigned int fib4_seq_read(const struct net *net)
+{
+ /* Paired with WRITE_ONCE() in call_fib4_notifiers() */
+ return READ_ONCE(net->ipv4.fib_seq) + fib4_rules_seq_read(net);
+}
+
+static int fib4_dump(struct net *net, struct notifier_block *nb,
+ struct netlink_ext_ack *extack)
+{
+ int err;
+
+ err = fib4_rules_dump(net, nb, extack);
+ if (err)
+ return err;
+
+ return fib_notify(net, nb, extack);
+}
+
+static const struct fib_notifier_ops fib4_notifier_ops_template = {
+ .family = AF_INET,
+ .fib_seq_read = fib4_seq_read,
+ .fib_dump = fib4_dump,
+ .owner = THIS_MODULE,
+};
+
+int __net_init fib4_notifier_init(struct net *net)
+{
+ struct fib_notifier_ops *ops;
+
+ net->ipv4.fib_seq = 0;
+
+ ops = fib_notifier_ops_register(&fib4_notifier_ops_template, net);
+ if (IS_ERR(ops))
+ return PTR_ERR(ops);
+ net->ipv4.notifier_ops = ops;
+
+ return 0;
+}
+
+void __net_exit fib4_notifier_exit(struct net *net)
+{
+ fib_notifier_ops_unregister(net->ipv4.notifier_ops);
+}
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index b837c33e0404..51f0193092f0 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -6,15 +7,10 @@
* IPv4 Forwarding Information Base: policy rules.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
- * Thomas Graf <tgraf@suug.ch>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
+ * Thomas Graf <tgraf@suug.ch>
*
* Fixes:
- * Rani Assaf : local_rule cannot be deleted
+ * Rani Assaf : local_rule cannot be deleted
* Marc Boucher : routing by fwmark
*/
@@ -26,173 +22,296 @@
#include <linux/init.h>
#include <linux/list.h>
#include <linux/rcupdate.h>
+#include <linux/export.h>
+#include <net/flow.h>
+#include <net/inet_dscp.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/ip_fib.h>
+#include <net/nexthop.h>
#include <net/fib_rules.h>
+#include <linux/indirect_call_wrapper.h>
-static struct fib_rules_ops fib4_rules_ops;
-
-struct fib4_rule
-{
+struct fib4_rule {
struct fib_rule common;
u8 dst_len;
u8 src_len;
- u8 tos;
+ dscp_t dscp;
+ dscp_t dscp_mask;
+ u8 dscp_full:1; /* DSCP or TOS selector */
__be32 src;
__be32 srcmask;
__be32 dst;
__be32 dstmask;
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
u32 tclassid;
#endif
};
-static struct fib4_rule default_rule = {
- .common = {
- .refcnt = ATOMIC_INIT(2),
- .pref = 0x7FFF,
- .table = RT_TABLE_DEFAULT,
- .action = FR_ACT_TO_TBL,
- },
-};
+static bool fib4_rule_matchall(const struct fib_rule *rule)
+{
+ struct fib4_rule *r = container_of(rule, struct fib4_rule, common);
-static struct fib4_rule main_rule = {
- .common = {
- .refcnt = ATOMIC_INIT(2),
- .pref = 0x7FFE,
- .table = RT_TABLE_MAIN,
- .action = FR_ACT_TO_TBL,
- },
-};
+ if (r->dst_len || r->src_len || r->dscp)
+ return false;
+ return fib_rule_matchall(rule);
+}
-static struct fib4_rule local_rule = {
- .common = {
- .refcnt = ATOMIC_INIT(2),
- .table = RT_TABLE_LOCAL,
- .action = FR_ACT_TO_TBL,
- .flags = FIB_RULE_PERMANENT,
- },
-};
+bool fib4_rule_default(const struct fib_rule *rule)
+{
+ if (!fib4_rule_matchall(rule) || rule->action != FR_ACT_TO_TBL ||
+ rule->l3mdev)
+ return false;
+ if (rule->table != RT_TABLE_LOCAL && rule->table != RT_TABLE_MAIN &&
+ rule->table != RT_TABLE_DEFAULT)
+ return false;
+ return true;
+}
+EXPORT_SYMBOL_GPL(fib4_rule_default);
-static LIST_HEAD(fib4_rules);
+int fib4_rules_dump(struct net *net, struct notifier_block *nb,
+ struct netlink_ext_ack *extack)
+{
+ return fib_rules_dump(net, nb, AF_INET, extack);
+}
-#ifdef CONFIG_NET_CLS_ROUTE
-u32 fib_rules_tclass(struct fib_result *res)
+unsigned int fib4_rules_seq_read(const struct net *net)
{
- return res->r ? ((struct fib4_rule *) res->r)->tclassid : 0;
+ return fib_rules_seq_read(net, AF_INET);
}
-#endif
-int fib_lookup(struct flowi *flp, struct fib_result *res)
+int __fib_lookup(struct net *net, struct flowi4 *flp,
+ struct fib_result *res, unsigned int flags)
{
struct fib_lookup_arg arg = {
.result = res,
+ .flags = flags,
};
int err;
- err = fib_rules_lookup(&fib4_rules_ops, flp, 0, &arg);
- res->r = arg.rule;
+ /* update flow if oif or iif point to device enslaved to l3mdev */
+ l3mdev_update_flow(net, flowi4_to_flowi(flp));
+
+ err = fib_rules_lookup(net->ipv4.rules_ops, flowi4_to_flowi(flp), 0, &arg);
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ if (arg.rule)
+ res->tclassid = ((struct fib4_rule *)arg.rule)->tclassid;
+ else
+ res->tclassid = 0;
+#endif
+
+ if (err == -ESRCH)
+ err = -ENETUNREACH;
return err;
}
+EXPORT_SYMBOL_GPL(__fib_lookup);
-static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp,
- int flags, struct fib_lookup_arg *arg)
+INDIRECT_CALLABLE_SCOPE int fib4_rule_action(struct fib_rule *rule,
+ struct flowi *flp, int flags,
+ struct fib_lookup_arg *arg)
{
int err = -EAGAIN;
struct fib_table *tbl;
+ u32 tb_id;
switch (rule->action) {
case FR_ACT_TO_TBL:
break;
case FR_ACT_UNREACHABLE:
- err = -ENETUNREACH;
- goto errout;
+ return -ENETUNREACH;
case FR_ACT_PROHIBIT:
- err = -EACCES;
- goto errout;
+ return -EACCES;
case FR_ACT_BLACKHOLE:
default:
- err = -EINVAL;
- goto errout;
+ return -EINVAL;
}
- if ((tbl = fib_get_table(rule->table)) == NULL)
- goto errout;
+ rcu_read_lock();
- err = tbl->tb_lookup(tbl, flp, (struct fib_result *) arg->result);
- if (err > 0)
- err = -EAGAIN;
-errout:
+ tb_id = fib_rule_get_table(rule, arg);
+ tbl = fib_get_table(rule->fr_net, tb_id);
+ if (tbl)
+ err = fib_table_lookup(tbl, &flp->u.ip4,
+ (struct fib_result *)arg->result,
+ arg->flags);
+
+ rcu_read_unlock();
return err;
}
-
-void fib_select_default(const struct flowi *flp, struct fib_result *res)
+INDIRECT_CALLABLE_SCOPE bool fib4_rule_suppress(struct fib_rule *rule,
+ int flags,
+ struct fib_lookup_arg *arg)
{
- if (res->r && res->r->action == FR_ACT_TO_TBL &&
- FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) {
- struct fib_table *tb;
- if ((tb = fib_get_table(res->r->table)) != NULL)
- tb->tb_select_default(tb, flp, res);
+ struct fib_result *result = arg->result;
+ struct net_device *dev = NULL;
+
+ if (result->fi) {
+ struct fib_nh_common *nhc = fib_info_nhc(result->fi, 0);
+
+ dev = nhc->nhc_dev;
}
+
+ /* do not accept result if the route does
+ * not meet the required prefix length
+ */
+ if (result->prefixlen <= rule->suppress_prefixlen)
+ goto suppress_route;
+
+ /* do not accept result if the route uses a device
+ * belonging to a forbidden interface group
+ */
+ if (rule->suppress_ifgroup != -1 && dev && dev->group == rule->suppress_ifgroup)
+ goto suppress_route;
+
+ return false;
+
+suppress_route:
+ if (!(arg->flags & FIB_LOOKUP_NOREF))
+ fib_info_put(result->fi);
+ return true;
}
-static int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
+INDIRECT_CALLABLE_SCOPE int fib4_rule_match(struct fib_rule *rule,
+ struct flowi *fl, int flags)
{
struct fib4_rule *r = (struct fib4_rule *) rule;
- __be32 daddr = fl->fl4_dst;
- __be32 saddr = fl->fl4_src;
+ struct flowi4 *fl4 = &fl->u.ip4;
+ __be32 daddr = fl4->daddr;
+ __be32 saddr = fl4->saddr;
if (((saddr ^ r->src) & r->srcmask) ||
((daddr ^ r->dst) & r->dstmask))
return 0;
- if (r->tos && (r->tos != fl->fl4_tos))
+ /* When DSCP selector is used we need to match on the entire DSCP field
+ * in the flow information structure. When TOS selector is used we need
+ * to mask the upper three DSCP bits prior to matching to maintain
+ * legacy behavior.
+ */
+ if (r->dscp_full && (r->dscp ^ fl4->flowi4_dscp) & r->dscp_mask)
+ return 0;
+ else if (!r->dscp_full && r->dscp &&
+ !fib_dscp_masked_match(r->dscp, fl4))
+ return 0;
+
+ if (rule->ip_proto && (rule->ip_proto != fl4->flowi4_proto))
+ return 0;
+
+ if (!fib_rule_port_match(&rule->sport_range, rule->sport_mask,
+ fl4->fl4_sport))
+ return 0;
+
+ if (!fib_rule_port_match(&rule->dport_range, rule->dport_mask,
+ fl4->fl4_dport))
return 0;
return 1;
}
-static struct fib_table *fib_empty_table(void)
+static struct fib_table *fib_empty_table(struct net *net)
{
- u32 id;
+ u32 id = 1;
- for (id = 1; id <= RT_TABLE_MAX; id++)
- if (fib_get_table(id) == NULL)
- return fib_new_table(id);
+ while (1) {
+ if (!fib_get_table(net, id))
+ return fib_new_table(net, id);
+
+ if (id++ == RT_TABLE_MAX)
+ break;
+ }
return NULL;
}
-static struct nla_policy fib4_rule_policy[FRA_MAX+1] __read_mostly = {
- FRA_GENERIC_POLICY,
- [FRA_SRC] = { .type = NLA_U32 },
- [FRA_DST] = { .type = NLA_U32 },
- [FRA_FLOW] = { .type = NLA_U32 },
-};
+static int fib4_nl2rule_dscp(const struct nlattr *nla, struct fib4_rule *rule4,
+ struct netlink_ext_ack *extack)
+{
+ if (rule4->dscp) {
+ NL_SET_ERR_MSG(extack, "Cannot specify both TOS and DSCP");
+ return -EINVAL;
+ }
+
+ rule4->dscp = inet_dsfield_to_dscp(nla_get_u8(nla) << 2);
+ rule4->dscp_mask = inet_dsfield_to_dscp(INET_DSCP_MASK);
+ rule4->dscp_full = true;
+
+ return 0;
+}
+
+static int fib4_nl2rule_dscp_mask(const struct nlattr *nla,
+ struct fib4_rule *rule4,
+ struct netlink_ext_ack *extack)
+{
+ dscp_t dscp_mask;
+
+ if (!rule4->dscp_full) {
+ NL_SET_ERR_MSG_ATTR(extack, nla,
+ "Cannot specify DSCP mask without DSCP value");
+ return -EINVAL;
+ }
+
+ dscp_mask = inet_dsfield_to_dscp(nla_get_u8(nla) << 2);
+ if (rule4->dscp & ~dscp_mask) {
+ NL_SET_ERR_MSG_ATTR(extack, nla, "Invalid DSCP mask");
+ return -EINVAL;
+ }
+
+ rule4->dscp_mask = dscp_mask;
+
+ return 0;
+}
static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
- struct nlmsghdr *nlh, struct fib_rule_hdr *frh,
- struct nlattr **tb)
+ struct fib_rule_hdr *frh,
+ struct nlattr **tb,
+ struct netlink_ext_ack *extack)
{
+ struct fib4_rule *rule4 = (struct fib4_rule *)rule;
+ struct net *net = rule->fr_net;
int err = -EINVAL;
- struct fib4_rule *rule4 = (struct fib4_rule *) rule;
- if (frh->src_len > 32 || frh->dst_len > 32 ||
- (frh->tos & ~IPTOS_TOS_MASK))
+ if (tb[FRA_FLOWLABEL] || tb[FRA_FLOWLABEL_MASK]) {
+ NL_SET_ERR_MSG(extack,
+ "Flow label cannot be specified for IPv4 FIB rules");
goto errout;
+ }
- if (rule->table == RT_TABLE_UNSPEC) {
+ if (!inet_validate_dscp(frh->tos)) {
+ NL_SET_ERR_MSG(extack,
+ "Invalid dsfield (tos): ECN bits must be 0");
+ goto errout;
+ }
+ /* IPv4 currently doesn't handle high order DSCP bits correctly */
+ if (frh->tos & ~IPTOS_TOS_MASK) {
+ NL_SET_ERR_MSG(extack, "Invalid tos");
+ goto errout;
+ }
+ rule4->dscp = inet_dsfield_to_dscp(frh->tos);
+
+ if (tb[FRA_DSCP] &&
+ fib4_nl2rule_dscp(tb[FRA_DSCP], rule4, extack) < 0)
+ goto errout;
+
+ if (tb[FRA_DSCP_MASK] &&
+ fib4_nl2rule_dscp_mask(tb[FRA_DSCP_MASK], rule4, extack) < 0)
+ goto errout;
+
+ /* split local/main if they are not already split */
+ err = fib_unmerge(net);
+ if (err)
+ goto errout;
+
+ if (rule->table == RT_TABLE_UNSPEC && !rule->l3mdev) {
if (rule->action == FR_ACT_TO_TBL) {
struct fib_table *table;
- table = fib_empty_table();
- if (table == NULL) {
+ table = fib_empty_table(net);
+ if (!table) {
err = -ENOBUFS;
goto errout;
}
@@ -201,28 +320,58 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
}
}
- if (tb[FRA_SRC])
- rule4->src = nla_get_be32(tb[FRA_SRC]);
+ if (frh->src_len)
+ rule4->src = nla_get_in_addr(tb[FRA_SRC]);
- if (tb[FRA_DST])
- rule4->dst = nla_get_be32(tb[FRA_DST]);
+ if (frh->dst_len)
+ rule4->dst = nla_get_in_addr(tb[FRA_DST]);
-#ifdef CONFIG_NET_CLS_ROUTE
- if (tb[FRA_FLOW])
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ if (tb[FRA_FLOW]) {
rule4->tclassid = nla_get_u32(tb[FRA_FLOW]);
+ if (rule4->tclassid)
+ atomic_inc(&net->ipv4.fib_num_tclassid_users);
+ }
#endif
+ if (fib_rule_requires_fldissect(rule))
+ net->ipv4.fib_rules_require_fldissect++;
+
rule4->src_len = frh->src_len;
rule4->srcmask = inet_make_mask(rule4->src_len);
rule4->dst_len = frh->dst_len;
rule4->dstmask = inet_make_mask(rule4->dst_len);
- rule4->tos = frh->tos;
+
+ net->ipv4.fib_has_custom_rules = true;
err = 0;
errout:
return err;
}
+static int fib4_rule_delete(struct fib_rule *rule)
+{
+ struct net *net = rule->fr_net;
+ int err;
+
+ /* split local/main if they are not already split */
+ err = fib_unmerge(net);
+ if (err)
+ goto errout;
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ if (((struct fib4_rule *)rule)->tclassid)
+ atomic_dec(&net->ipv4.fib_num_tclassid_users);
+#endif
+ net->ipv4.fib_has_custom_rules = true;
+
+ if (net->ipv4.fib_rules_require_fldissect &&
+ fib_rule_requires_fldissect(rule))
+ net->ipv4.fib_rules_require_fldissect--;
+errout:
+ return err;
+}
+
static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
struct nlattr **tb)
{
@@ -234,42 +383,69 @@ static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
if (frh->dst_len && (rule4->dst_len != frh->dst_len))
return 0;
- if (frh->tos && (rule4->tos != frh->tos))
+ if (frh->tos &&
+ (rule4->dscp_full ||
+ inet_dscp_to_dsfield(rule4->dscp) != frh->tos))
return 0;
-#ifdef CONFIG_NET_CLS_ROUTE
+ if (tb[FRA_DSCP]) {
+ dscp_t dscp;
+
+ dscp = inet_dsfield_to_dscp(nla_get_u8(tb[FRA_DSCP]) << 2);
+ if (!rule4->dscp_full || rule4->dscp != dscp)
+ return 0;
+ }
+
+ if (tb[FRA_DSCP_MASK]) {
+ dscp_t dscp_mask;
+
+ dscp_mask = inet_dsfield_to_dscp(nla_get_u8(tb[FRA_DSCP_MASK]) << 2);
+ if (!rule4->dscp_full || rule4->dscp_mask != dscp_mask)
+ return 0;
+ }
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
if (tb[FRA_FLOW] && (rule4->tclassid != nla_get_u32(tb[FRA_FLOW])))
return 0;
#endif
- if (tb[FRA_SRC] && (rule4->src != nla_get_be32(tb[FRA_SRC])))
+ if (frh->src_len && (rule4->src != nla_get_in_addr(tb[FRA_SRC])))
return 0;
- if (tb[FRA_DST] && (rule4->dst != nla_get_be32(tb[FRA_DST])))
+ if (frh->dst_len && (rule4->dst != nla_get_in_addr(tb[FRA_DST])))
return 0;
return 1;
}
static int fib4_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
- struct nlmsghdr *nlh, struct fib_rule_hdr *frh)
+ struct fib_rule_hdr *frh)
{
struct fib4_rule *rule4 = (struct fib4_rule *) rule;
- frh->family = AF_INET;
frh->dst_len = rule4->dst_len;
frh->src_len = rule4->src_len;
- frh->tos = rule4->tos;
- if (rule4->dst_len)
- NLA_PUT_BE32(skb, FRA_DST, rule4->dst);
-
- if (rule4->src_len)
- NLA_PUT_BE32(skb, FRA_SRC, rule4->src);
+ if (rule4->dscp_full) {
+ frh->tos = 0;
+ if (nla_put_u8(skb, FRA_DSCP,
+ inet_dscp_to_dsfield(rule4->dscp) >> 2) ||
+ nla_put_u8(skb, FRA_DSCP_MASK,
+ inet_dscp_to_dsfield(rule4->dscp_mask) >> 2))
+ goto nla_put_failure;
+ } else {
+ frh->tos = inet_dscp_to_dsfield(rule4->dscp);
+ }
-#ifdef CONFIG_NET_CLS_ROUTE
- if (rule4->tclassid)
- NLA_PUT_U32(skb, FRA_FLOW, rule4->tclassid);
+ if ((rule4->dst_len &&
+ nla_put_in_addr(skb, FRA_DST, rule4->dst)) ||
+ (rule4->src_len &&
+ nla_put_in_addr(skb, FRA_SRC, rule4->src)))
+ goto nla_put_failure;
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ if (rule4->tclassid &&
+ nla_put_u32(skb, FRA_FLOW, rule4->tclassid))
+ goto nla_put_failure;
#endif
return 0;
@@ -277,56 +453,77 @@ nla_put_failure:
return -ENOBUFS;
}
-int fib4_rules_dump(struct sk_buff *skb, struct netlink_callback *cb)
-{
- return fib_rules_dump(skb, cb, AF_INET);
-}
-
-static u32 fib4_rule_default_pref(void)
-{
- struct list_head *pos;
- struct fib_rule *rule;
-
- if (!list_empty(&fib4_rules)) {
- pos = fib4_rules.next;
- if (pos->next != &fib4_rules) {
- rule = list_entry(pos->next, struct fib_rule, list);
- if (rule->pref)
- return rule->pref - 1;
- }
- }
-
- return 0;
-}
-
static size_t fib4_rule_nlmsg_payload(struct fib_rule *rule)
{
return nla_total_size(4) /* dst */
+ nla_total_size(4) /* src */
- + nla_total_size(4); /* flow */
+ + nla_total_size(4) /* flow */
+ + nla_total_size(1) /* dscp */
+ + nla_total_size(1); /* dscp mask */
}
-static struct fib_rules_ops fib4_rules_ops = {
+static void fib4_rule_flush_cache(struct fib_rules_ops *ops)
+{
+ rt_cache_flush(ops->fro_net);
+}
+
+static const struct fib_rules_ops __net_initconst fib4_rules_ops_template = {
.family = AF_INET,
.rule_size = sizeof(struct fib4_rule),
+ .addr_size = sizeof(u32),
.action = fib4_rule_action,
+ .suppress = fib4_rule_suppress,
.match = fib4_rule_match,
.configure = fib4_rule_configure,
+ .delete = fib4_rule_delete,
.compare = fib4_rule_compare,
.fill = fib4_rule_fill,
- .default_pref = fib4_rule_default_pref,
.nlmsg_payload = fib4_rule_nlmsg_payload,
+ .flush_cache = fib4_rule_flush_cache,
.nlgroup = RTNLGRP_IPV4_RULE,
- .policy = fib4_rule_policy,
- .rules_list = &fib4_rules,
.owner = THIS_MODULE,
};
-void __init fib4_rules_init(void)
+static int fib_default_rules_init(struct fib_rules_ops *ops)
+{
+ int err;
+
+ err = fib_default_rule_add(ops, 0, RT_TABLE_LOCAL);
+ if (err < 0)
+ return err;
+ err = fib_default_rule_add(ops, 0x7FFE, RT_TABLE_MAIN);
+ if (err < 0)
+ return err;
+ err = fib_default_rule_add(ops, 0x7FFF, RT_TABLE_DEFAULT);
+ if (err < 0)
+ return err;
+ return 0;
+}
+
+int __net_init fib4_rules_init(struct net *net)
{
- list_add_tail(&local_rule.common.list, &fib4_rules);
- list_add_tail(&main_rule.common.list, &fib4_rules);
- list_add_tail(&default_rule.common.list, &fib4_rules);
+ int err;
+ struct fib_rules_ops *ops;
+
+ ops = fib_rules_register(&fib4_rules_ops_template, net);
+ if (IS_ERR(ops))
+ return PTR_ERR(ops);
+
+ err = fib_default_rules_init(ops);
+ if (err < 0)
+ goto fail;
+ net->ipv4.rules_ops = ops;
+ net->ipv4.fib_has_custom_rules = false;
+ net->ipv4.fib_rules_require_fldissect = 0;
+ return 0;
- fib_rules_register(&fib4_rules_ops);
+fail:
+ /* also cleans all rules already added */
+ fib_rules_unregister(ops);
+ return err;
+}
+
+void __net_exit fib4_rules_exit(struct net *net)
+{
+ fib_rules_unregister(net->ipv4.rules_ops);
}
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index e63b8a98fb4d..a5f3c8459758 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -5,18 +6,10 @@
*
* IPv4 Forwarding Information Base: semantics.
*
- * Version: $Id: fib_semantics.c,v 1.19 2002/01/12 07:54:56 davem Exp $
- *
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
-#include <asm/uaccess.h>
-#include <asm/system.h>
+#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
@@ -34,340 +27,659 @@
#include <linux/proc_fs.h>
#include <linux/skbuff.h>
#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/netlink.h>
+#include <linux/hash.h>
+#include <linux/nospec.h>
#include <net/arp.h>
+#include <net/inet_dscp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <net/ip_fib.h>
-#include <net/ip_mp_alg.h>
-#include <net/netlink.h>
+#include <net/ip6_fib.h>
#include <net/nexthop.h>
+#include <net/netlink.h>
+#include <net/rtnh.h>
+#include <net/lwtunnel.h>
+#include <net/fib_notifier.h>
+#include <net/addrconf.h>
#include "fib_lookup.h"
-#define FSprintk(a...)
-
-static DEFINE_SPINLOCK(fib_info_lock);
-static struct hlist_head *fib_info_hash;
-static struct hlist_head *fib_info_laddrhash;
-static unsigned int fib_hash_size;
-static unsigned int fib_info_cnt;
-
-#define DEVINDEX_HASHBITS 8
-#define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
-static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
-
+/* for_nexthops and change_nexthops only used when nexthop object
+ * is not set in a fib_info. The logic within can reference fib_nh.
+ */
#ifdef CONFIG_IP_ROUTE_MULTIPATH
-static DEFINE_SPINLOCK(fib_multipath_lock);
-
-#define for_nexthops(fi) { int nhsel; const struct fib_nh * nh; \
-for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++)
+#define for_nexthops(fi) { \
+ int nhsel; const struct fib_nh *nh; \
+ for (nhsel = 0, nh = (fi)->fib_nh; \
+ nhsel < fib_info_num_path((fi)); \
+ nh++, nhsel++)
-#define change_nexthops(fi) { int nhsel; struct fib_nh * nh; \
-for (nhsel=0, nh = (struct fib_nh*)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nh++, nhsel++)
+#define change_nexthops(fi) { \
+ int nhsel; struct fib_nh *nexthop_nh; \
+ for (nhsel = 0, nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \
+ nhsel < fib_info_num_path((fi)); \
+ nexthop_nh++, nhsel++)
#else /* CONFIG_IP_ROUTE_MULTIPATH */
/* Hope, that gcc will optimize it to get rid of dummy loop */
-#define for_nexthops(fi) { int nhsel=0; const struct fib_nh * nh = (fi)->fib_nh; \
-for (nhsel=0; nhsel < 1; nhsel++)
+#define for_nexthops(fi) { \
+ int nhsel; const struct fib_nh *nh = (fi)->fib_nh; \
+ for (nhsel = 0; nhsel < 1; nhsel++)
-#define change_nexthops(fi) { int nhsel=0; struct fib_nh * nh = (struct fib_nh*)((fi)->fib_nh); \
-for (nhsel=0; nhsel < 1; nhsel++)
+#define change_nexthops(fi) { \
+ int nhsel; \
+ struct fib_nh *nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \
+ for (nhsel = 0; nhsel < 1; nhsel++)
#endif /* CONFIG_IP_ROUTE_MULTIPATH */
#define endfor_nexthops(fi) }
-static const struct
-{
- int error;
- u8 scope;
-} fib_props[RTA_MAX + 1] = {
- {
+const struct fib_prop fib_props[RTN_MAX + 1] = {
+ [RTN_UNSPEC] = {
.error = 0,
.scope = RT_SCOPE_NOWHERE,
- }, /* RTN_UNSPEC */
- {
+ },
+ [RTN_UNICAST] = {
.error = 0,
.scope = RT_SCOPE_UNIVERSE,
- }, /* RTN_UNICAST */
- {
+ },
+ [RTN_LOCAL] = {
.error = 0,
.scope = RT_SCOPE_HOST,
- }, /* RTN_LOCAL */
- {
+ },
+ [RTN_BROADCAST] = {
.error = 0,
.scope = RT_SCOPE_LINK,
- }, /* RTN_BROADCAST */
- {
+ },
+ [RTN_ANYCAST] = {
.error = 0,
.scope = RT_SCOPE_LINK,
- }, /* RTN_ANYCAST */
- {
+ },
+ [RTN_MULTICAST] = {
.error = 0,
.scope = RT_SCOPE_UNIVERSE,
- }, /* RTN_MULTICAST */
- {
+ },
+ [RTN_BLACKHOLE] = {
.error = -EINVAL,
.scope = RT_SCOPE_UNIVERSE,
- }, /* RTN_BLACKHOLE */
- {
+ },
+ [RTN_UNREACHABLE] = {
.error = -EHOSTUNREACH,
.scope = RT_SCOPE_UNIVERSE,
- }, /* RTN_UNREACHABLE */
- {
+ },
+ [RTN_PROHIBIT] = {
.error = -EACCES,
.scope = RT_SCOPE_UNIVERSE,
- }, /* RTN_PROHIBIT */
- {
+ },
+ [RTN_THROW] = {
.error = -EAGAIN,
.scope = RT_SCOPE_UNIVERSE,
- }, /* RTN_THROW */
- {
+ },
+ [RTN_NAT] = {
.error = -EINVAL,
.scope = RT_SCOPE_NOWHERE,
- }, /* RTN_NAT */
- {
+ },
+ [RTN_XRESOLVE] = {
.error = -EINVAL,
.scope = RT_SCOPE_NOWHERE,
- }, /* RTN_XRESOLVE */
+ },
};
+static void rt_fibinfo_free(struct rtable __rcu **rtp)
+{
+ struct rtable *rt = rcu_dereference_protected(*rtp, 1);
+
+ if (!rt)
+ return;
+
+ /* Not even needed : RCU_INIT_POINTER(*rtp, NULL);
+ * because we waited an RCU grace period before calling
+ * free_fib_info_rcu()
+ */
+
+ dst_dev_put(&rt->dst);
+ dst_release_immediate(&rt->dst);
+}
+
+static void free_nh_exceptions(struct fib_nh_common *nhc)
+{
+ struct fnhe_hash_bucket *hash;
+ int i;
+
+ hash = rcu_dereference_protected(nhc->nhc_exceptions, 1);
+ if (!hash)
+ return;
+ for (i = 0; i < FNHE_HASH_SIZE; i++) {
+ struct fib_nh_exception *fnhe;
+
+ fnhe = rcu_dereference_protected(hash[i].chain, 1);
+ while (fnhe) {
+ struct fib_nh_exception *next;
+
+ next = rcu_dereference_protected(fnhe->fnhe_next, 1);
+
+ rt_fibinfo_free(&fnhe->fnhe_rth_input);
+ rt_fibinfo_free(&fnhe->fnhe_rth_output);
+
+ kfree(fnhe);
+
+ fnhe = next;
+ }
+ }
+ kfree(hash);
+}
+
+static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp)
+{
+ int cpu;
+
+ if (!rtp)
+ return;
+
+ for_each_possible_cpu(cpu) {
+ struct rtable *rt;
+
+ rt = rcu_dereference_protected(*per_cpu_ptr(rtp, cpu), 1);
+ if (rt) {
+ dst_dev_put(&rt->dst);
+ dst_release_immediate(&rt->dst);
+ }
+ }
+ free_percpu(rtp);
+}
+
+void fib_nh_common_release(struct fib_nh_common *nhc)
+{
+ netdev_put(nhc->nhc_dev, &nhc->nhc_dev_tracker);
+ lwtstate_put(nhc->nhc_lwtstate);
+ rt_fibinfo_free_cpus(nhc->nhc_pcpu_rth_output);
+ rt_fibinfo_free(&nhc->nhc_rth_input);
+ free_nh_exceptions(nhc);
+}
+EXPORT_SYMBOL_GPL(fib_nh_common_release);
+
+void fib_nh_release(struct net *net, struct fib_nh *fib_nh)
+{
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ if (fib_nh->nh_tclassid)
+ atomic_dec(&net->ipv4.fib_num_tclassid_users);
+#endif
+ fib_nh_common_release(&fib_nh->nh_common);
+}
/* Release a nexthop info record */
+static void free_fib_info_rcu(struct rcu_head *head)
+{
+ struct fib_info *fi = container_of(head, struct fib_info, rcu);
+
+ if (fi->nh) {
+ nexthop_put(fi->nh);
+ } else {
+ change_nexthops(fi) {
+ fib_nh_release(fi->fib_net, nexthop_nh);
+ } endfor_nexthops(fi);
+ }
+
+ ip_fib_metrics_put(fi->fib_metrics);
+
+ kfree(fi);
+}
void free_fib_info(struct fib_info *fi)
{
if (fi->fib_dead == 0) {
- printk("Freeing alive fib_info %p\n", fi);
+ pr_warn("Freeing alive fib_info %p\n", fi);
return;
}
- change_nexthops(fi) {
- if (nh->nh_dev)
- dev_put(nh->nh_dev);
- nh->nh_dev = NULL;
- } endfor_nexthops(fi);
- fib_info_cnt--;
- kfree(fi);
+
+ call_rcu_hurry(&fi->rcu, free_fib_info_rcu);
}
+EXPORT_SYMBOL_GPL(free_fib_info);
void fib_release_info(struct fib_info *fi)
{
- spin_lock_bh(&fib_info_lock);
- if (fi && --fi->fib_treeref == 0) {
+ ASSERT_RTNL();
+ if (fi && refcount_dec_and_test(&fi->fib_treeref)) {
hlist_del(&fi->fib_hash);
+ fi->fib_net->ipv4.fib_info_cnt--;
+
if (fi->fib_prefsrc)
hlist_del(&fi->fib_lhash);
- change_nexthops(fi) {
- if (!nh->nh_dev)
- continue;
- hlist_del(&nh->nh_hash);
- } endfor_nexthops(fi)
- fi->fib_dead = 1;
+ if (fi->nh) {
+ list_del(&fi->nh_list);
+ } else {
+ change_nexthops(fi) {
+ if (!nexthop_nh->fib_nh_dev)
+ continue;
+ hlist_del_rcu(&nexthop_nh->nh_hash);
+ } endfor_nexthops(fi)
+ }
+ /* Paired with READ_ONCE() from fib_table_lookup() */
+ WRITE_ONCE(fi->fib_dead, 1);
fib_info_put(fi);
}
- spin_unlock_bh(&fib_info_lock);
}
-static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
+static inline int nh_comp(struct fib_info *fi, struct fib_info *ofi)
{
- const struct fib_nh *onh = ofi->fib_nh;
+ const struct fib_nh *onh;
+
+ if (fi->nh || ofi->nh)
+ return nexthop_cmp(fi->nh, ofi->nh) ? 0 : -1;
+
+ if (ofi->fib_nhs == 0)
+ return 0;
for_nexthops(fi) {
- if (nh->nh_oif != onh->nh_oif ||
- nh->nh_gw != onh->nh_gw ||
- nh->nh_scope != onh->nh_scope ||
+ onh = fib_info_nh(ofi, nhsel);
+
+ if (nh->fib_nh_oif != onh->fib_nh_oif ||
+ nh->fib_nh_gw_family != onh->fib_nh_gw_family ||
+ nh->fib_nh_scope != onh->fib_nh_scope ||
#ifdef CONFIG_IP_ROUTE_MULTIPATH
- nh->nh_weight != onh->nh_weight ||
+ nh->fib_nh_weight != onh->fib_nh_weight ||
#endif
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
nh->nh_tclassid != onh->nh_tclassid ||
#endif
- ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD))
+ lwtunnel_cmp_encap(nh->fib_nh_lws, onh->fib_nh_lws) ||
+ ((nh->fib_nh_flags ^ onh->fib_nh_flags) & ~RTNH_COMPARE_MASK))
+ return -1;
+
+ if (nh->fib_nh_gw_family == AF_INET &&
+ nh->fib_nh_gw4 != onh->fib_nh_gw4)
+ return -1;
+
+ if (nh->fib_nh_gw_family == AF_INET6 &&
+ ipv6_addr_cmp(&nh->fib_nh_gw6, &onh->fib_nh_gw6))
return -1;
- onh++;
} endfor_nexthops(fi);
return 0;
}
-static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
+static struct hlist_head *fib_nh_head(struct net_device *dev)
+{
+ return &dev->fib_nh_head;
+}
+
+static unsigned int fib_info_hashfn_1(int init_val, u8 protocol, u8 scope,
+ u32 prefsrc, u32 priority)
+{
+ unsigned int val = init_val;
+
+ val ^= (protocol << 8) | scope;
+ val ^= prefsrc;
+ val ^= priority;
+
+ return val;
+}
+
+static unsigned int fib_info_hashfn_result(const struct net *net,
+ unsigned int val)
+{
+ return hash_32(val ^ net_hash_mix(net), net->ipv4.fib_info_hash_bits);
+}
+
+static struct hlist_head *fib_info_hash_bucket(struct fib_info *fi)
{
- unsigned int mask = (fib_hash_size - 1);
- unsigned int val = fi->fib_nhs;
+ struct net *net = fi->fib_net;
+ unsigned int val;
+
+ val = fib_info_hashfn_1(fi->fib_nhs, fi->fib_protocol,
+ fi->fib_scope, (__force u32)fi->fib_prefsrc,
+ fi->fib_priority);
+
+ if (fi->nh) {
+ val ^= fi->nh->id;
+ } else {
+ for_nexthops(fi) {
+ val ^= nh->fib_nh_oif;
+ } endfor_nexthops(fi)
+ }
+
+ return &net->ipv4.fib_info_hash[fib_info_hashfn_result(net, val)];
+}
+
+static struct hlist_head *fib_info_laddrhash_bucket(const struct net *net,
+ __be32 val)
+{
+ unsigned int hash_bits = net->ipv4.fib_info_hash_bits;
+ u32 slot;
+
+ slot = hash_32(net_hash_mix(net) ^ (__force u32)val, hash_bits);
+
+ return &net->ipv4.fib_info_hash[(1 << hash_bits) + slot];
+}
- val ^= fi->fib_protocol;
- val ^= (__force u32)fi->fib_prefsrc;
- val ^= fi->fib_priority;
+static struct hlist_head *fib_info_hash_alloc(unsigned int hash_bits)
+{
+ /* The second half is used for prefsrc */
+ return kvcalloc((1 << hash_bits) * 2, sizeof(struct hlist_head),
+ GFP_KERNEL);
+}
+
+static void fib_info_hash_free(struct hlist_head *head)
+{
+ kvfree(head);
+}
+
+static void fib_info_hash_grow(struct net *net)
+{
+ unsigned int old_size = 1 << net->ipv4.fib_info_hash_bits;
+ struct hlist_head *new_info_hash, *old_info_hash;
+ unsigned int i;
+
+ if (net->ipv4.fib_info_cnt < old_size)
+ return;
+
+ new_info_hash = fib_info_hash_alloc(net->ipv4.fib_info_hash_bits + 1);
+ if (!new_info_hash)
+ return;
+
+ old_info_hash = net->ipv4.fib_info_hash;
+ net->ipv4.fib_info_hash = new_info_hash;
+ net->ipv4.fib_info_hash_bits += 1;
+
+ for (i = 0; i < old_size; i++) {
+ struct hlist_head *head = &old_info_hash[i];
+ struct hlist_node *n;
+ struct fib_info *fi;
+
+ hlist_for_each_entry_safe(fi, n, head, fib_hash)
+ hlist_add_head(&fi->fib_hash, fib_info_hash_bucket(fi));
+ }
+
+ for (i = 0; i < old_size; i++) {
+ struct hlist_head *lhead = &old_info_hash[old_size + i];
+ struct hlist_node *n;
+ struct fib_info *fi;
+
+ hlist_for_each_entry_safe(fi, n, lhead, fib_lhash)
+ hlist_add_head(&fi->fib_lhash,
+ fib_info_laddrhash_bucket(fi->fib_net,
+ fi->fib_prefsrc));
+ }
- return (val ^ (val >> 7) ^ (val >> 12)) & mask;
+ fib_info_hash_free(old_info_hash);
}
-static struct fib_info *fib_find_info(const struct fib_info *nfi)
+/* no metrics, only nexthop id */
+static struct fib_info *fib_find_info_nh(struct net *net,
+ const struct fib_config *cfg)
{
struct hlist_head *head;
- struct hlist_node *node;
struct fib_info *fi;
unsigned int hash;
- hash = fib_info_hashfn(nfi);
- head = &fib_info_hash[hash];
+ hash = fib_info_hashfn_1(cfg->fc_nh_id,
+ cfg->fc_protocol, cfg->fc_scope,
+ (__force u32)cfg->fc_prefsrc,
+ cfg->fc_priority);
+ hash = fib_info_hashfn_result(net, hash);
+ head = &net->ipv4.fib_info_hash[hash];
- hlist_for_each_entry(fi, node, head, fib_hash) {
+ hlist_for_each_entry(fi, head, fib_hash) {
+ if (!fi->nh || fi->nh->id != cfg->fc_nh_id)
+ continue;
+
+ if (cfg->fc_protocol == fi->fib_protocol &&
+ cfg->fc_scope == fi->fib_scope &&
+ cfg->fc_prefsrc == fi->fib_prefsrc &&
+ cfg->fc_priority == fi->fib_priority &&
+ cfg->fc_type == fi->fib_type &&
+ cfg->fc_table == fi->fib_tb_id &&
+ !((cfg->fc_flags ^ fi->fib_flags) & ~RTNH_COMPARE_MASK))
+ return fi;
+ }
+
+ return NULL;
+}
+
+static struct fib_info *fib_find_info(struct fib_info *nfi)
+{
+ struct hlist_head *head = fib_info_hash_bucket(nfi);
+ struct fib_info *fi;
+
+ hlist_for_each_entry(fi, head, fib_hash) {
if (fi->fib_nhs != nfi->fib_nhs)
continue;
+
if (nfi->fib_protocol == fi->fib_protocol &&
+ nfi->fib_scope == fi->fib_scope &&
nfi->fib_prefsrc == fi->fib_prefsrc &&
nfi->fib_priority == fi->fib_priority &&
+ nfi->fib_type == fi->fib_type &&
+ nfi->fib_tb_id == fi->fib_tb_id &&
memcmp(nfi->fib_metrics, fi->fib_metrics,
- sizeof(fi->fib_metrics)) == 0 &&
- ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 &&
- (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
+ sizeof(u32) * RTAX_MAX) == 0 &&
+ !((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_COMPARE_MASK) &&
+ nh_comp(fi, nfi) == 0)
return fi;
}
return NULL;
}
-static inline unsigned int fib_devindex_hashfn(unsigned int val)
-{
- unsigned int mask = DEVINDEX_HASHSIZE - 1;
-
- return (val ^
- (val >> DEVINDEX_HASHBITS) ^
- (val >> (DEVINDEX_HASHBITS * 2))) & mask;
-}
-
/* Check, that the gateway is already configured.
- Used only by redirect accept routine.
+ * Used only by redirect accept routine, under rcu_read_lock();
*/
-
int ip_fib_check_default(__be32 gw, struct net_device *dev)
{
struct hlist_head *head;
- struct hlist_node *node;
struct fib_nh *nh;
- unsigned int hash;
- spin_lock(&fib_info_lock);
+ head = fib_nh_head(dev);
- hash = fib_devindex_hashfn(dev->ifindex);
- head = &fib_info_devhash[hash];
- hlist_for_each_entry(nh, node, head, nh_hash) {
- if (nh->nh_dev == dev &&
- nh->nh_gw == gw &&
- !(nh->nh_flags&RTNH_F_DEAD)) {
- spin_unlock(&fib_info_lock);
+ hlist_for_each_entry_rcu(nh, head, nh_hash) {
+ DEBUG_NET_WARN_ON_ONCE(nh->fib_nh_dev != dev);
+ if (nh->fib_nh_gw4 == gw &&
+ !(nh->fib_nh_flags & RTNH_F_DEAD)) {
return 0;
}
}
- spin_unlock(&fib_info_lock);
-
return -1;
}
-static inline size_t fib_nlmsg_size(struct fib_info *fi)
+size_t fib_nlmsg_size(struct fib_info *fi)
{
size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg))
+ nla_total_size(4) /* RTA_TABLE */
+ nla_total_size(4) /* RTA_DST */
+ nla_total_size(4) /* RTA_PRIORITY */
- + nla_total_size(4); /* RTA_PREFSRC */
+ + nla_total_size(4) /* RTA_PREFSRC */
+ + nla_total_size(TCP_CA_NAME_MAX); /* RTAX_CC_ALGO */
+ unsigned int nhs = fib_info_num_path(fi);
/* space for nested metrics */
payload += nla_total_size((RTAX_MAX * nla_total_size(4)));
- if (fi->fib_nhs) {
- /* Also handles the special case fib_nhs == 1 */
+ if (fi->nh)
+ payload += nla_total_size(4); /* RTA_NH_ID */
+
+ if (nhs) {
+ size_t nh_encapsize = 0;
+ /* Also handles the special case nhs == 1 */
/* each nexthop is packed in an attribute */
size_t nhsize = nla_total_size(sizeof(struct rtnexthop));
+ unsigned int i;
/* may contain flow and gateway attribute */
nhsize += 2 * nla_total_size(4);
+ /* grab encap info */
+ for (i = 0; i < fib_info_num_path(fi); i++) {
+ struct fib_nh_common *nhc = fib_info_nhc(fi, i);
+
+ if (nhc->nhc_lwtstate) {
+ /* RTA_ENCAP_TYPE */
+ nh_encapsize += lwtunnel_get_encap_size(
+ nhc->nhc_lwtstate);
+ /* RTA_ENCAP */
+ nh_encapsize += nla_total_size(2);
+ }
+ }
+
/* all nexthops are packed in a nested attribute */
- payload += nla_total_size(fi->fib_nhs * nhsize);
+ payload += nla_total_size((nhs * nhsize) + nh_encapsize);
+
}
return payload;
}
void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
- int dst_len, u32 tb_id, struct nl_info *info)
+ int dst_len, u32 tb_id, const struct nl_info *info,
+ unsigned int nlm_flags)
{
+ struct fib_rt_info fri;
struct sk_buff *skb;
u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
int err = -ENOBUFS;
skb = nlmsg_new(fib_nlmsg_size(fa->fa_info), GFP_KERNEL);
- if (skb == NULL)
+ if (!skb)
goto errout;
- err = fib_dump_info(skb, info->pid, seq, event, tb_id,
- fa->fa_type, fa->fa_scope, key, dst_len,
- fa->fa_tos, fa->fa_info, 0);
- /* failure implies BUG in fib_nlmsg_size() */
- BUG_ON(err < 0);
-
- err = rtnl_notify(skb, info->pid, RTNLGRP_IPV4_ROUTE,
- info->nlh, GFP_KERNEL);
-errout:
- if (err < 0)
- rtnl_set_sk_err(RTNLGRP_IPV4_ROUTE, err);
-}
-
-/* Return the first fib alias matching TOS with
- * priority less than or equal to PRIO.
- */
-struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio)
-{
- if (fah) {
- struct fib_alias *fa;
- list_for_each_entry(fa, fah, fa_list) {
- if (fa->fa_tos > tos)
- continue;
- if (fa->fa_info->fib_priority >= prio ||
- fa->fa_tos < tos)
- return fa;
- }
+ fri.fi = fa->fa_info;
+ fri.tb_id = tb_id;
+ fri.dst = key;
+ fri.dst_len = dst_len;
+ fri.dscp = fa->fa_dscp;
+ fri.type = fa->fa_type;
+ fri.offload = READ_ONCE(fa->offload);
+ fri.trap = READ_ONCE(fa->trap);
+ fri.offload_failed = READ_ONCE(fa->offload_failed);
+ err = fib_dump_info(skb, info->portid, seq, event, &fri, nlm_flags);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in fib_nlmsg_size() */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto errout;
}
- return NULL;
+ rtnl_notify(skb, info->nl_net, info->portid, RTNLGRP_IPV4_ROUTE,
+ info->nlh, GFP_KERNEL);
+ return;
+errout:
+ rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err);
}
-int fib_detect_death(struct fib_info *fi, int order,
- struct fib_info **last_resort, int *last_idx, int *dflt)
+static int fib_detect_death(struct fib_info *fi, int order,
+ struct fib_info **last_resort, int *last_idx,
+ int dflt)
{
+ const struct fib_nh_common *nhc = fib_info_nhc(fi, 0);
struct neighbour *n;
int state = NUD_NONE;
- n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev);
+ if (likely(nhc->nhc_gw_family == AF_INET))
+ n = neigh_lookup(&arp_tbl, &nhc->nhc_gw.ipv4, nhc->nhc_dev);
+ else if (nhc->nhc_gw_family == AF_INET6)
+ n = neigh_lookup(ipv6_stub->nd_tbl, &nhc->nhc_gw.ipv6,
+ nhc->nhc_dev);
+ else
+ n = NULL;
+
if (n) {
- state = n->nud_state;
+ state = READ_ONCE(n->nud_state);
neigh_release(n);
+ } else {
+ return 0;
}
- if (state==NUD_REACHABLE)
+ if (state == NUD_REACHABLE)
return 0;
- if ((state&NUD_VALID) && order != *dflt)
+ if ((state & NUD_VALID) && order != dflt)
return 0;
- if ((state&NUD_VALID) ||
- (*last_idx<0 && order > *dflt)) {
+ if ((state & NUD_VALID) ||
+ (*last_idx < 0 && order > dflt && state != NUD_INCOMPLETE)) {
*last_resort = fi;
*last_idx = order;
}
return 1;
}
+int fib_nh_common_init(struct net *net, struct fib_nh_common *nhc,
+ struct nlattr *encap, u16 encap_type,
+ void *cfg, gfp_t gfp_flags,
+ struct netlink_ext_ack *extack)
+{
+ int err;
+
+ nhc->nhc_pcpu_rth_output = alloc_percpu_gfp(struct rtable __rcu *,
+ gfp_flags);
+ if (!nhc->nhc_pcpu_rth_output)
+ return -ENOMEM;
+
+ if (encap) {
+ struct lwtunnel_state *lwtstate;
+
+ err = lwtunnel_build_state(net, encap_type, encap,
+ nhc->nhc_family, cfg, &lwtstate,
+ extack);
+ if (err)
+ goto lwt_failure;
+
+ nhc->nhc_lwtstate = lwtstate_get(lwtstate);
+ }
+
+ return 0;
+
+lwt_failure:
+ rt_fibinfo_free_cpus(nhc->nhc_pcpu_rth_output);
+ nhc->nhc_pcpu_rth_output = NULL;
+ return err;
+}
+EXPORT_SYMBOL_GPL(fib_nh_common_init);
+
+int fib_nh_init(struct net *net, struct fib_nh *nh,
+ struct fib_config *cfg, int nh_weight,
+ struct netlink_ext_ack *extack)
+{
+ int err;
+
+ nh->fib_nh_family = AF_INET;
+
+ err = fib_nh_common_init(net, &nh->nh_common, cfg->fc_encap,
+ cfg->fc_encap_type, cfg, GFP_KERNEL, extack);
+ if (err)
+ return err;
+
+ nh->fib_nh_oif = cfg->fc_oif;
+ nh->fib_nh_gw_family = cfg->fc_gw_family;
+ if (cfg->fc_gw_family == AF_INET)
+ nh->fib_nh_gw4 = cfg->fc_gw4;
+ else if (cfg->fc_gw_family == AF_INET6)
+ nh->fib_nh_gw6 = cfg->fc_gw6;
+
+ nh->fib_nh_flags = cfg->fc_flags;
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ nh->nh_tclassid = cfg->fc_flow;
+ if (nh->nh_tclassid)
+ atomic_inc(&net->ipv4.fib_num_tclassid_users);
+#endif
#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ nh->fib_nh_weight = nh_weight;
+#endif
+ return 0;
+}
-static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining)
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+
+static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining,
+ struct netlink_ext_ack *extack)
{
int nhs = 0;
@@ -377,43 +689,217 @@ static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining)
}
/* leftover implies invalid nexthop configuration, discard it */
- return remaining > 0 ? 0 : nhs;
+ if (remaining > 0) {
+ NL_SET_ERR_MSG(extack,
+ "Invalid nexthop configuration - extra data after nexthops");
+ nhs = 0;
+ }
+
+ return nhs;
}
+static int fib_gw_from_attr(__be32 *gw, struct nlattr *nla,
+ struct netlink_ext_ack *extack)
+{
+ if (nla_len(nla) < sizeof(*gw)) {
+ NL_SET_ERR_MSG(extack, "Invalid IPv4 address in RTA_GATEWAY");
+ return -EINVAL;
+ }
+
+ *gw = nla_get_in_addr(nla);
+
+ return 0;
+}
+
+/* only called when fib_nh is integrated into fib_info */
static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
- int remaining, struct fib_config *cfg)
+ int remaining, struct fib_config *cfg,
+ struct netlink_ext_ack *extack)
{
+ struct net *net = fi->fib_net;
+ struct fib_config fib_cfg;
+ struct fib_nh *nh;
+ int ret;
+
change_nexthops(fi) {
int attrlen;
- if (!rtnh_ok(rtnh, remaining))
+ memset(&fib_cfg, 0, sizeof(fib_cfg));
+
+ if (!rtnh_ok(rtnh, remaining)) {
+ NL_SET_ERR_MSG(extack,
+ "Invalid nexthop configuration - extra data after nexthop");
+ return -EINVAL;
+ }
+
+ if (rtnh->rtnh_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN)) {
+ NL_SET_ERR_MSG(extack,
+ "Invalid flags for nexthop - can not contain DEAD or LINKDOWN");
return -EINVAL;
+ }
- nh->nh_flags = (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;
- nh->nh_oif = rtnh->rtnh_ifindex;
- nh->nh_weight = rtnh->rtnh_hops + 1;
+ fib_cfg.fc_flags = (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;
+ fib_cfg.fc_oif = rtnh->rtnh_ifindex;
attrlen = rtnh_attrlen(rtnh);
if (attrlen > 0) {
- struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
+ struct nlattr *nla, *nlav, *attrs = rtnh_attrs(rtnh);
nla = nla_find(attrs, attrlen, RTA_GATEWAY);
- nh->nh_gw = nla ? nla_get_be32(nla) : 0;
-#ifdef CONFIG_NET_CLS_ROUTE
+ nlav = nla_find(attrs, attrlen, RTA_VIA);
+ if (nla && nlav) {
+ NL_SET_ERR_MSG(extack,
+ "Nexthop configuration can not contain both GATEWAY and VIA");
+ return -EINVAL;
+ }
+ if (nla) {
+ ret = fib_gw_from_attr(&fib_cfg.fc_gw4, nla,
+ extack);
+ if (ret)
+ goto errout;
+
+ if (fib_cfg.fc_gw4)
+ fib_cfg.fc_gw_family = AF_INET;
+ } else if (nlav) {
+ ret = fib_gw_from_via(&fib_cfg, nlav, extack);
+ if (ret)
+ goto errout;
+ }
+
nla = nla_find(attrs, attrlen, RTA_FLOW);
- nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
-#endif
+ if (nla) {
+ if (nla_len(nla) < sizeof(u32)) {
+ NL_SET_ERR_MSG(extack, "Invalid RTA_FLOW");
+ return -EINVAL;
+ }
+ fib_cfg.fc_flow = nla_get_u32(nla);
+ }
+
+ fib_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
+ /* RTA_ENCAP_TYPE length checked in
+ * lwtunnel_valid_encap_type_attr
+ */
+ nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
+ if (nla)
+ fib_cfg.fc_encap_type = nla_get_u16(nla);
}
+ ret = fib_nh_init(net, nexthop_nh, &fib_cfg,
+ rtnh->rtnh_hops + 1, extack);
+ if (ret)
+ goto errout;
+
rtnh = rtnh_next(rtnh, &remaining);
} endfor_nexthops(fi);
- return 0;
+ ret = -EINVAL;
+ nh = fib_info_nh(fi, 0);
+ if (cfg->fc_oif && nh->fib_nh_oif != cfg->fc_oif) {
+ NL_SET_ERR_MSG(extack,
+ "Nexthop device index does not match RTA_OIF");
+ goto errout;
+ }
+ if (cfg->fc_gw_family) {
+ if (cfg->fc_gw_family != nh->fib_nh_gw_family ||
+ (cfg->fc_gw_family == AF_INET &&
+ nh->fib_nh_gw4 != cfg->fc_gw4) ||
+ (cfg->fc_gw_family == AF_INET6 &&
+ ipv6_addr_cmp(&nh->fib_nh_gw6, &cfg->fc_gw6))) {
+ NL_SET_ERR_MSG(extack,
+ "Nexthop gateway does not match RTA_GATEWAY or RTA_VIA");
+ goto errout;
+ }
+ }
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ if (cfg->fc_flow && nh->nh_tclassid != cfg->fc_flow) {
+ NL_SET_ERR_MSG(extack,
+ "Nexthop class id does not match RTA_FLOW");
+ goto errout;
+ }
+#endif
+ ret = 0;
+errout:
+ return ret;
}
-#endif
+/* only called when fib_nh is integrated into fib_info */
+static void fib_rebalance(struct fib_info *fi)
+{
+ int total;
+ int w;
+
+ if (fib_info_num_path(fi) < 2)
+ return;
+
+ total = 0;
+ for_nexthops(fi) {
+ if (nh->fib_nh_flags & RTNH_F_DEAD)
+ continue;
+
+ if (ip_ignore_linkdown(nh->fib_nh_dev) &&
+ nh->fib_nh_flags & RTNH_F_LINKDOWN)
+ continue;
+
+ total += nh->fib_nh_weight;
+ } endfor_nexthops(fi);
+
+ w = 0;
+ change_nexthops(fi) {
+ int upper_bound;
+
+ if (nexthop_nh->fib_nh_flags & RTNH_F_DEAD) {
+ upper_bound = -1;
+ } else if (ip_ignore_linkdown(nexthop_nh->fib_nh_dev) &&
+ nexthop_nh->fib_nh_flags & RTNH_F_LINKDOWN) {
+ upper_bound = -1;
+ } else {
+ w += nexthop_nh->fib_nh_weight;
+ upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31,
+ total) - 1;
+ }
+
+ atomic_set(&nexthop_nh->fib_nh_upper_bound, upper_bound);
+ } endfor_nexthops(fi);
+}
+#else /* CONFIG_IP_ROUTE_MULTIPATH */
+
+static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
+ int remaining, struct fib_config *cfg,
+ struct netlink_ext_ack *extack)
+{
+ NL_SET_ERR_MSG(extack, "Multipath support not enabled in kernel");
+
+ return -EINVAL;
+}
+
+#define fib_rebalance(fi) do { } while (0)
+
+#endif /* CONFIG_IP_ROUTE_MULTIPATH */
-int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
+static int fib_encap_match(struct net *net, u16 encap_type,
+ struct nlattr *encap,
+ const struct fib_nh *nh,
+ const struct fib_config *cfg,
+ struct netlink_ext_ack *extack)
+{
+ struct lwtunnel_state *lwtstate;
+ int ret, result = 0;
+
+ if (encap_type == LWTUNNEL_ENCAP_NONE)
+ return 0;
+
+ ret = lwtunnel_build_state(net, encap_type, encap, AF_INET,
+ cfg, &lwtstate, extack);
+ if (!ret) {
+ result = lwtunnel_cmp_encap(lwtstate, nh->fib_nh_lws);
+ lwtstate_free(lwtstate);
+ }
+
+ return result;
+}
+
+int fib_nh_match(struct net *net, struct fib_config *cfg, struct fib_info *fi,
+ struct netlink_ext_ack *extack)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
struct rtnexthop *rtnh;
@@ -423,40 +909,119 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority)
return 1;
- if (cfg->fc_oif || cfg->fc_gw) {
- if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) &&
- (!cfg->fc_gw || cfg->fc_gw == fi->fib_nh->nh_gw))
+ if (cfg->fc_nh_id) {
+ if (fi->nh && cfg->fc_nh_id == fi->nh->id)
return 0;
return 1;
}
+ if (fi->nh) {
+ if (cfg->fc_oif || cfg->fc_gw_family || cfg->fc_mp)
+ return 1;
+ return 0;
+ }
+
+ if (cfg->fc_oif || cfg->fc_gw_family) {
+ struct fib_nh *nh;
+
+ nh = fib_info_nh(fi, 0);
+ if (cfg->fc_encap) {
+ if (fib_encap_match(net, cfg->fc_encap_type,
+ cfg->fc_encap, nh, cfg, extack))
+ return 1;
+ }
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ if (cfg->fc_flow &&
+ cfg->fc_flow != nh->nh_tclassid)
+ return 1;
+#endif
+ if ((cfg->fc_oif && cfg->fc_oif != nh->fib_nh_oif) ||
+ (cfg->fc_gw_family &&
+ cfg->fc_gw_family != nh->fib_nh_gw_family))
+ return 1;
+
+ if (cfg->fc_gw_family == AF_INET &&
+ cfg->fc_gw4 != nh->fib_nh_gw4)
+ return 1;
+
+ if (cfg->fc_gw_family == AF_INET6 &&
+ ipv6_addr_cmp(&cfg->fc_gw6, &nh->fib_nh_gw6))
+ return 1;
+
+ return 0;
+ }
+
#ifdef CONFIG_IP_ROUTE_MULTIPATH
- if (cfg->fc_mp == NULL)
+ if (!cfg->fc_mp)
return 0;
rtnh = cfg->fc_mp;
remaining = cfg->fc_mp_len;
-
+
for_nexthops(fi) {
int attrlen;
if (!rtnh_ok(rtnh, remaining))
return -EINVAL;
- if (rtnh->rtnh_ifindex && rtnh->rtnh_ifindex != nh->nh_oif)
+ if (rtnh->rtnh_ifindex && rtnh->rtnh_ifindex != nh->fib_nh_oif)
return 1;
attrlen = rtnh_attrlen(rtnh);
- if (attrlen < 0) {
- struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
+ if (attrlen > 0) {
+ struct nlattr *nla, *nlav, *attrs = rtnh_attrs(rtnh);
+ int err;
nla = nla_find(attrs, attrlen, RTA_GATEWAY);
- if (nla && nla_get_be32(nla) != nh->nh_gw)
- return 1;
-#ifdef CONFIG_NET_CLS_ROUTE
+ nlav = nla_find(attrs, attrlen, RTA_VIA);
+ if (nla && nlav) {
+ NL_SET_ERR_MSG(extack,
+ "Nexthop configuration can not contain both GATEWAY and VIA");
+ return -EINVAL;
+ }
+
+ if (nla) {
+ __be32 gw;
+
+ err = fib_gw_from_attr(&gw, nla, extack);
+ if (err)
+ return err;
+
+ if (nh->fib_nh_gw_family != AF_INET ||
+ gw != nh->fib_nh_gw4)
+ return 1;
+ } else if (nlav) {
+ struct fib_config cfg2;
+
+ err = fib_gw_from_via(&cfg2, nlav, extack);
+ if (err)
+ return err;
+
+ switch (nh->fib_nh_gw_family) {
+ case AF_INET:
+ if (cfg2.fc_gw_family != AF_INET ||
+ cfg2.fc_gw4 != nh->fib_nh_gw4)
+ return 1;
+ break;
+ case AF_INET6:
+ if (cfg2.fc_gw_family != AF_INET6 ||
+ ipv6_addr_cmp(&cfg2.fc_gw6,
+ &nh->fib_nh_gw6))
+ return 1;
+ break;
+ }
+ }
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
nla = nla_find(attrs, attrlen, RTA_FLOW);
- if (nla && nla_get_u32(nla) != nh->nh_tclassid)
- return 1;
+ if (nla) {
+ if (nla_len(nla) < sizeof(u32)) {
+ NL_SET_ERR_MSG(extack, "Invalid RTA_FLOW");
+ return -EINVAL;
+ }
+ if (nla_get_u32(nla) != nh->nh_tclassid)
+ return 1;
+ }
#endif
}
@@ -466,402 +1031,545 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
return 0;
}
+bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi)
+{
+ struct nlattr *nla;
+ int remaining;
-/*
- Picture
- -------
-
- Semantics of nexthop is very messy by historical reasons.
- We have to take into account, that:
- a) gateway can be actually local interface address,
- so that gatewayed route is direct.
- b) gateway must be on-link address, possibly
- described not by an ifaddr, but also by a direct route.
- c) If both gateway and interface are specified, they should not
- contradict.
- d) If we use tunnel routes, gateway could be not on-link.
-
- Attempt to reconcile all of these (alas, self-contradictory) conditions
- results in pretty ugly and hairy code with obscure logic.
-
- I chose to generalized it instead, so that the size
- of code does not increase practically, but it becomes
- much more general.
- Every prefix is assigned a "scope" value: "host" is local address,
- "link" is direct route,
- [ ... "site" ... "interior" ... ]
- and "universe" is true gateway route with global meaning.
-
- Every prefix refers to a set of "nexthop"s (gw, oif),
- where gw must have narrower scope. This recursion stops
- when gw has LOCAL scope or if "nexthop" is declared ONLINK,
- which means that gw is forced to be on link.
-
- Code is still hairy, but now it is apparently logically
- consistent and very flexible. F.e. as by-product it allows
- to co-exists in peace independent exterior and interior
- routing processes.
-
- Normally it looks as following.
-
- {universe prefix} -> (gw, oif) [scope link]
- |
- |-> {link prefix} -> (gw, oif) [scope local]
- |
- |-> {local prefix} (terminal node)
- */
+ if (!cfg->fc_mx)
+ return true;
+
+ nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
+ int type = nla_type(nla);
+ u32 fi_val, val;
+
+ if (!type)
+ continue;
+ if (type > RTAX_MAX)
+ return false;
+
+ type = array_index_nospec(type, RTAX_MAX + 1);
+ if (type == RTAX_CC_ALGO) {
+ char tmp[TCP_CA_NAME_MAX];
+ bool ecn_ca = false;
+
+ nla_strscpy(tmp, nla, sizeof(tmp));
+ val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
+ } else {
+ if (nla_len(nla) != sizeof(u32))
+ return false;
+ val = nla_get_u32(nla);
+ }
-static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
- struct fib_nh *nh)
+ fi_val = fi->fib_metrics->metrics[type - 1];
+ if (type == RTAX_FEATURES)
+ fi_val &= ~DST_FEATURE_ECN_CA;
+
+ if (fi_val != val)
+ return false;
+ }
+
+ return true;
+}
+
+static int fib_check_nh_v6_gw(struct net *net, struct fib_nh *nh,
+ u32 table, struct netlink_ext_ack *extack)
{
+ struct fib6_config cfg = {
+ .fc_table = table,
+ .fc_flags = nh->fib_nh_flags | RTF_GATEWAY,
+ .fc_ifindex = nh->fib_nh_oif,
+ .fc_gateway = nh->fib_nh_gw6,
+ };
+ struct fib6_nh fib6_nh = {};
int err;
- if (nh->nh_gw) {
- struct fib_result res;
+ err = ipv6_stub->fib6_nh_init(net, &fib6_nh, &cfg, GFP_KERNEL, extack);
+ if (!err) {
+ nh->fib_nh_dev = fib6_nh.fib_nh_dev;
+ netdev_hold(nh->fib_nh_dev, &nh->fib_nh_dev_tracker,
+ GFP_KERNEL);
+ nh->fib_nh_oif = nh->fib_nh_dev->ifindex;
+ nh->fib_nh_scope = RT_SCOPE_LINK;
-#ifdef CONFIG_IP_ROUTE_PERVASIVE
- if (nh->nh_flags&RTNH_F_PERVASIVE)
- return 0;
-#endif
- if (nh->nh_flags&RTNH_F_ONLINK) {
- struct net_device *dev;
+ ipv6_stub->fib6_nh_release(&fib6_nh);
+ }
- if (cfg->fc_scope >= RT_SCOPE_LINK)
- return -EINVAL;
- if (inet_addr_type(nh->nh_gw) != RTN_UNICAST)
- return -EINVAL;
- if ((dev = __dev_get_by_index(nh->nh_oif)) == NULL)
- return -ENODEV;
- if (!(dev->flags&IFF_UP))
- return -ENETDOWN;
- nh->nh_dev = dev;
- dev_hold(dev);
- nh->nh_scope = RT_SCOPE_LINK;
- return 0;
- }
- {
- struct flowi fl = {
- .nl_u = {
- .ip4_u = {
- .daddr = nh->nh_gw,
- .scope = cfg->fc_scope + 1,
- },
- },
- .oif = nh->nh_oif,
- };
-
- /* It is not necessary, but requires a bit of thinking */
- if (fl.fl4_scope < RT_SCOPE_LINK)
- fl.fl4_scope = RT_SCOPE_LINK;
- if ((err = fib_lookup(&fl, &res)) != 0)
- return err;
- }
- err = -EINVAL;
- if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
- goto out;
- nh->nh_scope = res.scope;
- nh->nh_oif = FIB_RES_OIF(res);
- if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL)
- goto out;
- dev_hold(nh->nh_dev);
- err = -ENETDOWN;
- if (!(nh->nh_dev->flags & IFF_UP))
- goto out;
- err = 0;
-out:
- fib_res_put(&res);
- return err;
- } else {
- struct in_device *in_dev;
+ return err;
+}
- if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK))
- return -EINVAL;
+/*
+ * Picture
+ * -------
+ *
+ * Semantics of nexthop is very messy by historical reasons.
+ * We have to take into account, that:
+ * a) gateway can be actually local interface address,
+ * so that gatewayed route is direct.
+ * b) gateway must be on-link address, possibly
+ * described not by an ifaddr, but also by a direct route.
+ * c) If both gateway and interface are specified, they should not
+ * contradict.
+ * d) If we use tunnel routes, gateway could be not on-link.
+ *
+ * Attempt to reconcile all of these (alas, self-contradictory) conditions
+ * results in pretty ugly and hairy code with obscure logic.
+ *
+ * I chose to generalized it instead, so that the size
+ * of code does not increase practically, but it becomes
+ * much more general.
+ * Every prefix is assigned a "scope" value: "host" is local address,
+ * "link" is direct route,
+ * [ ... "site" ... "interior" ... ]
+ * and "universe" is true gateway route with global meaning.
+ *
+ * Every prefix refers to a set of "nexthop"s (gw, oif),
+ * where gw must have narrower scope. This recursion stops
+ * when gw has LOCAL scope or if "nexthop" is declared ONLINK,
+ * which means that gw is forced to be on link.
+ *
+ * Code is still hairy, but now it is apparently logically
+ * consistent and very flexible. F.e. as by-product it allows
+ * to co-exists in peace independent exterior and interior
+ * routing processes.
+ *
+ * Normally it looks as following.
+ *
+ * {universe prefix} -> (gw, oif) [scope link]
+ * |
+ * |-> {link prefix} -> (gw, oif) [scope local]
+ * |
+ * |-> {local prefix} (terminal node)
+ */
+static int fib_check_nh_v4_gw(struct net *net, struct fib_nh *nh, u32 table,
+ u8 scope, struct netlink_ext_ack *extack)
+{
+ struct net_device *dev;
+ struct fib_result res;
+ int err = 0;
- in_dev = inetdev_by_index(nh->nh_oif);
- if (in_dev == NULL)
+ if (nh->fib_nh_flags & RTNH_F_ONLINK) {
+ unsigned int addr_type;
+
+ if (scope >= RT_SCOPE_LINK) {
+ NL_SET_ERR_MSG(extack, "Nexthop has invalid scope");
+ return -EINVAL;
+ }
+ dev = __dev_get_by_index(net, nh->fib_nh_oif);
+ if (!dev) {
+ NL_SET_ERR_MSG(extack, "Nexthop device required for onlink");
return -ENODEV;
- if (!(in_dev->dev->flags&IFF_UP)) {
- in_dev_put(in_dev);
+ }
+ if (!(dev->flags & IFF_UP)) {
+ NL_SET_ERR_MSG(extack, "Nexthop device is not up");
return -ENETDOWN;
}
- nh->nh_dev = in_dev->dev;
- dev_hold(nh->nh_dev);
- nh->nh_scope = RT_SCOPE_HOST;
- in_dev_put(in_dev);
+ addr_type = inet_addr_type_dev_table(net, dev, nh->fib_nh_gw4);
+ if (addr_type != RTN_UNICAST) {
+ NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway");
+ return -EINVAL;
+ }
+ if (!netif_carrier_ok(dev))
+ nh->fib_nh_flags |= RTNH_F_LINKDOWN;
+ nh->fib_nh_dev = dev;
+ netdev_hold(dev, &nh->fib_nh_dev_tracker, GFP_ATOMIC);
+ nh->fib_nh_scope = RT_SCOPE_LINK;
+ return 0;
}
- return 0;
-}
+ rcu_read_lock();
+ {
+ struct fib_table *tbl = NULL;
+ struct flowi4 fl4 = {
+ .daddr = nh->fib_nh_gw4,
+ .flowi4_scope = scope + 1,
+ .flowi4_oif = nh->fib_nh_oif,
+ .flowi4_iif = LOOPBACK_IFINDEX,
+ };
+
+ /* It is not necessary, but requires a bit of thinking */
+ if (fl4.flowi4_scope < RT_SCOPE_LINK)
+ fl4.flowi4_scope = RT_SCOPE_LINK;
+
+ if (table && table != RT_TABLE_MAIN)
+ tbl = fib_get_table(net, table);
+
+ if (tbl)
+ err = fib_table_lookup(tbl, &fl4, &res,
+ FIB_LOOKUP_IGNORE_LINKSTATE |
+ FIB_LOOKUP_NOREF);
+
+ /* on error or if no table given do full lookup. This
+ * is needed for example when nexthops are in the local
+ * table rather than the given table
+ */
+ if (!tbl || err) {
+ err = fib_lookup(net, &fl4, &res,
+ FIB_LOOKUP_IGNORE_LINKSTATE);
+ }
-static inline unsigned int fib_laddr_hashfn(__be32 val)
-{
- unsigned int mask = (fib_hash_size - 1);
+ if (err) {
+ NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway");
+ goto out;
+ }
+ }
- return ((__force u32)val ^ ((__force u32)val >> 7) ^ ((__force u32)val >> 14)) & mask;
+ err = -EINVAL;
+ if (res.type != RTN_UNICAST && res.type != RTN_LOCAL) {
+ NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway");
+ goto out;
+ }
+ nh->fib_nh_scope = res.scope;
+ nh->fib_nh_oif = FIB_RES_OIF(res);
+ nh->fib_nh_dev = dev = FIB_RES_DEV(res);
+ if (!dev) {
+ NL_SET_ERR_MSG(extack,
+ "No egress device for nexthop gateway");
+ goto out;
+ }
+ netdev_hold(dev, &nh->fib_nh_dev_tracker, GFP_ATOMIC);
+ if (!netif_carrier_ok(dev))
+ nh->fib_nh_flags |= RTNH_F_LINKDOWN;
+ err = (dev->flags & IFF_UP) ? 0 : -ENETDOWN;
+out:
+ rcu_read_unlock();
+ return err;
}
-static struct hlist_head *fib_hash_alloc(int bytes)
+static int fib_check_nh_nongw(struct net *net, struct fib_nh *nh,
+ struct netlink_ext_ack *extack)
{
- if (bytes <= PAGE_SIZE)
- return kmalloc(bytes, GFP_KERNEL);
- else
- return (struct hlist_head *)
- __get_free_pages(GFP_KERNEL, get_order(bytes));
+ struct in_device *in_dev;
+ int err;
+
+ if (nh->fib_nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK)) {
+ NL_SET_ERR_MSG(extack,
+ "Invalid flags for nexthop - PERVASIVE and ONLINK can not be set");
+ return -EINVAL;
+ }
+
+ rcu_read_lock();
+
+ err = -ENODEV;
+ in_dev = inetdev_by_index(net, nh->fib_nh_oif);
+ if (!in_dev)
+ goto out;
+ err = -ENETDOWN;
+ if (!(in_dev->dev->flags & IFF_UP)) {
+ NL_SET_ERR_MSG(extack, "Device for nexthop is not up");
+ goto out;
+ }
+
+ nh->fib_nh_dev = in_dev->dev;
+ netdev_hold(nh->fib_nh_dev, &nh->fib_nh_dev_tracker, GFP_ATOMIC);
+ nh->fib_nh_scope = RT_SCOPE_HOST;
+ if (!netif_carrier_ok(nh->fib_nh_dev))
+ nh->fib_nh_flags |= RTNH_F_LINKDOWN;
+ err = 0;
+out:
+ rcu_read_unlock();
+ return err;
}
-static void fib_hash_free(struct hlist_head *hash, int bytes)
+int fib_check_nh(struct net *net, struct fib_nh *nh, u32 table, u8 scope,
+ struct netlink_ext_ack *extack)
{
- if (!hash)
- return;
+ int err;
- if (bytes <= PAGE_SIZE)
- kfree(hash);
+ if (nh->fib_nh_gw_family == AF_INET)
+ err = fib_check_nh_v4_gw(net, nh, table, scope, extack);
+ else if (nh->fib_nh_gw_family == AF_INET6)
+ err = fib_check_nh_v6_gw(net, nh, table, extack);
else
- free_pages((unsigned long) hash, get_order(bytes));
+ err = fib_check_nh_nongw(net, nh, extack);
+
+ return err;
}
-static void fib_hash_move(struct hlist_head *new_info_hash,
- struct hlist_head *new_laddrhash,
- unsigned int new_size)
+__be32 fib_info_update_nhc_saddr(struct net *net, struct fib_nh_common *nhc,
+ unsigned char scope)
{
- struct hlist_head *old_info_hash, *old_laddrhash;
- unsigned int old_size = fib_hash_size;
- unsigned int i, bytes;
+ struct fib_nh *nh;
+ __be32 saddr;
- spin_lock_bh(&fib_info_lock);
- old_info_hash = fib_info_hash;
- old_laddrhash = fib_info_laddrhash;
- fib_hash_size = new_size;
+ if (nhc->nhc_family != AF_INET)
+ return inet_select_addr(nhc->nhc_dev, 0, scope);
- for (i = 0; i < old_size; i++) {
- struct hlist_head *head = &fib_info_hash[i];
- struct hlist_node *node, *n;
- struct fib_info *fi;
+ nh = container_of(nhc, struct fib_nh, nh_common);
+ saddr = inet_select_addr(nh->fib_nh_dev, nh->fib_nh_gw4, scope);
+
+ WRITE_ONCE(nh->nh_saddr, saddr);
+ WRITE_ONCE(nh->nh_saddr_genid, atomic_read(&net->ipv4.dev_addr_genid));
- hlist_for_each_entry_safe(fi, node, n, head, fib_hash) {
- struct hlist_head *dest;
- unsigned int new_hash;
+ return saddr;
+}
- hlist_del(&fi->fib_hash);
+__be32 fib_result_prefsrc(struct net *net, struct fib_result *res)
+{
+ struct fib_nh_common *nhc = res->nhc;
- new_hash = fib_info_hashfn(fi);
- dest = &new_info_hash[new_hash];
- hlist_add_head(&fi->fib_hash, dest);
- }
+ if (res->fi->fib_prefsrc)
+ return res->fi->fib_prefsrc;
+
+ if (nhc->nhc_family == AF_INET) {
+ struct fib_nh *nh;
+
+ nh = container_of(nhc, struct fib_nh, nh_common);
+ if (READ_ONCE(nh->nh_saddr_genid) ==
+ atomic_read(&net->ipv4.dev_addr_genid))
+ return READ_ONCE(nh->nh_saddr);
}
- fib_info_hash = new_info_hash;
- for (i = 0; i < old_size; i++) {
- struct hlist_head *lhead = &fib_info_laddrhash[i];
- struct hlist_node *node, *n;
- struct fib_info *fi;
+ return fib_info_update_nhc_saddr(net, nhc, res->fi->fib_scope);
+}
- hlist_for_each_entry_safe(fi, node, n, lhead, fib_lhash) {
- struct hlist_head *ldest;
- unsigned int new_hash;
+static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc)
+{
+ if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst ||
+ fib_prefsrc != cfg->fc_dst) {
+ u32 tb_id = cfg->fc_table;
+ int rc;
- hlist_del(&fi->fib_lhash);
+ if (tb_id == RT_TABLE_MAIN)
+ tb_id = RT_TABLE_LOCAL;
- new_hash = fib_laddr_hashfn(fi->fib_prefsrc);
- ldest = &new_laddrhash[new_hash];
- hlist_add_head(&fi->fib_lhash, ldest);
- }
- }
- fib_info_laddrhash = new_laddrhash;
+ rc = inet_addr_type_table(cfg->fc_nlinfo.nl_net,
+ fib_prefsrc, tb_id);
- spin_unlock_bh(&fib_info_lock);
+ if (rc != RTN_LOCAL && tb_id != RT_TABLE_LOCAL) {
+ rc = inet_addr_type_table(cfg->fc_nlinfo.nl_net,
+ fib_prefsrc, RT_TABLE_LOCAL);
+ }
- bytes = old_size * sizeof(struct hlist_head *);
- fib_hash_free(old_info_hash, bytes);
- fib_hash_free(old_laddrhash, bytes);
+ if (rc != RTN_LOCAL)
+ return false;
+ }
+ return true;
}
-struct fib_info *fib_create_info(struct fib_config *cfg)
+struct fib_info *fib_create_info(struct fib_config *cfg,
+ struct netlink_ext_ack *extack)
{
int err;
struct fib_info *fi = NULL;
+ struct nexthop *nh = NULL;
struct fib_info *ofi;
int nhs = 1;
+ struct net *net = cfg->fc_nlinfo.nl_net;
+
+ ASSERT_RTNL();
+ if (cfg->fc_type > RTN_MAX)
+ goto err_inval;
/* Fast check to catch the most weird cases */
- if (fib_props[cfg->fc_type].scope > cfg->fc_scope)
+ if (fib_props[cfg->fc_type].scope > cfg->fc_scope) {
+ NL_SET_ERR_MSG(extack, "Invalid scope");
goto err_inval;
+ }
+
+ if (cfg->fc_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN)) {
+ NL_SET_ERR_MSG(extack,
+ "Invalid rtm_flags - can not contain DEAD or LINKDOWN");
+ goto err_inval;
+ }
+
+ if (cfg->fc_nh_id) {
+ if (!cfg->fc_mx) {
+ fi = fib_find_info_nh(net, cfg);
+ if (fi) {
+ refcount_inc(&fi->fib_treeref);
+ return fi;
+ }
+ }
+
+ nh = nexthop_find_by_id(net, cfg->fc_nh_id);
+ if (!nh) {
+ NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
+ goto err_inval;
+ }
+ nhs = 0;
+ }
#ifdef CONFIG_IP_ROUTE_MULTIPATH
if (cfg->fc_mp) {
- nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len);
+ nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len, extack);
if (nhs == 0)
goto err_inval;
}
#endif
-#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
- if (cfg->fc_mp_alg) {
- if (cfg->fc_mp_alg < IP_MP_ALG_NONE ||
- cfg->fc_mp_alg > IP_MP_ALG_MAX)
- goto err_inval;
- }
-#endif
-
- err = -ENOBUFS;
- if (fib_info_cnt >= fib_hash_size) {
- unsigned int new_size = fib_hash_size << 1;
- struct hlist_head *new_info_hash;
- struct hlist_head *new_laddrhash;
- unsigned int bytes;
-
- if (!new_size)
- new_size = 1;
- bytes = new_size * sizeof(struct hlist_head *);
- new_info_hash = fib_hash_alloc(bytes);
- new_laddrhash = fib_hash_alloc(bytes);
- if (!new_info_hash || !new_laddrhash) {
- fib_hash_free(new_info_hash, bytes);
- fib_hash_free(new_laddrhash, bytes);
- } else {
- memset(new_info_hash, 0, bytes);
- memset(new_laddrhash, 0, bytes);
- fib_hash_move(new_info_hash, new_laddrhash, new_size);
- }
+ fib_info_hash_grow(net);
- if (!fib_hash_size)
- goto failure;
+ fi = kzalloc(struct_size(fi, fib_nh, nhs), GFP_KERNEL);
+ if (!fi) {
+ err = -ENOBUFS;
+ goto failure;
}
- fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
- if (fi == NULL)
- goto failure;
- fib_info_cnt++;
+ fi->fib_metrics = ip_fib_metrics_init(cfg->fc_mx, cfg->fc_mx_len, extack);
+ if (IS_ERR(fi->fib_metrics)) {
+ err = PTR_ERR(fi->fib_metrics);
+ kfree(fi);
+ return ERR_PTR(err);
+ }
+ fi->fib_net = net;
fi->fib_protocol = cfg->fc_protocol;
+ fi->fib_scope = cfg->fc_scope;
fi->fib_flags = cfg->fc_flags;
fi->fib_priority = cfg->fc_priority;
fi->fib_prefsrc = cfg->fc_prefsrc;
+ fi->fib_type = cfg->fc_type;
+ fi->fib_tb_id = cfg->fc_table;
fi->fib_nhs = nhs;
- change_nexthops(fi) {
- nh->nh_parent = fi;
- } endfor_nexthops(fi)
-
- if (cfg->fc_mx) {
- struct nlattr *nla;
- int remaining;
-
- nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
- int type = nla->nla_type;
-
- if (type) {
- if (type > RTAX_MAX)
- goto err_inval;
- fi->fib_metrics[type - 1] = nla_get_u32(nla);
- }
+ if (nh) {
+ if (!nexthop_get(nh)) {
+ NL_SET_ERR_MSG(extack, "Nexthop has been deleted");
+ err = -EINVAL;
+ } else {
+ err = 0;
+ fi->nh = nh;
}
- }
-
- if (cfg->fc_mp) {
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
- err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg);
- if (err != 0)
- goto failure;
- if (cfg->fc_oif && fi->fib_nh->nh_oif != cfg->fc_oif)
- goto err_inval;
- if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw)
- goto err_inval;
-#ifdef CONFIG_NET_CLS_ROUTE
- if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow)
- goto err_inval;
-#endif
-#else
- goto err_inval;
-#endif
} else {
- struct fib_nh *nh = fi->fib_nh;
+ change_nexthops(fi) {
+ nexthop_nh->nh_parent = fi;
+ } endfor_nexthops(fi)
- nh->nh_oif = cfg->fc_oif;
- nh->nh_gw = cfg->fc_gw;
- nh->nh_flags = cfg->fc_flags;
-#ifdef CONFIG_NET_CLS_ROUTE
- nh->nh_tclassid = cfg->fc_flow;
-#endif
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
- nh->nh_weight = 1;
-#endif
+ if (cfg->fc_mp)
+ err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg,
+ extack);
+ else
+ err = fib_nh_init(net, fi->fib_nh, cfg, 1, extack);
}
-#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
- fi->fib_mp_alg = cfg->fc_mp_alg;
-#endif
+ if (err != 0)
+ goto failure;
if (fib_props[cfg->fc_type].error) {
- if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp)
+ if (cfg->fc_gw_family || cfg->fc_oif || cfg->fc_mp) {
+ NL_SET_ERR_MSG(extack,
+ "Gateway, device and multipath can not be specified for this route type");
goto err_inval;
+ }
goto link_it;
+ } else {
+ switch (cfg->fc_type) {
+ case RTN_UNICAST:
+ case RTN_LOCAL:
+ case RTN_BROADCAST:
+ case RTN_ANYCAST:
+ case RTN_MULTICAST:
+ break;
+ default:
+ NL_SET_ERR_MSG(extack, "Invalid route type");
+ goto err_inval;
+ }
}
- if (cfg->fc_scope > RT_SCOPE_HOST)
+ if (cfg->fc_scope > RT_SCOPE_HOST) {
+ NL_SET_ERR_MSG(extack, "Invalid scope");
goto err_inval;
+ }
- if (cfg->fc_scope == RT_SCOPE_HOST) {
+ if (fi->nh) {
+ err = fib_check_nexthop(fi->nh, cfg->fc_scope, extack);
+ if (err)
+ goto failure;
+ } else if (cfg->fc_scope == RT_SCOPE_HOST) {
struct fib_nh *nh = fi->fib_nh;
/* Local address is added. */
- if (nhs != 1 || nh->nh_gw)
+ if (nhs != 1) {
+ NL_SET_ERR_MSG(extack,
+ "Route with host scope can not have multiple nexthops");
goto err_inval;
- nh->nh_scope = RT_SCOPE_NOWHERE;
- nh->nh_dev = dev_get_by_index(fi->fib_nh->nh_oif);
+ }
+ if (nh->fib_nh_gw_family) {
+ NL_SET_ERR_MSG(extack,
+ "Route with host scope can not have a gateway");
+ goto err_inval;
+ }
+ nh->fib_nh_scope = RT_SCOPE_NOWHERE;
+ nh->fib_nh_dev = dev_get_by_index(net, nh->fib_nh_oif);
err = -ENODEV;
- if (nh->nh_dev == NULL)
+ if (!nh->fib_nh_dev)
goto failure;
+ netdev_tracker_alloc(nh->fib_nh_dev, &nh->fib_nh_dev_tracker,
+ GFP_KERNEL);
} else {
+ int linkdown = 0;
+
change_nexthops(fi) {
- if ((err = fib_check_nh(cfg, fi, nh)) != 0)
+ err = fib_check_nh(cfg->fc_nlinfo.nl_net, nexthop_nh,
+ cfg->fc_table, cfg->fc_scope,
+ extack);
+ if (err != 0)
goto failure;
+ if (nexthop_nh->fib_nh_flags & RTNH_F_LINKDOWN)
+ linkdown++;
} endfor_nexthops(fi)
+ if (linkdown == fi->fib_nhs)
+ fi->fib_flags |= RTNH_F_LINKDOWN;
}
- if (fi->fib_prefsrc) {
- if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst ||
- fi->fib_prefsrc != cfg->fc_dst)
- if (inet_addr_type(fi->fib_prefsrc) != RTN_LOCAL)
- goto err_inval;
+ if (fi->fib_prefsrc && !fib_valid_prefsrc(cfg, fi->fib_prefsrc)) {
+ NL_SET_ERR_MSG(extack, "Invalid prefsrc address");
+ goto err_inval;
+ }
+
+ if (!fi->nh) {
+ change_nexthops(fi) {
+ fib_info_update_nhc_saddr(net, &nexthop_nh->nh_common,
+ fi->fib_scope);
+ if (nexthop_nh->fib_nh_gw_family == AF_INET6)
+ fi->fib_nh_is_v6 = true;
+ } endfor_nexthops(fi)
+
+ fib_rebalance(fi);
}
link_it:
- if ((ofi = fib_find_info(fi)) != NULL) {
+ ofi = fib_find_info(fi);
+ if (ofi) {
+ /* fib_table_lookup() should not see @fi yet. */
fi->fib_dead = 1;
free_fib_info(fi);
- ofi->fib_treeref++;
+ refcount_inc(&ofi->fib_treeref);
return ofi;
}
- fi->fib_treeref++;
- atomic_inc(&fi->fib_clntref);
- spin_lock_bh(&fib_info_lock);
- hlist_add_head(&fi->fib_hash,
- &fib_info_hash[fib_info_hashfn(fi)]);
+ refcount_set(&fi->fib_treeref, 1);
+ refcount_set(&fi->fib_clntref, 1);
+
+ net->ipv4.fib_info_cnt++;
+ hlist_add_head(&fi->fib_hash, fib_info_hash_bucket(fi));
+
if (fi->fib_prefsrc) {
struct hlist_head *head;
- head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
+ head = fib_info_laddrhash_bucket(net, fi->fib_prefsrc);
hlist_add_head(&fi->fib_lhash, head);
}
- change_nexthops(fi) {
- struct hlist_head *head;
- unsigned int hash;
+ if (fi->nh) {
+ list_add(&fi->nh_list, &nh->fi_list);
+ } else {
+ change_nexthops(fi) {
+ struct hlist_head *head;
- if (!nh->nh_dev)
- continue;
- hash = fib_devindex_hashfn(nh->nh_dev->ifindex);
- head = &fib_info_devhash[hash];
- hlist_add_head(&nh->nh_hash, head);
- } endfor_nexthops(fi)
- spin_unlock_bh(&fib_info_lock);
+ if (!nexthop_nh->fib_nh_dev)
+ continue;
+ head = fib_nh_head(nexthop_nh->fib_nh_dev);
+ hlist_add_head_rcu(&nexthop_nh->nh_hash, head);
+ } endfor_nexthops(fi)
+ }
return fi;
err_inval:
err = -EINVAL;
failure:
- if (fi) {
+ if (fi) {
+ /* fib_table_lookup() should not see @fi yet. */
fi->fib_dead = 1;
free_fib_info(fi);
}
@@ -869,355 +1577,689 @@ failure:
return ERR_PTR(err);
}
-/* Note! fib_semantic_match intentionally uses RCU list functions. */
-int fib_semantic_match(struct list_head *head, const struct flowi *flp,
- struct fib_result *res, __be32 zone, __be32 mask,
- int prefixlen)
+int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nhc,
+ u8 rt_family, unsigned char *flags, bool skip_oif)
{
- struct fib_alias *fa;
- int nh_sel = 0;
+ if (nhc->nhc_flags & RTNH_F_DEAD)
+ *flags |= RTNH_F_DEAD;
+
+ if (nhc->nhc_flags & RTNH_F_LINKDOWN) {
+ *flags |= RTNH_F_LINKDOWN;
+
+ rcu_read_lock();
+ switch (nhc->nhc_family) {
+ case AF_INET:
+ if (ip_ignore_linkdown(nhc->nhc_dev))
+ *flags |= RTNH_F_DEAD;
+ break;
+ case AF_INET6:
+ if (ip6_ignore_linkdown(nhc->nhc_dev))
+ *flags |= RTNH_F_DEAD;
+ break;
+ }
+ rcu_read_unlock();
+ }
- list_for_each_entry_rcu(fa, head, fa_list) {
- int err;
+ switch (nhc->nhc_gw_family) {
+ case AF_INET:
+ if (nla_put_in_addr(skb, RTA_GATEWAY, nhc->nhc_gw.ipv4))
+ goto nla_put_failure;
+ break;
+ case AF_INET6:
+ /* if gateway family does not match nexthop family
+ * gateway is encoded as RTA_VIA
+ */
+ if (rt_family != nhc->nhc_gw_family) {
+ int alen = sizeof(struct in6_addr);
+ struct nlattr *nla;
+ struct rtvia *via;
+
+ nla = nla_reserve(skb, RTA_VIA, alen + 2);
+ if (!nla)
+ goto nla_put_failure;
- if (fa->fa_tos &&
- fa->fa_tos != flp->fl4_tos)
- continue;
+ via = nla_data(nla);
+ via->rtvia_family = AF_INET6;
+ memcpy(via->rtvia_addr, &nhc->nhc_gw.ipv6, alen);
+ } else if (nla_put_in6_addr(skb, RTA_GATEWAY,
+ &nhc->nhc_gw.ipv6) < 0) {
+ goto nla_put_failure;
+ }
+ break;
+ }
- if (fa->fa_scope < flp->fl4_scope)
- continue;
+ *flags |= (nhc->nhc_flags &
+ (RTNH_F_ONLINK | RTNH_F_OFFLOAD | RTNH_F_TRAP));
- fa->fa_state |= FA_S_ACCESSED;
+ if (!skip_oif && nhc->nhc_dev &&
+ nla_put_u32(skb, RTA_OIF, nhc->nhc_dev->ifindex))
+ goto nla_put_failure;
- err = fib_props[fa->fa_type].error;
- if (err == 0) {
- struct fib_info *fi = fa->fa_info;
+ if (lwtunnel_fill_encap(skb, nhc->nhc_lwtstate,
+ RTA_ENCAP, RTA_ENCAP_TYPE) < 0)
+ goto nla_put_failure;
- if (fi->fib_flags & RTNH_F_DEAD)
- continue;
+ return 0;
- switch (fa->fa_type) {
- case RTN_UNICAST:
- case RTN_LOCAL:
- case RTN_BROADCAST:
- case RTN_ANYCAST:
- case RTN_MULTICAST:
- for_nexthops(fi) {
- if (nh->nh_flags&RTNH_F_DEAD)
- continue;
- if (!flp->oif || flp->oif == nh->nh_oif)
- break;
- }
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
- if (nhsel < fi->fib_nhs) {
- nh_sel = nhsel;
- goto out_fill_res;
- }
-#else
- if (nhsel < 1) {
- goto out_fill_res;
- }
+nla_put_failure:
+ return -EMSGSIZE;
+}
+EXPORT_SYMBOL_GPL(fib_nexthop_info);
+
+#if IS_ENABLED(CONFIG_IP_ROUTE_MULTIPATH) || IS_ENABLED(CONFIG_IPV6)
+int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh_common *nhc,
+ int nh_weight, u8 rt_family, u32 nh_tclassid)
+{
+ const struct net_device *dev = nhc->nhc_dev;
+ struct rtnexthop *rtnh;
+ unsigned char flags = 0;
+
+ rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
+ if (!rtnh)
+ goto nla_put_failure;
+
+ rtnh->rtnh_hops = nh_weight - 1;
+ rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;
+
+ if (fib_nexthop_info(skb, nhc, rt_family, &flags, true) < 0)
+ goto nla_put_failure;
+
+ rtnh->rtnh_flags = flags;
+
+ if (nh_tclassid && nla_put_u32(skb, RTA_FLOW, nh_tclassid))
+ goto nla_put_failure;
+
+ /* length of rtnetlink header + attributes */
+ rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
+
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+EXPORT_SYMBOL_GPL(fib_add_nexthop);
#endif
- endfor_nexthops(fi);
- continue;
- default:
- printk(KERN_DEBUG "impossible 102\n");
- return -EINVAL;
- };
- }
- return err;
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi)
+{
+ struct nlattr *mp;
+
+ mp = nla_nest_start_noflag(skb, RTA_MULTIPATH);
+ if (!mp)
+ goto nla_put_failure;
+
+ if (unlikely(fi->nh)) {
+ if (nexthop_mpath_fill_node(skb, fi->nh, AF_INET) < 0)
+ goto nla_put_failure;
+ goto mp_end;
}
- return 1;
-out_fill_res:
- res->prefixlen = prefixlen;
- res->nh_sel = nh_sel;
- res->type = fa->fa_type;
- res->scope = fa->fa_scope;
- res->fi = fa->fa_info;
-#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
- res->netmask = mask;
- res->network = zone & inet_make_mask(prefixlen);
+ for_nexthops(fi) {
+ u32 nh_tclassid = 0;
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ nh_tclassid = nh->nh_tclassid;
#endif
- atomic_inc(&res->fi->fib_clntref);
- return 0;
-}
+ if (fib_add_nexthop(skb, &nh->nh_common, nh->fib_nh_weight,
+ AF_INET, nh_tclassid) < 0)
+ goto nla_put_failure;
+ } endfor_nexthops(fi);
-/* Find appropriate source address to this destination */
+mp_end:
+ nla_nest_end(skb, mp);
-__be32 __fib_res_prefsrc(struct fib_result *res)
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+#else
+static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi)
{
- return inet_select_addr(FIB_RES_DEV(*res), FIB_RES_GW(*res), res->scope);
+ return 0;
}
+#endif
-int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
- u32 tb_id, u8 type, u8 scope, __be32 dst, int dst_len, u8 tos,
- struct fib_info *fi, unsigned int flags)
+int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
+ const struct fib_rt_info *fri, unsigned int flags)
{
+ unsigned int nhs = fib_info_num_path(fri->fi);
+ struct fib_info *fi = fri->fi;
+ u32 tb_id = fri->tb_id;
struct nlmsghdr *nlh;
struct rtmsg *rtm;
- nlh = nlmsg_put(skb, pid, seq, event, sizeof(*rtm), flags);
- if (nlh == NULL)
- return -ENOBUFS;
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*rtm), flags);
+ if (!nlh)
+ return -EMSGSIZE;
rtm = nlmsg_data(nlh);
rtm->rtm_family = AF_INET;
- rtm->rtm_dst_len = dst_len;
+ rtm->rtm_dst_len = fri->dst_len;
rtm->rtm_src_len = 0;
- rtm->rtm_tos = tos;
- rtm->rtm_table = tb_id;
- NLA_PUT_U32(skb, RTA_TABLE, tb_id);
- rtm->rtm_type = type;
+ rtm->rtm_tos = inet_dscp_to_dsfield(fri->dscp);
+ if (tb_id < 256)
+ rtm->rtm_table = tb_id;
+ else
+ rtm->rtm_table = RT_TABLE_COMPAT;
+ if (nla_put_u32(skb, RTA_TABLE, tb_id))
+ goto nla_put_failure;
+ rtm->rtm_type = fri->type;
rtm->rtm_flags = fi->fib_flags;
- rtm->rtm_scope = scope;
+ rtm->rtm_scope = fi->fib_scope;
rtm->rtm_protocol = fi->fib_protocol;
- if (rtm->rtm_dst_len)
- NLA_PUT_BE32(skb, RTA_DST, dst);
-
- if (fi->fib_priority)
- NLA_PUT_U32(skb, RTA_PRIORITY, fi->fib_priority);
-
- if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
+ if (rtm->rtm_dst_len &&
+ nla_put_in_addr(skb, RTA_DST, fri->dst))
+ goto nla_put_failure;
+ if (fi->fib_priority &&
+ nla_put_u32(skb, RTA_PRIORITY, fi->fib_priority))
+ goto nla_put_failure;
+ if (rtnetlink_put_metrics(skb, fi->fib_metrics->metrics) < 0)
goto nla_put_failure;
- if (fi->fib_prefsrc)
- NLA_PUT_BE32(skb, RTA_PREFSRC, fi->fib_prefsrc);
-
- if (fi->fib_nhs == 1) {
- if (fi->fib_nh->nh_gw)
- NLA_PUT_BE32(skb, RTA_GATEWAY, fi->fib_nh->nh_gw);
+ if (fi->fib_prefsrc &&
+ nla_put_in_addr(skb, RTA_PREFSRC, fi->fib_prefsrc))
+ goto nla_put_failure;
- if (fi->fib_nh->nh_oif)
- NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif);
-#ifdef CONFIG_NET_CLS_ROUTE
- if (fi->fib_nh[0].nh_tclassid)
- NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid);
-#endif
+ if (fi->nh) {
+ if (nla_put_u32(skb, RTA_NH_ID, fi->nh->id))
+ goto nla_put_failure;
+ if (nexthop_is_blackhole(fi->nh))
+ rtm->rtm_type = RTN_BLACKHOLE;
+ if (!READ_ONCE(fi->fib_net->ipv4.sysctl_nexthop_compat_mode))
+ goto offload;
}
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
- if (fi->fib_nhs > 1) {
- struct rtnexthop *rtnh;
- struct nlattr *mp;
- mp = nla_nest_start(skb, RTA_MULTIPATH);
- if (mp == NULL)
- goto nla_put_failure;
+ if (nhs == 1) {
+ const struct fib_nh_common *nhc = fib_info_nhc(fi, 0);
+ unsigned char flags = 0;
- for_nexthops(fi) {
- rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
- if (rtnh == NULL)
- goto nla_put_failure;
+ if (fib_nexthop_info(skb, nhc, AF_INET, &flags, false) < 0)
+ goto nla_put_failure;
- rtnh->rtnh_flags = nh->nh_flags & 0xFF;
- rtnh->rtnh_hops = nh->nh_weight - 1;
- rtnh->rtnh_ifindex = nh->nh_oif;
+ rtm->rtm_flags = flags;
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ if (nhc->nhc_family == AF_INET) {
+ struct fib_nh *nh;
- if (nh->nh_gw)
- NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw);
-#ifdef CONFIG_NET_CLS_ROUTE
- if (nh->nh_tclassid)
- NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid);
+ nh = container_of(nhc, struct fib_nh, nh_common);
+ if (nh->nh_tclassid &&
+ nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid))
+ goto nla_put_failure;
+ }
#endif
- /* length of rtnetlink header + attributes */
- rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh;
- } endfor_nexthops(fi);
-
- nla_nest_end(skb, mp);
+ } else {
+ if (fib_add_multipath(skb, fi) < 0)
+ goto nla_put_failure;
}
-#endif
- return nlmsg_end(skb, nlh);
+
+offload:
+ if (fri->offload)
+ rtm->rtm_flags |= RTM_F_OFFLOAD;
+ if (fri->trap)
+ rtm->rtm_flags |= RTM_F_TRAP;
+ if (fri->offload_failed)
+ rtm->rtm_flags |= RTM_F_OFFLOAD_FAILED;
+
+ nlmsg_end(skb, nlh);
+ return 0;
nla_put_failure:
- return nlmsg_cancel(skb, nlh);
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
}
/*
- Update FIB if:
- - local address disappeared -> we must delete all the entries
- referring to it.
- - device went down -> we must shutdown all nexthops going via it.
+ * Update FIB if:
+ * - local address disappeared -> we must delete all the entries
+ * referring to it.
+ * - device went down -> we must shutdown all nexthops going via it.
*/
-
-int fib_sync_down(__be32 local, struct net_device *dev, int force)
+int fib_sync_down_addr(struct net_device *dev, __be32 local)
{
+ int tb_id = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
+ struct net *net = dev_net(dev);
+ struct hlist_head *head;
+ struct fib_info *fi;
int ret = 0;
- int scope = RT_SCOPE_NOWHERE;
-
- if (force)
- scope = -1;
- if (local && fib_info_laddrhash) {
- unsigned int hash = fib_laddr_hashfn(local);
- struct hlist_head *head = &fib_info_laddrhash[hash];
- struct hlist_node *node;
- struct fib_info *fi;
+ if (!local)
+ return 0;
- hlist_for_each_entry(fi, node, head, fib_lhash) {
- if (fi->fib_prefsrc == local) {
- fi->fib_flags |= RTNH_F_DEAD;
- ret++;
- }
+ head = fib_info_laddrhash_bucket(net, local);
+ hlist_for_each_entry(fi, head, fib_lhash) {
+ if (!net_eq(fi->fib_net, net) ||
+ fi->fib_tb_id != tb_id)
+ continue;
+ if (fi->fib_prefsrc == local) {
+ fi->fib_flags |= RTNH_F_DEAD;
+ fi->pfsrc_removed = true;
+ ret++;
}
}
+ return ret;
+}
- if (dev) {
- struct fib_info *prev_fi = NULL;
- unsigned int hash = fib_devindex_hashfn(dev->ifindex);
- struct hlist_head *head = &fib_info_devhash[hash];
- struct hlist_node *node;
- struct fib_nh *nh;
+static int call_fib_nh_notifiers(struct fib_nh *nh,
+ enum fib_event_type event_type)
+{
+ bool ignore_link_down = ip_ignore_linkdown(nh->fib_nh_dev);
+ struct fib_nh_notifier_info info = {
+ .fib_nh = nh,
+ };
+
+ switch (event_type) {
+ case FIB_EVENT_NH_ADD:
+ if (nh->fib_nh_flags & RTNH_F_DEAD)
+ break;
+ if (ignore_link_down && nh->fib_nh_flags & RTNH_F_LINKDOWN)
+ break;
+ return call_fib4_notifiers(dev_net(nh->fib_nh_dev), event_type,
+ &info.info);
+ case FIB_EVENT_NH_DEL:
+ if ((ignore_link_down && nh->fib_nh_flags & RTNH_F_LINKDOWN) ||
+ (nh->fib_nh_flags & RTNH_F_DEAD))
+ return call_fib4_notifiers(dev_net(nh->fib_nh_dev),
+ event_type, &info.info);
+ break;
+ default:
+ break;
+ }
- hlist_for_each_entry(nh, node, head, nh_hash) {
- struct fib_info *fi = nh->nh_parent;
- int dead;
+ return NOTIFY_DONE;
+}
- BUG_ON(!fi->fib_nhs);
- if (nh->nh_dev != dev || fi == prev_fi)
- continue;
- prev_fi = fi;
- dead = 0;
- change_nexthops(fi) {
- if (nh->nh_flags&RTNH_F_DEAD)
- dead++;
- else if (nh->nh_dev == dev &&
- nh->nh_scope != scope) {
- nh->nh_flags |= RTNH_F_DEAD;
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
- spin_lock_bh(&fib_multipath_lock);
- fi->fib_power -= nh->nh_power;
- nh->nh_power = 0;
- spin_unlock_bh(&fib_multipath_lock);
-#endif
- dead++;
+/* Update the PMTU of exceptions when:
+ * - the new MTU of the first hop becomes smaller than the PMTU
+ * - the old MTU was the same as the PMTU, and it limited discovery of
+ * larger MTUs on the path. With that limit raised, we can now
+ * discover larger MTUs
+ * A special case is locked exceptions, for which the PMTU is smaller
+ * than the minimal accepted PMTU:
+ * - if the new MTU is greater than the PMTU, don't make any change
+ * - otherwise, unlock and set PMTU
+ */
+void fib_nhc_update_mtu(struct fib_nh_common *nhc, u32 new, u32 orig)
+{
+ struct fnhe_hash_bucket *bucket;
+ int i;
+
+ bucket = rcu_dereference_protected(nhc->nhc_exceptions, 1);
+ if (!bucket)
+ return;
+
+ for (i = 0; i < FNHE_HASH_SIZE; i++) {
+ struct fib_nh_exception *fnhe;
+
+ for (fnhe = rcu_dereference_protected(bucket[i].chain, 1);
+ fnhe;
+ fnhe = rcu_dereference_protected(fnhe->fnhe_next, 1)) {
+ if (fnhe->fnhe_mtu_locked) {
+ if (new <= fnhe->fnhe_pmtu) {
+ fnhe->fnhe_pmtu = new;
+ fnhe->fnhe_mtu_locked = false;
}
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
- if (force > 1 && nh->nh_dev == dev) {
- dead = fi->fib_nhs;
+ } else if (new < fnhe->fnhe_pmtu ||
+ orig == fnhe->fnhe_pmtu) {
+ fnhe->fnhe_pmtu = new;
+ }
+ }
+ }
+}
+
+void fib_sync_mtu(struct net_device *dev, u32 orig_mtu)
+{
+ struct hlist_head *head = fib_nh_head(dev);
+ struct fib_nh *nh;
+
+ hlist_for_each_entry(nh, head, nh_hash) {
+ DEBUG_NET_WARN_ON_ONCE(nh->fib_nh_dev != dev);
+ fib_nhc_update_mtu(&nh->nh_common, dev->mtu, orig_mtu);
+ }
+}
+
+/* Event force Flags Description
+ * NETDEV_CHANGE 0 LINKDOWN Carrier OFF, not for scope host
+ * NETDEV_DOWN 0 LINKDOWN|DEAD Link down, not for scope host
+ * NETDEV_DOWN 1 LINKDOWN|DEAD Last address removed
+ * NETDEV_UNREGISTER 1 LINKDOWN|DEAD Device removed
+ *
+ * only used when fib_nh is built into fib_info
+ */
+int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force)
+{
+ struct hlist_head *head = fib_nh_head(dev);
+ struct fib_info *prev_fi = NULL;
+ int scope = RT_SCOPE_NOWHERE;
+ struct fib_nh *nh;
+ int ret = 0;
+
+ if (force)
+ scope = -1;
+
+ hlist_for_each_entry(nh, head, nh_hash) {
+ struct fib_info *fi = nh->nh_parent;
+ int dead;
+
+ BUG_ON(!fi->fib_nhs);
+ DEBUG_NET_WARN_ON_ONCE(nh->fib_nh_dev != dev);
+ if (fi == prev_fi)
+ continue;
+ prev_fi = fi;
+ dead = 0;
+ change_nexthops(fi) {
+ if (nexthop_nh->fib_nh_flags & RTNH_F_DEAD)
+ dead++;
+ else if (nexthop_nh->fib_nh_dev == dev &&
+ nexthop_nh->fib_nh_scope != scope) {
+ switch (event) {
+ case NETDEV_DOWN:
+ case NETDEV_UNREGISTER:
+ nexthop_nh->fib_nh_flags |= RTNH_F_DEAD;
+ fallthrough;
+ case NETDEV_CHANGE:
+ nexthop_nh->fib_nh_flags |= RTNH_F_LINKDOWN;
break;
}
+ call_fib_nh_notifiers(nexthop_nh,
+ FIB_EVENT_NH_DEL);
+ dead++;
+ }
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ if (event == NETDEV_UNREGISTER &&
+ nexthop_nh->fib_nh_dev == dev) {
+ dead = fi->fib_nhs;
+ break;
+ }
#endif
- } endfor_nexthops(fi)
- if (dead == fi->fib_nhs) {
+ } endfor_nexthops(fi)
+ if (dead == fi->fib_nhs) {
+ switch (event) {
+ case NETDEV_DOWN:
+ case NETDEV_UNREGISTER:
fi->fib_flags |= RTNH_F_DEAD;
- ret++;
+ fallthrough;
+ case NETDEV_CHANGE:
+ fi->fib_flags |= RTNH_F_LINKDOWN;
+ break;
}
+ ret++;
}
+
+ fib_rebalance(fi);
}
return ret;
}
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
+/* Must be invoked inside of an RCU protected region. */
+static void fib_select_default(const struct flowi4 *flp, struct fib_result *res)
+{
+ struct fib_info *fi = NULL, *last_resort = NULL;
+ struct hlist_head *fa_head = res->fa_head;
+ struct fib_table *tb = res->table;
+ u8 slen = 32 - res->prefixlen;
+ int order = -1, last_idx = -1;
+ struct fib_alias *fa, *fa1 = NULL;
+ u32 last_prio = res->fi->fib_priority;
+ dscp_t last_dscp = 0;
+
+ hlist_for_each_entry_rcu(fa, fa_head, fa_list) {
+ struct fib_info *next_fi = fa->fa_info;
+ struct fib_nh_common *nhc;
+
+ if (fa->fa_slen != slen)
+ continue;
+ if (fa->fa_dscp && !fib_dscp_masked_match(fa->fa_dscp, flp))
+ continue;
+ if (fa->tb_id != tb->tb_id)
+ continue;
+ if (next_fi->fib_priority > last_prio &&
+ fa->fa_dscp == last_dscp) {
+ if (last_dscp)
+ continue;
+ break;
+ }
+ if (next_fi->fib_flags & RTNH_F_DEAD)
+ continue;
+ last_dscp = fa->fa_dscp;
+ last_prio = next_fi->fib_priority;
+
+ if (next_fi->fib_scope != res->scope ||
+ fa->fa_type != RTN_UNICAST)
+ continue;
+
+ nhc = fib_info_nhc(next_fi, 0);
+ if (!nhc->nhc_gw_family || nhc->nhc_scope != RT_SCOPE_LINK)
+ continue;
+
+ fib_alias_accessed(fa);
+
+ if (!fi) {
+ if (next_fi != res->fi)
+ break;
+ fa1 = fa;
+ } else if (!fib_detect_death(fi, order, &last_resort,
+ &last_idx, fa1->fa_default)) {
+ fib_result_assign(res, fi);
+ fa1->fa_default = order;
+ goto out;
+ }
+ fi = next_fi;
+ order++;
+ }
+
+ if (order <= 0 || !fi) {
+ if (fa1)
+ fa1->fa_default = -1;
+ goto out;
+ }
+
+ if (!fib_detect_death(fi, order, &last_resort, &last_idx,
+ fa1->fa_default)) {
+ fib_result_assign(res, fi);
+ fa1->fa_default = order;
+ goto out;
+ }
+
+ if (last_idx >= 0)
+ fib_result_assign(res, last_resort);
+ fa1->fa_default = last_idx;
+out:
+ return;
+}
/*
- Dead device goes up. We wake up dead nexthops.
- It takes sense only on multipath routes.
+ * Dead device goes up. We wake up dead nexthops.
+ * It takes sense only on multipath routes.
+ *
+ * only used when fib_nh is built into fib_info
*/
-
-int fib_sync_up(struct net_device *dev)
+int fib_sync_up(struct net_device *dev, unsigned char nh_flags)
{
struct fib_info *prev_fi;
- unsigned int hash;
struct hlist_head *head;
- struct hlist_node *node;
struct fib_nh *nh;
int ret;
- if (!(dev->flags&IFF_UP))
+ if (!(dev->flags & IFF_UP))
return 0;
+ if (nh_flags & RTNH_F_DEAD) {
+ unsigned int flags = netif_get_flags(dev);
+
+ if (flags & (IFF_RUNNING | IFF_LOWER_UP))
+ nh_flags |= RTNH_F_LINKDOWN;
+ }
+
prev_fi = NULL;
- hash = fib_devindex_hashfn(dev->ifindex);
- head = &fib_info_devhash[hash];
+ head = fib_nh_head(dev);
ret = 0;
- hlist_for_each_entry(nh, node, head, nh_hash) {
+ hlist_for_each_entry(nh, head, nh_hash) {
struct fib_info *fi = nh->nh_parent;
int alive;
BUG_ON(!fi->fib_nhs);
- if (nh->nh_dev != dev || fi == prev_fi)
+ DEBUG_NET_WARN_ON_ONCE(nh->fib_nh_dev != dev);
+ if (fi == prev_fi)
continue;
prev_fi = fi;
alive = 0;
change_nexthops(fi) {
- if (!(nh->nh_flags&RTNH_F_DEAD)) {
+ if (!(nexthop_nh->fib_nh_flags & nh_flags)) {
alive++;
continue;
}
- if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP))
+ if (!nexthop_nh->fib_nh_dev ||
+ !(nexthop_nh->fib_nh_dev->flags & IFF_UP))
continue;
- if (nh->nh_dev != dev || !__in_dev_get_rtnl(dev))
+ if (nexthop_nh->fib_nh_dev != dev ||
+ !__in_dev_get_rtnl(dev))
continue;
alive++;
- spin_lock_bh(&fib_multipath_lock);
- nh->nh_power = 0;
- nh->nh_flags &= ~RTNH_F_DEAD;
- spin_unlock_bh(&fib_multipath_lock);
+ nexthop_nh->fib_nh_flags &= ~nh_flags;
+ call_fib_nh_notifiers(nexthop_nh, FIB_EVENT_NH_ADD);
} endfor_nexthops(fi)
if (alive > 0) {
- fi->fib_flags &= ~RTNH_F_DEAD;
+ fi->fib_flags &= ~nh_flags;
ret++;
}
+
+ fib_rebalance(fi);
}
return ret;
}
-/*
- The algorithm is suboptimal, but it provides really
- fair weighted route distribution.
- */
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+static bool fib_good_nh(const struct fib_nh *nh)
+{
+ int state = NUD_REACHABLE;
+
+ if (nh->fib_nh_scope == RT_SCOPE_LINK) {
+ struct neighbour *n;
+
+ rcu_read_lock();
+
+ if (likely(nh->fib_nh_gw_family == AF_INET))
+ n = __ipv4_neigh_lookup_noref(nh->fib_nh_dev,
+ (__force u32)nh->fib_nh_gw4);
+ else if (nh->fib_nh_gw_family == AF_INET6)
+ n = __ipv6_neigh_lookup_noref_stub(nh->fib_nh_dev,
+ &nh->fib_nh_gw6);
+ else
+ n = NULL;
+ if (n)
+ state = READ_ONCE(n->nud_state);
-void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
+ rcu_read_unlock();
+ }
+
+ return !!(state & NUD_VALID);
+}
+
+void fib_select_multipath(struct fib_result *res, int hash,
+ const struct flowi4 *fl4)
{
struct fib_info *fi = res->fi;
- int w;
+ struct net *net = fi->fib_net;
+ bool found = false;
+ bool use_neigh;
+ __be32 saddr;
- spin_lock_bh(&fib_multipath_lock);
- if (fi->fib_power <= 0) {
- int power = 0;
- change_nexthops(fi) {
- if (!(nh->nh_flags&RTNH_F_DEAD)) {
- power += nh->nh_weight;
- nh->nh_power = nh->nh_weight;
- }
- } endfor_nexthops(fi);
- fi->fib_power = power;
- if (power <= 0) {
- spin_unlock_bh(&fib_multipath_lock);
- /* Race condition: route has just become dead. */
- res->nh_sel = 0;
- return;
- }
+ if (unlikely(res->fi->nh)) {
+ nexthop_path_fib_result(res, hash);
+ return;
}
+ use_neigh = READ_ONCE(net->ipv4.sysctl_fib_multipath_use_neigh);
+ saddr = fl4 ? fl4->saddr : 0;
- /* w should be random number [0..fi->fib_power-1],
- it is pretty bad approximation.
- */
+ change_nexthops(fi) {
+ int nh_upper_bound;
+
+ /* Nexthops without a carrier are assigned an upper bound of
+ * minus one when "ignore_routes_with_linkdown" is set.
+ */
+ nh_upper_bound = atomic_read(&nexthop_nh->fib_nh_upper_bound);
+ if (nh_upper_bound == -1 ||
+ (use_neigh && !fib_good_nh(nexthop_nh)))
+ continue;
- w = jiffies % fi->fib_power;
+ if (!found) {
+ res->nh_sel = nhsel;
+ res->nhc = &nexthop_nh->nh_common;
+ found = !saddr || nexthop_nh->nh_saddr == saddr;
+ }
- change_nexthops(fi) {
- if (!(nh->nh_flags&RTNH_F_DEAD) && nh->nh_power) {
- if ((w -= nh->nh_power) <= 0) {
- nh->nh_power--;
- fi->fib_power--;
- res->nh_sel = nhsel;
- spin_unlock_bh(&fib_multipath_lock);
- return;
- }
+ if (hash > nh_upper_bound)
+ continue;
+
+ if (!saddr || nexthop_nh->nh_saddr == saddr) {
+ res->nh_sel = nhsel;
+ res->nhc = &nexthop_nh->nh_common;
+ return;
}
- } endfor_nexthops(fi);
- /* Race condition: route has just become dead. */
- res->nh_sel = 0;
- spin_unlock_bh(&fib_multipath_lock);
+ if (found)
+ return;
+
+ } endfor_nexthops(fi);
}
#endif
+
+void fib_select_path(struct net *net, struct fib_result *res,
+ struct flowi4 *fl4, const struct sk_buff *skb)
+{
+ if (fl4->flowi4_oif)
+ goto check_saddr;
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ if (fib_info_num_path(res->fi) > 1) {
+ int h = fib_multipath_hash(net, fl4, skb, NULL);
+
+ fib_select_multipath(res, h, fl4);
+ }
+ else
+#endif
+ if (!res->prefixlen &&
+ res->table->tb_num_default > 1 &&
+ res->type == RTN_UNICAST)
+ fib_select_default(fl4, res);
+
+check_saddr:
+ if (!fl4->saddr) {
+ struct net_device *l3mdev;
+
+ l3mdev = dev_get_by_index_rcu(net, fl4->flowi4_l3mdev);
+
+ if (!l3mdev ||
+ l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) == l3mdev)
+ fl4->saddr = fib_result_prefsrc(net, res);
+ else
+ fl4->saddr = inet_select_addr(l3mdev, 0, RT_SCOPE_LINK);
+ }
+}
+
+int __net_init fib4_semantics_init(struct net *net)
+{
+ unsigned int hash_bits = 4;
+
+ net->ipv4.fib_info_hash = fib_info_hash_alloc(hash_bits);
+ if (!net->ipv4.fib_info_hash)
+ return -ENOMEM;
+
+ net->ipv4.fib_info_hash_bits = hash_bits;
+ net->ipv4.fib_info_cnt = 0;
+
+ return 0;
+}
+
+void __net_exit fib4_semantics_exit(struct net *net)
+{
+ fib_info_hash_free(net->ipv4.fib_info_hash);
+}
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index d17990ec724f..59a6f0a9638f 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1,47 +1,33 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*
* Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
* & Swedish University of Agricultural Sciences.
*
- * Jens Laas <jens.laas@data.slu.se> Swedish University of
+ * Jens Laas <jens.laas@data.slu.se> Swedish University of
* Agricultural Sciences.
- *
+ *
* Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
*
- * This work is based on the LPC-trie which is originally descibed in:
- *
+ * This work is based on the LPC-trie which is originally described in:
+ *
* An experimental study of compression methods for dynamic tries
* Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
- * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
- *
+ * https://www.csc.kth.se/~snilsson/software/dyntrie2/
*
* IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
* IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
*
- * Version: $Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $
- *
- *
* Code from fib_hash has been reused which includes the following header:
*
- *
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
* interface as the means of communication with the user level.
*
* IPv4 FIB: lookup engine and maintenance routines.
*
- *
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Substantial contributions to this work comes from:
*
* David S. Miller, <davem@davemloft.net>
@@ -49,15 +35,11 @@
* Paul E. McKenney <paulmck@us.ibm.com>
* Patrick McHardy <kaber@trash.net>
*/
-
-#define VERSION "0.407"
-
-#include <asm/uaccess.h>
-#include <asm/system.h>
-#include <asm/bitops.h>
+#include <linux/cache.h>
+#include <linux/uaccess.h>
+#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
-#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
@@ -70,72 +52,97 @@
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
+#include <linux/rcupdate_wait.h>
#include <linux/skbuff.h>
#include <linux/netlink.h>
#include <linux/init.h>
#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/vmalloc.h>
+#include <linux/notifier.h>
+#include <net/net_namespace.h>
+#include <net/inet_dscp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <net/ip_fib.h>
+#include <net/fib_notifier.h>
+#include <trace/events/fib.h>
#include "fib_lookup.h"
-#undef CONFIG_IP_FIB_TRIE_STATS
+static int call_fib_entry_notifier(struct notifier_block *nb,
+ enum fib_event_type event_type, u32 dst,
+ int dst_len, struct fib_alias *fa,
+ struct netlink_ext_ack *extack)
+{
+ struct fib_entry_notifier_info info = {
+ .info.extack = extack,
+ .dst = dst,
+ .dst_len = dst_len,
+ .fi = fa->fa_info,
+ .dscp = fa->fa_dscp,
+ .type = fa->fa_type,
+ .tb_id = fa->tb_id,
+ };
+ return call_fib4_notifier(nb, event_type, &info.info);
+}
+
+static int call_fib_entry_notifiers(struct net *net,
+ enum fib_event_type event_type, u32 dst,
+ int dst_len, struct fib_alias *fa,
+ struct netlink_ext_ack *extack)
+{
+ struct fib_entry_notifier_info info = {
+ .info.extack = extack,
+ .dst = dst,
+ .dst_len = dst_len,
+ .fi = fa->fa_info,
+ .dscp = fa->fa_dscp,
+ .type = fa->fa_type,
+ .tb_id = fa->tb_id,
+ };
+ return call_fib4_notifiers(net, event_type, &info.info);
+}
+
#define MAX_STAT_DEPTH 32
-#define KEYLENGTH (8*sizeof(t_key))
-#define MASK_PFX(k, l) (((l)==0)?0:(k >> (KEYLENGTH-l)) << (KEYLENGTH-l))
-#define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH - bits) >> offset))
+#define KEYLENGTH (8*sizeof(t_key))
+#define KEY_MAX ((t_key)~0)
typedef unsigned int t_key;
-#define T_TNODE 0
-#define T_LEAF 1
-#define NODE_TYPE_MASK 0x1UL
-#define NODE_PARENT(node) \
- ((struct tnode *)rcu_dereference(((node)->parent & ~NODE_TYPE_MASK)))
-
-#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
+#define IS_TRIE(n) ((n)->pos >= KEYLENGTH)
+#define IS_TNODE(n) ((n)->bits)
+#define IS_LEAF(n) (!(n)->bits)
-#define NODE_SET_PARENT(node, ptr) \
- rcu_assign_pointer((node)->parent, \
- ((unsigned long)(ptr)) | NODE_TYPE(node))
-
-#define IS_TNODE(n) (!(n->parent & T_LEAF))
-#define IS_LEAF(n) (n->parent & T_LEAF)
-
-struct node {
+struct key_vector {
t_key key;
- unsigned long parent;
-};
-
-struct leaf {
- t_key key;
- unsigned long parent;
- struct hlist_head list;
- struct rcu_head rcu;
-};
-
-struct leaf_info {
- struct hlist_node hlist;
- struct rcu_head rcu;
- int plen;
- struct list_head falh;
+ unsigned char pos; /* 2log(KEYLENGTH) bits needed */
+ unsigned char bits; /* 2log(KEYLENGTH) bits needed */
+ unsigned char slen;
+ union {
+ /* This list pointer if valid if (pos | bits) == 0 (LEAF) */
+ struct hlist_head leaf;
+ /* This array is valid if (pos | bits) > 0 (TNODE) */
+ DECLARE_FLEX_ARRAY(struct key_vector __rcu *, tnode);
+ };
};
struct tnode {
- t_key key;
- unsigned long parent;
- unsigned short pos:5; /* 2log(KEYLENGTH) bits needed */
- unsigned short bits:5; /* 2log(KEYLENGTH) bits needed */
- unsigned short full_children; /* KEYLENGTH bits needed */
- unsigned short empty_children; /* KEYLENGTH bits needed */
struct rcu_head rcu;
- struct node *child[0];
+ t_key empty_children; /* KEYLENGTH bits needed */
+ t_key full_children; /* KEYLENGTH bits needed */
+ struct key_vector __rcu *parent;
+ struct key_vector kv[1];
+#define tn_bits kv[0].bits
};
+#define TNODE_SIZE(n) offsetof(struct tnode, kv[0].tnode[n])
+#define LEAF_SIZE TNODE_SIZE(1)
+
#ifdef CONFIG_IP_FIB_TRIE_STATS
struct trie_use_stats {
unsigned int gets;
@@ -153,1058 +160,1160 @@ struct trie_stat {
unsigned int tnodes;
unsigned int leaves;
unsigned int nullpointers;
+ unsigned int prefixes;
unsigned int nodesizes[MAX_STAT_DEPTH];
};
struct trie {
- struct node *trie;
+ struct key_vector kv[1];
#ifdef CONFIG_IP_FIB_TRIE_STATS
- struct trie_use_stats stats;
+ struct trie_use_stats __percpu *stats;
#endif
- int size;
- unsigned int revision;
};
-static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
-static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
-static struct node *resize(struct trie *t, struct tnode *tn);
-static struct tnode *inflate(struct trie *t, struct tnode *tn);
-static struct tnode *halve(struct trie *t, struct tnode *tn);
-static void tnode_free(struct tnode *tn);
-
-static kmem_cache_t *fn_alias_kmem __read_mostly;
-static struct trie *trie_local = NULL, *trie_main = NULL;
+static struct key_vector *resize(struct trie *t, struct key_vector *tn);
+static unsigned int tnode_free_size;
+/*
+ * synchronize_rcu after call_rcu for outstanding dirty memory; it should be
+ * especially useful before resizing the root node with PREEMPT_NONE configs;
+ * the value was obtained experimentally, aiming to avoid visible slowdown.
+ */
+unsigned int sysctl_fib_sync_mem = 512 * 1024;
+unsigned int sysctl_fib_sync_mem_min = 64 * 1024;
+unsigned int sysctl_fib_sync_mem_max = 64 * 1024 * 1024;
-/* rcu_read_lock needs to be hold by caller from readside */
+static struct kmem_cache *fn_alias_kmem __ro_after_init;
+static struct kmem_cache *trie_leaf_kmem __ro_after_init;
-static inline struct node *tnode_get_child(struct tnode *tn, int i)
+static inline struct tnode *tn_info(struct key_vector *kv)
{
- BUG_ON(i >= 1 << tn->bits);
-
- return rcu_dereference(tn->child[i]);
+ return container_of(kv, struct tnode, kv[0]);
}
-static inline int tnode_child_length(const struct tnode *tn)
-{
- return 1 << tn->bits;
-}
+/* caller must hold RTNL */
+#define node_parent(tn) rtnl_dereference(tn_info(tn)->parent)
+#define get_child(tn, i) rtnl_dereference((tn)->tnode[i])
-static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
-{
- if (offset < KEYLENGTH)
- return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
- else
- return 0;
-}
+/* caller must hold RCU read lock or RTNL */
+#define node_parent_rcu(tn) rcu_dereference_rtnl(tn_info(tn)->parent)
+#define get_child_rcu(tn, i) rcu_dereference_rtnl((tn)->tnode[i])
-static inline int tkey_equals(t_key a, t_key b)
+/* wrapper for rcu_assign_pointer */
+static inline void node_set_parent(struct key_vector *n, struct key_vector *tp)
{
- return a == b;
+ if (n)
+ rcu_assign_pointer(tn_info(n)->parent, tp);
}
-static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
-{
- if (bits == 0 || offset >= KEYLENGTH)
- return 1;
- bits = bits > KEYLENGTH ? KEYLENGTH : bits;
- return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
-}
+#define NODE_INIT_PARENT(n, p) RCU_INIT_POINTER(tn_info(n)->parent, p)
-static inline int tkey_mismatch(t_key a, int offset, t_key b)
+/* This provides us with the number of children in this node, in the case of a
+ * leaf this will return 0 meaning none of the children are accessible.
+ */
+static inline unsigned long child_length(const struct key_vector *tn)
{
- t_key diff = a ^ b;
- int i = offset;
-
- if (!diff)
- return 0;
- while ((diff << i) >> (KEYLENGTH-1) == 0)
- i++;
- return i;
+ return (1ul << tn->bits) & ~(1ul);
}
-/*
- To understand this stuff, an understanding of keys and all their bits is
- necessary. Every node in the trie has a key associated with it, but not
- all of the bits in that key are significant.
-
- Consider a node 'n' and its parent 'tp'.
-
- If n is a leaf, every bit in its key is significant. Its presence is
- necessitated by path compression, since during a tree traversal (when
- searching for a leaf - unless we are doing an insertion) we will completely
- ignore all skipped bits we encounter. Thus we need to verify, at the end of
- a potentially successful search, that we have indeed been walking the
- correct key path.
-
- Note that we can never "miss" the correct key in the tree if present by
- following the wrong path. Path compression ensures that segments of the key
- that are the same for all keys with a given prefix are skipped, but the
- skipped part *is* identical for each node in the subtrie below the skipped
- bit! trie_insert() in this implementation takes care of that - note the
- call to tkey_sub_equals() in trie_insert().
-
- if n is an internal node - a 'tnode' here, the various parts of its key
- have many different meanings.
+#define get_cindex(key, kv) (((key) ^ (kv)->key) >> (kv)->pos)
- Example:
- _________________________________________________________________
- | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
- -----------------------------------------------------------------
- 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+static inline unsigned long get_index(t_key key, struct key_vector *kv)
+{
+ unsigned long index = key ^ kv->key;
- _________________________________________________________________
- | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
- -----------------------------------------------------------------
- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
+ if ((BITS_PER_LONG <= KEYLENGTH) && (KEYLENGTH == kv->pos))
+ return 0;
- tp->pos = 7
- tp->bits = 3
- n->pos = 15
- n->bits = 4
+ return index >> kv->pos;
+}
- First, let's just ignore the bits that come before the parent tp, that is
- the bits from 0 to (tp->pos-1). They are *known* but at this point we do
- not use them for anything.
+/* To understand this stuff, an understanding of keys and all their bits is
+ * necessary. Every node in the trie has a key associated with it, but not
+ * all of the bits in that key are significant.
+ *
+ * Consider a node 'n' and its parent 'tp'.
+ *
+ * If n is a leaf, every bit in its key is significant. Its presence is
+ * necessitated by path compression, since during a tree traversal (when
+ * searching for a leaf - unless we are doing an insertion) we will completely
+ * ignore all skipped bits we encounter. Thus we need to verify, at the end of
+ * a potentially successful search, that we have indeed been walking the
+ * correct key path.
+ *
+ * Note that we can never "miss" the correct key in the tree if present by
+ * following the wrong path. Path compression ensures that segments of the key
+ * that are the same for all keys with a given prefix are skipped, but the
+ * skipped part *is* identical for each node in the subtrie below the skipped
+ * bit! trie_insert() in this implementation takes care of that.
+ *
+ * if n is an internal node - a 'tnode' here, the various parts of its key
+ * have many different meanings.
+ *
+ * Example:
+ * _________________________________________________________________
+ * | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
+ * -----------------------------------------------------------------
+ * 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16
+ *
+ * _________________________________________________________________
+ * | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
+ * -----------------------------------------------------------------
+ * 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
+ *
+ * tp->pos = 22
+ * tp->bits = 3
+ * n->pos = 13
+ * n->bits = 4
+ *
+ * First, let's just ignore the bits that come before the parent tp, that is
+ * the bits from (tp->pos + tp->bits) to 31. They are *known* but at this
+ * point we do not use them for anything.
+ *
+ * The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
+ * index into the parent's child array. That is, they will be used to find
+ * 'n' among tp's children.
+ *
+ * The bits from (n->pos + n->bits) to (tp->pos - 1) - "S" - are skipped bits
+ * for the node n.
+ *
+ * All the bits we have seen so far are significant to the node n. The rest
+ * of the bits are really not needed or indeed known in n->key.
+ *
+ * The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
+ * n's child array, and will of course be different for each child.
+ *
+ * The rest of the bits, from 0 to (n->pos -1) - "u" - are completely unknown
+ * at this point.
+ */
- The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
- index into the parent's child array. That is, they will be used to find
- 'n' among tp's children.
+static const int halve_threshold = 25;
+static const int inflate_threshold = 50;
+static const int halve_threshold_root = 15;
+static const int inflate_threshold_root = 30;
- The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
- for the node n.
+static inline void alias_free_mem_rcu(struct fib_alias *fa)
+{
+ kfree_rcu(fa, rcu);
+}
- All the bits we have seen so far are significant to the node n. The rest
- of the bits are really not needed or indeed known in n->key.
+#define TNODE_VMALLOC_MAX \
+ ilog2((SIZE_MAX - TNODE_SIZE(0)) / sizeof(struct key_vector *))
- The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
- n's child array, and will of course be different for each child.
-
+static void __node_free_rcu(struct rcu_head *head)
+{
+ struct tnode *n = container_of(head, struct tnode, rcu);
- The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
- at this point.
+ if (!n->tn_bits)
+ kmem_cache_free(trie_leaf_kmem, n);
+ else
+ kvfree(n);
+}
-*/
+#define node_free(n) call_rcu(&tn_info(n)->rcu, __node_free_rcu)
-static inline void check_tnode(const struct tnode *tn)
+static struct tnode *tnode_alloc(int bits)
{
- WARN_ON(tn && tn->pos+tn->bits > 32);
-}
+ size_t size;
-static int halve_threshold = 25;
-static int inflate_threshold = 50;
-static int halve_threshold_root = 15;
-static int inflate_threshold_root = 25;
+ /* verify bits is within bounds */
+ if (bits > TNODE_VMALLOC_MAX)
+ return NULL;
+ /* determine size and verify it is non-zero and didn't overflow */
+ size = TNODE_SIZE(1ul << bits);
-static void __alias_free_mem(struct rcu_head *head)
-{
- struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
- kmem_cache_free(fn_alias_kmem, fa);
+ if (size <= PAGE_SIZE)
+ return kzalloc(size, GFP_KERNEL);
+ else
+ return vzalloc(size);
}
-static inline void alias_free_mem_rcu(struct fib_alias *fa)
+static inline void empty_child_inc(struct key_vector *n)
{
- call_rcu(&fa->rcu, __alias_free_mem);
-}
+ tn_info(n)->empty_children++;
-static void __leaf_free_rcu(struct rcu_head *head)
-{
- kfree(container_of(head, struct leaf, rcu));
+ if (!tn_info(n)->empty_children)
+ tn_info(n)->full_children++;
}
-static void __leaf_info_free_rcu(struct rcu_head *head)
+static inline void empty_child_dec(struct key_vector *n)
{
- kfree(container_of(head, struct leaf_info, rcu));
-}
+ if (!tn_info(n)->empty_children)
+ tn_info(n)->full_children--;
-static inline void free_leaf_info(struct leaf_info *leaf)
-{
- call_rcu(&leaf->rcu, __leaf_info_free_rcu);
+ tn_info(n)->empty_children--;
}
-static struct tnode *tnode_alloc(unsigned int size)
+static struct key_vector *leaf_new(t_key key, struct fib_alias *fa)
{
- struct page *pages;
+ struct key_vector *l;
+ struct tnode *kv;
- if (size <= PAGE_SIZE)
- return kcalloc(size, 1, GFP_KERNEL);
-
- pages = alloc_pages(GFP_KERNEL|__GFP_ZERO, get_order(size));
- if (!pages)
+ kv = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
+ if (!kv)
return NULL;
- return page_address(pages);
-}
+ /* initialize key vector */
+ l = kv->kv;
+ l->key = key;
+ l->pos = 0;
+ l->bits = 0;
+ l->slen = fa->fa_slen;
-static void __tnode_free_rcu(struct rcu_head *head)
-{
- struct tnode *tn = container_of(head, struct tnode, rcu);
- unsigned int size = sizeof(struct tnode) +
- (1 << tn->bits) * sizeof(struct node *);
+ /* link leaf to fib alias */
+ INIT_HLIST_HEAD(&l->leaf);
+ hlist_add_head(&fa->fa_list, &l->leaf);
- if (size <= PAGE_SIZE)
- kfree(tn);
- else
- free_pages((unsigned long)tn, get_order(size));
+ return l;
}
-static inline void tnode_free(struct tnode *tn)
+static struct key_vector *tnode_new(t_key key, int pos, int bits)
{
- if(IS_LEAF(tn)) {
- struct leaf *l = (struct leaf *) tn;
- call_rcu_bh(&l->rcu, __leaf_free_rcu);
- }
- else
- call_rcu(&tn->rcu, __tnode_free_rcu);
-}
+ unsigned int shift = pos + bits;
+ struct key_vector *tn;
+ struct tnode *tnode;
-static struct leaf *leaf_new(void)
-{
- struct leaf *l = kmalloc(sizeof(struct leaf), GFP_KERNEL);
- if (l) {
- l->parent = T_LEAF;
- INIT_HLIST_HEAD(&l->list);
- }
- return l;
-}
+ /* verify bits and pos their msb bits clear and values are valid */
+ BUG_ON(!bits || (shift > KEYLENGTH));
-static struct leaf_info *leaf_info_new(int plen)
-{
- struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
- if (li) {
- li->plen = plen;
- INIT_LIST_HEAD(&li->falh);
- }
- return li;
-}
+ tnode = tnode_alloc(bits);
+ if (!tnode)
+ return NULL;
-static struct tnode* tnode_new(t_key key, int pos, int bits)
-{
- int nchildren = 1<<bits;
- int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
- struct tnode *tn = tnode_alloc(sz);
+ pr_debug("AT %p s=%zu %zu\n", tnode, TNODE_SIZE(0),
+ sizeof(struct key_vector *) << bits);
- if (tn) {
- memset(tn, 0, sz);
- tn->parent = T_TNODE;
- tn->pos = pos;
- tn->bits = bits;
- tn->key = key;
- tn->full_children = 0;
- tn->empty_children = 1<<bits;
- }
+ if (bits == KEYLENGTH)
+ tnode->full_children = 1;
+ else
+ tnode->empty_children = 1ul << bits;
+
+ tn = tnode->kv;
+ tn->key = (shift < KEYLENGTH) ? (key >> shift) << shift : 0;
+ tn->pos = pos;
+ tn->bits = bits;
+ tn->slen = pos;
- pr_debug("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode),
- (unsigned int) (sizeof(struct node) * 1<<bits));
return tn;
}
-/*
- * Check whether a tnode 'n' is "full", i.e. it is an internal node
+/* Check whether a tnode 'n' is "full", i.e. it is an internal node
* and no bits are skipped. See discussion in dyntree paper p. 6
*/
-
-static inline int tnode_full(const struct tnode *tn, const struct node *n)
-{
- if (n == NULL || IS_LEAF(n))
- return 0;
-
- return ((struct tnode *) n)->pos == tn->pos + tn->bits;
-}
-
-static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n)
+static inline int tnode_full(struct key_vector *tn, struct key_vector *n)
{
- tnode_put_child_reorg(tn, i, n, -1);
+ return n && ((n->pos + n->bits) == tn->pos) && IS_TNODE(n);
}
- /*
- * Add a child at position i overwriting the old value.
- * Update the value of full_children and empty_children.
- */
-
-static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull)
+/* Add a child at position i overwriting the old value.
+ * Update the value of full_children and empty_children.
+ */
+static void put_child(struct key_vector *tn, unsigned long i,
+ struct key_vector *n)
{
- struct node *chi = tn->child[i];
- int isfull;
+ struct key_vector *chi = get_child(tn, i);
+ int isfull, wasfull;
- BUG_ON(i >= 1<<tn->bits);
+ BUG_ON(i >= child_length(tn));
-
- /* update emptyChildren */
- if (n == NULL && chi != NULL)
- tn->empty_children++;
- else if (n != NULL && chi == NULL)
- tn->empty_children--;
+ /* update emptyChildren, overflow into fullChildren */
+ if (!n && chi)
+ empty_child_inc(tn);
+ if (n && !chi)
+ empty_child_dec(tn);
/* update fullChildren */
- if (wasfull == -1)
- wasfull = tnode_full(tn, chi);
-
+ wasfull = tnode_full(tn, chi);
isfull = tnode_full(tn, n);
+
if (wasfull && !isfull)
- tn->full_children--;
+ tn_info(tn)->full_children--;
else if (!wasfull && isfull)
- tn->full_children++;
+ tn_info(tn)->full_children++;
- if (n)
- NODE_SET_PARENT(n, tn);
+ if (n && (tn->slen < n->slen))
+ tn->slen = n->slen;
- rcu_assign_pointer(tn->child[i], n);
+ rcu_assign_pointer(tn->tnode[i], n);
}
-static struct node *resize(struct trie *t, struct tnode *tn)
+static void update_children(struct key_vector *tn)
{
- int i;
- int err = 0;
- struct tnode *old_tn;
- int inflate_threshold_use;
- int halve_threshold_use;
+ unsigned long i;
- if (!tn)
- return NULL;
+ /* update all of the child parent pointers */
+ for (i = child_length(tn); i;) {
+ struct key_vector *inode = get_child(tn, --i);
- pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
- tn, inflate_threshold, halve_threshold);
+ if (!inode)
+ continue;
- /* No children */
- if (tn->empty_children == tnode_child_length(tn)) {
- tnode_free(tn);
- return NULL;
+ /* Either update the children of a tnode that
+ * already belongs to us or update the child
+ * to point to ourselves.
+ */
+ if (node_parent(inode) == tn)
+ update_children(inode);
+ else
+ node_set_parent(inode, tn);
}
- /* One child */
- if (tn->empty_children == tnode_child_length(tn) - 1)
- for (i = 0; i < tnode_child_length(tn); i++) {
- struct node *n;
-
- n = tn->child[i];
- if (!n)
- continue;
-
- /* compress one level */
- NODE_SET_PARENT(n, NULL);
- tnode_free(tn);
- return n;
- }
- /*
- * Double as long as the resulting node has a number of
- * nonempty nodes that are above the threshold.
- */
+}
- /*
- * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
- * the Helsinki University of Technology and Matti Tikkanen of Nokia
- * Telecommunications, page 6:
- * "A node is doubled if the ratio of non-empty children to all
- * children in the *doubled* node is at least 'high'."
- *
- * 'high' in this instance is the variable 'inflate_threshold'. It
- * is expressed as a percentage, so we multiply it with
- * tnode_child_length() and instead of multiplying by 2 (since the
- * child array will be doubled by inflate()) and multiplying
- * the left-hand side by 100 (to handle the percentage thing) we
- * multiply the left-hand side by 50.
- *
- * The left-hand side may look a bit weird: tnode_child_length(tn)
- * - tn->empty_children is of course the number of non-null children
- * in the current node. tn->full_children is the number of "full"
- * children, that is non-null tnodes with a skip value of 0.
- * All of those will be doubled in the resulting inflated tnode, so
- * we just count them one extra time here.
- *
- * A clearer way to write this would be:
- *
- * to_be_doubled = tn->full_children;
- * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
- * tn->full_children;
- *
- * new_child_length = tnode_child_length(tn) * 2;
- *
- * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
- * new_child_length;
- * if (new_fill_factor >= inflate_threshold)
- *
- * ...and so on, tho it would mess up the while () loop.
- *
- * anyway,
- * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
- * inflate_threshold
- *
- * avoid a division:
- * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
- * inflate_threshold * new_child_length
- *
- * expand not_to_be_doubled and to_be_doubled, and shorten:
- * 100 * (tnode_child_length(tn) - tn->empty_children +
- * tn->full_children) >= inflate_threshold * new_child_length
- *
- * expand new_child_length:
- * 100 * (tnode_child_length(tn) - tn->empty_children +
- * tn->full_children) >=
- * inflate_threshold * tnode_child_length(tn) * 2
- *
- * shorten again:
- * 50 * (tn->full_children + tnode_child_length(tn) -
- * tn->empty_children) >= inflate_threshold *
- * tnode_child_length(tn)
- *
- */
+static inline void put_child_root(struct key_vector *tp, t_key key,
+ struct key_vector *n)
+{
+ if (IS_TRIE(tp))
+ rcu_assign_pointer(tp->tnode[0], n);
+ else
+ put_child(tp, get_index(key, tp), n);
+}
- check_tnode(tn);
+static inline void tnode_free_init(struct key_vector *tn)
+{
+ tn_info(tn)->rcu.next = NULL;
+}
- /* Keep root node larger */
+static inline void tnode_free_append(struct key_vector *tn,
+ struct key_vector *n)
+{
+ tn_info(n)->rcu.next = tn_info(tn)->rcu.next;
+ tn_info(tn)->rcu.next = &tn_info(n)->rcu;
+}
- if(!tn->parent)
- inflate_threshold_use = inflate_threshold_root;
- else
- inflate_threshold_use = inflate_threshold;
+static void tnode_free(struct key_vector *tn)
+{
+ struct callback_head *head = &tn_info(tn)->rcu;
- err = 0;
- while ((tn->full_children > 0 &&
- 50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
- inflate_threshold_use * tnode_child_length(tn))) {
+ while (head) {
+ head = head->next;
+ tnode_free_size += TNODE_SIZE(1ul << tn->bits);
+ node_free(tn);
- old_tn = tn;
- tn = inflate(t, tn);
- if (IS_ERR(tn)) {
- tn = old_tn;
-#ifdef CONFIG_IP_FIB_TRIE_STATS
- t->stats.resize_node_skipped++;
-#endif
- break;
- }
+ tn = container_of(head, struct tnode, rcu)->kv;
}
- check_tnode(tn);
+ if (tnode_free_size >= READ_ONCE(sysctl_fib_sync_mem)) {
+ tnode_free_size = 0;
+ synchronize_net();
+ }
+}
- /*
- * Halve as long as the number of empty children in this
- * node is above threshold.
- */
+static struct key_vector *replace(struct trie *t,
+ struct key_vector *oldtnode,
+ struct key_vector *tn)
+{
+ struct key_vector *tp = node_parent(oldtnode);
+ unsigned long i;
+ /* setup the parent pointer out of and back into this node */
+ NODE_INIT_PARENT(tn, tp);
+ put_child_root(tp, tn->key, tn);
- /* Keep root node larger */
+ /* update all of the child parent pointers */
+ update_children(tn);
- if(!tn->parent)
- halve_threshold_use = halve_threshold_root;
- else
- halve_threshold_use = halve_threshold;
+ /* all pointers should be clean so we are done */
+ tnode_free(oldtnode);
- err = 0;
- while (tn->bits > 1 &&
- 100 * (tnode_child_length(tn) - tn->empty_children) <
- halve_threshold_use * tnode_child_length(tn)) {
+ /* resize children now that oldtnode is freed */
+ for (i = child_length(tn); i;) {
+ struct key_vector *inode = get_child(tn, --i);
- old_tn = tn;
- tn = halve(t, tn);
- if (IS_ERR(tn)) {
- tn = old_tn;
-#ifdef CONFIG_IP_FIB_TRIE_STATS
- t->stats.resize_node_skipped++;
-#endif
- break;
- }
+ /* resize child node */
+ if (tnode_full(tn, inode))
+ tn = resize(t, inode);
}
-
- /* Only one child remains */
- if (tn->empty_children == tnode_child_length(tn) - 1)
- for (i = 0; i < tnode_child_length(tn); i++) {
- struct node *n;
-
- n = tn->child[i];
- if (!n)
- continue;
-
- /* compress one level */
-
- NODE_SET_PARENT(n, NULL);
- tnode_free(tn);
- return n;
- }
-
- return (struct node *) tn;
+ return tp;
}
-static struct tnode *inflate(struct trie *t, struct tnode *tn)
+static struct key_vector *inflate(struct trie *t,
+ struct key_vector *oldtnode)
{
- struct tnode *inode;
- struct tnode *oldtnode = tn;
- int olen = tnode_child_length(tn);
- int i;
+ struct key_vector *tn;
+ unsigned long i;
+ t_key m;
pr_debug("In inflate\n");
- tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
-
+ tn = tnode_new(oldtnode->key, oldtnode->pos - 1, oldtnode->bits + 1);
if (!tn)
- return ERR_PTR(-ENOMEM);
-
- /*
- * Preallocate and store tnodes before the actual work so we
- * don't get into an inconsistent state if memory allocation
- * fails. In case of failure we return the oldnode and inflate
- * of tnode is ignored.
- */
-
- for (i = 0; i < olen; i++) {
- struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i);
+ goto notnode;
- if (inode &&
- IS_TNODE(inode) &&
- inode->pos == oldtnode->pos + oldtnode->bits &&
- inode->bits > 1) {
- struct tnode *left, *right;
- t_key m = TKEY_GET_MASK(inode->pos, 1);
+ /* prepare oldtnode to be freed */
+ tnode_free_init(oldtnode);
- left = tnode_new(inode->key&(~m), inode->pos + 1,
- inode->bits - 1);
- if (!left)
- goto nomem;
-
- right = tnode_new(inode->key|m, inode->pos + 1,
- inode->bits - 1);
-
- if (!right) {
- tnode_free(left);
- goto nomem;
- }
-
- put_child(t, tn, 2*i, (struct node *) left);
- put_child(t, tn, 2*i+1, (struct node *) right);
- }
- }
-
- for (i = 0; i < olen; i++) {
- struct node *node = tnode_get_child(oldtnode, i);
- struct tnode *left, *right;
- int size, j;
+ /* Assemble all of the pointers in our cluster, in this case that
+ * represents all of the pointers out of our allocated nodes that
+ * point to existing tnodes and the links between our allocated
+ * nodes.
+ */
+ for (i = child_length(oldtnode), m = 1u << tn->pos; i;) {
+ struct key_vector *inode = get_child(oldtnode, --i);
+ struct key_vector *node0, *node1;
+ unsigned long j, k;
/* An empty child */
- if (node == NULL)
+ if (!inode)
continue;
/* A leaf or an internal node with skipped bits */
-
- if (IS_LEAF(node) || ((struct tnode *) node)->pos >
- tn->pos + tn->bits - 1) {
- if (tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits,
- 1) == 0)
- put_child(t, tn, 2*i, node);
- else
- put_child(t, tn, 2*i+1, node);
+ if (!tnode_full(oldtnode, inode)) {
+ put_child(tn, get_index(inode->key, tn), inode);
continue;
}
- /* An internal node with two children */
- inode = (struct tnode *) node;
+ /* drop the node in the old tnode free list */
+ tnode_free_append(oldtnode, inode);
+ /* An internal node with two children */
if (inode->bits == 1) {
- put_child(t, tn, 2*i, inode->child[0]);
- put_child(t, tn, 2*i+1, inode->child[1]);
-
- tnode_free(inode);
+ put_child(tn, 2 * i + 1, get_child(inode, 1));
+ put_child(tn, 2 * i, get_child(inode, 0));
continue;
}
- /* An internal node with more than two children */
-
/* We will replace this node 'inode' with two new
- * ones, 'left' and 'right', each with half of the
+ * ones, 'node0' and 'node1', each with half of the
* original children. The two new nodes will have
* a position one bit further down the key and this
* means that the "significant" part of their keys
* (see the discussion near the top of this file)
* will differ by one bit, which will be "0" in
- * left's key and "1" in right's key. Since we are
+ * node0's key and "1" in node1's key. Since we are
* moving the key position by one step, the bit that
* we are moving away from - the bit at position
- * (inode->pos) - is the one that will differ between
- * left and right. So... we synthesize that bit in the
- * two new keys.
- * The mask 'm' below will be a single "one" bit at
- * the position (inode->pos)
+ * (tn->pos) - is the one that will differ between
+ * node0 and node1. So... we synthesize that bit in the
+ * two new keys.
*/
+ node1 = tnode_new(inode->key | m, inode->pos, inode->bits - 1);
+ if (!node1)
+ goto nomem;
+ node0 = tnode_new(inode->key, inode->pos, inode->bits - 1);
+
+ tnode_free_append(tn, node1);
+ if (!node0)
+ goto nomem;
+ tnode_free_append(tn, node0);
+
+ /* populate child pointers in new nodes */
+ for (k = child_length(inode), j = k / 2; j;) {
+ put_child(node1, --j, get_child(inode, --k));
+ put_child(node0, j, get_child(inode, j));
+ put_child(node1, --j, get_child(inode, --k));
+ put_child(node0, j, get_child(inode, j));
+ }
- /* Use the old key, but set the new significant
- * bit to zero.
- */
+ /* link new nodes to parent */
+ NODE_INIT_PARENT(node1, tn);
+ NODE_INIT_PARENT(node0, tn);
- left = (struct tnode *) tnode_get_child(tn, 2*i);
- put_child(t, tn, 2*i, NULL);
+ /* link parent to nodes */
+ put_child(tn, 2 * i + 1, node1);
+ put_child(tn, 2 * i, node0);
+ }
+
+ /* setup the parent pointers into and out of this node */
+ return replace(t, oldtnode, tn);
+nomem:
+ /* all pointers should be clean so we are done */
+ tnode_free(tn);
+notnode:
+ return NULL;
+}
+
+static struct key_vector *halve(struct trie *t,
+ struct key_vector *oldtnode)
+{
+ struct key_vector *tn;
+ unsigned long i;
- BUG_ON(!left);
+ pr_debug("In halve\n");
+
+ tn = tnode_new(oldtnode->key, oldtnode->pos + 1, oldtnode->bits - 1);
+ if (!tn)
+ goto notnode;
- right = (struct tnode *) tnode_get_child(tn, 2*i+1);
- put_child(t, tn, 2*i+1, NULL);
+ /* prepare oldtnode to be freed */
+ tnode_free_init(oldtnode);
- BUG_ON(!right);
+ /* Assemble all of the pointers in our cluster, in this case that
+ * represents all of the pointers out of our allocated nodes that
+ * point to existing tnodes and the links between our allocated
+ * nodes.
+ */
+ for (i = child_length(oldtnode); i;) {
+ struct key_vector *node1 = get_child(oldtnode, --i);
+ struct key_vector *node0 = get_child(oldtnode, --i);
+ struct key_vector *inode;
- size = tnode_child_length(left);
- for (j = 0; j < size; j++) {
- put_child(t, left, j, inode->child[j]);
- put_child(t, right, j, inode->child[j + size]);
+ /* At least one of the children is empty */
+ if (!node1 || !node0) {
+ put_child(tn, i / 2, node1 ? : node0);
+ continue;
}
- put_child(t, tn, 2*i, resize(t, left));
- put_child(t, tn, 2*i+1, resize(t, right));
- tnode_free(inode);
+ /* Two nonempty children */
+ inode = tnode_new(node0->key, oldtnode->pos, 1);
+ if (!inode)
+ goto nomem;
+ tnode_free_append(tn, inode);
+
+ /* initialize pointers out of node */
+ put_child(inode, 1, node1);
+ put_child(inode, 0, node0);
+ NODE_INIT_PARENT(inode, tn);
+
+ /* link parent to node */
+ put_child(tn, i / 2, inode);
}
- tnode_free(oldtnode);
- return tn;
+
+ /* setup the parent pointers into and out of this node */
+ return replace(t, oldtnode, tn);
nomem:
- {
- int size = tnode_child_length(tn);
- int j;
+ /* all pointers should be clean so we are done */
+ tnode_free(tn);
+notnode:
+ return NULL;
+}
- for (j = 0; j < size; j++)
- if (tn->child[j])
- tnode_free((struct tnode *)tn->child[j]);
+static struct key_vector *collapse(struct trie *t,
+ struct key_vector *oldtnode)
+{
+ struct key_vector *n, *tp;
+ unsigned long i;
- tnode_free(tn);
+ /* scan the tnode looking for that one child that might still exist */
+ for (n = NULL, i = child_length(oldtnode); !n && i;)
+ n = get_child(oldtnode, --i);
- return ERR_PTR(-ENOMEM);
- }
+ /* compress one level */
+ tp = node_parent(oldtnode);
+ put_child_root(tp, oldtnode->key, n);
+ node_set_parent(n, tp);
+
+ /* drop dead node */
+ node_free(oldtnode);
+
+ return tp;
}
-static struct tnode *halve(struct trie *t, struct tnode *tn)
+static unsigned char update_suffix(struct key_vector *tn)
{
- struct tnode *oldtnode = tn;
- struct node *left, *right;
- int i;
- int olen = tnode_child_length(tn);
+ unsigned char slen = tn->pos;
+ unsigned long stride, i;
+ unsigned char slen_max;
- pr_debug("In halve\n");
+ /* only vector 0 can have a suffix length greater than or equal to
+ * tn->pos + tn->bits, the second highest node will have a suffix
+ * length at most of tn->pos + tn->bits - 1
+ */
+ slen_max = min_t(unsigned char, tn->pos + tn->bits - 1, tn->slen);
- tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
+ /* search though the list of children looking for nodes that might
+ * have a suffix greater than the one we currently have. This is
+ * why we start with a stride of 2 since a stride of 1 would
+ * represent the nodes with suffix length equal to tn->pos
+ */
+ for (i = 0, stride = 0x2ul ; i < child_length(tn); i += stride) {
+ struct key_vector *n = get_child(tn, i);
- if (!tn)
- return ERR_PTR(-ENOMEM);
+ if (!n || (n->slen <= slen))
+ continue;
- /*
- * Preallocate and store tnodes before the actual work so we
- * don't get into an inconsistent state if memory allocation
- * fails. In case of failure we return the oldnode and halve
- * of tnode is ignored.
- */
+ /* update stride and slen based on new value */
+ stride <<= (n->slen - slen);
+ slen = n->slen;
+ i &= ~(stride - 1);
- for (i = 0; i < olen; i += 2) {
- left = tnode_get_child(oldtnode, i);
- right = tnode_get_child(oldtnode, i+1);
+ /* stop searching if we have hit the maximum possible value */
+ if (slen >= slen_max)
+ break;
+ }
- /* Two nonempty children */
- if (left && right) {
- struct tnode *newn;
+ tn->slen = slen;
- newn = tnode_new(left->key, tn->pos + tn->bits, 1);
+ return slen;
+}
- if (!newn)
- goto nomem;
+/* From "Implementing a dynamic compressed trie" by Stefan Nilsson of
+ * the Helsinki University of Technology and Matti Tikkanen of Nokia
+ * Telecommunications, page 6:
+ * "A node is doubled if the ratio of non-empty children to all
+ * children in the *doubled* node is at least 'high'."
+ *
+ * 'high' in this instance is the variable 'inflate_threshold'. It
+ * is expressed as a percentage, so we multiply it with
+ * child_length() and instead of multiplying by 2 (since the
+ * child array will be doubled by inflate()) and multiplying
+ * the left-hand side by 100 (to handle the percentage thing) we
+ * multiply the left-hand side by 50.
+ *
+ * The left-hand side may look a bit weird: child_length(tn)
+ * - tn->empty_children is of course the number of non-null children
+ * in the current node. tn->full_children is the number of "full"
+ * children, that is non-null tnodes with a skip value of 0.
+ * All of those will be doubled in the resulting inflated tnode, so
+ * we just count them one extra time here.
+ *
+ * A clearer way to write this would be:
+ *
+ * to_be_doubled = tn->full_children;
+ * not_to_be_doubled = child_length(tn) - tn->empty_children -
+ * tn->full_children;
+ *
+ * new_child_length = child_length(tn) * 2;
+ *
+ * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
+ * new_child_length;
+ * if (new_fill_factor >= inflate_threshold)
+ *
+ * ...and so on, tho it would mess up the while () loop.
+ *
+ * anyway,
+ * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
+ * inflate_threshold
+ *
+ * avoid a division:
+ * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
+ * inflate_threshold * new_child_length
+ *
+ * expand not_to_be_doubled and to_be_doubled, and shorten:
+ * 100 * (child_length(tn) - tn->empty_children +
+ * tn->full_children) >= inflate_threshold * new_child_length
+ *
+ * expand new_child_length:
+ * 100 * (child_length(tn) - tn->empty_children +
+ * tn->full_children) >=
+ * inflate_threshold * child_length(tn) * 2
+ *
+ * shorten again:
+ * 50 * (tn->full_children + child_length(tn) -
+ * tn->empty_children) >= inflate_threshold *
+ * child_length(tn)
+ *
+ */
+static inline bool should_inflate(struct key_vector *tp, struct key_vector *tn)
+{
+ unsigned long used = child_length(tn);
+ unsigned long threshold = used;
- put_child(t, tn, i/2, (struct node *)newn);
- }
+ /* Keep root node larger */
+ threshold *= IS_TRIE(tp) ? inflate_threshold_root : inflate_threshold;
+ used -= tn_info(tn)->empty_children;
+ used += tn_info(tn)->full_children;
- }
+ /* if bits == KEYLENGTH then pos = 0, and will fail below */
- for (i = 0; i < olen; i += 2) {
- struct tnode *newBinNode;
+ return (used > 1) && tn->pos && ((50 * used) >= threshold);
+}
- left = tnode_get_child(oldtnode, i);
- right = tnode_get_child(oldtnode, i+1);
+static inline bool should_halve(struct key_vector *tp, struct key_vector *tn)
+{
+ unsigned long used = child_length(tn);
+ unsigned long threshold = used;
- /* At least one of the children is empty */
- if (left == NULL) {
- if (right == NULL) /* Both are empty */
- continue;
- put_child(t, tn, i/2, right);
- continue;
- }
+ /* Keep root node larger */
+ threshold *= IS_TRIE(tp) ? halve_threshold_root : halve_threshold;
+ used -= tn_info(tn)->empty_children;
- if (right == NULL) {
- put_child(t, tn, i/2, left);
- continue;
- }
+ /* if bits == KEYLENGTH then used = 100% on wrap, and will fail below */
- /* Two nonempty children */
- newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
- put_child(t, tn, i/2, NULL);
- put_child(t, newBinNode, 0, left);
- put_child(t, newBinNode, 1, right);
- put_child(t, tn, i/2, resize(t, newBinNode));
- }
- tnode_free(oldtnode);
- return tn;
-nomem:
- {
- int size = tnode_child_length(tn);
- int j;
+ return (used > 1) && (tn->bits > 1) && ((100 * used) < threshold);
+}
- for (j = 0; j < size; j++)
- if (tn->child[j])
- tnode_free((struct tnode *)tn->child[j]);
+static inline bool should_collapse(struct key_vector *tn)
+{
+ unsigned long used = child_length(tn);
- tnode_free(tn);
+ used -= tn_info(tn)->empty_children;
- return ERR_PTR(-ENOMEM);
- }
+ /* account for bits == KEYLENGTH case */
+ if ((tn->bits == KEYLENGTH) && tn_info(tn)->full_children)
+ used -= KEY_MAX;
+
+ /* One child or none, time to drop us from the trie */
+ return used < 2;
}
-static void trie_init(struct trie *t)
+#define MAX_WORK 10
+static struct key_vector *resize(struct trie *t, struct key_vector *tn)
{
- if (!t)
- return;
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ struct trie_use_stats __percpu *stats = t->stats;
+#endif
+ struct key_vector *tp = node_parent(tn);
+ unsigned long cindex = get_index(tn->key, tp);
+ int max_work = MAX_WORK;
- t->size = 0;
- rcu_assign_pointer(t->trie, NULL);
- t->revision = 0;
+ pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
+ tn, inflate_threshold, halve_threshold);
+
+ /* track the tnode via the pointer from the parent instead of
+ * doing it ourselves. This way we can let RCU fully do its
+ * thing without us interfering
+ */
+ BUG_ON(tn != get_child(tp, cindex));
+
+ /* Double as long as the resulting node has a number of
+ * nonempty nodes that are above the threshold.
+ */
+ while (should_inflate(tp, tn) && max_work) {
+ tp = inflate(t, tn);
+ if (!tp) {
#ifdef CONFIG_IP_FIB_TRIE_STATS
- memset(&t->stats, 0, sizeof(struct trie_use_stats));
+ this_cpu_inc(stats->resize_node_skipped);
#endif
-}
+ break;
+ }
-/* readside must use rcu_read_lock currently dump routines
- via get_fa_head and dump */
+ max_work--;
+ tn = get_child(tp, cindex);
+ }
-static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
-{
- struct hlist_head *head = &l->list;
- struct hlist_node *node;
- struct leaf_info *li;
+ /* update parent in case inflate failed */
+ tp = node_parent(tn);
- hlist_for_each_entry_rcu(li, node, head, hlist)
- if (li->plen == plen)
- return li;
+ /* Return if at least one inflate is run */
+ if (max_work != MAX_WORK)
+ return tp;
- return NULL;
-}
+ /* Halve as long as the number of empty children in this
+ * node is above threshold.
+ */
+ while (should_halve(tp, tn) && max_work) {
+ tp = halve(t, tn);
+ if (!tp) {
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ this_cpu_inc(stats->resize_node_skipped);
+#endif
+ break;
+ }
-static inline struct list_head * get_fa_head(struct leaf *l, int plen)
-{
- struct leaf_info *li = find_leaf_info(l, plen);
+ max_work--;
+ tn = get_child(tp, cindex);
+ }
- if (!li)
- return NULL;
+ /* Only one child remains */
+ if (should_collapse(tn))
+ return collapse(t, tn);
- return &li->falh;
+ /* update parent in case halve failed */
+ return node_parent(tn);
}
-static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
+static void node_pull_suffix(struct key_vector *tn, unsigned char slen)
{
- struct leaf_info *li = NULL, *last = NULL;
- struct hlist_node *node;
+ unsigned char node_slen = tn->slen;
- if (hlist_empty(head)) {
- hlist_add_head_rcu(&new->hlist, head);
- } else {
- hlist_for_each_entry(li, node, head, hlist) {
- if (new->plen > li->plen)
- break;
+ while ((node_slen > tn->pos) && (node_slen > slen)) {
+ slen = update_suffix(tn);
+ if (node_slen == slen)
+ break;
- last = li;
- }
- if (last)
- hlist_add_after_rcu(&last->hlist, &new->hlist);
- else
- hlist_add_before_rcu(&new->hlist, &li->hlist);
- }
+ tn = node_parent(tn);
+ node_slen = tn->slen;
+ }
}
-/* rcu_read_lock needs to be hold by caller from readside */
-
-static struct leaf *
-fib_find_node(struct trie *t, u32 key)
+static void node_push_suffix(struct key_vector *tn, unsigned char slen)
{
- int pos;
- struct tnode *tn;
- struct node *n;
+ while (tn->slen < slen) {
+ tn->slen = slen;
+ tn = node_parent(tn);
+ }
+}
- pos = 0;
- n = rcu_dereference(t->trie);
+/* rcu_read_lock needs to be hold by caller from readside */
+static struct key_vector *fib_find_node(struct trie *t,
+ struct key_vector **tp, u32 key)
+{
+ struct key_vector *pn, *n = t->kv;
+ unsigned long index = 0;
- while (n != NULL && NODE_TYPE(n) == T_TNODE) {
- tn = (struct tnode *) n;
+ do {
+ pn = n;
+ n = get_child_rcu(n, index);
- check_tnode(tn);
+ if (!n)
+ break;
- if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
- pos = tn->pos + tn->bits;
- n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
- } else
+ index = get_cindex(key, n);
+
+ /* This bit of code is a bit tricky but it combines multiple
+ * checks into a single check. The prefix consists of the
+ * prefix plus zeros for the bits in the cindex. The index
+ * is the difference between the key and this value. From
+ * this we can actually derive several pieces of data.
+ * if (index >= (1ul << bits))
+ * we have a mismatch in skip bits and failed
+ * else
+ * we know the value is cindex
+ *
+ * This check is safe even if bits == KEYLENGTH due to the
+ * fact that we can only allocate a node with 32 bits if a
+ * long is greater than 32 bits.
+ */
+ if (index >= (1ul << n->bits)) {
+ n = NULL;
break;
- }
- /* Case we have found a leaf. Compare prefixes */
+ }
- if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
- return (struct leaf *)n;
+ /* keep searching until we find a perfect match leaf or NULL */
+ } while (IS_TNODE(n));
- return NULL;
+ *tp = pn;
+
+ return n;
}
-static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
+/* Return the first fib alias matching DSCP with
+ * priority less than or equal to PRIO.
+ * If 'find_first' is set, return the first matching
+ * fib alias, regardless of DSCP and priority.
+ */
+static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen,
+ dscp_t dscp, u32 prio, u32 tb_id,
+ bool find_first)
{
- int wasfull;
- t_key cindex, key;
- struct tnode *tp = NULL;
-
- key = tn->key;
+ struct fib_alias *fa;
- while (tn != NULL && NODE_PARENT(tn) != NULL) {
+ if (!fah)
+ return NULL;
- tp = NODE_PARENT(tn);
- cindex = tkey_extract_bits(key, tp->pos, tp->bits);
- wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
- tn = (struct tnode *) resize (t, (struct tnode *)tn);
- tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull);
+ hlist_for_each_entry(fa, fah, fa_list) {
+ /* Avoid Sparse warning when using dscp_t in inequalities */
+ u8 __fa_dscp = inet_dscp_to_dsfield(fa->fa_dscp);
+ u8 __dscp = inet_dscp_to_dsfield(dscp);
- if (!NODE_PARENT(tn))
+ if (fa->fa_slen < slen)
+ continue;
+ if (fa->fa_slen != slen)
break;
-
- tn = NODE_PARENT(tn);
+ if (fa->tb_id > tb_id)
+ continue;
+ if (fa->tb_id != tb_id)
+ break;
+ if (find_first)
+ return fa;
+ if (__fa_dscp > __dscp)
+ continue;
+ if (fa->fa_info->fib_priority >= prio || __fa_dscp < __dscp)
+ return fa;
}
- /* Handle last (top) tnode */
- if (IS_TNODE(tn))
- tn = (struct tnode*) resize(t, (struct tnode *)tn);
- return (struct node*) tn;
+ return NULL;
}
-/* only used from updater-side */
-
-static struct list_head *
-fib_insert_node(struct trie *t, int *err, u32 key, int plen)
+static struct fib_alias *
+fib_find_matching_alias(struct net *net, const struct fib_rt_info *fri)
{
- int pos, newpos;
- struct tnode *tp = NULL, *tn = NULL;
- struct node *n;
- struct leaf *l;
- int missbit;
- struct list_head *fa_head = NULL;
- struct leaf_info *li;
- t_key cindex;
+ u8 slen = KEYLENGTH - fri->dst_len;
+ struct key_vector *l, *tp;
+ struct fib_table *tb;
+ struct fib_alias *fa;
+ struct trie *t;
- pos = 0;
- n = t->trie;
+ tb = fib_get_table(net, fri->tb_id);
+ if (!tb)
+ return NULL;
- /* If we point to NULL, stop. Either the tree is empty and we should
- * just put a new leaf in if, or we have reached an empty child slot,
- * and we should just put our new leaf in that.
- * If we point to a T_TNODE, check if it matches our key. Note that
- * a T_TNODE might be skipping any number of bits - its 'pos' need
- * not be the parent's 'pos'+'bits'!
- *
- * If it does match the current key, get pos/bits from it, extract
- * the index from our key, push the T_TNODE and walk the tree.
- *
- * If it doesn't, we have to replace it with a new T_TNODE.
- *
- * If we point to a T_LEAF, it might or might not have the same key
- * as we do. If it does, just change the value, update the T_LEAF's
- * value, and return it.
- * If it doesn't, we need to replace it with a T_TNODE.
- */
+ t = (struct trie *)tb->tb_data;
+ l = fib_find_node(t, &tp, be32_to_cpu(fri->dst));
+ if (!l)
+ return NULL;
- while (n != NULL && NODE_TYPE(n) == T_TNODE) {
- tn = (struct tnode *) n;
+ hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
+ if (fa->fa_slen == slen && fa->tb_id == fri->tb_id &&
+ fa->fa_dscp == fri->dscp && fa->fa_info == fri->fi &&
+ fa->fa_type == fri->type)
+ return fa;
+ }
- check_tnode(tn);
+ return NULL;
+}
- if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
- tp = tn;
- pos = tn->pos + tn->bits;
- n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
+void fib_alias_hw_flags_set(struct net *net, const struct fib_rt_info *fri)
+{
+ u8 fib_notify_on_flag_change;
+ struct fib_alias *fa_match;
+ struct sk_buff *skb;
+ int err;
- BUG_ON(n && NODE_PARENT(n) != tn);
- } else
- break;
- }
+ rcu_read_lock();
- /*
- * n ----> NULL, LEAF or TNODE
- *
- * tp is n's (parent) ----> NULL or TNODE
+ fa_match = fib_find_matching_alias(net, fri);
+ if (!fa_match)
+ goto out;
+
+ /* These are paired with the WRITE_ONCE() happening in this function.
+ * The reason is that we are only protected by RCU at this point.
*/
+ if (READ_ONCE(fa_match->offload) == fri->offload &&
+ READ_ONCE(fa_match->trap) == fri->trap &&
+ READ_ONCE(fa_match->offload_failed) == fri->offload_failed)
+ goto out;
- BUG_ON(tp && IS_LEAF(tp));
+ WRITE_ONCE(fa_match->offload, fri->offload);
+ WRITE_ONCE(fa_match->trap, fri->trap);
- /* Case 1: n is a leaf. Compare prefixes */
+ fib_notify_on_flag_change = READ_ONCE(net->ipv4.sysctl_fib_notify_on_flag_change);
- if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
- struct leaf *l = (struct leaf *) n;
+ /* 2 means send notifications only if offload_failed was changed. */
+ if (fib_notify_on_flag_change == 2 &&
+ READ_ONCE(fa_match->offload_failed) == fri->offload_failed)
+ goto out;
- li = leaf_info_new(plen);
+ WRITE_ONCE(fa_match->offload_failed, fri->offload_failed);
- if (!li) {
- *err = -ENOMEM;
- goto err;
- }
+ if (!fib_notify_on_flag_change)
+ goto out;
- fa_head = &li->falh;
- insert_leaf_info(&l->list, li);
- goto done;
+ skb = nlmsg_new(fib_nlmsg_size(fa_match->fa_info), GFP_ATOMIC);
+ if (!skb) {
+ err = -ENOBUFS;
+ goto errout;
}
- t->size++;
- l = leaf_new();
- if (!l) {
- *err = -ENOMEM;
- goto err;
+ err = fib_dump_info(skb, 0, 0, RTM_NEWROUTE, fri, 0);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in fib_nlmsg_size() */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto errout;
}
- l->key = key;
- li = leaf_info_new(plen);
+ rtnl_notify(skb, net, 0, RTNLGRP_IPV4_ROUTE, NULL, GFP_ATOMIC);
+ goto out;
- if (!li) {
- tnode_free((struct tnode *) l);
- *err = -ENOMEM;
- goto err;
- }
+errout:
+ rtnl_set_sk_err(net, RTNLGRP_IPV4_ROUTE, err);
+out:
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(fib_alias_hw_flags_set);
- fa_head = &li->falh;
- insert_leaf_info(&l->list, li);
+static void trie_rebalance(struct trie *t, struct key_vector *tn)
+{
+ while (!IS_TRIE(tn))
+ tn = resize(t, tn);
+}
- if (t->trie && n == NULL) {
- /* Case 2: n is NULL, and will just insert a new leaf */
+static int fib_insert_node(struct trie *t, struct key_vector *tp,
+ struct fib_alias *new, t_key key)
+{
+ struct key_vector *n, *l;
- NODE_SET_PARENT(l, tp);
+ l = leaf_new(key, new);
+ if (!l)
+ goto noleaf;
- cindex = tkey_extract_bits(key, tp->pos, tp->bits);
- put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
- } else {
- /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
- /*
- * Add a new tnode here
- * first tnode need some special handling
- */
+ /* retrieve child from parent node */
+ n = get_child(tp, get_index(key, tp));
- if (tp)
- pos = tp->pos+tp->bits;
- else
- pos = 0;
+ /* Case 2: n is a LEAF or a TNODE and the key doesn't match.
+ *
+ * Add a new tnode here
+ * first tnode need some special handling
+ * leaves us in position for handling as case 3
+ */
+ if (n) {
+ struct key_vector *tn;
- if (n) {
- newpos = tkey_mismatch(key, pos, n->key);
- tn = tnode_new(n->key, newpos, 1);
- } else {
- newpos = 0;
- tn = tnode_new(key, newpos, 1); /* First tnode */
- }
+ tn = tnode_new(key, __fls(key ^ n->key), 1);
+ if (!tn)
+ goto notnode;
- if (!tn) {
- free_leaf_info(li);
- tnode_free((struct tnode *) l);
- *err = -ENOMEM;
- goto err;
- }
+ /* initialize routes out of node */
+ NODE_INIT_PARENT(tn, tp);
+ put_child(tn, get_index(key, tn) ^ 1, n);
- NODE_SET_PARENT(tn, tp);
+ /* start adding routes into the node */
+ put_child_root(tp, key, tn);
+ node_set_parent(n, tn);
- missbit = tkey_extract_bits(key, newpos, 1);
- put_child(t, tn, missbit, (struct node *)l);
- put_child(t, tn, 1-missbit, n);
+ /* parent now has a NULL spot where the leaf can go */
+ tp = tn;
+ }
- if (tp) {
- cindex = tkey_extract_bits(key, tp->pos, tp->bits);
- put_child(t, (struct tnode *)tp, cindex, (struct node *)tn);
- } else {
- rcu_assign_pointer(t->trie, (struct node *)tn); /* First tnode */
- tp = tn;
+ /* Case 3: n is NULL, and will just insert a new leaf */
+ node_push_suffix(tp, new->fa_slen);
+ NODE_INIT_PARENT(l, tp);
+ put_child_root(tp, key, l);
+ trie_rebalance(t, tp);
+
+ return 0;
+notnode:
+ node_free(l);
+noleaf:
+ return -ENOMEM;
+}
+
+static int fib_insert_alias(struct trie *t, struct key_vector *tp,
+ struct key_vector *l, struct fib_alias *new,
+ struct fib_alias *fa, t_key key)
+{
+ if (!l)
+ return fib_insert_node(t, tp, new, key);
+
+ if (fa) {
+ hlist_add_before_rcu(&new->fa_list, &fa->fa_list);
+ } else {
+ struct fib_alias *last;
+
+ hlist_for_each_entry(last, &l->leaf, fa_list) {
+ if (new->fa_slen < last->fa_slen)
+ break;
+ if ((new->fa_slen == last->fa_slen) &&
+ (new->tb_id > last->tb_id))
+ break;
+ fa = last;
}
- }
- if (tp && tp->pos + tp->bits > 32)
- printk(KERN_WARNING "fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
- tp, tp->pos, tp->bits, key, plen);
+ if (fa)
+ hlist_add_behind_rcu(&new->fa_list, &fa->fa_list);
+ else
+ hlist_add_head_rcu(&new->fa_list, &l->leaf);
+ }
- /* Rebalance the trie */
+ /* if we added to the tail node then we need to update slen */
+ if (l->slen < new->fa_slen) {
+ l->slen = new->fa_slen;
+ node_push_suffix(tp, new->fa_slen);
+ }
- rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
-done:
- t->revision++;
-err:
- return fa_head;
+ return 0;
}
-static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg)
+static void fib_remove_alias(struct trie *t, struct key_vector *tp,
+ struct key_vector *l, struct fib_alias *old);
+
+/* Caller must hold RTNL. */
+int fib_table_insert(struct net *net, struct fib_table *tb,
+ struct fib_config *cfg, struct netlink_ext_ack *extack)
{
- struct trie *t = (struct trie *) tb->tb_data;
+ struct trie *t = (struct trie *)tb->tb_data;
struct fib_alias *fa, *new_fa;
- struct list_head *fa_head = NULL;
+ struct key_vector *l, *tp;
+ u16 nlflags = NLM_F_EXCL;
struct fib_info *fi;
- int plen = cfg->fc_dst_len;
- u8 tos = cfg->fc_tos;
- u32 key, mask;
+ u8 plen = cfg->fc_dst_len;
+ u8 slen = KEYLENGTH - plen;
+ dscp_t dscp;
+ u32 key;
int err;
- struct leaf *l;
-
- if (plen > 32)
- return -EINVAL;
key = ntohl(cfg->fc_dst);
pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
- mask = ntohl(inet_make_mask(plen));
-
- if (key & ~mask)
- return -EINVAL;
-
- key = key & mask;
-
- fi = fib_create_info(cfg);
+ fi = fib_create_info(cfg, extack);
if (IS_ERR(fi)) {
err = PTR_ERR(fi);
goto err;
}
- l = fib_find_node(t, key);
- fa = NULL;
-
- if (l) {
- fa_head = get_fa_head(l, plen);
- fa = fib_find_alias(fa_head, tos, fi->fib_priority);
- }
+ dscp = cfg->fc_dscp;
+ l = fib_find_node(t, &tp, key);
+ fa = l ? fib_find_alias(&l->leaf, slen, dscp, fi->fib_priority,
+ tb->tb_id, false) : NULL;
/* Now fa, if non-NULL, points to the first fib alias
- * with the same keys [prefix,tos,priority], if such key already
+ * with the same keys [prefix,dscp,priority], if such key already
* exists or to the node before which we will insert new one.
*
* If fa is NULL, we will need to allocate a new one and
- * insert to the head of f.
- *
- * If f is NULL, no fib node matched the destination key
- * and we need to allocate a new one of those as well.
+ * insert to the tail of the section matching the suffix length
+ * of the new alias.
*/
- if (fa && fa->fa_info->fib_priority == fi->fib_priority) {
- struct fib_alias *fa_orig;
+ if (fa && fa->fa_dscp == dscp &&
+ fa->fa_info->fib_priority == fi->fib_priority) {
+ struct fib_alias *fa_first, *fa_match;
err = -EEXIST;
if (cfg->fc_nlflags & NLM_F_EXCL)
goto out;
+ nlflags &= ~NLM_F_EXCL;
+
+ /* We have 2 goals:
+ * 1. Find exact match for type, scope, fib_info to avoid
+ * duplicate routes
+ * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
+ */
+ fa_match = NULL;
+ fa_first = fa;
+ hlist_for_each_entry_from(fa, fa_list) {
+ if ((fa->fa_slen != slen) ||
+ (fa->tb_id != tb->tb_id) ||
+ (fa->fa_dscp != dscp))
+ break;
+ if (fa->fa_info->fib_priority != fi->fib_priority)
+ break;
+ if (fa->fa_type == cfg->fc_type &&
+ fa->fa_info == fi) {
+ fa_match = fa;
+ break;
+ }
+ }
+
if (cfg->fc_nlflags & NLM_F_REPLACE) {
struct fib_info *fi_drop;
u8 state;
+ nlflags |= NLM_F_REPLACE;
+ fa = fa_first;
+ if (fa_match) {
+ if (fa == fa_match)
+ err = 0;
+ goto out;
+ }
err = -ENOBUFS;
- new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL);
- if (new_fa == NULL)
+ new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
+ if (!new_fa)
goto out;
fi_drop = fa->fa_info;
- new_fa->fa_tos = fa->fa_tos;
+ new_fa->fa_dscp = fa->fa_dscp;
new_fa->fa_info = fi;
new_fa->fa_type = cfg->fc_type;
- new_fa->fa_scope = cfg->fc_scope;
state = fa->fa_state;
- new_fa->fa_state &= ~FA_S_ACCESSED;
+ new_fa->fa_state = state & ~FA_S_ACCESSED;
+ new_fa->fa_slen = fa->fa_slen;
+ new_fa->tb_id = tb->tb_id;
+ new_fa->fa_default = -1;
+ new_fa->offload = 0;
+ new_fa->trap = 0;
+ new_fa->offload_failed = 0;
+
+ hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list);
+
+ if (fib_find_alias(&l->leaf, fa->fa_slen, 0, 0,
+ tb->tb_id, true) == new_fa) {
+ enum fib_event_type fib_event;
+
+ fib_event = FIB_EVENT_ENTRY_REPLACE;
+ err = call_fib_entry_notifiers(net, fib_event,
+ key, plen,
+ new_fa, extack);
+ if (err) {
+ hlist_replace_rcu(&new_fa->fa_list,
+ &fa->fa_list);
+ goto out_free_new_fa;
+ }
+ }
+
+ rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
+ tb->tb_id, &cfg->fc_nlinfo, nlflags);
- list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
alias_free_mem_rcu(fa);
fib_release_info(fi_drop);
if (state & FA_S_ACCESSED)
- rt_cache_flush(-1);
+ rt_cache_flush(cfg->fc_nlinfo.nl_net);
goto succeeded;
}
@@ -1212,55 +1321,69 @@ static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg)
* uses the same scope, type, and nexthop
* information.
*/
- fa_orig = fa;
- list_for_each_entry(fa, fa_orig->fa_list.prev, fa_list) {
- if (fa->fa_tos != tos)
- break;
- if (fa->fa_info->fib_priority != fi->fib_priority)
- break;
- if (fa->fa_type == cfg->fc_type &&
- fa->fa_scope == cfg->fc_scope &&
- fa->fa_info == fi) {
- goto out;
- }
- }
- if (!(cfg->fc_nlflags & NLM_F_APPEND))
- fa = fa_orig;
+ if (fa_match)
+ goto out;
+
+ if (cfg->fc_nlflags & NLM_F_APPEND)
+ nlflags |= NLM_F_APPEND;
+ else
+ fa = fa_first;
}
err = -ENOENT;
if (!(cfg->fc_nlflags & NLM_F_CREATE))
goto out;
+ nlflags |= NLM_F_CREATE;
err = -ENOBUFS;
- new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL);
- if (new_fa == NULL)
+ new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
+ if (!new_fa)
goto out;
new_fa->fa_info = fi;
- new_fa->fa_tos = tos;
+ new_fa->fa_dscp = dscp;
new_fa->fa_type = cfg->fc_type;
- new_fa->fa_scope = cfg->fc_scope;
new_fa->fa_state = 0;
- /*
- * Insert new entry to the list.
- */
+ new_fa->fa_slen = slen;
+ new_fa->tb_id = tb->tb_id;
+ new_fa->fa_default = -1;
+ new_fa->offload = 0;
+ new_fa->trap = 0;
+ new_fa->offload_failed = 0;
+
+ /* Insert new entry to the list. */
+ err = fib_insert_alias(t, tp, l, new_fa, fa, key);
+ if (err)
+ goto out_free_new_fa;
+
+ /* The alias was already inserted, so the node must exist. */
+ l = l ? l : fib_find_node(t, &tp, key);
+ if (WARN_ON_ONCE(!l)) {
+ err = -ENOENT;
+ goto out_free_new_fa;
+ }
+
+ if (fib_find_alias(&l->leaf, new_fa->fa_slen, 0, 0, tb->tb_id, true) ==
+ new_fa) {
+ enum fib_event_type fib_event;
- if (!fa_head) {
- err = 0;
- fa_head = fib_insert_node(t, &err, key, plen);
+ fib_event = FIB_EVENT_ENTRY_REPLACE;
+ err = call_fib_entry_notifiers(net, fib_event, key, plen,
+ new_fa, extack);
if (err)
- goto out_free_new_fa;
+ goto out_remove_new_fa;
}
- list_add_tail_rcu(&new_fa->fa_list,
- (fa ? &fa->fa_list : fa_head));
+ if (!plen)
+ tb->tb_num_default++;
- rt_cache_flush(-1);
- rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
- &cfg->fc_nlinfo);
+ rt_cache_flush(cfg->fc_nlinfo.nl_net);
+ rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, new_fa->tb_id,
+ &cfg->fc_nlinfo, nlflags);
succeeded:
return 0;
+out_remove_new_fa:
+ fib_remove_alias(t, tp, l, new_fa);
out_free_new_fa:
kmem_cache_free(fn_alias_kmem, new_fa);
out:
@@ -1269,329 +1392,342 @@ err:
return err;
}
+static inline t_key prefix_mismatch(t_key key, struct key_vector *n)
+{
+ t_key prefix = n->key;
-/* should be called with rcu_read_lock */
-static inline int check_leaf(struct trie *t, struct leaf *l,
- t_key key, int *plen, const struct flowi *flp,
- struct fib_result *res)
-{
- int err, i;
- __be32 mask;
- struct leaf_info *li;
- struct hlist_head *hhead = &l->list;
- struct hlist_node *node;
-
- hlist_for_each_entry_rcu(li, node, hhead, hlist) {
- i = li->plen;
- mask = inet_make_mask(i);
- if (l->key != (key & ntohl(mask)))
- continue;
-
- if ((err = fib_semantic_match(&li->falh, flp, res, htonl(l->key), mask, i)) <= 0) {
- *plen = i;
-#ifdef CONFIG_IP_FIB_TRIE_STATS
- t->stats.semantic_match_passed++;
-#endif
- return err;
- }
-#ifdef CONFIG_IP_FIB_TRIE_STATS
- t->stats.semantic_match_miss++;
-#endif
- }
- return 1;
+ return (key ^ prefix) & (prefix | -prefix);
}
-static int
-fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
+bool fib_lookup_good_nhc(const struct fib_nh_common *nhc, int fib_flags,
+ const struct flowi4 *flp)
{
- struct trie *t = (struct trie *) tb->tb_data;
- int plen, ret = 0;
- struct node *n;
- struct tnode *pn;
- int pos, bits;
- t_key key = ntohl(flp->fl4_dst);
- int chopped_off;
- t_key cindex = 0;
- int current_prefix_length = KEYLENGTH;
- struct tnode *cn;
- t_key node_prefix, key_prefix, pref_mismatch;
- int mp;
+ if (nhc->nhc_flags & RTNH_F_DEAD)
+ return false;
- rcu_read_lock();
+ if (ip_ignore_linkdown(nhc->nhc_dev) &&
+ nhc->nhc_flags & RTNH_F_LINKDOWN &&
+ !(fib_flags & FIB_LOOKUP_IGNORE_LINKSTATE))
+ return false;
- n = rcu_dereference(t->trie);
- if (!n)
- goto failed;
+ if (flp->flowi4_oif && flp->flowi4_oif != nhc->nhc_oif)
+ return false;
+
+ return true;
+}
+/* should be called with rcu_read_lock */
+int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
+ struct fib_result *res, int fib_flags)
+{
+ struct trie *t = (struct trie *) tb->tb_data;
#ifdef CONFIG_IP_FIB_TRIE_STATS
- t->stats.gets++;
+ struct trie_use_stats __percpu *stats = t->stats;
#endif
+ const t_key key = ntohl(flp->daddr);
+ struct key_vector *n, *pn;
+ struct fib_alias *fa;
+ unsigned long index;
+ t_key cindex;
- /* Just a leaf? */
- if (IS_LEAF(n)) {
- if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
- goto found;
- goto failed;
- }
- pn = (struct tnode *) n;
- chopped_off = 0;
-
- while (pn) {
- pos = pn->pos;
- bits = pn->bits;
-
- if (!chopped_off)
- cindex = tkey_extract_bits(MASK_PFX(key, current_prefix_length), pos, bits);
+ pn = t->kv;
+ cindex = 0;
- n = tnode_get_child(pn, cindex);
+ n = get_child_rcu(pn, cindex);
+ if (!n) {
+ trace_fib_table_lookup(tb->tb_id, flp, NULL, -EAGAIN);
+ return -EAGAIN;
+ }
- if (n == NULL) {
#ifdef CONFIG_IP_FIB_TRIE_STATS
- t->stats.null_node_hit++;
+ this_cpu_inc(stats->gets);
#endif
- goto backtrace;
- }
- if (IS_LEAF(n)) {
- if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
- goto found;
- else
- goto backtrace;
- }
-
-#define HL_OPTIMIZE
-#ifdef HL_OPTIMIZE
- cn = (struct tnode *)n;
-
- /*
- * It's a tnode, and we can do some extra checks here if we
- * like, to avoid descending into a dead-end branch.
- * This tnode is in the parent's child array at index
- * key[p_pos..p_pos+p_bits] but potentially with some bits
- * chopped off, so in reality the index may be just a
- * subprefix, padded with zero at the end.
- * We can also take a look at any skipped bits in this
- * tnode - everything up to p_pos is supposed to be ok,
- * and the non-chopped bits of the index (se previous
- * paragraph) are also guaranteed ok, but the rest is
- * considered unknown.
+ /* Step 1: Travel to the longest prefix match in the trie */
+ for (;;) {
+ index = get_cindex(key, n);
+
+ /* This bit of code is a bit tricky but it combines multiple
+ * checks into a single check. The prefix consists of the
+ * prefix plus zeros for the "bits" in the prefix. The index
+ * is the difference between the key and this value. From
+ * this we can actually derive several pieces of data.
+ * if (index >= (1ul << bits))
+ * we have a mismatch in skip bits and failed
+ * else
+ * we know the value is cindex
*
- * The skipped bits are key[pos+bits..cn->pos].
- */
-
- /* If current_prefix_length < pos+bits, we are already doing
- * actual prefix matching, which means everything from
- * pos+(bits-chopped_off) onward must be zero along some
- * branch of this subtree - otherwise there is *no* valid
- * prefix present. Here we can only check the skipped
- * bits. Remember, since we have already indexed into the
- * parent's child array, we know that the bits we chopped of
- * *are* zero.
+ * This check is safe even if bits == KEYLENGTH due to the
+ * fact that we can only allocate a node with 32 bits if a
+ * long is greater than 32 bits.
*/
+ if (index >= (1ul << n->bits))
+ break;
- /* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */
+ /* we have found a leaf. Prefixes have already been compared */
+ if (IS_LEAF(n))
+ goto found;
- if (current_prefix_length < pos+bits) {
- if (tkey_extract_bits(cn->key, current_prefix_length,
- cn->pos - current_prefix_length) != 0 ||
- !(cn->child[0]))
- goto backtrace;
+ /* only record pn and cindex if we are going to be chopping
+ * bits later. Otherwise we are just wasting cycles.
+ */
+ if (n->slen > n->pos) {
+ pn = n;
+ cindex = index;
}
- /*
- * If chopped_off=0, the index is fully validated and we
- * only need to look at the skipped bits for this, the new,
- * tnode. What we actually want to do is to find out if
- * these skipped bits match our key perfectly, or if we will
- * have to count on finding a matching prefix further down,
- * because if we do, we would like to have some way of
- * verifying the existence of such a prefix at this point.
- */
+ n = get_child_rcu(n, index);
+ if (unlikely(!n))
+ goto backtrace;
+ }
- /* The only thing we can do at this point is to verify that
- * any such matching prefix can indeed be a prefix to our
- * key, and if the bits in the node we are inspecting that
- * do not match our key are not ZERO, this cannot be true.
- * Thus, find out where there is a mismatch (before cn->pos)
- * and verify that all the mismatching bits are zero in the
- * new tnode's key.
- */
+ /* Step 2: Sort out leaves and begin backtracing for longest prefix */
+ for (;;) {
+ /* record the pointer where our next node pointer is stored */
+ struct key_vector __rcu **cptr = n->tnode;
- /* Note: We aren't very concerned about the piece of the key
- * that precede pn->pos+pn->bits, since these have already been
- * checked. The bits after cn->pos aren't checked since these are
- * by definition "unknown" at this point. Thus, what we want to
- * see is if we are about to enter the "prefix matching" state,
- * and in that case verify that the skipped bits that will prevail
- * throughout this subtree are zero, as they have to be if we are
- * to find a matching prefix.
+ /* This test verifies that none of the bits that differ
+ * between the key and the prefix exist in the region of
+ * the lsb and higher in the prefix.
*/
+ if (unlikely(prefix_mismatch(key, n)) || (n->slen == n->pos))
+ goto backtrace;
- node_prefix = MASK_PFX(cn->key, cn->pos);
- key_prefix = MASK_PFX(key, cn->pos);
- pref_mismatch = key_prefix^node_prefix;
- mp = 0;
+ /* exit out and process leaf */
+ if (unlikely(IS_LEAF(n)))
+ break;
- /* In short: If skipped bits in this node do not match the search
- * key, enter the "prefix matching" state.directly.
+ /* Don't bother recording parent info. Since we are in
+ * prefix match mode we will have to come back to wherever
+ * we started this traversal anyway
*/
- if (pref_mismatch) {
- while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
- mp++;
- pref_mismatch = pref_mismatch <<1;
+
+ while ((n = rcu_dereference(*cptr)) == NULL) {
+backtrace:
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ if (!n)
+ this_cpu_inc(stats->null_node_hit);
+#endif
+ /* If we are at cindex 0 there are no more bits for
+ * us to strip at this level so we must ascend back
+ * up one level to see if there are any more bits to
+ * be stripped there.
+ */
+ while (!cindex) {
+ t_key pkey = pn->key;
+
+ /* If we don't have a parent then there is
+ * nothing for us to do as we do not have any
+ * further nodes to parse.
+ */
+ if (IS_TRIE(pn)) {
+ trace_fib_table_lookup(tb->tb_id, flp,
+ NULL, -EAGAIN);
+ return -EAGAIN;
+ }
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ this_cpu_inc(stats->backtrack);
+#endif
+ /* Get Child's index */
+ pn = node_parent_rcu(pn);
+ cindex = get_index(pkey, pn);
}
- key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
- if (key_prefix != 0)
- goto backtrace;
+ /* strip the least significant bit from the cindex */
+ cindex &= cindex - 1;
- if (current_prefix_length >= cn->pos)
- current_prefix_length = mp;
+ /* grab pointer for next child node */
+ cptr = &pn->tnode[cindex];
}
-#endif
- pn = (struct tnode *)n; /* Descend */
- chopped_off = 0;
- continue;
+ }
-backtrace:
- chopped_off++;
+found:
+ /* this line carries forward the xor from earlier in the function */
+ index = key ^ n->key;
- /* As zero don't change the child key (cindex) */
- while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1))))
- chopped_off++;
+ /* Step 3: Process the leaf, if that fails fall back to backtracing */
+ hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) {
+ struct fib_info *fi = fa->fa_info;
+ struct fib_nh_common *nhc;
+ int nhsel, err;
- /* Decrease current_... with bits chopped off */
- if (current_prefix_length > pn->pos + pn->bits - chopped_off)
- current_prefix_length = pn->pos + pn->bits - chopped_off;
+ if ((BITS_PER_LONG > KEYLENGTH) || (fa->fa_slen < KEYLENGTH)) {
+ if (index >= (1ul << fa->fa_slen))
+ continue;
+ }
+ if (fa->fa_dscp && !fib_dscp_masked_match(fa->fa_dscp, flp))
+ continue;
+ /* Paired with WRITE_ONCE() in fib_release_info() */
+ if (READ_ONCE(fi->fib_dead))
+ continue;
+ if (fa->fa_info->fib_scope < flp->flowi4_scope)
+ continue;
+ fib_alias_accessed(fa);
+ err = fib_props[fa->fa_type].error;
+ if (unlikely(err < 0)) {
+out_reject:
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ this_cpu_inc(stats->semantic_match_passed);
+#endif
+ trace_fib_table_lookup(tb->tb_id, flp, NULL, err);
+ return err;
+ }
+ if (fi->fib_flags & RTNH_F_DEAD)
+ continue;
- /*
- * Either we do the actual chop off according or if we have
- * chopped off all bits in this tnode walk up to our parent.
- */
+ if (unlikely(fi->nh)) {
+ if (nexthop_is_blackhole(fi->nh)) {
+ err = fib_props[RTN_BLACKHOLE].error;
+ goto out_reject;
+ }
- if (chopped_off <= pn->bits) {
- cindex &= ~(1 << (chopped_off-1));
- } else {
- if (NODE_PARENT(pn) == NULL)
- goto failed;
+ nhc = nexthop_get_nhc_lookup(fi->nh, fib_flags, flp,
+ &nhsel);
+ if (nhc)
+ goto set_result;
+ goto miss;
+ }
- /* Get Child's index */
- cindex = tkey_extract_bits(pn->key, NODE_PARENT(pn)->pos, NODE_PARENT(pn)->bits);
- pn = NODE_PARENT(pn);
- chopped_off = 0;
+ for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
+ nhc = fib_info_nhc(fi, nhsel);
+ if (!fib_lookup_good_nhc(nhc, fib_flags, flp))
+ continue;
+set_result:
+ if (!(fib_flags & FIB_LOOKUP_NOREF))
+ refcount_inc(&fi->fib_clntref);
+
+ res->prefix = htonl(n->key);
+ res->prefixlen = KEYLENGTH - fa->fa_slen;
+ res->nh_sel = nhsel;
+ res->nhc = nhc;
+ res->type = fa->fa_type;
+ res->scope = fi->fib_scope;
+ res->dscp = fa->fa_dscp;
+ res->fi = fi;
+ res->table = tb;
+ res->fa_head = &n->leaf;
#ifdef CONFIG_IP_FIB_TRIE_STATS
- t->stats.backtrack++;
+ this_cpu_inc(stats->semantic_match_passed);
#endif
- goto backtrace;
+ trace_fib_table_lookup(tb->tb_id, flp, nhc, err);
+
+ return err;
}
}
-failed:
- ret = 1;
-found:
- rcu_read_unlock();
- return ret;
+miss:
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ this_cpu_inc(stats->semantic_match_miss);
+#endif
+ goto backtrace;
}
+EXPORT_SYMBOL_GPL(fib_table_lookup);
-/* only called from updater side */
-static int trie_leaf_remove(struct trie *t, t_key key)
+static void fib_remove_alias(struct trie *t, struct key_vector *tp,
+ struct key_vector *l, struct fib_alias *old)
{
- t_key cindex;
- struct tnode *tp = NULL;
- struct node *n = t->trie;
- struct leaf *l;
+ /* record the location of the previous list_info entry */
+ struct hlist_node **pprev = old->fa_list.pprev;
+ struct fib_alias *fa = hlist_entry(pprev, typeof(*fa), fa_list.next);
- pr_debug("entering trie_leaf_remove(%p)\n", n);
+ /* remove the fib_alias from the list */
+ hlist_del_rcu(&old->fa_list);
- /* Note that in the case skipped bits, those bits are *not* checked!
- * When we finish this, we will have NULL or a T_LEAF, and the
- * T_LEAF may or may not match our key.
+ /* if we emptied the list this leaf will be freed and we can sort
+ * out parent suffix lengths as a part of trie_rebalance
*/
-
- while (n != NULL && IS_TNODE(n)) {
- struct tnode *tn = (struct tnode *) n;
- check_tnode(tn);
- n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits));
-
- BUG_ON(n && NODE_PARENT(n) != tn);
+ if (hlist_empty(&l->leaf)) {
+ if (tp->slen == l->slen)
+ node_pull_suffix(tp, tp->pos);
+ put_child_root(tp, l->key, NULL);
+ node_free(l);
+ trie_rebalance(t, tp);
+ return;
}
- l = (struct leaf *) n;
- if (!n || !tkey_equals(l->key, key))
- return 0;
-
- /*
- * Key found.
- * Remove the leaf and rebalance the tree
- */
+ /* only access fa if it is pointing at the last valid hlist_node */
+ if (*pprev)
+ return;
- t->revision++;
- t->size--;
+ /* update the trie with the latest suffix length */
+ l->slen = fa->fa_slen;
+ node_pull_suffix(tp, fa->fa_slen);
+}
- preempt_disable();
- tp = NODE_PARENT(n);
- tnode_free((struct tnode *) n);
+static void fib_notify_alias_delete(struct net *net, u32 key,
+ struct hlist_head *fah,
+ struct fib_alias *fa_to_delete,
+ struct netlink_ext_ack *extack)
+{
+ struct fib_alias *fa_next, *fa_to_notify;
+ u32 tb_id = fa_to_delete->tb_id;
+ u8 slen = fa_to_delete->fa_slen;
+ enum fib_event_type fib_event;
- if (tp) {
- cindex = tkey_extract_bits(key, tp->pos, tp->bits);
- put_child(t, (struct tnode *)tp, cindex, NULL);
- rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
- } else
- rcu_assign_pointer(t->trie, NULL);
- preempt_enable();
+ /* Do not notify if we do not care about the route. */
+ if (fib_find_alias(fah, slen, 0, 0, tb_id, true) != fa_to_delete)
+ return;
- return 1;
+ /* Determine if the route should be replaced by the next route in the
+ * list.
+ */
+ fa_next = hlist_entry_safe(fa_to_delete->fa_list.next,
+ struct fib_alias, fa_list);
+ if (fa_next && fa_next->fa_slen == slen && fa_next->tb_id == tb_id) {
+ fib_event = FIB_EVENT_ENTRY_REPLACE;
+ fa_to_notify = fa_next;
+ } else {
+ fib_event = FIB_EVENT_ENTRY_DEL;
+ fa_to_notify = fa_to_delete;
+ }
+ call_fib_entry_notifiers(net, fib_event, key, KEYLENGTH - slen,
+ fa_to_notify, extack);
}
-static int fn_trie_delete(struct fib_table *tb, struct fib_config *cfg)
+/* Caller must hold RTNL. */
+int fib_table_delete(struct net *net, struct fib_table *tb,
+ struct fib_config *cfg, struct netlink_ext_ack *extack)
{
struct trie *t = (struct trie *) tb->tb_data;
- u32 key, mask;
- int plen = cfg->fc_dst_len;
- u8 tos = cfg->fc_tos;
struct fib_alias *fa, *fa_to_delete;
- struct list_head *fa_head;
- struct leaf *l;
- struct leaf_info *li;
-
- if (plen > 32)
- return -EINVAL;
+ struct key_vector *l, *tp;
+ u8 plen = cfg->fc_dst_len;
+ u8 slen = KEYLENGTH - plen;
+ dscp_t dscp;
+ u32 key;
key = ntohl(cfg->fc_dst);
- mask = ntohl(inet_make_mask(plen));
-
- if (key & ~mask)
- return -EINVAL;
-
- key = key & mask;
- l = fib_find_node(t, key);
+ l = fib_find_node(t, &tp, key);
if (!l)
return -ESRCH;
- fa_head = get_fa_head(l, plen);
- fa = fib_find_alias(fa_head, tos, 0);
-
+ dscp = cfg->fc_dscp;
+ fa = fib_find_alias(&l->leaf, slen, dscp, 0, tb->tb_id, false);
if (!fa)
return -ESRCH;
- pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
+ pr_debug("Deleting %08x/%d dsfield=0x%02x t=%p\n", key, plen,
+ inet_dscp_to_dsfield(dscp), t);
fa_to_delete = NULL;
- fa_head = fa->fa_list.prev;
-
- list_for_each_entry(fa, fa_head, fa_list) {
+ hlist_for_each_entry_from(fa, fa_list) {
struct fib_info *fi = fa->fa_info;
- if (fa->fa_tos != tos)
+ if ((fa->fa_slen != slen) ||
+ (fa->tb_id != tb->tb_id) ||
+ (fa->fa_dscp != dscp))
break;
if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
(cfg->fc_scope == RT_SCOPE_NOWHERE ||
- fa->fa_scope == cfg->fc_scope) &&
+ fa->fa_info->fib_scope == cfg->fc_scope) &&
+ (!cfg->fc_prefsrc ||
+ fi->fib_prefsrc == cfg->fc_prefsrc) &&
(!cfg->fc_protocol ||
fi->fib_protocol == cfg->fc_protocol) &&
- fib_nh_match(cfg, fi) == 0) {
+ fib_nh_match(net, cfg, fi, extack) == 0 &&
+ fib_metrics_match(cfg, fi)) {
fa_to_delete = fa;
break;
}
@@ -1600,376 +1736,680 @@ static int fn_trie_delete(struct fib_table *tb, struct fib_config *cfg)
if (!fa_to_delete)
return -ESRCH;
- fa = fa_to_delete;
- rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
- &cfg->fc_nlinfo);
+ fib_notify_alias_delete(net, key, &l->leaf, fa_to_delete, extack);
+ rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id,
+ &cfg->fc_nlinfo, 0);
- l = fib_find_node(t, key);
- li = find_leaf_info(l, plen);
+ if (!plen)
+ tb->tb_num_default--;
- list_del_rcu(&fa->fa_list);
+ fib_remove_alias(t, tp, l, fa_to_delete);
- if (list_empty(fa_head)) {
- hlist_del_rcu(&li->hlist);
- free_leaf_info(li);
- }
+ if (fa_to_delete->fa_state & FA_S_ACCESSED)
+ rt_cache_flush(cfg->fc_nlinfo.nl_net);
- if (hlist_empty(&l->list))
- trie_leaf_remove(t, key);
+ fib_release_info(fa_to_delete->fa_info);
+ alias_free_mem_rcu(fa_to_delete);
+ return 0;
+}
- if (fa->fa_state & FA_S_ACCESSED)
- rt_cache_flush(-1);
+/* Scan for the next leaf starting at the provided key value */
+static struct key_vector *leaf_walk_rcu(struct key_vector **tn, t_key key)
+{
+ struct key_vector *pn, *n = *tn;
+ unsigned long cindex;
- fib_release_info(fa->fa_info);
- alias_free_mem_rcu(fa);
- return 0;
+ /* this loop is meant to try and find the key in the trie */
+ do {
+ /* record parent and next child index */
+ pn = n;
+ cindex = (key > pn->key) ? get_index(key, pn) : 0;
+
+ if (cindex >> pn->bits)
+ break;
+
+ /* descend into the next child */
+ n = get_child_rcu(pn, cindex++);
+ if (!n)
+ break;
+
+ /* guarantee forward progress on the keys */
+ if (IS_LEAF(n) && (n->key >= key))
+ goto found;
+ } while (IS_TNODE(n));
+
+ /* this loop will search for the next leaf with a greater key */
+ while (!IS_TRIE(pn)) {
+ /* if we exhausted the parent node we will need to climb */
+ if (cindex >= (1ul << pn->bits)) {
+ t_key pkey = pn->key;
+
+ pn = node_parent_rcu(pn);
+ cindex = get_index(pkey, pn) + 1;
+ continue;
+ }
+
+ /* grab the next available node */
+ n = get_child_rcu(pn, cindex++);
+ if (!n)
+ continue;
+
+ /* no need to compare keys since we bumped the index */
+ if (IS_LEAF(n))
+ goto found;
+
+ /* Rescan start scanning in new node */
+ pn = n;
+ cindex = 0;
+ }
+
+ *tn = pn;
+ return NULL; /* Root of trie */
+found:
+ /* if we are at the limit for keys just return NULL for the tnode */
+ *tn = pn;
+ return n;
}
-static int trie_flush_list(struct trie *t, struct list_head *head)
+static void fib_trie_free(struct fib_table *tb)
{
- struct fib_alias *fa, *fa_node;
- int found = 0;
+ struct trie *t = (struct trie *)tb->tb_data;
+ struct key_vector *pn = t->kv;
+ unsigned long cindex = 1;
+ struct hlist_node *tmp;
+ struct fib_alias *fa;
- list_for_each_entry_safe(fa, fa_node, head, fa_list) {
- struct fib_info *fi = fa->fa_info;
+ /* walk trie in reverse order and free everything */
+ for (;;) {
+ struct key_vector *n;
- if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
- list_del_rcu(&fa->fa_list);
- fib_release_info(fa->fa_info);
+ if (!(cindex--)) {
+ t_key pkey = pn->key;
+
+ if (IS_TRIE(pn))
+ break;
+
+ n = pn;
+ pn = node_parent(pn);
+
+ /* drop emptied tnode */
+ put_child_root(pn, n->key, NULL);
+ node_free(n);
+
+ cindex = get_index(pkey, pn);
+
+ continue;
+ }
+
+ /* grab the next available node */
+ n = get_child(pn, cindex);
+ if (!n)
+ continue;
+
+ if (IS_TNODE(n)) {
+ /* record pn and cindex for leaf walking */
+ pn = n;
+ cindex = 1ul << n->bits;
+
+ continue;
+ }
+
+ hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
+ hlist_del_rcu(&fa->fa_list);
alias_free_mem_rcu(fa);
- found++;
}
+
+ put_child_root(pn, n->key, NULL);
+ node_free(n);
}
- return found;
+
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ free_percpu(t->stats);
+#endif
+ kfree(tb);
}
-static int trie_flush_leaf(struct trie *t, struct leaf *l)
+struct fib_table *fib_trie_unmerge(struct fib_table *oldtb)
{
- int found = 0;
- struct hlist_head *lih = &l->list;
- struct hlist_node *node, *tmp;
- struct leaf_info *li = NULL;
+ struct trie *ot = (struct trie *)oldtb->tb_data;
+ struct key_vector *l, *tp = ot->kv;
+ struct fib_table *local_tb;
+ struct fib_alias *fa;
+ struct trie *lt;
+ t_key key = 0;
+
+ if (oldtb->tb_data == oldtb->__data)
+ return oldtb;
+
+ local_tb = fib_trie_table(RT_TABLE_LOCAL, NULL);
+ if (!local_tb)
+ return NULL;
+
+ lt = (struct trie *)local_tb->tb_data;
+
+ while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
+ struct key_vector *local_l = NULL, *local_tp;
+
+ hlist_for_each_entry(fa, &l->leaf, fa_list) {
+ struct fib_alias *new_fa;
+
+ if (local_tb->tb_id != fa->tb_id)
+ continue;
+
+ /* clone fa for new local table */
+ new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
+ if (!new_fa)
+ goto out;
- hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
- found += trie_flush_list(t, &li->falh);
+ memcpy(new_fa, fa, sizeof(*fa));
- if (list_empty(&li->falh)) {
- hlist_del_rcu(&li->hlist);
- free_leaf_info(li);
+ /* insert clone into table */
+ if (!local_l)
+ local_l = fib_find_node(lt, &local_tp, l->key);
+
+ if (fib_insert_alias(lt, local_tp, local_l, new_fa,
+ NULL, l->key)) {
+ kmem_cache_free(fn_alias_kmem, new_fa);
+ goto out;
+ }
}
+
+ /* stop loop if key wrapped back to 0 */
+ key = l->key + 1;
+ if (key < l->key)
+ break;
}
- return found;
-}
-/* rcu_read_lock needs to be hold by caller from readside */
+ return local_tb;
+out:
+ fib_trie_free(local_tb);
-static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
+ return NULL;
+}
+
+/* Caller must hold RTNL */
+void fib_table_flush_external(struct fib_table *tb)
{
- struct node *c = (struct node *) thisleaf;
- struct tnode *p;
- int idx;
- struct node *trie = rcu_dereference(t->trie);
+ struct trie *t = (struct trie *)tb->tb_data;
+ struct key_vector *pn = t->kv;
+ unsigned long cindex = 1;
+ struct hlist_node *tmp;
+ struct fib_alias *fa;
- if (c == NULL) {
- if (trie == NULL)
- return NULL;
+ /* walk trie in reverse order */
+ for (;;) {
+ unsigned char slen = 0;
+ struct key_vector *n;
- if (IS_LEAF(trie)) /* trie w. just a leaf */
- return (struct leaf *) trie;
+ if (!(cindex--)) {
+ t_key pkey = pn->key;
- p = (struct tnode*) trie; /* Start */
- } else
- p = (struct tnode *) NODE_PARENT(c);
+ /* cannot resize the trie vector */
+ if (IS_TRIE(pn))
+ break;
- while (p) {
- int pos, last;
+ /* update the suffix to address pulled leaves */
+ if (pn->slen > pn->pos)
+ update_suffix(pn);
- /* Find the next child of the parent */
- if (c)
- pos = 1 + tkey_extract_bits(c->key, p->pos, p->bits);
- else
- pos = 0;
+ /* resize completed node */
+ pn = resize(t, pn);
+ cindex = get_index(pkey, pn);
- last = 1 << p->bits;
- for (idx = pos; idx < last ; idx++) {
- c = rcu_dereference(p->child[idx]);
+ continue;
+ }
- if (!c)
- continue;
+ /* grab the next available node */
+ n = get_child(pn, cindex);
+ if (!n)
+ continue;
- /* Decend if tnode */
- while (IS_TNODE(c)) {
- p = (struct tnode *) c;
- idx = 0;
+ if (IS_TNODE(n)) {
+ /* record pn and cindex for leaf walking */
+ pn = n;
+ cindex = 1ul << n->bits;
- /* Rightmost non-NULL branch */
- if (p && IS_TNODE(p))
- while (!(c = rcu_dereference(p->child[idx]))
- && idx < (1<<p->bits)) idx++;
+ continue;
+ }
- /* Done with this tnode? */
- if (idx >= (1 << p->bits) || !c)
- goto up;
+ hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
+ /* if alias was cloned to local then we just
+ * need to remove the local copy from main
+ */
+ if (tb->tb_id != fa->tb_id) {
+ hlist_del_rcu(&fa->fa_list);
+ alias_free_mem_rcu(fa);
+ continue;
}
- return (struct leaf *) c;
+
+ /* record local slen */
+ slen = fa->fa_slen;
+ }
+
+ /* update leaf slen */
+ n->slen = slen;
+
+ if (hlist_empty(&n->leaf)) {
+ put_child_root(pn, n->key, NULL);
+ node_free(n);
}
-up:
- /* No more children go up one step */
- c = (struct node *) p;
- p = (struct tnode *) NODE_PARENT(p);
}
- return NULL; /* Ready. Root of trie */
}
-static int fn_trie_flush(struct fib_table *tb)
+/* Caller must hold RTNL. */
+int fib_table_flush(struct net *net, struct fib_table *tb, bool flush_all)
{
- struct trie *t = (struct trie *) tb->tb_data;
- struct leaf *ll = NULL, *l = NULL;
- int found = 0, h;
+ struct trie *t = (struct trie *)tb->tb_data;
+ struct nl_info info = { .nl_net = net };
+ struct key_vector *pn = t->kv;
+ unsigned long cindex = 1;
+ struct hlist_node *tmp;
+ struct fib_alias *fa;
+ int found = 0;
- t->revision++;
+ /* walk trie in reverse order */
+ for (;;) {
+ unsigned char slen = 0;
+ struct key_vector *n;
- for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
- found += trie_flush_leaf(t, l);
+ if (!(cindex--)) {
+ t_key pkey = pn->key;
- if (ll && hlist_empty(&ll->list))
- trie_leaf_remove(t, ll->key);
- ll = l;
- }
+ /* cannot resize the trie vector */
+ if (IS_TRIE(pn))
+ break;
+
+ /* update the suffix to address pulled leaves */
+ if (pn->slen > pn->pos)
+ update_suffix(pn);
+
+ /* resize completed node */
+ pn = resize(t, pn);
+ cindex = get_index(pkey, pn);
+
+ continue;
+ }
+
+ /* grab the next available node */
+ n = get_child(pn, cindex);
+ if (!n)
+ continue;
+
+ if (IS_TNODE(n)) {
+ /* record pn and cindex for leaf walking */
+ pn = n;
+ cindex = 1ul << n->bits;
+
+ continue;
+ }
- if (ll && hlist_empty(&ll->list))
- trie_leaf_remove(t, ll->key);
+ hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
+ struct fib_info *fi = fa->fa_info;
+
+ if (!fi || tb->tb_id != fa->tb_id ||
+ (!(fi->fib_flags & RTNH_F_DEAD) &&
+ !fib_props[fa->fa_type].error)) {
+ slen = fa->fa_slen;
+ continue;
+ }
+
+ /* Do not flush error routes if network namespace is
+ * not being dismantled
+ */
+ if (!flush_all && fib_props[fa->fa_type].error) {
+ slen = fa->fa_slen;
+ continue;
+ }
+
+ fib_notify_alias_delete(net, n->key, &n->leaf, fa,
+ NULL);
+ if (fi->pfsrc_removed)
+ rtmsg_fib(RTM_DELROUTE, htonl(n->key), fa,
+ KEYLENGTH - fa->fa_slen, tb->tb_id, &info, 0);
+ hlist_del_rcu(&fa->fa_list);
+ fib_release_info(fa->fa_info);
+ alias_free_mem_rcu(fa);
+ found++;
+ }
+
+ /* update leaf slen */
+ n->slen = slen;
+
+ if (hlist_empty(&n->leaf)) {
+ put_child_root(pn, n->key, NULL);
+ node_free(n);
+ }
+ }
pr_debug("trie_flush found=%d\n", found);
return found;
}
-static int trie_last_dflt = -1;
-
-static void
-fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
+/* derived from fib_trie_free */
+static void __fib_info_notify_update(struct net *net, struct fib_table *tb,
+ struct nl_info *info)
{
- struct trie *t = (struct trie *) tb->tb_data;
- int order, last_idx;
- struct fib_info *fi = NULL;
- struct fib_info *last_resort;
- struct fib_alias *fa = NULL;
- struct list_head *fa_head;
- struct leaf *l;
+ struct trie *t = (struct trie *)tb->tb_data;
+ struct key_vector *pn = t->kv;
+ unsigned long cindex = 1;
+ struct fib_alias *fa;
- last_idx = -1;
- last_resort = NULL;
- order = -1;
+ for (;;) {
+ struct key_vector *n;
- rcu_read_lock();
+ if (!(cindex--)) {
+ t_key pkey = pn->key;
- l = fib_find_node(t, 0);
- if (!l)
- goto out;
+ if (IS_TRIE(pn))
+ break;
- fa_head = get_fa_head(l, 0);
- if (!fa_head)
- goto out;
+ pn = node_parent(pn);
+ cindex = get_index(pkey, pn);
+ continue;
+ }
- if (list_empty(fa_head))
- goto out;
+ /* grab the next available node */
+ n = get_child(pn, cindex);
+ if (!n)
+ continue;
- list_for_each_entry_rcu(fa, fa_head, fa_list) {
- struct fib_info *next_fi = fa->fa_info;
+ if (IS_TNODE(n)) {
+ /* record pn and cindex for leaf walking */
+ pn = n;
+ cindex = 1ul << n->bits;
- if (fa->fa_scope != res->scope ||
- fa->fa_type != RTN_UNICAST)
continue;
+ }
- if (next_fi->fib_priority > res->fi->fib_priority)
- break;
- if (!next_fi->fib_nh[0].nh_gw ||
- next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
- continue;
- fa->fa_state |= FA_S_ACCESSED;
+ hlist_for_each_entry(fa, &n->leaf, fa_list) {
+ struct fib_info *fi = fa->fa_info;
- if (fi == NULL) {
- if (next_fi != res->fi)
- break;
- } else if (!fib_detect_death(fi, order, &last_resort,
- &last_idx, &trie_last_dflt)) {
- if (res->fi)
- fib_info_put(res->fi);
- res->fi = fi;
- atomic_inc(&fi->fib_clntref);
- trie_last_dflt = order;
- goto out;
+ if (!fi || !fi->nh_updated || fa->tb_id != tb->tb_id)
+ continue;
+
+ rtmsg_fib(RTM_NEWROUTE, htonl(n->key), fa,
+ KEYLENGTH - fa->fa_slen, tb->tb_id,
+ info, NLM_F_REPLACE);
}
- fi = next_fi;
- order++;
}
- if (order <= 0 || fi == NULL) {
- trie_last_dflt = -1;
- goto out;
+}
+
+void fib_info_notify_update(struct net *net, struct nl_info *info)
+{
+ unsigned int h;
+
+ for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
+ struct hlist_head *head = &net->ipv4.fib_table_hash[h];
+ struct fib_table *tb;
+
+ hlist_for_each_entry_rcu(tb, head, tb_hlist,
+ lockdep_rtnl_is_held())
+ __fib_info_notify_update(net, tb, info);
}
+}
- if (!fib_detect_death(fi, order, &last_resort, &last_idx, &trie_last_dflt)) {
- if (res->fi)
- fib_info_put(res->fi);
- res->fi = fi;
- atomic_inc(&fi->fib_clntref);
- trie_last_dflt = order;
- goto out;
+static int fib_leaf_notify(struct key_vector *l, struct fib_table *tb,
+ struct notifier_block *nb,
+ struct netlink_ext_ack *extack)
+{
+ struct fib_alias *fa;
+ int last_slen = -1;
+ int err;
+
+ hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
+ struct fib_info *fi = fa->fa_info;
+
+ if (!fi)
+ continue;
+
+ /* local and main table can share the same trie,
+ * so don't notify twice for the same entry.
+ */
+ if (tb->tb_id != fa->tb_id)
+ continue;
+
+ if (fa->fa_slen == last_slen)
+ continue;
+
+ last_slen = fa->fa_slen;
+ err = call_fib_entry_notifier(nb, FIB_EVENT_ENTRY_REPLACE,
+ l->key, KEYLENGTH - fa->fa_slen,
+ fa, extack);
+ if (err)
+ return err;
}
- if (last_idx >= 0) {
- if (res->fi)
- fib_info_put(res->fi);
- res->fi = last_resort;
- if (last_resort)
- atomic_inc(&last_resort->fib_clntref);
+ return 0;
+}
+
+static int fib_table_notify(struct fib_table *tb, struct notifier_block *nb,
+ struct netlink_ext_ack *extack)
+{
+ struct trie *t = (struct trie *)tb->tb_data;
+ struct key_vector *l, *tp = t->kv;
+ t_key key = 0;
+ int err;
+
+ while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
+ err = fib_leaf_notify(l, tb, nb, extack);
+ if (err)
+ return err;
+
+ key = l->key + 1;
+ /* stop in case of wrap around */
+ if (key < l->key)
+ break;
}
- trie_last_dflt = last_idx;
- out:;
- rcu_read_unlock();
+ return 0;
}
-static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb,
- struct sk_buff *skb, struct netlink_callback *cb)
+int fib_notify(struct net *net, struct notifier_block *nb,
+ struct netlink_ext_ack *extack)
{
- int i, s_i;
+ unsigned int h;
+ int err;
+
+ for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
+ struct hlist_head *head = &net->ipv4.fib_table_hash[h];
+ struct fib_table *tb;
+
+ hlist_for_each_entry_rcu(tb, head, tb_hlist) {
+ err = fib_table_notify(tb, nb, extack);
+ if (err)
+ return err;
+ }
+ }
+ return 0;
+}
+
+static void __trie_free_rcu(struct rcu_head *head)
+{
+ struct fib_table *tb = container_of(head, struct fib_table, rcu);
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ struct trie *t = (struct trie *)tb->tb_data;
+
+ if (tb->tb_data == tb->__data)
+ free_percpu(t->stats);
+#endif /* CONFIG_IP_FIB_TRIE_STATS */
+ kfree(tb);
+}
+
+void fib_free_table(struct fib_table *tb)
+{
+ call_rcu(&tb->rcu, __trie_free_rcu);
+}
+
+static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
+ struct sk_buff *skb, struct netlink_callback *cb,
+ struct fib_dump_filter *filter)
+{
+ unsigned int flags = NLM_F_MULTI;
+ __be32 xkey = htonl(l->key);
+ int i, s_i, i_fa, s_fa, err;
struct fib_alias *fa;
- __be32 xkey = htonl(key);
+ if (filter->filter_set ||
+ !filter->dump_exceptions || !filter->dump_routes)
+ flags |= NLM_F_DUMP_FILTERED;
s_i = cb->args[4];
+ s_fa = cb->args[5];
i = 0;
/* rcu_read_lock is hold by caller */
+ hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
+ struct fib_info *fi = fa->fa_info;
- list_for_each_entry_rcu(fa, fah, fa_list) {
- if (i < s_i) {
- i++;
- continue;
+ if (i < s_i)
+ goto next;
+
+ i_fa = 0;
+
+ if (tb->tb_id != fa->tb_id)
+ goto next;
+
+ if (filter->filter_set) {
+ if (filter->rt_type && fa->fa_type != filter->rt_type)
+ goto next;
+
+ if ((filter->protocol &&
+ fi->fib_protocol != filter->protocol))
+ goto next;
+
+ if (filter->dev &&
+ !fib_info_nh_uses_dev(fi, filter->dev))
+ goto next;
}
- BUG_ON(!fa->fa_info);
-
- if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
- cb->nlh->nlmsg_seq,
- RTM_NEWROUTE,
- tb->tb_id,
- fa->fa_type,
- fa->fa_scope,
- xkey,
- plen,
- fa->fa_tos,
- fa->fa_info, 0) < 0) {
- cb->args[4] = i;
- return -1;
+
+ if (filter->dump_routes) {
+ if (!s_fa) {
+ struct fib_rt_info fri;
+
+ fri.fi = fi;
+ fri.tb_id = tb->tb_id;
+ fri.dst = xkey;
+ fri.dst_len = KEYLENGTH - fa->fa_slen;
+ fri.dscp = fa->fa_dscp;
+ fri.type = fa->fa_type;
+ fri.offload = READ_ONCE(fa->offload);
+ fri.trap = READ_ONCE(fa->trap);
+ fri.offload_failed = READ_ONCE(fa->offload_failed);
+ err = fib_dump_info(skb,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWROUTE, &fri, flags);
+ if (err < 0)
+ goto stop;
+ }
+
+ i_fa++;
}
+
+ if (filter->dump_exceptions) {
+ err = fib_dump_info_fnhe(skb, cb, tb->tb_id, fi,
+ &i_fa, s_fa, flags);
+ if (err < 0)
+ goto stop;
+ }
+
+next:
i++;
}
+
cb->args[4] = i;
return skb->len;
+
+stop:
+ cb->args[4] = i;
+ cb->args[5] = i_fa;
+ return err;
}
-static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, struct sk_buff *skb,
- struct netlink_callback *cb)
+/* rcu_read_lock needs to be hold by caller from readside */
+int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
+ struct netlink_callback *cb, struct fib_dump_filter *filter)
{
- int h, s_h;
- struct list_head *fa_head;
- struct leaf *l = NULL;
+ struct trie *t = (struct trie *)tb->tb_data;
+ struct key_vector *l, *tp = t->kv;
+ /* Dump starting at last key.
+ * Note: 0.0.0.0/0 (ie default) is first key.
+ */
+ int count = cb->args[2];
+ t_key key = cb->args[3];
- s_h = cb->args[3];
+ /* First time here, count and key are both always 0. Count > 0
+ * and key == 0 means the dump has wrapped around and we are done.
+ */
+ if (count && !key)
+ return 0;
- for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
- if (h < s_h)
- continue;
- if (h > s_h)
- memset(&cb->args[4], 0,
- sizeof(cb->args) - 4*sizeof(cb->args[0]));
+ while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
+ int err;
- fa_head = get_fa_head(l, plen);
+ err = fn_trie_dump_leaf(l, tb, skb, cb, filter);
+ if (err < 0) {
+ cb->args[3] = key;
+ cb->args[2] = count;
+ return err;
+ }
- if (!fa_head)
- continue;
+ ++count;
+ key = l->key + 1;
- if (list_empty(fa_head))
- continue;
+ memset(&cb->args[4], 0,
+ sizeof(cb->args) - 4*sizeof(cb->args[0]));
- if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) {
- cb->args[3] = h;
- return -1;
- }
+ /* stop loop if key wrapped back to 0 */
+ if (key < l->key)
+ break;
}
- cb->args[3] = h;
- return skb->len;
-}
-static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb)
-{
- int m, s_m;
- struct trie *t = (struct trie *) tb->tb_data;
+ cb->args[3] = key;
+ cb->args[2] = count;
- s_m = cb->args[2];
+ return 0;
+}
- rcu_read_lock();
- for (m = 0; m <= 32; m++) {
- if (m < s_m)
- continue;
- if (m > s_m)
- memset(&cb->args[3], 0,
- sizeof(cb->args) - 3*sizeof(cb->args[0]));
+void __init fib_trie_init(void)
+{
+ fn_alias_kmem = kmem_cache_create("ip_fib_alias",
+ sizeof(struct fib_alias),
+ 0, SLAB_PANIC | SLAB_ACCOUNT, NULL);
- if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) {
- cb->args[2] = m;
- goto out;
- }
- }
- rcu_read_unlock();
- cb->args[2] = m;
- return skb->len;
-out:
- rcu_read_unlock();
- return -1;
+ trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
+ LEAF_SIZE,
+ 0, SLAB_PANIC | SLAB_ACCOUNT, NULL);
}
-/* Fix more generic FIB names for init later */
-
-#ifdef CONFIG_IP_MULTIPLE_TABLES
-struct fib_table * fib_hash_init(u32 id)
-#else
-struct fib_table * __init fib_hash_init(u32 id)
-#endif
+struct fib_table *fib_trie_table(u32 id, struct fib_table *alias)
{
struct fib_table *tb;
struct trie *t;
+ size_t sz = sizeof(*tb);
- if (fn_alias_kmem == NULL)
- fn_alias_kmem = kmem_cache_create("ip_fib_alias",
- sizeof(struct fib_alias),
- 0, SLAB_HWCACHE_ALIGN,
- NULL, NULL);
+ if (!alias)
+ sz += sizeof(struct trie);
- tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
- GFP_KERNEL);
- if (tb == NULL)
+ tb = kzalloc(sz, GFP_KERNEL);
+ if (!tb)
return NULL;
tb->tb_id = id;
- tb->tb_lookup = fn_trie_lookup;
- tb->tb_insert = fn_trie_insert;
- tb->tb_delete = fn_trie_delete;
- tb->tb_flush = fn_trie_flush;
- tb->tb_select_default = fn_trie_select_default;
- tb->tb_dump = fn_trie_dump;
- memset(tb->tb_data, 0, sizeof(struct trie));
-
- t = (struct trie *) tb->tb_data;
-
- trie_init(t);
+ tb->tb_num_default = 0;
+ tb->tb_data = (alias ? alias->__data : tb->__data);
- if (id == RT_TABLE_LOCAL)
- trie_local = t;
- else if (id == RT_TABLE_MAIN)
- trie_main = t;
+ if (alias)
+ return tb;
- if (id == RT_TABLE_LOCAL)
- printk(KERN_INFO "IPv4 FIB: Using LC-trie version %s\n", VERSION);
+ t = (struct trie *) tb->tb_data;
+ t->kv[0].pos = KEYLENGTH;
+ t->kv[0].slen = KEYLENGTH;
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ t->stats = alloc_percpu(struct trie_use_stats);
+ if (!t->stats) {
+ kfree(tb);
+ tb = NULL;
+ }
+#endif
return tb;
}
@@ -1977,102 +2417,106 @@ struct fib_table * __init fib_hash_init(u32 id)
#ifdef CONFIG_PROC_FS
/* Depth first Trie walk iterator */
struct fib_trie_iter {
- struct tnode *tnode;
- struct trie *trie;
- unsigned index;
- unsigned depth;
+ struct seq_net_private p;
+ struct fib_table *tb;
+ struct key_vector *tnode;
+ unsigned int index;
+ unsigned int depth;
};
-static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
+static struct key_vector *fib_trie_get_next(struct fib_trie_iter *iter)
{
- struct tnode *tn = iter->tnode;
- unsigned cindex = iter->index;
- struct tnode *p;
+ unsigned long cindex = iter->index;
+ struct key_vector *pn = iter->tnode;
+ t_key pkey;
pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
iter->tnode, iter->index, iter->depth);
-rescan:
- while (cindex < (1<<tn->bits)) {
- struct node *n = tnode_get_child(tn, cindex);
- if (n) {
+ while (!IS_TRIE(pn)) {
+ while (cindex < child_length(pn)) {
+ struct key_vector *n = get_child_rcu(pn, cindex++);
+
+ if (!n)
+ continue;
+
if (IS_LEAF(n)) {
- iter->tnode = tn;
- iter->index = cindex + 1;
+ iter->tnode = pn;
+ iter->index = cindex;
} else {
/* push down one level */
- iter->tnode = (struct tnode *) n;
+ iter->tnode = n;
iter->index = 0;
++iter->depth;
}
+
return n;
}
- ++cindex;
- }
-
- /* Current node exhausted, pop back up */
- p = NODE_PARENT(tn);
- if (p) {
- cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
- tn = p;
+ /* Current node exhausted, pop back up */
+ pkey = pn->key;
+ pn = node_parent_rcu(pn);
+ cindex = get_index(pkey, pn) + 1;
--iter->depth;
- goto rescan;
}
- /* got root? */
+ /* record root node so further searches know we are done */
+ iter->tnode = pn;
+ iter->index = 0;
+
return NULL;
}
-static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
- struct trie *t)
+static struct key_vector *fib_trie_get_first(struct fib_trie_iter *iter,
+ struct trie *t)
{
- struct node *n ;
+ struct key_vector *n, *pn;
- if(!t)
+ if (!t)
return NULL;
- n = rcu_dereference(t->trie);
-
- if(!iter)
+ pn = t->kv;
+ n = rcu_dereference(pn->tnode[0]);
+ if (!n)
return NULL;
- if (n && IS_TNODE(n)) {
- iter->tnode = (struct tnode *) n;
- iter->trie = t;
+ if (IS_TNODE(n)) {
+ iter->tnode = n;
iter->index = 0;
iter->depth = 1;
- return n;
+ } else {
+ iter->tnode = pn;
+ iter->index = 0;
+ iter->depth = 0;
}
- return NULL;
+
+ return n;
}
static void trie_collect_stats(struct trie *t, struct trie_stat *s)
{
- struct node *n;
+ struct key_vector *n;
struct fib_trie_iter iter;
memset(s, 0, sizeof(*s));
rcu_read_lock();
- for (n = fib_trie_get_first(&iter, t); n;
- n = fib_trie_get_next(&iter)) {
+ for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
if (IS_LEAF(n)) {
+ struct fib_alias *fa;
+
s->leaves++;
s->totdepth += iter.depth;
if (iter.depth > s->maxdepth)
s->maxdepth = iter.depth;
- } else {
- const struct tnode *tn = (const struct tnode *) n;
- int i;
+ hlist_for_each_entry_rcu(fa, &n->leaf, fa_list)
+ ++s->prefixes;
+ } else {
s->tnodes++;
- if(tn->bits < MAX_STAT_DEPTH)
- s->nodesizes[tn->bits]++;
-
- for (i = 0; i < (1<<tn->bits); i++)
- if (!tn->child[i])
- s->nullpointers++;
+ if (n->bits < MAX_STAT_DEPTH)
+ s->nodesizes[n->bits]++;
+ s->nullpointers += tn_info(n)->empty_children;
}
}
rcu_read_unlock();
@@ -2083,169 +2527,223 @@ static void trie_collect_stats(struct trie *t, struct trie_stat *s)
*/
static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
{
- unsigned i, max, pointers, bytes, avdepth;
+ unsigned int i, max, pointers, bytes, avdepth;
if (stat->leaves)
avdepth = stat->totdepth*100 / stat->leaves;
else
avdepth = 0;
- seq_printf(seq, "\tAver depth: %d.%02d\n", avdepth / 100, avdepth % 100 );
+ seq_printf(seq, "\tAver depth: %u.%02d\n",
+ avdepth / 100, avdepth % 100);
seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth);
seq_printf(seq, "\tLeaves: %u\n", stat->leaves);
+ bytes = LEAF_SIZE * stat->leaves;
+
+ seq_printf(seq, "\tPrefixes: %u\n", stat->prefixes);
+ bytes += sizeof(struct fib_alias) * stat->prefixes;
- bytes = sizeof(struct leaf) * stat->leaves;
- seq_printf(seq, "\tInternal nodes: %d\n\t", stat->tnodes);
- bytes += sizeof(struct tnode) * stat->tnodes;
+ seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
+ bytes += TNODE_SIZE(0) * stat->tnodes;
max = MAX_STAT_DEPTH;
while (max > 0 && stat->nodesizes[max-1] == 0)
max--;
pointers = 0;
- for (i = 1; i <= max; i++)
+ for (i = 1; i < max; i++)
if (stat->nodesizes[i] != 0) {
- seq_printf(seq, " %d: %d", i, stat->nodesizes[i]);
+ seq_printf(seq, " %u: %u", i, stat->nodesizes[i]);
pointers += (1<<i) * stat->nodesizes[i];
}
seq_putc(seq, '\n');
- seq_printf(seq, "\tPointers: %d\n", pointers);
+ seq_printf(seq, "\tPointers: %u\n", pointers);
- bytes += sizeof(struct node *) * pointers;
- seq_printf(seq, "Null ptrs: %d\n", stat->nullpointers);
- seq_printf(seq, "Total size: %d kB\n", (bytes + 1023) / 1024);
+ bytes += sizeof(struct key_vector *) * pointers;
+ seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
+ seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024);
+}
#ifdef CONFIG_IP_FIB_TRIE_STATS
- seq_printf(seq, "Counters:\n---------\n");
- seq_printf(seq,"gets = %d\n", t->stats.gets);
- seq_printf(seq,"backtracks = %d\n", t->stats.backtrack);
- seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed);
- seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss);
- seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit);
- seq_printf(seq,"skipped node resize = %d\n", t->stats.resize_node_skipped);
-#ifdef CLEAR_STATS
- memset(&(t->stats), 0, sizeof(t->stats));
-#endif
+static void trie_show_usage(struct seq_file *seq,
+ const struct trie_use_stats __percpu *stats)
+{
+ struct trie_use_stats s = { 0 };
+ int cpu;
+
+ /* loop through all of the CPUs and gather up the stats */
+ for_each_possible_cpu(cpu) {
+ const struct trie_use_stats *pcpu = per_cpu_ptr(stats, cpu);
+
+ s.gets += pcpu->gets;
+ s.backtrack += pcpu->backtrack;
+ s.semantic_match_passed += pcpu->semantic_match_passed;
+ s.semantic_match_miss += pcpu->semantic_match_miss;
+ s.null_node_hit += pcpu->null_node_hit;
+ s.resize_node_skipped += pcpu->resize_node_skipped;
+ }
+
+ seq_printf(seq, "\nCounters:\n---------\n");
+ seq_printf(seq, "gets = %u\n", s.gets);
+ seq_printf(seq, "backtracks = %u\n", s.backtrack);
+ seq_printf(seq, "semantic match passed = %u\n",
+ s.semantic_match_passed);
+ seq_printf(seq, "semantic match miss = %u\n", s.semantic_match_miss);
+ seq_printf(seq, "null node hit= %u\n", s.null_node_hit);
+ seq_printf(seq, "skipped node resize = %u\n\n", s.resize_node_skipped);
+}
#endif /* CONFIG_IP_FIB_TRIE_STATS */
+
+static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
+{
+ if (tb->tb_id == RT_TABLE_LOCAL)
+ seq_puts(seq, "Local:\n");
+ else if (tb->tb_id == RT_TABLE_MAIN)
+ seq_puts(seq, "Main:\n");
+ else
+ seq_printf(seq, "Id %d:\n", tb->tb_id);
}
+
static int fib_triestat_seq_show(struct seq_file *seq, void *v)
{
- struct trie_stat *stat;
+ struct net *net = seq->private;
+ unsigned int h;
- stat = kmalloc(sizeof(*stat), GFP_KERNEL);
- if (!stat)
- return -ENOMEM;
+ seq_printf(seq,
+ "Basic info: size of leaf:"
+ " %zd bytes, size of tnode: %zd bytes.\n",
+ LEAF_SIZE, TNODE_SIZE(0));
- seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n",
- sizeof(struct leaf), sizeof(struct tnode));
+ rcu_read_lock();
+ for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
+ struct hlist_head *head = &net->ipv4.fib_table_hash[h];
+ struct fib_table *tb;
- if (trie_local) {
- seq_printf(seq, "Local:\n");
- trie_collect_stats(trie_local, stat);
- trie_show_stats(seq, stat);
- }
+ hlist_for_each_entry_rcu(tb, head, tb_hlist) {
+ struct trie *t = (struct trie *) tb->tb_data;
+ struct trie_stat stat;
- if (trie_main) {
- seq_printf(seq, "Main:\n");
- trie_collect_stats(trie_main, stat);
- trie_show_stats(seq, stat);
+ if (!t)
+ continue;
+
+ fib_table_print(seq, tb);
+
+ trie_collect_stats(t, &stat);
+ trie_show_stats(seq, &stat);
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ trie_show_usage(seq, t->stats);
+#endif
+ }
+ cond_resched_rcu();
}
- kfree(stat);
+ rcu_read_unlock();
return 0;
}
-static int fib_triestat_seq_open(struct inode *inode, struct file *file)
+static struct key_vector *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
{
- return single_open(file, fib_triestat_seq_show, NULL);
-}
+ struct fib_trie_iter *iter = seq->private;
+ struct net *net = seq_file_net(seq);
+ loff_t idx = 0;
+ unsigned int h;
-static struct file_operations fib_triestat_fops = {
- .owner = THIS_MODULE,
- .open = fib_triestat_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
+ for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
+ struct hlist_head *head = &net->ipv4.fib_table_hash[h];
+ struct fib_table *tb;
-static struct node *fib_trie_get_idx(struct fib_trie_iter *iter,
- loff_t pos)
-{
- loff_t idx = 0;
- struct node *n;
+ hlist_for_each_entry_rcu(tb, head, tb_hlist) {
+ struct key_vector *n;
- for (n = fib_trie_get_first(iter, trie_local);
- n; ++idx, n = fib_trie_get_next(iter)) {
- if (pos == idx)
- return n;
+ for (n = fib_trie_get_first(iter,
+ (struct trie *) tb->tb_data);
+ n; n = fib_trie_get_next(iter))
+ if (pos == idx++) {
+ iter->tb = tb;
+ return n;
+ }
+ }
}
- for (n = fib_trie_get_first(iter, trie_main);
- n; ++idx, n = fib_trie_get_next(iter)) {
- if (pos == idx)
- return n;
- }
return NULL;
}
static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(RCU)
{
rcu_read_lock();
- if (*pos == 0)
- return SEQ_START_TOKEN;
- return fib_trie_get_idx(seq->private, *pos - 1);
+ return fib_trie_get_idx(seq, *pos);
}
static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct fib_trie_iter *iter = seq->private;
- void *l = v;
+ struct net *net = seq_file_net(seq);
+ struct fib_table *tb = iter->tb;
+ struct hlist_node *tb_node;
+ unsigned int h;
+ struct key_vector *n;
++*pos;
- if (v == SEQ_START_TOKEN)
- return fib_trie_get_idx(iter, 0);
-
- v = fib_trie_get_next(iter);
- BUG_ON(v == l);
- if (v)
- return v;
+ /* next node in same table */
+ n = fib_trie_get_next(iter);
+ if (n)
+ return n;
- /* continue scan in next trie */
- if (iter->trie == trie_local)
- return fib_trie_get_first(iter, trie_main);
+ /* walk rest of this hash chain */
+ h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
+ while ((tb_node = rcu_dereference(hlist_next_rcu(&tb->tb_hlist)))) {
+ tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
+ n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
+ if (n)
+ goto found;
+ }
+ /* new hash chain */
+ while (++h < FIB_TABLE_HASHSZ) {
+ struct hlist_head *head = &net->ipv4.fib_table_hash[h];
+ hlist_for_each_entry_rcu(tb, head, tb_hlist) {
+ n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
+ if (n)
+ goto found;
+ }
+ }
return NULL;
+
+found:
+ iter->tb = tb;
+ return n;
}
static void fib_trie_seq_stop(struct seq_file *seq, void *v)
+ __releases(RCU)
{
rcu_read_unlock();
}
static void seq_indent(struct seq_file *seq, int n)
{
- while (n-- > 0) seq_puts(seq, " ");
+ while (n-- > 0)
+ seq_puts(seq, " ");
}
-static inline const char *rtn_scope(enum rt_scope_t s)
+static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
{
- static char buf[32];
-
- switch(s) {
+ switch (s) {
case RT_SCOPE_UNIVERSE: return "universe";
case RT_SCOPE_SITE: return "site";
case RT_SCOPE_LINK: return "link";
case RT_SCOPE_HOST: return "host";
case RT_SCOPE_NOWHERE: return "nowhere";
default:
- snprintf(buf, sizeof(buf), "scope=%d", s);
+ snprintf(buf, len, "scope=%d", s);
return buf;
}
}
-static const char *rtn_type_names[__RTN_MAX] = {
+static const char *const rtn_type_names[__RTN_MAX] = {
[RTN_UNSPEC] = "UNSPEC",
[RTN_UNICAST] = "UNICAST",
[RTN_LOCAL] = "LOCAL",
@@ -2260,13 +2758,11 @@ static const char *rtn_type_names[__RTN_MAX] = {
[RTN_XRESOLVE] = "XRESOLVE",
};
-static inline const char *rtn_type(unsigned t)
+static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
{
- static char buf[32];
-
if (t < __RTN_MAX && rtn_type_names[t])
return rtn_type_names[t];
- snprintf(buf, sizeof(buf), "type %d", t);
+ snprintf(buf, len, "type %u", t);
return buf;
}
@@ -2274,101 +2770,161 @@ static inline const char *rtn_type(unsigned t)
static int fib_trie_seq_show(struct seq_file *seq, void *v)
{
const struct fib_trie_iter *iter = seq->private;
- struct node *n = v;
+ struct key_vector *n = v;
- if (v == SEQ_START_TOKEN)
- return 0;
+ if (IS_TRIE(node_parent_rcu(n)))
+ fib_table_print(seq, iter->tb);
if (IS_TNODE(n)) {
- struct tnode *tn = (struct tnode *) n;
- __be32 prf = htonl(MASK_PFX(tn->key, tn->pos));
-
- if (!NODE_PARENT(n)) {
- if (iter->trie == trie_local)
- seq_puts(seq, "<local>:\n");
- else
- seq_puts(seq, "<main>:\n");
- }
+ __be32 prf = htonl(n->key);
+
seq_indent(seq, iter->depth-1);
- seq_printf(seq, " +-- %d.%d.%d.%d/%d %d %d %d\n",
- NIPQUAD(prf), tn->pos, tn->bits, tn->full_children,
- tn->empty_children);
-
+ seq_printf(seq, " +-- %pI4/%zu %u %u %u\n",
+ &prf, KEYLENGTH - n->pos - n->bits, n->bits,
+ tn_info(n)->full_children,
+ tn_info(n)->empty_children);
} else {
- struct leaf *l = (struct leaf *) n;
- int i;
- __be32 val = htonl(l->key);
+ __be32 val = htonl(n->key);
+ struct fib_alias *fa;
seq_indent(seq, iter->depth);
- seq_printf(seq, " |-- %d.%d.%d.%d\n", NIPQUAD(val));
- for (i = 32; i >= 0; i--) {
- struct leaf_info *li = find_leaf_info(l, i);
- if (li) {
- struct fib_alias *fa;
- list_for_each_entry_rcu(fa, &li->falh, fa_list) {
- seq_indent(seq, iter->depth+1);
- seq_printf(seq, " /%d %s %s", i,
- rtn_scope(fa->fa_scope),
- rtn_type(fa->fa_type));
- if (fa->fa_tos)
- seq_printf(seq, "tos =%d\n",
- fa->fa_tos);
- seq_putc(seq, '\n');
- }
- }
+ seq_printf(seq, " |-- %pI4\n", &val);
+
+ hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) {
+ char buf1[32], buf2[32];
+
+ seq_indent(seq, iter->depth + 1);
+ seq_printf(seq, " /%zu %s %s",
+ KEYLENGTH - fa->fa_slen,
+ rtn_scope(buf1, sizeof(buf1),
+ fa->fa_info->fib_scope),
+ rtn_type(buf2, sizeof(buf2),
+ fa->fa_type));
+ if (fa->fa_dscp)
+ seq_printf(seq, " tos=%d",
+ inet_dscp_to_dsfield(fa->fa_dscp));
+ seq_putc(seq, '\n');
}
}
return 0;
}
-static struct seq_operations fib_trie_seq_ops = {
+static const struct seq_operations fib_trie_seq_ops = {
.start = fib_trie_seq_start,
.next = fib_trie_seq_next,
.stop = fib_trie_seq_stop,
.show = fib_trie_seq_show,
};
-static int fib_trie_seq_open(struct inode *inode, struct file *file)
+struct fib_route_iter {
+ struct seq_net_private p;
+ struct fib_table *main_tb;
+ struct key_vector *tnode;
+ loff_t pos;
+ t_key key;
+};
+
+static struct key_vector *fib_route_get_idx(struct fib_route_iter *iter,
+ loff_t pos)
{
- struct seq_file *seq;
- int rc = -ENOMEM;
- struct fib_trie_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
+ struct key_vector *l, **tp = &iter->tnode;
+ t_key key;
- if (!s)
- goto out;
+ /* use cached location of previously found key */
+ if (iter->pos > 0 && pos >= iter->pos) {
+ key = iter->key;
+ } else {
+ iter->pos = 1;
+ key = 0;
+ }
- rc = seq_open(file, &fib_trie_seq_ops);
- if (rc)
- goto out_kfree;
+ pos -= iter->pos;
- seq = file->private_data;
- seq->private = s;
- memset(s, 0, sizeof(*s));
-out:
- return rc;
-out_kfree:
- kfree(s);
- goto out;
+ while ((l = leaf_walk_rcu(tp, key)) && (pos-- > 0)) {
+ key = l->key + 1;
+ iter->pos++;
+ l = NULL;
+
+ /* handle unlikely case of a key wrap */
+ if (!key)
+ break;
+ }
+
+ if (l)
+ iter->key = l->key; /* remember it */
+ else
+ iter->pos = 0; /* forget it */
+
+ return l;
}
-static struct file_operations fib_trie_fops = {
- .owner = THIS_MODULE,
- .open = fib_trie_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release_private,
-};
+static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(RCU)
+{
+ struct fib_route_iter *iter = seq->private;
+ struct fib_table *tb;
+ struct trie *t;
+
+ rcu_read_lock();
+
+ tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
+ if (!tb)
+ return NULL;
+
+ iter->main_tb = tb;
+ t = (struct trie *)tb->tb_data;
+ iter->tnode = t->kv;
-static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
+ if (*pos != 0)
+ return fib_route_get_idx(iter, *pos);
+
+ iter->pos = 0;
+ iter->key = KEY_MAX;
+
+ return SEQ_START_TOKEN;
+}
+
+static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
- static unsigned type2flags[RTN_MAX + 1] = {
- [7] = RTF_REJECT, [8] = RTF_REJECT,
- };
- unsigned flags = type2flags[type];
+ struct fib_route_iter *iter = seq->private;
+ struct key_vector *l = NULL;
+ t_key key = iter->key + 1;
+
+ ++*pos;
+
+ /* only allow key of 0 for start of sequence */
+ if ((v == SEQ_START_TOKEN) || key)
+ l = leaf_walk_rcu(&iter->tnode, key);
+
+ if (l) {
+ iter->key = l->key;
+ iter->pos++;
+ } else {
+ iter->pos = 0;
+ }
+
+ return l;
+}
+
+static void fib_route_seq_stop(struct seq_file *seq, void *v)
+ __releases(RCU)
+{
+ rcu_read_unlock();
+}
+
+static unsigned int fib_flag_trans(int type, __be32 mask, struct fib_info *fi)
+{
+ unsigned int flags = 0;
- if (fi && fi->fib_nh->nh_gw)
- flags |= RTF_GATEWAY;
+ if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
+ flags = RTF_REJECT;
+ if (fi) {
+ const struct fib_nh_common *nhc = fib_info_nhc(fi, 0);
+
+ if (nhc->nhc_gw.ipv4)
+ flags |= RTF_GATEWAY;
+ }
if (mask == htonl(0xFFFFFFFF))
flags |= RTF_HOST;
flags |= RTF_UP;
@@ -2378,15 +2934,16 @@ static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
/*
* This outputs /proc/net/route.
* The format of the file is not supposed to be changed
- * and needs to be same as fib_hash output to avoid breaking
+ * and needs to be same as fib_hash output to avoid breaking
* legacy utilities
*/
static int fib_route_seq_show(struct seq_file *seq, void *v)
{
- const struct fib_trie_iter *iter = seq->private;
- struct leaf *l = v;
- int i;
- char bf[128];
+ struct fib_route_iter *iter = seq->private;
+ struct fib_table *tb = iter->main_tb;
+ struct fib_alias *fa;
+ struct key_vector *l = v;
+ __be32 prefix;
if (v == SEQ_START_TOKEN) {
seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
@@ -2395,118 +2952,89 @@ static int fib_route_seq_show(struct seq_file *seq, void *v)
return 0;
}
- if (iter->trie == trie_local)
- return 0;
- if (IS_TNODE(l))
- return 0;
+ prefix = htonl(l->key);
- for (i=32; i>=0; i--) {
- struct leaf_info *li = find_leaf_info(l, i);
- struct fib_alias *fa;
- __be32 mask, prefix;
+ hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
+ struct fib_info *fi = fa->fa_info;
+ __be32 mask = inet_make_mask(KEYLENGTH - fa->fa_slen);
+ unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
- if (!li)
+ if ((fa->fa_type == RTN_BROADCAST) ||
+ (fa->fa_type == RTN_MULTICAST))
continue;
- mask = inet_make_mask(li->plen);
- prefix = htonl(l->key);
-
- list_for_each_entry_rcu(fa, &li->falh, fa_list) {
- const struct fib_info *fi = fa->fa_info;
- unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
-
- if (fa->fa_type == RTN_BROADCAST
- || fa->fa_type == RTN_MULTICAST)
- continue;
+ if (fa->tb_id != tb->tb_id)
+ continue;
- if (fi)
- snprintf(bf, sizeof(bf),
- "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
- fi->fib_dev ? fi->fib_dev->name : "*",
- prefix,
- fi->fib_nh->nh_gw, flags, 0, 0,
- fi->fib_priority,
- mask,
- (fi->fib_advmss ? fi->fib_advmss + 40 : 0),
- fi->fib_window,
- fi->fib_rtt >> 3);
- else
- snprintf(bf, sizeof(bf),
- "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
- prefix, 0, flags, 0, 0, 0,
- mask, 0, 0, 0);
-
- seq_printf(seq, "%-127s\n", bf);
+ seq_setwidth(seq, 127);
+
+ if (fi) {
+ struct fib_nh_common *nhc = fib_info_nhc(fi, 0);
+ __be32 gw = 0;
+
+ if (nhc->nhc_gw_family == AF_INET)
+ gw = nhc->nhc_gw.ipv4;
+
+ seq_printf(seq,
+ "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
+ "%u\t%08X\t%d\t%u\t%u",
+ nhc->nhc_dev ? nhc->nhc_dev->name : "*",
+ prefix, gw, flags, 0, 0,
+ fi->fib_priority,
+ mask,
+ (fi->fib_advmss ?
+ fi->fib_advmss + 40 : 0),
+ fi->fib_window,
+ fi->fib_rtt >> 3);
+ } else {
+ seq_printf(seq,
+ "*\t%08X\t%08X\t%04X\t%d\t%u\t"
+ "%u\t%08X\t%d\t%u\t%u",
+ prefix, 0, flags, 0, 0, 0,
+ mask, 0, 0, 0);
}
+ seq_pad(seq, '\n');
}
return 0;
}
-static struct seq_operations fib_route_seq_ops = {
- .start = fib_trie_seq_start,
- .next = fib_trie_seq_next,
- .stop = fib_trie_seq_stop,
+static const struct seq_operations fib_route_seq_ops = {
+ .start = fib_route_seq_start,
+ .next = fib_route_seq_next,
+ .stop = fib_route_seq_stop,
.show = fib_route_seq_show,
};
-static int fib_route_seq_open(struct inode *inode, struct file *file)
-{
- struct seq_file *seq;
- int rc = -ENOMEM;
- struct fib_trie_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
-
- if (!s)
- goto out;
-
- rc = seq_open(file, &fib_route_seq_ops);
- if (rc)
- goto out_kfree;
-
- seq = file->private_data;
- seq->private = s;
- memset(s, 0, sizeof(*s));
-out:
- return rc;
-out_kfree:
- kfree(s);
- goto out;
-}
-
-static struct file_operations fib_route_fops = {
- .owner = THIS_MODULE,
- .open = fib_route_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release_private,
-};
-
-int __init fib_proc_init(void)
+int __net_init fib_proc_init(struct net *net)
{
- if (!proc_net_fops_create("fib_trie", S_IRUGO, &fib_trie_fops))
+ if (!proc_create_net("fib_trie", 0444, net->proc_net, &fib_trie_seq_ops,
+ sizeof(struct fib_trie_iter)))
goto out1;
- if (!proc_net_fops_create("fib_triestat", S_IRUGO, &fib_triestat_fops))
+ if (!proc_create_net_single("fib_triestat", 0444, net->proc_net,
+ fib_triestat_seq_show, NULL))
goto out2;
- if (!proc_net_fops_create("route", S_IRUGO, &fib_route_fops))
+ if (!proc_create_net("route", 0444, net->proc_net, &fib_route_seq_ops,
+ sizeof(struct fib_route_iter)))
goto out3;
return 0;
out3:
- proc_net_remove("fib_triestat");
+ remove_proc_entry("fib_triestat", net->proc_net);
out2:
- proc_net_remove("fib_trie");
+ remove_proc_entry("fib_trie", net->proc_net);
out1:
return -ENOMEM;
}
-void __init fib_proc_exit(void)
+void __net_exit fib_proc_exit(struct net *net)
{
- proc_net_remove("fib_trie");
- proc_net_remove("fib_triestat");
- proc_net_remove("route");
+ remove_proc_entry("fib_trie", net->proc_net);
+ remove_proc_entry("fib_triestat", net->proc_net);
+ remove_proc_entry("route", net->proc_net);
}
#endif /* CONFIG_PROC_FS */
diff --git a/net/ipv4/fou_bpf.c b/net/ipv4/fou_bpf.c
new file mode 100644
index 000000000000..54984f3170a8
--- /dev/null
+++ b/net/ipv4/fou_bpf.c
@@ -0,0 +1,117 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Unstable Fou Helpers for TC-BPF hook
+ *
+ * These are called from SCHED_CLS BPF programs. Note that it is
+ * allowed to break compatibility for these functions since the interface they
+ * are exposed through to BPF programs is explicitly unstable.
+ */
+
+#include <linux/bpf.h>
+#include <linux/btf_ids.h>
+
+#include <net/dst_metadata.h>
+#include <net/fou.h>
+
+struct bpf_fou_encap {
+ __be16 sport;
+ __be16 dport;
+};
+
+enum bpf_fou_encap_type {
+ FOU_BPF_ENCAP_FOU,
+ FOU_BPF_ENCAP_GUE,
+};
+
+__bpf_kfunc_start_defs();
+
+/* bpf_skb_set_fou_encap - Set FOU encap parameters
+ *
+ * This function allows for using GUE or FOU encapsulation together with an
+ * ipip device in collect-metadata mode.
+ *
+ * It is meant to be used in BPF tc-hooks and after a call to the
+ * bpf_skb_set_tunnel_key helper, responsible for setting IP addresses.
+ *
+ * Parameters:
+ * @skb_ctx Pointer to ctx (__sk_buff) in TC program. Cannot be NULL
+ * @encap Pointer to a `struct bpf_fou_encap` storing UDP src and
+ * dst ports. If sport is set to 0 the kernel will auto-assign a
+ * port. This is similar to using `encap-sport auto`.
+ * Cannot be NULL
+ * @type Encapsulation type for the packet. Their definitions are
+ * specified in `enum bpf_fou_encap_type`
+ */
+__bpf_kfunc int bpf_skb_set_fou_encap(struct __sk_buff *skb_ctx,
+ struct bpf_fou_encap *encap, int type)
+{
+ struct sk_buff *skb = (struct sk_buff *)skb_ctx;
+ struct ip_tunnel_info *info = skb_tunnel_info(skb);
+
+ if (unlikely(!encap))
+ return -EINVAL;
+
+ if (unlikely(!info || !(info->mode & IP_TUNNEL_INFO_TX)))
+ return -EINVAL;
+
+ switch (type) {
+ case FOU_BPF_ENCAP_FOU:
+ info->encap.type = TUNNEL_ENCAP_FOU;
+ break;
+ case FOU_BPF_ENCAP_GUE:
+ info->encap.type = TUNNEL_ENCAP_GUE;
+ break;
+ default:
+ info->encap.type = TUNNEL_ENCAP_NONE;
+ }
+
+ if (test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags))
+ info->encap.flags |= TUNNEL_ENCAP_FLAG_CSUM;
+
+ info->encap.sport = encap->sport;
+ info->encap.dport = encap->dport;
+
+ return 0;
+}
+
+/* bpf_skb_get_fou_encap - Get FOU encap parameters
+ *
+ * This function allows for reading encap metadata from a packet received
+ * on an ipip device in collect-metadata mode.
+ *
+ * Parameters:
+ * @skb_ctx Pointer to ctx (__sk_buff) in TC program. Cannot be NULL
+ * @encap Pointer to a struct bpf_fou_encap storing UDP source and
+ * destination port. Cannot be NULL
+ */
+__bpf_kfunc int bpf_skb_get_fou_encap(struct __sk_buff *skb_ctx,
+ struct bpf_fou_encap *encap)
+{
+ struct sk_buff *skb = (struct sk_buff *)skb_ctx;
+ struct ip_tunnel_info *info = skb_tunnel_info(skb);
+
+ if (unlikely(!info))
+ return -EINVAL;
+
+ encap->sport = info->encap.sport;
+ encap->dport = info->encap.dport;
+
+ return 0;
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(fou_kfunc_set)
+BTF_ID_FLAGS(func, bpf_skb_set_fou_encap)
+BTF_ID_FLAGS(func, bpf_skb_get_fou_encap)
+BTF_KFUNCS_END(fou_kfunc_set)
+
+static const struct btf_kfunc_id_set fou_bpf_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &fou_kfunc_set,
+};
+
+int register_fou_bpf(void)
+{
+ return register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS,
+ &fou_bpf_kfunc_set);
+}
diff --git a/net/ipv4/fou_core.c b/net/ipv4/fou_core.c
new file mode 100644
index 000000000000..3970b6b7ace5
--- /dev/null
+++ b/net/ipv4/fou_core.c
@@ -0,0 +1,1281 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/socket.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/icmp.h>
+#include <linux/udp.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <net/genetlink.h>
+#include <net/gro.h>
+#include <net/gue.h>
+#include <net/fou.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/udp.h>
+#include <net/udp_tunnel.h>
+#include <uapi/linux/fou.h>
+#include <uapi/linux/genetlink.h>
+
+#include "fou_nl.h"
+
+struct fou {
+ struct socket *sock;
+ u8 protocol;
+ u8 flags;
+ __be16 port;
+ u8 family;
+ u16 type;
+ struct list_head list;
+ struct rcu_head rcu;
+};
+
+#define FOU_F_REMCSUM_NOPARTIAL BIT(0)
+
+struct fou_cfg {
+ u16 type;
+ u8 protocol;
+ u8 flags;
+ struct udp_port_cfg udp_config;
+};
+
+static unsigned int fou_net_id;
+
+struct fou_net {
+ struct list_head fou_list;
+ struct mutex fou_lock;
+};
+
+static inline struct fou *fou_from_sock(struct sock *sk)
+{
+ return rcu_dereference_sk_user_data(sk);
+}
+
+static int fou_recv_pull(struct sk_buff *skb, struct fou *fou, size_t len)
+{
+ /* Remove 'len' bytes from the packet (UDP header and
+ * FOU header if present).
+ */
+ if (fou->family == AF_INET)
+ ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(skb)->tot_len) - len);
+ else
+ ipv6_hdr(skb)->payload_len =
+ htons(ntohs(ipv6_hdr(skb)->payload_len) - len);
+
+ __skb_pull(skb, len);
+ skb_postpull_rcsum(skb, udp_hdr(skb), len);
+ skb_reset_transport_header(skb);
+ return iptunnel_pull_offloads(skb);
+}
+
+static int fou_udp_recv(struct sock *sk, struct sk_buff *skb)
+{
+ struct fou *fou = fou_from_sock(sk);
+
+ if (!fou)
+ return 1;
+
+ if (fou_recv_pull(skb, fou, sizeof(struct udphdr)))
+ goto drop;
+
+ return -fou->protocol;
+
+drop:
+ kfree_skb(skb);
+ return 0;
+}
+
+static struct guehdr *gue_remcsum(struct sk_buff *skb, struct guehdr *guehdr,
+ void *data, size_t hdrlen, u8 ipproto,
+ bool nopartial)
+{
+ __be16 *pd = data;
+ size_t start = ntohs(pd[0]);
+ size_t offset = ntohs(pd[1]);
+ size_t plen = sizeof(struct udphdr) + hdrlen +
+ max_t(size_t, offset + sizeof(u16), start);
+
+ if (skb->remcsum_offload)
+ return guehdr;
+
+ if (!pskb_may_pull(skb, plen))
+ return NULL;
+ guehdr = (struct guehdr *)&udp_hdr(skb)[1];
+
+ skb_remcsum_process(skb, (void *)guehdr + hdrlen,
+ start, offset, nopartial);
+
+ return guehdr;
+}
+
+static int gue_control_message(struct sk_buff *skb, struct guehdr *guehdr)
+{
+ /* No support yet */
+ kfree_skb(skb);
+ return 0;
+}
+
+static int gue_udp_recv(struct sock *sk, struct sk_buff *skb)
+{
+ struct fou *fou = fou_from_sock(sk);
+ size_t len, optlen, hdrlen;
+ struct guehdr *guehdr;
+ void *data;
+ u16 doffset = 0;
+ u8 proto_ctype;
+
+ if (!fou)
+ return 1;
+
+ len = sizeof(struct udphdr) + sizeof(struct guehdr);
+ if (!pskb_may_pull(skb, len))
+ goto drop;
+
+ guehdr = (struct guehdr *)&udp_hdr(skb)[1];
+
+ switch (guehdr->version) {
+ case 0: /* Full GUE header present */
+ break;
+
+ case 1: {
+ /* Direct encapsulation of IPv4 or IPv6 */
+
+ int prot;
+
+ switch (((struct iphdr *)guehdr)->version) {
+ case 4:
+ prot = IPPROTO_IPIP;
+ break;
+ case 6:
+ prot = IPPROTO_IPV6;
+ break;
+ default:
+ goto drop;
+ }
+
+ if (fou_recv_pull(skb, fou, sizeof(struct udphdr)))
+ goto drop;
+
+ return -prot;
+ }
+
+ default: /* Undefined version */
+ goto drop;
+ }
+
+ optlen = guehdr->hlen << 2;
+ len += optlen;
+
+ if (!pskb_may_pull(skb, len))
+ goto drop;
+
+ /* guehdr may change after pull */
+ guehdr = (struct guehdr *)&udp_hdr(skb)[1];
+
+ if (validate_gue_flags(guehdr, optlen))
+ goto drop;
+
+ hdrlen = sizeof(struct guehdr) + optlen;
+
+ if (fou->family == AF_INET)
+ ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(skb)->tot_len) - len);
+ else
+ ipv6_hdr(skb)->payload_len =
+ htons(ntohs(ipv6_hdr(skb)->payload_len) - len);
+
+ /* Pull csum through the guehdr now . This can be used if
+ * there is a remote checksum offload.
+ */
+ skb_postpull_rcsum(skb, udp_hdr(skb), len);
+
+ data = &guehdr[1];
+
+ if (guehdr->flags & GUE_FLAG_PRIV) {
+ __be32 flags = *(__be32 *)(data + doffset);
+
+ doffset += GUE_LEN_PRIV;
+
+ if (flags & GUE_PFLAG_REMCSUM) {
+ guehdr = gue_remcsum(skb, guehdr, data + doffset,
+ hdrlen, guehdr->proto_ctype,
+ !!(fou->flags &
+ FOU_F_REMCSUM_NOPARTIAL));
+ if (!guehdr)
+ goto drop;
+
+ data = &guehdr[1];
+
+ doffset += GUE_PLEN_REMCSUM;
+ }
+ }
+
+ if (unlikely(guehdr->control))
+ return gue_control_message(skb, guehdr);
+
+ proto_ctype = guehdr->proto_ctype;
+ __skb_pull(skb, sizeof(struct udphdr) + hdrlen);
+ skb_reset_transport_header(skb);
+
+ if (iptunnel_pull_offloads(skb))
+ goto drop;
+
+ return -proto_ctype;
+
+drop:
+ kfree_skb(skb);
+ return 0;
+}
+
+static const struct net_offload *fou_gro_ops(const struct sock *sk,
+ int proto)
+{
+ const struct net_offload __rcu **offloads;
+
+ /* FOU doesn't allow IPv4 on IPv6 sockets. */
+ offloads = sk->sk_family == AF_INET6 ? inet6_offloads : inet_offloads;
+ return rcu_dereference(offloads[proto]);
+}
+
+static struct sk_buff *fou_gro_receive(struct sock *sk,
+ struct list_head *head,
+ struct sk_buff *skb)
+{
+ struct fou *fou = fou_from_sock(sk);
+ const struct net_offload *ops;
+ struct sk_buff *pp = NULL;
+
+ if (!fou)
+ goto out;
+
+ /* We can clear the encap_mark for FOU as we are essentially doing
+ * one of two possible things. We are either adding an L4 tunnel
+ * header to the outer L3 tunnel header, or we are simply
+ * treating the GRE tunnel header as though it is a UDP protocol
+ * specific header such as VXLAN or GENEVE.
+ */
+ NAPI_GRO_CB(skb)->encap_mark = 0;
+
+ /* Flag this frame as already having an outer encap header */
+ NAPI_GRO_CB(skb)->is_fou = 1;
+
+ ops = fou_gro_ops(sk, fou->protocol);
+ if (!ops || !ops->callbacks.gro_receive)
+ goto out;
+
+ pp = call_gro_receive(ops->callbacks.gro_receive, head, skb);
+
+out:
+ return pp;
+}
+
+static int fou_gro_complete(struct sock *sk, struct sk_buff *skb,
+ int nhoff)
+{
+ struct fou *fou = fou_from_sock(sk);
+ const struct net_offload *ops;
+ int err;
+
+ if (!fou) {
+ err = -ENOENT;
+ goto out;
+ }
+
+ ops = fou_gro_ops(sk, fou->protocol);
+ if (WARN_ON(!ops || !ops->callbacks.gro_complete)) {
+ err = -ENOSYS;
+ goto out;
+ }
+
+ err = ops->callbacks.gro_complete(skb, nhoff);
+
+ skb_set_inner_mac_header(skb, nhoff);
+
+out:
+ return err;
+}
+
+static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off,
+ struct guehdr *guehdr, void *data,
+ size_t hdrlen, struct gro_remcsum *grc,
+ bool nopartial)
+{
+ __be16 *pd = data;
+ size_t start = ntohs(pd[0]);
+ size_t offset = ntohs(pd[1]);
+
+ if (skb->remcsum_offload)
+ return guehdr;
+
+ if (!NAPI_GRO_CB(skb)->csum_valid)
+ return NULL;
+
+ guehdr = skb_gro_remcsum_process(skb, (void *)guehdr, off, hdrlen,
+ start, offset, grc, nopartial);
+
+ skb->remcsum_offload = 1;
+
+ return guehdr;
+}
+
+static struct sk_buff *gue_gro_receive(struct sock *sk,
+ struct list_head *head,
+ struct sk_buff *skb)
+{
+ const struct net_offload *ops;
+ struct sk_buff *pp = NULL;
+ struct sk_buff *p;
+ struct guehdr *guehdr;
+ size_t len, optlen, hdrlen, off;
+ void *data;
+ u16 doffset = 0;
+ int flush = 1;
+ struct fou *fou = fou_from_sock(sk);
+ struct gro_remcsum grc;
+ u8 proto;
+
+ skb_gro_remcsum_init(&grc);
+
+ if (!fou)
+ goto out;
+
+ off = skb_gro_offset(skb);
+ len = off + sizeof(*guehdr);
+
+ guehdr = skb_gro_header(skb, len, off);
+ if (unlikely(!guehdr))
+ goto out;
+
+ switch (guehdr->version) {
+ case 0:
+ break;
+ case 1:
+ switch (((struct iphdr *)guehdr)->version) {
+ case 4:
+ proto = IPPROTO_IPIP;
+ break;
+ case 6:
+ proto = IPPROTO_IPV6;
+ break;
+ default:
+ goto out;
+ }
+ goto next_proto;
+ default:
+ goto out;
+ }
+
+ optlen = guehdr->hlen << 2;
+ len += optlen;
+
+ if (!skb_gro_may_pull(skb, len)) {
+ guehdr = skb_gro_header_slow(skb, len, off);
+ if (unlikely(!guehdr))
+ goto out;
+ }
+
+ if (unlikely(guehdr->control) || guehdr->version != 0 ||
+ validate_gue_flags(guehdr, optlen))
+ goto out;
+
+ hdrlen = sizeof(*guehdr) + optlen;
+
+ /* Adjust NAPI_GRO_CB(skb)->csum to account for guehdr,
+ * this is needed if there is a remote checkcsum offload.
+ */
+ skb_gro_postpull_rcsum(skb, guehdr, hdrlen);
+
+ data = &guehdr[1];
+
+ if (guehdr->flags & GUE_FLAG_PRIV) {
+ __be32 flags = *(__be32 *)(data + doffset);
+
+ doffset += GUE_LEN_PRIV;
+
+ if (flags & GUE_PFLAG_REMCSUM) {
+ guehdr = gue_gro_remcsum(skb, off, guehdr,
+ data + doffset, hdrlen, &grc,
+ !!(fou->flags &
+ FOU_F_REMCSUM_NOPARTIAL));
+
+ if (!guehdr)
+ goto out;
+
+ data = &guehdr[1];
+
+ doffset += GUE_PLEN_REMCSUM;
+ }
+ }
+
+ skb_gro_pull(skb, hdrlen);
+
+ list_for_each_entry(p, head, list) {
+ const struct guehdr *guehdr2;
+
+ if (!NAPI_GRO_CB(p)->same_flow)
+ continue;
+
+ guehdr2 = (struct guehdr *)(p->data + off);
+
+ /* Compare base GUE header to be equal (covers
+ * hlen, version, proto_ctype, and flags.
+ */
+ if (guehdr->word != guehdr2->word) {
+ NAPI_GRO_CB(p)->same_flow = 0;
+ continue;
+ }
+
+ /* Compare optional fields are the same. */
+ if (guehdr->hlen && memcmp(&guehdr[1], &guehdr2[1],
+ guehdr->hlen << 2)) {
+ NAPI_GRO_CB(p)->same_flow = 0;
+ continue;
+ }
+ }
+
+ proto = guehdr->proto_ctype;
+
+next_proto:
+
+ /* We can clear the encap_mark for GUE as we are essentially doing
+ * one of two possible things. We are either adding an L4 tunnel
+ * header to the outer L3 tunnel header, or we are simply
+ * treating the GRE tunnel header as though it is a UDP protocol
+ * specific header such as VXLAN or GENEVE.
+ */
+ NAPI_GRO_CB(skb)->encap_mark = 0;
+
+ /* Flag this frame as already having an outer encap header */
+ NAPI_GRO_CB(skb)->is_fou = 1;
+
+ ops = fou_gro_ops(sk, proto);
+ if (!ops || !ops->callbacks.gro_receive)
+ goto out;
+
+ pp = call_gro_receive(ops->callbacks.gro_receive, head, skb);
+ flush = 0;
+
+out:
+ skb_gro_flush_final_remcsum(skb, pp, flush, &grc);
+
+ return pp;
+}
+
+static int gue_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff)
+{
+ struct guehdr *guehdr = (struct guehdr *)(skb->data + nhoff);
+ const struct net_offload *ops;
+ unsigned int guehlen = 0;
+ u8 proto;
+ int err = -ENOENT;
+
+ switch (guehdr->version) {
+ case 0:
+ proto = guehdr->proto_ctype;
+ guehlen = sizeof(*guehdr) + (guehdr->hlen << 2);
+ break;
+ case 1:
+ switch (((struct iphdr *)guehdr)->version) {
+ case 4:
+ proto = IPPROTO_IPIP;
+ break;
+ case 6:
+ proto = IPPROTO_IPV6;
+ break;
+ default:
+ return err;
+ }
+ break;
+ default:
+ return err;
+ }
+
+ ops = fou_gro_ops(sk, proto);
+ if (WARN_ON(!ops || !ops->callbacks.gro_complete))
+ goto out;
+
+ err = ops->callbacks.gro_complete(skb, nhoff + guehlen);
+
+ skb_set_inner_mac_header(skb, nhoff + guehlen);
+
+out:
+ return err;
+}
+
+static bool fou_cfg_cmp(struct fou *fou, struct fou_cfg *cfg)
+{
+ struct sock *sk = fou->sock->sk;
+ struct udp_port_cfg *udp_cfg = &cfg->udp_config;
+
+ if (fou->family != udp_cfg->family ||
+ fou->port != udp_cfg->local_udp_port ||
+ sk->sk_dport != udp_cfg->peer_udp_port ||
+ sk->sk_bound_dev_if != udp_cfg->bind_ifindex)
+ return false;
+
+ if (fou->family == AF_INET) {
+ if (sk->sk_rcv_saddr != udp_cfg->local_ip.s_addr ||
+ sk->sk_daddr != udp_cfg->peer_ip.s_addr)
+ return false;
+ else
+ return true;
+#if IS_ENABLED(CONFIG_IPV6)
+ } else {
+ if (ipv6_addr_cmp(&sk->sk_v6_rcv_saddr, &udp_cfg->local_ip6) ||
+ ipv6_addr_cmp(&sk->sk_v6_daddr, &udp_cfg->peer_ip6))
+ return false;
+ else
+ return true;
+#endif
+ }
+
+ return false;
+}
+
+static int fou_add_to_port_list(struct net *net, struct fou *fou,
+ struct fou_cfg *cfg)
+{
+ struct fou_net *fn = net_generic(net, fou_net_id);
+ struct fou *fout;
+
+ mutex_lock(&fn->fou_lock);
+ list_for_each_entry(fout, &fn->fou_list, list) {
+ if (fou_cfg_cmp(fout, cfg)) {
+ mutex_unlock(&fn->fou_lock);
+ return -EALREADY;
+ }
+ }
+
+ list_add(&fou->list, &fn->fou_list);
+ mutex_unlock(&fn->fou_lock);
+
+ return 0;
+}
+
+static void fou_release(struct fou *fou)
+{
+ struct socket *sock = fou->sock;
+
+ list_del(&fou->list);
+ udp_tunnel_sock_release(sock);
+
+ kfree_rcu(fou, rcu);
+}
+
+static int fou_create(struct net *net, struct fou_cfg *cfg,
+ struct socket **sockp)
+{
+ struct socket *sock = NULL;
+ struct fou *fou = NULL;
+ struct sock *sk;
+ struct udp_tunnel_sock_cfg tunnel_cfg;
+ int err;
+
+ /* Open UDP socket */
+ err = udp_sock_create(net, &cfg->udp_config, &sock);
+ if (err < 0)
+ goto error;
+
+ /* Allocate FOU port structure */
+ fou = kzalloc(sizeof(*fou), GFP_KERNEL);
+ if (!fou) {
+ err = -ENOMEM;
+ goto error;
+ }
+
+ sk = sock->sk;
+
+ fou->port = cfg->udp_config.local_udp_port;
+ fou->family = cfg->udp_config.family;
+ fou->flags = cfg->flags;
+ fou->type = cfg->type;
+ fou->sock = sock;
+
+ memset(&tunnel_cfg, 0, sizeof(tunnel_cfg));
+ tunnel_cfg.encap_type = 1;
+ tunnel_cfg.sk_user_data = fou;
+ tunnel_cfg.encap_destroy = NULL;
+
+ /* Initial for fou type */
+ switch (cfg->type) {
+ case FOU_ENCAP_DIRECT:
+ tunnel_cfg.encap_rcv = fou_udp_recv;
+ tunnel_cfg.gro_receive = fou_gro_receive;
+ tunnel_cfg.gro_complete = fou_gro_complete;
+ fou->protocol = cfg->protocol;
+ break;
+ case FOU_ENCAP_GUE:
+ tunnel_cfg.encap_rcv = gue_udp_recv;
+ tunnel_cfg.gro_receive = gue_gro_receive;
+ tunnel_cfg.gro_complete = gue_gro_complete;
+ break;
+ default:
+ err = -EINVAL;
+ goto error;
+ }
+
+ setup_udp_tunnel_sock(net, sock, &tunnel_cfg);
+
+ sk->sk_allocation = GFP_ATOMIC;
+
+ err = fou_add_to_port_list(net, fou, cfg);
+ if (err)
+ goto error;
+
+ if (sockp)
+ *sockp = sock;
+
+ return 0;
+
+error:
+ kfree(fou);
+ if (sock)
+ udp_tunnel_sock_release(sock);
+
+ return err;
+}
+
+static int fou_destroy(struct net *net, struct fou_cfg *cfg)
+{
+ struct fou_net *fn = net_generic(net, fou_net_id);
+ int err = -EINVAL;
+ struct fou *fou;
+
+ mutex_lock(&fn->fou_lock);
+ list_for_each_entry(fou, &fn->fou_list, list) {
+ if (fou_cfg_cmp(fou, cfg)) {
+ fou_release(fou);
+ err = 0;
+ break;
+ }
+ }
+ mutex_unlock(&fn->fou_lock);
+
+ return err;
+}
+
+static struct genl_family fou_nl_family;
+
+static int parse_nl_config(struct genl_info *info,
+ struct fou_cfg *cfg)
+{
+ bool has_local = false, has_peer = false;
+ struct nlattr *attr;
+ int ifindex;
+ __be16 port;
+
+ memset(cfg, 0, sizeof(*cfg));
+
+ cfg->udp_config.family = AF_INET;
+
+ if (info->attrs[FOU_ATTR_AF]) {
+ u8 family = nla_get_u8(info->attrs[FOU_ATTR_AF]);
+
+ switch (family) {
+ case AF_INET:
+ break;
+ case AF_INET6:
+ cfg->udp_config.ipv6_v6only = 1;
+ break;
+ default:
+ return -EAFNOSUPPORT;
+ }
+
+ cfg->udp_config.family = family;
+ }
+
+ if (info->attrs[FOU_ATTR_PORT]) {
+ port = nla_get_be16(info->attrs[FOU_ATTR_PORT]);
+ cfg->udp_config.local_udp_port = port;
+ }
+
+ if (info->attrs[FOU_ATTR_IPPROTO])
+ cfg->protocol = nla_get_u8(info->attrs[FOU_ATTR_IPPROTO]);
+
+ if (info->attrs[FOU_ATTR_TYPE])
+ cfg->type = nla_get_u8(info->attrs[FOU_ATTR_TYPE]);
+
+ if (info->attrs[FOU_ATTR_REMCSUM_NOPARTIAL])
+ cfg->flags |= FOU_F_REMCSUM_NOPARTIAL;
+
+ if (cfg->udp_config.family == AF_INET) {
+ if (info->attrs[FOU_ATTR_LOCAL_V4]) {
+ attr = info->attrs[FOU_ATTR_LOCAL_V4];
+ cfg->udp_config.local_ip.s_addr = nla_get_in_addr(attr);
+ has_local = true;
+ }
+
+ if (info->attrs[FOU_ATTR_PEER_V4]) {
+ attr = info->attrs[FOU_ATTR_PEER_V4];
+ cfg->udp_config.peer_ip.s_addr = nla_get_in_addr(attr);
+ has_peer = true;
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ } else {
+ if (info->attrs[FOU_ATTR_LOCAL_V6]) {
+ attr = info->attrs[FOU_ATTR_LOCAL_V6];
+ cfg->udp_config.local_ip6 = nla_get_in6_addr(attr);
+ has_local = true;
+ }
+
+ if (info->attrs[FOU_ATTR_PEER_V6]) {
+ attr = info->attrs[FOU_ATTR_PEER_V6];
+ cfg->udp_config.peer_ip6 = nla_get_in6_addr(attr);
+ has_peer = true;
+ }
+#endif
+ }
+
+ if (has_peer) {
+ if (info->attrs[FOU_ATTR_PEER_PORT]) {
+ port = nla_get_be16(info->attrs[FOU_ATTR_PEER_PORT]);
+ cfg->udp_config.peer_udp_port = port;
+ } else {
+ return -EINVAL;
+ }
+ }
+
+ if (info->attrs[FOU_ATTR_IFINDEX]) {
+ if (!has_local)
+ return -EINVAL;
+
+ ifindex = nla_get_s32(info->attrs[FOU_ATTR_IFINDEX]);
+
+ cfg->udp_config.bind_ifindex = ifindex;
+ }
+
+ return 0;
+}
+
+int fou_nl_add_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net *net = genl_info_net(info);
+ struct fou_cfg cfg;
+ int err;
+
+ err = parse_nl_config(info, &cfg);
+ if (err)
+ return err;
+
+ return fou_create(net, &cfg, NULL);
+}
+
+int fou_nl_del_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net *net = genl_info_net(info);
+ struct fou_cfg cfg;
+ int err;
+
+ err = parse_nl_config(info, &cfg);
+ if (err)
+ return err;
+
+ return fou_destroy(net, &cfg);
+}
+
+static int fou_fill_info(struct fou *fou, struct sk_buff *msg)
+{
+ struct sock *sk = fou->sock->sk;
+
+ if (nla_put_u8(msg, FOU_ATTR_AF, fou->sock->sk->sk_family) ||
+ nla_put_be16(msg, FOU_ATTR_PORT, fou->port) ||
+ nla_put_be16(msg, FOU_ATTR_PEER_PORT, sk->sk_dport) ||
+ nla_put_u8(msg, FOU_ATTR_IPPROTO, fou->protocol) ||
+ nla_put_u8(msg, FOU_ATTR_TYPE, fou->type) ||
+ nla_put_s32(msg, FOU_ATTR_IFINDEX, sk->sk_bound_dev_if))
+ return -1;
+
+ if (fou->flags & FOU_F_REMCSUM_NOPARTIAL)
+ if (nla_put_flag(msg, FOU_ATTR_REMCSUM_NOPARTIAL))
+ return -1;
+
+ if (fou->sock->sk->sk_family == AF_INET) {
+ if (nla_put_in_addr(msg, FOU_ATTR_LOCAL_V4, sk->sk_rcv_saddr))
+ return -1;
+
+ if (nla_put_in_addr(msg, FOU_ATTR_PEER_V4, sk->sk_daddr))
+ return -1;
+#if IS_ENABLED(CONFIG_IPV6)
+ } else {
+ if (nla_put_in6_addr(msg, FOU_ATTR_LOCAL_V6,
+ &sk->sk_v6_rcv_saddr))
+ return -1;
+
+ if (nla_put_in6_addr(msg, FOU_ATTR_PEER_V6, &sk->sk_v6_daddr))
+ return -1;
+#endif
+ }
+
+ return 0;
+}
+
+static int fou_dump_info(struct fou *fou, u32 portid, u32 seq,
+ u32 flags, struct sk_buff *skb, u8 cmd)
+{
+ void *hdr;
+
+ hdr = genlmsg_put(skb, portid, seq, &fou_nl_family, flags, cmd);
+ if (!hdr)
+ return -ENOMEM;
+
+ if (fou_fill_info(fou, skb) < 0)
+ goto nla_put_failure;
+
+ genlmsg_end(skb, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(skb, hdr);
+ return -EMSGSIZE;
+}
+
+int fou_nl_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net *net = genl_info_net(info);
+ struct fou_net *fn = net_generic(net, fou_net_id);
+ struct sk_buff *msg;
+ struct fou_cfg cfg;
+ struct fou *fout;
+ __be16 port;
+ u8 family;
+ int ret;
+
+ ret = parse_nl_config(info, &cfg);
+ if (ret)
+ return ret;
+ port = cfg.udp_config.local_udp_port;
+ if (port == 0)
+ return -EINVAL;
+
+ family = cfg.udp_config.family;
+ if (family != AF_INET && family != AF_INET6)
+ return -EINVAL;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ ret = -ESRCH;
+ mutex_lock(&fn->fou_lock);
+ list_for_each_entry(fout, &fn->fou_list, list) {
+ if (fou_cfg_cmp(fout, &cfg)) {
+ ret = fou_dump_info(fout, info->snd_portid,
+ info->snd_seq, 0, msg,
+ info->genlhdr->cmd);
+ break;
+ }
+ }
+ mutex_unlock(&fn->fou_lock);
+ if (ret < 0)
+ goto out_free;
+
+ return genlmsg_reply(msg, info);
+
+out_free:
+ nlmsg_free(msg);
+ return ret;
+}
+
+int fou_nl_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ struct fou_net *fn = net_generic(net, fou_net_id);
+ struct fou *fout;
+ int idx = 0, ret;
+
+ mutex_lock(&fn->fou_lock);
+ list_for_each_entry(fout, &fn->fou_list, list) {
+ if (idx++ < cb->args[0])
+ continue;
+ ret = fou_dump_info(fout, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ skb, FOU_CMD_GET);
+ if (ret)
+ break;
+ }
+ mutex_unlock(&fn->fou_lock);
+
+ cb->args[0] = idx;
+ return skb->len;
+}
+
+static struct genl_family fou_nl_family __ro_after_init = {
+ .hdrsize = 0,
+ .name = FOU_GENL_NAME,
+ .version = FOU_GENL_VERSION,
+ .maxattr = FOU_ATTR_MAX,
+ .policy = fou_nl_policy,
+ .netnsok = true,
+ .module = THIS_MODULE,
+ .small_ops = fou_nl_ops,
+ .n_small_ops = ARRAY_SIZE(fou_nl_ops),
+ .resv_start_op = FOU_CMD_GET + 1,
+};
+
+size_t fou_encap_hlen(struct ip_tunnel_encap *e)
+{
+ return sizeof(struct udphdr);
+}
+EXPORT_SYMBOL(fou_encap_hlen);
+
+size_t gue_encap_hlen(struct ip_tunnel_encap *e)
+{
+ size_t len;
+ bool need_priv = false;
+
+ len = sizeof(struct udphdr) + sizeof(struct guehdr);
+
+ if (e->flags & TUNNEL_ENCAP_FLAG_REMCSUM) {
+ len += GUE_PLEN_REMCSUM;
+ need_priv = true;
+ }
+
+ len += need_priv ? GUE_LEN_PRIV : 0;
+
+ return len;
+}
+EXPORT_SYMBOL(gue_encap_hlen);
+
+int __fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
+ u8 *protocol, __be16 *sport, int type)
+{
+ int err;
+
+ err = iptunnel_handle_offloads(skb, type);
+ if (err)
+ return err;
+
+ *sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev),
+ skb, 0, 0, false);
+
+ return 0;
+}
+EXPORT_SYMBOL(__fou_build_header);
+
+int __gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
+ u8 *protocol, __be16 *sport, int type)
+{
+ struct guehdr *guehdr;
+ size_t hdrlen, optlen = 0;
+ void *data;
+ bool need_priv = false;
+ int err;
+
+ if ((e->flags & TUNNEL_ENCAP_FLAG_REMCSUM) &&
+ skb->ip_summed == CHECKSUM_PARTIAL) {
+ optlen += GUE_PLEN_REMCSUM;
+ type |= SKB_GSO_TUNNEL_REMCSUM;
+ need_priv = true;
+ }
+
+ optlen += need_priv ? GUE_LEN_PRIV : 0;
+
+ err = iptunnel_handle_offloads(skb, type);
+ if (err)
+ return err;
+
+ /* Get source port (based on flow hash) before skb_push */
+ *sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev),
+ skb, 0, 0, false);
+
+ hdrlen = sizeof(struct guehdr) + optlen;
+
+ skb_push(skb, hdrlen);
+
+ guehdr = (struct guehdr *)skb->data;
+
+ guehdr->control = 0;
+ guehdr->version = 0;
+ guehdr->hlen = optlen >> 2;
+ guehdr->flags = 0;
+ guehdr->proto_ctype = *protocol;
+
+ data = &guehdr[1];
+
+ if (need_priv) {
+ __be32 *flags = data;
+
+ guehdr->flags |= GUE_FLAG_PRIV;
+ *flags = 0;
+ data += GUE_LEN_PRIV;
+
+ if (type & SKB_GSO_TUNNEL_REMCSUM) {
+ u16 csum_start = skb_checksum_start_offset(skb);
+ __be16 *pd = data;
+
+ if (csum_start < hdrlen)
+ return -EINVAL;
+
+ csum_start -= hdrlen;
+ pd[0] = htons(csum_start);
+ pd[1] = htons(csum_start + skb->csum_offset);
+
+ if (!skb_is_gso(skb)) {
+ skb->ip_summed = CHECKSUM_NONE;
+ skb->encapsulation = 0;
+ }
+
+ *flags |= GUE_PFLAG_REMCSUM;
+ data += GUE_PLEN_REMCSUM;
+ }
+
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(__gue_build_header);
+
+#ifdef CONFIG_NET_FOU_IP_TUNNELS
+
+static void fou_build_udp(struct sk_buff *skb, struct ip_tunnel_encap *e,
+ struct flowi4 *fl4, u8 *protocol, __be16 sport)
+{
+ struct udphdr *uh;
+
+ skb_push(skb, sizeof(struct udphdr));
+ skb_reset_transport_header(skb);
+
+ uh = udp_hdr(skb);
+
+ uh->dest = e->dport;
+ uh->source = sport;
+ uh->len = htons(skb->len);
+ udp_set_csum(!(e->flags & TUNNEL_ENCAP_FLAG_CSUM), skb,
+ fl4->saddr, fl4->daddr, skb->len);
+
+ *protocol = IPPROTO_UDP;
+}
+
+static int fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
+ u8 *protocol, struct flowi4 *fl4)
+{
+ int type = e->flags & TUNNEL_ENCAP_FLAG_CSUM ? SKB_GSO_UDP_TUNNEL_CSUM :
+ SKB_GSO_UDP_TUNNEL;
+ __be16 sport;
+ int err;
+
+ err = __fou_build_header(skb, e, protocol, &sport, type);
+ if (err)
+ return err;
+
+ fou_build_udp(skb, e, fl4, protocol, sport);
+
+ return 0;
+}
+
+static int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
+ u8 *protocol, struct flowi4 *fl4)
+{
+ int type = e->flags & TUNNEL_ENCAP_FLAG_CSUM ? SKB_GSO_UDP_TUNNEL_CSUM :
+ SKB_GSO_UDP_TUNNEL;
+ __be16 sport;
+ int err;
+
+ err = __gue_build_header(skb, e, protocol, &sport, type);
+ if (err)
+ return err;
+
+ fou_build_udp(skb, e, fl4, protocol, sport);
+
+ return 0;
+}
+
+static int gue_err_proto_handler(int proto, struct sk_buff *skb, u32 info)
+{
+ const struct net_protocol *ipprot = rcu_dereference(inet_protos[proto]);
+
+ if (ipprot && ipprot->err_handler) {
+ if (!ipprot->err_handler(skb, info))
+ return 0;
+ }
+
+ return -ENOENT;
+}
+
+static int gue_err(struct sk_buff *skb, u32 info)
+{
+ int transport_offset = skb_transport_offset(skb);
+ struct guehdr *guehdr;
+ size_t len, optlen;
+ int ret;
+
+ len = sizeof(struct udphdr) + sizeof(struct guehdr);
+ if (!pskb_may_pull(skb, transport_offset + len))
+ return -EINVAL;
+
+ guehdr = (struct guehdr *)&udp_hdr(skb)[1];
+
+ switch (guehdr->version) {
+ case 0: /* Full GUE header present */
+ break;
+ case 1: {
+ /* Direct encapsulation of IPv4 or IPv6 */
+ skb_set_transport_header(skb, -(int)sizeof(struct icmphdr));
+
+ switch (((struct iphdr *)guehdr)->version) {
+ case 4:
+ ret = gue_err_proto_handler(IPPROTO_IPIP, skb, info);
+ goto out;
+#if IS_ENABLED(CONFIG_IPV6)
+ case 6:
+ ret = gue_err_proto_handler(IPPROTO_IPV6, skb, info);
+ goto out;
+#endif
+ default:
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+ }
+ default: /* Undefined version */
+ return -EOPNOTSUPP;
+ }
+
+ if (guehdr->control)
+ return -ENOENT;
+
+ optlen = guehdr->hlen << 2;
+
+ if (!pskb_may_pull(skb, transport_offset + len + optlen))
+ return -EINVAL;
+
+ guehdr = (struct guehdr *)&udp_hdr(skb)[1];
+ if (validate_gue_flags(guehdr, optlen))
+ return -EINVAL;
+
+ /* Handling exceptions for direct UDP encapsulation in GUE would lead to
+ * recursion. Besides, this kind of encapsulation can't even be
+ * configured currently. Discard this.
+ */
+ if (guehdr->proto_ctype == IPPROTO_UDP ||
+ guehdr->proto_ctype == IPPROTO_UDPLITE)
+ return -EOPNOTSUPP;
+
+ skb_set_transport_header(skb, -(int)sizeof(struct icmphdr));
+ ret = gue_err_proto_handler(guehdr->proto_ctype, skb, info);
+
+out:
+ skb_set_transport_header(skb, transport_offset);
+ return ret;
+}
+
+
+static const struct ip_tunnel_encap_ops fou_iptun_ops = {
+ .encap_hlen = fou_encap_hlen,
+ .build_header = fou_build_header,
+ .err_handler = gue_err,
+};
+
+static const struct ip_tunnel_encap_ops gue_iptun_ops = {
+ .encap_hlen = gue_encap_hlen,
+ .build_header = gue_build_header,
+ .err_handler = gue_err,
+};
+
+static int ip_tunnel_encap_add_fou_ops(void)
+{
+ int ret;
+
+ ret = ip_tunnel_encap_add_ops(&fou_iptun_ops, TUNNEL_ENCAP_FOU);
+ if (ret < 0) {
+ pr_err("can't add fou ops\n");
+ return ret;
+ }
+
+ ret = ip_tunnel_encap_add_ops(&gue_iptun_ops, TUNNEL_ENCAP_GUE);
+ if (ret < 0) {
+ pr_err("can't add gue ops\n");
+ ip_tunnel_encap_del_ops(&fou_iptun_ops, TUNNEL_ENCAP_FOU);
+ return ret;
+ }
+
+ return 0;
+}
+
+static void ip_tunnel_encap_del_fou_ops(void)
+{
+ ip_tunnel_encap_del_ops(&fou_iptun_ops, TUNNEL_ENCAP_FOU);
+ ip_tunnel_encap_del_ops(&gue_iptun_ops, TUNNEL_ENCAP_GUE);
+}
+
+#else
+
+static int ip_tunnel_encap_add_fou_ops(void)
+{
+ return 0;
+}
+
+static void ip_tunnel_encap_del_fou_ops(void)
+{
+}
+
+#endif
+
+static __net_init int fou_init_net(struct net *net)
+{
+ struct fou_net *fn = net_generic(net, fou_net_id);
+
+ INIT_LIST_HEAD(&fn->fou_list);
+ mutex_init(&fn->fou_lock);
+ return 0;
+}
+
+static __net_exit void fou_exit_net(struct net *net)
+{
+ struct fou_net *fn = net_generic(net, fou_net_id);
+ struct fou *fou, *next;
+
+ /* Close all the FOU sockets */
+ mutex_lock(&fn->fou_lock);
+ list_for_each_entry_safe(fou, next, &fn->fou_list, list)
+ fou_release(fou);
+ mutex_unlock(&fn->fou_lock);
+}
+
+static struct pernet_operations fou_net_ops = {
+ .init = fou_init_net,
+ .exit = fou_exit_net,
+ .id = &fou_net_id,
+ .size = sizeof(struct fou_net),
+};
+
+static int __init fou_init(void)
+{
+ int ret;
+
+ ret = register_pernet_device(&fou_net_ops);
+ if (ret)
+ goto exit;
+
+ ret = genl_register_family(&fou_nl_family);
+ if (ret < 0)
+ goto unregister;
+
+ ret = register_fou_bpf();
+ if (ret < 0)
+ goto kfunc_failed;
+
+ ret = ip_tunnel_encap_add_fou_ops();
+ if (ret == 0)
+ return 0;
+
+kfunc_failed:
+ genl_unregister_family(&fou_nl_family);
+unregister:
+ unregister_pernet_device(&fou_net_ops);
+exit:
+ return ret;
+}
+
+static void __exit fou_fini(void)
+{
+ ip_tunnel_encap_del_fou_ops();
+ genl_unregister_family(&fou_nl_family);
+ unregister_pernet_device(&fou_net_ops);
+}
+
+module_init(fou_init);
+module_exit(fou_fini);
+MODULE_AUTHOR("Tom Herbert <therbert@google.com>");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Foo over UDP");
diff --git a/net/ipv4/fou_nl.c b/net/ipv4/fou_nl.c
new file mode 100644
index 000000000000..7a99639204b1
--- /dev/null
+++ b/net/ipv4/fou_nl.c
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause)
+/* Do not edit directly, auto-generated from: */
+/* Documentation/netlink/specs/fou.yaml */
+/* YNL-GEN kernel source */
+/* To regenerate run: tools/net/ynl/ynl-regen.sh */
+
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+#include "fou_nl.h"
+
+#include <uapi/linux/fou.h>
+
+/* Global operation policy for fou */
+const struct nla_policy fou_nl_policy[FOU_ATTR_IFINDEX + 1] = {
+ [FOU_ATTR_PORT] = { .type = NLA_BE16, },
+ [FOU_ATTR_AF] = { .type = NLA_U8, },
+ [FOU_ATTR_IPPROTO] = { .type = NLA_U8, },
+ [FOU_ATTR_TYPE] = { .type = NLA_U8, },
+ [FOU_ATTR_REMCSUM_NOPARTIAL] = { .type = NLA_FLAG, },
+ [FOU_ATTR_LOCAL_V4] = { .type = NLA_U32, },
+ [FOU_ATTR_LOCAL_V6] = NLA_POLICY_EXACT_LEN(16),
+ [FOU_ATTR_PEER_V4] = { .type = NLA_U32, },
+ [FOU_ATTR_PEER_V6] = NLA_POLICY_EXACT_LEN(16),
+ [FOU_ATTR_PEER_PORT] = { .type = NLA_BE16, },
+ [FOU_ATTR_IFINDEX] = { .type = NLA_S32, },
+};
+
+/* Ops table for fou */
+const struct genl_small_ops fou_nl_ops[3] = {
+ {
+ .cmd = FOU_CMD_ADD,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = fou_nl_add_doit,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = FOU_CMD_DEL,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = fou_nl_del_doit,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = FOU_CMD_GET,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = fou_nl_get_doit,
+ .dumpit = fou_nl_get_dumpit,
+ },
+};
diff --git a/net/ipv4/fou_nl.h b/net/ipv4/fou_nl.h
new file mode 100644
index 000000000000..438342dc8507
--- /dev/null
+++ b/net/ipv4/fou_nl.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */
+/* Do not edit directly, auto-generated from: */
+/* Documentation/netlink/specs/fou.yaml */
+/* YNL-GEN kernel header */
+/* To regenerate run: tools/net/ynl/ynl-regen.sh */
+
+#ifndef _LINUX_FOU_GEN_H
+#define _LINUX_FOU_GEN_H
+
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+#include <uapi/linux/fou.h>
+
+/* Global operation policy for fou */
+extern const struct nla_policy fou_nl_policy[FOU_ATTR_IFINDEX + 1];
+
+/* Ops table for fou */
+extern const struct genl_small_ops fou_nl_ops[3];
+
+int fou_nl_add_doit(struct sk_buff *skb, struct genl_info *info);
+int fou_nl_del_doit(struct sk_buff *skb, struct genl_info *info);
+int fou_nl_get_doit(struct sk_buff *skb, struct genl_info *info);
+int fou_nl_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb);
+
+#endif /* _LINUX_FOU_GEN_H */
diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c
new file mode 100644
index 000000000000..dafd68f3436a
--- /dev/null
+++ b/net/ipv4/gre_demux.c
@@ -0,0 +1,221 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * GRE over IPv4 demultiplexer driver
+ *
+ * Authors: Dmitry Kozlov (xeb@mail.ru)
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/if.h>
+#include <linux/icmp.h>
+#include <linux/kernel.h>
+#include <linux/kmod.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/netdevice.h>
+#include <linux/if_tunnel.h>
+#include <linux/spinlock.h>
+#include <net/protocol.h>
+#include <net/gre.h>
+#include <net/erspan.h>
+
+#include <net/icmp.h>
+#include <net/route.h>
+#include <net/xfrm.h>
+
+static const struct gre_protocol __rcu *gre_proto[GREPROTO_MAX] __read_mostly;
+
+int gre_add_protocol(const struct gre_protocol *proto, u8 version)
+{
+ if (version >= GREPROTO_MAX)
+ return -EINVAL;
+
+ return (cmpxchg((const struct gre_protocol **)&gre_proto[version], NULL, proto) == NULL) ?
+ 0 : -EBUSY;
+}
+EXPORT_SYMBOL_GPL(gre_add_protocol);
+
+int gre_del_protocol(const struct gre_protocol *proto, u8 version)
+{
+ int ret;
+
+ if (version >= GREPROTO_MAX)
+ return -EINVAL;
+
+ ret = (cmpxchg((const struct gre_protocol **)&gre_proto[version], proto, NULL) == proto) ?
+ 0 : -EBUSY;
+
+ if (ret)
+ return ret;
+
+ synchronize_rcu();
+ return 0;
+}
+EXPORT_SYMBOL_GPL(gre_del_protocol);
+
+/* Fills in tpi and returns header length to be pulled.
+ * Note that caller must use pskb_may_pull() before pulling GRE header.
+ */
+int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
+ bool *csum_err, __be16 proto, int nhs)
+{
+ const struct gre_base_hdr *greh;
+ __be32 *options;
+ int hdr_len;
+
+ if (unlikely(!pskb_may_pull(skb, nhs + sizeof(struct gre_base_hdr))))
+ return -EINVAL;
+
+ greh = (struct gre_base_hdr *)(skb->data + nhs);
+ if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING)))
+ return -EINVAL;
+
+ gre_flags_to_tnl_flags(tpi->flags, greh->flags);
+ hdr_len = gre_calc_hlen(tpi->flags);
+
+ if (!pskb_may_pull(skb, nhs + hdr_len))
+ return -EINVAL;
+
+ greh = (struct gre_base_hdr *)(skb->data + nhs);
+ tpi->proto = greh->protocol;
+
+ options = (__be32 *)(greh + 1);
+ if (greh->flags & GRE_CSUM) {
+ if (!skb_checksum_simple_validate(skb)) {
+ skb_checksum_try_convert(skb, IPPROTO_GRE,
+ null_compute_pseudo);
+ } else if (csum_err) {
+ *csum_err = true;
+ return -EINVAL;
+ }
+
+ options++;
+ }
+
+ if (greh->flags & GRE_KEY) {
+ tpi->key = *options;
+ options++;
+ } else {
+ tpi->key = 0;
+ }
+ if (unlikely(greh->flags & GRE_SEQ)) {
+ tpi->seq = *options;
+ options++;
+ } else {
+ tpi->seq = 0;
+ }
+ /* WCCP version 1 and 2 protocol decoding.
+ * - Change protocol to IPv4/IPv6
+ * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
+ */
+ if (greh->flags == 0 && tpi->proto == htons(ETH_P_WCCP)) {
+ u8 _val, *val;
+
+ val = skb_header_pointer(skb, nhs + hdr_len,
+ sizeof(_val), &_val);
+ if (!val)
+ return -EINVAL;
+ tpi->proto = proto;
+ if ((*val & 0xF0) != 0x40)
+ hdr_len += 4;
+ }
+ tpi->hdr_len = hdr_len;
+
+ /* ERSPAN ver 1 and 2 protocol sets GRE key field
+ * to 0 and sets the configured key in the
+ * inner erspan header field
+ */
+ if ((greh->protocol == htons(ETH_P_ERSPAN) && hdr_len != 4) ||
+ greh->protocol == htons(ETH_P_ERSPAN2)) {
+ struct erspan_base_hdr *ershdr;
+
+ if (!pskb_may_pull(skb, nhs + hdr_len + sizeof(*ershdr)))
+ return -EINVAL;
+
+ ershdr = (struct erspan_base_hdr *)(skb->data + nhs + hdr_len);
+ tpi->key = cpu_to_be32(get_session_id(ershdr));
+ }
+
+ return hdr_len;
+}
+EXPORT_SYMBOL(gre_parse_header);
+
+static int gre_rcv(struct sk_buff *skb)
+{
+ const struct gre_protocol *proto;
+ u8 ver;
+ int ret;
+
+ if (!pskb_may_pull(skb, 12))
+ goto drop;
+
+ ver = skb->data[1]&0x7f;
+ if (ver >= GREPROTO_MAX)
+ goto drop;
+
+ rcu_read_lock();
+ proto = rcu_dereference(gre_proto[ver]);
+ if (!proto || !proto->handler)
+ goto drop_unlock;
+ ret = proto->handler(skb);
+ rcu_read_unlock();
+ return ret;
+
+drop_unlock:
+ rcu_read_unlock();
+drop:
+ kfree_skb(skb);
+ return NET_RX_DROP;
+}
+
+static int gre_err(struct sk_buff *skb, u32 info)
+{
+ const struct gre_protocol *proto;
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
+ u8 ver = skb->data[(iph->ihl<<2) + 1]&0x7f;
+ int err = 0;
+
+ if (ver >= GREPROTO_MAX)
+ return -EINVAL;
+
+ rcu_read_lock();
+ proto = rcu_dereference(gre_proto[ver]);
+ if (proto && proto->err_handler)
+ proto->err_handler(skb, info);
+ else
+ err = -EPROTONOSUPPORT;
+ rcu_read_unlock();
+
+ return err;
+}
+
+static const struct net_protocol net_gre_protocol = {
+ .handler = gre_rcv,
+ .err_handler = gre_err,
+};
+
+static int __init gre_init(void)
+{
+ pr_info("GRE over IPv4 demultiplexer driver\n");
+
+ if (inet_add_protocol(&net_gre_protocol, IPPROTO_GRE) < 0) {
+ pr_err("can't add protocol\n");
+ return -EAGAIN;
+ }
+ return 0;
+}
+
+static void __exit gre_exit(void)
+{
+ inet_del_protocol(&net_gre_protocol, IPPROTO_GRE);
+}
+
+module_init(gre_init);
+module_exit(gre_exit);
+
+MODULE_DESCRIPTION("GRE over IPv4 demultiplexer driver");
+MODULE_AUTHOR("D. Kozlov <xeb@mail.ru>");
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
new file mode 100644
index 000000000000..5028c72d494a
--- /dev/null
+++ b/net/ipv4/gre_offload.c
@@ -0,0 +1,287 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * IPV4 GSO/GRO offload support
+ * Linux INET implementation
+ *
+ * GRE GSO support
+ */
+
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <net/protocol.h>
+#include <net/gre.h>
+#include <net/gro.h>
+#include <net/gso.h>
+
+static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb);
+ bool need_csum, offload_csum, gso_partial, need_ipsec;
+ struct sk_buff *segs = ERR_PTR(-EINVAL);
+ u16 mac_offset = skb->mac_header;
+ __be16 protocol = skb->protocol;
+ u16 mac_len = skb->mac_len;
+ int gre_offset, outer_hlen;
+
+ if (!skb->encapsulation)
+ goto out;
+
+ if (unlikely(tnl_hlen < sizeof(struct gre_base_hdr)))
+ goto out;
+
+ if (unlikely(!pskb_may_pull(skb, tnl_hlen)))
+ goto out;
+
+ /* setup inner skb. */
+ skb->encapsulation = 0;
+ SKB_GSO_CB(skb)->encap_level = 0;
+ __skb_pull(skb, tnl_hlen);
+ skb_reset_mac_header(skb);
+ skb_set_network_header(skb, skb_inner_network_offset(skb));
+ skb->mac_len = skb_inner_network_offset(skb);
+ skb->protocol = skb->inner_protocol;
+
+ need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_GRE_CSUM);
+ skb->encap_hdr_csum = need_csum;
+
+ features &= skb->dev->hw_enc_features;
+ if (need_csum)
+ features &= ~NETIF_F_SCTP_CRC;
+
+ need_ipsec = skb_dst(skb) && dst_xfrm(skb_dst(skb));
+ /* Try to offload checksum if possible */
+ offload_csum = !!(need_csum && !need_ipsec &&
+ (skb->dev->features & NETIF_F_HW_CSUM));
+
+ /* segment inner packet. */
+ segs = skb_mac_gso_segment(skb, features);
+ if (IS_ERR_OR_NULL(segs)) {
+ skb_gso_error_unwind(skb, protocol, tnl_hlen, mac_offset,
+ mac_len);
+ goto out;
+ }
+
+ gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL);
+
+ outer_hlen = skb_tnl_header_len(skb);
+ gre_offset = outer_hlen - tnl_hlen;
+ skb = segs;
+ do {
+ struct gre_base_hdr *greh;
+ __sum16 *pcsum;
+
+ /* Set up inner headers if we are offloading inner checksum */
+ if (skb->ip_summed == CHECKSUM_PARTIAL) {
+ skb_reset_inner_headers(skb);
+ skb->encapsulation = 1;
+ }
+
+ skb->mac_len = mac_len;
+ skb->protocol = protocol;
+
+ __skb_push(skb, outer_hlen);
+ skb_reset_mac_header(skb);
+ skb_set_network_header(skb, mac_len);
+ skb_set_transport_header(skb, gre_offset);
+
+ if (!need_csum)
+ continue;
+
+ greh = (struct gre_base_hdr *)skb_transport_header(skb);
+ pcsum = (__sum16 *)(greh + 1);
+
+ if (gso_partial && skb_is_gso(skb)) {
+ unsigned int partial_adj;
+
+ /* Adjust checksum to account for the fact that
+ * the partial checksum is based on actual size
+ * whereas headers should be based on MSS size.
+ */
+ partial_adj = skb->len + skb_headroom(skb) -
+ SKB_GSO_CB(skb)->data_offset -
+ skb_shinfo(skb)->gso_size;
+ *pcsum = ~csum_fold((__force __wsum)htonl(partial_adj));
+ } else {
+ *pcsum = 0;
+ }
+
+ *(pcsum + 1) = 0;
+ if (skb->encapsulation || !offload_csum) {
+ *pcsum = gso_make_checksum(skb, 0);
+ } else {
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ skb->csum_start = skb_transport_header(skb) - skb->head;
+ skb->csum_offset = sizeof(*greh);
+ }
+ } while ((skb = skb->next));
+out:
+ return segs;
+}
+
+static struct sk_buff *gre_gro_receive(struct list_head *head,
+ struct sk_buff *skb)
+{
+ struct sk_buff *pp = NULL;
+ struct sk_buff *p;
+ const struct gre_base_hdr *greh;
+ unsigned int hlen, grehlen;
+ unsigned int off;
+ int flush = 1;
+ struct packet_offload *ptype;
+ __be16 type;
+
+ if (NAPI_GRO_CB(skb)->encap_mark)
+ goto out;
+
+ NAPI_GRO_CB(skb)->encap_mark = 1;
+
+ off = skb_gro_offset(skb);
+ hlen = off + sizeof(*greh);
+ greh = skb_gro_header(skb, hlen, off);
+ if (unlikely(!greh))
+ goto out;
+
+ /* Only support version 0 and K (key), C (csum) flags. Note that
+ * although the support for the S (seq#) flag can be added easily
+ * for GRO, this is problematic for GSO hence can not be enabled
+ * here because a GRO pkt may end up in the forwarding path, thus
+ * requiring GSO support to break it up correctly.
+ */
+ if ((greh->flags & ~(GRE_KEY|GRE_CSUM)) != 0)
+ goto out;
+
+ /* We can only support GRE_CSUM if we can track the location of
+ * the GRE header. In the case of FOU/GUE we cannot because the
+ * outer UDP header displaces the GRE header leaving us in a state
+ * of limbo.
+ */
+ if ((greh->flags & GRE_CSUM) && NAPI_GRO_CB(skb)->is_fou)
+ goto out;
+
+ type = greh->protocol;
+
+ ptype = gro_find_receive_by_type(type);
+ if (!ptype)
+ goto out;
+
+ grehlen = GRE_HEADER_SECTION;
+
+ if (greh->flags & GRE_KEY)
+ grehlen += GRE_HEADER_SECTION;
+
+ if (greh->flags & GRE_CSUM)
+ grehlen += GRE_HEADER_SECTION;
+
+ hlen = off + grehlen;
+ if (!skb_gro_may_pull(skb, hlen)) {
+ greh = skb_gro_header_slow(skb, hlen, off);
+ if (unlikely(!greh))
+ goto out;
+ }
+
+ /* Don't bother verifying checksum if we're going to flush anyway. */
+ if ((greh->flags & GRE_CSUM) && !NAPI_GRO_CB(skb)->flush) {
+ if (skb_gro_checksum_simple_validate(skb))
+ goto out;
+
+ skb_gro_checksum_try_convert(skb, IPPROTO_GRE,
+ null_compute_pseudo);
+ }
+
+ list_for_each_entry(p, head, list) {
+ const struct gre_base_hdr *greh2;
+
+ if (!NAPI_GRO_CB(p)->same_flow)
+ continue;
+
+ /* The following checks are needed to ensure only pkts
+ * from the same tunnel are considered for aggregation.
+ * The criteria for "the same tunnel" includes:
+ * 1) same version (we only support version 0 here)
+ * 2) same protocol (we only support ETH_P_IP for now)
+ * 3) same set of flags
+ * 4) same key if the key field is present.
+ */
+ greh2 = (struct gre_base_hdr *)(p->data + off);
+
+ if (greh2->flags != greh->flags ||
+ greh2->protocol != greh->protocol) {
+ NAPI_GRO_CB(p)->same_flow = 0;
+ continue;
+ }
+ if (greh->flags & GRE_KEY) {
+ /* compare keys */
+ if (*(__be32 *)(greh2+1) != *(__be32 *)(greh+1)) {
+ NAPI_GRO_CB(p)->same_flow = 0;
+ continue;
+ }
+ }
+ }
+
+ skb_gro_pull(skb, grehlen);
+
+ /* Adjusted NAPI_GRO_CB(skb)->csum after skb_gro_pull()*/
+ skb_gro_postpull_rcsum(skb, greh, grehlen);
+
+ pp = call_gro_receive(ptype->callbacks.gro_receive, head, skb);
+ flush = 0;
+
+out:
+ skb_gro_flush_final(skb, pp, flush);
+
+ return pp;
+}
+
+static int gre_gro_complete(struct sk_buff *skb, int nhoff)
+{
+ struct gre_base_hdr *greh = (struct gre_base_hdr *)(skb->data + nhoff);
+ struct packet_offload *ptype;
+ unsigned int grehlen = sizeof(*greh);
+ int err = -ENOENT;
+ __be16 type;
+
+ skb->encapsulation = 1;
+ skb_shinfo(skb)->gso_type = SKB_GSO_GRE;
+
+ type = greh->protocol;
+ if (greh->flags & GRE_KEY)
+ grehlen += GRE_HEADER_SECTION;
+
+ if (greh->flags & GRE_CSUM)
+ grehlen += GRE_HEADER_SECTION;
+
+ ptype = gro_find_complete_by_type(type);
+ if (ptype)
+ err = ptype->callbacks.gro_complete(skb, nhoff + grehlen);
+
+ skb_set_inner_mac_header(skb, nhoff + grehlen);
+
+ return err;
+}
+
+static const struct net_offload gre_offload = {
+ .callbacks = {
+ .gso_segment = gre_gso_segment,
+ .gro_receive = gre_gro_receive,
+ .gro_complete = gre_gro_complete,
+ },
+};
+
+static int __init gre_offload_init(void)
+{
+ int err;
+
+ err = inet_add_offload(&gre_offload, IPPROTO_GRE);
+#if IS_ENABLED(CONFIG_IPV6)
+ if (err)
+ return err;
+
+ err = inet6_add_offload(&gre_offload, IPPROTO_GRE);
+ if (err)
+ inet_del_offload(&gre_offload, IPPROTO_GRE);
+#endif
+
+ return err;
+}
+device_initcall(gre_offload_init);
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 40cf0d0e1b83..4abbec2f47ef 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -1,14 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* NET3: Implementation of the ICMP protocol layer.
*
- * Alan Cox, <alan@redhat.com>
- *
- * Version: $Id: icmp.c,v 1.85 2002/02/01 22:01:03 davem Exp $
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
+ * Alan Cox, <alan@lxorguk.ukuu.org.uk>
*
* Some of the function names and the icmp unreach table for this
* module were derived from [icmp.c 1.0.11 06/02/93] by
@@ -61,9 +55,10 @@
*
* - Should use skb_pull() instead of all the manual checking.
* This would also greatly simply some upper layer error handlers. --AK
- *
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/module.h>
#include <linux/types.h>
#include <linux/jiffies.h>
@@ -76,6 +71,8 @@
#include <linux/netdevice.h>
#include <linux/string.h>
#include <linux/netfilter_ipv4.h>
+#include <linux/slab.h>
+#include <net/flow.h>
#include <net/snmp.h>
#include <net/ip.h>
#include <net/route.h>
@@ -84,14 +81,22 @@
#include <net/tcp.h>
#include <net/udp.h>
#include <net/raw.h>
+#include <net/ping.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/init.h>
-#include <asm/system.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <net/checksum.h>
+#include <net/xfrm.h>
+#include <net/inet_common.h>
+#include <net/ip_fib.h>
+#include <net/l3mdev.h>
+#include <net/addrconf.h>
+#include <net/inet_dscp.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/icmp.h>
/*
* Build xmit assembly blocks
@@ -107,19 +112,13 @@ struct icmp_bxm {
__be32 times[3];
} data;
int head_len;
- struct ip_options replyopts;
- unsigned char optbuf[40];
+ struct ip_options_data replyopts;
};
-/*
- * Statistics
- */
-DEFINE_SNMP_STAT(struct icmp_mib, icmp_statistics) __read_mostly;
-
/* An array of errno for error messages from dest unreach. */
/* RFC 1122: 3.2.2.1 States that NET_UNREACH, HOST_UNREACH and SR_FAILED MUST be considered 'transient errs'. */
-struct icmp_err icmp_err_convert[] = {
+const struct icmp_err icmp_err_convert[] = {
{
.errno = ENETUNREACH, /* ICMP_NET_UNREACH */
.fatal = 0,
@@ -185,143 +184,166 @@ struct icmp_err icmp_err_convert[] = {
.fatal = 1,
},
};
-
-/* Control parameters for ECHO replies. */
-int sysctl_icmp_echo_ignore_all __read_mostly;
-int sysctl_icmp_echo_ignore_broadcasts __read_mostly = 1;
-
-/* Control parameter - ignore bogus broadcast responses? */
-int sysctl_icmp_ignore_bogus_error_responses __read_mostly = 1;
-
-/*
- * Configurable global rate limit.
- *
- * ratelimit defines tokens/packet consumed for dst->rate_token bucket
- * ratemask defines which icmp types are ratelimited by setting
- * it's bit position.
- *
- * default:
- * dest unreachable (3), source quench (4),
- * time exceeded (11), parameter problem (12)
- */
-
-int sysctl_icmp_ratelimit __read_mostly = 1 * HZ;
-int sysctl_icmp_ratemask __read_mostly = 0x1818;
-int sysctl_icmp_errors_use_inbound_ifaddr __read_mostly;
+EXPORT_SYMBOL(icmp_err_convert);
/*
* ICMP control array. This specifies what to do with each ICMP.
*/
struct icmp_control {
- int output_entry; /* Field for increment on output */
- int input_entry; /* Field for increment on input */
- void (*handler)(struct sk_buff *skb);
+ enum skb_drop_reason (*handler)(struct sk_buff *skb);
short error; /* This ICMP is classed as an error message */
};
static const struct icmp_control icmp_pointers[NR_ICMP_TYPES+1];
-/*
- * The ICMP socket(s). This is the most convenient way to flow control
- * our ICMP output as well as maintain a clean interface throughout
- * all layers. All Socketless IP sends will soon be gone.
- *
- * On SMP we have one ICMP socket per-cpu.
- */
-static DEFINE_PER_CPU(struct socket *, __icmp_socket) = NULL;
-#define icmp_socket __get_cpu_var(__icmp_socket)
+static DEFINE_PER_CPU(struct sock *, ipv4_icmp_sk);
-static __inline__ int icmp_xmit_lock(void)
+/* Called with BH disabled */
+static inline struct sock *icmp_xmit_lock(struct net *net)
{
- local_bh_disable();
+ struct sock *sk;
+
+ sk = this_cpu_read(ipv4_icmp_sk);
- if (unlikely(!spin_trylock(&icmp_socket->sk->sk_lock.slock))) {
+ if (unlikely(!spin_trylock(&sk->sk_lock.slock))) {
/* This can happen if the output path signals a
* dst_link_failure() for an outgoing ICMP packet.
*/
- local_bh_enable();
- return 1;
+ return NULL;
}
- return 0;
+ sock_net_set(sk, net);
+ return sk;
}
-static void icmp_xmit_unlock(void)
+static inline void icmp_xmit_unlock(struct sock *sk)
{
- spin_unlock_bh(&icmp_socket->sk->sk_lock.slock);
+ sock_net_set(sk, &init_net);
+ spin_unlock(&sk->sk_lock.slock);
}
-/*
- * Send an ICMP frame.
- */
-
-/*
- * Check transmit rate limitation for given message.
- * The rate information is held in the destination cache now.
- * This function is generic and could be used for other purposes
- * too. It uses a Token bucket filter as suggested by Alexey Kuznetsov.
- *
- * Note that the same dst_entry fields are modified by functions in
- * route.c too, but these work for packet destinations while xrlim_allow
- * works for icmp destinations. This means the rate limiting information
- * for one "ip object" is shared - and these ICMPs are twice limited:
- * by source and by destination.
+/**
+ * icmp_global_allow - Are we allowed to send one more ICMP message ?
+ * @net: network namespace
*
- * RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate
- * SHOULD allow setting of rate limits
- *
- * Shared between ICMPv4 and ICMPv6.
+ * Uses a token bucket to limit our ICMP messages to ~sysctl_icmp_msgs_per_sec.
+ * Returns false if we reached the limit and can not send another packet.
+ * Works in tandem with icmp_global_consume().
*/
-#define XRLIM_BURST_FACTOR 6
-int xrlim_allow(struct dst_entry *dst, int timeout)
+bool icmp_global_allow(struct net *net)
{
- unsigned long now;
- int rc = 0;
+ u32 delta, now, oldstamp;
+ int incr, new, old;
+
+ /* Note: many cpus could find this condition true.
+ * Then later icmp_global_consume() could consume more credits,
+ * this is an acceptable race.
+ */
+ if (atomic_read(&net->ipv4.icmp_global_credit) > 0)
+ return true;
now = jiffies;
- dst->rate_tokens += now - dst->rate_last;
- dst->rate_last = now;
- if (dst->rate_tokens > XRLIM_BURST_FACTOR * timeout)
- dst->rate_tokens = XRLIM_BURST_FACTOR * timeout;
- if (dst->rate_tokens >= timeout) {
- dst->rate_tokens -= timeout;
- rc = 1;
+ oldstamp = READ_ONCE(net->ipv4.icmp_global_stamp);
+ delta = min_t(u32, now - oldstamp, HZ);
+ if (delta < HZ / 50)
+ return false;
+
+ incr = READ_ONCE(net->ipv4.sysctl_icmp_msgs_per_sec) * delta / HZ;
+ if (!incr)
+ return false;
+
+ if (cmpxchg(&net->ipv4.icmp_global_stamp, oldstamp, now) == oldstamp) {
+ old = atomic_read(&net->ipv4.icmp_global_credit);
+ do {
+ new = min(old + incr, READ_ONCE(net->ipv4.sysctl_icmp_msgs_burst));
+ } while (!atomic_try_cmpxchg(&net->ipv4.icmp_global_credit, &old, new));
}
- return rc;
+ return true;
}
+EXPORT_SYMBOL(icmp_global_allow);
-static inline int icmpv4_xrlim_allow(struct rtable *rt, int type, int code)
+void icmp_global_consume(struct net *net)
{
- struct dst_entry *dst = &rt->u.dst;
- int rc = 1;
+ int credits = get_random_u32_below(3);
+
+ /* Note: this might make icmp_global.credit negative. */
+ if (credits)
+ atomic_sub(credits, &net->ipv4.icmp_global_credit);
+}
+EXPORT_SYMBOL(icmp_global_consume);
+static bool icmpv4_mask_allow(struct net *net, int type, int code)
+{
if (type > NR_ICMP_TYPES)
- goto out;
+ return true;
/* Don't limit PMTU discovery. */
if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
- goto out;
+ return true;
+
+ /* Limit if icmp type is enabled in ratemask. */
+ if (!((1 << type) & READ_ONCE(net->ipv4.sysctl_icmp_ratemask)))
+ return true;
+
+ return false;
+}
+
+static bool icmpv4_global_allow(struct net *net, int type, int code,
+ bool *apply_ratelimit)
+{
+ if (icmpv4_mask_allow(net, type, code))
+ return true;
+
+ if (icmp_global_allow(net)) {
+ *apply_ratelimit = true;
+ return true;
+ }
+ __ICMP_INC_STATS(net, ICMP_MIB_RATELIMITGLOBAL);
+ return false;
+}
+
+/*
+ * Send an ICMP frame.
+ */
+
+static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
+ struct flowi4 *fl4, int type, int code,
+ bool apply_ratelimit)
+{
+ struct dst_entry *dst = &rt->dst;
+ struct inet_peer *peer;
+ struct net_device *dev;
+ bool rc = true;
+
+ if (!apply_ratelimit)
+ return true;
/* No rate limit on loopback */
- if (dst->dev && (dst->dev->flags&IFF_LOOPBACK))
- goto out;
+ rcu_read_lock();
+ dev = dst_dev_rcu(dst);
+ if (dev && (dev->flags & IFF_LOOPBACK))
+ goto out;
- /* Limit if icmp type is enabled in ratemask. */
- if ((1 << type) & sysctl_icmp_ratemask)
- rc = xrlim_allow(dst, sysctl_icmp_ratelimit);
+ peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr,
+ l3mdev_master_ifindex_rcu(dev));
+ rc = inet_peer_xrlim_allow(peer,
+ READ_ONCE(net->ipv4.sysctl_icmp_ratelimit));
out:
+ rcu_read_unlock();
+ if (!rc)
+ __ICMP_INC_STATS(net, ICMP_MIB_RATELIMITHOST);
+ else
+ icmp_global_consume(net);
return rc;
}
/*
* Maintain the counters used in the SNMP statistics for outgoing ICMP
*/
-static void icmp_out_count(int type)
+void icmp_out_count(struct net *net, unsigned char type)
{
- if (type <= NR_ICMP_TYPES) {
- ICMP_INC_STATS(icmp_pointers[type].output_entry);
- ICMP_INC_STATS(ICMP_MIB_OUTMSGS);
- }
+ ICMPMSGOUT_INC_STATS(net, type);
+ ICMP_INC_STATS(net, ICMP_MIB_OUTMSGS);
}
/*
@@ -331,12 +353,12 @@ static void icmp_out_count(int type)
static int icmp_glue_bits(void *from, char *to, int offset, int len, int odd,
struct sk_buff *skb)
{
- struct icmp_bxm *icmp_param = (struct icmp_bxm *)from;
+ struct icmp_bxm *icmp_param = from;
__wsum csum;
csum = skb_copy_and_csum_bits(icmp_param->skb,
icmp_param->offset + offset,
- to, len, 0);
+ to, len);
skb->csum = csum_block_add(skb->csum, csum, odd);
if (icmp_pointers[icmp_param->data.icmph.type].error)
@@ -344,30 +366,33 @@ static int icmp_glue_bits(void *from, char *to, int offset, int len, int odd,
return 0;
}
-static void icmp_push_reply(struct icmp_bxm *icmp_param,
- struct ipcm_cookie *ipc, struct rtable *rt)
+static void icmp_push_reply(struct sock *sk,
+ struct icmp_bxm *icmp_param,
+ struct flowi4 *fl4,
+ struct ipcm_cookie *ipc, struct rtable **rt)
{
struct sk_buff *skb;
- if (ip_append_data(icmp_socket->sk, icmp_glue_bits, icmp_param,
- icmp_param->data_len+icmp_param->head_len,
- icmp_param->head_len,
- ipc, rt, MSG_DONTWAIT) < 0)
- ip_flush_pending_frames(icmp_socket->sk);
- else if ((skb = skb_peek(&icmp_socket->sk->sk_write_queue)) != NULL) {
- struct icmphdr *icmph = skb->h.icmph;
- __wsum csum = 0;
+ if (ip_append_data(sk, fl4, icmp_glue_bits, icmp_param,
+ icmp_param->data_len+icmp_param->head_len,
+ icmp_param->head_len,
+ ipc, rt, MSG_DONTWAIT) < 0) {
+ __ICMP_INC_STATS(sock_net(sk), ICMP_MIB_OUTERRORS);
+ ip_flush_pending_frames(sk);
+ } else if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
+ struct icmphdr *icmph = icmp_hdr(skb);
+ __wsum csum;
struct sk_buff *skb1;
- skb_queue_walk(&icmp_socket->sk->sk_write_queue, skb1) {
- csum = csum_add(csum, skb1->csum);
- }
csum = csum_partial_copy_nocheck((void *)&icmp_param->data,
(char *)icmph,
- icmp_param->head_len, csum);
+ icmp_param->head_len);
+ skb_queue_walk(&sk->sk_write_queue, skb1) {
+ csum = csum_add(csum, skb1->csum);
+ }
icmph->checksum = csum_fold(csum);
skb->ip_summed = CHECKSUM_NONE;
- ip_push_pending_frames(icmp_socket->sk);
+ ip_push_pending_frames(sk, fl4);
}
}
@@ -377,47 +402,364 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param,
static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
{
- struct sock *sk = icmp_socket->sk;
- struct inet_sock *inet = inet_sk(sk);
+ struct rtable *rt = skb_rtable(skb);
+ struct net *net = dev_net_rcu(rt->dst.dev);
+ bool apply_ratelimit = false;
struct ipcm_cookie ipc;
- struct rtable *rt = (struct rtable *)skb->dst;
- __be32 daddr;
-
- if (ip_options_echo(&icmp_param->replyopts, skb))
+ struct flowi4 fl4;
+ struct sock *sk;
+ __be32 daddr, saddr;
+ u32 mark = IP4_REPLY_MARK(net, skb->mark);
+ int type = icmp_param->data.icmph.type;
+ int code = icmp_param->data.icmph.code;
+
+ if (ip_options_echo(net, &icmp_param->replyopts.opt.opt, skb))
return;
- if (icmp_xmit_lock())
- return;
+ /* Needed by both icmpv4_global_allow and icmp_xmit_lock */
+ local_bh_disable();
+
+ /* is global icmp_msgs_per_sec exhausted ? */
+ if (!icmpv4_global_allow(net, type, code, &apply_ratelimit))
+ goto out_bh_enable;
+
+ sk = icmp_xmit_lock(net);
+ if (!sk)
+ goto out_bh_enable;
icmp_param->data.icmph.checksum = 0;
- icmp_out_count(icmp_param->data.icmph.type);
-
- inet->tos = skb->nh.iph->tos;
- daddr = ipc.addr = rt->rt_src;
- ipc.opt = NULL;
- if (icmp_param->replyopts.optlen) {
- ipc.opt = &icmp_param->replyopts;
- if (ipc.opt->srr)
- daddr = icmp_param->replyopts.faddr;
- }
- {
- struct flowi fl = { .nl_u = { .ip4_u =
- { .daddr = daddr,
- .saddr = rt->rt_spec_dst,
- .tos = RT_TOS(skb->nh.iph->tos) } },
- .proto = IPPROTO_ICMP };
- security_skb_classify_flow(skb, &fl);
- if (ip_route_output_key(&rt, &fl))
- goto out_unlock;
+
+ ipcm_init(&ipc);
+ ipc.tos = ip_hdr(skb)->tos;
+ ipc.sockc.mark = mark;
+ daddr = ipc.addr = ip_hdr(skb)->saddr;
+ saddr = fib_compute_spec_dst(skb);
+
+ if (icmp_param->replyopts.opt.opt.optlen) {
+ ipc.opt = &icmp_param->replyopts.opt;
+ if (ipc.opt->opt.srr)
+ daddr = icmp_param->replyopts.opt.opt.faddr;
}
- if (icmpv4_xrlim_allow(rt, icmp_param->data.icmph.type,
- icmp_param->data.icmph.code))
- icmp_push_reply(icmp_param, &ipc, rt);
+ memset(&fl4, 0, sizeof(fl4));
+ fl4.daddr = daddr;
+ fl4.saddr = saddr;
+ fl4.flowi4_mark = mark;
+ fl4.flowi4_uid = sock_net_uid(net, NULL);
+ fl4.flowi4_dscp = ip4h_dscp(ip_hdr(skb));
+ fl4.flowi4_proto = IPPROTO_ICMP;
+ fl4.flowi4_oif = l3mdev_master_ifindex(skb->dev);
+ security_skb_classify_flow(skb, flowi4_to_flowi_common(&fl4));
+ rt = ip_route_output_key(net, &fl4);
+ if (IS_ERR(rt))
+ goto out_unlock;
+ if (icmpv4_xrlim_allow(net, rt, &fl4, type, code, apply_ratelimit))
+ icmp_push_reply(sk, icmp_param, &fl4, &ipc, &rt);
ip_rt_put(rt);
out_unlock:
- icmp_xmit_unlock();
+ icmp_xmit_unlock(sk);
+out_bh_enable:
+ local_bh_enable();
}
+/*
+ * The device used for looking up which routing table to use for sending an ICMP
+ * error is preferably the source whenever it is set, which should ensure the
+ * icmp error can be sent to the source host, else lookup using the routing
+ * table of the destination device, else use the main routing table (index 0).
+ */
+static struct net_device *icmp_get_route_lookup_dev(struct sk_buff *skb)
+{
+ struct net_device *dev = skb->dev;
+ const struct dst_entry *dst;
+
+ if (dev)
+ return dev;
+ dst = skb_dst(skb);
+ return dst ? dst_dev(dst) : NULL;
+}
+
+static struct rtable *icmp_route_lookup(struct net *net, struct flowi4 *fl4,
+ struct sk_buff *skb_in,
+ const struct iphdr *iph, __be32 saddr,
+ dscp_t dscp, u32 mark, int type,
+ int code, struct icmp_bxm *param)
+{
+ struct net_device *route_lookup_dev;
+ struct dst_entry *dst, *dst2;
+ struct rtable *rt, *rt2;
+ struct flowi4 fl4_dec;
+ int err;
+
+ memset(fl4, 0, sizeof(*fl4));
+ fl4->daddr = (param->replyopts.opt.opt.srr ?
+ param->replyopts.opt.opt.faddr : iph->saddr);
+ fl4->saddr = saddr;
+ fl4->flowi4_mark = mark;
+ fl4->flowi4_uid = sock_net_uid(net, NULL);
+ fl4->flowi4_dscp = dscp;
+ fl4->flowi4_proto = IPPROTO_ICMP;
+ fl4->fl4_icmp_type = type;
+ fl4->fl4_icmp_code = code;
+ route_lookup_dev = icmp_get_route_lookup_dev(skb_in);
+ fl4->flowi4_oif = l3mdev_master_ifindex(route_lookup_dev);
+
+ security_skb_classify_flow(skb_in, flowi4_to_flowi_common(fl4));
+ rt = ip_route_output_key_hash(net, fl4, skb_in);
+ if (IS_ERR(rt))
+ return rt;
+
+ /* No need to clone since we're just using its address. */
+ rt2 = rt;
+
+ dst = xfrm_lookup(net, &rt->dst,
+ flowi4_to_flowi(fl4), NULL, 0);
+ rt = dst_rtable(dst);
+ if (!IS_ERR(dst)) {
+ if (rt != rt2)
+ return rt;
+ if (inet_addr_type_dev_table(net, route_lookup_dev,
+ fl4->daddr) == RTN_LOCAL)
+ return rt;
+ } else if (PTR_ERR(dst) == -EPERM) {
+ rt = NULL;
+ } else {
+ return rt;
+ }
+ err = xfrm_decode_session_reverse(net, skb_in, flowi4_to_flowi(&fl4_dec), AF_INET);
+ if (err)
+ goto relookup_failed;
+
+ if (inet_addr_type_dev_table(net, route_lookup_dev,
+ fl4_dec.saddr) == RTN_LOCAL) {
+ rt2 = __ip_route_output_key(net, &fl4_dec);
+ if (IS_ERR(rt2))
+ err = PTR_ERR(rt2);
+ } else {
+ struct flowi4 fl4_2 = {};
+ unsigned long orefdst;
+
+ fl4_2.daddr = fl4_dec.saddr;
+ rt2 = ip_route_output_key(net, &fl4_2);
+ if (IS_ERR(rt2)) {
+ err = PTR_ERR(rt2);
+ goto relookup_failed;
+ }
+ /* Ugh! */
+ orefdst = skb_dstref_steal(skb_in);
+ err = ip_route_input(skb_in, fl4_dec.daddr, fl4_dec.saddr,
+ dscp, rt2->dst.dev) ? -EINVAL : 0;
+
+ dst_release(&rt2->dst);
+ rt2 = skb_rtable(skb_in);
+ /* steal dst entry from skb_in, don't drop refcnt */
+ skb_dstref_steal(skb_in);
+ skb_dstref_restore(skb_in, orefdst);
+ }
+
+ if (err)
+ goto relookup_failed;
+
+ dst2 = xfrm_lookup(net, &rt2->dst, flowi4_to_flowi(&fl4_dec), NULL,
+ XFRM_LOOKUP_ICMP);
+ rt2 = dst_rtable(dst2);
+ if (!IS_ERR(dst2)) {
+ dst_release(&rt->dst);
+ memcpy(fl4, &fl4_dec, sizeof(*fl4));
+ rt = rt2;
+ } else if (PTR_ERR(dst2) == -EPERM) {
+ if (rt)
+ dst_release(&rt->dst);
+ return rt2;
+ } else {
+ err = PTR_ERR(dst2);
+ goto relookup_failed;
+ }
+ return rt;
+
+relookup_failed:
+ if (rt)
+ return rt;
+ return ERR_PTR(err);
+}
+
+struct icmp_ext_iio_addr4_subobj {
+ __be16 afi;
+ __be16 reserved;
+ __be32 addr4;
+};
+
+static unsigned int icmp_ext_iio_len(void)
+{
+ return sizeof(struct icmp_extobj_hdr) +
+ /* ifIndex */
+ sizeof(__be32) +
+ /* Interface Address Sub-Object */
+ sizeof(struct icmp_ext_iio_addr4_subobj) +
+ /* Interface Name Sub-Object. Length must be a multiple of 4
+ * bytes.
+ */
+ ALIGN(sizeof(struct icmp_ext_iio_name_subobj), 4) +
+ /* MTU */
+ sizeof(__be32);
+}
+
+static unsigned int icmp_ext_max_len(u8 ext_objs)
+{
+ unsigned int ext_max_len;
+
+ ext_max_len = sizeof(struct icmp_ext_hdr);
+
+ if (ext_objs & BIT(ICMP_ERR_EXT_IIO_IIF))
+ ext_max_len += icmp_ext_iio_len();
+
+ return ext_max_len;
+}
+
+static __be32 icmp_ext_iio_addr4_find(const struct net_device *dev)
+{
+ struct in_device *in_dev;
+ struct in_ifaddr *ifa;
+
+ in_dev = __in_dev_get_rcu(dev);
+ if (!in_dev)
+ return 0;
+
+ /* It is unclear from RFC 5837 which IP address should be chosen, but
+ * it makes sense to choose a global unicast address.
+ */
+ in_dev_for_each_ifa_rcu(ifa, in_dev) {
+ if (READ_ONCE(ifa->ifa_flags) & IFA_F_SECONDARY)
+ continue;
+ if (ifa->ifa_scope != RT_SCOPE_UNIVERSE ||
+ ipv4_is_multicast(ifa->ifa_address))
+ continue;
+ return ifa->ifa_address;
+ }
+
+ return 0;
+}
+
+static void icmp_ext_iio_iif_append(struct net *net, struct sk_buff *skb,
+ int iif)
+{
+ struct icmp_ext_iio_name_subobj *name_subobj;
+ struct icmp_extobj_hdr *objh;
+ struct net_device *dev;
+ __be32 data;
+
+ if (!iif)
+ return;
+
+ /* Add the fields in the order specified by RFC 5837. */
+ objh = skb_put(skb, sizeof(*objh));
+ objh->class_num = ICMP_EXT_OBJ_CLASS_IIO;
+ objh->class_type = ICMP_EXT_CTYPE_IIO_ROLE(ICMP_EXT_CTYPE_IIO_ROLE_IIF);
+
+ data = htonl(iif);
+ skb_put_data(skb, &data, sizeof(__be32));
+ objh->class_type |= ICMP_EXT_CTYPE_IIO_IFINDEX;
+
+ rcu_read_lock();
+
+ dev = dev_get_by_index_rcu(net, iif);
+ if (!dev)
+ goto out;
+
+ data = icmp_ext_iio_addr4_find(dev);
+ if (data) {
+ struct icmp_ext_iio_addr4_subobj *addr4_subobj;
+
+ addr4_subobj = skb_put_zero(skb, sizeof(*addr4_subobj));
+ addr4_subobj->afi = htons(ICMP_AFI_IP);
+ addr4_subobj->addr4 = data;
+ objh->class_type |= ICMP_EXT_CTYPE_IIO_IPADDR;
+ }
+
+ name_subobj = skb_put_zero(skb, ALIGN(sizeof(*name_subobj), 4));
+ name_subobj->len = ALIGN(sizeof(*name_subobj), 4);
+ netdev_copy_name(dev, name_subobj->name);
+ objh->class_type |= ICMP_EXT_CTYPE_IIO_NAME;
+
+ data = htonl(READ_ONCE(dev->mtu));
+ skb_put_data(skb, &data, sizeof(__be32));
+ objh->class_type |= ICMP_EXT_CTYPE_IIO_MTU;
+
+out:
+ rcu_read_unlock();
+ objh->length = htons(skb_tail_pointer(skb) - (unsigned char *)objh);
+}
+
+static void icmp_ext_objs_append(struct net *net, struct sk_buff *skb,
+ u8 ext_objs, int iif)
+{
+ if (ext_objs & BIT(ICMP_ERR_EXT_IIO_IIF))
+ icmp_ext_iio_iif_append(net, skb, iif);
+}
+
+static struct sk_buff *
+icmp_ext_append(struct net *net, struct sk_buff *skb_in, struct icmphdr *icmph,
+ unsigned int room, int iif)
+{
+ unsigned int payload_len, ext_max_len, ext_len;
+ struct icmp_ext_hdr *ext_hdr;
+ struct sk_buff *skb;
+ u8 ext_objs;
+ int nhoff;
+
+ switch (icmph->type) {
+ case ICMP_DEST_UNREACH:
+ case ICMP_TIME_EXCEEDED:
+ case ICMP_PARAMETERPROB:
+ break;
+ default:
+ return NULL;
+ }
+
+ ext_objs = READ_ONCE(net->ipv4.sysctl_icmp_errors_extension_mask);
+ if (!ext_objs)
+ return NULL;
+
+ ext_max_len = icmp_ext_max_len(ext_objs);
+ if (ICMP_EXT_ORIG_DGRAM_MIN_LEN + ext_max_len > room)
+ return NULL;
+
+ skb = skb_clone(skb_in, GFP_ATOMIC);
+ if (!skb)
+ return NULL;
+
+ nhoff = skb_network_offset(skb);
+ payload_len = min(skb->len - nhoff, ICMP_EXT_ORIG_DGRAM_MIN_LEN);
+
+ if (!pskb_network_may_pull(skb, payload_len))
+ goto free_skb;
+
+ if (pskb_trim(skb, nhoff + ICMP_EXT_ORIG_DGRAM_MIN_LEN) ||
+ __skb_put_padto(skb, nhoff + ICMP_EXT_ORIG_DGRAM_MIN_LEN, false))
+ goto free_skb;
+
+ if (pskb_expand_head(skb, 0, ext_max_len, GFP_ATOMIC))
+ goto free_skb;
+
+ ext_hdr = skb_put_zero(skb, sizeof(*ext_hdr));
+ ext_hdr->version = ICMP_EXT_VERSION_2;
+
+ icmp_ext_objs_append(net, skb, ext_objs, iif);
+
+ /* Do not send an empty extension structure. */
+ ext_len = skb_tail_pointer(skb) - (unsigned char *)ext_hdr;
+ if (ext_len == sizeof(*ext_hdr))
+ goto free_skb;
+
+ ext_hdr->checksum = ip_compute_csum(ext_hdr, ext_len);
+ /* The length of the original datagram in 32-bit words (RFC 4884). */
+ icmph->un.reserved[1] = ICMP_EXT_ORIG_DGRAM_MIN_LEN / sizeof(u32);
+
+ return skb;
+
+free_skb:
+ consume_skb(skb);
+ return NULL;
+}
/*
* Send an ICMP message in response to a situation
@@ -430,17 +772,33 @@ out_unlock:
* MUST reply to only the first fragment.
*/
-void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
+void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
+ const struct inet_skb_parm *parm)
{
struct iphdr *iph;
int room;
struct icmp_bxm icmp_param;
- struct rtable *rt = (struct rtable *)skb_in->dst;
+ struct rtable *rt = skb_rtable(skb_in);
+ bool apply_ratelimit = false;
+ struct sk_buff *ext_skb;
struct ipcm_cookie ipc;
+ struct flowi4 fl4;
__be32 saddr;
u8 tos;
+ u32 mark;
+ struct net *net;
+ struct sock *sk;
if (!rt)
+ return;
+
+ rcu_read_lock();
+
+ if (rt->dst.dev)
+ net = dev_net_rcu(rt->dst.dev);
+ else if (skb_in->dev)
+ net = dev_net_rcu(skb_in->dev);
+ else
goto out;
/*
@@ -448,9 +806,11 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
* Check this, icmp_send is called from the most obscure devices
* sometimes.
*/
- iph = skb_in->nh.iph;
+ iph = ip_hdr(skb_in);
- if ((u8 *)iph < skb_in->head || (u8 *)(iph + 1) > skb_in->tail)
+ if ((u8 *)iph < skb_in->head ||
+ (skb_network_header(skb_in) + sizeof(*iph)) >
+ skb_tail_pointer(skb_in))
goto out;
/*
@@ -484,14 +844,14 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
u8 _inner_type, *itp;
itp = skb_header_pointer(skb_in,
- skb_in->nh.raw +
+ skb_network_header(skb_in) +
(iph->ihl << 2) +
offsetof(struct icmphdr,
type) -
skb_in->data,
sizeof(_inner_type),
&_inner_type);
- if (itp == NULL)
+ if (!itp)
goto out;
/*
@@ -504,8 +864,20 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
}
}
- if (icmp_xmit_lock())
- return;
+ /* Needed by both icmpv4_global_allow and icmp_xmit_lock */
+ local_bh_disable();
+
+ /* Check global sysctl_icmp_msgs_per_sec ratelimit, unless
+ * incoming dev is loopback. If outgoing dev change to not be
+ * loopback, then peer ratelimit still work (in icmpv4_xrlim_allow)
+ */
+ if (!(skb_in->dev && (skb_in->dev->flags&IFF_LOOPBACK)) &&
+ !icmpv4_global_allow(net, type, code, &apply_ratelimit))
+ goto out_bh_enable;
+
+ sk = icmp_xmit_lock(net);
+ if (!sk)
+ goto out_bh_enable;
/*
* Construct source address and options.
@@ -513,17 +885,29 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
saddr = iph->daddr;
if (!(rt->rt_flags & RTCF_LOCAL)) {
- if (sysctl_icmp_errors_use_inbound_ifaddr)
- saddr = inet_select_addr(skb_in->dev, 0, RT_SCOPE_LINK);
+ struct net_device *dev = NULL;
+
+ rcu_read_lock();
+ if (rt_is_input_route(rt) &&
+ READ_ONCE(net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr))
+ dev = dev_get_by_index_rcu(net, parm->iif ? parm->iif :
+ inet_iif(skb_in));
+
+ if (dev)
+ saddr = inet_select_addr(dev, iph->saddr,
+ RT_SCOPE_LINK);
else
saddr = 0;
+ rcu_read_unlock();
}
- tos = icmp_pointers[type].error ? ((iph->tos & IPTOS_TOS_MASK) |
+ tos = icmp_pointers[type].error ? (RT_TOS(iph->tos) |
IPTOS_PREC_INTERNETCONTROL) :
- iph->tos;
+ iph->tos;
+ mark = IP4_REPLY_MARK(net, skb_in->mark);
- if (ip_options_echo(&icmp_param.replyopts, skb_in))
+ if (__ip_options_echo(net, &icmp_param.replyopts.opt.opt, skb_in,
+ &parm->opt))
goto out_unlock;
@@ -536,74 +920,154 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
icmp_param.data.icmph.un.gateway = info;
icmp_param.data.icmph.checksum = 0;
icmp_param.skb = skb_in;
- icmp_param.offset = skb_in->nh.raw - skb_in->data;
- icmp_out_count(icmp_param.data.icmph.type);
- inet_sk(icmp_socket->sk)->tos = tos;
+ icmp_param.offset = skb_network_offset(skb_in);
+ ipcm_init(&ipc);
+ ipc.tos = tos;
ipc.addr = iph->saddr;
- ipc.opt = &icmp_param.replyopts;
+ ipc.opt = &icmp_param.replyopts.opt;
+ ipc.sockc.mark = mark;
- {
- struct flowi fl = {
- .nl_u = {
- .ip4_u = {
- .daddr = icmp_param.replyopts.srr ?
- icmp_param.replyopts.faddr :
- iph->saddr,
- .saddr = saddr,
- .tos = RT_TOS(tos)
- }
- },
- .proto = IPPROTO_ICMP,
- .uli_u = {
- .icmpt = {
- .type = type,
- .code = code
- }
- }
- };
- security_skb_classify_flow(skb_in, &fl);
- if (ip_route_output_key(&rt, &fl))
- goto out_unlock;
- }
+ rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr,
+ inet_dsfield_to_dscp(tos), mark, type, code,
+ &icmp_param);
+ if (IS_ERR(rt))
+ goto out_unlock;
- if (!icmpv4_xrlim_allow(rt, type, code))
+ /* peer icmp_ratelimit */
+ if (!icmpv4_xrlim_allow(net, rt, &fl4, type, code, apply_ratelimit))
goto ende;
/* RFC says return as much as we can without exceeding 576 bytes. */
- room = dst_mtu(&rt->u.dst);
+ room = dst_mtu(&rt->dst);
if (room > 576)
room = 576;
- room -= sizeof(struct iphdr) + icmp_param.replyopts.optlen;
+ room -= sizeof(struct iphdr) + icmp_param.replyopts.opt.opt.optlen;
room -= sizeof(struct icmphdr);
+ /* Guard against tiny mtu. We need to include at least one
+ * IP network header for this message to make any sense.
+ */
+ if (room <= (int)sizeof(struct iphdr))
+ goto ende;
+
+ ext_skb = icmp_ext_append(net, skb_in, &icmp_param.data.icmph, room,
+ parm->iif);
+ if (ext_skb)
+ icmp_param.skb = ext_skb;
- icmp_param.data_len = skb_in->len - icmp_param.offset;
+ icmp_param.data_len = icmp_param.skb->len - icmp_param.offset;
if (icmp_param.data_len > room)
icmp_param.data_len = room;
icmp_param.head_len = sizeof(struct icmphdr);
- icmp_push_reply(&icmp_param, &ipc, rt);
+ /* if we don't have a source address at this point, fall back to the
+ * dummy address instead of sending out a packet with a source address
+ * of 0.0.0.0
+ */
+ if (!fl4.saddr)
+ fl4.saddr = htonl(INADDR_DUMMY);
+
+ trace_icmp_send(skb_in, type, code);
+
+ icmp_push_reply(sk, &icmp_param, &fl4, &ipc, &rt);
+
+ if (ext_skb)
+ consume_skb(ext_skb);
ende:
ip_rt_put(rt);
out_unlock:
- icmp_xmit_unlock();
-out:;
+ icmp_xmit_unlock(sk);
+out_bh_enable:
+ local_bh_enable();
+out:
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL(__icmp_send);
+
+#if IS_ENABLED(CONFIG_NF_NAT)
+#include <net/netfilter/nf_conntrack.h>
+void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info)
+{
+ struct sk_buff *cloned_skb = NULL;
+ enum ip_conntrack_info ctinfo;
+ enum ip_conntrack_dir dir;
+ struct inet_skb_parm parm;
+ struct nf_conn *ct;
+ __be32 orig_ip;
+
+ memset(&parm, 0, sizeof(parm));
+ ct = nf_ct_get(skb_in, &ctinfo);
+ if (!ct || !(READ_ONCE(ct->status) & IPS_NAT_MASK)) {
+ __icmp_send(skb_in, type, code, info, &parm);
+ return;
+ }
+
+ if (skb_shared(skb_in))
+ skb_in = cloned_skb = skb_clone(skb_in, GFP_ATOMIC);
+
+ if (unlikely(!skb_in || skb_network_header(skb_in) < skb_in->head ||
+ (skb_network_header(skb_in) + sizeof(struct iphdr)) >
+ skb_tail_pointer(skb_in) || skb_ensure_writable(skb_in,
+ skb_network_offset(skb_in) + sizeof(struct iphdr))))
+ goto out;
+
+ orig_ip = ip_hdr(skb_in)->saddr;
+ dir = CTINFO2DIR(ctinfo);
+ ip_hdr(skb_in)->saddr = ct->tuplehash[dir].tuple.src.u3.ip;
+ __icmp_send(skb_in, type, code, info, &parm);
+ ip_hdr(skb_in)->saddr = orig_ip;
+out:
+ consume_skb(cloned_skb);
}
+EXPORT_SYMBOL(icmp_ndo_send);
+#endif
+
+static void icmp_socket_deliver(struct sk_buff *skb, u32 info)
+{
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
+ const struct net_protocol *ipprot;
+ int protocol = iph->protocol;
+ /* Checkin full IP header plus 8 bytes of protocol to
+ * avoid additional coding at protocol handlers.
+ */
+ if (!pskb_may_pull(skb, iph->ihl * 4 + 8)) {
+ __ICMP_INC_STATS(dev_net_rcu(skb->dev), ICMP_MIB_INERRORS);
+ return;
+ }
+
+ raw_icmp_error(skb, protocol, info);
+
+ ipprot = rcu_dereference(inet_protos[protocol]);
+ if (ipprot && ipprot->err_handler)
+ ipprot->err_handler(skb, info);
+}
+
+static bool icmp_tag_validation(int proto)
+{
+ bool ok;
+
+ rcu_read_lock();
+ ok = rcu_dereference(inet_protos[proto])->icmp_strict_tag_validation;
+ rcu_read_unlock();
+ return ok;
+}
/*
- * Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEED, and ICMP_QUENCH.
+ * Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEEDED, ICMP_QUENCH, and
+ * ICMP_PARAMETERPROB.
*/
-static void icmp_unreach(struct sk_buff *skb)
+static enum skb_drop_reason icmp_unreach(struct sk_buff *skb)
{
- struct iphdr *iph;
+ enum skb_drop_reason reason = SKB_NOT_DROPPED_YET;
+ const struct iphdr *iph;
struct icmphdr *icmph;
- int hash, protocol;
- struct net_protocol *ipprot;
- struct sock *raw_sk;
+ struct net *net;
u32 info = 0;
+ net = skb_dst_dev_net_rcu(skb);
+
/*
* Incomplete header ?
* Only checks for the IP header, there should be an
@@ -613,13 +1077,16 @@ static void icmp_unreach(struct sk_buff *skb)
if (!pskb_may_pull(skb, sizeof(struct iphdr)))
goto out_err;
- icmph = skb->h.icmph;
- iph = (struct iphdr *)skb->data;
+ icmph = icmp_hdr(skb);
+ iph = (const struct iphdr *)skb->data;
- if (iph->ihl < 5) /* Mangled header, drop. */
+ if (iph->ihl < 5) { /* Mangled header, drop. */
+ reason = SKB_DROP_REASON_IP_INHDR;
goto out_err;
+ }
- if (icmph->type == ICMP_DEST_UNREACH) {
+ switch (icmph->type) {
+ case ICMP_DEST_UNREACH:
switch (icmph->code & 15) {
case ICMP_NET_UNREACH:
case ICMP_HOST_UNREACH:
@@ -627,30 +1094,44 @@ static void icmp_unreach(struct sk_buff *skb)
case ICMP_PORT_UNREACH:
break;
case ICMP_FRAG_NEEDED:
- if (ipv4_config.no_pmtu_disc) {
- LIMIT_NETDEBUG(KERN_INFO "ICMP: %u.%u.%u.%u: "
- "fragmentation needed "
- "and DF set.\n",
- NIPQUAD(iph->daddr));
- } else {
- info = ip_rt_frag_needed(iph,
- ntohs(icmph->un.frag.mtu));
- if (!info)
+ /* for documentation of the ip_no_pmtu_disc
+ * values please see
+ * Documentation/networking/ip-sysctl.rst
+ */
+ switch (READ_ONCE(net->ipv4.sysctl_ip_no_pmtu_disc)) {
+ default:
+ net_dbg_ratelimited("%pI4: fragmentation needed and DF set\n",
+ &iph->daddr);
+ break;
+ case 2:
+ goto out;
+ case 3:
+ if (!icmp_tag_validation(iph->protocol))
goto out;
+ fallthrough;
+ case 0:
+ info = ntohs(icmph->un.frag.mtu);
}
break;
case ICMP_SR_FAILED:
- LIMIT_NETDEBUG(KERN_INFO "ICMP: %u.%u.%u.%u: Source "
- "Route Failed.\n",
- NIPQUAD(iph->daddr));
+ net_dbg_ratelimited("%pI4: Source Route Failed\n",
+ &iph->daddr);
break;
default:
break;
}
if (icmph->code > NR_ICMP_UNREACH)
goto out;
- } else if (icmph->type == ICMP_PARAMETERPROB)
+ break;
+ case ICMP_PARAMETERPROB:
info = ntohl(icmph->un.gateway) >> 24;
+ break;
+ case ICMP_TIME_EXCEEDED:
+ __ICMP_INC_STATS(net, ICMP_MIB_INTIMEEXCDS);
+ if (icmph->code == ICMP_EXC_FRAGTIME)
+ goto out;
+ break;
+ }
/*
* Throw it at our lower layers
@@ -664,63 +1145,28 @@ static void icmp_unreach(struct sk_buff *skb)
*/
/*
- * Check the other end isnt violating RFC 1122. Some routers send
+ * Check the other end isn't violating RFC 1122. Some routers send
* bogus responses to broadcast frames. If you see this message
* first check your netmask matches at both ends, if it does then
* get the other vendor to fix their kit.
*/
- if (!sysctl_icmp_ignore_bogus_error_responses &&
- inet_addr_type(iph->daddr) == RTN_BROADCAST) {
- if (net_ratelimit())
- printk(KERN_WARNING "%u.%u.%u.%u sent an invalid ICMP "
- "type %u, code %u "
- "error to a broadcast: %u.%u.%u.%u on %s\n",
- NIPQUAD(skb->nh.iph->saddr),
- icmph->type, icmph->code,
- NIPQUAD(iph->daddr),
- skb->dev->name);
+ if (!READ_ONCE(net->ipv4.sysctl_icmp_ignore_bogus_error_responses) &&
+ inet_addr_type_dev_table(net, skb->dev, iph->daddr) == RTN_BROADCAST) {
+ net_warn_ratelimited("%pI4 sent an invalid ICMP type %u, code %u error to a broadcast: %pI4 on %s\n",
+ &ip_hdr(skb)->saddr,
+ icmph->type, icmph->code,
+ &iph->daddr, skb->dev->name);
goto out;
}
- /* Checkin full IP header plus 8 bytes of protocol to
- * avoid additional coding at protocol handlers.
- */
- if (!pskb_may_pull(skb, iph->ihl * 4 + 8))
- goto out;
-
- iph = (struct iphdr *)skb->data;
- protocol = iph->protocol;
-
- /*
- * Deliver ICMP message to raw sockets. Pretty useless feature?
- */
-
- /* Note: See raw.c and net/raw.h, RAWV4_HTABLE_SIZE==MAX_INET_PROTOS */
- hash = protocol & (MAX_INET_PROTOS - 1);
- read_lock(&raw_v4_lock);
- if ((raw_sk = sk_head(&raw_v4_htable[hash])) != NULL) {
- while ((raw_sk = __raw_v4_lookup(raw_sk, protocol, iph->daddr,
- iph->saddr,
- skb->dev->ifindex)) != NULL) {
- raw_err(raw_sk, skb, info);
- raw_sk = sk_next(raw_sk);
- iph = (struct iphdr *)skb->data;
- }
- }
- read_unlock(&raw_v4_lock);
-
- rcu_read_lock();
- ipprot = rcu_dereference(inet_protos[hash]);
- if (ipprot && ipprot->err_handler)
- ipprot->err_handler(skb, info);
- rcu_read_unlock();
+ icmp_socket_deliver(skb, info);
out:
- return;
+ return reason;
out_err:
- ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
- goto out;
+ __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
+ return reason ?: SKB_DROP_REASON_NOT_SPECIFIED;
}
@@ -728,43 +1174,24 @@ out_err:
* Handle ICMP_REDIRECT.
*/
-static void icmp_redirect(struct sk_buff *skb)
+static enum skb_drop_reason icmp_redirect(struct sk_buff *skb)
{
- struct iphdr *iph;
-
- if (skb->len < sizeof(struct iphdr))
- goto out_err;
-
- /*
- * Get the copied header of the packet that caused the redirect
- */
- if (!pskb_may_pull(skb, sizeof(struct iphdr)))
- goto out;
+ if (skb->len < sizeof(struct iphdr)) {
+ __ICMP_INC_STATS(dev_net_rcu(skb->dev), ICMP_MIB_INERRORS);
+ return SKB_DROP_REASON_PKT_TOO_SMALL;
+ }
- iph = (struct iphdr *)skb->data;
+ if (!pskb_may_pull(skb, sizeof(struct iphdr))) {
+ /* there aught to be a stat */
+ return SKB_DROP_REASON_NOMEM;
+ }
- switch (skb->h.icmph->code & 7) {
- case ICMP_REDIR_NET:
- case ICMP_REDIR_NETTOS:
- /*
- * As per RFC recommendations now handle it as a host redirect.
- */
- case ICMP_REDIR_HOST:
- case ICMP_REDIR_HOSTTOS:
- ip_rt_redirect(skb->nh.iph->saddr, iph->daddr,
- skb->h.icmph->un.gateway,
- iph->saddr, skb->dev);
- break;
- }
-out:
- return;
-out_err:
- ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
- goto out;
+ icmp_socket_deliver(skb, ntohl(icmp_hdr(skb)->un.gateway));
+ return SKB_NOT_DROPPED_YET;
}
/*
- * Handle ICMP_ECHO ("ping") requests.
+ * Handle ICMP_ECHO ("ping") and ICMP_EXT_ECHO ("PROBE") requests.
*
* RFC 1122: 3.2.2.6 MUST have an echo server that answers ICMP echo
* requests.
@@ -772,23 +1199,150 @@ out_err:
* included in the reply.
* RFC 1812: 4.3.3.6 SHOULD have a config option for silently ignoring
* echo requests, MUST have default=NOT.
+ * RFC 8335: 8 MUST have a config option to enable/disable ICMP
+ * Extended Echo Functionality, MUST be disabled by default
* See also WRT handling of options once they are done and working.
*/
-static void icmp_echo(struct sk_buff *skb)
+static enum skb_drop_reason icmp_echo(struct sk_buff *skb)
{
- if (!sysctl_icmp_echo_ignore_all) {
- struct icmp_bxm icmp_param;
+ struct icmp_bxm icmp_param;
+ struct net *net;
+
+ net = skb_dst_dev_net_rcu(skb);
+ /* should there be an ICMP stat for ignored echos? */
+ if (READ_ONCE(net->ipv4.sysctl_icmp_echo_ignore_all))
+ return SKB_NOT_DROPPED_YET;
- icmp_param.data.icmph = *skb->h.icmph;
+ icmp_param.data.icmph = *icmp_hdr(skb);
+ icmp_param.skb = skb;
+ icmp_param.offset = 0;
+ icmp_param.data_len = skb->len;
+ icmp_param.head_len = sizeof(struct icmphdr);
+
+ if (icmp_param.data.icmph.type == ICMP_ECHO)
icmp_param.data.icmph.type = ICMP_ECHOREPLY;
- icmp_param.skb = skb;
- icmp_param.offset = 0;
- icmp_param.data_len = skb->len;
- icmp_param.head_len = sizeof(struct icmphdr);
- icmp_reply(&icmp_param, skb);
+ else if (!icmp_build_probe(skb, &icmp_param.data.icmph))
+ return SKB_NOT_DROPPED_YET;
+
+ icmp_reply(&icmp_param, skb);
+ return SKB_NOT_DROPPED_YET;
+}
+
+/* Helper for icmp_echo and icmpv6_echo_reply.
+ * Searches for net_device that matches PROBE interface identifier
+ * and builds PROBE reply message in icmphdr.
+ *
+ * Returns false if PROBE responses are disabled via sysctl
+ */
+
+bool icmp_build_probe(struct sk_buff *skb, struct icmphdr *icmphdr)
+{
+ struct net *net = dev_net_rcu(skb->dev);
+ struct icmp_ext_hdr *ext_hdr, _ext_hdr;
+ struct icmp_ext_echo_iio *iio, _iio;
+ struct inet6_dev *in6_dev;
+ struct in_device *in_dev;
+ struct net_device *dev;
+ char buff[IFNAMSIZ];
+ u16 ident_len;
+ u8 status;
+
+ if (!READ_ONCE(net->ipv4.sysctl_icmp_echo_enable_probe))
+ return false;
+
+ /* We currently only support probing interfaces on the proxy node
+ * Check to ensure L-bit is set
+ */
+ if (!(ntohs(icmphdr->un.echo.sequence) & 1))
+ return false;
+ /* Clear status bits in reply message */
+ icmphdr->un.echo.sequence &= htons(0xFF00);
+ if (icmphdr->type == ICMP_EXT_ECHO)
+ icmphdr->type = ICMP_EXT_ECHOREPLY;
+ else
+ icmphdr->type = ICMPV6_EXT_ECHO_REPLY;
+ ext_hdr = skb_header_pointer(skb, 0, sizeof(_ext_hdr), &_ext_hdr);
+ /* Size of iio is class_type dependent.
+ * Only check header here and assign length based on ctype in the switch statement
+ */
+ iio = skb_header_pointer(skb, sizeof(_ext_hdr), sizeof(iio->extobj_hdr), &_iio);
+ if (!ext_hdr || !iio)
+ goto send_mal_query;
+ if (ntohs(iio->extobj_hdr.length) <= sizeof(iio->extobj_hdr) ||
+ ntohs(iio->extobj_hdr.length) > sizeof(_iio))
+ goto send_mal_query;
+ ident_len = ntohs(iio->extobj_hdr.length) - sizeof(iio->extobj_hdr);
+ iio = skb_header_pointer(skb, sizeof(_ext_hdr),
+ sizeof(iio->extobj_hdr) + ident_len, &_iio);
+ if (!iio)
+ goto send_mal_query;
+
+ status = 0;
+ dev = NULL;
+ switch (iio->extobj_hdr.class_type) {
+ case ICMP_EXT_ECHO_CTYPE_NAME:
+ if (ident_len >= IFNAMSIZ)
+ goto send_mal_query;
+ memset(buff, 0, sizeof(buff));
+ memcpy(buff, &iio->ident.name, ident_len);
+ dev = dev_get_by_name(net, buff);
+ break;
+ case ICMP_EXT_ECHO_CTYPE_INDEX:
+ if (ident_len != sizeof(iio->ident.ifindex))
+ goto send_mal_query;
+ dev = dev_get_by_index(net, ntohl(iio->ident.ifindex));
+ break;
+ case ICMP_EXT_ECHO_CTYPE_ADDR:
+ if (ident_len < sizeof(iio->ident.addr.ctype3_hdr) ||
+ ident_len != sizeof(iio->ident.addr.ctype3_hdr) +
+ iio->ident.addr.ctype3_hdr.addrlen)
+ goto send_mal_query;
+ switch (ntohs(iio->ident.addr.ctype3_hdr.afi)) {
+ case ICMP_AFI_IP:
+ if (iio->ident.addr.ctype3_hdr.addrlen != sizeof(struct in_addr))
+ goto send_mal_query;
+ dev = ip_dev_find(net, iio->ident.addr.ip_addr.ipv4_addr);
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case ICMP_AFI_IP6:
+ if (iio->ident.addr.ctype3_hdr.addrlen != sizeof(struct in6_addr))
+ goto send_mal_query;
+ dev = ipv6_stub->ipv6_dev_find(net, &iio->ident.addr.ip_addr.ipv6_addr, dev);
+ dev_hold(dev);
+ break;
+#endif
+ default:
+ goto send_mal_query;
+ }
+ break;
+ default:
+ goto send_mal_query;
}
+ if (!dev) {
+ icmphdr->code = ICMP_EXT_CODE_NO_IF;
+ return true;
+ }
+ /* Fill bits in reply message */
+ if (dev->flags & IFF_UP)
+ status |= ICMP_EXT_ECHOREPLY_ACTIVE;
+
+ in_dev = __in_dev_get_rcu(dev);
+ if (in_dev && rcu_access_pointer(in_dev->ifa_list))
+ status |= ICMP_EXT_ECHOREPLY_IPV4;
+
+ in6_dev = __in6_dev_get(dev);
+ if (in6_dev && !list_empty(&in6_dev->addr_list))
+ status |= ICMP_EXT_ECHOREPLY_IPV6;
+
+ dev_put(dev);
+ icmphdr->un.echo.sequence |= htons(status);
+ return true;
+send_mal_query:
+ icmphdr->code = ICMP_EXT_CODE_MAL_QUERY;
+ return true;
}
+EXPORT_SYMBOL_GPL(icmp_build_probe);
/*
* Handle ICMP Timestamp requests.
@@ -797,9 +1351,8 @@ static void icmp_echo(struct sk_buff *skb)
* MUST be accurate to a few minutes.
* MUST be updated at least at 15Hz.
*/
-static void icmp_timestamp(struct sk_buff *skb)
+static enum skb_drop_reason icmp_timestamp(struct sk_buff *skb)
{
- struct timeval tv;
struct icmp_bxm icmp_param;
/*
* Too short.
@@ -810,13 +1363,12 @@ static void icmp_timestamp(struct sk_buff *skb)
/*
* Fill in the current time as ms since midnight UT:
*/
- do_gettimeofday(&tv);
- icmp_param.data.times[1] = htonl((tv.tv_sec % 86400) * 1000 +
- tv.tv_usec / 1000);
+ icmp_param.data.times[1] = inet_current_timestamp();
icmp_param.data.times[2] = icmp_param.data.times[1];
- if (skb_copy_bits(skb, 0, &icmp_param.data.times[0], 4))
- BUG();
- icmp_param.data.icmph = *skb->h.icmph;
+
+ BUG_ON(skb_copy_bits(skb, 0, &icmp_param.data.times[0], 4));
+
+ icmp_param.data.icmph = *icmp_hdr(skb);
icmp_param.data.icmph.type = ICMP_TIMESTAMPREPLY;
icmp_param.data.icmph.code = 0;
icmp_param.skb = skb;
@@ -824,142 +1376,80 @@ static void icmp_timestamp(struct sk_buff *skb)
icmp_param.data_len = 0;
icmp_param.head_len = sizeof(struct icmphdr) + 12;
icmp_reply(&icmp_param, skb);
-out:
- return;
+ return SKB_NOT_DROPPED_YET;
+
out_err:
- ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
- goto out;
+ __ICMP_INC_STATS(skb_dst_dev_net_rcu(skb), ICMP_MIB_INERRORS);
+ return SKB_DROP_REASON_PKT_TOO_SMALL;
}
-
-/*
- * Handle ICMP_ADDRESS_MASK requests. (RFC950)
- *
- * RFC1122 (3.2.2.9). A host MUST only send replies to
- * ADDRESS_MASK requests if it's been configured as an address mask
- * agent. Receiving a request doesn't constitute implicit permission to
- * act as one. Of course, implementing this correctly requires (SHOULD)
- * a way to turn the functionality on and off. Another one for sysctl(),
- * I guess. -- MS
- *
- * RFC1812 (4.3.3.9). A router MUST implement it.
- * A router SHOULD have switch turning it on/off.
- * This switch MUST be ON by default.
- *
- * Gratuitous replies, zero-source replies are not implemented,
- * that complies with RFC. DO NOT implement them!!! All the idea
- * of broadcast addrmask replies as specified in RFC950 is broken.
- * The problem is that it is not uncommon to have several prefixes
- * on one physical interface. Moreover, addrmask agent can even be
- * not aware of existing another prefixes.
- * If source is zero, addrmask agent cannot choose correct prefix.
- * Gratuitous mask announcements suffer from the same problem.
- * RFC1812 explains it, but still allows to use ADDRMASK,
- * that is pretty silly. --ANK
- *
- * All these rules are so bizarre, that I removed kernel addrmask
- * support at all. It is wrong, it is obsolete, nobody uses it in
- * any case. --ANK
- *
- * Furthermore you can do it with a usermode address agent program
- * anyway...
- */
-
-static void icmp_address(struct sk_buff *skb)
+static enum skb_drop_reason icmp_discard(struct sk_buff *skb)
{
-#if 0
- if (net_ratelimit())
- printk(KERN_DEBUG "a guy asks for address mask. Who is it?\n");
-#endif
+ /* pretend it was a success */
+ return SKB_NOT_DROPPED_YET;
}
/*
- * RFC1812 (4.3.3.9). A router SHOULD listen all replies, and complain
- * loudly if an inconsistency is found.
+ * Deal with incoming ICMP packets.
*/
-
-static void icmp_address_reply(struct sk_buff *skb)
+int icmp_rcv(struct sk_buff *skb)
{
- struct rtable *rt = (struct rtable *)skb->dst;
- struct net_device *dev = skb->dev;
- struct in_device *in_dev;
- struct in_ifaddr *ifa;
+ enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
+ struct rtable *rt = skb_rtable(skb);
+ struct net *net = dev_net_rcu(rt->dst.dev);
+ struct icmphdr *icmph;
- if (skb->len < 4 || !(rt->rt_flags&RTCF_DIRECTSRC))
- goto out;
+ if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+ struct sec_path *sp = skb_sec_path(skb);
+ int nh;
- in_dev = in_dev_get(dev);
- if (!in_dev)
- goto out;
- rcu_read_lock();
- if (in_dev->ifa_list &&
- IN_DEV_LOG_MARTIANS(in_dev) &&
- IN_DEV_FORWARD(in_dev)) {
- __be32 _mask, *mp;
-
- mp = skb_header_pointer(skb, 0, sizeof(_mask), &_mask);
- BUG_ON(mp == NULL);
- for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
- if (*mp == ifa->ifa_mask &&
- inet_ifa_match(rt->rt_src, ifa))
- break;
- }
- if (!ifa && net_ratelimit()) {
- printk(KERN_INFO "Wrong address mask %u.%u.%u.%u from "
- "%s/%u.%u.%u.%u\n",
- NIPQUAD(*mp), dev->name, NIPQUAD(rt->rt_src));
+ if (!(sp && sp->xvec[sp->len - 1]->props.flags &
+ XFRM_STATE_ICMP)) {
+ reason = SKB_DROP_REASON_XFRM_POLICY;
+ goto drop;
}
- }
- rcu_read_unlock();
- in_dev_put(in_dev);
-out:;
-}
-static void icmp_discard(struct sk_buff *skb)
-{
-}
+ if (!pskb_may_pull(skb, sizeof(*icmph) + sizeof(struct iphdr)))
+ goto drop;
-/*
- * Deal with incoming ICMP packets.
- */
-int icmp_rcv(struct sk_buff *skb)
-{
- struct icmphdr *icmph;
- struct rtable *rt = (struct rtable *)skb->dst;
+ nh = skb_network_offset(skb);
+ skb_set_network_header(skb, sizeof(*icmph));
- ICMP_INC_STATS_BH(ICMP_MIB_INMSGS);
+ if (!xfrm4_policy_check_reverse(NULL, XFRM_POLICY_IN,
+ skb)) {
+ reason = SKB_DROP_REASON_XFRM_POLICY;
+ goto drop;
+ }
- switch (skb->ip_summed) {
- case CHECKSUM_COMPLETE:
- if (!csum_fold(skb->csum))
- break;
- /* fall through */
- case CHECKSUM_NONE:
- skb->csum = 0;
- if (__skb_checksum_complete(skb))
- goto error;
+ skb_set_network_header(skb, nh);
}
- if (!pskb_pull(skb, sizeof(struct icmphdr)))
- goto error;
+ __ICMP_INC_STATS(net, ICMP_MIB_INMSGS);
- icmph = skb->h.icmph;
+ if (skb_checksum_simple_validate(skb))
+ goto csum_error;
- /*
- * 18 is the highest 'known' ICMP type. Anything else is a mystery
- *
- * RFC 1122: 3.2.2 Unknown ICMP messages types MUST be silently
- * discarded.
- */
- if (icmph->type > NR_ICMP_TYPES)
+ if (!pskb_pull(skb, sizeof(*icmph)))
goto error;
+ icmph = icmp_hdr(skb);
+
+ ICMPMSGIN_INC_STATS(net, icmph->type);
+
+ /* Check for ICMP Extended Echo (PROBE) messages */
+ if (icmph->type == ICMP_EXT_ECHO) {
+ /* We can't use icmp_pointers[].handler() because it is an array of
+ * size NR_ICMP_TYPES + 1 (19 elements) and PROBE has code 42.
+ */
+ reason = icmp_echo(skb);
+ goto reason_check;
+ }
/*
* Parse the ICMP message
*/
- if (rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
+ if (rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
/*
* RFC 1122: 3.2.2.6 An ICMP_ECHO to broadcast MAY be
* silently ignored (we let user decide with a sysctl).
@@ -968,175 +1458,273 @@ int icmp_rcv(struct sk_buff *skb)
*/
if ((icmph->type == ICMP_ECHO ||
icmph->type == ICMP_TIMESTAMP) &&
- sysctl_icmp_echo_ignore_broadcasts) {
+ READ_ONCE(net->ipv4.sysctl_icmp_echo_ignore_broadcasts)) {
+ reason = SKB_DROP_REASON_INVALID_PROTO;
goto error;
}
if (icmph->type != ICMP_ECHO &&
icmph->type != ICMP_TIMESTAMP &&
icmph->type != ICMP_ADDRESS &&
icmph->type != ICMP_ADDRESSREPLY) {
+ reason = SKB_DROP_REASON_INVALID_PROTO;
goto error;
- }
+ }
+ }
+
+ if (icmph->type == ICMP_EXT_ECHOREPLY ||
+ icmph->type == ICMP_ECHOREPLY) {
+ reason = ping_rcv(skb);
+ return reason ? NET_RX_DROP : NET_RX_SUCCESS;
+ }
+
+ /*
+ * 18 is the highest 'known' ICMP type. Anything else is a mystery
+ *
+ * RFC 1122: 3.2.2 Unknown ICMP messages types MUST be silently
+ * discarded.
+ */
+ if (icmph->type > NR_ICMP_TYPES) {
+ reason = SKB_DROP_REASON_UNHANDLED_PROTO;
+ goto error;
}
- ICMP_INC_STATS_BH(icmp_pointers[icmph->type].input_entry);
- icmp_pointers[icmph->type].handler(skb);
+ reason = icmp_pointers[icmph->type].handler(skb);
+reason_check:
+ if (!reason) {
+ consume_skb(skb);
+ return NET_RX_SUCCESS;
+ }
drop:
- kfree_skb(skb);
- return 0;
+ kfree_skb_reason(skb, reason);
+ return NET_RX_DROP;
+csum_error:
+ reason = SKB_DROP_REASON_ICMP_CSUM;
+ __ICMP_INC_STATS(net, ICMP_MIB_CSUMERRORS);
error:
- ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
+ __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
goto drop;
}
+static bool ip_icmp_error_rfc4884_validate(const struct sk_buff *skb, int off)
+{
+ struct icmp_extobj_hdr *objh, _objh;
+ struct icmp_ext_hdr *exth, _exth;
+ u16 olen;
+
+ exth = skb_header_pointer(skb, off, sizeof(_exth), &_exth);
+ if (!exth)
+ return false;
+ if (exth->version != 2)
+ return true;
+
+ if (exth->checksum &&
+ csum_fold(skb_checksum(skb, off, skb->len - off, 0)))
+ return false;
+
+ off += sizeof(_exth);
+ while (off < skb->len) {
+ objh = skb_header_pointer(skb, off, sizeof(_objh), &_objh);
+ if (!objh)
+ return false;
+
+ olen = ntohs(objh->length);
+ if (olen < sizeof(_objh))
+ return false;
+
+ off += olen;
+ if (off > skb->len)
+ return false;
+ }
+
+ return true;
+}
+
+void ip_icmp_error_rfc4884(const struct sk_buff *skb,
+ struct sock_ee_data_rfc4884 *out,
+ int thlen, int off)
+{
+ int hlen;
+
+ /* original datagram headers: end of icmph to payload (skb->data) */
+ hlen = -skb_transport_offset(skb) - thlen;
+
+ /* per rfc 4884: minimal datagram length of 128 bytes */
+ if (off < 128 || off < hlen)
+ return;
+
+ /* kernel has stripped headers: return payload offset in bytes */
+ off -= hlen;
+ if (off + sizeof(struct icmp_ext_hdr) > skb->len)
+ return;
+
+ out->len = off;
+
+ if (!ip_icmp_error_rfc4884_validate(skb, off))
+ out->flags |= SO_EE_RFC4884_FLAG_INVALID;
+}
+EXPORT_SYMBOL_GPL(ip_icmp_error_rfc4884);
+
+int icmp_err(struct sk_buff *skb, u32 info)
+{
+ struct iphdr *iph = (struct iphdr *)skb->data;
+ int offset = iph->ihl<<2;
+ struct icmphdr *icmph = (struct icmphdr *)(skb->data + offset);
+ struct net *net = dev_net_rcu(skb->dev);
+ int type = icmp_hdr(skb)->type;
+ int code = icmp_hdr(skb)->code;
+
+ /*
+ * Use ping_err to handle all icmp errors except those
+ * triggered by ICMP_ECHOREPLY which sent from kernel.
+ */
+ if (icmph->type != ICMP_ECHOREPLY) {
+ ping_err(skb, offset, info);
+ return 0;
+ }
+
+ if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
+ ipv4_update_pmtu(skb, net, info, 0, IPPROTO_ICMP);
+ else if (type == ICMP_REDIRECT)
+ ipv4_redirect(skb, net, 0, IPPROTO_ICMP);
+
+ return 0;
+}
+
/*
* This table is the definition of how we handle ICMP.
*/
static const struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = {
[ICMP_ECHOREPLY] = {
- .output_entry = ICMP_MIB_OUTECHOREPS,
- .input_entry = ICMP_MIB_INECHOREPS,
- .handler = icmp_discard,
+ .handler = ping_rcv,
},
[1] = {
- .output_entry = ICMP_MIB_DUMMY,
- .input_entry = ICMP_MIB_INERRORS,
.handler = icmp_discard,
.error = 1,
},
[2] = {
- .output_entry = ICMP_MIB_DUMMY,
- .input_entry = ICMP_MIB_INERRORS,
.handler = icmp_discard,
.error = 1,
},
[ICMP_DEST_UNREACH] = {
- .output_entry = ICMP_MIB_OUTDESTUNREACHS,
- .input_entry = ICMP_MIB_INDESTUNREACHS,
.handler = icmp_unreach,
.error = 1,
},
[ICMP_SOURCE_QUENCH] = {
- .output_entry = ICMP_MIB_OUTSRCQUENCHS,
- .input_entry = ICMP_MIB_INSRCQUENCHS,
.handler = icmp_unreach,
.error = 1,
},
[ICMP_REDIRECT] = {
- .output_entry = ICMP_MIB_OUTREDIRECTS,
- .input_entry = ICMP_MIB_INREDIRECTS,
.handler = icmp_redirect,
.error = 1,
},
[6] = {
- .output_entry = ICMP_MIB_DUMMY,
- .input_entry = ICMP_MIB_INERRORS,
.handler = icmp_discard,
.error = 1,
},
[7] = {
- .output_entry = ICMP_MIB_DUMMY,
- .input_entry = ICMP_MIB_INERRORS,
.handler = icmp_discard,
.error = 1,
},
[ICMP_ECHO] = {
- .output_entry = ICMP_MIB_OUTECHOS,
- .input_entry = ICMP_MIB_INECHOS,
.handler = icmp_echo,
},
[9] = {
- .output_entry = ICMP_MIB_DUMMY,
- .input_entry = ICMP_MIB_INERRORS,
.handler = icmp_discard,
.error = 1,
},
[10] = {
- .output_entry = ICMP_MIB_DUMMY,
- .input_entry = ICMP_MIB_INERRORS,
.handler = icmp_discard,
.error = 1,
},
[ICMP_TIME_EXCEEDED] = {
- .output_entry = ICMP_MIB_OUTTIMEEXCDS,
- .input_entry = ICMP_MIB_INTIMEEXCDS,
.handler = icmp_unreach,
.error = 1,
},
[ICMP_PARAMETERPROB] = {
- .output_entry = ICMP_MIB_OUTPARMPROBS,
- .input_entry = ICMP_MIB_INPARMPROBS,
.handler = icmp_unreach,
.error = 1,
},
[ICMP_TIMESTAMP] = {
- .output_entry = ICMP_MIB_OUTTIMESTAMPS,
- .input_entry = ICMP_MIB_INTIMESTAMPS,
.handler = icmp_timestamp,
},
[ICMP_TIMESTAMPREPLY] = {
- .output_entry = ICMP_MIB_OUTTIMESTAMPREPS,
- .input_entry = ICMP_MIB_INTIMESTAMPREPS,
.handler = icmp_discard,
},
[ICMP_INFO_REQUEST] = {
- .output_entry = ICMP_MIB_DUMMY,
- .input_entry = ICMP_MIB_DUMMY,
.handler = icmp_discard,
},
- [ICMP_INFO_REPLY] = {
- .output_entry = ICMP_MIB_DUMMY,
- .input_entry = ICMP_MIB_DUMMY,
+ [ICMP_INFO_REPLY] = {
.handler = icmp_discard,
},
[ICMP_ADDRESS] = {
- .output_entry = ICMP_MIB_OUTADDRMASKS,
- .input_entry = ICMP_MIB_INADDRMASKS,
- .handler = icmp_address,
+ .handler = icmp_discard,
},
[ICMP_ADDRESSREPLY] = {
- .output_entry = ICMP_MIB_OUTADDRMASKREPS,
- .input_entry = ICMP_MIB_INADDRMASKREPS,
- .handler = icmp_address_reply,
+ .handler = icmp_discard,
},
};
-void __init icmp_init(struct net_proto_family *ops)
+static int __net_init icmp_sk_init(struct net *net)
{
- struct inet_sock *inet;
- int i;
+ /* Control parameters for ECHO replies. */
+ net->ipv4.sysctl_icmp_echo_ignore_all = 0;
+ net->ipv4.sysctl_icmp_echo_enable_probe = 0;
+ net->ipv4.sysctl_icmp_echo_ignore_broadcasts = 1;
- for_each_possible_cpu(i) {
- int err;
+ /* Control parameter - ignore bogus broadcast responses? */
+ net->ipv4.sysctl_icmp_ignore_bogus_error_responses = 1;
+
+ /*
+ * Configurable global rate limit.
+ *
+ * ratelimit defines tokens/packet consumed for dst->rate_token
+ * bucket ratemask defines which icmp types are ratelimited by
+ * setting it's bit position.
+ *
+ * default:
+ * dest unreachable (3), source quench (4),
+ * time exceeded (11), parameter problem (12)
+ */
+
+ net->ipv4.sysctl_icmp_ratelimit = 1 * HZ;
+ net->ipv4.sysctl_icmp_ratemask = 0x1818;
+ net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr = 0;
+ net->ipv4.sysctl_icmp_errors_extension_mask = 0;
+ net->ipv4.sysctl_icmp_msgs_per_sec = 1000;
+ net->ipv4.sysctl_icmp_msgs_burst = 50;
- err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_ICMP,
- &per_cpu(__icmp_socket, i));
+ return 0;
+}
+
+static struct pernet_operations __net_initdata icmp_sk_ops = {
+ .init = icmp_sk_init,
+};
+
+int __init icmp_init(void)
+{
+ int err, i;
+ for_each_possible_cpu(i) {
+ struct sock *sk;
+
+ err = inet_ctl_sock_create(&sk, PF_INET,
+ SOCK_RAW, IPPROTO_ICMP, &init_net);
if (err < 0)
- panic("Failed to create the ICMP control socket.\n");
+ return err;
- per_cpu(__icmp_socket, i)->sk->sk_allocation = GFP_ATOMIC;
+ per_cpu(ipv4_icmp_sk, i) = sk;
/* Enough space for 2 64K ICMP packets, including
- * sk_buff struct overhead.
+ * sk_buff/skb_shared_info struct overhead.
*/
- per_cpu(__icmp_socket, i)->sk->sk_sndbuf =
- (2 * ((64 * 1024) + sizeof(struct sk_buff)));
-
- inet = inet_sk(per_cpu(__icmp_socket, i)->sk);
- inet->uc_ttl = -1;
- inet->pmtudisc = IP_PMTUDISC_DONT;
+ sk->sk_sndbuf = 2 * SKB_TRUESIZE(64 * 1024);
- /* Unhash it so that IP input processing does not even
- * see it, we do not wish this socket to see incoming
- * packets.
+ /*
+ * Speedup sock_wfree()
*/
- per_cpu(__icmp_socket, i)->sk->sk_prot->unhash(per_cpu(__icmp_socket, i)->sk);
+ sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
+ inet_sk(sk)->pmtudisc = IP_PMTUDISC_DONT;
}
+ return register_pernet_subsys(&icmp_sk_ops);
}
-
-EXPORT_SYMBOL(icmp_err_convert);
-EXPORT_SYMBOL(icmp_send);
-EXPORT_SYMBOL(icmp_statistics);
-EXPORT_SYMBOL(xrlim_allow);
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 0017ccb01d6d..7182f1419c2a 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Linux NET3: Internet Group Management Protocol [IGMP]
*
@@ -8,15 +9,8 @@
* the older version didn't come out right using gcc 2.5.8, the newer one
* seems to fall out with gcc 2.6.2.
*
- * Version: $Id: igmp.c,v 1.47 2002/02/01 22:01:03 davem Exp $
- *
* Authors:
- * Alan Cox <Alan.Cox@linux.org>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
+ * Alan Cox <alan@lxorguk.ukuu.org.uk>
*
* Fixes:
*
@@ -35,7 +29,7 @@
*
* Chih-Jen Chang : Tried to revise IGMP to Version 2
* Tsu-Sheng Tsao E-mail: chihjenc@scf.usc.edu and tsusheng@scf.usc.edu
- * The enhancements are mainly based on Steve Deering's
+ * The enhancements are mainly based on Steve Deering's
* ipmulti-3.5 source code.
* Chih-Jen Chang : Added the igmp_get_mrouter_info and
* Tsu-Sheng Tsao igmp_set_mrouter_info to keep track of
@@ -49,11 +43,11 @@
* Alan Cox : Stop IGMP from 0.0.0.0 being accepted.
* Alan Cox : Use GFP_ATOMIC in the right places.
* Christian Daudt : igmp timer wasn't set for local group
- * memberships but was being deleted,
- * which caused a "del_timer() called
+ * memberships but was being deleted,
+ * which caused a "del_timer() called
* from %p with timer not initialized\n"
* message (960131).
- * Christian Daudt : removed del_timer from
+ * Christian Daudt : removed del_timer from
* igmp_timer_expire function (960205).
* Christian Daudt : igmp_heard_report now only calls
* igmp_timer_expire if tm->running is
@@ -73,8 +67,8 @@
*/
#include <linux/module.h>
-#include <asm/uaccess.h>
-#include <asm/system.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
@@ -87,16 +81,23 @@
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
+#include "igmp_internal.h"
#include <linux/if_arp.h>
#include <linux/rtnetlink.h>
#include <linux/times.h>
+#include <linux/pkt_sched.h>
+#include <linux/byteorder/generic.h>
+#include <net/net_namespace.h>
+#include <net/netlink.h>
+#include <net/addrconf.h>
#include <net/arp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/sock.h>
#include <net/checksum.h>
+#include <net/inet_common.h>
#include <linux/netfilter_ipv4.h>
#ifdef CONFIG_IP_MROUTE
#include <linux/mroute.h>
@@ -106,39 +107,58 @@
#include <linux/seq_file.h>
#endif
-#define IP_MAX_MEMBERSHIPS 20
-#define IP_MAX_MSF 10
-
#ifdef CONFIG_IP_MULTICAST
/* Parameter names and values are taken from igmp-v2-06 draft */
-#define IGMP_V1_Router_Present_Timeout (400*HZ)
-#define IGMP_V2_Router_Present_Timeout (400*HZ)
-#define IGMP_Unsolicited_Report_Interval (10*HZ)
-#define IGMP_Query_Response_Interval (10*HZ)
-#define IGMP_Unsolicited_Report_Count 2
-
+#define IGMP_QUERY_INTERVAL (125*HZ)
+#define IGMP_QUERY_RESPONSE_INTERVAL (10*HZ)
-#define IGMP_Initial_Report_Delay (1)
+#define IGMP_INITIAL_REPORT_DELAY (1)
-/* IGMP_Initial_Report_Delay is not from IGMP specs!
+/* IGMP_INITIAL_REPORT_DELAY is not from IGMP specs!
* IGMP specs require to report membership immediately after
* joining a group, but we delay the first report by a
* small interval. It seems more natural and still does not
* contradict to specs provided this delay is small enough.
*/
-#define IGMP_V1_SEEN(in_dev) (ipv4_devconf.force_igmp_version == 1 || \
- (in_dev)->cnf.force_igmp_version == 1 || \
- ((in_dev)->mr_v1_seen && \
- time_before(jiffies, (in_dev)->mr_v1_seen)))
-#define IGMP_V2_SEEN(in_dev) (ipv4_devconf.force_igmp_version == 2 || \
- (in_dev)->cnf.force_igmp_version == 2 || \
- ((in_dev)->mr_v2_seen && \
- time_before(jiffies, (in_dev)->mr_v2_seen)))
-
-static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im);
-static void igmpv3_del_delrec(struct in_device *in_dev, __be32 multiaddr);
+#define IGMP_V1_SEEN(in_dev) \
+ (IPV4_DEVCONF_ALL_RO(dev_net(in_dev->dev), FORCE_IGMP_VERSION) == 1 || \
+ IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 1 || \
+ ((in_dev)->mr_v1_seen && \
+ time_before(jiffies, (in_dev)->mr_v1_seen)))
+#define IGMP_V2_SEEN(in_dev) \
+ (IPV4_DEVCONF_ALL_RO(dev_net(in_dev->dev), FORCE_IGMP_VERSION) == 2 || \
+ IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 2 || \
+ ((in_dev)->mr_v2_seen && \
+ time_before(jiffies, (in_dev)->mr_v2_seen)))
+
+static int unsolicited_report_interval(struct in_device *in_dev)
+{
+ int interval_ms, interval_jiffies;
+
+ if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev))
+ interval_ms = IN_DEV_CONF_GET(
+ in_dev,
+ IGMPV2_UNSOLICITED_REPORT_INTERVAL);
+ else /* v3 */
+ interval_ms = IN_DEV_CONF_GET(
+ in_dev,
+ IGMPV3_UNSOLICITED_REPORT_INTERVAL);
+
+ interval_jiffies = msecs_to_jiffies(interval_ms);
+
+ /* _timer functions can't handle a delay of 0 jiffies so ensure
+ * we always return a positive value.
+ */
+ if (interval_jiffies <= 0)
+ interval_jiffies = 1;
+ return interval_jiffies;
+}
+
+static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im,
+ gfp_t gfp);
+static void igmpv3_del_delrec(struct in_device *in_dev, struct ip_mc_list *im);
static void igmpv3_clear_delrec(struct in_device *in_dev);
static int sf_setstate(struct ip_mc_list *pmc);
static void sf_markstate(struct ip_mc_list *pmc);
@@ -149,9 +169,30 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
static void ip_ma_put(struct ip_mc_list *im)
{
- if (atomic_dec_and_test(&im->refcnt)) {
+ if (refcount_dec_and_test(&im->refcnt)) {
in_dev_put(im->interface);
- kfree(im);
+ kfree_rcu(im, rcu);
+ }
+}
+
+#define for_each_pmc_rcu(in_dev, pmc) \
+ for (pmc = rcu_dereference(in_dev->mc_list); \
+ pmc != NULL; \
+ pmc = rcu_dereference(pmc->next_rcu))
+
+#define for_each_pmc_rtnl(in_dev, pmc) \
+ for (pmc = rtnl_dereference(in_dev->mc_list); \
+ pmc != NULL; \
+ pmc = rtnl_dereference(pmc->next_rcu))
+
+static void ip_sf_list_clear_all(struct ip_sf_list *psf)
+{
+ struct ip_sf_list *next;
+
+ while (psf) {
+ next = psf->sf_next;
+ kfree(psf);
+ psf = next;
}
}
@@ -161,12 +202,12 @@ static void ip_ma_put(struct ip_mc_list *im)
* Timer management
*/
-static __inline__ void igmp_stop_timer(struct ip_mc_list *im)
+static void igmp_stop_timer(struct ip_mc_list *im)
{
spin_lock_bh(&im->lock);
- if (del_timer(&im->timer))
- atomic_dec(&im->refcnt);
- im->tm_running=0;
+ if (timer_delete(&im->timer))
+ refcount_dec(&im->refcnt);
+ im->tm_running = 0;
im->reporter = 0;
im->unsolicit_count = 0;
spin_unlock_bh(&im->lock);
@@ -175,25 +216,32 @@ static __inline__ void igmp_stop_timer(struct ip_mc_list *im)
/* It must be called with locked im->lock */
static void igmp_start_timer(struct ip_mc_list *im, int max_delay)
{
- int tv=net_random() % max_delay;
+ int tv = get_random_u32_below(max_delay);
- im->tm_running=1;
- if (!mod_timer(&im->timer, jiffies+tv+2))
- atomic_inc(&im->refcnt);
+ im->tm_running = 1;
+ if (refcount_inc_not_zero(&im->refcnt)) {
+ if (mod_timer(&im->timer, jiffies + tv + 2))
+ ip_ma_put(im);
+ }
}
static void igmp_gq_start_timer(struct in_device *in_dev)
{
- int tv = net_random() % in_dev->mr_maxdelay;
+ int tv = get_random_u32_below(in_dev->mr_maxdelay);
+ unsigned long exp = jiffies + tv + 2;
+
+ if (in_dev->mr_gq_running &&
+ time_after_eq(exp, (in_dev->mr_gq_timer).expires))
+ return;
in_dev->mr_gq_running = 1;
- if (!mod_timer(&in_dev->mr_gq_timer, jiffies+tv+2))
+ if (!mod_timer(&in_dev->mr_gq_timer, exp))
in_dev_hold(in_dev);
}
static void igmp_ifc_start_timer(struct in_device *in_dev, int delay)
{
- int tv = net_random() % delay;
+ int tv = get_random_u32_below(delay);
if (!mod_timer(&in_dev->mr_ifc_timer, jiffies+tv+2))
in_dev_hold(in_dev);
@@ -203,14 +251,14 @@ static void igmp_mod_timer(struct ip_mc_list *im, int max_delay)
{
spin_lock_bh(&im->lock);
im->unsolicit_count = 0;
- if (del_timer(&im->timer)) {
+ if (timer_delete(&im->timer)) {
if ((long)(im->timer.expires-jiffies) < max_delay) {
add_timer(&im->timer);
- im->tm_running=1;
+ im->tm_running = 1;
spin_unlock_bh(&im->lock);
return;
}
- atomic_dec(&im->refcnt);
+ refcount_dec(&im->refcnt);
}
igmp_start_timer(im, max_delay);
spin_unlock_bh(&im->lock);
@@ -274,7 +322,7 @@ igmp_scount(struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted)
struct ip_sf_list *psf;
int scount = 0;
- for (psf=pmc->sources; psf; psf=psf->sf_next) {
+ for (psf = pmc->sources; psf; psf = psf->sf_next) {
if (!is_in(pmc, psf, type, gdeleted, sdeleted))
continue;
scount++;
@@ -282,57 +330,88 @@ igmp_scount(struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted)
return scount;
}
-static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
+/* source address selection per RFC 3376 section 4.2.13 */
+static __be32 igmpv3_get_srcaddr(struct net_device *dev,
+ const struct flowi4 *fl4)
+{
+ struct in_device *in_dev = __in_dev_get_rcu(dev);
+ const struct in_ifaddr *ifa;
+
+ if (!in_dev)
+ return htonl(INADDR_ANY);
+
+ in_dev_for_each_ifa_rcu(ifa, in_dev) {
+ if (fl4->saddr == ifa->ifa_local)
+ return fl4->saddr;
+ }
+
+ return htonl(INADDR_ANY);
+}
+
+static struct sk_buff *igmpv3_newpack(struct net_device *dev, unsigned int mtu)
{
struct sk_buff *skb;
struct rtable *rt;
struct iphdr *pip;
struct igmpv3_report *pig;
-
- skb = alloc_skb(size + LL_RESERVED_SPACE(dev), GFP_ATOMIC);
- if (skb == NULL)
- return NULL;
-
- {
- struct flowi fl = { .oif = dev->ifindex,
- .nl_u = { .ip4_u = {
- .daddr = IGMPV3_ALL_MCR } },
- .proto = IPPROTO_IGMP };
- if (ip_route_output_key(&rt, &fl)) {
- kfree_skb(skb);
+ struct net *net = dev_net(dev);
+ struct flowi4 fl4;
+ int hlen = LL_RESERVED_SPACE(dev);
+ int tlen = dev->needed_tailroom;
+ unsigned int size;
+
+ size = min(mtu, IP_MAX_MTU);
+ while (1) {
+ skb = alloc_skb(size + hlen + tlen,
+ GFP_ATOMIC | __GFP_NOWARN);
+ if (skb)
+ break;
+ size >>= 1;
+ if (size < 256)
return NULL;
- }
}
- if (rt->rt_src == 0) {
+ skb->priority = TC_PRIO_CONTROL;
+
+ rt = ip_route_output_ports(net, &fl4, NULL, IGMPV3_ALL_MCR, 0,
+ 0, 0,
+ IPPROTO_IGMP, 0, dev->ifindex);
+ if (IS_ERR(rt)) {
kfree_skb(skb);
- ip_rt_put(rt);
return NULL;
}
- skb->dst = &rt->u.dst;
+ skb_dst_set(skb, &rt->dst);
skb->dev = dev;
- skb_reserve(skb, LL_RESERVED_SPACE(dev));
+ skb_reserve(skb, hlen);
+ skb_tailroom_reserve(skb, mtu, tlen);
- skb->nh.iph = pip =(struct iphdr *)skb_put(skb, sizeof(struct iphdr)+4);
+ skb_reset_network_header(skb);
+ pip = ip_hdr(skb);
+ skb_put(skb, sizeof(struct iphdr) + 4);
pip->version = 4;
pip->ihl = (sizeof(struct iphdr)+4)>>2;
pip->tos = 0xc0;
pip->frag_off = htons(IP_DF);
pip->ttl = 1;
- pip->daddr = rt->rt_dst;
- pip->saddr = rt->rt_src;
+ pip->daddr = fl4.daddr;
+
+ rcu_read_lock();
+ pip->saddr = igmpv3_get_srcaddr(dev, &fl4);
+ rcu_read_unlock();
+
pip->protocol = IPPROTO_IGMP;
pip->tot_len = 0; /* filled in later */
- ip_select_ident(pip, &rt->u.dst, NULL);
- ((u8*)&pip[1])[0] = IPOPT_RA;
- ((u8*)&pip[1])[1] = 4;
- ((u8*)&pip[1])[2] = 0;
- ((u8*)&pip[1])[3] = 0;
-
- pig =(struct igmpv3_report *)skb_put(skb, sizeof(*pig));
- skb->h.igmph = (struct igmphdr *)pig;
+ ip_select_ident(net, skb, NULL);
+ ((u8 *)&pip[1])[0] = IPOPT_RA;
+ ((u8 *)&pip[1])[1] = 4;
+ ((u8 *)&pip[1])[2] = 0;
+ ((u8 *)&pip[1])[3] = 0;
+
+ skb->transport_header = skb->network_header + sizeof(struct iphdr) + 4;
+ skb_put(skb, sizeof(*pig));
+ pig = igmpv3_report_hdr(skb);
pig->type = IGMPV3_HOST_MEMBERSHIP_REPORT;
pig->resv1 = 0;
pig->csum = 0;
@@ -343,62 +422,64 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
static int igmpv3_sendpack(struct sk_buff *skb)
{
- struct iphdr *pip = skb->nh.iph;
- struct igmphdr *pig = skb->h.igmph;
- int iplen, igmplen;
-
- iplen = skb->tail - (unsigned char *)skb->nh.iph;
- pip->tot_len = htons(iplen);
- ip_send_check(pip);
+ struct igmphdr *pig = igmp_hdr(skb);
+ const int igmplen = skb_tail_pointer(skb) - skb_transport_header(skb);
- igmplen = skb->tail - (unsigned char *)skb->h.igmph;
- pig->csum = ip_compute_csum((void *)skb->h.igmph, igmplen);
+ pig->csum = ip_compute_csum(igmp_hdr(skb), igmplen);
- return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, skb->dev,
- dst_output);
+ return ip_local_out(skb_dst_dev_net(skb), skb->sk, skb);
}
static int grec_size(struct ip_mc_list *pmc, int type, int gdel, int sdel)
{
- return sizeof(struct igmpv3_grec) + 4*igmp_scount(pmc,type,gdel,sdel);
+ return sizeof(struct igmpv3_grec) + 4*igmp_scount(pmc, type, gdel, sdel);
}
static struct sk_buff *add_grhead(struct sk_buff *skb, struct ip_mc_list *pmc,
- int type, struct igmpv3_grec **ppgr)
+ int type, struct igmpv3_grec **ppgr, unsigned int mtu)
{
struct net_device *dev = pmc->interface->dev;
struct igmpv3_report *pih;
struct igmpv3_grec *pgr;
- if (!skb)
- skb = igmpv3_newpack(dev, dev->mtu);
- if (!skb)
- return NULL;
- pgr = (struct igmpv3_grec *)skb_put(skb, sizeof(struct igmpv3_grec));
+ if (!skb) {
+ skb = igmpv3_newpack(dev, mtu);
+ if (!skb)
+ return NULL;
+ }
+ pgr = skb_put(skb, sizeof(struct igmpv3_grec));
pgr->grec_type = type;
pgr->grec_auxwords = 0;
pgr->grec_nsrcs = 0;
pgr->grec_mca = pmc->multiaddr;
- pih = (struct igmpv3_report *)skb->h.igmph;
+ pih = igmpv3_report_hdr(skb);
pih->ngrec = htons(ntohs(pih->ngrec)+1);
*ppgr = pgr;
return skb;
}
-#define AVAILABLE(skb) ((skb) ? ((skb)->dev ? (skb)->dev->mtu - (skb)->len : \
- skb_tailroom(skb)) : 0)
+#define AVAILABLE(skb) ((skb) ? skb_availroom(skb) : 0)
static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
int type, int gdeleted, int sdeleted)
{
struct net_device *dev = pmc->interface->dev;
+ struct net *net = dev_net(dev);
struct igmpv3_report *pih;
struct igmpv3_grec *pgr = NULL;
struct ip_sf_list *psf, *psf_next, *psf_prev, **psf_list;
int scount, stotal, first, isquery, truncate;
+ unsigned int mtu;
if (pmc->multiaddr == IGMP_ALL_HOSTS)
return skb;
+ if (ipv4_is_local_multicast(pmc->multiaddr) &&
+ !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports))
+ return skb;
+
+ mtu = READ_ONCE(dev->mtu);
+ if (mtu < IPV4_MIN_MTU)
+ return skb;
isquery = type == IGMPV3_MODE_IS_INCLUDE ||
type == IGMPV3_MODE_IS_EXCLUDE;
@@ -412,7 +493,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
if (!*psf_list)
goto empty_source;
- pih = skb ? (struct igmpv3_report *)skb->h.igmph : NULL;
+ pih = skb ? igmpv3_report_hdr(skb) : NULL;
/* EX and TO_EX get a fresh packet, if needed */
if (truncate) {
@@ -420,12 +501,12 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
AVAILABLE(skb) < grec_size(pmc, type, gdeleted, sdeleted)) {
if (skb)
igmpv3_sendpack(skb);
- skb = igmpv3_newpack(dev, dev->mtu);
+ skb = igmpv3_newpack(dev, mtu);
}
}
first = 1;
psf_prev = NULL;
- for (psf=*psf_list; psf; psf=psf_next) {
+ for (psf = *psf_list; psf; psf = psf_next) {
__be32 *psrc;
psf_next = psf->sf_next;
@@ -435,6 +516,15 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
continue;
}
+ /* Based on RFC3376 5.1. Should not send source-list change
+ * records when there is a filter mode change.
+ */
+ if (((gdeleted && pmc->sfmode == MCAST_EXCLUDE) ||
+ (!gdeleted && pmc->crcount)) &&
+ (type == IGMPV3_ALLOW_NEW_SOURCES ||
+ type == IGMPV3_BLOCK_OLD_SOURCES) && psf->sf_crcount)
+ goto decrease_sf_crcount;
+
/* clear marks on query responses */
if (isquery)
psf->sf_gsresp = 0;
@@ -447,19 +537,22 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
pgr->grec_nsrcs = htons(scount);
if (skb)
igmpv3_sendpack(skb);
- skb = igmpv3_newpack(dev, dev->mtu);
+ skb = igmpv3_newpack(dev, mtu);
first = 1;
scount = 0;
}
if (first) {
- skb = add_grhead(skb, pmc, type, &pgr);
+ skb = add_grhead(skb, pmc, type, &pgr, mtu);
first = 0;
}
- psrc = (__be32 *)skb_put(skb, sizeof(__be32));
+ if (!skb)
+ return NULL;
+ psrc = skb_put(skb, sizeof(__be32));
*psrc = psf->sf_inaddr;
scount++; stotal++;
if ((type == IGMPV3_ALLOW_NEW_SOURCES ||
type == IGMPV3_BLOCK_OLD_SOURCES) && psf->sf_crcount) {
+decrease_sf_crcount:
psf->sf_crcount--;
if ((sdeleted || gdeleted) && psf->sf_crcount == 0) {
if (psf_prev)
@@ -480,11 +573,11 @@ empty_source:
return skb;
if (pmc->crcount || isquery) {
/* make sure we have room for group header */
- if (skb && AVAILABLE(skb)<sizeof(struct igmpv3_grec)) {
+ if (skb && AVAILABLE(skb) < sizeof(struct igmpv3_grec)) {
igmpv3_sendpack(skb);
skb = NULL; /* add_grhead will get a new one */
}
- skb = add_grhead(skb, pmc, type, &pgr);
+ skb = add_grhead(skb, pmc, type, &pgr, mtu);
}
}
if (pgr)
@@ -498,13 +591,17 @@ empty_source:
static int igmpv3_send_report(struct in_device *in_dev, struct ip_mc_list *pmc)
{
struct sk_buff *skb = NULL;
+ struct net *net = dev_net(in_dev->dev);
int type;
if (!pmc) {
- read_lock(&in_dev->mc_list_lock);
- for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) {
+ rcu_read_lock();
+ for_each_pmc_rcu(in_dev, pmc) {
if (pmc->multiaddr == IGMP_ALL_HOSTS)
continue;
+ if (ipv4_is_local_multicast(pmc->multiaddr) &&
+ !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports))
+ continue;
spin_lock_bh(&pmc->lock);
if (pmc->sfcount[MCAST_EXCLUDE])
type = IGMPV3_MODE_IS_EXCLUDE;
@@ -513,7 +610,7 @@ static int igmpv3_send_report(struct in_device *in_dev, struct ip_mc_list *pmc)
skb = add_grec(skb, pmc, type, 0, 0);
spin_unlock_bh(&pmc->lock);
}
- read_unlock(&in_dev->mc_list_lock);
+ rcu_read_unlock();
} else {
spin_lock_bh(&pmc->lock);
if (pmc->sfcount[MCAST_EXCLUDE])
@@ -536,7 +633,7 @@ static void igmpv3_clear_zeros(struct ip_sf_list **ppsf)
struct ip_sf_list *psf_prev, *psf_next, *psf;
psf_prev = NULL;
- for (psf=*ppsf; psf; psf = psf_next) {
+ for (psf = *ppsf; psf; psf = psf_next) {
psf_next = psf->sf_next;
if (psf->sf_crcount == 0) {
if (psf_prev)
@@ -549,18 +646,25 @@ static void igmpv3_clear_zeros(struct ip_sf_list **ppsf)
}
}
+static void kfree_pmc(struct ip_mc_list *pmc)
+{
+ ip_sf_list_clear_all(pmc->sources);
+ ip_sf_list_clear_all(pmc->tomb);
+ kfree(pmc);
+}
+
static void igmpv3_send_cr(struct in_device *in_dev)
{
struct ip_mc_list *pmc, *pmc_prev, *pmc_next;
struct sk_buff *skb = NULL;
int type, dtype;
- read_lock(&in_dev->mc_list_lock);
+ rcu_read_lock();
spin_lock_bh(&in_dev->mc_tomb_lock);
/* deleted MCA's */
pmc_prev = NULL;
- for (pmc=in_dev->mc_tomb; pmc; pmc=pmc_next) {
+ for (pmc = in_dev->mc_tomb; pmc; pmc = pmc_next) {
pmc_next = pmc->next;
if (pmc->sfmode == MCAST_INCLUDE) {
type = IGMPV3_BLOCK_OLD_SOURCES;
@@ -585,14 +689,14 @@ static void igmpv3_send_cr(struct in_device *in_dev)
else
in_dev->mc_tomb = pmc_next;
in_dev_put(pmc->interface);
- kfree(pmc);
+ kfree_pmc(pmc);
} else
pmc_prev = pmc;
}
spin_unlock_bh(&in_dev->mc_tomb_lock);
/* change recs */
- for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) {
+ for_each_pmc_rcu(in_dev, pmc) {
spin_lock_bh(&pmc->lock);
if (pmc->sfcount[MCAST_EXCLUDE]) {
type = IGMPV3_BLOCK_OLD_SOURCES;
@@ -615,7 +719,7 @@ static void igmpv3_send_cr(struct in_device *in_dev)
}
spin_unlock_bh(&pmc->lock);
}
- read_unlock(&in_dev->mc_list_lock);
+ rcu_read_unlock();
if (!skb)
return;
@@ -630,39 +734,46 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
struct igmphdr *ih;
struct rtable *rt;
struct net_device *dev = in_dev->dev;
+ struct net *net = dev_net(dev);
__be32 group = pmc ? pmc->multiaddr : 0;
+ struct flowi4 fl4;
__be32 dst;
+ int hlen, tlen;
if (type == IGMPV3_HOST_MEMBERSHIP_REPORT)
return igmpv3_send_report(in_dev, pmc);
- else if (type == IGMP_HOST_LEAVE_MESSAGE)
+
+ if (ipv4_is_local_multicast(group) &&
+ !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports))
+ return 0;
+
+ if (type == IGMP_HOST_LEAVE_MESSAGE)
dst = IGMP_ALL_ROUTER;
else
dst = group;
- {
- struct flowi fl = { .oif = dev->ifindex,
- .nl_u = { .ip4_u = { .daddr = dst } },
- .proto = IPPROTO_IGMP };
- if (ip_route_output_key(&rt, &fl))
- return -1;
- }
- if (rt->rt_src == 0) {
- ip_rt_put(rt);
+ rt = ip_route_output_ports(net, &fl4, NULL, dst, 0,
+ 0, 0,
+ IPPROTO_IGMP, 0, dev->ifindex);
+ if (IS_ERR(rt))
return -1;
- }
- skb=alloc_skb(IGMP_SIZE+LL_RESERVED_SPACE(dev), GFP_ATOMIC);
- if (skb == NULL) {
+ hlen = LL_RESERVED_SPACE(dev);
+ tlen = dev->needed_tailroom;
+ skb = alloc_skb(IGMP_SIZE + hlen + tlen, GFP_ATOMIC);
+ if (!skb) {
ip_rt_put(rt);
return -1;
}
+ skb->priority = TC_PRIO_CONTROL;
- skb->dst = &rt->u.dst;
+ skb_dst_set(skb, &rt->dst);
- skb_reserve(skb, LL_RESERVED_SPACE(dev));
+ skb_reserve(skb, hlen);
- skb->nh.iph = iph = (struct iphdr *)skb_put(skb, sizeof(struct iphdr)+4);
+ skb_reset_network_header(skb);
+ iph = ip_hdr(skb);
+ skb_put(skb, sizeof(struct iphdr) + 4);
iph->version = 4;
iph->ihl = (sizeof(struct iphdr)+4)>>2;
@@ -670,70 +781,74 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
iph->frag_off = htons(IP_DF);
iph->ttl = 1;
iph->daddr = dst;
- iph->saddr = rt->rt_src;
+ iph->saddr = fl4.saddr;
iph->protocol = IPPROTO_IGMP;
- iph->tot_len = htons(IGMP_SIZE);
- ip_select_ident(iph, &rt->u.dst, NULL);
- ((u8*)&iph[1])[0] = IPOPT_RA;
- ((u8*)&iph[1])[1] = 4;
- ((u8*)&iph[1])[2] = 0;
- ((u8*)&iph[1])[3] = 0;
- ip_send_check(iph);
+ ip_select_ident(net, skb, NULL);
+ ((u8 *)&iph[1])[0] = IPOPT_RA;
+ ((u8 *)&iph[1])[1] = 4;
+ ((u8 *)&iph[1])[2] = 0;
+ ((u8 *)&iph[1])[3] = 0;
- ih = (struct igmphdr *)skb_put(skb, sizeof(struct igmphdr));
- ih->type=type;
- ih->code=0;
- ih->csum=0;
- ih->group=group;
- ih->csum=ip_compute_csum((void *)ih, sizeof(struct igmphdr));
+ ih = skb_put(skb, sizeof(struct igmphdr));
+ ih->type = type;
+ ih->code = 0;
+ ih->csum = 0;
+ ih->group = group;
+ ih->csum = ip_compute_csum((void *)ih, sizeof(struct igmphdr));
- return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
- dst_output);
+ return ip_local_out(net, skb->sk, skb);
}
-static void igmp_gq_timer_expire(unsigned long data)
+static void igmp_gq_timer_expire(struct timer_list *t)
{
- struct in_device *in_dev = (struct in_device *)data;
+ struct in_device *in_dev = timer_container_of(in_dev, t, mr_gq_timer);
in_dev->mr_gq_running = 0;
igmpv3_send_report(in_dev, NULL);
- __in_dev_put(in_dev);
+ in_dev_put(in_dev);
}
-static void igmp_ifc_timer_expire(unsigned long data)
+static void igmp_ifc_timer_expire(struct timer_list *t)
{
- struct in_device *in_dev = (struct in_device *)data;
+ struct in_device *in_dev = timer_container_of(in_dev, t, mr_ifc_timer);
+ u32 mr_ifc_count;
igmpv3_send_cr(in_dev);
- if (in_dev->mr_ifc_count) {
- in_dev->mr_ifc_count--;
- igmp_ifc_start_timer(in_dev, IGMP_Unsolicited_Report_Interval);
+restart:
+ mr_ifc_count = READ_ONCE(in_dev->mr_ifc_count);
+
+ if (mr_ifc_count) {
+ if (cmpxchg(&in_dev->mr_ifc_count,
+ mr_ifc_count,
+ mr_ifc_count - 1) != mr_ifc_count)
+ goto restart;
+ igmp_ifc_start_timer(in_dev,
+ unsolicited_report_interval(in_dev));
}
- __in_dev_put(in_dev);
+ in_dev_put(in_dev);
}
static void igmp_ifc_event(struct in_device *in_dev)
{
+ struct net *net = dev_net(in_dev->dev);
if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev))
return;
- in_dev->mr_ifc_count = in_dev->mr_qrv ? in_dev->mr_qrv :
- IGMP_Unsolicited_Report_Count;
+ WRITE_ONCE(in_dev->mr_ifc_count, in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv));
igmp_ifc_start_timer(in_dev, 1);
}
-static void igmp_timer_expire(unsigned long data)
+static void igmp_timer_expire(struct timer_list *t)
{
- struct ip_mc_list *im=(struct ip_mc_list *)data;
+ struct ip_mc_list *im = timer_container_of(im, t, timer);
struct in_device *in_dev = im->interface;
spin_lock(&im->lock);
- im->tm_running=0;
+ im->tm_running = 0;
+
+ if (im->unsolicit_count && --im->unsolicit_count)
+ igmp_start_timer(im, unsolicited_report_interval(in_dev));
- if (im->unsolicit_count) {
- im->unsolicit_count--;
- igmp_start_timer(im, IGMP_Unsolicited_Report_Interval);
- }
im->reporter = 1;
spin_unlock(&im->lock);
@@ -754,15 +869,15 @@ static int igmp_xmarksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs)
int i, scount;
scount = 0;
- for (psf=pmc->sources; psf; psf=psf->sf_next) {
+ for (psf = pmc->sources; psf; psf = psf->sf_next) {
if (scount == nsrcs)
break;
- for (i=0; i<nsrcs; i++) {
+ for (i = 0; i < nsrcs; i++) {
/* skip inactive filters */
- if (pmc->sfcount[MCAST_INCLUDE] ||
+ if (psf->sf_count[MCAST_INCLUDE] ||
pmc->sfcount[MCAST_EXCLUDE] !=
psf->sf_count[MCAST_EXCLUDE])
- continue;
+ break;
if (srcs[i] == psf->sf_inaddr) {
scount++;
break;
@@ -785,10 +900,10 @@ static int igmp_marksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs)
/* mark INCLUDE-mode sources */
scount = 0;
- for (psf=pmc->sources; psf; psf=psf->sf_next) {
+ for (psf = pmc->sources; psf; psf = psf->sf_next) {
if (scount == nsrcs)
break;
- for (i=0; i<nsrcs; i++)
+ for (i = 0; i < nsrcs; i++)
if (srcs[i] == psf->sf_inaddr) {
psf->sf_gsresp = 1;
scount++;
@@ -803,81 +918,118 @@ static int igmp_marksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs)
return 1;
}
-static void igmp_heard_report(struct in_device *in_dev, __be32 group)
+/* return true if packet was dropped */
+static bool igmp_heard_report(struct in_device *in_dev, __be32 group)
{
struct ip_mc_list *im;
+ struct net *net = dev_net(in_dev->dev);
/* Timers are only set for non-local groups */
if (group == IGMP_ALL_HOSTS)
- return;
+ return false;
+ if (ipv4_is_local_multicast(group) &&
+ !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports))
+ return false;
- read_lock(&in_dev->mc_list_lock);
- for (im=in_dev->mc_list; im!=NULL; im=im->next) {
+ rcu_read_lock();
+ for_each_pmc_rcu(in_dev, im) {
if (im->multiaddr == group) {
igmp_stop_timer(im);
break;
}
}
- read_unlock(&in_dev->mc_list_lock);
+ rcu_read_unlock();
+ return false;
}
-static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
+/* return true if packet was dropped */
+static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
int len)
{
- struct igmphdr *ih = skb->h.igmph;
- struct igmpv3_query *ih3 = (struct igmpv3_query *)ih;
+ struct igmphdr *ih = igmp_hdr(skb);
+ struct igmpv3_query *ih3 = igmpv3_query_hdr(skb);
struct ip_mc_list *im;
__be32 group = ih->group;
int max_delay;
int mark = 0;
+ struct net *net = dev_net(in_dev->dev);
if (len == 8) {
if (ih->code == 0) {
/* Alas, old v1 router presents here. */
-
- max_delay = IGMP_Query_Response_Interval;
+
+ max_delay = IGMP_QUERY_RESPONSE_INTERVAL;
in_dev->mr_v1_seen = jiffies +
- IGMP_V1_Router_Present_Timeout;
+ (in_dev->mr_qrv * in_dev->mr_qi) +
+ in_dev->mr_qri;
group = 0;
} else {
/* v2 router present */
max_delay = ih->code*(HZ/IGMP_TIMER_SCALE);
in_dev->mr_v2_seen = jiffies +
- IGMP_V2_Router_Present_Timeout;
+ (in_dev->mr_qrv * in_dev->mr_qi) +
+ in_dev->mr_qri;
}
/* cancel the interface change timer */
- in_dev->mr_ifc_count = 0;
- if (del_timer(&in_dev->mr_ifc_timer))
+ WRITE_ONCE(in_dev->mr_ifc_count, 0);
+ if (timer_delete(&in_dev->mr_ifc_timer))
__in_dev_put(in_dev);
/* clear deleted report items */
igmpv3_clear_delrec(in_dev);
} else if (len < 12) {
- return; /* ignore bogus packet; freed by caller */
+ return true; /* ignore bogus packet; freed by caller */
+ } else if (IGMP_V1_SEEN(in_dev)) {
+ /* This is a v3 query with v1 queriers present */
+ max_delay = IGMP_QUERY_RESPONSE_INTERVAL;
+ group = 0;
+ } else if (IGMP_V2_SEEN(in_dev)) {
+ /* this is a v3 query with v2 queriers present;
+ * Interpretation of the max_delay code is problematic here.
+ * A real v2 host would use ih_code directly, while v3 has a
+ * different encoding. We use the v3 encoding as more likely
+ * to be intended in a v3 query.
+ */
+ max_delay = IGMPV3_MRC(ih3->code)*(HZ/IGMP_TIMER_SCALE);
+ if (!max_delay)
+ max_delay = 1; /* can't mod w/ 0 */
} else { /* v3 */
if (!pskb_may_pull(skb, sizeof(struct igmpv3_query)))
- return;
-
- ih3 = (struct igmpv3_query *) skb->h.raw;
+ return true;
+
+ ih3 = igmpv3_query_hdr(skb);
if (ih3->nsrcs) {
- if (!pskb_may_pull(skb, sizeof(struct igmpv3_query)
+ if (!pskb_may_pull(skb, sizeof(struct igmpv3_query)
+ ntohs(ih3->nsrcs)*sizeof(__be32)))
- return;
- ih3 = (struct igmpv3_query *) skb->h.raw;
+ return true;
+ ih3 = igmpv3_query_hdr(skb);
}
max_delay = IGMPV3_MRC(ih3->code)*(HZ/IGMP_TIMER_SCALE);
if (!max_delay)
max_delay = 1; /* can't mod w/ 0 */
in_dev->mr_maxdelay = max_delay;
- if (ih3->qrv)
- in_dev->mr_qrv = ih3->qrv;
+
+ /* RFC3376, 4.1.6. QRV and 4.1.7. QQIC, when the most recently
+ * received value was zero, use the default or statically
+ * configured value.
+ */
+ in_dev->mr_qrv = ih3->qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv);
+ in_dev->mr_qi = IGMPV3_QQIC(ih3->qqic)*HZ ?: IGMP_QUERY_INTERVAL;
+
+ /* RFC3376, 8.3. Query Response Interval:
+ * The number of seconds represented by the [Query Response
+ * Interval] must be less than the [Query Interval].
+ */
+ if (in_dev->mr_qri >= in_dev->mr_qi)
+ in_dev->mr_qri = (in_dev->mr_qi/HZ - 1)*HZ;
+
if (!group) { /* general query */
if (ih3->nsrcs)
- return; /* no sources allowed */
+ return true; /* no sources allowed */
igmp_gq_start_timer(in_dev);
- return;
+ return false;
}
/* mark sources to include, if group & source-specific */
mark = ih3->nsrcs != 0;
@@ -893,75 +1045,78 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
* - Use the igmp->igmp_code field as the maximum
* delay possible
*/
- read_lock(&in_dev->mc_list_lock);
- for (im=in_dev->mc_list; im!=NULL; im=im->next) {
+ rcu_read_lock();
+ for_each_pmc_rcu(in_dev, im) {
int changed;
if (group && group != im->multiaddr)
continue;
if (im->multiaddr == IGMP_ALL_HOSTS)
continue;
+ if (ipv4_is_local_multicast(im->multiaddr) &&
+ !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports))
+ continue;
spin_lock_bh(&im->lock);
if (im->tm_running)
im->gsquery = im->gsquery && mark;
else
im->gsquery = mark;
changed = !im->gsquery ||
- igmp_marksources(im, ntohs(ih3->nsrcs), ih3->srcs);
+ igmp_marksources(im, ntohs(ih3->nsrcs), ih3->srcs);
spin_unlock_bh(&im->lock);
if (changed)
igmp_mod_timer(im, max_delay);
}
- read_unlock(&in_dev->mc_list_lock);
+ rcu_read_unlock();
+ return false;
}
+/* called in rcu_read_lock() section */
int igmp_rcv(struct sk_buff *skb)
{
/* This basically follows the spec line by line -- see RFC1112 */
struct igmphdr *ih;
- struct in_device *in_dev = in_dev_get(skb->dev);
+ struct net_device *dev = skb->dev;
+ struct in_device *in_dev;
int len = skb->len;
+ bool dropped = true;
- if (in_dev==NULL) {
- kfree_skb(skb);
- return 0;
+ if (netif_is_l3_master(dev)) {
+ dev = dev_get_by_index_rcu(dev_net(dev), IPCB(skb)->iif);
+ if (!dev)
+ goto drop;
}
+ in_dev = __in_dev_get_rcu(dev);
+ if (!in_dev)
+ goto drop;
+
if (!pskb_may_pull(skb, sizeof(struct igmphdr)))
goto drop;
- switch (skb->ip_summed) {
- case CHECKSUM_COMPLETE:
- if (!csum_fold(skb->csum))
- break;
- /* fall through */
- case CHECKSUM_NONE:
- skb->csum = 0;
- if (__skb_checksum_complete(skb))
- goto drop;
- }
+ if (skb_checksum_simple_validate(skb))
+ goto drop;
- ih = skb->h.igmph;
+ ih = igmp_hdr(skb);
switch (ih->type) {
case IGMP_HOST_MEMBERSHIP_QUERY:
- igmp_heard_query(in_dev, skb, len);
+ dropped = igmp_heard_query(in_dev, skb, len);
break;
case IGMP_HOST_MEMBERSHIP_REPORT:
case IGMPV2_HOST_MEMBERSHIP_REPORT:
- case IGMPV3_HOST_MEMBERSHIP_REPORT:
/* Is it our report looped back? */
- if (((struct rtable*)skb->dst)->fl.iif == 0)
+ if (rt_is_output_route(skb_rtable(skb)))
break;
/* don't rely on MC router hearing unicast reports */
if (skb->pkt_type == PACKET_MULTICAST ||
skb->pkt_type == PACKET_BROADCAST)
- igmp_heard_report(in_dev, ih->group);
+ dropped = igmp_heard_report(in_dev, ih->group);
break;
case IGMP_PIM:
#ifdef CONFIG_IP_PIMSM_V1
- in_dev_put(in_dev);
return pim_rcv_v1(skb);
#endif
+ case IGMPV3_HOST_MEMBERSHIP_REPORT:
case IGMP_DVMRP:
case IGMP_TRACE:
case IGMP_HOST_LEAVE_MESSAGE:
@@ -973,8 +1128,10 @@ int igmp_rcv(struct sk_buff *skb)
}
drop:
- in_dev_put(in_dev);
- kfree_skb(skb);
+ if (dropped)
+ kfree_skb(skb);
+ else
+ consume_skb(skb);
return 0;
}
@@ -992,13 +1149,13 @@ static void ip_mc_filter_add(struct in_device *in_dev, __be32 addr)
/* Checking for IFF_MULTICAST here is WRONG-WRONG-WRONG.
We will get multicast token leakage, when IFF_MULTICAST
- is changed. This check should be done in dev->set_multicast_list
+ is changed. This check should be done in ndo_set_rx_mode
routine. Something sort of:
if (dev->mc_list && dev->flags&IFF_MULTICAST) { do it; }
--ANK
*/
if (arp_mc_map(addr, buf, dev, 0) == 0)
- dev_mc_add(dev,buf,dev->addr_len,0);
+ dev_mc_add(dev, buf);
}
/*
@@ -1011,16 +1168,18 @@ static void ip_mc_filter_del(struct in_device *in_dev, __be32 addr)
struct net_device *dev = in_dev->dev;
if (arp_mc_map(addr, buf, dev, 0) == 0)
- dev_mc_delete(dev,buf,dev->addr_len,0);
+ dev_mc_del(dev, buf);
}
#ifdef CONFIG_IP_MULTICAST
/*
* deleted ip_mc_list manipulation
*/
-static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im)
+static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im,
+ gfp_t gfp)
{
struct ip_mc_list *pmc;
+ struct net *net = dev_net(in_dev->dev);
/* this is an "ip_mc_list" for convenience; only the fields below
* are actually used. In particular, the refcnt and users are not
@@ -1028,15 +1187,15 @@ static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im)
* for deleted items allows change reports to use common code with
* non-deleted or query-response MCA's.
*/
- pmc = kzalloc(sizeof(*pmc), GFP_KERNEL);
+ pmc = kzalloc(sizeof(*pmc), gfp);
if (!pmc)
return;
+ spin_lock_init(&pmc->lock);
spin_lock_bh(&im->lock);
pmc->interface = im->interface;
in_dev_hold(in_dev);
pmc->multiaddr = im->multiaddr;
- pmc->crcount = in_dev->mr_qrv ? in_dev->mr_qrv :
- IGMP_Unsolicited_Report_Count;
+ pmc->crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv);
pmc->sfmode = im->sfmode;
if (pmc->sfmode == MCAST_INCLUDE) {
struct ip_sf_list *psf;
@@ -1044,7 +1203,7 @@ static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im)
pmc->tomb = im->tomb;
pmc->sources = im->sources;
im->tomb = im->sources = NULL;
- for (psf=pmc->sources; psf; psf=psf->sf_next)
+ for (psf = pmc->sources; psf; psf = psf->sf_next)
psf->sf_crcount = pmc->crcount;
}
spin_unlock_bh(&im->lock);
@@ -1055,14 +1214,19 @@ static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im)
spin_unlock_bh(&in_dev->mc_tomb_lock);
}
-static void igmpv3_del_delrec(struct in_device *in_dev, __be32 multiaddr)
+/*
+ * restore ip_mc_list deleted records
+ */
+static void igmpv3_del_delrec(struct in_device *in_dev, struct ip_mc_list *im)
{
struct ip_mc_list *pmc, *pmc_prev;
- struct ip_sf_list *psf, *psf_next;
+ struct ip_sf_list *psf;
+ struct net *net = dev_net(in_dev->dev);
+ __be32 multiaddr = im->multiaddr;
spin_lock_bh(&in_dev->mc_tomb_lock);
pmc_prev = NULL;
- for (pmc=in_dev->mc_tomb; pmc; pmc=pmc->next) {
+ for (pmc = in_dev->mc_tomb; pmc; pmc = pmc->next) {
if (pmc->multiaddr == multiaddr)
break;
pmc_prev = pmc;
@@ -1074,16 +1238,29 @@ static void igmpv3_del_delrec(struct in_device *in_dev, __be32 multiaddr)
in_dev->mc_tomb = pmc->next;
}
spin_unlock_bh(&in_dev->mc_tomb_lock);
+
+ spin_lock_bh(&im->lock);
if (pmc) {
- for (psf=pmc->tomb; psf; psf=psf_next) {
- psf_next = psf->sf_next;
- kfree(psf);
+ im->interface = pmc->interface;
+ if (im->sfmode == MCAST_INCLUDE) {
+ swap(im->tomb, pmc->tomb);
+ swap(im->sources, pmc->sources);
+ for (psf = im->sources; psf; psf = psf->sf_next)
+ psf->sf_crcount = in_dev->mr_qrv ?:
+ READ_ONCE(net->ipv4.sysctl_igmp_qrv);
+ } else {
+ im->crcount = in_dev->mr_qrv ?:
+ READ_ONCE(net->ipv4.sysctl_igmp_qrv);
}
in_dev_put(pmc->interface);
- kfree(pmc);
+ kfree_pmc(pmc);
}
+ spin_unlock_bh(&im->lock);
}
+/*
+ * flush ip_mc_list deleted records
+ */
static void igmpv3_clear_delrec(struct in_device *in_dev)
{
struct ip_mc_list *pmc, *nextpmc;
@@ -1097,30 +1274,28 @@ static void igmpv3_clear_delrec(struct in_device *in_dev)
nextpmc = pmc->next;
ip_mc_clear_src(pmc);
in_dev_put(pmc->interface);
- kfree(pmc);
+ kfree_pmc(pmc);
}
/* clear dead sources, too */
- read_lock(&in_dev->mc_list_lock);
- for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) {
- struct ip_sf_list *psf, *psf_next;
+ rcu_read_lock();
+ for_each_pmc_rcu(in_dev, pmc) {
+ struct ip_sf_list *psf;
spin_lock_bh(&pmc->lock);
psf = pmc->tomb;
pmc->tomb = NULL;
spin_unlock_bh(&pmc->lock);
- for (; psf; psf=psf_next) {
- psf_next = psf->sf_next;
- kfree(psf);
- }
+ ip_sf_list_clear_all(psf);
}
- read_unlock(&in_dev->mc_list_lock);
+ rcu_read_unlock();
}
#endif
-static void igmp_group_dropped(struct ip_mc_list *im)
+static void __igmp_group_dropped(struct ip_mc_list *im, gfp_t gfp)
{
struct in_device *in_dev = im->interface;
#ifdef CONFIG_IP_MULTICAST
+ struct net *net = dev_net(in_dev->dev);
int reporter;
#endif
@@ -1132,31 +1307,40 @@ static void igmp_group_dropped(struct ip_mc_list *im)
#ifdef CONFIG_IP_MULTICAST
if (im->multiaddr == IGMP_ALL_HOSTS)
return;
+ if (ipv4_is_local_multicast(im->multiaddr) &&
+ !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports))
+ return;
reporter = im->reporter;
igmp_stop_timer(im);
if (!in_dev->dead) {
if (IGMP_V1_SEEN(in_dev))
- goto done;
+ return;
if (IGMP_V2_SEEN(in_dev)) {
if (reporter)
igmp_send_report(in_dev, im, IGMP_HOST_LEAVE_MESSAGE);
- goto done;
+ return;
}
/* IGMPv3 */
- igmpv3_add_delrec(in_dev, im);
+ igmpv3_add_delrec(in_dev, im, gfp);
igmp_ifc_event(in_dev);
}
-done:
#endif
- ip_mc_clear_src(im);
+}
+
+static void igmp_group_dropped(struct ip_mc_list *im)
+{
+ __igmp_group_dropped(im, GFP_KERNEL);
}
static void igmp_group_added(struct ip_mc_list *im)
{
struct in_device *in_dev = im->interface;
+#ifdef CONFIG_IP_MULTICAST
+ struct net *net = dev_net(in_dev->dev);
+#endif
if (im->loaded == 0) {
im->loaded = 1;
@@ -1166,19 +1350,29 @@ static void igmp_group_added(struct ip_mc_list *im)
#ifdef CONFIG_IP_MULTICAST
if (im->multiaddr == IGMP_ALL_HOSTS)
return;
+ if (ipv4_is_local_multicast(im->multiaddr) &&
+ !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports))
+ return;
if (in_dev->dead)
return;
+
+ im->unsolicit_count = READ_ONCE(net->ipv4.sysctl_igmp_qrv);
if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) {
spin_lock_bh(&im->lock);
- igmp_start_timer(im, IGMP_Initial_Report_Delay);
+ igmp_start_timer(im, IGMP_INITIAL_REPORT_DELAY);
spin_unlock_bh(&im->lock);
return;
}
/* else, v3 */
- im->crcount = in_dev->mr_qrv ? in_dev->mr_qrv :
- IGMP_Unsolicited_Report_Count;
+ /* Based on RFC3376 5.1, for newly added INCLUDE SSM, we should
+ * not send filter-mode change record as the mode should be from
+ * IN() to IN(A).
+ */
+ if (im->sfmode == MCAST_EXCLUDE)
+ im->crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv);
+
igmp_ifc_event(in_dev);
#endif
}
@@ -1188,83 +1382,402 @@ static void igmp_group_added(struct ip_mc_list *im)
* Multicast list managers
*/
+static u32 ip_mc_hash(const struct ip_mc_list *im)
+{
+ return hash_32((__force u32)im->multiaddr, MC_HASH_SZ_LOG);
+}
+
+static void ip_mc_hash_add(struct in_device *in_dev,
+ struct ip_mc_list *im)
+{
+ struct ip_mc_list __rcu **mc_hash;
+ u32 hash;
+
+ mc_hash = rtnl_dereference(in_dev->mc_hash);
+ if (mc_hash) {
+ hash = ip_mc_hash(im);
+ im->next_hash = mc_hash[hash];
+ rcu_assign_pointer(mc_hash[hash], im);
+ return;
+ }
+
+ /* do not use a hash table for small number of items */
+ if (in_dev->mc_count < 4)
+ return;
+
+ mc_hash = kzalloc(sizeof(struct ip_mc_list *) << MC_HASH_SZ_LOG,
+ GFP_KERNEL);
+ if (!mc_hash)
+ return;
+
+ for_each_pmc_rtnl(in_dev, im) {
+ hash = ip_mc_hash(im);
+ im->next_hash = mc_hash[hash];
+ RCU_INIT_POINTER(mc_hash[hash], im);
+ }
+
+ rcu_assign_pointer(in_dev->mc_hash, mc_hash);
+}
+
+static void ip_mc_hash_remove(struct in_device *in_dev,
+ struct ip_mc_list *im)
+{
+ struct ip_mc_list __rcu **mc_hash = rtnl_dereference(in_dev->mc_hash);
+ struct ip_mc_list *aux;
+
+ if (!mc_hash)
+ return;
+ mc_hash += ip_mc_hash(im);
+ while ((aux = rtnl_dereference(*mc_hash)) != im)
+ mc_hash = &aux->next_hash;
+ *mc_hash = im->next_hash;
+}
+
+int inet_fill_ifmcaddr(struct sk_buff *skb, struct net_device *dev,
+ const struct ip_mc_list *im,
+ struct inet_fill_args *args)
+{
+ struct ifa_cacheinfo ci;
+ struct ifaddrmsg *ifm;
+ struct nlmsghdr *nlh;
+
+ nlh = nlmsg_put(skb, args->portid, args->seq, args->event,
+ sizeof(struct ifaddrmsg), args->flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ ifm = nlmsg_data(nlh);
+ ifm->ifa_family = AF_INET;
+ ifm->ifa_prefixlen = 32;
+ ifm->ifa_flags = IFA_F_PERMANENT;
+ ifm->ifa_scope = RT_SCOPE_UNIVERSE;
+ ifm->ifa_index = dev->ifindex;
+
+ ci.cstamp = (READ_ONCE(im->mca_cstamp) - INITIAL_JIFFIES) * 100UL / HZ;
+ ci.tstamp = ci.cstamp;
+ ci.ifa_prefered = INFINITY_LIFE_TIME;
+ ci.ifa_valid = INFINITY_LIFE_TIME;
+
+ if (nla_put_in_addr(skb, IFA_MULTICAST, im->multiaddr) < 0 ||
+ nla_put(skb, IFA_CACHEINFO, sizeof(ci), &ci) < 0) {
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+ }
+
+ nlmsg_end(skb, nlh);
+ return 0;
+}
+
+static void inet_ifmcaddr_notify(struct net_device *dev,
+ const struct ip_mc_list *im, int event)
+{
+ struct inet_fill_args fillargs = {
+ .event = event,
+ };
+ struct net *net = dev_net(dev);
+ struct sk_buff *skb;
+ int err = -ENOMEM;
+
+ skb = nlmsg_new(NLMSG_ALIGN(sizeof(struct ifaddrmsg)) +
+ nla_total_size(sizeof(__be32)) +
+ nla_total_size(sizeof(struct ifa_cacheinfo)),
+ GFP_KERNEL);
+ if (!skb)
+ goto error;
+
+ err = inet_fill_ifmcaddr(skb, dev, im, &fillargs);
+ if (err < 0) {
+ WARN_ON_ONCE(err == -EMSGSIZE);
+ nlmsg_free(skb);
+ goto error;
+ }
+
+ rtnl_notify(skb, net, 0, RTNLGRP_IPV4_MCADDR, NULL, GFP_KERNEL);
+ return;
+error:
+ rtnl_set_sk_err(net, RTNLGRP_IPV4_MCADDR, err);
+}
/*
* A socket has joined a multicast group on device dev.
*/
-
-void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
+static void ____ip_mc_inc_group(struct in_device *in_dev, __be32 addr,
+ unsigned int mode, gfp_t gfp)
{
+ struct ip_mc_list __rcu **mc_hash;
struct ip_mc_list *im;
ASSERT_RTNL();
- for (im=in_dev->mc_list; im; im=im->next) {
- if (im->multiaddr == addr) {
- im->users++;
- ip_mc_add_src(in_dev, &addr, MCAST_EXCLUDE, 0, NULL, 0);
- goto out;
+ mc_hash = rtnl_dereference(in_dev->mc_hash);
+ if (mc_hash) {
+ u32 hash = hash_32((__force u32)addr, MC_HASH_SZ_LOG);
+
+ for (im = rtnl_dereference(mc_hash[hash]);
+ im;
+ im = rtnl_dereference(im->next_hash)) {
+ if (im->multiaddr == addr)
+ break;
+ }
+ } else {
+ for_each_pmc_rtnl(in_dev, im) {
+ if (im->multiaddr == addr)
+ break;
}
}
- im = kmalloc(sizeof(*im), GFP_KERNEL);
+ if (im) {
+ im->users++;
+ ip_mc_add_src(in_dev, &addr, mode, 0, NULL, 0);
+ goto out;
+ }
+
+ im = kzalloc(sizeof(*im), gfp);
if (!im)
goto out;
- im->users=1;
- im->interface=in_dev;
+ im->users = 1;
+ im->interface = in_dev;
in_dev_hold(in_dev);
- im->multiaddr=addr;
+ im->multiaddr = addr;
+ im->mca_cstamp = jiffies;
+ im->mca_tstamp = im->mca_cstamp;
/* initial mode is (EX, empty) */
- im->sfmode = MCAST_EXCLUDE;
- im->sfcount[MCAST_INCLUDE] = 0;
- im->sfcount[MCAST_EXCLUDE] = 1;
- im->sources = NULL;
- im->tomb = NULL;
- im->crcount = 0;
- atomic_set(&im->refcnt, 1);
+ im->sfmode = mode;
+ im->sfcount[mode] = 1;
+ refcount_set(&im->refcnt, 1);
spin_lock_init(&im->lock);
#ifdef CONFIG_IP_MULTICAST
- im->tm_running=0;
- init_timer(&im->timer);
- im->timer.data=(unsigned long)im;
- im->timer.function=&igmp_timer_expire;
- im->unsolicit_count = IGMP_Unsolicited_Report_Count;
- im->reporter = 0;
- im->gsquery = 0;
+ timer_setup(&im->timer, igmp_timer_expire, 0);
#endif
- im->loaded = 0;
- write_lock_bh(&in_dev->mc_list_lock);
- im->next=in_dev->mc_list;
- in_dev->mc_list=im;
- write_unlock_bh(&in_dev->mc_list_lock);
+
+ im->next_rcu = in_dev->mc_list;
+ in_dev->mc_count++;
+ rcu_assign_pointer(in_dev->mc_list, im);
+
+ ip_mc_hash_add(in_dev, im);
+
#ifdef CONFIG_IP_MULTICAST
- igmpv3_del_delrec(in_dev, im->multiaddr);
+ igmpv3_del_delrec(in_dev, im);
#endif
igmp_group_added(im);
+ inet_ifmcaddr_notify(in_dev->dev, im, RTM_NEWMULTICAST);
if (!in_dev->dead)
ip_rt_multicast_event(in_dev);
out:
return;
}
+void __ip_mc_inc_group(struct in_device *in_dev, __be32 addr, gfp_t gfp)
+{
+ ____ip_mc_inc_group(in_dev, addr, MCAST_EXCLUDE, gfp);
+}
+EXPORT_SYMBOL(__ip_mc_inc_group);
+
+void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
+{
+ __ip_mc_inc_group(in_dev, addr, GFP_KERNEL);
+}
+EXPORT_SYMBOL(ip_mc_inc_group);
+
+static int ip_mc_check_iphdr(struct sk_buff *skb)
+{
+ const struct iphdr *iph;
+ unsigned int len;
+ unsigned int offset = skb_network_offset(skb) + sizeof(*iph);
+
+ if (!pskb_may_pull(skb, offset))
+ return -EINVAL;
+
+ iph = ip_hdr(skb);
+
+ if (iph->version != 4 || ip_hdrlen(skb) < sizeof(*iph))
+ return -EINVAL;
+
+ offset += ip_hdrlen(skb) - sizeof(*iph);
+
+ if (!pskb_may_pull(skb, offset))
+ return -EINVAL;
+
+ iph = ip_hdr(skb);
+
+ if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
+ return -EINVAL;
+
+ len = skb_network_offset(skb) + ntohs(iph->tot_len);
+ if (skb->len < len || len < offset)
+ return -EINVAL;
+
+ skb_set_transport_header(skb, offset);
+
+ return 0;
+}
+
+static int ip_mc_check_igmp_reportv3(struct sk_buff *skb)
+{
+ unsigned int len = skb_transport_offset(skb);
+
+ len += sizeof(struct igmpv3_report);
+
+ return ip_mc_may_pull(skb, len) ? 0 : -EINVAL;
+}
+
+static int ip_mc_check_igmp_query(struct sk_buff *skb)
+{
+ unsigned int transport_len = ip_transport_len(skb);
+ unsigned int len;
+
+ /* IGMPv{1,2}? */
+ if (transport_len != sizeof(struct igmphdr)) {
+ /* or IGMPv3? */
+ if (transport_len < sizeof(struct igmpv3_query))
+ return -EINVAL;
+
+ len = skb_transport_offset(skb) + sizeof(struct igmpv3_query);
+ if (!ip_mc_may_pull(skb, len))
+ return -EINVAL;
+ }
+
+ /* RFC2236+RFC3376 (IGMPv2+IGMPv3) require the multicast link layer
+ * all-systems destination addresses (224.0.0.1) for general queries
+ */
+ if (!igmp_hdr(skb)->group &&
+ ip_hdr(skb)->daddr != htonl(INADDR_ALLHOSTS_GROUP))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int ip_mc_check_igmp_msg(struct sk_buff *skb)
+{
+ switch (igmp_hdr(skb)->type) {
+ case IGMP_HOST_LEAVE_MESSAGE:
+ case IGMP_HOST_MEMBERSHIP_REPORT:
+ case IGMPV2_HOST_MEMBERSHIP_REPORT:
+ return 0;
+ case IGMPV3_HOST_MEMBERSHIP_REPORT:
+ return ip_mc_check_igmp_reportv3(skb);
+ case IGMP_HOST_MEMBERSHIP_QUERY:
+ return ip_mc_check_igmp_query(skb);
+ default:
+ return -ENOMSG;
+ }
+}
+
+static __sum16 ip_mc_validate_checksum(struct sk_buff *skb)
+{
+ return skb_checksum_simple_validate(skb);
+}
+
+static int ip_mc_check_igmp_csum(struct sk_buff *skb)
+{
+ unsigned int len = skb_transport_offset(skb) + sizeof(struct igmphdr);
+ unsigned int transport_len = ip_transport_len(skb);
+ struct sk_buff *skb_chk;
+
+ if (!ip_mc_may_pull(skb, len))
+ return -EINVAL;
+
+ skb_chk = skb_checksum_trimmed(skb, transport_len,
+ ip_mc_validate_checksum);
+ if (!skb_chk)
+ return -EINVAL;
+
+ if (skb_chk != skb)
+ kfree_skb(skb_chk);
+
+ return 0;
+}
+
+/**
+ * ip_mc_check_igmp - checks whether this is a sane IGMP packet
+ * @skb: the skb to validate
+ *
+ * Checks whether an IPv4 packet is a valid IGMP packet. If so sets
+ * skb transport header accordingly and returns zero.
+ *
+ * -EINVAL: A broken packet was detected, i.e. it violates some internet
+ * standard
+ * -ENOMSG: IP header validation succeeded but it is not an IGMP packet.
+ * -ENOMEM: A memory allocation failure happened.
+ *
+ * Caller needs to set the skb network header and free any returned skb if it
+ * differs from the provided skb.
+ */
+int ip_mc_check_igmp(struct sk_buff *skb)
+{
+ int ret = ip_mc_check_iphdr(skb);
+
+ if (ret < 0)
+ return ret;
+
+ if (ip_hdr(skb)->protocol != IPPROTO_IGMP)
+ return -ENOMSG;
+
+ ret = ip_mc_check_igmp_csum(skb);
+ if (ret < 0)
+ return ret;
+
+ return ip_mc_check_igmp_msg(skb);
+}
+EXPORT_SYMBOL(ip_mc_check_igmp);
+
+/*
+ * Resend IGMP JOIN report; used by netdev notifier.
+ */
+static void ip_mc_rejoin_groups(struct in_device *in_dev)
+{
+#ifdef CONFIG_IP_MULTICAST
+ struct ip_mc_list *im;
+ int type;
+ struct net *net = dev_net(in_dev->dev);
+
+ ASSERT_RTNL();
+
+ for_each_pmc_rtnl(in_dev, im) {
+ if (im->multiaddr == IGMP_ALL_HOSTS)
+ continue;
+ if (ipv4_is_local_multicast(im->multiaddr) &&
+ !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports))
+ continue;
+
+ /* a failover is happening and switches
+ * must be notified immediately
+ */
+ if (IGMP_V1_SEEN(in_dev))
+ type = IGMP_HOST_MEMBERSHIP_REPORT;
+ else if (IGMP_V2_SEEN(in_dev))
+ type = IGMPV2_HOST_MEMBERSHIP_REPORT;
+ else
+ type = IGMPV3_HOST_MEMBERSHIP_REPORT;
+ igmp_send_report(in_dev, im, type);
+ }
+#endif
+}
+
/*
* A socket has left a multicast group on device dev
*/
-void ip_mc_dec_group(struct in_device *in_dev, __be32 addr)
+void __ip_mc_dec_group(struct in_device *in_dev, __be32 addr, gfp_t gfp)
{
- struct ip_mc_list *i, **ip;
-
+ struct ip_mc_list *i;
+ struct ip_mc_list __rcu **ip;
+
ASSERT_RTNL();
-
- for (ip=&in_dev->mc_list; (i=*ip)!=NULL; ip=&i->next) {
- if (i->multiaddr==addr) {
+
+ for (ip = &in_dev->mc_list;
+ (i = rtnl_dereference(*ip)) != NULL;
+ ip = &i->next_rcu) {
+ if (i->multiaddr == addr) {
if (--i->users == 0) {
- write_lock_bh(&in_dev->mc_list_lock);
- *ip = i->next;
- write_unlock_bh(&in_dev->mc_list_lock);
- igmp_group_dropped(i);
+ ip_mc_hash_remove(in_dev, i);
+ *ip = i->next_rcu;
+ in_dev->mc_count--;
+ __igmp_group_dropped(i, gfp);
+ inet_ifmcaddr_notify(in_dev->dev, i,
+ RTM_DELMULTICAST);
+ ip_mc_clear_src(i);
if (!in_dev->dead)
ip_rt_multicast_event(in_dev);
@@ -1276,49 +1789,82 @@ void ip_mc_dec_group(struct in_device *in_dev, __be32 addr)
}
}
}
+EXPORT_SYMBOL(__ip_mc_dec_group);
+
+/* Device changing type */
+
+void ip_mc_unmap(struct in_device *in_dev)
+{
+ struct ip_mc_list *pmc;
+
+ ASSERT_RTNL();
+
+ for_each_pmc_rtnl(in_dev, pmc)
+ igmp_group_dropped(pmc);
+}
+
+void ip_mc_remap(struct in_device *in_dev)
+{
+ struct ip_mc_list *pmc;
+
+ ASSERT_RTNL();
+
+ for_each_pmc_rtnl(in_dev, pmc) {
+#ifdef CONFIG_IP_MULTICAST
+ igmpv3_del_delrec(in_dev, pmc);
+#endif
+ igmp_group_added(pmc);
+ }
+}
/* Device going down */
void ip_mc_down(struct in_device *in_dev)
{
- struct ip_mc_list *i;
+ struct ip_mc_list *pmc;
ASSERT_RTNL();
- for (i=in_dev->mc_list; i; i=i->next)
- igmp_group_dropped(i);
+ for_each_pmc_rtnl(in_dev, pmc)
+ igmp_group_dropped(pmc);
#ifdef CONFIG_IP_MULTICAST
- in_dev->mr_ifc_count = 0;
- if (del_timer(&in_dev->mr_ifc_timer))
+ WRITE_ONCE(in_dev->mr_ifc_count, 0);
+ if (timer_delete(&in_dev->mr_ifc_timer))
__in_dev_put(in_dev);
in_dev->mr_gq_running = 0;
- if (del_timer(&in_dev->mr_gq_timer))
+ if (timer_delete(&in_dev->mr_gq_timer))
__in_dev_put(in_dev);
- igmpv3_clear_delrec(in_dev);
#endif
ip_mc_dec_group(in_dev, IGMP_ALL_HOSTS);
}
+#ifdef CONFIG_IP_MULTICAST
+static void ip_mc_reset(struct in_device *in_dev)
+{
+ struct net *net = dev_net(in_dev->dev);
+
+ in_dev->mr_qi = IGMP_QUERY_INTERVAL;
+ in_dev->mr_qri = IGMP_QUERY_RESPONSE_INTERVAL;
+ in_dev->mr_qrv = READ_ONCE(net->ipv4.sysctl_igmp_qrv);
+}
+#else
+static void ip_mc_reset(struct in_device *in_dev)
+{
+}
+#endif
+
void ip_mc_init_dev(struct in_device *in_dev)
{
ASSERT_RTNL();
- in_dev->mc_tomb = NULL;
#ifdef CONFIG_IP_MULTICAST
- in_dev->mr_gq_running = 0;
- init_timer(&in_dev->mr_gq_timer);
- in_dev->mr_gq_timer.data=(unsigned long) in_dev;
- in_dev->mr_gq_timer.function=&igmp_gq_timer_expire;
- in_dev->mr_ifc_count = 0;
- init_timer(&in_dev->mr_ifc_timer);
- in_dev->mr_ifc_timer.data=(unsigned long) in_dev;
- in_dev->mr_ifc_timer.function=&igmp_ifc_timer_expire;
- in_dev->mr_qrv = IGMP_Unsolicited_Report_Count;
+ timer_setup(&in_dev->mr_gq_timer, igmp_gq_timer_expire, 0);
+ timer_setup(&in_dev->mr_ifc_timer, igmp_ifc_timer_expire, 0);
#endif
+ ip_mc_reset(in_dev);
- rwlock_init(&in_dev->mc_list_lock);
spin_lock_init(&in_dev->mc_tomb_lock);
}
@@ -1326,14 +1872,19 @@ void ip_mc_init_dev(struct in_device *in_dev)
void ip_mc_up(struct in_device *in_dev)
{
- struct ip_mc_list *i;
+ struct ip_mc_list *pmc;
ASSERT_RTNL();
+ ip_mc_reset(in_dev);
ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS);
- for (i=in_dev->mc_list; i; i=i->next)
- igmp_group_added(i);
+ for_each_pmc_rtnl(in_dev, pmc) {
+#ifdef CONFIG_IP_MULTICAST
+ igmpv3_del_delrec(in_dev, pmc);
+#endif
+ igmp_group_added(pmc);
+ }
}
/*
@@ -1348,44 +1899,43 @@ void ip_mc_destroy_dev(struct in_device *in_dev)
/* Deactivate timers */
ip_mc_down(in_dev);
+#ifdef CONFIG_IP_MULTICAST
+ igmpv3_clear_delrec(in_dev);
+#endif
- write_lock_bh(&in_dev->mc_list_lock);
- while ((i = in_dev->mc_list) != NULL) {
- in_dev->mc_list = i->next;
- write_unlock_bh(&in_dev->mc_list_lock);
-
- igmp_group_dropped(i);
+ while ((i = rtnl_dereference(in_dev->mc_list)) != NULL) {
+ in_dev->mc_list = i->next_rcu;
+ in_dev->mc_count--;
+ ip_mc_clear_src(i);
ip_ma_put(i);
-
- write_lock_bh(&in_dev->mc_list_lock);
}
- write_unlock_bh(&in_dev->mc_list_lock);
}
-static struct in_device * ip_mc_find_dev(struct ip_mreqn *imr)
+/* RTNL is locked */
+static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr)
{
- struct flowi fl = { .nl_u = { .ip4_u =
- { .daddr = imr->imr_multiaddr.s_addr } } };
- struct rtable *rt;
struct net_device *dev = NULL;
struct in_device *idev = NULL;
if (imr->imr_ifindex) {
- idev = inetdev_by_index(imr->imr_ifindex);
- if (idev)
- __in_dev_put(idev);
+ idev = inetdev_by_index(net, imr->imr_ifindex);
return idev;
}
if (imr->imr_address.s_addr) {
- dev = ip_dev_find(imr->imr_address.s_addr);
+ dev = __ip_dev_find(net, imr->imr_address.s_addr, false);
if (!dev)
return NULL;
- dev_put(dev);
}
- if (!dev && !ip_route_output_key(&rt, &fl)) {
- dev = rt->u.dst.dev;
- ip_rt_put(rt);
+ if (!dev) {
+ struct rtable *rt = ip_route_output(net,
+ imr->imr_multiaddr.s_addr,
+ 0, 0, 0,
+ RT_SCOPE_UNIVERSE);
+ if (!IS_ERR(rt)) {
+ dev = rt->dst.dev;
+ ip_rt_put(rt);
+ }
}
if (dev) {
imr->imr_ifindex = dev->ifindex;
@@ -1397,9 +1947,6 @@ static struct in_device * ip_mc_find_dev(struct ip_mreqn *imr)
/*
* Join a socket to a group
*/
-int sysctl_igmp_max_memberships __read_mostly = IP_MAX_MEMBERSHIPS;
-int sysctl_igmp_max_msf __read_mostly = IP_MAX_MSF;
-
static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode,
__be32 *psfsrc)
@@ -1408,7 +1955,7 @@ static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode,
int rv = 0;
psf_prev = NULL;
- for (psf=pmc->sources; psf; psf=psf->sf_next) {
+ for (psf = pmc->sources; psf; psf = psf->sf_next) {
if (psf->sf_inaddr == *psfsrc)
break;
psf_prev = psf;
@@ -1424,6 +1971,7 @@ static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode,
if (!psf->sf_count[MCAST_INCLUDE] && !psf->sf_count[MCAST_EXCLUDE]) {
#ifdef CONFIG_IP_MULTICAST
struct in_device *in_dev = pmc->interface;
+ struct net *net = dev_net(in_dev->dev);
#endif
/* no more filters for this source */
@@ -1434,8 +1982,7 @@ static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode,
#ifdef CONFIG_IP_MULTICAST
if (psf->sf_oldin &&
!IGMP_V1_SEEN(in_dev) && !IGMP_V2_SEEN(in_dev)) {
- psf->sf_crcount = in_dev->mr_qrv ? in_dev->mr_qrv :
- IGMP_Unsolicited_Report_Count;
+ psf->sf_crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv);
psf->sf_next = pmc->tomb;
pmc->tomb = psf;
rv = 1;
@@ -1459,18 +2006,18 @@ static int ip_mc_del_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
if (!in_dev)
return -ENODEV;
- read_lock(&in_dev->mc_list_lock);
- for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) {
+ rcu_read_lock();
+ for_each_pmc_rcu(in_dev, pmc) {
if (*pmca == pmc->multiaddr)
break;
}
if (!pmc) {
/* MCA not found?? bug */
- read_unlock(&in_dev->mc_list_lock);
+ rcu_read_unlock();
return -ESRCH;
}
spin_lock_bh(&pmc->lock);
- read_unlock(&in_dev->mc_list_lock);
+ rcu_read_unlock();
#ifdef CONFIG_IP_MULTICAST
sf_markstate(pmc);
#endif
@@ -1481,7 +2028,7 @@ static int ip_mc_del_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
pmc->sfcount[sfmode]--;
}
err = 0;
- for (i=0; i<sfcount; i++) {
+ for (i = 0; i < sfcount; i++) {
int rv = ip_mc_del1_src(pmc, sfmode, &psfsrc[i]);
changerec |= rv > 0;
@@ -1493,15 +2040,15 @@ static int ip_mc_del_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
pmc->sfcount[MCAST_INCLUDE]) {
#ifdef CONFIG_IP_MULTICAST
struct ip_sf_list *psf;
+ struct net *net = dev_net(in_dev->dev);
#endif
/* filter mode change */
pmc->sfmode = MCAST_INCLUDE;
#ifdef CONFIG_IP_MULTICAST
- pmc->crcount = in_dev->mr_qrv ? in_dev->mr_qrv :
- IGMP_Unsolicited_Report_Count;
- in_dev->mr_ifc_count = pmc->crcount;
- for (psf=pmc->sources; psf; psf = psf->sf_next)
+ pmc->crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv);
+ WRITE_ONCE(in_dev->mr_ifc_count, pmc->crcount);
+ for (psf = pmc->sources; psf; psf = psf->sf_next)
psf->sf_crcount = 0;
igmp_ifc_event(pmc->interface);
} else if (sf_setstate(pmc) || changerec) {
@@ -1517,12 +2064,12 @@ out_unlock:
* Add multicast single-source filter to the interface list
*/
static int ip_mc_add1_src(struct ip_mc_list *pmc, int sfmode,
- __be32 *psfsrc, int delta)
+ __be32 *psfsrc)
{
struct ip_sf_list *psf, *psf_prev;
psf_prev = NULL;
- for (psf=pmc->sources; psf; psf=psf->sf_next) {
+ for (psf = pmc->sources; psf; psf = psf->sf_next) {
if (psf->sf_inaddr == *psfsrc)
break;
psf_prev = psf;
@@ -1550,7 +2097,7 @@ static void sf_markstate(struct ip_mc_list *pmc)
struct ip_sf_list *psf;
int mca_xcount = pmc->sfcount[MCAST_EXCLUDE];
- for (psf=pmc->sources; psf; psf=psf->sf_next)
+ for (psf = pmc->sources; psf; psf = psf->sf_next)
if (pmc->sfcount[MCAST_EXCLUDE]) {
psf->sf_oldin = mca_xcount ==
psf->sf_count[MCAST_EXCLUDE] &&
@@ -1567,7 +2114,7 @@ static int sf_setstate(struct ip_mc_list *pmc)
int new_in, rv;
rv = 0;
- for (psf=pmc->sources; psf; psf=psf->sf_next) {
+ for (psf = pmc->sources; psf; psf = psf->sf_next) {
if (pmc->sfcount[MCAST_EXCLUDE]) {
new_in = mca_xcount == psf->sf_count[MCAST_EXCLUDE] &&
!psf->sf_count[MCAST_INCLUDE];
@@ -1577,7 +2124,7 @@ static int sf_setstate(struct ip_mc_list *pmc)
if (!psf->sf_oldin) {
struct ip_sf_list *prev = NULL;
- for (dpsf=pmc->tomb; dpsf; dpsf=dpsf->sf_next) {
+ for (dpsf = pmc->tomb; dpsf; dpsf = dpsf->sf_next) {
if (dpsf->sf_inaddr == psf->sf_inaddr)
break;
prev = dpsf;
@@ -1599,12 +2146,11 @@ static int sf_setstate(struct ip_mc_list *pmc)
* add or update "delete" records if an active filter
* is now inactive
*/
- for (dpsf=pmc->tomb; dpsf; dpsf=dpsf->sf_next)
+ for (dpsf = pmc->tomb; dpsf; dpsf = dpsf->sf_next)
if (dpsf->sf_inaddr == psf->sf_inaddr)
break;
if (!dpsf) {
- dpsf = (struct ip_sf_list *)
- kmalloc(sizeof(*dpsf), GFP_ATOMIC);
+ dpsf = kmalloc(sizeof(*dpsf), GFP_ATOMIC);
if (!dpsf)
continue;
*dpsf = *psf;
@@ -1632,18 +2178,18 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
if (!in_dev)
return -ENODEV;
- read_lock(&in_dev->mc_list_lock);
- for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) {
+ rcu_read_lock();
+ for_each_pmc_rcu(in_dev, pmc) {
if (*pmca == pmc->multiaddr)
break;
}
if (!pmc) {
/* MCA not found?? bug */
- read_unlock(&in_dev->mc_list_lock);
+ rcu_read_unlock();
return -ESRCH;
}
spin_lock_bh(&pmc->lock);
- read_unlock(&in_dev->mc_list_lock);
+ rcu_read_unlock();
#ifdef CONFIG_IP_MULTICAST
sf_markstate(pmc);
@@ -1652,21 +2198,23 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
if (!delta)
pmc->sfcount[sfmode]++;
err = 0;
- for (i=0; i<sfcount; i++) {
- err = ip_mc_add1_src(pmc, sfmode, &psfsrc[i], delta);
+ for (i = 0; i < sfcount; i++) {
+ err = ip_mc_add1_src(pmc, sfmode, &psfsrc[i]);
if (err)
break;
}
if (err) {
int j;
- pmc->sfcount[sfmode]--;
- for (j=0; j<i; j++)
- (void) ip_mc_del1_src(pmc, sfmode, &psfsrc[i]);
+ if (!delta)
+ pmc->sfcount[sfmode]--;
+ for (j = 0; j < i; j++)
+ (void) ip_mc_del1_src(pmc, sfmode, &psfsrc[j]);
} else if (isexclude != (pmc->sfcount[MCAST_EXCLUDE] != 0)) {
#ifdef CONFIG_IP_MULTICAST
- struct in_device *in_dev = pmc->interface;
struct ip_sf_list *psf;
+ struct net *net = dev_net(pmc->interface->dev);
+ in_dev = pmc->interface;
#endif
/* filter mode change */
@@ -1677,10 +2225,9 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
#ifdef CONFIG_IP_MULTICAST
/* else no filters; keep old mode for reports */
- pmc->crcount = in_dev->mr_qrv ? in_dev->mr_qrv :
- IGMP_Unsolicited_Report_Count;
- in_dev->mr_ifc_count = pmc->crcount;
- for (psf=pmc->sources; psf; psf = psf->sf_next)
+ pmc->crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv);
+ WRITE_ONCE(in_dev->mr_ifc_count, pmc->crcount);
+ for (psf = pmc->sources; psf; psf = psf->sf_next)
psf->sf_crcount = 0;
igmp_ifc_event(in_dev);
} else if (sf_setstate(pmc)) {
@@ -1693,112 +2240,132 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
static void ip_mc_clear_src(struct ip_mc_list *pmc)
{
- struct ip_sf_list *psf, *nextpsf;
+ struct ip_sf_list *tomb, *sources;
- for (psf=pmc->tomb; psf; psf=nextpsf) {
- nextpsf = psf->sf_next;
- kfree(psf);
- }
+ spin_lock_bh(&pmc->lock);
+ tomb = pmc->tomb;
pmc->tomb = NULL;
- for (psf=pmc->sources; psf; psf=nextpsf) {
- nextpsf = psf->sf_next;
- kfree(psf);
- }
+ sources = pmc->sources;
pmc->sources = NULL;
pmc->sfmode = MCAST_EXCLUDE;
pmc->sfcount[MCAST_INCLUDE] = 0;
pmc->sfcount[MCAST_EXCLUDE] = 1;
-}
+ spin_unlock_bh(&pmc->lock);
+ ip_sf_list_clear_all(tomb);
+ ip_sf_list_clear_all(sources);
+}
-/*
- * Join a multicast group
+/* Join a multicast group
*/
-int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr)
+static int __ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr,
+ unsigned int mode)
{
- int err;
__be32 addr = imr->imr_multiaddr.s_addr;
- struct ip_mc_socklist *iml=NULL, *i;
+ struct ip_mc_socklist *iml, *i;
struct in_device *in_dev;
struct inet_sock *inet = inet_sk(sk);
+ struct net *net = sock_net(sk);
int ifindex;
int count = 0;
+ int err;
- if (!MULTICAST(addr))
- return -EINVAL;
+ ASSERT_RTNL();
- rtnl_lock();
+ if (!ipv4_is_multicast(addr))
+ return -EINVAL;
- in_dev = ip_mc_find_dev(imr);
+ in_dev = ip_mc_find_dev(net, imr);
if (!in_dev) {
- iml = NULL;
err = -ENODEV;
goto done;
}
err = -EADDRINUSE;
ifindex = imr->imr_ifindex;
- for (i = inet->mc_list; i; i = i->next) {
+ for_each_pmc_rtnl(inet, i) {
if (i->multi.imr_multiaddr.s_addr == addr &&
i->multi.imr_ifindex == ifindex)
goto done;
count++;
}
err = -ENOBUFS;
- if (count >= sysctl_igmp_max_memberships)
+ if (count >= READ_ONCE(net->ipv4.sysctl_igmp_max_memberships))
goto done;
- iml = sock_kmalloc(sk,sizeof(*iml),GFP_KERNEL);
- if (iml == NULL)
+ iml = sock_kmalloc(sk, sizeof(*iml), GFP_KERNEL);
+ if (!iml)
goto done;
memcpy(&iml->multi, imr, sizeof(*imr));
- iml->next = inet->mc_list;
+ iml->next_rcu = inet->mc_list;
iml->sflist = NULL;
- iml->sfmode = MCAST_EXCLUDE;
- inet->mc_list = iml;
- ip_mc_inc_group(in_dev, addr);
+ iml->sfmode = mode;
+ rcu_assign_pointer(inet->mc_list, iml);
+ ____ip_mc_inc_group(in_dev, addr, mode, GFP_KERNEL);
err = 0;
done:
- rtnl_unlock();
return err;
}
+/* Join ASM (Any-Source Multicast) group
+ */
+int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr)
+{
+ return __ip_mc_join_group(sk, imr, MCAST_EXCLUDE);
+}
+EXPORT_SYMBOL(ip_mc_join_group);
+
+/* Join SSM (Source-Specific Multicast) group
+ */
+int ip_mc_join_group_ssm(struct sock *sk, struct ip_mreqn *imr,
+ unsigned int mode)
+{
+ return __ip_mc_join_group(sk, imr, mode);
+}
+
static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml,
struct in_device *in_dev)
{
+ struct ip_sf_socklist *psf = rtnl_dereference(iml->sflist);
int err;
- if (iml->sflist == 0) {
+ if (!psf) {
/* any-source empty exclude case */
return ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr,
iml->sfmode, 0, NULL, 0);
}
err = ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr,
- iml->sfmode, iml->sflist->sl_count,
- iml->sflist->sl_addr, 0);
- sock_kfree_s(sk, iml->sflist, IP_SFLSIZE(iml->sflist->sl_max));
- iml->sflist = NULL;
+ iml->sfmode, psf->sl_count, psf->sl_addr, 0);
+ RCU_INIT_POINTER(iml->sflist, NULL);
+ /* decrease mem now to avoid the memleak warning */
+ atomic_sub(struct_size(psf, sl_addr, psf->sl_max), &sk->sk_omem_alloc);
+ kfree_rcu(psf, rcu);
return err;
}
-/*
- * Ask a socket to leave a group.
- */
-
int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
{
struct inet_sock *inet = inet_sk(sk);
- struct ip_mc_socklist *iml, **imlp;
+ struct ip_mc_socklist *iml;
+ struct ip_mc_socklist __rcu **imlp;
struct in_device *in_dev;
+ struct net *net = sock_net(sk);
__be32 group = imr->imr_multiaddr.s_addr;
u32 ifindex;
int ret = -EADDRNOTAVAIL;
- rtnl_lock();
- in_dev = ip_mc_find_dev(imr);
+ ASSERT_RTNL();
+
+ in_dev = ip_mc_find_dev(net, imr);
+ if (!imr->imr_ifindex && !imr->imr_address.s_addr && !in_dev) {
+ ret = -ENODEV;
+ goto out;
+ }
ifindex = imr->imr_ifindex;
- for (imlp = &inet->mc_list; (iml = *imlp) != NULL; imlp = &iml->next) {
+ for (imlp = &inet->mc_list;
+ (iml = rtnl_dereference(*imlp)) != NULL;
+ imlp = &iml->next_rcu) {
if (iml->multi.imr_multiaddr.s_addr != group)
continue;
if (ifindex) {
@@ -1810,19 +2377,20 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
(void) ip_mc_leave_src(sk, iml, in_dev);
- *imlp = iml->next;
+ *imlp = iml->next_rcu;
if (in_dev)
ip_mc_dec_group(in_dev, group);
- rtnl_unlock();
- sock_kfree_s(sk, iml, sizeof(*iml));
+
+ /* decrease mem now to avoid the memleak warning */
+ atomic_sub(sizeof(*iml), &sk->sk_omem_alloc);
+ kfree_rcu(iml, rcu);
return 0;
}
- if (!in_dev)
- ret = -ENODEV;
- rtnl_unlock();
+out:
return ret;
}
+EXPORT_SYMBOL(ip_mc_leave_group);
int ip_mc_source(int add, int omode, struct sock *sk, struct
ip_mreq_source *mreqs, int ifindex)
@@ -1834,18 +2402,19 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
struct in_device *in_dev = NULL;
struct inet_sock *inet = inet_sk(sk);
struct ip_sf_socklist *psl;
+ struct net *net = sock_net(sk);
int leavegroup = 0;
int i, j, rv;
- if (!MULTICAST(addr))
+ if (!ipv4_is_multicast(addr))
return -EINVAL;
- rtnl_lock();
+ ASSERT_RTNL();
imr.imr_multiaddr.s_addr = mreqs->imr_multiaddr;
imr.imr_address.s_addr = mreqs->imr_interface;
imr.imr_ifindex = ifindex;
- in_dev = ip_mc_find_dev(&imr);
+ in_dev = ip_mc_find_dev(net, &imr);
if (!in_dev) {
err = -ENODEV;
@@ -1853,9 +2422,10 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
}
err = -EADDRNOTAVAIL;
- for (pmc=inet->mc_list; pmc; pmc=pmc->next) {
- if (pmc->multi.imr_multiaddr.s_addr == imr.imr_multiaddr.s_addr
- && pmc->multi.imr_ifindex == imr.imr_ifindex)
+ for_each_pmc_rtnl(inet, pmc) {
+ if ((pmc->multi.imr_multiaddr.s_addr ==
+ imr.imr_multiaddr.s_addr) &&
+ (pmc->multi.imr_ifindex == imr.imr_ifindex))
break;
}
if (!pmc) { /* must have a prior join */
@@ -1871,17 +2441,17 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
} else if (pmc->sfmode != omode) {
/* allow mode switches for empty-set filters */
ip_mc_add_src(in_dev, &mreqs->imr_multiaddr, omode, 0, NULL, 0);
- ip_mc_del_src(in_dev, &mreqs->imr_multiaddr, pmc->sfmode, 0,
+ ip_mc_del_src(in_dev, &mreqs->imr_multiaddr, pmc->sfmode, 0,
NULL, 0);
pmc->sfmode = omode;
}
- psl = pmc->sflist;
+ psl = rtnl_dereference(pmc->sflist);
if (!add) {
if (!psl)
goto done; /* err = -EADDRNOTAVAIL */
rv = !0;
- for (i=0; i<psl->sl_count; i++) {
+ for (i = 0; i < psl->sl_count; i++) {
rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr,
sizeof(__be32));
if (rv == 0)
@@ -1897,10 +2467,10 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
}
/* update the interface filter */
- ip_mc_del_src(in_dev, &mreqs->imr_multiaddr, omode, 1,
+ ip_mc_del_src(in_dev, &mreqs->imr_multiaddr, omode, 1,
&mreqs->imr_sourceaddr, 1);
- for (j=i+1; j<psl->sl_count; j++)
+ for (j = i+1; j < psl->sl_count; j++)
psl->sl_addr[j-1] = psl->sl_addr[j];
psl->sl_count--;
err = 0;
@@ -1908,7 +2478,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
}
/* else, add a new source to the filter */
- if (psl && psl->sl_count >= sysctl_igmp_max_msf) {
+ if (psl && psl->sl_count >= READ_ONCE(net->ipv4.sysctl_igmp_max_msf)) {
err = -ENOBUFS;
goto done;
}
@@ -1918,7 +2488,8 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
if (psl)
count += psl->sl_max;
- newpsl = sock_kmalloc(sk, IP_SFLSIZE(count), GFP_KERNEL);
+ newpsl = sock_kmalloc(sk, struct_size(newpsl, sl_addr, count),
+ GFP_KERNEL);
if (!newpsl) {
err = -ENOBUFS;
goto done;
@@ -1926,14 +2497,19 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
newpsl->sl_max = count;
newpsl->sl_count = count - IP_SFBLOCK;
if (psl) {
- for (i=0; i<psl->sl_count; i++)
+ for (i = 0; i < psl->sl_count; i++)
newpsl->sl_addr[i] = psl->sl_addr[i];
- sock_kfree_s(sk, psl, IP_SFLSIZE(psl->sl_max));
+ /* decrease mem now to avoid the memleak warning */
+ atomic_sub(struct_size(psl, sl_addr, psl->sl_max),
+ &sk->sk_omem_alloc);
}
- pmc->sflist = psl = newpsl;
+ rcu_assign_pointer(pmc->sflist, newpsl);
+ if (psl)
+ kfree_rcu(psl, rcu);
+ psl = newpsl;
}
rv = 1; /* > 0 for insert logic below if sl_count is 0 */
- for (i=0; i<psl->sl_count; i++) {
+ for (i = 0; i < psl->sl_count; i++) {
rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr,
sizeof(__be32));
if (rv == 0)
@@ -1941,18 +2517,17 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
}
if (rv == 0) /* address already there is an error */
goto done;
- for (j=psl->sl_count-1; j>=i; j--)
+ for (j = psl->sl_count-1; j >= i; j--)
psl->sl_addr[j+1] = psl->sl_addr[j];
psl->sl_addr[i] = mreqs->imr_sourceaddr;
psl->sl_count++;
err = 0;
/* update the interface list */
- ip_mc_add_src(in_dev, &mreqs->imr_multiaddr, omode, 1,
+ ip_mc_add_src(in_dev, &mreqs->imr_multiaddr, omode, 1,
&mreqs->imr_sourceaddr, 1);
done:
- rtnl_unlock();
if (leavegroup)
- return ip_mc_leave_group(sk, &imr);
+ err = ip_mc_leave_group(sk, &imr);
return err;
}
@@ -1965,20 +2540,21 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
struct in_device *in_dev;
struct inet_sock *inet = inet_sk(sk);
struct ip_sf_socklist *newpsl, *psl;
+ struct net *net = sock_net(sk);
int leavegroup = 0;
- if (!MULTICAST(addr))
+ if (!ipv4_is_multicast(addr))
return -EINVAL;
if (msf->imsf_fmode != MCAST_INCLUDE &&
msf->imsf_fmode != MCAST_EXCLUDE)
return -EINVAL;
- rtnl_lock();
+ ASSERT_RTNL();
imr.imr_multiaddr.s_addr = msf->imsf_multiaddr;
imr.imr_address.s_addr = msf->imsf_interface;
imr.imr_ifindex = ifindex;
- in_dev = ip_mc_find_dev(&imr);
+ in_dev = ip_mc_find_dev(net, &imr);
if (!in_dev) {
err = -ENODEV;
@@ -1991,7 +2567,7 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
goto done;
}
- for (pmc=inet->mc_list; pmc; pmc=pmc->next) {
+ for_each_pmc_rtnl(inet, pmc) {
if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr &&
pmc->multi.imr_ifindex == imr.imr_ifindex)
break;
@@ -2001,19 +2577,22 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
goto done;
}
if (msf->imsf_numsrc) {
- newpsl = sock_kmalloc(sk, IP_SFLSIZE(msf->imsf_numsrc),
- GFP_KERNEL);
+ newpsl = sock_kmalloc(sk, struct_size(newpsl, sl_addr,
+ msf->imsf_numsrc),
+ GFP_KERNEL);
if (!newpsl) {
err = -ENOBUFS;
goto done;
}
newpsl->sl_max = newpsl->sl_count = msf->imsf_numsrc;
- memcpy(newpsl->sl_addr, msf->imsf_slist,
- msf->imsf_numsrc * sizeof(msf->imsf_slist[0]));
+ memcpy(newpsl->sl_addr, msf->imsf_slist_flex,
+ flex_array_size(msf, imsf_slist_flex, msf->imsf_numsrc));
err = ip_mc_add_src(in_dev, &msf->imsf_multiaddr,
msf->imsf_fmode, newpsl->sl_count, newpsl->sl_addr, 0);
if (err) {
- sock_kfree_s(sk, newpsl, IP_SFLSIZE(newpsl->sl_max));
+ sock_kfree_s(sk, newpsl,
+ struct_size(newpsl, sl_addr,
+ newpsl->sl_max));
goto done;
}
} else {
@@ -2021,44 +2600,48 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
(void) ip_mc_add_src(in_dev, &msf->imsf_multiaddr,
msf->imsf_fmode, 0, NULL, 0);
}
- psl = pmc->sflist;
+ psl = rtnl_dereference(pmc->sflist);
if (psl) {
(void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode,
psl->sl_count, psl->sl_addr, 0);
- sock_kfree_s(sk, psl, IP_SFLSIZE(psl->sl_max));
- } else
+ /* decrease mem now to avoid the memleak warning */
+ atomic_sub(struct_size(psl, sl_addr, psl->sl_max),
+ &sk->sk_omem_alloc);
+ } else {
(void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode,
0, NULL, 0);
- pmc->sflist = newpsl;
+ }
+ rcu_assign_pointer(pmc->sflist, newpsl);
+ if (psl)
+ kfree_rcu(psl, rcu);
pmc->sfmode = msf->imsf_fmode;
err = 0;
done:
- rtnl_unlock();
if (leavegroup)
err = ip_mc_leave_group(sk, &imr);
return err;
}
-
int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf,
- struct ip_msfilter __user *optval, int __user *optlen)
+ sockptr_t optval, sockptr_t optlen)
{
- int err, len, count, copycount;
+ int err, len, count, copycount, msf_size;
struct ip_mreqn imr;
__be32 addr = msf->imsf_multiaddr;
struct ip_mc_socklist *pmc;
struct in_device *in_dev;
struct inet_sock *inet = inet_sk(sk);
struct ip_sf_socklist *psl;
+ struct net *net = sock_net(sk);
- if (!MULTICAST(addr))
- return -EINVAL;
+ ASSERT_RTNL();
- rtnl_lock();
+ if (!ipv4_is_multicast(addr))
+ return -EINVAL;
imr.imr_multiaddr.s_addr = msf->imsf_multiaddr;
imr.imr_address.s_addr = msf->imsf_interface;
imr.imr_ifindex = 0;
- in_dev = ip_mc_find_dev(&imr);
+ in_dev = ip_mc_find_dev(net, &imr);
if (!in_dev) {
err = -ENODEV;
@@ -2066,7 +2649,7 @@ int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf,
}
err = -EADDRNOTAVAIL;
- for (pmc=inet->mc_list; pmc; pmc=pmc->next) {
+ for_each_pmc_rtnl(inet, pmc) {
if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr &&
pmc->multi.imr_ifindex == imr.imr_ifindex)
break;
@@ -2074,118 +2657,121 @@ int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf,
if (!pmc) /* must have a prior join */
goto done;
msf->imsf_fmode = pmc->sfmode;
- psl = pmc->sflist;
- rtnl_unlock();
+ psl = rtnl_dereference(pmc->sflist);
if (!psl) {
- len = 0;
count = 0;
} else {
count = psl->sl_count;
}
copycount = count < msf->imsf_numsrc ? count : msf->imsf_numsrc;
- len = copycount * sizeof(psl->sl_addr[0]);
+ len = flex_array_size(psl, sl_addr, copycount);
msf->imsf_numsrc = count;
- if (put_user(IP_MSFILTER_SIZE(copycount), optlen) ||
- copy_to_user(optval, msf, IP_MSFILTER_SIZE(0))) {
+ msf_size = IP_MSFILTER_SIZE(copycount);
+ if (copy_to_sockptr(optlen, &msf_size, sizeof(int)) ||
+ copy_to_sockptr(optval, msf, IP_MSFILTER_SIZE(0))) {
return -EFAULT;
}
if (len &&
- copy_to_user(&optval->imsf_slist[0], psl->sl_addr, len))
+ copy_to_sockptr_offset(optval,
+ offsetof(struct ip_msfilter, imsf_slist_flex),
+ psl->sl_addr, len))
return -EFAULT;
return 0;
done:
- rtnl_unlock();
return err;
}
int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,
- struct group_filter __user *optval, int __user *optlen)
+ sockptr_t optval, size_t ss_offset)
{
- int err, i, count, copycount;
+ int i, count, copycount;
struct sockaddr_in *psin;
__be32 addr;
struct ip_mc_socklist *pmc;
struct inet_sock *inet = inet_sk(sk);
struct ip_sf_socklist *psl;
+ ASSERT_RTNL();
+
psin = (struct sockaddr_in *)&gsf->gf_group;
if (psin->sin_family != AF_INET)
return -EINVAL;
addr = psin->sin_addr.s_addr;
- if (!MULTICAST(addr))
+ if (!ipv4_is_multicast(addr))
return -EINVAL;
- rtnl_lock();
-
- err = -EADDRNOTAVAIL;
-
- for (pmc=inet->mc_list; pmc; pmc=pmc->next) {
+ for_each_pmc_rtnl(inet, pmc) {
if (pmc->multi.imr_multiaddr.s_addr == addr &&
pmc->multi.imr_ifindex == gsf->gf_interface)
break;
}
if (!pmc) /* must have a prior join */
- goto done;
+ return -EADDRNOTAVAIL;
gsf->gf_fmode = pmc->sfmode;
- psl = pmc->sflist;
- rtnl_unlock();
+ psl = rtnl_dereference(pmc->sflist);
count = psl ? psl->sl_count : 0;
copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc;
gsf->gf_numsrc = count;
- if (put_user(GROUP_FILTER_SIZE(copycount), optlen) ||
- copy_to_user(optval, gsf, GROUP_FILTER_SIZE(0))) {
- return -EFAULT;
- }
- for (i=0; i<copycount; i++) {
- struct sockaddr_in *psin;
+ for (i = 0; i < copycount; i++) {
struct sockaddr_storage ss;
psin = (struct sockaddr_in *)&ss;
memset(&ss, 0, sizeof(ss));
psin->sin_family = AF_INET;
psin->sin_addr.s_addr = psl->sl_addr[i];
- if (copy_to_user(&optval->gf_slist[i], &ss, sizeof(ss)))
+ if (copy_to_sockptr_offset(optval, ss_offset,
+ &ss, sizeof(ss)))
return -EFAULT;
+ ss_offset += sizeof(ss);
}
return 0;
-done:
- rtnl_unlock();
- return err;
}
/*
* check if a multicast source filter allows delivery for a given <src,dst,intf>
*/
-int ip_mc_sf_allow(struct sock *sk, __be32 loc_addr, __be32 rmt_addr, int dif)
+int ip_mc_sf_allow(const struct sock *sk, __be32 loc_addr, __be32 rmt_addr,
+ int dif, int sdif)
{
- struct inet_sock *inet = inet_sk(sk);
+ const struct inet_sock *inet = inet_sk(sk);
struct ip_mc_socklist *pmc;
struct ip_sf_socklist *psl;
int i;
+ int ret;
- if (!MULTICAST(loc_addr))
- return 1;
+ ret = 1;
+ if (!ipv4_is_multicast(loc_addr))
+ goto out;
- for (pmc=inet->mc_list; pmc; pmc=pmc->next) {
+ rcu_read_lock();
+ for_each_pmc_rcu(inet, pmc) {
if (pmc->multi.imr_multiaddr.s_addr == loc_addr &&
- pmc->multi.imr_ifindex == dif)
+ (pmc->multi.imr_ifindex == dif ||
+ (sdif && pmc->multi.imr_ifindex == sdif)))
break;
}
+ ret = inet_test_bit(MC_ALL, sk);
if (!pmc)
- return 1;
- psl = pmc->sflist;
+ goto unlock;
+ psl = rcu_dereference(pmc->sflist);
+ ret = (pmc->sfmode == MCAST_EXCLUDE);
if (!psl)
- return pmc->sfmode == MCAST_EXCLUDE;
+ goto unlock;
- for (i=0; i<psl->sl_count; i++) {
+ for (i = 0; i < psl->sl_count; i++) {
if (psl->sl_addr[i] == rmt_addr)
break;
}
+ ret = 0;
if (pmc->sfmode == MCAST_INCLUDE && i >= psl->sl_count)
- return 0;
+ goto unlock;
if (pmc->sfmode == MCAST_EXCLUDE && i < psl->sl_count)
- return 0;
- return 1;
+ goto unlock;
+ ret = 1;
+unlock:
+ rcu_read_unlock();
+out:
+ return ret;
}
/*
@@ -2196,42 +2782,57 @@ void ip_mc_drop_socket(struct sock *sk)
{
struct inet_sock *inet = inet_sk(sk);
struct ip_mc_socklist *iml;
+ struct net *net = sock_net(sk);
- if (inet->mc_list == NULL)
+ if (!inet->mc_list)
return;
rtnl_lock();
- while ((iml = inet->mc_list) != NULL) {
+ while ((iml = rtnl_dereference(inet->mc_list)) != NULL) {
struct in_device *in_dev;
- inet->mc_list = iml->next;
- in_dev = inetdev_by_index(iml->multi.imr_ifindex);
+ inet->mc_list = iml->next_rcu;
+ in_dev = inetdev_by_index(net, iml->multi.imr_ifindex);
(void) ip_mc_leave_src(sk, iml, in_dev);
- if (in_dev != NULL) {
+ if (in_dev)
ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr);
- in_dev_put(in_dev);
- }
- sock_kfree_s(sk, iml, sizeof(*iml));
+ /* decrease mem now to avoid the memleak warning */
+ atomic_sub(sizeof(*iml), &sk->sk_omem_alloc);
+ kfree_rcu(iml, rcu);
}
rtnl_unlock();
}
-int ip_check_mc(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u16 proto)
+/* called with rcu_read_lock() */
+int ip_check_mc_rcu(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u8 proto)
{
struct ip_mc_list *im;
+ struct ip_mc_list __rcu **mc_hash;
struct ip_sf_list *psf;
int rv = 0;
- read_lock(&in_dev->mc_list_lock);
- for (im=in_dev->mc_list; im; im=im->next) {
- if (im->multiaddr == mc_addr)
- break;
+ mc_hash = rcu_dereference(in_dev->mc_hash);
+ if (mc_hash) {
+ u32 hash = hash_32((__force u32)mc_addr, MC_HASH_SZ_LOG);
+
+ for (im = rcu_dereference(mc_hash[hash]);
+ im != NULL;
+ im = rcu_dereference(im->next_hash)) {
+ if (im->multiaddr == mc_addr)
+ break;
+ }
+ } else {
+ for_each_pmc_rcu(in_dev, im) {
+ if (im->multiaddr == mc_addr)
+ break;
+ }
}
if (im && proto == IPPROTO_IGMP) {
rv = 1;
} else if (im) {
if (src_addr) {
- for (psf=im->sources; psf; psf=psf->sf_next) {
+ spin_lock_bh(&im->lock);
+ for (psf = im->sources; psf; psf = psf->sf_next) {
if (psf->sf_inaddr == src_addr)
break;
}
@@ -2241,15 +2842,16 @@ int ip_check_mc(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u16 p
im->sfcount[MCAST_EXCLUDE];
else
rv = im->sfcount[MCAST_EXCLUDE] != 0;
+ spin_unlock_bh(&im->lock);
} else
rv = 1; /* unspecified source; tentatively allow */
}
- read_unlock(&in_dev->mc_list_lock);
return rv;
}
#if defined(CONFIG_PROC_FS)
struct igmp_mc_iter_state {
+ struct seq_net_private p;
struct net_device *dev;
struct in_device *in_dev;
};
@@ -2258,24 +2860,22 @@ struct igmp_mc_iter_state {
static inline struct ip_mc_list *igmp_mc_get_first(struct seq_file *seq)
{
+ struct net *net = seq_file_net(seq);
struct ip_mc_list *im = NULL;
struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
- for (state->dev = dev_base, state->in_dev = NULL;
- state->dev;
- state->dev = state->dev->next) {
+ state->in_dev = NULL;
+ for_each_netdev_rcu(net, state->dev) {
struct in_device *in_dev;
- in_dev = in_dev_get(state->dev);
+
+ in_dev = __in_dev_get_rcu(state->dev);
if (!in_dev)
continue;
- read_lock(&in_dev->mc_list_lock);
- im = in_dev->mc_list;
+ im = rcu_dereference(in_dev->mc_list);
if (im) {
state->in_dev = in_dev;
break;
}
- read_unlock(&in_dev->mc_list_lock);
- in_dev_put(in_dev);
}
return im;
}
@@ -2283,22 +2883,18 @@ static inline struct ip_mc_list *igmp_mc_get_first(struct seq_file *seq)
static struct ip_mc_list *igmp_mc_get_next(struct seq_file *seq, struct ip_mc_list *im)
{
struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
- im = im->next;
+
+ im = rcu_dereference(im->next_rcu);
while (!im) {
- if (likely(state->in_dev != NULL)) {
- read_unlock(&state->in_dev->mc_list_lock);
- in_dev_put(state->in_dev);
- }
- state->dev = state->dev->next;
+ state->dev = next_net_device_rcu(state->dev);
if (!state->dev) {
state->in_dev = NULL;
break;
}
- state->in_dev = in_dev_get(state->dev);
+ state->in_dev = __in_dev_get_rcu(state->dev);
if (!state->in_dev)
continue;
- read_lock(&state->in_dev->mc_list_lock);
- im = state->in_dev->mc_list;
+ im = rcu_dereference(state->in_dev->mc_list);
}
return im;
}
@@ -2313,8 +2909,9 @@ static struct ip_mc_list *igmp_mc_get_idx(struct seq_file *seq, loff_t pos)
}
static void *igmp_mc_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(rcu)
{
- read_lock(&dev_base_lock);
+ rcu_read_lock();
return *pos ? igmp_mc_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}
@@ -2330,26 +2927,26 @@ static void *igmp_mc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
}
static void igmp_mc_seq_stop(struct seq_file *seq, void *v)
+ __releases(rcu)
{
struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
- if (likely(state->in_dev != NULL)) {
- read_unlock(&state->in_dev->mc_list_lock);
- in_dev_put(state->in_dev);
- state->in_dev = NULL;
- }
+
+ state->in_dev = NULL;
state->dev = NULL;
- read_unlock(&dev_base_lock);
+ rcu_read_unlock();
}
static int igmp_mc_seq_show(struct seq_file *seq, void *v)
{
if (v == SEQ_START_TOKEN)
- seq_puts(seq,
+ seq_puts(seq,
"Idx\tDevice : Count Querier\tGroup Users Timer\tReporter\n");
else {
- struct ip_mc_list *im = (struct ip_mc_list *)v;
+ struct ip_mc_list *im = v;
struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
char *querier;
+ long delta;
+
#ifdef CONFIG_IP_MULTICAST
querier = IGMP_V1_SEEN(state->in_dev) ? "V1" :
IGMP_V2_SEEN(state->in_dev) ? "V2" :
@@ -2358,58 +2955,31 @@ static int igmp_mc_seq_show(struct seq_file *seq, void *v)
querier = "NONE";
#endif
- if (state->in_dev->mc_list == im) {
+ if (rcu_access_pointer(state->in_dev->mc_list) == im) {
seq_printf(seq, "%d\t%-10s: %5d %7s\n",
- state->dev->ifindex, state->dev->name, state->dev->mc_count, querier);
+ state->dev->ifindex, state->dev->name, state->in_dev->mc_count, querier);
}
+ delta = im->timer.expires - jiffies;
seq_printf(seq,
"\t\t\t\t%08X %5d %d:%08lX\t\t%d\n",
im->multiaddr, im->users,
- im->tm_running, im->tm_running ?
- jiffies_to_clock_t(im->timer.expires-jiffies) : 0,
+ im->tm_running,
+ im->tm_running ? jiffies_delta_to_clock_t(delta) : 0,
im->reporter);
}
return 0;
}
-static struct seq_operations igmp_mc_seq_ops = {
+static const struct seq_operations igmp_mc_seq_ops = {
.start = igmp_mc_seq_start,
.next = igmp_mc_seq_next,
.stop = igmp_mc_seq_stop,
.show = igmp_mc_seq_show,
};
-static int igmp_mc_seq_open(struct inode *inode, struct file *file)
-{
- struct seq_file *seq;
- int rc = -ENOMEM;
- struct igmp_mc_iter_state *s = kzalloc(sizeof(*s), GFP_KERNEL);
-
- if (!s)
- goto out;
- rc = seq_open(file, &igmp_mc_seq_ops);
- if (rc)
- goto out_kfree;
-
- seq = file->private_data;
- seq->private = s;
-out:
- return rc;
-out_kfree:
- kfree(s);
- goto out;
-}
-
-static struct file_operations igmp_mc_seq_fops = {
- .owner = THIS_MODULE,
- .open = igmp_mc_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release_private,
-};
-
struct igmp_mcf_iter_state {
+ struct seq_net_private p;
struct net_device *dev;
struct in_device *idev;
struct ip_mc_list *im;
@@ -2419,31 +2989,29 @@ struct igmp_mcf_iter_state {
static inline struct ip_sf_list *igmp_mcf_get_first(struct seq_file *seq)
{
+ struct net *net = seq_file_net(seq);
struct ip_sf_list *psf = NULL;
struct ip_mc_list *im = NULL;
struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq);
- for (state->dev = dev_base, state->idev = NULL, state->im = NULL;
- state->dev;
- state->dev = state->dev->next) {
+ state->idev = NULL;
+ state->im = NULL;
+ for_each_netdev_rcu(net, state->dev) {
struct in_device *idev;
- idev = in_dev_get(state->dev);
- if (unlikely(idev == NULL))
+ idev = __in_dev_get_rcu(state->dev);
+ if (unlikely(!idev))
continue;
- read_lock(&idev->mc_list_lock);
- im = idev->mc_list;
- if (likely(im != NULL)) {
+ im = rcu_dereference(idev->mc_list);
+ if (likely(im)) {
spin_lock_bh(&im->lock);
psf = im->sources;
- if (likely(psf != NULL)) {
+ if (likely(psf)) {
state->im = im;
state->idev = idev;
break;
}
spin_unlock_bh(&im->lock);
}
- read_unlock(&idev->mc_list_lock);
- in_dev_put(idev);
}
return psf;
}
@@ -2457,23 +3025,16 @@ static struct ip_sf_list *igmp_mcf_get_next(struct seq_file *seq, struct ip_sf_l
spin_unlock_bh(&state->im->lock);
state->im = state->im->next;
while (!state->im) {
- if (likely(state->idev != NULL)) {
- read_unlock(&state->idev->mc_list_lock);
- in_dev_put(state->idev);
- }
- state->dev = state->dev->next;
+ state->dev = next_net_device_rcu(state->dev);
if (!state->dev) {
state->idev = NULL;
goto out;
}
- state->idev = in_dev_get(state->dev);
+ state->idev = __in_dev_get_rcu(state->dev);
if (!state->idev)
continue;
- read_lock(&state->idev->mc_list_lock);
- state->im = state->idev->mc_list;
+ state->im = rcu_dereference(state->idev->mc_list);
}
- if (!state->im)
- break;
spin_lock_bh(&state->im->lock);
psf = state->im->sources;
}
@@ -2491,8 +3052,9 @@ static struct ip_sf_list *igmp_mcf_get_idx(struct seq_file *seq, loff_t pos)
}
static void *igmp_mcf_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(rcu)
{
- read_lock(&dev_base_lock);
+ rcu_read_lock();
return *pos ? igmp_mcf_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}
@@ -2508,37 +3070,30 @@ static void *igmp_mcf_seq_next(struct seq_file *seq, void *v, loff_t *pos)
}
static void igmp_mcf_seq_stop(struct seq_file *seq, void *v)
+ __releases(rcu)
{
struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq);
- if (likely(state->im != NULL)) {
+ if (likely(state->im)) {
spin_unlock_bh(&state->im->lock);
state->im = NULL;
}
- if (likely(state->idev != NULL)) {
- read_unlock(&state->idev->mc_list_lock);
- in_dev_put(state->idev);
- state->idev = NULL;
- }
+ state->idev = NULL;
state->dev = NULL;
- read_unlock(&dev_base_lock);
+ rcu_read_unlock();
}
static int igmp_mcf_seq_show(struct seq_file *seq, void *v)
{
- struct ip_sf_list *psf = (struct ip_sf_list *)v;
+ struct ip_sf_list *psf = v;
struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq);
if (v == SEQ_START_TOKEN) {
- seq_printf(seq,
- "%3s %6s "
- "%10s %10s %6s %6s\n", "Idx",
- "Device", "MCA",
- "SRC", "INC", "EXC");
+ seq_puts(seq, "Idx Device MCA SRC INC EXC\n");
} else {
seq_printf(seq,
"%3d %6.6s 0x%08x "
- "0x%08x %6lu %6lu\n",
- state->dev->ifindex, state->dev->name,
+ "0x%08x %6lu %6lu\n",
+ state->dev->ifindex, state->dev->name,
ntohl(state->im->multiaddr),
ntohl(psf->sf_inaddr),
psf->sf_count[MCAST_INCLUDE],
@@ -2547,50 +3102,96 @@ static int igmp_mcf_seq_show(struct seq_file *seq, void *v)
return 0;
}
-static struct seq_operations igmp_mcf_seq_ops = {
+static const struct seq_operations igmp_mcf_seq_ops = {
.start = igmp_mcf_seq_start,
.next = igmp_mcf_seq_next,
.stop = igmp_mcf_seq_stop,
.show = igmp_mcf_seq_show,
};
-static int igmp_mcf_seq_open(struct inode *inode, struct file *file)
+static int __net_init igmp_net_init(struct net *net)
{
- struct seq_file *seq;
- int rc = -ENOMEM;
- struct igmp_mcf_iter_state *s = kzalloc(sizeof(*s), GFP_KERNEL);
+ struct proc_dir_entry *pde;
+ int err;
- if (!s)
- goto out;
- rc = seq_open(file, &igmp_mcf_seq_ops);
- if (rc)
- goto out_kfree;
+ pde = proc_create_net("igmp", 0444, net->proc_net, &igmp_mc_seq_ops,
+ sizeof(struct igmp_mc_iter_state));
+ if (!pde)
+ goto out_igmp;
+ pde = proc_create_net("mcfilter", 0444, net->proc_net,
+ &igmp_mcf_seq_ops, sizeof(struct igmp_mcf_iter_state));
+ if (!pde)
+ goto out_mcfilter;
+ err = inet_ctl_sock_create(&net->ipv4.mc_autojoin_sk, AF_INET,
+ SOCK_DGRAM, 0, net);
+ if (err < 0) {
+ pr_err("Failed to initialize the IGMP autojoin socket (err %d)\n",
+ err);
+ goto out_sock;
+ }
- seq = file->private_data;
- seq->private = s;
-out:
- return rc;
-out_kfree:
- kfree(s);
- goto out;
-}
-
-static struct file_operations igmp_mcf_seq_fops = {
- .owner = THIS_MODULE,
- .open = igmp_mcf_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release_private,
-};
+ return 0;
-int __init igmp_mc_proc_init(void)
+out_sock:
+ remove_proc_entry("mcfilter", net->proc_net);
+out_mcfilter:
+ remove_proc_entry("igmp", net->proc_net);
+out_igmp:
+ return -ENOMEM;
+}
+
+static void __net_exit igmp_net_exit(struct net *net)
{
- proc_net_fops_create("igmp", S_IRUGO, &igmp_mc_seq_fops);
- proc_net_fops_create("mcfilter", S_IRUGO, &igmp_mcf_seq_fops);
- return 0;
+ remove_proc_entry("mcfilter", net->proc_net);
+ remove_proc_entry("igmp", net->proc_net);
+ inet_ctl_sock_destroy(net->ipv4.mc_autojoin_sk);
}
+
+static struct pernet_operations igmp_net_ops = {
+ .init = igmp_net_init,
+ .exit = igmp_net_exit,
+};
#endif
-EXPORT_SYMBOL(ip_mc_dec_group);
-EXPORT_SYMBOL(ip_mc_inc_group);
-EXPORT_SYMBOL(ip_mc_join_group);
+static int igmp_netdev_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct in_device *in_dev;
+
+ switch (event) {
+ case NETDEV_RESEND_IGMP:
+ in_dev = __in_dev_get_rtnl(dev);
+ if (in_dev)
+ ip_mc_rejoin_groups(in_dev);
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block igmp_notifier = {
+ .notifier_call = igmp_netdev_event,
+};
+
+int __init igmp_mc_init(void)
+{
+#if defined(CONFIG_PROC_FS)
+ int err;
+
+ err = register_pernet_subsys(&igmp_net_ops);
+ if (err)
+ return err;
+ err = register_netdevice_notifier(&igmp_notifier);
+ if (err)
+ goto reg_notif_fail;
+ return 0;
+
+reg_notif_fail:
+ unregister_pernet_subsys(&igmp_net_ops);
+ return err;
+#else
+ return register_netdevice_notifier(&igmp_notifier);
+#endif
+}
diff --git a/net/ipv4/igmp_internal.h b/net/ipv4/igmp_internal.h
new file mode 100644
index 000000000000..0a1bcc8ec8e1
--- /dev/null
+++ b/net/ipv4/igmp_internal.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _LINUX_IGMP_INTERNAL_H
+#define _LINUX_IGMP_INTERNAL_H
+
+struct inet_fill_args {
+ u32 portid;
+ u32 seq;
+ int event;
+ unsigned int flags;
+ int netnsid;
+ int ifindex;
+};
+
+int inet_fill_ifmcaddr(struct sk_buff *skb, struct net_device *dev,
+ const struct ip_mc_list *im,
+ struct inet_fill_args *args);
+#endif
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 9d68837888d3..97d57c52b9ad 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -6,11 +7,6 @@
* Support for INET connection oriented protocols.
*
* Authors: See the TCP sources
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or(at your option) any later version.
*/
#include <linux/module.h>
@@ -23,141 +19,593 @@
#include <net/route.h>
#include <net/tcp_states.h>
#include <net/xfrm.h>
+#include <net/tcp.h>
+#include <net/sock_reuseport.h>
+#include <net/addrconf.h>
+
+#if IS_ENABLED(CONFIG_IPV6)
+/* match_sk*_wildcard == true: IPV6_ADDR_ANY equals to any IPv6 addresses
+ * if IPv6 only, and any IPv4 addresses
+ * if not IPv6 only
+ * match_sk*_wildcard == false: addresses must be exactly the same, i.e.
+ * IPV6_ADDR_ANY only equals to IPV6_ADDR_ANY,
+ * and 0.0.0.0 equals to 0.0.0.0 only
+ */
+static bool ipv6_rcv_saddr_equal(const struct in6_addr *sk1_rcv_saddr6,
+ const struct in6_addr *sk2_rcv_saddr6,
+ __be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr,
+ bool sk1_ipv6only, bool sk2_ipv6only,
+ bool match_sk1_wildcard,
+ bool match_sk2_wildcard)
+{
+ int addr_type = ipv6_addr_type(sk1_rcv_saddr6);
+ int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED;
+
+ /* if both are mapped, treat as IPv4 */
+ if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED) {
+ if (!sk2_ipv6only) {
+ if (sk1_rcv_saddr == sk2_rcv_saddr)
+ return true;
+ return (match_sk1_wildcard && !sk1_rcv_saddr) ||
+ (match_sk2_wildcard && !sk2_rcv_saddr);
+ }
+ return false;
+ }
+
+ if (addr_type == IPV6_ADDR_ANY && addr_type2 == IPV6_ADDR_ANY)
+ return true;
-#ifdef INET_CSK_DEBUG
-const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
-EXPORT_SYMBOL(inet_csk_timer_bug_msg);
+ if (addr_type2 == IPV6_ADDR_ANY && match_sk2_wildcard &&
+ !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED))
+ return true;
+
+ if (addr_type == IPV6_ADDR_ANY && match_sk1_wildcard &&
+ !(sk1_ipv6only && addr_type2 == IPV6_ADDR_MAPPED))
+ return true;
+
+ if (sk2_rcv_saddr6 &&
+ ipv6_addr_equal(sk1_rcv_saddr6, sk2_rcv_saddr6))
+ return true;
+
+ return false;
+}
#endif
-/*
- * This array holds the first and last local port number.
- * For high-usage systems, use sysctl to change this to
- * 32768-61000
+/* match_sk*_wildcard == true: 0.0.0.0 equals to any IPv4 addresses
+ * match_sk*_wildcard == false: addresses must be exactly the same, i.e.
+ * 0.0.0.0 only equals to 0.0.0.0
*/
-int sysctl_local_port_range[2] = { 1024, 4999 };
+static bool ipv4_rcv_saddr_equal(__be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr,
+ bool sk2_ipv6only, bool match_sk1_wildcard,
+ bool match_sk2_wildcard)
+{
+ if (!sk2_ipv6only) {
+ if (sk1_rcv_saddr == sk2_rcv_saddr)
+ return true;
+ return (match_sk1_wildcard && !sk1_rcv_saddr) ||
+ (match_sk2_wildcard && !sk2_rcv_saddr);
+ }
+ return false;
+}
-int inet_csk_bind_conflict(const struct sock *sk,
- const struct inet_bind_bucket *tb)
+bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
+ bool match_wildcard)
{
- const __be32 sk_rcv_saddr = inet_rcv_saddr(sk);
- struct sock *sk2;
- struct hlist_node *node;
- int reuse = sk->sk_reuse;
-
- sk_for_each_bound(sk2, node, &tb->owners) {
- if (sk != sk2 &&
- !inet_v6_ipv6only(sk2) &&
- (!sk->sk_bound_dev_if ||
- !sk2->sk_bound_dev_if ||
- sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
- if (!reuse || !sk2->sk_reuse ||
- sk2->sk_state == TCP_LISTEN) {
- const __be32 sk2_rcv_saddr = inet_rcv_saddr(sk2);
- if (!sk2_rcv_saddr || !sk_rcv_saddr ||
- sk2_rcv_saddr == sk_rcv_saddr)
- break;
- }
+#if IS_ENABLED(CONFIG_IPV6)
+ if (sk->sk_family == AF_INET6)
+ return ipv6_rcv_saddr_equal(&sk->sk_v6_rcv_saddr,
+ inet6_rcv_saddr(sk2),
+ sk->sk_rcv_saddr,
+ sk2->sk_rcv_saddr,
+ ipv6_only_sock(sk),
+ ipv6_only_sock(sk2),
+ match_wildcard,
+ match_wildcard);
+#endif
+ return ipv4_rcv_saddr_equal(sk->sk_rcv_saddr, sk2->sk_rcv_saddr,
+ ipv6_only_sock(sk2), match_wildcard,
+ match_wildcard);
+}
+EXPORT_SYMBOL(inet_rcv_saddr_equal);
+
+bool inet_rcv_saddr_any(const struct sock *sk)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ if (sk->sk_family == AF_INET6)
+ return ipv6_addr_any(&sk->sk_v6_rcv_saddr);
+#endif
+ return !sk->sk_rcv_saddr;
+}
+
+/**
+ * inet_sk_get_local_port_range - fetch ephemeral ports range
+ * @sk: socket
+ * @low: pointer to low port
+ * @high: pointer to high port
+ *
+ * Fetch netns port range (/proc/sys/net/ipv4/ip_local_port_range)
+ * Range can be overridden if socket got IP_LOCAL_PORT_RANGE option.
+ * Returns true if IP_LOCAL_PORT_RANGE was set on this socket.
+ */
+bool inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high)
+{
+ int lo, hi, sk_lo, sk_hi;
+ bool local_range = false;
+ u32 sk_range;
+
+ inet_get_local_port_range(sock_net(sk), &lo, &hi);
+
+ sk_range = READ_ONCE(inet_sk(sk)->local_port_range);
+ if (unlikely(sk_range)) {
+ sk_lo = sk_range & 0xffff;
+ sk_hi = sk_range >> 16;
+
+ if (lo <= sk_lo && sk_lo <= hi)
+ lo = sk_lo;
+ if (lo <= sk_hi && sk_hi <= hi)
+ hi = sk_hi;
+ local_range = true;
+ }
+
+ *low = lo;
+ *high = hi;
+ return local_range;
+}
+EXPORT_SYMBOL(inet_sk_get_local_port_range);
+
+static bool inet_use_bhash2_on_bind(const struct sock *sk)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ if (sk->sk_family == AF_INET6) {
+ if (ipv6_addr_any(&sk->sk_v6_rcv_saddr))
+ return false;
+
+ if (!ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr))
+ return true;
+ }
+#endif
+ return sk->sk_rcv_saddr != htonl(INADDR_ANY);
+}
+
+static bool inet_bind_conflict(const struct sock *sk, struct sock *sk2,
+ kuid_t uid, bool relax,
+ bool reuseport_cb_ok, bool reuseport_ok)
+{
+ int bound_dev_if2;
+
+ if (sk == sk2)
+ return false;
+
+ bound_dev_if2 = READ_ONCE(sk2->sk_bound_dev_if);
+
+ if (!sk->sk_bound_dev_if || !bound_dev_if2 ||
+ sk->sk_bound_dev_if == bound_dev_if2) {
+ if (sk->sk_reuse && sk2->sk_reuse &&
+ sk2->sk_state != TCP_LISTEN) {
+ if (!relax || (!reuseport_ok && sk->sk_reuseport &&
+ sk2->sk_reuseport && reuseport_cb_ok &&
+ (sk2->sk_state == TCP_TIME_WAIT ||
+ uid_eq(uid, sk_uid(sk2)))))
+ return true;
+ } else if (!reuseport_ok || !sk->sk_reuseport ||
+ !sk2->sk_reuseport || !reuseport_cb_ok ||
+ (sk2->sk_state != TCP_TIME_WAIT &&
+ !uid_eq(uid, sk_uid(sk2)))) {
+ return true;
}
}
- return node != NULL;
+ return false;
}
-EXPORT_SYMBOL_GPL(inet_csk_bind_conflict);
+static bool __inet_bhash2_conflict(const struct sock *sk, struct sock *sk2,
+ kuid_t uid, bool relax,
+ bool reuseport_cb_ok, bool reuseport_ok)
+{
+ if (ipv6_only_sock(sk2)) {
+ if (sk->sk_family == AF_INET)
+ return false;
+
+#if IS_ENABLED(CONFIG_IPV6)
+ if (ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr))
+ return false;
+#endif
+ }
-/* Obtain a reference to a local port for the given sock,
- * if snum is zero it means select any available local port.
+ return inet_bind_conflict(sk, sk2, uid, relax,
+ reuseport_cb_ok, reuseport_ok);
+}
+
+static bool inet_bhash2_conflict(const struct sock *sk,
+ const struct inet_bind2_bucket *tb2,
+ kuid_t uid,
+ bool relax, bool reuseport_cb_ok,
+ bool reuseport_ok)
+{
+ struct sock *sk2;
+
+ sk_for_each_bound(sk2, &tb2->owners) {
+ if (__inet_bhash2_conflict(sk, sk2, uid, relax,
+ reuseport_cb_ok, reuseport_ok))
+ return true;
+ }
+
+ return false;
+}
+
+#define sk_for_each_bound_bhash(__sk, __tb2, __tb) \
+ hlist_for_each_entry(__tb2, &(__tb)->bhash2, bhash_node) \
+ sk_for_each_bound((__sk), &(__tb2)->owners)
+
+/* This should be called only when the tb and tb2 hashbuckets' locks are held */
+static int inet_csk_bind_conflict(const struct sock *sk,
+ const struct inet_bind_bucket *tb,
+ const struct inet_bind2_bucket *tb2, /* may be null */
+ bool relax, bool reuseport_ok)
+{
+ struct sock_reuseport *reuseport_cb;
+ kuid_t uid = sk_uid(sk);
+ bool reuseport_cb_ok;
+ struct sock *sk2;
+
+ rcu_read_lock();
+ reuseport_cb = rcu_dereference(sk->sk_reuseport_cb);
+ /* paired with WRITE_ONCE() in __reuseport_(add|detach)_closed_sock */
+ reuseport_cb_ok = !reuseport_cb || READ_ONCE(reuseport_cb->num_closed_socks);
+ rcu_read_unlock();
+
+ /* Conflicts with an existing IPV6_ADDR_ANY (if ipv6) or INADDR_ANY (if
+ * ipv4) should have been checked already. We need to do these two
+ * checks separately because their spinlocks have to be acquired/released
+ * independently of each other, to prevent possible deadlocks
+ */
+ if (inet_use_bhash2_on_bind(sk))
+ return tb2 && inet_bhash2_conflict(sk, tb2, uid, relax,
+ reuseport_cb_ok, reuseport_ok);
+
+ /* Unlike other sk lookup places we do not check
+ * for sk_net here, since _all_ the socks listed
+ * in tb->owners and tb2->owners list belong
+ * to the same net - the one this bucket belongs to.
+ */
+ sk_for_each_bound_bhash(sk2, tb2, tb) {
+ if (!inet_bind_conflict(sk, sk2, uid, relax, reuseport_cb_ok, reuseport_ok))
+ continue;
+
+ if (inet_rcv_saddr_equal(sk, sk2, true))
+ return true;
+ }
+
+ return false;
+}
+
+/* Determine if there is a bind conflict with an existing IPV6_ADDR_ANY (if ipv6) or
+ * INADDR_ANY (if ipv4) socket.
+ *
+ * Caller must hold bhash hashbucket lock with local bh disabled, to protect
+ * against concurrent binds on the port for addr any
*/
-int inet_csk_get_port(struct inet_hashinfo *hashinfo,
- struct sock *sk, unsigned short snum,
- int (*bind_conflict)(const struct sock *sk,
- const struct inet_bind_bucket *tb))
+static bool inet_bhash2_addr_any_conflict(const struct sock *sk, int port, int l3mdev,
+ bool relax, bool reuseport_ok)
{
- struct inet_bind_hashbucket *head;
- struct hlist_node *node;
+ const struct net *net = sock_net(sk);
+ struct sock_reuseport *reuseport_cb;
+ struct inet_bind_hashbucket *head2;
+ struct inet_bind2_bucket *tb2;
+ kuid_t uid = sk_uid(sk);
+ bool conflict = false;
+ bool reuseport_cb_ok;
+
+ rcu_read_lock();
+ reuseport_cb = rcu_dereference(sk->sk_reuseport_cb);
+ /* paired with WRITE_ONCE() in __reuseport_(add|detach)_closed_sock */
+ reuseport_cb_ok = !reuseport_cb || READ_ONCE(reuseport_cb->num_closed_socks);
+ rcu_read_unlock();
+
+ head2 = inet_bhash2_addr_any_hashbucket(sk, net, port);
+
+ spin_lock(&head2->lock);
+
+ inet_bind_bucket_for_each(tb2, &head2->chain) {
+ if (!inet_bind2_bucket_match_addr_any(tb2, net, port, l3mdev, sk))
+ continue;
+
+ if (!inet_bhash2_conflict(sk, tb2, uid, relax, reuseport_cb_ok, reuseport_ok))
+ continue;
+
+ conflict = true;
+ break;
+ }
+
+ spin_unlock(&head2->lock);
+
+ return conflict;
+}
+
+/*
+ * Find an open port number for the socket. Returns with the
+ * inet_bind_hashbucket locks held if successful.
+ */
+static struct inet_bind_hashbucket *
+inet_csk_find_open_port(const struct sock *sk, struct inet_bind_bucket **tb_ret,
+ struct inet_bind2_bucket **tb2_ret,
+ struct inet_bind_hashbucket **head2_ret, int *port_ret)
+{
+ struct inet_hashinfo *hinfo = tcp_get_hashinfo(sk);
+ int i, low, high, attempt_half, port, l3mdev;
+ struct inet_bind_hashbucket *head, *head2;
+ struct net *net = sock_net(sk);
+ struct inet_bind2_bucket *tb2;
struct inet_bind_bucket *tb;
- int ret;
-
- local_bh_disable();
- if (!snum) {
- int low = sysctl_local_port_range[0];
- int high = sysctl_local_port_range[1];
- int remaining = (high - low) + 1;
- int rover = net_random() % (high - low) + low;
-
- do {
- head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)];
- spin_lock(&head->lock);
- inet_bind_bucket_for_each(tb, node, &head->chain)
- if (tb->port == rover)
- goto next;
- break;
- next:
- spin_unlock(&head->lock);
- if (++rover > high)
- rover = low;
- } while (--remaining > 0);
-
- /* Exhausted local port range during search? It is not
- * possible for us to be holding one of the bind hash
- * locks if this test triggers, because if 'remaining'
- * drops to zero, we broke out of the do/while loop at
- * the top level, not from the 'break;' statement.
- */
- ret = 1;
- if (remaining <= 0)
- goto fail;
+ u32 remaining, offset;
+ bool relax = false;
+
+ l3mdev = inet_sk_bound_l3mdev(sk);
+ports_exhausted:
+ attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0;
+other_half_scan:
+ inet_sk_get_local_port_range(sk, &low, &high);
+ high++; /* [32768, 60999] -> [32768, 61000[ */
+ if (high - low < 4)
+ attempt_half = 0;
+ if (attempt_half) {
+ int half = low + (((high - low) >> 2) << 1);
+
+ if (attempt_half == 1)
+ high = half;
+ else
+ low = half;
+ }
+ remaining = high - low;
+ if (likely(remaining > 1))
+ remaining &= ~1U;
- /* OK, here is the one we will use. HEAD is
- * non-NULL and we hold it's mutex.
- */
- snum = rover;
+ offset = get_random_u32_below(remaining);
+ /* __inet_hash_connect() favors ports having @low parity
+ * We do the opposite to not pollute connect() users.
+ */
+ offset |= 1U;
+
+other_parity_scan:
+ port = low + offset;
+ for (i = 0; i < remaining; i += 2, port += 2) {
+ if (unlikely(port >= high))
+ port -= remaining;
+ if (inet_is_local_reserved_port(net, port))
+ continue;
+ head = &hinfo->bhash[inet_bhashfn(net, port,
+ hinfo->bhash_size)];
+ spin_lock_bh(&head->lock);
+ if (inet_use_bhash2_on_bind(sk)) {
+ if (inet_bhash2_addr_any_conflict(sk, port, l3mdev, relax, false))
+ goto next_port;
+ }
+
+ head2 = inet_bhashfn_portaddr(hinfo, sk, net, port);
+ spin_lock(&head2->lock);
+ tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk);
+ inet_bind_bucket_for_each(tb, &head->chain)
+ if (inet_bind_bucket_match(tb, net, port, l3mdev)) {
+ if (!inet_csk_bind_conflict(sk, tb, tb2,
+ relax, false))
+ goto success;
+ spin_unlock(&head2->lock);
+ goto next_port;
+ }
+ tb = NULL;
+ goto success;
+next_port:
+ spin_unlock_bh(&head->lock);
+ cond_resched();
+ }
+
+ offset--;
+ if (!(offset & 1))
+ goto other_parity_scan;
+
+ if (attempt_half == 1) {
+ /* OK we now try the upper half of the range */
+ attempt_half = 2;
+ goto other_half_scan;
+ }
+
+ if (READ_ONCE(net->ipv4.sysctl_ip_autobind_reuse) && !relax) {
+ /* We still have a chance to connect to different destinations */
+ relax = true;
+ goto ports_exhausted;
+ }
+ return NULL;
+success:
+ *port_ret = port;
+ *tb_ret = tb;
+ *tb2_ret = tb2;
+ *head2_ret = head2;
+ return head;
+}
+
+static inline int sk_reuseport_match(struct inet_bind_bucket *tb,
+ const struct sock *sk)
+{
+ if (tb->fastreuseport <= 0)
+ return 0;
+ if (!sk->sk_reuseport)
+ return 0;
+ if (rcu_access_pointer(sk->sk_reuseport_cb))
+ return 0;
+ if (!uid_eq(tb->fastuid, sk_uid(sk)))
+ return 0;
+ /* We only need to check the rcv_saddr if this tb was once marked
+ * without fastreuseport and then was reset, as we can only know that
+ * the fast_*rcv_saddr doesn't have any conflicts with the socks on the
+ * owners list.
+ */
+ if (tb->fastreuseport == FASTREUSEPORT_ANY)
+ return 1;
+#if IS_ENABLED(CONFIG_IPV6)
+ if (tb->fast_sk_family == AF_INET6)
+ return ipv6_rcv_saddr_equal(&tb->fast_v6_rcv_saddr,
+ inet6_rcv_saddr(sk),
+ tb->fast_rcv_saddr,
+ sk->sk_rcv_saddr,
+ tb->fast_ipv6_only,
+ ipv6_only_sock(sk), true, false);
+#endif
+ return ipv4_rcv_saddr_equal(tb->fast_rcv_saddr, sk->sk_rcv_saddr,
+ ipv6_only_sock(sk), true, false);
+}
+
+void inet_csk_update_fastreuse(const struct sock *sk,
+ struct inet_bind_bucket *tb,
+ struct inet_bind2_bucket *tb2)
+{
+ bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
+
+ if (hlist_empty(&tb->bhash2)) {
+ tb->fastreuse = reuse;
+ if (sk->sk_reuseport) {
+ tb->fastreuseport = FASTREUSEPORT_ANY;
+ tb->fastuid = sk_uid(sk);
+ tb->fast_rcv_saddr = sk->sk_rcv_saddr;
+ tb->fast_ipv6_only = ipv6_only_sock(sk);
+ tb->fast_sk_family = sk->sk_family;
+#if IS_ENABLED(CONFIG_IPV6)
+ tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
+#endif
+ } else {
+ tb->fastreuseport = 0;
+ }
} else {
- head = &hashinfo->bhash[inet_bhashfn(snum, hashinfo->bhash_size)];
- spin_lock(&head->lock);
- inet_bind_bucket_for_each(tb, node, &head->chain)
- if (tb->port == snum)
- goto tb_found;
- }
- tb = NULL;
- goto tb_not_found;
-tb_found:
- if (!hlist_empty(&tb->owners)) {
- if (sk->sk_reuse > 1)
- goto success;
- if (tb->fastreuse > 0 &&
- sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
- goto success;
+ if (!reuse)
+ tb->fastreuse = 0;
+ if (sk->sk_reuseport) {
+ /* We didn't match or we don't have fastreuseport set on
+ * the tb, but we have sk_reuseport set on this socket
+ * and we know that there are no bind conflicts with
+ * this socket in this tb, so reset our tb's reuseport
+ * settings so that any subsequent sockets that match
+ * our current socket will be put on the fast path.
+ *
+ * If we reset we need to set FASTREUSEPORT_STRICT so we
+ * do extra checking for all subsequent sk_reuseport
+ * socks.
+ */
+ if (!sk_reuseport_match(tb, sk)) {
+ tb->fastreuseport = FASTREUSEPORT_STRICT;
+ tb->fastuid = sk_uid(sk);
+ tb->fast_rcv_saddr = sk->sk_rcv_saddr;
+ tb->fast_ipv6_only = ipv6_only_sock(sk);
+ tb->fast_sk_family = sk->sk_family;
+#if IS_ENABLED(CONFIG_IPV6)
+ tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
+#endif
+ }
} else {
- ret = 1;
- if (bind_conflict(sk, tb))
+ tb->fastreuseport = 0;
+ }
+ }
+
+ tb2->fastreuse = tb->fastreuse;
+ tb2->fastreuseport = tb->fastreuseport;
+}
+
+/* Obtain a reference to a local port for the given sock,
+ * if snum is zero it means select any available local port.
+ * We try to allocate an odd port (and leave even ports for connect())
+ */
+int inet_csk_get_port(struct sock *sk, unsigned short snum)
+{
+ bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
+ bool found_port = false, check_bind_conflict = true;
+ bool bhash_created = false, bhash2_created = false;
+ struct inet_hashinfo *hinfo = tcp_get_hashinfo(sk);
+ int ret = -EADDRINUSE, port = snum, l3mdev;
+ struct inet_bind_hashbucket *head, *head2;
+ struct inet_bind2_bucket *tb2 = NULL;
+ struct inet_bind_bucket *tb = NULL;
+ bool head2_lock_acquired = false;
+ struct net *net = sock_net(sk);
+
+ l3mdev = inet_sk_bound_l3mdev(sk);
+
+ if (!port) {
+ head = inet_csk_find_open_port(sk, &tb, &tb2, &head2, &port);
+ if (!head)
+ return ret;
+
+ head2_lock_acquired = true;
+
+ if (tb && tb2)
+ goto success;
+ found_port = true;
+ } else {
+ head = &hinfo->bhash[inet_bhashfn(net, port,
+ hinfo->bhash_size)];
+ spin_lock_bh(&head->lock);
+ inet_bind_bucket_for_each(tb, &head->chain)
+ if (inet_bind_bucket_match(tb, net, port, l3mdev))
+ break;
+ }
+
+ if (!tb) {
+ tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, net,
+ head, port, l3mdev);
+ if (!tb)
+ goto fail_unlock;
+ bhash_created = true;
+ }
+
+ if (!found_port) {
+ if (!hlist_empty(&tb->bhash2)) {
+ if (sk->sk_reuse == SK_FORCE_REUSE ||
+ (tb->fastreuse > 0 && reuse) ||
+ sk_reuseport_match(tb, sk))
+ check_bind_conflict = false;
+ }
+
+ if (check_bind_conflict && inet_use_bhash2_on_bind(sk)) {
+ if (inet_bhash2_addr_any_conflict(sk, port, l3mdev, true, true))
goto fail_unlock;
}
+
+ head2 = inet_bhashfn_portaddr(hinfo, sk, net, port);
+ spin_lock(&head2->lock);
+ head2_lock_acquired = true;
+ tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk);
}
-tb_not_found:
- ret = 1;
- if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep, head, snum)) == NULL)
- goto fail_unlock;
- if (hlist_empty(&tb->owners)) {
- if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
- tb->fastreuse = 1;
- else
- tb->fastreuse = 0;
- } else if (tb->fastreuse &&
- (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
- tb->fastreuse = 0;
+
+ if (!tb2) {
+ tb2 = inet_bind2_bucket_create(hinfo->bind2_bucket_cachep,
+ net, head2, tb, sk);
+ if (!tb2)
+ goto fail_unlock;
+ bhash2_created = true;
+ }
+
+ if (!found_port && check_bind_conflict) {
+ if (inet_csk_bind_conflict(sk, tb, tb2, true, true))
+ goto fail_unlock;
+ }
+
success:
+ inet_csk_update_fastreuse(sk, tb, tb2);
+
if (!inet_csk(sk)->icsk_bind_hash)
- inet_bind_hash(sk, tb, snum);
- BUG_TRAP(inet_csk(sk)->icsk_bind_hash == tb);
- ret = 0;
+ inet_bind_hash(sk, tb, tb2, port);
+ WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);
+ WARN_ON(inet_csk(sk)->icsk_bind2_hash != tb2);
+ ret = 0;
fail_unlock:
- spin_unlock(&head->lock);
-fail:
- local_bh_enable();
+ if (ret) {
+ if (bhash2_created)
+ inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, tb2);
+ if (bhash_created)
+ inet_bind_bucket_destroy(tb);
+ }
+ if (head2_lock_acquired)
+ spin_unlock(&head2->lock);
+ spin_unlock_bh(&head->lock);
return ret;
}
-
EXPORT_SYMBOL_GPL(inet_csk_get_port);
/*
@@ -185,11 +633,12 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
* having to remove and re-insert us on the wait queue.
*/
for (;;) {
- prepare_to_wait_exclusive(sk->sk_sleep, &wait,
+ prepare_to_wait_exclusive(sk_sleep(sk), &wait,
TASK_INTERRUPTIBLE);
release_sock(sk);
if (reqsk_queue_empty(&icsk->icsk_accept_queue))
timeo = schedule_timeout(timeo);
+ sched_annotate_sleep();
lock_sock(sk);
err = 0;
if (!reqsk_queue_empty(&icsk->icsk_accept_queue))
@@ -204,16 +653,18 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
if (!timeo)
break;
}
- finish_wait(sk->sk_sleep, &wait);
+ finish_wait(sk_sleep(sk), &wait);
return err;
}
/*
* This will accept the next outstanding connection.
*/
-struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
+struct sock *inet_csk_accept(struct sock *sk, struct proto_accept_arg *arg)
{
struct inet_connection_sock *icsk = inet_csk(sk);
+ struct request_sock_queue *queue = &icsk->icsk_accept_queue;
+ struct request_sock *req;
struct sock *newsk;
int error;
@@ -227,8 +678,8 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
goto out_err;
/* Find already established connection */
- if (reqsk_queue_empty(&icsk->icsk_accept_queue)) {
- long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
+ if (reqsk_queue_empty(queue)) {
+ long timeo = sock_rcvtimeo(sk, arg->flags & O_NONBLOCK);
/* If this is a non blocking socket don't sleep */
error = -EAGAIN;
@@ -239,188 +690,384 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
if (error)
goto out_err;
}
+ req = reqsk_queue_remove(queue, sk);
+ arg->is_empty = reqsk_queue_empty(queue);
+ newsk = req->sk;
+
+ if (sk->sk_protocol == IPPROTO_TCP &&
+ tcp_rsk(req)->tfo_listener) {
+ spin_lock_bh(&queue->fastopenq.lock);
+ if (tcp_rsk(req)->tfo_listener) {
+ /* We are still waiting for the final ACK from 3WHS
+ * so can't free req now. Instead, we set req->sk to
+ * NULL to signify that the child socket is taken
+ * so reqsk_fastopen_remove() will free the req
+ * when 3WHS finishes (or is aborted).
+ */
+ req->sk = NULL;
+ req = NULL;
+ }
+ spin_unlock_bh(&queue->fastopenq.lock);
+ }
- newsk = reqsk_queue_get_child(&icsk->icsk_accept_queue, sk);
- BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
-out:
release_sock(sk);
+
+ if (req)
+ reqsk_put(req);
+
+ inet_init_csk_locks(newsk);
return newsk;
+
out_err:
- newsk = NULL;
- *err = error;
- goto out;
+ release_sock(sk);
+ arg->err = error;
+ return NULL;
}
-
EXPORT_SYMBOL(inet_csk_accept);
/*
* Using different timers for retransmit, delayed acks and probes
- * We may wish use just one timer maintaining a list of expire jiffies
+ * We may wish use just one timer maintaining a list of expire jiffies
* to optimize.
*/
void inet_csk_init_xmit_timers(struct sock *sk,
- void (*retransmit_handler)(unsigned long),
- void (*delack_handler)(unsigned long),
- void (*keepalive_handler)(unsigned long))
+ void (*retransmit_handler)(struct timer_list *t),
+ void (*delack_handler)(struct timer_list *t),
+ void (*keepalive_handler)(struct timer_list *t))
{
struct inet_connection_sock *icsk = inet_csk(sk);
- init_timer(&icsk->icsk_retransmit_timer);
- init_timer(&icsk->icsk_delack_timer);
- init_timer(&sk->sk_timer);
-
- icsk->icsk_retransmit_timer.function = retransmit_handler;
- icsk->icsk_delack_timer.function = delack_handler;
- sk->sk_timer.function = keepalive_handler;
-
- icsk->icsk_retransmit_timer.data =
- icsk->icsk_delack_timer.data =
- sk->sk_timer.data = (unsigned long)sk;
-
+ timer_setup(&sk->tcp_retransmit_timer, retransmit_handler, 0);
+ timer_setup(&icsk->icsk_delack_timer, delack_handler, 0);
+ timer_setup(&icsk->icsk_keepalive_timer, keepalive_handler, 0);
icsk->icsk_pending = icsk->icsk_ack.pending = 0;
}
-EXPORT_SYMBOL(inet_csk_init_xmit_timers);
-
void inet_csk_clear_xmit_timers(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
- icsk->icsk_pending = icsk->icsk_ack.pending = icsk->icsk_ack.blocked = 0;
+ smp_store_release(&icsk->icsk_pending, 0);
+ smp_store_release(&icsk->icsk_ack.pending, 0);
- sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
+ sk_stop_timer(sk, &sk->tcp_retransmit_timer);
sk_stop_timer(sk, &icsk->icsk_delack_timer);
- sk_stop_timer(sk, &sk->sk_timer);
+ sk_stop_timer(sk, &icsk->icsk_keepalive_timer);
}
-EXPORT_SYMBOL(inet_csk_clear_xmit_timers);
-
-void inet_csk_delete_keepalive_timer(struct sock *sk)
+void inet_csk_clear_xmit_timers_sync(struct sock *sk)
{
- sk_stop_timer(sk, &sk->sk_timer);
-}
+ struct inet_connection_sock *icsk = inet_csk(sk);
-EXPORT_SYMBOL(inet_csk_delete_keepalive_timer);
+ /* ongoing timer handlers need to acquire socket lock. */
+ sock_not_owned_by_me(sk);
-void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len)
-{
- sk_reset_timer(sk, &sk->sk_timer, jiffies + len);
-}
+ smp_store_release(&icsk->icsk_pending, 0);
+ smp_store_release(&icsk->icsk_ack.pending, 0);
-EXPORT_SYMBOL(inet_csk_reset_keepalive_timer);
+ sk_stop_timer_sync(sk, &sk->tcp_retransmit_timer);
+ sk_stop_timer_sync(sk, &icsk->icsk_delack_timer);
+ sk_stop_timer_sync(sk, &icsk->icsk_keepalive_timer);
+}
-struct dst_entry* inet_csk_route_req(struct sock *sk,
+struct dst_entry *inet_csk_route_req(const struct sock *sk,
+ struct flowi4 *fl4,
const struct request_sock *req)
{
+ const struct inet_request_sock *ireq = inet_rsk(req);
+ struct net *net = read_pnet(&ireq->ireq_net);
+ struct ip_options_rcu *opt;
struct rtable *rt;
+
+ rcu_read_lock();
+ opt = rcu_dereference(ireq->ireq_opt);
+
+ flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark,
+ ip_sock_rt_tos(sk), ip_sock_rt_scope(sk),
+ sk->sk_protocol, inet_sk_flowi_flags(sk),
+ (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
+ ireq->ir_loc_addr, ireq->ir_rmt_port,
+ htons(ireq->ir_num), sk_uid(sk));
+ security_req_classify_flow(req, flowi4_to_flowi_common(fl4));
+ rt = ip_route_output_flow(net, fl4, sk);
+ if (IS_ERR(rt))
+ goto no_route;
+ if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway)
+ goto route_err;
+ rcu_read_unlock();
+ return &rt->dst;
+
+route_err:
+ ip_rt_put(rt);
+no_route:
+ rcu_read_unlock();
+ __IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
+ return NULL;
+}
+
+struct dst_entry *inet_csk_route_child_sock(const struct sock *sk,
+ struct sock *newsk,
+ const struct request_sock *req)
+{
const struct inet_request_sock *ireq = inet_rsk(req);
- struct ip_options *opt = inet_rsk(req)->opt;
- struct flowi fl = { .oif = sk->sk_bound_dev_if,
- .nl_u = { .ip4_u =
- { .daddr = ((opt && opt->srr) ?
- opt->faddr :
- ireq->rmt_addr),
- .saddr = ireq->loc_addr,
- .tos = RT_CONN_FLAGS(sk) } },
- .proto = sk->sk_protocol,
- .uli_u = { .ports =
- { .sport = inet_sk(sk)->sport,
- .dport = ireq->rmt_port } } };
-
- security_req_classify_flow(req, &fl);
- if (ip_route_output_flow(&rt, &fl, sk, 0)) {
- IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
- return NULL;
+ struct net *net = read_pnet(&ireq->ireq_net);
+ struct inet_sock *newinet = inet_sk(newsk);
+ struct ip_options_rcu *opt;
+ struct flowi4 *fl4;
+ struct rtable *rt;
+
+ opt = rcu_dereference(ireq->ireq_opt);
+ fl4 = &newinet->cork.fl.u.ip4;
+
+ flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark,
+ ip_sock_rt_tos(sk), ip_sock_rt_scope(sk),
+ sk->sk_protocol, inet_sk_flowi_flags(sk),
+ (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
+ ireq->ir_loc_addr, ireq->ir_rmt_port,
+ htons(ireq->ir_num), sk_uid(sk));
+ security_req_classify_flow(req, flowi4_to_flowi_common(fl4));
+ rt = ip_route_output_flow(net, fl4, sk);
+ if (IS_ERR(rt))
+ goto no_route;
+ if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway)
+ goto route_err;
+ return &rt->dst;
+
+route_err:
+ ip_rt_put(rt);
+no_route:
+ __IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(inet_csk_route_child_sock);
+
+/* Decide when to expire the request and when to resend SYN-ACK */
+static void syn_ack_recalc(struct request_sock *req,
+ const int max_syn_ack_retries,
+ const u8 rskq_defer_accept,
+ int *expire, int *resend)
+{
+ if (!rskq_defer_accept) {
+ *expire = req->num_timeout >= max_syn_ack_retries;
+ *resend = 1;
+ return;
}
- if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
- ip_rt_put(rt);
- IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
+ *expire = req->num_timeout >= max_syn_ack_retries &&
+ (!inet_rsk(req)->acked || req->num_timeout >= rskq_defer_accept);
+ /* Do not resend while waiting for data after ACK,
+ * start to resend on end of deferring period to give
+ * last chance for data or ACK to create established socket.
+ */
+ *resend = !inet_rsk(req)->acked ||
+ req->num_timeout >= rskq_defer_accept - 1;
+}
+
+static struct request_sock *
+reqsk_alloc_noprof(const struct request_sock_ops *ops, struct sock *sk_listener,
+ bool attach_listener)
+{
+ struct request_sock *req;
+
+ req = kmem_cache_alloc_noprof(ops->slab, GFP_ATOMIC | __GFP_NOWARN);
+ if (!req)
return NULL;
+ req->rsk_listener = NULL;
+ if (attach_listener) {
+ if (unlikely(!refcount_inc_not_zero(&sk_listener->sk_refcnt))) {
+ kmem_cache_free(ops->slab, req);
+ return NULL;
+ }
+ req->rsk_listener = sk_listener;
}
- return &rt->u.dst;
+ req->rsk_ops = ops;
+ req_to_sk(req)->sk_prot = sk_listener->sk_prot;
+ sk_node_init(&req_to_sk(req)->sk_node);
+ sk_tx_queue_clear(req_to_sk(req));
+ req->saved_syn = NULL;
+ req->syncookie = 0;
+ req->num_timeout = 0;
+ req->num_retrans = 0;
+ req->sk = NULL;
+ refcount_set(&req->rsk_refcnt, 0);
+
+ return req;
}
+#define reqsk_alloc(...) alloc_hooks(reqsk_alloc_noprof(__VA_ARGS__))
+
+struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
+ struct sock *sk_listener,
+ bool attach_listener)
+{
+ struct request_sock *req = reqsk_alloc(ops, sk_listener,
+ attach_listener);
-EXPORT_SYMBOL_GPL(inet_csk_route_req);
+ if (req) {
+ struct inet_request_sock *ireq = inet_rsk(req);
-static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport,
- const u32 rnd, const u32 synq_hsize)
+ ireq->ireq_opt = NULL;
+#if IS_ENABLED(CONFIG_IPV6)
+ ireq->pktopts = NULL;
+#endif
+ atomic64_set(&ireq->ir_cookie, 0);
+ ireq->ireq_state = TCP_NEW_SYN_RECV;
+ write_pnet(&ireq->ireq_net, sock_net(sk_listener));
+ ireq->ireq_family = sk_listener->sk_family;
+ }
+
+ return req;
+}
+EXPORT_SYMBOL(inet_reqsk_alloc);
+
+static struct request_sock *inet_reqsk_clone(struct request_sock *req,
+ struct sock *sk)
{
- return jhash_2words((__force u32)raddr, (__force u32)rport, rnd) & (synq_hsize - 1);
+ struct sock *req_sk, *nreq_sk;
+ struct request_sock *nreq;
+
+ nreq = kmem_cache_alloc(req->rsk_ops->slab, GFP_ATOMIC | __GFP_NOWARN);
+ if (!nreq) {
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE);
+
+ /* paired with refcount_inc_not_zero() in reuseport_migrate_sock() */
+ sock_put(sk);
+ return NULL;
+ }
+
+ req_sk = req_to_sk(req);
+ nreq_sk = req_to_sk(nreq);
+
+ memcpy(nreq_sk, req_sk,
+ offsetof(struct sock, sk_dontcopy_begin));
+ unsafe_memcpy(&nreq_sk->sk_dontcopy_end, &req_sk->sk_dontcopy_end,
+ req->rsk_ops->obj_size - offsetof(struct sock, sk_dontcopy_end),
+ /* alloc is larger than struct, see above */);
+
+ sk_node_init(&nreq_sk->sk_node);
+ nreq_sk->sk_tx_queue_mapping = req_sk->sk_tx_queue_mapping;
+#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
+ nreq_sk->sk_rx_queue_mapping = req_sk->sk_rx_queue_mapping;
+#endif
+ nreq_sk->sk_incoming_cpu = req_sk->sk_incoming_cpu;
+
+ nreq->rsk_listener = sk;
+
+ /* We need not acquire fastopenq->lock
+ * because the child socket is locked in inet_csk_listen_stop().
+ */
+ if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(nreq)->tfo_listener)
+ rcu_assign_pointer(tcp_sk(nreq->sk)->fastopen_rsk, nreq);
+
+ return nreq;
}
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-#define AF_INET_FAMILY(fam) ((fam) == AF_INET)
+static void reqsk_queue_migrated(struct request_sock_queue *queue,
+ const struct request_sock *req)
+{
+ if (req->num_timeout == 0)
+ atomic_inc(&queue->young);
+ atomic_inc(&queue->qlen);
+}
+
+static void reqsk_migrate_reset(struct request_sock *req)
+{
+ req->saved_syn = NULL;
+#if IS_ENABLED(CONFIG_IPV6)
+ inet_rsk(req)->ipv6_opt = NULL;
+ inet_rsk(req)->pktopts = NULL;
#else
-#define AF_INET_FAMILY(fam) 1
+ inet_rsk(req)->ireq_opt = NULL;
#endif
+}
-struct request_sock *inet_csk_search_req(const struct sock *sk,
- struct request_sock ***prevp,
- const __be16 rport, const __be32 raddr,
- const __be32 laddr)
+/* return true if req was found in the ehash table */
+static bool reqsk_queue_unlink(struct request_sock *req)
{
- const struct inet_connection_sock *icsk = inet_csk(sk);
- struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
- struct request_sock *req, **prev;
-
- for (prev = &lopt->syn_table[inet_synq_hash(raddr, rport, lopt->hash_rnd,
- lopt->nr_table_entries)];
- (req = *prev) != NULL;
- prev = &req->dl_next) {
- const struct inet_request_sock *ireq = inet_rsk(req);
-
- if (ireq->rmt_port == rport &&
- ireq->rmt_addr == raddr &&
- ireq->loc_addr == laddr &&
- AF_INET_FAMILY(req->rsk_ops->family)) {
- BUG_TRAP(!req->sk);
- *prevp = prev;
- break;
- }
+ struct sock *sk = req_to_sk(req);
+ bool found = false;
+
+ if (sk_hashed(sk)) {
+ struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk);
+ spinlock_t *lock;
+
+ lock = inet_ehash_lockp(hashinfo, req->rsk_hash);
+ spin_lock(lock);
+ found = __sk_nulls_del_node_init_rcu(sk);
+ spin_unlock(lock);
}
- return req;
+ return found;
}
-EXPORT_SYMBOL_GPL(inet_csk_search_req);
-
-void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
- unsigned long timeout)
+static bool __inet_csk_reqsk_queue_drop(struct sock *sk,
+ struct request_sock *req,
+ bool from_timer)
{
- struct inet_connection_sock *icsk = inet_csk(sk);
- struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
- const u32 h = inet_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port,
- lopt->hash_rnd, lopt->nr_table_entries);
+ bool unlinked = reqsk_queue_unlink(req);
+
+ if (!from_timer && timer_delete_sync(&req->rsk_timer))
+ reqsk_put(req);
+
+ if (unlinked) {
+ reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req);
+ reqsk_put(req);
+ }
- reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout);
- inet_csk_reqsk_queue_added(sk, timeout);
+ return unlinked;
}
-/* Only thing we need from tcp.h */
-extern int sysctl_tcp_synack_retries;
+bool inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req)
+{
+ return __inet_csk_reqsk_queue_drop(sk, req, false);
+}
-EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add);
+void inet_csk_reqsk_queue_drop_and_put(struct sock *sk, struct request_sock *req)
+{
+ inet_csk_reqsk_queue_drop(sk, req);
+ reqsk_put(req);
+}
+EXPORT_IPV6_MOD(inet_csk_reqsk_queue_drop_and_put);
-void inet_csk_reqsk_queue_prune(struct sock *parent,
- const unsigned long interval,
- const unsigned long timeout,
- const unsigned long max_rto)
+static void reqsk_timer_handler(struct timer_list *t)
{
- struct inet_connection_sock *icsk = inet_csk(parent);
- struct request_sock_queue *queue = &icsk->icsk_accept_queue;
- struct listen_sock *lopt = queue->listen_opt;
- int max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries;
- int thresh = max_retries;
- unsigned long now = jiffies;
- struct request_sock **reqp, *req;
- int i, budget;
-
- if (lopt == NULL || lopt->qlen == 0)
- return;
+ struct request_sock *req = timer_container_of(req, t, rsk_timer);
+ struct request_sock *nreq = NULL, *oreq = req;
+ struct sock *sk_listener = req->rsk_listener;
+ struct inet_connection_sock *icsk;
+ struct request_sock_queue *queue;
+ struct net *net;
+ int max_syn_ack_retries, qlen, expire = 0, resend = 0;
+
+ if (inet_sk_state_load(sk_listener) != TCP_LISTEN) {
+ struct sock *nsk;
+
+ nsk = reuseport_migrate_sock(sk_listener, req_to_sk(req), NULL);
+ if (!nsk)
+ goto drop;
+
+ nreq = inet_reqsk_clone(req, nsk);
+ if (!nreq)
+ goto drop;
+
+ /* The new timer for the cloned req can decrease the 2
+ * by calling inet_csk_reqsk_queue_drop_and_put(), so
+ * hold another count to prevent use-after-free and
+ * call reqsk_put() just before return.
+ */
+ refcount_set(&nreq->rsk_refcnt, 2 + 1);
+ timer_setup(&nreq->rsk_timer, reqsk_timer_handler, TIMER_PINNED);
+ reqsk_queue_migrated(&inet_csk(nsk)->icsk_accept_queue, req);
+
+ req = nreq;
+ sk_listener = nsk;
+ }
+ icsk = inet_csk(sk_listener);
+ net = sock_net(sk_listener);
+ max_syn_ack_retries = READ_ONCE(icsk->icsk_syn_retries) ? :
+ READ_ONCE(net->ipv4.sysctl_tcp_synack_retries);
/* Normally all the openreqs are young and become mature
* (i.e. converted to established socket) for first timeout.
- * If synack was not acknowledged for 3 seconds, it means
+ * If synack was not acknowledged for 1 second, it means
* one of the following things: synack was lost, ack was lost,
* rtt is high or nobody planned to ack (i.e. synflood).
* When server is a bit loaded, queue is populated with old
@@ -435,89 +1082,175 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
* embrions; and abort old ones without pity, if old
* ones are about to clog our table.
*/
- if (lopt->qlen>>(lopt->max_qlen_log-1)) {
- int young = (lopt->qlen_young<<1);
+ queue = &icsk->icsk_accept_queue;
+ qlen = reqsk_queue_len(queue);
+ if ((qlen << 1) > max(8U, READ_ONCE(sk_listener->sk_max_ack_backlog))) {
+ int young = reqsk_queue_len_young(queue) << 1;
- while (thresh > 2) {
- if (lopt->qlen < young)
+ while (max_syn_ack_retries > 2) {
+ if (qlen < young)
break;
- thresh--;
+ max_syn_ack_retries--;
young <<= 1;
}
}
- if (queue->rskq_defer_accept)
- max_retries = queue->rskq_defer_accept;
+ syn_ack_recalc(req, max_syn_ack_retries, READ_ONCE(queue->rskq_defer_accept),
+ &expire, &resend);
+ tcp_syn_ack_timeout(req);
+
+ if (!expire &&
+ (!resend ||
+ !tcp_rtx_synack(sk_listener, req) ||
+ inet_rsk(req)->acked)) {
+ if (req->num_timeout++ == 0)
+ atomic_dec(&queue->young);
+ mod_timer(&req->rsk_timer, jiffies + tcp_reqsk_timeout(req));
+
+ if (!nreq)
+ return;
+
+ if (!inet_ehash_insert(req_to_sk(nreq), req_to_sk(oreq), NULL)) {
+ /* delete timer */
+ __inet_csk_reqsk_queue_drop(sk_listener, nreq, true);
+ goto no_ownership;
+ }
- budget = 2 * (lopt->nr_table_entries / (timeout / interval));
- i = lopt->clock_hand;
+ __NET_INC_STATS(net, LINUX_MIB_TCPMIGRATEREQSUCCESS);
+ reqsk_migrate_reset(oreq);
+ reqsk_queue_removed(&inet_csk(oreq->rsk_listener)->icsk_accept_queue, oreq);
+ reqsk_put(oreq);
- do {
- reqp=&lopt->syn_table[i];
- while ((req = *reqp) != NULL) {
- if (time_after_eq(now, req->expires)) {
- if ((req->retrans < thresh ||
- (inet_rsk(req)->acked && req->retrans < max_retries))
- && !req->rsk_ops->rtx_syn_ack(parent, req, NULL)) {
- unsigned long timeo;
+ reqsk_put(nreq);
+ return;
+ }
- if (req->retrans++ == 0)
- lopt->qlen_young--;
- timeo = min((timeout << req->retrans), max_rto);
- req->expires = now + timeo;
- reqp = &req->dl_next;
- continue;
- }
+ /* Even if we can clone the req, we may need not retransmit any more
+ * SYN+ACKs (nreq->num_timeout > max_syn_ack_retries, etc), or another
+ * CPU may win the "own_req" race so that inet_ehash_insert() fails.
+ */
+ if (nreq) {
+ __NET_INC_STATS(net, LINUX_MIB_TCPMIGRATEREQFAILURE);
+no_ownership:
+ reqsk_migrate_reset(nreq);
+ reqsk_queue_removed(queue, nreq);
+ __reqsk_free(nreq);
+ }
- /* Drop this request */
- inet_csk_reqsk_queue_unlink(parent, req, reqp);
- reqsk_queue_removed(queue, req);
- reqsk_free(req);
- continue;
- }
- reqp = &req->dl_next;
- }
+drop:
+ __inet_csk_reqsk_queue_drop(sk_listener, oreq, true);
+ reqsk_put(oreq);
+}
- i = (i + 1) & (lopt->nr_table_entries - 1);
+static bool reqsk_queue_hash_req(struct request_sock *req)
+{
+ bool found_dup_sk = false;
- } while (--budget > 0);
+ if (!inet_ehash_insert(req_to_sk(req), NULL, &found_dup_sk))
+ return false;
- lopt->clock_hand = i;
+ /* The timer needs to be setup after a successful insertion. */
+ req->timeout = tcp_timeout_init((struct sock *)req);
+ timer_setup(&req->rsk_timer, reqsk_timer_handler, TIMER_PINNED);
+ mod_timer(&req->rsk_timer, jiffies + req->timeout);
- if (lopt->qlen)
- inet_csk_reset_keepalive_timer(parent, interval);
+ /* before letting lookups find us, make sure all req fields
+ * are committed to memory and refcnt initialized.
+ */
+ smp_wmb();
+ refcount_set(&req->rsk_refcnt, 2 + 1);
+ return true;
}
-EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_prune);
+bool inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req)
+{
+ if (!reqsk_queue_hash_req(req))
+ return false;
-struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req,
- const gfp_t priority)
+ inet_csk_reqsk_queue_added(sk);
+ return true;
+}
+
+static void inet_clone_ulp(const struct request_sock *req, struct sock *newsk,
+ const gfp_t priority)
{
- struct sock *newsk = sk_clone(sk, priority);
+ struct inet_connection_sock *icsk = inet_csk(newsk);
- if (newsk != NULL) {
- struct inet_connection_sock *newicsk = inet_csk(newsk);
+ if (!icsk->icsk_ulp_ops)
+ return;
- newsk->sk_state = TCP_SYN_RECV;
- newicsk->icsk_bind_hash = NULL;
+ icsk->icsk_ulp_ops->clone(req, newsk, priority);
+}
- inet_sk(newsk)->dport = inet_rsk(req)->rmt_port;
- newsk->sk_write_space = sk_stream_write_space;
+/**
+ * inet_csk_clone_lock - clone an inet socket, and lock its clone
+ * @sk: the socket to clone
+ * @req: request_sock
+ * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
+ *
+ * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
+ */
+struct sock *inet_csk_clone_lock(const struct sock *sk,
+ const struct request_sock *req,
+ const gfp_t priority)
+{
+ struct sock *newsk = sk_clone_lock(sk, priority);
+ struct inet_connection_sock *newicsk;
+ struct inet_request_sock *ireq;
+ struct inet_sock *newinet;
- newicsk->icsk_retransmits = 0;
- newicsk->icsk_backoff = 0;
- newicsk->icsk_probes_out = 0;
+ if (!newsk)
+ return NULL;
- /* Deinitialize accept_queue to trap illegal accesses. */
- memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue));
+ newicsk = inet_csk(newsk);
+ newinet = inet_sk(newsk);
+ ireq = inet_rsk(req);
+
+ newicsk->icsk_bind_hash = NULL;
+ newicsk->icsk_bind2_hash = NULL;
+
+ newinet->inet_dport = ireq->ir_rmt_port;
+ newinet->inet_num = ireq->ir_num;
+ newinet->inet_sport = htons(ireq->ir_num);
+
+ newsk->sk_bound_dev_if = ireq->ir_iif;
+
+ newsk->sk_daddr = ireq->ir_rmt_addr;
+ newsk->sk_rcv_saddr = ireq->ir_loc_addr;
+ newinet->inet_saddr = ireq->ir_loc_addr;
+
+#if IS_ENABLED(CONFIG_IPV6)
+ newsk->sk_v6_daddr = ireq->ir_v6_rmt_addr;
+ newsk->sk_v6_rcv_saddr = ireq->ir_v6_loc_addr;
+#endif
+
+ /* listeners have SOCK_RCU_FREE, not the children */
+ sock_reset_flag(newsk, SOCK_RCU_FREE);
+
+ inet_sk(newsk)->mc_list = NULL;
+
+ newsk->sk_mark = inet_rsk(req)->ir_mark;
+ atomic64_set(&newsk->sk_cookie,
+ atomic64_read(&inet_rsk(req)->ir_cookie));
+
+ newicsk->icsk_retransmits = 0;
+ newicsk->icsk_backoff = 0;
+ newicsk->icsk_probes_out = 0;
+ newicsk->icsk_probes_tstamp = 0;
+
+ /* Deinitialize accept_queue to trap illegal accesses. */
+ memset(&newicsk->icsk_accept_queue, 0,
+ sizeof(newicsk->icsk_accept_queue));
+
+ inet_sk_set_state(newsk, TCP_SYN_RECV);
+
+ inet_clone_ulp(req, newsk, priority);
+
+ security_inet_csk_clone(newsk, req);
- security_inet_csk_clone(newsk, req);
- }
return newsk;
}
-EXPORT_SYMBOL_GPL(inet_csk_clone);
-
/*
* At this point, there should be no process reference to this
* socket, and thus no user references at all. Therefore we
@@ -526,14 +1259,14 @@ EXPORT_SYMBOL_GPL(inet_csk_clone);
*/
void inet_csk_destroy_sock(struct sock *sk)
{
- BUG_TRAP(sk->sk_state == TCP_CLOSE);
- BUG_TRAP(sock_flag(sk, SOCK_DEAD));
+ WARN_ON(sk->sk_state != TCP_CLOSE);
+ WARN_ON(!sock_flag(sk, SOCK_DEAD));
/* It cannot be in hash table! */
- BUG_TRAP(sk_unhashed(sk));
+ WARN_ON(!sk_unhashed(sk));
- /* If it has not 0 inet_sk(sk)->num, it must be bound */
- BUG_TRAP(!inet_sk(sk)->num || inet_csk(sk)->icsk_bind_hash);
+ /* If it has not 0 inet_sk(sk)->inet_num, it must be bound */
+ WARN_ON(inet_sk(sk)->inet_num && !inet_csk(sk)->icsk_bind_hash);
sk->sk_prot->destroy(sk);
@@ -541,24 +1274,55 @@ void inet_csk_destroy_sock(struct sock *sk)
xfrm_sk_free_policy(sk);
- sk_refcnt_debug_release(sk);
+ tcp_orphan_count_dec();
- atomic_dec(sk->sk_prot->orphan_count);
sock_put(sk);
}
-
EXPORT_SYMBOL(inet_csk_destroy_sock);
-int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
+void inet_csk_prepare_for_destroy_sock(struct sock *sk)
+{
+ /* The below has to be done to allow calling inet_csk_destroy_sock */
+ sock_set_flag(sk, SOCK_DEAD);
+ tcp_orphan_count_inc();
+}
+
+/* This function allows to force a closure of a socket after the call to
+ * tcp_create_openreq_child().
+ */
+void inet_csk_prepare_forced_close(struct sock *sk)
+ __releases(&sk->sk_lock.slock)
+{
+ /* sk_clone_lock locked the socket and set refcnt to 2 */
+ bh_unlock_sock(sk);
+ sock_put(sk);
+ inet_csk_prepare_for_destroy_sock(sk);
+ inet_sk(sk)->inet_num = 0;
+}
+EXPORT_SYMBOL(inet_csk_prepare_forced_close);
+
+static int inet_ulp_can_listen(const struct sock *sk)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+
+ if (icsk->icsk_ulp_ops && !icsk->icsk_ulp_ops->clone)
+ return -EINVAL;
+
+ return 0;
+}
+
+int inet_csk_listen_start(struct sock *sk)
{
- struct inet_sock *inet = inet_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
- int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries);
+ struct inet_sock *inet = inet_sk(sk);
+ int err;
+
+ err = inet_ulp_can_listen(sk);
+ if (unlikely(err))
+ return err;
- if (rc != 0)
- return rc;
+ reqsk_queue_alloc(&icsk->icsk_accept_queue);
- sk->sk_max_ack_backlog = 0;
sk->sk_ack_backlog = 0;
inet_csk_delack_init(sk);
@@ -567,22 +1331,115 @@ int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
* It is OK, because this socket enters to hash table only
* after validation is complete.
*/
- sk->sk_state = TCP_LISTEN;
- if (!sk->sk_prot->get_port(sk, inet->num)) {
- inet->sport = htons(inet->num);
+ inet_sk_state_store(sk, TCP_LISTEN);
+ err = sk->sk_prot->get_port(sk, inet->inet_num);
+ if (!err) {
+ inet->inet_sport = htons(inet->inet_num);
sk_dst_reset(sk);
- sk->sk_prot->hash(sk);
+ err = sk->sk_prot->hash(sk);
- return 0;
+ if (likely(!err))
+ return 0;
}
- sk->sk_state = TCP_CLOSE;
- __reqsk_queue_destroy(&icsk->icsk_accept_queue);
- return -EADDRINUSE;
+ inet_sk_set_state(sk, TCP_CLOSE);
+ return err;
}
-EXPORT_SYMBOL_GPL(inet_csk_listen_start);
+static void inet_child_forget(struct sock *sk, struct request_sock *req,
+ struct sock *child)
+{
+ sk->sk_prot->disconnect(child, O_NONBLOCK);
+
+ sock_orphan(child);
+
+ tcp_orphan_count_inc();
+
+ if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) {
+ BUG_ON(rcu_access_pointer(tcp_sk(child)->fastopen_rsk) != req);
+ BUG_ON(sk != req->rsk_listener);
+
+ /* Paranoid, to prevent race condition if
+ * an inbound pkt destined for child is
+ * blocked by sock lock in tcp_v4_rcv().
+ * Also to satisfy an assertion in
+ * tcp_v4_destroy_sock().
+ */
+ RCU_INIT_POINTER(tcp_sk(child)->fastopen_rsk, NULL);
+ }
+ inet_csk_destroy_sock(child);
+}
+
+struct sock *inet_csk_reqsk_queue_add(struct sock *sk,
+ struct request_sock *req,
+ struct sock *child)
+{
+ struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
+
+ spin_lock(&queue->rskq_lock);
+ if (unlikely(sk->sk_state != TCP_LISTEN)) {
+ inet_child_forget(sk, req, child);
+ child = NULL;
+ } else {
+ req->sk = child;
+ req->dl_next = NULL;
+ if (queue->rskq_accept_head == NULL)
+ WRITE_ONCE(queue->rskq_accept_head, req);
+ else
+ queue->rskq_accept_tail->dl_next = req;
+ queue->rskq_accept_tail = req;
+ sk_acceptq_added(sk);
+ }
+ spin_unlock(&queue->rskq_lock);
+ return child;
+}
+EXPORT_SYMBOL(inet_csk_reqsk_queue_add);
+
+struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child,
+ struct request_sock *req, bool own_req)
+{
+ if (own_req) {
+ inet_csk_reqsk_queue_drop(req->rsk_listener, req);
+ reqsk_queue_removed(&inet_csk(req->rsk_listener)->icsk_accept_queue, req);
+
+ if (sk != req->rsk_listener) {
+ /* another listening sk has been selected,
+ * migrate the req to it.
+ */
+ struct request_sock *nreq;
+
+ /* hold a refcnt for the nreq->rsk_listener
+ * which is assigned in inet_reqsk_clone()
+ */
+ sock_hold(sk);
+ nreq = inet_reqsk_clone(req, sk);
+ if (!nreq) {
+ inet_child_forget(sk, req, child);
+ goto child_put;
+ }
+
+ refcount_set(&nreq->rsk_refcnt, 1);
+ if (inet_csk_reqsk_queue_add(sk, nreq, child)) {
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQSUCCESS);
+ reqsk_migrate_reset(req);
+ reqsk_put(req);
+ return child;
+ }
+
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE);
+ reqsk_migrate_reset(nreq);
+ __reqsk_free(nreq);
+ } else if (inet_csk_reqsk_queue_add(sk, req, child)) {
+ return child;
+ }
+ }
+ /* Too bad, another child took ownership of the request, undo. */
+child_put:
+ bh_unlock_sock(child);
+ sock_put(child);
+ return NULL;
+}
/*
* This routine closes sockets which have been at least partially
@@ -591,13 +1448,8 @@ EXPORT_SYMBOL_GPL(inet_csk_listen_start);
void inet_csk_listen_stop(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
- struct request_sock *acc_req;
- struct request_sock *req;
-
- inet_csk_delete_keepalive_timer(sk);
-
- /* make all the listen_opt local to us */
- acc_req = reqsk_queue_yank_acceptq(&icsk->icsk_accept_queue);
+ struct request_sock_queue *queue = &icsk->icsk_accept_queue;
+ struct request_sock *next, *req;
/* Following specs, it would be better either to send FIN
* (and enter FIN-WAIT-1, it is normal close)
@@ -607,95 +1459,98 @@ void inet_csk_listen_stop(struct sock *sk)
* To be honest, we are not able to make either
* of the variants now. --ANK
*/
- reqsk_queue_destroy(&icsk->icsk_accept_queue);
-
- while ((req = acc_req) != NULL) {
- struct sock *child = req->sk;
-
- acc_req = req->dl_next;
+ while ((req = reqsk_queue_remove(queue, sk)) != NULL) {
+ struct sock *child = req->sk, *nsk;
+ struct request_sock *nreq;
local_bh_disable();
bh_lock_sock(child);
- BUG_TRAP(!sock_owned_by_user(child));
+ WARN_ON(sock_owned_by_user(child));
sock_hold(child);
- sk->sk_prot->disconnect(child, O_NONBLOCK);
-
- sock_orphan(child);
-
- atomic_inc(sk->sk_prot->orphan_count);
+ nsk = reuseport_migrate_sock(sk, child, NULL);
+ if (nsk) {
+ nreq = inet_reqsk_clone(req, nsk);
+ if (nreq) {
+ refcount_set(&nreq->rsk_refcnt, 1);
+
+ if (inet_csk_reqsk_queue_add(nsk, nreq, child)) {
+ __NET_INC_STATS(sock_net(nsk),
+ LINUX_MIB_TCPMIGRATEREQSUCCESS);
+ reqsk_migrate_reset(req);
+ } else {
+ __NET_INC_STATS(sock_net(nsk),
+ LINUX_MIB_TCPMIGRATEREQFAILURE);
+ reqsk_migrate_reset(nreq);
+ __reqsk_free(nreq);
+ }
- inet_csk_destroy_sock(child);
+ /* inet_csk_reqsk_queue_add() has already
+ * called inet_child_forget() on failure case.
+ */
+ goto skip_child_forget;
+ }
+ }
+ inet_child_forget(sk, req, child);
+skip_child_forget:
+ reqsk_put(req);
bh_unlock_sock(child);
local_bh_enable();
sock_put(child);
- sk_acceptq_removed(sk);
- __reqsk_free(req);
+ cond_resched();
}
- BUG_TRAP(!sk->sk_ack_backlog);
+ if (queue->fastopenq.rskq_rst_head) {
+ /* Free all the reqs queued in rskq_rst_head. */
+ spin_lock_bh(&queue->fastopenq.lock);
+ req = queue->fastopenq.rskq_rst_head;
+ queue->fastopenq.rskq_rst_head = NULL;
+ spin_unlock_bh(&queue->fastopenq.lock);
+ while (req != NULL) {
+ next = req->dl_next;
+ reqsk_put(req);
+ req = next;
+ }
+ }
+ WARN_ON_ONCE(sk->sk_ack_backlog);
}
-
EXPORT_SYMBOL_GPL(inet_csk_listen_stop);
-void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr)
+static struct dst_entry *inet_csk_rebuild_route(struct sock *sk, struct flowi *fl)
{
- struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
const struct inet_sock *inet = inet_sk(sk);
+ struct flowi4 *fl4;
+ struct rtable *rt;
- sin->sin_family = AF_INET;
- sin->sin_addr.s_addr = inet->daddr;
- sin->sin_port = inet->dport;
+ rcu_read_lock();
+ fl4 = &fl->u.ip4;
+ inet_sk_init_flowi4(inet, fl4);
+ rt = ip_route_output_flow(sock_net(sk), fl4, sk);
+ if (IS_ERR(rt))
+ rt = NULL;
+ if (rt)
+ sk_setup_caps(sk, &rt->dst);
+ rcu_read_unlock();
+
+ return &rt->dst;
}
-EXPORT_SYMBOL_GPL(inet_csk_addr2sockaddr);
-
-int inet_csk_ctl_sock_create(struct socket **sock, unsigned short family,
- unsigned short type, unsigned char protocol)
+struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu)
{
- int rc = sock_create_kern(family, type, protocol, sock);
+ struct dst_entry *dst = __sk_dst_check(sk, 0);
+ struct inet_sock *inet = inet_sk(sk);
- if (rc == 0) {
- (*sock)->sk->sk_allocation = GFP_ATOMIC;
- inet_sk((*sock)->sk)->uc_ttl = -1;
- /*
- * Unhash it so that IP input processing does not even see it,
- * we do not wish this socket to see incoming packets.
- */
- (*sock)->sk->sk_prot->unhash((*sock)->sk);
+ if (!dst) {
+ dst = inet_csk_rebuild_route(sk, &inet->cork.fl);
+ if (!dst)
+ goto out;
}
- return rc;
-}
-
-EXPORT_SYMBOL_GPL(inet_csk_ctl_sock_create);
+ dst->ops->update_pmtu(dst, sk, NULL, mtu, true);
-#ifdef CONFIG_COMPAT
-int inet_csk_compat_getsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int __user *optlen)
-{
- const struct inet_connection_sock *icsk = inet_csk(sk);
-
- if (icsk->icsk_af_ops->compat_getsockopt != NULL)
- return icsk->icsk_af_ops->compat_getsockopt(sk, level, optname,
- optval, optlen);
- return icsk->icsk_af_ops->getsockopt(sk, level, optname,
- optval, optlen);
-}
-
-EXPORT_SYMBOL_GPL(inet_csk_compat_getsockopt);
-
-int inet_csk_compat_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int optlen)
-{
- const struct inet_connection_sock *icsk = inet_csk(sk);
-
- if (icsk->icsk_af_ops->compat_setsockopt != NULL)
- return icsk->icsk_af_ops->compat_setsockopt(sk, level, optname,
- optval, optlen);
- return icsk->icsk_af_ops->setsockopt(sk, level, optname,
- optval, optlen);
+ dst = __sk_dst_check(sk, 0);
+ if (!dst)
+ dst = inet_csk_rebuild_route(sk, &inet->cork.fl);
+out:
+ return dst;
}
-
-EXPORT_SYMBOL_GPL(inet_csk_compat_setsockopt);
-#endif
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 77761ac4f7bb..3f5b1418a610 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -1,20 +1,16 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* inet_diag.c Module for monitoring INET transport protocols sockets.
*
- * Version: $Id: inet_diag.c,v 1.3 2002/02/01 22:01:04 davem Exp $
- *
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
+#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/random.h>
+#include <linux/slab.h>
#include <linux/cache.h>
#include <linux/init.h>
#include <linux/time.h>
@@ -24,273 +20,414 @@
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/inet_connection_sock.h>
-#include <net/inet_hashtables.h>
-#include <net/inet_timewait_sock.h>
-#include <net/inet6_hashtables.h>
+#include <net/bpf_sk_storage.h>
+#include <net/netlink.h>
#include <linux/inet.h>
#include <linux/stddef.h>
#include <linux/inet_diag.h>
+#include <linux/sock_diag.h>
-static const struct inet_diag_handler **inet_diag_table;
+static const struct inet_diag_handler __rcu **inet_diag_table;
struct inet_diag_entry {
- __be32 *saddr;
- __be32 *daddr;
+ const __be32 *saddr;
+ const __be32 *daddr;
u16 sport;
u16 dport;
u16 family;
u16 userlocks;
+ u32 ifindex;
+ u32 mark;
+#ifdef CONFIG_SOCK_CGROUP_DATA
+ u64 cgroup_id;
+#endif
};
-static struct sock *idiagnl;
-
-#define INET_DIAG_PUT(skb, attrtype, attrlen) \
- RTA_DATA(__RTA_PUT(skb, attrtype, attrlen))
-
-static int inet_csk_diag_fill(struct sock *sk,
- struct sk_buff *skb,
- int ext, u32 pid, u32 seq, u16 nlmsg_flags,
- const struct nlmsghdr *unlh)
+static const struct inet_diag_handler *inet_diag_lock_handler(int proto)
{
- const struct inet_sock *inet = inet_sk(sk);
- const struct inet_connection_sock *icsk = inet_csk(sk);
- struct inet_diag_msg *r;
- struct nlmsghdr *nlh;
- void *info = NULL;
- struct inet_diag_meminfo *minfo = NULL;
- unsigned char *b = skb->tail;
const struct inet_diag_handler *handler;
- handler = inet_diag_table[unlh->nlmsg_type];
- BUG_ON(handler == NULL);
+ if (proto < 0 || proto >= IPPROTO_MAX)
+ return NULL;
+
+ if (!READ_ONCE(inet_diag_table[proto]))
+ sock_load_diag_module(AF_INET, proto);
- nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r));
- nlh->nlmsg_flags = nlmsg_flags;
+ rcu_read_lock();
+ handler = rcu_dereference(inet_diag_table[proto]);
+ if (handler && !try_module_get(handler->owner))
+ handler = NULL;
+ rcu_read_unlock();
- r = NLMSG_DATA(nlh);
- BUG_ON(sk->sk_state == TCP_TIME_WAIT);
+ return handler;
+}
- if (ext & (1 << (INET_DIAG_MEMINFO - 1)))
- minfo = INET_DIAG_PUT(skb, INET_DIAG_MEMINFO, sizeof(*minfo));
+static void inet_diag_unlock_handler(const struct inet_diag_handler *handler)
+{
+ module_put(handler->owner);
+}
+
+void inet_diag_msg_common_fill(struct inet_diag_msg *r, struct sock *sk)
+{
+ r->idiag_family = READ_ONCE(sk->sk_family);
- if (ext & (1 << (INET_DIAG_INFO - 1)))
- info = INET_DIAG_PUT(skb, INET_DIAG_INFO,
- handler->idiag_info_size);
+ r->id.idiag_sport = htons(READ_ONCE(sk->sk_num));
+ r->id.idiag_dport = READ_ONCE(sk->sk_dport);
+ r->id.idiag_if = READ_ONCE(sk->sk_bound_dev_if);
+ sock_diag_save_cookie(sk, r->id.idiag_cookie);
- if ((ext & (1 << (INET_DIAG_CONG - 1))) && icsk->icsk_ca_ops) {
- const size_t len = strlen(icsk->icsk_ca_ops->name);
+#if IS_ENABLED(CONFIG_IPV6)
+ if (r->idiag_family == AF_INET6) {
+ data_race(*(struct in6_addr *)r->id.idiag_src = sk->sk_v6_rcv_saddr);
+ data_race(*(struct in6_addr *)r->id.idiag_dst = sk->sk_v6_daddr);
+ } else
+#endif
+ {
+ memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src));
+ memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst));
- strcpy(INET_DIAG_PUT(skb, INET_DIAG_CONG, len + 1),
- icsk->icsk_ca_ops->name);
+ r->id.idiag_src[0] = READ_ONCE(sk->sk_rcv_saddr);
+ r->id.idiag_dst[0] = READ_ONCE(sk->sk_daddr);
}
+}
+EXPORT_SYMBOL_GPL(inet_diag_msg_common_fill);
- r->idiag_family = sk->sk_family;
- r->idiag_state = sk->sk_state;
- r->idiag_timer = 0;
- r->idiag_retrans = 0;
+int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb,
+ struct inet_diag_msg *r, int ext,
+ struct user_namespace *user_ns,
+ bool net_admin)
+{
+ const struct inet_sock *inet = inet_sk(sk);
+ struct inet_diag_sockopt inet_sockopt;
- r->id.idiag_if = sk->sk_bound_dev_if;
- r->id.idiag_cookie[0] = (u32)(unsigned long)sk;
- r->id.idiag_cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1);
+ if (nla_put_u8(skb, INET_DIAG_SHUTDOWN, sk->sk_shutdown))
+ goto errout;
- r->id.idiag_sport = inet->sport;
- r->id.idiag_dport = inet->dport;
- r->id.idiag_src[0] = inet->rcv_saddr;
- r->id.idiag_dst[0] = inet->daddr;
+ /* IPv6 dual-stack sockets use inet->tos for IPv4 connections,
+ * hence this needs to be included regardless of socket family.
+ */
+ if (ext & (1 << (INET_DIAG_TOS - 1)))
+ if (nla_put_u8(skb, INET_DIAG_TOS, READ_ONCE(inet->tos)) < 0)
+ goto errout;
-#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+#if IS_ENABLED(CONFIG_IPV6)
if (r->idiag_family == AF_INET6) {
- struct ipv6_pinfo *np = inet6_sk(sk);
-
- ipv6_addr_copy((struct in6_addr *)r->id.idiag_src,
- &np->rcv_saddr);
- ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst,
- &np->daddr);
+ if (ext & (1 << (INET_DIAG_TCLASS - 1)))
+ if (nla_put_u8(skb, INET_DIAG_TCLASS,
+ inet6_sk(sk)->tclass) < 0)
+ goto errout;
+
+ if (((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) &&
+ nla_put_u8(skb, INET_DIAG_SKV6ONLY, ipv6_only_sock(sk)))
+ goto errout;
}
#endif
-#define EXPIRES_IN_MS(tmo) ((tmo - jiffies) * 1000 + HZ - 1) / HZ
+ if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, READ_ONCE(sk->sk_mark)))
+ goto errout;
- if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
- r->idiag_timer = 1;
- r->idiag_retrans = icsk->icsk_retransmits;
- r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout);
- } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
- r->idiag_timer = 4;
- r->idiag_retrans = icsk->icsk_probes_out;
- r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout);
- } else if (timer_pending(&sk->sk_timer)) {
- r->idiag_timer = 2;
- r->idiag_retrans = icsk->icsk_probes_out;
- r->idiag_expires = EXPIRES_IN_MS(sk->sk_timer.expires);
- } else {
- r->idiag_timer = 0;
- r->idiag_expires = 0;
- }
-#undef EXPIRES_IN_MS
+ if (ext & (1 << (INET_DIAG_CLASS_ID - 1)) ||
+ ext & (1 << (INET_DIAG_TCLASS - 1))) {
+ u32 classid = 0;
- r->idiag_uid = sock_i_uid(sk);
- r->idiag_inode = sock_i_ino(sk);
-
- if (minfo) {
- minfo->idiag_rmem = atomic_read(&sk->sk_rmem_alloc);
- minfo->idiag_wmem = sk->sk_wmem_queued;
- minfo->idiag_fmem = sk->sk_forward_alloc;
- minfo->idiag_tmem = atomic_read(&sk->sk_wmem_alloc);
+#ifdef CONFIG_CGROUP_NET_CLASSID
+ classid = sock_cgroup_classid(&sk->sk_cgrp_data);
+#endif
+ /* Fallback to socket priority if class id isn't set.
+ * Classful qdiscs use it as direct reference to class.
+ * For cgroup2 classid is always zero.
+ */
+ if (!classid)
+ classid = READ_ONCE(sk->sk_priority);
+
+ if (nla_put_u32(skb, INET_DIAG_CLASS_ID, classid))
+ goto errout;
}
- handler->idiag_get_info(sk, r, info);
+#ifdef CONFIG_SOCK_CGROUP_DATA
+ if (nla_put_u64_64bit(skb, INET_DIAG_CGROUP_ID,
+ cgroup_id(sock_cgroup_ptr(&sk->sk_cgrp_data)),
+ INET_DIAG_PAD))
+ goto errout;
+#endif
- if (sk->sk_state < TCP_TIME_WAIT &&
- icsk->icsk_ca_ops && icsk->icsk_ca_ops->get_info)
- icsk->icsk_ca_ops->get_info(sk, ext, skb);
+ r->idiag_uid = from_kuid_munged(user_ns, sk_uid(sk));
+ r->idiag_inode = sock_i_ino(sk);
- nlh->nlmsg_len = skb->tail - b;
- return skb->len;
+ memset(&inet_sockopt, 0, sizeof(inet_sockopt));
+ inet_sockopt.recverr = inet_test_bit(RECVERR, sk);
+ inet_sockopt.is_icsk = inet_test_bit(IS_ICSK, sk);
+ inet_sockopt.freebind = inet_test_bit(FREEBIND, sk);
+ inet_sockopt.hdrincl = inet_test_bit(HDRINCL, sk);
+ inet_sockopt.mc_loop = inet_test_bit(MC_LOOP, sk);
+ inet_sockopt.transparent = inet_test_bit(TRANSPARENT, sk);
+ inet_sockopt.mc_all = inet_test_bit(MC_ALL, sk);
+ inet_sockopt.nodefrag = inet_test_bit(NODEFRAG, sk);
+ inet_sockopt.bind_address_no_port = inet_test_bit(BIND_ADDRESS_NO_PORT, sk);
+ inet_sockopt.recverr_rfc4884 = inet_test_bit(RECVERR_RFC4884, sk);
+ inet_sockopt.defer_connect = inet_test_bit(DEFER_CONNECT, sk);
+ if (nla_put(skb, INET_DIAG_SOCKOPT, sizeof(inet_sockopt),
+ &inet_sockopt))
+ goto errout;
-rtattr_failure:
-nlmsg_failure:
- skb_trim(skb, b - skb->data);
- return -1;
+ return 0;
+errout:
+ return 1;
}
+EXPORT_SYMBOL_GPL(inet_diag_msg_attrs_fill);
-static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
- struct sk_buff *skb, int ext, u32 pid,
- u32 seq, u16 nlmsg_flags,
- const struct nlmsghdr *unlh)
+static int inet_diag_parse_attrs(const struct nlmsghdr *nlh, int hdrlen,
+ struct nlattr **req_nlas)
{
- long tmo;
- struct inet_diag_msg *r;
- const unsigned char *previous_tail = skb->tail;
- struct nlmsghdr *nlh = NLMSG_PUT(skb, pid, seq,
- unlh->nlmsg_type, sizeof(*r));
-
- r = NLMSG_DATA(nlh);
- BUG_ON(tw->tw_state != TCP_TIME_WAIT);
-
- nlh->nlmsg_flags = nlmsg_flags;
-
- tmo = tw->tw_ttd - jiffies;
- if (tmo < 0)
- tmo = 0;
-
- r->idiag_family = tw->tw_family;
- r->idiag_state = tw->tw_state;
- r->idiag_timer = 0;
- r->idiag_retrans = 0;
- r->id.idiag_if = tw->tw_bound_dev_if;
- r->id.idiag_cookie[0] = (u32)(unsigned long)tw;
- r->id.idiag_cookie[1] = (u32)(((unsigned long)tw >> 31) >> 1);
- r->id.idiag_sport = tw->tw_sport;
- r->id.idiag_dport = tw->tw_dport;
- r->id.idiag_src[0] = tw->tw_rcv_saddr;
- r->id.idiag_dst[0] = tw->tw_daddr;
- r->idiag_state = tw->tw_substate;
- r->idiag_timer = 3;
- r->idiag_expires = (tmo * 1000 + HZ - 1) / HZ;
- r->idiag_rqueue = 0;
- r->idiag_wqueue = 0;
- r->idiag_uid = 0;
- r->idiag_inode = 0;
-#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
- if (tw->tw_family == AF_INET6) {
- const struct inet6_timewait_sock *tw6 =
- inet6_twsk((struct sock *)tw);
-
- ipv6_addr_copy((struct in6_addr *)r->id.idiag_src,
- &tw6->tw_v6_rcv_saddr);
- ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst,
- &tw6->tw_v6_daddr);
+ struct nlattr *nla;
+ int remaining;
+
+ nlmsg_for_each_attr(nla, nlh, hdrlen, remaining) {
+ int type = nla_type(nla);
+
+ if (type == INET_DIAG_REQ_PROTOCOL && nla_len(nla) != sizeof(u32))
+ return -EINVAL;
+
+ if (type < __INET_DIAG_REQ_MAX)
+ req_nlas[type] = nla;
}
-#endif
- nlh->nlmsg_len = skb->tail - previous_tail;
- return skb->len;
-nlmsg_failure:
- skb_trim(skb, previous_tail - skb->data);
- return -1;
+ return 0;
}
-static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
- int ext, u32 pid, u32 seq, u16 nlmsg_flags,
- const struct nlmsghdr *unlh)
+static int inet_diag_get_protocol(const struct inet_diag_req_v2 *req,
+ const struct inet_diag_dump_data *data)
{
- if (sk->sk_state == TCP_TIME_WAIT)
- return inet_twsk_diag_fill((struct inet_timewait_sock *)sk,
- skb, ext, pid, seq, nlmsg_flags,
- unlh);
- return inet_csk_diag_fill(sk, skb, ext, pid, seq, nlmsg_flags, unlh);
+ if (data->req_nlas[INET_DIAG_REQ_PROTOCOL])
+ return nla_get_u32(data->req_nlas[INET_DIAG_REQ_PROTOCOL]);
+ return req->sdiag_protocol;
}
-static int inet_diag_get_exact(struct sk_buff *in_skb,
- const struct nlmsghdr *nlh)
+#define MAX_DUMP_ALLOC_SIZE (KMALLOC_MAX_SIZE - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))
+
+int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
+ struct sk_buff *skb, struct netlink_callback *cb,
+ const struct inet_diag_req_v2 *req,
+ u16 nlmsg_flags, bool net_admin)
{
- int err;
- struct sock *sk;
- struct inet_diag_req *req = NLMSG_DATA(nlh);
- struct sk_buff *rep;
- struct inet_hashinfo *hashinfo;
+ const struct tcp_congestion_ops *ca_ops;
const struct inet_diag_handler *handler;
+ struct inet_diag_dump_data *cb_data;
+ int ext = req->idiag_ext;
+ struct inet_diag_msg *r;
+ struct nlmsghdr *nlh;
+ struct nlattr *attr;
+ void *info = NULL;
+ u8 icsk_pending;
+ int protocol;
+
+ cb_data = cb->data;
+ protocol = inet_diag_get_protocol(req, cb_data);
+
+ /* inet_diag_lock_handler() made sure inet_diag_table[] is stable. */
+ handler = rcu_dereference_protected(inet_diag_table[protocol], 1);
+ DEBUG_NET_WARN_ON_ONCE(!handler);
+ if (!handler)
+ return -ENXIO;
- handler = inet_diag_table[nlh->nlmsg_type];
- BUG_ON(handler == NULL);
- hashinfo = handler->idiag_hashinfo;
+ nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ cb->nlh->nlmsg_type, sizeof(*r), nlmsg_flags);
+ if (!nlh)
+ return -EMSGSIZE;
- if (req->idiag_family == AF_INET) {
- sk = inet_lookup(hashinfo, req->id.idiag_dst[0],
- req->id.idiag_dport, req->id.idiag_src[0],
- req->id.idiag_sport, req->id.idiag_if);
+ r = nlmsg_data(nlh);
+ BUG_ON(!sk_fullsock(sk));
+
+ inet_diag_msg_common_fill(r, sk);
+ r->idiag_state = sk->sk_state;
+ r->idiag_timer = 0;
+ r->idiag_retrans = 0;
+ r->idiag_expires = 0;
+
+ if (inet_diag_msg_attrs_fill(sk, skb, r, ext,
+ sk_user_ns(NETLINK_CB(cb->skb).sk),
+ net_admin))
+ goto errout;
+
+ if (ext & (1 << (INET_DIAG_MEMINFO - 1))) {
+ struct inet_diag_meminfo minfo = {
+ .idiag_rmem = sk_rmem_alloc_get(sk),
+ .idiag_wmem = READ_ONCE(sk->sk_wmem_queued),
+ .idiag_fmem = READ_ONCE(sk->sk_forward_alloc),
+ .idiag_tmem = sk_wmem_alloc_get(sk),
+ };
+
+ if (nla_put(skb, INET_DIAG_MEMINFO, sizeof(minfo), &minfo) < 0)
+ goto errout;
}
-#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
- else if (req->idiag_family == AF_INET6) {
- sk = inet6_lookup(hashinfo,
- (struct in6_addr *)req->id.idiag_dst,
- req->id.idiag_dport,
- (struct in6_addr *)req->id.idiag_src,
- req->id.idiag_sport,
- req->id.idiag_if);
+
+ if (ext & (1 << (INET_DIAG_SKMEMINFO - 1)))
+ if (sock_diag_put_meminfo(sk, skb, INET_DIAG_SKMEMINFO))
+ goto errout;
+
+ /*
+ * RAW sockets might have user-defined protocols assigned,
+ * so report the one supplied on socket creation.
+ */
+ if (sk->sk_type == SOCK_RAW) {
+ if (nla_put_u8(skb, INET_DIAG_PROTOCOL, sk->sk_protocol))
+ goto errout;
}
-#endif
- else {
- return -EINVAL;
+
+ if (!icsk) {
+ handler->idiag_get_info(sk, r, NULL);
+ goto out;
}
- if (sk == NULL)
- return -ENOENT;
+ icsk_pending = smp_load_acquire(&icsk->icsk_pending);
+ if (icsk_pending == ICSK_TIME_RETRANS ||
+ icsk_pending == ICSK_TIME_REO_TIMEOUT ||
+ icsk_pending == ICSK_TIME_LOSS_PROBE) {
+ r->idiag_timer = 1;
+ r->idiag_retrans = READ_ONCE(icsk->icsk_retransmits);
+ r->idiag_expires =
+ jiffies_delta_to_msecs(tcp_timeout_expires(sk) - jiffies);
+ } else if (icsk_pending == ICSK_TIME_PROBE0) {
+ r->idiag_timer = 4;
+ r->idiag_retrans = READ_ONCE(icsk->icsk_probes_out);
+ r->idiag_expires =
+ jiffies_delta_to_msecs(tcp_timeout_expires(sk) - jiffies);
+ } else if (timer_pending(&icsk->icsk_keepalive_timer)) {
+ r->idiag_timer = 2;
+ r->idiag_retrans = READ_ONCE(icsk->icsk_probes_out);
+ r->idiag_expires =
+ jiffies_delta_to_msecs(icsk->icsk_keepalive_timer.expires - jiffies);
+ }
- err = -ESTALE;
- if ((req->id.idiag_cookie[0] != INET_DIAG_NOCOOKIE ||
- req->id.idiag_cookie[1] != INET_DIAG_NOCOOKIE) &&
- ((u32)(unsigned long)sk != req->id.idiag_cookie[0] ||
- (u32)((((unsigned long)sk) >> 31) >> 1) != req->id.idiag_cookie[1]))
- goto out;
+ if ((ext & (1 << (INET_DIAG_INFO - 1))) && handler->idiag_info_size) {
+ attr = nla_reserve_64bit(skb, INET_DIAG_INFO,
+ handler->idiag_info_size,
+ INET_DIAG_PAD);
+ if (!attr)
+ goto errout;
- err = -ENOMEM;
- rep = alloc_skb(NLMSG_SPACE((sizeof(struct inet_diag_msg) +
- sizeof(struct inet_diag_meminfo) +
- handler->idiag_info_size + 64)),
- GFP_KERNEL);
- if (!rep)
- goto out;
+ info = nla_data(attr);
+ }
+
+ if (ext & (1 << (INET_DIAG_CONG - 1))) {
+ int err = 0;
+
+ rcu_read_lock();
+ ca_ops = READ_ONCE(icsk->icsk_ca_ops);
+ if (ca_ops)
+ err = nla_put_string(skb, INET_DIAG_CONG, ca_ops->name);
+ rcu_read_unlock();
+ if (err < 0)
+ goto errout;
+ }
+
+ handler->idiag_get_info(sk, r, info);
+
+ if (ext & (1 << (INET_DIAG_INFO - 1)) && handler->idiag_get_aux)
+ if (handler->idiag_get_aux(sk, net_admin, skb) < 0)
+ goto errout;
+
+ if (sk->sk_state < TCP_TIME_WAIT) {
+ union tcp_cc_info info;
+ size_t sz = 0;
+ int attr;
+
+ rcu_read_lock();
+ ca_ops = READ_ONCE(icsk->icsk_ca_ops);
+ if (ca_ops && ca_ops->get_info)
+ sz = ca_ops->get_info(sk, ext, &attr, &info);
+ rcu_read_unlock();
+ if (sz && nla_put(skb, attr, sz, &info) < 0)
+ goto errout;
+ }
+
+ /* Keep it at the end for potential retry with a larger skb,
+ * or else do best-effort fitting, which is only done for the
+ * first_nlmsg.
+ */
+ if (cb_data->bpf_stg_diag) {
+ bool first_nlmsg = ((unsigned char *)nlh == skb->data);
+ unsigned int prev_min_dump_alloc;
+ unsigned int total_nla_size = 0;
+ unsigned int msg_len;
+ int err;
+
+ msg_len = skb_tail_pointer(skb) - (unsigned char *)nlh;
+ err = bpf_sk_storage_diag_put(cb_data->bpf_stg_diag, sk, skb,
+ INET_DIAG_SK_BPF_STORAGES,
+ &total_nla_size);
+
+ if (!err)
+ goto out;
+
+ total_nla_size += msg_len;
+ prev_min_dump_alloc = cb->min_dump_alloc;
+ if (total_nla_size > prev_min_dump_alloc)
+ cb->min_dump_alloc = min_t(u32, total_nla_size,
+ MAX_DUMP_ALLOC_SIZE);
+
+ if (!first_nlmsg)
+ goto errout;
- if (sk_diag_fill(sk, rep, req->idiag_ext,
- NETLINK_CB(in_skb).pid,
- nlh->nlmsg_seq, 0, nlh) <= 0)
- BUG();
+ if (cb->min_dump_alloc > prev_min_dump_alloc)
+ /* Retry with pskb_expand_head() with
+ * __GFP_DIRECT_RECLAIM
+ */
+ goto errout;
- err = netlink_unicast(idiagnl, rep, NETLINK_CB(in_skb).pid,
- MSG_DONTWAIT);
- if (err > 0)
- err = 0;
+ WARN_ON_ONCE(total_nla_size <= prev_min_dump_alloc);
+
+ /* Send what we have for this sk
+ * and move on to the next sk in the following
+ * dump()
+ */
+ }
out:
- if (sk) {
- if (sk->sk_state == TCP_TIME_WAIT)
- inet_twsk_put((struct inet_timewait_sock *)sk);
- else
- sock_put(sk);
+ nlmsg_end(skb, nlh);
+ return 0;
+
+errout:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+EXPORT_SYMBOL_GPL(inet_sk_diag_fill);
+
+static int inet_diag_cmd_exact(int cmd, struct sk_buff *in_skb,
+ const struct nlmsghdr *nlh,
+ int hdrlen,
+ const struct inet_diag_req_v2 *req)
+{
+ const struct inet_diag_handler *handler;
+ struct inet_diag_dump_data dump_data;
+ int err, protocol;
+
+ memset(&dump_data, 0, sizeof(dump_data));
+ err = inet_diag_parse_attrs(nlh, hdrlen, dump_data.req_nlas);
+ if (err)
+ return err;
+
+ protocol = inet_diag_get_protocol(req, &dump_data);
+
+ handler = inet_diag_lock_handler(protocol);
+ if (!handler)
+ return -ENOENT;
+
+ if (cmd == SOCK_DIAG_BY_FAMILY) {
+ struct netlink_callback cb = {
+ .nlh = nlh,
+ .skb = in_skb,
+ .data = &dump_data,
+ };
+ err = handler->dump_one(&cb, req);
+ } else if (cmd == SOCK_DESTROY && handler->destroy) {
+ err = handler->destroy(in_skb, req);
+ } else {
+ err = -EOPNOTSUPP;
}
+ inet_diag_unlock_handler(handler);
+
return err;
}
@@ -320,10 +457,12 @@ static int bitstring_match(const __be32 *a1, const __be32 *a2, int bits)
return 1;
}
-
-static int inet_diag_bc_run(const void *bc, int len,
+static int inet_diag_bc_run(const struct nlattr *_bc,
const struct inet_diag_entry *entry)
{
+ const void *bc = nla_data(_bc);
+ int len = nla_len(_bc);
+
while (len > 0) {
int yes = 1;
const struct inet_diag_bc_op *op = bc;
@@ -334,11 +473,17 @@ static int inet_diag_bc_run(const void *bc, int len,
case INET_DIAG_BC_JMP:
yes = 0;
break;
+ case INET_DIAG_BC_S_EQ:
+ yes = entry->sport == op[1].no;
+ break;
case INET_DIAG_BC_S_GE:
yes = entry->sport >= op[1].no;
break;
case INET_DIAG_BC_S_LE:
- yes = entry->dport <= op[1].no;
+ yes = entry->sport <= op[1].no;
+ break;
+ case INET_DIAG_BC_D_EQ:
+ yes = entry->dport == op[1].no;
break;
case INET_DIAG_BC_D_GE:
yes = entry->dport >= op[1].no;
@@ -351,10 +496,10 @@ static int inet_diag_bc_run(const void *bc, int len,
break;
case INET_DIAG_BC_S_COND:
case INET_DIAG_BC_D_COND: {
- struct inet_diag_hostcond *cond;
- __be32 *addr;
+ const struct inet_diag_hostcond *cond;
+ const __be32 *addr;
- cond = (struct inet_diag_hostcond *)(op + 1);
+ cond = (const struct inet_diag_hostcond *)(op + 1);
if (cond->port != -1 &&
cond->port != (op->code == INET_DIAG_BC_S_COND ?
entry->sport : entry->dport)) {
@@ -362,28 +507,60 @@ static int inet_diag_bc_run(const void *bc, int len,
break;
}
- if (cond->prefix_len == 0)
- break;
-
if (op->code == INET_DIAG_BC_S_COND)
addr = entry->saddr;
else
addr = entry->daddr;
+ if (cond->family != AF_UNSPEC &&
+ cond->family != entry->family) {
+ if (entry->family == AF_INET6 &&
+ cond->family == AF_INET) {
+ if (addr[0] == 0 && addr[1] == 0 &&
+ addr[2] == htonl(0xffff) &&
+ bitstring_match(addr + 3,
+ cond->addr,
+ cond->prefix_len))
+ break;
+ }
+ yes = 0;
+ break;
+ }
+
+ if (cond->prefix_len == 0)
+ break;
if (bitstring_match(addr, cond->addr,
cond->prefix_len))
break;
- if (entry->family == AF_INET6 &&
- cond->family == AF_INET) {
- if (addr[0] == 0 && addr[1] == 0 &&
- addr[2] == htonl(0xffff) &&
- bitstring_match(addr + 3, cond->addr,
- cond->prefix_len))
- break;
- }
yes = 0;
break;
}
+ case INET_DIAG_BC_DEV_COND: {
+ u32 ifindex;
+
+ ifindex = *((const u32 *)(op + 1));
+ if (ifindex != entry->ifindex)
+ yes = 0;
+ break;
+ }
+ case INET_DIAG_BC_MARK_COND: {
+ struct inet_diag_markcond *cond;
+
+ cond = (struct inet_diag_markcond *)(op + 1);
+ if ((entry->mark & cond->mask) != cond->mark)
+ yes = 0;
+ break;
+ }
+#ifdef CONFIG_SOCK_CGROUP_DATA
+ case INET_DIAG_BC_CGROUP_COND: {
+ u64 cgroup_id;
+
+ cgroup_id = get_unaligned((const u64 *)(op + 1));
+ if (cgroup_id != entry->cgroup_id)
+ yes = 0;
+ break;
+ }
+#endif
}
if (yes) {
@@ -394,8 +571,61 @@ static int inet_diag_bc_run(const void *bc, int len,
bc += op->no;
}
}
- return (len == 0);
+ return len == 0;
+}
+
+/* This helper is available for all sockets (ESTABLISH, TIMEWAIT, SYN_RECV)
+ */
+static void entry_fill_addrs(struct inet_diag_entry *entry,
+ const struct sock *sk)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ if (entry->family == AF_INET6) {
+ entry->saddr = sk->sk_v6_rcv_saddr.s6_addr32;
+ entry->daddr = sk->sk_v6_daddr.s6_addr32;
+ } else
+#endif
+ {
+ entry->saddr = &sk->sk_rcv_saddr;
+ entry->daddr = &sk->sk_daddr;
+ }
+}
+
+int inet_diag_bc_sk(const struct inet_diag_dump_data *cb_data, struct sock *sk)
+{
+ const struct nlattr *bc = cb_data->inet_diag_nla_bc;
+ const struct inet_sock *inet = inet_sk(sk);
+ struct inet_diag_entry entry;
+
+ if (!bc)
+ return 1;
+
+ entry.family = READ_ONCE(sk->sk_family);
+ entry_fill_addrs(&entry, sk);
+ entry.sport = READ_ONCE(inet->inet_num);
+ entry.dport = ntohs(READ_ONCE(inet->inet_dport));
+ entry.ifindex = READ_ONCE(sk->sk_bound_dev_if);
+ if (cb_data->userlocks_needed)
+ entry.userlocks = sk_fullsock(sk) ? READ_ONCE(sk->sk_userlocks) : 0;
+ if (cb_data->mark_needed) {
+ if (sk_fullsock(sk))
+ entry.mark = READ_ONCE(sk->sk_mark);
+ else if (sk->sk_state == TCP_NEW_SYN_RECV)
+ entry.mark = inet_rsk(inet_reqsk(sk))->ir_mark;
+ else if (sk->sk_state == TCP_TIME_WAIT)
+ entry.mark = inet_twsk(sk)->tw_mark;
+ else
+ entry.mark = 0;
+ }
+#ifdef CONFIG_SOCK_CGROUP_DATA
+ if (cb_data->cgroup_needed)
+ entry.cgroup_id = sk_fullsock(sk) ?
+ cgroup_id(sock_cgroup_ptr(&sk->sk_cgrp_data)) : 0;
+#endif
+
+ return inet_diag_bc_run(bc, &entry);
}
+EXPORT_SYMBOL_GPL(inet_diag_bc_sk);
static int valid_cc(const void *bc, int len, int cc)
{
@@ -406,7 +636,7 @@ static int valid_cc(const void *bc, int len, int cc)
return 0;
if (cc == len)
return 1;
- if (op->yes < 4)
+ if (op->yes < 4 || op->yes & 3)
return 0;
len -= op->yes;
bc += op->yes;
@@ -414,477 +644,416 @@ static int valid_cc(const void *bc, int len, int cc)
return 0;
}
-static int inet_diag_bc_audit(const void *bytecode, int bytecode_len)
+/* data is u32 ifindex */
+static bool valid_devcond(const struct inet_diag_bc_op *op, int len,
+ int *min_len)
{
- const unsigned char *bc = bytecode;
- int len = bytecode_len;
+ /* Check ifindex space. */
+ *min_len += sizeof(u32);
+ if (len < *min_len)
+ return false;
+
+ return true;
+}
+/* Validate an inet_diag_hostcond. */
+static bool valid_hostcond(const struct inet_diag_bc_op *op, int len,
+ int *min_len)
+{
+ struct inet_diag_hostcond *cond;
+ int addr_len;
+
+ /* Check hostcond space. */
+ *min_len += sizeof(struct inet_diag_hostcond);
+ if (len < *min_len)
+ return false;
+ cond = (struct inet_diag_hostcond *)(op + 1);
+
+ /* Check address family and address length. */
+ switch (cond->family) {
+ case AF_UNSPEC:
+ addr_len = 0;
+ break;
+ case AF_INET:
+ addr_len = sizeof(struct in_addr);
+ break;
+ case AF_INET6:
+ addr_len = sizeof(struct in6_addr);
+ break;
+ default:
+ return false;
+ }
+ *min_len += addr_len;
+ if (len < *min_len)
+ return false;
+
+ /* Check prefix length (in bits) vs address length (in bytes). */
+ if (cond->prefix_len > 8 * addr_len)
+ return false;
+
+ return true;
+}
+
+/* Validate a port comparison operator. */
+static bool valid_port_comparison(const struct inet_diag_bc_op *op,
+ int len, int *min_len)
+{
+ /* Port comparisons put the port in a follow-on inet_diag_bc_op. */
+ *min_len += sizeof(struct inet_diag_bc_op);
+ if (len < *min_len)
+ return false;
+ return true;
+}
+
+static bool valid_markcond(const struct inet_diag_bc_op *op, int len,
+ int *min_len)
+{
+ *min_len += sizeof(struct inet_diag_markcond);
+ return len >= *min_len;
+}
+
+#ifdef CONFIG_SOCK_CGROUP_DATA
+static bool valid_cgroupcond(const struct inet_diag_bc_op *op, int len,
+ int *min_len)
+{
+ *min_len += sizeof(u64);
+ return len >= *min_len;
+}
+#endif
+
+static int inet_diag_bc_audit(struct inet_diag_dump_data *cb_data,
+ const struct sk_buff *skb)
+{
+ const struct nlattr *attr = cb_data->inet_diag_nla_bc;
+ const void *bytecode, *bc;
+ int bytecode_len, len;
+ bool net_admin;
+
+ if (!attr)
+ return 0;
+
+ if (nla_len(attr) < sizeof(struct inet_diag_bc_op))
+ return -EINVAL;
+
+ net_admin = netlink_net_capable(skb, CAP_NET_ADMIN);
+ bytecode = bc = nla_data(attr);
+ len = bytecode_len = nla_len(attr);
while (len > 0) {
- struct inet_diag_bc_op *op = (struct inet_diag_bc_op *)bc;
+ int min_len = sizeof(struct inet_diag_bc_op);
+ const struct inet_diag_bc_op *op = bc;
-//printk("BC: %d %d %d {%d} / %d\n", op->code, op->yes, op->no, op[1].no, len);
switch (op->code) {
- case INET_DIAG_BC_AUTO:
case INET_DIAG_BC_S_COND:
case INET_DIAG_BC_D_COND:
+ if (!valid_hostcond(bc, len, &min_len))
+ return -EINVAL;
+ break;
+ case INET_DIAG_BC_DEV_COND:
+ if (!valid_devcond(bc, len, &min_len))
+ return -EINVAL;
+ break;
+ case INET_DIAG_BC_S_EQ:
case INET_DIAG_BC_S_GE:
case INET_DIAG_BC_S_LE:
+ case INET_DIAG_BC_D_EQ:
case INET_DIAG_BC_D_GE:
case INET_DIAG_BC_D_LE:
- if (op->yes < 4 || op->yes > len + 4)
+ if (!valid_port_comparison(bc, len, &min_len))
return -EINVAL;
- case INET_DIAG_BC_JMP:
- if (op->no < 4 || op->no > len + 4)
+ break;
+ case INET_DIAG_BC_MARK_COND:
+ if (!net_admin)
+ return -EPERM;
+ if (!valid_markcond(bc, len, &min_len))
return -EINVAL;
- if (op->no < len &&
- !valid_cc(bytecode, bytecode_len, len - op->no))
+ cb_data->mark_needed = true;
+ break;
+#ifdef CONFIG_SOCK_CGROUP_DATA
+ case INET_DIAG_BC_CGROUP_COND:
+ if (!valid_cgroupcond(bc, len, &min_len))
return -EINVAL;
+ cb_data->cgroup_needed = true;
break;
+#endif
+ case INET_DIAG_BC_AUTO:
+ cb_data->userlocks_needed = true;
+ fallthrough;
+ case INET_DIAG_BC_JMP:
case INET_DIAG_BC_NOP:
- if (op->yes < 4 || op->yes > len + 4)
- return -EINVAL;
break;
default:
return -EINVAL;
}
+
+ if (op->code != INET_DIAG_BC_NOP) {
+ if (op->no < min_len || op->no > len + 4 || op->no & 3)
+ return -EINVAL;
+ if (op->no < len &&
+ !valid_cc(bytecode, bytecode_len, len - op->no))
+ return -EINVAL;
+ }
+
+ if (op->yes < min_len || op->yes > len + 4 || op->yes & 3)
+ return -EINVAL;
bc += op->yes;
len -= op->yes;
}
return len == 0 ? 0 : -EINVAL;
}
-static int inet_csk_diag_dump(struct sock *sk,
- struct sk_buff *skb,
- struct netlink_callback *cb)
+static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
+ const struct inet_diag_req_v2 *r)
{
- struct inet_diag_req *r = NLMSG_DATA(cb->nlh);
-
- if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) {
- struct inet_diag_entry entry;
- struct rtattr *bc = (struct rtattr *)(r + 1);
- struct inet_sock *inet = inet_sk(sk);
-
- entry.family = sk->sk_family;
-#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
- if (entry.family == AF_INET6) {
- struct ipv6_pinfo *np = inet6_sk(sk);
+ struct inet_diag_dump_data *cb_data = cb->data;
+ const struct inet_diag_handler *handler;
+ u32 prev_min_dump_alloc;
+ int protocol, err = 0;
- entry.saddr = np->rcv_saddr.s6_addr32;
- entry.daddr = np->daddr.s6_addr32;
- } else
-#endif
- {
- entry.saddr = &inet->rcv_saddr;
- entry.daddr = &inet->daddr;
- }
- entry.sport = inet->num;
- entry.dport = ntohs(inet->dport);
- entry.userlocks = sk->sk_userlocks;
+ protocol = inet_diag_get_protocol(r, cb_data);
- if (!inet_diag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), &entry))
- return 0;
+again:
+ prev_min_dump_alloc = cb->min_dump_alloc;
+ handler = inet_diag_lock_handler(protocol);
+ if (handler) {
+ handler->dump(skb, cb, r);
+ inet_diag_unlock_handler(handler);
+ } else {
+ err = -ENOENT;
}
-
- return inet_csk_diag_fill(sk, skb, r->idiag_ext,
- NETLINK_CB(cb->skb).pid,
- cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
-}
-
-static int inet_twsk_diag_dump(struct inet_timewait_sock *tw,
- struct sk_buff *skb,
- struct netlink_callback *cb)
-{
- struct inet_diag_req *r = NLMSG_DATA(cb->nlh);
-
- if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) {
- struct inet_diag_entry entry;
- struct rtattr *bc = (struct rtattr *)(r + 1);
-
- entry.family = tw->tw_family;
-#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
- if (tw->tw_family == AF_INET6) {
- struct inet6_timewait_sock *tw6 =
- inet6_twsk((struct sock *)tw);
- entry.saddr = tw6->tw_v6_rcv_saddr.s6_addr32;
- entry.daddr = tw6->tw_v6_daddr.s6_addr32;
- } else
-#endif
- {
- entry.saddr = &tw->tw_rcv_saddr;
- entry.daddr = &tw->tw_daddr;
- }
- entry.sport = tw->tw_num;
- entry.dport = ntohs(tw->tw_dport);
- entry.userlocks = 0;
-
- if (!inet_diag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), &entry))
- return 0;
+ /* The skb is not large enough to fit one sk info and
+ * inet_sk_diag_fill() has requested for a larger skb.
+ */
+ if (!skb->len && cb->min_dump_alloc > prev_min_dump_alloc) {
+ err = pskb_expand_head(skb, 0, cb->min_dump_alloc, GFP_KERNEL);
+ if (!err)
+ goto again;
}
- return inet_twsk_diag_fill(tw, skb, r->idiag_ext,
- NETLINK_CB(cb->skb).pid,
- cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
+ return err ? : skb->len;
}
-static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
- struct request_sock *req, u32 pid, u32 seq,
- const struct nlmsghdr *unlh)
+static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
- const struct inet_request_sock *ireq = inet_rsk(req);
- struct inet_sock *inet = inet_sk(sk);
- unsigned char *b = skb->tail;
- struct inet_diag_msg *r;
- struct nlmsghdr *nlh;
- long tmo;
-
- nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r));
- nlh->nlmsg_flags = NLM_F_MULTI;
- r = NLMSG_DATA(nlh);
-
- r->idiag_family = sk->sk_family;
- r->idiag_state = TCP_SYN_RECV;
- r->idiag_timer = 1;
- r->idiag_retrans = req->retrans;
-
- r->id.idiag_if = sk->sk_bound_dev_if;
- r->id.idiag_cookie[0] = (u32)(unsigned long)req;
- r->id.idiag_cookie[1] = (u32)(((unsigned long)req >> 31) >> 1);
-
- tmo = req->expires - jiffies;
- if (tmo < 0)
- tmo = 0;
-
- r->id.idiag_sport = inet->sport;
- r->id.idiag_dport = ireq->rmt_port;
- r->id.idiag_src[0] = ireq->loc_addr;
- r->id.idiag_dst[0] = ireq->rmt_addr;
- r->idiag_expires = jiffies_to_msecs(tmo);
- r->idiag_rqueue = 0;
- r->idiag_wqueue = 0;
- r->idiag_uid = sock_i_uid(sk);
- r->idiag_inode = 0;
-#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
- if (r->idiag_family == AF_INET6) {
- ipv6_addr_copy((struct in6_addr *)r->id.idiag_src,
- &inet6_rsk(req)->loc_addr);
- ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst,
- &inet6_rsk(req)->rmt_addr);
- }
-#endif
- nlh->nlmsg_len = skb->tail - b;
-
- return skb->len;
-
-nlmsg_failure:
- skb_trim(skb, b - skb->data);
- return -1;
+ return __inet_diag_dump(skb, cb, nlmsg_data(cb->nlh));
}
-static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
- struct netlink_callback *cb)
+static int __inet_diag_dump_start(struct netlink_callback *cb, int hdrlen)
{
- struct inet_diag_entry entry;
- struct inet_diag_req *r = NLMSG_DATA(cb->nlh);
- struct inet_connection_sock *icsk = inet_csk(sk);
- struct listen_sock *lopt;
- struct rtattr *bc = NULL;
- struct inet_sock *inet = inet_sk(sk);
- int j, s_j;
- int reqnum, s_reqnum;
- int err = 0;
-
- s_j = cb->args[3];
- s_reqnum = cb->args[4];
-
- if (s_j > 0)
- s_j--;
-
- entry.family = sk->sk_family;
-
- read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+ const struct nlmsghdr *nlh = cb->nlh;
+ struct inet_diag_dump_data *cb_data;
+ struct sk_buff *skb = cb->skb;
+ struct nlattr *nla;
+ int err;
- lopt = icsk->icsk_accept_queue.listen_opt;
- if (!lopt || !lopt->qlen)
- goto out;
+ cb_data = kzalloc(sizeof(*cb_data), GFP_KERNEL);
+ if (!cb_data)
+ return -ENOMEM;
- if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) {
- bc = (struct rtattr *)(r + 1);
- entry.sport = inet->num;
- entry.userlocks = sk->sk_userlocks;
+ err = inet_diag_parse_attrs(nlh, hdrlen, cb_data->req_nlas);
+ if (err) {
+ kfree(cb_data);
+ return err;
+ }
+ err = inet_diag_bc_audit(cb_data, skb);
+ if (err) {
+ kfree(cb_data);
+ return err;
}
- for (j = s_j; j < lopt->nr_table_entries; j++) {
- struct request_sock *req, *head = lopt->syn_table[j];
-
- reqnum = 0;
- for (req = head; req; reqnum++, req = req->dl_next) {
- struct inet_request_sock *ireq = inet_rsk(req);
-
- if (reqnum < s_reqnum)
- continue;
- if (r->id.idiag_dport != ireq->rmt_port &&
- r->id.idiag_dport)
- continue;
-
- if (bc) {
- entry.saddr =
-#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
- (entry.family == AF_INET6) ?
- inet6_rsk(req)->loc_addr.s6_addr32 :
-#endif
- &ireq->loc_addr;
- entry.daddr =
-#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
- (entry.family == AF_INET6) ?
- inet6_rsk(req)->rmt_addr.s6_addr32 :
-#endif
- &ireq->rmt_addr;
- entry.dport = ntohs(ireq->rmt_port);
+ nla = cb_data->inet_diag_nla_bpf_stgs;
+ if (nla) {
+ struct bpf_sk_storage_diag *bpf_stg_diag;
- if (!inet_diag_bc_run(RTA_DATA(bc),
- RTA_PAYLOAD(bc), &entry))
- continue;
- }
-
- err = inet_diag_fill_req(skb, sk, req,
- NETLINK_CB(cb->skb).pid,
- cb->nlh->nlmsg_seq, cb->nlh);
- if (err < 0) {
- cb->args[3] = j + 1;
- cb->args[4] = reqnum;
- goto out;
- }
+ bpf_stg_diag = bpf_sk_storage_diag_alloc(nla);
+ if (IS_ERR(bpf_stg_diag)) {
+ kfree(cb_data);
+ return PTR_ERR(bpf_stg_diag);
}
-
- s_reqnum = 0;
+ cb_data->bpf_stg_diag = bpf_stg_diag;
}
-out:
- read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+ cb->data = cb_data;
+ return 0;
+}
- return err;
+static int inet_diag_dump_start(struct netlink_callback *cb)
+{
+ return __inet_diag_dump_start(cb, sizeof(struct inet_diag_req_v2));
}
-static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
+static int inet_diag_dump_start_compat(struct netlink_callback *cb)
{
- int i, num;
- int s_i, s_num;
- struct inet_diag_req *r = NLMSG_DATA(cb->nlh);
- const struct inet_diag_handler *handler;
- struct inet_hashinfo *hashinfo;
+ return __inet_diag_dump_start(cb, sizeof(struct inet_diag_req));
+}
- handler = inet_diag_table[cb->nlh->nlmsg_type];
- BUG_ON(handler == NULL);
- hashinfo = handler->idiag_hashinfo;
+static int inet_diag_dump_done(struct netlink_callback *cb)
+{
+ struct inet_diag_dump_data *cb_data = cb->data;
- s_i = cb->args[1];
- s_num = num = cb->args[2];
+ bpf_sk_storage_diag_free(cb_data->bpf_stg_diag);
+ kfree(cb->data);
- if (cb->args[0] == 0) {
- if (!(r->idiag_states & (TCPF_LISTEN | TCPF_SYN_RECV)))
- goto skip_listen_ht;
+ return 0;
+}
- inet_listen_lock(hashinfo);
- for (i = s_i; i < INET_LHTABLE_SIZE; i++) {
- struct sock *sk;
- struct hlist_node *node;
+static int inet_diag_type2proto(int type)
+{
+ switch (type) {
+ case TCPDIAG_GETSOCK:
+ return IPPROTO_TCP;
+ default:
+ return 0;
+ }
+}
- num = 0;
- sk_for_each(sk, node, &hashinfo->listening_hash[i]) {
- struct inet_sock *inet = inet_sk(sk);
+static int inet_diag_dump_compat(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct inet_diag_req *rc = nlmsg_data(cb->nlh);
+ struct inet_diag_req_v2 req;
- if (num < s_num) {
- num++;
- continue;
- }
+ req.sdiag_family = AF_UNSPEC; /* compatibility */
+ req.sdiag_protocol = inet_diag_type2proto(cb->nlh->nlmsg_type);
+ req.idiag_ext = rc->idiag_ext;
+ req.pad = 0;
+ req.idiag_states = rc->idiag_states;
+ req.id = rc->id;
- if (r->id.idiag_sport != inet->sport &&
- r->id.idiag_sport)
- goto next_listen;
+ return __inet_diag_dump(skb, cb, &req);
+}
- if (!(r->idiag_states & TCPF_LISTEN) ||
- r->id.idiag_dport ||
- cb->args[3] > 0)
- goto syn_recv;
+static int inet_diag_get_exact_compat(struct sk_buff *in_skb,
+ const struct nlmsghdr *nlh)
+{
+ struct inet_diag_req *rc = nlmsg_data(nlh);
+ struct inet_diag_req_v2 req;
+
+ req.sdiag_family = rc->idiag_family;
+ req.sdiag_protocol = inet_diag_type2proto(nlh->nlmsg_type);
+ req.idiag_ext = rc->idiag_ext;
+ req.pad = 0;
+ req.idiag_states = rc->idiag_states;
+ req.id = rc->id;
+
+ return inet_diag_cmd_exact(SOCK_DIAG_BY_FAMILY, in_skb, nlh,
+ sizeof(struct inet_diag_req), &req);
+}
- if (inet_csk_diag_dump(sk, skb, cb) < 0) {
- inet_listen_unlock(hashinfo);
- goto done;
- }
+static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ int hdrlen = sizeof(struct inet_diag_req);
+ struct net *net = sock_net(skb->sk);
-syn_recv:
- if (!(r->idiag_states & TCPF_SYN_RECV))
- goto next_listen;
+ if (nlh->nlmsg_type >= INET_DIAG_GETSOCK_MAX ||
+ nlmsg_len(nlh) < hdrlen)
+ return -EINVAL;
- if (inet_diag_dump_reqs(skb, sk, cb) < 0) {
- inet_listen_unlock(hashinfo);
- goto done;
- }
+ if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ struct netlink_dump_control c = {
+ .start = inet_diag_dump_start_compat,
+ .done = inet_diag_dump_done,
+ .dump = inet_diag_dump_compat,
+ };
+ return netlink_dump_start(net->diag_nlsk, skb, nlh, &c);
+ }
-next_listen:
- cb->args[3] = 0;
- cb->args[4] = 0;
- ++num;
- }
+ return inet_diag_get_exact_compat(skb, nlh);
+}
- s_num = 0;
- cb->args[3] = 0;
- cb->args[4] = 0;
- }
- inet_listen_unlock(hashinfo);
-skip_listen_ht:
- cb->args[0] = 1;
- s_i = num = s_num = 0;
- }
+static int inet_diag_handler_cmd(struct sk_buff *skb, struct nlmsghdr *h)
+{
+ int hdrlen = sizeof(struct inet_diag_req_v2);
+ struct net *net = sock_net(skb->sk);
- if (!(r->idiag_states & ~(TCPF_LISTEN | TCPF_SYN_RECV)))
- return skb->len;
-
- for (i = s_i; i < hashinfo->ehash_size; i++) {
- struct inet_ehash_bucket *head = &hashinfo->ehash[i];
- struct sock *sk;
- struct hlist_node *node;
-
- if (i > s_i)
- s_num = 0;
-
- read_lock_bh(&head->lock);
- num = 0;
- sk_for_each(sk, node, &head->chain) {
- struct inet_sock *inet = inet_sk(sk);
-
- if (num < s_num)
- goto next_normal;
- if (!(r->idiag_states & (1 << sk->sk_state)))
- goto next_normal;
- if (r->id.idiag_sport != inet->sport &&
- r->id.idiag_sport)
- goto next_normal;
- if (r->id.idiag_dport != inet->dport &&
- r->id.idiag_dport)
- goto next_normal;
- if (inet_csk_diag_dump(sk, skb, cb) < 0) {
- read_unlock_bh(&head->lock);
- goto done;
- }
-next_normal:
- ++num;
- }
+ if (nlmsg_len(h) < hdrlen)
+ return -EINVAL;
- if (r->idiag_states & TCPF_TIME_WAIT) {
- struct inet_timewait_sock *tw;
-
- inet_twsk_for_each(tw, node,
- &hashinfo->ehash[i + hashinfo->ehash_size].chain) {
-
- if (num < s_num)
- goto next_dying;
- if (r->id.idiag_sport != tw->tw_sport &&
- r->id.idiag_sport)
- goto next_dying;
- if (r->id.idiag_dport != tw->tw_dport &&
- r->id.idiag_dport)
- goto next_dying;
- if (inet_twsk_diag_dump(tw, skb, cb) < 0) {
- read_unlock_bh(&head->lock);
- goto done;
- }
-next_dying:
- ++num;
- }
- }
- read_unlock_bh(&head->lock);
+ if (h->nlmsg_type == SOCK_DIAG_BY_FAMILY &&
+ h->nlmsg_flags & NLM_F_DUMP) {
+ struct netlink_dump_control c = {
+ .start = inet_diag_dump_start,
+ .done = inet_diag_dump_done,
+ .dump = inet_diag_dump,
+ };
+ return netlink_dump_start(net->diag_nlsk, skb, h, &c);
}
-done:
- cb->args[1] = i;
- cb->args[2] = num;
- return skb->len;
+ return inet_diag_cmd_exact(h->nlmsg_type, skb, h, hdrlen,
+ nlmsg_data(h));
}
-static inline int inet_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
+static
+int inet_diag_handler_get_info(struct sk_buff *skb, struct sock *sk)
{
- if (!(nlh->nlmsg_flags&NLM_F_REQUEST))
- return 0;
-
- if (nlh->nlmsg_type >= INET_DIAG_GETSOCK_MAX)
- goto err_inval;
+ const struct inet_diag_handler *handler;
+ struct nlmsghdr *nlh;
+ struct nlattr *attr;
+ struct inet_diag_msg *r;
+ void *info = NULL;
+ int err = 0;
- if (inet_diag_table[nlh->nlmsg_type] == NULL)
- return -ENOENT;
+ nlh = nlmsg_put(skb, 0, 0, SOCK_DIAG_BY_FAMILY, sizeof(*r), 0);
+ if (!nlh)
+ return -ENOMEM;
- if (NLMSG_LENGTH(sizeof(struct inet_diag_req)) > skb->len)
- goto err_inval;
-
- if (nlh->nlmsg_flags&NLM_F_DUMP) {
- if (nlh->nlmsg_len >
- (4 + NLMSG_SPACE(sizeof(struct inet_diag_req)))) {
- struct rtattr *rta = (void *)(NLMSG_DATA(nlh) +
- sizeof(struct inet_diag_req));
- if (rta->rta_type != INET_DIAG_REQ_BYTECODE ||
- rta->rta_len < 8 ||
- rta->rta_len >
- (nlh->nlmsg_len -
- NLMSG_SPACE(sizeof(struct inet_diag_req))))
- goto err_inval;
- if (inet_diag_bc_audit(RTA_DATA(rta), RTA_PAYLOAD(rta)))
- goto err_inval;
- }
- return netlink_dump_start(idiagnl, skb, nlh,
- inet_diag_dump, NULL);
- } else
- return inet_diag_get_exact(skb, nlh);
-
-err_inval:
- return -EINVAL;
-}
+ r = nlmsg_data(nlh);
+ memset(r, 0, sizeof(*r));
+ inet_diag_msg_common_fill(r, sk);
+ if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_STREAM)
+ r->id.idiag_sport = inet_sk(sk)->inet_sport;
+ r->idiag_state = sk->sk_state;
+ if ((err = nla_put_u8(skb, INET_DIAG_PROTOCOL, sk->sk_protocol))) {
+ nlmsg_cancel(skb, nlh);
+ return err;
+ }
-static inline void inet_diag_rcv_skb(struct sk_buff *skb)
-{
- if (skb->len >= NLMSG_SPACE(0)) {
- int err;
- struct nlmsghdr *nlh = (struct nlmsghdr *)skb->data;
-
- if (nlh->nlmsg_len < sizeof(*nlh) ||
- skb->len < nlh->nlmsg_len)
- return;
- err = inet_diag_rcv_msg(skb, nlh);
- if (err || nlh->nlmsg_flags & NLM_F_ACK)
- netlink_ack(skb, nlh, err);
+ handler = inet_diag_lock_handler(sk->sk_protocol);
+ if (!handler) {
+ nlmsg_cancel(skb, nlh);
+ return -ENOENT;
}
-}
-static void inet_diag_rcv(struct sock *sk, int len)
-{
- struct sk_buff *skb;
- unsigned int qlen = skb_queue_len(&sk->sk_receive_queue);
+ attr = handler->idiag_info_size
+ ? nla_reserve_64bit(skb, INET_DIAG_INFO,
+ handler->idiag_info_size,
+ INET_DIAG_PAD)
+ : NULL;
+ if (attr)
+ info = nla_data(attr);
- while (qlen-- && (skb = skb_dequeue(&sk->sk_receive_queue))) {
- inet_diag_rcv_skb(skb);
- kfree_skb(skb);
- }
+ handler->idiag_get_info(sk, r, info);
+ inet_diag_unlock_handler(handler);
+
+ nlmsg_end(skb, nlh);
+ return 0;
}
-static DEFINE_SPINLOCK(inet_diag_register_lock);
+static const struct sock_diag_handler inet_diag_handler = {
+ .owner = THIS_MODULE,
+ .family = AF_INET,
+ .dump = inet_diag_handler_cmd,
+ .get_info = inet_diag_handler_get_info,
+ .destroy = inet_diag_handler_cmd,
+};
+
+static const struct sock_diag_handler inet6_diag_handler = {
+ .owner = THIS_MODULE,
+ .family = AF_INET6,
+ .dump = inet_diag_handler_cmd,
+ .get_info = inet_diag_handler_get_info,
+ .destroy = inet_diag_handler_cmd,
+};
int inet_diag_register(const struct inet_diag_handler *h)
{
const __u16 type = h->idiag_type;
- int err = -EINVAL;
- if (type >= INET_DIAG_GETSOCK_MAX)
- goto out;
+ if (type >= IPPROTO_MAX)
+ return -EINVAL;
- spin_lock(&inet_diag_register_lock);
- err = -EEXIST;
- if (inet_diag_table[type] == NULL) {
- inet_diag_table[type] = h;
- err = 0;
- }
- spin_unlock(&inet_diag_register_lock);
-out:
- return err;
+ return !cmpxchg((const struct inet_diag_handler **)&inet_diag_table[type],
+ NULL, h) ? 0 : -EEXIST;
}
EXPORT_SYMBOL_GPL(inet_diag_register);
@@ -892,20 +1061,22 @@ void inet_diag_unregister(const struct inet_diag_handler *h)
{
const __u16 type = h->idiag_type;
- if (type >= INET_DIAG_GETSOCK_MAX)
+ if (type >= IPPROTO_MAX)
return;
- spin_lock(&inet_diag_register_lock);
- inet_diag_table[type] = NULL;
- spin_unlock(&inet_diag_register_lock);
-
- synchronize_rcu();
+ xchg((const struct inet_diag_handler **)&inet_diag_table[type],
+ NULL);
}
EXPORT_SYMBOL_GPL(inet_diag_unregister);
+static const struct sock_diag_inet_compat inet_diag_compat = {
+ .owner = THIS_MODULE,
+ .fn = inet_diag_rcv_msg_compat,
+};
+
static int __init inet_diag_init(void)
{
- const int inet_diag_table_size = (INET_DIAG_GETSOCK_MAX *
+ const int inet_diag_table_size = (IPPROTO_MAX *
sizeof(struct inet_diag_handler *));
int err = -ENOMEM;
@@ -913,24 +1084,36 @@ static int __init inet_diag_init(void)
if (!inet_diag_table)
goto out;
- idiagnl = netlink_kernel_create(NETLINK_INET_DIAG, 0, inet_diag_rcv,
- THIS_MODULE);
- if (idiagnl == NULL)
- goto out_free_table;
- err = 0;
+ err = sock_diag_register(&inet_diag_handler);
+ if (err)
+ goto out_free_nl;
+
+ err = sock_diag_register(&inet6_diag_handler);
+ if (err)
+ goto out_free_inet;
+
+ sock_diag_register_inet_compat(&inet_diag_compat);
out:
return err;
-out_free_table:
+
+out_free_inet:
+ sock_diag_unregister(&inet_diag_handler);
+out_free_nl:
kfree(inet_diag_table);
goto out;
}
static void __exit inet_diag_exit(void)
{
- sock_release(idiagnl->sk_socket);
+ sock_diag_unregister(&inet6_diag_handler);
+ sock_diag_unregister(&inet_diag_handler);
+ sock_diag_unregister_inet_compat(&inet_diag_compat);
kfree(inet_diag_table);
}
module_init(inet_diag_init);
module_exit(inet_diag_exit);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("INET/INET6: socket monitoring via SOCK_DIAG");
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2 /* AF_INET */);
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 10 /* AF_INET6 */);
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
new file mode 100644
index 000000000000..025895eb6ec5
--- /dev/null
+++ b/net/ipv4/inet_fragment.c
@@ -0,0 +1,651 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * inet fragments management
+ *
+ * Authors: Pavel Emelyanov <xemul@openvz.org>
+ * Started as consolidation of ipv4/ip_fragment.c,
+ * ipv6/reassembly. and ipv6 nf conntrack reassembly
+ */
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <linux/timer.h>
+#include <linux/mm.h>
+#include <linux/random.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/slab.h>
+#include <linux/rhashtable.h>
+
+#include <net/sock.h>
+#include <net/inet_frag.h>
+#include <net/inet_ecn.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+
+#include "../core/sock_destructor.h"
+
+/* Use skb->cb to track consecutive/adjacent fragments coming at
+ * the end of the queue. Nodes in the rb-tree queue will
+ * contain "runs" of one or more adjacent fragments.
+ *
+ * Invariants:
+ * - next_frag is NULL at the tail of a "run";
+ * - the head of a "run" has the sum of all fragment lengths in frag_run_len.
+ */
+struct ipfrag_skb_cb {
+ union {
+ struct inet_skb_parm h4;
+ struct inet6_skb_parm h6;
+ };
+ struct sk_buff *next_frag;
+ int frag_run_len;
+ int ip_defrag_offset;
+};
+
+#define FRAG_CB(skb) ((struct ipfrag_skb_cb *)((skb)->cb))
+
+static void fragcb_clear(struct sk_buff *skb)
+{
+ RB_CLEAR_NODE(&skb->rbnode);
+ FRAG_CB(skb)->next_frag = NULL;
+ FRAG_CB(skb)->frag_run_len = skb->len;
+}
+
+/* Append skb to the last "run". */
+static void fragrun_append_to_last(struct inet_frag_queue *q,
+ struct sk_buff *skb)
+{
+ fragcb_clear(skb);
+
+ FRAG_CB(q->last_run_head)->frag_run_len += skb->len;
+ FRAG_CB(q->fragments_tail)->next_frag = skb;
+ q->fragments_tail = skb;
+}
+
+/* Create a new "run" with the skb. */
+static void fragrun_create(struct inet_frag_queue *q, struct sk_buff *skb)
+{
+ BUILD_BUG_ON(sizeof(struct ipfrag_skb_cb) > sizeof(skb->cb));
+ fragcb_clear(skb);
+
+ if (q->last_run_head)
+ rb_link_node(&skb->rbnode, &q->last_run_head->rbnode,
+ &q->last_run_head->rbnode.rb_right);
+ else
+ rb_link_node(&skb->rbnode, NULL, &q->rb_fragments.rb_node);
+ rb_insert_color(&skb->rbnode, &q->rb_fragments);
+
+ q->fragments_tail = skb;
+ q->last_run_head = skb;
+}
+
+/* Given the OR values of all fragments, apply RFC 3168 5.3 requirements
+ * Value : 0xff if frame should be dropped.
+ * 0 or INET_ECN_CE value, to be ORed in to final iph->tos field
+ */
+const u8 ip_frag_ecn_table[16] = {
+ /* at least one fragment had CE, and others ECT_0 or ECT_1 */
+ [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = INET_ECN_CE,
+ [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = INET_ECN_CE,
+ [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = INET_ECN_CE,
+
+ /* invalid combinations : drop frame */
+ [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE] = 0xff,
+ [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0] = 0xff,
+ [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_1] = 0xff,
+ [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
+ [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = 0xff,
+ [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = 0xff,
+ [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
+};
+EXPORT_SYMBOL(ip_frag_ecn_table);
+
+int inet_frags_init(struct inet_frags *f)
+{
+ f->frags_cachep = kmem_cache_create(f->frags_cache_name, f->qsize, 0, 0,
+ NULL);
+ if (!f->frags_cachep)
+ return -ENOMEM;
+
+ refcount_set(&f->refcnt, 1);
+ init_completion(&f->completion);
+ return 0;
+}
+EXPORT_SYMBOL(inet_frags_init);
+
+void inet_frags_fini(struct inet_frags *f)
+{
+ if (refcount_dec_and_test(&f->refcnt))
+ complete(&f->completion);
+
+ wait_for_completion(&f->completion);
+
+ kmem_cache_destroy(f->frags_cachep);
+ f->frags_cachep = NULL;
+}
+EXPORT_SYMBOL(inet_frags_fini);
+
+/* called from rhashtable_free_and_destroy() at netns_frags dismantle */
+static void inet_frags_free_cb(void *ptr, void *arg)
+{
+ struct inet_frag_queue *fq = ptr;
+ int count;
+
+ count = timer_delete_sync(&fq->timer) ? 1 : 0;
+
+ spin_lock_bh(&fq->lock);
+ fq->flags |= INET_FRAG_DROP;
+ if (!(fq->flags & INET_FRAG_COMPLETE)) {
+ fq->flags |= INET_FRAG_COMPLETE;
+ count++;
+ } else if (fq->flags & INET_FRAG_HASH_DEAD) {
+ count++;
+ }
+ spin_unlock_bh(&fq->lock);
+
+ inet_frag_putn(fq, count);
+}
+
+static LLIST_HEAD(fqdir_free_list);
+
+static void fqdir_free_fn(struct work_struct *work)
+{
+ struct llist_node *kill_list;
+ struct fqdir *fqdir, *tmp;
+ struct inet_frags *f;
+
+ /* Atomically snapshot the list of fqdirs to free */
+ kill_list = llist_del_all(&fqdir_free_list);
+
+ /* We need to make sure all ongoing call_rcu(..., inet_frag_destroy_rcu)
+ * have completed, since they need to dereference fqdir.
+ * Would it not be nice to have kfree_rcu_barrier() ? :)
+ */
+ rcu_barrier();
+
+ llist_for_each_entry_safe(fqdir, tmp, kill_list, free_list) {
+ f = fqdir->f;
+ if (refcount_dec_and_test(&f->refcnt))
+ complete(&f->completion);
+
+ kfree(fqdir);
+ }
+}
+
+static DECLARE_DELAYED_WORK(fqdir_free_work, fqdir_free_fn);
+
+static void fqdir_work_fn(struct work_struct *work)
+{
+ struct fqdir *fqdir = container_of(work, struct fqdir, destroy_work);
+
+ rhashtable_free_and_destroy(&fqdir->rhashtable, inet_frags_free_cb, NULL);
+
+ if (llist_add(&fqdir->free_list, &fqdir_free_list))
+ queue_delayed_work(system_percpu_wq, &fqdir_free_work, HZ);
+}
+
+int fqdir_init(struct fqdir **fqdirp, struct inet_frags *f, struct net *net)
+{
+ struct fqdir *fqdir = kzalloc(sizeof(*fqdir), GFP_KERNEL);
+ int res;
+
+ if (!fqdir)
+ return -ENOMEM;
+ fqdir->f = f;
+ fqdir->net = net;
+ res = rhashtable_init(&fqdir->rhashtable, &fqdir->f->rhash_params);
+ if (res < 0) {
+ kfree(fqdir);
+ return res;
+ }
+ refcount_inc(&f->refcnt);
+ *fqdirp = fqdir;
+ return 0;
+}
+EXPORT_SYMBOL(fqdir_init);
+
+static struct workqueue_struct *inet_frag_wq;
+
+static int __init inet_frag_wq_init(void)
+{
+ inet_frag_wq = create_workqueue("inet_frag_wq");
+ if (!inet_frag_wq)
+ panic("Could not create inet frag workq");
+ return 0;
+}
+
+pure_initcall(inet_frag_wq_init);
+
+void fqdir_exit(struct fqdir *fqdir)
+{
+ INIT_WORK(&fqdir->destroy_work, fqdir_work_fn);
+ queue_work(inet_frag_wq, &fqdir->destroy_work);
+}
+EXPORT_SYMBOL(fqdir_exit);
+
+void inet_frag_kill(struct inet_frag_queue *fq, int *refs)
+{
+ if (timer_delete(&fq->timer))
+ (*refs)++;
+
+ if (!(fq->flags & INET_FRAG_COMPLETE)) {
+ struct fqdir *fqdir = fq->fqdir;
+
+ fq->flags |= INET_FRAG_COMPLETE;
+ rcu_read_lock();
+ /* The RCU read lock provides a memory barrier
+ * guaranteeing that if fqdir->dead is false then
+ * the hash table destruction will not start until
+ * after we unlock. Paired with fqdir_pre_exit().
+ */
+ if (!READ_ONCE(fqdir->dead)) {
+ rhashtable_remove_fast(&fqdir->rhashtable, &fq->node,
+ fqdir->f->rhash_params);
+ (*refs)++;
+ } else {
+ fq->flags |= INET_FRAG_HASH_DEAD;
+ }
+ rcu_read_unlock();
+ }
+}
+EXPORT_SYMBOL(inet_frag_kill);
+
+static void inet_frag_destroy_rcu(struct rcu_head *head)
+{
+ struct inet_frag_queue *q = container_of(head, struct inet_frag_queue,
+ rcu);
+ struct inet_frags *f = q->fqdir->f;
+
+ if (f->destructor)
+ f->destructor(q);
+ kmem_cache_free(f->frags_cachep, q);
+}
+
+unsigned int inet_frag_rbtree_purge(struct rb_root *root,
+ enum skb_drop_reason reason)
+{
+ struct rb_node *p = rb_first(root);
+ unsigned int sum = 0;
+
+ while (p) {
+ struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode);
+
+ p = rb_next(p);
+ rb_erase(&skb->rbnode, root);
+ while (skb) {
+ struct sk_buff *next = FRAG_CB(skb)->next_frag;
+
+ sum += skb->truesize;
+ kfree_skb_reason(skb, reason);
+ skb = next;
+ }
+ }
+ return sum;
+}
+EXPORT_SYMBOL(inet_frag_rbtree_purge);
+
+void inet_frag_destroy(struct inet_frag_queue *q)
+{
+ unsigned int sum, sum_truesize = 0;
+ enum skb_drop_reason reason;
+ struct inet_frags *f;
+ struct fqdir *fqdir;
+
+ WARN_ON(!(q->flags & INET_FRAG_COMPLETE));
+ reason = (q->flags & INET_FRAG_DROP) ?
+ SKB_DROP_REASON_FRAG_REASM_TIMEOUT :
+ SKB_CONSUMED;
+ WARN_ON(timer_delete(&q->timer) != 0);
+
+ /* Release all fragment data. */
+ fqdir = q->fqdir;
+ f = fqdir->f;
+ sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments, reason);
+ sum = sum_truesize + f->qsize;
+
+ call_rcu(&q->rcu, inet_frag_destroy_rcu);
+
+ sub_frag_mem_limit(fqdir, sum);
+}
+EXPORT_SYMBOL(inet_frag_destroy);
+
+static struct inet_frag_queue *inet_frag_alloc(struct fqdir *fqdir,
+ struct inet_frags *f,
+ void *arg)
+{
+ struct inet_frag_queue *q;
+
+ q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC);
+ if (!q)
+ return NULL;
+
+ q->fqdir = fqdir;
+ f->constructor(q, arg);
+ add_frag_mem_limit(fqdir, f->qsize);
+
+ timer_setup(&q->timer, f->frag_expire, 0);
+ spin_lock_init(&q->lock);
+ /* One reference for the timer, one for the hash table. */
+ refcount_set(&q->refcnt, 2);
+
+ return q;
+}
+
+static struct inet_frag_queue *inet_frag_create(struct fqdir *fqdir,
+ void *arg,
+ struct inet_frag_queue **prev)
+{
+ struct inet_frags *f = fqdir->f;
+ struct inet_frag_queue *q;
+
+ q = inet_frag_alloc(fqdir, f, arg);
+ if (!q) {
+ *prev = ERR_PTR(-ENOMEM);
+ return NULL;
+ }
+ mod_timer(&q->timer, jiffies + fqdir->timeout);
+
+ *prev = rhashtable_lookup_get_insert_key(&fqdir->rhashtable, &q->key,
+ &q->node, f->rhash_params);
+ if (*prev) {
+ /* We could not insert in the hash table,
+ * we need to cancel what inet_frag_alloc()
+ * anticipated.
+ */
+ int refs = 1;
+
+ q->flags |= INET_FRAG_COMPLETE;
+ inet_frag_kill(q, &refs);
+ inet_frag_putn(q, refs);
+ return NULL;
+ }
+ return q;
+}
+
+struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key)
+{
+ /* This pairs with WRITE_ONCE() in fqdir_pre_exit(). */
+ long high_thresh = READ_ONCE(fqdir->high_thresh);
+ struct inet_frag_queue *fq = NULL, *prev;
+
+ if (!high_thresh || frag_mem_limit(fqdir) > high_thresh)
+ return NULL;
+
+ prev = rhashtable_lookup(&fqdir->rhashtable, key, fqdir->f->rhash_params);
+ if (!prev)
+ fq = inet_frag_create(fqdir, key, &prev);
+ if (!IS_ERR_OR_NULL(prev))
+ fq = prev;
+ return fq;
+}
+EXPORT_SYMBOL(inet_frag_find);
+
+int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb,
+ int offset, int end)
+{
+ struct sk_buff *last = q->fragments_tail;
+
+ /* RFC5722, Section 4, amended by Errata ID : 3089
+ * When reassembling an IPv6 datagram, if
+ * one or more its constituent fragments is determined to be an
+ * overlapping fragment, the entire datagram (and any constituent
+ * fragments) MUST be silently discarded.
+ *
+ * Duplicates, however, should be ignored (i.e. skb dropped, but the
+ * queue/fragments kept for later reassembly).
+ */
+ if (!last)
+ fragrun_create(q, skb); /* First fragment. */
+ else if (FRAG_CB(last)->ip_defrag_offset + last->len < end) {
+ /* This is the common case: skb goes to the end. */
+ /* Detect and discard overlaps. */
+ if (offset < FRAG_CB(last)->ip_defrag_offset + last->len)
+ return IPFRAG_OVERLAP;
+ if (offset == FRAG_CB(last)->ip_defrag_offset + last->len)
+ fragrun_append_to_last(q, skb);
+ else
+ fragrun_create(q, skb);
+ } else {
+ /* Binary search. Note that skb can become the first fragment,
+ * but not the last (covered above).
+ */
+ struct rb_node **rbn, *parent;
+
+ rbn = &q->rb_fragments.rb_node;
+ do {
+ struct sk_buff *curr;
+ int curr_run_end;
+
+ parent = *rbn;
+ curr = rb_to_skb(parent);
+ curr_run_end = FRAG_CB(curr)->ip_defrag_offset +
+ FRAG_CB(curr)->frag_run_len;
+ if (end <= FRAG_CB(curr)->ip_defrag_offset)
+ rbn = &parent->rb_left;
+ else if (offset >= curr_run_end)
+ rbn = &parent->rb_right;
+ else if (offset >= FRAG_CB(curr)->ip_defrag_offset &&
+ end <= curr_run_end)
+ return IPFRAG_DUP;
+ else
+ return IPFRAG_OVERLAP;
+ } while (*rbn);
+ /* Here we have parent properly set, and rbn pointing to
+ * one of its NULL left/right children. Insert skb.
+ */
+ fragcb_clear(skb);
+ rb_link_node(&skb->rbnode, parent, rbn);
+ rb_insert_color(&skb->rbnode, &q->rb_fragments);
+ }
+
+ FRAG_CB(skb)->ip_defrag_offset = offset;
+
+ return IPFRAG_OK;
+}
+EXPORT_SYMBOL(inet_frag_queue_insert);
+
+void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb,
+ struct sk_buff *parent)
+{
+ struct sk_buff *fp, *head = skb_rb_first(&q->rb_fragments);
+ void (*destructor)(struct sk_buff *);
+ unsigned int orig_truesize = 0;
+ struct sk_buff **nextp = NULL;
+ struct sock *sk = skb->sk;
+ int delta;
+
+ if (sk && is_skb_wmem(skb)) {
+ /* TX: skb->sk might have been passed as argument to
+ * dst->output and must remain valid until tx completes.
+ *
+ * Move sk to reassembled skb and fix up wmem accounting.
+ */
+ orig_truesize = skb->truesize;
+ destructor = skb->destructor;
+ }
+
+ if (head != skb) {
+ fp = skb_clone(skb, GFP_ATOMIC);
+ if (!fp) {
+ head = skb;
+ goto out_restore_sk;
+ }
+ FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag;
+ if (RB_EMPTY_NODE(&skb->rbnode))
+ FRAG_CB(parent)->next_frag = fp;
+ else
+ rb_replace_node(&skb->rbnode, &fp->rbnode,
+ &q->rb_fragments);
+ if (q->fragments_tail == skb)
+ q->fragments_tail = fp;
+
+ if (orig_truesize) {
+ /* prevent skb_morph from releasing sk */
+ skb->sk = NULL;
+ skb->destructor = NULL;
+ }
+ skb_morph(skb, head);
+ FRAG_CB(skb)->next_frag = FRAG_CB(head)->next_frag;
+ rb_replace_node(&head->rbnode, &skb->rbnode,
+ &q->rb_fragments);
+ consume_skb(head);
+ head = skb;
+ }
+ WARN_ON(FRAG_CB(head)->ip_defrag_offset != 0);
+
+ delta = -head->truesize;
+
+ /* Head of list must not be cloned. */
+ if (skb_unclone(head, GFP_ATOMIC))
+ goto out_restore_sk;
+
+ delta += head->truesize;
+ if (delta)
+ add_frag_mem_limit(q->fqdir, delta);
+
+ /* If the first fragment is fragmented itself, we split
+ * it to two chunks: the first with data and paged part
+ * and the second, holding only fragments.
+ */
+ if (skb_has_frag_list(head)) {
+ struct sk_buff *clone;
+ int i, plen = 0;
+
+ clone = alloc_skb(0, GFP_ATOMIC);
+ if (!clone)
+ goto out_restore_sk;
+ skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
+ skb_frag_list_init(head);
+ for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
+ plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
+ clone->data_len = head->data_len - plen;
+ clone->len = clone->data_len;
+ head->truesize += clone->truesize;
+ clone->csum = 0;
+ clone->ip_summed = head->ip_summed;
+ add_frag_mem_limit(q->fqdir, clone->truesize);
+ skb_shinfo(head)->frag_list = clone;
+ nextp = &clone->next;
+ } else {
+ nextp = &skb_shinfo(head)->frag_list;
+ }
+
+out_restore_sk:
+ if (orig_truesize) {
+ int ts_delta = head->truesize - orig_truesize;
+
+ /* if this reassembled skb is fragmented later,
+ * fraglist skbs will get skb->sk assigned from head->sk,
+ * and each frag skb will be released via sock_wfree.
+ *
+ * Update sk_wmem_alloc.
+ */
+ head->sk = sk;
+ head->destructor = destructor;
+ refcount_add(ts_delta, &sk->sk_wmem_alloc);
+ }
+
+ return nextp;
+}
+EXPORT_SYMBOL(inet_frag_reasm_prepare);
+
+void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head,
+ void *reasm_data, bool try_coalesce)
+{
+ struct sock *sk = is_skb_wmem(head) ? head->sk : NULL;
+ const unsigned int head_truesize = head->truesize;
+ struct sk_buff **nextp = reasm_data;
+ struct rb_node *rbn;
+ struct sk_buff *fp;
+ int sum_truesize;
+
+ skb_push(head, head->data - skb_network_header(head));
+
+ /* Traverse the tree in order, to build frag_list. */
+ fp = FRAG_CB(head)->next_frag;
+ rbn = rb_next(&head->rbnode);
+ rb_erase(&head->rbnode, &q->rb_fragments);
+
+ sum_truesize = head->truesize;
+ while (rbn || fp) {
+ /* fp points to the next sk_buff in the current run;
+ * rbn points to the next run.
+ */
+ /* Go through the current run. */
+ while (fp) {
+ struct sk_buff *next_frag = FRAG_CB(fp)->next_frag;
+ bool stolen;
+ int delta;
+
+ sum_truesize += fp->truesize;
+ if (head->ip_summed != fp->ip_summed)
+ head->ip_summed = CHECKSUM_NONE;
+ else if (head->ip_summed == CHECKSUM_COMPLETE)
+ head->csum = csum_add(head->csum, fp->csum);
+
+ if (try_coalesce && skb_try_coalesce(head, fp, &stolen,
+ &delta)) {
+ kfree_skb_partial(fp, stolen);
+ } else {
+ fp->prev = NULL;
+ memset(&fp->rbnode, 0, sizeof(fp->rbnode));
+ fp->sk = NULL;
+
+ head->data_len += fp->len;
+ head->len += fp->len;
+ head->truesize += fp->truesize;
+
+ *nextp = fp;
+ nextp = &fp->next;
+ }
+
+ fp = next_frag;
+ }
+ /* Move to the next run. */
+ if (rbn) {
+ struct rb_node *rbnext = rb_next(rbn);
+
+ fp = rb_to_skb(rbn);
+ rb_erase(rbn, &q->rb_fragments);
+ rbn = rbnext;
+ }
+ }
+ sub_frag_mem_limit(q->fqdir, sum_truesize);
+
+ *nextp = NULL;
+ skb_mark_not_on_list(head);
+ head->prev = NULL;
+ head->tstamp = q->stamp;
+ head->tstamp_type = q->tstamp_type;
+
+ if (sk)
+ refcount_add(sum_truesize - head_truesize, &sk->sk_wmem_alloc);
+}
+EXPORT_SYMBOL(inet_frag_reasm_finish);
+
+struct sk_buff *inet_frag_pull_head(struct inet_frag_queue *q)
+{
+ struct sk_buff *head, *skb;
+
+ head = skb_rb_first(&q->rb_fragments);
+ if (!head)
+ return NULL;
+ skb = FRAG_CB(head)->next_frag;
+ if (skb)
+ rb_replace_node(&head->rbnode, &skb->rbnode,
+ &q->rb_fragments);
+ else
+ rb_erase(&head->rbnode, &q->rb_fragments);
+ memset(&head->rbnode, 0, sizeof(head->rbnode));
+ barrier();
+
+ if (head == q->fragments_tail)
+ q->fragments_tail = NULL;
+
+ sub_frag_mem_limit(q->fqdir, head->truesize);
+
+ return head;
+}
+EXPORT_SYMBOL(inet_frag_pull_head);
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 244c4f445c7d..f5826ec4bcaa 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -6,11 +7,6 @@
* Generic INET transport hashtables
*
* Authors: Lotsa people, from code originally in tcp
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/module.h>
@@ -18,26 +14,78 @@
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/wait.h>
+#include <linux/vmalloc.h>
+#include <linux/memblock.h>
+#include <net/addrconf.h>
#include <net/inet_connection_sock.h>
#include <net/inet_hashtables.h>
+#if IS_ENABLED(CONFIG_IPV6)
+#include <net/inet6_hashtables.h>
+#endif
+#include <net/hotdata.h>
#include <net/ip.h>
+#include <net/rps.h>
+#include <net/secure_seq.h>
+#include <net/sock_reuseport.h>
+#include <net/tcp.h>
+
+u32 inet_ehashfn(const struct net *net, const __be32 laddr,
+ const __u16 lport, const __be32 faddr,
+ const __be16 fport)
+{
+ net_get_random_once(&inet_ehash_secret, sizeof(inet_ehash_secret));
+
+ return lport + __inet_ehashfn(laddr, 0, faddr, fport,
+ inet_ehash_secret + net_hash_mix(net));
+}
+EXPORT_SYMBOL_GPL(inet_ehashfn);
+
+/* This function handles inet_sock, but also timewait and request sockets
+ * for IPv4/IPv6.
+ */
+static u32 sk_ehashfn(const struct sock *sk)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ if (sk->sk_family == AF_INET6 &&
+ !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
+ return inet6_ehashfn(sock_net(sk),
+ &sk->sk_v6_rcv_saddr, sk->sk_num,
+ &sk->sk_v6_daddr, sk->sk_dport);
+#endif
+ return inet_ehashfn(sock_net(sk),
+ sk->sk_rcv_saddr, sk->sk_num,
+ sk->sk_daddr, sk->sk_dport);
+}
+
+static bool sk_is_connect_bind(const struct sock *sk)
+{
+ if (sk->sk_state == TCP_TIME_WAIT)
+ return inet_twsk(sk)->tw_connect_bind;
+ else
+ return sk->sk_userlocks & SOCK_CONNECT_BIND;
+}
/*
* Allocate and initialize a new local port bind bucket.
* The bindhash mutex for snum's hash chain must be held here.
*/
-struct inet_bind_bucket *inet_bind_bucket_create(kmem_cache_t *cachep,
+struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
+ struct net *net,
struct inet_bind_hashbucket *head,
- const unsigned short snum)
+ const unsigned short snum,
+ int l3mdev)
{
- struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, SLAB_ATOMIC);
+ struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);
- if (tb != NULL) {
+ if (tb) {
+ write_pnet(&tb->ib_net, net);
+ tb->l3mdev = l3mdev;
tb->port = snum;
tb->fastreuse = 0;
- INIT_HLIST_HEAD(&tb->owners);
- hlist_add_head(&tb->node, &head->chain);
+ tb->fastreuseport = 0;
+ INIT_HLIST_HEAD(&tb->bhash2);
+ hlist_add_head_rcu(&tb->node, &head->chain);
}
return tb;
}
@@ -45,323 +93,1289 @@ struct inet_bind_bucket *inet_bind_bucket_create(kmem_cache_t *cachep,
/*
* Caller must hold hashbucket lock for this tb with local BH disabled
*/
-void inet_bind_bucket_destroy(kmem_cache_t *cachep, struct inet_bind_bucket *tb)
+void inet_bind_bucket_destroy(struct inet_bind_bucket *tb)
{
+ const struct inet_bind2_bucket *tb2;
+
+ if (hlist_empty(&tb->bhash2)) {
+ hlist_del_rcu(&tb->node);
+ kfree_rcu(tb, rcu);
+ return;
+ }
+
+ if (tb->fastreuse == -1 && tb->fastreuseport == -1)
+ return;
+ hlist_for_each_entry(tb2, &tb->bhash2, bhash_node) {
+ if (tb2->fastreuse != -1 || tb2->fastreuseport != -1)
+ return;
+ }
+ tb->fastreuse = -1;
+ tb->fastreuseport = -1;
+}
+
+bool inet_bind_bucket_match(const struct inet_bind_bucket *tb, const struct net *net,
+ unsigned short port, int l3mdev)
+{
+ return net_eq(ib_net(tb), net) && tb->port == port &&
+ tb->l3mdev == l3mdev;
+}
+
+static void inet_bind2_bucket_init(struct inet_bind2_bucket *tb2,
+ struct net *net,
+ struct inet_bind_hashbucket *head,
+ struct inet_bind_bucket *tb,
+ const struct sock *sk)
+{
+ write_pnet(&tb2->ib_net, net);
+ tb2->l3mdev = tb->l3mdev;
+ tb2->port = tb->port;
+#if IS_ENABLED(CONFIG_IPV6)
+ BUILD_BUG_ON(USHRT_MAX < (IPV6_ADDR_ANY | IPV6_ADDR_MAPPED));
+ if (sk->sk_family == AF_INET6) {
+ tb2->addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr);
+ tb2->v6_rcv_saddr = sk->sk_v6_rcv_saddr;
+ } else {
+ tb2->addr_type = IPV6_ADDR_MAPPED;
+ ipv6_addr_set_v4mapped(sk->sk_rcv_saddr, &tb2->v6_rcv_saddr);
+ }
+#else
+ tb2->rcv_saddr = sk->sk_rcv_saddr;
+#endif
+ tb2->fastreuse = 0;
+ tb2->fastreuseport = 0;
+ INIT_HLIST_HEAD(&tb2->owners);
+ hlist_add_head(&tb2->node, &head->chain);
+ hlist_add_head(&tb2->bhash_node, &tb->bhash2);
+}
+
+struct inet_bind2_bucket *inet_bind2_bucket_create(struct kmem_cache *cachep,
+ struct net *net,
+ struct inet_bind_hashbucket *head,
+ struct inet_bind_bucket *tb,
+ const struct sock *sk)
+{
+ struct inet_bind2_bucket *tb2 = kmem_cache_alloc(cachep, GFP_ATOMIC);
+
+ if (tb2)
+ inet_bind2_bucket_init(tb2, net, head, tb, sk);
+
+ return tb2;
+}
+
+/* Caller must hold hashbucket lock for this tb with local BH disabled */
+void inet_bind2_bucket_destroy(struct kmem_cache *cachep, struct inet_bind2_bucket *tb)
+{
+ const struct sock *sk;
+
if (hlist_empty(&tb->owners)) {
__hlist_del(&tb->node);
+ __hlist_del(&tb->bhash_node);
kmem_cache_free(cachep, tb);
+ return;
+ }
+
+ if (tb->fastreuse == -1 && tb->fastreuseport == -1)
+ return;
+ sk_for_each_bound(sk, &tb->owners) {
+ if (!sk_is_connect_bind(sk))
+ return;
}
+ tb->fastreuse = -1;
+ tb->fastreuseport = -1;
+}
+
+static bool inet_bind2_bucket_addr_match(const struct inet_bind2_bucket *tb2,
+ const struct sock *sk)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ if (sk->sk_family == AF_INET6)
+ return ipv6_addr_equal(&tb2->v6_rcv_saddr, &sk->sk_v6_rcv_saddr);
+
+ if (tb2->addr_type != IPV6_ADDR_MAPPED)
+ return false;
+#endif
+ return tb2->rcv_saddr == sk->sk_rcv_saddr;
}
void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
- const unsigned short snum)
+ struct inet_bind2_bucket *tb2, unsigned short port)
{
- inet_sk(sk)->num = snum;
- sk_add_bind_node(sk, &tb->owners);
+ inet_sk(sk)->inet_num = port;
inet_csk(sk)->icsk_bind_hash = tb;
+ inet_csk(sk)->icsk_bind2_hash = tb2;
+ sk_add_bind_node(sk, &tb2->owners);
}
/*
* Get rid of any references to a local port held by the given sock.
*/
-static void __inet_put_port(struct inet_hashinfo *hashinfo, struct sock *sk)
+static void __inet_put_port(struct sock *sk)
{
- const int bhash = inet_bhashfn(inet_sk(sk)->num, hashinfo->bhash_size);
- struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash];
+ struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk);
+ struct inet_bind_hashbucket *head, *head2;
+ struct net *net = sock_net(sk);
struct inet_bind_bucket *tb;
+ int bhash;
+
+ bhash = inet_bhashfn(net, inet_sk(sk)->inet_num, hashinfo->bhash_size);
+ head = &hashinfo->bhash[bhash];
+ head2 = inet_bhashfn_portaddr(hashinfo, sk, net, inet_sk(sk)->inet_num);
spin_lock(&head->lock);
tb = inet_csk(sk)->icsk_bind_hash;
- __sk_del_bind_node(sk);
inet_csk(sk)->icsk_bind_hash = NULL;
- inet_sk(sk)->num = 0;
- inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
+ inet_sk(sk)->inet_num = 0;
+ sk->sk_userlocks &= ~SOCK_CONNECT_BIND;
+
+ spin_lock(&head2->lock);
+ if (inet_csk(sk)->icsk_bind2_hash) {
+ struct inet_bind2_bucket *tb2 = inet_csk(sk)->icsk_bind2_hash;
+
+ __sk_del_bind_node(sk);
+ inet_csk(sk)->icsk_bind2_hash = NULL;
+ inet_bind2_bucket_destroy(hashinfo->bind2_bucket_cachep, tb2);
+ }
+ spin_unlock(&head2->lock);
+
+ inet_bind_bucket_destroy(tb);
spin_unlock(&head->lock);
}
-void inet_put_port(struct inet_hashinfo *hashinfo, struct sock *sk)
+void inet_put_port(struct sock *sk)
{
local_bh_disable();
- __inet_put_port(hashinfo, sk);
+ __inet_put_port(sk);
local_bh_enable();
}
-
EXPORT_SYMBOL(inet_put_port);
-/*
- * This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
- * Look, when several writers sleep and reader wakes them up, all but one
- * immediately hit write lock and grab all the cpus. Exclusive sleep solves
- * this, _but_ remember, it adds useless work on UP machines (wake up each
- * exclusive lock release). It should be ifdefed really.
- */
-void inet_listen_wlock(struct inet_hashinfo *hashinfo)
+int __inet_inherit_port(const struct sock *sk, struct sock *child)
{
- write_lock(&hashinfo->lhash_lock);
+ struct inet_hashinfo *table = tcp_get_hashinfo(sk);
+ unsigned short port = inet_sk(child)->inet_num;
+ struct inet_bind_hashbucket *head, *head2;
+ bool created_inet_bind_bucket = false;
+ struct net *net = sock_net(sk);
+ bool update_fastreuse = false;
+ struct inet_bind2_bucket *tb2;
+ struct inet_bind_bucket *tb;
+ int bhash, l3mdev;
- if (atomic_read(&hashinfo->lhash_users)) {
- DEFINE_WAIT(wait);
+ bhash = inet_bhashfn(net, port, table->bhash_size);
+ head = &table->bhash[bhash];
+ head2 = inet_bhashfn_portaddr(table, child, net, port);
- for (;;) {
- prepare_to_wait_exclusive(&hashinfo->lhash_wait,
- &wait, TASK_UNINTERRUPTIBLE);
- if (!atomic_read(&hashinfo->lhash_users))
+ spin_lock(&head->lock);
+ spin_lock(&head2->lock);
+ tb = inet_csk(sk)->icsk_bind_hash;
+ tb2 = inet_csk(sk)->icsk_bind2_hash;
+ if (unlikely(!tb || !tb2)) {
+ spin_unlock(&head2->lock);
+ spin_unlock(&head->lock);
+ return -ENOENT;
+ }
+ if (tb->port != port) {
+ l3mdev = inet_sk_bound_l3mdev(sk);
+
+ /* NOTE: using tproxy and redirecting skbs to a proxy
+ * on a different listener port breaks the assumption
+ * that the listener socket's icsk_bind_hash is the same
+ * as that of the child socket. We have to look up or
+ * create a new bind bucket for the child here. */
+ inet_bind_bucket_for_each(tb, &head->chain) {
+ if (inet_bind_bucket_match(tb, net, port, l3mdev))
break;
- write_unlock_bh(&hashinfo->lhash_lock);
- schedule();
- write_lock_bh(&hashinfo->lhash_lock);
}
+ if (!tb) {
+ tb = inet_bind_bucket_create(table->bind_bucket_cachep,
+ net, head, port, l3mdev);
+ if (!tb) {
+ spin_unlock(&head2->lock);
+ spin_unlock(&head->lock);
+ return -ENOMEM;
+ }
+ created_inet_bind_bucket = true;
+ }
+ update_fastreuse = true;
+
+ goto bhash2_find;
+ } else if (!inet_bind2_bucket_addr_match(tb2, child)) {
+ l3mdev = inet_sk_bound_l3mdev(sk);
+
+bhash2_find:
+ tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, child);
+ if (!tb2) {
+ tb2 = inet_bind2_bucket_create(table->bind2_bucket_cachep,
+ net, head2, tb, child);
+ if (!tb2)
+ goto error;
+ }
+ }
+ if (update_fastreuse)
+ inet_csk_update_fastreuse(child, tb, tb2);
+ inet_bind_hash(child, tb, tb2, port);
+ spin_unlock(&head2->lock);
+ spin_unlock(&head->lock);
+
+ return 0;
+
+error:
+ if (created_inet_bind_bucket)
+ inet_bind_bucket_destroy(tb);
+ spin_unlock(&head2->lock);
+ spin_unlock(&head->lock);
+ return -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(__inet_inherit_port);
+
+static struct inet_listen_hashbucket *
+inet_lhash2_bucket_sk(struct inet_hashinfo *h, struct sock *sk)
+{
+ u32 hash;
+
+#if IS_ENABLED(CONFIG_IPV6)
+ if (sk->sk_family == AF_INET6)
+ hash = ipv6_portaddr_hash(sock_net(sk),
+ &sk->sk_v6_rcv_saddr,
+ inet_sk(sk)->inet_num);
+ else
+#endif
+ hash = ipv4_portaddr_hash(sock_net(sk),
+ inet_sk(sk)->inet_rcv_saddr,
+ inet_sk(sk)->inet_num);
+ return inet_lhash2_bucket(h, hash);
+}
+
+static inline int compute_score(struct sock *sk, const struct net *net,
+ const unsigned short hnum, const __be32 daddr,
+ const int dif, const int sdif)
+{
+ int score = -1;
+
+ if (net_eq(sock_net(sk), net) && sk->sk_num == hnum &&
+ !ipv6_only_sock(sk)) {
+ if (sk->sk_rcv_saddr != daddr)
+ return -1;
- finish_wait(&hashinfo->lhash_wait, &wait);
+ if (!inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif))
+ return -1;
+ score = sk->sk_bound_dev_if ? 2 : 1;
+
+ if (sk->sk_family == PF_INET)
+ score++;
+ if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id())
+ score++;
}
+ return score;
}
-EXPORT_SYMBOL(inet_listen_wlock);
+/**
+ * inet_lookup_reuseport() - execute reuseport logic on AF_INET socket if necessary.
+ * @net: network namespace.
+ * @sk: AF_INET socket, must be in TCP_LISTEN state for TCP or TCP_CLOSE for UDP.
+ * @skb: context for a potential SK_REUSEPORT program.
+ * @doff: header offset.
+ * @saddr: source address.
+ * @sport: source port.
+ * @daddr: destination address.
+ * @hnum: destination port in host byte order.
+ * @ehashfn: hash function used to generate the fallback hash.
+ *
+ * Return: NULL if sk doesn't have SO_REUSEPORT set, otherwise a pointer to
+ * the selected sock or an error.
+ */
+struct sock *inet_lookup_reuseport(const struct net *net, struct sock *sk,
+ struct sk_buff *skb, int doff,
+ __be32 saddr, __be16 sport,
+ __be32 daddr, unsigned short hnum,
+ inet_ehashfn_t *ehashfn)
+{
+ struct sock *reuse_sk = NULL;
+ u32 phash;
+
+ if (sk->sk_reuseport) {
+ phash = INDIRECT_CALL_2(ehashfn, udp_ehashfn, inet_ehashfn,
+ net, daddr, hnum, saddr, sport);
+ reuse_sk = reuseport_select_sock(sk, phash, skb, doff);
+ }
+ return reuse_sk;
+}
+EXPORT_SYMBOL_GPL(inet_lookup_reuseport);
/*
- * Don't inline this cruft. Here are some nice properties to exploit here. The
- * BSD API does not allow a listening sock to specify the remote port nor the
+ * Here are some nice properties to exploit here. The BSD API
+ * does not allow a listening sock to specify the remote port nor the
* remote address for the connection. So always assume those are both
* wildcarded during the search since they can never be otherwise.
*/
-static struct sock *inet_lookup_listener_slow(const struct hlist_head *head,
- const __be32 daddr,
- const unsigned short hnum,
- const int dif)
-{
- struct sock *result = NULL, *sk;
- const struct hlist_node *node;
- int hiscore = -1;
-
- sk_for_each(sk, node, head) {
- const struct inet_sock *inet = inet_sk(sk);
-
- if (inet->num == hnum && !ipv6_only_sock(sk)) {
- const __be32 rcv_saddr = inet->rcv_saddr;
- int score = sk->sk_family == PF_INET ? 1 : 0;
-
- if (rcv_saddr) {
- if (rcv_saddr != daddr)
- continue;
- score += 2;
- }
- if (sk->sk_bound_dev_if) {
- if (sk->sk_bound_dev_if != dif)
- continue;
- score += 2;
- }
- if (score == 5)
- return sk;
- if (score > hiscore) {
- hiscore = score;
- result = sk;
- }
+
+/* called with rcu_read_lock() : No refcount taken on the socket */
+static struct sock *inet_lhash2_lookup(const struct net *net,
+ struct inet_listen_hashbucket *ilb2,
+ struct sk_buff *skb, int doff,
+ const __be32 saddr, __be16 sport,
+ const __be32 daddr, const unsigned short hnum,
+ const int dif, const int sdif)
+{
+ struct sock *sk, *result = NULL;
+ struct hlist_nulls_node *node;
+ int score, hiscore = 0;
+
+ sk_nulls_for_each_rcu(sk, node, &ilb2->nulls_head) {
+ score = compute_score(sk, net, hnum, daddr, dif, sdif);
+ if (score > hiscore) {
+ result = inet_lookup_reuseport(net, sk, skb, doff,
+ saddr, sport, daddr, hnum, inet_ehashfn);
+ if (result)
+ return result;
+
+ result = sk;
+ hiscore = score;
}
}
+
return result;
}
-/* Optimize the common listener case. */
-struct sock *__inet_lookup_listener(struct inet_hashinfo *hashinfo,
- const __be32 daddr, const unsigned short hnum,
- const int dif)
+struct sock *inet_lookup_run_sk_lookup(const struct net *net,
+ int protocol,
+ struct sk_buff *skb, int doff,
+ __be32 saddr, __be16 sport,
+ __be32 daddr, u16 hnum, const int dif,
+ inet_ehashfn_t *ehashfn)
{
- struct sock *sk = NULL;
- const struct hlist_head *head;
+ struct sock *sk, *reuse_sk;
+ bool no_reuseport;
- read_lock(&hashinfo->lhash_lock);
- head = &hashinfo->listening_hash[inet_lhashfn(hnum)];
- if (!hlist_empty(head)) {
- const struct inet_sock *inet = inet_sk((sk = __sk_head(head)));
+ no_reuseport = bpf_sk_lookup_run_v4(net, protocol, saddr, sport,
+ daddr, hnum, dif, &sk);
+ if (no_reuseport || IS_ERR_OR_NULL(sk))
+ return sk;
- if (inet->num == hnum && !sk->sk_node.next &&
- (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
- (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
- !sk->sk_bound_dev_if)
- goto sherry_cache;
- sk = inet_lookup_listener_slow(head, daddr, hnum, dif);
+ reuse_sk = inet_lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum,
+ ehashfn);
+ if (reuse_sk)
+ sk = reuse_sk;
+ return sk;
+}
+
+struct sock *__inet_lookup_listener(const struct net *net,
+ struct sk_buff *skb, int doff,
+ const __be32 saddr, __be16 sport,
+ const __be32 daddr, const unsigned short hnum,
+ const int dif, const int sdif)
+{
+ struct inet_listen_hashbucket *ilb2;
+ struct inet_hashinfo *hashinfo;
+ struct sock *result = NULL;
+ unsigned int hash2;
+
+ /* Lookup redirect from BPF */
+ if (static_branch_unlikely(&bpf_sk_lookup_enabled)) {
+ result = inet_lookup_run_sk_lookup(net, IPPROTO_TCP, skb, doff,
+ saddr, sport, daddr, hnum, dif,
+ inet_ehashfn);
+ if (result)
+ goto done;
}
- if (sk) {
-sherry_cache:
- sock_hold(sk);
+
+ hashinfo = net->ipv4.tcp_death_row.hashinfo;
+ hash2 = ipv4_portaddr_hash(net, daddr, hnum);
+ ilb2 = inet_lhash2_bucket(hashinfo, hash2);
+
+ result = inet_lhash2_lookup(net, ilb2, skb, doff,
+ saddr, sport, daddr, hnum,
+ dif, sdif);
+ if (result)
+ goto done;
+
+ /* Lookup lhash2 with INADDR_ANY */
+ hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
+ ilb2 = inet_lhash2_bucket(hashinfo, hash2);
+
+ result = inet_lhash2_lookup(net, ilb2, skb, doff,
+ saddr, sport, htonl(INADDR_ANY), hnum,
+ dif, sdif);
+done:
+ if (IS_ERR(result))
+ return NULL;
+ return result;
+}
+EXPORT_SYMBOL_GPL(__inet_lookup_listener);
+
+/* All sockets share common refcount, but have different destructors */
+void sock_gen_put(struct sock *sk)
+{
+ if (!refcount_dec_and_test(&sk->sk_refcnt))
+ return;
+
+ if (sk->sk_state == TCP_TIME_WAIT)
+ inet_twsk_free(inet_twsk(sk));
+ else if (sk->sk_state == TCP_NEW_SYN_RECV)
+ reqsk_free(inet_reqsk(sk));
+ else
+ sk_free(sk);
+}
+EXPORT_SYMBOL_GPL(sock_gen_put);
+
+void sock_edemux(struct sk_buff *skb)
+{
+ sock_gen_put(skb->sk);
+}
+EXPORT_SYMBOL(sock_edemux);
+
+struct sock *__inet_lookup_established(const struct net *net,
+ const __be32 saddr, const __be16 sport,
+ const __be32 daddr, const u16 hnum,
+ const int dif, const int sdif)
+{
+ const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
+ INET_ADDR_COOKIE(acookie, saddr, daddr);
+ const struct hlist_nulls_node *node;
+ struct inet_ehash_bucket *head;
+ struct inet_hashinfo *hashinfo;
+ unsigned int hash, slot;
+ struct sock *sk;
+
+ hashinfo = net->ipv4.tcp_death_row.hashinfo;
+ hash = inet_ehashfn(net, daddr, hnum, saddr, sport);
+ slot = hash & hashinfo->ehash_mask;
+ head = &hashinfo->ehash[slot];
+
+begin:
+ sk_nulls_for_each_rcu(sk, node, &head->chain) {
+ if (sk->sk_hash != hash)
+ continue;
+ if (likely(inet_match(net, sk, acookie, ports, dif, sdif))) {
+ if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt)))
+ goto out;
+ if (unlikely(!inet_match(net, sk, acookie,
+ ports, dif, sdif))) {
+ sock_gen_put(sk);
+ goto begin;
+ }
+ goto found;
+ }
}
- read_unlock(&hashinfo->lhash_lock);
+ /*
+ * if the nulls value we got at the end of this lookup is
+ * not the expected one, we must restart lookup.
+ * We probably met an item that was moved to another chain.
+ */
+ if (get_nulls_value(node) != slot)
+ goto begin;
+out:
+ sk = NULL;
+found:
return sk;
}
-EXPORT_SYMBOL_GPL(__inet_lookup_listener);
+EXPORT_SYMBOL_GPL(__inet_lookup_established);
/* called with local bh disabled */
static int __inet_check_established(struct inet_timewait_death_row *death_row,
struct sock *sk, __u16 lport,
- struct inet_timewait_sock **twp)
+ struct inet_timewait_sock **twp,
+ bool rcu_lookup,
+ u32 hash)
{
struct inet_hashinfo *hinfo = death_row->hashinfo;
struct inet_sock *inet = inet_sk(sk);
- __be32 daddr = inet->rcv_saddr;
- __be32 saddr = inet->daddr;
+ __be32 daddr = inet->inet_rcv_saddr;
+ __be32 saddr = inet->inet_daddr;
int dif = sk->sk_bound_dev_if;
- INET_ADDR_COOKIE(acookie, saddr, daddr)
- const __portpair ports = INET_COMBINED_PORTS(inet->dport, lport);
- unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport);
+ struct net *net = sock_net(sk);
+ int sdif = l3mdev_master_ifindex_by_index(net, dif);
+ INET_ADDR_COOKIE(acookie, saddr, daddr);
+ const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport);
struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
+ struct inet_timewait_sock *tw = NULL;
+ const struct hlist_nulls_node *node;
struct sock *sk2;
- const struct hlist_node *node;
- struct inet_timewait_sock *tw;
+ spinlock_t *lock;
- prefetch(head->chain.first);
- write_lock(&head->lock);
-
- /* Check TIME-WAIT sockets first. */
- sk_for_each(sk2, node, &(head + hinfo->ehash_size)->chain) {
- tw = inet_twsk(sk2);
-
- if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) {
- if (twsk_unique(sk, sk2, twp))
- goto unique;
- else
- goto not_unique;
+ if (rcu_lookup) {
+ sk_nulls_for_each(sk2, node, &head->chain) {
+ if (sk2->sk_hash != hash ||
+ !inet_match(net, sk2, acookie, ports, dif, sdif))
+ continue;
+ if (sk2->sk_state == TCP_TIME_WAIT)
+ break;
+ return -EADDRNOTAVAIL;
}
+ return 0;
}
- tw = NULL;
- /* And established part... */
- sk_for_each(sk2, node, &head->chain) {
- if (INET_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif))
+ lock = inet_ehash_lockp(hinfo, hash);
+ spin_lock(lock);
+
+ sk_nulls_for_each(sk2, node, &head->chain) {
+ if (sk2->sk_hash != hash)
+ continue;
+
+ if (likely(inet_match(net, sk2, acookie, ports, dif, sdif))) {
+ if (sk2->sk_state == TCP_TIME_WAIT) {
+ tw = inet_twsk(sk2);
+ if (tcp_twsk_unique(sk, sk2, twp))
+ break;
+ }
goto not_unique;
+ }
}
-unique:
/* Must record num and sport now. Otherwise we will see
- * in hash table socket with a funny identity. */
- inet->num = lport;
- inet->sport = htons(lport);
+ * in hash table socket with a funny identity.
+ */
+ inet->inet_num = lport;
+ inet->inet_sport = htons(lport);
sk->sk_hash = hash;
- BUG_TRAP(sk_unhashed(sk));
- __sk_add_node(sk, &head->chain);
- sock_prot_inc_use(sk->sk_prot);
- write_unlock(&head->lock);
+ WARN_ON(!sk_unhashed(sk));
+ __sk_nulls_add_node_rcu(sk, &head->chain);
+ if (tw) {
+ sk_nulls_del_node_init_rcu((struct sock *)tw);
+ __NET_INC_STATS(net, LINUX_MIB_TIMEWAITRECYCLED);
+ }
+ spin_unlock(lock);
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
if (twp) {
*twp = tw;
- NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
} else if (tw) {
/* Silly. Should hash-dance instead... */
- inet_twsk_deschedule(tw, death_row);
- NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
-
- inet_twsk_put(tw);
+ inet_twsk_deschedule_put(tw);
}
-
return 0;
not_unique:
- write_unlock(&head->lock);
+ spin_unlock(lock);
return -EADDRNOTAVAIL;
}
-static inline u32 inet_sk_port_offset(const struct sock *sk)
+static u64 inet_sk_port_offset(const struct sock *sk)
{
const struct inet_sock *inet = inet_sk(sk);
- return secure_ipv4_port_ephemeral(inet->rcv_saddr, inet->daddr,
- inet->dport);
+
+ return secure_ipv4_port_ephemeral(inet->inet_rcv_saddr,
+ inet->inet_daddr,
+ inet->inet_dport);
}
-/*
- * Bind a port for a connect operation and hash it.
+/* Searches for an exsiting socket in the ehash bucket list.
+ * Returns true if found, false otherwise.
*/
-int inet_hash_connect(struct inet_timewait_death_row *death_row,
- struct sock *sk)
+static bool inet_ehash_lookup_by_sk(struct sock *sk,
+ struct hlist_nulls_head *list)
{
- struct inet_hashinfo *hinfo = death_row->hashinfo;
- const unsigned short snum = inet_sk(sk)->num;
- struct inet_bind_hashbucket *head;
- struct inet_bind_bucket *tb;
- int ret;
-
- if (!snum) {
- int low = sysctl_local_port_range[0];
- int high = sysctl_local_port_range[1];
- int range = high - low;
- int i;
- int port;
- static u32 hint;
- u32 offset = hint + inet_sk_port_offset(sk);
- struct hlist_node *node;
- struct inet_timewait_sock *tw = NULL;
-
- local_bh_disable();
- for (i = 1; i <= range; i++) {
- port = low + (i + offset) % range;
- head = &hinfo->bhash[inet_bhashfn(port, hinfo->bhash_size)];
- spin_lock(&head->lock);
-
- /* Does not bother with rcv_saddr checks,
- * because the established check is already
- * unique enough.
- */
- inet_bind_bucket_for_each(tb, node, &head->chain) {
- if (tb->port == port) {
- BUG_TRAP(!hlist_empty(&tb->owners));
- if (tb->fastreuse >= 0)
- goto next_port;
- if (!__inet_check_established(death_row,
- sk, port,
- &tw))
- goto ok;
- goto next_port;
- }
- }
-
- tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, head, port);
- if (!tb) {
- spin_unlock(&head->lock);
- break;
- }
- tb->fastreuse = -1;
- goto ok;
-
- next_port:
- spin_unlock(&head->lock);
- }
- local_bh_enable();
-
- return -EADDRNOTAVAIL;
+ const __portpair ports = INET_COMBINED_PORTS(sk->sk_dport, sk->sk_num);
+ const int sdif = sk->sk_bound_dev_if;
+ const int dif = sk->sk_bound_dev_if;
+ const struct hlist_nulls_node *node;
+ struct net *net = sock_net(sk);
+ struct sock *esk;
-ok:
- hint += i;
-
- /* Head lock still held and bh's disabled */
- inet_bind_hash(sk, tb, port);
- if (sk_unhashed(sk)) {
- inet_sk(sk)->sport = htons(port);
- __inet_hash(hinfo, sk, 0);
- }
- spin_unlock(&head->lock);
-
- if (tw) {
- inet_twsk_deschedule(tw, death_row);
- inet_twsk_put(tw);
- }
-
- ret = 0;
- goto out;
- }
-
- head = &hinfo->bhash[inet_bhashfn(snum, hinfo->bhash_size)];
- tb = inet_csk(sk)->icsk_bind_hash;
- spin_lock_bh(&head->lock);
- if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
- __inet_hash(hinfo, sk, 0);
- spin_unlock_bh(&head->lock);
+ INET_ADDR_COOKIE(acookie, sk->sk_daddr, sk->sk_rcv_saddr);
+
+ sk_nulls_for_each_rcu(esk, node, list) {
+ if (esk->sk_hash != sk->sk_hash)
+ continue;
+ if (sk->sk_family == AF_INET) {
+ if (unlikely(inet_match(net, esk, acookie,
+ ports, dif, sdif))) {
+ return true;
+ }
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (sk->sk_family == AF_INET6) {
+ if (unlikely(inet6_match(net, esk,
+ &sk->sk_v6_daddr,
+ &sk->sk_v6_rcv_saddr,
+ ports, dif, sdif))) {
+ return true;
+ }
+ }
+#endif
+ }
+ return false;
+}
+
+/* Insert a socket into ehash, and eventually remove another one
+ * (The another one can be a SYN_RECV or TIMEWAIT)
+ * If an existing socket already exists, socket sk is not inserted,
+ * and sets found_dup_sk parameter to true.
+ */
+bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk)
+{
+ struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk);
+ struct inet_ehash_bucket *head;
+ struct hlist_nulls_head *list;
+ spinlock_t *lock;
+ bool ret = true;
+
+ WARN_ON_ONCE(!sk_unhashed(sk));
+
+ sk->sk_hash = sk_ehashfn(sk);
+ head = inet_ehash_bucket(hashinfo, sk->sk_hash);
+ list = &head->chain;
+ lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
+
+ spin_lock(lock);
+ if (osk) {
+ WARN_ON_ONCE(sk->sk_hash != osk->sk_hash);
+ ret = sk_nulls_replace_node_init_rcu(osk, sk);
+ goto unlock;
+ }
+
+ if (found_dup_sk) {
+ *found_dup_sk = inet_ehash_lookup_by_sk(sk, list);
+ if (*found_dup_sk)
+ ret = false;
+ }
+
+ if (ret)
+ __sk_nulls_add_node_rcu(sk, list);
+
+unlock:
+ spin_unlock(lock);
+
+ return ret;
+}
+
+bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk)
+{
+ bool ok = inet_ehash_insert(sk, osk, found_dup_sk);
+
+ if (ok) {
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+ } else {
+ tcp_orphan_count_inc();
+ inet_sk_set_state(sk, TCP_CLOSE);
+ sock_set_flag(sk, SOCK_DEAD);
+ inet_csk_destroy_sock(sk);
+ }
+ return ok;
+}
+EXPORT_IPV6_MOD(inet_ehash_nolisten);
+
+static int inet_reuseport_add_sock(struct sock *sk,
+ struct inet_listen_hashbucket *ilb)
+{
+ struct inet_bind_bucket *tb = inet_csk(sk)->icsk_bind_hash;
+ const struct hlist_nulls_node *node;
+ kuid_t uid = sk_uid(sk);
+ struct sock *sk2;
+
+ sk_nulls_for_each_rcu(sk2, node, &ilb->nulls_head) {
+ if (sk2 != sk &&
+ sk2->sk_family == sk->sk_family &&
+ ipv6_only_sock(sk2) == ipv6_only_sock(sk) &&
+ sk2->sk_bound_dev_if == sk->sk_bound_dev_if &&
+ inet_csk(sk2)->icsk_bind_hash == tb &&
+ sk2->sk_reuseport && uid_eq(uid, sk_uid(sk2)) &&
+ inet_rcv_saddr_equal(sk, sk2, false))
+ return reuseport_add_sock(sk, sk2,
+ inet_rcv_saddr_any(sk));
+ }
+
+ return reuseport_alloc(sk, inet_rcv_saddr_any(sk));
+}
+
+int inet_hash(struct sock *sk)
+{
+ struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk);
+ struct inet_listen_hashbucket *ilb2;
+ int err = 0;
+
+ if (sk->sk_state == TCP_CLOSE)
return 0;
+
+ if (sk->sk_state != TCP_LISTEN) {
+ local_bh_disable();
+ inet_ehash_nolisten(sk, NULL, NULL);
+ local_bh_enable();
+ return 0;
+ }
+ WARN_ON(!sk_unhashed(sk));
+ ilb2 = inet_lhash2_bucket_sk(hashinfo, sk);
+
+ spin_lock(&ilb2->lock);
+ if (sk->sk_reuseport) {
+ err = inet_reuseport_add_sock(sk, ilb2);
+ if (err)
+ goto unlock;
+ }
+ sock_set_flag(sk, SOCK_RCU_FREE);
+ if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport &&
+ sk->sk_family == AF_INET6)
+ __sk_nulls_add_node_tail_rcu(sk, &ilb2->nulls_head);
+ else
+ __sk_nulls_add_node_rcu(sk, &ilb2->nulls_head);
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+unlock:
+ spin_unlock(&ilb2->lock);
+
+ return err;
+}
+EXPORT_IPV6_MOD(inet_hash);
+
+void inet_unhash(struct sock *sk)
+{
+ struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk);
+
+ if (sk_unhashed(sk))
+ return;
+
+ sock_rps_delete_flow(sk);
+ if (sk->sk_state == TCP_LISTEN) {
+ struct inet_listen_hashbucket *ilb2;
+
+ ilb2 = inet_lhash2_bucket_sk(hashinfo, sk);
+ /* Don't disable bottom halves while acquiring the lock to
+ * avoid circular locking dependency on PREEMPT_RT.
+ */
+ spin_lock(&ilb2->lock);
+ if (rcu_access_pointer(sk->sk_reuseport_cb))
+ reuseport_stop_listen_sock(sk);
+
+ __sk_nulls_del_node_init_rcu(sk);
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+ spin_unlock(&ilb2->lock);
} else {
- spin_unlock(&head->lock);
- /* No definite answer... Walk to established hash table */
- ret = __inet_check_established(death_row, sk, snum, NULL);
-out:
+ spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
+
+ spin_lock_bh(lock);
+ __sk_nulls_del_node_init_rcu(sk);
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+ spin_unlock_bh(lock);
+ }
+}
+EXPORT_IPV6_MOD(inet_unhash);
+
+static bool inet_bind2_bucket_match(const struct inet_bind2_bucket *tb,
+ const struct net *net, unsigned short port,
+ int l3mdev, const struct sock *sk)
+{
+ if (!net_eq(ib2_net(tb), net) || tb->port != port ||
+ tb->l3mdev != l3mdev)
+ return false;
+
+ return inet_bind2_bucket_addr_match(tb, sk);
+}
+
+bool inet_bind2_bucket_match_addr_any(const struct inet_bind2_bucket *tb, const struct net *net,
+ unsigned short port, int l3mdev, const struct sock *sk)
+{
+ if (!net_eq(ib2_net(tb), net) || tb->port != port ||
+ tb->l3mdev != l3mdev)
+ return false;
+
+#if IS_ENABLED(CONFIG_IPV6)
+ if (tb->addr_type == IPV6_ADDR_ANY)
+ return true;
+
+ if (tb->addr_type != IPV6_ADDR_MAPPED)
+ return false;
+
+ if (sk->sk_family == AF_INET6 &&
+ !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr))
+ return false;
+#endif
+ return tb->rcv_saddr == 0;
+}
+
+/* The socket's bhash2 hashbucket spinlock must be held when this is called */
+struct inet_bind2_bucket *
+inet_bind2_bucket_find(const struct inet_bind_hashbucket *head, const struct net *net,
+ unsigned short port, int l3mdev, const struct sock *sk)
+{
+ struct inet_bind2_bucket *bhash2 = NULL;
+
+ inet_bind_bucket_for_each(bhash2, &head->chain)
+ if (inet_bind2_bucket_match(bhash2, net, port, l3mdev, sk))
+ break;
+
+ return bhash2;
+}
+
+struct inet_bind_hashbucket *
+inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, int port)
+{
+ struct inet_hashinfo *hinfo = tcp_get_hashinfo(sk);
+ u32 hash;
+
+#if IS_ENABLED(CONFIG_IPV6)
+ if (sk->sk_family == AF_INET6)
+ hash = ipv6_portaddr_hash(net, &in6addr_any, port);
+ else
+#endif
+ hash = ipv4_portaddr_hash(net, 0, port);
+
+ return &hinfo->bhash2[hash & (hinfo->bhash_size - 1)];
+}
+
+static void inet_update_saddr(struct sock *sk, void *saddr, int family)
+{
+ if (family == AF_INET) {
+ inet_sk(sk)->inet_saddr = *(__be32 *)saddr;
+ sk_rcv_saddr_set(sk, inet_sk(sk)->inet_saddr);
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ else {
+ sk->sk_v6_rcv_saddr = *(struct in6_addr *)saddr;
+ }
+#endif
+}
+
+static int __inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family, bool reset)
+{
+ struct inet_hashinfo *hinfo = tcp_get_hashinfo(sk);
+ struct inet_bind_hashbucket *head, *head2;
+ struct inet_bind2_bucket *tb2, *new_tb2;
+ int l3mdev = inet_sk_bound_l3mdev(sk);
+ int port = inet_sk(sk)->inet_num;
+ struct net *net = sock_net(sk);
+ int bhash;
+
+ if (!inet_csk(sk)->icsk_bind2_hash) {
+ /* Not bind()ed before. */
+ if (reset)
+ inet_reset_saddr(sk);
+ else
+ inet_update_saddr(sk, saddr, family);
+
+ return 0;
+ }
+
+ /* Allocate a bind2 bucket ahead of time to avoid permanently putting
+ * the bhash2 table in an inconsistent state if a new tb2 bucket
+ * allocation fails.
+ */
+ new_tb2 = kmem_cache_alloc(hinfo->bind2_bucket_cachep, GFP_ATOMIC);
+ if (!new_tb2) {
+ if (reset) {
+ /* The (INADDR_ANY, port) bucket might have already
+ * been freed, then we cannot fixup icsk_bind2_hash,
+ * so we give up and unlink sk from bhash/bhash2 not
+ * to leave inconsistency in bhash2.
+ */
+ inet_put_port(sk);
+ inet_reset_saddr(sk);
+ }
+
+ return -ENOMEM;
+ }
+
+ bhash = inet_bhashfn(net, port, hinfo->bhash_size);
+ head = &hinfo->bhash[bhash];
+ head2 = inet_bhashfn_portaddr(hinfo, sk, net, port);
+
+ /* If we change saddr locklessly, another thread
+ * iterating over bhash might see corrupted address.
+ */
+ spin_lock_bh(&head->lock);
+
+ spin_lock(&head2->lock);
+ __sk_del_bind_node(sk);
+ inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, inet_csk(sk)->icsk_bind2_hash);
+ spin_unlock(&head2->lock);
+
+ if (reset)
+ inet_reset_saddr(sk);
+ else
+ inet_update_saddr(sk, saddr, family);
+
+ head2 = inet_bhashfn_portaddr(hinfo, sk, net, port);
+
+ spin_lock(&head2->lock);
+ tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk);
+ if (!tb2) {
+ tb2 = new_tb2;
+ inet_bind2_bucket_init(tb2, net, head2, inet_csk(sk)->icsk_bind_hash, sk);
+ if (sk_is_connect_bind(sk)) {
+ tb2->fastreuse = -1;
+ tb2->fastreuseport = -1;
+ }
+ }
+ inet_csk(sk)->icsk_bind2_hash = tb2;
+ sk_add_bind_node(sk, &tb2->owners);
+ spin_unlock(&head2->lock);
+
+ spin_unlock_bh(&head->lock);
+
+ if (tb2 != new_tb2)
+ kmem_cache_free(hinfo->bind2_bucket_cachep, new_tb2);
+
+ return 0;
+}
+
+int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family)
+{
+ return __inet_bhash2_update_saddr(sk, saddr, family, false);
+}
+EXPORT_IPV6_MOD(inet_bhash2_update_saddr);
+
+void inet_bhash2_reset_saddr(struct sock *sk)
+{
+ if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
+ __inet_bhash2_update_saddr(sk, NULL, 0, true);
+}
+EXPORT_IPV6_MOD(inet_bhash2_reset_saddr);
+
+/* RFC 6056 3.3.4. Algorithm 4: Double-Hash Port Selection Algorithm
+ * Note that we use 32bit integers (vs RFC 'short integers')
+ * because 2^16 is not a multiple of num_ephemeral and this
+ * property might be used by clever attacker.
+ *
+ * RFC claims using TABLE_LENGTH=10 buckets gives an improvement, though
+ * attacks were since demonstrated, thus we use 65536 by default instead
+ * to really give more isolation and privacy, at the expense of 256kB
+ * of kernel memory.
+ */
+#define INET_TABLE_PERTURB_SIZE (1 << CONFIG_INET_TABLE_PERTURB_ORDER)
+static u32 *table_perturb;
+
+int __inet_hash_connect(struct inet_timewait_death_row *death_row,
+ struct sock *sk, u64 port_offset,
+ u32 hash_port0,
+ int (*check_established)(struct inet_timewait_death_row *,
+ struct sock *, __u16, struct inet_timewait_sock **,
+ bool rcu_lookup, u32 hash))
+{
+ struct inet_hashinfo *hinfo = death_row->hashinfo;
+ struct inet_bind_hashbucket *head, *head2;
+ struct inet_timewait_sock *tw = NULL;
+ int port = inet_sk(sk)->inet_num;
+ struct net *net = sock_net(sk);
+ struct inet_bind2_bucket *tb2;
+ struct inet_bind_bucket *tb;
+ bool tb_created = false;
+ u32 remaining, offset;
+ int ret, i, low, high;
+ bool local_ports;
+ int step, l3mdev;
+ u32 index;
+
+ if (port) {
+ local_bh_disable();
+ ret = check_established(death_row, sk, port, NULL, false,
+ hash_port0 + port);
local_bh_enable();
return ret;
}
+
+ l3mdev = inet_sk_bound_l3mdev(sk);
+
+ local_ports = inet_sk_get_local_port_range(sk, &low, &high);
+ step = local_ports ? 1 : 2;
+
+ high++; /* [32768, 60999] -> [32768, 61000[ */
+ remaining = high - low;
+ if (!local_ports && remaining > 1)
+ remaining &= ~1U;
+
+ get_random_sleepable_once(table_perturb,
+ INET_TABLE_PERTURB_SIZE * sizeof(*table_perturb));
+ index = port_offset & (INET_TABLE_PERTURB_SIZE - 1);
+
+ offset = READ_ONCE(table_perturb[index]) + (port_offset >> 32);
+ offset %= remaining;
+
+ /* In first pass we try ports of @low parity.
+ * inet_csk_get_port() does the opposite choice.
+ */
+ if (!local_ports)
+ offset &= ~1U;
+other_parity_scan:
+ port = low + offset;
+ for (i = 0; i < remaining; i += step, port += step) {
+ if (unlikely(port >= high))
+ port -= remaining;
+ if (inet_is_local_reserved_port(net, port))
+ continue;
+ head = &hinfo->bhash[inet_bhashfn(net, port,
+ hinfo->bhash_size)];
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(tb, &head->chain, node) {
+ if (!inet_bind_bucket_match(tb, net, port, l3mdev))
+ continue;
+ if (tb->fastreuse >= 0 || tb->fastreuseport >= 0) {
+ rcu_read_unlock();
+ goto next_port;
+ }
+ if (!check_established(death_row, sk, port, &tw, true,
+ hash_port0 + port))
+ break;
+ rcu_read_unlock();
+ goto next_port;
+ }
+ rcu_read_unlock();
+
+ spin_lock_bh(&head->lock);
+
+ /* Does not bother with rcv_saddr checks, because
+ * the established check is already unique enough.
+ */
+ inet_bind_bucket_for_each(tb, &head->chain) {
+ if (inet_bind_bucket_match(tb, net, port, l3mdev)) {
+ if (tb->fastreuse >= 0 ||
+ tb->fastreuseport >= 0)
+ goto next_port_unlock;
+ WARN_ON(hlist_empty(&tb->bhash2));
+ if (!check_established(death_row, sk,
+ port, &tw, false,
+ hash_port0 + port))
+ goto ok;
+ goto next_port_unlock;
+ }
+ }
+
+ tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
+ net, head, port, l3mdev);
+ if (!tb) {
+ spin_unlock_bh(&head->lock);
+ return -ENOMEM;
+ }
+ tb_created = true;
+ tb->fastreuse = -1;
+ tb->fastreuseport = -1;
+ goto ok;
+next_port_unlock:
+ spin_unlock_bh(&head->lock);
+next_port:
+ cond_resched();
+ }
+
+ if (!local_ports) {
+ offset++;
+ if ((offset & 1) && remaining > 1)
+ goto other_parity_scan;
+ }
+ return -EADDRNOTAVAIL;
+
+ok:
+ /* Find the corresponding tb2 bucket since we need to
+ * add the socket to the bhash2 table as well
+ */
+ head2 = inet_bhashfn_portaddr(hinfo, sk, net, port);
+ spin_lock(&head2->lock);
+
+ tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk);
+ if (!tb2) {
+ tb2 = inet_bind2_bucket_create(hinfo->bind2_bucket_cachep, net,
+ head2, tb, sk);
+ if (!tb2)
+ goto error;
+ tb2->fastreuse = -1;
+ tb2->fastreuseport = -1;
+ }
+
+ /* Here we want to add a little bit of randomness to the next source
+ * port that will be chosen. We use a max() with a random here so that
+ * on low contention the randomness is maximal and on high contention
+ * it may be inexistent.
+ */
+ i = max_t(int, i, get_random_u32_below(8) * step);
+ WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + step);
+
+ /* Head lock still held and bh's disabled */
+ inet_bind_hash(sk, tb, tb2, port);
+ sk->sk_userlocks |= SOCK_CONNECT_BIND;
+
+ if (sk_unhashed(sk)) {
+ inet_sk(sk)->inet_sport = htons(port);
+ inet_ehash_nolisten(sk, (struct sock *)tw, NULL);
+ }
+ if (tw)
+ inet_twsk_bind_unhash(tw, hinfo);
+
+ spin_unlock(&head2->lock);
+ spin_unlock(&head->lock);
+
+ if (tw)
+ inet_twsk_deschedule_put(tw);
+ local_bh_enable();
+ return 0;
+
+error:
+ if (sk_hashed(sk)) {
+ spinlock_t *lock = inet_ehash_lockp(hinfo, sk->sk_hash);
+
+ sock_prot_inuse_add(net, sk->sk_prot, -1);
+
+ spin_lock(lock);
+ __sk_nulls_del_node_init_rcu(sk);
+ spin_unlock(lock);
+
+ sk->sk_hash = 0;
+ inet_sk(sk)->inet_sport = 0;
+ inet_sk(sk)->inet_num = 0;
+
+ if (tw)
+ inet_twsk_bind_unhash(tw, hinfo);
+ }
+
+ spin_unlock(&head2->lock);
+ if (tb_created)
+ inet_bind_bucket_destroy(tb);
+ spin_unlock(&head->lock);
+
+ if (tw)
+ inet_twsk_deschedule_put(tw);
+
+ local_bh_enable();
+
+ return -ENOMEM;
+}
+
+/*
+ * Bind a port for a connect operation and hash it.
+ */
+int inet_hash_connect(struct inet_timewait_death_row *death_row,
+ struct sock *sk)
+{
+ const struct inet_sock *inet = inet_sk(sk);
+ const struct net *net = sock_net(sk);
+ u64 port_offset = 0;
+ u32 hash_port0;
+
+ if (!inet_sk(sk)->inet_num)
+ port_offset = inet_sk_port_offset(sk);
+
+ hash_port0 = inet_ehashfn(net, inet->inet_rcv_saddr, 0,
+ inet->inet_daddr, inet->inet_dport);
+
+ return __inet_hash_connect(death_row, sk, port_offset, hash_port0,
+ __inet_check_established);
+}
+
+static void init_hashinfo_lhash2(struct inet_hashinfo *h)
+{
+ int i;
+
+ for (i = 0; i <= h->lhash2_mask; i++) {
+ spin_lock_init(&h->lhash2[i].lock);
+ INIT_HLIST_NULLS_HEAD(&h->lhash2[i].nulls_head,
+ i + LISTENING_NULLS_BASE);
+ }
+}
+
+void __init inet_hashinfo2_init(struct inet_hashinfo *h, const char *name,
+ unsigned long numentries, int scale,
+ unsigned long low_limit,
+ unsigned long high_limit)
+{
+ h->lhash2 = alloc_large_system_hash(name,
+ sizeof(*h->lhash2),
+ numentries,
+ scale,
+ 0,
+ NULL,
+ &h->lhash2_mask,
+ low_limit,
+ high_limit);
+ init_hashinfo_lhash2(h);
+
+ /* this one is used for source ports of outgoing connections */
+ table_perturb = alloc_large_system_hash("Table-perturb",
+ sizeof(*table_perturb),
+ INET_TABLE_PERTURB_SIZE,
+ 0, 0, NULL, NULL,
+ INET_TABLE_PERTURB_SIZE,
+ INET_TABLE_PERTURB_SIZE);
+}
+
+int inet_hashinfo2_init_mod(struct inet_hashinfo *h)
+{
+ h->lhash2 = kmalloc_array(INET_LHTABLE_SIZE, sizeof(*h->lhash2), GFP_KERNEL);
+ if (!h->lhash2)
+ return -ENOMEM;
+
+ h->lhash2_mask = INET_LHTABLE_SIZE - 1;
+ /* INET_LHTABLE_SIZE must be a power of 2 */
+ BUG_ON(INET_LHTABLE_SIZE & h->lhash2_mask);
+
+ init_hashinfo_lhash2(h);
+ return 0;
+}
+
+int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo)
+{
+ unsigned int locksz = sizeof(spinlock_t);
+ unsigned int i, nblocks = 1;
+ spinlock_t *ptr = NULL;
+
+ if (locksz == 0)
+ goto set_mask;
+
+ /* Allocate 2 cache lines or at least one spinlock per cpu. */
+ nblocks = max(2U * L1_CACHE_BYTES / locksz, 1U) * num_possible_cpus();
+
+ /* At least one page per NUMA node. */
+ nblocks = max(nblocks, num_online_nodes() * PAGE_SIZE / locksz);
+
+ nblocks = roundup_pow_of_two(nblocks);
+
+ /* No more locks than number of hash buckets. */
+ nblocks = min(nblocks, hashinfo->ehash_mask + 1);
+
+ if (num_online_nodes() > 1) {
+ /* Use vmalloc() to allow NUMA policy to spread pages
+ * on all available nodes if desired.
+ */
+ ptr = vmalloc_array(nblocks, locksz);
+ }
+ if (!ptr) {
+ ptr = kvmalloc_array(nblocks, locksz, GFP_KERNEL);
+ if (!ptr)
+ return -ENOMEM;
+ }
+ for (i = 0; i < nblocks; i++)
+ spin_lock_init(&ptr[i]);
+ hashinfo->ehash_locks = ptr;
+set_mask:
+ hashinfo->ehash_locks_mask = nblocks - 1;
+ return 0;
+}
+
+struct inet_hashinfo *inet_pernet_hashinfo_alloc(struct inet_hashinfo *hashinfo,
+ unsigned int ehash_entries)
+{
+ struct inet_hashinfo *new_hashinfo;
+ int i;
+
+ new_hashinfo = kmemdup(hashinfo, sizeof(*hashinfo), GFP_KERNEL);
+ if (!new_hashinfo)
+ goto err;
+
+ new_hashinfo->ehash = vmalloc_huge(ehash_entries * sizeof(struct inet_ehash_bucket),
+ GFP_KERNEL_ACCOUNT);
+ if (!new_hashinfo->ehash)
+ goto free_hashinfo;
+
+ new_hashinfo->ehash_mask = ehash_entries - 1;
+
+ if (inet_ehash_locks_alloc(new_hashinfo))
+ goto free_ehash;
+
+ for (i = 0; i < ehash_entries; i++)
+ INIT_HLIST_NULLS_HEAD(&new_hashinfo->ehash[i].chain, i);
+
+ new_hashinfo->pernet = true;
+
+ return new_hashinfo;
+
+free_ehash:
+ vfree(new_hashinfo->ehash);
+free_hashinfo:
+ kfree(new_hashinfo);
+err:
+ return NULL;
}
-EXPORT_SYMBOL_GPL(inet_hash_connect);
+void inet_pernet_hashinfo_free(struct inet_hashinfo *hashinfo)
+{
+ if (!hashinfo->pernet)
+ return;
+
+ inet_ehash_locks_free(hashinfo);
+ vfree(hashinfo->ehash);
+ kfree(hashinfo);
+}
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index cdd805344c61..d4c781a0667f 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -8,252 +9,261 @@
* From code orinally in TCP
*/
-
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/module.h>
#include <net/inet_hashtables.h>
#include <net/inet_timewait_sock.h>
#include <net/ip.h>
+#include <net/tcp.h>
+#include <net/psp.h>
-/* Must be called with locally disabled BHs. */
-void __inet_twsk_kill(struct inet_timewait_sock *tw, struct inet_hashinfo *hashinfo)
+/**
+ * inet_twsk_bind_unhash - unhash a timewait socket from bind hash
+ * @tw: timewait socket
+ * @hashinfo: hashinfo pointer
+ *
+ * unhash a timewait socket from bind hash, if hashed.
+ * bind hash lock must be held by caller.
+ * Returns 1 if caller should call inet_twsk_put() after lock release.
+ */
+void inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
+ struct inet_hashinfo *hashinfo)
{
- struct inet_bind_hashbucket *bhead;
- struct inet_bind_bucket *tb;
- /* Unlink from established hashes. */
- struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, tw->tw_hash);
-
- write_lock(&ehead->lock);
- if (hlist_unhashed(&tw->tw_node)) {
- write_unlock(&ehead->lock);
+ struct inet_bind2_bucket *tb2 = tw->tw_tb2;
+ struct inet_bind_bucket *tb = tw->tw_tb;
+
+ if (!tb)
return;
- }
- __hlist_del(&tw->tw_node);
- sk_node_init(&tw->tw_node);
- write_unlock(&ehead->lock);
+
+ __sk_del_bind_node((struct sock *)tw);
+ tw->tw_tb = NULL;
+ tw->tw_tb2 = NULL;
+ inet_bind2_bucket_destroy(hashinfo->bind2_bucket_cachep, tb2);
+ inet_bind_bucket_destroy(tb);
+
+ __sock_put((struct sock *)tw);
+}
+
+/* Must be called with locally disabled BHs. */
+static void inet_twsk_kill(struct inet_timewait_sock *tw)
+{
+ struct inet_hashinfo *hashinfo = tw->tw_dr->hashinfo;
+ spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);
+ struct inet_bind_hashbucket *bhead, *bhead2;
+
+ spin_lock(lock);
+ sk_nulls_del_node_init_rcu((struct sock *)tw);
+ spin_unlock(lock);
/* Disassociate with bind bucket. */
- bhead = &hashinfo->bhash[inet_bhashfn(tw->tw_num, hashinfo->bhash_size)];
+ bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), tw->tw_num,
+ hashinfo->bhash_size)];
+ bhead2 = inet_bhashfn_portaddr(hashinfo, (struct sock *)tw,
+ twsk_net(tw), tw->tw_num);
+
spin_lock(&bhead->lock);
- tb = tw->tw_tb;
- __hlist_del(&tw->tw_bind_node);
- tw->tw_tb = NULL;
- inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
+ spin_lock(&bhead2->lock);
+ inet_twsk_bind_unhash(tw, hashinfo);
+ spin_unlock(&bhead2->lock);
spin_unlock(&bhead->lock);
-#ifdef SOCK_REFCNT_DEBUG
- if (atomic_read(&tw->tw_refcnt) != 1) {
- printk(KERN_DEBUG "%s timewait_sock %p refcnt=%d\n",
- tw->tw_prot->name, tw, atomic_read(&tw->tw_refcnt));
- }
-#endif
+
+ refcount_dec(&tw->tw_dr->tw_refcount);
inet_twsk_put(tw);
}
-EXPORT_SYMBOL_GPL(__inet_twsk_kill);
+void inet_twsk_free(struct inet_timewait_sock *tw)
+{
+ struct module *owner = tw->tw_prot->owner;
+
+ tcp_twsk_destructor((struct sock *)tw);
+ kmem_cache_free(tw->tw_prot->twsk_prot->twsk_slab, tw);
+ module_put(owner);
+}
+
+void inet_twsk_put(struct inet_timewait_sock *tw)
+{
+ if (refcount_dec_and_test(&tw->tw_refcnt))
+ inet_twsk_free(tw);
+}
+EXPORT_SYMBOL_GPL(inet_twsk_put);
+
+static void inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo)
+{
+ __inet_twsk_schedule(tw, timeo, false);
+}
/*
- * Enter the time wait state. This is called with locally disabled BH.
+ * Enter the time wait state.
* Essentially we whip up a timewait bucket, copy the relevant info into it
* from the SK, and mess with hash chains and list linkage.
+ *
+ * The caller must not access @tw anymore after this function returns.
*/
-void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
- struct inet_hashinfo *hashinfo)
+void inet_twsk_hashdance_schedule(struct inet_timewait_sock *tw,
+ struct sock *sk,
+ struct inet_hashinfo *hashinfo,
+ int timeo)
{
const struct inet_sock *inet = inet_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk);
- struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, sk->sk_hash);
- struct inet_bind_hashbucket *bhead;
- /* Step 1: Put TW into bind hash. Original socket stays there too.
- Note, that any socket with inet->num != 0 MUST be bound in
- binding cache, even if it is closed.
+ spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
+ struct inet_bind_hashbucket *bhead, *bhead2;
+
+ /* Put TW into bind hash. Original socket stays there too.
+ * Note, that any socket with inet->num != 0 MUST be bound in
+ * binding cache, even if it is closed.
*/
- bhead = &hashinfo->bhash[inet_bhashfn(inet->num, hashinfo->bhash_size)];
+ bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->inet_num,
+ hashinfo->bhash_size)];
+ bhead2 = inet_bhashfn_portaddr(hashinfo, sk, twsk_net(tw), inet->inet_num);
+
+ local_bh_disable();
spin_lock(&bhead->lock);
+ spin_lock(&bhead2->lock);
+
tw->tw_tb = icsk->icsk_bind_hash;
- BUG_TRAP(icsk->icsk_bind_hash);
- inet_twsk_add_bind_node(tw, &tw->tw_tb->owners);
+ WARN_ON(!icsk->icsk_bind_hash);
+
+ tw->tw_tb2 = icsk->icsk_bind2_hash;
+ WARN_ON(!icsk->icsk_bind2_hash);
+ sk_add_bind_node((struct sock *)tw, &tw->tw_tb2->owners);
+
+ spin_unlock(&bhead2->lock);
spin_unlock(&bhead->lock);
- write_lock(&ehead->lock);
+ spin_lock(lock);
- /* Step 2: Remove SK from established hash. */
- if (__sk_del_node_init(sk))
- sock_prot_dec_use(sk->sk_prot);
+ /* tw_refcnt is set to 3 because we have :
+ * - one reference for bhash chain.
+ * - one reference for ehash chain.
+ * - one reference for timer.
+ * Also note that after this point, we lost our implicit reference
+ * so we are not allowed to use tw anymore.
+ */
+ refcount_set(&tw->tw_refcnt, 3);
+
+ /* Ensure tw_refcnt has been set before tw is published.
+ * smp_wmb() provides the necessary memory barrier to enforce this
+ * ordering.
+ */
+ smp_wmb();
+
+ hlist_nulls_replace_init_rcu(&sk->sk_nulls_node, &tw->tw_node);
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
- /* Step 3: Hash TW into TIMEWAIT half of established hash table. */
- inet_twsk_add_node(tw, &(ehead + hashinfo->ehash_size)->chain);
- atomic_inc(&tw->tw_refcnt);
+ inet_twsk_schedule(tw, timeo);
- write_unlock(&ehead->lock);
+ spin_unlock(lock);
+ local_bh_enable();
}
-EXPORT_SYMBOL_GPL(__inet_twsk_hashdance);
+static void tw_timer_handler(struct timer_list *t)
+{
+ struct inet_timewait_sock *tw = timer_container_of(tw, t, tw_timer);
+
+ inet_twsk_kill(tw);
+}
-struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int state)
+struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk,
+ struct inet_timewait_death_row *dr,
+ const int state)
{
- struct inet_timewait_sock *tw =
- kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab,
- SLAB_ATOMIC);
- if (tw != NULL) {
+ struct inet_timewait_sock *tw;
+
+ if (refcount_read(&dr->tw_refcount) - 1 >=
+ READ_ONCE(dr->sysctl_max_tw_buckets))
+ return NULL;
+
+ tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab,
+ GFP_ATOMIC);
+ if (tw) {
const struct inet_sock *inet = inet_sk(sk);
+ tw->tw_dr = dr;
/* Give us an identity. */
- tw->tw_daddr = inet->daddr;
- tw->tw_rcv_saddr = inet->rcv_saddr;
+ tw->tw_daddr = inet->inet_daddr;
+ tw->tw_rcv_saddr = inet->inet_rcv_saddr;
tw->tw_bound_dev_if = sk->sk_bound_dev_if;
- tw->tw_num = inet->num;
+ tw->tw_tos = inet->tos;
+ tw->tw_num = inet->inet_num;
tw->tw_state = TCP_TIME_WAIT;
tw->tw_substate = state;
- tw->tw_sport = inet->sport;
- tw->tw_dport = inet->dport;
+ tw->tw_sport = inet->inet_sport;
+ tw->tw_dport = inet->inet_dport;
tw->tw_family = sk->sk_family;
tw->tw_reuse = sk->sk_reuse;
+ tw->tw_reuseport = sk->sk_reuseport;
tw->tw_hash = sk->sk_hash;
tw->tw_ipv6only = 0;
+ tw->tw_transparent = inet_test_bit(TRANSPARENT, sk);
+ tw->tw_connect_bind = !!(sk->sk_userlocks & SOCK_CONNECT_BIND);
tw->tw_prot = sk->sk_prot_creator;
- atomic_set(&tw->tw_refcnt, 1);
- inet_twsk_dead_node_init(tw);
- __module_get(tw->tw_prot->owner);
- }
-
- return tw;
-}
-
-EXPORT_SYMBOL_GPL(inet_twsk_alloc);
-
-/* Returns non-zero if quota exceeded. */
-static int inet_twdr_do_twkill_work(struct inet_timewait_death_row *twdr,
- const int slot)
-{
- struct inet_timewait_sock *tw;
- struct hlist_node *node;
- unsigned int killed;
- int ret;
-
- /* NOTE: compare this to previous version where lock
- * was released after detaching chain. It was racy,
- * because tw buckets are scheduled in not serialized context
- * in 2.3 (with netfilter), and with softnet it is common, because
- * soft irqs are not sequenced.
- */
- killed = 0;
- ret = 0;
-rescan:
- inet_twsk_for_each_inmate(tw, node, &twdr->cells[slot]) {
- __inet_twsk_del_dead_node(tw);
- spin_unlock(&twdr->death_lock);
- __inet_twsk_kill(tw, twdr->hashinfo);
- inet_twsk_put(tw);
- killed++;
- spin_lock(&twdr->death_lock);
- if (killed > INET_TWDR_TWKILL_QUOTA) {
- ret = 1;
- break;
- }
-
- /* While we dropped twdr->death_lock, another cpu may have
- * killed off the next TW bucket in the list, therefore
- * do a fresh re-read of the hlist head node with the
- * lock reacquired. We still use the hlist traversal
- * macro in order to get the prefetches.
+ atomic64_set(&tw->tw_cookie, atomic64_read(&sk->sk_cookie));
+ twsk_net_set(tw, sock_net(sk));
+ timer_setup(&tw->tw_timer, tw_timer_handler, 0);
+#ifdef CONFIG_SOCK_VALIDATE_XMIT
+ tw->tw_validate_xmit_skb = NULL;
+#endif
+ /*
+ * Because we use RCU lookups, we should not set tw_refcnt
+ * to a non null value before everything is setup for this
+ * timewait socket.
*/
- goto rescan;
- }
-
- twdr->tw_count -= killed;
- NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITED, killed);
-
- return ret;
-}
+ refcount_set(&tw->tw_refcnt, 0);
-void inet_twdr_hangman(unsigned long data)
-{
- struct inet_timewait_death_row *twdr;
- int unsigned need_timer;
-
- twdr = (struct inet_timewait_death_row *)data;
- spin_lock(&twdr->death_lock);
-
- if (twdr->tw_count == 0)
- goto out;
-
- need_timer = 0;
- if (inet_twdr_do_twkill_work(twdr, twdr->slot)) {
- twdr->thread_slots |= (1 << twdr->slot);
- mb();
- schedule_work(&twdr->twkill_work);
- need_timer = 1;
- } else {
- /* We purged the entire slot, anything left? */
- if (twdr->tw_count)
- need_timer = 1;
+ __module_get(tw->tw_prot->owner);
+ psp_twsk_init(tw, sk);
}
- twdr->slot = ((twdr->slot + 1) & (INET_TWDR_TWKILL_SLOTS - 1));
- if (need_timer)
- mod_timer(&twdr->tw_timer, jiffies + twdr->period);
-out:
- spin_unlock(&twdr->death_lock);
-}
-EXPORT_SYMBOL_GPL(inet_twdr_hangman);
-
-extern void twkill_slots_invalid(void);
-
-void inet_twdr_twkill_work(void *data)
-{
- struct inet_timewait_death_row *twdr = data;
- int i;
-
- if ((INET_TWDR_TWKILL_SLOTS - 1) > (sizeof(twdr->thread_slots) * 8))
- twkill_slots_invalid();
-
- while (twdr->thread_slots) {
- spin_lock_bh(&twdr->death_lock);
- for (i = 0; i < INET_TWDR_TWKILL_SLOTS; i++) {
- if (!(twdr->thread_slots & (1 << i)))
- continue;
-
- while (inet_twdr_do_twkill_work(twdr, i) != 0) {
- if (need_resched()) {
- spin_unlock_bh(&twdr->death_lock);
- schedule();
- spin_lock_bh(&twdr->death_lock);
- }
- }
-
- twdr->thread_slots &= ~(1 << i);
- }
- spin_unlock_bh(&twdr->death_lock);
- }
+ return tw;
}
-EXPORT_SYMBOL_GPL(inet_twdr_twkill_work);
-
/* These are always called from BH context. See callers in
* tcp_input.c to verify this.
*/
-/* This is for handling early-kills of TIME_WAIT sockets. */
-void inet_twsk_deschedule(struct inet_timewait_sock *tw,
- struct inet_timewait_death_row *twdr)
+/* This is for handling early-kills of TIME_WAIT sockets.
+ * Warning : consume reference.
+ * Caller should not access tw anymore.
+ */
+void inet_twsk_deschedule_put(struct inet_timewait_sock *tw)
{
- spin_lock(&twdr->death_lock);
- if (inet_twsk_del_dead_node(tw)) {
- inet_twsk_put(tw);
- if (--twdr->tw_count == 0)
- del_timer(&twdr->tw_timer);
- }
- spin_unlock(&twdr->death_lock);
- __inet_twsk_kill(tw, twdr->hashinfo);
-}
+ struct inet_hashinfo *hashinfo = tw->tw_dr->hashinfo;
+ spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);
+
+ /* inet_twsk_purge() walks over all sockets, including tw ones,
+ * and removes them via inet_twsk_deschedule_put() after a
+ * refcount_inc_not_zero().
+ *
+ * inet_twsk_hashdance_schedule() must (re)init the refcount before
+ * arming the timer, i.e. inet_twsk_purge can obtain a reference to
+ * a twsk that did not yet schedule the timer.
+ *
+ * The ehash lock synchronizes these two:
+ * After acquiring the lock, the timer is always scheduled (else
+ * timer_shutdown returns false), because hashdance_schedule releases
+ * the ehash lock only after completing the timer initialization.
+ *
+ * Without grabbing the ehash lock, we get:
+ * 1) cpu x sets twsk refcount to 3
+ * 2) cpu y bumps refcount to 4
+ * 3) cpu y calls inet_twsk_deschedule_put() and shuts timer down
+ * 4) cpu x tries to start timer, but mod_timer is a noop post-shutdown
+ * -> timer refcount is never decremented.
+ */
+ spin_lock(lock);
+ /* Makes sure hashdance_schedule() has completed */
+ spin_unlock(lock);
-EXPORT_SYMBOL(inet_twsk_deschedule);
+ if (timer_shutdown_sync(&tw->tw_timer))
+ inet_twsk_kill(tw);
+ inet_twsk_put(tw);
+}
+EXPORT_SYMBOL(inet_twsk_deschedule_put);
-void inet_twsk_schedule(struct inet_timewait_sock *tw,
- struct inet_timewait_death_row *twdr,
- const int timeo, const int timewait_len)
+void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm)
{
- struct hlist_head *list;
- int slot;
-
/* timeout := RTO * 3.5
*
* 3.5 = 1+2+0.5 to wait for two retransmits.
@@ -278,108 +288,73 @@ void inet_twsk_schedule(struct inet_timewait_sock *tw,
* is greater than TS tick!) and detect old duplicates with help
* of PAWS.
*/
- slot = (timeo + (1 << INET_TWDR_RECYCLE_TICK) - 1) >> INET_TWDR_RECYCLE_TICK;
-
- spin_lock(&twdr->death_lock);
-
- /* Unlink it, if it was scheduled */
- if (inet_twsk_del_dead_node(tw))
- twdr->tw_count--;
- else
- atomic_inc(&tw->tw_refcnt);
-
- if (slot >= INET_TWDR_RECYCLE_SLOTS) {
- /* Schedule to slow timer */
- if (timeo >= timewait_len) {
- slot = INET_TWDR_TWKILL_SLOTS - 1;
- } else {
- slot = (timeo + twdr->period - 1) / twdr->period;
- if (slot >= INET_TWDR_TWKILL_SLOTS)
- slot = INET_TWDR_TWKILL_SLOTS - 1;
- }
- tw->tw_ttd = jiffies + timeo;
- slot = (twdr->slot + slot) & (INET_TWDR_TWKILL_SLOTS - 1);
- list = &twdr->cells[slot];
+
+ if (!rearm) {
+ bool kill = timeo <= 4*HZ;
+
+ __NET_INC_STATS(twsk_net(tw), kill ? LINUX_MIB_TIMEWAITKILLED :
+ LINUX_MIB_TIMEWAITED);
+ BUG_ON(mod_timer(&tw->tw_timer, jiffies + timeo));
+ refcount_inc(&tw->tw_dr->tw_refcount);
} else {
- tw->tw_ttd = jiffies + (slot << INET_TWDR_RECYCLE_TICK);
-
- if (twdr->twcal_hand < 0) {
- twdr->twcal_hand = 0;
- twdr->twcal_jiffie = jiffies;
- twdr->twcal_timer.expires = twdr->twcal_jiffie +
- (slot << INET_TWDR_RECYCLE_TICK);
- add_timer(&twdr->twcal_timer);
- } else {
- if (time_after(twdr->twcal_timer.expires,
- jiffies + (slot << INET_TWDR_RECYCLE_TICK)))
- mod_timer(&twdr->twcal_timer,
- jiffies + (slot << INET_TWDR_RECYCLE_TICK));
- slot = (twdr->twcal_hand + slot) & (INET_TWDR_RECYCLE_SLOTS - 1);
- }
- list = &twdr->twcal_row[slot];
+ mod_timer_pending(&tw->tw_timer, jiffies + timeo);
}
+}
- hlist_add_head(&tw->tw_death_node, list);
+/* Remove all non full sockets (TIME_WAIT and NEW_SYN_RECV) for dead netns */
+void inet_twsk_purge(struct inet_hashinfo *hashinfo)
+{
+ struct inet_ehash_bucket *head = &hashinfo->ehash[0];
+ unsigned int ehash_mask = hashinfo->ehash_mask;
+ struct hlist_nulls_node *node;
+ unsigned int slot;
+ struct sock *sk;
+
+ for (slot = 0; slot <= ehash_mask; slot++, head++) {
+ if (hlist_nulls_empty(&head->chain))
+ continue;
+
+restart_rcu:
+ cond_resched();
+ rcu_read_lock();
+restart:
+ sk_nulls_for_each_rcu(sk, node, &head->chain) {
+ int state = inet_sk_state_load(sk);
+
+ if ((1 << state) & ~(TCPF_TIME_WAIT |
+ TCPF_NEW_SYN_RECV))
+ continue;
- if (twdr->tw_count++ == 0)
- mod_timer(&twdr->tw_timer, jiffies + twdr->period);
- spin_unlock(&twdr->death_lock);
-}
+ if (check_net(sock_net(sk)))
+ continue;
-EXPORT_SYMBOL_GPL(inet_twsk_schedule);
+ if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt)))
+ continue;
-void inet_twdr_twcal_tick(unsigned long data)
-{
- struct inet_timewait_death_row *twdr;
- int n, slot;
- unsigned long j;
- unsigned long now = jiffies;
- int killed = 0;
- int adv = 0;
-
- twdr = (struct inet_timewait_death_row *)data;
-
- spin_lock(&twdr->death_lock);
- if (twdr->twcal_hand < 0)
- goto out;
-
- slot = twdr->twcal_hand;
- j = twdr->twcal_jiffie;
-
- for (n = 0; n < INET_TWDR_RECYCLE_SLOTS; n++) {
- if (time_before_eq(j, now)) {
- struct hlist_node *node, *safe;
- struct inet_timewait_sock *tw;
-
- inet_twsk_for_each_inmate_safe(tw, node, safe,
- &twdr->twcal_row[slot]) {
- __inet_twsk_del_dead_node(tw);
- __inet_twsk_kill(tw, twdr->hashinfo);
- inet_twsk_put(tw);
- killed++;
- }
- } else {
- if (!adv) {
- adv = 1;
- twdr->twcal_jiffie = j;
- twdr->twcal_hand = slot;
+ if (check_net(sock_net(sk))) {
+ sock_gen_put(sk);
+ goto restart;
}
- if (!hlist_empty(&twdr->twcal_row[slot])) {
- mod_timer(&twdr->twcal_timer, j);
- goto out;
+ rcu_read_unlock();
+ local_bh_disable();
+ if (state == TCP_TIME_WAIT) {
+ inet_twsk_deschedule_put(inet_twsk(sk));
+ } else {
+ struct request_sock *req = inet_reqsk(sk);
+
+ inet_csk_reqsk_queue_drop_and_put(req->rsk_listener,
+ req);
}
+ local_bh_enable();
+ goto restart_rcu;
}
- j += 1 << INET_TWDR_RECYCLE_TICK;
- slot = (slot + 1) & (INET_TWDR_RECYCLE_SLOTS - 1);
+ /* If the nulls value we got at the end of this lookup is
+ * not the expected one, we must restart lookup.
+ * We probably met an item that was moved to another chain.
+ */
+ if (get_nulls_value(node) != slot)
+ goto restart;
+ rcu_read_unlock();
}
- twdr->twcal_hand = -1;
-
-out:
- if ((twdr->tw_count -= killed) == 0)
- del_timer(&twdr->tw_timer);
- NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITKILLED, killed);
- spin_unlock(&twdr->death_lock);
}
-
-EXPORT_SYMBOL_GPL(inet_twdr_twcal_tick);
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index f072f3875af8..7b1e0a2d6906 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -3,470 +3,287 @@
*
* This source is covered by the GNU GPL, the same as all kernel sources.
*
- * Version: $Id: inetpeer.c,v 1.7 2001/09/20 21:22:50 davem Exp $
- *
* Authors: Andrey V. Savochkin <saw@msu.ru>
*/
+#include <linux/cache.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/random.h>
-#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/time.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/net.h>
+#include <linux/workqueue.h>
#include <net/ip.h>
#include <net/inetpeer.h>
+#include <net/secure_seq.h>
/*
* Theory of operations.
* We keep one entry for each peer IP address. The nodes contains long-living
* information about the peer which doesn't depend on routes.
- * At this moment this information consists only of ID field for the next
- * outgoing IP packet. This field is incremented with each packet as encoded
- * in inet_getid() function (include/net/inetpeer.h).
- * At the moment of writing this notes identifier of IP packets is generated
- * to be unpredictable using this code only for packets subjected
- * (actually or potentially) to defragmentation. I.e. DF packets less than
- * PMTU in size uses a constant ID and do not use this code (see
- * ip_select_ident() in include/net/ip.h).
*
- * Route cache entries hold references to our nodes.
- * New cache entries get references via lookup by destination IP address in
- * the avl tree. The reference is grabbed only when it's needed i.e. only
- * when we try to output IP packet which needs an unpredictable ID (see
- * __ip_select_ident() in net/ipv4/route.c).
* Nodes are removed only when reference counter goes to 0.
* When it's happened the node may be removed when a sufficient amount of
* time has been passed since its last use. The less-recently-used entry can
* also be removed if the pool is overloaded i.e. if the total amount of
* entries is greater-or-equal than the threshold.
*
- * Node pool is organised as an AVL tree.
+ * Node pool is organised as an RB tree.
* Such an implementation has been chosen not just for fun. It's a way to
* prevent easy and efficient DoS attacks by creating hash collisions. A huge
* amount of long living nodes in a single hash slot would significantly delay
* lookups performed with disabled BHs.
*
* Serialisation issues.
- * 1. Nodes may appear in the tree only with the pool write lock held.
- * 2. Nodes may disappear from the tree only with the pool write lock held
+ * 1. Nodes may appear in the tree only with the pool lock held.
+ * 2. Nodes may disappear from the tree only with the pool lock held
* AND reference count being 0.
- * 3. Nodes appears and disappears from unused node list only under
- * "inet_peer_unused_lock".
- * 4. Global variable peer_total is modified under the pool lock.
- * 5. struct inet_peer fields modification:
- * avl_left, avl_right, avl_parent, avl_height: pool lock
- * unused_next, unused_prevp: unused node list lock
+ * 3. Global variable peer_total is modified under the pool lock.
+ * 4. struct inet_peer fields modification:
+ * rb_node: pool lock
* refcnt: atomically against modifications on other CPU;
* usually under some other lock to prevent node disappearing
- * dtime: unused node list lock
- * v4daddr: unchangeable
- * ip_id_count: idlock
+ * daddr: unchangeable
*/
-/* Exported for inet_getid inline function. */
-DEFINE_SPINLOCK(inet_peer_idlock);
+static struct kmem_cache *peer_cachep __ro_after_init;
-static kmem_cache_t *peer_cachep __read_mostly;
+void inet_peer_base_init(struct inet_peer_base *bp)
+{
+ bp->rb_root = RB_ROOT;
+ seqlock_init(&bp->lock);
+ bp->total = 0;
+}
+EXPORT_IPV6_MOD_GPL(inet_peer_base_init);
-#define node_height(x) x->avl_height
-static struct inet_peer peer_fake_node = {
- .avl_left = &peer_fake_node,
- .avl_right = &peer_fake_node,
- .avl_height = 0
-};
-#define peer_avl_empty (&peer_fake_node)
-static struct inet_peer *peer_root = peer_avl_empty;
-static DEFINE_RWLOCK(peer_pool_lock);
-#define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */
+#define PEER_MAX_GC 32
-static int peer_total;
/* Exported for sysctl_net_ipv4. */
-int inet_peer_threshold = 65536 + 128; /* start to throw entries more
+int inet_peer_threshold __read_mostly; /* start to throw entries more
* aggressively at this stage */
-int inet_peer_minttl = 120 * HZ; /* TTL under high load: 120 sec */
-int inet_peer_maxttl = 10 * 60 * HZ; /* usual time to live: 10 min */
-
-static struct inet_peer *inet_peer_unused_head;
-static struct inet_peer **inet_peer_unused_tailp = &inet_peer_unused_head;
-static DEFINE_SPINLOCK(inet_peer_unused_lock);
-
-static void peer_check_expire(unsigned long dummy);
-static DEFINE_TIMER(peer_periodic_timer, peer_check_expire, 0, 0);
-
-/* Exported for sysctl_net_ipv4. */
-int inet_peer_gc_mintime = 10 * HZ,
- inet_peer_gc_maxtime = 120 * HZ;
+int inet_peer_minttl __read_mostly = 120 * HZ; /* TTL under high load: 120 sec */
+int inet_peer_maxttl __read_mostly = 10 * 60 * HZ; /* usual time to live: 10 min */
/* Called from ip_output.c:ip_init */
void __init inet_initpeers(void)
{
- struct sysinfo si;
+ u64 nr_entries;
- /* Use the straight interface to information about memory. */
- si_meminfo(&si);
- /* The values below were suggested by Alexey Kuznetsov
- * <kuznet@ms2.inr.ac.ru>. I don't have any opinion about the values
- * myself. --SAW
- */
- if (si.totalram <= (32768*1024)/PAGE_SIZE)
- inet_peer_threshold >>= 1; /* max pool size about 1MB on IA32 */
- if (si.totalram <= (16384*1024)/PAGE_SIZE)
- inet_peer_threshold >>= 1; /* about 512KB */
- if (si.totalram <= (8192*1024)/PAGE_SIZE)
- inet_peer_threshold >>= 2; /* about 128KB */
-
- peer_cachep = kmem_cache_create("inet_peer_cache",
- sizeof(struct inet_peer),
- 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
- NULL, NULL);
-
- /* All the timers, started at system startup tend
- to synchronize. Perturb it a bit.
- */
- peer_periodic_timer.expires = jiffies
- + net_random() % inet_peer_gc_maxtime
- + inet_peer_gc_maxtime;
- add_timer(&peer_periodic_timer);
-}
+ /* 1% of physical memory */
+ nr_entries = div64_ul((u64)totalram_pages() << PAGE_SHIFT,
+ 100 * L1_CACHE_ALIGN(sizeof(struct inet_peer)));
-/* Called with or without local BH being disabled. */
-static void unlink_from_unused(struct inet_peer *p)
-{
- spin_lock_bh(&inet_peer_unused_lock);
- if (p->unused_prevp != NULL) {
- /* On unused list. */
- *p->unused_prevp = p->unused_next;
- if (p->unused_next != NULL)
- p->unused_next->unused_prevp = p->unused_prevp;
- else
- inet_peer_unused_tailp = p->unused_prevp;
- p->unused_prevp = NULL; /* mark it as removed */
- }
- spin_unlock_bh(&inet_peer_unused_lock);
+ inet_peer_threshold = clamp_val(nr_entries, 4096, 65536 + 128);
+
+ peer_cachep = KMEM_CACHE(inet_peer, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
}
-/* Called with local BH disabled and the pool lock held. */
-#define lookup(daddr) \
-({ \
- struct inet_peer *u, **v; \
- stackptr = stack; \
- *stackptr++ = &peer_root; \
- for (u = peer_root; u != peer_avl_empty; ) { \
- if (daddr == u->v4daddr) \
- break; \
- if ((__force __u32)daddr < (__force __u32)u->v4daddr) \
- v = &u->avl_left; \
- else \
- v = &u->avl_right; \
- *stackptr++ = v; \
- u = *v; \
- } \
- u; \
-})
-
-/* Called with local BH disabled and the pool write lock held. */
-#define lookup_rightempty(start) \
-({ \
- struct inet_peer *u, **v; \
- *stackptr++ = &start->avl_left; \
- v = &start->avl_left; \
- for (u = *v; u->avl_right != peer_avl_empty; ) { \
- v = &u->avl_right; \
- *stackptr++ = v; \
- u = *v; \
- } \
- u; \
-})
-
-/* Called with local BH disabled and the pool write lock held.
- * Variable names are the proof of operation correctness.
- * Look into mm/map_avl.c for more detail description of the ideas. */
-static void peer_avl_rebalance(struct inet_peer **stack[],
- struct inet_peer ***stackend)
+/* Called with rcu_read_lock() or base->lock held */
+static struct inet_peer *lookup(const struct inetpeer_addr *daddr,
+ struct inet_peer_base *base,
+ unsigned int seq,
+ struct inet_peer *gc_stack[],
+ unsigned int *gc_cnt,
+ struct rb_node **parent_p,
+ struct rb_node ***pp_p)
{
- struct inet_peer **nodep, *node, *l, *r;
- int lh, rh;
-
- while (stackend > stack) {
- nodep = *--stackend;
- node = *nodep;
- l = node->avl_left;
- r = node->avl_right;
- lh = node_height(l);
- rh = node_height(r);
- if (lh > rh + 1) { /* l: RH+2 */
- struct inet_peer *ll, *lr, *lrl, *lrr;
- int lrh;
- ll = l->avl_left;
- lr = l->avl_right;
- lrh = node_height(lr);
- if (lrh <= node_height(ll)) { /* ll: RH+1 */
- node->avl_left = lr; /* lr: RH or RH+1 */
- node->avl_right = r; /* r: RH */
- node->avl_height = lrh + 1; /* RH+1 or RH+2 */
- l->avl_left = ll; /* ll: RH+1 */
- l->avl_right = node; /* node: RH+1 or RH+2 */
- l->avl_height = node->avl_height + 1;
- *nodep = l;
- } else { /* ll: RH, lr: RH+1 */
- lrl = lr->avl_left; /* lrl: RH or RH-1 */
- lrr = lr->avl_right; /* lrr: RH or RH-1 */
- node->avl_left = lrr; /* lrr: RH or RH-1 */
- node->avl_right = r; /* r: RH */
- node->avl_height = rh + 1; /* node: RH+1 */
- l->avl_left = ll; /* ll: RH */
- l->avl_right = lrl; /* lrl: RH or RH-1 */
- l->avl_height = rh + 1; /* l: RH+1 */
- lr->avl_left = l; /* l: RH+1 */
- lr->avl_right = node; /* node: RH+1 */
- lr->avl_height = rh + 2;
- *nodep = lr;
- }
- } else if (rh > lh + 1) { /* r: LH+2 */
- struct inet_peer *rr, *rl, *rlr, *rll;
- int rlh;
- rr = r->avl_right;
- rl = r->avl_left;
- rlh = node_height(rl);
- if (rlh <= node_height(rr)) { /* rr: LH+1 */
- node->avl_right = rl; /* rl: LH or LH+1 */
- node->avl_left = l; /* l: LH */
- node->avl_height = rlh + 1; /* LH+1 or LH+2 */
- r->avl_right = rr; /* rr: LH+1 */
- r->avl_left = node; /* node: LH+1 or LH+2 */
- r->avl_height = node->avl_height + 1;
- *nodep = r;
- } else { /* rr: RH, rl: RH+1 */
- rlr = rl->avl_right; /* rlr: LH or LH-1 */
- rll = rl->avl_left; /* rll: LH or LH-1 */
- node->avl_right = rll; /* rll: LH or LH-1 */
- node->avl_left = l; /* l: LH */
- node->avl_height = lh + 1; /* node: LH+1 */
- r->avl_right = rr; /* rr: LH */
- r->avl_left = rlr; /* rlr: LH or LH-1 */
- r->avl_height = lh + 1; /* r: LH+1 */
- rl->avl_right = r; /* r: LH+1 */
- rl->avl_left = node; /* node: LH+1 */
- rl->avl_height = lh + 2;
- *nodep = rl;
- }
- } else {
- node->avl_height = (lh > rh ? lh : rh) + 1;
+ struct rb_node **pp, *parent, *next;
+ struct inet_peer *p;
+ u32 now;
+
+ pp = &base->rb_root.rb_node;
+ parent = NULL;
+ while (1) {
+ int cmp;
+
+ next = rcu_dereference_raw(*pp);
+ if (!next)
+ break;
+ parent = next;
+ p = rb_entry(parent, struct inet_peer, rb_node);
+ cmp = inetpeer_addr_cmp(daddr, &p->daddr);
+ if (cmp == 0) {
+ now = jiffies;
+ if (READ_ONCE(p->dtime) != now)
+ WRITE_ONCE(p->dtime, now);
+ return p;
}
+ if (gc_stack) {
+ if (*gc_cnt < PEER_MAX_GC)
+ gc_stack[(*gc_cnt)++] = p;
+ } else if (unlikely(read_seqretry(&base->lock, seq))) {
+ break;
+ }
+ if (cmp == -1)
+ pp = &next->rb_left;
+ else
+ pp = &next->rb_right;
}
+ *parent_p = parent;
+ *pp_p = pp;
+ return NULL;
}
-/* Called with local BH disabled and the pool write lock held. */
-#define link_to_pool(n) \
-do { \
- n->avl_height = 1; \
- n->avl_left = peer_avl_empty; \
- n->avl_right = peer_avl_empty; \
- **--stackptr = n; \
- peer_avl_rebalance(stack, stackptr); \
-} while(0)
-
-/* May be called with local BH enabled. */
-static void unlink_from_pool(struct inet_peer *p)
+/* perform garbage collect on all items stacked during a lookup */
+static void inet_peer_gc(struct inet_peer_base *base,
+ struct inet_peer *gc_stack[],
+ unsigned int gc_cnt)
{
- int do_free;
-
- do_free = 0;
-
- write_lock_bh(&peer_pool_lock);
- /* Check the reference counter. It was artificially incremented by 1
- * in cleanup() function to prevent sudden disappearing. If the
- * reference count is still 1 then the node is referenced only as `p'
- * here and from the pool. So under the exclusive pool lock it's safe
- * to remove the node and free it later. */
- if (atomic_read(&p->refcnt) == 1) {
- struct inet_peer **stack[PEER_MAXDEPTH];
- struct inet_peer ***stackptr, ***delp;
- if (lookup(p->v4daddr) != p)
- BUG();
- delp = stackptr - 1; /* *delp[0] == p */
- if (p->avl_left == peer_avl_empty) {
- *delp[0] = p->avl_right;
- --stackptr;
- } else {
- /* look for a node to insert instead of p */
- struct inet_peer *t;
- t = lookup_rightempty(p);
- BUG_ON(*stackptr[-1] != t);
- **--stackptr = t->avl_left;
- /* t is removed, t->v4daddr > x->v4daddr for any
- * x in p->avl_left subtree.
- * Put t in the old place of p. */
- *delp[0] = t;
- t->avl_left = p->avl_left;
- t->avl_right = p->avl_right;
- t->avl_height = p->avl_height;
- BUG_ON(delp[1] != &p->avl_left);
- delp[1] = &t->avl_left; /* was &p->avl_left */
- }
- peer_avl_rebalance(stack, stackptr);
- peer_total--;
- do_free = 1;
- }
- write_unlock_bh(&peer_pool_lock);
+ int peer_threshold, peer_maxttl, peer_minttl;
+ struct inet_peer *p;
+ __u32 delta, ttl;
+ int i;
+
+ peer_threshold = READ_ONCE(inet_peer_threshold);
+ peer_maxttl = READ_ONCE(inet_peer_maxttl);
+ peer_minttl = READ_ONCE(inet_peer_minttl);
- if (do_free)
- kmem_cache_free(peer_cachep, p);
+ if (base->total >= peer_threshold)
+ ttl = 0; /* be aggressive */
else
- /* The node is used again. Decrease the reference counter
- * back. The loop "cleanup -> unlink_from_unused
- * -> unlink_from_pool -> putpeer -> link_to_unused
- * -> cleanup (for the same node)"
- * doesn't really exist because the entry will have a
- * recent deletion time and will not be cleaned again soon. */
- inet_putpeer(p);
-}
+ ttl = peer_maxttl - (peer_maxttl - peer_minttl) / HZ *
+ base->total / peer_threshold * HZ;
+ for (i = 0; i < gc_cnt; i++) {
+ p = gc_stack[i];
-/* May be called with local BH enabled. */
-static int cleanup_once(unsigned long ttl)
-{
- struct inet_peer *p;
+ delta = (__u32)jiffies - READ_ONCE(p->dtime);
- /* Remove the first entry from the list of unused nodes. */
- spin_lock_bh(&inet_peer_unused_lock);
- p = inet_peer_unused_head;
- if (p != NULL) {
- __u32 delta = (__u32)jiffies - p->dtime;
- if (delta < ttl) {
- /* Do not prune fresh entries. */
- spin_unlock_bh(&inet_peer_unused_lock);
- return -1;
+ if (delta < ttl || !refcount_dec_if_one(&p->refcnt))
+ gc_stack[i] = NULL;
+ }
+ for (i = 0; i < gc_cnt; i++) {
+ p = gc_stack[i];
+ if (p) {
+ rb_erase(&p->rb_node, &base->rb_root);
+ base->total--;
+ kfree_rcu(p, rcu);
}
- inet_peer_unused_head = p->unused_next;
- if (p->unused_next != NULL)
- p->unused_next->unused_prevp = p->unused_prevp;
- else
- inet_peer_unused_tailp = p->unused_prevp;
- p->unused_prevp = NULL; /* mark as not on the list */
- /* Grab an extra reference to prevent node disappearing
- * before unlink_from_pool() call. */
- atomic_inc(&p->refcnt);
}
- spin_unlock_bh(&inet_peer_unused_lock);
-
- if (p == NULL)
- /* It means that the total number of USED entries has
- * grown over inet_peer_threshold. It shouldn't really
- * happen because of entry limits in route cache. */
- return -1;
-
- unlink_from_pool(p);
- return 0;
}
-/* Called with or without local BH being disabled. */
-struct inet_peer *inet_getpeer(__be32 daddr, int create)
+/* Must be called under RCU : No refcount change is done here. */
+struct inet_peer *inet_getpeer(struct inet_peer_base *base,
+ const struct inetpeer_addr *daddr)
{
- struct inet_peer *p, *n;
- struct inet_peer **stack[PEER_MAXDEPTH], ***stackptr;
-
- /* Look up for the address quickly. */
- read_lock_bh(&peer_pool_lock);
- p = lookup(daddr);
- if (p != peer_avl_empty)
- atomic_inc(&p->refcnt);
- read_unlock_bh(&peer_pool_lock);
-
- if (p != peer_avl_empty) {
- /* The existing node has been found. */
- /* Remove the entry from unused list if it was there. */
- unlink_from_unused(p);
+ struct inet_peer *p, *gc_stack[PEER_MAX_GC];
+ struct rb_node **pp, *parent;
+ unsigned int gc_cnt, seq;
+
+ /* Attempt a lockless lookup first.
+ * Because of a concurrent writer, we might not find an existing entry.
+ */
+ seq = read_seqbegin(&base->lock);
+ p = lookup(daddr, base, seq, NULL, &gc_cnt, &parent, &pp);
+
+ if (p)
return p;
+
+ /* retry an exact lookup, taking the lock before.
+ * At least, nodes should be hot in our cache.
+ */
+ parent = NULL;
+ write_seqlock_bh(&base->lock);
+
+ gc_cnt = 0;
+ p = lookup(daddr, base, seq, gc_stack, &gc_cnt, &parent, &pp);
+ if (!p) {
+ p = kmem_cache_alloc(peer_cachep, GFP_ATOMIC);
+ if (p) {
+ p->daddr = *daddr;
+ p->dtime = (__u32)jiffies;
+ refcount_set(&p->refcnt, 1);
+ atomic_set(&p->rid, 0);
+ p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW;
+ p->rate_tokens = 0;
+ p->n_redirects = 0;
+ /* 60*HZ is arbitrary, but chosen enough high so that the first
+ * calculation of tokens is at its maximum.
+ */
+ p->rate_last = jiffies - 60*HZ;
+
+ rb_link_node(&p->rb_node, parent, pp);
+ rb_insert_color(&p->rb_node, &base->rb_root);
+ base->total++;
+ }
}
+ if (gc_cnt)
+ inet_peer_gc(base, gc_stack, gc_cnt);
+ write_sequnlock_bh(&base->lock);
- if (!create)
- return NULL;
-
- /* Allocate the space outside the locked region. */
- n = kmem_cache_alloc(peer_cachep, GFP_ATOMIC);
- if (n == NULL)
- return NULL;
- n->v4daddr = daddr;
- atomic_set(&n->refcnt, 1);
- atomic_set(&n->rid, 0);
- n->ip_id_count = secure_ip_id(daddr);
- n->tcp_ts_stamp = 0;
-
- write_lock_bh(&peer_pool_lock);
- /* Check if an entry has suddenly appeared. */
- p = lookup(daddr);
- if (p != peer_avl_empty)
- goto out_free;
-
- /* Link the node. */
- link_to_pool(n);
- n->unused_prevp = NULL; /* not on the list */
- peer_total++;
- write_unlock_bh(&peer_pool_lock);
-
- if (peer_total >= inet_peer_threshold)
- /* Remove one less-recently-used entry. */
- cleanup_once(0);
-
- return n;
-
-out_free:
- /* The appropriate node is already in the pool. */
- atomic_inc(&p->refcnt);
- write_unlock_bh(&peer_pool_lock);
- /* Remove the entry from unused list if it was there. */
- unlink_from_unused(p);
- /* Free preallocated the preallocated node. */
- kmem_cache_free(peer_cachep, n);
return p;
}
+EXPORT_IPV6_MOD_GPL(inet_getpeer);
-/* Called with local BH disabled. */
-static void peer_check_expire(unsigned long dummy)
+void inet_putpeer(struct inet_peer *p)
{
- unsigned long now = jiffies;
- int ttl;
+ if (refcount_dec_and_test(&p->refcnt))
+ kfree_rcu(p, rcu);
+}
- if (peer_total >= inet_peer_threshold)
- ttl = inet_peer_minttl;
- else
- ttl = inet_peer_maxttl
- - (inet_peer_maxttl - inet_peer_minttl) / HZ *
- peer_total / inet_peer_threshold * HZ;
- while (!cleanup_once(ttl)) {
- if (jiffies != now)
- break;
+/*
+ * Check transmit rate limitation for given message.
+ * The rate information is held in the inet_peer entries now.
+ * This function is generic and could be used for other purposes
+ * too. It uses a Token bucket filter as suggested by Alexey Kuznetsov.
+ *
+ * Note that the same inet_peer fields are modified by functions in
+ * route.c too, but these work for packet destinations while xrlim_allow
+ * works for icmp destinations. This means the rate limiting information
+ * for one "ip object" is shared - and these ICMPs are twice limited:
+ * by source and by destination.
+ *
+ * RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate
+ * SHOULD allow setting of rate limits
+ *
+ * Shared between ICMPv4 and ICMPv6.
+ */
+#define XRLIM_BURST_FACTOR 6
+bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout)
+{
+ unsigned long now, token, otoken, delta;
+ bool rc = false;
+
+ if (!peer)
+ return true;
+
+ token = otoken = READ_ONCE(peer->rate_tokens);
+ now = jiffies;
+ delta = now - READ_ONCE(peer->rate_last);
+ if (delta) {
+ WRITE_ONCE(peer->rate_last, now);
+ token += delta;
+ if (token > XRLIM_BURST_FACTOR * timeout)
+ token = XRLIM_BURST_FACTOR * timeout;
}
-
- /* Trigger the timer after inet_peer_gc_mintime .. inet_peer_gc_maxtime
- * interval depending on the total number of entries (more entries,
- * less interval). */
- if (peer_total >= inet_peer_threshold)
- peer_periodic_timer.expires = jiffies + inet_peer_gc_mintime;
- else
- peer_periodic_timer.expires = jiffies
- + inet_peer_gc_maxtime
- - (inet_peer_gc_maxtime - inet_peer_gc_mintime) / HZ *
- peer_total / inet_peer_threshold * HZ;
- add_timer(&peer_periodic_timer);
+ if (token >= timeout) {
+ token -= timeout;
+ rc = true;
+ }
+ if (token != otoken)
+ WRITE_ONCE(peer->rate_tokens, token);
+ return rc;
}
+EXPORT_IPV6_MOD(inet_peer_xrlim_allow);
-void inet_putpeer(struct inet_peer *p)
+void inetpeer_invalidate_tree(struct inet_peer_base *base)
{
- spin_lock_bh(&inet_peer_unused_lock);
- if (atomic_dec_and_test(&p->refcnt)) {
- p->unused_prevp = inet_peer_unused_tailp;
- p->unused_next = NULL;
- *inet_peer_unused_tailp = p;
- inet_peer_unused_tailp = &p->unused_next;
- p->dtime = (__u32)jiffies;
+ struct rb_node *p = rb_first(&base->rb_root);
+
+ while (p) {
+ struct inet_peer *peer = rb_entry(p, struct inet_peer, rb_node);
+
+ p = rb_next(p);
+ rb_erase(&peer->rb_node, &base->rb_root);
+ inet_putpeer(peer);
+ cond_resched();
}
- spin_unlock_bh(&inet_peer_unused_lock);
+
+ base->total = 0;
}
+EXPORT_IPV6_MOD(inetpeer_invalidate_tree);
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index a22d11d2911c..8b65f12583eb 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -1,18 +1,17 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
* interface as the means of communication with the user level.
*
* The IP forwarding functionality.
- *
- * Version: $Id: ip_forward.c,v 1.48 2000/12/13 18:31:48 davem Exp $
*
* Authors: see ip.c
*
* Fixes:
- * Many : Split from ip.c , see ip_input.c for
+ * Many : Split from ip.c , see ip_input.c for
* history.
- * Dave Gregorich : NULL ip_rt_put fix for multicast
+ * Dave Gregorich : NULL ip_rt_put fix for multicast
* routing.
* Jos Vos : Add call_out_firewall before sending,
* use output device for accounting.
@@ -23,11 +22,11 @@
#include <linux/types.h>
#include <linux/mm.h>
-#include <linux/sched.h>
#include <linux/skbuff.h>
#include <linux/ip.h>
#include <linux/icmp.h>
#include <linux/netdevice.h>
+#include <linux/slab.h>
#include <net/sock.h>
#include <net/ip.h>
#include <net/tcp.h>
@@ -41,55 +40,110 @@
#include <net/route.h>
#include <net/xfrm.h>
-static inline int ip_forward_finish(struct sk_buff *skb)
+static bool ip_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
{
- struct ip_options * opt = &(IPCB(skb)->opt);
+ if (skb->len <= mtu)
+ return false;
- IP_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
+ if (unlikely((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0))
+ return false;
+
+ /* original fragment exceeds mtu and DF is set */
+ if (unlikely(IPCB(skb)->frag_max_size > mtu))
+ return true;
+
+ if (skb->ignore_df)
+ return false;
+
+ if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
+ return false;
+
+ return true;
+}
+
+
+static int ip_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ struct ip_options *opt = &(IPCB(skb)->opt);
+
+#ifdef CONFIG_NET_SWITCHDEV
+ if (skb->offload_l3_fwd_mark) {
+ consume_skb(skb);
+ return 0;
+ }
+#endif
if (unlikely(opt->optlen))
ip_forward_options(skb);
- return dst_output(skb);
+ skb_clear_tstamp(skb);
+ return dst_output(net, sk, skb);
}
int ip_forward(struct sk_buff *skb)
{
+ u32 mtu;
struct iphdr *iph; /* Our header */
struct rtable *rt; /* Route we use */
- struct ip_options * opt = &(IPCB(skb)->opt);
+ struct ip_options *opt = &(IPCB(skb)->opt);
+ struct net *net;
+ SKB_DR(reason);
+
+ /* that should never happen */
+ if (skb->pkt_type != PACKET_HOST)
+ goto drop;
+
+ if (unlikely(skb->sk))
+ goto drop;
- if (!xfrm4_policy_check(NULL, XFRM_POLICY_FWD, skb))
+ if (skb_warn_if_lro(skb))
goto drop;
+ if (!xfrm4_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
+ SKB_DR_SET(reason, XFRM_POLICY);
+ goto drop;
+ }
+
if (IPCB(skb)->opt.router_alert && ip_call_ra_chain(skb))
return NET_RX_SUCCESS;
- if (skb->pkt_type != PACKET_HOST)
- goto drop;
+ skb_forward_csum(skb);
+ net = dev_net(skb->dev);
- skb->ip_summed = CHECKSUM_NONE;
-
/*
* According to the RFC, we must first decrease the TTL field. If
* that reaches zero, we must reply an ICMP control message telling
* that the packet's lifetime expired.
*/
- if (skb->nh.iph->ttl <= 1)
- goto too_many_hops;
+ if (ip_hdr(skb)->ttl <= 1)
+ goto too_many_hops;
- if (!xfrm4_route_forward(skb))
+ if (!xfrm4_route_forward(skb)) {
+ SKB_DR_SET(reason, XFRM_POLICY);
goto drop;
+ }
- rt = (struct rtable*)skb->dst;
+ rt = skb_rtable(skb);
- if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
+ if (opt->is_strictroute && rt->rt_uses_gateway)
goto sr_failed;
+ __IP_INC_STATS(net, IPSTATS_MIB_OUTFORWDATAGRAMS);
+
+ IPCB(skb)->flags |= IPSKB_FORWARDED;
+ mtu = ip_dst_mtu_maybe_forward(&rt->dst, true);
+ if (ip_exceeds_mtu(skb, mtu)) {
+ IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
+ htonl(mtu));
+ SKB_DR_SET(reason, PKT_TOO_BIG);
+ goto drop;
+ }
+
/* We are about to mangle packet. Copy it! */
- if (skb_cow(skb, LL_RESERVED_SPACE(rt->u.dst.dev)+rt->u.dst.header_len))
+ if (skb_cow(skb, LL_RESERVED_SPACE(rt->dst.dev)+rt->dst.header_len))
goto drop;
- iph = skb->nh.iph;
+ iph = ip_hdr(skb);
/* Decrease ttl after skb cow done */
ip_decrease_ttl(iph);
@@ -98,26 +152,30 @@ int ip_forward(struct sk_buff *skb)
* We now generate an ICMP HOST REDIRECT giving the route
* we calculated.
*/
- if (rt->rt_flags&RTCF_DOREDIRECT && !opt->srr)
+ if (IPCB(skb)->flags & IPSKB_DOREDIRECT && !opt->srr &&
+ !skb_sec_path(skb))
ip_rt_send_redirect(skb);
- skb->priority = rt_tos2priority(iph->tos);
+ if (READ_ONCE(net->ipv4.sysctl_ip_fwd_update_priority))
+ skb->priority = rt_tos2priority(iph->tos);
- return NF_HOOK(PF_INET, NF_IP_FORWARD, skb, skb->dev, rt->u.dst.dev,
+ return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD,
+ net, NULL, skb, skb->dev, rt->dst.dev,
ip_forward_finish);
sr_failed:
- /*
+ /*
* Strict routing permits no gatewaying
*/
- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_SR_FAILED, 0);
- goto drop;
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_SR_FAILED, 0);
+ goto drop;
too_many_hops:
- /* Tell the sender its packet died... */
- IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS);
- icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);
+ /* Tell the sender its packet died... */
+ __IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS);
+ icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);
+ SKB_DR_SET(reason, IP_INHDR);
drop:
- kfree_skb(skb);
+ kfree_skb_reason(skb, reason);
return NET_RX_DROP;
}
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 74046efdf875..f7012479713b 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -1,14 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
* interface as the means of communication with the user level.
*
* The IP fragmentation functionality.
- *
- * Version: $Id: ip_fragment.c,v 1.59 2002/01/12 07:54:56 davem Exp $
*
* Authors: Fred N. van Kempen <waltje@uWalt.NL.Mugnet.ORG>
- * Alan Cox <Alan.Cox@linux.org>
+ * Alan Cox <alan@lxorguk.ukuu.org.uk>
*
* Fixes:
* Alan Cox : Split from ip.c , see ip_input.c for history.
@@ -22,6 +21,8 @@
* Patrick McHardy : LRU queue of frag heads for evictor.
*/
+#define pr_fmt(fmt) "IPv4: " fmt
+
#include <linux/compiler.h>
#include <linux/module.h>
#include <linux/types.h>
@@ -34,390 +35,190 @@
#include <linux/netdevice.h>
#include <linux/jhash.h>
#include <linux/random.h>
+#include <linux/slab.h>
+#include <net/route.h>
+#include <net/dst.h>
#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/checksum.h>
#include <net/inetpeer.h>
+#include <net/inet_frag.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/inet.h>
#include <linux/netfilter_ipv4.h>
+#include <net/inet_ecn.h>
+#include <net/l3mdev.h>
/* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6
* code now. If you change something here, _PLEASE_ update ipv6/reassembly.c
* as well. Or notify me, at least. --ANK
*/
-
-/* Fragment cache limits. We will commit 256K at one time. Should we
- * cross that limit we will prune down to 192K. This should cope with
- * even the most extreme cases without allowing an attacker to measurably
- * harm machine performance.
- */
-int sysctl_ipfrag_high_thresh __read_mostly = 256*1024;
-int sysctl_ipfrag_low_thresh __read_mostly = 192*1024;
-
-int sysctl_ipfrag_max_dist __read_mostly = 64;
-
-/* Important NOTE! Fragment queue must be destroyed before MSL expires.
- * RFC791 is wrong proposing to prolongate timer each fragment arrival by TTL.
- */
-int sysctl_ipfrag_time __read_mostly = IP_FRAG_TIME;
-
-struct ipfrag_skb_cb
-{
- struct inet_skb_parm h;
- int offset;
-};
-
-#define FRAG_CB(skb) ((struct ipfrag_skb_cb*)((skb)->cb))
+static const char ip_frag_cache_name[] = "ip4-frags";
/* Describe an entry in the "incomplete datagrams" queue. */
struct ipq {
- struct hlist_node list;
- struct list_head lru_list; /* lru list member */
- u32 user;
- __be32 saddr;
- __be32 daddr;
- __be16 id;
- u8 protocol;
- u8 last_in;
-#define COMPLETE 4
-#define FIRST_IN 2
-#define LAST_IN 1
-
- struct sk_buff *fragments; /* linked list of received fragments */
- int len; /* total length of original datagram */
- int meat;
- spinlock_t lock;
- atomic_t refcnt;
- struct timer_list timer; /* when will this queue expire? */
- struct timeval stamp;
+ struct inet_frag_queue q;
+
+ u8 ecn; /* RFC3168 support */
+ u16 max_df_size; /* largest frag with DF set seen */
int iif;
unsigned int rid;
struct inet_peer *peer;
};
-/* Hash table. */
-
-#define IPQ_HASHSZ 64
-
-/* Per-bucket lock is easy to add now. */
-static struct hlist_head ipq_hash[IPQ_HASHSZ];
-static DEFINE_RWLOCK(ipfrag_lock);
-static u32 ipfrag_hash_rnd;
-static LIST_HEAD(ipq_lru_list);
-int ip_frag_nqueues = 0;
-
-static __inline__ void __ipq_unlink(struct ipq *qp)
+static u8 ip4_frag_ecn(u8 tos)
{
- hlist_del(&qp->list);
- list_del(&qp->lru_list);
- ip_frag_nqueues--;
+ return 1 << (tos & INET_ECN_MASK);
}
-static __inline__ void ipq_unlink(struct ipq *ipq)
-{
- write_lock(&ipfrag_lock);
- __ipq_unlink(ipq);
- write_unlock(&ipfrag_lock);
-}
+static struct inet_frags ip4_frags;
-static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot)
-{
- return jhash_3words((__force u32)id << 16 | prot,
- (__force u32)saddr, (__force u32)daddr,
- ipfrag_hash_rnd) & (IPQ_HASHSZ - 1);
-}
+static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
+ struct sk_buff *prev_tail, struct net_device *dev,
+ int *refs);
-static struct timer_list ipfrag_secret_timer;
-int sysctl_ipfrag_secret_interval __read_mostly = 10 * 60 * HZ;
-static void ipfrag_secret_rebuild(unsigned long dummy)
+static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
{
- unsigned long now = jiffies;
- int i;
-
- write_lock(&ipfrag_lock);
- get_random_bytes(&ipfrag_hash_rnd, sizeof(u32));
- for (i = 0; i < IPQ_HASHSZ; i++) {
- struct ipq *q;
- struct hlist_node *p, *n;
-
- hlist_for_each_entry_safe(q, p, n, &ipq_hash[i], list) {
- unsigned int hval = ipqhashfn(q->id, q->saddr,
- q->daddr, q->protocol);
-
- if (hval != i) {
- hlist_del(&q->list);
-
- /* Relink to new hash chain. */
- hlist_add_head(&q->list, &ipq_hash[hval]);
- }
- }
+ struct ipq *qp = container_of(q, struct ipq, q);
+ const struct frag_v4_compare_key *key = a;
+ struct net *net = q->fqdir->net;
+ struct inet_peer *p = NULL;
+
+ q->key.v4 = *key;
+ qp->ecn = 0;
+ if (q->fqdir->max_dist) {
+ rcu_read_lock();
+ p = inet_getpeer_v4(net->ipv4.peers, key->saddr, key->vif);
+ if (p && !refcount_inc_not_zero(&p->refcnt))
+ p = NULL;
+ rcu_read_unlock();
}
- write_unlock(&ipfrag_lock);
-
- mod_timer(&ipfrag_secret_timer, now + sysctl_ipfrag_secret_interval);
+ qp->peer = p;
}
-atomic_t ip_frag_mem = ATOMIC_INIT(0); /* Memory used for fragments */
-
-/* Memory Tracking Functions. */
-static __inline__ void frag_kfree_skb(struct sk_buff *skb, int *work)
+static void ip4_frag_free(struct inet_frag_queue *q)
{
- if (work)
- *work -= skb->truesize;
- atomic_sub(skb->truesize, &ip_frag_mem);
- kfree_skb(skb);
-}
-
-static __inline__ void frag_free_queue(struct ipq *qp, int *work)
-{
- if (work)
- *work -= sizeof(struct ipq);
- atomic_sub(sizeof(struct ipq), &ip_frag_mem);
- kfree(qp);
-}
-
-static __inline__ struct ipq *frag_alloc_queue(void)
-{
- struct ipq *qp = kmalloc(sizeof(struct ipq), GFP_ATOMIC);
-
- if(!qp)
- return NULL;
- atomic_add(sizeof(struct ipq), &ip_frag_mem);
- return qp;
-}
-
-
-/* Destruction primitives. */
-
-/* Complete destruction of ipq. */
-static void ip_frag_destroy(struct ipq *qp, int *work)
-{
- struct sk_buff *fp;
-
- BUG_TRAP(qp->last_in&COMPLETE);
- BUG_TRAP(del_timer(&qp->timer) == 0);
+ struct ipq *qp;
+ qp = container_of(q, struct ipq, q);
if (qp->peer)
inet_putpeer(qp->peer);
-
- /* Release all fragment data. */
- fp = qp->fragments;
- while (fp) {
- struct sk_buff *xp = fp->next;
-
- frag_kfree_skb(fp, work);
- fp = xp;
- }
-
- /* Finally, release the queue descriptor itself. */
- frag_free_queue(qp, work);
-}
-
-static __inline__ void ipq_put(struct ipq *ipq, int *work)
-{
- if (atomic_dec_and_test(&ipq->refcnt))
- ip_frag_destroy(ipq, work);
-}
-
-/* Kill ipq entry. It is not destroyed immediately,
- * because caller (and someone more) holds reference count.
- */
-static void ipq_kill(struct ipq *ipq)
-{
- if (del_timer(&ipq->timer))
- atomic_dec(&ipq->refcnt);
-
- if (!(ipq->last_in & COMPLETE)) {
- ipq_unlink(ipq);
- atomic_dec(&ipq->refcnt);
- ipq->last_in |= COMPLETE;
- }
}
-/* Memory limiting on fragments. Evictor trashes the oldest
- * fragment queue until we are back under the threshold.
- */
-static void ip_evictor(void)
+static bool frag_expire_skip_icmp(u32 user)
{
- struct ipq *qp;
- struct list_head *tmp;
- int work;
-
- work = atomic_read(&ip_frag_mem) - sysctl_ipfrag_low_thresh;
- if (work <= 0)
- return;
-
- while (work > 0) {
- read_lock(&ipfrag_lock);
- if (list_empty(&ipq_lru_list)) {
- read_unlock(&ipfrag_lock);
- return;
- }
- tmp = ipq_lru_list.next;
- qp = list_entry(tmp, struct ipq, lru_list);
- atomic_inc(&qp->refcnt);
- read_unlock(&ipfrag_lock);
-
- spin_lock(&qp->lock);
- if (!(qp->last_in&COMPLETE))
- ipq_kill(qp);
- spin_unlock(&qp->lock);
-
- ipq_put(qp, &work);
- IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS);
- }
+ return user == IP_DEFRAG_AF_PACKET ||
+ ip_defrag_user_in_between(user, IP_DEFRAG_CONNTRACK_IN,
+ __IP_DEFRAG_CONNTRACK_IN_END) ||
+ ip_defrag_user_in_between(user, IP_DEFRAG_CONNTRACK_BRIDGE_IN,
+ __IP_DEFRAG_CONNTRACK_BRIDGE_IN);
}
/*
* Oops, a fragment queue timed out. Kill it and send an ICMP reply.
*/
-static void ip_expire(unsigned long arg)
+static void ip_expire(struct timer_list *t)
{
- struct ipq *qp = (struct ipq *) arg;
+ enum skb_drop_reason reason = SKB_DROP_REASON_FRAG_REASM_TIMEOUT;
+ struct inet_frag_queue *frag = timer_container_of(frag, t, timer);
+ const struct iphdr *iph;
+ struct sk_buff *head = NULL;
+ struct net *net;
+ struct ipq *qp;
+ int refs = 1;
- spin_lock(&qp->lock);
+ qp = container_of(frag, struct ipq, q);
+ net = qp->q.fqdir->net;
- if (qp->last_in & COMPLETE)
- goto out;
+ rcu_read_lock();
- ipq_kill(qp);
+ /* Paired with WRITE_ONCE() in fqdir_pre_exit(). */
+ if (READ_ONCE(qp->q.fqdir->dead))
+ goto out_rcu_unlock;
- IP_INC_STATS_BH(IPSTATS_MIB_REASMTIMEOUT);
- IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS);
+ spin_lock(&qp->q.lock);
- if ((qp->last_in&FIRST_IN) && qp->fragments != NULL) {
- struct sk_buff *head = qp->fragments;
- /* Send an ICMP "Fragment Reassembly Timeout" message. */
- if ((head->dev = dev_get_by_index(qp->iif)) != NULL) {
- icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
- dev_put(head->dev);
- }
- }
-out:
- spin_unlock(&qp->lock);
- ipq_put(qp, NULL);
-}
+ if (qp->q.flags & INET_FRAG_COMPLETE)
+ goto out;
-/* Creation primitives. */
+ qp->q.flags |= INET_FRAG_DROP;
+ inet_frag_kill(&qp->q, &refs);
+ __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
+ __IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT);
-static struct ipq *ip_frag_intern(struct ipq *qp_in)
-{
- struct ipq *qp;
-#ifdef CONFIG_SMP
- struct hlist_node *n;
-#endif
- unsigned int hash;
-
- write_lock(&ipfrag_lock);
- hash = ipqhashfn(qp_in->id, qp_in->saddr, qp_in->daddr,
- qp_in->protocol);
-#ifdef CONFIG_SMP
- /* With SMP race we have to recheck hash table, because
- * such entry could be created on other cpu, while we
- * promoted read lock to write lock.
- */
- hlist_for_each_entry(qp, n, &ipq_hash[hash], list) {
- if(qp->id == qp_in->id &&
- qp->saddr == qp_in->saddr &&
- qp->daddr == qp_in->daddr &&
- qp->protocol == qp_in->protocol &&
- qp->user == qp_in->user) {
- atomic_inc(&qp->refcnt);
- write_unlock(&ipfrag_lock);
- qp_in->last_in |= COMPLETE;
- ipq_put(qp_in, NULL);
- return qp;
- }
- }
-#endif
- qp = qp_in;
-
- if (!mod_timer(&qp->timer, jiffies + sysctl_ipfrag_time))
- atomic_inc(&qp->refcnt);
-
- atomic_inc(&qp->refcnt);
- hlist_add_head(&qp->list, &ipq_hash[hash]);
- INIT_LIST_HEAD(&qp->lru_list);
- list_add_tail(&qp->lru_list, &ipq_lru_list);
- ip_frag_nqueues++;
- write_unlock(&ipfrag_lock);
- return qp;
-}
+ if (!(qp->q.flags & INET_FRAG_FIRST_IN))
+ goto out;
-/* Add an entry to the 'ipq' queue for a newly received IP datagram. */
-static struct ipq *ip_frag_create(struct iphdr *iph, u32 user)
-{
- struct ipq *qp;
+ /* sk_buff::dev and sk_buff::rbnode are unionized. So we
+ * pull the head out of the tree in order to be able to
+ * deal with head->dev.
+ */
+ head = inet_frag_pull_head(&qp->q);
+ if (!head)
+ goto out;
+ head->dev = dev_get_by_index_rcu(net, qp->iif);
+ if (!head->dev)
+ goto out;
- if ((qp = frag_alloc_queue()) == NULL)
- goto out_nomem;
- qp->protocol = iph->protocol;
- qp->last_in = 0;
- qp->id = iph->id;
- qp->saddr = iph->saddr;
- qp->daddr = iph->daddr;
- qp->user = user;
- qp->len = 0;
- qp->meat = 0;
- qp->fragments = NULL;
- qp->iif = 0;
- qp->peer = sysctl_ipfrag_max_dist ? inet_getpeer(iph->saddr, 1) : NULL;
+ /* skb has no dst, perform route lookup again */
+ iph = ip_hdr(head);
+ reason = ip_route_input_noref(head, iph->daddr, iph->saddr,
+ ip4h_dscp(iph), head->dev);
+ if (reason)
+ goto out;
- /* Initialize a timer for this entry. */
- init_timer(&qp->timer);
- qp->timer.data = (unsigned long) qp; /* pointer to queue */
- qp->timer.function = ip_expire; /* expire function */
- spin_lock_init(&qp->lock);
- atomic_set(&qp->refcnt, 1);
+ /* Only an end host needs to send an ICMP
+ * "Fragment Reassembly Timeout" message, per RFC792.
+ */
+ reason = SKB_DROP_REASON_FRAG_REASM_TIMEOUT;
+ if (frag_expire_skip_icmp(qp->q.key.v4.user) &&
+ (skb_rtable(head)->rt_type != RTN_LOCAL))
+ goto out;
- return ip_frag_intern(qp);
+ spin_unlock(&qp->q.lock);
+ icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
+ goto out_rcu_unlock;
-out_nomem:
- LIMIT_NETDEBUG(KERN_ERR "ip_frag_create: no memory left !\n");
- return NULL;
+out:
+ spin_unlock(&qp->q.lock);
+out_rcu_unlock:
+ rcu_read_unlock();
+ kfree_skb_reason(head, reason);
+ inet_frag_putn(&qp->q, refs);
}
/* Find the correct entry in the "incomplete datagrams" queue for
* this IP datagram, and create new one, if nothing is found.
*/
-static inline struct ipq *ip_find(struct iphdr *iph, u32 user)
+static struct ipq *ip_find(struct net *net, struct iphdr *iph,
+ u32 user, int vif)
{
- __be16 id = iph->id;
- __be32 saddr = iph->saddr;
- __be32 daddr = iph->daddr;
- __u8 protocol = iph->protocol;
- unsigned int hash;
- struct ipq *qp;
- struct hlist_node *n;
-
- read_lock(&ipfrag_lock);
- hash = ipqhashfn(id, saddr, daddr, protocol);
- hlist_for_each_entry(qp, n, &ipq_hash[hash], list) {
- if(qp->id == id &&
- qp->saddr == saddr &&
- qp->daddr == daddr &&
- qp->protocol == protocol &&
- qp->user == user) {
- atomic_inc(&qp->refcnt);
- read_unlock(&ipfrag_lock);
- return qp;
- }
- }
- read_unlock(&ipfrag_lock);
+ struct frag_v4_compare_key key = {
+ .saddr = iph->saddr,
+ .daddr = iph->daddr,
+ .user = user,
+ .vif = vif,
+ .id = iph->id,
+ .protocol = iph->protocol,
+ };
+ struct inet_frag_queue *q;
+
+ q = inet_frag_find(net->ipv4.fqdir, &key);
+ if (!q)
+ return NULL;
- return ip_frag_create(iph, user);
+ return container_of(q, struct ipq, q);
}
/* Is the fragment too far ahead to be part of ipq? */
-static inline int ip_frag_too_far(struct ipq *qp)
+static int ip_frag_too_far(struct ipq *qp)
{
struct inet_peer *peer = qp->peer;
- unsigned int max = sysctl_ipfrag_max_dist;
+ unsigned int max = qp->q.fqdir->max_dist;
unsigned int start, end;
int rc;
@@ -429,320 +230,528 @@ static inline int ip_frag_too_far(struct ipq *qp)
end = atomic_inc_return(&peer->rid);
qp->rid = end;
- rc = qp->fragments && (end - start) > max;
+ rc = qp->q.fragments_tail && (end - start) > max;
- if (rc) {
- IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS);
- }
+ if (rc)
+ __IP_INC_STATS(qp->q.fqdir->net, IPSTATS_MIB_REASMFAILS);
return rc;
}
static int ip_frag_reinit(struct ipq *qp)
{
- struct sk_buff *fp;
+ unsigned int sum_truesize = 0;
- if (!mod_timer(&qp->timer, jiffies + sysctl_ipfrag_time)) {
- atomic_inc(&qp->refcnt);
+ if (!mod_timer(&qp->q.timer, jiffies + qp->q.fqdir->timeout)) {
+ refcount_inc(&qp->q.refcnt);
return -ETIMEDOUT;
}
- fp = qp->fragments;
- do {
- struct sk_buff *xp = fp->next;
- frag_kfree_skb(fp, NULL);
- fp = xp;
- } while (fp);
-
- qp->last_in = 0;
- qp->len = 0;
- qp->meat = 0;
- qp->fragments = NULL;
+ sum_truesize = inet_frag_rbtree_purge(&qp->q.rb_fragments,
+ SKB_DROP_REASON_FRAG_TOO_FAR);
+ sub_frag_mem_limit(qp->q.fqdir, sum_truesize);
+
+ qp->q.flags = 0;
+ qp->q.len = 0;
+ qp->q.meat = 0;
+ qp->q.rb_fragments = RB_ROOT;
+ qp->q.fragments_tail = NULL;
+ qp->q.last_run_head = NULL;
qp->iif = 0;
+ qp->ecn = 0;
return 0;
}
/* Add new segment to existing queue. */
-static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
+static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb, int *refs)
{
- struct sk_buff *prev, *next;
- int flags, offset;
- int ihl, end;
-
- if (qp->last_in & COMPLETE)
+ struct net *net = qp->q.fqdir->net;
+ int ihl, end, flags, offset;
+ struct sk_buff *prev_tail;
+ struct net_device *dev;
+ unsigned int fragsize;
+ int err = -ENOENT;
+ SKB_DR(reason);
+ u8 ecn;
+
+ /* If reassembly is already done, @skb must be a duplicate frag. */
+ if (qp->q.flags & INET_FRAG_COMPLETE) {
+ SKB_DR_SET(reason, DUP_FRAG);
goto err;
+ }
if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) &&
- unlikely(ip_frag_too_far(qp)) && unlikely(ip_frag_reinit(qp))) {
- ipq_kill(qp);
+ unlikely(ip_frag_too_far(qp)) &&
+ unlikely(err = ip_frag_reinit(qp))) {
+ inet_frag_kill(&qp->q, refs);
goto err;
}
- offset = ntohs(skb->nh.iph->frag_off);
+ ecn = ip4_frag_ecn(ip_hdr(skb)->tos);
+ offset = ntohs(ip_hdr(skb)->frag_off);
flags = offset & ~IP_OFFSET;
offset &= IP_OFFSET;
offset <<= 3; /* offset is in 8-byte chunks */
- ihl = skb->nh.iph->ihl * 4;
+ ihl = ip_hdrlen(skb);
/* Determine the position of this fragment. */
- end = offset + skb->len - ihl;
+ end = offset + skb->len - skb_network_offset(skb) - ihl;
+ err = -EINVAL;
/* Is this the final fragment? */
if ((flags & IP_MF) == 0) {
/* If we already have some bits beyond end
- * or have different end, the segment is corrrupted.
+ * or have different end, the segment is corrupted.
*/
- if (end < qp->len ||
- ((qp->last_in & LAST_IN) && end != qp->len))
- goto err;
- qp->last_in |= LAST_IN;
- qp->len = end;
+ if (end < qp->q.len ||
+ ((qp->q.flags & INET_FRAG_LAST_IN) && end != qp->q.len))
+ goto discard_qp;
+ qp->q.flags |= INET_FRAG_LAST_IN;
+ qp->q.len = end;
} else {
if (end&7) {
end &= ~7;
if (skb->ip_summed != CHECKSUM_UNNECESSARY)
skb->ip_summed = CHECKSUM_NONE;
}
- if (end > qp->len) {
+ if (end > qp->q.len) {
/* Some bits beyond end -> corruption. */
- if (qp->last_in & LAST_IN)
- goto err;
- qp->len = end;
+ if (qp->q.flags & INET_FRAG_LAST_IN)
+ goto discard_qp;
+ qp->q.len = end;
}
}
if (end == offset)
- goto err;
+ goto discard_qp;
- if (pskb_pull(skb, ihl) == NULL)
- goto err;
- if (pskb_trim_rcsum(skb, end-offset))
- goto err;
+ err = -ENOMEM;
+ if (!pskb_pull(skb, skb_network_offset(skb) + ihl))
+ goto discard_qp;
- /* Find out which fragments are in front and at the back of us
- * in the chain of fragments so far. We must know where to put
- * this fragment, right?
- */
- prev = NULL;
- for(next = qp->fragments; next != NULL; next = next->next) {
- if (FRAG_CB(next)->offset >= offset)
- break; /* bingo! */
- prev = next;
- }
+ err = pskb_trim_rcsum(skb, end - offset);
+ if (err)
+ goto discard_qp;
- /* We found where to put this one. Check for overlap with
- * preceding fragment, and, if needed, align things so that
- * any overlaps are eliminated.
- */
- if (prev) {
- int i = (FRAG_CB(prev)->offset + prev->len) - offset;
-
- if (i > 0) {
- offset += i;
- if (end <= offset)
- goto err;
- if (!pskb_pull(skb, i))
- goto err;
- if (skb->ip_summed != CHECKSUM_UNNECESSARY)
- skb->ip_summed = CHECKSUM_NONE;
- }
- }
+ /* Note : skb->rbnode and skb->dev share the same location. */
+ dev = skb->dev;
+ /* Makes sure compiler wont do silly aliasing games */
+ barrier();
+
+ prev_tail = qp->q.fragments_tail;
+ err = inet_frag_queue_insert(&qp->q, skb, offset, end);
+ if (err)
+ goto insert_error;
+
+ if (dev)
+ qp->iif = dev->ifindex;
+
+ qp->q.stamp = skb->tstamp;
+ qp->q.tstamp_type = skb->tstamp_type;
+ qp->q.meat += skb->len;
+ qp->ecn |= ecn;
+ add_frag_mem_limit(qp->q.fqdir, skb->truesize);
+ if (offset == 0)
+ qp->q.flags |= INET_FRAG_FIRST_IN;
- while (next && FRAG_CB(next)->offset < end) {
- int i = end - FRAG_CB(next)->offset; /* overlap is 'i' bytes */
-
- if (i < next->len) {
- /* Eat head of the next overlapped fragment
- * and leave the loop. The next ones cannot overlap.
- */
- if (!pskb_pull(next, i))
- goto err;
- FRAG_CB(next)->offset += i;
- qp->meat -= i;
- if (next->ip_summed != CHECKSUM_UNNECESSARY)
- next->ip_summed = CHECKSUM_NONE;
- break;
- } else {
- struct sk_buff *free_it = next;
-
- /* Old fragmnet is completely overridden with
- * new one drop it.
- */
- next = next->next;
-
- if (prev)
- prev->next = next;
- else
- qp->fragments = next;
-
- qp->meat -= free_it->len;
- frag_kfree_skb(free_it, NULL);
- }
- }
+ fragsize = skb->len + ihl;
- FRAG_CB(skb)->offset = offset;
-
- /* Insert this fragment in the chain of fragments. */
- skb->next = next;
- if (prev)
- prev->next = skb;
- else
- qp->fragments = skb;
-
- if (skb->dev)
- qp->iif = skb->dev->ifindex;
- skb->dev = NULL;
- skb_get_timestamp(skb, &qp->stamp);
- qp->meat += skb->len;
- atomic_add(skb->truesize, &ip_frag_mem);
- if (offset == 0)
- qp->last_in |= FIRST_IN;
+ if (fragsize > qp->q.max_size)
+ qp->q.max_size = fragsize;
+
+ if (ip_hdr(skb)->frag_off & htons(IP_DF) &&
+ fragsize > qp->max_df_size)
+ qp->max_df_size = fragsize;
- write_lock(&ipfrag_lock);
- list_move_tail(&qp->lru_list, &ipq_lru_list);
- write_unlock(&ipfrag_lock);
+ if (qp->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
+ qp->q.meat == qp->q.len) {
+ unsigned long orefdst = skb->_skb_refdst;
- return;
+ skb->_skb_refdst = 0UL;
+ err = ip_frag_reasm(qp, skb, prev_tail, dev, refs);
+ skb->_skb_refdst = orefdst;
+ if (err)
+ inet_frag_kill(&qp->q, refs);
+ return err;
+ }
+
+ skb_dst_drop(skb);
+ skb_orphan(skb);
+ return -EINPROGRESS;
+insert_error:
+ if (err == IPFRAG_DUP) {
+ SKB_DR_SET(reason, DUP_FRAG);
+ err = -EINVAL;
+ goto err;
+ }
+ err = -EINVAL;
+ __IP_INC_STATS(net, IPSTATS_MIB_REASM_OVERLAPS);
+discard_qp:
+ inet_frag_kill(&qp->q, refs);
+ __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
err:
- kfree_skb(skb);
+ kfree_skb_reason(skb, reason);
+ return err;
}
+static bool ip_frag_coalesce_ok(const struct ipq *qp)
+{
+ return qp->q.key.v4.user == IP_DEFRAG_LOCAL_DELIVER;
+}
/* Build a new IP datagram from all its fragments. */
-
-static struct sk_buff *ip_frag_reasm(struct ipq *qp, struct net_device *dev)
+static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
+ struct sk_buff *prev_tail, struct net_device *dev,
+ int *refs)
{
+ struct net *net = qp->q.fqdir->net;
struct iphdr *iph;
- struct sk_buff *fp, *head = qp->fragments;
- int len;
- int ihlen;
+ void *reasm_data;
+ int len, err;
+ u8 ecn;
- ipq_kill(qp);
+ inet_frag_kill(&qp->q, refs);
- BUG_TRAP(head != NULL);
- BUG_TRAP(FRAG_CB(head)->offset == 0);
+ ecn = ip_frag_ecn_table[qp->ecn];
+ if (unlikely(ecn == 0xff)) {
+ err = -EINVAL;
+ goto out_fail;
+ }
- /* Allocate a new buffer for the datagram. */
- ihlen = head->nh.iph->ihl*4;
- len = ihlen + qp->len;
+ /* Make the one we just received the head. */
+ reasm_data = inet_frag_reasm_prepare(&qp->q, skb, prev_tail);
+ if (!reasm_data)
+ goto out_nomem;
- if(len > 65535)
+ len = ip_hdrlen(skb) + qp->q.len;
+ err = -E2BIG;
+ if (len > 65535)
goto out_oversize;
- /* Head of list must not be cloned. */
- if (skb_cloned(head) && pskb_expand_head(head, 0, 0, GFP_ATOMIC))
- goto out_nomem;
+ inet_frag_reasm_finish(&qp->q, skb, reasm_data,
+ ip_frag_coalesce_ok(qp));
- /* If the first fragment is fragmented itself, we split
- * it to two chunks: the first with data and paged part
- * and the second, holding only fragments. */
- if (skb_shinfo(head)->frag_list) {
- struct sk_buff *clone;
- int i, plen = 0;
-
- if ((clone = alloc_skb(0, GFP_ATOMIC)) == NULL)
- goto out_nomem;
- clone->next = head->next;
- head->next = clone;
- skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
- skb_shinfo(head)->frag_list = NULL;
- for (i=0; i<skb_shinfo(head)->nr_frags; i++)
- plen += skb_shinfo(head)->frags[i].size;
- clone->len = clone->data_len = head->data_len - plen;
- head->data_len -= clone->len;
- head->len -= clone->len;
- clone->csum = 0;
- clone->ip_summed = head->ip_summed;
- atomic_add(clone->truesize, &ip_frag_mem);
- }
+ skb->dev = dev;
+ IPCB(skb)->frag_max_size = max(qp->max_df_size, qp->q.max_size);
- skb_shinfo(head)->frag_list = head->next;
- skb_push(head, head->data - head->nh.raw);
- atomic_sub(head->truesize, &ip_frag_mem);
-
- for (fp=head->next; fp; fp = fp->next) {
- head->data_len += fp->len;
- head->len += fp->len;
- if (head->ip_summed != fp->ip_summed)
- head->ip_summed = CHECKSUM_NONE;
- else if (head->ip_summed == CHECKSUM_COMPLETE)
- head->csum = csum_add(head->csum, fp->csum);
- head->truesize += fp->truesize;
- atomic_sub(fp->truesize, &ip_frag_mem);
+ iph = ip_hdr(skb);
+ iph->tot_len = htons(len);
+ iph->tos |= ecn;
+
+ /* When we set IP_DF on a refragmented skb we must also force a
+ * call to ip_fragment to avoid forwarding a DF-skb of size s while
+ * original sender only sent fragments of size f (where f < s).
+ *
+ * We only set DF/IPSKB_FRAG_PMTU if such DF fragment was the largest
+ * frag seen to avoid sending tiny DF-fragments in case skb was built
+ * from one very small df-fragment and one large non-df frag.
+ */
+ if (qp->max_df_size == qp->q.max_size) {
+ IPCB(skb)->flags |= IPSKB_FRAG_PMTU;
+ iph->frag_off = htons(IP_DF);
+ } else {
+ iph->frag_off = 0;
}
- head->next = NULL;
- head->dev = dev;
- skb_set_timestamp(head, &qp->stamp);
+ ip_send_check(iph);
- iph = head->nh.iph;
- iph->frag_off = 0;
- iph->tot_len = htons(len);
- IP_INC_STATS_BH(IPSTATS_MIB_REASMOKS);
- qp->fragments = NULL;
- return head;
+ __IP_INC_STATS(net, IPSTATS_MIB_REASMOKS);
+ qp->q.rb_fragments = RB_ROOT;
+ qp->q.fragments_tail = NULL;
+ qp->q.last_run_head = NULL;
+ return 0;
out_nomem:
- LIMIT_NETDEBUG(KERN_ERR "IP: queue_glue: no memory for gluing "
- "queue %p\n", qp);
+ net_dbg_ratelimited("queue_glue: no memory for gluing queue %p\n", qp);
+ err = -ENOMEM;
goto out_fail;
out_oversize:
- if (net_ratelimit())
- printk(KERN_INFO
- "Oversized IP packet from %d.%d.%d.%d.\n",
- NIPQUAD(qp->saddr));
+ net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->q.key.v4.saddr);
out_fail:
- IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS);
- return NULL;
+ __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
+ return err;
}
/* Process an incoming IP datagram fragment. */
-struct sk_buff *ip_defrag(struct sk_buff *skb, u32 user)
+int ip_defrag(struct net *net, struct sk_buff *skb, u32 user)
{
- struct iphdr *iph = skb->nh.iph;
- struct ipq *qp;
struct net_device *dev;
-
- IP_INC_STATS_BH(IPSTATS_MIB_REASMREQDS);
+ struct ipq *qp;
+ int vif;
- /* Start by cleaning up the memory. */
- if (atomic_read(&ip_frag_mem) > sysctl_ipfrag_high_thresh)
- ip_evictor();
-
- dev = skb->dev;
+ __IP_INC_STATS(net, IPSTATS_MIB_REASMREQDS);
/* Lookup (or create) queue header */
- if ((qp = ip_find(iph, user)) != NULL) {
- struct sk_buff *ret = NULL;
-
- spin_lock(&qp->lock);
+ rcu_read_lock();
+ dev = skb->dev ? : skb_dst_dev_rcu(skb);
+ vif = l3mdev_master_ifindex_rcu(dev);
+ qp = ip_find(net, ip_hdr(skb), user, vif);
+ if (qp) {
+ int ret, refs = 0;
- ip_frag_queue(qp, skb);
+ spin_lock(&qp->q.lock);
- if (qp->last_in == (FIRST_IN|LAST_IN) &&
- qp->meat == qp->len)
- ret = ip_frag_reasm(qp, dev);
+ ret = ip_frag_queue(qp, skb, &refs);
- spin_unlock(&qp->lock);
- ipq_put(qp, NULL);
+ spin_unlock(&qp->q.lock);
+ rcu_read_unlock();
+ inet_frag_putn(&qp->q, refs);
return ret;
}
+ rcu_read_unlock();
- IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS);
+ __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
kfree_skb(skb);
- return NULL;
+ return -ENOMEM;
}
+EXPORT_SYMBOL(ip_defrag);
-void ipfrag_init(void)
+struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user)
{
- ipfrag_hash_rnd = (u32) ((num_physpages ^ (num_physpages>>7)) ^
- (jiffies ^ (jiffies >> 6)));
+ struct iphdr iph;
+ int netoff;
+ u32 len;
- init_timer(&ipfrag_secret_timer);
- ipfrag_secret_timer.function = ipfrag_secret_rebuild;
- ipfrag_secret_timer.expires = jiffies + sysctl_ipfrag_secret_interval;
- add_timer(&ipfrag_secret_timer);
+ if (skb->protocol != htons(ETH_P_IP))
+ return skb;
+
+ netoff = skb_network_offset(skb);
+
+ if (skb_copy_bits(skb, netoff, &iph, sizeof(iph)) < 0)
+ return skb;
+
+ if (iph.ihl < 5 || iph.version != 4)
+ return skb;
+
+ len = ntohs(iph.tot_len);
+ if (skb->len < netoff + len || len < (iph.ihl * 4))
+ return skb;
+
+ if (ip_is_fragment(&iph)) {
+ skb = skb_share_check(skb, GFP_ATOMIC);
+ if (skb) {
+ if (!pskb_may_pull(skb, netoff + iph.ihl * 4)) {
+ kfree_skb(skb);
+ return NULL;
+ }
+ if (pskb_trim_rcsum(skb, netoff + len)) {
+ kfree_skb(skb);
+ return NULL;
+ }
+ memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
+ if (ip_defrag(net, skb, user))
+ return NULL;
+ skb_clear_hash(skb);
+ }
+ }
+ return skb;
}
+EXPORT_SYMBOL(ip_check_defrag);
+
+#ifdef CONFIG_SYSCTL
+static int dist_min;
+
+static struct ctl_table ip4_frags_ns_ctl_table[] = {
+ {
+ .procname = "ipfrag_high_thresh",
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ },
+ {
+ .procname = "ipfrag_low_thresh",
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ },
+ {
+ .procname = "ipfrag_time",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "ipfrag_max_dist",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &dist_min,
+ },
+};
-EXPORT_SYMBOL(ip_defrag);
+/* secret interval has been deprecated */
+static int ip4_frags_secret_interval_unused;
+static struct ctl_table ip4_frags_ctl_table[] = {
+ {
+ .procname = "ipfrag_secret_interval",
+ .data = &ip4_frags_secret_interval_unused,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+};
+
+static int __net_init ip4_frags_ns_ctl_register(struct net *net)
+{
+ struct ctl_table *table;
+ struct ctl_table_header *hdr;
+
+ table = ip4_frags_ns_ctl_table;
+ if (!net_eq(net, &init_net)) {
+ table = kmemdup(table, sizeof(ip4_frags_ns_ctl_table), GFP_KERNEL);
+ if (!table)
+ goto err_alloc;
+
+ }
+ table[0].data = &net->ipv4.fqdir->high_thresh;
+ table[0].extra1 = &net->ipv4.fqdir->low_thresh;
+ table[1].data = &net->ipv4.fqdir->low_thresh;
+ table[1].extra2 = &net->ipv4.fqdir->high_thresh;
+ table[2].data = &net->ipv4.fqdir->timeout;
+ table[3].data = &net->ipv4.fqdir->max_dist;
+
+ hdr = register_net_sysctl_sz(net, "net/ipv4", table,
+ ARRAY_SIZE(ip4_frags_ns_ctl_table));
+ if (!hdr)
+ goto err_reg;
+
+ net->ipv4.frags_hdr = hdr;
+ return 0;
+
+err_reg:
+ if (!net_eq(net, &init_net))
+ kfree(table);
+err_alloc:
+ return -ENOMEM;
+}
+
+static void __net_exit ip4_frags_ns_ctl_unregister(struct net *net)
+{
+ const struct ctl_table *table;
+
+ table = net->ipv4.frags_hdr->ctl_table_arg;
+ unregister_net_sysctl_table(net->ipv4.frags_hdr);
+ kfree(table);
+}
+
+static void __init ip4_frags_ctl_register(void)
+{
+ register_net_sysctl(&init_net, "net/ipv4", ip4_frags_ctl_table);
+}
+#else
+static int ip4_frags_ns_ctl_register(struct net *net)
+{
+ return 0;
+}
+
+static void ip4_frags_ns_ctl_unregister(struct net *net)
+{
+}
+
+static void __init ip4_frags_ctl_register(void)
+{
+}
+#endif
+
+static int __net_init ipv4_frags_init_net(struct net *net)
+{
+ int res;
+
+ res = fqdir_init(&net->ipv4.fqdir, &ip4_frags, net);
+ if (res < 0)
+ return res;
+ /* Fragment cache limits.
+ *
+ * The fragment memory accounting code, (tries to) account for
+ * the real memory usage, by measuring both the size of frag
+ * queue struct (inet_frag_queue (ipv4:ipq/ipv6:frag_queue))
+ * and the SKB's truesize.
+ *
+ * A 64K fragment consumes 129736 bytes (44*2944)+200
+ * (1500 truesize == 2944, sizeof(struct ipq) == 200)
+ *
+ * We will commit 4MB at one time. Should we cross that limit
+ * we will prune down to 3MB, making room for approx 8 big 64K
+ * fragments 8x128k.
+ */
+ net->ipv4.fqdir->high_thresh = 4 * 1024 * 1024;
+ net->ipv4.fqdir->low_thresh = 3 * 1024 * 1024;
+ /*
+ * Important NOTE! Fragment queue must be destroyed before MSL expires.
+ * RFC791 is wrong proposing to prolongate timer each fragment arrival
+ * by TTL.
+ */
+ net->ipv4.fqdir->timeout = IP_FRAG_TIME;
+
+ net->ipv4.fqdir->max_dist = 64;
+
+ res = ip4_frags_ns_ctl_register(net);
+ if (res < 0)
+ fqdir_exit(net->ipv4.fqdir);
+ return res;
+}
+
+static void __net_exit ipv4_frags_pre_exit_net(struct net *net)
+{
+ fqdir_pre_exit(net->ipv4.fqdir);
+}
+
+static void __net_exit ipv4_frags_exit_net(struct net *net)
+{
+ ip4_frags_ns_ctl_unregister(net);
+ fqdir_exit(net->ipv4.fqdir);
+}
+
+static struct pernet_operations ip4_frags_ops = {
+ .init = ipv4_frags_init_net,
+ .pre_exit = ipv4_frags_pre_exit_net,
+ .exit = ipv4_frags_exit_net,
+};
+
+
+static u32 ip4_key_hashfn(const void *data, u32 len, u32 seed)
+{
+ return jhash2(data,
+ sizeof(struct frag_v4_compare_key) / sizeof(u32), seed);
+}
+
+static u32 ip4_obj_hashfn(const void *data, u32 len, u32 seed)
+{
+ const struct inet_frag_queue *fq = data;
+
+ return jhash2((const u32 *)&fq->key.v4,
+ sizeof(struct frag_v4_compare_key) / sizeof(u32), seed);
+}
+
+static int ip4_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr)
+{
+ const struct frag_v4_compare_key *key = arg->key;
+ const struct inet_frag_queue *fq = ptr;
+
+ return !!memcmp(&fq->key, key, sizeof(*key));
+}
+
+static const struct rhashtable_params ip4_rhash_params = {
+ .head_offset = offsetof(struct inet_frag_queue, node),
+ .key_offset = offsetof(struct inet_frag_queue, key),
+ .key_len = sizeof(struct frag_v4_compare_key),
+ .hashfn = ip4_key_hashfn,
+ .obj_hashfn = ip4_obj_hashfn,
+ .obj_cmpfn = ip4_obj_cmpfn,
+ .automatic_shrinking = true,
+};
+
+void __init ipfrag_init(void)
+{
+ ip4_frags.constructor = ip4_frag_init;
+ ip4_frags.destructor = ip4_frag_free;
+ ip4_frags.qsize = sizeof(struct ipq);
+ ip4_frags.frag_expire = ip_expire;
+ ip4_frags.frags_cache_name = ip_frag_cache_name;
+ ip4_frags.rhash_params = ip4_rhash_params;
+ if (inet_frags_init(&ip4_frags))
+ panic("IP: failed to allocate ip4_frags cache\n");
+ ip4_frags_ctl_register();
+ register_pernet_subsys(&ip4_frags_ops);
+}
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 476cb6084c75..761a53c6a89a 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -1,51 +1,50 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * Linux NET3: GRE over IP protocol decoder.
+ * Linux NET3: GRE over IP protocol decoder.
*
* Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
-#include <linux/sched.h>
#include <linux/kernel.h>
-#include <asm/uaccess.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
-#include <linux/mroute.h>
+#include <linux/if_vlan.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
+#include <linux/etherdevice.h>
#include <linux/if_ether.h>
+#include <net/flow.h>
#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
-#include <net/ipip.h>
+#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
-
-#ifdef CONFIG_IPV6
-#include <net/ipv6.h>
-#include <net/ip6_fib.h>
-#include <net/ip6_route.h>
-#endif
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/rtnetlink.h>
+#include <net/gre.h>
+#include <net/dst_metadata.h>
+#include <net/erspan.h>
/*
Problems & solutions
@@ -59,16 +58,13 @@
We cannot track such dead loops during route installation,
it is infeasible task. The most general solutions would be
to keep skb->encapsulation counter (sort of local ttl),
- and silently drop packet when it expires. It is the best
- solution, but it supposes maintaing new variable in ALL
+ and silently drop packet when it expires. It is a good
+ solution, but it supposes maintaining new variable in ALL
skb, even if no tunneling is used.
- Current solution: t->recursion lock breaks dead loops. It looks
- like dev->tbusy flag, but I preferred new variable, because
- the semantics is different. One day, when hard_start_xmit
- will be multithreaded we will have to use skb->encapsulation.
-
-
+ Current solution: xmit_recursion breaks dead loops. This is a percpu
+ counter, since when we enter the first ndo_xmit(), cpu migration is
+ forbidden. We force an exit if this counter reaches RECURSION_LIMIT
2. Networking dead loops would not kill routers, but would really
kill network. IP hop limit plays role of "t->recursion" in this case,
@@ -89,14 +85,14 @@
One of them is to parse packet trying to detect inner encapsulation
made by our node. It is difficult or even impossible, especially,
- taking into account fragmentation. TO be short, tt is not solution at all.
+ taking into account fragmentation. TO be short, ttl is not solution at all.
Current solution: The solution was UNEXPECTEDLY SIMPLE.
We force DF flag on tunnels with preconfigured hop limit,
that is ALL. :-) Well, it does not remove the problem completely,
but exponential growth of network traffic is changed to linear
(branches, that exceed pmtu are pruned) and tunnel mtu
- fastly degrades to value <68, where looping stops.
+ rapidly degrades to value <68, where looping stops.
Yes, it is not good if there exists a router in the loop,
which does not force DF, even when encapsulating packets have DF set.
But it is not our problem! Nobody could accuse us, we made
@@ -104,936 +100,763 @@
fatal route to network, even if it were you who configured
fatal static route: you are innocent. :-)
-
-
- 3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
- practically identical code. It would be good to glue them
- together, but it is not very evident, how to make them modular.
- sit is integral part of IPv6, ipip and gre are naturally modular.
- We could extract common parts (hash table, ioctl etc)
- to a separate module (ip_tunnel.c).
-
Alexey Kuznetsov.
*/
-static int ipgre_tunnel_init(struct net_device *dev);
-static void ipgre_tunnel_setup(struct net_device *dev);
+static bool log_ecn_error = true;
+module_param(log_ecn_error, bool, 0644);
+MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
-/* Fallback tunnel: no source, no destination, no key, no options */
+static struct rtnl_link_ops ipgre_link_ops __read_mostly;
+static const struct header_ops ipgre_header_ops;
-static int ipgre_fb_tunnel_init(struct net_device *dev);
+static int ipgre_tunnel_init(struct net_device *dev);
+static void erspan_build_header(struct sk_buff *skb,
+ u32 id, u32 index,
+ bool truncate, bool is_ipv4);
-static struct net_device *ipgre_fb_tunnel_dev;
+static unsigned int ipgre_net_id __read_mostly;
+static unsigned int gre_tap_net_id __read_mostly;
+static unsigned int erspan_net_id __read_mostly;
-/* Tunnel hash table */
+static int ipgre_err(struct sk_buff *skb, u32 info,
+ const struct tnl_ptk_info *tpi)
+{
-/*
- 4 hash tables:
+ /* All the routers (except for Linux) return only
+ 8 bytes of packet payload. It means, that precise relaying of
+ ICMP in the real Internet is absolutely infeasible.
+
+ Moreover, Cisco "wise men" put GRE key to the third word
+ in GRE header. It makes impossible maintaining even soft
+ state for keyed GRE tunnels with enabled checksum. Tell
+ them "thank you".
+
+ Well, I wonder, rfc1812 was written by Cisco employee,
+ what the hell these idiots break standards established
+ by themselves???
+ */
+ struct net *net = dev_net(skb->dev);
+ struct ip_tunnel_net *itn;
+ const struct iphdr *iph;
+ const int type = icmp_hdr(skb)->type;
+ const int code = icmp_hdr(skb)->code;
+ struct ip_tunnel *t;
- 3: (remote,local)
- 2: (remote,*)
- 1: (*,local)
- 0: (*,*)
+ if (tpi->proto == htons(ETH_P_TEB))
+ itn = net_generic(net, gre_tap_net_id);
+ else if (tpi->proto == htons(ETH_P_ERSPAN) ||
+ tpi->proto == htons(ETH_P_ERSPAN2))
+ itn = net_generic(net, erspan_net_id);
+ else
+ itn = net_generic(net, ipgre_net_id);
- We require exact key match i.e. if a key is present in packet
- it will match only tunnel with the same key; if it is not present,
- it will match only keyless tunnel.
+ iph = (const struct iphdr *)(icmp_hdr(skb) + 1);
+ t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
+ iph->daddr, iph->saddr, tpi->key);
- All keysless packets, if not matched configured keyless tunnels
- will match fallback tunnel.
- */
+ if (!t)
+ return -ENOENT;
-#define HASH_SIZE 16
-#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
+ switch (type) {
+ default:
+ case ICMP_PARAMETERPROB:
+ return 0;
-static struct ip_tunnel *tunnels[4][HASH_SIZE];
+ case ICMP_DEST_UNREACH:
+ switch (code) {
+ case ICMP_SR_FAILED:
+ case ICMP_PORT_UNREACH:
+ /* Impossible event. */
+ return 0;
+ default:
+ /* All others are translated to HOST_UNREACH.
+ rfc2003 contains "deep thoughts" about NET_UNREACH,
+ I believe they are just ether pollution. --ANK
+ */
+ break;
+ }
+ break;
-#define tunnels_r_l (tunnels[3])
-#define tunnels_r (tunnels[2])
-#define tunnels_l (tunnels[1])
-#define tunnels_wc (tunnels[0])
+ case ICMP_TIME_EXCEEDED:
+ if (code != ICMP_EXC_TTL)
+ return 0;
+ break;
-static DEFINE_RWLOCK(ipgre_lock);
+ case ICMP_REDIRECT:
+ break;
+ }
-/* Given src, dst and key, find appropriate for input tunnel. */
+#if IS_ENABLED(CONFIG_IPV6)
+ if (tpi->proto == htons(ETH_P_IPV6)) {
+ unsigned int data_len = 0;
-static struct ip_tunnel * ipgre_tunnel_lookup(__be32 remote, __be32 local, __be32 key)
-{
- unsigned h0 = HASH(remote);
- unsigned h1 = HASH(key);
- struct ip_tunnel *t;
+ if (type == ICMP_TIME_EXCEEDED)
+ data_len = icmp_hdr(skb)->un.reserved[1] * 4; /* RFC 4884 4.1 */
- for (t = tunnels_r_l[h0^h1]; t; t = t->next) {
- if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) {
- if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
- return t;
- }
- }
- for (t = tunnels_r[h0^h1]; t; t = t->next) {
- if (remote == t->parms.iph.daddr) {
- if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
- return t;
- }
- }
- for (t = tunnels_l[h1]; t; t = t->next) {
- if (local == t->parms.iph.saddr ||
- (local == t->parms.iph.daddr && MULTICAST(local))) {
- if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
- return t;
- }
- }
- for (t = tunnels_wc[h1]; t; t = t->next) {
- if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
- return t;
+ if (!ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4 + tpi->hdr_len,
+ type, data_len))
+ return 0;
}
+#endif
- if (ipgre_fb_tunnel_dev->flags&IFF_UP)
- return netdev_priv(ipgre_fb_tunnel_dev);
- return NULL;
-}
+ if (t->parms.iph.daddr == 0 ||
+ ipv4_is_multicast(t->parms.iph.daddr))
+ return 0;
-static struct ip_tunnel **ipgre_bucket(struct ip_tunnel *t)
-{
- __be32 remote = t->parms.iph.daddr;
- __be32 local = t->parms.iph.saddr;
- __be32 key = t->parms.i_key;
- unsigned h = HASH(key);
- int prio = 0;
-
- if (local)
- prio |= 1;
- if (remote && !MULTICAST(remote)) {
- prio |= 2;
- h ^= HASH(remote);
- }
+ if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
+ return 0;
+
+ if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
+ t->err_count++;
+ else
+ t->err_count = 1;
+ t->err_time = jiffies;
- return &tunnels[prio][h];
+ return 0;
}
-static void ipgre_tunnel_link(struct ip_tunnel *t)
+static void gre_err(struct sk_buff *skb, u32 info)
{
- struct ip_tunnel **tp = ipgre_bucket(t);
+ /* All the routers (except for Linux) return only
+ * 8 bytes of packet payload. It means, that precise relaying of
+ * ICMP in the real Internet is absolutely infeasible.
+ *
+ * Moreover, Cisco "wise men" put GRE key to the third word
+ * in GRE header. It makes impossible maintaining even soft
+ * state for keyed
+ * GRE tunnels with enabled checksum. Tell them "thank you".
+ *
+ * Well, I wonder, rfc1812 was written by Cisco employee,
+ * what the hell these idiots break standards established
+ * by themselves???
+ */
- t->next = *tp;
- write_lock_bh(&ipgre_lock);
- *tp = t;
- write_unlock_bh(&ipgre_lock);
-}
+ const struct iphdr *iph = (struct iphdr *)skb->data;
+ const int type = icmp_hdr(skb)->type;
+ const int code = icmp_hdr(skb)->code;
+ struct tnl_ptk_info tpi;
-static void ipgre_tunnel_unlink(struct ip_tunnel *t)
-{
- struct ip_tunnel **tp;
+ if (gre_parse_header(skb, &tpi, NULL, htons(ETH_P_IP),
+ iph->ihl * 4) < 0)
+ return;
- for (tp = ipgre_bucket(t); *tp; tp = &(*tp)->next) {
- if (t == *tp) {
- write_lock_bh(&ipgre_lock);
- *tp = t->next;
- write_unlock_bh(&ipgre_lock);
- break;
- }
+ if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
+ ipv4_update_pmtu(skb, dev_net(skb->dev), info,
+ skb->dev->ifindex, IPPROTO_GRE);
+ return;
}
+ if (type == ICMP_REDIRECT) {
+ ipv4_redirect(skb, dev_net(skb->dev), skb->dev->ifindex,
+ IPPROTO_GRE);
+ return;
+ }
+
+ ipgre_err(skb, info, &tpi);
}
-static struct ip_tunnel * ipgre_tunnel_locate(struct ip_tunnel_parm *parms, int create)
+static bool is_erspan_type1(int gre_hdr_len)
{
- __be32 remote = parms->iph.daddr;
- __be32 local = parms->iph.saddr;
- __be32 key = parms->i_key;
- struct ip_tunnel *t, **tp, *nt;
- struct net_device *dev;
- unsigned h = HASH(key);
- int prio = 0;
- char name[IFNAMSIZ];
-
- if (local)
- prio |= 1;
- if (remote && !MULTICAST(remote)) {
- prio |= 2;
- h ^= HASH(remote);
- }
- for (tp = &tunnels[prio][h]; (t = *tp) != NULL; tp = &t->next) {
- if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) {
- if (key == t->parms.i_key)
- return t;
- }
- }
- if (!create)
- return NULL;
-
- if (parms->name[0])
- strlcpy(name, parms->name, IFNAMSIZ);
- else {
- int i;
- for (i=1; i<100; i++) {
- sprintf(name, "gre%d", i);
- if (__dev_get_by_name(name) == NULL)
- break;
- }
- if (i==100)
- goto failed;
+ /* Both ERSPAN type I (version 0) and type II (version 1) use
+ * protocol 0x88BE, but the type I has only 4-byte GRE header,
+ * while type II has 8-byte.
+ */
+ return gre_hdr_len == 4;
+}
+
+static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi,
+ int gre_hdr_len)
+{
+ struct net *net = dev_net(skb->dev);
+ struct metadata_dst *tun_dst = NULL;
+ struct erspan_base_hdr *ershdr;
+ IP_TUNNEL_DECLARE_FLAGS(flags);
+ struct ip_tunnel_net *itn;
+ struct ip_tunnel *tunnel;
+ const struct iphdr *iph;
+ struct erspan_md2 *md2;
+ int ver;
+ int len;
+
+ ip_tunnel_flags_copy(flags, tpi->flags);
+
+ itn = net_generic(net, erspan_net_id);
+ iph = ip_hdr(skb);
+ if (is_erspan_type1(gre_hdr_len)) {
+ ver = 0;
+ __set_bit(IP_TUNNEL_NO_KEY_BIT, flags);
+ tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, flags,
+ iph->saddr, iph->daddr, 0);
+ } else {
+ if (unlikely(!pskb_may_pull(skb,
+ gre_hdr_len + sizeof(*ershdr))))
+ return PACKET_REJECT;
+
+ ershdr = (struct erspan_base_hdr *)(skb->data + gre_hdr_len);
+ ver = ershdr->ver;
+ iph = ip_hdr(skb);
+ __set_bit(IP_TUNNEL_KEY_BIT, flags);
+ tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, flags,
+ iph->saddr, iph->daddr, tpi->key);
}
- dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
- if (!dev)
- return NULL;
+ if (tunnel) {
+ if (is_erspan_type1(gre_hdr_len))
+ len = gre_hdr_len;
+ else
+ len = gre_hdr_len + erspan_hdr_len(ver);
- dev->init = ipgre_tunnel_init;
- nt = netdev_priv(dev);
- nt->parms = *parms;
+ if (unlikely(!pskb_may_pull(skb, len)))
+ return PACKET_REJECT;
- if (register_netdevice(dev) < 0) {
- free_netdev(dev);
- goto failed;
- }
+ if (__iptunnel_pull_header(skb,
+ len,
+ htons(ETH_P_TEB),
+ false, false) < 0)
+ goto drop;
- dev_hold(dev);
- ipgre_tunnel_link(nt);
- return nt;
+ if (tunnel->collect_md) {
+ struct erspan_metadata *pkt_md, *md;
+ struct ip_tunnel_info *info;
+ unsigned char *gh;
+ __be64 tun_id;
-failed:
- return NULL;
-}
+ __set_bit(IP_TUNNEL_KEY_BIT, tpi->flags);
+ ip_tunnel_flags_copy(flags, tpi->flags);
+ tun_id = key32_to_tunnel_id(tpi->key);
-static void ipgre_tunnel_uninit(struct net_device *dev)
-{
- ipgre_tunnel_unlink(netdev_priv(dev));
- dev_put(dev);
-}
+ tun_dst = ip_tun_rx_dst(skb, flags,
+ tun_id, sizeof(*md));
+ if (!tun_dst)
+ return PACKET_REJECT;
+ /* skb can be uncloned in __iptunnel_pull_header, so
+ * old pkt_md is no longer valid and we need to reset
+ * it
+ */
+ gh = skb_network_header(skb) +
+ skb_network_header_len(skb);
+ pkt_md = (struct erspan_metadata *)(gh + gre_hdr_len +
+ sizeof(*ershdr));
+ md = ip_tunnel_info_opts(&tun_dst->u.tun_info);
+ md->version = ver;
+ md2 = &md->u.md2;
+ memcpy(md2, pkt_md, ver == 1 ? ERSPAN_V1_MDSIZE :
+ ERSPAN_V2_MDSIZE);
+
+ info = &tun_dst->u.tun_info;
+ __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT,
+ info->key.tun_flags);
+ info->options_len = sizeof(*md);
+ }
-static void ipgre_err(struct sk_buff *skb, u32 info)
-{
-#ifndef I_WISH_WORLD_WERE_PERFECT
+ skb_reset_mac_header(skb);
+ ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
+ return PACKET_RCVD;
+ }
+ return PACKET_REJECT;
-/* It is not :-( All the routers (except for Linux) return only
- 8 bytes of packet payload. It means, that precise relaying of
- ICMP in the real Internet is absolutely infeasible.
+drop:
+ kfree_skb(skb);
+ return PACKET_RCVD;
+}
- Moreover, Cisco "wise men" put GRE key to the third word
- in GRE header. It makes impossible maintaining even soft state for keyed
- GRE tunnels with enabled checksum. Tell them "thank you".
+static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
+ struct ip_tunnel_net *itn, int hdr_len, bool raw_proto)
+{
+ struct metadata_dst *tun_dst = NULL;
+ const struct iphdr *iph;
+ struct ip_tunnel *tunnel;
- Well, I wonder, rfc1812 was written by Cisco employee,
- what the hell these idiots break standrads established
- by themself???
- */
+ iph = ip_hdr(skb);
+ tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
+ iph->saddr, iph->daddr, tpi->key);
- struct iphdr *iph = (struct iphdr*)skb->data;
- __be16 *p = (__be16*)(skb->data+(iph->ihl<<2));
- int grehlen = (iph->ihl<<2) + 4;
- int type = skb->h.icmph->type;
- int code = skb->h.icmph->code;
- struct ip_tunnel *t;
- __be16 flags;
+ if (tunnel) {
+ const struct iphdr *tnl_params;
- flags = p[0];
- if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
- if (flags&(GRE_VERSION|GRE_ROUTING))
- return;
- if (flags&GRE_KEY) {
- grehlen += 4;
- if (flags&GRE_CSUM)
- grehlen += 4;
- }
- }
+ if (__iptunnel_pull_header(skb, hdr_len, tpi->proto,
+ raw_proto, false) < 0)
+ goto drop;
- /* If only 8 bytes returned, keyed message will be dropped here */
- if (skb_headlen(skb) < grehlen)
- return;
+ /* Special case for ipgre_header_parse(), which expects the
+ * mac_header to point to the outer IP header.
+ */
+ if (tunnel->dev->header_ops == &ipgre_header_ops)
+ skb_pop_mac_header(skb);
+ else
+ skb_reset_mac_header(skb);
- switch (type) {
- default:
- case ICMP_PARAMETERPROB:
- return;
+ tnl_params = &tunnel->parms.iph;
+ if (tunnel->collect_md || tnl_params->daddr == 0) {
+ IP_TUNNEL_DECLARE_FLAGS(flags) = { };
+ __be64 tun_id;
- case ICMP_DEST_UNREACH:
- switch (code) {
- case ICMP_SR_FAILED:
- case ICMP_PORT_UNREACH:
- /* Impossible event. */
- return;
- case ICMP_FRAG_NEEDED:
- /* Soft state for pmtu is maintained by IP core. */
- return;
- default:
- /* All others are translated to HOST_UNREACH.
- rfc2003 contains "deep thoughts" about NET_UNREACH,
- I believe they are just ether pollution. --ANK
- */
- break;
+ __set_bit(IP_TUNNEL_CSUM_BIT, flags);
+ __set_bit(IP_TUNNEL_KEY_BIT, flags);
+ ip_tunnel_flags_and(flags, tpi->flags, flags);
+
+ tun_id = key32_to_tunnel_id(tpi->key);
+ tun_dst = ip_tun_rx_dst(skb, flags, tun_id, 0);
+ if (!tun_dst)
+ return PACKET_REJECT;
}
- break;
- case ICMP_TIME_EXCEEDED:
- if (code != ICMP_EXC_TTL)
- return;
- break;
+
+ ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
+ return PACKET_RCVD;
}
+ return PACKET_NEXT;
- read_lock(&ipgre_lock);
- t = ipgre_tunnel_lookup(iph->daddr, iph->saddr, (flags&GRE_KEY) ? *(((__be32*)p) + (grehlen>>2) - 1) : 0);
- if (t == NULL || t->parms.iph.daddr == 0 || MULTICAST(t->parms.iph.daddr))
- goto out;
+drop:
+ kfree_skb(skb);
+ return PACKET_RCVD;
+}
- if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
- goto out;
+static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
+ int hdr_len)
+{
+ struct net *net = dev_net(skb->dev);
+ struct ip_tunnel_net *itn;
+ int res;
- if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO)
- t->err_count++;
+ if (tpi->proto == htons(ETH_P_TEB))
+ itn = net_generic(net, gre_tap_net_id);
else
- t->err_count = 1;
- t->err_time = jiffies;
-out:
- read_unlock(&ipgre_lock);
- return;
-#else
- struct iphdr *iph = (struct iphdr*)dp;
- struct iphdr *eiph;
- __be16 *p = (__be16*)(dp+(iph->ihl<<2));
- int type = skb->h.icmph->type;
- int code = skb->h.icmph->code;
- int rel_type = 0;
- int rel_code = 0;
- __be32 rel_info = 0;
- __u32 n = 0;
- __be16 flags;
- int grehlen = (iph->ihl<<2) + 4;
- struct sk_buff *skb2;
- struct flowi fl;
- struct rtable *rt;
+ itn = net_generic(net, ipgre_net_id);
- if (p[1] != htons(ETH_P_IP))
- return;
-
- flags = p[0];
- if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
- if (flags&(GRE_VERSION|GRE_ROUTING))
- return;
- if (flags&GRE_CSUM)
- grehlen += 4;
- if (flags&GRE_KEY)
- grehlen += 4;
- if (flags&GRE_SEQ)
- grehlen += 4;
+ res = __ipgre_rcv(skb, tpi, itn, hdr_len, false);
+ if (res == PACKET_NEXT && tpi->proto == htons(ETH_P_TEB)) {
+ /* ipgre tunnels in collect metadata mode should receive
+ * also ETH_P_TEB traffic.
+ */
+ itn = net_generic(net, ipgre_net_id);
+ res = __ipgre_rcv(skb, tpi, itn, hdr_len, true);
}
- if (len < grehlen + sizeof(struct iphdr))
- return;
- eiph = (struct iphdr*)(dp + grehlen);
-
- switch (type) {
- default:
- return;
- case ICMP_PARAMETERPROB:
- n = ntohl(skb->h.icmph->un.gateway) >> 24;
- if (n < (iph->ihl<<2))
- return;
+ return res;
+}
- /* So... This guy found something strange INSIDE encapsulated
- packet. Well, he is fool, but what can we do ?
- */
- rel_type = ICMP_PARAMETERPROB;
- n -= grehlen;
- rel_info = htonl(n << 24);
- break;
+static int gre_rcv(struct sk_buff *skb)
+{
+ struct tnl_ptk_info tpi;
+ bool csum_err = false;
+ int hdr_len;
- case ICMP_DEST_UNREACH:
- switch (code) {
- case ICMP_SR_FAILED:
- case ICMP_PORT_UNREACH:
- /* Impossible event. */
- return;
- case ICMP_FRAG_NEEDED:
- /* And it is the only really necessary thing :-) */
- n = ntohs(skb->h.icmph->un.frag.mtu);
- if (n < grehlen+68)
- return;
- n -= grehlen;
- /* BSD 4.2 MORE DOES NOT EXIST IN NATURE. */
- if (n > ntohs(eiph->tot_len))
- return;
- rel_info = htonl(n);
- break;
- default:
- /* All others are translated to HOST_UNREACH.
- rfc2003 contains "deep thoughts" about NET_UNREACH,
- I believe, it is just ether pollution. --ANK
- */
- rel_type = ICMP_DEST_UNREACH;
- rel_code = ICMP_HOST_UNREACH;
- break;
- }
- break;
- case ICMP_TIME_EXCEEDED:
- if (code != ICMP_EXC_TTL)
- return;
- break;
+#ifdef CONFIG_NET_IPGRE_BROADCAST
+ if (ipv4_is_multicast(ip_hdr(skb)->daddr)) {
+ /* Looped back packet, drop it! */
+ if (rt_is_output_route(skb_rtable(skb)))
+ goto drop;
}
+#endif
- /* Prepare fake skb to feed it to icmp_send */
- skb2 = skb_clone(skb, GFP_ATOMIC);
- if (skb2 == NULL)
- return;
- dst_release(skb2->dst);
- skb2->dst = NULL;
- skb_pull(skb2, skb->data - (u8*)eiph);
- skb2->nh.raw = skb2->data;
-
- /* Try to guess incoming interface */
- memset(&fl, 0, sizeof(fl));
- fl.fl4_dst = eiph->saddr;
- fl.fl4_tos = RT_TOS(eiph->tos);
- fl.proto = IPPROTO_GRE;
- if (ip_route_output_key(&rt, &fl)) {
- kfree_skb(skb2);
- return;
- }
- skb2->dev = rt->u.dst.dev;
+ hdr_len = gre_parse_header(skb, &tpi, &csum_err, htons(ETH_P_IP), 0);
+ if (hdr_len < 0)
+ goto drop;
- /* route "incoming" packet */
- if (rt->rt_flags&RTCF_LOCAL) {
- ip_rt_put(rt);
- rt = NULL;
- fl.fl4_dst = eiph->daddr;
- fl.fl4_src = eiph->saddr;
- fl.fl4_tos = eiph->tos;
- if (ip_route_output_key(&rt, &fl) ||
- rt->u.dst.dev->type != ARPHRD_IPGRE) {
- ip_rt_put(rt);
- kfree_skb(skb2);
- return;
- }
- } else {
- ip_rt_put(rt);
- if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos, skb2->dev) ||
- skb2->dst->dev->type != ARPHRD_IPGRE) {
- kfree_skb(skb2);
- return;
- }
+ if (unlikely(tpi.proto == htons(ETH_P_ERSPAN) ||
+ tpi.proto == htons(ETH_P_ERSPAN2))) {
+ if (erspan_rcv(skb, &tpi, hdr_len) == PACKET_RCVD)
+ return 0;
+ goto out;
}
- /* change mtu on this route */
- if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
- if (n > dst_mtu(skb2->dst)) {
- kfree_skb(skb2);
- return;
- }
- skb2->dst->ops->update_pmtu(skb2->dst, n);
- } else if (type == ICMP_TIME_EXCEEDED) {
- struct ip_tunnel *t = netdev_priv(skb2->dev);
- if (t->parms.iph.ttl) {
- rel_type = ICMP_DEST_UNREACH;
- rel_code = ICMP_HOST_UNREACH;
- }
- }
+ if (ipgre_rcv(skb, &tpi, hdr_len) == PACKET_RCVD)
+ return 0;
- icmp_send(skb2, rel_type, rel_code, rel_info);
- kfree_skb(skb2);
-#endif
+out:
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+drop:
+ kfree_skb(skb);
+ return 0;
}
-static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
+static void __gre_xmit(struct sk_buff *skb, struct net_device *dev,
+ const struct iphdr *tnl_params,
+ __be16 proto)
{
- if (INET_ECN_is_ce(iph->tos)) {
- if (skb->protocol == htons(ETH_P_IP)) {
- IP_ECN_set_ce(skb->nh.iph);
- } else if (skb->protocol == htons(ETH_P_IPV6)) {
- IP6_ECN_set_ce(skb->nh.ipv6h);
- }
- }
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ IP_TUNNEL_DECLARE_FLAGS(flags);
+
+ ip_tunnel_flags_copy(flags, tunnel->parms.o_flags);
+
+ /* Push GRE header. */
+ gre_build_header(skb, tunnel->tun_hlen,
+ flags, proto, tunnel->parms.o_key,
+ test_bit(IP_TUNNEL_SEQ_BIT, flags) ?
+ htonl(atomic_fetch_inc(&tunnel->o_seqno)) : 0);
+
+ ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol);
}
-static inline u8
-ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
+static int gre_handle_offloads(struct sk_buff *skb, bool csum)
{
- u8 inner = 0;
- if (skb->protocol == htons(ETH_P_IP))
- inner = old_iph->tos;
- else if (skb->protocol == htons(ETH_P_IPV6))
- inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
- return INET_ECN_encapsulate(tos, inner);
+ return iptunnel_handle_offloads(skb, csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE);
}
-static int ipgre_rcv(struct sk_buff *skb)
+static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev,
+ __be16 proto)
{
- struct iphdr *iph;
- u8 *h;
- __be16 flags;
- __sum16 csum = 0;
- __be32 key = 0;
- u32 seqno = 0;
- struct ip_tunnel *tunnel;
- int offset = 4;
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ IP_TUNNEL_DECLARE_FLAGS(flags) = { };
+ struct ip_tunnel_info *tun_info;
+ const struct ip_tunnel_key *key;
+ int tunnel_hlen;
- if (!pskb_may_pull(skb, 16))
- goto drop_nolock;
+ tun_info = skb_tunnel_info(skb);
+ if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
+ ip_tunnel_info_af(tun_info) != AF_INET))
+ goto err_free_skb;
- iph = skb->nh.iph;
- h = skb->data;
- flags = *(__be16*)h;
+ key = &tun_info->key;
+ tunnel_hlen = gre_calc_hlen(key->tun_flags);
- if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
- /* - Version must be 0.
- - We do not support routing headers.
- */
- if (flags&(GRE_VERSION|GRE_ROUTING))
- goto drop_nolock;
-
- if (flags&GRE_CSUM) {
- switch (skb->ip_summed) {
- case CHECKSUM_COMPLETE:
- csum = csum_fold(skb->csum);
- if (!csum)
- break;
- /* fall through */
- case CHECKSUM_NONE:
- skb->csum = 0;
- csum = __skb_checksum_complete(skb);
- skb->ip_summed = CHECKSUM_COMPLETE;
- }
- offset += 4;
- }
- if (flags&GRE_KEY) {
- key = *(__be32*)(h + offset);
- offset += 4;
- }
- if (flags&GRE_SEQ) {
- seqno = ntohl(*(__be32*)(h + offset));
- offset += 4;
- }
- }
+ if (skb_cow_head(skb, dev->needed_headroom))
+ goto err_free_skb;
- read_lock(&ipgre_lock);
- if ((tunnel = ipgre_tunnel_lookup(iph->saddr, iph->daddr, key)) != NULL) {
- secpath_reset(skb);
+ /* Push Tunnel header. */
+ if (gre_handle_offloads(skb, test_bit(IP_TUNNEL_CSUM_BIT,
+ tunnel->parms.o_flags)))
+ goto err_free_skb;
- skb->protocol = *(__be16*)(h + 2);
- /* WCCP version 1 and 2 protocol decoding.
- * - Change protocol to IP
- * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
- */
- if (flags == 0 &&
- skb->protocol == htons(ETH_P_WCCP)) {
- skb->protocol = htons(ETH_P_IP);
- if ((*(h + offset) & 0xF0) != 0x40)
- offset += 4;
- }
+ __set_bit(IP_TUNNEL_CSUM_BIT, flags);
+ __set_bit(IP_TUNNEL_KEY_BIT, flags);
+ __set_bit(IP_TUNNEL_SEQ_BIT, flags);
+ ip_tunnel_flags_and(flags, tun_info->key.tun_flags, flags);
- skb->mac.raw = skb->nh.raw;
- skb->nh.raw = __pskb_pull(skb, offset);
- skb_postpull_rcsum(skb, skb->h.raw, offset);
- skb->pkt_type = PACKET_HOST;
-#ifdef CONFIG_NET_IPGRE_BROADCAST
- if (MULTICAST(iph->daddr)) {
- /* Looped back packet, drop it! */
- if (((struct rtable*)skb->dst)->fl.iif == 0)
- goto drop;
- tunnel->stat.multicast++;
- skb->pkt_type = PACKET_BROADCAST;
- }
-#endif
+ gre_build_header(skb, tunnel_hlen, flags, proto,
+ tunnel_id_to_key32(tun_info->key.tun_id),
+ test_bit(IP_TUNNEL_SEQ_BIT, flags) ?
+ htonl(atomic_fetch_inc(&tunnel->o_seqno)) : 0);
- if (((flags&GRE_CSUM) && csum) ||
- (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
- tunnel->stat.rx_crc_errors++;
- tunnel->stat.rx_errors++;
- goto drop;
- }
- if (tunnel->parms.i_flags&GRE_SEQ) {
- if (!(flags&GRE_SEQ) ||
- (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
- tunnel->stat.rx_fifo_errors++;
- tunnel->stat.rx_errors++;
- goto drop;
- }
- tunnel->i_seqno = seqno + 1;
- }
- tunnel->stat.rx_packets++;
- tunnel->stat.rx_bytes += skb->len;
- skb->dev = tunnel->dev;
- dst_release(skb->dst);
- skb->dst = NULL;
- nf_reset(skb);
- ipgre_ecn_decapsulate(iph, skb);
- netif_rx(skb);
- read_unlock(&ipgre_lock);
- return(0);
- }
- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+ ip_md_tunnel_xmit(skb, dev, IPPROTO_GRE, tunnel_hlen);
-drop:
- read_unlock(&ipgre_lock);
-drop_nolock:
+ return;
+
+err_free_skb:
kfree_skb(skb);
- return(0);
+ DEV_STATS_INC(dev, tx_dropped);
}
-static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
+static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
- struct net_device_stats *stats = &tunnel->stat;
- struct iphdr *old_iph = skb->nh.iph;
- struct iphdr *tiph;
- u8 tos;
- __be16 df;
- struct rtable *rt; /* Route to the other host */
- struct net_device *tdev; /* Device to other host */
- struct iphdr *iph; /* Our new IP header */
- int max_headroom; /* The extra header space needed */
- int gre_hlen;
- __be32 dst;
- int mtu;
-
- if (tunnel->recursion++) {
- tunnel->stat.collisions++;
- goto tx_error;
+ IP_TUNNEL_DECLARE_FLAGS(flags) = { };
+ struct ip_tunnel_info *tun_info;
+ const struct ip_tunnel_key *key;
+ struct erspan_metadata *md;
+ bool truncate = false;
+ __be16 proto;
+ int tunnel_hlen;
+ int version;
+ int nhoff;
+
+ tun_info = skb_tunnel_info(skb);
+ if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
+ ip_tunnel_info_af(tun_info) != AF_INET))
+ goto err_free_skb;
+
+ key = &tun_info->key;
+ if (!test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, tun_info->key.tun_flags))
+ goto err_free_skb;
+ if (tun_info->options_len < sizeof(*md))
+ goto err_free_skb;
+ md = ip_tunnel_info_opts(tun_info);
+
+ /* ERSPAN has fixed 8 byte GRE header */
+ version = md->version;
+ tunnel_hlen = 8 + erspan_hdr_len(version);
+
+ if (skb_cow_head(skb, dev->needed_headroom))
+ goto err_free_skb;
+
+ if (gre_handle_offloads(skb, false))
+ goto err_free_skb;
+
+ if (skb->len > dev->mtu + dev->hard_header_len) {
+ if (pskb_trim(skb, dev->mtu + dev->hard_header_len))
+ goto err_free_skb;
+ truncate = true;
}
- if (dev->hard_header) {
- gre_hlen = 0;
- tiph = (struct iphdr*)skb->data;
- } else {
- gre_hlen = tunnel->hlen;
- tiph = &tunnel->parms.iph;
+ nhoff = skb_network_offset(skb);
+ if (skb->protocol == htons(ETH_P_IP) &&
+ (ntohs(ip_hdr(skb)->tot_len) > skb->len - nhoff))
+ truncate = true;
+
+ if (skb->protocol == htons(ETH_P_IPV6)) {
+ int thoff;
+
+ if (skb_transport_header_was_set(skb))
+ thoff = skb_transport_offset(skb);
+ else
+ thoff = nhoff + sizeof(struct ipv6hdr);
+ if (ntohs(ipv6_hdr(skb)->payload_len) > skb->len - thoff)
+ truncate = true;
}
- if ((dst = tiph->daddr) == 0) {
- /* NBMA tunnel */
+ if (version == 1) {
+ erspan_build_header(skb, ntohl(tunnel_id_to_key32(key->tun_id)),
+ ntohl(md->u.index), truncate, true);
+ proto = htons(ETH_P_ERSPAN);
+ } else if (version == 2) {
+ erspan_build_header_v2(skb,
+ ntohl(tunnel_id_to_key32(key->tun_id)),
+ md->u.md2.dir,
+ get_hwid(&md->u.md2),
+ truncate, true);
+ proto = htons(ETH_P_ERSPAN2);
+ } else {
+ goto err_free_skb;
+ }
- if (skb->dst == NULL) {
- tunnel->stat.tx_fifo_errors++;
- goto tx_error;
- }
+ __set_bit(IP_TUNNEL_SEQ_BIT, flags);
+ gre_build_header(skb, 8, flags, proto, 0,
+ htonl(atomic_fetch_inc(&tunnel->o_seqno)));
- if (skb->protocol == htons(ETH_P_IP)) {
- rt = (struct rtable*)skb->dst;
- if ((dst = rt->rt_gateway) == 0)
- goto tx_error_icmp;
- }
-#ifdef CONFIG_IPV6
- else if (skb->protocol == htons(ETH_P_IPV6)) {
- struct in6_addr *addr6;
- int addr_type;
- struct neighbour *neigh = skb->dst->neighbour;
+ ip_md_tunnel_xmit(skb, dev, IPPROTO_GRE, tunnel_hlen);
- if (neigh == NULL)
- goto tx_error;
+ return;
- addr6 = (struct in6_addr*)&neigh->primary_key;
- addr_type = ipv6_addr_type(addr6);
+err_free_skb:
+ kfree_skb(skb);
+ DEV_STATS_INC(dev, tx_dropped);
+}
- if (addr_type == IPV6_ADDR_ANY) {
- addr6 = &skb->nh.ipv6h->daddr;
- addr_type = ipv6_addr_type(addr6);
- }
+static int gre_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
+{
+ struct ip_tunnel_info *info = skb_tunnel_info(skb);
+ const struct ip_tunnel_key *key;
+ struct rtable *rt;
+ struct flowi4 fl4;
- if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
- goto tx_error_icmp;
+ if (ip_tunnel_info_af(info) != AF_INET)
+ return -EINVAL;
- dst = addr6->s6_addr32[3];
- }
-#endif
- else
- goto tx_error;
- }
+ key = &info->key;
+ ip_tunnel_init_flow(&fl4, IPPROTO_GRE, key->u.ipv4.dst, key->u.ipv4.src,
+ tunnel_id_to_key32(key->tun_id),
+ key->tos & ~INET_ECN_MASK, dev_net(dev), 0,
+ skb->mark, skb_get_hash(skb), key->flow_flags);
+ rt = ip_route_output_key(dev_net(dev), &fl4);
+ if (IS_ERR(rt))
+ return PTR_ERR(rt);
+
+ ip_rt_put(rt);
+ info->key.u.ipv4.src = fl4.saddr;
+ return 0;
+}
- tos = tiph->tos;
- if (tos&1) {
- if (skb->protocol == htons(ETH_P_IP))
- tos = old_iph->tos;
- tos &= ~1;
- }
+static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ const struct iphdr *tnl_params;
- {
- struct flowi fl = { .oif = tunnel->parms.link,
- .nl_u = { .ip4_u =
- { .daddr = dst,
- .saddr = tiph->saddr,
- .tos = RT_TOS(tos) } },
- .proto = IPPROTO_GRE };
- if (ip_route_output_key(&rt, &fl)) {
- tunnel->stat.tx_carrier_errors++;
- goto tx_error;
- }
- }
- tdev = rt->u.dst.dev;
+ if (!pskb_inet_may_pull(skb))
+ goto free_skb;
- if (tdev == dev) {
- ip_rt_put(rt);
- tunnel->stat.collisions++;
- goto tx_error;
+ if (tunnel->collect_md) {
+ gre_fb_xmit(skb, dev, skb->protocol);
+ return NETDEV_TX_OK;
}
- df = tiph->frag_off;
- if (df)
- mtu = dst_mtu(&rt->u.dst) - tunnel->hlen;
- else
- mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu;
+ if (dev->header_ops) {
+ int pull_len = tunnel->hlen + sizeof(struct iphdr);
- if (skb->dst)
- skb->dst->ops->update_pmtu(skb->dst, mtu);
+ if (skb_cow_head(skb, 0))
+ goto free_skb;
- if (skb->protocol == htons(ETH_P_IP)) {
- df |= (old_iph->frag_off&htons(IP_DF));
+ if (!pskb_may_pull(skb, pull_len))
+ goto free_skb;
- if ((old_iph->frag_off&htons(IP_DF)) &&
- mtu < ntohs(old_iph->tot_len)) {
- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
- ip_rt_put(rt);
- goto tx_error;
- }
- }
-#ifdef CONFIG_IPV6
- else if (skb->protocol == htons(ETH_P_IPV6)) {
- struct rt6_info *rt6 = (struct rt6_info*)skb->dst;
-
- if (rt6 && mtu < dst_mtu(skb->dst) && mtu >= IPV6_MIN_MTU) {
- if ((tunnel->parms.iph.daddr && !MULTICAST(tunnel->parms.iph.daddr)) ||
- rt6->rt6i_dst.plen == 128) {
- rt6->rt6i_flags |= RTF_MODIFIED;
- skb->dst->metrics[RTAX_MTU-1] = mtu;
- }
- }
+ tnl_params = (const struct iphdr *)skb->data;
- if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
- icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev);
- ip_rt_put(rt);
- goto tx_error;
- }
- }
-#endif
+ /* ip_tunnel_xmit() needs skb->data pointing to gre header. */
+ skb_pull(skb, pull_len);
+ skb_reset_mac_header(skb);
- if (tunnel->err_count > 0) {
- if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) {
- tunnel->err_count--;
+ if (skb->ip_summed == CHECKSUM_PARTIAL &&
+ skb_checksum_start(skb) < skb->data)
+ goto free_skb;
+ } else {
+ if (skb_cow_head(skb, dev->needed_headroom))
+ goto free_skb;
- dst_link_failure(skb);
- } else
- tunnel->err_count = 0;
+ tnl_params = &tunnel->parms.iph;
}
- max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen;
+ if (gre_handle_offloads(skb, test_bit(IP_TUNNEL_CSUM_BIT,
+ tunnel->parms.o_flags)))
+ goto free_skb;
- if (skb_headroom(skb) < max_headroom || skb_cloned(skb) || skb_shared(skb)) {
- struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
- if (!new_skb) {
- ip_rt_put(rt);
- stats->tx_dropped++;
- dev_kfree_skb(skb);
- tunnel->recursion--;
- return 0;
- }
- if (skb->sk)
- skb_set_owner_w(new_skb, skb->sk);
- dev_kfree_skb(skb);
- skb = new_skb;
- old_iph = skb->nh.iph;
- }
+ __gre_xmit(skb, dev, tnl_params, skb->protocol);
+ return NETDEV_TX_OK;
- skb->h.raw = skb->nh.raw;
- skb->nh.raw = skb_push(skb, gre_hlen);
- memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
- IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
- IPSKB_REROUTED);
- dst_release(skb->dst);
- skb->dst = &rt->u.dst;
+free_skb:
+ kfree_skb(skb);
+ DEV_STATS_INC(dev, tx_dropped);
+ return NETDEV_TX_OK;
+}
- /*
- * Push down and install the IPIP header.
- */
+static netdev_tx_t erspan_xmit(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ bool truncate = false;
+ __be16 proto;
- iph = skb->nh.iph;
- iph->version = 4;
- iph->ihl = sizeof(struct iphdr) >> 2;
- iph->frag_off = df;
- iph->protocol = IPPROTO_GRE;
- iph->tos = ipgre_ecn_encapsulate(tos, old_iph, skb);
- iph->daddr = rt->rt_dst;
- iph->saddr = rt->rt_src;
-
- if ((iph->ttl = tiph->ttl) == 0) {
- if (skb->protocol == htons(ETH_P_IP))
- iph->ttl = old_iph->ttl;
-#ifdef CONFIG_IPV6
- else if (skb->protocol == htons(ETH_P_IPV6))
- iph->ttl = ((struct ipv6hdr*)old_iph)->hop_limit;
-#endif
- else
- iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);
+ if (!pskb_inet_may_pull(skb))
+ goto free_skb;
+
+ if (tunnel->collect_md) {
+ erspan_fb_xmit(skb, dev);
+ return NETDEV_TX_OK;
}
- ((__be16*)(iph+1))[0] = tunnel->parms.o_flags;
- ((__be16*)(iph+1))[1] = skb->protocol;
+ if (gre_handle_offloads(skb, false))
+ goto free_skb;
- if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
- __be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);
+ if (skb_cow_head(skb, dev->needed_headroom))
+ goto free_skb;
- if (tunnel->parms.o_flags&GRE_SEQ) {
- ++tunnel->o_seqno;
- *ptr = htonl(tunnel->o_seqno);
- ptr--;
- }
- if (tunnel->parms.o_flags&GRE_KEY) {
- *ptr = tunnel->parms.o_key;
- ptr--;
- }
- if (tunnel->parms.o_flags&GRE_CSUM) {
- *ptr = 0;
- *(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
- }
+ if (skb->len > dev->mtu + dev->hard_header_len) {
+ if (pskb_trim(skb, dev->mtu + dev->hard_header_len))
+ goto free_skb;
+ truncate = true;
}
- nf_reset(skb);
-
- IPTUNNEL_XMIT();
- tunnel->recursion--;
- return 0;
+ /* Push ERSPAN header */
+ if (tunnel->erspan_ver == 0) {
+ proto = htons(ETH_P_ERSPAN);
+ __clear_bit(IP_TUNNEL_SEQ_BIT, tunnel->parms.o_flags);
+ } else if (tunnel->erspan_ver == 1) {
+ erspan_build_header(skb, ntohl(tunnel->parms.o_key),
+ tunnel->index,
+ truncate, true);
+ proto = htons(ETH_P_ERSPAN);
+ } else if (tunnel->erspan_ver == 2) {
+ erspan_build_header_v2(skb, ntohl(tunnel->parms.o_key),
+ tunnel->dir, tunnel->hwid,
+ truncate, true);
+ proto = htons(ETH_P_ERSPAN2);
+ } else {
+ goto free_skb;
+ }
-tx_error_icmp:
- dst_link_failure(skb);
+ __clear_bit(IP_TUNNEL_KEY_BIT, tunnel->parms.o_flags);
+ __gre_xmit(skb, dev, &tunnel->parms.iph, proto);
+ return NETDEV_TX_OK;
-tx_error:
- stats->tx_errors++;
- dev_kfree_skb(skb);
- tunnel->recursion--;
- return 0;
+free_skb:
+ kfree_skb(skb);
+ DEV_STATS_INC(dev, tx_dropped);
+ return NETDEV_TX_OK;
}
-static int
-ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
+static netdev_tx_t gre_tap_xmit(struct sk_buff *skb,
+ struct net_device *dev)
{
- int err = 0;
- struct ip_tunnel_parm p;
- struct ip_tunnel *t;
+ struct ip_tunnel *tunnel = netdev_priv(dev);
- switch (cmd) {
- case SIOCGETTUNNEL:
- t = NULL;
- if (dev == ipgre_fb_tunnel_dev) {
- if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
- err = -EFAULT;
- break;
- }
- t = ipgre_tunnel_locate(&p, 0);
- }
- if (t == NULL)
- t = netdev_priv(dev);
- memcpy(&p, &t->parms, sizeof(p));
- if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
- err = -EFAULT;
- break;
+ if (!pskb_inet_may_pull(skb))
+ goto free_skb;
- case SIOCADDTUNNEL:
- case SIOCCHGTUNNEL:
- err = -EPERM;
- if (!capable(CAP_NET_ADMIN))
- goto done;
-
- err = -EFAULT;
- if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
- goto done;
-
- err = -EINVAL;
- if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
- p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
- ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
- goto done;
- if (p.iph.ttl)
- p.iph.frag_off |= htons(IP_DF);
-
- if (!(p.i_flags&GRE_KEY))
- p.i_key = 0;
- if (!(p.o_flags&GRE_KEY))
- p.o_key = 0;
-
- t = ipgre_tunnel_locate(&p, cmd == SIOCADDTUNNEL);
-
- if (dev != ipgre_fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
- if (t != NULL) {
- if (t->dev != dev) {
- err = -EEXIST;
- break;
- }
- } else {
- unsigned nflags=0;
-
- t = netdev_priv(dev);
-
- if (MULTICAST(p.iph.daddr))
- nflags = IFF_BROADCAST;
- else if (p.iph.daddr)
- nflags = IFF_POINTOPOINT;
-
- if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
- err = -EINVAL;
- break;
- }
- ipgre_tunnel_unlink(t);
- t->parms.iph.saddr = p.iph.saddr;
- t->parms.iph.daddr = p.iph.daddr;
- t->parms.i_key = p.i_key;
- t->parms.o_key = p.o_key;
- memcpy(dev->dev_addr, &p.iph.saddr, 4);
- memcpy(dev->broadcast, &p.iph.daddr, 4);
- ipgre_tunnel_link(t);
- netdev_state_change(dev);
- }
- }
+ if (tunnel->collect_md) {
+ gre_fb_xmit(skb, dev, htons(ETH_P_TEB));
+ return NETDEV_TX_OK;
+ }
- if (t) {
- err = 0;
- if (cmd == SIOCCHGTUNNEL) {
- t->parms.iph.ttl = p.iph.ttl;
- t->parms.iph.tos = p.iph.tos;
- t->parms.iph.frag_off = p.iph.frag_off;
- }
- if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
- err = -EFAULT;
- } else
- err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
- break;
+ if (gre_handle_offloads(skb, test_bit(IP_TUNNEL_CSUM_BIT,
+ tunnel->parms.o_flags)))
+ goto free_skb;
- case SIOCDELTUNNEL:
- err = -EPERM;
- if (!capable(CAP_NET_ADMIN))
- goto done;
-
- if (dev == ipgre_fb_tunnel_dev) {
- err = -EFAULT;
- if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
- goto done;
- err = -ENOENT;
- if ((t = ipgre_tunnel_locate(&p, 0)) == NULL)
- goto done;
- err = -EPERM;
- if (t == netdev_priv(ipgre_fb_tunnel_dev))
- goto done;
- dev = t->dev;
- }
- err = unregister_netdevice(dev);
- break;
+ if (skb_cow_head(skb, dev->needed_headroom))
+ goto free_skb;
- default:
- err = -EINVAL;
- }
+ __gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_TEB));
+ return NETDEV_TX_OK;
-done:
- return err;
+free_skb:
+ kfree_skb(skb);
+ DEV_STATS_INC(dev, tx_dropped);
+ return NETDEV_TX_OK;
}
-static struct net_device_stats *ipgre_tunnel_get_stats(struct net_device *dev)
+static void ipgre_link_update(struct net_device *dev, bool set_mtu)
{
- return &(((struct ip_tunnel*)netdev_priv(dev))->stat);
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ int len;
+
+ len = tunnel->tun_hlen;
+ tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags);
+ len = tunnel->tun_hlen - len;
+ tunnel->hlen = tunnel->hlen + len;
+
+ if (dev->header_ops)
+ dev->hard_header_len += len;
+ else
+ dev->needed_headroom += len;
+
+ if (set_mtu)
+ WRITE_ONCE(dev->mtu, max_t(int, dev->mtu - len, 68));
+
+ if (test_bit(IP_TUNNEL_SEQ_BIT, tunnel->parms.o_flags) ||
+ (test_bit(IP_TUNNEL_CSUM_BIT, tunnel->parms.o_flags) &&
+ tunnel->encap.type != TUNNEL_ENCAP_NONE)) {
+ dev->features &= ~NETIF_F_GSO_SOFTWARE;
+ dev->hw_features &= ~NETIF_F_GSO_SOFTWARE;
+ } else {
+ dev->features |= NETIF_F_GSO_SOFTWARE;
+ dev->hw_features |= NETIF_F_GSO_SOFTWARE;
+ }
}
-static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
+static int ipgre_tunnel_ctl(struct net_device *dev,
+ struct ip_tunnel_parm_kern *p,
+ int cmd)
{
- struct ip_tunnel *tunnel = netdev_priv(dev);
- if (new_mtu < 68 || new_mtu > 0xFFF8 - tunnel->hlen)
- return -EINVAL;
- dev->mtu = new_mtu;
+ __be16 i_flags, o_flags;
+ int err;
+
+ if (!ip_tunnel_flags_is_be16_compat(p->i_flags) ||
+ !ip_tunnel_flags_is_be16_compat(p->o_flags))
+ return -EOVERFLOW;
+
+ i_flags = ip_tunnel_flags_to_be16(p->i_flags);
+ o_flags = ip_tunnel_flags_to_be16(p->o_flags);
+
+ if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) {
+ if (p->iph.version != 4 || p->iph.protocol != IPPROTO_GRE ||
+ p->iph.ihl != 5 || (p->iph.frag_off & htons(~IP_DF)) ||
+ ((i_flags | o_flags) & (GRE_VERSION | GRE_ROUTING)))
+ return -EINVAL;
+ }
+
+ gre_flags_to_tnl_flags(p->i_flags, i_flags);
+ gre_flags_to_tnl_flags(p->o_flags, o_flags);
+
+ err = ip_tunnel_ctl(dev, p, cmd);
+ if (err)
+ return err;
+
+ if (cmd == SIOCCHGTUNNEL) {
+ struct ip_tunnel *t = netdev_priv(dev);
+
+ ip_tunnel_flags_copy(t->parms.i_flags, p->i_flags);
+ ip_tunnel_flags_copy(t->parms.o_flags, p->o_flags);
+
+ if (strcmp(dev->rtnl_link_ops->kind, "erspan"))
+ ipgre_link_update(dev, true);
+ }
+
+ i_flags = gre_tnl_flags_to_gre_flags(p->i_flags);
+ ip_tunnel_flags_from_be16(p->i_flags, i_flags);
+ o_flags = gre_tnl_flags_to_gre_flags(p->o_flags);
+ ip_tunnel_flags_from_be16(p->o_flags, o_flags);
+
return 0;
}
-#ifdef CONFIG_NET_IPGRE_BROADCAST
/* Nice toy. Unfortunately, useless in real life :-)
It allows to construct virtual multiprotocol broadcast "LAN"
over the Internet, provided multicast routing is tuned.
@@ -1043,7 +866,7 @@ static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
so that I had to set ARPHRD_IPGRE to a random value.
I have an impression, that Cisco could make something similar,
but this feature is apparently missing in IOS<=11.2(8).
-
+
I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
@@ -1060,54 +883,68 @@ static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
...
ftp fec0:6666:6666::193.233.7.65
...
-
*/
-
-static int ipgre_header(struct sk_buff *skb, struct net_device *dev, unsigned short type,
- void *daddr, void *saddr, unsigned len)
+static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
+ unsigned short type,
+ const void *daddr, const void *saddr, unsigned int len)
{
struct ip_tunnel *t = netdev_priv(dev);
- struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
- __be16 *p = (__be16*)(iph+1);
+ struct iphdr *iph;
+ struct gre_base_hdr *greh;
+
+ iph = skb_push(skb, t->hlen + sizeof(*iph));
+ greh = (struct gre_base_hdr *)(iph+1);
+ greh->flags = gre_tnl_flags_to_gre_flags(t->parms.o_flags);
+ greh->protocol = htons(type);
memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
- p[0] = t->parms.o_flags;
- p[1] = htons(type);
- /*
- * Set the source hardware address.
- */
-
+ /* Set the source hardware address. */
if (saddr)
memcpy(&iph->saddr, saddr, 4);
-
- if (daddr) {
+ if (daddr)
memcpy(&iph->daddr, daddr, 4);
- return t->hlen;
- }
- if (iph->daddr && !MULTICAST(iph->daddr))
- return t->hlen;
-
- return -t->hlen;
+ if (iph->daddr)
+ return t->hlen + sizeof(*iph);
+
+ return -(t->hlen + sizeof(*iph));
+}
+
+static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
+{
+ const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb);
+ memcpy(haddr, &iph->saddr, 4);
+ return 4;
}
+static const struct header_ops ipgre_header_ops = {
+ .create = ipgre_header,
+ .parse = ipgre_header_parse,
+};
+
+#ifdef CONFIG_NET_IPGRE_BROADCAST
static int ipgre_open(struct net_device *dev)
{
struct ip_tunnel *t = netdev_priv(dev);
- if (MULTICAST(t->parms.iph.daddr)) {
- struct flowi fl = { .oif = t->parms.link,
- .nl_u = { .ip4_u =
- { .daddr = t->parms.iph.daddr,
- .saddr = t->parms.iph.saddr,
- .tos = RT_TOS(t->parms.iph.tos) } },
- .proto = IPPROTO_GRE };
+ if (ipv4_is_multicast(t->parms.iph.daddr)) {
+ struct flowi4 fl4 = {
+ .flowi4_oif = t->parms.link,
+ .flowi4_dscp = ip4h_dscp(&t->parms.iph),
+ .flowi4_scope = RT_SCOPE_UNIVERSE,
+ .flowi4_proto = IPPROTO_GRE,
+ .saddr = t->parms.iph.saddr,
+ .daddr = t->parms.iph.daddr,
+ .fl4_gre_key = t->parms.o_key,
+ };
struct rtable *rt;
- if (ip_route_output_key(&rt, &fl))
+
+ rt = ip_route_output_key(t->net, &fl4);
+ if (IS_ERR(rt))
return -EADDRNOTAVAIL;
- dev = rt->u.dst.dev;
+ dev = rt->dst.dev;
ip_rt_put(rt);
- if (__in_dev_get_rtnl(dev) == NULL)
+ if (!__in_dev_get_rtnl(dev))
return -EADDRNOTAVAIL;
t->mlink = dev->ifindex;
ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
@@ -1118,191 +955,905 @@ static int ipgre_open(struct net_device *dev)
static int ipgre_close(struct net_device *dev)
{
struct ip_tunnel *t = netdev_priv(dev);
- if (MULTICAST(t->parms.iph.daddr) && t->mlink) {
- struct in_device *in_dev = inetdev_by_index(t->mlink);
- if (in_dev) {
+
+ if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
+ struct in_device *in_dev;
+ in_dev = inetdev_by_index(t->net, t->mlink);
+ if (in_dev)
ip_mc_dec_group(in_dev, t->parms.iph.daddr);
- in_dev_put(in_dev);
- }
}
return 0;
}
+#endif
+static const struct net_device_ops ipgre_netdev_ops = {
+ .ndo_init = ipgre_tunnel_init,
+ .ndo_uninit = ip_tunnel_uninit,
+#ifdef CONFIG_NET_IPGRE_BROADCAST
+ .ndo_open = ipgre_open,
+ .ndo_stop = ipgre_close,
#endif
+ .ndo_start_xmit = ipgre_xmit,
+ .ndo_siocdevprivate = ip_tunnel_siocdevprivate,
+ .ndo_change_mtu = ip_tunnel_change_mtu,
+ .ndo_get_stats64 = dev_get_tstats64,
+ .ndo_get_iflink = ip_tunnel_get_iflink,
+ .ndo_tunnel_ctl = ipgre_tunnel_ctl,
+};
+
+#define GRE_FEATURES (NETIF_F_SG | \
+ NETIF_F_FRAGLIST | \
+ NETIF_F_HIGHDMA | \
+ NETIF_F_HW_CSUM)
static void ipgre_tunnel_setup(struct net_device *dev)
{
- SET_MODULE_OWNER(dev);
- dev->uninit = ipgre_tunnel_uninit;
- dev->destructor = free_netdev;
- dev->hard_start_xmit = ipgre_tunnel_xmit;
- dev->get_stats = ipgre_tunnel_get_stats;
- dev->do_ioctl = ipgre_tunnel_ioctl;
- dev->change_mtu = ipgre_tunnel_change_mtu;
-
+ dev->netdev_ops = &ipgre_netdev_ops;
dev->type = ARPHRD_IPGRE;
- dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
- dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 4;
- dev->flags = IFF_NOARP;
- dev->iflink = 0;
- dev->addr_len = 4;
+ ip_tunnel_setup(dev, ipgre_net_id);
}
-static int ipgre_tunnel_init(struct net_device *dev)
+static void __gre_tunnel_init(struct net_device *dev)
{
- struct net_device *tdev = NULL;
struct ip_tunnel *tunnel;
- struct iphdr *iph;
- int hlen = LL_MAX_HEADER;
- int mtu = ETH_DATA_LEN;
- int addend = sizeof(struct iphdr) + 4;
tunnel = netdev_priv(dev);
- iph = &tunnel->parms.iph;
+ tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags);
+ tunnel->parms.iph.protocol = IPPROTO_GRE;
- tunnel->dev = dev;
- strcpy(tunnel->parms.name, dev->name);
+ tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen;
+ dev->needed_headroom = tunnel->hlen + sizeof(tunnel->parms.iph);
- memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
- memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
+ dev->features |= GRE_FEATURES;
+ dev->hw_features |= GRE_FEATURES;
- /* Guess output device to choose reasonable mtu and hard_header_len */
+ /* TCP offload with GRE SEQ is not supported, nor can we support 2
+ * levels of outer headers requiring an update.
+ */
+ if (test_bit(IP_TUNNEL_SEQ_BIT, tunnel->parms.o_flags))
+ return;
+ if (test_bit(IP_TUNNEL_CSUM_BIT, tunnel->parms.o_flags) &&
+ tunnel->encap.type != TUNNEL_ENCAP_NONE)
+ return;
- if (iph->daddr) {
- struct flowi fl = { .oif = tunnel->parms.link,
- .nl_u = { .ip4_u =
- { .daddr = iph->daddr,
- .saddr = iph->saddr,
- .tos = RT_TOS(iph->tos) } },
- .proto = IPPROTO_GRE };
- struct rtable *rt;
- if (!ip_route_output_key(&rt, &fl)) {
- tdev = rt->u.dst.dev;
- ip_rt_put(rt);
- }
+ dev->features |= NETIF_F_GSO_SOFTWARE;
+ dev->hw_features |= NETIF_F_GSO_SOFTWARE;
+
+ dev->lltx = true;
+}
- dev->flags |= IFF_POINTOPOINT;
+static int ipgre_tunnel_init(struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ struct iphdr *iph = &tunnel->parms.iph;
+
+ __gre_tunnel_init(dev);
+
+ __dev_addr_set(dev, &iph->saddr, 4);
+ memcpy(dev->broadcast, &iph->daddr, 4);
+
+ dev->flags = IFF_NOARP;
+ netif_keep_dst(dev);
+ dev->addr_len = 4;
+ if (iph->daddr && !tunnel->collect_md) {
#ifdef CONFIG_NET_IPGRE_BROADCAST
- if (MULTICAST(iph->daddr)) {
+ if (ipv4_is_multicast(iph->daddr)) {
if (!iph->saddr)
return -EINVAL;
dev->flags = IFF_BROADCAST;
- dev->hard_header = ipgre_header;
- dev->open = ipgre_open;
- dev->stop = ipgre_close;
+ dev->header_ops = &ipgre_header_ops;
+ dev->hard_header_len = tunnel->hlen + sizeof(*iph);
+ dev->needed_headroom = 0;
}
#endif
+ } else if (!tunnel->collect_md) {
+ dev->header_ops = &ipgre_header_ops;
+ dev->hard_header_len = tunnel->hlen + sizeof(*iph);
+ dev->needed_headroom = 0;
+ }
+
+ return ip_tunnel_init(dev);
+}
+
+static const struct gre_protocol ipgre_protocol = {
+ .handler = gre_rcv,
+ .err_handler = gre_err,
+};
+
+static int __net_init ipgre_init_net(struct net *net)
+{
+ return ip_tunnel_init_net(net, ipgre_net_id, &ipgre_link_ops, NULL);
+}
+
+static void __net_exit ipgre_exit_rtnl(struct net *net,
+ struct list_head *dev_to_kill)
+{
+ ip_tunnel_delete_net(net, ipgre_net_id, &ipgre_link_ops, dev_to_kill);
+}
+
+static struct pernet_operations ipgre_net_ops = {
+ .init = ipgre_init_net,
+ .exit_rtnl = ipgre_exit_rtnl,
+ .id = &ipgre_net_id,
+ .size = sizeof(struct ip_tunnel_net),
+};
+
+static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[],
+ struct netlink_ext_ack *extack)
+{
+ __be16 flags;
+
+ if (!data)
+ return 0;
+
+ flags = 0;
+ if (data[IFLA_GRE_IFLAGS])
+ flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
+ if (data[IFLA_GRE_OFLAGS])
+ flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
+ if (flags & (GRE_VERSION|GRE_ROUTING))
+ return -EINVAL;
+
+ if (data[IFLA_GRE_COLLECT_METADATA] &&
+ data[IFLA_GRE_ENCAP_TYPE] &&
+ nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]) != TUNNEL_ENCAP_NONE)
+ return -EINVAL;
+
+ return 0;
+}
+
+static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[],
+ struct netlink_ext_ack *extack)
+{
+ __be32 daddr;
+
+ if (tb[IFLA_ADDRESS]) {
+ if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
+ return -EINVAL;
+ if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
+ return -EADDRNOTAVAIL;
}
- if (!tdev && tunnel->parms.link)
- tdev = __dev_get_by_index(tunnel->parms.link);
+ if (!data)
+ goto out;
- if (tdev) {
- hlen = tdev->hard_header_len;
- mtu = tdev->mtu;
+ if (data[IFLA_GRE_REMOTE]) {
+ memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
+ if (!daddr)
+ return -EINVAL;
}
- dev->iflink = tunnel->parms.link;
-
- /* Precalculate GRE options length */
- if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
- if (tunnel->parms.o_flags&GRE_CSUM)
- addend += 4;
- if (tunnel->parms.o_flags&GRE_KEY)
- addend += 4;
- if (tunnel->parms.o_flags&GRE_SEQ)
- addend += 4;
+
+out:
+ return ipgre_tunnel_validate(tb, data, extack);
+}
+
+static int erspan_validate(struct nlattr *tb[], struct nlattr *data[],
+ struct netlink_ext_ack *extack)
+{
+ __be16 flags = 0;
+ int ret;
+
+ if (!data)
+ return 0;
+
+ ret = ipgre_tap_validate(tb, data, extack);
+ if (ret)
+ return ret;
+
+ if (data[IFLA_GRE_ERSPAN_VER] &&
+ nla_get_u8(data[IFLA_GRE_ERSPAN_VER]) == 0)
+ return 0;
+
+ /* ERSPAN type II/III should only have GRE sequence and key flag */
+ if (data[IFLA_GRE_OFLAGS])
+ flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
+ if (data[IFLA_GRE_IFLAGS])
+ flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
+ if (!data[IFLA_GRE_COLLECT_METADATA] &&
+ flags != (GRE_SEQ | GRE_KEY))
+ return -EINVAL;
+
+ /* ERSPAN Session ID only has 10-bit. Since we reuse
+ * 32-bit key field as ID, check it's range.
+ */
+ if (data[IFLA_GRE_IKEY] &&
+ (ntohl(nla_get_be32(data[IFLA_GRE_IKEY])) & ~ID_MASK))
+ return -EINVAL;
+
+ if (data[IFLA_GRE_OKEY] &&
+ (ntohl(nla_get_be32(data[IFLA_GRE_OKEY])) & ~ID_MASK))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int ipgre_netlink_parms(struct net_device *dev,
+ struct nlattr *data[],
+ struct nlattr *tb[],
+ struct ip_tunnel_parm_kern *parms,
+ __u32 *fwmark)
+{
+ struct ip_tunnel *t = netdev_priv(dev);
+
+ memset(parms, 0, sizeof(*parms));
+
+ parms->iph.protocol = IPPROTO_GRE;
+
+ if (!data)
+ return 0;
+
+ if (data[IFLA_GRE_LINK])
+ parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
+
+ if (data[IFLA_GRE_IFLAGS])
+ gre_flags_to_tnl_flags(parms->i_flags,
+ nla_get_be16(data[IFLA_GRE_IFLAGS]));
+
+ if (data[IFLA_GRE_OFLAGS])
+ gre_flags_to_tnl_flags(parms->o_flags,
+ nla_get_be16(data[IFLA_GRE_OFLAGS]));
+
+ if (data[IFLA_GRE_IKEY])
+ parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
+
+ if (data[IFLA_GRE_OKEY])
+ parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);
+
+ if (data[IFLA_GRE_LOCAL])
+ parms->iph.saddr = nla_get_in_addr(data[IFLA_GRE_LOCAL]);
+
+ if (data[IFLA_GRE_REMOTE])
+ parms->iph.daddr = nla_get_in_addr(data[IFLA_GRE_REMOTE]);
+
+ if (data[IFLA_GRE_TTL])
+ parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);
+
+ if (data[IFLA_GRE_TOS])
+ parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);
+
+ if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC])) {
+ if (t->ignore_df)
+ return -EINVAL;
+ parms->iph.frag_off = htons(IP_DF);
}
- dev->hard_header_len = hlen + addend;
- dev->mtu = mtu - addend;
- tunnel->hlen = addend;
+
+ if (data[IFLA_GRE_COLLECT_METADATA]) {
+ t->collect_md = true;
+ if (dev->type == ARPHRD_IPGRE)
+ dev->type = ARPHRD_NONE;
+ }
+
+ if (data[IFLA_GRE_IGNORE_DF]) {
+ if (nla_get_u8(data[IFLA_GRE_IGNORE_DF])
+ && (parms->iph.frag_off & htons(IP_DF)))
+ return -EINVAL;
+ t->ignore_df = !!nla_get_u8(data[IFLA_GRE_IGNORE_DF]);
+ }
+
+ if (data[IFLA_GRE_FWMARK])
+ *fwmark = nla_get_u32(data[IFLA_GRE_FWMARK]);
+
return 0;
}
-static int __init ipgre_fb_tunnel_init(struct net_device *dev)
+static int erspan_netlink_parms(struct net_device *dev,
+ struct nlattr *data[],
+ struct nlattr *tb[],
+ struct ip_tunnel_parm_kern *parms,
+ __u32 *fwmark)
{
- struct ip_tunnel *tunnel = netdev_priv(dev);
- struct iphdr *iph = &tunnel->parms.iph;
+ struct ip_tunnel *t = netdev_priv(dev);
+ int err;
- tunnel->dev = dev;
- strcpy(tunnel->parms.name, dev->name);
+ err = ipgre_netlink_parms(dev, data, tb, parms, fwmark);
+ if (err)
+ return err;
+ if (!data)
+ return 0;
- iph->version = 4;
- iph->protocol = IPPROTO_GRE;
- iph->ihl = 5;
- tunnel->hlen = sizeof(struct iphdr) + 4;
+ if (data[IFLA_GRE_ERSPAN_VER]) {
+ t->erspan_ver = nla_get_u8(data[IFLA_GRE_ERSPAN_VER]);
+
+ if (t->erspan_ver > 2)
+ return -EINVAL;
+ }
+
+ if (t->erspan_ver == 1) {
+ if (data[IFLA_GRE_ERSPAN_INDEX]) {
+ t->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]);
+ if (t->index & ~INDEX_MASK)
+ return -EINVAL;
+ }
+ } else if (t->erspan_ver == 2) {
+ if (data[IFLA_GRE_ERSPAN_DIR]) {
+ t->dir = nla_get_u8(data[IFLA_GRE_ERSPAN_DIR]);
+ if (t->dir & ~(DIR_MASK >> DIR_OFFSET))
+ return -EINVAL;
+ }
+ if (data[IFLA_GRE_ERSPAN_HWID]) {
+ t->hwid = nla_get_u16(data[IFLA_GRE_ERSPAN_HWID]);
+ if (t->hwid & ~(HWID_MASK >> HWID_OFFSET))
+ return -EINVAL;
+ }
+ }
- dev_hold(dev);
- tunnels_wc[0] = tunnel;
return 0;
}
+/* This function returns true when ENCAP attributes are present in the nl msg */
+static bool ipgre_netlink_encap_parms(struct nlattr *data[],
+ struct ip_tunnel_encap *ipencap)
+{
+ bool ret = false;
+
+ memset(ipencap, 0, sizeof(*ipencap));
+
+ if (!data)
+ return ret;
+
+ if (data[IFLA_GRE_ENCAP_TYPE]) {
+ ret = true;
+ ipencap->type = nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]);
+ }
+
+ if (data[IFLA_GRE_ENCAP_FLAGS]) {
+ ret = true;
+ ipencap->flags = nla_get_u16(data[IFLA_GRE_ENCAP_FLAGS]);
+ }
+
+ if (data[IFLA_GRE_ENCAP_SPORT]) {
+ ret = true;
+ ipencap->sport = nla_get_be16(data[IFLA_GRE_ENCAP_SPORT]);
+ }
+
+ if (data[IFLA_GRE_ENCAP_DPORT]) {
+ ret = true;
+ ipencap->dport = nla_get_be16(data[IFLA_GRE_ENCAP_DPORT]);
+ }
+
+ return ret;
+}
+
+static int gre_tap_init(struct net_device *dev)
+{
+ __gre_tunnel_init(dev);
+ dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
+ netif_keep_dst(dev);
-static struct net_protocol ipgre_protocol = {
- .handler = ipgre_rcv,
- .err_handler = ipgre_err,
+ return ip_tunnel_init(dev);
+}
+
+static const struct net_device_ops gre_tap_netdev_ops = {
+ .ndo_init = gre_tap_init,
+ .ndo_uninit = ip_tunnel_uninit,
+ .ndo_start_xmit = gre_tap_xmit,
+ .ndo_set_mac_address = eth_mac_addr,
+ .ndo_validate_addr = eth_validate_addr,
+ .ndo_change_mtu = ip_tunnel_change_mtu,
+ .ndo_get_stats64 = dev_get_tstats64,
+ .ndo_get_iflink = ip_tunnel_get_iflink,
+ .ndo_fill_metadata_dst = gre_fill_metadata_dst,
};
+static int erspan_tunnel_init(struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
-/*
- * And now the modules code and kernel interface.
- */
+ if (tunnel->erspan_ver == 0)
+ tunnel->tun_hlen = 4; /* 4-byte GRE hdr. */
+ else
+ tunnel->tun_hlen = 8; /* 8-byte GRE hdr. */
-static int __init ipgre_init(void)
+ tunnel->parms.iph.protocol = IPPROTO_GRE;
+ tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen +
+ erspan_hdr_len(tunnel->erspan_ver);
+
+ dev->features |= GRE_FEATURES;
+ dev->hw_features |= GRE_FEATURES;
+ dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
+ netif_keep_dst(dev);
+
+ return ip_tunnel_init(dev);
+}
+
+static const struct net_device_ops erspan_netdev_ops = {
+ .ndo_init = erspan_tunnel_init,
+ .ndo_uninit = ip_tunnel_uninit,
+ .ndo_start_xmit = erspan_xmit,
+ .ndo_set_mac_address = eth_mac_addr,
+ .ndo_validate_addr = eth_validate_addr,
+ .ndo_change_mtu = ip_tunnel_change_mtu,
+ .ndo_get_stats64 = dev_get_tstats64,
+ .ndo_get_iflink = ip_tunnel_get_iflink,
+ .ndo_fill_metadata_dst = gre_fill_metadata_dst,
+};
+
+static void ipgre_tap_setup(struct net_device *dev)
+{
+ ether_setup(dev);
+ dev->max_mtu = 0;
+ dev->netdev_ops = &gre_tap_netdev_ops;
+ dev->priv_flags &= ~IFF_TX_SKB_SHARING;
+ dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
+ ip_tunnel_setup(dev, gre_tap_net_id);
+}
+
+static int
+ipgre_newlink_encap_setup(struct net_device *dev, struct nlattr *data[])
{
+ struct ip_tunnel_encap ipencap;
+
+ if (ipgre_netlink_encap_parms(data, &ipencap)) {
+ struct ip_tunnel *t = netdev_priv(dev);
+ int err = ip_tunnel_encap_setup(t, &ipencap);
+
+ if (err < 0)
+ return err;
+ }
+
+ return 0;
+}
+
+static int ipgre_newlink(struct net_device *dev,
+ struct rtnl_newlink_params *params,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr **data = params->data;
+ struct nlattr **tb = params->tb;
+ struct ip_tunnel_parm_kern p;
+ __u32 fwmark = 0;
+ int err;
+
+ err = ipgre_newlink_encap_setup(dev, data);
+ if (err)
+ return err;
+
+ err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark);
+ if (err < 0)
+ return err;
+ return ip_tunnel_newlink(params->link_net ? : dev_net(dev), dev, tb, &p,
+ fwmark);
+}
+
+static int erspan_newlink(struct net_device *dev,
+ struct rtnl_newlink_params *params,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr **data = params->data;
+ struct nlattr **tb = params->tb;
+ struct ip_tunnel_parm_kern p;
+ __u32 fwmark = 0;
+ int err;
+
+ err = ipgre_newlink_encap_setup(dev, data);
+ if (err)
+ return err;
+
+ err = erspan_netlink_parms(dev, data, tb, &p, &fwmark);
+ if (err)
+ return err;
+ return ip_tunnel_newlink(params->link_net ? : dev_net(dev), dev, tb, &p,
+ fwmark);
+}
+
+static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
+ struct nlattr *data[],
+ struct netlink_ext_ack *extack)
+{
+ struct ip_tunnel *t = netdev_priv(dev);
+ struct ip_tunnel_parm_kern p;
+ __u32 fwmark = t->fwmark;
+ int err;
+
+ err = ipgre_newlink_encap_setup(dev, data);
+ if (err)
+ return err;
+
+ err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark);
+ if (err < 0)
+ return err;
+
+ err = ip_tunnel_changelink(dev, tb, &p, fwmark);
+ if (err < 0)
+ return err;
+
+ ip_tunnel_flags_copy(t->parms.i_flags, p.i_flags);
+ ip_tunnel_flags_copy(t->parms.o_flags, p.o_flags);
+
+ ipgre_link_update(dev, !tb[IFLA_MTU]);
+
+ return 0;
+}
+
+static int erspan_changelink(struct net_device *dev, struct nlattr *tb[],
+ struct nlattr *data[],
+ struct netlink_ext_ack *extack)
+{
+ struct ip_tunnel *t = netdev_priv(dev);
+ struct ip_tunnel_parm_kern p;
+ __u32 fwmark = t->fwmark;
int err;
- printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
+ err = ipgre_newlink_encap_setup(dev, data);
+ if (err)
+ return err;
+
+ err = erspan_netlink_parms(dev, data, tb, &p, &fwmark);
+ if (err < 0)
+ return err;
+
+ err = ip_tunnel_changelink(dev, tb, &p, fwmark);
+ if (err < 0)
+ return err;
+
+ ip_tunnel_flags_copy(t->parms.i_flags, p.i_flags);
+ ip_tunnel_flags_copy(t->parms.o_flags, p.o_flags);
+
+ return 0;
+}
+
+static size_t ipgre_get_size(const struct net_device *dev)
+{
+ return
+ /* IFLA_GRE_LINK */
+ nla_total_size(4) +
+ /* IFLA_GRE_IFLAGS */
+ nla_total_size(2) +
+ /* IFLA_GRE_OFLAGS */
+ nla_total_size(2) +
+ /* IFLA_GRE_IKEY */
+ nla_total_size(4) +
+ /* IFLA_GRE_OKEY */
+ nla_total_size(4) +
+ /* IFLA_GRE_LOCAL */
+ nla_total_size(4) +
+ /* IFLA_GRE_REMOTE */
+ nla_total_size(4) +
+ /* IFLA_GRE_TTL */
+ nla_total_size(1) +
+ /* IFLA_GRE_TOS */
+ nla_total_size(1) +
+ /* IFLA_GRE_PMTUDISC */
+ nla_total_size(1) +
+ /* IFLA_GRE_ENCAP_TYPE */
+ nla_total_size(2) +
+ /* IFLA_GRE_ENCAP_FLAGS */
+ nla_total_size(2) +
+ /* IFLA_GRE_ENCAP_SPORT */
+ nla_total_size(2) +
+ /* IFLA_GRE_ENCAP_DPORT */
+ nla_total_size(2) +
+ /* IFLA_GRE_COLLECT_METADATA */
+ nla_total_size(0) +
+ /* IFLA_GRE_IGNORE_DF */
+ nla_total_size(1) +
+ /* IFLA_GRE_FWMARK */
+ nla_total_size(4) +
+ /* IFLA_GRE_ERSPAN_INDEX */
+ nla_total_size(4) +
+ /* IFLA_GRE_ERSPAN_VER */
+ nla_total_size(1) +
+ /* IFLA_GRE_ERSPAN_DIR */
+ nla_total_size(1) +
+ /* IFLA_GRE_ERSPAN_HWID */
+ nla_total_size(2) +
+ 0;
+}
+
+static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{
+ struct ip_tunnel *t = netdev_priv(dev);
+ struct ip_tunnel_parm_kern *p = &t->parms;
+ IP_TUNNEL_DECLARE_FLAGS(o_flags);
+
+ ip_tunnel_flags_copy(o_flags, p->o_flags);
+
+ if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
+ nla_put_be16(skb, IFLA_GRE_IFLAGS,
+ gre_tnl_flags_to_gre_flags(p->i_flags)) ||
+ nla_put_be16(skb, IFLA_GRE_OFLAGS,
+ gre_tnl_flags_to_gre_flags(o_flags)) ||
+ nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) ||
+ nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) ||
+ nla_put_in_addr(skb, IFLA_GRE_LOCAL, p->iph.saddr) ||
+ nla_put_in_addr(skb, IFLA_GRE_REMOTE, p->iph.daddr) ||
+ nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) ||
+ nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) ||
+ nla_put_u8(skb, IFLA_GRE_PMTUDISC,
+ !!(p->iph.frag_off & htons(IP_DF))) ||
+ nla_put_u32(skb, IFLA_GRE_FWMARK, t->fwmark))
+ goto nla_put_failure;
+
+ if (nla_put_u16(skb, IFLA_GRE_ENCAP_TYPE,
+ t->encap.type) ||
+ nla_put_be16(skb, IFLA_GRE_ENCAP_SPORT,
+ t->encap.sport) ||
+ nla_put_be16(skb, IFLA_GRE_ENCAP_DPORT,
+ t->encap.dport) ||
+ nla_put_u16(skb, IFLA_GRE_ENCAP_FLAGS,
+ t->encap.flags))
+ goto nla_put_failure;
+
+ if (nla_put_u8(skb, IFLA_GRE_IGNORE_DF, t->ignore_df))
+ goto nla_put_failure;
+
+ if (t->collect_md) {
+ if (nla_put_flag(skb, IFLA_GRE_COLLECT_METADATA))
+ goto nla_put_failure;
+ }
+
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+static int erspan_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{
+ struct ip_tunnel *t = netdev_priv(dev);
- if (inet_add_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) {
- printk(KERN_INFO "ipgre init: can't add protocol\n");
- return -EAGAIN;
+ if (t->erspan_ver <= 2) {
+ if (t->erspan_ver != 0 && !t->collect_md)
+ __set_bit(IP_TUNNEL_KEY_BIT, t->parms.o_flags);
+
+ if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, t->erspan_ver))
+ goto nla_put_failure;
+
+ if (t->erspan_ver == 1) {
+ if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, t->index))
+ goto nla_put_failure;
+ } else if (t->erspan_ver == 2) {
+ if (nla_put_u8(skb, IFLA_GRE_ERSPAN_DIR, t->dir))
+ goto nla_put_failure;
+ if (nla_put_u16(skb, IFLA_GRE_ERSPAN_HWID, t->hwid))
+ goto nla_put_failure;
+ }
}
- ipgre_fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
- ipgre_tunnel_setup);
- if (!ipgre_fb_tunnel_dev) {
- err = -ENOMEM;
- goto err1;
+ return ipgre_fill_info(skb, dev);
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+static void erspan_setup(struct net_device *dev)
+{
+ struct ip_tunnel *t = netdev_priv(dev);
+
+ ether_setup(dev);
+ dev->max_mtu = 0;
+ dev->netdev_ops = &erspan_netdev_ops;
+ dev->priv_flags &= ~IFF_TX_SKB_SHARING;
+ dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
+ ip_tunnel_setup(dev, erspan_net_id);
+ t->erspan_ver = 1;
+}
+
+static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
+ [IFLA_GRE_LINK] = { .type = NLA_U32 },
+ [IFLA_GRE_IFLAGS] = { .type = NLA_U16 },
+ [IFLA_GRE_OFLAGS] = { .type = NLA_U16 },
+ [IFLA_GRE_IKEY] = { .type = NLA_U32 },
+ [IFLA_GRE_OKEY] = { .type = NLA_U32 },
+ [IFLA_GRE_LOCAL] = { .len = sizeof_field(struct iphdr, saddr) },
+ [IFLA_GRE_REMOTE] = { .len = sizeof_field(struct iphdr, daddr) },
+ [IFLA_GRE_TTL] = { .type = NLA_U8 },
+ [IFLA_GRE_TOS] = { .type = NLA_U8 },
+ [IFLA_GRE_PMTUDISC] = { .type = NLA_U8 },
+ [IFLA_GRE_ENCAP_TYPE] = { .type = NLA_U16 },
+ [IFLA_GRE_ENCAP_FLAGS] = { .type = NLA_U16 },
+ [IFLA_GRE_ENCAP_SPORT] = { .type = NLA_U16 },
+ [IFLA_GRE_ENCAP_DPORT] = { .type = NLA_U16 },
+ [IFLA_GRE_COLLECT_METADATA] = { .type = NLA_FLAG },
+ [IFLA_GRE_IGNORE_DF] = { .type = NLA_U8 },
+ [IFLA_GRE_FWMARK] = { .type = NLA_U32 },
+ [IFLA_GRE_ERSPAN_INDEX] = { .type = NLA_U32 },
+ [IFLA_GRE_ERSPAN_VER] = { .type = NLA_U8 },
+ [IFLA_GRE_ERSPAN_DIR] = { .type = NLA_U8 },
+ [IFLA_GRE_ERSPAN_HWID] = { .type = NLA_U16 },
+};
+
+static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
+ .kind = "gre",
+ .maxtype = IFLA_GRE_MAX,
+ .policy = ipgre_policy,
+ .priv_size = sizeof(struct ip_tunnel),
+ .setup = ipgre_tunnel_setup,
+ .validate = ipgre_tunnel_validate,
+ .newlink = ipgre_newlink,
+ .changelink = ipgre_changelink,
+ .dellink = ip_tunnel_dellink,
+ .get_size = ipgre_get_size,
+ .fill_info = ipgre_fill_info,
+ .get_link_net = ip_tunnel_get_link_net,
+};
+
+static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
+ .kind = "gretap",
+ .maxtype = IFLA_GRE_MAX,
+ .policy = ipgre_policy,
+ .priv_size = sizeof(struct ip_tunnel),
+ .setup = ipgre_tap_setup,
+ .validate = ipgre_tap_validate,
+ .newlink = ipgre_newlink,
+ .changelink = ipgre_changelink,
+ .dellink = ip_tunnel_dellink,
+ .get_size = ipgre_get_size,
+ .fill_info = ipgre_fill_info,
+ .get_link_net = ip_tunnel_get_link_net,
+};
+
+static struct rtnl_link_ops erspan_link_ops __read_mostly = {
+ .kind = "erspan",
+ .maxtype = IFLA_GRE_MAX,
+ .policy = ipgre_policy,
+ .priv_size = sizeof(struct ip_tunnel),
+ .setup = erspan_setup,
+ .validate = erspan_validate,
+ .newlink = erspan_newlink,
+ .changelink = erspan_changelink,
+ .dellink = ip_tunnel_dellink,
+ .get_size = ipgre_get_size,
+ .fill_info = erspan_fill_info,
+ .get_link_net = ip_tunnel_get_link_net,
+};
+
+struct net_device *gretap_fb_dev_create(struct net *net, const char *name,
+ u8 name_assign_type)
+{
+ struct rtnl_newlink_params params = { .src_net = net };
+ struct nlattr *tb[IFLA_MAX + 1];
+ struct net_device *dev;
+ LIST_HEAD(list_kill);
+ struct ip_tunnel *t;
+ int err;
+
+ memset(&tb, 0, sizeof(tb));
+ params.tb = tb;
+
+ dev = rtnl_create_link(net, name, name_assign_type,
+ &ipgre_tap_ops, tb, NULL);
+ if (IS_ERR(dev))
+ return dev;
+
+ /* Configure flow based GRE device. */
+ t = netdev_priv(dev);
+ t->collect_md = true;
+
+ err = ipgre_newlink(dev, &params, NULL);
+ if (err < 0) {
+ free_netdev(dev);
+ return ERR_PTR(err);
}
- ipgre_fb_tunnel_dev->init = ipgre_fb_tunnel_init;
+ /* openvswitch users expect packet sizes to be unrestricted,
+ * so set the largest MTU we can.
+ */
+ err = __ip_tunnel_change_mtu(dev, IP_MAX_MTU, false);
+ if (err)
+ goto out;
+
+ err = rtnl_configure_link(dev, NULL, 0, NULL);
+ if (err < 0)
+ goto out;
- if ((err = register_netdev(ipgre_fb_tunnel_dev)))
- goto err2;
+ return dev;
out:
- return err;
-err2:
- free_netdev(ipgre_fb_tunnel_dev);
-err1:
- inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);
- goto out;
+ ip_tunnel_dellink(dev, &list_kill);
+ unregister_netdevice_many(&list_kill);
+ return ERR_PTR(err);
}
+EXPORT_SYMBOL_GPL(gretap_fb_dev_create);
-static void __exit ipgre_destroy_tunnels(void)
+static int __net_init ipgre_tap_init_net(struct net *net)
{
- int prio;
-
- for (prio = 0; prio < 4; prio++) {
- int h;
- for (h = 0; h < HASH_SIZE; h++) {
- struct ip_tunnel *t;
- while ((t = tunnels[prio][h]) != NULL)
- unregister_netdevice(t->dev);
- }
+ return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, "gretap0");
+}
+
+static void __net_exit ipgre_tap_exit_rtnl(struct net *net,
+ struct list_head *dev_to_kill)
+{
+ ip_tunnel_delete_net(net, gre_tap_net_id, &ipgre_tap_ops, dev_to_kill);
+}
+
+static struct pernet_operations ipgre_tap_net_ops = {
+ .init = ipgre_tap_init_net,
+ .exit_rtnl = ipgre_tap_exit_rtnl,
+ .id = &gre_tap_net_id,
+ .size = sizeof(struct ip_tunnel_net),
+};
+
+static int __net_init erspan_init_net(struct net *net)
+{
+ return ip_tunnel_init_net(net, erspan_net_id,
+ &erspan_link_ops, "erspan0");
+}
+
+static void __net_exit erspan_exit_rtnl(struct net *net,
+ struct list_head *dev_to_kill)
+{
+ ip_tunnel_delete_net(net, erspan_net_id, &erspan_link_ops, dev_to_kill);
+}
+
+static struct pernet_operations erspan_net_ops = {
+ .init = erspan_init_net,
+ .exit_rtnl = erspan_exit_rtnl,
+ .id = &erspan_net_id,
+ .size = sizeof(struct ip_tunnel_net),
+};
+
+static int __init ipgre_init(void)
+{
+ int err;
+
+ pr_info("GRE over IPv4 tunneling driver\n");
+
+ err = register_pernet_device(&ipgre_net_ops);
+ if (err < 0)
+ return err;
+
+ err = register_pernet_device(&ipgre_tap_net_ops);
+ if (err < 0)
+ goto pnet_tap_failed;
+
+ err = register_pernet_device(&erspan_net_ops);
+ if (err < 0)
+ goto pnet_erspan_failed;
+
+ err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
+ if (err < 0) {
+ pr_info("%s: can't add protocol\n", __func__);
+ goto add_proto_failed;
}
+
+ err = rtnl_link_register(&ipgre_link_ops);
+ if (err < 0)
+ goto rtnl_link_failed;
+
+ err = rtnl_link_register(&ipgre_tap_ops);
+ if (err < 0)
+ goto tap_ops_failed;
+
+ err = rtnl_link_register(&erspan_link_ops);
+ if (err < 0)
+ goto erspan_link_failed;
+
+ return 0;
+
+erspan_link_failed:
+ rtnl_link_unregister(&ipgre_tap_ops);
+tap_ops_failed:
+ rtnl_link_unregister(&ipgre_link_ops);
+rtnl_link_failed:
+ gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
+add_proto_failed:
+ unregister_pernet_device(&erspan_net_ops);
+pnet_erspan_failed:
+ unregister_pernet_device(&ipgre_tap_net_ops);
+pnet_tap_failed:
+ unregister_pernet_device(&ipgre_net_ops);
+ return err;
}
static void __exit ipgre_fini(void)
{
- if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
- printk(KERN_INFO "ipgre close: can't remove protocol\n");
-
- rtnl_lock();
- ipgre_destroy_tunnels();
- rtnl_unlock();
+ rtnl_link_unregister(&ipgre_tap_ops);
+ rtnl_link_unregister(&ipgre_link_ops);
+ rtnl_link_unregister(&erspan_link_ops);
+ gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
+ unregister_pernet_device(&ipgre_tap_net_ops);
+ unregister_pernet_device(&ipgre_net_ops);
+ unregister_pernet_device(&erspan_net_ops);
}
module_init(ipgre_init);
module_exit(ipgre_fini);
+MODULE_DESCRIPTION("IPv4 GRE tunnels over IP library");
MODULE_LICENSE("GPL");
+MODULE_ALIAS_RTNL_LINK("gre");
+MODULE_ALIAS_RTNL_LINK("gretap");
+MODULE_ALIAS_RTNL_LINK("erspan");
+MODULE_ALIAS_NETDEV("gre0");
+MODULE_ALIAS_NETDEV("gretap0");
+MODULE_ALIAS_NETDEV("erspan0");
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 212734ca238f..19d3141dad1f 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -5,17 +6,14 @@
*
* The Internet Protocol (IP) module.
*
- * Version: $Id: ip_input.c,v 1.55 2002/01/12 07:39:45 davem Exp $
- *
* Authors: Ross Biro
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
* Donald Becker, <becker@super.org>
- * Alan Cox, <Alan.Cox@linux.org>
+ * Alan Cox, <alan@lxorguk.ukuu.org.uk>
* Richard Underwood
* Stefan Becker, <stefanb@yello.ping.de>
* Jorge Cwik, <jorge@laser.satlink.net>
* Arnt Gulbrandsen, <agulbra@nvg.unit.no>
- *
*
* Fixes:
* Alan Cox : Commented a couple of minor bits of surplus code
@@ -98,29 +96,24 @@
* Jos Vos : Do accounting *before* call_in_firewall
* Willy Konynenberg : Transparent proxying support
*
- *
- *
* To Fix:
* IP fragmentation wants rewriting cleanly. The RFC815 algorithm is much more efficient
* and could be made very efficient with the addition of some virtual memory hacks to permit
* the allocation of a buffer that can then be 'grown' by twiddling page tables.
- * Output fragmentation wants updating along with the buffer management to use a single
+ * Output fragmentation wants updating along with the buffer management to use a single
* interleaved copy algorithm so that fragmenting has a one copy overhead. Actual packet
* output should probably do its own fragmentation at the UDP/RAW layer. TCP shouldn't cause
* fragmentation anyway.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
-#include <asm/system.h>
+#define pr_fmt(fmt) "IPv4: " fmt
+
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
+#include <linux/slab.h>
#include <linux/net.h>
#include <linux/socket.h>
@@ -130,6 +123,7 @@
#include <linux/inetdevice.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
+#include <linux/indirect_call_wrapper.h>
#include <net/snmp.h>
#include <net/ip.h>
@@ -141,42 +135,38 @@
#include <net/icmp.h>
#include <net/raw.h>
#include <net/checksum.h>
+#include <net/inet_ecn.h>
#include <linux/netfilter_ipv4.h>
#include <net/xfrm.h>
#include <linux/mroute.h>
#include <linux/netlink.h>
+#include <net/dst_metadata.h>
+#include <net/udp.h>
+#include <net/tcp.h>
/*
- * SNMP management statistics
+ * Process Router Attention IP option (RFC 2113)
*/
-
-DEFINE_SNMP_STAT(struct ipstats_mib, ip_statistics) __read_mostly;
-
-/*
- * Process Router Attention IP option
- */
-int ip_call_ra_chain(struct sk_buff *skb)
+bool ip_call_ra_chain(struct sk_buff *skb)
{
struct ip_ra_chain *ra;
- u8 protocol = skb->nh.iph->protocol;
+ u8 protocol = ip_hdr(skb)->protocol;
struct sock *last = NULL;
+ struct net_device *dev = skb->dev;
+ struct net *net = dev_net(dev);
- read_lock(&ip_ra_lock);
- for (ra = ip_ra_chain; ra; ra = ra->next) {
+ for (ra = rcu_dereference(net->ipv4.ra_chain); ra; ra = rcu_dereference(ra->next)) {
struct sock *sk = ra->sk;
/* If socket is bound to an interface, only report
* the packet if it came from that interface.
*/
- if (sk && inet_sk(sk)->num == protocol &&
+ if (sk && inet_sk(sk)->inet_num == protocol &&
(!sk->sk_bound_dev_if ||
- sk->sk_bound_dev_if == skb->dev->ifindex)) {
- if (skb->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) {
- skb = ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN);
- if (skb == NULL) {
- read_unlock(&ip_ra_lock);
- return 1;
- }
+ sk->sk_bound_dev_if == dev->ifindex)) {
+ if (ip_is_fragment(ip_hdr(skb))) {
+ if (ip_defrag(net, skb, IP_DEFRAG_CALL_RA_CHAIN))
+ return true;
}
if (last) {
struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
@@ -189,69 +179,66 @@ int ip_call_ra_chain(struct sk_buff *skb)
if (last) {
raw_rcv(last, skb);
- read_unlock(&ip_ra_lock);
- return 1;
+ return true;
}
- read_unlock(&ip_ra_lock);
- return 0;
+ return false;
}
-static inline int ip_local_deliver_finish(struct sk_buff *skb)
+INDIRECT_CALLABLE_DECLARE(int udp_rcv(struct sk_buff *));
+INDIRECT_CALLABLE_DECLARE(int tcp_v4_rcv(struct sk_buff *));
+void ip_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int protocol)
{
- int ihl = skb->nh.iph->ihl*4;
-
- __skb_pull(skb, ihl);
-
- /* Point into the IP datagram, just past the header. */
- skb->h.raw = skb->data;
-
- rcu_read_lock();
- {
- /* Note: See raw.c and net/raw.h, RAWV4_HTABLE_SIZE==MAX_INET_PROTOS */
- int protocol = skb->nh.iph->protocol;
- int hash;
- struct sock *raw_sk;
- struct net_protocol *ipprot;
-
- resubmit:
- hash = protocol & (MAX_INET_PROTOS - 1);
- raw_sk = sk_head(&raw_v4_htable[hash]);
-
- /* If there maybe a raw socket we must check - if not we
- * don't care less
- */
- if (raw_sk && !raw_v4_input(skb, skb->nh.iph, hash))
- raw_sk = NULL;
-
- if ((ipprot = rcu_dereference(inet_protos[hash])) != NULL) {
- int ret;
-
- if (!ipprot->no_policy) {
- if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
- kfree_skb(skb);
- goto out;
- }
- nf_reset(skb);
+ const struct net_protocol *ipprot;
+ int raw, ret;
+
+resubmit:
+ raw = raw_local_deliver(skb, protocol);
+
+ ipprot = rcu_dereference(inet_protos[protocol]);
+ if (ipprot) {
+ if (!ipprot->no_policy) {
+ if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+ kfree_skb_reason(skb,
+ SKB_DROP_REASON_XFRM_POLICY);
+ return;
}
- ret = ipprot->handler(skb);
- if (ret < 0) {
- protocol = -ret;
- goto resubmit;
+ nf_reset_ct(skb);
+ }
+ ret = INDIRECT_CALL_2(ipprot->handler, tcp_v4_rcv, udp_rcv,
+ skb);
+ if (ret < 0) {
+ protocol = -ret;
+ goto resubmit;
+ }
+ __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS);
+ } else {
+ if (!raw) {
+ if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+ __IP_INC_STATS(net, IPSTATS_MIB_INUNKNOWNPROTOS);
+ icmp_send(skb, ICMP_DEST_UNREACH,
+ ICMP_PROT_UNREACH, 0);
}
- IP_INC_STATS_BH(IPSTATS_MIB_INDELIVERS);
+ kfree_skb_reason(skb, SKB_DROP_REASON_IP_NOPROTO);
} else {
- if (!raw_sk) {
- if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
- IP_INC_STATS_BH(IPSTATS_MIB_INUNKNOWNPROTOS);
- icmp_send(skb, ICMP_DEST_UNREACH,
- ICMP_PROT_UNREACH, 0);
- }
- } else
- IP_INC_STATS_BH(IPSTATS_MIB_INDELIVERS);
- kfree_skb(skb);
+ __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS);
+ consume_skb(skb);
}
}
- out:
+}
+
+static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) {
+ __IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS);
+ kfree_skb_reason(skb, SKB_DROP_REASON_NOMEM);
+ return 0;
+ }
+
+ skb_clear_delivery_time(skb);
+ __skb_pull(skb, skb_network_header_len(skb));
+
+ rcu_read_lock();
+ ip_protocol_deliver_rcu(net, skb, ip_hdr(skb)->protocol);
rcu_read_unlock();
return 0;
@@ -259,28 +246,30 @@ static inline int ip_local_deliver_finish(struct sk_buff *skb)
/*
* Deliver IP Packets to the higher protocol layers.
- */
+ */
int ip_local_deliver(struct sk_buff *skb)
{
/*
* Reassemble IP fragments.
*/
+ struct net *net = dev_net(skb->dev);
- if (skb->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) {
- skb = ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER);
- if (!skb)
+ if (ip_is_fragment(ip_hdr(skb))) {
+ if (ip_defrag(net, skb, IP_DEFRAG_LOCAL_DELIVER))
return 0;
}
- return NF_HOOK(PF_INET, NF_IP_LOCAL_IN, skb, skb->dev, NULL,
+ return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN,
+ net, NULL, skb, skb->dev, NULL,
ip_local_deliver_finish);
}
+EXPORT_SYMBOL(ip_local_deliver);
-static inline int ip_rcv_options(struct sk_buff *skb)
+static inline enum skb_drop_reason
+ip_rcv_options(struct sk_buff *skb, struct net_device *dev)
{
+ const struct iphdr *iph;
struct ip_options *opt;
- struct iphdr *iph;
- struct net_device *dev = skb->dev;
/* It looks as overkill, because not all
IP options require packet mangling.
@@ -290,111 +279,215 @@ static inline int ip_rcv_options(struct sk_buff *skb)
--ANK (980813)
*/
if (skb_cow(skb, skb_headroom(skb))) {
- IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS);
- goto drop;
+ __IP_INC_STATS(dev_net(dev), IPSTATS_MIB_INDISCARDS);
+ return SKB_DROP_REASON_NOMEM;
}
- iph = skb->nh.iph;
+ iph = ip_hdr(skb);
+ opt = &(IPCB(skb)->opt);
+ opt->optlen = iph->ihl*4 - sizeof(struct iphdr);
- if (ip_options_compile(NULL, skb)) {
- IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS);
- goto drop;
+ if (ip_options_compile(dev_net(dev), opt, skb)) {
+ __IP_INC_STATS(dev_net(dev), IPSTATS_MIB_INHDRERRORS);
+ return SKB_DROP_REASON_IP_INHDR;
}
- opt = &(IPCB(skb)->opt);
if (unlikely(opt->srr)) {
- struct in_device *in_dev = in_dev_get(dev);
+ struct in_device *in_dev = __in_dev_get_rcu(dev);
+
if (in_dev) {
if (!IN_DEV_SOURCE_ROUTE(in_dev)) {
- if (IN_DEV_LOG_MARTIANS(in_dev) &&
- net_ratelimit())
- printk(KERN_INFO "source route option "
- "%u.%u.%u.%u -> %u.%u.%u.%u\n",
- NIPQUAD(iph->saddr),
- NIPQUAD(iph->daddr));
- in_dev_put(in_dev);
- goto drop;
+ if (IN_DEV_LOG_MARTIANS(in_dev))
+ net_info_ratelimited("source route option %pI4 -> %pI4\n",
+ &iph->saddr,
+ &iph->daddr);
+ return SKB_DROP_REASON_NOT_SPECIFIED;
}
-
- in_dev_put(in_dev);
}
- if (ip_options_rcv_srr(skb))
- goto drop;
+ if (ip_options_rcv_srr(skb, dev))
+ return SKB_DROP_REASON_NOT_SPECIFIED;
}
- return 0;
-drop:
- return -1;
+ return SKB_NOT_DROPPED_YET;
+}
+
+static bool ip_can_use_hint(const struct sk_buff *skb, const struct iphdr *iph,
+ const struct sk_buff *hint)
+{
+ return hint && !skb_dst(skb) && ip_hdr(hint)->daddr == iph->daddr &&
+ ip_hdr(hint)->tos == iph->tos;
}
-static inline int ip_rcv_finish(struct sk_buff *skb)
+static int ip_rcv_finish_core(struct net *net,
+ struct sk_buff *skb, struct net_device *dev,
+ const struct sk_buff *hint)
{
- struct iphdr *iph = skb->nh.iph;
+ const struct iphdr *iph = ip_hdr(skb);
+ struct rtable *rt;
+ int drop_reason;
+
+ if (ip_can_use_hint(skb, iph, hint)) {
+ drop_reason = ip_route_use_hint(skb, iph->daddr, iph->saddr,
+ ip4h_dscp(iph), dev, hint);
+ if (unlikely(drop_reason))
+ goto drop_error;
+ }
+
+ if (READ_ONCE(net->ipv4.sysctl_ip_early_demux) &&
+ !skb_dst(skb) &&
+ !skb->sk &&
+ !ip_is_fragment(iph)) {
+ switch (iph->protocol) {
+ case IPPROTO_TCP:
+ if (READ_ONCE(net->ipv4.sysctl_tcp_early_demux)) {
+ tcp_v4_early_demux(skb);
+
+ /* must reload iph, skb->head might have changed */
+ iph = ip_hdr(skb);
+ }
+ break;
+ case IPPROTO_UDP:
+ if (READ_ONCE(net->ipv4.sysctl_udp_early_demux)) {
+ drop_reason = udp_v4_early_demux(skb);
+ if (unlikely(drop_reason))
+ goto drop_error;
+
+ /* must reload iph, skb->head might have changed */
+ iph = ip_hdr(skb);
+ }
+ break;
+ }
+ }
/*
* Initialise the virtual path cache for the packet. It describes
* how the packet travels inside Linux networking.
- */
- if (skb->dst == NULL) {
- int err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos,
- skb->dev);
- if (unlikely(err)) {
- if (err == -EHOSTUNREACH)
- IP_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
- goto drop;
- }
+ */
+ if (!skb_valid_dst(skb)) {
+ drop_reason = ip_route_input_noref(skb, iph->daddr, iph->saddr,
+ ip4h_dscp(iph), dev);
+ if (unlikely(drop_reason))
+ goto drop_error;
+ } else {
+ struct in_device *in_dev = __in_dev_get_rcu(dev);
+
+ if (in_dev && IN_DEV_ORCONF(in_dev, NOPOLICY))
+ IPCB(skb)->flags |= IPSKB_NOPOLICY;
}
-#ifdef CONFIG_NET_CLS_ROUTE
- if (unlikely(skb->dst->tclassid)) {
- struct ip_rt_acct *st = ip_rt_acct + 256*smp_processor_id();
- u32 idx = skb->dst->tclassid;
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ if (unlikely(skb_dst(skb)->tclassid)) {
+ struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct);
+ u32 idx = skb_dst(skb)->tclassid;
st[idx&0xFF].o_packets++;
- st[idx&0xFF].o_bytes+=skb->len;
+ st[idx&0xFF].o_bytes += skb->len;
st[(idx>>16)&0xFF].i_packets++;
- st[(idx>>16)&0xFF].i_bytes+=skb->len;
+ st[(idx>>16)&0xFF].i_bytes += skb->len;
}
#endif
- if (iph->ihl > 5 && ip_rcv_options(skb))
- goto drop;
+ if (iph->ihl > 5) {
+ drop_reason = ip_rcv_options(skb, dev);
+ if (drop_reason)
+ goto drop;
+ }
- return dst_input(skb);
+ rt = skb_rtable(skb);
+ if (rt->rt_type == RTN_MULTICAST) {
+ __IP_UPD_PO_STATS(net, IPSTATS_MIB_INMCAST, skb->len);
+ } else if (rt->rt_type == RTN_BROADCAST) {
+ __IP_UPD_PO_STATS(net, IPSTATS_MIB_INBCAST, skb->len);
+ } else if (skb->pkt_type == PACKET_BROADCAST ||
+ skb->pkt_type == PACKET_MULTICAST) {
+ struct in_device *in_dev = __in_dev_get_rcu(dev);
+
+ /* RFC 1122 3.3.6:
+ *
+ * When a host sends a datagram to a link-layer broadcast
+ * address, the IP destination address MUST be a legal IP
+ * broadcast or IP multicast address.
+ *
+ * A host SHOULD silently discard a datagram that is received
+ * via a link-layer broadcast (see Section 2.4) but does not
+ * specify an IP multicast or broadcast destination address.
+ *
+ * This doesn't explicitly say L2 *broadcast*, but broadcast is
+ * in a way a form of multicast and the most common use case for
+ * this is 802.11 protecting against cross-station spoofing (the
+ * so-called "hole-196" attack) so do it for both.
+ */
+ if (in_dev &&
+ IN_DEV_ORCONF(in_dev, DROP_UNICAST_IN_L2_MULTICAST)) {
+ drop_reason = SKB_DROP_REASON_UNICAST_IN_L2_MULTICAST;
+ goto drop;
+ }
+ }
+
+ return NET_RX_SUCCESS;
drop:
- kfree_skb(skb);
- return NET_RX_DROP;
+ kfree_skb_reason(skb, drop_reason);
+ return NET_RX_DROP;
+
+drop_error:
+ if (drop_reason == SKB_DROP_REASON_IP_RPFILTER)
+ __NET_INC_STATS(net, LINUX_MIB_IPRPFILTER);
+ goto drop;
+}
+
+static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ struct net_device *dev = skb->dev;
+ int ret;
+
+ /* if ingress device is enslaved to an L3 master device pass the
+ * skb to its handler for processing
+ */
+ skb = l3mdev_ip_rcv(skb);
+ if (!skb)
+ return NET_RX_SUCCESS;
+
+ ret = ip_rcv_finish_core(net, skb, dev, NULL);
+ if (ret != NET_RX_DROP)
+ ret = dst_input(skb);
+ return ret;
}
/*
* Main IP Receive routine.
- */
-int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
+ */
+static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net)
{
- struct iphdr *iph;
+ const struct iphdr *iph;
+ int drop_reason;
u32 len;
/* When the interface is in promisc. mode, drop all the crap
* that it receives, do not try to analyse it.
*/
- if (skb->pkt_type == PACKET_OTHERHOST)
+ if (skb->pkt_type == PACKET_OTHERHOST) {
+ dev_core_stats_rx_otherhost_dropped_inc(skb->dev);
+ drop_reason = SKB_DROP_REASON_OTHERHOST;
goto drop;
+ }
- IP_INC_STATS_BH(IPSTATS_MIB_INRECEIVES);
+ __IP_UPD_PO_STATS(net, IPSTATS_MIB_IN, skb->len);
- if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) {
- IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS);
+ skb = skb_share_check(skb, GFP_ATOMIC);
+ if (!skb) {
+ __IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS);
goto out;
}
+ drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
if (!pskb_may_pull(skb, sizeof(struct iphdr)))
goto inhdr_error;
- iph = skb->nh.iph;
+ iph = ip_hdr(skb);
/*
- * RFC1122: 3.1.2.2 MUST silently discard any IP frame that fails the checksum.
+ * RFC1122: 3.2.1.2 MUST silently discard any IP frame that fails the checksum.
*
* Is the datagram acceptable?
*
@@ -407,16 +500,27 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
if (iph->ihl < 5 || iph->version != 4)
goto inhdr_error;
+ BUILD_BUG_ON(IPSTATS_MIB_ECT1PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_1);
+ BUILD_BUG_ON(IPSTATS_MIB_ECT0PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_0);
+ BUILD_BUG_ON(IPSTATS_MIB_CEPKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_CE);
+ __IP_ADD_STATS(net,
+ IPSTATS_MIB_NOECTPKTS + (iph->tos & INET_ECN_MASK),
+ max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs));
+
if (!pskb_may_pull(skb, iph->ihl*4))
goto inhdr_error;
- iph = skb->nh.iph;
+ iph = ip_hdr(skb);
if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
- goto inhdr_error;
+ goto csum_error;
- len = ntohs(iph->tot_len);
- if (skb->len < len || len < (iph->ihl*4))
+ len = iph_totlen(skb, iph);
+ if (skb->len < len) {
+ drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
+ __IP_INC_STATS(net, IPSTATS_MIB_INTRUNCATEDPKTS);
+ goto drop;
+ } else if (len < (iph->ihl*4))
goto inhdr_error;
/* Our transport medium may have padded the buffer out. Now we know it
@@ -424,22 +528,152 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
* Note this now means skb->len holds ntohs(iph->tot_len).
*/
if (pskb_trim_rcsum(skb, len)) {
- IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS);
+ __IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS);
goto drop;
}
+ iph = ip_hdr(skb);
+ skb->transport_header = skb->network_header + iph->ihl*4;
+
/* Remove any debris in the socket control block */
memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
+ IPCB(skb)->iif = skb->skb_iif;
- return NF_HOOK(PF_INET, NF_IP_PRE_ROUTING, skb, dev, NULL,
- ip_rcv_finish);
+ /* Must drop socket now because of tproxy. */
+ if (!skb_sk_is_prefetched(skb))
+ skb_orphan(skb);
+
+ return skb;
+csum_error:
+ drop_reason = SKB_DROP_REASON_IP_CSUM;
+ __IP_INC_STATS(net, IPSTATS_MIB_CSUMERRORS);
inhdr_error:
- IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS);
+ if (drop_reason == SKB_DROP_REASON_NOT_SPECIFIED)
+ drop_reason = SKB_DROP_REASON_IP_INHDR;
+ __IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS);
drop:
- kfree_skb(skb);
+ kfree_skb_reason(skb, drop_reason);
out:
- return NET_RX_DROP;
+ return NULL;
}
-EXPORT_SYMBOL(ip_statistics);
+/*
+ * IP receive entry point
+ */
+int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
+ struct net_device *orig_dev)
+{
+ struct net *net = dev_net(dev);
+
+ skb = ip_rcv_core(skb, net);
+ if (skb == NULL)
+ return NET_RX_DROP;
+
+ return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING,
+ net, NULL, skb, dev, NULL,
+ ip_rcv_finish);
+}
+
+static void ip_sublist_rcv_finish(struct list_head *head)
+{
+ struct sk_buff *skb, *next;
+
+ list_for_each_entry_safe(skb, next, head, list) {
+ skb_list_del_init(skb);
+ dst_input(skb);
+ }
+}
+
+static struct sk_buff *ip_extract_route_hint(const struct net *net,
+ struct sk_buff *skb)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+
+ if (fib4_has_custom_rules(net) ||
+ ipv4_is_lbcast(iph->daddr) ||
+ ipv4_is_zeronet(iph->daddr) ||
+ IPCB(skb)->flags & IPSKB_MULTIPATH)
+ return NULL;
+
+ return skb;
+}
+
+static void ip_list_rcv_finish(struct net *net, struct list_head *head)
+{
+ struct sk_buff *skb, *next, *hint = NULL;
+ struct dst_entry *curr_dst = NULL;
+ LIST_HEAD(sublist);
+
+ list_for_each_entry_safe(skb, next, head, list) {
+ struct net_device *dev = skb->dev;
+ struct dst_entry *dst;
+
+ skb_list_del_init(skb);
+ /* if ingress device is enslaved to an L3 master device pass the
+ * skb to its handler for processing
+ */
+ skb = l3mdev_ip_rcv(skb);
+ if (!skb)
+ continue;
+ if (ip_rcv_finish_core(net, skb, dev, hint) == NET_RX_DROP)
+ continue;
+
+ dst = skb_dst(skb);
+ if (curr_dst != dst) {
+ hint = ip_extract_route_hint(net, skb);
+
+ /* dispatch old sublist */
+ if (!list_empty(&sublist))
+ ip_sublist_rcv_finish(&sublist);
+ /* start new sublist */
+ INIT_LIST_HEAD(&sublist);
+ curr_dst = dst;
+ }
+ list_add_tail(&skb->list, &sublist);
+ }
+ /* dispatch final sublist */
+ ip_sublist_rcv_finish(&sublist);
+}
+
+static void ip_sublist_rcv(struct list_head *head, struct net_device *dev,
+ struct net *net)
+{
+ NF_HOOK_LIST(NFPROTO_IPV4, NF_INET_PRE_ROUTING, net, NULL,
+ head, dev, NULL, ip_rcv_finish);
+ ip_list_rcv_finish(net, head);
+}
+
+/* Receive a list of IP packets */
+void ip_list_rcv(struct list_head *head, struct packet_type *pt,
+ struct net_device *orig_dev)
+{
+ struct net_device *curr_dev = NULL;
+ struct net *curr_net = NULL;
+ struct sk_buff *skb, *next;
+ LIST_HEAD(sublist);
+
+ list_for_each_entry_safe(skb, next, head, list) {
+ struct net_device *dev = skb->dev;
+ struct net *net = dev_net(dev);
+
+ skb_list_del_init(skb);
+ skb = ip_rcv_core(skb, net);
+ if (skb == NULL)
+ continue;
+
+ if (curr_dev != dev || curr_net != net) {
+ /* dispatch old sublist */
+ if (!list_empty(&sublist))
+ ip_sublist_rcv(&sublist, curr_dev, curr_net);
+ /* start new sublist */
+ INIT_LIST_HEAD(&sublist);
+ curr_dev = dev;
+ curr_net = net;
+ }
+ list_add_tail(&skb->list, &sublist);
+ }
+ /* dispatch final sublist */
+ if (!list_empty(&sublist))
+ ip_sublist_rcv(&sublist, curr_dev, curr_net);
+}
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index 9f02917d6f45..be8815ce3ac2 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -5,16 +6,18 @@
*
* The options processing module for ip.c
*
- * Version: $Id: ip_options.c,v 1.21 2001/09/01 00:31:50 davem Exp $
- *
* Authors: A.N.Kuznetsov
- *
+ *
*/
+#define pr_fmt(fmt) "IPv4: " fmt
+
#include <linux/capability.h>
#include <linux/module.h>
+#include <linux/slab.h>
#include <linux/types.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
+#include <linux/unaligned.h>
#include <linux/skbuff.h>
#include <linux/ip.h>
#include <linux/icmp.h>
@@ -25,8 +28,9 @@
#include <net/icmp.h>
#include <net/route.h>
#include <net/cipso_ipv4.h>
+#include <net/ip_fib.h>
-/*
+/*
* Write options to IP header, record destination address to
* source route option, address of outgoing interface
* (we should already know it, so that this function is allowed be
@@ -37,46 +41,31 @@
* saddr is address of outgoing interface.
*/
-void ip_options_build(struct sk_buff * skb, struct ip_options * opt,
- __be32 daddr, struct rtable *rt, int is_frag)
+void ip_options_build(struct sk_buff *skb, struct ip_options *opt,
+ __be32 daddr, struct rtable *rt)
{
- unsigned char * iph = skb->nh.raw;
+ unsigned char *iph = skb_network_header(skb);
memcpy(&(IPCB(skb)->opt), opt, sizeof(struct ip_options));
- memcpy(iph+sizeof(struct iphdr), opt->__data, opt->optlen);
+ memcpy(iph + sizeof(struct iphdr), opt->__data, opt->optlen);
opt = &(IPCB(skb)->opt);
- opt->is_data = 0;
if (opt->srr)
- memcpy(iph+opt->srr+iph[opt->srr+1]-4, &daddr, 4);
+ memcpy(iph + opt->srr + iph[opt->srr + 1] - 4, &daddr, 4);
- if (!is_frag) {
- if (opt->rr_needaddr)
- ip_rt_get_source(iph+opt->rr+iph[opt->rr+2]-5, rt);
- if (opt->ts_needaddr)
- ip_rt_get_source(iph+opt->ts+iph[opt->ts+2]-9, rt);
- if (opt->ts_needtime) {
- struct timeval tv;
- __be32 midtime;
- do_gettimeofday(&tv);
- midtime = htonl((tv.tv_sec % 86400) * 1000 + tv.tv_usec / 1000);
- memcpy(iph+opt->ts+iph[opt->ts+2]-5, &midtime, 4);
- }
- return;
- }
- if (opt->rr) {
- memset(iph+opt->rr, IPOPT_NOP, iph[opt->rr+1]);
- opt->rr = 0;
- opt->rr_needaddr = 0;
- }
- if (opt->ts) {
- memset(iph+opt->ts, IPOPT_NOP, iph[opt->ts+1]);
- opt->ts = 0;
- opt->ts_needaddr = opt->ts_needtime = 0;
+ if (opt->rr_needaddr)
+ ip_rt_get_source(iph + opt->rr + iph[opt->rr + 2] - 5, skb, rt);
+ if (opt->ts_needaddr)
+ ip_rt_get_source(iph + opt->ts + iph[opt->ts + 2] - 9, skb, rt);
+ if (opt->ts_needtime) {
+ __be32 midtime;
+
+ midtime = inet_current_timestamp();
+ memcpy(iph + opt->ts + iph[opt->ts + 2] - 5, &midtime, 4);
}
}
-/*
+/*
* Provided (sopt, skb) points to received options,
* build in dopt compiled option set appropriate for answering.
* i.e. invert SRR option, copy anothers,
@@ -85,33 +74,21 @@ void ip_options_build(struct sk_buff * skb, struct ip_options * opt,
* NOTE: dopt cannot point to skb.
*/
-int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb)
+int __ip_options_echo(struct net *net, struct ip_options *dopt,
+ struct sk_buff *skb, const struct ip_options *sopt)
{
- struct ip_options *sopt;
unsigned char *sptr, *dptr;
int soffset, doffset;
int optlen;
- __be32 daddr;
memset(dopt, 0, sizeof(struct ip_options));
- dopt->is_data = 1;
-
- sopt = &(IPCB(skb)->opt);
-
- if (sopt->optlen == 0) {
- dopt->optlen = 0;
+ if (sopt->optlen == 0)
return 0;
- }
- sptr = skb->nh.raw;
+ sptr = skb_network_header(skb);
dptr = dopt->__data;
- if (skb->dst)
- daddr = ((struct rtable*)skb->dst)->rt_spec_dst;
- else
- daddr = skb->nh.iph->daddr;
-
if (sopt->rr) {
optlen = sptr[sopt->rr+1];
soffset = sptr[sopt->rr+2];
@@ -147,11 +124,11 @@ int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb)
} else {
dopt->ts_needtime = 0;
- if (soffset + 8 <= optlen) {
+ if (soffset + 7 <= optlen) {
__be32 addr;
- memcpy(&addr, sptr+soffset-1, 4);
- if (inet_addr_type(addr) != RTN_LOCAL) {
+ memcpy(&addr, dptr+soffset-1, 4);
+ if (inet_addr_type(net, addr) != RTN_UNICAST) {
dopt->ts_needtime = 1;
soffset += 8;
}
@@ -164,7 +141,7 @@ int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb)
dopt->optlen += optlen;
}
if (sopt->srr) {
- unsigned char * start = sptr+sopt->srr;
+ unsigned char *start = sptr+sopt->srr;
__be32 faddr;
optlen = start[1];
@@ -175,16 +152,16 @@ int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb)
soffset -= 4;
if (soffset > 3) {
memcpy(&faddr, &start[soffset-1], 4);
- for (soffset-=4, doffset=4; soffset > 3; soffset-=4, doffset+=4)
+ for (soffset -= 4, doffset = 4; soffset > 3; soffset -= 4, doffset += 4)
memcpy(&dptr[doffset-1], &start[soffset-1], 4);
/*
* RFC1812 requires to fix illegal source routes.
*/
- if (memcmp(&skb->nh.iph->saddr, &start[soffset+3], 4) == 0)
+ if (memcmp(&ip_hdr(skb)->saddr,
+ &start[soffset + 3], 4) == 0)
doffset -= 4;
}
if (doffset > 3) {
- memcpy(&start[doffset-1], &daddr, 4);
dopt->faddr = faddr;
dptr[0] = start[0];
dptr[1] = doffset+3;
@@ -215,10 +192,10 @@ int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb)
* Simple and stupid 8), but the most efficient way.
*/
-void ip_options_fragment(struct sk_buff * skb)
+void ip_options_fragment(struct sk_buff *skb)
{
- unsigned char * optptr = skb->nh.raw + sizeof(struct iphdr);
- struct ip_options * opt = &(IPCB(skb)->opt);
+ unsigned char *optptr = skb_network_header(skb) + sizeof(struct iphdr);
+ struct ip_options *opt = &(IPCB(skb)->opt);
int l = opt->optlen;
int optlen;
@@ -232,7 +209,7 @@ void ip_options_fragment(struct sk_buff * skb)
continue;
}
optlen = optptr[1];
- if (optlen<2 || optlen>l)
+ if (optlen < 2 || optlen > l)
return;
if (!IPOPT_COPIED(*optptr))
memset(optptr, IPOPT_NOOP, optlen);
@@ -244,7 +221,15 @@ void ip_options_fragment(struct sk_buff * skb)
opt->rr_needaddr = 0;
opt->ts_needaddr = 0;
opt->ts_needtime = 0;
- return;
+}
+
+/* helper used by ip_options_compile() to call fib_compute_spec_dst()
+ * at most one time.
+ */
+static void spec_dst_fill(__be32 *spec_dst, struct sk_buff *skb)
+{
+ if (*spec_dst == htonl(INADDR_ANY))
+ *spec_dst = fib_compute_spec_dst(skb);
}
/*
@@ -253,49 +238,51 @@ void ip_options_fragment(struct sk_buff * skb)
* If opt == NULL, then skb->data should point to IP header.
*/
-int ip_options_compile(struct ip_options * opt, struct sk_buff * skb)
+int __ip_options_compile(struct net *net,
+ struct ip_options *opt, struct sk_buff *skb,
+ __be32 *info)
{
- int l;
- unsigned char * iph;
- unsigned char * optptr;
- int optlen;
- unsigned char * pp_ptr = NULL;
- struct rtable *rt = skb ? (struct rtable*)skb->dst : NULL;
-
- if (!opt) {
- opt = &(IPCB(skb)->opt);
- iph = skb->nh.raw;
- opt->optlen = ((struct iphdr *)iph)->ihl*4 - sizeof(struct iphdr);
- optptr = iph + sizeof(struct iphdr);
- opt->is_data = 0;
- } else {
- optptr = opt->is_data ? opt->__data : (unsigned char*)&(skb->nh.iph[1]);
- iph = optptr - sizeof(struct iphdr);
- }
+ __be32 spec_dst = htonl(INADDR_ANY);
+ unsigned char *pp_ptr = NULL;
+ struct rtable *rt = NULL;
+ unsigned char *optptr;
+ unsigned char *iph;
+ int optlen, l;
+
+ if (skb) {
+ rt = skb_rtable(skb);
+ optptr = (unsigned char *)&(ip_hdr(skb)[1]);
+ } else
+ optptr = opt->__data;
+ iph = optptr - sizeof(struct iphdr);
for (l = opt->optlen; l > 0; ) {
switch (*optptr) {
- case IPOPT_END:
- for (optptr++, l--; l>0; optptr++, l--) {
+ case IPOPT_END:
+ for (optptr++, l--; l > 0; optptr++, l--) {
if (*optptr != IPOPT_END) {
*optptr = IPOPT_END;
opt->is_changed = 1;
}
}
goto eol;
- case IPOPT_NOOP:
+ case IPOPT_NOOP:
l--;
optptr++;
continue;
}
+ if (unlikely(l < 2)) {
+ pp_ptr = optptr;
+ goto error;
+ }
optlen = optptr[1];
- if (optlen<2 || optlen>l) {
+ if (optlen < 2 || optlen > l) {
pp_ptr = optptr;
goto error;
}
switch (*optptr) {
- case IPOPT_SSRR:
- case IPOPT_LSRR:
+ case IPOPT_SSRR:
+ case IPOPT_LSRR:
if (optlen < 3) {
pp_ptr = optptr + 1;
goto error;
@@ -321,7 +308,7 @@ int ip_options_compile(struct ip_options * opt, struct sk_buff * skb)
opt->is_strictroute = (optptr[0] == IPOPT_SSRR);
opt->srr = optptr - iph;
break;
- case IPOPT_RR:
+ case IPOPT_RR:
if (opt->rr) {
pp_ptr = optptr;
goto error;
@@ -339,8 +326,9 @@ int ip_options_compile(struct ip_options * opt, struct sk_buff * skb)
pp_ptr = optptr + 2;
goto error;
}
- if (skb) {
- memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4);
+ if (rt) {
+ spec_dst_fill(&spec_dst, skb);
+ memcpy(&optptr[optptr[2]-1], &spec_dst, 4);
opt->is_changed = 1;
}
optptr[2] += 4;
@@ -348,7 +336,7 @@ int ip_options_compile(struct ip_options * opt, struct sk_buff * skb)
}
opt->rr = optptr - iph;
break;
- case IPOPT_TIMESTAMP:
+ case IPOPT_TIMESTAMP:
if (opt->ts) {
pp_ptr = optptr;
goto error;
@@ -362,79 +350,76 @@ int ip_options_compile(struct ip_options * opt, struct sk_buff * skb)
goto error;
}
if (optptr[2] <= optlen) {
- __be32 *timeptr = NULL;
- if (optptr[2]+3 > optptr[1]) {
+ unsigned char *timeptr = NULL;
+ if (optptr[2]+3 > optlen) {
pp_ptr = optptr + 2;
goto error;
}
switch (optptr[3]&0xF) {
- case IPOPT_TS_TSONLY:
- opt->ts = optptr - iph;
- if (skb)
- timeptr = (__be32*)&optptr[optptr[2]-1];
+ case IPOPT_TS_TSONLY:
+ if (skb)
+ timeptr = &optptr[optptr[2]-1];
opt->ts_needtime = 1;
optptr[2] += 4;
break;
- case IPOPT_TS_TSANDADDR:
- if (optptr[2]+7 > optptr[1]) {
+ case IPOPT_TS_TSANDADDR:
+ if (optptr[2]+7 > optlen) {
pp_ptr = optptr + 2;
goto error;
}
- opt->ts = optptr - iph;
- if (skb) {
- memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4);
- timeptr = (__be32*)&optptr[optptr[2]+3];
+ if (rt) {
+ spec_dst_fill(&spec_dst, skb);
+ memcpy(&optptr[optptr[2]-1], &spec_dst, 4);
+ timeptr = &optptr[optptr[2]+3];
}
opt->ts_needaddr = 1;
opt->ts_needtime = 1;
optptr[2] += 8;
break;
- case IPOPT_TS_PRESPEC:
- if (optptr[2]+7 > optptr[1]) {
+ case IPOPT_TS_PRESPEC:
+ if (optptr[2]+7 > optlen) {
pp_ptr = optptr + 2;
goto error;
}
- opt->ts = optptr - iph;
{
__be32 addr;
memcpy(&addr, &optptr[optptr[2]-1], 4);
- if (inet_addr_type(addr) == RTN_UNICAST)
+ if (inet_addr_type(net, addr) == RTN_UNICAST)
break;
if (skb)
- timeptr = (__be32*)&optptr[optptr[2]+3];
+ timeptr = &optptr[optptr[2]+3];
}
opt->ts_needtime = 1;
optptr[2] += 8;
break;
- default:
- if (!skb && !capable(CAP_NET_RAW)) {
+ default:
+ if (!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) {
pp_ptr = optptr + 3;
goto error;
}
break;
}
if (timeptr) {
- struct timeval tv;
- __be32 midtime;
- do_gettimeofday(&tv);
- midtime = htonl((tv.tv_sec % 86400) * 1000 + tv.tv_usec / 1000);
- memcpy(timeptr, &midtime, sizeof(__be32));
+ __be32 midtime;
+
+ midtime = inet_current_timestamp();
+ memcpy(timeptr, &midtime, 4);
opt->is_changed = 1;
}
- } else {
- unsigned overflow = optptr[3]>>4;
+ } else if ((optptr[3]&0xF) != IPOPT_TS_PRESPEC) {
+ unsigned int overflow = optptr[3]>>4;
if (overflow == 15) {
pp_ptr = optptr + 3;
goto error;
}
- opt->ts = optptr - iph;
if (skb) {
optptr[3] = (optptr[3]&0xF)|((overflow+1)<<4);
opt->is_changed = 1;
}
}
+ opt->ts = optptr - iph;
break;
- case IPOPT_RA:
+ case IPOPT_RA:
if (optlen < 4) {
pp_ptr = optptr + 1;
goto error;
@@ -442,21 +427,21 @@ int ip_options_compile(struct ip_options * opt, struct sk_buff * skb)
if (optptr[2] == 0 && optptr[3] == 0)
opt->router_alert = optptr - iph;
break;
- case IPOPT_CIPSO:
- if ((!skb && !capable(CAP_NET_RAW)) || opt->cipso) {
+ case IPOPT_CIPSO:
+ if ((!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) || opt->cipso) {
pp_ptr = optptr;
goto error;
}
opt->cipso = optptr - iph;
- if (cipso_v4_validate(&optptr)) {
+ if (cipso_v4_validate(skb, &optptr)) {
pp_ptr = optptr;
goto error;
}
break;
- case IPOPT_SEC:
- case IPOPT_SID:
- default:
- if (!skb && !capable(CAP_NET_RAW)) {
+ case IPOPT_SEC:
+ case IPOPT_SID:
+ default:
+ if (!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) {
pp_ptr = optptr;
goto error;
}
@@ -471,61 +456,77 @@ eol:
return 0;
error:
- if (skb) {
- icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl((pp_ptr-iph)<<24));
- }
+ if (info)
+ *info = htonl((pp_ptr-iph)<<24);
return -EINVAL;
}
+EXPORT_SYMBOL(__ip_options_compile);
+
+int ip_options_compile(struct net *net,
+ struct ip_options *opt, struct sk_buff *skb)
+{
+ int ret;
+ __be32 info;
+ ret = __ip_options_compile(net, opt, skb, &info);
+ if (ret != 0 && skb)
+ icmp_send(skb, ICMP_PARAMETERPROB, 0, info);
+ return ret;
+}
+EXPORT_SYMBOL(ip_options_compile);
/*
* Undo all the changes done by ip_options_compile().
*/
-void ip_options_undo(struct ip_options * opt)
+void ip_options_undo(struct ip_options *opt)
{
if (opt->srr) {
- unsigned char * optptr = opt->__data+opt->srr-sizeof(struct iphdr);
- memmove(optptr+7, optptr+3, optptr[1]-7);
- memcpy(optptr+3, &opt->faddr, 4);
+ unsigned char *optptr = opt->__data + opt->srr - sizeof(struct iphdr);
+
+ memmove(optptr + 7, optptr + 3, optptr[1] - 7);
+ memcpy(optptr + 3, &opt->faddr, 4);
}
if (opt->rr_needaddr) {
- unsigned char * optptr = opt->__data+opt->rr-sizeof(struct iphdr);
+ unsigned char *optptr = opt->__data + opt->rr - sizeof(struct iphdr);
+
optptr[2] -= 4;
- memset(&optptr[optptr[2]-1], 0, 4);
+ memset(&optptr[optptr[2] - 1], 0, 4);
}
if (opt->ts) {
- unsigned char * optptr = opt->__data+opt->ts-sizeof(struct iphdr);
+ unsigned char *optptr = opt->__data + opt->ts - sizeof(struct iphdr);
+
if (opt->ts_needtime) {
optptr[2] -= 4;
- memset(&optptr[optptr[2]-1], 0, 4);
- if ((optptr[3]&0xF) == IPOPT_TS_PRESPEC)
+ memset(&optptr[optptr[2] - 1], 0, 4);
+ if ((optptr[3] & 0xF) == IPOPT_TS_PRESPEC)
optptr[2] -= 4;
}
if (opt->ts_needaddr) {
optptr[2] -= 4;
- memset(&optptr[optptr[2]-1], 0, 4);
+ memset(&optptr[optptr[2] - 1], 0, 4);
}
}
}
-static struct ip_options *ip_options_get_alloc(const int optlen)
+int ip_options_get(struct net *net, struct ip_options_rcu **optp,
+ sockptr_t data, int optlen)
{
- struct ip_options *opt = kmalloc(sizeof(*opt) + ((optlen + 3) & ~3),
- GFP_KERNEL);
- if (opt)
- memset(opt, 0, sizeof(*opt));
- return opt;
-}
+ struct ip_options_rcu *opt;
+
+ opt = kzalloc(sizeof(struct ip_options_rcu) + ((optlen + 3) & ~3),
+ GFP_KERNEL);
+ if (!opt)
+ return -ENOMEM;
+ if (optlen && copy_from_sockptr(opt->opt.__data, data, optlen)) {
+ kfree(opt);
+ return -EFAULT;
+ }
-static int ip_options_get_finish(struct ip_options **optp,
- struct ip_options *opt, int optlen)
-{
while (optlen & 3)
- opt->__data[optlen++] = IPOPT_END;
- opt->optlen = optlen;
- opt->is_data = 1;
- if (optlen && ip_options_compile(opt, NULL)) {
+ opt->opt.__data[optlen++] = IPOPT_END;
+ opt->opt.optlen = optlen;
+ if (optlen && ip_options_compile(net, &opt->opt, NULL)) {
kfree(opt);
return -EINVAL;
}
@@ -534,40 +535,16 @@ static int ip_options_get_finish(struct ip_options **optp,
return 0;
}
-int ip_options_get_from_user(struct ip_options **optp, unsigned char __user *data, int optlen)
-{
- struct ip_options *opt = ip_options_get_alloc(optlen);
-
- if (!opt)
- return -ENOMEM;
- if (optlen && copy_from_user(opt->__data, data, optlen)) {
- kfree(opt);
- return -EFAULT;
- }
- return ip_options_get_finish(optp, opt, optlen);
-}
-
-int ip_options_get(struct ip_options **optp, unsigned char *data, int optlen)
-{
- struct ip_options *opt = ip_options_get_alloc(optlen);
-
- if (!opt)
- return -ENOMEM;
- if (optlen)
- memcpy(opt->__data, data, optlen);
- return ip_options_get_finish(optp, opt, optlen);
-}
-
void ip_forward_options(struct sk_buff *skb)
{
- struct ip_options * opt = &(IPCB(skb)->opt);
- unsigned char * optptr;
- struct rtable *rt = (struct rtable*)skb->dst;
- unsigned char *raw = skb->nh.raw;
+ struct ip_options *opt = &(IPCB(skb)->opt);
+ unsigned char *optptr;
+ struct rtable *rt = skb_rtable(skb);
+ unsigned char *raw = skb_network_header(skb);
if (opt->rr_needaddr) {
optptr = (unsigned char *)raw + opt->rr;
- ip_rt_get_source(&optptr[optptr[2]-5], rt);
+ ip_rt_get_source(&optptr[optptr[2]-5], skb, rt);
opt->is_changed = 1;
}
if (opt->srr_is_hit) {
@@ -575,46 +552,49 @@ void ip_forward_options(struct sk_buff *skb)
optptr = raw + opt->srr;
- for ( srrptr=optptr[2], srrspace = optptr[1];
+ for ( srrptr = optptr[2], srrspace = optptr[1];
srrptr <= srrspace;
srrptr += 4
) {
if (srrptr + 3 > srrspace)
break;
- if (memcmp(&rt->rt_dst, &optptr[srrptr-1], 4) == 0)
+ if (memcmp(&opt->nexthop, &optptr[srrptr-1], 4) == 0)
break;
}
if (srrptr + 3 <= srrspace) {
opt->is_changed = 1;
- ip_rt_get_source(&optptr[srrptr-1], rt);
- skb->nh.iph->daddr = rt->rt_dst;
+ ip_hdr(skb)->daddr = opt->nexthop;
+ ip_rt_get_source(&optptr[srrptr-1], skb, rt);
optptr[2] = srrptr+4;
- } else if (net_ratelimit())
- printk(KERN_CRIT "ip_forward(): Argh! Destination lost!\n");
+ } else {
+ net_crit_ratelimited("%s(): Argh! Destination lost!\n",
+ __func__);
+ }
if (opt->ts_needaddr) {
optptr = raw + opt->ts;
- ip_rt_get_source(&optptr[optptr[2]-9], rt);
+ ip_rt_get_source(&optptr[optptr[2]-9], skb, rt);
opt->is_changed = 1;
}
}
if (opt->is_changed) {
opt->is_changed = 0;
- ip_send_check(skb->nh.iph);
+ ip_send_check(ip_hdr(skb));
}
}
-int ip_options_rcv_srr(struct sk_buff *skb)
+int ip_options_rcv_srr(struct sk_buff *skb, struct net_device *dev)
{
struct ip_options *opt = &(IPCB(skb)->opt);
int srrspace, srrptr;
__be32 nexthop;
- struct iphdr *iph = skb->nh.iph;
- unsigned char * optptr = skb->nh.raw + opt->srr;
- struct rtable *rt = (struct rtable*)skb->dst;
+ struct iphdr *iph = ip_hdr(skb);
+ unsigned char *optptr = skb_network_header(skb) + opt->srr;
+ struct rtable *rt = skb_rtable(skb);
struct rtable *rt2;
+ unsigned long orefdst;
int err;
- if (!opt->srr)
+ if (!rt)
return 0;
if (skb->pkt_type != PACKET_HOST)
@@ -628,32 +608,34 @@ int ip_options_rcv_srr(struct sk_buff *skb)
if (rt->rt_type != RTN_LOCAL)
return -EINVAL;
- for (srrptr=optptr[2], srrspace = optptr[1]; srrptr <= srrspace; srrptr += 4) {
+ for (srrptr = optptr[2], srrspace = optptr[1]; srrptr <= srrspace; srrptr += 4) {
if (srrptr + 3 > srrspace) {
icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl((opt->srr+2)<<24));
return -EINVAL;
}
memcpy(&nexthop, &optptr[srrptr-1], 4);
- rt = (struct rtable*)skb->dst;
- skb->dst = NULL;
- err = ip_route_input(skb, nexthop, iph->saddr, iph->tos, skb->dev);
- rt2 = (struct rtable*)skb->dst;
+ orefdst = skb_dstref_steal(skb);
+ err = ip_route_input(skb, nexthop, iph->saddr, ip4h_dscp(iph),
+ dev) ? -EINVAL : 0;
+ rt2 = skb_rtable(skb);
if (err || (rt2->rt_type != RTN_UNICAST && rt2->rt_type != RTN_LOCAL)) {
- ip_rt_put(rt2);
- skb->dst = &rt->u.dst;
+ skb_dst_drop(skb);
+ skb_dstref_restore(skb, orefdst);
return -EINVAL;
}
- ip_rt_put(rt);
+ refdst_drop(orefdst);
if (rt2->rt_type != RTN_LOCAL)
break;
/* Superfast 8) loopback forward */
- memcpy(&iph->daddr, &optptr[srrptr-1], 4);
+ iph->daddr = nexthop;
opt->is_changed = 1;
}
if (srrptr <= srrspace) {
opt->srr_is_hit = 1;
+ opt->nexthop = nexthop;
opt->is_changed = 1;
}
return 0;
}
+EXPORT_SYMBOL(ip_options_rcv_srr);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 1da3d32f8289..ff11d3a85a36 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -5,8 +6,6 @@
*
* The Internet Protocol (IP) output module.
*
- * Version: $Id: ip_output.c,v 1.100 2002/02/01 22:01:03 davem Exp $
- *
* Authors: Ross Biro
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
* Donald Becker, <becker@super.org>
@@ -22,7 +21,7 @@
* Fixes:
* Alan Cox : Missing nonblock feature in ip_build_xmit.
* Mike Kilburn : htons() missing in ip_build_xmit.
- * Bradford Johnson: Fix faulty handling of some frames when
+ * Bradford Johnson: Fix faulty handling of some frames when
* no route is found.
* Alexander Demenshin: Missing sk/skb free in ip_queue_xmit
* (in case if packet not accepted by
@@ -33,9 +32,9 @@
* some redundant tests.
* Vitaly E. Lavrov : Transparent proxy revived after year coma.
* Andi Kleen : Replace ip_reply with ip_send_reply.
- * Andi Kleen : Split fast and slow ip_build_xmit path
- * for decreased register pressure on x86
- * and more readibility.
+ * Andi Kleen : Split fast and slow ip_build_xmit path
+ * for decreased register pressure on x86
+ * and more readability.
* Marc Boucher : When call_out_firewall returns FW_QUEUE,
* silently drop skb instead of failing with -EPERM.
* Detlev Wengorz : Copy protocol for fragments.
@@ -44,15 +43,15 @@
* Hirokazu Takahashi: sendfile() on UDP works now.
*/
-#include <asm/uaccess.h>
-#include <asm/system.h>
+#include <linux/uaccess.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
-#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/errno.h>
+#include <linux/highmem.h>
+#include <linux/slab.h>
#include <linux/socket.h>
#include <linux/sockios.h>
@@ -64,6 +63,7 @@
#include <linux/stat.h>
#include <linux/init.h>
+#include <net/flow.h>
#include <net/snmp.h>
#include <net/ip.h>
#include <net/protocol.h>
@@ -74,158 +74,305 @@
#include <net/arp.h>
#include <net/icmp.h>
#include <net/checksum.h>
+#include <net/gso.h>
#include <net/inetpeer.h>
-#include <net/checksum.h>
+#include <net/lwtunnel.h>
+#include <net/inet_dscp.h>
+#include <linux/bpf-cgroup.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_bridge.h>
-#include <linux/mroute.h>
#include <linux/netlink.h>
#include <linux/tcp.h>
+#include <net/psp.h>
-int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
+static int
+ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
+ unsigned int mtu,
+ int (*output)(struct net *, struct sock *, struct sk_buff *));
/* Generate a checksum for an outgoing IP datagram. */
-__inline__ void ip_send_check(struct iphdr *iph)
+void ip_send_check(struct iphdr *iph)
{
iph->check = 0;
iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}
+EXPORT_SYMBOL(ip_send_check);
-/* dev_loopback_xmit for use with netfilter. */
-static int ip_dev_loopback_xmit(struct sk_buff *newskb)
+int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- newskb->mac.raw = newskb->data;
- __skb_pull(newskb, newskb->nh.raw - newskb->data);
- newskb->pkt_type = PACKET_LOOPBACK;
- newskb->ip_summed = CHECKSUM_UNNECESSARY;
- BUG_TRAP(newskb->dst);
- netif_rx(newskb);
- return 0;
+ struct iphdr *iph = ip_hdr(skb);
+
+ IP_INC_STATS(net, IPSTATS_MIB_OUTREQUESTS);
+
+ iph_set_totlen(iph, skb->len);
+ ip_send_check(iph);
+
+ /* if egress device is enslaved to an L3 master device pass the
+ * skb to its handler for processing
+ */
+ skb = l3mdev_ip_out(sk, skb);
+ if (unlikely(!skb))
+ return 0;
+
+ skb->protocol = htons(ETH_P_IP);
+
+ return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT,
+ net, sk, skb, NULL, skb_dst_dev(skb),
+ dst_output);
}
-static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
+int ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- int ttl = inet->uc_ttl;
+ int err;
+
+ err = __ip_local_out(net, sk, skb);
+ if (likely(err == 1))
+ err = dst_output(net, sk, skb);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(ip_local_out);
+
+static inline int ip_select_ttl(const struct inet_sock *inet,
+ const struct dst_entry *dst)
+{
+ int ttl = READ_ONCE(inet->uc_ttl);
if (ttl < 0)
- ttl = dst_metric(dst, RTAX_HOPLIMIT);
+ ttl = ip4_dst_hoplimit(dst);
return ttl;
}
-/*
+/*
* Add an ip header to a skbuff and send it out.
*
*/
-int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
- __be32 saddr, __be32 daddr, struct ip_options *opt)
+int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk,
+ __be32 saddr, __be32 daddr, struct ip_options_rcu *opt,
+ u8 tos)
{
- struct inet_sock *inet = inet_sk(sk);
- struct rtable *rt = (struct rtable *)skb->dst;
+ const struct inet_sock *inet = inet_sk(sk);
+ struct rtable *rt = skb_rtable(skb);
+ struct net *net = sock_net(sk);
struct iphdr *iph;
/* Build the IP header. */
- if (opt)
- iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr) + opt->optlen);
- else
- iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr));
-
+ skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));
+ skb_reset_network_header(skb);
+ iph = ip_hdr(skb);
iph->version = 4;
iph->ihl = 5;
- iph->tos = inet->tos;
- if (ip_dont_fragment(sk, &rt->u.dst))
+ iph->tos = tos;
+ iph->ttl = ip_select_ttl(inet, &rt->dst);
+ iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
+ iph->saddr = saddr;
+ iph->protocol = sk->sk_protocol;
+ /* Do not bother generating IPID for small packets (eg SYNACK) */
+ if (skb->len <= IPV4_MIN_MTU || ip_dont_fragment(sk, &rt->dst)) {
iph->frag_off = htons(IP_DF);
- else
+ iph->id = 0;
+ } else {
iph->frag_off = 0;
- iph->ttl = ip_select_ttl(inet, &rt->u.dst);
- iph->daddr = rt->rt_dst;
- iph->saddr = rt->rt_src;
- iph->protocol = sk->sk_protocol;
- iph->tot_len = htons(skb->len);
- ip_select_ident(iph, &rt->u.dst, sk);
- skb->nh.iph = iph;
+ /* TCP packets here are SYNACK with fat IPv4/TCP options.
+ * Avoid using the hashed IP ident generator.
+ */
+ if (sk->sk_protocol == IPPROTO_TCP)
+ iph->id = (__force __be16)get_random_u16();
+ else
+ __ip_select_ident(net, iph, 1);
+ }
- if (opt && opt->optlen) {
- iph->ihl += opt->optlen>>2;
- ip_options_build(skb, opt, daddr, rt, 0);
+ if (opt && opt->opt.optlen) {
+ iph->ihl += opt->opt.optlen>>2;
+ ip_options_build(skb, &opt->opt, daddr, rt);
}
- ip_send_check(iph);
- skb->priority = sk->sk_priority;
+ skb->priority = READ_ONCE(sk->sk_priority);
+ if (!skb->mark)
+ skb->mark = READ_ONCE(sk->sk_mark);
/* Send it out. */
- return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
- dst_output);
+ return ip_local_out(net, skb->sk, skb);
}
-
EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
-static inline int ip_finish_output2(struct sk_buff *skb)
+static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- struct dst_entry *dst = skb->dst;
- struct hh_cache *hh = dst->hh;
- struct net_device *dev = dst->dev;
- int hh_len = LL_RESERVED_SPACE(dev);
+ struct dst_entry *dst = skb_dst(skb);
+ struct rtable *rt = dst_rtable(dst);
+ struct net_device *dev = dst_dev(dst);
+ unsigned int hh_len = LL_RESERVED_SPACE(dev);
+ struct neighbour *neigh;
+ bool is_v6gw = false;
+
+ if (rt->rt_type == RTN_MULTICAST) {
+ IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTMCAST, skb->len);
+ } else if (rt->rt_type == RTN_BROADCAST)
+ IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTBCAST, skb->len);
+
+ /* OUTOCTETS should be counted after fragment */
+ IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len);
+
+ if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
+ skb = skb_expand_head(skb, hh_len);
+ if (!skb)
+ return -ENOMEM;
+ }
- /* Be paranoid, rather than too clever. */
- if (unlikely(skb_headroom(skb) < hh_len && dev->hard_header)) {
- struct sk_buff *skb2;
+ if (lwtunnel_xmit_redirect(dst->lwtstate)) {
+ int res = lwtunnel_xmit(skb);
- skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
- if (skb2 == NULL) {
- kfree_skb(skb);
- return -ENOMEM;
- }
- if (skb->sk)
- skb_set_owner_w(skb2, skb->sk);
+ if (res != LWTUNNEL_XMIT_CONTINUE)
+ return res;
+ }
+
+ rcu_read_lock();
+ neigh = ip_neigh_for_gw(rt, skb, &is_v6gw);
+ if (!IS_ERR(neigh)) {
+ int res;
+
+ sock_confirm_neigh(skb, neigh);
+ /* if crossing protocols, can not use the cached header */
+ res = neigh_output(neigh, skb, is_v6gw);
+ rcu_read_unlock();
+ return res;
+ }
+ rcu_read_unlock();
+
+ net_dbg_ratelimited("%s: No header cache and no neighbour!\n",
+ __func__);
+ kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
+ return PTR_ERR(neigh);
+}
+
+static int ip_finish_output_gso(struct net *net, struct sock *sk,
+ struct sk_buff *skb, unsigned int mtu)
+{
+ struct sk_buff *segs, *nskb;
+ netdev_features_t features;
+ int ret = 0;
+
+ /* common case: seglen is <= mtu
+ */
+ if (skb_gso_validate_network_len(skb, mtu))
+ return ip_finish_output2(net, sk, skb);
+
+ /* Slowpath - GSO segment length exceeds the egress MTU.
+ *
+ * This can happen in several cases:
+ * - Forwarding of a TCP GRO skb, when DF flag is not set.
+ * - Forwarding of an skb that arrived on a virtualization interface
+ * (virtio-net/vhost/tap) with TSO/GSO size set by other network
+ * stack.
+ * - Local GSO skb transmitted on an NETIF_F_TSO tunnel stacked over an
+ * interface with a smaller MTU.
+ * - Arriving GRO skb (or GSO skb in a virtualized environment) that is
+ * bridged to a NETIF_F_TSO tunnel stacked over an interface with an
+ * insufficient MTU.
+ */
+ features = netif_skb_features(skb);
+ BUILD_BUG_ON(sizeof(*IPCB(skb)) > SKB_GSO_CB_OFFSET);
+ segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
+ if (IS_ERR_OR_NULL(segs)) {
kfree_skb(skb);
- skb = skb2;
+ return -ENOMEM;
}
- if (hh) {
- int hh_alen;
+ consume_skb(skb);
- read_lock_bh(&hh->hh_lock);
- hh_alen = HH_DATA_ALIGN(hh->hh_len);
- memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
- read_unlock_bh(&hh->hh_lock);
- skb_push(skb, hh->hh_len);
- return hh->hh_output(skb);
- } else if (dst->neighbour)
- return dst->neighbour->output(skb);
+ skb_list_walk_safe(segs, segs, nskb) {
+ int err;
- if (net_ratelimit())
- printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
- kfree_skb(skb);
- return -EINVAL;
+ skb_mark_not_on_list(segs);
+ err = ip_fragment(net, sk, segs, mtu, ip_finish_output2);
+
+ if (err && ret == 0)
+ ret = err;
+ }
+
+ return ret;
}
-static inline int ip_finish_output(struct sk_buff *skb)
+static int __ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
+ unsigned int mtu;
+
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
/* Policy lookup after SNAT yielded a new policy */
- if (skb->dst->xfrm != NULL) {
+ if (skb_dst(skb)->xfrm) {
IPCB(skb)->flags |= IPSKB_REROUTED;
- return dst_output(skb);
+ return dst_output(net, sk, skb);
}
#endif
- if (skb->len > dst_mtu(skb->dst) && !skb_is_gso(skb))
- return ip_fragment(skb, ip_finish_output2);
- else
- return ip_finish_output2(skb);
+ mtu = ip_skb_dst_mtu(sk, skb);
+ if (skb_is_gso(skb))
+ return ip_finish_output_gso(net, sk, skb, mtu);
+
+ if (skb->len > mtu || IPCB(skb)->frag_max_size)
+ return ip_fragment(net, sk, skb, mtu, ip_finish_output2);
+
+ return ip_finish_output2(net, sk, skb);
+}
+
+static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ int ret;
+
+ ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
+ switch (ret) {
+ case NET_XMIT_SUCCESS:
+ return __ip_finish_output(net, sk, skb);
+ case NET_XMIT_CN:
+ return __ip_finish_output(net, sk, skb) ? : ret;
+ default:
+ kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
+ return ret;
+ }
}
-int ip_mc_output(struct sk_buff *skb)
+static int ip_mc_finish_output(struct net *net, struct sock *sk,
+ struct sk_buff *skb)
{
- struct sock *sk = skb->sk;
- struct rtable *rt = (struct rtable*)skb->dst;
- struct net_device *dev = rt->u.dst.dev;
+ struct rtable *new_rt;
+ bool do_cn = false;
+ int ret, err;
+
+ ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
+ switch (ret) {
+ case NET_XMIT_CN:
+ do_cn = true;
+ fallthrough;
+ case NET_XMIT_SUCCESS:
+ break;
+ default:
+ kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
+ return ret;
+ }
+
+ /* Reset rt_iif so that inet_iif() will return skb->skb_iif. Setting
+ * this to non-zero causes ipi_ifindex in in_pktinfo to be overwritten,
+ * see ipv4_pktinfo_prepare().
+ */
+ new_rt = rt_dst_clone(net->loopback_dev, skb_rtable(skb));
+ if (new_rt) {
+ new_rt->rt_iif = 0;
+ skb_dst_drop(skb);
+ skb_dst_set(skb, &new_rt->dst);
+ }
+
+ err = dev_loopback_xmit(net, sk, skb);
+ return (do_cn && err) ? ret : err;
+}
+
+int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ struct rtable *rt = skb_rtable(skb);
+ struct net_device *dev = rt->dst.dev;
/*
* If the indicated interface is up and running, send the packet.
*/
- IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
-
skb->dev = dev;
skb->protocol = htons(ETH_P_IP);
@@ -234,7 +381,7 @@ int ip_mc_output(struct sk_buff *skb)
*/
if (rt->rt_flags&RTCF_MULTICAST) {
- if ((!sk || inet_sk(sk)->mc_loop)
+ if (sk_mc_loop(sk)
#ifdef CONFIG_IP_MROUTE
/* Small optimization: do not loopback not local frames,
which returned after forwarding; they will be dropped
@@ -244,19 +391,21 @@ int ip_mc_output(struct sk_buff *skb)
This check is duplicated in ip_mr_input at the moment.
*/
- && ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED))
+ &&
+ ((rt->rt_flags & RTCF_LOCAL) ||
+ !(IPCB(skb)->flags & IPSKB_FORWARDED))
#endif
- ) {
+ ) {
struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
if (newskb)
- NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
- newskb->dev,
- ip_dev_loopback_xmit);
+ NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
+ net, sk, newskb, NULL, newskb->dev,
+ ip_mc_finish_output);
}
/* Multicasts with ttl 0 must not go beyond the host */
- if (skb->nh.iph->ttl == 0) {
+ if (ip_hdr(skb)->ttl == 0) {
kfree_skb(skb);
return 0;
}
@@ -265,152 +414,343 @@ int ip_mc_output(struct sk_buff *skb)
if (rt->rt_flags&RTCF_BROADCAST) {
struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
if (newskb)
- NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
- newskb->dev, ip_dev_loopback_xmit);
+ NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
+ net, sk, newskb, NULL, newskb->dev,
+ ip_mc_finish_output);
}
- return NF_HOOK_COND(PF_INET, NF_IP_POST_ROUTING, skb, NULL, skb->dev,
+ return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
+ net, sk, skb, NULL, skb->dev,
ip_finish_output,
!(IPCB(skb)->flags & IPSKB_REROUTED));
}
-int ip_output(struct sk_buff *skb)
+int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- struct net_device *dev = skb->dst->dev;
-
- IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
+ struct net_device *dev, *indev = skb->dev;
+ int ret_val;
+ rcu_read_lock();
+ dev = skb_dst_dev_rcu(skb);
skb->dev = dev;
skb->protocol = htons(ETH_P_IP);
- return NF_HOOK_COND(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev,
- ip_finish_output,
- !(IPCB(skb)->flags & IPSKB_REROUTED));
+ ret_val = NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
+ net, sk, skb, indev, dev,
+ ip_finish_output,
+ !(IPCB(skb)->flags & IPSKB_REROUTED));
+ rcu_read_unlock();
+ return ret_val;
+}
+EXPORT_SYMBOL(ip_output);
+
+/*
+ * copy saddr and daddr, possibly using 64bit load/stores
+ * Equivalent to :
+ * iph->saddr = fl4->saddr;
+ * iph->daddr = fl4->daddr;
+ */
+static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4)
+{
+ BUILD_BUG_ON(offsetof(typeof(*fl4), daddr) !=
+ offsetof(typeof(*fl4), saddr) + sizeof(fl4->saddr));
+
+ iph->saddr = fl4->saddr;
+ iph->daddr = fl4->daddr;
}
-int ip_queue_xmit(struct sk_buff *skb, struct sock *sk, int ipfragok)
+/* Note: skb->sk can be different from sk, in case of tunnels */
+int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
+ __u8 tos)
{
struct inet_sock *inet = inet_sk(sk);
- struct ip_options *opt = inet->opt;
+ struct net *net = sock_net(sk);
+ struct ip_options_rcu *inet_opt;
+ struct flowi4 *fl4;
struct rtable *rt;
struct iphdr *iph;
+ int res;
/* Skip all of this if the packet is already routed,
* f.e. by something like SCTP.
*/
- rt = (struct rtable *) skb->dst;
- if (rt != NULL)
+ rcu_read_lock();
+ inet_opt = rcu_dereference(inet->inet_opt);
+ fl4 = &fl->u.ip4;
+ rt = skb_rtable(skb);
+ if (rt)
goto packet_routed;
/* Make sure we can route this packet. */
- rt = (struct rtable *)__sk_dst_check(sk, 0);
- if (rt == NULL) {
- __be32 daddr;
-
- /* Use correct destination address if we have options. */
- daddr = inet->daddr;
- if(opt && opt->srr)
- daddr = opt->faddr;
-
- {
- struct flowi fl = { .oif = sk->sk_bound_dev_if,
- .nl_u = { .ip4_u =
- { .daddr = daddr,
- .saddr = inet->saddr,
- .tos = RT_CONN_FLAGS(sk) } },
- .proto = sk->sk_protocol,
- .uli_u = { .ports =
- { .sport = inet->sport,
- .dport = inet->dport } } };
-
- /* If this fails, retransmit mechanism of transport layer will
- * keep trying until route appears or the connection times
- * itself out.
- */
- security_sk_classify_flow(sk, &fl);
- if (ip_route_output_flow(&rt, &fl, sk, 0))
- goto no_route;
- }
- sk_setup_caps(sk, &rt->u.dst);
+ rt = dst_rtable(__sk_dst_check(sk, 0));
+ if (!rt) {
+ inet_sk_init_flowi4(inet, fl4);
+
+ /* sctp_v4_xmit() uses its own DSCP value */
+ fl4->flowi4_dscp = inet_dsfield_to_dscp(tos);
+
+ /* If this fails, retransmit mechanism of transport layer will
+ * keep trying until route appears or the connection times
+ * itself out.
+ */
+ rt = ip_route_output_flow(net, fl4, sk);
+ if (IS_ERR(rt))
+ goto no_route;
+ sk_setup_caps(sk, &rt->dst);
}
- skb->dst = dst_clone(&rt->u.dst);
+ skb_dst_set_noref(skb, &rt->dst);
packet_routed:
- if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
+ if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_uses_gateway)
goto no_route;
/* OK, we know where to send it, allocate and build IP header. */
- iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
- *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
- iph->tot_len = htons(skb->len);
- if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
+ skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
+ skb_reset_network_header(skb);
+ iph = ip_hdr(skb);
+ *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (tos & 0xff));
+ if (ip_dont_fragment(sk, &rt->dst) && !skb->ignore_df)
iph->frag_off = htons(IP_DF);
else
iph->frag_off = 0;
- iph->ttl = ip_select_ttl(inet, &rt->u.dst);
+ iph->ttl = ip_select_ttl(inet, &rt->dst);
iph->protocol = sk->sk_protocol;
- iph->saddr = rt->rt_src;
- iph->daddr = rt->rt_dst;
- skb->nh.iph = iph;
+ ip_copy_addrs(iph, fl4);
+
/* Transport layer set skb->h.foo itself. */
- if (opt && opt->optlen) {
- iph->ihl += opt->optlen >> 2;
- ip_options_build(skb, opt, inet->daddr, rt, 0);
+ if (inet_opt && inet_opt->opt.optlen) {
+ iph->ihl += inet_opt->opt.optlen >> 2;
+ ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt);
}
- ip_select_ident_more(iph, &rt->u.dst, sk,
- (skb_shinfo(skb)->gso_segs ?: 1) - 1);
-
- /* Add an IP checksum. */
- ip_send_check(iph);
+ ip_select_ident_segs(net, skb, sk,
+ skb_shinfo(skb)->gso_segs ?: 1);
- skb->priority = sk->sk_priority;
+ /* TODO : should we use skb->sk here instead of sk ? */
+ skb->priority = READ_ONCE(sk->sk_priority);
+ skb->mark = READ_ONCE(sk->sk_mark);
- return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
- dst_output);
+ res = ip_local_out(net, sk, skb);
+ rcu_read_unlock();
+ return res;
no_route:
- IP_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
- kfree_skb(skb);
+ rcu_read_unlock();
+ IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
+ kfree_skb_reason(skb, SKB_DROP_REASON_IP_OUTNOROUTES);
return -EHOSTUNREACH;
}
+EXPORT_SYMBOL(__ip_queue_xmit);
+int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
+{
+ return __ip_queue_xmit(sk, skb, fl, READ_ONCE(inet_sk(sk)->tos));
+}
+EXPORT_SYMBOL(ip_queue_xmit);
static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
to->pkt_type = from->pkt_type;
to->priority = from->priority;
to->protocol = from->protocol;
- dst_release(to->dst);
- to->dst = dst_clone(from->dst);
+ to->skb_iif = from->skb_iif;
+ skb_dst_drop(to);
+ skb_dst_copy(to, from);
to->dev = from->dev;
to->mark = from->mark;
- /* Copy the flags to each fragment. */
- IPCB(to)->flags = IPCB(from)->flags;
+ skb_copy_hash(to, from);
#ifdef CONFIG_NET_SCHED
to->tc_index = from->tc_index;
#endif
-#ifdef CONFIG_NETFILTER
- /* Connection association is same as pre-frag packet */
- nf_conntrack_put(to->nfct);
- to->nfct = from->nfct;
- nf_conntrack_get(to->nfct);
- to->nfctinfo = from->nfctinfo;
-#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
+ nf_copy(to, from);
+ skb_ext_copy(to, from);
+#if IS_ENABLED(CONFIG_IP_VS)
to->ipvs_property = from->ipvs_property;
#endif
-#ifdef CONFIG_BRIDGE_NETFILTER
- nf_bridge_put(to->nf_bridge);
- to->nf_bridge = from->nf_bridge;
- nf_bridge_get(to->nf_bridge);
-#endif
-#endif
skb_copy_secmark(to, from);
}
+static int ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
+ unsigned int mtu,
+ int (*output)(struct net *, struct sock *, struct sk_buff *))
+{
+ struct iphdr *iph = ip_hdr(skb);
+
+ if ((iph->frag_off & htons(IP_DF)) == 0)
+ return ip_do_fragment(net, sk, skb, output);
+
+ if (unlikely(!skb->ignore_df ||
+ (IPCB(skb)->frag_max_size &&
+ IPCB(skb)->frag_max_size > mtu))) {
+ IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
+ htonl(mtu));
+ kfree_skb(skb);
+ return -EMSGSIZE;
+ }
+
+ return ip_do_fragment(net, sk, skb, output);
+}
+
+void ip_fraglist_init(struct sk_buff *skb, struct iphdr *iph,
+ unsigned int hlen, struct ip_fraglist_iter *iter)
+{
+ unsigned int first_len = skb_pagelen(skb);
+
+ iter->frag = skb_shinfo(skb)->frag_list;
+ skb_frag_list_init(skb);
+
+ iter->offset = 0;
+ iter->iph = iph;
+ iter->hlen = hlen;
+
+ skb->data_len = first_len - skb_headlen(skb);
+ skb->len = first_len;
+ iph->tot_len = htons(first_len);
+ iph->frag_off = htons(IP_MF);
+ ip_send_check(iph);
+}
+EXPORT_SYMBOL(ip_fraglist_init);
+
+void ip_fraglist_prepare(struct sk_buff *skb, struct ip_fraglist_iter *iter)
+{
+ unsigned int hlen = iter->hlen;
+ struct iphdr *iph = iter->iph;
+ struct sk_buff *frag;
+
+ frag = iter->frag;
+ frag->ip_summed = CHECKSUM_NONE;
+ skb_reset_transport_header(frag);
+ __skb_push(frag, hlen);
+ skb_reset_network_header(frag);
+ memcpy(skb_network_header(frag), iph, hlen);
+ iter->iph = ip_hdr(frag);
+ iph = iter->iph;
+ iph->tot_len = htons(frag->len);
+ ip_copy_metadata(frag, skb);
+ iter->offset += skb->len - hlen;
+ iph->frag_off = htons(iter->offset >> 3);
+ if (frag->next)
+ iph->frag_off |= htons(IP_MF);
+ /* Ready, complete checksum */
+ ip_send_check(iph);
+}
+EXPORT_SYMBOL(ip_fraglist_prepare);
+
+void ip_frag_init(struct sk_buff *skb, unsigned int hlen,
+ unsigned int ll_rs, unsigned int mtu, bool DF,
+ struct ip_frag_state *state)
+{
+ struct iphdr *iph = ip_hdr(skb);
+
+ state->DF = DF;
+ state->hlen = hlen;
+ state->ll_rs = ll_rs;
+ state->mtu = mtu;
+
+ state->left = skb->len - hlen; /* Space per frame */
+ state->ptr = hlen; /* Where to start from */
+
+ state->offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
+ state->not_last_frag = iph->frag_off & htons(IP_MF);
+}
+EXPORT_SYMBOL(ip_frag_init);
+
+static void ip_frag_ipcb(struct sk_buff *from, struct sk_buff *to,
+ bool first_frag)
+{
+ /* Copy the flags to each fragment. */
+ IPCB(to)->flags = IPCB(from)->flags;
+
+ /* ANK: dirty, but effective trick. Upgrade options only if
+ * the segment to be fragmented was THE FIRST (otherwise,
+ * options are already fixed) and make it ONCE
+ * on the initial skb, so that all the following fragments
+ * will inherit fixed options.
+ */
+ if (first_frag)
+ ip_options_fragment(from);
+}
+
+struct sk_buff *ip_frag_next(struct sk_buff *skb, struct ip_frag_state *state)
+{
+ unsigned int len = state->left;
+ struct sk_buff *skb2;
+ struct iphdr *iph;
+
+ /* IF: it doesn't fit, use 'mtu' - the data space left */
+ if (len > state->mtu)
+ len = state->mtu;
+ /* IF: we are not sending up to and including the packet end
+ then align the next start on an eight byte boundary */
+ if (len < state->left) {
+ len &= ~7;
+ }
+
+ /* Allocate buffer */
+ skb2 = alloc_skb(len + state->hlen + state->ll_rs, GFP_ATOMIC);
+ if (!skb2)
+ return ERR_PTR(-ENOMEM);
+
+ /*
+ * Set up data on packet
+ */
+
+ ip_copy_metadata(skb2, skb);
+ skb_reserve(skb2, state->ll_rs);
+ skb_put(skb2, len + state->hlen);
+ skb_reset_network_header(skb2);
+ skb2->transport_header = skb2->network_header + state->hlen;
+
+ /*
+ * Charge the memory for the fragment to any owner
+ * it might possess
+ */
+
+ if (skb->sk)
+ skb_set_owner_w(skb2, skb->sk);
+
+ /*
+ * Copy the packet header into the new buffer.
+ */
+
+ skb_copy_from_linear_data(skb, skb_network_header(skb2), state->hlen);
+
+ /*
+ * Copy a block of the IP datagram.
+ */
+ if (skb_copy_bits(skb, state->ptr, skb_transport_header(skb2), len))
+ BUG();
+ state->left -= len;
+
+ /*
+ * Fill in the new header fields.
+ */
+ iph = ip_hdr(skb2);
+ iph->frag_off = htons((state->offset >> 3));
+ if (state->DF)
+ iph->frag_off |= htons(IP_DF);
+
+ /*
+ * Added AC : If we are fragmenting a fragment that's not the
+ * last fragment then keep MF on each bit
+ */
+ if (state->left > 0 || state->not_last_frag)
+ iph->frag_off |= htons(IP_MF);
+ state->ptr += len;
+ state->offset += len;
+
+ iph->tot_len = htons(len + state->hlen);
+
+ ip_send_check(iph);
+
+ return skb2;
+}
+EXPORT_SYMBOL(ip_frag_next);
+
/*
* This IP datagram is too large to be sent in one piece. Break it up into
* smaller pieces (each of size equal to IP header plus
@@ -418,42 +758,42 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
* single device frame, and queue such a frame for sending.
*/
-int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
+int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
+ int (*output)(struct net *, struct sock *, struct sk_buff *))
{
struct iphdr *iph;
- int raw = 0;
- int ptr;
- struct net_device *dev;
struct sk_buff *skb2;
- unsigned int mtu, hlen, left, len, ll_rs, pad;
- int offset;
- __be16 not_last_frag;
- struct rtable *rt = (struct rtable*)skb->dst;
+ u8 tstamp_type = skb->tstamp_type;
+ struct rtable *rt = skb_rtable(skb);
+ unsigned int mtu, hlen, ll_rs;
+ struct ip_fraglist_iter iter;
+ ktime_t tstamp = skb->tstamp;
+ struct ip_frag_state state;
int err = 0;
- dev = rt->u.dst.dev;
+ /* for offloaded checksums cleanup checksum before fragmentation */
+ if (skb->ip_summed == CHECKSUM_PARTIAL &&
+ (err = skb_checksum_help(skb)))
+ goto fail;
/*
* Point into the IP datagram header.
*/
- iph = skb->nh.iph;
+ iph = ip_hdr(skb);
- if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
- IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
- htonl(dst_mtu(&rt->u.dst)));
- kfree_skb(skb);
- return -EMSGSIZE;
- }
+ mtu = ip_skb_dst_mtu(sk, skb);
+ if (IPCB(skb)->frag_max_size && IPCB(skb)->frag_max_size < mtu)
+ mtu = IPCB(skb)->frag_max_size;
/*
* Setup starting values.
*/
hlen = iph->ihl * 4;
- mtu = dst_mtu(&rt->u.dst) - hlen; /* Size of data space */
+ mtu = mtu - hlen; /* Size of data space */
IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
+ ll_rs = LL_RESERVED_SPACE(rt->dst.dev);
/* When frag_list is given, use it. First, check its validity:
* some transformers could create wrong frag_list or break existing
@@ -462,375 +802,194 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
* LATER: this step can be merged to real generation of fragments,
* we can switch to copy when see the first bad fragment.
*/
- if (skb_shinfo(skb)->frag_list) {
- struct sk_buff *frag;
- int first_len = skb_pagelen(skb);
+ if (skb_has_frag_list(skb)) {
+ struct sk_buff *frag, *frag2;
+ unsigned int first_len = skb_pagelen(skb);
if (first_len - hlen > mtu ||
((first_len - hlen) & 7) ||
- (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
- skb_cloned(skb))
+ ip_is_fragment(iph) ||
+ skb_cloned(skb) ||
+ skb_headroom(skb) < ll_rs)
goto slow_path;
- for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
+ skb_walk_frags(skb, frag) {
/* Correct geometry. */
if (frag->len > mtu ||
((frag->len & 7) && frag->next) ||
- skb_headroom(frag) < hlen)
- goto slow_path;
+ skb_headroom(frag) < hlen + ll_rs)
+ goto slow_path_clean;
/* Partially cloned skb? */
if (skb_shared(frag))
- goto slow_path;
+ goto slow_path_clean;
BUG_ON(frag->sk);
if (skb->sk) {
- sock_hold(skb->sk);
frag->sk = skb->sk;
frag->destructor = sock_wfree;
- skb->truesize -= frag->truesize;
}
+ skb->truesize -= frag->truesize;
}
/* Everything is OK. Generate! */
-
- err = 0;
- offset = 0;
- frag = skb_shinfo(skb)->frag_list;
- skb_shinfo(skb)->frag_list = NULL;
- skb->data_len = first_len - skb_headlen(skb);
- skb->len = first_len;
- iph->tot_len = htons(first_len);
- iph->frag_off = htons(IP_MF);
- ip_send_check(iph);
+ ip_fraglist_init(skb, iph, hlen, &iter);
for (;;) {
/* Prepare header of the next frame,
* before previous one went down. */
- if (frag) {
- frag->ip_summed = CHECKSUM_NONE;
- frag->h.raw = frag->data;
- frag->nh.raw = __skb_push(frag, hlen);
- memcpy(frag->nh.raw, iph, hlen);
- iph = frag->nh.iph;
- iph->tot_len = htons(frag->len);
- ip_copy_metadata(frag, skb);
- if (offset == 0)
- ip_options_fragment(frag);
- offset += skb->len - hlen;
- iph->frag_off = htons(offset>>3);
- if (frag->next != NULL)
- iph->frag_off |= htons(IP_MF);
- /* Ready, complete checksum */
- ip_send_check(iph);
+ if (iter.frag) {
+ bool first_frag = (iter.offset == 0);
+
+ IPCB(iter.frag)->flags = IPCB(skb)->flags;
+ ip_fraglist_prepare(skb, &iter);
+ if (first_frag && IPCB(skb)->opt.optlen) {
+ /* ipcb->opt is not populated for frags
+ * coming from __ip_make_skb(),
+ * ip_options_fragment() needs optlen
+ */
+ IPCB(iter.frag)->opt.optlen =
+ IPCB(skb)->opt.optlen;
+ ip_options_fragment(iter.frag);
+ ip_send_check(iter.iph);
+ }
}
- err = output(skb);
+ skb_set_delivery_time(skb, tstamp, tstamp_type);
+ err = output(net, sk, skb);
if (!err)
- IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);
- if (err || !frag)
+ IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES);
+ if (err || !iter.frag)
break;
- skb = frag;
- frag = skb->next;
- skb->next = NULL;
+ skb = ip_fraglist_next(&iter);
}
if (err == 0) {
- IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
+ IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS);
return 0;
}
- while (frag) {
- skb = frag->next;
- kfree_skb(frag);
- frag = skb;
- }
- IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
+ kfree_skb_list(iter.frag);
+
+ IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
return err;
+
+slow_path_clean:
+ skb_walk_frags(skb, frag2) {
+ if (frag2 == frag)
+ break;
+ frag2->sk = NULL;
+ frag2->destructor = NULL;
+ skb->truesize += frag2->truesize;
+ }
}
slow_path:
- left = skb->len - hlen; /* Space per frame */
- ptr = raw + hlen; /* Where to start from */
-
- /* for bridged IP traffic encapsulated inside f.e. a vlan header,
- * we need to make room for the encapsulating header
- */
- pad = nf_bridge_pad(skb);
- ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, pad);
- mtu -= pad;
-
/*
* Fragment the datagram.
*/
- offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
- not_last_frag = iph->frag_off & htons(IP_MF);
+ ip_frag_init(skb, hlen, ll_rs, mtu, IPCB(skb)->flags & IPSKB_FRAG_PMTU,
+ &state);
/*
* Keep copying data until we run out.
*/
- while(left > 0) {
- len = left;
- /* IF: it doesn't fit, use 'mtu' - the data space left */
- if (len > mtu)
- len = mtu;
- /* IF: we are not sending upto and including the packet end
- then align the next start on an eight byte boundary */
- if (len < left) {
- len &= ~7;
- }
- /*
- * Allocate buffer.
- */
+ while (state.left > 0) {
+ bool first_frag = (state.offset == 0);
- if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
- NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
- err = -ENOMEM;
+ skb2 = ip_frag_next(skb, &state);
+ if (IS_ERR(skb2)) {
+ err = PTR_ERR(skb2);
goto fail;
}
-
- /*
- * Set up data on packet
- */
-
- ip_copy_metadata(skb2, skb);
- skb_reserve(skb2, ll_rs);
- skb_put(skb2, len + hlen);
- skb2->nh.raw = skb2->data;
- skb2->h.raw = skb2->data + hlen;
-
- /*
- * Charge the memory for the fragment to any owner
- * it might possess
- */
-
- if (skb->sk)
- skb_set_owner_w(skb2, skb->sk);
-
- /*
- * Copy the packet header into the new buffer.
- */
-
- memcpy(skb2->nh.raw, skb->data, hlen);
-
- /*
- * Copy a block of the IP datagram.
- */
- if (skb_copy_bits(skb, ptr, skb2->h.raw, len))
- BUG();
- left -= len;
-
- /*
- * Fill in the new header fields.
- */
- iph = skb2->nh.iph;
- iph->frag_off = htons((offset >> 3));
-
- /* ANK: dirty, but effective trick. Upgrade options only if
- * the segment to be fragmented was THE FIRST (otherwise,
- * options are already fixed) and make it ONCE
- * on the initial skb, so that all the following fragments
- * will inherit fixed options.
- */
- if (offset == 0)
- ip_options_fragment(skb);
-
- /*
- * Added AC : If we are fragmenting a fragment that's not the
- * last fragment then keep MF on each bit
- */
- if (left > 0 || not_last_frag)
- iph->frag_off |= htons(IP_MF);
- ptr += len;
- offset += len;
+ ip_frag_ipcb(skb, skb2, first_frag);
/*
* Put this fragment into the sending queue.
*/
- iph->tot_len = htons(len + hlen);
-
- ip_send_check(iph);
-
- err = output(skb2);
+ skb_set_delivery_time(skb2, tstamp, tstamp_type);
+ err = output(net, sk, skb2);
if (err)
goto fail;
- IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);
+ IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES);
}
- kfree_skb(skb);
- IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
+ consume_skb(skb);
+ IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS);
return err;
fail:
- kfree_skb(skb);
- IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
+ kfree_skb(skb);
+ IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
return err;
}
-
-EXPORT_SYMBOL(ip_fragment);
+EXPORT_SYMBOL(ip_do_fragment);
int
ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
{
- struct iovec *iov = from;
+ struct msghdr *msg = from;
if (skb->ip_summed == CHECKSUM_PARTIAL) {
- if (memcpy_fromiovecend(to, iov, offset, len) < 0)
+ if (!copy_from_iter_full(to, len, &msg->msg_iter))
return -EFAULT;
} else {
__wsum csum = 0;
- if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
+ if (!csum_and_copy_from_iter_full(to, len, &csum, &msg->msg_iter))
return -EFAULT;
skb->csum = csum_block_add(skb->csum, csum, odd);
}
return 0;
}
+EXPORT_SYMBOL(ip_generic_getfrag);
-static inline __wsum
-csum_page(struct page *page, int offset, int copy)
-{
- char *kaddr;
- __wsum csum;
- kaddr = kmap(page);
- csum = csum_partial(kaddr + offset, copy, 0);
- kunmap(page);
- return csum;
-}
-
-static inline int ip_ufo_append_data(struct sock *sk,
- int getfrag(void *from, char *to, int offset, int len,
- int odd, struct sk_buff *skb),
- void *from, int length, int hh_len, int fragheaderlen,
- int transhdrlen, int mtu,unsigned int flags)
-{
- struct sk_buff *skb;
- int err;
-
- /* There is support for UDP fragmentation offload by network
- * device, so create one single skb packet containing complete
- * udp datagram
- */
- if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
- skb = sock_alloc_send_skb(sk,
- hh_len + fragheaderlen + transhdrlen + 20,
- (flags & MSG_DONTWAIT), &err);
-
- if (skb == NULL)
- return err;
-
- /* reserve space for Hardware header */
- skb_reserve(skb, hh_len);
-
- /* create space for UDP/IP header */
- skb_put(skb,fragheaderlen + transhdrlen);
-
- /* initialize network header pointer */
- skb->nh.raw = skb->data;
-
- /* initialize protocol header pointer */
- skb->h.raw = skb->data + fragheaderlen;
-
- skb->ip_summed = CHECKSUM_PARTIAL;
- skb->csum = 0;
- sk->sk_sndmsg_off = 0;
- }
-
- err = skb_append_datato_frags(sk,skb, getfrag, from,
- (length - transhdrlen));
- if (!err) {
- /* specify the length of each IP datagram fragment*/
- skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
- skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
- __skb_queue_tail(&sk->sk_write_queue, skb);
-
- return 0;
- }
- /* There is not enough support do UFO ,
- * so follow normal path
- */
- kfree_skb(skb);
- return err;
-}
-
-/*
- * ip_append_data() and ip_append_page() can make one large IP datagram
- * from many pieces of data. Each pieces will be holded on the socket
- * until ip_push_pending_frames() is called. Each piece can be a page
- * or non-page data.
- *
- * Not only UDP, other transport protocols - e.g. raw sockets - can use
- * this interface potentially.
- *
- * LATER: length must be adjusted by pad at tail, when it is required.
- */
-int ip_append_data(struct sock *sk,
- int getfrag(void *from, char *to, int offset, int len,
- int odd, struct sk_buff *skb),
- void *from, int length, int transhdrlen,
- struct ipcm_cookie *ipc, struct rtable *rt,
- unsigned int flags)
+static int __ip_append_data(struct sock *sk,
+ struct flowi4 *fl4,
+ struct sk_buff_head *queue,
+ struct inet_cork *cork,
+ struct page_frag *pfrag,
+ int getfrag(void *from, char *to, int offset,
+ int len, int odd, struct sk_buff *skb),
+ void *from, int length, int transhdrlen,
+ unsigned int flags)
{
struct inet_sock *inet = inet_sk(sk);
+ struct ubuf_info *uarg = NULL;
struct sk_buff *skb;
-
- struct ip_options *opt = NULL;
+ struct ip_options *opt = cork->opt;
int hh_len;
int exthdrlen;
int mtu;
int copy;
int err;
int offset = 0;
- unsigned int maxfraglen, fragheaderlen;
+ bool zc = false;
+ unsigned int maxfraglen, fragheaderlen, maxnonfragsize;
int csummode = CHECKSUM_NONE;
+ struct rtable *rt = dst_rtable(cork->dst);
+ bool paged, hold_tskey = false, extra_uref = false;
+ unsigned int wmem_alloc_delta = 0;
+ u32 tskey = 0;
- if (flags&MSG_PROBE)
- return 0;
+ skb = skb_peek_tail(queue);
- if (skb_queue_empty(&sk->sk_write_queue)) {
- /*
- * setup for corking.
- */
- opt = ipc->opt;
- if (opt) {
- if (inet->cork.opt == NULL) {
- inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
- if (unlikely(inet->cork.opt == NULL))
- return -ENOBUFS;
- }
- memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
- inet->cork.flags |= IPCORK_OPT;
- inet->cork.addr = ipc->addr;
- }
- dst_hold(&rt->u.dst);
- inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
- inet->cork.rt = rt;
- inet->cork.length = 0;
- sk->sk_sndmsg_page = NULL;
- sk->sk_sndmsg_off = 0;
- if ((exthdrlen = rt->u.dst.header_len) != 0) {
- length += exthdrlen;
- transhdrlen += exthdrlen;
- }
- } else {
- rt = inet->cork.rt;
- if (inet->cork.flags & IPCORK_OPT)
- opt = inet->cork.opt;
+ exthdrlen = !skb ? rt->dst.header_len : 0;
+ mtu = cork->gso_size ? IP_MAX_MTU : cork->fragsize;
+ paged = !!cork->gso_size;
- transhdrlen = 0;
- exthdrlen = 0;
- mtu = inet->cork.fragsize;
- }
- hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
+ hh_len = LL_RESERVED_SPACE(rt->dst.dev);
fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
+ maxnonfragsize = ip_sk_ignore_df(sk) ? IP_MAX_MTU : mtu;
- if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
- ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen);
+ if (cork->length + length > maxnonfragsize - fragheaderlen) {
+ ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
+ mtu - (opt ? opt->optlen : 0));
return -EMSGSIZE;
}
@@ -840,20 +999,63 @@ int ip_append_data(struct sock *sk,
*/
if (transhdrlen &&
length + fragheaderlen <= mtu &&
- rt->u.dst.dev->features & NETIF_F_ALL_CSUM &&
- !exthdrlen)
+ rt->dst.dev->features & (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM) &&
+ (!(flags & MSG_MORE) || cork->gso_size) &&
+ (!exthdrlen || (rt->dst.dev->features & NETIF_F_HW_ESP_TX_CSUM)))
csummode = CHECKSUM_PARTIAL;
- inet->cork.length += length;
- if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
- (rt->u.dst.dev->features & NETIF_F_UFO)) {
+ if ((flags & MSG_ZEROCOPY) && length) {
+ struct msghdr *msg = from;
- err = ip_ufo_append_data(sk, getfrag, from, length, hh_len,
- fragheaderlen, transhdrlen, mtu,
- flags);
- if (err)
- goto error;
- return 0;
+ if (getfrag == ip_generic_getfrag && msg->msg_ubuf) {
+ if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb))
+ return -EINVAL;
+
+ /* Leave uarg NULL if can't zerocopy, callers should
+ * be able to handle it.
+ */
+ if ((rt->dst.dev->features & NETIF_F_SG) &&
+ csummode == CHECKSUM_PARTIAL) {
+ paged = true;
+ zc = true;
+ uarg = msg->msg_ubuf;
+ }
+ } else if (sock_flag(sk, SOCK_ZEROCOPY)) {
+ uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb),
+ false);
+ if (!uarg)
+ return -ENOBUFS;
+ extra_uref = !skb_zcopy(skb); /* only ref on new uarg */
+ if (rt->dst.dev->features & NETIF_F_SG &&
+ csummode == CHECKSUM_PARTIAL) {
+ paged = true;
+ zc = true;
+ } else {
+ uarg_to_msgzc(uarg)->zerocopy = 0;
+ skb_zcopy_set(skb, uarg, &extra_uref);
+ }
+ }
+ } else if ((flags & MSG_SPLICE_PAGES) && length) {
+ if (inet_test_bit(HDRINCL, sk))
+ return -EPERM;
+ if (rt->dst.dev->features & NETIF_F_SG &&
+ getfrag == ip_generic_getfrag)
+ /* We need an empty buffer to attach stuff to */
+ paged = true;
+ else
+ flags &= ~MSG_SPLICE_PAGES;
+ }
+
+ cork->length += length;
+
+ if (cork->tx_flags & SKBTX_ANY_TSTAMP &&
+ READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) {
+ if (cork->flags & IPCORK_TS_OPT_ID) {
+ tskey = cork->ts_opt_id;
+ } else {
+ tskey = atomic_inc_return(&sk->sk_tskey) - 1;
+ hold_tskey = true;
+ }
}
/* So, what's going on in the loop below?
@@ -863,7 +1065,7 @@ int ip_append_data(struct sock *sk,
* adding appropriate IP header.
*/
- if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
+ if (!skb)
goto alloc_new_skb;
while (length > 0) {
@@ -876,7 +1078,8 @@ int ip_append_data(struct sock *sk,
unsigned int datalen;
unsigned int fraglen;
unsigned int fraggap;
- unsigned int alloclen;
+ unsigned int alloclen, alloc_extra;
+ unsigned int pagedlen;
struct sk_buff *skb_prev;
alloc_new_skb:
skb_prev = skb;
@@ -893,12 +1096,10 @@ alloc_new_skb:
if (datalen > mtu - fragheaderlen)
datalen = maxfraglen - fragheaderlen;
fraglen = datalen + fragheaderlen;
+ pagedlen = 0;
- if ((flags & MSG_MORE) &&
- !(rt->u.dst.dev->features&NETIF_F_SG))
- alloclen = mtu;
- else
- alloclen = datalen + fragheaderlen;
+ alloc_extra = hh_len + 15;
+ alloc_extra += exthdrlen;
/* The last fragment gets additional space at tail.
* Note, with MSG_MORE we overallocate on fragments,
@@ -906,23 +1107,35 @@ alloc_new_skb:
* the last.
*/
if (datalen == length + fraggap)
- alloclen += rt->u.dst.trailer_len;
+ alloc_extra += rt->dst.trailer_len;
+
+ if ((flags & MSG_MORE) &&
+ !(rt->dst.dev->features&NETIF_F_SG))
+ alloclen = mtu;
+ else if (!paged &&
+ (fraglen + alloc_extra < SKB_MAX_ALLOC ||
+ !(rt->dst.dev->features & NETIF_F_SG)))
+ alloclen = fraglen;
+ else {
+ alloclen = fragheaderlen + transhdrlen;
+ pagedlen = datalen - transhdrlen;
+ }
+
+ alloclen += alloc_extra;
if (transhdrlen) {
- skb = sock_alloc_send_skb(sk,
- alloclen + hh_len + 15,
+ skb = sock_alloc_send_skb(sk, alloclen,
(flags & MSG_DONTWAIT), &err);
} else {
skb = NULL;
- if (atomic_read(&sk->sk_wmem_alloc) <=
+ if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
2 * sk->sk_sndbuf)
- skb = sock_wmalloc(sk,
- alloclen + hh_len + 15, 1,
- sk->sk_allocation);
- if (unlikely(skb == NULL))
+ skb = alloc_skb(alloclen,
+ sk->sk_allocation);
+ if (unlikely(!skb))
err = -ENOBUFS;
}
- if (skb == NULL)
+ if (!skb)
goto error;
/*
@@ -935,286 +1148,281 @@ alloc_new_skb:
/*
* Find where to start putting bytes.
*/
- data = skb_put(skb, fraglen);
- skb->nh.raw = data + exthdrlen;
- data += fragheaderlen;
- skb->h.raw = data + exthdrlen;
+ data = skb_put(skb, fraglen + exthdrlen - pagedlen);
+ skb_set_network_header(skb, exthdrlen);
+ skb->transport_header = (skb->network_header +
+ fragheaderlen);
+ data += fragheaderlen + exthdrlen;
if (fraggap) {
skb->csum = skb_copy_and_csum_bits(
skb_prev, maxfraglen,
- data + transhdrlen, fraggap, 0);
+ data + transhdrlen, fraggap);
skb_prev->csum = csum_sub(skb_prev->csum,
skb->csum);
data += fraggap;
pskb_trim_unique(skb_prev, maxfraglen);
}
- copy = datalen - transhdrlen - fraggap;
- if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
+ copy = datalen - transhdrlen - fraggap - pagedlen;
+ /* [!] NOTE: copy will be negative if pagedlen>0
+ * because then the equation reduces to -fraggap.
+ */
+ if (copy > 0 &&
+ INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
+ from, data + transhdrlen, offset,
+ copy, fraggap, skb) < 0) {
err = -EFAULT;
kfree_skb(skb);
goto error;
+ } else if (flags & MSG_SPLICE_PAGES) {
+ copy = 0;
}
offset += copy;
- length -= datalen - fraggap;
+ length -= copy + transhdrlen;
transhdrlen = 0;
exthdrlen = 0;
csummode = CHECKSUM_NONE;
+ /* only the initial fragment is time stamped */
+ skb_shinfo(skb)->tx_flags = cork->tx_flags;
+ cork->tx_flags = 0;
+ skb_shinfo(skb)->tskey = tskey;
+ tskey = 0;
+ skb_zcopy_set(skb, uarg, &extra_uref);
+
+ if ((flags & MSG_CONFIRM) && !skb_prev)
+ skb_set_dst_pending_confirm(skb, 1);
+
/*
* Put the packet on the pending queue.
*/
- __skb_queue_tail(&sk->sk_write_queue, skb);
+ if (!skb->destructor) {
+ skb->destructor = sock_wfree;
+ skb->sk = sk;
+ wmem_alloc_delta += skb->truesize;
+ }
+ __skb_queue_tail(queue, skb);
continue;
}
if (copy > length)
copy = length;
- if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
+ if (!(rt->dst.dev->features&NETIF_F_SG) &&
+ skb_tailroom(skb) >= copy) {
unsigned int off;
off = skb->len;
- if (getfrag(from, skb_put(skb, copy),
- offset, copy, off, skb) < 0) {
+ if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
+ from, skb_put(skb, copy),
+ offset, copy, off, skb) < 0) {
__skb_trim(skb, off);
err = -EFAULT;
goto error;
}
- } else {
+ } else if (flags & MSG_SPLICE_PAGES) {
+ struct msghdr *msg = from;
+
+ err = -EIO;
+ if (WARN_ON_ONCE(copy > msg->msg_iter.count))
+ goto error;
+
+ err = skb_splice_from_iter(skb, &msg->msg_iter, copy);
+ if (err < 0)
+ goto error;
+ copy = err;
+ wmem_alloc_delta += copy;
+ } else if (!zc) {
int i = skb_shinfo(skb)->nr_frags;
- skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
- struct page *page = sk->sk_sndmsg_page;
- int off = sk->sk_sndmsg_off;
- unsigned int left;
-
- if (page && (left = PAGE_SIZE - off) > 0) {
- if (copy >= left)
- copy = left;
- if (page != frag->page) {
- if (i == MAX_SKB_FRAGS) {
- err = -EMSGSIZE;
- goto error;
- }
- get_page(page);
- skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
- frag = &skb_shinfo(skb)->frags[i];
- }
- } else if (i < MAX_SKB_FRAGS) {
- if (copy > PAGE_SIZE)
- copy = PAGE_SIZE;
- page = alloc_pages(sk->sk_allocation, 0);
- if (page == NULL) {
- err = -ENOMEM;
- goto error;
- }
- sk->sk_sndmsg_page = page;
- sk->sk_sndmsg_off = 0;
- skb_fill_page_desc(skb, i, page, 0, 0);
- frag = &skb_shinfo(skb)->frags[i];
- skb->truesize += PAGE_SIZE;
- atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
- } else {
- err = -EMSGSIZE;
+ err = -ENOMEM;
+ if (!sk_page_frag_refill(sk, pfrag))
goto error;
+
+ skb_zcopy_downgrade_managed(skb);
+ if (!skb_can_coalesce(skb, i, pfrag->page,
+ pfrag->offset)) {
+ err = -EMSGSIZE;
+ if (i == MAX_SKB_FRAGS)
+ goto error;
+
+ __skb_fill_page_desc(skb, i, pfrag->page,
+ pfrag->offset, 0);
+ skb_shinfo(skb)->nr_frags = ++i;
+ get_page(pfrag->page);
}
- if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
- err = -EFAULT;
+ copy = min_t(int, copy, pfrag->size - pfrag->offset);
+ if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
+ from,
+ page_address(pfrag->page) + pfrag->offset,
+ offset, copy, skb->len, skb) < 0)
+ goto error_efault;
+
+ pfrag->offset += copy;
+ skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
+ skb_len_add(skb, copy);
+ wmem_alloc_delta += copy;
+ } else {
+ err = skb_zerocopy_iter_dgram(skb, from, copy);
+ if (err < 0)
goto error;
- }
- sk->sk_sndmsg_off += copy;
- frag->size += copy;
- skb->len += copy;
- skb->data_len += copy;
}
offset += copy;
length -= copy;
}
+ if (wmem_alloc_delta)
+ refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
return 0;
+error_efault:
+ err = -EFAULT;
error:
- inet->cork.length -= length;
- IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
- return err;
+ net_zcopy_put_abort(uarg, extra_uref);
+ cork->length -= length;
+ IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
+ refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
+ if (hold_tskey)
+ atomic_dec(&sk->sk_tskey);
+ return err;
}
-ssize_t ip_append_page(struct sock *sk, struct page *page,
- int offset, size_t size, int flags)
+static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
+ struct ipcm_cookie *ipc, struct rtable **rtp)
{
- struct inet_sock *inet = inet_sk(sk);
- struct sk_buff *skb;
+ struct ip_options_rcu *opt;
struct rtable *rt;
- struct ip_options *opt = NULL;
- int hh_len;
- int mtu;
- int len;
- int err;
- unsigned int maxfraglen, fragheaderlen, fraggap;
- if (inet->hdrincl)
- return -EPERM;
+ rt = *rtp;
+ if (unlikely(!rt))
+ return -EFAULT;
- if (flags&MSG_PROBE)
- return 0;
-
- if (skb_queue_empty(&sk->sk_write_queue))
- return -EINVAL;
-
- rt = inet->cork.rt;
- if (inet->cork.flags & IPCORK_OPT)
- opt = inet->cork.opt;
+ cork->fragsize = ip_sk_use_pmtu(sk) ?
+ dst_mtu(&rt->dst) : READ_ONCE(rt->dst.dev->mtu);
- if (!(rt->u.dst.dev->features&NETIF_F_SG))
- return -EOPNOTSUPP;
+ if (!inetdev_valid_mtu(cork->fragsize))
+ return -ENETUNREACH;
- hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
- mtu = inet->cork.fragsize;
-
- fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
- maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
-
- if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
- ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu);
- return -EMSGSIZE;
+ /*
+ * setup for corking.
+ */
+ opt = ipc->opt;
+ if (opt) {
+ if (!cork->opt) {
+ cork->opt = kmalloc(sizeof(struct ip_options) + 40,
+ sk->sk_allocation);
+ if (unlikely(!cork->opt))
+ return -ENOBUFS;
+ }
+ memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen);
+ cork->flags |= IPCORK_OPT;
+ cork->addr = ipc->addr;
}
- if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
- return -EINVAL;
-
- inet->cork.length += size;
- if ((sk->sk_protocol == IPPROTO_UDP) &&
- (rt->u.dst.dev->features & NETIF_F_UFO)) {
- skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
- skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
+ cork->gso_size = ipc->gso_size;
+
+ cork->dst = &rt->dst;
+ /* We stole this route, caller should not release it. */
+ *rtp = NULL;
+
+ cork->length = 0;
+ cork->ttl = ipc->ttl;
+ cork->tos = ipc->tos;
+ cork->mark = ipc->sockc.mark;
+ cork->priority = ipc->sockc.priority;
+ cork->transmit_time = ipc->sockc.transmit_time;
+ cork->tx_flags = 0;
+ sock_tx_timestamp(sk, &ipc->sockc, &cork->tx_flags);
+ if (ipc->sockc.tsflags & SOCKCM_FLAG_TS_OPT_ID) {
+ cork->flags |= IPCORK_TS_OPT_ID;
+ cork->ts_opt_id = ipc->sockc.ts_opt_id;
}
+ return 0;
+}
- while (size > 0) {
- int i;
-
- if (skb_is_gso(skb))
- len = size;
- else {
-
- /* Check if the remaining data fits into current packet. */
- len = mtu - skb->len;
- if (len < size)
- len = maxfraglen - skb->len;
- }
- if (len <= 0) {
- struct sk_buff *skb_prev;
- char *data;
- struct iphdr *iph;
- int alloclen;
-
- skb_prev = skb;
- fraggap = skb_prev->len - maxfraglen;
-
- alloclen = fragheaderlen + hh_len + fraggap + 15;
- skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
- if (unlikely(!skb)) {
- err = -ENOBUFS;
- goto error;
- }
-
- /*
- * Fill in the control structures
- */
- skb->ip_summed = CHECKSUM_NONE;
- skb->csum = 0;
- skb_reserve(skb, hh_len);
-
- /*
- * Find where to start putting bytes.
- */
- data = skb_put(skb, fragheaderlen + fraggap);
- skb->nh.iph = iph = (struct iphdr *)data;
- data += fragheaderlen;
- skb->h.raw = data;
-
- if (fraggap) {
- skb->csum = skb_copy_and_csum_bits(
- skb_prev, maxfraglen,
- data, fraggap, 0);
- skb_prev->csum = csum_sub(skb_prev->csum,
- skb->csum);
- pskb_trim_unique(skb_prev, maxfraglen);
- }
-
- /*
- * Put the packet on the pending queue.
- */
- __skb_queue_tail(&sk->sk_write_queue, skb);
- continue;
- }
-
- i = skb_shinfo(skb)->nr_frags;
- if (len > size)
- len = size;
- if (skb_can_coalesce(skb, i, page, offset)) {
- skb_shinfo(skb)->frags[i-1].size += len;
- } else if (i < MAX_SKB_FRAGS) {
- get_page(page);
- skb_fill_page_desc(skb, i, page, offset, len);
- } else {
- err = -EMSGSIZE;
- goto error;
- }
+/*
+ * ip_append_data() can make one large IP datagram from many pieces of
+ * data. Each piece will be held on the socket until
+ * ip_push_pending_frames() is called. Each piece can be a page or
+ * non-page data.
+ *
+ * Not only UDP, other transport protocols - e.g. raw sockets - can use
+ * this interface potentially.
+ *
+ * LATER: length must be adjusted by pad at tail, when it is required.
+ */
+int ip_append_data(struct sock *sk, struct flowi4 *fl4,
+ int getfrag(void *from, char *to, int offset, int len,
+ int odd, struct sk_buff *skb),
+ void *from, int length, int transhdrlen,
+ struct ipcm_cookie *ipc, struct rtable **rtp,
+ unsigned int flags)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ int err;
- if (skb->ip_summed == CHECKSUM_NONE) {
- __wsum csum;
- csum = csum_page(page, offset, len);
- skb->csum = csum_block_add(skb->csum, csum, skb->len);
- }
+ if (flags&MSG_PROBE)
+ return 0;
- skb->len += len;
- skb->data_len += len;
- offset += len;
- size -= len;
+ if (skb_queue_empty(&sk->sk_write_queue)) {
+ err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp);
+ if (err)
+ return err;
+ } else {
+ transhdrlen = 0;
}
- return 0;
-error:
- inet->cork.length -= size;
- IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
- return err;
+ return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base,
+ sk_page_frag(sk), getfrag,
+ from, length, transhdrlen, flags);
+}
+
+static void ip_cork_release(struct inet_cork *cork)
+{
+ cork->flags &= ~IPCORK_OPT;
+ kfree(cork->opt);
+ cork->opt = NULL;
+ dst_release(cork->dst);
+ cork->dst = NULL;
}
/*
* Combined all pending IP fragments on the socket as one IP datagram
* and push them out.
*/
-int ip_push_pending_frames(struct sock *sk)
+struct sk_buff *__ip_make_skb(struct sock *sk,
+ struct flowi4 *fl4,
+ struct sk_buff_head *queue,
+ struct inet_cork *cork)
{
struct sk_buff *skb, *tmp_skb;
struct sk_buff **tail_skb;
struct inet_sock *inet = inet_sk(sk);
+ struct net *net = sock_net(sk);
struct ip_options *opt = NULL;
- struct rtable *rt = inet->cork.rt;
+ struct rtable *rt = dst_rtable(cork->dst);
struct iphdr *iph;
+ u8 pmtudisc, ttl;
__be16 df = 0;
- __u8 ttl;
- int err = 0;
- if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
+ skb = __skb_dequeue(queue);
+ if (!skb)
goto out;
tail_skb = &(skb_shinfo(skb)->frag_list);
/* move skb->data to ip header from ext header */
- if (skb->data < skb->nh.raw)
- __skb_pull(skb, skb->nh.raw - skb->data);
- while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
- __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
+ if (skb->data < skb_network_header(skb))
+ __skb_pull(skb, skb_network_offset(skb));
+ while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
+ __skb_pull(tmp_skb, skb_network_header_len(skb));
*tail_skb = tmp_skb;
tail_skb = &(tmp_skb->next);
skb->len += tmp_skb->len;
skb->data_len += tmp_skb->len;
skb->truesize += tmp_skb->truesize;
- __sock_put(tmp_skb->sk);
tmp_skb->destructor = NULL;
tmp_skb->sk = NULL;
}
@@ -1223,174 +1431,252 @@ int ip_push_pending_frames(struct sock *sk)
* to fragment the frame generated here. No matter, what transforms
* how transforms change size of the packet, it will come out.
*/
- if (inet->pmtudisc != IP_PMTUDISC_DO)
- skb->local_df = 1;
+ skb->ignore_df = ip_sk_ignore_df(sk);
/* DF bit is set when we want to see DF on outgoing frames.
- * If local_df is set too, we still allow to fragment this frame
+ * If ignore_df is set too, we still allow to fragment this frame
* locally. */
- if (inet->pmtudisc == IP_PMTUDISC_DO ||
- (skb->len <= dst_mtu(&rt->u.dst) &&
- ip_dont_fragment(sk, &rt->u.dst)))
+ pmtudisc = READ_ONCE(inet->pmtudisc);
+ if (pmtudisc == IP_PMTUDISC_DO ||
+ pmtudisc == IP_PMTUDISC_PROBE ||
+ (skb->len <= dst_mtu(&rt->dst) &&
+ ip_dont_fragment(sk, &rt->dst)))
df = htons(IP_DF);
- if (inet->cork.flags & IPCORK_OPT)
- opt = inet->cork.opt;
+ if (cork->flags & IPCORK_OPT)
+ opt = cork->opt;
- if (rt->rt_type == RTN_MULTICAST)
- ttl = inet->mc_ttl;
+ if (cork->ttl != 0)
+ ttl = cork->ttl;
+ else if (rt->rt_type == RTN_MULTICAST)
+ ttl = READ_ONCE(inet->mc_ttl);
else
- ttl = ip_select_ttl(inet, &rt->u.dst);
+ ttl = ip_select_ttl(inet, &rt->dst);
- iph = (struct iphdr *)skb->data;
+ iph = ip_hdr(skb);
iph->version = 4;
iph->ihl = 5;
- if (opt) {
- iph->ihl += opt->optlen>>2;
- ip_options_build(skb, opt, inet->cork.addr, rt, 0);
- }
- iph->tos = inet->tos;
- iph->tot_len = htons(skb->len);
+ iph->tos = (cork->tos != -1) ? cork->tos : READ_ONCE(inet->tos);
iph->frag_off = df;
- ip_select_ident(iph, &rt->u.dst, sk);
iph->ttl = ttl;
iph->protocol = sk->sk_protocol;
- iph->saddr = rt->rt_src;
- iph->daddr = rt->rt_dst;
- ip_send_check(iph);
+ ip_copy_addrs(iph, fl4);
+ ip_select_ident(net, skb, sk);
+
+ if (opt) {
+ iph->ihl += opt->optlen >> 2;
+ ip_options_build(skb, opt, cork->addr, rt);
+ }
- skb->priority = sk->sk_priority;
- skb->dst = dst_clone(&rt->u.dst);
+ skb->priority = cork->priority;
+ skb->mark = cork->mark;
+ if (sk_is_tcp(sk))
+ skb_set_delivery_time(skb, cork->transmit_time, SKB_CLOCK_MONOTONIC);
+ else
+ skb_set_delivery_type_by_clockid(skb, cork->transmit_time, sk->sk_clockid);
+ /*
+ * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
+ * on dst refcount
+ */
+ cork->dst = NULL;
+ skb_dst_set(skb, &rt->dst);
- /* Netfilter gets whole the not fragmented skb. */
- err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL,
- skb->dst->dev, dst_output);
+ if (iph->protocol == IPPROTO_ICMP) {
+ u8 icmp_type;
+
+ /* For such sockets, transhdrlen is zero when do ip_append_data(),
+ * so icmphdr does not in skb linear region and can not get icmp_type
+ * by icmp_hdr(skb)->type.
+ */
+ if (sk->sk_type == SOCK_RAW &&
+ !(fl4->flowi4_flags & FLOWI_FLAG_KNOWN_NH))
+ icmp_type = fl4->fl4_icmp_type;
+ else
+ icmp_type = icmp_hdr(skb)->type;
+ icmp_out_count(net, icmp_type);
+ }
+
+ ip_cork_release(cork);
+out:
+ return skb;
+}
+
+int ip_send_skb(struct net *net, struct sk_buff *skb)
+{
+ int err;
+
+ err = ip_local_out(net, skb->sk, skb);
if (err) {
if (err > 0)
- err = inet->recverr ? net_xmit_errno(err) : 0;
+ err = net_xmit_errno(err);
if (err)
- goto error;
+ IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
}
-out:
- inet->cork.flags &= ~IPCORK_OPT;
- kfree(inet->cork.opt);
- inet->cork.opt = NULL;
- if (inet->cork.rt) {
- ip_rt_put(inet->cork.rt);
- inet->cork.rt = NULL;
- }
return err;
+}
-error:
- IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
- goto out;
+int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4)
+{
+ struct sk_buff *skb;
+
+ skb = ip_finish_skb(sk, fl4);
+ if (!skb)
+ return 0;
+
+ /* Netfilter gets whole the not fragmented skb. */
+ return ip_send_skb(sock_net(sk), skb);
}
/*
* Throw away all pending data on the socket.
*/
-void ip_flush_pending_frames(struct sock *sk)
+static void __ip_flush_pending_frames(struct sock *sk,
+ struct sk_buff_head *queue,
+ struct inet_cork *cork)
{
- struct inet_sock *inet = inet_sk(sk);
struct sk_buff *skb;
- while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
+ while ((skb = __skb_dequeue_tail(queue)) != NULL)
kfree_skb(skb);
- inet->cork.flags &= ~IPCORK_OPT;
- kfree(inet->cork.opt);
- inet->cork.opt = NULL;
- if (inet->cork.rt) {
- ip_rt_put(inet->cork.rt);
- inet->cork.rt = NULL;
- }
+ ip_cork_release(cork);
}
+void ip_flush_pending_frames(struct sock *sk)
+{
+ __ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base);
+}
+
+struct sk_buff *ip_make_skb(struct sock *sk,
+ struct flowi4 *fl4,
+ int getfrag(void *from, char *to, int offset,
+ int len, int odd, struct sk_buff *skb),
+ void *from, int length, int transhdrlen,
+ struct ipcm_cookie *ipc, struct rtable **rtp,
+ struct inet_cork *cork, unsigned int flags)
+{
+ struct sk_buff_head queue;
+ int err;
+
+ if (flags & MSG_PROBE)
+ return NULL;
+
+ __skb_queue_head_init(&queue);
+
+ cork->flags = 0;
+ cork->addr = 0;
+ cork->opt = NULL;
+ err = ip_setup_cork(sk, cork, ipc, rtp);
+ if (err)
+ return ERR_PTR(err);
+
+ err = __ip_append_data(sk, fl4, &queue, cork,
+ &current->task_frag, getfrag,
+ from, length, transhdrlen, flags);
+ if (err) {
+ __ip_flush_pending_frames(sk, &queue, cork);
+ return ERR_PTR(err);
+ }
+
+ return __ip_make_skb(sk, fl4, &queue, cork);
+}
/*
* Fetch data from kernel space and fill in checksum if needed.
*/
-static int ip_reply_glue_bits(void *dptr, char *to, int offset,
+static int ip_reply_glue_bits(void *dptr, char *to, int offset,
int len, int odd, struct sk_buff *skb)
{
__wsum csum;
- csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
+ csum = csum_partial_copy_nocheck(dptr+offset, to, len);
skb->csum = csum_block_add(skb->csum, csum, odd);
- return 0;
+ return 0;
}
-/*
+/*
* Generic function to send a packet as reply to another packet.
- * Used to send TCP resets so far. ICMP should use this function too.
- *
- * Should run single threaded per socket because it uses the sock
- * structure to pass arguments.
- *
- * LATER: switch from ip_build_xmit to ip_append_*
+ * Used to send some TCP resets/acks so far.
*/
-void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
- unsigned int len)
+void ip_send_unicast_reply(struct sock *sk, const struct sock *orig_sk,
+ struct sk_buff *skb,
+ const struct ip_options *sopt,
+ __be32 daddr, __be32 saddr,
+ const struct ip_reply_arg *arg,
+ unsigned int len, u64 transmit_time, u32 txhash)
{
- struct inet_sock *inet = inet_sk(sk);
- struct {
- struct ip_options opt;
- char data[40];
- } replyopts;
+ struct ip_options_data replyopts;
struct ipcm_cookie ipc;
- __be32 daddr;
- struct rtable *rt = (struct rtable*)skb->dst;
+ struct flowi4 fl4;
+ struct rtable *rt = skb_rtable(skb);
+ struct net *net = sock_net(sk);
+ struct sk_buff *nskb;
+ int err;
+ int oif;
- if (ip_options_echo(&replyopts.opt, skb))
+ if (__ip_options_echo(net, &replyopts.opt.opt, skb, sopt))
return;
- daddr = ipc.addr = rt->rt_src;
- ipc.opt = NULL;
+ ipcm_init(&ipc);
+ ipc.addr = daddr;
+ ipc.sockc.transmit_time = transmit_time;
- if (replyopts.opt.optlen) {
+ if (replyopts.opt.opt.optlen) {
ipc.opt = &replyopts.opt;
- if (ipc.opt->srr)
- daddr = replyopts.opt.faddr;
+ if (replyopts.opt.opt.srr)
+ daddr = replyopts.opt.opt.faddr;
}
- {
- struct flowi fl = { .nl_u = { .ip4_u =
- { .daddr = daddr,
- .saddr = rt->rt_spec_dst,
- .tos = RT_TOS(skb->nh.iph->tos) } },
- /* Not quite clean, but right. */
- .uli_u = { .ports =
- { .sport = skb->h.th->dest,
- .dport = skb->h.th->source } },
- .proto = sk->sk_protocol };
- security_skb_classify_flow(skb, &fl);
- if (ip_route_output_key(&rt, &fl))
- return;
- }
+ oif = arg->bound_dev_if;
+ if (!oif && netif_index_is_l3_master(net, skb->skb_iif))
+ oif = skb->skb_iif;
+
+ flowi4_init_output(&fl4, oif,
+ IP4_REPLY_MARK(net, skb->mark) ?: sk->sk_mark,
+ arg->tos & INET_DSCP_MASK,
+ RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol,
+ ip_reply_arg_flowi_flags(arg),
+ daddr, saddr,
+ tcp_hdr(skb)->source, tcp_hdr(skb)->dest,
+ arg->uid);
+ security_skb_classify_flow(skb, flowi4_to_flowi_common(&fl4));
+ rt = ip_route_output_flow(net, &fl4, sk);
+ if (IS_ERR(rt))
+ return;
- /* And let IP do all the hard work.
+ inet_sk(sk)->tos = arg->tos;
- This chunk is not reenterable, hence spinlock.
- Note that it uses the fact, that this function is called
- with locally disabled BH and that sk cannot be already spinlocked.
- */
- bh_lock_sock(sk);
- inet->tos = skb->nh.iph->tos;
- sk->sk_priority = skb->priority;
- sk->sk_protocol = skb->nh.iph->protocol;
- ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
- &ipc, rt, MSG_DONTWAIT);
- if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
- if (arg->csumoffset >= 0)
- *((__sum16 *)skb->h.raw + arg->csumoffset) = csum_fold(csum_add(skb->csum, arg->csum));
- skb->ip_summed = CHECKSUM_NONE;
- ip_push_pending_frames(sk);
+ sk->sk_protocol = ip_hdr(skb)->protocol;
+ sk->sk_bound_dev_if = arg->bound_dev_if;
+ sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default);
+ ipc.sockc.mark = fl4.flowi4_mark;
+ err = ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base,
+ len, 0, &ipc, &rt, MSG_DONTWAIT);
+ if (unlikely(err)) {
+ ip_flush_pending_frames(sk);
+ goto out;
}
- bh_unlock_sock(sk);
-
+ nskb = skb_peek(&sk->sk_write_queue);
+ if (nskb) {
+ if (arg->csumoffset >= 0)
+ *((__sum16 *)skb_transport_header(nskb) +
+ arg->csumoffset) = csum_fold(csum_add(nskb->csum,
+ arg->csum));
+ nskb->ip_summed = CHECKSUM_NONE;
+ if (orig_sk) {
+ skb_set_owner_edemux(nskb, (struct sock *)orig_sk);
+ psp_reply_set_decrypted(orig_sk, nskb);
+ }
+ if (transmit_time)
+ nskb->tstamp_type = SKB_CLOCK_MONOTONIC;
+ if (txhash)
+ skb_set_hash(nskb, txhash, PKT_HASH_TYPE_L4);
+ ip_push_pending_frames(sk, &fl4);
+ }
+out:
ip_rt_put(rt);
}
@@ -1399,11 +1685,7 @@ void __init ip_init(void)
ip_rt_init();
inet_initpeers();
-#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
- igmp_mc_proc_init();
+#if defined(CONFIG_IP_MULTICAST)
+ igmp_mc_init();
#endif
}
-
-EXPORT_SYMBOL(ip_generic_getfrag);
-EXPORT_SYMBOL(ip_queue_xmit);
-EXPORT_SYMBOL(ip_send_check);
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 57d4bae6f080..6d9c5c20b1c4 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -1,18 +1,17 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
* interface as the means of communication with the user level.
*
* The IP to API glue.
- *
- * Version: $Id: ip_sockglue.c,v 1.62 2002/02/01 22:01:04 davem Exp $
*
* Authors: see ip.c
*
* Fixes:
* Many : Split from ip.c , see ip.c for history.
* Martin Mares : TOS setting fixed.
- * Alan Cox : Fixed a couple of oopses in Martin's
+ * Alan Cox : Fixed a couple of oopses in Martin's
* TOS tweaks.
* Mike McLagan : Routing by source
*/
@@ -20,12 +19,12 @@
#include <linux/module.h>
#include <linux/types.h>
#include <linux/mm.h>
-#include <linux/sched.h>
#include <linux/skbuff.h>
#include <linux/ip.h>
#include <linux/icmp.h>
#include <linux/inetdevice.h>
#include <linux/netdevice.h>
+#include <linux/slab.h>
#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
@@ -35,21 +34,18 @@
#include <linux/netfilter.h>
#include <linux/route.h>
#include <linux/mroute.h>
+#include <net/inet_ecn.h>
#include <net/route.h>
#include <net/xfrm.h>
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+#include <net/compat.h>
+#include <net/checksum.h>
+#if IS_ENABLED(CONFIG_IPV6)
#include <net/transp_v6.h>
#endif
+#include <net/ip_fib.h>
#include <linux/errqueue.h>
-#include <asm/uaccess.h>
-
-#define IP_CMSG_PKTINFO 1
-#define IP_CMSG_TTL 2
-#define IP_CMSG_TOS 4
-#define IP_CMSG_RECVOPTS 8
-#define IP_CMSG_RETOPTS 16
-#define IP_CMSG_PASSSEC 32
+#include <linux/uaccess.h>
/*
* SOL_IP control messages.
@@ -57,30 +53,22 @@
static void ip_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb)
{
- struct in_pktinfo info;
- struct rtable *rt = (struct rtable *)skb->dst;
+ struct in_pktinfo info = *PKTINFO_SKB_CB(skb);
- info.ipi_addr.s_addr = skb->nh.iph->daddr;
- if (rt) {
- info.ipi_ifindex = rt->rt_iif;
- info.ipi_spec_dst.s_addr = rt->rt_spec_dst;
- } else {
- info.ipi_ifindex = 0;
- info.ipi_spec_dst.s_addr = 0;
- }
+ info.ipi_addr.s_addr = ip_hdr(skb)->daddr;
put_cmsg(msg, SOL_IP, IP_PKTINFO, sizeof(info), &info);
}
static void ip_cmsg_recv_ttl(struct msghdr *msg, struct sk_buff *skb)
{
- int ttl = skb->nh.iph->ttl;
+ int ttl = ip_hdr(skb)->ttl;
put_cmsg(msg, SOL_IP, IP_TTL, sizeof(int), &ttl);
}
static void ip_cmsg_recv_tos(struct msghdr *msg, struct sk_buff *skb)
{
- put_cmsg(msg, SOL_IP, IP_TOS, 1, &skb->nh.iph->tos);
+ put_cmsg(msg, SOL_IP, IP_TOS, 1, &ip_hdr(skb)->tos);
}
static void ip_cmsg_recv_opts(struct msghdr *msg, struct sk_buff *skb)
@@ -88,19 +76,21 @@ static void ip_cmsg_recv_opts(struct msghdr *msg, struct sk_buff *skb)
if (IPCB(skb)->opt.optlen == 0)
return;
- put_cmsg(msg, SOL_IP, IP_RECVOPTS, IPCB(skb)->opt.optlen, skb->nh.iph+1);
+ put_cmsg(msg, SOL_IP, IP_RECVOPTS, IPCB(skb)->opt.optlen,
+ ip_hdr(skb) + 1);
}
-static void ip_cmsg_recv_retopts(struct msghdr *msg, struct sk_buff *skb)
+static void ip_cmsg_recv_retopts(struct net *net, struct msghdr *msg,
+ struct sk_buff *skb)
{
unsigned char optbuf[sizeof(struct ip_options) + 40];
- struct ip_options * opt = (struct ip_options*)optbuf;
+ struct ip_options *opt = (struct ip_options *)optbuf;
if (IPCB(skb)->opt.optlen == 0)
return;
- if (ip_options_echo(opt, skb)) {
+ if (ip_options_echo(net, opt, skb)) {
msg->msg_flags |= MSG_CTRUNC;
return;
}
@@ -109,74 +99,190 @@ static void ip_cmsg_recv_retopts(struct msghdr *msg, struct sk_buff *skb)
put_cmsg(msg, SOL_IP, IP_RETOPTS, opt->optlen, opt->__data);
}
+static void ip_cmsg_recv_fragsize(struct msghdr *msg, struct sk_buff *skb)
+{
+ int val;
+
+ if (IPCB(skb)->frag_max_size == 0)
+ return;
+
+ val = IPCB(skb)->frag_max_size;
+ put_cmsg(msg, SOL_IP, IP_RECVFRAGSIZE, sizeof(val), &val);
+}
+
+static void ip_cmsg_recv_checksum(struct msghdr *msg, struct sk_buff *skb,
+ int tlen, int offset)
+{
+ __wsum csum = skb->csum;
+
+ if (skb->ip_summed != CHECKSUM_COMPLETE)
+ return;
+
+ if (offset != 0) {
+ int tend_off = skb_transport_offset(skb) + tlen;
+ csum = csum_sub(csum, skb_checksum(skb, tend_off, offset, 0));
+ }
+
+ put_cmsg(msg, SOL_IP, IP_CHECKSUM, sizeof(__wsum), &csum);
+}
+
static void ip_cmsg_recv_security(struct msghdr *msg, struct sk_buff *skb)
{
- char *secdata;
- u32 seclen, secid;
+ struct lsm_context ctx;
+ u32 secid;
int err;
err = security_socket_getpeersec_dgram(NULL, skb, &secid);
if (err)
return;
- err = security_secid_to_secctx(secid, &secdata, &seclen);
- if (err)
+ err = security_secid_to_secctx(secid, &ctx);
+ if (err < 0)
return;
- put_cmsg(msg, SOL_IP, SCM_SECURITY, seclen, secdata);
- security_release_secctx(secdata, seclen);
+ put_cmsg(msg, SOL_IP, SCM_SECURITY, ctx.len, ctx.context);
+ security_release_secctx(&ctx);
}
+static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb)
+{
+ __be16 _ports[2], *ports;
+ struct sockaddr_in sin;
+
+ /* All current transport protocols have the port numbers in the
+ * first four bytes of the transport header and this function is
+ * written with this assumption in mind.
+ */
+ ports = skb_header_pointer(skb, skb_transport_offset(skb),
+ sizeof(_ports), &_ports);
+ if (!ports)
+ return;
+
+ sin.sin_family = AF_INET;
+ sin.sin_addr.s_addr = ip_hdr(skb)->daddr;
+ sin.sin_port = ports[1];
+ memset(sin.sin_zero, 0, sizeof(sin.sin_zero));
+
+ put_cmsg(msg, SOL_IP, IP_ORIGDSTADDR, sizeof(sin), &sin);
+}
-void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb)
+void ip_cmsg_recv_offset(struct msghdr *msg, struct sock *sk,
+ struct sk_buff *skb, int tlen, int offset)
{
- struct inet_sock *inet = inet_sk(skb->sk);
- unsigned flags = inet->cmsg_flags;
+ unsigned long flags = inet_cmsg_flags(inet_sk(sk));
+
+ if (!flags)
+ return;
/* Ordered by supposed usage frequency */
- if (flags & 1)
+ if (flags & IP_CMSG_PKTINFO) {
ip_cmsg_recv_pktinfo(msg, skb);
- if ((flags>>=1) == 0)
- return;
- if (flags & 1)
+ flags &= ~IP_CMSG_PKTINFO;
+ if (!flags)
+ return;
+ }
+
+ if (flags & IP_CMSG_TTL) {
ip_cmsg_recv_ttl(msg, skb);
- if ((flags>>=1) == 0)
- return;
- if (flags & 1)
+ flags &= ~IP_CMSG_TTL;
+ if (!flags)
+ return;
+ }
+
+ if (flags & IP_CMSG_TOS) {
ip_cmsg_recv_tos(msg, skb);
- if ((flags>>=1) == 0)
- return;
- if (flags & 1)
+ flags &= ~IP_CMSG_TOS;
+ if (!flags)
+ return;
+ }
+
+ if (flags & IP_CMSG_RECVOPTS) {
ip_cmsg_recv_opts(msg, skb);
- if ((flags>>=1) == 0)
- return;
- if (flags & 1)
- ip_cmsg_recv_retopts(msg, skb);
- if ((flags>>=1) == 0)
- return;
+ flags &= ~IP_CMSG_RECVOPTS;
+ if (!flags)
+ return;
+ }
- if (flags & 1)
+ if (flags & IP_CMSG_RETOPTS) {
+ ip_cmsg_recv_retopts(sock_net(sk), msg, skb);
+
+ flags &= ~IP_CMSG_RETOPTS;
+ if (!flags)
+ return;
+ }
+
+ if (flags & IP_CMSG_PASSSEC) {
ip_cmsg_recv_security(msg, skb);
+
+ flags &= ~IP_CMSG_PASSSEC;
+ if (!flags)
+ return;
+ }
+
+ if (flags & IP_CMSG_ORIGDSTADDR) {
+ ip_cmsg_recv_dstaddr(msg, skb);
+
+ flags &= ~IP_CMSG_ORIGDSTADDR;
+ if (!flags)
+ return;
+ }
+
+ if (flags & IP_CMSG_CHECKSUM)
+ ip_cmsg_recv_checksum(msg, skb, tlen, offset);
+
+ if (flags & IP_CMSG_RECVFRAGSIZE)
+ ip_cmsg_recv_fragsize(msg, skb);
}
+EXPORT_SYMBOL(ip_cmsg_recv_offset);
-int ip_cmsg_send(struct msghdr *msg, struct ipcm_cookie *ipc)
+int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc,
+ bool allow_ipv6)
{
- int err;
+ int err, val;
struct cmsghdr *cmsg;
+ struct net *net = sock_net(sk);
- for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
+ for_each_cmsghdr(cmsg, msg) {
if (!CMSG_OK(msg, cmsg))
return -EINVAL;
+#if IS_ENABLED(CONFIG_IPV6)
+ if (allow_ipv6 &&
+ cmsg->cmsg_level == SOL_IPV6 &&
+ cmsg->cmsg_type == IPV6_PKTINFO) {
+ struct in6_pktinfo *src_info;
+
+ if (cmsg->cmsg_len < CMSG_LEN(sizeof(*src_info)))
+ return -EINVAL;
+ src_info = (struct in6_pktinfo *)CMSG_DATA(cmsg);
+ if (!ipv6_addr_v4mapped(&src_info->ipi6_addr))
+ return -EINVAL;
+ if (src_info->ipi6_ifindex)
+ ipc->oif = src_info->ipi6_ifindex;
+ ipc->addr = src_info->ipi6_addr.s6_addr32[3];
+ continue;
+ }
+#endif
+ if (cmsg->cmsg_level == SOL_SOCKET) {
+ err = __sock_cmsg_send(sk, cmsg, &ipc->sockc);
+ if (err)
+ return err;
+ continue;
+ }
+
if (cmsg->cmsg_level != SOL_IP)
continue;
switch (cmsg->cmsg_type) {
case IP_RETOPTS:
- err = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr));
- err = ip_options_get(&ipc->opt, CMSG_DATA(cmsg), err < 40 ? err : 40);
+ err = cmsg->cmsg_len - sizeof(struct cmsghdr);
+
+ /* Our caller is responsible for freeing ipc->opt */
+ err = ip_options_get(net, &ipc->opt,
+ KERNEL_SOCKPTR(CMSG_DATA(cmsg)),
+ err < 40 ? err : 40);
if (err)
return err;
break;
@@ -186,10 +292,39 @@ int ip_cmsg_send(struct msghdr *msg, struct ipcm_cookie *ipc)
if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct in_pktinfo)))
return -EINVAL;
info = (struct in_pktinfo *)CMSG_DATA(cmsg);
- ipc->oif = info->ipi_ifindex;
+ if (info->ipi_ifindex)
+ ipc->oif = info->ipi_ifindex;
ipc->addr = info->ipi_spec_dst.s_addr;
break;
}
+ case IP_TTL:
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(int)))
+ return -EINVAL;
+ val = *(int *)CMSG_DATA(cmsg);
+ if (val < 1 || val > 255)
+ return -EINVAL;
+ ipc->ttl = val;
+ break;
+ case IP_TOS:
+ if (cmsg->cmsg_len == CMSG_LEN(sizeof(int)))
+ val = *(int *)CMSG_DATA(cmsg);
+ else if (cmsg->cmsg_len == CMSG_LEN(sizeof(u8)))
+ val = *(u8 *)CMSG_DATA(cmsg);
+ else
+ return -EINVAL;
+ if (val < 0 || val > 255)
+ return -EINVAL;
+ ipc->tos = val;
+ ipc->sockc.priority = rt_tos2priority(ipc->tos);
+ break;
+ case IP_PROTOCOL:
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(int)))
+ return -EINVAL;
+ val = *(int *)CMSG_DATA(cmsg);
+ if (val < 1 || val > 255)
+ return -EINVAL;
+ ipc->protocol = val;
+ break;
default:
return -EINVAL;
}
@@ -197,136 +332,199 @@ int ip_cmsg_send(struct msghdr *msg, struct ipcm_cookie *ipc)
return 0;
}
+static void ip_ra_destroy_rcu(struct rcu_head *head)
+{
+ struct ip_ra_chain *ra = container_of(head, struct ip_ra_chain, rcu);
-/* Special input handler for packets caught by router alert option.
- They are selected only by protocol field, and then processed likely
- local ones; but only if someone wants them! Otherwise, router
- not running rsvpd will kill RSVP.
-
- It is user level problem, what it will make with them.
- I have no idea, how it will masquearde or NAT them (it is joke, joke :-)),
- but receiver should be enough clever f.e. to forward mtrace requests,
- sent to multicast group to reach destination designated router.
- */
-struct ip_ra_chain *ip_ra_chain;
-DEFINE_RWLOCK(ip_ra_lock);
+ sock_put(ra->saved_sk);
+ kfree(ra);
+}
-int ip_ra_control(struct sock *sk, unsigned char on, void (*destructor)(struct sock *))
+int ip_ra_control(struct sock *sk, unsigned char on,
+ void (*destructor)(struct sock *))
{
- struct ip_ra_chain *ra, *new_ra, **rap;
+ struct ip_ra_chain *ra, *new_ra;
+ struct ip_ra_chain __rcu **rap;
+ struct net *net = sock_net(sk);
- if (sk->sk_type != SOCK_RAW || inet_sk(sk)->num == IPPROTO_RAW)
+ if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num == IPPROTO_RAW)
return -EINVAL;
new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL;
-
- write_lock_bh(&ip_ra_lock);
- for (rap = &ip_ra_chain; (ra=*rap) != NULL; rap = &ra->next) {
+ if (on && !new_ra)
+ return -ENOMEM;
+
+ mutex_lock(&net->ipv4.ra_mutex);
+ for (rap = &net->ipv4.ra_chain;
+ (ra = rcu_dereference_protected(*rap,
+ lockdep_is_held(&net->ipv4.ra_mutex))) != NULL;
+ rap = &ra->next) {
if (ra->sk == sk) {
if (on) {
- write_unlock_bh(&ip_ra_lock);
+ mutex_unlock(&net->ipv4.ra_mutex);
kfree(new_ra);
return -EADDRINUSE;
}
- *rap = ra->next;
- write_unlock_bh(&ip_ra_lock);
+ /* dont let ip_call_ra_chain() use sk again */
+ ra->sk = NULL;
+ RCU_INIT_POINTER(*rap, ra->next);
+ mutex_unlock(&net->ipv4.ra_mutex);
if (ra->destructor)
ra->destructor(sk);
- sock_put(sk);
- kfree(ra);
+ /*
+ * Delay sock_put(sk) and kfree(ra) after one rcu grace
+ * period. This guarantee ip_call_ra_chain() dont need
+ * to mess with socket refcounts.
+ */
+ ra->saved_sk = sk;
+ call_rcu(&ra->rcu, ip_ra_destroy_rcu);
return 0;
}
}
- if (new_ra == NULL) {
- write_unlock_bh(&ip_ra_lock);
+ if (!new_ra) {
+ mutex_unlock(&net->ipv4.ra_mutex);
return -ENOBUFS;
}
new_ra->sk = sk;
new_ra->destructor = destructor;
- new_ra->next = ra;
- *rap = new_ra;
+ RCU_INIT_POINTER(new_ra->next, ra);
+ rcu_assign_pointer(*rap, new_ra);
sock_hold(sk);
- write_unlock_bh(&ip_ra_lock);
+ mutex_unlock(&net->ipv4.ra_mutex);
return 0;
}
-void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err,
+static void ipv4_icmp_error_rfc4884(const struct sk_buff *skb,
+ struct sock_ee_data_rfc4884 *out)
+{
+ switch (icmp_hdr(skb)->type) {
+ case ICMP_DEST_UNREACH:
+ case ICMP_TIME_EXCEEDED:
+ case ICMP_PARAMETERPROB:
+ ip_icmp_error_rfc4884(skb, out, sizeof(struct icmphdr),
+ icmp_hdr(skb)->un.reserved[1] * 4);
+ }
+}
+
+void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err,
__be16 port, u32 info, u8 *payload)
{
- struct inet_sock *inet = inet_sk(sk);
struct sock_exterr_skb *serr;
- if (!inet->recverr)
- return;
-
skb = skb_clone(skb, GFP_ATOMIC);
if (!skb)
return;
- serr = SKB_EXT_ERR(skb);
+ serr = SKB_EXT_ERR(skb);
serr->ee.ee_errno = err;
serr->ee.ee_origin = SO_EE_ORIGIN_ICMP;
- serr->ee.ee_type = skb->h.icmph->type;
- serr->ee.ee_code = skb->h.icmph->code;
+ serr->ee.ee_type = icmp_hdr(skb)->type;
+ serr->ee.ee_code = icmp_hdr(skb)->code;
serr->ee.ee_pad = 0;
serr->ee.ee_info = info;
serr->ee.ee_data = 0;
- serr->addr_offset = (u8*)&(((struct iphdr*)(skb->h.icmph+1))->daddr) - skb->nh.raw;
+ serr->addr_offset = (u8 *)&(((struct iphdr *)(icmp_hdr(skb) + 1))->daddr) -
+ skb_network_header(skb);
serr->port = port;
- skb->h.raw = payload;
- if (!skb_pull(skb, payload - skb->data) ||
- sock_queue_err_skb(sk, skb))
- kfree_skb(skb);
+ if (skb_pull(skb, payload - skb->data)) {
+ if (inet_test_bit(RECVERR_RFC4884, sk))
+ ipv4_icmp_error_rfc4884(skb, &serr->ee.ee_rfc4884);
+
+ skb_reset_transport_header(skb);
+ if (sock_queue_err_skb(sk, skb) == 0)
+ return;
+ }
+ kfree_skb(skb);
}
+EXPORT_SYMBOL_GPL(ip_icmp_error);
void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 port, u32 info)
{
- struct inet_sock *inet = inet_sk(sk);
struct sock_exterr_skb *serr;
struct iphdr *iph;
struct sk_buff *skb;
- if (!inet->recverr)
+ if (!inet_test_bit(RECVERR, sk))
return;
skb = alloc_skb(sizeof(struct iphdr), GFP_ATOMIC);
if (!skb)
return;
- iph = (struct iphdr*)skb_put(skb, sizeof(struct iphdr));
- skb->nh.iph = iph;
+ skb_put(skb, sizeof(struct iphdr));
+ skb_reset_network_header(skb);
+ iph = ip_hdr(skb);
iph->daddr = daddr;
- serr = SKB_EXT_ERR(skb);
+ serr = SKB_EXT_ERR(skb);
serr->ee.ee_errno = err;
serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL;
- serr->ee.ee_type = 0;
+ serr->ee.ee_type = 0;
serr->ee.ee_code = 0;
serr->ee.ee_pad = 0;
serr->ee.ee_info = info;
serr->ee.ee_data = 0;
- serr->addr_offset = (u8*)&iph->daddr - skb->nh.raw;
+ serr->addr_offset = (u8 *)&iph->daddr - skb_network_header(skb);
serr->port = port;
- skb->h.raw = skb->tail;
- __skb_pull(skb, skb->tail - skb->data);
+ __skb_pull(skb, skb_tail_pointer(skb) - skb->data);
+ skb_reset_transport_header(skb);
if (sock_queue_err_skb(sk, skb))
kfree_skb(skb);
}
-/*
+/* For some errors we have valid addr_offset even with zero payload and
+ * zero port. Also, addr_offset should be supported if port is set.
+ */
+static inline bool ipv4_datagram_support_addr(struct sock_exterr_skb *serr)
+{
+ return serr->ee.ee_origin == SO_EE_ORIGIN_ICMP ||
+ serr->ee.ee_origin == SO_EE_ORIGIN_LOCAL || serr->port;
+}
+
+/* IPv4 supports cmsg on all imcp errors and some timestamps
+ *
+ * Timestamp code paths do not initialize the fields expected by cmsg:
+ * the PKTINFO fields in skb->cb[]. Fill those in here.
+ */
+static bool ipv4_datagram_support_cmsg(const struct sock *sk,
+ struct sk_buff *skb,
+ int ee_origin)
+{
+ struct in_pktinfo *info;
+
+ if (ee_origin == SO_EE_ORIGIN_ICMP)
+ return true;
+
+ if (ee_origin == SO_EE_ORIGIN_LOCAL)
+ return false;
+
+ /* Support IP_PKTINFO on tstamp packets if requested, to correlate
+ * timestamp with egress dev. Not possible for packets without iif
+ * or without payload (SOF_TIMESTAMPING_OPT_TSONLY).
+ */
+ info = PKTINFO_SKB_CB(skb);
+ if (!(READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_CMSG) ||
+ !info->ipi_ifindex)
+ return false;
+
+ info->ipi_spec_dst.s_addr = ip_hdr(skb)->saddr;
+ return true;
+}
+
+/*
* Handle MSG_ERRQUEUE
*/
-int ip_recv_error(struct sock *sk, struct msghdr *msg, int len)
+int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
{
struct sock_exterr_skb *serr;
- struct sk_buff *skb, *skb2;
- struct sockaddr_in *sin;
+ struct sk_buff *skb;
+ DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
struct {
struct sock_extended_err ee;
struct sockaddr_in offender;
@@ -335,8 +533,8 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len)
int copied;
err = -EAGAIN;
- skb = skb_dequeue(&sk->sk_error_queue);
- if (skb == NULL)
+ skb = sock_dequeue_err_skb(sk);
+ if (!skb)
goto out;
copied = skb->len;
@@ -344,33 +542,32 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len)
msg->msg_flags |= MSG_TRUNC;
copied = len;
}
- err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
- if (err)
- goto out_free_skb;
-
+ err = skb_copy_datagram_msg(skb, 0, msg, copied);
+ if (unlikely(err)) {
+ kfree_skb(skb);
+ return err;
+ }
sock_recv_timestamp(msg, sk, skb);
serr = SKB_EXT_ERR(skb);
- sin = (struct sockaddr_in *)msg->msg_name;
- if (sin) {
+ if (sin && ipv4_datagram_support_addr(serr)) {
sin->sin_family = AF_INET;
- sin->sin_addr.s_addr = *(__be32*)(skb->nh.raw + serr->addr_offset);
+ sin->sin_addr.s_addr = *(__be32 *)(skb_network_header(skb) +
+ serr->addr_offset);
sin->sin_port = serr->port;
memset(&sin->sin_zero, 0, sizeof(sin->sin_zero));
+ *addr_len = sizeof(*sin);
}
memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err));
sin = &errhdr.offender;
- sin->sin_family = AF_UNSPEC;
- if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP) {
- struct inet_sock *inet = inet_sk(sk);
+ memset(sin, 0, sizeof(*sin));
+ if (ipv4_datagram_support_cmsg(sk, skb, serr->ee.ee_origin)) {
sin->sin_family = AF_INET;
- sin->sin_addr.s_addr = skb->nh.iph->saddr;
- sin->sin_port = 0;
- memset(&sin->sin_zero, 0, sizeof(sin->sin_zero));
- if (inet->cmsg_flags)
+ sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
+ if (inet_cmsg_flags(inet_sk(sk)))
ip_cmsg_recv(msg, skb);
}
@@ -381,50 +578,359 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len)
msg->msg_flags |= MSG_ERRQUEUE;
err = copied;
- /* Reset and regenerate socket error */
- spin_lock_bh(&sk->sk_error_queue.lock);
- sk->sk_err = 0;
- if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) {
- sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
- spin_unlock_bh(&sk->sk_error_queue.lock);
- sk->sk_error_report(sk);
- } else
- spin_unlock_bh(&sk->sk_error_queue.lock);
-
-out_free_skb:
- kfree_skb(skb);
+ consume_skb(skb);
out:
return err;
}
+void __ip_sock_set_tos(struct sock *sk, int val)
+{
+ u8 old_tos = inet_sk(sk)->tos;
+
+ if (sk->sk_type == SOCK_STREAM) {
+ val &= ~INET_ECN_MASK;
+ val |= old_tos & INET_ECN_MASK;
+ }
+ if (old_tos != val) {
+ WRITE_ONCE(inet_sk(sk)->tos, val);
+ WRITE_ONCE(sk->sk_priority, rt_tos2priority(val));
+ sk_dst_reset(sk);
+ }
+}
+
+void ip_sock_set_tos(struct sock *sk, int val)
+{
+ sockopt_lock_sock(sk);
+ __ip_sock_set_tos(sk, val);
+ sockopt_release_sock(sk);
+}
+EXPORT_SYMBOL(ip_sock_set_tos);
+
+void ip_sock_set_freebind(struct sock *sk)
+{
+ inet_set_bit(FREEBIND, sk);
+}
+EXPORT_SYMBOL(ip_sock_set_freebind);
+
+void ip_sock_set_recverr(struct sock *sk)
+{
+ inet_set_bit(RECVERR, sk);
+}
+EXPORT_SYMBOL(ip_sock_set_recverr);
+
+int ip_sock_set_mtu_discover(struct sock *sk, int val)
+{
+ if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_OMIT)
+ return -EINVAL;
+ WRITE_ONCE(inet_sk(sk)->pmtudisc, val);
+ return 0;
+}
+EXPORT_SYMBOL(ip_sock_set_mtu_discover);
+
+void ip_sock_set_pktinfo(struct sock *sk)
+{
+ inet_set_bit(PKTINFO, sk);
+}
+EXPORT_SYMBOL(ip_sock_set_pktinfo);
/*
- * Socket option code for IP. This is the end of the line after any TCP,UDP etc options on
- * an IP socket.
+ * Socket option code for IP. This is the end of the line after any
+ * TCP,UDP etc options on an IP socket.
*/
+static bool setsockopt_needs_rtnl(int optname)
+{
+ switch (optname) {
+ case IP_ADD_MEMBERSHIP:
+ case IP_ADD_SOURCE_MEMBERSHIP:
+ case IP_BLOCK_SOURCE:
+ case IP_DROP_MEMBERSHIP:
+ case IP_DROP_SOURCE_MEMBERSHIP:
+ case IP_MSFILTER:
+ case IP_UNBLOCK_SOURCE:
+ case MCAST_BLOCK_SOURCE:
+ case MCAST_MSFILTER:
+ case MCAST_JOIN_GROUP:
+ case MCAST_JOIN_SOURCE_GROUP:
+ case MCAST_LEAVE_GROUP:
+ case MCAST_LEAVE_SOURCE_GROUP:
+ case MCAST_UNBLOCK_SOURCE:
+ return true;
+ }
+ return false;
+}
+
+static int set_mcast_msfilter(struct sock *sk, int ifindex,
+ int numsrc, int fmode,
+ struct sockaddr_storage *group,
+ struct sockaddr_storage *list)
+{
+ struct ip_msfilter *msf;
+ struct sockaddr_in *psin;
+ int err, i;
+
+ msf = kmalloc(IP_MSFILTER_SIZE(numsrc), GFP_KERNEL);
+ if (!msf)
+ return -ENOBUFS;
+
+ psin = (struct sockaddr_in *)group;
+ if (psin->sin_family != AF_INET)
+ goto Eaddrnotavail;
+ msf->imsf_multiaddr = psin->sin_addr.s_addr;
+ msf->imsf_interface = 0;
+ msf->imsf_fmode = fmode;
+ msf->imsf_numsrc = numsrc;
+ for (i = 0; i < numsrc; ++i) {
+ psin = (struct sockaddr_in *)&list[i];
+
+ if (psin->sin_family != AF_INET)
+ goto Eaddrnotavail;
+ msf->imsf_slist_flex[i] = psin->sin_addr.s_addr;
+ }
+ err = ip_mc_msfilter(sk, msf, ifindex);
+ kfree(msf);
+ return err;
+
+Eaddrnotavail:
+ kfree(msf);
+ return -EADDRNOTAVAIL;
+}
+
+static int copy_group_source_from_sockptr(struct group_source_req *greqs,
+ sockptr_t optval, int optlen)
+{
+ if (in_compat_syscall()) {
+ struct compat_group_source_req gr32;
+
+ if (optlen != sizeof(gr32))
+ return -EINVAL;
+ if (copy_from_sockptr(&gr32, optval, sizeof(gr32)))
+ return -EFAULT;
+ greqs->gsr_interface = gr32.gsr_interface;
+ greqs->gsr_group = gr32.gsr_group;
+ greqs->gsr_source = gr32.gsr_source;
+ } else {
+ if (optlen != sizeof(*greqs))
+ return -EINVAL;
+ if (copy_from_sockptr(greqs, optval, sizeof(*greqs)))
+ return -EFAULT;
+ }
+
+ return 0;
+}
-static int do_ip_setsockopt(struct sock *sk, int level,
- int optname, char __user *optval, int optlen)
+static int do_mcast_group_source(struct sock *sk, int optname,
+ sockptr_t optval, int optlen)
+{
+ struct group_source_req greqs;
+ struct ip_mreq_source mreqs;
+ struct sockaddr_in *psin;
+ int omode, add, err;
+
+ err = copy_group_source_from_sockptr(&greqs, optval, optlen);
+ if (err)
+ return err;
+
+ if (greqs.gsr_group.ss_family != AF_INET ||
+ greqs.gsr_source.ss_family != AF_INET)
+ return -EADDRNOTAVAIL;
+
+ psin = (struct sockaddr_in *)&greqs.gsr_group;
+ mreqs.imr_multiaddr = psin->sin_addr.s_addr;
+ psin = (struct sockaddr_in *)&greqs.gsr_source;
+ mreqs.imr_sourceaddr = psin->sin_addr.s_addr;
+ mreqs.imr_interface = 0; /* use index for mc_source */
+
+ if (optname == MCAST_BLOCK_SOURCE) {
+ omode = MCAST_EXCLUDE;
+ add = 1;
+ } else if (optname == MCAST_UNBLOCK_SOURCE) {
+ omode = MCAST_EXCLUDE;
+ add = 0;
+ } else if (optname == MCAST_JOIN_SOURCE_GROUP) {
+ struct ip_mreqn mreq;
+
+ psin = (struct sockaddr_in *)&greqs.gsr_group;
+ mreq.imr_multiaddr = psin->sin_addr;
+ mreq.imr_address.s_addr = 0;
+ mreq.imr_ifindex = greqs.gsr_interface;
+ err = ip_mc_join_group_ssm(sk, &mreq, MCAST_INCLUDE);
+ if (err && err != -EADDRINUSE)
+ return err;
+ greqs.gsr_interface = mreq.imr_ifindex;
+ omode = MCAST_INCLUDE;
+ add = 1;
+ } else /* MCAST_LEAVE_SOURCE_GROUP */ {
+ omode = MCAST_INCLUDE;
+ add = 0;
+ }
+ return ip_mc_source(add, omode, sk, &mreqs, greqs.gsr_interface);
+}
+
+static int ip_set_mcast_msfilter(struct sock *sk, sockptr_t optval, int optlen)
+{
+ struct group_filter *gsf = NULL;
+ int err;
+
+ if (optlen < GROUP_FILTER_SIZE(0))
+ return -EINVAL;
+ if (optlen > READ_ONCE(sock_net(sk)->core.sysctl_optmem_max))
+ return -ENOBUFS;
+
+ gsf = memdup_sockptr(optval, optlen);
+ if (IS_ERR(gsf))
+ return PTR_ERR(gsf);
+
+ /* numsrc >= (4G-140)/128 overflow in 32 bits */
+ err = -ENOBUFS;
+ if (gsf->gf_numsrc >= 0x1ffffff ||
+ gsf->gf_numsrc > READ_ONCE(sock_net(sk)->ipv4.sysctl_igmp_max_msf))
+ goto out_free_gsf;
+
+ err = -EINVAL;
+ if (GROUP_FILTER_SIZE(gsf->gf_numsrc) > optlen)
+ goto out_free_gsf;
+
+ err = set_mcast_msfilter(sk, gsf->gf_interface, gsf->gf_numsrc,
+ gsf->gf_fmode, &gsf->gf_group,
+ gsf->gf_slist_flex);
+out_free_gsf:
+ kfree(gsf);
+ return err;
+}
+
+static int compat_ip_set_mcast_msfilter(struct sock *sk, sockptr_t optval,
+ int optlen)
+{
+ const int size0 = offsetof(struct compat_group_filter, gf_slist_flex);
+ struct compat_group_filter *gf32;
+ unsigned int n;
+ void *p;
+ int err;
+
+ if (optlen < size0)
+ return -EINVAL;
+ if (optlen > READ_ONCE(sock_net(sk)->core.sysctl_optmem_max) - 4)
+ return -ENOBUFS;
+
+ p = kmalloc(optlen + 4, GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+ gf32 = p + 4; /* we want ->gf_group and ->gf_slist_flex aligned */
+
+ err = -EFAULT;
+ if (copy_from_sockptr(gf32, optval, optlen))
+ goto out_free_gsf;
+
+ /* numsrc >= (4G-140)/128 overflow in 32 bits */
+ n = gf32->gf_numsrc;
+ err = -ENOBUFS;
+ if (n >= 0x1ffffff)
+ goto out_free_gsf;
+
+ err = -EINVAL;
+ if (offsetof(struct compat_group_filter, gf_slist_flex[n]) > optlen)
+ goto out_free_gsf;
+
+ /* numsrc >= (4G-140)/128 overflow in 32 bits */
+ err = -ENOBUFS;
+ if (n > READ_ONCE(sock_net(sk)->ipv4.sysctl_igmp_max_msf))
+ goto out_free_gsf;
+ err = set_mcast_msfilter(sk, gf32->gf_interface, n, gf32->gf_fmode,
+ &gf32->gf_group, gf32->gf_slist_flex);
+out_free_gsf:
+ kfree(p);
+ return err;
+}
+
+static int ip_mcast_join_leave(struct sock *sk, int optname,
+ sockptr_t optval, int optlen)
+{
+ struct ip_mreqn mreq = { };
+ struct sockaddr_in *psin;
+ struct group_req greq;
+
+ if (optlen < sizeof(struct group_req))
+ return -EINVAL;
+ if (copy_from_sockptr(&greq, optval, sizeof(greq)))
+ return -EFAULT;
+
+ psin = (struct sockaddr_in *)&greq.gr_group;
+ if (psin->sin_family != AF_INET)
+ return -EINVAL;
+ mreq.imr_multiaddr = psin->sin_addr;
+ mreq.imr_ifindex = greq.gr_interface;
+ if (optname == MCAST_JOIN_GROUP)
+ return ip_mc_join_group(sk, &mreq);
+ return ip_mc_leave_group(sk, &mreq);
+}
+
+static int compat_ip_mcast_join_leave(struct sock *sk, int optname,
+ sockptr_t optval, int optlen)
+{
+ struct compat_group_req greq;
+ struct ip_mreqn mreq = { };
+ struct sockaddr_in *psin;
+
+ if (optlen < sizeof(struct compat_group_req))
+ return -EINVAL;
+ if (copy_from_sockptr(&greq, optval, sizeof(greq)))
+ return -EFAULT;
+
+ psin = (struct sockaddr_in *)&greq.gr_group;
+ if (psin->sin_family != AF_INET)
+ return -EINVAL;
+ mreq.imr_multiaddr = psin->sin_addr;
+ mreq.imr_ifindex = greq.gr_interface;
+
+ if (optname == MCAST_JOIN_GROUP)
+ return ip_mc_join_group(sk, &mreq);
+ return ip_mc_leave_group(sk, &mreq);
+}
+
+DEFINE_STATIC_KEY_FALSE(ip4_min_ttl);
+
+int do_ip_setsockopt(struct sock *sk, int level, int optname,
+ sockptr_t optval, unsigned int optlen)
{
struct inet_sock *inet = inet_sk(sk);
- int val=0,err;
-
- if (((1<<optname) & ((1<<IP_PKTINFO) | (1<<IP_RECVTTL) |
- (1<<IP_RECVOPTS) | (1<<IP_RECVTOS) |
- (1<<IP_RETOPTS) | (1<<IP_TOS) |
- (1<<IP_TTL) | (1<<IP_HDRINCL) |
- (1<<IP_MTU_DISCOVER) | (1<<IP_RECVERR) |
- (1<<IP_ROUTER_ALERT) | (1<<IP_FREEBIND) |
- (1<<IP_PASSSEC))) ||
- optname == IP_MULTICAST_TTL ||
- optname == IP_MULTICAST_LOOP) {
+ struct net *net = sock_net(sk);
+ int val = 0, err, retv;
+ bool needs_rtnl = setsockopt_needs_rtnl(optname);
+
+ switch (optname) {
+ case IP_PKTINFO:
+ case IP_RECVTTL:
+ case IP_RECVOPTS:
+ case IP_RECVTOS:
+ case IP_RETOPTS:
+ case IP_TOS:
+ case IP_TTL:
+ case IP_HDRINCL:
+ case IP_MTU_DISCOVER:
+ case IP_RECVERR:
+ case IP_ROUTER_ALERT:
+ case IP_FREEBIND:
+ case IP_PASSSEC:
+ case IP_TRANSPARENT:
+ case IP_MINTTL:
+ case IP_NODEFRAG:
+ case IP_BIND_ADDRESS_NO_PORT:
+ case IP_UNICAST_IF:
+ case IP_MULTICAST_TTL:
+ case IP_MULTICAST_ALL:
+ case IP_MULTICAST_LOOP:
+ case IP_RECVORIGDSTADDR:
+ case IP_CHECKSUM:
+ case IP_RECVFRAGSIZE:
+ case IP_RECVERR_RFC4884:
+ case IP_LOCAL_PORT_RANGE:
if (optlen >= sizeof(int)) {
- if (get_user(val, (int __user *) optval))
+ if (copy_from_sockptr(&val, optval, sizeof(val)))
return -EFAULT;
} else if (optlen >= sizeof(char)) {
unsigned char ucval;
- if (get_user(ucval, (unsigned char __user *) optval))
+ if (copy_from_sockptr(&ucval, optval, sizeof(ucval)))
return -EFAULT;
val = (int) ucval;
}
@@ -432,464 +938,476 @@ static int do_ip_setsockopt(struct sock *sk, int level,
/* If optlen==0, it is equivalent to val == 0 */
-#ifdef CONFIG_IP_MROUTE
- if (optname >= MRT_BASE && optname <= (MRT_BASE + 10))
- return ip_mroute_setsockopt(sk,optname,optval,optlen);
-#endif
+ if (optname == IP_ROUTER_ALERT) {
+ retv = ip_ra_control(sk, val ? 1 : 0, NULL);
+ if (retv == 0)
+ inet_assign_bit(RTALERT, sk, val);
+ return retv;
+ }
+ if (ip_mroute_opt(optname))
+ return ip_mroute_setsockopt(sk, optname, optval, optlen);
+
+ /* Handle options that can be set without locking the socket. */
+ switch (optname) {
+ case IP_PKTINFO:
+ inet_assign_bit(PKTINFO, sk, val);
+ return 0;
+ case IP_RECVTTL:
+ inet_assign_bit(TTL, sk, val);
+ return 0;
+ case IP_RECVTOS:
+ inet_assign_bit(TOS, sk, val);
+ return 0;
+ case IP_RECVOPTS:
+ inet_assign_bit(RECVOPTS, sk, val);
+ return 0;
+ case IP_RETOPTS:
+ inet_assign_bit(RETOPTS, sk, val);
+ return 0;
+ case IP_PASSSEC:
+ inet_assign_bit(PASSSEC, sk, val);
+ return 0;
+ case IP_RECVORIGDSTADDR:
+ inet_assign_bit(ORIGDSTADDR, sk, val);
+ return 0;
+ case IP_RECVFRAGSIZE:
+ if (sk->sk_type != SOCK_RAW && sk->sk_type != SOCK_DGRAM)
+ return -EINVAL;
+ inet_assign_bit(RECVFRAGSIZE, sk, val);
+ return 0;
+ case IP_RECVERR:
+ inet_assign_bit(RECVERR, sk, val);
+ if (!val)
+ skb_errqueue_purge(&sk->sk_error_queue);
+ return 0;
+ case IP_RECVERR_RFC4884:
+ if (val < 0 || val > 1)
+ return -EINVAL;
+ inet_assign_bit(RECVERR_RFC4884, sk, val);
+ return 0;
+ case IP_FREEBIND:
+ if (optlen < 1)
+ return -EINVAL;
+ inet_assign_bit(FREEBIND, sk, val);
+ return 0;
+ case IP_HDRINCL:
+ if (sk->sk_type != SOCK_RAW)
+ return -ENOPROTOOPT;
+ inet_assign_bit(HDRINCL, sk, val);
+ return 0;
+ case IP_MULTICAST_LOOP:
+ if (optlen < 1)
+ return -EINVAL;
+ inet_assign_bit(MC_LOOP, sk, val);
+ return 0;
+ case IP_MULTICAST_ALL:
+ if (optlen < 1)
+ return -EINVAL;
+ if (val != 0 && val != 1)
+ return -EINVAL;
+ inet_assign_bit(MC_ALL, sk, val);
+ return 0;
+ case IP_TRANSPARENT:
+ if (!!val && !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
+ !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+ if (optlen < 1)
+ return -EINVAL;
+ inet_assign_bit(TRANSPARENT, sk, val);
+ return 0;
+ case IP_NODEFRAG:
+ if (sk->sk_type != SOCK_RAW)
+ return -ENOPROTOOPT;
+ inet_assign_bit(NODEFRAG, sk, val);
+ return 0;
+ case IP_BIND_ADDRESS_NO_PORT:
+ inet_assign_bit(BIND_ADDRESS_NO_PORT, sk, val);
+ return 0;
+ case IP_TTL:
+ if (optlen < 1)
+ return -EINVAL;
+ if (val != -1 && (val < 1 || val > 255))
+ return -EINVAL;
+ WRITE_ONCE(inet->uc_ttl, val);
+ return 0;
+ case IP_MINTTL:
+ if (optlen < 1)
+ return -EINVAL;
+ if (val < 0 || val > 255)
+ return -EINVAL;
+
+ if (val)
+ static_branch_enable(&ip4_min_ttl);
+
+ WRITE_ONCE(inet->min_ttl, val);
+ return 0;
+ case IP_MULTICAST_TTL:
+ if (sk->sk_type == SOCK_STREAM)
+ return -EINVAL;
+ if (optlen < 1)
+ return -EINVAL;
+ if (val == -1)
+ val = 1;
+ if (val < 0 || val > 255)
+ return -EINVAL;
+ WRITE_ONCE(inet->mc_ttl, val);
+ return 0;
+ case IP_MTU_DISCOVER:
+ return ip_sock_set_mtu_discover(sk, val);
+ case IP_TOS: /* This sets both TOS and Precedence */
+ ip_sock_set_tos(sk, val);
+ return 0;
+ case IP_LOCAL_PORT_RANGE:
+ {
+ u16 lo = val;
+ u16 hi = val >> 16;
+
+ if (optlen != sizeof(u32))
+ return -EINVAL;
+ if (lo != 0 && hi != 0 && lo > hi)
+ return -EINVAL;
+
+ WRITE_ONCE(inet->local_port_range, val);
+ return 0;
+ }
+ }
err = 0;
- lock_sock(sk);
+ if (needs_rtnl)
+ rtnl_lock();
+ sockopt_lock_sock(sk);
switch (optname) {
- case IP_OPTIONS:
- {
- struct ip_options * opt = NULL;
- if (optlen > 40 || optlen < 0)
- goto e_inval;
- err = ip_options_get_from_user(&opt, optval, optlen);
- if (err)
- break;
- if (inet->is_icsk) {
- struct inet_connection_sock *icsk = inet_csk(sk);
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
- if (sk->sk_family == PF_INET ||
- (!((1 << sk->sk_state) &
- (TCPF_LISTEN | TCPF_CLOSE)) &&
- inet->daddr != LOOPBACK4_IPV6)) {
-#endif
- if (inet->opt)
- icsk->icsk_ext_hdr_len -= inet->opt->optlen;
- if (opt)
- icsk->icsk_ext_hdr_len += opt->optlen;
- icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie);
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
- }
+ case IP_OPTIONS:
+ {
+ struct ip_options_rcu *old, *opt = NULL;
+
+ if (optlen > 40)
+ goto e_inval;
+ err = ip_options_get(sock_net(sk), &opt, optval, optlen);
+ if (err)
+ break;
+ old = rcu_dereference_protected(inet->inet_opt,
+ lockdep_sock_is_held(sk));
+ if (inet_test_bit(IS_ICSK, sk)) {
+ struct inet_connection_sock *icsk = inet_csk(sk);
+#if IS_ENABLED(CONFIG_IPV6)
+ if (sk->sk_family == PF_INET ||
+ (!((1 << sk->sk_state) &
+ (TCPF_LISTEN | TCPF_CLOSE)) &&
+ inet->inet_daddr != LOOPBACK4_IPV6)) {
#endif
+ if (old)
+ icsk->icsk_ext_hdr_len -= old->opt.optlen;
+ if (opt)
+ icsk->icsk_ext_hdr_len += opt->opt.optlen;
+ icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie);
+#if IS_ENABLED(CONFIG_IPV6)
}
- opt = xchg(&inet->opt, opt);
- kfree(opt);
- break;
+#endif
}
- case IP_PKTINFO:
- if (val)
- inet->cmsg_flags |= IP_CMSG_PKTINFO;
- else
- inet->cmsg_flags &= ~IP_CMSG_PKTINFO;
- break;
- case IP_RECVTTL:
- if (val)
- inet->cmsg_flags |= IP_CMSG_TTL;
- else
- inet->cmsg_flags &= ~IP_CMSG_TTL;
- break;
- case IP_RECVTOS:
- if (val)
- inet->cmsg_flags |= IP_CMSG_TOS;
- else
- inet->cmsg_flags &= ~IP_CMSG_TOS;
- break;
- case IP_RECVOPTS:
- if (val)
- inet->cmsg_flags |= IP_CMSG_RECVOPTS;
- else
- inet->cmsg_flags &= ~IP_CMSG_RECVOPTS;
- break;
- case IP_RETOPTS:
- if (val)
- inet->cmsg_flags |= IP_CMSG_RETOPTS;
- else
- inet->cmsg_flags &= ~IP_CMSG_RETOPTS;
- break;
- case IP_PASSSEC:
- if (val)
- inet->cmsg_flags |= IP_CMSG_PASSSEC;
- else
- inet->cmsg_flags &= ~IP_CMSG_PASSSEC;
- break;
- case IP_TOS: /* This sets both TOS and Precedence */
- if (sk->sk_type == SOCK_STREAM) {
- val &= ~3;
- val |= inet->tos & 3;
- }
- if (IPTOS_PREC(val) >= IPTOS_PREC_CRITIC_ECP &&
- !capable(CAP_NET_ADMIN)) {
- err = -EPERM;
- break;
+ rcu_assign_pointer(inet->inet_opt, opt);
+ if (old)
+ kfree_rcu(old, rcu);
+ break;
+ }
+ case IP_CHECKSUM:
+ if (val) {
+ if (!(inet_test_bit(CHECKSUM, sk))) {
+ inet_inc_convert_csum(sk);
+ inet_set_bit(CHECKSUM, sk);
}
- if (inet->tos != val) {
- inet->tos = val;
- sk->sk_priority = rt_tos2priority(val);
- sk_dst_reset(sk);
+ } else {
+ if (inet_test_bit(CHECKSUM, sk)) {
+ inet_dec_convert_csum(sk);
+ inet_clear_bit(CHECKSUM, sk);
}
+ }
+ break;
+ case IP_UNICAST_IF:
+ {
+ struct net_device *dev = NULL;
+ int ifindex;
+ int midx;
+
+ if (optlen != sizeof(int))
+ goto e_inval;
+
+ ifindex = (__force int)ntohl((__force __be32)val);
+ if (ifindex == 0) {
+ WRITE_ONCE(inet->uc_index, 0);
+ err = 0;
break;
- case IP_TTL:
- if (optlen<1)
- goto e_inval;
- if (val != -1 && (val < 1 || val>255))
- goto e_inval;
- inet->uc_ttl = val;
- break;
- case IP_HDRINCL:
- if (sk->sk_type != SOCK_RAW) {
- err = -ENOPROTOOPT;
- break;
- }
- inet->hdrincl = val ? 1 : 0;
- break;
- case IP_MTU_DISCOVER:
- if (val<0 || val>2)
- goto e_inval;
- inet->pmtudisc = val;
- break;
- case IP_RECVERR:
- inet->recverr = !!val;
- if (!val)
- skb_queue_purge(&sk->sk_error_queue);
+ }
+
+ dev = dev_get_by_index(sock_net(sk), ifindex);
+ err = -EADDRNOTAVAIL;
+ if (!dev)
break;
- case IP_MULTICAST_TTL:
- if (sk->sk_type == SOCK_STREAM)
- goto e_inval;
- if (optlen<1)
- goto e_inval;
- if (val==-1)
- val = 1;
- if (val < 0 || val > 255)
- goto e_inval;
- inet->mc_ttl = val;
- break;
- case IP_MULTICAST_LOOP:
- if (optlen<1)
- goto e_inval;
- inet->mc_loop = !!val;
- break;
- case IP_MULTICAST_IF:
- {
- struct ip_mreqn mreq;
- struct net_device *dev = NULL;
- if (sk->sk_type == SOCK_STREAM)
- goto e_inval;
- /*
- * Check the arguments are allowable
- */
+ midx = l3mdev_master_ifindex(dev);
+ dev_put(dev);
- err = -EFAULT;
- if (optlen >= sizeof(struct ip_mreqn)) {
- if (copy_from_user(&mreq,optval,sizeof(mreq)))
+ err = -EINVAL;
+ if (sk->sk_bound_dev_if && midx != sk->sk_bound_dev_if)
+ break;
+
+ WRITE_ONCE(inet->uc_index, ifindex);
+ err = 0;
+ break;
+ }
+ case IP_MULTICAST_IF:
+ {
+ struct ip_mreqn mreq;
+ struct net_device *dev = NULL;
+ int midx;
+
+ if (sk->sk_type == SOCK_STREAM)
+ goto e_inval;
+ /*
+ * Check the arguments are allowable
+ */
+
+ if (optlen < sizeof(struct in_addr))
+ goto e_inval;
+
+ err = -EFAULT;
+ if (optlen >= sizeof(struct ip_mreqn)) {
+ if (copy_from_sockptr(&mreq, optval, sizeof(mreq)))
+ break;
+ } else {
+ memset(&mreq, 0, sizeof(mreq));
+ if (optlen >= sizeof(struct ip_mreq)) {
+ if (copy_from_sockptr(&mreq, optval,
+ sizeof(struct ip_mreq)))
break;
- } else {
- memset(&mreq, 0, sizeof(mreq));
- if (optlen >= sizeof(struct in_addr) &&
- copy_from_user(&mreq.imr_address,optval,sizeof(struct in_addr)))
+ } else if (optlen >= sizeof(struct in_addr)) {
+ if (copy_from_sockptr(&mreq.imr_address, optval,
+ sizeof(struct in_addr)))
break;
}
+ }
- if (!mreq.imr_ifindex) {
- if (mreq.imr_address.s_addr == INADDR_ANY) {
- inet->mc_index = 0;
- inet->mc_addr = 0;
- err = 0;
- break;
- }
- dev = ip_dev_find(mreq.imr_address.s_addr);
- if (dev) {
- mreq.imr_ifindex = dev->ifindex;
- dev_put(dev);
- }
- } else
- dev = __dev_get_by_index(mreq.imr_ifindex);
-
-
- err = -EADDRNOTAVAIL;
- if (!dev)
+ if (!mreq.imr_ifindex) {
+ if (mreq.imr_address.s_addr == htonl(INADDR_ANY)) {
+ WRITE_ONCE(inet->mc_index, 0);
+ WRITE_ONCE(inet->mc_addr, 0);
+ err = 0;
break;
+ }
+ dev = ip_dev_find(sock_net(sk), mreq.imr_address.s_addr);
+ if (dev)
+ mreq.imr_ifindex = dev->ifindex;
+ } else
+ dev = dev_get_by_index(sock_net(sk), mreq.imr_ifindex);
- err = -EINVAL;
- if (sk->sk_bound_dev_if &&
- mreq.imr_ifindex != sk->sk_bound_dev_if)
- break;
- inet->mc_index = mreq.imr_ifindex;
- inet->mc_addr = mreq.imr_address.s_addr;
- err = 0;
+ err = -EADDRNOTAVAIL;
+ if (!dev)
break;
- }
- case IP_ADD_MEMBERSHIP:
- case IP_DROP_MEMBERSHIP:
- {
- struct ip_mreqn mreq;
+ midx = l3mdev_master_ifindex(dev);
- if (optlen < sizeof(struct ip_mreq))
- goto e_inval;
- err = -EFAULT;
- if (optlen >= sizeof(struct ip_mreqn)) {
- if(copy_from_user(&mreq,optval,sizeof(mreq)))
- break;
- } else {
- memset(&mreq, 0, sizeof(mreq));
- if (copy_from_user(&mreq,optval,sizeof(struct ip_mreq)))
- break;
- }
+ dev_put(dev);
- if (optname == IP_ADD_MEMBERSHIP)
- err = ip_mc_join_group(sk, &mreq);
- else
- err = ip_mc_leave_group(sk, &mreq);
+ err = -EINVAL;
+ if (sk->sk_bound_dev_if &&
+ mreq.imr_ifindex != sk->sk_bound_dev_if &&
+ midx != sk->sk_bound_dev_if)
break;
- }
- case IP_MSFILTER:
- {
- extern int sysctl_igmp_max_msf;
- struct ip_msfilter *msf;
- if (optlen < IP_MSFILTER_SIZE(0))
- goto e_inval;
- if (optlen > sysctl_optmem_max) {
- err = -ENOBUFS;
- break;
- }
- msf = kmalloc(optlen, GFP_KERNEL);
- if (msf == 0) {
- err = -ENOBUFS;
- break;
- }
- err = -EFAULT;
- if (copy_from_user(msf, optval, optlen)) {
- kfree(msf);
- break;
- }
- /* numsrc >= (1G-4) overflow in 32 bits */
- if (msf->imsf_numsrc >= 0x3ffffffcU ||
- msf->imsf_numsrc > sysctl_igmp_max_msf) {
- kfree(msf);
- err = -ENOBUFS;
- break;
- }
- if (IP_MSFILTER_SIZE(msf->imsf_numsrc) > optlen) {
- kfree(msf);
- err = -EINVAL;
- break;
- }
- err = ip_mc_msfilter(sk, msf, 0);
- kfree(msf);
- break;
- }
- case IP_BLOCK_SOURCE:
- case IP_UNBLOCK_SOURCE:
- case IP_ADD_SOURCE_MEMBERSHIP:
- case IP_DROP_SOURCE_MEMBERSHIP:
- {
- struct ip_mreq_source mreqs;
- int omode, add;
+ WRITE_ONCE(inet->mc_index, mreq.imr_ifindex);
+ WRITE_ONCE(inet->mc_addr, mreq.imr_address.s_addr);
+ err = 0;
+ break;
+ }
- if (optlen != sizeof(struct ip_mreq_source))
- goto e_inval;
- if (copy_from_user(&mreqs, optval, sizeof(mreqs))) {
- err = -EFAULT;
- break;
- }
- if (optname == IP_BLOCK_SOURCE) {
- omode = MCAST_EXCLUDE;
- add = 1;
- } else if (optname == IP_UNBLOCK_SOURCE) {
- omode = MCAST_EXCLUDE;
- add = 0;
- } else if (optname == IP_ADD_SOURCE_MEMBERSHIP) {
- struct ip_mreqn mreq;
-
- mreq.imr_multiaddr.s_addr = mreqs.imr_multiaddr;
- mreq.imr_address.s_addr = mreqs.imr_interface;
- mreq.imr_ifindex = 0;
- err = ip_mc_join_group(sk, &mreq);
- if (err && err != -EADDRINUSE)
- break;
- omode = MCAST_INCLUDE;
- add = 1;
- } else /* IP_DROP_SOURCE_MEMBERSHIP */ {
- omode = MCAST_INCLUDE;
- add = 0;
- }
- err = ip_mc_source(add, omode, sk, &mreqs, 0);
+ case IP_ADD_MEMBERSHIP:
+ case IP_DROP_MEMBERSHIP:
+ {
+ struct ip_mreqn mreq;
+
+ err = -EPROTO;
+ if (inet_test_bit(IS_ICSK, sk))
break;
- }
- case MCAST_JOIN_GROUP:
- case MCAST_LEAVE_GROUP:
- {
- struct group_req greq;
- struct sockaddr_in *psin;
- struct ip_mreqn mreq;
- if (optlen < sizeof(struct group_req))
- goto e_inval;
- err = -EFAULT;
- if(copy_from_user(&greq, optval, sizeof(greq)))
+ if (optlen < sizeof(struct ip_mreq))
+ goto e_inval;
+ err = -EFAULT;
+ if (optlen >= sizeof(struct ip_mreqn)) {
+ if (copy_from_sockptr(&mreq, optval, sizeof(mreq)))
break;
- psin = (struct sockaddr_in *)&greq.gr_group;
- if (psin->sin_family != AF_INET)
- goto e_inval;
+ } else {
memset(&mreq, 0, sizeof(mreq));
- mreq.imr_multiaddr = psin->sin_addr;
- mreq.imr_ifindex = greq.gr_interface;
+ if (copy_from_sockptr(&mreq, optval,
+ sizeof(struct ip_mreq)))
+ break;
+ }
- if (optname == MCAST_JOIN_GROUP)
- err = ip_mc_join_group(sk, &mreq);
- else
- err = ip_mc_leave_group(sk, &mreq);
+ if (optname == IP_ADD_MEMBERSHIP)
+ err = ip_mc_join_group(sk, &mreq);
+ else
+ err = ip_mc_leave_group(sk, &mreq);
+ break;
+ }
+ case IP_MSFILTER:
+ {
+ struct ip_msfilter *msf;
+
+ if (optlen < IP_MSFILTER_SIZE(0))
+ goto e_inval;
+ if (optlen > READ_ONCE(net->core.sysctl_optmem_max)) {
+ err = -ENOBUFS;
break;
}
- case MCAST_JOIN_SOURCE_GROUP:
- case MCAST_LEAVE_SOURCE_GROUP:
- case MCAST_BLOCK_SOURCE:
- case MCAST_UNBLOCK_SOURCE:
- {
- struct group_source_req greqs;
- struct ip_mreq_source mreqs;
- struct sockaddr_in *psin;
- int omode, add;
-
- if (optlen != sizeof(struct group_source_req))
- goto e_inval;
- if (copy_from_user(&greqs, optval, sizeof(greqs))) {
- err = -EFAULT;
- break;
- }
- if (greqs.gsr_group.ss_family != AF_INET ||
- greqs.gsr_source.ss_family != AF_INET) {
- err = -EADDRNOTAVAIL;
- break;
- }
- psin = (struct sockaddr_in *)&greqs.gsr_group;
- mreqs.imr_multiaddr = psin->sin_addr.s_addr;
- psin = (struct sockaddr_in *)&greqs.gsr_source;
- mreqs.imr_sourceaddr = psin->sin_addr.s_addr;
- mreqs.imr_interface = 0; /* use index for mc_source */
-
- if (optname == MCAST_BLOCK_SOURCE) {
- omode = MCAST_EXCLUDE;
- add = 1;
- } else if (optname == MCAST_UNBLOCK_SOURCE) {
- omode = MCAST_EXCLUDE;
- add = 0;
- } else if (optname == MCAST_JOIN_SOURCE_GROUP) {
- struct ip_mreqn mreq;
-
- psin = (struct sockaddr_in *)&greqs.gsr_group;
- mreq.imr_multiaddr = psin->sin_addr;
- mreq.imr_address.s_addr = 0;
- mreq.imr_ifindex = greqs.gsr_interface;
- err = ip_mc_join_group(sk, &mreq);
- if (err && err != -EADDRINUSE)
- break;
- greqs.gsr_interface = mreq.imr_ifindex;
- omode = MCAST_INCLUDE;
- add = 1;
- } else /* MCAST_LEAVE_SOURCE_GROUP */ {
- omode = MCAST_INCLUDE;
- add = 0;
- }
- err = ip_mc_source(add, omode, sk, &mreqs,
- greqs.gsr_interface);
+ msf = memdup_sockptr(optval, optlen);
+ if (IS_ERR(msf)) {
+ err = PTR_ERR(msf);
break;
}
- case MCAST_MSFILTER:
- {
- extern int sysctl_igmp_max_msf;
- struct sockaddr_in *psin;
- struct ip_msfilter *msf = NULL;
- struct group_filter *gsf = NULL;
- int msize, i, ifindex;
-
- if (optlen < GROUP_FILTER_SIZE(0))
- goto e_inval;
- if (optlen > sysctl_optmem_max) {
- err = -ENOBUFS;
- break;
- }
- gsf = kmalloc(optlen,GFP_KERNEL);
- if (gsf == 0) {
- err = -ENOBUFS;
- break;
- }
- err = -EFAULT;
- if (copy_from_user(gsf, optval, optlen)) {
- goto mc_msf_out;
- }
- /* numsrc >= (4G-140)/128 overflow in 32 bits */
- if (gsf->gf_numsrc >= 0x1ffffff ||
- gsf->gf_numsrc > sysctl_igmp_max_msf) {
- err = -ENOBUFS;
- goto mc_msf_out;
- }
- if (GROUP_FILTER_SIZE(gsf->gf_numsrc) > optlen) {
- err = -EINVAL;
- goto mc_msf_out;
- }
- msize = IP_MSFILTER_SIZE(gsf->gf_numsrc);
- msf = kmalloc(msize,GFP_KERNEL);
- if (msf == 0) {
- err = -ENOBUFS;
- goto mc_msf_out;
- }
- ifindex = gsf->gf_interface;
- psin = (struct sockaddr_in *)&gsf->gf_group;
- if (psin->sin_family != AF_INET) {
- err = -EADDRNOTAVAIL;
- goto mc_msf_out;
- }
- msf->imsf_multiaddr = psin->sin_addr.s_addr;
- msf->imsf_interface = 0;
- msf->imsf_fmode = gsf->gf_fmode;
- msf->imsf_numsrc = gsf->gf_numsrc;
- err = -EADDRNOTAVAIL;
- for (i=0; i<gsf->gf_numsrc; ++i) {
- psin = (struct sockaddr_in *)&gsf->gf_slist[i];
-
- if (psin->sin_family != AF_INET)
- goto mc_msf_out;
- msf->imsf_slist[i] = psin->sin_addr.s_addr;
- }
- kfree(gsf);
- gsf = NULL;
-
- err = ip_mc_msfilter(sk, msf, ifindex);
-mc_msf_out:
+ /* numsrc >= (1G-4) overflow in 32 bits */
+ if (msf->imsf_numsrc >= 0x3ffffffcU ||
+ msf->imsf_numsrc > READ_ONCE(net->ipv4.sysctl_igmp_max_msf)) {
kfree(msf);
- kfree(gsf);
+ err = -ENOBUFS;
break;
}
- case IP_ROUTER_ALERT:
- err = ip_ra_control(sk, val ? 1 : 0, NULL);
+ if (IP_MSFILTER_SIZE(msf->imsf_numsrc) > optlen) {
+ kfree(msf);
+ err = -EINVAL;
break;
+ }
+ err = ip_mc_msfilter(sk, msf, 0);
+ kfree(msf);
+ break;
+ }
+ case IP_BLOCK_SOURCE:
+ case IP_UNBLOCK_SOURCE:
+ case IP_ADD_SOURCE_MEMBERSHIP:
+ case IP_DROP_SOURCE_MEMBERSHIP:
+ {
+ struct ip_mreq_source mreqs;
+ int omode, add;
- case IP_FREEBIND:
- if (optlen<1)
- goto e_inval;
- inet->freebind = !!val;
- break;
-
- case IP_IPSEC_POLICY:
- case IP_XFRM_POLICY:
- err = -EPERM;
- if (!capable(CAP_NET_ADMIN))
- break;
- err = xfrm_user_policy(sk, optname, optval, optlen);
+ if (optlen != sizeof(struct ip_mreq_source))
+ goto e_inval;
+ if (copy_from_sockptr(&mreqs, optval, sizeof(mreqs))) {
+ err = -EFAULT;
break;
+ }
+ if (optname == IP_BLOCK_SOURCE) {
+ omode = MCAST_EXCLUDE;
+ add = 1;
+ } else if (optname == IP_UNBLOCK_SOURCE) {
+ omode = MCAST_EXCLUDE;
+ add = 0;
+ } else if (optname == IP_ADD_SOURCE_MEMBERSHIP) {
+ struct ip_mreqn mreq;
- default:
- err = -ENOPROTOOPT;
+ mreq.imr_multiaddr.s_addr = mreqs.imr_multiaddr;
+ mreq.imr_address.s_addr = mreqs.imr_interface;
+ mreq.imr_ifindex = 0;
+ err = ip_mc_join_group_ssm(sk, &mreq, MCAST_INCLUDE);
+ if (err && err != -EADDRINUSE)
+ break;
+ omode = MCAST_INCLUDE;
+ add = 1;
+ } else /* IP_DROP_SOURCE_MEMBERSHIP */ {
+ omode = MCAST_INCLUDE;
+ add = 0;
+ }
+ err = ip_mc_source(add, omode, sk, &mreqs, 0);
+ break;
+ }
+ case MCAST_JOIN_GROUP:
+ case MCAST_LEAVE_GROUP:
+ if (in_compat_syscall())
+ err = compat_ip_mcast_join_leave(sk, optname, optval,
+ optlen);
+ else
+ err = ip_mcast_join_leave(sk, optname, optval, optlen);
+ break;
+ case MCAST_JOIN_SOURCE_GROUP:
+ case MCAST_LEAVE_SOURCE_GROUP:
+ case MCAST_BLOCK_SOURCE:
+ case MCAST_UNBLOCK_SOURCE:
+ err = do_mcast_group_source(sk, optname, optval, optlen);
+ break;
+ case MCAST_MSFILTER:
+ if (in_compat_syscall())
+ err = compat_ip_set_mcast_msfilter(sk, optval, optlen);
+ else
+ err = ip_set_mcast_msfilter(sk, optval, optlen);
+ break;
+ case IP_IPSEC_POLICY:
+ case IP_XFRM_POLICY:
+ err = -EPERM;
+ if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
break;
+ err = xfrm_user_policy(sk, optname, optval, optlen);
+ break;
+
+ default:
+ err = -ENOPROTOOPT;
+ break;
}
- release_sock(sk);
+ sockopt_release_sock(sk);
+ if (needs_rtnl)
+ rtnl_unlock();
return err;
e_inval:
- release_sock(sk);
+ sockopt_release_sock(sk);
+ if (needs_rtnl)
+ rtnl_unlock();
return -EINVAL;
}
-int ip_setsockopt(struct sock *sk, int level,
- int optname, char __user *optval, int optlen)
+/**
+ * ipv4_pktinfo_prepare - transfer some info from rtable to skb
+ * @sk: socket
+ * @skb: buffer
+ * @drop_dst: if true, drops skb dst
+ *
+ * To support IP_CMSG_PKTINFO option, we store rt_iif and specific
+ * destination in skb->cb[] before dst drop.
+ * This way, receiver doesn't make cache line misses to read rtable.
+ */
+void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb, bool drop_dst)
+{
+ struct in_pktinfo *pktinfo = PKTINFO_SKB_CB(skb);
+ bool prepare = inet_test_bit(PKTINFO, sk) ||
+ ipv6_sk_rxinfo(sk);
+
+ if (prepare && skb_rtable(skb)) {
+ /* skb->cb is overloaded: prior to this point it is IP{6}CB
+ * which has interface index (iif) as the first member of the
+ * underlying inet{6}_skb_parm struct. This code then overlays
+ * PKTINFO_SKB_CB and in_pktinfo also has iif as the first
+ * element so the iif is picked up from the prior IPCB. If iif
+ * is the loopback interface, then return the sending interface
+ * (e.g., process binds socket to eth0 for Tx which is
+ * redirected to loopback in the rtable/dst).
+ */
+ struct rtable *rt = skb_rtable(skb);
+ bool l3slave = ipv4_l3mdev_skb(IPCB(skb)->flags);
+
+ if (pktinfo->ipi_ifindex == LOOPBACK_IFINDEX)
+ pktinfo->ipi_ifindex = inet_iif(skb);
+ else if (l3slave && rt && rt->rt_iif)
+ pktinfo->ipi_ifindex = rt->rt_iif;
+
+ pktinfo->ipi_spec_dst.s_addr = fib_compute_spec_dst(skb);
+ } else {
+ pktinfo->ipi_ifindex = 0;
+ pktinfo->ipi_spec_dst.s_addr = 0;
+ }
+ if (drop_dst)
+ skb_dst_drop(skb);
+}
+
+int ip_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
+ unsigned int optlen)
{
int err;
@@ -900,311 +1418,363 @@ int ip_setsockopt(struct sock *sk, int level,
#ifdef CONFIG_NETFILTER
/* we need to exclude all possible ENOPROTOOPTs except default case */
if (err == -ENOPROTOOPT && optname != IP_HDRINCL &&
- optname != IP_IPSEC_POLICY && optname != IP_XFRM_POLICY
-#ifdef CONFIG_IP_MROUTE
- && (optname < MRT_BASE || optname > (MRT_BASE + 10))
-#endif
- ) {
- lock_sock(sk);
+ optname != IP_IPSEC_POLICY &&
+ optname != IP_XFRM_POLICY &&
+ !ip_mroute_opt(optname))
err = nf_setsockopt(sk, PF_INET, optname, optval, optlen);
- release_sock(sk);
- }
#endif
return err;
}
+EXPORT_SYMBOL(ip_setsockopt);
+
+/*
+ * Get the options. Note for future reference. The GET of IP options gets
+ * the _received_ ones. The set sets the _sent_ ones.
+ */
+
+static bool getsockopt_needs_rtnl(int optname)
+{
+ switch (optname) {
+ case IP_MSFILTER:
+ case MCAST_MSFILTER:
+ return true;
+ }
+ return false;
+}
-#ifdef CONFIG_COMPAT
-int compat_ip_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int optlen)
+static int ip_get_mcast_msfilter(struct sock *sk, sockptr_t optval,
+ sockptr_t optlen, int len)
{
+ const int size0 = offsetof(struct group_filter, gf_slist_flex);
+ struct group_filter gsf;
+ int num, gsf_size;
int err;
- if (level != SOL_IP)
- return -ENOPROTOOPT;
+ if (len < size0)
+ return -EINVAL;
+ if (copy_from_sockptr(&gsf, optval, size0))
+ return -EFAULT;
- err = do_ip_setsockopt(sk, level, optname, optval, optlen);
-#ifdef CONFIG_NETFILTER
- /* we need to exclude all possible ENOPROTOOPTs except default case */
- if (err == -ENOPROTOOPT && optname != IP_HDRINCL &&
- optname != IP_IPSEC_POLICY && optname != IP_XFRM_POLICY
-#ifdef CONFIG_IP_MROUTE
- && (optname < MRT_BASE || optname > (MRT_BASE + 10))
-#endif
- ) {
- lock_sock(sk);
- err = compat_nf_setsockopt(sk, PF_INET, optname,
- optval, optlen);
- release_sock(sk);
- }
-#endif
- return err;
+ num = gsf.gf_numsrc;
+ err = ip_mc_gsfget(sk, &gsf, optval,
+ offsetof(struct group_filter, gf_slist_flex));
+ if (err)
+ return err;
+ if (gsf.gf_numsrc < num)
+ num = gsf.gf_numsrc;
+ gsf_size = GROUP_FILTER_SIZE(num);
+ if (copy_to_sockptr(optlen, &gsf_size, sizeof(int)) ||
+ copy_to_sockptr(optval, &gsf, size0))
+ return -EFAULT;
+ return 0;
}
-EXPORT_SYMBOL(compat_ip_setsockopt);
-#endif
+static int compat_ip_get_mcast_msfilter(struct sock *sk, sockptr_t optval,
+ sockptr_t optlen, int len)
+{
+ const int size0 = offsetof(struct compat_group_filter, gf_slist_flex);
+ struct compat_group_filter gf32;
+ struct group_filter gf;
+ int num;
+ int err;
-/*
- * Get the options. Note for future reference. The GET of IP options gets the
- * _received_ ones. The set sets the _sent_ ones.
- */
+ if (len < size0)
+ return -EINVAL;
+ if (copy_from_sockptr(&gf32, optval, size0))
+ return -EFAULT;
+
+ gf.gf_interface = gf32.gf_interface;
+ gf.gf_fmode = gf32.gf_fmode;
+ num = gf.gf_numsrc = gf32.gf_numsrc;
+ gf.gf_group = gf32.gf_group;
+
+ err = ip_mc_gsfget(sk, &gf, optval,
+ offsetof(struct compat_group_filter, gf_slist_flex));
+ if (err)
+ return err;
+ if (gf.gf_numsrc < num)
+ num = gf.gf_numsrc;
+ len = GROUP_FILTER_SIZE(num) - (sizeof(gf) - sizeof(gf32));
+ if (copy_to_sockptr(optlen, &len, sizeof(int)) ||
+ copy_to_sockptr_offset(optval, offsetof(struct compat_group_filter, gf_fmode),
+ &gf.gf_fmode, sizeof(gf.gf_fmode)) ||
+ copy_to_sockptr_offset(optval, offsetof(struct compat_group_filter, gf_numsrc),
+ &gf.gf_numsrc, sizeof(gf.gf_numsrc)))
+ return -EFAULT;
+ return 0;
+}
-static int do_ip_getsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int __user *optlen)
+int do_ip_getsockopt(struct sock *sk, int level, int optname,
+ sockptr_t optval, sockptr_t optlen)
{
struct inet_sock *inet = inet_sk(sk);
- int val;
+ bool needs_rtnl = getsockopt_needs_rtnl(optname);
+ int val, err = 0;
int len;
-
- if(level!=SOL_IP)
+
+ if (level != SOL_IP)
return -EOPNOTSUPP;
-#ifdef CONFIG_IP_MROUTE
- if(optname>=MRT_BASE && optname <=MRT_BASE+10)
- {
- return ip_mroute_getsockopt(sk,optname,optval,optlen);
- }
-#endif
+ if (ip_mroute_opt(optname))
+ return ip_mroute_getsockopt(sk, optname, optval, optlen);
- if(get_user(len,optlen))
+ if (copy_from_sockptr(&len, optlen, sizeof(int)))
return -EFAULT;
- if(len < 0)
+ if (len < 0)
return -EINVAL;
-
- lock_sock(sk);
-
- switch(optname) {
- case IP_OPTIONS:
- {
- unsigned char optbuf[sizeof(struct ip_options)+40];
- struct ip_options * opt = (struct ip_options*)optbuf;
- opt->optlen = 0;
- if (inet->opt)
- memcpy(optbuf, inet->opt,
- sizeof(struct ip_options)+
- inet->opt->optlen);
- release_sock(sk);
-
- if (opt->optlen == 0)
- return put_user(0, optlen);
-
- ip_options_undo(opt);
-
- len = min_t(unsigned int, len, opt->optlen);
- if(put_user(len, optlen))
- return -EFAULT;
- if(copy_to_user(optval, opt->__data, len))
- return -EFAULT;
- return 0;
- }
- case IP_PKTINFO:
- val = (inet->cmsg_flags & IP_CMSG_PKTINFO) != 0;
- break;
- case IP_RECVTTL:
- val = (inet->cmsg_flags & IP_CMSG_TTL) != 0;
- break;
- case IP_RECVTOS:
- val = (inet->cmsg_flags & IP_CMSG_TOS) != 0;
- break;
- case IP_RECVOPTS:
- val = (inet->cmsg_flags & IP_CMSG_RECVOPTS) != 0;
- break;
- case IP_RETOPTS:
- val = (inet->cmsg_flags & IP_CMSG_RETOPTS) != 0;
- break;
- case IP_PASSSEC:
- val = (inet->cmsg_flags & IP_CMSG_PASSSEC) != 0;
- break;
- case IP_TOS:
- val = inet->tos;
- break;
- case IP_TTL:
- val = (inet->uc_ttl == -1 ?
- sysctl_ip_default_ttl :
- inet->uc_ttl);
- break;
- case IP_HDRINCL:
- val = inet->hdrincl;
- break;
- case IP_MTU_DISCOVER:
- val = inet->pmtudisc;
- break;
- case IP_MTU:
- {
- struct dst_entry *dst;
- val = 0;
- dst = sk_dst_get(sk);
- if (dst) {
- val = dst_mtu(dst);
- dst_release(dst);
- }
- if (!val) {
- release_sock(sk);
- return -ENOTCONN;
- }
- break;
+
+ /* Handle options that can be read without locking the socket. */
+ switch (optname) {
+ case IP_PKTINFO:
+ val = inet_test_bit(PKTINFO, sk);
+ goto copyval;
+ case IP_RECVTTL:
+ val = inet_test_bit(TTL, sk);
+ goto copyval;
+ case IP_RECVTOS:
+ val = inet_test_bit(TOS, sk);
+ goto copyval;
+ case IP_RECVOPTS:
+ val = inet_test_bit(RECVOPTS, sk);
+ goto copyval;
+ case IP_RETOPTS:
+ val = inet_test_bit(RETOPTS, sk);
+ goto copyval;
+ case IP_PASSSEC:
+ val = inet_test_bit(PASSSEC, sk);
+ goto copyval;
+ case IP_RECVORIGDSTADDR:
+ val = inet_test_bit(ORIGDSTADDR, sk);
+ goto copyval;
+ case IP_CHECKSUM:
+ val = inet_test_bit(CHECKSUM, sk);
+ goto copyval;
+ case IP_RECVFRAGSIZE:
+ val = inet_test_bit(RECVFRAGSIZE, sk);
+ goto copyval;
+ case IP_RECVERR:
+ val = inet_test_bit(RECVERR, sk);
+ goto copyval;
+ case IP_RECVERR_RFC4884:
+ val = inet_test_bit(RECVERR_RFC4884, sk);
+ goto copyval;
+ case IP_FREEBIND:
+ val = inet_test_bit(FREEBIND, sk);
+ goto copyval;
+ case IP_HDRINCL:
+ val = inet_test_bit(HDRINCL, sk);
+ goto copyval;
+ case IP_MULTICAST_LOOP:
+ val = inet_test_bit(MC_LOOP, sk);
+ goto copyval;
+ case IP_MULTICAST_ALL:
+ val = inet_test_bit(MC_ALL, sk);
+ goto copyval;
+ case IP_TRANSPARENT:
+ val = inet_test_bit(TRANSPARENT, sk);
+ goto copyval;
+ case IP_NODEFRAG:
+ val = inet_test_bit(NODEFRAG, sk);
+ goto copyval;
+ case IP_BIND_ADDRESS_NO_PORT:
+ val = inet_test_bit(BIND_ADDRESS_NO_PORT, sk);
+ goto copyval;
+ case IP_ROUTER_ALERT:
+ val = inet_test_bit(RTALERT, sk);
+ goto copyval;
+ case IP_TTL:
+ val = READ_ONCE(inet->uc_ttl);
+ if (val < 0)
+ val = READ_ONCE(sock_net(sk)->ipv4.sysctl_ip_default_ttl);
+ goto copyval;
+ case IP_MINTTL:
+ val = READ_ONCE(inet->min_ttl);
+ goto copyval;
+ case IP_MULTICAST_TTL:
+ val = READ_ONCE(inet->mc_ttl);
+ goto copyval;
+ case IP_MTU_DISCOVER:
+ val = READ_ONCE(inet->pmtudisc);
+ goto copyval;
+ case IP_TOS:
+ val = READ_ONCE(inet->tos);
+ goto copyval;
+ case IP_OPTIONS:
+ {
+ unsigned char optbuf[sizeof(struct ip_options)+40];
+ struct ip_options *opt = (struct ip_options *)optbuf;
+ struct ip_options_rcu *inet_opt;
+
+ rcu_read_lock();
+ inet_opt = rcu_dereference(inet->inet_opt);
+ opt->optlen = 0;
+ if (inet_opt)
+ memcpy(optbuf, &inet_opt->opt,
+ sizeof(struct ip_options) +
+ inet_opt->opt.optlen);
+ rcu_read_unlock();
+
+ if (opt->optlen == 0) {
+ len = 0;
+ return copy_to_sockptr(optlen, &len, sizeof(int));
}
- case IP_RECVERR:
- val = inet->recverr;
- break;
- case IP_MULTICAST_TTL:
- val = inet->mc_ttl;
- break;
- case IP_MULTICAST_LOOP:
- val = inet->mc_loop;
- break;
- case IP_MULTICAST_IF:
- {
- struct in_addr addr;
- len = min_t(unsigned int, len, sizeof(struct in_addr));
- addr.s_addr = inet->mc_addr;
- release_sock(sk);
-
- if(put_user(len, optlen))
- return -EFAULT;
- if(copy_to_user(optval, &addr, len))
- return -EFAULT;
- return 0;
+
+ ip_options_undo(opt);
+
+ len = min_t(unsigned int, len, opt->optlen);
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
+ return -EFAULT;
+ if (copy_to_sockptr(optval, opt->__data, len))
+ return -EFAULT;
+ return 0;
+ }
+ case IP_MTU:
+ {
+ struct dst_entry *dst;
+ val = 0;
+ dst = sk_dst_get(sk);
+ if (dst) {
+ val = dst_mtu(dst);
+ dst_release(dst);
}
- case IP_MSFILTER:
- {
- struct ip_msfilter msf;
- int err;
+ if (!val)
+ return -ENOTCONN;
+ goto copyval;
+ }
+ case IP_PKTOPTIONS:
+ {
+ struct msghdr msg;
- if (len < IP_MSFILTER_SIZE(0)) {
- release_sock(sk);
- return -EINVAL;
- }
- if (copy_from_user(&msf, optval, IP_MSFILTER_SIZE(0))) {
- release_sock(sk);
- return -EFAULT;
- }
- err = ip_mc_msfget(sk, &msf,
- (struct ip_msfilter __user *)optval, optlen);
- release_sock(sk);
- return err;
+ if (sk->sk_type != SOCK_STREAM)
+ return -ENOPROTOOPT;
+
+ if (optval.is_kernel) {
+ msg.msg_control_is_user = false;
+ msg.msg_control = optval.kernel;
+ } else {
+ msg.msg_control_is_user = true;
+ msg.msg_control_user = optval.user;
}
- case MCAST_MSFILTER:
- {
- struct group_filter gsf;
- int err;
+ msg.msg_controllen = len;
+ msg.msg_flags = in_compat_syscall() ? MSG_CMSG_COMPAT : 0;
- if (len < GROUP_FILTER_SIZE(0)) {
- release_sock(sk);
- return -EINVAL;
- }
- if (copy_from_user(&gsf, optval, GROUP_FILTER_SIZE(0))) {
- release_sock(sk);
- return -EFAULT;
- }
- err = ip_mc_gsfget(sk, &gsf,
- (struct group_filter __user *)optval, optlen);
- release_sock(sk);
- return err;
+ if (inet_test_bit(PKTINFO, sk)) {
+ struct in_pktinfo info;
+
+ info.ipi_addr.s_addr = READ_ONCE(inet->inet_rcv_saddr);
+ info.ipi_spec_dst.s_addr = READ_ONCE(inet->inet_rcv_saddr);
+ info.ipi_ifindex = READ_ONCE(inet->mc_index);
+ put_cmsg(&msg, SOL_IP, IP_PKTINFO, sizeof(info), &info);
}
- case IP_PKTOPTIONS:
- {
- struct msghdr msg;
+ if (inet_test_bit(TTL, sk)) {
+ int hlim = READ_ONCE(inet->mc_ttl);
- release_sock(sk);
+ put_cmsg(&msg, SOL_IP, IP_TTL, sizeof(hlim), &hlim);
+ }
+ if (inet_test_bit(TOS, sk)) {
+ int tos = READ_ONCE(inet->rcv_tos);
+ put_cmsg(&msg, SOL_IP, IP_TOS, sizeof(tos), &tos);
+ }
+ len -= msg.msg_controllen;
+ return copy_to_sockptr(optlen, &len, sizeof(int));
+ }
+ case IP_UNICAST_IF:
+ val = (__force int)htonl((__u32) READ_ONCE(inet->uc_index));
+ goto copyval;
+ case IP_MULTICAST_IF:
+ {
+ struct in_addr addr;
+ len = min_t(unsigned int, len, sizeof(struct in_addr));
+ addr.s_addr = READ_ONCE(inet->mc_addr);
- if (sk->sk_type != SOCK_STREAM)
- return -ENOPROTOOPT;
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
+ return -EFAULT;
+ if (copy_to_sockptr(optval, &addr, len))
+ return -EFAULT;
+ return 0;
+ }
+ case IP_LOCAL_PORT_RANGE:
+ val = READ_ONCE(inet->local_port_range);
+ goto copyval;
+ }
- msg.msg_control = optval;
- msg.msg_controllen = len;
- msg.msg_flags = 0;
+ if (needs_rtnl)
+ rtnl_lock();
+ sockopt_lock_sock(sk);
- if (inet->cmsg_flags & IP_CMSG_PKTINFO) {
- struct in_pktinfo info;
+ switch (optname) {
+ case IP_MSFILTER:
+ {
+ struct ip_msfilter msf;
- info.ipi_addr.s_addr = inet->rcv_saddr;
- info.ipi_spec_dst.s_addr = inet->rcv_saddr;
- info.ipi_ifindex = inet->mc_index;
- put_cmsg(&msg, SOL_IP, IP_PKTINFO, sizeof(info), &info);
- }
- if (inet->cmsg_flags & IP_CMSG_TTL) {
- int hlim = inet->mc_ttl;
- put_cmsg(&msg, SOL_IP, IP_TTL, sizeof(hlim), &hlim);
- }
- len -= msg.msg_controllen;
- return put_user(len, optlen);
+ if (len < IP_MSFILTER_SIZE(0)) {
+ err = -EINVAL;
+ goto out;
}
- case IP_FREEBIND:
- val = inet->freebind;
- break;
- default:
- release_sock(sk);
- return -ENOPROTOOPT;
+ if (copy_from_sockptr(&msf, optval, IP_MSFILTER_SIZE(0))) {
+ err = -EFAULT;
+ goto out;
+ }
+ err = ip_mc_msfget(sk, &msf, optval, optlen);
+ goto out;
+ }
+ case MCAST_MSFILTER:
+ if (in_compat_syscall())
+ err = compat_ip_get_mcast_msfilter(sk, optval, optlen,
+ len);
+ else
+ err = ip_get_mcast_msfilter(sk, optval, optlen, len);
+ goto out;
+ case IP_PROTOCOL:
+ val = inet_sk(sk)->inet_num;
+ break;
+ default:
+ sockopt_release_sock(sk);
+ return -ENOPROTOOPT;
}
- release_sock(sk);
-
- if (len < sizeof(int) && len > 0 && val>=0 && val<255) {
+ sockopt_release_sock(sk);
+copyval:
+ if (len < sizeof(int) && len > 0 && val >= 0 && val <= 255) {
unsigned char ucval = (unsigned char)val;
len = 1;
- if(put_user(len, optlen))
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
return -EFAULT;
- if(copy_to_user(optval,&ucval,1))
+ if (copy_to_sockptr(optval, &ucval, 1))
return -EFAULT;
} else {
len = min_t(unsigned int, sizeof(int), len);
- if(put_user(len, optlen))
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
return -EFAULT;
- if(copy_to_user(optval,&val,len))
+ if (copy_to_sockptr(optval, &val, len))
return -EFAULT;
}
return 0;
+
+out:
+ sockopt_release_sock(sk);
+ if (needs_rtnl)
+ rtnl_unlock();
+ return err;
}
int ip_getsockopt(struct sock *sk, int level,
- int optname, char __user *optval, int __user *optlen)
+ int optname, char __user *optval, int __user *optlen)
{
int err;
- err = do_ip_getsockopt(sk, level, optname, optval, optlen);
-#ifdef CONFIG_NETFILTER
- /* we need to exclude all possible ENOPROTOOPTs except default case */
- if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS
-#ifdef CONFIG_IP_MROUTE
- && (optname < MRT_BASE || optname > MRT_BASE+10)
-#endif
- ) {
- int len;
-
- if(get_user(len,optlen))
- return -EFAULT;
-
- lock_sock(sk);
- err = nf_getsockopt(sk, PF_INET, optname, optval,
- &len);
- release_sock(sk);
- if (err >= 0)
- err = put_user(len, optlen);
- return err;
- }
-#endif
- return err;
-}
+ err = do_ip_getsockopt(sk, level, optname,
+ USER_SOCKPTR(optval), USER_SOCKPTR(optlen));
-#ifdef CONFIG_COMPAT
-int compat_ip_getsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int __user *optlen)
-{
- int err = do_ip_getsockopt(sk, level, optname, optval, optlen);
#ifdef CONFIG_NETFILTER
/* we need to exclude all possible ENOPROTOOPTs except default case */
- if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS
-#ifdef CONFIG_IP_MROUTE
- && (optname < MRT_BASE || optname > MRT_BASE+10)
-#endif
- ) {
- int len;
+ if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS &&
+ !ip_mroute_opt(optname)) {
+ int len;
if (get_user(len, optlen))
return -EFAULT;
- lock_sock(sk);
- err = compat_nf_getsockopt(sk, PF_INET, optname, optval, &len);
- release_sock(sk);
+ err = nf_getsockopt(sk, PF_INET, optname, optval, &len);
if (err >= 0)
err = put_user(len, optlen);
return err;
@@ -1212,11 +1782,4 @@ int compat_ip_getsockopt(struct sock *sk, int level, int optname,
#endif
return err;
}
-
-EXPORT_SYMBOL(compat_ip_getsockopt);
-#endif
-
-EXPORT_SYMBOL(ip_cmsg_recv);
-
EXPORT_SYMBOL(ip_getsockopt);
-EXPORT_SYMBOL(ip_setsockopt);
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
new file mode 100644
index 000000000000..158a30ae7c5f
--- /dev/null
+++ b/net/ipv4/ip_tunnel.c
@@ -0,0 +1,1340 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2013 Nicira, Inc.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/in.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/if_arp.h>
+#include <linux/init.h>
+#include <linux/in6.h>
+#include <linux/inetdevice.h>
+#include <linux/igmp.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/etherdevice.h>
+#include <linux/if_ether.h>
+#include <linux/if_vlan.h>
+#include <linux/rculist.h>
+#include <linux/err.h>
+
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/protocol.h>
+#include <net/ip_tunnels.h>
+#include <net/arp.h>
+#include <net/checksum.h>
+#include <net/dsfield.h>
+#include <net/inet_ecn.h>
+#include <net/xfrm.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/netdev_lock.h>
+#include <net/rtnetlink.h>
+#include <net/udp.h>
+#include <net/dst_metadata.h>
+#include <net/inet_dscp.h>
+
+#if IS_ENABLED(CONFIG_IPV6)
+#include <net/ipv6.h>
+#include <net/ip6_fib.h>
+#include <net/ip6_route.h>
+#endif
+
+static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
+{
+ return hash_32((__force u32)key ^ (__force u32)remote,
+ IP_TNL_HASH_BITS);
+}
+
+static bool ip_tunnel_key_match(const struct ip_tunnel_parm_kern *p,
+ const unsigned long *flags, __be32 key)
+{
+ if (!test_bit(IP_TUNNEL_KEY_BIT, flags))
+ return !test_bit(IP_TUNNEL_KEY_BIT, p->i_flags);
+
+ return test_bit(IP_TUNNEL_KEY_BIT, p->i_flags) && p->i_key == key;
+}
+
+/* Fallback tunnel: no source, no destination, no key, no options
+
+ Tunnel hash table:
+ We require exact key match i.e. if a key is present in packet
+ it will match only tunnel with the same key; if it is not present,
+ it will match only keyless tunnel.
+
+ All keysless packets, if not matched configured keyless tunnels
+ will match fallback tunnel.
+ Given src, dst and key, find appropriate for input tunnel.
+*/
+struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
+ int link, const unsigned long *flags,
+ __be32 remote, __be32 local,
+ __be32 key)
+{
+ struct ip_tunnel *t, *cand = NULL;
+ struct hlist_head *head;
+ struct net_device *ndev;
+ unsigned int hash;
+
+ hash = ip_tunnel_hash(key, remote);
+ head = &itn->tunnels[hash];
+
+ hlist_for_each_entry_rcu(t, head, hash_node) {
+ if (local != t->parms.iph.saddr ||
+ remote != t->parms.iph.daddr ||
+ !(t->dev->flags & IFF_UP))
+ continue;
+
+ if (!ip_tunnel_key_match(&t->parms, flags, key))
+ continue;
+
+ if (READ_ONCE(t->parms.link) == link)
+ return t;
+ cand = t;
+ }
+
+ hlist_for_each_entry_rcu(t, head, hash_node) {
+ if (remote != t->parms.iph.daddr ||
+ t->parms.iph.saddr != 0 ||
+ !(t->dev->flags & IFF_UP))
+ continue;
+
+ if (!ip_tunnel_key_match(&t->parms, flags, key))
+ continue;
+
+ if (READ_ONCE(t->parms.link) == link)
+ return t;
+ if (!cand)
+ cand = t;
+ }
+
+ hash = ip_tunnel_hash(key, 0);
+ head = &itn->tunnels[hash];
+
+ hlist_for_each_entry_rcu(t, head, hash_node) {
+ if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
+ (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
+ continue;
+
+ if (!(t->dev->flags & IFF_UP))
+ continue;
+
+ if (!ip_tunnel_key_match(&t->parms, flags, key))
+ continue;
+
+ if (READ_ONCE(t->parms.link) == link)
+ return t;
+ if (!cand)
+ cand = t;
+ }
+
+ hlist_for_each_entry_rcu(t, head, hash_node) {
+ if ((!test_bit(IP_TUNNEL_NO_KEY_BIT, flags) &&
+ t->parms.i_key != key) ||
+ t->parms.iph.saddr != 0 ||
+ t->parms.iph.daddr != 0 ||
+ !(t->dev->flags & IFF_UP))
+ continue;
+
+ if (READ_ONCE(t->parms.link) == link)
+ return t;
+ if (!cand)
+ cand = t;
+ }
+
+ if (cand)
+ return cand;
+
+ t = rcu_dereference(itn->collect_md_tun);
+ if (t && t->dev->flags & IFF_UP)
+ return t;
+
+ ndev = READ_ONCE(itn->fb_tunnel_dev);
+ if (ndev && ndev->flags & IFF_UP)
+ return netdev_priv(ndev);
+
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
+
+static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
+ struct ip_tunnel_parm_kern *parms)
+{
+ unsigned int h;
+ __be32 remote;
+ __be32 i_key = parms->i_key;
+
+ if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
+ remote = parms->iph.daddr;
+ else
+ remote = 0;
+
+ if (!test_bit(IP_TUNNEL_KEY_BIT, parms->i_flags) &&
+ test_bit(IP_TUNNEL_VTI_BIT, parms->i_flags))
+ i_key = 0;
+
+ h = ip_tunnel_hash(i_key, remote);
+ return &itn->tunnels[h];
+}
+
+static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
+{
+ struct hlist_head *head = ip_bucket(itn, &t->parms);
+
+ if (t->collect_md)
+ rcu_assign_pointer(itn->collect_md_tun, t);
+ hlist_add_head_rcu(&t->hash_node, head);
+}
+
+static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
+{
+ if (t->collect_md)
+ rcu_assign_pointer(itn->collect_md_tun, NULL);
+ hlist_del_init_rcu(&t->hash_node);
+}
+
+static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
+ struct ip_tunnel_parm_kern *parms,
+ int type)
+{
+ __be32 remote = parms->iph.daddr;
+ __be32 local = parms->iph.saddr;
+ IP_TUNNEL_DECLARE_FLAGS(flags);
+ __be32 key = parms->i_key;
+ int link = parms->link;
+ struct ip_tunnel *t = NULL;
+ struct hlist_head *head = ip_bucket(itn, parms);
+
+ ip_tunnel_flags_copy(flags, parms->i_flags);
+
+ hlist_for_each_entry_rcu(t, head, hash_node, lockdep_rtnl_is_held()) {
+ if (local == t->parms.iph.saddr &&
+ remote == t->parms.iph.daddr &&
+ link == READ_ONCE(t->parms.link) &&
+ type == t->dev->type &&
+ ip_tunnel_key_match(&t->parms, flags, key))
+ break;
+ }
+ return t;
+}
+
+static struct net_device *__ip_tunnel_create(struct net *net,
+ const struct rtnl_link_ops *ops,
+ struct ip_tunnel_parm_kern *parms)
+{
+ int err;
+ struct ip_tunnel *tunnel;
+ struct net_device *dev;
+ char name[IFNAMSIZ];
+
+ err = -E2BIG;
+ if (parms->name[0]) {
+ if (!dev_valid_name(parms->name))
+ goto failed;
+ strscpy(name, parms->name);
+ } else {
+ if (strlen(ops->kind) > (IFNAMSIZ - 3))
+ goto failed;
+ strscpy(name, ops->kind);
+ strcat(name, "%d");
+ }
+
+ ASSERT_RTNL();
+ dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
+ if (!dev) {
+ err = -ENOMEM;
+ goto failed;
+ }
+ dev_net_set(dev, net);
+
+ dev->rtnl_link_ops = ops;
+
+ tunnel = netdev_priv(dev);
+ tunnel->parms = *parms;
+ tunnel->net = net;
+
+ err = register_netdevice(dev);
+ if (err)
+ goto failed_free;
+
+ return dev;
+
+failed_free:
+ free_netdev(dev);
+failed:
+ return ERR_PTR(err);
+}
+
+static int ip_tunnel_bind_dev(struct net_device *dev)
+{
+ struct net_device *tdev = NULL;
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ const struct iphdr *iph;
+ int hlen = LL_MAX_HEADER;
+ int mtu = ETH_DATA_LEN;
+ int t_hlen = tunnel->hlen + sizeof(struct iphdr);
+
+ iph = &tunnel->parms.iph;
+
+ /* Guess output device to choose reasonable mtu and needed_headroom */
+ if (iph->daddr) {
+ struct flowi4 fl4;
+ struct rtable *rt;
+
+ ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr,
+ iph->saddr, tunnel->parms.o_key,
+ iph->tos & INET_DSCP_MASK, tunnel->net,
+ tunnel->parms.link, tunnel->fwmark, 0, 0);
+ rt = ip_route_output_key(tunnel->net, &fl4);
+
+ if (!IS_ERR(rt)) {
+ tdev = rt->dst.dev;
+ ip_rt_put(rt);
+ }
+ if (dev->type != ARPHRD_ETHER)
+ dev->flags |= IFF_POINTOPOINT;
+
+ dst_cache_reset(&tunnel->dst_cache);
+ }
+
+ if (!tdev && tunnel->parms.link)
+ tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);
+
+ if (tdev) {
+ hlen = tdev->hard_header_len + tdev->needed_headroom;
+ mtu = min(tdev->mtu, IP_MAX_MTU);
+ }
+
+ dev->needed_headroom = t_hlen + hlen;
+ mtu -= t_hlen + (dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0);
+
+ if (mtu < IPV4_MIN_MTU)
+ mtu = IPV4_MIN_MTU;
+
+ return mtu;
+}
+
+static struct ip_tunnel *ip_tunnel_create(struct net *net,
+ struct ip_tunnel_net *itn,
+ struct ip_tunnel_parm_kern *parms)
+{
+ struct ip_tunnel *nt;
+ struct net_device *dev;
+ int t_hlen;
+ int mtu;
+ int err;
+
+ dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms);
+ if (IS_ERR(dev))
+ return ERR_CAST(dev);
+
+ mtu = ip_tunnel_bind_dev(dev);
+ err = dev_set_mtu(dev, mtu);
+ if (err)
+ goto err_dev_set_mtu;
+
+ nt = netdev_priv(dev);
+ t_hlen = nt->hlen + sizeof(struct iphdr);
+ dev->min_mtu = ETH_MIN_MTU;
+ dev->max_mtu = IP_MAX_MTU - t_hlen;
+ if (dev->type == ARPHRD_ETHER)
+ dev->max_mtu -= dev->hard_header_len;
+
+ ip_tunnel_add(itn, nt);
+ return nt;
+
+err_dev_set_mtu:
+ unregister_netdevice(dev);
+ return ERR_PTR(err);
+}
+
+void ip_tunnel_md_udp_encap(struct sk_buff *skb, struct ip_tunnel_info *info)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+ const struct udphdr *udph;
+
+ if (iph->protocol != IPPROTO_UDP)
+ return;
+
+ udph = (struct udphdr *)((__u8 *)iph + (iph->ihl << 2));
+ info->encap.sport = udph->source;
+ info->encap.dport = udph->dest;
+}
+EXPORT_SYMBOL(ip_tunnel_md_udp_encap);
+
+int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
+ const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
+ bool log_ecn_error)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+ int nh, err;
+
+#ifdef CONFIG_NET_IPGRE_BROADCAST
+ if (ipv4_is_multicast(iph->daddr)) {
+ DEV_STATS_INC(tunnel->dev, multicast);
+ skb->pkt_type = PACKET_BROADCAST;
+ }
+#endif
+
+ if (test_bit(IP_TUNNEL_CSUM_BIT, tunnel->parms.i_flags) !=
+ test_bit(IP_TUNNEL_CSUM_BIT, tpi->flags)) {
+ DEV_STATS_INC(tunnel->dev, rx_crc_errors);
+ DEV_STATS_INC(tunnel->dev, rx_errors);
+ goto drop;
+ }
+
+ if (test_bit(IP_TUNNEL_SEQ_BIT, tunnel->parms.i_flags)) {
+ if (!test_bit(IP_TUNNEL_SEQ_BIT, tpi->flags) ||
+ (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
+ DEV_STATS_INC(tunnel->dev, rx_fifo_errors);
+ DEV_STATS_INC(tunnel->dev, rx_errors);
+ goto drop;
+ }
+ tunnel->i_seqno = ntohl(tpi->seq) + 1;
+ }
+
+ /* Save offset of outer header relative to skb->head,
+ * because we are going to reset the network header to the inner header
+ * and might change skb->head.
+ */
+ nh = skb_network_header(skb) - skb->head;
+
+ skb_set_network_header(skb, (tunnel->dev->type == ARPHRD_ETHER) ? ETH_HLEN : 0);
+
+ if (!pskb_inet_may_pull(skb)) {
+ DEV_STATS_INC(tunnel->dev, rx_length_errors);
+ DEV_STATS_INC(tunnel->dev, rx_errors);
+ goto drop;
+ }
+ iph = (struct iphdr *)(skb->head + nh);
+
+ err = IP_ECN_decapsulate(iph, skb);
+ if (unlikely(err)) {
+ if (log_ecn_error)
+ net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
+ &iph->saddr, iph->tos);
+ if (err > 1) {
+ DEV_STATS_INC(tunnel->dev, rx_frame_errors);
+ DEV_STATS_INC(tunnel->dev, rx_errors);
+ goto drop;
+ }
+ }
+
+ dev_sw_netstats_rx_add(tunnel->dev, skb->len);
+ skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));
+
+ if (tunnel->dev->type == ARPHRD_ETHER) {
+ skb->protocol = eth_type_trans(skb, tunnel->dev);
+ skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
+ } else {
+ skb->dev = tunnel->dev;
+ }
+
+ if (tun_dst)
+ skb_dst_set(skb, (struct dst_entry *)tun_dst);
+
+ gro_cells_receive(&tunnel->gro_cells, skb);
+ return 0;
+
+drop:
+ if (tun_dst)
+ dst_release((struct dst_entry *)tun_dst);
+ kfree_skb(skb);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
+
+int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
+ unsigned int num)
+{
+ if (num >= MAX_IPTUN_ENCAP_OPS)
+ return -ERANGE;
+
+ return !cmpxchg((const struct ip_tunnel_encap_ops **)
+ &iptun_encaps[num],
+ NULL, ops) ? 0 : -1;
+}
+EXPORT_SYMBOL(ip_tunnel_encap_add_ops);
+
+int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
+ unsigned int num)
+{
+ int ret;
+
+ if (num >= MAX_IPTUN_ENCAP_OPS)
+ return -ERANGE;
+
+ ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
+ &iptun_encaps[num],
+ ops, NULL) == ops) ? 0 : -1;
+
+ synchronize_net();
+
+ return ret;
+}
+EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
+
+int ip_tunnel_encap_setup(struct ip_tunnel *t,
+ struct ip_tunnel_encap *ipencap)
+{
+ int hlen;
+
+ memset(&t->encap, 0, sizeof(t->encap));
+
+ hlen = ip_encap_hlen(ipencap);
+ if (hlen < 0)
+ return hlen;
+
+ t->encap.type = ipencap->type;
+ t->encap.sport = ipencap->sport;
+ t->encap.dport = ipencap->dport;
+ t->encap.flags = ipencap->flags;
+
+ t->encap_hlen = hlen;
+ t->hlen = t->encap_hlen + t->tun_hlen;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
+
+static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
+ struct rtable *rt, __be16 df,
+ const struct iphdr *inner_iph,
+ int tunnel_hlen, __be32 dst, bool md)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ int pkt_size;
+ int mtu;
+
+ tunnel_hlen = md ? tunnel_hlen : tunnel->hlen;
+ pkt_size = skb->len - tunnel_hlen;
+ pkt_size -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0;
+
+ if (df) {
+ mtu = dst_mtu(&rt->dst) - (sizeof(struct iphdr) + tunnel_hlen);
+ mtu -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0;
+ } else {
+ mtu = skb_valid_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
+ }
+
+ if (skb_valid_dst(skb))
+ skb_dst_update_pmtu_no_confirm(skb, mtu);
+
+ if (skb->protocol == htons(ETH_P_IP)) {
+ if (!skb_is_gso(skb) &&
+ (inner_iph->frag_off & htons(IP_DF)) &&
+ mtu < pkt_size) {
+ icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
+ return -E2BIG;
+ }
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (skb->protocol == htons(ETH_P_IPV6)) {
+ struct rt6_info *rt6;
+ __be32 daddr;
+
+ rt6 = skb_valid_dst(skb) ? dst_rt6_info(skb_dst(skb)) :
+ NULL;
+ daddr = md ? dst : tunnel->parms.iph.daddr;
+
+ if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
+ mtu >= IPV6_MIN_MTU) {
+ if ((daddr && !ipv4_is_multicast(daddr)) ||
+ rt6->rt6i_dst.plen == 128) {
+ rt6->rt6i_flags |= RTF_MODIFIED;
+ dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
+ }
+ }
+
+ if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
+ mtu < pkt_size) {
+ icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+ return -E2BIG;
+ }
+ }
+#endif
+ return 0;
+}
+
+void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
+ u8 proto, int tunnel_hlen)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ u32 headroom = sizeof(struct iphdr);
+ struct ip_tunnel_info *tun_info;
+ const struct ip_tunnel_key *key;
+ const struct iphdr *inner_iph;
+ struct rtable *rt = NULL;
+ struct flowi4 fl4;
+ __be16 df = 0;
+ u8 tos, ttl;
+ bool use_cache;
+
+ tun_info = skb_tunnel_info(skb);
+ if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
+ ip_tunnel_info_af(tun_info) != AF_INET))
+ goto tx_error;
+ key = &tun_info->key;
+ memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+ inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
+ tos = key->tos;
+ if (tos == 1) {
+ if (skb->protocol == htons(ETH_P_IP))
+ tos = inner_iph->tos;
+ else if (skb->protocol == htons(ETH_P_IPV6))
+ tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
+ }
+ ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src,
+ tunnel_id_to_key32(key->tun_id),
+ tos & INET_DSCP_MASK, tunnel->net, 0, skb->mark,
+ skb_get_hash(skb), key->flow_flags);
+
+ if (!tunnel_hlen)
+ tunnel_hlen = ip_encap_hlen(&tun_info->encap);
+
+ if (ip_tunnel_encap(skb, &tun_info->encap, &proto, &fl4) < 0)
+ goto tx_error;
+
+ use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
+ if (use_cache)
+ rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl4.saddr);
+ if (!rt) {
+ rt = ip_route_output_key(tunnel->net, &fl4);
+ if (IS_ERR(rt)) {
+ DEV_STATS_INC(dev, tx_carrier_errors);
+ goto tx_error;
+ }
+ if (use_cache)
+ dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
+ fl4.saddr);
+ }
+ if (rt->dst.dev == dev) {
+ ip_rt_put(rt);
+ DEV_STATS_INC(dev, collisions);
+ goto tx_error;
+ }
+
+ if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, key->tun_flags))
+ df = htons(IP_DF);
+ if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, tunnel_hlen,
+ key->u.ipv4.dst, true)) {
+ ip_rt_put(rt);
+ goto tx_error;
+ }
+
+ tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
+ ttl = key->ttl;
+ if (ttl == 0) {
+ if (skb->protocol == htons(ETH_P_IP))
+ ttl = inner_iph->ttl;
+ else if (skb->protocol == htons(ETH_P_IPV6))
+ ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
+ else
+ ttl = ip4_dst_hoplimit(&rt->dst);
+ }
+
+ headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
+ if (skb_cow_head(skb, headroom)) {
+ ip_rt_put(rt);
+ goto tx_dropped;
+ }
+
+ ip_tunnel_adj_headroom(dev, headroom);
+
+ iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
+ df, !net_eq(tunnel->net, dev_net(dev)), 0);
+ return;
+tx_error:
+ DEV_STATS_INC(dev, tx_errors);
+ goto kfree;
+tx_dropped:
+ DEV_STATS_INC(dev, tx_dropped);
+kfree:
+ kfree_skb(skb);
+}
+EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);
+
+void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
+ const struct iphdr *tnl_params, u8 protocol)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ struct ip_tunnel_info *tun_info = NULL;
+ const struct iphdr *inner_iph;
+ unsigned int max_headroom; /* The extra header space needed */
+ struct rtable *rt = NULL; /* Route to the other host */
+ __be16 payload_protocol;
+ bool use_cache = false;
+ struct flowi4 fl4;
+ bool md = false;
+ bool connected;
+ u8 tos, ttl;
+ __be32 dst;
+ __be16 df;
+
+ inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
+ connected = (tunnel->parms.iph.daddr != 0);
+ payload_protocol = skb_protocol(skb, true);
+
+ memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+
+ dst = tnl_params->daddr;
+ if (dst == 0) {
+ /* NBMA tunnel */
+
+ if (!skb_dst(skb)) {
+ DEV_STATS_INC(dev, tx_fifo_errors);
+ goto tx_error;
+ }
+
+ tun_info = skb_tunnel_info(skb);
+ if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX) &&
+ ip_tunnel_info_af(tun_info) == AF_INET &&
+ tun_info->key.u.ipv4.dst) {
+ dst = tun_info->key.u.ipv4.dst;
+ md = true;
+ connected = true;
+ } else if (payload_protocol == htons(ETH_P_IP)) {
+ rt = skb_rtable(skb);
+ dst = rt_nexthop(rt, inner_iph->daddr);
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (payload_protocol == htons(ETH_P_IPV6)) {
+ const struct in6_addr *addr6;
+ struct neighbour *neigh;
+ bool do_tx_error_icmp;
+ int addr_type;
+
+ neigh = dst_neigh_lookup(skb_dst(skb),
+ &ipv6_hdr(skb)->daddr);
+ if (!neigh)
+ goto tx_error;
+
+ addr6 = (const struct in6_addr *)&neigh->primary_key;
+ addr_type = ipv6_addr_type(addr6);
+
+ if (addr_type == IPV6_ADDR_ANY) {
+ addr6 = &ipv6_hdr(skb)->daddr;
+ addr_type = ipv6_addr_type(addr6);
+ }
+
+ if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
+ do_tx_error_icmp = true;
+ else {
+ do_tx_error_icmp = false;
+ dst = addr6->s6_addr32[3];
+ }
+ neigh_release(neigh);
+ if (do_tx_error_icmp)
+ goto tx_error_icmp;
+ }
+#endif
+ else
+ goto tx_error;
+
+ if (!md)
+ connected = false;
+ }
+
+ tos = tnl_params->tos;
+ if (tos & 0x1) {
+ tos &= ~0x1;
+ if (payload_protocol == htons(ETH_P_IP)) {
+ tos = inner_iph->tos;
+ connected = false;
+ } else if (payload_protocol == htons(ETH_P_IPV6)) {
+ tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
+ connected = false;
+ }
+ }
+
+ ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr,
+ tunnel->parms.o_key, tos & INET_DSCP_MASK,
+ tunnel->net, READ_ONCE(tunnel->parms.link),
+ tunnel->fwmark, skb_get_hash(skb), 0);
+
+ if (ip_tunnel_encap(skb, &tunnel->encap, &protocol, &fl4) < 0)
+ goto tx_error;
+
+ if (connected && md) {
+ use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
+ if (use_cache)
+ rt = dst_cache_get_ip4(&tun_info->dst_cache,
+ &fl4.saddr);
+ } else {
+ rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache,
+ &fl4.saddr) : NULL;
+ }
+
+ if (!rt) {
+ rt = ip_route_output_key(tunnel->net, &fl4);
+
+ if (IS_ERR(rt)) {
+ DEV_STATS_INC(dev, tx_carrier_errors);
+ goto tx_error;
+ }
+ if (use_cache)
+ dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
+ fl4.saddr);
+ else if (!md && connected)
+ dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
+ fl4.saddr);
+ }
+
+ if (rt->dst.dev == dev) {
+ ip_rt_put(rt);
+ DEV_STATS_INC(dev, collisions);
+ goto tx_error;
+ }
+
+ df = tnl_params->frag_off;
+ if (payload_protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
+ df |= (inner_iph->frag_off & htons(IP_DF));
+
+ if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, 0, 0, false)) {
+ ip_rt_put(rt);
+ goto tx_error;
+ }
+
+ if (tunnel->err_count > 0) {
+ if (time_before(jiffies,
+ tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
+ tunnel->err_count--;
+
+ dst_link_failure(skb);
+ } else
+ tunnel->err_count = 0;
+ }
+
+ tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
+ ttl = tnl_params->ttl;
+ if (ttl == 0) {
+ if (payload_protocol == htons(ETH_P_IP))
+ ttl = inner_iph->ttl;
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (payload_protocol == htons(ETH_P_IPV6))
+ ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
+#endif
+ else
+ ttl = ip4_dst_hoplimit(&rt->dst);
+ }
+
+ max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
+ + rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
+
+ if (skb_cow_head(skb, max_headroom)) {
+ ip_rt_put(rt);
+ DEV_STATS_INC(dev, tx_dropped);
+ kfree_skb(skb);
+ return;
+ }
+
+ ip_tunnel_adj_headroom(dev, max_headroom);
+
+ iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
+ df, !net_eq(tunnel->net, dev_net(dev)), 0);
+ return;
+
+#if IS_ENABLED(CONFIG_IPV6)
+tx_error_icmp:
+ dst_link_failure(skb);
+#endif
+tx_error:
+ DEV_STATS_INC(dev, tx_errors);
+ kfree_skb(skb);
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
+
+static void ip_tunnel_update(struct ip_tunnel_net *itn,
+ struct ip_tunnel *t,
+ struct net_device *dev,
+ struct ip_tunnel_parm_kern *p,
+ bool set_mtu,
+ __u32 fwmark)
+{
+ ip_tunnel_del(itn, t);
+ t->parms.iph.saddr = p->iph.saddr;
+ t->parms.iph.daddr = p->iph.daddr;
+ t->parms.i_key = p->i_key;
+ t->parms.o_key = p->o_key;
+ if (dev->type != ARPHRD_ETHER) {
+ __dev_addr_set(dev, &p->iph.saddr, 4);
+ memcpy(dev->broadcast, &p->iph.daddr, 4);
+ }
+ ip_tunnel_add(itn, t);
+
+ t->parms.iph.ttl = p->iph.ttl;
+ t->parms.iph.tos = p->iph.tos;
+ t->parms.iph.frag_off = p->iph.frag_off;
+
+ if (t->parms.link != p->link || t->fwmark != fwmark) {
+ int mtu;
+
+ WRITE_ONCE(t->parms.link, p->link);
+ t->fwmark = fwmark;
+ mtu = ip_tunnel_bind_dev(dev);
+ if (set_mtu)
+ WRITE_ONCE(dev->mtu, mtu);
+ }
+ dst_cache_reset(&t->dst_cache);
+ netdev_state_change(dev);
+}
+
+int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p,
+ int cmd)
+{
+ int err = 0;
+ struct ip_tunnel *t = netdev_priv(dev);
+ struct net *net = t->net;
+ struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);
+
+ switch (cmd) {
+ case SIOCGETTUNNEL:
+ if (dev == itn->fb_tunnel_dev) {
+ t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
+ if (!t)
+ t = netdev_priv(dev);
+ }
+ memcpy(p, &t->parms, sizeof(*p));
+ break;
+
+ case SIOCADDTUNNEL:
+ case SIOCCHGTUNNEL:
+ err = -EPERM;
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ goto done;
+ if (p->iph.ttl)
+ p->iph.frag_off |= htons(IP_DF);
+ if (!test_bit(IP_TUNNEL_VTI_BIT, p->i_flags)) {
+ if (!test_bit(IP_TUNNEL_KEY_BIT, p->i_flags))
+ p->i_key = 0;
+ if (!test_bit(IP_TUNNEL_KEY_BIT, p->o_flags))
+ p->o_key = 0;
+ }
+
+ t = ip_tunnel_find(itn, p, itn->type);
+
+ if (cmd == SIOCADDTUNNEL) {
+ if (!t) {
+ t = ip_tunnel_create(net, itn, p);
+ err = PTR_ERR_OR_ZERO(t);
+ break;
+ }
+
+ err = -EEXIST;
+ break;
+ }
+ if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
+ if (t) {
+ if (t->dev != dev) {
+ err = -EEXIST;
+ break;
+ }
+ } else {
+ unsigned int nflags = 0;
+
+ if (ipv4_is_multicast(p->iph.daddr))
+ nflags = IFF_BROADCAST;
+ else if (p->iph.daddr)
+ nflags = IFF_POINTOPOINT;
+
+ if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
+ err = -EINVAL;
+ break;
+ }
+
+ t = netdev_priv(dev);
+ }
+ }
+
+ if (t) {
+ err = 0;
+ ip_tunnel_update(itn, t, dev, p, true, 0);
+ } else {
+ err = -ENOENT;
+ }
+ break;
+
+ case SIOCDELTUNNEL:
+ err = -EPERM;
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ goto done;
+
+ if (dev == itn->fb_tunnel_dev) {
+ err = -ENOENT;
+ t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
+ if (!t)
+ goto done;
+ err = -EPERM;
+ if (t == netdev_priv(itn->fb_tunnel_dev))
+ goto done;
+ dev = t->dev;
+ }
+ unregister_netdevice(dev);
+ err = 0;
+ break;
+
+ default:
+ err = -EINVAL;
+ }
+
+done:
+ return err;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_ctl);
+
+bool ip_tunnel_parm_from_user(struct ip_tunnel_parm_kern *kp,
+ const void __user *data)
+{
+ struct ip_tunnel_parm p;
+
+ if (copy_from_user(&p, data, sizeof(p)))
+ return false;
+
+ strscpy(kp->name, p.name);
+ kp->link = p.link;
+ ip_tunnel_flags_from_be16(kp->i_flags, p.i_flags);
+ ip_tunnel_flags_from_be16(kp->o_flags, p.o_flags);
+ kp->i_key = p.i_key;
+ kp->o_key = p.o_key;
+ memcpy(&kp->iph, &p.iph, min(sizeof(kp->iph), sizeof(p.iph)));
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_parm_from_user);
+
+bool ip_tunnel_parm_to_user(void __user *data, struct ip_tunnel_parm_kern *kp)
+{
+ struct ip_tunnel_parm p;
+
+ if (!ip_tunnel_flags_is_be16_compat(kp->i_flags) ||
+ !ip_tunnel_flags_is_be16_compat(kp->o_flags))
+ return false;
+
+ memset(&p, 0, sizeof(p));
+
+ strscpy(p.name, kp->name);
+ p.link = kp->link;
+ p.i_flags = ip_tunnel_flags_to_be16(kp->i_flags);
+ p.o_flags = ip_tunnel_flags_to_be16(kp->o_flags);
+ p.i_key = kp->i_key;
+ p.o_key = kp->o_key;
+ memcpy(&p.iph, &kp->iph, min(sizeof(p.iph), sizeof(kp->iph)));
+
+ return !copy_to_user(data, &p, sizeof(p));
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_parm_to_user);
+
+int ip_tunnel_siocdevprivate(struct net_device *dev, struct ifreq *ifr,
+ void __user *data, int cmd)
+{
+ struct ip_tunnel_parm_kern p;
+ int err;
+
+ if (!ip_tunnel_parm_from_user(&p, data))
+ return -EFAULT;
+ err = dev->netdev_ops->ndo_tunnel_ctl(dev, &p, cmd);
+ if (!err && !ip_tunnel_parm_to_user(data, &p))
+ return -EFAULT;
+ return err;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_siocdevprivate);
+
+int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ int t_hlen = tunnel->hlen + sizeof(struct iphdr);
+ int max_mtu = IP_MAX_MTU - t_hlen;
+
+ if (dev->type == ARPHRD_ETHER)
+ max_mtu -= dev->hard_header_len;
+
+ if (new_mtu < ETH_MIN_MTU)
+ return -EINVAL;
+
+ if (new_mtu > max_mtu) {
+ if (strict)
+ return -EINVAL;
+
+ new_mtu = max_mtu;
+ }
+
+ WRITE_ONCE(dev->mtu, new_mtu);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);
+
+int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
+{
+ return __ip_tunnel_change_mtu(dev, new_mtu, true);
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
+
+static void ip_tunnel_dev_free(struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+
+ gro_cells_destroy(&tunnel->gro_cells);
+ dst_cache_destroy(&tunnel->dst_cache);
+}
+
+void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ struct ip_tunnel_net *itn;
+
+ itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
+
+ if (itn->fb_tunnel_dev != dev) {
+ ip_tunnel_del(itn, netdev_priv(dev));
+ unregister_netdevice_queue(dev, head);
+ }
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
+
+struct net *ip_tunnel_get_link_net(const struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+
+ return READ_ONCE(tunnel->net);
+}
+EXPORT_SYMBOL(ip_tunnel_get_link_net);
+
+int ip_tunnel_get_iflink(const struct net_device *dev)
+{
+ const struct ip_tunnel *tunnel = netdev_priv(dev);
+
+ return READ_ONCE(tunnel->parms.link);
+}
+EXPORT_SYMBOL(ip_tunnel_get_iflink);
+
+int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
+ struct rtnl_link_ops *ops, char *devname)
+{
+ struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
+ struct ip_tunnel_parm_kern parms;
+ unsigned int i;
+
+ itn->rtnl_link_ops = ops;
+ for (i = 0; i < IP_TNL_HASH_SIZE; i++)
+ INIT_HLIST_HEAD(&itn->tunnels[i]);
+
+ if (!ops || !net_has_fallback_tunnels(net)) {
+ struct ip_tunnel_net *it_init_net;
+
+ it_init_net = net_generic(&init_net, ip_tnl_net_id);
+ itn->type = it_init_net->type;
+ itn->fb_tunnel_dev = NULL;
+ return 0;
+ }
+
+ memset(&parms, 0, sizeof(parms));
+ if (devname)
+ strscpy(parms.name, devname, IFNAMSIZ);
+
+ rtnl_lock();
+ itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
+ /* FB netdevice is special: we have one, and only one per netns.
+ * Allowing to move it to another netns is clearly unsafe.
+ */
+ if (!IS_ERR(itn->fb_tunnel_dev)) {
+ itn->fb_tunnel_dev->netns_immutable = true;
+ itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
+ ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
+ itn->type = itn->fb_tunnel_dev->type;
+ }
+ rtnl_unlock();
+
+ return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
+
+void ip_tunnel_delete_net(struct net *net, unsigned int id,
+ struct rtnl_link_ops *ops,
+ struct list_head *head)
+{
+ struct ip_tunnel_net *itn = net_generic(net, id);
+ struct net_device *dev, *aux;
+ int h;
+
+ ASSERT_RTNL_NET(net);
+
+ for_each_netdev_safe(net, dev, aux)
+ if (dev->rtnl_link_ops == ops)
+ unregister_netdevice_queue(dev, head);
+
+ for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
+ struct ip_tunnel *t;
+ struct hlist_node *n;
+ struct hlist_head *thead = &itn->tunnels[h];
+
+ hlist_for_each_entry_safe(t, n, thead, hash_node)
+ /* If dev is in the same netns, it has already
+ * been added to the list by the previous loop.
+ */
+ if (!net_eq(dev_net(t->dev), net))
+ unregister_netdevice_queue(t->dev, head);
+ }
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);
+
+int ip_tunnel_newlink(struct net *net, struct net_device *dev,
+ struct nlattr *tb[], struct ip_tunnel_parm_kern *p,
+ __u32 fwmark)
+{
+ struct ip_tunnel *nt;
+ struct ip_tunnel_net *itn;
+ int mtu;
+ int err;
+
+ nt = netdev_priv(dev);
+ itn = net_generic(net, nt->ip_tnl_net_id);
+
+ if (nt->collect_md) {
+ if (rtnl_dereference(itn->collect_md_tun))
+ return -EEXIST;
+ } else {
+ if (ip_tunnel_find(itn, p, dev->type))
+ return -EEXIST;
+ }
+
+ nt->net = net;
+ nt->parms = *p;
+ nt->fwmark = fwmark;
+ err = register_netdevice(dev);
+ if (err)
+ goto err_register_netdevice;
+
+ if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
+ eth_hw_addr_random(dev);
+
+ mtu = ip_tunnel_bind_dev(dev);
+ if (tb[IFLA_MTU]) {
+ unsigned int max = IP_MAX_MTU - (nt->hlen + sizeof(struct iphdr));
+
+ if (dev->type == ARPHRD_ETHER)
+ max -= dev->hard_header_len;
+
+ mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU, max);
+ }
+
+ err = dev_set_mtu(dev, mtu);
+ if (err)
+ goto err_dev_set_mtu;
+
+ ip_tunnel_add(itn, nt);
+ return 0;
+
+err_dev_set_mtu:
+ unregister_netdevice(dev);
+err_register_netdevice:
+ return err;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
+
+int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
+ struct ip_tunnel_parm_kern *p, __u32 fwmark)
+{
+ struct ip_tunnel *t;
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ struct net *net = tunnel->net;
+ struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
+
+ if (dev == itn->fb_tunnel_dev)
+ return -EINVAL;
+
+ t = ip_tunnel_find(itn, p, dev->type);
+
+ if (t) {
+ if (t->dev != dev)
+ return -EEXIST;
+ } else {
+ t = tunnel;
+
+ if (dev->type != ARPHRD_ETHER) {
+ unsigned int nflags = 0;
+
+ if (ipv4_is_multicast(p->iph.daddr))
+ nflags = IFF_BROADCAST;
+ else if (p->iph.daddr)
+ nflags = IFF_POINTOPOINT;
+
+ if ((dev->flags ^ nflags) &
+ (IFF_POINTOPOINT | IFF_BROADCAST))
+ return -EINVAL;
+ }
+ }
+
+ ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
+
+int ip_tunnel_init(struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ struct iphdr *iph = &tunnel->parms.iph;
+ int err;
+
+ dev->needs_free_netdev = true;
+ dev->priv_destructor = ip_tunnel_dev_free;
+ dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS;
+
+ err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
+ if (err)
+ return err;
+
+ err = gro_cells_init(&tunnel->gro_cells, dev);
+ if (err) {
+ dst_cache_destroy(&tunnel->dst_cache);
+ return err;
+ }
+
+ tunnel->dev = dev;
+ strscpy(tunnel->parms.name, dev->name);
+ iph->version = 4;
+ iph->ihl = 5;
+
+ if (tunnel->collect_md)
+ netif_keep_dst(dev);
+ netdev_lockdep_set_classes(dev);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_init);
+
+void ip_tunnel_uninit(struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ struct net *net = tunnel->net;
+ struct ip_tunnel_net *itn;
+
+ itn = net_generic(net, tunnel->ip_tnl_net_id);
+ ip_tunnel_del(itn, netdev_priv(dev));
+ if (itn->fb_tunnel_dev == dev)
+ WRITE_ONCE(itn->fb_tunnel_dev, NULL);
+
+ dst_cache_reset(&tunnel->dst_cache);
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
+
+/* Do least required initialization, rest of init is done in tunnel_init call */
+void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ tunnel->ip_tnl_net_id = net_id;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_setup);
+
+MODULE_DESCRIPTION("IPv4 tunnel implementation library");
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
new file mode 100644
index 000000000000..2e61ac137128
--- /dev/null
+++ b/net/ipv4/ip_tunnel_core.c
@@ -0,0 +1,1176 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2013 Nicira, Inc.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/in.h>
+#include <linux/if_arp.h>
+#include <linux/init.h>
+#include <linux/in6.h>
+#include <linux/inetdevice.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/etherdevice.h>
+#include <linux/if_ether.h>
+#include <linux/if_vlan.h>
+#include <linux/static_key.h>
+
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/protocol.h>
+#include <net/ip_tunnels.h>
+#include <net/ip6_tunnel.h>
+#include <net/ip6_checksum.h>
+#include <net/arp.h>
+#include <net/checksum.h>
+#include <net/dsfield.h>
+#include <net/inet_ecn.h>
+#include <net/xfrm.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/rtnetlink.h>
+#include <net/dst_metadata.h>
+#include <net/geneve.h>
+#include <net/vxlan.h>
+#include <net/erspan.h>
+
+const struct ip_tunnel_encap_ops __rcu *
+ iptun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly;
+EXPORT_SYMBOL(iptun_encaps);
+
+const struct ip6_tnl_encap_ops __rcu *
+ ip6tun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly;
+EXPORT_SYMBOL(ip6tun_encaps);
+
+void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
+ __be32 src, __be32 dst, __u8 proto,
+ __u8 tos, __u8 ttl, __be16 df, bool xnet,
+ u16 ipcb_flags)
+{
+ int pkt_len = skb->len - skb_inner_network_offset(skb);
+ struct net *net = dev_net(rt->dst.dev);
+ struct net_device *dev = skb->dev;
+ struct iphdr *iph;
+ int err;
+
+ skb_scrub_packet(skb, xnet);
+
+ skb_clear_hash_if_not_l4(skb);
+ skb_dst_set(skb, &rt->dst);
+ memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
+ IPCB(skb)->flags = ipcb_flags;
+
+ /* Push down and install the IP header. */
+ skb_push(skb, sizeof(struct iphdr));
+ skb_reset_network_header(skb);
+
+ iph = ip_hdr(skb);
+
+ iph->version = 4;
+ iph->ihl = sizeof(struct iphdr) >> 2;
+ iph->frag_off = ip_mtu_locked(&rt->dst) ? 0 : df;
+ iph->protocol = proto;
+ iph->tos = tos;
+ iph->daddr = dst;
+ iph->saddr = src;
+ iph->ttl = ttl;
+ __ip_select_ident(net, iph, skb_shinfo(skb)->gso_segs ?: 1);
+
+ err = ip_local_out(net, sk, skb);
+
+ if (dev) {
+ if (unlikely(net_xmit_eval(err)))
+ pkt_len = 0;
+ iptunnel_xmit_stats(dev, pkt_len);
+ }
+}
+EXPORT_SYMBOL_GPL(iptunnel_xmit);
+
+int __iptunnel_pull_header(struct sk_buff *skb, int hdr_len,
+ __be16 inner_proto, bool raw_proto, bool xnet)
+{
+ if (unlikely(!pskb_may_pull(skb, hdr_len)))
+ return -ENOMEM;
+
+ skb_pull_rcsum(skb, hdr_len);
+
+ if (!raw_proto && inner_proto == htons(ETH_P_TEB)) {
+ struct ethhdr *eh;
+
+ if (unlikely(!pskb_may_pull(skb, ETH_HLEN)))
+ return -ENOMEM;
+
+ eh = (struct ethhdr *)skb->data;
+ if (likely(eth_proto_is_802_3(eh->h_proto)))
+ skb->protocol = eh->h_proto;
+ else
+ skb->protocol = htons(ETH_P_802_2);
+
+ } else {
+ skb->protocol = inner_proto;
+ }
+
+ skb_clear_hash_if_not_l4(skb);
+ __vlan_hwaccel_clear_tag(skb);
+ skb_set_queue_mapping(skb, 0);
+ skb_scrub_packet(skb, xnet);
+
+ return iptunnel_pull_offloads(skb);
+}
+EXPORT_SYMBOL_GPL(__iptunnel_pull_header);
+
+struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md,
+ gfp_t flags)
+{
+ IP_TUNNEL_DECLARE_FLAGS(tun_flags) = { };
+ struct metadata_dst *res;
+ struct ip_tunnel_info *dst, *src;
+
+ if (!md || md->type != METADATA_IP_TUNNEL ||
+ md->u.tun_info.mode & IP_TUNNEL_INFO_TX)
+ return NULL;
+
+ src = &md->u.tun_info;
+ res = metadata_dst_alloc(src->options_len, METADATA_IP_TUNNEL, flags);
+ if (!res)
+ return NULL;
+
+ dst = &res->u.tun_info;
+ dst->key.tun_id = src->key.tun_id;
+ if (src->mode & IP_TUNNEL_INFO_IPV6)
+ memcpy(&dst->key.u.ipv6.dst, &src->key.u.ipv6.src,
+ sizeof(struct in6_addr));
+ else
+ dst->key.u.ipv4.dst = src->key.u.ipv4.src;
+ ip_tunnel_flags_copy(dst->key.tun_flags, src->key.tun_flags);
+ dst->mode = src->mode | IP_TUNNEL_INFO_TX;
+ ip_tunnel_info_opts_set(dst, ip_tunnel_info_opts(src),
+ src->options_len, tun_flags);
+
+ return res;
+}
+EXPORT_SYMBOL_GPL(iptunnel_metadata_reply);
+
+int iptunnel_handle_offloads(struct sk_buff *skb,
+ int gso_type_mask)
+{
+ int err;
+
+ if (likely(!skb->encapsulation)) {
+ skb_reset_inner_headers(skb);
+ skb->encapsulation = 1;
+ }
+
+ if (skb_is_gso(skb)) {
+ err = skb_header_unclone(skb, GFP_ATOMIC);
+ if (unlikely(err))
+ return err;
+ skb_shinfo(skb)->gso_type |= gso_type_mask;
+ return 0;
+ }
+
+ if (skb->ip_summed != CHECKSUM_PARTIAL) {
+ skb->ip_summed = CHECKSUM_NONE;
+ /* We clear encapsulation here to prevent badly-written
+ * drivers potentially deciding to offload an inner checksum
+ * if we set CHECKSUM_PARTIAL on the outer header.
+ * This should go away when the drivers are all fixed.
+ */
+ skb->encapsulation = 0;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(iptunnel_handle_offloads);
+
+/**
+ * iptunnel_pmtud_build_icmp() - Build ICMP error message for PMTUD
+ * @skb: Original packet with L2 header
+ * @mtu: MTU value for ICMP error
+ *
+ * Return: length on success, negative error code if message couldn't be built.
+ */
+static int iptunnel_pmtud_build_icmp(struct sk_buff *skb, int mtu)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+ struct icmphdr *icmph;
+ struct iphdr *niph;
+ struct ethhdr eh;
+ int len, err;
+
+ if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct iphdr)))
+ return -EINVAL;
+
+ if (skb_is_gso(skb))
+ skb_gso_reset(skb);
+
+ skb_copy_bits(skb, skb_mac_offset(skb), &eh, ETH_HLEN);
+ pskb_pull(skb, ETH_HLEN);
+ skb_reset_network_header(skb);
+
+ err = pskb_trim(skb, 576 - sizeof(*niph) - sizeof(*icmph));
+ if (err)
+ return err;
+
+ len = skb->len + sizeof(*icmph);
+ err = skb_cow(skb, sizeof(*niph) + sizeof(*icmph) + ETH_HLEN);
+ if (err)
+ return err;
+
+ icmph = skb_push(skb, sizeof(*icmph));
+ *icmph = (struct icmphdr) {
+ .type = ICMP_DEST_UNREACH,
+ .code = ICMP_FRAG_NEEDED,
+ .checksum = 0,
+ .un.frag.__unused = 0,
+ .un.frag.mtu = htons(mtu),
+ };
+ icmph->checksum = csum_fold(skb_checksum(skb, 0, len, 0));
+ skb_reset_transport_header(skb);
+
+ niph = skb_push(skb, sizeof(*niph));
+ *niph = (struct iphdr) {
+ .ihl = sizeof(*niph) / 4u,
+ .version = 4,
+ .tos = 0,
+ .tot_len = htons(len + sizeof(*niph)),
+ .id = 0,
+ .frag_off = htons(IP_DF),
+ .ttl = iph->ttl,
+ .protocol = IPPROTO_ICMP,
+ .saddr = iph->daddr,
+ .daddr = iph->saddr,
+ };
+ ip_send_check(niph);
+ skb_reset_network_header(skb);
+
+ skb->ip_summed = CHECKSUM_NONE;
+
+ eth_header(skb, skb->dev, ntohs(eh.h_proto), eh.h_source, eh.h_dest, 0);
+ skb_reset_mac_header(skb);
+
+ return skb->len;
+}
+
+/**
+ * iptunnel_pmtud_check_icmp() - Trigger ICMP reply if needed and allowed
+ * @skb: Buffer being sent by encapsulation, L2 headers expected
+ * @mtu: Network MTU for path
+ *
+ * Return: 0 for no ICMP reply, length if built, negative value on error.
+ */
+static int iptunnel_pmtud_check_icmp(struct sk_buff *skb, int mtu)
+{
+ const struct icmphdr *icmph = icmp_hdr(skb);
+ const struct iphdr *iph = ip_hdr(skb);
+
+ if (mtu < 576 || iph->frag_off != htons(IP_DF))
+ return 0;
+
+ if (ipv4_is_lbcast(iph->daddr) || ipv4_is_multicast(iph->daddr) ||
+ ipv4_is_zeronet(iph->saddr) || ipv4_is_loopback(iph->saddr) ||
+ ipv4_is_lbcast(iph->saddr) || ipv4_is_multicast(iph->saddr))
+ return 0;
+
+ if (iph->protocol == IPPROTO_ICMP && icmp_is_err(icmph->type))
+ return 0;
+
+ return iptunnel_pmtud_build_icmp(skb, mtu);
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+/**
+ * iptunnel_pmtud_build_icmpv6() - Build ICMPv6 error message for PMTUD
+ * @skb: Original packet with L2 header
+ * @mtu: MTU value for ICMPv6 error
+ *
+ * Return: length on success, negative error code if message couldn't be built.
+ */
+static int iptunnel_pmtud_build_icmpv6(struct sk_buff *skb, int mtu)
+{
+ const struct ipv6hdr *ip6h = ipv6_hdr(skb);
+ struct icmp6hdr *icmp6h;
+ struct ipv6hdr *nip6h;
+ struct ethhdr eh;
+ int len, err;
+ __wsum csum;
+
+ if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct ipv6hdr)))
+ return -EINVAL;
+
+ if (skb_is_gso(skb))
+ skb_gso_reset(skb);
+
+ skb_copy_bits(skb, skb_mac_offset(skb), &eh, ETH_HLEN);
+ pskb_pull(skb, ETH_HLEN);
+ skb_reset_network_header(skb);
+
+ err = pskb_trim(skb, IPV6_MIN_MTU - sizeof(*nip6h) - sizeof(*icmp6h));
+ if (err)
+ return err;
+
+ len = skb->len + sizeof(*icmp6h);
+ err = skb_cow(skb, sizeof(*nip6h) + sizeof(*icmp6h) + ETH_HLEN);
+ if (err)
+ return err;
+
+ icmp6h = skb_push(skb, sizeof(*icmp6h));
+ *icmp6h = (struct icmp6hdr) {
+ .icmp6_type = ICMPV6_PKT_TOOBIG,
+ .icmp6_code = 0,
+ .icmp6_cksum = 0,
+ .icmp6_mtu = htonl(mtu),
+ };
+ skb_reset_transport_header(skb);
+
+ nip6h = skb_push(skb, sizeof(*nip6h));
+ *nip6h = (struct ipv6hdr) {
+ .priority = 0,
+ .version = 6,
+ .flow_lbl = { 0 },
+ .payload_len = htons(len),
+ .nexthdr = IPPROTO_ICMPV6,
+ .hop_limit = ip6h->hop_limit,
+ .saddr = ip6h->daddr,
+ .daddr = ip6h->saddr,
+ };
+ skb_reset_network_header(skb);
+
+ csum = skb_checksum(skb, skb_transport_offset(skb), len, 0);
+ icmp6h->icmp6_cksum = csum_ipv6_magic(&nip6h->saddr, &nip6h->daddr, len,
+ IPPROTO_ICMPV6, csum);
+
+ skb->ip_summed = CHECKSUM_NONE;
+
+ eth_header(skb, skb->dev, ntohs(eh.h_proto), eh.h_source, eh.h_dest, 0);
+ skb_reset_mac_header(skb);
+
+ return skb->len;
+}
+
+/**
+ * iptunnel_pmtud_check_icmpv6() - Trigger ICMPv6 reply if needed and allowed
+ * @skb: Buffer being sent by encapsulation, L2 headers expected
+ * @mtu: Network MTU for path
+ *
+ * Return: 0 for no ICMPv6 reply, length if built, negative value on error.
+ */
+static int iptunnel_pmtud_check_icmpv6(struct sk_buff *skb, int mtu)
+{
+ const struct ipv6hdr *ip6h = ipv6_hdr(skb);
+ int stype = ipv6_addr_type(&ip6h->saddr);
+ u8 proto = ip6h->nexthdr;
+ __be16 frag_off;
+ int offset;
+
+ if (mtu < IPV6_MIN_MTU)
+ return 0;
+
+ if (stype == IPV6_ADDR_ANY || stype == IPV6_ADDR_MULTICAST ||
+ stype == IPV6_ADDR_LOOPBACK)
+ return 0;
+
+ offset = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &proto,
+ &frag_off);
+ if (offset < 0 || (frag_off & htons(~0x7)))
+ return 0;
+
+ if (proto == IPPROTO_ICMPV6) {
+ struct icmp6hdr *icmp6h;
+
+ if (!pskb_may_pull(skb, skb_network_header(skb) +
+ offset + 1 - skb->data))
+ return 0;
+
+ icmp6h = (struct icmp6hdr *)(skb_network_header(skb) + offset);
+ if (icmpv6_is_err(icmp6h->icmp6_type) ||
+ icmp6h->icmp6_type == NDISC_REDIRECT)
+ return 0;
+ }
+
+ return iptunnel_pmtud_build_icmpv6(skb, mtu);
+}
+#endif /* IS_ENABLED(CONFIG_IPV6) */
+
+/**
+ * skb_tunnel_check_pmtu() - Check, update PMTU and trigger ICMP reply as needed
+ * @skb: Buffer being sent by encapsulation, L2 headers expected
+ * @encap_dst: Destination for tunnel encapsulation (outer IP)
+ * @headroom: Encapsulation header size, bytes
+ * @reply: Build matching ICMP or ICMPv6 message as a result
+ *
+ * L2 tunnel implementations that can carry IP and can be directly bridged
+ * (currently UDP tunnels) can't always rely on IP forwarding paths to handle
+ * PMTU discovery. In the bridged case, ICMP or ICMPv6 messages need to be built
+ * based on payload and sent back by the encapsulation itself.
+ *
+ * For routable interfaces, we just need to update the PMTU for the destination.
+ *
+ * Return: 0 if ICMP error not needed, length if built, negative value on error
+ */
+int skb_tunnel_check_pmtu(struct sk_buff *skb, struct dst_entry *encap_dst,
+ int headroom, bool reply)
+{
+ u32 mtu = dst_mtu(encap_dst) - headroom;
+
+ if ((skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu)) ||
+ (!skb_is_gso(skb) && (skb->len - skb_network_offset(skb)) <= mtu))
+ return 0;
+
+ skb_dst_update_pmtu_no_confirm(skb, mtu);
+
+ if (!reply)
+ return 0;
+
+ if (skb->protocol == htons(ETH_P_IP))
+ return iptunnel_pmtud_check_icmp(skb, mtu);
+
+#if IS_ENABLED(CONFIG_IPV6)
+ if (skb->protocol == htons(ETH_P_IPV6))
+ return iptunnel_pmtud_check_icmpv6(skb, mtu);
+#endif
+ return 0;
+}
+EXPORT_SYMBOL(skb_tunnel_check_pmtu);
+
+static const struct nla_policy ip_tun_policy[LWTUNNEL_IP_MAX + 1] = {
+ [LWTUNNEL_IP_UNSPEC] = { .strict_start_type = LWTUNNEL_IP_OPTS },
+ [LWTUNNEL_IP_ID] = { .type = NLA_U64 },
+ [LWTUNNEL_IP_DST] = { .type = NLA_U32 },
+ [LWTUNNEL_IP_SRC] = { .type = NLA_U32 },
+ [LWTUNNEL_IP_TTL] = { .type = NLA_U8 },
+ [LWTUNNEL_IP_TOS] = { .type = NLA_U8 },
+ [LWTUNNEL_IP_FLAGS] = { .type = NLA_U16 },
+ [LWTUNNEL_IP_OPTS] = { .type = NLA_NESTED },
+};
+
+static const struct nla_policy ip_opts_policy[LWTUNNEL_IP_OPTS_MAX + 1] = {
+ [LWTUNNEL_IP_OPTS_GENEVE] = { .type = NLA_NESTED },
+ [LWTUNNEL_IP_OPTS_VXLAN] = { .type = NLA_NESTED },
+ [LWTUNNEL_IP_OPTS_ERSPAN] = { .type = NLA_NESTED },
+};
+
+static const struct nla_policy
+geneve_opt_policy[LWTUNNEL_IP_OPT_GENEVE_MAX + 1] = {
+ [LWTUNNEL_IP_OPT_GENEVE_CLASS] = { .type = NLA_U16 },
+ [LWTUNNEL_IP_OPT_GENEVE_TYPE] = { .type = NLA_U8 },
+ [LWTUNNEL_IP_OPT_GENEVE_DATA] = { .type = NLA_BINARY, .len = 127 },
+};
+
+static const struct nla_policy
+vxlan_opt_policy[LWTUNNEL_IP_OPT_VXLAN_MAX + 1] = {
+ [LWTUNNEL_IP_OPT_VXLAN_GBP] = { .type = NLA_U32 },
+};
+
+static const struct nla_policy
+erspan_opt_policy[LWTUNNEL_IP_OPT_ERSPAN_MAX + 1] = {
+ [LWTUNNEL_IP_OPT_ERSPAN_VER] = { .type = NLA_U8 },
+ [LWTUNNEL_IP_OPT_ERSPAN_INDEX] = { .type = NLA_U32 },
+ [LWTUNNEL_IP_OPT_ERSPAN_DIR] = { .type = NLA_U8 },
+ [LWTUNNEL_IP_OPT_ERSPAN_HWID] = { .type = NLA_U8 },
+};
+
+static int ip_tun_parse_opts_geneve(struct nlattr *attr,
+ struct ip_tunnel_info *info, int opts_len,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[LWTUNNEL_IP_OPT_GENEVE_MAX + 1];
+ int data_len, err;
+
+ err = nla_parse_nested(tb, LWTUNNEL_IP_OPT_GENEVE_MAX, attr,
+ geneve_opt_policy, extack);
+ if (err)
+ return err;
+
+ if (!tb[LWTUNNEL_IP_OPT_GENEVE_CLASS] ||
+ !tb[LWTUNNEL_IP_OPT_GENEVE_TYPE] ||
+ !tb[LWTUNNEL_IP_OPT_GENEVE_DATA])
+ return -EINVAL;
+
+ attr = tb[LWTUNNEL_IP_OPT_GENEVE_DATA];
+ data_len = nla_len(attr);
+ if (data_len % 4)
+ return -EINVAL;
+
+ if (info) {
+ struct geneve_opt *opt = ip_tunnel_info_opts(info) + opts_len;
+
+ memcpy(opt->opt_data, nla_data(attr), data_len);
+ opt->length = data_len / 4;
+ attr = tb[LWTUNNEL_IP_OPT_GENEVE_CLASS];
+ opt->opt_class = nla_get_be16(attr);
+ attr = tb[LWTUNNEL_IP_OPT_GENEVE_TYPE];
+ opt->type = nla_get_u8(attr);
+ __set_bit(IP_TUNNEL_GENEVE_OPT_BIT, info->key.tun_flags);
+ }
+
+ return sizeof(struct geneve_opt) + data_len;
+}
+
+static int ip_tun_parse_opts_vxlan(struct nlattr *attr,
+ struct ip_tunnel_info *info, int opts_len,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[LWTUNNEL_IP_OPT_VXLAN_MAX + 1];
+ int err;
+
+ err = nla_parse_nested(tb, LWTUNNEL_IP_OPT_VXLAN_MAX, attr,
+ vxlan_opt_policy, extack);
+ if (err)
+ return err;
+
+ if (!tb[LWTUNNEL_IP_OPT_VXLAN_GBP])
+ return -EINVAL;
+
+ if (info) {
+ struct vxlan_metadata *md =
+ ip_tunnel_info_opts(info) + opts_len;
+
+ attr = tb[LWTUNNEL_IP_OPT_VXLAN_GBP];
+ md->gbp = nla_get_u32(attr);
+ md->gbp &= VXLAN_GBP_MASK;
+ __set_bit(IP_TUNNEL_VXLAN_OPT_BIT, info->key.tun_flags);
+ }
+
+ return sizeof(struct vxlan_metadata);
+}
+
+static int ip_tun_parse_opts_erspan(struct nlattr *attr,
+ struct ip_tunnel_info *info, int opts_len,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[LWTUNNEL_IP_OPT_ERSPAN_MAX + 1];
+ int err;
+ u8 ver;
+
+ err = nla_parse_nested(tb, LWTUNNEL_IP_OPT_ERSPAN_MAX, attr,
+ erspan_opt_policy, extack);
+ if (err)
+ return err;
+
+ if (!tb[LWTUNNEL_IP_OPT_ERSPAN_VER])
+ return -EINVAL;
+
+ ver = nla_get_u8(tb[LWTUNNEL_IP_OPT_ERSPAN_VER]);
+ if (ver == 1) {
+ if (!tb[LWTUNNEL_IP_OPT_ERSPAN_INDEX])
+ return -EINVAL;
+ } else if (ver == 2) {
+ if (!tb[LWTUNNEL_IP_OPT_ERSPAN_DIR] ||
+ !tb[LWTUNNEL_IP_OPT_ERSPAN_HWID])
+ return -EINVAL;
+ } else {
+ return -EINVAL;
+ }
+
+ if (info) {
+ struct erspan_metadata *md =
+ ip_tunnel_info_opts(info) + opts_len;
+
+ md->version = ver;
+ if (ver == 1) {
+ attr = tb[LWTUNNEL_IP_OPT_ERSPAN_INDEX];
+ md->u.index = nla_get_be32(attr);
+ } else {
+ attr = tb[LWTUNNEL_IP_OPT_ERSPAN_DIR];
+ md->u.md2.dir = nla_get_u8(attr);
+ attr = tb[LWTUNNEL_IP_OPT_ERSPAN_HWID];
+ set_hwid(&md->u.md2, nla_get_u8(attr));
+ }
+
+ __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, info->key.tun_flags);
+ }
+
+ return sizeof(struct erspan_metadata);
+}
+
+static int ip_tun_parse_opts(struct nlattr *attr, struct ip_tunnel_info *info,
+ struct netlink_ext_ack *extack)
+{
+ int err, rem, opt_len, opts_len = 0;
+ struct nlattr *nla;
+ u32 type = 0;
+
+ if (!attr)
+ return 0;
+
+ err = nla_validate(nla_data(attr), nla_len(attr), LWTUNNEL_IP_OPTS_MAX,
+ ip_opts_policy, extack);
+ if (err)
+ return err;
+
+ nla_for_each_attr(nla, nla_data(attr), nla_len(attr), rem) {
+ switch (nla_type(nla)) {
+ case LWTUNNEL_IP_OPTS_GENEVE:
+ if (type && type != IP_TUNNEL_GENEVE_OPT_BIT)
+ return -EINVAL;
+ opt_len = ip_tun_parse_opts_geneve(nla, info, opts_len,
+ extack);
+ if (opt_len < 0)
+ return opt_len;
+ opts_len += opt_len;
+ if (opts_len > IP_TUNNEL_OPTS_MAX)
+ return -EINVAL;
+ type = IP_TUNNEL_GENEVE_OPT_BIT;
+ break;
+ case LWTUNNEL_IP_OPTS_VXLAN:
+ if (type)
+ return -EINVAL;
+ opt_len = ip_tun_parse_opts_vxlan(nla, info, opts_len,
+ extack);
+ if (opt_len < 0)
+ return opt_len;
+ opts_len += opt_len;
+ type = IP_TUNNEL_VXLAN_OPT_BIT;
+ break;
+ case LWTUNNEL_IP_OPTS_ERSPAN:
+ if (type)
+ return -EINVAL;
+ opt_len = ip_tun_parse_opts_erspan(nla, info, opts_len,
+ extack);
+ if (opt_len < 0)
+ return opt_len;
+ opts_len += opt_len;
+ type = IP_TUNNEL_ERSPAN_OPT_BIT;
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
+
+ return opts_len;
+}
+
+static int ip_tun_get_optlen(struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ return ip_tun_parse_opts(attr, NULL, extack);
+}
+
+static int ip_tun_set_opts(struct nlattr *attr, struct ip_tunnel_info *info,
+ struct netlink_ext_ack *extack)
+{
+ return ip_tun_parse_opts(attr, info, extack);
+}
+
+static int ip_tun_build_state(struct net *net, struct nlattr *attr,
+ unsigned int family, const void *cfg,
+ struct lwtunnel_state **ts,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[LWTUNNEL_IP_MAX + 1];
+ struct lwtunnel_state *new_state;
+ struct ip_tunnel_info *tun_info;
+ int err, opt_len;
+
+ err = nla_parse_nested_deprecated(tb, LWTUNNEL_IP_MAX, attr,
+ ip_tun_policy, extack);
+ if (err < 0)
+ return err;
+
+ opt_len = ip_tun_get_optlen(tb[LWTUNNEL_IP_OPTS], extack);
+ if (opt_len < 0)
+ return opt_len;
+
+ new_state = lwtunnel_state_alloc(sizeof(*tun_info) + opt_len);
+ if (!new_state)
+ return -ENOMEM;
+
+ new_state->type = LWTUNNEL_ENCAP_IP;
+
+ tun_info = lwt_tun_info(new_state);
+
+ err = ip_tun_set_opts(tb[LWTUNNEL_IP_OPTS], tun_info, extack);
+ if (err < 0) {
+ lwtstate_free(new_state);
+ return err;
+ }
+
+#ifdef CONFIG_DST_CACHE
+ err = dst_cache_init(&tun_info->dst_cache, GFP_KERNEL);
+ if (err) {
+ lwtstate_free(new_state);
+ return err;
+ }
+#endif
+
+ if (tb[LWTUNNEL_IP_ID])
+ tun_info->key.tun_id = nla_get_be64(tb[LWTUNNEL_IP_ID]);
+
+ if (tb[LWTUNNEL_IP_DST])
+ tun_info->key.u.ipv4.dst = nla_get_in_addr(tb[LWTUNNEL_IP_DST]);
+
+ if (tb[LWTUNNEL_IP_SRC])
+ tun_info->key.u.ipv4.src = nla_get_in_addr(tb[LWTUNNEL_IP_SRC]);
+
+ if (tb[LWTUNNEL_IP_TTL])
+ tun_info->key.ttl = nla_get_u8(tb[LWTUNNEL_IP_TTL]);
+
+ if (tb[LWTUNNEL_IP_TOS])
+ tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP_TOS]);
+
+ if (tb[LWTUNNEL_IP_FLAGS]) {
+ IP_TUNNEL_DECLARE_FLAGS(flags);
+
+ ip_tunnel_flags_from_be16(flags,
+ nla_get_be16(tb[LWTUNNEL_IP_FLAGS]));
+ ip_tunnel_clear_options_present(flags);
+
+ ip_tunnel_flags_or(tun_info->key.tun_flags,
+ tun_info->key.tun_flags, flags);
+ }
+
+ tun_info->mode = IP_TUNNEL_INFO_TX;
+ tun_info->options_len = opt_len;
+
+ *ts = new_state;
+
+ return 0;
+}
+
+static void ip_tun_destroy_state(struct lwtunnel_state *lwtstate)
+{
+#ifdef CONFIG_DST_CACHE
+ struct ip_tunnel_info *tun_info = lwt_tun_info(lwtstate);
+
+ dst_cache_destroy(&tun_info->dst_cache);
+#endif
+}
+
+static int ip_tun_fill_encap_opts_geneve(struct sk_buff *skb,
+ struct ip_tunnel_info *tun_info)
+{
+ struct geneve_opt *opt;
+ struct nlattr *nest;
+ int offset = 0;
+
+ nest = nla_nest_start_noflag(skb, LWTUNNEL_IP_OPTS_GENEVE);
+ if (!nest)
+ return -ENOMEM;
+
+ while (tun_info->options_len > offset) {
+ opt = ip_tunnel_info_opts(tun_info) + offset;
+ if (nla_put_be16(skb, LWTUNNEL_IP_OPT_GENEVE_CLASS,
+ opt->opt_class) ||
+ nla_put_u8(skb, LWTUNNEL_IP_OPT_GENEVE_TYPE, opt->type) ||
+ nla_put(skb, LWTUNNEL_IP_OPT_GENEVE_DATA, opt->length * 4,
+ opt->opt_data)) {
+ nla_nest_cancel(skb, nest);
+ return -ENOMEM;
+ }
+ offset += sizeof(*opt) + opt->length * 4;
+ }
+
+ nla_nest_end(skb, nest);
+ return 0;
+}
+
+static int ip_tun_fill_encap_opts_vxlan(struct sk_buff *skb,
+ struct ip_tunnel_info *tun_info)
+{
+ struct vxlan_metadata *md;
+ struct nlattr *nest;
+
+ nest = nla_nest_start_noflag(skb, LWTUNNEL_IP_OPTS_VXLAN);
+ if (!nest)
+ return -ENOMEM;
+
+ md = ip_tunnel_info_opts(tun_info);
+ if (nla_put_u32(skb, LWTUNNEL_IP_OPT_VXLAN_GBP, md->gbp)) {
+ nla_nest_cancel(skb, nest);
+ return -ENOMEM;
+ }
+
+ nla_nest_end(skb, nest);
+ return 0;
+}
+
+static int ip_tun_fill_encap_opts_erspan(struct sk_buff *skb,
+ struct ip_tunnel_info *tun_info)
+{
+ struct erspan_metadata *md;
+ struct nlattr *nest;
+
+ nest = nla_nest_start_noflag(skb, LWTUNNEL_IP_OPTS_ERSPAN);
+ if (!nest)
+ return -ENOMEM;
+
+ md = ip_tunnel_info_opts(tun_info);
+ if (nla_put_u8(skb, LWTUNNEL_IP_OPT_ERSPAN_VER, md->version))
+ goto err;
+
+ if (md->version == 1 &&
+ nla_put_be32(skb, LWTUNNEL_IP_OPT_ERSPAN_INDEX, md->u.index))
+ goto err;
+
+ if (md->version == 2 &&
+ (nla_put_u8(skb, LWTUNNEL_IP_OPT_ERSPAN_DIR, md->u.md2.dir) ||
+ nla_put_u8(skb, LWTUNNEL_IP_OPT_ERSPAN_HWID,
+ get_hwid(&md->u.md2))))
+ goto err;
+
+ nla_nest_end(skb, nest);
+ return 0;
+err:
+ nla_nest_cancel(skb, nest);
+ return -ENOMEM;
+}
+
+static int ip_tun_fill_encap_opts(struct sk_buff *skb, int type,
+ struct ip_tunnel_info *tun_info)
+{
+ struct nlattr *nest;
+ int err = 0;
+
+ if (!ip_tunnel_is_options_present(tun_info->key.tun_flags))
+ return 0;
+
+ nest = nla_nest_start_noflag(skb, type);
+ if (!nest)
+ return -ENOMEM;
+
+ if (test_bit(IP_TUNNEL_GENEVE_OPT_BIT, tun_info->key.tun_flags))
+ err = ip_tun_fill_encap_opts_geneve(skb, tun_info);
+ else if (test_bit(IP_TUNNEL_VXLAN_OPT_BIT, tun_info->key.tun_flags))
+ err = ip_tun_fill_encap_opts_vxlan(skb, tun_info);
+ else if (test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, tun_info->key.tun_flags))
+ err = ip_tun_fill_encap_opts_erspan(skb, tun_info);
+
+ if (err) {
+ nla_nest_cancel(skb, nest);
+ return err;
+ }
+
+ nla_nest_end(skb, nest);
+ return 0;
+}
+
+static int ip_tun_fill_encap_info(struct sk_buff *skb,
+ struct lwtunnel_state *lwtstate)
+{
+ struct ip_tunnel_info *tun_info = lwt_tun_info(lwtstate);
+
+ if (nla_put_be64(skb, LWTUNNEL_IP_ID, tun_info->key.tun_id,
+ LWTUNNEL_IP_PAD) ||
+ nla_put_in_addr(skb, LWTUNNEL_IP_DST, tun_info->key.u.ipv4.dst) ||
+ nla_put_in_addr(skb, LWTUNNEL_IP_SRC, tun_info->key.u.ipv4.src) ||
+ nla_put_u8(skb, LWTUNNEL_IP_TOS, tun_info->key.tos) ||
+ nla_put_u8(skb, LWTUNNEL_IP_TTL, tun_info->key.ttl) ||
+ nla_put_be16(skb, LWTUNNEL_IP_FLAGS,
+ ip_tunnel_flags_to_be16(tun_info->key.tun_flags)) ||
+ ip_tun_fill_encap_opts(skb, LWTUNNEL_IP_OPTS, tun_info))
+ return -ENOMEM;
+
+ return 0;
+}
+
+static int ip_tun_opts_nlsize(struct ip_tunnel_info *info)
+{
+ int opt_len;
+
+ if (!ip_tunnel_is_options_present(info->key.tun_flags))
+ return 0;
+
+ opt_len = nla_total_size(0); /* LWTUNNEL_IP_OPTS */
+ if (test_bit(IP_TUNNEL_GENEVE_OPT_BIT, info->key.tun_flags)) {
+ struct geneve_opt *opt;
+ int offset = 0;
+
+ opt_len += nla_total_size(0); /* LWTUNNEL_IP_OPTS_GENEVE */
+ while (info->options_len > offset) {
+ opt = ip_tunnel_info_opts(info) + offset;
+ opt_len += nla_total_size(2) /* OPT_GENEVE_CLASS */
+ + nla_total_size(1) /* OPT_GENEVE_TYPE */
+ + nla_total_size(opt->length * 4);
+ /* OPT_GENEVE_DATA */
+ offset += sizeof(*opt) + opt->length * 4;
+ }
+ } else if (test_bit(IP_TUNNEL_VXLAN_OPT_BIT, info->key.tun_flags)) {
+ opt_len += nla_total_size(0) /* LWTUNNEL_IP_OPTS_VXLAN */
+ + nla_total_size(4); /* OPT_VXLAN_GBP */
+ } else if (test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, info->key.tun_flags)) {
+ struct erspan_metadata *md = ip_tunnel_info_opts(info);
+
+ opt_len += nla_total_size(0) /* LWTUNNEL_IP_OPTS_ERSPAN */
+ + nla_total_size(1) /* OPT_ERSPAN_VER */
+ + (md->version == 1 ? nla_total_size(4)
+ /* OPT_ERSPAN_INDEX (v1) */
+ : nla_total_size(1) +
+ nla_total_size(1));
+ /* OPT_ERSPAN_DIR + HWID (v2) */
+ }
+
+ return opt_len;
+}
+
+static int ip_tun_encap_nlsize(struct lwtunnel_state *lwtstate)
+{
+ return nla_total_size_64bit(8) /* LWTUNNEL_IP_ID */
+ + nla_total_size(4) /* LWTUNNEL_IP_DST */
+ + nla_total_size(4) /* LWTUNNEL_IP_SRC */
+ + nla_total_size(1) /* LWTUNNEL_IP_TOS */
+ + nla_total_size(1) /* LWTUNNEL_IP_TTL */
+ + nla_total_size(2) /* LWTUNNEL_IP_FLAGS */
+ + ip_tun_opts_nlsize(lwt_tun_info(lwtstate));
+ /* LWTUNNEL_IP_OPTS */
+}
+
+static int ip_tun_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b)
+{
+ struct ip_tunnel_info *info_a = lwt_tun_info(a);
+ struct ip_tunnel_info *info_b = lwt_tun_info(b);
+
+ return memcmp(info_a, info_b, sizeof(info_a->key)) ||
+ info_a->mode != info_b->mode ||
+ info_a->options_len != info_b->options_len ||
+ memcmp(ip_tunnel_info_opts(info_a),
+ ip_tunnel_info_opts(info_b), info_a->options_len);
+}
+
+static const struct lwtunnel_encap_ops ip_tun_lwt_ops = {
+ .build_state = ip_tun_build_state,
+ .destroy_state = ip_tun_destroy_state,
+ .fill_encap = ip_tun_fill_encap_info,
+ .get_encap_size = ip_tun_encap_nlsize,
+ .cmp_encap = ip_tun_cmp_encap,
+ .owner = THIS_MODULE,
+};
+
+static const struct nla_policy ip6_tun_policy[LWTUNNEL_IP6_MAX + 1] = {
+ [LWTUNNEL_IP6_UNSPEC] = { .strict_start_type = LWTUNNEL_IP6_OPTS },
+ [LWTUNNEL_IP6_ID] = { .type = NLA_U64 },
+ [LWTUNNEL_IP6_DST] = { .len = sizeof(struct in6_addr) },
+ [LWTUNNEL_IP6_SRC] = { .len = sizeof(struct in6_addr) },
+ [LWTUNNEL_IP6_HOPLIMIT] = { .type = NLA_U8 },
+ [LWTUNNEL_IP6_TC] = { .type = NLA_U8 },
+ [LWTUNNEL_IP6_FLAGS] = { .type = NLA_U16 },
+ [LWTUNNEL_IP6_OPTS] = { .type = NLA_NESTED },
+};
+
+static int ip6_tun_build_state(struct net *net, struct nlattr *attr,
+ unsigned int family, const void *cfg,
+ struct lwtunnel_state **ts,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[LWTUNNEL_IP6_MAX + 1];
+ struct lwtunnel_state *new_state;
+ struct ip_tunnel_info *tun_info;
+ int err, opt_len;
+
+ err = nla_parse_nested_deprecated(tb, LWTUNNEL_IP6_MAX, attr,
+ ip6_tun_policy, extack);
+ if (err < 0)
+ return err;
+
+ opt_len = ip_tun_get_optlen(tb[LWTUNNEL_IP6_OPTS], extack);
+ if (opt_len < 0)
+ return opt_len;
+
+ new_state = lwtunnel_state_alloc(sizeof(*tun_info) + opt_len);
+ if (!new_state)
+ return -ENOMEM;
+
+ new_state->type = LWTUNNEL_ENCAP_IP6;
+
+ tun_info = lwt_tun_info(new_state);
+
+ err = ip_tun_set_opts(tb[LWTUNNEL_IP6_OPTS], tun_info, extack);
+ if (err < 0) {
+ lwtstate_free(new_state);
+ return err;
+ }
+
+ if (tb[LWTUNNEL_IP6_ID])
+ tun_info->key.tun_id = nla_get_be64(tb[LWTUNNEL_IP6_ID]);
+
+ if (tb[LWTUNNEL_IP6_DST])
+ tun_info->key.u.ipv6.dst = nla_get_in6_addr(tb[LWTUNNEL_IP6_DST]);
+
+ if (tb[LWTUNNEL_IP6_SRC])
+ tun_info->key.u.ipv6.src = nla_get_in6_addr(tb[LWTUNNEL_IP6_SRC]);
+
+ if (tb[LWTUNNEL_IP6_HOPLIMIT])
+ tun_info->key.ttl = nla_get_u8(tb[LWTUNNEL_IP6_HOPLIMIT]);
+
+ if (tb[LWTUNNEL_IP6_TC])
+ tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP6_TC]);
+
+ if (tb[LWTUNNEL_IP6_FLAGS]) {
+ IP_TUNNEL_DECLARE_FLAGS(flags);
+ __be16 data;
+
+ data = nla_get_be16(tb[LWTUNNEL_IP6_FLAGS]);
+ ip_tunnel_flags_from_be16(flags, data);
+ ip_tunnel_clear_options_present(flags);
+
+ ip_tunnel_flags_or(tun_info->key.tun_flags,
+ tun_info->key.tun_flags, flags);
+ }
+
+ tun_info->mode = IP_TUNNEL_INFO_TX | IP_TUNNEL_INFO_IPV6;
+ tun_info->options_len = opt_len;
+
+ *ts = new_state;
+
+ return 0;
+}
+
+static int ip6_tun_fill_encap_info(struct sk_buff *skb,
+ struct lwtunnel_state *lwtstate)
+{
+ struct ip_tunnel_info *tun_info = lwt_tun_info(lwtstate);
+
+ if (nla_put_be64(skb, LWTUNNEL_IP6_ID, tun_info->key.tun_id,
+ LWTUNNEL_IP6_PAD) ||
+ nla_put_in6_addr(skb, LWTUNNEL_IP6_DST, &tun_info->key.u.ipv6.dst) ||
+ nla_put_in6_addr(skb, LWTUNNEL_IP6_SRC, &tun_info->key.u.ipv6.src) ||
+ nla_put_u8(skb, LWTUNNEL_IP6_TC, tun_info->key.tos) ||
+ nla_put_u8(skb, LWTUNNEL_IP6_HOPLIMIT, tun_info->key.ttl) ||
+ nla_put_be16(skb, LWTUNNEL_IP6_FLAGS,
+ ip_tunnel_flags_to_be16(tun_info->key.tun_flags)) ||
+ ip_tun_fill_encap_opts(skb, LWTUNNEL_IP6_OPTS, tun_info))
+ return -ENOMEM;
+
+ return 0;
+}
+
+static int ip6_tun_encap_nlsize(struct lwtunnel_state *lwtstate)
+{
+ return nla_total_size_64bit(8) /* LWTUNNEL_IP6_ID */
+ + nla_total_size(16) /* LWTUNNEL_IP6_DST */
+ + nla_total_size(16) /* LWTUNNEL_IP6_SRC */
+ + nla_total_size(1) /* LWTUNNEL_IP6_HOPLIMIT */
+ + nla_total_size(1) /* LWTUNNEL_IP6_TC */
+ + nla_total_size(2) /* LWTUNNEL_IP6_FLAGS */
+ + ip_tun_opts_nlsize(lwt_tun_info(lwtstate));
+ /* LWTUNNEL_IP6_OPTS */
+}
+
+static const struct lwtunnel_encap_ops ip6_tun_lwt_ops = {
+ .build_state = ip6_tun_build_state,
+ .fill_encap = ip6_tun_fill_encap_info,
+ .get_encap_size = ip6_tun_encap_nlsize,
+ .cmp_encap = ip_tun_cmp_encap,
+ .owner = THIS_MODULE,
+};
+
+void __init ip_tunnel_core_init(void)
+{
+ /* If you land here, make sure whether increasing ip_tunnel_info's
+ * options_len is a reasonable choice with its usage in front ends
+ * (f.e., it's part of flow keys, etc).
+ */
+ BUILD_BUG_ON(IP_TUNNEL_OPTS_MAX != 255);
+
+ lwtunnel_encap_add_ops(&ip_tun_lwt_ops, LWTUNNEL_ENCAP_IP);
+ lwtunnel_encap_add_ops(&ip6_tun_lwt_ops, LWTUNNEL_ENCAP_IP6);
+}
+
+DEFINE_STATIC_KEY_FALSE(ip_tunnel_metadata_cnt);
+EXPORT_SYMBOL(ip_tunnel_metadata_cnt);
+
+void ip_tunnel_need_metadata(void)
+{
+ static_branch_inc(&ip_tunnel_metadata_cnt);
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_need_metadata);
+
+void ip_tunnel_unneed_metadata(void)
+{
+ static_branch_dec(&ip_tunnel_metadata_cnt);
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_unneed_metadata);
+
+/* Returns either the correct skb->protocol value, or 0 if invalid. */
+__be16 ip_tunnel_parse_protocol(const struct sk_buff *skb)
+{
+ if (skb_network_header(skb) >= skb->head &&
+ (skb_network_header(skb) + sizeof(struct iphdr)) <= skb_tail_pointer(skb) &&
+ ip_hdr(skb)->version == 4)
+ return htons(ETH_P_IP);
+ if (skb_network_header(skb) >= skb->head &&
+ (skb_network_header(skb) + sizeof(struct ipv6hdr)) <= skb_tail_pointer(skb) &&
+ ipv6_hdr(skb)->version == 6)
+ return htons(ETH_P_IPV6);
+ return 0;
+}
+EXPORT_SYMBOL(ip_tunnel_parse_protocol);
+
+const struct header_ops ip_tunnel_header_ops = { .parse_protocol = ip_tunnel_parse_protocol };
+EXPORT_SYMBOL(ip_tunnel_header_ops);
+
+/* This function returns true when ENCAP attributes are present in the nl msg */
+bool ip_tunnel_netlink_encap_parms(struct nlattr *data[],
+ struct ip_tunnel_encap *encap)
+{
+ bool ret = false;
+
+ memset(encap, 0, sizeof(*encap));
+
+ if (!data)
+ return ret;
+
+ if (data[IFLA_IPTUN_ENCAP_TYPE]) {
+ ret = true;
+ encap->type = nla_get_u16(data[IFLA_IPTUN_ENCAP_TYPE]);
+ }
+
+ if (data[IFLA_IPTUN_ENCAP_FLAGS]) {
+ ret = true;
+ encap->flags = nla_get_u16(data[IFLA_IPTUN_ENCAP_FLAGS]);
+ }
+
+ if (data[IFLA_IPTUN_ENCAP_SPORT]) {
+ ret = true;
+ encap->sport = nla_get_be16(data[IFLA_IPTUN_ENCAP_SPORT]);
+ }
+
+ if (data[IFLA_IPTUN_ENCAP_DPORT]) {
+ ret = true;
+ encap->dport = nla_get_be16(data[IFLA_IPTUN_ENCAP_DPORT]);
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_netlink_encap_parms);
+
+void ip_tunnel_netlink_parms(struct nlattr *data[],
+ struct ip_tunnel_parm_kern *parms)
+{
+ if (data[IFLA_IPTUN_LINK])
+ parms->link = nla_get_u32(data[IFLA_IPTUN_LINK]);
+
+ if (data[IFLA_IPTUN_LOCAL])
+ parms->iph.saddr = nla_get_be32(data[IFLA_IPTUN_LOCAL]);
+
+ if (data[IFLA_IPTUN_REMOTE])
+ parms->iph.daddr = nla_get_be32(data[IFLA_IPTUN_REMOTE]);
+
+ if (data[IFLA_IPTUN_TTL]) {
+ parms->iph.ttl = nla_get_u8(data[IFLA_IPTUN_TTL]);
+ if (parms->iph.ttl)
+ parms->iph.frag_off = htons(IP_DF);
+ }
+
+ if (data[IFLA_IPTUN_TOS])
+ parms->iph.tos = nla_get_u8(data[IFLA_IPTUN_TOS]);
+
+ if (!data[IFLA_IPTUN_PMTUDISC] || nla_get_u8(data[IFLA_IPTUN_PMTUDISC]))
+ parms->iph.frag_off = htons(IP_DF);
+
+ if (data[IFLA_IPTUN_FLAGS]) {
+ __be16 flags;
+
+ flags = nla_get_be16(data[IFLA_IPTUN_FLAGS]);
+ ip_tunnel_flags_from_be16(parms->i_flags, flags);
+ }
+
+ if (data[IFLA_IPTUN_PROTO])
+ parms->iph.protocol = nla_get_u8(data[IFLA_IPTUN_PROTO]);
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_netlink_parms);
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
new file mode 100644
index 000000000000..95b6bb78fcd2
--- /dev/null
+++ b/net/ipv4/ip_vti.c
@@ -0,0 +1,744 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Linux NET3: IP/IP protocol decoder modified to support
+ * virtual tunnel interface
+ *
+ * Authors:
+ * Saurabh Mohan (saurabh.mohan@vyatta.com) 05/07/2012
+ */
+
+/*
+ This version of net/ipv4/ip_vti.c is cloned of net/ipv4/ipip.c
+
+ For comments look at net/ipv4/ip_gre.c --ANK
+ */
+
+
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/uaccess.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/in.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/if_arp.h>
+#include <linux/init.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/if_ether.h>
+#include <linux/icmpv6.h>
+
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/ip_tunnels.h>
+#include <net/inet_ecn.h>
+#include <net/xfrm.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+
+static struct rtnl_link_ops vti_link_ops __read_mostly;
+
+static unsigned int vti_net_id __read_mostly;
+static int vti_tunnel_init(struct net_device *dev);
+
+static int vti_input(struct sk_buff *skb, int nexthdr, __be32 spi,
+ int encap_type, bool update_skb_dev)
+{
+ struct ip_tunnel *tunnel;
+ const struct iphdr *iph = ip_hdr(skb);
+ struct net *net = dev_net(skb->dev);
+ struct ip_tunnel_net *itn = net_generic(net, vti_net_id);
+ IP_TUNNEL_DECLARE_FLAGS(flags) = { };
+
+ __set_bit(IP_TUNNEL_NO_KEY_BIT, flags);
+
+ tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, flags,
+ iph->saddr, iph->daddr, 0);
+ if (tunnel) {
+ if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
+ goto drop;
+
+ XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = tunnel;
+
+ if (update_skb_dev)
+ skb->dev = tunnel->dev;
+
+ return xfrm_input(skb, nexthdr, spi, encap_type);
+ }
+
+ return -EINVAL;
+drop:
+ kfree_skb(skb);
+ return 0;
+}
+
+static int vti_input_proto(struct sk_buff *skb, int nexthdr, __be32 spi,
+ int encap_type)
+{
+ return vti_input(skb, nexthdr, spi, encap_type, false);
+}
+
+static int vti_rcv(struct sk_buff *skb, __be32 spi, bool update_skb_dev)
+{
+ XFRM_SPI_SKB_CB(skb)->family = AF_INET;
+ XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr);
+
+ return vti_input(skb, ip_hdr(skb)->protocol, spi, 0, update_skb_dev);
+}
+
+static int vti_rcv_proto(struct sk_buff *skb)
+{
+ return vti_rcv(skb, 0, false);
+}
+
+static int vti_rcv_cb(struct sk_buff *skb, int err)
+{
+ unsigned short family;
+ struct net_device *dev;
+ struct xfrm_state *x;
+ const struct xfrm_mode *inner_mode;
+ struct ip_tunnel *tunnel = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4;
+ u32 orig_mark = skb->mark;
+ int ret;
+
+ if (!tunnel)
+ return 1;
+
+ dev = tunnel->dev;
+
+ if (err) {
+ DEV_STATS_INC(dev, rx_errors);
+ DEV_STATS_INC(dev, rx_dropped);
+
+ return 0;
+ }
+
+ x = xfrm_input_state(skb);
+
+ inner_mode = &x->inner_mode;
+
+ if (x->sel.family == AF_UNSPEC) {
+ inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol);
+ if (inner_mode == NULL) {
+ XFRM_INC_STATS(dev_net(skb->dev),
+ LINUX_MIB_XFRMINSTATEMODEERROR);
+ return -EINVAL;
+ }
+ }
+
+ family = inner_mode->family;
+
+ skb->mark = be32_to_cpu(tunnel->parms.i_key);
+ ret = xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family);
+ skb->mark = orig_mark;
+
+ if (!ret)
+ return -EPERM;
+
+ skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(skb->dev)));
+ skb->dev = dev;
+ dev_sw_netstats_rx_add(dev, skb->len);
+
+ return 0;
+}
+
+static bool vti_state_check(const struct xfrm_state *x, __be32 dst, __be32 src)
+{
+ xfrm_address_t *daddr = (xfrm_address_t *)&dst;
+ xfrm_address_t *saddr = (xfrm_address_t *)&src;
+
+ /* if there is no transform then this tunnel is not functional.
+ * Or if the xfrm is not mode tunnel.
+ */
+ if (!x || x->props.mode != XFRM_MODE_TUNNEL ||
+ x->props.family != AF_INET)
+ return false;
+
+ if (!dst)
+ return xfrm_addr_equal(saddr, &x->props.saddr, AF_INET);
+
+ if (!xfrm_state_addr_check(x, daddr, saddr, AF_INET))
+ return false;
+
+ return true;
+}
+
+static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev,
+ struct flowi *fl)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ struct ip_tunnel_parm_kern *parms = &tunnel->parms;
+ struct dst_entry *dst = skb_dst(skb);
+ struct net_device *tdev; /* Device to other host */
+ int pkt_len = skb->len;
+ int err;
+ int mtu;
+
+ if (!dst) {
+ switch (skb->protocol) {
+ case htons(ETH_P_IP): {
+ struct rtable *rt;
+
+ fl->u.ip4.flowi4_oif = dev->ifindex;
+ fl->u.ip4.flowi4_flags |= FLOWI_FLAG_ANYSRC;
+ rt = __ip_route_output_key(dev_net(dev), &fl->u.ip4);
+ if (IS_ERR(rt)) {
+ DEV_STATS_INC(dev, tx_carrier_errors);
+ goto tx_error_icmp;
+ }
+ dst = &rt->dst;
+ skb_dst_set(skb, dst);
+ break;
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ case htons(ETH_P_IPV6):
+ fl->u.ip6.flowi6_oif = dev->ifindex;
+ fl->u.ip6.flowi6_flags |= FLOWI_FLAG_ANYSRC;
+ dst = ip6_route_output(dev_net(dev), NULL, &fl->u.ip6);
+ if (dst->error) {
+ dst_release(dst);
+ dst = NULL;
+ DEV_STATS_INC(dev, tx_carrier_errors);
+ goto tx_error_icmp;
+ }
+ skb_dst_set(skb, dst);
+ break;
+#endif
+ default:
+ DEV_STATS_INC(dev, tx_carrier_errors);
+ goto tx_error_icmp;
+ }
+ }
+
+ dst_hold(dst);
+ dst = xfrm_lookup_route(tunnel->net, dst, fl, NULL, 0);
+ if (IS_ERR(dst)) {
+ DEV_STATS_INC(dev, tx_carrier_errors);
+ goto tx_error_icmp;
+ }
+
+ if (dst->flags & DST_XFRM_QUEUE)
+ goto xmit;
+
+ if (!vti_state_check(dst->xfrm, parms->iph.daddr, parms->iph.saddr)) {
+ DEV_STATS_INC(dev, tx_carrier_errors);
+ dst_release(dst);
+ goto tx_error_icmp;
+ }
+
+ tdev = dst_dev(dst);
+
+ if (tdev == dev) {
+ dst_release(dst);
+ DEV_STATS_INC(dev, collisions);
+ goto tx_error;
+ }
+
+ mtu = dst_mtu(dst);
+ if (skb->len > mtu) {
+ skb_dst_update_pmtu_no_confirm(skb, mtu);
+ if (skb->protocol == htons(ETH_P_IP)) {
+ if (!(ip_hdr(skb)->frag_off & htons(IP_DF)))
+ goto xmit;
+ icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
+ htonl(mtu));
+ } else {
+ if (mtu < IPV6_MIN_MTU)
+ mtu = IPV6_MIN_MTU;
+
+ icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+ }
+
+ dst_release(dst);
+ goto tx_error;
+ }
+
+xmit:
+ skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(dev)));
+ skb_dst_set(skb, dst);
+ skb->dev = skb_dst_dev(skb);
+
+ err = dst_output(tunnel->net, skb->sk, skb);
+ if (net_xmit_eval(err) == 0)
+ err = pkt_len;
+ iptunnel_xmit_stats(dev, err);
+ return NETDEV_TX_OK;
+
+tx_error_icmp:
+ dst_link_failure(skb);
+tx_error:
+ DEV_STATS_INC(dev, tx_errors);
+ kfree_skb(skb);
+ return NETDEV_TX_OK;
+}
+
+/* This function assumes it is being called from dev_queue_xmit()
+ * and that skb is filled properly by that function.
+ */
+static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ struct flowi fl;
+
+ if (!pskb_inet_may_pull(skb))
+ goto tx_err;
+
+ memset(&fl, 0, sizeof(fl));
+
+ switch (skb->protocol) {
+ case htons(ETH_P_IP):
+ memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
+ xfrm_decode_session(dev_net(dev), skb, &fl, AF_INET);
+ break;
+ case htons(ETH_P_IPV6):
+ memset(IP6CB(skb), 0, sizeof(*IP6CB(skb)));
+ xfrm_decode_session(dev_net(dev), skb, &fl, AF_INET6);
+ break;
+ default:
+ goto tx_err;
+ }
+
+ /* override mark with tunnel output key */
+ fl.flowi_mark = be32_to_cpu(tunnel->parms.o_key);
+
+ return vti_xmit(skb, dev, &fl);
+
+tx_err:
+ DEV_STATS_INC(dev, tx_errors);
+ kfree_skb(skb);
+ return NETDEV_TX_OK;
+}
+
+static int vti4_err(struct sk_buff *skb, u32 info)
+{
+ __be32 spi;
+ __u32 mark;
+ struct xfrm_state *x;
+ struct ip_tunnel *tunnel;
+ struct ip_esp_hdr *esph;
+ struct ip_auth_hdr *ah ;
+ struct ip_comp_hdr *ipch;
+ struct net *net = dev_net(skb->dev);
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
+ int protocol = iph->protocol;
+ struct ip_tunnel_net *itn = net_generic(net, vti_net_id);
+ IP_TUNNEL_DECLARE_FLAGS(flags) = { };
+
+ __set_bit(IP_TUNNEL_NO_KEY_BIT, flags);
+
+ tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, flags,
+ iph->daddr, iph->saddr, 0);
+ if (!tunnel)
+ return -1;
+
+ mark = be32_to_cpu(tunnel->parms.o_key);
+
+ switch (protocol) {
+ case IPPROTO_ESP:
+ esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2));
+ spi = esph->spi;
+ break;
+ case IPPROTO_AH:
+ ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2));
+ spi = ah->spi;
+ break;
+ case IPPROTO_COMP:
+ ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2));
+ spi = htonl(ntohs(ipch->cpi));
+ break;
+ default:
+ return 0;
+ }
+
+ switch (icmp_hdr(skb)->type) {
+ case ICMP_DEST_UNREACH:
+ if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
+ return 0;
+ break;
+ case ICMP_REDIRECT:
+ break;
+ default:
+ return 0;
+ }
+
+ x = xfrm_state_lookup(net, mark, (const xfrm_address_t *)&iph->daddr,
+ spi, protocol, AF_INET);
+ if (!x)
+ return 0;
+
+ if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
+ ipv4_update_pmtu(skb, net, info, 0, protocol);
+ else
+ ipv4_redirect(skb, net, 0, protocol);
+ xfrm_state_put(x);
+
+ return 0;
+}
+
+static int
+vti_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p, int cmd)
+{
+ IP_TUNNEL_DECLARE_FLAGS(flags) = { };
+ int err = 0;
+
+ if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) {
+ if (p->iph.version != 4 || p->iph.protocol != IPPROTO_IPIP ||
+ p->iph.ihl != 5)
+ return -EINVAL;
+ }
+
+ if (!ip_tunnel_flags_is_be16_compat(p->i_flags) ||
+ !ip_tunnel_flags_is_be16_compat(p->o_flags))
+ return -EOVERFLOW;
+
+ if (!(ip_tunnel_flags_to_be16(p->i_flags) & GRE_KEY))
+ p->i_key = 0;
+ if (!(ip_tunnel_flags_to_be16(p->o_flags) & GRE_KEY))
+ p->o_key = 0;
+
+ __set_bit(IP_TUNNEL_VTI_BIT, flags);
+ ip_tunnel_flags_copy(p->i_flags, flags);
+
+ err = ip_tunnel_ctl(dev, p, cmd);
+ if (err)
+ return err;
+
+ if (cmd != SIOCDELTUNNEL) {
+ ip_tunnel_flags_from_be16(flags, GRE_KEY);
+ ip_tunnel_flags_or(p->i_flags, p->i_flags, flags);
+ ip_tunnel_flags_or(p->o_flags, p->o_flags, flags);
+ }
+ return 0;
+}
+
+static const struct net_device_ops vti_netdev_ops = {
+ .ndo_init = vti_tunnel_init,
+ .ndo_uninit = ip_tunnel_uninit,
+ .ndo_start_xmit = vti_tunnel_xmit,
+ .ndo_siocdevprivate = ip_tunnel_siocdevprivate,
+ .ndo_change_mtu = ip_tunnel_change_mtu,
+ .ndo_get_stats64 = dev_get_tstats64,
+ .ndo_get_iflink = ip_tunnel_get_iflink,
+ .ndo_tunnel_ctl = vti_tunnel_ctl,
+};
+
+static void vti_tunnel_setup(struct net_device *dev)
+{
+ dev->netdev_ops = &vti_netdev_ops;
+ dev->header_ops = &ip_tunnel_header_ops;
+ dev->type = ARPHRD_TUNNEL;
+ ip_tunnel_setup(dev, vti_net_id);
+}
+
+static int vti_tunnel_init(struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ struct iphdr *iph = &tunnel->parms.iph;
+
+ __dev_addr_set(dev, &iph->saddr, 4);
+ memcpy(dev->broadcast, &iph->daddr, 4);
+
+ dev->flags = IFF_NOARP;
+ dev->addr_len = 4;
+ dev->lltx = true;
+ netif_keep_dst(dev);
+
+ return ip_tunnel_init(dev);
+}
+
+static void __net_init vti_fb_tunnel_init(struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ struct iphdr *iph = &tunnel->parms.iph;
+
+ iph->version = 4;
+ iph->protocol = IPPROTO_IPIP;
+ iph->ihl = 5;
+}
+
+static struct xfrm4_protocol vti_esp4_protocol __read_mostly = {
+ .handler = vti_rcv_proto,
+ .input_handler = vti_input_proto,
+ .cb_handler = vti_rcv_cb,
+ .err_handler = vti4_err,
+ .priority = 100,
+};
+
+static struct xfrm4_protocol vti_ah4_protocol __read_mostly = {
+ .handler = vti_rcv_proto,
+ .input_handler = vti_input_proto,
+ .cb_handler = vti_rcv_cb,
+ .err_handler = vti4_err,
+ .priority = 100,
+};
+
+static struct xfrm4_protocol vti_ipcomp4_protocol __read_mostly = {
+ .handler = vti_rcv_proto,
+ .input_handler = vti_input_proto,
+ .cb_handler = vti_rcv_cb,
+ .err_handler = vti4_err,
+ .priority = 100,
+};
+
+#if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL)
+static int vti_rcv_tunnel(struct sk_buff *skb)
+{
+ XFRM_SPI_SKB_CB(skb)->family = AF_INET;
+ XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr);
+
+ return vti_input(skb, IPPROTO_IPIP, ip_hdr(skb)->saddr, 0, false);
+}
+
+static struct xfrm_tunnel vti_ipip_handler __read_mostly = {
+ .handler = vti_rcv_tunnel,
+ .cb_handler = vti_rcv_cb,
+ .err_handler = vti4_err,
+ .priority = 0,
+};
+
+#if IS_ENABLED(CONFIG_IPV6)
+static struct xfrm_tunnel vti_ipip6_handler __read_mostly = {
+ .handler = vti_rcv_tunnel,
+ .cb_handler = vti_rcv_cb,
+ .err_handler = vti4_err,
+ .priority = 0,
+};
+#endif
+#endif
+
+static int __net_init vti_init_net(struct net *net)
+{
+ int err;
+ struct ip_tunnel_net *itn;
+
+ err = ip_tunnel_init_net(net, vti_net_id, &vti_link_ops, "ip_vti0");
+ if (err)
+ return err;
+ itn = net_generic(net, vti_net_id);
+ if (itn->fb_tunnel_dev)
+ vti_fb_tunnel_init(itn->fb_tunnel_dev);
+ return 0;
+}
+
+static void __net_exit vti_exit_rtnl(struct net *net,
+ struct list_head *dev_to_kill)
+{
+ ip_tunnel_delete_net(net, vti_net_id, &vti_link_ops, dev_to_kill);
+}
+
+static struct pernet_operations vti_net_ops = {
+ .init = vti_init_net,
+ .exit_rtnl = vti_exit_rtnl,
+ .id = &vti_net_id,
+ .size = sizeof(struct ip_tunnel_net),
+};
+
+static int vti_tunnel_validate(struct nlattr *tb[], struct nlattr *data[],
+ struct netlink_ext_ack *extack)
+{
+ return 0;
+}
+
+static void vti_netlink_parms(struct nlattr *data[],
+ struct ip_tunnel_parm_kern *parms,
+ __u32 *fwmark)
+{
+ memset(parms, 0, sizeof(*parms));
+
+ parms->iph.protocol = IPPROTO_IPIP;
+
+ if (!data)
+ return;
+
+ __set_bit(IP_TUNNEL_VTI_BIT, parms->i_flags);
+
+ if (data[IFLA_VTI_LINK])
+ parms->link = nla_get_u32(data[IFLA_VTI_LINK]);
+
+ if (data[IFLA_VTI_IKEY])
+ parms->i_key = nla_get_be32(data[IFLA_VTI_IKEY]);
+
+ if (data[IFLA_VTI_OKEY])
+ parms->o_key = nla_get_be32(data[IFLA_VTI_OKEY]);
+
+ if (data[IFLA_VTI_LOCAL])
+ parms->iph.saddr = nla_get_in_addr(data[IFLA_VTI_LOCAL]);
+
+ if (data[IFLA_VTI_REMOTE])
+ parms->iph.daddr = nla_get_in_addr(data[IFLA_VTI_REMOTE]);
+
+ if (data[IFLA_VTI_FWMARK])
+ *fwmark = nla_get_u32(data[IFLA_VTI_FWMARK]);
+}
+
+static int vti_newlink(struct net_device *dev,
+ struct rtnl_newlink_params *params,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr **data = params->data;
+ struct ip_tunnel_parm_kern parms;
+ struct nlattr **tb = params->tb;
+ __u32 fwmark = 0;
+
+ vti_netlink_parms(data, &parms, &fwmark);
+ return ip_tunnel_newlink(params->link_net ? : dev_net(dev), dev, tb,
+ &parms, fwmark);
+}
+
+static int vti_changelink(struct net_device *dev, struct nlattr *tb[],
+ struct nlattr *data[],
+ struct netlink_ext_ack *extack)
+{
+ struct ip_tunnel *t = netdev_priv(dev);
+ struct ip_tunnel_parm_kern p;
+ __u32 fwmark = t->fwmark;
+
+ vti_netlink_parms(data, &p, &fwmark);
+ return ip_tunnel_changelink(dev, tb, &p, fwmark);
+}
+
+static size_t vti_get_size(const struct net_device *dev)
+{
+ return
+ /* IFLA_VTI_LINK */
+ nla_total_size(4) +
+ /* IFLA_VTI_IKEY */
+ nla_total_size(4) +
+ /* IFLA_VTI_OKEY */
+ nla_total_size(4) +
+ /* IFLA_VTI_LOCAL */
+ nla_total_size(4) +
+ /* IFLA_VTI_REMOTE */
+ nla_total_size(4) +
+ /* IFLA_VTI_FWMARK */
+ nla_total_size(4) +
+ 0;
+}
+
+static int vti_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{
+ struct ip_tunnel *t = netdev_priv(dev);
+ struct ip_tunnel_parm_kern *p = &t->parms;
+
+ if (nla_put_u32(skb, IFLA_VTI_LINK, p->link) ||
+ nla_put_be32(skb, IFLA_VTI_IKEY, p->i_key) ||
+ nla_put_be32(skb, IFLA_VTI_OKEY, p->o_key) ||
+ nla_put_in_addr(skb, IFLA_VTI_LOCAL, p->iph.saddr) ||
+ nla_put_in_addr(skb, IFLA_VTI_REMOTE, p->iph.daddr) ||
+ nla_put_u32(skb, IFLA_VTI_FWMARK, t->fwmark))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static const struct nla_policy vti_policy[IFLA_VTI_MAX + 1] = {
+ [IFLA_VTI_LINK] = { .type = NLA_U32 },
+ [IFLA_VTI_IKEY] = { .type = NLA_U32 },
+ [IFLA_VTI_OKEY] = { .type = NLA_U32 },
+ [IFLA_VTI_LOCAL] = { .len = sizeof_field(struct iphdr, saddr) },
+ [IFLA_VTI_REMOTE] = { .len = sizeof_field(struct iphdr, daddr) },
+ [IFLA_VTI_FWMARK] = { .type = NLA_U32 },
+};
+
+static struct rtnl_link_ops vti_link_ops __read_mostly = {
+ .kind = "vti",
+ .maxtype = IFLA_VTI_MAX,
+ .policy = vti_policy,
+ .priv_size = sizeof(struct ip_tunnel),
+ .setup = vti_tunnel_setup,
+ .validate = vti_tunnel_validate,
+ .newlink = vti_newlink,
+ .changelink = vti_changelink,
+ .dellink = ip_tunnel_dellink,
+ .get_size = vti_get_size,
+ .fill_info = vti_fill_info,
+ .get_link_net = ip_tunnel_get_link_net,
+};
+
+static int __init vti_init(void)
+{
+ const char *msg;
+ int err;
+
+ pr_info("IPv4 over IPsec tunneling driver\n");
+
+ msg = "tunnel device";
+ err = register_pernet_device(&vti_net_ops);
+ if (err < 0)
+ goto pernet_dev_failed;
+
+ msg = "tunnel protocols";
+ err = xfrm4_protocol_register(&vti_esp4_protocol, IPPROTO_ESP);
+ if (err < 0)
+ goto xfrm_proto_esp_failed;
+ err = xfrm4_protocol_register(&vti_ah4_protocol, IPPROTO_AH);
+ if (err < 0)
+ goto xfrm_proto_ah_failed;
+ err = xfrm4_protocol_register(&vti_ipcomp4_protocol, IPPROTO_COMP);
+ if (err < 0)
+ goto xfrm_proto_comp_failed;
+
+#if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL)
+ msg = "ipip tunnel";
+ err = xfrm4_tunnel_register(&vti_ipip_handler, AF_INET);
+ if (err < 0)
+ goto xfrm_tunnel_ipip_failed;
+#if IS_ENABLED(CONFIG_IPV6)
+ err = xfrm4_tunnel_register(&vti_ipip6_handler, AF_INET6);
+ if (err < 0)
+ goto xfrm_tunnel_ipip6_failed;
+#endif
+#endif
+
+ msg = "netlink interface";
+ err = rtnl_link_register(&vti_link_ops);
+ if (err < 0)
+ goto rtnl_link_failed;
+
+ return err;
+
+rtnl_link_failed:
+#if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL)
+#if IS_ENABLED(CONFIG_IPV6)
+ xfrm4_tunnel_deregister(&vti_ipip6_handler, AF_INET6);
+xfrm_tunnel_ipip6_failed:
+#endif
+ xfrm4_tunnel_deregister(&vti_ipip_handler, AF_INET);
+xfrm_tunnel_ipip_failed:
+#endif
+ xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP);
+xfrm_proto_comp_failed:
+ xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH);
+xfrm_proto_ah_failed:
+ xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP);
+xfrm_proto_esp_failed:
+ unregister_pernet_device(&vti_net_ops);
+pernet_dev_failed:
+ pr_err("vti init: failed to register %s\n", msg);
+ return err;
+}
+
+static void __exit vti_fini(void)
+{
+ rtnl_link_unregister(&vti_link_ops);
+#if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL)
+#if IS_ENABLED(CONFIG_IPV6)
+ xfrm4_tunnel_deregister(&vti_ipip6_handler, AF_INET6);
+#endif
+ xfrm4_tunnel_deregister(&vti_ipip_handler, AF_INET);
+#endif
+ xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP);
+ xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH);
+ xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP);
+ unregister_pernet_device(&vti_net_ops);
+}
+
+module_init(vti_init);
+module_exit(vti_fini);
+MODULE_DESCRIPTION("Virtual (secure) IP tunneling library");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_RTNL_LINK("vti");
+MODULE_ALIAS_NETDEV("ip_vti0");
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index 3839b706142e..9a45aed508d1 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -1,227 +1,81 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* IP Payload Compression Protocol (IPComp) - RFC3173.
*
* Copyright (c) 2003 James Morris <jmorris@intercode.com.au>
*
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
* Todo:
* - Tunable compression parameters.
* - Compression stats.
* - Adaptive compression.
*/
#include <linux/module.h>
-#include <asm/scatterlist.h>
-#include <asm/semaphore.h>
-#include <linux/crypto.h>
-#include <linux/pfkeyv2.h>
-#include <linux/percpu.h>
-#include <linux/smp.h>
-#include <linux/list.h>
-#include <linux/vmalloc.h>
+#include <linux/err.h>
#include <linux/rtnetlink.h>
-#include <linux/mutex.h>
#include <net/ip.h>
#include <net/xfrm.h>
#include <net/icmp.h>
#include <net/ipcomp.h>
#include <net/protocol.h>
+#include <net/sock.h>
-struct ipcomp_tfms {
- struct list_head list;
- struct crypto_comp **tfms;
- int users;
-};
-
-static DEFINE_MUTEX(ipcomp_resource_mutex);
-static void **ipcomp_scratches;
-static int ipcomp_scratch_users;
-static LIST_HEAD(ipcomp_tfms_list);
-
-static int ipcomp_decompress(struct xfrm_state *x, struct sk_buff *skb)
-{
- int err, plen, dlen;
- struct ipcomp_data *ipcd = x->data;
- u8 *start, *scratch;
- struct crypto_comp *tfm;
- int cpu;
-
- plen = skb->len;
- dlen = IPCOMP_SCRATCH_SIZE;
- start = skb->data;
-
- cpu = get_cpu();
- scratch = *per_cpu_ptr(ipcomp_scratches, cpu);
- tfm = *per_cpu_ptr(ipcd->tfms, cpu);
-
- err = crypto_comp_decompress(tfm, start, plen, scratch, &dlen);
- if (err)
- goto out;
-
- if (dlen < (plen + sizeof(struct ip_comp_hdr))) {
- err = -EINVAL;
- goto out;
- }
-
- err = pskb_expand_head(skb, 0, dlen - plen, GFP_ATOMIC);
- if (err)
- goto out;
-
- skb->truesize += dlen - plen;
- __skb_put(skb, dlen - plen);
- memcpy(skb->data, scratch, dlen);
-out:
- put_cpu();
- return err;
-}
-
-static int ipcomp_input(struct xfrm_state *x, struct sk_buff *skb)
-{
- int err = -ENOMEM;
- struct iphdr *iph;
- struct ip_comp_hdr *ipch;
-
- if (skb_linearize_cow(skb))
- goto out;
-
- skb->ip_summed = CHECKSUM_NONE;
-
- /* Remove ipcomp header and decompress original payload */
- iph = skb->nh.iph;
- ipch = (void *)skb->data;
- iph->protocol = ipch->nexthdr;
- skb->h.raw = skb->nh.raw + sizeof(*ipch);
- __skb_pull(skb, sizeof(*ipch));
- err = ipcomp_decompress(x, skb);
-
-out:
- return err;
-}
-
-static int ipcomp_compress(struct xfrm_state *x, struct sk_buff *skb)
-{
- int err, plen, dlen, ihlen;
- struct iphdr *iph = skb->nh.iph;
- struct ipcomp_data *ipcd = x->data;
- u8 *start, *scratch;
- struct crypto_comp *tfm;
- int cpu;
-
- ihlen = iph->ihl * 4;
- plen = skb->len - ihlen;
- dlen = IPCOMP_SCRATCH_SIZE;
- start = skb->data + ihlen;
-
- cpu = get_cpu();
- scratch = *per_cpu_ptr(ipcomp_scratches, cpu);
- tfm = *per_cpu_ptr(ipcd->tfms, cpu);
-
- err = crypto_comp_compress(tfm, start, plen, scratch, &dlen);
- if (err)
- goto out;
-
- if ((dlen + sizeof(struct ip_comp_hdr)) >= plen) {
- err = -EMSGSIZE;
- goto out;
- }
-
- memcpy(start + sizeof(struct ip_comp_hdr), scratch, dlen);
- put_cpu();
-
- pskb_trim(skb, ihlen + dlen + sizeof(struct ip_comp_hdr));
- return 0;
-
-out:
- put_cpu();
- return err;
-}
-
-static int ipcomp_output(struct xfrm_state *x, struct sk_buff *skb)
-{
- int err;
- struct iphdr *iph;
- struct ip_comp_hdr *ipch;
- struct ipcomp_data *ipcd = x->data;
- int hdr_len = 0;
-
- iph = skb->nh.iph;
- iph->tot_len = htons(skb->len);
- hdr_len = iph->ihl * 4;
- if ((skb->len - hdr_len) < ipcd->threshold) {
- /* Don't bother compressing */
- goto out_ok;
- }
-
- if (skb_linearize_cow(skb))
- goto out_ok;
-
- err = ipcomp_compress(x, skb);
- iph = skb->nh.iph;
-
- if (err) {
- goto out_ok;
- }
-
- /* Install ipcomp header, convert into ipcomp datagram. */
- iph->tot_len = htons(skb->len);
- ipch = (struct ip_comp_hdr *)((char *)iph + iph->ihl * 4);
- ipch->nexthdr = iph->protocol;
- ipch->flags = 0;
- ipch->cpi = htons((u16 )ntohl(x->id.spi));
- iph->protocol = IPPROTO_COMP;
- ip_send_check(iph);
- return 0;
-
-out_ok:
- if (x->props.mode == XFRM_MODE_TUNNEL)
- ip_send_check(iph);
- return 0;
-}
-
-static void ipcomp4_err(struct sk_buff *skb, u32 info)
+static int ipcomp4_err(struct sk_buff *skb, u32 info)
{
+ struct net *net = dev_net(skb->dev);
__be32 spi;
- struct iphdr *iph = (struct iphdr *)skb->data;
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
struct ip_comp_hdr *ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2));
struct xfrm_state *x;
- if (skb->h.icmph->type != ICMP_DEST_UNREACH ||
- skb->h.icmph->code != ICMP_FRAG_NEEDED)
- return;
+ switch (icmp_hdr(skb)->type) {
+ case ICMP_DEST_UNREACH:
+ if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
+ return 0;
+ break;
+ case ICMP_REDIRECT:
+ break;
+ default:
+ return 0;
+ }
spi = htonl(ntohs(ipch->cpi));
- x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr,
- spi, IPPROTO_COMP, AF_INET);
+ x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
+ spi, IPPROTO_COMP, AF_INET);
if (!x)
- return;
- NETDEBUG(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/%u.%u.%u.%u\n",
- spi, NIPQUAD(iph->daddr));
+ return 0;
+
+ if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
+ ipv4_update_pmtu(skb, net, info, 0, IPPROTO_COMP);
+ else
+ ipv4_redirect(skb, net, 0, IPPROTO_COMP);
xfrm_state_put(x);
+
+ return 0;
}
-/* We always hold one tunnel user reference to indicate a tunnel */
+/* We always hold one tunnel user reference to indicate a tunnel */
+static struct lock_class_key xfrm_state_lock_key;
static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x)
{
+ struct net *net = xs_net(x);
struct xfrm_state *t;
- u8 mode = XFRM_MODE_TUNNEL;
-
- t = xfrm_state_alloc();
- if (t == NULL)
+
+ t = xfrm_state_alloc(net);
+ if (!t)
goto out;
+ lockdep_set_class(&t->lock, &xfrm_state_lock_key);
t->id.proto = IPPROTO_IPIP;
t->id.spi = x->props.saddr.a4;
t->id.daddr.a4 = x->id.daddr.a4;
memcpy(&t->sel, &x->sel, sizeof(t->sel));
t->props.family = AF_INET;
- if (x->props.mode == XFRM_MODE_BEET)
- mode = x->props.mode;
- t->props.mode = mode;
+ t->props.mode = x->props.mode;
t->props.saddr.a4 = x->props.saddr.a4;
t->props.flags = x->props.flags;
+ t->props.extra_flags = x->props.extra_flags;
+ memcpy(&t->mark, &x->mark, sizeof(t->mark));
+ t->if_id = x->if_id;
if (xfrm_init_state(t))
goto error;
@@ -243,11 +97,13 @@ error:
*/
static int ipcomp_tunnel_attach(struct xfrm_state *x)
{
+ struct net *net = xs_net(x);
int err = 0;
struct xfrm_state *t;
+ u32 mark = x->mark.v & x->mark.m;
- t = xfrm_state_lookup((xfrm_address_t *)&x->id.daddr.a4,
- x->props.saddr.a4, IPPROTO_IPIP, AF_INET);
+ t = xfrm_state_lookup(net, mark, (xfrm_address_t *)&x->id.daddr.a4,
+ x->props.saddr.a4, IPPROTO_IPIP, AF_INET);
if (!t) {
t = ipcomp_tunnel_create(x);
if (!t) {
@@ -263,221 +119,70 @@ out:
return err;
}
-static void ipcomp_free_scratches(void)
+static int ipcomp4_init_state(struct xfrm_state *x,
+ struct netlink_ext_ack *extack)
{
- int i;
- void **scratches;
-
- if (--ipcomp_scratch_users)
- return;
-
- scratches = ipcomp_scratches;
- if (!scratches)
- return;
-
- for_each_possible_cpu(i)
- vfree(*per_cpu_ptr(scratches, i));
-
- free_percpu(scratches);
-}
-
-static void **ipcomp_alloc_scratches(void)
-{
- int i;
- void **scratches;
-
- if (ipcomp_scratch_users++)
- return ipcomp_scratches;
-
- scratches = alloc_percpu(void *);
- if (!scratches)
- return NULL;
-
- ipcomp_scratches = scratches;
-
- for_each_possible_cpu(i) {
- void *scratch = vmalloc(IPCOMP_SCRATCH_SIZE);
- if (!scratch)
- return NULL;
- *per_cpu_ptr(scratches, i) = scratch;
- }
-
- return scratches;
-}
-
-static void ipcomp_free_tfms(struct crypto_comp **tfms)
-{
- struct ipcomp_tfms *pos;
- int cpu;
-
- list_for_each_entry(pos, &ipcomp_tfms_list, list) {
- if (pos->tfms == tfms)
- break;
- }
-
- BUG_TRAP(pos);
-
- if (--pos->users)
- return;
-
- list_del(&pos->list);
- kfree(pos);
-
- if (!tfms)
- return;
-
- for_each_possible_cpu(cpu) {
- struct crypto_comp *tfm = *per_cpu_ptr(tfms, cpu);
- crypto_free_comp(tfm);
- }
- free_percpu(tfms);
-}
-
-static struct crypto_comp **ipcomp_alloc_tfms(const char *alg_name)
-{
- struct ipcomp_tfms *pos;
- struct crypto_comp **tfms;
- int cpu;
-
- /* This can be any valid CPU ID so we don't need locking. */
- cpu = raw_smp_processor_id();
-
- list_for_each_entry(pos, &ipcomp_tfms_list, list) {
- struct crypto_comp *tfm;
-
- tfms = pos->tfms;
- tfm = *per_cpu_ptr(tfms, cpu);
-
- if (!strcmp(crypto_comp_name(tfm), alg_name)) {
- pos->users++;
- return tfms;
- }
- }
-
- pos = kmalloc(sizeof(*pos), GFP_KERNEL);
- if (!pos)
- return NULL;
-
- pos->users = 1;
- INIT_LIST_HEAD(&pos->list);
- list_add(&pos->list, &ipcomp_tfms_list);
-
- pos->tfms = tfms = alloc_percpu(struct crypto_comp *);
- if (!tfms)
- goto error;
-
- for_each_possible_cpu(cpu) {
- struct crypto_comp *tfm = crypto_alloc_comp(alg_name, 0,
- CRYPTO_ALG_ASYNC);
- if (!tfm)
- goto error;
- *per_cpu_ptr(tfms, cpu) = tfm;
- }
-
- return tfms;
-
-error:
- ipcomp_free_tfms(tfms);
- return NULL;
-}
-
-static void ipcomp_free_data(struct ipcomp_data *ipcd)
-{
- if (ipcd->tfms)
- ipcomp_free_tfms(ipcd->tfms);
- ipcomp_free_scratches();
-}
-
-static void ipcomp_destroy(struct xfrm_state *x)
-{
- struct ipcomp_data *ipcd = x->data;
- if (!ipcd)
- return;
- xfrm_state_delete_tunnel(x);
- mutex_lock(&ipcomp_resource_mutex);
- ipcomp_free_data(ipcd);
- mutex_unlock(&ipcomp_resource_mutex);
- kfree(ipcd);
-}
-
-static int ipcomp_init_state(struct xfrm_state *x)
-{
- int err;
- struct ipcomp_data *ipcd;
- struct xfrm_algo_desc *calg_desc;
-
- err = -EINVAL;
- if (!x->calg)
- goto out;
-
- if (x->encap)
- goto out;
-
- err = -ENOMEM;
- ipcd = kzalloc(sizeof(*ipcd), GFP_KERNEL);
- if (!ipcd)
- goto out;
+ int err = -EINVAL;
x->props.header_len = 0;
- if (x->props.mode == XFRM_MODE_TUNNEL)
+ switch (x->props.mode) {
+ case XFRM_MODE_TRANSPORT:
+ break;
+ case XFRM_MODE_TUNNEL:
x->props.header_len += sizeof(struct iphdr);
+ break;
+ default:
+ NL_SET_ERR_MSG(extack, "Unsupported XFRM mode for IPcomp");
+ goto out;
+ }
- mutex_lock(&ipcomp_resource_mutex);
- if (!ipcomp_alloc_scratches())
- goto error;
-
- ipcd->tfms = ipcomp_alloc_tfms(x->calg->alg_name);
- if (!ipcd->tfms)
- goto error;
- mutex_unlock(&ipcomp_resource_mutex);
+ err = ipcomp_init_state(x, extack);
+ if (err)
+ goto out;
if (x->props.mode == XFRM_MODE_TUNNEL) {
err = ipcomp_tunnel_attach(x);
- if (err)
- goto error_tunnel;
+ if (err) {
+ NL_SET_ERR_MSG(extack, "Kernel error: failed to initialize the associated state");
+ goto out;
+ }
}
- calg_desc = xfrm_calg_get_byname(x->calg->alg_name, 0);
- BUG_ON(!calg_desc);
- ipcd->threshold = calg_desc->uinfo.comp.threshold;
- x->data = ipcd;
err = 0;
out:
return err;
+}
-error_tunnel:
- mutex_lock(&ipcomp_resource_mutex);
-error:
- ipcomp_free_data(ipcd);
- mutex_unlock(&ipcomp_resource_mutex);
- kfree(ipcd);
- goto out;
+static int ipcomp4_rcv_cb(struct sk_buff *skb, int err)
+{
+ return 0;
}
-static struct xfrm_type ipcomp_type = {
- .description = "IPCOMP4",
+static const struct xfrm_type ipcomp_type = {
.owner = THIS_MODULE,
.proto = IPPROTO_COMP,
- .init_state = ipcomp_init_state,
+ .init_state = ipcomp4_init_state,
.destructor = ipcomp_destroy,
.input = ipcomp_input,
.output = ipcomp_output
};
-static struct net_protocol ipcomp4_protocol = {
+static struct xfrm4_protocol ipcomp4_protocol = {
.handler = xfrm4_rcv,
+ .input_handler = xfrm_input,
+ .cb_handler = ipcomp4_rcv_cb,
.err_handler = ipcomp4_err,
- .no_policy = 1,
+ .priority = 0,
};
static int __init ipcomp4_init(void)
{
if (xfrm_register_type(&ipcomp_type, AF_INET) < 0) {
- printk(KERN_INFO "ipcomp init: can't add xfrm type\n");
+ pr_info("%s: can't add xfrm type\n", __func__);
return -EAGAIN;
}
- if (inet_add_protocol(&ipcomp4_protocol, IPPROTO_COMP) < 0) {
- printk(KERN_INFO "ipcomp init: can't add protocol\n");
+ if (xfrm4_protocol_register(&ipcomp4_protocol, IPPROTO_COMP) < 0) {
+ pr_info("%s: can't add protocol\n", __func__);
xfrm_unregister_type(&ipcomp_type, AF_INET);
return -EAGAIN;
}
@@ -486,16 +191,16 @@ static int __init ipcomp4_init(void)
static void __exit ipcomp4_fini(void)
{
- if (inet_del_protocol(&ipcomp4_protocol, IPPROTO_COMP) < 0)
- printk(KERN_INFO "ip ipcomp close: can't remove protocol\n");
- if (xfrm_unregister_type(&ipcomp_type, AF_INET) < 0)
- printk(KERN_INFO "ip ipcomp close: can't remove xfrm type\n");
+ if (xfrm4_protocol_deregister(&ipcomp4_protocol, IPPROTO_COMP) < 0)
+ pr_info("%s: can't remove protocol\n", __func__);
+ xfrm_unregister_type(&ipcomp_type, AF_INET);
}
module_init(ipcomp4_init);
module_exit(ipcomp4_fini);
MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("IP Payload Compression Protocol (IPComp) - RFC3173");
+MODULE_DESCRIPTION("IP Payload Compression Protocol (IPComp/IPv4) - RFC3173");
MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
+MODULE_ALIAS_XFRM_TYPE(AF_INET, XFRM_PROTO_COMP);
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index afa60b9a003f..019408d3ca2c 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -1,6 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0
/*
- * $Id: ipconfig.c,v 1.46 2002/02/01 22:01:04 davem Exp $
- *
* Automatic Configuration of IP -- use DHCP, BOOTP, RARP, or
* user-supplied information to configure own IP address and routes.
*
@@ -12,7 +11,7 @@
* BOOTP rewritten to construct and analyse packets itself instead
* of misusing the IP layer. num_bugs_causing_wrong_arp_replies--;
* -- MJ, December 1998
- *
+ *
* Fixed ip_auto_config_setup calling at startup in the new "Linker Magic"
* initialization scheme.
* - Arnaldo Carvalho de Melo <acme@conectiva.com.br>, 08/11/1999
@@ -29,6 +28,9 @@
*
* Multiple Nameservers in /proc/net/pnp
* -- Josef Siemes <jsiemes@web.de>, Aug 2002
+ *
+ * NTP servers in /proc/net/ipconfig/ntp_servers
+ * -- Chris Novakovic <chris@chrisn.me.uk>, April 2018
*/
#include <linux/types.h>
@@ -55,24 +57,18 @@
#include <linux/root_dev.h>
#include <linux/delay.h>
#include <linux/nfs_fs.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <net/net_namespace.h>
#include <net/arp.h>
#include <net/ip.h>
#include <net/ipconfig.h>
#include <net/route.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <net/checksum.h>
#include <asm/processor.h>
-/* Define this to allow debugging output */
-#undef IPCONFIG_DEBUG
-
-#ifdef IPCONFIG_DEBUG
-#define DBG(x) printk x
-#else
-#define DBG(x) do { } while(0)
-#endif
-
#if defined(CONFIG_IP_PNP_DHCP)
#define IPCONFIG_DHCP
#endif
@@ -87,21 +83,24 @@
#endif
/* Define the friendly delay before and after opening net devices */
-#define CONF_PRE_OPEN 500 /* Before opening: 1/2 second */
-#define CONF_POST_OPEN 1 /* After opening: 1 second */
+#define CONF_POST_OPEN 10 /* After opening: 10 msecs */
/* Define the timeout for waiting for a DHCP/BOOTP/RARP reply */
#define CONF_OPEN_RETRIES 2 /* (Re)open devices twice */
#define CONF_SEND_RETRIES 6 /* Send six requests per open */
-#define CONF_INTER_TIMEOUT (HZ/2) /* Inter-device timeout: 1/2 second */
#define CONF_BASE_TIMEOUT (HZ*2) /* Initial timeout: 2 seconds */
#define CONF_TIMEOUT_RANDOM (HZ) /* Maximum amount of randomization */
#define CONF_TIMEOUT_MULT *7/4 /* Rate of timeout growth */
#define CONF_TIMEOUT_MAX (HZ*30) /* Maximum allowed timeout */
-#define CONF_NAMESERVERS_MAX 3 /* Maximum number of nameservers
- - '3' from resolv.h */
+#define CONF_NAMESERVERS_MAX 3 /* Maximum number of nameservers
+ - '3' from resolv.h */
+#define CONF_NTP_SERVERS_MAX 3 /* Maximum number of NTP servers */
-#define NONE __constant_htonl(INADDR_NONE)
+#define NONE cpu_to_be32(INADDR_NONE)
+#define ANY cpu_to_be32(INADDR_ANY)
+
+/* Wait for carrier timeout default in seconds */
+static unsigned int carrier_timeout = 120;
/*
* Public IP configuration
@@ -113,7 +112,7 @@
*/
int ic_set_manually __initdata = 0; /* IPconfig parameters set manually */
-static int ic_enable __initdata = 0; /* IP config enabled? */
+static int ic_enable __initdata; /* IP config enabled? */
/* Protocol choice */
int ic_proto_enabled __initdata = 0
@@ -128,21 +127,37 @@ int ic_proto_enabled __initdata = 0
#endif
;
-static int ic_host_name_set __initdata = 0; /* Host name set by us? */
+static int ic_host_name_set __initdata; /* Host name set by us? */
__be32 ic_myaddr = NONE; /* My IP address */
static __be32 ic_netmask = NONE; /* Netmask for local subnet */
__be32 ic_gateway = NONE; /* Gateway IP address */
+#ifdef IPCONFIG_DYNAMIC
+static __be32 ic_addrservaddr = NONE; /* IP Address of the IP addresses'server */
+#endif
+
__be32 ic_servaddr = NONE; /* Boot server IP address */
__be32 root_server_addr = NONE; /* Address of NFS server */
u8 root_server_path[256] = { 0, }; /* Path to mount as root */
+/* vendor class identifier */
+static char vendor_class_identifier[253] __initdata;
+
+#if defined(CONFIG_IP_PNP_DHCP)
+static char dhcp_client_identifier[253] __initdata;
+#endif
+
/* Persistent data: */
+#ifdef IPCONFIG_DYNAMIC
static int ic_proto_used; /* Protocol used, if any */
+#else
+#define ic_proto_used 0
+#endif
static __be32 ic_nameservers[CONF_NAMESERVERS_MAX]; /* DNS Server IP addresses */
+static __be32 ic_ntp_servers[CONF_NTP_SERVERS_MAX]; /* NTP server IP addresses */
static u8 ic_domain[64]; /* DNS (not NIS) domain name */
/*
@@ -153,14 +168,17 @@ static u8 ic_domain[64]; /* DNS (not NIS) domain name */
static char user_dev_name[IFNAMSIZ] __initdata = { 0, };
/* Protocols supported by available interfaces */
-static int ic_proto_have_if __initdata = 0;
+static int ic_proto_have_if __initdata;
+
+/* MTU for boot device */
+static int ic_dev_mtu __initdata;
#ifdef IPCONFIG_DYNAMIC
static DEFINE_SPINLOCK(ic_recv_lock);
-static volatile int ic_got_reply __initdata = 0; /* Proto(s) that replied */
+static volatile int ic_got_reply __initdata; /* Proto(s) that replied */
#endif
#ifdef IPCONFIG_DHCP
-static int ic_dhcp_msgtype __initdata = 0; /* DHCP msg type received */
+static int ic_dhcp_msgtype __initdata; /* DHCP msg type received */
#endif
@@ -176,47 +194,59 @@ struct ic_device {
__be32 xid;
};
-static struct ic_device *ic_first_dev __initdata = NULL;/* List of open device */
-static struct net_device *ic_dev __initdata = NULL; /* Selected device */
+static struct ic_device *ic_first_dev __initdata; /* List of open device */
+static struct ic_device *ic_dev __initdata; /* Selected device */
+
+static bool __init ic_is_init_dev(struct net_device *dev)
+{
+ if (dev->flags & IFF_LOOPBACK)
+ return false;
+ return user_dev_name[0] ? !strcmp(dev->name, user_dev_name) :
+ (!(dev->flags & IFF_LOOPBACK) &&
+ (dev->flags & (IFF_POINTOPOINT|IFF_BROADCAST)) &&
+ strncmp(dev->name, "dummy", 5));
+}
static int __init ic_open_devs(void)
{
struct ic_device *d, **last;
struct net_device *dev;
unsigned short oflags;
+ unsigned long start, next_msg;
last = &ic_first_dev;
rtnl_lock();
/* bring loopback device up first */
- if (dev_change_flags(&loopback_dev, loopback_dev.flags | IFF_UP) < 0)
- printk(KERN_ERR "IP-Config: Failed to open %s\n", loopback_dev.name);
-
- for (dev = dev_base; dev; dev = dev->next) {
- if (dev == &loopback_dev)
+ for_each_netdev(&init_net, dev) {
+ if (!(dev->flags & IFF_LOOPBACK))
continue;
- if (user_dev_name[0] ? !strcmp(dev->name, user_dev_name) :
- (!(dev->flags & IFF_LOOPBACK) &&
- (dev->flags & (IFF_POINTOPOINT|IFF_BROADCAST)) &&
- strncmp(dev->name, "dummy", 5))) {
+ if (dev_change_flags(dev, dev->flags | IFF_UP, NULL) < 0)
+ pr_err("IP-Config: Failed to open %s\n", dev->name);
+ }
+
+ for_each_netdev(&init_net, dev) {
+ if (ic_is_init_dev(dev)) {
int able = 0;
if (dev->mtu >= 364)
able |= IC_BOOTP;
else
- printk(KERN_WARNING "DHCP/BOOTP: Ignoring device %s, MTU %d too small", dev->name, dev->mtu);
+ pr_warn("DHCP/BOOTP: Ignoring device %s, MTU %d too small\n",
+ dev->name, dev->mtu);
if (!(dev->flags & IFF_NOARP))
able |= IC_RARP;
able &= ic_proto_enabled;
if (ic_proto_enabled && !able)
continue;
oflags = dev->flags;
- if (dev_change_flags(dev, oflags | IFF_UP) < 0) {
- printk(KERN_ERR "IP-Config: Failed to open %s\n", dev->name);
+ if (dev_change_flags(dev, oflags | IFF_UP, NULL) < 0) {
+ pr_err("IP-Config: Failed to open %s\n",
+ dev->name);
continue;
}
if (!(d = kmalloc(sizeof(struct ic_device), GFP_KERNEL))) {
rtnl_unlock();
- return -1;
+ return -ENOMEM;
}
d->dev = dev;
*last = d;
@@ -228,37 +258,90 @@ static int __init ic_open_devs(void)
else
d->xid = 0;
ic_proto_have_if |= able;
- DBG(("IP-Config: %s UP (able=%d, xid=%08x)\n",
- dev->name, able, d->xid));
+ pr_debug("IP-Config: %s UP (able=%d, xid=%08x)\n",
+ dev->name, able, d->xid);
}
}
+ /* Devices with a complex topology like SFP ethernet interfaces needs
+ * the rtnl_lock at init. The carrier wait-loop must therefore run
+ * without holding it.
+ */
rtnl_unlock();
+ /* no point in waiting if we could not bring up at least one device */
+ if (!ic_first_dev)
+ goto have_carrier;
+
+ /* wait for a carrier on at least one device */
+ start = jiffies;
+ next_msg = start + secs_to_jiffies(20);
+ while (time_before(jiffies, start +
+ secs_to_jiffies(carrier_timeout))) {
+ int wait, elapsed;
+
+ rtnl_lock();
+ for_each_netdev(&init_net, dev)
+ if (ic_is_init_dev(dev) && netif_carrier_ok(dev)) {
+ rtnl_unlock();
+ goto have_carrier;
+ }
+ rtnl_unlock();
+
+ msleep(1);
+
+ if (time_before(jiffies, next_msg))
+ continue;
+
+ elapsed = jiffies_to_msecs(jiffies - start);
+ wait = (carrier_timeout * 1000 - elapsed + 500) / 1000;
+ pr_info("Waiting up to %d more seconds for network.\n", wait);
+ next_msg = jiffies + secs_to_jiffies(20);
+ }
+have_carrier:
+
*last = NULL;
if (!ic_first_dev) {
if (user_dev_name[0])
- printk(KERN_ERR "IP-Config: Device `%s' not found.\n", user_dev_name);
+ pr_err("IP-Config: Device `%s' not found\n",
+ user_dev_name);
else
- printk(KERN_ERR "IP-Config: No network devices available.\n");
- return -1;
+ pr_err("IP-Config: No network devices available\n");
+ return -ENODEV;
}
return 0;
}
+/* Close all network interfaces except the one we've autoconfigured, and its
+ * lowers, in case it's a stacked virtual interface.
+ */
static void __init ic_close_devs(void)
{
+ struct net_device *selected_dev = ic_dev ? ic_dev->dev : NULL;
struct ic_device *d, *next;
struct net_device *dev;
rtnl_lock();
next = ic_first_dev;
while ((d = next)) {
+ bool bring_down = (d != ic_dev);
+ struct net_device *lower;
+ struct list_head *iter;
+
next = d->next;
dev = d->dev;
- if (dev != ic_dev) {
- DBG(("IP-Config: Downing %s\n", dev->name));
- dev_change_flags(dev, d->flags);
+
+ if (selected_dev) {
+ netdev_for_each_lower_dev(selected_dev, lower, iter) {
+ if (dev == lower) {
+ bring_down = false;
+ break;
+ }
+ }
+ }
+ if (bring_down) {
+ pr_debug("IP-Config: Downing %s\n", dev->name);
+ dev_change_flags(dev, d->flags, NULL);
}
kfree(d);
}
@@ -277,28 +360,6 @@ set_sockaddr(struct sockaddr_in *sin, __be32 addr, __be16 port)
sin->sin_port = port;
}
-static int __init ic_dev_ioctl(unsigned int cmd, struct ifreq *arg)
-{
- int res;
-
- mm_segment_t oldfs = get_fs();
- set_fs(get_ds());
- res = devinet_ioctl(cmd, (struct ifreq __user *) arg);
- set_fs(oldfs);
- return res;
-}
-
-static int __init ic_route_ioctl(unsigned int cmd, struct rtentry *arg)
-{
- int res;
-
- mm_segment_t oldfs = get_fs();
- set_fs(get_ds());
- res = ip_rt_ioctl(cmd, (void __user *) arg);
- set_fs(oldfs);
- return res;
-}
-
/*
* Set up interface addresses and routes.
*/
@@ -310,22 +371,36 @@ static int __init ic_setup_if(void)
int err;
memset(&ir, 0, sizeof(ir));
- strcpy(ir.ifr_ifrn.ifrn_name, ic_dev->name);
+ strcpy(ir.ifr_ifrn.ifrn_name, ic_dev->dev->name);
set_sockaddr(sin, ic_myaddr, 0);
- if ((err = ic_dev_ioctl(SIOCSIFADDR, &ir)) < 0) {
- printk(KERN_ERR "IP-Config: Unable to set interface address (%d).\n", err);
+ if ((err = devinet_ioctl(&init_net, SIOCSIFADDR, &ir)) < 0) {
+ pr_err("IP-Config: Unable to set interface address (%d)\n",
+ err);
return -1;
}
set_sockaddr(sin, ic_netmask, 0);
- if ((err = ic_dev_ioctl(SIOCSIFNETMASK, &ir)) < 0) {
- printk(KERN_ERR "IP-Config: Unable to set interface netmask (%d).\n", err);
+ if ((err = devinet_ioctl(&init_net, SIOCSIFNETMASK, &ir)) < 0) {
+ pr_err("IP-Config: Unable to set interface netmask (%d)\n",
+ err);
return -1;
}
set_sockaddr(sin, ic_myaddr | ~ic_netmask, 0);
- if ((err = ic_dev_ioctl(SIOCSIFBRDADDR, &ir)) < 0) {
- printk(KERN_ERR "IP-Config: Unable to set interface broadcast address (%d).\n", err);
+ if ((err = devinet_ioctl(&init_net, SIOCSIFBRDADDR, &ir)) < 0) {
+ pr_err("IP-Config: Unable to set interface broadcast address (%d)\n",
+ err);
return -1;
}
+ /* Handle the case where we need non-standard MTU on the boot link (a network
+ * using jumbo frames, for instance). If we can't set the mtu, don't error
+ * out, we'll try to muddle along.
+ */
+ if (ic_dev_mtu != 0) {
+ rtnl_lock();
+ if ((err = dev_set_mtu(ic_dev->dev, ic_dev_mtu)) < 0)
+ pr_err("IP-Config: Unable to set interface mtu to %d (%d)\n",
+ ic_dev_mtu, err);
+ rtnl_unlock();
+ }
return 0;
}
@@ -339,15 +414,16 @@ static int __init ic_setup_routes(void)
memset(&rm, 0, sizeof(rm));
if ((ic_gateway ^ ic_myaddr) & ic_netmask) {
- printk(KERN_ERR "IP-Config: Gateway not on directly connected network.\n");
+ pr_err("IP-Config: Gateway not on directly connected network\n");
return -1;
}
set_sockaddr((struct sockaddr_in *) &rm.rt_dst, 0, 0);
set_sockaddr((struct sockaddr_in *) &rm.rt_genmask, 0, 0);
set_sockaddr((struct sockaddr_in *) &rm.rt_gateway, ic_gateway, 0);
rm.rt_flags = RTF_UP | RTF_GATEWAY;
- if ((err = ic_route_ioctl(SIOCADDRT, &rm)) < 0) {
- printk(KERN_ERR "IP-Config: Cannot add default route (%d).\n", err);
+ if ((err = ip_rt_ioctl(&init_net, SIOCADDRT, &rm)) < 0) {
+ pr_err("IP-Config: Cannot add default route (%d)\n",
+ err);
return -1;
}
}
@@ -365,9 +441,9 @@ static int __init ic_defaults(void)
* At this point we have no userspace running so need not
* claim locks on system_utsname
*/
-
+
if (!ic_host_name_set)
- sprintf(init_utsname()->nodename, "%u.%u.%u.%u", NIPQUAD(ic_myaddr));
+ sprintf(init_utsname()->nodename, "%pI4", &ic_myaddr);
if (root_server_addr == NONE)
root_server_addr = ic_servaddr;
@@ -379,12 +455,15 @@ static int __init ic_defaults(void)
ic_netmask = htonl(IN_CLASSB_NET);
else if (IN_CLASSC(ntohl(ic_myaddr)))
ic_netmask = htonl(IN_CLASSC_NET);
+ else if (IN_CLASSE(ntohl(ic_myaddr)))
+ ic_netmask = htonl(IN_CLASSE_NET);
else {
- printk(KERN_ERR "IP-Config: Unable to guess netmask for address %u.%u.%u.%u\n",
- NIPQUAD(ic_myaddr));
+ pr_err("IP-Config: Unable to guess netmask for address %pI4\n",
+ &ic_myaddr);
return -1;
}
- printk("IP-Config: Guessing netmask %u.%u.%u.%u\n", NIPQUAD(ic_netmask));
+ pr_notice("IP-Config: Guessing netmask %pI4\n",
+ &ic_netmask);
}
return 0;
@@ -399,16 +478,16 @@ static int __init ic_defaults(void)
static int ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev);
static struct packet_type rarp_packet_type __initdata = {
- .type = __constant_htons(ETH_P_RARP),
+ .type = cpu_to_be16(ETH_P_RARP),
.func = ic_rarp_recv,
};
-static inline void ic_rarp_init(void)
+static inline void __init ic_rarp_init(void)
{
dev_add_pack(&rarp_packet_type);
}
-static inline void ic_rarp_cleanup(void)
+static inline void __init ic_rarp_cleanup(void)
{
dev_remove_pack(&rarp_packet_type);
}
@@ -422,17 +501,21 @@ ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt
struct arphdr *rarp;
unsigned char *rarp_ptr;
__be32 sip, tip;
- unsigned char *sha, *tha; /* s for "source", t for "target" */
+ unsigned char *tha; /* t for "target" */
struct ic_device *d;
- if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
+ if (!net_eq(dev_net(dev), &init_net))
+ goto drop;
+
+ skb = skb_share_check(skb, GFP_ATOMIC);
+ if (!skb)
return NET_RX_DROP;
if (!pskb_may_pull(skb, sizeof(struct arphdr)))
goto drop;
/* Basic sanity checks can be done without the lock. */
- rarp = (struct arphdr *)skb->h.raw;
+ rarp = (struct arphdr *)skb_transport_header(skb);
/* If this test doesn't pass, it's not IP, or we should
* ignore it anyway.
@@ -448,14 +531,11 @@ ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt
if (rarp->ar_pro != htons(ETH_P_IP))
goto drop;
- if (!pskb_may_pull(skb,
- sizeof(struct arphdr) +
- (2 * dev->addr_len) +
- (2 * 4)))
+ if (!pskb_may_pull(skb, arp_hdr_len(dev)))
goto drop;
/* OK, it is all there and looks valid, process... */
- rarp = (struct arphdr *)skb->h.raw;
+ rarp = (struct arphdr *)skb_transport_header(skb);
rarp_ptr = (unsigned char *) (rarp + 1);
/* One reply at a time, please. */
@@ -473,7 +553,6 @@ ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt
goto drop_unlock; /* should never happen */
/* Extract variable-width fields */
- sha = rarp_ptr;
rarp_ptr += dev->addr_len;
memcpy(&sip, rarp_ptr, 4);
rarp_ptr += 4;
@@ -490,10 +569,11 @@ ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt
goto drop_unlock;
/* We have a winner! */
- ic_dev = dev;
+ ic_dev = d;
if (ic_myaddr == NONE)
ic_myaddr = tip;
ic_servaddr = sip;
+ ic_addrservaddr = sip;
ic_got_reply = IC_RARP;
drop_unlock:
@@ -519,6 +599,26 @@ static void __init ic_rarp_send_if(struct ic_device *d)
#endif
/*
+ * Predefine Nameservers
+ */
+static inline void __init ic_nameservers_predef(void)
+{
+ int i;
+
+ for (i = 0; i < CONF_NAMESERVERS_MAX; i++)
+ ic_nameservers[i] = NONE;
+}
+
+/* Predefine NTP servers */
+static inline void __init ic_ntp_servers_predef(void)
+{
+ int i;
+
+ for (i = 0; i < CONF_NTP_SERVERS_MAX; i++)
+ ic_ntp_servers[i] = NONE;
+}
+
+/*
* DHCP/BOOTP support.
*/
@@ -561,10 +661,12 @@ struct bootp_pkt { /* BOOTP packet format */
static int ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev);
static struct packet_type bootp_packet_type __initdata = {
- .type = __constant_htons(ETH_P_IP),
+ .type = cpu_to_be16(ETH_P_IP),
.func = ic_bootp_recv,
};
+/* DHCPACK can overwrite DNS if fallback was set upon first BOOTP reply */
+static int ic_nameservers_fallback __initdata;
/*
* Initialize DHCP/BOOTP extension fields in the request.
@@ -575,15 +677,14 @@ static const u8 ic_bootp_cookie[4] = { 99, 130, 83, 99 };
#ifdef IPCONFIG_DHCP
static void __init
-ic_dhcp_init_options(u8 *options)
+ic_dhcp_init_options(u8 *options, struct ic_device *d)
{
u8 mt = ((ic_servaddr == NONE)
? DHCPDISCOVER : DHCPREQUEST);
u8 *e = options;
+ int len;
-#ifdef IPCONFIG_DEBUG
- printk("DHCP: Sending message type %d\n", mt);
-#endif
+ pr_debug("DHCP: Sending message type %d (%s)\n", mt, d->dev->name);
memcpy(e, ic_bootp_cookie, 4); /* RFC1048 Magic Cookie */
e += 4;
@@ -613,13 +714,42 @@ ic_dhcp_init_options(u8 *options)
12, /* Host name */
15, /* Domain name */
17, /* Boot path */
+ 26, /* MTU */
40, /* NIS domain name */
+ 42, /* NTP servers */
};
*e++ = 55; /* Parameter request list */
*e++ = sizeof(ic_req_params);
memcpy(e, ic_req_params, sizeof(ic_req_params));
e += sizeof(ic_req_params);
+
+ if (ic_host_name_set) {
+ *e++ = 12; /* host-name */
+ len = strlen(utsname()->nodename);
+ *e++ = len;
+ memcpy(e, utsname()->nodename, len);
+ e += len;
+ }
+ if (*vendor_class_identifier) {
+ pr_info("DHCP: sending class identifier \"%s\"\n",
+ vendor_class_identifier);
+ *e++ = 60; /* Class-identifier */
+ len = strlen(vendor_class_identifier);
+ *e++ = len;
+ memcpy(e, vendor_class_identifier, len);
+ e += len;
+ }
+ len = strlen(dhcp_client_identifier + 1);
+ /* the minimum length of identifier is 2, include 1 byte type,
+ * and can not be larger than the length of options
+ */
+ if (len >= 1 && len < 312 - (e - options) - 1) {
+ *e++ = 61;
+ *e++ = len + 1;
+ memcpy(e, dhcp_client_identifier, len + 1);
+ e += len + 1;
+ }
}
*e++ = 255; /* End of the list */
@@ -637,9 +767,11 @@ static void __init ic_bootp_init_ext(u8 *e)
*e++ = 3; /* Default gateway request */
*e++ = 4;
e += 4;
- *e++ = 5; /* Name server request */
- *e++ = 8;
- e += 8;
+#if CONF_NAMESERVERS_MAX > 0
+ *e++ = 6; /* (DNS) name server request */
+ *e++ = 4 * CONF_NAMESERVERS_MAX;
+ e += 4 * CONF_NAMESERVERS_MAX;
+#endif
*e++ = 12; /* Host name request */
*e++ = 32;
e += 32;
@@ -650,9 +782,9 @@ static void __init ic_bootp_init_ext(u8 *e)
*e++ = 40;
e += 40;
- *e++ = 57; /* set extension buffer size for reply */
+ *e++ = 57; /* set extension buffer size for reply */
*e++ = 2;
- *e++ = 1; /* 128+236+8+20+14, see dhcpd sources */
+ *e++ = 1; /* 128+236+8+20+14, see dhcpd sources */
*e++ = 150;
*e++ = 255; /* End of the list */
@@ -662,12 +794,15 @@ static void __init ic_bootp_init_ext(u8 *e)
/*
* Initialize the DHCP/BOOTP mechanism.
*/
-static inline void ic_bootp_init(void)
+static inline void __init ic_bootp_init(void)
{
- int i;
-
- for (i = 0; i < CONF_NAMESERVERS_MAX; i++)
- ic_nameservers[i] = NONE;
+ /* Re-initialise all name servers and NTP servers to NONE, in case any
+ * were set via the "ip=" or "nfsaddrs=" kernel command line parameters:
+ * any IP addresses specified there will already have been decoded but
+ * are no longer needed
+ */
+ ic_nameservers_predef();
+ ic_ntp_servers_predef();
dev_add_pack(&bootp_packet_type);
}
@@ -676,7 +811,7 @@ static inline void ic_bootp_init(void)
/*
* DHCP/BOOTP cleanup.
*/
-static inline void ic_bootp_cleanup(void)
+static inline void __init ic_bootp_cleanup(void)
{
dev_remove_pack(&bootp_packet_type);
}
@@ -690,19 +825,21 @@ static void __init ic_bootp_send_if(struct ic_device *d, unsigned long jiffies_d
struct net_device *dev = d->dev;
struct sk_buff *skb;
struct bootp_pkt *b;
- int hh_len = LL_RESERVED_SPACE(dev);
struct iphdr *h;
+ int hlen = LL_RESERVED_SPACE(dev);
+ int tlen = dev->needed_tailroom;
/* Allocate packet */
- skb = alloc_skb(sizeof(struct bootp_pkt) + hh_len + 15, GFP_KERNEL);
+ skb = alloc_skb(sizeof(struct bootp_pkt) + hlen + tlen + 15,
+ GFP_KERNEL);
if (!skb)
return;
- skb_reserve(skb, hh_len);
- b = (struct bootp_pkt *) skb_put(skb, sizeof(struct bootp_pkt));
- memset(b, 0, sizeof(struct bootp_pkt));
+ skb_reserve(skb, hlen);
+ b = skb_put_zero(skb, sizeof(struct bootp_pkt));
/* Construct IP header */
- skb->nh.iph = h = &b->iph;
+ skb_reset_network_header(skb);
+ h = ip_hdr(skb);
h->version = 4;
h->ihl = 5;
h->tot_len = htons(sizeof(struct bootp_pkt));
@@ -722,17 +859,16 @@ static void __init ic_bootp_send_if(struct ic_device *d, unsigned long jiffies_d
b->op = BOOTP_REQUEST;
if (dev->type < 256) /* check for false types */
b->htype = dev->type;
- else if (dev->type == ARPHRD_IEEE802_TR) /* fix for token ring */
- b->htype = ARPHRD_IEEE802;
else if (dev->type == ARPHRD_FDDI)
b->htype = ARPHRD_ETHER;
else {
- printk("Unknown ARP type 0x%04x for device %s\n", dev->type, dev->name);
+ pr_warn("Unknown ARP type 0x%04x for device %s\n", dev->type,
+ dev->name);
b->htype = dev->type; /* can cause undefined behavior */
}
+
+ /* server_ip and your_ip address are both already zero per RFC2131 */
b->hlen = dev->addr_len;
- b->your_ip = NONE;
- b->server_ip = NONE;
memcpy(b->hw_addr, dev->dev_addr, dev->addr_len);
b->secs = htons(jiffies_diff / HZ);
b->xid = d->xid;
@@ -740,7 +876,7 @@ static void __init ic_bootp_send_if(struct ic_device *d, unsigned long jiffies_d
/* add DHCP options or BOOTP extensions */
#ifdef IPCONFIG_DHCP
if (ic_proto_enabled & IC_USE_DHCP)
- ic_dhcp_init_options(b->exten);
+ ic_dhcp_init_options(b->exten, d);
else
#endif
ic_bootp_init_ext(b->exten);
@@ -748,15 +884,20 @@ static void __init ic_bootp_send_if(struct ic_device *d, unsigned long jiffies_d
/* Chain packet down the line... */
skb->dev = dev;
skb->protocol = htons(ETH_P_IP);
- if ((dev->hard_header &&
- dev->hard_header(skb, dev, ntohs(skb->protocol), dev->broadcast, dev->dev_addr, skb->len) < 0) ||
- dev_queue_xmit(skb) < 0)
+ if (dev_hard_header(skb, dev, ntohs(skb->protocol),
+ dev->broadcast, dev->dev_addr, skb->len) < 0) {
+ kfree_skb(skb);
+ printk("E");
+ return;
+ }
+
+ if (dev_queue_xmit(skb) < 0)
printk("E");
}
/*
- * Copy BOOTP-supplied string if not already set.
+ * Copy BOOTP-supplied string
*/
static int __init ic_bootp_string(char *dest, char *src, int len, int max)
{
@@ -775,50 +916,69 @@ static int __init ic_bootp_string(char *dest, char *src, int len, int max)
*/
static void __init ic_do_bootp_ext(u8 *ext)
{
- u8 servers;
- int i;
+ u8 servers;
+ int i;
+ __be16 mtu;
-#ifdef IPCONFIG_DEBUG
u8 *c;
- printk("DHCP/BOOTP: Got extension %d:",*ext);
- for(c=ext+2; c<ext+2+ext[1]; c++)
- printk(" %02x", *c);
- printk("\n");
-#endif
+ pr_debug("DHCP/BOOTP: Got extension %d:", *ext);
+ for (c=ext+2; c<ext+2+ext[1]; c++)
+ pr_debug(" %02x", *c);
+ pr_debug("\n");
switch (*ext++) {
- case 1: /* Subnet mask */
- if (ic_netmask == NONE)
- memcpy(&ic_netmask, ext+1, 4);
- break;
- case 3: /* Default gateway */
- if (ic_gateway == NONE)
- memcpy(&ic_gateway, ext+1, 4);
- break;
- case 6: /* DNS server */
- servers= *ext/4;
- if (servers > CONF_NAMESERVERS_MAX)
- servers = CONF_NAMESERVERS_MAX;
- for (i = 0; i < servers; i++) {
- if (ic_nameservers[i] == NONE)
- memcpy(&ic_nameservers[i], ext+1+4*i, 4);
- }
- break;
- case 12: /* Host name */
- ic_bootp_string(utsname()->nodename, ext+1, *ext, __NEW_UTS_LEN);
+ case 1: /* Subnet mask */
+ if (ic_netmask == NONE)
+ memcpy(&ic_netmask, ext+1, 4);
+ break;
+ case 3: /* Default gateway */
+ if (ic_gateway == NONE)
+ memcpy(&ic_gateway, ext+1, 4);
+ break;
+ case 6: /* DNS server */
+ servers= *ext/4;
+ if (servers > CONF_NAMESERVERS_MAX)
+ servers = CONF_NAMESERVERS_MAX;
+ for (i = 0; i < servers; i++) {
+ if (ic_nameservers[i] == NONE ||
+ ic_nameservers_fallback)
+ memcpy(&ic_nameservers[i], ext+1+4*i, 4);
+ }
+ break;
+ case 12: /* Host name */
+ if (!ic_host_name_set) {
+ ic_bootp_string(utsname()->nodename, ext+1, *ext,
+ __NEW_UTS_LEN);
ic_host_name_set = 1;
- break;
- case 15: /* Domain name (DNS) */
+ }
+ break;
+ case 15: /* Domain name (DNS) */
+ if (!ic_domain[0])
ic_bootp_string(ic_domain, ext+1, *ext, sizeof(ic_domain));
- break;
- case 17: /* Root path */
- if (!root_server_path[0])
- ic_bootp_string(root_server_path, ext+1, *ext, sizeof(root_server_path));
- break;
- case 40: /* NIS Domain name (_not_ DNS) */
- ic_bootp_string(utsname()->domainname, ext+1, *ext, __NEW_UTS_LEN);
- break;
+ break;
+ case 17: /* Root path */
+ if (!root_server_path[0])
+ ic_bootp_string(root_server_path, ext+1, *ext,
+ sizeof(root_server_path));
+ break;
+ case 26: /* Interface MTU */
+ memcpy(&mtu, ext+1, sizeof(mtu));
+ ic_dev_mtu = ntohs(mtu);
+ break;
+ case 40: /* NIS Domain name (_not_ DNS) */
+ ic_bootp_string(utsname()->domainname, ext+1, *ext,
+ __NEW_UTS_LEN);
+ break;
+ case 42: /* NTP servers */
+ servers = *ext / 4;
+ if (servers > CONF_NTP_SERVERS_MAX)
+ servers = CONF_NTP_SERVERS_MAX;
+ for (i = 0; i < servers; i++) {
+ if (ic_ntp_servers[i] == NONE)
+ memcpy(&ic_ntp_servers[i], ext+1+4*i, 4);
+ }
+ break;
}
}
@@ -833,11 +993,15 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str
struct ic_device *d;
int len, ext_len;
+ if (!net_eq(dev_net(dev), &init_net))
+ goto drop;
+
/* Perform verifications before taking the lock. */
if (skb->pkt_type == PACKET_OTHERHOST)
goto drop;
- if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
+ skb = skb_share_check(skb, GFP_ATOMIC);
+ if (!skb)
return NET_RX_DROP;
if (!pskb_may_pull(skb,
@@ -845,17 +1009,15 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str
sizeof(struct udphdr)))
goto drop;
- b = (struct bootp_pkt *) skb->nh.iph;
+ b = (struct bootp_pkt *)skb_network_header(skb);
h = &b->iph;
if (h->ihl != 5 || h->version != 4 || h->protocol != IPPROTO_UDP)
goto drop;
/* Fragments are not supported */
- if (h->frag_off & htons(IP_OFFSET | IP_MF)) {
- if (net_ratelimit())
- printk(KERN_ERR "DHCP/BOOTP: Ignoring fragmented "
- "reply.\n");
+ if (ip_is_fragment(h)) {
+ net_err_ratelimited("DHCP/BOOTP: Ignoring fragmented reply\n");
goto drop;
}
@@ -883,7 +1045,7 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str
if (!pskb_may_pull(skb, skb->len))
goto drop;
- b = (struct bootp_pkt *) skb->nh.iph;
+ b = (struct bootp_pkt *)skb_network_header(skb);
h = &b->iph;
/* One reply at a time, please. */
@@ -903,17 +1065,15 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str
/* Is it a reply to our BOOTP request? */
if (b->op != BOOTP_REPLY ||
b->xid != d->xid) {
- if (net_ratelimit())
- printk(KERN_ERR "DHCP/BOOTP: Reply not for us, "
- "op[%x] xid[%x]\n",
- b->op, b->xid);
+ net_err_ratelimited("DHCP/BOOTP: Reply not for us on %s, op[%x] xid[%x]\n",
+ d->dev->name, b->op, b->xid);
goto drop_unlock;
}
/* Parse extensions */
if (ext_len >= 4 &&
!memcmp(b->exten, ic_bootp_cookie, 4)) { /* Check magic cookie */
- u8 *end = (u8 *) b + ntohs(b->iph.tot_len);
+ u8 *end = (u8 *) b + ntohs(b->iph.tot_len);
u8 *ext;
#ifdef IPCONFIG_DHCP
@@ -938,12 +1098,10 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str
if (opt[1] >= 4)
memcpy(&server_id, opt + 2, 4);
break;
- };
+ }
}
-#ifdef IPCONFIG_DEBUG
- printk("DHCP: Got message type %d\n", mt);
-#endif
+ pr_debug("DHCP: Got message type %d (%s)\n", mt, d->dev->name);
switch (mt) {
case DHCPOFFER:
@@ -956,12 +1114,8 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str
/* Let's accept that offer. */
ic_myaddr = b->your_ip;
ic_servaddr = server_id;
-#ifdef IPCONFIG_DEBUG
- printk("DHCP: Offered address %u.%u.%u.%u",
- NIPQUAD(ic_myaddr));
- printk(" by server %u.%u.%u.%u\n",
- NIPQUAD(ic_servaddr));
-#endif
+ pr_debug("DHCP: Offered address %pI4 by server %pI4\n",
+ &ic_myaddr, &b->iph.saddr);
/* The DHCP indicated server address takes
* precedence over the bootp header one if
* they are different.
@@ -983,7 +1137,7 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str
ic_myaddr = NONE;
ic_servaddr = NONE;
goto drop_unlock;
- };
+ }
ic_dhcp_msgtype = mt;
@@ -1002,13 +1156,16 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str
}
/* We have a winner! */
- ic_dev = dev;
+ ic_dev = d;
ic_myaddr = b->your_ip;
ic_servaddr = b->server_ip;
+ ic_addrservaddr = b->iph.saddr;
if (ic_gateway == NONE && b->relay_ip)
ic_gateway = b->relay_ip;
- if (ic_nameservers[0] == NONE)
+ if (ic_nameservers[0] == NONE) {
ic_nameservers[0] = ic_servaddr;
+ ic_nameservers_fallback = 1;
+ }
ic_got_reply = IC_BOOTP;
drop_unlock:
@@ -1020,7 +1177,7 @@ drop:
kfree_skb(skb);
return 0;
-}
+}
#endif
@@ -1046,17 +1203,17 @@ static int __init ic_dynamic(void)
* are missing, and without DHCP/BOOTP/RARP we are unable to get it.
*/
if (!ic_proto_enabled) {
- printk(KERN_ERR "IP-Config: Incomplete network configuration information.\n");
+ pr_err("IP-Config: Incomplete network configuration information\n");
return -1;
}
#ifdef IPCONFIG_BOOTP
if ((ic_proto_enabled ^ ic_proto_have_if) & IC_BOOTP)
- printk(KERN_ERR "DHCP/BOOTP: No suitable device found.\n");
+ pr_err("DHCP/BOOTP: No suitable device found\n");
#endif
#ifdef IPCONFIG_RARP
if ((ic_proto_enabled ^ ic_proto_have_if) & IC_RARP)
- printk(KERN_ERR "RARP: No suitable device found.\n");
+ pr_err("RARP: No suitable device found\n");
#endif
if (!ic_proto_have_if)
@@ -1080,21 +1237,21 @@ static int __init ic_dynamic(void)
* seems to be a terrible waste of CPU time, but actually there is
* only one process running at all, so we don't need to use any
* scheduler functions.
- * [Actually we could now, but the nothing else running note still
+ * [Actually we could now, but the nothing else running note still
* applies.. - AC]
*/
- printk(KERN_NOTICE "Sending %s%s%s requests .",
- do_bootp
- ? ((ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP") : "",
- (do_bootp && do_rarp) ? " and " : "",
- do_rarp ? "RARP" : "");
+ pr_notice("Sending %s%s%s requests .",
+ do_bootp
+ ? ((ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP") : "",
+ (do_bootp && do_rarp) ? " and " : "",
+ do_rarp ? "RARP" : "");
start_jiffies = jiffies;
d = ic_first_dev;
retries = CONF_SEND_RETRIES;
get_random_bytes(&timeout, sizeof(timeout));
- timeout = CONF_BASE_TIMEOUT + (timeout % (unsigned) CONF_TIMEOUT_RANDOM);
- for(;;) {
+ timeout = CONF_BASE_TIMEOUT + (timeout % (unsigned int) CONF_TIMEOUT_RANDOM);
+ for (;;) {
#ifdef IPCONFIG_BOOTP
if (do_bootp && (d->able & IC_BOOTP))
ic_bootp_send_if(d, jiffies - start_jiffies);
@@ -1104,23 +1261,26 @@ static int __init ic_dynamic(void)
ic_rarp_send_if(d);
#endif
- jiff = jiffies + (d->next ? CONF_INTER_TIMEOUT : timeout);
- while (time_before(jiffies, jiff) && !ic_got_reply)
- schedule_timeout_uninterruptible(1);
+ if (!d->next) {
+ jiff = jiffies + timeout;
+ while (time_before(jiffies, jiff) && !ic_got_reply)
+ schedule_timeout_uninterruptible(1);
+ }
#ifdef IPCONFIG_DHCP
/* DHCP isn't done until we get a DHCPACK. */
- if ((ic_got_reply & IC_BOOTP)
- && (ic_proto_enabled & IC_USE_DHCP)
- && ic_dhcp_msgtype != DHCPACK)
- {
+ if ((ic_got_reply & IC_BOOTP) &&
+ (ic_proto_enabled & IC_USE_DHCP) &&
+ ic_dhcp_msgtype != DHCPACK) {
ic_got_reply = 0;
- printk(",");
+ /* continue on device that got the reply */
+ d = ic_dev;
+ pr_cont(",");
continue;
}
#endif /* IPCONFIG_DHCP */
if (ic_got_reply) {
- printk(" OK\n");
+ pr_cont(" OK\n");
break;
}
@@ -1128,7 +1288,7 @@ static int __init ic_dynamic(void)
continue;
if (! --retries) {
- printk(" timed out!\n");
+ pr_cont(" timed out!\n");
break;
}
@@ -1138,7 +1298,7 @@ static int __init ic_dynamic(void)
if (timeout > CONF_TIMEOUT_MAX)
timeout = CONF_TIMEOUT_MAX;
- printk(".");
+ pr_cont(".");
}
#ifdef IPCONFIG_BOOTP
@@ -1155,11 +1315,10 @@ static int __init ic_dynamic(void)
return -1;
}
- printk("IP-Config: Got %s answer from %u.%u.%u.%u, ",
- ((ic_got_reply & IC_RARP) ? "RARP"
- : (ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP"),
- NIPQUAD(ic_servaddr));
- printk("my address is %u.%u.%u.%u\n", NIPQUAD(ic_myaddr));
+ pr_info("IP-Config: Got %s answer from %pI4, my address is %pI4\n",
+ ((ic_got_reply & IC_RARP) ? "RARP"
+ : (ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP"),
+ &ic_addrservaddr, &ic_myaddr);
return 0;
}
@@ -1167,7 +1326,10 @@ static int __init ic_dynamic(void)
#endif /* IPCONFIG_DYNAMIC */
#ifdef CONFIG_PROC_FS
+/* proc_dir_entry for /proc/net/ipconfig */
+static struct proc_dir_entry *ipconfig_dir;
+/* Name servers: */
static int pnp_seq_show(struct seq_file *seq, void *v)
{
int i;
@@ -1184,29 +1346,59 @@ static int pnp_seq_show(struct seq_file *seq, void *v)
"domain %s\n", ic_domain);
for (i = 0; i < CONF_NAMESERVERS_MAX; i++) {
if (ic_nameservers[i] != NONE)
- seq_printf(seq,
- "nameserver %u.%u.%u.%u\n",
- NIPQUAD(ic_nameservers[i]));
+ seq_printf(seq, "nameserver %pI4\n",
+ &ic_nameservers[i]);
}
if (ic_servaddr != NONE)
- seq_printf(seq,
- "bootserver %u.%u.%u.%u\n",
- NIPQUAD(ic_servaddr));
+ seq_printf(seq, "bootserver %pI4\n",
+ &ic_servaddr);
return 0;
}
-static int pnp_seq_open(struct inode *indoe, struct file *file)
+/* Create the /proc/net/ipconfig directory */
+static int __init ipconfig_proc_net_init(void)
{
- return single_open(file, pnp_seq_show, NULL);
+ ipconfig_dir = proc_net_mkdir(&init_net, "ipconfig", init_net.proc_net);
+ if (!ipconfig_dir)
+ return -ENOMEM;
+
+ return 0;
}
-static struct file_operations pnp_seq_fops = {
- .owner = THIS_MODULE,
- .open = pnp_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
+/* Create a new file under /proc/net/ipconfig */
+static int ipconfig_proc_net_create(const char *name,
+ const struct proc_ops *proc_ops)
+{
+ char *pname;
+ struct proc_dir_entry *p;
+
+ if (!ipconfig_dir)
+ return -ENOMEM;
+
+ pname = kasprintf(GFP_KERNEL, "%s%s", "ipconfig/", name);
+ if (!pname)
+ return -ENOMEM;
+
+ p = proc_create(pname, 0444, init_net.proc_net, proc_ops);
+ kfree(pname);
+ if (!p)
+ return -ENOMEM;
+
+ return 0;
+}
+
+/* Write NTP server IP addresses to /proc/net/ipconfig/ntp_servers */
+static int ntp_servers_show(struct seq_file *seq, void *v)
+{
+ int i;
+
+ for (i = 0; i < CONF_NTP_SERVERS_MAX; i++) {
+ if (ic_ntp_servers[i] != NONE)
+ seq_printf(seq, "%pI4\n", &ic_ntp_servers[i]);
+ }
+ return 0;
+}
+DEFINE_PROC_SHOW_ATTRIBUTE(ntp_servers);
#endif /* CONFIG_PROC_FS */
/*
@@ -1243,6 +1435,40 @@ __be32 __init root_nfs_parse_addr(char *name)
return addr;
}
+#define DEVICE_WAIT_MAX 12 /* 12 seconds */
+
+static int __init wait_for_devices(void)
+{
+ int i;
+ bool try_init_devs = true;
+
+ for (i = 0; i < DEVICE_WAIT_MAX; i++) {
+ struct net_device *dev;
+ int found = 0;
+
+ /* make sure deferred device probes are finished */
+ wait_for_device_probe();
+
+ rtnl_lock();
+ for_each_netdev(&init_net, dev) {
+ if (ic_is_init_dev(dev)) {
+ found = 1;
+ break;
+ }
+ }
+ rtnl_unlock();
+ if (found)
+ return 0;
+ if (try_init_devs &&
+ (ROOT_DEV == Root_NFS || ROOT_DEV == Root_CIFS)) {
+ try_init_devs = false;
+ wait_for_init_devices_probe();
+ }
+ ssleep(1);
+ }
+ return -ENODEV;
+}
+
/*
* IP Autoconfig dispatcher.
*/
@@ -1250,27 +1476,47 @@ __be32 __init root_nfs_parse_addr(char *name)
static int __init ip_auto_config(void)
{
__be32 addr;
+#ifdef IPCONFIG_DYNAMIC
+ int retries = CONF_OPEN_RETRIES;
+#endif
+ int err;
+ unsigned int i, count;
+
+ /* Initialise all name servers and NTP servers to NONE (but only if the
+ * "ip=" or "nfsaddrs=" kernel command line parameters weren't decoded,
+ * otherwise we'll overwrite the IP addresses specified there)
+ */
+ if (ic_set_manually == 0) {
+ ic_nameservers_predef();
+ ic_ntp_servers_predef();
+ }
#ifdef CONFIG_PROC_FS
- proc_net_fops_create("pnp", S_IRUGO, &pnp_seq_fops);
+ proc_create_single("pnp", 0444, init_net.proc_net, pnp_seq_show);
+
+ if (ipconfig_proc_net_init() == 0)
+ ipconfig_proc_net_create("ntp_servers", &ntp_servers_proc_ops);
#endif /* CONFIG_PROC_FS */
if (!ic_enable)
return 0;
- DBG(("IP-Config: Entered.\n"));
+ pr_debug("IP-Config: Entered.\n");
#ifdef IPCONFIG_DYNAMIC
try_try_again:
#endif
- /* Give hardware a chance to settle */
- msleep(CONF_PRE_OPEN);
+ /* Wait for devices to appear */
+ err = wait_for_devices();
+ if (err)
+ return err;
/* Setup all network devices */
- if (ic_open_devs() < 0)
- return -1;
+ err = ic_open_devs();
+ if (err)
+ return err;
/* Give drivers a chance to settle */
- ssleep(CONF_POST_OPEN);
+ msleep(CONF_POST_OPEN);
/*
* If the config information is insufficient (e.g., our IP address or
@@ -1279,16 +1525,13 @@ static int __init ip_auto_config(void)
* missing values.
*/
if (ic_myaddr == NONE ||
-#ifdef CONFIG_ROOT_NFS
- (MAJOR(ROOT_DEV) == UNNAMED_MAJOR
- && root_server_addr == NONE
- && ic_servaddr == NONE) ||
+#if defined(CONFIG_ROOT_NFS) || defined(CONFIG_CIFS_ROOT)
+ (root_server_addr == NONE &&
+ ic_servaddr == NONE &&
+ (ROOT_DEV == Root_NFS || ROOT_DEV == Root_CIFS)) ||
#endif
ic_first_dev->next) {
#ifdef IPCONFIG_DYNAMIC
-
- int retries = CONF_OPEN_RETRIES;
-
if (ic_dynamic() < 0) {
ic_close_devs();
@@ -1308,30 +1551,34 @@ static int __init ip_auto_config(void)
*/
#ifdef CONFIG_ROOT_NFS
if (ROOT_DEV == Root_NFS) {
- printk(KERN_ERR
- "IP-Config: Retrying forever (NFS root)...\n");
+ pr_err("IP-Config: Retrying forever (NFS root)...\n");
+ goto try_try_again;
+ }
+#endif
+#ifdef CONFIG_CIFS_ROOT
+ if (ROOT_DEV == Root_CIFS) {
+ pr_err("IP-Config: Retrying forever (CIFS root)...\n");
goto try_try_again;
}
#endif
if (--retries) {
- printk(KERN_ERR
- "IP-Config: Reopening network devices...\n");
+ pr_err("IP-Config: Reopening network devices...\n");
goto try_try_again;
}
/* Oh, well. At least we tried. */
- printk(KERN_ERR "IP-Config: Auto-configuration of network failed.\n");
+ pr_err("IP-Config: Auto-configuration of network failed\n");
return -1;
}
#else /* !DYNAMIC */
- printk(KERN_ERR "IP-Config: Incomplete network configuration information.\n");
+ pr_err("IP-Config: Incomplete network configuration information\n");
ic_close_devs();
return -1;
#endif /* IPCONFIG_DYNAMIC */
} else {
/* Device selected manually or only one device -> use it */
- ic_dev = ic_first_dev->dev;
+ ic_dev = ic_first_dev;
}
addr = root_nfs_parse_addr(root_server_path);
@@ -1339,20 +1586,12 @@ static int __init ip_auto_config(void)
root_server_addr = addr;
/*
- * Use defaults whereever applicable.
+ * Use defaults wherever applicable.
*/
if (ic_defaults() < 0)
return -1;
/*
- * Close all network devices except the device we've
- * autoconfigured and set up routes.
- */
- ic_close_devs();
- if (ic_setup_if() < 0 || ic_setup_routes() < 0)
- return -1;
-
- /*
* Record which protocol was actually used.
*/
#ifdef IPCONFIG_DYNAMIC
@@ -1363,20 +1602,61 @@ static int __init ip_auto_config(void)
/*
* Clue in the operator.
*/
- printk("IP-Config: Complete:");
- printk("\n device=%s", ic_dev->name);
- printk(", addr=%u.%u.%u.%u", NIPQUAD(ic_myaddr));
- printk(", mask=%u.%u.%u.%u", NIPQUAD(ic_netmask));
- printk(", gw=%u.%u.%u.%u", NIPQUAD(ic_gateway));
- printk(",\n host=%s, domain=%s, nis-domain=%s",
- utsname()->nodename, ic_domain, utsname()->domainname);
- printk(",\n bootserver=%u.%u.%u.%u", NIPQUAD(ic_servaddr));
- printk(", rootserver=%u.%u.%u.%u", NIPQUAD(root_server_addr));
- printk(", rootpath=%s", root_server_path);
- printk("\n");
+ pr_info("IP-Config: Complete:\n");
+
+ pr_info(" device=%s, hwaddr=%*phC, ipaddr=%pI4, mask=%pI4, gw=%pI4\n",
+ ic_dev->dev->name, ic_dev->dev->addr_len, ic_dev->dev->dev_addr,
+ &ic_myaddr, &ic_netmask, &ic_gateway);
+ pr_info(" host=%s, domain=%s, nis-domain=%s\n",
+ utsname()->nodename, ic_domain, utsname()->domainname);
+ pr_info(" bootserver=%pI4, rootserver=%pI4, rootpath=%s",
+ &ic_servaddr, &root_server_addr, root_server_path);
+ if (ic_dev_mtu)
+ pr_cont(", mtu=%d", ic_dev_mtu);
+ /* Name servers (if any): */
+ for (i = 0, count = 0; i < CONF_NAMESERVERS_MAX; i++) {
+ if (ic_nameservers[i] != NONE) {
+ if (i == 0)
+ pr_info(" nameserver%u=%pI4",
+ i, &ic_nameservers[i]);
+ else
+ pr_cont(", nameserver%u=%pI4",
+ i, &ic_nameservers[i]);
+
+ count++;
+ }
+ if ((i + 1 == CONF_NAMESERVERS_MAX) && count > 0)
+ pr_cont("\n");
+ }
+ /* NTP servers (if any): */
+ for (i = 0, count = 0; i < CONF_NTP_SERVERS_MAX; i++) {
+ if (ic_ntp_servers[i] != NONE) {
+ if (i == 0)
+ pr_info(" ntpserver%u=%pI4",
+ i, &ic_ntp_servers[i]);
+ else
+ pr_cont(", ntpserver%u=%pI4",
+ i, &ic_ntp_servers[i]);
+
+ count++;
+ }
+ if ((i + 1 == CONF_NTP_SERVERS_MAX) && count > 0)
+ pr_cont("\n");
+ }
#endif /* !SILENT */
- return 0;
+ /*
+ * Close all network devices except the device we've
+ * autoconfigured and set up routes.
+ */
+ if (ic_setup_if() < 0 || ic_setup_routes() < 0)
+ err = -1;
+ else
+ err = 0;
+
+ ic_close_devs();
+
+ return err;
}
late_initcall(ip_auto_config);
@@ -1384,34 +1664,36 @@ late_initcall(ip_auto_config);
/*
* Decode any IP configuration options in the "ip=" or "nfsaddrs=" kernel
- * command line parameter. It consists of option fields separated by colons in
- * the following order:
- *
- * <client-ip>:<server-ip>:<gw-ip>:<netmask>:<host name>:<device>:<PROTO>
- *
- * Any of the fields can be empty which means to use a default value:
- * <client-ip> - address given by BOOTP or RARP
- * <server-ip> - address of host returning BOOTP or RARP packet
- * <gw-ip> - none, or the address returned by BOOTP
- * <netmask> - automatically determined from <client-ip>, or the
- * one returned by BOOTP
- * <host name> - <client-ip> in ASCII notation, or the name returned
- * by BOOTP
- * <device> - use all available devices
- * <PROTO>:
- * off|none - don't do autoconfig at all (DEFAULT)
- * on|any - use any configured protocol
- * dhcp|bootp|rarp - use only the specified protocol
- * both - use both BOOTP and RARP (not DHCP)
+ * command line parameter. See Documentation/admin-guide/nfs/nfsroot.rst.
*/
static int __init ic_proto_name(char *name)
{
if (!strcmp(name, "on") || !strcmp(name, "any")) {
return 1;
}
+ if (!strcmp(name, "off") || !strcmp(name, "none")) {
+ return 0;
+ }
#ifdef CONFIG_IP_PNP_DHCP
- else if (!strcmp(name, "dhcp")) {
+ else if (!strncmp(name, "dhcp", 4)) {
+ char *client_id;
+
ic_proto_enabled &= ~IC_RARP;
+ client_id = strstr(name, "dhcp,");
+ if (client_id) {
+ char *v;
+
+ client_id = client_id + 5;
+ v = strchr(client_id, ',');
+ if (!v)
+ return 1;
+ *v = 0;
+ if (kstrtou8(client_id, 0, dhcp_client_identifier))
+ pr_debug("DHCP: Invalid client identifier type\n");
+ strscpy(dhcp_client_identifier + 1, v + 1,
+ sizeof(dhcp_client_identifier) - 1);
+ *v = ',';
+ }
return 1;
}
#endif
@@ -1442,55 +1724,90 @@ static int __init ip_auto_config_setup(char *addrs)
int num = 0;
ic_set_manually = 1;
+ ic_enable = 1;
- ic_enable = (*addrs &&
- (strcmp(addrs, "off") != 0) &&
- (strcmp(addrs, "none") != 0));
- if (!ic_enable)
+ /*
+ * If any dhcp, bootp etc options are set, leave autoconfig on
+ * and skip the below static IP processing.
+ */
+ if (ic_proto_name(addrs))
return 1;
- if (ic_proto_name(addrs))
+ /* If no static IP is given, turn off autoconfig and bail. */
+ if (*addrs == 0 ||
+ strcmp(addrs, "off") == 0 ||
+ strcmp(addrs, "none") == 0) {
+ ic_enable = 0;
return 1;
+ }
+
+ /* Initialise all name servers and NTP servers to NONE */
+ ic_nameservers_predef();
+ ic_ntp_servers_predef();
- /* Parse the whole string */
+ /* Parse string for static IP assignment. */
ip = addrs;
while (ip && *ip) {
if ((cp = strchr(ip, ':')))
*cp++ = '\0';
if (strlen(ip) > 0) {
- DBG(("IP-Config: Parameter #%d: `%s'\n", num, ip));
+ pr_debug("IP-Config: Parameter #%d: `%s'\n", num, ip);
switch (num) {
case 0:
- if ((ic_myaddr = in_aton(ip)) == INADDR_ANY)
+ if ((ic_myaddr = in_aton(ip)) == ANY)
ic_myaddr = NONE;
break;
case 1:
- if ((ic_servaddr = in_aton(ip)) == INADDR_ANY)
+ if ((ic_servaddr = in_aton(ip)) == ANY)
ic_servaddr = NONE;
break;
case 2:
- if ((ic_gateway = in_aton(ip)) == INADDR_ANY)
+ if ((ic_gateway = in_aton(ip)) == ANY)
ic_gateway = NONE;
break;
case 3:
- if ((ic_netmask = in_aton(ip)) == INADDR_ANY)
+ if ((ic_netmask = in_aton(ip)) == ANY)
ic_netmask = NONE;
break;
case 4:
if ((dp = strchr(ip, '.'))) {
*dp++ = '\0';
- strlcpy(utsname()->domainname, dp,
+ strscpy(utsname()->domainname, dp,
sizeof(utsname()->domainname));
}
- strlcpy(utsname()->nodename, ip,
+ strscpy(utsname()->nodename, ip,
sizeof(utsname()->nodename));
ic_host_name_set = 1;
break;
case 5:
- strlcpy(user_dev_name, ip, sizeof(user_dev_name));
+ strscpy(user_dev_name, ip, sizeof(user_dev_name));
break;
case 6:
- ic_proto_name(ip);
+ if (ic_proto_name(ip) == 0 &&
+ ic_myaddr == NONE) {
+ ic_enable = 0;
+ }
+ break;
+ case 7:
+ if (CONF_NAMESERVERS_MAX >= 1) {
+ ic_nameservers[0] = in_aton(ip);
+ if (ic_nameservers[0] == ANY)
+ ic_nameservers[0] = NONE;
+ }
+ break;
+ case 8:
+ if (CONF_NAMESERVERS_MAX >= 2) {
+ ic_nameservers[1] = in_aton(ip);
+ if (ic_nameservers[1] == ANY)
+ ic_nameservers[1] = NONE;
+ }
+ break;
+ case 9:
+ if (CONF_NTP_SERVERS_MAX >= 1) {
+ ic_ntp_servers[0] = in_aton(ip);
+ if (ic_ntp_servers[0] == ANY)
+ ic_ntp_servers[0] = NONE;
+ }
break;
}
}
@@ -1500,11 +1817,36 @@ static int __init ip_auto_config_setup(char *addrs)
return 1;
}
+__setup("ip=", ip_auto_config_setup);
static int __init nfsaddrs_config_setup(char *addrs)
{
return ip_auto_config_setup(addrs);
}
-
-__setup("ip=", ip_auto_config_setup);
__setup("nfsaddrs=", nfsaddrs_config_setup);
+
+static int __init vendor_class_identifier_setup(char *addrs)
+{
+ if (strscpy(vendor_class_identifier, addrs,
+ sizeof(vendor_class_identifier))
+ >= sizeof(vendor_class_identifier))
+ pr_warn("DHCP: vendorclass too long, truncated to \"%s\"\n",
+ vendor_class_identifier);
+ return 1;
+}
+__setup("dhcpclass=", vendor_class_identifier_setup);
+
+static int __init set_carrier_timeout(char *str)
+{
+ ssize_t ret;
+
+ if (!str)
+ return 0;
+
+ ret = kstrtouint(str, 0, &carrier_timeout);
+ if (ret)
+ return 0;
+
+ return 1;
+}
+__setup("carrier_timeout=", set_carrier_timeout);
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 9d719d664e5b..ff95b1b9908e 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -1,7 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * Linux NET3: IP/IP protocol decoder.
- *
- * Version: $Id: ipip.c,v 1.50 2001/10/02 02:22:36 davem Exp $
+ * Linux NET3: IP/IP protocol decoder.
*
* Authors:
* Sam Lantinga (slouken@cs.ucdavis.edu) 02/01/95
@@ -18,12 +17,6 @@
* Carlos Picoto : GRE over IP support
* Alexey Kuznetsov: Reworked. Really, now it is truncated version of ipv4/ip_gre.c.
* I do not want to merge them together.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
*/
/* tunnel.c: an IP tunnel driver
@@ -35,15 +28,15 @@
Thanks for the great code!
-Sam Lantinga (slouken@cs.ucdavis.edu) 02/01/95
-
+
Minor tweaks:
Cleaned up the code a little and added some pre-1.3.0 tweaks.
dev->hard_header/hard_header_len changed to use no headers.
Comments/bracketing tweaked.
Made the tunnels use dev->name not tunnel: when error reporting.
Added tx_dropped stat
-
- -Alan Cox (Alan.Cox@linux.org) 21 March 95
+
+ -Alan Cox (alan@lxorguk.ukuu.org.uk) 21 March 95
Reworked:
Changed to tunnel to destination gateway in addition to the
@@ -52,7 +45,7 @@
Note: There is currently no firewall or ICMP handling done.
-Sam Lantinga (slouken@cs.ucdavis.edu) 02/13/96
-
+
*/
/* Things I wish I had known when writing the tunnel driver:
@@ -75,7 +68,7 @@
"allocated" with skb_put(). You can then write up to skb->len
bytes to that buffer. If you need more, you can call skb_put()
again with the additional amount of space you need. You can
- find out how much more space you can allocate by calling
+ find out how much more space you can allocate by calling
"skb_tailroom(skb)".
Now, to add header space, call "skb_push(skb, header_len)".
This creates space at the beginning of the buffer and returns
@@ -92,20 +85,19 @@
For comments look at net/ipv4/ip_gre.c --ANK
*/
-
+
#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
-#include <linux/sched.h>
#include <linux/kernel.h>
-#include <asm/uaccess.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
-#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/netfilter_ipv4.h>
#include <linux/if_ether.h>
@@ -113,814 +105,598 @@
#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
-#include <net/ipip.h>
+#include <net/ip_tunnels.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/dst_metadata.h>
-#define HASH_SIZE 16
-#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
-
-static int ipip_fb_tunnel_init(struct net_device *dev);
-static int ipip_tunnel_init(struct net_device *dev);
-static void ipip_tunnel_setup(struct net_device *dev);
-
-static struct net_device *ipip_fb_tunnel_dev;
+static bool log_ecn_error = true;
+module_param(log_ecn_error, bool, 0644);
+MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
-static struct ip_tunnel *tunnels_r_l[HASH_SIZE];
-static struct ip_tunnel *tunnels_r[HASH_SIZE];
-static struct ip_tunnel *tunnels_l[HASH_SIZE];
-static struct ip_tunnel *tunnels_wc[1];
-static struct ip_tunnel **tunnels[4] = { tunnels_wc, tunnels_l, tunnels_r, tunnels_r_l };
+static unsigned int ipip_net_id __read_mostly;
-static DEFINE_RWLOCK(ipip_lock);
+static int ipip_tunnel_init(struct net_device *dev);
+static struct rtnl_link_ops ipip_link_ops __read_mostly;
-static struct ip_tunnel * ipip_tunnel_lookup(__be32 remote, __be32 local)
+static int ipip_err(struct sk_buff *skb, u32 info)
{
- unsigned h0 = HASH(remote);
- unsigned h1 = HASH(local);
+ /* All the routers (except for Linux) return only
+ * 8 bytes of packet payload. It means, that precise relaying of
+ * ICMP in the real Internet is absolutely infeasible.
+ */
+ struct net *net = dev_net(skb->dev);
+ struct ip_tunnel_net *itn = net_generic(net, ipip_net_id);
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
+ IP_TUNNEL_DECLARE_FLAGS(flags) = { };
+ const int type = icmp_hdr(skb)->type;
+ const int code = icmp_hdr(skb)->code;
struct ip_tunnel *t;
+ int err = 0;
- for (t = tunnels_r_l[h0^h1]; t; t = t->next) {
- if (local == t->parms.iph.saddr &&
- remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
- return t;
- }
- for (t = tunnels_r[h0]; t; t = t->next) {
- if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
- return t;
- }
- for (t = tunnels_l[h1]; t; t = t->next) {
- if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP))
- return t;
- }
- if ((t = tunnels_wc[0]) != NULL && (t->dev->flags&IFF_UP))
- return t;
- return NULL;
-}
-
-static struct ip_tunnel **ipip_bucket(struct ip_tunnel *t)
-{
- __be32 remote = t->parms.iph.daddr;
- __be32 local = t->parms.iph.saddr;
- unsigned h = 0;
- int prio = 0;
-
- if (remote) {
- prio |= 2;
- h ^= HASH(remote);
- }
- if (local) {
- prio |= 1;
- h ^= HASH(local);
- }
- return &tunnels[prio][h];
-}
-
-
-static void ipip_tunnel_unlink(struct ip_tunnel *t)
-{
- struct ip_tunnel **tp;
-
- for (tp = ipip_bucket(t); *tp; tp = &(*tp)->next) {
- if (t == *tp) {
- write_lock_bh(&ipip_lock);
- *tp = t->next;
- write_unlock_bh(&ipip_lock);
- break;
- }
- }
-}
-
-static void ipip_tunnel_link(struct ip_tunnel *t)
-{
- struct ip_tunnel **tp = ipip_bucket(t);
-
- t->next = *tp;
- write_lock_bh(&ipip_lock);
- *tp = t;
- write_unlock_bh(&ipip_lock);
-}
-
-static struct ip_tunnel * ipip_tunnel_locate(struct ip_tunnel_parm *parms, int create)
-{
- __be32 remote = parms->iph.daddr;
- __be32 local = parms->iph.saddr;
- struct ip_tunnel *t, **tp, *nt;
- struct net_device *dev;
- unsigned h = 0;
- int prio = 0;
- char name[IFNAMSIZ];
-
- if (remote) {
- prio |= 2;
- h ^= HASH(remote);
- }
- if (local) {
- prio |= 1;
- h ^= HASH(local);
- }
- for (tp = &tunnels[prio][h]; (t = *tp) != NULL; tp = &t->next) {
- if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr)
- return t;
- }
- if (!create)
- return NULL;
-
- if (parms->name[0])
- strlcpy(name, parms->name, IFNAMSIZ);
- else {
- int i;
- for (i=1; i<100; i++) {
- sprintf(name, "tunl%d", i);
- if (__dev_get_by_name(name) == NULL)
- break;
- }
- if (i==100)
- goto failed;
- }
-
- dev = alloc_netdev(sizeof(*t), name, ipip_tunnel_setup);
- if (dev == NULL)
- return NULL;
-
- nt = netdev_priv(dev);
- SET_MODULE_OWNER(dev);
- dev->init = ipip_tunnel_init;
- nt->parms = *parms;
+ __set_bit(IP_TUNNEL_NO_KEY_BIT, flags);
- if (register_netdevice(dev) < 0) {
- free_netdev(dev);
- goto failed;
+ t = ip_tunnel_lookup(itn, skb->dev->ifindex, flags, iph->daddr,
+ iph->saddr, 0);
+ if (!t) {
+ err = -ENOENT;
+ goto out;
}
- dev_hold(dev);
- ipip_tunnel_link(nt);
- return nt;
-
-failed:
- return NULL;
-}
-
-static void ipip_tunnel_uninit(struct net_device *dev)
-{
- if (dev == ipip_fb_tunnel_dev) {
- write_lock_bh(&ipip_lock);
- tunnels_wc[0] = NULL;
- write_unlock_bh(&ipip_lock);
- } else
- ipip_tunnel_unlink(netdev_priv(dev));
- dev_put(dev);
-}
-
-static int ipip_err(struct sk_buff *skb, u32 info)
-{
-#ifndef I_WISH_WORLD_WERE_PERFECT
-
-/* It is not :-( All the routers (except for Linux) return only
- 8 bytes of packet payload. It means, that precise relaying of
- ICMP in the real Internet is absolutely infeasible.
- */
- struct iphdr *iph = (struct iphdr*)skb->data;
- int type = skb->h.icmph->type;
- int code = skb->h.icmph->code;
- struct ip_tunnel *t;
- int err;
-
switch (type) {
- default:
- case ICMP_PARAMETERPROB:
- return 0;
-
case ICMP_DEST_UNREACH:
switch (code) {
case ICMP_SR_FAILED:
- case ICMP_PORT_UNREACH:
/* Impossible event. */
- return 0;
- case ICMP_FRAG_NEEDED:
- /* Soft state for pmtu is maintained by IP core. */
- return 0;
+ goto out;
default:
/* All others are translated to HOST_UNREACH.
- rfc2003 contains "deep thoughts" about NET_UNREACH,
- I believe they are just ether pollution. --ANK
+ * rfc2003 contains "deep thoughts" about NET_UNREACH,
+ * I believe they are just ether pollution. --ANK
*/
break;
}
break;
+
case ICMP_TIME_EXCEEDED:
if (code != ICMP_EXC_TTL)
- return 0;
+ goto out;
break;
+
+ case ICMP_REDIRECT:
+ break;
+
+ default:
+ goto out;
+ }
+
+ if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
+ ipv4_update_pmtu(skb, net, info, t->parms.link, iph->protocol);
+ goto out;
}
- err = -ENOENT;
+ if (type == ICMP_REDIRECT) {
+ ipv4_redirect(skb, net, t->parms.link, iph->protocol);
+ goto out;
+ }
- read_lock(&ipip_lock);
- t = ipip_tunnel_lookup(iph->daddr, iph->saddr);
- if (t == NULL || t->parms.iph.daddr == 0)
+ if (t->parms.iph.daddr == 0) {
+ err = -ENOENT;
goto out;
+ }
- err = 0;
if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
goto out;
- if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO)
+ if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
t->err_count++;
else
t->err_count = 1;
t->err_time = jiffies;
+
out:
- read_unlock(&ipip_lock);
return err;
-#else
- struct iphdr *iph = (struct iphdr*)dp;
- int hlen = iph->ihl<<2;
- struct iphdr *eiph;
- int type = skb->h.icmph->type;
- int code = skb->h.icmph->code;
- int rel_type = 0;
- int rel_code = 0;
- __be32 rel_info = 0;
- __u32 n = 0;
- struct sk_buff *skb2;
- struct flowi fl;
- struct rtable *rt;
+}
- if (len < hlen + sizeof(struct iphdr))
- return 0;
- eiph = (struct iphdr*)(dp + hlen);
+static const struct tnl_ptk_info ipip_tpi = {
+ /* no tunnel info required for ipip. */
+ .proto = htons(ETH_P_IP),
+};
- switch (type) {
- default:
- return 0;
- case ICMP_PARAMETERPROB:
- n = ntohl(skb->h.icmph->un.gateway) >> 24;
- if (n < hlen)
- return 0;
-
- /* So... This guy found something strange INSIDE encapsulated
- packet. Well, he is fool, but what can we do ?
- */
- rel_type = ICMP_PARAMETERPROB;
- rel_info = htonl((n - hlen) << 24);
- break;
+#if IS_ENABLED(CONFIG_MPLS)
+static const struct tnl_ptk_info mplsip_tpi = {
+ /* no tunnel info required for mplsip. */
+ .proto = htons(ETH_P_MPLS_UC),
+};
+#endif
- case ICMP_DEST_UNREACH:
- switch (code) {
- case ICMP_SR_FAILED:
- case ICMP_PORT_UNREACH:
- /* Impossible event. */
- return 0;
- case ICMP_FRAG_NEEDED:
- /* And it is the only really necessary thing :-) */
- n = ntohs(skb->h.icmph->un.frag.mtu);
- if (n < hlen+68)
- return 0;
- n -= hlen;
- /* BSD 4.2 MORE DOES NOT EXIST IN NATURE. */
- if (n > ntohs(eiph->tot_len))
+static int ipip_tunnel_rcv(struct sk_buff *skb, u8 ipproto)
+{
+ struct net *net = dev_net(skb->dev);
+ struct ip_tunnel_net *itn = net_generic(net, ipip_net_id);
+ IP_TUNNEL_DECLARE_FLAGS(flags) = { };
+ struct metadata_dst *tun_dst = NULL;
+ struct ip_tunnel *tunnel;
+ const struct iphdr *iph;
+
+ __set_bit(IP_TUNNEL_NO_KEY_BIT, flags);
+
+ iph = ip_hdr(skb);
+ tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, flags, iph->saddr,
+ iph->daddr, 0);
+ if (tunnel) {
+ const struct tnl_ptk_info *tpi;
+
+ if (tunnel->parms.iph.protocol != ipproto &&
+ tunnel->parms.iph.protocol != 0)
+ goto drop;
+
+ if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
+ goto drop;
+#if IS_ENABLED(CONFIG_MPLS)
+ if (ipproto == IPPROTO_MPLS)
+ tpi = &mplsip_tpi;
+ else
+#endif
+ tpi = &ipip_tpi;
+ if (iptunnel_pull_header(skb, 0, tpi->proto, false))
+ goto drop;
+ if (tunnel->collect_md) {
+ ip_tunnel_flags_zero(flags);
+
+ tun_dst = ip_tun_rx_dst(skb, flags, 0, 0);
+ if (!tun_dst)
return 0;
- rel_info = htonl(n);
- break;
- default:
- /* All others are translated to HOST_UNREACH.
- rfc2003 contains "deep thoughts" about NET_UNREACH,
- I believe, it is just ether pollution. --ANK
- */
- rel_type = ICMP_DEST_UNREACH;
- rel_code = ICMP_HOST_UNREACH;
- break;
+ ip_tunnel_md_udp_encap(skb, &tun_dst->u.tun_info);
}
- break;
- case ICMP_TIME_EXCEEDED:
- if (code != ICMP_EXC_TTL)
- return 0;
- break;
- }
+ skb_reset_mac_header(skb);
- /* Prepare fake skb to feed it to icmp_send */
- skb2 = skb_clone(skb, GFP_ATOMIC);
- if (skb2 == NULL)
- return 0;
- dst_release(skb2->dst);
- skb2->dst = NULL;
- skb_pull(skb2, skb->data - (u8*)eiph);
- skb2->nh.raw = skb2->data;
-
- /* Try to guess incoming interface */
- memset(&fl, 0, sizeof(fl));
- fl.fl4_daddr = eiph->saddr;
- fl.fl4_tos = RT_TOS(eiph->tos);
- fl.proto = IPPROTO_IPIP;
- if (ip_route_output_key(&rt, &key)) {
- kfree_skb(skb2);
- return 0;
- }
- skb2->dev = rt->u.dst.dev;
-
- /* route "incoming" packet */
- if (rt->rt_flags&RTCF_LOCAL) {
- ip_rt_put(rt);
- rt = NULL;
- fl.fl4_daddr = eiph->daddr;
- fl.fl4_src = eiph->saddr;
- fl.fl4_tos = eiph->tos;
- if (ip_route_output_key(&rt, &fl) ||
- rt->u.dst.dev->type != ARPHRD_TUNNEL) {
- ip_rt_put(rt);
- kfree_skb(skb2);
- return 0;
- }
- } else {
- ip_rt_put(rt);
- if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos, skb2->dev) ||
- skb2->dst->dev->type != ARPHRD_TUNNEL) {
- kfree_skb(skb2);
- return 0;
- }
+ return ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
}
- /* change mtu on this route */
- if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
- if (n > dst_mtu(skb2->dst)) {
- kfree_skb(skb2);
- return 0;
- }
- skb2->dst->ops->update_pmtu(skb2->dst, n);
- } else if (type == ICMP_TIME_EXCEEDED) {
- struct ip_tunnel *t = netdev_priv(skb2->dev);
- if (t->parms.iph.ttl) {
- rel_type = ICMP_DEST_UNREACH;
- rel_code = ICMP_HOST_UNREACH;
- }
- }
+ return -1;
- icmp_send(skb2, rel_type, rel_code, rel_info);
- kfree_skb(skb2);
+drop:
+ kfree_skb(skb);
return 0;
-#endif
}
-static inline void ipip_ecn_decapsulate(struct iphdr *outer_iph, struct sk_buff *skb)
+static int ipip_rcv(struct sk_buff *skb)
{
- struct iphdr *inner_iph = skb->nh.iph;
-
- if (INET_ECN_is_ce(outer_iph->tos))
- IP_ECN_set_ce(inner_iph);
+ return ipip_tunnel_rcv(skb, IPPROTO_IPIP);
}
-static int ipip_rcv(struct sk_buff *skb)
+#if IS_ENABLED(CONFIG_MPLS)
+static int mplsip_rcv(struct sk_buff *skb)
{
- struct iphdr *iph;
- struct ip_tunnel *tunnel;
-
- iph = skb->nh.iph;
-
- read_lock(&ipip_lock);
- if ((tunnel = ipip_tunnel_lookup(iph->saddr, iph->daddr)) != NULL) {
- if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
- read_unlock(&ipip_lock);
- kfree_skb(skb);
- return 0;
- }
-
- secpath_reset(skb);
-
- skb->mac.raw = skb->nh.raw;
- skb->nh.raw = skb->data;
- skb->protocol = htons(ETH_P_IP);
- skb->pkt_type = PACKET_HOST;
-
- tunnel->stat.rx_packets++;
- tunnel->stat.rx_bytes += skb->len;
- skb->dev = tunnel->dev;
- dst_release(skb->dst);
- skb->dst = NULL;
- nf_reset(skb);
- ipip_ecn_decapsulate(iph, skb);
- netif_rx(skb);
- read_unlock(&ipip_lock);
- return 0;
- }
- read_unlock(&ipip_lock);
-
- return -1;
+ return ipip_tunnel_rcv(skb, IPPROTO_MPLS);
}
+#endif
/*
* This function assumes it is being called from dev_queue_xmit()
* and that skb is filled properly by that function.
*/
-
-static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
+static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb,
+ struct net_device *dev)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
- struct net_device_stats *stats = &tunnel->stat;
- struct iphdr *tiph = &tunnel->parms.iph;
- u8 tos = tunnel->parms.iph.tos;
- __be16 df = tiph->frag_off;
- struct rtable *rt; /* Route to the other host */
- struct net_device *tdev; /* Device to other host */
- struct iphdr *old_iph = skb->nh.iph;
- struct iphdr *iph; /* Our new IP header */
- int max_headroom; /* The extra header space needed */
- __be32 dst = tiph->daddr;
- int mtu;
-
- if (tunnel->recursion++) {
- tunnel->stat.collisions++;
- goto tx_error;
- }
+ const struct iphdr *tiph = &tunnel->parms.iph;
+ u8 ipproto;
- if (skb->protocol != htons(ETH_P_IP))
+ if (!pskb_inet_may_pull(skb))
goto tx_error;
- if (tos&1)
- tos = old_iph->tos;
-
- if (!dst) {
- /* NBMA tunnel */
- if ((rt = (struct rtable*)skb->dst) == NULL) {
- tunnel->stat.tx_fifo_errors++;
- goto tx_error;
- }
- if ((dst = rt->rt_gateway) == 0)
- goto tx_error_icmp;
+ switch (skb->protocol) {
+ case htons(ETH_P_IP):
+ ipproto = IPPROTO_IPIP;
+ break;
+#if IS_ENABLED(CONFIG_MPLS)
+ case htons(ETH_P_MPLS_UC):
+ ipproto = IPPROTO_MPLS;
+ break;
+#endif
+ default:
+ goto tx_error;
}
- {
- struct flowi fl = { .oif = tunnel->parms.link,
- .nl_u = { .ip4_u =
- { .daddr = dst,
- .saddr = tiph->saddr,
- .tos = RT_TOS(tos) } },
- .proto = IPPROTO_IPIP };
- if (ip_route_output_key(&rt, &fl)) {
- tunnel->stat.tx_carrier_errors++;
- goto tx_error_icmp;
- }
- }
- tdev = rt->u.dst.dev;
+ if (tiph->protocol != ipproto && tiph->protocol != 0)
+ goto tx_error;
- if (tdev == dev) {
- ip_rt_put(rt);
- tunnel->stat.collisions++;
+ if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP4))
goto tx_error;
- }
- if (tiph->frag_off)
- mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr);
+ skb_set_inner_ipproto(skb, ipproto);
+
+ if (tunnel->collect_md)
+ ip_md_tunnel_xmit(skb, dev, ipproto, 0);
else
- mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu;
+ ip_tunnel_xmit(skb, dev, tiph, ipproto);
+ return NETDEV_TX_OK;
- if (mtu < 68) {
- tunnel->stat.collisions++;
- ip_rt_put(rt);
- goto tx_error;
- }
- if (skb->dst)
- skb->dst->ops->update_pmtu(skb->dst, mtu);
+tx_error:
+ kfree_skb(skb);
- df |= (old_iph->frag_off&htons(IP_DF));
+ DEV_STATS_INC(dev, tx_errors);
+ return NETDEV_TX_OK;
+}
- if ((old_iph->frag_off&htons(IP_DF)) && mtu < ntohs(old_iph->tot_len)) {
- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
- ip_rt_put(rt);
- goto tx_error;
+static bool ipip_tunnel_ioctl_verify_protocol(u8 ipproto)
+{
+ switch (ipproto) {
+ case 0:
+ case IPPROTO_IPIP:
+#if IS_ENABLED(CONFIG_MPLS)
+ case IPPROTO_MPLS:
+#endif
+ return true;
}
- if (tunnel->err_count > 0) {
- if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) {
- tunnel->err_count--;
- dst_link_failure(skb);
- } else
- tunnel->err_count = 0;
- }
+ return false;
+}
- /*
- * Okay, now see if we can stuff it in the buffer as-is.
- */
- max_headroom = (LL_RESERVED_SPACE(tdev)+sizeof(struct iphdr));
-
- if (skb_headroom(skb) < max_headroom || skb_cloned(skb) || skb_shared(skb)) {
- struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
- if (!new_skb) {
- ip_rt_put(rt);
- stats->tx_dropped++;
- dev_kfree_skb(skb);
- tunnel->recursion--;
- return 0;
- }
- if (skb->sk)
- skb_set_owner_w(new_skb, skb->sk);
- dev_kfree_skb(skb);
- skb = new_skb;
- old_iph = skb->nh.iph;
+static int
+ipip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p, int cmd)
+{
+ if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) {
+ if (p->iph.version != 4 ||
+ !ipip_tunnel_ioctl_verify_protocol(p->iph.protocol) ||
+ p->iph.ihl != 5 || (p->iph.frag_off & htons(~IP_DF)))
+ return -EINVAL;
}
- skb->h.raw = skb->nh.raw;
- skb->nh.raw = skb_push(skb, sizeof(struct iphdr));
- memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
- IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
- IPSKB_REROUTED);
- dst_release(skb->dst);
- skb->dst = &rt->u.dst;
-
- /*
- * Push down and install the IPIP header.
- */
+ p->i_key = p->o_key = 0;
+ ip_tunnel_flags_zero(p->i_flags);
+ ip_tunnel_flags_zero(p->o_flags);
+ return ip_tunnel_ctl(dev, p, cmd);
+}
- iph = skb->nh.iph;
- iph->version = 4;
- iph->ihl = sizeof(struct iphdr)>>2;
- iph->frag_off = df;
- iph->protocol = IPPROTO_IPIP;
- iph->tos = INET_ECN_encapsulate(tos, old_iph->tos);
- iph->daddr = rt->rt_dst;
- iph->saddr = rt->rt_src;
+static int ipip_fill_forward_path(struct net_device_path_ctx *ctx,
+ struct net_device_path *path)
+{
+ struct ip_tunnel *tunnel = netdev_priv(ctx->dev);
+ const struct iphdr *tiph = &tunnel->parms.iph;
+ struct rtable *rt;
- if ((iph->ttl = tiph->ttl) == 0)
- iph->ttl = old_iph->ttl;
+ rt = ip_route_output(dev_net(ctx->dev), tiph->daddr, 0, 0, 0,
+ RT_SCOPE_UNIVERSE);
+ if (IS_ERR(rt))
+ return PTR_ERR(rt);
- nf_reset(skb);
+ path->type = DEV_PATH_TUN;
+ path->tun.src_v4.s_addr = tiph->saddr;
+ path->tun.dst_v4.s_addr = tiph->daddr;
+ path->tun.l3_proto = IPPROTO_IPIP;
+ path->dev = ctx->dev;
- IPTUNNEL_XMIT();
- tunnel->recursion--;
- return 0;
+ ctx->dev = rt->dst.dev;
+ ip_rt_put(rt);
-tx_error_icmp:
- dst_link_failure(skb);
-tx_error:
- stats->tx_errors++;
- dev_kfree_skb(skb);
- tunnel->recursion--;
return 0;
}
-static int
-ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
-{
- int err = 0;
- struct ip_tunnel_parm p;
- struct ip_tunnel *t;
-
- switch (cmd) {
- case SIOCGETTUNNEL:
- t = NULL;
- if (dev == ipip_fb_tunnel_dev) {
- if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
- err = -EFAULT;
- break;
- }
- t = ipip_tunnel_locate(&p, 0);
- }
- if (t == NULL)
- t = netdev_priv(dev);
- memcpy(&p, &t->parms, sizeof(p));
- if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
- err = -EFAULT;
- break;
-
- case SIOCADDTUNNEL:
- case SIOCCHGTUNNEL:
- err = -EPERM;
- if (!capable(CAP_NET_ADMIN))
- goto done;
-
- err = -EFAULT;
- if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
- goto done;
-
- err = -EINVAL;
- if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP ||
- p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)))
- goto done;
- if (p.iph.ttl)
- p.iph.frag_off |= htons(IP_DF);
-
- t = ipip_tunnel_locate(&p, cmd == SIOCADDTUNNEL);
-
- if (dev != ipip_fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
- if (t != NULL) {
- if (t->dev != dev) {
- err = -EEXIST;
- break;
- }
- } else {
- if (((dev->flags&IFF_POINTOPOINT) && !p.iph.daddr) ||
- (!(dev->flags&IFF_POINTOPOINT) && p.iph.daddr)) {
- err = -EINVAL;
- break;
- }
- t = netdev_priv(dev);
- ipip_tunnel_unlink(t);
- t->parms.iph.saddr = p.iph.saddr;
- t->parms.iph.daddr = p.iph.daddr;
- memcpy(dev->dev_addr, &p.iph.saddr, 4);
- memcpy(dev->broadcast, &p.iph.daddr, 4);
- ipip_tunnel_link(t);
- netdev_state_change(dev);
- }
- }
+static const struct net_device_ops ipip_netdev_ops = {
+ .ndo_init = ipip_tunnel_init,
+ .ndo_uninit = ip_tunnel_uninit,
+ .ndo_start_xmit = ipip_tunnel_xmit,
+ .ndo_siocdevprivate = ip_tunnel_siocdevprivate,
+ .ndo_change_mtu = ip_tunnel_change_mtu,
+ .ndo_get_stats64 = dev_get_tstats64,
+ .ndo_get_iflink = ip_tunnel_get_iflink,
+ .ndo_tunnel_ctl = ipip_tunnel_ctl,
+ .ndo_fill_forward_path = ipip_fill_forward_path,
+};
- if (t) {
- err = 0;
- if (cmd == SIOCCHGTUNNEL) {
- t->parms.iph.ttl = p.iph.ttl;
- t->parms.iph.tos = p.iph.tos;
- t->parms.iph.frag_off = p.iph.frag_off;
- }
- if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
- err = -EFAULT;
- } else
- err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
- break;
+#define IPIP_FEATURES (NETIF_F_SG | \
+ NETIF_F_FRAGLIST | \
+ NETIF_F_HIGHDMA | \
+ NETIF_F_GSO_SOFTWARE | \
+ NETIF_F_HW_CSUM)
- case SIOCDELTUNNEL:
- err = -EPERM;
- if (!capable(CAP_NET_ADMIN))
- goto done;
-
- if (dev == ipip_fb_tunnel_dev) {
- err = -EFAULT;
- if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
- goto done;
- err = -ENOENT;
- if ((t = ipip_tunnel_locate(&p, 0)) == NULL)
- goto done;
- err = -EPERM;
- if (t->dev == ipip_fb_tunnel_dev)
- goto done;
- dev = t->dev;
- }
- err = unregister_netdevice(dev);
- break;
+static void ipip_tunnel_setup(struct net_device *dev)
+{
+ dev->netdev_ops = &ipip_netdev_ops;
+ dev->header_ops = &ip_tunnel_header_ops;
- default:
- err = -EINVAL;
- }
+ dev->type = ARPHRD_TUNNEL;
+ dev->flags = IFF_NOARP;
+ dev->addr_len = 4;
+ dev->lltx = true;
+ netif_keep_dst(dev);
-done:
- return err;
+ dev->features |= IPIP_FEATURES;
+ dev->hw_features |= IPIP_FEATURES;
+ ip_tunnel_setup(dev, ipip_net_id);
}
-static struct net_device_stats *ipip_tunnel_get_stats(struct net_device *dev)
+static int ipip_tunnel_init(struct net_device *dev)
{
- return &(((struct ip_tunnel*)netdev_priv(dev))->stat);
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+
+ __dev_addr_set(dev, &tunnel->parms.iph.saddr, 4);
+ memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
+
+ tunnel->tun_hlen = 0;
+ tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen;
+ return ip_tunnel_init(dev);
}
-static int ipip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
+static int ipip_tunnel_validate(struct nlattr *tb[], struct nlattr *data[],
+ struct netlink_ext_ack *extack)
{
- if (new_mtu < 68 || new_mtu > 0xFFF8 - sizeof(struct iphdr))
+ u8 proto;
+
+ if (!data || !data[IFLA_IPTUN_PROTO])
+ return 0;
+
+ proto = nla_get_u8(data[IFLA_IPTUN_PROTO]);
+ if (proto != IPPROTO_IPIP && proto != IPPROTO_MPLS && proto != 0)
return -EINVAL;
- dev->mtu = new_mtu;
+
return 0;
}
-static void ipip_tunnel_setup(struct net_device *dev)
+static void ipip_netlink_parms(struct nlattr *data[],
+ struct ip_tunnel_parm_kern *parms,
+ bool *collect_md, __u32 *fwmark)
{
- SET_MODULE_OWNER(dev);
- dev->uninit = ipip_tunnel_uninit;
- dev->hard_start_xmit = ipip_tunnel_xmit;
- dev->get_stats = ipip_tunnel_get_stats;
- dev->do_ioctl = ipip_tunnel_ioctl;
- dev->change_mtu = ipip_tunnel_change_mtu;
- dev->destructor = free_netdev;
+ memset(parms, 0, sizeof(*parms));
- dev->type = ARPHRD_TUNNEL;
- dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr);
- dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr);
- dev->flags = IFF_NOARP;
- dev->iflink = 0;
- dev->addr_len = 4;
+ parms->iph.version = 4;
+ parms->iph.protocol = IPPROTO_IPIP;
+ parms->iph.ihl = 5;
+ *collect_md = false;
+
+ if (!data)
+ return;
+
+ ip_tunnel_netlink_parms(data, parms);
+
+ if (data[IFLA_IPTUN_COLLECT_METADATA])
+ *collect_md = true;
+
+ if (data[IFLA_IPTUN_FWMARK])
+ *fwmark = nla_get_u32(data[IFLA_IPTUN_FWMARK]);
}
-static int ipip_tunnel_init(struct net_device *dev)
+static int ipip_newlink(struct net_device *dev,
+ struct rtnl_newlink_params *params,
+ struct netlink_ext_ack *extack)
{
- struct net_device *tdev = NULL;
- struct ip_tunnel *tunnel;
- struct iphdr *iph;
+ struct ip_tunnel *t = netdev_priv(dev);
+ struct nlattr **data = params->data;
+ struct nlattr **tb = params->tb;
+ struct ip_tunnel_encap ipencap;
+ struct ip_tunnel_parm_kern p;
+ __u32 fwmark = 0;
+
+ if (ip_tunnel_netlink_encap_parms(data, &ipencap)) {
+ int err = ip_tunnel_encap_setup(t, &ipencap);
+
+ if (err < 0)
+ return err;
+ }
- tunnel = netdev_priv(dev);
- iph = &tunnel->parms.iph;
+ ipip_netlink_parms(data, &p, &t->collect_md, &fwmark);
+ return ip_tunnel_newlink(params->link_net ? : dev_net(dev), dev, tb, &p,
+ fwmark);
+}
- tunnel->dev = dev;
- strcpy(tunnel->parms.name, dev->name);
+static int ipip_changelink(struct net_device *dev, struct nlattr *tb[],
+ struct nlattr *data[],
+ struct netlink_ext_ack *extack)
+{
+ struct ip_tunnel *t = netdev_priv(dev);
+ struct ip_tunnel_encap ipencap;
+ struct ip_tunnel_parm_kern p;
+ bool collect_md;
+ __u32 fwmark = t->fwmark;
- memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
- memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
+ if (ip_tunnel_netlink_encap_parms(data, &ipencap)) {
+ int err = ip_tunnel_encap_setup(t, &ipencap);
- if (iph->daddr) {
- struct flowi fl = { .oif = tunnel->parms.link,
- .nl_u = { .ip4_u =
- { .daddr = iph->daddr,
- .saddr = iph->saddr,
- .tos = RT_TOS(iph->tos) } },
- .proto = IPPROTO_IPIP };
- struct rtable *rt;
- if (!ip_route_output_key(&rt, &fl)) {
- tdev = rt->u.dst.dev;
- ip_rt_put(rt);
- }
- dev->flags |= IFF_POINTOPOINT;
+ if (err < 0)
+ return err;
}
- if (!tdev && tunnel->parms.link)
- tdev = __dev_get_by_index(tunnel->parms.link);
+ ipip_netlink_parms(data, &p, &collect_md, &fwmark);
+ if (collect_md)
+ return -EINVAL;
- if (tdev) {
- dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr);
- dev->mtu = tdev->mtu - sizeof(struct iphdr);
- }
- dev->iflink = tunnel->parms.link;
+ if (((dev->flags & IFF_POINTOPOINT) && !p.iph.daddr) ||
+ (!(dev->flags & IFF_POINTOPOINT) && p.iph.daddr))
+ return -EINVAL;
- return 0;
+ return ip_tunnel_changelink(dev, tb, &p, fwmark);
}
-static int __init ipip_fb_tunnel_init(struct net_device *dev)
+static size_t ipip_get_size(const struct net_device *dev)
+{
+ return
+ /* IFLA_IPTUN_LINK */
+ nla_total_size(4) +
+ /* IFLA_IPTUN_LOCAL */
+ nla_total_size(4) +
+ /* IFLA_IPTUN_REMOTE */
+ nla_total_size(4) +
+ /* IFLA_IPTUN_TTL */
+ nla_total_size(1) +
+ /* IFLA_IPTUN_TOS */
+ nla_total_size(1) +
+ /* IFLA_IPTUN_PROTO */
+ nla_total_size(1) +
+ /* IFLA_IPTUN_PMTUDISC */
+ nla_total_size(1) +
+ /* IFLA_IPTUN_ENCAP_TYPE */
+ nla_total_size(2) +
+ /* IFLA_IPTUN_ENCAP_FLAGS */
+ nla_total_size(2) +
+ /* IFLA_IPTUN_ENCAP_SPORT */
+ nla_total_size(2) +
+ /* IFLA_IPTUN_ENCAP_DPORT */
+ nla_total_size(2) +
+ /* IFLA_IPTUN_COLLECT_METADATA */
+ nla_total_size(0) +
+ /* IFLA_IPTUN_FWMARK */
+ nla_total_size(4) +
+ 0;
+}
+
+static int ipip_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
- struct iphdr *iph = &tunnel->parms.iph;
+ struct ip_tunnel_parm_kern *parm = &tunnel->parms;
+
+ if (nla_put_u32(skb, IFLA_IPTUN_LINK, parm->link) ||
+ nla_put_in_addr(skb, IFLA_IPTUN_LOCAL, parm->iph.saddr) ||
+ nla_put_in_addr(skb, IFLA_IPTUN_REMOTE, parm->iph.daddr) ||
+ nla_put_u8(skb, IFLA_IPTUN_TTL, parm->iph.ttl) ||
+ nla_put_u8(skb, IFLA_IPTUN_TOS, parm->iph.tos) ||
+ nla_put_u8(skb, IFLA_IPTUN_PROTO, parm->iph.protocol) ||
+ nla_put_u8(skb, IFLA_IPTUN_PMTUDISC,
+ !!(parm->iph.frag_off & htons(IP_DF))) ||
+ nla_put_u32(skb, IFLA_IPTUN_FWMARK, tunnel->fwmark))
+ goto nla_put_failure;
+
+ if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE,
+ tunnel->encap.type) ||
+ nla_put_be16(skb, IFLA_IPTUN_ENCAP_SPORT,
+ tunnel->encap.sport) ||
+ nla_put_be16(skb, IFLA_IPTUN_ENCAP_DPORT,
+ tunnel->encap.dport) ||
+ nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS,
+ tunnel->encap.flags))
+ goto nla_put_failure;
+
+ if (tunnel->collect_md)
+ if (nla_put_flag(skb, IFLA_IPTUN_COLLECT_METADATA))
+ goto nla_put_failure;
+ return 0;
- tunnel->dev = dev;
- strcpy(tunnel->parms.name, dev->name);
+nla_put_failure:
+ return -EMSGSIZE;
+}
- iph->version = 4;
- iph->protocol = IPPROTO_IPIP;
- iph->ihl = 5;
+static const struct nla_policy ipip_policy[IFLA_IPTUN_MAX + 1] = {
+ [IFLA_IPTUN_LINK] = { .type = NLA_U32 },
+ [IFLA_IPTUN_LOCAL] = { .type = NLA_U32 },
+ [IFLA_IPTUN_REMOTE] = { .type = NLA_U32 },
+ [IFLA_IPTUN_TTL] = { .type = NLA_U8 },
+ [IFLA_IPTUN_TOS] = { .type = NLA_U8 },
+ [IFLA_IPTUN_PROTO] = { .type = NLA_U8 },
+ [IFLA_IPTUN_PMTUDISC] = { .type = NLA_U8 },
+ [IFLA_IPTUN_ENCAP_TYPE] = { .type = NLA_U16 },
+ [IFLA_IPTUN_ENCAP_FLAGS] = { .type = NLA_U16 },
+ [IFLA_IPTUN_ENCAP_SPORT] = { .type = NLA_U16 },
+ [IFLA_IPTUN_ENCAP_DPORT] = { .type = NLA_U16 },
+ [IFLA_IPTUN_COLLECT_METADATA] = { .type = NLA_FLAG },
+ [IFLA_IPTUN_FWMARK] = { .type = NLA_U32 },
+};
- dev_hold(dev);
- tunnels_wc[0] = tunnel;
- return 0;
-}
+static struct rtnl_link_ops ipip_link_ops __read_mostly = {
+ .kind = "ipip",
+ .maxtype = IFLA_IPTUN_MAX,
+ .policy = ipip_policy,
+ .priv_size = sizeof(struct ip_tunnel),
+ .setup = ipip_tunnel_setup,
+ .validate = ipip_tunnel_validate,
+ .newlink = ipip_newlink,
+ .changelink = ipip_changelink,
+ .dellink = ip_tunnel_dellink,
+ .get_size = ipip_get_size,
+ .fill_info = ipip_fill_info,
+ .get_link_net = ip_tunnel_get_link_net,
+};
-static struct xfrm_tunnel ipip_handler = {
+static struct xfrm_tunnel ipip_handler __read_mostly = {
.handler = ipip_rcv,
.err_handler = ipip_err,
.priority = 1,
};
-static char banner[] __initdata =
- KERN_INFO "IPv4 over IPv4 tunneling driver\n";
+#if IS_ENABLED(CONFIG_MPLS)
+static struct xfrm_tunnel mplsip_handler __read_mostly = {
+ .handler = mplsip_rcv,
+ .err_handler = ipip_err,
+ .priority = 1,
+};
+#endif
+
+static int __net_init ipip_init_net(struct net *net)
+{
+ return ip_tunnel_init_net(net, ipip_net_id, &ipip_link_ops, "tunl0");
+}
+
+static void __net_exit ipip_exit_rtnl(struct net *net,
+ struct list_head *dev_to_kill)
+{
+ ip_tunnel_delete_net(net, ipip_net_id, &ipip_link_ops, dev_to_kill);
+}
+
+static struct pernet_operations ipip_net_ops = {
+ .init = ipip_init_net,
+ .exit_rtnl = ipip_exit_rtnl,
+ .id = &ipip_net_id,
+ .size = sizeof(struct ip_tunnel_net),
+};
static int __init ipip_init(void)
{
int err;
- printk(banner);
+ pr_info("ipip: IPv4 and MPLS over IPv4 tunneling driver\n");
- if (xfrm4_tunnel_register(&ipip_handler)) {
- printk(KERN_INFO "ipip init: can't register tunnel\n");
- return -EAGAIN;
+ err = register_pernet_device(&ipip_net_ops);
+ if (err < 0)
+ return err;
+ err = xfrm4_tunnel_register(&ipip_handler, AF_INET);
+ if (err < 0) {
+ pr_info("%s: can't register tunnel\n", __func__);
+ goto xfrm_tunnel_ipip_failed;
}
-
- ipip_fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel),
- "tunl0",
- ipip_tunnel_setup);
- if (!ipip_fb_tunnel_dev) {
- err = -ENOMEM;
- goto err1;
+#if IS_ENABLED(CONFIG_MPLS)
+ err = xfrm4_tunnel_register(&mplsip_handler, AF_MPLS);
+ if (err < 0) {
+ pr_info("%s: can't register tunnel\n", __func__);
+ goto xfrm_tunnel_mplsip_failed;
}
+#endif
+ err = rtnl_link_register(&ipip_link_ops);
+ if (err < 0)
+ goto rtnl_link_failed;
- ipip_fb_tunnel_dev->init = ipip_fb_tunnel_init;
-
- if ((err = register_netdev(ipip_fb_tunnel_dev)))
- goto err2;
- out:
+out:
return err;
- err2:
- free_netdev(ipip_fb_tunnel_dev);
- err1:
- xfrm4_tunnel_deregister(&ipip_handler);
- goto out;
-}
-static void __exit ipip_destroy_tunnels(void)
-{
- int prio;
-
- for (prio = 1; prio < 4; prio++) {
- int h;
- for (h = 0; h < HASH_SIZE; h++) {
- struct ip_tunnel *t;
- while ((t = tunnels[prio][h]) != NULL)
- unregister_netdevice(t->dev);
- }
- }
+rtnl_link_failed:
+#if IS_ENABLED(CONFIG_MPLS)
+ xfrm4_tunnel_deregister(&mplsip_handler, AF_MPLS);
+xfrm_tunnel_mplsip_failed:
+
+#endif
+ xfrm4_tunnel_deregister(&ipip_handler, AF_INET);
+xfrm_tunnel_ipip_failed:
+ unregister_pernet_device(&ipip_net_ops);
+ goto out;
}
static void __exit ipip_fini(void)
{
- if (xfrm4_tunnel_deregister(&ipip_handler))
- printk(KERN_INFO "ipip close: can't deregister tunnel\n");
-
- rtnl_lock();
- ipip_destroy_tunnels();
- unregister_netdevice(ipip_fb_tunnel_dev);
- rtnl_unlock();
+ rtnl_link_unregister(&ipip_link_ops);
+ if (xfrm4_tunnel_deregister(&ipip_handler, AF_INET))
+ pr_info("%s: can't deregister tunnel\n", __func__);
+#if IS_ENABLED(CONFIG_MPLS)
+ if (xfrm4_tunnel_deregister(&mplsip_handler, AF_MPLS))
+ pr_info("%s: can't deregister tunnel\n", __func__);
+#endif
+ unregister_pernet_device(&ipip_net_ops);
}
module_init(ipip_init);
module_exit(ipip_fini);
+MODULE_DESCRIPTION("IP/IP protocol decoder library");
MODULE_LICENSE("GPL");
+MODULE_ALIAS_RTNL_LINK("ipip");
+MODULE_ALIAS_NETDEV("tunl0");
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index efcf45ecc818..ca9eaee4c2ef 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -1,16 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* IP multicast routing support for mrouted 3.6/3.8
*
- * (c) 1995 Alan Cox, <alan@redhat.com>
+ * (c) 1995 Alan Cox, <alan@lxorguk.ukuu.org.uk>
* Linux Consultancy and Custom Driver Development
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * Version: $Id: ipmr.c,v 1.65 2001/10/31 21:55:54 davem Exp $
- *
* Fixes:
* Michael Chastain : Incorrect size of copying.
* Alan Cox : Added the cache manager code
@@ -24,17 +18,14 @@
* overflow.
* Carlos Picoto : PIMv1 Support
* Pavlin Ivanov Radoslavov: PIMv2 Registers must checksum only PIM header
- * Relax this requrement to work with older peers.
- *
+ * Relax this requirement to work with older peers.
*/
-#include <asm/system.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/types.h>
-#include <linux/sched.h>
+#include <linux/cache.h>
#include <linux/capability.h>
#include <linux/errno.h>
-#include <linux/timer.h>
#include <linux/mm.h>
#include <linux/kernel.h>
#include <linux/fcntl.h>
@@ -50,322 +41,770 @@
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/if_ether.h>
+#include <linux/slab.h>
+#include <net/flow.h>
+#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/route.h>
-#include <net/sock.h>
#include <net/icmp.h>
#include <net/udp.h>
#include <net/raw.h>
#include <linux/notifier.h>
#include <linux/if_arp.h>
#include <linux/netfilter_ipv4.h>
-#include <net/ipip.h>
+#include <linux/compat.h>
+#include <linux/export.h>
+#include <linux/rhashtable.h>
+#include <net/ip_tunnels.h>
#include <net/checksum.h>
+#include <net/netlink.h>
+#include <net/fib_rules.h>
+#include <linux/netconf.h>
+#include <net/rtnh.h>
+#include <net/inet_dscp.h>
-#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
-#define CONFIG_IP_PIMSM 1
-#endif
+#include <linux/nospec.h>
-static struct sock *mroute_socket;
+struct ipmr_rule {
+ struct fib_rule common;
+};
+struct ipmr_result {
+ struct mr_table *mrt;
+};
/* Big lock, protecting vif table, mrt cache and mroute socket state.
- Note that the changes are semaphored via rtnl_lock.
+ * Note that the changes are semaphored via rtnl_lock.
*/
-static DEFINE_RWLOCK(mrt_lock);
+static DEFINE_SPINLOCK(mrt_lock);
-/*
- * Multicast router control variables
- */
+static struct net_device *vif_dev_read(const struct vif_device *vif)
+{
+ return rcu_dereference(vif->dev);
+}
-static struct vif_device vif_table[MAXVIFS]; /* Devices */
-static int maxvif;
+/* Multicast router control variables */
-#define VIF_EXISTS(idx) (vif_table[idx].dev != NULL)
+/* Special spinlock for queue of unresolved entries */
+static DEFINE_SPINLOCK(mfc_unres_lock);
-static int mroute_do_assert; /* Set in PIM assert */
-static int mroute_do_pim;
+/* We return to original Alan's scheme. Hash table of resolved
+ * entries is changed only in process context and protected
+ * with weak lock mrt_lock. Queue of unresolved entries is protected
+ * with strong spinlock mfc_unres_lock.
+ *
+ * In this case data path is free of exclusive locks at all.
+ */
-static struct mfc_cache *mfc_cache_array[MFC_LINES]; /* Forwarding cache */
+static struct kmem_cache *mrt_cachep __ro_after_init;
+
+static struct mr_table *ipmr_new_table(struct net *net, u32 id);
+static void ipmr_free_table(struct mr_table *mrt);
+
+static void ip_mr_forward(struct net *net, struct mr_table *mrt,
+ struct net_device *dev, struct sk_buff *skb,
+ struct mfc_cache *cache, int local);
+static int ipmr_cache_report(const struct mr_table *mrt,
+ struct sk_buff *pkt, vifi_t vifi, int assert);
+static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc,
+ int cmd);
+static void igmpmsg_netlink_event(const struct mr_table *mrt, struct sk_buff *pkt);
+static void mroute_clean_tables(struct mr_table *mrt, int flags);
+static void ipmr_expire_process(struct timer_list *t);
+
+#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
+#define ipmr_for_each_table(mrt, net) \
+ list_for_each_entry_rcu(mrt, &net->ipv4.mr_tables, list, \
+ lockdep_rtnl_is_held() || \
+ list_empty(&net->ipv4.mr_tables))
+
+static struct mr_table *ipmr_mr_table_iter(struct net *net,
+ struct mr_table *mrt)
+{
+ struct mr_table *ret;
-static struct mfc_cache *mfc_unres_queue; /* Queue of unresolved entries */
-static atomic_t cache_resolve_queue_len; /* Size of unresolved */
+ if (!mrt)
+ ret = list_entry_rcu(net->ipv4.mr_tables.next,
+ struct mr_table, list);
+ else
+ ret = list_entry_rcu(mrt->list.next,
+ struct mr_table, list);
-/* Special spinlock for queue of unresolved entries */
-static DEFINE_SPINLOCK(mfc_unres_lock);
+ if (&ret->list == &net->ipv4.mr_tables)
+ return NULL;
+ return ret;
+}
-/* We return to original Alan's scheme. Hash table of resolved
- entries is changed only in process context and protected
- with weak lock mrt_lock. Queue of unresolved entries is protected
- with strong spinlock mfc_unres_lock.
+static struct mr_table *__ipmr_get_table(struct net *net, u32 id)
+{
+ struct mr_table *mrt;
- In this case data path is free of exclusive locks at all.
- */
+ ipmr_for_each_table(mrt, net) {
+ if (mrt->id == id)
+ return mrt;
+ }
+ return NULL;
+}
-static kmem_cache_t *mrt_cachep __read_mostly;
+static struct mr_table *ipmr_get_table(struct net *net, u32 id)
+{
+ struct mr_table *mrt;
-static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local);
-static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert);
-static int ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm);
+ rcu_read_lock();
+ mrt = __ipmr_get_table(net, id);
+ rcu_read_unlock();
+ return mrt;
+}
-#ifdef CONFIG_IP_PIMSM_V2
-static struct net_protocol pim_protocol;
-#endif
+static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4,
+ struct mr_table **mrt)
+{
+ int err;
+ struct ipmr_result res;
+ struct fib_lookup_arg arg = {
+ .result = &res,
+ .flags = FIB_LOOKUP_NOREF,
+ };
+
+ /* update flow if oif or iif point to device enslaved to l3mdev */
+ l3mdev_update_flow(net, flowi4_to_flowi(flp4));
+
+ err = fib_rules_lookup(net->ipv4.mr_rules_ops,
+ flowi4_to_flowi(flp4), 0, &arg);
+ if (err < 0)
+ return err;
+ *mrt = res.mrt;
+ return 0;
+}
-static struct timer_list ipmr_expire_timer;
+static int ipmr_rule_action(struct fib_rule *rule, struct flowi *flp,
+ int flags, struct fib_lookup_arg *arg)
+{
+ struct ipmr_result *res = arg->result;
+ struct mr_table *mrt;
-/* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */
+ switch (rule->action) {
+ case FR_ACT_TO_TBL:
+ break;
+ case FR_ACT_UNREACHABLE:
+ return -ENETUNREACH;
+ case FR_ACT_PROHIBIT:
+ return -EACCES;
+ case FR_ACT_BLACKHOLE:
+ default:
+ return -EINVAL;
+ }
-static
-struct net_device *ipmr_new_tunnel(struct vifctl *v)
+ arg->table = fib_rule_get_table(rule, arg);
+
+ mrt = __ipmr_get_table(rule->fr_net, arg->table);
+ if (!mrt)
+ return -EAGAIN;
+ res->mrt = mrt;
+ return 0;
+}
+
+static int ipmr_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
{
- struct net_device *dev;
+ return 1;
+}
- dev = __dev_get_by_name("tunl0");
+static int ipmr_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
+ struct fib_rule_hdr *frh, struct nlattr **tb,
+ struct netlink_ext_ack *extack)
+{
+ return 0;
+}
- if (dev) {
- int err;
- struct ifreq ifr;
- mm_segment_t oldfs;
- struct ip_tunnel_parm p;
- struct in_device *in_dev;
+static int ipmr_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
+ struct nlattr **tb)
+{
+ return 1;
+}
- memset(&p, 0, sizeof(p));
- p.iph.daddr = v->vifc_rmt_addr.s_addr;
- p.iph.saddr = v->vifc_lcl_addr.s_addr;
- p.iph.version = 4;
- p.iph.ihl = 5;
- p.iph.protocol = IPPROTO_IPIP;
- sprintf(p.name, "dvmrp%d", v->vifc_vifi);
- ifr.ifr_ifru.ifru_data = (void*)&p;
+static int ipmr_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
+ struct fib_rule_hdr *frh)
+{
+ frh->dst_len = 0;
+ frh->src_len = 0;
+ frh->tos = 0;
+ return 0;
+}
- oldfs = get_fs(); set_fs(KERNEL_DS);
- err = dev->do_ioctl(dev, &ifr, SIOCADDTUNNEL);
- set_fs(oldfs);
+static const struct fib_rules_ops __net_initconst ipmr_rules_ops_template = {
+ .family = RTNL_FAMILY_IPMR,
+ .rule_size = sizeof(struct ipmr_rule),
+ .addr_size = sizeof(u32),
+ .action = ipmr_rule_action,
+ .match = ipmr_rule_match,
+ .configure = ipmr_rule_configure,
+ .compare = ipmr_rule_compare,
+ .fill = ipmr_rule_fill,
+ .nlgroup = RTNLGRP_IPV4_RULE,
+ .owner = THIS_MODULE,
+};
- dev = NULL;
+static int __net_init ipmr_rules_init(struct net *net)
+{
+ struct fib_rules_ops *ops;
+ struct mr_table *mrt;
+ int err;
- if (err == 0 && (dev = __dev_get_by_name(p.name)) != NULL) {
- dev->flags |= IFF_MULTICAST;
+ ops = fib_rules_register(&ipmr_rules_ops_template, net);
+ if (IS_ERR(ops))
+ return PTR_ERR(ops);
- in_dev = __in_dev_get_rtnl(dev);
- if (in_dev == NULL && (in_dev = inetdev_init(dev)) == NULL)
- goto failure;
- in_dev->cnf.rp_filter = 0;
+ INIT_LIST_HEAD(&net->ipv4.mr_tables);
- if (dev_open(dev))
- goto failure;
- }
+ mrt = ipmr_new_table(net, RT_TABLE_DEFAULT);
+ if (IS_ERR(mrt)) {
+ err = PTR_ERR(mrt);
+ goto err1;
}
- return dev;
-failure:
- /* allow the register to be completed before unregistering. */
- rtnl_unlock();
+ err = fib_default_rule_add(ops, 0x7fff, RT_TABLE_DEFAULT);
+ if (err < 0)
+ goto err2;
+
+ net->ipv4.mr_rules_ops = ops;
+ return 0;
+
+err2:
rtnl_lock();
+ ipmr_free_table(mrt);
+ rtnl_unlock();
+err1:
+ fib_rules_unregister(ops);
+ return err;
+}
- unregister_netdevice(dev);
+static void __net_exit ipmr_rules_exit(struct net *net)
+{
+ struct mr_table *mrt, *next;
+
+ ASSERT_RTNL();
+ list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) {
+ list_del(&mrt->list);
+ ipmr_free_table(mrt);
+ }
+ fib_rules_unregister(net->ipv4.mr_rules_ops);
+}
+
+static int ipmr_rules_dump(struct net *net, struct notifier_block *nb,
+ struct netlink_ext_ack *extack)
+{
+ return fib_rules_dump(net, nb, RTNL_FAMILY_IPMR, extack);
+}
+
+static unsigned int ipmr_rules_seq_read(const struct net *net)
+{
+ return fib_rules_seq_read(net, RTNL_FAMILY_IPMR);
+}
+
+bool ipmr_rule_default(const struct fib_rule *rule)
+{
+ return fib_rule_matchall(rule) && rule->table == RT_TABLE_DEFAULT;
+}
+EXPORT_SYMBOL(ipmr_rule_default);
+#else
+#define ipmr_for_each_table(mrt, net) \
+ for (mrt = net->ipv4.mrt; mrt; mrt = NULL)
+
+static struct mr_table *ipmr_mr_table_iter(struct net *net,
+ struct mr_table *mrt)
+{
+ if (!mrt)
+ return net->ipv4.mrt;
return NULL;
}
-#ifdef CONFIG_IP_PIMSM
+static struct mr_table *ipmr_get_table(struct net *net, u32 id)
+{
+ return net->ipv4.mrt;
+}
-static int reg_vif_num = -1;
+#define __ipmr_get_table ipmr_get_table
-static int reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
+static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4,
+ struct mr_table **mrt)
{
- read_lock(&mrt_lock);
- ((struct net_device_stats*)netdev_priv(dev))->tx_bytes += skb->len;
- ((struct net_device_stats*)netdev_priv(dev))->tx_packets++;
- ipmr_cache_report(skb, reg_vif_num, IGMPMSG_WHOLEPKT);
- read_unlock(&mrt_lock);
- kfree_skb(skb);
+ *mrt = net->ipv4.mrt;
return 0;
}
-static struct net_device_stats *reg_vif_get_stats(struct net_device *dev)
+static int __net_init ipmr_rules_init(struct net *net)
+{
+ struct mr_table *mrt;
+
+ mrt = ipmr_new_table(net, RT_TABLE_DEFAULT);
+ if (IS_ERR(mrt))
+ return PTR_ERR(mrt);
+ net->ipv4.mrt = mrt;
+ return 0;
+}
+
+static void __net_exit ipmr_rules_exit(struct net *net)
+{
+ ASSERT_RTNL();
+ ipmr_free_table(net->ipv4.mrt);
+ net->ipv4.mrt = NULL;
+}
+
+static int ipmr_rules_dump(struct net *net, struct notifier_block *nb,
+ struct netlink_ext_ack *extack)
+{
+ return 0;
+}
+
+static unsigned int ipmr_rules_seq_read(const struct net *net)
+{
+ return 0;
+}
+
+bool ipmr_rule_default(const struct fib_rule *rule)
+{
+ return true;
+}
+EXPORT_SYMBOL(ipmr_rule_default);
+#endif
+
+static inline int ipmr_hash_cmp(struct rhashtable_compare_arg *arg,
+ const void *ptr)
+{
+ const struct mfc_cache_cmp_arg *cmparg = arg->key;
+ const struct mfc_cache *c = ptr;
+
+ return cmparg->mfc_mcastgrp != c->mfc_mcastgrp ||
+ cmparg->mfc_origin != c->mfc_origin;
+}
+
+static const struct rhashtable_params ipmr_rht_params = {
+ .head_offset = offsetof(struct mr_mfc, mnode),
+ .key_offset = offsetof(struct mfc_cache, cmparg),
+ .key_len = sizeof(struct mfc_cache_cmp_arg),
+ .nelem_hint = 3,
+ .obj_cmpfn = ipmr_hash_cmp,
+ .automatic_shrinking = true,
+};
+
+static void ipmr_new_table_set(struct mr_table *mrt,
+ struct net *net)
+{
+#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
+ list_add_tail_rcu(&mrt->list, &net->ipv4.mr_tables);
+#endif
+}
+
+static struct mfc_cache_cmp_arg ipmr_mr_table_ops_cmparg_any = {
+ .mfc_mcastgrp = htonl(INADDR_ANY),
+ .mfc_origin = htonl(INADDR_ANY),
+};
+
+static struct mr_table_ops ipmr_mr_table_ops = {
+ .rht_params = &ipmr_rht_params,
+ .cmparg_any = &ipmr_mr_table_ops_cmparg_any,
+};
+
+static struct mr_table *ipmr_new_table(struct net *net, u32 id)
+{
+ struct mr_table *mrt;
+
+ /* "pimreg%u" should not exceed 16 bytes (IFNAMSIZ) */
+ if (id != RT_TABLE_DEFAULT && id >= 1000000000)
+ return ERR_PTR(-EINVAL);
+
+ mrt = __ipmr_get_table(net, id);
+ if (mrt)
+ return mrt;
+
+ return mr_table_alloc(net, id, &ipmr_mr_table_ops,
+ ipmr_expire_process, ipmr_new_table_set);
+}
+
+static void ipmr_free_table(struct mr_table *mrt)
+{
+ struct net *net = read_pnet(&mrt->net);
+
+ WARN_ON_ONCE(!mr_can_free_table(net));
+
+ timer_shutdown_sync(&mrt->ipmr_expire_timer);
+ mroute_clean_tables(mrt, MRT_FLUSH_VIFS | MRT_FLUSH_VIFS_STATIC |
+ MRT_FLUSH_MFC | MRT_FLUSH_MFC_STATIC);
+ rhltable_destroy(&mrt->mfc_hash);
+ kfree(mrt);
+}
+
+/* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */
+
+/* Initialize ipmr pimreg/tunnel in_device */
+static bool ipmr_init_vif_indev(const struct net_device *dev)
+{
+ struct in_device *in_dev;
+
+ ASSERT_RTNL();
+
+ in_dev = __in_dev_get_rtnl(dev);
+ if (!in_dev)
+ return false;
+ ipv4_devconf_setall(in_dev);
+ neigh_parms_data_state_setall(in_dev->arp_parms);
+ IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;
+
+ return true;
+}
+
+static struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v)
+{
+ struct net_device *tunnel_dev, *new_dev;
+ struct ip_tunnel_parm_kern p = { };
+ int err;
+
+ tunnel_dev = __dev_get_by_name(net, "tunl0");
+ if (!tunnel_dev)
+ goto out;
+
+ p.iph.daddr = v->vifc_rmt_addr.s_addr;
+ p.iph.saddr = v->vifc_lcl_addr.s_addr;
+ p.iph.version = 4;
+ p.iph.ihl = 5;
+ p.iph.protocol = IPPROTO_IPIP;
+ sprintf(p.name, "dvmrp%d", v->vifc_vifi);
+
+ if (!tunnel_dev->netdev_ops->ndo_tunnel_ctl)
+ goto out;
+ err = tunnel_dev->netdev_ops->ndo_tunnel_ctl(tunnel_dev, &p,
+ SIOCADDTUNNEL);
+ if (err)
+ goto out;
+
+ new_dev = __dev_get_by_name(net, p.name);
+ if (!new_dev)
+ goto out;
+
+ new_dev->flags |= IFF_MULTICAST;
+ if (!ipmr_init_vif_indev(new_dev))
+ goto out_unregister;
+ if (dev_open(new_dev, NULL))
+ goto out_unregister;
+ dev_hold(new_dev);
+ err = dev_set_allmulti(new_dev, 1);
+ if (err) {
+ dev_close(new_dev);
+ tunnel_dev->netdev_ops->ndo_tunnel_ctl(tunnel_dev, &p,
+ SIOCDELTUNNEL);
+ dev_put(new_dev);
+ new_dev = ERR_PTR(err);
+ }
+ return new_dev;
+
+out_unregister:
+ unregister_netdevice(new_dev);
+out:
+ return ERR_PTR(-ENOBUFS);
+}
+
+#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
+static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct net *net = dev_net(dev);
+ struct mr_table *mrt;
+ struct flowi4 fl4 = {
+ .flowi4_oif = dev->ifindex,
+ .flowi4_iif = skb->skb_iif ? : LOOPBACK_IFINDEX,
+ .flowi4_mark = skb->mark,
+ };
+ int err;
+
+ err = ipmr_fib_lookup(net, &fl4, &mrt);
+ if (err < 0) {
+ kfree_skb(skb);
+ return err;
+ }
+
+ DEV_STATS_ADD(dev, tx_bytes, skb->len);
+ DEV_STATS_INC(dev, tx_packets);
+ rcu_read_lock();
+
+ /* Pairs with WRITE_ONCE() in vif_add() and vif_delete() */
+ ipmr_cache_report(mrt, skb, READ_ONCE(mrt->mroute_reg_vif_num),
+ IGMPMSG_WHOLEPKT);
+
+ rcu_read_unlock();
+ kfree_skb(skb);
+ return NETDEV_TX_OK;
+}
+
+static int reg_vif_get_iflink(const struct net_device *dev)
{
- return (struct net_device_stats*)netdev_priv(dev);
+ return 0;
}
+static const struct net_device_ops reg_vif_netdev_ops = {
+ .ndo_start_xmit = reg_vif_xmit,
+ .ndo_get_iflink = reg_vif_get_iflink,
+};
+
static void reg_vif_setup(struct net_device *dev)
{
dev->type = ARPHRD_PIMREG;
dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 8;
dev->flags = IFF_NOARP;
- dev->hard_start_xmit = reg_vif_xmit;
- dev->get_stats = reg_vif_get_stats;
- dev->destructor = free_netdev;
+ dev->netdev_ops = &reg_vif_netdev_ops;
+ dev->needs_free_netdev = true;
+ dev->netns_immutable = true;
}
-static struct net_device *ipmr_reg_vif(void)
+static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
{
struct net_device *dev;
- struct in_device *in_dev;
+ char name[IFNAMSIZ];
- dev = alloc_netdev(sizeof(struct net_device_stats), "pimreg",
- reg_vif_setup);
+ if (mrt->id == RT_TABLE_DEFAULT)
+ sprintf(name, "pimreg");
+ else
+ sprintf(name, "pimreg%u", mrt->id);
+
+ dev = alloc_netdev(0, name, NET_NAME_UNKNOWN, reg_vif_setup);
- if (dev == NULL)
+ if (!dev)
return NULL;
+ dev_net_set(dev, net);
+
if (register_netdevice(dev)) {
free_netdev(dev);
return NULL;
}
- dev->iflink = 0;
- if ((in_dev = inetdev_init(dev)) == NULL)
+ if (!ipmr_init_vif_indev(dev))
goto failure;
-
- in_dev->cnf.rp_filter = 0;
-
- if (dev_open(dev))
+ if (dev_open(dev, NULL))
goto failure;
+ dev_hold(dev);
+
return dev;
failure:
- /* allow the register to be completed before unregistering. */
- rtnl_unlock();
- rtnl_lock();
-
unregister_netdevice(dev);
return NULL;
}
+
+/* called with rcu_read_lock() */
+static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
+ unsigned int pimlen)
+{
+ struct net_device *reg_dev = NULL;
+ struct iphdr *encap;
+ int vif_num;
+
+ encap = (struct iphdr *)(skb_transport_header(skb) + pimlen);
+ /* Check that:
+ * a. packet is really sent to a multicast group
+ * b. packet is not a NULL-REGISTER
+ * c. packet is not truncated
+ */
+ if (!ipv4_is_multicast(encap->daddr) ||
+ encap->tot_len == 0 ||
+ ntohs(encap->tot_len) + pimlen > skb->len)
+ return 1;
+
+ /* Pairs with WRITE_ONCE() in vif_add()/vid_delete() */
+ vif_num = READ_ONCE(mrt->mroute_reg_vif_num);
+ if (vif_num >= 0)
+ reg_dev = vif_dev_read(&mrt->vif_table[vif_num]);
+ if (!reg_dev)
+ return 1;
+
+ skb->mac_header = skb->network_header;
+ skb_pull(skb, (u8 *)encap - skb->data);
+ skb_reset_network_header(skb);
+ skb->protocol = htons(ETH_P_IP);
+ skb->ip_summed = CHECKSUM_NONE;
+
+ skb_tunnel_rx(skb, reg_dev, dev_net(reg_dev));
+
+ netif_rx(skb);
+
+ return NET_RX_SUCCESS;
+}
+#else
+static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
+{
+ return NULL;
+}
#endif
-/*
- * Delete a VIF entry
+static int call_ipmr_vif_entry_notifiers(struct net *net,
+ enum fib_event_type event_type,
+ struct vif_device *vif,
+ struct net_device *vif_dev,
+ vifi_t vif_index, u32 tb_id)
+{
+ return mr_call_vif_notifiers(net, RTNL_FAMILY_IPMR, event_type,
+ vif, vif_dev, vif_index, tb_id,
+ &net->ipv4.ipmr_seq);
+}
+
+static int call_ipmr_mfc_entry_notifiers(struct net *net,
+ enum fib_event_type event_type,
+ struct mfc_cache *mfc, u32 tb_id)
+{
+ return mr_call_mfc_notifiers(net, RTNL_FAMILY_IPMR, event_type,
+ &mfc->_c, tb_id, &net->ipv4.ipmr_seq);
+}
+
+/**
+ * vif_delete - Delete a VIF entry
+ * @mrt: Table to delete from
+ * @vifi: VIF identifier to delete
+ * @notify: Set to 1, if the caller is a notifier_call
+ * @head: if unregistering the VIF, place it on this queue
*/
-
-static int vif_delete(int vifi)
+static int vif_delete(struct mr_table *mrt, int vifi, int notify,
+ struct list_head *head)
{
+ struct net *net = read_pnet(&mrt->net);
struct vif_device *v;
struct net_device *dev;
struct in_device *in_dev;
- if (vifi < 0 || vifi >= maxvif)
+ if (vifi < 0 || vifi >= mrt->maxvif)
return -EADDRNOTAVAIL;
- v = &vif_table[vifi];
-
- write_lock_bh(&mrt_lock);
- dev = v->dev;
- v->dev = NULL;
+ v = &mrt->vif_table[vifi];
- if (!dev) {
- write_unlock_bh(&mrt_lock);
+ dev = rtnl_dereference(v->dev);
+ if (!dev)
return -EADDRNOTAVAIL;
- }
-#ifdef CONFIG_IP_PIMSM
- if (vifi == reg_vif_num)
- reg_vif_num = -1;
-#endif
+ spin_lock(&mrt_lock);
+ call_ipmr_vif_entry_notifiers(net, FIB_EVENT_VIF_DEL, v, dev,
+ vifi, mrt->id);
+ RCU_INIT_POINTER(v->dev, NULL);
- if (vifi+1 == maxvif) {
+ if (vifi == mrt->mroute_reg_vif_num) {
+ /* Pairs with READ_ONCE() in ipmr_cache_report() and reg_vif_xmit() */
+ WRITE_ONCE(mrt->mroute_reg_vif_num, -1);
+ }
+ if (vifi + 1 == mrt->maxvif) {
int tmp;
- for (tmp=vifi-1; tmp>=0; tmp--) {
- if (VIF_EXISTS(tmp))
+
+ for (tmp = vifi - 1; tmp >= 0; tmp--) {
+ if (VIF_EXISTS(mrt, tmp))
break;
}
- maxvif = tmp+1;
+ WRITE_ONCE(mrt->maxvif, tmp + 1);
}
- write_unlock_bh(&mrt_lock);
+ spin_unlock(&mrt_lock);
dev_set_allmulti(dev, -1);
- if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) {
- in_dev->cnf.mc_forwarding--;
+ in_dev = __in_dev_get_rtnl(dev);
+ if (in_dev) {
+ IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--;
+ inet_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF,
+ NETCONFA_MC_FORWARDING,
+ dev->ifindex, &in_dev->cnf);
ip_rt_multicast_event(in_dev);
}
- if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
- unregister_netdevice(dev);
+ if (v->flags & (VIFF_TUNNEL | VIFF_REGISTER) && !notify)
+ unregister_netdevice_queue(dev, head);
- dev_put(dev);
+ netdev_put(dev, &v->dev_tracker);
return 0;
}
+static void ipmr_cache_free_rcu(struct rcu_head *head)
+{
+ struct mr_mfc *c = container_of(head, struct mr_mfc, rcu);
+
+ kmem_cache_free(mrt_cachep, (struct mfc_cache *)c);
+}
+
+static void ipmr_cache_free(struct mfc_cache *c)
+{
+ call_rcu(&c->_c.rcu, ipmr_cache_free_rcu);
+}
+
/* Destroy an unresolved cache entry, killing queued skbs
- and reporting error to netlink readers.
+ * and reporting error to netlink readers.
*/
-
-static void ipmr_destroy_unres(struct mfc_cache *c)
+static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c)
{
+ struct net *net = read_pnet(&mrt->net);
struct sk_buff *skb;
struct nlmsgerr *e;
- atomic_dec(&cache_resolve_queue_len);
+ atomic_dec(&mrt->cache_resolve_queue_len);
- while((skb=skb_dequeue(&c->mfc_un.unres.unresolved))) {
- if (skb->nh.iph->version == 0) {
- struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
+ while ((skb = skb_dequeue(&c->_c.mfc_un.unres.unresolved))) {
+ if (ip_hdr(skb)->version == 0) {
+ struct nlmsghdr *nlh = skb_pull(skb,
+ sizeof(struct iphdr));
nlh->nlmsg_type = NLMSG_ERROR;
- nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
+ nlh->nlmsg_len = nlmsg_msg_size(sizeof(struct nlmsgerr));
skb_trim(skb, nlh->nlmsg_len);
- e = NLMSG_DATA(nlh);
+ e = nlmsg_data(nlh);
e->error = -ETIMEDOUT;
memset(&e->msg, 0, sizeof(e->msg));
- rtnl_unicast(skb, NETLINK_CB(skb).pid);
- } else
+ rtnl_unicast(skb, net, NETLINK_CB(skb).portid);
+ } else {
kfree_skb(skb);
+ }
}
- kmem_cache_free(mrt_cachep, c);
+ ipmr_cache_free(c);
}
-
-/* Single timer process for all the unresolved queue. */
-
-static void ipmr_expire_process(unsigned long dummy)
+/* Timer process for the unresolved queue. */
+static void ipmr_expire_process(struct timer_list *t)
{
- unsigned long now;
+ struct mr_table *mrt = timer_container_of(mrt, t, ipmr_expire_timer);
+ struct mr_mfc *c, *next;
unsigned long expires;
- struct mfc_cache *c, **cp;
+ unsigned long now;
if (!spin_trylock(&mfc_unres_lock)) {
- mod_timer(&ipmr_expire_timer, jiffies+HZ/10);
+ mod_timer(&mrt->ipmr_expire_timer, jiffies+HZ/10);
return;
}
- if (atomic_read(&cache_resolve_queue_len) == 0)
+ if (list_empty(&mrt->mfc_unres_queue))
goto out;
now = jiffies;
expires = 10*HZ;
- cp = &mfc_unres_queue;
- while ((c=*cp) != NULL) {
+ list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) {
if (time_after(c->mfc_un.unres.expires, now)) {
unsigned long interval = c->mfc_un.unres.expires - now;
if (interval < expires)
expires = interval;
- cp = &c->next;
continue;
}
- *cp = c->next;
-
- ipmr_destroy_unres(c);
+ list_del(&c->list);
+ mroute_netlink_event(mrt, (struct mfc_cache *)c, RTM_DELROUTE);
+ ipmr_destroy_unres(mrt, (struct mfc_cache *)c);
}
- if (atomic_read(&cache_resolve_queue_len))
- mod_timer(&ipmr_expire_timer, jiffies + expires);
+ if (!list_empty(&mrt->mfc_unres_queue))
+ mod_timer(&mrt->ipmr_expire_timer, jiffies + expires);
out:
spin_unlock(&mfc_unres_lock);
}
-/* Fill oifs list. It is called under write locked mrt_lock. */
-
-static void ipmr_update_thresholds(struct mfc_cache *cache, unsigned char *ttls)
+/* Fill oifs list. It is called under locked mrt_lock. */
+static void ipmr_update_thresholds(struct mr_table *mrt, struct mr_mfc *cache,
+ unsigned char *ttls)
{
int vifi;
@@ -373,8 +812,9 @@ static void ipmr_update_thresholds(struct mfc_cache *cache, unsigned char *ttls)
cache->mfc_un.res.maxvif = 0;
memset(cache->mfc_un.res.ttls, 255, MAXVIFS);
- for (vifi=0; vifi<maxvif; vifi++) {
- if (VIF_EXISTS(vifi) && ttls[vifi] && ttls[vifi] < 255) {
+ for (vifi = 0; vifi < mrt->maxvif; vifi++) {
+ if (VIF_EXISTS(mrt, vifi) &&
+ ttls[vifi] && ttls[vifi] < 255) {
cache->mfc_un.res.ttls[vifi] = ttls[vifi];
if (cache->mfc_un.res.minvif > vifi)
cache->mfc_un.res.minvif = vifi;
@@ -382,306 +822,366 @@ static void ipmr_update_thresholds(struct mfc_cache *cache, unsigned char *ttls)
cache->mfc_un.res.maxvif = vifi + 1;
}
}
+ WRITE_ONCE(cache->mfc_un.res.lastuse, jiffies);
}
-static int vif_add(struct vifctl *vifc, int mrtsock)
+static int vif_add(struct net *net, struct mr_table *mrt,
+ struct vifctl *vifc, int mrtsock)
{
+ struct netdev_phys_item_id ppid = { };
int vifi = vifc->vifc_vifi;
- struct vif_device *v = &vif_table[vifi];
+ struct vif_device *v = &mrt->vif_table[vifi];
struct net_device *dev;
struct in_device *in_dev;
+ int err;
/* Is vif busy ? */
- if (VIF_EXISTS(vifi))
+ if (VIF_EXISTS(mrt, vifi))
return -EADDRINUSE;
switch (vifc->vifc_flags) {
-#ifdef CONFIG_IP_PIMSM
case VIFF_REGISTER:
- /*
- * Special Purpose VIF in PIM
+ if (!ipmr_pimsm_enabled())
+ return -EINVAL;
+ /* Special Purpose VIF in PIM
* All the packets will be sent to the daemon
*/
- if (reg_vif_num >= 0)
+ if (mrt->mroute_reg_vif_num >= 0)
return -EADDRINUSE;
- dev = ipmr_reg_vif();
+ dev = ipmr_reg_vif(net, mrt);
if (!dev)
return -ENOBUFS;
+ err = dev_set_allmulti(dev, 1);
+ if (err) {
+ unregister_netdevice(dev);
+ dev_put(dev);
+ return err;
+ }
break;
-#endif
- case VIFF_TUNNEL:
- dev = ipmr_new_tunnel(vifc);
- if (!dev)
- return -ENOBUFS;
+ case VIFF_TUNNEL:
+ dev = ipmr_new_tunnel(net, vifc);
+ if (IS_ERR(dev))
+ return PTR_ERR(dev);
break;
+ case VIFF_USE_IFINDEX:
case 0:
- dev = ip_dev_find(vifc->vifc_lcl_addr.s_addr);
+ if (vifc->vifc_flags == VIFF_USE_IFINDEX) {
+ dev = dev_get_by_index(net, vifc->vifc_lcl_ifindex);
+ if (dev && !__in_dev_get_rtnl(dev)) {
+ dev_put(dev);
+ return -EADDRNOTAVAIL;
+ }
+ } else {
+ dev = ip_dev_find(net, vifc->vifc_lcl_addr.s_addr);
+ }
if (!dev)
return -EADDRNOTAVAIL;
- dev_put(dev);
+ err = dev_set_allmulti(dev, 1);
+ if (err) {
+ dev_put(dev);
+ return err;
+ }
break;
default:
return -EINVAL;
}
- if ((in_dev = __in_dev_get_rtnl(dev)) == NULL)
+ in_dev = __in_dev_get_rtnl(dev);
+ if (!in_dev) {
+ dev_put(dev);
return -EADDRNOTAVAIL;
- in_dev->cnf.mc_forwarding++;
- dev_set_allmulti(dev, +1);
+ }
+ IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++;
+ inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_MC_FORWARDING,
+ dev->ifindex, &in_dev->cnf);
ip_rt_multicast_event(in_dev);
- /*
- * Fill in the VIF structures
- */
- v->rate_limit=vifc->vifc_rate_limit;
- v->local=vifc->vifc_lcl_addr.s_addr;
- v->remote=vifc->vifc_rmt_addr.s_addr;
- v->flags=vifc->vifc_flags;
- if (!mrtsock)
- v->flags |= VIFF_STATIC;
- v->threshold=vifc->vifc_threshold;
- v->bytes_in = 0;
- v->bytes_out = 0;
- v->pkt_in = 0;
- v->pkt_out = 0;
- v->link = dev->ifindex;
- if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
- v->link = dev->iflink;
+ /* Fill in the VIF structures */
+ vif_device_init(v, dev, vifc->vifc_rate_limit,
+ vifc->vifc_threshold,
+ vifc->vifc_flags | (!mrtsock ? VIFF_STATIC : 0),
+ (VIFF_TUNNEL | VIFF_REGISTER));
+
+ err = netif_get_port_parent_id(dev, &ppid, true);
+ if (err == 0) {
+ memcpy(v->dev_parent_id.id, ppid.id, ppid.id_len);
+ v->dev_parent_id.id_len = ppid.id_len;
+ } else {
+ v->dev_parent_id.id_len = 0;
+ }
+
+ v->local = vifc->vifc_lcl_addr.s_addr;
+ v->remote = vifc->vifc_rmt_addr.s_addr;
/* And finish update writing critical data */
- write_lock_bh(&mrt_lock);
- dev_hold(dev);
- v->dev=dev;
-#ifdef CONFIG_IP_PIMSM
- if (v->flags&VIFF_REGISTER)
- reg_vif_num = vifi;
-#endif
- if (vifi+1 > maxvif)
- maxvif = vifi+1;
- write_unlock_bh(&mrt_lock);
+ spin_lock(&mrt_lock);
+ rcu_assign_pointer(v->dev, dev);
+ netdev_tracker_alloc(dev, &v->dev_tracker, GFP_ATOMIC);
+ if (v->flags & VIFF_REGISTER) {
+ /* Pairs with READ_ONCE() in ipmr_cache_report() and reg_vif_xmit() */
+ WRITE_ONCE(mrt->mroute_reg_vif_num, vifi);
+ }
+ if (vifi+1 > mrt->maxvif)
+ WRITE_ONCE(mrt->maxvif, vifi + 1);
+ spin_unlock(&mrt_lock);
+ call_ipmr_vif_entry_notifiers(net, FIB_EVENT_VIF_ADD, v, dev,
+ vifi, mrt->id);
return 0;
}
-static struct mfc_cache *ipmr_cache_find(__be32 origin, __be32 mcastgrp)
+/* called with rcu_read_lock() */
+static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
+ __be32 origin,
+ __be32 mcastgrp)
{
- int line=MFC_HASH(mcastgrp,origin);
- struct mfc_cache *c;
+ struct mfc_cache_cmp_arg arg = {
+ .mfc_mcastgrp = mcastgrp,
+ .mfc_origin = origin
+ };
- for (c=mfc_cache_array[line]; c; c = c->next) {
- if (c->mfc_origin==origin && c->mfc_mcastgrp==mcastgrp)
- break;
- }
- return c;
+ return mr_mfc_find(mrt, &arg);
}
-/*
- * Allocate a multicast cache entry
- */
+/* Look for a (*,G) entry */
+static struct mfc_cache *ipmr_cache_find_any(struct mr_table *mrt,
+ __be32 mcastgrp, int vifi)
+{
+ struct mfc_cache_cmp_arg arg = {
+ .mfc_mcastgrp = mcastgrp,
+ .mfc_origin = htonl(INADDR_ANY)
+ };
+
+ if (mcastgrp == htonl(INADDR_ANY))
+ return mr_mfc_find_any_parent(mrt, vifi);
+ return mr_mfc_find_any(mrt, vifi, &arg);
+}
+
+/* Look for a (S,G,iif) entry if parent != -1 */
+static struct mfc_cache *ipmr_cache_find_parent(struct mr_table *mrt,
+ __be32 origin, __be32 mcastgrp,
+ int parent)
+{
+ struct mfc_cache_cmp_arg arg = {
+ .mfc_mcastgrp = mcastgrp,
+ .mfc_origin = origin,
+ };
+
+ return mr_mfc_find_parent(mrt, &arg, parent);
+}
+
+/* Allocate a multicast cache entry */
static struct mfc_cache *ipmr_cache_alloc(void)
{
- struct mfc_cache *c=kmem_cache_alloc(mrt_cachep, GFP_KERNEL);
- if(c==NULL)
- return NULL;
- memset(c, 0, sizeof(*c));
- c->mfc_un.res.minvif = MAXVIFS;
+ struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
+
+ if (c) {
+ c->_c.mfc_un.res.last_assert = jiffies - MFC_ASSERT_THRESH - 1;
+ c->_c.mfc_un.res.minvif = MAXVIFS;
+ c->_c.free = ipmr_cache_free_rcu;
+ refcount_set(&c->_c.mfc_un.res.refcount, 1);
+ }
return c;
}
static struct mfc_cache *ipmr_cache_alloc_unres(void)
{
- struct mfc_cache *c=kmem_cache_alloc(mrt_cachep, GFP_ATOMIC);
- if(c==NULL)
- return NULL;
- memset(c, 0, sizeof(*c));
- skb_queue_head_init(&c->mfc_un.unres.unresolved);
- c->mfc_un.unres.expires = jiffies + 10*HZ;
+ struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
+
+ if (c) {
+ skb_queue_head_init(&c->_c.mfc_un.unres.unresolved);
+ c->_c.mfc_un.unres.expires = jiffies + 10 * HZ;
+ }
return c;
}
-/*
- * A cache entry has gone into a resolved state from queued
- */
-
-static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
+/* A cache entry has gone into a resolved state from queued */
+static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt,
+ struct mfc_cache *uc, struct mfc_cache *c)
{
struct sk_buff *skb;
struct nlmsgerr *e;
- /*
- * Play the pending entries through our router
- */
-
- while((skb=__skb_dequeue(&uc->mfc_un.unres.unresolved))) {
- if (skb->nh.iph->version == 0) {
- struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
+ /* Play the pending entries through our router */
+ while ((skb = __skb_dequeue(&uc->_c.mfc_un.unres.unresolved))) {
+ if (ip_hdr(skb)->version == 0) {
+ struct nlmsghdr *nlh = skb_pull(skb,
+ sizeof(struct iphdr));
- if (ipmr_fill_mroute(skb, c, NLMSG_DATA(nlh)) > 0) {
- nlh->nlmsg_len = skb->tail - (u8*)nlh;
+ if (mr_fill_mroute(mrt, skb, &c->_c,
+ nlmsg_data(nlh)) > 0) {
+ nlh->nlmsg_len = skb_tail_pointer(skb) -
+ (u8 *)nlh;
} else {
nlh->nlmsg_type = NLMSG_ERROR;
- nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
+ nlh->nlmsg_len = nlmsg_msg_size(sizeof(struct nlmsgerr));
skb_trim(skb, nlh->nlmsg_len);
- e = NLMSG_DATA(nlh);
+ e = nlmsg_data(nlh);
e->error = -EMSGSIZE;
memset(&e->msg, 0, sizeof(e->msg));
}
- rtnl_unicast(skb, NETLINK_CB(skb).pid);
- } else
- ip_mr_forward(skb, c, 0);
+ rtnl_unicast(skb, net, NETLINK_CB(skb).portid);
+ } else {
+ rcu_read_lock();
+ ip_mr_forward(net, mrt, skb->dev, skb, c, 0);
+ rcu_read_unlock();
+ }
}
}
-/*
- * Bounce a cache query up to mrouted. We could use netlink for this but mrouted
- * expects the following bizarre scheme.
+/* Bounce a cache query up to mrouted and netlink.
*
- * Called under mrt_lock.
+ * Called under rcu_read_lock().
*/
-
-static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert)
+static int ipmr_cache_report(const struct mr_table *mrt,
+ struct sk_buff *pkt, vifi_t vifi, int assert)
{
- struct sk_buff *skb;
- int ihl = pkt->nh.iph->ihl<<2;
+ const int ihl = ip_hdrlen(pkt);
+ struct sock *mroute_sk;
struct igmphdr *igmp;
struct igmpmsg *msg;
+ struct sk_buff *skb;
int ret;
-#ifdef CONFIG_IP_PIMSM
- if (assert == IGMPMSG_WHOLEPKT)
+ mroute_sk = rcu_dereference(mrt->mroute_sk);
+ if (!mroute_sk)
+ return -EINVAL;
+
+ if (assert == IGMPMSG_WHOLEPKT || assert == IGMPMSG_WRVIFWHOLE)
skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
else
-#endif
skb = alloc_skb(128, GFP_ATOMIC);
- if(!skb)
+ if (!skb)
return -ENOBUFS;
-#ifdef CONFIG_IP_PIMSM
- if (assert == IGMPMSG_WHOLEPKT) {
+ if (assert == IGMPMSG_WHOLEPKT || assert == IGMPMSG_WRVIFWHOLE) {
/* Ugly, but we have no choice with this interface.
- Duplicate old header, fix ihl, length etc.
- And all this only to mangle msg->im_msgtype and
- to set msg->im_mbz to "mbz" :-)
+ * Duplicate old header, fix ihl, length etc.
+ * And all this only to mangle msg->im_msgtype and
+ * to set msg->im_mbz to "mbz" :-)
*/
- msg = (struct igmpmsg*)skb_push(skb, sizeof(struct iphdr));
- skb->nh.raw = skb->h.raw = (u8*)msg;
- memcpy(msg, pkt->nh.raw, sizeof(struct iphdr));
- msg->im_msgtype = IGMPMSG_WHOLEPKT;
+ skb_push(skb, sizeof(struct iphdr));
+ skb_reset_network_header(skb);
+ skb_reset_transport_header(skb);
+ msg = (struct igmpmsg *)skb_network_header(skb);
+ memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr));
+ msg->im_msgtype = assert;
msg->im_mbz = 0;
- msg->im_vif = reg_vif_num;
- skb->nh.iph->ihl = sizeof(struct iphdr) >> 2;
- skb->nh.iph->tot_len = htons(ntohs(pkt->nh.iph->tot_len) + sizeof(struct iphdr));
- } else
-#endif
- {
-
- /*
- * Copy the IP header
- */
-
- skb->nh.iph = (struct iphdr *)skb_put(skb, ihl);
- memcpy(skb->data,pkt->data,ihl);
- skb->nh.iph->protocol = 0; /* Flag to the kernel this is a route add */
- msg = (struct igmpmsg*)skb->nh.iph;
- msg->im_vif = vifi;
- skb->dst = dst_clone(pkt->dst);
+ if (assert == IGMPMSG_WRVIFWHOLE) {
+ msg->im_vif = vifi;
+ msg->im_vif_hi = vifi >> 8;
+ } else {
+ /* Pairs with WRITE_ONCE() in vif_add() and vif_delete() */
+ int vif_num = READ_ONCE(mrt->mroute_reg_vif_num);
- /*
- * Add our header
- */
+ msg->im_vif = vif_num;
+ msg->im_vif_hi = vif_num >> 8;
+ }
+ ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2;
+ ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) +
+ sizeof(struct iphdr));
+ } else {
+ /* Copy the IP header */
+ skb_set_network_header(skb, skb->len);
+ skb_put(skb, ihl);
+ skb_copy_to_linear_data(skb, pkt->data, ihl);
+ /* Flag to the kernel this is a route add */
+ ip_hdr(skb)->protocol = 0;
+ msg = (struct igmpmsg *)skb_network_header(skb);
+ msg->im_vif = vifi;
+ msg->im_vif_hi = vifi >> 8;
+ ipv4_pktinfo_prepare(mroute_sk, pkt, false);
+ memcpy(skb->cb, pkt->cb, sizeof(skb->cb));
+ /* Add our header */
+ igmp = skb_put(skb, sizeof(struct igmphdr));
+ igmp->type = assert;
+ msg->im_msgtype = assert;
+ igmp->code = 0;
+ ip_hdr(skb)->tot_len = htons(skb->len); /* Fix the length */
+ skb->transport_header = skb->network_header;
+ }
- igmp=(struct igmphdr *)skb_put(skb,sizeof(struct igmphdr));
- igmp->type =
- msg->im_msgtype = assert;
- igmp->code = 0;
- skb->nh.iph->tot_len=htons(skb->len); /* Fix the length */
- skb->h.raw = skb->nh.raw;
- }
+ igmpmsg_netlink_event(mrt, skb);
- if (mroute_socket == NULL) {
- kfree_skb(skb);
- return -EINVAL;
- }
+ /* Deliver to mrouted */
+ ret = sock_queue_rcv_skb(mroute_sk, skb);
- /*
- * Deliver to mrouted
- */
- if ((ret=sock_queue_rcv_skb(mroute_socket,skb))<0) {
- if (net_ratelimit())
- printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
+ if (ret < 0) {
+ net_warn_ratelimited("mroute: pending queue full, dropping entries\n");
kfree_skb(skb);
}
return ret;
}
-/*
- * Queue a packet for resolution. It gets locked cache entry!
- */
-
-static int
-ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb)
+/* Queue a packet for resolution. It gets locked cache entry! */
+/* Called under rcu_read_lock() */
+static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi,
+ struct sk_buff *skb, struct net_device *dev)
{
- int err;
+ const struct iphdr *iph = ip_hdr(skb);
struct mfc_cache *c;
+ bool found = false;
+ int err;
spin_lock_bh(&mfc_unres_lock);
- for (c=mfc_unres_queue; c; c=c->next) {
- if (c->mfc_mcastgrp == skb->nh.iph->daddr &&
- c->mfc_origin == skb->nh.iph->saddr)
+ list_for_each_entry(c, &mrt->mfc_unres_queue, _c.list) {
+ if (c->mfc_mcastgrp == iph->daddr &&
+ c->mfc_origin == iph->saddr) {
+ found = true;
break;
+ }
}
- if (c == NULL) {
- /*
- * Create a new entry if allowable
- */
-
- if (atomic_read(&cache_resolve_queue_len)>=10 ||
- (c=ipmr_cache_alloc_unres())==NULL) {
+ if (!found) {
+ /* Create a new entry if allowable */
+ c = ipmr_cache_alloc_unres();
+ if (!c) {
spin_unlock_bh(&mfc_unres_lock);
kfree_skb(skb);
return -ENOBUFS;
}
- /*
- * Fill in the new cache entry
- */
- c->mfc_parent=-1;
- c->mfc_origin=skb->nh.iph->saddr;
- c->mfc_mcastgrp=skb->nh.iph->daddr;
+ /* Fill in the new cache entry */
+ c->_c.mfc_parent = -1;
+ c->mfc_origin = iph->saddr;
+ c->mfc_mcastgrp = iph->daddr;
- /*
- * Reflect first query at mrouted.
- */
- if ((err = ipmr_cache_report(skb, vifi, IGMPMSG_NOCACHE))<0) {
- /* If the report failed throw the cache entry
+ /* Reflect first query at mrouted. */
+ err = ipmr_cache_report(mrt, skb, vifi, IGMPMSG_NOCACHE);
+
+ if (err < 0) {
+ /* If the report failed throw the cache entry
out - Brad Parker
*/
spin_unlock_bh(&mfc_unres_lock);
- kmem_cache_free(mrt_cachep, c);
+ ipmr_cache_free(c);
kfree_skb(skb);
return err;
}
- atomic_inc(&cache_resolve_queue_len);
- c->next = mfc_unres_queue;
- mfc_unres_queue = c;
+ atomic_inc(&mrt->cache_resolve_queue_len);
+ list_add(&c->_c.list, &mrt->mfc_unres_queue);
+ mroute_netlink_event(mrt, c, RTM_NEWROUTE);
- mod_timer(&ipmr_expire_timer, c->mfc_un.unres.expires);
+ if (atomic_read(&mrt->cache_resolve_queue_len) == 1)
+ mod_timer(&mrt->ipmr_expire_timer,
+ c->_c.mfc_un.unres.expires);
}
- /*
- * See if we can append the packet
- */
- if (c->mfc_un.unres.unresolved.qlen>3) {
+ /* See if we can append the packet */
+ if (c->_c.mfc_un.unres.unresolved.qlen > 3) {
kfree_skb(skb);
err = -ENOBUFS;
} else {
- skb_queue_tail(&c->mfc_un.unres.unresolved,skb);
+ if (dev) {
+ skb->dev = dev;
+ skb->skb_iif = dev->ifindex;
+ }
+ skb_queue_tail(&c->_c.mfc_un.unres.unresolved, skb);
err = 0;
}
@@ -689,536 +1189,765 @@ ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb)
return err;
}
-/*
- * MFC cache manipulation by user space mroute daemon
- */
+/* MFC cache manipulation by user space mroute daemon */
-static int ipmr_mfc_delete(struct mfcctl *mfc)
+static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent)
{
- int line;
- struct mfc_cache *c, **cp;
-
- line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
+ struct net *net = read_pnet(&mrt->net);
+ struct mfc_cache *c;
- for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
- if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
- c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
- write_lock_bh(&mrt_lock);
- *cp = c->next;
- write_unlock_bh(&mrt_lock);
+ /* The entries are added/deleted only under RTNL */
+ rcu_read_lock();
+ c = ipmr_cache_find_parent(mrt, mfc->mfcc_origin.s_addr,
+ mfc->mfcc_mcastgrp.s_addr, parent);
+ rcu_read_unlock();
+ if (!c)
+ return -ENOENT;
+ rhltable_remove(&mrt->mfc_hash, &c->_c.mnode, ipmr_rht_params);
+ list_del_rcu(&c->_c.list);
+ call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, c, mrt->id);
+ mroute_netlink_event(mrt, c, RTM_DELROUTE);
+ mr_cache_put(&c->_c);
- kmem_cache_free(mrt_cachep, c);
- return 0;
- }
- }
- return -ENOENT;
+ return 0;
}
-static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock)
+static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
+ struct mfcctl *mfc, int mrtsock, int parent)
{
- int line;
- struct mfc_cache *uc, *c, **cp;
-
- line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
-
- for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
- if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
- c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr)
- break;
- }
+ struct mfc_cache *uc, *c;
+ struct mr_mfc *_uc;
+ bool found;
+ int ret;
- if (c != NULL) {
- write_lock_bh(&mrt_lock);
- c->mfc_parent = mfc->mfcc_parent;
- ipmr_update_thresholds(c, mfc->mfcc_ttls);
+ if (mfc->mfcc_parent >= MAXVIFS)
+ return -ENFILE;
+
+ /* The entries are added/deleted only under RTNL */
+ rcu_read_lock();
+ c = ipmr_cache_find_parent(mrt, mfc->mfcc_origin.s_addr,
+ mfc->mfcc_mcastgrp.s_addr, parent);
+ rcu_read_unlock();
+ if (c) {
+ spin_lock(&mrt_lock);
+ c->_c.mfc_parent = mfc->mfcc_parent;
+ ipmr_update_thresholds(mrt, &c->_c, mfc->mfcc_ttls);
if (!mrtsock)
- c->mfc_flags |= MFC_STATIC;
- write_unlock_bh(&mrt_lock);
+ c->_c.mfc_flags |= MFC_STATIC;
+ spin_unlock(&mrt_lock);
+ call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, c,
+ mrt->id);
+ mroute_netlink_event(mrt, c, RTM_NEWROUTE);
return 0;
}
- if(!MULTICAST(mfc->mfcc_mcastgrp.s_addr))
+ if (mfc->mfcc_mcastgrp.s_addr != htonl(INADDR_ANY) &&
+ !ipv4_is_multicast(mfc->mfcc_mcastgrp.s_addr))
return -EINVAL;
- c=ipmr_cache_alloc();
- if (c==NULL)
+ c = ipmr_cache_alloc();
+ if (!c)
return -ENOMEM;
- c->mfc_origin=mfc->mfcc_origin.s_addr;
- c->mfc_mcastgrp=mfc->mfcc_mcastgrp.s_addr;
- c->mfc_parent=mfc->mfcc_parent;
- ipmr_update_thresholds(c, mfc->mfcc_ttls);
+ c->mfc_origin = mfc->mfcc_origin.s_addr;
+ c->mfc_mcastgrp = mfc->mfcc_mcastgrp.s_addr;
+ c->_c.mfc_parent = mfc->mfcc_parent;
+ ipmr_update_thresholds(mrt, &c->_c, mfc->mfcc_ttls);
if (!mrtsock)
- c->mfc_flags |= MFC_STATIC;
-
- write_lock_bh(&mrt_lock);
- c->next = mfc_cache_array[line];
- mfc_cache_array[line] = c;
- write_unlock_bh(&mrt_lock);
-
- /*
- * Check to see if we resolved a queued list. If so we
- * need to send on the frames and tidy up.
+ c->_c.mfc_flags |= MFC_STATIC;
+
+ ret = rhltable_insert_key(&mrt->mfc_hash, &c->cmparg, &c->_c.mnode,
+ ipmr_rht_params);
+ if (ret) {
+ pr_err("ipmr: rhtable insert error %d\n", ret);
+ ipmr_cache_free(c);
+ return ret;
+ }
+ list_add_tail_rcu(&c->_c.list, &mrt->mfc_cache_list);
+ /* Check to see if we resolved a queued list. If so we
+ * need to send on the frames and tidy up.
*/
+ found = false;
spin_lock_bh(&mfc_unres_lock);
- for (cp = &mfc_unres_queue; (uc=*cp) != NULL;
- cp = &uc->next) {
+ list_for_each_entry(_uc, &mrt->mfc_unres_queue, list) {
+ uc = (struct mfc_cache *)_uc;
if (uc->mfc_origin == c->mfc_origin &&
uc->mfc_mcastgrp == c->mfc_mcastgrp) {
- *cp = uc->next;
- if (atomic_dec_and_test(&cache_resolve_queue_len))
- del_timer(&ipmr_expire_timer);
+ list_del(&_uc->list);
+ atomic_dec(&mrt->cache_resolve_queue_len);
+ found = true;
break;
}
}
+ if (list_empty(&mrt->mfc_unres_queue))
+ timer_delete(&mrt->ipmr_expire_timer);
spin_unlock_bh(&mfc_unres_lock);
- if (uc) {
- ipmr_cache_resolve(uc, c);
- kmem_cache_free(mrt_cachep, uc);
+ if (found) {
+ ipmr_cache_resolve(net, mrt, uc, c);
+ ipmr_cache_free(uc);
}
+ call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_ADD, c, mrt->id);
+ mroute_netlink_event(mrt, c, RTM_NEWROUTE);
return 0;
}
-/*
- * Close the multicast socket, and clear the vif tables etc
- */
-
-static void mroute_clean_tables(struct sock *sk)
+/* Close the multicast socket, and clear the vif tables etc */
+static void mroute_clean_tables(struct mr_table *mrt, int flags)
{
+ struct net *net = read_pnet(&mrt->net);
+ struct mr_mfc *c, *tmp;
+ struct mfc_cache *cache;
+ LIST_HEAD(list);
int i;
-
- /*
- * Shut down all active vif entries
- */
- for(i=0; i<maxvif; i++) {
- if (!(vif_table[i].flags&VIFF_STATIC))
- vif_delete(i);
- }
-
- /*
- * Wipe the cache
- */
- for (i=0;i<MFC_LINES;i++) {
- struct mfc_cache *c, **cp;
- cp = &mfc_cache_array[i];
- while ((c = *cp) != NULL) {
- if (c->mfc_flags&MFC_STATIC) {
- cp = &c->next;
+ /* Shut down all active vif entries */
+ if (flags & (MRT_FLUSH_VIFS | MRT_FLUSH_VIFS_STATIC)) {
+ for (i = 0; i < mrt->maxvif; i++) {
+ if (((mrt->vif_table[i].flags & VIFF_STATIC) &&
+ !(flags & MRT_FLUSH_VIFS_STATIC)) ||
+ (!(mrt->vif_table[i].flags & VIFF_STATIC) && !(flags & MRT_FLUSH_VIFS)))
continue;
- }
- write_lock_bh(&mrt_lock);
- *cp = c->next;
- write_unlock_bh(&mrt_lock);
-
- kmem_cache_free(mrt_cachep, c);
+ vif_delete(mrt, i, 0, &list);
}
+ unregister_netdevice_many(&list);
}
- if (atomic_read(&cache_resolve_queue_len) != 0) {
- struct mfc_cache *c;
-
- spin_lock_bh(&mfc_unres_lock);
- while (mfc_unres_queue != NULL) {
- c = mfc_unres_queue;
- mfc_unres_queue = c->next;
- spin_unlock_bh(&mfc_unres_lock);
-
- ipmr_destroy_unres(c);
+ /* Wipe the cache */
+ if (flags & (MRT_FLUSH_MFC | MRT_FLUSH_MFC_STATIC)) {
+ list_for_each_entry_safe(c, tmp, &mrt->mfc_cache_list, list) {
+ if (((c->mfc_flags & MFC_STATIC) && !(flags & MRT_FLUSH_MFC_STATIC)) ||
+ (!(c->mfc_flags & MFC_STATIC) && !(flags & MRT_FLUSH_MFC)))
+ continue;
+ rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params);
+ list_del_rcu(&c->list);
+ cache = (struct mfc_cache *)c;
+ call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, cache,
+ mrt->id);
+ mroute_netlink_event(mrt, cache, RTM_DELROUTE);
+ mr_cache_put(c);
+ }
+ }
+ if (flags & MRT_FLUSH_MFC) {
+ if (atomic_read(&mrt->cache_resolve_queue_len) != 0) {
spin_lock_bh(&mfc_unres_lock);
+ list_for_each_entry_safe(c, tmp, &mrt->mfc_unres_queue, list) {
+ list_del(&c->list);
+ cache = (struct mfc_cache *)c;
+ mroute_netlink_event(mrt, cache, RTM_DELROUTE);
+ ipmr_destroy_unres(mrt, cache);
+ }
+ spin_unlock_bh(&mfc_unres_lock);
}
- spin_unlock_bh(&mfc_unres_lock);
}
}
+/* called from ip_ra_control(), before an RCU grace period,
+ * we don't need to call synchronize_rcu() here
+ */
static void mrtsock_destruct(struct sock *sk)
{
- rtnl_lock();
- if (sk == mroute_socket) {
- ipv4_devconf.mc_forwarding--;
-
- write_lock_bh(&mrt_lock);
- mroute_socket=NULL;
- write_unlock_bh(&mrt_lock);
+ struct net *net = sock_net(sk);
+ struct mr_table *mrt;
- mroute_clean_tables(sk);
+ rtnl_lock();
+ ipmr_for_each_table(mrt, net) {
+ if (sk == rtnl_dereference(mrt->mroute_sk)) {
+ IPV4_DEVCONF_ALL(net, MC_FORWARDING)--;
+ inet_netconf_notify_devconf(net, RTM_NEWNETCONF,
+ NETCONFA_MC_FORWARDING,
+ NETCONFA_IFINDEX_ALL,
+ net->ipv4.devconf_all);
+ RCU_INIT_POINTER(mrt->mroute_sk, NULL);
+ mroute_clean_tables(mrt, MRT_FLUSH_VIFS | MRT_FLUSH_MFC);
+ }
}
rtnl_unlock();
}
-/*
- * Socket options and virtual interface manipulation. The whole
- * virtual interface system is a complete heap, but unfortunately
- * that's how BSD mrouted happens to think. Maybe one day with a proper
- * MOSPF/PIM router set up we can clean this up.
+/* Socket options and virtual interface manipulation. The whole
+ * virtual interface system is a complete heap, but unfortunately
+ * that's how BSD mrouted happens to think. Maybe one day with a proper
+ * MOSPF/PIM router set up we can clean this up.
*/
-
-int ip_mroute_setsockopt(struct sock *sk,int optname,char __user *optval,int optlen)
+
+int ip_mroute_setsockopt(struct sock *sk, int optname, sockptr_t optval,
+ unsigned int optlen)
{
- int ret;
+ struct net *net = sock_net(sk);
+ int val, ret = 0, parent = 0;
+ struct mr_table *mrt;
struct vifctl vif;
struct mfcctl mfc;
-
- if(optname!=MRT_INIT)
- {
- if(sk!=mroute_socket && !capable(CAP_NET_ADMIN))
- return -EACCES;
- }
-
- switch(optname)
- {
- case MRT_INIT:
- if (sk->sk_type != SOCK_RAW ||
- inet_sk(sk)->num != IPPROTO_IGMP)
- return -EOPNOTSUPP;
- if(optlen!=sizeof(int))
- return -ENOPROTOOPT;
-
- rtnl_lock();
- if (mroute_socket) {
- rtnl_unlock();
- return -EADDRINUSE;
- }
+ bool do_wrvifwhole;
+ u32 uval;
- ret = ip_ra_control(sk, 1, mrtsock_destruct);
- if (ret == 0) {
- write_lock_bh(&mrt_lock);
- mroute_socket=sk;
- write_unlock_bh(&mrt_lock);
+ /* There's one exception to the lock - MRT_DONE which needs to unlock */
+ rtnl_lock();
+ if (sk->sk_type != SOCK_RAW ||
+ inet_sk(sk)->inet_num != IPPROTO_IGMP) {
+ ret = -EOPNOTSUPP;
+ goto out_unlock;
+ }
- ipv4_devconf.mc_forwarding++;
- }
- rtnl_unlock();
- return ret;
- case MRT_DONE:
- if (sk!=mroute_socket)
- return -EACCES;
- return ip_ra_control(sk, 0, NULL);
- case MRT_ADD_VIF:
- case MRT_DEL_VIF:
- if(optlen!=sizeof(vif))
- return -EINVAL;
- if (copy_from_user(&vif,optval,sizeof(vif)))
- return -EFAULT;
- if(vif.vifc_vifi >= MAXVIFS)
- return -ENFILE;
- rtnl_lock();
- if (optname==MRT_ADD_VIF) {
- ret = vif_add(&vif, sk==mroute_socket);
- } else {
- ret = vif_delete(vif.vifc_vifi);
- }
+ mrt = __ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
+ if (!mrt) {
+ ret = -ENOENT;
+ goto out_unlock;
+ }
+ if (optname != MRT_INIT) {
+ if (sk != rcu_access_pointer(mrt->mroute_sk) &&
+ !ns_capable(net->user_ns, CAP_NET_ADMIN)) {
+ ret = -EACCES;
+ goto out_unlock;
+ }
+ }
+
+ switch (optname) {
+ case MRT_INIT:
+ if (optlen != sizeof(int)) {
+ ret = -EINVAL;
+ break;
+ }
+ if (rtnl_dereference(mrt->mroute_sk)) {
+ ret = -EADDRINUSE;
+ break;
+ }
+
+ ret = ip_ra_control(sk, 1, mrtsock_destruct);
+ if (ret == 0) {
+ rcu_assign_pointer(mrt->mroute_sk, sk);
+ IPV4_DEVCONF_ALL(net, MC_FORWARDING)++;
+ inet_netconf_notify_devconf(net, RTM_NEWNETCONF,
+ NETCONFA_MC_FORWARDING,
+ NETCONFA_IFINDEX_ALL,
+ net->ipv4.devconf_all);
+ }
+ break;
+ case MRT_DONE:
+ if (sk != rcu_access_pointer(mrt->mroute_sk)) {
+ ret = -EACCES;
+ } else {
+ /* We need to unlock here because mrtsock_destruct takes
+ * care of rtnl itself and we can't change that due to
+ * the IP_ROUTER_ALERT setsockopt which runs without it.
+ */
rtnl_unlock();
- return ret;
+ ret = ip_ra_control(sk, 0, NULL);
+ goto out;
+ }
+ break;
+ case MRT_ADD_VIF:
+ case MRT_DEL_VIF:
+ if (optlen != sizeof(vif)) {
+ ret = -EINVAL;
+ break;
+ }
+ if (copy_from_sockptr(&vif, optval, sizeof(vif))) {
+ ret = -EFAULT;
+ break;
+ }
+ if (vif.vifc_vifi >= MAXVIFS) {
+ ret = -ENFILE;
+ break;
+ }
+ if (optname == MRT_ADD_VIF) {
+ ret = vif_add(net, mrt, &vif,
+ sk == rtnl_dereference(mrt->mroute_sk));
+ } else {
+ ret = vif_delete(mrt, vif.vifc_vifi, 0, NULL);
+ }
+ break;
+ /* Manipulate the forwarding caches. These live
+ * in a sort of kernel/user symbiosis.
+ */
+ case MRT_ADD_MFC:
+ case MRT_DEL_MFC:
+ parent = -1;
+ fallthrough;
+ case MRT_ADD_MFC_PROXY:
+ case MRT_DEL_MFC_PROXY:
+ if (optlen != sizeof(mfc)) {
+ ret = -EINVAL;
+ break;
+ }
+ if (copy_from_sockptr(&mfc, optval, sizeof(mfc))) {
+ ret = -EFAULT;
+ break;
+ }
+ if (parent == 0)
+ parent = mfc.mfcc_parent;
+ if (optname == MRT_DEL_MFC || optname == MRT_DEL_MFC_PROXY)
+ ret = ipmr_mfc_delete(mrt, &mfc, parent);
+ else
+ ret = ipmr_mfc_add(net, mrt, &mfc,
+ sk == rtnl_dereference(mrt->mroute_sk),
+ parent);
+ break;
+ case MRT_FLUSH:
+ if (optlen != sizeof(val)) {
+ ret = -EINVAL;
+ break;
+ }
+ if (copy_from_sockptr(&val, optval, sizeof(val))) {
+ ret = -EFAULT;
+ break;
+ }
+ mroute_clean_tables(mrt, val);
+ break;
+ /* Control PIM assert. */
+ case MRT_ASSERT:
+ if (optlen != sizeof(val)) {
+ ret = -EINVAL;
+ break;
+ }
+ if (copy_from_sockptr(&val, optval, sizeof(val))) {
+ ret = -EFAULT;
+ break;
+ }
+ mrt->mroute_do_assert = val;
+ break;
+ case MRT_PIM:
+ if (!ipmr_pimsm_enabled()) {
+ ret = -ENOPROTOOPT;
+ break;
+ }
+ if (optlen != sizeof(val)) {
+ ret = -EINVAL;
+ break;
+ }
+ if (copy_from_sockptr(&val, optval, sizeof(val))) {
+ ret = -EFAULT;
+ break;
+ }
- /*
- * Manipulate the forwarding caches. These live
- * in a sort of kernel/user symbiosis.
- */
- case MRT_ADD_MFC:
- case MRT_DEL_MFC:
- if(optlen!=sizeof(mfc))
- return -EINVAL;
- if (copy_from_user(&mfc,optval, sizeof(mfc)))
- return -EFAULT;
- rtnl_lock();
- if (optname==MRT_DEL_MFC)
- ret = ipmr_mfc_delete(&mfc);
+ do_wrvifwhole = (val == IGMPMSG_WRVIFWHOLE);
+ val = !!val;
+ if (val != mrt->mroute_do_pim) {
+ mrt->mroute_do_pim = val;
+ mrt->mroute_do_assert = val;
+ mrt->mroute_do_wrvifwhole = do_wrvifwhole;
+ }
+ break;
+ case MRT_TABLE:
+ if (!IS_BUILTIN(CONFIG_IP_MROUTE_MULTIPLE_TABLES)) {
+ ret = -ENOPROTOOPT;
+ break;
+ }
+ if (optlen != sizeof(uval)) {
+ ret = -EINVAL;
+ break;
+ }
+ if (copy_from_sockptr(&uval, optval, sizeof(uval))) {
+ ret = -EFAULT;
+ break;
+ }
+
+ if (sk == rtnl_dereference(mrt->mroute_sk)) {
+ ret = -EBUSY;
+ } else {
+ mrt = ipmr_new_table(net, uval);
+ if (IS_ERR(mrt))
+ ret = PTR_ERR(mrt);
else
- ret = ipmr_mfc_add(&mfc, sk==mroute_socket);
- rtnl_unlock();
- return ret;
- /*
- * Control PIM assert.
- */
- case MRT_ASSERT:
- {
- int v;
- if(get_user(v,(int __user *)optval))
- return -EFAULT;
- mroute_do_assert=(v)?1:0;
- return 0;
+ raw_sk(sk)->ipmr_table = uval;
}
-#ifdef CONFIG_IP_PIMSM
- case MRT_PIM:
- {
- int v, ret;
- if(get_user(v,(int __user *)optval))
- return -EFAULT;
- v = (v)?1:0;
- rtnl_lock();
- ret = 0;
- if (v != mroute_do_pim) {
- mroute_do_pim = v;
- mroute_do_assert = v;
-#ifdef CONFIG_IP_PIMSM_V2
- if (mroute_do_pim)
- ret = inet_add_protocol(&pim_protocol,
- IPPROTO_PIM);
- else
- ret = inet_del_protocol(&pim_protocol,
- IPPROTO_PIM);
- if (ret < 0)
- ret = -EAGAIN;
-#endif
- }
- rtnl_unlock();
- return ret;
+ break;
+ /* Spurious command, or MRT_VERSION which you cannot set. */
+ default:
+ ret = -ENOPROTOOPT;
+ }
+out_unlock:
+ rtnl_unlock();
+out:
+ return ret;
+}
+
+/* Execute if this ioctl is a special mroute ioctl */
+int ipmr_sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
+{
+ switch (cmd) {
+ /* These userspace buffers will be consumed by ipmr_ioctl() */
+ case SIOCGETVIFCNT: {
+ struct sioc_vif_req buffer;
+
+ return sock_ioctl_inout(sk, cmd, arg, &buffer,
+ sizeof(buffer));
+ }
+ case SIOCGETSGCNT: {
+ struct sioc_sg_req buffer;
+
+ return sock_ioctl_inout(sk, cmd, arg, &buffer,
+ sizeof(buffer));
}
-#endif
- /*
- * Spurious command, or MRT_VERSION which you cannot
- * set.
- */
- default:
- return -ENOPROTOOPT;
}
+ /* return code > 0 means that the ioctl was not executed */
+ return 1;
}
-/*
- * Getsock opt support for the multicast routing system.
- */
-
-int ip_mroute_getsockopt(struct sock *sk,int optname,char __user *optval,int __user *optlen)
+/* Getsock opt support for the multicast routing system. */
+int ip_mroute_getsockopt(struct sock *sk, int optname, sockptr_t optval,
+ sockptr_t optlen)
{
int olr;
int val;
+ struct net *net = sock_net(sk);
+ struct mr_table *mrt;
- if(optname!=MRT_VERSION &&
-#ifdef CONFIG_IP_PIMSM
- optname!=MRT_PIM &&
-#endif
- optname!=MRT_ASSERT)
+ if (sk->sk_type != SOCK_RAW ||
+ inet_sk(sk)->inet_num != IPPROTO_IGMP)
+ return -EOPNOTSUPP;
+
+ mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
+ if (!mrt)
+ return -ENOENT;
+
+ switch (optname) {
+ case MRT_VERSION:
+ val = 0x0305;
+ break;
+ case MRT_PIM:
+ if (!ipmr_pimsm_enabled())
+ return -ENOPROTOOPT;
+ val = mrt->mroute_do_pim;
+ break;
+ case MRT_ASSERT:
+ val = mrt->mroute_do_assert;
+ break;
+ default:
return -ENOPROTOOPT;
+ }
- if (get_user(olr, optlen))
+ if (copy_from_sockptr(&olr, optlen, sizeof(int)))
return -EFAULT;
-
- olr = min_t(unsigned int, olr, sizeof(int));
if (olr < 0)
return -EINVAL;
-
- if(put_user(olr,optlen))
+
+ olr = min_t(unsigned int, olr, sizeof(int));
+
+ if (copy_to_sockptr(optlen, &olr, sizeof(int)))
return -EFAULT;
- if(optname==MRT_VERSION)
- val=0x0305;
-#ifdef CONFIG_IP_PIMSM
- else if(optname==MRT_PIM)
- val=mroute_do_pim;
-#endif
- else
- val=mroute_do_assert;
- if(copy_to_user(optval,&val,olr))
+ if (copy_to_sockptr(optval, &val, olr))
return -EFAULT;
return 0;
}
-/*
- * The IP multicast ioctl support routines.
- */
-
-int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
+/* The IP multicast ioctl support routines. */
+int ipmr_ioctl(struct sock *sk, int cmd, void *arg)
{
- struct sioc_sg_req sr;
- struct sioc_vif_req vr;
struct vif_device *vif;
struct mfc_cache *c;
-
- switch(cmd)
- {
- case SIOCGETVIFCNT:
- if (copy_from_user(&vr,arg,sizeof(vr)))
- return -EFAULT;
- if(vr.vifi>=maxvif)
- return -EINVAL;
- read_lock(&mrt_lock);
- vif=&vif_table[vr.vifi];
- if(VIF_EXISTS(vr.vifi)) {
- vr.icount=vif->pkt_in;
- vr.ocount=vif->pkt_out;
- vr.ibytes=vif->bytes_in;
- vr.obytes=vif->bytes_out;
- read_unlock(&mrt_lock);
-
- if (copy_to_user(arg,&vr,sizeof(vr)))
- return -EFAULT;
- return 0;
- }
- read_unlock(&mrt_lock);
- return -EADDRNOTAVAIL;
- case SIOCGETSGCNT:
- if (copy_from_user(&sr,arg,sizeof(sr)))
- return -EFAULT;
-
- read_lock(&mrt_lock);
- c = ipmr_cache_find(sr.src.s_addr, sr.grp.s_addr);
- if (c) {
- sr.pktcnt = c->mfc_un.res.pkt;
- sr.bytecnt = c->mfc_un.res.bytes;
- sr.wrong_if = c->mfc_un.res.wrong_if;
- read_unlock(&mrt_lock);
+ struct net *net = sock_net(sk);
+ struct sioc_vif_req *vr;
+ struct sioc_sg_req *sr;
+ struct mr_table *mrt;
+
+ mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
+ if (!mrt)
+ return -ENOENT;
+
+ switch (cmd) {
+ case SIOCGETVIFCNT:
+ vr = (struct sioc_vif_req *)arg;
+ if (vr->vifi >= mrt->maxvif)
+ return -EINVAL;
+ vr->vifi = array_index_nospec(vr->vifi, mrt->maxvif);
+ rcu_read_lock();
+ vif = &mrt->vif_table[vr->vifi];
+ if (VIF_EXISTS(mrt, vr->vifi)) {
+ vr->icount = READ_ONCE(vif->pkt_in);
+ vr->ocount = READ_ONCE(vif->pkt_out);
+ vr->ibytes = READ_ONCE(vif->bytes_in);
+ vr->obytes = READ_ONCE(vif->bytes_out);
+ rcu_read_unlock();
- if (copy_to_user(arg,&sr,sizeof(sr)))
- return -EFAULT;
- return 0;
- }
- read_unlock(&mrt_lock);
- return -EADDRNOTAVAIL;
- default:
- return -ENOIOCTLCMD;
+ return 0;
+ }
+ rcu_read_unlock();
+ return -EADDRNOTAVAIL;
+ case SIOCGETSGCNT:
+ sr = (struct sioc_sg_req *)arg;
+
+ rcu_read_lock();
+ c = ipmr_cache_find(mrt, sr->src.s_addr, sr->grp.s_addr);
+ if (c) {
+ sr->pktcnt = atomic_long_read(&c->_c.mfc_un.res.pkt);
+ sr->bytecnt = atomic_long_read(&c->_c.mfc_un.res.bytes);
+ sr->wrong_if = atomic_long_read(&c->_c.mfc_un.res.wrong_if);
+ rcu_read_unlock();
+ return 0;
+ }
+ rcu_read_unlock();
+ return -EADDRNOTAVAIL;
+ default:
+ return -ENOIOCTLCMD;
}
}
+#ifdef CONFIG_COMPAT
+struct compat_sioc_sg_req {
+ struct in_addr src;
+ struct in_addr grp;
+ compat_ulong_t pktcnt;
+ compat_ulong_t bytecnt;
+ compat_ulong_t wrong_if;
+};
+
+struct compat_sioc_vif_req {
+ vifi_t vifi; /* Which iface */
+ compat_ulong_t icount;
+ compat_ulong_t ocount;
+ compat_ulong_t ibytes;
+ compat_ulong_t obytes;
+};
+
+int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
+{
+ struct compat_sioc_sg_req sr;
+ struct compat_sioc_vif_req vr;
+ struct vif_device *vif;
+ struct mfc_cache *c;
+ struct net *net = sock_net(sk);
+ struct mr_table *mrt;
+
+ mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
+ if (!mrt)
+ return -ENOENT;
+
+ switch (cmd) {
+ case SIOCGETVIFCNT:
+ if (copy_from_user(&vr, arg, sizeof(vr)))
+ return -EFAULT;
+ if (vr.vifi >= mrt->maxvif)
+ return -EINVAL;
+ vr.vifi = array_index_nospec(vr.vifi, mrt->maxvif);
+ rcu_read_lock();
+ vif = &mrt->vif_table[vr.vifi];
+ if (VIF_EXISTS(mrt, vr.vifi)) {
+ vr.icount = READ_ONCE(vif->pkt_in);
+ vr.ocount = READ_ONCE(vif->pkt_out);
+ vr.ibytes = READ_ONCE(vif->bytes_in);
+ vr.obytes = READ_ONCE(vif->bytes_out);
+ rcu_read_unlock();
+
+ if (copy_to_user(arg, &vr, sizeof(vr)))
+ return -EFAULT;
+ return 0;
+ }
+ rcu_read_unlock();
+ return -EADDRNOTAVAIL;
+ case SIOCGETSGCNT:
+ if (copy_from_user(&sr, arg, sizeof(sr)))
+ return -EFAULT;
+
+ rcu_read_lock();
+ c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr);
+ if (c) {
+ sr.pktcnt = atomic_long_read(&c->_c.mfc_un.res.pkt);
+ sr.bytecnt = atomic_long_read(&c->_c.mfc_un.res.bytes);
+ sr.wrong_if = atomic_long_read(&c->_c.mfc_un.res.wrong_if);
+ rcu_read_unlock();
+
+ if (copy_to_user(arg, &sr, sizeof(sr)))
+ return -EFAULT;
+ return 0;
+ }
+ rcu_read_unlock();
+ return -EADDRNOTAVAIL;
+ default:
+ return -ENOIOCTLCMD;
+ }
+}
+#endif
static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct net *net = dev_net(dev);
+ struct mr_table *mrt;
struct vif_device *v;
int ct;
+
if (event != NETDEV_UNREGISTER)
return NOTIFY_DONE;
- v=&vif_table[0];
- for(ct=0;ct<maxvif;ct++,v++) {
- if (v->dev==ptr)
- vif_delete(ct);
+
+ ipmr_for_each_table(mrt, net) {
+ v = &mrt->vif_table[0];
+ for (ct = 0; ct < mrt->maxvif; ct++, v++) {
+ if (rcu_access_pointer(v->dev) == dev)
+ vif_delete(mrt, ct, 1, NULL);
+ }
}
return NOTIFY_DONE;
}
-
-static struct notifier_block ip_mr_notifier={
+static struct notifier_block ip_mr_notifier = {
.notifier_call = ipmr_device_event,
};
-/*
- * Encapsulate a packet by attaching a valid IPIP header to it.
- * This avoids tunnel drivers and other mess and gives us the speed so
- * important for multicast video.
+/* Encapsulate a packet by attaching a valid IPIP header to it.
+ * This avoids tunnel drivers and other mess and gives us the speed so
+ * important for multicast video.
*/
-
-static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
+static void ip_encap(struct net *net, struct sk_buff *skb,
+ __be32 saddr, __be32 daddr)
{
- struct iphdr *iph = (struct iphdr *)skb_push(skb,sizeof(struct iphdr));
+ struct iphdr *iph;
+ const struct iphdr *old_iph = ip_hdr(skb);
- iph->version = 4;
- iph->tos = skb->nh.iph->tos;
- iph->ttl = skb->nh.iph->ttl;
+ skb_push(skb, sizeof(struct iphdr));
+ skb->transport_header = skb->network_header;
+ skb_reset_network_header(skb);
+ iph = ip_hdr(skb);
+
+ iph->version = 4;
+ iph->tos = old_iph->tos;
+ iph->ttl = old_iph->ttl;
iph->frag_off = 0;
iph->daddr = daddr;
iph->saddr = saddr;
iph->protocol = IPPROTO_IPIP;
iph->ihl = 5;
iph->tot_len = htons(skb->len);
- ip_select_ident(iph, skb->dst, NULL);
+ ip_select_ident(net, skb, NULL);
ip_send_check(iph);
- skb->h.ipiph = skb->nh.iph;
- skb->nh.iph = iph;
memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
- nf_reset(skb);
+ nf_reset_ct(skb);
}
-static inline int ipmr_forward_finish(struct sk_buff *skb)
+static inline int ipmr_forward_finish(struct net *net, struct sock *sk,
+ struct sk_buff *skb)
{
- struct ip_options * opt = &(IPCB(skb)->opt);
+ struct ip_options *opt = &(IPCB(skb)->opt);
- IP_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
+ IP_INC_STATS(net, IPSTATS_MIB_OUTFORWDATAGRAMS);
if (unlikely(opt->optlen))
ip_forward_options(skb);
- return dst_output(skb);
+ return dst_output(net, sk, skb);
}
-/*
- * Processing handlers for ipmr_forward
- */
+#ifdef CONFIG_NET_SWITCHDEV
+static bool ipmr_forward_offloaded(struct sk_buff *skb, struct mr_table *mrt,
+ int in_vifi, int out_vifi)
+{
+ struct vif_device *out_vif = &mrt->vif_table[out_vifi];
+ struct vif_device *in_vif = &mrt->vif_table[in_vifi];
+
+ if (!skb->offload_l3_fwd_mark)
+ return false;
+ if (!out_vif->dev_parent_id.id_len || !in_vif->dev_parent_id.id_len)
+ return false;
+ return netdev_phys_item_id_same(&out_vif->dev_parent_id,
+ &in_vif->dev_parent_id);
+}
+#else
+static bool ipmr_forward_offloaded(struct sk_buff *skb, struct mr_table *mrt,
+ int in_vifi, int out_vifi)
+{
+ return false;
+}
+#endif
+
+/* Processing handlers for ipmr_forward, under rcu_read_lock() */
-static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
+static int ipmr_prepare_xmit(struct net *net, struct mr_table *mrt,
+ struct sk_buff *skb, int vifi)
{
- struct iphdr *iph = skb->nh.iph;
- struct vif_device *vif = &vif_table[vifi];
- struct net_device *dev;
+ const struct iphdr *iph = ip_hdr(skb);
+ struct vif_device *vif = &mrt->vif_table[vifi];
+ struct net_device *vif_dev;
struct rtable *rt;
+ struct flowi4 fl4;
int encap = 0;
- if (vif->dev == NULL)
- goto out_free;
+ vif_dev = vif_dev_read(vif);
+ if (!vif_dev)
+ return -1;
-#ifdef CONFIG_IP_PIMSM
if (vif->flags & VIFF_REGISTER) {
- vif->pkt_out++;
- vif->bytes_out+=skb->len;
- ((struct net_device_stats*)netdev_priv(vif->dev))->tx_bytes += skb->len;
- ((struct net_device_stats*)netdev_priv(vif->dev))->tx_packets++;
- ipmr_cache_report(skb, vifi, IGMPMSG_WHOLEPKT);
- kfree_skb(skb);
- return;
+ WRITE_ONCE(vif->pkt_out, vif->pkt_out + 1);
+ WRITE_ONCE(vif->bytes_out, vif->bytes_out + skb->len);
+ DEV_STATS_ADD(vif_dev, tx_bytes, skb->len);
+ DEV_STATS_INC(vif_dev, tx_packets);
+ ipmr_cache_report(mrt, skb, vifi, IGMPMSG_WHOLEPKT);
+ return -1;
}
-#endif
- if (vif->flags&VIFF_TUNNEL) {
- struct flowi fl = { .oif = vif->link,
- .nl_u = { .ip4_u =
- { .daddr = vif->remote,
- .saddr = vif->local,
- .tos = RT_TOS(iph->tos) } },
- .proto = IPPROTO_IPIP };
- if (ip_route_output_key(&rt, &fl))
- goto out_free;
+ if (vif->flags & VIFF_TUNNEL) {
+ rt = ip_route_output_ports(net, &fl4, NULL,
+ vif->remote, vif->local,
+ 0, 0,
+ IPPROTO_IPIP,
+ iph->tos & INET_DSCP_MASK, vif->link);
+ if (IS_ERR(rt))
+ return -1;
encap = sizeof(struct iphdr);
} else {
- struct flowi fl = { .oif = vif->link,
- .nl_u = { .ip4_u =
- { .daddr = iph->daddr,
- .tos = RT_TOS(iph->tos) } },
- .proto = IPPROTO_IPIP };
- if (ip_route_output_key(&rt, &fl))
- goto out_free;
+ rt = ip_route_output_ports(net, &fl4, NULL, iph->daddr, 0,
+ 0, 0,
+ IPPROTO_IPIP,
+ iph->tos & INET_DSCP_MASK, vif->link);
+ if (IS_ERR(rt))
+ return -1;
}
- dev = rt->u.dst.dev;
-
- if (skb->len+encap > dst_mtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) {
+ if (skb->len+encap > dst_mtu(&rt->dst) && (ntohs(iph->frag_off) & IP_DF)) {
/* Do not fragment multicasts. Alas, IPv4 does not
- allow to send ICMP, so that packets will disappear
- to blackhole.
+ * allow to send ICMP, so that packets will disappear
+ * to blackhole.
*/
-
- IP_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
+ IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
ip_rt_put(rt);
- goto out_free;
+ return -1;
}
- encap += LL_RESERVED_SPACE(dev) + rt->u.dst.header_len;
+ encap += LL_RESERVED_SPACE(dst_dev_rcu(&rt->dst)) + rt->dst.header_len;
if (skb_cow(skb, encap)) {
- ip_rt_put(rt);
- goto out_free;
+ ip_rt_put(rt);
+ return -1;
}
- vif->pkt_out++;
- vif->bytes_out+=skb->len;
+ WRITE_ONCE(vif->pkt_out, vif->pkt_out + 1);
+ WRITE_ONCE(vif->bytes_out, vif->bytes_out + skb->len);
- dst_release(skb->dst);
- skb->dst = &rt->u.dst;
- iph = skb->nh.iph;
- ip_decrease_ttl(iph);
+ skb_dst_drop(skb);
+ skb_dst_set(skb, &rt->dst);
+ ip_decrease_ttl(ip_hdr(skb));
/* FIXME: forward and output firewalls used to be called here.
- * What do we do with netfilter? -- RR */
+ * What do we do with netfilter? -- RR
+ */
if (vif->flags & VIFF_TUNNEL) {
- ip_encap(skb, vif->local, vif->remote);
+ ip_encap(net, skb, vif->local, vif->remote);
/* FIXME: extra output firewall step used to be here. --RR */
- ((struct ip_tunnel *)netdev_priv(vif->dev))->stat.tx_packets++;
- ((struct ip_tunnel *)netdev_priv(vif->dev))->stat.tx_bytes+=skb->len;
+ DEV_STATS_INC(vif_dev, tx_packets);
+ DEV_STATS_ADD(vif_dev, tx_bytes, skb->len);
}
+ return 0;
+}
+
+static void ipmr_queue_fwd_xmit(struct net *net, struct mr_table *mrt,
+ int in_vifi, struct sk_buff *skb, int vifi)
+{
+ struct rtable *rt;
+
+ if (ipmr_forward_offloaded(skb, mrt, in_vifi, vifi))
+ goto out_free;
+
+ if (ipmr_prepare_xmit(net, mrt, skb, vifi))
+ goto out_free;
+
+ rt = skb_rtable(skb);
+
IPCB(skb)->flags |= IPSKB_FORWARDED;
- /*
- * RFC1584 teaches, that DVMRP/PIM router must deliver packets locally
+ /* RFC1584 teaches, that DVMRP/PIM router must deliver packets locally
* not only before forwarding, but after forwarding on all output
* interfaces. It is clear, if mrouter runs a multicasting
* program, it should receive packets not depending to what interface
@@ -1228,180 +1957,274 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
* not mrouter) cannot join to more than one interface - it will
* result in receiving multiple packets.
*/
- NF_HOOK(PF_INET, NF_IP_FORWARD, skb, skb->dev, dev,
+ NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD,
+ net, NULL, skb, skb->dev, dst_dev_rcu(&rt->dst),
ipmr_forward_finish);
return;
out_free:
kfree_skb(skb);
+}
+
+static void ipmr_queue_output_xmit(struct net *net, struct mr_table *mrt,
+ struct sk_buff *skb, int vifi)
+{
+ if (ipmr_prepare_xmit(net, mrt, skb, vifi))
+ goto out_free;
+
+ ip_mc_output(net, NULL, skb);
return;
+
+out_free:
+ kfree_skb(skb);
}
-static int ipmr_find_vif(struct net_device *dev)
+/* Called with mrt_lock or rcu_read_lock() */
+static int ipmr_find_vif(const struct mr_table *mrt, struct net_device *dev)
{
int ct;
- for (ct=maxvif-1; ct>=0; ct--) {
- if (vif_table[ct].dev == dev)
+ /* Pairs with WRITE_ONCE() in vif_delete()/vif_add() */
+ for (ct = READ_ONCE(mrt->maxvif) - 1; ct >= 0; ct--) {
+ if (rcu_access_pointer(mrt->vif_table[ct].dev) == dev)
break;
}
return ct;
}
/* "local" means that we should preserve one skb (for local delivery) */
-
-static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local)
+/* Called uner rcu_read_lock() */
+static void ip_mr_forward(struct net *net, struct mr_table *mrt,
+ struct net_device *dev, struct sk_buff *skb,
+ struct mfc_cache *c, int local)
{
+ int true_vifi = ipmr_find_vif(mrt, dev);
int psend = -1;
int vif, ct;
- vif = cache->mfc_parent;
- cache->mfc_un.res.pkt++;
- cache->mfc_un.res.bytes += skb->len;
+ vif = c->_c.mfc_parent;
+ atomic_long_inc(&c->_c.mfc_un.res.pkt);
+ atomic_long_add(skb->len, &c->_c.mfc_un.res.bytes);
+ WRITE_ONCE(c->_c.mfc_un.res.lastuse, jiffies);
- /*
- * Wrong interface: drop packet and (maybe) send PIM assert.
- */
- if (vif_table[vif].dev != skb->dev) {
- int true_vifi;
+ if (c->mfc_origin == htonl(INADDR_ANY) && true_vifi >= 0) {
+ struct mfc_cache *cache_proxy;
+
+ /* For an (*,G) entry, we only check that the incoming
+ * interface is part of the static tree.
+ */
+ cache_proxy = mr_mfc_find_any_parent(mrt, vif);
+ if (cache_proxy &&
+ cache_proxy->_c.mfc_un.res.ttls[true_vifi] < 255)
+ goto forward;
+ }
- if (((struct rtable*)skb->dst)->fl.iif == 0) {
+ /* Wrong interface: drop packet and (maybe) send PIM assert. */
+ if (rcu_access_pointer(mrt->vif_table[vif].dev) != dev) {
+ if (rt_is_output_route(skb_rtable(skb))) {
/* It is our own packet, looped back.
- Very complicated situation...
-
- The best workaround until routing daemons will be
- fixed is not to redistribute packet, if it was
- send through wrong interface. It means, that
- multicast applications WILL NOT work for
- (S,G), which have default multicast route pointing
- to wrong oif. In any case, it is not a good
- idea to use multicasting applications on router.
+ * Very complicated situation...
+ *
+ * The best workaround until routing daemons will be
+ * fixed is not to redistribute packet, if it was
+ * send through wrong interface. It means, that
+ * multicast applications WILL NOT work for
+ * (S,G), which have default multicast route pointing
+ * to wrong oif. In any case, it is not a good
+ * idea to use multicasting applications on router.
*/
goto dont_forward;
}
- cache->mfc_un.res.wrong_if++;
- true_vifi = ipmr_find_vif(skb->dev);
+ atomic_long_inc(&c->_c.mfc_un.res.wrong_if);
- if (true_vifi >= 0 && mroute_do_assert &&
+ if (true_vifi >= 0 && mrt->mroute_do_assert &&
/* pimsm uses asserts, when switching from RPT to SPT,
- so that we cannot check that packet arrived on an oif.
- It is bad, but otherwise we would need to move pretty
- large chunk of pimd to kernel. Ough... --ANK
+ * so that we cannot check that packet arrived on an oif.
+ * It is bad, but otherwise we would need to move pretty
+ * large chunk of pimd to kernel. Ough... --ANK
*/
- (mroute_do_pim || cache->mfc_un.res.ttls[true_vifi] < 255) &&
- time_after(jiffies,
- cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
- cache->mfc_un.res.last_assert = jiffies;
- ipmr_cache_report(skb, true_vifi, IGMPMSG_WRONGVIF);
+ (mrt->mroute_do_pim ||
+ c->_c.mfc_un.res.ttls[true_vifi] < 255) &&
+ time_after(jiffies,
+ c->_c.mfc_un.res.last_assert +
+ MFC_ASSERT_THRESH)) {
+ c->_c.mfc_un.res.last_assert = jiffies;
+ ipmr_cache_report(mrt, skb, true_vifi, IGMPMSG_WRONGVIF);
+ if (mrt->mroute_do_wrvifwhole)
+ ipmr_cache_report(mrt, skb, true_vifi,
+ IGMPMSG_WRVIFWHOLE);
}
goto dont_forward;
}
- vif_table[vif].pkt_in++;
- vif_table[vif].bytes_in+=skb->len;
-
- /*
- * Forward the frame
- */
- for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) {
- if (skb->nh.iph->ttl > cache->mfc_un.res.ttls[ct]) {
+forward:
+ WRITE_ONCE(mrt->vif_table[vif].pkt_in,
+ mrt->vif_table[vif].pkt_in + 1);
+ WRITE_ONCE(mrt->vif_table[vif].bytes_in,
+ mrt->vif_table[vif].bytes_in + skb->len);
+
+ /* Forward the frame */
+ if (c->mfc_origin == htonl(INADDR_ANY) &&
+ c->mfc_mcastgrp == htonl(INADDR_ANY)) {
+ if (true_vifi >= 0 &&
+ true_vifi != c->_c.mfc_parent &&
+ ip_hdr(skb)->ttl >
+ c->_c.mfc_un.res.ttls[c->_c.mfc_parent]) {
+ /* It's an (*,*) entry and the packet is not coming from
+ * the upstream: forward the packet to the upstream
+ * only.
+ */
+ psend = c->_c.mfc_parent;
+ goto last_forward;
+ }
+ goto dont_forward;
+ }
+ for (ct = c->_c.mfc_un.res.maxvif - 1;
+ ct >= c->_c.mfc_un.res.minvif; ct--) {
+ /* For (*,G) entry, don't forward to the incoming interface */
+ if ((c->mfc_origin != htonl(INADDR_ANY) ||
+ ct != true_vifi) &&
+ ip_hdr(skb)->ttl > c->_c.mfc_un.res.ttls[ct]) {
if (psend != -1) {
struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
+
if (skb2)
- ipmr_queue_xmit(skb2, cache, psend);
+ ipmr_queue_fwd_xmit(net, mrt, true_vifi,
+ skb2, psend);
}
- psend=ct;
+ psend = ct;
}
}
+last_forward:
if (psend != -1) {
if (local) {
struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
+
if (skb2)
- ipmr_queue_xmit(skb2, cache, psend);
+ ipmr_queue_fwd_xmit(net, mrt, true_vifi, skb2,
+ psend);
} else {
- ipmr_queue_xmit(skb, cache, psend);
- return 0;
+ ipmr_queue_fwd_xmit(net, mrt, true_vifi, skb, psend);
+ return;
}
}
dont_forward:
if (!local)
kfree_skb(skb);
- return 0;
}
+static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb)
+{
+ struct rtable *rt = skb_rtable(skb);
+ struct iphdr *iph = ip_hdr(skb);
+ struct flowi4 fl4 = {
+ .daddr = iph->daddr,
+ .saddr = iph->saddr,
+ .flowi4_dscp = ip4h_dscp(iph),
+ .flowi4_oif = (rt_is_output_route(rt) ?
+ skb->dev->ifindex : 0),
+ .flowi4_iif = (rt_is_output_route(rt) ?
+ LOOPBACK_IFINDEX :
+ skb->dev->ifindex),
+ .flowi4_mark = skb->mark,
+ };
+ struct mr_table *mrt;
+ int err;
-/*
- * Multicast packets for forwarding arrive here
- */
+ err = ipmr_fib_lookup(net, &fl4, &mrt);
+ if (err)
+ return ERR_PTR(err);
+ return mrt;
+}
+/* Multicast packets for forwarding arrive here
+ * Called with rcu_read_lock();
+ */
int ip_mr_input(struct sk_buff *skb)
{
struct mfc_cache *cache;
- int local = ((struct rtable*)skb->dst)->rt_flags&RTCF_LOCAL;
+ struct net *net = dev_net(skb->dev);
+ int local = skb_rtable(skb)->rt_flags & RTCF_LOCAL;
+ struct mr_table *mrt;
+ struct net_device *dev;
+
+ /* skb->dev passed in is the loX master dev for vrfs.
+ * As there are no vifs associated with loopback devices,
+ * get the proper interface that does have a vif associated with it.
+ */
+ dev = skb->dev;
+ if (netif_is_l3_master(skb->dev)) {
+ dev = dev_get_by_index_rcu(net, IPCB(skb)->iif);
+ if (!dev) {
+ kfree_skb(skb);
+ return -ENODEV;
+ }
+ }
/* Packet is looped back after forward, it should not be
- forwarded second time, but still can be delivered locally.
+ * forwarded second time, but still can be delivered locally.
*/
- if (IPCB(skb)->flags&IPSKB_FORWARDED)
+ if (IPCB(skb)->flags & IPSKB_FORWARDED)
goto dont_forward;
+ mrt = ipmr_rt_fib_lookup(net, skb);
+ if (IS_ERR(mrt)) {
+ kfree_skb(skb);
+ return PTR_ERR(mrt);
+ }
if (!local) {
- if (IPCB(skb)->opt.router_alert) {
- if (ip_call_ra_chain(skb))
- return 0;
- } else if (skb->nh.iph->protocol == IPPROTO_IGMP){
- /* IGMPv1 (and broken IGMPv2 implementations sort of
- Cisco IOS <= 11.2(8)) do not put router alert
- option to IGMP packets destined to routable
- groups. It is very bad, because it means
- that we can forward NO IGMP messages.
- */
- read_lock(&mrt_lock);
- if (mroute_socket) {
- nf_reset(skb);
- raw_rcv(mroute_socket, skb);
- read_unlock(&mrt_lock);
- return 0;
- }
- read_unlock(&mrt_lock);
- }
- }
-
- read_lock(&mrt_lock);
- cache = ipmr_cache_find(skb->nh.iph->saddr, skb->nh.iph->daddr);
-
- /*
- * No usable cache entry
- */
- if (cache==NULL) {
+ if (IPCB(skb)->opt.router_alert) {
+ if (ip_call_ra_chain(skb))
+ return 0;
+ } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP) {
+ /* IGMPv1 (and broken IGMPv2 implementations sort of
+ * Cisco IOS <= 11.2(8)) do not put router alert
+ * option to IGMP packets destined to routable
+ * groups. It is very bad, because it means
+ * that we can forward NO IGMP messages.
+ */
+ struct sock *mroute_sk;
+
+ mroute_sk = rcu_dereference(mrt->mroute_sk);
+ if (mroute_sk) {
+ nf_reset_ct(skb);
+ raw_rcv(mroute_sk, skb);
+ return 0;
+ }
+ }
+ }
+
+ /* already under rcu_read_lock() */
+ cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
+ if (!cache) {
+ int vif = ipmr_find_vif(mrt, dev);
+
+ if (vif >= 0)
+ cache = ipmr_cache_find_any(mrt, ip_hdr(skb)->daddr,
+ vif);
+ }
+
+ /* No usable cache entry */
+ if (!cache) {
int vif;
if (local) {
struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
ip_local_deliver(skb);
- if (skb2 == NULL) {
- read_unlock(&mrt_lock);
+ if (!skb2)
return -ENOBUFS;
- }
skb = skb2;
}
- vif = ipmr_find_vif(skb->dev);
- if (vif >= 0) {
- int err = ipmr_cache_unresolved(vif, skb);
- read_unlock(&mrt_lock);
-
- return err;
- }
- read_unlock(&mrt_lock);
+ vif = ipmr_find_vif(mrt, dev);
+ if (vif >= 0)
+ return ipmr_cache_unresolved(mrt, vif, skb, dev);
kfree_skb(skb);
return -ENODEV;
}
- ip_mr_forward(skb, cache, local);
-
- read_unlock(&mrt_lock);
+ ip_mr_forward(net, mrt, dev, skb, cache, local);
if (local)
return ip_local_deliver(skb);
@@ -1415,401 +2238,894 @@ dont_forward:
return 0;
}
-#ifdef CONFIG_IP_PIMSM_V1
-/*
- * Handle IGMP messages of PIMv1
- */
+static void ip_mr_output_finish(struct net *net, struct mr_table *mrt,
+ struct net_device *dev, struct sk_buff *skb,
+ struct mfc_cache *c)
+{
+ int psend = -1;
+ int ct;
+
+ atomic_long_inc(&c->_c.mfc_un.res.pkt);
+ atomic_long_add(skb->len, &c->_c.mfc_un.res.bytes);
+ WRITE_ONCE(c->_c.mfc_un.res.lastuse, jiffies);
+
+ /* Forward the frame */
+ if (c->mfc_origin == htonl(INADDR_ANY) &&
+ c->mfc_mcastgrp == htonl(INADDR_ANY)) {
+ if (ip_hdr(skb)->ttl >
+ c->_c.mfc_un.res.ttls[c->_c.mfc_parent]) {
+ /* It's an (*,*) entry and the packet is not coming from
+ * the upstream: forward the packet to the upstream
+ * only.
+ */
+ psend = c->_c.mfc_parent;
+ goto last_xmit;
+ }
+ goto dont_xmit;
+ }
+
+ for (ct = c->_c.mfc_un.res.maxvif - 1;
+ ct >= c->_c.mfc_un.res.minvif; ct--) {
+ if (ip_hdr(skb)->ttl > c->_c.mfc_un.res.ttls[ct]) {
+ if (psend != -1) {
+ struct sk_buff *skb2;
+
+ skb2 = skb_clone(skb, GFP_ATOMIC);
+ if (skb2)
+ ipmr_queue_output_xmit(net, mrt,
+ skb2, psend);
+ }
+ psend = ct;
+ }
+ }
+
+last_xmit:
+ if (psend != -1) {
+ ipmr_queue_output_xmit(net, mrt, skb, psend);
+ return;
+ }
+
+dont_xmit:
+ kfree_skb(skb);
+}
-int pim_rcv_v1(struct sk_buff * skb)
+/* Multicast packets for forwarding arrive here
+ * Called with rcu_read_lock();
+ */
+int ip_mr_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- struct igmphdr *pim;
- struct iphdr *encap;
- struct net_device *reg_dev = NULL;
+ struct rtable *rt = skb_rtable(skb);
+ struct mfc_cache *cache;
+ struct net_device *dev;
+ struct mr_table *mrt;
+ int vif;
- if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
- goto drop;
+ guard(rcu)();
- pim = (struct igmphdr*)skb->h.raw;
+ dev = dst_dev_rcu(&rt->dst);
- if (!mroute_do_pim ||
- skb->len < sizeof(*pim) + sizeof(*encap) ||
- pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
- goto drop;
+ if (IPCB(skb)->flags & IPSKB_FORWARDED)
+ goto mc_output;
+ if (!(IPCB(skb)->flags & IPSKB_MCROUTE))
+ goto mc_output;
- encap = (struct iphdr*)(skb->h.raw + sizeof(struct igmphdr));
- /*
- Check that:
- a. packet is really destinted to a multicast group
- b. packet is not a NULL-REGISTER
- c. packet is not truncated
- */
- if (!MULTICAST(encap->daddr) ||
- encap->tot_len == 0 ||
- ntohs(encap->tot_len) + sizeof(*pim) > skb->len)
- goto drop;
+ skb->dev = dev;
- read_lock(&mrt_lock);
- if (reg_vif_num >= 0)
- reg_dev = vif_table[reg_vif_num].dev;
- if (reg_dev)
- dev_hold(reg_dev);
- read_unlock(&mrt_lock);
+ mrt = ipmr_rt_fib_lookup(net, skb);
+ if (IS_ERR(mrt))
+ goto mc_output;
- if (reg_dev == NULL)
- goto drop;
+ cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
+ if (!cache) {
+ vif = ipmr_find_vif(mrt, dev);
+ if (vif >= 0)
+ cache = ipmr_cache_find_any(mrt, ip_hdr(skb)->daddr,
+ vif);
+ }
- skb->mac.raw = skb->nh.raw;
- skb_pull(skb, (u8*)encap - skb->data);
- skb->nh.iph = (struct iphdr *)skb->data;
- skb->dev = reg_dev;
- skb->protocol = htons(ETH_P_IP);
- skb->ip_summed = 0;
- skb->pkt_type = PACKET_HOST;
- dst_release(skb->dst);
- skb->dst = NULL;
- ((struct net_device_stats*)netdev_priv(reg_dev))->rx_bytes += skb->len;
- ((struct net_device_stats*)netdev_priv(reg_dev))->rx_packets++;
- nf_reset(skb);
- netif_rx(skb);
- dev_put(reg_dev);
- return 0;
- drop:
- kfree_skb(skb);
+ /* No usable cache entry */
+ if (!cache) {
+ vif = ipmr_find_vif(mrt, dev);
+ if (vif >= 0)
+ return ipmr_cache_unresolved(mrt, vif, skb, dev);
+ goto mc_output;
+ }
+
+ vif = cache->_c.mfc_parent;
+ if (rcu_access_pointer(mrt->vif_table[vif].dev) != dev)
+ goto mc_output;
+
+ ip_mr_output_finish(net, mrt, dev, skb, cache);
return 0;
+
+mc_output:
+ return ip_mc_output(net, sk, skb);
}
-#endif
-#ifdef CONFIG_IP_PIMSM_V2
-static int pim_rcv(struct sk_buff * skb)
+#ifdef CONFIG_IP_PIMSM_V1
+/* Handle IGMP messages of PIMv1 */
+int pim_rcv_v1(struct sk_buff *skb)
{
- struct pimreghdr *pim;
- struct iphdr *encap;
- struct net_device *reg_dev = NULL;
+ struct igmphdr *pim;
+ struct net *net = dev_net(skb->dev);
+ struct mr_table *mrt;
- if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
+ if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr)))
goto drop;
- pim = (struct pimreghdr*)skb->h.raw;
- if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) ||
- (pim->flags&PIM_NULL_REGISTER) ||
- (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
- csum_fold(skb_checksum(skb, 0, skb->len, 0))))
- goto drop;
+ pim = igmp_hdr(skb);
- /* check if the inner packet is destined to mcast group */
- encap = (struct iphdr*)(skb->h.raw + sizeof(struct pimreghdr));
- if (!MULTICAST(encap->daddr) ||
- encap->tot_len == 0 ||
- ntohs(encap->tot_len) + sizeof(*pim) > skb->len)
+ mrt = ipmr_rt_fib_lookup(net, skb);
+ if (IS_ERR(mrt))
goto drop;
-
- read_lock(&mrt_lock);
- if (reg_vif_num >= 0)
- reg_dev = vif_table[reg_vif_num].dev;
- if (reg_dev)
- dev_hold(reg_dev);
- read_unlock(&mrt_lock);
-
- if (reg_dev == NULL)
+ if (!mrt->mroute_do_pim ||
+ pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
goto drop;
- skb->mac.raw = skb->nh.raw;
- skb_pull(skb, (u8*)encap - skb->data);
- skb->nh.iph = (struct iphdr *)skb->data;
- skb->dev = reg_dev;
- skb->protocol = htons(ETH_P_IP);
- skb->ip_summed = 0;
- skb->pkt_type = PACKET_HOST;
- dst_release(skb->dst);
- ((struct net_device_stats*)netdev_priv(reg_dev))->rx_bytes += skb->len;
- ((struct net_device_stats*)netdev_priv(reg_dev))->rx_packets++;
- skb->dst = NULL;
- nf_reset(skb);
- netif_rx(skb);
- dev_put(reg_dev);
- return 0;
- drop:
- kfree_skb(skb);
+ if (__pim_rcv(mrt, skb, sizeof(*pim))) {
+drop:
+ kfree_skb(skb);
+ }
return 0;
}
#endif
-static int
-ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm)
+#ifdef CONFIG_IP_PIMSM_V2
+static int pim_rcv(struct sk_buff *skb)
{
- int ct;
- struct rtnexthop *nhp;
- struct net_device *dev = vif_table[c->mfc_parent].dev;
- u8 *b = skb->tail;
- struct rtattr *mp_head;
+ struct pimreghdr *pim;
+ struct net *net = dev_net(skb->dev);
+ struct mr_table *mrt;
- if (dev)
- RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex);
+ if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr)))
+ goto drop;
- mp_head = (struct rtattr*)skb_put(skb, RTA_LENGTH(0));
+ pim = (struct pimreghdr *)skb_transport_header(skb);
+ if (pim->type != ((PIM_VERSION << 4) | (PIM_TYPE_REGISTER)) ||
+ (pim->flags & PIM_NULL_REGISTER) ||
+ (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
+ csum_fold(skb_checksum(skb, 0, skb->len, 0))))
+ goto drop;
- for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
- if (c->mfc_un.res.ttls[ct] < 255) {
- if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
- goto rtattr_failure;
- nhp = (struct rtnexthop*)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
- nhp->rtnh_flags = 0;
- nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
- nhp->rtnh_ifindex = vif_table[ct].dev->ifindex;
- nhp->rtnh_len = sizeof(*nhp);
- }
+ mrt = ipmr_rt_fib_lookup(net, skb);
+ if (IS_ERR(mrt))
+ goto drop;
+ if (__pim_rcv(mrt, skb, sizeof(*pim))) {
+drop:
+ kfree_skb(skb);
}
- mp_head->rta_type = RTA_MULTIPATH;
- mp_head->rta_len = skb->tail - (u8*)mp_head;
- rtm->rtm_type = RTN_MULTICAST;
- return 1;
-
-rtattr_failure:
- skb_trim(skb, b - skb->data);
- return -EMSGSIZE;
+ return 0;
}
+#endif
-int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait)
+int ipmr_get_route(struct net *net, struct sk_buff *skb,
+ __be32 saddr, __be32 daddr,
+ struct rtmsg *rtm, u32 portid)
{
- int err;
struct mfc_cache *cache;
- struct rtable *rt = (struct rtable*)skb->dst;
+ struct mr_table *mrt;
+ int err;
+
+ rcu_read_lock();
+ mrt = __ipmr_get_table(net, RT_TABLE_DEFAULT);
+ if (!mrt) {
+ rcu_read_unlock();
+ return -ENOENT;
+ }
- read_lock(&mrt_lock);
- cache = ipmr_cache_find(rt->rt_src, rt->rt_dst);
+ cache = ipmr_cache_find(mrt, saddr, daddr);
+ if (!cache && skb->dev) {
+ int vif = ipmr_find_vif(mrt, skb->dev);
- if (cache==NULL) {
+ if (vif >= 0)
+ cache = ipmr_cache_find_any(mrt, daddr, vif);
+ }
+ if (!cache) {
struct sk_buff *skb2;
+ struct iphdr *iph;
struct net_device *dev;
- int vif;
-
- if (nowait) {
- read_unlock(&mrt_lock);
- return -EAGAIN;
- }
+ int vif = -1;
dev = skb->dev;
- if (dev == NULL || (vif = ipmr_find_vif(dev)) < 0) {
- read_unlock(&mrt_lock);
+ if (dev)
+ vif = ipmr_find_vif(mrt, dev);
+ if (vif < 0) {
+ rcu_read_unlock();
return -ENODEV;
}
- skb2 = skb_clone(skb, GFP_ATOMIC);
+
+ skb2 = skb_realloc_headroom(skb, sizeof(struct iphdr));
if (!skb2) {
- read_unlock(&mrt_lock);
+ rcu_read_unlock();
return -ENOMEM;
}
- skb2->nh.raw = skb_push(skb2, sizeof(struct iphdr));
- skb2->nh.iph->ihl = sizeof(struct iphdr)>>2;
- skb2->nh.iph->saddr = rt->rt_src;
- skb2->nh.iph->daddr = rt->rt_dst;
- skb2->nh.iph->version = 0;
- err = ipmr_cache_unresolved(vif, skb2);
- read_unlock(&mrt_lock);
+ NETLINK_CB(skb2).portid = portid;
+ skb_push(skb2, sizeof(struct iphdr));
+ skb_reset_network_header(skb2);
+ iph = ip_hdr(skb2);
+ iph->ihl = sizeof(struct iphdr) >> 2;
+ iph->saddr = saddr;
+ iph->daddr = daddr;
+ iph->version = 0;
+ err = ipmr_cache_unresolved(mrt, vif, skb2, dev);
+ rcu_read_unlock();
return err;
}
- if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY))
- cache->mfc_flags |= MFC_NOTIFY;
- err = ipmr_fill_mroute(skb, cache, rtm);
- read_unlock(&mrt_lock);
+ err = mr_fill_mroute(mrt, skb, &cache->_c, rtm);
+ rcu_read_unlock();
return err;
}
-#ifdef CONFIG_PROC_FS
-/*
- * The /proc interfaces to multicast routing /proc/ip_mr_cache /proc/ip_mr_vif
- */
-struct ipmr_vif_iter {
- int ct;
-};
+static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
+ u32 portid, u32 seq, struct mfc_cache *c, int cmd,
+ int flags)
+{
+ struct nlmsghdr *nlh;
+ struct rtmsg *rtm;
+ int err;
-static struct vif_device *ipmr_vif_seq_idx(struct ipmr_vif_iter *iter,
- loff_t pos)
+ nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rtm), flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ rtm = nlmsg_data(nlh);
+ rtm->rtm_family = RTNL_FAMILY_IPMR;
+ rtm->rtm_dst_len = 32;
+ rtm->rtm_src_len = 32;
+ rtm->rtm_tos = 0;
+ rtm->rtm_table = mrt->id;
+ if (nla_put_u32(skb, RTA_TABLE, mrt->id))
+ goto nla_put_failure;
+ rtm->rtm_type = RTN_MULTICAST;
+ rtm->rtm_scope = RT_SCOPE_UNIVERSE;
+ if (c->_c.mfc_flags & MFC_STATIC)
+ rtm->rtm_protocol = RTPROT_STATIC;
+ else
+ rtm->rtm_protocol = RTPROT_MROUTED;
+ rtm->rtm_flags = 0;
+
+ if (nla_put_in_addr(skb, RTA_SRC, c->mfc_origin) ||
+ nla_put_in_addr(skb, RTA_DST, c->mfc_mcastgrp))
+ goto nla_put_failure;
+ err = mr_fill_mroute(mrt, skb, &c->_c, rtm);
+ /* do not break the dump if cache is unresolved */
+ if (err < 0 && err != -ENOENT)
+ goto nla_put_failure;
+
+ nlmsg_end(skb, nlh);
+ return 0;
+
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+static int _ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
+ u32 portid, u32 seq, struct mr_mfc *c, int cmd,
+ int flags)
{
- for (iter->ct = 0; iter->ct < maxvif; ++iter->ct) {
- if(!VIF_EXISTS(iter->ct))
- continue;
- if (pos-- == 0)
- return &vif_table[iter->ct];
- }
- return NULL;
+ return ipmr_fill_mroute(mrt, skb, portid, seq, (struct mfc_cache *)c,
+ cmd, flags);
}
-static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
+static size_t mroute_msgsize(bool unresolved, int maxvif)
{
- read_lock(&mrt_lock);
- return *pos ? ipmr_vif_seq_idx(seq->private, *pos - 1)
- : SEQ_START_TOKEN;
+ size_t len =
+ NLMSG_ALIGN(sizeof(struct rtmsg))
+ + nla_total_size(4) /* RTA_TABLE */
+ + nla_total_size(4) /* RTA_SRC */
+ + nla_total_size(4) /* RTA_DST */
+ ;
+
+ if (!unresolved)
+ len = len
+ + nla_total_size(4) /* RTA_IIF */
+ + nla_total_size(0) /* RTA_MULTIPATH */
+ + maxvif * NLA_ALIGN(sizeof(struct rtnexthop))
+ /* RTA_MFC_STATS */
+ + nla_total_size_64bit(sizeof(struct rta_mfc_stats))
+ ;
+
+ return len;
}
-static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc,
+ int cmd)
{
- struct ipmr_vif_iter *iter = seq->private;
+ struct net *net = read_pnet(&mrt->net);
+ struct sk_buff *skb;
+ int err = -ENOBUFS;
- ++*pos;
- if (v == SEQ_START_TOKEN)
- return ipmr_vif_seq_idx(iter, 0);
-
- while (++iter->ct < maxvif) {
- if(!VIF_EXISTS(iter->ct))
- continue;
- return &vif_table[iter->ct];
- }
- return NULL;
+ skb = nlmsg_new(mroute_msgsize(mfc->_c.mfc_parent >= MAXVIFS,
+ mrt->maxvif),
+ GFP_ATOMIC);
+ if (!skb)
+ goto errout;
+
+ err = ipmr_fill_mroute(mrt, skb, 0, 0, mfc, cmd, 0);
+ if (err < 0)
+ goto errout;
+
+ rtnl_notify(skb, net, 0, RTNLGRP_IPV4_MROUTE, NULL, GFP_ATOMIC);
+ return;
+
+errout:
+ kfree_skb(skb);
+ rtnl_set_sk_err(net, RTNLGRP_IPV4_MROUTE, err);
}
-static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
+static size_t igmpmsg_netlink_msgsize(size_t payloadlen)
{
- read_unlock(&mrt_lock);
+ size_t len =
+ NLMSG_ALIGN(sizeof(struct rtgenmsg))
+ + nla_total_size(1) /* IPMRA_CREPORT_MSGTYPE */
+ + nla_total_size(4) /* IPMRA_CREPORT_VIF_ID */
+ + nla_total_size(4) /* IPMRA_CREPORT_SRC_ADDR */
+ + nla_total_size(4) /* IPMRA_CREPORT_DST_ADDR */
+ + nla_total_size(4) /* IPMRA_CREPORT_TABLE */
+ /* IPMRA_CREPORT_PKT */
+ + nla_total_size(payloadlen)
+ ;
+
+ return len;
}
-static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
+static void igmpmsg_netlink_event(const struct mr_table *mrt, struct sk_buff *pkt)
{
- if (v == SEQ_START_TOKEN) {
- seq_puts(seq,
- "Interface BytesIn PktsIn BytesOut PktsOut Flags Local Remote\n");
- } else {
- const struct vif_device *vif = v;
- const char *name = vif->dev ? vif->dev->name : "none";
+ struct net *net = read_pnet(&mrt->net);
+ struct nlmsghdr *nlh;
+ struct rtgenmsg *rtgenm;
+ struct igmpmsg *msg;
+ struct sk_buff *skb;
+ struct nlattr *nla;
+ int payloadlen;
+
+ payloadlen = pkt->len - sizeof(struct igmpmsg);
+ msg = (struct igmpmsg *)skb_network_header(pkt);
+
+ skb = nlmsg_new(igmpmsg_netlink_msgsize(payloadlen), GFP_ATOMIC);
+ if (!skb)
+ goto errout;
+
+ nlh = nlmsg_put(skb, 0, 0, RTM_NEWCACHEREPORT,
+ sizeof(struct rtgenmsg), 0);
+ if (!nlh)
+ goto errout;
+ rtgenm = nlmsg_data(nlh);
+ rtgenm->rtgen_family = RTNL_FAMILY_IPMR;
+ if (nla_put_u8(skb, IPMRA_CREPORT_MSGTYPE, msg->im_msgtype) ||
+ nla_put_u32(skb, IPMRA_CREPORT_VIF_ID, msg->im_vif | (msg->im_vif_hi << 8)) ||
+ nla_put_in_addr(skb, IPMRA_CREPORT_SRC_ADDR,
+ msg->im_src.s_addr) ||
+ nla_put_in_addr(skb, IPMRA_CREPORT_DST_ADDR,
+ msg->im_dst.s_addr) ||
+ nla_put_u32(skb, IPMRA_CREPORT_TABLE, mrt->id))
+ goto nla_put_failure;
+
+ nla = nla_reserve(skb, IPMRA_CREPORT_PKT, payloadlen);
+ if (!nla || skb_copy_bits(pkt, sizeof(struct igmpmsg),
+ nla_data(nla), payloadlen))
+ goto nla_put_failure;
+
+ nlmsg_end(skb, nlh);
+
+ rtnl_notify(skb, net, 0, RTNLGRP_IPV4_MROUTE_R, NULL, GFP_ATOMIC);
+ return;
- seq_printf(seq,
- "%2Zd %-10s %8ld %7ld %8ld %7ld %05X %08X %08X\n",
- vif - vif_table,
- name, vif->bytes_in, vif->pkt_in,
- vif->bytes_out, vif->pkt_out,
- vif->flags, vif->local, vif->remote);
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+errout:
+ kfree_skb(skb);
+ rtnl_set_sk_err(net, RTNLGRP_IPV4_MROUTE_R, -ENOBUFS);
+}
+
+static int ipmr_rtm_valid_getroute_req(struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ struct nlattr **tb,
+ struct netlink_ext_ack *extack)
+{
+ struct rtmsg *rtm;
+ int i, err;
+
+ rtm = nlmsg_payload(nlh, sizeof(*rtm));
+ if (!rtm) {
+ NL_SET_ERR_MSG(extack, "ipv4: Invalid header for multicast route get request");
+ return -EINVAL;
+ }
+
+ if (!netlink_strict_get_check(skb))
+ return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
+ rtm_ipv4_policy, extack);
+
+ if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
+ (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
+ rtm->rtm_tos || rtm->rtm_table || rtm->rtm_protocol ||
+ rtm->rtm_scope || rtm->rtm_type || rtm->rtm_flags) {
+ NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for multicast route get request");
+ return -EINVAL;
+ }
+
+ err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
+ rtm_ipv4_policy, extack);
+ if (err)
+ return err;
+
+ if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
+ (tb[RTA_DST] && !rtm->rtm_dst_len)) {
+ NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
+ return -EINVAL;
+ }
+
+ for (i = 0; i <= RTA_MAX; i++) {
+ if (!tb[i])
+ continue;
+
+ switch (i) {
+ case RTA_SRC:
+ case RTA_DST:
+ case RTA_TABLE:
+ break;
+ default:
+ NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in multicast route get request");
+ return -EINVAL;
+ }
}
+
return 0;
}
-static struct seq_operations ipmr_vif_seq_ops = {
- .start = ipmr_vif_seq_start,
- .next = ipmr_vif_seq_next,
- .stop = ipmr_vif_seq_stop,
- .show = ipmr_vif_seq_show,
-};
+static int ipmr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(in_skb->sk);
+ struct nlattr *tb[RTA_MAX + 1];
+ struct sk_buff *skb = NULL;
+ struct mfc_cache *cache;
+ struct mr_table *mrt;
+ __be32 src, grp;
+ u32 tableid;
+ int err;
+
+ err = ipmr_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
+ if (err < 0)
+ goto errout;
+
+ src = nla_get_in_addr_default(tb[RTA_SRC], 0);
+ grp = nla_get_in_addr_default(tb[RTA_DST], 0);
+ tableid = nla_get_u32_default(tb[RTA_TABLE], 0);
+
+ mrt = __ipmr_get_table(net, tableid ? tableid : RT_TABLE_DEFAULT);
+ if (!mrt) {
+ err = -ENOENT;
+ goto errout_free;
+ }
-static int ipmr_vif_open(struct inode *inode, struct file *file)
+ /* entries are added/deleted only under RTNL */
+ rcu_read_lock();
+ cache = ipmr_cache_find(mrt, src, grp);
+ rcu_read_unlock();
+ if (!cache) {
+ err = -ENOENT;
+ goto errout_free;
+ }
+
+ skb = nlmsg_new(mroute_msgsize(false, mrt->maxvif), GFP_KERNEL);
+ if (!skb) {
+ err = -ENOBUFS;
+ goto errout_free;
+ }
+
+ err = ipmr_fill_mroute(mrt, skb, NETLINK_CB(in_skb).portid,
+ nlh->nlmsg_seq, cache,
+ RTM_NEWROUTE, 0);
+ if (err < 0)
+ goto errout_free;
+
+ err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
+
+errout:
+ return err;
+
+errout_free:
+ kfree_skb(skb);
+ goto errout;
+}
+
+static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
{
- struct seq_file *seq;
- int rc = -ENOMEM;
- struct ipmr_vif_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
-
- if (!s)
- goto out;
+ struct fib_dump_filter filter = {
+ .rtnl_held = true,
+ };
+ int err;
- rc = seq_open(file, &ipmr_vif_seq_ops);
- if (rc)
- goto out_kfree;
+ if (cb->strict_check) {
+ err = ip_valid_fib_dump_req(sock_net(skb->sk), cb->nlh,
+ &filter, cb);
+ if (err < 0)
+ return err;
+ }
- s->ct = 0;
- seq = file->private_data;
- seq->private = s;
-out:
- return rc;
-out_kfree:
- kfree(s);
- goto out;
+ if (filter.table_id) {
+ struct mr_table *mrt;
+
+ mrt = __ipmr_get_table(sock_net(skb->sk), filter.table_id);
+ if (!mrt) {
+ if (rtnl_msg_family(cb->nlh) != RTNL_FAMILY_IPMR)
+ return skb->len;
+ NL_SET_ERR_MSG(cb->extack, "ipv4: MR table does not exist");
+ return -ENOENT;
+ }
+ err = mr_table_dump(mrt, skb, cb, _ipmr_fill_mroute,
+ &mfc_unres_lock, &filter);
+ return skb->len ? : err;
+ }
+
+ return mr_rtm_dumproute(skb, cb, ipmr_mr_table_iter,
+ _ipmr_fill_mroute, &mfc_unres_lock, &filter);
}
-static struct file_operations ipmr_vif_fops = {
- .owner = THIS_MODULE,
- .open = ipmr_vif_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release_private,
+static const struct nla_policy rtm_ipmr_policy[RTA_MAX + 1] = {
+ [RTA_SRC] = { .type = NLA_U32 },
+ [RTA_DST] = { .type = NLA_U32 },
+ [RTA_IIF] = { .type = NLA_U32 },
+ [RTA_TABLE] = { .type = NLA_U32 },
+ [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
};
-struct ipmr_mfc_iter {
- struct mfc_cache **cache;
- int ct;
-};
+static bool ipmr_rtm_validate_proto(unsigned char rtm_protocol)
+{
+ switch (rtm_protocol) {
+ case RTPROT_STATIC:
+ case RTPROT_MROUTED:
+ return true;
+ }
+ return false;
+}
+static int ipmr_nla_get_ttls(const struct nlattr *nla, struct mfcctl *mfcc)
+{
+ struct rtnexthop *rtnh = nla_data(nla);
+ int remaining = nla_len(nla), vifi = 0;
-static struct mfc_cache *ipmr_mfc_seq_idx(struct ipmr_mfc_iter *it, loff_t pos)
+ while (rtnh_ok(rtnh, remaining)) {
+ mfcc->mfcc_ttls[vifi] = rtnh->rtnh_hops;
+ if (++vifi == MAXVIFS)
+ break;
+ rtnh = rtnh_next(rtnh, &remaining);
+ }
+
+ return remaining > 0 ? -EINVAL : vifi;
+}
+
+/* returns < 0 on error, 0 for ADD_MFC and 1 for ADD_MFC_PROXY */
+static int rtm_to_ipmr_mfcc(struct net *net, struct nlmsghdr *nlh,
+ struct mfcctl *mfcc, int *mrtsock,
+ struct mr_table **mrtret,
+ struct netlink_ext_ack *extack)
{
- struct mfc_cache *mfc;
+ struct net_device *dev = NULL;
+ u32 tblid = RT_TABLE_DEFAULT;
+ struct mr_table *mrt;
+ struct nlattr *attr;
+ struct rtmsg *rtm;
+ int ret, rem;
+
+ ret = nlmsg_validate_deprecated(nlh, sizeof(*rtm), RTA_MAX,
+ rtm_ipmr_policy, extack);
+ if (ret < 0)
+ goto out;
+ rtm = nlmsg_data(nlh);
- it->cache = mfc_cache_array;
- read_lock(&mrt_lock);
- for (it->ct = 0; it->ct < MFC_LINES; it->ct++)
- for(mfc = mfc_cache_array[it->ct]; mfc; mfc = mfc->next)
- if (pos-- == 0)
- return mfc;
- read_unlock(&mrt_lock);
+ ret = -EINVAL;
+ if (rtm->rtm_family != RTNL_FAMILY_IPMR || rtm->rtm_dst_len != 32 ||
+ rtm->rtm_type != RTN_MULTICAST ||
+ rtm->rtm_scope != RT_SCOPE_UNIVERSE ||
+ !ipmr_rtm_validate_proto(rtm->rtm_protocol))
+ goto out;
- it->cache = &mfc_unres_queue;
- spin_lock_bh(&mfc_unres_lock);
- for(mfc = mfc_unres_queue; mfc; mfc = mfc->next)
- if (pos-- == 0)
- return mfc;
- spin_unlock_bh(&mfc_unres_lock);
+ memset(mfcc, 0, sizeof(*mfcc));
+ mfcc->mfcc_parent = -1;
+ ret = 0;
+ nlmsg_for_each_attr(attr, nlh, sizeof(struct rtmsg), rem) {
+ switch (nla_type(attr)) {
+ case RTA_SRC:
+ mfcc->mfcc_origin.s_addr = nla_get_be32(attr);
+ break;
+ case RTA_DST:
+ mfcc->mfcc_mcastgrp.s_addr = nla_get_be32(attr);
+ break;
+ case RTA_IIF:
+ dev = __dev_get_by_index(net, nla_get_u32(attr));
+ if (!dev) {
+ ret = -ENODEV;
+ goto out;
+ }
+ break;
+ case RTA_MULTIPATH:
+ if (ipmr_nla_get_ttls(attr, mfcc) < 0) {
+ ret = -EINVAL;
+ goto out;
+ }
+ break;
+ case RTA_PREFSRC:
+ ret = 1;
+ break;
+ case RTA_TABLE:
+ tblid = nla_get_u32(attr);
+ break;
+ }
+ }
+ mrt = __ipmr_get_table(net, tblid);
+ if (!mrt) {
+ ret = -ENOENT;
+ goto out;
+ }
+ *mrtret = mrt;
+ *mrtsock = rtm->rtm_protocol == RTPROT_MROUTED ? 1 : 0;
+ if (dev)
+ mfcc->mfcc_parent = ipmr_find_vif(mrt, dev);
- it->cache = NULL;
- return NULL;
+out:
+ return ret;
}
+/* takes care of both newroute and delroute */
+static int ipmr_rtm_route(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(skb->sk);
+ int ret, mrtsock, parent;
+ struct mr_table *tbl;
+ struct mfcctl mfcc;
+
+ mrtsock = 0;
+ tbl = NULL;
+ ret = rtm_to_ipmr_mfcc(net, nlh, &mfcc, &mrtsock, &tbl, extack);
+ if (ret < 0)
+ return ret;
+
+ parent = ret ? mfcc.mfcc_parent : -1;
+ if (nlh->nlmsg_type == RTM_NEWROUTE)
+ return ipmr_mfc_add(net, tbl, &mfcc, mrtsock, parent);
+ else
+ return ipmr_mfc_delete(tbl, &mfcc, parent);
+}
-static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
+static bool ipmr_fill_table(struct mr_table *mrt, struct sk_buff *skb)
{
- struct ipmr_mfc_iter *it = seq->private;
- it->cache = NULL;
- it->ct = 0;
- return *pos ? ipmr_mfc_seq_idx(seq->private, *pos - 1)
- : SEQ_START_TOKEN;
+ u32 queue_len = atomic_read(&mrt->cache_resolve_queue_len);
+
+ if (nla_put_u32(skb, IPMRA_TABLE_ID, mrt->id) ||
+ nla_put_u32(skb, IPMRA_TABLE_CACHE_RES_QUEUE_LEN, queue_len) ||
+ nla_put_s32(skb, IPMRA_TABLE_MROUTE_REG_VIF_NUM,
+ mrt->mroute_reg_vif_num) ||
+ nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_ASSERT,
+ mrt->mroute_do_assert) ||
+ nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_PIM, mrt->mroute_do_pim) ||
+ nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_WRVIFWHOLE,
+ mrt->mroute_do_wrvifwhole))
+ return false;
+
+ return true;
}
-static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+static bool ipmr_fill_vif(struct mr_table *mrt, u32 vifid, struct sk_buff *skb)
{
- struct mfc_cache *mfc = v;
- struct ipmr_mfc_iter *it = seq->private;
+ struct net_device *vif_dev;
+ struct nlattr *vif_nest;
+ struct vif_device *vif;
- ++*pos;
+ vif = &mrt->vif_table[vifid];
+ vif_dev = rtnl_dereference(vif->dev);
+ /* if the VIF doesn't exist just continue */
+ if (!vif_dev)
+ return true;
+
+ vif_nest = nla_nest_start_noflag(skb, IPMRA_VIF);
+ if (!vif_nest)
+ return false;
+
+ if (nla_put_u32(skb, IPMRA_VIFA_IFINDEX, vif_dev->ifindex) ||
+ nla_put_u32(skb, IPMRA_VIFA_VIF_ID, vifid) ||
+ nla_put_u16(skb, IPMRA_VIFA_FLAGS, vif->flags) ||
+ nla_put_u64_64bit(skb, IPMRA_VIFA_BYTES_IN, vif->bytes_in,
+ IPMRA_VIFA_PAD) ||
+ nla_put_u64_64bit(skb, IPMRA_VIFA_BYTES_OUT, vif->bytes_out,
+ IPMRA_VIFA_PAD) ||
+ nla_put_u64_64bit(skb, IPMRA_VIFA_PACKETS_IN, vif->pkt_in,
+ IPMRA_VIFA_PAD) ||
+ nla_put_u64_64bit(skb, IPMRA_VIFA_PACKETS_OUT, vif->pkt_out,
+ IPMRA_VIFA_PAD) ||
+ nla_put_be32(skb, IPMRA_VIFA_LOCAL_ADDR, vif->local) ||
+ nla_put_be32(skb, IPMRA_VIFA_REMOTE_ADDR, vif->remote)) {
+ nla_nest_cancel(skb, vif_nest);
+ return false;
+ }
+ nla_nest_end(skb, vif_nest);
- if (v == SEQ_START_TOKEN)
- return ipmr_mfc_seq_idx(seq->private, 0);
+ return true;
+}
- if (mfc->next)
- return mfc->next;
-
- if (it->cache == &mfc_unres_queue)
- goto end_of_list;
+static int ipmr_valid_dumplink(const struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct ifinfomsg *ifm;
- BUG_ON(it->cache != mfc_cache_array);
+ ifm = nlmsg_payload(nlh, sizeof(*ifm));
+ if (!ifm) {
+ NL_SET_ERR_MSG(extack, "ipv4: Invalid header for ipmr link dump");
+ return -EINVAL;
+ }
- while (++it->ct < MFC_LINES) {
- mfc = mfc_cache_array[it->ct];
- if (mfc)
- return mfc;
+ if (nlmsg_attrlen(nlh, sizeof(*ifm))) {
+ NL_SET_ERR_MSG(extack, "Invalid data after header in ipmr link dump");
+ return -EINVAL;
}
- /* exhausted cache_array, show unresolved */
- read_unlock(&mrt_lock);
- it->cache = &mfc_unres_queue;
- it->ct = 0;
-
- spin_lock_bh(&mfc_unres_lock);
- mfc = mfc_unres_queue;
- if (mfc)
- return mfc;
+ if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags ||
+ ifm->ifi_change || ifm->ifi_index) {
+ NL_SET_ERR_MSG(extack, "Invalid values in header for ipmr link dump request");
+ return -EINVAL;
+ }
- end_of_list:
- spin_unlock_bh(&mfc_unres_lock);
- it->cache = NULL;
+ return 0;
+}
- return NULL;
+static int ipmr_rtm_dumplink(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ struct nlmsghdr *nlh = NULL;
+ unsigned int t = 0, s_t;
+ unsigned int e = 0, s_e;
+ struct mr_table *mrt;
+
+ if (cb->strict_check) {
+ int err = ipmr_valid_dumplink(cb->nlh, cb->extack);
+
+ if (err < 0)
+ return err;
+ }
+
+ s_t = cb->args[0];
+ s_e = cb->args[1];
+
+ ipmr_for_each_table(mrt, net) {
+ struct nlattr *vifs, *af;
+ struct ifinfomsg *hdr;
+ u32 i;
+
+ if (t < s_t)
+ goto skip_table;
+ nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, RTM_NEWLINK,
+ sizeof(*hdr), NLM_F_MULTI);
+ if (!nlh)
+ break;
+
+ hdr = nlmsg_data(nlh);
+ memset(hdr, 0, sizeof(*hdr));
+ hdr->ifi_family = RTNL_FAMILY_IPMR;
+
+ af = nla_nest_start_noflag(skb, IFLA_AF_SPEC);
+ if (!af) {
+ nlmsg_cancel(skb, nlh);
+ goto out;
+ }
+
+ if (!ipmr_fill_table(mrt, skb)) {
+ nlmsg_cancel(skb, nlh);
+ goto out;
+ }
+
+ vifs = nla_nest_start_noflag(skb, IPMRA_TABLE_VIFS);
+ if (!vifs) {
+ nla_nest_end(skb, af);
+ nlmsg_end(skb, nlh);
+ goto out;
+ }
+ for (i = 0; i < mrt->maxvif; i++) {
+ if (e < s_e)
+ goto skip_entry;
+ if (!ipmr_fill_vif(mrt, i, skb)) {
+ nla_nest_end(skb, vifs);
+ nla_nest_end(skb, af);
+ nlmsg_end(skb, nlh);
+ goto out;
+ }
+skip_entry:
+ e++;
+ }
+ s_e = 0;
+ e = 0;
+ nla_nest_end(skb, vifs);
+ nla_nest_end(skb, af);
+ nlmsg_end(skb, nlh);
+skip_table:
+ t++;
+ }
+
+out:
+ cb->args[1] = e;
+ cb->args[0] = t;
+
+ return skb->len;
+}
+
+#ifdef CONFIG_PROC_FS
+/* The /proc interfaces to multicast routing :
+ * /proc/net/ip_mr_cache & /proc/net/ip_mr_vif
+ */
+
+static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(RCU)
+{
+ struct mr_vif_iter *iter = seq->private;
+ struct net *net = seq_file_net(seq);
+ struct mr_table *mrt;
+
+ rcu_read_lock();
+ mrt = __ipmr_get_table(net, RT_TABLE_DEFAULT);
+ if (!mrt) {
+ rcu_read_unlock();
+ return ERR_PTR(-ENOENT);
+ }
+
+ iter->mrt = mrt;
+
+ return mr_vif_seq_start(seq, pos);
}
-static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
+static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
+ __releases(RCU)
+{
+ rcu_read_unlock();
+}
+
+static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
{
- struct ipmr_mfc_iter *it = seq->private;
+ struct mr_vif_iter *iter = seq->private;
+ struct mr_table *mrt = iter->mrt;
- if (it->cache == &mfc_unres_queue)
- spin_unlock_bh(&mfc_unres_lock);
- else if (it->cache == mfc_cache_array)
- read_unlock(&mrt_lock);
+ if (v == SEQ_START_TOKEN) {
+ seq_puts(seq,
+ "Interface BytesIn PktsIn BytesOut PktsOut Flags Local Remote\n");
+ } else {
+ const struct vif_device *vif = v;
+ const struct net_device *vif_dev;
+ const char *name;
+
+ vif_dev = vif_dev_read(vif);
+ name = vif_dev ? vif_dev->name : "none";
+ seq_printf(seq,
+ "%2td %-10s %8ld %7ld %8ld %7ld %05X %08X %08X\n",
+ vif - mrt->vif_table,
+ name, vif->bytes_in, vif->pkt_in,
+ vif->bytes_out, vif->pkt_out,
+ vif->flags, vif->local, vif->remote);
+ }
+ return 0;
+}
+
+static const struct seq_operations ipmr_vif_seq_ops = {
+ .start = ipmr_vif_seq_start,
+ .next = mr_vif_seq_next,
+ .stop = ipmr_vif_seq_stop,
+ .show = ipmr_vif_seq_show,
+};
+
+static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct net *net = seq_file_net(seq);
+ struct mr_table *mrt;
+
+ mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
+ if (!mrt)
+ return ERR_PTR(-ENOENT);
+
+ return mr_mfc_seq_start(seq, pos, mrt, &mfc_unres_lock);
}
static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
@@ -1817,96 +3133,200 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
int n;
if (v == SEQ_START_TOKEN) {
- seq_puts(seq,
+ seq_puts(seq,
"Group Origin Iif Pkts Bytes Wrong Oifs\n");
} else {
const struct mfc_cache *mfc = v;
- const struct ipmr_mfc_iter *it = seq->private;
-
- seq_printf(seq, "%08lX %08lX %-3d %8ld %8ld %8ld",
- (unsigned long) mfc->mfc_mcastgrp,
- (unsigned long) mfc->mfc_origin,
- mfc->mfc_parent,
- mfc->mfc_un.res.pkt,
- mfc->mfc_un.res.bytes,
- mfc->mfc_un.res.wrong_if);
-
- if (it->cache != &mfc_unres_queue) {
- for(n = mfc->mfc_un.res.minvif;
- n < mfc->mfc_un.res.maxvif; n++ ) {
- if(VIF_EXISTS(n)
- && mfc->mfc_un.res.ttls[n] < 255)
- seq_printf(seq,
- " %2d:%-3d",
- n, mfc->mfc_un.res.ttls[n]);
+ const struct mr_mfc_iter *it = seq->private;
+ const struct mr_table *mrt = it->mrt;
+
+ seq_printf(seq, "%08X %08X %-3hd",
+ (__force u32) mfc->mfc_mcastgrp,
+ (__force u32) mfc->mfc_origin,
+ mfc->_c.mfc_parent);
+
+ if (it->cache != &mrt->mfc_unres_queue) {
+ seq_printf(seq, " %8lu %8lu %8lu",
+ atomic_long_read(&mfc->_c.mfc_un.res.pkt),
+ atomic_long_read(&mfc->_c.mfc_un.res.bytes),
+ atomic_long_read(&mfc->_c.mfc_un.res.wrong_if));
+ for (n = mfc->_c.mfc_un.res.minvif;
+ n < mfc->_c.mfc_un.res.maxvif; n++) {
+ if (VIF_EXISTS(mrt, n) &&
+ mfc->_c.mfc_un.res.ttls[n] < 255)
+ seq_printf(seq,
+ " %2d:%-3d",
+ n, mfc->_c.mfc_un.res.ttls[n]);
}
+ } else {
+ /* unresolved mfc_caches don't contain
+ * pkt, bytes and wrong_if values
+ */
+ seq_printf(seq, " %8lu %8lu %8lu", 0ul, 0ul, 0ul);
}
seq_putc(seq, '\n');
}
return 0;
}
-static struct seq_operations ipmr_mfc_seq_ops = {
+static const struct seq_operations ipmr_mfc_seq_ops = {
.start = ipmr_mfc_seq_start,
- .next = ipmr_mfc_seq_next,
- .stop = ipmr_mfc_seq_stop,
+ .next = mr_mfc_seq_next,
+ .stop = mr_mfc_seq_stop,
.show = ipmr_mfc_seq_show,
};
+#endif
-static int ipmr_mfc_open(struct inode *inode, struct file *file)
+#ifdef CONFIG_IP_PIMSM_V2
+static const struct net_protocol pim_protocol = {
+ .handler = pim_rcv,
+};
+#endif
+
+static unsigned int ipmr_seq_read(const struct net *net)
{
- struct seq_file *seq;
- int rc = -ENOMEM;
- struct ipmr_mfc_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
-
- if (!s)
- goto out;
+ return READ_ONCE(net->ipv4.ipmr_seq) + ipmr_rules_seq_read(net);
+}
- rc = seq_open(file, &ipmr_mfc_seq_ops);
- if (rc)
- goto out_kfree;
+static int ipmr_dump(struct net *net, struct notifier_block *nb,
+ struct netlink_ext_ack *extack)
+{
+ return mr_dump(net, nb, RTNL_FAMILY_IPMR, ipmr_rules_dump,
+ ipmr_mr_table_iter, extack);
+}
- seq = file->private_data;
- seq->private = s;
-out:
- return rc;
-out_kfree:
- kfree(s);
- goto out;
+static const struct fib_notifier_ops ipmr_notifier_ops_template = {
+ .family = RTNL_FAMILY_IPMR,
+ .fib_seq_read = ipmr_seq_read,
+ .fib_dump = ipmr_dump,
+ .owner = THIS_MODULE,
+};
+
+static int __net_init ipmr_notifier_init(struct net *net)
+{
+ struct fib_notifier_ops *ops;
+
+ net->ipv4.ipmr_seq = 0;
+
+ ops = fib_notifier_ops_register(&ipmr_notifier_ops_template, net);
+ if (IS_ERR(ops))
+ return PTR_ERR(ops);
+ net->ipv4.ipmr_notifier_ops = ops;
+ return 0;
+}
+
+static void __net_exit ipmr_notifier_exit(struct net *net)
+{
+ fib_notifier_ops_unregister(net->ipv4.ipmr_notifier_ops);
+ net->ipv4.ipmr_notifier_ops = NULL;
}
-static struct file_operations ipmr_mfc_fops = {
- .owner = THIS_MODULE,
- .open = ipmr_mfc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release_private,
+/* Setup for IP multicast routing */
+static int __net_init ipmr_net_init(struct net *net)
+{
+ int err;
+
+ err = ipmr_notifier_init(net);
+ if (err)
+ goto ipmr_notifier_fail;
+
+ err = ipmr_rules_init(net);
+ if (err < 0)
+ goto ipmr_rules_fail;
+
+#ifdef CONFIG_PROC_FS
+ err = -ENOMEM;
+ if (!proc_create_net("ip_mr_vif", 0, net->proc_net, &ipmr_vif_seq_ops,
+ sizeof(struct mr_vif_iter)))
+ goto proc_vif_fail;
+ if (!proc_create_net("ip_mr_cache", 0, net->proc_net, &ipmr_mfc_seq_ops,
+ sizeof(struct mr_mfc_iter)))
+ goto proc_cache_fail;
+#endif
+ return 0;
+
+#ifdef CONFIG_PROC_FS
+proc_cache_fail:
+ remove_proc_entry("ip_mr_vif", net->proc_net);
+proc_vif_fail:
+ rtnl_lock();
+ ipmr_rules_exit(net);
+ rtnl_unlock();
+#endif
+ipmr_rules_fail:
+ ipmr_notifier_exit(net);
+ipmr_notifier_fail:
+ return err;
+}
+
+static void __net_exit ipmr_net_exit(struct net *net)
+{
+#ifdef CONFIG_PROC_FS
+ remove_proc_entry("ip_mr_cache", net->proc_net);
+ remove_proc_entry("ip_mr_vif", net->proc_net);
+#endif
+ ipmr_notifier_exit(net);
+}
+
+static void __net_exit ipmr_net_exit_batch(struct list_head *net_list)
+{
+ struct net *net;
+
+ rtnl_lock();
+ list_for_each_entry(net, net_list, exit_list)
+ ipmr_rules_exit(net);
+ rtnl_unlock();
+}
+
+static struct pernet_operations ipmr_net_ops = {
+ .init = ipmr_net_init,
+ .exit = ipmr_net_exit,
+ .exit_batch = ipmr_net_exit_batch,
};
-#endif
-#ifdef CONFIG_IP_PIMSM_V2
-static struct net_protocol pim_protocol = {
- .handler = pim_rcv,
+static const struct rtnl_msg_handler ipmr_rtnl_msg_handlers[] __initconst = {
+ {.protocol = RTNL_FAMILY_IPMR, .msgtype = RTM_GETLINK,
+ .dumpit = ipmr_rtm_dumplink},
+ {.protocol = RTNL_FAMILY_IPMR, .msgtype = RTM_NEWROUTE,
+ .doit = ipmr_rtm_route},
+ {.protocol = RTNL_FAMILY_IPMR, .msgtype = RTM_DELROUTE,
+ .doit = ipmr_rtm_route},
+ {.protocol = RTNL_FAMILY_IPMR, .msgtype = RTM_GETROUTE,
+ .doit = ipmr_rtm_getroute, .dumpit = ipmr_rtm_dumproute},
};
+
+int __init ip_mr_init(void)
+{
+ int err;
+
+ mrt_cachep = KMEM_CACHE(mfc_cache, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
+
+ err = register_pernet_subsys(&ipmr_net_ops);
+ if (err)
+ goto reg_pernet_fail;
+
+ err = register_netdevice_notifier(&ip_mr_notifier);
+ if (err)
+ goto reg_notif_fail;
+#ifdef CONFIG_IP_PIMSM_V2
+ if (inet_add_protocol(&pim_protocol, IPPROTO_PIM) < 0) {
+ pr_err("%s: can't add PIM protocol\n", __func__);
+ err = -EAGAIN;
+ goto add_proto_fail;
+ }
#endif
+ rtnl_register_many(ipmr_rtnl_msg_handlers);
+ return 0;
-/*
- * Setup for IP multicast routing
- */
-
-void __init ip_mr_init(void)
-{
- mrt_cachep = kmem_cache_create("ip_mrt_cache",
- sizeof(struct mfc_cache),
- 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
- NULL, NULL);
- init_timer(&ipmr_expire_timer);
- ipmr_expire_timer.function=ipmr_expire_process;
- register_netdevice_notifier(&ip_mr_notifier);
-#ifdef CONFIG_PROC_FS
- proc_net_fops_create("ip_mr_vif", 0, &ipmr_vif_fops);
- proc_net_fops_create("ip_mr_cache", 0, &ipmr_mfc_fops);
-#endif
+#ifdef CONFIG_IP_PIMSM_V2
+add_proto_fail:
+ unregister_netdevice_notifier(&ip_mr_notifier);
+#endif
+reg_notif_fail:
+ unregister_pernet_subsys(&ipmr_net_ops);
+reg_pernet_fail:
+ kmem_cache_destroy(mrt_cachep);
+ return err;
}
diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c
new file mode 100644
index 000000000000..28d77d454d44
--- /dev/null
+++ b/net/ipv4/ipmr_base.c
@@ -0,0 +1,446 @@
+/* Linux multicast routing support
+ * Common logic shared by IPv4 [ipmr] and IPv6 [ip6mr] implementation
+ */
+
+#include <linux/rhashtable.h>
+#include <linux/mroute_base.h>
+
+/* Sets everything common except 'dev', since that is done under locking */
+void vif_device_init(struct vif_device *v,
+ struct net_device *dev,
+ unsigned long rate_limit,
+ unsigned char threshold,
+ unsigned short flags,
+ unsigned short get_iflink_mask)
+{
+ RCU_INIT_POINTER(v->dev, NULL);
+ v->bytes_in = 0;
+ v->bytes_out = 0;
+ v->pkt_in = 0;
+ v->pkt_out = 0;
+ v->rate_limit = rate_limit;
+ v->flags = flags;
+ v->threshold = threshold;
+ if (v->flags & get_iflink_mask)
+ v->link = dev_get_iflink(dev);
+ else
+ v->link = dev->ifindex;
+}
+EXPORT_SYMBOL(vif_device_init);
+
+struct mr_table *
+mr_table_alloc(struct net *net, u32 id,
+ struct mr_table_ops *ops,
+ void (*expire_func)(struct timer_list *t),
+ void (*table_set)(struct mr_table *mrt,
+ struct net *net))
+{
+ struct mr_table *mrt;
+ int err;
+
+ mrt = kzalloc(sizeof(*mrt), GFP_KERNEL);
+ if (!mrt)
+ return ERR_PTR(-ENOMEM);
+ mrt->id = id;
+ write_pnet(&mrt->net, net);
+
+ mrt->ops = *ops;
+ err = rhltable_init(&mrt->mfc_hash, mrt->ops.rht_params);
+ if (err) {
+ kfree(mrt);
+ return ERR_PTR(err);
+ }
+ INIT_LIST_HEAD(&mrt->mfc_cache_list);
+ INIT_LIST_HEAD(&mrt->mfc_unres_queue);
+
+ timer_setup(&mrt->ipmr_expire_timer, expire_func, 0);
+
+ mrt->mroute_reg_vif_num = -1;
+ table_set(mrt, net);
+ return mrt;
+}
+EXPORT_SYMBOL(mr_table_alloc);
+
+void *mr_mfc_find_parent(struct mr_table *mrt, void *hasharg, int parent)
+{
+ struct rhlist_head *tmp, *list;
+ struct mr_mfc *c;
+
+ list = rhltable_lookup(&mrt->mfc_hash, hasharg, *mrt->ops.rht_params);
+ rhl_for_each_entry_rcu(c, tmp, list, mnode)
+ if (parent == -1 || parent == c->mfc_parent)
+ return c;
+
+ return NULL;
+}
+EXPORT_SYMBOL(mr_mfc_find_parent);
+
+void *mr_mfc_find_any_parent(struct mr_table *mrt, int vifi)
+{
+ struct rhlist_head *tmp, *list;
+ struct mr_mfc *c;
+
+ list = rhltable_lookup(&mrt->mfc_hash, mrt->ops.cmparg_any,
+ *mrt->ops.rht_params);
+ rhl_for_each_entry_rcu(c, tmp, list, mnode)
+ if (c->mfc_un.res.ttls[vifi] < 255)
+ return c;
+
+ return NULL;
+}
+EXPORT_SYMBOL(mr_mfc_find_any_parent);
+
+void *mr_mfc_find_any(struct mr_table *mrt, int vifi, void *hasharg)
+{
+ struct rhlist_head *tmp, *list;
+ struct mr_mfc *c, *proxy;
+
+ list = rhltable_lookup(&mrt->mfc_hash, hasharg, *mrt->ops.rht_params);
+ rhl_for_each_entry_rcu(c, tmp, list, mnode) {
+ if (c->mfc_un.res.ttls[vifi] < 255)
+ return c;
+
+ /* It's ok if the vifi is part of the static tree */
+ proxy = mr_mfc_find_any_parent(mrt, c->mfc_parent);
+ if (proxy && proxy->mfc_un.res.ttls[vifi] < 255)
+ return c;
+ }
+
+ return mr_mfc_find_any_parent(mrt, vifi);
+}
+EXPORT_SYMBOL(mr_mfc_find_any);
+
+#ifdef CONFIG_PROC_FS
+void *mr_vif_seq_idx(struct net *net, struct mr_vif_iter *iter, loff_t pos)
+{
+ struct mr_table *mrt = iter->mrt;
+
+ for (iter->ct = 0; iter->ct < mrt->maxvif; ++iter->ct) {
+ if (!VIF_EXISTS(mrt, iter->ct))
+ continue;
+ if (pos-- == 0)
+ return &mrt->vif_table[iter->ct];
+ }
+ return NULL;
+}
+EXPORT_SYMBOL(mr_vif_seq_idx);
+
+void *mr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct mr_vif_iter *iter = seq->private;
+ struct net *net = seq_file_net(seq);
+ struct mr_table *mrt = iter->mrt;
+
+ ++*pos;
+ if (v == SEQ_START_TOKEN)
+ return mr_vif_seq_idx(net, iter, 0);
+
+ while (++iter->ct < mrt->maxvif) {
+ if (!VIF_EXISTS(mrt, iter->ct))
+ continue;
+ return &mrt->vif_table[iter->ct];
+ }
+ return NULL;
+}
+EXPORT_SYMBOL(mr_vif_seq_next);
+
+void *mr_mfc_seq_idx(struct net *net,
+ struct mr_mfc_iter *it, loff_t pos)
+{
+ struct mr_table *mrt = it->mrt;
+ struct mr_mfc *mfc;
+
+ rcu_read_lock();
+ it->cache = &mrt->mfc_cache_list;
+ list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list)
+ if (pos-- == 0)
+ return mfc;
+ rcu_read_unlock();
+
+ spin_lock_bh(it->lock);
+ it->cache = &mrt->mfc_unres_queue;
+ list_for_each_entry(mfc, it->cache, list)
+ if (pos-- == 0)
+ return mfc;
+ spin_unlock_bh(it->lock);
+
+ it->cache = NULL;
+ return NULL;
+}
+EXPORT_SYMBOL(mr_mfc_seq_idx);
+
+void *mr_mfc_seq_next(struct seq_file *seq, void *v,
+ loff_t *pos)
+{
+ struct mr_mfc_iter *it = seq->private;
+ struct net *net = seq_file_net(seq);
+ struct mr_table *mrt = it->mrt;
+ struct mr_mfc *c = v;
+
+ ++*pos;
+
+ if (v == SEQ_START_TOKEN)
+ return mr_mfc_seq_idx(net, seq->private, 0);
+
+ if (c->list.next != it->cache)
+ return list_entry(c->list.next, struct mr_mfc, list);
+
+ if (it->cache == &mrt->mfc_unres_queue)
+ goto end_of_list;
+
+ /* exhausted cache_array, show unresolved */
+ rcu_read_unlock();
+ it->cache = &mrt->mfc_unres_queue;
+
+ spin_lock_bh(it->lock);
+ if (!list_empty(it->cache))
+ return list_first_entry(it->cache, struct mr_mfc, list);
+
+end_of_list:
+ spin_unlock_bh(it->lock);
+ it->cache = NULL;
+
+ return NULL;
+}
+EXPORT_SYMBOL(mr_mfc_seq_next);
+#endif
+
+int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
+ struct mr_mfc *c, struct rtmsg *rtm)
+{
+ struct net_device *vif_dev;
+ struct rta_mfc_stats mfcs;
+ struct nlattr *mp_attr;
+ struct rtnexthop *nhp;
+ unsigned long lastuse;
+ int ct;
+
+ /* If cache is unresolved, don't try to parse IIF and OIF */
+ if (c->mfc_parent >= MAXVIFS) {
+ rtm->rtm_flags |= RTNH_F_UNRESOLVED;
+ return -ENOENT;
+ }
+
+ rcu_read_lock();
+ vif_dev = rcu_dereference(mrt->vif_table[c->mfc_parent].dev);
+ if (vif_dev && nla_put_u32(skb, RTA_IIF, vif_dev->ifindex) < 0) {
+ rcu_read_unlock();
+ return -EMSGSIZE;
+ }
+ rcu_read_unlock();
+
+ if (c->mfc_flags & MFC_OFFLOAD)
+ rtm->rtm_flags |= RTNH_F_OFFLOAD;
+
+ mp_attr = nla_nest_start_noflag(skb, RTA_MULTIPATH);
+ if (!mp_attr)
+ return -EMSGSIZE;
+
+ rcu_read_lock();
+ for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
+ struct vif_device *vif = &mrt->vif_table[ct];
+
+ vif_dev = rcu_dereference(vif->dev);
+ if (vif_dev && c->mfc_un.res.ttls[ct] < 255) {
+
+ nhp = nla_reserve_nohdr(skb, sizeof(*nhp));
+ if (!nhp) {
+ rcu_read_unlock();
+ nla_nest_cancel(skb, mp_attr);
+ return -EMSGSIZE;
+ }
+
+ nhp->rtnh_flags = 0;
+ nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
+ nhp->rtnh_ifindex = vif_dev->ifindex;
+ nhp->rtnh_len = sizeof(*nhp);
+ }
+ }
+ rcu_read_unlock();
+
+ nla_nest_end(skb, mp_attr);
+
+ lastuse = READ_ONCE(c->mfc_un.res.lastuse);
+ lastuse = time_after_eq(jiffies, lastuse) ? jiffies - lastuse : 0;
+
+ mfcs.mfcs_packets = atomic_long_read(&c->mfc_un.res.pkt);
+ mfcs.mfcs_bytes = atomic_long_read(&c->mfc_un.res.bytes);
+ mfcs.mfcs_wrong_if = atomic_long_read(&c->mfc_un.res.wrong_if);
+ if (nla_put_64bit(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs, RTA_PAD) ||
+ nla_put_u64_64bit(skb, RTA_EXPIRES, jiffies_to_clock_t(lastuse),
+ RTA_PAD))
+ return -EMSGSIZE;
+
+ rtm->rtm_type = RTN_MULTICAST;
+ return 1;
+}
+EXPORT_SYMBOL(mr_fill_mroute);
+
+static bool mr_mfc_uses_dev(const struct mr_table *mrt,
+ const struct mr_mfc *c,
+ const struct net_device *dev)
+{
+ int ct;
+
+ for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
+ const struct net_device *vif_dev;
+ const struct vif_device *vif;
+
+ vif = &mrt->vif_table[ct];
+ vif_dev = rcu_access_pointer(vif->dev);
+ if (vif_dev && c->mfc_un.res.ttls[ct] < 255 &&
+ vif_dev == dev)
+ return true;
+ }
+ return false;
+}
+
+int mr_table_dump(struct mr_table *mrt, struct sk_buff *skb,
+ struct netlink_callback *cb,
+ int (*fill)(struct mr_table *mrt, struct sk_buff *skb,
+ u32 portid, u32 seq, struct mr_mfc *c,
+ int cmd, int flags),
+ spinlock_t *lock, struct fib_dump_filter *filter)
+{
+ unsigned int e = 0, s_e = cb->args[1];
+ unsigned int flags = NLM_F_MULTI;
+ struct mr_mfc *mfc;
+ int err;
+
+ if (filter->filter_set)
+ flags |= NLM_F_DUMP_FILTERED;
+
+ list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list,
+ lockdep_rtnl_is_held()) {
+ if (e < s_e)
+ goto next_entry;
+ if (filter->dev &&
+ !mr_mfc_uses_dev(mrt, mfc, filter->dev))
+ goto next_entry;
+
+ err = fill(mrt, skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, mfc, RTM_NEWROUTE, flags);
+ if (err < 0)
+ goto out;
+next_entry:
+ e++;
+ }
+
+ spin_lock_bh(lock);
+ list_for_each_entry(mfc, &mrt->mfc_unres_queue, list) {
+ if (e < s_e)
+ goto next_entry2;
+
+ err = fill(mrt, skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, mfc, RTM_NEWROUTE, flags);
+ if (err < 0) {
+ spin_unlock_bh(lock);
+ goto out;
+ }
+next_entry2:
+ e++;
+ }
+ spin_unlock_bh(lock);
+ err = 0;
+out:
+ cb->args[1] = e;
+ return err;
+}
+EXPORT_SYMBOL(mr_table_dump);
+
+int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb,
+ struct mr_table *(*iter)(struct net *net,
+ struct mr_table *mrt),
+ int (*fill)(struct mr_table *mrt,
+ struct sk_buff *skb,
+ u32 portid, u32 seq, struct mr_mfc *c,
+ int cmd, int flags),
+ spinlock_t *lock, struct fib_dump_filter *filter)
+{
+ unsigned int t = 0, s_t = cb->args[0];
+ struct net *net = sock_net(skb->sk);
+ struct mr_table *mrt;
+ int err;
+
+ /* multicast does not track protocol or have route type other
+ * than RTN_MULTICAST
+ */
+ if (filter->filter_set) {
+ if (filter->protocol || filter->flags ||
+ (filter->rt_type && filter->rt_type != RTN_MULTICAST))
+ return skb->len;
+ }
+
+ rcu_read_lock();
+ for (mrt = iter(net, NULL); mrt; mrt = iter(net, mrt)) {
+ if (t < s_t)
+ goto next_table;
+
+ err = mr_table_dump(mrt, skb, cb, fill, lock, filter);
+ if (err < 0)
+ break;
+ cb->args[1] = 0;
+next_table:
+ t++;
+ }
+ rcu_read_unlock();
+
+ cb->args[0] = t;
+
+ return skb->len;
+}
+EXPORT_SYMBOL(mr_rtm_dumproute);
+
+int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family,
+ int (*rules_dump)(struct net *net,
+ struct notifier_block *nb,
+ struct netlink_ext_ack *extack),
+ struct mr_table *(*mr_iter)(struct net *net,
+ struct mr_table *mrt),
+ struct netlink_ext_ack *extack)
+{
+ struct mr_table *mrt;
+ int err;
+
+ err = rules_dump(net, nb, extack);
+ if (err)
+ return err;
+
+ for (mrt = mr_iter(net, NULL); mrt; mrt = mr_iter(net, mrt)) {
+ struct vif_device *v = &mrt->vif_table[0];
+ struct net_device *vif_dev;
+ struct mr_mfc *mfc;
+ int vifi;
+
+ /* Notifiy on table VIF entries */
+ rcu_read_lock();
+ for (vifi = 0; vifi < mrt->maxvif; vifi++, v++) {
+ vif_dev = rcu_dereference(v->dev);
+ if (!vif_dev)
+ continue;
+
+ err = mr_call_vif_notifier(nb, family,
+ FIB_EVENT_VIF_ADD, v,
+ vif_dev, vifi,
+ mrt->id, extack);
+ if (err)
+ break;
+ }
+ rcu_read_unlock();
+
+ if (err)
+ return err;
+
+ /* Notify on table MFC entries */
+ list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) {
+ err = mr_call_mfc_notifier(nb, family,
+ FIB_EVENT_ENTRY_ADD,
+ mfc, mrt->id, extack);
+ if (err)
+ return err;
+ }
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(mr_dump);
diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c
deleted file mode 100644
index 8832eb517d52..000000000000
--- a/net/ipv4/ipvs/ip_vs_conn.c
+++ /dev/null
@@ -1,944 +0,0 @@
-/*
- * IPVS An implementation of the IP virtual server support for the
- * LINUX operating system. IPVS is now implemented as a module
- * over the Netfilter framework. IPVS can be used to build a
- * high-performance and highly available server based on a
- * cluster of servers.
- *
- * Version: $Id: ip_vs_conn.c,v 1.31 2003/04/18 09:03:16 wensong Exp $
- *
- * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
- * Peter Kese <peter.kese@ijs.si>
- * Julian Anastasov <ja@ssi.bg>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
- * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
- * and others. Many code here is taken from IP MASQ code of kernel 2.2.
- *
- * Changes:
- *
- */
-
-#include <linux/interrupt.h>
-#include <linux/in.h>
-#include <linux/net.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/vmalloc.h>
-#include <linux/proc_fs.h> /* for proc_net_* */
-#include <linux/seq_file.h>
-#include <linux/jhash.h>
-#include <linux/random.h>
-
-#include <net/ip_vs.h>
-
-
-/*
- * Connection hash table: for input and output packets lookups of IPVS
- */
-static struct list_head *ip_vs_conn_tab;
-
-/* SLAB cache for IPVS connections */
-static kmem_cache_t *ip_vs_conn_cachep __read_mostly;
-
-/* counter for current IPVS connections */
-static atomic_t ip_vs_conn_count = ATOMIC_INIT(0);
-
-/* counter for no client port connections */
-static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0);
-
-/* random value for IPVS connection hash */
-static unsigned int ip_vs_conn_rnd;
-
-/*
- * Fine locking granularity for big connection hash table
- */
-#define CT_LOCKARRAY_BITS 4
-#define CT_LOCKARRAY_SIZE (1<<CT_LOCKARRAY_BITS)
-#define CT_LOCKARRAY_MASK (CT_LOCKARRAY_SIZE-1)
-
-struct ip_vs_aligned_lock
-{
- rwlock_t l;
-} __attribute__((__aligned__(SMP_CACHE_BYTES)));
-
-/* lock array for conn table */
-static struct ip_vs_aligned_lock
-__ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned;
-
-static inline void ct_read_lock(unsigned key)
-{
- read_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
-}
-
-static inline void ct_read_unlock(unsigned key)
-{
- read_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
-}
-
-static inline void ct_write_lock(unsigned key)
-{
- write_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
-}
-
-static inline void ct_write_unlock(unsigned key)
-{
- write_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
-}
-
-static inline void ct_read_lock_bh(unsigned key)
-{
- read_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
-}
-
-static inline void ct_read_unlock_bh(unsigned key)
-{
- read_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
-}
-
-static inline void ct_write_lock_bh(unsigned key)
-{
- write_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
-}
-
-static inline void ct_write_unlock_bh(unsigned key)
-{
- write_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
-}
-
-
-/*
- * Returns hash value for IPVS connection entry
- */
-static unsigned int ip_vs_conn_hashkey(unsigned proto, __be32 addr, __be16 port)
-{
- return jhash_3words((__force u32)addr, (__force u32)port, proto, ip_vs_conn_rnd)
- & IP_VS_CONN_TAB_MASK;
-}
-
-
-/*
- * Hashes ip_vs_conn in ip_vs_conn_tab by proto,addr,port.
- * returns bool success.
- */
-static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
-{
- unsigned hash;
- int ret;
-
- /* Hash by protocol, client address and port */
- hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport);
-
- ct_write_lock(hash);
-
- if (!(cp->flags & IP_VS_CONN_F_HASHED)) {
- list_add(&cp->c_list, &ip_vs_conn_tab[hash]);
- cp->flags |= IP_VS_CONN_F_HASHED;
- atomic_inc(&cp->refcnt);
- ret = 1;
- } else {
- IP_VS_ERR("ip_vs_conn_hash(): request for already hashed, "
- "called from %p\n", __builtin_return_address(0));
- ret = 0;
- }
-
- ct_write_unlock(hash);
-
- return ret;
-}
-
-
-/*
- * UNhashes ip_vs_conn from ip_vs_conn_tab.
- * returns bool success.
- */
-static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
-{
- unsigned hash;
- int ret;
-
- /* unhash it and decrease its reference counter */
- hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport);
-
- ct_write_lock(hash);
-
- if (cp->flags & IP_VS_CONN_F_HASHED) {
- list_del(&cp->c_list);
- cp->flags &= ~IP_VS_CONN_F_HASHED;
- atomic_dec(&cp->refcnt);
- ret = 1;
- } else
- ret = 0;
-
- ct_write_unlock(hash);
-
- return ret;
-}
-
-
-/*
- * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
- * Called for pkts coming from OUTside-to-INside.
- * s_addr, s_port: pkt source address (foreign host)
- * d_addr, d_port: pkt dest address (load balancer)
- */
-static inline struct ip_vs_conn *__ip_vs_conn_in_get
-(int protocol, __be32 s_addr, __be16 s_port, __be32 d_addr, __be16 d_port)
-{
- unsigned hash;
- struct ip_vs_conn *cp;
-
- hash = ip_vs_conn_hashkey(protocol, s_addr, s_port);
-
- ct_read_lock(hash);
-
- list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
- if (s_addr==cp->caddr && s_port==cp->cport &&
- d_port==cp->vport && d_addr==cp->vaddr &&
- ((!s_port) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) &&
- protocol==cp->protocol) {
- /* HIT */
- atomic_inc(&cp->refcnt);
- ct_read_unlock(hash);
- return cp;
- }
- }
-
- ct_read_unlock(hash);
-
- return NULL;
-}
-
-struct ip_vs_conn *ip_vs_conn_in_get
-(int protocol, __be32 s_addr, __be16 s_port, __be32 d_addr, __be16 d_port)
-{
- struct ip_vs_conn *cp;
-
- cp = __ip_vs_conn_in_get(protocol, s_addr, s_port, d_addr, d_port);
- if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt))
- cp = __ip_vs_conn_in_get(protocol, s_addr, 0, d_addr, d_port);
-
- IP_VS_DBG(9, "lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
- ip_vs_proto_name(protocol),
- NIPQUAD(s_addr), ntohs(s_port),
- NIPQUAD(d_addr), ntohs(d_port),
- cp?"hit":"not hit");
-
- return cp;
-}
-
-/* Get reference to connection template */
-struct ip_vs_conn *ip_vs_ct_in_get
-(int protocol, __be32 s_addr, __be16 s_port, __be32 d_addr, __be16 d_port)
-{
- unsigned hash;
- struct ip_vs_conn *cp;
-
- hash = ip_vs_conn_hashkey(protocol, s_addr, s_port);
-
- ct_read_lock(hash);
-
- list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
- if (s_addr==cp->caddr && s_port==cp->cport &&
- d_port==cp->vport && d_addr==cp->vaddr &&
- cp->flags & IP_VS_CONN_F_TEMPLATE &&
- protocol==cp->protocol) {
- /* HIT */
- atomic_inc(&cp->refcnt);
- goto out;
- }
- }
- cp = NULL;
-
- out:
- ct_read_unlock(hash);
-
- IP_VS_DBG(9, "template lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
- ip_vs_proto_name(protocol),
- NIPQUAD(s_addr), ntohs(s_port),
- NIPQUAD(d_addr), ntohs(d_port),
- cp?"hit":"not hit");
-
- return cp;
-}
-
-/*
- * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
- * Called for pkts coming from inside-to-OUTside.
- * s_addr, s_port: pkt source address (inside host)
- * d_addr, d_port: pkt dest address (foreign host)
- */
-struct ip_vs_conn *ip_vs_conn_out_get
-(int protocol, __be32 s_addr, __be16 s_port, __be32 d_addr, __be16 d_port)
-{
- unsigned hash;
- struct ip_vs_conn *cp, *ret=NULL;
-
- /*
- * Check for "full" addressed entries
- */
- hash = ip_vs_conn_hashkey(protocol, d_addr, d_port);
-
- ct_read_lock(hash);
-
- list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
- if (d_addr == cp->caddr && d_port == cp->cport &&
- s_port == cp->dport && s_addr == cp->daddr &&
- protocol == cp->protocol) {
- /* HIT */
- atomic_inc(&cp->refcnt);
- ret = cp;
- break;
- }
- }
-
- ct_read_unlock(hash);
-
- IP_VS_DBG(9, "lookup/out %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
- ip_vs_proto_name(protocol),
- NIPQUAD(s_addr), ntohs(s_port),
- NIPQUAD(d_addr), ntohs(d_port),
- ret?"hit":"not hit");
-
- return ret;
-}
-
-
-/*
- * Put back the conn and restart its timer with its timeout
- */
-void ip_vs_conn_put(struct ip_vs_conn *cp)
-{
- /* reset it expire in its timeout */
- mod_timer(&cp->timer, jiffies+cp->timeout);
-
- __ip_vs_conn_put(cp);
-}
-
-
-/*
- * Fill a no_client_port connection with a client port number
- */
-void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport)
-{
- if (ip_vs_conn_unhash(cp)) {
- spin_lock(&cp->lock);
- if (cp->flags & IP_VS_CONN_F_NO_CPORT) {
- atomic_dec(&ip_vs_conn_no_cport_cnt);
- cp->flags &= ~IP_VS_CONN_F_NO_CPORT;
- cp->cport = cport;
- }
- spin_unlock(&cp->lock);
-
- /* hash on new dport */
- ip_vs_conn_hash(cp);
- }
-}
-
-
-/*
- * Bind a connection entry with the corresponding packet_xmit.
- * Called by ip_vs_conn_new.
- */
-static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp)
-{
- switch (IP_VS_FWD_METHOD(cp)) {
- case IP_VS_CONN_F_MASQ:
- cp->packet_xmit = ip_vs_nat_xmit;
- break;
-
- case IP_VS_CONN_F_TUNNEL:
- cp->packet_xmit = ip_vs_tunnel_xmit;
- break;
-
- case IP_VS_CONN_F_DROUTE:
- cp->packet_xmit = ip_vs_dr_xmit;
- break;
-
- case IP_VS_CONN_F_LOCALNODE:
- cp->packet_xmit = ip_vs_null_xmit;
- break;
-
- case IP_VS_CONN_F_BYPASS:
- cp->packet_xmit = ip_vs_bypass_xmit;
- break;
- }
-}
-
-
-static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest)
-{
- return atomic_read(&dest->activeconns)
- + atomic_read(&dest->inactconns);
-}
-
-/*
- * Bind a connection entry with a virtual service destination
- * Called just after a new connection entry is created.
- */
-static inline void
-ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
-{
- /* if dest is NULL, then return directly */
- if (!dest)
- return;
-
- /* Increase the refcnt counter of the dest */
- atomic_inc(&dest->refcnt);
-
- /* Bind with the destination and its corresponding transmitter */
- cp->flags |= atomic_read(&dest->conn_flags);
- cp->dest = dest;
-
- IP_VS_DBG(7, "Bind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
- "d:%u.%u.%u.%u:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d "
- "dest->refcnt:%d\n",
- ip_vs_proto_name(cp->protocol),
- NIPQUAD(cp->caddr), ntohs(cp->cport),
- NIPQUAD(cp->vaddr), ntohs(cp->vport),
- NIPQUAD(cp->daddr), ntohs(cp->dport),
- ip_vs_fwd_tag(cp), cp->state,
- cp->flags, atomic_read(&cp->refcnt),
- atomic_read(&dest->refcnt));
-
- /* Update the connection counters */
- if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) {
- /* It is a normal connection, so increase the inactive
- connection counter because it is in TCP SYNRECV
- state (inactive) or other protocol inacive state */
- atomic_inc(&dest->inactconns);
- } else {
- /* It is a persistent connection/template, so increase
- the peristent connection counter */
- atomic_inc(&dest->persistconns);
- }
-
- if (dest->u_threshold != 0 &&
- ip_vs_dest_totalconns(dest) >= dest->u_threshold)
- dest->flags |= IP_VS_DEST_F_OVERLOAD;
-}
-
-
-/*
- * Unbind a connection entry with its VS destination
- * Called by the ip_vs_conn_expire function.
- */
-static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp)
-{
- struct ip_vs_dest *dest = cp->dest;
-
- if (!dest)
- return;
-
- IP_VS_DBG(7, "Unbind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
- "d:%u.%u.%u.%u:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d "
- "dest->refcnt:%d\n",
- ip_vs_proto_name(cp->protocol),
- NIPQUAD(cp->caddr), ntohs(cp->cport),
- NIPQUAD(cp->vaddr), ntohs(cp->vport),
- NIPQUAD(cp->daddr), ntohs(cp->dport),
- ip_vs_fwd_tag(cp), cp->state,
- cp->flags, atomic_read(&cp->refcnt),
- atomic_read(&dest->refcnt));
-
- /* Update the connection counters */
- if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) {
- /* It is a normal connection, so decrease the inactconns
- or activeconns counter */
- if (cp->flags & IP_VS_CONN_F_INACTIVE) {
- atomic_dec(&dest->inactconns);
- } else {
- atomic_dec(&dest->activeconns);
- }
- } else {
- /* It is a persistent connection/template, so decrease
- the peristent connection counter */
- atomic_dec(&dest->persistconns);
- }
-
- if (dest->l_threshold != 0) {
- if (ip_vs_dest_totalconns(dest) < dest->l_threshold)
- dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
- } else if (dest->u_threshold != 0) {
- if (ip_vs_dest_totalconns(dest) * 4 < dest->u_threshold * 3)
- dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
- } else {
- if (dest->flags & IP_VS_DEST_F_OVERLOAD)
- dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
- }
-
- /*
- * Simply decrease the refcnt of the dest, because the
- * dest will be either in service's destination list
- * or in the trash.
- */
- atomic_dec(&dest->refcnt);
-}
-
-
-/*
- * Checking if the destination of a connection template is available.
- * If available, return 1, otherwise invalidate this connection
- * template and return 0.
- */
-int ip_vs_check_template(struct ip_vs_conn *ct)
-{
- struct ip_vs_dest *dest = ct->dest;
-
- /*
- * Checking the dest server status.
- */
- if ((dest == NULL) ||
- !(dest->flags & IP_VS_DEST_F_AVAILABLE) ||
- (sysctl_ip_vs_expire_quiescent_template &&
- (atomic_read(&dest->weight) == 0))) {
- IP_VS_DBG(9, "check_template: dest not available for "
- "protocol %s s:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
- "-> d:%u.%u.%u.%u:%d\n",
- ip_vs_proto_name(ct->protocol),
- NIPQUAD(ct->caddr), ntohs(ct->cport),
- NIPQUAD(ct->vaddr), ntohs(ct->vport),
- NIPQUAD(ct->daddr), ntohs(ct->dport));
-
- /*
- * Invalidate the connection template
- */
- if (ct->vport != htons(0xffff)) {
- if (ip_vs_conn_unhash(ct)) {
- ct->dport = htons(0xffff);
- ct->vport = htons(0xffff);
- ct->cport = 0;
- ip_vs_conn_hash(ct);
- }
- }
-
- /*
- * Simply decrease the refcnt of the template,
- * don't restart its timer.
- */
- atomic_dec(&ct->refcnt);
- return 0;
- }
- return 1;
-}
-
-static void ip_vs_conn_expire(unsigned long data)
-{
- struct ip_vs_conn *cp = (struct ip_vs_conn *)data;
-
- cp->timeout = 60*HZ;
-
- /*
- * hey, I'm using it
- */
- atomic_inc(&cp->refcnt);
-
- /*
- * do I control anybody?
- */
- if (atomic_read(&cp->n_control))
- goto expire_later;
-
- /*
- * unhash it if it is hashed in the conn table
- */
- if (!ip_vs_conn_unhash(cp))
- goto expire_later;
-
- /*
- * refcnt==1 implies I'm the only one referrer
- */
- if (likely(atomic_read(&cp->refcnt) == 1)) {
- /* delete the timer if it is activated by other users */
- if (timer_pending(&cp->timer))
- del_timer(&cp->timer);
-
- /* does anybody control me? */
- if (cp->control)
- ip_vs_control_del(cp);
-
- if (unlikely(cp->app != NULL))
- ip_vs_unbind_app(cp);
- ip_vs_unbind_dest(cp);
- if (cp->flags & IP_VS_CONN_F_NO_CPORT)
- atomic_dec(&ip_vs_conn_no_cport_cnt);
- atomic_dec(&ip_vs_conn_count);
-
- kmem_cache_free(ip_vs_conn_cachep, cp);
- return;
- }
-
- /* hash it back to the table */
- ip_vs_conn_hash(cp);
-
- expire_later:
- IP_VS_DBG(7, "delayed: conn->refcnt-1=%d conn->n_control=%d\n",
- atomic_read(&cp->refcnt)-1,
- atomic_read(&cp->n_control));
-
- ip_vs_conn_put(cp);
-}
-
-
-void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
-{
- if (del_timer(&cp->timer))
- mod_timer(&cp->timer, jiffies);
-}
-
-
-/*
- * Create a new connection entry and hash it into the ip_vs_conn_tab
- */
-struct ip_vs_conn *
-ip_vs_conn_new(int proto, __be32 caddr, __be16 cport, __be32 vaddr, __be16 vport,
- __be32 daddr, __be16 dport, unsigned flags,
- struct ip_vs_dest *dest)
-{
- struct ip_vs_conn *cp;
- struct ip_vs_protocol *pp = ip_vs_proto_get(proto);
-
- cp = kmem_cache_alloc(ip_vs_conn_cachep, GFP_ATOMIC);
- if (cp == NULL) {
- IP_VS_ERR_RL("ip_vs_conn_new: no memory available.\n");
- return NULL;
- }
-
- memset(cp, 0, sizeof(*cp));
- INIT_LIST_HEAD(&cp->c_list);
- init_timer(&cp->timer);
- cp->timer.data = (unsigned long)cp;
- cp->timer.function = ip_vs_conn_expire;
- cp->protocol = proto;
- cp->caddr = caddr;
- cp->cport = cport;
- cp->vaddr = vaddr;
- cp->vport = vport;
- cp->daddr = daddr;
- cp->dport = dport;
- cp->flags = flags;
- spin_lock_init(&cp->lock);
-
- /*
- * Set the entry is referenced by the current thread before hashing
- * it in the table, so that other thread run ip_vs_random_dropentry
- * but cannot drop this entry.
- */
- atomic_set(&cp->refcnt, 1);
-
- atomic_set(&cp->n_control, 0);
- atomic_set(&cp->in_pkts, 0);
-
- atomic_inc(&ip_vs_conn_count);
- if (flags & IP_VS_CONN_F_NO_CPORT)
- atomic_inc(&ip_vs_conn_no_cport_cnt);
-
- /* Bind the connection with a destination server */
- ip_vs_bind_dest(cp, dest);
-
- /* Set its state and timeout */
- cp->state = 0;
- cp->timeout = 3*HZ;
-
- /* Bind its packet transmitter */
- ip_vs_bind_xmit(cp);
-
- if (unlikely(pp && atomic_read(&pp->appcnt)))
- ip_vs_bind_app(cp, pp);
-
- /* Hash it in the ip_vs_conn_tab finally */
- ip_vs_conn_hash(cp);
-
- return cp;
-}
-
-
-/*
- * /proc/net/ip_vs_conn entries
- */
-#ifdef CONFIG_PROC_FS
-
-static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos)
-{
- int idx;
- struct ip_vs_conn *cp;
-
- for(idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) {
- ct_read_lock_bh(idx);
- list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
- if (pos-- == 0) {
- seq->private = &ip_vs_conn_tab[idx];
- return cp;
- }
- }
- ct_read_unlock_bh(idx);
- }
-
- return NULL;
-}
-
-static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos)
-{
- seq->private = NULL;
- return *pos ? ip_vs_conn_array(seq, *pos - 1) :SEQ_START_TOKEN;
-}
-
-static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
-{
- struct ip_vs_conn *cp = v;
- struct list_head *e, *l = seq->private;
- int idx;
-
- ++*pos;
- if (v == SEQ_START_TOKEN)
- return ip_vs_conn_array(seq, 0);
-
- /* more on same hash chain? */
- if ((e = cp->c_list.next) != l)
- return list_entry(e, struct ip_vs_conn, c_list);
-
- idx = l - ip_vs_conn_tab;
- ct_read_unlock_bh(idx);
-
- while (++idx < IP_VS_CONN_TAB_SIZE) {
- ct_read_lock_bh(idx);
- list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
- seq->private = &ip_vs_conn_tab[idx];
- return cp;
- }
- ct_read_unlock_bh(idx);
- }
- seq->private = NULL;
- return NULL;
-}
-
-static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v)
-{
- struct list_head *l = seq->private;
-
- if (l)
- ct_read_unlock_bh(l - ip_vs_conn_tab);
-}
-
-static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
-{
-
- if (v == SEQ_START_TOKEN)
- seq_puts(seq,
- "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires\n");
- else {
- const struct ip_vs_conn *cp = v;
-
- seq_printf(seq,
- "%-3s %08X %04X %08X %04X %08X %04X %-11s %7lu\n",
- ip_vs_proto_name(cp->protocol),
- ntohl(cp->caddr), ntohs(cp->cport),
- ntohl(cp->vaddr), ntohs(cp->vport),
- ntohl(cp->daddr), ntohs(cp->dport),
- ip_vs_state_name(cp->protocol, cp->state),
- (cp->timer.expires-jiffies)/HZ);
- }
- return 0;
-}
-
-static struct seq_operations ip_vs_conn_seq_ops = {
- .start = ip_vs_conn_seq_start,
- .next = ip_vs_conn_seq_next,
- .stop = ip_vs_conn_seq_stop,
- .show = ip_vs_conn_seq_show,
-};
-
-static int ip_vs_conn_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &ip_vs_conn_seq_ops);
-}
-
-static struct file_operations ip_vs_conn_fops = {
- .owner = THIS_MODULE,
- .open = ip_vs_conn_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-#endif
-
-
-/*
- * Randomly drop connection entries before running out of memory
- */
-static inline int todrop_entry(struct ip_vs_conn *cp)
-{
- /*
- * The drop rate array needs tuning for real environments.
- * Called from timer bh only => no locking
- */
- static const char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8};
- static char todrop_counter[9] = {0};
- int i;
-
- /* if the conn entry hasn't lasted for 60 seconds, don't drop it.
- This will leave enough time for normal connection to get
- through. */
- if (time_before(cp->timeout + jiffies, cp->timer.expires + 60*HZ))
- return 0;
-
- /* Don't drop the entry if its number of incoming packets is not
- located in [0, 8] */
- i = atomic_read(&cp->in_pkts);
- if (i > 8 || i < 0) return 0;
-
- if (!todrop_rate[i]) return 0;
- if (--todrop_counter[i] > 0) return 0;
-
- todrop_counter[i] = todrop_rate[i];
- return 1;
-}
-
-/* Called from keventd and must protect itself from softirqs */
-void ip_vs_random_dropentry(void)
-{
- int idx;
- struct ip_vs_conn *cp;
-
- /*
- * Randomly scan 1/32 of the whole table every second
- */
- for (idx = 0; idx < (IP_VS_CONN_TAB_SIZE>>5); idx++) {
- unsigned hash = net_random() & IP_VS_CONN_TAB_MASK;
-
- /*
- * Lock is actually needed in this loop.
- */
- ct_write_lock_bh(hash);
-
- list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
- if (cp->flags & IP_VS_CONN_F_TEMPLATE)
- /* connection template */
- continue;
-
- if (cp->protocol == IPPROTO_TCP) {
- switch(cp->state) {
- case IP_VS_TCP_S_SYN_RECV:
- case IP_VS_TCP_S_SYNACK:
- break;
-
- case IP_VS_TCP_S_ESTABLISHED:
- if (todrop_entry(cp))
- break;
- continue;
-
- default:
- continue;
- }
- } else {
- if (!todrop_entry(cp))
- continue;
- }
-
- IP_VS_DBG(4, "del connection\n");
- ip_vs_conn_expire_now(cp);
- if (cp->control) {
- IP_VS_DBG(4, "del conn template\n");
- ip_vs_conn_expire_now(cp->control);
- }
- }
- ct_write_unlock_bh(hash);
- }
-}
-
-
-/*
- * Flush all the connection entries in the ip_vs_conn_tab
- */
-static void ip_vs_conn_flush(void)
-{
- int idx;
- struct ip_vs_conn *cp;
-
- flush_again:
- for (idx=0; idx<IP_VS_CONN_TAB_SIZE; idx++) {
- /*
- * Lock is actually needed in this loop.
- */
- ct_write_lock_bh(idx);
-
- list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
-
- IP_VS_DBG(4, "del connection\n");
- ip_vs_conn_expire_now(cp);
- if (cp->control) {
- IP_VS_DBG(4, "del conn template\n");
- ip_vs_conn_expire_now(cp->control);
- }
- }
- ct_write_unlock_bh(idx);
- }
-
- /* the counter may be not NULL, because maybe some conn entries
- are run by slow timer handler or unhashed but still referred */
- if (atomic_read(&ip_vs_conn_count) != 0) {
- schedule();
- goto flush_again;
- }
-}
-
-
-int ip_vs_conn_init(void)
-{
- int idx;
-
- /*
- * Allocate the connection hash table and initialize its list heads
- */
- ip_vs_conn_tab = vmalloc(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head));
- if (!ip_vs_conn_tab)
- return -ENOMEM;
-
- /* Allocate ip_vs_conn slab cache */
- ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn",
- sizeof(struct ip_vs_conn), 0,
- SLAB_HWCACHE_ALIGN, NULL, NULL);
- if (!ip_vs_conn_cachep) {
- vfree(ip_vs_conn_tab);
- return -ENOMEM;
- }
-
- IP_VS_INFO("Connection hash table configured "
- "(size=%d, memory=%ldKbytes)\n",
- IP_VS_CONN_TAB_SIZE,
- (long)(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head))/1024);
- IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n",
- sizeof(struct ip_vs_conn));
-
- for (idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) {
- INIT_LIST_HEAD(&ip_vs_conn_tab[idx]);
- }
-
- for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) {
- rwlock_init(&__ip_vs_conntbl_lock_array[idx].l);
- }
-
- proc_net_fops_create("ip_vs_conn", 0, &ip_vs_conn_fops);
-
- /* calculate the random value for connection hash */
- get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd));
-
- return 0;
-}
-
-
-void ip_vs_conn_cleanup(void)
-{
- /* flush all the connection entries first */
- ip_vs_conn_flush();
-
- /* Release the empty cache */
- kmem_cache_destroy(ip_vs_conn_cachep);
- proc_net_remove("ip_vs_conn");
- vfree(ip_vs_conn_tab);
-}
diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c
deleted file mode 100644
index 34257520a3a6..000000000000
--- a/net/ipv4/ipvs/ip_vs_core.c
+++ /dev/null
@@ -1,1198 +0,0 @@
-/*
- * IPVS An implementation of the IP virtual server support for the
- * LINUX operating system. IPVS is now implemented as a module
- * over the Netfilter framework. IPVS can be used to build a
- * high-performance and highly available server based on a
- * cluster of servers.
- *
- * Version: $Id: ip_vs_core.c,v 1.34 2003/05/10 03:05:23 wensong Exp $
- *
- * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
- * Peter Kese <peter.kese@ijs.si>
- * Julian Anastasov <ja@ssi.bg>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
- * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
- * and others.
- *
- * Changes:
- * Paul `Rusty' Russell properly handle non-linear skbs
- * Harald Welte don't use nfcache
- *
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/ip.h>
-#include <linux/tcp.h>
-#include <linux/icmp.h>
-
-#include <net/ip.h>
-#include <net/tcp.h>
-#include <net/udp.h>
-#include <net/icmp.h> /* for icmp_send */
-#include <net/route.h>
-
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv4.h>
-
-#include <net/ip_vs.h>
-
-
-EXPORT_SYMBOL(register_ip_vs_scheduler);
-EXPORT_SYMBOL(unregister_ip_vs_scheduler);
-EXPORT_SYMBOL(ip_vs_skb_replace);
-EXPORT_SYMBOL(ip_vs_proto_name);
-EXPORT_SYMBOL(ip_vs_conn_new);
-EXPORT_SYMBOL(ip_vs_conn_in_get);
-EXPORT_SYMBOL(ip_vs_conn_out_get);
-#ifdef CONFIG_IP_VS_PROTO_TCP
-EXPORT_SYMBOL(ip_vs_tcp_conn_listen);
-#endif
-EXPORT_SYMBOL(ip_vs_conn_put);
-#ifdef CONFIG_IP_VS_DEBUG
-EXPORT_SYMBOL(ip_vs_get_debug_level);
-#endif
-EXPORT_SYMBOL(ip_vs_make_skb_writable);
-
-
-/* ID used in ICMP lookups */
-#define icmp_id(icmph) (((icmph)->un).echo.id)
-
-const char *ip_vs_proto_name(unsigned proto)
-{
- static char buf[20];
-
- switch (proto) {
- case IPPROTO_IP:
- return "IP";
- case IPPROTO_UDP:
- return "UDP";
- case IPPROTO_TCP:
- return "TCP";
- case IPPROTO_ICMP:
- return "ICMP";
- default:
- sprintf(buf, "IP_%d", proto);
- return buf;
- }
-}
-
-void ip_vs_init_hash_table(struct list_head *table, int rows)
-{
- while (--rows >= 0)
- INIT_LIST_HEAD(&table[rows]);
-}
-
-static inline void
-ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
-{
- struct ip_vs_dest *dest = cp->dest;
- if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
- spin_lock(&dest->stats.lock);
- dest->stats.inpkts++;
- dest->stats.inbytes += skb->len;
- spin_unlock(&dest->stats.lock);
-
- spin_lock(&dest->svc->stats.lock);
- dest->svc->stats.inpkts++;
- dest->svc->stats.inbytes += skb->len;
- spin_unlock(&dest->svc->stats.lock);
-
- spin_lock(&ip_vs_stats.lock);
- ip_vs_stats.inpkts++;
- ip_vs_stats.inbytes += skb->len;
- spin_unlock(&ip_vs_stats.lock);
- }
-}
-
-
-static inline void
-ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
-{
- struct ip_vs_dest *dest = cp->dest;
- if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
- spin_lock(&dest->stats.lock);
- dest->stats.outpkts++;
- dest->stats.outbytes += skb->len;
- spin_unlock(&dest->stats.lock);
-
- spin_lock(&dest->svc->stats.lock);
- dest->svc->stats.outpkts++;
- dest->svc->stats.outbytes += skb->len;
- spin_unlock(&dest->svc->stats.lock);
-
- spin_lock(&ip_vs_stats.lock);
- ip_vs_stats.outpkts++;
- ip_vs_stats.outbytes += skb->len;
- spin_unlock(&ip_vs_stats.lock);
- }
-}
-
-
-static inline void
-ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
-{
- spin_lock(&cp->dest->stats.lock);
- cp->dest->stats.conns++;
- spin_unlock(&cp->dest->stats.lock);
-
- spin_lock(&svc->stats.lock);
- svc->stats.conns++;
- spin_unlock(&svc->stats.lock);
-
- spin_lock(&ip_vs_stats.lock);
- ip_vs_stats.conns++;
- spin_unlock(&ip_vs_stats.lock);
-}
-
-
-static inline int
-ip_vs_set_state(struct ip_vs_conn *cp, int direction,
- const struct sk_buff *skb,
- struct ip_vs_protocol *pp)
-{
- if (unlikely(!pp->state_transition))
- return 0;
- return pp->state_transition(cp, direction, skb, pp);
-}
-
-
-int ip_vs_make_skb_writable(struct sk_buff **pskb, int writable_len)
-{
- struct sk_buff *skb = *pskb;
-
- /* skb is already used, better copy skb and its payload */
- if (unlikely(skb_shared(skb) || skb->sk))
- goto copy_skb;
-
- /* skb data is already used, copy it */
- if (unlikely(skb_cloned(skb)))
- goto copy_data;
-
- return pskb_may_pull(skb, writable_len);
-
- copy_data:
- if (unlikely(writable_len > skb->len))
- return 0;
- return !pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
-
- copy_skb:
- if (unlikely(writable_len > skb->len))
- return 0;
- skb = skb_copy(skb, GFP_ATOMIC);
- if (!skb)
- return 0;
- BUG_ON(skb_is_nonlinear(skb));
-
- /* Rest of kernel will get very unhappy if we pass it a
- suddenly-orphaned skbuff */
- if ((*pskb)->sk)
- skb_set_owner_w(skb, (*pskb)->sk);
- kfree_skb(*pskb);
- *pskb = skb;
- return 1;
-}
-
-/*
- * IPVS persistent scheduling function
- * It creates a connection entry according to its template if exists,
- * or selects a server and creates a connection entry plus a template.
- * Locking: we are svc user (svc->refcnt), so we hold all dests too
- * Protocols supported: TCP, UDP
- */
-static struct ip_vs_conn *
-ip_vs_sched_persist(struct ip_vs_service *svc,
- const struct sk_buff *skb,
- __be16 ports[2])
-{
- struct ip_vs_conn *cp = NULL;
- struct iphdr *iph = skb->nh.iph;
- struct ip_vs_dest *dest;
- struct ip_vs_conn *ct;
- __be16 dport; /* destination port to forward */
- __be32 snet; /* source network of the client, after masking */
-
- /* Mask saddr with the netmask to adjust template granularity */
- snet = iph->saddr & svc->netmask;
-
- IP_VS_DBG(6, "p-schedule: src %u.%u.%u.%u:%u dest %u.%u.%u.%u:%u "
- "mnet %u.%u.%u.%u\n",
- NIPQUAD(iph->saddr), ntohs(ports[0]),
- NIPQUAD(iph->daddr), ntohs(ports[1]),
- NIPQUAD(snet));
-
- /*
- * As far as we know, FTP is a very complicated network protocol, and
- * it uses control connection and data connections. For active FTP,
- * FTP server initialize data connection to the client, its source port
- * is often 20. For passive FTP, FTP server tells the clients the port
- * that it passively listens to, and the client issues the data
- * connection. In the tunneling or direct routing mode, the load
- * balancer is on the client-to-server half of connection, the port
- * number is unknown to the load balancer. So, a conn template like
- * <caddr, 0, vaddr, 0, daddr, 0> is created for persistent FTP
- * service, and a template like <caddr, 0, vaddr, vport, daddr, dport>
- * is created for other persistent services.
- */
- if (ports[1] == svc->port) {
- /* Check if a template already exists */
- if (svc->port != FTPPORT)
- ct = ip_vs_ct_in_get(iph->protocol, snet, 0,
- iph->daddr, ports[1]);
- else
- ct = ip_vs_ct_in_get(iph->protocol, snet, 0,
- iph->daddr, 0);
-
- if (!ct || !ip_vs_check_template(ct)) {
- /*
- * No template found or the dest of the connection
- * template is not available.
- */
- dest = svc->scheduler->schedule(svc, skb);
- if (dest == NULL) {
- IP_VS_DBG(1, "p-schedule: no dest found.\n");
- return NULL;
- }
-
- /*
- * Create a template like <protocol,caddr,0,
- * vaddr,vport,daddr,dport> for non-ftp service,
- * and <protocol,caddr,0,vaddr,0,daddr,0>
- * for ftp service.
- */
- if (svc->port != FTPPORT)
- ct = ip_vs_conn_new(iph->protocol,
- snet, 0,
- iph->daddr,
- ports[1],
- dest->addr, dest->port,
- IP_VS_CONN_F_TEMPLATE,
- dest);
- else
- ct = ip_vs_conn_new(iph->protocol,
- snet, 0,
- iph->daddr, 0,
- dest->addr, 0,
- IP_VS_CONN_F_TEMPLATE,
- dest);
- if (ct == NULL)
- return NULL;
-
- ct->timeout = svc->timeout;
- } else {
- /* set destination with the found template */
- dest = ct->dest;
- }
- dport = dest->port;
- } else {
- /*
- * Note: persistent fwmark-based services and persistent
- * port zero service are handled here.
- * fwmark template: <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
- * port zero template: <protocol,caddr,0,vaddr,0,daddr,0>
- */
- if (svc->fwmark)
- ct = ip_vs_ct_in_get(IPPROTO_IP, snet, 0,
- htonl(svc->fwmark), 0);
- else
- ct = ip_vs_ct_in_get(iph->protocol, snet, 0,
- iph->daddr, 0);
-
- if (!ct || !ip_vs_check_template(ct)) {
- /*
- * If it is not persistent port zero, return NULL,
- * otherwise create a connection template.
- */
- if (svc->port)
- return NULL;
-
- dest = svc->scheduler->schedule(svc, skb);
- if (dest == NULL) {
- IP_VS_DBG(1, "p-schedule: no dest found.\n");
- return NULL;
- }
-
- /*
- * Create a template according to the service
- */
- if (svc->fwmark)
- ct = ip_vs_conn_new(IPPROTO_IP,
- snet, 0,
- htonl(svc->fwmark), 0,
- dest->addr, 0,
- IP_VS_CONN_F_TEMPLATE,
- dest);
- else
- ct = ip_vs_conn_new(iph->protocol,
- snet, 0,
- iph->daddr, 0,
- dest->addr, 0,
- IP_VS_CONN_F_TEMPLATE,
- dest);
- if (ct == NULL)
- return NULL;
-
- ct->timeout = svc->timeout;
- } else {
- /* set destination with the found template */
- dest = ct->dest;
- }
- dport = ports[1];
- }
-
- /*
- * Create a new connection according to the template
- */
- cp = ip_vs_conn_new(iph->protocol,
- iph->saddr, ports[0],
- iph->daddr, ports[1],
- dest->addr, dport,
- 0,
- dest);
- if (cp == NULL) {
- ip_vs_conn_put(ct);
- return NULL;
- }
-
- /*
- * Add its control
- */
- ip_vs_control_add(cp, ct);
- ip_vs_conn_put(ct);
-
- ip_vs_conn_stats(cp, svc);
- return cp;
-}
-
-
-/*
- * IPVS main scheduling function
- * It selects a server according to the virtual service, and
- * creates a connection entry.
- * Protocols supported: TCP, UDP
- */
-struct ip_vs_conn *
-ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
-{
- struct ip_vs_conn *cp = NULL;
- struct iphdr *iph = skb->nh.iph;
- struct ip_vs_dest *dest;
- __be16 _ports[2], *pptr;
-
- pptr = skb_header_pointer(skb, iph->ihl*4,
- sizeof(_ports), _ports);
- if (pptr == NULL)
- return NULL;
-
- /*
- * Persistent service
- */
- if (svc->flags & IP_VS_SVC_F_PERSISTENT)
- return ip_vs_sched_persist(svc, skb, pptr);
-
- /*
- * Non-persistent service
- */
- if (!svc->fwmark && pptr[1] != svc->port) {
- if (!svc->port)
- IP_VS_ERR("Schedule: port zero only supported "
- "in persistent services, "
- "check your ipvs configuration\n");
- return NULL;
- }
-
- dest = svc->scheduler->schedule(svc, skb);
- if (dest == NULL) {
- IP_VS_DBG(1, "Schedule: no dest found.\n");
- return NULL;
- }
-
- /*
- * Create a connection entry.
- */
- cp = ip_vs_conn_new(iph->protocol,
- iph->saddr, pptr[0],
- iph->daddr, pptr[1],
- dest->addr, dest->port?dest->port:pptr[1],
- 0,
- dest);
- if (cp == NULL)
- return NULL;
-
- IP_VS_DBG(6, "Schedule fwd:%c c:%u.%u.%u.%u:%u v:%u.%u.%u.%u:%u "
- "d:%u.%u.%u.%u:%u conn->flags:%X conn->refcnt:%d\n",
- ip_vs_fwd_tag(cp),
- NIPQUAD(cp->caddr), ntohs(cp->cport),
- NIPQUAD(cp->vaddr), ntohs(cp->vport),
- NIPQUAD(cp->daddr), ntohs(cp->dport),
- cp->flags, atomic_read(&cp->refcnt));
-
- ip_vs_conn_stats(cp, svc);
- return cp;
-}
-
-
-/*
- * Pass or drop the packet.
- * Called by ip_vs_in, when the virtual service is available but
- * no destination is available for a new connection.
- */
-int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
- struct ip_vs_protocol *pp)
-{
- __be16 _ports[2], *pptr;
- struct iphdr *iph = skb->nh.iph;
-
- pptr = skb_header_pointer(skb, iph->ihl*4,
- sizeof(_ports), _ports);
- if (pptr == NULL) {
- ip_vs_service_put(svc);
- return NF_DROP;
- }
-
- /* if it is fwmark-based service, the cache_bypass sysctl is up
- and the destination is RTN_UNICAST (and not local), then create
- a cache_bypass connection entry */
- if (sysctl_ip_vs_cache_bypass && svc->fwmark
- && (inet_addr_type(iph->daddr) == RTN_UNICAST)) {
- int ret, cs;
- struct ip_vs_conn *cp;
-
- ip_vs_service_put(svc);
-
- /* create a new connection entry */
- IP_VS_DBG(6, "ip_vs_leave: create a cache_bypass entry\n");
- cp = ip_vs_conn_new(iph->protocol,
- iph->saddr, pptr[0],
- iph->daddr, pptr[1],
- 0, 0,
- IP_VS_CONN_F_BYPASS,
- NULL);
- if (cp == NULL)
- return NF_DROP;
-
- /* statistics */
- ip_vs_in_stats(cp, skb);
-
- /* set state */
- cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
-
- /* transmit the first SYN packet */
- ret = cp->packet_xmit(skb, cp, pp);
- /* do not touch skb anymore */
-
- atomic_inc(&cp->in_pkts);
- ip_vs_conn_put(cp);
- return ret;
- }
-
- /*
- * When the virtual ftp service is presented, packets destined
- * for other services on the VIP may get here (except services
- * listed in the ipvs table), pass the packets, because it is
- * not ipvs job to decide to drop the packets.
- */
- if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) {
- ip_vs_service_put(svc);
- return NF_ACCEPT;
- }
-
- ip_vs_service_put(svc);
-
- /*
- * Notify the client that the destination is unreachable, and
- * release the socket buffer.
- * Since it is in IP layer, the TCP socket is not actually
- * created, the TCP RST packet cannot be sent, instead that
- * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ
- */
- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
- return NF_DROP;
-}
-
-
-/*
- * It is hooked before NF_IP_PRI_NAT_SRC at the NF_IP_POST_ROUTING
- * chain, and is used for VS/NAT.
- * It detects packets for VS/NAT connections and sends the packets
- * immediately. This can avoid that iptable_nat mangles the packets
- * for VS/NAT.
- */
-static unsigned int ip_vs_post_routing(unsigned int hooknum,
- struct sk_buff **pskb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
-{
- if (!((*pskb)->ipvs_property))
- return NF_ACCEPT;
- /* The packet was sent from IPVS, exit this chain */
- return NF_STOP;
-}
-
-__sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)
-{
- return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0));
-}
-
-static inline struct sk_buff *
-ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user)
-{
- skb = ip_defrag(skb, user);
- if (skb)
- ip_send_check(skb->nh.iph);
- return skb;
-}
-
-/*
- * Packet has been made sufficiently writable in caller
- * - inout: 1=in->out, 0=out->in
- */
-void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp,
- struct ip_vs_conn *cp, int inout)
-{
- struct iphdr *iph = skb->nh.iph;
- unsigned int icmp_offset = iph->ihl*4;
- struct icmphdr *icmph = (struct icmphdr *)(skb->nh.raw + icmp_offset);
- struct iphdr *ciph = (struct iphdr *)(icmph + 1);
-
- if (inout) {
- iph->saddr = cp->vaddr;
- ip_send_check(iph);
- ciph->daddr = cp->vaddr;
- ip_send_check(ciph);
- } else {
- iph->daddr = cp->daddr;
- ip_send_check(iph);
- ciph->saddr = cp->daddr;
- ip_send_check(ciph);
- }
-
- /* the TCP/UDP port */
- if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol) {
- __be16 *ports = (void *)ciph + ciph->ihl*4;
-
- if (inout)
- ports[1] = cp->vport;
- else
- ports[0] = cp->dport;
- }
-
- /* And finally the ICMP checksum */
- icmph->checksum = 0;
- icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset);
- skb->ip_summed = CHECKSUM_UNNECESSARY;
-
- if (inout)
- IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
- "Forwarding altered outgoing ICMP");
- else
- IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
- "Forwarding altered incoming ICMP");
-}
-
-/*
- * Handle ICMP messages in the inside-to-outside direction (outgoing).
- * Find any that might be relevant, check against existing connections,
- * forward to the right destination host if relevant.
- * Currently handles error types - unreachable, quench, ttl exceeded.
- * (Only used in VS/NAT)
- */
-static int ip_vs_out_icmp(struct sk_buff **pskb, int *related)
-{
- struct sk_buff *skb = *pskb;
- struct iphdr *iph;
- struct icmphdr _icmph, *ic;
- struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */
- struct ip_vs_conn *cp;
- struct ip_vs_protocol *pp;
- unsigned int offset, ihl, verdict;
-
- *related = 1;
-
- /* reassemble IP fragments */
- if (skb->nh.iph->frag_off & __constant_htons(IP_MF|IP_OFFSET)) {
- skb = ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT);
- if (!skb)
- return NF_STOLEN;
- *pskb = skb;
- }
-
- iph = skb->nh.iph;
- offset = ihl = iph->ihl * 4;
- ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
- if (ic == NULL)
- return NF_DROP;
-
- IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n",
- ic->type, ntohs(icmp_id(ic)),
- NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
-
- /*
- * Work through seeing if this is for us.
- * These checks are supposed to be in an order that means easy
- * things are checked first to speed up processing.... however
- * this means that some packets will manage to get a long way
- * down this stack and then be rejected, but that's life.
- */
- if ((ic->type != ICMP_DEST_UNREACH) &&
- (ic->type != ICMP_SOURCE_QUENCH) &&
- (ic->type != ICMP_TIME_EXCEEDED)) {
- *related = 0;
- return NF_ACCEPT;
- }
-
- /* Now find the contained IP header */
- offset += sizeof(_icmph);
- cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
- if (cih == NULL)
- return NF_ACCEPT; /* The packet looks wrong, ignore */
-
- pp = ip_vs_proto_get(cih->protocol);
- if (!pp)
- return NF_ACCEPT;
-
- /* Is the embedded protocol header present? */
- if (unlikely(cih->frag_off & __constant_htons(IP_OFFSET) &&
- pp->dont_defrag))
- return NF_ACCEPT;
-
- IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMP for");
-
- offset += cih->ihl * 4;
-
- /* The embedded headers contain source and dest in reverse order */
- cp = pp->conn_out_get(skb, pp, cih, offset, 1);
- if (!cp)
- return NF_ACCEPT;
-
- verdict = NF_DROP;
-
- if (IP_VS_FWD_METHOD(cp) != 0) {
- IP_VS_ERR("shouldn't reach here, because the box is on the"
- "half connection in the tun/dr module.\n");
- }
-
- /* Ensure the checksum is correct */
- if (skb->ip_summed != CHECKSUM_UNNECESSARY &&
- ip_vs_checksum_complete(skb, ihl)) {
- /* Failed checksum! */
- IP_VS_DBG(1, "Forward ICMP: failed checksum from %d.%d.%d.%d!\n",
- NIPQUAD(iph->saddr));
- goto out;
- }
-
- if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol)
- offset += 2 * sizeof(__u16);
- if (!ip_vs_make_skb_writable(pskb, offset))
- goto out;
- skb = *pskb;
-
- ip_vs_nat_icmp(skb, pp, cp, 1);
-
- /* do the statistics and put it back */
- ip_vs_out_stats(cp, skb);
-
- skb->ipvs_property = 1;
- verdict = NF_ACCEPT;
-
- out:
- __ip_vs_conn_put(cp);
-
- return verdict;
-}
-
-static inline int is_tcp_reset(const struct sk_buff *skb)
-{
- struct tcphdr _tcph, *th;
-
- th = skb_header_pointer(skb, skb->nh.iph->ihl * 4,
- sizeof(_tcph), &_tcph);
- if (th == NULL)
- return 0;
- return th->rst;
-}
-
-/*
- * It is hooked at the NF_IP_FORWARD chain, used only for VS/NAT.
- * Check if outgoing packet belongs to the established ip_vs_conn,
- * rewrite addresses of the packet and send it on its way...
- */
-static unsigned int
-ip_vs_out(unsigned int hooknum, struct sk_buff **pskb,
- const struct net_device *in, const struct net_device *out,
- int (*okfn)(struct sk_buff *))
-{
- struct sk_buff *skb = *pskb;
- struct iphdr *iph;
- struct ip_vs_protocol *pp;
- struct ip_vs_conn *cp;
- int ihl;
-
- EnterFunction(11);
-
- if (skb->ipvs_property)
- return NF_ACCEPT;
-
- iph = skb->nh.iph;
- if (unlikely(iph->protocol == IPPROTO_ICMP)) {
- int related, verdict = ip_vs_out_icmp(pskb, &related);
-
- if (related)
- return verdict;
- skb = *pskb;
- iph = skb->nh.iph;
- }
-
- pp = ip_vs_proto_get(iph->protocol);
- if (unlikely(!pp))
- return NF_ACCEPT;
-
- /* reassemble IP fragments */
- if (unlikely(iph->frag_off & __constant_htons(IP_MF|IP_OFFSET) &&
- !pp->dont_defrag)) {
- skb = ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT);
- if (!skb)
- return NF_STOLEN;
- iph = skb->nh.iph;
- *pskb = skb;
- }
-
- ihl = iph->ihl << 2;
-
- /*
- * Check if the packet belongs to an existing entry
- */
- cp = pp->conn_out_get(skb, pp, iph, ihl, 0);
-
- if (unlikely(!cp)) {
- if (sysctl_ip_vs_nat_icmp_send &&
- (pp->protocol == IPPROTO_TCP ||
- pp->protocol == IPPROTO_UDP)) {
- __be16 _ports[2], *pptr;
-
- pptr = skb_header_pointer(skb, ihl,
- sizeof(_ports), _ports);
- if (pptr == NULL)
- return NF_ACCEPT; /* Not for me */
- if (ip_vs_lookup_real_service(iph->protocol,
- iph->saddr, pptr[0])) {
- /*
- * Notify the real server: there is no
- * existing entry if it is not RST
- * packet or not TCP packet.
- */
- if (iph->protocol != IPPROTO_TCP
- || !is_tcp_reset(skb)) {
- icmp_send(skb,ICMP_DEST_UNREACH,
- ICMP_PORT_UNREACH, 0);
- return NF_DROP;
- }
- }
- }
- IP_VS_DBG_PKT(12, pp, skb, 0,
- "packet continues traversal as normal");
- return NF_ACCEPT;
- }
-
- IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet");
-
- if (!ip_vs_make_skb_writable(pskb, ihl))
- goto drop;
-
- /* mangle the packet */
- if (pp->snat_handler && !pp->snat_handler(pskb, pp, cp))
- goto drop;
- skb = *pskb;
- skb->nh.iph->saddr = cp->vaddr;
- ip_send_check(skb->nh.iph);
-
- /* For policy routing, packets originating from this
- * machine itself may be routed differently to packets
- * passing through. We want this packet to be routed as
- * if it came from this machine itself. So re-compute
- * the routing information.
- */
- if (ip_route_me_harder(pskb, RTN_LOCAL) != 0)
- goto drop;
- skb = *pskb;
-
- IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT");
-
- ip_vs_out_stats(cp, skb);
- ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp);
- ip_vs_conn_put(cp);
-
- skb->ipvs_property = 1;
-
- LeaveFunction(11);
- return NF_ACCEPT;
-
- drop:
- ip_vs_conn_put(cp);
- kfree_skb(*pskb);
- return NF_STOLEN;
-}
-
-
-/*
- * Handle ICMP messages in the outside-to-inside direction (incoming).
- * Find any that might be relevant, check against existing connections,
- * forward to the right destination host if relevant.
- * Currently handles error types - unreachable, quench, ttl exceeded.
- */
-static int
-ip_vs_in_icmp(struct sk_buff **pskb, int *related, unsigned int hooknum)
-{
- struct sk_buff *skb = *pskb;
- struct iphdr *iph;
- struct icmphdr _icmph, *ic;
- struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */
- struct ip_vs_conn *cp;
- struct ip_vs_protocol *pp;
- unsigned int offset, ihl, verdict;
-
- *related = 1;
-
- /* reassemble IP fragments */
- if (skb->nh.iph->frag_off & __constant_htons(IP_MF|IP_OFFSET)) {
- skb = ip_vs_gather_frags(skb,
- hooknum == NF_IP_LOCAL_IN ?
- IP_DEFRAG_VS_IN : IP_DEFRAG_VS_FWD);
- if (!skb)
- return NF_STOLEN;
- *pskb = skb;
- }
-
- iph = skb->nh.iph;
- offset = ihl = iph->ihl * 4;
- ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
- if (ic == NULL)
- return NF_DROP;
-
- IP_VS_DBG(12, "Incoming ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n",
- ic->type, ntohs(icmp_id(ic)),
- NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
-
- /*
- * Work through seeing if this is for us.
- * These checks are supposed to be in an order that means easy
- * things are checked first to speed up processing.... however
- * this means that some packets will manage to get a long way
- * down this stack and then be rejected, but that's life.
- */
- if ((ic->type != ICMP_DEST_UNREACH) &&
- (ic->type != ICMP_SOURCE_QUENCH) &&
- (ic->type != ICMP_TIME_EXCEEDED)) {
- *related = 0;
- return NF_ACCEPT;
- }
-
- /* Now find the contained IP header */
- offset += sizeof(_icmph);
- cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
- if (cih == NULL)
- return NF_ACCEPT; /* The packet looks wrong, ignore */
-
- pp = ip_vs_proto_get(cih->protocol);
- if (!pp)
- return NF_ACCEPT;
-
- /* Is the embedded protocol header present? */
- if (unlikely(cih->frag_off & __constant_htons(IP_OFFSET) &&
- pp->dont_defrag))
- return NF_ACCEPT;
-
- IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMP for");
-
- offset += cih->ihl * 4;
-
- /* The embedded headers contain source and dest in reverse order */
- cp = pp->conn_in_get(skb, pp, cih, offset, 1);
- if (!cp)
- return NF_ACCEPT;
-
- verdict = NF_DROP;
-
- /* Ensure the checksum is correct */
- if (skb->ip_summed != CHECKSUM_UNNECESSARY &&
- ip_vs_checksum_complete(skb, ihl)) {
- /* Failed checksum! */
- IP_VS_DBG(1, "Incoming ICMP: failed checksum from %d.%d.%d.%d!\n",
- NIPQUAD(iph->saddr));
- goto out;
- }
-
- /* do the statistics and put it back */
- ip_vs_in_stats(cp, skb);
- if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol)
- offset += 2 * sizeof(__u16);
- verdict = ip_vs_icmp_xmit(skb, cp, pp, offset);
- /* do not touch skb anymore */
-
- out:
- __ip_vs_conn_put(cp);
-
- return verdict;
-}
-
-/*
- * Check if it's for virtual services, look it up,
- * and send it on its way...
- */
-static unsigned int
-ip_vs_in(unsigned int hooknum, struct sk_buff **pskb,
- const struct net_device *in, const struct net_device *out,
- int (*okfn)(struct sk_buff *))
-{
- struct sk_buff *skb = *pskb;
- struct iphdr *iph;
- struct ip_vs_protocol *pp;
- struct ip_vs_conn *cp;
- int ret, restart;
- int ihl;
-
- /*
- * Big tappo: only PACKET_HOST (neither loopback nor mcasts)
- * ... don't know why 1st test DOES NOT include 2nd (?)
- */
- if (unlikely(skb->pkt_type != PACKET_HOST
- || skb->dev == &loopback_dev || skb->sk)) {
- IP_VS_DBG(12, "packet type=%d proto=%d daddr=%d.%d.%d.%d ignored\n",
- skb->pkt_type,
- skb->nh.iph->protocol,
- NIPQUAD(skb->nh.iph->daddr));
- return NF_ACCEPT;
- }
-
- iph = skb->nh.iph;
- if (unlikely(iph->protocol == IPPROTO_ICMP)) {
- int related, verdict = ip_vs_in_icmp(pskb, &related, hooknum);
-
- if (related)
- return verdict;
- skb = *pskb;
- iph = skb->nh.iph;
- }
-
- /* Protocol supported? */
- pp = ip_vs_proto_get(iph->protocol);
- if (unlikely(!pp))
- return NF_ACCEPT;
-
- ihl = iph->ihl << 2;
-
- /*
- * Check if the packet belongs to an existing connection entry
- */
- cp = pp->conn_in_get(skb, pp, iph, ihl, 0);
-
- if (unlikely(!cp)) {
- int v;
-
- if (!pp->conn_schedule(skb, pp, &v, &cp))
- return v;
- }
-
- if (unlikely(!cp)) {
- /* sorry, all this trouble for a no-hit :) */
- IP_VS_DBG_PKT(12, pp, skb, 0,
- "packet continues traversal as normal");
- return NF_ACCEPT;
- }
-
- IP_VS_DBG_PKT(11, pp, skb, 0, "Incoming packet");
-
- /* Check the server status */
- if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
- /* the destination server is not available */
-
- if (sysctl_ip_vs_expire_nodest_conn) {
- /* try to expire the connection immediately */
- ip_vs_conn_expire_now(cp);
- }
- /* don't restart its timer, and silently
- drop the packet. */
- __ip_vs_conn_put(cp);
- return NF_DROP;
- }
-
- ip_vs_in_stats(cp, skb);
- restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
- if (cp->packet_xmit)
- ret = cp->packet_xmit(skb, cp, pp);
- /* do not touch skb anymore */
- else {
- IP_VS_DBG_RL("warning: packet_xmit is null");
- ret = NF_ACCEPT;
- }
-
- /* increase its packet counter and check if it is needed
- to be synchronized */
- atomic_inc(&cp->in_pkts);
- if ((ip_vs_sync_state & IP_VS_STATE_MASTER) &&
- (cp->protocol != IPPROTO_TCP ||
- cp->state == IP_VS_TCP_S_ESTABLISHED) &&
- (atomic_read(&cp->in_pkts) % sysctl_ip_vs_sync_threshold[1]
- == sysctl_ip_vs_sync_threshold[0]))
- ip_vs_sync_conn(cp);
-
- ip_vs_conn_put(cp);
- return ret;
-}
-
-
-/*
- * It is hooked at the NF_IP_FORWARD chain, in order to catch ICMP
- * related packets destined for 0.0.0.0/0.
- * When fwmark-based virtual service is used, such as transparent
- * cache cluster, TCP packets can be marked and routed to ip_vs_in,
- * but ICMP destined for 0.0.0.0/0 cannot not be easily marked and
- * sent to ip_vs_in_icmp. So, catch them at the NF_IP_FORWARD chain
- * and send them to ip_vs_in_icmp.
- */
-static unsigned int
-ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff **pskb,
- const struct net_device *in, const struct net_device *out,
- int (*okfn)(struct sk_buff *))
-{
- int r;
-
- if ((*pskb)->nh.iph->protocol != IPPROTO_ICMP)
- return NF_ACCEPT;
-
- return ip_vs_in_icmp(pskb, &r, hooknum);
-}
-
-
-/* After packet filtering, forward packet through VS/DR, VS/TUN,
- or VS/NAT(change destination), so that filtering rules can be
- applied to IPVS. */
-static struct nf_hook_ops ip_vs_in_ops = {
- .hook = ip_vs_in,
- .owner = THIS_MODULE,
- .pf = PF_INET,
- .hooknum = NF_IP_LOCAL_IN,
- .priority = 100,
-};
-
-/* After packet filtering, change source only for VS/NAT */
-static struct nf_hook_ops ip_vs_out_ops = {
- .hook = ip_vs_out,
- .owner = THIS_MODULE,
- .pf = PF_INET,
- .hooknum = NF_IP_FORWARD,
- .priority = 100,
-};
-
-/* After packet filtering (but before ip_vs_out_icmp), catch icmp
- destined for 0.0.0.0/0, which is for incoming IPVS connections */
-static struct nf_hook_ops ip_vs_forward_icmp_ops = {
- .hook = ip_vs_forward_icmp,
- .owner = THIS_MODULE,
- .pf = PF_INET,
- .hooknum = NF_IP_FORWARD,
- .priority = 99,
-};
-
-/* Before the netfilter connection tracking, exit from POST_ROUTING */
-static struct nf_hook_ops ip_vs_post_routing_ops = {
- .hook = ip_vs_post_routing,
- .owner = THIS_MODULE,
- .pf = PF_INET,
- .hooknum = NF_IP_POST_ROUTING,
- .priority = NF_IP_PRI_NAT_SRC-1,
-};
-
-
-/*
- * Initialize IP Virtual Server
- */
-static int __init ip_vs_init(void)
-{
- int ret;
-
- ret = ip_vs_control_init();
- if (ret < 0) {
- IP_VS_ERR("can't setup control.\n");
- goto cleanup_nothing;
- }
-
- ip_vs_protocol_init();
-
- ret = ip_vs_app_init();
- if (ret < 0) {
- IP_VS_ERR("can't setup application helper.\n");
- goto cleanup_protocol;
- }
-
- ret = ip_vs_conn_init();
- if (ret < 0) {
- IP_VS_ERR("can't setup connection table.\n");
- goto cleanup_app;
- }
-
- ret = nf_register_hook(&ip_vs_in_ops);
- if (ret < 0) {
- IP_VS_ERR("can't register in hook.\n");
- goto cleanup_conn;
- }
-
- ret = nf_register_hook(&ip_vs_out_ops);
- if (ret < 0) {
- IP_VS_ERR("can't register out hook.\n");
- goto cleanup_inops;
- }
- ret = nf_register_hook(&ip_vs_post_routing_ops);
- if (ret < 0) {
- IP_VS_ERR("can't register post_routing hook.\n");
- goto cleanup_outops;
- }
- ret = nf_register_hook(&ip_vs_forward_icmp_ops);
- if (ret < 0) {
- IP_VS_ERR("can't register forward_icmp hook.\n");
- goto cleanup_postroutingops;
- }
-
- IP_VS_INFO("ipvs loaded.\n");
- return ret;
-
- cleanup_postroutingops:
- nf_unregister_hook(&ip_vs_post_routing_ops);
- cleanup_outops:
- nf_unregister_hook(&ip_vs_out_ops);
- cleanup_inops:
- nf_unregister_hook(&ip_vs_in_ops);
- cleanup_conn:
- ip_vs_conn_cleanup();
- cleanup_app:
- ip_vs_app_cleanup();
- cleanup_protocol:
- ip_vs_protocol_cleanup();
- ip_vs_control_cleanup();
- cleanup_nothing:
- return ret;
-}
-
-static void __exit ip_vs_cleanup(void)
-{
- nf_unregister_hook(&ip_vs_forward_icmp_ops);
- nf_unregister_hook(&ip_vs_post_routing_ops);
- nf_unregister_hook(&ip_vs_out_ops);
- nf_unregister_hook(&ip_vs_in_ops);
- ip_vs_conn_cleanup();
- ip_vs_app_cleanup();
- ip_vs_protocol_cleanup();
- ip_vs_control_cleanup();
- IP_VS_INFO("ipvs unloaded.\n");
-}
-
-module_init(ip_vs_init);
-module_exit(ip_vs_cleanup);
-MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c
deleted file mode 100644
index f261616e4602..000000000000
--- a/net/ipv4/ipvs/ip_vs_ctl.c
+++ /dev/null
@@ -1,2396 +0,0 @@
-/*
- * IPVS An implementation of the IP virtual server support for the
- * LINUX operating system. IPVS is now implemented as a module
- * over the NetFilter framework. IPVS can be used to build a
- * high-performance and highly available server based on a
- * cluster of servers.
- *
- * Version: $Id: ip_vs_ctl.c,v 1.36 2003/06/08 09:31:19 wensong Exp $
- *
- * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
- * Peter Kese <peter.kese@ijs.si>
- * Julian Anastasov <ja@ssi.bg>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * Changes:
- *
- */
-
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/capability.h>
-#include <linux/fs.h>
-#include <linux/sysctl.h>
-#include <linux/proc_fs.h>
-#include <linux/workqueue.h>
-#include <linux/swap.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv4.h>
-#include <linux/mutex.h>
-
-#include <net/ip.h>
-#include <net/route.h>
-#include <net/sock.h>
-
-#include <asm/uaccess.h>
-
-#include <net/ip_vs.h>
-
-/* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */
-static DEFINE_MUTEX(__ip_vs_mutex);
-
-/* lock for service table */
-static DEFINE_RWLOCK(__ip_vs_svc_lock);
-
-/* lock for table with the real services */
-static DEFINE_RWLOCK(__ip_vs_rs_lock);
-
-/* lock for state and timeout tables */
-static DEFINE_RWLOCK(__ip_vs_securetcp_lock);
-
-/* lock for drop entry handling */
-static DEFINE_SPINLOCK(__ip_vs_dropentry_lock);
-
-/* lock for drop packet handling */
-static DEFINE_SPINLOCK(__ip_vs_droppacket_lock);
-
-/* 1/rate drop and drop-entry variables */
-int ip_vs_drop_rate = 0;
-int ip_vs_drop_counter = 0;
-static atomic_t ip_vs_dropentry = ATOMIC_INIT(0);
-
-/* number of virtual services */
-static int ip_vs_num_services = 0;
-
-/* sysctl variables */
-static int sysctl_ip_vs_drop_entry = 0;
-static int sysctl_ip_vs_drop_packet = 0;
-static int sysctl_ip_vs_secure_tcp = 0;
-static int sysctl_ip_vs_amemthresh = 1024;
-static int sysctl_ip_vs_am_droprate = 10;
-int sysctl_ip_vs_cache_bypass = 0;
-int sysctl_ip_vs_expire_nodest_conn = 0;
-int sysctl_ip_vs_expire_quiescent_template = 0;
-int sysctl_ip_vs_sync_threshold[2] = { 3, 50 };
-int sysctl_ip_vs_nat_icmp_send = 0;
-
-
-#ifdef CONFIG_IP_VS_DEBUG
-static int sysctl_ip_vs_debug_level = 0;
-
-int ip_vs_get_debug_level(void)
-{
- return sysctl_ip_vs_debug_level;
-}
-#endif
-
-/*
- * update_defense_level is called from keventd and from sysctl,
- * so it needs to protect itself from softirqs
- */
-static void update_defense_level(void)
-{
- struct sysinfo i;
- static int old_secure_tcp = 0;
- int availmem;
- int nomem;
- int to_change = -1;
-
- /* we only count free and buffered memory (in pages) */
- si_meminfo(&i);
- availmem = i.freeram + i.bufferram;
- /* however in linux 2.5 the i.bufferram is total page cache size,
- we need adjust it */
- /* si_swapinfo(&i); */
- /* availmem = availmem - (i.totalswap - i.freeswap); */
-
- nomem = (availmem < sysctl_ip_vs_amemthresh);
-
- local_bh_disable();
-
- /* drop_entry */
- spin_lock(&__ip_vs_dropentry_lock);
- switch (sysctl_ip_vs_drop_entry) {
- case 0:
- atomic_set(&ip_vs_dropentry, 0);
- break;
- case 1:
- if (nomem) {
- atomic_set(&ip_vs_dropentry, 1);
- sysctl_ip_vs_drop_entry = 2;
- } else {
- atomic_set(&ip_vs_dropentry, 0);
- }
- break;
- case 2:
- if (nomem) {
- atomic_set(&ip_vs_dropentry, 1);
- } else {
- atomic_set(&ip_vs_dropentry, 0);
- sysctl_ip_vs_drop_entry = 1;
- };
- break;
- case 3:
- atomic_set(&ip_vs_dropentry, 1);
- break;
- }
- spin_unlock(&__ip_vs_dropentry_lock);
-
- /* drop_packet */
- spin_lock(&__ip_vs_droppacket_lock);
- switch (sysctl_ip_vs_drop_packet) {
- case 0:
- ip_vs_drop_rate = 0;
- break;
- case 1:
- if (nomem) {
- ip_vs_drop_rate = ip_vs_drop_counter
- = sysctl_ip_vs_amemthresh /
- (sysctl_ip_vs_amemthresh-availmem);
- sysctl_ip_vs_drop_packet = 2;
- } else {
- ip_vs_drop_rate = 0;
- }
- break;
- case 2:
- if (nomem) {
- ip_vs_drop_rate = ip_vs_drop_counter
- = sysctl_ip_vs_amemthresh /
- (sysctl_ip_vs_amemthresh-availmem);
- } else {
- ip_vs_drop_rate = 0;
- sysctl_ip_vs_drop_packet = 1;
- }
- break;
- case 3:
- ip_vs_drop_rate = sysctl_ip_vs_am_droprate;
- break;
- }
- spin_unlock(&__ip_vs_droppacket_lock);
-
- /* secure_tcp */
- write_lock(&__ip_vs_securetcp_lock);
- switch (sysctl_ip_vs_secure_tcp) {
- case 0:
- if (old_secure_tcp >= 2)
- to_change = 0;
- break;
- case 1:
- if (nomem) {
- if (old_secure_tcp < 2)
- to_change = 1;
- sysctl_ip_vs_secure_tcp = 2;
- } else {
- if (old_secure_tcp >= 2)
- to_change = 0;
- }
- break;
- case 2:
- if (nomem) {
- if (old_secure_tcp < 2)
- to_change = 1;
- } else {
- if (old_secure_tcp >= 2)
- to_change = 0;
- sysctl_ip_vs_secure_tcp = 1;
- }
- break;
- case 3:
- if (old_secure_tcp < 2)
- to_change = 1;
- break;
- }
- old_secure_tcp = sysctl_ip_vs_secure_tcp;
- if (to_change >= 0)
- ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1);
- write_unlock(&__ip_vs_securetcp_lock);
-
- local_bh_enable();
-}
-
-
-/*
- * Timer for checking the defense
- */
-#define DEFENSE_TIMER_PERIOD 1*HZ
-static void defense_work_handler(void *data);
-static DECLARE_WORK(defense_work, defense_work_handler, NULL);
-
-static void defense_work_handler(void *data)
-{
- update_defense_level();
- if (atomic_read(&ip_vs_dropentry))
- ip_vs_random_dropentry();
-
- schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
-}
-
-int
-ip_vs_use_count_inc(void)
-{
- return try_module_get(THIS_MODULE);
-}
-
-void
-ip_vs_use_count_dec(void)
-{
- module_put(THIS_MODULE);
-}
-
-
-/*
- * Hash table: for virtual service lookups
- */
-#define IP_VS_SVC_TAB_BITS 8
-#define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
-#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)
-
-/* the service table hashed by <protocol, addr, port> */
-static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
-/* the service table hashed by fwmark */
-static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
-
-/*
- * Hash table: for real service lookups
- */
-#define IP_VS_RTAB_BITS 4
-#define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS)
-#define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1)
-
-static struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE];
-
-/*
- * Trash for destinations
- */
-static LIST_HEAD(ip_vs_dest_trash);
-
-/*
- * FTP & NULL virtual service counters
- */
-static atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0);
-static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0);
-
-
-/*
- * Returns hash value for virtual service
- */
-static __inline__ unsigned
-ip_vs_svc_hashkey(unsigned proto, __be32 addr, __be16 port)
-{
- register unsigned porth = ntohs(port);
-
- return (proto^ntohl(addr)^(porth>>IP_VS_SVC_TAB_BITS)^porth)
- & IP_VS_SVC_TAB_MASK;
-}
-
-/*
- * Returns hash value of fwmark for virtual service lookup
- */
-static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark)
-{
- return fwmark & IP_VS_SVC_TAB_MASK;
-}
-
-/*
- * Hashes a service in the ip_vs_svc_table by <proto,addr,port>
- * or in the ip_vs_svc_fwm_table by fwmark.
- * Should be called with locked tables.
- */
-static int ip_vs_svc_hash(struct ip_vs_service *svc)
-{
- unsigned hash;
-
- if (svc->flags & IP_VS_SVC_F_HASHED) {
- IP_VS_ERR("ip_vs_svc_hash(): request for already hashed, "
- "called from %p\n", __builtin_return_address(0));
- return 0;
- }
-
- if (svc->fwmark == 0) {
- /*
- * Hash it by <protocol,addr,port> in ip_vs_svc_table
- */
- hash = ip_vs_svc_hashkey(svc->protocol, svc->addr, svc->port);
- list_add(&svc->s_list, &ip_vs_svc_table[hash]);
- } else {
- /*
- * Hash it by fwmark in ip_vs_svc_fwm_table
- */
- hash = ip_vs_svc_fwm_hashkey(svc->fwmark);
- list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
- }
-
- svc->flags |= IP_VS_SVC_F_HASHED;
- /* increase its refcnt because it is referenced by the svc table */
- atomic_inc(&svc->refcnt);
- return 1;
-}
-
-
-/*
- * Unhashes a service from ip_vs_svc_table/ip_vs_svc_fwm_table.
- * Should be called with locked tables.
- */
-static int ip_vs_svc_unhash(struct ip_vs_service *svc)
-{
- if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
- IP_VS_ERR("ip_vs_svc_unhash(): request for unhash flagged, "
- "called from %p\n", __builtin_return_address(0));
- return 0;
- }
-
- if (svc->fwmark == 0) {
- /* Remove it from the ip_vs_svc_table table */
- list_del(&svc->s_list);
- } else {
- /* Remove it from the ip_vs_svc_fwm_table table */
- list_del(&svc->f_list);
- }
-
- svc->flags &= ~IP_VS_SVC_F_HASHED;
- atomic_dec(&svc->refcnt);
- return 1;
-}
-
-
-/*
- * Get service by {proto,addr,port} in the service table.
- */
-static __inline__ struct ip_vs_service *
-__ip_vs_service_get(__u16 protocol, __be32 vaddr, __be16 vport)
-{
- unsigned hash;
- struct ip_vs_service *svc;
-
- /* Check for "full" addressed entries */
- hash = ip_vs_svc_hashkey(protocol, vaddr, vport);
-
- list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){
- if ((svc->addr == vaddr)
- && (svc->port == vport)
- && (svc->protocol == protocol)) {
- /* HIT */
- atomic_inc(&svc->usecnt);
- return svc;
- }
- }
-
- return NULL;
-}
-
-
-/*
- * Get service by {fwmark} in the service table.
- */
-static __inline__ struct ip_vs_service *__ip_vs_svc_fwm_get(__u32 fwmark)
-{
- unsigned hash;
- struct ip_vs_service *svc;
-
- /* Check for fwmark addressed entries */
- hash = ip_vs_svc_fwm_hashkey(fwmark);
-
- list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
- if (svc->fwmark == fwmark) {
- /* HIT */
- atomic_inc(&svc->usecnt);
- return svc;
- }
- }
-
- return NULL;
-}
-
-struct ip_vs_service *
-ip_vs_service_get(__u32 fwmark, __u16 protocol, __be32 vaddr, __be16 vport)
-{
- struct ip_vs_service *svc;
-
- read_lock(&__ip_vs_svc_lock);
-
- /*
- * Check the table hashed by fwmark first
- */
- if (fwmark && (svc = __ip_vs_svc_fwm_get(fwmark)))
- goto out;
-
- /*
- * Check the table hashed by <protocol,addr,port>
- * for "full" addressed entries
- */
- svc = __ip_vs_service_get(protocol, vaddr, vport);
-
- if (svc == NULL
- && protocol == IPPROTO_TCP
- && atomic_read(&ip_vs_ftpsvc_counter)
- && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
- /*
- * Check if ftp service entry exists, the packet
- * might belong to FTP data connections.
- */
- svc = __ip_vs_service_get(protocol, vaddr, FTPPORT);
- }
-
- if (svc == NULL
- && atomic_read(&ip_vs_nullsvc_counter)) {
- /*
- * Check if the catch-all port (port zero) exists
- */
- svc = __ip_vs_service_get(protocol, vaddr, 0);
- }
-
- out:
- read_unlock(&__ip_vs_svc_lock);
-
- IP_VS_DBG(9, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n",
- fwmark, ip_vs_proto_name(protocol),
- NIPQUAD(vaddr), ntohs(vport),
- svc?"hit":"not hit");
-
- return svc;
-}
-
-
-static inline void
-__ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
-{
- atomic_inc(&svc->refcnt);
- dest->svc = svc;
-}
-
-static inline void
-__ip_vs_unbind_svc(struct ip_vs_dest *dest)
-{
- struct ip_vs_service *svc = dest->svc;
-
- dest->svc = NULL;
- if (atomic_dec_and_test(&svc->refcnt))
- kfree(svc);
-}
-
-
-/*
- * Returns hash value for real service
- */
-static __inline__ unsigned ip_vs_rs_hashkey(__be32 addr, __be16 port)
-{
- register unsigned porth = ntohs(port);
-
- return (ntohl(addr)^(porth>>IP_VS_RTAB_BITS)^porth)
- & IP_VS_RTAB_MASK;
-}
-
-/*
- * Hashes ip_vs_dest in ip_vs_rtable by <proto,addr,port>.
- * should be called with locked tables.
- */
-static int ip_vs_rs_hash(struct ip_vs_dest *dest)
-{
- unsigned hash;
-
- if (!list_empty(&dest->d_list)) {
- return 0;
- }
-
- /*
- * Hash by proto,addr,port,
- * which are the parameters of the real service.
- */
- hash = ip_vs_rs_hashkey(dest->addr, dest->port);
- list_add(&dest->d_list, &ip_vs_rtable[hash]);
-
- return 1;
-}
-
-/*
- * UNhashes ip_vs_dest from ip_vs_rtable.
- * should be called with locked tables.
- */
-static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
-{
- /*
- * Remove it from the ip_vs_rtable table.
- */
- if (!list_empty(&dest->d_list)) {
- list_del(&dest->d_list);
- INIT_LIST_HEAD(&dest->d_list);
- }
-
- return 1;
-}
-
-/*
- * Lookup real service by <proto,addr,port> in the real service table.
- */
-struct ip_vs_dest *
-ip_vs_lookup_real_service(__u16 protocol, __be32 daddr, __be16 dport)
-{
- unsigned hash;
- struct ip_vs_dest *dest;
-
- /*
- * Check for "full" addressed entries
- * Return the first found entry
- */
- hash = ip_vs_rs_hashkey(daddr, dport);
-
- read_lock(&__ip_vs_rs_lock);
- list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) {
- if ((dest->addr == daddr)
- && (dest->port == dport)
- && ((dest->protocol == protocol) ||
- dest->vfwmark)) {
- /* HIT */
- read_unlock(&__ip_vs_rs_lock);
- return dest;
- }
- }
- read_unlock(&__ip_vs_rs_lock);
-
- return NULL;
-}
-
-/*
- * Lookup destination by {addr,port} in the given service
- */
-static struct ip_vs_dest *
-ip_vs_lookup_dest(struct ip_vs_service *svc, __be32 daddr, __be16 dport)
-{
- struct ip_vs_dest *dest;
-
- /*
- * Find the destination for the given service
- */
- list_for_each_entry(dest, &svc->destinations, n_list) {
- if ((dest->addr == daddr) && (dest->port == dport)) {
- /* HIT */
- return dest;
- }
- }
-
- return NULL;
-}
-
-
-/*
- * Lookup dest by {svc,addr,port} in the destination trash.
- * The destination trash is used to hold the destinations that are removed
- * from the service table but are still referenced by some conn entries.
- * The reason to add the destination trash is when the dest is temporary
- * down (either by administrator or by monitor program), the dest can be
- * picked back from the trash, the remaining connections to the dest can
- * continue, and the counting information of the dest is also useful for
- * scheduling.
- */
-static struct ip_vs_dest *
-ip_vs_trash_get_dest(struct ip_vs_service *svc, __be32 daddr, __be16 dport)
-{
- struct ip_vs_dest *dest, *nxt;
-
- /*
- * Find the destination in trash
- */
- list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
- IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, "
- "dest->refcnt=%d\n",
- dest->vfwmark,
- NIPQUAD(dest->addr), ntohs(dest->port),
- atomic_read(&dest->refcnt));
- if (dest->addr == daddr &&
- dest->port == dport &&
- dest->vfwmark == svc->fwmark &&
- dest->protocol == svc->protocol &&
- (svc->fwmark ||
- (dest->vaddr == svc->addr &&
- dest->vport == svc->port))) {
- /* HIT */
- return dest;
- }
-
- /*
- * Try to purge the destination from trash if not referenced
- */
- if (atomic_read(&dest->refcnt) == 1) {
- IP_VS_DBG(3, "Removing destination %u/%u.%u.%u.%u:%u "
- "from trash\n",
- dest->vfwmark,
- NIPQUAD(dest->addr), ntohs(dest->port));
- list_del(&dest->n_list);
- ip_vs_dst_reset(dest);
- __ip_vs_unbind_svc(dest);
- kfree(dest);
- }
- }
-
- return NULL;
-}
-
-
-/*
- * Clean up all the destinations in the trash
- * Called by the ip_vs_control_cleanup()
- *
- * When the ip_vs_control_clearup is activated by ipvs module exit,
- * the service tables must have been flushed and all the connections
- * are expired, and the refcnt of each destination in the trash must
- * be 1, so we simply release them here.
- */
-static void ip_vs_trash_cleanup(void)
-{
- struct ip_vs_dest *dest, *nxt;
-
- list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
- list_del(&dest->n_list);
- ip_vs_dst_reset(dest);
- __ip_vs_unbind_svc(dest);
- kfree(dest);
- }
-}
-
-
-static void
-ip_vs_zero_stats(struct ip_vs_stats *stats)
-{
- spin_lock_bh(&stats->lock);
- memset(stats, 0, (char *)&stats->lock - (char *)stats);
- spin_unlock_bh(&stats->lock);
- ip_vs_zero_estimator(stats);
-}
-
-/*
- * Update a destination in the given service
- */
-static void
-__ip_vs_update_dest(struct ip_vs_service *svc,
- struct ip_vs_dest *dest, struct ip_vs_dest_user *udest)
-{
- int conn_flags;
-
- /* set the weight and the flags */
- atomic_set(&dest->weight, udest->weight);
- conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE;
-
- /* check if local node and update the flags */
- if (inet_addr_type(udest->addr) == RTN_LOCAL) {
- conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
- | IP_VS_CONN_F_LOCALNODE;
- }
-
- /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
- if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) {
- conn_flags |= IP_VS_CONN_F_NOOUTPUT;
- } else {
- /*
- * Put the real service in ip_vs_rtable if not present.
- * For now only for NAT!
- */
- write_lock_bh(&__ip_vs_rs_lock);
- ip_vs_rs_hash(dest);
- write_unlock_bh(&__ip_vs_rs_lock);
- }
- atomic_set(&dest->conn_flags, conn_flags);
-
- /* bind the service */
- if (!dest->svc) {
- __ip_vs_bind_svc(dest, svc);
- } else {
- if (dest->svc != svc) {
- __ip_vs_unbind_svc(dest);
- ip_vs_zero_stats(&dest->stats);
- __ip_vs_bind_svc(dest, svc);
- }
- }
-
- /* set the dest status flags */
- dest->flags |= IP_VS_DEST_F_AVAILABLE;
-
- if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
- dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
- dest->u_threshold = udest->u_threshold;
- dest->l_threshold = udest->l_threshold;
-}
-
-
-/*
- * Create a destination for the given service
- */
-static int
-ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest,
- struct ip_vs_dest **dest_p)
-{
- struct ip_vs_dest *dest;
- unsigned atype;
-
- EnterFunction(2);
-
- atype = inet_addr_type(udest->addr);
- if (atype != RTN_LOCAL && atype != RTN_UNICAST)
- return -EINVAL;
-
- dest = kzalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC);
- if (dest == NULL) {
- IP_VS_ERR("ip_vs_new_dest: kmalloc failed.\n");
- return -ENOMEM;
- }
-
- dest->protocol = svc->protocol;
- dest->vaddr = svc->addr;
- dest->vport = svc->port;
- dest->vfwmark = svc->fwmark;
- dest->addr = udest->addr;
- dest->port = udest->port;
-
- atomic_set(&dest->activeconns, 0);
- atomic_set(&dest->inactconns, 0);
- atomic_set(&dest->persistconns, 0);
- atomic_set(&dest->refcnt, 0);
-
- INIT_LIST_HEAD(&dest->d_list);
- spin_lock_init(&dest->dst_lock);
- spin_lock_init(&dest->stats.lock);
- __ip_vs_update_dest(svc, dest, udest);
- ip_vs_new_estimator(&dest->stats);
-
- *dest_p = dest;
-
- LeaveFunction(2);
- return 0;
-}
-
-
-/*
- * Add a destination into an existing service
- */
-static int
-ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
-{
- struct ip_vs_dest *dest;
- __be32 daddr = udest->addr;
- __be16 dport = udest->port;
- int ret;
-
- EnterFunction(2);
-
- if (udest->weight < 0) {
- IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n");
- return -ERANGE;
- }
-
- if (udest->l_threshold > udest->u_threshold) {
- IP_VS_ERR("ip_vs_add_dest(): lower threshold is higher than "
- "upper threshold\n");
- return -ERANGE;
- }
-
- /*
- * Check if the dest already exists in the list
- */
- dest = ip_vs_lookup_dest(svc, daddr, dport);
- if (dest != NULL) {
- IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n");
- return -EEXIST;
- }
-
- /*
- * Check if the dest already exists in the trash and
- * is from the same service
- */
- dest = ip_vs_trash_get_dest(svc, daddr, dport);
- if (dest != NULL) {
- IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, "
- "dest->refcnt=%d, service %u/%u.%u.%u.%u:%u\n",
- NIPQUAD(daddr), ntohs(dport),
- atomic_read(&dest->refcnt),
- dest->vfwmark,
- NIPQUAD(dest->vaddr),
- ntohs(dest->vport));
- __ip_vs_update_dest(svc, dest, udest);
-
- /*
- * Get the destination from the trash
- */
- list_del(&dest->n_list);
-
- ip_vs_new_estimator(&dest->stats);
-
- write_lock_bh(&__ip_vs_svc_lock);
-
- /*
- * Wait until all other svc users go away.
- */
- IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
-
- list_add(&dest->n_list, &svc->destinations);
- svc->num_dests++;
-
- /* call the update_service function of its scheduler */
- svc->scheduler->update_service(svc);
-
- write_unlock_bh(&__ip_vs_svc_lock);
- return 0;
- }
-
- /*
- * Allocate and initialize the dest structure
- */
- ret = ip_vs_new_dest(svc, udest, &dest);
- if (ret) {
- return ret;
- }
-
- /*
- * Add the dest entry into the list
- */
- atomic_inc(&dest->refcnt);
-
- write_lock_bh(&__ip_vs_svc_lock);
-
- /*
- * Wait until all other svc users go away.
- */
- IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
-
- list_add(&dest->n_list, &svc->destinations);
- svc->num_dests++;
-
- /* call the update_service function of its scheduler */
- svc->scheduler->update_service(svc);
-
- write_unlock_bh(&__ip_vs_svc_lock);
-
- LeaveFunction(2);
-
- return 0;
-}
-
-
-/*
- * Edit a destination in the given service
- */
-static int
-ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
-{
- struct ip_vs_dest *dest;
- __be32 daddr = udest->addr;
- __be16 dport = udest->port;
-
- EnterFunction(2);
-
- if (udest->weight < 0) {
- IP_VS_ERR("ip_vs_edit_dest(): server weight less than zero\n");
- return -ERANGE;
- }
-
- if (udest->l_threshold > udest->u_threshold) {
- IP_VS_ERR("ip_vs_edit_dest(): lower threshold is higher than "
- "upper threshold\n");
- return -ERANGE;
- }
-
- /*
- * Lookup the destination list
- */
- dest = ip_vs_lookup_dest(svc, daddr, dport);
- if (dest == NULL) {
- IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n");
- return -ENOENT;
- }
-
- __ip_vs_update_dest(svc, dest, udest);
-
- write_lock_bh(&__ip_vs_svc_lock);
-
- /* Wait until all other svc users go away */
- while (atomic_read(&svc->usecnt) > 1) {};
-
- /* call the update_service, because server weight may be changed */
- svc->scheduler->update_service(svc);
-
- write_unlock_bh(&__ip_vs_svc_lock);
-
- LeaveFunction(2);
-
- return 0;
-}
-
-
-/*
- * Delete a destination (must be already unlinked from the service)
- */
-static void __ip_vs_del_dest(struct ip_vs_dest *dest)
-{
- ip_vs_kill_estimator(&dest->stats);
-
- /*
- * Remove it from the d-linked list with the real services.
- */
- write_lock_bh(&__ip_vs_rs_lock);
- ip_vs_rs_unhash(dest);
- write_unlock_bh(&__ip_vs_rs_lock);
-
- /*
- * Decrease the refcnt of the dest, and free the dest
- * if nobody refers to it (refcnt=0). Otherwise, throw
- * the destination into the trash.
- */
- if (atomic_dec_and_test(&dest->refcnt)) {
- ip_vs_dst_reset(dest);
- /* simply decrease svc->refcnt here, let the caller check
- and release the service if nobody refers to it.
- Only user context can release destination and service,
- and only one user context can update virtual service at a
- time, so the operation here is OK */
- atomic_dec(&dest->svc->refcnt);
- kfree(dest);
- } else {
- IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, "
- "dest->refcnt=%d\n",
- NIPQUAD(dest->addr), ntohs(dest->port),
- atomic_read(&dest->refcnt));
- list_add(&dest->n_list, &ip_vs_dest_trash);
- atomic_inc(&dest->refcnt);
- }
-}
-
-
-/*
- * Unlink a destination from the given service
- */
-static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
- struct ip_vs_dest *dest,
- int svcupd)
-{
- dest->flags &= ~IP_VS_DEST_F_AVAILABLE;
-
- /*
- * Remove it from the d-linked destination list.
- */
- list_del(&dest->n_list);
- svc->num_dests--;
- if (svcupd) {
- /*
- * Call the update_service function of its scheduler
- */
- svc->scheduler->update_service(svc);
- }
-}
-
-
-/*
- * Delete a destination server in the given service
- */
-static int
-ip_vs_del_dest(struct ip_vs_service *svc,struct ip_vs_dest_user *udest)
-{
- struct ip_vs_dest *dest;
- __be32 daddr = udest->addr;
- __be16 dport = udest->port;
-
- EnterFunction(2);
-
- dest = ip_vs_lookup_dest(svc, daddr, dport);
- if (dest == NULL) {
- IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n");
- return -ENOENT;
- }
-
- write_lock_bh(&__ip_vs_svc_lock);
-
- /*
- * Wait until all other svc users go away.
- */
- IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
-
- /*
- * Unlink dest from the service
- */
- __ip_vs_unlink_dest(svc, dest, 1);
-
- write_unlock_bh(&__ip_vs_svc_lock);
-
- /*
- * Delete the destination
- */
- __ip_vs_del_dest(dest);
-
- LeaveFunction(2);
-
- return 0;
-}
-
-
-/*
- * Add a service into the service hash table
- */
-static int
-ip_vs_add_service(struct ip_vs_service_user *u, struct ip_vs_service **svc_p)
-{
- int ret = 0;
- struct ip_vs_scheduler *sched = NULL;
- struct ip_vs_service *svc = NULL;
-
- /* increase the module use count */
- ip_vs_use_count_inc();
-
- /* Lookup the scheduler by 'u->sched_name' */
- sched = ip_vs_scheduler_get(u->sched_name);
- if (sched == NULL) {
- IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
- u->sched_name);
- ret = -ENOENT;
- goto out_mod_dec;
- }
-
- svc = kzalloc(sizeof(struct ip_vs_service), GFP_ATOMIC);
- if (svc == NULL) {
- IP_VS_DBG(1, "ip_vs_add_service: kmalloc failed.\n");
- ret = -ENOMEM;
- goto out_err;
- }
-
- /* I'm the first user of the service */
- atomic_set(&svc->usecnt, 1);
- atomic_set(&svc->refcnt, 0);
-
- svc->protocol = u->protocol;
- svc->addr = u->addr;
- svc->port = u->port;
- svc->fwmark = u->fwmark;
- svc->flags = u->flags;
- svc->timeout = u->timeout * HZ;
- svc->netmask = u->netmask;
-
- INIT_LIST_HEAD(&svc->destinations);
- rwlock_init(&svc->sched_lock);
- spin_lock_init(&svc->stats.lock);
-
- /* Bind the scheduler */
- ret = ip_vs_bind_scheduler(svc, sched);
- if (ret)
- goto out_err;
- sched = NULL;
-
- /* Update the virtual service counters */
- if (svc->port == FTPPORT)
- atomic_inc(&ip_vs_ftpsvc_counter);
- else if (svc->port == 0)
- atomic_inc(&ip_vs_nullsvc_counter);
-
- ip_vs_new_estimator(&svc->stats);
- ip_vs_num_services++;
-
- /* Hash the service into the service table */
- write_lock_bh(&__ip_vs_svc_lock);
- ip_vs_svc_hash(svc);
- write_unlock_bh(&__ip_vs_svc_lock);
-
- *svc_p = svc;
- return 0;
-
- out_err:
- if (svc != NULL) {
- if (svc->scheduler)
- ip_vs_unbind_scheduler(svc);
- if (svc->inc) {
- local_bh_disable();
- ip_vs_app_inc_put(svc->inc);
- local_bh_enable();
- }
- kfree(svc);
- }
- ip_vs_scheduler_put(sched);
-
- out_mod_dec:
- /* decrease the module use count */
- ip_vs_use_count_dec();
-
- return ret;
-}
-
-
-/*
- * Edit a service and bind it with a new scheduler
- */
-static int
-ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user *u)
-{
- struct ip_vs_scheduler *sched, *old_sched;
- int ret = 0;
-
- /*
- * Lookup the scheduler, by 'u->sched_name'
- */
- sched = ip_vs_scheduler_get(u->sched_name);
- if (sched == NULL) {
- IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
- u->sched_name);
- return -ENOENT;
- }
- old_sched = sched;
-
- write_lock_bh(&__ip_vs_svc_lock);
-
- /*
- * Wait until all other svc users go away.
- */
- IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
-
- /*
- * Set the flags and timeout value
- */
- svc->flags = u->flags | IP_VS_SVC_F_HASHED;
- svc->timeout = u->timeout * HZ;
- svc->netmask = u->netmask;
-
- old_sched = svc->scheduler;
- if (sched != old_sched) {
- /*
- * Unbind the old scheduler
- */
- if ((ret = ip_vs_unbind_scheduler(svc))) {
- old_sched = sched;
- goto out;
- }
-
- /*
- * Bind the new scheduler
- */
- if ((ret = ip_vs_bind_scheduler(svc, sched))) {
- /*
- * If ip_vs_bind_scheduler fails, restore the old
- * scheduler.
- * The main reason of failure is out of memory.
- *
- * The question is if the old scheduler can be
- * restored all the time. TODO: if it cannot be
- * restored some time, we must delete the service,
- * otherwise the system may crash.
- */
- ip_vs_bind_scheduler(svc, old_sched);
- old_sched = sched;
- goto out;
- }
- }
-
- out:
- write_unlock_bh(&__ip_vs_svc_lock);
-
- if (old_sched)
- ip_vs_scheduler_put(old_sched);
-
- return ret;
-}
-
-
-/*
- * Delete a service from the service list
- * - The service must be unlinked, unlocked and not referenced!
- * - We are called under _bh lock
- */
-static void __ip_vs_del_service(struct ip_vs_service *svc)
-{
- struct ip_vs_dest *dest, *nxt;
- struct ip_vs_scheduler *old_sched;
-
- ip_vs_num_services--;
- ip_vs_kill_estimator(&svc->stats);
-
- /* Unbind scheduler */
- old_sched = svc->scheduler;
- ip_vs_unbind_scheduler(svc);
- if (old_sched)
- ip_vs_scheduler_put(old_sched);
-
- /* Unbind app inc */
- if (svc->inc) {
- ip_vs_app_inc_put(svc->inc);
- svc->inc = NULL;
- }
-
- /*
- * Unlink the whole destination list
- */
- list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
- __ip_vs_unlink_dest(svc, dest, 0);
- __ip_vs_del_dest(dest);
- }
-
- /*
- * Update the virtual service counters
- */
- if (svc->port == FTPPORT)
- atomic_dec(&ip_vs_ftpsvc_counter);
- else if (svc->port == 0)
- atomic_dec(&ip_vs_nullsvc_counter);
-
- /*
- * Free the service if nobody refers to it
- */
- if (atomic_read(&svc->refcnt) == 0)
- kfree(svc);
-
- /* decrease the module use count */
- ip_vs_use_count_dec();
-}
-
-/*
- * Delete a service from the service list
- */
-static int ip_vs_del_service(struct ip_vs_service *svc)
-{
- if (svc == NULL)
- return -EEXIST;
-
- /*
- * Unhash it from the service table
- */
- write_lock_bh(&__ip_vs_svc_lock);
-
- ip_vs_svc_unhash(svc);
-
- /*
- * Wait until all the svc users go away.
- */
- IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
-
- __ip_vs_del_service(svc);
-
- write_unlock_bh(&__ip_vs_svc_lock);
-
- return 0;
-}
-
-
-/*
- * Flush all the virtual services
- */
-static int ip_vs_flush(void)
-{
- int idx;
- struct ip_vs_service *svc, *nxt;
-
- /*
- * Flush the service table hashed by <protocol,addr,port>
- */
- for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
- list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) {
- write_lock_bh(&__ip_vs_svc_lock);
- ip_vs_svc_unhash(svc);
- /*
- * Wait until all the svc users go away.
- */
- IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
- __ip_vs_del_service(svc);
- write_unlock_bh(&__ip_vs_svc_lock);
- }
- }
-
- /*
- * Flush the service table hashed by fwmark
- */
- for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
- list_for_each_entry_safe(svc, nxt,
- &ip_vs_svc_fwm_table[idx], f_list) {
- write_lock_bh(&__ip_vs_svc_lock);
- ip_vs_svc_unhash(svc);
- /*
- * Wait until all the svc users go away.
- */
- IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
- __ip_vs_del_service(svc);
- write_unlock_bh(&__ip_vs_svc_lock);
- }
- }
-
- return 0;
-}
-
-
-/*
- * Zero counters in a service or all services
- */
-static int ip_vs_zero_service(struct ip_vs_service *svc)
-{
- struct ip_vs_dest *dest;
-
- write_lock_bh(&__ip_vs_svc_lock);
- list_for_each_entry(dest, &svc->destinations, n_list) {
- ip_vs_zero_stats(&dest->stats);
- }
- ip_vs_zero_stats(&svc->stats);
- write_unlock_bh(&__ip_vs_svc_lock);
- return 0;
-}
-
-static int ip_vs_zero_all(void)
-{
- int idx;
- struct ip_vs_service *svc;
-
- for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
- list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
- ip_vs_zero_service(svc);
- }
- }
-
- for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
- list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
- ip_vs_zero_service(svc);
- }
- }
-
- ip_vs_zero_stats(&ip_vs_stats);
- return 0;
-}
-
-
-static int
-proc_do_defense_mode(ctl_table *table, int write, struct file * filp,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- int *valp = table->data;
- int val = *valp;
- int rc;
-
- rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
- if (write && (*valp != val)) {
- if ((*valp < 0) || (*valp > 3)) {
- /* Restore the correct value */
- *valp = val;
- } else {
- update_defense_level();
- }
- }
- return rc;
-}
-
-
-static int
-proc_do_sync_threshold(ctl_table *table, int write, struct file *filp,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- int *valp = table->data;
- int val[2];
- int rc;
-
- /* backup the value first */
- memcpy(val, valp, sizeof(val));
-
- rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
- if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) {
- /* Restore the correct value */
- memcpy(valp, val, sizeof(val));
- }
- return rc;
-}
-
-
-/*
- * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
- */
-
-static struct ctl_table vs_vars[] = {
- {
- .ctl_name = NET_IPV4_VS_AMEMTHRESH,
- .procname = "amemthresh",
- .data = &sysctl_ip_vs_amemthresh,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
-#ifdef CONFIG_IP_VS_DEBUG
- {
- .ctl_name = NET_IPV4_VS_DEBUG_LEVEL,
- .procname = "debug_level",
- .data = &sysctl_ip_vs_debug_level,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
-#endif
- {
- .ctl_name = NET_IPV4_VS_AMDROPRATE,
- .procname = "am_droprate",
- .data = &sysctl_ip_vs_am_droprate,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_IPV4_VS_DROP_ENTRY,
- .procname = "drop_entry",
- .data = &sysctl_ip_vs_drop_entry,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_do_defense_mode,
- },
- {
- .ctl_name = NET_IPV4_VS_DROP_PACKET,
- .procname = "drop_packet",
- .data = &sysctl_ip_vs_drop_packet,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_do_defense_mode,
- },
- {
- .ctl_name = NET_IPV4_VS_SECURE_TCP,
- .procname = "secure_tcp",
- .data = &sysctl_ip_vs_secure_tcp,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_do_defense_mode,
- },
-#if 0
- {
- .ctl_name = NET_IPV4_VS_TO_ES,
- .procname = "timeout_established",
- .data = &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED],
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- },
- {
- .ctl_name = NET_IPV4_VS_TO_SS,
- .procname = "timeout_synsent",
- .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT],
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- },
- {
- .ctl_name = NET_IPV4_VS_TO_SR,
- .procname = "timeout_synrecv",
- .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV],
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- },
- {
- .ctl_name = NET_IPV4_VS_TO_FW,
- .procname = "timeout_finwait",
- .data = &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT],
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- },
- {
- .ctl_name = NET_IPV4_VS_TO_TW,
- .procname = "timeout_timewait",
- .data = &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT],
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- },
- {
- .ctl_name = NET_IPV4_VS_TO_CL,
- .procname = "timeout_close",
- .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE],
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- },
- {
- .ctl_name = NET_IPV4_VS_TO_CW,
- .procname = "timeout_closewait",
- .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT],
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- },
- {
- .ctl_name = NET_IPV4_VS_TO_LA,
- .procname = "timeout_lastack",
- .data = &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK],
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- },
- {
- .ctl_name = NET_IPV4_VS_TO_LI,
- .procname = "timeout_listen",
- .data = &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN],
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- },
- {
- .ctl_name = NET_IPV4_VS_TO_SA,
- .procname = "timeout_synack",
- .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK],
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- },
- {
- .ctl_name = NET_IPV4_VS_TO_UDP,
- .procname = "timeout_udp",
- .data = &vs_timeout_table_dos.timeout[IP_VS_S_UDP],
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- },
- {
- .ctl_name = NET_IPV4_VS_TO_ICMP,
- .procname = "timeout_icmp",
- .data = &vs_timeout_table_dos.timeout[IP_VS_S_ICMP],
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- },
-#endif
- {
- .ctl_name = NET_IPV4_VS_CACHE_BYPASS,
- .procname = "cache_bypass",
- .data = &sysctl_ip_vs_cache_bypass,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_IPV4_VS_EXPIRE_NODEST_CONN,
- .procname = "expire_nodest_conn",
- .data = &sysctl_ip_vs_expire_nodest_conn,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_IPV4_VS_EXPIRE_QUIESCENT_TEMPLATE,
- .procname = "expire_quiescent_template",
- .data = &sysctl_ip_vs_expire_quiescent_template,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_IPV4_VS_SYNC_THRESHOLD,
- .procname = "sync_threshold",
- .data = &sysctl_ip_vs_sync_threshold,
- .maxlen = sizeof(sysctl_ip_vs_sync_threshold),
- .mode = 0644,
- .proc_handler = &proc_do_sync_threshold,
- },
- {
- .ctl_name = NET_IPV4_VS_NAT_ICMP_SEND,
- .procname = "nat_icmp_send",
- .data = &sysctl_ip_vs_nat_icmp_send,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- { .ctl_name = 0 }
-};
-
-static ctl_table vs_table[] = {
- {
- .ctl_name = NET_IPV4_VS,
- .procname = "vs",
- .mode = 0555,
- .child = vs_vars
- },
- { .ctl_name = 0 }
-};
-
-static ctl_table ipvs_ipv4_table[] = {
- {
- .ctl_name = NET_IPV4,
- .procname = "ipv4",
- .mode = 0555,
- .child = vs_table,
- },
- { .ctl_name = 0 }
-};
-
-static ctl_table vs_root_table[] = {
- {
- .ctl_name = CTL_NET,
- .procname = "net",
- .mode = 0555,
- .child = ipvs_ipv4_table,
- },
- { .ctl_name = 0 }
-};
-
-static struct ctl_table_header * sysctl_header;
-
-#ifdef CONFIG_PROC_FS
-
-struct ip_vs_iter {
- struct list_head *table;
- int bucket;
-};
-
-/*
- * Write the contents of the VS rule table to a PROCfs file.
- * (It is kept just for backward compatibility)
- */
-static inline const char *ip_vs_fwd_name(unsigned flags)
-{
- switch (flags & IP_VS_CONN_F_FWD_MASK) {
- case IP_VS_CONN_F_LOCALNODE:
- return "Local";
- case IP_VS_CONN_F_TUNNEL:
- return "Tunnel";
- case IP_VS_CONN_F_DROUTE:
- return "Route";
- default:
- return "Masq";
- }
-}
-
-
-/* Get the Nth entry in the two lists */
-static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
-{
- struct ip_vs_iter *iter = seq->private;
- int idx;
- struct ip_vs_service *svc;
-
- /* look in hash by protocol */
- for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
- list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
- if (pos-- == 0){
- iter->table = ip_vs_svc_table;
- iter->bucket = idx;
- return svc;
- }
- }
- }
-
- /* keep looking in fwmark */
- for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
- list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
- if (pos-- == 0) {
- iter->table = ip_vs_svc_fwm_table;
- iter->bucket = idx;
- return svc;
- }
- }
- }
-
- return NULL;
-}
-
-static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
-{
-
- read_lock_bh(&__ip_vs_svc_lock);
- return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN;
-}
-
-
-static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
-{
- struct list_head *e;
- struct ip_vs_iter *iter;
- struct ip_vs_service *svc;
-
- ++*pos;
- if (v == SEQ_START_TOKEN)
- return ip_vs_info_array(seq,0);
-
- svc = v;
- iter = seq->private;
-
- if (iter->table == ip_vs_svc_table) {
- /* next service in table hashed by protocol */
- if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket])
- return list_entry(e, struct ip_vs_service, s_list);
-
-
- while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
- list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket],
- s_list) {
- return svc;
- }
- }
-
- iter->table = ip_vs_svc_fwm_table;
- iter->bucket = -1;
- goto scan_fwmark;
- }
-
- /* next service in hashed by fwmark */
- if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket])
- return list_entry(e, struct ip_vs_service, f_list);
-
- scan_fwmark:
- while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
- list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket],
- f_list)
- return svc;
- }
-
- return NULL;
-}
-
-static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
-{
- read_unlock_bh(&__ip_vs_svc_lock);
-}
-
-
-static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
-{
- if (v == SEQ_START_TOKEN) {
- seq_printf(seq,
- "IP Virtual Server version %d.%d.%d (size=%d)\n",
- NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
- seq_puts(seq,
- "Prot LocalAddress:Port Scheduler Flags\n");
- seq_puts(seq,
- " -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
- } else {
- const struct ip_vs_service *svc = v;
- const struct ip_vs_iter *iter = seq->private;
- const struct ip_vs_dest *dest;
-
- if (iter->table == ip_vs_svc_table)
- seq_printf(seq, "%s %08X:%04X %s ",
- ip_vs_proto_name(svc->protocol),
- ntohl(svc->addr),
- ntohs(svc->port),
- svc->scheduler->name);
- else
- seq_printf(seq, "FWM %08X %s ",
- svc->fwmark, svc->scheduler->name);
-
- if (svc->flags & IP_VS_SVC_F_PERSISTENT)
- seq_printf(seq, "persistent %d %08X\n",
- svc->timeout,
- ntohl(svc->netmask));
- else
- seq_putc(seq, '\n');
-
- list_for_each_entry(dest, &svc->destinations, n_list) {
- seq_printf(seq,
- " -> %08X:%04X %-7s %-6d %-10d %-10d\n",
- ntohl(dest->addr), ntohs(dest->port),
- ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
- atomic_read(&dest->weight),
- atomic_read(&dest->activeconns),
- atomic_read(&dest->inactconns));
- }
- }
- return 0;
-}
-
-static struct seq_operations ip_vs_info_seq_ops = {
- .start = ip_vs_info_seq_start,
- .next = ip_vs_info_seq_next,
- .stop = ip_vs_info_seq_stop,
- .show = ip_vs_info_seq_show,
-};
-
-static int ip_vs_info_open(struct inode *inode, struct file *file)
-{
- struct seq_file *seq;
- int rc = -ENOMEM;
- struct ip_vs_iter *s = kzalloc(sizeof(*s), GFP_KERNEL);
-
- if (!s)
- goto out;
-
- rc = seq_open(file, &ip_vs_info_seq_ops);
- if (rc)
- goto out_kfree;
-
- seq = file->private_data;
- seq->private = s;
-out:
- return rc;
-out_kfree:
- kfree(s);
- goto out;
-}
-
-static struct file_operations ip_vs_info_fops = {
- .owner = THIS_MODULE,
- .open = ip_vs_info_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release_private,
-};
-
-#endif
-
-struct ip_vs_stats ip_vs_stats;
-
-#ifdef CONFIG_PROC_FS
-static int ip_vs_stats_show(struct seq_file *seq, void *v)
-{
-
-/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
- seq_puts(seq,
- " Total Incoming Outgoing Incoming Outgoing\n");
- seq_printf(seq,
- " Conns Packets Packets Bytes Bytes\n");
-
- spin_lock_bh(&ip_vs_stats.lock);
- seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.conns,
- ip_vs_stats.inpkts, ip_vs_stats.outpkts,
- (unsigned long long) ip_vs_stats.inbytes,
- (unsigned long long) ip_vs_stats.outbytes);
-
-/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
- seq_puts(seq,
- " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n");
- seq_printf(seq,"%8X %8X %8X %16X %16X\n",
- ip_vs_stats.cps,
- ip_vs_stats.inpps,
- ip_vs_stats.outpps,
- ip_vs_stats.inbps,
- ip_vs_stats.outbps);
- spin_unlock_bh(&ip_vs_stats.lock);
-
- return 0;
-}
-
-static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
-{
- return single_open(file, ip_vs_stats_show, NULL);
-}
-
-static struct file_operations ip_vs_stats_fops = {
- .owner = THIS_MODULE,
- .open = ip_vs_stats_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
-#endif
-
-/*
- * Set timeout values for tcp tcpfin udp in the timeout_table.
- */
-static int ip_vs_set_timeout(struct ip_vs_timeout_user *u)
-{
- IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
- u->tcp_timeout,
- u->tcp_fin_timeout,
- u->udp_timeout);
-
-#ifdef CONFIG_IP_VS_PROTO_TCP
- if (u->tcp_timeout) {
- ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED]
- = u->tcp_timeout * HZ;
- }
-
- if (u->tcp_fin_timeout) {
- ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT]
- = u->tcp_fin_timeout * HZ;
- }
-#endif
-
-#ifdef CONFIG_IP_VS_PROTO_UDP
- if (u->udp_timeout) {
- ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL]
- = u->udp_timeout * HZ;
- }
-#endif
- return 0;
-}
-
-
-#define SET_CMDID(cmd) (cmd - IP_VS_BASE_CTL)
-#define SERVICE_ARG_LEN (sizeof(struct ip_vs_service_user))
-#define SVCDEST_ARG_LEN (sizeof(struct ip_vs_service_user) + \
- sizeof(struct ip_vs_dest_user))
-#define TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user))
-#define DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user))
-#define MAX_ARG_LEN SVCDEST_ARG_LEN
-
-static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
- [SET_CMDID(IP_VS_SO_SET_ADD)] = SERVICE_ARG_LEN,
- [SET_CMDID(IP_VS_SO_SET_EDIT)] = SERVICE_ARG_LEN,
- [SET_CMDID(IP_VS_SO_SET_DEL)] = SERVICE_ARG_LEN,
- [SET_CMDID(IP_VS_SO_SET_FLUSH)] = 0,
- [SET_CMDID(IP_VS_SO_SET_ADDDEST)] = SVCDEST_ARG_LEN,
- [SET_CMDID(IP_VS_SO_SET_DELDEST)] = SVCDEST_ARG_LEN,
- [SET_CMDID(IP_VS_SO_SET_EDITDEST)] = SVCDEST_ARG_LEN,
- [SET_CMDID(IP_VS_SO_SET_TIMEOUT)] = TIMEOUT_ARG_LEN,
- [SET_CMDID(IP_VS_SO_SET_STARTDAEMON)] = DAEMON_ARG_LEN,
- [SET_CMDID(IP_VS_SO_SET_STOPDAEMON)] = DAEMON_ARG_LEN,
- [SET_CMDID(IP_VS_SO_SET_ZERO)] = SERVICE_ARG_LEN,
-};
-
-static int
-do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
-{
- int ret;
- unsigned char arg[MAX_ARG_LEN];
- struct ip_vs_service_user *usvc;
- struct ip_vs_service *svc;
- struct ip_vs_dest_user *udest;
-
- if (!capable(CAP_NET_ADMIN))
- return -EPERM;
-
- if (len != set_arglen[SET_CMDID(cmd)]) {
- IP_VS_ERR("set_ctl: len %u != %u\n",
- len, set_arglen[SET_CMDID(cmd)]);
- return -EINVAL;
- }
-
- if (copy_from_user(arg, user, len) != 0)
- return -EFAULT;
-
- /* increase the module use count */
- ip_vs_use_count_inc();
-
- if (mutex_lock_interruptible(&__ip_vs_mutex)) {
- ret = -ERESTARTSYS;
- goto out_dec;
- }
-
- if (cmd == IP_VS_SO_SET_FLUSH) {
- /* Flush the virtual service */
- ret = ip_vs_flush();
- goto out_unlock;
- } else if (cmd == IP_VS_SO_SET_TIMEOUT) {
- /* Set timeout values for (tcp tcpfin udp) */
- ret = ip_vs_set_timeout((struct ip_vs_timeout_user *)arg);
- goto out_unlock;
- } else if (cmd == IP_VS_SO_SET_STARTDAEMON) {
- struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
- ret = start_sync_thread(dm->state, dm->mcast_ifn, dm->syncid);
- goto out_unlock;
- } else if (cmd == IP_VS_SO_SET_STOPDAEMON) {
- struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
- ret = stop_sync_thread(dm->state);
- goto out_unlock;
- }
-
- usvc = (struct ip_vs_service_user *)arg;
- udest = (struct ip_vs_dest_user *)(usvc + 1);
-
- if (cmd == IP_VS_SO_SET_ZERO) {
- /* if no service address is set, zero counters in all */
- if (!usvc->fwmark && !usvc->addr && !usvc->port) {
- ret = ip_vs_zero_all();
- goto out_unlock;
- }
- }
-
- /* Check for valid protocol: TCP or UDP, even for fwmark!=0 */
- if (usvc->protocol!=IPPROTO_TCP && usvc->protocol!=IPPROTO_UDP) {
- IP_VS_ERR("set_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s\n",
- usvc->protocol, NIPQUAD(usvc->addr),
- ntohs(usvc->port), usvc->sched_name);
- ret = -EFAULT;
- goto out_unlock;
- }
-
- /* Lookup the exact service by <protocol, addr, port> or fwmark */
- if (usvc->fwmark == 0)
- svc = __ip_vs_service_get(usvc->protocol,
- usvc->addr, usvc->port);
- else
- svc = __ip_vs_svc_fwm_get(usvc->fwmark);
-
- if (cmd != IP_VS_SO_SET_ADD
- && (svc == NULL || svc->protocol != usvc->protocol)) {
- ret = -ESRCH;
- goto out_unlock;
- }
-
- switch (cmd) {
- case IP_VS_SO_SET_ADD:
- if (svc != NULL)
- ret = -EEXIST;
- else
- ret = ip_vs_add_service(usvc, &svc);
- break;
- case IP_VS_SO_SET_EDIT:
- ret = ip_vs_edit_service(svc, usvc);
- break;
- case IP_VS_SO_SET_DEL:
- ret = ip_vs_del_service(svc);
- if (!ret)
- goto out_unlock;
- break;
- case IP_VS_SO_SET_ZERO:
- ret = ip_vs_zero_service(svc);
- break;
- case IP_VS_SO_SET_ADDDEST:
- ret = ip_vs_add_dest(svc, udest);
- break;
- case IP_VS_SO_SET_EDITDEST:
- ret = ip_vs_edit_dest(svc, udest);
- break;
- case IP_VS_SO_SET_DELDEST:
- ret = ip_vs_del_dest(svc, udest);
- break;
- default:
- ret = -EINVAL;
- }
-
- if (svc)
- ip_vs_service_put(svc);
-
- out_unlock:
- mutex_unlock(&__ip_vs_mutex);
- out_dec:
- /* decrease the module use count */
- ip_vs_use_count_dec();
-
- return ret;
-}
-
-
-static void
-ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src)
-{
- spin_lock_bh(&src->lock);
- memcpy(dst, src, (char*)&src->lock - (char*)src);
- spin_unlock_bh(&src->lock);
-}
-
-static void
-ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
-{
- dst->protocol = src->protocol;
- dst->addr = src->addr;
- dst->port = src->port;
- dst->fwmark = src->fwmark;
- strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name));
- dst->flags = src->flags;
- dst->timeout = src->timeout / HZ;
- dst->netmask = src->netmask;
- dst->num_dests = src->num_dests;
- ip_vs_copy_stats(&dst->stats, &src->stats);
-}
-
-static inline int
-__ip_vs_get_service_entries(const struct ip_vs_get_services *get,
- struct ip_vs_get_services __user *uptr)
-{
- int idx, count=0;
- struct ip_vs_service *svc;
- struct ip_vs_service_entry entry;
- int ret = 0;
-
- for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
- list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
- if (count >= get->num_services)
- goto out;
- memset(&entry, 0, sizeof(entry));
- ip_vs_copy_service(&entry, svc);
- if (copy_to_user(&uptr->entrytable[count],
- &entry, sizeof(entry))) {
- ret = -EFAULT;
- goto out;
- }
- count++;
- }
- }
-
- for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
- list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
- if (count >= get->num_services)
- goto out;
- memset(&entry, 0, sizeof(entry));
- ip_vs_copy_service(&entry, svc);
- if (copy_to_user(&uptr->entrytable[count],
- &entry, sizeof(entry))) {
- ret = -EFAULT;
- goto out;
- }
- count++;
- }
- }
- out:
- return ret;
-}
-
-static inline int
-__ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
- struct ip_vs_get_dests __user *uptr)
-{
- struct ip_vs_service *svc;
- int ret = 0;
-
- if (get->fwmark)
- svc = __ip_vs_svc_fwm_get(get->fwmark);
- else
- svc = __ip_vs_service_get(get->protocol,
- get->addr, get->port);
- if (svc) {
- int count = 0;
- struct ip_vs_dest *dest;
- struct ip_vs_dest_entry entry;
-
- list_for_each_entry(dest, &svc->destinations, n_list) {
- if (count >= get->num_dests)
- break;
-
- entry.addr = dest->addr;
- entry.port = dest->port;
- entry.conn_flags = atomic_read(&dest->conn_flags);
- entry.weight = atomic_read(&dest->weight);
- entry.u_threshold = dest->u_threshold;
- entry.l_threshold = dest->l_threshold;
- entry.activeconns = atomic_read(&dest->activeconns);
- entry.inactconns = atomic_read(&dest->inactconns);
- entry.persistconns = atomic_read(&dest->persistconns);
- ip_vs_copy_stats(&entry.stats, &dest->stats);
- if (copy_to_user(&uptr->entrytable[count],
- &entry, sizeof(entry))) {
- ret = -EFAULT;
- break;
- }
- count++;
- }
- ip_vs_service_put(svc);
- } else
- ret = -ESRCH;
- return ret;
-}
-
-static inline void
-__ip_vs_get_timeouts(struct ip_vs_timeout_user *u)
-{
-#ifdef CONFIG_IP_VS_PROTO_TCP
- u->tcp_timeout =
- ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
- u->tcp_fin_timeout =
- ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
-#endif
-#ifdef CONFIG_IP_VS_PROTO_UDP
- u->udp_timeout =
- ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
-#endif
-}
-
-
-#define GET_CMDID(cmd) (cmd - IP_VS_BASE_CTL)
-#define GET_INFO_ARG_LEN (sizeof(struct ip_vs_getinfo))
-#define GET_SERVICES_ARG_LEN (sizeof(struct ip_vs_get_services))
-#define GET_SERVICE_ARG_LEN (sizeof(struct ip_vs_service_entry))
-#define GET_DESTS_ARG_LEN (sizeof(struct ip_vs_get_dests))
-#define GET_TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user))
-#define GET_DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user) * 2)
-
-static const unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = {
- [GET_CMDID(IP_VS_SO_GET_VERSION)] = 64,
- [GET_CMDID(IP_VS_SO_GET_INFO)] = GET_INFO_ARG_LEN,
- [GET_CMDID(IP_VS_SO_GET_SERVICES)] = GET_SERVICES_ARG_LEN,
- [GET_CMDID(IP_VS_SO_GET_SERVICE)] = GET_SERVICE_ARG_LEN,
- [GET_CMDID(IP_VS_SO_GET_DESTS)] = GET_DESTS_ARG_LEN,
- [GET_CMDID(IP_VS_SO_GET_TIMEOUT)] = GET_TIMEOUT_ARG_LEN,
- [GET_CMDID(IP_VS_SO_GET_DAEMON)] = GET_DAEMON_ARG_LEN,
-};
-
-static int
-do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
-{
- unsigned char arg[128];
- int ret = 0;
-
- if (!capable(CAP_NET_ADMIN))
- return -EPERM;
-
- if (*len < get_arglen[GET_CMDID(cmd)]) {
- IP_VS_ERR("get_ctl: len %u < %u\n",
- *len, get_arglen[GET_CMDID(cmd)]);
- return -EINVAL;
- }
-
- if (copy_from_user(arg, user, get_arglen[GET_CMDID(cmd)]) != 0)
- return -EFAULT;
-
- if (mutex_lock_interruptible(&__ip_vs_mutex))
- return -ERESTARTSYS;
-
- switch (cmd) {
- case IP_VS_SO_GET_VERSION:
- {
- char buf[64];
-
- sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
- NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
- if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
- ret = -EFAULT;
- goto out;
- }
- *len = strlen(buf)+1;
- }
- break;
-
- case IP_VS_SO_GET_INFO:
- {
- struct ip_vs_getinfo info;
- info.version = IP_VS_VERSION_CODE;
- info.size = IP_VS_CONN_TAB_SIZE;
- info.num_services = ip_vs_num_services;
- if (copy_to_user(user, &info, sizeof(info)) != 0)
- ret = -EFAULT;
- }
- break;
-
- case IP_VS_SO_GET_SERVICES:
- {
- struct ip_vs_get_services *get;
- int size;
-
- get = (struct ip_vs_get_services *)arg;
- size = sizeof(*get) +
- sizeof(struct ip_vs_service_entry) * get->num_services;
- if (*len != size) {
- IP_VS_ERR("length: %u != %u\n", *len, size);
- ret = -EINVAL;
- goto out;
- }
- ret = __ip_vs_get_service_entries(get, user);
- }
- break;
-
- case IP_VS_SO_GET_SERVICE:
- {
- struct ip_vs_service_entry *entry;
- struct ip_vs_service *svc;
-
- entry = (struct ip_vs_service_entry *)arg;
- if (entry->fwmark)
- svc = __ip_vs_svc_fwm_get(entry->fwmark);
- else
- svc = __ip_vs_service_get(entry->protocol,
- entry->addr, entry->port);
- if (svc) {
- ip_vs_copy_service(entry, svc);
- if (copy_to_user(user, entry, sizeof(*entry)) != 0)
- ret = -EFAULT;
- ip_vs_service_put(svc);
- } else
- ret = -ESRCH;
- }
- break;
-
- case IP_VS_SO_GET_DESTS:
- {
- struct ip_vs_get_dests *get;
- int size;
-
- get = (struct ip_vs_get_dests *)arg;
- size = sizeof(*get) +
- sizeof(struct ip_vs_dest_entry) * get->num_dests;
- if (*len != size) {
- IP_VS_ERR("length: %u != %u\n", *len, size);
- ret = -EINVAL;
- goto out;
- }
- ret = __ip_vs_get_dest_entries(get, user);
- }
- break;
-
- case IP_VS_SO_GET_TIMEOUT:
- {
- struct ip_vs_timeout_user t;
-
- __ip_vs_get_timeouts(&t);
- if (copy_to_user(user, &t, sizeof(t)) != 0)
- ret = -EFAULT;
- }
- break;
-
- case IP_VS_SO_GET_DAEMON:
- {
- struct ip_vs_daemon_user d[2];
-
- memset(&d, 0, sizeof(d));
- if (ip_vs_sync_state & IP_VS_STATE_MASTER) {
- d[0].state = IP_VS_STATE_MASTER;
- strlcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn, sizeof(d[0].mcast_ifn));
- d[0].syncid = ip_vs_master_syncid;
- }
- if (ip_vs_sync_state & IP_VS_STATE_BACKUP) {
- d[1].state = IP_VS_STATE_BACKUP;
- strlcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn, sizeof(d[1].mcast_ifn));
- d[1].syncid = ip_vs_backup_syncid;
- }
- if (copy_to_user(user, &d, sizeof(d)) != 0)
- ret = -EFAULT;
- }
- break;
-
- default:
- ret = -EINVAL;
- }
-
- out:
- mutex_unlock(&__ip_vs_mutex);
- return ret;
-}
-
-
-static struct nf_sockopt_ops ip_vs_sockopts = {
- .pf = PF_INET,
- .set_optmin = IP_VS_BASE_CTL,
- .set_optmax = IP_VS_SO_SET_MAX+1,
- .set = do_ip_vs_set_ctl,
- .get_optmin = IP_VS_BASE_CTL,
- .get_optmax = IP_VS_SO_GET_MAX+1,
- .get = do_ip_vs_get_ctl,
-};
-
-
-int ip_vs_control_init(void)
-{
- int ret;
- int idx;
-
- EnterFunction(2);
-
- ret = nf_register_sockopt(&ip_vs_sockopts);
- if (ret) {
- IP_VS_ERR("cannot register sockopt.\n");
- return ret;
- }
-
- proc_net_fops_create("ip_vs", 0, &ip_vs_info_fops);
- proc_net_fops_create("ip_vs_stats",0, &ip_vs_stats_fops);
-
- sysctl_header = register_sysctl_table(vs_root_table, 0);
-
- /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */
- for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
- INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
- INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
- }
- for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++) {
- INIT_LIST_HEAD(&ip_vs_rtable[idx]);
- }
-
- memset(&ip_vs_stats, 0, sizeof(ip_vs_stats));
- spin_lock_init(&ip_vs_stats.lock);
- ip_vs_new_estimator(&ip_vs_stats);
-
- /* Hook the defense timer */
- schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
-
- LeaveFunction(2);
- return 0;
-}
-
-
-void ip_vs_control_cleanup(void)
-{
- EnterFunction(2);
- ip_vs_trash_cleanup();
- cancel_rearming_delayed_work(&defense_work);
- ip_vs_kill_estimator(&ip_vs_stats);
- unregister_sysctl_table(sysctl_header);
- proc_net_remove("ip_vs_stats");
- proc_net_remove("ip_vs");
- nf_unregister_sockopt(&ip_vs_sockopts);
- LeaveFunction(2);
-}
diff --git a/net/ipv4/ipvs/ip_vs_est.c b/net/ipv4/ipvs/ip_vs_est.c
deleted file mode 100644
index 7d68b80c4c19..000000000000
--- a/net/ipv4/ipvs/ip_vs_est.c
+++ /dev/null
@@ -1,202 +0,0 @@
-/*
- * ip_vs_est.c: simple rate estimator for IPVS
- *
- * Version: $Id: ip_vs_est.c,v 1.4 2002/11/30 01:50:35 wensong Exp $
- *
- * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * Changes:
- *
- */
-#include <linux/kernel.h>
-#include <linux/jiffies.h>
-#include <linux/slab.h>
-#include <linux/types.h>
-#include <linux/interrupt.h>
-
-#include <net/ip_vs.h>
-
-/*
- This code is to estimate rate in a shorter interval (such as 8
- seconds) for virtual services and real servers. For measure rate in a
- long interval, it is easy to implement a user level daemon which
- periodically reads those statistical counters and measure rate.
-
- Currently, the measurement is activated by slow timer handler. Hope
- this measurement will not introduce too much load.
-
- We measure rate during the last 8 seconds every 2 seconds:
-
- avgrate = avgrate*(1-W) + rate*W
-
- where W = 2^(-2)
-
- NOTES.
-
- * The stored value for average bps is scaled by 2^5, so that maximal
- rate is ~2.15Gbits/s, average pps and cps are scaled by 2^10.
-
- * A lot code is taken from net/sched/estimator.c
- */
-
-
-struct ip_vs_estimator
-{
- struct ip_vs_estimator *next;
- struct ip_vs_stats *stats;
-
- u32 last_conns;
- u32 last_inpkts;
- u32 last_outpkts;
- u64 last_inbytes;
- u64 last_outbytes;
-
- u32 cps;
- u32 inpps;
- u32 outpps;
- u32 inbps;
- u32 outbps;
-};
-
-
-static struct ip_vs_estimator *est_list = NULL;
-static DEFINE_RWLOCK(est_lock);
-static struct timer_list est_timer;
-
-static void estimation_timer(unsigned long arg)
-{
- struct ip_vs_estimator *e;
- struct ip_vs_stats *s;
- u32 n_conns;
- u32 n_inpkts, n_outpkts;
- u64 n_inbytes, n_outbytes;
- u32 rate;
-
- read_lock(&est_lock);
- for (e = est_list; e; e = e->next) {
- s = e->stats;
-
- spin_lock(&s->lock);
- n_conns = s->conns;
- n_inpkts = s->inpkts;
- n_outpkts = s->outpkts;
- n_inbytes = s->inbytes;
- n_outbytes = s->outbytes;
-
- /* scaled by 2^10, but divided 2 seconds */
- rate = (n_conns - e->last_conns)<<9;
- e->last_conns = n_conns;
- e->cps += ((long)rate - (long)e->cps)>>2;
- s->cps = (e->cps+0x1FF)>>10;
-
- rate = (n_inpkts - e->last_inpkts)<<9;
- e->last_inpkts = n_inpkts;
- e->inpps += ((long)rate - (long)e->inpps)>>2;
- s->inpps = (e->inpps+0x1FF)>>10;
-
- rate = (n_outpkts - e->last_outpkts)<<9;
- e->last_outpkts = n_outpkts;
- e->outpps += ((long)rate - (long)e->outpps)>>2;
- s->outpps = (e->outpps+0x1FF)>>10;
-
- rate = (n_inbytes - e->last_inbytes)<<4;
- e->last_inbytes = n_inbytes;
- e->inbps += ((long)rate - (long)e->inbps)>>2;
- s->inbps = (e->inbps+0xF)>>5;
-
- rate = (n_outbytes - e->last_outbytes)<<4;
- e->last_outbytes = n_outbytes;
- e->outbps += ((long)rate - (long)e->outbps)>>2;
- s->outbps = (e->outbps+0xF)>>5;
- spin_unlock(&s->lock);
- }
- read_unlock(&est_lock);
- mod_timer(&est_timer, jiffies + 2*HZ);
-}
-
-int ip_vs_new_estimator(struct ip_vs_stats *stats)
-{
- struct ip_vs_estimator *est;
-
- est = kzalloc(sizeof(*est), GFP_KERNEL);
- if (est == NULL)
- return -ENOMEM;
-
- est->stats = stats;
- est->last_conns = stats->conns;
- est->cps = stats->cps<<10;
-
- est->last_inpkts = stats->inpkts;
- est->inpps = stats->inpps<<10;
-
- est->last_outpkts = stats->outpkts;
- est->outpps = stats->outpps<<10;
-
- est->last_inbytes = stats->inbytes;
- est->inbps = stats->inbps<<5;
-
- est->last_outbytes = stats->outbytes;
- est->outbps = stats->outbps<<5;
-
- write_lock_bh(&est_lock);
- est->next = est_list;
- if (est->next == NULL) {
- init_timer(&est_timer);
- est_timer.expires = jiffies + 2*HZ;
- est_timer.function = estimation_timer;
- add_timer(&est_timer);
- }
- est_list = est;
- write_unlock_bh(&est_lock);
- return 0;
-}
-
-void ip_vs_kill_estimator(struct ip_vs_stats *stats)
-{
- struct ip_vs_estimator *est, **pest;
- int killed = 0;
-
- write_lock_bh(&est_lock);
- pest = &est_list;
- while ((est=*pest) != NULL) {
- if (est->stats != stats) {
- pest = &est->next;
- continue;
- }
- *pest = est->next;
- kfree(est);
- killed++;
- }
- if (killed && est_list == NULL)
- del_timer_sync(&est_timer);
- write_unlock_bh(&est_lock);
-}
-
-void ip_vs_zero_estimator(struct ip_vs_stats *stats)
-{
- struct ip_vs_estimator *e;
-
- write_lock_bh(&est_lock);
- for (e = est_list; e; e = e->next) {
- if (e->stats != stats)
- continue;
-
- /* set counters zero */
- e->last_conns = 0;
- e->last_inpkts = 0;
- e->last_outpkts = 0;
- e->last_inbytes = 0;
- e->last_outbytes = 0;
- e->cps = 0;
- e->inpps = 0;
- e->outpps = 0;
- e->inbps = 0;
- e->outbps = 0;
- }
- write_unlock_bh(&est_lock);
-}
diff --git a/net/ipv4/ipvs/ip_vs_ftp.c b/net/ipv4/ipvs/ip_vs_ftp.c
deleted file mode 100644
index 687c1de1146f..000000000000
--- a/net/ipv4/ipvs/ip_vs_ftp.c
+++ /dev/null
@@ -1,394 +0,0 @@
-/*
- * ip_vs_ftp.c: IPVS ftp application module
- *
- * Version: $Id: ip_vs_ftp.c,v 1.13 2002/09/15 08:14:08 wensong Exp $
- *
- * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
- *
- * Changes:
- *
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * Most code here is taken from ip_masq_ftp.c in kernel 2.2. The difference
- * is that ip_vs_ftp module handles the reverse direction to ip_masq_ftp.
- *
- * IP_MASQ_FTP ftp masquerading module
- *
- * Version: @(#)ip_masq_ftp.c 0.04 02/05/96
- *
- * Author: Wouter Gadeyne
- *
- */
-
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/kernel.h>
-#include <linux/skbuff.h>
-#include <linux/in.h>
-#include <linux/ip.h>
-#include <net/protocol.h>
-#include <net/tcp.h>
-#include <asm/unaligned.h>
-
-#include <net/ip_vs.h>
-
-
-#define SERVER_STRING "227 Entering Passive Mode ("
-#define CLIENT_STRING "PORT "
-
-
-/*
- * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper
- * First port is set to the default port.
- */
-static unsigned short ports[IP_VS_APP_MAX_PORTS] = {21, 0};
-module_param_array(ports, ushort, NULL, 0);
-MODULE_PARM_DESC(ports, "Ports to monitor for FTP control commands");
-
-
-/* Dummy variable */
-static int ip_vs_ftp_pasv;
-
-
-static int
-ip_vs_ftp_init_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
-{
- return 0;
-}
-
-
-static int
-ip_vs_ftp_done_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
-{
- return 0;
-}
-
-
-/*
- * Get <addr,port> from the string "xxx.xxx.xxx.xxx,ppp,ppp", started
- * with the "pattern" and terminated with the "term" character.
- * <addr,port> is in network order.
- */
-static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
- const char *pattern, size_t plen, char term,
- __be32 *addr, __be16 *port,
- char **start, char **end)
-{
- unsigned char p[6];
- int i = 0;
-
- if (data_limit - data < plen) {
- /* check if there is partial match */
- if (strnicmp(data, pattern, data_limit - data) == 0)
- return -1;
- else
- return 0;
- }
-
- if (strnicmp(data, pattern, plen) != 0) {
- return 0;
- }
- *start = data + plen;
-
- for (data = *start; *data != term; data++) {
- if (data == data_limit)
- return -1;
- }
- *end = data;
-
- memset(p, 0, sizeof(p));
- for (data = *start; data != *end; data++) {
- if (*data >= '0' && *data <= '9') {
- p[i] = p[i]*10 + *data - '0';
- } else if (*data == ',' && i < 5) {
- i++;
- } else {
- /* unexpected character */
- return -1;
- }
- }
-
- if (i != 5)
- return -1;
-
- *addr = get_unaligned((__be32 *)p);
- *port = get_unaligned((__be16 *)(p + 4));
- return 1;
-}
-
-
-/*
- * Look at outgoing ftp packets to catch the response to a PASV command
- * from the server (inside-to-outside).
- * When we see one, we build a connection entry with the client address,
- * client port 0 (unknown at the moment), the server address and the
- * server port. Mark the current connection entry as a control channel
- * of the new entry. All this work is just to make the data connection
- * can be scheduled to the right server later.
- *
- * The outgoing packet should be something like
- * "227 Entering Passive Mode (xxx,xxx,xxx,xxx,ppp,ppp)".
- * xxx,xxx,xxx,xxx is the server address, ppp,ppp is the server port number.
- */
-static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
- struct sk_buff **pskb, int *diff)
-{
- struct iphdr *iph;
- struct tcphdr *th;
- char *data, *data_limit;
- char *start, *end;
- __be32 from;
- __be16 port;
- struct ip_vs_conn *n_cp;
- char buf[24]; /* xxx.xxx.xxx.xxx,ppp,ppp\000 */
- unsigned buf_len;
- int ret;
-
- *diff = 0;
-
- /* Only useful for established sessions */
- if (cp->state != IP_VS_TCP_S_ESTABLISHED)
- return 1;
-
- /* Linear packets are much easier to deal with. */
- if (!ip_vs_make_skb_writable(pskb, (*pskb)->len))
- return 0;
-
- if (cp->app_data == &ip_vs_ftp_pasv) {
- iph = (*pskb)->nh.iph;
- th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
- data = (char *)th + (th->doff << 2);
- data_limit = (*pskb)->tail;
-
- if (ip_vs_ftp_get_addrport(data, data_limit,
- SERVER_STRING,
- sizeof(SERVER_STRING)-1, ')',
- &from, &port,
- &start, &end) != 1)
- return 1;
-
- IP_VS_DBG(7, "PASV response (%u.%u.%u.%u:%d) -> "
- "%u.%u.%u.%u:%d detected\n",
- NIPQUAD(from), ntohs(port), NIPQUAD(cp->caddr), 0);
-
- /*
- * Now update or create an connection entry for it
- */
- n_cp = ip_vs_conn_out_get(iph->protocol, from, port,
- cp->caddr, 0);
- if (!n_cp) {
- n_cp = ip_vs_conn_new(IPPROTO_TCP,
- cp->caddr, 0,
- cp->vaddr, port,
- from, port,
- IP_VS_CONN_F_NO_CPORT,
- cp->dest);
- if (!n_cp)
- return 0;
-
- /* add its controller */
- ip_vs_control_add(n_cp, cp);
- }
-
- /*
- * Replace the old passive address with the new one
- */
- from = n_cp->vaddr;
- port = n_cp->vport;
- sprintf(buf,"%d,%d,%d,%d,%d,%d", NIPQUAD(from),
- (ntohs(port)>>8)&255, ntohs(port)&255);
- buf_len = strlen(buf);
-
- /*
- * Calculate required delta-offset to keep TCP happy
- */
- *diff = buf_len - (end-start);
-
- if (*diff == 0) {
- /* simply replace it with new passive address */
- memcpy(start, buf, buf_len);
- ret = 1;
- } else {
- ret = !ip_vs_skb_replace(*pskb, GFP_ATOMIC, start,
- end-start, buf, buf_len);
- }
-
- cp->app_data = NULL;
- ip_vs_tcp_conn_listen(n_cp);
- ip_vs_conn_put(n_cp);
- return ret;
- }
- return 1;
-}
-
-
-/*
- * Look at incoming ftp packets to catch the PASV/PORT command
- * (outside-to-inside).
- *
- * The incoming packet having the PORT command should be something like
- * "PORT xxx,xxx,xxx,xxx,ppp,ppp\n".
- * xxx,xxx,xxx,xxx is the client address, ppp,ppp is the client port number.
- * In this case, we create a connection entry using the client address and
- * port, so that the active ftp data connection from the server can reach
- * the client.
- */
-static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
- struct sk_buff **pskb, int *diff)
-{
- struct iphdr *iph;
- struct tcphdr *th;
- char *data, *data_start, *data_limit;
- char *start, *end;
- __be32 to;
- __be16 port;
- struct ip_vs_conn *n_cp;
-
- /* no diff required for incoming packets */
- *diff = 0;
-
- /* Only useful for established sessions */
- if (cp->state != IP_VS_TCP_S_ESTABLISHED)
- return 1;
-
- /* Linear packets are much easier to deal with. */
- if (!ip_vs_make_skb_writable(pskb, (*pskb)->len))
- return 0;
-
- /*
- * Detecting whether it is passive
- */
- iph = (*pskb)->nh.iph;
- th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
-
- /* Since there may be OPTIONS in the TCP packet and the HLEN is
- the length of the header in 32-bit multiples, it is accurate
- to calculate data address by th+HLEN*4 */
- data = data_start = (char *)th + (th->doff << 2);
- data_limit = (*pskb)->tail;
-
- while (data <= data_limit - 6) {
- if (strnicmp(data, "PASV\r\n", 6) == 0) {
- /* Passive mode on */
- IP_VS_DBG(7, "got PASV at %td of %td\n",
- data - data_start,
- data_limit - data_start);
- cp->app_data = &ip_vs_ftp_pasv;
- return 1;
- }
- data++;
- }
-
- /*
- * To support virtual FTP server, the scenerio is as follows:
- * FTP client ----> Load Balancer ----> FTP server
- * First detect the port number in the application data,
- * then create a new connection entry for the coming data
- * connection.
- */
- if (ip_vs_ftp_get_addrport(data_start, data_limit,
- CLIENT_STRING, sizeof(CLIENT_STRING)-1,
- '\r', &to, &port,
- &start, &end) != 1)
- return 1;
-
- IP_VS_DBG(7, "PORT %u.%u.%u.%u:%d detected\n",
- NIPQUAD(to), ntohs(port));
-
- /* Passive mode off */
- cp->app_data = NULL;
-
- /*
- * Now update or create a connection entry for it
- */
- IP_VS_DBG(7, "protocol %s %u.%u.%u.%u:%d %u.%u.%u.%u:%d\n",
- ip_vs_proto_name(iph->protocol),
- NIPQUAD(to), ntohs(port), NIPQUAD(cp->vaddr), 0);
-
- n_cp = ip_vs_conn_in_get(iph->protocol,
- to, port,
- cp->vaddr, htons(ntohs(cp->vport)-1));
- if (!n_cp) {
- n_cp = ip_vs_conn_new(IPPROTO_TCP,
- to, port,
- cp->vaddr, htons(ntohs(cp->vport)-1),
- cp->daddr, htons(ntohs(cp->dport)-1),
- 0,
- cp->dest);
- if (!n_cp)
- return 0;
-
- /* add its controller */
- ip_vs_control_add(n_cp, cp);
- }
-
- /*
- * Move tunnel to listen state
- */
- ip_vs_tcp_conn_listen(n_cp);
- ip_vs_conn_put(n_cp);
-
- return 1;
-}
-
-
-static struct ip_vs_app ip_vs_ftp = {
- .name = "ftp",
- .type = IP_VS_APP_TYPE_FTP,
- .protocol = IPPROTO_TCP,
- .module = THIS_MODULE,
- .incs_list = LIST_HEAD_INIT(ip_vs_ftp.incs_list),
- .init_conn = ip_vs_ftp_init_conn,
- .done_conn = ip_vs_ftp_done_conn,
- .bind_conn = NULL,
- .unbind_conn = NULL,
- .pkt_out = ip_vs_ftp_out,
- .pkt_in = ip_vs_ftp_in,
-};
-
-
-/*
- * ip_vs_ftp initialization
- */
-static int __init ip_vs_ftp_init(void)
-{
- int i, ret;
- struct ip_vs_app *app = &ip_vs_ftp;
-
- ret = register_ip_vs_app(app);
- if (ret)
- return ret;
-
- for (i=0; i<IP_VS_APP_MAX_PORTS; i++) {
- if (!ports[i])
- continue;
- ret = register_ip_vs_app_inc(app, app->protocol, ports[i]);
- if (ret)
- break;
- IP_VS_INFO("%s: loaded support on port[%d] = %d\n",
- app->name, i, ports[i]);
- }
-
- if (ret)
- unregister_ip_vs_app(app);
-
- return ret;
-}
-
-
-/*
- * ip_vs_ftp finish.
- */
-static void __exit ip_vs_ftp_exit(void)
-{
- unregister_ip_vs_app(&ip_vs_ftp);
-}
-
-
-module_init(ip_vs_ftp_init);
-module_exit(ip_vs_ftp_exit);
-MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c
deleted file mode 100644
index 524751e031de..000000000000
--- a/net/ipv4/ipvs/ip_vs_lblc.c
+++ /dev/null
@@ -1,599 +0,0 @@
-/*
- * IPVS: Locality-Based Least-Connection scheduling module
- *
- * Version: $Id: ip_vs_lblc.c,v 1.10 2002/09/15 08:14:08 wensong Exp $
- *
- * Authors: Wensong Zhang <wensong@gnuchina.org>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * Changes:
- * Martin Hamilton : fixed the terrible locking bugs
- * *lock(tbl->lock) ==> *lock(&tbl->lock)
- * Wensong Zhang : fixed the uninitilized tbl->lock bug
- * Wensong Zhang : added doing full expiration check to
- * collect stale entries of 24+ hours when
- * no partial expire check in a half hour
- * Julian Anastasov : replaced del_timer call with del_timer_sync
- * to avoid the possible race between timer
- * handler and del_timer thread in SMP
- *
- */
-
-/*
- * The lblc algorithm is as follows (pseudo code):
- *
- * if cachenode[dest_ip] is null then
- * n, cachenode[dest_ip] <- {weighted least-conn node};
- * else
- * n <- cachenode[dest_ip];
- * if (n is dead) OR
- * (n.conns>n.weight AND
- * there is a node m with m.conns<m.weight/2) then
- * n, cachenode[dest_ip] <- {weighted least-conn node};
- *
- * return n;
- *
- * Thanks must go to Wenzhuo Zhang for talking WCCP to me and pushing
- * me to write this module.
- */
-
-#include <linux/ip.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/skbuff.h>
-
-/* for sysctl */
-#include <linux/fs.h>
-#include <linux/sysctl.h>
-
-#include <net/ip_vs.h>
-
-
-/*
- * It is for garbage collection of stale IPVS lblc entries,
- * when the table is full.
- */
-#define CHECK_EXPIRE_INTERVAL (60*HZ)
-#define ENTRY_TIMEOUT (6*60*HZ)
-
-/*
- * It is for full expiration check.
- * When there is no partial expiration check (garbage collection)
- * in a half hour, do a full expiration check to collect stale
- * entries that haven't been touched for a day.
- */
-#define COUNT_FOR_FULL_EXPIRATION 30
-static int sysctl_ip_vs_lblc_expiration = 24*60*60*HZ;
-
-
-/*
- * for IPVS lblc entry hash table
- */
-#ifndef CONFIG_IP_VS_LBLC_TAB_BITS
-#define CONFIG_IP_VS_LBLC_TAB_BITS 10
-#endif
-#define IP_VS_LBLC_TAB_BITS CONFIG_IP_VS_LBLC_TAB_BITS
-#define IP_VS_LBLC_TAB_SIZE (1 << IP_VS_LBLC_TAB_BITS)
-#define IP_VS_LBLC_TAB_MASK (IP_VS_LBLC_TAB_SIZE - 1)
-
-
-/*
- * IPVS lblc entry represents an association between destination
- * IP address and its destination server
- */
-struct ip_vs_lblc_entry {
- struct list_head list;
- __be32 addr; /* destination IP address */
- struct ip_vs_dest *dest; /* real server (cache) */
- unsigned long lastuse; /* last used time */
-};
-
-
-/*
- * IPVS lblc hash table
- */
-struct ip_vs_lblc_table {
- rwlock_t lock; /* lock for this table */
- struct list_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */
- atomic_t entries; /* number of entries */
- int max_size; /* maximum size of entries */
- struct timer_list periodic_timer; /* collect stale entries */
- int rover; /* rover for expire check */
- int counter; /* counter for no expire */
-};
-
-
-/*
- * IPVS LBLC sysctl table
- */
-
-static ctl_table vs_vars_table[] = {
- {
- .ctl_name = NET_IPV4_VS_LBLC_EXPIRE,
- .procname = "lblc_expiration",
- .data = &sysctl_ip_vs_lblc_expiration,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- },
- { .ctl_name = 0 }
-};
-
-static ctl_table vs_table[] = {
- {
- .ctl_name = NET_IPV4_VS,
- .procname = "vs",
- .mode = 0555,
- .child = vs_vars_table
- },
- { .ctl_name = 0 }
-};
-
-static ctl_table ipvs_ipv4_table[] = {
- {
- .ctl_name = NET_IPV4,
- .procname = "ipv4",
- .mode = 0555,
- .child = vs_table
- },
- { .ctl_name = 0 }
-};
-
-static ctl_table lblc_root_table[] = {
- {
- .ctl_name = CTL_NET,
- .procname = "net",
- .mode = 0555,
- .child = ipvs_ipv4_table
- },
- { .ctl_name = 0 }
-};
-
-static struct ctl_table_header * sysctl_header;
-
-/*
- * new/free a ip_vs_lblc_entry, which is a mapping of a destionation
- * IP address to a server.
- */
-static inline struct ip_vs_lblc_entry *
-ip_vs_lblc_new(__be32 daddr, struct ip_vs_dest *dest)
-{
- struct ip_vs_lblc_entry *en;
-
- en = kmalloc(sizeof(struct ip_vs_lblc_entry), GFP_ATOMIC);
- if (en == NULL) {
- IP_VS_ERR("ip_vs_lblc_new(): no memory\n");
- return NULL;
- }
-
- INIT_LIST_HEAD(&en->list);
- en->addr = daddr;
-
- atomic_inc(&dest->refcnt);
- en->dest = dest;
-
- return en;
-}
-
-
-static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en)
-{
- list_del(&en->list);
- /*
- * We don't kfree dest because it is refered either by its service
- * or the trash dest list.
- */
- atomic_dec(&en->dest->refcnt);
- kfree(en);
-}
-
-
-/*
- * Returns hash value for IPVS LBLC entry
- */
-static inline unsigned ip_vs_lblc_hashkey(__be32 addr)
-{
- return (ntohl(addr)*2654435761UL) & IP_VS_LBLC_TAB_MASK;
-}
-
-
-/*
- * Hash an entry in the ip_vs_lblc_table.
- * returns bool success.
- */
-static int
-ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en)
-{
- unsigned hash;
-
- if (!list_empty(&en->list)) {
- IP_VS_ERR("ip_vs_lblc_hash(): request for already hashed, "
- "called from %p\n", __builtin_return_address(0));
- return 0;
- }
-
- /*
- * Hash by destination IP address
- */
- hash = ip_vs_lblc_hashkey(en->addr);
-
- write_lock(&tbl->lock);
- list_add(&en->list, &tbl->bucket[hash]);
- atomic_inc(&tbl->entries);
- write_unlock(&tbl->lock);
-
- return 1;
-}
-
-
-/*
- * Get ip_vs_lblc_entry associated with supplied parameters.
- */
-static inline struct ip_vs_lblc_entry *
-ip_vs_lblc_get(struct ip_vs_lblc_table *tbl, __be32 addr)
-{
- unsigned hash;
- struct ip_vs_lblc_entry *en;
-
- hash = ip_vs_lblc_hashkey(addr);
-
- read_lock(&tbl->lock);
-
- list_for_each_entry(en, &tbl->bucket[hash], list) {
- if (en->addr == addr) {
- /* HIT */
- read_unlock(&tbl->lock);
- return en;
- }
- }
-
- read_unlock(&tbl->lock);
-
- return NULL;
-}
-
-
-/*
- * Flush all the entries of the specified table.
- */
-static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl)
-{
- int i;
- struct ip_vs_lblc_entry *en, *nxt;
-
- for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
- write_lock(&tbl->lock);
- list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) {
- ip_vs_lblc_free(en);
- atomic_dec(&tbl->entries);
- }
- write_unlock(&tbl->lock);
- }
-}
-
-
-static inline void ip_vs_lblc_full_check(struct ip_vs_lblc_table *tbl)
-{
- unsigned long now = jiffies;
- int i, j;
- struct ip_vs_lblc_entry *en, *nxt;
-
- for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
- j = (j + 1) & IP_VS_LBLC_TAB_MASK;
-
- write_lock(&tbl->lock);
- list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
- if (time_before(now,
- en->lastuse + sysctl_ip_vs_lblc_expiration))
- continue;
-
- ip_vs_lblc_free(en);
- atomic_dec(&tbl->entries);
- }
- write_unlock(&tbl->lock);
- }
- tbl->rover = j;
-}
-
-
-/*
- * Periodical timer handler for IPVS lblc table
- * It is used to collect stale entries when the number of entries
- * exceeds the maximum size of the table.
- *
- * Fixme: we probably need more complicated algorithm to collect
- * entries that have not been used for a long time even
- * if the number of entries doesn't exceed the maximum size
- * of the table.
- * The full expiration check is for this purpose now.
- */
-static void ip_vs_lblc_check_expire(unsigned long data)
-{
- struct ip_vs_lblc_table *tbl;
- unsigned long now = jiffies;
- int goal;
- int i, j;
- struct ip_vs_lblc_entry *en, *nxt;
-
- tbl = (struct ip_vs_lblc_table *)data;
-
- if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
- /* do full expiration check */
- ip_vs_lblc_full_check(tbl);
- tbl->counter = 1;
- goto out;
- }
-
- if (atomic_read(&tbl->entries) <= tbl->max_size) {
- tbl->counter++;
- goto out;
- }
-
- goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
- if (goal > tbl->max_size/2)
- goal = tbl->max_size/2;
-
- for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
- j = (j + 1) & IP_VS_LBLC_TAB_MASK;
-
- write_lock(&tbl->lock);
- list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
- if (time_before(now, en->lastuse + ENTRY_TIMEOUT))
- continue;
-
- ip_vs_lblc_free(en);
- atomic_dec(&tbl->entries);
- goal--;
- }
- write_unlock(&tbl->lock);
- if (goal <= 0)
- break;
- }
- tbl->rover = j;
-
- out:
- mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL);
-}
-
-
-static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
-{
- int i;
- struct ip_vs_lblc_table *tbl;
-
- /*
- * Allocate the ip_vs_lblc_table for this service
- */
- tbl = kmalloc(sizeof(struct ip_vs_lblc_table), GFP_ATOMIC);
- if (tbl == NULL) {
- IP_VS_ERR("ip_vs_lblc_init_svc(): no memory\n");
- return -ENOMEM;
- }
- svc->sched_data = tbl;
- IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for "
- "current service\n",
- sizeof(struct ip_vs_lblc_table));
-
- /*
- * Initialize the hash buckets
- */
- for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
- INIT_LIST_HEAD(&tbl->bucket[i]);
- }
- rwlock_init(&tbl->lock);
- tbl->max_size = IP_VS_LBLC_TAB_SIZE*16;
- tbl->rover = 0;
- tbl->counter = 1;
-
- /*
- * Hook periodic timer for garbage collection
- */
- init_timer(&tbl->periodic_timer);
- tbl->periodic_timer.data = (unsigned long)tbl;
- tbl->periodic_timer.function = ip_vs_lblc_check_expire;
- tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL;
- add_timer(&tbl->periodic_timer);
-
- return 0;
-}
-
-
-static int ip_vs_lblc_done_svc(struct ip_vs_service *svc)
-{
- struct ip_vs_lblc_table *tbl = svc->sched_data;
-
- /* remove periodic timer */
- del_timer_sync(&tbl->periodic_timer);
-
- /* got to clean up table entries here */
- ip_vs_lblc_flush(tbl);
-
- /* release the table itself */
- kfree(svc->sched_data);
- IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n",
- sizeof(struct ip_vs_lblc_table));
-
- return 0;
-}
-
-
-static int ip_vs_lblc_update_svc(struct ip_vs_service *svc)
-{
- return 0;
-}
-
-
-static inline struct ip_vs_dest *
-__ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
-{
- struct ip_vs_dest *dest, *least;
- int loh, doh;
-
- /*
- * We think the overhead of processing active connections is fifty
- * times higher than that of inactive connections in average. (This
- * fifty times might not be accurate, we will change it later.) We
- * use the following formula to estimate the overhead:
- * dest->activeconns*50 + dest->inactconns
- * and the load:
- * (dest overhead) / dest->weight
- *
- * Remember -- no floats in kernel mode!!!
- * The comparison of h1*w2 > h2*w1 is equivalent to that of
- * h1/w1 > h2/w2
- * if every weight is larger than zero.
- *
- * The server with weight=0 is quiesced and will not receive any
- * new connection.
- */
- list_for_each_entry(dest, &svc->destinations, n_list) {
- if (dest->flags & IP_VS_DEST_F_OVERLOAD)
- continue;
- if (atomic_read(&dest->weight) > 0) {
- least = dest;
- loh = atomic_read(&least->activeconns) * 50
- + atomic_read(&least->inactconns);
- goto nextstage;
- }
- }
- return NULL;
-
- /*
- * Find the destination with the least load.
- */
- nextstage:
- list_for_each_entry_continue(dest, &svc->destinations, n_list) {
- if (dest->flags & IP_VS_DEST_F_OVERLOAD)
- continue;
-
- doh = atomic_read(&dest->activeconns) * 50
- + atomic_read(&dest->inactconns);
- if (loh * atomic_read(&dest->weight) >
- doh * atomic_read(&least->weight)) {
- least = dest;
- loh = doh;
- }
- }
-
- IP_VS_DBG(6, "LBLC: server %d.%d.%d.%d:%d "
- "activeconns %d refcnt %d weight %d overhead %d\n",
- NIPQUAD(least->addr), ntohs(least->port),
- atomic_read(&least->activeconns),
- atomic_read(&least->refcnt),
- atomic_read(&least->weight), loh);
-
- return least;
-}
-
-
-/*
- * If this destination server is overloaded and there is a less loaded
- * server, then return true.
- */
-static inline int
-is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
-{
- if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {
- struct ip_vs_dest *d;
-
- list_for_each_entry(d, &svc->destinations, n_list) {
- if (atomic_read(&d->activeconns)*2
- < atomic_read(&d->weight)) {
- return 1;
- }
- }
- }
- return 0;
-}
-
-
-/*
- * Locality-Based (weighted) Least-Connection scheduling
- */
-static struct ip_vs_dest *
-ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
-{
- struct ip_vs_dest *dest;
- struct ip_vs_lblc_table *tbl;
- struct ip_vs_lblc_entry *en;
- struct iphdr *iph = skb->nh.iph;
-
- IP_VS_DBG(6, "ip_vs_lblc_schedule(): Scheduling...\n");
-
- tbl = (struct ip_vs_lblc_table *)svc->sched_data;
- en = ip_vs_lblc_get(tbl, iph->daddr);
- if (en == NULL) {
- dest = __ip_vs_wlc_schedule(svc, iph);
- if (dest == NULL) {
- IP_VS_DBG(1, "no destination available\n");
- return NULL;
- }
- en = ip_vs_lblc_new(iph->daddr, dest);
- if (en == NULL) {
- return NULL;
- }
- ip_vs_lblc_hash(tbl, en);
- } else {
- dest = en->dest;
- if (!(dest->flags & IP_VS_DEST_F_AVAILABLE)
- || atomic_read(&dest->weight) <= 0
- || is_overloaded(dest, svc)) {
- dest = __ip_vs_wlc_schedule(svc, iph);
- if (dest == NULL) {
- IP_VS_DBG(1, "no destination available\n");
- return NULL;
- }
- atomic_dec(&en->dest->refcnt);
- atomic_inc(&dest->refcnt);
- en->dest = dest;
- }
- }
- en->lastuse = jiffies;
-
- IP_VS_DBG(6, "LBLC: destination IP address %u.%u.%u.%u "
- "--> server %u.%u.%u.%u:%d\n",
- NIPQUAD(en->addr),
- NIPQUAD(dest->addr),
- ntohs(dest->port));
-
- return dest;
-}
-
-
-/*
- * IPVS LBLC Scheduler structure
- */
-static struct ip_vs_scheduler ip_vs_lblc_scheduler =
-{
- .name = "lblc",
- .refcnt = ATOMIC_INIT(0),
- .module = THIS_MODULE,
- .init_service = ip_vs_lblc_init_svc,
- .done_service = ip_vs_lblc_done_svc,
- .update_service = ip_vs_lblc_update_svc,
- .schedule = ip_vs_lblc_schedule,
-};
-
-
-static int __init ip_vs_lblc_init(void)
-{
- INIT_LIST_HEAD(&ip_vs_lblc_scheduler.n_list);
- sysctl_header = register_sysctl_table(lblc_root_table, 0);
- return register_ip_vs_scheduler(&ip_vs_lblc_scheduler);
-}
-
-
-static void __exit ip_vs_lblc_cleanup(void)
-{
- unregister_sysctl_table(sysctl_header);
- unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler);
-}
-
-
-module_init(ip_vs_lblc_init);
-module_exit(ip_vs_lblc_cleanup);
-MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c
deleted file mode 100644
index 08990192b6ec..000000000000
--- a/net/ipv4/ipvs/ip_vs_lblcr.c
+++ /dev/null
@@ -1,863 +0,0 @@
-/*
- * IPVS: Locality-Based Least-Connection with Replication scheduler
- *
- * Version: $Id: ip_vs_lblcr.c,v 1.11 2002/09/15 08:14:08 wensong Exp $
- *
- * Authors: Wensong Zhang <wensong@gnuchina.org>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * Changes:
- * Julian Anastasov : Added the missing (dest->weight>0)
- * condition in the ip_vs_dest_set_max.
- *
- */
-
-/*
- * The lblc/r algorithm is as follows (pseudo code):
- *
- * if serverSet[dest_ip] is null then
- * n, serverSet[dest_ip] <- {weighted least-conn node};
- * else
- * n <- {least-conn (alive) node in serverSet[dest_ip]};
- * if (n is null) OR
- * (n.conns>n.weight AND
- * there is a node m with m.conns<m.weight/2) then
- * n <- {weighted least-conn node};
- * add n to serverSet[dest_ip];
- * if |serverSet[dest_ip]| > 1 AND
- * now - serverSet[dest_ip].lastMod > T then
- * m <- {most conn node in serverSet[dest_ip]};
- * remove m from serverSet[dest_ip];
- * if serverSet[dest_ip] changed then
- * serverSet[dest_ip].lastMod <- now;
- *
- * return n;
- *
- */
-
-#include <linux/ip.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/skbuff.h>
-
-/* for sysctl */
-#include <linux/fs.h>
-#include <linux/sysctl.h>
-/* for proc_net_create/proc_net_remove */
-#include <linux/proc_fs.h>
-
-#include <net/ip_vs.h>
-
-
-/*
- * It is for garbage collection of stale IPVS lblcr entries,
- * when the table is full.
- */
-#define CHECK_EXPIRE_INTERVAL (60*HZ)
-#define ENTRY_TIMEOUT (6*60*HZ)
-
-/*
- * It is for full expiration check.
- * When there is no partial expiration check (garbage collection)
- * in a half hour, do a full expiration check to collect stale
- * entries that haven't been touched for a day.
- */
-#define COUNT_FOR_FULL_EXPIRATION 30
-static int sysctl_ip_vs_lblcr_expiration = 24*60*60*HZ;
-
-
-/*
- * for IPVS lblcr entry hash table
- */
-#ifndef CONFIG_IP_VS_LBLCR_TAB_BITS
-#define CONFIG_IP_VS_LBLCR_TAB_BITS 10
-#endif
-#define IP_VS_LBLCR_TAB_BITS CONFIG_IP_VS_LBLCR_TAB_BITS
-#define IP_VS_LBLCR_TAB_SIZE (1 << IP_VS_LBLCR_TAB_BITS)
-#define IP_VS_LBLCR_TAB_MASK (IP_VS_LBLCR_TAB_SIZE - 1)
-
-
-/*
- * IPVS destination set structure and operations
- */
-struct ip_vs_dest_list {
- struct ip_vs_dest_list *next; /* list link */
- struct ip_vs_dest *dest; /* destination server */
-};
-
-struct ip_vs_dest_set {
- atomic_t size; /* set size */
- unsigned long lastmod; /* last modified time */
- struct ip_vs_dest_list *list; /* destination list */
- rwlock_t lock; /* lock for this list */
-};
-
-
-static struct ip_vs_dest_list *
-ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
-{
- struct ip_vs_dest_list *e;
-
- for (e=set->list; e!=NULL; e=e->next) {
- if (e->dest == dest)
- /* already existed */
- return NULL;
- }
-
- e = kmalloc(sizeof(struct ip_vs_dest_list), GFP_ATOMIC);
- if (e == NULL) {
- IP_VS_ERR("ip_vs_dest_set_insert(): no memory\n");
- return NULL;
- }
-
- atomic_inc(&dest->refcnt);
- e->dest = dest;
-
- /* link it to the list */
- write_lock(&set->lock);
- e->next = set->list;
- set->list = e;
- atomic_inc(&set->size);
- write_unlock(&set->lock);
-
- set->lastmod = jiffies;
- return e;
-}
-
-static void
-ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
-{
- struct ip_vs_dest_list *e, **ep;
-
- write_lock(&set->lock);
- for (ep=&set->list, e=*ep; e!=NULL; e=*ep) {
- if (e->dest == dest) {
- /* HIT */
- *ep = e->next;
- atomic_dec(&set->size);
- set->lastmod = jiffies;
- atomic_dec(&e->dest->refcnt);
- kfree(e);
- break;
- }
- ep = &e->next;
- }
- write_unlock(&set->lock);
-}
-
-static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set)
-{
- struct ip_vs_dest_list *e, **ep;
-
- write_lock(&set->lock);
- for (ep=&set->list, e=*ep; e!=NULL; e=*ep) {
- *ep = e->next;
- /*
- * We don't kfree dest because it is refered either
- * by its service or by the trash dest list.
- */
- atomic_dec(&e->dest->refcnt);
- kfree(e);
- }
- write_unlock(&set->lock);
-}
-
-/* get weighted least-connection node in the destination set */
-static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
-{
- register struct ip_vs_dest_list *e;
- struct ip_vs_dest *dest, *least;
- int loh, doh;
-
- if (set == NULL)
- return NULL;
-
- read_lock(&set->lock);
- /* select the first destination server, whose weight > 0 */
- for (e=set->list; e!=NULL; e=e->next) {
- least = e->dest;
- if (least->flags & IP_VS_DEST_F_OVERLOAD)
- continue;
-
- if ((atomic_read(&least->weight) > 0)
- && (least->flags & IP_VS_DEST_F_AVAILABLE)) {
- loh = atomic_read(&least->activeconns) * 50
- + atomic_read(&least->inactconns);
- goto nextstage;
- }
- }
- read_unlock(&set->lock);
- return NULL;
-
- /* find the destination with the weighted least load */
- nextstage:
- for (e=e->next; e!=NULL; e=e->next) {
- dest = e->dest;
- if (dest->flags & IP_VS_DEST_F_OVERLOAD)
- continue;
-
- doh = atomic_read(&dest->activeconns) * 50
- + atomic_read(&dest->inactconns);
- if ((loh * atomic_read(&dest->weight) >
- doh * atomic_read(&least->weight))
- && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
- least = dest;
- loh = doh;
- }
- }
- read_unlock(&set->lock);
-
- IP_VS_DBG(6, "ip_vs_dest_set_min: server %d.%d.%d.%d:%d "
- "activeconns %d refcnt %d weight %d overhead %d\n",
- NIPQUAD(least->addr), ntohs(least->port),
- atomic_read(&least->activeconns),
- atomic_read(&least->refcnt),
- atomic_read(&least->weight), loh);
- return least;
-}
-
-
-/* get weighted most-connection node in the destination set */
-static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
-{
- register struct ip_vs_dest_list *e;
- struct ip_vs_dest *dest, *most;
- int moh, doh;
-
- if (set == NULL)
- return NULL;
-
- read_lock(&set->lock);
- /* select the first destination server, whose weight > 0 */
- for (e=set->list; e!=NULL; e=e->next) {
- most = e->dest;
- if (atomic_read(&most->weight) > 0) {
- moh = atomic_read(&most->activeconns) * 50
- + atomic_read(&most->inactconns);
- goto nextstage;
- }
- }
- read_unlock(&set->lock);
- return NULL;
-
- /* find the destination with the weighted most load */
- nextstage:
- for (e=e->next; e!=NULL; e=e->next) {
- dest = e->dest;
- doh = atomic_read(&dest->activeconns) * 50
- + atomic_read(&dest->inactconns);
- /* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */
- if ((moh * atomic_read(&dest->weight) <
- doh * atomic_read(&most->weight))
- && (atomic_read(&dest->weight) > 0)) {
- most = dest;
- moh = doh;
- }
- }
- read_unlock(&set->lock);
-
- IP_VS_DBG(6, "ip_vs_dest_set_max: server %d.%d.%d.%d:%d "
- "activeconns %d refcnt %d weight %d overhead %d\n",
- NIPQUAD(most->addr), ntohs(most->port),
- atomic_read(&most->activeconns),
- atomic_read(&most->refcnt),
- atomic_read(&most->weight), moh);
- return most;
-}
-
-
-/*
- * IPVS lblcr entry represents an association between destination
- * IP address and its destination server set
- */
-struct ip_vs_lblcr_entry {
- struct list_head list;
- __be32 addr; /* destination IP address */
- struct ip_vs_dest_set set; /* destination server set */
- unsigned long lastuse; /* last used time */
-};
-
-
-/*
- * IPVS lblcr hash table
- */
-struct ip_vs_lblcr_table {
- rwlock_t lock; /* lock for this table */
- struct list_head bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */
- atomic_t entries; /* number of entries */
- int max_size; /* maximum size of entries */
- struct timer_list periodic_timer; /* collect stale entries */
- int rover; /* rover for expire check */
- int counter; /* counter for no expire */
-};
-
-
-/*
- * IPVS LBLCR sysctl table
- */
-
-static ctl_table vs_vars_table[] = {
- {
- .ctl_name = NET_IPV4_VS_LBLCR_EXPIRE,
- .procname = "lblcr_expiration",
- .data = &sysctl_ip_vs_lblcr_expiration,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- },
- { .ctl_name = 0 }
-};
-
-static ctl_table vs_table[] = {
- {
- .ctl_name = NET_IPV4_VS,
- .procname = "vs",
- .mode = 0555,
- .child = vs_vars_table
- },
- { .ctl_name = 0 }
-};
-
-static ctl_table ipvs_ipv4_table[] = {
- {
- .ctl_name = NET_IPV4,
- .procname = "ipv4",
- .mode = 0555,
- .child = vs_table
- },
- { .ctl_name = 0 }
-};
-
-static ctl_table lblcr_root_table[] = {
- {
- .ctl_name = CTL_NET,
- .procname = "net",
- .mode = 0555,
- .child = ipvs_ipv4_table
- },
- { .ctl_name = 0 }
-};
-
-static struct ctl_table_header * sysctl_header;
-
-/*
- * new/free a ip_vs_lblcr_entry, which is a mapping of a destination
- * IP address to a server.
- */
-static inline struct ip_vs_lblcr_entry *ip_vs_lblcr_new(__be32 daddr)
-{
- struct ip_vs_lblcr_entry *en;
-
- en = kmalloc(sizeof(struct ip_vs_lblcr_entry), GFP_ATOMIC);
- if (en == NULL) {
- IP_VS_ERR("ip_vs_lblcr_new(): no memory\n");
- return NULL;
- }
-
- INIT_LIST_HEAD(&en->list);
- en->addr = daddr;
-
- /* initilize its dest set */
- atomic_set(&(en->set.size), 0);
- en->set.list = NULL;
- rwlock_init(&en->set.lock);
-
- return en;
-}
-
-
-static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en)
-{
- list_del(&en->list);
- ip_vs_dest_set_eraseall(&en->set);
- kfree(en);
-}
-
-
-/*
- * Returns hash value for IPVS LBLCR entry
- */
-static inline unsigned ip_vs_lblcr_hashkey(__be32 addr)
-{
- return (ntohl(addr)*2654435761UL) & IP_VS_LBLCR_TAB_MASK;
-}
-
-
-/*
- * Hash an entry in the ip_vs_lblcr_table.
- * returns bool success.
- */
-static int
-ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en)
-{
- unsigned hash;
-
- if (!list_empty(&en->list)) {
- IP_VS_ERR("ip_vs_lblcr_hash(): request for already hashed, "
- "called from %p\n", __builtin_return_address(0));
- return 0;
- }
-
- /*
- * Hash by destination IP address
- */
- hash = ip_vs_lblcr_hashkey(en->addr);
-
- write_lock(&tbl->lock);
- list_add(&en->list, &tbl->bucket[hash]);
- atomic_inc(&tbl->entries);
- write_unlock(&tbl->lock);
-
- return 1;
-}
-
-
-/*
- * Get ip_vs_lblcr_entry associated with supplied parameters.
- */
-static inline struct ip_vs_lblcr_entry *
-ip_vs_lblcr_get(struct ip_vs_lblcr_table *tbl, __be32 addr)
-{
- unsigned hash;
- struct ip_vs_lblcr_entry *en;
-
- hash = ip_vs_lblcr_hashkey(addr);
-
- read_lock(&tbl->lock);
-
- list_for_each_entry(en, &tbl->bucket[hash], list) {
- if (en->addr == addr) {
- /* HIT */
- read_unlock(&tbl->lock);
- return en;
- }
- }
-
- read_unlock(&tbl->lock);
-
- return NULL;
-}
-
-
-/*
- * Flush all the entries of the specified table.
- */
-static void ip_vs_lblcr_flush(struct ip_vs_lblcr_table *tbl)
-{
- int i;
- struct ip_vs_lblcr_entry *en, *nxt;
-
- for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
- write_lock(&tbl->lock);
- list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) {
- ip_vs_lblcr_free(en);
- atomic_dec(&tbl->entries);
- }
- write_unlock(&tbl->lock);
- }
-}
-
-
-static inline void ip_vs_lblcr_full_check(struct ip_vs_lblcr_table *tbl)
-{
- unsigned long now = jiffies;
- int i, j;
- struct ip_vs_lblcr_entry *en, *nxt;
-
- for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
- j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
-
- write_lock(&tbl->lock);
- list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
- if (time_after(en->lastuse+sysctl_ip_vs_lblcr_expiration,
- now))
- continue;
-
- ip_vs_lblcr_free(en);
- atomic_dec(&tbl->entries);
- }
- write_unlock(&tbl->lock);
- }
- tbl->rover = j;
-}
-
-
-/*
- * Periodical timer handler for IPVS lblcr table
- * It is used to collect stale entries when the number of entries
- * exceeds the maximum size of the table.
- *
- * Fixme: we probably need more complicated algorithm to collect
- * entries that have not been used for a long time even
- * if the number of entries doesn't exceed the maximum size
- * of the table.
- * The full expiration check is for this purpose now.
- */
-static void ip_vs_lblcr_check_expire(unsigned long data)
-{
- struct ip_vs_lblcr_table *tbl;
- unsigned long now = jiffies;
- int goal;
- int i, j;
- struct ip_vs_lblcr_entry *en, *nxt;
-
- tbl = (struct ip_vs_lblcr_table *)data;
-
- if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
- /* do full expiration check */
- ip_vs_lblcr_full_check(tbl);
- tbl->counter = 1;
- goto out;
- }
-
- if (atomic_read(&tbl->entries) <= tbl->max_size) {
- tbl->counter++;
- goto out;
- }
-
- goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
- if (goal > tbl->max_size/2)
- goal = tbl->max_size/2;
-
- for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
- j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
-
- write_lock(&tbl->lock);
- list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
- if (time_before(now, en->lastuse+ENTRY_TIMEOUT))
- continue;
-
- ip_vs_lblcr_free(en);
- atomic_dec(&tbl->entries);
- goal--;
- }
- write_unlock(&tbl->lock);
- if (goal <= 0)
- break;
- }
- tbl->rover = j;
-
- out:
- mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL);
-}
-
-
-#ifdef CONFIG_IP_VS_LBLCR_DEBUG
-static struct ip_vs_lblcr_table *lblcr_table_list;
-
-/*
- * /proc/net/ip_vs_lblcr to display the mappings of
- * destination IP address <==> its serverSet
- */
-static int
-ip_vs_lblcr_getinfo(char *buffer, char **start, off_t offset, int length)
-{
- off_t pos=0, begin;
- int len=0, size;
- struct ip_vs_lblcr_table *tbl;
- unsigned long now = jiffies;
- int i;
- struct ip_vs_lblcr_entry *en;
-
- tbl = lblcr_table_list;
-
- size = sprintf(buffer, "LastTime Dest IP address Server set\n");
- pos += size;
- len += size;
-
- for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
- read_lock_bh(&tbl->lock);
- list_for_each_entry(en, &tbl->bucket[i], list) {
- char tbuf[16];
- struct ip_vs_dest_list *d;
-
- sprintf(tbuf, "%u.%u.%u.%u", NIPQUAD(en->addr));
- size = sprintf(buffer+len, "%8lu %-16s ",
- now-en->lastuse, tbuf);
-
- read_lock(&en->set.lock);
- for (d=en->set.list; d!=NULL; d=d->next) {
- size += sprintf(buffer+len+size,
- "%u.%u.%u.%u ",
- NIPQUAD(d->dest->addr));
- }
- read_unlock(&en->set.lock);
- size += sprintf(buffer+len+size, "\n");
- len += size;
- pos += size;
- if (pos <= offset)
- len=0;
- if (pos >= offset+length) {
- read_unlock_bh(&tbl->lock);
- goto done;
- }
- }
- read_unlock_bh(&tbl->lock);
- }
-
- done:
- begin = len - (pos - offset);
- *start = buffer + begin;
- len -= begin;
- if(len>length)
- len = length;
- return len;
-}
-#endif
-
-
-static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
-{
- int i;
- struct ip_vs_lblcr_table *tbl;
-
- /*
- * Allocate the ip_vs_lblcr_table for this service
- */
- tbl = kmalloc(sizeof(struct ip_vs_lblcr_table), GFP_ATOMIC);
- if (tbl == NULL) {
- IP_VS_ERR("ip_vs_lblcr_init_svc(): no memory\n");
- return -ENOMEM;
- }
- svc->sched_data = tbl;
- IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) allocated for "
- "current service\n",
- sizeof(struct ip_vs_lblcr_table));
-
- /*
- * Initialize the hash buckets
- */
- for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
- INIT_LIST_HEAD(&tbl->bucket[i]);
- }
- rwlock_init(&tbl->lock);
- tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16;
- tbl->rover = 0;
- tbl->counter = 1;
-
- /*
- * Hook periodic timer for garbage collection
- */
- init_timer(&tbl->periodic_timer);
- tbl->periodic_timer.data = (unsigned long)tbl;
- tbl->periodic_timer.function = ip_vs_lblcr_check_expire;
- tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL;
- add_timer(&tbl->periodic_timer);
-
-#ifdef CONFIG_IP_VS_LBLCR_DEBUG
- lblcr_table_list = tbl;
-#endif
- return 0;
-}
-
-
-static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc)
-{
- struct ip_vs_lblcr_table *tbl = svc->sched_data;
-
- /* remove periodic timer */
- del_timer_sync(&tbl->periodic_timer);
-
- /* got to clean up table entries here */
- ip_vs_lblcr_flush(tbl);
-
- /* release the table itself */
- kfree(svc->sched_data);
- IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n",
- sizeof(struct ip_vs_lblcr_table));
-
- return 0;
-}
-
-
-static int ip_vs_lblcr_update_svc(struct ip_vs_service *svc)
-{
- return 0;
-}
-
-
-static inline struct ip_vs_dest *
-__ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
-{
- struct ip_vs_dest *dest, *least;
- int loh, doh;
-
- /*
- * We think the overhead of processing active connections is fifty
- * times higher than that of inactive connections in average. (This
- * fifty times might not be accurate, we will change it later.) We
- * use the following formula to estimate the overhead:
- * dest->activeconns*50 + dest->inactconns
- * and the load:
- * (dest overhead) / dest->weight
- *
- * Remember -- no floats in kernel mode!!!
- * The comparison of h1*w2 > h2*w1 is equivalent to that of
- * h1/w1 > h2/w2
- * if every weight is larger than zero.
- *
- * The server with weight=0 is quiesced and will not receive any
- * new connection.
- */
- list_for_each_entry(dest, &svc->destinations, n_list) {
- if (dest->flags & IP_VS_DEST_F_OVERLOAD)
- continue;
-
- if (atomic_read(&dest->weight) > 0) {
- least = dest;
- loh = atomic_read(&least->activeconns) * 50
- + atomic_read(&least->inactconns);
- goto nextstage;
- }
- }
- return NULL;
-
- /*
- * Find the destination with the least load.
- */
- nextstage:
- list_for_each_entry_continue(dest, &svc->destinations, n_list) {
- if (dest->flags & IP_VS_DEST_F_OVERLOAD)
- continue;
-
- doh = atomic_read(&dest->activeconns) * 50
- + atomic_read(&dest->inactconns);
- if (loh * atomic_read(&dest->weight) >
- doh * atomic_read(&least->weight)) {
- least = dest;
- loh = doh;
- }
- }
-
- IP_VS_DBG(6, "LBLCR: server %d.%d.%d.%d:%d "
- "activeconns %d refcnt %d weight %d overhead %d\n",
- NIPQUAD(least->addr), ntohs(least->port),
- atomic_read(&least->activeconns),
- atomic_read(&least->refcnt),
- atomic_read(&least->weight), loh);
-
- return least;
-}
-
-
-/*
- * If this destination server is overloaded and there is a less loaded
- * server, then return true.
- */
-static inline int
-is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
-{
- if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {
- struct ip_vs_dest *d;
-
- list_for_each_entry(d, &svc->destinations, n_list) {
- if (atomic_read(&d->activeconns)*2
- < atomic_read(&d->weight)) {
- return 1;
- }
- }
- }
- return 0;
-}
-
-
-/*
- * Locality-Based (weighted) Least-Connection scheduling
- */
-static struct ip_vs_dest *
-ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
-{
- struct ip_vs_dest *dest;
- struct ip_vs_lblcr_table *tbl;
- struct ip_vs_lblcr_entry *en;
- struct iphdr *iph = skb->nh.iph;
-
- IP_VS_DBG(6, "ip_vs_lblcr_schedule(): Scheduling...\n");
-
- tbl = (struct ip_vs_lblcr_table *)svc->sched_data;
- en = ip_vs_lblcr_get(tbl, iph->daddr);
- if (en == NULL) {
- dest = __ip_vs_wlc_schedule(svc, iph);
- if (dest == NULL) {
- IP_VS_DBG(1, "no destination available\n");
- return NULL;
- }
- en = ip_vs_lblcr_new(iph->daddr);
- if (en == NULL) {
- return NULL;
- }
- ip_vs_dest_set_insert(&en->set, dest);
- ip_vs_lblcr_hash(tbl, en);
- } else {
- dest = ip_vs_dest_set_min(&en->set);
- if (!dest || is_overloaded(dest, svc)) {
- dest = __ip_vs_wlc_schedule(svc, iph);
- if (dest == NULL) {
- IP_VS_DBG(1, "no destination available\n");
- return NULL;
- }
- ip_vs_dest_set_insert(&en->set, dest);
- }
- if (atomic_read(&en->set.size) > 1 &&
- jiffies-en->set.lastmod > sysctl_ip_vs_lblcr_expiration) {
- struct ip_vs_dest *m;
- m = ip_vs_dest_set_max(&en->set);
- if (m)
- ip_vs_dest_set_erase(&en->set, m);
- }
- }
- en->lastuse = jiffies;
-
- IP_VS_DBG(6, "LBLCR: destination IP address %u.%u.%u.%u "
- "--> server %u.%u.%u.%u:%d\n",
- NIPQUAD(en->addr),
- NIPQUAD(dest->addr),
- ntohs(dest->port));
-
- return dest;
-}
-
-
-/*
- * IPVS LBLCR Scheduler structure
- */
-static struct ip_vs_scheduler ip_vs_lblcr_scheduler =
-{
- .name = "lblcr",
- .refcnt = ATOMIC_INIT(0),
- .module = THIS_MODULE,
- .init_service = ip_vs_lblcr_init_svc,
- .done_service = ip_vs_lblcr_done_svc,
- .update_service = ip_vs_lblcr_update_svc,
- .schedule = ip_vs_lblcr_schedule,
-};
-
-
-static int __init ip_vs_lblcr_init(void)
-{
- INIT_LIST_HEAD(&ip_vs_lblcr_scheduler.n_list);
- sysctl_header = register_sysctl_table(lblcr_root_table, 0);
-#ifdef CONFIG_IP_VS_LBLCR_DEBUG
- proc_net_create("ip_vs_lblcr", 0, ip_vs_lblcr_getinfo);
-#endif
- return register_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
-}
-
-
-static void __exit ip_vs_lblcr_cleanup(void)
-{
-#ifdef CONFIG_IP_VS_LBLCR_DEBUG
- proc_net_remove("ip_vs_lblcr");
-#endif
- unregister_sysctl_table(sysctl_header);
- unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
-}
-
-
-module_init(ip_vs_lblcr_init);
-module_exit(ip_vs_lblcr_cleanup);
-MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_lc.c b/net/ipv4/ipvs/ip_vs_lc.c
deleted file mode 100644
index d88fef90a641..000000000000
--- a/net/ipv4/ipvs/ip_vs_lc.c
+++ /dev/null
@@ -1,123 +0,0 @@
-/*
- * IPVS: Least-Connection Scheduling module
- *
- * Version: $Id: ip_vs_lc.c,v 1.10 2003/04/18 09:03:16 wensong Exp $
- *
- * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * Changes:
- * Wensong Zhang : added the ip_vs_lc_update_svc
- * Wensong Zhang : added any dest with weight=0 is quiesced
- *
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-
-#include <net/ip_vs.h>
-
-
-static int ip_vs_lc_init_svc(struct ip_vs_service *svc)
-{
- return 0;
-}
-
-
-static int ip_vs_lc_done_svc(struct ip_vs_service *svc)
-{
- return 0;
-}
-
-
-static int ip_vs_lc_update_svc(struct ip_vs_service *svc)
-{
- return 0;
-}
-
-
-static inline unsigned int
-ip_vs_lc_dest_overhead(struct ip_vs_dest *dest)
-{
- /*
- * We think the overhead of processing active connections is 256
- * times higher than that of inactive connections in average. (This
- * 256 times might not be accurate, we will change it later) We
- * use the following formula to estimate the overhead now:
- * dest->activeconns*256 + dest->inactconns
- */
- return (atomic_read(&dest->activeconns) << 8) +
- atomic_read(&dest->inactconns);
-}
-
-
-/*
- * Least Connection scheduling
- */
-static struct ip_vs_dest *
-ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
-{
- struct ip_vs_dest *dest, *least = NULL;
- unsigned int loh = 0, doh;
-
- IP_VS_DBG(6, "ip_vs_lc_schedule(): Scheduling...\n");
-
- /*
- * Simply select the server with the least number of
- * (activeconns<<5) + inactconns
- * Except whose weight is equal to zero.
- * If the weight is equal to zero, it means that the server is
- * quiesced, the existing connections to the server still get
- * served, but no new connection is assigned to the server.
- */
-
- list_for_each_entry(dest, &svc->destinations, n_list) {
- if ((dest->flags & IP_VS_DEST_F_OVERLOAD) ||
- atomic_read(&dest->weight) == 0)
- continue;
- doh = ip_vs_lc_dest_overhead(dest);
- if (!least || doh < loh) {
- least = dest;
- loh = doh;
- }
- }
-
- if (least)
- IP_VS_DBG(6, "LC: server %u.%u.%u.%u:%u activeconns %d inactconns %d\n",
- NIPQUAD(least->addr), ntohs(least->port),
- atomic_read(&least->activeconns),
- atomic_read(&least->inactconns));
-
- return least;
-}
-
-
-static struct ip_vs_scheduler ip_vs_lc_scheduler = {
- .name = "lc",
- .refcnt = ATOMIC_INIT(0),
- .module = THIS_MODULE,
- .init_service = ip_vs_lc_init_svc,
- .done_service = ip_vs_lc_done_svc,
- .update_service = ip_vs_lc_update_svc,
- .schedule = ip_vs_lc_schedule,
-};
-
-
-static int __init ip_vs_lc_init(void)
-{
- INIT_LIST_HEAD(&ip_vs_lc_scheduler.n_list);
- return register_ip_vs_scheduler(&ip_vs_lc_scheduler) ;
-}
-
-static void __exit ip_vs_lc_cleanup(void)
-{
- unregister_ip_vs_scheduler(&ip_vs_lc_scheduler);
-}
-
-module_init(ip_vs_lc_init);
-module_exit(ip_vs_lc_cleanup);
-MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_proto.c b/net/ipv4/ipvs/ip_vs_proto.c
deleted file mode 100644
index e844ddb82b9a..000000000000
--- a/net/ipv4/ipvs/ip_vs_proto.c
+++ /dev/null
@@ -1,235 +0,0 @@
-/*
- * ip_vs_proto.c: transport protocol load balancing support for IPVS
- *
- * Version: $Id: ip_vs_proto.c,v 1.2 2003/04/18 09:03:16 wensong Exp $
- *
- * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
- * Julian Anastasov <ja@ssi.bg>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * Changes:
- *
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/skbuff.h>
-#include <linux/in.h>
-#include <linux/ip.h>
-#include <net/protocol.h>
-#include <net/tcp.h>
-#include <net/udp.h>
-#include <asm/system.h>
-#include <linux/stat.h>
-#include <linux/proc_fs.h>
-
-#include <net/ip_vs.h>
-
-
-/*
- * IPVS protocols can only be registered/unregistered when the ipvs
- * module is loaded/unloaded, so no lock is needed in accessing the
- * ipvs protocol table.
- */
-
-#define IP_VS_PROTO_TAB_SIZE 32 /* must be power of 2 */
-#define IP_VS_PROTO_HASH(proto) ((proto) & (IP_VS_PROTO_TAB_SIZE-1))
-
-static struct ip_vs_protocol *ip_vs_proto_table[IP_VS_PROTO_TAB_SIZE];
-
-
-/*
- * register an ipvs protocol
- */
-static int register_ip_vs_protocol(struct ip_vs_protocol *pp)
-{
- unsigned hash = IP_VS_PROTO_HASH(pp->protocol);
-
- pp->next = ip_vs_proto_table[hash];
- ip_vs_proto_table[hash] = pp;
-
- if (pp->init != NULL)
- pp->init(pp);
-
- return 0;
-}
-
-
-/*
- * unregister an ipvs protocol
- */
-static int unregister_ip_vs_protocol(struct ip_vs_protocol *pp)
-{
- struct ip_vs_protocol **pp_p;
- unsigned hash = IP_VS_PROTO_HASH(pp->protocol);
-
- pp_p = &ip_vs_proto_table[hash];
- for (; *pp_p; pp_p = &(*pp_p)->next) {
- if (*pp_p == pp) {
- *pp_p = pp->next;
- if (pp->exit != NULL)
- pp->exit(pp);
- return 0;
- }
- }
-
- return -ESRCH;
-}
-
-
-/*
- * get ip_vs_protocol object by its proto.
- */
-struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto)
-{
- struct ip_vs_protocol *pp;
- unsigned hash = IP_VS_PROTO_HASH(proto);
-
- for (pp = ip_vs_proto_table[hash]; pp; pp = pp->next) {
- if (pp->protocol == proto)
- return pp;
- }
-
- return NULL;
-}
-
-
-/*
- * Propagate event for state change to all protocols
- */
-void ip_vs_protocol_timeout_change(int flags)
-{
- struct ip_vs_protocol *pp;
- int i;
-
- for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) {
- for (pp = ip_vs_proto_table[i]; pp; pp = pp->next) {
- if (pp->timeout_change)
- pp->timeout_change(pp, flags);
- }
- }
-}
-
-
-int *
-ip_vs_create_timeout_table(int *table, int size)
-{
- return kmemdup(table, size, GFP_ATOMIC);
-}
-
-
-/*
- * Set timeout value for state specified by name
- */
-int
-ip_vs_set_state_timeout(int *table, int num, char **names, char *name, int to)
-{
- int i;
-
- if (!table || !name || !to)
- return -EINVAL;
-
- for (i = 0; i < num; i++) {
- if (strcmp(names[i], name))
- continue;
- table[i] = to * HZ;
- return 0;
- }
- return -ENOENT;
-}
-
-
-const char * ip_vs_state_name(__u16 proto, int state)
-{
- struct ip_vs_protocol *pp = ip_vs_proto_get(proto);
-
- if (pp == NULL || pp->state_name == NULL)
- return "ERR!";
- return pp->state_name(state);
-}
-
-
-void
-ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp,
- const struct sk_buff *skb,
- int offset,
- const char *msg)
-{
- char buf[128];
- struct iphdr _iph, *ih;
-
- ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
- if (ih == NULL)
- sprintf(buf, "%s TRUNCATED", pp->name);
- else if (ih->frag_off & __constant_htons(IP_OFFSET))
- sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u frag",
- pp->name, NIPQUAD(ih->saddr),
- NIPQUAD(ih->daddr));
- else {
- __be16 _ports[2], *pptr
-;
- pptr = skb_header_pointer(skb, offset + ih->ihl*4,
- sizeof(_ports), _ports);
- if (pptr == NULL)
- sprintf(buf, "%s TRUNCATED %u.%u.%u.%u->%u.%u.%u.%u",
- pp->name,
- NIPQUAD(ih->saddr),
- NIPQUAD(ih->daddr));
- else
- sprintf(buf, "%s %u.%u.%u.%u:%u->%u.%u.%u.%u:%u",
- pp->name,
- NIPQUAD(ih->saddr),
- ntohs(pptr[0]),
- NIPQUAD(ih->daddr),
- ntohs(pptr[1]));
- }
-
- printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
-}
-
-
-int ip_vs_protocol_init(void)
-{
- char protocols[64];
-#define REGISTER_PROTOCOL(p) \
- do { \
- register_ip_vs_protocol(p); \
- strcat(protocols, ", "); \
- strcat(protocols, (p)->name); \
- } while (0)
-
- protocols[0] = '\0';
- protocols[2] = '\0';
-#ifdef CONFIG_IP_VS_PROTO_TCP
- REGISTER_PROTOCOL(&ip_vs_protocol_tcp);
-#endif
-#ifdef CONFIG_IP_VS_PROTO_UDP
- REGISTER_PROTOCOL(&ip_vs_protocol_udp);
-#endif
-#ifdef CONFIG_IP_VS_PROTO_AH
- REGISTER_PROTOCOL(&ip_vs_protocol_ah);
-#endif
-#ifdef CONFIG_IP_VS_PROTO_ESP
- REGISTER_PROTOCOL(&ip_vs_protocol_esp);
-#endif
- IP_VS_INFO("Registered protocols (%s)\n", &protocols[2]);
-
- return 0;
-}
-
-
-void ip_vs_protocol_cleanup(void)
-{
- struct ip_vs_protocol *pp;
- int i;
-
- /* unregister all the ipvs protocols */
- for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) {
- while ((pp = ip_vs_proto_table[i]) != NULL)
- unregister_ip_vs_protocol(pp);
- }
-}
diff --git a/net/ipv4/ipvs/ip_vs_proto_ah.c b/net/ipv4/ipvs/ip_vs_proto_ah.c
deleted file mode 100644
index 8b0505b09317..000000000000
--- a/net/ipv4/ipvs/ip_vs_proto_ah.c
+++ /dev/null
@@ -1,179 +0,0 @@
-/*
- * ip_vs_proto_ah.c: AH IPSec load balancing support for IPVS
- *
- * Version: $Id: ip_vs_proto_ah.c,v 1.1 2003/07/04 15:04:37 wensong Exp $
- *
- * Authors: Julian Anastasov <ja@ssi.bg>, February 2002
- * Wensong Zhang <wensong@linuxvirtualserver.org>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * version 2 as published by the Free Software Foundation;
- *
- */
-
-#include <linux/in.h>
-#include <linux/ip.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv4.h>
-
-#include <net/ip_vs.h>
-
-
-/* TODO:
-
-struct isakmp_hdr {
- __u8 icookie[8];
- __u8 rcookie[8];
- __u8 np;
- __u8 version;
- __u8 xchgtype;
- __u8 flags;
- __u32 msgid;
- __u32 length;
-};
-
-*/
-
-#define PORT_ISAKMP 500
-
-
-static struct ip_vs_conn *
-ah_conn_in_get(const struct sk_buff *skb,
- struct ip_vs_protocol *pp,
- const struct iphdr *iph,
- unsigned int proto_off,
- int inverse)
-{
- struct ip_vs_conn *cp;
-
- if (likely(!inverse)) {
- cp = ip_vs_conn_in_get(IPPROTO_UDP,
- iph->saddr,
- __constant_htons(PORT_ISAKMP),
- iph->daddr,
- __constant_htons(PORT_ISAKMP));
- } else {
- cp = ip_vs_conn_in_get(IPPROTO_UDP,
- iph->daddr,
- __constant_htons(PORT_ISAKMP),
- iph->saddr,
- __constant_htons(PORT_ISAKMP));
- }
-
- if (!cp) {
- /*
- * We are not sure if the packet is from our
- * service, so our conn_schedule hook should return NF_ACCEPT
- */
- IP_VS_DBG(12, "Unknown ISAKMP entry for outin packet "
- "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n",
- inverse ? "ICMP+" : "",
- pp->name,
- NIPQUAD(iph->saddr),
- NIPQUAD(iph->daddr));
- }
-
- return cp;
-}
-
-
-static struct ip_vs_conn *
-ah_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
- const struct iphdr *iph, unsigned int proto_off, int inverse)
-{
- struct ip_vs_conn *cp;
-
- if (likely(!inverse)) {
- cp = ip_vs_conn_out_get(IPPROTO_UDP,
- iph->saddr,
- __constant_htons(PORT_ISAKMP),
- iph->daddr,
- __constant_htons(PORT_ISAKMP));
- } else {
- cp = ip_vs_conn_out_get(IPPROTO_UDP,
- iph->daddr,
- __constant_htons(PORT_ISAKMP),
- iph->saddr,
- __constant_htons(PORT_ISAKMP));
- }
-
- if (!cp) {
- IP_VS_DBG(12, "Unknown ISAKMP entry for inout packet "
- "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n",
- inverse ? "ICMP+" : "",
- pp->name,
- NIPQUAD(iph->saddr),
- NIPQUAD(iph->daddr));
- }
-
- return cp;
-}
-
-
-static int
-ah_conn_schedule(struct sk_buff *skb,
- struct ip_vs_protocol *pp,
- int *verdict, struct ip_vs_conn **cpp)
-{
- /*
- * AH is only related traffic. Pass the packet to IP stack.
- */
- *verdict = NF_ACCEPT;
- return 0;
-}
-
-
-static void
-ah_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb,
- int offset, const char *msg)
-{
- char buf[256];
- struct iphdr _iph, *ih;
-
- ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
- if (ih == NULL)
- sprintf(buf, "%s TRUNCATED", pp->name);
- else
- sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u",
- pp->name, NIPQUAD(ih->saddr),
- NIPQUAD(ih->daddr));
-
- printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
-}
-
-
-static void ah_init(struct ip_vs_protocol *pp)
-{
- /* nothing to do now */
-}
-
-
-static void ah_exit(struct ip_vs_protocol *pp)
-{
- /* nothing to do now */
-}
-
-
-struct ip_vs_protocol ip_vs_protocol_ah = {
- .name = "AH",
- .protocol = IPPROTO_AH,
- .dont_defrag = 1,
- .init = ah_init,
- .exit = ah_exit,
- .conn_schedule = ah_conn_schedule,
- .conn_in_get = ah_conn_in_get,
- .conn_out_get = ah_conn_out_get,
- .snat_handler = NULL,
- .dnat_handler = NULL,
- .csum_check = NULL,
- .state_transition = NULL,
- .register_app = NULL,
- .unregister_app = NULL,
- .app_conn_bind = NULL,
- .debug_packet = ah_debug_packet,
- .timeout_change = NULL, /* ISAKMP */
- .set_state_timeout = NULL,
-};
diff --git a/net/ipv4/ipvs/ip_vs_proto_esp.c b/net/ipv4/ipvs/ip_vs_proto_esp.c
deleted file mode 100644
index c36ccf057a19..000000000000
--- a/net/ipv4/ipvs/ip_vs_proto_esp.c
+++ /dev/null
@@ -1,177 +0,0 @@
-/*
- * ip_vs_proto_esp.c: ESP IPSec load balancing support for IPVS
- *
- * Version: $Id: ip_vs_proto_esp.c,v 1.1 2003/07/04 15:04:37 wensong Exp $
- *
- * Authors: Julian Anastasov <ja@ssi.bg>, February 2002
- * Wensong Zhang <wensong@linuxvirtualserver.org>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * version 2 as published by the Free Software Foundation;
- *
- */
-
-#include <linux/in.h>
-#include <linux/ip.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv4.h>
-
-#include <net/ip_vs.h>
-
-
-/* TODO:
-
-struct isakmp_hdr {
- __u8 icookie[8];
- __u8 rcookie[8];
- __u8 np;
- __u8 version;
- __u8 xchgtype;
- __u8 flags;
- __u32 msgid;
- __u32 length;
-};
-
-*/
-
-#define PORT_ISAKMP 500
-
-
-static struct ip_vs_conn *
-esp_conn_in_get(const struct sk_buff *skb,
- struct ip_vs_protocol *pp,
- const struct iphdr *iph,
- unsigned int proto_off,
- int inverse)
-{
- struct ip_vs_conn *cp;
-
- if (likely(!inverse)) {
- cp = ip_vs_conn_in_get(IPPROTO_UDP,
- iph->saddr,
- __constant_htons(PORT_ISAKMP),
- iph->daddr,
- __constant_htons(PORT_ISAKMP));
- } else {
- cp = ip_vs_conn_in_get(IPPROTO_UDP,
- iph->daddr,
- __constant_htons(PORT_ISAKMP),
- iph->saddr,
- __constant_htons(PORT_ISAKMP));
- }
-
- if (!cp) {
- /*
- * We are not sure if the packet is from our
- * service, so our conn_schedule hook should return NF_ACCEPT
- */
- IP_VS_DBG(12, "Unknown ISAKMP entry for outin packet "
- "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n",
- inverse ? "ICMP+" : "",
- pp->name,
- NIPQUAD(iph->saddr),
- NIPQUAD(iph->daddr));
- }
-
- return cp;
-}
-
-
-static struct ip_vs_conn *
-esp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
- const struct iphdr *iph, unsigned int proto_off, int inverse)
-{
- struct ip_vs_conn *cp;
-
- if (likely(!inverse)) {
- cp = ip_vs_conn_out_get(IPPROTO_UDP,
- iph->saddr,
- __constant_htons(PORT_ISAKMP),
- iph->daddr,
- __constant_htons(PORT_ISAKMP));
- } else {
- cp = ip_vs_conn_out_get(IPPROTO_UDP,
- iph->daddr,
- __constant_htons(PORT_ISAKMP),
- iph->saddr,
- __constant_htons(PORT_ISAKMP));
- }
-
- if (!cp) {
- IP_VS_DBG(12, "Unknown ISAKMP entry for inout packet "
- "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n",
- inverse ? "ICMP+" : "",
- pp->name,
- NIPQUAD(iph->saddr),
- NIPQUAD(iph->daddr));
- }
-
- return cp;
-}
-
-
-static int
-esp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp,
- int *verdict, struct ip_vs_conn **cpp)
-{
- /*
- * ESP is only related traffic. Pass the packet to IP stack.
- */
- *verdict = NF_ACCEPT;
- return 0;
-}
-
-
-static void
-esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb,
- int offset, const char *msg)
-{
- char buf[256];
- struct iphdr _iph, *ih;
-
- ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
- if (ih == NULL)
- sprintf(buf, "%s TRUNCATED", pp->name);
- else
- sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u",
- pp->name, NIPQUAD(ih->saddr),
- NIPQUAD(ih->daddr));
-
- printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
-}
-
-
-static void esp_init(struct ip_vs_protocol *pp)
-{
- /* nothing to do now */
-}
-
-
-static void esp_exit(struct ip_vs_protocol *pp)
-{
- /* nothing to do now */
-}
-
-
-struct ip_vs_protocol ip_vs_protocol_esp = {
- .name = "ESP",
- .protocol = IPPROTO_ESP,
- .dont_defrag = 1,
- .init = esp_init,
- .exit = esp_exit,
- .conn_schedule = esp_conn_schedule,
- .conn_in_get = esp_conn_in_get,
- .conn_out_get = esp_conn_out_get,
- .snat_handler = NULL,
- .dnat_handler = NULL,
- .csum_check = NULL,
- .state_transition = NULL,
- .register_app = NULL,
- .unregister_app = NULL,
- .app_conn_bind = NULL,
- .debug_packet = esp_debug_packet,
- .timeout_change = NULL, /* ISAKMP */
-};
diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c
deleted file mode 100644
index 16a9ebee2fe6..000000000000
--- a/net/ipv4/ipvs/ip_vs_proto_tcp.c
+++ /dev/null
@@ -1,620 +0,0 @@
-/*
- * ip_vs_proto_tcp.c: TCP load balancing support for IPVS
- *
- * Version: $Id: ip_vs_proto_tcp.c,v 1.3 2002/11/30 01:50:35 wensong Exp $
- *
- * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
- * Julian Anastasov <ja@ssi.bg>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * Changes:
- *
- */
-
-#include <linux/kernel.h>
-#include <linux/ip.h>
-#include <linux/tcp.h> /* for tcphdr */
-#include <net/ip.h>
-#include <net/tcp.h> /* for csum_tcpudp_magic */
-#include <linux/netfilter_ipv4.h>
-
-#include <net/ip_vs.h>
-
-
-static struct ip_vs_conn *
-tcp_conn_in_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
- const struct iphdr *iph, unsigned int proto_off, int inverse)
-{
- __be16 _ports[2], *pptr;
-
- pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
- if (pptr == NULL)
- return NULL;
-
- if (likely(!inverse)) {
- return ip_vs_conn_in_get(iph->protocol,
- iph->saddr, pptr[0],
- iph->daddr, pptr[1]);
- } else {
- return ip_vs_conn_in_get(iph->protocol,
- iph->daddr, pptr[1],
- iph->saddr, pptr[0]);
- }
-}
-
-static struct ip_vs_conn *
-tcp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
- const struct iphdr *iph, unsigned int proto_off, int inverse)
-{
- __be16 _ports[2], *pptr;
-
- pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
- if (pptr == NULL)
- return NULL;
-
- if (likely(!inverse)) {
- return ip_vs_conn_out_get(iph->protocol,
- iph->saddr, pptr[0],
- iph->daddr, pptr[1]);
- } else {
- return ip_vs_conn_out_get(iph->protocol,
- iph->daddr, pptr[1],
- iph->saddr, pptr[0]);
- }
-}
-
-
-static int
-tcp_conn_schedule(struct sk_buff *skb,
- struct ip_vs_protocol *pp,
- int *verdict, struct ip_vs_conn **cpp)
-{
- struct ip_vs_service *svc;
- struct tcphdr _tcph, *th;
-
- th = skb_header_pointer(skb, skb->nh.iph->ihl*4,
- sizeof(_tcph), &_tcph);
- if (th == NULL) {
- *verdict = NF_DROP;
- return 0;
- }
-
- if (th->syn &&
- (svc = ip_vs_service_get(skb->mark, skb->nh.iph->protocol,
- skb->nh.iph->daddr, th->dest))) {
- if (ip_vs_todrop()) {
- /*
- * It seems that we are very loaded.
- * We have to drop this packet :(
- */
- ip_vs_service_put(svc);
- *verdict = NF_DROP;
- return 0;
- }
-
- /*
- * Let the virtual server select a real server for the
- * incoming connection, and create a connection entry.
- */
- *cpp = ip_vs_schedule(svc, skb);
- if (!*cpp) {
- *verdict = ip_vs_leave(svc, skb, pp);
- return 0;
- }
- ip_vs_service_put(svc);
- }
- return 1;
-}
-
-
-static inline void
-tcp_fast_csum_update(struct tcphdr *tcph, __be32 oldip, __be32 newip,
- __be16 oldport, __be16 newport)
-{
- tcph->check =
- csum_fold(ip_vs_check_diff4(oldip, newip,
- ip_vs_check_diff2(oldport, newport,
- ~csum_unfold(tcph->check))));
-}
-
-
-static int
-tcp_snat_handler(struct sk_buff **pskb,
- struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
-{
- struct tcphdr *tcph;
- unsigned int tcphoff = (*pskb)->nh.iph->ihl * 4;
-
- /* csum_check requires unshared skb */
- if (!ip_vs_make_skb_writable(pskb, tcphoff+sizeof(*tcph)))
- return 0;
-
- if (unlikely(cp->app != NULL)) {
- /* Some checks before mangling */
- if (pp->csum_check && !pp->csum_check(*pskb, pp))
- return 0;
-
- /* Call application helper if needed */
- if (!ip_vs_app_pkt_out(cp, pskb))
- return 0;
- }
-
- tcph = (void *)(*pskb)->nh.iph + tcphoff;
- tcph->source = cp->vport;
-
- /* Adjust TCP checksums */
- if (!cp->app) {
- /* Only port and addr are changed, do fast csum update */
- tcp_fast_csum_update(tcph, cp->daddr, cp->vaddr,
- cp->dport, cp->vport);
- if ((*pskb)->ip_summed == CHECKSUM_COMPLETE)
- (*pskb)->ip_summed = CHECKSUM_NONE;
- } else {
- /* full checksum calculation */
- tcph->check = 0;
- (*pskb)->csum = skb_checksum(*pskb, tcphoff,
- (*pskb)->len - tcphoff, 0);
- tcph->check = csum_tcpudp_magic(cp->vaddr, cp->caddr,
- (*pskb)->len - tcphoff,
- cp->protocol,
- (*pskb)->csum);
- IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
- pp->name, tcph->check,
- (char*)&(tcph->check) - (char*)tcph);
- }
- return 1;
-}
-
-
-static int
-tcp_dnat_handler(struct sk_buff **pskb,
- struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
-{
- struct tcphdr *tcph;
- unsigned int tcphoff = (*pskb)->nh.iph->ihl * 4;
-
- /* csum_check requires unshared skb */
- if (!ip_vs_make_skb_writable(pskb, tcphoff+sizeof(*tcph)))
- return 0;
-
- if (unlikely(cp->app != NULL)) {
- /* Some checks before mangling */
- if (pp->csum_check && !pp->csum_check(*pskb, pp))
- return 0;
-
- /*
- * Attempt ip_vs_app call.
- * It will fix ip_vs_conn and iph ack_seq stuff
- */
- if (!ip_vs_app_pkt_in(cp, pskb))
- return 0;
- }
-
- tcph = (void *)(*pskb)->nh.iph + tcphoff;
- tcph->dest = cp->dport;
-
- /*
- * Adjust TCP checksums
- */
- if (!cp->app) {
- /* Only port and addr are changed, do fast csum update */
- tcp_fast_csum_update(tcph, cp->vaddr, cp->daddr,
- cp->vport, cp->dport);
- if ((*pskb)->ip_summed == CHECKSUM_COMPLETE)
- (*pskb)->ip_summed = CHECKSUM_NONE;
- } else {
- /* full checksum calculation */
- tcph->check = 0;
- (*pskb)->csum = skb_checksum(*pskb, tcphoff,
- (*pskb)->len - tcphoff, 0);
- tcph->check = csum_tcpudp_magic(cp->caddr, cp->daddr,
- (*pskb)->len - tcphoff,
- cp->protocol,
- (*pskb)->csum);
- (*pskb)->ip_summed = CHECKSUM_UNNECESSARY;
- }
- return 1;
-}
-
-
-static int
-tcp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp)
-{
- unsigned int tcphoff = skb->nh.iph->ihl*4;
-
- switch (skb->ip_summed) {
- case CHECKSUM_NONE:
- skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
- case CHECKSUM_COMPLETE:
- if (csum_tcpudp_magic(skb->nh.iph->saddr, skb->nh.iph->daddr,
- skb->len - tcphoff,
- skb->nh.iph->protocol, skb->csum)) {
- IP_VS_DBG_RL_PKT(0, pp, skb, 0,
- "Failed checksum for");
- return 0;
- }
- break;
- default:
- /* No need to checksum. */
- break;
- }
-
- return 1;
-}
-
-
-#define TCP_DIR_INPUT 0
-#define TCP_DIR_OUTPUT 4
-#define TCP_DIR_INPUT_ONLY 8
-
-static const int tcp_state_off[IP_VS_DIR_LAST] = {
- [IP_VS_DIR_INPUT] = TCP_DIR_INPUT,
- [IP_VS_DIR_OUTPUT] = TCP_DIR_OUTPUT,
- [IP_VS_DIR_INPUT_ONLY] = TCP_DIR_INPUT_ONLY,
-};
-
-/*
- * Timeout table[state]
- */
-static int tcp_timeouts[IP_VS_TCP_S_LAST+1] = {
- [IP_VS_TCP_S_NONE] = 2*HZ,
- [IP_VS_TCP_S_ESTABLISHED] = 15*60*HZ,
- [IP_VS_TCP_S_SYN_SENT] = 2*60*HZ,
- [IP_VS_TCP_S_SYN_RECV] = 1*60*HZ,
- [IP_VS_TCP_S_FIN_WAIT] = 2*60*HZ,
- [IP_VS_TCP_S_TIME_WAIT] = 2*60*HZ,
- [IP_VS_TCP_S_CLOSE] = 10*HZ,
- [IP_VS_TCP_S_CLOSE_WAIT] = 60*HZ,
- [IP_VS_TCP_S_LAST_ACK] = 30*HZ,
- [IP_VS_TCP_S_LISTEN] = 2*60*HZ,
- [IP_VS_TCP_S_SYNACK] = 120*HZ,
- [IP_VS_TCP_S_LAST] = 2*HZ,
-};
-
-static char * tcp_state_name_table[IP_VS_TCP_S_LAST+1] = {
- [IP_VS_TCP_S_NONE] = "NONE",
- [IP_VS_TCP_S_ESTABLISHED] = "ESTABLISHED",
- [IP_VS_TCP_S_SYN_SENT] = "SYN_SENT",
- [IP_VS_TCP_S_SYN_RECV] = "SYN_RECV",
- [IP_VS_TCP_S_FIN_WAIT] = "FIN_WAIT",
- [IP_VS_TCP_S_TIME_WAIT] = "TIME_WAIT",
- [IP_VS_TCP_S_CLOSE] = "CLOSE",
- [IP_VS_TCP_S_CLOSE_WAIT] = "CLOSE_WAIT",
- [IP_VS_TCP_S_LAST_ACK] = "LAST_ACK",
- [IP_VS_TCP_S_LISTEN] = "LISTEN",
- [IP_VS_TCP_S_SYNACK] = "SYNACK",
- [IP_VS_TCP_S_LAST] = "BUG!",
-};
-
-#define sNO IP_VS_TCP_S_NONE
-#define sES IP_VS_TCP_S_ESTABLISHED
-#define sSS IP_VS_TCP_S_SYN_SENT
-#define sSR IP_VS_TCP_S_SYN_RECV
-#define sFW IP_VS_TCP_S_FIN_WAIT
-#define sTW IP_VS_TCP_S_TIME_WAIT
-#define sCL IP_VS_TCP_S_CLOSE
-#define sCW IP_VS_TCP_S_CLOSE_WAIT
-#define sLA IP_VS_TCP_S_LAST_ACK
-#define sLI IP_VS_TCP_S_LISTEN
-#define sSA IP_VS_TCP_S_SYNACK
-
-struct tcp_states_t {
- int next_state[IP_VS_TCP_S_LAST];
-};
-
-static const char * tcp_state_name(int state)
-{
- if (state >= IP_VS_TCP_S_LAST)
- return "ERR!";
- return tcp_state_name_table[state] ? tcp_state_name_table[state] : "?";
-}
-
-static struct tcp_states_t tcp_states [] = {
-/* INPUT */
-/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
-/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
-/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }},
-/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
-/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }},
-
-/* OUTPUT */
-/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
-/*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }},
-/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
-/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
-/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
-
-/* INPUT-ONLY */
-/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
-/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
-/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
-/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
-/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
-};
-
-static struct tcp_states_t tcp_states_dos [] = {
-/* INPUT */
-/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
-/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }},
-/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }},
-/*ack*/ {{sCL, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }},
-/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
-
-/* OUTPUT */
-/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
-/*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }},
-/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
-/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
-/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
-
-/* INPUT-ONLY */
-/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
-/*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }},
-/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
-/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
-/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
-};
-
-static struct tcp_states_t *tcp_state_table = tcp_states;
-
-
-static void tcp_timeout_change(struct ip_vs_protocol *pp, int flags)
-{
- int on = (flags & 1); /* secure_tcp */
-
- /*
- ** FIXME: change secure_tcp to independent sysctl var
- ** or make it per-service or per-app because it is valid
- ** for most if not for all of the applications. Something
- ** like "capabilities" (flags) for each object.
- */
- tcp_state_table = (on? tcp_states_dos : tcp_states);
-}
-
-static int
-tcp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
-{
- return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_TCP_S_LAST,
- tcp_state_name_table, sname, to);
-}
-
-static inline int tcp_state_idx(struct tcphdr *th)
-{
- if (th->rst)
- return 3;
- if (th->syn)
- return 0;
- if (th->fin)
- return 1;
- if (th->ack)
- return 2;
- return -1;
-}
-
-static inline void
-set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
- int direction, struct tcphdr *th)
-{
- int state_idx;
- int new_state = IP_VS_TCP_S_CLOSE;
- int state_off = tcp_state_off[direction];
-
- /*
- * Update state offset to INPUT_ONLY if necessary
- * or delete NO_OUTPUT flag if output packet detected
- */
- if (cp->flags & IP_VS_CONN_F_NOOUTPUT) {
- if (state_off == TCP_DIR_OUTPUT)
- cp->flags &= ~IP_VS_CONN_F_NOOUTPUT;
- else
- state_off = TCP_DIR_INPUT_ONLY;
- }
-
- if ((state_idx = tcp_state_idx(th)) < 0) {
- IP_VS_DBG(8, "tcp_state_idx=%d!!!\n", state_idx);
- goto tcp_state_out;
- }
-
- new_state = tcp_state_table[state_off+state_idx].next_state[cp->state];
-
- tcp_state_out:
- if (new_state != cp->state) {
- struct ip_vs_dest *dest = cp->dest;
-
- IP_VS_DBG(8, "%s %s [%c%c%c%c] %u.%u.%u.%u:%d->"
- "%u.%u.%u.%u:%d state: %s->%s conn->refcnt:%d\n",
- pp->name,
- (state_off==TCP_DIR_OUTPUT)?"output ":"input ",
- th->syn? 'S' : '.',
- th->fin? 'F' : '.',
- th->ack? 'A' : '.',
- th->rst? 'R' : '.',
- NIPQUAD(cp->daddr), ntohs(cp->dport),
- NIPQUAD(cp->caddr), ntohs(cp->cport),
- tcp_state_name(cp->state),
- tcp_state_name(new_state),
- atomic_read(&cp->refcnt));
- if (dest) {
- if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
- (new_state != IP_VS_TCP_S_ESTABLISHED)) {
- atomic_dec(&dest->activeconns);
- atomic_inc(&dest->inactconns);
- cp->flags |= IP_VS_CONN_F_INACTIVE;
- } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
- (new_state == IP_VS_TCP_S_ESTABLISHED)) {
- atomic_inc(&dest->activeconns);
- atomic_dec(&dest->inactconns);
- cp->flags &= ~IP_VS_CONN_F_INACTIVE;
- }
- }
- }
-
- cp->timeout = pp->timeout_table[cp->state = new_state];
-}
-
-
-/*
- * Handle state transitions
- */
-static int
-tcp_state_transition(struct ip_vs_conn *cp, int direction,
- const struct sk_buff *skb,
- struct ip_vs_protocol *pp)
-{
- struct tcphdr _tcph, *th;
-
- th = skb_header_pointer(skb, skb->nh.iph->ihl*4,
- sizeof(_tcph), &_tcph);
- if (th == NULL)
- return 0;
-
- spin_lock(&cp->lock);
- set_tcp_state(pp, cp, direction, th);
- spin_unlock(&cp->lock);
-
- return 1;
-}
-
-
-/*
- * Hash table for TCP application incarnations
- */
-#define TCP_APP_TAB_BITS 4
-#define TCP_APP_TAB_SIZE (1 << TCP_APP_TAB_BITS)
-#define TCP_APP_TAB_MASK (TCP_APP_TAB_SIZE - 1)
-
-static struct list_head tcp_apps[TCP_APP_TAB_SIZE];
-static DEFINE_SPINLOCK(tcp_app_lock);
-
-static inline __u16 tcp_app_hashkey(__be16 port)
-{
- return (((__force u16)port >> TCP_APP_TAB_BITS) ^ (__force u16)port)
- & TCP_APP_TAB_MASK;
-}
-
-
-static int tcp_register_app(struct ip_vs_app *inc)
-{
- struct ip_vs_app *i;
- __u16 hash;
- __be16 port = inc->port;
- int ret = 0;
-
- hash = tcp_app_hashkey(port);
-
- spin_lock_bh(&tcp_app_lock);
- list_for_each_entry(i, &tcp_apps[hash], p_list) {
- if (i->port == port) {
- ret = -EEXIST;
- goto out;
- }
- }
- list_add(&inc->p_list, &tcp_apps[hash]);
- atomic_inc(&ip_vs_protocol_tcp.appcnt);
-
- out:
- spin_unlock_bh(&tcp_app_lock);
- return ret;
-}
-
-
-static void
-tcp_unregister_app(struct ip_vs_app *inc)
-{
- spin_lock_bh(&tcp_app_lock);
- atomic_dec(&ip_vs_protocol_tcp.appcnt);
- list_del(&inc->p_list);
- spin_unlock_bh(&tcp_app_lock);
-}
-
-
-static int
-tcp_app_conn_bind(struct ip_vs_conn *cp)
-{
- int hash;
- struct ip_vs_app *inc;
- int result = 0;
-
- /* Default binding: bind app only for NAT */
- if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
- return 0;
-
- /* Lookup application incarnations and bind the right one */
- hash = tcp_app_hashkey(cp->vport);
-
- spin_lock(&tcp_app_lock);
- list_for_each_entry(inc, &tcp_apps[hash], p_list) {
- if (inc->port == cp->vport) {
- if (unlikely(!ip_vs_app_inc_get(inc)))
- break;
- spin_unlock(&tcp_app_lock);
-
- IP_VS_DBG(9, "%s: Binding conn %u.%u.%u.%u:%u->"
- "%u.%u.%u.%u:%u to app %s on port %u\n",
- __FUNCTION__,
- NIPQUAD(cp->caddr), ntohs(cp->cport),
- NIPQUAD(cp->vaddr), ntohs(cp->vport),
- inc->name, ntohs(inc->port));
- cp->app = inc;
- if (inc->init_conn)
- result = inc->init_conn(inc, cp);
- goto out;
- }
- }
- spin_unlock(&tcp_app_lock);
-
- out:
- return result;
-}
-
-
-/*
- * Set LISTEN timeout. (ip_vs_conn_put will setup timer)
- */
-void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp)
-{
- spin_lock(&cp->lock);
- cp->state = IP_VS_TCP_S_LISTEN;
- cp->timeout = ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_LISTEN];
- spin_unlock(&cp->lock);
-}
-
-
-static void ip_vs_tcp_init(struct ip_vs_protocol *pp)
-{
- IP_VS_INIT_HASH_TABLE(tcp_apps);
- pp->timeout_table = tcp_timeouts;
-}
-
-
-static void ip_vs_tcp_exit(struct ip_vs_protocol *pp)
-{
-}
-
-
-struct ip_vs_protocol ip_vs_protocol_tcp = {
- .name = "TCP",
- .protocol = IPPROTO_TCP,
- .dont_defrag = 0,
- .appcnt = ATOMIC_INIT(0),
- .init = ip_vs_tcp_init,
- .exit = ip_vs_tcp_exit,
- .register_app = tcp_register_app,
- .unregister_app = tcp_unregister_app,
- .conn_schedule = tcp_conn_schedule,
- .conn_in_get = tcp_conn_in_get,
- .conn_out_get = tcp_conn_out_get,
- .snat_handler = tcp_snat_handler,
- .dnat_handler = tcp_dnat_handler,
- .csum_check = tcp_csum_check,
- .state_name = tcp_state_name,
- .state_transition = tcp_state_transition,
- .app_conn_bind = tcp_app_conn_bind,
- .debug_packet = ip_vs_tcpudp_debug_packet,
- .timeout_change = tcp_timeout_change,
- .set_state_timeout = tcp_set_state_timeout,
-};
diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c
deleted file mode 100644
index 03f0a414cfa4..000000000000
--- a/net/ipv4/ipvs/ip_vs_proto_udp.c
+++ /dev/null
@@ -1,432 +0,0 @@
-/*
- * ip_vs_proto_udp.c: UDP load balancing support for IPVS
- *
- * Version: $Id: ip_vs_proto_udp.c,v 1.3 2002/11/30 01:50:35 wensong Exp $
- *
- * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
- * Julian Anastasov <ja@ssi.bg>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * Changes:
- *
- */
-
-#include <linux/in.h>
-#include <linux/ip.h>
-#include <linux/kernel.h>
-#include <linux/netfilter_ipv4.h>
-#include <linux/udp.h>
-
-#include <net/ip_vs.h>
-
-
-static struct ip_vs_conn *
-udp_conn_in_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
- const struct iphdr *iph, unsigned int proto_off, int inverse)
-{
- struct ip_vs_conn *cp;
- __be16 _ports[2], *pptr;
-
- pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
- if (pptr == NULL)
- return NULL;
-
- if (likely(!inverse)) {
- cp = ip_vs_conn_in_get(iph->protocol,
- iph->saddr, pptr[0],
- iph->daddr, pptr[1]);
- } else {
- cp = ip_vs_conn_in_get(iph->protocol,
- iph->daddr, pptr[1],
- iph->saddr, pptr[0]);
- }
-
- return cp;
-}
-
-
-static struct ip_vs_conn *
-udp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
- const struct iphdr *iph, unsigned int proto_off, int inverse)
-{
- struct ip_vs_conn *cp;
- __be16 _ports[2], *pptr;
-
- pptr = skb_header_pointer(skb, skb->nh.iph->ihl*4,
- sizeof(_ports), _ports);
- if (pptr == NULL)
- return NULL;
-
- if (likely(!inverse)) {
- cp = ip_vs_conn_out_get(iph->protocol,
- iph->saddr, pptr[0],
- iph->daddr, pptr[1]);
- } else {
- cp = ip_vs_conn_out_get(iph->protocol,
- iph->daddr, pptr[1],
- iph->saddr, pptr[0]);
- }
-
- return cp;
-}
-
-
-static int
-udp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp,
- int *verdict, struct ip_vs_conn **cpp)
-{
- struct ip_vs_service *svc;
- struct udphdr _udph, *uh;
-
- uh = skb_header_pointer(skb, skb->nh.iph->ihl*4,
- sizeof(_udph), &_udph);
- if (uh == NULL) {
- *verdict = NF_DROP;
- return 0;
- }
-
- if ((svc = ip_vs_service_get(skb->mark, skb->nh.iph->protocol,
- skb->nh.iph->daddr, uh->dest))) {
- if (ip_vs_todrop()) {
- /*
- * It seems that we are very loaded.
- * We have to drop this packet :(
- */
- ip_vs_service_put(svc);
- *verdict = NF_DROP;
- return 0;
- }
-
- /*
- * Let the virtual server select a real server for the
- * incoming connection, and create a connection entry.
- */
- *cpp = ip_vs_schedule(svc, skb);
- if (!*cpp) {
- *verdict = ip_vs_leave(svc, skb, pp);
- return 0;
- }
- ip_vs_service_put(svc);
- }
- return 1;
-}
-
-
-static inline void
-udp_fast_csum_update(struct udphdr *uhdr, __be32 oldip, __be32 newip,
- __be16 oldport, __be16 newport)
-{
- uhdr->check =
- csum_fold(ip_vs_check_diff4(oldip, newip,
- ip_vs_check_diff2(oldport, newport,
- ~csum_unfold(uhdr->check))));
- if (!uhdr->check)
- uhdr->check = CSUM_MANGLED_0;
-}
-
-static int
-udp_snat_handler(struct sk_buff **pskb,
- struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
-{
- struct udphdr *udph;
- unsigned int udphoff = (*pskb)->nh.iph->ihl * 4;
-
- /* csum_check requires unshared skb */
- if (!ip_vs_make_skb_writable(pskb, udphoff+sizeof(*udph)))
- return 0;
-
- if (unlikely(cp->app != NULL)) {
- /* Some checks before mangling */
- if (pp->csum_check && !pp->csum_check(*pskb, pp))
- return 0;
-
- /*
- * Call application helper if needed
- */
- if (!ip_vs_app_pkt_out(cp, pskb))
- return 0;
- }
-
- udph = (void *)(*pskb)->nh.iph + udphoff;
- udph->source = cp->vport;
-
- /*
- * Adjust UDP checksums
- */
- if (!cp->app && (udph->check != 0)) {
- /* Only port and addr are changed, do fast csum update */
- udp_fast_csum_update(udph, cp->daddr, cp->vaddr,
- cp->dport, cp->vport);
- if ((*pskb)->ip_summed == CHECKSUM_COMPLETE)
- (*pskb)->ip_summed = CHECKSUM_NONE;
- } else {
- /* full checksum calculation */
- udph->check = 0;
- (*pskb)->csum = skb_checksum(*pskb, udphoff,
- (*pskb)->len - udphoff, 0);
- udph->check = csum_tcpudp_magic(cp->vaddr, cp->caddr,
- (*pskb)->len - udphoff,
- cp->protocol,
- (*pskb)->csum);
- if (udph->check == 0)
- udph->check = CSUM_MANGLED_0;
- IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
- pp->name, udph->check,
- (char*)&(udph->check) - (char*)udph);
- }
- return 1;
-}
-
-
-static int
-udp_dnat_handler(struct sk_buff **pskb,
- struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
-{
- struct udphdr *udph;
- unsigned int udphoff = (*pskb)->nh.iph->ihl * 4;
-
- /* csum_check requires unshared skb */
- if (!ip_vs_make_skb_writable(pskb, udphoff+sizeof(*udph)))
- return 0;
-
- if (unlikely(cp->app != NULL)) {
- /* Some checks before mangling */
- if (pp->csum_check && !pp->csum_check(*pskb, pp))
- return 0;
-
- /*
- * Attempt ip_vs_app call.
- * It will fix ip_vs_conn
- */
- if (!ip_vs_app_pkt_in(cp, pskb))
- return 0;
- }
-
- udph = (void *)(*pskb)->nh.iph + udphoff;
- udph->dest = cp->dport;
-
- /*
- * Adjust UDP checksums
- */
- if (!cp->app && (udph->check != 0)) {
- /* Only port and addr are changed, do fast csum update */
- udp_fast_csum_update(udph, cp->vaddr, cp->daddr,
- cp->vport, cp->dport);
- if ((*pskb)->ip_summed == CHECKSUM_COMPLETE)
- (*pskb)->ip_summed = CHECKSUM_NONE;
- } else {
- /* full checksum calculation */
- udph->check = 0;
- (*pskb)->csum = skb_checksum(*pskb, udphoff,
- (*pskb)->len - udphoff, 0);
- udph->check = csum_tcpudp_magic(cp->caddr, cp->daddr,
- (*pskb)->len - udphoff,
- cp->protocol,
- (*pskb)->csum);
- if (udph->check == 0)
- udph->check = CSUM_MANGLED_0;
- (*pskb)->ip_summed = CHECKSUM_UNNECESSARY;
- }
- return 1;
-}
-
-
-static int
-udp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp)
-{
- struct udphdr _udph, *uh;
- unsigned int udphoff = skb->nh.iph->ihl*4;
-
- uh = skb_header_pointer(skb, udphoff, sizeof(_udph), &_udph);
- if (uh == NULL)
- return 0;
-
- if (uh->check != 0) {
- switch (skb->ip_summed) {
- case CHECKSUM_NONE:
- skb->csum = skb_checksum(skb, udphoff,
- skb->len - udphoff, 0);
- case CHECKSUM_COMPLETE:
- if (csum_tcpudp_magic(skb->nh.iph->saddr,
- skb->nh.iph->daddr,
- skb->len - udphoff,
- skb->nh.iph->protocol,
- skb->csum)) {
- IP_VS_DBG_RL_PKT(0, pp, skb, 0,
- "Failed checksum for");
- return 0;
- }
- break;
- default:
- /* No need to checksum. */
- break;
- }
- }
- return 1;
-}
-
-
-/*
- * Note: the caller guarantees that only one of register_app,
- * unregister_app or app_conn_bind is called each time.
- */
-
-#define UDP_APP_TAB_BITS 4
-#define UDP_APP_TAB_SIZE (1 << UDP_APP_TAB_BITS)
-#define UDP_APP_TAB_MASK (UDP_APP_TAB_SIZE - 1)
-
-static struct list_head udp_apps[UDP_APP_TAB_SIZE];
-static DEFINE_SPINLOCK(udp_app_lock);
-
-static inline __u16 udp_app_hashkey(__be16 port)
-{
- return (((__force u16)port >> UDP_APP_TAB_BITS) ^ (__force u16)port)
- & UDP_APP_TAB_MASK;
-}
-
-
-static int udp_register_app(struct ip_vs_app *inc)
-{
- struct ip_vs_app *i;
- __u16 hash;
- __be16 port = inc->port;
- int ret = 0;
-
- hash = udp_app_hashkey(port);
-
-
- spin_lock_bh(&udp_app_lock);
- list_for_each_entry(i, &udp_apps[hash], p_list) {
- if (i->port == port) {
- ret = -EEXIST;
- goto out;
- }
- }
- list_add(&inc->p_list, &udp_apps[hash]);
- atomic_inc(&ip_vs_protocol_udp.appcnt);
-
- out:
- spin_unlock_bh(&udp_app_lock);
- return ret;
-}
-
-
-static void
-udp_unregister_app(struct ip_vs_app *inc)
-{
- spin_lock_bh(&udp_app_lock);
- atomic_dec(&ip_vs_protocol_udp.appcnt);
- list_del(&inc->p_list);
- spin_unlock_bh(&udp_app_lock);
-}
-
-
-static int udp_app_conn_bind(struct ip_vs_conn *cp)
-{
- int hash;
- struct ip_vs_app *inc;
- int result = 0;
-
- /* Default binding: bind app only for NAT */
- if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
- return 0;
-
- /* Lookup application incarnations and bind the right one */
- hash = udp_app_hashkey(cp->vport);
-
- spin_lock(&udp_app_lock);
- list_for_each_entry(inc, &udp_apps[hash], p_list) {
- if (inc->port == cp->vport) {
- if (unlikely(!ip_vs_app_inc_get(inc)))
- break;
- spin_unlock(&udp_app_lock);
-
- IP_VS_DBG(9, "%s: Binding conn %u.%u.%u.%u:%u->"
- "%u.%u.%u.%u:%u to app %s on port %u\n",
- __FUNCTION__,
- NIPQUAD(cp->caddr), ntohs(cp->cport),
- NIPQUAD(cp->vaddr), ntohs(cp->vport),
- inc->name, ntohs(inc->port));
- cp->app = inc;
- if (inc->init_conn)
- result = inc->init_conn(inc, cp);
- goto out;
- }
- }
- spin_unlock(&udp_app_lock);
-
- out:
- return result;
-}
-
-
-static int udp_timeouts[IP_VS_UDP_S_LAST+1] = {
- [IP_VS_UDP_S_NORMAL] = 5*60*HZ,
- [IP_VS_UDP_S_LAST] = 2*HZ,
-};
-
-static char * udp_state_name_table[IP_VS_UDP_S_LAST+1] = {
- [IP_VS_UDP_S_NORMAL] = "UDP",
- [IP_VS_UDP_S_LAST] = "BUG!",
-};
-
-
-static int
-udp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
-{
- return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_UDP_S_LAST,
- udp_state_name_table, sname, to);
-}
-
-static const char * udp_state_name(int state)
-{
- if (state >= IP_VS_UDP_S_LAST)
- return "ERR!";
- return udp_state_name_table[state] ? udp_state_name_table[state] : "?";
-}
-
-static int
-udp_state_transition(struct ip_vs_conn *cp, int direction,
- const struct sk_buff *skb,
- struct ip_vs_protocol *pp)
-{
- cp->timeout = pp->timeout_table[IP_VS_UDP_S_NORMAL];
- return 1;
-}
-
-static void udp_init(struct ip_vs_protocol *pp)
-{
- IP_VS_INIT_HASH_TABLE(udp_apps);
- pp->timeout_table = udp_timeouts;
-}
-
-static void udp_exit(struct ip_vs_protocol *pp)
-{
-}
-
-
-struct ip_vs_protocol ip_vs_protocol_udp = {
- .name = "UDP",
- .protocol = IPPROTO_UDP,
- .dont_defrag = 0,
- .init = udp_init,
- .exit = udp_exit,
- .conn_schedule = udp_conn_schedule,
- .conn_in_get = udp_conn_in_get,
- .conn_out_get = udp_conn_out_get,
- .snat_handler = udp_snat_handler,
- .dnat_handler = udp_dnat_handler,
- .csum_check = udp_csum_check,
- .state_transition = udp_state_transition,
- .state_name = udp_state_name,
- .register_app = udp_register_app,
- .unregister_app = udp_unregister_app,
- .app_conn_bind = udp_app_conn_bind,
- .debug_packet = ip_vs_tcpudp_debug_packet,
- .timeout_change = NULL,
- .set_state_timeout = udp_set_state_timeout,
-};
diff --git a/net/ipv4/ipvs/ip_vs_rr.c b/net/ipv4/ipvs/ip_vs_rr.c
deleted file mode 100644
index b23bab231cab..000000000000
--- a/net/ipv4/ipvs/ip_vs_rr.c
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- * IPVS: Round-Robin Scheduling module
- *
- * Version: $Id: ip_vs_rr.c,v 1.9 2002/09/15 08:14:08 wensong Exp $
- *
- * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
- * Peter Kese <peter.kese@ijs.si>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * Fixes/Changes:
- * Wensong Zhang : changed the ip_vs_rr_schedule to return dest
- * Julian Anastasov : fixed the NULL pointer access bug in debugging
- * Wensong Zhang : changed some comestics things for debugging
- * Wensong Zhang : changed for the d-linked destination list
- * Wensong Zhang : added the ip_vs_rr_update_svc
- * Wensong Zhang : added any dest with weight=0 is quiesced
- *
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-
-#include <net/ip_vs.h>
-
-
-static int ip_vs_rr_init_svc(struct ip_vs_service *svc)
-{
- svc->sched_data = &svc->destinations;
- return 0;
-}
-
-
-static int ip_vs_rr_done_svc(struct ip_vs_service *svc)
-{
- return 0;
-}
-
-
-static int ip_vs_rr_update_svc(struct ip_vs_service *svc)
-{
- svc->sched_data = &svc->destinations;
- return 0;
-}
-
-
-/*
- * Round-Robin Scheduling
- */
-static struct ip_vs_dest *
-ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
-{
- struct list_head *p, *q;
- struct ip_vs_dest *dest;
-
- IP_VS_DBG(6, "ip_vs_rr_schedule(): Scheduling...\n");
-
- write_lock(&svc->sched_lock);
- p = (struct list_head *)svc->sched_data;
- p = p->next;
- q = p;
- do {
- /* skip list head */
- if (q == &svc->destinations) {
- q = q->next;
- continue;
- }
-
- dest = list_entry(q, struct ip_vs_dest, n_list);
- if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
- atomic_read(&dest->weight) > 0)
- /* HIT */
- goto out;
- q = q->next;
- } while (q != p);
- write_unlock(&svc->sched_lock);
- return NULL;
-
- out:
- svc->sched_data = q;
- write_unlock(&svc->sched_lock);
- IP_VS_DBG(6, "RR: server %u.%u.%u.%u:%u "
- "activeconns %d refcnt %d weight %d\n",
- NIPQUAD(dest->addr), ntohs(dest->port),
- atomic_read(&dest->activeconns),
- atomic_read(&dest->refcnt), atomic_read(&dest->weight));
-
- return dest;
-}
-
-
-static struct ip_vs_scheduler ip_vs_rr_scheduler = {
- .name = "rr", /* name */
- .refcnt = ATOMIC_INIT(0),
- .module = THIS_MODULE,
- .init_service = ip_vs_rr_init_svc,
- .done_service = ip_vs_rr_done_svc,
- .update_service = ip_vs_rr_update_svc,
- .schedule = ip_vs_rr_schedule,
-};
-
-static int __init ip_vs_rr_init(void)
-{
- INIT_LIST_HEAD(&ip_vs_rr_scheduler.n_list);
- return register_ip_vs_scheduler(&ip_vs_rr_scheduler);
-}
-
-static void __exit ip_vs_rr_cleanup(void)
-{
- unregister_ip_vs_scheduler(&ip_vs_rr_scheduler);
-}
-
-module_init(ip_vs_rr_init);
-module_exit(ip_vs_rr_cleanup);
-MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_sh.c b/net/ipv4/ipvs/ip_vs_sh.c
deleted file mode 100644
index 338668f88fe2..000000000000
--- a/net/ipv4/ipvs/ip_vs_sh.c
+++ /dev/null
@@ -1,257 +0,0 @@
-/*
- * IPVS: Source Hashing scheduling module
- *
- * Version: $Id: ip_vs_sh.c,v 1.5 2002/09/15 08:14:08 wensong Exp $
- *
- * Authors: Wensong Zhang <wensong@gnuchina.org>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * Changes:
- *
- */
-
-/*
- * The sh algorithm is to select server by the hash key of source IP
- * address. The pseudo code is as follows:
- *
- * n <- servernode[src_ip];
- * if (n is dead) OR
- * (n is overloaded) or (n.weight <= 0) then
- * return NULL;
- *
- * return n;
- *
- * Notes that servernode is a 256-bucket hash table that maps the hash
- * index derived from packet source IP address to the current server
- * array. If the sh scheduler is used in cache cluster, it is good to
- * combine it with cache_bypass feature. When the statically assigned
- * server is dead or overloaded, the load balancer can bypass the cache
- * server and send requests to the original server directly.
- *
- */
-
-#include <linux/ip.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/skbuff.h>
-
-#include <net/ip_vs.h>
-
-
-/*
- * IPVS SH bucket
- */
-struct ip_vs_sh_bucket {
- struct ip_vs_dest *dest; /* real server (cache) */
-};
-
-/*
- * for IPVS SH entry hash table
- */
-#ifndef CONFIG_IP_VS_SH_TAB_BITS
-#define CONFIG_IP_VS_SH_TAB_BITS 8
-#endif
-#define IP_VS_SH_TAB_BITS CONFIG_IP_VS_SH_TAB_BITS
-#define IP_VS_SH_TAB_SIZE (1 << IP_VS_SH_TAB_BITS)
-#define IP_VS_SH_TAB_MASK (IP_VS_SH_TAB_SIZE - 1)
-
-
-/*
- * Returns hash value for IPVS SH entry
- */
-static inline unsigned ip_vs_sh_hashkey(__be32 addr)
-{
- return (ntohl(addr)*2654435761UL) & IP_VS_SH_TAB_MASK;
-}
-
-
-/*
- * Get ip_vs_dest associated with supplied parameters.
- */
-static inline struct ip_vs_dest *
-ip_vs_sh_get(struct ip_vs_sh_bucket *tbl, __be32 addr)
-{
- return (tbl[ip_vs_sh_hashkey(addr)]).dest;
-}
-
-
-/*
- * Assign all the hash buckets of the specified table with the service.
- */
-static int
-ip_vs_sh_assign(struct ip_vs_sh_bucket *tbl, struct ip_vs_service *svc)
-{
- int i;
- struct ip_vs_sh_bucket *b;
- struct list_head *p;
- struct ip_vs_dest *dest;
-
- b = tbl;
- p = &svc->destinations;
- for (i=0; i<IP_VS_SH_TAB_SIZE; i++) {
- if (list_empty(p)) {
- b->dest = NULL;
- } else {
- if (p == &svc->destinations)
- p = p->next;
-
- dest = list_entry(p, struct ip_vs_dest, n_list);
- atomic_inc(&dest->refcnt);
- b->dest = dest;
-
- p = p->next;
- }
- b++;
- }
- return 0;
-}
-
-
-/*
- * Flush all the hash buckets of the specified table.
- */
-static void ip_vs_sh_flush(struct ip_vs_sh_bucket *tbl)
-{
- int i;
- struct ip_vs_sh_bucket *b;
-
- b = tbl;
- for (i=0; i<IP_VS_SH_TAB_SIZE; i++) {
- if (b->dest) {
- atomic_dec(&b->dest->refcnt);
- b->dest = NULL;
- }
- b++;
- }
-}
-
-
-static int ip_vs_sh_init_svc(struct ip_vs_service *svc)
-{
- struct ip_vs_sh_bucket *tbl;
-
- /* allocate the SH table for this service */
- tbl = kmalloc(sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE,
- GFP_ATOMIC);
- if (tbl == NULL) {
- IP_VS_ERR("ip_vs_sh_init_svc(): no memory\n");
- return -ENOMEM;
- }
- svc->sched_data = tbl;
- IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) allocated for "
- "current service\n",
- sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);
-
- /* assign the hash buckets with the updated service */
- ip_vs_sh_assign(tbl, svc);
-
- return 0;
-}
-
-
-static int ip_vs_sh_done_svc(struct ip_vs_service *svc)
-{
- struct ip_vs_sh_bucket *tbl = svc->sched_data;
-
- /* got to clean up hash buckets here */
- ip_vs_sh_flush(tbl);
-
- /* release the table itself */
- kfree(svc->sched_data);
- IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) released\n",
- sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);
-
- return 0;
-}
-
-
-static int ip_vs_sh_update_svc(struct ip_vs_service *svc)
-{
- struct ip_vs_sh_bucket *tbl = svc->sched_data;
-
- /* got to clean up hash buckets here */
- ip_vs_sh_flush(tbl);
-
- /* assign the hash buckets with the updated service */
- ip_vs_sh_assign(tbl, svc);
-
- return 0;
-}
-
-
-/*
- * If the dest flags is set with IP_VS_DEST_F_OVERLOAD,
- * consider that the server is overloaded here.
- */
-static inline int is_overloaded(struct ip_vs_dest *dest)
-{
- return dest->flags & IP_VS_DEST_F_OVERLOAD;
-}
-
-
-/*
- * Source Hashing scheduling
- */
-static struct ip_vs_dest *
-ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
-{
- struct ip_vs_dest *dest;
- struct ip_vs_sh_bucket *tbl;
- struct iphdr *iph = skb->nh.iph;
-
- IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n");
-
- tbl = (struct ip_vs_sh_bucket *)svc->sched_data;
- dest = ip_vs_sh_get(tbl, iph->saddr);
- if (!dest
- || !(dest->flags & IP_VS_DEST_F_AVAILABLE)
- || atomic_read(&dest->weight) <= 0
- || is_overloaded(dest)) {
- return NULL;
- }
-
- IP_VS_DBG(6, "SH: source IP address %u.%u.%u.%u "
- "--> server %u.%u.%u.%u:%d\n",
- NIPQUAD(iph->saddr),
- NIPQUAD(dest->addr),
- ntohs(dest->port));
-
- return dest;
-}
-
-
-/*
- * IPVS SH Scheduler structure
- */
-static struct ip_vs_scheduler ip_vs_sh_scheduler =
-{
- .name = "sh",
- .refcnt = ATOMIC_INIT(0),
- .module = THIS_MODULE,
- .init_service = ip_vs_sh_init_svc,
- .done_service = ip_vs_sh_done_svc,
- .update_service = ip_vs_sh_update_svc,
- .schedule = ip_vs_sh_schedule,
-};
-
-
-static int __init ip_vs_sh_init(void)
-{
- INIT_LIST_HEAD(&ip_vs_sh_scheduler.n_list);
- return register_ip_vs_scheduler(&ip_vs_sh_scheduler);
-}
-
-
-static void __exit ip_vs_sh_cleanup(void)
-{
- unregister_ip_vs_scheduler(&ip_vs_sh_scheduler);
-}
-
-
-module_init(ip_vs_sh_init);
-module_exit(ip_vs_sh_cleanup);
-MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c
deleted file mode 100644
index 91a075edd68e..000000000000
--- a/net/ipv4/ipvs/ip_vs_sync.c
+++ /dev/null
@@ -1,902 +0,0 @@
-/*
- * IPVS An implementation of the IP virtual server support for the
- * LINUX operating system. IPVS is now implemented as a module
- * over the NetFilter framework. IPVS can be used to build a
- * high-performance and highly available server based on a
- * cluster of servers.
- *
- * Version: $Id: ip_vs_sync.c,v 1.13 2003/06/08 09:31:19 wensong Exp $
- *
- * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
- *
- * ip_vs_sync: sync connection info from master load balancer to backups
- * through multicast
- *
- * Changes:
- * Alexandre Cassen : Added master & backup support at a time.
- * Alexandre Cassen : Added SyncID support for incoming sync
- * messages filtering.
- * Justin Ossevoort : Fix endian problem on sync message size.
- */
-
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/inetdevice.h>
-#include <linux/net.h>
-#include <linux/completion.h>
-#include <linux/delay.h>
-#include <linux/skbuff.h>
-#include <linux/in.h>
-#include <linux/igmp.h> /* for ip_mc_join_group */
-#include <linux/udp.h>
-
-#include <net/ip.h>
-#include <net/sock.h>
-#include <asm/uaccess.h> /* for get_fs and set_fs */
-
-#include <net/ip_vs.h>
-
-#define IP_VS_SYNC_GROUP 0xe0000051 /* multicast addr - 224.0.0.81 */
-#define IP_VS_SYNC_PORT 8848 /* multicast port */
-
-
-/*
- * IPVS sync connection entry
- */
-struct ip_vs_sync_conn {
- __u8 reserved;
-
- /* Protocol, addresses and port numbers */
- __u8 protocol; /* Which protocol (TCP/UDP) */
- __be16 cport;
- __be16 vport;
- __be16 dport;
- __be32 caddr; /* client address */
- __be32 vaddr; /* virtual address */
- __be32 daddr; /* destination address */
-
- /* Flags and state transition */
- __be16 flags; /* status flags */
- __be16 state; /* state info */
-
- /* The sequence options start here */
-};
-
-struct ip_vs_sync_conn_options {
- struct ip_vs_seq in_seq; /* incoming seq. struct */
- struct ip_vs_seq out_seq; /* outgoing seq. struct */
-};
-
-#define IP_VS_SYNC_CONN_TIMEOUT (3*60*HZ)
-#define SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn))
-#define FULL_CONN_SIZE \
-(sizeof(struct ip_vs_sync_conn) + sizeof(struct ip_vs_sync_conn_options))
-
-
-/*
- The master mulitcasts messages to the backup load balancers in the
- following format.
-
- 0 1 2 3
- 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
- +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
- | Count Conns | SyncID | Size |
- +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
- | |
- | IPVS Sync Connection (1) |
- +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
- | . |
- | . |
- | . |
- +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
- | |
- | IPVS Sync Connection (n) |
- +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-*/
-
-#define SYNC_MESG_HEADER_LEN 4
-
-struct ip_vs_sync_mesg {
- __u8 nr_conns;
- __u8 syncid;
- __u16 size;
-
- /* ip_vs_sync_conn entries start here */
-};
-
-/* the maximum length of sync (sending/receiving) message */
-static int sync_send_mesg_maxlen;
-static int sync_recv_mesg_maxlen;
-
-struct ip_vs_sync_buff {
- struct list_head list;
- unsigned long firstuse;
-
- /* pointers for the message data */
- struct ip_vs_sync_mesg *mesg;
- unsigned char *head;
- unsigned char *end;
-};
-
-
-/* the sync_buff list head and the lock */
-static LIST_HEAD(ip_vs_sync_queue);
-static DEFINE_SPINLOCK(ip_vs_sync_lock);
-
-/* current sync_buff for accepting new conn entries */
-static struct ip_vs_sync_buff *curr_sb = NULL;
-static DEFINE_SPINLOCK(curr_sb_lock);
-
-/* ipvs sync daemon state */
-volatile int ip_vs_sync_state = IP_VS_STATE_NONE;
-volatile int ip_vs_master_syncid = 0;
-volatile int ip_vs_backup_syncid = 0;
-
-/* multicast interface name */
-char ip_vs_master_mcast_ifn[IP_VS_IFNAME_MAXLEN];
-char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN];
-
-/* multicast addr */
-static struct sockaddr_in mcast_addr;
-
-
-static inline void sb_queue_tail(struct ip_vs_sync_buff *sb)
-{
- spin_lock(&ip_vs_sync_lock);
- list_add_tail(&sb->list, &ip_vs_sync_queue);
- spin_unlock(&ip_vs_sync_lock);
-}
-
-static inline struct ip_vs_sync_buff * sb_dequeue(void)
-{
- struct ip_vs_sync_buff *sb;
-
- spin_lock_bh(&ip_vs_sync_lock);
- if (list_empty(&ip_vs_sync_queue)) {
- sb = NULL;
- } else {
- sb = list_entry(ip_vs_sync_queue.next,
- struct ip_vs_sync_buff,
- list);
- list_del(&sb->list);
- }
- spin_unlock_bh(&ip_vs_sync_lock);
-
- return sb;
-}
-
-static inline struct ip_vs_sync_buff * ip_vs_sync_buff_create(void)
-{
- struct ip_vs_sync_buff *sb;
-
- if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC)))
- return NULL;
-
- if (!(sb->mesg=kmalloc(sync_send_mesg_maxlen, GFP_ATOMIC))) {
- kfree(sb);
- return NULL;
- }
- sb->mesg->nr_conns = 0;
- sb->mesg->syncid = ip_vs_master_syncid;
- sb->mesg->size = 4;
- sb->head = (unsigned char *)sb->mesg + 4;
- sb->end = (unsigned char *)sb->mesg + sync_send_mesg_maxlen;
- sb->firstuse = jiffies;
- return sb;
-}
-
-static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb)
-{
- kfree(sb->mesg);
- kfree(sb);
-}
-
-/*
- * Get the current sync buffer if it has been created for more
- * than the specified time or the specified time is zero.
- */
-static inline struct ip_vs_sync_buff *
-get_curr_sync_buff(unsigned long time)
-{
- struct ip_vs_sync_buff *sb;
-
- spin_lock_bh(&curr_sb_lock);
- if (curr_sb && (time == 0 ||
- time_before(jiffies - curr_sb->firstuse, time))) {
- sb = curr_sb;
- curr_sb = NULL;
- } else
- sb = NULL;
- spin_unlock_bh(&curr_sb_lock);
- return sb;
-}
-
-
-/*
- * Add an ip_vs_conn information into the current sync_buff.
- * Called by ip_vs_in.
- */
-void ip_vs_sync_conn(struct ip_vs_conn *cp)
-{
- struct ip_vs_sync_mesg *m;
- struct ip_vs_sync_conn *s;
- int len;
-
- spin_lock(&curr_sb_lock);
- if (!curr_sb) {
- if (!(curr_sb=ip_vs_sync_buff_create())) {
- spin_unlock(&curr_sb_lock);
- IP_VS_ERR("ip_vs_sync_buff_create failed.\n");
- return;
- }
- }
-
- len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE :
- SIMPLE_CONN_SIZE;
- m = curr_sb->mesg;
- s = (struct ip_vs_sync_conn *)curr_sb->head;
-
- /* copy members */
- s->protocol = cp->protocol;
- s->cport = cp->cport;
- s->vport = cp->vport;
- s->dport = cp->dport;
- s->caddr = cp->caddr;
- s->vaddr = cp->vaddr;
- s->daddr = cp->daddr;
- s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED);
- s->state = htons(cp->state);
- if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
- struct ip_vs_sync_conn_options *opt =
- (struct ip_vs_sync_conn_options *)&s[1];
- memcpy(opt, &cp->in_seq, sizeof(*opt));
- }
-
- m->nr_conns++;
- m->size += len;
- curr_sb->head += len;
-
- /* check if there is a space for next one */
- if (curr_sb->head+FULL_CONN_SIZE > curr_sb->end) {
- sb_queue_tail(curr_sb);
- curr_sb = NULL;
- }
- spin_unlock(&curr_sb_lock);
-
- /* synchronize its controller if it has */
- if (cp->control)
- ip_vs_sync_conn(cp->control);
-}
-
-
-/*
- * Process received multicast message and create the corresponding
- * ip_vs_conn entries.
- */
-static void ip_vs_process_message(const char *buffer, const size_t buflen)
-{
- struct ip_vs_sync_mesg *m = (struct ip_vs_sync_mesg *)buffer;
- struct ip_vs_sync_conn *s;
- struct ip_vs_sync_conn_options *opt;
- struct ip_vs_conn *cp;
- char *p;
- int i;
-
- /* Convert size back to host byte order */
- m->size = ntohs(m->size);
-
- if (buflen != m->size) {
- IP_VS_ERR("bogus message\n");
- return;
- }
-
- /* SyncID sanity check */
- if (ip_vs_backup_syncid != 0 && m->syncid != ip_vs_backup_syncid) {
- IP_VS_DBG(7, "Ignoring incoming msg with syncid = %d\n",
- m->syncid);
- return;
- }
-
- p = (char *)buffer + sizeof(struct ip_vs_sync_mesg);
- for (i=0; i<m->nr_conns; i++) {
- unsigned flags;
-
- s = (struct ip_vs_sync_conn *)p;
- flags = ntohs(s->flags);
- if (!(flags & IP_VS_CONN_F_TEMPLATE))
- cp = ip_vs_conn_in_get(s->protocol,
- s->caddr, s->cport,
- s->vaddr, s->vport);
- else
- cp = ip_vs_ct_in_get(s->protocol,
- s->caddr, s->cport,
- s->vaddr, s->vport);
- if (!cp) {
- cp = ip_vs_conn_new(s->protocol,
- s->caddr, s->cport,
- s->vaddr, s->vport,
- s->daddr, s->dport,
- flags, NULL);
- if (!cp) {
- IP_VS_ERR("ip_vs_conn_new failed\n");
- return;
- }
- cp->state = ntohs(s->state);
- } else if (!cp->dest) {
- /* it is an entry created by the synchronization */
- cp->state = ntohs(s->state);
- cp->flags = flags | IP_VS_CONN_F_HASHED;
- } /* Note that we don't touch its state and flags
- if it is a normal entry. */
-
- if (flags & IP_VS_CONN_F_SEQ_MASK) {
- opt = (struct ip_vs_sync_conn_options *)&s[1];
- memcpy(&cp->in_seq, opt, sizeof(*opt));
- p += FULL_CONN_SIZE;
- } else
- p += SIMPLE_CONN_SIZE;
-
- atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold[0]);
- cp->timeout = IP_VS_SYNC_CONN_TIMEOUT;
- ip_vs_conn_put(cp);
-
- if (p > buffer+buflen) {
- IP_VS_ERR("bogus message\n");
- return;
- }
- }
-}
-
-
-/*
- * Setup loopback of outgoing multicasts on a sending socket
- */
-static void set_mcast_loop(struct sock *sk, u_char loop)
-{
- struct inet_sock *inet = inet_sk(sk);
-
- /* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */
- lock_sock(sk);
- inet->mc_loop = loop ? 1 : 0;
- release_sock(sk);
-}
-
-/*
- * Specify TTL for outgoing multicasts on a sending socket
- */
-static void set_mcast_ttl(struct sock *sk, u_char ttl)
-{
- struct inet_sock *inet = inet_sk(sk);
-
- /* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */
- lock_sock(sk);
- inet->mc_ttl = ttl;
- release_sock(sk);
-}
-
-/*
- * Specifiy default interface for outgoing multicasts
- */
-static int set_mcast_if(struct sock *sk, char *ifname)
-{
- struct net_device *dev;
- struct inet_sock *inet = inet_sk(sk);
-
- if ((dev = __dev_get_by_name(ifname)) == NULL)
- return -ENODEV;
-
- if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
- return -EINVAL;
-
- lock_sock(sk);
- inet->mc_index = dev->ifindex;
- /* inet->mc_addr = 0; */
- release_sock(sk);
-
- return 0;
-}
-
-
-/*
- * Set the maximum length of sync message according to the
- * specified interface's MTU.
- */
-static int set_sync_mesg_maxlen(int sync_state)
-{
- struct net_device *dev;
- int num;
-
- if (sync_state == IP_VS_STATE_MASTER) {
- if ((dev = __dev_get_by_name(ip_vs_master_mcast_ifn)) == NULL)
- return -ENODEV;
-
- num = (dev->mtu - sizeof(struct iphdr) -
- sizeof(struct udphdr) -
- SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE;
- sync_send_mesg_maxlen =
- SYNC_MESG_HEADER_LEN + SIMPLE_CONN_SIZE * num;
- IP_VS_DBG(7, "setting the maximum length of sync sending "
- "message %d.\n", sync_send_mesg_maxlen);
- } else if (sync_state == IP_VS_STATE_BACKUP) {
- if ((dev = __dev_get_by_name(ip_vs_backup_mcast_ifn)) == NULL)
- return -ENODEV;
-
- sync_recv_mesg_maxlen = dev->mtu -
- sizeof(struct iphdr) - sizeof(struct udphdr);
- IP_VS_DBG(7, "setting the maximum length of sync receiving "
- "message %d.\n", sync_recv_mesg_maxlen);
- }
-
- return 0;
-}
-
-
-/*
- * Join a multicast group.
- * the group is specified by a class D multicast address 224.0.0.0/8
- * in the in_addr structure passed in as a parameter.
- */
-static int
-join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname)
-{
- struct ip_mreqn mreq;
- struct net_device *dev;
- int ret;
-
- memset(&mreq, 0, sizeof(mreq));
- memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr));
-
- if ((dev = __dev_get_by_name(ifname)) == NULL)
- return -ENODEV;
- if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
- return -EINVAL;
-
- mreq.imr_ifindex = dev->ifindex;
-
- lock_sock(sk);
- ret = ip_mc_join_group(sk, &mreq);
- release_sock(sk);
-
- return ret;
-}
-
-
-static int bind_mcastif_addr(struct socket *sock, char *ifname)
-{
- struct net_device *dev;
- __be32 addr;
- struct sockaddr_in sin;
-
- if ((dev = __dev_get_by_name(ifname)) == NULL)
- return -ENODEV;
-
- addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
- if (!addr)
- IP_VS_ERR("You probably need to specify IP address on "
- "multicast interface.\n");
-
- IP_VS_DBG(7, "binding socket with (%s) %u.%u.%u.%u\n",
- ifname, NIPQUAD(addr));
-
- /* Now bind the socket with the address of multicast interface */
- sin.sin_family = AF_INET;
- sin.sin_addr.s_addr = addr;
- sin.sin_port = 0;
-
- return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin));
-}
-
-/*
- * Set up sending multicast socket over UDP
- */
-static struct socket * make_send_sock(void)
-{
- struct socket *sock;
-
- /* First create a socket */
- if (sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock) < 0) {
- IP_VS_ERR("Error during creation of socket; terminating\n");
- return NULL;
- }
-
- if (set_mcast_if(sock->sk, ip_vs_master_mcast_ifn) < 0) {
- IP_VS_ERR("Error setting outbound mcast interface\n");
- goto error;
- }
-
- set_mcast_loop(sock->sk, 0);
- set_mcast_ttl(sock->sk, 1);
-
- if (bind_mcastif_addr(sock, ip_vs_master_mcast_ifn) < 0) {
- IP_VS_ERR("Error binding address of the mcast interface\n");
- goto error;
- }
-
- if (sock->ops->connect(sock,
- (struct sockaddr*)&mcast_addr,
- sizeof(struct sockaddr), 0) < 0) {
- IP_VS_ERR("Error connecting to the multicast addr\n");
- goto error;
- }
-
- return sock;
-
- error:
- sock_release(sock);
- return NULL;
-}
-
-
-/*
- * Set up receiving multicast socket over UDP
- */
-static struct socket * make_receive_sock(void)
-{
- struct socket *sock;
-
- /* First create a socket */
- if (sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock) < 0) {
- IP_VS_ERR("Error during creation of socket; terminating\n");
- return NULL;
- }
-
- /* it is equivalent to the REUSEADDR option in user-space */
- sock->sk->sk_reuse = 1;
-
- if (sock->ops->bind(sock,
- (struct sockaddr*)&mcast_addr,
- sizeof(struct sockaddr)) < 0) {
- IP_VS_ERR("Error binding to the multicast addr\n");
- goto error;
- }
-
- /* join the multicast group */
- if (join_mcast_group(sock->sk,
- (struct in_addr*)&mcast_addr.sin_addr,
- ip_vs_backup_mcast_ifn) < 0) {
- IP_VS_ERR("Error joining to the multicast group\n");
- goto error;
- }
-
- return sock;
-
- error:
- sock_release(sock);
- return NULL;
-}
-
-
-static int
-ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length)
-{
- struct msghdr msg = {.msg_flags = MSG_DONTWAIT|MSG_NOSIGNAL};
- struct kvec iov;
- int len;
-
- EnterFunction(7);
- iov.iov_base = (void *)buffer;
- iov.iov_len = length;
-
- len = kernel_sendmsg(sock, &msg, &iov, 1, (size_t)(length));
-
- LeaveFunction(7);
- return len;
-}
-
-static void
-ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg)
-{
- int msize;
-
- msize = msg->size;
-
- /* Put size in network byte order */
- msg->size = htons(msg->size);
-
- if (ip_vs_send_async(sock, (char *)msg, msize) != msize)
- IP_VS_ERR("ip_vs_send_async error\n");
-}
-
-static int
-ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen)
-{
- struct msghdr msg = {NULL,};
- struct kvec iov;
- int len;
-
- EnterFunction(7);
-
- /* Receive a packet */
- iov.iov_base = buffer;
- iov.iov_len = (size_t)buflen;
-
- len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, 0);
-
- if (len < 0)
- return -1;
-
- LeaveFunction(7);
- return len;
-}
-
-
-static DECLARE_WAIT_QUEUE_HEAD(sync_wait);
-static pid_t sync_master_pid = 0;
-static pid_t sync_backup_pid = 0;
-
-static DECLARE_WAIT_QUEUE_HEAD(stop_sync_wait);
-static int stop_master_sync = 0;
-static int stop_backup_sync = 0;
-
-static void sync_master_loop(void)
-{
- struct socket *sock;
- struct ip_vs_sync_buff *sb;
-
- /* create the sending multicast socket */
- sock = make_send_sock();
- if (!sock)
- return;
-
- IP_VS_INFO("sync thread started: state = MASTER, mcast_ifn = %s, "
- "syncid = %d\n",
- ip_vs_master_mcast_ifn, ip_vs_master_syncid);
-
- for (;;) {
- while ((sb=sb_dequeue())) {
- ip_vs_send_sync_msg(sock, sb->mesg);
- ip_vs_sync_buff_release(sb);
- }
-
- /* check if entries stay in curr_sb for 2 seconds */
- if ((sb = get_curr_sync_buff(2*HZ))) {
- ip_vs_send_sync_msg(sock, sb->mesg);
- ip_vs_sync_buff_release(sb);
- }
-
- if (stop_master_sync)
- break;
-
- ssleep(1);
- }
-
- /* clean up the sync_buff queue */
- while ((sb=sb_dequeue())) {
- ip_vs_sync_buff_release(sb);
- }
-
- /* clean up the current sync_buff */
- if ((sb = get_curr_sync_buff(0))) {
- ip_vs_sync_buff_release(sb);
- }
-
- /* release the sending multicast socket */
- sock_release(sock);
-}
-
-
-static void sync_backup_loop(void)
-{
- struct socket *sock;
- char *buf;
- int len;
-
- if (!(buf = kmalloc(sync_recv_mesg_maxlen, GFP_ATOMIC))) {
- IP_VS_ERR("sync_backup_loop: kmalloc error\n");
- return;
- }
-
- /* create the receiving multicast socket */
- sock = make_receive_sock();
- if (!sock)
- goto out;
-
- IP_VS_INFO("sync thread started: state = BACKUP, mcast_ifn = %s, "
- "syncid = %d\n",
- ip_vs_backup_mcast_ifn, ip_vs_backup_syncid);
-
- for (;;) {
- /* do you have data now? */
- while (!skb_queue_empty(&(sock->sk->sk_receive_queue))) {
- if ((len =
- ip_vs_receive(sock, buf,
- sync_recv_mesg_maxlen)) <= 0) {
- IP_VS_ERR("receiving message error\n");
- break;
- }
- /* disable bottom half, because it accessed the data
- shared by softirq while getting/creating conns */
- local_bh_disable();
- ip_vs_process_message(buf, len);
- local_bh_enable();
- }
-
- if (stop_backup_sync)
- break;
-
- ssleep(1);
- }
-
- /* release the sending multicast socket */
- sock_release(sock);
-
- out:
- kfree(buf);
-}
-
-
-static void set_sync_pid(int sync_state, pid_t sync_pid)
-{
- if (sync_state == IP_VS_STATE_MASTER)
- sync_master_pid = sync_pid;
- else if (sync_state == IP_VS_STATE_BACKUP)
- sync_backup_pid = sync_pid;
-}
-
-static void set_stop_sync(int sync_state, int set)
-{
- if (sync_state == IP_VS_STATE_MASTER)
- stop_master_sync = set;
- else if (sync_state == IP_VS_STATE_BACKUP)
- stop_backup_sync = set;
- else {
- stop_master_sync = set;
- stop_backup_sync = set;
- }
-}
-
-static int sync_thread(void *startup)
-{
- DECLARE_WAITQUEUE(wait, current);
- mm_segment_t oldmm;
- int state;
- const char *name;
-
- /* increase the module use count */
- ip_vs_use_count_inc();
-
- if (ip_vs_sync_state & IP_VS_STATE_MASTER && !sync_master_pid) {
- state = IP_VS_STATE_MASTER;
- name = "ipvs_syncmaster";
- } else if (ip_vs_sync_state & IP_VS_STATE_BACKUP && !sync_backup_pid) {
- state = IP_VS_STATE_BACKUP;
- name = "ipvs_syncbackup";
- } else {
- IP_VS_BUG();
- ip_vs_use_count_dec();
- return -EINVAL;
- }
-
- daemonize(name);
-
- oldmm = get_fs();
- set_fs(KERNEL_DS);
-
- /* Block all signals */
- spin_lock_irq(&current->sighand->siglock);
- siginitsetinv(&current->blocked, 0);
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
-
- /* set the maximum length of sync message */
- set_sync_mesg_maxlen(state);
-
- /* set up multicast address */
- mcast_addr.sin_family = AF_INET;
- mcast_addr.sin_port = htons(IP_VS_SYNC_PORT);
- mcast_addr.sin_addr.s_addr = htonl(IP_VS_SYNC_GROUP);
-
- add_wait_queue(&sync_wait, &wait);
-
- set_sync_pid(state, current->pid);
- complete((struct completion *)startup);
-
- /* processing master/backup loop here */
- if (state == IP_VS_STATE_MASTER)
- sync_master_loop();
- else if (state == IP_VS_STATE_BACKUP)
- sync_backup_loop();
- else IP_VS_BUG();
-
- remove_wait_queue(&sync_wait, &wait);
-
- /* thread exits */
- set_sync_pid(state, 0);
- IP_VS_INFO("sync thread stopped!\n");
-
- set_fs(oldmm);
-
- /* decrease the module use count */
- ip_vs_use_count_dec();
-
- set_stop_sync(state, 0);
- wake_up(&stop_sync_wait);
-
- return 0;
-}
-
-
-static int fork_sync_thread(void *startup)
-{
- pid_t pid;
-
- /* fork the sync thread here, then the parent process of the
- sync thread is the init process after this thread exits. */
- repeat:
- if ((pid = kernel_thread(sync_thread, startup, 0)) < 0) {
- IP_VS_ERR("could not create sync_thread due to %d... "
- "retrying.\n", pid);
- ssleep(1);
- goto repeat;
- }
-
- return 0;
-}
-
-
-int start_sync_thread(int state, char *mcast_ifn, __u8 syncid)
-{
- DECLARE_COMPLETION_ONSTACK(startup);
- pid_t pid;
-
- if ((state == IP_VS_STATE_MASTER && sync_master_pid) ||
- (state == IP_VS_STATE_BACKUP && sync_backup_pid))
- return -EEXIST;
-
- IP_VS_DBG(7, "%s: pid %d\n", __FUNCTION__, current->pid);
- IP_VS_DBG(7, "Each ip_vs_sync_conn entry need %Zd bytes\n",
- sizeof(struct ip_vs_sync_conn));
-
- ip_vs_sync_state |= state;
- if (state == IP_VS_STATE_MASTER) {
- strlcpy(ip_vs_master_mcast_ifn, mcast_ifn, sizeof(ip_vs_master_mcast_ifn));
- ip_vs_master_syncid = syncid;
- } else {
- strlcpy(ip_vs_backup_mcast_ifn, mcast_ifn, sizeof(ip_vs_backup_mcast_ifn));
- ip_vs_backup_syncid = syncid;
- }
-
- repeat:
- if ((pid = kernel_thread(fork_sync_thread, &startup, 0)) < 0) {
- IP_VS_ERR("could not create fork_sync_thread due to %d... "
- "retrying.\n", pid);
- ssleep(1);
- goto repeat;
- }
-
- wait_for_completion(&startup);
-
- return 0;
-}
-
-
-int stop_sync_thread(int state)
-{
- DECLARE_WAITQUEUE(wait, current);
-
- if ((state == IP_VS_STATE_MASTER && !sync_master_pid) ||
- (state == IP_VS_STATE_BACKUP && !sync_backup_pid))
- return -ESRCH;
-
- IP_VS_DBG(7, "%s: pid %d\n", __FUNCTION__, current->pid);
- IP_VS_INFO("stopping sync thread %d ...\n",
- (state == IP_VS_STATE_MASTER) ? sync_master_pid : sync_backup_pid);
-
- __set_current_state(TASK_UNINTERRUPTIBLE);
- add_wait_queue(&stop_sync_wait, &wait);
- set_stop_sync(state, 1);
- ip_vs_sync_state -= state;
- wake_up(&sync_wait);
- schedule();
- __set_current_state(TASK_RUNNING);
- remove_wait_queue(&stop_sync_wait, &wait);
-
- /* Note: no need to reap the sync thread, because its parent
- process is the init process */
-
- if ((state == IP_VS_STATE_MASTER && stop_master_sync) ||
- (state == IP_VS_STATE_BACKUP && stop_backup_sync))
- IP_VS_BUG();
-
- return 0;
-}
diff --git a/net/ipv4/ipvs/ip_vs_wrr.c b/net/ipv4/ipvs/ip_vs_wrr.c
deleted file mode 100644
index 749fa044eca5..000000000000
--- a/net/ipv4/ipvs/ip_vs_wrr.c
+++ /dev/null
@@ -1,235 +0,0 @@
-/*
- * IPVS: Weighted Round-Robin Scheduling module
- *
- * Version: $Id: ip_vs_wrr.c,v 1.12 2002/09/15 08:14:08 wensong Exp $
- *
- * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * Changes:
- * Wensong Zhang : changed the ip_vs_wrr_schedule to return dest
- * Wensong Zhang : changed some comestics things for debugging
- * Wensong Zhang : changed for the d-linked destination list
- * Wensong Zhang : added the ip_vs_wrr_update_svc
- * Julian Anastasov : fixed the bug of returning destination
- * with weight 0 when all weights are zero
- *
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-
-#include <net/ip_vs.h>
-
-/*
- * current destination pointer for weighted round-robin scheduling
- */
-struct ip_vs_wrr_mark {
- struct list_head *cl; /* current list head */
- int cw; /* current weight */
- int mw; /* maximum weight */
- int di; /* decreasing interval */
-};
-
-
-/*
- * Get the gcd of server weights
- */
-static int gcd(int a, int b)
-{
- int c;
-
- while ((c = a % b)) {
- a = b;
- b = c;
- }
- return b;
-}
-
-static int ip_vs_wrr_gcd_weight(struct ip_vs_service *svc)
-{
- struct ip_vs_dest *dest;
- int weight;
- int g = 0;
-
- list_for_each_entry(dest, &svc->destinations, n_list) {
- weight = atomic_read(&dest->weight);
- if (weight > 0) {
- if (g > 0)
- g = gcd(weight, g);
- else
- g = weight;
- }
- }
- return g ? g : 1;
-}
-
-
-/*
- * Get the maximum weight of the service destinations.
- */
-static int ip_vs_wrr_max_weight(struct ip_vs_service *svc)
-{
- struct ip_vs_dest *dest;
- int weight = 0;
-
- list_for_each_entry(dest, &svc->destinations, n_list) {
- if (atomic_read(&dest->weight) > weight)
- weight = atomic_read(&dest->weight);
- }
-
- return weight;
-}
-
-
-static int ip_vs_wrr_init_svc(struct ip_vs_service *svc)
-{
- struct ip_vs_wrr_mark *mark;
-
- /*
- * Allocate the mark variable for WRR scheduling
- */
- mark = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_ATOMIC);
- if (mark == NULL) {
- IP_VS_ERR("ip_vs_wrr_init_svc(): no memory\n");
- return -ENOMEM;
- }
- mark->cl = &svc->destinations;
- mark->cw = 0;
- mark->mw = ip_vs_wrr_max_weight(svc);
- mark->di = ip_vs_wrr_gcd_weight(svc);
- svc->sched_data = mark;
-
- return 0;
-}
-
-
-static int ip_vs_wrr_done_svc(struct ip_vs_service *svc)
-{
- /*
- * Release the mark variable
- */
- kfree(svc->sched_data);
-
- return 0;
-}
-
-
-static int ip_vs_wrr_update_svc(struct ip_vs_service *svc)
-{
- struct ip_vs_wrr_mark *mark = svc->sched_data;
-
- mark->cl = &svc->destinations;
- mark->mw = ip_vs_wrr_max_weight(svc);
- mark->di = ip_vs_wrr_gcd_weight(svc);
- if (mark->cw > mark->mw)
- mark->cw = 0;
- return 0;
-}
-
-
-/*
- * Weighted Round-Robin Scheduling
- */
-static struct ip_vs_dest *
-ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
-{
- struct ip_vs_dest *dest;
- struct ip_vs_wrr_mark *mark = svc->sched_data;
- struct list_head *p;
-
- IP_VS_DBG(6, "ip_vs_wrr_schedule(): Scheduling...\n");
-
- /*
- * This loop will always terminate, because mark->cw in (0, max_weight]
- * and at least one server has its weight equal to max_weight.
- */
- write_lock(&svc->sched_lock);
- p = mark->cl;
- while (1) {
- if (mark->cl == &svc->destinations) {
- /* it is at the head of the destination list */
-
- if (mark->cl == mark->cl->next) {
- /* no dest entry */
- dest = NULL;
- goto out;
- }
-
- mark->cl = svc->destinations.next;
- mark->cw -= mark->di;
- if (mark->cw <= 0) {
- mark->cw = mark->mw;
- /*
- * Still zero, which means no available servers.
- */
- if (mark->cw == 0) {
- mark->cl = &svc->destinations;
- IP_VS_INFO("ip_vs_wrr_schedule(): "
- "no available servers\n");
- dest = NULL;
- goto out;
- }
- }
- } else
- mark->cl = mark->cl->next;
-
- if (mark->cl != &svc->destinations) {
- /* not at the head of the list */
- dest = list_entry(mark->cl, struct ip_vs_dest, n_list);
- if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
- atomic_read(&dest->weight) >= mark->cw) {
- /* got it */
- break;
- }
- }
-
- if (mark->cl == p && mark->cw == mark->di) {
- /* back to the start, and no dest is found.
- It is only possible when all dests are OVERLOADED */
- dest = NULL;
- goto out;
- }
- }
-
- IP_VS_DBG(6, "WRR: server %u.%u.%u.%u:%u "
- "activeconns %d refcnt %d weight %d\n",
- NIPQUAD(dest->addr), ntohs(dest->port),
- atomic_read(&dest->activeconns),
- atomic_read(&dest->refcnt),
- atomic_read(&dest->weight));
-
- out:
- write_unlock(&svc->sched_lock);
- return dest;
-}
-
-
-static struct ip_vs_scheduler ip_vs_wrr_scheduler = {
- .name = "wrr",
- .refcnt = ATOMIC_INIT(0),
- .module = THIS_MODULE,
- .init_service = ip_vs_wrr_init_svc,
- .done_service = ip_vs_wrr_done_svc,
- .update_service = ip_vs_wrr_update_svc,
- .schedule = ip_vs_wrr_schedule,
-};
-
-static int __init ip_vs_wrr_init(void)
-{
- INIT_LIST_HEAD(&ip_vs_wrr_scheduler.n_list);
- return register_ip_vs_scheduler(&ip_vs_wrr_scheduler) ;
-}
-
-static void __exit ip_vs_wrr_cleanup(void)
-{
- unregister_ip_vs_scheduler(&ip_vs_wrr_scheduler);
-}
-
-module_init(ip_vs_wrr_init);
-module_exit(ip_vs_wrr_cleanup);
-MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_xmit.c b/net/ipv4/ipvs/ip_vs_xmit.c
deleted file mode 100644
index e1f77bd7c9a5..000000000000
--- a/net/ipv4/ipvs/ip_vs_xmit.c
+++ /dev/null
@@ -1,561 +0,0 @@
-/*
- * ip_vs_xmit.c: various packet transmitters for IPVS
- *
- * Version: $Id: ip_vs_xmit.c,v 1.2 2002/11/30 01:50:35 wensong Exp $
- *
- * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
- * Julian Anastasov <ja@ssi.bg>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * Changes:
- *
- */
-
-#include <linux/kernel.h>
-#include <linux/ip.h>
-#include <linux/tcp.h> /* for tcphdr */
-#include <net/tcp.h> /* for csum_tcpudp_magic */
-#include <net/udp.h>
-#include <net/icmp.h> /* for icmp_send */
-#include <net/route.h> /* for ip_route_output */
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv4.h>
-
-#include <net/ip_vs.h>
-
-
-/*
- * Destination cache to speed up outgoing route lookup
- */
-static inline void
-__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst)
-{
- struct dst_entry *old_dst;
-
- old_dst = dest->dst_cache;
- dest->dst_cache = dst;
- dest->dst_rtos = rtos;
- dst_release(old_dst);
-}
-
-static inline struct dst_entry *
-__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie)
-{
- struct dst_entry *dst = dest->dst_cache;
-
- if (!dst)
- return NULL;
- if ((dst->obsolete || rtos != dest->dst_rtos) &&
- dst->ops->check(dst, cookie) == NULL) {
- dest->dst_cache = NULL;
- dst_release(dst);
- return NULL;
- }
- dst_hold(dst);
- return dst;
-}
-
-static inline struct rtable *
-__ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
-{
- struct rtable *rt; /* Route to the other host */
- struct ip_vs_dest *dest = cp->dest;
-
- if (dest) {
- spin_lock(&dest->dst_lock);
- if (!(rt = (struct rtable *)
- __ip_vs_dst_check(dest, rtos, 0))) {
- struct flowi fl = {
- .oif = 0,
- .nl_u = {
- .ip4_u = {
- .daddr = dest->addr,
- .saddr = 0,
- .tos = rtos, } },
- };
-
- if (ip_route_output_key(&rt, &fl)) {
- spin_unlock(&dest->dst_lock);
- IP_VS_DBG_RL("ip_route_output error, "
- "dest: %u.%u.%u.%u\n",
- NIPQUAD(dest->addr));
- return NULL;
- }
- __ip_vs_dst_set(dest, rtos, dst_clone(&rt->u.dst));
- IP_VS_DBG(10, "new dst %u.%u.%u.%u, refcnt=%d, rtos=%X\n",
- NIPQUAD(dest->addr),
- atomic_read(&rt->u.dst.__refcnt), rtos);
- }
- spin_unlock(&dest->dst_lock);
- } else {
- struct flowi fl = {
- .oif = 0,
- .nl_u = {
- .ip4_u = {
- .daddr = cp->daddr,
- .saddr = 0,
- .tos = rtos, } },
- };
-
- if (ip_route_output_key(&rt, &fl)) {
- IP_VS_DBG_RL("ip_route_output error, dest: "
- "%u.%u.%u.%u\n", NIPQUAD(cp->daddr));
- return NULL;
- }
- }
-
- return rt;
-}
-
-
-/*
- * Release dest->dst_cache before a dest is removed
- */
-void
-ip_vs_dst_reset(struct ip_vs_dest *dest)
-{
- struct dst_entry *old_dst;
-
- old_dst = dest->dst_cache;
- dest->dst_cache = NULL;
- dst_release(old_dst);
-}
-
-#define IP_VS_XMIT(skb, rt) \
-do { \
- (skb)->ipvs_property = 1; \
- (skb)->ip_summed = CHECKSUM_NONE; \
- NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, (skb), NULL, \
- (rt)->u.dst.dev, dst_output); \
-} while (0)
-
-
-/*
- * NULL transmitter (do nothing except return NF_ACCEPT)
- */
-int
-ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
- struct ip_vs_protocol *pp)
-{
- /* we do not touch skb and do not need pskb ptr */
- return NF_ACCEPT;
-}
-
-
-/*
- * Bypass transmitter
- * Let packets bypass the destination when the destination is not
- * available, it may be only used in transparent cache cluster.
- */
-int
-ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
- struct ip_vs_protocol *pp)
-{
- struct rtable *rt; /* Route to the other host */
- struct iphdr *iph = skb->nh.iph;
- u8 tos = iph->tos;
- int mtu;
- struct flowi fl = {
- .oif = 0,
- .nl_u = {
- .ip4_u = {
- .daddr = iph->daddr,
- .saddr = 0,
- .tos = RT_TOS(tos), } },
- };
-
- EnterFunction(10);
-
- if (ip_route_output_key(&rt, &fl)) {
- IP_VS_DBG_RL("ip_vs_bypass_xmit(): ip_route_output error, "
- "dest: %u.%u.%u.%u\n", NIPQUAD(iph->daddr));
- goto tx_error_icmp;
- }
-
- /* MTU checking */
- mtu = dst_mtu(&rt->u.dst);
- if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) {
- ip_rt_put(rt);
- icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
- IP_VS_DBG_RL("ip_vs_bypass_xmit(): frag needed\n");
- goto tx_error;
- }
-
- /*
- * Call ip_send_check because we are not sure it is called
- * after ip_defrag. Is copy-on-write needed?
- */
- if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
- ip_rt_put(rt);
- return NF_STOLEN;
- }
- ip_send_check(skb->nh.iph);
-
- /* drop old route */
- dst_release(skb->dst);
- skb->dst = &rt->u.dst;
-
- /* Another hack: avoid icmp_send in ip_fragment */
- skb->local_df = 1;
-
- IP_VS_XMIT(skb, rt);
-
- LeaveFunction(10);
- return NF_STOLEN;
-
- tx_error_icmp:
- dst_link_failure(skb);
- tx_error:
- kfree_skb(skb);
- LeaveFunction(10);
- return NF_STOLEN;
-}
-
-
-/*
- * NAT transmitter (only for outside-to-inside nat forwarding)
- * Not used for related ICMP
- */
-int
-ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
- struct ip_vs_protocol *pp)
-{
- struct rtable *rt; /* Route to the other host */
- int mtu;
- struct iphdr *iph = skb->nh.iph;
-
- EnterFunction(10);
-
- /* check if it is a connection of no-client-port */
- if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
- __be16 _pt, *p;
- p = skb_header_pointer(skb, iph->ihl*4, sizeof(_pt), &_pt);
- if (p == NULL)
- goto tx_error;
- ip_vs_conn_fill_cport(cp, *p);
- IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
- }
-
- if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
- goto tx_error_icmp;
-
- /* MTU checking */
- mtu = dst_mtu(&rt->u.dst);
- if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) {
- ip_rt_put(rt);
- icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
- IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for");
- goto tx_error;
- }
-
- /* copy-on-write the packet before mangling it */
- if (!ip_vs_make_skb_writable(&skb, sizeof(struct iphdr)))
- goto tx_error_put;
-
- if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
- goto tx_error_put;
-
- /* drop old route */
- dst_release(skb->dst);
- skb->dst = &rt->u.dst;
-
- /* mangle the packet */
- if (pp->dnat_handler && !pp->dnat_handler(&skb, pp, cp))
- goto tx_error;
- skb->nh.iph->daddr = cp->daddr;
- ip_send_check(skb->nh.iph);
-
- IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
-
- /* FIXME: when application helper enlarges the packet and the length
- is larger than the MTU of outgoing device, there will be still
- MTU problem. */
-
- /* Another hack: avoid icmp_send in ip_fragment */
- skb->local_df = 1;
-
- IP_VS_XMIT(skb, rt);
-
- LeaveFunction(10);
- return NF_STOLEN;
-
- tx_error_icmp:
- dst_link_failure(skb);
- tx_error:
- LeaveFunction(10);
- kfree_skb(skb);
- return NF_STOLEN;
- tx_error_put:
- ip_rt_put(rt);
- goto tx_error;
-}
-
-
-/*
- * IP Tunneling transmitter
- *
- * This function encapsulates the packet in a new IP packet, its
- * destination will be set to cp->daddr. Most code of this function
- * is taken from ipip.c.
- *
- * It is used in VS/TUN cluster. The load balancer selects a real
- * server from a cluster based on a scheduling algorithm,
- * encapsulates the request packet and forwards it to the selected
- * server. For example, all real servers are configured with
- * "ifconfig tunl0 <Virtual IP Address> up". When the server receives
- * the encapsulated packet, it will decapsulate the packet, processe
- * the request and return the response packets directly to the client
- * without passing the load balancer. This can greatly increase the
- * scalability of virtual server.
- *
- * Used for ANY protocol
- */
-int
-ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
- struct ip_vs_protocol *pp)
-{
- struct rtable *rt; /* Route to the other host */
- struct net_device *tdev; /* Device to other host */
- struct iphdr *old_iph = skb->nh.iph;
- u8 tos = old_iph->tos;
- __be16 df = old_iph->frag_off;
- struct iphdr *iph; /* Our new IP header */
- int max_headroom; /* The extra header space needed */
- int mtu;
-
- EnterFunction(10);
-
- if (skb->protocol != __constant_htons(ETH_P_IP)) {
- IP_VS_DBG_RL("ip_vs_tunnel_xmit(): protocol error, "
- "ETH_P_IP: %d, skb protocol: %d\n",
- __constant_htons(ETH_P_IP), skb->protocol);
- goto tx_error;
- }
-
- if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos))))
- goto tx_error_icmp;
-
- tdev = rt->u.dst.dev;
-
- mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr);
- if (mtu < 68) {
- ip_rt_put(rt);
- IP_VS_DBG_RL("ip_vs_tunnel_xmit(): mtu less than 68\n");
- goto tx_error;
- }
- if (skb->dst)
- skb->dst->ops->update_pmtu(skb->dst, mtu);
-
- df |= (old_iph->frag_off&__constant_htons(IP_DF));
-
- if ((old_iph->frag_off&__constant_htons(IP_DF))
- && mtu < ntohs(old_iph->tot_len)) {
- icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
- ip_rt_put(rt);
- IP_VS_DBG_RL("ip_vs_tunnel_xmit(): frag needed\n");
- goto tx_error;
- }
-
- /*
- * Okay, now see if we can stuff it in the buffer as-is.
- */
- max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr);
-
- if (skb_headroom(skb) < max_headroom
- || skb_cloned(skb) || skb_shared(skb)) {
- struct sk_buff *new_skb =
- skb_realloc_headroom(skb, max_headroom);
- if (!new_skb) {
- ip_rt_put(rt);
- kfree_skb(skb);
- IP_VS_ERR_RL("ip_vs_tunnel_xmit(): no memory\n");
- return NF_STOLEN;
- }
- kfree_skb(skb);
- skb = new_skb;
- old_iph = skb->nh.iph;
- }
-
- skb->h.raw = (void *) old_iph;
-
- /* fix old IP header checksum */
- ip_send_check(old_iph);
-
- skb->nh.raw = skb_push(skb, sizeof(struct iphdr));
- memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
-
- /* drop old route */
- dst_release(skb->dst);
- skb->dst = &rt->u.dst;
-
- /*
- * Push down and install the IPIP header.
- */
- iph = skb->nh.iph;
- iph->version = 4;
- iph->ihl = sizeof(struct iphdr)>>2;
- iph->frag_off = df;
- iph->protocol = IPPROTO_IPIP;
- iph->tos = tos;
- iph->daddr = rt->rt_dst;
- iph->saddr = rt->rt_src;
- iph->ttl = old_iph->ttl;
- iph->tot_len = htons(skb->len);
- ip_select_ident(iph, &rt->u.dst, NULL);
- ip_send_check(iph);
-
- /* Another hack: avoid icmp_send in ip_fragment */
- skb->local_df = 1;
-
- IP_VS_XMIT(skb, rt);
-
- LeaveFunction(10);
-
- return NF_STOLEN;
-
- tx_error_icmp:
- dst_link_failure(skb);
- tx_error:
- kfree_skb(skb);
- LeaveFunction(10);
- return NF_STOLEN;
-}
-
-
-/*
- * Direct Routing transmitter
- * Used for ANY protocol
- */
-int
-ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
- struct ip_vs_protocol *pp)
-{
- struct rtable *rt; /* Route to the other host */
- struct iphdr *iph = skb->nh.iph;
- int mtu;
-
- EnterFunction(10);
-
- if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
- goto tx_error_icmp;
-
- /* MTU checking */
- mtu = dst_mtu(&rt->u.dst);
- if ((iph->frag_off&__constant_htons(IP_DF)) && skb->len > mtu) {
- icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
- ip_rt_put(rt);
- IP_VS_DBG_RL("ip_vs_dr_xmit(): frag needed\n");
- goto tx_error;
- }
-
- /*
- * Call ip_send_check because we are not sure it is called
- * after ip_defrag. Is copy-on-write needed?
- */
- if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
- ip_rt_put(rt);
- return NF_STOLEN;
- }
- ip_send_check(skb->nh.iph);
-
- /* drop old route */
- dst_release(skb->dst);
- skb->dst = &rt->u.dst;
-
- /* Another hack: avoid icmp_send in ip_fragment */
- skb->local_df = 1;
-
- IP_VS_XMIT(skb, rt);
-
- LeaveFunction(10);
- return NF_STOLEN;
-
- tx_error_icmp:
- dst_link_failure(skb);
- tx_error:
- kfree_skb(skb);
- LeaveFunction(10);
- return NF_STOLEN;
-}
-
-
-/*
- * ICMP packet transmitter
- * called by the ip_vs_in_icmp
- */
-int
-ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
- struct ip_vs_protocol *pp, int offset)
-{
- struct rtable *rt; /* Route to the other host */
- int mtu;
- int rc;
-
- EnterFunction(10);
-
- /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
- forwarded directly here, because there is no need to
- translate address/port back */
- if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
- if (cp->packet_xmit)
- rc = cp->packet_xmit(skb, cp, pp);
- else
- rc = NF_ACCEPT;
- /* do not touch skb anymore */
- atomic_inc(&cp->in_pkts);
- goto out;
- }
-
- /*
- * mangle and send the packet here (only for VS/NAT)
- */
-
- if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(skb->nh.iph->tos))))
- goto tx_error_icmp;
-
- /* MTU checking */
- mtu = dst_mtu(&rt->u.dst);
- if ((skb->len > mtu) && (skb->nh.iph->frag_off&__constant_htons(IP_DF))) {
- ip_rt_put(rt);
- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
- IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n");
- goto tx_error;
- }
-
- /* copy-on-write the packet before mangling it */
- if (!ip_vs_make_skb_writable(&skb, offset))
- goto tx_error_put;
-
- if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
- goto tx_error_put;
-
- /* drop the old route when skb is not shared */
- dst_release(skb->dst);
- skb->dst = &rt->u.dst;
-
- ip_vs_nat_icmp(skb, pp, cp, 0);
-
- /* Another hack: avoid icmp_send in ip_fragment */
- skb->local_df = 1;
-
- IP_VS_XMIT(skb, rt);
-
- rc = NF_STOLEN;
- goto out;
-
- tx_error_icmp:
- dst_link_failure(skb);
- tx_error:
- dev_kfree_skb(skb);
- rc = NF_STOLEN;
- out:
- LeaveFunction(10);
- return rc;
- tx_error_put:
- ip_rt_put(rt);
- goto tx_error;
-}
diff --git a/net/ipv4/metrics.c b/net/ipv4/metrics.c
new file mode 100644
index 000000000000..8ddac1f595ed
--- /dev/null
+++ b/net/ipv4/metrics.c
@@ -0,0 +1,91 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/netlink.h>
+#include <linux/nospec.h>
+#include <linux/rtnetlink.h>
+#include <linux/types.h>
+#include <net/ip.h>
+#include <net/net_namespace.h>
+#include <net/tcp.h>
+
+static int ip_metrics_convert(struct nlattr *fc_mx,
+ int fc_mx_len, u32 *metrics,
+ struct netlink_ext_ack *extack)
+{
+ bool ecn_ca = false;
+ struct nlattr *nla;
+ int remaining;
+
+ nla_for_each_attr(nla, fc_mx, fc_mx_len, remaining) {
+ int type = nla_type(nla);
+ u32 val;
+
+ if (!type)
+ continue;
+ if (type > RTAX_MAX) {
+ NL_SET_ERR_MSG(extack, "Invalid metric type");
+ return -EINVAL;
+ }
+
+ type = array_index_nospec(type, RTAX_MAX + 1);
+ if (type == RTAX_CC_ALGO) {
+ char tmp[TCP_CA_NAME_MAX];
+
+ nla_strscpy(tmp, nla, sizeof(tmp));
+ val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
+ if (val == TCP_CA_UNSPEC) {
+ NL_SET_ERR_MSG(extack, "Unknown tcp congestion algorithm");
+ return -EINVAL;
+ }
+ } else {
+ if (nla_len(nla) != sizeof(u32)) {
+ NL_SET_ERR_MSG_ATTR(extack, nla,
+ "Invalid attribute in metrics");
+ return -EINVAL;
+ }
+ val = nla_get_u32(nla);
+ }
+ if (type == RTAX_ADVMSS && val > 65535 - 40)
+ val = 65535 - 40;
+ if (type == RTAX_MTU && val > 65535 - 15)
+ val = 65535 - 15;
+ if (type == RTAX_HOPLIMIT && val > 255)
+ val = 255;
+ if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) {
+ NL_SET_ERR_MSG(extack, "Unknown flag set in feature mask in metrics attribute");
+ return -EINVAL;
+ }
+ metrics[type - 1] = val;
+ }
+
+ if (ecn_ca)
+ metrics[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
+
+ return 0;
+}
+
+struct dst_metrics *ip_fib_metrics_init(struct nlattr *fc_mx,
+ int fc_mx_len,
+ struct netlink_ext_ack *extack)
+{
+ struct dst_metrics *fib_metrics;
+ int err;
+
+ if (!fc_mx)
+ return (struct dst_metrics *)&dst_default_metrics;
+
+ fib_metrics = kzalloc(sizeof(*fib_metrics), GFP_KERNEL);
+ if (unlikely(!fib_metrics))
+ return ERR_PTR(-ENOMEM);
+
+ err = ip_metrics_convert(fc_mx, fc_mx_len, fib_metrics->metrics,
+ extack);
+ if (!err) {
+ refcount_set(&fib_metrics->refcnt, 1);
+ } else {
+ kfree(fib_metrics);
+ fib_metrics = ERR_PTR(err);
+ }
+
+ return fib_metrics;
+}
+EXPORT_SYMBOL_GPL(ip_fib_metrics_init);
diff --git a/net/ipv4/multipath.c b/net/ipv4/multipath.c
deleted file mode 100644
index 4e9ca7c76407..000000000000
--- a/net/ipv4/multipath.c
+++ /dev/null
@@ -1,55 +0,0 @@
-/* multipath.c: IPV4 multipath algorithm support.
- *
- * Copyright (C) 2004, 2005 Einar Lueck <elueck@de.ibm.com>
- * Copyright (C) 2005 David S. Miller <davem@davemloft.net>
- */
-
-#include <linux/module.h>
-#include <linux/errno.h>
-#include <linux/netdevice.h>
-#include <linux/spinlock.h>
-
-#include <net/ip_mp_alg.h>
-
-static DEFINE_SPINLOCK(alg_table_lock);
-struct ip_mp_alg_ops *ip_mp_alg_table[IP_MP_ALG_MAX + 1];
-
-int multipath_alg_register(struct ip_mp_alg_ops *ops, enum ip_mp_alg n)
-{
- struct ip_mp_alg_ops **slot;
- int err;
-
- if (n < IP_MP_ALG_NONE || n > IP_MP_ALG_MAX ||
- !ops->mp_alg_select_route)
- return -EINVAL;
-
- spin_lock(&alg_table_lock);
- slot = &ip_mp_alg_table[n];
- if (*slot != NULL) {
- err = -EBUSY;
- } else {
- *slot = ops;
- err = 0;
- }
- spin_unlock(&alg_table_lock);
-
- return err;
-}
-EXPORT_SYMBOL(multipath_alg_register);
-
-void multipath_alg_unregister(struct ip_mp_alg_ops *ops, enum ip_mp_alg n)
-{
- struct ip_mp_alg_ops **slot;
-
- if (n < IP_MP_ALG_NONE || n > IP_MP_ALG_MAX)
- return;
-
- spin_lock(&alg_table_lock);
- slot = &ip_mp_alg_table[n];
- if (*slot == ops)
- *slot = NULL;
- spin_unlock(&alg_table_lock);
-
- synchronize_net();
-}
-EXPORT_SYMBOL(multipath_alg_unregister);
diff --git a/net/ipv4/multipath_drr.c b/net/ipv4/multipath_drr.c
deleted file mode 100644
index 252e837b17a5..000000000000
--- a/net/ipv4/multipath_drr.c
+++ /dev/null
@@ -1,250 +0,0 @@
-/*
- * Device round robin policy for multipath.
- *
- *
- * Version: $Id: multipath_drr.c,v 1.1.2.1 2004/09/16 07:42:34 elueck Exp $
- *
- * Authors: Einar Lueck <elueck@de.ibm.com><lkml@einar-lueck.de>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <asm/system.h>
-#include <asm/uaccess.h>
-#include <linux/types.h>
-#include <linux/sched.h>
-#include <linux/errno.h>
-#include <linux/timer.h>
-#include <linux/mm.h>
-#include <linux/kernel.h>
-#include <linux/fcntl.h>
-#include <linux/stat.h>
-#include <linux/socket.h>
-#include <linux/in.h>
-#include <linux/inet.h>
-#include <linux/netdevice.h>
-#include <linux/inetdevice.h>
-#include <linux/igmp.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-#include <linux/module.h>
-#include <linux/mroute.h>
-#include <linux/init.h>
-#include <net/ip.h>
-#include <net/protocol.h>
-#include <linux/skbuff.h>
-#include <net/sock.h>
-#include <net/icmp.h>
-#include <net/udp.h>
-#include <net/raw.h>
-#include <linux/notifier.h>
-#include <linux/if_arp.h>
-#include <linux/netfilter_ipv4.h>
-#include <net/ipip.h>
-#include <net/checksum.h>
-#include <net/ip_mp_alg.h>
-
-struct multipath_device {
- int ifi; /* interface index of device */
- atomic_t usecount;
- int allocated;
-};
-
-#define MULTIPATH_MAX_DEVICECANDIDATES 10
-
-static struct multipath_device state[MULTIPATH_MAX_DEVICECANDIDATES];
-static DEFINE_SPINLOCK(state_lock);
-
-static int inline __multipath_findslot(void)
-{
- int i;
-
- for (i = 0; i < MULTIPATH_MAX_DEVICECANDIDATES; i++) {
- if (state[i].allocated == 0)
- return i;
- }
- return -1;
-}
-
-static int inline __multipath_finddev(int ifindex)
-{
- int i;
-
- for (i = 0; i < MULTIPATH_MAX_DEVICECANDIDATES; i++) {
- if (state[i].allocated != 0 &&
- state[i].ifi == ifindex)
- return i;
- }
- return -1;
-}
-
-static int drr_dev_event(struct notifier_block *this,
- unsigned long event, void *ptr)
-{
- struct net_device *dev = ptr;
- int devidx;
-
- switch (event) {
- case NETDEV_UNREGISTER:
- case NETDEV_DOWN:
- spin_lock_bh(&state_lock);
-
- devidx = __multipath_finddev(dev->ifindex);
- if (devidx != -1) {
- state[devidx].allocated = 0;
- state[devidx].ifi = 0;
- atomic_set(&state[devidx].usecount, 0);
- }
-
- spin_unlock_bh(&state_lock);
- break;
- };
-
- return NOTIFY_DONE;
-}
-
-static struct notifier_block drr_dev_notifier = {
- .notifier_call = drr_dev_event,
-};
-
-
-static void drr_safe_inc(atomic_t *usecount)
-{
- int n;
-
- atomic_inc(usecount);
-
- n = atomic_read(usecount);
- if (n <= 0) {
- int i;
-
- spin_lock_bh(&state_lock);
-
- for (i = 0; i < MULTIPATH_MAX_DEVICECANDIDATES; i++)
- atomic_set(&state[i].usecount, 0);
-
- spin_unlock_bh(&state_lock);
- }
-}
-
-static void drr_select_route(const struct flowi *flp,
- struct rtable *first, struct rtable **rp)
-{
- struct rtable *nh, *result, *cur_min;
- int min_usecount = -1;
- int devidx = -1;
- int cur_min_devidx = -1;
-
- /* 1. make sure all alt. nexthops have the same GC related data */
- /* 2. determine the new candidate to be returned */
- result = NULL;
- cur_min = NULL;
- for (nh = rcu_dereference(first); nh;
- nh = rcu_dereference(nh->u.rt_next)) {
- if ((nh->u.dst.flags & DST_BALANCED) != 0 &&
- multipath_comparekeys(&nh->fl, flp)) {
- int nh_ifidx = nh->u.dst.dev->ifindex;
-
- nh->u.dst.lastuse = jiffies;
- nh->u.dst.__use++;
- if (result != NULL)
- continue;
-
- /* search for the output interface */
-
- /* this is not SMP safe, only add/remove are
- * SMP safe as wrong usecount updates have no big
- * impact
- */
- devidx = __multipath_finddev(nh_ifidx);
- if (devidx == -1) {
- /* add the interface to the array
- * SMP safe
- */
- spin_lock_bh(&state_lock);
-
- /* due to SMP: search again */
- devidx = __multipath_finddev(nh_ifidx);
- if (devidx == -1) {
- /* add entry for device */
- devidx = __multipath_findslot();
- if (devidx == -1) {
- /* unlikely but possible */
- continue;
- }
-
- state[devidx].allocated = 1;
- state[devidx].ifi = nh_ifidx;
- atomic_set(&state[devidx].usecount, 0);
- min_usecount = 0;
- }
-
- spin_unlock_bh(&state_lock);
- }
-
- if (min_usecount == 0) {
- /* if the device has not been used it is
- * the primary target
- */
- drr_safe_inc(&state[devidx].usecount);
- result = nh;
- } else {
- int count =
- atomic_read(&state[devidx].usecount);
-
- if (min_usecount == -1 ||
- count < min_usecount) {
- cur_min = nh;
- cur_min_devidx = devidx;
- min_usecount = count;
- }
- }
- }
- }
-
- if (!result) {
- if (cur_min) {
- drr_safe_inc(&state[cur_min_devidx].usecount);
- result = cur_min;
- } else {
- result = first;
- }
- }
-
- *rp = result;
-}
-
-static struct ip_mp_alg_ops drr_ops = {
- .mp_alg_select_route = drr_select_route,
-};
-
-static int __init drr_init(void)
-{
- int err = register_netdevice_notifier(&drr_dev_notifier);
-
- if (err)
- return err;
-
- err = multipath_alg_register(&drr_ops, IP_MP_ALG_DRR);
- if (err)
- goto fail;
-
- return 0;
-
-fail:
- unregister_netdevice_notifier(&drr_dev_notifier);
- return err;
-}
-
-static void __exit drr_exit(void)
-{
- unregister_netdevice_notifier(&drr_dev_notifier);
- multipath_alg_unregister(&drr_ops, IP_MP_ALG_DRR);
-}
-
-module_init(drr_init);
-module_exit(drr_exit);
-MODULE_LICENSE("GPL");
diff --git a/net/ipv4/multipath_random.c b/net/ipv4/multipath_random.c
deleted file mode 100644
index b8c289f247cb..000000000000
--- a/net/ipv4/multipath_random.c
+++ /dev/null
@@ -1,129 +0,0 @@
-/*
- * Random policy for multipath.
- *
- *
- * Version: $Id: multipath_random.c,v 1.1.2.3 2004/09/21 08:42:11 elueck Exp $
- *
- * Authors: Einar Lueck <elueck@de.ibm.com><lkml@einar-lueck.de>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <asm/system.h>
-#include <asm/uaccess.h>
-#include <linux/types.h>
-#include <linux/sched.h>
-#include <linux/errno.h>
-#include <linux/timer.h>
-#include <linux/mm.h>
-#include <linux/kernel.h>
-#include <linux/fcntl.h>
-#include <linux/stat.h>
-#include <linux/socket.h>
-#include <linux/in.h>
-#include <linux/inet.h>
-#include <linux/netdevice.h>
-#include <linux/inetdevice.h>
-#include <linux/igmp.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-#include <linux/module.h>
-#include <linux/mroute.h>
-#include <linux/init.h>
-#include <net/ip.h>
-#include <net/protocol.h>
-#include <linux/skbuff.h>
-#include <net/sock.h>
-#include <net/icmp.h>
-#include <net/udp.h>
-#include <net/raw.h>
-#include <linux/notifier.h>
-#include <linux/if_arp.h>
-#include <linux/netfilter_ipv4.h>
-#include <net/ipip.h>
-#include <net/checksum.h>
-#include <net/ip_mp_alg.h>
-
-#define MULTIPATH_MAX_CANDIDATES 40
-
-/* interface to random number generation */
-static unsigned int RANDOM_SEED = 93186752;
-
-static inline unsigned int random(unsigned int ubound)
-{
- static unsigned int a = 1588635695,
- q = 2,
- r = 1117695901;
-
- RANDOM_SEED = a*(RANDOM_SEED % q) - r*(RANDOM_SEED / q);
-
- return RANDOM_SEED % ubound;
-}
-
-
-static void random_select_route(const struct flowi *flp,
- struct rtable *first,
- struct rtable **rp)
-{
- struct rtable *rt;
- struct rtable *decision;
- unsigned char candidate_count = 0;
-
- /* count all candidate */
- for (rt = rcu_dereference(first); rt;
- rt = rcu_dereference(rt->u.rt_next)) {
- if ((rt->u.dst.flags & DST_BALANCED) != 0 &&
- multipath_comparekeys(&rt->fl, flp))
- ++candidate_count;
- }
-
- /* choose a random candidate */
- decision = first;
- if (candidate_count > 1) {
- unsigned char i = 0;
- unsigned char candidate_no = (unsigned char)
- random(candidate_count);
-
- /* find chosen candidate and adjust GC data for all candidates
- * to ensure they stay in cache
- */
- for (rt = first; rt; rt = rt->u.rt_next) {
- if ((rt->u.dst.flags & DST_BALANCED) != 0 &&
- multipath_comparekeys(&rt->fl, flp)) {
- rt->u.dst.lastuse = jiffies;
-
- if (i == candidate_no)
- decision = rt;
-
- if (i >= candidate_count)
- break;
-
- i++;
- }
- }
- }
-
- decision->u.dst.__use++;
- *rp = decision;
-}
-
-static struct ip_mp_alg_ops random_ops = {
- .mp_alg_select_route = random_select_route,
-};
-
-static int __init random_init(void)
-{
- return multipath_alg_register(&random_ops, IP_MP_ALG_RANDOM);
-}
-
-static void __exit random_exit(void)
-{
- multipath_alg_unregister(&random_ops, IP_MP_ALG_RANDOM);
-}
-
-module_init(random_init);
-module_exit(random_exit);
-MODULE_LICENSE("GPL");
diff --git a/net/ipv4/multipath_rr.c b/net/ipv4/multipath_rr.c
deleted file mode 100644
index bba5abe5542d..000000000000
--- a/net/ipv4/multipath_rr.c
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Round robin policy for multipath.
- *
- *
- * Version: $Id: multipath_rr.c,v 1.1.2.2 2004/09/16 07:42:34 elueck Exp $
- *
- * Authors: Einar Lueck <elueck@de.ibm.com><lkml@einar-lueck.de>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <asm/system.h>
-#include <asm/uaccess.h>
-#include <linux/types.h>
-#include <linux/sched.h>
-#include <linux/errno.h>
-#include <linux/timer.h>
-#include <linux/mm.h>
-#include <linux/kernel.h>
-#include <linux/fcntl.h>
-#include <linux/stat.h>
-#include <linux/socket.h>
-#include <linux/in.h>
-#include <linux/inet.h>
-#include <linux/netdevice.h>
-#include <linux/inetdevice.h>
-#include <linux/igmp.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-#include <linux/module.h>
-#include <linux/mroute.h>
-#include <linux/init.h>
-#include <net/ip.h>
-#include <net/protocol.h>
-#include <linux/skbuff.h>
-#include <net/sock.h>
-#include <net/icmp.h>
-#include <net/udp.h>
-#include <net/raw.h>
-#include <linux/notifier.h>
-#include <linux/if_arp.h>
-#include <linux/netfilter_ipv4.h>
-#include <net/ipip.h>
-#include <net/checksum.h>
-#include <net/ip_mp_alg.h>
-
-static void rr_select_route(const struct flowi *flp,
- struct rtable *first, struct rtable **rp)
-{
- struct rtable *nh, *result, *min_use_cand = NULL;
- int min_use = -1;
-
- /* 1. make sure all alt. nexthops have the same GC related data
- * 2. determine the new candidate to be returned
- */
- result = NULL;
- for (nh = rcu_dereference(first); nh;
- nh = rcu_dereference(nh->u.rt_next)) {
- if ((nh->u.dst.flags & DST_BALANCED) != 0 &&
- multipath_comparekeys(&nh->fl, flp)) {
- nh->u.dst.lastuse = jiffies;
-
- if (min_use == -1 || nh->u.dst.__use < min_use) {
- min_use = nh->u.dst.__use;
- min_use_cand = nh;
- }
- }
- }
- result = min_use_cand;
- if (!result)
- result = first;
-
- result->u.dst.__use++;
- *rp = result;
-}
-
-static struct ip_mp_alg_ops rr_ops = {
- .mp_alg_select_route = rr_select_route,
-};
-
-static int __init rr_init(void)
-{
- return multipath_alg_register(&rr_ops, IP_MP_ALG_RR);
-}
-
-static void __exit rr_exit(void)
-{
- multipath_alg_unregister(&rr_ops, IP_MP_ALG_RR);
-}
-
-module_init(rr_init);
-module_exit(rr_exit);
-MODULE_LICENSE("GPL");
diff --git a/net/ipv4/multipath_wrandom.c b/net/ipv4/multipath_wrandom.c
deleted file mode 100644
index 92b04823e034..000000000000
--- a/net/ipv4/multipath_wrandom.c
+++ /dev/null
@@ -1,341 +0,0 @@
-/*
- * Weighted random policy for multipath.
- *
- *
- * Version: $Id: multipath_wrandom.c,v 1.1.2.3 2004/09/22 07:51:40 elueck Exp $
- *
- * Authors: Einar Lueck <elueck@de.ibm.com><lkml@einar-lueck.de>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <asm/system.h>
-#include <asm/uaccess.h>
-#include <linux/types.h>
-#include <linux/sched.h>
-#include <linux/errno.h>
-#include <linux/timer.h>
-#include <linux/mm.h>
-#include <linux/kernel.h>
-#include <linux/fcntl.h>
-#include <linux/stat.h>
-#include <linux/socket.h>
-#include <linux/in.h>
-#include <linux/inet.h>
-#include <linux/netdevice.h>
-#include <linux/inetdevice.h>
-#include <linux/igmp.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-#include <linux/module.h>
-#include <linux/mroute.h>
-#include <linux/init.h>
-#include <net/ip.h>
-#include <net/protocol.h>
-#include <linux/skbuff.h>
-#include <net/sock.h>
-#include <net/icmp.h>
-#include <net/udp.h>
-#include <net/raw.h>
-#include <linux/notifier.h>
-#include <linux/if_arp.h>
-#include <linux/netfilter_ipv4.h>
-#include <net/ipip.h>
-#include <net/checksum.h>
-#include <net/ip_fib.h>
-#include <net/ip_mp_alg.h>
-
-#define MULTIPATH_STATE_SIZE 15
-
-struct multipath_candidate {
- struct multipath_candidate *next;
- int power;
- struct rtable *rt;
-};
-
-struct multipath_dest {
- struct list_head list;
-
- const struct fib_nh *nh_info;
- __be32 netmask;
- __be32 network;
- unsigned char prefixlen;
-
- struct rcu_head rcu;
-};
-
-struct multipath_bucket {
- struct list_head head;
- spinlock_t lock;
-};
-
-struct multipath_route {
- struct list_head list;
-
- int oif;
- __be32 gw;
- struct list_head dests;
-
- struct rcu_head rcu;
-};
-
-/* state: primarily weight per route information */
-static struct multipath_bucket state[MULTIPATH_STATE_SIZE];
-
-/* interface to random number generation */
-static unsigned int RANDOM_SEED = 93186752;
-
-static inline unsigned int random(unsigned int ubound)
-{
- static unsigned int a = 1588635695,
- q = 2,
- r = 1117695901;
- RANDOM_SEED = a*(RANDOM_SEED % q) - r*(RANDOM_SEED / q);
- return RANDOM_SEED % ubound;
-}
-
-static unsigned char __multipath_lookup_weight(const struct flowi *fl,
- const struct rtable *rt)
-{
- const int state_idx = rt->idev->dev->ifindex % MULTIPATH_STATE_SIZE;
- struct multipath_route *r;
- struct multipath_route *target_route = NULL;
- struct multipath_dest *d;
- int weight = 1;
-
- /* lookup the weight information for a certain route */
- rcu_read_lock();
-
- /* find state entry for gateway or add one if necessary */
- list_for_each_entry_rcu(r, &state[state_idx].head, list) {
- if (r->gw == rt->rt_gateway &&
- r->oif == rt->idev->dev->ifindex) {
- target_route = r;
- break;
- }
- }
-
- if (!target_route) {
- /* this should not happen... but we are prepared */
- printk( KERN_CRIT"%s: missing state for gateway: %u and " \
- "device %d\n", __FUNCTION__, rt->rt_gateway,
- rt->idev->dev->ifindex);
- goto out;
- }
-
- /* find state entry for destination */
- list_for_each_entry_rcu(d, &target_route->dests, list) {
- __be32 targetnetwork = fl->fl4_dst &
- inet_make_mask(d->prefixlen);
-
- if ((targetnetwork & d->netmask) == d->network) {
- weight = d->nh_info->nh_weight;
- goto out;
- }
- }
-
-out:
- rcu_read_unlock();
- return weight;
-}
-
-static void wrandom_init_state(void)
-{
- int i;
-
- for (i = 0; i < MULTIPATH_STATE_SIZE; ++i) {
- INIT_LIST_HEAD(&state[i].head);
- spin_lock_init(&state[i].lock);
- }
-}
-
-static void wrandom_select_route(const struct flowi *flp,
- struct rtable *first,
- struct rtable **rp)
-{
- struct rtable *rt;
- struct rtable *decision;
- struct multipath_candidate *first_mpc = NULL;
- struct multipath_candidate *mpc, *last_mpc = NULL;
- int power = 0;
- int last_power;
- int selector;
- const size_t size_mpc = sizeof(struct multipath_candidate);
-
- /* collect all candidates and identify their weights */
- for (rt = rcu_dereference(first); rt;
- rt = rcu_dereference(rt->u.rt_next)) {
- if ((rt->u.dst.flags & DST_BALANCED) != 0 &&
- multipath_comparekeys(&rt->fl, flp)) {
- struct multipath_candidate* mpc =
- (struct multipath_candidate*)
- kmalloc(size_mpc, GFP_ATOMIC);
-
- if (!mpc)
- return;
-
- power += __multipath_lookup_weight(flp, rt) * 10000;
-
- mpc->power = power;
- mpc->rt = rt;
- mpc->next = NULL;
-
- if (!first_mpc)
- first_mpc = mpc;
- else
- last_mpc->next = mpc;
-
- last_mpc = mpc;
- }
- }
-
- /* choose a weighted random candidate */
- decision = first;
- selector = random(power);
- last_power = 0;
-
- /* select candidate, adjust GC data and cleanup local state */
- decision = first;
- last_mpc = NULL;
- for (mpc = first_mpc; mpc; mpc = mpc->next) {
- mpc->rt->u.dst.lastuse = jiffies;
- if (last_power <= selector && selector < mpc->power)
- decision = mpc->rt;
-
- last_power = mpc->power;
- kfree(last_mpc);
- last_mpc = mpc;
- }
-
- /* concurrent __multipath_flush may lead to !last_mpc */
- kfree(last_mpc);
-
- decision->u.dst.__use++;
- *rp = decision;
-}
-
-static void wrandom_set_nhinfo(__be32 network,
- __be32 netmask,
- unsigned char prefixlen,
- const struct fib_nh *nh)
-{
- const int state_idx = nh->nh_oif % MULTIPATH_STATE_SIZE;
- struct multipath_route *r, *target_route = NULL;
- struct multipath_dest *d, *target_dest = NULL;
-
- /* store the weight information for a certain route */
- spin_lock_bh(&state[state_idx].lock);
-
- /* find state entry for gateway or add one if necessary */
- list_for_each_entry_rcu(r, &state[state_idx].head, list) {
- if (r->gw == nh->nh_gw && r->oif == nh->nh_oif) {
- target_route = r;
- break;
- }
- }
-
- if (!target_route) {
- const size_t size_rt = sizeof(struct multipath_route);
- target_route = (struct multipath_route *)
- kmalloc(size_rt, GFP_ATOMIC);
-
- target_route->gw = nh->nh_gw;
- target_route->oif = nh->nh_oif;
- memset(&target_route->rcu, 0, sizeof(struct rcu_head));
- INIT_LIST_HEAD(&target_route->dests);
-
- list_add_rcu(&target_route->list, &state[state_idx].head);
- }
-
- /* find state entry for destination or add one if necessary */
- list_for_each_entry_rcu(d, &target_route->dests, list) {
- if (d->nh_info == nh) {
- target_dest = d;
- break;
- }
- }
-
- if (!target_dest) {
- const size_t size_dst = sizeof(struct multipath_dest);
- target_dest = (struct multipath_dest*)
- kmalloc(size_dst, GFP_ATOMIC);
-
- target_dest->nh_info = nh;
- target_dest->network = network;
- target_dest->netmask = netmask;
- target_dest->prefixlen = prefixlen;
- memset(&target_dest->rcu, 0, sizeof(struct rcu_head));
-
- list_add_rcu(&target_dest->list, &target_route->dests);
- }
- /* else: we already stored this info for another destination =>
- * we are finished
- */
-
- spin_unlock_bh(&state[state_idx].lock);
-}
-
-static void __multipath_free(struct rcu_head *head)
-{
- struct multipath_route *rt = container_of(head, struct multipath_route,
- rcu);
- kfree(rt);
-}
-
-static void __multipath_free_dst(struct rcu_head *head)
-{
- struct multipath_dest *dst = container_of(head,
- struct multipath_dest,
- rcu);
- kfree(dst);
-}
-
-static void wrandom_flush(void)
-{
- int i;
-
- /* defere delete to all entries */
- for (i = 0; i < MULTIPATH_STATE_SIZE; ++i) {
- struct multipath_route *r;
-
- spin_lock_bh(&state[i].lock);
- list_for_each_entry_rcu(r, &state[i].head, list) {
- struct multipath_dest *d;
- list_for_each_entry_rcu(d, &r->dests, list) {
- list_del_rcu(&d->list);
- call_rcu(&d->rcu,
- __multipath_free_dst);
- }
- list_del_rcu(&r->list);
- call_rcu(&r->rcu,
- __multipath_free);
- }
-
- spin_unlock_bh(&state[i].lock);
- }
-}
-
-static struct ip_mp_alg_ops wrandom_ops = {
- .mp_alg_select_route = wrandom_select_route,
- .mp_alg_flush = wrandom_flush,
- .mp_alg_set_nhinfo = wrandom_set_nhinfo,
-};
-
-static int __init wrandom_init(void)
-{
- wrandom_init_state();
-
- return multipath_alg_register(&wrandom_ops, IP_MP_ALG_WRANDOM);
-}
-
-static void __exit wrandom_exit(void)
-{
- multipath_alg_unregister(&wrandom_ops, IP_MP_ALG_WRANDOM);
-}
-
-module_init(wrandom_init);
-module_exit(wrandom_exit);
-MODULE_LICENSE("GPL");
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index a68966059b50..ce310eb779e0 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -1,216 +1,99 @@
-/* IPv4 specific functions of netfilter core */
+/*
+ * IPv4 specific functions of netfilter core
+ *
+ * Rusty Russell (C) 2000 -- This code is GPL.
+ * Patrick McHardy (C) 2006-2012
+ */
#include <linux/kernel.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/gfp.h>
+#include <linux/export.h>
+#include <net/flow.h>
#include <net/route.h>
#include <net/xfrm.h>
#include <net/ip.h>
+#include <net/netfilter/nf_queue.h>
/* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */
-int ip_route_me_harder(struct sk_buff **pskb, unsigned addr_type)
+int ip_route_me_harder(struct net *net, struct sock *sk, struct sk_buff *skb, unsigned int addr_type)
{
- struct iphdr *iph = (*pskb)->nh.iph;
+ struct net_device *dev = skb_dst_dev(skb);
+ const struct iphdr *iph = ip_hdr(skb);
struct rtable *rt;
- struct flowi fl = {};
- struct dst_entry *odst;
+ struct flowi4 fl4 = {};
+ __be32 saddr = iph->saddr;
+ __u8 flags;
+ struct flow_keys flkeys;
unsigned int hh_len;
+ sk = sk_to_full_sk(sk);
+ flags = sk ? inet_sk_flowi_flags(sk) : 0;
+
if (addr_type == RTN_UNSPEC)
- addr_type = inet_addr_type(iph->saddr);
+ addr_type = inet_addr_type_dev_table(net, dev, saddr);
+ if (addr_type == RTN_LOCAL || addr_type == RTN_UNICAST)
+ flags |= FLOWI_FLAG_ANYSRC;
+ else
+ saddr = 0;
/* some non-standard hacks like ipt_REJECT.c:send_reset() can cause
- * packets with foreign saddr to appear on the NF_IP_LOCAL_OUT hook.
+ * packets with foreign saddr to appear on the NF_INET_LOCAL_OUT hook.
*/
- if (addr_type == RTN_LOCAL) {
- fl.nl_u.ip4_u.daddr = iph->daddr;
- fl.nl_u.ip4_u.saddr = iph->saddr;
- fl.nl_u.ip4_u.tos = RT_TOS(iph->tos);
- fl.oif = (*pskb)->sk ? (*pskb)->sk->sk_bound_dev_if : 0;
- fl.mark = (*pskb)->mark;
- if (ip_route_output_key(&rt, &fl) != 0)
- return -1;
-
- /* Drop old route. */
- dst_release((*pskb)->dst);
- (*pskb)->dst = &rt->u.dst;
- } else {
- /* non-local src, find valid iif to satisfy
- * rp-filter when calling ip_route_input. */
- fl.nl_u.ip4_u.daddr = iph->saddr;
- if (ip_route_output_key(&rt, &fl) != 0)
- return -1;
-
- odst = (*pskb)->dst;
- if (ip_route_input(*pskb, iph->daddr, iph->saddr,
- RT_TOS(iph->tos), rt->u.dst.dev) != 0) {
- dst_release(&rt->u.dst);
- return -1;
- }
- dst_release(&rt->u.dst);
- dst_release(odst);
- }
-
- if ((*pskb)->dst->error)
- return -1;
+ fl4.daddr = iph->daddr;
+ fl4.saddr = saddr;
+ fl4.flowi4_dscp = ip4h_dscp(iph);
+ fl4.flowi4_oif = sk ? sk->sk_bound_dev_if : 0;
+ fl4.flowi4_l3mdev = l3mdev_master_ifindex(dev);
+ fl4.flowi4_mark = skb->mark;
+ fl4.flowi4_flags = flags;
+ fib4_rules_early_flow_dissect(net, skb, &fl4, &flkeys);
+ rt = ip_route_output_key(net, &fl4);
+ if (IS_ERR(rt))
+ return PTR_ERR(rt);
+
+ /* Drop old route. */
+ skb_dst_drop(skb);
+ skb_dst_set(skb, &rt->dst);
+
+ if (skb_dst(skb)->error)
+ return skb_dst(skb)->error;
#ifdef CONFIG_XFRM
- if (!(IPCB(*pskb)->flags & IPSKB_XFRM_TRANSFORMED) &&
- xfrm_decode_session(*pskb, &fl, AF_INET) == 0)
- if (xfrm_lookup(&(*pskb)->dst, &fl, (*pskb)->sk, 0))
- return -1;
+ if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
+ xfrm_decode_session(net, skb, flowi4_to_flowi(&fl4), AF_INET) == 0) {
+ struct dst_entry *dst = skb_dst(skb);
+ /* ignore return value from skb_dstref_steal, xfrm_lookup takes
+ * care of dropping the refcnt if needed.
+ */
+ skb_dstref_steal(skb);
+ dst = xfrm_lookup(net, dst, flowi4_to_flowi(&fl4), sk, 0);
+ if (IS_ERR(dst))
+ return PTR_ERR(dst);
+ skb_dst_set(skb, dst);
+ }
#endif
/* Change in oif may mean change in hh_len. */
- hh_len = (*pskb)->dst->dev->hard_header_len;
- if (skb_headroom(*pskb) < hh_len) {
- struct sk_buff *nskb;
-
- nskb = skb_realloc_headroom(*pskb, hh_len);
- if (!nskb)
- return -1;
- if ((*pskb)->sk)
- skb_set_owner_w(nskb, (*pskb)->sk);
- kfree_skb(*pskb);
- *pskb = nskb;
- }
+ hh_len = skb_dst_dev(skb)->hard_header_len;
+ if (skb_headroom(skb) < hh_len &&
+ pskb_expand_head(skb, HH_DATA_ALIGN(hh_len - skb_headroom(skb)),
+ 0, GFP_ATOMIC))
+ return -ENOMEM;
return 0;
}
EXPORT_SYMBOL(ip_route_me_harder);
-#ifdef CONFIG_XFRM
-int ip_xfrm_me_harder(struct sk_buff **pskb)
-{
- struct flowi fl;
- unsigned int hh_len;
- struct dst_entry *dst;
-
- if (IPCB(*pskb)->flags & IPSKB_XFRM_TRANSFORMED)
- return 0;
- if (xfrm_decode_session(*pskb, &fl, AF_INET) < 0)
- return -1;
-
- dst = (*pskb)->dst;
- if (dst->xfrm)
- dst = ((struct xfrm_dst *)dst)->route;
- dst_hold(dst);
-
- if (xfrm_lookup(&dst, &fl, (*pskb)->sk, 0) < 0)
- return -1;
-
- dst_release((*pskb)->dst);
- (*pskb)->dst = dst;
-
- /* Change in oif may mean change in hh_len. */
- hh_len = (*pskb)->dst->dev->hard_header_len;
- if (skb_headroom(*pskb) < hh_len) {
- struct sk_buff *nskb;
-
- nskb = skb_realloc_headroom(*pskb, hh_len);
- if (!nskb)
- return -1;
- if ((*pskb)->sk)
- skb_set_owner_w(nskb, (*pskb)->sk);
- kfree_skb(*pskb);
- *pskb = nskb;
- }
- return 0;
-}
-EXPORT_SYMBOL(ip_xfrm_me_harder);
-#endif
-
-void (*ip_nat_decode_session)(struct sk_buff *, struct flowi *);
-EXPORT_SYMBOL(ip_nat_decode_session);
-
-/*
- * Extra routing may needed on local out, as the QUEUE target never
- * returns control to the table.
- */
-
-struct ip_rt_info {
- __be32 daddr;
- __be32 saddr;
- u_int8_t tos;
-};
-
-static void nf_ip_saveroute(const struct sk_buff *skb, struct nf_info *info)
+int nf_ip_route(struct net *net, struct dst_entry **dst, struct flowi *fl,
+ bool strict __always_unused)
{
- struct ip_rt_info *rt_info = nf_info_reroute(info);
-
- if (info->hook == NF_IP_LOCAL_OUT) {
- const struct iphdr *iph = skb->nh.iph;
-
- rt_info->tos = iph->tos;
- rt_info->daddr = iph->daddr;
- rt_info->saddr = iph->saddr;
- }
-}
-
-static int nf_ip_reroute(struct sk_buff **pskb, const struct nf_info *info)
-{
- const struct ip_rt_info *rt_info = nf_info_reroute(info);
-
- if (info->hook == NF_IP_LOCAL_OUT) {
- struct iphdr *iph = (*pskb)->nh.iph;
-
- if (!(iph->tos == rt_info->tos
- && iph->daddr == rt_info->daddr
- && iph->saddr == rt_info->saddr))
- return ip_route_me_harder(pskb, RTN_UNSPEC);
- }
+ struct rtable *rt = ip_route_output_key(net, &fl->u.ip4);
+ if (IS_ERR(rt))
+ return PTR_ERR(rt);
+ *dst = &rt->dst;
return 0;
}
-
-__sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook,
- unsigned int dataoff, u_int8_t protocol)
-{
- struct iphdr *iph = skb->nh.iph;
- __sum16 csum = 0;
-
- switch (skb->ip_summed) {
- case CHECKSUM_COMPLETE:
- if (hook != NF_IP_PRE_ROUTING && hook != NF_IP_LOCAL_IN)
- break;
- if ((protocol == 0 && !csum_fold(skb->csum)) ||
- !csum_tcpudp_magic(iph->saddr, iph->daddr,
- skb->len - dataoff, protocol,
- skb->csum)) {
- skb->ip_summed = CHECKSUM_UNNECESSARY;
- break;
- }
- /* fall through */
- case CHECKSUM_NONE:
- if (protocol == 0)
- skb->csum = 0;
- else
- skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
- skb->len - dataoff,
- protocol, 0);
- csum = __skb_checksum_complete(skb);
- }
- return csum;
-}
-
-EXPORT_SYMBOL(nf_ip_checksum);
-
-static struct nf_afinfo nf_ip_afinfo = {
- .family = AF_INET,
- .checksum = nf_ip_checksum,
- .saveroute = nf_ip_saveroute,
- .reroute = nf_ip_reroute,
- .route_key_size = sizeof(struct ip_rt_info),
-};
-
-static int ipv4_netfilter_init(void)
-{
- return nf_register_afinfo(&nf_ip_afinfo);
-}
-
-static void ipv4_netfilter_fini(void)
-{
- nf_unregister_afinfo(&nf_ip_afinfo);
-}
-
-module_init(ipv4_netfilter_init);
-module_exit(ipv4_netfilter_fini);
+EXPORT_SYMBOL_GPL(nf_ip_route);
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index bc298a3f236f..7dc9772fe2d8 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# IP netfilter configuration
#
@@ -5,242 +6,131 @@
menu "IP: Netfilter Configuration"
depends on INET && NETFILTER
-config NF_CONNTRACK_IPV4
- tristate "IPv4 support for new connection tracking (EXPERIMENTAL)"
- depends on EXPERIMENTAL && NF_CONNTRACK
- ---help---
- Connection tracking keeps a record of what packets have passed
- through your machine, in order to figure out how they are related
- into connections.
-
- This is IPv4 support on Layer 3 independent connection tracking.
- Layer 3 independent connection tracking is experimental scheme
- which generalize ip_conntrack to support other layer 3 protocols.
-
- To compile it as a module, choose M here. If unsure, say N.
+config NF_DEFRAG_IPV4
+ tristate
+ default n
-config NF_CONNTRACK_PROC_COMPAT
- bool "proc/sysctl compatibility with old connection tracking"
- depends on NF_CONNTRACK
- default y
+# old sockopt interface and eval loop
+config IP_NF_IPTABLES_LEGACY
+ tristate "Legacy IP tables support"
+ depends on NETFILTER_XTABLES_LEGACY
+ depends on NETFILTER_XTABLES
+ default m if NETFILTER_XTABLES_LEGACY
help
- This option enables /proc and sysctl compatibility with the old
- layer 3 dependant connection tracking. This is needed to keep
- old programs that have not been adapted to the new names working.
-
- If unsure, say Y.
+ iptables is a legacy packet classifier.
+ This is not needed if you are using iptables over nftables
+ (iptables-nft).
-# connection tracking, helpers and protocols
-config IP_NF_CONNTRACK
- tristate "Connection tracking (required for masq/NAT)"
- ---help---
- Connection tracking keeps a record of what packets have passed
- through your machine, in order to figure out how they are related
- into connections.
+config NF_SOCKET_IPV4
+ tristate "IPv4 socket lookup support"
+ help
+ This option enables the IPv4 socket lookup infrastructure. This is
+ is required by the {ip,nf}tables socket match.
- This is required to do Masquerading or other kinds of Network
- Address Translation (except for Fast NAT). It can also be used to
- enhance packet filtering (see `Connection state match support'
- below).
+config NF_TPROXY_IPV4
+ tristate "IPv4 tproxy support"
- To compile it as a module, choose M here. If unsure, say N.
+if NF_TABLES
-config IP_NF_CT_ACCT
- bool "Connection tracking flow accounting"
- depends on IP_NF_CONNTRACK
+config NF_TABLES_IPV4
+ bool "IPv4 nf_tables support"
help
- If this option is enabled, the connection tracking code will
- keep per-flow packet and byte counters.
+ This option enables the IPv4 support for nf_tables.
- Those counters can be used for flow-based accounting or the
- `connbytes' match.
+if NF_TABLES_IPV4
- If unsure, say `N'.
+config NFT_REJECT_IPV4
+ select NF_REJECT_IPV4
+ default NFT_REJECT
+ tristate
-config IP_NF_CONNTRACK_MARK
- bool 'Connection mark tracking support'
- depends on IP_NF_CONNTRACK
- help
- This option enables support for connection marks, used by the
- `CONNMARK' target and `connmark' match. Similar to the mark value
- of packets, but this mark value is kept in the conntrack session
- instead of the individual packets.
-
-config IP_NF_CONNTRACK_SECMARK
- bool 'Connection tracking security mark support'
- depends on IP_NF_CONNTRACK && NETWORK_SECMARK
+config NFT_DUP_IPV4
+ tristate "IPv4 nf_tables packet duplication support"
+ depends on !NF_CONNTRACK || NF_CONNTRACK
+ select NF_DUP_IPV4
help
- This option enables security markings to be applied to
- connections. Typically they are copied to connections from
- packets using the CONNSECMARK target and copied back from
- connections to packets with the same target, with the packets
- being originally labeled via SECMARK.
+ This module enables IPv4 packet duplication support for nf_tables.
- If unsure, say 'N'.
-
-config IP_NF_CONNTRACK_EVENTS
- bool "Connection tracking events (EXPERIMENTAL)"
- depends on EXPERIMENTAL && IP_NF_CONNTRACK
- help
- If this option is enabled, the connection tracking code will
- provide a notifier chain that can be used by other kernel code
- to get notified about changes in the connection tracking state.
-
- IF unsure, say `N'.
-
-config IP_NF_CONNTRACK_NETLINK
- tristate 'Connection tracking netlink interface (EXPERIMENTAL)'
- depends on EXPERIMENTAL && IP_NF_CONNTRACK && NETFILTER_NETLINK
- depends on IP_NF_CONNTRACK!=y || NETFILTER_NETLINK!=m
- depends on IP_NF_NAT=n || IP_NF_NAT
+config NFT_FIB_IPV4
+ select NFT_FIB
+ tristate "nf_tables fib / ip route lookup support"
help
- This option enables support for a netlink-based userspace interface
+ This module enables IPv4 FIB lookups, e.g. for reverse path filtering.
+ It also allows query of the FIB for the route type, e.g. local, unicast,
+ multicast or blackhole.
+endif # NF_TABLES_IPV4
-config IP_NF_CT_PROTO_SCTP
- tristate 'SCTP protocol connection tracking support (EXPERIMENTAL)'
- depends on IP_NF_CONNTRACK && EXPERIMENTAL
+config NF_TABLES_ARP
+ bool "ARP nf_tables support"
+ select NETFILTER_FAMILY_ARP
help
- With this option enabled, the connection tracking code will
- be able to do state tracking on SCTP connections.
+ This option enables the ARP support for nf_tables.
- If you want to compile it as a module, say M here and read
- <file:Documentation/modules.txt>. If unsure, say `N'.
+endif # NF_TABLES
-config IP_NF_FTP
- tristate "FTP protocol support"
- depends on IP_NF_CONNTRACK
- help
- Tracking FTP connections is problematic: special helpers are
- required for tracking them, and doing masquerading and other forms
- of Network Address Translation on them.
-
- To compile it as a module, choose M here. If unsure, say Y.
-
-config IP_NF_IRC
- tristate "IRC protocol support"
- depends on IP_NF_CONNTRACK
- ---help---
- There is a commonly-used extension to IRC called
- Direct Client-to-Client Protocol (DCC). This enables users to send
- files to each other, and also chat to each other without the need
- of a server. DCC Sending is used anywhere you send files over IRC,
- and DCC Chat is most commonly used by Eggdrop bots. If you are
- using NAT, this extension will enable you to send files and initiate
- chats. Note that you do NOT need this extension to get files or
- have others initiate chats, or everything else in IRC.
-
- To compile it as a module, choose M here. If unsure, say Y.
-
-config IP_NF_NETBIOS_NS
- tristate "NetBIOS name service protocol support (EXPERIMENTAL)"
- depends on IP_NF_CONNTRACK && EXPERIMENTAL
+config NF_DUP_IPV4
+ tristate "Netfilter IPv4 packet duplication to alternate destination"
+ depends on !NF_CONNTRACK || NF_CONNTRACK
help
- NetBIOS name service requests are sent as broadcast messages from an
- unprivileged port and responded to with unicast messages to the
- same port. This make them hard to firewall properly because connection
- tracking doesn't deal with broadcasts. This helper tracks locally
- originating NetBIOS name service requests and the corresponding
- responses. It relies on correct IP address configuration, specifically
- netmask and broadcast address. When properly configured, the output
- of "ip address show" should look similar to this:
-
- $ ip -4 address show eth0
- 4: eth0: <BROADCAST,MULTICAST,UP> mtu 1500 qdisc pfifo_fast qlen 1000
- inet 172.16.2.252/24 brd 172.16.2.255 scope global eth0
-
- To compile it as a module, choose M here. If unsure, say N.
+ This option enables the nf_dup_ipv4 core, which duplicates an IPv4
+ packet to be rerouted to another destination.
-config IP_NF_TFTP
- tristate "TFTP protocol support"
- depends on IP_NF_CONNTRACK
- help
- TFTP connection tracking helper, this is required depending
- on how restrictive your ruleset is.
- If you are using a tftp client behind -j SNAT or -j MASQUERADING
- you will need this.
-
- To compile it as a module, choose M here. If unsure, say Y.
-
-config IP_NF_AMANDA
- tristate "Amanda backup protocol support"
- depends on IP_NF_CONNTRACK
- select TEXTSEARCH
- select TEXTSEARCH_KMP
+config NF_LOG_ARP
+ tristate "ARP packet logging"
+ default m if NETFILTER_ADVANCED=n
+ select NF_LOG_SYSLOG
help
- If you are running the Amanda backup package <http://www.amanda.org/>
- on this machine or machines that will be MASQUERADED through this
- machine, then you may want to enable this feature. This allows the
- connection tracking and natting code to allow the sub-channels that
- Amanda requires for communication of the backup data, messages and
- index.
-
- To compile it as a module, choose M here. If unsure, say Y.
-
-config IP_NF_PPTP
- tristate 'PPTP protocol support'
- depends on IP_NF_CONNTRACK
- help
- This module adds support for PPTP (Point to Point Tunnelling
- Protocol, RFC2637) connection tracking and NAT.
-
- If you are running PPTP sessions over a stateful firewall or NAT
- box, you may want to enable this feature.
-
- Please note that not all PPTP modes of operation are supported yet.
- For more info, read top of the file
- net/ipv4/netfilter/ip_conntrack_pptp.c
-
- If you want to compile it as a module, say M here and read
- Documentation/modules.txt. If unsure, say `N'.
+ This is a backwards-compat option for the user's convenience
+ (e.g. when running oldconfig). It selects CONFIG_NF_LOG_SYSLOG.
-config IP_NF_H323
- tristate 'H.323 protocol support (EXPERIMENTAL)'
- depends on IP_NF_CONNTRACK && EXPERIMENTAL
+config NF_LOG_IPV4
+ tristate "IPv4 packet logging"
+ default m if NETFILTER_ADVANCED=n
+ select NF_LOG_SYSLOG
help
- H.323 is a VoIP signalling protocol from ITU-T. As one of the most
- important VoIP protocols, it is widely used by voice hardware and
- software including voice gateways, IP phones, Netmeeting, OpenPhone,
- Gnomemeeting, etc.
+ This is a backwards-compat option for the user's convenience
+ (e.g. when running oldconfig). It selects CONFIG_NF_LOG_SYSLOG.
- With this module you can support H.323 on a connection tracking/NAT
- firewall.
+config NF_REJECT_IPV4
+ tristate "IPv4 packet rejection"
+ default m if NETFILTER_ADVANCED=n
- This module supports RAS, Fast Start, H.245 Tunnelling, Call
- Forwarding, RTP/RTCP and T.120 based audio, video, fax, chat,
- whiteboard, file transfer, etc. For more information, please
- visit http://nath323.sourceforge.net/.
+if NF_NAT
+config NF_NAT_SNMP_BASIC
+ tristate "Basic SNMP-ALG support"
+ depends on NF_CONNTRACK_SNMP
+ depends on NETFILTER_ADVANCED
+ default NF_NAT && NF_CONNTRACK_SNMP
+ select ASN1
+ help
- If you want to compile it as a module, say 'M' here and read
- Documentation/modules.txt. If unsure, say 'N'.
+ This module implements an Application Layer Gateway (ALG) for
+ SNMP payloads. In conjunction with NAT, it allows a network
+ management system to access multiple private networks with
+ conflicting addresses. It works by modifying IP addresses
+ inside SNMP payloads to match IP-layer NAT mapping.
-config IP_NF_SIP
- tristate "SIP protocol support (EXPERIMENTAL)"
- depends on IP_NF_CONNTRACK && EXPERIMENTAL
- help
- SIP is an application-layer control protocol that can establish,
- modify, and terminate multimedia sessions (conferences) such as
- Internet telephony calls. With the ip_conntrack_sip and
- the ip_nat_sip modules you can support the protocol on a connection
- tracking/NATing firewall.
+ This is the "basic" form of SNMP-ALG, as described in RFC 2962
- To compile it as a module, choose M here. If unsure, say Y.
+ To compile it as a module, choose M here. If unsure, say N.
-config IP_NF_QUEUE
- tristate "IP Userspace queueing via NETLINK (OBSOLETE)"
- help
- Netfilter has the ability to queue packets to user space: the
- netlink device can be used to access them using this driver.
+config NF_NAT_PPTP
+ tristate
+ depends on NF_CONNTRACK
+ default NF_CONNTRACK_PPTP
- This option enables the old IPv4-only "ip_queue" implementation
- which has been obsoleted by the new "nfnetlink_queue" code (see
- CONFIG_NETFILTER_NETLINK_QUEUE).
+config NF_NAT_H323
+ tristate
+ depends on NF_CONNTRACK
+ default NF_CONNTRACK_H323
- To compile it as a module, choose M here. If unsure, say N.
+endif # NF_NAT
config IP_NF_IPTABLES
tristate "IP tables support (required for filtering/masq/NAT)"
- depends on NETFILTER_XTABLES
+ default m if NETFILTER_ADVANCED=n
+ select NETFILTER_XTABLES
help
iptables is a general, extensible packet identification framework.
The packet filtering and full NAT (masquerading, port forwarding,
@@ -249,87 +139,52 @@ config IP_NF_IPTABLES
To compile it as a module, choose M here. If unsure, say N.
-# The matches.
-config IP_NF_MATCH_IPRANGE
- tristate "IP range match support"
- depends on IP_NF_IPTABLES
- help
- This option makes possible to match IP addresses against IP address
- ranges.
-
- To compile it as a module, choose M here. If unsure, say N.
-
-config IP_NF_MATCH_TOS
- tristate "TOS match support"
- depends on IP_NF_IPTABLES
- help
- TOS matching allows you to match packets based on the Type Of
- Service fields of the IP packet.
-
- To compile it as a module, choose M here. If unsure, say N.
-
-config IP_NF_MATCH_RECENT
- tristate "recent match support"
- depends on IP_NF_IPTABLES
- help
- This match is used for creating one or many lists of recently
- used addresses and then matching against that/those list(s).
-
- Short options are available by using 'iptables -m recent -h'
- Official Website: <http://snowman.net/projects/ipt_recent/>
-
- To compile it as a module, choose M here. If unsure, say N.
-
-config IP_NF_MATCH_ECN
- tristate "ECN match support"
- depends on IP_NF_IPTABLES
- help
- This option adds a `ECN' match, which allows you to match against
- the IPv4 and TCP header ECN fields.
-
- To compile it as a module, choose M here. If unsure, say N.
+if IP_NF_IPTABLES
+# The matches.
config IP_NF_MATCH_AH
- tristate "AH match support"
- depends on IP_NF_IPTABLES
+ tristate '"ah" match support'
+ depends on NETFILTER_ADVANCED
help
This match extension allows you to match a range of SPIs
inside AH header of IPSec packets.
To compile it as a module, choose M here. If unsure, say N.
-config IP_NF_MATCH_TTL
- tristate "TTL match support"
- depends on IP_NF_IPTABLES
+config IP_NF_MATCH_ECN
+ tristate '"ecn" match support'
+ depends on NETFILTER_ADVANCED
+ select NETFILTER_XT_MATCH_ECN
help
- This adds CONFIG_IP_NF_MATCH_TTL option, which enabled the user
- to match packets by their TTL value.
-
- To compile it as a module, choose M here. If unsure, say N.
+ This is a backwards-compat option for the user's convenience
+ (e.g. when running oldconfig). It selects
+ CONFIG_NETFILTER_XT_MATCH_ECN.
-config IP_NF_MATCH_OWNER
- tristate "Owner match support"
- depends on IP_NF_IPTABLES
+config IP_NF_MATCH_RPFILTER
+ tristate '"rpfilter" reverse path filter match support'
+ depends on NETFILTER_ADVANCED
+ depends on IP_NF_MANGLE || IP_NF_RAW || NFT_COMPAT
help
- Packet owner matching allows you to match locally-generated packets
- based on who created them: the user, group, process or session.
+ This option allows you to match packets whose replies would
+ go out via the interface the packet came in.
To compile it as a module, choose M here. If unsure, say N.
+ The module will be called ipt_rpfilter.
-config IP_NF_MATCH_ADDRTYPE
- tristate 'address type match support'
- depends on IP_NF_IPTABLES
+config IP_NF_MATCH_TTL
+ tristate '"ttl" match support'
+ depends on NETFILTER_ADVANCED
+ select NETFILTER_XT_MATCH_HL
help
- This option allows you to match what routing thinks of an address,
- eg. UNICAST, LOCAL, BROADCAST, ...
-
- If you want to compile it as a module, say M here and read
- <file:Documentation/modules.txt>. If unsure, say `N'.
+ This is a backwards-compat option for the user's convenience
+ (e.g. when running oldconfig). It selects
+ CONFIG_NETFILTER_XT_MATCH_HL.
# `filter', generic and specific targets
config IP_NF_FILTER
tristate "Packet filtering"
- depends on IP_NF_IPTABLES
+ default m if NETFILTER_ADVANCED=n || IP_NF_IPTABLES_LEGACY
+ depends on IP_NF_IPTABLES_LEGACY
help
Packet filtering defines a table `filter', which has a series of
rules for simple packet filtering at local input, forwarding and
@@ -339,7 +194,9 @@ config IP_NF_FILTER
config IP_NF_TARGET_REJECT
tristate "REJECT target support"
- depends on IP_NF_FILTER
+ depends on IP_NF_FILTER || NFT_COMPAT
+ select NF_REJECT_IPV4
+ default m if NETFILTER_ADVANCED=n
help
The REJECT target allows a filtering rule to specify that an ICMP
error should be issued in response to an incoming packet, rather
@@ -347,182 +204,68 @@ config IP_NF_TARGET_REJECT
To compile it as a module, choose M here. If unsure, say N.
-config IP_NF_TARGET_LOG
- tristate "LOG target support"
- depends on IP_NF_IPTABLES
+config IP_NF_TARGET_SYNPROXY
+ tristate "SYNPROXY target support"
+ depends on NF_CONNTRACK && NETFILTER_ADVANCED
+ select NETFILTER_SYNPROXY
+ select SYN_COOKIES
help
- This option adds a `LOG' target, which allows you to create rules in
- any iptables table which records the packet header to the syslog.
-
- To compile it as a module, choose M here. If unsure, say N.
-
-config IP_NF_TARGET_ULOG
- tristate "ULOG target support"
- depends on IP_NF_IPTABLES
- ---help---
-
- This option enables the old IPv4-only "ipt_ULOG" implementation
- which has been obsoleted by the new "nfnetlink_log" code (see
- CONFIG_NETFILTER_NETLINK_LOG).
-
- This option adds a `ULOG' target, which allows you to create rules in
- any iptables table. The packet is passed to a userspace logging
- daemon using netlink multicast sockets; unlike the LOG target
- which can only be viewed through syslog.
+ The SYNPROXY target allows you to intercept TCP connections and
+ establish them using syncookies before they are passed on to the
+ server. This allows to avoid conntrack and server resource usage
+ during SYN-flood attacks.
- The appropriate userspace logging daemon (ulogd) may be obtained from
- <http://www.gnumonks.org/projects/ulogd/>
+ To compile it as a module, choose M here. If unsure, say N.
- To compile it as a module, choose M here. If unsure, say N.
-
-config IP_NF_TARGET_TCPMSS
- tristate "TCPMSS target support"
- depends on IP_NF_IPTABLES
- ---help---
- This option adds a `TCPMSS' target, which allows you to alter the
- MSS value of TCP SYN packets, to control the maximum size for that
- connection (usually limiting it to your outgoing interface's MTU
- minus 40).
-
- This is used to overcome criminally braindead ISPs or servers which
- block ICMP Fragmentation Needed packets. The symptoms of this
- problem are that everything works fine from your Linux
- firewall/router, but machines behind it can never exchange large
- packets:
- 1) Web browsers connect, then hang with no data received.
- 2) Small mail works fine, but large emails hang.
- 3) ssh works fine, but scp hangs after initial handshaking.
-
- Workaround: activate this option and add a rule to your firewall
- configuration like:
-
- iptables -A FORWARD -p tcp --tcp-flags SYN,RST SYN \
- -j TCPMSS --clamp-mss-to-pmtu
-
- To compile it as a module, choose M here. If unsure, say N.
-
-# NAT + specific targets
+# NAT + specific targets: nf_conntrack
config IP_NF_NAT
- tristate "Full NAT"
- depends on IP_NF_IPTABLES && IP_NF_CONNTRACK
+ tristate "iptables NAT support"
+ depends on NF_CONNTRACK
+ depends on IP_NF_IPTABLES_LEGACY
+ default m if NETFILTER_ADVANCED=n
+ select NF_NAT
+ select NETFILTER_XT_NAT
help
- The Full NAT option allows masquerading, port forwarding and other
- forms of full Network Address Port Translation. It is controlled by
- the `nat' table in iptables: see the man page for iptables(8).
+ This enables the `nat' table in iptables. This allows masquerading,
+ port forwarding and other forms of full Network Address Port
+ Translation.
To compile it as a module, choose M here. If unsure, say N.
-config IP_NF_NAT_NEEDED
- bool
- depends on IP_NF_NAT != n
- default y
+if IP_NF_NAT
config IP_NF_TARGET_MASQUERADE
tristate "MASQUERADE target support"
- depends on IP_NF_NAT
- help
- Masquerading is a special case of NAT: all outgoing connections are
- changed to seem to come from a particular interface's address, and
- if the interface goes down, those connections are lost. This is
- only useful for dialup accounts with dynamic IP address (ie. your IP
- address will be different on next dialup).
-
- To compile it as a module, choose M here. If unsure, say N.
-
-config IP_NF_TARGET_REDIRECT
- tristate "REDIRECT target support"
- depends on IP_NF_NAT
+ select NETFILTER_XT_TARGET_MASQUERADE
help
- REDIRECT is a special case of NAT: all incoming connections are
- mapped onto the incoming interface's address, causing the packets to
- come to the local machine instead of passing through. This is
- useful for transparent proxies.
-
- To compile it as a module, choose M here. If unsure, say N.
+ This is a backwards-compat option for the user's convenience
+ (e.g. when running oldconfig). It selects NETFILTER_XT_TARGET_MASQUERADE.
config IP_NF_TARGET_NETMAP
tristate "NETMAP target support"
- depends on IP_NF_NAT
+ depends on NETFILTER_ADVANCED
+ select NETFILTER_XT_TARGET_NETMAP
help
- NETMAP is an implementation of static 1:1 NAT mapping of network
- addresses. It maps the network address part, while keeping the host
- address part intact. It is similar to Fast NAT, except that
- Netfilter's connection tracking doesn't work well with Fast NAT.
-
- To compile it as a module, choose M here. If unsure, say N.
+ This is a backwards-compat option for the user's convenience
+ (e.g. when running oldconfig). It selects
+ CONFIG_NETFILTER_XT_TARGET_NETMAP.
-config IP_NF_TARGET_SAME
- tristate "SAME target support"
- depends on IP_NF_NAT
+config IP_NF_TARGET_REDIRECT
+ tristate "REDIRECT target support"
+ depends on NETFILTER_ADVANCED
+ select NETFILTER_XT_TARGET_REDIRECT
help
- This option adds a `SAME' target, which works like the standard SNAT
- target, but attempts to give clients the same IP for all connections.
-
- To compile it as a module, choose M here. If unsure, say N.
-
-config IP_NF_NAT_SNMP_BASIC
- tristate "Basic SNMP-ALG support (EXPERIMENTAL)"
- depends on EXPERIMENTAL && IP_NF_NAT
- ---help---
-
- This module implements an Application Layer Gateway (ALG) for
- SNMP payloads. In conjunction with NAT, it allows a network
- management system to access multiple private networks with
- conflicting addresses. It works by modifying IP addresses
- inside SNMP payloads to match IP-layer NAT mapping.
-
- This is the "basic" form of SNMP-ALG, as described in RFC 2962
-
- To compile it as a module, choose M here. If unsure, say N.
-
-config IP_NF_NAT_IRC
- tristate
- depends on IP_NF_IPTABLES!=n && IP_NF_CONNTRACK!=n && IP_NF_NAT!=n
- default IP_NF_NAT if IP_NF_IRC=y
- default m if IP_NF_IRC=m
-
-# If they want FTP, set to $CONFIG_IP_NF_NAT (m or y),
-# or $CONFIG_IP_NF_FTP (m or y), whichever is weaker. Argh.
-config IP_NF_NAT_FTP
- tristate
- depends on IP_NF_IPTABLES!=n && IP_NF_CONNTRACK!=n && IP_NF_NAT!=n
- default IP_NF_NAT if IP_NF_FTP=y
- default m if IP_NF_FTP=m
-
-config IP_NF_NAT_TFTP
- tristate
- depends on IP_NF_IPTABLES!=n && IP_NF_CONNTRACK!=n && IP_NF_NAT!=n
- default IP_NF_NAT if IP_NF_TFTP=y
- default m if IP_NF_TFTP=m
-
-config IP_NF_NAT_AMANDA
- tristate
- depends on IP_NF_IPTABLES!=n && IP_NF_CONNTRACK!=n && IP_NF_NAT!=n
- default IP_NF_NAT if IP_NF_AMANDA=y
- default m if IP_NF_AMANDA=m
-
-config IP_NF_NAT_PPTP
- tristate
- depends on IP_NF_NAT!=n && IP_NF_PPTP!=n
- default IP_NF_NAT if IP_NF_PPTP=y
- default m if IP_NF_PPTP=m
-
-config IP_NF_NAT_H323
- tristate
- depends on IP_NF_IPTABLES!=n && IP_NF_CONNTRACK!=n && IP_NF_NAT!=n
- default IP_NF_NAT if IP_NF_H323=y
- default m if IP_NF_H323=m
+ This is a backwards-compat option for the user's convenience
+ (e.g. when running oldconfig). It selects
+ CONFIG_NETFILTER_XT_TARGET_REDIRECT.
-config IP_NF_NAT_SIP
- tristate
- depends on IP_NF_IPTABLES!=n && IP_NF_CONNTRACK!=n && IP_NF_NAT!=n
- default IP_NF_NAT if IP_NF_SIP=y
- default m if IP_NF_SIP=m
+endif # IP_NF_NAT
# mangle + specific targets
config IP_NF_MANGLE
tristate "Packet mangling"
- depends on IP_NF_IPTABLES
+ default m if NETFILTER_ADVANCED=n || IP_NF_IPTABLES_LEGACY
+ depends on IP_NF_IPTABLES_LEGACY
help
This option adds a `mangle' table to iptables: see the man page for
iptables(8). This table is used for various packet alterations
@@ -530,22 +273,13 @@ config IP_NF_MANGLE
To compile it as a module, choose M here. If unsure, say N.
-config IP_NF_TARGET_TOS
- tristate "TOS target support"
- depends on IP_NF_MANGLE
- help
- This option adds a `TOS' target, which allows you to create rules in
- the `mangle' table which alter the Type Of Service field of an IP
- packet prior to routing.
-
- To compile it as a module, choose M here. If unsure, say N.
-
config IP_NF_TARGET_ECN
tristate "ECN target support"
- depends on IP_NF_MANGLE
- ---help---
+ depends on IP_NF_MANGLE || NFT_COMPAT
+ depends on NETFILTER_ADVANCED
+ help
This option adds a `ECN' target, which can be used in the iptables mangle
- table.
+ table.
You can use this target to remove the ECN bits from the IPv4 header of
an IP packet. This is particularly useful, if you need to work around
@@ -555,70 +289,80 @@ config IP_NF_TARGET_ECN
To compile it as a module, choose M here. If unsure, say N.
config IP_NF_TARGET_TTL
- tristate 'TTL target support'
- depends on IP_NF_MANGLE
- help
- This option adds a `TTL' target, which enables the user to modify
- the TTL value of the IP header.
-
- While it is safe to decrement/lower the TTL, this target also enables
- functionality to increment and set the TTL value of the IP header to
- arbitrary values. This is EXTREMELY DANGEROUS since you can easily
- create immortal packets that loop forever on the network.
-
- To compile it as a module, choose M here. If unsure, say N.
-
-config IP_NF_TARGET_CLUSTERIP
- tristate "CLUSTERIP target support (EXPERIMENTAL)"
- depends on IP_NF_MANGLE && EXPERIMENTAL
- depends on (IP_NF_CONNTRACK && IP_NF_CONNTRACK_MARK) || (NF_CONNTRACK_MARK && NF_CONNTRACK_IPV4)
+ tristate '"TTL" target support'
+ depends on NETFILTER_ADVANCED && IP_NF_MANGLE
+ select NETFILTER_XT_TARGET_HL
help
- The CLUSTERIP target allows you to build load-balancing clusters of
- network servers without having a dedicated load-balancing
- router/server/switch.
-
- To compile it as a module, choose M here. If unsure, say N.
+ This is a backwards-compatible option for the user's convenience
+ (e.g. when running oldconfig). It selects
+ CONFIG_NETFILTER_XT_TARGET_HL.
# raw + specific targets
config IP_NF_RAW
tristate 'raw table support (required for NOTRACK/TRACE)'
- depends on IP_NF_IPTABLES
+ depends on IP_NF_IPTABLES_LEGACY
help
This option adds a `raw' table to iptables. This table is the very
first in the netfilter framework and hooks in at the PREROUTING
and OUTPUT chains.
-
+
If you want to compile it as a module, say M here and read
- <file:Documentation/modules.txt>. If unsure, say `N'.
+ <file:Documentation/kbuild/modules.rst>. If unsure, say `N'.
+
+# security table for MAC policy
+config IP_NF_SECURITY
+ tristate "Security table"
+ depends on SECURITY
+ depends on NETFILTER_ADVANCED
+ depends on IP_NF_IPTABLES_LEGACY
+ help
+ This option adds a `security' table to iptables, for use
+ with Mandatory Access Control (MAC) policy.
+
+ If unsure, say N.
+
+endif # IP_NF_IPTABLES
# ARP tables
config IP_NF_ARPTABLES
- tristate "ARP tables support"
+ tristate "Legacy ARPTABLES support"
+ depends on NETFILTER_XTABLES_LEGACY
depends on NETFILTER_XTABLES
+ default n
help
- arptables is a general, extensible packet identification framework.
- The ARP packet filtering and mangling (manipulation)subsystems
- use this: say Y or M here if you want to use either of those.
+ arptables is a legacy packet classifier.
+ This is not needed if you are using arptables over nftables
+ (iptables-nft).
- To compile it as a module, choose M here. If unsure, say N.
+config NFT_COMPAT_ARP
+ tristate
+ depends on NF_TABLES_ARP && NFT_COMPAT
+ default m if NFT_COMPAT=m
+ default y if NFT_COMPAT=y
config IP_NF_ARPFILTER
- tristate "ARP packet filtering"
- depends on IP_NF_ARPTABLES
+ tristate "arptables-legacy packet filtering support"
+ select IP_NF_ARPTABLES
+ select NETFILTER_FAMILY_ARP
+ depends on NETFILTER_XTABLES_LEGACY
+ depends on NETFILTER_XTABLES
help
ARP packet filtering defines a table `filter', which has a series of
rules for simple ARP packet filtering at local input and
- local output. On a bridge, you can also specify filtering rules
- for forwarded ARP packets. See the man page for arptables(8).
+ local output. This is only needed for arptables-legacy(8).
+ Neither arptables-nft nor nftables need this to work.
To compile it as a module, choose M here. If unsure, say N.
config IP_NF_ARP_MANGLE
tristate "ARP payload mangling"
- depends on IP_NF_ARPTABLES
+ depends on IP_NF_ARPTABLES || NFT_COMPAT_ARP
help
Allows altering the ARP packet payload: source and destination
hardware and network addresses.
+ This option is needed by both arptables-legacy and arptables-nft.
+ It is not used by nftables.
+
endmenu
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 21359d83f0c7..85502d4dfbb4 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -1,81 +1,47 @@
+# SPDX-License-Identifier: GPL-2.0
#
# Makefile for the netfilter modules on top of IPv4.
#
-# objects for the standalone - connection tracking / NAT
-ip_conntrack-objs := ip_conntrack_standalone.o ip_conntrack_core.o ip_conntrack_proto_generic.o ip_conntrack_proto_tcp.o ip_conntrack_proto_udp.o ip_conntrack_proto_icmp.o
-ip_nat-objs := ip_nat_core.o ip_nat_helper.o ip_nat_proto_unknown.o ip_nat_proto_tcp.o ip_nat_proto_udp.o ip_nat_proto_icmp.o
-iptable_nat-objs := ip_nat_rule.o ip_nat_standalone.o
+# defrag
+obj-$(CONFIG_NF_DEFRAG_IPV4) += nf_defrag_ipv4.o
-ip_conntrack_pptp-objs := ip_conntrack_helper_pptp.o ip_conntrack_proto_gre.o
-ip_nat_pptp-objs := ip_nat_helper_pptp.o ip_nat_proto_gre.o
+obj-$(CONFIG_NF_SOCKET_IPV4) += nf_socket_ipv4.o
+obj-$(CONFIG_NF_TPROXY_IPV4) += nf_tproxy_ipv4.o
-ip_conntrack_h323-objs := ip_conntrack_helper_h323.o ip_conntrack_helper_h323_asn1.o
-ip_nat_h323-objs := ip_nat_helper_h323.o
+# reject
+obj-$(CONFIG_NF_REJECT_IPV4) += nf_reject_ipv4.o
-# connection tracking
-obj-$(CONFIG_IP_NF_CONNTRACK) += ip_conntrack.o
-obj-$(CONFIG_IP_NF_NAT) += ip_nat.o
+# NAT helpers (nf_conntrack)
+obj-$(CONFIG_NF_NAT_H323) += nf_nat_h323.o
+obj-$(CONFIG_NF_NAT_PPTP) += nf_nat_pptp.o
-# conntrack netlink interface
-obj-$(CONFIG_IP_NF_CONNTRACK_NETLINK) += ip_conntrack_netlink.o
+nf_nat_snmp_basic-y := nf_nat_snmp_basic.asn1.o nf_nat_snmp_basic_main.o
+$(obj)/nf_nat_snmp_basic_main.o: $(obj)/nf_nat_snmp_basic.asn1.h
+obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o
+obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o
+obj-$(CONFIG_NFT_FIB_IPV4) += nft_fib_ipv4.o
+obj-$(CONFIG_NFT_DUP_IPV4) += nft_dup_ipv4.o
-# SCTP protocol connection tracking
-obj-$(CONFIG_IP_NF_CT_PROTO_SCTP) += ip_conntrack_proto_sctp.o
-
-# connection tracking helpers
-obj-$(CONFIG_IP_NF_H323) += ip_conntrack_h323.o
-obj-$(CONFIG_IP_NF_PPTP) += ip_conntrack_pptp.o
-obj-$(CONFIG_IP_NF_AMANDA) += ip_conntrack_amanda.o
-obj-$(CONFIG_IP_NF_TFTP) += ip_conntrack_tftp.o
-obj-$(CONFIG_IP_NF_FTP) += ip_conntrack_ftp.o
-obj-$(CONFIG_IP_NF_IRC) += ip_conntrack_irc.o
-obj-$(CONFIG_IP_NF_SIP) += ip_conntrack_sip.o
-obj-$(CONFIG_IP_NF_NETBIOS_NS) += ip_conntrack_netbios_ns.o
-
-# NAT helpers
-obj-$(CONFIG_IP_NF_NAT_H323) += ip_nat_h323.o
-obj-$(CONFIG_IP_NF_NAT_PPTP) += ip_nat_pptp.o
-obj-$(CONFIG_IP_NF_NAT_AMANDA) += ip_nat_amanda.o
-obj-$(CONFIG_IP_NF_NAT_TFTP) += ip_nat_tftp.o
-obj-$(CONFIG_IP_NF_NAT_FTP) += ip_nat_ftp.o
-obj-$(CONFIG_IP_NF_NAT_IRC) += ip_nat_irc.o
-obj-$(CONFIG_IP_NF_NAT_SIP) += ip_nat_sip.o
-
-# generic IP tables
-obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o
+# generic IP tables
+obj-$(CONFIG_IP_NF_IPTABLES_LEGACY) += ip_tables.o
# the three instances of ip_tables
obj-$(CONFIG_IP_NF_FILTER) += iptable_filter.o
obj-$(CONFIG_IP_NF_MANGLE) += iptable_mangle.o
obj-$(CONFIG_IP_NF_NAT) += iptable_nat.o
obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o
+obj-$(CONFIG_IP_NF_SECURITY) += iptable_security.o
# matches
-obj-$(CONFIG_IP_NF_MATCH_IPRANGE) += ipt_iprange.o
-obj-$(CONFIG_IP_NF_MATCH_OWNER) += ipt_owner.o
-obj-$(CONFIG_IP_NF_MATCH_TOS) += ipt_tos.o
-obj-$(CONFIG_IP_NF_MATCH_RECENT) += ipt_recent.o
-obj-$(CONFIG_IP_NF_MATCH_ECN) += ipt_ecn.o
obj-$(CONFIG_IP_NF_MATCH_AH) += ipt_ah.o
-obj-$(CONFIG_IP_NF_MATCH_TTL) += ipt_ttl.o
-obj-$(CONFIG_IP_NF_MATCH_ADDRTYPE) += ipt_addrtype.o
+obj-$(CONFIG_IP_NF_MATCH_RPFILTER) += ipt_rpfilter.o
# targets
-obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o
-obj-$(CONFIG_IP_NF_TARGET_TOS) += ipt_TOS.o
obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o
-obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o
-obj-$(CONFIG_IP_NF_TARGET_REDIRECT) += ipt_REDIRECT.o
-obj-$(CONFIG_IP_NF_TARGET_NETMAP) += ipt_NETMAP.o
-obj-$(CONFIG_IP_NF_TARGET_SAME) += ipt_SAME.o
-obj-$(CONFIG_IP_NF_NAT_SNMP_BASIC) += ip_nat_snmp_basic.o
-obj-$(CONFIG_IP_NF_TARGET_LOG) += ipt_LOG.o
-obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o
-obj-$(CONFIG_IP_NF_TARGET_TCPMSS) += ipt_TCPMSS.o
-obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o
-obj-$(CONFIG_IP_NF_TARGET_TTL) += ipt_TTL.o
+obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o
+obj-$(CONFIG_IP_NF_TARGET_SYNPROXY) += ipt_SYNPROXY.o
# generic ARP tables
obj-$(CONFIG_IP_NF_ARPTABLES) += arp_tables.o
@@ -84,15 +50,4 @@ obj-$(CONFIG_IP_NF_ARP_MANGLE) += arpt_mangle.o
# just filtering instance of ARP tables for now
obj-$(CONFIG_IP_NF_ARPFILTER) += arptable_filter.o
-obj-$(CONFIG_IP_NF_QUEUE) += ip_queue.o
-
-# objects for l3 independent conntrack
-nf_conntrack_ipv4-objs := nf_conntrack_l3proto_ipv4.o nf_conntrack_proto_icmp.o
-ifeq ($(CONFIG_NF_CONNTRACK_PROC_COMPAT),y)
-ifeq ($(CONFIG_PROC_FS),y)
-nf_conntrack_ipv4-objs += nf_conntrack_l3proto_ipv4_compat.o
-endif
-endif
-
-# l3 independent conntrack
-obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o
+obj-$(CONFIG_NF_DUP_IPV4) += nf_dup_ipv4.o
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 413c2d0a1f3d..1cdd9c28ab2d 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Packet matching code for ARP packets.
*
@@ -6,9 +7,10 @@
* Some ARP specific bits are:
*
* Copyright (C) 2002 David S. Miller (davem@redhat.com)
+ * Copyright (C) 2006-2009 Patrick McHardy <kaber@trash.net>
*
*/
-
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
@@ -19,45 +21,28 @@
#include <linux/proc_fs.h>
#include <linux/module.h>
#include <linux/init.h>
-
-#include <asm/uaccess.h>
#include <linux/mutex.h>
+#include <linux/err.h>
+#include <net/compat.h>
+#include <net/sock.h>
+#include <linux/uaccess.h>
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter_arp/arp_tables.h>
+#include "../../netfilter/xt_repldata.h"
MODULE_LICENSE("GPL");
MODULE_AUTHOR("David S. Miller <davem@redhat.com>");
MODULE_DESCRIPTION("arptables core");
-/*#define DEBUG_ARP_TABLES*/
-/*#define DEBUG_ARP_TABLES_USER*/
-
-#ifdef DEBUG_ARP_TABLES
-#define dprintf(format, args...) printk(format , ## args)
-#else
-#define dprintf(format, args...)
-#endif
-
-#ifdef DEBUG_ARP_TABLES_USER
-#define duprintf(format, args...) printk(format , ## args)
-#else
-#define duprintf(format, args...)
-#endif
-
-#ifdef CONFIG_NETFILTER_DEBUG
-#define ARP_NF_ASSERT(x) \
-do { \
- if (!(x)) \
- printk("ARP_NF_ASSERT: %s:%s:%u\n", \
- __FUNCTION__, __FILE__, __LINE__); \
-} while(0)
-#else
-#define ARP_NF_ASSERT(x)
-#endif
+void *arpt_alloc_initial_table(const struct xt_table *info)
+{
+ return xt_alloc_initial_table(arpt, ARPT);
+}
+EXPORT_SYMBOL_GPL(arpt_alloc_initial_table);
static inline int arp_devaddr_compare(const struct arpt_devaddr_info *ap,
- char *hdr_addr, int len)
+ const char *hdr_addr, int len)
{
int i, ret;
@@ -68,7 +53,29 @@ static inline int arp_devaddr_compare(const struct arpt_devaddr_info *ap,
for (i = 0; i < len; i++)
ret |= (hdr_addr[i] ^ ap->addr[i]) & ap->mask[i];
- return (ret != 0);
+ return ret != 0;
+}
+
+/*
+ * Unfortunately, _b and _mask are not aligned to an int (or long int)
+ * Some arches dont care, unrolling the loop is a win on them.
+ * For other arches, we only have a 16bit alignement.
+ */
+static unsigned long ifname_compare(const char *_a, const char *_b, const char *_mask)
+{
+#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+ unsigned long ret = ifname_compare_aligned(_a, _b, _mask);
+#else
+ unsigned long ret = 0;
+ const u16 *a = (const u16 *)_a;
+ const u16 *b = (const u16 *)_b;
+ const u16 *mask = (const u16 *)_mask;
+ int i;
+
+ for (i = 0; i < IFNAMSIZ/sizeof(u16); i++)
+ ret |= (a[i] ^ b[i]) & mask[i];
+#endif
+ return ret;
}
/* Returns whether packet matches rule or not. */
@@ -78,44 +85,26 @@ static inline int arp_packet_match(const struct arphdr *arphdr,
const char *outdev,
const struct arpt_arp *arpinfo)
{
- char *arpptr = (char *)(arphdr + 1);
- char *src_devaddr, *tgt_devaddr;
+ const char *arpptr = (char *)(arphdr + 1);
+ const char *src_devaddr, *tgt_devaddr;
__be32 src_ipaddr, tgt_ipaddr;
- int i, ret;
+ long ret;
-#define FWINV(bool,invflg) ((bool) ^ !!(arpinfo->invflags & invflg))
-
- if (FWINV((arphdr->ar_op & arpinfo->arpop_mask) != arpinfo->arpop,
- ARPT_INV_ARPOP)) {
- dprintf("ARP operation field mismatch.\n");
- dprintf("ar_op: %04x info->arpop: %04x info->arpop_mask: %04x\n",
- arphdr->ar_op, arpinfo->arpop, arpinfo->arpop_mask);
+ if (NF_INVF(arpinfo, ARPT_INV_ARPOP,
+ (arphdr->ar_op & arpinfo->arpop_mask) != arpinfo->arpop))
return 0;
- }
- if (FWINV((arphdr->ar_hrd & arpinfo->arhrd_mask) != arpinfo->arhrd,
- ARPT_INV_ARPHRD)) {
- dprintf("ARP hardware address format mismatch.\n");
- dprintf("ar_hrd: %04x info->arhrd: %04x info->arhrd_mask: %04x\n",
- arphdr->ar_hrd, arpinfo->arhrd, arpinfo->arhrd_mask);
+ if (NF_INVF(arpinfo, ARPT_INV_ARPHRD,
+ (arphdr->ar_hrd & arpinfo->arhrd_mask) != arpinfo->arhrd))
return 0;
- }
- if (FWINV((arphdr->ar_pro & arpinfo->arpro_mask) != arpinfo->arpro,
- ARPT_INV_ARPPRO)) {
- dprintf("ARP protocol address format mismatch.\n");
- dprintf("ar_pro: %04x info->arpro: %04x info->arpro_mask: %04x\n",
- arphdr->ar_pro, arpinfo->arpro, arpinfo->arpro_mask);
+ if (NF_INVF(arpinfo, ARPT_INV_ARPPRO,
+ (arphdr->ar_pro & arpinfo->arpro_mask) != arpinfo->arpro))
return 0;
- }
- if (FWINV((arphdr->ar_hln & arpinfo->arhln_mask) != arpinfo->arhln,
- ARPT_INV_ARPHLN)) {
- dprintf("ARP hardware address length mismatch.\n");
- dprintf("ar_hln: %02x info->arhln: %02x info->arhln_mask: %02x\n",
- arphdr->ar_hln, arpinfo->arhln, arpinfo->arhln_mask);
+ if (NF_INVF(arpinfo, ARPT_INV_ARPHLN,
+ (arphdr->ar_hln & arpinfo->arhln_mask) != arpinfo->arhln))
return 0;
- }
src_devaddr = arpptr;
arpptr += dev->addr_len;
@@ -125,219 +114,194 @@ static inline int arp_packet_match(const struct arphdr *arphdr,
arpptr += dev->addr_len;
memcpy(&tgt_ipaddr, arpptr, sizeof(u32));
- if (FWINV(arp_devaddr_compare(&arpinfo->src_devaddr, src_devaddr, dev->addr_len),
- ARPT_INV_SRCDEVADDR) ||
- FWINV(arp_devaddr_compare(&arpinfo->tgt_devaddr, tgt_devaddr, dev->addr_len),
- ARPT_INV_TGTDEVADDR)) {
- dprintf("Source or target device address mismatch.\n");
-
+ if (NF_INVF(arpinfo, ARPT_INV_SRCDEVADDR,
+ arp_devaddr_compare(&arpinfo->src_devaddr, src_devaddr,
+ dev->addr_len)) ||
+ NF_INVF(arpinfo, ARPT_INV_TGTDEVADDR,
+ arp_devaddr_compare(&arpinfo->tgt_devaddr, tgt_devaddr,
+ dev->addr_len)))
return 0;
- }
- if (FWINV((src_ipaddr & arpinfo->smsk.s_addr) != arpinfo->src.s_addr,
- ARPT_INV_SRCIP) ||
- FWINV(((tgt_ipaddr & arpinfo->tmsk.s_addr) != arpinfo->tgt.s_addr),
- ARPT_INV_TGTIP)) {
- dprintf("Source or target IP address mismatch.\n");
-
- dprintf("SRC: %u.%u.%u.%u. Mask: %u.%u.%u.%u. Target: %u.%u.%u.%u.%s\n",
- NIPQUAD(src_ipaddr),
- NIPQUAD(arpinfo->smsk.s_addr),
- NIPQUAD(arpinfo->src.s_addr),
- arpinfo->invflags & ARPT_INV_SRCIP ? " (INV)" : "");
- dprintf("TGT: %u.%u.%u.%u Mask: %u.%u.%u.%u Target: %u.%u.%u.%u.%s\n",
- NIPQUAD(tgt_ipaddr),
- NIPQUAD(arpinfo->tmsk.s_addr),
- NIPQUAD(arpinfo->tgt.s_addr),
- arpinfo->invflags & ARPT_INV_TGTIP ? " (INV)" : "");
+ if (NF_INVF(arpinfo, ARPT_INV_SRCIP,
+ (src_ipaddr & arpinfo->smsk.s_addr) != arpinfo->src.s_addr) ||
+ NF_INVF(arpinfo, ARPT_INV_TGTIP,
+ (tgt_ipaddr & arpinfo->tmsk.s_addr) != arpinfo->tgt.s_addr))
return 0;
- }
/* Look for ifname matches. */
- for (i = 0, ret = 0; i < IFNAMSIZ; i++) {
- ret |= (indev[i] ^ arpinfo->iniface[i])
- & arpinfo->iniface_mask[i];
- }
+ ret = ifname_compare(indev, arpinfo->iniface, arpinfo->iniface_mask);
- if (FWINV(ret != 0, ARPT_INV_VIA_IN)) {
- dprintf("VIA in mismatch (%s vs %s).%s\n",
- indev, arpinfo->iniface,
- arpinfo->invflags&ARPT_INV_VIA_IN ?" (INV)":"");
+ if (NF_INVF(arpinfo, ARPT_INV_VIA_IN, ret != 0))
return 0;
- }
- for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned long); i++) {
- unsigned long odev;
- memcpy(&odev, outdev + i*sizeof(unsigned long),
- sizeof(unsigned long));
- ret |= (odev
- ^ ((const unsigned long *)arpinfo->outiface)[i])
- & ((const unsigned long *)arpinfo->outiface_mask)[i];
- }
+ ret = ifname_compare(outdev, arpinfo->outiface, arpinfo->outiface_mask);
- if (FWINV(ret != 0, ARPT_INV_VIA_OUT)) {
- dprintf("VIA out mismatch (%s vs %s).%s\n",
- outdev, arpinfo->outiface,
- arpinfo->invflags&ARPT_INV_VIA_OUT ?" (INV)":"");
+ if (NF_INVF(arpinfo, ARPT_INV_VIA_OUT, ret != 0))
return 0;
- }
return 1;
}
static inline int arp_checkentry(const struct arpt_arp *arp)
{
- if (arp->flags & ~ARPT_F_MASK) {
- duprintf("Unknown flag bits set: %08X\n",
- arp->flags & ~ARPT_F_MASK);
+ if (arp->flags & ~ARPT_F_MASK)
return 0;
- }
- if (arp->invflags & ~ARPT_INV_MASK) {
- duprintf("Unknown invflag bits set: %08X\n",
- arp->invflags & ~ARPT_INV_MASK);
+ if (arp->invflags & ~ARPT_INV_MASK)
return 0;
- }
return 1;
}
-static unsigned int arpt_error(struct sk_buff **pskb,
- const struct net_device *in,
- const struct net_device *out,
- unsigned int hooknum,
- const struct xt_target *target,
- const void *targinfo)
+static unsigned int
+arpt_error(struct sk_buff *skb, const struct xt_action_param *par)
{
- if (net_ratelimit())
- printk("arp_tables: error: '%s'\n", (char *)targinfo);
+ net_err_ratelimited("arp_tables: error: '%s'\n",
+ (const char *)par->targinfo);
return NF_DROP;
}
-static inline struct arpt_entry *get_entry(void *base, unsigned int offset)
+static inline const struct xt_entry_target *
+arpt_get_target_c(const struct arpt_entry *e)
+{
+ return arpt_get_target((struct arpt_entry *)e);
+}
+
+static inline struct arpt_entry *
+get_entry(const void *base, unsigned int offset)
{
return (struct arpt_entry *)(base + offset);
}
-unsigned int arpt_do_table(struct sk_buff **pskb,
- unsigned int hook,
- const struct net_device *in,
- const struct net_device *out,
- struct arpt_table *table)
+static inline
+struct arpt_entry *arpt_next_entry(const struct arpt_entry *entry)
{
- static const char nulldevname[IFNAMSIZ];
+ return (void *)entry + entry->next_offset;
+}
+
+unsigned int arpt_do_table(void *priv,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ const struct xt_table *table = priv;
+ unsigned int hook = state->hook;
+ static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long))));
unsigned int verdict = NF_DROP;
- struct arphdr *arp;
- int hotdrop = 0;
- struct arpt_entry *e, *back;
+ const struct arphdr *arp;
+ struct arpt_entry *e, **jumpstack;
const char *indev, *outdev;
- void *table_base;
- struct xt_table_info *private;
+ const void *table_base;
+ unsigned int cpu, stackidx = 0;
+ const struct xt_table_info *private;
+ struct xt_action_param acpar;
+ unsigned int addend;
- /* ARP header, plus 2 device addresses, plus 2 IP addresses. */
- if (!pskb_may_pull((*pskb), (sizeof(struct arphdr) +
- (2 * (*pskb)->dev->addr_len) +
- (2 * sizeof(u32)))))
+ if (!pskb_may_pull(skb, arp_hdr_len(skb->dev)))
return NF_DROP;
- indev = in ? in->name : nulldevname;
- outdev = out ? out->name : nulldevname;
+ indev = state->in ? state->in->name : nulldevname;
+ outdev = state->out ? state->out->name : nulldevname;
- read_lock_bh(&table->lock);
- private = table->private;
- table_base = (void *)private->entries[smp_processor_id()];
+ local_bh_disable();
+ addend = xt_write_recseq_begin();
+ private = READ_ONCE(table->private); /* Address dependency. */
+ cpu = smp_processor_id();
+ table_base = private->entries;
+ jumpstack = (struct arpt_entry **)private->jumpstack[cpu];
+
+ /* No TEE support for arptables, so no need to switch to alternate
+ * stack. All targets that reenter must return absolute verdicts.
+ */
e = get_entry(table_base, private->hook_entry[hook]);
- back = get_entry(table_base, private->underflow[hook]);
- arp = (*pskb)->nh.arph;
+ acpar.state = state;
+ acpar.hotdrop = false;
+
+ arp = arp_hdr(skb);
do {
- if (arp_packet_match(arp, (*pskb)->dev, indev, outdev, &e->arp)) {
- struct arpt_entry_target *t;
- int hdr_len;
-
- hdr_len = sizeof(*arp) + (2 * sizeof(struct in_addr)) +
- (2 * (*pskb)->dev->addr_len);
- ADD_COUNTER(e->counters, hdr_len, 1);
-
- t = arpt_get_target(e);
-
- /* Standard target? */
- if (!t->u.kernel.target->target) {
- int v;
-
- v = ((struct arpt_standard_target *)t)->verdict;
- if (v < 0) {
- /* Pop from stack? */
- if (v != ARPT_RETURN) {
- verdict = (unsigned)(-v) - 1;
- break;
- }
- e = back;
- back = get_entry(table_base,
- back->comefrom);
- continue;
+ const struct xt_entry_target *t;
+ struct xt_counters *counter;
+
+ if (!arp_packet_match(arp, skb->dev, indev, outdev, &e->arp)) {
+ e = arpt_next_entry(e);
+ continue;
+ }
+
+ counter = xt_get_this_cpu_counter(&e->counters);
+ ADD_COUNTER(*counter, arp_hdr_len(skb->dev), 1);
+
+ t = arpt_get_target_c(e);
+
+ /* Standard target? */
+ if (!t->u.kernel.target->target) {
+ int v;
+
+ v = ((struct xt_standard_target *)t)->verdict;
+ if (v < 0) {
+ /* Pop from stack? */
+ if (v != XT_RETURN) {
+ verdict = (unsigned int)(-v) - 1;
+ break;
}
- if (table_base + v
- != (void *)e + e->next_offset) {
- /* Save old back ptr in next entry */
- struct arpt_entry *next
- = (void *)e + e->next_offset;
- next->comefrom =
- (void *)back - table_base;
-
- /* set back pointer to next entry */
- back = next;
+ if (stackidx == 0) {
+ e = get_entry(table_base,
+ private->underflow[hook]);
+ } else {
+ e = jumpstack[--stackidx];
+ e = arpt_next_entry(e);
}
-
- e = get_entry(table_base, v);
- } else {
- /* Targets which reenter must return
- * abs. verdicts
- */
- verdict = t->u.kernel.target->target(pskb,
- in, out,
- hook,
- t->u.kernel.target,
- t->data);
-
- /* Target might have changed stuff. */
- arp = (*pskb)->nh.arph;
-
- if (verdict == ARPT_CONTINUE)
- e = (void *)e + e->next_offset;
- else
- /* Verdict */
+ continue;
+ }
+ if (table_base + v
+ != arpt_next_entry(e)) {
+ if (unlikely(stackidx >= private->stacksize)) {
+ verdict = NF_DROP;
break;
+ }
+ jumpstack[stackidx++] = e;
}
+
+ e = get_entry(table_base, v);
+ continue;
+ }
+
+ acpar.target = t->u.kernel.target;
+ acpar.targinfo = t->data;
+ verdict = t->u.kernel.target->target(skb, &acpar);
+
+ if (verdict == XT_CONTINUE) {
+ /* Target might have changed stuff. */
+ arp = arp_hdr(skb);
+ e = arpt_next_entry(e);
} else {
- e = (void *)e + e->next_offset;
+ /* Verdict */
+ break;
}
- } while (!hotdrop);
- read_unlock_bh(&table->lock);
+ } while (!acpar.hotdrop);
+ xt_write_recseq_end(addend);
+ local_bh_enable();
- if (hotdrop)
+ if (acpar.hotdrop)
return NF_DROP;
else
return verdict;
}
/* All zeroes == unconditional rule. */
-static inline int unconditional(const struct arpt_arp *arp)
+static inline bool unconditional(const struct arpt_entry *e)
{
- unsigned int i;
+ static const struct arpt_arp uncond;
- for (i = 0; i < sizeof(*arp)/sizeof(__u32); i++)
- if (((__u32 *)arp)[i])
- return 0;
-
- return 1;
+ return e->target_offset == sizeof(struct arpt_entry) &&
+ memcmp(&e->arp, &uncond, sizeof(uncond)) == 0;
}
/* Figures out from what hook each rule can be called: returns 0 if
* there are loops. Puts hook bitmask in comefrom.
*/
-static int mark_source_chains(struct xt_table_info *newinfo,
- unsigned int valid_hooks, void *entry0)
+static int mark_source_chains(const struct xt_table_info *newinfo,
+ unsigned int valid_hooks, void *entry0,
+ unsigned int *offsets)
{
unsigned int hook;
@@ -346,8 +310,7 @@ static int mark_source_chains(struct xt_table_info *newinfo,
*/
for (hook = 0; hook < NF_ARP_NUMHOOKS; hook++) {
unsigned int pos = newinfo->hook_entry[hook];
- struct arpt_entry *e
- = (struct arpt_entry *)(entry0 + pos);
+ struct arpt_entry *e = entry0 + pos;
if (!(valid_hooks & (1 << hook)))
continue;
@@ -356,23 +319,21 @@ static int mark_source_chains(struct xt_table_info *newinfo,
e->counters.pcnt = pos;
for (;;) {
- struct arpt_standard_target *t
- = (void *)arpt_get_target(e);
+ const struct xt_standard_target *t
+ = (void *)arpt_get_target_c(e);
+ int visited = e->comefrom & (1 << hook);
- if (e->comefrom & (1 << NF_ARP_NUMHOOKS)) {
- printk("arptables: loop hook %u pos %u %08X.\n",
- hook, pos, e->comefrom);
+ if (e->comefrom & (1 << NF_ARP_NUMHOOKS))
return 0;
- }
+
e->comefrom
|= ((1 << hook) | (1 << NF_ARP_NUMHOOKS));
/* Unconditional return/END. */
- if (e->target_offset == sizeof(struct arpt_entry)
- && (strcmp(t->target.u.user.name,
- ARPT_STANDARD_TARGET) == 0)
- && t->verdict < 0
- && unconditional(&e->arp)) {
+ if ((unconditional(e) &&
+ (strcmp(t->target.u.user.name,
+ XT_STANDARD_TARGET) == 0) &&
+ t->verdict < 0) || visited) {
unsigned int oldpos, size;
/* Return: backtrack through the last
@@ -388,201 +349,186 @@ static int mark_source_chains(struct xt_table_info *newinfo,
if (pos == oldpos)
goto next;
- e = (struct arpt_entry *)
- (entry0 + pos);
+ e = entry0 + pos;
} while (oldpos == pos + e->next_offset);
/* Move along one */
size = e->next_offset;
- e = (struct arpt_entry *)
- (entry0 + pos + size);
+ e = entry0 + pos + size;
+ if (pos + size >= newinfo->size)
+ return 0;
e->counters.pcnt = pos;
pos += size;
} else {
int newpos = t->verdict;
if (strcmp(t->target.u.user.name,
- ARPT_STANDARD_TARGET) == 0
- && newpos >= 0) {
+ XT_STANDARD_TARGET) == 0 &&
+ newpos >= 0) {
/* This a jump; chase it. */
- duprintf("Jump rule %u -> %u\n",
- pos, newpos);
+ if (!xt_find_jump_offset(offsets, newpos,
+ newinfo->number))
+ return 0;
} else {
/* ... this is a fallthru */
newpos = pos + e->next_offset;
+ if (newpos >= newinfo->size)
+ return 0;
}
- e = (struct arpt_entry *)
- (entry0 + newpos);
+ e = entry0 + newpos;
e->counters.pcnt = pos;
pos = newpos;
}
}
- next:
- duprintf("Finished chain %u\n", hook);
+next: ;
}
return 1;
}
-static inline int standard_check(const struct arpt_entry_target *t,
- unsigned int max_offset)
+static int check_target(struct arpt_entry *e, struct net *net, const char *name)
{
- struct arpt_standard_target *targ = (void *)t;
-
- /* Check standard info. */
- if (t->u.target_size
- != ARPT_ALIGN(sizeof(struct arpt_standard_target))) {
- duprintf("arpt_standard_check: target size %u != %Zu\n",
- t->u.target_size,
- ARPT_ALIGN(sizeof(struct arpt_standard_target)));
- return 0;
- }
-
- if (targ->verdict >= 0
- && targ->verdict > max_offset - sizeof(struct arpt_entry)) {
- duprintf("arpt_standard_check: bad verdict (%i)\n",
- targ->verdict);
- return 0;
- }
-
- if (targ->verdict < -NF_MAX_VERDICT - 1) {
- duprintf("arpt_standard_check: bad negative verdict (%i)\n",
- targ->verdict);
- return 0;
- }
- return 1;
+ struct xt_entry_target *t = arpt_get_target(e);
+ struct xt_tgchk_param par = {
+ .net = net,
+ .table = name,
+ .entryinfo = e,
+ .target = t->u.kernel.target,
+ .targinfo = t->data,
+ .hook_mask = e->comefrom,
+ .family = NFPROTO_ARP,
+ };
+
+ return xt_check_target(&par, t->u.target_size - sizeof(*t), 0, false);
}
-static struct arpt_target arpt_standard_target;
-
-static inline int check_entry(struct arpt_entry *e, const char *name, unsigned int size,
- unsigned int *i)
+static int
+find_check_entry(struct arpt_entry *e, struct net *net, const char *name,
+ unsigned int size,
+ struct xt_percpu_counter_alloc_state *alloc_state)
{
- struct arpt_entry_target *t;
- struct arpt_target *target;
+ struct xt_entry_target *t;
+ struct xt_target *target;
int ret;
- if (!arp_checkentry(&e->arp)) {
- duprintf("arp_tables: arp check failed %p %s.\n", e, name);
- return -EINVAL;
- }
-
- if (e->target_offset + sizeof(struct arpt_entry_target) > e->next_offset)
- return -EINVAL;
+ if (!xt_percpu_counter_alloc(alloc_state, &e->counters))
+ return -ENOMEM;
t = arpt_get_target(e);
- if (e->target_offset + t->u.target_size > e->next_offset)
- return -EINVAL;
-
- target = try_then_request_module(xt_find_target(NF_ARP, t->u.user.name,
- t->u.user.revision),
- "arpt_%s", t->u.user.name);
- if (IS_ERR(target) || !target) {
- duprintf("check_entry: `%s' not found\n", t->u.user.name);
- ret = target ? PTR_ERR(target) : -ENOENT;
+ target = xt_request_find_target(NFPROTO_ARP, t->u.user.name,
+ t->u.user.revision);
+ if (IS_ERR(target)) {
+ ret = PTR_ERR(target);
goto out;
}
t->u.kernel.target = target;
- ret = xt_check_target(target, NF_ARP, t->u.target_size - sizeof(*t),
- name, e->comefrom, 0, 0);
+ ret = check_target(e, net, name);
if (ret)
goto err;
-
- if (t->u.kernel.target == &arpt_standard_target) {
- if (!standard_check(t, size)) {
- ret = -EINVAL;
- goto err;
- }
- } else if (t->u.kernel.target->checkentry
- && !t->u.kernel.target->checkentry(name, e, target, t->data,
- e->comefrom)) {
- duprintf("arp_tables: check failed for `%s'.\n",
- t->u.kernel.target->name);
- ret = -EINVAL;
- goto err;
- }
-
- (*i)++;
return 0;
err:
module_put(t->u.kernel.target->me);
out:
+ xt_percpu_counter_free(&e->counters);
+
return ret;
}
+static bool check_underflow(const struct arpt_entry *e)
+{
+ const struct xt_entry_target *t;
+ unsigned int verdict;
+
+ if (!unconditional(e))
+ return false;
+ t = arpt_get_target_c(e);
+ if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0)
+ return false;
+ verdict = ((struct xt_standard_target *)t)->verdict;
+ verdict = -verdict - 1;
+ return verdict == NF_DROP || verdict == NF_ACCEPT;
+}
+
static inline int check_entry_size_and_hooks(struct arpt_entry *e,
struct xt_table_info *newinfo,
- unsigned char *base,
- unsigned char *limit,
+ const unsigned char *base,
+ const unsigned char *limit,
const unsigned int *hook_entries,
const unsigned int *underflows,
- unsigned int *i)
+ unsigned int valid_hooks)
{
unsigned int h;
+ int err;
- if ((unsigned long)e % __alignof__(struct arpt_entry) != 0
- || (unsigned char *)e + sizeof(struct arpt_entry) >= limit) {
- duprintf("Bad offset %p\n", e);
+ if ((unsigned long)e % __alignof__(struct arpt_entry) != 0 ||
+ (unsigned char *)e + sizeof(struct arpt_entry) >= limit ||
+ (unsigned char *)e + e->next_offset > limit)
return -EINVAL;
- }
if (e->next_offset
- < sizeof(struct arpt_entry) + sizeof(struct arpt_entry_target)) {
- duprintf("checking: element %p size %u\n",
- e, e->next_offset);
+ < sizeof(struct arpt_entry) + sizeof(struct xt_entry_target))
+ return -EINVAL;
+
+ if (!arp_checkentry(&e->arp))
return -EINVAL;
- }
+
+ err = xt_check_entry_offsets(e, e->elems, e->target_offset,
+ e->next_offset);
+ if (err)
+ return err;
/* Check hooks & underflows */
for (h = 0; h < NF_ARP_NUMHOOKS; h++) {
+ if (!(valid_hooks & (1 << h)))
+ continue;
if ((unsigned char *)e - base == hook_entries[h])
newinfo->hook_entry[h] = hook_entries[h];
- if ((unsigned char *)e - base == underflows[h])
+ if ((unsigned char *)e - base == underflows[h]) {
+ if (!check_underflow(e))
+ return -EINVAL;
+
newinfo->underflow[h] = underflows[h];
+ }
}
- /* FIXME: underflows must be unconditional, standard verdicts
- < 0 (not ARPT_RETURN). --RR */
-
/* Clear counters and comefrom */
e->counters = ((struct xt_counters) { 0, 0 });
e->comefrom = 0;
-
- (*i)++;
return 0;
}
-static inline int cleanup_entry(struct arpt_entry *e, unsigned int *i)
+static void cleanup_entry(struct arpt_entry *e, struct net *net)
{
- struct arpt_entry_target *t;
-
- if (i && (*i)-- == 0)
- return 1;
+ struct xt_tgdtor_param par;
+ struct xt_entry_target *t;
t = arpt_get_target(e);
- if (t->u.kernel.target->destroy)
- t->u.kernel.target->destroy(t->u.kernel.target, t->data);
- module_put(t->u.kernel.target->me);
- return 0;
+ par.net = net;
+ par.target = t->u.kernel.target;
+ par.targinfo = t->data;
+ par.family = NFPROTO_ARP;
+ if (par.target->destroy != NULL)
+ par.target->destroy(&par);
+ module_put(par.target->me);
+ xt_percpu_counter_free(&e->counters);
}
/* Checks and translates the user-supplied table segment (held in
* newinfo).
*/
-static int translate_table(const char *name,
- unsigned int valid_hooks,
+static int translate_table(struct net *net,
struct xt_table_info *newinfo,
void *entry0,
- unsigned int size,
- unsigned int number,
- const unsigned int *hook_entries,
- const unsigned int *underflows)
+ const struct arpt_replace *repl)
{
+ struct xt_percpu_counter_alloc_state alloc_state = { 0 };
+ struct arpt_entry *iter;
+ unsigned int *offsets;
unsigned int i;
- int ret;
+ int ret = 0;
- newinfo->size = size;
- newinfo->number = number;
+ newinfo->size = repl->size;
+ newinfo->number = repl->num_entries;
/* Init all hooks to impossible value. */
for (i = 0; i < NF_ARP_NUMHOOKS; i++) {
@@ -590,162 +536,164 @@ static int translate_table(const char *name,
newinfo->underflow[i] = 0xFFFFFFFF;
}
- duprintf("translate_table: size %u\n", newinfo->size);
+ offsets = xt_alloc_entry_offsets(newinfo->number);
+ if (!offsets)
+ return -ENOMEM;
i = 0;
/* Walk through entries, checking offsets. */
- ret = ARPT_ENTRY_ITERATE(entry0, newinfo->size,
- check_entry_size_and_hooks,
- newinfo,
- entry0,
- entry0 + size,
- hook_entries, underflows, &i);
- duprintf("translate_table: ARPT_ENTRY_ITERATE gives %d\n", ret);
- if (ret != 0)
- return ret;
-
- if (i != number) {
- duprintf("translate_table: %u not %u entries\n",
- i, number);
- return -EINVAL;
+ xt_entry_foreach(iter, entry0, newinfo->size) {
+ ret = check_entry_size_and_hooks(iter, newinfo, entry0,
+ entry0 + repl->size,
+ repl->hook_entry,
+ repl->underflow,
+ repl->valid_hooks);
+ if (ret != 0)
+ goto out_free;
+ if (i < repl->num_entries)
+ offsets[i] = (void *)iter - entry0;
+ ++i;
+ if (strcmp(arpt_get_target(iter)->u.user.name,
+ XT_ERROR_TARGET) == 0)
+ ++newinfo->stacksize;
}
- /* Check hooks all assigned */
- for (i = 0; i < NF_ARP_NUMHOOKS; i++) {
- /* Only hooks which are valid */
- if (!(valid_hooks & (1 << i)))
- continue;
- if (newinfo->hook_entry[i] == 0xFFFFFFFF) {
- duprintf("Invalid hook entry %u %u\n",
- i, hook_entries[i]);
- return -EINVAL;
- }
- if (newinfo->underflow[i] == 0xFFFFFFFF) {
- duprintf("Invalid underflow %u %u\n",
- i, underflows[i]);
- return -EINVAL;
- }
+ ret = -EINVAL;
+ if (i != repl->num_entries)
+ goto out_free;
+
+ ret = xt_check_table_hooks(newinfo, repl->valid_hooks);
+ if (ret)
+ goto out_free;
+
+ if (!mark_source_chains(newinfo, repl->valid_hooks, entry0, offsets)) {
+ ret = -ELOOP;
+ goto out_free;
}
+ kvfree(offsets);
/* Finally, each sanity check must pass */
i = 0;
- ret = ARPT_ENTRY_ITERATE(entry0, newinfo->size,
- check_entry, name, size, &i);
-
- if (ret != 0)
- goto cleanup;
-
- ret = -ELOOP;
- if (!mark_source_chains(newinfo, valid_hooks, entry0)) {
- duprintf("Looping hook\n");
- goto cleanup;
+ xt_entry_foreach(iter, entry0, newinfo->size) {
+ ret = find_check_entry(iter, net, repl->name, repl->size,
+ &alloc_state);
+ if (ret != 0)
+ break;
+ ++i;
}
- /* And one copy for every other CPU */
- for_each_possible_cpu(i) {
- if (newinfo->entries[i] && newinfo->entries[i] != entry0)
- memcpy(newinfo->entries[i], entry0, newinfo->size);
+ if (ret != 0) {
+ xt_entry_foreach(iter, entry0, newinfo->size) {
+ if (i-- == 0)
+ break;
+ cleanup_entry(iter, net);
+ }
+ return ret;
}
- return 0;
-cleanup:
- ARPT_ENTRY_ITERATE(entry0, newinfo->size, cleanup_entry, &i);
return ret;
-}
-
-/* Gets counters. */
-static inline int add_entry_to_counter(const struct arpt_entry *e,
- struct xt_counters total[],
- unsigned int *i)
-{
- ADD_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt);
-
- (*i)++;
- return 0;
-}
-
-static inline int set_entry_to_counter(const struct arpt_entry *e,
- struct xt_counters total[],
- unsigned int *i)
-{
- SET_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt);
-
- (*i)++;
- return 0;
+ out_free:
+ kvfree(offsets);
+ return ret;
}
static void get_counters(const struct xt_table_info *t,
struct xt_counters counters[])
{
+ struct arpt_entry *iter;
unsigned int cpu;
unsigned int i;
- unsigned int curcpu;
- /* Instead of clearing (by a previous call to memset())
- * the counters and using adds, we set the counters
- * with data used by 'current' CPU
- * We dont care about preemption here.
- */
- curcpu = raw_smp_processor_id();
+ for_each_possible_cpu(cpu) {
+ seqcount_t *s = &per_cpu(xt_recseq, cpu);
- i = 0;
- ARPT_ENTRY_ITERATE(t->entries[curcpu],
- t->size,
- set_entry_to_counter,
- counters,
- &i);
+ i = 0;
+ xt_entry_foreach(iter, t->entries, t->size) {
+ struct xt_counters *tmp;
+ u64 bcnt, pcnt;
+ unsigned int start;
+
+ tmp = xt_get_per_cpu_counter(&iter->counters, cpu);
+ do {
+ start = read_seqcount_begin(s);
+ bcnt = tmp->bcnt;
+ pcnt = tmp->pcnt;
+ } while (read_seqcount_retry(s, start));
+
+ ADD_COUNTER(counters[i], bcnt, pcnt);
+ ++i;
+ cond_resched();
+ }
+ }
+}
+
+static void get_old_counters(const struct xt_table_info *t,
+ struct xt_counters counters[])
+{
+ struct arpt_entry *iter;
+ unsigned int cpu, i;
for_each_possible_cpu(cpu) {
- if (cpu == curcpu)
- continue;
i = 0;
- ARPT_ENTRY_ITERATE(t->entries[cpu],
- t->size,
- add_entry_to_counter,
- counters,
- &i);
+ xt_entry_foreach(iter, t->entries, t->size) {
+ struct xt_counters *tmp;
+
+ tmp = xt_get_per_cpu_counter(&iter->counters, cpu);
+ ADD_COUNTER(counters[i], tmp->bcnt, tmp->pcnt);
+ ++i;
+ }
+ cond_resched();
}
}
-static int copy_entries_to_user(unsigned int total_size,
- struct arpt_table *table,
- void __user *userptr)
+static struct xt_counters *alloc_counters(const struct xt_table *table)
{
- unsigned int off, num, countersize;
- struct arpt_entry *e;
+ unsigned int countersize;
struct xt_counters *counters;
- struct xt_table_info *private = table->private;
- int ret = 0;
- void *loc_cpu_entry;
+ const struct xt_table_info *private = table->private;
/* We need atomic snapshot of counters: rest doesn't change
* (other than comefrom, which userspace doesn't care
* about).
*/
countersize = sizeof(struct xt_counters) * private->number;
- counters = vmalloc_node(countersize, numa_node_id());
+ counters = vzalloc(countersize);
if (counters == NULL)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
- /* First, sum counters... */
- write_lock_bh(&table->lock);
get_counters(private, counters);
- write_unlock_bh(&table->lock);
- loc_cpu_entry = private->entries[raw_smp_processor_id()];
- /* ... then copy entire thing ... */
- if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
- ret = -EFAULT;
- goto free_counters;
- }
+ return counters;
+}
+
+static int copy_entries_to_user(unsigned int total_size,
+ const struct xt_table *table,
+ void __user *userptr)
+{
+ unsigned int off, num;
+ const struct arpt_entry *e;
+ struct xt_counters *counters;
+ struct xt_table_info *private = table->private;
+ int ret = 0;
+ void *loc_cpu_entry;
+
+ counters = alloc_counters(table);
+ if (IS_ERR(counters))
+ return PTR_ERR(counters);
+
+ loc_cpu_entry = private->entries;
/* FIXME: use iterator macros --RR */
/* ... then go back and fix counters and names */
for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){
- struct arpt_entry_target *t;
+ const struct xt_entry_target *t;
- e = (struct arpt_entry *)(loc_cpu_entry + off);
+ e = loc_cpu_entry + off;
+ if (copy_to_user(userptr + off, e, sizeof(*e))) {
+ ret = -EFAULT;
+ goto free_counters;
+ }
if (copy_to_user(userptr + off
+ offsetof(struct arpt_entry, counters),
&counters[num],
@@ -754,12 +702,8 @@ static int copy_entries_to_user(unsigned int total_size,
goto free_counters;
}
- t = arpt_get_target(e);
- if (copy_to_user(userptr + off + e->target_offset
- + offsetof(struct arpt_entry_target,
- u.user.name),
- t->u.kernel.target->name,
- strlen(t->u.kernel.target->name)+1) != 0) {
+ t = arpt_get_target_c(e);
+ if (xt_target_to_user(t, userptr + off + e->target_offset)) {
ret = -EFAULT;
goto free_counters;
}
@@ -770,182 +714,314 @@ static int copy_entries_to_user(unsigned int total_size,
return ret;
}
-static int get_entries(const struct arpt_get_entries *entries,
- struct arpt_get_entries __user *uptr)
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+static void compat_standard_from_user(void *dst, const void *src)
+{
+ int v = *(compat_int_t *)src;
+
+ if (v > 0)
+ v += xt_compat_calc_jump(NFPROTO_ARP, v);
+ memcpy(dst, &v, sizeof(v));
+}
+
+static int compat_standard_to_user(void __user *dst, const void *src)
{
+ compat_int_t cv = *(int *)src;
+
+ if (cv > 0)
+ cv -= xt_compat_calc_jump(NFPROTO_ARP, cv);
+ return copy_to_user(dst, &cv, sizeof(cv)) ? -EFAULT : 0;
+}
+
+static int compat_calc_entry(const struct arpt_entry *e,
+ const struct xt_table_info *info,
+ const void *base, struct xt_table_info *newinfo)
+{
+ const struct xt_entry_target *t;
+ unsigned int entry_offset;
+ int off, i, ret;
+
+ off = sizeof(struct arpt_entry) - sizeof(struct compat_arpt_entry);
+ entry_offset = (void *)e - base;
+
+ t = arpt_get_target_c(e);
+ off += xt_compat_target_offset(t->u.kernel.target);
+ newinfo->size -= off;
+ ret = xt_compat_add_offset(NFPROTO_ARP, entry_offset, off);
+ if (ret)
+ return ret;
+
+ for (i = 0; i < NF_ARP_NUMHOOKS; i++) {
+ if (info->hook_entry[i] &&
+ (e < (struct arpt_entry *)(base + info->hook_entry[i])))
+ newinfo->hook_entry[i] -= off;
+ if (info->underflow[i] &&
+ (e < (struct arpt_entry *)(base + info->underflow[i])))
+ newinfo->underflow[i] -= off;
+ }
+ return 0;
+}
+
+static int compat_table_info(const struct xt_table_info *info,
+ struct xt_table_info *newinfo)
+{
+ struct arpt_entry *iter;
+ const void *loc_cpu_entry;
int ret;
- struct arpt_table *t;
-
- t = xt_find_table_lock(NF_ARP, entries->name);
- if (t && !IS_ERR(t)) {
- struct xt_table_info *private = t->private;
- duprintf("t->private->number = %u\n",
- private->number);
- if (entries->size == private->size)
- ret = copy_entries_to_user(private->size,
- t, uptr->entrytable);
- else {
- duprintf("get_entries: I've got %u not %u!\n",
- private->size, entries->size);
- ret = -EINVAL;
+
+ if (!newinfo || !info)
+ return -EINVAL;
+
+ /* we dont care about newinfo->entries */
+ memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
+ newinfo->initial_entries = 0;
+ loc_cpu_entry = info->entries;
+ ret = xt_compat_init_offsets(NFPROTO_ARP, info->number);
+ if (ret)
+ return ret;
+ xt_entry_foreach(iter, loc_cpu_entry, info->size) {
+ ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);
+ if (ret != 0)
+ return ret;
+ }
+ return 0;
+}
+#endif
+
+static int get_info(struct net *net, void __user *user, const int *len)
+{
+ char name[XT_TABLE_MAXNAMELEN];
+ struct xt_table *t;
+ int ret;
+
+ if (*len != sizeof(struct arpt_getinfo))
+ return -EINVAL;
+
+ if (copy_from_user(name, user, sizeof(name)) != 0)
+ return -EFAULT;
+
+ name[XT_TABLE_MAXNAMELEN-1] = '\0';
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+ if (in_compat_syscall())
+ xt_compat_lock(NFPROTO_ARP);
+#endif
+ t = xt_request_find_table_lock(net, NFPROTO_ARP, name);
+ if (!IS_ERR(t)) {
+ struct arpt_getinfo info;
+ const struct xt_table_info *private = t->private;
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+ struct xt_table_info tmp;
+
+ if (in_compat_syscall()) {
+ ret = compat_table_info(private, &tmp);
+ xt_compat_flush_offsets(NFPROTO_ARP);
+ private = &tmp;
}
- module_put(t->me);
+#endif
+ memset(&info, 0, sizeof(info));
+ info.valid_hooks = t->valid_hooks;
+ memcpy(info.hook_entry, private->hook_entry,
+ sizeof(info.hook_entry));
+ memcpy(info.underflow, private->underflow,
+ sizeof(info.underflow));
+ info.num_entries = private->number;
+ info.size = private->size;
+ strscpy(info.name, name);
+
+ if (copy_to_user(user, &info, *len) != 0)
+ ret = -EFAULT;
+ else
+ ret = 0;
xt_table_unlock(t);
+ module_put(t->me);
} else
- ret = t ? PTR_ERR(t) : -ENOENT;
-
+ ret = PTR_ERR(t);
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+ if (in_compat_syscall())
+ xt_compat_unlock(NFPROTO_ARP);
+#endif
return ret;
}
-static int do_replace(void __user *user, unsigned int len)
+static int get_entries(struct net *net, struct arpt_get_entries __user *uptr,
+ const int *len)
{
int ret;
- struct arpt_replace tmp;
- struct arpt_table *t;
- struct xt_table_info *newinfo, *oldinfo;
- struct xt_counters *counters;
- void *loc_cpu_entry, *loc_cpu_old_entry;
+ struct arpt_get_entries get;
+ struct xt_table *t;
- if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
+ if (*len < sizeof(get))
+ return -EINVAL;
+ if (copy_from_user(&get, uptr, sizeof(get)) != 0)
return -EFAULT;
+ if (*len != sizeof(struct arpt_get_entries) + get.size)
+ return -EINVAL;
- /* Hack: Causes ipchains to give correct error msg --RR */
- if (len != sizeof(tmp) + tmp.size)
- return -ENOPROTOOPT;
+ get.name[sizeof(get.name) - 1] = '\0';
- /* overflow check */
- if (tmp.size >= (INT_MAX - sizeof(struct xt_table_info)) / NR_CPUS -
- SMP_CACHE_BYTES)
- return -ENOMEM;
- if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
- return -ENOMEM;
+ t = xt_find_table_lock(net, NFPROTO_ARP, get.name);
+ if (!IS_ERR(t)) {
+ const struct xt_table_info *private = t->private;
- newinfo = xt_alloc_table_info(tmp.size);
- if (!newinfo)
- return -ENOMEM;
+ if (get.size == private->size)
+ ret = copy_entries_to_user(private->size,
+ t, uptr->entrytable);
+ else
+ ret = -EAGAIN;
- /* choose the copy that is on our node/cpu */
- loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
- if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
- tmp.size) != 0) {
- ret = -EFAULT;
- goto free_newinfo;
- }
+ module_put(t->me);
+ xt_table_unlock(t);
+ } else
+ ret = PTR_ERR(t);
+
+ return ret;
+}
- counters = vmalloc(tmp.num_counters * sizeof(struct xt_counters));
+static int __do_replace(struct net *net, const char *name,
+ unsigned int valid_hooks,
+ struct xt_table_info *newinfo,
+ unsigned int num_counters,
+ void __user *counters_ptr)
+{
+ int ret;
+ struct xt_table *t;
+ struct xt_table_info *oldinfo;
+ struct xt_counters *counters;
+ void *loc_cpu_old_entry;
+ struct arpt_entry *iter;
+
+ ret = 0;
+ counters = xt_counters_alloc(num_counters);
if (!counters) {
ret = -ENOMEM;
- goto free_newinfo;
+ goto out;
}
- ret = translate_table(tmp.name, tmp.valid_hooks,
- newinfo, loc_cpu_entry, tmp.size, tmp.num_entries,
- tmp.hook_entry, tmp.underflow);
- if (ret != 0)
- goto free_newinfo_counters;
-
- duprintf("arp_tables: Translated table\n");
-
- t = try_then_request_module(xt_find_table_lock(NF_ARP, tmp.name),
- "arptable_%s", tmp.name);
- if (!t || IS_ERR(t)) {
- ret = t ? PTR_ERR(t) : -ENOENT;
+ t = xt_request_find_table_lock(net, NFPROTO_ARP, name);
+ if (IS_ERR(t)) {
+ ret = PTR_ERR(t);
goto free_newinfo_counters_untrans;
}
/* You lied! */
- if (tmp.valid_hooks != t->valid_hooks) {
- duprintf("Valid hook crap: %08X vs %08X\n",
- tmp.valid_hooks, t->valid_hooks);
+ if (valid_hooks != t->valid_hooks) {
ret = -EINVAL;
goto put_module;
}
- oldinfo = xt_replace_table(t, tmp.num_counters, newinfo, &ret);
+ oldinfo = xt_replace_table(t, num_counters, newinfo, &ret);
if (!oldinfo)
goto put_module;
/* Update module usage count based on number of rules */
- duprintf("do_replace: oldnum=%u, initnum=%u, newnum=%u\n",
- oldinfo->number, oldinfo->initial_entries, newinfo->number);
- if ((oldinfo->number > oldinfo->initial_entries) ||
- (newinfo->number <= oldinfo->initial_entries))
+ if ((oldinfo->number > oldinfo->initial_entries) ||
+ (newinfo->number <= oldinfo->initial_entries))
module_put(t->me);
if ((oldinfo->number > oldinfo->initial_entries) &&
(newinfo->number <= oldinfo->initial_entries))
module_put(t->me);
- /* Get the old counters. */
- get_counters(oldinfo, counters);
+ xt_table_unlock(t);
+
+ get_old_counters(oldinfo, counters);
+
/* Decrease module usage counts and free resource */
- loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
- ARPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,NULL);
+ loc_cpu_old_entry = oldinfo->entries;
+ xt_entry_foreach(iter, loc_cpu_old_entry, oldinfo->size)
+ cleanup_entry(iter, net);
xt_free_table_info(oldinfo);
- if (copy_to_user(tmp.counters, counters,
- sizeof(struct xt_counters) * tmp.num_counters) != 0)
- ret = -EFAULT;
+ if (copy_to_user(counters_ptr, counters,
+ sizeof(struct xt_counters) * num_counters) != 0) {
+ /* Silent error, can't fail, new table is already in place */
+ net_warn_ratelimited("arptables: counters copy to user failed while replacing table\n");
+ }
vfree(counters);
- xt_table_unlock(t);
return ret;
put_module:
module_put(t->me);
xt_table_unlock(t);
free_newinfo_counters_untrans:
- ARPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, NULL);
- free_newinfo_counters:
vfree(counters);
- free_newinfo:
- xt_free_table_info(newinfo);
+ out:
return ret;
}
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK.
- */
-static inline int add_counter_to_entry(struct arpt_entry *e,
- const struct xt_counters addme[],
- unsigned int *i)
+static int do_replace(struct net *net, sockptr_t arg, unsigned int len)
{
-
- ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
- (*i)++;
- return 0;
-}
-
-static int do_add_counters(void __user *user, unsigned int len)
-{
- unsigned int i;
- struct xt_counters_info tmp, *paddc;
- struct arpt_table *t;
- struct xt_table_info *private;
- int ret = 0;
+ int ret;
+ struct arpt_replace tmp;
+ struct xt_table_info *newinfo;
void *loc_cpu_entry;
+ struct arpt_entry *iter;
- if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
+ if (len < sizeof(tmp))
+ return -EINVAL;
+ if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0)
return -EFAULT;
- if (len != sizeof(tmp) + tmp.num_counters*sizeof(struct xt_counters))
+ /* overflow check */
+ if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
+ return -ENOMEM;
+ if (tmp.num_counters == 0)
+ return -EINVAL;
+ if ((u64)len < (u64)tmp.size + sizeof(tmp))
return -EINVAL;
- paddc = vmalloc(len);
- if (!paddc)
+ tmp.name[sizeof(tmp.name)-1] = 0;
+
+ newinfo = xt_alloc_table_info(tmp.size);
+ if (!newinfo)
return -ENOMEM;
- if (copy_from_user(paddc, user, len) != 0) {
+ loc_cpu_entry = newinfo->entries;
+ if (copy_from_sockptr_offset(loc_cpu_entry, arg, sizeof(tmp),
+ tmp.size) != 0) {
ret = -EFAULT;
- goto free;
+ goto free_newinfo;
}
- t = xt_find_table_lock(NF_ARP, tmp.name);
- if (!t || IS_ERR(t)) {
- ret = t ? PTR_ERR(t) : -ENOENT;
+ ret = translate_table(net, newinfo, loc_cpu_entry, &tmp);
+ if (ret != 0)
+ goto free_newinfo;
+
+ ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo,
+ tmp.num_counters, tmp.counters);
+ if (ret)
+ goto free_newinfo_untrans;
+ return 0;
+
+ free_newinfo_untrans:
+ xt_entry_foreach(iter, loc_cpu_entry, newinfo->size)
+ cleanup_entry(iter, net);
+ free_newinfo:
+ xt_free_table_info(newinfo);
+ return ret;
+}
+
+static int do_add_counters(struct net *net, sockptr_t arg, unsigned int len)
+{
+ unsigned int i;
+ struct xt_counters_info tmp;
+ struct xt_counters *paddc;
+ struct xt_table *t;
+ const struct xt_table_info *private;
+ int ret = 0;
+ struct arpt_entry *iter;
+ unsigned int addend;
+
+ paddc = xt_copy_counters(arg, len, &tmp);
+ if (IS_ERR(paddc))
+ return PTR_ERR(paddc);
+
+ t = xt_find_table_lock(net, NFPROTO_ARP, tmp.name);
+ if (IS_ERR(t)) {
+ ret = PTR_ERR(t);
goto free;
}
- write_lock_bh(&t->lock);
+ local_bh_disable();
private = t->private;
if (private->number != tmp.num_counters) {
ret = -EINVAL;
@@ -953,15 +1029,18 @@ static int do_add_counters(void __user *user, unsigned int len)
}
i = 0;
- /* Choose the copy that is on our node */
- loc_cpu_entry = private->entries[smp_processor_id()];
- ARPT_ENTRY_ITERATE(loc_cpu_entry,
- private->size,
- add_counter_to_entry,
- paddc->counters,
- &i);
+
+ addend = xt_write_recseq_begin();
+ xt_entry_foreach(iter, private->entries, private->size) {
+ struct xt_counters *tmp;
+
+ tmp = xt_get_this_cpu_counter(&iter->counters);
+ ADD_COUNTER(*tmp, paddc[i].bcnt, paddc[i].pcnt);
+ ++i;
+ }
+ xt_write_recseq_end(addend);
unlock_up_free:
- write_unlock_bh(&t->lock);
+ local_bh_enable();
xt_table_unlock(t);
module_put(t->me);
free:
@@ -970,24 +1049,391 @@ static int do_add_counters(void __user *user, unsigned int len)
return ret;
}
-static int do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+struct compat_arpt_replace {
+ char name[XT_TABLE_MAXNAMELEN];
+ u32 valid_hooks;
+ u32 num_entries;
+ u32 size;
+ u32 hook_entry[NF_ARP_NUMHOOKS];
+ u32 underflow[NF_ARP_NUMHOOKS];
+ u32 num_counters;
+ compat_uptr_t counters;
+ struct compat_arpt_entry entries[];
+};
+
+static inline void compat_release_entry(struct compat_arpt_entry *e)
+{
+ struct xt_entry_target *t;
+
+ t = compat_arpt_get_target(e);
+ module_put(t->u.kernel.target->me);
+}
+
+static int
+check_compat_entry_size_and_hooks(struct compat_arpt_entry *e,
+ struct xt_table_info *newinfo,
+ unsigned int *size,
+ const unsigned char *base,
+ const unsigned char *limit)
+{
+ struct xt_entry_target *t;
+ struct xt_target *target;
+ unsigned int entry_offset;
+ int ret, off;
+
+ if ((unsigned long)e % __alignof__(struct compat_arpt_entry) != 0 ||
+ (unsigned char *)e + sizeof(struct compat_arpt_entry) >= limit ||
+ (unsigned char *)e + e->next_offset > limit)
+ return -EINVAL;
+
+ if (e->next_offset < sizeof(struct compat_arpt_entry) +
+ sizeof(struct compat_xt_entry_target))
+ return -EINVAL;
+
+ if (!arp_checkentry(&e->arp))
+ return -EINVAL;
+
+ ret = xt_compat_check_entry_offsets(e, e->elems, e->target_offset,
+ e->next_offset);
+ if (ret)
+ return ret;
+
+ off = sizeof(struct arpt_entry) - sizeof(struct compat_arpt_entry);
+ entry_offset = (void *)e - (void *)base;
+
+ t = compat_arpt_get_target(e);
+ target = xt_request_find_target(NFPROTO_ARP, t->u.user.name,
+ t->u.user.revision);
+ if (IS_ERR(target)) {
+ ret = PTR_ERR(target);
+ goto out;
+ }
+ t->u.kernel.target = target;
+
+ off += xt_compat_target_offset(target);
+ *size += off;
+ ret = xt_compat_add_offset(NFPROTO_ARP, entry_offset, off);
+ if (ret)
+ goto release_target;
+
+ return 0;
+
+release_target:
+ module_put(t->u.kernel.target->me);
+out:
+ return ret;
+}
+
+static void
+compat_copy_entry_from_user(struct compat_arpt_entry *e, void **dstptr,
+ unsigned int *size,
+ struct xt_table_info *newinfo, unsigned char *base)
+{
+ struct xt_entry_target *t;
+ struct arpt_entry *de;
+ unsigned int origsize;
+ int h;
+
+ origsize = *size;
+ de = *dstptr;
+ memcpy(de, e, sizeof(struct arpt_entry));
+ memcpy(&de->counters, &e->counters, sizeof(e->counters));
+
+ *dstptr += sizeof(struct arpt_entry);
+ *size += sizeof(struct arpt_entry) - sizeof(struct compat_arpt_entry);
+
+ de->target_offset = e->target_offset - (origsize - *size);
+ t = compat_arpt_get_target(e);
+ xt_compat_target_from_user(t, dstptr, size);
+
+ de->next_offset = e->next_offset - (origsize - *size);
+ for (h = 0; h < NF_ARP_NUMHOOKS; h++) {
+ if ((unsigned char *)de - base < newinfo->hook_entry[h])
+ newinfo->hook_entry[h] -= origsize - *size;
+ if ((unsigned char *)de - base < newinfo->underflow[h])
+ newinfo->underflow[h] -= origsize - *size;
+ }
+}
+
+static int translate_compat_table(struct net *net,
+ struct xt_table_info **pinfo,
+ void **pentry0,
+ const struct compat_arpt_replace *compatr)
+{
+ unsigned int i, j;
+ struct xt_table_info *newinfo, *info;
+ void *pos, *entry0, *entry1;
+ struct compat_arpt_entry *iter0;
+ struct arpt_replace repl;
+ unsigned int size;
+ int ret;
+
+ info = *pinfo;
+ entry0 = *pentry0;
+ size = compatr->size;
+ info->number = compatr->num_entries;
+
+ j = 0;
+ xt_compat_lock(NFPROTO_ARP);
+ ret = xt_compat_init_offsets(NFPROTO_ARP, compatr->num_entries);
+ if (ret)
+ goto out_unlock;
+ /* Walk through entries, checking offsets. */
+ xt_entry_foreach(iter0, entry0, compatr->size) {
+ ret = check_compat_entry_size_and_hooks(iter0, info, &size,
+ entry0,
+ entry0 + compatr->size);
+ if (ret != 0)
+ goto out_unlock;
+ ++j;
+ }
+
+ ret = -EINVAL;
+ if (j != compatr->num_entries)
+ goto out_unlock;
+
+ ret = -ENOMEM;
+ newinfo = xt_alloc_table_info(size);
+ if (!newinfo)
+ goto out_unlock;
+
+ memset(newinfo->entries, 0, size);
+
+ newinfo->number = compatr->num_entries;
+ for (i = 0; i < NF_ARP_NUMHOOKS; i++) {
+ newinfo->hook_entry[i] = compatr->hook_entry[i];
+ newinfo->underflow[i] = compatr->underflow[i];
+ }
+ entry1 = newinfo->entries;
+ pos = entry1;
+ size = compatr->size;
+ xt_entry_foreach(iter0, entry0, compatr->size)
+ compat_copy_entry_from_user(iter0, &pos, &size,
+ newinfo, entry1);
+
+ /* all module references in entry0 are now gone */
+
+ xt_compat_flush_offsets(NFPROTO_ARP);
+ xt_compat_unlock(NFPROTO_ARP);
+
+ memcpy(&repl, compatr, sizeof(*compatr));
+
+ for (i = 0; i < NF_ARP_NUMHOOKS; i++) {
+ repl.hook_entry[i] = newinfo->hook_entry[i];
+ repl.underflow[i] = newinfo->underflow[i];
+ }
+
+ repl.num_counters = 0;
+ repl.counters = NULL;
+ repl.size = newinfo->size;
+ ret = translate_table(net, newinfo, entry1, &repl);
+ if (ret)
+ goto free_newinfo;
+
+ *pinfo = newinfo;
+ *pentry0 = entry1;
+ xt_free_table_info(info);
+ return 0;
+
+free_newinfo:
+ xt_free_table_info(newinfo);
+ return ret;
+out_unlock:
+ xt_compat_flush_offsets(NFPROTO_ARP);
+ xt_compat_unlock(NFPROTO_ARP);
+ xt_entry_foreach(iter0, entry0, compatr->size) {
+ if (j-- == 0)
+ break;
+ compat_release_entry(iter0);
+ }
+ return ret;
+}
+
+static int compat_do_replace(struct net *net, sockptr_t arg, unsigned int len)
+{
+ int ret;
+ struct compat_arpt_replace tmp;
+ struct xt_table_info *newinfo;
+ void *loc_cpu_entry;
+ struct arpt_entry *iter;
+
+ if (len < sizeof(tmp))
+ return -EINVAL;
+ if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0)
+ return -EFAULT;
+
+ /* overflow check */
+ if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
+ return -ENOMEM;
+ if (tmp.num_counters == 0)
+ return -EINVAL;
+ if ((u64)len < (u64)tmp.size + sizeof(tmp))
+ return -EINVAL;
+
+ tmp.name[sizeof(tmp.name)-1] = 0;
+
+ newinfo = xt_alloc_table_info(tmp.size);
+ if (!newinfo)
+ return -ENOMEM;
+
+ loc_cpu_entry = newinfo->entries;
+ if (copy_from_sockptr_offset(loc_cpu_entry, arg, sizeof(tmp),
+ tmp.size) != 0) {
+ ret = -EFAULT;
+ goto free_newinfo;
+ }
+
+ ret = translate_compat_table(net, &newinfo, &loc_cpu_entry, &tmp);
+ if (ret != 0)
+ goto free_newinfo;
+
+ ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo,
+ tmp.num_counters, compat_ptr(tmp.counters));
+ if (ret)
+ goto free_newinfo_untrans;
+ return 0;
+
+ free_newinfo_untrans:
+ xt_entry_foreach(iter, loc_cpu_entry, newinfo->size)
+ cleanup_entry(iter, net);
+ free_newinfo:
+ xt_free_table_info(newinfo);
+ return ret;
+}
+
+static int compat_copy_entry_to_user(struct arpt_entry *e, void __user **dstptr,
+ compat_uint_t *size,
+ struct xt_counters *counters,
+ unsigned int i)
+{
+ struct xt_entry_target *t;
+ struct compat_arpt_entry __user *ce;
+ u_int16_t target_offset, next_offset;
+ compat_uint_t origsize;
+ int ret;
+
+ origsize = *size;
+ ce = *dstptr;
+ if (copy_to_user(ce, e, sizeof(struct arpt_entry)) != 0 ||
+ copy_to_user(&ce->counters, &counters[i],
+ sizeof(counters[i])) != 0)
+ return -EFAULT;
+
+ *dstptr += sizeof(struct compat_arpt_entry);
+ *size -= sizeof(struct arpt_entry) - sizeof(struct compat_arpt_entry);
+
+ target_offset = e->target_offset - (origsize - *size);
+
+ t = arpt_get_target(e);
+ ret = xt_compat_target_to_user(t, dstptr, size);
+ if (ret)
+ return ret;
+ next_offset = e->next_offset - (origsize - *size);
+ if (put_user(target_offset, &ce->target_offset) != 0 ||
+ put_user(next_offset, &ce->next_offset) != 0)
+ return -EFAULT;
+ return 0;
+}
+
+static int compat_copy_entries_to_user(unsigned int total_size,
+ struct xt_table *table,
+ void __user *userptr)
+{
+ struct xt_counters *counters;
+ const struct xt_table_info *private = table->private;
+ void __user *pos;
+ unsigned int size;
+ int ret = 0;
+ unsigned int i = 0;
+ struct arpt_entry *iter;
+
+ counters = alloc_counters(table);
+ if (IS_ERR(counters))
+ return PTR_ERR(counters);
+
+ pos = userptr;
+ size = total_size;
+ xt_entry_foreach(iter, private->entries, total_size) {
+ ret = compat_copy_entry_to_user(iter, &pos,
+ &size, counters, i++);
+ if (ret != 0)
+ break;
+ }
+ vfree(counters);
+ return ret;
+}
+
+struct compat_arpt_get_entries {
+ char name[XT_TABLE_MAXNAMELEN];
+ compat_uint_t size;
+ struct compat_arpt_entry entrytable[];
+};
+
+static int compat_get_entries(struct net *net,
+ struct compat_arpt_get_entries __user *uptr,
+ int *len)
{
int ret;
+ struct compat_arpt_get_entries get;
+ struct xt_table *t;
+
+ if (*len < sizeof(get))
+ return -EINVAL;
+ if (copy_from_user(&get, uptr, sizeof(get)) != 0)
+ return -EFAULT;
+ if (*len != sizeof(struct compat_arpt_get_entries) + get.size)
+ return -EINVAL;
+
+ get.name[sizeof(get.name) - 1] = '\0';
+
+ xt_compat_lock(NFPROTO_ARP);
+ t = xt_find_table_lock(net, NFPROTO_ARP, get.name);
+ if (!IS_ERR(t)) {
+ const struct xt_table_info *private = t->private;
+ struct xt_table_info info;
+
+ ret = compat_table_info(private, &info);
+ if (!ret && get.size == info.size) {
+ ret = compat_copy_entries_to_user(private->size,
+ t, uptr->entrytable);
+ } else if (!ret)
+ ret = -EAGAIN;
- if (!capable(CAP_NET_ADMIN))
+ xt_compat_flush_offsets(NFPROTO_ARP);
+ module_put(t->me);
+ xt_table_unlock(t);
+ } else
+ ret = PTR_ERR(t);
+
+ xt_compat_unlock(NFPROTO_ARP);
+ return ret;
+}
+#endif
+
+static int do_arpt_set_ctl(struct sock *sk, int cmd, sockptr_t arg,
+ unsigned int len)
+{
+ int ret;
+
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
return -EPERM;
switch (cmd) {
case ARPT_SO_SET_REPLACE:
- ret = do_replace(user, len);
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+ if (in_compat_syscall())
+ ret = compat_do_replace(sock_net(sk), arg, len);
+ else
+#endif
+ ret = do_replace(sock_net(sk), arg, len);
break;
case ARPT_SO_SET_ADD_COUNTERS:
- ret = do_add_counters(user, len);
+ ret = do_add_counters(sock_net(sk), arg, len);
break;
default:
- duprintf("do_arpt_set_ctl: unknown request %i\n", cmd);
ret = -EINVAL;
}
@@ -998,69 +1444,22 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len
{
int ret;
- if (!capable(CAP_NET_ADMIN))
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
return -EPERM;
switch (cmd) {
- case ARPT_SO_GET_INFO: {
- char name[ARPT_TABLE_MAXNAMELEN];
- struct arpt_table *t;
-
- if (*len != sizeof(struct arpt_getinfo)) {
- duprintf("length %u != %Zu\n", *len,
- sizeof(struct arpt_getinfo));
- ret = -EINVAL;
- break;
- }
-
- if (copy_from_user(name, user, sizeof(name)) != 0) {
- ret = -EFAULT;
- break;
- }
- name[ARPT_TABLE_MAXNAMELEN-1] = '\0';
-
- t = try_then_request_module(xt_find_table_lock(NF_ARP, name),
- "arptable_%s", name);
- if (t && !IS_ERR(t)) {
- struct arpt_getinfo info;
- struct xt_table_info *private = t->private;
-
- info.valid_hooks = t->valid_hooks;
- memcpy(info.hook_entry, private->hook_entry,
- sizeof(info.hook_entry));
- memcpy(info.underflow, private->underflow,
- sizeof(info.underflow));
- info.num_entries = private->number;
- info.size = private->size;
- strcpy(info.name, name);
-
- if (copy_to_user(user, &info, *len) != 0)
- ret = -EFAULT;
- else
- ret = 0;
- xt_table_unlock(t);
- module_put(t->me);
- } else
- ret = t ? PTR_ERR(t) : -ENOENT;
- }
- break;
-
- case ARPT_SO_GET_ENTRIES: {
- struct arpt_get_entries get;
+ case ARPT_SO_GET_INFO:
+ ret = get_info(sock_net(sk), user, len);
+ break;
- if (*len < sizeof(get)) {
- duprintf("get_entries: %u < %Zu\n", *len, sizeof(get));
- ret = -EINVAL;
- } else if (copy_from_user(&get, user, sizeof(get)) != 0) {
- ret = -EFAULT;
- } else if (*len != sizeof(struct arpt_get_entries) + get.size) {
- duprintf("get_entries: %u != %Zu\n", *len,
- sizeof(struct arpt_get_entries) + get.size);
- ret = -EINVAL;
- } else
- ret = get_entries(&get, user);
+ case ARPT_SO_GET_ENTRIES:
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+ if (in_compat_syscall())
+ ret = compat_get_entries(sock_net(sk), user, len);
+ else
+#endif
+ ret = get_entries(sock_net(sk), user, len);
break;
- }
case ARPT_SO_GET_REVISION_TARGET: {
struct xt_get_revision rev;
@@ -1073,87 +1472,138 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len
ret = -EFAULT;
break;
}
+ rev.name[sizeof(rev.name)-1] = 0;
- try_then_request_module(xt_find_revision(NF_ARP, rev.name,
+ try_then_request_module(xt_find_revision(NFPROTO_ARP, rev.name,
rev.revision, 1, &ret),
"arpt_%s", rev.name);
break;
}
default:
- duprintf("do_arpt_get_ctl: unknown request %i\n", cmd);
ret = -EINVAL;
}
return ret;
}
-int arpt_register_table(struct arpt_table *table,
- const struct arpt_replace *repl)
+static void __arpt_unregister_table(struct net *net, struct xt_table *table)
{
- int ret;
+ struct xt_table_info *private;
+ void *loc_cpu_entry;
+ struct module *table_owner = table->me;
+ struct arpt_entry *iter;
+
+ private = xt_unregister_table(table);
+
+ /* Decrease module usage counts and free resources */
+ loc_cpu_entry = private->entries;
+ xt_entry_foreach(iter, loc_cpu_entry, private->size)
+ cleanup_entry(iter, net);
+ if (private->number > private->initial_entries)
+ module_put(table_owner);
+ xt_free_table_info(private);
+}
+
+int arpt_register_table(struct net *net,
+ const struct xt_table *table,
+ const struct arpt_replace *repl,
+ const struct nf_hook_ops *template_ops)
+{
+ struct nf_hook_ops *ops;
+ unsigned int num_ops;
+ int ret, i;
struct xt_table_info *newinfo;
- static struct xt_table_info bootstrap
- = { 0, 0, 0, { 0 }, { 0 }, { } };
+ struct xt_table_info bootstrap = {0};
void *loc_cpu_entry;
+ struct xt_table *new_table;
newinfo = xt_alloc_table_info(repl->size);
- if (!newinfo) {
- ret = -ENOMEM;
- return ret;
- }
+ if (!newinfo)
+ return -ENOMEM;
- /* choose the copy on our node/cpu */
- loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+ loc_cpu_entry = newinfo->entries;
memcpy(loc_cpu_entry, repl->entries, repl->size);
- ret = translate_table(table->name, table->valid_hooks,
- newinfo, loc_cpu_entry, repl->size,
- repl->num_entries,
- repl->hook_entry,
- repl->underflow);
-
- duprintf("arpt_register_table: translate table gives %d\n", ret);
+ ret = translate_table(net, newinfo, loc_cpu_entry, repl);
if (ret != 0) {
xt_free_table_info(newinfo);
return ret;
}
- ret = xt_register_table(table, &bootstrap, newinfo);
- if (ret != 0) {
+ new_table = xt_register_table(net, table, &bootstrap, newinfo);
+ if (IS_ERR(new_table)) {
+ struct arpt_entry *iter;
+
+ xt_entry_foreach(iter, loc_cpu_entry, newinfo->size)
+ cleanup_entry(iter, net);
xt_free_table_info(newinfo);
- return ret;
+ return PTR_ERR(new_table);
}
- return 0;
+ num_ops = hweight32(table->valid_hooks);
+ if (num_ops == 0) {
+ ret = -EINVAL;
+ goto out_free;
+ }
+
+ ops = kmemdup_array(template_ops, num_ops, sizeof(*ops), GFP_KERNEL);
+ if (!ops) {
+ ret = -ENOMEM;
+ goto out_free;
+ }
+
+ for (i = 0; i < num_ops; i++)
+ ops[i].priv = new_table;
+
+ new_table->ops = ops;
+
+ ret = nf_register_net_hooks(net, ops, num_ops);
+ if (ret != 0)
+ goto out_free;
+
+ return ret;
+
+out_free:
+ __arpt_unregister_table(net, new_table);
+ return ret;
}
-void arpt_unregister_table(struct arpt_table *table)
+void arpt_unregister_table_pre_exit(struct net *net, const char *name)
{
- struct xt_table_info *private;
- void *loc_cpu_entry;
+ struct xt_table *table = xt_find_table(net, NFPROTO_ARP, name);
- private = xt_unregister_table(table);
+ if (table)
+ nf_unregister_net_hooks(net, table->ops, hweight32(table->valid_hooks));
+}
+EXPORT_SYMBOL(arpt_unregister_table_pre_exit);
- /* Decrease module usage counts and free resources */
- loc_cpu_entry = private->entries[raw_smp_processor_id()];
- ARPT_ENTRY_ITERATE(loc_cpu_entry, private->size,
- cleanup_entry, NULL);
- xt_free_table_info(private);
+void arpt_unregister_table(struct net *net, const char *name)
+{
+ struct xt_table *table = xt_find_table(net, NFPROTO_ARP, name);
+
+ if (table)
+ __arpt_unregister_table(net, table);
}
/* The built-in targets: standard (NULL) and error. */
-static struct arpt_target arpt_standard_target = {
- .name = ARPT_STANDARD_TARGET,
- .targetsize = sizeof(int),
- .family = NF_ARP,
-};
-
-static struct arpt_target arpt_error_target = {
- .name = ARPT_ERROR_TARGET,
- .target = arpt_error,
- .targetsize = ARPT_FUNCTION_MAXNAMELEN,
- .family = NF_ARP,
+static struct xt_target arpt_builtin_tg[] __read_mostly = {
+ {
+ .name = XT_STANDARD_TARGET,
+ .targetsize = sizeof(int),
+ .family = NFPROTO_ARP,
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+ .compatsize = sizeof(compat_int_t),
+ .compat_from_user = compat_standard_from_user,
+ .compat_to_user = compat_standard_to_user,
+#endif
+ },
+ {
+ .name = XT_ERROR_TARGET,
+ .target = arpt_error,
+ .targetsize = XT_FUNCTION_MAXNAMELEN,
+ .family = NFPROTO_ARP,
+ },
};
static struct nf_sockopt_ops arpt_sockopts = {
@@ -1164,38 +1614,48 @@ static struct nf_sockopt_ops arpt_sockopts = {
.get_optmin = ARPT_BASE_CTL,
.get_optmax = ARPT_SO_GET_MAX+1,
.get = do_arpt_get_ctl,
+ .owner = THIS_MODULE,
+};
+
+static int __net_init arp_tables_net_init(struct net *net)
+{
+ return xt_proto_init(net, NFPROTO_ARP);
+}
+
+static void __net_exit arp_tables_net_exit(struct net *net)
+{
+ xt_proto_fini(net, NFPROTO_ARP);
+}
+
+static struct pernet_operations arp_tables_net_ops = {
+ .init = arp_tables_net_init,
+ .exit = arp_tables_net_exit,
};
static int __init arp_tables_init(void)
{
int ret;
- ret = xt_proto_init(NF_ARP);
+ ret = register_pernet_subsys(&arp_tables_net_ops);
if (ret < 0)
goto err1;
- /* Noone else will be downing sem now, so we won't sleep */
- ret = xt_register_target(&arpt_standard_target);
+ /* No one else will be downing sem now, so we won't sleep */
+ ret = xt_register_targets(arpt_builtin_tg, ARRAY_SIZE(arpt_builtin_tg));
if (ret < 0)
goto err2;
- ret = xt_register_target(&arpt_error_target);
- if (ret < 0)
- goto err3;
/* Register setsockopt */
ret = nf_register_sockopt(&arpt_sockopts);
if (ret < 0)
goto err4;
- printk("arp_tables: (C) 2002 David S. Miller\n");
return 0;
err4:
- xt_unregister_target(&arpt_error_target);
-err3:
- xt_unregister_target(&arpt_standard_target);
+ xt_unregister_targets(arpt_builtin_tg, ARRAY_SIZE(arpt_builtin_tg));
err2:
- xt_proto_fini(NF_ARP);
+ unregister_pernet_subsys(&arp_tables_net_ops);
err1:
return ret;
}
@@ -1203,9 +1663,8 @@ err1:
static void __exit arp_tables_fini(void)
{
nf_unregister_sockopt(&arpt_sockopts);
- xt_unregister_target(&arpt_error_target);
- xt_unregister_target(&arpt_standard_target);
- xt_proto_fini(NF_ARP);
+ xt_unregister_targets(arpt_builtin_tg, ARRAY_SIZE(arpt_builtin_tg));
+ unregister_pernet_subsys(&arp_tables_net_ops);
}
EXPORT_SYMBOL(arpt_register_table);
diff --git a/net/ipv4/netfilter/arpt_mangle.c b/net/ipv4/netfilter/arpt_mangle.c
index d12b1df252a1..a4e07e5e9c11 100644
--- a/net/ipv4/netfilter/arpt_mangle.c
+++ b/net/ipv4/netfilter/arpt_mangle.c
@@ -1,5 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* module that allows mangling of the arp payload */
#include <linux/module.h>
+#include <linux/netfilter.h>
#include <linux/netfilter_arp/arpt_mangle.h>
#include <net/sock.h>
@@ -8,81 +10,68 @@ MODULE_AUTHOR("Bart De Schuymer <bdschuym@pandora.be>");
MODULE_DESCRIPTION("arptables arp payload mangle target");
static unsigned int
-target(struct sk_buff **pskb,
- const struct net_device *in, const struct net_device *out,
- unsigned int hooknum, const struct xt_target *target,
- const void *targinfo)
+target(struct sk_buff *skb, const struct xt_action_param *par)
{
- const struct arpt_mangle *mangle = targinfo;
- struct arphdr *arp;
+ const struct arpt_mangle *mangle = par->targinfo;
+ const struct arphdr *arp;
unsigned char *arpptr;
int pln, hln;
- if (skb_shared(*pskb) || skb_cloned(*pskb)) {
- struct sk_buff *nskb;
+ if (skb_ensure_writable(skb, skb->len))
+ return NF_DROP;
- nskb = skb_copy(*pskb, GFP_ATOMIC);
- if (!nskb)
- return NF_DROP;
- if ((*pskb)->sk)
- skb_set_owner_w(nskb, (*pskb)->sk);
- kfree_skb(*pskb);
- *pskb = nskb;
- }
-
- arp = (*pskb)->nh.arph;
- arpptr = (*pskb)->nh.raw + sizeof(*arp);
+ arp = arp_hdr(skb);
+ arpptr = skb_network_header(skb) + sizeof(*arp);
pln = arp->ar_pln;
hln = arp->ar_hln;
/* We assume that pln and hln were checked in the match */
if (mangle->flags & ARPT_MANGLE_SDEV) {
if (ARPT_DEV_ADDR_LEN_MAX < hln ||
- (arpptr + hln > (**pskb).tail))
+ (arpptr + hln > skb_tail_pointer(skb)))
return NF_DROP;
memcpy(arpptr, mangle->src_devaddr, hln);
}
arpptr += hln;
if (mangle->flags & ARPT_MANGLE_SIP) {
if (ARPT_MANGLE_ADDR_LEN_MAX < pln ||
- (arpptr + pln > (**pskb).tail))
+ (arpptr + pln > skb_tail_pointer(skb)))
return NF_DROP;
memcpy(arpptr, &mangle->u_s.src_ip, pln);
}
arpptr += pln;
if (mangle->flags & ARPT_MANGLE_TDEV) {
if (ARPT_DEV_ADDR_LEN_MAX < hln ||
- (arpptr + hln > (**pskb).tail))
+ (arpptr + hln > skb_tail_pointer(skb)))
return NF_DROP;
memcpy(arpptr, mangle->tgt_devaddr, hln);
}
arpptr += hln;
if (mangle->flags & ARPT_MANGLE_TIP) {
if (ARPT_MANGLE_ADDR_LEN_MAX < pln ||
- (arpptr + pln > (**pskb).tail))
+ (arpptr + pln > skb_tail_pointer(skb)))
return NF_DROP;
memcpy(arpptr, &mangle->u_t.tgt_ip, pln);
}
return mangle->target;
}
-static int
-checkentry(const char *tablename, const void *e, const struct xt_target *target,
- void *targinfo, unsigned int hook_mask)
+static int checkentry(const struct xt_tgchk_param *par)
{
- const struct arpt_mangle *mangle = targinfo;
+ const struct arpt_mangle *mangle = par->targinfo;
if (mangle->flags & ~ARPT_MANGLE_MASK ||
!(mangle->flags & ARPT_MANGLE_MASK))
- return 0;
+ return -EINVAL;
if (mangle->target != NF_DROP && mangle->target != NF_ACCEPT &&
- mangle->target != ARPT_CONTINUE)
- return 0;
- return 1;
+ mangle->target != XT_CONTINUE)
+ return -EINVAL;
+ return 0;
}
-static struct arpt_target arpt_mangle_reg = {
+static struct xt_target arpt_mangle_reg __read_mostly = {
.name = "mangle",
+ .family = NFPROTO_ARP,
.target = target,
.targetsize = sizeof(struct arpt_mangle),
.checkentry = checkentry,
@@ -91,15 +80,12 @@ static struct arpt_target arpt_mangle_reg = {
static int __init arpt_mangle_init(void)
{
- if (arpt_register_target(&arpt_mangle_reg))
- return -EINVAL;
-
- return 0;
+ return xt_register_target(&arpt_mangle_reg);
}
static void __exit arpt_mangle_fini(void)
{
- arpt_unregister_target(&arpt_mangle_reg);
+ xt_unregister_target(&arpt_mangle_reg);
}
module_init(arpt_mangle_init);
diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c
index 7edea2a1696c..78cd5ee24448 100644
--- a/net/ipv4/netfilter/arptable_filter.c
+++ b/net/ipv4/netfilter/arptable_filter.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Filtering ARP tables module.
*
@@ -6,7 +7,9 @@
*/
#include <linux/module.h>
+#include <linux/netfilter/x_tables.h>
#include <linux/netfilter_arp/arp_tables.h>
+#include <linux/slab.h>
MODULE_LICENSE("GPL");
MODULE_AUTHOR("David S. Miller <davem@redhat.com>");
@@ -15,193 +18,73 @@ MODULE_DESCRIPTION("arptables filter table");
#define FILTER_VALID_HOOKS ((1 << NF_ARP_IN) | (1 << NF_ARP_OUT) | \
(1 << NF_ARP_FORWARD))
-/* Standard entry. */
-struct arpt_standard
-{
- struct arpt_entry entry;
- struct arpt_standard_target target;
+static const struct xt_table packet_filter = {
+ .name = "filter",
+ .valid_hooks = FILTER_VALID_HOOKS,
+ .me = THIS_MODULE,
+ .af = NFPROTO_ARP,
+ .priority = NF_IP_PRI_FILTER,
};
-struct arpt_error_target
-{
- struct arpt_entry_target target;
- char errorname[ARPT_FUNCTION_MAXNAMELEN];
-};
+static struct nf_hook_ops *arpfilter_ops __read_mostly;
-struct arpt_error
+static int arptable_filter_table_init(struct net *net)
{
- struct arpt_entry entry;
- struct arpt_error_target target;
-};
+ struct arpt_replace *repl;
+ int err;
+
+ repl = arpt_alloc_initial_table(&packet_filter);
+ if (repl == NULL)
+ return -ENOMEM;
+ err = arpt_register_table(net, &packet_filter, repl, arpfilter_ops);
+ kfree(repl);
+ return err;
+}
-static struct
+static void __net_exit arptable_filter_net_pre_exit(struct net *net)
{
- struct arpt_replace repl;
- struct arpt_standard entries[3];
- struct arpt_error term;
-} initial_table __initdata
-= { { "filter", FILTER_VALID_HOOKS, 4,
- sizeof(struct arpt_standard) * 3 + sizeof(struct arpt_error),
- { [NF_ARP_IN] = 0,
- [NF_ARP_OUT] = sizeof(struct arpt_standard),
- [NF_ARP_FORWARD] = 2 * sizeof(struct arpt_standard), },
- { [NF_ARP_IN] = 0,
- [NF_ARP_OUT] = sizeof(struct arpt_standard),
- [NF_ARP_FORWARD] = 2 * sizeof(struct arpt_standard), },
- 0, NULL, { } },
- {
- /* ARP_IN */
- {
- {
- {
- { 0 }, { 0 }, { 0 }, { 0 },
- 0, 0,
- { { 0, }, { 0, } },
- { { 0, }, { 0, } },
- 0, 0,
- 0, 0,
- 0, 0,
- "", "", { 0 }, { 0 },
- 0, 0
- },
- sizeof(struct arpt_entry),
- sizeof(struct arpt_standard),
- 0,
- { 0, 0 }, { } },
- { { { { ARPT_ALIGN(sizeof(struct arpt_standard_target)), "" } }, { } },
- -NF_ACCEPT - 1 }
- },
- /* ARP_OUT */
- {
- {
- {
- { 0 }, { 0 }, { 0 }, { 0 },
- 0, 0,
- { { 0, }, { 0, } },
- { { 0, }, { 0, } },
- 0, 0,
- 0, 0,
- 0, 0,
- "", "", { 0 }, { 0 },
- 0, 0
- },
- sizeof(struct arpt_entry),
- sizeof(struct arpt_standard),
- 0,
- { 0, 0 }, { } },
- { { { { ARPT_ALIGN(sizeof(struct arpt_standard_target)), "" } }, { } },
- -NF_ACCEPT - 1 }
- },
- /* ARP_FORWARD */
- {
- {
- {
- { 0 }, { 0 }, { 0 }, { 0 },
- 0, 0,
- { { 0, }, { 0, } },
- { { 0, }, { 0, } },
- 0, 0,
- 0, 0,
- 0, 0,
- "", "", { 0 }, { 0 },
- 0, 0
- },
- sizeof(struct arpt_entry),
- sizeof(struct arpt_standard),
- 0,
- { 0, 0 }, { } },
- { { { { ARPT_ALIGN(sizeof(struct arpt_standard_target)), "" } }, { } },
- -NF_ACCEPT - 1 }
- }
- },
- /* ERROR */
- {
- {
- {
- { 0 }, { 0 }, { 0 }, { 0 },
- 0, 0,
- { { 0, }, { 0, } },
- { { 0, }, { 0, } },
- 0, 0,
- 0, 0,
- 0, 0,
- "", "", { 0 }, { 0 },
- 0, 0
- },
- sizeof(struct arpt_entry),
- sizeof(struct arpt_error),
- 0,
- { 0, 0 }, { } },
- { { { { ARPT_ALIGN(sizeof(struct arpt_error_target)), ARPT_ERROR_TARGET } },
- { } },
- "ERROR"
- }
- }
-};
-
-static struct arpt_table packet_filter = {
- .name = "filter",
- .valid_hooks = FILTER_VALID_HOOKS,
- .lock = RW_LOCK_UNLOCKED,
- .private = NULL,
- .me = THIS_MODULE,
- .af = NF_ARP,
-};
+ arpt_unregister_table_pre_exit(net, "filter");
+}
-/* The work comes in here from netfilter.c */
-static unsigned int arpt_hook(unsigned int hook,
- struct sk_buff **pskb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
+static void __net_exit arptable_filter_net_exit(struct net *net)
{
- return arpt_do_table(pskb, hook, in, out, &packet_filter);
+ arpt_unregister_table(net, "filter");
}
-static struct nf_hook_ops arpt_ops[] = {
- {
- .hook = arpt_hook,
- .owner = THIS_MODULE,
- .pf = NF_ARP,
- .hooknum = NF_ARP_IN,
- },
- {
- .hook = arpt_hook,
- .owner = THIS_MODULE,
- .pf = NF_ARP,
- .hooknum = NF_ARP_OUT,
- },
- {
- .hook = arpt_hook,
- .owner = THIS_MODULE,
- .pf = NF_ARP,
- .hooknum = NF_ARP_FORWARD,
- },
+static struct pernet_operations arptable_filter_net_ops = {
+ .exit = arptable_filter_net_exit,
+ .pre_exit = arptable_filter_net_pre_exit,
};
static int __init arptable_filter_init(void)
{
- int ret;
+ int ret = xt_register_template(&packet_filter,
+ arptable_filter_table_init);
- /* Register table */
- ret = arpt_register_table(&packet_filter, &initial_table.repl);
if (ret < 0)
return ret;
- ret = nf_register_hooks(arpt_ops, ARRAY_SIZE(arpt_ops));
- if (ret < 0)
- goto cleanup_table;
- return ret;
+ arpfilter_ops = xt_hook_ops_alloc(&packet_filter, arpt_do_table);
+ if (IS_ERR(arpfilter_ops)) {
+ xt_unregister_template(&packet_filter);
+ return PTR_ERR(arpfilter_ops);
+ }
+
+ ret = register_pernet_subsys(&arptable_filter_net_ops);
+ if (ret < 0) {
+ xt_unregister_template(&packet_filter);
+ kfree(arpfilter_ops);
+ return ret;
+ }
-cleanup_table:
- arpt_unregister_table(&packet_filter);
return ret;
}
static void __exit arptable_filter_fini(void)
{
- nf_unregister_hooks(arpt_ops, ARRAY_SIZE(arpt_ops));
- arpt_unregister_table(&packet_filter);
+ unregister_pernet_subsys(&arptable_filter_net_ops);
+ xt_unregister_template(&packet_filter);
+ kfree(arpfilter_ops);
}
module_init(arptable_filter_init);
diff --git a/net/ipv4/netfilter/ip_conntrack_amanda.c b/net/ipv4/netfilter/ip_conntrack_amanda.c
deleted file mode 100644
index ad246ba7790b..000000000000
--- a/net/ipv4/netfilter/ip_conntrack_amanda.c
+++ /dev/null
@@ -1,229 +0,0 @@
-/* Amanda extension for IP connection tracking, Version 0.2
- * (C) 2002 by Brian J. Murrell <netfilter@interlinx.bc.ca>
- * based on HW's ip_conntrack_irc.c as well as other modules
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * Module load syntax:
- * insmod ip_conntrack_amanda.o [master_timeout=n]
- *
- * Where master_timeout is the timeout (in seconds) of the master
- * connection (port 10080). This defaults to 5 minutes but if
- * your clients take longer than 5 minutes to do their work
- * before getting back to the Amanda server, you can increase
- * this value.
- *
- */
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/textsearch.h>
-#include <linux/skbuff.h>
-#include <linux/in.h>
-#include <linux/ip.h>
-#include <linux/udp.h>
-
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
-#include <linux/netfilter_ipv4/ip_conntrack_amanda.h>
-
-static unsigned int master_timeout = 300;
-static char *ts_algo = "kmp";
-
-MODULE_AUTHOR("Brian J. Murrell <netfilter@interlinx.bc.ca>");
-MODULE_DESCRIPTION("Amanda connection tracking module");
-MODULE_LICENSE("GPL");
-module_param(master_timeout, uint, 0600);
-MODULE_PARM_DESC(master_timeout, "timeout for the master connection");
-module_param(ts_algo, charp, 0400);
-MODULE_PARM_DESC(ts_algo, "textsearch algorithm to use (default kmp)");
-
-unsigned int (*ip_nat_amanda_hook)(struct sk_buff **pskb,
- enum ip_conntrack_info ctinfo,
- unsigned int matchoff,
- unsigned int matchlen,
- struct ip_conntrack_expect *exp);
-EXPORT_SYMBOL_GPL(ip_nat_amanda_hook);
-
-enum amanda_strings {
- SEARCH_CONNECT,
- SEARCH_NEWLINE,
- SEARCH_DATA,
- SEARCH_MESG,
- SEARCH_INDEX,
-};
-
-static struct {
- char *string;
- size_t len;
- struct ts_config *ts;
-} search[] = {
- [SEARCH_CONNECT] = {
- .string = "CONNECT ",
- .len = 8,
- },
- [SEARCH_NEWLINE] = {
- .string = "\n",
- .len = 1,
- },
- [SEARCH_DATA] = {
- .string = "DATA ",
- .len = 5,
- },
- [SEARCH_MESG] = {
- .string = "MESG ",
- .len = 5,
- },
- [SEARCH_INDEX] = {
- .string = "INDEX ",
- .len = 6,
- },
-};
-
-static int help(struct sk_buff **pskb,
- struct ip_conntrack *ct, enum ip_conntrack_info ctinfo)
-{
- struct ts_state ts;
- struct ip_conntrack_expect *exp;
- unsigned int dataoff, start, stop, off, i;
- char pbuf[sizeof("65535")], *tmp;
- u_int16_t port, len;
- int ret = NF_ACCEPT;
- typeof(ip_nat_amanda_hook) ip_nat_amanda;
-
- /* Only look at packets from the Amanda server */
- if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
- return NF_ACCEPT;
-
- /* increase the UDP timeout of the master connection as replies from
- * Amanda clients to the server can be quite delayed */
- ip_ct_refresh(ct, *pskb, master_timeout * HZ);
-
- /* No data? */
- dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr);
- if (dataoff >= (*pskb)->len) {
- if (net_ratelimit())
- printk("amanda_help: skblen = %u\n", (*pskb)->len);
- return NF_ACCEPT;
- }
-
- memset(&ts, 0, sizeof(ts));
- start = skb_find_text(*pskb, dataoff, (*pskb)->len,
- search[SEARCH_CONNECT].ts, &ts);
- if (start == UINT_MAX)
- goto out;
- start += dataoff + search[SEARCH_CONNECT].len;
-
- memset(&ts, 0, sizeof(ts));
- stop = skb_find_text(*pskb, start, (*pskb)->len,
- search[SEARCH_NEWLINE].ts, &ts);
- if (stop == UINT_MAX)
- goto out;
- stop += start;
-
- for (i = SEARCH_DATA; i <= SEARCH_INDEX; i++) {
- memset(&ts, 0, sizeof(ts));
- off = skb_find_text(*pskb, start, stop, search[i].ts, &ts);
- if (off == UINT_MAX)
- continue;
- off += start + search[i].len;
-
- len = min_t(unsigned int, sizeof(pbuf) - 1, stop - off);
- if (skb_copy_bits(*pskb, off, pbuf, len))
- break;
- pbuf[len] = '\0';
-
- port = simple_strtoul(pbuf, &tmp, 10);
- len = tmp - pbuf;
- if (port == 0 || len > 5)
- break;
-
- exp = ip_conntrack_expect_alloc(ct);
- if (exp == NULL) {
- ret = NF_DROP;
- goto out;
- }
-
- exp->expectfn = NULL;
- exp->flags = 0;
-
- exp->tuple.src.ip = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip;
- exp->tuple.src.u.tcp.port = 0;
- exp->tuple.dst.ip = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip;
- exp->tuple.dst.protonum = IPPROTO_TCP;
- exp->tuple.dst.u.tcp.port = htons(port);
-
- exp->mask.src.ip = htonl(0xFFFFFFFF);
- exp->mask.src.u.tcp.port = 0;
- exp->mask.dst.ip = htonl(0xFFFFFFFF);
- exp->mask.dst.protonum = 0xFF;
- exp->mask.dst.u.tcp.port = htons(0xFFFF);
-
- /* RCU read locked by nf_hook_slow */
- ip_nat_amanda = rcu_dereference(ip_nat_amanda_hook);
- if (ip_nat_amanda)
- ret = ip_nat_amanda(pskb, ctinfo, off - dataoff,
- len, exp);
- else if (ip_conntrack_expect_related(exp) != 0)
- ret = NF_DROP;
- ip_conntrack_expect_put(exp);
- }
-
-out:
- return ret;
-}
-
-static struct ip_conntrack_helper amanda_helper = {
- .max_expected = 3,
- .timeout = 180,
- .me = THIS_MODULE,
- .help = help,
- .name = "amanda",
-
- .tuple = { .src = { .u = { .udp = {.port = __constant_htons(10080) } } },
- .dst = { .protonum = IPPROTO_UDP },
- },
- .mask = { .src = { .u = { 0xFFFF } },
- .dst = { .protonum = 0xFF },
- },
-};
-
-static void __exit ip_conntrack_amanda_fini(void)
-{
- int i;
-
- ip_conntrack_helper_unregister(&amanda_helper);
- for (i = 0; i < ARRAY_SIZE(search); i++)
- textsearch_destroy(search[i].ts);
-}
-
-static int __init ip_conntrack_amanda_init(void)
-{
- int ret, i;
-
- ret = -ENOMEM;
- for (i = 0; i < ARRAY_SIZE(search); i++) {
- search[i].ts = textsearch_prepare(ts_algo, search[i].string,
- search[i].len,
- GFP_KERNEL, TS_AUTOLOAD);
- if (search[i].ts == NULL)
- goto err;
- }
- ret = ip_conntrack_helper_register(&amanda_helper);
- if (ret < 0)
- goto err;
- return 0;
-
-err:
- for (; i >= 0; i--) {
- if (search[i].ts)
- textsearch_destroy(search[i].ts);
- }
- return ret;
-}
-
-module_init(ip_conntrack_amanda_init);
-module_exit(ip_conntrack_amanda_fini);
diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c
deleted file mode 100644
index 4d1f954d459b..000000000000
--- a/net/ipv4/netfilter/ip_conntrack_core.c
+++ /dev/null
@@ -1,1544 +0,0 @@
-/* Connection state tracking for netfilter. This is separated from,
- but required by, the NAT layer; it can also be used by an iptables
- extension. */
-
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
- * - new API and handling of conntrack/nat helpers
- * - now capable of multiple expectations for one master
- * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
- * - add usage/reference counts to ip_conntrack_expect
- * - export ip_conntrack[_expect]_{find_get,put} functions
- * */
-
-#include <linux/types.h>
-#include <linux/icmp.h>
-#include <linux/ip.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv4.h>
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/proc_fs.h>
-#include <linux/vmalloc.h>
-#include <net/checksum.h>
-#include <net/ip.h>
-#include <linux/stddef.h>
-#include <linux/sysctl.h>
-#include <linux/slab.h>
-#include <linux/random.h>
-#include <linux/jhash.h>
-#include <linux/err.h>
-#include <linux/percpu.h>
-#include <linux/moduleparam.h>
-#include <linux/notifier.h>
-
-/* ip_conntrack_lock protects the main hash table, protocol/helper/expected
- registrations, conntrack timers*/
-#define ASSERT_READ_LOCK(x)
-#define ASSERT_WRITE_LOCK(x)
-
-#include <linux/netfilter_ipv4/ip_conntrack.h>
-#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
-#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
-#include <linux/netfilter_ipv4/ip_conntrack_core.h>
-
-#define IP_CONNTRACK_VERSION "2.4"
-
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
-DEFINE_RWLOCK(ip_conntrack_lock);
-
-/* ip_conntrack_standalone needs this */
-atomic_t ip_conntrack_count = ATOMIC_INIT(0);
-
-void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
-LIST_HEAD(ip_conntrack_expect_list);
-struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO] __read_mostly;
-static LIST_HEAD(helpers);
-unsigned int ip_conntrack_htable_size __read_mostly = 0;
-int ip_conntrack_max __read_mostly;
-struct list_head *ip_conntrack_hash __read_mostly;
-static kmem_cache_t *ip_conntrack_cachep __read_mostly;
-static kmem_cache_t *ip_conntrack_expect_cachep __read_mostly;
-struct ip_conntrack ip_conntrack_untracked;
-unsigned int ip_ct_log_invalid __read_mostly;
-static LIST_HEAD(unconfirmed);
-static int ip_conntrack_vmalloc __read_mostly;
-
-static unsigned int ip_conntrack_next_id;
-static unsigned int ip_conntrack_expect_next_id;
-#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
-ATOMIC_NOTIFIER_HEAD(ip_conntrack_chain);
-ATOMIC_NOTIFIER_HEAD(ip_conntrack_expect_chain);
-
-DEFINE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache);
-
-/* deliver cached events and clear cache entry - must be called with locally
- * disabled softirqs */
-static inline void
-__ip_ct_deliver_cached_events(struct ip_conntrack_ecache *ecache)
-{
- DEBUGP("ecache: delivering events for %p\n", ecache->ct);
- if (is_confirmed(ecache->ct) && !is_dying(ecache->ct) && ecache->events)
- atomic_notifier_call_chain(&ip_conntrack_chain, ecache->events,
- ecache->ct);
- ecache->events = 0;
- ip_conntrack_put(ecache->ct);
- ecache->ct = NULL;
-}
-
-/* Deliver all cached events for a particular conntrack. This is called
- * by code prior to async packet handling or freeing the skb */
-void ip_ct_deliver_cached_events(const struct ip_conntrack *ct)
-{
- struct ip_conntrack_ecache *ecache;
-
- local_bh_disable();
- ecache = &__get_cpu_var(ip_conntrack_ecache);
- if (ecache->ct == ct)
- __ip_ct_deliver_cached_events(ecache);
- local_bh_enable();
-}
-
-void __ip_ct_event_cache_init(struct ip_conntrack *ct)
-{
- struct ip_conntrack_ecache *ecache;
-
- /* take care of delivering potentially old events */
- ecache = &__get_cpu_var(ip_conntrack_ecache);
- BUG_ON(ecache->ct == ct);
- if (ecache->ct)
- __ip_ct_deliver_cached_events(ecache);
- /* initialize for this conntrack/packet */
- ecache->ct = ct;
- nf_conntrack_get(&ct->ct_general);
-}
-
-/* flush the event cache - touches other CPU's data and must not be called while
- * packets are still passing through the code */
-static void ip_ct_event_cache_flush(void)
-{
- struct ip_conntrack_ecache *ecache;
- int cpu;
-
- for_each_possible_cpu(cpu) {
- ecache = &per_cpu(ip_conntrack_ecache, cpu);
- if (ecache->ct)
- ip_conntrack_put(ecache->ct);
- }
-}
-#else
-static inline void ip_ct_event_cache_flush(void) {}
-#endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */
-
-DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
-
-static int ip_conntrack_hash_rnd_initted;
-static unsigned int ip_conntrack_hash_rnd;
-
-static u_int32_t __hash_conntrack(const struct ip_conntrack_tuple *tuple,
- unsigned int size, unsigned int rnd)
-{
- return (jhash_3words((__force u32)tuple->src.ip,
- ((__force u32)tuple->dst.ip ^ tuple->dst.protonum),
- (tuple->src.u.all | (tuple->dst.u.all << 16)),
- rnd) % size);
-}
-
-static u_int32_t
-hash_conntrack(const struct ip_conntrack_tuple *tuple)
-{
- return __hash_conntrack(tuple, ip_conntrack_htable_size,
- ip_conntrack_hash_rnd);
-}
-
-int
-ip_ct_get_tuple(const struct iphdr *iph,
- const struct sk_buff *skb,
- unsigned int dataoff,
- struct ip_conntrack_tuple *tuple,
- const struct ip_conntrack_protocol *protocol)
-{
- /* Never happen */
- if (iph->frag_off & htons(IP_OFFSET)) {
- printk("ip_conntrack_core: Frag of proto %u.\n",
- iph->protocol);
- return 0;
- }
-
- tuple->src.ip = iph->saddr;
- tuple->dst.ip = iph->daddr;
- tuple->dst.protonum = iph->protocol;
- tuple->dst.dir = IP_CT_DIR_ORIGINAL;
-
- return protocol->pkt_to_tuple(skb, dataoff, tuple);
-}
-
-int
-ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse,
- const struct ip_conntrack_tuple *orig,
- const struct ip_conntrack_protocol *protocol)
-{
- inverse->src.ip = orig->dst.ip;
- inverse->dst.ip = orig->src.ip;
- inverse->dst.protonum = orig->dst.protonum;
- inverse->dst.dir = !orig->dst.dir;
-
- return protocol->invert_tuple(inverse, orig);
-}
-
-
-/* ip_conntrack_expect helper functions */
-void ip_ct_unlink_expect(struct ip_conntrack_expect *exp)
-{
- ASSERT_WRITE_LOCK(&ip_conntrack_lock);
- IP_NF_ASSERT(!timer_pending(&exp->timeout));
- list_del(&exp->list);
- CONNTRACK_STAT_INC(expect_delete);
- exp->master->expecting--;
- ip_conntrack_expect_put(exp);
-}
-
-static void expectation_timed_out(unsigned long ul_expect)
-{
- struct ip_conntrack_expect *exp = (void *)ul_expect;
-
- write_lock_bh(&ip_conntrack_lock);
- ip_ct_unlink_expect(exp);
- write_unlock_bh(&ip_conntrack_lock);
- ip_conntrack_expect_put(exp);
-}
-
-struct ip_conntrack_expect *
-__ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple)
-{
- struct ip_conntrack_expect *i;
-
- list_for_each_entry(i, &ip_conntrack_expect_list, list) {
- if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask))
- return i;
- }
- return NULL;
-}
-
-/* Just find a expectation corresponding to a tuple. */
-struct ip_conntrack_expect *
-ip_conntrack_expect_find_get(const struct ip_conntrack_tuple *tuple)
-{
- struct ip_conntrack_expect *i;
-
- read_lock_bh(&ip_conntrack_lock);
- i = __ip_conntrack_expect_find(tuple);
- if (i)
- atomic_inc(&i->use);
- read_unlock_bh(&ip_conntrack_lock);
-
- return i;
-}
-
-/* If an expectation for this connection is found, it gets delete from
- * global list then returned. */
-static struct ip_conntrack_expect *
-find_expectation(const struct ip_conntrack_tuple *tuple)
-{
- struct ip_conntrack_expect *i;
-
- list_for_each_entry(i, &ip_conntrack_expect_list, list) {
- /* If master is not in hash table yet (ie. packet hasn't left
- this machine yet), how can other end know about expected?
- Hence these are not the droids you are looking for (if
- master ct never got confirmed, we'd hold a reference to it
- and weird things would happen to future packets). */
- if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
- && is_confirmed(i->master)) {
- if (i->flags & IP_CT_EXPECT_PERMANENT) {
- atomic_inc(&i->use);
- return i;
- } else if (del_timer(&i->timeout)) {
- ip_ct_unlink_expect(i);
- return i;
- }
- }
- }
- return NULL;
-}
-
-/* delete all expectations for this conntrack */
-void ip_ct_remove_expectations(struct ip_conntrack *ct)
-{
- struct ip_conntrack_expect *i, *tmp;
-
- /* Optimization: most connection never expect any others. */
- if (ct->expecting == 0)
- return;
-
- list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) {
- if (i->master == ct && del_timer(&i->timeout)) {
- ip_ct_unlink_expect(i);
- ip_conntrack_expect_put(i);
- }
- }
-}
-
-static void
-clean_from_lists(struct ip_conntrack *ct)
-{
- DEBUGP("clean_from_lists(%p)\n", ct);
- ASSERT_WRITE_LOCK(&ip_conntrack_lock);
- list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
- list_del(&ct->tuplehash[IP_CT_DIR_REPLY].list);
-
- /* Destroy all pending expectations */
- ip_ct_remove_expectations(ct);
-}
-
-static void
-destroy_conntrack(struct nf_conntrack *nfct)
-{
- struct ip_conntrack *ct = (struct ip_conntrack *)nfct;
- struct ip_conntrack_protocol *proto;
- struct ip_conntrack_helper *helper;
-
- DEBUGP("destroy_conntrack(%p)\n", ct);
- IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
- IP_NF_ASSERT(!timer_pending(&ct->timeout));
-
- ip_conntrack_event(IPCT_DESTROY, ct);
- set_bit(IPS_DYING_BIT, &ct->status);
-
- helper = ct->helper;
- if (helper && helper->destroy)
- helper->destroy(ct);
-
- /* To make sure we don't get any weird locking issues here:
- * destroy_conntrack() MUST NOT be called with a write lock
- * to ip_conntrack_lock!!! -HW */
- proto = __ip_conntrack_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
- if (proto && proto->destroy)
- proto->destroy(ct);
-
- if (ip_conntrack_destroyed)
- ip_conntrack_destroyed(ct);
-
- write_lock_bh(&ip_conntrack_lock);
- /* Expectations will have been removed in clean_from_lists,
- * except TFTP can create an expectation on the first packet,
- * before connection is in the list, so we need to clean here,
- * too. */
- ip_ct_remove_expectations(ct);
-
- /* We overload first tuple to link into unconfirmed list. */
- if (!is_confirmed(ct)) {
- BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
- list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
- }
-
- CONNTRACK_STAT_INC(delete);
- write_unlock_bh(&ip_conntrack_lock);
-
- if (ct->master)
- ip_conntrack_put(ct->master);
-
- DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
- ip_conntrack_free(ct);
-}
-
-static void death_by_timeout(unsigned long ul_conntrack)
-{
- struct ip_conntrack *ct = (void *)ul_conntrack;
-
- write_lock_bh(&ip_conntrack_lock);
- /* Inside lock so preempt is disabled on module removal path.
- * Otherwise we can get spurious warnings. */
- CONNTRACK_STAT_INC(delete_list);
- clean_from_lists(ct);
- write_unlock_bh(&ip_conntrack_lock);
- ip_conntrack_put(ct);
-}
-
-struct ip_conntrack_tuple_hash *
-__ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
- const struct ip_conntrack *ignored_conntrack)
-{
- struct ip_conntrack_tuple_hash *h;
- unsigned int hash = hash_conntrack(tuple);
-
- ASSERT_READ_LOCK(&ip_conntrack_lock);
- list_for_each_entry(h, &ip_conntrack_hash[hash], list) {
- if (tuplehash_to_ctrack(h) != ignored_conntrack &&
- ip_ct_tuple_equal(tuple, &h->tuple)) {
- CONNTRACK_STAT_INC(found);
- return h;
- }
- CONNTRACK_STAT_INC(searched);
- }
-
- return NULL;
-}
-
-/* Find a connection corresponding to a tuple. */
-struct ip_conntrack_tuple_hash *
-ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
- const struct ip_conntrack *ignored_conntrack)
-{
- struct ip_conntrack_tuple_hash *h;
-
- read_lock_bh(&ip_conntrack_lock);
- h = __ip_conntrack_find(tuple, ignored_conntrack);
- if (h)
- atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
- read_unlock_bh(&ip_conntrack_lock);
-
- return h;
-}
-
-static void __ip_conntrack_hash_insert(struct ip_conntrack *ct,
- unsigned int hash,
- unsigned int repl_hash)
-{
- ct->id = ++ip_conntrack_next_id;
- list_add(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list,
- &ip_conntrack_hash[hash]);
- list_add(&ct->tuplehash[IP_CT_DIR_REPLY].list,
- &ip_conntrack_hash[repl_hash]);
-}
-
-void ip_conntrack_hash_insert(struct ip_conntrack *ct)
-{
- unsigned int hash, repl_hash;
-
- hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
- repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
-
- write_lock_bh(&ip_conntrack_lock);
- __ip_conntrack_hash_insert(ct, hash, repl_hash);
- write_unlock_bh(&ip_conntrack_lock);
-}
-
-/* Confirm a connection given skb; places it in hash table */
-int
-__ip_conntrack_confirm(struct sk_buff **pskb)
-{
- unsigned int hash, repl_hash;
- struct ip_conntrack_tuple_hash *h;
- struct ip_conntrack *ct;
- enum ip_conntrack_info ctinfo;
-
- ct = ip_conntrack_get(*pskb, &ctinfo);
-
- /* ipt_REJECT uses ip_conntrack_attach to attach related
- ICMP/TCP RST packets in other direction. Actual packet
- which created connection will be IP_CT_NEW or for an
- expected connection, IP_CT_RELATED. */
- if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
- return NF_ACCEPT;
-
- hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
- repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
-
- /* We're not in hash table, and we refuse to set up related
- connections for unconfirmed conns. But packet copies and
- REJECT will give spurious warnings here. */
- /* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
-
- /* No external references means noone else could have
- confirmed us. */
- IP_NF_ASSERT(!is_confirmed(ct));
- DEBUGP("Confirming conntrack %p\n", ct);
-
- write_lock_bh(&ip_conntrack_lock);
-
- /* See if there's one in the list already, including reverse:
- NAT could have grabbed it without realizing, since we're
- not in the hash. If there is, we lost race. */
- list_for_each_entry(h, &ip_conntrack_hash[hash], list)
- if (ip_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
- &h->tuple))
- goto out;
- list_for_each_entry(h, &ip_conntrack_hash[repl_hash], list)
- if (ip_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
- &h->tuple))
- goto out;
-
- /* Remove from unconfirmed list */
- list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
-
- __ip_conntrack_hash_insert(ct, hash, repl_hash);
- /* Timer relative to confirmation time, not original
- setting time, otherwise we'd get timer wrap in
- weird delay cases. */
- ct->timeout.expires += jiffies;
- add_timer(&ct->timeout);
- atomic_inc(&ct->ct_general.use);
- set_bit(IPS_CONFIRMED_BIT, &ct->status);
- CONNTRACK_STAT_INC(insert);
- write_unlock_bh(&ip_conntrack_lock);
- if (ct->helper)
- ip_conntrack_event_cache(IPCT_HELPER, *pskb);
-#ifdef CONFIG_IP_NF_NAT_NEEDED
- if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
- test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
- ip_conntrack_event_cache(IPCT_NATINFO, *pskb);
-#endif
- ip_conntrack_event_cache(master_ct(ct) ?
- IPCT_RELATED : IPCT_NEW, *pskb);
-
- return NF_ACCEPT;
-
-out:
- CONNTRACK_STAT_INC(insert_failed);
- write_unlock_bh(&ip_conntrack_lock);
- return NF_DROP;
-}
-
-/* Returns true if a connection correspondings to the tuple (required
- for NAT). */
-int
-ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
- const struct ip_conntrack *ignored_conntrack)
-{
- struct ip_conntrack_tuple_hash *h;
-
- read_lock_bh(&ip_conntrack_lock);
- h = __ip_conntrack_find(tuple, ignored_conntrack);
- read_unlock_bh(&ip_conntrack_lock);
-
- return h != NULL;
-}
-
-/* There's a small race here where we may free a just-assured
- connection. Too bad: we're in trouble anyway. */
-static int early_drop(struct list_head *chain)
-{
- /* Traverse backwards: gives us oldest, which is roughly LRU */
- struct ip_conntrack_tuple_hash *h;
- struct ip_conntrack *ct = NULL, *tmp;
- int dropped = 0;
-
- read_lock_bh(&ip_conntrack_lock);
- list_for_each_entry_reverse(h, chain, list) {
- tmp = tuplehash_to_ctrack(h);
- if (!test_bit(IPS_ASSURED_BIT, &tmp->status)) {
- ct = tmp;
- atomic_inc(&ct->ct_general.use);
- break;
- }
- }
- read_unlock_bh(&ip_conntrack_lock);
-
- if (!ct)
- return dropped;
-
- if (del_timer(&ct->timeout)) {
- death_by_timeout((unsigned long)ct);
- dropped = 1;
- CONNTRACK_STAT_INC(early_drop);
- }
- ip_conntrack_put(ct);
- return dropped;
-}
-
-static struct ip_conntrack_helper *
-__ip_conntrack_helper_find( const struct ip_conntrack_tuple *tuple)
-{
- struct ip_conntrack_helper *h;
-
- list_for_each_entry(h, &helpers, list) {
- if (ip_ct_tuple_mask_cmp(tuple, &h->tuple, &h->mask))
- return h;
- }
- return NULL;
-}
-
-struct ip_conntrack_helper *
-ip_conntrack_helper_find_get( const struct ip_conntrack_tuple *tuple)
-{
- struct ip_conntrack_helper *helper;
-
- /* need ip_conntrack_lock to assure that helper exists until
- * try_module_get() is called */
- read_lock_bh(&ip_conntrack_lock);
-
- helper = __ip_conntrack_helper_find(tuple);
- if (helper) {
- /* need to increase module usage count to assure helper will
- * not go away while the caller is e.g. busy putting a
- * conntrack in the hash that uses the helper */
- if (!try_module_get(helper->me))
- helper = NULL;
- }
-
- read_unlock_bh(&ip_conntrack_lock);
-
- return helper;
-}
-
-void ip_conntrack_helper_put(struct ip_conntrack_helper *helper)
-{
- module_put(helper->me);
-}
-
-struct ip_conntrack_protocol *
-__ip_conntrack_proto_find(u_int8_t protocol)
-{
- return ip_ct_protos[protocol];
-}
-
-/* this is guaranteed to always return a valid protocol helper, since
- * it falls back to generic_protocol */
-struct ip_conntrack_protocol *
-ip_conntrack_proto_find_get(u_int8_t protocol)
-{
- struct ip_conntrack_protocol *p;
-
- preempt_disable();
- p = __ip_conntrack_proto_find(protocol);
- if (p) {
- if (!try_module_get(p->me))
- p = &ip_conntrack_generic_protocol;
- }
- preempt_enable();
-
- return p;
-}
-
-void ip_conntrack_proto_put(struct ip_conntrack_protocol *p)
-{
- module_put(p->me);
-}
-
-struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *orig,
- struct ip_conntrack_tuple *repl)
-{
- struct ip_conntrack *conntrack;
-
- if (!ip_conntrack_hash_rnd_initted) {
- get_random_bytes(&ip_conntrack_hash_rnd, 4);
- ip_conntrack_hash_rnd_initted = 1;
- }
-
- /* We don't want any race condition at early drop stage */
- atomic_inc(&ip_conntrack_count);
-
- if (ip_conntrack_max
- && atomic_read(&ip_conntrack_count) > ip_conntrack_max) {
- unsigned int hash = hash_conntrack(orig);
- /* Try dropping from this hash chain. */
- if (!early_drop(&ip_conntrack_hash[hash])) {
- atomic_dec(&ip_conntrack_count);
- if (net_ratelimit())
- printk(KERN_WARNING
- "ip_conntrack: table full, dropping"
- " packet.\n");
- return ERR_PTR(-ENOMEM);
- }
- }
-
- conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
- if (!conntrack) {
- DEBUGP("Can't allocate conntrack.\n");
- atomic_dec(&ip_conntrack_count);
- return ERR_PTR(-ENOMEM);
- }
-
- memset(conntrack, 0, sizeof(*conntrack));
- atomic_set(&conntrack->ct_general.use, 1);
- conntrack->ct_general.destroy = destroy_conntrack;
- conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
- conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
- /* Don't set timer yet: wait for confirmation */
- init_timer(&conntrack->timeout);
- conntrack->timeout.data = (unsigned long)conntrack;
- conntrack->timeout.function = death_by_timeout;
-
- return conntrack;
-}
-
-void
-ip_conntrack_free(struct ip_conntrack *conntrack)
-{
- atomic_dec(&ip_conntrack_count);
- kmem_cache_free(ip_conntrack_cachep, conntrack);
-}
-
-/* Allocate a new conntrack: we return -ENOMEM if classification
- * failed due to stress. Otherwise it really is unclassifiable */
-static struct ip_conntrack_tuple_hash *
-init_conntrack(struct ip_conntrack_tuple *tuple,
- struct ip_conntrack_protocol *protocol,
- struct sk_buff *skb)
-{
- struct ip_conntrack *conntrack;
- struct ip_conntrack_tuple repl_tuple;
- struct ip_conntrack_expect *exp;
-
- if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
- DEBUGP("Can't invert tuple.\n");
- return NULL;
- }
-
- conntrack = ip_conntrack_alloc(tuple, &repl_tuple);
- if (conntrack == NULL || IS_ERR(conntrack))
- return (struct ip_conntrack_tuple_hash *)conntrack;
-
- if (!protocol->new(conntrack, skb)) {
- ip_conntrack_free(conntrack);
- return NULL;
- }
-
- write_lock_bh(&ip_conntrack_lock);
- exp = find_expectation(tuple);
-
- if (exp) {
- DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
- conntrack, exp);
- /* Welcome, Mr. Bond. We've been expecting you... */
- __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
- conntrack->master = exp->master;
-#ifdef CONFIG_IP_NF_CONNTRACK_MARK
- conntrack->mark = exp->master->mark;
-#endif
-#if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \
- defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE)
- /* this is ugly, but there is no other place where to put it */
- conntrack->nat.masq_index = exp->master->nat.masq_index;
-#endif
-#ifdef CONFIG_IP_NF_CONNTRACK_SECMARK
- conntrack->secmark = exp->master->secmark;
-#endif
- nf_conntrack_get(&conntrack->master->ct_general);
- CONNTRACK_STAT_INC(expect_new);
- } else {
- conntrack->helper = __ip_conntrack_helper_find(&repl_tuple);
-
- CONNTRACK_STAT_INC(new);
- }
-
- /* Overload tuple linked list to put us in unconfirmed list. */
- list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
-
- write_unlock_bh(&ip_conntrack_lock);
-
- if (exp) {
- if (exp->expectfn)
- exp->expectfn(conntrack, exp);
- ip_conntrack_expect_put(exp);
- }
-
- return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
-}
-
-/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
-static inline struct ip_conntrack *
-resolve_normal_ct(struct sk_buff *skb,
- struct ip_conntrack_protocol *proto,
- int *set_reply,
- unsigned int hooknum,
- enum ip_conntrack_info *ctinfo)
-{
- struct ip_conntrack_tuple tuple;
- struct ip_conntrack_tuple_hash *h;
- struct ip_conntrack *ct;
-
- IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);
-
- if (!ip_ct_get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4,
- &tuple,proto))
- return NULL;
-
- /* look for tuple match */
- h = ip_conntrack_find_get(&tuple, NULL);
- if (!h) {
- h = init_conntrack(&tuple, proto, skb);
- if (!h)
- return NULL;
- if (IS_ERR(h))
- return (void *)h;
- }
- ct = tuplehash_to_ctrack(h);
-
- /* It exists; we have (non-exclusive) reference. */
- if (DIRECTION(h) == IP_CT_DIR_REPLY) {
- *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
- /* Please set reply bit if this packet OK */
- *set_reply = 1;
- } else {
- /* Once we've had two way comms, always ESTABLISHED. */
- if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
- DEBUGP("ip_conntrack_in: normal packet for %p\n",
- ct);
- *ctinfo = IP_CT_ESTABLISHED;
- } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
- DEBUGP("ip_conntrack_in: related packet for %p\n",
- ct);
- *ctinfo = IP_CT_RELATED;
- } else {
- DEBUGP("ip_conntrack_in: new packet for %p\n",
- ct);
- *ctinfo = IP_CT_NEW;
- }
- *set_reply = 0;
- }
- skb->nfct = &ct->ct_general;
- skb->nfctinfo = *ctinfo;
- return ct;
-}
-
-/* Netfilter hook itself. */
-unsigned int ip_conntrack_in(unsigned int hooknum,
- struct sk_buff **pskb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
-{
- struct ip_conntrack *ct;
- enum ip_conntrack_info ctinfo;
- struct ip_conntrack_protocol *proto;
- int set_reply = 0;
- int ret;
-
- /* Previously seen (loopback or untracked)? Ignore. */
- if ((*pskb)->nfct) {
- CONNTRACK_STAT_INC(ignore);
- return NF_ACCEPT;
- }
-
- /* Never happen */
- if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) {
- if (net_ratelimit()) {
- printk(KERN_ERR "ip_conntrack_in: Frag of proto %u (hook=%u)\n",
- (*pskb)->nh.iph->protocol, hooknum);
- }
- return NF_DROP;
- }
-
-/* Doesn't cover locally-generated broadcast, so not worth it. */
-#if 0
- /* Ignore broadcast: no `connection'. */
- if ((*pskb)->pkt_type == PACKET_BROADCAST) {
- printk("Broadcast packet!\n");
- return NF_ACCEPT;
- } else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF))
- == htonl(0x000000FF)) {
- printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n",
- NIPQUAD((*pskb)->nh.iph->saddr),
- NIPQUAD((*pskb)->nh.iph->daddr),
- (*pskb)->sk, (*pskb)->pkt_type);
- }
-#endif
-
- proto = __ip_conntrack_proto_find((*pskb)->nh.iph->protocol);
-
- /* It may be an special packet, error, unclean...
- * inverse of the return code tells to the netfilter
- * core what to do with the packet. */
- if (proto->error != NULL
- && (ret = proto->error(*pskb, &ctinfo, hooknum)) <= 0) {
- CONNTRACK_STAT_INC(error);
- CONNTRACK_STAT_INC(invalid);
- return -ret;
- }
-
- if (!(ct = resolve_normal_ct(*pskb, proto,&set_reply,hooknum,&ctinfo))) {
- /* Not valid part of a connection */
- CONNTRACK_STAT_INC(invalid);
- return NF_ACCEPT;
- }
-
- if (IS_ERR(ct)) {
- /* Too stressed to deal. */
- CONNTRACK_STAT_INC(drop);
- return NF_DROP;
- }
-
- IP_NF_ASSERT((*pskb)->nfct);
-
- ret = proto->packet(ct, *pskb, ctinfo);
- if (ret < 0) {
- /* Invalid: inverse of the return code tells
- * the netfilter core what to do*/
- nf_conntrack_put((*pskb)->nfct);
- (*pskb)->nfct = NULL;
- CONNTRACK_STAT_INC(invalid);
- return -ret;
- }
-
- if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
- ip_conntrack_event_cache(IPCT_STATUS, *pskb);
-
- return ret;
-}
-
-int invert_tuplepr(struct ip_conntrack_tuple *inverse,
- const struct ip_conntrack_tuple *orig)
-{
- return ip_ct_invert_tuple(inverse, orig,
- __ip_conntrack_proto_find(orig->dst.protonum));
-}
-
-/* Would two expected things clash? */
-static inline int expect_clash(const struct ip_conntrack_expect *a,
- const struct ip_conntrack_expect *b)
-{
- /* Part covered by intersection of masks must be unequal,
- otherwise they clash */
- struct ip_conntrack_tuple intersect_mask
- = { { a->mask.src.ip & b->mask.src.ip,
- { a->mask.src.u.all & b->mask.src.u.all } },
- { a->mask.dst.ip & b->mask.dst.ip,
- { a->mask.dst.u.all & b->mask.dst.u.all },
- a->mask.dst.protonum & b->mask.dst.protonum } };
-
- return ip_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
-}
-
-static inline int expect_matches(const struct ip_conntrack_expect *a,
- const struct ip_conntrack_expect *b)
-{
- return a->master == b->master
- && ip_ct_tuple_equal(&a->tuple, &b->tuple)
- && ip_ct_tuple_equal(&a->mask, &b->mask);
-}
-
-/* Generally a bad idea to call this: could have matched already. */
-void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
-{
- struct ip_conntrack_expect *i;
-
- write_lock_bh(&ip_conntrack_lock);
- /* choose the the oldest expectation to evict */
- list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
- if (expect_matches(i, exp) && del_timer(&i->timeout)) {
- ip_ct_unlink_expect(i);
- write_unlock_bh(&ip_conntrack_lock);
- ip_conntrack_expect_put(i);
- return;
- }
- }
- write_unlock_bh(&ip_conntrack_lock);
-}
-
-/* We don't increase the master conntrack refcount for non-fulfilled
- * conntracks. During the conntrack destruction, the expectations are
- * always killed before the conntrack itself */
-struct ip_conntrack_expect *ip_conntrack_expect_alloc(struct ip_conntrack *me)
-{
- struct ip_conntrack_expect *new;
-
- new = kmem_cache_alloc(ip_conntrack_expect_cachep, GFP_ATOMIC);
- if (!new) {
- DEBUGP("expect_related: OOM allocating expect\n");
- return NULL;
- }
- new->master = me;
- atomic_set(&new->use, 1);
- return new;
-}
-
-void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
-{
- if (atomic_dec_and_test(&exp->use))
- kmem_cache_free(ip_conntrack_expect_cachep, exp);
-}
-
-static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
-{
- atomic_inc(&exp->use);
- exp->master->expecting++;
- list_add(&exp->list, &ip_conntrack_expect_list);
-
- init_timer(&exp->timeout);
- exp->timeout.data = (unsigned long)exp;
- exp->timeout.function = expectation_timed_out;
- exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ;
- add_timer(&exp->timeout);
-
- exp->id = ++ip_conntrack_expect_next_id;
- atomic_inc(&exp->use);
- CONNTRACK_STAT_INC(expect_create);
-}
-
-/* Race with expectations being used means we could have none to find; OK. */
-static void evict_oldest_expect(struct ip_conntrack *master)
-{
- struct ip_conntrack_expect *i;
-
- list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
- if (i->master == master) {
- if (del_timer(&i->timeout)) {
- ip_ct_unlink_expect(i);
- ip_conntrack_expect_put(i);
- }
- break;
- }
- }
-}
-
-static inline int refresh_timer(struct ip_conntrack_expect *i)
-{
- if (!del_timer(&i->timeout))
- return 0;
-
- i->timeout.expires = jiffies + i->master->helper->timeout*HZ;
- add_timer(&i->timeout);
- return 1;
-}
-
-int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
-{
- struct ip_conntrack_expect *i;
- int ret;
-
- DEBUGP("ip_conntrack_expect_related %p\n", related_to);
- DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
- DEBUGP("mask: "); DUMP_TUPLE(&expect->mask);
-
- write_lock_bh(&ip_conntrack_lock);
- list_for_each_entry(i, &ip_conntrack_expect_list, list) {
- if (expect_matches(i, expect)) {
- /* Refresh timer: if it's dying, ignore.. */
- if (refresh_timer(i)) {
- ret = 0;
- goto out;
- }
- } else if (expect_clash(i, expect)) {
- ret = -EBUSY;
- goto out;
- }
- }
-
- /* Will be over limit? */
- if (expect->master->helper->max_expected &&
- expect->master->expecting >= expect->master->helper->max_expected)
- evict_oldest_expect(expect->master);
-
- ip_conntrack_expect_insert(expect);
- ip_conntrack_expect_event(IPEXP_NEW, expect);
- ret = 0;
-out:
- write_unlock_bh(&ip_conntrack_lock);
- return ret;
-}
-
-/* Alter reply tuple (maybe alter helper). This is for NAT, and is
- implicitly racy: see __ip_conntrack_confirm */
-void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
- const struct ip_conntrack_tuple *newreply)
-{
- write_lock_bh(&ip_conntrack_lock);
- /* Should be unconfirmed, so not in hash table yet */
- IP_NF_ASSERT(!is_confirmed(conntrack));
-
- DEBUGP("Altering reply tuple of %p to ", conntrack);
- DUMP_TUPLE(newreply);
-
- conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
- if (!conntrack->master && conntrack->expecting == 0)
- conntrack->helper = __ip_conntrack_helper_find(newreply);
- write_unlock_bh(&ip_conntrack_lock);
-}
-
-int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
-{
- BUG_ON(me->timeout == 0);
- write_lock_bh(&ip_conntrack_lock);
- list_add(&me->list, &helpers);
- write_unlock_bh(&ip_conntrack_lock);
-
- return 0;
-}
-
-struct ip_conntrack_helper *
-__ip_conntrack_helper_find_byname(const char *name)
-{
- struct ip_conntrack_helper *h;
-
- list_for_each_entry(h, &helpers, list) {
- if (!strcmp(h->name, name))
- return h;
- }
-
- return NULL;
-}
-
-static inline void unhelp(struct ip_conntrack_tuple_hash *i,
- const struct ip_conntrack_helper *me)
-{
- if (tuplehash_to_ctrack(i)->helper == me) {
- ip_conntrack_event(IPCT_HELPER, tuplehash_to_ctrack(i));
- tuplehash_to_ctrack(i)->helper = NULL;
- }
-}
-
-void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
-{
- unsigned int i;
- struct ip_conntrack_tuple_hash *h;
- struct ip_conntrack_expect *exp, *tmp;
-
- /* Need write lock here, to delete helper. */
- write_lock_bh(&ip_conntrack_lock);
- list_del(&me->list);
-
- /* Get rid of expectations */
- list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) {
- if (exp->master->helper == me && del_timer(&exp->timeout)) {
- ip_ct_unlink_expect(exp);
- ip_conntrack_expect_put(exp);
- }
- }
- /* Get rid of expecteds, set helpers to NULL. */
- list_for_each_entry(h, &unconfirmed, list)
- unhelp(h, me);
- for (i = 0; i < ip_conntrack_htable_size; i++) {
- list_for_each_entry(h, &ip_conntrack_hash[i], list)
- unhelp(h, me);
- }
- write_unlock_bh(&ip_conntrack_lock);
-
- /* Someone could be still looking at the helper in a bh. */
- synchronize_net();
-}
-
-/* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
-void __ip_ct_refresh_acct(struct ip_conntrack *ct,
- enum ip_conntrack_info ctinfo,
- const struct sk_buff *skb,
- unsigned long extra_jiffies,
- int do_acct)
-{
- int event = 0;
-
- IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
- IP_NF_ASSERT(skb);
-
- write_lock_bh(&ip_conntrack_lock);
-
- /* Only update if this is not a fixed timeout */
- if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) {
- write_unlock_bh(&ip_conntrack_lock);
- return;
- }
-
- /* If not in hash table, timer will not be active yet */
- if (!is_confirmed(ct)) {
- ct->timeout.expires = extra_jiffies;
- event = IPCT_REFRESH;
- } else {
- /* Need del_timer for race avoidance (may already be dying). */
- if (del_timer(&ct->timeout)) {
- ct->timeout.expires = jiffies + extra_jiffies;
- add_timer(&ct->timeout);
- event = IPCT_REFRESH;
- }
- }
-
-#ifdef CONFIG_IP_NF_CT_ACCT
- if (do_acct) {
- ct->counters[CTINFO2DIR(ctinfo)].packets++;
- ct->counters[CTINFO2DIR(ctinfo)].bytes +=
- ntohs(skb->nh.iph->tot_len);
- if ((ct->counters[CTINFO2DIR(ctinfo)].packets & 0x80000000)
- || (ct->counters[CTINFO2DIR(ctinfo)].bytes & 0x80000000))
- event |= IPCT_COUNTER_FILLING;
- }
-#endif
-
- write_unlock_bh(&ip_conntrack_lock);
-
- /* must be unlocked when calling event cache */
- if (event)
- ip_conntrack_event_cache(event, skb);
-}
-
-#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
- defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
-/* Generic function for tcp/udp/sctp/dccp and alike. This needs to be
- * in ip_conntrack_core, since we don't want the protocols to autoload
- * or depend on ctnetlink */
-int ip_ct_port_tuple_to_nfattr(struct sk_buff *skb,
- const struct ip_conntrack_tuple *tuple)
-{
- NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(__be16),
- &tuple->src.u.tcp.port);
- NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(__be16),
- &tuple->dst.u.tcp.port);
- return 0;
-
-nfattr_failure:
- return -1;
-}
-
-int ip_ct_port_nfattr_to_tuple(struct nfattr *tb[],
- struct ip_conntrack_tuple *t)
-{
- if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1])
- return -EINVAL;
-
- t->src.u.tcp.port =
- *(__be16 *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]);
- t->dst.u.tcp.port =
- *(__be16 *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]);
-
- return 0;
-}
-#endif
-
-/* Returns new sk_buff, or NULL */
-struct sk_buff *
-ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
-{
- skb_orphan(skb);
-
- local_bh_disable();
- skb = ip_defrag(skb, user);
- local_bh_enable();
-
- if (skb)
- ip_send_check(skb->nh.iph);
- return skb;
-}
-
-/* Used by ipt_REJECT. */
-static void ip_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
-{
- struct ip_conntrack *ct;
- enum ip_conntrack_info ctinfo;
-
- /* This ICMP is in reverse direction to the packet which caused it */
- ct = ip_conntrack_get(skb, &ctinfo);
-
- if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
- ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
- else
- ctinfo = IP_CT_RELATED;
-
- /* Attach to new skbuff, and increment count */
- nskb->nfct = &ct->ct_general;
- nskb->nfctinfo = ctinfo;
- nf_conntrack_get(nskb->nfct);
-}
-
-/* Bring out ya dead! */
-static struct ip_conntrack *
-get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
- void *data, unsigned int *bucket)
-{
- struct ip_conntrack_tuple_hash *h;
- struct ip_conntrack *ct;
-
- write_lock_bh(&ip_conntrack_lock);
- for (; *bucket < ip_conntrack_htable_size; (*bucket)++) {
- list_for_each_entry(h, &ip_conntrack_hash[*bucket], list) {
- ct = tuplehash_to_ctrack(h);
- if (iter(ct, data))
- goto found;
- }
- }
- list_for_each_entry(h, &unconfirmed, list) {
- ct = tuplehash_to_ctrack(h);
- if (iter(ct, data))
- goto found;
- }
- write_unlock_bh(&ip_conntrack_lock);
- return NULL;
-
-found:
- atomic_inc(&ct->ct_general.use);
- write_unlock_bh(&ip_conntrack_lock);
- return ct;
-}
-
-void
-ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data)
-{
- struct ip_conntrack *ct;
- unsigned int bucket = 0;
-
- while ((ct = get_next_corpse(iter, data, &bucket)) != NULL) {
- /* Time to push up daises... */
- if (del_timer(&ct->timeout))
- death_by_timeout((unsigned long)ct);
- /* ... else the timer will get him soon. */
-
- ip_conntrack_put(ct);
- }
-}
-
-/* Fast function for those who don't want to parse /proc (and I don't
- blame them). */
-/* Reversing the socket's dst/src point of view gives us the reply
- mapping. */
-static int
-getorigdst(struct sock *sk, int optval, void __user *user, int *len)
-{
- struct inet_sock *inet = inet_sk(sk);
- struct ip_conntrack_tuple_hash *h;
- struct ip_conntrack_tuple tuple;
-
- IP_CT_TUPLE_U_BLANK(&tuple);
- tuple.src.ip = inet->rcv_saddr;
- tuple.src.u.tcp.port = inet->sport;
- tuple.dst.ip = inet->daddr;
- tuple.dst.u.tcp.port = inet->dport;
- tuple.dst.protonum = IPPROTO_TCP;
-
- /* We only do TCP at the moment: is there a better way? */
- if (strcmp(sk->sk_prot->name, "TCP")) {
- DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
- return -ENOPROTOOPT;
- }
-
- if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
- DEBUGP("SO_ORIGINAL_DST: len %u not %u\n",
- *len, sizeof(struct sockaddr_in));
- return -EINVAL;
- }
-
- h = ip_conntrack_find_get(&tuple, NULL);
- if (h) {
- struct sockaddr_in sin;
- struct ip_conntrack *ct = tuplehash_to_ctrack(h);
-
- sin.sin_family = AF_INET;
- sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
- .tuple.dst.u.tcp.port;
- sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
- .tuple.dst.ip;
- memset(sin.sin_zero, 0, sizeof(sin.sin_zero));
-
- DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
- NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
- ip_conntrack_put(ct);
- if (copy_to_user(user, &sin, sizeof(sin)) != 0)
- return -EFAULT;
- else
- return 0;
- }
- DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
- NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
- NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
- return -ENOENT;
-}
-
-static struct nf_sockopt_ops so_getorigdst = {
- .pf = PF_INET,
- .get_optmin = SO_ORIGINAL_DST,
- .get_optmax = SO_ORIGINAL_DST+1,
- .get = &getorigdst,
-};
-
-static int kill_all(struct ip_conntrack *i, void *data)
-{
- return 1;
-}
-
-void ip_conntrack_flush(void)
-{
- ip_ct_iterate_cleanup(kill_all, NULL);
-}
-
-static void free_conntrack_hash(struct list_head *hash, int vmalloced,int size)
-{
- if (vmalloced)
- vfree(hash);
- else
- free_pages((unsigned long)hash,
- get_order(sizeof(struct list_head) * size));
-}
-
-/* Mishearing the voices in his head, our hero wonders how he's
- supposed to kill the mall. */
-void ip_conntrack_cleanup(void)
-{
- ip_ct_attach = NULL;
-
- /* This makes sure all current packets have passed through
- netfilter framework. Roll on, two-stage module
- delete... */
- synchronize_net();
-
- ip_ct_event_cache_flush();
- i_see_dead_people:
- ip_conntrack_flush();
- if (atomic_read(&ip_conntrack_count) != 0) {
- schedule();
- goto i_see_dead_people;
- }
- /* wait until all references to ip_conntrack_untracked are dropped */
- while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1)
- schedule();
-
- kmem_cache_destroy(ip_conntrack_cachep);
- kmem_cache_destroy(ip_conntrack_expect_cachep);
- free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc,
- ip_conntrack_htable_size);
- nf_unregister_sockopt(&so_getorigdst);
-}
-
-static struct list_head *alloc_hashtable(int size, int *vmalloced)
-{
- struct list_head *hash;
- unsigned int i;
-
- *vmalloced = 0;
- hash = (void*)__get_free_pages(GFP_KERNEL,
- get_order(sizeof(struct list_head)
- * size));
- if (!hash) {
- *vmalloced = 1;
- printk(KERN_WARNING"ip_conntrack: falling back to vmalloc.\n");
- hash = vmalloc(sizeof(struct list_head) * size);
- }
-
- if (hash)
- for (i = 0; i < size; i++)
- INIT_LIST_HEAD(&hash[i]);
-
- return hash;
-}
-
-static int set_hashsize(const char *val, struct kernel_param *kp)
-{
- int i, bucket, hashsize, vmalloced;
- int old_vmalloced, old_size;
- int rnd;
- struct list_head *hash, *old_hash;
- struct ip_conntrack_tuple_hash *h;
-
- /* On boot, we can set this without any fancy locking. */
- if (!ip_conntrack_htable_size)
- return param_set_int(val, kp);
-
- hashsize = simple_strtol(val, NULL, 0);
- if (!hashsize)
- return -EINVAL;
-
- hash = alloc_hashtable(hashsize, &vmalloced);
- if (!hash)
- return -ENOMEM;
-
- /* We have to rehash for the new table anyway, so we also can
- * use a new random seed */
- get_random_bytes(&rnd, 4);
-
- write_lock_bh(&ip_conntrack_lock);
- for (i = 0; i < ip_conntrack_htable_size; i++) {
- while (!list_empty(&ip_conntrack_hash[i])) {
- h = list_entry(ip_conntrack_hash[i].next,
- struct ip_conntrack_tuple_hash, list);
- list_del(&h->list);
- bucket = __hash_conntrack(&h->tuple, hashsize, rnd);
- list_add_tail(&h->list, &hash[bucket]);
- }
- }
- old_size = ip_conntrack_htable_size;
- old_vmalloced = ip_conntrack_vmalloc;
- old_hash = ip_conntrack_hash;
-
- ip_conntrack_htable_size = hashsize;
- ip_conntrack_vmalloc = vmalloced;
- ip_conntrack_hash = hash;
- ip_conntrack_hash_rnd = rnd;
- write_unlock_bh(&ip_conntrack_lock);
-
- free_conntrack_hash(old_hash, old_vmalloced, old_size);
- return 0;
-}
-
-module_param_call(hashsize, set_hashsize, param_get_uint,
- &ip_conntrack_htable_size, 0600);
-
-int __init ip_conntrack_init(void)
-{
- unsigned int i;
- int ret;
-
- /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB
- * machine has 256 buckets. >= 1GB machines have 8192 buckets. */
- if (!ip_conntrack_htable_size) {
- ip_conntrack_htable_size
- = (((num_physpages << PAGE_SHIFT) / 16384)
- / sizeof(struct list_head));
- if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
- ip_conntrack_htable_size = 8192;
- if (ip_conntrack_htable_size < 16)
- ip_conntrack_htable_size = 16;
- }
- ip_conntrack_max = 8 * ip_conntrack_htable_size;
-
- printk("ip_conntrack version %s (%u buckets, %d max)"
- " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION,
- ip_conntrack_htable_size, ip_conntrack_max,
- sizeof(struct ip_conntrack));
-
- ret = nf_register_sockopt(&so_getorigdst);
- if (ret != 0) {
- printk(KERN_ERR "Unable to register netfilter socket option\n");
- return ret;
- }
-
- ip_conntrack_hash = alloc_hashtable(ip_conntrack_htable_size,
- &ip_conntrack_vmalloc);
- if (!ip_conntrack_hash) {
- printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
- goto err_unreg_sockopt;
- }
-
- ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
- sizeof(struct ip_conntrack), 0,
- 0, NULL, NULL);
- if (!ip_conntrack_cachep) {
- printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
- goto err_free_hash;
- }
-
- ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect",
- sizeof(struct ip_conntrack_expect),
- 0, 0, NULL, NULL);
- if (!ip_conntrack_expect_cachep) {
- printk(KERN_ERR "Unable to create ip_expect slab cache\n");
- goto err_free_conntrack_slab;
- }
-
- /* Don't NEED lock here, but good form anyway. */
- write_lock_bh(&ip_conntrack_lock);
- for (i = 0; i < MAX_IP_CT_PROTO; i++)
- ip_ct_protos[i] = &ip_conntrack_generic_protocol;
- /* Sew in builtin protocols. */
- ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp;
- ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp;
- ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp;
- write_unlock_bh(&ip_conntrack_lock);
-
- /* For use by ipt_REJECT */
- ip_ct_attach = ip_conntrack_attach;
-
- /* Set up fake conntrack:
- - to never be deleted, not in any hashes */
- atomic_set(&ip_conntrack_untracked.ct_general.use, 1);
- /* - and look it like as a confirmed connection */
- set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status);
-
- return ret;
-
-err_free_conntrack_slab:
- kmem_cache_destroy(ip_conntrack_cachep);
-err_free_hash:
- free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc,
- ip_conntrack_htable_size);
-err_unreg_sockopt:
- nf_unregister_sockopt(&so_getorigdst);
-
- return -ENOMEM;
-}
diff --git a/net/ipv4/netfilter/ip_conntrack_ftp.c b/net/ipv4/netfilter/ip_conntrack_ftp.c
deleted file mode 100644
index 0410c99cacae..000000000000
--- a/net/ipv4/netfilter/ip_conntrack_ftp.c
+++ /dev/null
@@ -1,520 +0,0 @@
-/* FTP extension for IP connection tracking. */
-
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/module.h>
-#include <linux/netfilter.h>
-#include <linux/ip.h>
-#include <linux/ctype.h>
-#include <net/checksum.h>
-#include <net/tcp.h>
-
-#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
-#include <linux/netfilter_ipv4/ip_conntrack_ftp.h>
-#include <linux/moduleparam.h>
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
-MODULE_DESCRIPTION("ftp connection tracking helper");
-
-/* This is slow, but it's simple. --RR */
-static char *ftp_buffer;
-static DEFINE_SPINLOCK(ip_ftp_lock);
-
-#define MAX_PORTS 8
-static unsigned short ports[MAX_PORTS];
-static int ports_c;
-module_param_array(ports, ushort, &ports_c, 0400);
-
-static int loose;
-module_param(loose, bool, 0600);
-
-unsigned int (*ip_nat_ftp_hook)(struct sk_buff **pskb,
- enum ip_conntrack_info ctinfo,
- enum ip_ct_ftp_type type,
- unsigned int matchoff,
- unsigned int matchlen,
- struct ip_conntrack_expect *exp,
- u32 *seq);
-EXPORT_SYMBOL_GPL(ip_nat_ftp_hook);
-
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
-static int try_rfc959(const char *, size_t, u_int32_t [], char);
-static int try_eprt(const char *, size_t, u_int32_t [], char);
-static int try_epsv_response(const char *, size_t, u_int32_t [], char);
-
-static const struct ftp_search {
- const char *pattern;
- size_t plen;
- char skip;
- char term;
- enum ip_ct_ftp_type ftptype;
- int (*getnum)(const char *, size_t, u_int32_t[], char);
-} search[IP_CT_DIR_MAX][2] = {
- [IP_CT_DIR_ORIGINAL] = {
- {
- .pattern = "PORT",
- .plen = sizeof("PORT") - 1,
- .skip = ' ',
- .term = '\r',
- .ftptype = IP_CT_FTP_PORT,
- .getnum = try_rfc959,
- },
- {
- .pattern = "EPRT",
- .plen = sizeof("EPRT") - 1,
- .skip = ' ',
- .term = '\r',
- .ftptype = IP_CT_FTP_EPRT,
- .getnum = try_eprt,
- },
- },
- [IP_CT_DIR_REPLY] = {
- {
- .pattern = "227 ",
- .plen = sizeof("227 ") - 1,
- .skip = '(',
- .term = ')',
- .ftptype = IP_CT_FTP_PASV,
- .getnum = try_rfc959,
- },
- {
- .pattern = "229 ",
- .plen = sizeof("229 ") - 1,
- .skip = '(',
- .term = ')',
- .ftptype = IP_CT_FTP_EPSV,
- .getnum = try_epsv_response,
- },
- },
-};
-
-static int try_number(const char *data, size_t dlen, u_int32_t array[],
- int array_size, char sep, char term)
-{
- u_int32_t i, len;
-
- memset(array, 0, sizeof(array[0])*array_size);
-
- /* Keep data pointing at next char. */
- for (i = 0, len = 0; len < dlen && i < array_size; len++, data++) {
- if (*data >= '0' && *data <= '9') {
- array[i] = array[i]*10 + *data - '0';
- }
- else if (*data == sep)
- i++;
- else {
- /* Unexpected character; true if it's the
- terminator and we're finished. */
- if (*data == term && i == array_size - 1)
- return len;
-
- DEBUGP("Char %u (got %u nums) `%u' unexpected\n",
- len, i, *data);
- return 0;
- }
- }
- DEBUGP("Failed to fill %u numbers separated by %c\n", array_size, sep);
-
- return 0;
-}
-
-/* Returns 0, or length of numbers: 192,168,1,1,5,6 */
-static int try_rfc959(const char *data, size_t dlen, u_int32_t array[6],
- char term)
-{
- return try_number(data, dlen, array, 6, ',', term);
-}
-
-/* Grab port: number up to delimiter */
-static int get_port(const char *data, int start, size_t dlen, char delim,
- u_int32_t array[2])
-{
- u_int16_t port = 0;
- int i;
-
- for (i = start; i < dlen; i++) {
- /* Finished? */
- if (data[i] == delim) {
- if (port == 0)
- break;
- array[0] = port >> 8;
- array[1] = port;
- return i + 1;
- }
- else if (data[i] >= '0' && data[i] <= '9')
- port = port*10 + data[i] - '0';
- else /* Some other crap */
- break;
- }
- return 0;
-}
-
-/* Returns 0, or length of numbers: |1|132.235.1.2|6275| */
-static int try_eprt(const char *data, size_t dlen, u_int32_t array[6],
- char term)
-{
- char delim;
- int length;
-
- /* First character is delimiter, then "1" for IPv4, then
- delimiter again. */
- if (dlen <= 3) return 0;
- delim = data[0];
- if (isdigit(delim) || delim < 33 || delim > 126
- || data[1] != '1' || data[2] != delim)
- return 0;
-
- DEBUGP("EPRT: Got |1|!\n");
- /* Now we have IP address. */
- length = try_number(data + 3, dlen - 3, array, 4, '.', delim);
- if (length == 0)
- return 0;
-
- DEBUGP("EPRT: Got IP address!\n");
- /* Start offset includes initial "|1|", and trailing delimiter */
- return get_port(data, 3 + length + 1, dlen, delim, array+4);
-}
-
-/* Returns 0, or length of numbers: |||6446| */
-static int try_epsv_response(const char *data, size_t dlen, u_int32_t array[6],
- char term)
-{
- char delim;
-
- /* Three delimiters. */
- if (dlen <= 3) return 0;
- delim = data[0];
- if (isdigit(delim) || delim < 33 || delim > 126
- || data[1] != delim || data[2] != delim)
- return 0;
-
- return get_port(data, 3, dlen, delim, array+4);
-}
-
-/* Return 1 for match, 0 for accept, -1 for partial. */
-static int find_pattern(const char *data, size_t dlen,
- const char *pattern, size_t plen,
- char skip, char term,
- unsigned int *numoff,
- unsigned int *numlen,
- u_int32_t array[6],
- int (*getnum)(const char *, size_t, u_int32_t[], char))
-{
- size_t i;
-
- DEBUGP("find_pattern `%s': dlen = %u\n", pattern, dlen);
- if (dlen == 0)
- return 0;
-
- if (dlen <= plen) {
- /* Short packet: try for partial? */
- if (strnicmp(data, pattern, dlen) == 0)
- return -1;
- else return 0;
- }
-
- if (strnicmp(data, pattern, plen) != 0) {
-#if 0
- size_t i;
-
- DEBUGP("ftp: string mismatch\n");
- for (i = 0; i < plen; i++) {
- DEBUGP("ftp:char %u `%c'(%u) vs `%c'(%u)\n",
- i, data[i], data[i],
- pattern[i], pattern[i]);
- }
-#endif
- return 0;
- }
-
- DEBUGP("Pattern matches!\n");
- /* Now we've found the constant string, try to skip
- to the 'skip' character */
- for (i = plen; data[i] != skip; i++)
- if (i == dlen - 1) return -1;
-
- /* Skip over the last character */
- i++;
-
- DEBUGP("Skipped up to `%c'!\n", skip);
-
- *numoff = i;
- *numlen = getnum(data + i, dlen - i, array, term);
- if (!*numlen)
- return -1;
-
- DEBUGP("Match succeeded!\n");
- return 1;
-}
-
-/* Look up to see if we're just after a \n. */
-static int find_nl_seq(u32 seq, const struct ip_ct_ftp_master *info, int dir)
-{
- unsigned int i;
-
- for (i = 0; i < info->seq_aft_nl_num[dir]; i++)
- if (info->seq_aft_nl[dir][i] == seq)
- return 1;
- return 0;
-}
-
-/* We don't update if it's older than what we have. */
-static void update_nl_seq(u32 nl_seq, struct ip_ct_ftp_master *info, int dir,
- struct sk_buff *skb)
-{
- unsigned int i, oldest = NUM_SEQ_TO_REMEMBER;
-
- /* Look for oldest: if we find exact match, we're done. */
- for (i = 0; i < info->seq_aft_nl_num[dir]; i++) {
- if (info->seq_aft_nl[dir][i] == nl_seq)
- return;
-
- if (oldest == info->seq_aft_nl_num[dir]
- || before(info->seq_aft_nl[dir][i], oldest))
- oldest = i;
- }
-
- if (info->seq_aft_nl_num[dir] < NUM_SEQ_TO_REMEMBER) {
- info->seq_aft_nl[dir][info->seq_aft_nl_num[dir]++] = nl_seq;
- ip_conntrack_event_cache(IPCT_HELPINFO_VOLATILE, skb);
- } else if (oldest != NUM_SEQ_TO_REMEMBER) {
- info->seq_aft_nl[dir][oldest] = nl_seq;
- ip_conntrack_event_cache(IPCT_HELPINFO_VOLATILE, skb);
- }
-}
-
-static int help(struct sk_buff **pskb,
- struct ip_conntrack *ct,
- enum ip_conntrack_info ctinfo)
-{
- unsigned int dataoff, datalen;
- struct tcphdr _tcph, *th;
- char *fb_ptr;
- int ret;
- u32 seq, array[6] = { 0 };
- int dir = CTINFO2DIR(ctinfo);
- unsigned int matchlen, matchoff;
- struct ip_ct_ftp_master *ct_ftp_info = &ct->help.ct_ftp_info;
- struct ip_conntrack_expect *exp;
- unsigned int i;
- int found = 0, ends_in_nl;
- typeof(ip_nat_ftp_hook) ip_nat_ftp;
-
- /* Until there's been traffic both ways, don't look in packets. */
- if (ctinfo != IP_CT_ESTABLISHED
- && ctinfo != IP_CT_ESTABLISHED+IP_CT_IS_REPLY) {
- DEBUGP("ftp: Conntrackinfo = %u\n", ctinfo);
- return NF_ACCEPT;
- }
-
- th = skb_header_pointer(*pskb, (*pskb)->nh.iph->ihl*4,
- sizeof(_tcph), &_tcph);
- if (th == NULL)
- return NF_ACCEPT;
-
- dataoff = (*pskb)->nh.iph->ihl*4 + th->doff*4;
- /* No data? */
- if (dataoff >= (*pskb)->len) {
- DEBUGP("ftp: pskblen = %u\n", (*pskb)->len);
- return NF_ACCEPT;
- }
- datalen = (*pskb)->len - dataoff;
-
- spin_lock_bh(&ip_ftp_lock);
- fb_ptr = skb_header_pointer(*pskb, dataoff,
- (*pskb)->len - dataoff, ftp_buffer);
- BUG_ON(fb_ptr == NULL);
-
- ends_in_nl = (fb_ptr[datalen - 1] == '\n');
- seq = ntohl(th->seq) + datalen;
-
- /* Look up to see if we're just after a \n. */
- if (!find_nl_seq(ntohl(th->seq), ct_ftp_info, dir)) {
- /* Now if this ends in \n, update ftp info. */
- DEBUGP("ip_conntrack_ftp_help: wrong seq pos %s(%u) or %s(%u)\n",
- ct_ftp_info->seq_aft_nl[0][dir]
- old_seq_aft_nl_set ? "":"(UNSET) ", old_seq_aft_nl);
- ret = NF_ACCEPT;
- goto out_update_nl;
- }
-
- /* Initialize IP array to expected address (it's not mentioned
- in EPSV responses) */
- array[0] = (ntohl(ct->tuplehash[dir].tuple.src.ip) >> 24) & 0xFF;
- array[1] = (ntohl(ct->tuplehash[dir].tuple.src.ip) >> 16) & 0xFF;
- array[2] = (ntohl(ct->tuplehash[dir].tuple.src.ip) >> 8) & 0xFF;
- array[3] = ntohl(ct->tuplehash[dir].tuple.src.ip) & 0xFF;
-
- for (i = 0; i < ARRAY_SIZE(search[dir]); i++) {
- found = find_pattern(fb_ptr, (*pskb)->len - dataoff,
- search[dir][i].pattern,
- search[dir][i].plen,
- search[dir][i].skip,
- search[dir][i].term,
- &matchoff, &matchlen,
- array,
- search[dir][i].getnum);
- if (found) break;
- }
- if (found == -1) {
- /* We don't usually drop packets. After all, this is
- connection tracking, not packet filtering.
- However, it is necessary for accurate tracking in
- this case. */
- if (net_ratelimit())
- printk("conntrack_ftp: partial %s %u+%u\n",
- search[dir][i].pattern,
- ntohl(th->seq), datalen);
- ret = NF_DROP;
- goto out;
- } else if (found == 0) { /* No match */
- ret = NF_ACCEPT;
- goto out_update_nl;
- }
-
- DEBUGP("conntrack_ftp: match `%s' (%u bytes at %u)\n",
- fb_ptr + matchoff, matchlen, ntohl(th->seq) + matchoff);
-
- /* Allocate expectation which will be inserted */
- exp = ip_conntrack_expect_alloc(ct);
- if (exp == NULL) {
- ret = NF_DROP;
- goto out;
- }
-
- /* We refer to the reverse direction ("!dir") tuples here,
- * because we're expecting something in the other direction.
- * Doesn't matter unless NAT is happening. */
- exp->tuple.dst.ip = ct->tuplehash[!dir].tuple.dst.ip;
-
- if (htonl((array[0] << 24) | (array[1] << 16) | (array[2] << 8) | array[3])
- != ct->tuplehash[dir].tuple.src.ip) {
- /* Enrico Scholz's passive FTP to partially RNAT'd ftp
- server: it really wants us to connect to a
- different IP address. Simply don't record it for
- NAT. */
- DEBUGP("conntrack_ftp: NOT RECORDING: %u,%u,%u,%u != %u.%u.%u.%u\n",
- array[0], array[1], array[2], array[3],
- NIPQUAD(ct->tuplehash[dir].tuple.src.ip));
-
- /* Thanks to Cristiano Lincoln Mattos
- <lincoln@cesar.org.br> for reporting this potential
- problem (DMZ machines opening holes to internal
- networks, or the packet filter itself). */
- if (!loose) {
- ret = NF_ACCEPT;
- goto out_put_expect;
- }
- exp->tuple.dst.ip = htonl((array[0] << 24) | (array[1] << 16)
- | (array[2] << 8) | array[3]);
- }
-
- exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip;
- exp->tuple.dst.u.tcp.port = htons(array[4] << 8 | array[5]);
- exp->tuple.src.u.tcp.port = 0; /* Don't care. */
- exp->tuple.dst.protonum = IPPROTO_TCP;
- exp->mask = ((struct ip_conntrack_tuple)
- { { htonl(0xFFFFFFFF), { 0 } },
- { htonl(0xFFFFFFFF), { .tcp = { htons(0xFFFF) } }, 0xFF }});
-
- exp->expectfn = NULL;
- exp->flags = 0;
-
- /* Now, NAT might want to mangle the packet, and register the
- * (possibly changed) expectation itself. */
- ip_nat_ftp = rcu_dereference(ip_nat_ftp_hook);
- if (ip_nat_ftp)
- ret = ip_nat_ftp(pskb, ctinfo, search[dir][i].ftptype,
- matchoff, matchlen, exp, &seq);
- else {
- /* Can't expect this? Best to drop packet now. */
- if (ip_conntrack_expect_related(exp) != 0)
- ret = NF_DROP;
- else
- ret = NF_ACCEPT;
- }
-
-out_put_expect:
- ip_conntrack_expect_put(exp);
-
-out_update_nl:
- /* Now if this ends in \n, update ftp info. Seq may have been
- * adjusted by NAT code. */
- if (ends_in_nl)
- update_nl_seq(seq, ct_ftp_info,dir, *pskb);
- out:
- spin_unlock_bh(&ip_ftp_lock);
- return ret;
-}
-
-static struct ip_conntrack_helper ftp[MAX_PORTS];
-static char ftp_names[MAX_PORTS][sizeof("ftp-65535")];
-
-/* Not __exit: called from init() */
-static void ip_conntrack_ftp_fini(void)
-{
- int i;
- for (i = 0; i < ports_c; i++) {
- DEBUGP("ip_ct_ftp: unregistering helper for port %d\n",
- ports[i]);
- ip_conntrack_helper_unregister(&ftp[i]);
- }
-
- kfree(ftp_buffer);
-}
-
-static int __init ip_conntrack_ftp_init(void)
-{
- int i, ret;
- char *tmpname;
-
- ftp_buffer = kmalloc(65536, GFP_KERNEL);
- if (!ftp_buffer)
- return -ENOMEM;
-
- if (ports_c == 0)
- ports[ports_c++] = FTP_PORT;
-
- for (i = 0; i < ports_c; i++) {
- ftp[i].tuple.src.u.tcp.port = htons(ports[i]);
- ftp[i].tuple.dst.protonum = IPPROTO_TCP;
- ftp[i].mask.src.u.tcp.port = htons(0xFFFF);
- ftp[i].mask.dst.protonum = 0xFF;
- ftp[i].max_expected = 1;
- ftp[i].timeout = 5 * 60; /* 5 minutes */
- ftp[i].me = THIS_MODULE;
- ftp[i].help = help;
-
- tmpname = &ftp_names[i][0];
- if (ports[i] == FTP_PORT)
- sprintf(tmpname, "ftp");
- else
- sprintf(tmpname, "ftp-%d", ports[i]);
- ftp[i].name = tmpname;
-
- DEBUGP("ip_ct_ftp: registering helper for port %d\n",
- ports[i]);
- ret = ip_conntrack_helper_register(&ftp[i]);
-
- if (ret) {
- ip_conntrack_ftp_fini();
- return ret;
- }
- }
- return 0;
-}
-
-module_init(ip_conntrack_ftp_init);
-module_exit(ip_conntrack_ftp_fini);
diff --git a/net/ipv4/netfilter/ip_conntrack_helper_h323.c b/net/ipv4/netfilter/ip_conntrack_helper_h323.c
deleted file mode 100644
index aabfe1c06905..000000000000
--- a/net/ipv4/netfilter/ip_conntrack_helper_h323.c
+++ /dev/null
@@ -1,1841 +0,0 @@
-/*
- * H.323 connection tracking helper
- *
- * Copyright (c) 2006 Jing Min Zhao <zhaojingmin@users.sourceforge.net>
- *
- * This source code is licensed under General Public License version 2.
- *
- * Based on the 'brute force' H.323 connection tracking module by
- * Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
- *
- * For more information, please see http://nath323.sourceforge.net/
- */
-
-#include <linux/module.h>
-#include <linux/netfilter.h>
-#include <linux/ip.h>
-#include <net/tcp.h>
-#include <linux/netfilter_ipv4/ip_conntrack.h>
-#include <linux/netfilter_ipv4/ip_conntrack_core.h>
-#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
-#include <linux/netfilter_ipv4/ip_conntrack_tuple.h>
-#include <linux/netfilter_ipv4/ip_conntrack_h323.h>
-#include <linux/moduleparam.h>
-#include <linux/ctype.h>
-#include <linux/inet.h>
-
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
-/* Parameters */
-static unsigned int default_rrq_ttl = 300;
-module_param(default_rrq_ttl, uint, 0600);
-MODULE_PARM_DESC(default_rrq_ttl, "use this TTL if it's missing in RRQ");
-
-static int gkrouted_only = 1;
-module_param(gkrouted_only, int, 0600);
-MODULE_PARM_DESC(gkrouted_only, "only accept calls from gatekeeper");
-
-static int callforward_filter = 1;
-module_param(callforward_filter, bool, 0600);
-MODULE_PARM_DESC(callforward_filter, "only create call forwarding expectations "
- "if both endpoints are on different sides "
- "(determined by routing information)");
-
-/* Hooks for NAT */
-int (*set_h245_addr_hook) (struct sk_buff ** pskb,
- unsigned char **data, int dataoff,
- H245_TransportAddress * addr,
- __be32 ip, u_int16_t port);
-int (*set_h225_addr_hook) (struct sk_buff ** pskb,
- unsigned char **data, int dataoff,
- TransportAddress * addr,
- __be32 ip, u_int16_t port);
-int (*set_sig_addr_hook) (struct sk_buff ** pskb,
- struct ip_conntrack * ct,
- enum ip_conntrack_info ctinfo,
- unsigned char **data,
- TransportAddress * addr, int count);
-int (*set_ras_addr_hook) (struct sk_buff ** pskb,
- struct ip_conntrack * ct,
- enum ip_conntrack_info ctinfo,
- unsigned char **data,
- TransportAddress * addr, int count);
-int (*nat_rtp_rtcp_hook) (struct sk_buff ** pskb,
- struct ip_conntrack * ct,
- enum ip_conntrack_info ctinfo,
- unsigned char **data, int dataoff,
- H245_TransportAddress * addr,
- u_int16_t port, u_int16_t rtp_port,
- struct ip_conntrack_expect * rtp_exp,
- struct ip_conntrack_expect * rtcp_exp);
-int (*nat_t120_hook) (struct sk_buff ** pskb,
- struct ip_conntrack * ct,
- enum ip_conntrack_info ctinfo,
- unsigned char **data, int dataoff,
- H245_TransportAddress * addr, u_int16_t port,
- struct ip_conntrack_expect * exp);
-int (*nat_h245_hook) (struct sk_buff ** pskb,
- struct ip_conntrack * ct,
- enum ip_conntrack_info ctinfo,
- unsigned char **data, int dataoff,
- TransportAddress * addr, u_int16_t port,
- struct ip_conntrack_expect * exp);
-int (*nat_callforwarding_hook) (struct sk_buff ** pskb,
- struct ip_conntrack * ct,
- enum ip_conntrack_info ctinfo,
- unsigned char **data, int dataoff,
- TransportAddress * addr, u_int16_t port,
- struct ip_conntrack_expect * exp);
-int (*nat_q931_hook) (struct sk_buff ** pskb,
- struct ip_conntrack * ct,
- enum ip_conntrack_info ctinfo,
- unsigned char **data, TransportAddress * addr, int idx,
- u_int16_t port, struct ip_conntrack_expect * exp);
-
-
-static DEFINE_SPINLOCK(ip_h323_lock);
-static char *h323_buffer;
-
-/****************************************************************************/
-static int get_tpkt_data(struct sk_buff **pskb, struct ip_conntrack *ct,
- enum ip_conntrack_info ctinfo,
- unsigned char **data, int *datalen, int *dataoff)
-{
- struct ip_ct_h323_master *info = &ct->help.ct_h323_info;
- int dir = CTINFO2DIR(ctinfo);
- struct tcphdr _tcph, *th;
- int tcpdatalen;
- int tcpdataoff;
- unsigned char *tpkt;
- int tpktlen;
- int tpktoff;
-
- /* Get TCP header */
- th = skb_header_pointer(*pskb, (*pskb)->nh.iph->ihl * 4,
- sizeof(_tcph), &_tcph);
- if (th == NULL)
- return 0;
-
- /* Get TCP data offset */
- tcpdataoff = (*pskb)->nh.iph->ihl * 4 + th->doff * 4;
-
- /* Get TCP data length */
- tcpdatalen = (*pskb)->len - tcpdataoff;
- if (tcpdatalen <= 0) /* No TCP data */
- goto clear_out;
-
- if (*data == NULL) { /* first TPKT */
- /* Get first TPKT pointer */
- tpkt = skb_header_pointer(*pskb, tcpdataoff, tcpdatalen,
- h323_buffer);
- BUG_ON(tpkt == NULL);
-
- /* Validate TPKT identifier */
- if (tcpdatalen < 4 || tpkt[0] != 0x03 || tpkt[1] != 0) {
- /* Netmeeting sends TPKT header and data separately */
- if (info->tpkt_len[dir] > 0) {
- DEBUGP("ip_ct_h323: previous packet "
- "indicated separate TPKT data of %hu "
- "bytes\n", info->tpkt_len[dir]);
- if (info->tpkt_len[dir] <= tcpdatalen) {
- /* Yes, there was a TPKT header
- * received */
- *data = tpkt;
- *datalen = info->tpkt_len[dir];
- *dataoff = 0;
- goto out;
- }
-
- /* Fragmented TPKT */
- if (net_ratelimit())
- printk("ip_ct_h323: "
- "fragmented TPKT\n");
- goto clear_out;
- }
-
- /* It is not even a TPKT */
- return 0;
- }
- tpktoff = 0;
- } else { /* Next TPKT */
- tpktoff = *dataoff + *datalen;
- tcpdatalen -= tpktoff;
- if (tcpdatalen <= 4) /* No more TPKT */
- goto clear_out;
- tpkt = *data + *datalen;
-
- /* Validate TPKT identifier */
- if (tpkt[0] != 0x03 || tpkt[1] != 0)
- goto clear_out;
- }
-
- /* Validate TPKT length */
- tpktlen = tpkt[2] * 256 + tpkt[3];
- if (tpktlen < 4)
- goto clear_out;
- if (tpktlen > tcpdatalen) {
- if (tcpdatalen == 4) { /* Separate TPKT header */
- /* Netmeeting sends TPKT header and data separately */
- DEBUGP("ip_ct_h323: separate TPKT header indicates "
- "there will be TPKT data of %hu bytes\n",
- tpktlen - 4);
- info->tpkt_len[dir] = tpktlen - 4;
- return 0;
- }
-
- if (net_ratelimit())
- printk("ip_ct_h323: incomplete TPKT (fragmented?)\n");
- goto clear_out;
- }
-
- /* This is the encapsulated data */
- *data = tpkt + 4;
- *datalen = tpktlen - 4;
- *dataoff = tpktoff + 4;
-
- out:
- /* Clear TPKT length */
- info->tpkt_len[dir] = 0;
- return 1;
-
- clear_out:
- info->tpkt_len[dir] = 0;
- return 0;
-}
-
-/****************************************************************************/
-static int get_h245_addr(unsigned char *data, H245_TransportAddress * addr,
- __be32 * ip, u_int16_t * port)
-{
- unsigned char *p;
-
- if (addr->choice != eH245_TransportAddress_unicastAddress ||
- addr->unicastAddress.choice != eUnicastAddress_iPAddress)
- return 0;
-
- p = data + addr->unicastAddress.iPAddress.network;
- *ip = htonl((p[0] << 24) | (p[1] << 16) | (p[2] << 8) | (p[3]));
- *port = (p[4] << 8) | (p[5]);
-
- return 1;
-}
-
-/****************************************************************************/
-static int expect_rtp_rtcp(struct sk_buff **pskb, struct ip_conntrack *ct,
- enum ip_conntrack_info ctinfo,
- unsigned char **data, int dataoff,
- H245_TransportAddress * addr)
-{
- int dir = CTINFO2DIR(ctinfo);
- int ret = 0;
- __be32 ip;
- u_int16_t port;
- u_int16_t rtp_port;
- struct ip_conntrack_expect *rtp_exp;
- struct ip_conntrack_expect *rtcp_exp;
- typeof(nat_rtp_rtcp_hook) nat_rtp_rtcp;
-
- /* Read RTP or RTCP address */
- if (!get_h245_addr(*data, addr, &ip, &port) ||
- ip != ct->tuplehash[dir].tuple.src.ip || port == 0)
- return 0;
-
- /* RTP port is even */
- rtp_port = port & (~1);
-
- /* Create expect for RTP */
- if ((rtp_exp = ip_conntrack_expect_alloc(ct)) == NULL)
- return -1;
- rtp_exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip;
- rtp_exp->tuple.src.u.udp.port = 0;
- rtp_exp->tuple.dst.ip = ct->tuplehash[!dir].tuple.dst.ip;
- rtp_exp->tuple.dst.u.udp.port = htons(rtp_port);
- rtp_exp->tuple.dst.protonum = IPPROTO_UDP;
- rtp_exp->mask.src.ip = htonl(0xFFFFFFFF);
- rtp_exp->mask.src.u.udp.port = 0;
- rtp_exp->mask.dst.ip = htonl(0xFFFFFFFF);
- rtp_exp->mask.dst.u.udp.port = htons(0xFFFF);
- rtp_exp->mask.dst.protonum = 0xFF;
- rtp_exp->flags = 0;
-
- /* Create expect for RTCP */
- if ((rtcp_exp = ip_conntrack_expect_alloc(ct)) == NULL) {
- ip_conntrack_expect_put(rtp_exp);
- return -1;
- }
- rtcp_exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip;
- rtcp_exp->tuple.src.u.udp.port = 0;
- rtcp_exp->tuple.dst.ip = ct->tuplehash[!dir].tuple.dst.ip;
- rtcp_exp->tuple.dst.u.udp.port = htons(rtp_port + 1);
- rtcp_exp->tuple.dst.protonum = IPPROTO_UDP;
- rtcp_exp->mask.src.ip = htonl(0xFFFFFFFF);
- rtcp_exp->mask.src.u.udp.port = 0;
- rtcp_exp->mask.dst.ip = htonl(0xFFFFFFFF);
- rtcp_exp->mask.dst.u.udp.port = htons(0xFFFF);
- rtcp_exp->mask.dst.protonum = 0xFF;
- rtcp_exp->flags = 0;
-
- if (ct->tuplehash[dir].tuple.src.ip !=
- ct->tuplehash[!dir].tuple.dst.ip &&
- (nat_rtp_rtcp = rcu_dereference(nat_rtp_rtcp_hook))) {
- /* NAT needed */
- ret = nat_rtp_rtcp(pskb, ct, ctinfo, data, dataoff,
- addr, port, rtp_port, rtp_exp, rtcp_exp);
- } else { /* Conntrack only */
- rtp_exp->expectfn = NULL;
- rtcp_exp->expectfn = NULL;
-
- if (ip_conntrack_expect_related(rtp_exp) == 0) {
- if (ip_conntrack_expect_related(rtcp_exp) == 0) {
- DEBUGP("ip_ct_h323: expect RTP "
- "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
- NIPQUAD(rtp_exp->tuple.src.ip),
- ntohs(rtp_exp->tuple.src.u.udp.port),
- NIPQUAD(rtp_exp->tuple.dst.ip),
- ntohs(rtp_exp->tuple.dst.u.udp.port));
- DEBUGP("ip_ct_h323: expect RTCP "
- "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
- NIPQUAD(rtcp_exp->tuple.src.ip),
- ntohs(rtcp_exp->tuple.src.u.udp.port),
- NIPQUAD(rtcp_exp->tuple.dst.ip),
- ntohs(rtcp_exp->tuple.dst.u.udp.port));
- } else {
- ip_conntrack_unexpect_related(rtp_exp);
- ret = -1;
- }
- } else
- ret = -1;
- }
-
- ip_conntrack_expect_put(rtp_exp);
- ip_conntrack_expect_put(rtcp_exp);
-
- return ret;
-}
-
-/****************************************************************************/
-static int expect_t120(struct sk_buff **pskb,
- struct ip_conntrack *ct,
- enum ip_conntrack_info ctinfo,
- unsigned char **data, int dataoff,
- H245_TransportAddress * addr)
-{
- int dir = CTINFO2DIR(ctinfo);
- int ret = 0;
- __be32 ip;
- u_int16_t port;
- struct ip_conntrack_expect *exp = NULL;
- typeof(nat_t120_hook) nat_t120;
-
- /* Read T.120 address */
- if (!get_h245_addr(*data, addr, &ip, &port) ||
- ip != ct->tuplehash[dir].tuple.src.ip || port == 0)
- return 0;
-
- /* Create expect for T.120 connections */
- if ((exp = ip_conntrack_expect_alloc(ct)) == NULL)
- return -1;
- exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip;
- exp->tuple.src.u.tcp.port = 0;
- exp->tuple.dst.ip = ct->tuplehash[!dir].tuple.dst.ip;
- exp->tuple.dst.u.tcp.port = htons(port);
- exp->tuple.dst.protonum = IPPROTO_TCP;
- exp->mask.src.ip = htonl(0xFFFFFFFF);
- exp->mask.src.u.tcp.port = 0;
- exp->mask.dst.ip = htonl(0xFFFFFFFF);
- exp->mask.dst.u.tcp.port = htons(0xFFFF);
- exp->mask.dst.protonum = 0xFF;
- exp->flags = IP_CT_EXPECT_PERMANENT; /* Accept multiple channels */
-
- if (ct->tuplehash[dir].tuple.src.ip !=
- ct->tuplehash[!dir].tuple.dst.ip &&
- (nat_t120 = rcu_dereference(nat_t120_hook))) {
- /* NAT needed */
- ret = nat_t120(pskb, ct, ctinfo, data, dataoff, addr,
- port, exp);
- } else { /* Conntrack only */
- exp->expectfn = NULL;
- if (ip_conntrack_expect_related(exp) == 0) {
- DEBUGP("ip_ct_h323: expect T.120 "
- "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
- NIPQUAD(exp->tuple.src.ip),
- ntohs(exp->tuple.src.u.tcp.port),
- NIPQUAD(exp->tuple.dst.ip),
- ntohs(exp->tuple.dst.u.tcp.port));
- } else
- ret = -1;
- }
-
- ip_conntrack_expect_put(exp);
-
- return ret;
-}
-
-/****************************************************************************/
-static int process_h245_channel(struct sk_buff **pskb,
- struct ip_conntrack *ct,
- enum ip_conntrack_info ctinfo,
- unsigned char **data, int dataoff,
- H2250LogicalChannelParameters * channel)
-{
- int ret;
-
- if (channel->options & eH2250LogicalChannelParameters_mediaChannel) {
- /* RTP */
- ret = expect_rtp_rtcp(pskb, ct, ctinfo, data, dataoff,
- &channel->mediaChannel);
- if (ret < 0)
- return -1;
- }
-
- if (channel->
- options & eH2250LogicalChannelParameters_mediaControlChannel) {
- /* RTCP */
- ret = expect_rtp_rtcp(pskb, ct, ctinfo, data, dataoff,
- &channel->mediaControlChannel);
- if (ret < 0)
- return -1;
- }
-
- return 0;
-}
-
-/****************************************************************************/
-static int process_olc(struct sk_buff **pskb, struct ip_conntrack *ct,
- enum ip_conntrack_info ctinfo,
- unsigned char **data, int dataoff,
- OpenLogicalChannel * olc)
-{
- int ret;
-
- DEBUGP("ip_ct_h323: OpenLogicalChannel\n");
-
- if (olc->forwardLogicalChannelParameters.multiplexParameters.choice ==
- eOpenLogicalChannel_forwardLogicalChannelParameters_multiplexParameters_h2250LogicalChannelParameters)
- {
- ret = process_h245_channel(pskb, ct, ctinfo, data, dataoff,
- &olc->
- forwardLogicalChannelParameters.
- multiplexParameters.
- h2250LogicalChannelParameters);
- if (ret < 0)
- return -1;
- }
-
- if ((olc->options &
- eOpenLogicalChannel_reverseLogicalChannelParameters) &&
- (olc->reverseLogicalChannelParameters.options &
- eOpenLogicalChannel_reverseLogicalChannelParameters_multiplexParameters)
- && (olc->reverseLogicalChannelParameters.multiplexParameters.
- choice ==
- eOpenLogicalChannel_reverseLogicalChannelParameters_multiplexParameters_h2250LogicalChannelParameters))
- {
- ret =
- process_h245_channel(pskb, ct, ctinfo, data, dataoff,
- &olc->
- reverseLogicalChannelParameters.
- multiplexParameters.
- h2250LogicalChannelParameters);
- if (ret < 0)
- return -1;
- }
-
- if ((olc->options & eOpenLogicalChannel_separateStack) &&
- olc->forwardLogicalChannelParameters.dataType.choice ==
- eDataType_data &&
- olc->forwardLogicalChannelParameters.dataType.data.application.
- choice == eDataApplicationCapability_application_t120 &&
- olc->forwardLogicalChannelParameters.dataType.data.application.
- t120.choice == eDataProtocolCapability_separateLANStack &&
- olc->separateStack.networkAddress.choice ==
- eNetworkAccessParameters_networkAddress_localAreaAddress) {
- ret = expect_t120(pskb, ct, ctinfo, data, dataoff,
- &olc->separateStack.networkAddress.
- localAreaAddress);
- if (ret < 0)
- return -1;
- }
-
- return 0;
-}
-
-/****************************************************************************/
-static int process_olca(struct sk_buff **pskb, struct ip_conntrack *ct,
- enum ip_conntrack_info ctinfo,
- unsigned char **data, int dataoff,
- OpenLogicalChannelAck * olca)
-{
- H2250LogicalChannelAckParameters *ack;
- int ret;
-
- DEBUGP("ip_ct_h323: OpenLogicalChannelAck\n");
-
- if ((olca->options &
- eOpenLogicalChannelAck_reverseLogicalChannelParameters) &&
- (olca->reverseLogicalChannelParameters.options &
- eOpenLogicalChannelAck_reverseLogicalChannelParameters_multiplexParameters)
- && (olca->reverseLogicalChannelParameters.multiplexParameters.
- choice ==
- eOpenLogicalChannelAck_reverseLogicalChannelParameters_multiplexParameters_h2250LogicalChannelParameters))
- {
- ret = process_h245_channel(pskb, ct, ctinfo, data, dataoff,
- &olca->
- reverseLogicalChannelParameters.
- multiplexParameters.
- h2250LogicalChannelParameters);
- if (ret < 0)
- return -1;
- }
-
- if ((olca->options &
- eOpenLogicalChannelAck_forwardMultiplexAckParameters) &&
- (olca->forwardMultiplexAckParameters.choice ==
- eOpenLogicalChannelAck_forwardMultiplexAckParameters_h2250LogicalChannelAckParameters))
- {
- ack = &olca->forwardMultiplexAckParameters.
- h2250LogicalChannelAckParameters;
- if (ack->options &
- eH2250LogicalChannelAckParameters_mediaChannel) {
- /* RTP */
- ret = expect_rtp_rtcp(pskb, ct, ctinfo, data, dataoff,
- &ack->mediaChannel);
- if (ret < 0)
- return -1;
- }
-
- if (ack->options &
- eH2250LogicalChannelAckParameters_mediaControlChannel) {
- /* RTCP */
- ret = expect_rtp_rtcp(pskb, ct, ctinfo, data, dataoff,
- &ack->mediaControlChannel);
- if (ret < 0)
- return -1;
- }
- }
-
- return 0;
-}
-
-/****************************************************************************/
-static int process_h245(struct sk_buff **pskb, struct ip_conntrack *ct,
- enum ip_conntrack_info ctinfo,
- unsigned char **data, int dataoff,
- MultimediaSystemControlMessage * mscm)
-{
- switch (mscm->choice) {
- case eMultimediaSystemControlMessage_request:
- if (mscm->request.choice ==
- eRequestMessage_openLogicalChannel) {
- return process_olc(pskb, ct, ctinfo, data, dataoff,
- &mscm->request.openLogicalChannel);
- }
- DEBUGP("ip_ct_h323: H.245 Request %d\n",
- mscm->request.choice);
- break;
- case eMultimediaSystemControlMessage_response:
- if (mscm->response.choice ==
- eResponseMessage_openLogicalChannelAck) {
- return process_olca(pskb, ct, ctinfo, data, dataoff,
- &mscm->response.
- openLogicalChannelAck);
- }
- DEBUGP("ip_ct_h323: H.245 Response %d\n",
- mscm->response.choice);
- break;
- default:
- DEBUGP("ip_ct_h323: H.245 signal %d\n", mscm->choice);
- break;
- }
-
- return 0;
-}
-
-/****************************************************************************/
-static int h245_help(struct sk_buff **pskb, struct ip_conntrack *ct,
- enum ip_conntrack_info ctinfo)
-{
- static MultimediaSystemControlMessage mscm;
- unsigned char *data = NULL;
- int datalen;
- int dataoff;
- int ret;
-
- /* Until there's been traffic both ways, don't look in packets. */
- if (ctinfo != IP_CT_ESTABLISHED
- && ctinfo != IP_CT_ESTABLISHED + IP_CT_IS_REPLY) {
- return NF_ACCEPT;
- }
- DEBUGP("ip_ct_h245: skblen = %u\n", (*pskb)->len);
-
- spin_lock_bh(&ip_h323_lock);
-
- /* Process each TPKT */
- while (get_tpkt_data(pskb, ct, ctinfo, &data, &datalen, &dataoff)) {
- DEBUGP("ip_ct_h245: TPKT %u.%u.%u.%u->%u.%u.%u.%u, len=%d\n",
- NIPQUAD((*pskb)->nh.iph->saddr),
- NIPQUAD((*pskb)->nh.iph->daddr), datalen);
-
- /* Decode H.245 signal */
- ret = DecodeMultimediaSystemControlMessage(data, datalen,
- &mscm);
- if (ret < 0) {
- if (net_ratelimit())
- printk("ip_ct_h245: decoding error: %s\n",
- ret == H323_ERROR_BOUND ?
- "out of bound" : "out of range");
- /* We don't drop when decoding error */
- break;
- }
-
- /* Process H.245 signal */
- if (process_h245(pskb, ct, ctinfo, &data, dataoff, &mscm) < 0)
- goto drop;
- }
-
- spin_unlock_bh(&ip_h323_lock);
- return NF_ACCEPT;
-
- drop:
- spin_unlock_bh(&ip_h323_lock);
- if (net_ratelimit())
- printk("ip_ct_h245: packet dropped\n");
- return NF_DROP;
-}
-
-/****************************************************************************/
-static struct ip_conntrack_helper ip_conntrack_helper_h245 = {
- .name = "H.245",
- .me = THIS_MODULE,
- .max_expected = H323_RTP_CHANNEL_MAX * 4 + 2 /* T.120 */ ,
- .timeout = 240,
- .tuple = {.dst = {.protonum = IPPROTO_TCP}},
- .mask = {.src = {.u = {0xFFFF}},
- .dst = {.protonum = 0xFF}},
- .help = h245_help
-};
-
-/****************************************************************************/
-void ip_conntrack_h245_expect(struct ip_conntrack *new,
- struct ip_conntrack_expect *this)
-{
- write_lock_bh(&ip_conntrack_lock);
- new->helper = &ip_conntrack_helper_h245;
- write_unlock_bh(&ip_conntrack_lock);
-}
-
-/****************************************************************************/
-int get_h225_addr(unsigned char *data, TransportAddress * addr,
- __be32 * ip, u_int16_t * port)
-{
- unsigned char *p;
-
- if (addr->choice != eTransportAddress_ipAddress)
- return 0;
-
- p = data + addr->ipAddress.ip;
- *ip = htonl((p[0] << 24) | (p[1] << 16) | (p[2] << 8) | (p[3]));
- *port = (p[4] << 8) | (p[5]);
-
- return 1;
-}
-
-/****************************************************************************/
-static int expect_h245(struct sk_buff **pskb, struct ip_conntrack *ct,
- enum ip_conntrack_info ctinfo,
- unsigned char **data, int dataoff,
- TransportAddress * addr)
-{
- int dir = CTINFO2DIR(ctinfo);
- int ret = 0;
- __be32 ip;
- u_int16_t port;
- struct ip_conntrack_expect *exp = NULL;
- typeof(nat_h245_hook) nat_h245;
-
- /* Read h245Address */
- if (!get_h225_addr(*data, addr, &ip, &port) ||
- ip != ct->tuplehash[dir].tuple.src.ip || port == 0)
- return 0;
-
- /* Create expect for h245 connection */
- if ((exp = ip_conntrack_expect_alloc(ct)) == NULL)
- return -1;
- exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip;
- exp->tuple.src.u.tcp.port = 0;
- exp->tuple.dst.ip = ct->tuplehash[!dir].tuple.dst.ip;
- exp->tuple.dst.u.tcp.port = htons(port);
- exp->tuple.dst.protonum = IPPROTO_TCP;
- exp->mask.src.ip = htonl(0xFFFFFFFF);
- exp->mask.src.u.tcp.port = 0;
- exp->mask.dst.ip = htonl(0xFFFFFFFF);
- exp->mask.dst.u.tcp.port = htons(0xFFFF);
- exp->mask.dst.protonum = 0xFF;
- exp->flags = 0;
-
- if (ct->tuplehash[dir].tuple.src.ip !=
- ct->tuplehash[!dir].tuple.dst.ip &&
- (nat_h245 = rcu_dereference(nat_h245_hook))) {
- /* NAT needed */
- ret = nat_h245(pskb, ct, ctinfo, data, dataoff, addr,
- port, exp);
- } else { /* Conntrack only */
- exp->expectfn = ip_conntrack_h245_expect;
-
- if (ip_conntrack_expect_related(exp) == 0) {
- DEBUGP("ip_ct_q931: expect H.245 "
- "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
- NIPQUAD(exp->tuple.src.ip),
- ntohs(exp->tuple.src.u.tcp.port),
- NIPQUAD(exp->tuple.dst.ip),
- ntohs(exp->tuple.dst.u.tcp.port));
- } else
- ret = -1;
- }
-
- ip_conntrack_expect_put(exp);
-
- return ret;
-}
-
-/* Forwarding declaration */
-void ip_conntrack_q931_expect(struct ip_conntrack *new,
- struct ip_conntrack_expect *this);
-
-/****************************************************************************/
-static int expect_callforwarding(struct sk_buff **pskb,
- struct ip_conntrack *ct,
- enum ip_conntrack_info ctinfo,
- unsigned char **data, int dataoff,
- TransportAddress * addr)
-{
- int dir = CTINFO2DIR(ctinfo);
- int ret = 0;
- __be32 ip;
- u_int16_t port;
- struct ip_conntrack_expect *exp = NULL;
- typeof(nat_callforwarding_hook) nat_callforwarding;
-
- /* Read alternativeAddress */
- if (!get_h225_addr(*data, addr, &ip, &port) || port == 0)
- return 0;
-
- /* If the calling party is on the same side of the forward-to party,
- * we don't need to track the second call */
- if (callforward_filter) {
- struct rtable *rt1, *rt2;
- struct flowi fl1 = {
- .fl4_dst = ip,
- };
- struct flowi fl2 = {
- .fl4_dst = ct->tuplehash[!dir].tuple.src.ip,
- };
-
- if (ip_route_output_key(&rt1, &fl1) == 0) {
- if (ip_route_output_key(&rt2, &fl2) == 0) {
- if (rt1->rt_gateway == rt2->rt_gateway &&
- rt1->u.dst.dev == rt2->u.dst.dev)
- ret = 1;
- dst_release(&rt2->u.dst);
- }
- dst_release(&rt1->u.dst);
- }
- if (ret) {
- DEBUGP("ip_ct_q931: Call Forwarding not tracked\n");
- return 0;
- }
- }
-
- /* Create expect for the second call leg */
- if ((exp = ip_conntrack_expect_alloc(ct)) == NULL)
- return -1;
- exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip;
- exp->tuple.src.u.tcp.port = 0;
- exp->tuple.dst.ip = ip;
- exp->tuple.dst.u.tcp.port = htons(port);
- exp->tuple.dst.protonum = IPPROTO_TCP;
- exp->mask.src.ip = htonl(0xFFFFFFFF);
- exp->mask.src.u.tcp.port = 0;
- exp->mask.dst.ip = htonl(0xFFFFFFFF);
- exp->mask.dst.u.tcp.port = htons(0xFFFF);
- exp->mask.dst.protonum = 0xFF;
- exp->flags = 0;
-
- if (ct->tuplehash[dir].tuple.src.ip !=
- ct->tuplehash[!dir].tuple.dst.ip &&
- (nat_callforwarding = rcu_dereference(nat_callforwarding_hook))) {
- /* Need NAT */
- ret = nat_callforwarding(pskb, ct, ctinfo, data, dataoff,
- addr, port, exp);
- } else { /* Conntrack only */
- exp->expectfn = ip_conntrack_q931_expect;
-
- if (ip_conntrack_expect_related(exp) == 0) {
- DEBUGP("ip_ct_q931: expect Call Forwarding "
- "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
- NIPQUAD(exp->tuple.src.ip),
- ntohs(exp->tuple.src.u.tcp.port),
- NIPQUAD(exp->tuple.dst.ip),
- ntohs(exp->tuple.dst.u.tcp.port));
- } else
- ret = -1;
- }
-
- ip_conntrack_expect_put(exp);
-
- return ret;
-}
-
-/****************************************************************************/
-static int process_setup(struct sk_buff **pskb, struct ip_conntrack *ct,
- enum ip_conntrack_info ctinfo,
- unsigned char **data, int dataoff,
- Setup_UUIE * setup)
-{
- int dir = CTINFO2DIR(ctinfo);
- int ret;
- int i;
- __be32 ip;
- u_int16_t port;
- typeof(set_h225_addr_hook) set_h225_addr;
-
- DEBUGP("ip_ct_q931: Setup\n");
-
- if (setup->options & eSetup_UUIE_h245Address) {
- ret = expect_h245(pskb, ct, ctinfo, data, dataoff,
- &setup->h245Address);
- if (ret < 0)
- return -1;
- }
-
- set_h225_addr = rcu_dereference(set_h225_addr_hook);
-
- if ((setup->options & eSetup_UUIE_destCallSignalAddress) &&
- (set_h225_addr) &&
- get_h225_addr(*data, &setup->destCallSignalAddress, &ip, &port) &&
- ip != ct->tuplehash[!dir].tuple.src.ip) {
- DEBUGP("ip_ct_q931: set destCallSignalAddress "
- "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
- NIPQUAD(ip), port,
- NIPQUAD(ct->tuplehash[!dir].tuple.src.ip),
- ntohs(ct->tuplehash[!dir].tuple.src.u.tcp.port));
- ret = set_h225_addr(pskb, data, dataoff,
- &setup->destCallSignalAddress,
- ct->tuplehash[!dir].tuple.src.ip,
- ntohs(ct->tuplehash[!dir].tuple.src.
- u.tcp.port));
- if (ret < 0)
- return -1;
- }
-
- if ((setup->options & eSetup_UUIE_sourceCallSignalAddress) &&
- (set_h225_addr) &&
- get_h225_addr(*data, &setup->sourceCallSignalAddress, &ip, &port)
- && ip != ct->tuplehash[!dir].tuple.dst.ip) {
- DEBUGP("ip_ct_q931: set sourceCallSignalAddress "
- "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
- NIPQUAD(ip), port,
- NIPQUAD(ct->tuplehash[!dir].tuple.dst.ip),
- ntohs(ct->tuplehash[!dir].tuple.dst.u.tcp.port));
- ret = set_h225_addr(pskb, data, dataoff,
- &setup->sourceCallSignalAddress,
- ct->tuplehash[!dir].tuple.dst.ip,
- ntohs(ct->tuplehash[!dir].tuple.dst.
- u.tcp.port));
- if (ret < 0)
- return -1;
- }
-
- if (setup->options & eSetup_UUIE_fastStart) {
- for (i = 0; i < setup->fastStart.count; i++) {
- ret = process_olc(pskb, ct, ctinfo, data, dataoff,
- &setup->fastStart.item[i]);
- if (ret < 0)
- return -1;
- }
- }
-
- return 0;
-}
-
-/****************************************************************************/
-static int process_callproceeding(struct sk_buff **pskb,
- struct ip_conntrack *ct,
- enum ip_conntrack_info ctinfo,
- unsigned char **data, int dataoff,
- CallProceeding_UUIE * callproc)
-{
- int ret;
- int i;
-
- DEBUGP("ip_ct_q931: CallProceeding\n");
-
- if (callproc->options & eCallProceeding_UUIE_h245Address) {
- ret = expect_h245(pskb, ct, ctinfo, data, dataoff,
- &callproc->h245Address);
- if (ret < 0)
- return -1;
- }
-
- if (callproc->options & eCallProceeding_UUIE_fastStart) {
- for (i = 0; i < callproc->fastStart.count; i++) {
- ret = process_olc(pskb, ct, ctinfo, data, dataoff,
- &callproc->fastStart.item[i]);
- if (ret < 0)
- return -1;
- }
- }
-
- return 0;
-}
-
-/****************************************************************************/
-static int process_connect(struct sk_buff **pskb, struct ip_conntrack *ct,
- enum ip_conntrack_info ctinfo,
- unsigned char **data, int dataoff,
- Connect_UUIE * connect)
-{
- int ret;
- int i;
-
- DEBUGP("ip_ct_q931: Connect\n");
-
- if (connect->options & eConnect_UUIE_h245Address) {
- ret = expect_h245(pskb, ct, ctinfo, data, dataoff,
- &connect->h245Address);
- if (ret < 0)
- return -1;
- }
-
- if (connect->options & eConnect_UUIE_fastStart) {
- for (i = 0; i < connect->fastStart.count; i++) {
- ret = process_olc(pskb, ct, ctinfo, data, dataoff,
- &connect->fastStart.item[i]);
- if (ret < 0)
- return -1;
- }
- }
-
- return 0;
-}
-
-/****************************************************************************/
-static int process_alerting(struct sk_buff **pskb, struct ip_conntrack *ct,
- enum ip_conntrack_info ctinfo,
- unsigned char **data, int dataoff,
- Alerting_UUIE * alert)
-{
- int ret;
- int i;
-
- DEBUGP("ip_ct_q931: Alerting\n");
-
- if (alert->options & eAlerting_UUIE_h245Address) {
- ret = expect_h245(pskb, ct, ctinfo, data, dataoff,
- &alert->h245Address);
- if (ret < 0)
- return -1;
- }
-
- if (alert->options & eAlerting_UUIE_fastStart) {
- for (i = 0; i < alert->fastStart.count; i++) {
- ret = process_olc(pskb, ct, ctinfo, data, dataoff,
- &alert->fastStart.item[i]);
- if (ret < 0)
- return -1;
- }
- }
-
- return 0;
-}
-
-/****************************************************************************/
-static int process_information(struct sk_buff **pskb,
- struct ip_conntrack *ct,
- enum ip_conntrack_info ctinfo,
- unsigned char **data, int dataoff,
- Information_UUIE * info)
-{
- int ret;
- int i;
-
- DEBUGP("ip_ct_q931: Information\n");
-
- if (info->options & eInformation_UUIE_fastStart) {
- for (i = 0; i < info->fastStart.count; i++) {
- ret = process_olc(pskb, ct, ctinfo, data, dataoff,
- &info->fastStart.item[i]);
- if (ret < 0)
- return -1;
- }
- }
-
- return 0;
-}
-
-/****************************************************************************/
-static int process_facility(struct sk_buff **pskb, struct ip_conntrack *ct,
- enum ip_conntrack_info ctinfo,
- unsigned char **data, int dataoff,
- Facility_UUIE * facility)
-{
- int ret;
- int i;
-
- DEBUGP("ip_ct_q931: Facility\n");
-
- if (facility->reason.choice == eFacilityReason_callForwarded) {
- if (facility->options & eFacility_UUIE_alternativeAddress)
- return expect_callforwarding(pskb, ct, ctinfo, data,
- dataoff,
- &facility->
- alternativeAddress);
- return 0;
- }
-
- if (facility->options & eFacility_UUIE_h245Address) {
- ret = expect_h245(pskb, ct, ctinfo, data, dataoff,
- &facility->h245Address);
- if (ret < 0)
- return -1;
- }
-
- if (facility->options & eFacility_UUIE_fastStart) {
- for (i = 0; i < facility->fastStart.count; i++) {
- ret = process_olc(pskb, ct, ctinfo, data, dataoff,
- &facility->fastStart.item[i]);
- if (ret < 0)
- return -1;
- }
- }
-
- return 0;
-}
-
-/****************************************************************************/
-static int process_progress(struct sk_buff **pskb, struct ip_conntrack *ct,
- enum ip_conntrack_info ctinfo,
- unsigned char **data, int dataoff,
- Progress_UUIE * progress)
-{
- int ret;
- int i;
-
- DEBUGP("ip_ct_q931: Progress\n");
-
- if (progress->options & eProgress_UUIE_h245Address) {
- ret = expect_h245(pskb, ct, ctinfo, data, dataoff,
- &progress->h245Address);
- if (ret < 0)
- return -1;
- }
-
- if (progress->options & eProgress_UUIE_fastStart) {
- for (i = 0; i < progress->fastStart.count; i++) {
- ret = process_olc(pskb, ct, ctinfo, data, dataoff,
- &progress->fastStart.item[i]);
- if (ret < 0)
- return -1;
- }
- }
-
- return 0;
-}
-
-/****************************************************************************/
-static int process_q931(struct sk_buff **pskb, struct ip_conntrack *ct,
- enum ip_conntrack_info ctinfo,
- unsigned char **data, int dataoff, Q931 * q931)
-{
- H323_UU_PDU *pdu = &q931->UUIE.h323_uu_pdu;
- int i;
- int ret = 0;
-
- switch (pdu->h323_message_body.choice) {
- case eH323_UU_PDU_h323_message_body_setup:
- ret = process_setup(pskb, ct, ctinfo, data, dataoff,
- &pdu->h323_message_body.setup);
- break;
- case eH323_UU_PDU_h323_message_body_callProceeding:
- ret = process_callproceeding(pskb, ct, ctinfo, data, dataoff,
- &pdu->h323_message_body.
- callProceeding);
- break;
- case eH323_UU_PDU_h323_message_body_connect:
- ret = process_connect(pskb, ct, ctinfo, data, dataoff,
- &pdu->h323_message_body.connect);
- break;
- case eH323_UU_PDU_h323_message_body_alerting:
- ret = process_alerting(pskb, ct, ctinfo, data, dataoff,
- &pdu->h323_message_body.alerting);
- break;
- case eH323_UU_PDU_h323_message_body_information:
- ret = process_information(pskb, ct, ctinfo, data, dataoff,
- &pdu->h323_message_body.
- information);
- break;
- case eH323_UU_PDU_h323_message_body_facility:
- ret = process_facility(pskb, ct, ctinfo, data, dataoff,
- &pdu->h323_message_body.facility);
- break;
- case eH323_UU_PDU_h323_message_body_progress:
- ret = process_progress(pskb, ct, ctinfo, data, dataoff,
- &pdu->h323_message_body.progress);
- break;
- default:
- DEBUGP("ip_ct_q931: Q.931 signal %d\n",
- pdu->h323_message_body.choice);
- break;
- }
-
- if (ret < 0)
- return -1;
-
- if (pdu->options & eH323_UU_PDU_h245Control) {
- for (i = 0; i < pdu->h245Control.count; i++) {
- ret = process_h245(pskb, ct, ctinfo, data, dataoff,
- &pdu->h245Control.item[i]);
- if (ret < 0)
- return -1;
- }
- }
-
- return 0;
-}
-
-/****************************************************************************/
-static int q931_help(struct sk_buff **pskb, struct ip_conntrack *ct,
- enum ip_conntrack_info ctinfo)
-{
- static Q931 q931;
- unsigned char *data = NULL;
- int datalen;
- int dataoff;
- int ret;
-
- /* Until there's been traffic both ways, don't look in packets. */
- if (ctinfo != IP_CT_ESTABLISHED
- && ctinfo != IP_CT_ESTABLISHED + IP_CT_IS_REPLY) {
- return NF_ACCEPT;
- }
- DEBUGP("ip_ct_q931: skblen = %u\n", (*pskb)->len);
-
- spin_lock_bh(&ip_h323_lock);
-
- /* Process each TPKT */
- while (get_tpkt_data(pskb, ct, ctinfo, &data, &datalen, &dataoff)) {
- DEBUGP("ip_ct_q931: TPKT %u.%u.%u.%u->%u.%u.%u.%u, len=%d\n",
- NIPQUAD((*pskb)->nh.iph->saddr),
- NIPQUAD((*pskb)->nh.iph->daddr), datalen);
-
- /* Decode Q.931 signal */
- ret = DecodeQ931(data, datalen, &q931);
- if (ret < 0) {
- if (net_ratelimit())
- printk("ip_ct_q931: decoding error: %s\n",
- ret == H323_ERROR_BOUND ?
- "out of bound" : "out of range");
- /* We don't drop when decoding error */
- break;
- }
-
- /* Process Q.931 signal */
- if (process_q931(pskb, ct, ctinfo, &data, dataoff, &q931) < 0)
- goto drop;
- }
-
- spin_unlock_bh(&ip_h323_lock);
- return NF_ACCEPT;
-
- drop:
- spin_unlock_bh(&ip_h323_lock);
- if (net_ratelimit())
- printk("ip_ct_q931: packet dropped\n");
- return NF_DROP;
-}
-
-/****************************************************************************/
-static struct ip_conntrack_helper ip_conntrack_helper_q931 = {
- .name = "Q.931",
- .me = THIS_MODULE,
- .max_expected = H323_RTP_CHANNEL_MAX * 4 + 4 /* T.120 and H.245 */ ,
- .timeout = 240,
- .tuple = {.src = {.u = {.tcp = {.port = __constant_htons(Q931_PORT)}}},
- .dst = {.protonum = IPPROTO_TCP}},
- .mask = {.src = {.u = {0xFFFF}},
- .dst = {.protonum = 0xFF}},
- .help = q931_help
-};
-
-/****************************************************************************/
-void ip_conntrack_q931_expect(struct ip_conntrack *new,
- struct ip_conntrack_expect *this)
-{
- write_lock_bh(&ip_conntrack_lock);
- new->helper = &ip_conntrack_helper_q931;
- write_unlock_bh(&ip_conntrack_lock);
-}
-
-/****************************************************************************/
-static unsigned char *get_udp_data(struct sk_buff **pskb, int *datalen)
-{
- struct udphdr _uh, *uh;
- int dataoff;
-
- uh = skb_header_pointer(*pskb, (*pskb)->nh.iph->ihl * 4, sizeof(_uh),
- &_uh);
- if (uh == NULL)
- return NULL;
- dataoff = (*pskb)->nh.iph->ihl * 4 + sizeof(_uh);
- if (dataoff >= (*pskb)->len)
- return NULL;
- *datalen = (*pskb)->len - dataoff;
- return skb_header_pointer(*pskb, dataoff, *datalen, h323_buffer);
-}
-
-/****************************************************************************/
-static struct ip_conntrack_expect *find_expect(struct ip_conntrack *ct,
- __be32 ip, u_int16_t port)
-{
- struct ip_conntrack_expect *exp;
- struct ip_conntrack_tuple tuple;
-
- tuple.src.ip = 0;
- tuple.src.u.tcp.port = 0;
- tuple.dst.ip = ip;
- tuple.dst.u.tcp.port = htons(port);
- tuple.dst.protonum = IPPROTO_TCP;
-
- exp = __ip_conntrack_expect_find(&tuple);
- if (exp && exp->master == ct)
- return exp;
- return NULL;
-}
-
-/****************************************************************************/
-static int set_expect_timeout(struct ip_conntrack_expect *exp,
- unsigned timeout)
-{
- if (!exp || !del_timer(&exp->timeout))
- return 0;
-
- exp->timeout.expires = jiffies + timeout * HZ;
- add_timer(&exp->timeout);
-
- return 1;
-}
-
-/****************************************************************************/
-static int expect_q931(struct sk_buff **pskb, struct ip_conntrack *ct,
- enum ip_conntrack_info ctinfo,
- unsigned char **data,
- TransportAddress * addr, int count)
-{
- struct ip_ct_h323_master *info = &ct->help.ct_h323_info;
- int dir = CTINFO2DIR(ctinfo);
- int ret = 0;
- int i;
- __be32 ip;
- u_int16_t port;
- struct ip_conntrack_expect *exp;
- typeof(nat_q931_hook) nat_q931;
-
- /* Look for the first related address */
- for (i = 0; i < count; i++) {
- if (get_h225_addr(*data, &addr[i], &ip, &port) &&
- ip == ct->tuplehash[dir].tuple.src.ip && port != 0)
- break;
- }
-
- if (i >= count) /* Not found */
- return 0;
-
- /* Create expect for Q.931 */
- if ((exp = ip_conntrack_expect_alloc(ct)) == NULL)
- return -1;
- exp->tuple.src.ip = gkrouted_only ? /* only accept calls from GK? */
- ct->tuplehash[!dir].tuple.src.ip : 0;
- exp->tuple.src.u.tcp.port = 0;
- exp->tuple.dst.ip = ct->tuplehash[!dir].tuple.dst.ip;
- exp->tuple.dst.u.tcp.port = htons(port);
- exp->tuple.dst.protonum = IPPROTO_TCP;
- exp->mask.src.ip = gkrouted_only ? htonl(0xFFFFFFFF) : 0;
- exp->mask.src.u.tcp.port = 0;
- exp->mask.dst.ip = htonl(0xFFFFFFFF);
- exp->mask.dst.u.tcp.port = htons(0xFFFF);
- exp->mask.dst.protonum = 0xFF;
- exp->flags = IP_CT_EXPECT_PERMANENT; /* Accept multiple calls */
-
- nat_q931 = rcu_dereference(nat_q931_hook);
- if (nat_q931) { /* Need NAT */
- ret = nat_q931(pskb, ct, ctinfo, data, addr, i, port, exp);
- } else { /* Conntrack only */
- exp->expectfn = ip_conntrack_q931_expect;
-
- if (ip_conntrack_expect_related(exp) == 0) {
- DEBUGP("ip_ct_ras: expect Q.931 "
- "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
- NIPQUAD(exp->tuple.src.ip),
- ntohs(exp->tuple.src.u.tcp.port),
- NIPQUAD(exp->tuple.dst.ip),
- ntohs(exp->tuple.dst.u.tcp.port));
-
- /* Save port for looking up expect in processing RCF */
- info->sig_port[dir] = port;
- } else
- ret = -1;
- }
-
- ip_conntrack_expect_put(exp);
-
- return ret;
-}
-
-/****************************************************************************/
-static int process_grq(struct sk_buff **pskb, struct ip_conntrack *ct,
- enum ip_conntrack_info ctinfo,
- unsigned char **data, GatekeeperRequest * grq)
-{
- typeof(set_ras_addr_hook) set_ras_addr;
-
- DEBUGP("ip_ct_ras: GRQ\n");
-
- set_ras_addr = rcu_dereference(set_ras_addr_hook);
- if (set_ras_addr) /* NATed */
- return set_ras_addr(pskb, ct, ctinfo, data,
- &grq->rasAddress, 1);
- return 0;
-}
-
-/* Declare before using */
-static void ip_conntrack_ras_expect(struct ip_conntrack *new,
- struct ip_conntrack_expect *this);
-
-/****************************************************************************/
-static int process_gcf(struct sk_buff **pskb, struct ip_conntrack *ct,
- enum ip_conntrack_info ctinfo,
- unsigned char **data, GatekeeperConfirm * gcf)
-{
- int dir = CTINFO2DIR(ctinfo);
- int ret = 0;
- __be32 ip;
- u_int16_t port;
- struct ip_conntrack_expect *exp;
-
- DEBUGP("ip_ct_ras: GCF\n");
-
- if (!get_h225_addr(*data, &gcf->rasAddress, &ip, &port))
- return 0;
-
- /* Registration port is the same as discovery port */
- if (ip == ct->tuplehash[dir].tuple.src.ip &&
- port == ntohs(ct->tuplehash[dir].tuple.src.u.udp.port))
- return 0;
-
- /* Avoid RAS expectation loops. A GCF is never expected. */
- if (test_bit(IPS_EXPECTED_BIT, &ct->status))
- return 0;
-
- /* Need new expect */
- if ((exp = ip_conntrack_expect_alloc(ct)) == NULL)
- return -1;
- exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip;
- exp->tuple.src.u.tcp.port = 0;
- exp->tuple.dst.ip = ip;
- exp->tuple.dst.u.tcp.port = htons(port);
- exp->tuple.dst.protonum = IPPROTO_UDP;
- exp->mask.src.ip = htonl(0xFFFFFFFF);
- exp->mask.src.u.tcp.port = 0;
- exp->mask.dst.ip = htonl(0xFFFFFFFF);
- exp->mask.dst.u.tcp.port = htons(0xFFFF);
- exp->mask.dst.protonum = 0xFF;
- exp->flags = 0;
- exp->expectfn = ip_conntrack_ras_expect;
- if (ip_conntrack_expect_related(exp) == 0) {
- DEBUGP("ip_ct_ras: expect RAS "
- "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
- NIPQUAD(exp->tuple.src.ip),
- ntohs(exp->tuple.src.u.tcp.port),
- NIPQUAD(exp->tuple.dst.ip),
- ntohs(exp->tuple.dst.u.tcp.port));
- } else
- ret = -1;
-
- ip_conntrack_expect_put(exp);
-
- return ret;
-}
-
-/****************************************************************************/
-static int process_rrq(struct sk_buff **pskb, struct ip_conntrack *ct,
- enum ip_conntrack_info ctinfo,
- unsigned char **data, RegistrationRequest * rrq)
-{
- struct ip_ct_h323_master *info = &ct->help.ct_h323_info;
- int ret;
- typeof(set_ras_addr_hook) set_ras_addr;
-
- DEBUGP("ip_ct_ras: RRQ\n");
-
- ret = expect_q931(pskb, ct, ctinfo, data,
- rrq->callSignalAddress.item,
- rrq->callSignalAddress.count);
- if (ret < 0)
- return -1;
-
- set_ras_addr = rcu_dereference(set_ras_addr_hook);
- if (set_ras_addr) {
- ret = set_ras_addr(pskb, ct, ctinfo, data,
- rrq->rasAddress.item,
- rrq->rasAddress.count);
- if (ret < 0)
- return -1;
- }
-
- if (rrq->options & eRegistrationRequest_timeToLive) {
- DEBUGP("ip_ct_ras: RRQ TTL = %u seconds\n", rrq->timeToLive);
- info->timeout = rrq->timeToLive;
- } else
- info->timeout = default_rrq_ttl;
-
- return 0;
-}
-
-/****************************************************************************/
-static int process_rcf(struct sk_buff **pskb, struct ip_conntrack *ct,
- enum ip_conntrack_info ctinfo,
- unsigned char **data, RegistrationConfirm * rcf)
-{
- struct ip_ct_h323_master *info = &ct->help.ct_h323_info;
- int dir = CTINFO2DIR(ctinfo);
- int ret;
- struct ip_conntrack_expect *exp;
- typeof(set_sig_addr_hook) set_sig_addr;
-
- DEBUGP("ip_ct_ras: RCF\n");
-
- set_sig_addr = rcu_dereference(set_sig_addr_hook);
- if (set_sig_addr) {
- ret = set_sig_addr(pskb, ct, ctinfo, data,
- rcf->callSignalAddress.item,
- rcf->callSignalAddress.count);
- if (ret < 0)
- return -1;
- }
-
- if (rcf->options & eRegistrationConfirm_timeToLive) {
- DEBUGP("ip_ct_ras: RCF TTL = %u seconds\n", rcf->timeToLive);
- info->timeout = rcf->timeToLive;
- }
-
- if (info->timeout > 0) {
- DEBUGP
- ("ip_ct_ras: set RAS connection timeout to %u seconds\n",
- info->timeout);
- ip_ct_refresh(ct, *pskb, info->timeout * HZ);
-
- /* Set expect timeout */
- read_lock_bh(&ip_conntrack_lock);
- exp = find_expect(ct, ct->tuplehash[dir].tuple.dst.ip,
- info->sig_port[!dir]);
- if (exp) {
- DEBUGP("ip_ct_ras: set Q.931 expect "
- "(%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu) "
- "timeout to %u seconds\n",
- NIPQUAD(exp->tuple.src.ip),
- ntohs(exp->tuple.src.u.tcp.port),
- NIPQUAD(exp->tuple.dst.ip),
- ntohs(exp->tuple.dst.u.tcp.port),
- info->timeout);
- set_expect_timeout(exp, info->timeout);
- }
- read_unlock_bh(&ip_conntrack_lock);
- }
-
- return 0;
-}
-
-/****************************************************************************/
-static int process_urq(struct sk_buff **pskb, struct ip_conntrack *ct,
- enum ip_conntrack_info ctinfo,
- unsigned char **data, UnregistrationRequest * urq)
-{
- struct ip_ct_h323_master *info = &ct->help.ct_h323_info;
- int dir = CTINFO2DIR(ctinfo);
- int ret;
- typeof(set_sig_addr_hook) set_sig_addr;
-
- DEBUGP("ip_ct_ras: URQ\n");
-
- set_sig_addr = rcu_dereference(set_sig_addr_hook);
- if (set_sig_addr) {
- ret = set_sig_addr(pskb, ct, ctinfo, data,
- urq->callSignalAddress.item,
- urq->callSignalAddress.count);
- if (ret < 0)
- return -1;
- }
-
- /* Clear old expect */
- ip_ct_remove_expectations(ct);
- info->sig_port[dir] = 0;
- info->sig_port[!dir] = 0;
-
- /* Give it 30 seconds for UCF or URJ */
- ip_ct_refresh(ct, *pskb, 30 * HZ);
-
- return 0;
-}
-
-/****************************************************************************/
-static int process_arq(struct sk_buff **pskb, struct ip_conntrack *ct,
- enum ip_conntrack_info ctinfo,
- unsigned char **data, AdmissionRequest * arq)
-{
- struct ip_ct_h323_master *info = &ct->help.ct_h323_info;
- int dir = CTINFO2DIR(ctinfo);
- __be32 ip;
- u_int16_t port;
- typeof(set_h225_addr_hook) set_h225_addr;
-
- DEBUGP("ip_ct_ras: ARQ\n");
-
- set_h225_addr = rcu_dereference(set_h225_addr_hook);
- if ((arq->options & eAdmissionRequest_destCallSignalAddress) &&
- get_h225_addr(*data, &arq->destCallSignalAddress, &ip, &port) &&
- ip == ct->tuplehash[dir].tuple.src.ip &&
- port == info->sig_port[dir] && set_h225_addr) {
- /* Answering ARQ */
- return set_h225_addr(pskb, data, 0,
- &arq->destCallSignalAddress,
- ct->tuplehash[!dir].tuple.dst.ip,
- info->sig_port[!dir]);
- }
-
- if ((arq->options & eAdmissionRequest_srcCallSignalAddress) &&
- get_h225_addr(*data, &arq->srcCallSignalAddress, &ip, &port) &&
- ip == ct->tuplehash[dir].tuple.src.ip && set_h225_addr) {
- /* Calling ARQ */
- return set_h225_addr(pskb, data, 0,
- &arq->srcCallSignalAddress,
- ct->tuplehash[!dir].tuple.dst.ip,
- port);
- }
-
- return 0;
-}
-
-/****************************************************************************/
-static int process_acf(struct sk_buff **pskb, struct ip_conntrack *ct,
- enum ip_conntrack_info ctinfo,
- unsigned char **data, AdmissionConfirm * acf)
-{
- int dir = CTINFO2DIR(ctinfo);
- int ret = 0;
- __be32 ip;
- u_int16_t port;
- struct ip_conntrack_expect *exp;
- typeof(set_sig_addr_hook) set_sig_addr;
-
- DEBUGP("ip_ct_ras: ACF\n");
-
- if (!get_h225_addr(*data, &acf->destCallSignalAddress, &ip, &port))
- return 0;
-
- if (ip == ct->tuplehash[dir].tuple.dst.ip) { /* Answering ACF */
- set_sig_addr = rcu_dereference(set_sig_addr_hook);
- if (set_sig_addr)
- return set_sig_addr(pskb, ct, ctinfo, data,
- &acf->destCallSignalAddress, 1);
- return 0;
- }
-
- /* Need new expect */
- if ((exp = ip_conntrack_expect_alloc(ct)) == NULL)
- return -1;
- exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip;
- exp->tuple.src.u.tcp.port = 0;
- exp->tuple.dst.ip = ip;
- exp->tuple.dst.u.tcp.port = htons(port);
- exp->tuple.dst.protonum = IPPROTO_TCP;
- exp->mask.src.ip = htonl(0xFFFFFFFF);
- exp->mask.src.u.tcp.port = 0;
- exp->mask.dst.ip = htonl(0xFFFFFFFF);
- exp->mask.dst.u.tcp.port = htons(0xFFFF);
- exp->mask.dst.protonum = 0xFF;
- exp->flags = IP_CT_EXPECT_PERMANENT;
- exp->expectfn = ip_conntrack_q931_expect;
-
- if (ip_conntrack_expect_related(exp) == 0) {
- DEBUGP("ip_ct_ras: expect Q.931 "
- "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
- NIPQUAD(exp->tuple.src.ip),
- ntohs(exp->tuple.src.u.tcp.port),
- NIPQUAD(exp->tuple.dst.ip),
- ntohs(exp->tuple.dst.u.tcp.port));
- } else
- ret = -1;
-
- ip_conntrack_expect_put(exp);
-
- return ret;
-}
-
-/****************************************************************************/
-static int process_lrq(struct sk_buff **pskb, struct ip_conntrack *ct,
- enum ip_conntrack_info ctinfo,
- unsigned char **data, LocationRequest * lrq)
-{
- typeof(set_ras_addr_hook) set_ras_addr;
-
- DEBUGP("ip_ct_ras: LRQ\n");
-
- set_ras_addr = rcu_dereference(set_ras_addr_hook);
- if (set_ras_addr)
- return set_ras_addr(pskb, ct, ctinfo, data,
- &lrq->replyAddress, 1);
- return 0;
-}
-
-/****************************************************************************/
-static int process_lcf(struct sk_buff **pskb, struct ip_conntrack *ct,
- enum ip_conntrack_info ctinfo,
- unsigned char **data, LocationConfirm * lcf)
-{
- int dir = CTINFO2DIR(ctinfo);
- int ret = 0;
- __be32 ip;
- u_int16_t port;
- struct ip_conntrack_expect *exp = NULL;
-
- DEBUGP("ip_ct_ras: LCF\n");
-
- if (!get_h225_addr(*data, &lcf->callSignalAddress, &ip, &port))
- return 0;
-
- /* Need new expect for call signal */
- if ((exp = ip_conntrack_expect_alloc(ct)) == NULL)
- return -1;
- exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip;
- exp->tuple.src.u.tcp.port = 0;
- exp->tuple.dst.ip = ip;
- exp->tuple.dst.u.tcp.port = htons(port);
- exp->tuple.dst.protonum = IPPROTO_TCP;
- exp->mask.src.ip = htonl(0xFFFFFFFF);
- exp->mask.src.u.tcp.port = 0;
- exp->mask.dst.ip = htonl(0xFFFFFFFF);
- exp->mask.dst.u.tcp.port = htons(0xFFFF);
- exp->mask.dst.protonum = 0xFF;
- exp->flags = IP_CT_EXPECT_PERMANENT;
- exp->expectfn = ip_conntrack_q931_expect;
-
- if (ip_conntrack_expect_related(exp) == 0) {
- DEBUGP("ip_ct_ras: expect Q.931 "
- "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
- NIPQUAD(exp->tuple.src.ip),
- ntohs(exp->tuple.src.u.tcp.port),
- NIPQUAD(exp->tuple.dst.ip),
- ntohs(exp->tuple.dst.u.tcp.port));
- } else
- ret = -1;
-
- ip_conntrack_expect_put(exp);
-
- /* Ignore rasAddress */
-
- return ret;
-}
-
-/****************************************************************************/
-static int process_irr(struct sk_buff **pskb, struct ip_conntrack *ct,
- enum ip_conntrack_info ctinfo,
- unsigned char **data, InfoRequestResponse * irr)
-{
- int ret;
- typeof(set_ras_addr_hook) set_ras_addr;
- typeof(set_sig_addr_hook) set_sig_addr;
-
- DEBUGP("ip_ct_ras: IRR\n");
-
- set_ras_addr = rcu_dereference(set_ras_addr_hook);
- if (set_ras_addr) {
- ret = set_ras_addr(pskb, ct, ctinfo, data,
- &irr->rasAddress, 1);
- if (ret < 0)
- return -1;
- }
-
- set_sig_addr = rcu_dereference(set_sig_addr_hook);
- if (set_sig_addr) {
- ret = set_sig_addr(pskb, ct, ctinfo, data,
- irr->callSignalAddress.item,
- irr->callSignalAddress.count);
- if (ret < 0)
- return -1;
- }
-
- return 0;
-}
-
-/****************************************************************************/
-static int process_ras(struct sk_buff **pskb, struct ip_conntrack *ct,
- enum ip_conntrack_info ctinfo,
- unsigned char **data, RasMessage * ras)
-{
- switch (ras->choice) {
- case eRasMessage_gatekeeperRequest:
- return process_grq(pskb, ct, ctinfo, data,
- &ras->gatekeeperRequest);
- case eRasMessage_gatekeeperConfirm:
- return process_gcf(pskb, ct, ctinfo, data,
- &ras->gatekeeperConfirm);
- case eRasMessage_registrationRequest:
- return process_rrq(pskb, ct, ctinfo, data,
- &ras->registrationRequest);
- case eRasMessage_registrationConfirm:
- return process_rcf(pskb, ct, ctinfo, data,
- &ras->registrationConfirm);
- case eRasMessage_unregistrationRequest:
- return process_urq(pskb, ct, ctinfo, data,
- &ras->unregistrationRequest);
- case eRasMessage_admissionRequest:
- return process_arq(pskb, ct, ctinfo, data,
- &ras->admissionRequest);
- case eRasMessage_admissionConfirm:
- return process_acf(pskb, ct, ctinfo, data,
- &ras->admissionConfirm);
- case eRasMessage_locationRequest:
- return process_lrq(pskb, ct, ctinfo, data,
- &ras->locationRequest);
- case eRasMessage_locationConfirm:
- return process_lcf(pskb, ct, ctinfo, data,
- &ras->locationConfirm);
- case eRasMessage_infoRequestResponse:
- return process_irr(pskb, ct, ctinfo, data,
- &ras->infoRequestResponse);
- default:
- DEBUGP("ip_ct_ras: RAS message %d\n", ras->choice);
- break;
- }
-
- return 0;
-}
-
-/****************************************************************************/
-static int ras_help(struct sk_buff **pskb, struct ip_conntrack *ct,
- enum ip_conntrack_info ctinfo)
-{
- static RasMessage ras;
- unsigned char *data;
- int datalen = 0;
- int ret;
-
- DEBUGP("ip_ct_ras: skblen = %u\n", (*pskb)->len);
-
- spin_lock_bh(&ip_h323_lock);
-
- /* Get UDP data */
- data = get_udp_data(pskb, &datalen);
- if (data == NULL)
- goto accept;
- DEBUGP("ip_ct_ras: RAS message %u.%u.%u.%u->%u.%u.%u.%u, len=%d\n",
- NIPQUAD((*pskb)->nh.iph->saddr),
- NIPQUAD((*pskb)->nh.iph->daddr), datalen);
-
- /* Decode RAS message */
- ret = DecodeRasMessage(data, datalen, &ras);
- if (ret < 0) {
- if (net_ratelimit())
- printk("ip_ct_ras: decoding error: %s\n",
- ret == H323_ERROR_BOUND ?
- "out of bound" : "out of range");
- goto accept;
- }
-
- /* Process RAS message */
- if (process_ras(pskb, ct, ctinfo, &data, &ras) < 0)
- goto drop;
-
- accept:
- spin_unlock_bh(&ip_h323_lock);
- return NF_ACCEPT;
-
- drop:
- spin_unlock_bh(&ip_h323_lock);
- if (net_ratelimit())
- printk("ip_ct_ras: packet dropped\n");
- return NF_DROP;
-}
-
-/****************************************************************************/
-static struct ip_conntrack_helper ip_conntrack_helper_ras = {
- .name = "RAS",
- .me = THIS_MODULE,
- .max_expected = 32,
- .timeout = 240,
- .tuple = {.src = {.u = {.tcp = {.port = __constant_htons(RAS_PORT)}}},
- .dst = {.protonum = IPPROTO_UDP}},
- .mask = {.src = {.u = {0xFFFE}},
- .dst = {.protonum = 0xFF}},
- .help = ras_help,
-};
-
-/****************************************************************************/
-static void ip_conntrack_ras_expect(struct ip_conntrack *new,
- struct ip_conntrack_expect *this)
-{
- write_lock_bh(&ip_conntrack_lock);
- new->helper = &ip_conntrack_helper_ras;
- write_unlock_bh(&ip_conntrack_lock);
-}
-
-/****************************************************************************/
-/* Not __exit - called from init() */
-static void fini(void)
-{
- ip_conntrack_helper_unregister(&ip_conntrack_helper_ras);
- ip_conntrack_helper_unregister(&ip_conntrack_helper_q931);
- kfree(h323_buffer);
- DEBUGP("ip_ct_h323: fini\n");
-}
-
-/****************************************************************************/
-static int __init init(void)
-{
- int ret;
-
- h323_buffer = kmalloc(65536, GFP_KERNEL);
- if (!h323_buffer)
- return -ENOMEM;
- if ((ret = ip_conntrack_helper_register(&ip_conntrack_helper_q931)) ||
- (ret = ip_conntrack_helper_register(&ip_conntrack_helper_ras))) {
- fini();
- return ret;
- }
- DEBUGP("ip_ct_h323: init success\n");
- return 0;
-}
-
-/****************************************************************************/
-module_init(init);
-module_exit(fini);
-
-EXPORT_SYMBOL_GPL(get_h225_addr);
-EXPORT_SYMBOL_GPL(ip_conntrack_h245_expect);
-EXPORT_SYMBOL_GPL(ip_conntrack_q931_expect);
-EXPORT_SYMBOL_GPL(set_h245_addr_hook);
-EXPORT_SYMBOL_GPL(set_h225_addr_hook);
-EXPORT_SYMBOL_GPL(set_sig_addr_hook);
-EXPORT_SYMBOL_GPL(set_ras_addr_hook);
-EXPORT_SYMBOL_GPL(nat_rtp_rtcp_hook);
-EXPORT_SYMBOL_GPL(nat_t120_hook);
-EXPORT_SYMBOL_GPL(nat_h245_hook);
-EXPORT_SYMBOL_GPL(nat_callforwarding_hook);
-EXPORT_SYMBOL_GPL(nat_q931_hook);
-
-MODULE_AUTHOR("Jing Min Zhao <zhaojingmin@users.sourceforge.net>");
-MODULE_DESCRIPTION("H.323 connection tracking helper");
-MODULE_LICENSE("GPL");
diff --git a/net/ipv4/netfilter/ip_conntrack_helper_pptp.c b/net/ipv4/netfilter/ip_conntrack_helper_pptp.c
deleted file mode 100644
index 4d19373bbf0d..000000000000
--- a/net/ipv4/netfilter/ip_conntrack_helper_pptp.c
+++ /dev/null
@@ -1,684 +0,0 @@
-/*
- * ip_conntrack_pptp.c - Version 3.0
- *
- * Connection tracking support for PPTP (Point to Point Tunneling Protocol).
- * PPTP is a a protocol for creating virtual private networks.
- * It is a specification defined by Microsoft and some vendors
- * working with Microsoft. PPTP is built on top of a modified
- * version of the Internet Generic Routing Encapsulation Protocol.
- * GRE is defined in RFC 1701 and RFC 1702. Documentation of
- * PPTP can be found in RFC 2637
- *
- * (C) 2000-2005 by Harald Welte <laforge@gnumonks.org>
- *
- * Development of this code funded by Astaro AG (http://www.astaro.com/)
- *
- * Limitations:
- * - We blindly assume that control connections are always
- * established in PNS->PAC direction. This is a violation
- * of RFFC2673
- * - We can only support one single call within each session
- *
- * TODO:
- * - testing of incoming PPTP calls
- *
- * Changes:
- * 2002-02-05 - Version 1.3
- * - Call ip_conntrack_unexpect_related() from
- * pptp_destroy_siblings() to destroy expectations in case
- * CALL_DISCONNECT_NOTIFY or tcp fin packet was seen
- * (Philip Craig <philipc@snapgear.com>)
- * - Add Version information at module loadtime
- * 2002-02-10 - Version 1.6
- * - move to C99 style initializers
- * - remove second expectation if first arrives
- * 2004-10-22 - Version 2.0
- * - merge Mandrake's 2.6.x port with recent 2.6.x API changes
- * - fix lots of linear skb assumptions from Mandrake's port
- * 2005-06-10 - Version 2.1
- * - use ip_conntrack_expect_free() instead of kfree() on the
- * expect's (which are from the slab for quite some time)
- * 2005-06-10 - Version 3.0
- * - port helper to post-2.6.11 API changes,
- * funded by Oxcoda NetBox Blue (http://www.netboxblue.com/)
- * 2005-07-30 - Version 3.1
- * - port helper to 2.6.13 API changes
- *
- */
-
-#include <linux/module.h>
-#include <linux/netfilter.h>
-#include <linux/ip.h>
-#include <net/checksum.h>
-#include <net/tcp.h>
-
-#include <linux/netfilter_ipv4/ip_conntrack.h>
-#include <linux/netfilter_ipv4/ip_conntrack_core.h>
-#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
-#include <linux/netfilter_ipv4/ip_conntrack_proto_gre.h>
-#include <linux/netfilter_ipv4/ip_conntrack_pptp.h>
-
-#define IP_CT_PPTP_VERSION "3.1"
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
-MODULE_DESCRIPTION("Netfilter connection tracking helper module for PPTP");
-
-static DEFINE_SPINLOCK(ip_pptp_lock);
-
-int
-(*ip_nat_pptp_hook_outbound)(struct sk_buff **pskb,
- struct ip_conntrack *ct,
- enum ip_conntrack_info ctinfo,
- struct PptpControlHeader *ctlh,
- union pptp_ctrl_union *pptpReq);
-
-int
-(*ip_nat_pptp_hook_inbound)(struct sk_buff **pskb,
- struct ip_conntrack *ct,
- enum ip_conntrack_info ctinfo,
- struct PptpControlHeader *ctlh,
- union pptp_ctrl_union *pptpReq);
-
-void
-(*ip_nat_pptp_hook_exp_gre)(struct ip_conntrack_expect *expect_orig,
- struct ip_conntrack_expect *expect_reply);
-
-void
-(*ip_nat_pptp_hook_expectfn)(struct ip_conntrack *ct,
- struct ip_conntrack_expect *exp);
-
-#if 0
-/* PptpControlMessageType names */
-const char *pptp_msg_name[] = {
- "UNKNOWN_MESSAGE",
- "START_SESSION_REQUEST",
- "START_SESSION_REPLY",
- "STOP_SESSION_REQUEST",
- "STOP_SESSION_REPLY",
- "ECHO_REQUEST",
- "ECHO_REPLY",
- "OUT_CALL_REQUEST",
- "OUT_CALL_REPLY",
- "IN_CALL_REQUEST",
- "IN_CALL_REPLY",
- "IN_CALL_CONNECT",
- "CALL_CLEAR_REQUEST",
- "CALL_DISCONNECT_NOTIFY",
- "WAN_ERROR_NOTIFY",
- "SET_LINK_INFO"
-};
-EXPORT_SYMBOL(pptp_msg_name);
-#define DEBUGP(format, args...) printk(KERN_DEBUG "%s:%s: " format, __FILE__, __FUNCTION__, ## args)
-#else
-#define DEBUGP(format, args...)
-#endif
-
-#define SECS *HZ
-#define MINS * 60 SECS
-#define HOURS * 60 MINS
-
-#define PPTP_GRE_TIMEOUT (10 MINS)
-#define PPTP_GRE_STREAM_TIMEOUT (5 HOURS)
-
-static void pptp_expectfn(struct ip_conntrack *ct,
- struct ip_conntrack_expect *exp)
-{
- typeof(ip_nat_pptp_hook_expectfn) ip_nat_pptp_expectfn;
-
- DEBUGP("increasing timeouts\n");
-
- /* increase timeout of GRE data channel conntrack entry */
- ct->proto.gre.timeout = PPTP_GRE_TIMEOUT;
- ct->proto.gre.stream_timeout = PPTP_GRE_STREAM_TIMEOUT;
-
- /* Can you see how rusty this code is, compared with the pre-2.6.11
- * one? That's what happened to my shiny newnat of 2002 ;( -HW */
-
- rcu_read_lock();
- ip_nat_pptp_expectfn = rcu_dereference(ip_nat_pptp_hook_expectfn);
- if (!ip_nat_pptp_expectfn) {
- struct ip_conntrack_tuple inv_t;
- struct ip_conntrack_expect *exp_other;
-
- /* obviously this tuple inversion only works until you do NAT */
- invert_tuplepr(&inv_t, &exp->tuple);
- DEBUGP("trying to unexpect other dir: ");
- DUMP_TUPLE(&inv_t);
-
- exp_other = ip_conntrack_expect_find_get(&inv_t);
- if (exp_other) {
- /* delete other expectation. */
- DEBUGP("found\n");
- ip_conntrack_unexpect_related(exp_other);
- ip_conntrack_expect_put(exp_other);
- } else {
- DEBUGP("not found\n");
- }
- } else {
- /* we need more than simple inversion */
- ip_nat_pptp_expectfn(ct, exp);
- }
- rcu_read_unlock();
-}
-
-static int destroy_sibling_or_exp(const struct ip_conntrack_tuple *t)
-{
- struct ip_conntrack_tuple_hash *h;
- struct ip_conntrack_expect *exp;
-
- DEBUGP("trying to timeout ct or exp for tuple ");
- DUMP_TUPLE(t);
-
- h = ip_conntrack_find_get(t, NULL);
- if (h) {
- struct ip_conntrack *sibling = tuplehash_to_ctrack(h);
- DEBUGP("setting timeout of conntrack %p to 0\n", sibling);
- sibling->proto.gre.timeout = 0;
- sibling->proto.gre.stream_timeout = 0;
- if (del_timer(&sibling->timeout))
- sibling->timeout.function((unsigned long)sibling);
- ip_conntrack_put(sibling);
- return 1;
- } else {
- exp = ip_conntrack_expect_find_get(t);
- if (exp) {
- DEBUGP("unexpect_related of expect %p\n", exp);
- ip_conntrack_unexpect_related(exp);
- ip_conntrack_expect_put(exp);
- return 1;
- }
- }
-
- return 0;
-}
-
-
-/* timeout GRE data connections */
-static void pptp_destroy_siblings(struct ip_conntrack *ct)
-{
- struct ip_conntrack_tuple t;
-
- ip_ct_gre_keymap_destroy(ct);
- /* Since ct->sibling_list has literally rusted away in 2.6.11,
- * we now need another way to find out about our sibling
- * contrack and expects... -HW */
-
- /* try original (pns->pac) tuple */
- memcpy(&t, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, sizeof(t));
- t.dst.protonum = IPPROTO_GRE;
- t.src.u.gre.key = ct->help.ct_pptp_info.pns_call_id;
- t.dst.u.gre.key = ct->help.ct_pptp_info.pac_call_id;
-
- if (!destroy_sibling_or_exp(&t))
- DEBUGP("failed to timeout original pns->pac ct/exp\n");
-
- /* try reply (pac->pns) tuple */
- memcpy(&t, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, sizeof(t));
- t.dst.protonum = IPPROTO_GRE;
- t.src.u.gre.key = ct->help.ct_pptp_info.pac_call_id;
- t.dst.u.gre.key = ct->help.ct_pptp_info.pns_call_id;
-
- if (!destroy_sibling_or_exp(&t))
- DEBUGP("failed to timeout reply pac->pns ct/exp\n");
-}
-
-/* expect GRE connections (PNS->PAC and PAC->PNS direction) */
-static inline int
-exp_gre(struct ip_conntrack *ct,
- __be16 callid,
- __be16 peer_callid)
-{
- struct ip_conntrack_expect *exp_orig, *exp_reply;
- int ret = 1;
- typeof(ip_nat_pptp_hook_exp_gre) ip_nat_pptp_exp_gre;
-
- exp_orig = ip_conntrack_expect_alloc(ct);
- if (exp_orig == NULL)
- goto out;
-
- exp_reply = ip_conntrack_expect_alloc(ct);
- if (exp_reply == NULL)
- goto out_put_orig;
-
- /* original direction, PNS->PAC */
- exp_orig->tuple.src.ip = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip;
- exp_orig->tuple.src.u.gre.key = peer_callid;
- exp_orig->tuple.dst.ip = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip;
- exp_orig->tuple.dst.u.gre.key = callid;
- exp_orig->tuple.dst.protonum = IPPROTO_GRE;
-
- exp_orig->mask.src.ip = htonl(0xffffffff);
- exp_orig->mask.src.u.all = 0;
- exp_orig->mask.dst.u.gre.key = htons(0xffff);
- exp_orig->mask.dst.ip = htonl(0xffffffff);
- exp_orig->mask.dst.protonum = 0xff;
-
- exp_orig->master = ct;
- exp_orig->expectfn = pptp_expectfn;
- exp_orig->flags = 0;
-
- /* both expectations are identical apart from tuple */
- memcpy(exp_reply, exp_orig, sizeof(*exp_reply));
-
- /* reply direction, PAC->PNS */
- exp_reply->tuple.src.ip = ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip;
- exp_reply->tuple.src.u.gre.key = callid;
- exp_reply->tuple.dst.ip = ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip;
- exp_reply->tuple.dst.u.gre.key = peer_callid;
- exp_reply->tuple.dst.protonum = IPPROTO_GRE;
-
- ip_nat_pptp_exp_gre = rcu_dereference(ip_nat_pptp_hook_exp_gre);
- if (ip_nat_pptp_exp_gre)
- ip_nat_pptp_exp_gre(exp_orig, exp_reply);
- if (ip_conntrack_expect_related(exp_orig) != 0)
- goto out_put_both;
- if (ip_conntrack_expect_related(exp_reply) != 0)
- goto out_unexpect_orig;
-
- /* Add GRE keymap entries */
- if (ip_ct_gre_keymap_add(ct, &exp_orig->tuple, 0) != 0)
- goto out_unexpect_both;
- if (ip_ct_gre_keymap_add(ct, &exp_reply->tuple, 1) != 0) {
- ip_ct_gre_keymap_destroy(ct);
- goto out_unexpect_both;
- }
- ret = 0;
-
-out_put_both:
- ip_conntrack_expect_put(exp_reply);
-out_put_orig:
- ip_conntrack_expect_put(exp_orig);
-out:
- return ret;
-
-out_unexpect_both:
- ip_conntrack_unexpect_related(exp_reply);
-out_unexpect_orig:
- ip_conntrack_unexpect_related(exp_orig);
- goto out_put_both;
-}
-
-static inline int
-pptp_inbound_pkt(struct sk_buff **pskb,
- struct PptpControlHeader *ctlh,
- union pptp_ctrl_union *pptpReq,
- unsigned int reqlen,
- struct ip_conntrack *ct,
- enum ip_conntrack_info ctinfo)
-{
- struct ip_ct_pptp_master *info = &ct->help.ct_pptp_info;
- u_int16_t msg;
- __be16 cid = 0, pcid = 0;
- typeof(ip_nat_pptp_hook_inbound) ip_nat_pptp_inbound;
-
- msg = ntohs(ctlh->messageType);
- DEBUGP("inbound control message %s\n", pptp_msg_name[msg]);
-
- switch (msg) {
- case PPTP_START_SESSION_REPLY:
- /* server confirms new control session */
- if (info->sstate < PPTP_SESSION_REQUESTED)
- goto invalid;
- if (pptpReq->srep.resultCode == PPTP_START_OK)
- info->sstate = PPTP_SESSION_CONFIRMED;
- else
- info->sstate = PPTP_SESSION_ERROR;
- break;
-
- case PPTP_STOP_SESSION_REPLY:
- /* server confirms end of control session */
- if (info->sstate > PPTP_SESSION_STOPREQ)
- goto invalid;
- if (pptpReq->strep.resultCode == PPTP_STOP_OK)
- info->sstate = PPTP_SESSION_NONE;
- else
- info->sstate = PPTP_SESSION_ERROR;
- break;
-
- case PPTP_OUT_CALL_REPLY:
- /* server accepted call, we now expect GRE frames */
- if (info->sstate != PPTP_SESSION_CONFIRMED)
- goto invalid;
- if (info->cstate != PPTP_CALL_OUT_REQ &&
- info->cstate != PPTP_CALL_OUT_CONF)
- goto invalid;
-
- cid = pptpReq->ocack.callID;
- pcid = pptpReq->ocack.peersCallID;
- if (info->pns_call_id != pcid)
- goto invalid;
- DEBUGP("%s, CID=%X, PCID=%X\n", pptp_msg_name[msg],
- ntohs(cid), ntohs(pcid));
-
- if (pptpReq->ocack.resultCode == PPTP_OUTCALL_CONNECT) {
- info->cstate = PPTP_CALL_OUT_CONF;
- info->pac_call_id = cid;
- exp_gre(ct, cid, pcid);
- } else
- info->cstate = PPTP_CALL_NONE;
- break;
-
- case PPTP_IN_CALL_REQUEST:
- /* server tells us about incoming call request */
- if (info->sstate != PPTP_SESSION_CONFIRMED)
- goto invalid;
-
- cid = pptpReq->icreq.callID;
- DEBUGP("%s, CID=%X\n", pptp_msg_name[msg], ntohs(cid));
- info->cstate = PPTP_CALL_IN_REQ;
- info->pac_call_id = cid;
- break;
-
- case PPTP_IN_CALL_CONNECT:
- /* server tells us about incoming call established */
- if (info->sstate != PPTP_SESSION_CONFIRMED)
- goto invalid;
- if (info->cstate != PPTP_CALL_IN_REP &&
- info->cstate != PPTP_CALL_IN_CONF)
- goto invalid;
-
- pcid = pptpReq->iccon.peersCallID;
- cid = info->pac_call_id;
-
- if (info->pns_call_id != pcid)
- goto invalid;
-
- DEBUGP("%s, PCID=%X\n", pptp_msg_name[msg], ntohs(pcid));
- info->cstate = PPTP_CALL_IN_CONF;
-
- /* we expect a GRE connection from PAC to PNS */
- exp_gre(ct, cid, pcid);
- break;
-
- case PPTP_CALL_DISCONNECT_NOTIFY:
- /* server confirms disconnect */
- cid = pptpReq->disc.callID;
- DEBUGP("%s, CID=%X\n", pptp_msg_name[msg], ntohs(cid));
- info->cstate = PPTP_CALL_NONE;
-
- /* untrack this call id, unexpect GRE packets */
- pptp_destroy_siblings(ct);
- break;
-
- case PPTP_WAN_ERROR_NOTIFY:
- case PPTP_ECHO_REQUEST:
- case PPTP_ECHO_REPLY:
- /* I don't have to explain these ;) */
- break;
- default:
- goto invalid;
- }
-
- ip_nat_pptp_inbound = rcu_dereference(ip_nat_pptp_hook_inbound);
- if (ip_nat_pptp_inbound)
- return ip_nat_pptp_inbound(pskb, ct, ctinfo, ctlh, pptpReq);
- return NF_ACCEPT;
-
-invalid:
- DEBUGP("invalid %s: type=%d cid=%u pcid=%u "
- "cstate=%d sstate=%d pns_cid=%u pac_cid=%u\n",
- msg <= PPTP_MSG_MAX ? pptp_msg_name[msg] : pptp_msg_name[0],
- msg, ntohs(cid), ntohs(pcid), info->cstate, info->sstate,
- ntohs(info->pns_call_id), ntohs(info->pac_call_id));
- return NF_ACCEPT;
-}
-
-static inline int
-pptp_outbound_pkt(struct sk_buff **pskb,
- struct PptpControlHeader *ctlh,
- union pptp_ctrl_union *pptpReq,
- unsigned int reqlen,
- struct ip_conntrack *ct,
- enum ip_conntrack_info ctinfo)
-{
- struct ip_ct_pptp_master *info = &ct->help.ct_pptp_info;
- u_int16_t msg;
- __be16 cid = 0, pcid = 0;
- typeof(ip_nat_pptp_hook_outbound) ip_nat_pptp_outbound;
-
- msg = ntohs(ctlh->messageType);
- DEBUGP("outbound control message %s\n", pptp_msg_name[msg]);
-
- switch (msg) {
- case PPTP_START_SESSION_REQUEST:
- /* client requests for new control session */
- if (info->sstate != PPTP_SESSION_NONE)
- goto invalid;
- info->sstate = PPTP_SESSION_REQUESTED;
- break;
- case PPTP_STOP_SESSION_REQUEST:
- /* client requests end of control session */
- info->sstate = PPTP_SESSION_STOPREQ;
- break;
-
- case PPTP_OUT_CALL_REQUEST:
- /* client initiating connection to server */
- if (info->sstate != PPTP_SESSION_CONFIRMED)
- goto invalid;
- info->cstate = PPTP_CALL_OUT_REQ;
- /* track PNS call id */
- cid = pptpReq->ocreq.callID;
- DEBUGP("%s, CID=%X\n", pptp_msg_name[msg], ntohs(cid));
- info->pns_call_id = cid;
- break;
- case PPTP_IN_CALL_REPLY:
- /* client answers incoming call */
- if (info->cstate != PPTP_CALL_IN_REQ &&
- info->cstate != PPTP_CALL_IN_REP)
- goto invalid;
-
- cid = pptpReq->icack.callID;
- pcid = pptpReq->icack.peersCallID;
- if (info->pac_call_id != pcid)
- goto invalid;
- DEBUGP("%s, CID=%X PCID=%X\n", pptp_msg_name[msg],
- ntohs(cid), ntohs(pcid));
-
- if (pptpReq->icack.resultCode == PPTP_INCALL_ACCEPT) {
- /* part two of the three-way handshake */
- info->cstate = PPTP_CALL_IN_REP;
- info->pns_call_id = cid;
- } else
- info->cstate = PPTP_CALL_NONE;
- break;
-
- case PPTP_CALL_CLEAR_REQUEST:
- /* client requests hangup of call */
- if (info->sstate != PPTP_SESSION_CONFIRMED)
- goto invalid;
- /* FUTURE: iterate over all calls and check if
- * call ID is valid. We don't do this without newnat,
- * because we only know about last call */
- info->cstate = PPTP_CALL_CLEAR_REQ;
- break;
- case PPTP_SET_LINK_INFO:
- case PPTP_ECHO_REQUEST:
- case PPTP_ECHO_REPLY:
- /* I don't have to explain these ;) */
- break;
- default:
- goto invalid;
- }
-
- ip_nat_pptp_outbound = rcu_dereference(ip_nat_pptp_hook_outbound);
- if (ip_nat_pptp_outbound)
- return ip_nat_pptp_outbound(pskb, ct, ctinfo, ctlh, pptpReq);
- return NF_ACCEPT;
-
-invalid:
- DEBUGP("invalid %s: type=%d cid=%u pcid=%u "
- "cstate=%d sstate=%d pns_cid=%u pac_cid=%u\n",
- msg <= PPTP_MSG_MAX ? pptp_msg_name[msg] : pptp_msg_name[0],
- msg, ntohs(cid), ntohs(pcid), info->cstate, info->sstate,
- ntohs(info->pns_call_id), ntohs(info->pac_call_id));
- return NF_ACCEPT;
-}
-
-static const unsigned int pptp_msg_size[] = {
- [PPTP_START_SESSION_REQUEST] = sizeof(struct PptpStartSessionRequest),
- [PPTP_START_SESSION_REPLY] = sizeof(struct PptpStartSessionReply),
- [PPTP_STOP_SESSION_REQUEST] = sizeof(struct PptpStopSessionRequest),
- [PPTP_STOP_SESSION_REPLY] = sizeof(struct PptpStopSessionReply),
- [PPTP_OUT_CALL_REQUEST] = sizeof(struct PptpOutCallRequest),
- [PPTP_OUT_CALL_REPLY] = sizeof(struct PptpOutCallReply),
- [PPTP_IN_CALL_REQUEST] = sizeof(struct PptpInCallRequest),
- [PPTP_IN_CALL_REPLY] = sizeof(struct PptpInCallReply),
- [PPTP_IN_CALL_CONNECT] = sizeof(struct PptpInCallConnected),
- [PPTP_CALL_CLEAR_REQUEST] = sizeof(struct PptpClearCallRequest),
- [PPTP_CALL_DISCONNECT_NOTIFY] = sizeof(struct PptpCallDisconnectNotify),
- [PPTP_WAN_ERROR_NOTIFY] = sizeof(struct PptpWanErrorNotify),
- [PPTP_SET_LINK_INFO] = sizeof(struct PptpSetLinkInfo),
-};
-
-/* track caller id inside control connection, call expect_related */
-static int
-conntrack_pptp_help(struct sk_buff **pskb,
- struct ip_conntrack *ct, enum ip_conntrack_info ctinfo)
-
-{
- int dir = CTINFO2DIR(ctinfo);
- struct ip_ct_pptp_master *info = &ct->help.ct_pptp_info;
- struct tcphdr _tcph, *tcph;
- struct pptp_pkt_hdr _pptph, *pptph;
- struct PptpControlHeader _ctlh, *ctlh;
- union pptp_ctrl_union _pptpReq, *pptpReq;
- unsigned int tcplen = (*pskb)->len - (*pskb)->nh.iph->ihl * 4;
- unsigned int datalen, reqlen, nexthdr_off;
- int oldsstate, oldcstate;
- int ret;
- u_int16_t msg;
-
- /* don't do any tracking before tcp handshake complete */
- if (ctinfo != IP_CT_ESTABLISHED
- && ctinfo != IP_CT_ESTABLISHED+IP_CT_IS_REPLY) {
- DEBUGP("ctinfo = %u, skipping\n", ctinfo);
- return NF_ACCEPT;
- }
-
- nexthdr_off = (*pskb)->nh.iph->ihl*4;
- tcph = skb_header_pointer(*pskb, nexthdr_off, sizeof(_tcph), &_tcph);
- BUG_ON(!tcph);
- nexthdr_off += tcph->doff * 4;
- datalen = tcplen - tcph->doff * 4;
-
- pptph = skb_header_pointer(*pskb, nexthdr_off, sizeof(_pptph), &_pptph);
- if (!pptph) {
- DEBUGP("no full PPTP header, can't track\n");
- return NF_ACCEPT;
- }
- nexthdr_off += sizeof(_pptph);
- datalen -= sizeof(_pptph);
-
- /* if it's not a control message we can't do anything with it */
- if (ntohs(pptph->packetType) != PPTP_PACKET_CONTROL ||
- ntohl(pptph->magicCookie) != PPTP_MAGIC_COOKIE) {
- DEBUGP("not a control packet\n");
- return NF_ACCEPT;
- }
-
- ctlh = skb_header_pointer(*pskb, nexthdr_off, sizeof(_ctlh), &_ctlh);
- if (!ctlh)
- return NF_ACCEPT;
- nexthdr_off += sizeof(_ctlh);
- datalen -= sizeof(_ctlh);
-
- reqlen = datalen;
- msg = ntohs(ctlh->messageType);
- if (msg > 0 && msg <= PPTP_MSG_MAX && reqlen < pptp_msg_size[msg])
- return NF_ACCEPT;
- if (reqlen > sizeof(*pptpReq))
- reqlen = sizeof(*pptpReq);
-
- pptpReq = skb_header_pointer(*pskb, nexthdr_off, reqlen, &_pptpReq);
- if (!pptpReq)
- return NF_ACCEPT;
-
- oldsstate = info->sstate;
- oldcstate = info->cstate;
-
- spin_lock_bh(&ip_pptp_lock);
-
- /* FIXME: We just blindly assume that the control connection is always
- * established from PNS->PAC. However, RFC makes no guarantee */
- if (dir == IP_CT_DIR_ORIGINAL)
- /* client -> server (PNS -> PAC) */
- ret = pptp_outbound_pkt(pskb, ctlh, pptpReq, reqlen, ct,
- ctinfo);
- else
- /* server -> client (PAC -> PNS) */
- ret = pptp_inbound_pkt(pskb, ctlh, pptpReq, reqlen, ct,
- ctinfo);
- DEBUGP("sstate: %d->%d, cstate: %d->%d\n",
- oldsstate, info->sstate, oldcstate, info->cstate);
- spin_unlock_bh(&ip_pptp_lock);
-
- return ret;
-}
-
-/* control protocol helper */
-static struct ip_conntrack_helper pptp = {
- .list = { NULL, NULL },
- .name = "pptp",
- .me = THIS_MODULE,
- .max_expected = 2,
- .timeout = 5 * 60,
- .tuple = { .src = { .ip = 0,
- .u = { .tcp = { .port =
- __constant_htons(PPTP_CONTROL_PORT) } }
- },
- .dst = { .ip = 0,
- .u = { .all = 0 },
- .protonum = IPPROTO_TCP
- }
- },
- .mask = { .src = { .ip = 0,
- .u = { .tcp = { .port = __constant_htons(0xffff) } }
- },
- .dst = { .ip = 0,
- .u = { .all = 0 },
- .protonum = 0xff
- }
- },
- .help = conntrack_pptp_help,
- .destroy = pptp_destroy_siblings,
-};
-
-extern void ip_ct_proto_gre_fini(void);
-extern int __init ip_ct_proto_gre_init(void);
-
-/* ip_conntrack_pptp initialization */
-static int __init ip_conntrack_helper_pptp_init(void)
-{
- int retcode;
-
- retcode = ip_ct_proto_gre_init();
- if (retcode < 0)
- return retcode;
-
- DEBUGP(" registering helper\n");
- if ((retcode = ip_conntrack_helper_register(&pptp))) {
- printk(KERN_ERR "Unable to register conntrack application "
- "helper for pptp: %d\n", retcode);
- ip_ct_proto_gre_fini();
- return retcode;
- }
-
- printk("ip_conntrack_pptp version %s loaded\n", IP_CT_PPTP_VERSION);
- return 0;
-}
-
-static void __exit ip_conntrack_helper_pptp_fini(void)
-{
- ip_conntrack_helper_unregister(&pptp);
- ip_ct_proto_gre_fini();
- printk("ip_conntrack_pptp version %s unloaded\n", IP_CT_PPTP_VERSION);
-}
-
-module_init(ip_conntrack_helper_pptp_init);
-module_exit(ip_conntrack_helper_pptp_fini);
-
-EXPORT_SYMBOL(ip_nat_pptp_hook_outbound);
-EXPORT_SYMBOL(ip_nat_pptp_hook_inbound);
-EXPORT_SYMBOL(ip_nat_pptp_hook_exp_gre);
-EXPORT_SYMBOL(ip_nat_pptp_hook_expectfn);
diff --git a/net/ipv4/netfilter/ip_conntrack_irc.c b/net/ipv4/netfilter/ip_conntrack_irc.c
deleted file mode 100644
index 91832eca4106..000000000000
--- a/net/ipv4/netfilter/ip_conntrack_irc.c
+++ /dev/null
@@ -1,314 +0,0 @@
-/* IRC extension for IP connection tracking, Version 1.21
- * (C) 2000-2002 by Harald Welte <laforge@gnumonks.org>
- * based on RR's ip_conntrack_ftp.c
- *
- * ip_conntrack_irc.c,v 1.21 2002/02/05 14:49:26 laforge Exp
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- **
- * Module load syntax:
- * insmod ip_conntrack_irc.o ports=port1,port2,...port<MAX_PORTS>
- * max_dcc_channels=n dcc_timeout=secs
- *
- * please give the ports of all IRC servers You wish to connect to.
- * If You don't specify ports, the default will be port 6667.
- * With max_dcc_channels you can define the maximum number of not
- * yet answered DCC channels per IRC session (default 8).
- * With dcc_timeout you can specify how long the system waits for
- * an expected DCC channel (default 300 seconds).
- *
- */
-
-#include <linux/module.h>
-#include <linux/netfilter.h>
-#include <linux/ip.h>
-#include <net/checksum.h>
-#include <net/tcp.h>
-
-#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
-#include <linux/netfilter_ipv4/ip_conntrack_irc.h>
-#include <linux/moduleparam.h>
-
-#define MAX_PORTS 8
-static unsigned short ports[MAX_PORTS];
-static int ports_c;
-static unsigned int max_dcc_channels = 8;
-static unsigned int dcc_timeout = 300;
-/* This is slow, but it's simple. --RR */
-static char *irc_buffer;
-static DEFINE_SPINLOCK(irc_buffer_lock);
-
-unsigned int (*ip_nat_irc_hook)(struct sk_buff **pskb,
- enum ip_conntrack_info ctinfo,
- unsigned int matchoff,
- unsigned int matchlen,
- struct ip_conntrack_expect *exp);
-EXPORT_SYMBOL_GPL(ip_nat_irc_hook);
-
-MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
-MODULE_DESCRIPTION("IRC (DCC) connection tracking helper");
-MODULE_LICENSE("GPL");
-module_param_array(ports, ushort, &ports_c, 0400);
-MODULE_PARM_DESC(ports, "port numbers of IRC servers");
-module_param(max_dcc_channels, uint, 0400);
-MODULE_PARM_DESC(max_dcc_channels, "max number of expected DCC channels per IRC session");
-module_param(dcc_timeout, uint, 0400);
-MODULE_PARM_DESC(dcc_timeout, "timeout on for unestablished DCC channels");
-
-static const char *dccprotos[] = { "SEND ", "CHAT ", "MOVE ", "TSEND ", "SCHAT " };
-#define MINMATCHLEN 5
-
-#if 0
-#define DEBUGP(format, args...) printk(KERN_DEBUG "%s:%s:" format, \
- __FILE__, __FUNCTION__ , ## args)
-#else
-#define DEBUGP(format, args...)
-#endif
-
-static int parse_dcc(char *data, char *data_end, u_int32_t *ip,
- u_int16_t *port, char **ad_beg_p, char **ad_end_p)
-/* tries to get the ip_addr and port out of a dcc command
- return value: -1 on failure, 0 on success
- data pointer to first byte of DCC command data
- data_end pointer to last byte of dcc command data
- ip returns parsed ip of dcc command
- port returns parsed port of dcc command
- ad_beg_p returns pointer to first byte of addr data
- ad_end_p returns pointer to last byte of addr data */
-{
-
- /* at least 12: "AAAAAAAA P\1\n" */
- while (*data++ != ' ')
- if (data > data_end - 12)
- return -1;
-
- *ad_beg_p = data;
- *ip = simple_strtoul(data, &data, 10);
-
- /* skip blanks between ip and port */
- while (*data == ' ') {
- if (data >= data_end)
- return -1;
- data++;
- }
-
- *port = simple_strtoul(data, &data, 10);
- *ad_end_p = data;
-
- return 0;
-}
-
-static int help(struct sk_buff **pskb,
- struct ip_conntrack *ct, enum ip_conntrack_info ctinfo)
-{
- unsigned int dataoff;
- struct tcphdr _tcph, *th;
- char *data, *data_limit, *ib_ptr;
- int dir = CTINFO2DIR(ctinfo);
- struct ip_conntrack_expect *exp;
- u32 seq;
- u_int32_t dcc_ip;
- u_int16_t dcc_port;
- int i, ret = NF_ACCEPT;
- char *addr_beg_p, *addr_end_p;
- typeof(ip_nat_irc_hook) ip_nat_irc;
-
- DEBUGP("entered\n");
-
- /* If packet is coming from IRC server */
- if (dir == IP_CT_DIR_REPLY)
- return NF_ACCEPT;
-
- /* Until there's been traffic both ways, don't look in packets. */
- if (ctinfo != IP_CT_ESTABLISHED
- && ctinfo != IP_CT_ESTABLISHED + IP_CT_IS_REPLY) {
- DEBUGP("Conntrackinfo = %u\n", ctinfo);
- return NF_ACCEPT;
- }
-
- /* Not a full tcp header? */
- th = skb_header_pointer(*pskb, (*pskb)->nh.iph->ihl*4,
- sizeof(_tcph), &_tcph);
- if (th == NULL)
- return NF_ACCEPT;
-
- /* No data? */
- dataoff = (*pskb)->nh.iph->ihl*4 + th->doff*4;
- if (dataoff >= (*pskb)->len)
- return NF_ACCEPT;
-
- spin_lock_bh(&irc_buffer_lock);
- ib_ptr = skb_header_pointer(*pskb, dataoff,
- (*pskb)->len - dataoff, irc_buffer);
- BUG_ON(ib_ptr == NULL);
-
- data = ib_ptr;
- data_limit = ib_ptr + (*pskb)->len - dataoff;
-
- /* strlen("\1DCC SENT t AAAAAAAA P\1\n")=24
- * 5+MINMATCHLEN+strlen("t AAAAAAAA P\1\n")=14 */
- while (data < (data_limit - (19 + MINMATCHLEN))) {
- if (memcmp(data, "\1DCC ", 5)) {
- data++;
- continue;
- }
-
- data += 5;
- /* we have at least (19+MINMATCHLEN)-5 bytes valid data left */
-
- DEBUGP("DCC found in master %u.%u.%u.%u:%u %u.%u.%u.%u:%u...\n",
- NIPQUAD(iph->saddr), ntohs(th->source),
- NIPQUAD(iph->daddr), ntohs(th->dest));
-
- for (i = 0; i < ARRAY_SIZE(dccprotos); i++) {
- if (memcmp(data, dccprotos[i], strlen(dccprotos[i]))) {
- /* no match */
- continue;
- }
-
- DEBUGP("DCC %s detected\n", dccprotos[i]);
- data += strlen(dccprotos[i]);
- /* we have at least
- * (19+MINMATCHLEN)-5-dccprotos[i].matchlen bytes valid
- * data left (== 14/13 bytes) */
- if (parse_dcc((char *)data, data_limit, &dcc_ip,
- &dcc_port, &addr_beg_p, &addr_end_p)) {
- /* unable to parse */
- DEBUGP("unable to parse dcc command\n");
- continue;
- }
- DEBUGP("DCC bound ip/port: %u.%u.%u.%u:%u\n",
- HIPQUAD(dcc_ip), dcc_port);
-
- /* dcc_ip can be the internal OR external (NAT'ed) IP
- * Tiago Sousa <mirage@kaotik.org> */
- if (ct->tuplehash[dir].tuple.src.ip != htonl(dcc_ip)
- && ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip != htonl(dcc_ip)) {
- if (net_ratelimit())
- printk(KERN_WARNING
- "Forged DCC command from "
- "%u.%u.%u.%u: %u.%u.%u.%u:%u\n",
- NIPQUAD(ct->tuplehash[dir].tuple.src.ip),
- HIPQUAD(dcc_ip), dcc_port);
-
- continue;
- }
-
- exp = ip_conntrack_expect_alloc(ct);
- if (exp == NULL) {
- ret = NF_DROP;
- goto out;
- }
-
- /* save position of address in dcc string,
- * necessary for NAT */
- DEBUGP("tcph->seq = %u\n", th->seq);
- seq = ntohl(th->seq) + (addr_beg_p - ib_ptr);
-
- /* We refer to the reverse direction ("!dir")
- * tuples here, because we're expecting
- * something in the other * direction.
- * Doesn't matter unless NAT is happening. */
- exp->tuple = ((struct ip_conntrack_tuple)
- { { 0, { 0 } },
- { ct->tuplehash[!dir].tuple.dst.ip,
- { .tcp = { htons(dcc_port) } },
- IPPROTO_TCP }});
- exp->mask = ((struct ip_conntrack_tuple)
- { { 0, { 0 } },
- { htonl(0xFFFFFFFF),
- { .tcp = { htons(0xFFFF) } }, 0xFF }});
- exp->expectfn = NULL;
- exp->flags = 0;
- ip_nat_irc = rcu_dereference(ip_nat_irc_hook);
- if (ip_nat_irc)
- ret = ip_nat_irc(pskb, ctinfo,
- addr_beg_p - ib_ptr,
- addr_end_p - addr_beg_p,
- exp);
- else if (ip_conntrack_expect_related(exp) != 0)
- ret = NF_DROP;
- ip_conntrack_expect_put(exp);
- goto out;
- } /* for .. NUM_DCCPROTO */
- } /* while data < ... */
-
- out:
- spin_unlock_bh(&irc_buffer_lock);
- return ret;
-}
-
-static struct ip_conntrack_helper irc_helpers[MAX_PORTS];
-static char irc_names[MAX_PORTS][sizeof("irc-65535")];
-
-static void ip_conntrack_irc_fini(void);
-
-static int __init ip_conntrack_irc_init(void)
-{
- int i, ret;
- struct ip_conntrack_helper *hlpr;
- char *tmpname;
-
- if (max_dcc_channels < 1) {
- printk("ip_conntrack_irc: max_dcc_channels must be a positive integer\n");
- return -EBUSY;
- }
-
- irc_buffer = kmalloc(65536, GFP_KERNEL);
- if (!irc_buffer)
- return -ENOMEM;
-
- /* If no port given, default to standard irc port */
- if (ports_c == 0)
- ports[ports_c++] = IRC_PORT;
-
- for (i = 0; i < ports_c; i++) {
- hlpr = &irc_helpers[i];
- hlpr->tuple.src.u.tcp.port = htons(ports[i]);
- hlpr->tuple.dst.protonum = IPPROTO_TCP;
- hlpr->mask.src.u.tcp.port = htons(0xFFFF);
- hlpr->mask.dst.protonum = 0xFF;
- hlpr->max_expected = max_dcc_channels;
- hlpr->timeout = dcc_timeout;
- hlpr->me = THIS_MODULE;
- hlpr->help = help;
-
- tmpname = &irc_names[i][0];
- if (ports[i] == IRC_PORT)
- sprintf(tmpname, "irc");
- else
- sprintf(tmpname, "irc-%d", i);
- hlpr->name = tmpname;
-
- DEBUGP("port #%d: %d\n", i, ports[i]);
-
- ret = ip_conntrack_helper_register(hlpr);
-
- if (ret) {
- printk("ip_conntrack_irc: ERROR registering port %d\n",
- ports[i]);
- ip_conntrack_irc_fini();
- return -EBUSY;
- }
- }
- return 0;
-}
-
-/* This function is intentionally _NOT_ defined as __exit, because
- * it is needed by the init function */
-static void ip_conntrack_irc_fini(void)
-{
- int i;
- for (i = 0; i < ports_c; i++) {
- DEBUGP("unregistering port %d\n",
- ports[i]);
- ip_conntrack_helper_unregister(&irc_helpers[i]);
- }
- kfree(irc_buffer);
-}
-
-module_init(ip_conntrack_irc_init);
-module_exit(ip_conntrack_irc_fini);
diff --git a/net/ipv4/netfilter/ip_conntrack_netbios_ns.c b/net/ipv4/netfilter/ip_conntrack_netbios_ns.c
deleted file mode 100644
index a1d6a89f64aa..000000000000
--- a/net/ipv4/netfilter/ip_conntrack_netbios_ns.c
+++ /dev/null
@@ -1,143 +0,0 @@
-/*
- * NetBIOS name service broadcast connection tracking helper
- *
- * (c) 2005 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-/*
- * This helper tracks locally originating NetBIOS name service
- * requests by issuing permanent expectations (valid until
- * timing out) matching all reply connections from the
- * destination network. The only NetBIOS specific thing is
- * actually the port number.
- */
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/skbuff.h>
-#include <linux/netdevice.h>
-#include <linux/inetdevice.h>
-#include <linux/if_addr.h>
-#include <linux/in.h>
-#include <linux/ip.h>
-#include <net/route.h>
-
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv4.h>
-#include <linux/netfilter_ipv4/ip_conntrack.h>
-#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
-
-#define NMBD_PORT 137
-
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_DESCRIPTION("NetBIOS name service broadcast connection tracking helper");
-MODULE_LICENSE("GPL");
-
-static unsigned int timeout = 3;
-module_param(timeout, uint, 0400);
-MODULE_PARM_DESC(timeout, "timeout for master connection/replies in seconds");
-
-static int help(struct sk_buff **pskb,
- struct ip_conntrack *ct, enum ip_conntrack_info ctinfo)
-{
- struct ip_conntrack_expect *exp;
- struct iphdr *iph = (*pskb)->nh.iph;
- struct rtable *rt = (struct rtable *)(*pskb)->dst;
- struct in_device *in_dev;
- __be32 mask = 0;
-
- /* we're only interested in locally generated packets */
- if ((*pskb)->sk == NULL)
- goto out;
- if (rt == NULL || !(rt->rt_flags & RTCF_BROADCAST))
- goto out;
- if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
- goto out;
-
- rcu_read_lock();
- in_dev = __in_dev_get_rcu(rt->u.dst.dev);
- if (in_dev != NULL) {
- for_primary_ifa(in_dev) {
- if (ifa->ifa_broadcast == iph->daddr) {
- mask = ifa->ifa_mask;
- break;
- }
- } endfor_ifa(in_dev);
- }
- rcu_read_unlock();
-
- if (mask == 0)
- goto out;
-
- exp = ip_conntrack_expect_alloc(ct);
- if (exp == NULL)
- goto out;
-
- exp->tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
- exp->tuple.src.u.udp.port = htons(NMBD_PORT);
-
- exp->mask.src.ip = mask;
- exp->mask.src.u.udp.port = htons(0xFFFF);
- exp->mask.dst.ip = htonl(0xFFFFFFFF);
- exp->mask.dst.u.udp.port = htons(0xFFFF);
- exp->mask.dst.protonum = 0xFF;
-
- exp->expectfn = NULL;
- exp->flags = IP_CT_EXPECT_PERMANENT;
-
- ip_conntrack_expect_related(exp);
- ip_conntrack_expect_put(exp);
-
- ip_ct_refresh(ct, *pskb, timeout * HZ);
-out:
- return NF_ACCEPT;
-}
-
-static struct ip_conntrack_helper helper = {
- .name = "netbios-ns",
- .tuple = {
- .src = {
- .u = {
- .udp = {
- .port = __constant_htons(NMBD_PORT),
- }
- }
- },
- .dst = {
- .protonum = IPPROTO_UDP,
- },
- },
- .mask = {
- .src = {
- .u = {
- .udp = {
- .port = __constant_htons(0xFFFF),
- }
- }
- },
- .dst = {
- .protonum = 0xFF,
- },
- },
- .max_expected = 1,
- .me = THIS_MODULE,
- .help = help,
-};
-
-static int __init ip_conntrack_netbios_ns_init(void)
-{
- helper.timeout = timeout;
- return ip_conntrack_helper_register(&helper);
-}
-
-static void __exit ip_conntrack_netbios_ns_fini(void)
-{
- ip_conntrack_helper_unregister(&helper);
-}
-
-module_init(ip_conntrack_netbios_ns_init);
-module_exit(ip_conntrack_netbios_ns_fini);
diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c
deleted file mode 100644
index 5fcf91d617cd..000000000000
--- a/net/ipv4/netfilter/ip_conntrack_netlink.c
+++ /dev/null
@@ -1,1575 +0,0 @@
-/* Connection tracking via netlink socket. Allows for user space
- * protocol helpers and general trouble making from userspace.
- *
- * (C) 2001 by Jay Schulist <jschlst@samba.org>
- * (C) 2002-2005 by Harald Welte <laforge@gnumonks.org>
- * (C) 2003 by Patrick Mchardy <kaber@trash.net>
- * (C) 2005-2006 by Pablo Neira Ayuso <pablo@eurodev.net>
- *
- * I've reworked this stuff to use attributes instead of conntrack
- * structures. 5.44 am. I need more tea. --pablo 05/07/11.
- *
- * Initial connection tracking via netlink development funded and
- * generally made possible by Network Robots, Inc. (www.networkrobots.com)
- *
- * Further development of this code funded by Astaro AG (http://www.astaro.com)
- *
- * This software may be used and distributed according to the terms
- * of the GNU General Public License, incorporated herein by reference.
- */
-
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/timer.h>
-#include <linux/skbuff.h>
-#include <linux/errno.h>
-#include <linux/netlink.h>
-#include <linux/spinlock.h>
-#include <linux/interrupt.h>
-#include <linux/notifier.h>
-
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv4/ip_conntrack.h>
-#include <linux/netfilter_ipv4/ip_conntrack_core.h>
-#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
-#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
-#include <linux/netfilter_ipv4/ip_nat_protocol.h>
-
-#include <linux/netfilter/nfnetlink.h>
-#include <linux/netfilter/nfnetlink_conntrack.h>
-
-MODULE_LICENSE("GPL");
-
-static char __initdata version[] = "0.90";
-
-static inline int
-ctnetlink_dump_tuples_proto(struct sk_buff *skb,
- const struct ip_conntrack_tuple *tuple,
- struct ip_conntrack_protocol *proto)
-{
- int ret = 0;
- struct nfattr *nest_parms = NFA_NEST(skb, CTA_TUPLE_PROTO);
-
- NFA_PUT(skb, CTA_PROTO_NUM, sizeof(u_int8_t), &tuple->dst.protonum);
-
- if (likely(proto->tuple_to_nfattr))
- ret = proto->tuple_to_nfattr(skb, tuple);
-
- NFA_NEST_END(skb, nest_parms);
-
- return ret;
-
-nfattr_failure:
- return -1;
-}
-
-static inline int
-ctnetlink_dump_tuples_ip(struct sk_buff *skb,
- const struct ip_conntrack_tuple *tuple)
-{
- struct nfattr *nest_parms = NFA_NEST(skb, CTA_TUPLE_IP);
-
- NFA_PUT(skb, CTA_IP_V4_SRC, sizeof(__be32), &tuple->src.ip);
- NFA_PUT(skb, CTA_IP_V4_DST, sizeof(__be32), &tuple->dst.ip);
-
- NFA_NEST_END(skb, nest_parms);
-
- return 0;
-
-nfattr_failure:
- return -1;
-}
-
-static inline int
-ctnetlink_dump_tuples(struct sk_buff *skb,
- const struct ip_conntrack_tuple *tuple)
-{
- int ret;
- struct ip_conntrack_protocol *proto;
-
- ret = ctnetlink_dump_tuples_ip(skb, tuple);
- if (unlikely(ret < 0))
- return ret;
-
- proto = ip_conntrack_proto_find_get(tuple->dst.protonum);
- ret = ctnetlink_dump_tuples_proto(skb, tuple, proto);
- ip_conntrack_proto_put(proto);
-
- return ret;
-}
-
-static inline int
-ctnetlink_dump_status(struct sk_buff *skb, const struct ip_conntrack *ct)
-{
- __be32 status = htonl((u_int32_t) ct->status);
- NFA_PUT(skb, CTA_STATUS, sizeof(status), &status);
- return 0;
-
-nfattr_failure:
- return -1;
-}
-
-static inline int
-ctnetlink_dump_timeout(struct sk_buff *skb, const struct ip_conntrack *ct)
-{
- long timeout_l = ct->timeout.expires - jiffies;
- __be32 timeout;
-
- if (timeout_l < 0)
- timeout = 0;
- else
- timeout = htonl(timeout_l / HZ);
-
- NFA_PUT(skb, CTA_TIMEOUT, sizeof(timeout), &timeout);
- return 0;
-
-nfattr_failure:
- return -1;
-}
-
-static inline int
-ctnetlink_dump_protoinfo(struct sk_buff *skb, const struct ip_conntrack *ct)
-{
- struct ip_conntrack_protocol *proto = ip_conntrack_proto_find_get(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum);
-
- struct nfattr *nest_proto;
- int ret;
-
- if (!proto->to_nfattr) {
- ip_conntrack_proto_put(proto);
- return 0;
- }
-
- nest_proto = NFA_NEST(skb, CTA_PROTOINFO);
-
- ret = proto->to_nfattr(skb, nest_proto, ct);
-
- ip_conntrack_proto_put(proto);
-
- NFA_NEST_END(skb, nest_proto);
-
- return ret;
-
-nfattr_failure:
- ip_conntrack_proto_put(proto);
- return -1;
-}
-
-static inline int
-ctnetlink_dump_helpinfo(struct sk_buff *skb, const struct ip_conntrack *ct)
-{
- struct nfattr *nest_helper;
-
- if (!ct->helper)
- return 0;
-
- nest_helper = NFA_NEST(skb, CTA_HELP);
- NFA_PUT(skb, CTA_HELP_NAME, strlen(ct->helper->name), ct->helper->name);
-
- if (ct->helper->to_nfattr)
- ct->helper->to_nfattr(skb, ct);
-
- NFA_NEST_END(skb, nest_helper);
-
- return 0;
-
-nfattr_failure:
- return -1;
-}
-
-#ifdef CONFIG_IP_NF_CT_ACCT
-static inline int
-ctnetlink_dump_counters(struct sk_buff *skb, const struct ip_conntrack *ct,
- enum ip_conntrack_dir dir)
-{
- enum ctattr_type type = dir ? CTA_COUNTERS_REPLY: CTA_COUNTERS_ORIG;
- struct nfattr *nest_count = NFA_NEST(skb, type);
- __be32 tmp;
-
- tmp = htonl(ct->counters[dir].packets);
- NFA_PUT(skb, CTA_COUNTERS32_PACKETS, sizeof(__be32), &tmp);
-
- tmp = htonl(ct->counters[dir].bytes);
- NFA_PUT(skb, CTA_COUNTERS32_BYTES, sizeof(__be32), &tmp);
-
- NFA_NEST_END(skb, nest_count);
-
- return 0;
-
-nfattr_failure:
- return -1;
-}
-#else
-#define ctnetlink_dump_counters(a, b, c) (0)
-#endif
-
-#ifdef CONFIG_IP_NF_CONNTRACK_MARK
-static inline int
-ctnetlink_dump_mark(struct sk_buff *skb, const struct ip_conntrack *ct)
-{
- __be32 mark = htonl(ct->mark);
-
- NFA_PUT(skb, CTA_MARK, sizeof(__be32), &mark);
- return 0;
-
-nfattr_failure:
- return -1;
-}
-#else
-#define ctnetlink_dump_mark(a, b) (0)
-#endif
-
-static inline int
-ctnetlink_dump_id(struct sk_buff *skb, const struct ip_conntrack *ct)
-{
- __be32 id = htonl(ct->id);
- NFA_PUT(skb, CTA_ID, sizeof(__be32), &id);
- return 0;
-
-nfattr_failure:
- return -1;
-}
-
-static inline int
-ctnetlink_dump_use(struct sk_buff *skb, const struct ip_conntrack *ct)
-{
- __be32 use = htonl(atomic_read(&ct->ct_general.use));
-
- NFA_PUT(skb, CTA_USE, sizeof(__be32), &use);
- return 0;
-
-nfattr_failure:
- return -1;
-}
-
-#define tuple(ct, dir) (&(ct)->tuplehash[dir].tuple)
-
-static int
-ctnetlink_fill_info(struct sk_buff *skb, u32 pid, u32 seq,
- int event, int nowait,
- const struct ip_conntrack *ct)
-{
- struct nlmsghdr *nlh;
- struct nfgenmsg *nfmsg;
- struct nfattr *nest_parms;
- unsigned char *b;
-
- b = skb->tail;
-
- event |= NFNL_SUBSYS_CTNETLINK << 8;
- nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(struct nfgenmsg));
- nfmsg = NLMSG_DATA(nlh);
-
- nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
- nfmsg->nfgen_family = AF_INET;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = 0;
-
- nest_parms = NFA_NEST(skb, CTA_TUPLE_ORIG);
- if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_ORIGINAL)) < 0)
- goto nfattr_failure;
- NFA_NEST_END(skb, nest_parms);
-
- nest_parms = NFA_NEST(skb, CTA_TUPLE_REPLY);
- if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_REPLY)) < 0)
- goto nfattr_failure;
- NFA_NEST_END(skb, nest_parms);
-
- if (ctnetlink_dump_status(skb, ct) < 0 ||
- ctnetlink_dump_timeout(skb, ct) < 0 ||
- ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 ||
- ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0 ||
- ctnetlink_dump_protoinfo(skb, ct) < 0 ||
- ctnetlink_dump_helpinfo(skb, ct) < 0 ||
- ctnetlink_dump_mark(skb, ct) < 0 ||
- ctnetlink_dump_id(skb, ct) < 0 ||
- ctnetlink_dump_use(skb, ct) < 0)
- goto nfattr_failure;
-
- nlh->nlmsg_len = skb->tail - b;
- return skb->len;
-
-nlmsg_failure:
-nfattr_failure:
- skb_trim(skb, b - skb->data);
- return -1;
-}
-
-#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
-static int ctnetlink_conntrack_event(struct notifier_block *this,
- unsigned long events, void *ptr)
-{
- struct nlmsghdr *nlh;
- struct nfgenmsg *nfmsg;
- struct nfattr *nest_parms;
- struct ip_conntrack *ct = (struct ip_conntrack *)ptr;
- struct sk_buff *skb;
- unsigned int type;
- unsigned char *b;
- unsigned int flags = 0, group;
-
- /* ignore our fake conntrack entry */
- if (ct == &ip_conntrack_untracked)
- return NOTIFY_DONE;
-
- if (events & IPCT_DESTROY) {
- type = IPCTNL_MSG_CT_DELETE;
- group = NFNLGRP_CONNTRACK_DESTROY;
- } else if (events & (IPCT_NEW | IPCT_RELATED)) {
- type = IPCTNL_MSG_CT_NEW;
- flags = NLM_F_CREATE|NLM_F_EXCL;
- group = NFNLGRP_CONNTRACK_NEW;
- } else if (events & (IPCT_STATUS | IPCT_PROTOINFO)) {
- type = IPCTNL_MSG_CT_NEW;
- group = NFNLGRP_CONNTRACK_UPDATE;
- } else
- return NOTIFY_DONE;
-
- if (!nfnetlink_has_listeners(group))
- return NOTIFY_DONE;
-
- skb = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
- if (!skb)
- return NOTIFY_DONE;
-
- b = skb->tail;
-
- type |= NFNL_SUBSYS_CTNETLINK << 8;
- nlh = NLMSG_PUT(skb, 0, 0, type, sizeof(struct nfgenmsg));
- nfmsg = NLMSG_DATA(nlh);
-
- nlh->nlmsg_flags = flags;
- nfmsg->nfgen_family = AF_INET;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = 0;
-
- nest_parms = NFA_NEST(skb, CTA_TUPLE_ORIG);
- if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_ORIGINAL)) < 0)
- goto nfattr_failure;
- NFA_NEST_END(skb, nest_parms);
-
- nest_parms = NFA_NEST(skb, CTA_TUPLE_REPLY);
- if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_REPLY)) < 0)
- goto nfattr_failure;
- NFA_NEST_END(skb, nest_parms);
-
- if (events & IPCT_DESTROY) {
- if (ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 ||
- ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0)
- goto nfattr_failure;
- } else {
- if (ctnetlink_dump_status(skb, ct) < 0)
- goto nfattr_failure;
-
- if (ctnetlink_dump_timeout(skb, ct) < 0)
- goto nfattr_failure;
-
- if (events & IPCT_PROTOINFO
- && ctnetlink_dump_protoinfo(skb, ct) < 0)
- goto nfattr_failure;
-
- if ((events & IPCT_HELPER || ct->helper)
- && ctnetlink_dump_helpinfo(skb, ct) < 0)
- goto nfattr_failure;
-
- if ((events & IPCT_MARK || ct->mark)
- && ctnetlink_dump_mark(skb, ct) < 0)
- goto nfattr_failure;
-
- if (events & IPCT_COUNTER_FILLING &&
- (ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 ||
- ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0))
- goto nfattr_failure;
- }
-
- nlh->nlmsg_len = skb->tail - b;
- nfnetlink_send(skb, 0, group, 0);
- return NOTIFY_DONE;
-
-nlmsg_failure:
-nfattr_failure:
- kfree_skb(skb);
- return NOTIFY_DONE;
-}
-#endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */
-
-static int ctnetlink_done(struct netlink_callback *cb)
-{
- if (cb->args[1])
- ip_conntrack_put((struct ip_conntrack *)cb->args[1]);
- return 0;
-}
-
-static int
-ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
-{
- struct ip_conntrack *ct, *last;
- struct ip_conntrack_tuple_hash *h;
- struct list_head *i;
-
- read_lock_bh(&ip_conntrack_lock);
- last = (struct ip_conntrack *)cb->args[1];
- for (; cb->args[0] < ip_conntrack_htable_size; cb->args[0]++) {
-restart:
- list_for_each_prev(i, &ip_conntrack_hash[cb->args[0]]) {
- h = (struct ip_conntrack_tuple_hash *) i;
- if (DIRECTION(h) != IP_CT_DIR_ORIGINAL)
- continue;
- ct = tuplehash_to_ctrack(h);
- if (cb->args[1]) {
- if (ct != last)
- continue;
- cb->args[1] = 0;
- }
- if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid,
- cb->nlh->nlmsg_seq,
- IPCTNL_MSG_CT_NEW,
- 1, ct) < 0) {
- nf_conntrack_get(&ct->ct_general);
- cb->args[1] = (unsigned long)ct;
- goto out;
- }
-#ifdef CONFIG_NF_CT_ACCT
- if (NFNL_MSG_TYPE(cb->nlh->nlmsg_type) ==
- IPCTNL_MSG_CT_GET_CTRZERO)
- memset(&ct->counters, 0, sizeof(ct->counters));
-#endif
- }
- if (cb->args[1]) {
- cb->args[1] = 0;
- goto restart;
- }
- }
-out:
- read_unlock_bh(&ip_conntrack_lock);
- if (last)
- ip_conntrack_put(last);
-
- return skb->len;
-}
-
-static const size_t cta_min_ip[CTA_IP_MAX] = {
- [CTA_IP_V4_SRC-1] = sizeof(__be32),
- [CTA_IP_V4_DST-1] = sizeof(__be32),
-};
-
-static inline int
-ctnetlink_parse_tuple_ip(struct nfattr *attr, struct ip_conntrack_tuple *tuple)
-{
- struct nfattr *tb[CTA_IP_MAX];
-
- nfattr_parse_nested(tb, CTA_IP_MAX, attr);
-
- if (nfattr_bad_size(tb, CTA_IP_MAX, cta_min_ip))
- return -EINVAL;
-
- if (!tb[CTA_IP_V4_SRC-1])
- return -EINVAL;
- tuple->src.ip = *(__be32 *)NFA_DATA(tb[CTA_IP_V4_SRC-1]);
-
- if (!tb[CTA_IP_V4_DST-1])
- return -EINVAL;
- tuple->dst.ip = *(__be32 *)NFA_DATA(tb[CTA_IP_V4_DST-1]);
-
- return 0;
-}
-
-static const size_t cta_min_proto[CTA_PROTO_MAX] = {
- [CTA_PROTO_NUM-1] = sizeof(u_int8_t),
- [CTA_PROTO_SRC_PORT-1] = sizeof(u_int16_t),
- [CTA_PROTO_DST_PORT-1] = sizeof(u_int16_t),
- [CTA_PROTO_ICMP_TYPE-1] = sizeof(u_int8_t),
- [CTA_PROTO_ICMP_CODE-1] = sizeof(u_int8_t),
- [CTA_PROTO_ICMP_ID-1] = sizeof(u_int16_t),
-};
-
-static inline int
-ctnetlink_parse_tuple_proto(struct nfattr *attr,
- struct ip_conntrack_tuple *tuple)
-{
- struct nfattr *tb[CTA_PROTO_MAX];
- struct ip_conntrack_protocol *proto;
- int ret = 0;
-
- nfattr_parse_nested(tb, CTA_PROTO_MAX, attr);
-
- if (nfattr_bad_size(tb, CTA_PROTO_MAX, cta_min_proto))
- return -EINVAL;
-
- if (!tb[CTA_PROTO_NUM-1])
- return -EINVAL;
- tuple->dst.protonum = *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_NUM-1]);
-
- proto = ip_conntrack_proto_find_get(tuple->dst.protonum);
-
- if (likely(proto->nfattr_to_tuple))
- ret = proto->nfattr_to_tuple(tb, tuple);
-
- ip_conntrack_proto_put(proto);
-
- return ret;
-}
-
-static inline int
-ctnetlink_parse_tuple(struct nfattr *cda[], struct ip_conntrack_tuple *tuple,
- enum ctattr_tuple type)
-{
- struct nfattr *tb[CTA_TUPLE_MAX];
- int err;
-
- memset(tuple, 0, sizeof(*tuple));
-
- nfattr_parse_nested(tb, CTA_TUPLE_MAX, cda[type-1]);
-
- if (!tb[CTA_TUPLE_IP-1])
- return -EINVAL;
-
- err = ctnetlink_parse_tuple_ip(tb[CTA_TUPLE_IP-1], tuple);
- if (err < 0)
- return err;
-
- if (!tb[CTA_TUPLE_PROTO-1])
- return -EINVAL;
-
- err = ctnetlink_parse_tuple_proto(tb[CTA_TUPLE_PROTO-1], tuple);
- if (err < 0)
- return err;
-
- /* orig and expect tuples get DIR_ORIGINAL */
- if (type == CTA_TUPLE_REPLY)
- tuple->dst.dir = IP_CT_DIR_REPLY;
- else
- tuple->dst.dir = IP_CT_DIR_ORIGINAL;
-
- return 0;
-}
-
-#ifdef CONFIG_IP_NF_NAT_NEEDED
-static const size_t cta_min_protonat[CTA_PROTONAT_MAX] = {
- [CTA_PROTONAT_PORT_MIN-1] = sizeof(u_int16_t),
- [CTA_PROTONAT_PORT_MAX-1] = sizeof(u_int16_t),
-};
-
-static int ctnetlink_parse_nat_proto(struct nfattr *attr,
- const struct ip_conntrack *ct,
- struct ip_nat_range *range)
-{
- struct nfattr *tb[CTA_PROTONAT_MAX];
- struct ip_nat_protocol *npt;
-
- nfattr_parse_nested(tb, CTA_PROTONAT_MAX, attr);
-
- if (nfattr_bad_size(tb, CTA_PROTONAT_MAX, cta_min_protonat))
- return -EINVAL;
-
- npt = ip_nat_proto_find_get(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum);
-
- if (!npt->nfattr_to_range) {
- ip_nat_proto_put(npt);
- return 0;
- }
-
- /* nfattr_to_range returns 1 if it parsed, 0 if not, neg. on error */
- if (npt->nfattr_to_range(tb, range) > 0)
- range->flags |= IP_NAT_RANGE_PROTO_SPECIFIED;
-
- ip_nat_proto_put(npt);
-
- return 0;
-}
-
-static const size_t cta_min_nat[CTA_NAT_MAX] = {
- [CTA_NAT_MINIP-1] = sizeof(__be32),
- [CTA_NAT_MAXIP-1] = sizeof(__be32),
-};
-
-static inline int
-ctnetlink_parse_nat(struct nfattr *nat,
- const struct ip_conntrack *ct, struct ip_nat_range *range)
-{
- struct nfattr *tb[CTA_NAT_MAX];
- int err;
-
- memset(range, 0, sizeof(*range));
-
- nfattr_parse_nested(tb, CTA_NAT_MAX, nat);
-
- if (nfattr_bad_size(tb, CTA_NAT_MAX, cta_min_nat))
- return -EINVAL;
-
- if (tb[CTA_NAT_MINIP-1])
- range->min_ip = *(__be32 *)NFA_DATA(tb[CTA_NAT_MINIP-1]);
-
- if (!tb[CTA_NAT_MAXIP-1])
- range->max_ip = range->min_ip;
- else
- range->max_ip = *(__be32 *)NFA_DATA(tb[CTA_NAT_MAXIP-1]);
-
- if (range->min_ip)
- range->flags |= IP_NAT_RANGE_MAP_IPS;
-
- if (!tb[CTA_NAT_PROTO-1])
- return 0;
-
- err = ctnetlink_parse_nat_proto(tb[CTA_NAT_PROTO-1], ct, range);
- if (err < 0)
- return err;
-
- return 0;
-}
-#endif
-
-static inline int
-ctnetlink_parse_help(struct nfattr *attr, char **helper_name)
-{
- struct nfattr *tb[CTA_HELP_MAX];
-
- nfattr_parse_nested(tb, CTA_HELP_MAX, attr);
-
- if (!tb[CTA_HELP_NAME-1])
- return -EINVAL;
-
- *helper_name = NFA_DATA(tb[CTA_HELP_NAME-1]);
-
- return 0;
-}
-
-static const size_t cta_min[CTA_MAX] = {
- [CTA_STATUS-1] = sizeof(__be32),
- [CTA_TIMEOUT-1] = sizeof(__be32),
- [CTA_MARK-1] = sizeof(__be32),
- [CTA_USE-1] = sizeof(__be32),
- [CTA_ID-1] = sizeof(__be32)
-};
-
-static int
-ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb,
- struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
-{
- struct ip_conntrack_tuple_hash *h;
- struct ip_conntrack_tuple tuple;
- struct ip_conntrack *ct;
- int err = 0;
-
- if (nfattr_bad_size(cda, CTA_MAX, cta_min))
- return -EINVAL;
-
- if (cda[CTA_TUPLE_ORIG-1])
- err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG);
- else if (cda[CTA_TUPLE_REPLY-1])
- err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY);
- else {
- /* Flush the whole table */
- ip_conntrack_flush();
- return 0;
- }
-
- if (err < 0)
- return err;
-
- h = ip_conntrack_find_get(&tuple, NULL);
- if (!h)
- return -ENOENT;
-
- ct = tuplehash_to_ctrack(h);
-
- if (cda[CTA_ID-1]) {
- u_int32_t id = ntohl(*(__be32 *)NFA_DATA(cda[CTA_ID-1]));
- if (ct->id != id) {
- ip_conntrack_put(ct);
- return -ENOENT;
- }
- }
- if (del_timer(&ct->timeout))
- ct->timeout.function((unsigned long)ct);
-
- ip_conntrack_put(ct);
-
- return 0;
-}
-
-static int
-ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb,
- struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
-{
- struct ip_conntrack_tuple_hash *h;
- struct ip_conntrack_tuple tuple;
- struct ip_conntrack *ct;
- struct sk_buff *skb2 = NULL;
- int err = 0;
-
- if (nlh->nlmsg_flags & NLM_F_DUMP) {
- struct nfgenmsg *msg = NLMSG_DATA(nlh);
- u32 rlen;
-
- if (msg->nfgen_family != AF_INET)
- return -EAFNOSUPPORT;
-
-#ifndef CONFIG_IP_NF_CT_ACCT
- if (NFNL_MSG_TYPE(nlh->nlmsg_type) == IPCTNL_MSG_CT_GET_CTRZERO)
- return -ENOTSUPP;
-#endif
- if ((*errp = netlink_dump_start(ctnl, skb, nlh,
- ctnetlink_dump_table,
- ctnetlink_done)) != 0)
- return -EINVAL;
-
- rlen = NLMSG_ALIGN(nlh->nlmsg_len);
- if (rlen > skb->len)
- rlen = skb->len;
- skb_pull(skb, rlen);
- return 0;
- }
-
- if (nfattr_bad_size(cda, CTA_MAX, cta_min))
- return -EINVAL;
-
- if (cda[CTA_TUPLE_ORIG-1])
- err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG);
- else if (cda[CTA_TUPLE_REPLY-1])
- err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY);
- else
- return -EINVAL;
-
- if (err < 0)
- return err;
-
- h = ip_conntrack_find_get(&tuple, NULL);
- if (!h)
- return -ENOENT;
-
- ct = tuplehash_to_ctrack(h);
-
- err = -ENOMEM;
- skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
- if (!skb2) {
- ip_conntrack_put(ct);
- return -ENOMEM;
- }
-
- err = ctnetlink_fill_info(skb2, NETLINK_CB(skb).pid, nlh->nlmsg_seq,
- IPCTNL_MSG_CT_NEW, 1, ct);
- ip_conntrack_put(ct);
- if (err <= 0)
- goto free;
-
- err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT);
- if (err < 0)
- goto out;
-
- return 0;
-
-free:
- kfree_skb(skb2);
-out:
- return err;
-}
-
-static inline int
-ctnetlink_change_status(struct ip_conntrack *ct, struct nfattr *cda[])
-{
- unsigned long d;
- unsigned status = ntohl(*(__be32 *)NFA_DATA(cda[CTA_STATUS-1]));
- d = ct->status ^ status;
-
- if (d & (IPS_EXPECTED|IPS_CONFIRMED|IPS_DYING))
- /* unchangeable */
- return -EINVAL;
-
- if (d & IPS_SEEN_REPLY && !(status & IPS_SEEN_REPLY))
- /* SEEN_REPLY bit can only be set */
- return -EINVAL;
-
-
- if (d & IPS_ASSURED && !(status & IPS_ASSURED))
- /* ASSURED bit can only be set */
- return -EINVAL;
-
- if (cda[CTA_NAT_SRC-1] || cda[CTA_NAT_DST-1]) {
-#ifndef CONFIG_IP_NF_NAT_NEEDED
- return -EINVAL;
-#else
- struct ip_nat_range range;
-
- if (cda[CTA_NAT_DST-1]) {
- if (ctnetlink_parse_nat(cda[CTA_NAT_DST-1], ct,
- &range) < 0)
- return -EINVAL;
- if (ip_nat_initialized(ct,
- HOOK2MANIP(NF_IP_PRE_ROUTING)))
- return -EEXIST;
- ip_nat_setup_info(ct, &range, NF_IP_PRE_ROUTING);
- }
- if (cda[CTA_NAT_SRC-1]) {
- if (ctnetlink_parse_nat(cda[CTA_NAT_SRC-1], ct,
- &range) < 0)
- return -EINVAL;
- if (ip_nat_initialized(ct,
- HOOK2MANIP(NF_IP_POST_ROUTING)))
- return -EEXIST;
- ip_nat_setup_info(ct, &range, NF_IP_POST_ROUTING);
- }
-#endif
- }
-
- /* Be careful here, modifying NAT bits can screw up things,
- * so don't let users modify them directly if they don't pass
- * ip_nat_range. */
- ct->status |= status & ~(IPS_NAT_DONE_MASK | IPS_NAT_MASK);
- return 0;
-}
-
-
-static inline int
-ctnetlink_change_helper(struct ip_conntrack *ct, struct nfattr *cda[])
-{
- struct ip_conntrack_helper *helper;
- char *helpname;
- int err;
-
- /* don't change helper of sibling connections */
- if (ct->master)
- return -EINVAL;
-
- err = ctnetlink_parse_help(cda[CTA_HELP-1], &helpname);
- if (err < 0)
- return err;
-
- helper = __ip_conntrack_helper_find_byname(helpname);
- if (!helper) {
- if (!strcmp(helpname, ""))
- helper = NULL;
- else
- return -EINVAL;
- }
-
- if (ct->helper) {
- if (!helper) {
- /* we had a helper before ... */
- ip_ct_remove_expectations(ct);
- ct->helper = NULL;
- } else {
- /* need to zero data of old helper */
- memset(&ct->help, 0, sizeof(ct->help));
- }
- }
-
- ct->helper = helper;
-
- return 0;
-}
-
-static inline int
-ctnetlink_change_timeout(struct ip_conntrack *ct, struct nfattr *cda[])
-{
- u_int32_t timeout = ntohl(*(__be32 *)NFA_DATA(cda[CTA_TIMEOUT-1]));
-
- if (!del_timer(&ct->timeout))
- return -ETIME;
-
- ct->timeout.expires = jiffies + timeout * HZ;
- add_timer(&ct->timeout);
-
- return 0;
-}
-
-static inline int
-ctnetlink_change_protoinfo(struct ip_conntrack *ct, struct nfattr *cda[])
-{
- struct nfattr *tb[CTA_PROTOINFO_MAX], *attr = cda[CTA_PROTOINFO-1];
- struct ip_conntrack_protocol *proto;
- u_int16_t npt = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum;
- int err = 0;
-
- nfattr_parse_nested(tb, CTA_PROTOINFO_MAX, attr);
-
- proto = ip_conntrack_proto_find_get(npt);
-
- if (proto->from_nfattr)
- err = proto->from_nfattr(tb, ct);
- ip_conntrack_proto_put(proto);
-
- return err;
-}
-
-static int
-ctnetlink_change_conntrack(struct ip_conntrack *ct, struct nfattr *cda[])
-{
- int err;
-
- if (cda[CTA_HELP-1]) {
- err = ctnetlink_change_helper(ct, cda);
- if (err < 0)
- return err;
- }
-
- if (cda[CTA_TIMEOUT-1]) {
- err = ctnetlink_change_timeout(ct, cda);
- if (err < 0)
- return err;
- }
-
- if (cda[CTA_STATUS-1]) {
- err = ctnetlink_change_status(ct, cda);
- if (err < 0)
- return err;
- }
-
- if (cda[CTA_PROTOINFO-1]) {
- err = ctnetlink_change_protoinfo(ct, cda);
- if (err < 0)
- return err;
- }
-
-#if defined(CONFIG_IP_NF_CONNTRACK_MARK)
- if (cda[CTA_MARK-1])
- ct->mark = ntohl(*(__be32 *)NFA_DATA(cda[CTA_MARK-1]));
-#endif
-
- return 0;
-}
-
-static int
-ctnetlink_create_conntrack(struct nfattr *cda[],
- struct ip_conntrack_tuple *otuple,
- struct ip_conntrack_tuple *rtuple)
-{
- struct ip_conntrack *ct;
- int err = -EINVAL;
-
- ct = ip_conntrack_alloc(otuple, rtuple);
- if (ct == NULL || IS_ERR(ct))
- return -ENOMEM;
-
- if (!cda[CTA_TIMEOUT-1])
- goto err;
- ct->timeout.expires = ntohl(*(__be32 *)NFA_DATA(cda[CTA_TIMEOUT-1]));
-
- ct->timeout.expires = jiffies + ct->timeout.expires * HZ;
- ct->status |= IPS_CONFIRMED;
-
- if (cda[CTA_STATUS-1]) {
- err = ctnetlink_change_status(ct, cda);
- if (err < 0)
- goto err;
- }
-
- if (cda[CTA_PROTOINFO-1]) {
- err = ctnetlink_change_protoinfo(ct, cda);
- if (err < 0)
- return err;
- }
-
-#if defined(CONFIG_IP_NF_CONNTRACK_MARK)
- if (cda[CTA_MARK-1])
- ct->mark = ntohl(*(__be32 *)NFA_DATA(cda[CTA_MARK-1]));
-#endif
-
- ct->helper = ip_conntrack_helper_find_get(rtuple);
-
- add_timer(&ct->timeout);
- ip_conntrack_hash_insert(ct);
-
- if (ct->helper)
- ip_conntrack_helper_put(ct->helper);
-
- return 0;
-
-err:
- ip_conntrack_free(ct);
- return err;
-}
-
-static int
-ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb,
- struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
-{
- struct ip_conntrack_tuple otuple, rtuple;
- struct ip_conntrack_tuple_hash *h = NULL;
- int err = 0;
-
- if (nfattr_bad_size(cda, CTA_MAX, cta_min))
- return -EINVAL;
-
- if (cda[CTA_TUPLE_ORIG-1]) {
- err = ctnetlink_parse_tuple(cda, &otuple, CTA_TUPLE_ORIG);
- if (err < 0)
- return err;
- }
-
- if (cda[CTA_TUPLE_REPLY-1]) {
- err = ctnetlink_parse_tuple(cda, &rtuple, CTA_TUPLE_REPLY);
- if (err < 0)
- return err;
- }
-
- write_lock_bh(&ip_conntrack_lock);
- if (cda[CTA_TUPLE_ORIG-1])
- h = __ip_conntrack_find(&otuple, NULL);
- else if (cda[CTA_TUPLE_REPLY-1])
- h = __ip_conntrack_find(&rtuple, NULL);
-
- if (h == NULL) {
- write_unlock_bh(&ip_conntrack_lock);
- err = -ENOENT;
- if (nlh->nlmsg_flags & NLM_F_CREATE)
- err = ctnetlink_create_conntrack(cda, &otuple, &rtuple);
- return err;
- }
- /* implicit 'else' */
-
- /* we only allow nat config for new conntracks */
- if (cda[CTA_NAT_SRC-1] || cda[CTA_NAT_DST-1]) {
- err = -EINVAL;
- goto out_unlock;
- }
-
- /* We manipulate the conntrack inside the global conntrack table lock,
- * so there's no need to increase the refcount */
- err = -EEXIST;
- if (!(nlh->nlmsg_flags & NLM_F_EXCL))
- err = ctnetlink_change_conntrack(tuplehash_to_ctrack(h), cda);
-
-out_unlock:
- write_unlock_bh(&ip_conntrack_lock);
- return err;
-}
-
-/***********************************************************************
- * EXPECT
- ***********************************************************************/
-
-static inline int
-ctnetlink_exp_dump_tuple(struct sk_buff *skb,
- const struct ip_conntrack_tuple *tuple,
- enum ctattr_expect type)
-{
- struct nfattr *nest_parms = NFA_NEST(skb, type);
-
- if (ctnetlink_dump_tuples(skb, tuple) < 0)
- goto nfattr_failure;
-
- NFA_NEST_END(skb, nest_parms);
-
- return 0;
-
-nfattr_failure:
- return -1;
-}
-
-static inline int
-ctnetlink_exp_dump_mask(struct sk_buff *skb,
- const struct ip_conntrack_tuple *tuple,
- const struct ip_conntrack_tuple *mask)
-{
- int ret;
- struct ip_conntrack_protocol *proto;
- struct nfattr *nest_parms = NFA_NEST(skb, CTA_EXPECT_MASK);
-
- ret = ctnetlink_dump_tuples_ip(skb, mask);
- if (unlikely(ret < 0))
- goto nfattr_failure;
-
- proto = ip_conntrack_proto_find_get(tuple->dst.protonum);
- ret = ctnetlink_dump_tuples_proto(skb, mask, proto);
- ip_conntrack_proto_put(proto);
- if (unlikely(ret < 0))
- goto nfattr_failure;
-
- NFA_NEST_END(skb, nest_parms);
-
- return 0;
-
-nfattr_failure:
- return -1;
-}
-
-static inline int
-ctnetlink_exp_dump_expect(struct sk_buff *skb,
- const struct ip_conntrack_expect *exp)
-{
- struct ip_conntrack *master = exp->master;
- __be32 timeout = htonl((exp->timeout.expires - jiffies) / HZ);
- __be32 id = htonl(exp->id);
-
- if (ctnetlink_exp_dump_tuple(skb, &exp->tuple, CTA_EXPECT_TUPLE) < 0)
- goto nfattr_failure;
- if (ctnetlink_exp_dump_mask(skb, &exp->tuple, &exp->mask) < 0)
- goto nfattr_failure;
- if (ctnetlink_exp_dump_tuple(skb,
- &master->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
- CTA_EXPECT_MASTER) < 0)
- goto nfattr_failure;
-
- NFA_PUT(skb, CTA_EXPECT_TIMEOUT, sizeof(__be32), &timeout);
- NFA_PUT(skb, CTA_EXPECT_ID, sizeof(__be32), &id);
-
- return 0;
-
-nfattr_failure:
- return -1;
-}
-
-static int
-ctnetlink_exp_fill_info(struct sk_buff *skb, u32 pid, u32 seq,
- int event,
- int nowait,
- const struct ip_conntrack_expect *exp)
-{
- struct nlmsghdr *nlh;
- struct nfgenmsg *nfmsg;
- unsigned char *b;
-
- b = skb->tail;
-
- event |= NFNL_SUBSYS_CTNETLINK_EXP << 8;
- nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(struct nfgenmsg));
- nfmsg = NLMSG_DATA(nlh);
-
- nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
- nfmsg->nfgen_family = AF_INET;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = 0;
-
- if (ctnetlink_exp_dump_expect(skb, exp) < 0)
- goto nfattr_failure;
-
- nlh->nlmsg_len = skb->tail - b;
- return skb->len;
-
-nlmsg_failure:
-nfattr_failure:
- skb_trim(skb, b - skb->data);
- return -1;
-}
-
-#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
-static int ctnetlink_expect_event(struct notifier_block *this,
- unsigned long events, void *ptr)
-{
- struct nlmsghdr *nlh;
- struct nfgenmsg *nfmsg;
- struct ip_conntrack_expect *exp = (struct ip_conntrack_expect *)ptr;
- struct sk_buff *skb;
- unsigned int type;
- unsigned char *b;
- int flags = 0;
-
- if (events & IPEXP_NEW) {
- type = IPCTNL_MSG_EXP_NEW;
- flags = NLM_F_CREATE|NLM_F_EXCL;
- } else
- return NOTIFY_DONE;
-
- if (!nfnetlink_has_listeners(NFNLGRP_CONNTRACK_EXP_NEW))
- return NOTIFY_DONE;
-
- skb = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
- if (!skb)
- return NOTIFY_DONE;
-
- b = skb->tail;
-
- type |= NFNL_SUBSYS_CTNETLINK_EXP << 8;
- nlh = NLMSG_PUT(skb, 0, 0, type, sizeof(struct nfgenmsg));
- nfmsg = NLMSG_DATA(nlh);
-
- nlh->nlmsg_flags = flags;
- nfmsg->nfgen_family = AF_INET;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = 0;
-
- if (ctnetlink_exp_dump_expect(skb, exp) < 0)
- goto nfattr_failure;
-
- nlh->nlmsg_len = skb->tail - b;
- nfnetlink_send(skb, 0, NFNLGRP_CONNTRACK_EXP_NEW, 0);
- return NOTIFY_DONE;
-
-nlmsg_failure:
-nfattr_failure:
- kfree_skb(skb);
- return NOTIFY_DONE;
-}
-#endif
-
-static int
-ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
-{
- struct ip_conntrack_expect *exp = NULL;
- struct list_head *i;
- u_int32_t *id = (u_int32_t *) &cb->args[0];
-
- read_lock_bh(&ip_conntrack_lock);
- list_for_each_prev(i, &ip_conntrack_expect_list) {
- exp = (struct ip_conntrack_expect *) i;
- if (exp->id <= *id)
- continue;
- if (ctnetlink_exp_fill_info(skb, NETLINK_CB(cb->skb).pid,
- cb->nlh->nlmsg_seq,
- IPCTNL_MSG_EXP_NEW,
- 1, exp) < 0)
- goto out;
- *id = exp->id;
- }
-out:
- read_unlock_bh(&ip_conntrack_lock);
-
- return skb->len;
-}
-
-static const size_t cta_min_exp[CTA_EXPECT_MAX] = {
- [CTA_EXPECT_TIMEOUT-1] = sizeof(__be32),
- [CTA_EXPECT_ID-1] = sizeof(__be32)
-};
-
-static int
-ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb,
- struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
-{
- struct ip_conntrack_tuple tuple;
- struct ip_conntrack_expect *exp;
- struct sk_buff *skb2;
- int err = 0;
-
- if (nfattr_bad_size(cda, CTA_EXPECT_MAX, cta_min_exp))
- return -EINVAL;
-
- if (nlh->nlmsg_flags & NLM_F_DUMP) {
- struct nfgenmsg *msg = NLMSG_DATA(nlh);
- u32 rlen;
-
- if (msg->nfgen_family != AF_INET)
- return -EAFNOSUPPORT;
-
- if ((*errp = netlink_dump_start(ctnl, skb, nlh,
- ctnetlink_exp_dump_table,
- ctnetlink_done)) != 0)
- return -EINVAL;
- rlen = NLMSG_ALIGN(nlh->nlmsg_len);
- if (rlen > skb->len)
- rlen = skb->len;
- skb_pull(skb, rlen);
- return 0;
- }
-
- if (cda[CTA_EXPECT_MASTER-1])
- err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER);
- else
- return -EINVAL;
-
- if (err < 0)
- return err;
-
- exp = ip_conntrack_expect_find_get(&tuple);
- if (!exp)
- return -ENOENT;
-
- if (cda[CTA_EXPECT_ID-1]) {
- __be32 id = *(__be32 *)NFA_DATA(cda[CTA_EXPECT_ID-1]);
- if (exp->id != ntohl(id)) {
- ip_conntrack_expect_put(exp);
- return -ENOENT;
- }
- }
-
- err = -ENOMEM;
- skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
- if (!skb2)
- goto out;
-
- err = ctnetlink_exp_fill_info(skb2, NETLINK_CB(skb).pid,
- nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW,
- 1, exp);
- if (err <= 0)
- goto free;
-
- ip_conntrack_expect_put(exp);
-
- return netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT);
-
-free:
- kfree_skb(skb2);
-out:
- ip_conntrack_expect_put(exp);
- return err;
-}
-
-static int
-ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,
- struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
-{
- struct ip_conntrack_expect *exp, *tmp;
- struct ip_conntrack_tuple tuple;
- struct ip_conntrack_helper *h;
- int err;
-
- if (nfattr_bad_size(cda, CTA_EXPECT_MAX, cta_min_exp))
- return -EINVAL;
-
- if (cda[CTA_EXPECT_TUPLE-1]) {
- /* delete a single expect by tuple */
- err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE);
- if (err < 0)
- return err;
-
- /* bump usage count to 2 */
- exp = ip_conntrack_expect_find_get(&tuple);
- if (!exp)
- return -ENOENT;
-
- if (cda[CTA_EXPECT_ID-1]) {
- __be32 id =
- *(__be32 *)NFA_DATA(cda[CTA_EXPECT_ID-1]);
- if (exp->id != ntohl(id)) {
- ip_conntrack_expect_put(exp);
- return -ENOENT;
- }
- }
-
- /* after list removal, usage count == 1 */
- ip_conntrack_unexpect_related(exp);
- /* have to put what we 'get' above.
- * after this line usage count == 0 */
- ip_conntrack_expect_put(exp);
- } else if (cda[CTA_EXPECT_HELP_NAME-1]) {
- char *name = NFA_DATA(cda[CTA_EXPECT_HELP_NAME-1]);
-
- /* delete all expectations for this helper */
- write_lock_bh(&ip_conntrack_lock);
- h = __ip_conntrack_helper_find_byname(name);
- if (!h) {
- write_unlock_bh(&ip_conntrack_lock);
- return -EINVAL;
- }
- list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list,
- list) {
- if (exp->master->helper == h
- && del_timer(&exp->timeout)) {
- ip_ct_unlink_expect(exp);
- ip_conntrack_expect_put(exp);
- }
- }
- write_unlock_bh(&ip_conntrack_lock);
- } else {
- /* This basically means we have to flush everything*/
- write_lock_bh(&ip_conntrack_lock);
- list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list,
- list) {
- if (del_timer(&exp->timeout)) {
- ip_ct_unlink_expect(exp);
- ip_conntrack_expect_put(exp);
- }
- }
- write_unlock_bh(&ip_conntrack_lock);
- }
-
- return 0;
-}
-static int
-ctnetlink_change_expect(struct ip_conntrack_expect *x, struct nfattr *cda[])
-{
- return -EOPNOTSUPP;
-}
-
-static int
-ctnetlink_create_expect(struct nfattr *cda[])
-{
- struct ip_conntrack_tuple tuple, mask, master_tuple;
- struct ip_conntrack_tuple_hash *h = NULL;
- struct ip_conntrack_expect *exp;
- struct ip_conntrack *ct;
- int err = 0;
-
- /* caller guarantees that those three CTA_EXPECT_* exist */
- err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE);
- if (err < 0)
- return err;
- err = ctnetlink_parse_tuple(cda, &mask, CTA_EXPECT_MASK);
- if (err < 0)
- return err;
- err = ctnetlink_parse_tuple(cda, &master_tuple, CTA_EXPECT_MASTER);
- if (err < 0)
- return err;
-
- /* Look for master conntrack of this expectation */
- h = ip_conntrack_find_get(&master_tuple, NULL);
- if (!h)
- return -ENOENT;
- ct = tuplehash_to_ctrack(h);
-
- if (!ct->helper) {
- /* such conntrack hasn't got any helper, abort */
- err = -EINVAL;
- goto out;
- }
-
- exp = ip_conntrack_expect_alloc(ct);
- if (!exp) {
- err = -ENOMEM;
- goto out;
- }
-
- exp->expectfn = NULL;
- exp->flags = 0;
- exp->master = ct;
- memcpy(&exp->tuple, &tuple, sizeof(struct ip_conntrack_tuple));
- memcpy(&exp->mask, &mask, sizeof(struct ip_conntrack_tuple));
-
- err = ip_conntrack_expect_related(exp);
- ip_conntrack_expect_put(exp);
-
-out:
- ip_conntrack_put(tuplehash_to_ctrack(h));
- return err;
-}
-
-static int
-ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb,
- struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
-{
- struct ip_conntrack_tuple tuple;
- struct ip_conntrack_expect *exp;
- int err = 0;
-
- if (nfattr_bad_size(cda, CTA_EXPECT_MAX, cta_min_exp))
- return -EINVAL;
-
- if (!cda[CTA_EXPECT_TUPLE-1]
- || !cda[CTA_EXPECT_MASK-1]
- || !cda[CTA_EXPECT_MASTER-1])
- return -EINVAL;
-
- err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE);
- if (err < 0)
- return err;
-
- write_lock_bh(&ip_conntrack_lock);
- exp = __ip_conntrack_expect_find(&tuple);
-
- if (!exp) {
- write_unlock_bh(&ip_conntrack_lock);
- err = -ENOENT;
- if (nlh->nlmsg_flags & NLM_F_CREATE)
- err = ctnetlink_create_expect(cda);
- return err;
- }
-
- err = -EEXIST;
- if (!(nlh->nlmsg_flags & NLM_F_EXCL))
- err = ctnetlink_change_expect(exp, cda);
- write_unlock_bh(&ip_conntrack_lock);
-
- return err;
-}
-
-#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
-static struct notifier_block ctnl_notifier = {
- .notifier_call = ctnetlink_conntrack_event,
-};
-
-static struct notifier_block ctnl_notifier_exp = {
- .notifier_call = ctnetlink_expect_event,
-};
-#endif
-
-static struct nfnl_callback ctnl_cb[IPCTNL_MSG_MAX] = {
- [IPCTNL_MSG_CT_NEW] = { .call = ctnetlink_new_conntrack,
- .attr_count = CTA_MAX, },
- [IPCTNL_MSG_CT_GET] = { .call = ctnetlink_get_conntrack,
- .attr_count = CTA_MAX, },
- [IPCTNL_MSG_CT_DELETE] = { .call = ctnetlink_del_conntrack,
- .attr_count = CTA_MAX, },
- [IPCTNL_MSG_CT_GET_CTRZERO] = { .call = ctnetlink_get_conntrack,
- .attr_count = CTA_MAX, },
-};
-
-static struct nfnl_callback ctnl_exp_cb[IPCTNL_MSG_EXP_MAX] = {
- [IPCTNL_MSG_EXP_GET] = { .call = ctnetlink_get_expect,
- .attr_count = CTA_EXPECT_MAX, },
- [IPCTNL_MSG_EXP_NEW] = { .call = ctnetlink_new_expect,
- .attr_count = CTA_EXPECT_MAX, },
- [IPCTNL_MSG_EXP_DELETE] = { .call = ctnetlink_del_expect,
- .attr_count = CTA_EXPECT_MAX, },
-};
-
-static struct nfnetlink_subsystem ctnl_subsys = {
- .name = "conntrack",
- .subsys_id = NFNL_SUBSYS_CTNETLINK,
- .cb_count = IPCTNL_MSG_MAX,
- .cb = ctnl_cb,
-};
-
-static struct nfnetlink_subsystem ctnl_exp_subsys = {
- .name = "conntrack_expect",
- .subsys_id = NFNL_SUBSYS_CTNETLINK_EXP,
- .cb_count = IPCTNL_MSG_EXP_MAX,
- .cb = ctnl_exp_cb,
-};
-
-MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK);
-MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK_EXP);
-
-static int __init ctnetlink_init(void)
-{
- int ret;
-
- printk("ctnetlink v%s: registering with nfnetlink.\n", version);
- ret = nfnetlink_subsys_register(&ctnl_subsys);
- if (ret < 0) {
- printk("ctnetlink_init: cannot register with nfnetlink.\n");
- goto err_out;
- }
-
- ret = nfnetlink_subsys_register(&ctnl_exp_subsys);
- if (ret < 0) {
- printk("ctnetlink_init: cannot register exp with nfnetlink.\n");
- goto err_unreg_subsys;
- }
-
-#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
- ret = ip_conntrack_register_notifier(&ctnl_notifier);
- if (ret < 0) {
- printk("ctnetlink_init: cannot register notifier.\n");
- goto err_unreg_exp_subsys;
- }
-
- ret = ip_conntrack_expect_register_notifier(&ctnl_notifier_exp);
- if (ret < 0) {
- printk("ctnetlink_init: cannot expect register notifier.\n");
- goto err_unreg_notifier;
- }
-#endif
-
- return 0;
-
-#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
-err_unreg_notifier:
- ip_conntrack_unregister_notifier(&ctnl_notifier);
-err_unreg_exp_subsys:
- nfnetlink_subsys_unregister(&ctnl_exp_subsys);
-#endif
-err_unreg_subsys:
- nfnetlink_subsys_unregister(&ctnl_subsys);
-err_out:
- return ret;
-}
-
-static void __exit ctnetlink_exit(void)
-{
- printk("ctnetlink: unregistering from nfnetlink.\n");
-
-#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
- ip_conntrack_expect_unregister_notifier(&ctnl_notifier_exp);
- ip_conntrack_unregister_notifier(&ctnl_notifier);
-#endif
-
- nfnetlink_subsys_unregister(&ctnl_exp_subsys);
- nfnetlink_subsys_unregister(&ctnl_subsys);
- return;
-}
-
-module_init(ctnetlink_init);
-module_exit(ctnetlink_exit);
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_generic.c b/net/ipv4/netfilter/ip_conntrack_proto_generic.c
deleted file mode 100644
index 36f2b5e5d80a..000000000000
--- a/net/ipv4/netfilter/ip_conntrack_proto_generic.c
+++ /dev/null
@@ -1,75 +0,0 @@
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/types.h>
-#include <linux/sched.h>
-#include <linux/timer.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
-
-unsigned int ip_ct_generic_timeout __read_mostly = 600*HZ;
-
-static int generic_pkt_to_tuple(const struct sk_buff *skb,
- unsigned int dataoff,
- struct ip_conntrack_tuple *tuple)
-{
- tuple->src.u.all = 0;
- tuple->dst.u.all = 0;
-
- return 1;
-}
-
-static int generic_invert_tuple(struct ip_conntrack_tuple *tuple,
- const struct ip_conntrack_tuple *orig)
-{
- tuple->src.u.all = 0;
- tuple->dst.u.all = 0;
-
- return 1;
-}
-
-/* Print out the per-protocol part of the tuple. */
-static int generic_print_tuple(struct seq_file *s,
- const struct ip_conntrack_tuple *tuple)
-{
- return 0;
-}
-
-/* Print out the private part of the conntrack. */
-static int generic_print_conntrack(struct seq_file *s,
- const struct ip_conntrack *state)
-{
- return 0;
-}
-
-/* Returns verdict for packet, or -1 for invalid. */
-static int packet(struct ip_conntrack *conntrack,
- const struct sk_buff *skb,
- enum ip_conntrack_info ctinfo)
-{
- ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_generic_timeout);
- return NF_ACCEPT;
-}
-
-/* Called when a new connection for this protocol found. */
-static int new(struct ip_conntrack *conntrack, const struct sk_buff *skb)
-{
- return 1;
-}
-
-struct ip_conntrack_protocol ip_conntrack_generic_protocol =
-{
- .proto = 0,
- .name = "unknown",
- .pkt_to_tuple = generic_pkt_to_tuple,
- .invert_tuple = generic_invert_tuple,
- .print_tuple = generic_print_tuple,
- .print_conntrack = generic_print_conntrack,
- .packet = packet,
- .new = new,
-};
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_gre.c b/net/ipv4/netfilter/ip_conntrack_proto_gre.c
deleted file mode 100644
index 5fe026f467d3..000000000000
--- a/net/ipv4/netfilter/ip_conntrack_proto_gre.c
+++ /dev/null
@@ -1,330 +0,0 @@
-/*
- * ip_conntrack_proto_gre.c - Version 3.0
- *
- * Connection tracking protocol helper module for GRE.
- *
- * GRE is a generic encapsulation protocol, which is generally not very
- * suited for NAT, as it has no protocol-specific part as port numbers.
- *
- * It has an optional key field, which may help us distinguishing two
- * connections between the same two hosts.
- *
- * GRE is defined in RFC 1701 and RFC 1702, as well as RFC 2784
- *
- * PPTP is built on top of a modified version of GRE, and has a mandatory
- * field called "CallID", which serves us for the same purpose as the key
- * field in plain GRE.
- *
- * Documentation about PPTP can be found in RFC 2637
- *
- * (C) 2000-2005 by Harald Welte <laforge@gnumonks.org>
- *
- * Development of this code funded by Astaro AG (http://www.astaro.com/)
- *
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/timer.h>
-#include <linux/netfilter.h>
-#include <linux/ip.h>
-#include <linux/in.h>
-#include <linux/list.h>
-#include <linux/seq_file.h>
-#include <linux/interrupt.h>
-
-static DEFINE_RWLOCK(ip_ct_gre_lock);
-#define ASSERT_READ_LOCK(x)
-#define ASSERT_WRITE_LOCK(x)
-
-#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
-#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
-#include <linux/netfilter_ipv4/ip_conntrack_core.h>
-
-#include <linux/netfilter_ipv4/ip_conntrack_proto_gre.h>
-#include <linux/netfilter_ipv4/ip_conntrack_pptp.h>
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
-MODULE_DESCRIPTION("netfilter connection tracking protocol helper for GRE");
-
-/* shamelessly stolen from ip_conntrack_proto_udp.c */
-#define GRE_TIMEOUT (30*HZ)
-#define GRE_STREAM_TIMEOUT (180*HZ)
-
-#if 0
-#define DEBUGP(format, args...) printk(KERN_DEBUG "%s:%s: " format, __FILE__, __FUNCTION__, ## args)
-#define DUMP_TUPLE_GRE(x) printk("%u.%u.%u.%u:0x%x -> %u.%u.%u.%u:0x%x\n", \
- NIPQUAD((x)->src.ip), ntohs((x)->src.u.gre.key), \
- NIPQUAD((x)->dst.ip), ntohs((x)->dst.u.gre.key))
-#else
-#define DEBUGP(x, args...)
-#define DUMP_TUPLE_GRE(x)
-#endif
-
-/* GRE KEYMAP HANDLING FUNCTIONS */
-static LIST_HEAD(gre_keymap_list);
-
-static inline int gre_key_cmpfn(const struct ip_ct_gre_keymap *km,
- const struct ip_conntrack_tuple *t)
-{
- return ((km->tuple.src.ip == t->src.ip) &&
- (km->tuple.dst.ip == t->dst.ip) &&
- (km->tuple.dst.protonum == t->dst.protonum) &&
- (km->tuple.dst.u.all == t->dst.u.all));
-}
-
-/* look up the source key for a given tuple */
-static __be16 gre_keymap_lookup(struct ip_conntrack_tuple *t)
-{
- struct ip_ct_gre_keymap *km;
- __be16 key = 0;
-
- read_lock_bh(&ip_ct_gre_lock);
- list_for_each_entry(km, &gre_keymap_list, list) {
- if (gre_key_cmpfn(km, t)) {
- key = km->tuple.src.u.gre.key;
- break;
- }
- }
- read_unlock_bh(&ip_ct_gre_lock);
-
- DEBUGP("lookup src key 0x%x up key for ", key);
- DUMP_TUPLE_GRE(t);
-
- return key;
-}
-
-/* add a single keymap entry, associate with specified master ct */
-int
-ip_ct_gre_keymap_add(struct ip_conntrack *ct,
- struct ip_conntrack_tuple *t, int reply)
-{
- struct ip_ct_gre_keymap **exist_km, *km;
-
- if (!ct->helper || strcmp(ct->helper->name, "pptp")) {
- DEBUGP("refusing to add GRE keymap to non-pptp session\n");
- return -1;
- }
-
- if (!reply)
- exist_km = &ct->help.ct_pptp_info.keymap_orig;
- else
- exist_km = &ct->help.ct_pptp_info.keymap_reply;
-
- if (*exist_km) {
- /* check whether it's a retransmission */
- list_for_each_entry(km, &gre_keymap_list, list) {
- if (gre_key_cmpfn(km, t) && km == *exist_km)
- return 0;
- }
- DEBUGP("trying to override keymap_%s for ct %p\n",
- reply? "reply":"orig", ct);
- return -EEXIST;
- }
-
- km = kmalloc(sizeof(*km), GFP_ATOMIC);
- if (!km)
- return -ENOMEM;
-
- memcpy(&km->tuple, t, sizeof(*t));
- *exist_km = km;
-
- DEBUGP("adding new entry %p: ", km);
- DUMP_TUPLE_GRE(&km->tuple);
-
- write_lock_bh(&ip_ct_gre_lock);
- list_add_tail(&km->list, &gre_keymap_list);
- write_unlock_bh(&ip_ct_gre_lock);
-
- return 0;
-}
-
-/* destroy the keymap entries associated with specified master ct */
-void ip_ct_gre_keymap_destroy(struct ip_conntrack *ct)
-{
- DEBUGP("entering for ct %p\n", ct);
-
- if (!ct->helper || strcmp(ct->helper->name, "pptp")) {
- DEBUGP("refusing to destroy GRE keymap to non-pptp session\n");
- return;
- }
-
- write_lock_bh(&ip_ct_gre_lock);
- if (ct->help.ct_pptp_info.keymap_orig) {
- DEBUGP("removing %p from list\n",
- ct->help.ct_pptp_info.keymap_orig);
- list_del(&ct->help.ct_pptp_info.keymap_orig->list);
- kfree(ct->help.ct_pptp_info.keymap_orig);
- ct->help.ct_pptp_info.keymap_orig = NULL;
- }
- if (ct->help.ct_pptp_info.keymap_reply) {
- DEBUGP("removing %p from list\n",
- ct->help.ct_pptp_info.keymap_reply);
- list_del(&ct->help.ct_pptp_info.keymap_reply->list);
- kfree(ct->help.ct_pptp_info.keymap_reply);
- ct->help.ct_pptp_info.keymap_reply = NULL;
- }
- write_unlock_bh(&ip_ct_gre_lock);
-}
-
-
-/* PUBLIC CONNTRACK PROTO HELPER FUNCTIONS */
-
-/* invert gre part of tuple */
-static int gre_invert_tuple(struct ip_conntrack_tuple *tuple,
- const struct ip_conntrack_tuple *orig)
-{
- tuple->dst.u.gre.key = orig->src.u.gre.key;
- tuple->src.u.gre.key = orig->dst.u.gre.key;
-
- return 1;
-}
-
-/* gre hdr info to tuple */
-static int gre_pkt_to_tuple(const struct sk_buff *skb,
- unsigned int dataoff,
- struct ip_conntrack_tuple *tuple)
-{
- struct gre_hdr_pptp _pgrehdr, *pgrehdr;
- __be16 srckey;
- struct gre_hdr _grehdr, *grehdr;
-
- /* first only delinearize old RFC1701 GRE header */
- grehdr = skb_header_pointer(skb, dataoff, sizeof(_grehdr), &_grehdr);
- if (!grehdr || grehdr->version != GRE_VERSION_PPTP) {
- /* try to behave like "ip_conntrack_proto_generic" */
- tuple->src.u.all = 0;
- tuple->dst.u.all = 0;
- return 1;
- }
-
- /* PPTP header is variable length, only need up to the call_id field */
- pgrehdr = skb_header_pointer(skb, dataoff, 8, &_pgrehdr);
- if (!pgrehdr)
- return 1;
-
- if (ntohs(grehdr->protocol) != GRE_PROTOCOL_PPTP) {
- DEBUGP("GRE_VERSION_PPTP but unknown proto\n");
- return 0;
- }
-
- tuple->dst.u.gre.key = pgrehdr->call_id;
- srckey = gre_keymap_lookup(tuple);
- tuple->src.u.gre.key = srckey;
-
- return 1;
-}
-
-/* print gre part of tuple */
-static int gre_print_tuple(struct seq_file *s,
- const struct ip_conntrack_tuple *tuple)
-{
- return seq_printf(s, "srckey=0x%x dstkey=0x%x ",
- ntohs(tuple->src.u.gre.key),
- ntohs(tuple->dst.u.gre.key));
-}
-
-/* print private data for conntrack */
-static int gre_print_conntrack(struct seq_file *s,
- const struct ip_conntrack *ct)
-{
- return seq_printf(s, "timeout=%u, stream_timeout=%u ",
- (ct->proto.gre.timeout / HZ),
- (ct->proto.gre.stream_timeout / HZ));
-}
-
-/* Returns verdict for packet, and may modify conntrack */
-static int gre_packet(struct ip_conntrack *ct,
- const struct sk_buff *skb,
- enum ip_conntrack_info conntrackinfo)
-{
- /* If we've seen traffic both ways, this is a GRE connection.
- * Extend timeout. */
- if (ct->status & IPS_SEEN_REPLY) {
- ip_ct_refresh_acct(ct, conntrackinfo, skb,
- ct->proto.gre.stream_timeout);
- /* Also, more likely to be important, and not a probe. */
- set_bit(IPS_ASSURED_BIT, &ct->status);
- ip_conntrack_event_cache(IPCT_STATUS, skb);
- } else
- ip_ct_refresh_acct(ct, conntrackinfo, skb,
- ct->proto.gre.timeout);
-
- return NF_ACCEPT;
-}
-
-/* Called when a new connection for this protocol found. */
-static int gre_new(struct ip_conntrack *ct,
- const struct sk_buff *skb)
-{
- DEBUGP(": ");
- DUMP_TUPLE_GRE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
-
- /* initialize to sane value. Ideally a conntrack helper
- * (e.g. in case of pptp) is increasing them */
- ct->proto.gre.stream_timeout = GRE_STREAM_TIMEOUT;
- ct->proto.gre.timeout = GRE_TIMEOUT;
-
- return 1;
-}
-
-/* Called when a conntrack entry has already been removed from the hashes
- * and is about to be deleted from memory */
-static void gre_destroy(struct ip_conntrack *ct)
-{
- struct ip_conntrack *master = ct->master;
- DEBUGP(" entering\n");
-
- if (!master)
- DEBUGP("no master !?!\n");
- else
- ip_ct_gre_keymap_destroy(master);
-}
-
-/* protocol helper struct */
-static struct ip_conntrack_protocol gre = {
- .proto = IPPROTO_GRE,
- .name = "gre",
- .pkt_to_tuple = gre_pkt_to_tuple,
- .invert_tuple = gre_invert_tuple,
- .print_tuple = gre_print_tuple,
- .print_conntrack = gre_print_conntrack,
- .packet = gre_packet,
- .new = gre_new,
- .destroy = gre_destroy,
- .me = THIS_MODULE,
-#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
- defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
- .tuple_to_nfattr = ip_ct_port_tuple_to_nfattr,
- .nfattr_to_tuple = ip_ct_port_nfattr_to_tuple,
-#endif
-};
-
-/* ip_conntrack_proto_gre initialization */
-int __init ip_ct_proto_gre_init(void)
-{
- return ip_conntrack_protocol_register(&gre);
-}
-
-/* This cannot be __exit, as it is invoked from ip_conntrack_helper_pptp.c's
- * init() code on errors.
- */
-void ip_ct_proto_gre_fini(void)
-{
- struct list_head *pos, *n;
-
- /* delete all keymap entries */
- write_lock_bh(&ip_ct_gre_lock);
- list_for_each_safe(pos, n, &gre_keymap_list) {
- DEBUGP("deleting keymap %p at module unload time\n", pos);
- list_del(pos);
- kfree(pos);
- }
- write_unlock_bh(&ip_ct_gre_lock);
-
- ip_conntrack_protocol_unregister(&gre);
-}
-
-EXPORT_SYMBOL(ip_ct_gre_keymap_add);
-EXPORT_SYMBOL(ip_ct_gre_keymap_destroy);
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
deleted file mode 100644
index 295b6fa340db..000000000000
--- a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
+++ /dev/null
@@ -1,316 +0,0 @@
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/types.h>
-#include <linux/sched.h>
-#include <linux/timer.h>
-#include <linux/netfilter.h>
-#include <linux/in.h>
-#include <linux/icmp.h>
-#include <linux/seq_file.h>
-#include <linux/skbuff.h>
-#include <net/ip.h>
-#include <net/checksum.h>
-#include <linux/netfilter_ipv4.h>
-#include <linux/netfilter_ipv4/ip_conntrack.h>
-#include <linux/netfilter_ipv4/ip_conntrack_core.h>
-#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
-
-unsigned int ip_ct_icmp_timeout __read_mostly = 30*HZ;
-
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
-static int icmp_pkt_to_tuple(const struct sk_buff *skb,
- unsigned int dataoff,
- struct ip_conntrack_tuple *tuple)
-{
- struct icmphdr _hdr, *hp;
-
- hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
- if (hp == NULL)
- return 0;
-
- tuple->dst.u.icmp.type = hp->type;
- tuple->src.u.icmp.id = hp->un.echo.id;
- tuple->dst.u.icmp.code = hp->code;
-
- return 1;
-}
-
-/* Add 1; spaces filled with 0. */
-static const u_int8_t invmap[] = {
- [ICMP_ECHO] = ICMP_ECHOREPLY + 1,
- [ICMP_ECHOREPLY] = ICMP_ECHO + 1,
- [ICMP_TIMESTAMP] = ICMP_TIMESTAMPREPLY + 1,
- [ICMP_TIMESTAMPREPLY] = ICMP_TIMESTAMP + 1,
- [ICMP_INFO_REQUEST] = ICMP_INFO_REPLY + 1,
- [ICMP_INFO_REPLY] = ICMP_INFO_REQUEST + 1,
- [ICMP_ADDRESS] = ICMP_ADDRESSREPLY + 1,
- [ICMP_ADDRESSREPLY] = ICMP_ADDRESS + 1
-};
-
-static int icmp_invert_tuple(struct ip_conntrack_tuple *tuple,
- const struct ip_conntrack_tuple *orig)
-{
- if (orig->dst.u.icmp.type >= sizeof(invmap)
- || !invmap[orig->dst.u.icmp.type])
- return 0;
-
- tuple->src.u.icmp.id = orig->src.u.icmp.id;
- tuple->dst.u.icmp.type = invmap[orig->dst.u.icmp.type] - 1;
- tuple->dst.u.icmp.code = orig->dst.u.icmp.code;
- return 1;
-}
-
-/* Print out the per-protocol part of the tuple. */
-static int icmp_print_tuple(struct seq_file *s,
- const struct ip_conntrack_tuple *tuple)
-{
- return seq_printf(s, "type=%u code=%u id=%u ",
- tuple->dst.u.icmp.type,
- tuple->dst.u.icmp.code,
- ntohs(tuple->src.u.icmp.id));
-}
-
-/* Print out the private part of the conntrack. */
-static int icmp_print_conntrack(struct seq_file *s,
- const struct ip_conntrack *conntrack)
-{
- return 0;
-}
-
-/* Returns verdict for packet, or -1 for invalid. */
-static int icmp_packet(struct ip_conntrack *ct,
- const struct sk_buff *skb,
- enum ip_conntrack_info ctinfo)
-{
- /* Try to delete connection immediately after all replies:
- won't actually vanish as we still have skb, and del_timer
- means this will only run once even if count hits zero twice
- (theoretically possible with SMP) */
- if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) {
- if (atomic_dec_and_test(&ct->proto.icmp.count)
- && del_timer(&ct->timeout))
- ct->timeout.function((unsigned long)ct);
- } else {
- atomic_inc(&ct->proto.icmp.count);
- ip_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb);
- ip_ct_refresh_acct(ct, ctinfo, skb, ip_ct_icmp_timeout);
- }
-
- return NF_ACCEPT;
-}
-
-/* Called when a new connection for this protocol found. */
-static int icmp_new(struct ip_conntrack *conntrack,
- const struct sk_buff *skb)
-{
- static const u_int8_t valid_new[] = {
- [ICMP_ECHO] = 1,
- [ICMP_TIMESTAMP] = 1,
- [ICMP_INFO_REQUEST] = 1,
- [ICMP_ADDRESS] = 1
- };
-
- if (conntrack->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new)
- || !valid_new[conntrack->tuplehash[0].tuple.dst.u.icmp.type]) {
- /* Can't create a new ICMP `conn' with this. */
- DEBUGP("icmp: can't create new conn with type %u\n",
- conntrack->tuplehash[0].tuple.dst.u.icmp.type);
- DUMP_TUPLE(&conntrack->tuplehash[0].tuple);
- return 0;
- }
- atomic_set(&conntrack->proto.icmp.count, 0);
- return 1;
-}
-
-static int
-icmp_error_message(struct sk_buff *skb,
- enum ip_conntrack_info *ctinfo,
- unsigned int hooknum)
-{
- struct ip_conntrack_tuple innertuple, origtuple;
- struct {
- struct icmphdr icmp;
- struct iphdr ip;
- } _in, *inside;
- struct ip_conntrack_protocol *innerproto;
- struct ip_conntrack_tuple_hash *h;
- int dataoff;
-
- IP_NF_ASSERT(skb->nfct == NULL);
-
- /* Not enough header? */
- inside = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_in), &_in);
- if (inside == NULL)
- return -NF_ACCEPT;
-
- /* Ignore ICMP's containing fragments (shouldn't happen) */
- if (inside->ip.frag_off & htons(IP_OFFSET)) {
- DEBUGP("icmp_error_track: fragment of proto %u\n",
- inside->ip.protocol);
- return -NF_ACCEPT;
- }
-
- innerproto = ip_conntrack_proto_find_get(inside->ip.protocol);
- dataoff = skb->nh.iph->ihl*4 + sizeof(inside->icmp) + inside->ip.ihl*4;
- /* Are they talking about one of our connections? */
- if (!ip_ct_get_tuple(&inside->ip, skb, dataoff, &origtuple, innerproto)) {
- DEBUGP("icmp_error: ! get_tuple p=%u", inside->ip.protocol);
- ip_conntrack_proto_put(innerproto);
- return -NF_ACCEPT;
- }
-
- /* Ordinarily, we'd expect the inverted tupleproto, but it's
- been preserved inside the ICMP. */
- if (!ip_ct_invert_tuple(&innertuple, &origtuple, innerproto)) {
- DEBUGP("icmp_error_track: Can't invert tuple\n");
- ip_conntrack_proto_put(innerproto);
- return -NF_ACCEPT;
- }
- ip_conntrack_proto_put(innerproto);
-
- *ctinfo = IP_CT_RELATED;
-
- h = ip_conntrack_find_get(&innertuple, NULL);
- if (!h) {
- /* Locally generated ICMPs will match inverted if they
- haven't been SNAT'ed yet */
- /* FIXME: NAT code has to handle half-done double NAT --RR */
- if (hooknum == NF_IP_LOCAL_OUT)
- h = ip_conntrack_find_get(&origtuple, NULL);
-
- if (!h) {
- DEBUGP("icmp_error_track: no match\n");
- return -NF_ACCEPT;
- }
- /* Reverse direction from that found */
- if (DIRECTION(h) != IP_CT_DIR_REPLY)
- *ctinfo += IP_CT_IS_REPLY;
- } else {
- if (DIRECTION(h) == IP_CT_DIR_REPLY)
- *ctinfo += IP_CT_IS_REPLY;
- }
-
- /* Update skb to refer to this connection */
- skb->nfct = &tuplehash_to_ctrack(h)->ct_general;
- skb->nfctinfo = *ctinfo;
- return -NF_ACCEPT;
-}
-
-/* Small and modified version of icmp_rcv */
-static int
-icmp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
- unsigned int hooknum)
-{
- struct icmphdr _ih, *icmph;
-
- /* Not enough header? */
- icmph = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_ih), &_ih);
- if (icmph == NULL) {
- if (LOG_INVALID(IPPROTO_ICMP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
- "ip_ct_icmp: short packet ");
- return -NF_ACCEPT;
- }
-
- /* See ip_conntrack_proto_tcp.c */
- if (ip_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING &&
- nf_ip_checksum(skb, hooknum, skb->nh.iph->ihl * 4, 0)) {
- if (LOG_INVALID(IPPROTO_ICMP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
- "ip_ct_icmp: bad ICMP checksum ");
- return -NF_ACCEPT;
- }
-
- /*
- * 18 is the highest 'known' ICMP type. Anything else is a mystery
- *
- * RFC 1122: 3.2.2 Unknown ICMP messages types MUST be silently
- * discarded.
- */
- if (icmph->type > NR_ICMP_TYPES) {
- if (LOG_INVALID(IPPROTO_ICMP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
- "ip_ct_icmp: invalid ICMP type ");
- return -NF_ACCEPT;
- }
-
- /* Need to track icmp error message? */
- if (icmph->type != ICMP_DEST_UNREACH
- && icmph->type != ICMP_SOURCE_QUENCH
- && icmph->type != ICMP_TIME_EXCEEDED
- && icmph->type != ICMP_PARAMETERPROB
- && icmph->type != ICMP_REDIRECT)
- return NF_ACCEPT;
-
- return icmp_error_message(skb, ctinfo, hooknum);
-}
-
-#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
- defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
-static int icmp_tuple_to_nfattr(struct sk_buff *skb,
- const struct ip_conntrack_tuple *t)
-{
- NFA_PUT(skb, CTA_PROTO_ICMP_ID, sizeof(__be16),
- &t->src.u.icmp.id);
- NFA_PUT(skb, CTA_PROTO_ICMP_TYPE, sizeof(u_int8_t),
- &t->dst.u.icmp.type);
- NFA_PUT(skb, CTA_PROTO_ICMP_CODE, sizeof(u_int8_t),
- &t->dst.u.icmp.code);
-
- return 0;
-
-nfattr_failure:
- return -1;
-}
-
-static int icmp_nfattr_to_tuple(struct nfattr *tb[],
- struct ip_conntrack_tuple *tuple)
-{
- if (!tb[CTA_PROTO_ICMP_TYPE-1]
- || !tb[CTA_PROTO_ICMP_CODE-1]
- || !tb[CTA_PROTO_ICMP_ID-1])
- return -EINVAL;
-
- tuple->dst.u.icmp.type =
- *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_TYPE-1]);
- tuple->dst.u.icmp.code =
- *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_CODE-1]);
- tuple->src.u.icmp.id =
- *(__be16 *)NFA_DATA(tb[CTA_PROTO_ICMP_ID-1]);
-
- if (tuple->dst.u.icmp.type >= sizeof(invmap)
- || !invmap[tuple->dst.u.icmp.type])
- return -EINVAL;
-
- return 0;
-}
-#endif
-
-struct ip_conntrack_protocol ip_conntrack_protocol_icmp =
-{
- .proto = IPPROTO_ICMP,
- .name = "icmp",
- .pkt_to_tuple = icmp_pkt_to_tuple,
- .invert_tuple = icmp_invert_tuple,
- .print_tuple = icmp_print_tuple,
- .print_conntrack = icmp_print_conntrack,
- .packet = icmp_packet,
- .new = icmp_new,
- .error = icmp_error,
-#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
- defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
- .tuple_to_nfattr = icmp_tuple_to_nfattr,
- .nfattr_to_tuple = icmp_nfattr_to_tuple,
-#endif
-};
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c
deleted file mode 100644
index 2443322e4128..000000000000
--- a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c
+++ /dev/null
@@ -1,660 +0,0 @@
-/*
- * Connection tracking protocol helper module for SCTP.
- *
- * SCTP is defined in RFC 2960. References to various sections in this code
- * are to this RFC.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-/*
- * Added support for proc manipulation of timeouts.
- */
-
-#include <linux/types.h>
-#include <linux/sched.h>
-#include <linux/timer.h>
-#include <linux/interrupt.h>
-#include <linux/netfilter.h>
-#include <linux/module.h>
-#include <linux/in.h>
-#include <linux/ip.h>
-#include <linux/sctp.h>
-#include <linux/string.h>
-#include <linux/seq_file.h>
-
-#include <linux/netfilter_ipv4/ip_conntrack.h>
-#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
-
-#if 0
-#define DEBUGP(format, ...) printk(format, ## __VA_ARGS__)
-#else
-#define DEBUGP(format, args...)
-#endif
-
-/* Protects conntrack->proto.sctp */
-static DEFINE_RWLOCK(sctp_lock);
-
-/* FIXME: Examine ipfilter's timeouts and conntrack transitions more
- closely. They're more complex. --RR
-
- And so for me for SCTP :D -Kiran */
-
-static const char *sctp_conntrack_names[] = {
- "NONE",
- "CLOSED",
- "COOKIE_WAIT",
- "COOKIE_ECHOED",
- "ESTABLISHED",
- "SHUTDOWN_SENT",
- "SHUTDOWN_RECD",
- "SHUTDOWN_ACK_SENT",
-};
-
-#define SECS * HZ
-#define MINS * 60 SECS
-#define HOURS * 60 MINS
-#define DAYS * 24 HOURS
-
-static unsigned int ip_ct_sctp_timeout_closed __read_mostly = 10 SECS;
-static unsigned int ip_ct_sctp_timeout_cookie_wait __read_mostly = 3 SECS;
-static unsigned int ip_ct_sctp_timeout_cookie_echoed __read_mostly = 3 SECS;
-static unsigned int ip_ct_sctp_timeout_established __read_mostly = 5 DAYS;
-static unsigned int ip_ct_sctp_timeout_shutdown_sent __read_mostly = 300 SECS / 1000;
-static unsigned int ip_ct_sctp_timeout_shutdown_recd __read_mostly = 300 SECS / 1000;
-static unsigned int ip_ct_sctp_timeout_shutdown_ack_sent __read_mostly = 3 SECS;
-
-static const unsigned int * sctp_timeouts[]
-= { NULL, /* SCTP_CONNTRACK_NONE */
- &ip_ct_sctp_timeout_closed, /* SCTP_CONNTRACK_CLOSED */
- &ip_ct_sctp_timeout_cookie_wait, /* SCTP_CONNTRACK_COOKIE_WAIT */
- &ip_ct_sctp_timeout_cookie_echoed, /* SCTP_CONNTRACK_COOKIE_ECHOED */
- &ip_ct_sctp_timeout_established, /* SCTP_CONNTRACK_ESTABLISHED */
- &ip_ct_sctp_timeout_shutdown_sent, /* SCTP_CONNTRACK_SHUTDOWN_SENT */
- &ip_ct_sctp_timeout_shutdown_recd, /* SCTP_CONNTRACK_SHUTDOWN_RECD */
- &ip_ct_sctp_timeout_shutdown_ack_sent /* SCTP_CONNTRACK_SHUTDOWN_ACK_SENT */
- };
-
-#define sNO SCTP_CONNTRACK_NONE
-#define sCL SCTP_CONNTRACK_CLOSED
-#define sCW SCTP_CONNTRACK_COOKIE_WAIT
-#define sCE SCTP_CONNTRACK_COOKIE_ECHOED
-#define sES SCTP_CONNTRACK_ESTABLISHED
-#define sSS SCTP_CONNTRACK_SHUTDOWN_SENT
-#define sSR SCTP_CONNTRACK_SHUTDOWN_RECD
-#define sSA SCTP_CONNTRACK_SHUTDOWN_ACK_SENT
-#define sIV SCTP_CONNTRACK_MAX
-
-/*
- These are the descriptions of the states:
-
-NOTE: These state names are tantalizingly similar to the states of an
-SCTP endpoint. But the interpretation of the states is a little different,
-considering that these are the states of the connection and not of an end
-point. Please note the subtleties. -Kiran
-
-NONE - Nothing so far.
-COOKIE WAIT - We have seen an INIT chunk in the original direction, or also
- an INIT_ACK chunk in the reply direction.
-COOKIE ECHOED - We have seen a COOKIE_ECHO chunk in the original direction.
-ESTABLISHED - We have seen a COOKIE_ACK in the reply direction.
-SHUTDOWN_SENT - We have seen a SHUTDOWN chunk in the original direction.
-SHUTDOWN_RECD - We have seen a SHUTDOWN chunk in the reply directoin.
-SHUTDOWN_ACK_SENT - We have seen a SHUTDOWN_ACK chunk in the direction opposite
- to that of the SHUTDOWN chunk.
-CLOSED - We have seen a SHUTDOWN_COMPLETE chunk in the direction of
- the SHUTDOWN chunk. Connection is closed.
-*/
-
-/* TODO
- - I have assumed that the first INIT is in the original direction.
- This messes things when an INIT comes in the reply direction in CLOSED
- state.
- - Check the error type in the reply dir before transitioning from
-cookie echoed to closed.
- - Sec 5.2.4 of RFC 2960
- - Multi Homing support.
-*/
-
-/* SCTP conntrack state transitions */
-static const enum sctp_conntrack sctp_conntracks[2][9][SCTP_CONNTRACK_MAX] = {
- {
-/* ORIGINAL */
-/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA */
-/* init */ {sCW, sCW, sCW, sCE, sES, sSS, sSR, sSA},
-/* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},
-/* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL},
-/* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA},
-/* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA},
-/* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Cant have Stale cookie*/
-/* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA},/* 5.2.4 - Big TODO */
-/* cookie_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Cant come in orig dir */
-/* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL}
- },
- {
-/* REPLY */
-/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA */
-/* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* INIT in sCL Big TODO */
-/* init_ack */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},
-/* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL},
-/* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA},
-/* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA},
-/* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA},
-/* cookie_echo */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Cant come in reply dir */
-/* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA},
-/* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL}
- }
-};
-
-static int sctp_pkt_to_tuple(const struct sk_buff *skb,
- unsigned int dataoff,
- struct ip_conntrack_tuple *tuple)
-{
- sctp_sctphdr_t _hdr, *hp;
-
- DEBUGP(__FUNCTION__);
- DEBUGP("\n");
-
- /* Actually only need first 8 bytes. */
- hp = skb_header_pointer(skb, dataoff, 8, &_hdr);
- if (hp == NULL)
- return 0;
-
- tuple->src.u.sctp.port = hp->source;
- tuple->dst.u.sctp.port = hp->dest;
- return 1;
-}
-
-static int sctp_invert_tuple(struct ip_conntrack_tuple *tuple,
- const struct ip_conntrack_tuple *orig)
-{
- DEBUGP(__FUNCTION__);
- DEBUGP("\n");
-
- tuple->src.u.sctp.port = orig->dst.u.sctp.port;
- tuple->dst.u.sctp.port = orig->src.u.sctp.port;
- return 1;
-}
-
-/* Print out the per-protocol part of the tuple. */
-static int sctp_print_tuple(struct seq_file *s,
- const struct ip_conntrack_tuple *tuple)
-{
- DEBUGP(__FUNCTION__);
- DEBUGP("\n");
-
- return seq_printf(s, "sport=%hu dport=%hu ",
- ntohs(tuple->src.u.sctp.port),
- ntohs(tuple->dst.u.sctp.port));
-}
-
-/* Print out the private part of the conntrack. */
-static int sctp_print_conntrack(struct seq_file *s,
- const struct ip_conntrack *conntrack)
-{
- enum sctp_conntrack state;
-
- DEBUGP(__FUNCTION__);
- DEBUGP("\n");
-
- read_lock_bh(&sctp_lock);
- state = conntrack->proto.sctp.state;
- read_unlock_bh(&sctp_lock);
-
- return seq_printf(s, "%s ", sctp_conntrack_names[state]);
-}
-
-#define for_each_sctp_chunk(skb, sch, _sch, offset, count) \
-for (offset = skb->nh.iph->ihl * 4 + sizeof(sctp_sctphdr_t), count = 0; \
- offset < skb->len && \
- (sch = skb_header_pointer(skb, offset, sizeof(_sch), &_sch)); \
- offset += (ntohs(sch->length) + 3) & ~3, count++)
-
-/* Some validity checks to make sure the chunks are fine */
-static int do_basic_checks(struct ip_conntrack *conntrack,
- const struct sk_buff *skb,
- char *map)
-{
- u_int32_t offset, count;
- sctp_chunkhdr_t _sch, *sch;
- int flag;
-
- DEBUGP(__FUNCTION__);
- DEBUGP("\n");
-
- flag = 0;
-
- for_each_sctp_chunk (skb, sch, _sch, offset, count) {
- DEBUGP("Chunk Num: %d Type: %d\n", count, sch->type);
-
- if (sch->type == SCTP_CID_INIT
- || sch->type == SCTP_CID_INIT_ACK
- || sch->type == SCTP_CID_SHUTDOWN_COMPLETE) {
- flag = 1;
- }
-
- /*
- * Cookie Ack/Echo chunks not the first OR
- * Init / Init Ack / Shutdown compl chunks not the only chunks
- * OR zero-length.
- */
- if (((sch->type == SCTP_CID_COOKIE_ACK
- || sch->type == SCTP_CID_COOKIE_ECHO
- || flag)
- && count !=0) || !sch->length) {
- DEBUGP("Basic checks failed\n");
- return 1;
- }
-
- if (map) {
- set_bit(sch->type, (void *)map);
- }
- }
-
- DEBUGP("Basic checks passed\n");
- return count == 0;
-}
-
-static int new_state(enum ip_conntrack_dir dir,
- enum sctp_conntrack cur_state,
- int chunk_type)
-{
- int i;
-
- DEBUGP(__FUNCTION__);
- DEBUGP("\n");
-
- DEBUGP("Chunk type: %d\n", chunk_type);
-
- switch (chunk_type) {
- case SCTP_CID_INIT:
- DEBUGP("SCTP_CID_INIT\n");
- i = 0; break;
- case SCTP_CID_INIT_ACK:
- DEBUGP("SCTP_CID_INIT_ACK\n");
- i = 1; break;
- case SCTP_CID_ABORT:
- DEBUGP("SCTP_CID_ABORT\n");
- i = 2; break;
- case SCTP_CID_SHUTDOWN:
- DEBUGP("SCTP_CID_SHUTDOWN\n");
- i = 3; break;
- case SCTP_CID_SHUTDOWN_ACK:
- DEBUGP("SCTP_CID_SHUTDOWN_ACK\n");
- i = 4; break;
- case SCTP_CID_ERROR:
- DEBUGP("SCTP_CID_ERROR\n");
- i = 5; break;
- case SCTP_CID_COOKIE_ECHO:
- DEBUGP("SCTP_CID_COOKIE_ECHO\n");
- i = 6; break;
- case SCTP_CID_COOKIE_ACK:
- DEBUGP("SCTP_CID_COOKIE_ACK\n");
- i = 7; break;
- case SCTP_CID_SHUTDOWN_COMPLETE:
- DEBUGP("SCTP_CID_SHUTDOWN_COMPLETE\n");
- i = 8; break;
- default:
- /* Other chunks like DATA, SACK, HEARTBEAT and
- its ACK do not cause a change in state */
- DEBUGP("Unknown chunk type, Will stay in %s\n",
- sctp_conntrack_names[cur_state]);
- return cur_state;
- }
-
- DEBUGP("dir: %d cur_state: %s chunk_type: %d new_state: %s\n",
- dir, sctp_conntrack_names[cur_state], chunk_type,
- sctp_conntrack_names[sctp_conntracks[dir][i][cur_state]]);
-
- return sctp_conntracks[dir][i][cur_state];
-}
-
-/* Returns verdict for packet, or -1 for invalid. */
-static int sctp_packet(struct ip_conntrack *conntrack,
- const struct sk_buff *skb,
- enum ip_conntrack_info ctinfo)
-{
- enum sctp_conntrack newconntrack, oldsctpstate;
- struct iphdr *iph = skb->nh.iph;
- sctp_sctphdr_t _sctph, *sh;
- sctp_chunkhdr_t _sch, *sch;
- u_int32_t offset, count;
- char map[256 / sizeof (char)] = {0};
-
- DEBUGP(__FUNCTION__);
- DEBUGP("\n");
-
- sh = skb_header_pointer(skb, iph->ihl * 4, sizeof(_sctph), &_sctph);
- if (sh == NULL)
- return -1;
-
- if (do_basic_checks(conntrack, skb, map) != 0)
- return -1;
-
- /* Check the verification tag (Sec 8.5) */
- if (!test_bit(SCTP_CID_INIT, (void *)map)
- && !test_bit(SCTP_CID_SHUTDOWN_COMPLETE, (void *)map)
- && !test_bit(SCTP_CID_COOKIE_ECHO, (void *)map)
- && !test_bit(SCTP_CID_ABORT, (void *)map)
- && !test_bit(SCTP_CID_SHUTDOWN_ACK, (void *)map)
- && (sh->vtag != conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])) {
- DEBUGP("Verification tag check failed\n");
- return -1;
- }
-
- oldsctpstate = newconntrack = SCTP_CONNTRACK_MAX;
- for_each_sctp_chunk (skb, sch, _sch, offset, count) {
- write_lock_bh(&sctp_lock);
-
- /* Special cases of Verification tag check (Sec 8.5.1) */
- if (sch->type == SCTP_CID_INIT) {
- /* Sec 8.5.1 (A) */
- if (sh->vtag != 0) {
- write_unlock_bh(&sctp_lock);
- return -1;
- }
- } else if (sch->type == SCTP_CID_ABORT) {
- /* Sec 8.5.1 (B) */
- if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])
- && !(sh->vtag == conntrack->proto.sctp.vtag
- [1 - CTINFO2DIR(ctinfo)])) {
- write_unlock_bh(&sctp_lock);
- return -1;
- }
- } else if (sch->type == SCTP_CID_SHUTDOWN_COMPLETE) {
- /* Sec 8.5.1 (C) */
- if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])
- && !(sh->vtag == conntrack->proto.sctp.vtag
- [1 - CTINFO2DIR(ctinfo)]
- && (sch->flags & 1))) {
- write_unlock_bh(&sctp_lock);
- return -1;
- }
- } else if (sch->type == SCTP_CID_COOKIE_ECHO) {
- /* Sec 8.5.1 (D) */
- if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])) {
- write_unlock_bh(&sctp_lock);
- return -1;
- }
- }
-
- oldsctpstate = conntrack->proto.sctp.state;
- newconntrack = new_state(CTINFO2DIR(ctinfo), oldsctpstate, sch->type);
-
- /* Invalid */
- if (newconntrack == SCTP_CONNTRACK_MAX) {
- DEBUGP("ip_conntrack_sctp: Invalid dir=%i ctype=%u conntrack=%u\n",
- CTINFO2DIR(ctinfo), sch->type, oldsctpstate);
- write_unlock_bh(&sctp_lock);
- return -1;
- }
-
- /* If it is an INIT or an INIT ACK note down the vtag */
- if (sch->type == SCTP_CID_INIT
- || sch->type == SCTP_CID_INIT_ACK) {
- sctp_inithdr_t _inithdr, *ih;
-
- ih = skb_header_pointer(skb, offset + sizeof(sctp_chunkhdr_t),
- sizeof(_inithdr), &_inithdr);
- if (ih == NULL) {
- write_unlock_bh(&sctp_lock);
- return -1;
- }
- DEBUGP("Setting vtag %x for dir %d\n",
- ih->init_tag, !CTINFO2DIR(ctinfo));
- conntrack->proto.sctp.vtag[!CTINFO2DIR(ctinfo)] = ih->init_tag;
- }
-
- conntrack->proto.sctp.state = newconntrack;
- if (oldsctpstate != newconntrack)
- ip_conntrack_event_cache(IPCT_PROTOINFO, skb);
- write_unlock_bh(&sctp_lock);
- }
-
- ip_ct_refresh_acct(conntrack, ctinfo, skb, *sctp_timeouts[newconntrack]);
-
- if (oldsctpstate == SCTP_CONNTRACK_COOKIE_ECHOED
- && CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY
- && newconntrack == SCTP_CONNTRACK_ESTABLISHED) {
- DEBUGP("Setting assured bit\n");
- set_bit(IPS_ASSURED_BIT, &conntrack->status);
- ip_conntrack_event_cache(IPCT_STATUS, skb);
- }
-
- return NF_ACCEPT;
-}
-
-/* Called when a new connection for this protocol found. */
-static int sctp_new(struct ip_conntrack *conntrack,
- const struct sk_buff *skb)
-{
- enum sctp_conntrack newconntrack;
- struct iphdr *iph = skb->nh.iph;
- sctp_sctphdr_t _sctph, *sh;
- sctp_chunkhdr_t _sch, *sch;
- u_int32_t offset, count;
- char map[256 / sizeof (char)] = {0};
-
- DEBUGP(__FUNCTION__);
- DEBUGP("\n");
-
- sh = skb_header_pointer(skb, iph->ihl * 4, sizeof(_sctph), &_sctph);
- if (sh == NULL)
- return 0;
-
- if (do_basic_checks(conntrack, skb, map) != 0)
- return 0;
-
- /* If an OOTB packet has any of these chunks discard (Sec 8.4) */
- if ((test_bit (SCTP_CID_ABORT, (void *)map))
- || (test_bit (SCTP_CID_SHUTDOWN_COMPLETE, (void *)map))
- || (test_bit (SCTP_CID_COOKIE_ACK, (void *)map))) {
- return 0;
- }
-
- newconntrack = SCTP_CONNTRACK_MAX;
- for_each_sctp_chunk (skb, sch, _sch, offset, count) {
- /* Don't need lock here: this conntrack not in circulation yet */
- newconntrack = new_state (IP_CT_DIR_ORIGINAL,
- SCTP_CONNTRACK_NONE, sch->type);
-
- /* Invalid: delete conntrack */
- if (newconntrack == SCTP_CONNTRACK_MAX) {
- DEBUGP("ip_conntrack_sctp: invalid new deleting.\n");
- return 0;
- }
-
- /* Copy the vtag into the state info */
- if (sch->type == SCTP_CID_INIT) {
- if (sh->vtag == 0) {
- sctp_inithdr_t _inithdr, *ih;
-
- ih = skb_header_pointer(skb, offset + sizeof(sctp_chunkhdr_t),
- sizeof(_inithdr), &_inithdr);
- if (ih == NULL)
- return 0;
-
- DEBUGP("Setting vtag %x for new conn\n",
- ih->init_tag);
-
- conntrack->proto.sctp.vtag[IP_CT_DIR_REPLY] =
- ih->init_tag;
- } else {
- /* Sec 8.5.1 (A) */
- return 0;
- }
- }
- /* If it is a shutdown ack OOTB packet, we expect a return
- shutdown complete, otherwise an ABORT Sec 8.4 (5) and (8) */
- else {
- DEBUGP("Setting vtag %x for new conn OOTB\n",
- sh->vtag);
- conntrack->proto.sctp.vtag[IP_CT_DIR_REPLY] = sh->vtag;
- }
-
- conntrack->proto.sctp.state = newconntrack;
- }
-
- return 1;
-}
-
-static struct ip_conntrack_protocol ip_conntrack_protocol_sctp = {
- .proto = IPPROTO_SCTP,
- .name = "sctp",
- .pkt_to_tuple = sctp_pkt_to_tuple,
- .invert_tuple = sctp_invert_tuple,
- .print_tuple = sctp_print_tuple,
- .print_conntrack = sctp_print_conntrack,
- .packet = sctp_packet,
- .new = sctp_new,
- .destroy = NULL,
- .me = THIS_MODULE,
-#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
- defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
- .tuple_to_nfattr = ip_ct_port_tuple_to_nfattr,
- .nfattr_to_tuple = ip_ct_port_nfattr_to_tuple,
-#endif
-};
-
-#ifdef CONFIG_SYSCTL
-static ctl_table ip_ct_sysctl_table[] = {
- {
- .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED,
- .procname = "ip_conntrack_sctp_timeout_closed",
- .data = &ip_ct_sctp_timeout_closed,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- },
- {
- .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT,
- .procname = "ip_conntrack_sctp_timeout_cookie_wait",
- .data = &ip_ct_sctp_timeout_cookie_wait,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- },
- {
- .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED,
- .procname = "ip_conntrack_sctp_timeout_cookie_echoed",
- .data = &ip_ct_sctp_timeout_cookie_echoed,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- },
- {
- .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED,
- .procname = "ip_conntrack_sctp_timeout_established",
- .data = &ip_ct_sctp_timeout_established,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- },
- {
- .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT,
- .procname = "ip_conntrack_sctp_timeout_shutdown_sent",
- .data = &ip_ct_sctp_timeout_shutdown_sent,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- },
- {
- .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD,
- .procname = "ip_conntrack_sctp_timeout_shutdown_recd",
- .data = &ip_ct_sctp_timeout_shutdown_recd,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- },
- {
- .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT,
- .procname = "ip_conntrack_sctp_timeout_shutdown_ack_sent",
- .data = &ip_ct_sctp_timeout_shutdown_ack_sent,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- },
- { .ctl_name = 0 }
-};
-
-static ctl_table ip_ct_netfilter_table[] = {
- {
- .ctl_name = NET_IPV4_NETFILTER,
- .procname = "netfilter",
- .mode = 0555,
- .child = ip_ct_sysctl_table,
- },
- { .ctl_name = 0 }
-};
-
-static ctl_table ip_ct_ipv4_table[] = {
- {
- .ctl_name = NET_IPV4,
- .procname = "ipv4",
- .mode = 0555,
- .child = ip_ct_netfilter_table,
- },
- { .ctl_name = 0 }
-};
-
-static ctl_table ip_ct_net_table[] = {
- {
- .ctl_name = CTL_NET,
- .procname = "net",
- .mode = 0555,
- .child = ip_ct_ipv4_table,
- },
- { .ctl_name = 0 }
-};
-
-static struct ctl_table_header *ip_ct_sysctl_header;
-#endif
-
-static int __init ip_conntrack_proto_sctp_init(void)
-{
- int ret;
-
- ret = ip_conntrack_protocol_register(&ip_conntrack_protocol_sctp);
- if (ret) {
- printk("ip_conntrack_proto_sctp: protocol register failed\n");
- goto out;
- }
-
-#ifdef CONFIG_SYSCTL
- ip_ct_sysctl_header = register_sysctl_table(ip_ct_net_table, 0);
- if (ip_ct_sysctl_header == NULL) {
- ret = -ENOMEM;
- printk("ip_conntrack_proto_sctp: can't register to sysctl.\n");
- goto cleanup;
- }
-#endif
-
- return ret;
-
-#ifdef CONFIG_SYSCTL
- cleanup:
- ip_conntrack_protocol_unregister(&ip_conntrack_protocol_sctp);
-#endif
- out:
- DEBUGP("SCTP conntrack module loading %s\n",
- ret ? "failed": "succeeded");
- return ret;
-}
-
-static void __exit ip_conntrack_proto_sctp_fini(void)
-{
- ip_conntrack_protocol_unregister(&ip_conntrack_protocol_sctp);
-#ifdef CONFIG_SYSCTL
- unregister_sysctl_table(ip_ct_sysctl_header);
-#endif
- DEBUGP("SCTP conntrack module unloaded\n");
-}
-
-module_init(ip_conntrack_proto_sctp_init);
-module_exit(ip_conntrack_proto_sctp_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Kiran Kumar Immidi");
-MODULE_DESCRIPTION("Netfilter connection tracking protocol helper for SCTP");
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
deleted file mode 100644
index 06e4e8a6dd9f..000000000000
--- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
+++ /dev/null
@@ -1,1173 +0,0 @@
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>:
- * - Real stateful connection tracking
- * - Modified state transitions table
- * - Window scaling support added
- * - SACK support added
- *
- * Willy Tarreau:
- * - State table bugfixes
- * - More robust state changes
- * - Tuning timer parameters
- *
- * version 2.2
- */
-
-#include <linux/types.h>
-#include <linux/sched.h>
-#include <linux/timer.h>
-#include <linux/netfilter.h>
-#include <linux/module.h>
-#include <linux/in.h>
-#include <linux/ip.h>
-#include <linux/tcp.h>
-#include <linux/spinlock.h>
-
-#include <net/tcp.h>
-
-#include <linux/netfilter_ipv4.h>
-#include <linux/netfilter_ipv4/ip_conntrack.h>
-#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
-
-#if 0
-#define DEBUGP printk
-#define DEBUGP_VARS
-#else
-#define DEBUGP(format, args...)
-#endif
-
-/* Protects conntrack->proto.tcp */
-static DEFINE_RWLOCK(tcp_lock);
-
-/* "Be conservative in what you do,
- be liberal in what you accept from others."
- If it's non-zero, we mark only out of window RST segments as INVALID. */
-int ip_ct_tcp_be_liberal __read_mostly = 0;
-
-/* When connection is picked up from the middle, how many packets are required
- to pass in each direction when we assume we are in sync - if any side uses
- window scaling, we lost the game.
- If it is set to zero, we disable picking up already established
- connections. */
-int ip_ct_tcp_loose __read_mostly = 3;
-
-/* Max number of the retransmitted packets without receiving an (acceptable)
- ACK from the destination. If this number is reached, a shorter timer
- will be started. */
-int ip_ct_tcp_max_retrans __read_mostly = 3;
-
- /* FIXME: Examine ipfilter's timeouts and conntrack transitions more
- closely. They're more complex. --RR */
-
-static const char *tcp_conntrack_names[] = {
- "NONE",
- "SYN_SENT",
- "SYN_RECV",
- "ESTABLISHED",
- "FIN_WAIT",
- "CLOSE_WAIT",
- "LAST_ACK",
- "TIME_WAIT",
- "CLOSE",
- "LISTEN"
-};
-
-#define SECS * HZ
-#define MINS * 60 SECS
-#define HOURS * 60 MINS
-#define DAYS * 24 HOURS
-
-unsigned int ip_ct_tcp_timeout_syn_sent __read_mostly = 2 MINS;
-unsigned int ip_ct_tcp_timeout_syn_recv __read_mostly = 60 SECS;
-unsigned int ip_ct_tcp_timeout_established __read_mostly = 5 DAYS;
-unsigned int ip_ct_tcp_timeout_fin_wait __read_mostly = 2 MINS;
-unsigned int ip_ct_tcp_timeout_close_wait __read_mostly = 60 SECS;
-unsigned int ip_ct_tcp_timeout_last_ack __read_mostly = 30 SECS;
-unsigned int ip_ct_tcp_timeout_time_wait __read_mostly = 2 MINS;
-unsigned int ip_ct_tcp_timeout_close __read_mostly = 10 SECS;
-
-/* RFC1122 says the R2 limit should be at least 100 seconds.
- Linux uses 15 packets as limit, which corresponds
- to ~13-30min depending on RTO. */
-unsigned int ip_ct_tcp_timeout_max_retrans __read_mostly = 5 MINS;
-
-static const unsigned int * tcp_timeouts[]
-= { NULL, /* TCP_CONNTRACK_NONE */
- &ip_ct_tcp_timeout_syn_sent, /* TCP_CONNTRACK_SYN_SENT, */
- &ip_ct_tcp_timeout_syn_recv, /* TCP_CONNTRACK_SYN_RECV, */
- &ip_ct_tcp_timeout_established, /* TCP_CONNTRACK_ESTABLISHED, */
- &ip_ct_tcp_timeout_fin_wait, /* TCP_CONNTRACK_FIN_WAIT, */
- &ip_ct_tcp_timeout_close_wait, /* TCP_CONNTRACK_CLOSE_WAIT, */
- &ip_ct_tcp_timeout_last_ack, /* TCP_CONNTRACK_LAST_ACK, */
- &ip_ct_tcp_timeout_time_wait, /* TCP_CONNTRACK_TIME_WAIT, */
- &ip_ct_tcp_timeout_close, /* TCP_CONNTRACK_CLOSE, */
- NULL, /* TCP_CONNTRACK_LISTEN */
- };
-
-#define sNO TCP_CONNTRACK_NONE
-#define sSS TCP_CONNTRACK_SYN_SENT
-#define sSR TCP_CONNTRACK_SYN_RECV
-#define sES TCP_CONNTRACK_ESTABLISHED
-#define sFW TCP_CONNTRACK_FIN_WAIT
-#define sCW TCP_CONNTRACK_CLOSE_WAIT
-#define sLA TCP_CONNTRACK_LAST_ACK
-#define sTW TCP_CONNTRACK_TIME_WAIT
-#define sCL TCP_CONNTRACK_CLOSE
-#define sLI TCP_CONNTRACK_LISTEN
-#define sIV TCP_CONNTRACK_MAX
-#define sIG TCP_CONNTRACK_IGNORE
-
-/* What TCP flags are set from RST/SYN/FIN/ACK. */
-enum tcp_bit_set {
- TCP_SYN_SET,
- TCP_SYNACK_SET,
- TCP_FIN_SET,
- TCP_ACK_SET,
- TCP_RST_SET,
- TCP_NONE_SET,
-};
-
-/*
- * The TCP state transition table needs a few words...
- *
- * We are the man in the middle. All the packets go through us
- * but might get lost in transit to the destination.
- * It is assumed that the destinations can't receive segments
- * we haven't seen.
- *
- * The checked segment is in window, but our windows are *not*
- * equivalent with the ones of the sender/receiver. We always
- * try to guess the state of the current sender.
- *
- * The meaning of the states are:
- *
- * NONE: initial state
- * SYN_SENT: SYN-only packet seen
- * SYN_RECV: SYN-ACK packet seen
- * ESTABLISHED: ACK packet seen
- * FIN_WAIT: FIN packet seen
- * CLOSE_WAIT: ACK seen (after FIN)
- * LAST_ACK: FIN seen (after FIN)
- * TIME_WAIT: last ACK seen
- * CLOSE: closed connection
- *
- * LISTEN state is not used.
- *
- * Packets marked as IGNORED (sIG):
- * if they may be either invalid or valid
- * and the receiver may send back a connection
- * closing RST or a SYN/ACK.
- *
- * Packets marked as INVALID (sIV):
- * if they are invalid
- * or we do not support the request (simultaneous open)
- */
-static const enum tcp_conntrack tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = {
- {
-/* ORIGINAL */
-/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
-/*syn*/ { sSS, sSS, sIG, sIG, sIG, sIG, sIG, sSS, sSS, sIV },
-/*
- * sNO -> sSS Initialize a new connection
- * sSS -> sSS Retransmitted SYN
- * sSR -> sIG Late retransmitted SYN?
- * sES -> sIG Error: SYNs in window outside the SYN_SENT state
- * are errors. Receiver will reply with RST
- * and close the connection.
- * Or we are not in sync and hold a dead connection.
- * sFW -> sIG
- * sCW -> sIG
- * sLA -> sIG
- * sTW -> sSS Reopened connection (RFC 1122).
- * sCL -> sSS
- */
-/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
-/*synack*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV },
-/*
- * A SYN/ACK from the client is always invalid:
- * - either it tries to set up a simultaneous open, which is
- * not supported;
- * - or the firewall has just been inserted between the two hosts
- * during the session set-up. The SYN will be retransmitted
- * by the true client (or it'll time out).
- */
-/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
-/*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV },
-/*
- * sNO -> sIV Too late and no reason to do anything...
- * sSS -> sIV Client migth not send FIN in this state:
- * we enforce waiting for a SYN/ACK reply first.
- * sSR -> sFW Close started.
- * sES -> sFW
- * sFW -> sLA FIN seen in both directions, waiting for
- * the last ACK.
- * Migth be a retransmitted FIN as well...
- * sCW -> sLA
- * sLA -> sLA Retransmitted FIN. Remain in the same state.
- * sTW -> sTW
- * sCL -> sCL
- */
-/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
-/*ack*/ { sES, sIV, sES, sES, sCW, sCW, sTW, sTW, sCL, sIV },
-/*
- * sNO -> sES Assumed.
- * sSS -> sIV ACK is invalid: we haven't seen a SYN/ACK yet.
- * sSR -> sES Established state is reached.
- * sES -> sES :-)
- * sFW -> sCW Normal close request answered by ACK.
- * sCW -> sCW
- * sLA -> sTW Last ACK detected.
- * sTW -> sTW Retransmitted last ACK. Remain in the same state.
- * sCL -> sCL
- */
-/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
-/*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV },
-/*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }
- },
- {
-/* REPLY */
-/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
-/*syn*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV },
-/*
- * sNO -> sIV Never reached.
- * sSS -> sIV Simultaneous open, not supported
- * sSR -> sIV Simultaneous open, not supported.
- * sES -> sIV Server may not initiate a connection.
- * sFW -> sIV
- * sCW -> sIV
- * sLA -> sIV
- * sTW -> sIV Reopened connection, but server may not do it.
- * sCL -> sIV
- */
-/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
-/*synack*/ { sIV, sSR, sSR, sIG, sIG, sIG, sIG, sIG, sIG, sIV },
-/*
- * sSS -> sSR Standard open.
- * sSR -> sSR Retransmitted SYN/ACK.
- * sES -> sIG Late retransmitted SYN/ACK?
- * sFW -> sIG Might be SYN/ACK answering ignored SYN
- * sCW -> sIG
- * sLA -> sIG
- * sTW -> sIG
- * sCL -> sIG
- */
-/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
-/*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV },
-/*
- * sSS -> sIV Server might not send FIN in this state.
- * sSR -> sFW Close started.
- * sES -> sFW
- * sFW -> sLA FIN seen in both directions.
- * sCW -> sLA
- * sLA -> sLA Retransmitted FIN.
- * sTW -> sTW
- * sCL -> sCL
- */
-/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
-/*ack*/ { sIV, sIG, sSR, sES, sCW, sCW, sTW, sTW, sCL, sIV },
-/*
- * sSS -> sIG Might be a half-open connection.
- * sSR -> sSR Might answer late resent SYN.
- * sES -> sES :-)
- * sFW -> sCW Normal close request answered by ACK.
- * sCW -> sCW
- * sLA -> sTW Last ACK detected.
- * sTW -> sTW Retransmitted last ACK.
- * sCL -> sCL
- */
-/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
-/*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV },
-/*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }
- }
-};
-
-static int tcp_pkt_to_tuple(const struct sk_buff *skb,
- unsigned int dataoff,
- struct ip_conntrack_tuple *tuple)
-{
- struct tcphdr _hdr, *hp;
-
- /* Actually only need first 8 bytes. */
- hp = skb_header_pointer(skb, dataoff, 8, &_hdr);
- if (hp == NULL)
- return 0;
-
- tuple->src.u.tcp.port = hp->source;
- tuple->dst.u.tcp.port = hp->dest;
-
- return 1;
-}
-
-static int tcp_invert_tuple(struct ip_conntrack_tuple *tuple,
- const struct ip_conntrack_tuple *orig)
-{
- tuple->src.u.tcp.port = orig->dst.u.tcp.port;
- tuple->dst.u.tcp.port = orig->src.u.tcp.port;
- return 1;
-}
-
-/* Print out the per-protocol part of the tuple. */
-static int tcp_print_tuple(struct seq_file *s,
- const struct ip_conntrack_tuple *tuple)
-{
- return seq_printf(s, "sport=%hu dport=%hu ",
- ntohs(tuple->src.u.tcp.port),
- ntohs(tuple->dst.u.tcp.port));
-}
-
-/* Print out the private part of the conntrack. */
-static int tcp_print_conntrack(struct seq_file *s,
- const struct ip_conntrack *conntrack)
-{
- enum tcp_conntrack state;
-
- read_lock_bh(&tcp_lock);
- state = conntrack->proto.tcp.state;
- read_unlock_bh(&tcp_lock);
-
- return seq_printf(s, "%s ", tcp_conntrack_names[state]);
-}
-
-#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
- defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
-static int tcp_to_nfattr(struct sk_buff *skb, struct nfattr *nfa,
- const struct ip_conntrack *ct)
-{
- struct nfattr *nest_parms;
-
- read_lock_bh(&tcp_lock);
- nest_parms = NFA_NEST(skb, CTA_PROTOINFO_TCP);
- NFA_PUT(skb, CTA_PROTOINFO_TCP_STATE, sizeof(u_int8_t),
- &ct->proto.tcp.state);
- read_unlock_bh(&tcp_lock);
-
- NFA_NEST_END(skb, nest_parms);
-
- return 0;
-
-nfattr_failure:
- read_unlock_bh(&tcp_lock);
- return -1;
-}
-
-static const size_t cta_min_tcp[CTA_PROTOINFO_TCP_MAX] = {
- [CTA_PROTOINFO_TCP_STATE-1] = sizeof(u_int8_t),
-};
-
-static int nfattr_to_tcp(struct nfattr *cda[], struct ip_conntrack *ct)
-{
- struct nfattr *attr = cda[CTA_PROTOINFO_TCP-1];
- struct nfattr *tb[CTA_PROTOINFO_TCP_MAX];
-
- /* updates could not contain anything about the private
- * protocol info, in that case skip the parsing */
- if (!attr)
- return 0;
-
- nfattr_parse_nested(tb, CTA_PROTOINFO_TCP_MAX, attr);
-
- if (nfattr_bad_size(tb, CTA_PROTOINFO_TCP_MAX, cta_min_tcp))
- return -EINVAL;
-
- if (!tb[CTA_PROTOINFO_TCP_STATE-1])
- return -EINVAL;
-
- write_lock_bh(&tcp_lock);
- ct->proto.tcp.state =
- *(u_int8_t *)NFA_DATA(tb[CTA_PROTOINFO_TCP_STATE-1]);
- write_unlock_bh(&tcp_lock);
-
- return 0;
-}
-#endif
-
-static unsigned int get_conntrack_index(const struct tcphdr *tcph)
-{
- if (tcph->rst) return TCP_RST_SET;
- else if (tcph->syn) return (tcph->ack ? TCP_SYNACK_SET : TCP_SYN_SET);
- else if (tcph->fin) return TCP_FIN_SET;
- else if (tcph->ack) return TCP_ACK_SET;
- else return TCP_NONE_SET;
-}
-
-/* TCP connection tracking based on 'Real Stateful TCP Packet Filtering
- in IP Filter' by Guido van Rooij.
-
- http://www.nluug.nl/events/sane2000/papers.html
- http://www.iae.nl/users/guido/papers/tcp_filtering.ps.gz
-
- The boundaries and the conditions are changed according to RFC793:
- the packet must intersect the window (i.e. segments may be
- after the right or before the left edge) and thus receivers may ACK
- segments after the right edge of the window.
-
- td_maxend = max(sack + max(win,1)) seen in reply packets
- td_maxwin = max(max(win, 1)) + (sack - ack) seen in sent packets
- td_maxwin += seq + len - sender.td_maxend
- if seq + len > sender.td_maxend
- td_end = max(seq + len) seen in sent packets
-
- I. Upper bound for valid data: seq <= sender.td_maxend
- II. Lower bound for valid data: seq + len >= sender.td_end - receiver.td_maxwin
- III. Upper bound for valid ack: sack <= receiver.td_end
- IV. Lower bound for valid ack: ack >= receiver.td_end - MAXACKWINDOW
-
- where sack is the highest right edge of sack block found in the packet.
-
- The upper bound limit for a valid ack is not ignored -
- we doesn't have to deal with fragments.
-*/
-
-static inline __u32 segment_seq_plus_len(__u32 seq,
- size_t len,
- struct iphdr *iph,
- struct tcphdr *tcph)
-{
- return (seq + len - (iph->ihl + tcph->doff)*4
- + (tcph->syn ? 1 : 0) + (tcph->fin ? 1 : 0));
-}
-
-/* Fixme: what about big packets? */
-#define MAXACKWINCONST 66000
-#define MAXACKWINDOW(sender) \
- ((sender)->td_maxwin > MAXACKWINCONST ? (sender)->td_maxwin \
- : MAXACKWINCONST)
-
-/*
- * Simplified tcp_parse_options routine from tcp_input.c
- */
-static void tcp_options(const struct sk_buff *skb,
- struct iphdr *iph,
- struct tcphdr *tcph,
- struct ip_ct_tcp_state *state)
-{
- unsigned char buff[(15 * 4) - sizeof(struct tcphdr)];
- unsigned char *ptr;
- int length = (tcph->doff*4) - sizeof(struct tcphdr);
-
- if (!length)
- return;
-
- ptr = skb_header_pointer(skb,
- (iph->ihl * 4) + sizeof(struct tcphdr),
- length, buff);
- BUG_ON(ptr == NULL);
-
- state->td_scale =
- state->flags = 0;
-
- while (length > 0) {
- int opcode=*ptr++;
- int opsize;
-
- switch (opcode) {
- case TCPOPT_EOL:
- return;
- case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
- length--;
- continue;
- default:
- opsize=*ptr++;
- if (opsize < 2) /* "silly options" */
- return;
- if (opsize > length)
- break; /* don't parse partial options */
-
- if (opcode == TCPOPT_SACK_PERM
- && opsize == TCPOLEN_SACK_PERM)
- state->flags |= IP_CT_TCP_FLAG_SACK_PERM;
- else if (opcode == TCPOPT_WINDOW
- && opsize == TCPOLEN_WINDOW) {
- state->td_scale = *(u_int8_t *)ptr;
-
- if (state->td_scale > 14) {
- /* See RFC1323 */
- state->td_scale = 14;
- }
- state->flags |=
- IP_CT_TCP_FLAG_WINDOW_SCALE;
- }
- ptr += opsize - 2;
- length -= opsize;
- }
- }
-}
-
-static void tcp_sack(const struct sk_buff *skb,
- struct iphdr *iph,
- struct tcphdr *tcph,
- __u32 *sack)
-{
- unsigned char buff[(15 * 4) - sizeof(struct tcphdr)];
- unsigned char *ptr;
- int length = (tcph->doff*4) - sizeof(struct tcphdr);
- __u32 tmp;
-
- if (!length)
- return;
-
- ptr = skb_header_pointer(skb,
- (iph->ihl * 4) + sizeof(struct tcphdr),
- length, buff);
- BUG_ON(ptr == NULL);
-
- /* Fast path for timestamp-only option */
- if (length == TCPOLEN_TSTAMP_ALIGNED*4
- && *(__be32 *)ptr ==
- __constant_htonl((TCPOPT_NOP << 24)
- | (TCPOPT_NOP << 16)
- | (TCPOPT_TIMESTAMP << 8)
- | TCPOLEN_TIMESTAMP))
- return;
-
- while (length > 0) {
- int opcode=*ptr++;
- int opsize, i;
-
- switch (opcode) {
- case TCPOPT_EOL:
- return;
- case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
- length--;
- continue;
- default:
- opsize=*ptr++;
- if (opsize < 2) /* "silly options" */
- return;
- if (opsize > length)
- break; /* don't parse partial options */
-
- if (opcode == TCPOPT_SACK
- && opsize >= (TCPOLEN_SACK_BASE
- + TCPOLEN_SACK_PERBLOCK)
- && !((opsize - TCPOLEN_SACK_BASE)
- % TCPOLEN_SACK_PERBLOCK)) {
- for (i = 0;
- i < (opsize - TCPOLEN_SACK_BASE);
- i += TCPOLEN_SACK_PERBLOCK) {
- tmp = ntohl(*((__be32 *)(ptr+i)+1));
-
- if (after(tmp, *sack))
- *sack = tmp;
- }
- return;
- }
- ptr += opsize - 2;
- length -= opsize;
- }
- }
-}
-
-static int tcp_in_window(struct ip_ct_tcp *state,
- enum ip_conntrack_dir dir,
- unsigned int index,
- const struct sk_buff *skb,
- struct iphdr *iph,
- struct tcphdr *tcph)
-{
- struct ip_ct_tcp_state *sender = &state->seen[dir];
- struct ip_ct_tcp_state *receiver = &state->seen[!dir];
- __u32 seq, ack, sack, end, win, swin;
- int res;
-
- /*
- * Get the required data from the packet.
- */
- seq = ntohl(tcph->seq);
- ack = sack = ntohl(tcph->ack_seq);
- win = ntohs(tcph->window);
- end = segment_seq_plus_len(seq, skb->len, iph, tcph);
-
- if (receiver->flags & IP_CT_TCP_FLAG_SACK_PERM)
- tcp_sack(skb, iph, tcph, &sack);
-
- DEBUGP("tcp_in_window: START\n");
- DEBUGP("tcp_in_window: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu "
- "seq=%u ack=%u sack=%u win=%u end=%u\n",
- NIPQUAD(iph->saddr), ntohs(tcph->source),
- NIPQUAD(iph->daddr), ntohs(tcph->dest),
- seq, ack, sack, win, end);
- DEBUGP("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i "
- "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
- sender->td_end, sender->td_maxend, sender->td_maxwin,
- sender->td_scale,
- receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
- receiver->td_scale);
-
- if (sender->td_end == 0) {
- /*
- * Initialize sender data.
- */
- if (tcph->syn && tcph->ack) {
- /*
- * Outgoing SYN-ACK in reply to a SYN.
- */
- sender->td_end =
- sender->td_maxend = end;
- sender->td_maxwin = (win == 0 ? 1 : win);
-
- tcp_options(skb, iph, tcph, sender);
- /*
- * RFC 1323:
- * Both sides must send the Window Scale option
- * to enable window scaling in either direction.
- */
- if (!(sender->flags & IP_CT_TCP_FLAG_WINDOW_SCALE
- && receiver->flags & IP_CT_TCP_FLAG_WINDOW_SCALE))
- sender->td_scale =
- receiver->td_scale = 0;
- } else {
- /*
- * We are in the middle of a connection,
- * its history is lost for us.
- * Let's try to use the data from the packet.
- */
- sender->td_end = end;
- sender->td_maxwin = (win == 0 ? 1 : win);
- sender->td_maxend = end + sender->td_maxwin;
- }
- } else if (((state->state == TCP_CONNTRACK_SYN_SENT
- && dir == IP_CT_DIR_ORIGINAL)
- || (state->state == TCP_CONNTRACK_SYN_RECV
- && dir == IP_CT_DIR_REPLY))
- && after(end, sender->td_end)) {
- /*
- * RFC 793: "if a TCP is reinitialized ... then it need
- * not wait at all; it must only be sure to use sequence
- * numbers larger than those recently used."
- */
- sender->td_end =
- sender->td_maxend = end;
- sender->td_maxwin = (win == 0 ? 1 : win);
-
- tcp_options(skb, iph, tcph, sender);
- }
-
- if (!(tcph->ack)) {
- /*
- * If there is no ACK, just pretend it was set and OK.
- */
- ack = sack = receiver->td_end;
- } else if (((tcp_flag_word(tcph) & (TCP_FLAG_ACK|TCP_FLAG_RST)) ==
- (TCP_FLAG_ACK|TCP_FLAG_RST))
- && (ack == 0)) {
- /*
- * Broken TCP stacks, that set ACK in RST packets as well
- * with zero ack value.
- */
- ack = sack = receiver->td_end;
- }
-
- if (seq == end
- && (!tcph->rst
- || (seq == 0 && state->state == TCP_CONNTRACK_SYN_SENT)))
- /*
- * Packets contains no data: we assume it is valid
- * and check the ack value only.
- * However RST segments are always validated by their
- * SEQ number, except when seq == 0 (reset sent answering
- * SYN.
- */
- seq = end = sender->td_end;
-
- DEBUGP("tcp_in_window: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu "
- "seq=%u ack=%u sack =%u win=%u end=%u\n",
- NIPQUAD(iph->saddr), ntohs(tcph->source),
- NIPQUAD(iph->daddr), ntohs(tcph->dest),
- seq, ack, sack, win, end);
- DEBUGP("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i "
- "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
- sender->td_end, sender->td_maxend, sender->td_maxwin,
- sender->td_scale,
- receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
- receiver->td_scale);
-
- DEBUGP("tcp_in_window: I=%i II=%i III=%i IV=%i\n",
- before(seq, sender->td_maxend + 1),
- after(end, sender->td_end - receiver->td_maxwin - 1),
- before(sack, receiver->td_end + 1),
- after(ack, receiver->td_end - MAXACKWINDOW(sender)));
-
- if (sender->loose || receiver->loose ||
- (before(seq, sender->td_maxend + 1) &&
- after(end, sender->td_end - receiver->td_maxwin - 1) &&
- before(sack, receiver->td_end + 1) &&
- after(ack, receiver->td_end - MAXACKWINDOW(sender)))) {
- /*
- * Take into account window scaling (RFC 1323).
- */
- if (!tcph->syn)
- win <<= sender->td_scale;
-
- /*
- * Update sender data.
- */
- swin = win + (sack - ack);
- if (sender->td_maxwin < swin)
- sender->td_maxwin = swin;
- if (after(end, sender->td_end))
- sender->td_end = end;
- /*
- * Update receiver data.
- */
- if (after(end, sender->td_maxend))
- receiver->td_maxwin += end - sender->td_maxend;
- if (after(sack + win, receiver->td_maxend - 1)) {
- receiver->td_maxend = sack + win;
- if (win == 0)
- receiver->td_maxend++;
- }
-
- /*
- * Check retransmissions.
- */
- if (index == TCP_ACK_SET) {
- if (state->last_dir == dir
- && state->last_seq == seq
- && state->last_ack == ack
- && state->last_end == end
- && state->last_win == win)
- state->retrans++;
- else {
- state->last_dir = dir;
- state->last_seq = seq;
- state->last_ack = ack;
- state->last_end = end;
- state->last_win = win;
- state->retrans = 0;
- }
- }
- /*
- * Close the window of disabled window tracking :-)
- */
- if (sender->loose)
- sender->loose--;
-
- res = 1;
- } else {
- if (LOG_INVALID(IPPROTO_TCP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
- "ip_ct_tcp: %s ",
- before(seq, sender->td_maxend + 1) ?
- after(end, sender->td_end - receiver->td_maxwin - 1) ?
- before(sack, receiver->td_end + 1) ?
- after(ack, receiver->td_end - MAXACKWINDOW(sender)) ? "BUG"
- : "ACK is under the lower bound (possible overly delayed ACK)"
- : "ACK is over the upper bound (ACKed data not seen yet)"
- : "SEQ is under the lower bound (already ACKed data retransmitted)"
- : "SEQ is over the upper bound (over the window of the receiver)");
-
- res = ip_ct_tcp_be_liberal;
- }
-
- DEBUGP("tcp_in_window: res=%i sender end=%u maxend=%u maxwin=%u "
- "receiver end=%u maxend=%u maxwin=%u\n",
- res, sender->td_end, sender->td_maxend, sender->td_maxwin,
- receiver->td_end, receiver->td_maxend, receiver->td_maxwin);
-
- return res;
-}
-
-#ifdef CONFIG_IP_NF_NAT_NEEDED
-/* Update sender->td_end after NAT successfully mangled the packet */
-void ip_conntrack_tcp_update(struct sk_buff *skb,
- struct ip_conntrack *conntrack,
- enum ip_conntrack_dir dir)
-{
- struct iphdr *iph = skb->nh.iph;
- struct tcphdr *tcph = (void *)skb->nh.iph + skb->nh.iph->ihl*4;
- __u32 end;
-#ifdef DEBUGP_VARS
- struct ip_ct_tcp_state *sender = &conntrack->proto.tcp.seen[dir];
- struct ip_ct_tcp_state *receiver = &conntrack->proto.tcp.seen[!dir];
-#endif
-
- end = segment_seq_plus_len(ntohl(tcph->seq), skb->len, iph, tcph);
-
- write_lock_bh(&tcp_lock);
- /*
- * We have to worry for the ack in the reply packet only...
- */
- if (after(end, conntrack->proto.tcp.seen[dir].td_end))
- conntrack->proto.tcp.seen[dir].td_end = end;
- conntrack->proto.tcp.last_end = end;
- write_unlock_bh(&tcp_lock);
- DEBUGP("tcp_update: sender end=%u maxend=%u maxwin=%u scale=%i "
- "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
- sender->td_end, sender->td_maxend, sender->td_maxwin,
- sender->td_scale,
- receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
- receiver->td_scale);
-}
-
-#endif
-
-#define TH_FIN 0x01
-#define TH_SYN 0x02
-#define TH_RST 0x04
-#define TH_PUSH 0x08
-#define TH_ACK 0x10
-#define TH_URG 0x20
-#define TH_ECE 0x40
-#define TH_CWR 0x80
-
-/* table of valid flag combinations - ECE and CWR are always valid */
-static const u8 tcp_valid_flags[(TH_FIN|TH_SYN|TH_RST|TH_PUSH|TH_ACK|TH_URG) + 1] =
-{
- [TH_SYN] = 1,
- [TH_SYN|TH_ACK] = 1,
- [TH_SYN|TH_PUSH] = 1,
- [TH_SYN|TH_ACK|TH_PUSH] = 1,
- [TH_RST] = 1,
- [TH_RST|TH_ACK] = 1,
- [TH_RST|TH_ACK|TH_PUSH] = 1,
- [TH_FIN|TH_ACK] = 1,
- [TH_ACK] = 1,
- [TH_ACK|TH_PUSH] = 1,
- [TH_ACK|TH_URG] = 1,
- [TH_ACK|TH_URG|TH_PUSH] = 1,
- [TH_FIN|TH_ACK|TH_PUSH] = 1,
- [TH_FIN|TH_ACK|TH_URG] = 1,
- [TH_FIN|TH_ACK|TH_URG|TH_PUSH] = 1,
-};
-
-/* Protect conntrack agaist broken packets. Code taken from ipt_unclean.c. */
-static int tcp_error(struct sk_buff *skb,
- enum ip_conntrack_info *ctinfo,
- unsigned int hooknum)
-{
- struct iphdr *iph = skb->nh.iph;
- struct tcphdr _tcph, *th;
- unsigned int tcplen = skb->len - iph->ihl * 4;
- u_int8_t tcpflags;
-
- /* Smaller that minimal TCP header? */
- th = skb_header_pointer(skb, iph->ihl * 4,
- sizeof(_tcph), &_tcph);
- if (th == NULL) {
- if (LOG_INVALID(IPPROTO_TCP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
- "ip_ct_tcp: short packet ");
- return -NF_ACCEPT;
- }
-
- /* Not whole TCP header or malformed packet */
- if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) {
- if (LOG_INVALID(IPPROTO_TCP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
- "ip_ct_tcp: truncated/malformed packet ");
- return -NF_ACCEPT;
- }
-
- /* Checksum invalid? Ignore.
- * We skip checking packets on the outgoing path
- * because it is assumed to be correct.
- */
- /* FIXME: Source route IP option packets --RR */
- if (ip_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING &&
- nf_ip_checksum(skb, hooknum, iph->ihl * 4, IPPROTO_TCP)) {
- if (LOG_INVALID(IPPROTO_TCP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
- "ip_ct_tcp: bad TCP checksum ");
- return -NF_ACCEPT;
- }
-
- /* Check TCP flags. */
- tcpflags = (((u_int8_t *)th)[13] & ~(TH_ECE|TH_CWR));
- if (!tcp_valid_flags[tcpflags]) {
- if (LOG_INVALID(IPPROTO_TCP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
- "ip_ct_tcp: invalid TCP flag combination ");
- return -NF_ACCEPT;
- }
-
- return NF_ACCEPT;
-}
-
-/* Returns verdict for packet, or -1 for invalid. */
-static int tcp_packet(struct ip_conntrack *conntrack,
- const struct sk_buff *skb,
- enum ip_conntrack_info ctinfo)
-{
- enum tcp_conntrack new_state, old_state;
- enum ip_conntrack_dir dir;
- struct iphdr *iph = skb->nh.iph;
- struct tcphdr *th, _tcph;
- unsigned long timeout;
- unsigned int index;
-
- th = skb_header_pointer(skb, iph->ihl * 4,
- sizeof(_tcph), &_tcph);
- BUG_ON(th == NULL);
-
- write_lock_bh(&tcp_lock);
- old_state = conntrack->proto.tcp.state;
- dir = CTINFO2DIR(ctinfo);
- index = get_conntrack_index(th);
- new_state = tcp_conntracks[dir][index][old_state];
-
- switch (new_state) {
- case TCP_CONNTRACK_IGNORE:
- /* Ignored packets:
- *
- * a) SYN in ORIGINAL
- * b) SYN/ACK in REPLY
- * c) ACK in reply direction after initial SYN in original.
- */
- if (index == TCP_SYNACK_SET
- && conntrack->proto.tcp.last_index == TCP_SYN_SET
- && conntrack->proto.tcp.last_dir != dir
- && ntohl(th->ack_seq) ==
- conntrack->proto.tcp.last_end) {
- /* This SYN/ACK acknowledges a SYN that we earlier
- * ignored as invalid. This means that the client and
- * the server are both in sync, while the firewall is
- * not. We kill this session and block the SYN/ACK so
- * that the client cannot but retransmit its SYN and
- * thus initiate a clean new session.
- */
- write_unlock_bh(&tcp_lock);
- if (LOG_INVALID(IPPROTO_TCP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL,
- NULL, "ip_ct_tcp: "
- "killing out of sync session ");
- if (del_timer(&conntrack->timeout))
- conntrack->timeout.function((unsigned long)
- conntrack);
- return -NF_DROP;
- }
- conntrack->proto.tcp.last_index = index;
- conntrack->proto.tcp.last_dir = dir;
- conntrack->proto.tcp.last_seq = ntohl(th->seq);
- conntrack->proto.tcp.last_end =
- segment_seq_plus_len(ntohl(th->seq), skb->len, iph, th);
-
- write_unlock_bh(&tcp_lock);
- if (LOG_INVALID(IPPROTO_TCP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
- "ip_ct_tcp: invalid packet ignored ");
- return NF_ACCEPT;
- case TCP_CONNTRACK_MAX:
- /* Invalid packet */
- DEBUGP("ip_ct_tcp: Invalid dir=%i index=%u ostate=%u\n",
- dir, get_conntrack_index(th),
- old_state);
- write_unlock_bh(&tcp_lock);
- if (LOG_INVALID(IPPROTO_TCP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
- "ip_ct_tcp: invalid state ");
- return -NF_ACCEPT;
- case TCP_CONNTRACK_SYN_SENT:
- if (old_state < TCP_CONNTRACK_TIME_WAIT)
- break;
- if ((conntrack->proto.tcp.seen[dir].flags &
- IP_CT_TCP_FLAG_CLOSE_INIT)
- || after(ntohl(th->seq),
- conntrack->proto.tcp.seen[dir].td_end)) {
- /* Attempt to reopen a closed connection.
- * Delete this connection and look up again. */
- write_unlock_bh(&tcp_lock);
- if (del_timer(&conntrack->timeout))
- conntrack->timeout.function((unsigned long)
- conntrack);
- return -NF_REPEAT;
- } else {
- write_unlock_bh(&tcp_lock);
- if (LOG_INVALID(IPPROTO_TCP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL,
- NULL, "ip_ct_tcp: invalid SYN");
- return -NF_ACCEPT;
- }
- case TCP_CONNTRACK_CLOSE:
- if (index == TCP_RST_SET
- && ((test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)
- && conntrack->proto.tcp.last_index == TCP_SYN_SET)
- || (!test_bit(IPS_ASSURED_BIT, &conntrack->status)
- && conntrack->proto.tcp.last_index == TCP_ACK_SET))
- && ntohl(th->ack_seq) == conntrack->proto.tcp.last_end) {
- /* RST sent to invalid SYN or ACK we had let through
- * at a) and c) above:
- *
- * a) SYN was in window then
- * c) we hold a half-open connection.
- *
- * Delete our connection entry.
- * We skip window checking, because packet might ACK
- * segments we ignored. */
- goto in_window;
- }
- /* Just fall through */
- default:
- /* Keep compilers happy. */
- break;
- }
-
- if (!tcp_in_window(&conntrack->proto.tcp, dir, index,
- skb, iph, th)) {
- write_unlock_bh(&tcp_lock);
- return -NF_ACCEPT;
- }
- in_window:
- /* From now on we have got in-window packets */
- conntrack->proto.tcp.last_index = index;
-
- DEBUGP("tcp_conntracks: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu "
- "syn=%i ack=%i fin=%i rst=%i old=%i new=%i\n",
- NIPQUAD(iph->saddr), ntohs(th->source),
- NIPQUAD(iph->daddr), ntohs(th->dest),
- (th->syn ? 1 : 0), (th->ack ? 1 : 0),
- (th->fin ? 1 : 0), (th->rst ? 1 : 0),
- old_state, new_state);
-
- conntrack->proto.tcp.state = new_state;
- if (old_state != new_state
- && (new_state == TCP_CONNTRACK_FIN_WAIT
- || new_state == TCP_CONNTRACK_CLOSE))
- conntrack->proto.tcp.seen[dir].flags |= IP_CT_TCP_FLAG_CLOSE_INIT;
- timeout = conntrack->proto.tcp.retrans >= ip_ct_tcp_max_retrans
- && *tcp_timeouts[new_state] > ip_ct_tcp_timeout_max_retrans
- ? ip_ct_tcp_timeout_max_retrans : *tcp_timeouts[new_state];
- write_unlock_bh(&tcp_lock);
-
- ip_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb);
- if (new_state != old_state)
- ip_conntrack_event_cache(IPCT_PROTOINFO, skb);
-
- if (!test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) {
- /* If only reply is a RST, we can consider ourselves not to
- have an established connection: this is a fairly common
- problem case, so we can delete the conntrack
- immediately. --RR */
- if (th->rst) {
- if (del_timer(&conntrack->timeout))
- conntrack->timeout.function((unsigned long)
- conntrack);
- return NF_ACCEPT;
- }
- } else if (!test_bit(IPS_ASSURED_BIT, &conntrack->status)
- && (old_state == TCP_CONNTRACK_SYN_RECV
- || old_state == TCP_CONNTRACK_ESTABLISHED)
- && new_state == TCP_CONNTRACK_ESTABLISHED) {
- /* Set ASSURED if we see see valid ack in ESTABLISHED
- after SYN_RECV or a valid answer for a picked up
- connection. */
- set_bit(IPS_ASSURED_BIT, &conntrack->status);
- ip_conntrack_event_cache(IPCT_STATUS, skb);
- }
- ip_ct_refresh_acct(conntrack, ctinfo, skb, timeout);
-
- return NF_ACCEPT;
-}
-
-/* Called when a new connection for this protocol found. */
-static int tcp_new(struct ip_conntrack *conntrack,
- const struct sk_buff *skb)
-{
- enum tcp_conntrack new_state;
- struct iphdr *iph = skb->nh.iph;
- struct tcphdr *th, _tcph;
-#ifdef DEBUGP_VARS
- struct ip_ct_tcp_state *sender = &conntrack->proto.tcp.seen[0];
- struct ip_ct_tcp_state *receiver = &conntrack->proto.tcp.seen[1];
-#endif
-
- th = skb_header_pointer(skb, iph->ihl * 4,
- sizeof(_tcph), &_tcph);
- BUG_ON(th == NULL);
-
- /* Don't need lock here: this conntrack not in circulation yet */
- new_state
- = tcp_conntracks[0][get_conntrack_index(th)]
- [TCP_CONNTRACK_NONE];
-
- /* Invalid: delete conntrack */
- if (new_state >= TCP_CONNTRACK_MAX) {
- DEBUGP("ip_ct_tcp: invalid new deleting.\n");
- return 0;
- }
-
- if (new_state == TCP_CONNTRACK_SYN_SENT) {
- /* SYN packet */
- conntrack->proto.tcp.seen[0].td_end =
- segment_seq_plus_len(ntohl(th->seq), skb->len,
- iph, th);
- conntrack->proto.tcp.seen[0].td_maxwin = ntohs(th->window);
- if (conntrack->proto.tcp.seen[0].td_maxwin == 0)
- conntrack->proto.tcp.seen[0].td_maxwin = 1;
- conntrack->proto.tcp.seen[0].td_maxend =
- conntrack->proto.tcp.seen[0].td_end;
-
- tcp_options(skb, iph, th, &conntrack->proto.tcp.seen[0]);
- conntrack->proto.tcp.seen[1].flags = 0;
- conntrack->proto.tcp.seen[0].loose =
- conntrack->proto.tcp.seen[1].loose = 0;
- } else if (ip_ct_tcp_loose == 0) {
- /* Don't try to pick up connections. */
- return 0;
- } else {
- /*
- * We are in the middle of a connection,
- * its history is lost for us.
- * Let's try to use the data from the packet.
- */
- conntrack->proto.tcp.seen[0].td_end =
- segment_seq_plus_len(ntohl(th->seq), skb->len,
- iph, th);
- conntrack->proto.tcp.seen[0].td_maxwin = ntohs(th->window);
- if (conntrack->proto.tcp.seen[0].td_maxwin == 0)
- conntrack->proto.tcp.seen[0].td_maxwin = 1;
- conntrack->proto.tcp.seen[0].td_maxend =
- conntrack->proto.tcp.seen[0].td_end +
- conntrack->proto.tcp.seen[0].td_maxwin;
- conntrack->proto.tcp.seen[0].td_scale = 0;
-
- /* We assume SACK. Should we assume window scaling too? */
- conntrack->proto.tcp.seen[0].flags =
- conntrack->proto.tcp.seen[1].flags = IP_CT_TCP_FLAG_SACK_PERM;
- conntrack->proto.tcp.seen[0].loose =
- conntrack->proto.tcp.seen[1].loose = ip_ct_tcp_loose;
- }
-
- conntrack->proto.tcp.seen[1].td_end = 0;
- conntrack->proto.tcp.seen[1].td_maxend = 0;
- conntrack->proto.tcp.seen[1].td_maxwin = 1;
- conntrack->proto.tcp.seen[1].td_scale = 0;
-
- /* tcp_packet will set them */
- conntrack->proto.tcp.state = TCP_CONNTRACK_NONE;
- conntrack->proto.tcp.last_index = TCP_NONE_SET;
-
- DEBUGP("tcp_new: sender end=%u maxend=%u maxwin=%u scale=%i "
- "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
- sender->td_end, sender->td_maxend, sender->td_maxwin,
- sender->td_scale,
- receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
- receiver->td_scale);
- return 1;
-}
-
-struct ip_conntrack_protocol ip_conntrack_protocol_tcp =
-{
- .proto = IPPROTO_TCP,
- .name = "tcp",
- .pkt_to_tuple = tcp_pkt_to_tuple,
- .invert_tuple = tcp_invert_tuple,
- .print_tuple = tcp_print_tuple,
- .print_conntrack = tcp_print_conntrack,
- .packet = tcp_packet,
- .new = tcp_new,
- .error = tcp_error,
-#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
- defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
- .to_nfattr = tcp_to_nfattr,
- .from_nfattr = nfattr_to_tcp,
- .tuple_to_nfattr = ip_ct_port_tuple_to_nfattr,
- .nfattr_to_tuple = ip_ct_port_nfattr_to_tuple,
-#endif
-};
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
deleted file mode 100644
index d0e8a16970ec..000000000000
--- a/net/ipv4/netfilter/ip_conntrack_proto_udp.c
+++ /dev/null
@@ -1,149 +0,0 @@
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/types.h>
-#include <linux/sched.h>
-#include <linux/timer.h>
-#include <linux/netfilter.h>
-#include <linux/in.h>
-#include <linux/ip.h>
-#include <linux/udp.h>
-#include <linux/seq_file.h>
-#include <net/checksum.h>
-#include <linux/netfilter_ipv4.h>
-#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
-
-unsigned int ip_ct_udp_timeout __read_mostly = 30*HZ;
-unsigned int ip_ct_udp_timeout_stream __read_mostly = 180*HZ;
-
-static int udp_pkt_to_tuple(const struct sk_buff *skb,
- unsigned int dataoff,
- struct ip_conntrack_tuple *tuple)
-{
- struct udphdr _hdr, *hp;
-
- /* Actually only need first 8 bytes. */
- hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
- if (hp == NULL)
- return 0;
-
- tuple->src.u.udp.port = hp->source;
- tuple->dst.u.udp.port = hp->dest;
-
- return 1;
-}
-
-static int udp_invert_tuple(struct ip_conntrack_tuple *tuple,
- const struct ip_conntrack_tuple *orig)
-{
- tuple->src.u.udp.port = orig->dst.u.udp.port;
- tuple->dst.u.udp.port = orig->src.u.udp.port;
- return 1;
-}
-
-/* Print out the per-protocol part of the tuple. */
-static int udp_print_tuple(struct seq_file *s,
- const struct ip_conntrack_tuple *tuple)
-{
- return seq_printf(s, "sport=%hu dport=%hu ",
- ntohs(tuple->src.u.udp.port),
- ntohs(tuple->dst.u.udp.port));
-}
-
-/* Print out the private part of the conntrack. */
-static int udp_print_conntrack(struct seq_file *s,
- const struct ip_conntrack *conntrack)
-{
- return 0;
-}
-
-/* Returns verdict for packet, and may modify conntracktype */
-static int udp_packet(struct ip_conntrack *conntrack,
- const struct sk_buff *skb,
- enum ip_conntrack_info ctinfo)
-{
- /* If we've seen traffic both ways, this is some kind of UDP
- stream. Extend timeout. */
- if (test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) {
- ip_ct_refresh_acct(conntrack, ctinfo, skb,
- ip_ct_udp_timeout_stream);
- /* Also, more likely to be important, and not a probe */
- if (!test_and_set_bit(IPS_ASSURED_BIT, &conntrack->status))
- ip_conntrack_event_cache(IPCT_STATUS, skb);
- } else
- ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_udp_timeout);
-
- return NF_ACCEPT;
-}
-
-/* Called when a new connection for this protocol found. */
-static int udp_new(struct ip_conntrack *conntrack, const struct sk_buff *skb)
-{
- return 1;
-}
-
-static int udp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
- unsigned int hooknum)
-{
- struct iphdr *iph = skb->nh.iph;
- unsigned int udplen = skb->len - iph->ihl * 4;
- struct udphdr _hdr, *hdr;
-
- /* Header is too small? */
- hdr = skb_header_pointer(skb, iph->ihl*4, sizeof(_hdr), &_hdr);
- if (hdr == NULL) {
- if (LOG_INVALID(IPPROTO_UDP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
- "ip_ct_udp: short packet ");
- return -NF_ACCEPT;
- }
-
- /* Truncated/malformed packets */
- if (ntohs(hdr->len) > udplen || ntohs(hdr->len) < sizeof(*hdr)) {
- if (LOG_INVALID(IPPROTO_UDP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
- "ip_ct_udp: truncated/malformed packet ");
- return -NF_ACCEPT;
- }
-
- /* Packet with no checksum */
- if (!hdr->check)
- return NF_ACCEPT;
-
- /* Checksum invalid? Ignore.
- * We skip checking packets on the outgoing path
- * because the checksum is assumed to be correct.
- * FIXME: Source route IP option packets --RR */
- if (ip_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING &&
- nf_ip_checksum(skb, hooknum, iph->ihl * 4, IPPROTO_UDP)) {
- if (LOG_INVALID(IPPROTO_UDP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
- "ip_ct_udp: bad UDP checksum ");
- return -NF_ACCEPT;
- }
-
- return NF_ACCEPT;
-}
-
-struct ip_conntrack_protocol ip_conntrack_protocol_udp =
-{
- .proto = IPPROTO_UDP,
- .name = "udp",
- .pkt_to_tuple = udp_pkt_to_tuple,
- .invert_tuple = udp_invert_tuple,
- .print_tuple = udp_print_tuple,
- .print_conntrack = udp_print_conntrack,
- .packet = udp_packet,
- .new = udp_new,
- .error = udp_error,
-#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
- defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
- .tuple_to_nfattr = ip_ct_port_tuple_to_nfattr,
- .nfattr_to_tuple = ip_ct_port_nfattr_to_tuple,
-#endif
-};
diff --git a/net/ipv4/netfilter/ip_conntrack_sip.c b/net/ipv4/netfilter/ip_conntrack_sip.c
deleted file mode 100644
index 3a26d63eed88..000000000000
--- a/net/ipv4/netfilter/ip_conntrack_sip.c
+++ /dev/null
@@ -1,514 +0,0 @@
-/* SIP extension for IP connection tracking.
- *
- * (C) 2005 by Christian Hentschel <chentschel@arnet.com.ar>
- * based on RR's ip_conntrack_ftp.c and other modules.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/module.h>
-#include <linux/ctype.h>
-#include <linux/skbuff.h>
-#include <linux/in.h>
-#include <linux/ip.h>
-#include <linux/udp.h>
-
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv4.h>
-#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
-#include <linux/netfilter_ipv4/ip_conntrack_sip.h>
-
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Christian Hentschel <chentschel@arnet.com.ar>");
-MODULE_DESCRIPTION("SIP connection tracking helper");
-
-#define MAX_PORTS 8
-static unsigned short ports[MAX_PORTS];
-static int ports_c;
-module_param_array(ports, ushort, &ports_c, 0400);
-MODULE_PARM_DESC(ports, "port numbers of sip servers");
-
-static unsigned int sip_timeout = SIP_TIMEOUT;
-module_param(sip_timeout, uint, 0600);
-MODULE_PARM_DESC(sip_timeout, "timeout for the master SIP session");
-
-unsigned int (*ip_nat_sip_hook)(struct sk_buff **pskb,
- enum ip_conntrack_info ctinfo,
- struct ip_conntrack *ct,
- const char **dptr);
-EXPORT_SYMBOL_GPL(ip_nat_sip_hook);
-
-unsigned int (*ip_nat_sdp_hook)(struct sk_buff **pskb,
- enum ip_conntrack_info ctinfo,
- struct ip_conntrack_expect *exp,
- const char *dptr);
-EXPORT_SYMBOL_GPL(ip_nat_sdp_hook);
-
-static int digits_len(const char *dptr, const char *limit, int *shift);
-static int epaddr_len(const char *dptr, const char *limit, int *shift);
-static int skp_digits_len(const char *dptr, const char *limit, int *shift);
-static int skp_epaddr_len(const char *dptr, const char *limit, int *shift);
-
-struct sip_header_nfo {
- const char *lname;
- const char *sname;
- const char *ln_str;
- size_t lnlen;
- size_t snlen;
- size_t ln_strlen;
- int case_sensitive;
- int (*match_len)(const char *, const char *, int *);
-};
-
-static struct sip_header_nfo ct_sip_hdrs[] = {
- [POS_REG_REQ_URI] = { /* SIP REGISTER request URI */
- .lname = "sip:",
- .lnlen = sizeof("sip:") - 1,
- .ln_str = ":",
- .ln_strlen = sizeof(":") - 1,
- .match_len = epaddr_len
- },
- [POS_REQ_URI] = { /* SIP request URI */
- .lname = "sip:",
- .lnlen = sizeof("sip:") - 1,
- .ln_str = "@",
- .ln_strlen = sizeof("@") - 1,
- .match_len = epaddr_len
- },
- [POS_FROM] = { /* SIP From header */
- .lname = "From:",
- .lnlen = sizeof("From:") - 1,
- .sname = "\r\nf:",
- .snlen = sizeof("\r\nf:") - 1,
- .ln_str = "sip:",
- .ln_strlen = sizeof("sip:") - 1,
- .match_len = skp_epaddr_len,
- },
- [POS_TO] = { /* SIP To header */
- .lname = "To:",
- .lnlen = sizeof("To:") - 1,
- .sname = "\r\nt:",
- .snlen = sizeof("\r\nt:") - 1,
- .ln_str = "sip:",
- .ln_strlen = sizeof("sip:") - 1,
- .match_len = skp_epaddr_len,
- },
- [POS_VIA] = { /* SIP Via header */
- .lname = "Via:",
- .lnlen = sizeof("Via:") - 1,
- .sname = "\r\nv:",
- .snlen = sizeof("\r\nv:") - 1, /* rfc3261 "\r\n" */
- .ln_str = "UDP ",
- .ln_strlen = sizeof("UDP ") - 1,
- .match_len = epaddr_len,
- },
- [POS_CONTACT] = { /* SIP Contact header */
- .lname = "Contact:",
- .lnlen = sizeof("Contact:") - 1,
- .sname = "\r\nm:",
- .snlen = sizeof("\r\nm:") - 1,
- .ln_str = "sip:",
- .ln_strlen = sizeof("sip:") - 1,
- .match_len = skp_epaddr_len
- },
- [POS_CONTENT] = { /* SIP Content length header */
- .lname = "Content-Length:",
- .lnlen = sizeof("Content-Length:") - 1,
- .sname = "\r\nl:",
- .snlen = sizeof("\r\nl:") - 1,
- .ln_str = ":",
- .ln_strlen = sizeof(":") - 1,
- .match_len = skp_digits_len
- },
- [POS_MEDIA] = { /* SDP media info */
- .case_sensitive = 1,
- .lname = "\nm=",
- .lnlen = sizeof("\nm=") - 1,
- .sname = "\rm=",
- .snlen = sizeof("\rm=") - 1,
- .ln_str = "audio ",
- .ln_strlen = sizeof("audio ") - 1,
- .match_len = digits_len
- },
- [POS_OWNER] = { /* SDP owner address*/
- .case_sensitive = 1,
- .lname = "\no=",
- .lnlen = sizeof("\no=") - 1,
- .sname = "\ro=",
- .snlen = sizeof("\ro=") - 1,
- .ln_str = "IN IP4 ",
- .ln_strlen = sizeof("IN IP4 ") - 1,
- .match_len = epaddr_len
- },
- [POS_CONNECTION] = { /* SDP connection info */
- .case_sensitive = 1,
- .lname = "\nc=",
- .lnlen = sizeof("\nc=") - 1,
- .sname = "\rc=",
- .snlen = sizeof("\rc=") - 1,
- .ln_str = "IN IP4 ",
- .ln_strlen = sizeof("IN IP4 ") - 1,
- .match_len = epaddr_len
- },
- [POS_SDP_HEADER] = { /* SDP version header */
- .case_sensitive = 1,
- .lname = "\nv=",
- .lnlen = sizeof("\nv=") - 1,
- .sname = "\rv=",
- .snlen = sizeof("\rv=") - 1,
- .ln_str = "=",
- .ln_strlen = sizeof("=") - 1,
- .match_len = digits_len
- }
-};
-
-/* get line lenght until first CR or LF seen. */
-int ct_sip_lnlen(const char *line, const char *limit)
-{
- const char *k = line;
-
- while ((line <= limit) && (*line == '\r' || *line == '\n'))
- line++;
-
- while (line <= limit) {
- if (*line == '\r' || *line == '\n')
- break;
- line++;
- }
- return line - k;
-}
-EXPORT_SYMBOL_GPL(ct_sip_lnlen);
-
-/* Linear string search, case sensitive. */
-const char *ct_sip_search(const char *needle, const char *haystack,
- size_t needle_len, size_t haystack_len,
- int case_sensitive)
-{
- const char *limit = haystack + (haystack_len - needle_len);
-
- while (haystack <= limit) {
- if (case_sensitive) {
- if (strncmp(haystack, needle, needle_len) == 0)
- return haystack;
- } else {
- if (strnicmp(haystack, needle, needle_len) == 0)
- return haystack;
- }
- haystack++;
- }
- return NULL;
-}
-EXPORT_SYMBOL_GPL(ct_sip_search);
-
-static int digits_len(const char *dptr, const char *limit, int *shift)
-{
- int len = 0;
- while (dptr <= limit && isdigit(*dptr)) {
- dptr++;
- len++;
- }
- return len;
-}
-
-/* get digits lenght, skiping blank spaces. */
-static int skp_digits_len(const char *dptr, const char *limit, int *shift)
-{
- for (; dptr <= limit && *dptr == ' '; dptr++)
- (*shift)++;
-
- return digits_len(dptr, limit, shift);
-}
-
-/* Simple ipaddr parser.. */
-static int parse_ipaddr(const char *cp, const char **endp,
- __be32 *ipaddr, const char *limit)
-{
- unsigned long int val;
- int i, digit = 0;
-
- for (i = 0, *ipaddr = 0; cp <= limit && i < 4; i++) {
- digit = 0;
- if (!isdigit(*cp))
- break;
-
- val = simple_strtoul(cp, (char **)&cp, 10);
- if (val > 0xFF)
- return -1;
-
- ((u_int8_t *)ipaddr)[i] = val;
- digit = 1;
-
- if (*cp != '.')
- break;
- cp++;
- }
- if (!digit)
- return -1;
-
- if (endp)
- *endp = cp;
-
- return 0;
-}
-
-/* skip ip address. returns it lenght. */
-static int epaddr_len(const char *dptr, const char *limit, int *shift)
-{
- const char *aux = dptr;
- __be32 ip;
-
- if (parse_ipaddr(dptr, &dptr, &ip, limit) < 0) {
- DEBUGP("ip: %s parse failed.!\n", dptr);
- return 0;
- }
-
- /* Port number */
- if (*dptr == ':') {
- dptr++;
- dptr += digits_len(dptr, limit, shift);
- }
- return dptr - aux;
-}
-
-/* get address length, skiping user info. */
-static int skp_epaddr_len(const char *dptr, const char *limit, int *shift)
-{
- int s = *shift;
-
- for (; dptr <= limit && *dptr != '@'; dptr++)
- (*shift)++;
-
- if (*dptr == '@') {
- dptr++;
- (*shift)++;
- } else
- *shift = s;
-
- return epaddr_len(dptr, limit, shift);
-}
-
-/* Returns 0 if not found, -1 error parsing. */
-int ct_sip_get_info(const char *dptr, size_t dlen,
- unsigned int *matchoff,
- unsigned int *matchlen,
- enum sip_header_pos pos)
-{
- struct sip_header_nfo *hnfo = &ct_sip_hdrs[pos];
- const char *limit, *aux, *k = dptr;
- int shift = 0;
-
- limit = dptr + (dlen - hnfo->lnlen);
-
- while (dptr <= limit) {
- if ((strncmp(dptr, hnfo->lname, hnfo->lnlen) != 0) &&
- (hnfo->sname == NULL ||
- strncmp(dptr, hnfo->sname, hnfo->snlen) != 0)) {
- dptr++;
- continue;
- }
- aux = ct_sip_search(hnfo->ln_str, dptr, hnfo->ln_strlen,
- ct_sip_lnlen(dptr, limit),
- hnfo->case_sensitive);
- if (!aux) {
- DEBUGP("'%s' not found in '%s'.\n", hnfo->ln_str,
- hnfo->lname);
- return -1;
- }
- aux += hnfo->ln_strlen;
-
- *matchlen = hnfo->match_len(aux, limit, &shift);
- if (!*matchlen)
- return -1;
-
- *matchoff = (aux - k) + shift;
-
- DEBUGP("%s match succeeded! - len: %u\n", hnfo->lname,
- *matchlen);
- return 1;
- }
- DEBUGP("%s header not found.\n", hnfo->lname);
- return 0;
-}
-EXPORT_SYMBOL_GPL(ct_sip_get_info);
-
-static int set_expected_rtp(struct sk_buff **pskb,
- struct ip_conntrack *ct,
- enum ip_conntrack_info ctinfo,
- __be32 ipaddr, u_int16_t port,
- const char *dptr)
-{
- struct ip_conntrack_expect *exp;
- enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
- int ret;
- typeof(ip_nat_sdp_hook) ip_nat_sdp;
-
- exp = ip_conntrack_expect_alloc(ct);
- if (exp == NULL)
- return NF_DROP;
-
- exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip;
- exp->tuple.src.u.udp.port = 0;
- exp->tuple.dst.ip = ipaddr;
- exp->tuple.dst.u.udp.port = htons(port);
- exp->tuple.dst.protonum = IPPROTO_UDP;
-
- exp->mask.src.ip = htonl(0xFFFFFFFF);
- exp->mask.src.u.udp.port = 0;
- exp->mask.dst.ip = htonl(0xFFFFFFFF);
- exp->mask.dst.u.udp.port = htons(0xFFFF);
- exp->mask.dst.protonum = 0xFF;
-
- exp->expectfn = NULL;
- exp->flags = 0;
-
- ip_nat_sdp = rcu_dereference(ip_nat_sdp_hook);
- if (ip_nat_sdp)
- ret = ip_nat_sdp(pskb, ctinfo, exp, dptr);
- else {
- if (ip_conntrack_expect_related(exp) != 0)
- ret = NF_DROP;
- else
- ret = NF_ACCEPT;
- }
- ip_conntrack_expect_put(exp);
-
- return ret;
-}
-
-static int sip_help(struct sk_buff **pskb,
- struct ip_conntrack *ct,
- enum ip_conntrack_info ctinfo)
-{
- unsigned int dataoff, datalen;
- const char *dptr;
- int ret = NF_ACCEPT;
- int matchoff, matchlen;
- __be32 ipaddr;
- u_int16_t port;
- typeof(ip_nat_sip_hook) ip_nat_sip;
-
- /* No Data ? */
- dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr);
- if (dataoff >= (*pskb)->len) {
- DEBUGP("skb->len = %u\n", (*pskb)->len);
- return NF_ACCEPT;
- }
-
- ip_ct_refresh(ct, *pskb, sip_timeout * HZ);
-
- if (!skb_is_nonlinear(*pskb))
- dptr = (*pskb)->data + dataoff;
- else {
- DEBUGP("Copy of skbuff not supported yet.\n");
- goto out;
- }
-
- ip_nat_sip = rcu_dereference(ip_nat_sip_hook);
- if (ip_nat_sip) {
- if (!ip_nat_sip(pskb, ctinfo, ct, &dptr)) {
- ret = NF_DROP;
- goto out;
- }
- }
-
- /* After this point NAT, could have mangled skb, so
- we need to recalculate payload lenght. */
- datalen = (*pskb)->len - dataoff;
-
- if (datalen < (sizeof("SIP/2.0 200") - 1))
- goto out;
-
- /* RTP info only in some SDP pkts */
- if (memcmp(dptr, "INVITE", sizeof("INVITE") - 1) != 0 &&
- memcmp(dptr, "SIP/2.0 200", sizeof("SIP/2.0 200") - 1) != 0) {
- goto out;
- }
- /* Get ip and port address from SDP packet. */
- if (ct_sip_get_info(dptr, datalen, &matchoff, &matchlen,
- POS_CONNECTION) > 0) {
-
- /* We'll drop only if there are parse problems. */
- if (parse_ipaddr(dptr + matchoff, NULL, &ipaddr,
- dptr + datalen) < 0) {
- ret = NF_DROP;
- goto out;
- }
- if (ct_sip_get_info(dptr, datalen, &matchoff, &matchlen,
- POS_MEDIA) > 0) {
-
- port = simple_strtoul(dptr + matchoff, NULL, 10);
- if (port < 1024) {
- ret = NF_DROP;
- goto out;
- }
- ret = set_expected_rtp(pskb, ct, ctinfo,
- ipaddr, port, dptr);
- }
- }
-out:
- return ret;
-}
-
-static struct ip_conntrack_helper sip[MAX_PORTS];
-static char sip_names[MAX_PORTS][10];
-
-static void fini(void)
-{
- int i;
- for (i = 0; i < ports_c; i++) {
- DEBUGP("unregistering helper for port %d\n", ports[i]);
- ip_conntrack_helper_unregister(&sip[i]);
- }
-}
-
-static int __init init(void)
-{
- int i, ret;
- char *tmpname;
-
- if (ports_c == 0)
- ports[ports_c++] = SIP_PORT;
-
- for (i = 0; i < ports_c; i++) {
- /* Create helper structure */
- memset(&sip[i], 0, sizeof(struct ip_conntrack_helper));
-
- sip[i].tuple.dst.protonum = IPPROTO_UDP;
- sip[i].tuple.src.u.udp.port = htons(ports[i]);
- sip[i].mask.src.u.udp.port = htons(0xFFFF);
- sip[i].mask.dst.protonum = 0xFF;
- sip[i].max_expected = 2;
- sip[i].timeout = 3 * 60; /* 3 minutes */
- sip[i].me = THIS_MODULE;
- sip[i].help = sip_help;
-
- tmpname = &sip_names[i][0];
- if (ports[i] == SIP_PORT)
- sprintf(tmpname, "sip");
- else
- sprintf(tmpname, "sip-%d", i);
- sip[i].name = tmpname;
-
- DEBUGP("port #%d: %d\n", i, ports[i]);
-
- ret = ip_conntrack_helper_register(&sip[i]);
- if (ret) {
- printk("ERROR registering helper for port %d\n",
- ports[i]);
- fini();
- return ret;
- }
- }
- return 0;
-}
-
-module_init(init);
-module_exit(fini);
diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c
deleted file mode 100644
index 2df67538ffb0..000000000000
--- a/net/ipv4/netfilter/ip_conntrack_standalone.c
+++ /dev/null
@@ -1,967 +0,0 @@
-/* This file contains all the functions required for the standalone
- ip_conntrack module.
-
- These are not required by the compatibility layer.
-*/
-
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2005 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/types.h>
-#include <linux/ip.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv4.h>
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-#include <linux/percpu.h>
-#ifdef CONFIG_SYSCTL
-#include <linux/sysctl.h>
-#endif
-#include <net/checksum.h>
-#include <net/ip.h>
-#include <net/route.h>
-
-#define ASSERT_READ_LOCK(x)
-#define ASSERT_WRITE_LOCK(x)
-
-#include <linux/netfilter_ipv4/ip_conntrack.h>
-#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
-#include <linux/netfilter_ipv4/ip_conntrack_core.h>
-#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
-
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
-MODULE_LICENSE("GPL");
-
-extern atomic_t ip_conntrack_count;
-DECLARE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
-
-static int kill_proto(struct ip_conntrack *i, void *data)
-{
- return (i->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum ==
- *((u_int8_t *) data));
-}
-
-#ifdef CONFIG_PROC_FS
-static int
-print_tuple(struct seq_file *s, const struct ip_conntrack_tuple *tuple,
- struct ip_conntrack_protocol *proto)
-{
- seq_printf(s, "src=%u.%u.%u.%u dst=%u.%u.%u.%u ",
- NIPQUAD(tuple->src.ip), NIPQUAD(tuple->dst.ip));
- return proto->print_tuple(s, tuple);
-}
-
-#ifdef CONFIG_IP_NF_CT_ACCT
-static unsigned int
-seq_print_counters(struct seq_file *s,
- const struct ip_conntrack_counter *counter)
-{
- return seq_printf(s, "packets=%llu bytes=%llu ",
- (unsigned long long)counter->packets,
- (unsigned long long)counter->bytes);
-}
-#else
-#define seq_print_counters(x, y) 0
-#endif
-
-struct ct_iter_state {
- unsigned int bucket;
-};
-
-static struct list_head *ct_get_first(struct seq_file *seq)
-{
- struct ct_iter_state *st = seq->private;
-
- for (st->bucket = 0;
- st->bucket < ip_conntrack_htable_size;
- st->bucket++) {
- if (!list_empty(&ip_conntrack_hash[st->bucket]))
- return ip_conntrack_hash[st->bucket].next;
- }
- return NULL;
-}
-
-static struct list_head *ct_get_next(struct seq_file *seq, struct list_head *head)
-{
- struct ct_iter_state *st = seq->private;
-
- head = head->next;
- while (head == &ip_conntrack_hash[st->bucket]) {
- if (++st->bucket >= ip_conntrack_htable_size)
- return NULL;
- head = ip_conntrack_hash[st->bucket].next;
- }
- return head;
-}
-
-static struct list_head *ct_get_idx(struct seq_file *seq, loff_t pos)
-{
- struct list_head *head = ct_get_first(seq);
-
- if (head)
- while (pos && (head = ct_get_next(seq, head)))
- pos--;
- return pos ? NULL : head;
-}
-
-static void *ct_seq_start(struct seq_file *seq, loff_t *pos)
-{
- read_lock_bh(&ip_conntrack_lock);
- return ct_get_idx(seq, *pos);
-}
-
-static void *ct_seq_next(struct seq_file *s, void *v, loff_t *pos)
-{
- (*pos)++;
- return ct_get_next(s, v);
-}
-
-static void ct_seq_stop(struct seq_file *s, void *v)
-{
- read_unlock_bh(&ip_conntrack_lock);
-}
-
-static int ct_seq_show(struct seq_file *s, void *v)
-{
- const struct ip_conntrack_tuple_hash *hash = v;
- const struct ip_conntrack *conntrack = tuplehash_to_ctrack(hash);
- struct ip_conntrack_protocol *proto;
-
- ASSERT_READ_LOCK(&ip_conntrack_lock);
- IP_NF_ASSERT(conntrack);
-
- /* we only want to print DIR_ORIGINAL */
- if (DIRECTION(hash))
- return 0;
-
- proto = __ip_conntrack_proto_find(conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum);
- IP_NF_ASSERT(proto);
-
- if (seq_printf(s, "%-8s %u %ld ",
- proto->name,
- conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum,
- timer_pending(&conntrack->timeout)
- ? (long)(conntrack->timeout.expires - jiffies)/HZ
- : 0) != 0)
- return -ENOSPC;
-
- if (proto->print_conntrack(s, conntrack))
- return -ENOSPC;
-
- if (print_tuple(s, &conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
- proto))
- return -ENOSPC;
-
- if (seq_print_counters(s, &conntrack->counters[IP_CT_DIR_ORIGINAL]))
- return -ENOSPC;
-
- if (!(test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)))
- if (seq_printf(s, "[UNREPLIED] "))
- return -ENOSPC;
-
- if (print_tuple(s, &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple,
- proto))
- return -ENOSPC;
-
- if (seq_print_counters(s, &conntrack->counters[IP_CT_DIR_REPLY]))
- return -ENOSPC;
-
- if (test_bit(IPS_ASSURED_BIT, &conntrack->status))
- if (seq_printf(s, "[ASSURED] "))
- return -ENOSPC;
-
-#if defined(CONFIG_IP_NF_CONNTRACK_MARK)
- if (seq_printf(s, "mark=%u ", conntrack->mark))
- return -ENOSPC;
-#endif
-
-#ifdef CONFIG_IP_NF_CONNTRACK_SECMARK
- if (seq_printf(s, "secmark=%u ", conntrack->secmark))
- return -ENOSPC;
-#endif
-
- if (seq_printf(s, "use=%u\n", atomic_read(&conntrack->ct_general.use)))
- return -ENOSPC;
-
- return 0;
-}
-
-static struct seq_operations ct_seq_ops = {
- .start = ct_seq_start,
- .next = ct_seq_next,
- .stop = ct_seq_stop,
- .show = ct_seq_show
-};
-
-static int ct_open(struct inode *inode, struct file *file)
-{
- struct seq_file *seq;
- struct ct_iter_state *st;
- int ret;
-
- st = kmalloc(sizeof(struct ct_iter_state), GFP_KERNEL);
- if (st == NULL)
- return -ENOMEM;
- ret = seq_open(file, &ct_seq_ops);
- if (ret)
- goto out_free;
- seq = file->private_data;
- seq->private = st;
- memset(st, 0, sizeof(struct ct_iter_state));
- return ret;
-out_free:
- kfree(st);
- return ret;
-}
-
-static struct file_operations ct_file_ops = {
- .owner = THIS_MODULE,
- .open = ct_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release_private,
-};
-
-/* expects */
-static void *exp_seq_start(struct seq_file *s, loff_t *pos)
-{
- struct list_head *e = &ip_conntrack_expect_list;
- loff_t i;
-
- /* strange seq_file api calls stop even if we fail,
- * thus we need to grab lock since stop unlocks */
- read_lock_bh(&ip_conntrack_lock);
-
- if (list_empty(e))
- return NULL;
-
- for (i = 0; i <= *pos; i++) {
- e = e->next;
- if (e == &ip_conntrack_expect_list)
- return NULL;
- }
- return e;
-}
-
-static void *exp_seq_next(struct seq_file *s, void *v, loff_t *pos)
-{
- struct list_head *e = v;
-
- ++*pos;
- e = e->next;
-
- if (e == &ip_conntrack_expect_list)
- return NULL;
-
- return e;
-}
-
-static void exp_seq_stop(struct seq_file *s, void *v)
-{
- read_unlock_bh(&ip_conntrack_lock);
-}
-
-static int exp_seq_show(struct seq_file *s, void *v)
-{
- struct ip_conntrack_expect *expect = v;
-
- if (expect->timeout.function)
- seq_printf(s, "%ld ", timer_pending(&expect->timeout)
- ? (long)(expect->timeout.expires - jiffies)/HZ : 0);
- else
- seq_printf(s, "- ");
-
- seq_printf(s, "proto=%u ", expect->tuple.dst.protonum);
-
- print_tuple(s, &expect->tuple,
- __ip_conntrack_proto_find(expect->tuple.dst.protonum));
- return seq_putc(s, '\n');
-}
-
-static struct seq_operations exp_seq_ops = {
- .start = exp_seq_start,
- .next = exp_seq_next,
- .stop = exp_seq_stop,
- .show = exp_seq_show
-};
-
-static int exp_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &exp_seq_ops);
-}
-
-static struct file_operations exp_file_ops = {
- .owner = THIS_MODULE,
- .open = exp_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release
-};
-
-static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos)
-{
- int cpu;
-
- if (*pos == 0)
- return SEQ_START_TOKEN;
-
- for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
- if (!cpu_possible(cpu))
- continue;
- *pos = cpu+1;
- return &per_cpu(ip_conntrack_stat, cpu);
- }
-
- return NULL;
-}
-
-static void *ct_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
-{
- int cpu;
-
- for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
- if (!cpu_possible(cpu))
- continue;
- *pos = cpu+1;
- return &per_cpu(ip_conntrack_stat, cpu);
- }
-
- return NULL;
-}
-
-static void ct_cpu_seq_stop(struct seq_file *seq, void *v)
-{
-}
-
-static int ct_cpu_seq_show(struct seq_file *seq, void *v)
-{
- unsigned int nr_conntracks = atomic_read(&ip_conntrack_count);
- struct ip_conntrack_stat *st = v;
-
- if (v == SEQ_START_TOKEN) {
- seq_printf(seq, "entries searched found new invalid ignore delete delete_list insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete\n");
- return 0;
- }
-
- seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x "
- "%08x %08x %08x %08x %08x %08x %08x %08x \n",
- nr_conntracks,
- st->searched,
- st->found,
- st->new,
- st->invalid,
- st->ignore,
- st->delete,
- st->delete_list,
- st->insert,
- st->insert_failed,
- st->drop,
- st->early_drop,
- st->error,
-
- st->expect_new,
- st->expect_create,
- st->expect_delete
- );
- return 0;
-}
-
-static struct seq_operations ct_cpu_seq_ops = {
- .start = ct_cpu_seq_start,
- .next = ct_cpu_seq_next,
- .stop = ct_cpu_seq_stop,
- .show = ct_cpu_seq_show,
-};
-
-static int ct_cpu_seq_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &ct_cpu_seq_ops);
-}
-
-static struct file_operations ct_cpu_seq_fops = {
- .owner = THIS_MODULE,
- .open = ct_cpu_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release_private,
-};
-#endif
-
-static unsigned int ip_confirm(unsigned int hooknum,
- struct sk_buff **pskb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
-{
- /* We've seen it coming out the other side: confirm it */
- return ip_conntrack_confirm(pskb);
-}
-
-static unsigned int ip_conntrack_help(unsigned int hooknum,
- struct sk_buff **pskb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
-{
- struct ip_conntrack *ct;
- enum ip_conntrack_info ctinfo;
-
- /* This is where we call the helper: as the packet goes out. */
- ct = ip_conntrack_get(*pskb, &ctinfo);
- if (ct && ct->helper && ctinfo != IP_CT_RELATED + IP_CT_IS_REPLY) {
- unsigned int ret;
- ret = ct->helper->help(pskb, ct, ctinfo);
- if (ret != NF_ACCEPT)
- return ret;
- }
- return NF_ACCEPT;
-}
-
-static unsigned int ip_conntrack_defrag(unsigned int hooknum,
- struct sk_buff **pskb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
-{
-#if !defined(CONFIG_IP_NF_NAT) && !defined(CONFIG_IP_NF_NAT_MODULE)
- /* Previously seen (loopback)? Ignore. Do this before
- fragment check. */
- if ((*pskb)->nfct)
- return NF_ACCEPT;
-#endif
-
- /* Gather fragments. */
- if ((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) {
- *pskb = ip_ct_gather_frags(*pskb,
- hooknum == NF_IP_PRE_ROUTING ?
- IP_DEFRAG_CONNTRACK_IN :
- IP_DEFRAG_CONNTRACK_OUT);
- if (!*pskb)
- return NF_STOLEN;
- }
- return NF_ACCEPT;
-}
-
-static unsigned int ip_conntrack_local(unsigned int hooknum,
- struct sk_buff **pskb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
-{
- /* root is playing with raw sockets. */
- if ((*pskb)->len < sizeof(struct iphdr)
- || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) {
- if (net_ratelimit())
- printk("ipt_hook: happy cracking.\n");
- return NF_ACCEPT;
- }
- return ip_conntrack_in(hooknum, pskb, in, out, okfn);
-}
-
-/* Connection tracking may drop packets, but never alters them, so
- make it the first hook. */
-static struct nf_hook_ops ip_conntrack_ops[] = {
- {
- .hook = ip_conntrack_defrag,
- .owner = THIS_MODULE,
- .pf = PF_INET,
- .hooknum = NF_IP_PRE_ROUTING,
- .priority = NF_IP_PRI_CONNTRACK_DEFRAG,
- },
- {
- .hook = ip_conntrack_in,
- .owner = THIS_MODULE,
- .pf = PF_INET,
- .hooknum = NF_IP_PRE_ROUTING,
- .priority = NF_IP_PRI_CONNTRACK,
- },
- {
- .hook = ip_conntrack_defrag,
- .owner = THIS_MODULE,
- .pf = PF_INET,
- .hooknum = NF_IP_LOCAL_OUT,
- .priority = NF_IP_PRI_CONNTRACK_DEFRAG,
- },
- {
- .hook = ip_conntrack_local,
- .owner = THIS_MODULE,
- .pf = PF_INET,
- .hooknum = NF_IP_LOCAL_OUT,
- .priority = NF_IP_PRI_CONNTRACK,
- },
- {
- .hook = ip_conntrack_help,
- .owner = THIS_MODULE,
- .pf = PF_INET,
- .hooknum = NF_IP_POST_ROUTING,
- .priority = NF_IP_PRI_CONNTRACK_HELPER,
- },
- {
- .hook = ip_conntrack_help,
- .owner = THIS_MODULE,
- .pf = PF_INET,
- .hooknum = NF_IP_LOCAL_IN,
- .priority = NF_IP_PRI_CONNTRACK_HELPER,
- },
- {
- .hook = ip_confirm,
- .owner = THIS_MODULE,
- .pf = PF_INET,
- .hooknum = NF_IP_POST_ROUTING,
- .priority = NF_IP_PRI_CONNTRACK_CONFIRM,
- },
- {
- .hook = ip_confirm,
- .owner = THIS_MODULE,
- .pf = PF_INET,
- .hooknum = NF_IP_LOCAL_IN,
- .priority = NF_IP_PRI_CONNTRACK_CONFIRM,
- },
-};
-
-/* Sysctl support */
-
-int ip_conntrack_checksum __read_mostly = 1;
-
-#ifdef CONFIG_SYSCTL
-
-/* From ip_conntrack_core.c */
-extern int ip_conntrack_max;
-extern unsigned int ip_conntrack_htable_size;
-
-/* From ip_conntrack_proto_tcp.c */
-extern unsigned int ip_ct_tcp_timeout_syn_sent;
-extern unsigned int ip_ct_tcp_timeout_syn_recv;
-extern unsigned int ip_ct_tcp_timeout_established;
-extern unsigned int ip_ct_tcp_timeout_fin_wait;
-extern unsigned int ip_ct_tcp_timeout_close_wait;
-extern unsigned int ip_ct_tcp_timeout_last_ack;
-extern unsigned int ip_ct_tcp_timeout_time_wait;
-extern unsigned int ip_ct_tcp_timeout_close;
-extern unsigned int ip_ct_tcp_timeout_max_retrans;
-extern int ip_ct_tcp_loose;
-extern int ip_ct_tcp_be_liberal;
-extern int ip_ct_tcp_max_retrans;
-
-/* From ip_conntrack_proto_udp.c */
-extern unsigned int ip_ct_udp_timeout;
-extern unsigned int ip_ct_udp_timeout_stream;
-
-/* From ip_conntrack_proto_icmp.c */
-extern unsigned int ip_ct_icmp_timeout;
-
-/* From ip_conntrack_proto_generic.c */
-extern unsigned int ip_ct_generic_timeout;
-
-/* Log invalid packets of a given protocol */
-static int log_invalid_proto_min = 0;
-static int log_invalid_proto_max = 255;
-
-static struct ctl_table_header *ip_ct_sysctl_header;
-
-static ctl_table ip_ct_sysctl_table[] = {
- {
- .ctl_name = NET_IPV4_NF_CONNTRACK_MAX,
- .procname = "ip_conntrack_max",
- .data = &ip_conntrack_max,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_IPV4_NF_CONNTRACK_COUNT,
- .procname = "ip_conntrack_count",
- .data = &ip_conntrack_count,
- .maxlen = sizeof(int),
- .mode = 0444,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_IPV4_NF_CONNTRACK_BUCKETS,
- .procname = "ip_conntrack_buckets",
- .data = &ip_conntrack_htable_size,
- .maxlen = sizeof(unsigned int),
- .mode = 0444,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_IPV4_NF_CONNTRACK_CHECKSUM,
- .procname = "ip_conntrack_checksum",
- .data = &ip_conntrack_checksum,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT,
- .procname = "ip_conntrack_tcp_timeout_syn_sent",
- .data = &ip_ct_tcp_timeout_syn_sent,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- },
- {
- .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV,
- .procname = "ip_conntrack_tcp_timeout_syn_recv",
- .data = &ip_ct_tcp_timeout_syn_recv,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- },
- {
- .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED,
- .procname = "ip_conntrack_tcp_timeout_established",
- .data = &ip_ct_tcp_timeout_established,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- },
- {
- .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT,
- .procname = "ip_conntrack_tcp_timeout_fin_wait",
- .data = &ip_ct_tcp_timeout_fin_wait,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- },
- {
- .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT,
- .procname = "ip_conntrack_tcp_timeout_close_wait",
- .data = &ip_ct_tcp_timeout_close_wait,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- },
- {
- .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK,
- .procname = "ip_conntrack_tcp_timeout_last_ack",
- .data = &ip_ct_tcp_timeout_last_ack,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- },
- {
- .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT,
- .procname = "ip_conntrack_tcp_timeout_time_wait",
- .data = &ip_ct_tcp_timeout_time_wait,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- },
- {
- .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE,
- .procname = "ip_conntrack_tcp_timeout_close",
- .data = &ip_ct_tcp_timeout_close,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- },
- {
- .ctl_name = NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT,
- .procname = "ip_conntrack_udp_timeout",
- .data = &ip_ct_udp_timeout,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- },
- {
- .ctl_name = NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT_STREAM,
- .procname = "ip_conntrack_udp_timeout_stream",
- .data = &ip_ct_udp_timeout_stream,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- },
- {
- .ctl_name = NET_IPV4_NF_CONNTRACK_ICMP_TIMEOUT,
- .procname = "ip_conntrack_icmp_timeout",
- .data = &ip_ct_icmp_timeout,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- },
- {
- .ctl_name = NET_IPV4_NF_CONNTRACK_GENERIC_TIMEOUT,
- .procname = "ip_conntrack_generic_timeout",
- .data = &ip_ct_generic_timeout,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- },
- {
- .ctl_name = NET_IPV4_NF_CONNTRACK_LOG_INVALID,
- .procname = "ip_conntrack_log_invalid",
- .data = &ip_ct_log_invalid,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_minmax,
- .strategy = &sysctl_intvec,
- .extra1 = &log_invalid_proto_min,
- .extra2 = &log_invalid_proto_max,
- },
- {
- .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS,
- .procname = "ip_conntrack_tcp_timeout_max_retrans",
- .data = &ip_ct_tcp_timeout_max_retrans,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- },
- {
- .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_LOOSE,
- .procname = "ip_conntrack_tcp_loose",
- .data = &ip_ct_tcp_loose,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_BE_LIBERAL,
- .procname = "ip_conntrack_tcp_be_liberal",
- .data = &ip_ct_tcp_be_liberal,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_MAX_RETRANS,
- .procname = "ip_conntrack_tcp_max_retrans",
- .data = &ip_ct_tcp_max_retrans,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- { .ctl_name = 0 }
-};
-
-#define NET_IP_CONNTRACK_MAX 2089
-
-static ctl_table ip_ct_netfilter_table[] = {
- {
- .ctl_name = NET_IPV4_NETFILTER,
- .procname = "netfilter",
- .mode = 0555,
- .child = ip_ct_sysctl_table,
- },
- {
- .ctl_name = NET_IP_CONNTRACK_MAX,
- .procname = "ip_conntrack_max",
- .data = &ip_conntrack_max,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec
- },
- { .ctl_name = 0 }
-};
-
-static ctl_table ip_ct_ipv4_table[] = {
- {
- .ctl_name = NET_IPV4,
- .procname = "ipv4",
- .mode = 0555,
- .child = ip_ct_netfilter_table,
- },
- { .ctl_name = 0 }
-};
-
-static ctl_table ip_ct_net_table[] = {
- {
- .ctl_name = CTL_NET,
- .procname = "net",
- .mode = 0555,
- .child = ip_ct_ipv4_table,
- },
- { .ctl_name = 0 }
-};
-
-EXPORT_SYMBOL(ip_ct_log_invalid);
-#endif /* CONFIG_SYSCTL */
-
-/* FIXME: Allow NULL functions and sub in pointers to generic for
- them. --RR */
-int ip_conntrack_protocol_register(struct ip_conntrack_protocol *proto)
-{
- int ret = 0;
-
- write_lock_bh(&ip_conntrack_lock);
- if (ip_ct_protos[proto->proto] != &ip_conntrack_generic_protocol) {
- ret = -EBUSY;
- goto out;
- }
- ip_ct_protos[proto->proto] = proto;
- out:
- write_unlock_bh(&ip_conntrack_lock);
- return ret;
-}
-
-void ip_conntrack_protocol_unregister(struct ip_conntrack_protocol *proto)
-{
- write_lock_bh(&ip_conntrack_lock);
- ip_ct_protos[proto->proto] = &ip_conntrack_generic_protocol;
- write_unlock_bh(&ip_conntrack_lock);
-
- /* Somebody could be still looking at the proto in bh. */
- synchronize_net();
-
- /* Remove all contrack entries for this protocol */
- ip_ct_iterate_cleanup(kill_proto, &proto->proto);
-}
-
-static int __init ip_conntrack_standalone_init(void)
-{
-#ifdef CONFIG_PROC_FS
- struct proc_dir_entry *proc, *proc_exp, *proc_stat;
-#endif
- int ret = 0;
-
- ret = ip_conntrack_init();
- if (ret < 0)
- return ret;
-
-#ifdef CONFIG_PROC_FS
- ret = -ENOMEM;
- proc = proc_net_fops_create("ip_conntrack", 0440, &ct_file_ops);
- if (!proc) goto cleanup_init;
-
- proc_exp = proc_net_fops_create("ip_conntrack_expect", 0440,
- &exp_file_ops);
- if (!proc_exp) goto cleanup_proc;
-
- proc_stat = create_proc_entry("ip_conntrack", S_IRUGO, proc_net_stat);
- if (!proc_stat)
- goto cleanup_proc_exp;
-
- proc_stat->proc_fops = &ct_cpu_seq_fops;
- proc_stat->owner = THIS_MODULE;
-#endif
-
- ret = nf_register_hooks(ip_conntrack_ops, ARRAY_SIZE(ip_conntrack_ops));
- if (ret < 0) {
- printk("ip_conntrack: can't register hooks.\n");
- goto cleanup_proc_stat;
- }
-#ifdef CONFIG_SYSCTL
- ip_ct_sysctl_header = register_sysctl_table(ip_ct_net_table, 0);
- if (ip_ct_sysctl_header == NULL) {
- printk("ip_conntrack: can't register to sysctl.\n");
- ret = -ENOMEM;
- goto cleanup_hooks;
- }
-#endif
- return ret;
-
-#ifdef CONFIG_SYSCTL
- cleanup_hooks:
- nf_unregister_hooks(ip_conntrack_ops, ARRAY_SIZE(ip_conntrack_ops));
-#endif
- cleanup_proc_stat:
-#ifdef CONFIG_PROC_FS
- remove_proc_entry("ip_conntrack", proc_net_stat);
- cleanup_proc_exp:
- proc_net_remove("ip_conntrack_expect");
- cleanup_proc:
- proc_net_remove("ip_conntrack");
- cleanup_init:
-#endif /* CONFIG_PROC_FS */
- ip_conntrack_cleanup();
- return ret;
-}
-
-static void __exit ip_conntrack_standalone_fini(void)
-{
- synchronize_net();
-#ifdef CONFIG_SYSCTL
- unregister_sysctl_table(ip_ct_sysctl_header);
-#endif
- nf_unregister_hooks(ip_conntrack_ops, ARRAY_SIZE(ip_conntrack_ops));
-#ifdef CONFIG_PROC_FS
- remove_proc_entry("ip_conntrack", proc_net_stat);
- proc_net_remove("ip_conntrack_expect");
- proc_net_remove("ip_conntrack");
-#endif /* CONFIG_PROC_FS */
- ip_conntrack_cleanup();
-}
-
-module_init(ip_conntrack_standalone_init);
-module_exit(ip_conntrack_standalone_fini);
-
-/* Some modules need us, but don't depend directly on any symbol.
- They should call this. */
-void need_conntrack(void)
-{
-}
-
-#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
-EXPORT_SYMBOL_GPL(ip_conntrack_chain);
-EXPORT_SYMBOL_GPL(ip_conntrack_expect_chain);
-EXPORT_SYMBOL_GPL(ip_conntrack_register_notifier);
-EXPORT_SYMBOL_GPL(ip_conntrack_unregister_notifier);
-EXPORT_SYMBOL_GPL(__ip_ct_event_cache_init);
-EXPORT_PER_CPU_SYMBOL_GPL(ip_conntrack_ecache);
-#endif
-EXPORT_SYMBOL(ip_conntrack_protocol_register);
-EXPORT_SYMBOL(ip_conntrack_protocol_unregister);
-EXPORT_SYMBOL(ip_ct_get_tuple);
-EXPORT_SYMBOL(invert_tuplepr);
-EXPORT_SYMBOL(ip_conntrack_alter_reply);
-EXPORT_SYMBOL(ip_conntrack_destroyed);
-EXPORT_SYMBOL(need_conntrack);
-EXPORT_SYMBOL(ip_conntrack_helper_register);
-EXPORT_SYMBOL(ip_conntrack_helper_unregister);
-EXPORT_SYMBOL(ip_ct_iterate_cleanup);
-EXPORT_SYMBOL(__ip_ct_refresh_acct);
-
-EXPORT_SYMBOL(ip_conntrack_expect_alloc);
-EXPORT_SYMBOL(ip_conntrack_expect_put);
-EXPORT_SYMBOL_GPL(__ip_conntrack_expect_find);
-EXPORT_SYMBOL_GPL(ip_conntrack_expect_find_get);
-EXPORT_SYMBOL(ip_conntrack_expect_related);
-EXPORT_SYMBOL(ip_conntrack_unexpect_related);
-EXPORT_SYMBOL_GPL(ip_conntrack_expect_list);
-EXPORT_SYMBOL_GPL(ip_ct_unlink_expect);
-
-EXPORT_SYMBOL(ip_conntrack_tuple_taken);
-EXPORT_SYMBOL(ip_ct_gather_frags);
-EXPORT_SYMBOL(ip_conntrack_htable_size);
-EXPORT_SYMBOL(ip_conntrack_lock);
-EXPORT_SYMBOL(ip_conntrack_hash);
-EXPORT_SYMBOL(ip_conntrack_untracked);
-EXPORT_SYMBOL_GPL(ip_conntrack_find_get);
-#ifdef CONFIG_IP_NF_NAT_NEEDED
-EXPORT_SYMBOL(ip_conntrack_tcp_update);
-#endif
-
-EXPORT_SYMBOL_GPL(ip_conntrack_flush);
-EXPORT_SYMBOL_GPL(__ip_conntrack_find);
-
-EXPORT_SYMBOL_GPL(ip_conntrack_alloc);
-EXPORT_SYMBOL_GPL(ip_conntrack_free);
-EXPORT_SYMBOL_GPL(ip_conntrack_hash_insert);
-
-EXPORT_SYMBOL_GPL(ip_ct_remove_expectations);
-
-EXPORT_SYMBOL_GPL(ip_conntrack_helper_find_get);
-EXPORT_SYMBOL_GPL(ip_conntrack_helper_put);
-EXPORT_SYMBOL_GPL(__ip_conntrack_helper_find_byname);
-
-EXPORT_SYMBOL_GPL(ip_conntrack_proto_find_get);
-EXPORT_SYMBOL_GPL(ip_conntrack_proto_put);
-EXPORT_SYMBOL_GPL(__ip_conntrack_proto_find);
-EXPORT_SYMBOL_GPL(ip_conntrack_checksum);
-#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
- defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
-EXPORT_SYMBOL_GPL(ip_ct_port_tuple_to_nfattr);
-EXPORT_SYMBOL_GPL(ip_ct_port_nfattr_to_tuple);
-#endif
diff --git a/net/ipv4/netfilter/ip_conntrack_tftp.c b/net/ipv4/netfilter/ip_conntrack_tftp.c
deleted file mode 100644
index ef56de2eff0c..000000000000
--- a/net/ipv4/netfilter/ip_conntrack_tftp.c
+++ /dev/null
@@ -1,161 +0,0 @@
-/* (C) 2001-2002 Magnus Boden <mb@ozaba.mine.nu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Version: 0.0.7
- *
- * Thu 21 Mar 2002 Harald Welte <laforge@gnumonks.org>
- * - port to newnat API
- *
- */
-
-#include <linux/module.h>
-#include <linux/ip.h>
-#include <linux/udp.h>
-
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv4/ip_tables.h>
-#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
-#include <linux/netfilter_ipv4/ip_conntrack_tftp.h>
-#include <linux/moduleparam.h>
-
-MODULE_AUTHOR("Magnus Boden <mb@ozaba.mine.nu>");
-MODULE_DESCRIPTION("tftp connection tracking helper");
-MODULE_LICENSE("GPL");
-
-#define MAX_PORTS 8
-static unsigned short ports[MAX_PORTS];
-static int ports_c;
-module_param_array(ports, ushort, &ports_c, 0400);
-MODULE_PARM_DESC(ports, "port numbers of tftp servers");
-
-#if 0
-#define DEBUGP(format, args...) printk("%s:%s:" format, \
- __FILE__, __FUNCTION__ , ## args)
-#else
-#define DEBUGP(format, args...)
-#endif
-
-unsigned int (*ip_nat_tftp_hook)(struct sk_buff **pskb,
- enum ip_conntrack_info ctinfo,
- struct ip_conntrack_expect *exp);
-EXPORT_SYMBOL_GPL(ip_nat_tftp_hook);
-
-static int tftp_help(struct sk_buff **pskb,
- struct ip_conntrack *ct,
- enum ip_conntrack_info ctinfo)
-{
- struct tftphdr _tftph, *tfh;
- struct ip_conntrack_expect *exp;
- unsigned int ret = NF_ACCEPT;
- typeof(ip_nat_tftp_hook) ip_nat_tftp;
-
- tfh = skb_header_pointer(*pskb,
- (*pskb)->nh.iph->ihl*4+sizeof(struct udphdr),
- sizeof(_tftph), &_tftph);
- if (tfh == NULL)
- return NF_ACCEPT;
-
- switch (ntohs(tfh->opcode)) {
- /* RRQ and WRQ works the same way */
- case TFTP_OPCODE_READ:
- case TFTP_OPCODE_WRITE:
- DEBUGP("");
- DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
- DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
-
- exp = ip_conntrack_expect_alloc(ct);
- if (exp == NULL)
- return NF_DROP;
-
- exp->tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
- exp->mask.src.ip = htonl(0xffffffff);
- exp->mask.src.u.udp.port = 0;
- exp->mask.dst.ip = htonl(0xffffffff);
- exp->mask.dst.u.udp.port = htons(0xffff);
- exp->mask.dst.protonum = 0xff;
- exp->expectfn = NULL;
- exp->flags = 0;
-
- DEBUGP("expect: ");
- DUMP_TUPLE(&exp->tuple);
- DUMP_TUPLE(&exp->mask);
- ip_nat_tftp = rcu_dereference(ip_nat_tftp_hook);
- if (ip_nat_tftp)
- ret = ip_nat_tftp(pskb, ctinfo, exp);
- else if (ip_conntrack_expect_related(exp) != 0)
- ret = NF_DROP;
- ip_conntrack_expect_put(exp);
- break;
- case TFTP_OPCODE_DATA:
- case TFTP_OPCODE_ACK:
- DEBUGP("Data/ACK opcode\n");
- break;
- case TFTP_OPCODE_ERROR:
- DEBUGP("Error opcode\n");
- break;
- default:
- DEBUGP("Unknown opcode\n");
- }
- return NF_ACCEPT;
-}
-
-static struct ip_conntrack_helper tftp[MAX_PORTS];
-static char tftp_names[MAX_PORTS][sizeof("tftp-65535")];
-
-static void ip_conntrack_tftp_fini(void)
-{
- int i;
-
- for (i = 0 ; i < ports_c; i++) {
- DEBUGP("unregistering helper for port %d\n",
- ports[i]);
- ip_conntrack_helper_unregister(&tftp[i]);
- }
-}
-
-static int __init ip_conntrack_tftp_init(void)
-{
- int i, ret;
- char *tmpname;
-
- if (ports_c == 0)
- ports[ports_c++] = TFTP_PORT;
-
- for (i = 0; i < ports_c; i++) {
- /* Create helper structure */
- memset(&tftp[i], 0, sizeof(struct ip_conntrack_helper));
-
- tftp[i].tuple.dst.protonum = IPPROTO_UDP;
- tftp[i].tuple.src.u.udp.port = htons(ports[i]);
- tftp[i].mask.dst.protonum = 0xFF;
- tftp[i].mask.src.u.udp.port = htons(0xFFFF);
- tftp[i].max_expected = 1;
- tftp[i].timeout = 5 * 60; /* 5 minutes */
- tftp[i].me = THIS_MODULE;
- tftp[i].help = tftp_help;
-
- tmpname = &tftp_names[i][0];
- if (ports[i] == TFTP_PORT)
- sprintf(tmpname, "tftp");
- else
- sprintf(tmpname, "tftp-%d", i);
- tftp[i].name = tmpname;
-
- DEBUGP("port #%d: %d\n", i, ports[i]);
-
- ret=ip_conntrack_helper_register(&tftp[i]);
- if (ret) {
- printk("ERROR registering helper for port %d\n",
- ports[i]);
- ip_conntrack_tftp_fini();
- return(ret);
- }
- }
- return(0);
-}
-
-module_init(ip_conntrack_tftp_init);
-module_exit(ip_conntrack_tftp_fini);
diff --git a/net/ipv4/netfilter/ip_nat_amanda.c b/net/ipv4/netfilter/ip_nat_amanda.c
deleted file mode 100644
index 85df1a9aed33..000000000000
--- a/net/ipv4/netfilter/ip_nat_amanda.c
+++ /dev/null
@@ -1,85 +0,0 @@
-/* Amanda extension for TCP NAT alteration.
- * (C) 2002 by Brian J. Murrell <netfilter@interlinx.bc.ca>
- * based on a copy of HW's ip_nat_irc.c as well as other modules
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * Module load syntax:
- * insmod ip_nat_amanda.o
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/netfilter.h>
-#include <linux/skbuff.h>
-#include <linux/ip.h>
-#include <linux/udp.h>
-#include <net/tcp.h>
-#include <net/udp.h>
-
-#include <linux/netfilter_ipv4.h>
-#include <linux/netfilter_ipv4/ip_nat.h>
-#include <linux/netfilter_ipv4/ip_nat_helper.h>
-#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
-#include <linux/netfilter_ipv4/ip_conntrack_amanda.h>
-
-
-MODULE_AUTHOR("Brian J. Murrell <netfilter@interlinx.bc.ca>");
-MODULE_DESCRIPTION("Amanda NAT helper");
-MODULE_LICENSE("GPL");
-
-static unsigned int help(struct sk_buff **pskb,
- enum ip_conntrack_info ctinfo,
- unsigned int matchoff,
- unsigned int matchlen,
- struct ip_conntrack_expect *exp)
-{
- char buffer[sizeof("65535")];
- u_int16_t port;
- unsigned int ret;
-
- /* Connection comes from client. */
- exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
- exp->dir = IP_CT_DIR_ORIGINAL;
-
- /* When you see the packet, we need to NAT it the same as the
- * this one (ie. same IP: it will be TCP and master is UDP). */
- exp->expectfn = ip_nat_follow_master;
-
- /* Try to get same port: if not, try to change it. */
- for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
- exp->tuple.dst.u.tcp.port = htons(port);
- if (ip_conntrack_expect_related(exp) == 0)
- break;
- }
-
- if (port == 0)
- return NF_DROP;
-
- sprintf(buffer, "%u", port);
- ret = ip_nat_mangle_udp_packet(pskb, exp->master, ctinfo,
- matchoff, matchlen,
- buffer, strlen(buffer));
- if (ret != NF_ACCEPT)
- ip_conntrack_unexpect_related(exp);
- return ret;
-}
-
-static void __exit ip_nat_amanda_fini(void)
-{
- rcu_assign_pointer(ip_nat_amanda_hook, NULL);
- synchronize_rcu();
-}
-
-static int __init ip_nat_amanda_init(void)
-{
- BUG_ON(rcu_dereference(ip_nat_amanda_hook));
- rcu_assign_pointer(ip_nat_amanda_hook, help);
- return 0;
-}
-
-module_init(ip_nat_amanda_init);
-module_exit(ip_nat_amanda_fini);
diff --git a/net/ipv4/netfilter/ip_nat_core.c b/net/ipv4/netfilter/ip_nat_core.c
deleted file mode 100644
index 9d1a5175dcd4..000000000000
--- a/net/ipv4/netfilter/ip_nat_core.c
+++ /dev/null
@@ -1,626 +0,0 @@
-/* NAT for netfilter; shared with compatibility layer. */
-
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/timer.h>
-#include <linux/skbuff.h>
-#include <linux/netfilter_ipv4.h>
-#include <linux/vmalloc.h>
-#include <net/checksum.h>
-#include <net/icmp.h>
-#include <net/ip.h>
-#include <net/tcp.h> /* For tcp_prot in getorigdst */
-#include <linux/icmp.h>
-#include <linux/udp.h>
-#include <linux/jhash.h>
-
-#include <linux/netfilter_ipv4/ip_conntrack.h>
-#include <linux/netfilter_ipv4/ip_conntrack_core.h>
-#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
-#include <linux/netfilter_ipv4/ip_nat.h>
-#include <linux/netfilter_ipv4/ip_nat_protocol.h>
-#include <linux/netfilter_ipv4/ip_nat_core.h>
-#include <linux/netfilter_ipv4/ip_nat_helper.h>
-#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
-
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
-DEFINE_RWLOCK(ip_nat_lock);
-
-/* Calculated at init based on memory size */
-static unsigned int ip_nat_htable_size;
-
-static struct list_head *bysource;
-
-#define MAX_IP_NAT_PROTO 256
-static struct ip_nat_protocol *ip_nat_protos[MAX_IP_NAT_PROTO];
-
-static inline struct ip_nat_protocol *
-__ip_nat_proto_find(u_int8_t protonum)
-{
- return ip_nat_protos[protonum];
-}
-
-struct ip_nat_protocol *
-ip_nat_proto_find_get(u_int8_t protonum)
-{
- struct ip_nat_protocol *p;
-
- /* we need to disable preemption to make sure 'p' doesn't get
- * removed until we've grabbed the reference */
- preempt_disable();
- p = __ip_nat_proto_find(protonum);
- if (!try_module_get(p->me))
- p = &ip_nat_unknown_protocol;
- preempt_enable();
-
- return p;
-}
-EXPORT_SYMBOL_GPL(ip_nat_proto_find_get);
-
-void
-ip_nat_proto_put(struct ip_nat_protocol *p)
-{
- module_put(p->me);
-}
-EXPORT_SYMBOL_GPL(ip_nat_proto_put);
-
-/* We keep an extra hash for each conntrack, for fast searching. */
-static inline unsigned int
-hash_by_src(const struct ip_conntrack_tuple *tuple)
-{
- /* Original src, to ensure we map it consistently if poss. */
- return jhash_3words((__force u32)tuple->src.ip, tuple->src.u.all,
- tuple->dst.protonum, 0) % ip_nat_htable_size;
-}
-
-/* Noone using conntrack by the time this called. */
-static void ip_nat_cleanup_conntrack(struct ip_conntrack *conn)
-{
- if (!(conn->status & IPS_NAT_DONE_MASK))
- return;
-
- write_lock_bh(&ip_nat_lock);
- list_del(&conn->nat.info.bysource);
- write_unlock_bh(&ip_nat_lock);
-}
-
-/* Is this tuple already taken? (not by us) */
-int
-ip_nat_used_tuple(const struct ip_conntrack_tuple *tuple,
- const struct ip_conntrack *ignored_conntrack)
-{
- /* Conntrack tracking doesn't keep track of outgoing tuples; only
- incoming ones. NAT means they don't have a fixed mapping,
- so we invert the tuple and look for the incoming reply.
-
- We could keep a separate hash if this proves too slow. */
- struct ip_conntrack_tuple reply;
-
- invert_tuplepr(&reply, tuple);
- return ip_conntrack_tuple_taken(&reply, ignored_conntrack);
-}
-EXPORT_SYMBOL(ip_nat_used_tuple);
-
-/* If we source map this tuple so reply looks like reply_tuple, will
- * that meet the constraints of range. */
-static int
-in_range(const struct ip_conntrack_tuple *tuple,
- const struct ip_nat_range *range)
-{
- struct ip_nat_protocol *proto =
- __ip_nat_proto_find(tuple->dst.protonum);
-
- /* If we are supposed to map IPs, then we must be in the
- range specified, otherwise let this drag us onto a new src IP. */
- if (range->flags & IP_NAT_RANGE_MAP_IPS) {
- if (ntohl(tuple->src.ip) < ntohl(range->min_ip)
- || ntohl(tuple->src.ip) > ntohl(range->max_ip))
- return 0;
- }
-
- if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)
- || proto->in_range(tuple, IP_NAT_MANIP_SRC,
- &range->min, &range->max))
- return 1;
-
- return 0;
-}
-
-static inline int
-same_src(const struct ip_conntrack *ct,
- const struct ip_conntrack_tuple *tuple)
-{
- return (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum
- == tuple->dst.protonum
- && ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip
- == tuple->src.ip
- && ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all
- == tuple->src.u.all);
-}
-
-/* Only called for SRC manip */
-static int
-find_appropriate_src(const struct ip_conntrack_tuple *tuple,
- struct ip_conntrack_tuple *result,
- const struct ip_nat_range *range)
-{
- unsigned int h = hash_by_src(tuple);
- struct ip_conntrack *ct;
-
- read_lock_bh(&ip_nat_lock);
- list_for_each_entry(ct, &bysource[h], nat.info.bysource) {
- if (same_src(ct, tuple)) {
- /* Copy source part from reply tuple. */
- invert_tuplepr(result,
- &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
- result->dst = tuple->dst;
-
- if (in_range(result, range)) {
- read_unlock_bh(&ip_nat_lock);
- return 1;
- }
- }
- }
- read_unlock_bh(&ip_nat_lock);
- return 0;
-}
-
-/* For [FUTURE] fragmentation handling, we want the least-used
- src-ip/dst-ip/proto triple. Fairness doesn't come into it. Thus
- if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports
- 1-65535, we don't do pro-rata allocation based on ports; we choose
- the ip with the lowest src-ip/dst-ip/proto usage.
-*/
-static void
-find_best_ips_proto(struct ip_conntrack_tuple *tuple,
- const struct ip_nat_range *range,
- const struct ip_conntrack *conntrack,
- enum ip_nat_manip_type maniptype)
-{
- __be32 *var_ipp;
- /* Host order */
- u_int32_t minip, maxip, j;
-
- /* No IP mapping? Do nothing. */
- if (!(range->flags & IP_NAT_RANGE_MAP_IPS))
- return;
-
- if (maniptype == IP_NAT_MANIP_SRC)
- var_ipp = &tuple->src.ip;
- else
- var_ipp = &tuple->dst.ip;
-
- /* Fast path: only one choice. */
- if (range->min_ip == range->max_ip) {
- *var_ipp = range->min_ip;
- return;
- }
-
- /* Hashing source and destination IPs gives a fairly even
- * spread in practice (if there are a small number of IPs
- * involved, there usually aren't that many connections
- * anyway). The consistency means that servers see the same
- * client coming from the same IP (some Internet Banking sites
- * like this), even across reboots. */
- minip = ntohl(range->min_ip);
- maxip = ntohl(range->max_ip);
- j = jhash_2words((__force u32)tuple->src.ip, (__force u32)tuple->dst.ip, 0);
- *var_ipp = htonl(minip + j % (maxip - minip + 1));
-}
-
-/* Manipulate the tuple into the range given. For NF_IP_POST_ROUTING,
- * we change the source to map into the range. For NF_IP_PRE_ROUTING
- * and NF_IP_LOCAL_OUT, we change the destination to map into the
- * range. It might not be possible to get a unique tuple, but we try.
- * At worst (or if we race), we will end up with a final duplicate in
- * __ip_conntrack_confirm and drop the packet. */
-static void
-get_unique_tuple(struct ip_conntrack_tuple *tuple,
- const struct ip_conntrack_tuple *orig_tuple,
- const struct ip_nat_range *range,
- struct ip_conntrack *conntrack,
- enum ip_nat_manip_type maniptype)
-{
- struct ip_nat_protocol *proto;
-
- /* 1) If this srcip/proto/src-proto-part is currently mapped,
- and that same mapping gives a unique tuple within the given
- range, use that.
-
- This is only required for source (ie. NAT/masq) mappings.
- So far, we don't do local source mappings, so multiple
- manips not an issue. */
- if (maniptype == IP_NAT_MANIP_SRC) {
- if (find_appropriate_src(orig_tuple, tuple, range)) {
- DEBUGP("get_unique_tuple: Found current src map\n");
- if (!ip_nat_used_tuple(tuple, conntrack))
- return;
- }
- }
-
- /* 2) Select the least-used IP/proto combination in the given
- range. */
- *tuple = *orig_tuple;
- find_best_ips_proto(tuple, range, conntrack, maniptype);
-
- /* 3) The per-protocol part of the manip is made to map into
- the range to make a unique tuple. */
-
- proto = ip_nat_proto_find_get(orig_tuple->dst.protonum);
-
- /* Only bother mapping if it's not already in range and unique */
- if ((!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)
- || proto->in_range(tuple, maniptype, &range->min, &range->max))
- && !ip_nat_used_tuple(tuple, conntrack)) {
- ip_nat_proto_put(proto);
- return;
- }
-
- /* Last change: get protocol to try to obtain unique tuple. */
- proto->unique_tuple(tuple, range, maniptype, conntrack);
-
- ip_nat_proto_put(proto);
-}
-
-unsigned int
-ip_nat_setup_info(struct ip_conntrack *conntrack,
- const struct ip_nat_range *range,
- unsigned int hooknum)
-{
- struct ip_conntrack_tuple curr_tuple, new_tuple;
- struct ip_nat_info *info = &conntrack->nat.info;
- int have_to_hash = !(conntrack->status & IPS_NAT_DONE_MASK);
- enum ip_nat_manip_type maniptype = HOOK2MANIP(hooknum);
-
- IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING
- || hooknum == NF_IP_POST_ROUTING
- || hooknum == NF_IP_LOCAL_IN
- || hooknum == NF_IP_LOCAL_OUT);
- BUG_ON(ip_nat_initialized(conntrack, maniptype));
-
- /* What we've got will look like inverse of reply. Normally
- this is what is in the conntrack, except for prior
- manipulations (future optimization: if num_manips == 0,
- orig_tp =
- conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple) */
- invert_tuplepr(&curr_tuple,
- &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple);
-
- get_unique_tuple(&new_tuple, &curr_tuple, range, conntrack, maniptype);
-
- if (!ip_ct_tuple_equal(&new_tuple, &curr_tuple)) {
- struct ip_conntrack_tuple reply;
-
- /* Alter conntrack table so will recognize replies. */
- invert_tuplepr(&reply, &new_tuple);
- ip_conntrack_alter_reply(conntrack, &reply);
-
- /* Non-atomic: we own this at the moment. */
- if (maniptype == IP_NAT_MANIP_SRC)
- conntrack->status |= IPS_SRC_NAT;
- else
- conntrack->status |= IPS_DST_NAT;
- }
-
- /* Place in source hash if this is the first time. */
- if (have_to_hash) {
- unsigned int srchash
- = hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
- .tuple);
- write_lock_bh(&ip_nat_lock);
- list_add(&info->bysource, &bysource[srchash]);
- write_unlock_bh(&ip_nat_lock);
- }
-
- /* It's done. */
- if (maniptype == IP_NAT_MANIP_DST)
- set_bit(IPS_DST_NAT_DONE_BIT, &conntrack->status);
- else
- set_bit(IPS_SRC_NAT_DONE_BIT, &conntrack->status);
-
- return NF_ACCEPT;
-}
-EXPORT_SYMBOL(ip_nat_setup_info);
-
-/* Returns true if succeeded. */
-static int
-manip_pkt(u_int16_t proto,
- struct sk_buff **pskb,
- unsigned int iphdroff,
- const struct ip_conntrack_tuple *target,
- enum ip_nat_manip_type maniptype)
-{
- struct iphdr *iph;
- struct ip_nat_protocol *p;
-
- if (!skb_make_writable(pskb, iphdroff + sizeof(*iph)))
- return 0;
-
- iph = (void *)(*pskb)->data + iphdroff;
-
- /* Manipulate protcol part. */
- p = ip_nat_proto_find_get(proto);
- if (!p->manip_pkt(pskb, iphdroff, target, maniptype)) {
- ip_nat_proto_put(p);
- return 0;
- }
- ip_nat_proto_put(p);
-
- iph = (void *)(*pskb)->data + iphdroff;
-
- if (maniptype == IP_NAT_MANIP_SRC) {
- nf_csum_replace4(&iph->check, iph->saddr, target->src.ip);
- iph->saddr = target->src.ip;
- } else {
- nf_csum_replace4(&iph->check, iph->daddr, target->dst.ip);
- iph->daddr = target->dst.ip;
- }
- return 1;
-}
-
-/* Do packet manipulations according to ip_nat_setup_info. */
-unsigned int ip_nat_packet(struct ip_conntrack *ct,
- enum ip_conntrack_info ctinfo,
- unsigned int hooknum,
- struct sk_buff **pskb)
-{
- enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
- unsigned long statusbit;
- enum ip_nat_manip_type mtype = HOOK2MANIP(hooknum);
-
- if (mtype == IP_NAT_MANIP_SRC)
- statusbit = IPS_SRC_NAT;
- else
- statusbit = IPS_DST_NAT;
-
- /* Invert if this is reply dir. */
- if (dir == IP_CT_DIR_REPLY)
- statusbit ^= IPS_NAT_MASK;
-
- /* Non-atomic: these bits don't change. */
- if (ct->status & statusbit) {
- struct ip_conntrack_tuple target;
-
- /* We are aiming to look like inverse of other direction. */
- invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
-
- if (!manip_pkt(target.dst.protonum, pskb, 0, &target, mtype))
- return NF_DROP;
- }
- return NF_ACCEPT;
-}
-EXPORT_SYMBOL_GPL(ip_nat_packet);
-
-/* Dir is direction ICMP is coming from (opposite to packet it contains) */
-int ip_nat_icmp_reply_translation(struct ip_conntrack *ct,
- enum ip_conntrack_info ctinfo,
- unsigned int hooknum,
- struct sk_buff **pskb)
-{
- struct {
- struct icmphdr icmp;
- struct iphdr ip;
- } *inside;
- struct ip_conntrack_tuple inner, target;
- int hdrlen = (*pskb)->nh.iph->ihl * 4;
- enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
- unsigned long statusbit;
- enum ip_nat_manip_type manip = HOOK2MANIP(hooknum);
-
- if (!skb_make_writable(pskb, hdrlen + sizeof(*inside)))
- return 0;
-
- inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;
-
- /* We're actually going to mangle it beyond trivial checksum
- adjustment, so make sure the current checksum is correct. */
- if (nf_ip_checksum(*pskb, hooknum, hdrlen, 0))
- return 0;
-
- /* Must be RELATED */
- IP_NF_ASSERT((*pskb)->nfctinfo == IP_CT_RELATED ||
- (*pskb)->nfctinfo == IP_CT_RELATED+IP_CT_IS_REPLY);
-
- /* Redirects on non-null nats must be dropped, else they'll
- start talking to each other without our translation, and be
- confused... --RR */
- if (inside->icmp.type == ICMP_REDIRECT) {
- /* If NAT isn't finished, assume it and drop. */
- if ((ct->status & IPS_NAT_DONE_MASK) != IPS_NAT_DONE_MASK)
- return 0;
-
- if (ct->status & IPS_NAT_MASK)
- return 0;
- }
-
- DEBUGP("icmp_reply_translation: translating error %p manp %u dir %s\n",
- *pskb, manip, dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY");
-
- if (!ip_ct_get_tuple(&inside->ip, *pskb, (*pskb)->nh.iph->ihl*4 +
- sizeof(struct icmphdr) + inside->ip.ihl*4,
- &inner,
- __ip_conntrack_proto_find(inside->ip.protocol)))
- return 0;
-
- /* Change inner back to look like incoming packet. We do the
- opposite manip on this hook to normal, because it might not
- pass all hooks (locally-generated ICMP). Consider incoming
- packet: PREROUTING (DST manip), routing produces ICMP, goes
- through POSTROUTING (which must correct the DST manip). */
- if (!manip_pkt(inside->ip.protocol, pskb,
- (*pskb)->nh.iph->ihl*4
- + sizeof(inside->icmp),
- &ct->tuplehash[!dir].tuple,
- !manip))
- return 0;
-
- if ((*pskb)->ip_summed != CHECKSUM_PARTIAL) {
- /* Reloading "inside" here since manip_pkt inner. */
- inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;
- inside->icmp.checksum = 0;
- inside->icmp.checksum = csum_fold(skb_checksum(*pskb, hdrlen,
- (*pskb)->len - hdrlen,
- 0));
- }
-
- /* Change outer to look the reply to an incoming packet
- * (proto 0 means don't invert per-proto part). */
- if (manip == IP_NAT_MANIP_SRC)
- statusbit = IPS_SRC_NAT;
- else
- statusbit = IPS_DST_NAT;
-
- /* Invert if this is reply dir. */
- if (dir == IP_CT_DIR_REPLY)
- statusbit ^= IPS_NAT_MASK;
-
- if (ct->status & statusbit) {
- invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
- if (!manip_pkt(0, pskb, 0, &target, manip))
- return 0;
- }
-
- return 1;
-}
-EXPORT_SYMBOL_GPL(ip_nat_icmp_reply_translation);
-
-/* Protocol registration. */
-int ip_nat_protocol_register(struct ip_nat_protocol *proto)
-{
- int ret = 0;
-
- write_lock_bh(&ip_nat_lock);
- if (ip_nat_protos[proto->protonum] != &ip_nat_unknown_protocol) {
- ret = -EBUSY;
- goto out;
- }
- ip_nat_protos[proto->protonum] = proto;
- out:
- write_unlock_bh(&ip_nat_lock);
- return ret;
-}
-EXPORT_SYMBOL(ip_nat_protocol_register);
-
-/* Noone stores the protocol anywhere; simply delete it. */
-void ip_nat_protocol_unregister(struct ip_nat_protocol *proto)
-{
- write_lock_bh(&ip_nat_lock);
- ip_nat_protos[proto->protonum] = &ip_nat_unknown_protocol;
- write_unlock_bh(&ip_nat_lock);
-
- /* Someone could be still looking at the proto in a bh. */
- synchronize_net();
-}
-EXPORT_SYMBOL(ip_nat_protocol_unregister);
-
-#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
- defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
-int
-ip_nat_port_range_to_nfattr(struct sk_buff *skb,
- const struct ip_nat_range *range)
-{
- NFA_PUT(skb, CTA_PROTONAT_PORT_MIN, sizeof(__be16),
- &range->min.tcp.port);
- NFA_PUT(skb, CTA_PROTONAT_PORT_MAX, sizeof(__be16),
- &range->max.tcp.port);
-
- return 0;
-
-nfattr_failure:
- return -1;
-}
-
-int
-ip_nat_port_nfattr_to_range(struct nfattr *tb[], struct ip_nat_range *range)
-{
- int ret = 0;
-
- /* we have to return whether we actually parsed something or not */
-
- if (tb[CTA_PROTONAT_PORT_MIN-1]) {
- ret = 1;
- range->min.tcp.port =
- *(__be16 *)NFA_DATA(tb[CTA_PROTONAT_PORT_MIN-1]);
- }
-
- if (!tb[CTA_PROTONAT_PORT_MAX-1]) {
- if (ret)
- range->max.tcp.port = range->min.tcp.port;
- } else {
- ret = 1;
- range->max.tcp.port =
- *(__be16 *)NFA_DATA(tb[CTA_PROTONAT_PORT_MAX-1]);
- }
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(ip_nat_port_nfattr_to_range);
-EXPORT_SYMBOL_GPL(ip_nat_port_range_to_nfattr);
-#endif
-
-static int __init ip_nat_init(void)
-{
- size_t i;
-
- /* Leave them the same for the moment. */
- ip_nat_htable_size = ip_conntrack_htable_size;
-
- /* One vmalloc for both hash tables */
- bysource = vmalloc(sizeof(struct list_head) * ip_nat_htable_size);
- if (!bysource)
- return -ENOMEM;
-
- /* Sew in builtin protocols. */
- write_lock_bh(&ip_nat_lock);
- for (i = 0; i < MAX_IP_NAT_PROTO; i++)
- ip_nat_protos[i] = &ip_nat_unknown_protocol;
- ip_nat_protos[IPPROTO_TCP] = &ip_nat_protocol_tcp;
- ip_nat_protos[IPPROTO_UDP] = &ip_nat_protocol_udp;
- ip_nat_protos[IPPROTO_ICMP] = &ip_nat_protocol_icmp;
- write_unlock_bh(&ip_nat_lock);
-
- for (i = 0; i < ip_nat_htable_size; i++) {
- INIT_LIST_HEAD(&bysource[i]);
- }
-
- /* FIXME: Man, this is a hack. <SIGH> */
- IP_NF_ASSERT(ip_conntrack_destroyed == NULL);
- ip_conntrack_destroyed = &ip_nat_cleanup_conntrack;
-
- /* Initialize fake conntrack so that NAT will skip it */
- ip_conntrack_untracked.status |= IPS_NAT_DONE_MASK;
- return 0;
-}
-
-/* Clear NAT section of all conntracks, in case we're loaded again. */
-static int clean_nat(struct ip_conntrack *i, void *data)
-{
- memset(&i->nat, 0, sizeof(i->nat));
- i->status &= ~(IPS_NAT_MASK | IPS_NAT_DONE_MASK | IPS_SEQ_ADJUST);
- return 0;
-}
-
-static void __exit ip_nat_cleanup(void)
-{
- ip_ct_iterate_cleanup(&clean_nat, NULL);
- ip_conntrack_destroyed = NULL;
- vfree(bysource);
-}
-
-MODULE_LICENSE("GPL");
-
-module_init(ip_nat_init);
-module_exit(ip_nat_cleanup);
diff --git a/net/ipv4/netfilter/ip_nat_ftp.c b/net/ipv4/netfilter/ip_nat_ftp.c
deleted file mode 100644
index 913960e1380f..000000000000
--- a/net/ipv4/netfilter/ip_nat_ftp.c
+++ /dev/null
@@ -1,180 +0,0 @@
-/* FTP extension for TCP NAT alteration. */
-
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/module.h>
-#include <linux/netfilter_ipv4.h>
-#include <linux/ip.h>
-#include <linux/tcp.h>
-#include <linux/moduleparam.h>
-#include <net/tcp.h>
-#include <linux/netfilter_ipv4/ip_nat.h>
-#include <linux/netfilter_ipv4/ip_nat_helper.h>
-#include <linux/netfilter_ipv4/ip_nat_rule.h>
-#include <linux/netfilter_ipv4/ip_conntrack_ftp.h>
-#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
-MODULE_DESCRIPTION("ftp NAT helper");
-
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
-/* FIXME: Time out? --RR */
-
-static int
-mangle_rfc959_packet(struct sk_buff **pskb,
- __be32 newip,
- u_int16_t port,
- unsigned int matchoff,
- unsigned int matchlen,
- struct ip_conntrack *ct,
- enum ip_conntrack_info ctinfo,
- u32 *seq)
-{
- char buffer[sizeof("nnn,nnn,nnn,nnn,nnn,nnn")];
-
- sprintf(buffer, "%u,%u,%u,%u,%u,%u",
- NIPQUAD(newip), port>>8, port&0xFF);
-
- DEBUGP("calling ip_nat_mangle_tcp_packet\n");
-
- *seq += strlen(buffer) - matchlen;
- return ip_nat_mangle_tcp_packet(pskb, ct, ctinfo, matchoff,
- matchlen, buffer, strlen(buffer));
-}
-
-/* |1|132.235.1.2|6275| */
-static int
-mangle_eprt_packet(struct sk_buff **pskb,
- __be32 newip,
- u_int16_t port,
- unsigned int matchoff,
- unsigned int matchlen,
- struct ip_conntrack *ct,
- enum ip_conntrack_info ctinfo,
- u32 *seq)
-{
- char buffer[sizeof("|1|255.255.255.255|65535|")];
-
- sprintf(buffer, "|1|%u.%u.%u.%u|%u|", NIPQUAD(newip), port);
-
- DEBUGP("calling ip_nat_mangle_tcp_packet\n");
-
- *seq += strlen(buffer) - matchlen;
- return ip_nat_mangle_tcp_packet(pskb, ct, ctinfo, matchoff,
- matchlen, buffer, strlen(buffer));
-}
-
-/* |1|132.235.1.2|6275| */
-static int
-mangle_epsv_packet(struct sk_buff **pskb,
- __be32 newip,
- u_int16_t port,
- unsigned int matchoff,
- unsigned int matchlen,
- struct ip_conntrack *ct,
- enum ip_conntrack_info ctinfo,
- u32 *seq)
-{
- char buffer[sizeof("|||65535|")];
-
- sprintf(buffer, "|||%u|", port);
-
- DEBUGP("calling ip_nat_mangle_tcp_packet\n");
-
- *seq += strlen(buffer) - matchlen;
- return ip_nat_mangle_tcp_packet(pskb, ct, ctinfo, matchoff,
- matchlen, buffer, strlen(buffer));
-}
-
-static int (*mangle[])(struct sk_buff **, __be32, u_int16_t,
- unsigned int,
- unsigned int,
- struct ip_conntrack *,
- enum ip_conntrack_info,
- u32 *seq)
-= { [IP_CT_FTP_PORT] = mangle_rfc959_packet,
- [IP_CT_FTP_PASV] = mangle_rfc959_packet,
- [IP_CT_FTP_EPRT] = mangle_eprt_packet,
- [IP_CT_FTP_EPSV] = mangle_epsv_packet
-};
-
-/* So, this packet has hit the connection tracking matching code.
- Mangle it, and change the expectation to match the new version. */
-static unsigned int ip_nat_ftp(struct sk_buff **pskb,
- enum ip_conntrack_info ctinfo,
- enum ip_ct_ftp_type type,
- unsigned int matchoff,
- unsigned int matchlen,
- struct ip_conntrack_expect *exp,
- u32 *seq)
-{
- __be32 newip;
- u_int16_t port;
- int dir = CTINFO2DIR(ctinfo);
- struct ip_conntrack *ct = exp->master;
-
- DEBUGP("FTP_NAT: type %i, off %u len %u\n", type, matchoff, matchlen);
-
- /* Connection will come from wherever this packet goes, hence !dir */
- newip = ct->tuplehash[!dir].tuple.dst.ip;
- exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
- exp->dir = !dir;
-
- /* When you see the packet, we need to NAT it the same as the
- * this one. */
- exp->expectfn = ip_nat_follow_master;
-
- /* Try to get same port: if not, try to change it. */
- for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
- exp->tuple.dst.u.tcp.port = htons(port);
- if (ip_conntrack_expect_related(exp) == 0)
- break;
- }
-
- if (port == 0)
- return NF_DROP;
-
- if (!mangle[type](pskb, newip, port, matchoff, matchlen, ct, ctinfo,
- seq)) {
- ip_conntrack_unexpect_related(exp);
- return NF_DROP;
- }
- return NF_ACCEPT;
-}
-
-static void __exit ip_nat_ftp_fini(void)
-{
- rcu_assign_pointer(ip_nat_ftp_hook, NULL);
- synchronize_rcu();
-}
-
-static int __init ip_nat_ftp_init(void)
-{
- BUG_ON(rcu_dereference(ip_nat_ftp_hook));
- rcu_assign_pointer(ip_nat_ftp_hook, ip_nat_ftp);
- return 0;
-}
-
-/* Prior to 2.6.11, we had a ports param. No longer, but don't break users. */
-static int warn_set(const char *val, struct kernel_param *kp)
-{
- printk(KERN_INFO KBUILD_MODNAME
- ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n");
- return 0;
-}
-module_param_call(ports, warn_set, NULL, NULL, 0);
-
-module_init(ip_nat_ftp_init);
-module_exit(ip_nat_ftp_fini);
diff --git a/net/ipv4/netfilter/ip_nat_helper.c b/net/ipv4/netfilter/ip_nat_helper.c
deleted file mode 100644
index ee80feb4b2a9..000000000000
--- a/net/ipv4/netfilter/ip_nat_helper.c
+++ /dev/null
@@ -1,436 +0,0 @@
-/* ip_nat_helper.c - generic support functions for NAT helpers
- *
- * (C) 2000-2002 Harald Welte <laforge@netfilter.org>
- * (C) 2003-2004 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * 14 Jan 2002 Harald Welte <laforge@gnumonks.org>:
- * - add support for SACK adjustment
- * 14 Mar 2002 Harald Welte <laforge@gnumonks.org>:
- * - merge SACK support into newnat API
- * 16 Aug 2002 Brian J. Murrell <netfilter@interlinx.bc.ca>:
- * - make ip_nat_resize_packet more generic (TCP and UDP)
- * - add ip_nat_mangle_udp_packet
- */
-#include <linux/module.h>
-#include <linux/kmod.h>
-#include <linux/types.h>
-#include <linux/timer.h>
-#include <linux/skbuff.h>
-#include <linux/netfilter_ipv4.h>
-#include <net/checksum.h>
-#include <net/icmp.h>
-#include <net/ip.h>
-#include <net/tcp.h>
-#include <net/udp.h>
-
-#include <linux/netfilter_ipv4/ip_conntrack.h>
-#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
-#include <linux/netfilter_ipv4/ip_nat.h>
-#include <linux/netfilter_ipv4/ip_nat_protocol.h>
-#include <linux/netfilter_ipv4/ip_nat_core.h>
-#include <linux/netfilter_ipv4/ip_nat_helper.h>
-
-#if 0
-#define DEBUGP printk
-#define DUMP_OFFSET(x) printk("offset_before=%d, offset_after=%d, correction_pos=%u\n", x->offset_before, x->offset_after, x->correction_pos);
-#else
-#define DEBUGP(format, args...)
-#define DUMP_OFFSET(x)
-#endif
-
-static DEFINE_SPINLOCK(ip_nat_seqofs_lock);
-
-/* Setup TCP sequence correction given this change at this sequence */
-static inline void
-adjust_tcp_sequence(u32 seq,
- int sizediff,
- struct ip_conntrack *ct,
- enum ip_conntrack_info ctinfo)
-{
- int dir;
- struct ip_nat_seq *this_way, *other_way;
-
- DEBUGP("ip_nat_resize_packet: old_size = %u, new_size = %u\n",
- (*skb)->len, new_size);
-
- dir = CTINFO2DIR(ctinfo);
-
- this_way = &ct->nat.info.seq[dir];
- other_way = &ct->nat.info.seq[!dir];
-
- DEBUGP("ip_nat_resize_packet: Seq_offset before: ");
- DUMP_OFFSET(this_way);
-
- spin_lock_bh(&ip_nat_seqofs_lock);
-
- /* SYN adjust. If it's uninitialized, or this is after last
- * correction, record it: we don't handle more than one
- * adjustment in the window, but do deal with common case of a
- * retransmit */
- if (this_way->offset_before == this_way->offset_after
- || before(this_way->correction_pos, seq)) {
- this_way->correction_pos = seq;
- this_way->offset_before = this_way->offset_after;
- this_way->offset_after += sizediff;
- }
- spin_unlock_bh(&ip_nat_seqofs_lock);
-
- DEBUGP("ip_nat_resize_packet: Seq_offset after: ");
- DUMP_OFFSET(this_way);
-}
-
-/* Frobs data inside this packet, which is linear. */
-static void mangle_contents(struct sk_buff *skb,
- unsigned int dataoff,
- unsigned int match_offset,
- unsigned int match_len,
- const char *rep_buffer,
- unsigned int rep_len)
-{
- unsigned char *data;
-
- BUG_ON(skb_is_nonlinear(skb));
- data = (unsigned char *)skb->nh.iph + dataoff;
-
- /* move post-replacement */
- memmove(data + match_offset + rep_len,
- data + match_offset + match_len,
- skb->tail - (data + match_offset + match_len));
-
- /* insert data from buffer */
- memcpy(data + match_offset, rep_buffer, rep_len);
-
- /* update skb info */
- if (rep_len > match_len) {
- DEBUGP("ip_nat_mangle_packet: Extending packet by "
- "%u from %u bytes\n", rep_len - match_len,
- skb->len);
- skb_put(skb, rep_len - match_len);
- } else {
- DEBUGP("ip_nat_mangle_packet: Shrinking packet from "
- "%u from %u bytes\n", match_len - rep_len,
- skb->len);
- __skb_trim(skb, skb->len + rep_len - match_len);
- }
-
- /* fix IP hdr checksum information */
- skb->nh.iph->tot_len = htons(skb->len);
- ip_send_check(skb->nh.iph);
-}
-
-/* Unusual, but possible case. */
-static int enlarge_skb(struct sk_buff **pskb, unsigned int extra)
-{
- struct sk_buff *nskb;
-
- if ((*pskb)->len + extra > 65535)
- return 0;
-
- nskb = skb_copy_expand(*pskb, skb_headroom(*pskb), extra, GFP_ATOMIC);
- if (!nskb)
- return 0;
-
- /* Transfer socket to new skb. */
- if ((*pskb)->sk)
- skb_set_owner_w(nskb, (*pskb)->sk);
- kfree_skb(*pskb);
- *pskb = nskb;
- return 1;
-}
-
-/* Generic function for mangling variable-length address changes inside
- * NATed TCP connections (like the PORT XXX,XXX,XXX,XXX,XXX,XXX
- * command in FTP).
- *
- * Takes care about all the nasty sequence number changes, checksumming,
- * skb enlargement, ...
- *
- * */
-int
-ip_nat_mangle_tcp_packet(struct sk_buff **pskb,
- struct ip_conntrack *ct,
- enum ip_conntrack_info ctinfo,
- unsigned int match_offset,
- unsigned int match_len,
- const char *rep_buffer,
- unsigned int rep_len)
-{
- struct iphdr *iph;
- struct tcphdr *tcph;
- int oldlen, datalen;
-
- if (!skb_make_writable(pskb, (*pskb)->len))
- return 0;
-
- if (rep_len > match_len
- && rep_len - match_len > skb_tailroom(*pskb)
- && !enlarge_skb(pskb, rep_len - match_len))
- return 0;
-
- SKB_LINEAR_ASSERT(*pskb);
-
- iph = (*pskb)->nh.iph;
- tcph = (void *)iph + iph->ihl*4;
-
- oldlen = (*pskb)->len - iph->ihl*4;
- mangle_contents(*pskb, iph->ihl*4 + tcph->doff*4,
- match_offset, match_len, rep_buffer, rep_len);
-
- datalen = (*pskb)->len - iph->ihl*4;
- if ((*pskb)->ip_summed != CHECKSUM_PARTIAL) {
- tcph->check = 0;
- tcph->check = tcp_v4_check(tcph, datalen,
- iph->saddr, iph->daddr,
- csum_partial((char *)tcph,
- datalen, 0));
- } else
- nf_proto_csum_replace2(&tcph->check, *pskb,
- htons(oldlen), htons(datalen), 1);
-
- if (rep_len != match_len) {
- set_bit(IPS_SEQ_ADJUST_BIT, &ct->status);
- adjust_tcp_sequence(ntohl(tcph->seq),
- (int)rep_len - (int)match_len,
- ct, ctinfo);
- /* Tell TCP window tracking about seq change */
- ip_conntrack_tcp_update(*pskb, ct, CTINFO2DIR(ctinfo));
- }
- return 1;
-}
-EXPORT_SYMBOL(ip_nat_mangle_tcp_packet);
-
-/* Generic function for mangling variable-length address changes inside
- * NATed UDP connections (like the CONNECT DATA XXXXX MESG XXXXX INDEX XXXXX
- * command in the Amanda protocol)
- *
- * Takes care about all the nasty sequence number changes, checksumming,
- * skb enlargement, ...
- *
- * XXX - This function could be merged with ip_nat_mangle_tcp_packet which
- * should be fairly easy to do.
- */
-int
-ip_nat_mangle_udp_packet(struct sk_buff **pskb,
- struct ip_conntrack *ct,
- enum ip_conntrack_info ctinfo,
- unsigned int match_offset,
- unsigned int match_len,
- const char *rep_buffer,
- unsigned int rep_len)
-{
- struct iphdr *iph;
- struct udphdr *udph;
- int datalen, oldlen;
-
- /* UDP helpers might accidentally mangle the wrong packet */
- iph = (*pskb)->nh.iph;
- if ((*pskb)->len < iph->ihl*4 + sizeof(*udph) +
- match_offset + match_len)
- return 0;
-
- if (!skb_make_writable(pskb, (*pskb)->len))
- return 0;
-
- if (rep_len > match_len
- && rep_len - match_len > skb_tailroom(*pskb)
- && !enlarge_skb(pskb, rep_len - match_len))
- return 0;
-
- iph = (*pskb)->nh.iph;
- udph = (void *)iph + iph->ihl*4;
-
- oldlen = (*pskb)->len - iph->ihl*4;
- mangle_contents(*pskb, iph->ihl*4 + sizeof(*udph),
- match_offset, match_len, rep_buffer, rep_len);
-
- /* update the length of the UDP packet */
- datalen = (*pskb)->len - iph->ihl*4;
- udph->len = htons(datalen);
-
- /* fix udp checksum if udp checksum was previously calculated */
- if (!udph->check && (*pskb)->ip_summed != CHECKSUM_PARTIAL)
- return 1;
-
- if ((*pskb)->ip_summed != CHECKSUM_PARTIAL) {
- udph->check = 0;
- udph->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
- datalen, IPPROTO_UDP,
- csum_partial((char *)udph,
- datalen, 0));
- if (!udph->check)
- udph->check = CSUM_MANGLED_0;
- } else
- nf_proto_csum_replace2(&udph->check, *pskb,
- htons(oldlen), htons(datalen), 1);
- return 1;
-}
-EXPORT_SYMBOL(ip_nat_mangle_udp_packet);
-
-/* Adjust one found SACK option including checksum correction */
-static void
-sack_adjust(struct sk_buff *skb,
- struct tcphdr *tcph,
- unsigned int sackoff,
- unsigned int sackend,
- struct ip_nat_seq *natseq)
-{
- while (sackoff < sackend) {
- struct tcp_sack_block_wire *sack;
- __be32 new_start_seq, new_end_seq;
-
- sack = (void *)skb->data + sackoff;
- if (after(ntohl(sack->start_seq) - natseq->offset_before,
- natseq->correction_pos))
- new_start_seq = htonl(ntohl(sack->start_seq)
- - natseq->offset_after);
- else
- new_start_seq = htonl(ntohl(sack->start_seq)
- - natseq->offset_before);
-
- if (after(ntohl(sack->end_seq) - natseq->offset_before,
- natseq->correction_pos))
- new_end_seq = htonl(ntohl(sack->end_seq)
- - natseq->offset_after);
- else
- new_end_seq = htonl(ntohl(sack->end_seq)
- - natseq->offset_before);
-
- DEBUGP("sack_adjust: start_seq: %d->%d, end_seq: %d->%d\n",
- ntohl(sack->start_seq), new_start_seq,
- ntohl(sack->end_seq), new_end_seq);
-
- nf_proto_csum_replace4(&tcph->check, skb,
- sack->start_seq, new_start_seq, 0);
- nf_proto_csum_replace4(&tcph->check, skb,
- sack->end_seq, new_end_seq, 0);
- sack->start_seq = new_start_seq;
- sack->end_seq = new_end_seq;
- sackoff += sizeof(*sack);
- }
-}
-
-/* TCP SACK sequence number adjustment */
-static inline unsigned int
-ip_nat_sack_adjust(struct sk_buff **pskb,
- struct tcphdr *tcph,
- struct ip_conntrack *ct,
- enum ip_conntrack_info ctinfo)
-{
- unsigned int dir, optoff, optend;
-
- optoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct tcphdr);
- optend = (*pskb)->nh.iph->ihl*4 + tcph->doff*4;
-
- if (!skb_make_writable(pskb, optend))
- return 0;
-
- dir = CTINFO2DIR(ctinfo);
-
- while (optoff < optend) {
- /* Usually: option, length. */
- unsigned char *op = (*pskb)->data + optoff;
-
- switch (op[0]) {
- case TCPOPT_EOL:
- return 1;
- case TCPOPT_NOP:
- optoff++;
- continue;
- default:
- /* no partial options */
- if (optoff + 1 == optend
- || optoff + op[1] > optend
- || op[1] < 2)
- return 0;
- if (op[0] == TCPOPT_SACK
- && op[1] >= 2+TCPOLEN_SACK_PERBLOCK
- && ((op[1] - 2) % TCPOLEN_SACK_PERBLOCK) == 0)
- sack_adjust(*pskb, tcph, optoff+2,
- optoff+op[1],
- &ct->nat.info.seq[!dir]);
- optoff += op[1];
- }
- }
- return 1;
-}
-
-/* TCP sequence number adjustment. Returns 1 on success, 0 on failure */
-int
-ip_nat_seq_adjust(struct sk_buff **pskb,
- struct ip_conntrack *ct,
- enum ip_conntrack_info ctinfo)
-{
- struct tcphdr *tcph;
- int dir;
- __be32 newseq, newack;
- struct ip_nat_seq *this_way, *other_way;
-
- dir = CTINFO2DIR(ctinfo);
-
- this_way = &ct->nat.info.seq[dir];
- other_way = &ct->nat.info.seq[!dir];
-
- if (!skb_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph)))
- return 0;
-
- tcph = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;
- if (after(ntohl(tcph->seq), this_way->correction_pos))
- newseq = htonl(ntohl(tcph->seq) + this_way->offset_after);
- else
- newseq = htonl(ntohl(tcph->seq) + this_way->offset_before);
-
- if (after(ntohl(tcph->ack_seq) - other_way->offset_before,
- other_way->correction_pos))
- newack = htonl(ntohl(tcph->ack_seq) - other_way->offset_after);
- else
- newack = htonl(ntohl(tcph->ack_seq) - other_way->offset_before);
-
- nf_proto_csum_replace4(&tcph->check, *pskb, tcph->seq, newseq, 0);
- nf_proto_csum_replace4(&tcph->check, *pskb, tcph->ack_seq, newack, 0);
-
- DEBUGP("Adjusting sequence number from %u->%u, ack from %u->%u\n",
- ntohl(tcph->seq), ntohl(newseq), ntohl(tcph->ack_seq),
- ntohl(newack));
-
- tcph->seq = newseq;
- tcph->ack_seq = newack;
-
- if (!ip_nat_sack_adjust(pskb, tcph, ct, ctinfo))
- return 0;
-
- ip_conntrack_tcp_update(*pskb, ct, dir);
-
- return 1;
-}
-EXPORT_SYMBOL(ip_nat_seq_adjust);
-
-/* Setup NAT on this expected conntrack so it follows master. */
-/* If we fail to get a free NAT slot, we'll get dropped on confirm */
-void ip_nat_follow_master(struct ip_conntrack *ct,
- struct ip_conntrack_expect *exp)
-{
- struct ip_nat_range range;
-
- /* This must be a fresh one. */
- BUG_ON(ct->status & IPS_NAT_DONE_MASK);
-
- /* Change src to where master sends to */
- range.flags = IP_NAT_RANGE_MAP_IPS;
- range.min_ip = range.max_ip
- = ct->master->tuplehash[!exp->dir].tuple.dst.ip;
- /* hook doesn't matter, but it has to do source manip */
- ip_nat_setup_info(ct, &range, NF_IP_POST_ROUTING);
-
- /* For DST manip, map port here to where it's expected. */
- range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED);
- range.min = range.max = exp->saved_proto;
- range.min_ip = range.max_ip
- = ct->master->tuplehash[!exp->dir].tuple.src.ip;
- /* hook doesn't matter, but it has to do destination manip */
- ip_nat_setup_info(ct, &range, NF_IP_PRE_ROUTING);
-}
-EXPORT_SYMBOL(ip_nat_follow_master);
diff --git a/net/ipv4/netfilter/ip_nat_helper_h323.c b/net/ipv4/netfilter/ip_nat_helper_h323.c
deleted file mode 100644
index bdc99ef6159e..000000000000
--- a/net/ipv4/netfilter/ip_nat_helper_h323.c
+++ /dev/null
@@ -1,611 +0,0 @@
-/*
- * H.323 extension for NAT alteration.
- *
- * Copyright (c) 2006 Jing Min Zhao <zhaojingmin@users.sourceforge.net>
- *
- * This source code is licensed under General Public License version 2.
- *
- * Based on the 'brute force' H.323 NAT module by
- * Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
- */
-
-#include <linux/module.h>
-#include <linux/netfilter_ipv4.h>
-#include <linux/netfilter.h>
-#include <linux/ip.h>
-#include <linux/tcp.h>
-#include <linux/moduleparam.h>
-#include <net/tcp.h>
-#include <linux/netfilter_ipv4/ip_nat.h>
-#include <linux/netfilter_ipv4/ip_nat_helper.h>
-#include <linux/netfilter_ipv4/ip_nat_rule.h>
-#include <linux/netfilter_ipv4/ip_conntrack_tuple.h>
-#include <linux/netfilter_ipv4/ip_conntrack_h323.h>
-#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
-
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
-/****************************************************************************/
-static int set_addr(struct sk_buff **pskb,
- unsigned char **data, int dataoff,
- unsigned int addroff, __be32 ip, u_int16_t port)
-{
- enum ip_conntrack_info ctinfo;
- struct ip_conntrack *ct = ip_conntrack_get(*pskb, &ctinfo);
- struct {
- __be32 ip;
- __be16 port;
- } __attribute__ ((__packed__)) buf;
- struct tcphdr _tcph, *th;
-
- buf.ip = ip;
- buf.port = htons(port);
- addroff += dataoff;
-
- if ((*pskb)->nh.iph->protocol == IPPROTO_TCP) {
- if (!ip_nat_mangle_tcp_packet(pskb, ct, ctinfo,
- addroff, sizeof(buf),
- (char *) &buf, sizeof(buf))) {
- if (net_ratelimit())
- printk("ip_nat_h323: ip_nat_mangle_tcp_packet"
- " error\n");
- return -1;
- }
-
- /* Relocate data pointer */
- th = skb_header_pointer(*pskb, (*pskb)->nh.iph->ihl * 4,
- sizeof(_tcph), &_tcph);
- if (th == NULL)
- return -1;
- *data = (*pskb)->data + (*pskb)->nh.iph->ihl * 4 +
- th->doff * 4 + dataoff;
- } else {
- if (!ip_nat_mangle_udp_packet(pskb, ct, ctinfo,
- addroff, sizeof(buf),
- (char *) &buf, sizeof(buf))) {
- if (net_ratelimit())
- printk("ip_nat_h323: ip_nat_mangle_udp_packet"
- " error\n");
- return -1;
- }
- /* ip_nat_mangle_udp_packet uses skb_make_writable() to copy
- * or pull everything in a linear buffer, so we can safely
- * use the skb pointers now */
- *data = (*pskb)->data + (*pskb)->nh.iph->ihl * 4 +
- sizeof(struct udphdr);
- }
-
- return 0;
-}
-
-/****************************************************************************/
-static int set_h225_addr(struct sk_buff **pskb,
- unsigned char **data, int dataoff,
- TransportAddress * addr,
- __be32 ip, u_int16_t port)
-{
- return set_addr(pskb, data, dataoff, addr->ipAddress.ip, ip, port);
-}
-
-/****************************************************************************/
-static int set_h245_addr(struct sk_buff **pskb,
- unsigned char **data, int dataoff,
- H245_TransportAddress * addr,
- __be32 ip, u_int16_t port)
-{
- return set_addr(pskb, data, dataoff,
- addr->unicastAddress.iPAddress.network, ip, port);
-}
-
-/****************************************************************************/
-static int set_sig_addr(struct sk_buff **pskb, struct ip_conntrack *ct,
- enum ip_conntrack_info ctinfo,
- unsigned char **data,
- TransportAddress * addr, int count)
-{
- struct ip_ct_h323_master *info = &ct->help.ct_h323_info;
- int dir = CTINFO2DIR(ctinfo);
- int i;
- __be32 ip;
- u_int16_t port;
-
- for (i = 0; i < count; i++) {
- if (get_h225_addr(*data, &addr[i], &ip, &port)) {
- if (ip == ct->tuplehash[dir].tuple.src.ip &&
- port == info->sig_port[dir]) {
- /* GW->GK */
-
- /* Fix for Gnomemeeting */
- if (i > 0 &&
- get_h225_addr(*data, &addr[0],
- &ip, &port) &&
- (ntohl(ip) & 0xff000000) == 0x7f000000)
- i = 0;
-
- DEBUGP
- ("ip_nat_ras: set signal address "
- "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
- NIPQUAD(ip), port,
- NIPQUAD(ct->tuplehash[!dir].tuple.dst.
- ip), info->sig_port[!dir]);
- return set_h225_addr(pskb, data, 0, &addr[i],
- ct->tuplehash[!dir].
- tuple.dst.ip,
- info->sig_port[!dir]);
- } else if (ip == ct->tuplehash[dir].tuple.dst.ip &&
- port == info->sig_port[dir]) {
- /* GK->GW */
- DEBUGP
- ("ip_nat_ras: set signal address "
- "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
- NIPQUAD(ip), port,
- NIPQUAD(ct->tuplehash[!dir].tuple.src.
- ip), info->sig_port[!dir]);
- return set_h225_addr(pskb, data, 0, &addr[i],
- ct->tuplehash[!dir].
- tuple.src.ip,
- info->sig_port[!dir]);
- }
- }
- }
-
- return 0;
-}
-
-/****************************************************************************/
-static int set_ras_addr(struct sk_buff **pskb, struct ip_conntrack *ct,
- enum ip_conntrack_info ctinfo,
- unsigned char **data,
- TransportAddress * addr, int count)
-{
- int dir = CTINFO2DIR(ctinfo);
- int i;
- __be32 ip;
- u_int16_t port;
-
- for (i = 0; i < count; i++) {
- if (get_h225_addr(*data, &addr[i], &ip, &port) &&
- ip == ct->tuplehash[dir].tuple.src.ip &&
- port == ntohs(ct->tuplehash[dir].tuple.src.u.udp.port)) {
- DEBUGP("ip_nat_ras: set rasAddress "
- "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
- NIPQUAD(ip), port,
- NIPQUAD(ct->tuplehash[!dir].tuple.dst.ip),
- ntohs(ct->tuplehash[!dir].tuple.dst.u.udp.
- port));
- return set_h225_addr(pskb, data, 0, &addr[i],
- ct->tuplehash[!dir].tuple.dst.ip,
- ntohs(ct->tuplehash[!dir].tuple.
- dst.u.udp.port));
- }
- }
-
- return 0;
-}
-
-/****************************************************************************/
-static int nat_rtp_rtcp(struct sk_buff **pskb, struct ip_conntrack *ct,
- enum ip_conntrack_info ctinfo,
- unsigned char **data, int dataoff,
- H245_TransportAddress * addr,
- u_int16_t port, u_int16_t rtp_port,
- struct ip_conntrack_expect *rtp_exp,
- struct ip_conntrack_expect *rtcp_exp)
-{
- struct ip_ct_h323_master *info = &ct->help.ct_h323_info;
- int dir = CTINFO2DIR(ctinfo);
- int i;
- u_int16_t nated_port;
-
- /* Set expectations for NAT */
- rtp_exp->saved_proto.udp.port = rtp_exp->tuple.dst.u.udp.port;
- rtp_exp->expectfn = ip_nat_follow_master;
- rtp_exp->dir = !dir;
- rtcp_exp->saved_proto.udp.port = rtcp_exp->tuple.dst.u.udp.port;
- rtcp_exp->expectfn = ip_nat_follow_master;
- rtcp_exp->dir = !dir;
-
- /* Lookup existing expects */
- for (i = 0; i < H323_RTP_CHANNEL_MAX; i++) {
- if (info->rtp_port[i][dir] == rtp_port) {
- /* Expected */
-
- /* Use allocated ports first. This will refresh
- * the expects */
- rtp_exp->tuple.dst.u.udp.port =
- htons(info->rtp_port[i][dir]);
- rtcp_exp->tuple.dst.u.udp.port =
- htons(info->rtp_port[i][dir] + 1);
- break;
- } else if (info->rtp_port[i][dir] == 0) {
- /* Not expected */
- break;
- }
- }
-
- /* Run out of expectations */
- if (i >= H323_RTP_CHANNEL_MAX) {
- if (net_ratelimit())
- printk("ip_nat_h323: out of expectations\n");
- return 0;
- }
-
- /* Try to get a pair of ports. */
- for (nated_port = ntohs(rtp_exp->tuple.dst.u.udp.port);
- nated_port != 0; nated_port += 2) {
- rtp_exp->tuple.dst.u.udp.port = htons(nated_port);
- if (ip_conntrack_expect_related(rtp_exp) == 0) {
- rtcp_exp->tuple.dst.u.udp.port =
- htons(nated_port + 1);
- if (ip_conntrack_expect_related(rtcp_exp) == 0)
- break;
- ip_conntrack_unexpect_related(rtp_exp);
- }
- }
-
- if (nated_port == 0) { /* No port available */
- if (net_ratelimit())
- printk("ip_nat_h323: out of RTP ports\n");
- return 0;
- }
-
- /* Modify signal */
- if (set_h245_addr(pskb, data, dataoff, addr,
- ct->tuplehash[!dir].tuple.dst.ip,
- (port & 1) ? nated_port + 1 : nated_port) == 0) {
- /* Save ports */
- info->rtp_port[i][dir] = rtp_port;
- info->rtp_port[i][!dir] = nated_port;
- } else {
- ip_conntrack_unexpect_related(rtp_exp);
- ip_conntrack_unexpect_related(rtcp_exp);
- return -1;
- }
-
- /* Success */
- DEBUGP("ip_nat_h323: expect RTP %u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
- NIPQUAD(rtp_exp->tuple.src.ip),
- ntohs(rtp_exp->tuple.src.u.udp.port),
- NIPQUAD(rtp_exp->tuple.dst.ip),
- ntohs(rtp_exp->tuple.dst.u.udp.port));
- DEBUGP("ip_nat_h323: expect RTCP %u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
- NIPQUAD(rtcp_exp->tuple.src.ip),
- ntohs(rtcp_exp->tuple.src.u.udp.port),
- NIPQUAD(rtcp_exp->tuple.dst.ip),
- ntohs(rtcp_exp->tuple.dst.u.udp.port));
-
- return 0;
-}
-
-/****************************************************************************/
-static int nat_t120(struct sk_buff **pskb, struct ip_conntrack *ct,
- enum ip_conntrack_info ctinfo,
- unsigned char **data, int dataoff,
- H245_TransportAddress * addr, u_int16_t port,
- struct ip_conntrack_expect *exp)
-{
- int dir = CTINFO2DIR(ctinfo);
- u_int16_t nated_port = port;
-
- /* Set expectations for NAT */
- exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
- exp->expectfn = ip_nat_follow_master;
- exp->dir = !dir;
-
- /* Try to get same port: if not, try to change it. */
- for (; nated_port != 0; nated_port++) {
- exp->tuple.dst.u.tcp.port = htons(nated_port);
- if (ip_conntrack_expect_related(exp) == 0)
- break;
- }
-
- if (nated_port == 0) { /* No port available */
- if (net_ratelimit())
- printk("ip_nat_h323: out of TCP ports\n");
- return 0;
- }
-
- /* Modify signal */
- if (set_h245_addr(pskb, data, dataoff, addr,
- ct->tuplehash[!dir].tuple.dst.ip, nated_port) < 0) {
- ip_conntrack_unexpect_related(exp);
- return -1;
- }
-
- DEBUGP("ip_nat_h323: expect T.120 %u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
- NIPQUAD(exp->tuple.src.ip), ntohs(exp->tuple.src.u.tcp.port),
- NIPQUAD(exp->tuple.dst.ip), ntohs(exp->tuple.dst.u.tcp.port));
-
- return 0;
-}
-
-/****************************************************************************
- * This conntrack expect function replaces ip_conntrack_h245_expect()
- * which was set by ip_conntrack_helper_h323.c. It calls both
- * ip_nat_follow_master() and ip_conntrack_h245_expect()
- ****************************************************************************/
-static void ip_nat_h245_expect(struct ip_conntrack *new,
- struct ip_conntrack_expect *this)
-{
- ip_nat_follow_master(new, this);
- ip_conntrack_h245_expect(new, this);
-}
-
-/****************************************************************************/
-static int nat_h245(struct sk_buff **pskb, struct ip_conntrack *ct,
- enum ip_conntrack_info ctinfo,
- unsigned char **data, int dataoff,
- TransportAddress * addr, u_int16_t port,
- struct ip_conntrack_expect *exp)
-{
- struct ip_ct_h323_master *info = &ct->help.ct_h323_info;
- int dir = CTINFO2DIR(ctinfo);
- u_int16_t nated_port = port;
-
- /* Set expectations for NAT */
- exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
- exp->expectfn = ip_nat_h245_expect;
- exp->dir = !dir;
-
- /* Check existing expects */
- if (info->sig_port[dir] == port)
- nated_port = info->sig_port[!dir];
-
- /* Try to get same port: if not, try to change it. */
- for (; nated_port != 0; nated_port++) {
- exp->tuple.dst.u.tcp.port = htons(nated_port);
- if (ip_conntrack_expect_related(exp) == 0)
- break;
- }
-
- if (nated_port == 0) { /* No port available */
- if (net_ratelimit())
- printk("ip_nat_q931: out of TCP ports\n");
- return 0;
- }
-
- /* Modify signal */
- if (set_h225_addr(pskb, data, dataoff, addr,
- ct->tuplehash[!dir].tuple.dst.ip,
- nated_port) == 0) {
- /* Save ports */
- info->sig_port[dir] = port;
- info->sig_port[!dir] = nated_port;
- } else {
- ip_conntrack_unexpect_related(exp);
- return -1;
- }
-
- DEBUGP("ip_nat_q931: expect H.245 %u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
- NIPQUAD(exp->tuple.src.ip), ntohs(exp->tuple.src.u.tcp.port),
- NIPQUAD(exp->tuple.dst.ip), ntohs(exp->tuple.dst.u.tcp.port));
-
- return 0;
-}
-
-/****************************************************************************
- * This conntrack expect function replaces ip_conntrack_q931_expect()
- * which was set by ip_conntrack_helper_h323.c.
- ****************************************************************************/
-static void ip_nat_q931_expect(struct ip_conntrack *new,
- struct ip_conntrack_expect *this)
-{
- struct ip_nat_range range;
-
- if (this->tuple.src.ip != 0) { /* Only accept calls from GK */
- ip_nat_follow_master(new, this);
- goto out;
- }
-
- /* This must be a fresh one. */
- BUG_ON(new->status & IPS_NAT_DONE_MASK);
-
- /* Change src to where master sends to */
- range.flags = IP_NAT_RANGE_MAP_IPS;
- range.min_ip = range.max_ip = new->tuplehash[!this->dir].tuple.src.ip;
-
- /* hook doesn't matter, but it has to do source manip */
- ip_nat_setup_info(new, &range, NF_IP_POST_ROUTING);
-
- /* For DST manip, map port here to where it's expected. */
- range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED);
- range.min = range.max = this->saved_proto;
- range.min_ip = range.max_ip =
- new->master->tuplehash[!this->dir].tuple.src.ip;
-
- /* hook doesn't matter, but it has to do destination manip */
- ip_nat_setup_info(new, &range, NF_IP_PRE_ROUTING);
-
- out:
- ip_conntrack_q931_expect(new, this);
-}
-
-/****************************************************************************/
-static int nat_q931(struct sk_buff **pskb, struct ip_conntrack *ct,
- enum ip_conntrack_info ctinfo,
- unsigned char **data, TransportAddress * addr, int idx,
- u_int16_t port, struct ip_conntrack_expect *exp)
-{
- struct ip_ct_h323_master *info = &ct->help.ct_h323_info;
- int dir = CTINFO2DIR(ctinfo);
- u_int16_t nated_port = port;
- __be32 ip;
-
- /* Set expectations for NAT */
- exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
- exp->expectfn = ip_nat_q931_expect;
- exp->dir = !dir;
-
- /* Check existing expects */
- if (info->sig_port[dir] == port)
- nated_port = info->sig_port[!dir];
-
- /* Try to get same port: if not, try to change it. */
- for (; nated_port != 0; nated_port++) {
- exp->tuple.dst.u.tcp.port = htons(nated_port);
- if (ip_conntrack_expect_related(exp) == 0)
- break;
- }
-
- if (nated_port == 0) { /* No port available */
- if (net_ratelimit())
- printk("ip_nat_ras: out of TCP ports\n");
- return 0;
- }
-
- /* Modify signal */
- if (set_h225_addr(pskb, data, 0, &addr[idx],
- ct->tuplehash[!dir].tuple.dst.ip,
- nated_port) == 0) {
- /* Save ports */
- info->sig_port[dir] = port;
- info->sig_port[!dir] = nated_port;
-
- /* Fix for Gnomemeeting */
- if (idx > 0 &&
- get_h225_addr(*data, &addr[0], &ip, &port) &&
- (ntohl(ip) & 0xff000000) == 0x7f000000) {
- set_h225_addr_hook(pskb, data, 0, &addr[0],
- ct->tuplehash[!dir].tuple.dst.ip,
- info->sig_port[!dir]);
- }
- } else {
- ip_conntrack_unexpect_related(exp);
- return -1;
- }
-
- /* Success */
- DEBUGP("ip_nat_ras: expect Q.931 %u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
- NIPQUAD(exp->tuple.src.ip), ntohs(exp->tuple.src.u.tcp.port),
- NIPQUAD(exp->tuple.dst.ip), ntohs(exp->tuple.dst.u.tcp.port));
-
- return 0;
-}
-
-/****************************************************************************/
-static void ip_nat_callforwarding_expect(struct ip_conntrack *new,
- struct ip_conntrack_expect *this)
-{
- struct ip_nat_range range;
-
- /* This must be a fresh one. */
- BUG_ON(new->status & IPS_NAT_DONE_MASK);
-
- /* Change src to where master sends to */
- range.flags = IP_NAT_RANGE_MAP_IPS;
- range.min_ip = range.max_ip = new->tuplehash[!this->dir].tuple.src.ip;
-
- /* hook doesn't matter, but it has to do source manip */
- ip_nat_setup_info(new, &range, NF_IP_POST_ROUTING);
-
- /* For DST manip, map port here to where it's expected. */
- range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED);
- range.min = range.max = this->saved_proto;
- range.min_ip = range.max_ip = this->saved_ip;
-
- /* hook doesn't matter, but it has to do destination manip */
- ip_nat_setup_info(new, &range, NF_IP_PRE_ROUTING);
-
- ip_conntrack_q931_expect(new, this);
-}
-
-/****************************************************************************/
-static int nat_callforwarding(struct sk_buff **pskb, struct ip_conntrack *ct,
- enum ip_conntrack_info ctinfo,
- unsigned char **data, int dataoff,
- TransportAddress * addr, u_int16_t port,
- struct ip_conntrack_expect *exp)
-{
- int dir = CTINFO2DIR(ctinfo);
- u_int16_t nated_port;
-
- /* Set expectations for NAT */
- exp->saved_ip = exp->tuple.dst.ip;
- exp->tuple.dst.ip = ct->tuplehash[!dir].tuple.dst.ip;
- exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
- exp->expectfn = ip_nat_callforwarding_expect;
- exp->dir = !dir;
-
- /* Try to get same port: if not, try to change it. */
- for (nated_port = port; nated_port != 0; nated_port++) {
- exp->tuple.dst.u.tcp.port = htons(nated_port);
- if (ip_conntrack_expect_related(exp) == 0)
- break;
- }
-
- if (nated_port == 0) { /* No port available */
- if (net_ratelimit())
- printk("ip_nat_q931: out of TCP ports\n");
- return 0;
- }
-
- /* Modify signal */
- if (!set_h225_addr(pskb, data, dataoff, addr,
- ct->tuplehash[!dir].tuple.dst.ip,
- nated_port) == 0) {
- ip_conntrack_unexpect_related(exp);
- return -1;
- }
-
- /* Success */
- DEBUGP("ip_nat_q931: expect Call Forwarding "
- "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
- NIPQUAD(exp->tuple.src.ip), ntohs(exp->tuple.src.u.tcp.port),
- NIPQUAD(exp->tuple.dst.ip), ntohs(exp->tuple.dst.u.tcp.port));
-
- return 0;
-}
-
-/****************************************************************************/
-static int __init init(void)
-{
- BUG_ON(rcu_dereference(set_h245_addr_hook) != NULL);
- BUG_ON(rcu_dereference(set_h225_addr_hook) != NULL);
- BUG_ON(rcu_dereference(set_sig_addr_hook) != NULL);
- BUG_ON(rcu_dereference(set_ras_addr_hook) != NULL);
- BUG_ON(rcu_dereference(nat_rtp_rtcp_hook) != NULL);
- BUG_ON(rcu_dereference(nat_t120_hook) != NULL);
- BUG_ON(rcu_dereference(nat_h245_hook) != NULL);
- BUG_ON(rcu_dereference(nat_callforwarding_hook) != NULL);
- BUG_ON(rcu_dereference(nat_q931_hook) != NULL);
-
- rcu_assign_pointer(set_h245_addr_hook, set_h245_addr);
- rcu_assign_pointer(set_h225_addr_hook, set_h225_addr);
- rcu_assign_pointer(set_sig_addr_hook, set_sig_addr);
- rcu_assign_pointer(set_ras_addr_hook, set_ras_addr);
- rcu_assign_pointer(nat_rtp_rtcp_hook, nat_rtp_rtcp);
- rcu_assign_pointer(nat_t120_hook, nat_t120);
- rcu_assign_pointer(nat_h245_hook, nat_h245);
- rcu_assign_pointer(nat_callforwarding_hook, nat_callforwarding);
- rcu_assign_pointer(nat_q931_hook, nat_q931);
-
- DEBUGP("ip_nat_h323: init success\n");
- return 0;
-}
-
-/****************************************************************************/
-static void __exit fini(void)
-{
- rcu_assign_pointer(set_h245_addr_hook, NULL);
- rcu_assign_pointer(set_h225_addr_hook, NULL);
- rcu_assign_pointer(set_sig_addr_hook, NULL);
- rcu_assign_pointer(set_ras_addr_hook, NULL);
- rcu_assign_pointer(nat_rtp_rtcp_hook, NULL);
- rcu_assign_pointer(nat_t120_hook, NULL);
- rcu_assign_pointer(nat_h245_hook, NULL);
- rcu_assign_pointer(nat_callforwarding_hook, NULL);
- rcu_assign_pointer(nat_q931_hook, NULL);
- synchronize_rcu();
-}
-
-/****************************************************************************/
-module_init(init);
-module_exit(fini);
-
-MODULE_AUTHOR("Jing Min Zhao <zhaojingmin@users.sourceforge.net>");
-MODULE_DESCRIPTION("H.323 NAT helper");
-MODULE_LICENSE("GPL");
diff --git a/net/ipv4/netfilter/ip_nat_helper_pptp.c b/net/ipv4/netfilter/ip_nat_helper_pptp.c
deleted file mode 100644
index ec957bbb5366..000000000000
--- a/net/ipv4/netfilter/ip_nat_helper_pptp.c
+++ /dev/null
@@ -1,350 +0,0 @@
-/*
- * ip_nat_pptp.c - Version 3.0
- *
- * NAT support for PPTP (Point to Point Tunneling Protocol).
- * PPTP is a a protocol for creating virtual private networks.
- * It is a specification defined by Microsoft and some vendors
- * working with Microsoft. PPTP is built on top of a modified
- * version of the Internet Generic Routing Encapsulation Protocol.
- * GRE is defined in RFC 1701 and RFC 1702. Documentation of
- * PPTP can be found in RFC 2637
- *
- * (C) 2000-2005 by Harald Welte <laforge@gnumonks.org>
- *
- * Development of this code funded by Astaro AG (http://www.astaro.com/)
- *
- * TODO: - NAT to a unique tuple, not to TCP source port
- * (needs netfilter tuple reservation)
- *
- * Changes:
- * 2002-02-10 - Version 1.3
- * - Use ip_nat_mangle_tcp_packet() because of cloned skb's
- * in local connections (Philip Craig <philipc@snapgear.com>)
- * - add checks for magicCookie and pptp version
- * - make argument list of pptp_{out,in}bound_packet() shorter
- * - move to C99 style initializers
- * - print version number at module loadtime
- * 2003-09-22 - Version 1.5
- * - use SNATed tcp sourceport as callid, since we get called before
- * TCP header is mangled (Philip Craig <philipc@snapgear.com>)
- * 2004-10-22 - Version 2.0
- * - kernel 2.6.x version
- * 2005-06-10 - Version 3.0
- * - kernel >= 2.6.11 version,
- * funded by Oxcoda NetBox Blue (http://www.netboxblue.com/)
- *
- */
-
-#include <linux/module.h>
-#include <linux/ip.h>
-#include <linux/tcp.h>
-#include <net/tcp.h>
-
-#include <linux/netfilter_ipv4/ip_nat.h>
-#include <linux/netfilter_ipv4/ip_nat_rule.h>
-#include <linux/netfilter_ipv4/ip_nat_helper.h>
-#include <linux/netfilter_ipv4/ip_nat_pptp.h>
-#include <linux/netfilter_ipv4/ip_conntrack_core.h>
-#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
-#include <linux/netfilter_ipv4/ip_conntrack_proto_gre.h>
-#include <linux/netfilter_ipv4/ip_conntrack_pptp.h>
-
-#define IP_NAT_PPTP_VERSION "3.0"
-
-#define REQ_CID(req, off) (*(__be16 *)((char *)(req) + (off)))
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
-MODULE_DESCRIPTION("Netfilter NAT helper module for PPTP");
-
-
-#if 0
-extern const char *pptp_msg_name[];
-#define DEBUGP(format, args...) printk(KERN_DEBUG "%s:%s: " format, __FILE__, \
- __FUNCTION__, ## args)
-#else
-#define DEBUGP(format, args...)
-#endif
-
-static void pptp_nat_expected(struct ip_conntrack *ct,
- struct ip_conntrack_expect *exp)
-{
- struct ip_conntrack *master = ct->master;
- struct ip_conntrack_expect *other_exp;
- struct ip_conntrack_tuple t;
- struct ip_ct_pptp_master *ct_pptp_info;
- struct ip_nat_pptp *nat_pptp_info;
- struct ip_nat_range range;
-
- ct_pptp_info = &master->help.ct_pptp_info;
- nat_pptp_info = &master->nat.help.nat_pptp_info;
-
- /* And here goes the grand finale of corrosion... */
-
- if (exp->dir == IP_CT_DIR_ORIGINAL) {
- DEBUGP("we are PNS->PAC\n");
- /* therefore, build tuple for PAC->PNS */
- t.src.ip = master->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip;
- t.src.u.gre.key = master->help.ct_pptp_info.pac_call_id;
- t.dst.ip = master->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip;
- t.dst.u.gre.key = master->help.ct_pptp_info.pns_call_id;
- t.dst.protonum = IPPROTO_GRE;
- } else {
- DEBUGP("we are PAC->PNS\n");
- /* build tuple for PNS->PAC */
- t.src.ip = master->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip;
- t.src.u.gre.key = master->nat.help.nat_pptp_info.pns_call_id;
- t.dst.ip = master->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip;
- t.dst.u.gre.key = master->nat.help.nat_pptp_info.pac_call_id;
- t.dst.protonum = IPPROTO_GRE;
- }
-
- DEBUGP("trying to unexpect other dir: ");
- DUMP_TUPLE(&t);
- other_exp = ip_conntrack_expect_find_get(&t);
- if (other_exp) {
- ip_conntrack_unexpect_related(other_exp);
- ip_conntrack_expect_put(other_exp);
- DEBUGP("success\n");
- } else {
- DEBUGP("not found!\n");
- }
-
- /* This must be a fresh one. */
- BUG_ON(ct->status & IPS_NAT_DONE_MASK);
-
- /* Change src to where master sends to */
- range.flags = IP_NAT_RANGE_MAP_IPS;
- range.min_ip = range.max_ip
- = ct->master->tuplehash[!exp->dir].tuple.dst.ip;
- if (exp->dir == IP_CT_DIR_ORIGINAL) {
- range.flags |= IP_NAT_RANGE_PROTO_SPECIFIED;
- range.min = range.max = exp->saved_proto;
- }
- /* hook doesn't matter, but it has to do source manip */
- ip_nat_setup_info(ct, &range, NF_IP_POST_ROUTING);
-
- /* For DST manip, map port here to where it's expected. */
- range.flags = IP_NAT_RANGE_MAP_IPS;
- range.min_ip = range.max_ip
- = ct->master->tuplehash[!exp->dir].tuple.src.ip;
- if (exp->dir == IP_CT_DIR_REPLY) {
- range.flags |= IP_NAT_RANGE_PROTO_SPECIFIED;
- range.min = range.max = exp->saved_proto;
- }
- /* hook doesn't matter, but it has to do destination manip */
- ip_nat_setup_info(ct, &range, NF_IP_PRE_ROUTING);
-}
-
-/* outbound packets == from PNS to PAC */
-static int
-pptp_outbound_pkt(struct sk_buff **pskb,
- struct ip_conntrack *ct,
- enum ip_conntrack_info ctinfo,
- struct PptpControlHeader *ctlh,
- union pptp_ctrl_union *pptpReq)
-
-{
- struct ip_ct_pptp_master *ct_pptp_info = &ct->help.ct_pptp_info;
- struct ip_nat_pptp *nat_pptp_info = &ct->nat.help.nat_pptp_info;
- u_int16_t msg;
- __be16 new_callid;
- unsigned int cid_off;
-
- new_callid = ct_pptp_info->pns_call_id;
-
- switch (msg = ntohs(ctlh->messageType)) {
- case PPTP_OUT_CALL_REQUEST:
- cid_off = offsetof(union pptp_ctrl_union, ocreq.callID);
- /* FIXME: ideally we would want to reserve a call ID
- * here. current netfilter NAT core is not able to do
- * this :( For now we use TCP source port. This breaks
- * multiple calls within one control session */
-
- /* save original call ID in nat_info */
- nat_pptp_info->pns_call_id = ct_pptp_info->pns_call_id;
-
- /* don't use tcph->source since we are at a DSTmanip
- * hook (e.g. PREROUTING) and pkt is not mangled yet */
- new_callid = ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u.tcp.port;
-
- /* save new call ID in ct info */
- ct_pptp_info->pns_call_id = new_callid;
- break;
- case PPTP_IN_CALL_REPLY:
- cid_off = offsetof(union pptp_ctrl_union, icack.callID);
- break;
- case PPTP_CALL_CLEAR_REQUEST:
- cid_off = offsetof(union pptp_ctrl_union, clrreq.callID);
- break;
- default:
- DEBUGP("unknown outbound packet 0x%04x:%s\n", msg,
- (msg <= PPTP_MSG_MAX)?
- pptp_msg_name[msg]:pptp_msg_name[0]);
- /* fall through */
-
- case PPTP_SET_LINK_INFO:
- /* only need to NAT in case PAC is behind NAT box */
- case PPTP_START_SESSION_REQUEST:
- case PPTP_START_SESSION_REPLY:
- case PPTP_STOP_SESSION_REQUEST:
- case PPTP_STOP_SESSION_REPLY:
- case PPTP_ECHO_REQUEST:
- case PPTP_ECHO_REPLY:
- /* no need to alter packet */
- return NF_ACCEPT;
- }
-
- /* only OUT_CALL_REQUEST, IN_CALL_REPLY, CALL_CLEAR_REQUEST pass
- * down to here */
- DEBUGP("altering call id from 0x%04x to 0x%04x\n",
- ntohs(REQ_CID(pptpReq, cid_off)), ntohs(new_callid));
-
- /* mangle packet */
- if (ip_nat_mangle_tcp_packet(pskb, ct, ctinfo,
- cid_off + sizeof(struct pptp_pkt_hdr) +
- sizeof(struct PptpControlHeader),
- sizeof(new_callid), (char *)&new_callid,
- sizeof(new_callid)) == 0)
- return NF_DROP;
-
- return NF_ACCEPT;
-}
-
-static void
-pptp_exp_gre(struct ip_conntrack_expect *expect_orig,
- struct ip_conntrack_expect *expect_reply)
-{
- struct ip_conntrack *ct = expect_orig->master;
- struct ip_ct_pptp_master *ct_pptp_info = &ct->help.ct_pptp_info;
- struct ip_nat_pptp *nat_pptp_info = &ct->nat.help.nat_pptp_info;
-
- /* save original PAC call ID in nat_info */
- nat_pptp_info->pac_call_id = ct_pptp_info->pac_call_id;
-
- /* alter expectation for PNS->PAC direction */
- expect_orig->saved_proto.gre.key = ct_pptp_info->pns_call_id;
- expect_orig->tuple.src.u.gre.key = nat_pptp_info->pns_call_id;
- expect_orig->tuple.dst.u.gre.key = ct_pptp_info->pac_call_id;
- expect_orig->dir = IP_CT_DIR_ORIGINAL;
-
- /* alter expectation for PAC->PNS direction */
- expect_reply->saved_proto.gre.key = nat_pptp_info->pns_call_id;
- expect_reply->tuple.src.u.gre.key = nat_pptp_info->pac_call_id;
- expect_reply->tuple.dst.u.gre.key = ct_pptp_info->pns_call_id;
- expect_reply->dir = IP_CT_DIR_REPLY;
-}
-
-/* inbound packets == from PAC to PNS */
-static int
-pptp_inbound_pkt(struct sk_buff **pskb,
- struct ip_conntrack *ct,
- enum ip_conntrack_info ctinfo,
- struct PptpControlHeader *ctlh,
- union pptp_ctrl_union *pptpReq)
-{
- struct ip_nat_pptp *nat_pptp_info = &ct->nat.help.nat_pptp_info;
- u_int16_t msg;
- __be16 new_pcid;
- unsigned int pcid_off;
-
- new_pcid = nat_pptp_info->pns_call_id;
-
- switch (msg = ntohs(ctlh->messageType)) {
- case PPTP_OUT_CALL_REPLY:
- pcid_off = offsetof(union pptp_ctrl_union, ocack.peersCallID);
- break;
- case PPTP_IN_CALL_CONNECT:
- pcid_off = offsetof(union pptp_ctrl_union, iccon.peersCallID);
- break;
- case PPTP_IN_CALL_REQUEST:
- /* only need to nat in case PAC is behind NAT box */
- return NF_ACCEPT;
- case PPTP_WAN_ERROR_NOTIFY:
- pcid_off = offsetof(union pptp_ctrl_union, wanerr.peersCallID);
- break;
- case PPTP_CALL_DISCONNECT_NOTIFY:
- pcid_off = offsetof(union pptp_ctrl_union, disc.callID);
- break;
- case PPTP_SET_LINK_INFO:
- pcid_off = offsetof(union pptp_ctrl_union, setlink.peersCallID);
- break;
-
- default:
- DEBUGP("unknown inbound packet %s\n", (msg <= PPTP_MSG_MAX)?
- pptp_msg_name[msg]:pptp_msg_name[0]);
- /* fall through */
-
- case PPTP_START_SESSION_REQUEST:
- case PPTP_START_SESSION_REPLY:
- case PPTP_STOP_SESSION_REQUEST:
- case PPTP_STOP_SESSION_REPLY:
- case PPTP_ECHO_REQUEST:
- case PPTP_ECHO_REPLY:
- /* no need to alter packet */
- return NF_ACCEPT;
- }
-
- /* only OUT_CALL_REPLY, IN_CALL_CONNECT, IN_CALL_REQUEST,
- * WAN_ERROR_NOTIFY, CALL_DISCONNECT_NOTIFY pass down here */
-
- /* mangle packet */
- DEBUGP("altering peer call id from 0x%04x to 0x%04x\n",
- ntohs(REQ_CID(pptpReq, pcid_off)), ntohs(new_pcid));
-
- if (ip_nat_mangle_tcp_packet(pskb, ct, ctinfo,
- pcid_off + sizeof(struct pptp_pkt_hdr) +
- sizeof(struct PptpControlHeader),
- sizeof(new_pcid), (char *)&new_pcid,
- sizeof(new_pcid)) == 0)
- return NF_DROP;
- return NF_ACCEPT;
-}
-
-
-extern int __init ip_nat_proto_gre_init(void);
-extern void __exit ip_nat_proto_gre_fini(void);
-
-static int __init ip_nat_helper_pptp_init(void)
-{
- int ret;
-
- DEBUGP("%s: registering NAT helper\n", __FILE__);
-
- ret = ip_nat_proto_gre_init();
- if (ret < 0)
- return ret;
-
- BUG_ON(rcu_dereference(ip_nat_pptp_hook_outbound));
- rcu_assign_pointer(ip_nat_pptp_hook_outbound, pptp_outbound_pkt);
-
- BUG_ON(rcu_dereference(ip_nat_pptp_hook_inbound));
- rcu_assign_pointer(ip_nat_pptp_hook_inbound, pptp_inbound_pkt);
-
- BUG_ON(rcu_dereference(ip_nat_pptp_hook_exp_gre));
- rcu_assign_pointer(ip_nat_pptp_hook_exp_gre, pptp_exp_gre);
-
- BUG_ON(rcu_dereference(ip_nat_pptp_hook_expectfn));
- rcu_assign_pointer(ip_nat_pptp_hook_expectfn, pptp_nat_expected);
-
- printk("ip_nat_pptp version %s loaded\n", IP_NAT_PPTP_VERSION);
- return 0;
-}
-
-static void __exit ip_nat_helper_pptp_fini(void)
-{
- DEBUGP("cleanup_module\n" );
-
- rcu_assign_pointer(ip_nat_pptp_hook_expectfn, NULL);
- rcu_assign_pointer(ip_nat_pptp_hook_exp_gre, NULL);
- rcu_assign_pointer(ip_nat_pptp_hook_inbound, NULL);
- rcu_assign_pointer(ip_nat_pptp_hook_outbound, NULL);
- synchronize_rcu();
-
- ip_nat_proto_gre_fini();
-
- printk("ip_nat_pptp version %s unloaded\n", IP_NAT_PPTP_VERSION);
-}
-
-module_init(ip_nat_helper_pptp_init);
-module_exit(ip_nat_helper_pptp_fini);
diff --git a/net/ipv4/netfilter/ip_nat_irc.c b/net/ipv4/netfilter/ip_nat_irc.c
deleted file mode 100644
index feb26b48f1d5..000000000000
--- a/net/ipv4/netfilter/ip_nat_irc.c
+++ /dev/null
@@ -1,122 +0,0 @@
-/* IRC extension for TCP NAT alteration.
- * (C) 2000-2001 by Harald Welte <laforge@gnumonks.org>
- * (C) 2004 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
- * based on a copy of RR's ip_nat_ftp.c
- *
- * ip_nat_irc.c,v 1.16 2001/12/06 07:42:10 laforge Exp
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <linux/module.h>
-#include <linux/netfilter_ipv4.h>
-#include <linux/ip.h>
-#include <linux/tcp.h>
-#include <linux/kernel.h>
-#include <net/tcp.h>
-#include <linux/netfilter_ipv4/ip_nat.h>
-#include <linux/netfilter_ipv4/ip_nat_helper.h>
-#include <linux/netfilter_ipv4/ip_nat_rule.h>
-#include <linux/netfilter_ipv4/ip_conntrack_irc.h>
-#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
-#include <linux/moduleparam.h>
-
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
-MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
-MODULE_DESCRIPTION("IRC (DCC) NAT helper");
-MODULE_LICENSE("GPL");
-
-static unsigned int help(struct sk_buff **pskb,
- enum ip_conntrack_info ctinfo,
- unsigned int matchoff,
- unsigned int matchlen,
- struct ip_conntrack_expect *exp)
-{
- u_int16_t port;
- unsigned int ret;
-
- /* "4294967296 65635 " */
- char buffer[18];
-
- DEBUGP("IRC_NAT: info (seq %u + %u) in %u\n",
- expect->seq, exp_irc_info->len,
- ntohl(tcph->seq));
-
- /* Reply comes from server. */
- exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
- exp->dir = IP_CT_DIR_REPLY;
-
- /* When you see the packet, we need to NAT it the same as the
- * this one. */
- exp->expectfn = ip_nat_follow_master;
-
- /* Try to get same port: if not, try to change it. */
- for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
- exp->tuple.dst.u.tcp.port = htons(port);
- if (ip_conntrack_expect_related(exp) == 0)
- break;
- }
-
- if (port == 0)
- return NF_DROP;
-
- /* strlen("\1DCC CHAT chat AAAAAAAA P\1\n")=27
- * strlen("\1DCC SCHAT chat AAAAAAAA P\1\n")=28
- * strlen("\1DCC SEND F AAAAAAAA P S\1\n")=26
- * strlen("\1DCC MOVE F AAAAAAAA P S\1\n")=26
- * strlen("\1DCC TSEND F AAAAAAAA P S\1\n")=27
- * AAAAAAAAA: bound addr (1.0.0.0==16777216, min 8 digits,
- * 255.255.255.255==4294967296, 10 digits)
- * P: bound port (min 1 d, max 5d (65635))
- * F: filename (min 1 d )
- * S: size (min 1 d )
- * 0x01, \n: terminators
- */
-
- /* AAA = "us", ie. where server normally talks to. */
- sprintf(buffer, "%u %u",
- ntohl(exp->master->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip),
- port);
- DEBUGP("ip_nat_irc: Inserting '%s' == %u.%u.%u.%u, port %u\n",
- buffer, NIPQUAD(exp->tuple.src.ip), port);
-
- ret = ip_nat_mangle_tcp_packet(pskb, exp->master, ctinfo,
- matchoff, matchlen, buffer,
- strlen(buffer));
- if (ret != NF_ACCEPT)
- ip_conntrack_unexpect_related(exp);
- return ret;
-}
-
-static void __exit ip_nat_irc_fini(void)
-{
- rcu_assign_pointer(ip_nat_irc_hook, NULL);
- synchronize_rcu();
-}
-
-static int __init ip_nat_irc_init(void)
-{
- BUG_ON(rcu_dereference(ip_nat_irc_hook));
- rcu_assign_pointer(ip_nat_irc_hook, help);
- return 0;
-}
-
-/* Prior to 2.6.11, we had a ports param. No longer, but don't break users. */
-static int warn_set(const char *val, struct kernel_param *kp)
-{
- printk(KERN_INFO KBUILD_MODNAME
- ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n");
- return 0;
-}
-module_param_call(ports, warn_set, NULL, NULL, 0);
-
-module_init(ip_nat_irc_init);
-module_exit(ip_nat_irc_fini);
diff --git a/net/ipv4/netfilter/ip_nat_proto_gre.c b/net/ipv4/netfilter/ip_nat_proto_gre.c
deleted file mode 100644
index 95810202d849..000000000000
--- a/net/ipv4/netfilter/ip_nat_proto_gre.c
+++ /dev/null
@@ -1,174 +0,0 @@
-/*
- * ip_nat_proto_gre.c - Version 2.0
- *
- * NAT protocol helper module for GRE.
- *
- * GRE is a generic encapsulation protocol, which is generally not very
- * suited for NAT, as it has no protocol-specific part as port numbers.
- *
- * It has an optional key field, which may help us distinguishing two
- * connections between the same two hosts.
- *
- * GRE is defined in RFC 1701 and RFC 1702, as well as RFC 2784
- *
- * PPTP is built on top of a modified version of GRE, and has a mandatory
- * field called "CallID", which serves us for the same purpose as the key
- * field in plain GRE.
- *
- * Documentation about PPTP can be found in RFC 2637
- *
- * (C) 2000-2005 by Harald Welte <laforge@gnumonks.org>
- *
- * Development of this code funded by Astaro AG (http://www.astaro.com/)
- *
- */
-
-#include <linux/module.h>
-#include <linux/ip.h>
-#include <linux/netfilter_ipv4/ip_nat.h>
-#include <linux/netfilter_ipv4/ip_nat_rule.h>
-#include <linux/netfilter_ipv4/ip_nat_protocol.h>
-#include <linux/netfilter_ipv4/ip_conntrack_proto_gre.h>
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
-MODULE_DESCRIPTION("Netfilter NAT protocol helper module for GRE");
-
-#if 0
-#define DEBUGP(format, args...) printk(KERN_DEBUG "%s:%s: " format, __FILE__, \
- __FUNCTION__, ## args)
-#else
-#define DEBUGP(x, args...)
-#endif
-
-/* is key in given range between min and max */
-static int
-gre_in_range(const struct ip_conntrack_tuple *tuple,
- enum ip_nat_manip_type maniptype,
- const union ip_conntrack_manip_proto *min,
- const union ip_conntrack_manip_proto *max)
-{
- __be16 key;
-
- if (maniptype == IP_NAT_MANIP_SRC)
- key = tuple->src.u.gre.key;
- else
- key = tuple->dst.u.gre.key;
-
- return ntohs(key) >= ntohs(min->gre.key)
- && ntohs(key) <= ntohs(max->gre.key);
-}
-
-/* generate unique tuple ... */
-static int
-gre_unique_tuple(struct ip_conntrack_tuple *tuple,
- const struct ip_nat_range *range,
- enum ip_nat_manip_type maniptype,
- const struct ip_conntrack *conntrack)
-{
- static u_int16_t key;
- __be16 *keyptr;
- unsigned int min, i, range_size;
-
- if (maniptype == IP_NAT_MANIP_SRC)
- keyptr = &tuple->src.u.gre.key;
- else
- keyptr = &tuple->dst.u.gre.key;
-
- if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) {
- DEBUGP("%p: NATing GRE PPTP\n", conntrack);
- min = 1;
- range_size = 0xffff;
- } else {
- min = ntohs(range->min.gre.key);
- range_size = ntohs(range->max.gre.key) - min + 1;
- }
-
- DEBUGP("min = %u, range_size = %u\n", min, range_size);
-
- for (i = 0; i < range_size; i++, key++) {
- *keyptr = htons(min + key % range_size);
- if (!ip_nat_used_tuple(tuple, conntrack))
- return 1;
- }
-
- DEBUGP("%p: no NAT mapping\n", conntrack);
-
- return 0;
-}
-
-/* manipulate a GRE packet according to maniptype */
-static int
-gre_manip_pkt(struct sk_buff **pskb,
- unsigned int iphdroff,
- const struct ip_conntrack_tuple *tuple,
- enum ip_nat_manip_type maniptype)
-{
- struct gre_hdr *greh;
- struct gre_hdr_pptp *pgreh;
- struct iphdr *iph = (struct iphdr *)((*pskb)->data + iphdroff);
- unsigned int hdroff = iphdroff + iph->ihl*4;
-
- /* pgreh includes two optional 32bit fields which are not required
- * to be there. That's where the magic '8' comes from */
- if (!skb_make_writable(pskb, hdroff + sizeof(*pgreh)-8))
- return 0;
-
- greh = (void *)(*pskb)->data + hdroff;
- pgreh = (struct gre_hdr_pptp *) greh;
-
- /* we only have destination manip of a packet, since 'source key'
- * is not present in the packet itself */
- if (maniptype == IP_NAT_MANIP_DST) {
- /* key manipulation is always dest */
- switch (greh->version) {
- case 0:
- if (!greh->key) {
- DEBUGP("can't nat GRE w/o key\n");
- break;
- }
- if (greh->csum) {
- /* FIXME: Never tested this code... */
- nf_proto_csum_replace4(gre_csum(greh), *pskb,
- *(gre_key(greh)),
- tuple->dst.u.gre.key, 0);
- }
- *(gre_key(greh)) = tuple->dst.u.gre.key;
- break;
- case GRE_VERSION_PPTP:
- DEBUGP("call_id -> 0x%04x\n",
- ntohs(tuple->dst.u.gre.key));
- pgreh->call_id = tuple->dst.u.gre.key;
- break;
- default:
- DEBUGP("can't nat unknown GRE version\n");
- return 0;
- break;
- }
- }
- return 1;
-}
-
-/* nat helper struct */
-static struct ip_nat_protocol gre = {
- .name = "GRE",
- .protonum = IPPROTO_GRE,
- .manip_pkt = gre_manip_pkt,
- .in_range = gre_in_range,
- .unique_tuple = gre_unique_tuple,
-#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
- defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
- .range_to_nfattr = ip_nat_port_range_to_nfattr,
- .nfattr_to_range = ip_nat_port_nfattr_to_range,
-#endif
-};
-
-int __init ip_nat_proto_gre_init(void)
-{
- return ip_nat_protocol_register(&gre);
-}
-
-void __exit ip_nat_proto_gre_fini(void)
-{
- ip_nat_protocol_unregister(&gre);
-}
diff --git a/net/ipv4/netfilter/ip_nat_proto_icmp.c b/net/ipv4/netfilter/ip_nat_proto_icmp.c
deleted file mode 100644
index 75266fe3e0fa..000000000000
--- a/net/ipv4/netfilter/ip_nat_proto_icmp.c
+++ /dev/null
@@ -1,87 +0,0 @@
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/netfilter.h>
-#include <linux/ip.h>
-#include <linux/icmp.h>
-#include <linux/if.h>
-
-#include <linux/netfilter_ipv4/ip_nat.h>
-#include <linux/netfilter_ipv4/ip_nat_core.h>
-#include <linux/netfilter_ipv4/ip_nat_rule.h>
-#include <linux/netfilter_ipv4/ip_nat_protocol.h>
-
-static int
-icmp_in_range(const struct ip_conntrack_tuple *tuple,
- enum ip_nat_manip_type maniptype,
- const union ip_conntrack_manip_proto *min,
- const union ip_conntrack_manip_proto *max)
-{
- return (tuple->src.u.icmp.id >= min->icmp.id
- && tuple->src.u.icmp.id <= max->icmp.id);
-}
-
-static int
-icmp_unique_tuple(struct ip_conntrack_tuple *tuple,
- const struct ip_nat_range *range,
- enum ip_nat_manip_type maniptype,
- const struct ip_conntrack *conntrack)
-{
- static u_int16_t id;
- unsigned int range_size;
- unsigned int i;
-
- range_size = ntohs(range->max.icmp.id) - ntohs(range->min.icmp.id) + 1;
- /* If no range specified... */
- if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED))
- range_size = 0xFFFF;
-
- for (i = 0; i < range_size; i++, id++) {
- tuple->src.u.icmp.id = htons(ntohs(range->min.icmp.id) +
- (id % range_size));
- if (!ip_nat_used_tuple(tuple, conntrack))
- return 1;
- }
- return 0;
-}
-
-static int
-icmp_manip_pkt(struct sk_buff **pskb,
- unsigned int iphdroff,
- const struct ip_conntrack_tuple *tuple,
- enum ip_nat_manip_type maniptype)
-{
- struct iphdr *iph = (struct iphdr *)((*pskb)->data + iphdroff);
- struct icmphdr *hdr;
- unsigned int hdroff = iphdroff + iph->ihl*4;
-
- if (!skb_make_writable(pskb, hdroff + sizeof(*hdr)))
- return 0;
-
- hdr = (struct icmphdr *)((*pskb)->data + hdroff);
- nf_proto_csum_replace2(&hdr->checksum, *pskb,
- hdr->un.echo.id, tuple->src.u.icmp.id, 0);
- hdr->un.echo.id = tuple->src.u.icmp.id;
- return 1;
-}
-
-struct ip_nat_protocol ip_nat_protocol_icmp = {
- .name = "ICMP",
- .protonum = IPPROTO_ICMP,
- .me = THIS_MODULE,
- .manip_pkt = icmp_manip_pkt,
- .in_range = icmp_in_range,
- .unique_tuple = icmp_unique_tuple,
-#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
- defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
- .range_to_nfattr = ip_nat_port_range_to_nfattr,
- .nfattr_to_range = ip_nat_port_nfattr_to_range,
-#endif
-};
diff --git a/net/ipv4/netfilter/ip_nat_proto_tcp.c b/net/ipv4/netfilter/ip_nat_proto_tcp.c
deleted file mode 100644
index b586d18b3fb3..000000000000
--- a/net/ipv4/netfilter/ip_nat_proto_tcp.c
+++ /dev/null
@@ -1,149 +0,0 @@
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/netfilter.h>
-#include <linux/ip.h>
-#include <linux/tcp.h>
-#include <linux/if.h>
-#include <linux/netfilter/nfnetlink_conntrack.h>
-#include <linux/netfilter_ipv4/ip_nat.h>
-#include <linux/netfilter_ipv4/ip_nat_rule.h>
-#include <linux/netfilter_ipv4/ip_nat_protocol.h>
-#include <linux/netfilter_ipv4/ip_nat_core.h>
-
-static int
-tcp_in_range(const struct ip_conntrack_tuple *tuple,
- enum ip_nat_manip_type maniptype,
- const union ip_conntrack_manip_proto *min,
- const union ip_conntrack_manip_proto *max)
-{
- __be16 port;
-
- if (maniptype == IP_NAT_MANIP_SRC)
- port = tuple->src.u.tcp.port;
- else
- port = tuple->dst.u.tcp.port;
-
- return ntohs(port) >= ntohs(min->tcp.port)
- && ntohs(port) <= ntohs(max->tcp.port);
-}
-
-static int
-tcp_unique_tuple(struct ip_conntrack_tuple *tuple,
- const struct ip_nat_range *range,
- enum ip_nat_manip_type maniptype,
- const struct ip_conntrack *conntrack)
-{
- static u_int16_t port;
- __be16 *portptr;
- unsigned int range_size, min, i;
-
- if (maniptype == IP_NAT_MANIP_SRC)
- portptr = &tuple->src.u.tcp.port;
- else
- portptr = &tuple->dst.u.tcp.port;
-
- /* If no range specified... */
- if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) {
- /* If it's dst rewrite, can't change port */
- if (maniptype == IP_NAT_MANIP_DST)
- return 0;
-
- /* Map privileged onto privileged. */
- if (ntohs(*portptr) < 1024) {
- /* Loose convention: >> 512 is credential passing */
- if (ntohs(*portptr)<512) {
- min = 1;
- range_size = 511 - min + 1;
- } else {
- min = 600;
- range_size = 1023 - min + 1;
- }
- } else {
- min = 1024;
- range_size = 65535 - 1024 + 1;
- }
- } else {
- min = ntohs(range->min.tcp.port);
- range_size = ntohs(range->max.tcp.port) - min + 1;
- }
-
- for (i = 0; i < range_size; i++, port++) {
- *portptr = htons(min + port % range_size);
- if (!ip_nat_used_tuple(tuple, conntrack)) {
- return 1;
- }
- }
- return 0;
-}
-
-static int
-tcp_manip_pkt(struct sk_buff **pskb,
- unsigned int iphdroff,
- const struct ip_conntrack_tuple *tuple,
- enum ip_nat_manip_type maniptype)
-{
- struct iphdr *iph = (struct iphdr *)((*pskb)->data + iphdroff);
- struct tcphdr *hdr;
- unsigned int hdroff = iphdroff + iph->ihl*4;
- __be32 oldip, newip;
- __be16 *portptr, newport, oldport;
- int hdrsize = 8; /* TCP connection tracking guarantees this much */
-
- /* this could be a inner header returned in icmp packet; in such
- cases we cannot update the checksum field since it is outside of
- the 8 bytes of transport layer headers we are guaranteed */
- if ((*pskb)->len >= hdroff + sizeof(struct tcphdr))
- hdrsize = sizeof(struct tcphdr);
-
- if (!skb_make_writable(pskb, hdroff + hdrsize))
- return 0;
-
- iph = (struct iphdr *)((*pskb)->data + iphdroff);
- hdr = (struct tcphdr *)((*pskb)->data + hdroff);
-
- if (maniptype == IP_NAT_MANIP_SRC) {
- /* Get rid of src ip and src pt */
- oldip = iph->saddr;
- newip = tuple->src.ip;
- newport = tuple->src.u.tcp.port;
- portptr = &hdr->source;
- } else {
- /* Get rid of dst ip and dst pt */
- oldip = iph->daddr;
- newip = tuple->dst.ip;
- newport = tuple->dst.u.tcp.port;
- portptr = &hdr->dest;
- }
-
- oldport = *portptr;
- *portptr = newport;
-
- if (hdrsize < sizeof(*hdr))
- return 1;
-
- nf_proto_csum_replace4(&hdr->check, *pskb, oldip, newip, 1);
- nf_proto_csum_replace2(&hdr->check, *pskb, oldport, newport, 0);
- return 1;
-}
-
-struct ip_nat_protocol ip_nat_protocol_tcp = {
- .name = "TCP",
- .protonum = IPPROTO_TCP,
- .me = THIS_MODULE,
- .manip_pkt = tcp_manip_pkt,
- .in_range = tcp_in_range,
- .unique_tuple = tcp_unique_tuple,
-#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
- defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
- .range_to_nfattr = ip_nat_port_range_to_nfattr,
- .nfattr_to_range = ip_nat_port_nfattr_to_range,
-#endif
-};
diff --git a/net/ipv4/netfilter/ip_nat_proto_udp.c b/net/ipv4/netfilter/ip_nat_proto_udp.c
deleted file mode 100644
index 5ced0877b32f..000000000000
--- a/net/ipv4/netfilter/ip_nat_proto_udp.c
+++ /dev/null
@@ -1,139 +0,0 @@
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/netfilter.h>
-#include <linux/ip.h>
-#include <linux/udp.h>
-#include <linux/if.h>
-
-#include <linux/netfilter_ipv4/ip_nat.h>
-#include <linux/netfilter_ipv4/ip_nat_core.h>
-#include <linux/netfilter_ipv4/ip_nat_rule.h>
-#include <linux/netfilter_ipv4/ip_nat_protocol.h>
-
-static int
-udp_in_range(const struct ip_conntrack_tuple *tuple,
- enum ip_nat_manip_type maniptype,
- const union ip_conntrack_manip_proto *min,
- const union ip_conntrack_manip_proto *max)
-{
- __be16 port;
-
- if (maniptype == IP_NAT_MANIP_SRC)
- port = tuple->src.u.udp.port;
- else
- port = tuple->dst.u.udp.port;
-
- return ntohs(port) >= ntohs(min->udp.port)
- && ntohs(port) <= ntohs(max->udp.port);
-}
-
-static int
-udp_unique_tuple(struct ip_conntrack_tuple *tuple,
- const struct ip_nat_range *range,
- enum ip_nat_manip_type maniptype,
- const struct ip_conntrack *conntrack)
-{
- static u_int16_t port;
- __be16 *portptr;
- unsigned int range_size, min, i;
-
- if (maniptype == IP_NAT_MANIP_SRC)
- portptr = &tuple->src.u.udp.port;
- else
- portptr = &tuple->dst.u.udp.port;
-
- /* If no range specified... */
- if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) {
- /* If it's dst rewrite, can't change port */
- if (maniptype == IP_NAT_MANIP_DST)
- return 0;
-
- if (ntohs(*portptr) < 1024) {
- /* Loose convention: >> 512 is credential passing */
- if (ntohs(*portptr)<512) {
- min = 1;
- range_size = 511 - min + 1;
- } else {
- min = 600;
- range_size = 1023 - min + 1;
- }
- } else {
- min = 1024;
- range_size = 65535 - 1024 + 1;
- }
- } else {
- min = ntohs(range->min.udp.port);
- range_size = ntohs(range->max.udp.port) - min + 1;
- }
-
- for (i = 0; i < range_size; i++, port++) {
- *portptr = htons(min + port % range_size);
- if (!ip_nat_used_tuple(tuple, conntrack))
- return 1;
- }
- return 0;
-}
-
-static int
-udp_manip_pkt(struct sk_buff **pskb,
- unsigned int iphdroff,
- const struct ip_conntrack_tuple *tuple,
- enum ip_nat_manip_type maniptype)
-{
- struct iphdr *iph = (struct iphdr *)((*pskb)->data + iphdroff);
- struct udphdr *hdr;
- unsigned int hdroff = iphdroff + iph->ihl*4;
- __be32 oldip, newip;
- __be16 *portptr, newport;
-
- if (!skb_make_writable(pskb, hdroff + sizeof(*hdr)))
- return 0;
-
- iph = (struct iphdr *)((*pskb)->data + iphdroff);
- hdr = (struct udphdr *)((*pskb)->data + hdroff);
-
- if (maniptype == IP_NAT_MANIP_SRC) {
- /* Get rid of src ip and src pt */
- oldip = iph->saddr;
- newip = tuple->src.ip;
- newport = tuple->src.u.udp.port;
- portptr = &hdr->source;
- } else {
- /* Get rid of dst ip and dst pt */
- oldip = iph->daddr;
- newip = tuple->dst.ip;
- newport = tuple->dst.u.udp.port;
- portptr = &hdr->dest;
- }
-
- if (hdr->check || (*pskb)->ip_summed == CHECKSUM_PARTIAL) {
- nf_proto_csum_replace4(&hdr->check, *pskb, oldip, newip, 1);
- nf_proto_csum_replace2(&hdr->check, *pskb, *portptr, newport, 0);
- if (!hdr->check)
- hdr->check = CSUM_MANGLED_0;
- }
- *portptr = newport;
- return 1;
-}
-
-struct ip_nat_protocol ip_nat_protocol_udp = {
- .name = "UDP",
- .protonum = IPPROTO_UDP,
- .me = THIS_MODULE,
- .manip_pkt = udp_manip_pkt,
- .in_range = udp_in_range,
- .unique_tuple = udp_unique_tuple,
-#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
- defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
- .range_to_nfattr = ip_nat_port_range_to_nfattr,
- .nfattr_to_range = ip_nat_port_nfattr_to_range,
-#endif
-};
diff --git a/net/ipv4/netfilter/ip_nat_proto_unknown.c b/net/ipv4/netfilter/ip_nat_proto_unknown.c
deleted file mode 100644
index 3bf049517246..000000000000
--- a/net/ipv4/netfilter/ip_nat_proto_unknown.c
+++ /dev/null
@@ -1,55 +0,0 @@
-/* The "unknown" protocol. This is what is used for protocols we
- * don't understand. It's returned by ip_ct_find_proto().
- */
-
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/netfilter.h>
-#include <linux/if.h>
-
-#include <linux/netfilter_ipv4/ip_nat.h>
-#include <linux/netfilter_ipv4/ip_nat_rule.h>
-#include <linux/netfilter_ipv4/ip_nat_protocol.h>
-
-static int unknown_in_range(const struct ip_conntrack_tuple *tuple,
- enum ip_nat_manip_type manip_type,
- const union ip_conntrack_manip_proto *min,
- const union ip_conntrack_manip_proto *max)
-{
- return 1;
-}
-
-static int unknown_unique_tuple(struct ip_conntrack_tuple *tuple,
- const struct ip_nat_range *range,
- enum ip_nat_manip_type maniptype,
- const struct ip_conntrack *conntrack)
-{
- /* Sorry: we can't help you; if it's not unique, we can't frob
- anything. */
- return 0;
-}
-
-static int
-unknown_manip_pkt(struct sk_buff **pskb,
- unsigned int iphdroff,
- const struct ip_conntrack_tuple *tuple,
- enum ip_nat_manip_type maniptype)
-{
- return 1;
-}
-
-struct ip_nat_protocol ip_nat_unknown_protocol = {
- .name = "unknown",
- /* .me isn't set: getting a ref to this cannot fail. */
- .manip_pkt = unknown_manip_pkt,
- .in_range = unknown_in_range,
- .unique_tuple = unknown_unique_tuple,
-};
diff --git a/net/ipv4/netfilter/ip_nat_rule.c b/net/ipv4/netfilter/ip_nat_rule.c
deleted file mode 100644
index a176aa3031e0..000000000000
--- a/net/ipv4/netfilter/ip_nat_rule.c
+++ /dev/null
@@ -1,308 +0,0 @@
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-/* Everything about the rules for NAT. */
-#include <linux/types.h>
-#include <linux/ip.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv4.h>
-#include <linux/module.h>
-#include <linux/kmod.h>
-#include <linux/skbuff.h>
-#include <linux/proc_fs.h>
-#include <net/checksum.h>
-#include <net/route.h>
-#include <linux/bitops.h>
-
-#include <linux/netfilter_ipv4/ip_tables.h>
-#include <linux/netfilter_ipv4/ip_nat.h>
-#include <linux/netfilter_ipv4/ip_nat_core.h>
-#include <linux/netfilter_ipv4/ip_nat_rule.h>
-
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
-#define NAT_VALID_HOOKS ((1<<NF_IP_PRE_ROUTING) | (1<<NF_IP_POST_ROUTING) | (1<<NF_IP_LOCAL_OUT))
-
-static struct
-{
- struct ipt_replace repl;
- struct ipt_standard entries[3];
- struct ipt_error term;
-} nat_initial_table __initdata
-= { { "nat", NAT_VALID_HOOKS, 4,
- sizeof(struct ipt_standard) * 3 + sizeof(struct ipt_error),
- { [NF_IP_PRE_ROUTING] = 0,
- [NF_IP_POST_ROUTING] = sizeof(struct ipt_standard),
- [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 2 },
- { [NF_IP_PRE_ROUTING] = 0,
- [NF_IP_POST_ROUTING] = sizeof(struct ipt_standard),
- [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 2 },
- 0, NULL, { } },
- {
- /* PRE_ROUTING */
- { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
- 0,
- sizeof(struct ipt_entry),
- sizeof(struct ipt_standard),
- 0, { 0, 0 }, { } },
- { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } },
- -NF_ACCEPT - 1 } },
- /* POST_ROUTING */
- { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
- 0,
- sizeof(struct ipt_entry),
- sizeof(struct ipt_standard),
- 0, { 0, 0 }, { } },
- { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } },
- -NF_ACCEPT - 1 } },
- /* LOCAL_OUT */
- { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
- 0,
- sizeof(struct ipt_entry),
- sizeof(struct ipt_standard),
- 0, { 0, 0 }, { } },
- { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } },
- -NF_ACCEPT - 1 } }
- },
- /* ERROR */
- { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
- 0,
- sizeof(struct ipt_entry),
- sizeof(struct ipt_error),
- 0, { 0, 0 }, { } },
- { { { { IPT_ALIGN(sizeof(struct ipt_error_target)), IPT_ERROR_TARGET } },
- { } },
- "ERROR"
- }
- }
-};
-
-static struct ipt_table nat_table = {
- .name = "nat",
- .valid_hooks = NAT_VALID_HOOKS,
- .lock = RW_LOCK_UNLOCKED,
- .me = THIS_MODULE,
- .af = AF_INET,
-};
-
-/* Source NAT */
-static unsigned int ipt_snat_target(struct sk_buff **pskb,
- const struct net_device *in,
- const struct net_device *out,
- unsigned int hooknum,
- const struct ipt_target *target,
- const void *targinfo)
-{
- struct ip_conntrack *ct;
- enum ip_conntrack_info ctinfo;
- const struct ip_nat_multi_range_compat *mr = targinfo;
-
- IP_NF_ASSERT(hooknum == NF_IP_POST_ROUTING);
-
- ct = ip_conntrack_get(*pskb, &ctinfo);
-
- /* Connection must be valid and new. */
- IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED
- || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY));
- IP_NF_ASSERT(out);
-
- return ip_nat_setup_info(ct, &mr->range[0], hooknum);
-}
-
-/* Before 2.6.11 we did implicit source NAT if required. Warn about change. */
-static void warn_if_extra_mangle(__be32 dstip, __be32 srcip)
-{
- static int warned = 0;
- struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dstip } } };
- struct rtable *rt;
-
- if (ip_route_output_key(&rt, &fl) != 0)
- return;
-
- if (rt->rt_src != srcip && !warned) {
- printk("NAT: no longer support implicit source local NAT\n");
- printk("NAT: packet src %u.%u.%u.%u -> dst %u.%u.%u.%u\n",
- NIPQUAD(srcip), NIPQUAD(dstip));
- warned = 1;
- }
- ip_rt_put(rt);
-}
-
-static unsigned int ipt_dnat_target(struct sk_buff **pskb,
- const struct net_device *in,
- const struct net_device *out,
- unsigned int hooknum,
- const struct ipt_target *target,
- const void *targinfo)
-{
- struct ip_conntrack *ct;
- enum ip_conntrack_info ctinfo;
- const struct ip_nat_multi_range_compat *mr = targinfo;
-
- IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING
- || hooknum == NF_IP_LOCAL_OUT);
-
- ct = ip_conntrack_get(*pskb, &ctinfo);
-
- /* Connection must be valid and new. */
- IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED));
-
- if (hooknum == NF_IP_LOCAL_OUT
- && mr->range[0].flags & IP_NAT_RANGE_MAP_IPS)
- warn_if_extra_mangle((*pskb)->nh.iph->daddr,
- mr->range[0].min_ip);
-
- return ip_nat_setup_info(ct, &mr->range[0], hooknum);
-}
-
-static int ipt_snat_checkentry(const char *tablename,
- const void *entry,
- const struct ipt_target *target,
- void *targinfo,
- unsigned int hook_mask)
-{
- struct ip_nat_multi_range_compat *mr = targinfo;
-
- /* Must be a valid range */
- if (mr->rangesize != 1) {
- printk("SNAT: multiple ranges no longer supported\n");
- return 0;
- }
- return 1;
-}
-
-static int ipt_dnat_checkentry(const char *tablename,
- const void *entry,
- const struct ipt_target *target,
- void *targinfo,
- unsigned int hook_mask)
-{
- struct ip_nat_multi_range_compat *mr = targinfo;
-
- /* Must be a valid range */
- if (mr->rangesize != 1) {
- printk("DNAT: multiple ranges no longer supported\n");
- return 0;
- }
- return 1;
-}
-
-inline unsigned int
-alloc_null_binding(struct ip_conntrack *conntrack,
- struct ip_nat_info *info,
- unsigned int hooknum)
-{
- /* Force range to this IP; let proto decide mapping for
- per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED).
- Use reply in case it's already been mangled (eg local packet).
- */
- __be32 ip
- = (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC
- ? conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip
- : conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip);
- struct ip_nat_range range
- = { IP_NAT_RANGE_MAP_IPS, ip, ip, { 0 }, { 0 } };
-
- DEBUGP("Allocating NULL binding for %p (%u.%u.%u.%u)\n", conntrack,
- NIPQUAD(ip));
- return ip_nat_setup_info(conntrack, &range, hooknum);
-}
-
-unsigned int
-alloc_null_binding_confirmed(struct ip_conntrack *conntrack,
- struct ip_nat_info *info,
- unsigned int hooknum)
-{
- __be32 ip
- = (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC
- ? conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip
- : conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip);
- u_int16_t all
- = (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC
- ? conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u.all
- : conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.src.u.all);
- struct ip_nat_range range
- = { IP_NAT_RANGE_MAP_IPS, ip, ip, { all }, { all } };
-
- DEBUGP("Allocating NULL binding for confirmed %p (%u.%u.%u.%u)\n",
- conntrack, NIPQUAD(ip));
- return ip_nat_setup_info(conntrack, &range, hooknum);
-}
-
-int ip_nat_rule_find(struct sk_buff **pskb,
- unsigned int hooknum,
- const struct net_device *in,
- const struct net_device *out,
- struct ip_conntrack *ct,
- struct ip_nat_info *info)
-{
- int ret;
-
- ret = ipt_do_table(pskb, hooknum, in, out, &nat_table);
-
- if (ret == NF_ACCEPT) {
- if (!ip_nat_initialized(ct, HOOK2MANIP(hooknum)))
- /* NUL mapping */
- ret = alloc_null_binding(ct, info, hooknum);
- }
- return ret;
-}
-
-static struct ipt_target ipt_snat_reg = {
- .name = "SNAT",
- .target = ipt_snat_target,
- .targetsize = sizeof(struct ip_nat_multi_range_compat),
- .table = "nat",
- .hooks = 1 << NF_IP_POST_ROUTING,
- .checkentry = ipt_snat_checkentry,
-};
-
-static struct ipt_target ipt_dnat_reg = {
- .name = "DNAT",
- .target = ipt_dnat_target,
- .targetsize = sizeof(struct ip_nat_multi_range_compat),
- .table = "nat",
- .hooks = (1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_OUT),
- .checkentry = ipt_dnat_checkentry,
-};
-
-int __init ip_nat_rule_init(void)
-{
- int ret;
-
- ret = ipt_register_table(&nat_table, &nat_initial_table.repl);
- if (ret != 0)
- return ret;
- ret = ipt_register_target(&ipt_snat_reg);
- if (ret != 0)
- goto unregister_table;
-
- ret = ipt_register_target(&ipt_dnat_reg);
- if (ret != 0)
- goto unregister_snat;
-
- return ret;
-
- unregister_snat:
- ipt_unregister_target(&ipt_snat_reg);
- unregister_table:
- ipt_unregister_table(&nat_table);
-
- return ret;
-}
-
-void ip_nat_rule_cleanup(void)
-{
- ipt_unregister_target(&ipt_dnat_reg);
- ipt_unregister_target(&ipt_snat_reg);
- ipt_unregister_table(&nat_table);
-}
diff --git a/net/ipv4/netfilter/ip_nat_sip.c b/net/ipv4/netfilter/ip_nat_sip.c
deleted file mode 100644
index 6223abc924ff..000000000000
--- a/net/ipv4/netfilter/ip_nat_sip.c
+++ /dev/null
@@ -1,282 +0,0 @@
-/* SIP extension for UDP NAT alteration.
- *
- * (C) 2005 by Christian Hentschel <chentschel@arnet.com.ar>
- * based on RR's ip_nat_ftp.c and other modules.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/ip.h>
-#include <linux/udp.h>
-
-#include <linux/netfilter_ipv4.h>
-#include <linux/netfilter_ipv4/ip_nat.h>
-#include <linux/netfilter_ipv4/ip_nat_helper.h>
-#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
-#include <linux/netfilter_ipv4/ip_conntrack_sip.h>
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Christian Hentschel <chentschel@arnet.com.ar>");
-MODULE_DESCRIPTION("SIP NAT helper");
-
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
-struct addr_map {
- struct {
- char src[sizeof("nnn.nnn.nnn.nnn:nnnnn")];
- char dst[sizeof("nnn.nnn.nnn.nnn:nnnnn")];
- unsigned int srclen, srciplen;
- unsigned int dstlen, dstiplen;
- } addr[IP_CT_DIR_MAX];
-};
-
-static void addr_map_init(struct ip_conntrack *ct, struct addr_map *map)
-{
- struct ip_conntrack_tuple *t;
- enum ip_conntrack_dir dir;
- unsigned int n;
-
- for (dir = 0; dir < IP_CT_DIR_MAX; dir++) {
- t = &ct->tuplehash[dir].tuple;
-
- n = sprintf(map->addr[dir].src, "%u.%u.%u.%u",
- NIPQUAD(t->src.ip));
- map->addr[dir].srciplen = n;
- n += sprintf(map->addr[dir].src + n, ":%u",
- ntohs(t->src.u.udp.port));
- map->addr[dir].srclen = n;
-
- n = sprintf(map->addr[dir].dst, "%u.%u.%u.%u",
- NIPQUAD(t->dst.ip));
- map->addr[dir].dstiplen = n;
- n += sprintf(map->addr[dir].dst + n, ":%u",
- ntohs(t->dst.u.udp.port));
- map->addr[dir].dstlen = n;
- }
-}
-
-static int map_sip_addr(struct sk_buff **pskb, enum ip_conntrack_info ctinfo,
- struct ip_conntrack *ct, const char **dptr, size_t dlen,
- enum sip_header_pos pos, struct addr_map *map)
-{
- enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
- unsigned int matchlen, matchoff, addrlen;
- char *addr;
-
- if (ct_sip_get_info(*dptr, dlen, &matchoff, &matchlen, pos) <= 0)
- return 1;
-
- if ((matchlen == map->addr[dir].srciplen ||
- matchlen == map->addr[dir].srclen) &&
- memcmp(*dptr + matchoff, map->addr[dir].src, matchlen) == 0) {
- addr = map->addr[!dir].dst;
- addrlen = map->addr[!dir].dstlen;
- } else if ((matchlen == map->addr[dir].dstiplen ||
- matchlen == map->addr[dir].dstlen) &&
- memcmp(*dptr + matchoff, map->addr[dir].dst, matchlen) == 0) {
- addr = map->addr[!dir].src;
- addrlen = map->addr[!dir].srclen;
- } else
- return 1;
-
- if (!ip_nat_mangle_udp_packet(pskb, ct, ctinfo,
- matchoff, matchlen, addr, addrlen))
- return 0;
- *dptr = (*pskb)->data + (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr);
- return 1;
-
-}
-
-static unsigned int ip_nat_sip(struct sk_buff **pskb,
- enum ip_conntrack_info ctinfo,
- struct ip_conntrack *ct,
- const char **dptr)
-{
- enum sip_header_pos pos;
- struct addr_map map;
- int dataoff, datalen;
-
- dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr);
- datalen = (*pskb)->len - dataoff;
- if (datalen < sizeof("SIP/2.0") - 1)
- return NF_DROP;
-
- addr_map_init(ct, &map);
-
- /* Basic rules: requests and responses. */
- if (strncmp(*dptr, "SIP/2.0", sizeof("SIP/2.0") - 1) != 0) {
- /* 10.2: Constructing the REGISTER Request:
- *
- * The "userinfo" and "@" components of the SIP URI MUST NOT
- * be present.
- */
- if (datalen >= sizeof("REGISTER") - 1 &&
- strncmp(*dptr, "REGISTER", sizeof("REGISTER") - 1) == 0)
- pos = POS_REG_REQ_URI;
- else
- pos = POS_REQ_URI;
-
- if (!map_sip_addr(pskb, ctinfo, ct, dptr, datalen, pos, &map))
- return NF_DROP;
- }
-
- if (!map_sip_addr(pskb, ctinfo, ct, dptr, datalen, POS_FROM, &map) ||
- !map_sip_addr(pskb, ctinfo, ct, dptr, datalen, POS_TO, &map) ||
- !map_sip_addr(pskb, ctinfo, ct, dptr, datalen, POS_VIA, &map) ||
- !map_sip_addr(pskb, ctinfo, ct, dptr, datalen, POS_CONTACT, &map))
- return NF_DROP;
- return NF_ACCEPT;
-}
-
-static unsigned int mangle_sip_packet(struct sk_buff **pskb,
- enum ip_conntrack_info ctinfo,
- struct ip_conntrack *ct,
- const char **dptr, size_t dlen,
- char *buffer, int bufflen,
- enum sip_header_pos pos)
-{
- unsigned int matchlen, matchoff;
-
- if (ct_sip_get_info(*dptr, dlen, &matchoff, &matchlen, pos) <= 0)
- return 0;
-
- if (!ip_nat_mangle_udp_packet(pskb, ct, ctinfo,
- matchoff, matchlen, buffer, bufflen))
- return 0;
-
- /* We need to reload this. Thanks Patrick. */
- *dptr = (*pskb)->data + (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr);
- return 1;
-}
-
-static int mangle_content_len(struct sk_buff **pskb,
- enum ip_conntrack_info ctinfo,
- struct ip_conntrack *ct,
- const char *dptr)
-{
- unsigned int dataoff, matchoff, matchlen;
- char buffer[sizeof("65536")];
- int bufflen;
-
- dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr);
-
- /* Get actual SDP lenght */
- if (ct_sip_get_info(dptr, (*pskb)->len - dataoff, &matchoff,
- &matchlen, POS_SDP_HEADER) > 0) {
-
- /* since ct_sip_get_info() give us a pointer passing 'v='
- we need to add 2 bytes in this count. */
- int c_len = (*pskb)->len - dataoff - matchoff + 2;
-
- /* Now, update SDP lenght */
- if (ct_sip_get_info(dptr, (*pskb)->len - dataoff, &matchoff,
- &matchlen, POS_CONTENT) > 0) {
-
- bufflen = sprintf(buffer, "%u", c_len);
-
- return ip_nat_mangle_udp_packet(pskb, ct, ctinfo,
- matchoff, matchlen,
- buffer, bufflen);
- }
- }
- return 0;
-}
-
-static unsigned int mangle_sdp(struct sk_buff **pskb,
- enum ip_conntrack_info ctinfo,
- struct ip_conntrack *ct,
- __be32 newip, u_int16_t port,
- const char *dptr)
-{
- char buffer[sizeof("nnn.nnn.nnn.nnn")];
- unsigned int dataoff, bufflen;
-
- dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr);
-
- /* Mangle owner and contact info. */
- bufflen = sprintf(buffer, "%u.%u.%u.%u", NIPQUAD(newip));
- if (!mangle_sip_packet(pskb, ctinfo, ct, &dptr, (*pskb)->len - dataoff,
- buffer, bufflen, POS_OWNER))
- return 0;
-
- if (!mangle_sip_packet(pskb, ctinfo, ct, &dptr, (*pskb)->len - dataoff,
- buffer, bufflen, POS_CONNECTION))
- return 0;
-
- /* Mangle media port. */
- bufflen = sprintf(buffer, "%u", port);
- if (!mangle_sip_packet(pskb, ctinfo, ct, &dptr, (*pskb)->len - dataoff,
- buffer, bufflen, POS_MEDIA))
- return 0;
-
- return mangle_content_len(pskb, ctinfo, ct, dptr);
-}
-
-/* So, this packet has hit the connection tracking matching code.
- Mangle it, and change the expectation to match the new version. */
-static unsigned int ip_nat_sdp(struct sk_buff **pskb,
- enum ip_conntrack_info ctinfo,
- struct ip_conntrack_expect *exp,
- const char *dptr)
-{
- struct ip_conntrack *ct = exp->master;
- enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
- __be32 newip;
- u_int16_t port;
-
- DEBUGP("ip_nat_sdp():\n");
-
- /* Connection will come from reply */
- newip = ct->tuplehash[!dir].tuple.dst.ip;
-
- exp->tuple.dst.ip = newip;
- exp->saved_proto.udp.port = exp->tuple.dst.u.udp.port;
- exp->dir = !dir;
-
- /* When you see the packet, we need to NAT it the same as the
- this one. */
- exp->expectfn = ip_nat_follow_master;
-
- /* Try to get same port: if not, try to change it. */
- for (port = ntohs(exp->saved_proto.udp.port); port != 0; port++) {
- exp->tuple.dst.u.udp.port = htons(port);
- if (ip_conntrack_expect_related(exp) == 0)
- break;
- }
-
- if (port == 0)
- return NF_DROP;
-
- if (!mangle_sdp(pskb, ctinfo, ct, newip, port, dptr)) {
- ip_conntrack_unexpect_related(exp);
- return NF_DROP;
- }
- return NF_ACCEPT;
-}
-
-static void __exit fini(void)
-{
- rcu_assign_pointer(ip_nat_sip_hook, NULL);
- rcu_assign_pointer(ip_nat_sdp_hook, NULL);
- synchronize_rcu();
-}
-
-static int __init init(void)
-{
- BUG_ON(rcu_dereference(ip_nat_sip_hook));
- BUG_ON(rcu_dereference(ip_nat_sdp_hook));
- rcu_assign_pointer(ip_nat_sip_hook, ip_nat_sip);
- rcu_assign_pointer(ip_nat_sdp_hook, ip_nat_sdp);
- return 0;
-}
-
-module_init(init);
-module_exit(fini);
diff --git a/net/ipv4/netfilter/ip_nat_snmp_basic.c b/net/ipv4/netfilter/ip_nat_snmp_basic.c
deleted file mode 100644
index c3d9f3b090c4..000000000000
--- a/net/ipv4/netfilter/ip_nat_snmp_basic.c
+++ /dev/null
@@ -1,1333 +0,0 @@
-/*
- * ip_nat_snmp_basic.c
- *
- * Basic SNMP Application Layer Gateway
- *
- * This IP NAT module is intended for use with SNMP network
- * discovery and monitoring applications where target networks use
- * conflicting private address realms.
- *
- * Static NAT is used to remap the networks from the view of the network
- * management system at the IP layer, and this module remaps some application
- * layer addresses to match.
- *
- * The simplest form of ALG is performed, where only tagged IP addresses
- * are modified. The module does not need to be MIB aware and only scans
- * messages at the ASN.1/BER level.
- *
- * Currently, only SNMPv1 and SNMPv2 are supported.
- *
- * More information on ALG and associated issues can be found in
- * RFC 2962
- *
- * The ASB.1/BER parsing code is derived from the gxsnmp package by Gregory
- * McLean & Jochen Friedrich, stripped down for use in the kernel.
- *
- * Copyright (c) 2000 RP Internet (www.rpi.net.au).
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * Author: James Morris <jmorris@intercode.com.au>
- *
- * Updates:
- * 2000-08-06: Convert to new helper API (Harald Welte).
- *
- */
-#include <linux/in.h>
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/moduleparam.h>
-#include <linux/netfilter_ipv4.h>
-#include <linux/netfilter_ipv4/ip_nat.h>
-#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
-#include <linux/netfilter_ipv4/ip_nat_helper.h>
-#include <linux/ip.h>
-#include <linux/udp.h>
-#include <net/checksum.h>
-#include <net/udp.h>
-#include <asm/uaccess.h>
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
-MODULE_DESCRIPTION("Basic SNMP Application Layer Gateway");
-
-#define SNMP_PORT 161
-#define SNMP_TRAP_PORT 162
-#define NOCT1(n) (*(u8 *)n)
-
-static int debug;
-static DEFINE_SPINLOCK(snmp_lock);
-
-/*
- * Application layer address mapping mimics the NAT mapping, but
- * only for the first octet in this case (a more flexible system
- * can be implemented if needed).
- */
-struct oct1_map
-{
- u_int8_t from;
- u_int8_t to;
-};
-
-
-/*****************************************************************************
- *
- * Basic ASN.1 decoding routines (gxsnmp author Dirk Wisse)
- *
- *****************************************************************************/
-
-/* Class */
-#define ASN1_UNI 0 /* Universal */
-#define ASN1_APL 1 /* Application */
-#define ASN1_CTX 2 /* Context */
-#define ASN1_PRV 3 /* Private */
-
-/* Tag */
-#define ASN1_EOC 0 /* End Of Contents */
-#define ASN1_BOL 1 /* Boolean */
-#define ASN1_INT 2 /* Integer */
-#define ASN1_BTS 3 /* Bit String */
-#define ASN1_OTS 4 /* Octet String */
-#define ASN1_NUL 5 /* Null */
-#define ASN1_OJI 6 /* Object Identifier */
-#define ASN1_OJD 7 /* Object Description */
-#define ASN1_EXT 8 /* External */
-#define ASN1_SEQ 16 /* Sequence */
-#define ASN1_SET 17 /* Set */
-#define ASN1_NUMSTR 18 /* Numerical String */
-#define ASN1_PRNSTR 19 /* Printable String */
-#define ASN1_TEXSTR 20 /* Teletext String */
-#define ASN1_VIDSTR 21 /* Video String */
-#define ASN1_IA5STR 22 /* IA5 String */
-#define ASN1_UNITIM 23 /* Universal Time */
-#define ASN1_GENTIM 24 /* General Time */
-#define ASN1_GRASTR 25 /* Graphical String */
-#define ASN1_VISSTR 26 /* Visible String */
-#define ASN1_GENSTR 27 /* General String */
-
-/* Primitive / Constructed methods*/
-#define ASN1_PRI 0 /* Primitive */
-#define ASN1_CON 1 /* Constructed */
-
-/*
- * Error codes.
- */
-#define ASN1_ERR_NOERROR 0
-#define ASN1_ERR_DEC_EMPTY 2
-#define ASN1_ERR_DEC_EOC_MISMATCH 3
-#define ASN1_ERR_DEC_LENGTH_MISMATCH 4
-#define ASN1_ERR_DEC_BADVALUE 5
-
-/*
- * ASN.1 context.
- */
-struct asn1_ctx
-{
- int error; /* Error condition */
- unsigned char *pointer; /* Octet just to be decoded */
- unsigned char *begin; /* First octet */
- unsigned char *end; /* Octet after last octet */
-};
-
-/*
- * Octet string (not null terminated)
- */
-struct asn1_octstr
-{
- unsigned char *data;
- unsigned int len;
-};
-
-static void asn1_open(struct asn1_ctx *ctx,
- unsigned char *buf,
- unsigned int len)
-{
- ctx->begin = buf;
- ctx->end = buf + len;
- ctx->pointer = buf;
- ctx->error = ASN1_ERR_NOERROR;
-}
-
-static unsigned char asn1_octet_decode(struct asn1_ctx *ctx, unsigned char *ch)
-{
- if (ctx->pointer >= ctx->end) {
- ctx->error = ASN1_ERR_DEC_EMPTY;
- return 0;
- }
- *ch = *(ctx->pointer)++;
- return 1;
-}
-
-static unsigned char asn1_tag_decode(struct asn1_ctx *ctx, unsigned int *tag)
-{
- unsigned char ch;
-
- *tag = 0;
-
- do
- {
- if (!asn1_octet_decode(ctx, &ch))
- return 0;
- *tag <<= 7;
- *tag |= ch & 0x7F;
- } while ((ch & 0x80) == 0x80);
- return 1;
-}
-
-static unsigned char asn1_id_decode(struct asn1_ctx *ctx,
- unsigned int *cls,
- unsigned int *con,
- unsigned int *tag)
-{
- unsigned char ch;
-
- if (!asn1_octet_decode(ctx, &ch))
- return 0;
-
- *cls = (ch & 0xC0) >> 6;
- *con = (ch & 0x20) >> 5;
- *tag = (ch & 0x1F);
-
- if (*tag == 0x1F) {
- if (!asn1_tag_decode(ctx, tag))
- return 0;
- }
- return 1;
-}
-
-static unsigned char asn1_length_decode(struct asn1_ctx *ctx,
- unsigned int *def,
- unsigned int *len)
-{
- unsigned char ch, cnt;
-
- if (!asn1_octet_decode(ctx, &ch))
- return 0;
-
- if (ch == 0x80)
- *def = 0;
- else {
- *def = 1;
-
- if (ch < 0x80)
- *len = ch;
- else {
- cnt = (unsigned char) (ch & 0x7F);
- *len = 0;
-
- while (cnt > 0) {
- if (!asn1_octet_decode(ctx, &ch))
- return 0;
- *len <<= 8;
- *len |= ch;
- cnt--;
- }
- }
- }
- return 1;
-}
-
-static unsigned char asn1_header_decode(struct asn1_ctx *ctx,
- unsigned char **eoc,
- unsigned int *cls,
- unsigned int *con,
- unsigned int *tag)
-{
- unsigned int def, len;
-
- if (!asn1_id_decode(ctx, cls, con, tag))
- return 0;
-
- def = len = 0;
- if (!asn1_length_decode(ctx, &def, &len))
- return 0;
-
- if (def)
- *eoc = ctx->pointer + len;
- else
- *eoc = NULL;
- return 1;
-}
-
-static unsigned char asn1_eoc_decode(struct asn1_ctx *ctx, unsigned char *eoc)
-{
- unsigned char ch;
-
- if (eoc == 0) {
- if (!asn1_octet_decode(ctx, &ch))
- return 0;
-
- if (ch != 0x00) {
- ctx->error = ASN1_ERR_DEC_EOC_MISMATCH;
- return 0;
- }
-
- if (!asn1_octet_decode(ctx, &ch))
- return 0;
-
- if (ch != 0x00) {
- ctx->error = ASN1_ERR_DEC_EOC_MISMATCH;
- return 0;
- }
- return 1;
- } else {
- if (ctx->pointer != eoc) {
- ctx->error = ASN1_ERR_DEC_LENGTH_MISMATCH;
- return 0;
- }
- return 1;
- }
-}
-
-static unsigned char asn1_null_decode(struct asn1_ctx *ctx, unsigned char *eoc)
-{
- ctx->pointer = eoc;
- return 1;
-}
-
-static unsigned char asn1_long_decode(struct asn1_ctx *ctx,
- unsigned char *eoc,
- long *integer)
-{
- unsigned char ch;
- unsigned int len;
-
- if (!asn1_octet_decode(ctx, &ch))
- return 0;
-
- *integer = (signed char) ch;
- len = 1;
-
- while (ctx->pointer < eoc) {
- if (++len > sizeof (long)) {
- ctx->error = ASN1_ERR_DEC_BADVALUE;
- return 0;
- }
-
- if (!asn1_octet_decode(ctx, &ch))
- return 0;
-
- *integer <<= 8;
- *integer |= ch;
- }
- return 1;
-}
-
-static unsigned char asn1_uint_decode(struct asn1_ctx *ctx,
- unsigned char *eoc,
- unsigned int *integer)
-{
- unsigned char ch;
- unsigned int len;
-
- if (!asn1_octet_decode(ctx, &ch))
- return 0;
-
- *integer = ch;
- if (ch == 0) len = 0;
- else len = 1;
-
- while (ctx->pointer < eoc) {
- if (++len > sizeof (unsigned int)) {
- ctx->error = ASN1_ERR_DEC_BADVALUE;
- return 0;
- }
-
- if (!asn1_octet_decode(ctx, &ch))
- return 0;
-
- *integer <<= 8;
- *integer |= ch;
- }
- return 1;
-}
-
-static unsigned char asn1_ulong_decode(struct asn1_ctx *ctx,
- unsigned char *eoc,
- unsigned long *integer)
-{
- unsigned char ch;
- unsigned int len;
-
- if (!asn1_octet_decode(ctx, &ch))
- return 0;
-
- *integer = ch;
- if (ch == 0) len = 0;
- else len = 1;
-
- while (ctx->pointer < eoc) {
- if (++len > sizeof (unsigned long)) {
- ctx->error = ASN1_ERR_DEC_BADVALUE;
- return 0;
- }
-
- if (!asn1_octet_decode(ctx, &ch))
- return 0;
-
- *integer <<= 8;
- *integer |= ch;
- }
- return 1;
-}
-
-static unsigned char asn1_octets_decode(struct asn1_ctx *ctx,
- unsigned char *eoc,
- unsigned char **octets,
- unsigned int *len)
-{
- unsigned char *ptr;
-
- *len = 0;
-
- *octets = kmalloc(eoc - ctx->pointer, GFP_ATOMIC);
- if (*octets == NULL) {
- if (net_ratelimit())
- printk("OOM in bsalg (%d)\n", __LINE__);
- return 0;
- }
-
- ptr = *octets;
- while (ctx->pointer < eoc) {
- if (!asn1_octet_decode(ctx, (unsigned char *)ptr++)) {
- kfree(*octets);
- *octets = NULL;
- return 0;
- }
- (*len)++;
- }
- return 1;
-}
-
-static unsigned char asn1_subid_decode(struct asn1_ctx *ctx,
- unsigned long *subid)
-{
- unsigned char ch;
-
- *subid = 0;
-
- do {
- if (!asn1_octet_decode(ctx, &ch))
- return 0;
-
- *subid <<= 7;
- *subid |= ch & 0x7F;
- } while ((ch & 0x80) == 0x80);
- return 1;
-}
-
-static unsigned char asn1_oid_decode(struct asn1_ctx *ctx,
- unsigned char *eoc,
- unsigned long **oid,
- unsigned int *len)
-{
- unsigned long subid;
- unsigned int size;
- unsigned long *optr;
-
- size = eoc - ctx->pointer + 1;
- *oid = kmalloc(size * sizeof(unsigned long), GFP_ATOMIC);
- if (*oid == NULL) {
- if (net_ratelimit())
- printk("OOM in bsalg (%d)\n", __LINE__);
- return 0;
- }
-
- optr = *oid;
-
- if (!asn1_subid_decode(ctx, &subid)) {
- kfree(*oid);
- *oid = NULL;
- return 0;
- }
-
- if (subid < 40) {
- optr [0] = 0;
- optr [1] = subid;
- } else if (subid < 80) {
- optr [0] = 1;
- optr [1] = subid - 40;
- } else {
- optr [0] = 2;
- optr [1] = subid - 80;
- }
-
- *len = 2;
- optr += 2;
-
- while (ctx->pointer < eoc) {
- if (++(*len) > size) {
- ctx->error = ASN1_ERR_DEC_BADVALUE;
- kfree(*oid);
- *oid = NULL;
- return 0;
- }
-
- if (!asn1_subid_decode(ctx, optr++)) {
- kfree(*oid);
- *oid = NULL;
- return 0;
- }
- }
- return 1;
-}
-
-/*****************************************************************************
- *
- * SNMP decoding routines (gxsnmp author Dirk Wisse)
- *
- *****************************************************************************/
-
-/* SNMP Versions */
-#define SNMP_V1 0
-#define SNMP_V2C 1
-#define SNMP_V2 2
-#define SNMP_V3 3
-
-/* Default Sizes */
-#define SNMP_SIZE_COMM 256
-#define SNMP_SIZE_OBJECTID 128
-#define SNMP_SIZE_BUFCHR 256
-#define SNMP_SIZE_BUFINT 128
-#define SNMP_SIZE_SMALLOBJECTID 16
-
-/* Requests */
-#define SNMP_PDU_GET 0
-#define SNMP_PDU_NEXT 1
-#define SNMP_PDU_RESPONSE 2
-#define SNMP_PDU_SET 3
-#define SNMP_PDU_TRAP1 4
-#define SNMP_PDU_BULK 5
-#define SNMP_PDU_INFORM 6
-#define SNMP_PDU_TRAP2 7
-
-/* Errors */
-#define SNMP_NOERROR 0
-#define SNMP_TOOBIG 1
-#define SNMP_NOSUCHNAME 2
-#define SNMP_BADVALUE 3
-#define SNMP_READONLY 4
-#define SNMP_GENERROR 5
-#define SNMP_NOACCESS 6
-#define SNMP_WRONGTYPE 7
-#define SNMP_WRONGLENGTH 8
-#define SNMP_WRONGENCODING 9
-#define SNMP_WRONGVALUE 10
-#define SNMP_NOCREATION 11
-#define SNMP_INCONSISTENTVALUE 12
-#define SNMP_RESOURCEUNAVAILABLE 13
-#define SNMP_COMMITFAILED 14
-#define SNMP_UNDOFAILED 15
-#define SNMP_AUTHORIZATIONERROR 16
-#define SNMP_NOTWRITABLE 17
-#define SNMP_INCONSISTENTNAME 18
-
-/* General SNMP V1 Traps */
-#define SNMP_TRAP_COLDSTART 0
-#define SNMP_TRAP_WARMSTART 1
-#define SNMP_TRAP_LINKDOWN 2
-#define SNMP_TRAP_LINKUP 3
-#define SNMP_TRAP_AUTFAILURE 4
-#define SNMP_TRAP_EQPNEIGHBORLOSS 5
-#define SNMP_TRAP_ENTSPECIFIC 6
-
-/* SNMPv1 Types */
-#define SNMP_NULL 0
-#define SNMP_INTEGER 1 /* l */
-#define SNMP_OCTETSTR 2 /* c */
-#define SNMP_DISPLAYSTR 2 /* c */
-#define SNMP_OBJECTID 3 /* ul */
-#define SNMP_IPADDR 4 /* uc */
-#define SNMP_COUNTER 5 /* ul */
-#define SNMP_GAUGE 6 /* ul */
-#define SNMP_TIMETICKS 7 /* ul */
-#define SNMP_OPAQUE 8 /* c */
-
-/* Additional SNMPv2 Types */
-#define SNMP_UINTEGER 5 /* ul */
-#define SNMP_BITSTR 9 /* uc */
-#define SNMP_NSAP 10 /* uc */
-#define SNMP_COUNTER64 11 /* ul */
-#define SNMP_NOSUCHOBJECT 12
-#define SNMP_NOSUCHINSTANCE 13
-#define SNMP_ENDOFMIBVIEW 14
-
-union snmp_syntax
-{
- unsigned char uc[0]; /* 8 bit unsigned */
- char c[0]; /* 8 bit signed */
- unsigned long ul[0]; /* 32 bit unsigned */
- long l[0]; /* 32 bit signed */
-};
-
-struct snmp_object
-{
- unsigned long *id;
- unsigned int id_len;
- unsigned short type;
- unsigned int syntax_len;
- union snmp_syntax syntax;
-};
-
-struct snmp_request
-{
- unsigned long id;
- unsigned int error_status;
- unsigned int error_index;
-};
-
-struct snmp_v1_trap
-{
- unsigned long *id;
- unsigned int id_len;
- unsigned long ip_address; /* pointer */
- unsigned int general;
- unsigned int specific;
- unsigned long time;
-};
-
-/* SNMP types */
-#define SNMP_IPA 0
-#define SNMP_CNT 1
-#define SNMP_GGE 2
-#define SNMP_TIT 3
-#define SNMP_OPQ 4
-#define SNMP_C64 6
-
-/* SNMP errors */
-#define SERR_NSO 0
-#define SERR_NSI 1
-#define SERR_EOM 2
-
-static inline void mangle_address(unsigned char *begin,
- unsigned char *addr,
- const struct oct1_map *map,
- __sum16 *check);
-struct snmp_cnv
-{
- unsigned int class;
- unsigned int tag;
- int syntax;
-};
-
-static struct snmp_cnv snmp_conv [] =
-{
- {ASN1_UNI, ASN1_NUL, SNMP_NULL},
- {ASN1_UNI, ASN1_INT, SNMP_INTEGER},
- {ASN1_UNI, ASN1_OTS, SNMP_OCTETSTR},
- {ASN1_UNI, ASN1_OTS, SNMP_DISPLAYSTR},
- {ASN1_UNI, ASN1_OJI, SNMP_OBJECTID},
- {ASN1_APL, SNMP_IPA, SNMP_IPADDR},
- {ASN1_APL, SNMP_CNT, SNMP_COUNTER}, /* Counter32 */
- {ASN1_APL, SNMP_GGE, SNMP_GAUGE}, /* Gauge32 == Unsigned32 */
- {ASN1_APL, SNMP_TIT, SNMP_TIMETICKS},
- {ASN1_APL, SNMP_OPQ, SNMP_OPAQUE},
-
- /* SNMPv2 data types and errors */
- {ASN1_UNI, ASN1_BTS, SNMP_BITSTR},
- {ASN1_APL, SNMP_C64, SNMP_COUNTER64},
- {ASN1_CTX, SERR_NSO, SNMP_NOSUCHOBJECT},
- {ASN1_CTX, SERR_NSI, SNMP_NOSUCHINSTANCE},
- {ASN1_CTX, SERR_EOM, SNMP_ENDOFMIBVIEW},
- {0, 0, -1}
-};
-
-static unsigned char snmp_tag_cls2syntax(unsigned int tag,
- unsigned int cls,
- unsigned short *syntax)
-{
- struct snmp_cnv *cnv;
-
- cnv = snmp_conv;
-
- while (cnv->syntax != -1) {
- if (cnv->tag == tag && cnv->class == cls) {
- *syntax = cnv->syntax;
- return 1;
- }
- cnv++;
- }
- return 0;
-}
-
-static unsigned char snmp_object_decode(struct asn1_ctx *ctx,
- struct snmp_object **obj)
-{
- unsigned int cls, con, tag, len, idlen;
- unsigned short type;
- unsigned char *eoc, *end, *p;
- unsigned long *lp, *id;
- unsigned long ul;
- long l;
-
- *obj = NULL;
- id = NULL;
-
- if (!asn1_header_decode(ctx, &eoc, &cls, &con, &tag))
- return 0;
-
- if (cls != ASN1_UNI || con != ASN1_CON || tag != ASN1_SEQ)
- return 0;
-
- if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
- return 0;
-
- if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_OJI)
- return 0;
-
- if (!asn1_oid_decode(ctx, end, &id, &idlen))
- return 0;
-
- if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) {
- kfree(id);
- return 0;
- }
-
- if (con != ASN1_PRI) {
- kfree(id);
- return 0;
- }
-
- type = 0;
- if (!snmp_tag_cls2syntax(tag, cls, &type)) {
- kfree(id);
- return 0;
- }
-
- l = 0;
- switch (type) {
- case SNMP_INTEGER:
- len = sizeof(long);
- if (!asn1_long_decode(ctx, end, &l)) {
- kfree(id);
- return 0;
- }
- *obj = kmalloc(sizeof(struct snmp_object) + len,
- GFP_ATOMIC);
- if (*obj == NULL) {
- kfree(id);
- if (net_ratelimit())
- printk("OOM in bsalg (%d)\n", __LINE__);
- return 0;
- }
- (*obj)->syntax.l[0] = l;
- break;
- case SNMP_OCTETSTR:
- case SNMP_OPAQUE:
- if (!asn1_octets_decode(ctx, end, &p, &len)) {
- kfree(id);
- return 0;
- }
- *obj = kmalloc(sizeof(struct snmp_object) + len,
- GFP_ATOMIC);
- if (*obj == NULL) {
- kfree(id);
- if (net_ratelimit())
- printk("OOM in bsalg (%d)\n", __LINE__);
- return 0;
- }
- memcpy((*obj)->syntax.c, p, len);
- kfree(p);
- break;
- case SNMP_NULL:
- case SNMP_NOSUCHOBJECT:
- case SNMP_NOSUCHINSTANCE:
- case SNMP_ENDOFMIBVIEW:
- len = 0;
- *obj = kmalloc(sizeof(struct snmp_object), GFP_ATOMIC);
- if (*obj == NULL) {
- kfree(id);
- if (net_ratelimit())
- printk("OOM in bsalg (%d)\n", __LINE__);
- return 0;
- }
- if (!asn1_null_decode(ctx, end)) {
- kfree(id);
- kfree(*obj);
- *obj = NULL;
- return 0;
- }
- break;
- case SNMP_OBJECTID:
- if (!asn1_oid_decode(ctx, end, (unsigned long **)&lp, &len)) {
- kfree(id);
- return 0;
- }
- len *= sizeof(unsigned long);
- *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC);
- if (*obj == NULL) {
- kfree(lp);
- kfree(id);
- if (net_ratelimit())
- printk("OOM in bsalg (%d)\n", __LINE__);
- return 0;
- }
- memcpy((*obj)->syntax.ul, lp, len);
- kfree(lp);
- break;
- case SNMP_IPADDR:
- if (!asn1_octets_decode(ctx, end, &p, &len)) {
- kfree(id);
- return 0;
- }
- if (len != 4) {
- kfree(p);
- kfree(id);
- return 0;
- }
- *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC);
- if (*obj == NULL) {
- kfree(p);
- kfree(id);
- if (net_ratelimit())
- printk("OOM in bsalg (%d)\n", __LINE__);
- return 0;
- }
- memcpy((*obj)->syntax.uc, p, len);
- kfree(p);
- break;
- case SNMP_COUNTER:
- case SNMP_GAUGE:
- case SNMP_TIMETICKS:
- len = sizeof(unsigned long);
- if (!asn1_ulong_decode(ctx, end, &ul)) {
- kfree(id);
- return 0;
- }
- *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC);
- if (*obj == NULL) {
- kfree(id);
- if (net_ratelimit())
- printk("OOM in bsalg (%d)\n", __LINE__);
- return 0;
- }
- (*obj)->syntax.ul[0] = ul;
- break;
- default:
- kfree(id);
- return 0;
- }
-
- (*obj)->syntax_len = len;
- (*obj)->type = type;
- (*obj)->id = id;
- (*obj)->id_len = idlen;
-
- if (!asn1_eoc_decode(ctx, eoc)) {
- kfree(id);
- kfree(*obj);
- *obj = NULL;
- return 0;
- }
- return 1;
-}
-
-static unsigned char snmp_request_decode(struct asn1_ctx *ctx,
- struct snmp_request *request)
-{
- unsigned int cls, con, tag;
- unsigned char *end;
-
- if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
- return 0;
-
- if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT)
- return 0;
-
- if (!asn1_ulong_decode(ctx, end, &request->id))
- return 0;
-
- if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
- return 0;
-
- if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT)
- return 0;
-
- if (!asn1_uint_decode(ctx, end, &request->error_status))
- return 0;
-
- if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
- return 0;
-
- if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT)
- return 0;
-
- if (!asn1_uint_decode(ctx, end, &request->error_index))
- return 0;
-
- return 1;
-}
-
-/*
- * Fast checksum update for possibly oddly-aligned UDP byte, from the
- * code example in the draft.
- */
-static void fast_csum(__sum16 *csum,
- const unsigned char *optr,
- const unsigned char *nptr,
- int offset)
-{
- unsigned char s[4];
-
- if (offset & 1) {
- s[0] = s[2] = 0;
- s[1] = ~*optr;
- s[3] = *nptr;
- } else {
- s[1] = s[3] = 0;
- s[0] = ~*optr;
- s[2] = *nptr;
- }
-
- *csum = csum_fold(csum_partial(s, 4, ~csum_unfold(*csum)));
-}
-
-/*
- * Mangle IP address.
- * - begin points to the start of the snmp messgae
- * - addr points to the start of the address
- */
-static inline void mangle_address(unsigned char *begin,
- unsigned char *addr,
- const struct oct1_map *map,
- __sum16 *check)
-{
- if (map->from == NOCT1(addr)) {
- u_int32_t old;
-
- if (debug)
- memcpy(&old, (unsigned char *)addr, sizeof(old));
-
- *addr = map->to;
-
- /* Update UDP checksum if being used */
- if (*check) {
- fast_csum(check,
- &map->from, &map->to, addr - begin);
- }
-
- if (debug)
- printk(KERN_DEBUG "bsalg: mapped %u.%u.%u.%u to "
- "%u.%u.%u.%u\n", NIPQUAD(old), NIPQUAD(*addr));
- }
-}
-
-static unsigned char snmp_trap_decode(struct asn1_ctx *ctx,
- struct snmp_v1_trap *trap,
- const struct oct1_map *map,
- __sum16 *check)
-{
- unsigned int cls, con, tag, len;
- unsigned char *end;
-
- if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
- return 0;
-
- if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_OJI)
- return 0;
-
- if (!asn1_oid_decode(ctx, end, &trap->id, &trap->id_len))
- return 0;
-
- if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
- goto err_id_free;
-
- if (!((cls == ASN1_APL && con == ASN1_PRI && tag == SNMP_IPA) ||
- (cls == ASN1_UNI && con == ASN1_PRI && tag == ASN1_OTS)))
- goto err_id_free;
-
- if (!asn1_octets_decode(ctx, end, (unsigned char **)&trap->ip_address, &len))
- goto err_id_free;
-
- /* IPv4 only */
- if (len != 4)
- goto err_addr_free;
-
- mangle_address(ctx->begin, ctx->pointer - 4, map, check);
-
- if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
- goto err_addr_free;
-
- if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT)
- goto err_addr_free;
-
- if (!asn1_uint_decode(ctx, end, &trap->general))
- goto err_addr_free;
-
- if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
- goto err_addr_free;
-
- if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT)
- goto err_addr_free;
-
- if (!asn1_uint_decode(ctx, end, &trap->specific))
- goto err_addr_free;
-
- if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
- goto err_addr_free;
-
- if (!((cls == ASN1_APL && con == ASN1_PRI && tag == SNMP_TIT) ||
- (cls == ASN1_UNI && con == ASN1_PRI && tag == ASN1_INT)))
- goto err_addr_free;
-
- if (!asn1_ulong_decode(ctx, end, &trap->time))
- goto err_addr_free;
-
- return 1;
-
-err_addr_free:
- kfree((unsigned long *)trap->ip_address);
-
-err_id_free:
- kfree(trap->id);
-
- return 0;
-}
-
-/*****************************************************************************
- *
- * Misc. routines
- *
- *****************************************************************************/
-
-static void hex_dump(unsigned char *buf, size_t len)
-{
- size_t i;
-
- for (i = 0; i < len; i++) {
- if (i && !(i % 16))
- printk("\n");
- printk("%02x ", *(buf + i));
- }
- printk("\n");
-}
-
-/*
- * Parse and mangle SNMP message according to mapping.
- * (And this is the fucking 'basic' method).
- */
-static int snmp_parse_mangle(unsigned char *msg,
- u_int16_t len,
- const struct oct1_map *map,
- __sum16 *check)
-{
- unsigned char *eoc, *end;
- unsigned int cls, con, tag, vers, pdutype;
- struct asn1_ctx ctx;
- struct asn1_octstr comm;
- struct snmp_object **obj;
-
- if (debug > 1)
- hex_dump(msg, len);
-
- asn1_open(&ctx, msg, len);
-
- /*
- * Start of SNMP message.
- */
- if (!asn1_header_decode(&ctx, &eoc, &cls, &con, &tag))
- return 0;
- if (cls != ASN1_UNI || con != ASN1_CON || tag != ASN1_SEQ)
- return 0;
-
- /*
- * Version 1 or 2 handled.
- */
- if (!asn1_header_decode(&ctx, &end, &cls, &con, &tag))
- return 0;
- if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT)
- return 0;
- if (!asn1_uint_decode (&ctx, end, &vers))
- return 0;
- if (debug > 1)
- printk(KERN_DEBUG "bsalg: snmp version: %u\n", vers + 1);
- if (vers > 1)
- return 1;
-
- /*
- * Community.
- */
- if (!asn1_header_decode (&ctx, &end, &cls, &con, &tag))
- return 0;
- if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_OTS)
- return 0;
- if (!asn1_octets_decode(&ctx, end, &comm.data, &comm.len))
- return 0;
- if (debug > 1) {
- unsigned int i;
-
- printk(KERN_DEBUG "bsalg: community: ");
- for (i = 0; i < comm.len; i++)
- printk("%c", comm.data[i]);
- printk("\n");
- }
- kfree(comm.data);
-
- /*
- * PDU type
- */
- if (!asn1_header_decode(&ctx, &eoc, &cls, &con, &pdutype))
- return 0;
- if (cls != ASN1_CTX || con != ASN1_CON)
- return 0;
- if (debug > 1) {
- unsigned char *pdus[] = {
- [SNMP_PDU_GET] = "get",
- [SNMP_PDU_NEXT] = "get-next",
- [SNMP_PDU_RESPONSE] = "response",
- [SNMP_PDU_SET] = "set",
- [SNMP_PDU_TRAP1] = "trapv1",
- [SNMP_PDU_BULK] = "bulk",
- [SNMP_PDU_INFORM] = "inform",
- [SNMP_PDU_TRAP2] = "trapv2"
- };
-
- if (pdutype > SNMP_PDU_TRAP2)
- printk(KERN_DEBUG "bsalg: bad pdu type %u\n", pdutype);
- else
- printk(KERN_DEBUG "bsalg: pdu: %s\n", pdus[pdutype]);
- }
- if (pdutype != SNMP_PDU_RESPONSE &&
- pdutype != SNMP_PDU_TRAP1 && pdutype != SNMP_PDU_TRAP2)
- return 1;
-
- /*
- * Request header or v1 trap
- */
- if (pdutype == SNMP_PDU_TRAP1) {
- struct snmp_v1_trap trap;
- unsigned char ret = snmp_trap_decode(&ctx, &trap, map, check);
-
- if (ret) {
- kfree(trap.id);
- kfree((unsigned long *)trap.ip_address);
- } else
- return ret;
-
- } else {
- struct snmp_request req;
-
- if (!snmp_request_decode(&ctx, &req))
- return 0;
-
- if (debug > 1)
- printk(KERN_DEBUG "bsalg: request: id=0x%lx error_status=%u "
- "error_index=%u\n", req.id, req.error_status,
- req.error_index);
- }
-
- /*
- * Loop through objects, look for IP addresses to mangle.
- */
- if (!asn1_header_decode(&ctx, &eoc, &cls, &con, &tag))
- return 0;
-
- if (cls != ASN1_UNI || con != ASN1_CON || tag != ASN1_SEQ)
- return 0;
-
- obj = kmalloc(sizeof(struct snmp_object), GFP_ATOMIC);
- if (obj == NULL) {
- if (net_ratelimit())
- printk(KERN_WARNING "OOM in bsalg(%d)\n", __LINE__);
- return 0;
- }
-
- while (!asn1_eoc_decode(&ctx, eoc)) {
- unsigned int i;
-
- if (!snmp_object_decode(&ctx, obj)) {
- if (*obj) {
- kfree((*obj)->id);
- kfree(*obj);
- }
- kfree(obj);
- return 0;
- }
-
- if (debug > 1) {
- printk(KERN_DEBUG "bsalg: object: ");
- for (i = 0; i < (*obj)->id_len; i++) {
- if (i > 0)
- printk(".");
- printk("%lu", (*obj)->id[i]);
- }
- printk(": type=%u\n", (*obj)->type);
-
- }
-
- if ((*obj)->type == SNMP_IPADDR)
- mangle_address(ctx.begin, ctx.pointer - 4 , map, check);
-
- kfree((*obj)->id);
- kfree(*obj);
- }
- kfree(obj);
-
- if (!asn1_eoc_decode(&ctx, eoc))
- return 0;
-
- return 1;
-}
-
-/*****************************************************************************
- *
- * NAT routines.
- *
- *****************************************************************************/
-
-/*
- * SNMP translation routine.
- */
-static int snmp_translate(struct ip_conntrack *ct,
- enum ip_conntrack_info ctinfo,
- struct sk_buff **pskb)
-{
- struct iphdr *iph = (*pskb)->nh.iph;
- struct udphdr *udph = (struct udphdr *)((__be32 *)iph + iph->ihl);
- u_int16_t udplen = ntohs(udph->len);
- u_int16_t paylen = udplen - sizeof(struct udphdr);
- int dir = CTINFO2DIR(ctinfo);
- struct oct1_map map;
-
- /*
- * Determine mappping for application layer addresses based
- * on NAT manipulations for the packet.
- */
- if (dir == IP_CT_DIR_ORIGINAL) {
- /* SNAT traps */
- map.from = NOCT1(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip);
- map.to = NOCT1(&ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip);
- } else {
- /* DNAT replies */
- map.from = NOCT1(&ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip);
- map.to = NOCT1(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip);
- }
-
- if (map.from == map.to)
- return NF_ACCEPT;
-
- if (!snmp_parse_mangle((unsigned char *)udph + sizeof(struct udphdr),
- paylen, &map, &udph->check)) {
- if (net_ratelimit())
- printk(KERN_WARNING "bsalg: parser failed\n");
- return NF_DROP;
- }
- return NF_ACCEPT;
-}
-
-/* We don't actually set up expectations, just adjust internal IP
- * addresses if this is being NATted */
-static int help(struct sk_buff **pskb,
- struct ip_conntrack *ct,
- enum ip_conntrack_info ctinfo)
-{
- int dir = CTINFO2DIR(ctinfo);
- unsigned int ret;
- struct iphdr *iph = (*pskb)->nh.iph;
- struct udphdr *udph = (struct udphdr *)((u_int32_t *)iph + iph->ihl);
-
- /* SNMP replies and originating SNMP traps get mangled */
- if (udph->source == htons(SNMP_PORT) && dir != IP_CT_DIR_REPLY)
- return NF_ACCEPT;
- if (udph->dest == htons(SNMP_TRAP_PORT) && dir != IP_CT_DIR_ORIGINAL)
- return NF_ACCEPT;
-
- /* No NAT? */
- if (!(ct->status & IPS_NAT_MASK))
- return NF_ACCEPT;
-
- /*
- * Make sure the packet length is ok. So far, we were only guaranteed
- * to have a valid length IP header plus 8 bytes, which means we have
- * enough room for a UDP header. Just verify the UDP length field so we
- * can mess around with the payload.
- */
- if (ntohs(udph->len) != (*pskb)->len - (iph->ihl << 2)) {
- if (net_ratelimit())
- printk(KERN_WARNING "SNMP: dropping malformed packet "
- "src=%u.%u.%u.%u dst=%u.%u.%u.%u\n",
- NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
- return NF_DROP;
- }
-
- if (!skb_make_writable(pskb, (*pskb)->len))
- return NF_DROP;
-
- spin_lock_bh(&snmp_lock);
- ret = snmp_translate(ct, ctinfo, pskb);
- spin_unlock_bh(&snmp_lock);
- return ret;
-}
-
-static struct ip_conntrack_helper snmp_helper = {
- .max_expected = 0,
- .timeout = 180,
- .me = THIS_MODULE,
- .help = help,
- .name = "snmp",
-
- .tuple = {.src = {.u = {.udp = {.port = __constant_htons(SNMP_PORT)}}},
- .dst = {.protonum = IPPROTO_UDP},
- },
- .mask = {.src = {.u = {0xFFFF}},
- .dst = {.protonum = 0xFF},
- },
-};
-
-static struct ip_conntrack_helper snmp_trap_helper = {
- .max_expected = 0,
- .timeout = 180,
- .me = THIS_MODULE,
- .help = help,
- .name = "snmp_trap",
-
- .tuple = {.src = {.u = {.udp = {.port = __constant_htons(SNMP_TRAP_PORT)}}},
- .dst = {.protonum = IPPROTO_UDP},
- },
- .mask = {.src = {.u = {0xFFFF}},
- .dst = {.protonum = 0xFF},
- },
-};
-
-/*****************************************************************************
- *
- * Module stuff.
- *
- *****************************************************************************/
-
-static int __init ip_nat_snmp_basic_init(void)
-{
- int ret = 0;
-
- ret = ip_conntrack_helper_register(&snmp_helper);
- if (ret < 0)
- return ret;
- ret = ip_conntrack_helper_register(&snmp_trap_helper);
- if (ret < 0) {
- ip_conntrack_helper_unregister(&snmp_helper);
- return ret;
- }
- return ret;
-}
-
-static void __exit ip_nat_snmp_basic_fini(void)
-{
- ip_conntrack_helper_unregister(&snmp_helper);
- ip_conntrack_helper_unregister(&snmp_trap_helper);
-}
-
-module_init(ip_nat_snmp_basic_init);
-module_exit(ip_nat_snmp_basic_fini);
-
-module_param(debug, int, 0600);
diff --git a/net/ipv4/netfilter/ip_nat_standalone.c b/net/ipv4/netfilter/ip_nat_standalone.c
deleted file mode 100644
index d85d2de50449..000000000000
--- a/net/ipv4/netfilter/ip_nat_standalone.c
+++ /dev/null
@@ -1,391 +0,0 @@
-/* This file contains all the functions required for the standalone
- ip_nat module.
-
- These are not required by the compatibility layer.
-*/
-
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-/*
- * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
- * - new API and handling of conntrack/nat helpers
- * - now capable of multiple expectations for one master
- * */
-
-#include <linux/types.h>
-#include <linux/icmp.h>
-#include <linux/ip.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv4.h>
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/proc_fs.h>
-#include <net/ip.h>
-#include <net/checksum.h>
-#include <linux/spinlock.h>
-
-#include <linux/netfilter_ipv4/ip_nat.h>
-#include <linux/netfilter_ipv4/ip_nat_rule.h>
-#include <linux/netfilter_ipv4/ip_nat_protocol.h>
-#include <linux/netfilter_ipv4/ip_nat_core.h>
-#include <linux/netfilter_ipv4/ip_nat_helper.h>
-#include <linux/netfilter_ipv4/ip_tables.h>
-#include <linux/netfilter_ipv4/ip_conntrack_core.h>
-
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
-#define HOOKNAME(hooknum) ((hooknum) == NF_IP_POST_ROUTING ? "POST_ROUTING" \
- : ((hooknum) == NF_IP_PRE_ROUTING ? "PRE_ROUTING" \
- : ((hooknum) == NF_IP_LOCAL_OUT ? "LOCAL_OUT" \
- : ((hooknum) == NF_IP_LOCAL_IN ? "LOCAL_IN" \
- : "*ERROR*")))
-
-#ifdef CONFIG_XFRM
-static void nat_decode_session(struct sk_buff *skb, struct flowi *fl)
-{
- struct ip_conntrack *ct;
- struct ip_conntrack_tuple *t;
- enum ip_conntrack_info ctinfo;
- enum ip_conntrack_dir dir;
- unsigned long statusbit;
-
- ct = ip_conntrack_get(skb, &ctinfo);
- if (ct == NULL)
- return;
- dir = CTINFO2DIR(ctinfo);
- t = &ct->tuplehash[dir].tuple;
-
- if (dir == IP_CT_DIR_ORIGINAL)
- statusbit = IPS_DST_NAT;
- else
- statusbit = IPS_SRC_NAT;
-
- if (ct->status & statusbit) {
- fl->fl4_dst = t->dst.ip;
- if (t->dst.protonum == IPPROTO_TCP ||
- t->dst.protonum == IPPROTO_UDP)
- fl->fl_ip_dport = t->dst.u.tcp.port;
- }
-
- statusbit ^= IPS_NAT_MASK;
-
- if (ct->status & statusbit) {
- fl->fl4_src = t->src.ip;
- if (t->dst.protonum == IPPROTO_TCP ||
- t->dst.protonum == IPPROTO_UDP)
- fl->fl_ip_sport = t->src.u.tcp.port;
- }
-}
-#endif
-
-static unsigned int
-ip_nat_fn(unsigned int hooknum,
- struct sk_buff **pskb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
-{
- struct ip_conntrack *ct;
- enum ip_conntrack_info ctinfo;
- struct ip_nat_info *info;
- /* maniptype == SRC for postrouting. */
- enum ip_nat_manip_type maniptype = HOOK2MANIP(hooknum);
-
- /* We never see fragments: conntrack defrags on pre-routing
- and local-out, and ip_nat_out protects post-routing. */
- IP_NF_ASSERT(!((*pskb)->nh.iph->frag_off
- & htons(IP_MF|IP_OFFSET)));
-
- ct = ip_conntrack_get(*pskb, &ctinfo);
- /* Can't track? It's not due to stress, or conntrack would
- have dropped it. Hence it's the user's responsibilty to
- packet filter it out, or implement conntrack/NAT for that
- protocol. 8) --RR */
- if (!ct) {
- /* Exception: ICMP redirect to new connection (not in
- hash table yet). We must not let this through, in
- case we're doing NAT to the same network. */
- if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP) {
- struct icmphdr _hdr, *hp;
-
- hp = skb_header_pointer(*pskb,
- (*pskb)->nh.iph->ihl*4,
- sizeof(_hdr), &_hdr);
- if (hp != NULL &&
- hp->type == ICMP_REDIRECT)
- return NF_DROP;
- }
- return NF_ACCEPT;
- }
-
- /* Don't try to NAT if this packet is not conntracked */
- if (ct == &ip_conntrack_untracked)
- return NF_ACCEPT;
-
- switch (ctinfo) {
- case IP_CT_RELATED:
- case IP_CT_RELATED+IP_CT_IS_REPLY:
- if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP) {
- if (!ip_nat_icmp_reply_translation(ct, ctinfo,
- hooknum, pskb))
- return NF_DROP;
- else
- return NF_ACCEPT;
- }
- /* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */
- case IP_CT_NEW:
- info = &ct->nat.info;
-
- /* Seen it before? This can happen for loopback, retrans,
- or local packets.. */
- if (!ip_nat_initialized(ct, maniptype)) {
- unsigned int ret;
-
- if (unlikely(is_confirmed(ct)))
- /* NAT module was loaded late */
- ret = alloc_null_binding_confirmed(ct, info,
- hooknum);
- else if (hooknum == NF_IP_LOCAL_IN)
- /* LOCAL_IN hook doesn't have a chain! */
- ret = alloc_null_binding(ct, info, hooknum);
- else
- ret = ip_nat_rule_find(pskb, hooknum,
- in, out, ct,
- info);
-
- if (ret != NF_ACCEPT) {
- return ret;
- }
- } else
- DEBUGP("Already setup manip %s for ct %p\n",
- maniptype == IP_NAT_MANIP_SRC ? "SRC" : "DST",
- ct);
- break;
-
- default:
- /* ESTABLISHED */
- IP_NF_ASSERT(ctinfo == IP_CT_ESTABLISHED
- || ctinfo == (IP_CT_ESTABLISHED+IP_CT_IS_REPLY));
- info = &ct->nat.info;
- }
-
- IP_NF_ASSERT(info);
- return ip_nat_packet(ct, ctinfo, hooknum, pskb);
-}
-
-static unsigned int
-ip_nat_in(unsigned int hooknum,
- struct sk_buff **pskb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
-{
- unsigned int ret;
- __be32 daddr = (*pskb)->nh.iph->daddr;
-
- ret = ip_nat_fn(hooknum, pskb, in, out, okfn);
- if (ret != NF_DROP && ret != NF_STOLEN
- && daddr != (*pskb)->nh.iph->daddr) {
- dst_release((*pskb)->dst);
- (*pskb)->dst = NULL;
- }
- return ret;
-}
-
-static unsigned int
-ip_nat_out(unsigned int hooknum,
- struct sk_buff **pskb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
-{
-#ifdef CONFIG_XFRM
- struct ip_conntrack *ct;
- enum ip_conntrack_info ctinfo;
-#endif
- unsigned int ret;
-
- /* root is playing with raw sockets. */
- if ((*pskb)->len < sizeof(struct iphdr)
- || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr))
- return NF_ACCEPT;
-
- ret = ip_nat_fn(hooknum, pskb, in, out, okfn);
-#ifdef CONFIG_XFRM
- if (ret != NF_DROP && ret != NF_STOLEN
- && (ct = ip_conntrack_get(*pskb, &ctinfo)) != NULL) {
- enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
-
- if (ct->tuplehash[dir].tuple.src.ip !=
- ct->tuplehash[!dir].tuple.dst.ip
- || ct->tuplehash[dir].tuple.src.u.all !=
- ct->tuplehash[!dir].tuple.dst.u.all
- )
- return ip_xfrm_me_harder(pskb) == 0 ? ret : NF_DROP;
- }
-#endif
- return ret;
-}
-
-static unsigned int
-ip_nat_local_fn(unsigned int hooknum,
- struct sk_buff **pskb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
-{
- struct ip_conntrack *ct;
- enum ip_conntrack_info ctinfo;
- unsigned int ret;
-
- /* root is playing with raw sockets. */
- if ((*pskb)->len < sizeof(struct iphdr)
- || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr))
- return NF_ACCEPT;
-
- ret = ip_nat_fn(hooknum, pskb, in, out, okfn);
- if (ret != NF_DROP && ret != NF_STOLEN
- && (ct = ip_conntrack_get(*pskb, &ctinfo)) != NULL) {
- enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
-
- if (ct->tuplehash[dir].tuple.dst.ip !=
- ct->tuplehash[!dir].tuple.src.ip
-#ifdef CONFIG_XFRM
- || ct->tuplehash[dir].tuple.dst.u.all !=
- ct->tuplehash[!dir].tuple.src.u.all
-#endif
- )
- if (ip_route_me_harder(pskb, RTN_UNSPEC))
- ret = NF_DROP;
- }
- return ret;
-}
-
-static unsigned int
-ip_nat_adjust(unsigned int hooknum,
- struct sk_buff **pskb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
-{
- struct ip_conntrack *ct;
- enum ip_conntrack_info ctinfo;
-
- ct = ip_conntrack_get(*pskb, &ctinfo);
- if (ct && test_bit(IPS_SEQ_ADJUST_BIT, &ct->status)) {
- DEBUGP("ip_nat_standalone: adjusting sequence number\n");
- if (!ip_nat_seq_adjust(pskb, ct, ctinfo))
- return NF_DROP;
- }
- return NF_ACCEPT;
-}
-
-/* We must be after connection tracking and before packet filtering. */
-
-static struct nf_hook_ops ip_nat_ops[] = {
- /* Before packet filtering, change destination */
- {
- .hook = ip_nat_in,
- .owner = THIS_MODULE,
- .pf = PF_INET,
- .hooknum = NF_IP_PRE_ROUTING,
- .priority = NF_IP_PRI_NAT_DST,
- },
- /* After packet filtering, change source */
- {
- .hook = ip_nat_out,
- .owner = THIS_MODULE,
- .pf = PF_INET,
- .hooknum = NF_IP_POST_ROUTING,
- .priority = NF_IP_PRI_NAT_SRC,
- },
- /* After conntrack, adjust sequence number */
- {
- .hook = ip_nat_adjust,
- .owner = THIS_MODULE,
- .pf = PF_INET,
- .hooknum = NF_IP_POST_ROUTING,
- .priority = NF_IP_PRI_NAT_SEQ_ADJUST,
- },
- /* Before packet filtering, change destination */
- {
- .hook = ip_nat_local_fn,
- .owner = THIS_MODULE,
- .pf = PF_INET,
- .hooknum = NF_IP_LOCAL_OUT,
- .priority = NF_IP_PRI_NAT_DST,
- },
- /* After packet filtering, change source */
- {
- .hook = ip_nat_fn,
- .owner = THIS_MODULE,
- .pf = PF_INET,
- .hooknum = NF_IP_LOCAL_IN,
- .priority = NF_IP_PRI_NAT_SRC,
- },
- /* After conntrack, adjust sequence number */
- {
- .hook = ip_nat_adjust,
- .owner = THIS_MODULE,
- .pf = PF_INET,
- .hooknum = NF_IP_LOCAL_IN,
- .priority = NF_IP_PRI_NAT_SEQ_ADJUST,
- },
-};
-
-static int __init ip_nat_standalone_init(void)
-{
- int ret = 0;
-
- need_conntrack();
-
-#ifdef CONFIG_XFRM
- BUG_ON(ip_nat_decode_session != NULL);
- ip_nat_decode_session = nat_decode_session;
-#endif
- ret = ip_nat_rule_init();
- if (ret < 0) {
- printk("ip_nat_init: can't setup rules.\n");
- goto cleanup_decode_session;
- }
- ret = nf_register_hooks(ip_nat_ops, ARRAY_SIZE(ip_nat_ops));
- if (ret < 0) {
- printk("ip_nat_init: can't register hooks.\n");
- goto cleanup_rule_init;
- }
- return ret;
-
- cleanup_rule_init:
- ip_nat_rule_cleanup();
- cleanup_decode_session:
-#ifdef CONFIG_XFRM
- ip_nat_decode_session = NULL;
- synchronize_net();
-#endif
- return ret;
-}
-
-static void __exit ip_nat_standalone_fini(void)
-{
- nf_unregister_hooks(ip_nat_ops, ARRAY_SIZE(ip_nat_ops));
- ip_nat_rule_cleanup();
-#ifdef CONFIG_XFRM
- ip_nat_decode_session = NULL;
- synchronize_net();
-#endif
-}
-
-module_init(ip_nat_standalone_init);
-module_exit(ip_nat_standalone_fini);
-
-MODULE_LICENSE("GPL");
diff --git a/net/ipv4/netfilter/ip_nat_tftp.c b/net/ipv4/netfilter/ip_nat_tftp.c
deleted file mode 100644
index 604793536fc1..000000000000
--- a/net/ipv4/netfilter/ip_nat_tftp.c
+++ /dev/null
@@ -1,70 +0,0 @@
-/* (C) 2001-2002 Magnus Boden <mb@ozaba.mine.nu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Version: 0.0.7
- *
- * Thu 21 Mar 2002 Harald Welte <laforge@gnumonks.org>
- * - Port to newnat API
- *
- * This module currently supports DNAT:
- * iptables -t nat -A PREROUTING -d x.x.x.x -j DNAT --to-dest x.x.x.y
- *
- * and SNAT:
- * iptables -t nat -A POSTROUTING { -j MASQUERADE , -j SNAT --to-source x.x.x.x }
- *
- * It has not been tested with
- * -j SNAT --to-source x.x.x.x-x.x.x.y since I only have one external ip
- * If you do test this please let me know if it works or not.
- *
- */
-
-#include <linux/module.h>
-#include <linux/netfilter_ipv4.h>
-#include <linux/ip.h>
-#include <linux/udp.h>
-
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv4/ip_tables.h>
-#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
-#include <linux/netfilter_ipv4/ip_conntrack_tftp.h>
-#include <linux/netfilter_ipv4/ip_nat_helper.h>
-#include <linux/netfilter_ipv4/ip_nat_rule.h>
-#include <linux/moduleparam.h>
-
-MODULE_AUTHOR("Magnus Boden <mb@ozaba.mine.nu>");
-MODULE_DESCRIPTION("tftp NAT helper");
-MODULE_LICENSE("GPL");
-
-static unsigned int help(struct sk_buff **pskb,
- enum ip_conntrack_info ctinfo,
- struct ip_conntrack_expect *exp)
-{
- struct ip_conntrack *ct = exp->master;
-
- exp->saved_proto.udp.port
- = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.udp.port;
- exp->dir = IP_CT_DIR_REPLY;
- exp->expectfn = ip_nat_follow_master;
- if (ip_conntrack_expect_related(exp) != 0)
- return NF_DROP;
- return NF_ACCEPT;
-}
-
-static void __exit ip_nat_tftp_fini(void)
-{
- rcu_assign_pointer(ip_nat_tftp_hook, NULL);
- synchronize_rcu();
-}
-
-static int __init ip_nat_tftp_init(void)
-{
- BUG_ON(rcu_dereference(ip_nat_tftp_hook));
- rcu_assign_pointer(ip_nat_tftp_hook, help);
- return 0;
-}
-
-module_init(ip_nat_tftp_init);
-module_exit(ip_nat_tftp_fini);
diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c
deleted file mode 100644
index cd520df4dcf4..000000000000
--- a/net/ipv4/netfilter/ip_queue.c
+++ /dev/null
@@ -1,742 +0,0 @@
-/*
- * This is a module which is used for queueing IPv4 packets and
- * communicating with userspace via netlink.
- *
- * (C) 2000-2002 James Morris <jmorris@intercode.com.au>
- * (C) 2003-2005 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * 2000-03-27: Simplified code (thanks to Andi Kleen for clues).
- * 2000-05-20: Fixed notifier problems (following Miguel Freitas' report).
- * 2000-06-19: Fixed so nfmark is copied to metadata (reported by Sebastian
- * Zander).
- * 2000-08-01: Added Nick Williams' MAC support.
- * 2002-06-25: Code cleanup.
- * 2005-01-10: Added /proc counter for dropped packets; fixed so
- * packets aren't delivered to user space if they're going
- * to be dropped.
- * 2005-05-26: local_bh_{disable,enable} around nf_reinject (Harald Welte)
- *
- */
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/init.h>
-#include <linux/ip.h>
-#include <linux/notifier.h>
-#include <linux/netdevice.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv4/ip_queue.h>
-#include <linux/netfilter_ipv4/ip_tables.h>
-#include <linux/netlink.h>
-#include <linux/spinlock.h>
-#include <linux/sysctl.h>
-#include <linux/proc_fs.h>
-#include <linux/security.h>
-#include <linux/mutex.h>
-#include <net/sock.h>
-#include <net/route.h>
-
-#define IPQ_QMAX_DEFAULT 1024
-#define IPQ_PROC_FS_NAME "ip_queue"
-#define NET_IPQ_QMAX 2088
-#define NET_IPQ_QMAX_NAME "ip_queue_maxlen"
-
-struct ipq_queue_entry {
- struct list_head list;
- struct nf_info *info;
- struct sk_buff *skb;
-};
-
-typedef int (*ipq_cmpfn)(struct ipq_queue_entry *, unsigned long);
-
-static unsigned char copy_mode __read_mostly = IPQ_COPY_NONE;
-static unsigned int queue_maxlen __read_mostly = IPQ_QMAX_DEFAULT;
-static DEFINE_RWLOCK(queue_lock);
-static int peer_pid __read_mostly;
-static unsigned int copy_range __read_mostly;
-static unsigned int queue_total;
-static unsigned int queue_dropped = 0;
-static unsigned int queue_user_dropped = 0;
-static struct sock *ipqnl __read_mostly;
-static LIST_HEAD(queue_list);
-static DEFINE_MUTEX(ipqnl_mutex);
-
-static void
-ipq_issue_verdict(struct ipq_queue_entry *entry, int verdict)
-{
- /* TCP input path (and probably other bits) assume to be called
- * from softirq context, not from syscall, like ipq_issue_verdict is
- * called. TCP input path deadlocks with locks taken from timer
- * softirq, e.g. We therefore emulate this by local_bh_disable() */
-
- local_bh_disable();
- nf_reinject(entry->skb, entry->info, verdict);
- local_bh_enable();
-
- kfree(entry);
-}
-
-static inline void
-__ipq_enqueue_entry(struct ipq_queue_entry *entry)
-{
- list_add(&entry->list, &queue_list);
- queue_total++;
-}
-
-/*
- * Find and return a queued entry matched by cmpfn, or return the last
- * entry if cmpfn is NULL.
- */
-static inline struct ipq_queue_entry *
-__ipq_find_entry(ipq_cmpfn cmpfn, unsigned long data)
-{
- struct list_head *p;
-
- list_for_each_prev(p, &queue_list) {
- struct ipq_queue_entry *entry = (struct ipq_queue_entry *)p;
-
- if (!cmpfn || cmpfn(entry, data))
- return entry;
- }
- return NULL;
-}
-
-static inline void
-__ipq_dequeue_entry(struct ipq_queue_entry *entry)
-{
- list_del(&entry->list);
- queue_total--;
-}
-
-static inline struct ipq_queue_entry *
-__ipq_find_dequeue_entry(ipq_cmpfn cmpfn, unsigned long data)
-{
- struct ipq_queue_entry *entry;
-
- entry = __ipq_find_entry(cmpfn, data);
- if (entry == NULL)
- return NULL;
-
- __ipq_dequeue_entry(entry);
- return entry;
-}
-
-
-static inline void
-__ipq_flush(int verdict)
-{
- struct ipq_queue_entry *entry;
-
- while ((entry = __ipq_find_dequeue_entry(NULL, 0)))
- ipq_issue_verdict(entry, verdict);
-}
-
-static inline int
-__ipq_set_mode(unsigned char mode, unsigned int range)
-{
- int status = 0;
-
- switch(mode) {
- case IPQ_COPY_NONE:
- case IPQ_COPY_META:
- copy_mode = mode;
- copy_range = 0;
- break;
-
- case IPQ_COPY_PACKET:
- copy_mode = mode;
- copy_range = range;
- if (copy_range > 0xFFFF)
- copy_range = 0xFFFF;
- break;
-
- default:
- status = -EINVAL;
-
- }
- return status;
-}
-
-static inline void
-__ipq_reset(void)
-{
- peer_pid = 0;
- net_disable_timestamp();
- __ipq_set_mode(IPQ_COPY_NONE, 0);
- __ipq_flush(NF_DROP);
-}
-
-static struct ipq_queue_entry *
-ipq_find_dequeue_entry(ipq_cmpfn cmpfn, unsigned long data)
-{
- struct ipq_queue_entry *entry;
-
- write_lock_bh(&queue_lock);
- entry = __ipq_find_dequeue_entry(cmpfn, data);
- write_unlock_bh(&queue_lock);
- return entry;
-}
-
-static void
-ipq_flush(int verdict)
-{
- write_lock_bh(&queue_lock);
- __ipq_flush(verdict);
- write_unlock_bh(&queue_lock);
-}
-
-static struct sk_buff *
-ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp)
-{
- unsigned char *old_tail;
- size_t size = 0;
- size_t data_len = 0;
- struct sk_buff *skb;
- struct ipq_packet_msg *pmsg;
- struct nlmsghdr *nlh;
-
- read_lock_bh(&queue_lock);
-
- switch (copy_mode) {
- case IPQ_COPY_META:
- case IPQ_COPY_NONE:
- size = NLMSG_SPACE(sizeof(*pmsg));
- data_len = 0;
- break;
-
- case IPQ_COPY_PACKET:
- if ((entry->skb->ip_summed == CHECKSUM_PARTIAL ||
- entry->skb->ip_summed == CHECKSUM_COMPLETE) &&
- (*errp = skb_checksum_help(entry->skb))) {
- read_unlock_bh(&queue_lock);
- return NULL;
- }
- if (copy_range == 0 || copy_range > entry->skb->len)
- data_len = entry->skb->len;
- else
- data_len = copy_range;
-
- size = NLMSG_SPACE(sizeof(*pmsg) + data_len);
- break;
-
- default:
- *errp = -EINVAL;
- read_unlock_bh(&queue_lock);
- return NULL;
- }
-
- read_unlock_bh(&queue_lock);
-
- skb = alloc_skb(size, GFP_ATOMIC);
- if (!skb)
- goto nlmsg_failure;
-
- old_tail= skb->tail;
- nlh = NLMSG_PUT(skb, 0, 0, IPQM_PACKET, size - sizeof(*nlh));
- pmsg = NLMSG_DATA(nlh);
- memset(pmsg, 0, sizeof(*pmsg));
-
- pmsg->packet_id = (unsigned long )entry;
- pmsg->data_len = data_len;
- pmsg->timestamp_sec = entry->skb->tstamp.off_sec;
- pmsg->timestamp_usec = entry->skb->tstamp.off_usec;
- pmsg->mark = entry->skb->mark;
- pmsg->hook = entry->info->hook;
- pmsg->hw_protocol = entry->skb->protocol;
-
- if (entry->info->indev)
- strcpy(pmsg->indev_name, entry->info->indev->name);
- else
- pmsg->indev_name[0] = '\0';
-
- if (entry->info->outdev)
- strcpy(pmsg->outdev_name, entry->info->outdev->name);
- else
- pmsg->outdev_name[0] = '\0';
-
- if (entry->info->indev && entry->skb->dev) {
- pmsg->hw_type = entry->skb->dev->type;
- if (entry->skb->dev->hard_header_parse)
- pmsg->hw_addrlen =
- entry->skb->dev->hard_header_parse(entry->skb,
- pmsg->hw_addr);
- }
-
- if (data_len)
- if (skb_copy_bits(entry->skb, 0, pmsg->payload, data_len))
- BUG();
-
- nlh->nlmsg_len = skb->tail - old_tail;
- return skb;
-
-nlmsg_failure:
- if (skb)
- kfree_skb(skb);
- *errp = -EINVAL;
- printk(KERN_ERR "ip_queue: error creating packet message\n");
- return NULL;
-}
-
-static int
-ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info,
- unsigned int queuenum, void *data)
-{
- int status = -EINVAL;
- struct sk_buff *nskb;
- struct ipq_queue_entry *entry;
-
- if (copy_mode == IPQ_COPY_NONE)
- return -EAGAIN;
-
- entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
- if (entry == NULL) {
- printk(KERN_ERR "ip_queue: OOM in ipq_enqueue_packet()\n");
- return -ENOMEM;
- }
-
- entry->info = info;
- entry->skb = skb;
-
- nskb = ipq_build_packet_message(entry, &status);
- if (nskb == NULL)
- goto err_out_free;
-
- write_lock_bh(&queue_lock);
-
- if (!peer_pid)
- goto err_out_free_nskb;
-
- if (queue_total >= queue_maxlen) {
- queue_dropped++;
- status = -ENOSPC;
- if (net_ratelimit())
- printk (KERN_WARNING "ip_queue: full at %d entries, "
- "dropping packets(s). Dropped: %d\n", queue_total,
- queue_dropped);
- goto err_out_free_nskb;
- }
-
- /* netlink_unicast will either free the nskb or attach it to a socket */
- status = netlink_unicast(ipqnl, nskb, peer_pid, MSG_DONTWAIT);
- if (status < 0) {
- queue_user_dropped++;
- goto err_out_unlock;
- }
-
- __ipq_enqueue_entry(entry);
-
- write_unlock_bh(&queue_lock);
- return status;
-
-err_out_free_nskb:
- kfree_skb(nskb);
-
-err_out_unlock:
- write_unlock_bh(&queue_lock);
-
-err_out_free:
- kfree(entry);
- return status;
-}
-
-static int
-ipq_mangle_ipv4(ipq_verdict_msg_t *v, struct ipq_queue_entry *e)
-{
- int diff;
- struct iphdr *user_iph = (struct iphdr *)v->payload;
-
- if (v->data_len < sizeof(*user_iph))
- return 0;
- diff = v->data_len - e->skb->len;
- if (diff < 0) {
- if (pskb_trim(e->skb, v->data_len))
- return -ENOMEM;
- } else if (diff > 0) {
- if (v->data_len > 0xFFFF)
- return -EINVAL;
- if (diff > skb_tailroom(e->skb)) {
- struct sk_buff *newskb;
-
- newskb = skb_copy_expand(e->skb,
- skb_headroom(e->skb),
- diff,
- GFP_ATOMIC);
- if (newskb == NULL) {
- printk(KERN_WARNING "ip_queue: OOM "
- "in mangle, dropping packet\n");
- return -ENOMEM;
- }
- if (e->skb->sk)
- skb_set_owner_w(newskb, e->skb->sk);
- kfree_skb(e->skb);
- e->skb = newskb;
- }
- skb_put(e->skb, diff);
- }
- if (!skb_make_writable(&e->skb, v->data_len))
- return -ENOMEM;
- memcpy(e->skb->data, v->payload, v->data_len);
- e->skb->ip_summed = CHECKSUM_NONE;
-
- return 0;
-}
-
-static inline int
-id_cmp(struct ipq_queue_entry *e, unsigned long id)
-{
- return (id == (unsigned long )e);
-}
-
-static int
-ipq_set_verdict(struct ipq_verdict_msg *vmsg, unsigned int len)
-{
- struct ipq_queue_entry *entry;
-
- if (vmsg->value > NF_MAX_VERDICT)
- return -EINVAL;
-
- entry = ipq_find_dequeue_entry(id_cmp, vmsg->id);
- if (entry == NULL)
- return -ENOENT;
- else {
- int verdict = vmsg->value;
-
- if (vmsg->data_len && vmsg->data_len == len)
- if (ipq_mangle_ipv4(vmsg, entry) < 0)
- verdict = NF_DROP;
-
- ipq_issue_verdict(entry, verdict);
- return 0;
- }
-}
-
-static int
-ipq_set_mode(unsigned char mode, unsigned int range)
-{
- int status;
-
- write_lock_bh(&queue_lock);
- status = __ipq_set_mode(mode, range);
- write_unlock_bh(&queue_lock);
- return status;
-}
-
-static int
-ipq_receive_peer(struct ipq_peer_msg *pmsg,
- unsigned char type, unsigned int len)
-{
- int status = 0;
-
- if (len < sizeof(*pmsg))
- return -EINVAL;
-
- switch (type) {
- case IPQM_MODE:
- status = ipq_set_mode(pmsg->msg.mode.value,
- pmsg->msg.mode.range);
- break;
-
- case IPQM_VERDICT:
- if (pmsg->msg.verdict.value > NF_MAX_VERDICT)
- status = -EINVAL;
- else
- status = ipq_set_verdict(&pmsg->msg.verdict,
- len - sizeof(*pmsg));
- break;
- default:
- status = -EINVAL;
- }
- return status;
-}
-
-static int
-dev_cmp(struct ipq_queue_entry *entry, unsigned long ifindex)
-{
- if (entry->info->indev)
- if (entry->info->indev->ifindex == ifindex)
- return 1;
- if (entry->info->outdev)
- if (entry->info->outdev->ifindex == ifindex)
- return 1;
-#ifdef CONFIG_BRIDGE_NETFILTER
- if (entry->skb->nf_bridge) {
- if (entry->skb->nf_bridge->physindev &&
- entry->skb->nf_bridge->physindev->ifindex == ifindex)
- return 1;
- if (entry->skb->nf_bridge->physoutdev &&
- entry->skb->nf_bridge->physoutdev->ifindex == ifindex)
- return 1;
- }
-#endif
- return 0;
-}
-
-static void
-ipq_dev_drop(int ifindex)
-{
- struct ipq_queue_entry *entry;
-
- while ((entry = ipq_find_dequeue_entry(dev_cmp, ifindex)) != NULL)
- ipq_issue_verdict(entry, NF_DROP);
-}
-
-#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0)
-
-static inline void
-ipq_rcv_skb(struct sk_buff *skb)
-{
- int status, type, pid, flags, nlmsglen, skblen;
- struct nlmsghdr *nlh;
-
- skblen = skb->len;
- if (skblen < sizeof(*nlh))
- return;
-
- nlh = (struct nlmsghdr *)skb->data;
- nlmsglen = nlh->nlmsg_len;
- if (nlmsglen < sizeof(*nlh) || skblen < nlmsglen)
- return;
-
- pid = nlh->nlmsg_pid;
- flags = nlh->nlmsg_flags;
-
- if(pid <= 0 || !(flags & NLM_F_REQUEST) || flags & NLM_F_MULTI)
- RCV_SKB_FAIL(-EINVAL);
-
- if (flags & MSG_TRUNC)
- RCV_SKB_FAIL(-ECOMM);
-
- type = nlh->nlmsg_type;
- if (type < NLMSG_NOOP || type >= IPQM_MAX)
- RCV_SKB_FAIL(-EINVAL);
-
- if (type <= IPQM_BASE)
- return;
-
- if (security_netlink_recv(skb, CAP_NET_ADMIN))
- RCV_SKB_FAIL(-EPERM);
-
- write_lock_bh(&queue_lock);
-
- if (peer_pid) {
- if (peer_pid != pid) {
- write_unlock_bh(&queue_lock);
- RCV_SKB_FAIL(-EBUSY);
- }
- } else {
- net_enable_timestamp();
- peer_pid = pid;
- }
-
- write_unlock_bh(&queue_lock);
-
- status = ipq_receive_peer(NLMSG_DATA(nlh), type,
- nlmsglen - NLMSG_LENGTH(0));
- if (status < 0)
- RCV_SKB_FAIL(status);
-
- if (flags & NLM_F_ACK)
- netlink_ack(skb, nlh, 0);
- return;
-}
-
-static void
-ipq_rcv_sk(struct sock *sk, int len)
-{
- struct sk_buff *skb;
- unsigned int qlen;
-
- mutex_lock(&ipqnl_mutex);
-
- for (qlen = skb_queue_len(&sk->sk_receive_queue); qlen; qlen--) {
- skb = skb_dequeue(&sk->sk_receive_queue);
- ipq_rcv_skb(skb);
- kfree_skb(skb);
- }
-
- mutex_unlock(&ipqnl_mutex);
-}
-
-static int
-ipq_rcv_dev_event(struct notifier_block *this,
- unsigned long event, void *ptr)
-{
- struct net_device *dev = ptr;
-
- /* Drop any packets associated with the downed device */
- if (event == NETDEV_DOWN)
- ipq_dev_drop(dev->ifindex);
- return NOTIFY_DONE;
-}
-
-static struct notifier_block ipq_dev_notifier = {
- .notifier_call = ipq_rcv_dev_event,
-};
-
-static int
-ipq_rcv_nl_event(struct notifier_block *this,
- unsigned long event, void *ptr)
-{
- struct netlink_notify *n = ptr;
-
- if (event == NETLINK_URELEASE &&
- n->protocol == NETLINK_FIREWALL && n->pid) {
- write_lock_bh(&queue_lock);
- if (n->pid == peer_pid)
- __ipq_reset();
- write_unlock_bh(&queue_lock);
- }
- return NOTIFY_DONE;
-}
-
-static struct notifier_block ipq_nl_notifier = {
- .notifier_call = ipq_rcv_nl_event,
-};
-
-static struct ctl_table_header *ipq_sysctl_header;
-
-static ctl_table ipq_table[] = {
- {
- .ctl_name = NET_IPQ_QMAX,
- .procname = NET_IPQ_QMAX_NAME,
- .data = &queue_maxlen,
- .maxlen = sizeof(queue_maxlen),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- { .ctl_name = 0 }
-};
-
-static ctl_table ipq_dir_table[] = {
- {
- .ctl_name = NET_IPV4,
- .procname = "ipv4",
- .mode = 0555,
- .child = ipq_table
- },
- { .ctl_name = 0 }
-};
-
-static ctl_table ipq_root_table[] = {
- {
- .ctl_name = CTL_NET,
- .procname = "net",
- .mode = 0555,
- .child = ipq_dir_table
- },
- { .ctl_name = 0 }
-};
-
-#ifdef CONFIG_PROC_FS
-static int
-ipq_get_info(char *buffer, char **start, off_t offset, int length)
-{
- int len;
-
- read_lock_bh(&queue_lock);
-
- len = sprintf(buffer,
- "Peer PID : %d\n"
- "Copy mode : %hu\n"
- "Copy range : %u\n"
- "Queue length : %u\n"
- "Queue max. length : %u\n"
- "Queue dropped : %u\n"
- "Netlink dropped : %u\n",
- peer_pid,
- copy_mode,
- copy_range,
- queue_total,
- queue_maxlen,
- queue_dropped,
- queue_user_dropped);
-
- read_unlock_bh(&queue_lock);
-
- *start = buffer + offset;
- len -= offset;
- if (len > length)
- len = length;
- else if (len < 0)
- len = 0;
- return len;
-}
-#endif /* CONFIG_PROC_FS */
-
-static struct nf_queue_handler nfqh = {
- .name = "ip_queue",
- .outfn = &ipq_enqueue_packet,
-};
-
-static int __init ip_queue_init(void)
-{
- int status = -ENOMEM;
- struct proc_dir_entry *proc;
-
- netlink_register_notifier(&ipq_nl_notifier);
- ipqnl = netlink_kernel_create(NETLINK_FIREWALL, 0, ipq_rcv_sk,
- THIS_MODULE);
- if (ipqnl == NULL) {
- printk(KERN_ERR "ip_queue: failed to create netlink socket\n");
- goto cleanup_netlink_notifier;
- }
-
- proc = proc_net_create(IPQ_PROC_FS_NAME, 0, ipq_get_info);
- if (proc)
- proc->owner = THIS_MODULE;
- else {
- printk(KERN_ERR "ip_queue: failed to create proc entry\n");
- goto cleanup_ipqnl;
- }
-
- register_netdevice_notifier(&ipq_dev_notifier);
- ipq_sysctl_header = register_sysctl_table(ipq_root_table, 0);
-
- status = nf_register_queue_handler(PF_INET, &nfqh);
- if (status < 0) {
- printk(KERN_ERR "ip_queue: failed to register queue handler\n");
- goto cleanup_sysctl;
- }
- return status;
-
-cleanup_sysctl:
- unregister_sysctl_table(ipq_sysctl_header);
- unregister_netdevice_notifier(&ipq_dev_notifier);
- proc_net_remove(IPQ_PROC_FS_NAME);
-
-cleanup_ipqnl:
- sock_release(ipqnl->sk_socket);
- mutex_lock(&ipqnl_mutex);
- mutex_unlock(&ipqnl_mutex);
-
-cleanup_netlink_notifier:
- netlink_unregister_notifier(&ipq_nl_notifier);
- return status;
-}
-
-static void __exit ip_queue_fini(void)
-{
- nf_unregister_queue_handlers(&nfqh);
- synchronize_net();
- ipq_flush(NF_DROP);
-
- unregister_sysctl_table(ipq_sysctl_header);
- unregister_netdevice_notifier(&ipq_dev_notifier);
- proc_net_remove(IPQ_PROC_FS_NAME);
-
- sock_release(ipqnl->sk_socket);
- mutex_lock(&ipqnl_mutex);
- mutex_unlock(&ipqnl_mutex);
-
- netlink_unregister_notifier(&ipq_nl_notifier);
-}
-
-MODULE_DESCRIPTION("IPv4 packet queue handler");
-MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
-MODULE_LICENSE("GPL");
-
-module_init(ip_queue_init);
-module_exit(ip_queue_fini);
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 8a455439b128..23c8deff8095 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -1,19 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Packet matching code.
*
* Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
* Copyright (C) 2000-2005 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * 19 Jan 2002 Harald Welte <laforge@gnumonks.org>
- * - increase module usage count as soon as we have rules inside
- * a table
- * 08 Oct 2005 Harald Welte <lafore@netfilter.org>
- * - Generalize into "x_tables" layer and "{ip,ip6,arp}_tables"
+ * Copyright (C) 2006-2010 Patrick McHardy <kaber@trash.net>
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/cache.h>
#include <linux/capability.h>
#include <linux/skbuff.h>
@@ -21,10 +14,9 @@
#include <linux/vmalloc.h>
#include <linux/netdevice.h>
#include <linux/module.h>
-#include <linux/icmp.h>
#include <net/ip.h>
#include <net/compat.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/mutex.h>
#include <linux/proc_fs.h>
#include <linux/err.h>
@@ -32,348 +24,357 @@
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter_ipv4/ip_tables.h>
+#include <net/netfilter/nf_log.h>
+#include "../../netfilter/xt_repldata.h"
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
MODULE_DESCRIPTION("IPv4 packet filter");
-/*#define DEBUG_IP_FIREWALL*/
-/*#define DEBUG_ALLOW_ALL*/ /* Useful for remote debugging */
-/*#define DEBUG_IP_FIREWALL_USER*/
-
-#ifdef DEBUG_IP_FIREWALL
-#define dprintf(format, args...) printk(format , ## args)
-#else
-#define dprintf(format, args...)
-#endif
-
-#ifdef DEBUG_IP_FIREWALL_USER
-#define duprintf(format, args...) printk(format , ## args)
-#else
-#define duprintf(format, args...)
-#endif
-
-#ifdef CONFIG_NETFILTER_DEBUG
-#define IP_NF_ASSERT(x) \
-do { \
- if (!(x)) \
- printk("IP_NF_ASSERT: %s:%s:%u\n", \
- __FUNCTION__, __FILE__, __LINE__); \
-} while(0)
-#else
-#define IP_NF_ASSERT(x)
-#endif
-
-#if 0
-/* All the better to debug you with... */
-#define static
-#define inline
-#endif
-
-/*
- We keep a set of rules for each CPU, so we can avoid write-locking
- them in the softirq when updating the counters and therefore
- only need to read-lock in the softirq; doing a write_lock_bh() in user
- context stops packets coming through and allows user context to read
- the counters or update the rules.
-
- Hence the start of any table is given by get_table() below. */
+void *ipt_alloc_initial_table(const struct xt_table *info)
+{
+ return xt_alloc_initial_table(ipt, IPT);
+}
+EXPORT_SYMBOL_GPL(ipt_alloc_initial_table);
/* Returns whether matches rule or not. */
-static inline int
+/* Performance critical - called for every packet */
+static inline bool
ip_packet_match(const struct iphdr *ip,
const char *indev,
const char *outdev,
const struct ipt_ip *ipinfo,
int isfrag)
{
- size_t i;
unsigned long ret;
-#define FWINV(bool,invflg) ((bool) ^ !!(ipinfo->invflags & invflg))
-
- if (FWINV((ip->saddr&ipinfo->smsk.s_addr) != ipinfo->src.s_addr,
- IPT_INV_SRCIP)
- || FWINV((ip->daddr&ipinfo->dmsk.s_addr) != ipinfo->dst.s_addr,
- IPT_INV_DSTIP)) {
- dprintf("Source or dest mismatch.\n");
-
- dprintf("SRC: %u.%u.%u.%u. Mask: %u.%u.%u.%u. Target: %u.%u.%u.%u.%s\n",
- NIPQUAD(ip->saddr),
- NIPQUAD(ipinfo->smsk.s_addr),
- NIPQUAD(ipinfo->src.s_addr),
- ipinfo->invflags & IPT_INV_SRCIP ? " (INV)" : "");
- dprintf("DST: %u.%u.%u.%u Mask: %u.%u.%u.%u Target: %u.%u.%u.%u.%s\n",
- NIPQUAD(ip->daddr),
- NIPQUAD(ipinfo->dmsk.s_addr),
- NIPQUAD(ipinfo->dst.s_addr),
- ipinfo->invflags & IPT_INV_DSTIP ? " (INV)" : "");
- return 0;
- }
+ if (NF_INVF(ipinfo, IPT_INV_SRCIP,
+ (ip->saddr & ipinfo->smsk.s_addr) != ipinfo->src.s_addr) ||
+ NF_INVF(ipinfo, IPT_INV_DSTIP,
+ (ip->daddr & ipinfo->dmsk.s_addr) != ipinfo->dst.s_addr))
+ return false;
- /* Look for ifname matches; this should unroll nicely. */
- for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned long); i++) {
- ret |= (((const unsigned long *)indev)[i]
- ^ ((const unsigned long *)ipinfo->iniface)[i])
- & ((const unsigned long *)ipinfo->iniface_mask)[i];
- }
+ ret = ifname_compare_aligned(indev, ipinfo->iniface, ipinfo->iniface_mask);
- if (FWINV(ret != 0, IPT_INV_VIA_IN)) {
- dprintf("VIA in mismatch (%s vs %s).%s\n",
- indev, ipinfo->iniface,
- ipinfo->invflags&IPT_INV_VIA_IN ?" (INV)":"");
- return 0;
- }
+ if (NF_INVF(ipinfo, IPT_INV_VIA_IN, ret != 0))
+ return false;
- for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned long); i++) {
- ret |= (((const unsigned long *)outdev)[i]
- ^ ((const unsigned long *)ipinfo->outiface)[i])
- & ((const unsigned long *)ipinfo->outiface_mask)[i];
- }
+ ret = ifname_compare_aligned(outdev, ipinfo->outiface, ipinfo->outiface_mask);
- if (FWINV(ret != 0, IPT_INV_VIA_OUT)) {
- dprintf("VIA out mismatch (%s vs %s).%s\n",
- outdev, ipinfo->outiface,
- ipinfo->invflags&IPT_INV_VIA_OUT ?" (INV)":"");
- return 0;
- }
+ if (NF_INVF(ipinfo, IPT_INV_VIA_OUT, ret != 0))
+ return false;
/* Check specific protocol */
- if (ipinfo->proto
- && FWINV(ip->protocol != ipinfo->proto, IPT_INV_PROTO)) {
- dprintf("Packet protocol %hi does not match %hi.%s\n",
- ip->protocol, ipinfo->proto,
- ipinfo->invflags&IPT_INV_PROTO ? " (INV)":"");
- return 0;
- }
+ if (ipinfo->proto &&
+ NF_INVF(ipinfo, IPT_INV_PROTO, ip->protocol != ipinfo->proto))
+ return false;
/* If we have a fragment rule but the packet is not a fragment
* then we return zero */
- if (FWINV((ipinfo->flags&IPT_F_FRAG) && !isfrag, IPT_INV_FRAG)) {
- dprintf("Fragment rule but not fragment.%s\n",
- ipinfo->invflags & IPT_INV_FRAG ? " (INV)" : "");
- return 0;
- }
+ if (NF_INVF(ipinfo, IPT_INV_FRAG,
+ (ipinfo->flags & IPT_F_FRAG) && !isfrag))
+ return false;
- return 1;
+ return true;
}
-static inline int
+static bool
ip_checkentry(const struct ipt_ip *ip)
{
- if (ip->flags & ~IPT_F_MASK) {
- duprintf("Unknown flag bits set: %08X\n",
- ip->flags & ~IPT_F_MASK);
- return 0;
- }
- if (ip->invflags & ~IPT_INV_MASK) {
- duprintf("Unknown invflag bits set: %08X\n",
- ip->invflags & ~IPT_INV_MASK);
- return 0;
- }
- return 1;
+ if (ip->flags & ~IPT_F_MASK)
+ return false;
+ if (ip->invflags & ~IPT_INV_MASK)
+ return false;
+ return true;
}
static unsigned int
-ipt_error(struct sk_buff **pskb,
- const struct net_device *in,
- const struct net_device *out,
- unsigned int hooknum,
- const struct xt_target *target,
- const void *targinfo)
+ipt_error(struct sk_buff *skb, const struct xt_action_param *par)
{
- if (net_ratelimit())
- printk("ip_tables: error: `%s'\n", (char *)targinfo);
+ net_info_ratelimited("error: `%s'\n", (const char *)par->targinfo);
return NF_DROP;
}
-static inline
-int do_match(struct ipt_entry_match *m,
- const struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- int offset,
- int *hotdrop)
+/* Performance critical */
+static inline struct ipt_entry *
+get_entry(const void *base, unsigned int offset)
+{
+ return (struct ipt_entry *)(base + offset);
+}
+
+/* All zeroes == unconditional rule. */
+/* Mildly perf critical (only if packet tracing is on) */
+static inline bool unconditional(const struct ipt_entry *e)
+{
+ static const struct ipt_ip uncond;
+
+ return e->target_offset == sizeof(struct ipt_entry) &&
+ memcmp(&e->ip, &uncond, sizeof(uncond)) == 0;
+}
+
+/* for const-correctness */
+static inline const struct xt_entry_target *
+ipt_get_target_c(const struct ipt_entry *e)
+{
+ return ipt_get_target((struct ipt_entry *)e);
+}
+
+#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
+static const char *const hooknames[] = {
+ [NF_INET_PRE_ROUTING] = "PREROUTING",
+ [NF_INET_LOCAL_IN] = "INPUT",
+ [NF_INET_FORWARD] = "FORWARD",
+ [NF_INET_LOCAL_OUT] = "OUTPUT",
+ [NF_INET_POST_ROUTING] = "POSTROUTING",
+};
+
+enum nf_ip_trace_comments {
+ NF_IP_TRACE_COMMENT_RULE,
+ NF_IP_TRACE_COMMENT_RETURN,
+ NF_IP_TRACE_COMMENT_POLICY,
+};
+
+static const char *const comments[] = {
+ [NF_IP_TRACE_COMMENT_RULE] = "rule",
+ [NF_IP_TRACE_COMMENT_RETURN] = "return",
+ [NF_IP_TRACE_COMMENT_POLICY] = "policy",
+};
+
+static const struct nf_loginfo trace_loginfo = {
+ .type = NF_LOG_TYPE_LOG,
+ .u = {
+ .log = {
+ .level = 4,
+ .logflags = NF_LOG_DEFAULT_MASK,
+ },
+ },
+};
+
+/* Mildly perf critical (only if packet tracing is on) */
+static inline int
+get_chainname_rulenum(const struct ipt_entry *s, const struct ipt_entry *e,
+ const char *hookname, const char **chainname,
+ const char **comment, unsigned int *rulenum)
{
- /* Stop iteration if it doesn't match */
- if (!m->u.kernel.match->match(skb, in, out, m->u.kernel.match, m->data,
- offset, skb->nh.iph->ihl*4, hotdrop))
+ const struct xt_standard_target *t = (void *)ipt_get_target_c(s);
+
+ if (strcmp(t->target.u.kernel.target->name, XT_ERROR_TARGET) == 0) {
+ /* Head of user chain: ERROR target with chainname */
+ *chainname = t->target.data;
+ (*rulenum) = 0;
+ } else if (s == e) {
+ (*rulenum)++;
+
+ if (unconditional(s) &&
+ strcmp(t->target.u.kernel.target->name,
+ XT_STANDARD_TARGET) == 0 &&
+ t->verdict < 0) {
+ /* Tail of chains: STANDARD target (return/policy) */
+ *comment = *chainname == hookname
+ ? comments[NF_IP_TRACE_COMMENT_POLICY]
+ : comments[NF_IP_TRACE_COMMENT_RETURN];
+ }
return 1;
- else
- return 0;
+ } else
+ (*rulenum)++;
+
+ return 0;
}
-static inline struct ipt_entry *
-get_entry(void *base, unsigned int offset)
+static void trace_packet(struct net *net,
+ const struct sk_buff *skb,
+ unsigned int hook,
+ const struct net_device *in,
+ const struct net_device *out,
+ const char *tablename,
+ const struct xt_table_info *private,
+ const struct ipt_entry *e)
{
- return (struct ipt_entry *)(base + offset);
+ const struct ipt_entry *root;
+ const char *hookname, *chainname, *comment;
+ const struct ipt_entry *iter;
+ unsigned int rulenum = 0;
+
+ root = get_entry(private->entries, private->hook_entry[hook]);
+
+ hookname = chainname = hooknames[hook];
+ comment = comments[NF_IP_TRACE_COMMENT_RULE];
+
+ xt_entry_foreach(iter, root, private->size - private->hook_entry[hook])
+ if (get_chainname_rulenum(iter, e, hookname,
+ &chainname, &comment, &rulenum) != 0)
+ break;
+
+ nf_log_trace(net, AF_INET, hook, skb, in, out, &trace_loginfo,
+ "TRACE: %s:%s:%s:%u ",
+ tablename, chainname, comment, rulenum);
+}
+#endif
+
+static inline
+struct ipt_entry *ipt_next_entry(const struct ipt_entry *entry)
+{
+ return (void *)entry + entry->next_offset;
}
/* Returns one of the generic firewall policies, like NF_ACCEPT. */
unsigned int
-ipt_do_table(struct sk_buff **pskb,
- unsigned int hook,
- const struct net_device *in,
- const struct net_device *out,
- struct ipt_table *table)
+ipt_do_table(void *priv,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
{
+ const struct xt_table *table = priv;
+ unsigned int hook = state->hook;
static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long))));
- u_int16_t offset;
- struct iphdr *ip;
- u_int16_t datalen;
- int hotdrop = 0;
+ const struct iphdr *ip;
/* Initializing verdict to NF_DROP keeps gcc happy. */
unsigned int verdict = NF_DROP;
const char *indev, *outdev;
- void *table_base;
- struct ipt_entry *e, *back;
- struct xt_table_info *private;
+ const void *table_base;
+ struct ipt_entry *e, **jumpstack;
+ unsigned int stackidx, cpu;
+ const struct xt_table_info *private;
+ struct xt_action_param acpar;
+ unsigned int addend;
/* Initialization */
- ip = (*pskb)->nh.iph;
- datalen = (*pskb)->len - ip->ihl * 4;
- indev = in ? in->name : nulldevname;
- outdev = out ? out->name : nulldevname;
+ stackidx = 0;
+ ip = ip_hdr(skb);
+ indev = state->in ? state->in->name : nulldevname;
+ outdev = state->out ? state->out->name : nulldevname;
/* We handle fragments by dealing with the first fragment as
* if it was a normal packet. All other fragments are treated
* normally, except that they will NEVER match rules that ask
* things we don't know, ie. tcp syn flag or ports). If the
* rule is also a fragment-specific rule, non-fragments won't
* match it. */
- offset = ntohs(ip->frag_off) & IP_OFFSET;
+ acpar.fragoff = ntohs(ip->frag_off) & IP_OFFSET;
+ acpar.thoff = ip_hdrlen(skb);
+ acpar.hotdrop = false;
+ acpar.state = state;
+
+ WARN_ON(!(table->valid_hooks & (1 << hook)));
+ local_bh_disable();
+ addend = xt_write_recseq_begin();
+ private = READ_ONCE(table->private); /* Address dependency. */
+ cpu = smp_processor_id();
+ table_base = private->entries;
+ jumpstack = (struct ipt_entry **)private->jumpstack[cpu];
+
+ /* Switch to alternate jumpstack if we're being invoked via TEE.
+ * TEE issues XT_CONTINUE verdict on original skb so we must not
+ * clobber the jumpstack.
+ *
+ * For recursion via REJECT or SYNPROXY the stack will be clobbered
+ * but it is no problem since absolute verdict is issued by these.
+ */
+ if (static_key_false(&xt_tee_enabled))
+ jumpstack += private->stacksize * current->in_nf_duplicate;
- read_lock_bh(&table->lock);
- IP_NF_ASSERT(table->valid_hooks & (1 << hook));
- private = table->private;
- table_base = (void *)private->entries[smp_processor_id()];
e = get_entry(table_base, private->hook_entry[hook]);
- /* For return from builtin chain */
- back = get_entry(table_base, private->underflow[hook]);
-
do {
- IP_NF_ASSERT(e);
- IP_NF_ASSERT(back);
- if (ip_packet_match(ip, indev, outdev, &e->ip, offset)) {
- struct ipt_entry_target *t;
-
- if (IPT_MATCH_ITERATE(e, do_match,
- *pskb, in, out,
- offset, &hotdrop) != 0)
+ const struct xt_entry_target *t;
+ const struct xt_entry_match *ematch;
+ struct xt_counters *counter;
+
+ WARN_ON(!e);
+ if (!ip_packet_match(ip, indev, outdev,
+ &e->ip, acpar.fragoff)) {
+ no_match:
+ e = ipt_next_entry(e);
+ continue;
+ }
+
+ xt_ematch_foreach(ematch, e) {
+ acpar.match = ematch->u.kernel.match;
+ acpar.matchinfo = ematch->data;
+ if (!acpar.match->match(skb, &acpar))
goto no_match;
+ }
- ADD_COUNTER(e->counters, ntohs(ip->tot_len), 1);
-
- t = ipt_get_target(e);
- IP_NF_ASSERT(t->u.kernel.target);
- /* Standard target? */
- if (!t->u.kernel.target->target) {
- int v;
-
- v = ((struct ipt_standard_target *)t)->verdict;
- if (v < 0) {
- /* Pop from stack? */
- if (v != IPT_RETURN) {
- verdict = (unsigned)(-v) - 1;
- break;
- }
- e = back;
- back = get_entry(table_base,
- back->comefrom);
- continue;
- }
- if (table_base + v != (void *)e + e->next_offset
- && !(e->ip.flags & IPT_F_GOTO)) {
- /* Save old back ptr in next entry */
- struct ipt_entry *next
- = (void *)e + e->next_offset;
- next->comefrom
- = (void *)back - table_base;
- /* set back pointer to next entry */
- back = next;
- }
+ counter = xt_get_this_cpu_counter(&e->counters);
+ ADD_COUNTER(*counter, skb->len, 1);
- e = get_entry(table_base, v);
- } else {
- /* Targets which reenter must return
- abs. verdicts */
-#ifdef CONFIG_NETFILTER_DEBUG
- ((struct ipt_entry *)table_base)->comefrom
- = 0xeeeeeeec;
+ t = ipt_get_target_c(e);
+ WARN_ON(!t->u.kernel.target);
+
+#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
+ /* The packet is traced: log it */
+ if (unlikely(skb->nf_trace))
+ trace_packet(state->net, skb, hook, state->in,
+ state->out, table->name, private, e);
#endif
- verdict = t->u.kernel.target->target(pskb,
- in, out,
- hook,
- t->u.kernel.target,
- t->data);
-
-#ifdef CONFIG_NETFILTER_DEBUG
- if (((struct ipt_entry *)table_base)->comefrom
- != 0xeeeeeeec
- && verdict == IPT_CONTINUE) {
- printk("Target %s reentered!\n",
- t->u.kernel.target->name);
- verdict = NF_DROP;
+ /* Standard target? */
+ if (!t->u.kernel.target->target) {
+ int v;
+
+ v = ((struct xt_standard_target *)t)->verdict;
+ if (v < 0) {
+ /* Pop from stack? */
+ if (v != XT_RETURN) {
+ verdict = (unsigned int)(-v) - 1;
+ break;
}
- ((struct ipt_entry *)table_base)->comefrom
- = 0x57acc001;
-#endif
- /* Target might have changed stuff. */
- ip = (*pskb)->nh.iph;
- datalen = (*pskb)->len - ip->ihl * 4;
-
- if (verdict == IPT_CONTINUE)
- e = (void *)e + e->next_offset;
- else
- /* Verdict */
+ if (stackidx == 0) {
+ e = get_entry(table_base,
+ private->underflow[hook]);
+ } else {
+ e = jumpstack[--stackidx];
+ e = ipt_next_entry(e);
+ }
+ continue;
+ }
+ if (table_base + v != ipt_next_entry(e) &&
+ !(e->ip.flags & IPT_F_GOTO)) {
+ if (unlikely(stackidx >= private->stacksize)) {
+ verdict = NF_DROP;
break;
+ }
+ jumpstack[stackidx++] = e;
}
- } else {
- no_match:
- e = (void *)e + e->next_offset;
+ e = get_entry(table_base, v);
+ continue;
}
- } while (!hotdrop);
-
- read_unlock_bh(&table->lock);
-#ifdef DEBUG_ALLOW_ALL
- return NF_ACCEPT;
-#else
- if (hotdrop)
- return NF_DROP;
- else return verdict;
-#endif
-}
+ acpar.target = t->u.kernel.target;
+ acpar.targinfo = t->data;
-/* All zeroes == unconditional rule. */
-static inline int
-unconditional(const struct ipt_ip *ip)
-{
- unsigned int i;
+ verdict = t->u.kernel.target->target(skb, &acpar);
+ if (verdict == XT_CONTINUE) {
+ /* Target might have changed stuff. */
+ ip = ip_hdr(skb);
+ e = ipt_next_entry(e);
+ } else {
+ /* Verdict */
+ break;
+ }
+ } while (!acpar.hotdrop);
- for (i = 0; i < sizeof(*ip)/sizeof(__u32); i++)
- if (((__u32 *)ip)[i])
- return 0;
+ xt_write_recseq_end(addend);
+ local_bh_enable();
- return 1;
+ if (acpar.hotdrop)
+ return NF_DROP;
+ else return verdict;
}
/* Figures out from what hook each rule can be called: returns 0 if
there are loops. Puts hook bitmask in comefrom. */
static int
-mark_source_chains(struct xt_table_info *newinfo,
- unsigned int valid_hooks, void *entry0)
+mark_source_chains(const struct xt_table_info *newinfo,
+ unsigned int valid_hooks, void *entry0,
+ unsigned int *offsets)
{
unsigned int hook;
/* No recursion; use packet counter to save back ptrs (reset
to 0 as we leave), and comefrom to save source hook bitmask */
- for (hook = 0; hook < NF_IP_NUMHOOKS; hook++) {
+ for (hook = 0; hook < NF_INET_NUMHOOKS; hook++) {
unsigned int pos = newinfo->hook_entry[hook];
- struct ipt_entry *e
- = (struct ipt_entry *)(entry0 + pos);
+ struct ipt_entry *e = entry0 + pos;
if (!(valid_hooks & (1 << hook)))
continue;
@@ -382,38 +383,26 @@ mark_source_chains(struct xt_table_info *newinfo,
e->counters.pcnt = pos;
for (;;) {
- struct ipt_standard_target *t
- = (void *)ipt_get_target(e);
+ const struct xt_standard_target *t
+ = (void *)ipt_get_target_c(e);
+ int visited = e->comefrom & (1 << hook);
- if (e->comefrom & (1 << NF_IP_NUMHOOKS)) {
- printk("iptables: loop hook %u pos %u %08X.\n",
- hook, pos, e->comefrom);
+ if (e->comefrom & (1 << NF_INET_NUMHOOKS))
return 0;
- }
- e->comefrom
- |= ((1 << hook) | (1 << NF_IP_NUMHOOKS));
+
+ e->comefrom |= ((1 << hook) | (1 << NF_INET_NUMHOOKS));
/* Unconditional return/END. */
- if (e->target_offset == sizeof(struct ipt_entry)
- && (strcmp(t->target.u.user.name,
- IPT_STANDARD_TARGET) == 0)
- && t->verdict < 0
- && unconditional(&e->ip)) {
+ if ((unconditional(e) &&
+ (strcmp(t->target.u.user.name,
+ XT_STANDARD_TARGET) == 0) &&
+ t->verdict < 0) || visited) {
unsigned int oldpos, size;
/* Return: backtrack through the last
big jump. */
do {
- e->comefrom ^= (1<<NF_IP_NUMHOOKS);
-#ifdef DEBUG_IP_FIREWALL_USER
- if (e->comefrom
- & (1 << NF_IP_NUMHOOKS)) {
- duprintf("Back unset "
- "on hook %u "
- "rule %u\n",
- hook, pos);
- }
-#endif
+ e->comefrom ^= (1<<NF_INET_NUMHOOKS);
oldpos = pos;
pos = e->counters.pcnt;
e->counters.pcnt = 0;
@@ -422,441 +411,431 @@ mark_source_chains(struct xt_table_info *newinfo,
if (pos == oldpos)
goto next;
- e = (struct ipt_entry *)
- (entry0 + pos);
+ e = entry0 + pos;
} while (oldpos == pos + e->next_offset);
/* Move along one */
size = e->next_offset;
- e = (struct ipt_entry *)
- (entry0 + pos + size);
+ e = entry0 + pos + size;
+ if (pos + size >= newinfo->size)
+ return 0;
e->counters.pcnt = pos;
pos += size;
} else {
int newpos = t->verdict;
if (strcmp(t->target.u.user.name,
- IPT_STANDARD_TARGET) == 0
- && newpos >= 0) {
+ XT_STANDARD_TARGET) == 0 &&
+ newpos >= 0) {
/* This a jump; chase it. */
- duprintf("Jump rule %u -> %u\n",
- pos, newpos);
+ if (!xt_find_jump_offset(offsets, newpos,
+ newinfo->number))
+ return 0;
} else {
/* ... this is a fallthru */
newpos = pos + e->next_offset;
+ if (newpos >= newinfo->size)
+ return 0;
}
- e = (struct ipt_entry *)
- (entry0 + newpos);
+ e = entry0 + newpos;
e->counters.pcnt = pos;
pos = newpos;
}
}
- next:
- duprintf("Finished chain %u\n", hook);
+next: ;
}
return 1;
}
-static inline int
-cleanup_match(struct ipt_entry_match *m, unsigned int *i)
+static void cleanup_match(struct xt_entry_match *m, struct net *net)
{
- if (i && (*i)-- == 0)
- return 1;
-
- if (m->u.kernel.match->destroy)
- m->u.kernel.match->destroy(m->u.kernel.match, m->data);
- module_put(m->u.kernel.match->me);
- return 0;
+ struct xt_mtdtor_param par;
+
+ par.net = net;
+ par.match = m->u.kernel.match;
+ par.matchinfo = m->data;
+ par.family = NFPROTO_IPV4;
+ if (par.match->destroy != NULL)
+ par.match->destroy(&par);
+ module_put(par.match->me);
}
-static inline int
-standard_check(const struct ipt_entry_target *t,
- unsigned int max_offset)
+static int
+check_match(struct xt_entry_match *m, struct xt_mtchk_param *par)
{
- struct ipt_standard_target *targ = (void *)t;
+ const struct ipt_ip *ip = par->entryinfo;
- /* Check standard info. */
- if (targ->verdict >= 0
- && targ->verdict > max_offset - sizeof(struct ipt_entry)) {
- duprintf("ipt_standard_check: bad verdict (%i)\n",
- targ->verdict);
- return 0;
- }
- if (targ->verdict < -NF_MAX_VERDICT - 1) {
- duprintf("ipt_standard_check: bad negative verdict (%i)\n",
- targ->verdict);
- return 0;
- }
- return 1;
+ par->match = m->u.kernel.match;
+ par->matchinfo = m->data;
+
+ return xt_check_match(par, m->u.match_size - sizeof(*m),
+ ip->proto, ip->invflags & IPT_INV_PROTO);
}
-static inline int
-check_match(struct ipt_entry_match *m,
- const char *name,
- const struct ipt_ip *ip,
- unsigned int hookmask,
- unsigned int *i)
+static int
+find_check_match(struct xt_entry_match *m, struct xt_mtchk_param *par)
{
- struct ipt_match *match;
+ struct xt_match *match;
int ret;
- match = try_then_request_module(xt_find_match(AF_INET, m->u.user.name,
- m->u.user.revision),
- "ipt_%s", m->u.user.name);
- if (IS_ERR(match) || !match) {
- duprintf("check_match: `%s' not found\n", m->u.user.name);
- return match ? PTR_ERR(match) : -ENOENT;
- }
+ match = xt_request_find_match(NFPROTO_IPV4, m->u.user.name,
+ m->u.user.revision);
+ if (IS_ERR(match))
+ return PTR_ERR(match);
m->u.kernel.match = match;
- ret = xt_check_match(match, AF_INET, m->u.match_size - sizeof(*m),
- name, hookmask, ip->proto,
- ip->invflags & IPT_INV_PROTO);
+ ret = check_match(m, par);
if (ret)
goto err;
- if (m->u.kernel.match->checkentry
- && !m->u.kernel.match->checkentry(name, ip, match, m->data,
- hookmask)) {
- duprintf("ip_tables: check failed for `%s'.\n",
- m->u.kernel.match->name);
- ret = -EINVAL;
- goto err;
- }
-
- (*i)++;
return 0;
err:
module_put(m->u.kernel.match->me);
return ret;
}
-static struct ipt_target ipt_standard_target;
+static int check_target(struct ipt_entry *e, struct net *net, const char *name)
+{
+ struct xt_entry_target *t = ipt_get_target(e);
+ struct xt_tgchk_param par = {
+ .net = net,
+ .table = name,
+ .entryinfo = e,
+ .target = t->u.kernel.target,
+ .targinfo = t->data,
+ .hook_mask = e->comefrom,
+ .family = NFPROTO_IPV4,
+ };
+
+ return xt_check_target(&par, t->u.target_size - sizeof(*t),
+ e->ip.proto, e->ip.invflags & IPT_INV_PROTO);
+}
-static inline int
-check_entry(struct ipt_entry *e, const char *name, unsigned int size,
- unsigned int *i)
+static int
+find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
+ unsigned int size,
+ struct xt_percpu_counter_alloc_state *alloc_state)
{
- struct ipt_entry_target *t;
- struct ipt_target *target;
+ struct xt_entry_target *t;
+ struct xt_target *target;
int ret;
unsigned int j;
+ struct xt_mtchk_param mtpar;
+ struct xt_entry_match *ematch;
- if (!ip_checkentry(&e->ip)) {
- duprintf("ip_tables: ip check failed %p %s.\n", e, name);
- return -EINVAL;
- }
-
- if (e->target_offset + sizeof(struct ipt_entry_target) > e->next_offset)
- return -EINVAL;
+ if (!xt_percpu_counter_alloc(alloc_state, &e->counters))
+ return -ENOMEM;
j = 0;
- ret = IPT_MATCH_ITERATE(e, check_match, name, &e->ip, e->comefrom, &j);
- if (ret != 0)
- goto cleanup_matches;
+ memset(&mtpar, 0, sizeof(mtpar));
+ mtpar.net = net;
+ mtpar.table = name;
+ mtpar.entryinfo = &e->ip;
+ mtpar.hook_mask = e->comefrom;
+ mtpar.family = NFPROTO_IPV4;
+ xt_ematch_foreach(ematch, e) {
+ ret = find_check_match(ematch, &mtpar);
+ if (ret != 0)
+ goto cleanup_matches;
+ ++j;
+ }
t = ipt_get_target(e);
- ret = -EINVAL;
- if (e->target_offset + t->u.target_size > e->next_offset)
- goto cleanup_matches;
- target = try_then_request_module(xt_find_target(AF_INET,
- t->u.user.name,
- t->u.user.revision),
- "ipt_%s", t->u.user.name);
- if (IS_ERR(target) || !target) {
- duprintf("check_entry: `%s' not found\n", t->u.user.name);
- ret = target ? PTR_ERR(target) : -ENOENT;
+ target = xt_request_find_target(NFPROTO_IPV4, t->u.user.name,
+ t->u.user.revision);
+ if (IS_ERR(target)) {
+ ret = PTR_ERR(target);
goto cleanup_matches;
}
t->u.kernel.target = target;
- ret = xt_check_target(target, AF_INET, t->u.target_size - sizeof(*t),
- name, e->comefrom, e->ip.proto,
- e->ip.invflags & IPT_INV_PROTO);
+ ret = check_target(e, net, name);
if (ret)
goto err;
- if (t->u.kernel.target == &ipt_standard_target) {
- if (!standard_check(t, size)) {
- ret = -EINVAL;
- goto err;
- }
- } else if (t->u.kernel.target->checkentry
- && !t->u.kernel.target->checkentry(name, e, target, t->data,
- e->comefrom)) {
- duprintf("ip_tables: check failed for `%s'.\n",
- t->u.kernel.target->name);
- ret = -EINVAL;
- goto err;
- }
-
- (*i)++;
return 0;
err:
module_put(t->u.kernel.target->me);
cleanup_matches:
- IPT_MATCH_ITERATE(e, cleanup_match, &j);
+ xt_ematch_foreach(ematch, e) {
+ if (j-- == 0)
+ break;
+ cleanup_match(ematch, net);
+ }
+
+ xt_percpu_counter_free(&e->counters);
+
return ret;
}
-static inline int
+static bool check_underflow(const struct ipt_entry *e)
+{
+ const struct xt_entry_target *t;
+ unsigned int verdict;
+
+ if (!unconditional(e))
+ return false;
+ t = ipt_get_target_c(e);
+ if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0)
+ return false;
+ verdict = ((struct xt_standard_target *)t)->verdict;
+ verdict = -verdict - 1;
+ return verdict == NF_DROP || verdict == NF_ACCEPT;
+}
+
+static int
check_entry_size_and_hooks(struct ipt_entry *e,
struct xt_table_info *newinfo,
- unsigned char *base,
- unsigned char *limit,
+ const unsigned char *base,
+ const unsigned char *limit,
const unsigned int *hook_entries,
const unsigned int *underflows,
- unsigned int *i)
+ unsigned int valid_hooks)
{
unsigned int h;
+ int err;
- if ((unsigned long)e % __alignof__(struct ipt_entry) != 0
- || (unsigned char *)e + sizeof(struct ipt_entry) >= limit) {
- duprintf("Bad offset %p\n", e);
+ if ((unsigned long)e % __alignof__(struct ipt_entry) != 0 ||
+ (unsigned char *)e + sizeof(struct ipt_entry) >= limit ||
+ (unsigned char *)e + e->next_offset > limit)
return -EINVAL;
- }
if (e->next_offset
- < sizeof(struct ipt_entry) + sizeof(struct ipt_entry_target)) {
- duprintf("checking: element %p size %u\n",
- e, e->next_offset);
+ < sizeof(struct ipt_entry) + sizeof(struct xt_entry_target))
return -EINVAL;
- }
+
+ if (!ip_checkentry(&e->ip))
+ return -EINVAL;
+
+ err = xt_check_entry_offsets(e, e->elems, e->target_offset,
+ e->next_offset);
+ if (err)
+ return err;
/* Check hooks & underflows */
- for (h = 0; h < NF_IP_NUMHOOKS; h++) {
+ for (h = 0; h < NF_INET_NUMHOOKS; h++) {
+ if (!(valid_hooks & (1 << h)))
+ continue;
if ((unsigned char *)e - base == hook_entries[h])
newinfo->hook_entry[h] = hook_entries[h];
- if ((unsigned char *)e - base == underflows[h])
+ if ((unsigned char *)e - base == underflows[h]) {
+ if (!check_underflow(e))
+ return -EINVAL;
+
newinfo->underflow[h] = underflows[h];
+ }
}
- /* FIXME: underflows must be unconditional, standard verdicts
- < 0 (not IPT_RETURN). --RR */
-
/* Clear counters and comefrom */
e->counters = ((struct xt_counters) { 0, 0 });
e->comefrom = 0;
-
- (*i)++;
return 0;
}
-static inline int
-cleanup_entry(struct ipt_entry *e, unsigned int *i)
+static void
+cleanup_entry(struct ipt_entry *e, struct net *net)
{
- struct ipt_entry_target *t;
-
- if (i && (*i)-- == 0)
- return 1;
+ struct xt_tgdtor_param par;
+ struct xt_entry_target *t;
+ struct xt_entry_match *ematch;
/* Cleanup all matches */
- IPT_MATCH_ITERATE(e, cleanup_match, NULL);
+ xt_ematch_foreach(ematch, e)
+ cleanup_match(ematch, net);
t = ipt_get_target(e);
- if (t->u.kernel.target->destroy)
- t->u.kernel.target->destroy(t->u.kernel.target, t->data);
- module_put(t->u.kernel.target->me);
- return 0;
+
+ par.net = net;
+ par.target = t->u.kernel.target;
+ par.targinfo = t->data;
+ par.family = NFPROTO_IPV4;
+ if (par.target->destroy != NULL)
+ par.target->destroy(&par);
+ module_put(par.target->me);
+ xt_percpu_counter_free(&e->counters);
}
/* Checks and translates the user-supplied table segment (held in
newinfo) */
static int
-translate_table(const char *name,
- unsigned int valid_hooks,
- struct xt_table_info *newinfo,
- void *entry0,
- unsigned int size,
- unsigned int number,
- const unsigned int *hook_entries,
- const unsigned int *underflows)
+translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
+ const struct ipt_replace *repl)
{
+ struct xt_percpu_counter_alloc_state alloc_state = { 0 };
+ struct ipt_entry *iter;
+ unsigned int *offsets;
unsigned int i;
- int ret;
+ int ret = 0;
- newinfo->size = size;
- newinfo->number = number;
+ newinfo->size = repl->size;
+ newinfo->number = repl->num_entries;
/* Init all hooks to impossible value. */
- for (i = 0; i < NF_IP_NUMHOOKS; i++) {
+ for (i = 0; i < NF_INET_NUMHOOKS; i++) {
newinfo->hook_entry[i] = 0xFFFFFFFF;
newinfo->underflow[i] = 0xFFFFFFFF;
}
- duprintf("translate_table: size %u\n", newinfo->size);
+ offsets = xt_alloc_entry_offsets(newinfo->number);
+ if (!offsets)
+ return -ENOMEM;
i = 0;
/* Walk through entries, checking offsets. */
- ret = IPT_ENTRY_ITERATE(entry0, newinfo->size,
- check_entry_size_and_hooks,
- newinfo,
- entry0,
- entry0 + size,
- hook_entries, underflows, &i);
- if (ret != 0)
- return ret;
-
- if (i != number) {
- duprintf("translate_table: %u not %u entries\n",
- i, number);
- return -EINVAL;
+ xt_entry_foreach(iter, entry0, newinfo->size) {
+ ret = check_entry_size_and_hooks(iter, newinfo, entry0,
+ entry0 + repl->size,
+ repl->hook_entry,
+ repl->underflow,
+ repl->valid_hooks);
+ if (ret != 0)
+ goto out_free;
+ if (i < repl->num_entries)
+ offsets[i] = (void *)iter - entry0;
+ ++i;
+ if (strcmp(ipt_get_target(iter)->u.user.name,
+ XT_ERROR_TARGET) == 0)
+ ++newinfo->stacksize;
}
- /* Check hooks all assigned */
- for (i = 0; i < NF_IP_NUMHOOKS; i++) {
- /* Only hooks which are valid */
- if (!(valid_hooks & (1 << i)))
- continue;
- if (newinfo->hook_entry[i] == 0xFFFFFFFF) {
- duprintf("Invalid hook entry %u %u\n",
- i, hook_entries[i]);
- return -EINVAL;
- }
- if (newinfo->underflow[i] == 0xFFFFFFFF) {
- duprintf("Invalid underflow %u %u\n",
- i, underflows[i]);
- return -EINVAL;
- }
+ ret = -EINVAL;
+ if (i != repl->num_entries)
+ goto out_free;
+
+ ret = xt_check_table_hooks(newinfo, repl->valid_hooks);
+ if (ret)
+ goto out_free;
+
+ if (!mark_source_chains(newinfo, repl->valid_hooks, entry0, offsets)) {
+ ret = -ELOOP;
+ goto out_free;
}
+ kvfree(offsets);
/* Finally, each sanity check must pass */
i = 0;
- ret = IPT_ENTRY_ITERATE(entry0, newinfo->size,
- check_entry, name, size, &i);
-
- if (ret != 0)
- goto cleanup;
-
- ret = -ELOOP;
- if (!mark_source_chains(newinfo, valid_hooks, entry0))
- goto cleanup;
+ xt_entry_foreach(iter, entry0, newinfo->size) {
+ ret = find_check_entry(iter, net, repl->name, repl->size,
+ &alloc_state);
+ if (ret != 0)
+ break;
+ ++i;
+ }
- /* And one copy for every other CPU */
- for_each_possible_cpu(i) {
- if (newinfo->entries[i] && newinfo->entries[i] != entry0)
- memcpy(newinfo->entries[i], entry0, newinfo->size);
+ if (ret != 0) {
+ xt_entry_foreach(iter, entry0, newinfo->size) {
+ if (i-- == 0)
+ break;
+ cleanup_entry(iter, net);
+ }
+ return ret;
}
- return 0;
-cleanup:
- IPT_ENTRY_ITERATE(entry0, newinfo->size, cleanup_entry, &i);
return ret;
-}
-
-/* Gets counters. */
-static inline int
-add_entry_to_counter(const struct ipt_entry *e,
- struct xt_counters total[],
- unsigned int *i)
-{
- ADD_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt);
-
- (*i)++;
- return 0;
-}
-
-static inline int
-set_entry_to_counter(const struct ipt_entry *e,
- struct ipt_counters total[],
- unsigned int *i)
-{
- SET_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt);
-
- (*i)++;
- return 0;
+ out_free:
+ kvfree(offsets);
+ return ret;
}
static void
get_counters(const struct xt_table_info *t,
struct xt_counters counters[])
{
+ struct ipt_entry *iter;
unsigned int cpu;
unsigned int i;
- unsigned int curcpu;
- /* Instead of clearing (by a previous call to memset())
- * the counters and using adds, we set the counters
- * with data used by 'current' CPU
- * We dont care about preemption here.
- */
- curcpu = raw_smp_processor_id();
+ for_each_possible_cpu(cpu) {
+ seqcount_t *s = &per_cpu(xt_recseq, cpu);
- i = 0;
- IPT_ENTRY_ITERATE(t->entries[curcpu],
- t->size,
- set_entry_to_counter,
- counters,
- &i);
+ i = 0;
+ xt_entry_foreach(iter, t->entries, t->size) {
+ struct xt_counters *tmp;
+ u64 bcnt, pcnt;
+ unsigned int start;
+
+ tmp = xt_get_per_cpu_counter(&iter->counters, cpu);
+ do {
+ start = read_seqcount_begin(s);
+ bcnt = tmp->bcnt;
+ pcnt = tmp->pcnt;
+ } while (read_seqcount_retry(s, start));
+
+ ADD_COUNTER(counters[i], bcnt, pcnt);
+ ++i; /* macro does multi eval of i */
+ cond_resched();
+ }
+ }
+}
+
+static void get_old_counters(const struct xt_table_info *t,
+ struct xt_counters counters[])
+{
+ struct ipt_entry *iter;
+ unsigned int cpu, i;
for_each_possible_cpu(cpu) {
- if (cpu == curcpu)
- continue;
i = 0;
- IPT_ENTRY_ITERATE(t->entries[cpu],
- t->size,
- add_entry_to_counter,
- counters,
- &i);
+ xt_entry_foreach(iter, t->entries, t->size) {
+ const struct xt_counters *tmp;
+
+ tmp = xt_get_per_cpu_counter(&iter->counters, cpu);
+ ADD_COUNTER(counters[i], tmp->bcnt, tmp->pcnt);
+ ++i; /* macro does multi eval of i */
+ }
+
+ cond_resched();
}
}
-static inline struct xt_counters * alloc_counters(struct ipt_table *table)
+static struct xt_counters *alloc_counters(const struct xt_table *table)
{
unsigned int countersize;
struct xt_counters *counters;
- struct xt_table_info *private = table->private;
+ const struct xt_table_info *private = table->private;
/* We need atomic snapshot of counters: rest doesn't change
(other than comefrom, which userspace doesn't care
about). */
countersize = sizeof(struct xt_counters) * private->number;
- counters = vmalloc_node(countersize, numa_node_id());
+ counters = vzalloc(countersize);
if (counters == NULL)
return ERR_PTR(-ENOMEM);
- /* First, sum counters... */
- write_lock_bh(&table->lock);
get_counters(private, counters);
- write_unlock_bh(&table->lock);
return counters;
}
static int
copy_entries_to_user(unsigned int total_size,
- struct ipt_table *table,
+ const struct xt_table *table,
void __user *userptr)
{
unsigned int off, num;
- struct ipt_entry *e;
+ const struct ipt_entry *e;
struct xt_counters *counters;
- struct xt_table_info *private = table->private;
+ const struct xt_table_info *private = table->private;
int ret = 0;
- void *loc_cpu_entry;
+ const void *loc_cpu_entry;
counters = alloc_counters(table);
if (IS_ERR(counters))
return PTR_ERR(counters);
- /* choose the copy that is on our node/cpu, ...
- * This choice is lazy (because current thread is
- * allowed to migrate to another cpu)
- */
- loc_cpu_entry = private->entries[raw_smp_processor_id()];
- /* ... then copy entire thing ... */
- if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
- ret = -EFAULT;
- goto free_counters;
- }
+ loc_cpu_entry = private->entries;
/* FIXME: use iterator macros --RR */
/* ... then go back and fix counters and names */
for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){
unsigned int i;
- struct ipt_entry_match *m;
- struct ipt_entry_target *t;
+ const struct xt_entry_match *m;
+ const struct xt_entry_target *t;
- e = (struct ipt_entry *)(loc_cpu_entry + off);
+ e = loc_cpu_entry + off;
+ if (copy_to_user(userptr + off, e, sizeof(*e))) {
+ ret = -EFAULT;
+ goto free_counters;
+ }
if (copy_to_user(userptr + off
+ offsetof(struct ipt_entry, counters),
&counters[num],
@@ -870,23 +849,14 @@ copy_entries_to_user(unsigned int total_size,
i += m->u.match_size) {
m = (void *)e + i;
- if (copy_to_user(userptr + off + i
- + offsetof(struct ipt_entry_match,
- u.user.name),
- m->u.kernel.match->name,
- strlen(m->u.kernel.match->name)+1)
- != 0) {
+ if (xt_match_to_user(m, userptr + off + i)) {
ret = -EFAULT;
goto free_counters;
}
}
- t = ipt_get_target(e);
- if (copy_to_user(userptr + off + e->target_offset
- + offsetof(struct ipt_entry_target,
- u.user.name),
- t->u.kernel.target->name,
- strlen(t->u.kernel.target->name)+1) != 0) {
+ t = ipt_get_target_c(e);
+ if (xt_target_to_user(t, userptr + off + e->target_offset)) {
ret = -EFAULT;
goto free_counters;
}
@@ -897,175 +867,121 @@ copy_entries_to_user(unsigned int total_size,
return ret;
}
-#ifdef CONFIG_COMPAT
-struct compat_delta {
- struct compat_delta *next;
- u_int16_t offset;
- short delta;
-};
-
-static struct compat_delta *compat_offsets = NULL;
-
-static int compat_add_offset(u_int16_t offset, short delta)
-{
- struct compat_delta *tmp;
-
- tmp = kmalloc(sizeof(struct compat_delta), GFP_KERNEL);
- if (!tmp)
- return -ENOMEM;
- tmp->offset = offset;
- tmp->delta = delta;
- if (compat_offsets) {
- tmp->next = compat_offsets->next;
- compat_offsets->next = tmp;
- } else {
- compat_offsets = tmp;
- tmp->next = NULL;
- }
- return 0;
-}
-
-static void compat_flush_offsets(void)
-{
- struct compat_delta *tmp, *next;
-
- if (compat_offsets) {
- for(tmp = compat_offsets; tmp; tmp = next) {
- next = tmp->next;
- kfree(tmp);
- }
- compat_offsets = NULL;
- }
-}
-
-static short compat_calc_jump(u_int16_t offset)
-{
- struct compat_delta *tmp;
- short delta;
-
- for(tmp = compat_offsets, delta = 0; tmp; tmp = tmp->next)
- if (tmp->offset < offset)
- delta += tmp->delta;
- return delta;
-}
-
-static void compat_standard_from_user(void *dst, void *src)
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+static void compat_standard_from_user(void *dst, const void *src)
{
int v = *(compat_int_t *)src;
if (v > 0)
- v += compat_calc_jump(v);
+ v += xt_compat_calc_jump(AF_INET, v);
memcpy(dst, &v, sizeof(v));
}
-static int compat_standard_to_user(void __user *dst, void *src)
+static int compat_standard_to_user(void __user *dst, const void *src)
{
compat_int_t cv = *(int *)src;
if (cv > 0)
- cv -= compat_calc_jump(cv);
+ cv -= xt_compat_calc_jump(AF_INET, cv);
return copy_to_user(dst, &cv, sizeof(cv)) ? -EFAULT : 0;
}
-static inline int
-compat_calc_match(struct ipt_entry_match *m, int * size)
-{
- *size += xt_compat_match_offset(m->u.kernel.match);
- return 0;
-}
-
-static int compat_calc_entry(struct ipt_entry *e, struct xt_table_info *info,
- void *base, struct xt_table_info *newinfo)
+static int compat_calc_entry(const struct ipt_entry *e,
+ const struct xt_table_info *info,
+ const void *base, struct xt_table_info *newinfo)
{
- struct ipt_entry_target *t;
- u_int16_t entry_offset;
+ const struct xt_entry_match *ematch;
+ const struct xt_entry_target *t;
+ unsigned int entry_offset;
int off, i, ret;
- off = 0;
+ off = sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry);
entry_offset = (void *)e - base;
- IPT_MATCH_ITERATE(e, compat_calc_match, &off);
- t = ipt_get_target(e);
+ xt_ematch_foreach(ematch, e)
+ off += xt_compat_match_offset(ematch->u.kernel.match);
+ t = ipt_get_target_c(e);
off += xt_compat_target_offset(t->u.kernel.target);
newinfo->size -= off;
- ret = compat_add_offset(entry_offset, off);
+ ret = xt_compat_add_offset(AF_INET, entry_offset, off);
if (ret)
return ret;
- for (i = 0; i< NF_IP_NUMHOOKS; i++) {
- if (info->hook_entry[i] && (e < (struct ipt_entry *)
- (base + info->hook_entry[i])))
+ for (i = 0; i < NF_INET_NUMHOOKS; i++) {
+ if (info->hook_entry[i] &&
+ (e < (struct ipt_entry *)(base + info->hook_entry[i])))
newinfo->hook_entry[i] -= off;
- if (info->underflow[i] && (e < (struct ipt_entry *)
- (base + info->underflow[i])))
+ if (info->underflow[i] &&
+ (e < (struct ipt_entry *)(base + info->underflow[i])))
newinfo->underflow[i] -= off;
}
return 0;
}
-static int compat_table_info(struct xt_table_info *info,
- struct xt_table_info *newinfo)
+static int compat_table_info(const struct xt_table_info *info,
+ struct xt_table_info *newinfo)
{
- void *loc_cpu_entry;
- int i;
+ struct ipt_entry *iter;
+ const void *loc_cpu_entry;
+ int ret;
if (!newinfo || !info)
return -EINVAL;
- memset(newinfo, 0, sizeof(struct xt_table_info));
- newinfo->size = info->size;
- newinfo->number = info->number;
- for (i = 0; i < NF_IP_NUMHOOKS; i++) {
- newinfo->hook_entry[i] = info->hook_entry[i];
- newinfo->underflow[i] = info->underflow[i];
+ /* we dont care about newinfo->entries */
+ memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
+ newinfo->initial_entries = 0;
+ loc_cpu_entry = info->entries;
+ ret = xt_compat_init_offsets(AF_INET, info->number);
+ if (ret)
+ return ret;
+ xt_entry_foreach(iter, loc_cpu_entry, info->size) {
+ ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);
+ if (ret != 0)
+ return ret;
}
- loc_cpu_entry = info->entries[raw_smp_processor_id()];
- return IPT_ENTRY_ITERATE(loc_cpu_entry, info->size,
- compat_calc_entry, info, loc_cpu_entry, newinfo);
+ return 0;
}
#endif
-static int get_info(void __user *user, int *len, int compat)
+static int get_info(struct net *net, void __user *user, const int *len)
{
- char name[IPT_TABLE_MAXNAMELEN];
- struct ipt_table *t;
+ char name[XT_TABLE_MAXNAMELEN];
+ struct xt_table *t;
int ret;
- if (*len != sizeof(struct ipt_getinfo)) {
- duprintf("length %u != %u\n", *len,
- (unsigned int)sizeof(struct ipt_getinfo));
+ if (*len != sizeof(struct ipt_getinfo))
return -EINVAL;
- }
if (copy_from_user(name, user, sizeof(name)) != 0)
return -EFAULT;
- name[IPT_TABLE_MAXNAMELEN-1] = '\0';
-#ifdef CONFIG_COMPAT
- if (compat)
+ name[XT_TABLE_MAXNAMELEN-1] = '\0';
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+ if (in_compat_syscall())
xt_compat_lock(AF_INET);
#endif
- t = try_then_request_module(xt_find_table_lock(AF_INET, name),
- "iptable_%s", name);
- if (t && !IS_ERR(t)) {
+ t = xt_request_find_table_lock(net, AF_INET, name);
+ if (!IS_ERR(t)) {
struct ipt_getinfo info;
- struct xt_table_info *private = t->private;
+ const struct xt_table_info *private = t->private;
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+ struct xt_table_info tmp;
-#ifdef CONFIG_COMPAT
- if (compat) {
- struct xt_table_info tmp;
+ if (in_compat_syscall()) {
ret = compat_table_info(private, &tmp);
- compat_flush_offsets();
- private = &tmp;
+ xt_compat_flush_offsets(AF_INET);
+ private = &tmp;
}
#endif
+ memset(&info, 0, sizeof(info));
info.valid_hooks = t->valid_hooks;
memcpy(info.hook_entry, private->hook_entry,
- sizeof(info.hook_entry));
+ sizeof(info.hook_entry));
memcpy(info.underflow, private->underflow,
- sizeof(info.underflow));
+ sizeof(info.underflow));
info.num_entries = private->number;
info.size = private->size;
- strcpy(info.name, name);
+ strscpy(info.name, name);
if (copy_to_user(user, &info, *len) != 0)
ret = -EFAULT;
@@ -1075,86 +991,72 @@ static int get_info(void __user *user, int *len, int compat)
xt_table_unlock(t);
module_put(t->me);
} else
- ret = t ? PTR_ERR(t) : -ENOENT;
-#ifdef CONFIG_COMPAT
- if (compat)
+ ret = PTR_ERR(t);
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+ if (in_compat_syscall())
xt_compat_unlock(AF_INET);
#endif
return ret;
}
static int
-get_entries(struct ipt_get_entries __user *uptr, int *len)
+get_entries(struct net *net, struct ipt_get_entries __user *uptr,
+ const int *len)
{
int ret;
struct ipt_get_entries get;
- struct ipt_table *t;
+ struct xt_table *t;
- if (*len < sizeof(get)) {
- duprintf("get_entries: %u < %d\n", *len,
- (unsigned int)sizeof(get));
+ if (*len < sizeof(get))
return -EINVAL;
- }
if (copy_from_user(&get, uptr, sizeof(get)) != 0)
return -EFAULT;
- if (*len != sizeof(struct ipt_get_entries) + get.size) {
- duprintf("get_entries: %u != %u\n", *len,
- (unsigned int)(sizeof(struct ipt_get_entries) +
- get.size));
+ if (*len != sizeof(struct ipt_get_entries) + get.size)
return -EINVAL;
- }
+ get.name[sizeof(get.name) - 1] = '\0';
- t = xt_find_table_lock(AF_INET, get.name);
- if (t && !IS_ERR(t)) {
- struct xt_table_info *private = t->private;
- duprintf("t->private->number = %u\n",
- private->number);
+ t = xt_find_table_lock(net, AF_INET, get.name);
+ if (!IS_ERR(t)) {
+ const struct xt_table_info *private = t->private;
if (get.size == private->size)
ret = copy_entries_to_user(private->size,
t, uptr->entrytable);
- else {
- duprintf("get_entries: I've got %u not %u!\n",
- private->size,
- get.size);
- ret = -EINVAL;
- }
+ else
+ ret = -EAGAIN;
+
module_put(t->me);
xt_table_unlock(t);
} else
- ret = t ? PTR_ERR(t) : -ENOENT;
+ ret = PTR_ERR(t);
return ret;
}
static int
-__do_replace(const char *name, unsigned int valid_hooks,
- struct xt_table_info *newinfo, unsigned int num_counters,
- void __user *counters_ptr)
+__do_replace(struct net *net, const char *name, unsigned int valid_hooks,
+ struct xt_table_info *newinfo, unsigned int num_counters,
+ void __user *counters_ptr)
{
int ret;
- struct ipt_table *t;
+ struct xt_table *t;
struct xt_table_info *oldinfo;
struct xt_counters *counters;
- void *loc_cpu_old_entry;
+ struct ipt_entry *iter;
- ret = 0;
- counters = vmalloc(num_counters * sizeof(struct xt_counters));
+ counters = xt_counters_alloc(num_counters);
if (!counters) {
ret = -ENOMEM;
goto out;
}
- t = try_then_request_module(xt_find_table_lock(AF_INET, name),
- "iptable_%s", name);
- if (!t || IS_ERR(t)) {
- ret = t ? PTR_ERR(t) : -ENOENT;
+ t = xt_request_find_table_lock(net, AF_INET, name);
+ if (IS_ERR(t)) {
+ ret = PTR_ERR(t);
goto free_newinfo_counters_untrans;
}
/* You lied! */
if (valid_hooks != t->valid_hooks) {
- duprintf("Valid hook crap: %08X vs %08X\n",
- valid_hooks, t->valid_hooks);
ret = -EINVAL;
goto put_module;
}
@@ -1164,8 +1066,6 @@ __do_replace(const char *name, unsigned int valid_hooks,
goto put_module;
/* Update module usage count based on number of rules */
- duprintf("do_replace: oldnum=%u, initnum=%u, newnum=%u\n",
- oldinfo->number, oldinfo->initial_entries, newinfo->number);
if ((oldinfo->number > oldinfo->initial_entries) ||
(newinfo->number <= oldinfo->initial_entries))
module_put(t->me);
@@ -1173,18 +1073,22 @@ __do_replace(const char *name, unsigned int valid_hooks,
(newinfo->number <= oldinfo->initial_entries))
module_put(t->me);
- /* Get the old counters. */
- get_counters(oldinfo, counters);
+ xt_table_unlock(t);
+
+ get_old_counters(oldinfo, counters);
+
/* Decrease module usage counts and free resource */
- loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
- IPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,NULL);
+ xt_entry_foreach(iter, oldinfo->entries, oldinfo->size)
+ cleanup_entry(iter, net);
+
xt_free_table_info(oldinfo);
if (copy_to_user(counters_ptr, counters,
- sizeof(struct xt_counters) * num_counters) != 0)
- ret = -EFAULT;
+ sizeof(struct xt_counters) * num_counters) != 0) {
+ /* Silent error, can't fail, new table is already in place */
+ net_warn_ratelimited("iptables: counters copy to user failed while replacing table\n");
+ }
vfree(counters);
- xt_table_unlock(t);
- return ret;
+ return 0;
put_module:
module_put(t->me);
@@ -1196,159 +1100,99 @@ __do_replace(const char *name, unsigned int valid_hooks,
}
static int
-do_replace(void __user *user, unsigned int len)
+do_replace(struct net *net, sockptr_t arg, unsigned int len)
{
int ret;
struct ipt_replace tmp;
struct xt_table_info *newinfo;
void *loc_cpu_entry;
+ struct ipt_entry *iter;
- if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
+ if (len < sizeof(tmp))
+ return -EINVAL;
+ if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0)
return -EFAULT;
- /* Hack: Causes ipchains to give correct error msg --RR */
- if (len != sizeof(tmp) + tmp.size)
- return -ENOPROTOOPT;
-
/* overflow check */
- if (tmp.size >= (INT_MAX - sizeof(struct xt_table_info)) / NR_CPUS -
- SMP_CACHE_BYTES)
- return -ENOMEM;
if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
return -ENOMEM;
+ if (tmp.num_counters == 0)
+ return -EINVAL;
+ if ((u64)len < (u64)tmp.size + sizeof(tmp))
+ return -EINVAL;
+
+ tmp.name[sizeof(tmp.name)-1] = 0;
newinfo = xt_alloc_table_info(tmp.size);
if (!newinfo)
return -ENOMEM;
- /* choose the copy that is our node/cpu */
- loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
- if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
- tmp.size) != 0) {
+ loc_cpu_entry = newinfo->entries;
+ if (copy_from_sockptr_offset(loc_cpu_entry, arg, sizeof(tmp),
+ tmp.size) != 0) {
ret = -EFAULT;
goto free_newinfo;
}
- ret = translate_table(tmp.name, tmp.valid_hooks,
- newinfo, loc_cpu_entry, tmp.size, tmp.num_entries,
- tmp.hook_entry, tmp.underflow);
+ ret = translate_table(net, newinfo, loc_cpu_entry, &tmp);
if (ret != 0)
goto free_newinfo;
- duprintf("ip_tables: Translated table\n");
-
- ret = __do_replace(tmp.name, tmp.valid_hooks,
- newinfo, tmp.num_counters,
- tmp.counters);
+ ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo,
+ tmp.num_counters, tmp.counters);
if (ret)
goto free_newinfo_untrans;
return 0;
free_newinfo_untrans:
- IPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry,NULL);
+ xt_entry_foreach(iter, loc_cpu_entry, newinfo->size)
+ cleanup_entry(iter, net);
free_newinfo:
xt_free_table_info(newinfo);
return ret;
}
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static inline int
-add_counter_to_entry(struct ipt_entry *e,
- const struct xt_counters addme[],
- unsigned int *i)
-{
-#if 0
- duprintf("add_counter: Entry %u %lu/%lu + %lu/%lu\n",
- *i,
- (long unsigned int)e->counters.pcnt,
- (long unsigned int)e->counters.bcnt,
- (long unsigned int)addme[*i].pcnt,
- (long unsigned int)addme[*i].bcnt);
-#endif
-
- ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
- (*i)++;
- return 0;
-}
-
static int
-do_add_counters(void __user *user, unsigned int len, int compat)
+do_add_counters(struct net *net, sockptr_t arg, unsigned int len)
{
unsigned int i;
struct xt_counters_info tmp;
struct xt_counters *paddc;
- unsigned int num_counters;
- char *name;
- int size;
- void *ptmp;
- struct ipt_table *t;
- struct xt_table_info *private;
+ struct xt_table *t;
+ const struct xt_table_info *private;
int ret = 0;
- void *loc_cpu_entry;
-#ifdef CONFIG_COMPAT
- struct compat_xt_counters_info compat_tmp;
-
- if (compat) {
- ptmp = &compat_tmp;
- size = sizeof(struct compat_xt_counters_info);
- } else
-#endif
- {
- ptmp = &tmp;
- size = sizeof(struct xt_counters_info);
- }
-
- if (copy_from_user(ptmp, user, size) != 0)
- return -EFAULT;
-
-#ifdef CONFIG_COMPAT
- if (compat) {
- num_counters = compat_tmp.num_counters;
- name = compat_tmp.name;
- } else
-#endif
- {
- num_counters = tmp.num_counters;
- name = tmp.name;
- }
-
- if (len != size + num_counters * sizeof(struct xt_counters))
- return -EINVAL;
+ struct ipt_entry *iter;
+ unsigned int addend;
- paddc = vmalloc_node(len - size, numa_node_id());
- if (!paddc)
- return -ENOMEM;
+ paddc = xt_copy_counters(arg, len, &tmp);
+ if (IS_ERR(paddc))
+ return PTR_ERR(paddc);
- if (copy_from_user(paddc, user + size, len - size) != 0) {
- ret = -EFAULT;
+ t = xt_find_table_lock(net, AF_INET, tmp.name);
+ if (IS_ERR(t)) {
+ ret = PTR_ERR(t);
goto free;
}
- t = xt_find_table_lock(AF_INET, name);
- if (!t || IS_ERR(t)) {
- ret = t ? PTR_ERR(t) : -ENOENT;
- goto free;
- }
-
- write_lock_bh(&t->lock);
+ local_bh_disable();
private = t->private;
- if (private->number != num_counters) {
+ if (private->number != tmp.num_counters) {
ret = -EINVAL;
goto unlock_up_free;
}
i = 0;
- /* Choose the copy that is on our node */
- loc_cpu_entry = private->entries[raw_smp_processor_id()];
- IPT_ENTRY_ITERATE(loc_cpu_entry,
- private->size,
- add_counter_to_entry,
- paddc,
- &i);
+ addend = xt_write_recseq_begin();
+ xt_entry_foreach(iter, private->entries, private->size) {
+ struct xt_counters *tmp;
+
+ tmp = xt_get_this_cpu_counter(&iter->counters);
+ ADD_COUNTER(*tmp, paddc[i].bcnt, paddc[i].pcnt);
+ ++i;
+ }
+ xt_write_recseq_end(addend);
unlock_up_free:
- write_unlock_bh(&t->lock);
+ local_bh_enable();
xt_table_unlock(t);
module_put(t->me);
free:
@@ -1357,348 +1201,272 @@ do_add_counters(void __user *user, unsigned int len, int compat)
return ret;
}
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
struct compat_ipt_replace {
- char name[IPT_TABLE_MAXNAMELEN];
+ char name[XT_TABLE_MAXNAMELEN];
u32 valid_hooks;
u32 num_entries;
u32 size;
- u32 hook_entry[NF_IP_NUMHOOKS];
- u32 underflow[NF_IP_NUMHOOKS];
+ u32 hook_entry[NF_INET_NUMHOOKS];
+ u32 underflow[NF_INET_NUMHOOKS];
u32 num_counters;
- compat_uptr_t counters; /* struct ipt_counters * */
- struct compat_ipt_entry entries[0];
+ compat_uptr_t counters; /* struct xt_counters * */
+ struct compat_ipt_entry entries[];
};
-static inline int compat_copy_match_to_user(struct ipt_entry_match *m,
- void __user **dstptr, compat_uint_t *size)
-{
- return xt_compat_match_to_user(m, dstptr, size);
-}
-
-static int compat_copy_entry_to_user(struct ipt_entry *e,
- void __user **dstptr, compat_uint_t *size)
+static int
+compat_copy_entry_to_user(struct ipt_entry *e, void __user **dstptr,
+ unsigned int *size, struct xt_counters *counters,
+ unsigned int i)
{
- struct ipt_entry_target *t;
+ struct xt_entry_target *t;
struct compat_ipt_entry __user *ce;
u_int16_t target_offset, next_offset;
compat_uint_t origsize;
- int ret;
+ const struct xt_entry_match *ematch;
+ int ret = 0;
- ret = -EFAULT;
origsize = *size;
- ce = (struct compat_ipt_entry __user *)*dstptr;
- if (copy_to_user(ce, e, sizeof(struct ipt_entry)))
- goto out;
+ ce = *dstptr;
+ if (copy_to_user(ce, e, sizeof(struct ipt_entry)) != 0 ||
+ copy_to_user(&ce->counters, &counters[i],
+ sizeof(counters[i])) != 0)
+ return -EFAULT;
*dstptr += sizeof(struct compat_ipt_entry);
- ret = IPT_MATCH_ITERATE(e, compat_copy_match_to_user, dstptr, size);
+ *size -= sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry);
+
+ xt_ematch_foreach(ematch, e) {
+ ret = xt_compat_match_to_user(ematch, dstptr, size);
+ if (ret != 0)
+ return ret;
+ }
target_offset = e->target_offset - (origsize - *size);
- if (ret)
- goto out;
t = ipt_get_target(e);
ret = xt_compat_target_to_user(t, dstptr, size);
if (ret)
- goto out;
- ret = -EFAULT;
+ return ret;
next_offset = e->next_offset - (origsize - *size);
- if (put_user(target_offset, &ce->target_offset))
- goto out;
- if (put_user(next_offset, &ce->next_offset))
- goto out;
+ if (put_user(target_offset, &ce->target_offset) != 0 ||
+ put_user(next_offset, &ce->next_offset) != 0)
+ return -EFAULT;
return 0;
-out:
- return ret;
}
-static inline int
-compat_check_calc_match(struct ipt_entry_match *m,
- const char *name,
- const struct ipt_ip *ip,
- unsigned int hookmask,
- int *size, int *i)
+static int
+compat_find_calc_match(struct xt_entry_match *m,
+ const struct ipt_ip *ip,
+ int *size)
{
- struct ipt_match *match;
-
- match = try_then_request_module(xt_find_match(AF_INET, m->u.user.name,
- m->u.user.revision),
- "ipt_%s", m->u.user.name);
- if (IS_ERR(match) || !match) {
- duprintf("compat_check_calc_match: `%s' not found\n",
- m->u.user.name);
- return match ? PTR_ERR(match) : -ENOENT;
- }
+ struct xt_match *match;
+
+ match = xt_request_find_match(NFPROTO_IPV4, m->u.user.name,
+ m->u.user.revision);
+ if (IS_ERR(match))
+ return PTR_ERR(match);
+
m->u.kernel.match = match;
*size += xt_compat_match_offset(match);
-
- (*i)++;
return 0;
}
-static inline int
-check_compat_entry_size_and_hooks(struct ipt_entry *e,
- struct xt_table_info *newinfo,
- unsigned int *size,
- unsigned char *base,
- unsigned char *limit,
- unsigned int *hook_entries,
- unsigned int *underflows,
- unsigned int *i,
- const char *name)
+static void compat_release_entry(struct compat_ipt_entry *e)
+{
+ struct xt_entry_target *t;
+ struct xt_entry_match *ematch;
+
+ /* Cleanup all matches */
+ xt_ematch_foreach(ematch, e)
+ module_put(ematch->u.kernel.match->me);
+ t = compat_ipt_get_target(e);
+ module_put(t->u.kernel.target->me);
+}
+
+static int
+check_compat_entry_size_and_hooks(struct compat_ipt_entry *e,
+ struct xt_table_info *newinfo,
+ unsigned int *size,
+ const unsigned char *base,
+ const unsigned char *limit)
{
- struct ipt_entry_target *t;
- struct ipt_target *target;
- u_int16_t entry_offset;
- int ret, off, h, j;
-
- duprintf("check_compat_entry_size_and_hooks %p\n", e);
- if ((unsigned long)e % __alignof__(struct compat_ipt_entry) != 0
- || (unsigned char *)e + sizeof(struct compat_ipt_entry) >= limit) {
- duprintf("Bad offset %p, limit = %p\n", e, limit);
+ struct xt_entry_match *ematch;
+ struct xt_entry_target *t;
+ struct xt_target *target;
+ unsigned int entry_offset;
+ unsigned int j;
+ int ret, off;
+
+ if ((unsigned long)e % __alignof__(struct compat_ipt_entry) != 0 ||
+ (unsigned char *)e + sizeof(struct compat_ipt_entry) >= limit ||
+ (unsigned char *)e + e->next_offset > limit)
return -EINVAL;
- }
if (e->next_offset < sizeof(struct compat_ipt_entry) +
- sizeof(struct compat_xt_entry_target)) {
- duprintf("checking: element %p size %u\n",
- e, e->next_offset);
+ sizeof(struct compat_xt_entry_target))
return -EINVAL;
- }
- if (!ip_checkentry(&e->ip)) {
- duprintf("ip_tables: ip check failed %p %s.\n", e, name);
+ if (!ip_checkentry(&e->ip))
return -EINVAL;
- }
- if (e->target_offset + sizeof(struct compat_xt_entry_target) >
- e->next_offset)
- return -EINVAL;
+ ret = xt_compat_check_entry_offsets(e, e->elems,
+ e->target_offset, e->next_offset);
+ if (ret)
+ return ret;
- off = 0;
+ off = sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry);
entry_offset = (void *)e - (void *)base;
j = 0;
- ret = IPT_MATCH_ITERATE(e, compat_check_calc_match, name, &e->ip,
- e->comefrom, &off, &j);
- if (ret != 0)
- goto cleanup_matches;
+ xt_ematch_foreach(ematch, e) {
+ ret = compat_find_calc_match(ematch, &e->ip, &off);
+ if (ret != 0)
+ goto release_matches;
+ ++j;
+ }
- t = ipt_get_target(e);
- ret = -EINVAL;
- if (e->target_offset + t->u.target_size > e->next_offset)
- goto cleanup_matches;
- target = try_then_request_module(xt_find_target(AF_INET,
- t->u.user.name,
- t->u.user.revision),
- "ipt_%s", t->u.user.name);
- if (IS_ERR(target) || !target) {
- duprintf("check_entry: `%s' not found\n", t->u.user.name);
- ret = target ? PTR_ERR(target) : -ENOENT;
- goto cleanup_matches;
+ t = compat_ipt_get_target(e);
+ target = xt_request_find_target(NFPROTO_IPV4, t->u.user.name,
+ t->u.user.revision);
+ if (IS_ERR(target)) {
+ ret = PTR_ERR(target);
+ goto release_matches;
}
t->u.kernel.target = target;
off += xt_compat_target_offset(target);
*size += off;
- ret = compat_add_offset(entry_offset, off);
+ ret = xt_compat_add_offset(AF_INET, entry_offset, off);
if (ret)
goto out;
- /* Check hooks & underflows */
- for (h = 0; h < NF_IP_NUMHOOKS; h++) {
- if ((unsigned char *)e - base == hook_entries[h])
- newinfo->hook_entry[h] = hook_entries[h];
- if ((unsigned char *)e - base == underflows[h])
- newinfo->underflow[h] = underflows[h];
- }
-
- /* Clear counters and comefrom */
- e->counters = ((struct ipt_counters) { 0, 0 });
- e->comefrom = 0;
-
- (*i)++;
return 0;
out:
module_put(t->u.kernel.target->me);
-cleanup_matches:
- IPT_MATCH_ITERATE(e, cleanup_match, &j);
- return ret;
-}
-
-static inline int compat_copy_match_from_user(struct ipt_entry_match *m,
- void **dstptr, compat_uint_t *size, const char *name,
- const struct ipt_ip *ip, unsigned int hookmask)
-{
- struct ipt_entry_match *dm;
- struct ipt_match *match;
- int ret;
-
- dm = (struct ipt_entry_match *)*dstptr;
- match = m->u.kernel.match;
- xt_compat_match_from_user(m, dstptr, size);
-
- ret = xt_check_match(match, AF_INET, dm->u.match_size - sizeof(*dm),
- name, hookmask, ip->proto,
- ip->invflags & IPT_INV_PROTO);
- if (!ret && m->u.kernel.match->checkentry
- && !m->u.kernel.match->checkentry(name, ip, match, dm->data,
- hookmask)) {
- duprintf("ip_tables: check failed for `%s'.\n",
- m->u.kernel.match->name);
- ret = -EINVAL;
+release_matches:
+ xt_ematch_foreach(ematch, e) {
+ if (j-- == 0)
+ break;
+ module_put(ematch->u.kernel.match->me);
}
return ret;
}
-static int compat_copy_entry_from_user(struct ipt_entry *e, void **dstptr,
- unsigned int *size, const char *name,
- struct xt_table_info *newinfo, unsigned char *base)
+static void
+compat_copy_entry_from_user(struct compat_ipt_entry *e, void **dstptr,
+ unsigned int *size,
+ struct xt_table_info *newinfo, unsigned char *base)
{
- struct ipt_entry_target *t;
- struct ipt_target *target;
+ struct xt_entry_target *t;
struct ipt_entry *de;
unsigned int origsize;
- int ret, h;
+ int h;
+ struct xt_entry_match *ematch;
- ret = 0;
origsize = *size;
- de = (struct ipt_entry *)*dstptr;
+ de = *dstptr;
memcpy(de, e, sizeof(struct ipt_entry));
+ memcpy(&de->counters, &e->counters, sizeof(e->counters));
+
+ *dstptr += sizeof(struct ipt_entry);
+ *size += sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry);
+
+ xt_ematch_foreach(ematch, e)
+ xt_compat_match_from_user(ematch, dstptr, size);
- *dstptr += sizeof(struct compat_ipt_entry);
- ret = IPT_MATCH_ITERATE(e, compat_copy_match_from_user, dstptr, size,
- name, &de->ip, de->comefrom);
- if (ret)
- goto err;
de->target_offset = e->target_offset - (origsize - *size);
- t = ipt_get_target(e);
- target = t->u.kernel.target;
+ t = compat_ipt_get_target(e);
xt_compat_target_from_user(t, dstptr, size);
de->next_offset = e->next_offset - (origsize - *size);
- for (h = 0; h < NF_IP_NUMHOOKS; h++) {
+
+ for (h = 0; h < NF_INET_NUMHOOKS; h++) {
if ((unsigned char *)de - base < newinfo->hook_entry[h])
newinfo->hook_entry[h] -= origsize - *size;
if ((unsigned char *)de - base < newinfo->underflow[h])
newinfo->underflow[h] -= origsize - *size;
}
-
- t = ipt_get_target(de);
- target = t->u.kernel.target;
- ret = xt_check_target(target, AF_INET, t->u.target_size - sizeof(*t),
- name, e->comefrom, e->ip.proto,
- e->ip.invflags & IPT_INV_PROTO);
- if (ret)
- goto err;
-
- ret = -EINVAL;
- if (t->u.kernel.target == &ipt_standard_target) {
- if (!standard_check(t, *size))
- goto err;
- } else if (t->u.kernel.target->checkentry
- && !t->u.kernel.target->checkentry(name, de, target,
- t->data, de->comefrom)) {
- duprintf("ip_tables: compat: check failed for `%s'.\n",
- t->u.kernel.target->name);
- goto err;
- }
- ret = 0;
-err:
- return ret;
}
static int
-translate_compat_table(const char *name,
- unsigned int valid_hooks,
- struct xt_table_info **pinfo,
- void **pentry0,
- unsigned int total_size,
- unsigned int number,
- unsigned int *hook_entries,
- unsigned int *underflows)
+translate_compat_table(struct net *net,
+ struct xt_table_info **pinfo,
+ void **pentry0,
+ const struct compat_ipt_replace *compatr)
{
unsigned int i, j;
struct xt_table_info *newinfo, *info;
void *pos, *entry0, *entry1;
+ struct compat_ipt_entry *iter0;
+ struct ipt_replace repl;
unsigned int size;
int ret;
info = *pinfo;
entry0 = *pentry0;
- size = total_size;
- info->number = number;
-
- /* Init all hooks to impossible value. */
- for (i = 0; i < NF_IP_NUMHOOKS; i++) {
- info->hook_entry[i] = 0xFFFFFFFF;
- info->underflow[i] = 0xFFFFFFFF;
- }
+ size = compatr->size;
+ info->number = compatr->num_entries;
- duprintf("translate_compat_table: size %u\n", info->size);
j = 0;
xt_compat_lock(AF_INET);
- /* Walk through entries, checking offsets. */
- ret = IPT_ENTRY_ITERATE(entry0, total_size,
- check_compat_entry_size_and_hooks,
- info, &size, entry0,
- entry0 + total_size,
- hook_entries, underflows, &j, name);
- if (ret != 0)
+ ret = xt_compat_init_offsets(AF_INET, compatr->num_entries);
+ if (ret)
goto out_unlock;
+ /* Walk through entries, checking offsets. */
+ xt_entry_foreach(iter0, entry0, compatr->size) {
+ ret = check_compat_entry_size_and_hooks(iter0, info, &size,
+ entry0,
+ entry0 + compatr->size);
+ if (ret != 0)
+ goto out_unlock;
+ ++j;
+ }
ret = -EINVAL;
- if (j != number) {
- duprintf("translate_compat_table: %u not %u entries\n",
- j, number);
+ if (j != compatr->num_entries)
goto out_unlock;
- }
-
- /* Check hooks all assigned */
- for (i = 0; i < NF_IP_NUMHOOKS; i++) {
- /* Only hooks which are valid */
- if (!(valid_hooks & (1 << i)))
- continue;
- if (info->hook_entry[i] == 0xFFFFFFFF) {
- duprintf("Invalid hook entry %u %u\n",
- i, hook_entries[i]);
- goto out_unlock;
- }
- if (info->underflow[i] == 0xFFFFFFFF) {
- duprintf("Invalid underflow %u %u\n",
- i, underflows[i]);
- goto out_unlock;
- }
- }
ret = -ENOMEM;
newinfo = xt_alloc_table_info(size);
if (!newinfo)
goto out_unlock;
- newinfo->number = number;
- for (i = 0; i < NF_IP_NUMHOOKS; i++) {
- newinfo->hook_entry[i] = info->hook_entry[i];
- newinfo->underflow[i] = info->underflow[i];
+ memset(newinfo->entries, 0, size);
+
+ newinfo->number = compatr->num_entries;
+ for (i = 0; i < NF_INET_NUMHOOKS; i++) {
+ newinfo->hook_entry[i] = compatr->hook_entry[i];
+ newinfo->underflow[i] = compatr->underflow[i];
}
- entry1 = newinfo->entries[raw_smp_processor_id()];
+ entry1 = newinfo->entries;
pos = entry1;
- size = total_size;
- ret = IPT_ENTRY_ITERATE(entry0, total_size,
- compat_copy_entry_from_user, &pos, &size,
- name, newinfo, entry1);
- compat_flush_offsets();
+ size = compatr->size;
+ xt_entry_foreach(iter0, entry0, compatr->size)
+ compat_copy_entry_from_user(iter0, &pos, &size,
+ newinfo, entry1);
+
+ /* all module references in entry0 are now gone.
+ * entry1/newinfo contains a 64bit ruleset that looks exactly as
+ * generated by 64bit userspace.
+ *
+ * Call standard translate_table() to validate all hook_entrys,
+ * underflows, check for loops, etc.
+ */
+ xt_compat_flush_offsets(AF_INET);
xt_compat_unlock(AF_INET);
- if (ret)
- goto free_newinfo;
- ret = -ELOOP;
- if (!mark_source_chains(newinfo, valid_hooks, entry1))
- goto free_newinfo;
+ memcpy(&repl, compatr, sizeof(*compatr));
- /* And one copy for every other CPU */
- for_each_possible_cpu(i)
- if (newinfo->entries[i] && newinfo->entries[i] != entry1)
- memcpy(newinfo->entries[i], entry1, newinfo->size);
+ for (i = 0; i < NF_INET_NUMHOOKS; i++) {
+ repl.hook_entry[i] = newinfo->hook_entry[i];
+ repl.underflow[i] = newinfo->underflow[i];
+ }
+
+ repl.num_counters = 0;
+ repl.counters = NULL;
+ repl.size = newinfo->size;
+ ret = translate_table(net, newinfo, entry1, &repl);
+ if (ret)
+ goto free_newinfo;
*pinfo = newinfo;
*pentry0 = entry1;
@@ -1707,268 +1475,171 @@ translate_compat_table(const char *name,
free_newinfo:
xt_free_table_info(newinfo);
-out:
- IPT_ENTRY_ITERATE(entry0, total_size, cleanup_entry, &j);
return ret;
out_unlock:
- compat_flush_offsets();
+ xt_compat_flush_offsets(AF_INET);
xt_compat_unlock(AF_INET);
- goto out;
+ xt_entry_foreach(iter0, entry0, compatr->size) {
+ if (j-- == 0)
+ break;
+ compat_release_entry(iter0);
+ }
+ return ret;
}
static int
-compat_do_replace(void __user *user, unsigned int len)
+compat_do_replace(struct net *net, sockptr_t arg, unsigned int len)
{
int ret;
struct compat_ipt_replace tmp;
struct xt_table_info *newinfo;
void *loc_cpu_entry;
+ struct ipt_entry *iter;
- if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
+ if (len < sizeof(tmp))
+ return -EINVAL;
+ if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0)
return -EFAULT;
- /* Hack: Causes ipchains to give correct error msg --RR */
- if (len != sizeof(tmp) + tmp.size)
- return -ENOPROTOOPT;
-
/* overflow check */
- if (tmp.size >= (INT_MAX - sizeof(struct xt_table_info)) / NR_CPUS -
- SMP_CACHE_BYTES)
- return -ENOMEM;
if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
return -ENOMEM;
+ if (tmp.num_counters == 0)
+ return -EINVAL;
+ if ((u64)len < (u64)tmp.size + sizeof(tmp))
+ return -EINVAL;
+
+ tmp.name[sizeof(tmp.name)-1] = 0;
newinfo = xt_alloc_table_info(tmp.size);
if (!newinfo)
return -ENOMEM;
- /* choose the copy that is our node/cpu */
- loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
- if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
- tmp.size) != 0) {
+ loc_cpu_entry = newinfo->entries;
+ if (copy_from_sockptr_offset(loc_cpu_entry, arg, sizeof(tmp),
+ tmp.size) != 0) {
ret = -EFAULT;
goto free_newinfo;
}
- ret = translate_compat_table(tmp.name, tmp.valid_hooks,
- &newinfo, &loc_cpu_entry, tmp.size,
- tmp.num_entries, tmp.hook_entry, tmp.underflow);
+ ret = translate_compat_table(net, &newinfo, &loc_cpu_entry, &tmp);
if (ret != 0)
goto free_newinfo;
- duprintf("compat_do_replace: Translated table\n");
-
- ret = __do_replace(tmp.name, tmp.valid_hooks,
- newinfo, tmp.num_counters,
- compat_ptr(tmp.counters));
+ ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo,
+ tmp.num_counters, compat_ptr(tmp.counters));
if (ret)
goto free_newinfo_untrans;
return 0;
free_newinfo_untrans:
- IPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry,NULL);
+ xt_entry_foreach(iter, loc_cpu_entry, newinfo->size)
+ cleanup_entry(iter, net);
free_newinfo:
xt_free_table_info(newinfo);
return ret;
}
-static int
-compat_do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user,
- unsigned int len)
-{
- int ret;
-
- if (!capable(CAP_NET_ADMIN))
- return -EPERM;
-
- switch (cmd) {
- case IPT_SO_SET_REPLACE:
- ret = compat_do_replace(user, len);
- break;
-
- case IPT_SO_SET_ADD_COUNTERS:
- ret = do_add_counters(user, len, 1);
- break;
-
- default:
- duprintf("do_ipt_set_ctl: unknown request %i\n", cmd);
- ret = -EINVAL;
- }
-
- return ret;
-}
-
-struct compat_ipt_get_entries
-{
- char name[IPT_TABLE_MAXNAMELEN];
+struct compat_ipt_get_entries {
+ char name[XT_TABLE_MAXNAMELEN];
compat_uint_t size;
- struct compat_ipt_entry entrytable[0];
+ struct compat_ipt_entry entrytable[];
};
-static int compat_copy_entries_to_user(unsigned int total_size,
- struct ipt_table *table, void __user *userptr)
+static int
+compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table,
+ void __user *userptr)
{
- unsigned int off, num;
- struct compat_ipt_entry e;
struct xt_counters *counters;
- struct xt_table_info *private = table->private;
+ const struct xt_table_info *private = table->private;
void __user *pos;
unsigned int size;
int ret = 0;
- void *loc_cpu_entry;
+ unsigned int i = 0;
+ struct ipt_entry *iter;
counters = alloc_counters(table);
if (IS_ERR(counters))
return PTR_ERR(counters);
- /* choose the copy that is on our node/cpu, ...
- * This choice is lazy (because current thread is
- * allowed to migrate to another cpu)
- */
- loc_cpu_entry = private->entries[raw_smp_processor_id()];
pos = userptr;
size = total_size;
- ret = IPT_ENTRY_ITERATE(loc_cpu_entry, total_size,
- compat_copy_entry_to_user, &pos, &size);
- if (ret)
- goto free_counters;
-
- /* ... then go back and fix counters and names */
- for (off = 0, num = 0; off < size; off += e.next_offset, num++) {
- unsigned int i;
- struct ipt_entry_match m;
- struct ipt_entry_target t;
-
- ret = -EFAULT;
- if (copy_from_user(&e, userptr + off,
- sizeof(struct compat_ipt_entry)))
- goto free_counters;
- if (copy_to_user(userptr + off +
- offsetof(struct compat_ipt_entry, counters),
- &counters[num], sizeof(counters[num])))
- goto free_counters;
-
- for (i = sizeof(struct compat_ipt_entry);
- i < e.target_offset; i += m.u.match_size) {
- if (copy_from_user(&m, userptr + off + i,
- sizeof(struct ipt_entry_match)))
- goto free_counters;
- if (copy_to_user(userptr + off + i +
- offsetof(struct ipt_entry_match, u.user.name),
- m.u.kernel.match->name,
- strlen(m.u.kernel.match->name) + 1))
- goto free_counters;
- }
-
- if (copy_from_user(&t, userptr + off + e.target_offset,
- sizeof(struct ipt_entry_target)))
- goto free_counters;
- if (copy_to_user(userptr + off + e.target_offset +
- offsetof(struct ipt_entry_target, u.user.name),
- t.u.kernel.target->name,
- strlen(t.u.kernel.target->name) + 1))
- goto free_counters;
+ xt_entry_foreach(iter, private->entries, total_size) {
+ ret = compat_copy_entry_to_user(iter, &pos,
+ &size, counters, i++);
+ if (ret != 0)
+ break;
}
- ret = 0;
-free_counters:
+
vfree(counters);
return ret;
}
static int
-compat_get_entries(struct compat_ipt_get_entries __user *uptr, int *len)
+compat_get_entries(struct net *net, struct compat_ipt_get_entries __user *uptr,
+ int *len)
{
int ret;
struct compat_ipt_get_entries get;
- struct ipt_table *t;
+ struct xt_table *t;
-
- if (*len < sizeof(get)) {
- duprintf("compat_get_entries: %u < %u\n",
- *len, (unsigned int)sizeof(get));
+ if (*len < sizeof(get))
return -EINVAL;
- }
if (copy_from_user(&get, uptr, sizeof(get)) != 0)
return -EFAULT;
- if (*len != sizeof(struct compat_ipt_get_entries) + get.size) {
- duprintf("compat_get_entries: %u != %u\n", *len,
- (unsigned int)(sizeof(struct compat_ipt_get_entries) +
- get.size));
+ if (*len != sizeof(struct compat_ipt_get_entries) + get.size)
return -EINVAL;
- }
+
+ get.name[sizeof(get.name) - 1] = '\0';
xt_compat_lock(AF_INET);
- t = xt_find_table_lock(AF_INET, get.name);
- if (t && !IS_ERR(t)) {
- struct xt_table_info *private = t->private;
+ t = xt_find_table_lock(net, AF_INET, get.name);
+ if (!IS_ERR(t)) {
+ const struct xt_table_info *private = t->private;
struct xt_table_info info;
- duprintf("t->private->number = %u\n",
- private->number);
ret = compat_table_info(private, &info);
- if (!ret && get.size == info.size) {
+ if (!ret && get.size == info.size)
ret = compat_copy_entries_to_user(private->size,
- t, uptr->entrytable);
- } else if (!ret) {
- duprintf("compat_get_entries: I've got %u not %u!\n",
- private->size,
- get.size);
- ret = -EINVAL;
- }
- compat_flush_offsets();
+ t, uptr->entrytable);
+ else if (!ret)
+ ret = -EAGAIN;
+
+ xt_compat_flush_offsets(AF_INET);
module_put(t->me);
xt_table_unlock(t);
} else
- ret = t ? PTR_ERR(t) : -ENOENT;
+ ret = PTR_ERR(t);
xt_compat_unlock(AF_INET);
return ret;
}
-
-static int do_ipt_get_ctl(struct sock *, int, void __user *, int *);
-
-static int
-compat_do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
-{
- int ret;
-
- if (!capable(CAP_NET_ADMIN))
- return -EPERM;
-
- switch (cmd) {
- case IPT_SO_GET_INFO:
- ret = get_info(user, len, 1);
- break;
- case IPT_SO_GET_ENTRIES:
- ret = compat_get_entries(user, len);
- break;
- default:
- ret = do_ipt_get_ctl(sk, cmd, user, len);
- }
- return ret;
-}
#endif
static int
-do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
+do_ipt_set_ctl(struct sock *sk, int cmd, sockptr_t arg, unsigned int len)
{
int ret;
- if (!capable(CAP_NET_ADMIN))
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
return -EPERM;
switch (cmd) {
case IPT_SO_SET_REPLACE:
- ret = do_replace(user, len);
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+ if (in_compat_syscall())
+ ret = compat_do_replace(sock_net(sk), arg, len);
+ else
+#endif
+ ret = do_replace(sock_net(sk), arg, len);
break;
case IPT_SO_SET_ADD_COUNTERS:
- ret = do_add_counters(user, len, 0);
+ ret = do_add_counters(sock_net(sk), arg, len);
break;
default:
- duprintf("do_ipt_set_ctl: unknown request %i\n", cmd);
ret = -EINVAL;
}
@@ -1980,21 +1651,26 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
{
int ret;
- if (!capable(CAP_NET_ADMIN))
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
return -EPERM;
switch (cmd) {
case IPT_SO_GET_INFO:
- ret = get_info(user, len, 0);
+ ret = get_info(sock_net(sk), user, len);
break;
case IPT_SO_GET_ENTRIES:
- ret = get_entries(user, len);
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+ if (in_compat_syscall())
+ ret = compat_get_entries(sock_net(sk), user, len);
+ else
+#endif
+ ret = get_entries(sock_net(sk), user, len);
break;
case IPT_SO_GET_REVISION_MATCH:
case IPT_SO_GET_REVISION_TARGET: {
- struct ipt_get_revision rev;
+ struct xt_get_revision rev;
int target;
if (*len != sizeof(rev)) {
@@ -2005,6 +1681,7 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
ret = -EFAULT;
break;
}
+ rev.name[sizeof(rev.name)-1] = 0;
if (cmd == IPT_SO_GET_REVISION_TARGET)
target = 1;
@@ -2019,138 +1696,132 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
}
default:
- duprintf("do_ipt_get_ctl: unknown request %i\n", cmd);
ret = -EINVAL;
}
return ret;
}
-int ipt_register_table(struct xt_table *table, const struct ipt_replace *repl)
+static void __ipt_unregister_table(struct net *net, struct xt_table *table)
{
- int ret;
+ struct xt_table_info *private;
+ void *loc_cpu_entry;
+ struct module *table_owner = table->me;
+ struct ipt_entry *iter;
+
+ private = xt_unregister_table(table);
+
+ /* Decrease module usage counts and free resources */
+ loc_cpu_entry = private->entries;
+ xt_entry_foreach(iter, loc_cpu_entry, private->size)
+ cleanup_entry(iter, net);
+ if (private->number > private->initial_entries)
+ module_put(table_owner);
+ xt_free_table_info(private);
+}
+
+int ipt_register_table(struct net *net, const struct xt_table *table,
+ const struct ipt_replace *repl,
+ const struct nf_hook_ops *template_ops)
+{
+ struct nf_hook_ops *ops;
+ unsigned int num_ops;
+ int ret, i;
struct xt_table_info *newinfo;
- static struct xt_table_info bootstrap
- = { 0, 0, 0, { 0 }, { 0 }, { } };
+ struct xt_table_info bootstrap = {0};
void *loc_cpu_entry;
+ struct xt_table *new_table;
newinfo = xt_alloc_table_info(repl->size);
if (!newinfo)
return -ENOMEM;
- /* choose the copy on our node/cpu
- * but dont care of preemption
- */
- loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+ loc_cpu_entry = newinfo->entries;
memcpy(loc_cpu_entry, repl->entries, repl->size);
- ret = translate_table(table->name, table->valid_hooks,
- newinfo, loc_cpu_entry, repl->size,
- repl->num_entries,
- repl->hook_entry,
- repl->underflow);
+ ret = translate_table(net, newinfo, loc_cpu_entry, repl);
if (ret != 0) {
xt_free_table_info(newinfo);
return ret;
}
- ret = xt_register_table(table, &bootstrap, newinfo);
- if (ret != 0) {
+ new_table = xt_register_table(net, table, &bootstrap, newinfo);
+ if (IS_ERR(new_table)) {
+ struct ipt_entry *iter;
+
+ xt_entry_foreach(iter, loc_cpu_entry, newinfo->size)
+ cleanup_entry(iter, net);
xt_free_table_info(newinfo);
- return ret;
+ return PTR_ERR(new_table);
}
- return 0;
-}
+ /* No template? No need to do anything. This is used by 'nat' table, it registers
+ * with the nat core instead of the netfilter core.
+ */
+ if (!template_ops)
+ return 0;
-void ipt_unregister_table(struct ipt_table *table)
-{
- struct xt_table_info *private;
- void *loc_cpu_entry;
+ num_ops = hweight32(table->valid_hooks);
+ if (num_ops == 0) {
+ ret = -EINVAL;
+ goto out_free;
+ }
- private = xt_unregister_table(table);
+ ops = kmemdup_array(template_ops, num_ops, sizeof(*ops), GFP_KERNEL);
+ if (!ops) {
+ ret = -ENOMEM;
+ goto out_free;
+ }
- /* Decrease module usage counts and free resources */
- loc_cpu_entry = private->entries[raw_smp_processor_id()];
- IPT_ENTRY_ITERATE(loc_cpu_entry, private->size, cleanup_entry, NULL);
- xt_free_table_info(private);
-}
+ for (i = 0; i < num_ops; i++)
+ ops[i].priv = new_table;
-/* Returns 1 if the type and code is matched by the range, 0 otherwise */
-static inline int
-icmp_type_code_match(u_int8_t test_type, u_int8_t min_code, u_int8_t max_code,
- u_int8_t type, u_int8_t code,
- int invert)
-{
- return ((test_type == 0xFF) || (type == test_type && code >= min_code && code <= max_code))
- ^ invert;
-}
+ new_table->ops = ops;
-static int
-icmp_match(const struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- const struct xt_match *match,
- const void *matchinfo,
- int offset,
- unsigned int protoff,
- int *hotdrop)
-{
- struct icmphdr _icmph, *ic;
- const struct ipt_icmp *icmpinfo = matchinfo;
+ ret = nf_register_net_hooks(net, ops, num_ops);
+ if (ret != 0)
+ goto out_free;
- /* Must not be a fragment. */
- if (offset)
- return 0;
+ return ret;
- ic = skb_header_pointer(skb, protoff, sizeof(_icmph), &_icmph);
- if (ic == NULL) {
- /* We've been asked to examine this packet, and we
- * can't. Hence, no choice but to drop.
- */
- duprintf("Dropping evil ICMP tinygram.\n");
- *hotdrop = 1;
- return 0;
- }
+out_free:
+ __ipt_unregister_table(net, new_table);
+ return ret;
+}
+
+void ipt_unregister_table_pre_exit(struct net *net, const char *name)
+{
+ struct xt_table *table = xt_find_table(net, NFPROTO_IPV4, name);
- return icmp_type_code_match(icmpinfo->type,
- icmpinfo->code[0],
- icmpinfo->code[1],
- ic->type, ic->code,
- !!(icmpinfo->invflags&IPT_ICMP_INV));
+ if (table)
+ nf_unregister_net_hooks(net, table->ops, hweight32(table->valid_hooks));
}
-/* Called when user tries to insert an entry of this type. */
-static int
-icmp_checkentry(const char *tablename,
- const void *info,
- const struct xt_match *match,
- void *matchinfo,
- unsigned int hook_mask)
+void ipt_unregister_table_exit(struct net *net, const char *name)
{
- const struct ipt_icmp *icmpinfo = matchinfo;
+ struct xt_table *table = xt_find_table(net, NFPROTO_IPV4, name);
- /* Must specify no unknown invflags */
- return !(icmpinfo->invflags & ~IPT_ICMP_INV);
+ if (table)
+ __ipt_unregister_table(net, table);
}
-/* The built-in targets: standard (NULL) and error. */
-static struct ipt_target ipt_standard_target = {
- .name = IPT_STANDARD_TARGET,
- .targetsize = sizeof(int),
- .family = AF_INET,
-#ifdef CONFIG_COMPAT
- .compatsize = sizeof(compat_int_t),
- .compat_from_user = compat_standard_from_user,
- .compat_to_user = compat_standard_to_user,
+static struct xt_target ipt_builtin_tg[] __read_mostly = {
+ {
+ .name = XT_STANDARD_TARGET,
+ .targetsize = sizeof(int),
+ .family = NFPROTO_IPV4,
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+ .compatsize = sizeof(compat_int_t),
+ .compat_from_user = compat_standard_from_user,
+ .compat_to_user = compat_standard_to_user,
#endif
-};
-
-static struct ipt_target ipt_error_target = {
- .name = IPT_ERROR_TARGET,
- .target = ipt_error,
- .targetsize = IPT_FUNCTION_MAXNAMELEN,
- .family = AF_INET,
+ },
+ {
+ .name = XT_ERROR_TARGET,
+ .target = ipt_error,
+ .targetsize = XT_FUNCTION_MAXNAMELEN,
+ .family = NFPROTO_IPV4,
+ },
};
static struct nf_sockopt_ops ipt_sockopts = {
@@ -2158,61 +1829,51 @@ static struct nf_sockopt_ops ipt_sockopts = {
.set_optmin = IPT_BASE_CTL,
.set_optmax = IPT_SO_SET_MAX+1,
.set = do_ipt_set_ctl,
-#ifdef CONFIG_COMPAT
- .compat_set = compat_do_ipt_set_ctl,
-#endif
.get_optmin = IPT_BASE_CTL,
.get_optmax = IPT_SO_GET_MAX+1,
.get = do_ipt_get_ctl,
-#ifdef CONFIG_COMPAT
- .compat_get = compat_do_ipt_get_ctl,
-#endif
+ .owner = THIS_MODULE,
};
-static struct ipt_match icmp_matchstruct = {
- .name = "icmp",
- .match = icmp_match,
- .matchsize = sizeof(struct ipt_icmp),
- .proto = IPPROTO_ICMP,
- .family = AF_INET,
- .checkentry = icmp_checkentry,
+static int __net_init ip_tables_net_init(struct net *net)
+{
+ return xt_proto_init(net, NFPROTO_IPV4);
+}
+
+static void __net_exit ip_tables_net_exit(struct net *net)
+{
+ xt_proto_fini(net, NFPROTO_IPV4);
+}
+
+static struct pernet_operations ip_tables_net_ops = {
+ .init = ip_tables_net_init,
+ .exit = ip_tables_net_exit,
};
static int __init ip_tables_init(void)
{
int ret;
- ret = xt_proto_init(AF_INET);
+ ret = register_pernet_subsys(&ip_tables_net_ops);
if (ret < 0)
goto err1;
- /* Noone else will be downing sem now, so we won't sleep */
- ret = xt_register_target(&ipt_standard_target);
+ /* No one else will be downing sem now, so we won't sleep */
+ ret = xt_register_targets(ipt_builtin_tg, ARRAY_SIZE(ipt_builtin_tg));
if (ret < 0)
goto err2;
- ret = xt_register_target(&ipt_error_target);
- if (ret < 0)
- goto err3;
- ret = xt_register_match(&icmp_matchstruct);
- if (ret < 0)
- goto err4;
/* Register setsockopt */
ret = nf_register_sockopt(&ipt_sockopts);
if (ret < 0)
- goto err5;
+ goto err4;
- printk("ip_tables: (C) 2000-2006 Netfilter Core Team\n");
return 0;
-err5:
- xt_unregister_match(&icmp_matchstruct);
err4:
- xt_unregister_target(&ipt_error_target);
-err3:
- xt_unregister_target(&ipt_standard_target);
+ xt_unregister_targets(ipt_builtin_tg, ARRAY_SIZE(ipt_builtin_tg));
err2:
- xt_proto_fini(AF_INET);
+ unregister_pernet_subsys(&ip_tables_net_ops);
err1:
return ret;
}
@@ -2221,15 +1882,13 @@ static void __exit ip_tables_fini(void)
{
nf_unregister_sockopt(&ipt_sockopts);
- xt_unregister_match(&icmp_matchstruct);
- xt_unregister_target(&ipt_error_target);
- xt_unregister_target(&ipt_standard_target);
-
- xt_proto_fini(AF_INET);
+ xt_unregister_targets(ipt_builtin_tg, ARRAY_SIZE(ipt_builtin_tg));
+ unregister_pernet_subsys(&ip_tables_net_ops);
}
EXPORT_SYMBOL(ipt_register_table);
-EXPORT_SYMBOL(ipt_unregister_table);
+EXPORT_SYMBOL(ipt_unregister_table_pre_exit);
+EXPORT_SYMBOL(ipt_unregister_table_exit);
EXPORT_SYMBOL(ipt_do_table);
module_init(ip_tables_init);
module_exit(ip_tables_fini);
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
deleted file mode 100644
index 7a29d6e7baa7..000000000000
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ /dev/null
@@ -1,759 +0,0 @@
-/* Cluster IP hashmark target
- * (C) 2003-2004 by Harald Welte <laforge@netfilter.org>
- * based on ideas of Fabio Olive Leite <olive@unixforge.org>
- *
- * Development of this code funded by SuSE Linux AG, http://www.suse.com/
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- */
-#include <linux/module.h>
-#include <linux/proc_fs.h>
-#include <linux/jhash.h>
-#include <linux/bitops.h>
-#include <linux/skbuff.h>
-#include <linux/ip.h>
-#include <linux/tcp.h>
-#include <linux/udp.h>
-#include <linux/icmp.h>
-#include <linux/if_arp.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-
-#include <net/checksum.h>
-
-#include <linux/netfilter_arp.h>
-
-#include <linux/netfilter_ipv4/ip_tables.h>
-#include <linux/netfilter_ipv4/ipt_CLUSTERIP.h>
-#include <net/netfilter/nf_conntrack_compat.h>
-
-#define CLUSTERIP_VERSION "0.8"
-
-#define DEBUG_CLUSTERIP
-
-#ifdef DEBUG_CLUSTERIP
-#define DEBUGP printk
-#else
-#define DEBUGP
-#endif
-
-#define ASSERT_READ_LOCK(x)
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
-MODULE_DESCRIPTION("iptables target for CLUSTERIP");
-
-struct clusterip_config {
- struct list_head list; /* list of all configs */
- atomic_t refcount; /* reference count */
- atomic_t entries; /* number of entries/rules
- * referencing us */
-
- __be32 clusterip; /* the IP address */
- u_int8_t clustermac[ETH_ALEN]; /* the MAC address */
- struct net_device *dev; /* device */
- u_int16_t num_total_nodes; /* total number of nodes */
- unsigned long local_nodes; /* node number array */
-
-#ifdef CONFIG_PROC_FS
- struct proc_dir_entry *pde; /* proc dir entry */
-#endif
- enum clusterip_hashmode hash_mode; /* which hashing mode */
- u_int32_t hash_initval; /* hash initialization */
-};
-
-static LIST_HEAD(clusterip_configs);
-
-/* clusterip_lock protects the clusterip_configs list */
-static DEFINE_RWLOCK(clusterip_lock);
-
-#ifdef CONFIG_PROC_FS
-static struct file_operations clusterip_proc_fops;
-static struct proc_dir_entry *clusterip_procdir;
-#endif
-
-static inline void
-clusterip_config_get(struct clusterip_config *c)
-{
- atomic_inc(&c->refcount);
-}
-
-static inline void
-clusterip_config_put(struct clusterip_config *c)
-{
- if (atomic_dec_and_test(&c->refcount))
- kfree(c);
-}
-
-/* increase the count of entries(rules) using/referencing this config */
-static inline void
-clusterip_config_entry_get(struct clusterip_config *c)
-{
- atomic_inc(&c->entries);
-}
-
-/* decrease the count of entries using/referencing this config. If last
- * entry(rule) is removed, remove the config from lists, but don't free it
- * yet, since proc-files could still be holding references */
-static inline void
-clusterip_config_entry_put(struct clusterip_config *c)
-{
- if (atomic_dec_and_test(&c->entries)) {
- write_lock_bh(&clusterip_lock);
- list_del(&c->list);
- write_unlock_bh(&clusterip_lock);
-
- dev_mc_delete(c->dev, c->clustermac, ETH_ALEN, 0);
- dev_put(c->dev);
-
- /* In case anyone still accesses the file, the open/close
- * functions are also incrementing the refcount on their own,
- * so it's safe to remove the entry even if it's in use. */
-#ifdef CONFIG_PROC_FS
- remove_proc_entry(c->pde->name, c->pde->parent);
-#endif
- }
-}
-
-static struct clusterip_config *
-__clusterip_config_find(__be32 clusterip)
-{
- struct list_head *pos;
-
- ASSERT_READ_LOCK(&clusterip_lock);
- list_for_each(pos, &clusterip_configs) {
- struct clusterip_config *c = list_entry(pos,
- struct clusterip_config, list);
- if (c->clusterip == clusterip) {
- return c;
- }
- }
-
- return NULL;
-}
-
-static inline struct clusterip_config *
-clusterip_config_find_get(__be32 clusterip, int entry)
-{
- struct clusterip_config *c;
-
- read_lock_bh(&clusterip_lock);
- c = __clusterip_config_find(clusterip);
- if (!c) {
- read_unlock_bh(&clusterip_lock);
- return NULL;
- }
- atomic_inc(&c->refcount);
- if (entry)
- atomic_inc(&c->entries);
- read_unlock_bh(&clusterip_lock);
-
- return c;
-}
-
-static void
-clusterip_config_init_nodelist(struct clusterip_config *c,
- const struct ipt_clusterip_tgt_info *i)
-{
- int n;
-
- for (n = 0; n < i->num_local_nodes; n++) {
- set_bit(i->local_nodes[n] - 1, &c->local_nodes);
- }
-}
-
-static struct clusterip_config *
-clusterip_config_init(struct ipt_clusterip_tgt_info *i, __be32 ip,
- struct net_device *dev)
-{
- struct clusterip_config *c;
- char buffer[16];
-
- c = kzalloc(sizeof(*c), GFP_ATOMIC);
- if (!c)
- return NULL;
-
- c->dev = dev;
- c->clusterip = ip;
- memcpy(&c->clustermac, &i->clustermac, ETH_ALEN);
- c->num_total_nodes = i->num_total_nodes;
- clusterip_config_init_nodelist(c, i);
- c->hash_mode = i->hash_mode;
- c->hash_initval = i->hash_initval;
- atomic_set(&c->refcount, 1);
- atomic_set(&c->entries, 1);
-
-#ifdef CONFIG_PROC_FS
- /* create proc dir entry */
- sprintf(buffer, "%u.%u.%u.%u", NIPQUAD(ip));
- c->pde = create_proc_entry(buffer, S_IWUSR|S_IRUSR, clusterip_procdir);
- if (!c->pde) {
- kfree(c);
- return NULL;
- }
- c->pde->proc_fops = &clusterip_proc_fops;
- c->pde->data = c;
-#endif
-
- write_lock_bh(&clusterip_lock);
- list_add(&c->list, &clusterip_configs);
- write_unlock_bh(&clusterip_lock);
-
- return c;
-}
-
-static int
-clusterip_add_node(struct clusterip_config *c, u_int16_t nodenum)
-{
-
- if (nodenum == 0 ||
- nodenum > c->num_total_nodes)
- return 1;
-
- /* check if we already have this number in our bitfield */
- if (test_and_set_bit(nodenum - 1, &c->local_nodes))
- return 1;
-
- return 0;
-}
-
-static int
-clusterip_del_node(struct clusterip_config *c, u_int16_t nodenum)
-{
- if (nodenum == 0 ||
- nodenum > c->num_total_nodes)
- return 1;
-
- if (test_and_clear_bit(nodenum - 1, &c->local_nodes))
- return 0;
-
- return 1;
-}
-
-static inline u_int32_t
-clusterip_hashfn(struct sk_buff *skb, struct clusterip_config *config)
-{
- struct iphdr *iph = skb->nh.iph;
- unsigned long hashval;
- u_int16_t sport, dport;
- u_int16_t *ports;
-
- switch (iph->protocol) {
- case IPPROTO_TCP:
- case IPPROTO_UDP:
- case IPPROTO_SCTP:
- case IPPROTO_DCCP:
- case IPPROTO_ICMP:
- ports = (void *)iph+iph->ihl*4;
- sport = ports[0];
- dport = ports[1];
- break;
- default:
- if (net_ratelimit()) {
- printk(KERN_NOTICE "CLUSTERIP: unknown protocol `%u'\n",
- iph->protocol);
- }
- sport = dport = 0;
- }
-
- switch (config->hash_mode) {
- case CLUSTERIP_HASHMODE_SIP:
- hashval = jhash_1word(ntohl(iph->saddr),
- config->hash_initval);
- break;
- case CLUSTERIP_HASHMODE_SIP_SPT:
- hashval = jhash_2words(ntohl(iph->saddr), sport,
- config->hash_initval);
- break;
- case CLUSTERIP_HASHMODE_SIP_SPT_DPT:
- hashval = jhash_3words(ntohl(iph->saddr), sport, dport,
- config->hash_initval);
- break;
- default:
- /* to make gcc happy */
- hashval = 0;
- /* This cannot happen, unless the check function wasn't called
- * at rule load time */
- printk("CLUSTERIP: unknown mode `%u'\n", config->hash_mode);
- BUG();
- break;
- }
-
- /* node numbers are 1..n, not 0..n */
- return ((hashval % config->num_total_nodes)+1);
-}
-
-static inline int
-clusterip_responsible(struct clusterip_config *config, u_int32_t hash)
-{
- return test_bit(hash - 1, &config->local_nodes);
-}
-
-/***********************************************************************
- * IPTABLES TARGET
- ***********************************************************************/
-
-static unsigned int
-target(struct sk_buff **pskb,
- const struct net_device *in,
- const struct net_device *out,
- unsigned int hooknum,
- const struct xt_target *target,
- const void *targinfo)
-{
- const struct ipt_clusterip_tgt_info *cipinfo = targinfo;
- enum ip_conntrack_info ctinfo;
- u_int32_t *mark, hash;
-
- /* don't need to clusterip_config_get() here, since refcount
- * is only decremented by destroy() - and ip_tables guarantees
- * that the ->target() function isn't called after ->destroy() */
-
- mark = nf_ct_get_mark((*pskb), &ctinfo);
- if (mark == NULL) {
- printk(KERN_ERR "CLUSTERIP: no conntrack!\n");
- /* FIXME: need to drop invalid ones, since replies
- * to outgoing connections of other nodes will be
- * marked as INVALID */
- return NF_DROP;
- }
-
- /* special case: ICMP error handling. conntrack distinguishes between
- * error messages (RELATED) and information requests (see below) */
- if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP
- && (ctinfo == IP_CT_RELATED
- || ctinfo == IP_CT_RELATED+IP_CT_IS_REPLY))
- return IPT_CONTINUE;
-
- /* ip_conntrack_icmp guarantees us that we only have ICMP_ECHO,
- * TIMESTAMP, INFO_REQUEST or ADDRESS type icmp packets from here
- * on, which all have an ID field [relevant for hashing]. */
-
- hash = clusterip_hashfn(*pskb, cipinfo->config);
-
- switch (ctinfo) {
- case IP_CT_NEW:
- *mark = hash;
- break;
- case IP_CT_RELATED:
- case IP_CT_RELATED+IP_CT_IS_REPLY:
- /* FIXME: we don't handle expectations at the
- * moment. they can arrive on a different node than
- * the master connection (e.g. FTP passive mode) */
- case IP_CT_ESTABLISHED:
- case IP_CT_ESTABLISHED+IP_CT_IS_REPLY:
- break;
- default:
- break;
- }
-
-#ifdef DEBUG_CLUSTERP
- DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
-#endif
- DEBUGP("hash=%u ct_hash=%u ", hash, *mark);
- if (!clusterip_responsible(cipinfo->config, hash)) {
- DEBUGP("not responsible\n");
- return NF_DROP;
- }
- DEBUGP("responsible\n");
-
- /* despite being received via linklayer multicast, this is
- * actually a unicast IP packet. TCP doesn't like PACKET_MULTICAST */
- (*pskb)->pkt_type = PACKET_HOST;
-
- return IPT_CONTINUE;
-}
-
-static int
-checkentry(const char *tablename,
- const void *e_void,
- const struct xt_target *target,
- void *targinfo,
- unsigned int hook_mask)
-{
- struct ipt_clusterip_tgt_info *cipinfo = targinfo;
- const struct ipt_entry *e = e_void;
-
- struct clusterip_config *config;
-
- if (cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP &&
- cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT &&
- cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT_DPT) {
- printk(KERN_WARNING "CLUSTERIP: unknown mode `%u'\n",
- cipinfo->hash_mode);
- return 0;
-
- }
- if (e->ip.dmsk.s_addr != htonl(0xffffffff)
- || e->ip.dst.s_addr == 0) {
- printk(KERN_ERR "CLUSTERIP: Please specify destination IP\n");
- return 0;
- }
-
- /* FIXME: further sanity checks */
-
- config = clusterip_config_find_get(e->ip.dst.s_addr, 1);
- if (config) {
- if (cipinfo->config != NULL) {
- /* Case A: This is an entry that gets reloaded, since
- * it still has a cipinfo->config pointer. Simply
- * increase the entry refcount and return */
- if (cipinfo->config != config) {
- printk(KERN_ERR "CLUSTERIP: Reloaded entry "
- "has invalid config pointer!\n");
- return 0;
- }
- clusterip_config_entry_get(cipinfo->config);
- } else {
- /* Case B: This is a new rule referring to an existing
- * clusterip config. */
- cipinfo->config = config;
- clusterip_config_entry_get(cipinfo->config);
- }
- } else {
- /* Case C: This is a completely new clusterip config */
- if (!(cipinfo->flags & CLUSTERIP_FLAG_NEW)) {
- printk(KERN_WARNING "CLUSTERIP: no config found for %u.%u.%u.%u, need 'new'\n", NIPQUAD(e->ip.dst.s_addr));
- return 0;
- } else {
- struct net_device *dev;
-
- if (e->ip.iniface[0] == '\0') {
- printk(KERN_WARNING "CLUSTERIP: Please specify an interface name\n");
- return 0;
- }
-
- dev = dev_get_by_name(e->ip.iniface);
- if (!dev) {
- printk(KERN_WARNING "CLUSTERIP: no such interface %s\n", e->ip.iniface);
- return 0;
- }
-
- config = clusterip_config_init(cipinfo,
- e->ip.dst.s_addr, dev);
- if (!config) {
- printk(KERN_WARNING "CLUSTERIP: cannot allocate config\n");
- dev_put(dev);
- return 0;
- }
- dev_mc_add(config->dev,config->clustermac, ETH_ALEN, 0);
- }
- cipinfo->config = config;
- }
-
- return 1;
-}
-
-/* drop reference count of cluster config when rule is deleted */
-static void destroy(const struct xt_target *target, void *targinfo)
-{
- struct ipt_clusterip_tgt_info *cipinfo = targinfo;
-
- /* if no more entries are referencing the config, remove it
- * from the list and destroy the proc entry */
- clusterip_config_entry_put(cipinfo->config);
-
- clusterip_config_put(cipinfo->config);
-}
-
-static struct ipt_target clusterip_tgt = {
- .name = "CLUSTERIP",
- .target = target,
- .targetsize = sizeof(struct ipt_clusterip_tgt_info),
- .checkentry = checkentry,
- .destroy = destroy,
- .me = THIS_MODULE
-};
-
-
-/***********************************************************************
- * ARP MANGLING CODE
- ***********************************************************************/
-
-/* hardcoded for 48bit ethernet and 32bit ipv4 addresses */
-struct arp_payload {
- u_int8_t src_hw[ETH_ALEN];
- __be32 src_ip;
- u_int8_t dst_hw[ETH_ALEN];
- __be32 dst_ip;
-} __attribute__ ((packed));
-
-#ifdef CLUSTERIP_DEBUG
-static void arp_print(struct arp_payload *payload)
-{
-#define HBUFFERLEN 30
- char hbuffer[HBUFFERLEN];
- int j,k;
- const char hexbuf[]= "0123456789abcdef";
-
- for (k=0, j=0; k < HBUFFERLEN-3 && j < ETH_ALEN; j++) {
- hbuffer[k++]=hexbuf[(payload->src_hw[j]>>4)&15];
- hbuffer[k++]=hexbuf[payload->src_hw[j]&15];
- hbuffer[k++]=':';
- }
- hbuffer[--k]='\0';
-
- printk("src %u.%u.%u.%u@%s, dst %u.%u.%u.%u\n",
- NIPQUAD(payload->src_ip), hbuffer,
- NIPQUAD(payload->dst_ip));
-}
-#endif
-
-static unsigned int
-arp_mangle(unsigned int hook,
- struct sk_buff **pskb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
-{
- struct arphdr *arp = (*pskb)->nh.arph;
- struct arp_payload *payload;
- struct clusterip_config *c;
-
- /* we don't care about non-ethernet and non-ipv4 ARP */
- if (arp->ar_hrd != htons(ARPHRD_ETHER)
- || arp->ar_pro != htons(ETH_P_IP)
- || arp->ar_pln != 4 || arp->ar_hln != ETH_ALEN)
- return NF_ACCEPT;
-
- /* we only want to mangle arp requests and replies */
- if (arp->ar_op != htons(ARPOP_REPLY)
- && arp->ar_op != htons(ARPOP_REQUEST))
- return NF_ACCEPT;
-
- payload = (void *)(arp+1);
-
- /* if there is no clusterip configuration for the arp reply's
- * source ip, we don't want to mangle it */
- c = clusterip_config_find_get(payload->src_ip, 0);
- if (!c)
- return NF_ACCEPT;
-
- /* normally the linux kernel always replies to arp queries of
- * addresses on different interfacs. However, in the CLUSTERIP case
- * this wouldn't work, since we didn't subscribe the mcast group on
- * other interfaces */
- if (c->dev != out) {
- DEBUGP("CLUSTERIP: not mangling arp reply on different "
- "interface: cip'%s'-skb'%s'\n", c->dev->name, out->name);
- clusterip_config_put(c);
- return NF_ACCEPT;
- }
-
- /* mangle reply hardware address */
- memcpy(payload->src_hw, c->clustermac, arp->ar_hln);
-
-#ifdef CLUSTERIP_DEBUG
- DEBUGP(KERN_DEBUG "CLUSTERIP mangled arp reply: ");
- arp_print(payload);
-#endif
-
- clusterip_config_put(c);
-
- return NF_ACCEPT;
-}
-
-static struct nf_hook_ops cip_arp_ops = {
- .hook = arp_mangle,
- .pf = NF_ARP,
- .hooknum = NF_ARP_OUT,
- .priority = -1
-};
-
-/***********************************************************************
- * PROC DIR HANDLING
- ***********************************************************************/
-
-#ifdef CONFIG_PROC_FS
-
-struct clusterip_seq_position {
- unsigned int pos; /* position */
- unsigned int weight; /* number of bits set == size */
- unsigned int bit; /* current bit */
- unsigned long val; /* current value */
-};
-
-static void *clusterip_seq_start(struct seq_file *s, loff_t *pos)
-{
- struct proc_dir_entry *pde = s->private;
- struct clusterip_config *c = pde->data;
- unsigned int weight;
- u_int32_t local_nodes;
- struct clusterip_seq_position *idx;
-
- /* FIXME: possible race */
- local_nodes = c->local_nodes;
- weight = hweight32(local_nodes);
- if (*pos >= weight)
- return NULL;
-
- idx = kmalloc(sizeof(struct clusterip_seq_position), GFP_KERNEL);
- if (!idx)
- return ERR_PTR(-ENOMEM);
-
- idx->pos = *pos;
- idx->weight = weight;
- idx->bit = ffs(local_nodes);
- idx->val = local_nodes;
- clear_bit(idx->bit - 1, &idx->val);
-
- return idx;
-}
-
-static void *clusterip_seq_next(struct seq_file *s, void *v, loff_t *pos)
-{
- struct clusterip_seq_position *idx = (struct clusterip_seq_position *)v;
-
- *pos = ++idx->pos;
- if (*pos >= idx->weight) {
- kfree(v);
- return NULL;
- }
- idx->bit = ffs(idx->val);
- clear_bit(idx->bit - 1, &idx->val);
- return idx;
-}
-
-static void clusterip_seq_stop(struct seq_file *s, void *v)
-{
- kfree(v);
-}
-
-static int clusterip_seq_show(struct seq_file *s, void *v)
-{
- struct clusterip_seq_position *idx = (struct clusterip_seq_position *)v;
-
- if (idx->pos != 0)
- seq_putc(s, ',');
-
- seq_printf(s, "%u", idx->bit);
-
- if (idx->pos == idx->weight - 1)
- seq_putc(s, '\n');
-
- return 0;
-}
-
-static struct seq_operations clusterip_seq_ops = {
- .start = clusterip_seq_start,
- .next = clusterip_seq_next,
- .stop = clusterip_seq_stop,
- .show = clusterip_seq_show,
-};
-
-static int clusterip_proc_open(struct inode *inode, struct file *file)
-{
- int ret = seq_open(file, &clusterip_seq_ops);
-
- if (!ret) {
- struct seq_file *sf = file->private_data;
- struct proc_dir_entry *pde = PDE(inode);
- struct clusterip_config *c = pde->data;
-
- sf->private = pde;
-
- clusterip_config_get(c);
- }
-
- return ret;
-}
-
-static int clusterip_proc_release(struct inode *inode, struct file *file)
-{
- struct proc_dir_entry *pde = PDE(inode);
- struct clusterip_config *c = pde->data;
- int ret;
-
- ret = seq_release(inode, file);
-
- if (!ret)
- clusterip_config_put(c);
-
- return ret;
-}
-
-static ssize_t clusterip_proc_write(struct file *file, const char __user *input,
- size_t size, loff_t *ofs)
-{
-#define PROC_WRITELEN 10
- char buffer[PROC_WRITELEN+1];
- struct proc_dir_entry *pde = PDE(file->f_dentry->d_inode);
- struct clusterip_config *c = pde->data;
- unsigned long nodenum;
-
- if (copy_from_user(buffer, input, PROC_WRITELEN))
- return -EFAULT;
-
- if (*buffer == '+') {
- nodenum = simple_strtoul(buffer+1, NULL, 10);
- if (clusterip_add_node(c, nodenum))
- return -ENOMEM;
- } else if (*buffer == '-') {
- nodenum = simple_strtoul(buffer+1, NULL,10);
- if (clusterip_del_node(c, nodenum))
- return -ENOENT;
- } else
- return -EIO;
-
- return size;
-}
-
-static struct file_operations clusterip_proc_fops = {
- .owner = THIS_MODULE,
- .open = clusterip_proc_open,
- .read = seq_read,
- .write = clusterip_proc_write,
- .llseek = seq_lseek,
- .release = clusterip_proc_release,
-};
-
-#endif /* CONFIG_PROC_FS */
-
-static int __init ipt_clusterip_init(void)
-{
- int ret;
-
- ret = ipt_register_target(&clusterip_tgt);
- if (ret < 0)
- return ret;
-
- ret = nf_register_hook(&cip_arp_ops);
- if (ret < 0)
- goto cleanup_target;
-
-#ifdef CONFIG_PROC_FS
- clusterip_procdir = proc_mkdir("ipt_CLUSTERIP", proc_net);
- if (!clusterip_procdir) {
- printk(KERN_ERR "CLUSTERIP: Unable to proc dir entry\n");
- ret = -ENOMEM;
- goto cleanup_hook;
- }
-#endif /* CONFIG_PROC_FS */
-
- printk(KERN_NOTICE "ClusterIP Version %s loaded successfully\n",
- CLUSTERIP_VERSION);
- return 0;
-
-cleanup_hook:
- nf_unregister_hook(&cip_arp_ops);
-cleanup_target:
- ipt_unregister_target(&clusterip_tgt);
- return ret;
-}
-
-static void __exit ipt_clusterip_fini(void)
-{
- printk(KERN_NOTICE "ClusterIP Version %s unloading\n",
- CLUSTERIP_VERSION);
-#ifdef CONFIG_PROC_FS
- remove_proc_entry(clusterip_procdir->name, clusterip_procdir->parent);
-#endif
- nf_unregister_hook(&cip_arp_ops);
- ipt_unregister_target(&clusterip_tgt);
-}
-
-module_init(ipt_clusterip_init);
-module_exit(ipt_clusterip_fini);
diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c
index b55d670a24df..5930d3b02555 100644
--- a/net/ipv4/netfilter/ipt_ECN.c
+++ b/net/ipv4/netfilter/ipt_ECN.c
@@ -1,69 +1,66 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* iptables module for the IPv4 and TCP ECN bits, Version 1.5
*
* (C) 2002 by Harald Welte <laforge@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * ipt_ECN.c,v 1.5 2002/08/18 19:36:51 laforge Exp
*/
-
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/in.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/ip.h>
+#include <net/ip.h>
#include <linux/tcp.h>
#include <net/checksum.h>
+#include <linux/netfilter/x_tables.h>
#include <linux/netfilter_ipv4/ip_tables.h>
#include <linux/netfilter_ipv4/ipt_ECN.h>
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
-MODULE_DESCRIPTION("iptables ECN modification module");
+MODULE_DESCRIPTION("Xtables: Explicit Congestion Notification (ECN) flag modification");
/* set ECT codepoint from IP header.
- * return 0 if there was an error. */
-static inline int
-set_ect_ip(struct sk_buff **pskb, const struct ipt_ECN_info *einfo)
+ * return false if there was an error. */
+static inline bool
+set_ect_ip(struct sk_buff *skb, const struct ipt_ECN_info *einfo)
{
- struct iphdr *iph = (*pskb)->nh.iph;
+ struct iphdr *iph = ip_hdr(skb);
if ((iph->tos & IPT_ECN_IP_MASK) != (einfo->ip_ect & IPT_ECN_IP_MASK)) {
__u8 oldtos;
- if (!skb_make_writable(pskb, sizeof(struct iphdr)))
- return 0;
- iph = (*pskb)->nh.iph;
+ if (skb_ensure_writable(skb, sizeof(struct iphdr)))
+ return false;
+ iph = ip_hdr(skb);
oldtos = iph->tos;
iph->tos &= ~IPT_ECN_IP_MASK;
iph->tos |= (einfo->ip_ect & IPT_ECN_IP_MASK);
- nf_csum_replace2(&iph->check, htons(oldtos), htons(iph->tos));
- }
- return 1;
+ csum_replace2(&iph->check, htons(oldtos), htons(iph->tos));
+ }
+ return true;
}
-/* Return 0 if there was an error. */
-static inline int
-set_ect_tcp(struct sk_buff **pskb, const struct ipt_ECN_info *einfo)
+/* Return false if there was an error. */
+static inline bool
+set_ect_tcp(struct sk_buff *skb, const struct ipt_ECN_info *einfo)
{
struct tcphdr _tcph, *tcph;
__be16 oldval;
- /* Not enought header? */
- tcph = skb_header_pointer(*pskb, (*pskb)->nh.iph->ihl*4,
- sizeof(_tcph), &_tcph);
+ /* Not enough header? */
+ tcph = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_tcph), &_tcph);
if (!tcph)
- return 0;
+ return false;
if ((!(einfo->operation & IPT_ECN_OP_SET_ECE) ||
tcph->ece == einfo->proto.tcp.ece) &&
- ((!(einfo->operation & IPT_ECN_OP_SET_CWR) ||
- tcph->cwr == einfo->proto.tcp.cwr)))
- return 1;
+ (!(einfo->operation & IPT_ECN_OP_SET_CWR) ||
+ tcph->cwr == einfo->proto.tcp.cwr))
+ return true;
- if (!skb_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph)))
- return 0;
- tcph = (void *)(*pskb)->nh.iph + (*pskb)->nh.iph->ihl*4;
+ if (skb_ensure_writable(skb, ip_hdrlen(skb) + sizeof(*tcph)))
+ return false;
+ tcph = (void *)ip_hdr(skb) + ip_hdrlen(skb);
oldval = ((__be16 *)tcph)[6];
if (einfo->operation & IPT_ECN_OP_SET_ECE)
@@ -71,80 +68,66 @@ set_ect_tcp(struct sk_buff **pskb, const struct ipt_ECN_info *einfo)
if (einfo->operation & IPT_ECN_OP_SET_CWR)
tcph->cwr = einfo->proto.tcp.cwr;
- nf_proto_csum_replace2(&tcph->check, *pskb,
- oldval, ((__be16 *)tcph)[6], 0);
- return 1;
+ inet_proto_csum_replace2(&tcph->check, skb,
+ oldval, ((__be16 *)tcph)[6], false);
+ return true;
}
static unsigned int
-target(struct sk_buff **pskb,
- const struct net_device *in,
- const struct net_device *out,
- unsigned int hooknum,
- const struct xt_target *target,
- const void *targinfo)
+ecn_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
- const struct ipt_ECN_info *einfo = targinfo;
+ const struct ipt_ECN_info *einfo = par->targinfo;
if (einfo->operation & IPT_ECN_OP_SET_IP)
- if (!set_ect_ip(pskb, einfo))
+ if (!set_ect_ip(skb, einfo))
return NF_DROP;
- if (einfo->operation & (IPT_ECN_OP_SET_ECE | IPT_ECN_OP_SET_CWR)
- && (*pskb)->nh.iph->protocol == IPPROTO_TCP)
- if (!set_ect_tcp(pskb, einfo))
+ if (einfo->operation & (IPT_ECN_OP_SET_ECE | IPT_ECN_OP_SET_CWR) &&
+ ip_hdr(skb)->protocol == IPPROTO_TCP)
+ if (!set_ect_tcp(skb, einfo))
return NF_DROP;
- return IPT_CONTINUE;
+ return XT_CONTINUE;
}
-static int
-checkentry(const char *tablename,
- const void *e_void,
- const struct xt_target *target,
- void *targinfo,
- unsigned int hook_mask)
+static int ecn_tg_check(const struct xt_tgchk_param *par)
{
- const struct ipt_ECN_info *einfo = (struct ipt_ECN_info *)targinfo;
- const struct ipt_entry *e = e_void;
+ const struct ipt_ECN_info *einfo = par->targinfo;
+ const struct ipt_entry *e = par->entryinfo;
- if (einfo->operation & IPT_ECN_OP_MASK) {
- printk(KERN_WARNING "ECN: unsupported ECN operation %x\n",
- einfo->operation);
- return 0;
- }
- if (einfo->ip_ect & ~IPT_ECN_IP_MASK) {
- printk(KERN_WARNING "ECN: new ECT codepoint %x out of mask\n",
- einfo->ip_ect);
- return 0;
- }
- if ((einfo->operation & (IPT_ECN_OP_SET_ECE|IPT_ECN_OP_SET_CWR))
- && (e->ip.proto != IPPROTO_TCP || (e->ip.invflags & IPT_INV_PROTO))) {
- printk(KERN_WARNING "ECN: cannot use TCP operations on a "
- "non-tcp rule\n");
- return 0;
+ if (einfo->operation & IPT_ECN_OP_MASK)
+ return -EINVAL;
+
+ if (einfo->ip_ect & ~IPT_ECN_IP_MASK)
+ return -EINVAL;
+
+ if ((einfo->operation & (IPT_ECN_OP_SET_ECE|IPT_ECN_OP_SET_CWR)) &&
+ (e->ip.proto != IPPROTO_TCP || (e->ip.invflags & XT_INV_PROTO))) {
+ pr_info_ratelimited("cannot use operation on non-tcp rule\n");
+ return -EINVAL;
}
- return 1;
+ return 0;
}
-static struct ipt_target ipt_ecn_reg = {
+static struct xt_target ecn_tg_reg __read_mostly = {
.name = "ECN",
- .target = target,
+ .family = NFPROTO_IPV4,
+ .target = ecn_tg,
.targetsize = sizeof(struct ipt_ECN_info),
.table = "mangle",
- .checkentry = checkentry,
+ .checkentry = ecn_tg_check,
.me = THIS_MODULE,
};
-static int __init ipt_ecn_init(void)
+static int __init ecn_tg_init(void)
{
- return ipt_register_target(&ipt_ecn_reg);
+ return xt_register_target(&ecn_tg_reg);
}
-static void __exit ipt_ecn_fini(void)
+static void __exit ecn_tg_exit(void)
{
- ipt_unregister_target(&ipt_ecn_reg);
+ xt_unregister_target(&ecn_tg_reg);
}
-module_init(ipt_ecn_init);
-module_exit(ipt_ecn_fini);
+module_init(ecn_tg_init);
+module_exit(ecn_tg_exit);
diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c
deleted file mode 100644
index 46eee64a11f6..000000000000
--- a/net/ipv4/netfilter/ipt_LOG.c
+++ /dev/null
@@ -1,498 +0,0 @@
-/*
- * This is a module which is used for logging packets.
- */
-
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/module.h>
-#include <linux/spinlock.h>
-#include <linux/skbuff.h>
-#include <linux/ip.h>
-#include <net/icmp.h>
-#include <net/udp.h>
-#include <net/tcp.h>
-#include <net/route.h>
-
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv4/ip_tables.h>
-#include <linux/netfilter_ipv4/ipt_LOG.h>
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
-MODULE_DESCRIPTION("iptables syslog logging module");
-
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
-/* Use lock to serialize, so printks don't overlap */
-static DEFINE_SPINLOCK(log_lock);
-
-/* One level of recursion won't kill us */
-static void dump_packet(const struct nf_loginfo *info,
- const struct sk_buff *skb,
- unsigned int iphoff)
-{
- struct iphdr _iph, *ih;
- unsigned int logflags;
-
- if (info->type == NF_LOG_TYPE_LOG)
- logflags = info->u.log.logflags;
- else
- logflags = NF_LOG_MASK;
-
- ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph);
- if (ih == NULL) {
- printk("TRUNCATED");
- return;
- }
-
- /* Important fields:
- * TOS, len, DF/MF, fragment offset, TTL, src, dst, options. */
- /* Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */
- printk("SRC=%u.%u.%u.%u DST=%u.%u.%u.%u ",
- NIPQUAD(ih->saddr), NIPQUAD(ih->daddr));
-
- /* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */
- printk("LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ",
- ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK,
- ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id));
-
- /* Max length: 6 "CE DF MF " */
- if (ntohs(ih->frag_off) & IP_CE)
- printk("CE ");
- if (ntohs(ih->frag_off) & IP_DF)
- printk("DF ");
- if (ntohs(ih->frag_off) & IP_MF)
- printk("MF ");
-
- /* Max length: 11 "FRAG:65535 " */
- if (ntohs(ih->frag_off) & IP_OFFSET)
- printk("FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET);
-
- if ((logflags & IPT_LOG_IPOPT)
- && ih->ihl * 4 > sizeof(struct iphdr)) {
- unsigned char _opt[4 * 15 - sizeof(struct iphdr)], *op;
- unsigned int i, optsize;
-
- optsize = ih->ihl * 4 - sizeof(struct iphdr);
- op = skb_header_pointer(skb, iphoff+sizeof(_iph),
- optsize, _opt);
- if (op == NULL) {
- printk("TRUNCATED");
- return;
- }
-
- /* Max length: 127 "OPT (" 15*4*2chars ") " */
- printk("OPT (");
- for (i = 0; i < optsize; i++)
- printk("%02X", op[i]);
- printk(") ");
- }
-
- switch (ih->protocol) {
- case IPPROTO_TCP: {
- struct tcphdr _tcph, *th;
-
- /* Max length: 10 "PROTO=TCP " */
- printk("PROTO=TCP ");
-
- if (ntohs(ih->frag_off) & IP_OFFSET)
- break;
-
- /* Max length: 25 "INCOMPLETE [65535 bytes] " */
- th = skb_header_pointer(skb, iphoff + ih->ihl * 4,
- sizeof(_tcph), &_tcph);
- if (th == NULL) {
- printk("INCOMPLETE [%u bytes] ",
- skb->len - iphoff - ih->ihl*4);
- break;
- }
-
- /* Max length: 20 "SPT=65535 DPT=65535 " */
- printk("SPT=%u DPT=%u ",
- ntohs(th->source), ntohs(th->dest));
- /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */
- if (logflags & IPT_LOG_TCPSEQ)
- printk("SEQ=%u ACK=%u ",
- ntohl(th->seq), ntohl(th->ack_seq));
- /* Max length: 13 "WINDOW=65535 " */
- printk("WINDOW=%u ", ntohs(th->window));
- /* Max length: 9 "RES=0x3F " */
- printk("RES=0x%02x ", (u8)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22));
- /* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */
- if (th->cwr)
- printk("CWR ");
- if (th->ece)
- printk("ECE ");
- if (th->urg)
- printk("URG ");
- if (th->ack)
- printk("ACK ");
- if (th->psh)
- printk("PSH ");
- if (th->rst)
- printk("RST ");
- if (th->syn)
- printk("SYN ");
- if (th->fin)
- printk("FIN ");
- /* Max length: 11 "URGP=65535 " */
- printk("URGP=%u ", ntohs(th->urg_ptr));
-
- if ((logflags & IPT_LOG_TCPOPT)
- && th->doff * 4 > sizeof(struct tcphdr)) {
- unsigned char _opt[4 * 15 - sizeof(struct tcphdr)];
- unsigned char *op;
- unsigned int i, optsize;
-
- optsize = th->doff * 4 - sizeof(struct tcphdr);
- op = skb_header_pointer(skb,
- iphoff+ih->ihl*4+sizeof(_tcph),
- optsize, _opt);
- if (op == NULL) {
- printk("TRUNCATED");
- return;
- }
-
- /* Max length: 127 "OPT (" 15*4*2chars ") " */
- printk("OPT (");
- for (i = 0; i < optsize; i++)
- printk("%02X", op[i]);
- printk(") ");
- }
- break;
- }
- case IPPROTO_UDP:
- case IPPROTO_UDPLITE: {
- struct udphdr _udph, *uh;
-
- if (ih->protocol == IPPROTO_UDP)
- /* Max length: 10 "PROTO=UDP " */
- printk("PROTO=UDP " );
- else /* Max length: 14 "PROTO=UDPLITE " */
- printk("PROTO=UDPLITE ");
-
- if (ntohs(ih->frag_off) & IP_OFFSET)
- break;
-
- /* Max length: 25 "INCOMPLETE [65535 bytes] " */
- uh = skb_header_pointer(skb, iphoff+ih->ihl*4,
- sizeof(_udph), &_udph);
- if (uh == NULL) {
- printk("INCOMPLETE [%u bytes] ",
- skb->len - iphoff - ih->ihl*4);
- break;
- }
-
- /* Max length: 20 "SPT=65535 DPT=65535 " */
- printk("SPT=%u DPT=%u LEN=%u ",
- ntohs(uh->source), ntohs(uh->dest),
- ntohs(uh->len));
- break;
- }
- case IPPROTO_ICMP: {
- struct icmphdr _icmph, *ich;
- static const size_t required_len[NR_ICMP_TYPES+1]
- = { [ICMP_ECHOREPLY] = 4,
- [ICMP_DEST_UNREACH]
- = 8 + sizeof(struct iphdr),
- [ICMP_SOURCE_QUENCH]
- = 8 + sizeof(struct iphdr),
- [ICMP_REDIRECT]
- = 8 + sizeof(struct iphdr),
- [ICMP_ECHO] = 4,
- [ICMP_TIME_EXCEEDED]
- = 8 + sizeof(struct iphdr),
- [ICMP_PARAMETERPROB]
- = 8 + sizeof(struct iphdr),
- [ICMP_TIMESTAMP] = 20,
- [ICMP_TIMESTAMPREPLY] = 20,
- [ICMP_ADDRESS] = 12,
- [ICMP_ADDRESSREPLY] = 12 };
-
- /* Max length: 11 "PROTO=ICMP " */
- printk("PROTO=ICMP ");
-
- if (ntohs(ih->frag_off) & IP_OFFSET)
- break;
-
- /* Max length: 25 "INCOMPLETE [65535 bytes] " */
- ich = skb_header_pointer(skb, iphoff + ih->ihl * 4,
- sizeof(_icmph), &_icmph);
- if (ich == NULL) {
- printk("INCOMPLETE [%u bytes] ",
- skb->len - iphoff - ih->ihl*4);
- break;
- }
-
- /* Max length: 18 "TYPE=255 CODE=255 " */
- printk("TYPE=%u CODE=%u ", ich->type, ich->code);
-
- /* Max length: 25 "INCOMPLETE [65535 bytes] " */
- if (ich->type <= NR_ICMP_TYPES
- && required_len[ich->type]
- && skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) {
- printk("INCOMPLETE [%u bytes] ",
- skb->len - iphoff - ih->ihl*4);
- break;
- }
-
- switch (ich->type) {
- case ICMP_ECHOREPLY:
- case ICMP_ECHO:
- /* Max length: 19 "ID=65535 SEQ=65535 " */
- printk("ID=%u SEQ=%u ",
- ntohs(ich->un.echo.id),
- ntohs(ich->un.echo.sequence));
- break;
-
- case ICMP_PARAMETERPROB:
- /* Max length: 14 "PARAMETER=255 " */
- printk("PARAMETER=%u ",
- ntohl(ich->un.gateway) >> 24);
- break;
- case ICMP_REDIRECT:
- /* Max length: 24 "GATEWAY=255.255.255.255 " */
- printk("GATEWAY=%u.%u.%u.%u ",
- NIPQUAD(ich->un.gateway));
- /* Fall through */
- case ICMP_DEST_UNREACH:
- case ICMP_SOURCE_QUENCH:
- case ICMP_TIME_EXCEEDED:
- /* Max length: 3+maxlen */
- if (!iphoff) { /* Only recurse once. */
- printk("[");
- dump_packet(info, skb,
- iphoff + ih->ihl*4+sizeof(_icmph));
- printk("] ");
- }
-
- /* Max length: 10 "MTU=65535 " */
- if (ich->type == ICMP_DEST_UNREACH
- && ich->code == ICMP_FRAG_NEEDED)
- printk("MTU=%u ", ntohs(ich->un.frag.mtu));
- }
- break;
- }
- /* Max Length */
- case IPPROTO_AH: {
- struct ip_auth_hdr _ahdr, *ah;
-
- if (ntohs(ih->frag_off) & IP_OFFSET)
- break;
-
- /* Max length: 9 "PROTO=AH " */
- printk("PROTO=AH ");
-
- /* Max length: 25 "INCOMPLETE [65535 bytes] " */
- ah = skb_header_pointer(skb, iphoff+ih->ihl*4,
- sizeof(_ahdr), &_ahdr);
- if (ah == NULL) {
- printk("INCOMPLETE [%u bytes] ",
- skb->len - iphoff - ih->ihl*4);
- break;
- }
-
- /* Length: 15 "SPI=0xF1234567 " */
- printk("SPI=0x%x ", ntohl(ah->spi));
- break;
- }
- case IPPROTO_ESP: {
- struct ip_esp_hdr _esph, *eh;
-
- /* Max length: 10 "PROTO=ESP " */
- printk("PROTO=ESP ");
-
- if (ntohs(ih->frag_off) & IP_OFFSET)
- break;
-
- /* Max length: 25 "INCOMPLETE [65535 bytes] " */
- eh = skb_header_pointer(skb, iphoff+ih->ihl*4,
- sizeof(_esph), &_esph);
- if (eh == NULL) {
- printk("INCOMPLETE [%u bytes] ",
- skb->len - iphoff - ih->ihl*4);
- break;
- }
-
- /* Length: 15 "SPI=0xF1234567 " */
- printk("SPI=0x%x ", ntohl(eh->spi));
- break;
- }
- /* Max length: 10 "PROTO 255 " */
- default:
- printk("PROTO=%u ", ih->protocol);
- }
-
- /* Max length: 15 "UID=4294967295 " */
- if ((logflags & IPT_LOG_UID) && !iphoff && skb->sk) {
- read_lock_bh(&skb->sk->sk_callback_lock);
- if (skb->sk->sk_socket && skb->sk->sk_socket->file)
- printk("UID=%u ", skb->sk->sk_socket->file->f_uid);
- read_unlock_bh(&skb->sk->sk_callback_lock);
- }
-
- /* Proto Max log string length */
- /* IP: 40+46+6+11+127 = 230 */
- /* TCP: 10+max(25,20+30+13+9+32+11+127) = 252 */
- /* UDP: 10+max(25,20) = 35 */
- /* UDPLITE: 14+max(25,20) = 39 */
- /* ICMP: 11+max(25, 18+25+max(19,14,24+3+n+10,3+n+10)) = 91+n */
- /* ESP: 10+max(25)+15 = 50 */
- /* AH: 9+max(25)+15 = 49 */
- /* unknown: 10 */
-
- /* (ICMP allows recursion one level deep) */
- /* maxlen = IP + ICMP + IP + max(TCP,UDP,ICMP,unknown) */
- /* maxlen = 230+ 91 + 230 + 252 = 803 */
-}
-
-static struct nf_loginfo default_loginfo = {
- .type = NF_LOG_TYPE_LOG,
- .u = {
- .log = {
- .level = 0,
- .logflags = NF_LOG_MASK,
- },
- },
-};
-
-static void
-ipt_log_packet(unsigned int pf,
- unsigned int hooknum,
- const struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- const struct nf_loginfo *loginfo,
- const char *prefix)
-{
- if (!loginfo)
- loginfo = &default_loginfo;
-
- spin_lock_bh(&log_lock);
- printk("<%d>%sIN=%s OUT=%s ", loginfo->u.log.level,
- prefix,
- in ? in->name : "",
- out ? out->name : "");
-#ifdef CONFIG_BRIDGE_NETFILTER
- if (skb->nf_bridge) {
- struct net_device *physindev = skb->nf_bridge->physindev;
- struct net_device *physoutdev = skb->nf_bridge->physoutdev;
-
- if (physindev && in != physindev)
- printk("PHYSIN=%s ", physindev->name);
- if (physoutdev && out != physoutdev)
- printk("PHYSOUT=%s ", physoutdev->name);
- }
-#endif
-
- if (in && !out) {
- /* MAC logging for input chain only. */
- printk("MAC=");
- if (skb->dev && skb->dev->hard_header_len
- && skb->mac.raw != (void*)skb->nh.iph) {
- int i;
- unsigned char *p = skb->mac.raw;
- for (i = 0; i < skb->dev->hard_header_len; i++,p++)
- printk("%02x%c", *p,
- i==skb->dev->hard_header_len - 1
- ? ' ':':');
- } else
- printk(" ");
- }
-
- dump_packet(loginfo, skb, 0);
- printk("\n");
- spin_unlock_bh(&log_lock);
-}
-
-static unsigned int
-ipt_log_target(struct sk_buff **pskb,
- const struct net_device *in,
- const struct net_device *out,
- unsigned int hooknum,
- const struct xt_target *target,
- const void *targinfo)
-{
- const struct ipt_log_info *loginfo = targinfo;
- struct nf_loginfo li;
-
- li.type = NF_LOG_TYPE_LOG;
- li.u.log.level = loginfo->level;
- li.u.log.logflags = loginfo->logflags;
-
- if (loginfo->logflags & IPT_LOG_NFLOG)
- nf_log_packet(PF_INET, hooknum, *pskb, in, out, &li,
- "%s", loginfo->prefix);
- else
- ipt_log_packet(PF_INET, hooknum, *pskb, in, out, &li,
- loginfo->prefix);
-
- return IPT_CONTINUE;
-}
-
-static int ipt_log_checkentry(const char *tablename,
- const void *e,
- const struct xt_target *target,
- void *targinfo,
- unsigned int hook_mask)
-{
- const struct ipt_log_info *loginfo = targinfo;
-
- if (loginfo->level >= 8) {
- DEBUGP("LOG: level %u >= 8\n", loginfo->level);
- return 0;
- }
- if (loginfo->prefix[sizeof(loginfo->prefix)-1] != '\0') {
- DEBUGP("LOG: prefix term %i\n",
- loginfo->prefix[sizeof(loginfo->prefix)-1]);
- return 0;
- }
- return 1;
-}
-
-static struct ipt_target ipt_log_reg = {
- .name = "LOG",
- .target = ipt_log_target,
- .targetsize = sizeof(struct ipt_log_info),
- .checkentry = ipt_log_checkentry,
- .me = THIS_MODULE,
-};
-
-static struct nf_logger ipt_log_logger ={
- .name = "ipt_LOG",
- .logfn = &ipt_log_packet,
- .me = THIS_MODULE,
-};
-
-static int __init ipt_log_init(void)
-{
- if (ipt_register_target(&ipt_log_reg))
- return -EINVAL;
- if (nf_log_register(PF_INET, &ipt_log_logger) < 0) {
- printk(KERN_WARNING "ipt_LOG: not logging via system console "
- "since somebody else already registered for PF_INET\n");
- /* we cannot make module load fail here, since otherwise
- * iptables userspace would abort */
- }
-
- return 0;
-}
-
-static void __exit ipt_log_fini(void)
-{
- nf_log_unregister_logger(&ipt_log_logger);
- ipt_unregister_target(&ipt_log_reg);
-}
-
-module_init(ipt_log_init);
-module_exit(ipt_log_fini);
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
deleted file mode 100644
index 3dbfcfac8a84..000000000000
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ /dev/null
@@ -1,199 +0,0 @@
-/* Masquerade. Simple mapping which alters range to a local IP address
- (depending on route). */
-
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/types.h>
-#include <linux/inetdevice.h>
-#include <linux/ip.h>
-#include <linux/timer.h>
-#include <linux/module.h>
-#include <linux/netfilter.h>
-#include <net/protocol.h>
-#include <net/ip.h>
-#include <net/checksum.h>
-#include <net/route.h>
-#include <linux/netfilter_ipv4.h>
-#include <linux/netfilter_ipv4/ip_nat_rule.h>
-#include <linux/netfilter_ipv4/ip_tables.h>
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
-MODULE_DESCRIPTION("iptables MASQUERADE target module");
-
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
-/* Lock protects masq region inside conntrack */
-static DEFINE_RWLOCK(masq_lock);
-
-/* FIXME: Multiple targets. --RR */
-static int
-masquerade_check(const char *tablename,
- const void *e,
- const struct xt_target *target,
- void *targinfo,
- unsigned int hook_mask)
-{
- const struct ip_nat_multi_range_compat *mr = targinfo;
-
- if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) {
- DEBUGP("masquerade_check: bad MAP_IPS.\n");
- return 0;
- }
- if (mr->rangesize != 1) {
- DEBUGP("masquerade_check: bad rangesize %u.\n", mr->rangesize);
- return 0;
- }
- return 1;
-}
-
-static unsigned int
-masquerade_target(struct sk_buff **pskb,
- const struct net_device *in,
- const struct net_device *out,
- unsigned int hooknum,
- const struct xt_target *target,
- const void *targinfo)
-{
- struct ip_conntrack *ct;
- enum ip_conntrack_info ctinfo;
- const struct ip_nat_multi_range_compat *mr;
- struct ip_nat_range newrange;
- struct rtable *rt;
- __be32 newsrc;
-
- IP_NF_ASSERT(hooknum == NF_IP_POST_ROUTING);
-
- ct = ip_conntrack_get(*pskb, &ctinfo);
- IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED
- || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY));
-
- /* Source address is 0.0.0.0 - locally generated packet that is
- * probably not supposed to be masqueraded.
- */
- if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip == 0)
- return NF_ACCEPT;
-
- mr = targinfo;
- rt = (struct rtable *)(*pskb)->dst;
- newsrc = inet_select_addr(out, rt->rt_gateway, RT_SCOPE_UNIVERSE);
- if (!newsrc) {
- printk("MASQUERADE: %s ate my IP address\n", out->name);
- return NF_DROP;
- }
-
- write_lock_bh(&masq_lock);
- ct->nat.masq_index = out->ifindex;
- write_unlock_bh(&masq_lock);
-
- /* Transfer from original range. */
- newrange = ((struct ip_nat_range)
- { mr->range[0].flags | IP_NAT_RANGE_MAP_IPS,
- newsrc, newsrc,
- mr->range[0].min, mr->range[0].max });
-
- /* Hand modified range to generic setup. */
- return ip_nat_setup_info(ct, &newrange, hooknum);
-}
-
-static inline int
-device_cmp(struct ip_conntrack *i, void *ifindex)
-{
- int ret;
-
- read_lock_bh(&masq_lock);
- ret = (i->nat.masq_index == (int)(long)ifindex);
- read_unlock_bh(&masq_lock);
-
- return ret;
-}
-
-static int masq_device_event(struct notifier_block *this,
- unsigned long event,
- void *ptr)
-{
- struct net_device *dev = ptr;
-
- if (event == NETDEV_DOWN) {
- /* Device was downed. Search entire table for
- conntracks which were associated with that device,
- and forget them. */
- IP_NF_ASSERT(dev->ifindex != 0);
-
- ip_ct_iterate_cleanup(device_cmp, (void *)(long)dev->ifindex);
- }
-
- return NOTIFY_DONE;
-}
-
-static int masq_inet_event(struct notifier_block *this,
- unsigned long event,
- void *ptr)
-{
- struct net_device *dev = ((struct in_ifaddr *)ptr)->ifa_dev->dev;
-
- if (event == NETDEV_DOWN) {
- /* IP address was deleted. Search entire table for
- conntracks which were associated with that device,
- and forget them. */
- IP_NF_ASSERT(dev->ifindex != 0);
-
- ip_ct_iterate_cleanup(device_cmp, (void *)(long)dev->ifindex);
- }
-
- return NOTIFY_DONE;
-}
-
-static struct notifier_block masq_dev_notifier = {
- .notifier_call = masq_device_event,
-};
-
-static struct notifier_block masq_inet_notifier = {
- .notifier_call = masq_inet_event,
-};
-
-static struct ipt_target masquerade = {
- .name = "MASQUERADE",
- .target = masquerade_target,
- .targetsize = sizeof(struct ip_nat_multi_range_compat),
- .table = "nat",
- .hooks = 1 << NF_IP_POST_ROUTING,
- .checkentry = masquerade_check,
- .me = THIS_MODULE,
-};
-
-static int __init ipt_masquerade_init(void)
-{
- int ret;
-
- ret = ipt_register_target(&masquerade);
-
- if (ret == 0) {
- /* Register for device down reports */
- register_netdevice_notifier(&masq_dev_notifier);
- /* Register IP address change reports */
- register_inetaddr_notifier(&masq_inet_notifier);
- }
-
- return ret;
-}
-
-static void __exit ipt_masquerade_fini(void)
-{
- ipt_unregister_target(&masquerade);
- unregister_netdevice_notifier(&masq_dev_notifier);
- unregister_inetaddr_notifier(&masq_inet_notifier);
-}
-
-module_init(ipt_masquerade_init);
-module_exit(ipt_masquerade_fini);
diff --git a/net/ipv4/netfilter/ipt_NETMAP.c b/net/ipv4/netfilter/ipt_NETMAP.c
deleted file mode 100644
index 58a88f227108..000000000000
--- a/net/ipv4/netfilter/ipt_NETMAP.c
+++ /dev/null
@@ -1,109 +0,0 @@
-/* NETMAP - static NAT mapping of IP network addresses (1:1).
- * The mapping can be applied to source (POSTROUTING),
- * destination (PREROUTING), or both (with separate rules).
- */
-
-/* (C) 2000-2001 Svenning Soerensen <svenning@post5.tele.dk>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/ip.h>
-#include <linux/module.h>
-#include <linux/netdevice.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv4.h>
-#include <linux/netfilter_ipv4/ip_nat_rule.h>
-
-#define MODULENAME "NETMAP"
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Svenning Soerensen <svenning@post5.tele.dk>");
-MODULE_DESCRIPTION("iptables 1:1 NAT mapping of IP networks target");
-
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
-static int
-check(const char *tablename,
- const void *e,
- const struct xt_target *target,
- void *targinfo,
- unsigned int hook_mask)
-{
- const struct ip_nat_multi_range_compat *mr = targinfo;
-
- if (!(mr->range[0].flags & IP_NAT_RANGE_MAP_IPS)) {
- DEBUGP(MODULENAME":check: bad MAP_IPS.\n");
- return 0;
- }
- if (mr->rangesize != 1) {
- DEBUGP(MODULENAME":check: bad rangesize %u.\n", mr->rangesize);
- return 0;
- }
- return 1;
-}
-
-static unsigned int
-target(struct sk_buff **pskb,
- const struct net_device *in,
- const struct net_device *out,
- unsigned int hooknum,
- const struct xt_target *target,
- const void *targinfo)
-{
- struct ip_conntrack *ct;
- enum ip_conntrack_info ctinfo;
- __be32 new_ip, netmask;
- const struct ip_nat_multi_range_compat *mr = targinfo;
- struct ip_nat_range newrange;
-
- IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING
- || hooknum == NF_IP_POST_ROUTING
- || hooknum == NF_IP_LOCAL_OUT);
- ct = ip_conntrack_get(*pskb, &ctinfo);
-
- netmask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip);
-
- if (hooknum == NF_IP_PRE_ROUTING || hooknum == NF_IP_LOCAL_OUT)
- new_ip = (*pskb)->nh.iph->daddr & ~netmask;
- else
- new_ip = (*pskb)->nh.iph->saddr & ~netmask;
- new_ip |= mr->range[0].min_ip & netmask;
-
- newrange = ((struct ip_nat_range)
- { mr->range[0].flags | IP_NAT_RANGE_MAP_IPS,
- new_ip, new_ip,
- mr->range[0].min, mr->range[0].max });
-
- /* Hand modified range to generic setup. */
- return ip_nat_setup_info(ct, &newrange, hooknum);
-}
-
-static struct ipt_target target_module = {
- .name = MODULENAME,
- .target = target,
- .targetsize = sizeof(struct ip_nat_multi_range_compat),
- .table = "nat",
- .hooks = (1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_POST_ROUTING) |
- (1 << NF_IP_LOCAL_OUT),
- .checkentry = check,
- .me = THIS_MODULE
-};
-
-static int __init ipt_netmap_init(void)
-{
- return ipt_register_target(&target_module);
-}
-
-static void __exit ipt_netmap_fini(void)
-{
- ipt_unregister_target(&target_module);
-}
-
-module_init(ipt_netmap_init);
-module_exit(ipt_netmap_fini);
diff --git a/net/ipv4/netfilter/ipt_REDIRECT.c b/net/ipv4/netfilter/ipt_REDIRECT.c
deleted file mode 100644
index c0dcfe9d610c..000000000000
--- a/net/ipv4/netfilter/ipt_REDIRECT.c
+++ /dev/null
@@ -1,124 +0,0 @@
-/* Redirect. Simple mapping which alters dst to a local IP address. */
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/types.h>
-#include <linux/ip.h>
-#include <linux/timer.h>
-#include <linux/module.h>
-#include <linux/netfilter.h>
-#include <linux/netdevice.h>
-#include <linux/if.h>
-#include <linux/inetdevice.h>
-#include <net/protocol.h>
-#include <net/checksum.h>
-#include <linux/netfilter_ipv4.h>
-#include <linux/netfilter_ipv4/ip_nat_rule.h>
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
-MODULE_DESCRIPTION("iptables REDIRECT target module");
-
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
-/* FIXME: Take multiple ranges --RR */
-static int
-redirect_check(const char *tablename,
- const void *e,
- const struct xt_target *target,
- void *targinfo,
- unsigned int hook_mask)
-{
- const struct ip_nat_multi_range_compat *mr = targinfo;
-
- if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) {
- DEBUGP("redirect_check: bad MAP_IPS.\n");
- return 0;
- }
- if (mr->rangesize != 1) {
- DEBUGP("redirect_check: bad rangesize %u.\n", mr->rangesize);
- return 0;
- }
- return 1;
-}
-
-static unsigned int
-redirect_target(struct sk_buff **pskb,
- const struct net_device *in,
- const struct net_device *out,
- unsigned int hooknum,
- const struct xt_target *target,
- const void *targinfo)
-{
- struct ip_conntrack *ct;
- enum ip_conntrack_info ctinfo;
- __be32 newdst;
- const struct ip_nat_multi_range_compat *mr = targinfo;
- struct ip_nat_range newrange;
-
- IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING
- || hooknum == NF_IP_LOCAL_OUT);
-
- ct = ip_conntrack_get(*pskb, &ctinfo);
- IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED));
-
- /* Local packets: make them go to loopback */
- if (hooknum == NF_IP_LOCAL_OUT)
- newdst = htonl(0x7F000001);
- else {
- struct in_device *indev;
- struct in_ifaddr *ifa;
-
- newdst = 0;
-
- rcu_read_lock();
- indev = __in_dev_get_rcu((*pskb)->dev);
- if (indev && (ifa = indev->ifa_list))
- newdst = ifa->ifa_local;
- rcu_read_unlock();
-
- if (!newdst)
- return NF_DROP;
- }
-
- /* Transfer from original range. */
- newrange = ((struct ip_nat_range)
- { mr->range[0].flags | IP_NAT_RANGE_MAP_IPS,
- newdst, newdst,
- mr->range[0].min, mr->range[0].max });
-
- /* Hand modified range to generic setup. */
- return ip_nat_setup_info(ct, &newrange, hooknum);
-}
-
-static struct ipt_target redirect_reg = {
- .name = "REDIRECT",
- .target = redirect_target,
- .targetsize = sizeof(struct ip_nat_multi_range_compat),
- .table = "nat",
- .hooks = (1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_OUT),
- .checkentry = redirect_check,
- .me = THIS_MODULE,
-};
-
-static int __init ipt_redirect_init(void)
-{
- return ipt_register_target(&redirect_reg);
-}
-
-static void __exit ipt_redirect_fini(void)
-{
- ipt_unregister_target(&redirect_reg);
-}
-
-module_init(ipt_redirect_init);
-module_exit(ipt_redirect_fini);
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index f0319e5ee437..4b8840734762 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -1,212 +1,63 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* This is a module which is used for rejecting packets.
- * Added support for customized reject packets (Jozsef Kadlecsik).
- * Added support for ICMP type-3-code-13 (Maciej Soltysiak). [RFC 1812]
*/
/* (C) 1999-2001 Paul `Rusty' Russell
* (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
-
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
#include <linux/skbuff.h>
+#include <linux/slab.h>
#include <linux/ip.h>
#include <linux/udp.h>
#include <linux/icmp.h>
#include <net/icmp.h>
-#include <net/ip.h>
-#include <net/tcp.h>
-#include <net/route.h>
-#include <net/dst.h>
+#include <linux/netfilter/x_tables.h>
#include <linux/netfilter_ipv4/ip_tables.h>
#include <linux/netfilter_ipv4/ipt_REJECT.h>
-#ifdef CONFIG_BRIDGE_NETFILTER
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
#include <linux/netfilter_bridge.h>
#endif
+#include <net/netfilter/ipv4/nf_reject.h>
+
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
-MODULE_DESCRIPTION("iptables REJECT target module");
-
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
-/* Send RST reply */
-static void send_reset(struct sk_buff *oldskb, int hook)
-{
- struct sk_buff *nskb;
- struct iphdr *iph = oldskb->nh.iph;
- struct tcphdr _otcph, *oth, *tcph;
- __be16 tmp_port;
- __be32 tmp_addr;
- int needs_ack;
- unsigned int addr_type;
-
- /* IP header checks: fragment. */
- if (oldskb->nh.iph->frag_off & htons(IP_OFFSET))
- return;
-
- oth = skb_header_pointer(oldskb, oldskb->nh.iph->ihl * 4,
- sizeof(_otcph), &_otcph);
- if (oth == NULL)
- return;
-
- /* No RST for RST. */
- if (oth->rst)
- return;
-
- /* Check checksum */
- if (nf_ip_checksum(oldskb, hook, iph->ihl * 4, IPPROTO_TCP))
- return;
-
- /* We need a linear, writeable skb. We also need to expand
- headroom in case hh_len of incoming interface < hh_len of
- outgoing interface */
- nskb = skb_copy_expand(oldskb, LL_MAX_HEADER, skb_tailroom(oldskb),
- GFP_ATOMIC);
- if (!nskb)
- return;
-
- /* This packet will not be the same as the other: clear nf fields */
- nf_reset(nskb);
- nskb->mark = 0;
- skb_init_secmark(nskb);
-
- tcph = (struct tcphdr *)((u_int32_t*)nskb->nh.iph + nskb->nh.iph->ihl);
-
- /* Swap source and dest */
- tmp_addr = nskb->nh.iph->saddr;
- nskb->nh.iph->saddr = nskb->nh.iph->daddr;
- nskb->nh.iph->daddr = tmp_addr;
- tmp_port = tcph->source;
- tcph->source = tcph->dest;
- tcph->dest = tmp_port;
-
- /* Truncate to length (no data) */
- tcph->doff = sizeof(struct tcphdr)/4;
- skb_trim(nskb, nskb->nh.iph->ihl*4 + sizeof(struct tcphdr));
- nskb->nh.iph->tot_len = htons(nskb->len);
-
- if (tcph->ack) {
- needs_ack = 0;
- tcph->seq = oth->ack_seq;
- tcph->ack_seq = 0;
- } else {
- needs_ack = 1;
- tcph->ack_seq = htonl(ntohl(oth->seq) + oth->syn + oth->fin
- + oldskb->len - oldskb->nh.iph->ihl*4
- - (oth->doff<<2));
- tcph->seq = 0;
- }
-
- /* Reset flags */
- ((u_int8_t *)tcph)[13] = 0;
- tcph->rst = 1;
- tcph->ack = needs_ack;
-
- tcph->window = 0;
- tcph->urg_ptr = 0;
-
- /* Adjust TCP checksum */
- tcph->check = 0;
- tcph->check = tcp_v4_check(tcph, sizeof(struct tcphdr),
- nskb->nh.iph->saddr,
- nskb->nh.iph->daddr,
- csum_partial((char *)tcph,
- sizeof(struct tcphdr), 0));
-
- /* Set DF, id = 0 */
- nskb->nh.iph->frag_off = htons(IP_DF);
- nskb->nh.iph->id = 0;
+MODULE_DESCRIPTION("Xtables: packet \"rejection\" target for IPv4");
- addr_type = RTN_UNSPEC;
- if (hook != NF_IP_FORWARD
-#ifdef CONFIG_BRIDGE_NETFILTER
- || (nskb->nf_bridge && nskb->nf_bridge->mask & BRNF_BRIDGED)
-#endif
- )
- addr_type = RTN_LOCAL;
-
- if (ip_route_me_harder(&nskb, addr_type))
- goto free_nskb;
-
- nskb->ip_summed = CHECKSUM_NONE;
-
- /* Adjust IP TTL */
- nskb->nh.iph->ttl = dst_metric(nskb->dst, RTAX_HOPLIMIT);
-
- /* Adjust IP checksum */
- nskb->nh.iph->check = 0;
- nskb->nh.iph->check = ip_fast_csum((unsigned char *)nskb->nh.iph,
- nskb->nh.iph->ihl);
-
- /* "Never happens" */
- if (nskb->len > dst_mtu(nskb->dst))
- goto free_nskb;
-
- nf_ct_attach(nskb, oldskb);
-
- NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, nskb, NULL, nskb->dst->dev,
- dst_output);
- return;
-
- free_nskb:
- kfree_skb(nskb);
-}
-
-static inline void send_unreach(struct sk_buff *skb_in, int code)
+static unsigned int
+reject_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
- icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0);
-}
+ const struct ipt_reject_info *reject = par->targinfo;
+ int hook = xt_hooknum(par);
-static unsigned int reject(struct sk_buff **pskb,
- const struct net_device *in,
- const struct net_device *out,
- unsigned int hooknum,
- const struct xt_target *target,
- const void *targinfo)
-{
- const struct ipt_reject_info *reject = targinfo;
-
- /* Our naive response construction doesn't deal with IP
- options, and probably shouldn't try. */
- if ((*pskb)->nh.iph->ihl<<2 != sizeof(struct iphdr))
- return NF_DROP;
-
- /* WARNING: This code causes reentry within iptables.
- This means that the iptables jump stack is now crap. We
- must return an absolute verdict. --RR */
- switch (reject->with) {
- case IPT_ICMP_NET_UNREACHABLE:
- send_unreach(*pskb, ICMP_NET_UNREACH);
- break;
- case IPT_ICMP_HOST_UNREACHABLE:
- send_unreach(*pskb, ICMP_HOST_UNREACH);
- break;
- case IPT_ICMP_PROT_UNREACHABLE:
- send_unreach(*pskb, ICMP_PROT_UNREACH);
- break;
- case IPT_ICMP_PORT_UNREACHABLE:
- send_unreach(*pskb, ICMP_PORT_UNREACH);
- break;
- case IPT_ICMP_NET_PROHIBITED:
- send_unreach(*pskb, ICMP_NET_ANO);
- break;
+ switch (reject->with) {
+ case IPT_ICMP_NET_UNREACHABLE:
+ nf_send_unreach(skb, ICMP_NET_UNREACH, hook);
+ break;
+ case IPT_ICMP_HOST_UNREACHABLE:
+ nf_send_unreach(skb, ICMP_HOST_UNREACH, hook);
+ break;
+ case IPT_ICMP_PROT_UNREACHABLE:
+ nf_send_unreach(skb, ICMP_PROT_UNREACH, hook);
+ break;
+ case IPT_ICMP_PORT_UNREACHABLE:
+ nf_send_unreach(skb, ICMP_PORT_UNREACH, hook);
+ break;
+ case IPT_ICMP_NET_PROHIBITED:
+ nf_send_unreach(skb, ICMP_NET_ANO, hook);
+ break;
case IPT_ICMP_HOST_PROHIBITED:
- send_unreach(*pskb, ICMP_HOST_ANO);
- break;
- case IPT_ICMP_ADMIN_PROHIBITED:
- send_unreach(*pskb, ICMP_PKT_FILTERED);
+ nf_send_unreach(skb, ICMP_HOST_ANO, hook);
+ break;
+ case IPT_ICMP_ADMIN_PROHIBITED:
+ nf_send_unreach(skb, ICMP_PKT_FILTERED, hook);
break;
case IPT_TCP_RESET:
- send_reset(*pskb, hooknum);
+ nf_send_reset(xt_net(par), par->state->sk, skb, hook);
+ break;
case IPT_ICMP_ECHOREPLY:
/* Doesn't happen. */
break;
@@ -215,49 +66,46 @@ static unsigned int reject(struct sk_buff **pskb,
return NF_DROP;
}
-static int check(const char *tablename,
- const void *e_void,
- const struct xt_target *target,
- void *targinfo,
- unsigned int hook_mask)
+static int reject_tg_check(const struct xt_tgchk_param *par)
{
- const struct ipt_reject_info *rejinfo = targinfo;
- const struct ipt_entry *e = e_void;
+ const struct ipt_reject_info *rejinfo = par->targinfo;
+ const struct ipt_entry *e = par->entryinfo;
if (rejinfo->with == IPT_ICMP_ECHOREPLY) {
- printk("REJECT: ECHOREPLY no longer supported.\n");
- return 0;
+ pr_info_ratelimited("ECHOREPLY no longer supported.\n");
+ return -EINVAL;
} else if (rejinfo->with == IPT_TCP_RESET) {
/* Must specify that it's a TCP packet */
- if (e->ip.proto != IPPROTO_TCP
- || (e->ip.invflags & IPT_INV_PROTO)) {
- DEBUGP("REJECT: TCP_RESET invalid for non-tcp\n");
- return 0;
+ if (e->ip.proto != IPPROTO_TCP ||
+ (e->ip.invflags & XT_INV_PROTO)) {
+ pr_info_ratelimited("TCP_RESET invalid for non-tcp\n");
+ return -EINVAL;
}
}
- return 1;
+ return 0;
}
-static struct ipt_target ipt_reject_reg = {
+static struct xt_target reject_tg_reg __read_mostly = {
.name = "REJECT",
- .target = reject,
+ .family = NFPROTO_IPV4,
+ .target = reject_tg,
.targetsize = sizeof(struct ipt_reject_info),
.table = "filter",
- .hooks = (1 << NF_IP_LOCAL_IN) | (1 << NF_IP_FORWARD) |
- (1 << NF_IP_LOCAL_OUT),
- .checkentry = check,
+ .hooks = (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD) |
+ (1 << NF_INET_LOCAL_OUT),
+ .checkentry = reject_tg_check,
.me = THIS_MODULE,
};
-static int __init ipt_reject_init(void)
+static int __init reject_tg_init(void)
{
- return ipt_register_target(&ipt_reject_reg);
+ return xt_register_target(&reject_tg_reg);
}
-static void __exit ipt_reject_fini(void)
+static void __exit reject_tg_exit(void)
{
- ipt_unregister_target(&ipt_reject_reg);
+ xt_unregister_target(&reject_tg_reg);
}
-module_init(ipt_reject_init);
-module_exit(ipt_reject_fini);
+module_init(reject_tg_init);
+module_exit(reject_tg_exit);
diff --git a/net/ipv4/netfilter/ipt_SAME.c b/net/ipv4/netfilter/ipt_SAME.c
deleted file mode 100644
index b38b13328d73..000000000000
--- a/net/ipv4/netfilter/ipt_SAME.c
+++ /dev/null
@@ -1,202 +0,0 @@
-/* Same. Just like SNAT, only try to make the connections
- * between client A and server B always have the same source ip.
- *
- * (C) 2000 Paul `Rusty' Russell
- * (C) 2001 Martin Josefsson
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * 010320 Martin Josefsson <gandalf@wlug.westbo.se>
- * * copied ipt_BALANCE.c to ipt_SAME.c and changed a few things.
- * 010728 Martin Josefsson <gandalf@wlug.westbo.se>
- * * added --nodst to not include destination-ip in new source
- * calculations.
- * * added some more sanity-checks.
- * 010729 Martin Josefsson <gandalf@wlug.westbo.se>
- * * fixed a buggy if-statement in same_check(), should have
- * used ntohl() but didn't.
- * * added support for multiple ranges. IPT_SAME_MAX_RANGE is
- * defined in linux/include/linux/netfilter_ipv4/ipt_SAME.h
- * and is currently set to 10.
- * * added support for 1-address range, nice to have now that
- * we have multiple ranges.
- */
-#include <linux/types.h>
-#include <linux/ip.h>
-#include <linux/timer.h>
-#include <linux/module.h>
-#include <linux/netfilter.h>
-#include <linux/netdevice.h>
-#include <linux/if.h>
-#include <linux/inetdevice.h>
-#include <net/protocol.h>
-#include <net/checksum.h>
-#include <linux/netfilter_ipv4.h>
-#include <linux/netfilter_ipv4/ip_nat_rule.h>
-#include <linux/netfilter_ipv4/ipt_SAME.h>
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Martin Josefsson <gandalf@wlug.westbo.se>");
-MODULE_DESCRIPTION("iptables special SNAT module for consistent sourceip");
-
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
-static int
-same_check(const char *tablename,
- const void *e,
- const struct xt_target *target,
- void *targinfo,
- unsigned int hook_mask)
-{
- unsigned int count, countess, rangeip, index = 0;
- struct ipt_same_info *mr = targinfo;
-
- mr->ipnum = 0;
-
- if (mr->rangesize < 1) {
- DEBUGP("same_check: need at least one dest range.\n");
- return 0;
- }
- if (mr->rangesize > IPT_SAME_MAX_RANGE) {
- DEBUGP("same_check: too many ranges specified, maximum "
- "is %u ranges\n",
- IPT_SAME_MAX_RANGE);
- return 0;
- }
- for (count = 0; count < mr->rangesize; count++) {
- if (ntohl(mr->range[count].min_ip) >
- ntohl(mr->range[count].max_ip)) {
- DEBUGP("same_check: min_ip is larger than max_ip in "
- "range `%u.%u.%u.%u-%u.%u.%u.%u'.\n",
- NIPQUAD(mr->range[count].min_ip),
- NIPQUAD(mr->range[count].max_ip));
- return 0;
- }
- if (!(mr->range[count].flags & IP_NAT_RANGE_MAP_IPS)) {
- DEBUGP("same_check: bad MAP_IPS.\n");
- return 0;
- }
- rangeip = (ntohl(mr->range[count].max_ip) -
- ntohl(mr->range[count].min_ip) + 1);
- mr->ipnum += rangeip;
-
- DEBUGP("same_check: range %u, ipnum = %u\n", count, rangeip);
- }
- DEBUGP("same_check: total ipaddresses = %u\n", mr->ipnum);
-
- mr->iparray = kmalloc((sizeof(u_int32_t) * mr->ipnum), GFP_KERNEL);
- if (!mr->iparray) {
- DEBUGP("same_check: Couldn't allocate %u bytes "
- "for %u ipaddresses!\n",
- (sizeof(u_int32_t) * mr->ipnum), mr->ipnum);
- return 0;
- }
- DEBUGP("same_check: Allocated %u bytes for %u ipaddresses.\n",
- (sizeof(u_int32_t) * mr->ipnum), mr->ipnum);
-
- for (count = 0; count < mr->rangesize; count++) {
- for (countess = ntohl(mr->range[count].min_ip);
- countess <= ntohl(mr->range[count].max_ip);
- countess++) {
- mr->iparray[index] = countess;
- DEBUGP("same_check: Added ipaddress `%u.%u.%u.%u' "
- "in index %u.\n",
- HIPQUAD(countess), index);
- index++;
- }
- }
- return 1;
-}
-
-static void
-same_destroy(const struct xt_target *target, void *targinfo)
-{
- struct ipt_same_info *mr = targinfo;
-
- kfree(mr->iparray);
-
- DEBUGP("same_destroy: Deallocated %u bytes for %u ipaddresses.\n",
- (sizeof(u_int32_t) * mr->ipnum), mr->ipnum);
-}
-
-static unsigned int
-same_target(struct sk_buff **pskb,
- const struct net_device *in,
- const struct net_device *out,
- unsigned int hooknum,
- const struct xt_target *target,
- const void *targinfo)
-{
- struct ip_conntrack *ct;
- enum ip_conntrack_info ctinfo;
- u_int32_t tmpip, aindex;
- __be32 new_ip;
- const struct ipt_same_info *same = targinfo;
- struct ip_nat_range newrange;
- const struct ip_conntrack_tuple *t;
-
- IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING ||
- hooknum == NF_IP_POST_ROUTING);
- ct = ip_conntrack_get(*pskb, &ctinfo);
-
- t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
-
- /* Base new source on real src ip and optionally dst ip,
- giving some hope for consistency across reboots.
- Here we calculate the index in same->iparray which
- holds the ipaddress we should use */
-
- tmpip = ntohl(t->src.ip);
-
- if (!(same->info & IPT_SAME_NODST))
- tmpip += ntohl(t->dst.ip);
-
- aindex = tmpip % same->ipnum;
-
- new_ip = htonl(same->iparray[aindex]);
-
- DEBUGP("ipt_SAME: src=%u.%u.%u.%u dst=%u.%u.%u.%u, "
- "new src=%u.%u.%u.%u\n",
- NIPQUAD(t->src.ip), NIPQUAD(t->dst.ip),
- NIPQUAD(new_ip));
-
- /* Transfer from original range. */
- newrange = ((struct ip_nat_range)
- { same->range[0].flags, new_ip, new_ip,
- /* FIXME: Use ports from correct range! */
- same->range[0].min, same->range[0].max });
-
- /* Hand modified range to generic setup. */
- return ip_nat_setup_info(ct, &newrange, hooknum);
-}
-
-static struct ipt_target same_reg = {
- .name = "SAME",
- .target = same_target,
- .targetsize = sizeof(struct ipt_same_info),
- .table = "nat",
- .hooks = (1 << NF_IP_PRE_ROUTING | 1 << NF_IP_POST_ROUTING),
- .checkentry = same_check,
- .destroy = same_destroy,
- .me = THIS_MODULE,
-};
-
-static int __init ipt_same_init(void)
-{
- return ipt_register_target(&same_reg);
-}
-
-static void __exit ipt_same_fini(void)
-{
- ipt_unregister_target(&same_reg);
-}
-
-module_init(ipt_same_init);
-module_exit(ipt_same_fini);
-
diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c
new file mode 100644
index 000000000000..f2984c7eef40
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_SYNPROXY.c
@@ -0,0 +1,121 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2013 Patrick McHardy <kaber@trash.net>
+ */
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_SYNPROXY.h>
+
+#include <net/netfilter/nf_synproxy.h>
+
+static unsigned int
+synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct xt_synproxy_info *info = par->targinfo;
+ struct net *net = xt_net(par);
+ struct synproxy_net *snet = synproxy_pernet(net);
+ struct synproxy_options opts = {};
+ struct tcphdr *th, _th;
+
+ if (nf_ip_checksum(skb, xt_hooknum(par), par->thoff, IPPROTO_TCP))
+ return NF_DROP;
+
+ th = skb_header_pointer(skb, par->thoff, sizeof(_th), &_th);
+ if (th == NULL)
+ return NF_DROP;
+
+ if (!synproxy_parse_options(skb, par->thoff, th, &opts))
+ return NF_DROP;
+
+ if (th->syn && !(th->ack || th->fin || th->rst)) {
+ /* Initial SYN from client */
+ this_cpu_inc(snet->stats->syn_received);
+
+ if (th->ece && th->cwr)
+ opts.options |= XT_SYNPROXY_OPT_ECN;
+
+ opts.options &= info->options;
+ opts.mss_encode = opts.mss_option;
+ opts.mss_option = info->mss;
+ if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP)
+ synproxy_init_timestamp_cookie(info, &opts);
+ else
+ opts.options &= ~(XT_SYNPROXY_OPT_WSCALE |
+ XT_SYNPROXY_OPT_SACK_PERM |
+ XT_SYNPROXY_OPT_ECN);
+
+ synproxy_send_client_synack(net, skb, th, &opts);
+ consume_skb(skb);
+ return NF_STOLEN;
+ } else if (th->ack && !(th->fin || th->rst || th->syn)) {
+ /* ACK from client */
+ if (synproxy_recv_client_ack(net, skb, th, &opts, ntohl(th->seq))) {
+ consume_skb(skb);
+ return NF_STOLEN;
+ } else {
+ return NF_DROP;
+ }
+ }
+
+ return XT_CONTINUE;
+}
+
+static int synproxy_tg4_check(const struct xt_tgchk_param *par)
+{
+ struct synproxy_net *snet = synproxy_pernet(par->net);
+ const struct ipt_entry *e = par->entryinfo;
+ int err;
+
+ if (e->ip.proto != IPPROTO_TCP ||
+ e->ip.invflags & XT_INV_PROTO)
+ return -EINVAL;
+
+ err = nf_ct_netns_get(par->net, par->family);
+ if (err)
+ return err;
+
+ err = nf_synproxy_ipv4_init(snet, par->net);
+ if (err) {
+ nf_ct_netns_put(par->net, par->family);
+ return err;
+ }
+
+ return err;
+}
+
+static void synproxy_tg4_destroy(const struct xt_tgdtor_param *par)
+{
+ struct synproxy_net *snet = synproxy_pernet(par->net);
+
+ nf_synproxy_ipv4_fini(snet, par->net);
+ nf_ct_netns_put(par->net, par->family);
+}
+
+static struct xt_target synproxy_tg4_reg __read_mostly = {
+ .name = "SYNPROXY",
+ .family = NFPROTO_IPV4,
+ .hooks = (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD),
+ .target = synproxy_tg4,
+ .targetsize = sizeof(struct xt_synproxy_info),
+ .checkentry = synproxy_tg4_check,
+ .destroy = synproxy_tg4_destroy,
+ .me = THIS_MODULE,
+};
+
+static int __init synproxy_tg4_init(void)
+{
+ return xt_register_target(&synproxy_tg4_reg);
+}
+
+static void __exit synproxy_tg4_exit(void)
+{
+ xt_unregister_target(&synproxy_tg4_reg);
+}
+
+module_init(synproxy_tg4_init);
+module_exit(synproxy_tg4_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_DESCRIPTION("Intercept TCP connections and establish them using syncookies");
diff --git a/net/ipv4/netfilter/ipt_TCPMSS.c b/net/ipv4/netfilter/ipt_TCPMSS.c
deleted file mode 100644
index 93eb5c3c1884..000000000000
--- a/net/ipv4/netfilter/ipt_TCPMSS.c
+++ /dev/null
@@ -1,207 +0,0 @@
-/*
- * This is a module which is used for setting the MSS option in TCP packets.
- *
- * Copyright (C) 2000 Marc Boucher <marc@mbsi.ca>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/module.h>
-#include <linux/skbuff.h>
-
-#include <linux/ip.h>
-#include <net/tcp.h>
-
-#include <linux/netfilter_ipv4/ip_tables.h>
-#include <linux/netfilter_ipv4/ipt_TCPMSS.h>
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>");
-MODULE_DESCRIPTION("iptables TCP MSS modification module");
-
-static inline unsigned int
-optlen(const u_int8_t *opt, unsigned int offset)
-{
- /* Beware zero-length options: make finite progress */
- if (opt[offset] <= TCPOPT_NOP || opt[offset+1] == 0)
- return 1;
- else
- return opt[offset+1];
-}
-
-static unsigned int
-ipt_tcpmss_target(struct sk_buff **pskb,
- const struct net_device *in,
- const struct net_device *out,
- unsigned int hooknum,
- const struct xt_target *target,
- const void *targinfo)
-{
- const struct ipt_tcpmss_info *tcpmssinfo = targinfo;
- struct tcphdr *tcph;
- struct iphdr *iph;
- u_int16_t tcplen, newmss;
- __be16 newtotlen, oldval;
- unsigned int i;
- u_int8_t *opt;
-
- if (!skb_make_writable(pskb, (*pskb)->len))
- return NF_DROP;
-
- iph = (*pskb)->nh.iph;
- tcplen = (*pskb)->len - iph->ihl*4;
- tcph = (void *)iph + iph->ihl*4;
-
- /* Since it passed flags test in tcp match, we know it is is
- not a fragment, and has data >= tcp header length. SYN
- packets should not contain data: if they did, then we risk
- running over MTU, sending Frag Needed and breaking things
- badly. --RR */
- if (tcplen != tcph->doff*4) {
- if (net_ratelimit())
- printk(KERN_ERR
- "ipt_tcpmss_target: bad length (%d bytes)\n",
- (*pskb)->len);
- return NF_DROP;
- }
-
- if (tcpmssinfo->mss == IPT_TCPMSS_CLAMP_PMTU) {
- if (dst_mtu((*pskb)->dst) <= sizeof(struct iphdr) +
- sizeof(struct tcphdr)) {
- if (net_ratelimit())
- printk(KERN_ERR "ipt_tcpmss_target: "
- "unknown or invalid path-MTU (%d)\n",
- dst_mtu((*pskb)->dst));
- return NF_DROP; /* or IPT_CONTINUE ?? */
- }
-
- newmss = dst_mtu((*pskb)->dst) - sizeof(struct iphdr) -
- sizeof(struct tcphdr);
- } else
- newmss = tcpmssinfo->mss;
-
- opt = (u_int8_t *)tcph;
- for (i = sizeof(struct tcphdr); i < tcph->doff*4; i += optlen(opt, i)) {
- if (opt[i] == TCPOPT_MSS && tcph->doff*4 - i >= TCPOLEN_MSS &&
- opt[i+1] == TCPOLEN_MSS) {
- u_int16_t oldmss;
-
- oldmss = (opt[i+2] << 8) | opt[i+3];
-
- if (tcpmssinfo->mss == IPT_TCPMSS_CLAMP_PMTU &&
- oldmss <= newmss)
- return IPT_CONTINUE;
-
- opt[i+2] = (newmss & 0xff00) >> 8;
- opt[i+3] = (newmss & 0x00ff);
-
- nf_proto_csum_replace2(&tcph->check, *pskb,
- htons(oldmss), htons(newmss), 0);
- return IPT_CONTINUE;
- }
- }
-
- /*
- * MSS Option not found ?! add it..
- */
- if (skb_tailroom((*pskb)) < TCPOLEN_MSS) {
- struct sk_buff *newskb;
-
- newskb = skb_copy_expand(*pskb, skb_headroom(*pskb),
- TCPOLEN_MSS, GFP_ATOMIC);
- if (!newskb)
- return NF_DROP;
- kfree_skb(*pskb);
- *pskb = newskb;
- iph = (*pskb)->nh.iph;
- tcph = (void *)iph + iph->ihl*4;
- }
-
- skb_put((*pskb), TCPOLEN_MSS);
-
- opt = (u_int8_t *)tcph + sizeof(struct tcphdr);
- memmove(opt + TCPOLEN_MSS, opt, tcplen - sizeof(struct tcphdr));
-
- nf_proto_csum_replace2(&tcph->check, *pskb,
- htons(tcplen), htons(tcplen + TCPOLEN_MSS), 1);
- opt[0] = TCPOPT_MSS;
- opt[1] = TCPOLEN_MSS;
- opt[2] = (newmss & 0xff00) >> 8;
- opt[3] = (newmss & 0x00ff);
-
- nf_proto_csum_replace4(&tcph->check, *pskb, 0, *((__be32 *)opt), 0);
-
- oldval = ((__be16 *)tcph)[6];
- tcph->doff += TCPOLEN_MSS/4;
- nf_proto_csum_replace2(&tcph->check, *pskb,
- oldval, ((__be16 *)tcph)[6], 0);
-
- newtotlen = htons(ntohs(iph->tot_len) + TCPOLEN_MSS);
- nf_csum_replace2(&iph->check, iph->tot_len, newtotlen);
- iph->tot_len = newtotlen;
- return IPT_CONTINUE;
-}
-
-#define TH_SYN 0x02
-
-static inline int find_syn_match(const struct ipt_entry_match *m)
-{
- const struct ipt_tcp *tcpinfo = (const struct ipt_tcp *)m->data;
-
- if (strcmp(m->u.kernel.match->name, "tcp") == 0 &&
- tcpinfo->flg_cmp & TH_SYN &&
- !(tcpinfo->invflags & IPT_TCP_INV_FLAGS))
- return 1;
-
- return 0;
-}
-
-/* Must specify -p tcp --syn/--tcp-flags SYN */
-static int
-ipt_tcpmss_checkentry(const char *tablename,
- const void *e_void,
- const struct xt_target *target,
- void *targinfo,
- unsigned int hook_mask)
-{
- const struct ipt_tcpmss_info *tcpmssinfo = targinfo;
- const struct ipt_entry *e = e_void;
-
- if (tcpmssinfo->mss == IPT_TCPMSS_CLAMP_PMTU &&
- (hook_mask & ~((1 << NF_IP_FORWARD) |
- (1 << NF_IP_LOCAL_OUT) |
- (1 << NF_IP_POST_ROUTING))) != 0) {
- printk("TCPMSS: path-MTU clamping only supported in "
- "FORWARD, OUTPUT and POSTROUTING hooks\n");
- return 0;
- }
-
- if (IPT_MATCH_ITERATE(e, find_syn_match))
- return 1;
- printk("TCPMSS: Only works on TCP SYN packets\n");
- return 0;
-}
-
-static struct ipt_target ipt_tcpmss_reg = {
- .name = "TCPMSS",
- .target = ipt_tcpmss_target,
- .targetsize = sizeof(struct ipt_tcpmss_info),
- .proto = IPPROTO_TCP,
- .checkentry = ipt_tcpmss_checkentry,
- .me = THIS_MODULE,
-};
-
-static int __init ipt_tcpmss_init(void)
-{
- return ipt_register_target(&ipt_tcpmss_reg);
-}
-
-static void __exit ipt_tcpmss_fini(void)
-{
- ipt_unregister_target(&ipt_tcpmss_reg);
-}
-
-module_init(ipt_tcpmss_init);
-module_exit(ipt_tcpmss_fini);
diff --git a/net/ipv4/netfilter/ipt_TOS.c b/net/ipv4/netfilter/ipt_TOS.c
deleted file mode 100644
index 18e74ac4d425..000000000000
--- a/net/ipv4/netfilter/ipt_TOS.c
+++ /dev/null
@@ -1,86 +0,0 @@
-/* This is a module which is used for setting the TOS field of a packet. */
-
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/ip.h>
-#include <net/checksum.h>
-
-#include <linux/netfilter_ipv4/ip_tables.h>
-#include <linux/netfilter_ipv4/ipt_TOS.h>
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
-MODULE_DESCRIPTION("iptables TOS mangling module");
-
-static unsigned int
-target(struct sk_buff **pskb,
- const struct net_device *in,
- const struct net_device *out,
- unsigned int hooknum,
- const struct xt_target *target,
- const void *targinfo)
-{
- const struct ipt_tos_target_info *tosinfo = targinfo;
- struct iphdr *iph = (*pskb)->nh.iph;
-
- if ((iph->tos & IPTOS_TOS_MASK) != tosinfo->tos) {
- __u8 oldtos;
- if (!skb_make_writable(pskb, sizeof(struct iphdr)))
- return NF_DROP;
- iph = (*pskb)->nh.iph;
- oldtos = iph->tos;
- iph->tos = (iph->tos & IPTOS_PREC_MASK) | tosinfo->tos;
- nf_csum_replace2(&iph->check, htons(oldtos), htons(iph->tos));
- }
- return IPT_CONTINUE;
-}
-
-static int
-checkentry(const char *tablename,
- const void *e_void,
- const struct xt_target *target,
- void *targinfo,
- unsigned int hook_mask)
-{
- const u_int8_t tos = ((struct ipt_tos_target_info *)targinfo)->tos;
-
- if (tos != IPTOS_LOWDELAY
- && tos != IPTOS_THROUGHPUT
- && tos != IPTOS_RELIABILITY
- && tos != IPTOS_MINCOST
- && tos != IPTOS_NORMALSVC) {
- printk(KERN_WARNING "TOS: bad tos value %#x\n", tos);
- return 0;
- }
- return 1;
-}
-
-static struct ipt_target ipt_tos_reg = {
- .name = "TOS",
- .target = target,
- .targetsize = sizeof(struct ipt_tos_target_info),
- .table = "mangle",
- .checkentry = checkentry,
- .me = THIS_MODULE,
-};
-
-static int __init ipt_tos_init(void)
-{
- return ipt_register_target(&ipt_tos_reg);
-}
-
-static void __exit ipt_tos_fini(void)
-{
- ipt_unregister_target(&ipt_tos_reg);
-}
-
-module_init(ipt_tos_init);
-module_exit(ipt_tos_fini);
diff --git a/net/ipv4/netfilter/ipt_TTL.c b/net/ipv4/netfilter/ipt_TTL.c
deleted file mode 100644
index fffe5ca82e91..000000000000
--- a/net/ipv4/netfilter/ipt_TTL.c
+++ /dev/null
@@ -1,103 +0,0 @@
-/* TTL modification target for IP tables
- * (C) 2000,2005 by Harald Welte <laforge@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- */
-
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/ip.h>
-#include <net/checksum.h>
-
-#include <linux/netfilter_ipv4/ip_tables.h>
-#include <linux/netfilter_ipv4/ipt_TTL.h>
-
-MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
-MODULE_DESCRIPTION("IP tables TTL modification module");
-MODULE_LICENSE("GPL");
-
-static unsigned int
-ipt_ttl_target(struct sk_buff **pskb,
- const struct net_device *in, const struct net_device *out,
- unsigned int hooknum, const struct xt_target *target,
- const void *targinfo)
-{
- struct iphdr *iph;
- const struct ipt_TTL_info *info = targinfo;
- int new_ttl;
-
- if (!skb_make_writable(pskb, (*pskb)->len))
- return NF_DROP;
-
- iph = (*pskb)->nh.iph;
-
- switch (info->mode) {
- case IPT_TTL_SET:
- new_ttl = info->ttl;
- break;
- case IPT_TTL_INC:
- new_ttl = iph->ttl + info->ttl;
- if (new_ttl > 255)
- new_ttl = 255;
- break;
- case IPT_TTL_DEC:
- new_ttl = iph->ttl - info->ttl;
- if (new_ttl < 0)
- new_ttl = 0;
- break;
- default:
- new_ttl = iph->ttl;
- break;
- }
-
- if (new_ttl != iph->ttl) {
- nf_csum_replace2(&iph->check, htons(iph->ttl << 8),
- htons(new_ttl << 8));
- iph->ttl = new_ttl;
- }
-
- return IPT_CONTINUE;
-}
-
-static int ipt_ttl_checkentry(const char *tablename,
- const void *e,
- const struct xt_target *target,
- void *targinfo,
- unsigned int hook_mask)
-{
- struct ipt_TTL_info *info = targinfo;
-
- if (info->mode > IPT_TTL_MAXMODE) {
- printk(KERN_WARNING "ipt_TTL: invalid or unknown Mode %u\n",
- info->mode);
- return 0;
- }
- if ((info->mode != IPT_TTL_SET) && (info->ttl == 0))
- return 0;
- return 1;
-}
-
-static struct ipt_target ipt_TTL = {
- .name = "TTL",
- .target = ipt_ttl_target,
- .targetsize = sizeof(struct ipt_TTL_info),
- .table = "mangle",
- .checkentry = ipt_ttl_checkentry,
- .me = THIS_MODULE,
-};
-
-static int __init ipt_ttl_init(void)
-{
- return ipt_register_target(&ipt_TTL);
-}
-
-static void __exit ipt_ttl_fini(void)
-{
- ipt_unregister_target(&ipt_TTL);
-}
-
-module_init(ipt_ttl_init);
-module_exit(ipt_ttl_fini);
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
deleted file mode 100644
index dbd34783a64d..000000000000
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ /dev/null
@@ -1,442 +0,0 @@
-/*
- * netfilter module for userspace packet logging daemons
- *
- * (C) 2000-2004 by Harald Welte <laforge@netfilter.org>
- *
- * 2000/09/22 ulog-cprange feature added
- * 2001/01/04 in-kernel queue as proposed by Sebastian Zander
- * <zander@fokus.gmd.de>
- * 2001/01/30 per-rule nlgroup conflicts with global queue.
- * nlgroup now global (sysctl)
- * 2001/04/19 ulog-queue reworked, now fixed buffer size specified at
- * module loadtime -HW
- * 2002/07/07 remove broken nflog_rcv() function -HW
- * 2002/08/29 fix shifted/unshifted nlgroup bug -HW
- * 2002/10/30 fix uninitialized mac_len field - <Anders K. Pedersen>
- * 2004/10/25 fix erroneous calculation of 'len' parameter to NLMSG_PUT
- * resulting in bogus 'error during NLMSG_PUT' messages.
- *
- * (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This module accepts two parameters:
- *
- * nlbufsiz:
- * The parameter specifies how big the buffer for each netlink multicast
- * group is. e.g. If you say nlbufsiz=8192, up to eight kb of packets will
- * get accumulated in the kernel until they are sent to userspace. It is
- * NOT possible to allocate more than 128kB, and it is strongly discouraged,
- * because atomically allocating 128kB inside the network rx softirq is not
- * reliable. Please also keep in mind that this buffer size is allocated for
- * each nlgroup you are using, so the total kernel memory usage increases
- * by that factor.
- *
- * Actually you should use nlbufsiz a bit smaller than PAGE_SIZE, since
- * nlbufsiz is used with alloc_skb, which adds another
- * sizeof(struct skb_shared_info). Use NLMSG_GOODSIZE instead.
- *
- * flushtimeout:
- * Specify, after how many hundredths of a second the queue should be
- * flushed even if it is not full yet.
- *
- * ipt_ULOG.c,v 1.22 2002/10/30 09:07:31 laforge Exp
- */
-
-#include <linux/module.h>
-#include <linux/spinlock.h>
-#include <linux/socket.h>
-#include <linux/skbuff.h>
-#include <linux/kernel.h>
-#include <linux/timer.h>
-#include <linux/netlink.h>
-#include <linux/netdevice.h>
-#include <linux/mm.h>
-#include <linux/moduleparam.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv4/ip_tables.h>
-#include <linux/netfilter_ipv4/ipt_ULOG.h>
-#include <net/sock.h>
-#include <linux/bitops.h>
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
-MODULE_DESCRIPTION("iptables userspace logging module");
-MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NFLOG);
-
-#define ULOG_NL_EVENT 111 /* Harald's favorite number */
-#define ULOG_MAXNLGROUPS 32 /* numer of nlgroups */
-
-#if 0
-#define DEBUGP(format, args...) printk("%s:%s:" format, \
- __FILE__, __FUNCTION__ , ## args)
-#else
-#define DEBUGP(format, args...)
-#endif
-
-#define PRINTR(format, args...) do { if (net_ratelimit()) printk(format , ## args); } while (0)
-
-static unsigned int nlbufsiz = NLMSG_GOODSIZE;
-module_param(nlbufsiz, uint, 0400);
-MODULE_PARM_DESC(nlbufsiz, "netlink buffer size");
-
-static unsigned int flushtimeout = 10;
-module_param(flushtimeout, uint, 0600);
-MODULE_PARM_DESC(flushtimeout, "buffer flush timeout (hundredths of a second)");
-
-static int nflog = 1;
-module_param(nflog, bool, 0400);
-MODULE_PARM_DESC(nflog, "register as internal netfilter logging module");
-
-/* global data structures */
-
-typedef struct {
- unsigned int qlen; /* number of nlmsgs' in the skb */
- struct nlmsghdr *lastnlh; /* netlink header of last msg in skb */
- struct sk_buff *skb; /* the pre-allocated skb */
- struct timer_list timer; /* the timer function */
-} ulog_buff_t;
-
-static ulog_buff_t ulog_buffers[ULOG_MAXNLGROUPS]; /* array of buffers */
-
-static struct sock *nflognl; /* our socket */
-static DEFINE_SPINLOCK(ulog_lock); /* spinlock */
-
-/* send one ulog_buff_t to userspace */
-static void ulog_send(unsigned int nlgroupnum)
-{
- ulog_buff_t *ub = &ulog_buffers[nlgroupnum];
-
- if (timer_pending(&ub->timer)) {
- DEBUGP("ipt_ULOG: ulog_send: timer was pending, deleting\n");
- del_timer(&ub->timer);
- }
-
- if (!ub->skb) {
- DEBUGP("ipt_ULOG: ulog_send: nothing to send\n");
- return;
- }
-
- /* last nlmsg needs NLMSG_DONE */
- if (ub->qlen > 1)
- ub->lastnlh->nlmsg_type = NLMSG_DONE;
-
- NETLINK_CB(ub->skb).dst_group = nlgroupnum + 1;
- DEBUGP("ipt_ULOG: throwing %d packets to netlink group %u\n",
- ub->qlen, nlgroupnum + 1);
- netlink_broadcast(nflognl, ub->skb, 0, nlgroupnum + 1, GFP_ATOMIC);
-
- ub->qlen = 0;
- ub->skb = NULL;
- ub->lastnlh = NULL;
-
-}
-
-
-/* timer function to flush queue in flushtimeout time */
-static void ulog_timer(unsigned long data)
-{
- DEBUGP("ipt_ULOG: timer function called, calling ulog_send\n");
-
- /* lock to protect against somebody modifying our structure
- * from ipt_ulog_target at the same time */
- spin_lock_bh(&ulog_lock);
- ulog_send(data);
- spin_unlock_bh(&ulog_lock);
-}
-
-static struct sk_buff *ulog_alloc_skb(unsigned int size)
-{
- struct sk_buff *skb;
- unsigned int n;
-
- /* alloc skb which should be big enough for a whole
- * multipart message. WARNING: has to be <= 131000
- * due to slab allocator restrictions */
-
- n = max(size, nlbufsiz);
- skb = alloc_skb(n, GFP_ATOMIC);
- if (!skb) {
- PRINTR("ipt_ULOG: can't alloc whole buffer %ub!\n", n);
-
- if (n > size) {
- /* try to allocate only as much as we need for
- * current packet */
-
- skb = alloc_skb(size, GFP_ATOMIC);
- if (!skb)
- PRINTR("ipt_ULOG: can't even allocate %ub\n",
- size);
- }
- }
-
- return skb;
-}
-
-static void ipt_ulog_packet(unsigned int hooknum,
- const struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- const struct ipt_ulog_info *loginfo,
- const char *prefix)
-{
- ulog_buff_t *ub;
- ulog_packet_msg_t *pm;
- size_t size, copy_len;
- struct nlmsghdr *nlh;
-
- /* ffs == find first bit set, necessary because userspace
- * is already shifting groupnumber, but we need unshifted.
- * ffs() returns [1..32], we need [0..31] */
- unsigned int groupnum = ffs(loginfo->nl_group) - 1;
-
- /* calculate the size of the skb needed */
- if ((loginfo->copy_range == 0) ||
- (loginfo->copy_range > skb->len)) {
- copy_len = skb->len;
- } else {
- copy_len = loginfo->copy_range;
- }
-
- size = NLMSG_SPACE(sizeof(*pm) + copy_len);
-
- ub = &ulog_buffers[groupnum];
-
- spin_lock_bh(&ulog_lock);
-
- if (!ub->skb) {
- if (!(ub->skb = ulog_alloc_skb(size)))
- goto alloc_failure;
- } else if (ub->qlen >= loginfo->qthreshold ||
- size > skb_tailroom(ub->skb)) {
- /* either the queue len is too high or we don't have
- * enough room in nlskb left. send it to userspace. */
-
- ulog_send(groupnum);
-
- if (!(ub->skb = ulog_alloc_skb(size)))
- goto alloc_failure;
- }
-
- DEBUGP("ipt_ULOG: qlen %d, qthreshold %d\n", ub->qlen,
- loginfo->qthreshold);
-
- /* NLMSG_PUT contains a hidden goto nlmsg_failure !!! */
- nlh = NLMSG_PUT(ub->skb, 0, ub->qlen, ULOG_NL_EVENT,
- sizeof(*pm)+copy_len);
- ub->qlen++;
-
- pm = NLMSG_DATA(nlh);
-
- /* We might not have a timestamp, get one */
- if (skb->tstamp.off_sec == 0)
- __net_timestamp((struct sk_buff *)skb);
-
- /* copy hook, prefix, timestamp, payload, etc. */
- pm->data_len = copy_len;
- pm->timestamp_sec = skb->tstamp.off_sec;
- pm->timestamp_usec = skb->tstamp.off_usec;
- pm->mark = skb->mark;
- pm->hook = hooknum;
- if (prefix != NULL)
- strncpy(pm->prefix, prefix, sizeof(pm->prefix));
- else if (loginfo->prefix[0] != '\0')
- strncpy(pm->prefix, loginfo->prefix, sizeof(pm->prefix));
- else
- *(pm->prefix) = '\0';
-
- if (in && in->hard_header_len > 0
- && skb->mac.raw != (void *) skb->nh.iph
- && in->hard_header_len <= ULOG_MAC_LEN) {
- memcpy(pm->mac, skb->mac.raw, in->hard_header_len);
- pm->mac_len = in->hard_header_len;
- } else
- pm->mac_len = 0;
-
- if (in)
- strncpy(pm->indev_name, in->name, sizeof(pm->indev_name));
- else
- pm->indev_name[0] = '\0';
-
- if (out)
- strncpy(pm->outdev_name, out->name, sizeof(pm->outdev_name));
- else
- pm->outdev_name[0] = '\0';
-
- /* copy_len <= skb->len, so can't fail. */
- if (skb_copy_bits(skb, 0, pm->payload, copy_len) < 0)
- BUG();
-
- /* check if we are building multi-part messages */
- if (ub->qlen > 1) {
- ub->lastnlh->nlmsg_flags |= NLM_F_MULTI;
- }
-
- ub->lastnlh = nlh;
-
- /* if timer isn't already running, start it */
- if (!timer_pending(&ub->timer)) {
- ub->timer.expires = jiffies + flushtimeout * HZ / 100;
- add_timer(&ub->timer);
- }
-
- /* if threshold is reached, send message to userspace */
- if (ub->qlen >= loginfo->qthreshold) {
- if (loginfo->qthreshold > 1)
- nlh->nlmsg_type = NLMSG_DONE;
- ulog_send(groupnum);
- }
-
- spin_unlock_bh(&ulog_lock);
-
- return;
-
-nlmsg_failure:
- PRINTR("ipt_ULOG: error during NLMSG_PUT\n");
-
-alloc_failure:
- PRINTR("ipt_ULOG: Error building netlink message\n");
-
- spin_unlock_bh(&ulog_lock);
-}
-
-static unsigned int ipt_ulog_target(struct sk_buff **pskb,
- const struct net_device *in,
- const struct net_device *out,
- unsigned int hooknum,
- const struct xt_target *target,
- const void *targinfo)
-{
- struct ipt_ulog_info *loginfo = (struct ipt_ulog_info *) targinfo;
-
- ipt_ulog_packet(hooknum, *pskb, in, out, loginfo, NULL);
-
- return IPT_CONTINUE;
-}
-
-static void ipt_logfn(unsigned int pf,
- unsigned int hooknum,
- const struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- const struct nf_loginfo *li,
- const char *prefix)
-{
- struct ipt_ulog_info loginfo;
-
- if (!li || li->type != NF_LOG_TYPE_ULOG) {
- loginfo.nl_group = ULOG_DEFAULT_NLGROUP;
- loginfo.copy_range = 0;
- loginfo.qthreshold = ULOG_DEFAULT_QTHRESHOLD;
- loginfo.prefix[0] = '\0';
- } else {
- loginfo.nl_group = li->u.ulog.group;
- loginfo.copy_range = li->u.ulog.copy_len;
- loginfo.qthreshold = li->u.ulog.qthreshold;
- strlcpy(loginfo.prefix, prefix, sizeof(loginfo.prefix));
- }
-
- ipt_ulog_packet(hooknum, skb, in, out, &loginfo, prefix);
-}
-
-static int ipt_ulog_checkentry(const char *tablename,
- const void *e,
- const struct xt_target *target,
- void *targinfo,
- unsigned int hookmask)
-{
- struct ipt_ulog_info *loginfo = (struct ipt_ulog_info *) targinfo;
-
- if (loginfo->prefix[sizeof(loginfo->prefix) - 1] != '\0') {
- DEBUGP("ipt_ULOG: prefix term %i\n",
- loginfo->prefix[sizeof(loginfo->prefix) - 1]);
- return 0;
- }
- if (loginfo->qthreshold > ULOG_MAX_QLEN) {
- DEBUGP("ipt_ULOG: queue threshold %i > MAX_QLEN\n",
- loginfo->qthreshold);
- return 0;
- }
- return 1;
-}
-
-static struct ipt_target ipt_ulog_reg = {
- .name = "ULOG",
- .target = ipt_ulog_target,
- .targetsize = sizeof(struct ipt_ulog_info),
- .checkentry = ipt_ulog_checkentry,
- .me = THIS_MODULE,
-};
-
-static struct nf_logger ipt_ulog_logger = {
- .name = "ipt_ULOG",
- .logfn = ipt_logfn,
- .me = THIS_MODULE,
-};
-
-static int __init ipt_ulog_init(void)
-{
- int i;
-
- DEBUGP("ipt_ULOG: init module\n");
-
- if (nlbufsiz > 128*1024) {
- printk("Netlink buffer has to be <= 128kB\n");
- return -EINVAL;
- }
-
- /* initialize ulog_buffers */
- for (i = 0; i < ULOG_MAXNLGROUPS; i++) {
- init_timer(&ulog_buffers[i].timer);
- ulog_buffers[i].timer.function = ulog_timer;
- ulog_buffers[i].timer.data = i;
- }
-
- nflognl = netlink_kernel_create(NETLINK_NFLOG, ULOG_MAXNLGROUPS, NULL,
- THIS_MODULE);
- if (!nflognl)
- return -ENOMEM;
-
- if (ipt_register_target(&ipt_ulog_reg) != 0) {
- sock_release(nflognl->sk_socket);
- return -EINVAL;
- }
- if (nflog)
- nf_log_register(PF_INET, &ipt_ulog_logger);
-
- return 0;
-}
-
-static void __exit ipt_ulog_fini(void)
-{
- ulog_buff_t *ub;
- int i;
-
- DEBUGP("ipt_ULOG: cleanup_module\n");
-
- if (nflog)
- nf_log_unregister_logger(&ipt_ulog_logger);
- ipt_unregister_target(&ipt_ulog_reg);
- sock_release(nflognl->sk_socket);
-
- /* remove pending timers and free allocated skb's */
- for (i = 0; i < ULOG_MAXNLGROUPS; i++) {
- ub = &ulog_buffers[i];
- if (timer_pending(&ub->timer)) {
- DEBUGP("timer was pending, deleting\n");
- del_timer(&ub->timer);
- }
-
- if (ub->skb) {
- kfree_skb(ub->skb);
- ub->skb = NULL;
- }
- }
-
-}
-
-module_init(ipt_ulog_init);
-module_exit(ipt_ulog_fini);
diff --git a/net/ipv4/netfilter/ipt_addrtype.c b/net/ipv4/netfilter/ipt_addrtype.c
deleted file mode 100644
index 7b60eb74788b..000000000000
--- a/net/ipv4/netfilter/ipt_addrtype.c
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * iptables module to match inet_addr_type() of an ip.
- *
- * Copyright (c) 2004 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/netdevice.h>
-#include <linux/ip.h>
-#include <net/route.h>
-
-#include <linux/netfilter_ipv4/ipt_addrtype.h>
-#include <linux/netfilter_ipv4/ip_tables.h>
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_DESCRIPTION("iptables addrtype match");
-
-static inline int match_type(__be32 addr, u_int16_t mask)
-{
- return !!(mask & (1 << inet_addr_type(addr)));
-}
-
-static int match(const struct sk_buff *skb,
- const struct net_device *in, const struct net_device *out,
- const struct xt_match *match, const void *matchinfo,
- int offset, unsigned int protoff, int *hotdrop)
-{
- const struct ipt_addrtype_info *info = matchinfo;
- const struct iphdr *iph = skb->nh.iph;
- int ret = 1;
-
- if (info->source)
- ret &= match_type(iph->saddr, info->source)^info->invert_source;
- if (info->dest)
- ret &= match_type(iph->daddr, info->dest)^info->invert_dest;
-
- return ret;
-}
-
-static struct ipt_match addrtype_match = {
- .name = "addrtype",
- .match = match,
- .matchsize = sizeof(struct ipt_addrtype_info),
- .me = THIS_MODULE
-};
-
-static int __init ipt_addrtype_init(void)
-{
- return ipt_register_match(&addrtype_match);
-}
-
-static void __exit ipt_addrtype_fini(void)
-{
- ipt_unregister_match(&addrtype_match);
-}
-
-module_init(ipt_addrtype_init);
-module_exit(ipt_addrtype_fini);
diff --git a/net/ipv4/netfilter/ipt_ah.c b/net/ipv4/netfilter/ipt_ah.c
index 1798f86bc534..161ba412cb08 100644
--- a/net/ipv4/netfilter/ipt_ah.c
+++ b/net/ipv4/netfilter/ipt_ah.c
@@ -1,66 +1,50 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Kernel module to match AH parameters. */
/* (C) 1999-2000 Yon Uriarte <yon@astaro.de>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
-
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/in.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/ip.h>
#include <linux/netfilter_ipv4/ipt_ah.h>
-#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter/x_tables.h>
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Yon Uriarte <yon@astaro.de>");
-MODULE_DESCRIPTION("iptables AH SPI match module");
-
-#ifdef DEBUG_CONNTRACK
-#define duprintf(format, args...) printk(format , ## args)
-#else
-#define duprintf(format, args...)
-#endif
+MODULE_DESCRIPTION("Xtables: IPv4 IPsec-AH SPI match");
/* Returns 1 if the spi is matched by the range, 0 otherwise */
-static inline int
-spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, int invert)
+static inline bool
+spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, bool invert)
{
- int r=0;
- duprintf("ah spi_match:%c 0x%x <= 0x%x <= 0x%x",invert? '!':' ',
- min,spi,max);
- r=(spi >= min && spi <= max) ^ invert;
- duprintf(" result %s\n",r? "PASS" : "FAILED");
+ bool r;
+ pr_debug("spi_match:%c 0x%x <= 0x%x <= 0x%x\n",
+ invert ? '!' : ' ', min, spi, max);
+ r = (spi >= min && spi <= max) ^ invert;
+ pr_debug(" result %s\n", r ? "PASS" : "FAILED");
return r;
}
-static int
-match(const struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- const struct xt_match *match,
- const void *matchinfo,
- int offset,
- unsigned int protoff,
- int *hotdrop)
+static bool ah_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
- struct ip_auth_hdr _ahdr, *ah;
- const struct ipt_ah *ahinfo = matchinfo;
+ struct ip_auth_hdr _ahdr;
+ const struct ip_auth_hdr *ah;
+ const struct ipt_ah *ahinfo = par->matchinfo;
/* Must not be a fragment. */
- if (offset)
- return 0;
+ if (par->fragoff != 0)
+ return false;
- ah = skb_header_pointer(skb, protoff,
- sizeof(_ahdr), &_ahdr);
+ ah = skb_header_pointer(skb, par->thoff, sizeof(_ahdr), &_ahdr);
if (ah == NULL) {
/* We've been asked to examine this packet, and we
* can't. Hence, no choice but to drop.
*/
- duprintf("Dropping evil AH tinygram.\n");
- *hotdrop = 1;
- return 0;
+ pr_debug("Dropping evil AH tinygram.\n");
+ par->hotdrop = true;
+ return false;
}
return spi_match(ahinfo->spis[0], ahinfo->spis[1],
@@ -68,42 +52,37 @@ match(const struct sk_buff *skb,
!!(ahinfo->invflags & IPT_AH_INV_SPI));
}
-/* Called when user tries to insert an entry of this type. */
-static int
-checkentry(const char *tablename,
- const void *ip_void,
- const struct xt_match *match,
- void *matchinfo,
- unsigned int hook_mask)
+static int ah_mt_check(const struct xt_mtchk_param *par)
{
- const struct ipt_ah *ahinfo = matchinfo;
+ const struct ipt_ah *ahinfo = par->matchinfo;
/* Must specify no unknown invflags */
if (ahinfo->invflags & ~IPT_AH_INV_MASK) {
- duprintf("ipt_ah: unknown flags %X\n", ahinfo->invflags);
- return 0;
+ pr_debug("unknown flags %X\n", ahinfo->invflags);
+ return -EINVAL;
}
- return 1;
+ return 0;
}
-static struct ipt_match ah_match = {
+static struct xt_match ah_mt_reg __read_mostly = {
.name = "ah",
- .match = match,
+ .family = NFPROTO_IPV4,
+ .match = ah_mt,
.matchsize = sizeof(struct ipt_ah),
.proto = IPPROTO_AH,
- .checkentry = checkentry,
+ .checkentry = ah_mt_check,
.me = THIS_MODULE,
};
-static int __init ipt_ah_init(void)
+static int __init ah_mt_init(void)
{
- return ipt_register_match(&ah_match);
+ return xt_register_match(&ah_mt_reg);
}
-static void __exit ipt_ah_fini(void)
+static void __exit ah_mt_exit(void)
{
- ipt_unregister_match(&ah_match);
+ xt_unregister_match(&ah_mt_reg);
}
-module_init(ipt_ah_init);
-module_exit(ipt_ah_fini);
+module_init(ah_mt_init);
+module_exit(ah_mt_exit);
diff --git a/net/ipv4/netfilter/ipt_ecn.c b/net/ipv4/netfilter/ipt_ecn.c
deleted file mode 100644
index dafbdec0efc0..000000000000
--- a/net/ipv4/netfilter/ipt_ecn.c
+++ /dev/null
@@ -1,131 +0,0 @@
-/* IP tables module for matching the value of the IPv4 and TCP ECN bits
- *
- * ipt_ecn.c,v 1.3 2002/05/29 15:09:00 laforge Exp
- *
- * (C) 2002 by Harald Welte <laforge@gnumonks.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/tcp.h>
-
-#include <linux/netfilter_ipv4/ip_tables.h>
-#include <linux/netfilter_ipv4/ipt_ecn.h>
-
-MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
-MODULE_DESCRIPTION("iptables ECN matching module");
-MODULE_LICENSE("GPL");
-
-static inline int match_ip(const struct sk_buff *skb,
- const struct ipt_ecn_info *einfo)
-{
- return ((skb->nh.iph->tos&IPT_ECN_IP_MASK) == einfo->ip_ect);
-}
-
-static inline int match_tcp(const struct sk_buff *skb,
- const struct ipt_ecn_info *einfo,
- int *hotdrop)
-{
- struct tcphdr _tcph, *th;
-
- /* In practice, TCP match does this, so can't fail. But let's
- * be good citizens.
- */
- th = skb_header_pointer(skb, skb->nh.iph->ihl * 4,
- sizeof(_tcph), &_tcph);
- if (th == NULL) {
- *hotdrop = 0;
- return 0;
- }
-
- if (einfo->operation & IPT_ECN_OP_MATCH_ECE) {
- if (einfo->invert & IPT_ECN_OP_MATCH_ECE) {
- if (th->ece == 1)
- return 0;
- } else {
- if (th->ece == 0)
- return 0;
- }
- }
-
- if (einfo->operation & IPT_ECN_OP_MATCH_CWR) {
- if (einfo->invert & IPT_ECN_OP_MATCH_CWR) {
- if (th->cwr == 1)
- return 0;
- } else {
- if (th->cwr == 0)
- return 0;
- }
- }
-
- return 1;
-}
-
-static int match(const struct sk_buff *skb,
- const struct net_device *in, const struct net_device *out,
- const struct xt_match *match, const void *matchinfo,
- int offset, unsigned int protoff, int *hotdrop)
-{
- const struct ipt_ecn_info *info = matchinfo;
-
- if (info->operation & IPT_ECN_OP_MATCH_IP)
- if (!match_ip(skb, info))
- return 0;
-
- if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR)) {
- if (skb->nh.iph->protocol != IPPROTO_TCP)
- return 0;
- if (!match_tcp(skb, info, hotdrop))
- return 0;
- }
-
- return 1;
-}
-
-static int checkentry(const char *tablename, const void *ip_void,
- const struct xt_match *match,
- void *matchinfo, unsigned int hook_mask)
-{
- const struct ipt_ecn_info *info = matchinfo;
- const struct ipt_ip *ip = ip_void;
-
- if (info->operation & IPT_ECN_OP_MATCH_MASK)
- return 0;
-
- if (info->invert & IPT_ECN_OP_MATCH_MASK)
- return 0;
-
- if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR)
- && ip->proto != IPPROTO_TCP) {
- printk(KERN_WARNING "ipt_ecn: can't match TCP bits in rule for"
- " non-tcp packets\n");
- return 0;
- }
-
- return 1;
-}
-
-static struct ipt_match ecn_match = {
- .name = "ecn",
- .match = match,
- .matchsize = sizeof(struct ipt_ecn_info),
- .checkentry = checkentry,
- .me = THIS_MODULE,
-};
-
-static int __init ipt_ecn_init(void)
-{
- return ipt_register_match(&ecn_match);
-}
-
-static void __exit ipt_ecn_fini(void)
-{
- ipt_unregister_match(&ecn_match);
-}
-
-module_init(ipt_ecn_init);
-module_exit(ipt_ecn_fini);
diff --git a/net/ipv4/netfilter/ipt_iprange.c b/net/ipv4/netfilter/ipt_iprange.c
deleted file mode 100644
index 5202edd8d333..000000000000
--- a/net/ipv4/netfilter/ipt_iprange.c
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * iptables module to match IP address ranges
- *
- * (C) 2003 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/ip.h>
-#include <linux/netfilter_ipv4/ip_tables.h>
-#include <linux/netfilter_ipv4/ipt_iprange.h>
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
-MODULE_DESCRIPTION("iptables arbitrary IP range match module");
-
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
-static int
-match(const struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- const struct xt_match *match,
- const void *matchinfo,
- int offset, unsigned int protoff, int *hotdrop)
-{
- const struct ipt_iprange_info *info = matchinfo;
- const struct iphdr *iph = skb->nh.iph;
-
- if (info->flags & IPRANGE_SRC) {
- if (((ntohl(iph->saddr) < ntohl(info->src.min_ip))
- || (ntohl(iph->saddr) > ntohl(info->src.max_ip)))
- ^ !!(info->flags & IPRANGE_SRC_INV)) {
- DEBUGP("src IP %u.%u.%u.%u NOT in range %s"
- "%u.%u.%u.%u-%u.%u.%u.%u\n",
- NIPQUAD(iph->saddr),
- info->flags & IPRANGE_SRC_INV ? "(INV) " : "",
- NIPQUAD(info->src.min_ip),
- NIPQUAD(info->src.max_ip));
- return 0;
- }
- }
- if (info->flags & IPRANGE_DST) {
- if (((ntohl(iph->daddr) < ntohl(info->dst.min_ip))
- || (ntohl(iph->daddr) > ntohl(info->dst.max_ip)))
- ^ !!(info->flags & IPRANGE_DST_INV)) {
- DEBUGP("dst IP %u.%u.%u.%u NOT in range %s"
- "%u.%u.%u.%u-%u.%u.%u.%u\n",
- NIPQUAD(iph->daddr),
- info->flags & IPRANGE_DST_INV ? "(INV) " : "",
- NIPQUAD(info->dst.min_ip),
- NIPQUAD(info->dst.max_ip));
- return 0;
- }
- }
- return 1;
-}
-
-static struct ipt_match iprange_match = {
- .name = "iprange",
- .match = match,
- .matchsize = sizeof(struct ipt_iprange_info),
- .destroy = NULL,
- .me = THIS_MODULE
-};
-
-static int __init ipt_iprange_init(void)
-{
- return ipt_register_match(&iprange_match);
-}
-
-static void __exit ipt_iprange_fini(void)
-{
- ipt_unregister_match(&iprange_match);
-}
-
-module_init(ipt_iprange_init);
-module_exit(ipt_iprange_fini);
diff --git a/net/ipv4/netfilter/ipt_owner.c b/net/ipv4/netfilter/ipt_owner.c
deleted file mode 100644
index 78c336f12a9e..000000000000
--- a/net/ipv4/netfilter/ipt_owner.c
+++ /dev/null
@@ -1,91 +0,0 @@
-/* Kernel module to match various things tied to sockets associated with
- locally generated outgoing packets. */
-
-/* (C) 2000 Marc Boucher <marc@mbsi.ca>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/file.h>
-#include <linux/rcupdate.h>
-#include <net/sock.h>
-
-#include <linux/netfilter_ipv4/ipt_owner.h>
-#include <linux/netfilter_ipv4/ip_tables.h>
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>");
-MODULE_DESCRIPTION("iptables owner match");
-
-static int
-match(const struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- const struct xt_match *match,
- const void *matchinfo,
- int offset,
- unsigned int protoff,
- int *hotdrop)
-{
- const struct ipt_owner_info *info = matchinfo;
-
- if (!skb->sk || !skb->sk->sk_socket || !skb->sk->sk_socket->file)
- return 0;
-
- if(info->match & IPT_OWNER_UID) {
- if ((skb->sk->sk_socket->file->f_uid != info->uid) ^
- !!(info->invert & IPT_OWNER_UID))
- return 0;
- }
-
- if(info->match & IPT_OWNER_GID) {
- if ((skb->sk->sk_socket->file->f_gid != info->gid) ^
- !!(info->invert & IPT_OWNER_GID))
- return 0;
- }
-
- return 1;
-}
-
-static int
-checkentry(const char *tablename,
- const void *ip,
- const struct xt_match *match,
- void *matchinfo,
- unsigned int hook_mask)
-{
- const struct ipt_owner_info *info = matchinfo;
-
- if (info->match & (IPT_OWNER_PID|IPT_OWNER_SID|IPT_OWNER_COMM)) {
- printk("ipt_owner: pid, sid and command matching "
- "not supported anymore\n");
- return 0;
- }
- return 1;
-}
-
-static struct ipt_match owner_match = {
- .name = "owner",
- .match = match,
- .matchsize = sizeof(struct ipt_owner_info),
- .hooks = (1 << NF_IP_LOCAL_OUT) | (1 << NF_IP_POST_ROUTING),
- .checkentry = checkentry,
- .me = THIS_MODULE,
-};
-
-static int __init ipt_owner_init(void)
-{
- return ipt_register_match(&owner_match);
-}
-
-static void __exit ipt_owner_fini(void)
-{
- ipt_unregister_match(&owner_match);
-}
-
-module_init(ipt_owner_init);
-module_exit(ipt_owner_fini);
diff --git a/net/ipv4/netfilter/ipt_recent.c b/net/ipv4/netfilter/ipt_recent.c
deleted file mode 100644
index 126db44e71a8..000000000000
--- a/net/ipv4/netfilter/ipt_recent.c
+++ /dev/null
@@ -1,505 +0,0 @@
-/*
- * Copyright (c) 2006 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This is a replacement of the old ipt_recent module, which carried the
- * following copyright notice:
- *
- * Author: Stephen Frost <sfrost@snowman.net>
- * Copyright 2002-2003, Stephen Frost, 2.5.x port by laforge@netfilter.org
- */
-#include <linux/init.h>
-#include <linux/moduleparam.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-#include <linux/string.h>
-#include <linux/ctype.h>
-#include <linux/list.h>
-#include <linux/random.h>
-#include <linux/jhash.h>
-#include <linux/bitops.h>
-#include <linux/skbuff.h>
-#include <linux/inet.h>
-
-#include <linux/netfilter_ipv4/ip_tables.h>
-#include <linux/netfilter_ipv4/ipt_recent.h>
-
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_DESCRIPTION("IP tables recently seen matching module");
-MODULE_LICENSE("GPL");
-
-static unsigned int ip_list_tot = 100;
-static unsigned int ip_pkt_list_tot = 20;
-static unsigned int ip_list_hash_size = 0;
-static unsigned int ip_list_perms = 0644;
-static unsigned int ip_list_uid = 0;
-static unsigned int ip_list_gid = 0;
-module_param(ip_list_tot, uint, 0400);
-module_param(ip_pkt_list_tot, uint, 0400);
-module_param(ip_list_hash_size, uint, 0400);
-module_param(ip_list_perms, uint, 0400);
-module_param(ip_list_uid, uint, 0400);
-module_param(ip_list_gid, uint, 0400);
-MODULE_PARM_DESC(ip_list_tot, "number of IPs to remember per list");
-MODULE_PARM_DESC(ip_pkt_list_tot, "number of packets per IP to remember (max. 255)");
-MODULE_PARM_DESC(ip_list_hash_size, "size of hash table used to look up IPs");
-MODULE_PARM_DESC(ip_list_perms, "permissions on /proc/net/ipt_recent/* files");
-MODULE_PARM_DESC(ip_list_uid,"owner of /proc/net/ipt_recent/* files");
-MODULE_PARM_DESC(ip_list_gid,"owning group of /proc/net/ipt_recent/* files");
-
-struct recent_entry {
- struct list_head list;
- struct list_head lru_list;
- __be32 addr;
- u_int8_t ttl;
- u_int8_t index;
- u_int16_t nstamps;
- unsigned long stamps[0];
-};
-
-struct recent_table {
- struct list_head list;
- char name[IPT_RECENT_NAME_LEN];
-#ifdef CONFIG_PROC_FS
- struct proc_dir_entry *proc;
-#endif
- unsigned int refcnt;
- unsigned int entries;
- struct list_head lru_list;
- struct list_head iphash[0];
-};
-
-static LIST_HEAD(tables);
-static DEFINE_SPINLOCK(recent_lock);
-static DEFINE_MUTEX(recent_mutex);
-
-#ifdef CONFIG_PROC_FS
-static struct proc_dir_entry *proc_dir;
-static struct file_operations recent_fops;
-#endif
-
-static u_int32_t hash_rnd;
-static int hash_rnd_initted;
-
-static unsigned int recent_entry_hash(__be32 addr)
-{
- if (!hash_rnd_initted) {
- get_random_bytes(&hash_rnd, 4);
- hash_rnd_initted = 1;
- }
- return jhash_1word((__force u32)addr, hash_rnd) & (ip_list_hash_size - 1);
-}
-
-static struct recent_entry *
-recent_entry_lookup(const struct recent_table *table, __be32 addr, u_int8_t ttl)
-{
- struct recent_entry *e;
- unsigned int h;
-
- h = recent_entry_hash(addr);
- list_for_each_entry(e, &table->iphash[h], list)
- if (e->addr == addr && (ttl == e->ttl || !ttl || !e->ttl))
- return e;
- return NULL;
-}
-
-static void recent_entry_remove(struct recent_table *t, struct recent_entry *e)
-{
- list_del(&e->list);
- list_del(&e->lru_list);
- kfree(e);
- t->entries--;
-}
-
-static struct recent_entry *
-recent_entry_init(struct recent_table *t, __be32 addr, u_int8_t ttl)
-{
- struct recent_entry *e;
-
- if (t->entries >= ip_list_tot) {
- e = list_entry(t->lru_list.next, struct recent_entry, lru_list);
- recent_entry_remove(t, e);
- }
- e = kmalloc(sizeof(*e) + sizeof(e->stamps[0]) * ip_pkt_list_tot,
- GFP_ATOMIC);
- if (e == NULL)
- return NULL;
- e->addr = addr;
- e->ttl = ttl;
- e->stamps[0] = jiffies;
- e->nstamps = 1;
- e->index = 1;
- list_add_tail(&e->list, &t->iphash[recent_entry_hash(addr)]);
- list_add_tail(&e->lru_list, &t->lru_list);
- t->entries++;
- return e;
-}
-
-static void recent_entry_update(struct recent_table *t, struct recent_entry *e)
-{
- e->stamps[e->index++] = jiffies;
- if (e->index > e->nstamps)
- e->nstamps = e->index;
- e->index %= ip_pkt_list_tot;
- list_move_tail(&e->lru_list, &t->lru_list);
-}
-
-static struct recent_table *recent_table_lookup(const char *name)
-{
- struct recent_table *t;
-
- list_for_each_entry(t, &tables, list)
- if (!strcmp(t->name, name))
- return t;
- return NULL;
-}
-
-static void recent_table_flush(struct recent_table *t)
-{
- struct recent_entry *e, *next;
- unsigned int i;
-
- for (i = 0; i < ip_list_hash_size; i++) {
- list_for_each_entry_safe(e, next, &t->iphash[i], list)
- recent_entry_remove(t, e);
- }
-}
-
-static int
-ipt_recent_match(const struct sk_buff *skb,
- const struct net_device *in, const struct net_device *out,
- const struct xt_match *match, const void *matchinfo,
- int offset, unsigned int protoff, int *hotdrop)
-{
- const struct ipt_recent_info *info = matchinfo;
- struct recent_table *t;
- struct recent_entry *e;
- __be32 addr;
- u_int8_t ttl;
- int ret = info->invert;
-
- if (info->side == IPT_RECENT_DEST)
- addr = skb->nh.iph->daddr;
- else
- addr = skb->nh.iph->saddr;
-
- ttl = skb->nh.iph->ttl;
- /* use TTL as seen before forwarding */
- if (out && !skb->sk)
- ttl++;
-
- spin_lock_bh(&recent_lock);
- t = recent_table_lookup(info->name);
- e = recent_entry_lookup(t, addr,
- info->check_set & IPT_RECENT_TTL ? ttl : 0);
- if (e == NULL) {
- if (!(info->check_set & IPT_RECENT_SET))
- goto out;
- e = recent_entry_init(t, addr, ttl);
- if (e == NULL)
- *hotdrop = 1;
- ret ^= 1;
- goto out;
- }
-
- if (info->check_set & IPT_RECENT_SET)
- ret ^= 1;
- else if (info->check_set & IPT_RECENT_REMOVE) {
- recent_entry_remove(t, e);
- ret ^= 1;
- } else if (info->check_set & (IPT_RECENT_CHECK | IPT_RECENT_UPDATE)) {
- unsigned long t = jiffies - info->seconds * HZ;
- unsigned int i, hits = 0;
-
- for (i = 0; i < e->nstamps; i++) {
- if (info->seconds && time_after(t, e->stamps[i]))
- continue;
- if (++hits >= info->hit_count) {
- ret ^= 1;
- break;
- }
- }
- }
-
- if (info->check_set & IPT_RECENT_SET ||
- (info->check_set & IPT_RECENT_UPDATE && ret)) {
- recent_entry_update(t, e);
- e->ttl = ttl;
- }
-out:
- spin_unlock_bh(&recent_lock);
- return ret;
-}
-
-static int
-ipt_recent_checkentry(const char *tablename, const void *ip,
- const struct xt_match *match, void *matchinfo,
- unsigned int hook_mask)
-{
- const struct ipt_recent_info *info = matchinfo;
- struct recent_table *t;
- unsigned i;
- int ret = 0;
-
- if (hweight8(info->check_set &
- (IPT_RECENT_SET | IPT_RECENT_REMOVE |
- IPT_RECENT_CHECK | IPT_RECENT_UPDATE)) != 1)
- return 0;
- if ((info->check_set & (IPT_RECENT_SET | IPT_RECENT_REMOVE)) &&
- (info->seconds || info->hit_count))
- return 0;
- if (info->name[0] == '\0' ||
- strnlen(info->name, IPT_RECENT_NAME_LEN) == IPT_RECENT_NAME_LEN)
- return 0;
-
- mutex_lock(&recent_mutex);
- t = recent_table_lookup(info->name);
- if (t != NULL) {
- t->refcnt++;
- ret = 1;
- goto out;
- }
-
- t = kzalloc(sizeof(*t) + sizeof(t->iphash[0]) * ip_list_hash_size,
- GFP_KERNEL);
- if (t == NULL)
- goto out;
- t->refcnt = 1;
- strcpy(t->name, info->name);
- INIT_LIST_HEAD(&t->lru_list);
- for (i = 0; i < ip_list_hash_size; i++)
- INIT_LIST_HEAD(&t->iphash[i]);
-#ifdef CONFIG_PROC_FS
- t->proc = create_proc_entry(t->name, ip_list_perms, proc_dir);
- if (t->proc == NULL) {
- kfree(t);
- goto out;
- }
- t->proc->proc_fops = &recent_fops;
- t->proc->uid = ip_list_uid;
- t->proc->gid = ip_list_gid;
- t->proc->data = t;
-#endif
- spin_lock_bh(&recent_lock);
- list_add_tail(&t->list, &tables);
- spin_unlock_bh(&recent_lock);
- ret = 1;
-out:
- mutex_unlock(&recent_mutex);
- return ret;
-}
-
-static void
-ipt_recent_destroy(const struct xt_match *match, void *matchinfo)
-{
- const struct ipt_recent_info *info = matchinfo;
- struct recent_table *t;
-
- mutex_lock(&recent_mutex);
- t = recent_table_lookup(info->name);
- if (--t->refcnt == 0) {
- spin_lock_bh(&recent_lock);
- list_del(&t->list);
- spin_unlock_bh(&recent_lock);
- recent_table_flush(t);
-#ifdef CONFIG_PROC_FS
- remove_proc_entry(t->name, proc_dir);
-#endif
- kfree(t);
- }
- mutex_unlock(&recent_mutex);
-}
-
-#ifdef CONFIG_PROC_FS
-struct recent_iter_state {
- struct recent_table *table;
- unsigned int bucket;
-};
-
-static void *recent_seq_start(struct seq_file *seq, loff_t *pos)
-{
- struct recent_iter_state *st = seq->private;
- struct recent_table *t = st->table;
- struct recent_entry *e;
- loff_t p = *pos;
-
- spin_lock_bh(&recent_lock);
-
- for (st->bucket = 0; st->bucket < ip_list_hash_size; st->bucket++) {
- list_for_each_entry(e, &t->iphash[st->bucket], list) {
- if (p-- == 0)
- return e;
- }
- }
- return NULL;
-}
-
-static void *recent_seq_next(struct seq_file *seq, void *v, loff_t *pos)
-{
- struct recent_iter_state *st = seq->private;
- struct recent_table *t = st->table;
- struct recent_entry *e = v;
- struct list_head *head = e->list.next;
-
- while (head == &t->iphash[st->bucket]) {
- if (++st->bucket >= ip_list_hash_size)
- return NULL;
- head = t->iphash[st->bucket].next;
- }
- (*pos)++;
- return list_entry(head, struct recent_entry, list);
-}
-
-static void recent_seq_stop(struct seq_file *s, void *v)
-{
- spin_unlock_bh(&recent_lock);
-}
-
-static int recent_seq_show(struct seq_file *seq, void *v)
-{
- struct recent_entry *e = v;
- unsigned int i;
-
- i = (e->index - 1) % ip_pkt_list_tot;
- seq_printf(seq, "src=%u.%u.%u.%u ttl: %u last_seen: %lu oldest_pkt: %u",
- NIPQUAD(e->addr), e->ttl, e->stamps[i], e->index);
- for (i = 0; i < e->nstamps; i++)
- seq_printf(seq, "%s %lu", i ? "," : "", e->stamps[i]);
- seq_printf(seq, "\n");
- return 0;
-}
-
-static struct seq_operations recent_seq_ops = {
- .start = recent_seq_start,
- .next = recent_seq_next,
- .stop = recent_seq_stop,
- .show = recent_seq_show,
-};
-
-static int recent_seq_open(struct inode *inode, struct file *file)
-{
- struct proc_dir_entry *pde = PDE(inode);
- struct seq_file *seq;
- struct recent_iter_state *st;
- int ret;
-
- st = kzalloc(sizeof(*st), GFP_KERNEL);
- if (st == NULL)
- return -ENOMEM;
- ret = seq_open(file, &recent_seq_ops);
- if (ret)
- kfree(st);
- st->table = pde->data;
- seq = file->private_data;
- seq->private = st;
- return ret;
-}
-
-static ssize_t recent_proc_write(struct file *file, const char __user *input,
- size_t size, loff_t *loff)
-{
- struct proc_dir_entry *pde = PDE(file->f_dentry->d_inode);
- struct recent_table *t = pde->data;
- struct recent_entry *e;
- char buf[sizeof("+255.255.255.255")], *c = buf;
- __be32 addr;
- int add;
-
- if (size > sizeof(buf))
- size = sizeof(buf);
- if (copy_from_user(buf, input, size))
- return -EFAULT;
- while (isspace(*c))
- c++;
-
- if (size - (c - buf) < 5)
- return c - buf;
- if (!strncmp(c, "clear", 5)) {
- c += 5;
- spin_lock_bh(&recent_lock);
- recent_table_flush(t);
- spin_unlock_bh(&recent_lock);
- return c - buf;
- }
-
- switch (*c) {
- case '-':
- add = 0;
- c++;
- break;
- case '+':
- c++;
- default:
- add = 1;
- break;
- }
- addr = in_aton(c);
-
- spin_lock_bh(&recent_lock);
- e = recent_entry_lookup(t, addr, 0);
- if (e == NULL) {
- if (add)
- recent_entry_init(t, addr, 0);
- } else {
- if (add)
- recent_entry_update(t, e);
- else
- recent_entry_remove(t, e);
- }
- spin_unlock_bh(&recent_lock);
- return size;
-}
-
-static struct file_operations recent_fops = {
- .open = recent_seq_open,
- .read = seq_read,
- .write = recent_proc_write,
- .release = seq_release_private,
- .owner = THIS_MODULE,
-};
-#endif /* CONFIG_PROC_FS */
-
-static struct ipt_match recent_match = {
- .name = "recent",
- .match = ipt_recent_match,
- .matchsize = sizeof(struct ipt_recent_info),
- .checkentry = ipt_recent_checkentry,
- .destroy = ipt_recent_destroy,
- .me = THIS_MODULE,
-};
-
-static int __init ipt_recent_init(void)
-{
- int err;
-
- if (!ip_list_tot || !ip_pkt_list_tot || ip_pkt_list_tot > 255)
- return -EINVAL;
- ip_list_hash_size = 1 << fls(ip_list_tot);
-
- err = ipt_register_match(&recent_match);
-#ifdef CONFIG_PROC_FS
- if (err)
- return err;
- proc_dir = proc_mkdir("ipt_recent", proc_net);
- if (proc_dir == NULL) {
- ipt_unregister_match(&recent_match);
- err = -ENOMEM;
- }
-#endif
- return err;
-}
-
-static void __exit ipt_recent_exit(void)
-{
- BUG_ON(!list_empty(&tables));
- ipt_unregister_match(&recent_match);
-#ifdef CONFIG_PROC_FS
- remove_proc_entry("ipt_recent", proc_net);
-#endif
-}
-
-module_init(ipt_recent_init);
-module_exit(ipt_recent_exit);
diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c
new file mode 100644
index 000000000000..6d9bf5106868
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_rpfilter.c
@@ -0,0 +1,127 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2011 Florian Westphal <fw@strlen.de>
+ *
+ * based on fib_frontend.c; Author: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/ip.h>
+#include <net/flow.h>
+#include <net/ip.h>
+#include <net/ip_fib.h>
+#include <net/route.h>
+
+#include <linux/netfilter/xt_rpfilter.h>
+#include <linux/netfilter/x_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
+MODULE_DESCRIPTION("iptables: ipv4 reverse path filter match");
+
+/* don't try to find route from mcast/bcast/zeronet */
+static __be32 rpfilter_get_saddr(__be32 addr)
+{
+ if (ipv4_is_multicast(addr) || ipv4_is_lbcast(addr) ||
+ ipv4_is_zeronet(addr))
+ return 0;
+ return addr;
+}
+
+static bool rpfilter_lookup_reverse(struct net *net, struct flowi4 *fl4,
+ const struct net_device *dev, u8 flags)
+{
+ struct fib_result res;
+
+ if (fib_lookup(net, fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE))
+ return false;
+
+ if (res.type != RTN_UNICAST) {
+ if (res.type != RTN_LOCAL || !(flags & XT_RPFILTER_ACCEPT_LOCAL))
+ return false;
+ }
+ return fib_info_nh_uses_dev(res.fi, dev) || flags & XT_RPFILTER_LOOSE;
+}
+
+static bool
+rpfilter_is_loopback(const struct sk_buff *skb, const struct net_device *in)
+{
+ return skb->pkt_type == PACKET_LOOPBACK || in->flags & IFF_LOOPBACK;
+}
+
+static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_rpfilter_info *info;
+ const struct iphdr *iph;
+ struct flowi4 flow;
+ bool invert;
+
+ info = par->matchinfo;
+ invert = info->flags & XT_RPFILTER_INVERT;
+
+ if (rpfilter_is_loopback(skb, xt_in(par)))
+ return true ^ invert;
+
+ iph = ip_hdr(skb);
+ if (ipv4_is_zeronet(iph->saddr)) {
+ if (ipv4_is_lbcast(iph->daddr) ||
+ ipv4_is_local_multicast(iph->daddr))
+ return true ^ invert;
+ }
+
+ memset(&flow, 0, sizeof(flow));
+ flow.flowi4_iif = LOOPBACK_IFINDEX;
+ flow.daddr = iph->saddr;
+ flow.saddr = rpfilter_get_saddr(iph->daddr);
+ flow.flowi4_mark = info->flags & XT_RPFILTER_VALID_MARK ? skb->mark : 0;
+ flow.flowi4_dscp = ip4h_dscp(iph);
+ flow.flowi4_scope = RT_SCOPE_UNIVERSE;
+ flow.flowi4_l3mdev = l3mdev_master_ifindex_rcu(xt_in(par));
+ flow.flowi4_uid = sock_net_uid(xt_net(par), NULL);
+
+ return rpfilter_lookup_reverse(xt_net(par), &flow, xt_in(par), info->flags) ^ invert;
+}
+
+static int rpfilter_check(const struct xt_mtchk_param *par)
+{
+ const struct xt_rpfilter_info *info = par->matchinfo;
+ unsigned int options = ~XT_RPFILTER_OPTION_MASK;
+ if (info->flags & options) {
+ pr_info_ratelimited("unknown options\n");
+ return -EINVAL;
+ }
+
+ if (strcmp(par->table, "mangle") != 0 &&
+ strcmp(par->table, "raw") != 0) {
+ pr_info_ratelimited("only valid in \'raw\' or \'mangle\' table, not \'%s\'\n",
+ par->table);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static struct xt_match rpfilter_mt_reg __read_mostly = {
+ .name = "rpfilter",
+ .family = NFPROTO_IPV4,
+ .checkentry = rpfilter_check,
+ .match = rpfilter_mt,
+ .matchsize = sizeof(struct xt_rpfilter_info),
+ .hooks = (1 << NF_INET_PRE_ROUTING),
+ .me = THIS_MODULE
+};
+
+static int __init rpfilter_mt_init(void)
+{
+ return xt_register_match(&rpfilter_mt_reg);
+}
+
+static void __exit rpfilter_mt_exit(void)
+{
+ xt_unregister_match(&rpfilter_mt_reg);
+}
+
+module_init(rpfilter_mt_init);
+module_exit(rpfilter_mt_exit);
diff --git a/net/ipv4/netfilter/ipt_tos.c b/net/ipv4/netfilter/ipt_tos.c
deleted file mode 100644
index 5549c39c7851..000000000000
--- a/net/ipv4/netfilter/ipt_tos.c
+++ /dev/null
@@ -1,53 +0,0 @@
-/* Kernel module to match TOS values. */
-
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/module.h>
-#include <linux/skbuff.h>
-
-#include <linux/netfilter_ipv4/ipt_tos.h>
-#include <linux/netfilter_ipv4/ip_tables.h>
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("iptables TOS match module");
-
-static int
-match(const struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- const struct xt_match *match,
- const void *matchinfo,
- int offset,
- unsigned int protoff,
- int *hotdrop)
-{
- const struct ipt_tos_info *info = matchinfo;
-
- return (skb->nh.iph->tos == info->tos) ^ info->invert;
-}
-
-static struct ipt_match tos_match = {
- .name = "tos",
- .match = match,
- .matchsize = sizeof(struct ipt_tos_info),
- .me = THIS_MODULE,
-};
-
-static int __init ipt_multiport_init(void)
-{
- return ipt_register_match(&tos_match);
-}
-
-static void __exit ipt_multiport_fini(void)
-{
- ipt_unregister_match(&tos_match);
-}
-
-module_init(ipt_multiport_init);
-module_exit(ipt_multiport_fini);
diff --git a/net/ipv4/netfilter/ipt_ttl.c b/net/ipv4/netfilter/ipt_ttl.c
deleted file mode 100644
index a5243bdb87d7..000000000000
--- a/net/ipv4/netfilter/ipt_ttl.c
+++ /dev/null
@@ -1,70 +0,0 @@
-/* IP tables module for matching the value of the TTL
- *
- * ipt_ttl.c,v 1.5 2000/11/13 11:16:08 laforge Exp
- *
- * (C) 2000,2001 by Harald Welte <laforge@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/module.h>
-#include <linux/skbuff.h>
-
-#include <linux/netfilter_ipv4/ipt_ttl.h>
-#include <linux/netfilter_ipv4/ip_tables.h>
-
-MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
-MODULE_DESCRIPTION("IP tables TTL matching module");
-MODULE_LICENSE("GPL");
-
-static int match(const struct sk_buff *skb,
- const struct net_device *in, const struct net_device *out,
- const struct xt_match *match, const void *matchinfo,
- int offset, unsigned int protoff, int *hotdrop)
-{
- const struct ipt_ttl_info *info = matchinfo;
-
- switch (info->mode) {
- case IPT_TTL_EQ:
- return (skb->nh.iph->ttl == info->ttl);
- break;
- case IPT_TTL_NE:
- return (!(skb->nh.iph->ttl == info->ttl));
- break;
- case IPT_TTL_LT:
- return (skb->nh.iph->ttl < info->ttl);
- break;
- case IPT_TTL_GT:
- return (skb->nh.iph->ttl > info->ttl);
- break;
- default:
- printk(KERN_WARNING "ipt_ttl: unknown mode %d\n",
- info->mode);
- return 0;
- }
-
- return 0;
-}
-
-static struct ipt_match ttl_match = {
- .name = "ttl",
- .match = match,
- .matchsize = sizeof(struct ipt_ttl_info),
- .me = THIS_MODULE,
-};
-
-static int __init ipt_ttl_init(void)
-{
- return ipt_register_match(&ttl_match);
-}
-
-static void __exit ipt_ttl_fini(void)
-{
- ipt_unregister_match(&ttl_match);
-
-}
-
-module_init(ipt_ttl_init);
-module_exit(ipt_ttl_fini);
diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c
index e2e7dd8d7903..3ab908b74795 100644
--- a/net/ipv4/netfilter/iptable_filter.c
+++ b/net/ipv4/netfilter/iptable_filter.c
@@ -1,177 +1,109 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* This is the 1999 rewrite of IP Firewalling, aiming for kernel 2.3.x.
*
* Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
* Copyright (C) 2000-2004 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
*/
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/slab.h>
+#include <net/ip.h>
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
MODULE_DESCRIPTION("iptables filter table");
-#define FILTER_VALID_HOOKS ((1 << NF_IP_LOCAL_IN) | (1 << NF_IP_FORWARD) | (1 << NF_IP_LOCAL_OUT))
-
-static struct
-{
- struct ipt_replace repl;
- struct ipt_standard entries[3];
- struct ipt_error term;
-} initial_table __initdata
-= { { "filter", FILTER_VALID_HOOKS, 4,
- sizeof(struct ipt_standard) * 3 + sizeof(struct ipt_error),
- { [NF_IP_LOCAL_IN] = 0,
- [NF_IP_FORWARD] = sizeof(struct ipt_standard),
- [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 2 },
- { [NF_IP_LOCAL_IN] = 0,
- [NF_IP_FORWARD] = sizeof(struct ipt_standard),
- [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 2 },
- 0, NULL, { } },
- {
- /* LOCAL_IN */
- { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
- 0,
- sizeof(struct ipt_entry),
- sizeof(struct ipt_standard),
- 0, { 0, 0 }, { } },
- { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } },
- -NF_ACCEPT - 1 } },
- /* FORWARD */
- { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
- 0,
- sizeof(struct ipt_entry),
- sizeof(struct ipt_standard),
- 0, { 0, 0 }, { } },
- { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } },
- -NF_ACCEPT - 1 } },
- /* LOCAL_OUT */
- { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
- 0,
- sizeof(struct ipt_entry),
- sizeof(struct ipt_standard),
- 0, { 0, 0 }, { } },
- { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } },
- -NF_ACCEPT - 1 } }
- },
- /* ERROR */
- { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
- 0,
- sizeof(struct ipt_entry),
- sizeof(struct ipt_error),
- 0, { 0, 0 }, { } },
- { { { { IPT_ALIGN(sizeof(struct ipt_error_target)), IPT_ERROR_TARGET } },
- { } },
- "ERROR"
- }
- }
-};
+#define FILTER_VALID_HOOKS ((1 << NF_INET_LOCAL_IN) | \
+ (1 << NF_INET_FORWARD) | \
+ (1 << NF_INET_LOCAL_OUT))
-static struct ipt_table packet_filter = {
+static const struct xt_table packet_filter = {
.name = "filter",
.valid_hooks = FILTER_VALID_HOOKS,
- .lock = RW_LOCK_UNLOCKED,
.me = THIS_MODULE,
- .af = AF_INET,
+ .af = NFPROTO_IPV4,
+ .priority = NF_IP_PRI_FILTER,
};
-/* The work comes in here from netfilter.c. */
-static unsigned int
-ipt_hook(unsigned int hook,
- struct sk_buff **pskb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
+static struct nf_hook_ops *filter_ops __read_mostly;
+
+/* Default to forward because I got too much mail already. */
+static bool forward __read_mostly = true;
+module_param(forward, bool, 0000);
+
+static int iptable_filter_table_init(struct net *net)
{
- return ipt_do_table(pskb, hook, in, out, &packet_filter);
+ struct ipt_replace *repl;
+ int err;
+
+ repl = ipt_alloc_initial_table(&packet_filter);
+ if (repl == NULL)
+ return -ENOMEM;
+ /* Entry 1 is the FORWARD hook */
+ ((struct ipt_standard *)repl->entries)[1].target.verdict =
+ forward ? -NF_ACCEPT - 1 : NF_DROP - 1;
+
+ err = ipt_register_table(net, &packet_filter, repl, filter_ops);
+ kfree(repl);
+ return err;
}
-static unsigned int
-ipt_local_out_hook(unsigned int hook,
- struct sk_buff **pskb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
+static int __net_init iptable_filter_net_init(struct net *net)
{
- /* root is playing with raw sockets. */
- if ((*pskb)->len < sizeof(struct iphdr)
- || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) {
- if (net_ratelimit())
- printk("ipt_hook: happy cracking.\n");
- return NF_ACCEPT;
- }
+ if (!forward)
+ return iptable_filter_table_init(net);
- return ipt_do_table(pskb, hook, in, out, &packet_filter);
+ return 0;
}
-static struct nf_hook_ops ipt_ops[] = {
- {
- .hook = ipt_hook,
- .owner = THIS_MODULE,
- .pf = PF_INET,
- .hooknum = NF_IP_LOCAL_IN,
- .priority = NF_IP_PRI_FILTER,
- },
- {
- .hook = ipt_hook,
- .owner = THIS_MODULE,
- .pf = PF_INET,
- .hooknum = NF_IP_FORWARD,
- .priority = NF_IP_PRI_FILTER,
- },
- {
- .hook = ipt_local_out_hook,
- .owner = THIS_MODULE,
- .pf = PF_INET,
- .hooknum = NF_IP_LOCAL_OUT,
- .priority = NF_IP_PRI_FILTER,
- },
-};
-
-/* Default to forward because I got too much mail already. */
-static int forward = NF_ACCEPT;
-module_param(forward, bool, 0000);
+static void __net_exit iptable_filter_net_pre_exit(struct net *net)
+{
+ ipt_unregister_table_pre_exit(net, "filter");
+}
-static int __init iptable_filter_init(void)
+static void __net_exit iptable_filter_net_exit(struct net *net)
{
- int ret;
+ ipt_unregister_table_exit(net, "filter");
+}
- if (forward < 0 || forward > NF_MAX_VERDICT) {
- printk("iptables forward must be 0 or 1\n");
- return -EINVAL;
- }
+static struct pernet_operations iptable_filter_net_ops = {
+ .init = iptable_filter_net_init,
+ .pre_exit = iptable_filter_net_pre_exit,
+ .exit = iptable_filter_net_exit,
+};
- /* Entry 1 is the FORWARD hook */
- initial_table.entries[1].target.verdict = -forward - 1;
+static int __init iptable_filter_init(void)
+{
+ int ret = xt_register_template(&packet_filter,
+ iptable_filter_table_init);
- /* Register table */
- ret = ipt_register_table(&packet_filter, &initial_table.repl);
if (ret < 0)
return ret;
- /* Register hooks */
- ret = nf_register_hooks(ipt_ops, ARRAY_SIZE(ipt_ops));
- if (ret < 0)
- goto cleanup_table;
+ filter_ops = xt_hook_ops_alloc(&packet_filter, ipt_do_table);
+ if (IS_ERR(filter_ops)) {
+ xt_unregister_template(&packet_filter);
+ return PTR_ERR(filter_ops);
+ }
- return ret;
+ ret = register_pernet_subsys(&iptable_filter_net_ops);
+ if (ret < 0) {
+ xt_unregister_template(&packet_filter);
+ kfree(filter_ops);
+ return ret;
+ }
- cleanup_table:
- ipt_unregister_table(&packet_filter);
- return ret;
+ return 0;
}
static void __exit iptable_filter_fini(void)
{
- nf_unregister_hooks(ipt_ops, ARRAY_SIZE(ipt_ops));
- ipt_unregister_table(&packet_filter);
+ unregister_pernet_subsys(&iptable_filter_net_ops);
+ xt_unregister_template(&packet_filter);
+ kfree(filter_ops);
}
module_init(iptable_filter_init);
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
index af2939889444..385d945d8ebe 100644
--- a/net/ipv4/netfilter/iptable_mangle.c
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -1,229 +1,143 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* This is the 1999 rewrite of IP Firewalling, aiming for kernel 2.3.x.
*
* Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
* Copyright (C) 2000-2004 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Extended to all five netfilter hooks by Brad Chapman & Harald Welte
*/
#include <linux/module.h>
#include <linux/netfilter_ipv4/ip_tables.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
+#include <linux/slab.h>
#include <net/sock.h>
#include <net/route.h>
#include <linux/ip.h>
+#include <net/ip.h>
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
MODULE_DESCRIPTION("iptables mangle table");
-#define MANGLE_VALID_HOOKS ((1 << NF_IP_PRE_ROUTING) | \
- (1 << NF_IP_LOCAL_IN) | \
- (1 << NF_IP_FORWARD) | \
- (1 << NF_IP_LOCAL_OUT) | \
- (1 << NF_IP_POST_ROUTING))
-
-/* Ouch - five different hooks? Maybe this should be a config option..... -- BC */
-static struct
-{
- struct ipt_replace repl;
- struct ipt_standard entries[5];
- struct ipt_error term;
-} initial_table __initdata
-= { { "mangle", MANGLE_VALID_HOOKS, 6,
- sizeof(struct ipt_standard) * 5 + sizeof(struct ipt_error),
- { [NF_IP_PRE_ROUTING] = 0,
- [NF_IP_LOCAL_IN] = sizeof(struct ipt_standard),
- [NF_IP_FORWARD] = sizeof(struct ipt_standard) * 2,
- [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 3,
- [NF_IP_POST_ROUTING] = sizeof(struct ipt_standard) * 4 },
- { [NF_IP_PRE_ROUTING] = 0,
- [NF_IP_LOCAL_IN] = sizeof(struct ipt_standard),
- [NF_IP_FORWARD] = sizeof(struct ipt_standard) * 2,
- [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 3,
- [NF_IP_POST_ROUTING] = sizeof(struct ipt_standard) * 4 },
- 0, NULL, { } },
- {
- /* PRE_ROUTING */
- { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
- 0,
- sizeof(struct ipt_entry),
- sizeof(struct ipt_standard),
- 0, { 0, 0 }, { } },
- { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } },
- -NF_ACCEPT - 1 } },
- /* LOCAL_IN */
- { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
- 0,
- sizeof(struct ipt_entry),
- sizeof(struct ipt_standard),
- 0, { 0, 0 }, { } },
- { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } },
- -NF_ACCEPT - 1 } },
- /* FORWARD */
- { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
- 0,
- sizeof(struct ipt_entry),
- sizeof(struct ipt_standard),
- 0, { 0, 0 }, { } },
- { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } },
- -NF_ACCEPT - 1 } },
- /* LOCAL_OUT */
- { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
- 0,
- sizeof(struct ipt_entry),
- sizeof(struct ipt_standard),
- 0, { 0, 0 }, { } },
- { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } },
- -NF_ACCEPT - 1 } },
- /* POST_ROUTING */
- { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
- 0,
- sizeof(struct ipt_entry),
- sizeof(struct ipt_standard),
- 0, { 0, 0 }, { } },
- { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } },
- -NF_ACCEPT - 1 } },
- },
- /* ERROR */
- { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
- 0,
- sizeof(struct ipt_entry),
- sizeof(struct ipt_error),
- 0, { 0, 0 }, { } },
- { { { { IPT_ALIGN(sizeof(struct ipt_error_target)), IPT_ERROR_TARGET } },
- { } },
- "ERROR"
- }
- }
-};
+#define MANGLE_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | \
+ (1 << NF_INET_LOCAL_IN) | \
+ (1 << NF_INET_FORWARD) | \
+ (1 << NF_INET_LOCAL_OUT) | \
+ (1 << NF_INET_POST_ROUTING))
-static struct ipt_table packet_mangler = {
+static const struct xt_table packet_mangler = {
.name = "mangle",
.valid_hooks = MANGLE_VALID_HOOKS,
- .lock = RW_LOCK_UNLOCKED,
.me = THIS_MODULE,
- .af = AF_INET,
+ .af = NFPROTO_IPV4,
+ .priority = NF_IP_PRI_MANGLE,
};
-/* The work comes in here from netfilter.c. */
static unsigned int
-ipt_route_hook(unsigned int hook,
- struct sk_buff **pskb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
+ipt_mangle_out(void *priv, struct sk_buff *skb, const struct nf_hook_state *state)
{
- return ipt_do_table(pskb, hook, in, out, &packet_mangler);
+ unsigned int ret, verdict;
+ const struct iphdr *iph;
+ __be32 saddr, daddr;
+ u32 mark;
+ int err;
+ u8 tos;
+
+ /* Save things which could affect route */
+ mark = skb->mark;
+ iph = ip_hdr(skb);
+ saddr = iph->saddr;
+ daddr = iph->daddr;
+ tos = iph->tos;
+
+ ret = ipt_do_table(priv, skb, state);
+ verdict = ret & NF_VERDICT_MASK;
+ /* Reroute for ANY change. */
+ if (verdict != NF_DROP && verdict != NF_STOLEN) {
+ iph = ip_hdr(skb);
+
+ if (iph->saddr != saddr ||
+ iph->daddr != daddr ||
+ skb->mark != mark ||
+ iph->tos != tos) {
+ err = ip_route_me_harder(state->net, state->sk, skb, RTN_UNSPEC);
+ if (err < 0)
+ ret = NF_DROP_ERR(err);
+ }
+ }
+
+ return ret;
}
+/* The work comes in here from netfilter.c. */
static unsigned int
-ipt_local_hook(unsigned int hook,
- struct sk_buff **pskb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
+iptable_mangle_hook(void *priv,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
{
- unsigned int ret;
- u_int8_t tos;
- __be32 saddr, daddr;
- u_int32_t mark;
-
- /* root is playing with raw sockets. */
- if ((*pskb)->len < sizeof(struct iphdr)
- || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) {
- if (net_ratelimit())
- printk("ipt_hook: happy cracking.\n");
- return NF_ACCEPT;
- }
-
- /* Save things which could affect route */
- mark = (*pskb)->mark;
- saddr = (*pskb)->nh.iph->saddr;
- daddr = (*pskb)->nh.iph->daddr;
- tos = (*pskb)->nh.iph->tos;
+ if (state->hook == NF_INET_LOCAL_OUT)
+ return ipt_mangle_out(priv, skb, state);
+ return ipt_do_table(priv, skb, state);
+}
- ret = ipt_do_table(pskb, hook, in, out, &packet_mangler);
- /* Reroute for ANY change. */
- if (ret != NF_DROP && ret != NF_STOLEN && ret != NF_QUEUE
- && ((*pskb)->nh.iph->saddr != saddr
- || (*pskb)->nh.iph->daddr != daddr
- || (*pskb)->mark != mark
- || (*pskb)->nh.iph->tos != tos))
- if (ip_route_me_harder(pskb, RTN_UNSPEC))
- ret = NF_DROP;
+static struct nf_hook_ops *mangle_ops __read_mostly;
+static int iptable_mangle_table_init(struct net *net)
+{
+ struct ipt_replace *repl;
+ int ret;
+ repl = ipt_alloc_initial_table(&packet_mangler);
+ if (repl == NULL)
+ return -ENOMEM;
+ ret = ipt_register_table(net, &packet_mangler, repl, mangle_ops);
+ kfree(repl);
return ret;
}
-static struct nf_hook_ops ipt_ops[] = {
- {
- .hook = ipt_route_hook,
- .owner = THIS_MODULE,
- .pf = PF_INET,
- .hooknum = NF_IP_PRE_ROUTING,
- .priority = NF_IP_PRI_MANGLE,
- },
- {
- .hook = ipt_route_hook,
- .owner = THIS_MODULE,
- .pf = PF_INET,
- .hooknum = NF_IP_LOCAL_IN,
- .priority = NF_IP_PRI_MANGLE,
- },
- {
- .hook = ipt_route_hook,
- .owner = THIS_MODULE,
- .pf = PF_INET,
- .hooknum = NF_IP_FORWARD,
- .priority = NF_IP_PRI_MANGLE,
- },
- {
- .hook = ipt_local_hook,
- .owner = THIS_MODULE,
- .pf = PF_INET,
- .hooknum = NF_IP_LOCAL_OUT,
- .priority = NF_IP_PRI_MANGLE,
- },
- {
- .hook = ipt_route_hook,
- .owner = THIS_MODULE,
- .pf = PF_INET,
- .hooknum = NF_IP_POST_ROUTING,
- .priority = NF_IP_PRI_MANGLE,
- },
+static void __net_exit iptable_mangle_net_pre_exit(struct net *net)
+{
+ ipt_unregister_table_pre_exit(net, "mangle");
+}
+
+static void __net_exit iptable_mangle_net_exit(struct net *net)
+{
+ ipt_unregister_table_exit(net, "mangle");
+}
+
+static struct pernet_operations iptable_mangle_net_ops = {
+ .pre_exit = iptable_mangle_net_pre_exit,
+ .exit = iptable_mangle_net_exit,
};
static int __init iptable_mangle_init(void)
{
- int ret;
-
- /* Register table */
- ret = ipt_register_table(&packet_mangler, &initial_table.repl);
+ int ret = xt_register_template(&packet_mangler,
+ iptable_mangle_table_init);
if (ret < 0)
return ret;
- /* Register hooks */
- ret = nf_register_hooks(ipt_ops, ARRAY_SIZE(ipt_ops));
- if (ret < 0)
- goto cleanup_table;
+ mangle_ops = xt_hook_ops_alloc(&packet_mangler, iptable_mangle_hook);
+ if (IS_ERR(mangle_ops)) {
+ xt_unregister_template(&packet_mangler);
+ ret = PTR_ERR(mangle_ops);
+ return ret;
+ }
- return ret;
+ ret = register_pernet_subsys(&iptable_mangle_net_ops);
+ if (ret < 0) {
+ xt_unregister_template(&packet_mangler);
+ kfree(mangle_ops);
+ return ret;
+ }
- cleanup_table:
- ipt_unregister_table(&packet_mangler);
return ret;
}
static void __exit iptable_mangle_fini(void)
{
- nf_unregister_hooks(ipt_ops, ARRAY_SIZE(ipt_ops));
- ipt_unregister_table(&packet_mangler);
+ unregister_pernet_subsys(&iptable_mangle_net_ops);
+ xt_unregister_template(&packet_mangler);
+ kfree(mangle_ops);
}
module_init(iptable_mangle_init);
diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c
new file mode 100644
index 000000000000..a5db7c67d61b
--- /dev/null
+++ b/net/ipv4/netfilter/iptable_nat.c
@@ -0,0 +1,175 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2011 Patrick McHardy <kaber@trash.net>
+ */
+
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/ip.h>
+#include <net/ip.h>
+
+#include <net/netfilter/nf_nat.h>
+
+struct iptable_nat_pernet {
+ struct nf_hook_ops *nf_nat_ops;
+};
+
+static unsigned int iptable_nat_net_id __read_mostly;
+
+static const struct xt_table nf_nat_ipv4_table = {
+ .name = "nat",
+ .valid_hooks = (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_POST_ROUTING) |
+ (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_LOCAL_IN),
+ .me = THIS_MODULE,
+ .af = NFPROTO_IPV4,
+};
+
+static const struct nf_hook_ops nf_nat_ipv4_ops[] = {
+ {
+ .hook = ipt_do_table,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_PRE_ROUTING,
+ .priority = NF_IP_PRI_NAT_DST,
+ },
+ {
+ .hook = ipt_do_table,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_POST_ROUTING,
+ .priority = NF_IP_PRI_NAT_SRC,
+ },
+ {
+ .hook = ipt_do_table,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_LOCAL_OUT,
+ .priority = NF_IP_PRI_NAT_DST,
+ },
+ {
+ .hook = ipt_do_table,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_LOCAL_IN,
+ .priority = NF_IP_PRI_NAT_SRC,
+ },
+};
+
+static int ipt_nat_register_lookups(struct net *net)
+{
+ struct iptable_nat_pernet *xt_nat_net;
+ struct nf_hook_ops *ops;
+ struct xt_table *table;
+ int i, ret;
+
+ xt_nat_net = net_generic(net, iptable_nat_net_id);
+ table = xt_find_table(net, NFPROTO_IPV4, "nat");
+ if (WARN_ON_ONCE(!table))
+ return -ENOENT;
+
+ ops = kmemdup(nf_nat_ipv4_ops, sizeof(nf_nat_ipv4_ops), GFP_KERNEL);
+ if (!ops)
+ return -ENOMEM;
+
+ for (i = 0; i < ARRAY_SIZE(nf_nat_ipv4_ops); i++) {
+ ops[i].priv = table;
+ ret = nf_nat_ipv4_register_fn(net, &ops[i]);
+ if (ret) {
+ while (i)
+ nf_nat_ipv4_unregister_fn(net, &ops[--i]);
+
+ kfree(ops);
+ return ret;
+ }
+ }
+
+ xt_nat_net->nf_nat_ops = ops;
+ return 0;
+}
+
+static void ipt_nat_unregister_lookups(struct net *net)
+{
+ struct iptable_nat_pernet *xt_nat_net = net_generic(net, iptable_nat_net_id);
+ struct nf_hook_ops *ops = xt_nat_net->nf_nat_ops;
+ int i;
+
+ if (!ops)
+ return;
+
+ for (i = 0; i < ARRAY_SIZE(nf_nat_ipv4_ops); i++)
+ nf_nat_ipv4_unregister_fn(net, &ops[i]);
+
+ kfree(ops);
+}
+
+static int iptable_nat_table_init(struct net *net)
+{
+ struct ipt_replace *repl;
+ int ret;
+
+ repl = ipt_alloc_initial_table(&nf_nat_ipv4_table);
+ if (repl == NULL)
+ return -ENOMEM;
+
+ ret = ipt_register_table(net, &nf_nat_ipv4_table, repl, NULL);
+ if (ret < 0) {
+ kfree(repl);
+ return ret;
+ }
+
+ ret = ipt_nat_register_lookups(net);
+ if (ret < 0)
+ ipt_unregister_table_exit(net, "nat");
+
+ kfree(repl);
+ return ret;
+}
+
+static void __net_exit iptable_nat_net_pre_exit(struct net *net)
+{
+ ipt_nat_unregister_lookups(net);
+}
+
+static void __net_exit iptable_nat_net_exit(struct net *net)
+{
+ ipt_unregister_table_exit(net, "nat");
+}
+
+static struct pernet_operations iptable_nat_net_ops = {
+ .pre_exit = iptable_nat_net_pre_exit,
+ .exit = iptable_nat_net_exit,
+ .id = &iptable_nat_net_id,
+ .size = sizeof(struct iptable_nat_pernet),
+};
+
+static int __init iptable_nat_init(void)
+{
+ int ret;
+
+ /* net->gen->ptr[iptable_nat_net_id] must be allocated
+ * before calling iptable_nat_table_init().
+ */
+ ret = register_pernet_subsys(&iptable_nat_net_ops);
+ if (ret < 0)
+ return ret;
+
+ ret = xt_register_template(&nf_nat_ipv4_table,
+ iptable_nat_table_init);
+ if (ret < 0)
+ unregister_pernet_subsys(&iptable_nat_net_ops);
+
+ return ret;
+}
+
+static void __exit iptable_nat_exit(void)
+{
+ xt_unregister_template(&nf_nat_ipv4_table);
+ unregister_pernet_subsys(&iptable_nat_net_ops);
+}
+
+module_init(iptable_nat_init);
+module_exit(iptable_nat_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("iptables legacy nat table");
diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c
index bcbeb4aeacd9..0e7f53964d0a 100644
--- a/net/ipv4/netfilter/iptable_raw.c
+++ b/net/ipv4/netfilter/iptable_raw.c
@@ -1,148 +1,111 @@
-/*
+// SPDX-License-Identifier: GPL-2.0-only
+/*
* 'raw' table, which is the very first hooked in at PRE_ROUTING and LOCAL_OUT .
*
- * Copyright (C) 2003 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ * Copyright (C) 2003 Jozsef Kadlecsik <kadlec@netfilter.org>
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/slab.h>
+#include <net/ip.h>
-#define RAW_VALID_HOOKS ((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_OUT))
+#define RAW_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT))
-static struct
-{
- struct ipt_replace repl;
- struct ipt_standard entries[2];
- struct ipt_error term;
-} initial_table __initdata = {
- .repl = {
- .name = "raw",
- .valid_hooks = RAW_VALID_HOOKS,
- .num_entries = 3,
- .size = sizeof(struct ipt_standard) * 2 + sizeof(struct ipt_error),
- .hook_entry = {
- [NF_IP_PRE_ROUTING] = 0,
- [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) },
- .underflow = {
- [NF_IP_PRE_ROUTING] = 0,
- [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) },
- },
- .entries = {
- /* PRE_ROUTING */
- {
- .entry = {
- .target_offset = sizeof(struct ipt_entry),
- .next_offset = sizeof(struct ipt_standard),
- },
- .target = {
- .target = {
- .u = {
- .target_size = IPT_ALIGN(sizeof(struct ipt_standard_target)),
- },
- },
- .verdict = -NF_ACCEPT - 1,
- },
- },
-
- /* LOCAL_OUT */
- {
- .entry = {
- .target_offset = sizeof(struct ipt_entry),
- .next_offset = sizeof(struct ipt_standard),
- },
- .target = {
- .target = {
- .u = {
- .target_size = IPT_ALIGN(sizeof(struct ipt_standard_target)),
- },
- },
- .verdict = -NF_ACCEPT - 1,
- },
- },
- },
- /* ERROR */
- .term = {
- .entry = {
- .target_offset = sizeof(struct ipt_entry),
- .next_offset = sizeof(struct ipt_error),
- },
- .target = {
- .target = {
- .u = {
- .user = {
- .target_size = IPT_ALIGN(sizeof(struct ipt_error_target)),
- .name = IPT_ERROR_TARGET,
- },
- },
- },
- .errorname = "ERROR",
- },
- }
+static bool raw_before_defrag __read_mostly;
+MODULE_PARM_DESC(raw_before_defrag, "Enable raw table before defrag");
+module_param(raw_before_defrag, bool, 0000);
+
+static const struct xt_table packet_raw = {
+ .name = "raw",
+ .valid_hooks = RAW_VALID_HOOKS,
+ .me = THIS_MODULE,
+ .af = NFPROTO_IPV4,
+ .priority = NF_IP_PRI_RAW,
};
-static struct ipt_table packet_raw = {
- .name = "raw",
- .valid_hooks = RAW_VALID_HOOKS,
- .lock = RW_LOCK_UNLOCKED,
+static const struct xt_table packet_raw_before_defrag = {
+ .name = "raw",
+ .valid_hooks = RAW_VALID_HOOKS,
.me = THIS_MODULE,
- .af = AF_INET,
+ .af = NFPROTO_IPV4,
+ .priority = NF_IP_PRI_RAW_BEFORE_DEFRAG,
};
-/* The work comes in here from netfilter.c. */
-static unsigned int
-ipt_hook(unsigned int hook,
- struct sk_buff **pskb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
+static struct nf_hook_ops *rawtable_ops __read_mostly;
+
+static int iptable_raw_table_init(struct net *net)
+{
+ struct ipt_replace *repl;
+ const struct xt_table *table = &packet_raw;
+ int ret;
+
+ if (raw_before_defrag)
+ table = &packet_raw_before_defrag;
+
+ repl = ipt_alloc_initial_table(table);
+ if (repl == NULL)
+ return -ENOMEM;
+ ret = ipt_register_table(net, table, repl, rawtable_ops);
+ kfree(repl);
+ return ret;
+}
+
+static void __net_exit iptable_raw_net_pre_exit(struct net *net)
{
- return ipt_do_table(pskb, hook, in, out, &packet_raw);
+ ipt_unregister_table_pre_exit(net, "raw");
}
-/* 'raw' is the very first table. */
-static struct nf_hook_ops ipt_ops[] = {
- {
- .hook = ipt_hook,
- .pf = PF_INET,
- .hooknum = NF_IP_PRE_ROUTING,
- .priority = NF_IP_PRI_RAW,
- .owner = THIS_MODULE,
- },
- {
- .hook = ipt_hook,
- .pf = PF_INET,
- .hooknum = NF_IP_LOCAL_OUT,
- .priority = NF_IP_PRI_RAW,
- .owner = THIS_MODULE,
- },
+static void __net_exit iptable_raw_net_exit(struct net *net)
+{
+ ipt_unregister_table_exit(net, "raw");
+}
+
+static struct pernet_operations iptable_raw_net_ops = {
+ .pre_exit = iptable_raw_net_pre_exit,
+ .exit = iptable_raw_net_exit,
};
static int __init iptable_raw_init(void)
{
int ret;
+ const struct xt_table *table = &packet_raw;
+
+ if (raw_before_defrag) {
+ table = &packet_raw_before_defrag;
+
+ pr_info("Enabling raw table before defrag\n");
+ }
- /* Register table */
- ret = ipt_register_table(&packet_raw, &initial_table.repl);
+ ret = xt_register_template(table,
+ iptable_raw_table_init);
if (ret < 0)
return ret;
- /* Register hooks */
- ret = nf_register_hooks(ipt_ops, ARRAY_SIZE(ipt_ops));
- if (ret < 0)
- goto cleanup_table;
+ rawtable_ops = xt_hook_ops_alloc(table, ipt_do_table);
+ if (IS_ERR(rawtable_ops)) {
+ xt_unregister_template(table);
+ return PTR_ERR(rawtable_ops);
+ }
- return ret;
+ ret = register_pernet_subsys(&iptable_raw_net_ops);
+ if (ret < 0) {
+ xt_unregister_template(table);
+ kfree(rawtable_ops);
+ return ret;
+ }
- cleanup_table:
- ipt_unregister_table(&packet_raw);
return ret;
}
static void __exit iptable_raw_fini(void)
{
- nf_unregister_hooks(ipt_ops, ARRAY_SIZE(ipt_ops));
- ipt_unregister_table(&packet_raw);
+ unregister_pernet_subsys(&iptable_raw_net_ops);
+ kfree(rawtable_ops);
+ xt_unregister_template(&packet_raw);
}
module_init(iptable_raw_init);
module_exit(iptable_raw_fini);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("iptables legacy raw table");
diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c
new file mode 100644
index 000000000000..d885443cb267
--- /dev/null
+++ b/net/ipv4/netfilter/iptable_security.c
@@ -0,0 +1,98 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * "security" table
+ *
+ * This is for use by Mandatory Access Control (MAC) security models,
+ * which need to be able to manage security policy in separate context
+ * to DAC.
+ *
+ * Based on iptable_mangle.c
+ *
+ * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
+ * Copyright (C) 2000-2004 Netfilter Core Team <coreteam <at> netfilter.org>
+ * Copyright (C) 2008 Red Hat, Inc., James Morris <jmorris <at> redhat.com>
+ */
+#include <linux/module.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/slab.h>
+#include <net/ip.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("James Morris <jmorris <at> redhat.com>");
+MODULE_DESCRIPTION("iptables security table, for MAC rules");
+
+#define SECURITY_VALID_HOOKS (1 << NF_INET_LOCAL_IN) | \
+ (1 << NF_INET_FORWARD) | \
+ (1 << NF_INET_LOCAL_OUT)
+
+static const struct xt_table security_table = {
+ .name = "security",
+ .valid_hooks = SECURITY_VALID_HOOKS,
+ .me = THIS_MODULE,
+ .af = NFPROTO_IPV4,
+ .priority = NF_IP_PRI_SECURITY,
+};
+
+static struct nf_hook_ops *sectbl_ops __read_mostly;
+
+static int iptable_security_table_init(struct net *net)
+{
+ struct ipt_replace *repl;
+ int ret;
+
+ repl = ipt_alloc_initial_table(&security_table);
+ if (repl == NULL)
+ return -ENOMEM;
+ ret = ipt_register_table(net, &security_table, repl, sectbl_ops);
+ kfree(repl);
+ return ret;
+}
+
+static void __net_exit iptable_security_net_pre_exit(struct net *net)
+{
+ ipt_unregister_table_pre_exit(net, "security");
+}
+
+static void __net_exit iptable_security_net_exit(struct net *net)
+{
+ ipt_unregister_table_exit(net, "security");
+}
+
+static struct pernet_operations iptable_security_net_ops = {
+ .pre_exit = iptable_security_net_pre_exit,
+ .exit = iptable_security_net_exit,
+};
+
+static int __init iptable_security_init(void)
+{
+ int ret = xt_register_template(&security_table,
+ iptable_security_table_init);
+
+ if (ret < 0)
+ return ret;
+
+ sectbl_ops = xt_hook_ops_alloc(&security_table, ipt_do_table);
+ if (IS_ERR(sectbl_ops)) {
+ xt_unregister_template(&security_table);
+ return PTR_ERR(sectbl_ops);
+ }
+
+ ret = register_pernet_subsys(&iptable_security_net_ops);
+ if (ret < 0) {
+ xt_unregister_template(&security_table);
+ kfree(sectbl_ops);
+ return ret;
+ }
+
+ return ret;
+}
+
+static void __exit iptable_security_fini(void)
+{
+ unregister_pernet_subsys(&iptable_security_net_ops);
+ kfree(sectbl_ops);
+ xt_unregister_template(&security_table);
+}
+
+module_init(iptable_security_init);
+module_exit(iptable_security_fini);
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
deleted file mode 100644
index f24e872d4b90..000000000000
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ /dev/null
@@ -1,535 +0,0 @@
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
- * - move L3 protocol dependent part to this file.
- * 23 Mar 2004: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
- * - add get_features() to support various size of conntrack
- * structures.
- *
- * Derived from net/ipv4/netfilter/ip_conntrack_standalone.c
- */
-
-#include <linux/types.h>
-#include <linux/ip.h>
-#include <linux/netfilter.h>
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/icmp.h>
-#include <linux/sysctl.h>
-#include <net/route.h>
-#include <net/ip.h>
-
-#include <linux/netfilter_ipv4.h>
-#include <net/netfilter/nf_conntrack.h>
-#include <net/netfilter/nf_conntrack_helper.h>
-#include <net/netfilter/nf_conntrack_l4proto.h>
-#include <net/netfilter/nf_conntrack_l3proto.h>
-#include <net/netfilter/nf_conntrack_core.h>
-#include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
-
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
-static int ipv4_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff,
- struct nf_conntrack_tuple *tuple)
-{
- u_int32_t _addrs[2], *ap;
- ap = skb_header_pointer(skb, nhoff + offsetof(struct iphdr, saddr),
- sizeof(u_int32_t) * 2, _addrs);
- if (ap == NULL)
- return 0;
-
- tuple->src.u3.ip = ap[0];
- tuple->dst.u3.ip = ap[1];
-
- return 1;
-}
-
-static int ipv4_invert_tuple(struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_tuple *orig)
-{
- tuple->src.u3.ip = orig->dst.u3.ip;
- tuple->dst.u3.ip = orig->src.u3.ip;
-
- return 1;
-}
-
-static int ipv4_print_tuple(struct seq_file *s,
- const struct nf_conntrack_tuple *tuple)
-{
- return seq_printf(s, "src=%u.%u.%u.%u dst=%u.%u.%u.%u ",
- NIPQUAD(tuple->src.u3.ip),
- NIPQUAD(tuple->dst.u3.ip));
-}
-
-static int ipv4_print_conntrack(struct seq_file *s,
- const struct nf_conn *conntrack)
-{
- return 0;
-}
-
-/* Returns new sk_buff, or NULL */
-static struct sk_buff *
-nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user)
-{
- skb_orphan(skb);
-
- local_bh_disable();
- skb = ip_defrag(skb, user);
- local_bh_enable();
-
- if (skb)
- ip_send_check(skb->nh.iph);
-
- return skb;
-}
-
-static int
-ipv4_prepare(struct sk_buff **pskb, unsigned int hooknum, unsigned int *dataoff,
- u_int8_t *protonum)
-{
- /* Never happen */
- if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) {
- if (net_ratelimit()) {
- printk(KERN_ERR "ipv4_prepare: Frag of proto %u (hook=%u)\n",
- (*pskb)->nh.iph->protocol, hooknum);
- }
- return -NF_DROP;
- }
-
- *dataoff = (*pskb)->nh.raw - (*pskb)->data + (*pskb)->nh.iph->ihl*4;
- *protonum = (*pskb)->nh.iph->protocol;
-
- return NF_ACCEPT;
-}
-
-int nat_module_is_loaded = 0;
-static u_int32_t ipv4_get_features(const struct nf_conntrack_tuple *tuple)
-{
- if (nat_module_is_loaded)
- return NF_CT_F_NAT;
-
- return NF_CT_F_BASIC;
-}
-
-static unsigned int ipv4_confirm(unsigned int hooknum,
- struct sk_buff **pskb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
-{
- /* We've seen it coming out the other side: confirm it */
- return nf_conntrack_confirm(pskb);
-}
-
-static unsigned int ipv4_conntrack_help(unsigned int hooknum,
- struct sk_buff **pskb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
-{
- struct nf_conn *ct;
- enum ip_conntrack_info ctinfo;
- struct nf_conn_help *help;
-
- /* This is where we call the helper: as the packet goes out. */
- ct = nf_ct_get(*pskb, &ctinfo);
- if (!ct || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY)
- return NF_ACCEPT;
-
- help = nfct_help(ct);
- if (!help || !help->helper)
- return NF_ACCEPT;
-
- return help->helper->help(pskb,
- (*pskb)->nh.raw - (*pskb)->data
- + (*pskb)->nh.iph->ihl*4,
- ct, ctinfo);
-}
-
-static unsigned int ipv4_conntrack_defrag(unsigned int hooknum,
- struct sk_buff **pskb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
-{
-#if !defined(CONFIG_IP_NF_NAT) && !defined(CONFIG_IP_NF_NAT_MODULE)
- /* Previously seen (loopback)? Ignore. Do this before
- fragment check. */
- if ((*pskb)->nfct)
- return NF_ACCEPT;
-#endif
-
- /* Gather fragments. */
- if ((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) {
- *pskb = nf_ct_ipv4_gather_frags(*pskb,
- hooknum == NF_IP_PRE_ROUTING ?
- IP_DEFRAG_CONNTRACK_IN :
- IP_DEFRAG_CONNTRACK_OUT);
- if (!*pskb)
- return NF_STOLEN;
- }
- return NF_ACCEPT;
-}
-
-static unsigned int ipv4_conntrack_in(unsigned int hooknum,
- struct sk_buff **pskb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
-{
- return nf_conntrack_in(PF_INET, hooknum, pskb);
-}
-
-static unsigned int ipv4_conntrack_local(unsigned int hooknum,
- struct sk_buff **pskb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
-{
- /* root is playing with raw sockets. */
- if ((*pskb)->len < sizeof(struct iphdr)
- || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) {
- if (net_ratelimit())
- printk("ipt_hook: happy cracking.\n");
- return NF_ACCEPT;
- }
- return nf_conntrack_in(PF_INET, hooknum, pskb);
-}
-
-/* Connection tracking may drop packets, but never alters them, so
- make it the first hook. */
-static struct nf_hook_ops ipv4_conntrack_ops[] = {
- {
- .hook = ipv4_conntrack_defrag,
- .owner = THIS_MODULE,
- .pf = PF_INET,
- .hooknum = NF_IP_PRE_ROUTING,
- .priority = NF_IP_PRI_CONNTRACK_DEFRAG,
- },
- {
- .hook = ipv4_conntrack_in,
- .owner = THIS_MODULE,
- .pf = PF_INET,
- .hooknum = NF_IP_PRE_ROUTING,
- .priority = NF_IP_PRI_CONNTRACK,
- },
- {
- .hook = ipv4_conntrack_defrag,
- .owner = THIS_MODULE,
- .pf = PF_INET,
- .hooknum = NF_IP_LOCAL_OUT,
- .priority = NF_IP_PRI_CONNTRACK_DEFRAG,
- },
- {
- .hook = ipv4_conntrack_local,
- .owner = THIS_MODULE,
- .pf = PF_INET,
- .hooknum = NF_IP_LOCAL_OUT,
- .priority = NF_IP_PRI_CONNTRACK,
- },
- {
- .hook = ipv4_conntrack_help,
- .owner = THIS_MODULE,
- .pf = PF_INET,
- .hooknum = NF_IP_POST_ROUTING,
- .priority = NF_IP_PRI_CONNTRACK_HELPER,
- },
- {
- .hook = ipv4_conntrack_help,
- .owner = THIS_MODULE,
- .pf = PF_INET,
- .hooknum = NF_IP_LOCAL_IN,
- .priority = NF_IP_PRI_CONNTRACK_HELPER,
- },
- {
- .hook = ipv4_confirm,
- .owner = THIS_MODULE,
- .pf = PF_INET,
- .hooknum = NF_IP_POST_ROUTING,
- .priority = NF_IP_PRI_CONNTRACK_CONFIRM,
- },
- {
- .hook = ipv4_confirm,
- .owner = THIS_MODULE,
- .pf = PF_INET,
- .hooknum = NF_IP_LOCAL_IN,
- .priority = NF_IP_PRI_CONNTRACK_CONFIRM,
- },
-};
-
-#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
-static int log_invalid_proto_min = 0;
-static int log_invalid_proto_max = 255;
-
-static ctl_table ip_ct_sysctl_table[] = {
- {
- .ctl_name = NET_IPV4_NF_CONNTRACK_MAX,
- .procname = "ip_conntrack_max",
- .data = &nf_conntrack_max,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_IPV4_NF_CONNTRACK_COUNT,
- .procname = "ip_conntrack_count",
- .data = &nf_conntrack_count,
- .maxlen = sizeof(int),
- .mode = 0444,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_IPV4_NF_CONNTRACK_BUCKETS,
- .procname = "ip_conntrack_buckets",
- .data = &nf_conntrack_htable_size,
- .maxlen = sizeof(unsigned int),
- .mode = 0444,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_IPV4_NF_CONNTRACK_CHECKSUM,
- .procname = "ip_conntrack_checksum",
- .data = &nf_conntrack_checksum,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_IPV4_NF_CONNTRACK_LOG_INVALID,
- .procname = "ip_conntrack_log_invalid",
- .data = &nf_ct_log_invalid,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_minmax,
- .strategy = &sysctl_intvec,
- .extra1 = &log_invalid_proto_min,
- .extra2 = &log_invalid_proto_max,
- },
- {
- .ctl_name = 0
- }
-};
-#endif /* CONFIG_SYSCTL && CONFIG_NF_CONNTRACK_PROC_COMPAT */
-
-/* Fast function for those who don't want to parse /proc (and I don't
- blame them). */
-/* Reversing the socket's dst/src point of view gives us the reply
- mapping. */
-static int
-getorigdst(struct sock *sk, int optval, void __user *user, int *len)
-{
- struct inet_sock *inet = inet_sk(sk);
- struct nf_conntrack_tuple_hash *h;
- struct nf_conntrack_tuple tuple;
-
- NF_CT_TUPLE_U_BLANK(&tuple);
- tuple.src.u3.ip = inet->rcv_saddr;
- tuple.src.u.tcp.port = inet->sport;
- tuple.dst.u3.ip = inet->daddr;
- tuple.dst.u.tcp.port = inet->dport;
- tuple.src.l3num = PF_INET;
- tuple.dst.protonum = IPPROTO_TCP;
-
- /* We only do TCP at the moment: is there a better way? */
- if (strcmp(sk->sk_prot->name, "TCP")) {
- DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
- return -ENOPROTOOPT;
- }
-
- if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
- DEBUGP("SO_ORIGINAL_DST: len %u not %u\n",
- *len, sizeof(struct sockaddr_in));
- return -EINVAL;
- }
-
- h = nf_conntrack_find_get(&tuple, NULL);
- if (h) {
- struct sockaddr_in sin;
- struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
-
- sin.sin_family = AF_INET;
- sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
- .tuple.dst.u.tcp.port;
- sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
- .tuple.dst.u3.ip;
- memset(sin.sin_zero, 0, sizeof(sin.sin_zero));
-
- DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
- NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
- nf_ct_put(ct);
- if (copy_to_user(user, &sin, sizeof(sin)) != 0)
- return -EFAULT;
- else
- return 0;
- }
- DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
- NIPQUAD(tuple.src.u3.ip), ntohs(tuple.src.u.tcp.port),
- NIPQUAD(tuple.dst.u3.ip), ntohs(tuple.dst.u.tcp.port));
- return -ENOENT;
-}
-
-#if defined(CONFIG_NF_CT_NETLINK) || \
- defined(CONFIG_NF_CT_NETLINK_MODULE)
-
-#include <linux/netfilter/nfnetlink.h>
-#include <linux/netfilter/nfnetlink_conntrack.h>
-
-static int ipv4_tuple_to_nfattr(struct sk_buff *skb,
- const struct nf_conntrack_tuple *tuple)
-{
- NFA_PUT(skb, CTA_IP_V4_SRC, sizeof(u_int32_t),
- &tuple->src.u3.ip);
- NFA_PUT(skb, CTA_IP_V4_DST, sizeof(u_int32_t),
- &tuple->dst.u3.ip);
- return 0;
-
-nfattr_failure:
- return -1;
-}
-
-static const size_t cta_min_ip[CTA_IP_MAX] = {
- [CTA_IP_V4_SRC-1] = sizeof(u_int32_t),
- [CTA_IP_V4_DST-1] = sizeof(u_int32_t),
-};
-
-static int ipv4_nfattr_to_tuple(struct nfattr *tb[],
- struct nf_conntrack_tuple *t)
-{
- if (!tb[CTA_IP_V4_SRC-1] || !tb[CTA_IP_V4_DST-1])
- return -EINVAL;
-
- if (nfattr_bad_size(tb, CTA_IP_MAX, cta_min_ip))
- return -EINVAL;
-
- t->src.u3.ip =
- *(u_int32_t *)NFA_DATA(tb[CTA_IP_V4_SRC-1]);
- t->dst.u3.ip =
- *(u_int32_t *)NFA_DATA(tb[CTA_IP_V4_DST-1]);
-
- return 0;
-}
-#endif
-
-static struct nf_sockopt_ops so_getorigdst = {
- .pf = PF_INET,
- .get_optmin = SO_ORIGINAL_DST,
- .get_optmax = SO_ORIGINAL_DST+1,
- .get = &getorigdst,
-};
-
-struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 = {
- .l3proto = PF_INET,
- .name = "ipv4",
- .pkt_to_tuple = ipv4_pkt_to_tuple,
- .invert_tuple = ipv4_invert_tuple,
- .print_tuple = ipv4_print_tuple,
- .print_conntrack = ipv4_print_conntrack,
- .prepare = ipv4_prepare,
- .get_features = ipv4_get_features,
-#if defined(CONFIG_NF_CT_NETLINK) || \
- defined(CONFIG_NF_CT_NETLINK_MODULE)
- .tuple_to_nfattr = ipv4_tuple_to_nfattr,
- .nfattr_to_tuple = ipv4_nfattr_to_tuple,
-#endif
-#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
- .ctl_table_path = nf_net_ipv4_netfilter_sysctl_path,
- .ctl_table = ip_ct_sysctl_table,
-#endif
- .me = THIS_MODULE,
-};
-
-MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET));
-MODULE_LICENSE("GPL");
-
-static int __init nf_conntrack_l3proto_ipv4_init(void)
-{
- int ret = 0;
-
- need_conntrack();
-
- ret = nf_register_sockopt(&so_getorigdst);
- if (ret < 0) {
- printk(KERN_ERR "Unable to register netfilter socket option\n");
- return ret;
- }
-
- ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_tcp4);
- if (ret < 0) {
- printk("nf_conntrack_ipv4: can't register tcp.\n");
- goto cleanup_sockopt;
- }
-
- ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_udp4);
- if (ret < 0) {
- printk("nf_conntrack_ipv4: can't register udp.\n");
- goto cleanup_tcp;
- }
-
- ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_icmp);
- if (ret < 0) {
- printk("nf_conntrack_ipv4: can't register icmp.\n");
- goto cleanup_udp;
- }
-
- ret = nf_conntrack_l3proto_register(&nf_conntrack_l3proto_ipv4);
- if (ret < 0) {
- printk("nf_conntrack_ipv4: can't register ipv4\n");
- goto cleanup_icmp;
- }
-
- ret = nf_register_hooks(ipv4_conntrack_ops,
- ARRAY_SIZE(ipv4_conntrack_ops));
- if (ret < 0) {
- printk("nf_conntrack_ipv4: can't register hooks.\n");
- goto cleanup_ipv4;
- }
-#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
- ret = nf_conntrack_ipv4_compat_init();
- if (ret < 0)
- goto cleanup_hooks;
-#endif
- return ret;
-#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
- cleanup_hooks:
- nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops));
-#endif
- cleanup_ipv4:
- nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv4);
- cleanup_icmp:
- nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmp);
- cleanup_udp:
- nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp4);
- cleanup_tcp:
- nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp4);
- cleanup_sockopt:
- nf_unregister_sockopt(&so_getorigdst);
- return ret;
-}
-
-static void __exit nf_conntrack_l3proto_ipv4_fini(void)
-{
- synchronize_net();
-#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
- nf_conntrack_ipv4_compat_fini();
-#endif
- nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops));
- nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv4);
- nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmp);
- nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp4);
- nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp4);
- nf_unregister_sockopt(&so_getorigdst);
-}
-
-module_init(nf_conntrack_l3proto_ipv4_init);
-module_exit(nf_conntrack_l3proto_ipv4_fini);
-
-EXPORT_SYMBOL(nf_ct_ipv4_gather_frags);
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
deleted file mode 100644
index 3b31bc649608..000000000000
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
+++ /dev/null
@@ -1,412 +0,0 @@
-/* ip_conntrack proc compat - based on ip_conntrack_standalone.c
- *
- * (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#include <linux/types.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-#include <linux/percpu.h>
-
-#include <linux/netfilter.h>
-#include <net/netfilter/nf_conntrack_core.h>
-#include <net/netfilter/nf_conntrack_l3proto.h>
-#include <net/netfilter/nf_conntrack_l4proto.h>
-#include <net/netfilter/nf_conntrack_expect.h>
-
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
-#ifdef CONFIG_NF_CT_ACCT
-static unsigned int
-seq_print_counters(struct seq_file *s,
- const struct ip_conntrack_counter *counter)
-{
- return seq_printf(s, "packets=%llu bytes=%llu ",
- (unsigned long long)counter->packets,
- (unsigned long long)counter->bytes);
-}
-#else
-#define seq_print_counters(x, y) 0
-#endif
-
-struct ct_iter_state {
- unsigned int bucket;
-};
-
-static struct list_head *ct_get_first(struct seq_file *seq)
-{
- struct ct_iter_state *st = seq->private;
-
- for (st->bucket = 0;
- st->bucket < nf_conntrack_htable_size;
- st->bucket++) {
- if (!list_empty(&nf_conntrack_hash[st->bucket]))
- return nf_conntrack_hash[st->bucket].next;
- }
- return NULL;
-}
-
-static struct list_head *ct_get_next(struct seq_file *seq, struct list_head *head)
-{
- struct ct_iter_state *st = seq->private;
-
- head = head->next;
- while (head == &nf_conntrack_hash[st->bucket]) {
- if (++st->bucket >= nf_conntrack_htable_size)
- return NULL;
- head = nf_conntrack_hash[st->bucket].next;
- }
- return head;
-}
-
-static struct list_head *ct_get_idx(struct seq_file *seq, loff_t pos)
-{
- struct list_head *head = ct_get_first(seq);
-
- if (head)
- while (pos && (head = ct_get_next(seq, head)))
- pos--;
- return pos ? NULL : head;
-}
-
-static void *ct_seq_start(struct seq_file *seq, loff_t *pos)
-{
- read_lock_bh(&nf_conntrack_lock);
- return ct_get_idx(seq, *pos);
-}
-
-static void *ct_seq_next(struct seq_file *s, void *v, loff_t *pos)
-{
- (*pos)++;
- return ct_get_next(s, v);
-}
-
-static void ct_seq_stop(struct seq_file *s, void *v)
-{
- read_unlock_bh(&nf_conntrack_lock);
-}
-
-static int ct_seq_show(struct seq_file *s, void *v)
-{
- const struct nf_conntrack_tuple_hash *hash = v;
- const struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(hash);
- struct nf_conntrack_l3proto *l3proto;
- struct nf_conntrack_l4proto *l4proto;
-
- NF_CT_ASSERT(ct);
-
- /* we only want to print DIR_ORIGINAL */
- if (NF_CT_DIRECTION(hash))
- return 0;
- if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num != AF_INET)
- return 0;
-
- l3proto = __nf_ct_l3proto_find(ct->tuplehash[IP_CT_DIR_ORIGINAL]
- .tuple.src.l3num);
- NF_CT_ASSERT(l3proto);
- l4proto = __nf_ct_l4proto_find(ct->tuplehash[IP_CT_DIR_ORIGINAL]
- .tuple.src.l3num,
- ct->tuplehash[IP_CT_DIR_ORIGINAL]
- .tuple.dst.protonum);
- NF_CT_ASSERT(l4proto);
-
- if (seq_printf(s, "%-8s %u %ld ",
- l4proto->name,
- ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum,
- timer_pending(&ct->timeout)
- ? (long)(ct->timeout.expires - jiffies)/HZ : 0) != 0)
- return -ENOSPC;
-
- if (l3proto->print_conntrack(s, ct))
- return -ENOSPC;
-
- if (l4proto->print_conntrack(s, ct))
- return -ENOSPC;
-
- if (print_tuple(s, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
- l3proto, l4proto))
- return -ENOSPC;
-
- if (seq_print_counters(s, &ct->counters[IP_CT_DIR_ORIGINAL]))
- return -ENOSPC;
-
- if (!(test_bit(IPS_SEEN_REPLY_BIT, &ct->status)))
- if (seq_printf(s, "[UNREPLIED] "))
- return -ENOSPC;
-
- if (print_tuple(s, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
- l3proto, l4proto))
- return -ENOSPC;
-
- if (seq_print_counters(s, &ct->counters[IP_CT_DIR_REPLY]))
- return -ENOSPC;
-
- if (test_bit(IPS_ASSURED_BIT, &ct->status))
- if (seq_printf(s, "[ASSURED] "))
- return -ENOSPC;
-
-#ifdef CONFIG_NF_CONNTRACK_MARK
- if (seq_printf(s, "mark=%u ", ct->mark))
- return -ENOSPC;
-#endif
-
-#ifdef CONFIG_NF_CONNTRACK_SECMARK
- if (seq_printf(s, "secmark=%u ", ct->secmark))
- return -ENOSPC;
-#endif
-
- if (seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use)))
- return -ENOSPC;
-
- return 0;
-}
-
-static struct seq_operations ct_seq_ops = {
- .start = ct_seq_start,
- .next = ct_seq_next,
- .stop = ct_seq_stop,
- .show = ct_seq_show
-};
-
-static int ct_open(struct inode *inode, struct file *file)
-{
- struct seq_file *seq;
- struct ct_iter_state *st;
- int ret;
-
- st = kmalloc(sizeof(struct ct_iter_state), GFP_KERNEL);
- if (st == NULL)
- return -ENOMEM;
- ret = seq_open(file, &ct_seq_ops);
- if (ret)
- goto out_free;
- seq = file->private_data;
- seq->private = st;
- memset(st, 0, sizeof(struct ct_iter_state));
- return ret;
-out_free:
- kfree(st);
- return ret;
-}
-
-static struct file_operations ct_file_ops = {
- .owner = THIS_MODULE,
- .open = ct_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release_private,
-};
-
-/* expects */
-static void *exp_seq_start(struct seq_file *s, loff_t *pos)
-{
- struct list_head *e = &nf_conntrack_expect_list;
- loff_t i;
-
- /* strange seq_file api calls stop even if we fail,
- * thus we need to grab lock since stop unlocks */
- read_lock_bh(&nf_conntrack_lock);
-
- if (list_empty(e))
- return NULL;
-
- for (i = 0; i <= *pos; i++) {
- e = e->next;
- if (e == &nf_conntrack_expect_list)
- return NULL;
- }
- return e;
-}
-
-static void *exp_seq_next(struct seq_file *s, void *v, loff_t *pos)
-{
- struct list_head *e = v;
-
- ++*pos;
- e = e->next;
-
- if (e == &nf_conntrack_expect_list)
- return NULL;
-
- return e;
-}
-
-static void exp_seq_stop(struct seq_file *s, void *v)
-{
- read_unlock_bh(&nf_conntrack_lock);
-}
-
-static int exp_seq_show(struct seq_file *s, void *v)
-{
- struct nf_conntrack_expect *exp = v;
-
- if (exp->tuple.src.l3num != AF_INET)
- return 0;
-
- if (exp->timeout.function)
- seq_printf(s, "%ld ", timer_pending(&exp->timeout)
- ? (long)(exp->timeout.expires - jiffies)/HZ : 0);
- else
- seq_printf(s, "- ");
-
- seq_printf(s, "proto=%u ", exp->tuple.dst.protonum);
-
- print_tuple(s, &exp->tuple,
- __nf_ct_l3proto_find(exp->tuple.src.l3num),
- __nf_ct_l4proto_find(exp->tuple.src.l3num,
- exp->tuple.dst.protonum));
- return seq_putc(s, '\n');
-}
-
-static struct seq_operations exp_seq_ops = {
- .start = exp_seq_start,
- .next = exp_seq_next,
- .stop = exp_seq_stop,
- .show = exp_seq_show
-};
-
-static int exp_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &exp_seq_ops);
-}
-
-static struct file_operations ip_exp_file_ops = {
- .owner = THIS_MODULE,
- .open = exp_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release
-};
-
-static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos)
-{
- int cpu;
-
- if (*pos == 0)
- return SEQ_START_TOKEN;
-
- for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
- if (!cpu_possible(cpu))
- continue;
- *pos = cpu+1;
- return &per_cpu(nf_conntrack_stat, cpu);
- }
-
- return NULL;
-}
-
-static void *ct_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
-{
- int cpu;
-
- for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
- if (!cpu_possible(cpu))
- continue;
- *pos = cpu+1;
- return &per_cpu(nf_conntrack_stat, cpu);
- }
-
- return NULL;
-}
-
-static void ct_cpu_seq_stop(struct seq_file *seq, void *v)
-{
-}
-
-static int ct_cpu_seq_show(struct seq_file *seq, void *v)
-{
- unsigned int nr_conntracks = atomic_read(&nf_conntrack_count);
- struct ip_conntrack_stat *st = v;
-
- if (v == SEQ_START_TOKEN) {
- seq_printf(seq, "entries searched found new invalid ignore delete delete_list insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete\n");
- return 0;
- }
-
- seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x "
- "%08x %08x %08x %08x %08x %08x %08x %08x \n",
- nr_conntracks,
- st->searched,
- st->found,
- st->new,
- st->invalid,
- st->ignore,
- st->delete,
- st->delete_list,
- st->insert,
- st->insert_failed,
- st->drop,
- st->early_drop,
- st->error,
-
- st->expect_new,
- st->expect_create,
- st->expect_delete
- );
- return 0;
-}
-
-static struct seq_operations ct_cpu_seq_ops = {
- .start = ct_cpu_seq_start,
- .next = ct_cpu_seq_next,
- .stop = ct_cpu_seq_stop,
- .show = ct_cpu_seq_show,
-};
-
-static int ct_cpu_seq_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &ct_cpu_seq_ops);
-}
-
-static struct file_operations ct_cpu_seq_fops = {
- .owner = THIS_MODULE,
- .open = ct_cpu_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release_private,
-};
-
-int __init nf_conntrack_ipv4_compat_init(void)
-{
- struct proc_dir_entry *proc, *proc_exp, *proc_stat;
-
- proc = proc_net_fops_create("ip_conntrack", 0440, &ct_file_ops);
- if (!proc)
- goto err1;
-
- proc_exp = proc_net_fops_create("ip_conntrack_expect", 0440,
- &ip_exp_file_ops);
- if (!proc_exp)
- goto err2;
-
- proc_stat = create_proc_entry("ip_conntrack", S_IRUGO, proc_net_stat);
- if (!proc_stat)
- goto err3;
-
- proc_stat->proc_fops = &ct_cpu_seq_fops;
- proc_stat->owner = THIS_MODULE;
-
- return 0;
-
-err3:
- proc_net_remove("ip_conntrack_expect");
-err2:
- proc_net_remove("ip_conntrack");
-err1:
- return -ENOMEM;
-}
-
-void __exit nf_conntrack_ipv4_compat_fini(void)
-{
- remove_proc_entry("ip_conntrack", proc_net_stat);
- proc_net_remove("ip_conntrack_expect");
- proc_net_remove("ip_conntrack");
-}
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
deleted file mode 100644
index 46aa44abc078..000000000000
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ /dev/null
@@ -1,384 +0,0 @@
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
- * - enable working with Layer 3 protocol independent connection tracking.
- *
- * Derived from net/ipv4/netfilter/ip_conntrack_proto_icmp.c
- */
-
-#include <linux/types.h>
-#include <linux/sched.h>
-#include <linux/timer.h>
-#include <linux/netfilter.h>
-#include <linux/in.h>
-#include <linux/icmp.h>
-#include <linux/seq_file.h>
-#include <net/ip.h>
-#include <net/checksum.h>
-#include <linux/netfilter_ipv4.h>
-#include <net/netfilter/nf_conntrack_tuple.h>
-#include <net/netfilter/nf_conntrack_l4proto.h>
-#include <net/netfilter/nf_conntrack_core.h>
-
-static unsigned long nf_ct_icmp_timeout __read_mostly = 30*HZ;
-
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
-static int icmp_pkt_to_tuple(const struct sk_buff *skb,
- unsigned int dataoff,
- struct nf_conntrack_tuple *tuple)
-{
- struct icmphdr _hdr, *hp;
-
- hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
- if (hp == NULL)
- return 0;
-
- tuple->dst.u.icmp.type = hp->type;
- tuple->src.u.icmp.id = hp->un.echo.id;
- tuple->dst.u.icmp.code = hp->code;
-
- return 1;
-}
-
-/* Add 1; spaces filled with 0. */
-static const u_int8_t invmap[] = {
- [ICMP_ECHO] = ICMP_ECHOREPLY + 1,
- [ICMP_ECHOREPLY] = ICMP_ECHO + 1,
- [ICMP_TIMESTAMP] = ICMP_TIMESTAMPREPLY + 1,
- [ICMP_TIMESTAMPREPLY] = ICMP_TIMESTAMP + 1,
- [ICMP_INFO_REQUEST] = ICMP_INFO_REPLY + 1,
- [ICMP_INFO_REPLY] = ICMP_INFO_REQUEST + 1,
- [ICMP_ADDRESS] = ICMP_ADDRESSREPLY + 1,
- [ICMP_ADDRESSREPLY] = ICMP_ADDRESS + 1
-};
-
-static int icmp_invert_tuple(struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_tuple *orig)
-{
- if (orig->dst.u.icmp.type >= sizeof(invmap)
- || !invmap[orig->dst.u.icmp.type])
- return 0;
-
- tuple->src.u.icmp.id = orig->src.u.icmp.id;
- tuple->dst.u.icmp.type = invmap[orig->dst.u.icmp.type] - 1;
- tuple->dst.u.icmp.code = orig->dst.u.icmp.code;
- return 1;
-}
-
-/* Print out the per-protocol part of the tuple. */
-static int icmp_print_tuple(struct seq_file *s,
- const struct nf_conntrack_tuple *tuple)
-{
- return seq_printf(s, "type=%u code=%u id=%u ",
- tuple->dst.u.icmp.type,
- tuple->dst.u.icmp.code,
- ntohs(tuple->src.u.icmp.id));
-}
-
-/* Print out the private part of the conntrack. */
-static int icmp_print_conntrack(struct seq_file *s,
- const struct nf_conn *conntrack)
-{
- return 0;
-}
-
-/* Returns verdict for packet, or -1 for invalid. */
-static int icmp_packet(struct nf_conn *ct,
- const struct sk_buff *skb,
- unsigned int dataoff,
- enum ip_conntrack_info ctinfo,
- int pf,
- unsigned int hooknum)
-{
- /* Try to delete connection immediately after all replies:
- won't actually vanish as we still have skb, and del_timer
- means this will only run once even if count hits zero twice
- (theoretically possible with SMP) */
- if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) {
- if (atomic_dec_and_test(&ct->proto.icmp.count)
- && del_timer(&ct->timeout))
- ct->timeout.function((unsigned long)ct);
- } else {
- atomic_inc(&ct->proto.icmp.count);
- nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb);
- nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_icmp_timeout);
- }
-
- return NF_ACCEPT;
-}
-
-/* Called when a new connection for this protocol found. */
-static int icmp_new(struct nf_conn *conntrack,
- const struct sk_buff *skb, unsigned int dataoff)
-{
- static const u_int8_t valid_new[] = {
- [ICMP_ECHO] = 1,
- [ICMP_TIMESTAMP] = 1,
- [ICMP_INFO_REQUEST] = 1,
- [ICMP_ADDRESS] = 1
- };
-
- if (conntrack->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new)
- || !valid_new[conntrack->tuplehash[0].tuple.dst.u.icmp.type]) {
- /* Can't create a new ICMP `conn' with this. */
- DEBUGP("icmp: can't create new conn with type %u\n",
- conntrack->tuplehash[0].tuple.dst.u.icmp.type);
- NF_CT_DUMP_TUPLE(&conntrack->tuplehash[0].tuple);
- return 0;
- }
- atomic_set(&conntrack->proto.icmp.count, 0);
- return 1;
-}
-
-extern struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4;
-/* Returns conntrack if it dealt with ICMP, and filled in skb fields */
-static int
-icmp_error_message(struct sk_buff *skb,
- enum ip_conntrack_info *ctinfo,
- unsigned int hooknum)
-{
- struct nf_conntrack_tuple innertuple, origtuple;
- struct {
- struct icmphdr icmp;
- struct iphdr ip;
- } _in, *inside;
- struct nf_conntrack_l4proto *innerproto;
- struct nf_conntrack_tuple_hash *h;
- int dataoff;
-
- NF_CT_ASSERT(skb->nfct == NULL);
-
- /* Not enough header? */
- inside = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_in), &_in);
- if (inside == NULL)
- return -NF_ACCEPT;
-
- /* Ignore ICMP's containing fragments (shouldn't happen) */
- if (inside->ip.frag_off & htons(IP_OFFSET)) {
- DEBUGP("icmp_error_message: fragment of proto %u\n",
- inside->ip.protocol);
- return -NF_ACCEPT;
- }
-
- innerproto = __nf_ct_l4proto_find(PF_INET, inside->ip.protocol);
- dataoff = skb->nh.iph->ihl*4 + sizeof(inside->icmp);
- /* Are they talking about one of our connections? */
- if (!nf_ct_get_tuple(skb, dataoff, dataoff + inside->ip.ihl*4, PF_INET,
- inside->ip.protocol, &origtuple,
- &nf_conntrack_l3proto_ipv4, innerproto)) {
- DEBUGP("icmp_error_message: ! get_tuple p=%u",
- inside->ip.protocol);
- return -NF_ACCEPT;
- }
-
- /* Ordinarily, we'd expect the inverted tupleproto, but it's
- been preserved inside the ICMP. */
- if (!nf_ct_invert_tuple(&innertuple, &origtuple,
- &nf_conntrack_l3proto_ipv4, innerproto)) {
- DEBUGP("icmp_error_message: no match\n");
- return -NF_ACCEPT;
- }
-
- *ctinfo = IP_CT_RELATED;
-
- h = nf_conntrack_find_get(&innertuple, NULL);
- if (!h) {
- /* Locally generated ICMPs will match inverted if they
- haven't been SNAT'ed yet */
- /* FIXME: NAT code has to handle half-done double NAT --RR */
- if (hooknum == NF_IP_LOCAL_OUT)
- h = nf_conntrack_find_get(&origtuple, NULL);
-
- if (!h) {
- DEBUGP("icmp_error_message: no match\n");
- return -NF_ACCEPT;
- }
-
- /* Reverse direction from that found */
- if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY)
- *ctinfo += IP_CT_IS_REPLY;
- } else {
- if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY)
- *ctinfo += IP_CT_IS_REPLY;
- }
-
- /* Update skb to refer to this connection */
- skb->nfct = &nf_ct_tuplehash_to_ctrack(h)->ct_general;
- skb->nfctinfo = *ctinfo;
- return -NF_ACCEPT;
-}
-
-/* Small and modified version of icmp_rcv */
-static int
-icmp_error(struct sk_buff *skb, unsigned int dataoff,
- enum ip_conntrack_info *ctinfo, int pf, unsigned int hooknum)
-{
- struct icmphdr _ih, *icmph;
-
- /* Not enough header? */
- icmph = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_ih), &_ih);
- if (icmph == NULL) {
- if (LOG_INVALID(IPPROTO_ICMP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
- "nf_ct_icmp: short packet ");
- return -NF_ACCEPT;
- }
-
- /* See ip_conntrack_proto_tcp.c */
- if (nf_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING &&
- nf_ip_checksum(skb, hooknum, dataoff, 0)) {
- if (LOG_INVALID(IPPROTO_ICMP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
- "nf_ct_icmp: bad HW ICMP checksum ");
- return -NF_ACCEPT;
- }
-
- /*
- * 18 is the highest 'known' ICMP type. Anything else is a mystery
- *
- * RFC 1122: 3.2.2 Unknown ICMP messages types MUST be silently
- * discarded.
- */
- if (icmph->type > NR_ICMP_TYPES) {
- if (LOG_INVALID(IPPROTO_ICMP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
- "nf_ct_icmp: invalid ICMP type ");
- return -NF_ACCEPT;
- }
-
- /* Need to track icmp error message? */
- if (icmph->type != ICMP_DEST_UNREACH
- && icmph->type != ICMP_SOURCE_QUENCH
- && icmph->type != ICMP_TIME_EXCEEDED
- && icmph->type != ICMP_PARAMETERPROB
- && icmph->type != ICMP_REDIRECT)
- return NF_ACCEPT;
-
- return icmp_error_message(skb, ctinfo, hooknum);
-}
-
-#if defined(CONFIG_NF_CT_NETLINK) || \
- defined(CONFIG_NF_CT_NETLINK_MODULE)
-
-#include <linux/netfilter/nfnetlink.h>
-#include <linux/netfilter/nfnetlink_conntrack.h>
-
-static int icmp_tuple_to_nfattr(struct sk_buff *skb,
- const struct nf_conntrack_tuple *t)
-{
- NFA_PUT(skb, CTA_PROTO_ICMP_ID, sizeof(u_int16_t),
- &t->src.u.icmp.id);
- NFA_PUT(skb, CTA_PROTO_ICMP_TYPE, sizeof(u_int8_t),
- &t->dst.u.icmp.type);
- NFA_PUT(skb, CTA_PROTO_ICMP_CODE, sizeof(u_int8_t),
- &t->dst.u.icmp.code);
-
- return 0;
-
-nfattr_failure:
- return -1;
-}
-
-static const size_t cta_min_proto[CTA_PROTO_MAX] = {
- [CTA_PROTO_ICMP_TYPE-1] = sizeof(u_int8_t),
- [CTA_PROTO_ICMP_CODE-1] = sizeof(u_int8_t),
- [CTA_PROTO_ICMP_ID-1] = sizeof(u_int16_t)
-};
-
-static int icmp_nfattr_to_tuple(struct nfattr *tb[],
- struct nf_conntrack_tuple *tuple)
-{
- if (!tb[CTA_PROTO_ICMP_TYPE-1]
- || !tb[CTA_PROTO_ICMP_CODE-1]
- || !tb[CTA_PROTO_ICMP_ID-1])
- return -EINVAL;
-
- if (nfattr_bad_size(tb, CTA_PROTO_MAX, cta_min_proto))
- return -EINVAL;
-
- tuple->dst.u.icmp.type =
- *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_TYPE-1]);
- tuple->dst.u.icmp.code =
- *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_CODE-1]);
- tuple->src.u.icmp.id =
- *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_ICMP_ID-1]);
-
- if (tuple->dst.u.icmp.type >= sizeof(invmap)
- || !invmap[tuple->dst.u.icmp.type])
- return -EINVAL;
-
- return 0;
-}
-#endif
-
-#ifdef CONFIG_SYSCTL
-static struct ctl_table_header *icmp_sysctl_header;
-static struct ctl_table icmp_sysctl_table[] = {
- {
- .ctl_name = NET_NF_CONNTRACK_ICMP_TIMEOUT,
- .procname = "nf_conntrack_icmp_timeout",
- .data = &nf_ct_icmp_timeout,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- },
- {
- .ctl_name = 0
- }
-};
-#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
-static struct ctl_table icmp_compat_sysctl_table[] = {
- {
- .ctl_name = NET_IPV4_NF_CONNTRACK_ICMP_TIMEOUT,
- .procname = "ip_conntrack_icmp_timeout",
- .data = &nf_ct_icmp_timeout,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- },
- {
- .ctl_name = 0
- }
-};
-#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
-#endif /* CONFIG_SYSCTL */
-
-struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp =
-{
- .l3proto = PF_INET,
- .l4proto = IPPROTO_ICMP,
- .name = "icmp",
- .pkt_to_tuple = icmp_pkt_to_tuple,
- .invert_tuple = icmp_invert_tuple,
- .print_tuple = icmp_print_tuple,
- .print_conntrack = icmp_print_conntrack,
- .packet = icmp_packet,
- .new = icmp_new,
- .error = icmp_error,
- .destroy = NULL,
- .me = NULL,
-#if defined(CONFIG_NF_CT_NETLINK) || \
- defined(CONFIG_NF_CT_NETLINK_MODULE)
- .tuple_to_nfattr = icmp_tuple_to_nfattr,
- .nfattr_to_tuple = icmp_nfattr_to_tuple,
-#endif
-#ifdef CONFIG_SYSCTL
- .ctl_table_header = &icmp_sysctl_header,
- .ctl_table = icmp_sysctl_table,
-#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
- .ctl_compat_table = icmp_compat_sysctl_table,
-#endif
-#endif
-};
-
-EXPORT_SYMBOL(nf_conntrack_l4proto_icmp);
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
new file mode 100644
index 000000000000..482e733c3375
--- /dev/null
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -0,0 +1,189 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ */
+
+#include <linux/types.h>
+#include <linux/ip.h>
+#include <linux/netfilter.h>
+#include <linux/module.h>
+#include <linux/rcupdate.h>
+#include <linux/skbuff.h>
+#include <net/netns/generic.h>
+#include <net/route.h>
+#include <net/ip.h>
+
+#include <linux/netfilter_bridge.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+#include <net/netfilter/nf_conntrack.h>
+#endif
+#include <net/netfilter/nf_conntrack_zones.h>
+
+static DEFINE_MUTEX(defrag4_mutex);
+
+static int nf_ct_ipv4_gather_frags(struct net *net, struct sk_buff *skb,
+ u_int32_t user)
+{
+ int err;
+
+ local_bh_disable();
+ err = ip_defrag(net, skb, user);
+ local_bh_enable();
+
+ if (!err)
+ skb->ignore_df = 1;
+
+ return err;
+}
+
+static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum,
+ struct sk_buff *skb)
+{
+ u16 zone_id = NF_CT_DEFAULT_ZONE_ID;
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+ if (skb_nfct(skb)) {
+ enum ip_conntrack_info ctinfo;
+ const struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+
+ zone_id = nf_ct_zone_id(nf_ct_zone(ct), CTINFO2DIR(ctinfo));
+ }
+#endif
+ if (nf_bridge_in_prerouting(skb))
+ return IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone_id;
+
+ if (hooknum == NF_INET_PRE_ROUTING)
+ return IP_DEFRAG_CONNTRACK_IN + zone_id;
+ else
+ return IP_DEFRAG_CONNTRACK_OUT + zone_id;
+}
+
+static unsigned int ipv4_conntrack_defrag(void *priv,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct sock *sk = skb->sk;
+
+ if (sk && sk_fullsock(sk) && (sk->sk_family == PF_INET) &&
+ inet_test_bit(NODEFRAG, sk))
+ return NF_ACCEPT;
+
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+#if !IS_ENABLED(CONFIG_NF_NAT)
+ /* Previously seen (loopback)? Ignore. Do this before
+ fragment check. */
+ if (skb_nfct(skb) && !nf_ct_is_template((struct nf_conn *)skb_nfct(skb)))
+ return NF_ACCEPT;
+#endif
+ if (skb->_nfct == IP_CT_UNTRACKED)
+ return NF_ACCEPT;
+#endif
+ /* Gather fragments. */
+ if (ip_is_fragment(ip_hdr(skb))) {
+ enum ip_defrag_users user =
+ nf_ct_defrag_user(state->hook, skb);
+
+ if (nf_ct_ipv4_gather_frags(state->net, skb, user))
+ return NF_STOLEN;
+ }
+ return NF_ACCEPT;
+}
+
+static const struct nf_hook_ops ipv4_defrag_ops[] = {
+ {
+ .hook = ipv4_conntrack_defrag,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_PRE_ROUTING,
+ .priority = NF_IP_PRI_CONNTRACK_DEFRAG,
+ },
+ {
+ .hook = ipv4_conntrack_defrag,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_LOCAL_OUT,
+ .priority = NF_IP_PRI_CONNTRACK_DEFRAG,
+ },
+};
+
+static void __net_exit defrag4_net_exit(struct net *net)
+{
+ if (net->nf.defrag_ipv4_users) {
+ nf_unregister_net_hooks(net, ipv4_defrag_ops,
+ ARRAY_SIZE(ipv4_defrag_ops));
+ net->nf.defrag_ipv4_users = 0;
+ }
+}
+
+static const struct nf_defrag_hook defrag_hook = {
+ .owner = THIS_MODULE,
+ .enable = nf_defrag_ipv4_enable,
+ .disable = nf_defrag_ipv4_disable,
+};
+
+static struct pernet_operations defrag4_net_ops = {
+ .exit = defrag4_net_exit,
+};
+
+static int __init nf_defrag_init(void)
+{
+ int err;
+
+ err = register_pernet_subsys(&defrag4_net_ops);
+ if (err)
+ return err;
+
+ rcu_assign_pointer(nf_defrag_v4_hook, &defrag_hook);
+ return err;
+}
+
+static void __exit nf_defrag_fini(void)
+{
+ rcu_assign_pointer(nf_defrag_v4_hook, NULL);
+ unregister_pernet_subsys(&defrag4_net_ops);
+}
+
+int nf_defrag_ipv4_enable(struct net *net)
+{
+ int err = 0;
+
+ mutex_lock(&defrag4_mutex);
+ if (net->nf.defrag_ipv4_users == UINT_MAX) {
+ err = -EOVERFLOW;
+ goto out_unlock;
+ }
+
+ if (net->nf.defrag_ipv4_users) {
+ net->nf.defrag_ipv4_users++;
+ goto out_unlock;
+ }
+
+ err = nf_register_net_hooks(net, ipv4_defrag_ops,
+ ARRAY_SIZE(ipv4_defrag_ops));
+ if (err == 0)
+ net->nf.defrag_ipv4_users = 1;
+
+ out_unlock:
+ mutex_unlock(&defrag4_mutex);
+ return err;
+}
+EXPORT_SYMBOL_GPL(nf_defrag_ipv4_enable);
+
+void nf_defrag_ipv4_disable(struct net *net)
+{
+ mutex_lock(&defrag4_mutex);
+ if (net->nf.defrag_ipv4_users) {
+ net->nf.defrag_ipv4_users--;
+ if (net->nf.defrag_ipv4_users == 0)
+ nf_unregister_net_hooks(net, ipv4_defrag_ops,
+ ARRAY_SIZE(ipv4_defrag_ops));
+ }
+
+ mutex_unlock(&defrag4_mutex);
+}
+EXPORT_SYMBOL_GPL(nf_defrag_ipv4_disable);
+
+module_init(nf_defrag_init);
+module_exit(nf_defrag_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("IPv4 defragmentation support");
diff --git a/net/ipv4/netfilter/nf_dup_ipv4.c b/net/ipv4/netfilter/nf_dup_ipv4.c
new file mode 100644
index 000000000000..9a773502f10a
--- /dev/null
+++ b/net/ipv4/netfilter/nf_dup_ipv4.c
@@ -0,0 +1,103 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * (C) 2007 by Sebastian Claßen <sebastian.classen@freenet.ag>
+ * (C) 2007-2010 by Jan Engelhardt <jengelh@medozas.de>
+ *
+ * Extracted from xt_TEE.c
+ */
+#include <linux/ip.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/route.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter.h>
+#include <net/checksum.h>
+#include <net/flow.h>
+#include <net/icmp.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <net/netfilter/ipv4/nf_dup_ipv4.h>
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+#include <net/netfilter/nf_conntrack.h>
+#endif
+
+static bool nf_dup_ipv4_route(struct net *net, struct sk_buff *skb,
+ const struct in_addr *gw, int oif)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+ struct rtable *rt;
+ struct flowi4 fl4;
+
+ memset(&fl4, 0, sizeof(fl4));
+ if (oif != -1)
+ fl4.flowi4_oif = oif;
+
+ fl4.daddr = gw->s_addr;
+ fl4.flowi4_dscp = ip4h_dscp(iph);
+ fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
+ fl4.flowi4_flags = FLOWI_FLAG_KNOWN_NH;
+ rt = ip_route_output_key(net, &fl4);
+ if (IS_ERR(rt))
+ return false;
+
+ skb_dst_drop(skb);
+ skb_dst_set(skb, &rt->dst);
+ skb->dev = rt->dst.dev;
+ skb->protocol = htons(ETH_P_IP);
+
+ return true;
+}
+
+void nf_dup_ipv4(struct net *net, struct sk_buff *skb, unsigned int hooknum,
+ const struct in_addr *gw, int oif)
+{
+ struct iphdr *iph;
+
+ local_bh_disable();
+ if (current->in_nf_duplicate)
+ goto out;
+ /*
+ * Copy the skb, and route the copy. Will later return %XT_CONTINUE for
+ * the original skb, which should continue on its way as if nothing has
+ * happened. The copy should be independently delivered to the gateway.
+ */
+ skb = pskb_copy(skb, GFP_ATOMIC);
+ if (skb == NULL)
+ goto out;
+
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+ /* Avoid counting cloned packets towards the original connection. */
+ nf_reset_ct(skb);
+ nf_ct_set(skb, NULL, IP_CT_UNTRACKED);
+#endif
+ /*
+ * If we are in PREROUTING/INPUT, decrease the TTL to mitigate potential
+ * loops between two hosts.
+ *
+ * Set %IP_DF so that the original source is notified of a potentially
+ * decreased MTU on the clone route. IPv6 does this too.
+ *
+ * IP header checksum will be recalculated at ip_local_out.
+ */
+ iph = ip_hdr(skb);
+ iph->frag_off |= htons(IP_DF);
+ if (hooknum == NF_INET_PRE_ROUTING ||
+ hooknum == NF_INET_LOCAL_IN)
+ --iph->ttl;
+
+ if (nf_dup_ipv4_route(net, skb, gw, oif)) {
+ current->in_nf_duplicate = true;
+ ip_local_out(net, skb->sk, skb);
+ current->in_nf_duplicate = false;
+ } else {
+ kfree_skb(skb);
+ }
+out:
+ local_bh_enable();
+}
+EXPORT_SYMBOL_GPL(nf_dup_ipv4);
+
+MODULE_AUTHOR("Sebastian Claßen <sebastian.classen@freenet.ag>");
+MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>");
+MODULE_DESCRIPTION("nf_dup_ipv4: Duplicate IPv4 packet");
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c
new file mode 100644
index 000000000000..faee20af4856
--- /dev/null
+++ b/net/ipv4/netfilter/nf_nat_h323.c
@@ -0,0 +1,567 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * H.323 extension for NAT alteration.
+ *
+ * Copyright (c) 2006 Jing Min Zhao <zhaojingmin@users.sourceforge.net>
+ * Copyright (c) 2006-2012 Patrick McHardy <kaber@trash.net>
+ *
+ * Based on the 'brute force' H.323 NAT module by
+ * Jozsef Kadlecsik <kadlec@netfilter.org>
+ */
+
+#include <linux/module.h>
+#include <linux/tcp.h>
+#include <net/tcp.h>
+
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_helper.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <linux/netfilter/nf_conntrack_h323.h>
+
+/****************************************************************************/
+static int set_addr(struct sk_buff *skb, unsigned int protoff,
+ unsigned char **data, int dataoff,
+ unsigned int addroff, __be32 ip, __be16 port)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+ struct {
+ __be32 ip;
+ __be16 port;
+ } __attribute__ ((__packed__)) buf;
+ const struct tcphdr *th;
+ struct tcphdr _tcph;
+
+ buf.ip = ip;
+ buf.port = port;
+ addroff += dataoff;
+
+ if (ip_hdr(skb)->protocol == IPPROTO_TCP) {
+ if (!nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
+ protoff, addroff, sizeof(buf),
+ (char *) &buf, sizeof(buf))) {
+ net_notice_ratelimited("nf_nat_h323: nf_nat_mangle_tcp_packet error\n");
+ return -1;
+ }
+
+ /* Relocate data pointer */
+ th = skb_header_pointer(skb, ip_hdrlen(skb),
+ sizeof(_tcph), &_tcph);
+ if (th == NULL)
+ return -1;
+ *data = skb->data + ip_hdrlen(skb) + th->doff * 4 + dataoff;
+ } else {
+ if (!nf_nat_mangle_udp_packet(skb, ct, ctinfo,
+ protoff, addroff, sizeof(buf),
+ (char *) &buf, sizeof(buf))) {
+ net_notice_ratelimited("nf_nat_h323: nf_nat_mangle_udp_packet error\n");
+ return -1;
+ }
+ /* nf_nat_mangle_udp_packet uses skb_ensure_writable() to copy
+ * or pull everything in a linear buffer, so we can safely
+ * use the skb pointers now */
+ *data = skb->data + ip_hdrlen(skb) + sizeof(struct udphdr);
+ }
+
+ return 0;
+}
+
+/****************************************************************************/
+static int set_h225_addr(struct sk_buff *skb, unsigned int protoff,
+ unsigned char **data, int dataoff,
+ TransportAddress *taddr,
+ union nf_inet_addr *addr, __be16 port)
+{
+ return set_addr(skb, protoff, data, dataoff, taddr->ipAddress.ip,
+ addr->ip, port);
+}
+
+/****************************************************************************/
+static int set_h245_addr(struct sk_buff *skb, unsigned protoff,
+ unsigned char **data, int dataoff,
+ H245_TransportAddress *taddr,
+ union nf_inet_addr *addr, __be16 port)
+{
+ return set_addr(skb, protoff, data, dataoff,
+ taddr->unicastAddress.iPAddress.network,
+ addr->ip, port);
+}
+
+/****************************************************************************/
+static int set_sig_addr(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff, unsigned char **data,
+ TransportAddress *taddr, int count)
+{
+ const struct nf_ct_h323_master *info = nfct_help_data(ct);
+ int dir = CTINFO2DIR(ctinfo);
+ int i;
+ __be16 port;
+ union nf_inet_addr addr;
+
+ for (i = 0; i < count; i++) {
+ if (get_h225_addr(ct, *data, &taddr[i], &addr, &port)) {
+ if (addr.ip == ct->tuplehash[dir].tuple.src.u3.ip &&
+ port == info->sig_port[dir]) {
+ /* GW->GK */
+
+ /* Fix for Gnomemeeting */
+ if (i > 0 &&
+ get_h225_addr(ct, *data, &taddr[0],
+ &addr, &port) &&
+ (ntohl(addr.ip) & 0xff000000) == 0x7f000000)
+ i = 0;
+
+ pr_debug("nf_nat_ras: set signal address %pI4:%hu->%pI4:%hu\n",
+ &addr.ip, port,
+ &ct->tuplehash[!dir].tuple.dst.u3.ip,
+ info->sig_port[!dir]);
+ return set_h225_addr(skb, protoff, data, 0,
+ &taddr[i],
+ &ct->tuplehash[!dir].
+ tuple.dst.u3,
+ info->sig_port[!dir]);
+ } else if (addr.ip == ct->tuplehash[dir].tuple.dst.u3.ip &&
+ port == info->sig_port[dir]) {
+ /* GK->GW */
+ pr_debug("nf_nat_ras: set signal address %pI4:%hu->%pI4:%hu\n",
+ &addr.ip, port,
+ &ct->tuplehash[!dir].tuple.src.u3.ip,
+ info->sig_port[!dir]);
+ return set_h225_addr(skb, protoff, data, 0,
+ &taddr[i],
+ &ct->tuplehash[!dir].
+ tuple.src.u3,
+ info->sig_port[!dir]);
+ }
+ }
+ }
+
+ return 0;
+}
+
+/****************************************************************************/
+static int set_ras_addr(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff, unsigned char **data,
+ TransportAddress *taddr, int count)
+{
+ int dir = CTINFO2DIR(ctinfo);
+ int i;
+ __be16 port;
+ union nf_inet_addr addr;
+
+ for (i = 0; i < count; i++) {
+ if (get_h225_addr(ct, *data, &taddr[i], &addr, &port) &&
+ addr.ip == ct->tuplehash[dir].tuple.src.u3.ip &&
+ port == ct->tuplehash[dir].tuple.src.u.udp.port) {
+ pr_debug("nf_nat_ras: set rasAddress %pI4:%hu->%pI4:%hu\n",
+ &addr.ip, ntohs(port),
+ &ct->tuplehash[!dir].tuple.dst.u3.ip,
+ ntohs(ct->tuplehash[!dir].tuple.dst.u.udp.port));
+ return set_h225_addr(skb, protoff, data, 0, &taddr[i],
+ &ct->tuplehash[!dir].tuple.dst.u3,
+ ct->tuplehash[!dir].tuple.
+ dst.u.udp.port);
+ }
+ }
+
+ return 0;
+}
+
+/****************************************************************************/
+static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff, unsigned char **data, int dataoff,
+ H245_TransportAddress *taddr,
+ __be16 port, __be16 rtp_port,
+ struct nf_conntrack_expect *rtp_exp,
+ struct nf_conntrack_expect *rtcp_exp)
+{
+ struct nf_ct_h323_master *info = nfct_help_data(ct);
+ int dir = CTINFO2DIR(ctinfo);
+ int i;
+ u_int16_t nated_port;
+
+ /* Set expectations for NAT */
+ rtp_exp->saved_proto.udp.port = rtp_exp->tuple.dst.u.udp.port;
+ rtp_exp->expectfn = nf_nat_follow_master;
+ rtp_exp->dir = !dir;
+ rtcp_exp->saved_proto.udp.port = rtcp_exp->tuple.dst.u.udp.port;
+ rtcp_exp->expectfn = nf_nat_follow_master;
+ rtcp_exp->dir = !dir;
+
+ /* Lookup existing expects */
+ for (i = 0; i < H323_RTP_CHANNEL_MAX; i++) {
+ if (info->rtp_port[i][dir] == rtp_port) {
+ /* Expected */
+
+ /* Use allocated ports first. This will refresh
+ * the expects */
+ rtp_exp->tuple.dst.u.udp.port = info->rtp_port[i][dir];
+ rtcp_exp->tuple.dst.u.udp.port =
+ htons(ntohs(info->rtp_port[i][dir]) + 1);
+ break;
+ } else if (info->rtp_port[i][dir] == 0) {
+ /* Not expected */
+ break;
+ }
+ }
+
+ /* Run out of expectations */
+ if (i >= H323_RTP_CHANNEL_MAX) {
+ net_notice_ratelimited("nf_nat_h323: out of expectations\n");
+ return 0;
+ }
+
+ /* Try to get a pair of ports. */
+ for (nated_port = ntohs(rtp_exp->tuple.dst.u.udp.port);
+ nated_port != 0; nated_port += 2) {
+ int ret;
+
+ rtp_exp->tuple.dst.u.udp.port = htons(nated_port);
+ ret = nf_ct_expect_related(rtp_exp, 0);
+ if (ret == 0) {
+ rtcp_exp->tuple.dst.u.udp.port =
+ htons(nated_port + 1);
+ ret = nf_ct_expect_related(rtcp_exp, 0);
+ if (ret == 0)
+ break;
+ else if (ret == -EBUSY) {
+ nf_ct_unexpect_related(rtp_exp);
+ continue;
+ } else if (ret < 0) {
+ nf_ct_unexpect_related(rtp_exp);
+ nated_port = 0;
+ break;
+ }
+ } else if (ret != -EBUSY) {
+ nated_port = 0;
+ break;
+ }
+ }
+
+ if (nated_port == 0) { /* No port available */
+ net_notice_ratelimited("nf_nat_h323: out of RTP ports\n");
+ return 0;
+ }
+
+ /* Modify signal */
+ if (set_h245_addr(skb, protoff, data, dataoff, taddr,
+ &ct->tuplehash[!dir].tuple.dst.u3,
+ htons((port & htons(1)) ? nated_port + 1 :
+ nated_port))) {
+ nf_ct_unexpect_related(rtp_exp);
+ nf_ct_unexpect_related(rtcp_exp);
+ return -1;
+ }
+
+ /* Save ports */
+ info->rtp_port[i][dir] = rtp_port;
+ info->rtp_port[i][!dir] = htons(nated_port);
+
+ /* Success */
+ pr_debug("nf_nat_h323: expect RTP %pI4:%hu->%pI4:%hu\n",
+ &rtp_exp->tuple.src.u3.ip,
+ ntohs(rtp_exp->tuple.src.u.udp.port),
+ &rtp_exp->tuple.dst.u3.ip,
+ ntohs(rtp_exp->tuple.dst.u.udp.port));
+ pr_debug("nf_nat_h323: expect RTCP %pI4:%hu->%pI4:%hu\n",
+ &rtcp_exp->tuple.src.u3.ip,
+ ntohs(rtcp_exp->tuple.src.u.udp.port),
+ &rtcp_exp->tuple.dst.u3.ip,
+ ntohs(rtcp_exp->tuple.dst.u.udp.port));
+
+ return 0;
+}
+
+/****************************************************************************/
+static int nat_t120(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff, unsigned char **data, int dataoff,
+ H245_TransportAddress *taddr, __be16 port,
+ struct nf_conntrack_expect *exp)
+{
+ int dir = CTINFO2DIR(ctinfo);
+ u_int16_t nated_port = ntohs(port);
+
+ /* Set expectations for NAT */
+ exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
+ exp->expectfn = nf_nat_follow_master;
+ exp->dir = !dir;
+
+ nated_port = nf_nat_exp_find_port(exp, nated_port);
+ if (nated_port == 0) { /* No port available */
+ net_notice_ratelimited("nf_nat_h323: out of TCP ports\n");
+ return 0;
+ }
+
+ /* Modify signal */
+ if (set_h245_addr(skb, protoff, data, dataoff, taddr,
+ &ct->tuplehash[!dir].tuple.dst.u3,
+ htons(nated_port)) < 0) {
+ nf_ct_unexpect_related(exp);
+ return -1;
+ }
+
+ pr_debug("nf_nat_h323: expect T.120 %pI4:%hu->%pI4:%hu\n",
+ &exp->tuple.src.u3.ip,
+ ntohs(exp->tuple.src.u.tcp.port),
+ &exp->tuple.dst.u3.ip,
+ ntohs(exp->tuple.dst.u.tcp.port));
+
+ return 0;
+}
+
+/****************************************************************************/
+static int nat_h245(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff, unsigned char **data, int dataoff,
+ TransportAddress *taddr, __be16 port,
+ struct nf_conntrack_expect *exp)
+{
+ struct nf_ct_h323_master *info = nfct_help_data(ct);
+ int dir = CTINFO2DIR(ctinfo);
+ u_int16_t nated_port = ntohs(port);
+
+ /* Set expectations for NAT */
+ exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
+ exp->expectfn = nf_nat_follow_master;
+ exp->dir = !dir;
+
+ /* Check existing expects */
+ if (info->sig_port[dir] == port)
+ nated_port = ntohs(info->sig_port[!dir]);
+
+ nated_port = nf_nat_exp_find_port(exp, nated_port);
+ if (nated_port == 0) { /* No port available */
+ net_notice_ratelimited("nf_nat_q931: out of TCP ports\n");
+ return 0;
+ }
+
+ /* Modify signal */
+ if (set_h225_addr(skb, protoff, data, dataoff, taddr,
+ &ct->tuplehash[!dir].tuple.dst.u3,
+ htons(nated_port))) {
+ nf_ct_unexpect_related(exp);
+ return -1;
+ }
+
+ /* Save ports */
+ info->sig_port[dir] = port;
+ info->sig_port[!dir] = htons(nated_port);
+
+ pr_debug("nf_nat_q931: expect H.245 %pI4:%hu->%pI4:%hu\n",
+ &exp->tuple.src.u3.ip,
+ ntohs(exp->tuple.src.u.tcp.port),
+ &exp->tuple.dst.u3.ip,
+ ntohs(exp->tuple.dst.u.tcp.port));
+
+ return 0;
+}
+
+/****************************************************************************
+ * This conntrack expect function replaces nf_conntrack_q931_expect()
+ * which was set by nf_conntrack_h323.c.
+ ****************************************************************************/
+static void ip_nat_q931_expect(struct nf_conn *new,
+ struct nf_conntrack_expect *this)
+{
+ struct nf_nat_range2 range;
+
+ if (this->tuple.src.u3.ip != 0) { /* Only accept calls from GK */
+ nf_nat_follow_master(new, this);
+ return;
+ }
+
+ /* This must be a fresh one. */
+ BUG_ON(new->status & IPS_NAT_DONE_MASK);
+
+ /* Change src to where master sends to */
+ range.flags = NF_NAT_RANGE_MAP_IPS;
+ range.min_addr = range.max_addr =
+ new->tuplehash[!this->dir].tuple.src.u3;
+ nf_nat_setup_info(new, &range, NF_NAT_MANIP_SRC);
+
+ /* For DST manip, map port here to where it's expected. */
+ range.flags = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED);
+ range.min_proto = range.max_proto = this->saved_proto;
+ range.min_addr = range.max_addr =
+ new->master->tuplehash[!this->dir].tuple.src.u3;
+ nf_nat_setup_info(new, &range, NF_NAT_MANIP_DST);
+}
+
+/****************************************************************************/
+static int nat_q931(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff, unsigned char **data,
+ TransportAddress *taddr, int idx,
+ __be16 port, struct nf_conntrack_expect *exp)
+{
+ struct nf_ct_h323_master *info = nfct_help_data(ct);
+ int dir = CTINFO2DIR(ctinfo);
+ u_int16_t nated_port = ntohs(port);
+ union nf_inet_addr addr;
+
+ /* Set expectations for NAT */
+ exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
+ exp->expectfn = ip_nat_q931_expect;
+ exp->dir = !dir;
+
+ /* Check existing expects */
+ if (info->sig_port[dir] == port)
+ nated_port = ntohs(info->sig_port[!dir]);
+
+ nated_port = nf_nat_exp_find_port(exp, nated_port);
+ if (nated_port == 0) { /* No port available */
+ net_notice_ratelimited("nf_nat_ras: out of TCP ports\n");
+ return 0;
+ }
+
+ /* Modify signal */
+ if (set_h225_addr(skb, protoff, data, 0, &taddr[idx],
+ &ct->tuplehash[!dir].tuple.dst.u3,
+ htons(nated_port))) {
+ nf_ct_unexpect_related(exp);
+ return -1;
+ }
+
+ /* Save ports */
+ info->sig_port[dir] = port;
+ info->sig_port[!dir] = htons(nated_port);
+
+ /* Fix for Gnomemeeting */
+ if (idx > 0 &&
+ get_h225_addr(ct, *data, &taddr[0], &addr, &port) &&
+ (ntohl(addr.ip) & 0xff000000) == 0x7f000000) {
+ if (set_h225_addr(skb, protoff, data, 0, &taddr[0],
+ &ct->tuplehash[!dir].tuple.dst.u3,
+ info->sig_port[!dir])) {
+ nf_ct_unexpect_related(exp);
+ return -1;
+ }
+ }
+
+ /* Success */
+ pr_debug("nf_nat_ras: expect Q.931 %pI4:%hu->%pI4:%hu\n",
+ &exp->tuple.src.u3.ip,
+ ntohs(exp->tuple.src.u.tcp.port),
+ &exp->tuple.dst.u3.ip,
+ ntohs(exp->tuple.dst.u.tcp.port));
+
+ return 0;
+}
+
+/****************************************************************************/
+static void ip_nat_callforwarding_expect(struct nf_conn *new,
+ struct nf_conntrack_expect *this)
+{
+ struct nf_nat_range2 range;
+
+ /* This must be a fresh one. */
+ BUG_ON(new->status & IPS_NAT_DONE_MASK);
+
+ /* Change src to where master sends to */
+ range.flags = NF_NAT_RANGE_MAP_IPS;
+ range.min_addr = range.max_addr =
+ new->tuplehash[!this->dir].tuple.src.u3;
+ nf_nat_setup_info(new, &range, NF_NAT_MANIP_SRC);
+
+ /* For DST manip, map port here to where it's expected. */
+ range.flags = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED);
+ range.min_proto = range.max_proto = this->saved_proto;
+ range.min_addr = range.max_addr = this->saved_addr;
+ nf_nat_setup_info(new, &range, NF_NAT_MANIP_DST);
+}
+
+/****************************************************************************/
+static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
+ unsigned char **data, int dataoff,
+ TransportAddress *taddr, __be16 port,
+ struct nf_conntrack_expect *exp)
+{
+ int dir = CTINFO2DIR(ctinfo);
+ u_int16_t nated_port;
+
+ /* Set expectations for NAT */
+ exp->saved_addr = exp->tuple.dst.u3;
+ exp->tuple.dst.u3.ip = ct->tuplehash[!dir].tuple.dst.u3.ip;
+ exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
+ exp->expectfn = ip_nat_callforwarding_expect;
+ exp->dir = !dir;
+
+ nated_port = nf_nat_exp_find_port(exp, ntohs(port));
+ if (nated_port == 0) { /* No port available */
+ net_notice_ratelimited("nf_nat_q931: out of TCP ports\n");
+ return 0;
+ }
+
+ /* Modify signal */
+ if (set_h225_addr(skb, protoff, data, dataoff, taddr,
+ &ct->tuplehash[!dir].tuple.dst.u3,
+ htons(nated_port))) {
+ nf_ct_unexpect_related(exp);
+ return -1;
+ }
+
+ /* Success */
+ pr_debug("nf_nat_q931: expect Call Forwarding %pI4:%hu->%pI4:%hu\n",
+ &exp->tuple.src.u3.ip,
+ ntohs(exp->tuple.src.u.tcp.port),
+ &exp->tuple.dst.u3.ip,
+ ntohs(exp->tuple.dst.u.tcp.port));
+
+ return 0;
+}
+
+static struct nf_ct_helper_expectfn q931_nat = {
+ .name = "Q.931",
+ .expectfn = ip_nat_q931_expect,
+};
+
+static struct nf_ct_helper_expectfn callforwarding_nat = {
+ .name = "callforwarding",
+ .expectfn = ip_nat_callforwarding_expect,
+};
+
+static const struct nfct_h323_nat_hooks nathooks = {
+ .set_h245_addr = set_h245_addr,
+ .set_h225_addr = set_h225_addr,
+ .set_sig_addr = set_sig_addr,
+ .set_ras_addr = set_ras_addr,
+ .nat_rtp_rtcp = nat_rtp_rtcp,
+ .nat_t120 = nat_t120,
+ .nat_h245 = nat_h245,
+ .nat_callforwarding = nat_callforwarding,
+ .nat_q931 = nat_q931,
+};
+
+/****************************************************************************/
+static int __init nf_nat_h323_init(void)
+{
+ RCU_INIT_POINTER(nfct_h323_nat_hook, &nathooks);
+ nf_ct_helper_expectfn_register(&q931_nat);
+ nf_ct_helper_expectfn_register(&callforwarding_nat);
+ return 0;
+}
+
+/****************************************************************************/
+static void __exit nf_nat_h323_fini(void)
+{
+ RCU_INIT_POINTER(nfct_h323_nat_hook, NULL);
+ nf_ct_helper_expectfn_unregister(&q931_nat);
+ nf_ct_helper_expectfn_unregister(&callforwarding_nat);
+ synchronize_rcu();
+}
+
+/****************************************************************************/
+module_init(nf_nat_h323_init);
+module_exit(nf_nat_h323_fini);
+
+MODULE_AUTHOR("Jing Min Zhao <zhaojingmin@users.sourceforge.net>");
+MODULE_DESCRIPTION("H.323 NAT helper");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NF_NAT_HELPER("h323");
diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c
new file mode 100644
index 000000000000..fab357cc8559
--- /dev/null
+++ b/net/ipv4/netfilter/nf_nat_pptp.c
@@ -0,0 +1,320 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * nf_nat_pptp.c
+ *
+ * NAT support for PPTP (Point to Point Tunneling Protocol).
+ * PPTP is a protocol for creating virtual private networks.
+ * It is a specification defined by Microsoft and some vendors
+ * working with Microsoft. PPTP is built on top of a modified
+ * version of the Internet Generic Routing Encapsulation Protocol.
+ * GRE is defined in RFC 1701 and RFC 1702. Documentation of
+ * PPTP can be found in RFC 2637
+ *
+ * (C) 2000-2005 by Harald Welte <laforge@gnumonks.org>
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ *
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
+ *
+ * TODO: - NAT to a unique tuple, not to TCP source port
+ * (needs netfilter tuple reservation)
+ */
+
+#include <linux/module.h>
+#include <linux/tcp.h>
+
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_helper.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+#include <linux/netfilter/nf_conntrack_proto_gre.h>
+#include <linux/netfilter/nf_conntrack_pptp.h>
+
+#define NF_NAT_PPTP_VERSION "3.0"
+
+#define REQ_CID(req, off) (*(__be16 *)((char *)(req) + (off)))
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
+MODULE_DESCRIPTION("Netfilter NAT helper module for PPTP");
+MODULE_ALIAS_NF_NAT_HELPER("pptp");
+
+static void pptp_nat_expected(struct nf_conn *ct,
+ struct nf_conntrack_expect *exp)
+{
+ struct net *net = nf_ct_net(ct);
+ const struct nf_conn *master = ct->master;
+ struct nf_conntrack_expect *other_exp;
+ struct nf_conntrack_tuple t = {};
+ const struct nf_ct_pptp_master *ct_pptp_info;
+ const struct nf_nat_pptp *nat_pptp_info;
+ struct nf_nat_range2 range;
+ struct nf_conn_nat *nat;
+
+ nat = nf_ct_nat_ext_add(ct);
+ if (WARN_ON_ONCE(!nat))
+ return;
+
+ nat_pptp_info = &nat->help.nat_pptp_info;
+ ct_pptp_info = nfct_help_data(master);
+
+ /* And here goes the grand finale of corrosion... */
+ if (exp->dir == IP_CT_DIR_ORIGINAL) {
+ pr_debug("we are PNS->PAC\n");
+ /* therefore, build tuple for PAC->PNS */
+ t.src.l3num = AF_INET;
+ t.src.u3.ip = master->tuplehash[!exp->dir].tuple.src.u3.ip;
+ t.src.u.gre.key = ct_pptp_info->pac_call_id;
+ t.dst.u3.ip = master->tuplehash[!exp->dir].tuple.dst.u3.ip;
+ t.dst.u.gre.key = ct_pptp_info->pns_call_id;
+ t.dst.protonum = IPPROTO_GRE;
+ } else {
+ pr_debug("we are PAC->PNS\n");
+ /* build tuple for PNS->PAC */
+ t.src.l3num = AF_INET;
+ t.src.u3.ip = master->tuplehash[!exp->dir].tuple.src.u3.ip;
+ t.src.u.gre.key = nat_pptp_info->pns_call_id;
+ t.dst.u3.ip = master->tuplehash[!exp->dir].tuple.dst.u3.ip;
+ t.dst.u.gre.key = nat_pptp_info->pac_call_id;
+ t.dst.protonum = IPPROTO_GRE;
+ }
+
+ pr_debug("trying to unexpect other dir: ");
+ nf_ct_dump_tuple_ip(&t);
+ other_exp = nf_ct_expect_find_get(net, nf_ct_zone(ct), &t);
+ if (other_exp) {
+ nf_ct_unexpect_related(other_exp);
+ nf_ct_expect_put(other_exp);
+ pr_debug("success\n");
+ } else {
+ pr_debug("not found!\n");
+ }
+
+ /* This must be a fresh one. */
+ BUG_ON(ct->status & IPS_NAT_DONE_MASK);
+
+ /* Change src to where master sends to */
+ range.flags = NF_NAT_RANGE_MAP_IPS;
+ range.min_addr = range.max_addr
+ = ct->master->tuplehash[!exp->dir].tuple.dst.u3;
+ if (exp->dir == IP_CT_DIR_ORIGINAL) {
+ range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
+ range.min_proto = range.max_proto = exp->saved_proto;
+ }
+ nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC);
+
+ /* For DST manip, map port here to where it's expected. */
+ range.flags = NF_NAT_RANGE_MAP_IPS;
+ range.min_addr = range.max_addr
+ = ct->master->tuplehash[!exp->dir].tuple.src.u3;
+ if (exp->dir == IP_CT_DIR_REPLY) {
+ range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
+ range.min_proto = range.max_proto = exp->saved_proto;
+ }
+ nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST);
+}
+
+/* outbound packets == from PNS to PAC */
+static int
+pptp_outbound_pkt(struct sk_buff *skb,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
+ struct PptpControlHeader *ctlh,
+ union pptp_ctrl_union *pptpReq)
+
+{
+ struct nf_ct_pptp_master *ct_pptp_info;
+ struct nf_conn_nat *nat = nfct_nat(ct);
+ struct nf_nat_pptp *nat_pptp_info;
+ u_int16_t msg;
+ __be16 new_callid;
+ unsigned int cid_off;
+
+ if (WARN_ON_ONCE(!nat))
+ return NF_DROP;
+
+ nat_pptp_info = &nat->help.nat_pptp_info;
+ ct_pptp_info = nfct_help_data(ct);
+
+ new_callid = ct_pptp_info->pns_call_id;
+
+ switch (msg = ntohs(ctlh->messageType)) {
+ case PPTP_OUT_CALL_REQUEST:
+ cid_off = offsetof(union pptp_ctrl_union, ocreq.callID);
+ /* FIXME: ideally we would want to reserve a call ID
+ * here. current netfilter NAT core is not able to do
+ * this :( For now we use TCP source port. This breaks
+ * multiple calls within one control session */
+
+ /* save original call ID in nat_info */
+ nat_pptp_info->pns_call_id = ct_pptp_info->pns_call_id;
+
+ /* don't use tcph->source since we are at a DSTmanip
+ * hook (e.g. PREROUTING) and pkt is not mangled yet */
+ new_callid = ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u.tcp.port;
+
+ /* save new call ID in ct info */
+ ct_pptp_info->pns_call_id = new_callid;
+ break;
+ case PPTP_IN_CALL_REPLY:
+ cid_off = offsetof(union pptp_ctrl_union, icack.callID);
+ break;
+ case PPTP_CALL_CLEAR_REQUEST:
+ cid_off = offsetof(union pptp_ctrl_union, clrreq.callID);
+ break;
+ default:
+ pr_debug("unknown outbound packet 0x%04x:%s\n", msg,
+ pptp_msg_name(msg));
+ fallthrough;
+ case PPTP_SET_LINK_INFO:
+ /* only need to NAT in case PAC is behind NAT box */
+ case PPTP_START_SESSION_REQUEST:
+ case PPTP_START_SESSION_REPLY:
+ case PPTP_STOP_SESSION_REQUEST:
+ case PPTP_STOP_SESSION_REPLY:
+ case PPTP_ECHO_REQUEST:
+ case PPTP_ECHO_REPLY:
+ /* no need to alter packet */
+ return NF_ACCEPT;
+ }
+
+ /* only OUT_CALL_REQUEST, IN_CALL_REPLY, CALL_CLEAR_REQUEST pass
+ * down to here */
+ pr_debug("altering call id from 0x%04x to 0x%04x\n",
+ ntohs(REQ_CID(pptpReq, cid_off)), ntohs(new_callid));
+
+ /* mangle packet */
+ if (!nf_nat_mangle_tcp_packet(skb, ct, ctinfo, protoff,
+ cid_off + sizeof(struct pptp_pkt_hdr) +
+ sizeof(struct PptpControlHeader),
+ sizeof(new_callid), (char *)&new_callid,
+ sizeof(new_callid)))
+ return NF_DROP;
+ return NF_ACCEPT;
+}
+
+static void
+pptp_exp_gre(struct nf_conntrack_expect *expect_orig,
+ struct nf_conntrack_expect *expect_reply)
+{
+ const struct nf_conn *ct = expect_orig->master;
+ struct nf_conn_nat *nat = nfct_nat(ct);
+ struct nf_ct_pptp_master *ct_pptp_info;
+ struct nf_nat_pptp *nat_pptp_info;
+
+ if (WARN_ON_ONCE(!nat))
+ return;
+
+ nat_pptp_info = &nat->help.nat_pptp_info;
+ ct_pptp_info = nfct_help_data(ct);
+
+ /* save original PAC call ID in nat_info */
+ nat_pptp_info->pac_call_id = ct_pptp_info->pac_call_id;
+
+ /* alter expectation for PNS->PAC direction */
+ expect_orig->saved_proto.gre.key = ct_pptp_info->pns_call_id;
+ expect_orig->tuple.src.u.gre.key = nat_pptp_info->pns_call_id;
+ expect_orig->tuple.dst.u.gre.key = ct_pptp_info->pac_call_id;
+ expect_orig->dir = IP_CT_DIR_ORIGINAL;
+
+ /* alter expectation for PAC->PNS direction */
+ expect_reply->saved_proto.gre.key = nat_pptp_info->pns_call_id;
+ expect_reply->tuple.src.u.gre.key = nat_pptp_info->pac_call_id;
+ expect_reply->tuple.dst.u.gre.key = ct_pptp_info->pns_call_id;
+ expect_reply->dir = IP_CT_DIR_REPLY;
+}
+
+/* inbound packets == from PAC to PNS */
+static int
+pptp_inbound_pkt(struct sk_buff *skb,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
+ struct PptpControlHeader *ctlh,
+ union pptp_ctrl_union *pptpReq)
+{
+ const struct nf_nat_pptp *nat_pptp_info;
+ struct nf_conn_nat *nat = nfct_nat(ct);
+ u_int16_t msg;
+ __be16 new_pcid;
+ unsigned int pcid_off;
+
+ if (WARN_ON_ONCE(!nat))
+ return NF_DROP;
+
+ nat_pptp_info = &nat->help.nat_pptp_info;
+ new_pcid = nat_pptp_info->pns_call_id;
+
+ switch (msg = ntohs(ctlh->messageType)) {
+ case PPTP_OUT_CALL_REPLY:
+ pcid_off = offsetof(union pptp_ctrl_union, ocack.peersCallID);
+ break;
+ case PPTP_IN_CALL_CONNECT:
+ pcid_off = offsetof(union pptp_ctrl_union, iccon.peersCallID);
+ break;
+ case PPTP_IN_CALL_REQUEST:
+ /* only need to nat in case PAC is behind NAT box */
+ return NF_ACCEPT;
+ case PPTP_WAN_ERROR_NOTIFY:
+ pcid_off = offsetof(union pptp_ctrl_union, wanerr.peersCallID);
+ break;
+ case PPTP_CALL_DISCONNECT_NOTIFY:
+ pcid_off = offsetof(union pptp_ctrl_union, disc.callID);
+ break;
+ case PPTP_SET_LINK_INFO:
+ pcid_off = offsetof(union pptp_ctrl_union, setlink.peersCallID);
+ break;
+ default:
+ pr_debug("unknown inbound packet %s\n", pptp_msg_name(msg));
+ fallthrough;
+ case PPTP_START_SESSION_REQUEST:
+ case PPTP_START_SESSION_REPLY:
+ case PPTP_STOP_SESSION_REQUEST:
+ case PPTP_STOP_SESSION_REPLY:
+ case PPTP_ECHO_REQUEST:
+ case PPTP_ECHO_REPLY:
+ /* no need to alter packet */
+ return NF_ACCEPT;
+ }
+
+ /* only OUT_CALL_REPLY, IN_CALL_CONNECT, IN_CALL_REQUEST,
+ * WAN_ERROR_NOTIFY, CALL_DISCONNECT_NOTIFY pass down here */
+
+ /* mangle packet */
+ pr_debug("altering peer call id from 0x%04x to 0x%04x\n",
+ ntohs(REQ_CID(pptpReq, pcid_off)), ntohs(new_pcid));
+
+ if (!nf_nat_mangle_tcp_packet(skb, ct, ctinfo, protoff,
+ pcid_off + sizeof(struct pptp_pkt_hdr) +
+ sizeof(struct PptpControlHeader),
+ sizeof(new_pcid), (char *)&new_pcid,
+ sizeof(new_pcid)))
+ return NF_DROP;
+ return NF_ACCEPT;
+}
+
+static const struct nf_nat_pptp_hook pptp_hooks = {
+ .outbound = pptp_outbound_pkt,
+ .inbound = pptp_inbound_pkt,
+ .exp_gre = pptp_exp_gre,
+ .expectfn = pptp_nat_expected,
+};
+
+static int __init nf_nat_helper_pptp_init(void)
+{
+ WARN_ON(nf_nat_pptp_hook != NULL);
+ RCU_INIT_POINTER(nf_nat_pptp_hook, &pptp_hooks);
+
+ return 0;
+}
+
+static void __exit nf_nat_helper_pptp_fini(void)
+{
+ RCU_INIT_POINTER(nf_nat_pptp_hook, NULL);
+ synchronize_rcu();
+}
+
+module_init(nf_nat_helper_pptp_init);
+module_exit(nf_nat_helper_pptp_fini);
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.asn1 b/net/ipv4/netfilter/nf_nat_snmp_basic.asn1
new file mode 100644
index 000000000000..dc2cc5794160
--- /dev/null
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic.asn1
@@ -0,0 +1,185 @@
+-- SPDX-License-Identifier: BSD-3-Clause
+--
+-- Copyright (C) 1990, 2002 IETF Trust and the persons identified as authors
+-- of the code
+--
+-- https://www.rfc-editor.org/rfc/rfc1157#section-4
+-- https://www.rfc-editor.org/rfc/rfc3416#section-3
+
+Message ::=
+ SEQUENCE {
+ version
+ INTEGER ({snmp_version}),
+
+ community
+ OCTET STRING,
+
+ pdu
+ PDUs
+ }
+
+
+ObjectName ::=
+ OBJECT IDENTIFIER
+
+ObjectSyntax ::=
+ CHOICE {
+ simple
+ SimpleSyntax,
+
+ application-wide
+ ApplicationSyntax
+ }
+
+SimpleSyntax ::=
+ CHOICE {
+ integer-value
+ INTEGER,
+
+ string-value
+ OCTET STRING,
+
+ objectID-value
+ OBJECT IDENTIFIER
+ }
+
+ApplicationSyntax ::=
+ CHOICE {
+ ipAddress-value
+ IpAddress,
+
+ counter-value
+ Counter32,
+
+ timeticks-value
+ TimeTicks,
+
+ arbitrary-value
+ Opaque,
+
+ big-counter-value
+ Counter64,
+
+ unsigned-integer-value
+ Unsigned32
+ }
+
+IpAddress ::=
+ [APPLICATION 0]
+ IMPLICIT OCTET STRING OPTIONAL ({snmp_helper})
+
+Counter32 ::=
+ [APPLICATION 1]
+ IMPLICIT INTEGER OPTIONAL
+
+Unsigned32 ::=
+ [APPLICATION 2]
+ IMPLICIT INTEGER OPTIONAL
+
+Gauge32 ::= Unsigned32 OPTIONAL
+
+TimeTicks ::=
+ [APPLICATION 3]
+ IMPLICIT INTEGER OPTIONAL
+
+Opaque ::=
+ [APPLICATION 4]
+ IMPLICIT OCTET STRING OPTIONAL
+
+Counter64 ::=
+ [APPLICATION 6]
+ IMPLICIT INTEGER OPTIONAL
+
+PDUs ::=
+ CHOICE {
+ get-request
+ GetRequest-PDU,
+
+ get-next-request
+ GetNextRequest-PDU,
+
+ get-bulk-request
+ GetBulkRequest-PDU,
+
+ response
+ Response-PDU,
+
+ set-request
+ SetRequest-PDU,
+
+ inform-request
+ InformRequest-PDU,
+
+ snmpV2-trap
+ SNMPv2-Trap-PDU,
+
+ report
+ Report-PDU
+ }
+
+GetRequest-PDU ::=
+ [0] IMPLICIT PDU OPTIONAL
+
+GetNextRequest-PDU ::=
+ [1] IMPLICIT PDU OPTIONAL
+
+Response-PDU ::=
+ [2] IMPLICIT PDU OPTIONAL
+
+SetRequest-PDU ::=
+ [3] IMPLICIT PDU OPTIONAL
+
+-- [4] is obsolete
+
+GetBulkRequest-PDU ::=
+ [5] IMPLICIT PDU OPTIONAL
+
+InformRequest-PDU ::=
+ [6] IMPLICIT PDU OPTIONAL
+
+SNMPv2-Trap-PDU ::=
+ [7] IMPLICIT PDU OPTIONAL
+
+Report-PDU ::=
+ [8] IMPLICIT PDU OPTIONAL
+
+PDU ::=
+ SEQUENCE {
+ request-id
+ INTEGER,
+
+ error-status
+ INTEGER,
+
+ error-index
+ INTEGER,
+
+ variable-bindings
+ VarBindList
+ }
+
+
+VarBind ::=
+ SEQUENCE {
+ name
+ ObjectName,
+
+ CHOICE {
+ value
+ ObjectSyntax,
+
+ unSpecified
+ NULL,
+
+ noSuchObject
+ [0] IMPLICIT NULL,
+
+ noSuchInstance
+ [1] IMPLICIT NULL,
+
+ endOfMibView
+ [2] IMPLICIT NULL
+ }
+}
+
+VarBindList ::= SEQUENCE OF VarBind
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic_main.c b/net/ipv4/netfilter/nf_nat_snmp_basic_main.c
new file mode 100644
index 000000000000..717b726504fe
--- /dev/null
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic_main.c
@@ -0,0 +1,231 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * nf_nat_snmp_basic.c
+ *
+ * Basic SNMP Application Layer Gateway
+ *
+ * This IP NAT module is intended for use with SNMP network
+ * discovery and monitoring applications where target networks use
+ * conflicting private address realms.
+ *
+ * Static NAT is used to remap the networks from the view of the network
+ * management system at the IP layer, and this module remaps some application
+ * layer addresses to match.
+ *
+ * The simplest form of ALG is performed, where only tagged IP addresses
+ * are modified. The module does not need to be MIB aware and only scans
+ * messages at the ASN.1/BER level.
+ *
+ * Currently, only SNMPv1 and SNMPv2 are supported.
+ *
+ * More information on ALG and associated issues can be found in
+ * RFC 2962
+ *
+ * The ASB.1/BER parsing code is derived from the gxsnmp package by Gregory
+ * McLean & Jochen Friedrich, stripped down for use in the kernel.
+ *
+ * Copyright (c) 2000 RP Internet (www.rpi.net.au).
+ *
+ * Author: James Morris <jmorris@intercode.com.au>
+ *
+ * Copyright (c) 2006-2010 Patrick McHardy <kaber@trash.net>
+ */
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+#include <net/checksum.h>
+#include <net/udp.h>
+
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <linux/netfilter/nf_conntrack_snmp.h>
+#include "nf_nat_snmp_basic.asn1.h"
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
+MODULE_DESCRIPTION("Basic SNMP Application Layer Gateway");
+MODULE_ALIAS("ip_nat_snmp_basic");
+MODULE_ALIAS_NFCT_HELPER("snmp_trap");
+
+#define SNMP_PORT 161
+#define SNMP_TRAP_PORT 162
+
+static DEFINE_SPINLOCK(snmp_lock);
+
+struct snmp_ctx {
+ unsigned char *begin;
+ __sum16 *check;
+ __be32 from;
+ __be32 to;
+};
+
+static void fast_csum(struct snmp_ctx *ctx, unsigned char offset)
+{
+ unsigned char s[12] = {0,};
+ int size;
+
+ if (offset & 1) {
+ memcpy(&s[1], &ctx->from, 4);
+ memcpy(&s[7], &ctx->to, 4);
+ s[0] = ~0;
+ s[1] = ~s[1];
+ s[2] = ~s[2];
+ s[3] = ~s[3];
+ s[4] = ~s[4];
+ s[5] = ~0;
+ size = 12;
+ } else {
+ memcpy(&s[0], &ctx->from, 4);
+ memcpy(&s[4], &ctx->to, 4);
+ s[0] = ~s[0];
+ s[1] = ~s[1];
+ s[2] = ~s[2];
+ s[3] = ~s[3];
+ size = 8;
+ }
+ *ctx->check = csum_fold(csum_partial(s, size,
+ ~csum_unfold(*ctx->check)));
+}
+
+int snmp_version(void *context, size_t hdrlen, unsigned char tag,
+ const void *data, size_t datalen)
+{
+ if (datalen != 1)
+ return -EINVAL;
+ if (*(unsigned char *)data > 1)
+ return -ENOTSUPP;
+ return 1;
+}
+
+int snmp_helper(void *context, size_t hdrlen, unsigned char tag,
+ const void *data, size_t datalen)
+{
+ struct snmp_ctx *ctx = (struct snmp_ctx *)context;
+ __be32 *pdata;
+
+ if (datalen != 4)
+ return -EINVAL;
+ pdata = (__be32 *)data;
+ if (*pdata == ctx->from) {
+ pr_debug("%s: %pI4 to %pI4\n", __func__,
+ (void *)&ctx->from, (void *)&ctx->to);
+
+ if (*ctx->check)
+ fast_csum(ctx, (unsigned char *)data - ctx->begin);
+ *pdata = ctx->to;
+ }
+
+ return 1;
+}
+
+static int snmp_translate(struct nf_conn *ct, int dir, struct sk_buff *skb)
+{
+ struct iphdr *iph = ip_hdr(skb);
+ struct udphdr *udph = (struct udphdr *)((__be32 *)iph + iph->ihl);
+ u16 datalen = ntohs(udph->len) - sizeof(struct udphdr);
+ char *data = (unsigned char *)udph + sizeof(struct udphdr);
+ struct snmp_ctx ctx;
+ int ret;
+
+ if (dir == IP_CT_DIR_ORIGINAL) {
+ ctx.from = ct->tuplehash[dir].tuple.src.u3.ip;
+ ctx.to = ct->tuplehash[!dir].tuple.dst.u3.ip;
+ } else {
+ ctx.from = ct->tuplehash[!dir].tuple.src.u3.ip;
+ ctx.to = ct->tuplehash[dir].tuple.dst.u3.ip;
+ }
+
+ if (ctx.from == ctx.to)
+ return NF_ACCEPT;
+
+ ctx.begin = (unsigned char *)udph + sizeof(struct udphdr);
+ ctx.check = &udph->check;
+ ret = asn1_ber_decoder(&nf_nat_snmp_basic_decoder, &ctx, data, datalen);
+ if (ret < 0) {
+ nf_ct_helper_log(skb, ct, "parser failed\n");
+ return NF_DROP;
+ }
+
+ return NF_ACCEPT;
+}
+
+/* We don't actually set up expectations, just adjust internal IP
+ * addresses if this is being NATted
+ */
+static int help(struct sk_buff *skb, unsigned int protoff,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo)
+{
+ int dir = CTINFO2DIR(ctinfo);
+ unsigned int ret;
+ const struct iphdr *iph = ip_hdr(skb);
+ const struct udphdr *udph = (struct udphdr *)((__be32 *)iph + iph->ihl);
+
+ /* SNMP replies and originating SNMP traps get mangled */
+ if (udph->source == htons(SNMP_PORT) && dir != IP_CT_DIR_REPLY)
+ return NF_ACCEPT;
+ if (udph->dest == htons(SNMP_TRAP_PORT) && dir != IP_CT_DIR_ORIGINAL)
+ return NF_ACCEPT;
+
+ /* No NAT? */
+ if (!(ct->status & IPS_NAT_MASK))
+ return NF_ACCEPT;
+
+ /* Make sure the packet length is ok. So far, we were only guaranteed
+ * to have a valid length IP header plus 8 bytes, which means we have
+ * enough room for a UDP header. Just verify the UDP length field so we
+ * can mess around with the payload.
+ */
+ if (ntohs(udph->len) != skb->len - (iph->ihl << 2)) {
+ nf_ct_helper_log(skb, ct, "dropping malformed packet\n");
+ return NF_DROP;
+ }
+
+ if (skb_ensure_writable(skb, skb->len)) {
+ nf_ct_helper_log(skb, ct, "cannot mangle packet");
+ return NF_DROP;
+ }
+
+ spin_lock_bh(&snmp_lock);
+ ret = snmp_translate(ct, dir, skb);
+ spin_unlock_bh(&snmp_lock);
+ return ret;
+}
+
+static const struct nf_conntrack_expect_policy snmp_exp_policy = {
+ .max_expected = 0,
+ .timeout = 180,
+};
+
+static struct nf_conntrack_helper snmp_trap_helper __read_mostly = {
+ .me = THIS_MODULE,
+ .help = help,
+ .expect_policy = &snmp_exp_policy,
+ .name = "snmp_trap",
+ .tuple.src.l3num = AF_INET,
+ .tuple.src.u.udp.port = cpu_to_be16(SNMP_TRAP_PORT),
+ .tuple.dst.protonum = IPPROTO_UDP,
+};
+
+static int __init nf_nat_snmp_basic_init(void)
+{
+ BUG_ON(nf_nat_snmp_hook != NULL);
+ RCU_INIT_POINTER(nf_nat_snmp_hook, help);
+
+ return nf_conntrack_helper_register(&snmp_trap_helper);
+}
+
+static void __exit nf_nat_snmp_basic_fini(void)
+{
+ RCU_INIT_POINTER(nf_nat_snmp_hook, NULL);
+ synchronize_rcu();
+ nf_conntrack_helper_unregister(&snmp_trap_helper);
+}
+
+module_init(nf_nat_snmp_basic_init);
+module_exit(nf_nat_snmp_basic_fini);
diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c
new file mode 100644
index 000000000000..fae4aa4a5f09
--- /dev/null
+++ b/net/ipv4/netfilter/nf_reject_ipv4.c
@@ -0,0 +1,370 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ */
+
+#include <linux/module.h>
+#include <net/ip.h>
+#include <net/tcp.h>
+#include <net/route.h>
+#include <net/dst.h>
+#include <net/netfilter/ipv4/nf_reject.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_bridge.h>
+
+static struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb,
+ const struct sk_buff *oldskb,
+ __u8 protocol, int ttl);
+static void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb,
+ const struct tcphdr *oth);
+static const struct tcphdr *
+nf_reject_ip_tcphdr_get(struct sk_buff *oldskb,
+ struct tcphdr *_oth, int hook);
+
+static int nf_reject_iphdr_validate(struct sk_buff *skb)
+{
+ struct iphdr *iph;
+ u32 len;
+
+ if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+ return 0;
+
+ iph = ip_hdr(skb);
+ if (iph->ihl < 5 || iph->version != 4)
+ return 0;
+
+ len = ntohs(iph->tot_len);
+ if (skb->len < len)
+ return 0;
+ else if (len < (iph->ihl*4))
+ return 0;
+
+ if (!pskb_may_pull(skb, iph->ihl*4))
+ return 0;
+
+ return 1;
+}
+
+struct sk_buff *nf_reject_skb_v4_tcp_reset(struct net *net,
+ struct sk_buff *oldskb,
+ const struct net_device *dev,
+ int hook)
+{
+ const struct tcphdr *oth;
+ struct sk_buff *nskb;
+ struct iphdr *niph;
+ struct tcphdr _oth;
+
+ if (!nf_reject_iphdr_validate(oldskb))
+ return NULL;
+
+ oth = nf_reject_ip_tcphdr_get(oldskb, &_oth, hook);
+ if (!oth)
+ return NULL;
+
+ nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct tcphdr) +
+ LL_MAX_HEADER, GFP_ATOMIC);
+ if (!nskb)
+ return NULL;
+
+ nskb->dev = (struct net_device *)dev;
+
+ skb_reserve(nskb, LL_MAX_HEADER);
+ niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_TCP,
+ READ_ONCE(net->ipv4.sysctl_ip_default_ttl));
+ nf_reject_ip_tcphdr_put(nskb, oldskb, oth);
+ niph->tot_len = htons(nskb->len);
+ ip_send_check(niph);
+
+ return nskb;
+}
+EXPORT_SYMBOL_GPL(nf_reject_skb_v4_tcp_reset);
+
+static bool nf_skb_is_icmp_unreach(const struct sk_buff *skb)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+ u8 *tp, _type;
+ int thoff;
+
+ if (iph->protocol != IPPROTO_ICMP)
+ return false;
+
+ thoff = skb_network_offset(skb) + sizeof(*iph);
+
+ tp = skb_header_pointer(skb,
+ thoff + offsetof(struct icmphdr, type),
+ sizeof(_type), &_type);
+
+ if (!tp)
+ return false;
+
+ return *tp == ICMP_DEST_UNREACH;
+}
+
+struct sk_buff *nf_reject_skb_v4_unreach(struct net *net,
+ struct sk_buff *oldskb,
+ const struct net_device *dev,
+ int hook, u8 code)
+{
+ struct sk_buff *nskb;
+ struct iphdr *niph;
+ struct icmphdr *icmph;
+ unsigned int len;
+ int dataoff;
+ __wsum csum;
+ u8 proto;
+
+ if (!nf_reject_iphdr_validate(oldskb))
+ return NULL;
+
+ /* IP header checks: fragment. */
+ if (ip_hdr(oldskb)->frag_off & htons(IP_OFFSET))
+ return NULL;
+
+ /* don't reply to ICMP_DEST_UNREACH with ICMP_DEST_UNREACH. */
+ if (nf_skb_is_icmp_unreach(oldskb))
+ return NULL;
+
+ /* RFC says return as much as we can without exceeding 576 bytes. */
+ len = min_t(unsigned int, 536, oldskb->len);
+
+ if (!pskb_may_pull(oldskb, len))
+ return NULL;
+
+ if (pskb_trim_rcsum(oldskb, ntohs(ip_hdr(oldskb)->tot_len)))
+ return NULL;
+
+ dataoff = ip_hdrlen(oldskb);
+ proto = ip_hdr(oldskb)->protocol;
+
+ if (!skb_csum_unnecessary(oldskb) &&
+ nf_reject_verify_csum(oldskb, dataoff, proto) &&
+ nf_ip_checksum(oldskb, hook, ip_hdrlen(oldskb), proto))
+ return NULL;
+
+ nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct icmphdr) +
+ LL_MAX_HEADER + len, GFP_ATOMIC);
+ if (!nskb)
+ return NULL;
+
+ nskb->dev = (struct net_device *)dev;
+
+ skb_reserve(nskb, LL_MAX_HEADER);
+ niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_ICMP,
+ READ_ONCE(net->ipv4.sysctl_ip_default_ttl));
+
+ skb_reset_transport_header(nskb);
+ icmph = skb_put_zero(nskb, sizeof(struct icmphdr));
+ icmph->type = ICMP_DEST_UNREACH;
+ icmph->code = code;
+
+ skb_put_data(nskb, skb_network_header(oldskb), len);
+
+ csum = csum_partial((void *)icmph, len + sizeof(struct icmphdr), 0);
+ icmph->checksum = csum_fold(csum);
+
+ niph->tot_len = htons(nskb->len);
+ ip_send_check(niph);
+
+ return nskb;
+}
+EXPORT_SYMBOL_GPL(nf_reject_skb_v4_unreach);
+
+static const struct tcphdr *
+nf_reject_ip_tcphdr_get(struct sk_buff *oldskb,
+ struct tcphdr *_oth, int hook)
+{
+ const struct tcphdr *oth;
+
+ /* IP header checks: fragment. */
+ if (ip_hdr(oldskb)->frag_off & htons(IP_OFFSET))
+ return NULL;
+
+ if (ip_hdr(oldskb)->protocol != IPPROTO_TCP)
+ return NULL;
+
+ oth = skb_header_pointer(oldskb, ip_hdrlen(oldskb),
+ sizeof(struct tcphdr), _oth);
+ if (oth == NULL)
+ return NULL;
+
+ /* No RST for RST. */
+ if (oth->rst)
+ return NULL;
+
+ /* Check checksum */
+ if (nf_ip_checksum(oldskb, hook, ip_hdrlen(oldskb), IPPROTO_TCP))
+ return NULL;
+
+ return oth;
+}
+
+static struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb,
+ const struct sk_buff *oldskb,
+ __u8 protocol, int ttl)
+{
+ struct iphdr *niph, *oiph = ip_hdr(oldskb);
+
+ skb_reset_network_header(nskb);
+ niph = skb_put(nskb, sizeof(struct iphdr));
+ niph->version = 4;
+ niph->ihl = sizeof(struct iphdr) / 4;
+ niph->tos = 0;
+ niph->id = 0;
+ niph->frag_off = htons(IP_DF);
+ niph->protocol = protocol;
+ niph->check = 0;
+ niph->saddr = oiph->daddr;
+ niph->daddr = oiph->saddr;
+ niph->ttl = ttl;
+
+ nskb->protocol = htons(ETH_P_IP);
+
+ return niph;
+}
+
+static void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb,
+ const struct tcphdr *oth)
+{
+ struct iphdr *niph = ip_hdr(nskb);
+ struct tcphdr *tcph;
+
+ skb_reset_transport_header(nskb);
+ tcph = skb_put_zero(nskb, sizeof(struct tcphdr));
+ tcph->source = oth->dest;
+ tcph->dest = oth->source;
+ tcph->doff = sizeof(struct tcphdr) / 4;
+
+ if (oth->ack) {
+ tcph->seq = oth->ack_seq;
+ } else {
+ tcph->ack_seq = htonl(ntohl(oth->seq) + oth->syn + oth->fin +
+ oldskb->len - ip_hdrlen(oldskb) -
+ (oth->doff << 2));
+ tcph->ack = 1;
+ }
+
+ tcph->rst = 1;
+ tcph->check = ~tcp_v4_check(sizeof(struct tcphdr), niph->saddr,
+ niph->daddr, 0);
+ nskb->ip_summed = CHECKSUM_PARTIAL;
+ nskb->csum_start = (unsigned char *)tcph - nskb->head;
+ nskb->csum_offset = offsetof(struct tcphdr, check);
+}
+
+static int nf_reject_fill_skb_dst(struct sk_buff *skb_in)
+{
+ struct dst_entry *dst = NULL;
+ struct flowi fl;
+
+ memset(&fl, 0, sizeof(struct flowi));
+ fl.u.ip4.daddr = ip_hdr(skb_in)->saddr;
+ nf_ip_route(dev_net(skb_in->dev), &dst, &fl, false);
+ if (!dst)
+ return -1;
+
+ skb_dst_set(skb_in, dst);
+ return 0;
+}
+
+/* Send RST reply */
+void nf_send_reset(struct net *net, struct sock *sk, struct sk_buff *oldskb,
+ int hook)
+{
+ const struct tcphdr *oth;
+ struct sk_buff *nskb;
+ struct tcphdr _oth;
+
+ oth = nf_reject_ip_tcphdr_get(oldskb, &_oth, hook);
+ if (!oth)
+ return;
+
+ if (!skb_dst(oldskb) && nf_reject_fill_skb_dst(oldskb) < 0)
+ return;
+
+ if (skb_rtable(oldskb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
+ return;
+
+ nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct tcphdr) +
+ LL_MAX_HEADER, GFP_ATOMIC);
+ if (!nskb)
+ return;
+
+ /* ip_route_me_harder expects skb->dst to be set */
+ skb_dst_set_noref(nskb, skb_dst(oldskb));
+
+ nskb->mark = IP4_REPLY_MARK(net, oldskb->mark);
+
+ skb_reserve(nskb, LL_MAX_HEADER);
+ nf_reject_iphdr_put(nskb, oldskb, IPPROTO_TCP,
+ ip4_dst_hoplimit(skb_dst(nskb)));
+ nf_reject_ip_tcphdr_put(nskb, oldskb, oth);
+ if (ip_route_me_harder(net, sk, nskb, RTN_UNSPEC))
+ goto free_nskb;
+
+ /* "Never happens" */
+ if (nskb->len > dst_mtu(skb_dst(nskb)))
+ goto free_nskb;
+
+ nf_ct_attach(nskb, oldskb);
+ nf_ct_set_closing(skb_nfct(oldskb));
+
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+ /* If we use ip_local_out for bridged traffic, the MAC source on
+ * the RST will be ours, instead of the destination's. This confuses
+ * some routers/firewalls, and they drop the packet. So we need to
+ * build the eth header using the original destination's MAC as the
+ * source, and send the RST packet directly.
+ */
+ if (nf_bridge_info_exists(oldskb)) {
+ struct ethhdr *oeth = eth_hdr(oldskb);
+ struct iphdr *niph = ip_hdr(nskb);
+ struct net_device *br_indev;
+
+ br_indev = nf_bridge_get_physindev(oldskb, net);
+ if (!br_indev)
+ goto free_nskb;
+
+ nskb->dev = br_indev;
+ niph->tot_len = htons(nskb->len);
+ ip_send_check(niph);
+ if (dev_hard_header(nskb, nskb->dev, ntohs(nskb->protocol),
+ oeth->h_source, oeth->h_dest, nskb->len) < 0)
+ goto free_nskb;
+ dev_queue_xmit(nskb);
+ } else
+#endif
+ ip_local_out(net, nskb->sk, nskb);
+
+ return;
+
+ free_nskb:
+ kfree_skb(nskb);
+}
+EXPORT_SYMBOL_GPL(nf_send_reset);
+
+void nf_send_unreach(struct sk_buff *skb_in, int code, int hook)
+{
+ struct iphdr *iph = ip_hdr(skb_in);
+ int dataoff = ip_hdrlen(skb_in);
+ u8 proto = iph->protocol;
+
+ if (iph->frag_off & htons(IP_OFFSET))
+ return;
+
+ if (!skb_dst(skb_in) && nf_reject_fill_skb_dst(skb_in) < 0)
+ return;
+
+ if (skb_csum_unnecessary(skb_in) ||
+ !nf_reject_verify_csum(skb_in, dataoff, proto)) {
+ icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0);
+ return;
+ }
+
+ if (nf_ip_checksum(skb_in, hook, dataoff, proto) == 0)
+ icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0);
+}
+EXPORT_SYMBOL_GPL(nf_send_unreach);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("IPv4 packet rejection core");
diff --git a/net/ipv4/netfilter/nf_socket_ipv4.c b/net/ipv4/netfilter/nf_socket_ipv4.c
new file mode 100644
index 000000000000..5080fa5fbf6a
--- /dev/null
+++ b/net/ipv4/netfilter/nf_socket_ipv4.c
@@ -0,0 +1,152 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2007-2008 BalaBit IT Ltd.
+ * Author: Krisztian Kovacs
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/icmp.h>
+#include <net/sock.h>
+#include <net/inet_sock.h>
+#include <net/netfilter/nf_socket.h>
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+#include <net/netfilter/nf_conntrack.h>
+#endif
+
+static int
+extract_icmp4_fields(const struct sk_buff *skb, u8 *protocol,
+ __be32 *raddr, __be32 *laddr,
+ __be16 *rport, __be16 *lport)
+{
+ unsigned int outside_hdrlen = ip_hdrlen(skb);
+ struct iphdr *inside_iph, _inside_iph;
+ struct icmphdr *icmph, _icmph;
+ __be16 *ports, _ports[2];
+
+ icmph = skb_header_pointer(skb, outside_hdrlen,
+ sizeof(_icmph), &_icmph);
+ if (icmph == NULL)
+ return 1;
+
+ if (!icmp_is_err(icmph->type))
+ return 1;
+
+ inside_iph = skb_header_pointer(skb, outside_hdrlen +
+ sizeof(struct icmphdr),
+ sizeof(_inside_iph), &_inside_iph);
+ if (inside_iph == NULL)
+ return 1;
+
+ if (inside_iph->protocol != IPPROTO_TCP &&
+ inside_iph->protocol != IPPROTO_UDP)
+ return 1;
+
+ ports = skb_header_pointer(skb, outside_hdrlen +
+ sizeof(struct icmphdr) +
+ (inside_iph->ihl << 2),
+ sizeof(_ports), &_ports);
+ if (ports == NULL)
+ return 1;
+
+ /* the inside IP packet is the one quoted from our side, thus
+ * its saddr is the local address */
+ *protocol = inside_iph->protocol;
+ *laddr = inside_iph->saddr;
+ *lport = ports[0];
+ *raddr = inside_iph->daddr;
+ *rport = ports[1];
+
+ return 0;
+}
+
+static struct sock *
+nf_socket_get_sock_v4(struct net *net, struct sk_buff *skb, const int doff,
+ const u8 protocol,
+ const __be32 saddr, const __be32 daddr,
+ const __be16 sport, const __be16 dport,
+ const struct net_device *in)
+{
+ switch (protocol) {
+ case IPPROTO_TCP:
+ return inet_lookup(net, skb, doff, saddr, sport, daddr, dport,
+ in->ifindex);
+ case IPPROTO_UDP:
+ return udp4_lib_lookup(net, saddr, sport, daddr, dport,
+ in->ifindex);
+ }
+ return NULL;
+}
+
+struct sock *nf_sk_lookup_slow_v4(struct net *net, const struct sk_buff *skb,
+ const struct net_device *indev)
+{
+ __be32 daddr, saddr;
+ __be16 dport, sport;
+ const struct iphdr *iph = ip_hdr(skb);
+ struct sk_buff *data_skb = NULL;
+ u8 protocol;
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn const *ct;
+#endif
+ int doff = 0;
+
+ if (iph->protocol == IPPROTO_UDP || iph->protocol == IPPROTO_TCP) {
+ struct tcphdr _hdr;
+ struct udphdr *hp;
+
+ hp = skb_header_pointer(skb, ip_hdrlen(skb),
+ iph->protocol == IPPROTO_UDP ?
+ sizeof(*hp) : sizeof(_hdr), &_hdr);
+ if (hp == NULL)
+ return NULL;
+
+ protocol = iph->protocol;
+ saddr = iph->saddr;
+ sport = hp->source;
+ daddr = iph->daddr;
+ dport = hp->dest;
+ data_skb = (struct sk_buff *)skb;
+ doff = iph->protocol == IPPROTO_TCP ?
+ ip_hdrlen(skb) + __tcp_hdrlen((struct tcphdr *)hp) :
+ ip_hdrlen(skb) + sizeof(*hp);
+
+ } else if (iph->protocol == IPPROTO_ICMP) {
+ if (extract_icmp4_fields(skb, &protocol, &saddr, &daddr,
+ &sport, &dport))
+ return NULL;
+ } else {
+ return NULL;
+ }
+
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+ /* Do the lookup with the original socket address in
+ * case this is a reply packet of an established
+ * SNAT-ted connection.
+ */
+ ct = nf_ct_get(skb, &ctinfo);
+ if (ct &&
+ ((iph->protocol != IPPROTO_ICMP &&
+ ctinfo == IP_CT_ESTABLISHED_REPLY) ||
+ (iph->protocol == IPPROTO_ICMP &&
+ ctinfo == IP_CT_RELATED_REPLY)) &&
+ (ct->status & IPS_SRC_NAT_DONE)) {
+
+ daddr = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip;
+ dport = (iph->protocol == IPPROTO_TCP) ?
+ ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.tcp.port :
+ ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.udp.port;
+ }
+#endif
+
+ return nf_socket_get_sock_v4(net, data_skb, doff, protocol, saddr,
+ daddr, sport, dport, indev);
+}
+EXPORT_SYMBOL_GPL(nf_sk_lookup_slow_v4);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Krisztian Kovacs, Balazs Scheidler");
+MODULE_DESCRIPTION("Netfilter IPv4 socket lookup infrastructure");
diff --git a/net/ipv4/netfilter/nf_tproxy_ipv4.c b/net/ipv4/netfilter/nf_tproxy_ipv4.c
new file mode 100644
index 000000000000..041c3f37f237
--- /dev/null
+++ b/net/ipv4/netfilter/nf_tproxy_ipv4.c
@@ -0,0 +1,153 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2007-2008 BalaBit IT Ltd.
+ * Author: Krisztian Kovacs
+ */
+
+#include <net/netfilter/nf_tproxy.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/inet_sock.h>
+#include <linux/ip.h>
+#include <net/checksum.h>
+#include <net/udp.h>
+#include <net/tcp.h>
+#include <linux/inetdevice.h>
+
+struct sock *
+nf_tproxy_handle_time_wait4(struct net *net, struct sk_buff *skb,
+ __be32 laddr, __be16 lport, struct sock *sk)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+ struct tcphdr _hdr, *hp;
+
+ hp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_hdr), &_hdr);
+ if (hp == NULL) {
+ inet_twsk_put(inet_twsk(sk));
+ return NULL;
+ }
+
+ if (hp->syn && !hp->rst && !hp->ack && !hp->fin) {
+ /* SYN to a TIME_WAIT socket, we'd rather redirect it
+ * to a listener socket if there's one */
+ struct sock *sk2;
+
+ sk2 = nf_tproxy_get_sock_v4(net, skb, iph->protocol,
+ iph->saddr, laddr ? laddr : iph->daddr,
+ hp->source, lport ? lport : hp->dest,
+ skb->dev, NF_TPROXY_LOOKUP_LISTENER);
+ if (sk2) {
+ nf_tproxy_twsk_deschedule_put(inet_twsk(sk));
+ sk = sk2;
+ }
+ }
+
+ return sk;
+}
+EXPORT_SYMBOL_GPL(nf_tproxy_handle_time_wait4);
+
+__be32 nf_tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr)
+{
+ const struct in_ifaddr *ifa;
+ struct in_device *indev;
+ __be32 laddr;
+
+ if (user_laddr)
+ return user_laddr;
+
+ laddr = 0;
+ indev = __in_dev_get_rcu(skb->dev);
+ if (!indev)
+ return daddr;
+
+ in_dev_for_each_ifa_rcu(ifa, indev) {
+ if (ifa->ifa_flags & IFA_F_SECONDARY)
+ continue;
+
+ laddr = ifa->ifa_local;
+ break;
+ }
+
+ return laddr ? laddr : daddr;
+}
+EXPORT_SYMBOL_GPL(nf_tproxy_laddr4);
+
+struct sock *
+nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb,
+ const u8 protocol,
+ const __be32 saddr, const __be32 daddr,
+ const __be16 sport, const __be16 dport,
+ const struct net_device *in,
+ const enum nf_tproxy_lookup_t lookup_type)
+{
+ struct sock *sk;
+
+ switch (protocol) {
+ case IPPROTO_TCP: {
+ struct tcphdr _hdr, *hp;
+
+ hp = skb_header_pointer(skb, ip_hdrlen(skb),
+ sizeof(struct tcphdr), &_hdr);
+ if (hp == NULL)
+ return NULL;
+
+ switch (lookup_type) {
+ case NF_TPROXY_LOOKUP_LISTENER:
+ sk = inet_lookup_listener(net, skb,
+ ip_hdrlen(skb) + __tcp_hdrlen(hp),
+ saddr, sport, daddr, dport,
+ in->ifindex, 0);
+
+ if (sk && !refcount_inc_not_zero(&sk->sk_refcnt))
+ sk = NULL;
+ /* NOTE: we return listeners even if bound to
+ * 0.0.0.0, those are filtered out in
+ * xt_socket, since xt_TPROXY needs 0 bound
+ * listeners too
+ */
+ break;
+ case NF_TPROXY_LOOKUP_ESTABLISHED:
+ sk = inet_lookup_established(net, saddr, sport,
+ daddr, dport, in->ifindex);
+ break;
+ default:
+ BUG();
+ }
+ break;
+ }
+ case IPPROTO_UDP:
+ sk = udp4_lib_lookup(net, saddr, sport, daddr, dport,
+ in->ifindex);
+ if (sk) {
+ int connected = (sk->sk_state == TCP_ESTABLISHED);
+ int wildcard = (inet_sk(sk)->inet_rcv_saddr == 0);
+
+ /* NOTE: we return listeners even if bound to
+ * 0.0.0.0, those are filtered out in
+ * xt_socket, since xt_TPROXY needs 0 bound
+ * listeners too
+ */
+ if ((lookup_type == NF_TPROXY_LOOKUP_ESTABLISHED &&
+ (!connected || wildcard)) ||
+ (lookup_type == NF_TPROXY_LOOKUP_LISTENER && connected)) {
+ sock_put(sk);
+ sk = NULL;
+ }
+ }
+ break;
+ default:
+ WARN_ON(1);
+ sk = NULL;
+ }
+
+ pr_debug("tproxy socket lookup: proto %u %08x:%u -> %08x:%u, lookup type: %d, sock %p\n",
+ protocol, ntohl(saddr), ntohs(sport), ntohl(daddr), ntohs(dport), lookup_type, sk);
+
+ return sk;
+}
+EXPORT_SYMBOL_GPL(nf_tproxy_get_sock_v4);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Balazs Scheidler, Krisztian Kovacs");
+MODULE_DESCRIPTION("Netfilter IPv4 transparent proxy support");
diff --git a/net/ipv4/netfilter/nft_dup_ipv4.c b/net/ipv4/netfilter/nft_dup_ipv4.c
new file mode 100644
index 000000000000..ef5dd88107dd
--- /dev/null
+++ b/net/ipv4/netfilter/nft_dup_ipv4.c
@@ -0,0 +1,112 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2015 Pablo Neira Ayuso <pablo@netfilter.org>
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/ipv4/nf_dup_ipv4.h>
+
+struct nft_dup_ipv4 {
+ u8 sreg_addr;
+ u8 sreg_dev;
+};
+
+static void nft_dup_ipv4_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_dup_ipv4 *priv = nft_expr_priv(expr);
+ struct in_addr gw = {
+ .s_addr = (__force __be32)regs->data[priv->sreg_addr],
+ };
+ int oif = priv->sreg_dev ? regs->data[priv->sreg_dev] : -1;
+
+ nf_dup_ipv4(nft_net(pkt), pkt->skb, nft_hook(pkt), &gw, oif);
+}
+
+static int nft_dup_ipv4_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_dup_ipv4 *priv = nft_expr_priv(expr);
+ int err;
+
+ if (tb[NFTA_DUP_SREG_ADDR] == NULL)
+ return -EINVAL;
+
+ err = nft_parse_register_load(ctx, tb[NFTA_DUP_SREG_ADDR], &priv->sreg_addr,
+ sizeof(struct in_addr));
+ if (err < 0)
+ return err;
+
+ if (tb[NFTA_DUP_SREG_DEV])
+ err = nft_parse_register_load(ctx, tb[NFTA_DUP_SREG_DEV],
+ &priv->sreg_dev, sizeof(int));
+
+ return err;
+}
+
+static int nft_dup_ipv4_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
+{
+ struct nft_dup_ipv4 *priv = nft_expr_priv(expr);
+
+ if (nft_dump_register(skb, NFTA_DUP_SREG_ADDR, priv->sreg_addr))
+ goto nla_put_failure;
+ if (priv->sreg_dev &&
+ nft_dump_register(skb, NFTA_DUP_SREG_DEV, priv->sreg_dev))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static struct nft_expr_type nft_dup_ipv4_type;
+static const struct nft_expr_ops nft_dup_ipv4_ops = {
+ .type = &nft_dup_ipv4_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_dup_ipv4)),
+ .eval = nft_dup_ipv4_eval,
+ .init = nft_dup_ipv4_init,
+ .dump = nft_dup_ipv4_dump,
+ .reduce = NFT_REDUCE_READONLY,
+};
+
+static const struct nla_policy nft_dup_ipv4_policy[NFTA_DUP_MAX + 1] = {
+ [NFTA_DUP_SREG_ADDR] = { .type = NLA_U32 },
+ [NFTA_DUP_SREG_DEV] = { .type = NLA_U32 },
+};
+
+static struct nft_expr_type nft_dup_ipv4_type __read_mostly = {
+ .family = NFPROTO_IPV4,
+ .name = "dup",
+ .ops = &nft_dup_ipv4_ops,
+ .policy = nft_dup_ipv4_policy,
+ .maxattr = NFTA_DUP_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_dup_ipv4_module_init(void)
+{
+ return nft_register_expr(&nft_dup_ipv4_type);
+}
+
+static void __exit nft_dup_ipv4_module_exit(void)
+{
+ nft_unregister_expr(&nft_dup_ipv4_type);
+}
+
+module_init(nft_dup_ipv4_module_init);
+module_exit(nft_dup_ipv4_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "dup");
+MODULE_DESCRIPTION("IPv4 nftables packet duplication support");
diff --git a/net/ipv4/netfilter/nft_fib_ipv4.c b/net/ipv4/netfilter/nft_fib_ipv4.c
new file mode 100644
index 000000000000..82af6cd76d13
--- /dev/null
+++ b/net/ipv4/netfilter/nft_fib_ipv4.c
@@ -0,0 +1,226 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nft_fib.h>
+
+#include <net/flow.h>
+#include <net/ip.h>
+#include <net/ip_fib.h>
+#include <net/route.h>
+
+/* don't try to find route from mcast/bcast/zeronet */
+static __be32 get_saddr(__be32 addr)
+{
+ if (ipv4_is_multicast(addr) || ipv4_is_lbcast(addr) ||
+ ipv4_is_zeronet(addr))
+ return 0;
+ return addr;
+}
+
+void nft_fib4_eval_type(const struct nft_expr *expr, struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_fib *priv = nft_expr_priv(expr);
+ int noff = skb_network_offset(pkt->skb);
+ u32 *dst = &regs->data[priv->dreg];
+ const struct net_device *dev = NULL;
+ struct iphdr *iph, _iph;
+ __be32 addr;
+
+ if (priv->flags & NFTA_FIB_F_IIF)
+ dev = nft_in(pkt);
+ else if (priv->flags & NFTA_FIB_F_OIF)
+ dev = nft_out(pkt);
+
+ iph = skb_header_pointer(pkt->skb, noff, sizeof(_iph), &_iph);
+ if (!iph) {
+ regs->verdict.code = NFT_BREAK;
+ return;
+ }
+
+ if (priv->flags & NFTA_FIB_F_DADDR)
+ addr = iph->daddr;
+ else
+ addr = iph->saddr;
+
+ if (priv->flags & (NFTA_FIB_F_IIF | NFTA_FIB_F_OIF)) {
+ *dst = inet_dev_addr_type(nft_net(pkt), dev, addr);
+ return;
+ }
+
+ *dst = inet_addr_type_dev_table(nft_net(pkt), pkt->skb->dev, addr);
+}
+EXPORT_SYMBOL_GPL(nft_fib4_eval_type);
+
+void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_fib *priv = nft_expr_priv(expr);
+ int noff = skb_network_offset(pkt->skb);
+ u32 *dest = &regs->data[priv->dreg];
+ struct iphdr *iph, _iph;
+ struct fib_result res;
+ struct flowi4 fl4 = {
+ .flowi4_scope = RT_SCOPE_UNIVERSE,
+ .flowi4_iif = LOOPBACK_IFINDEX,
+ .flowi4_proto = pkt->tprot,
+ .flowi4_uid = sock_net_uid(nft_net(pkt), NULL),
+ };
+ const struct net_device *oif;
+ const struct net_device *found;
+
+ if (nft_fib_can_skip(pkt)) {
+ nft_fib_store_result(dest, priv, nft_in(pkt));
+ return;
+ }
+
+ /*
+ * Do not set flowi4_oif, it restricts results (for example, asking
+ * for oif 3 will get RTN_UNICAST result even if the daddr exits
+ * on another interface.
+ *
+ * Search results for the desired outinterface instead.
+ */
+ if (priv->flags & NFTA_FIB_F_OIF)
+ oif = nft_out(pkt);
+ else if (priv->flags & NFTA_FIB_F_IIF)
+ oif = nft_in(pkt);
+ else
+ oif = NULL;
+
+ fl4.flowi4_l3mdev = nft_fib_l3mdev_master_ifindex_rcu(pkt, oif);
+
+ iph = skb_header_pointer(pkt->skb, noff, sizeof(_iph), &_iph);
+ if (!iph) {
+ regs->verdict.code = NFT_BREAK;
+ return;
+ }
+
+ if (ipv4_is_zeronet(iph->saddr)) {
+ if (ipv4_is_lbcast(iph->daddr) ||
+ ipv4_is_local_multicast(iph->daddr)) {
+ nft_fib_store_result(dest, priv, pkt->skb->dev);
+ return;
+ }
+ }
+
+ if (priv->flags & NFTA_FIB_F_MARK)
+ fl4.flowi4_mark = pkt->skb->mark;
+
+ fl4.flowi4_dscp = ip4h_dscp(iph);
+
+ if (priv->flags & NFTA_FIB_F_DADDR) {
+ fl4.daddr = iph->daddr;
+ fl4.saddr = get_saddr(iph->saddr);
+ } else {
+ if (nft_hook(pkt) == NF_INET_FORWARD &&
+ priv->flags & NFTA_FIB_F_IIF)
+ fl4.flowi4_iif = nft_out(pkt)->ifindex;
+
+ fl4.daddr = iph->saddr;
+ fl4.saddr = get_saddr(iph->daddr);
+ }
+
+ *dest = 0;
+
+ if (fib_lookup(nft_net(pkt), &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE))
+ return;
+
+ switch (res.type) {
+ case RTN_UNICAST:
+ break;
+ case RTN_LOCAL: /* Should not see RTN_LOCAL here */
+ return;
+ default:
+ break;
+ }
+
+ if (!oif) {
+ found = FIB_RES_DEV(res);
+ } else {
+ if (!fib_info_nh_uses_dev(res.fi, oif))
+ return;
+ found = oif;
+ }
+
+ nft_fib_store_result(dest, priv, found);
+}
+EXPORT_SYMBOL_GPL(nft_fib4_eval);
+
+static struct nft_expr_type nft_fib4_type;
+
+static const struct nft_expr_ops nft_fib4_type_ops = {
+ .type = &nft_fib4_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_fib)),
+ .eval = nft_fib4_eval_type,
+ .init = nft_fib_init,
+ .dump = nft_fib_dump,
+ .validate = nft_fib_validate,
+ .reduce = nft_fib_reduce,
+};
+
+static const struct nft_expr_ops nft_fib4_ops = {
+ .type = &nft_fib4_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_fib)),
+ .eval = nft_fib4_eval,
+ .init = nft_fib_init,
+ .dump = nft_fib_dump,
+ .validate = nft_fib_validate,
+ .reduce = nft_fib_reduce,
+};
+
+static const struct nft_expr_ops *
+nft_fib4_select_ops(const struct nft_ctx *ctx,
+ const struct nlattr * const tb[])
+{
+ enum nft_fib_result result;
+
+ if (!tb[NFTA_FIB_RESULT])
+ return ERR_PTR(-EINVAL);
+
+ result = ntohl(nla_get_be32(tb[NFTA_FIB_RESULT]));
+
+ switch (result) {
+ case NFT_FIB_RESULT_OIF:
+ return &nft_fib4_ops;
+ case NFT_FIB_RESULT_OIFNAME:
+ return &nft_fib4_ops;
+ case NFT_FIB_RESULT_ADDRTYPE:
+ return &nft_fib4_type_ops;
+ default:
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+}
+
+static struct nft_expr_type nft_fib4_type __read_mostly = {
+ .name = "fib",
+ .select_ops = nft_fib4_select_ops,
+ .policy = nft_fib_policy,
+ .maxattr = NFTA_FIB_MAX,
+ .family = NFPROTO_IPV4,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_fib4_module_init(void)
+{
+ return nft_register_expr(&nft_fib4_type);
+}
+
+static void __exit nft_fib4_module_exit(void)
+{
+ nft_unregister_expr(&nft_fib4_type);
+}
+
+module_init(nft_fib4_module_init);
+module_exit(nft_fib4_module_exit);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
+MODULE_ALIAS_NFT_AF_EXPR(2, "fib");
+MODULE_DESCRIPTION("nftables fib / ip route lookup support");
diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c
new file mode 100644
index 000000000000..6cb213bb7256
--- /dev/null
+++ b/net/ipv4/netfilter/nft_reject_ipv4.c
@@ -0,0 +1,76 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2013 Eric Leblond <eric@regit.org>
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/ipv4/nf_reject.h>
+#include <net/netfilter/nft_reject.h>
+
+static void nft_reject_ipv4_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_reject *priv = nft_expr_priv(expr);
+
+ switch (priv->type) {
+ case NFT_REJECT_ICMP_UNREACH:
+ nf_send_unreach(pkt->skb, priv->icmp_code, nft_hook(pkt));
+ break;
+ case NFT_REJECT_TCP_RST:
+ nf_send_reset(nft_net(pkt), nft_sk(pkt), pkt->skb,
+ nft_hook(pkt));
+ break;
+ default:
+ break;
+ }
+
+ regs->verdict.code = NF_DROP;
+}
+
+static struct nft_expr_type nft_reject_ipv4_type;
+static const struct nft_expr_ops nft_reject_ipv4_ops = {
+ .type = &nft_reject_ipv4_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_reject)),
+ .eval = nft_reject_ipv4_eval,
+ .init = nft_reject_init,
+ .dump = nft_reject_dump,
+ .validate = nft_reject_validate,
+ .reduce = NFT_REDUCE_READONLY,
+};
+
+static struct nft_expr_type nft_reject_ipv4_type __read_mostly = {
+ .family = NFPROTO_IPV4,
+ .name = "reject",
+ .ops = &nft_reject_ipv4_ops,
+ .policy = nft_reject_policy,
+ .maxattr = NFTA_REJECT_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_reject_ipv4_module_init(void)
+{
+ return nft_register_expr(&nft_reject_ipv4_type);
+}
+
+static void __exit nft_reject_ipv4_module_exit(void)
+{
+ nft_unregister_expr(&nft_reject_ipv4_type);
+}
+
+module_init(nft_reject_ipv4_module_init);
+module_exit(nft_reject_ipv4_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "reject");
+MODULE_DESCRIPTION("IPv4 packet rejection for nftables");
diff --git a/net/ipv4/netlink.c b/net/ipv4/netlink.c
new file mode 100644
index 000000000000..b920e1bdcf58
--- /dev/null
+++ b/net/ipv4/netlink.c
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+#include <linux/types.h>
+#include <net/net_namespace.h>
+#include <net/netlink.h>
+#include <linux/in6.h>
+#include <net/ip.h>
+
+int rtm_getroute_parse_ip_proto(struct nlattr *attr, u8 *ip_proto, u8 family,
+ struct netlink_ext_ack *extack)
+{
+ *ip_proto = nla_get_u8(attr);
+
+ switch (*ip_proto) {
+ case IPPROTO_TCP:
+ case IPPROTO_UDP:
+ return 0;
+ case IPPROTO_ICMP:
+ if (family != AF_INET)
+ break;
+ return 0;
+#if IS_ENABLED(CONFIG_IPV6)
+ case IPPROTO_ICMPV6:
+ if (family != AF_INET6)
+ break;
+ return 0;
+#endif
+ }
+ NL_SET_ERR_MSG(extack, "Unsupported ip proto");
+ return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL_GPL(rtm_getroute_parse_ip_proto);
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
new file mode 100644
index 000000000000..7b9d70f9b31c
--- /dev/null
+++ b/net/ipv4/nexthop.c
@@ -0,0 +1,4157 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Generic nexthop implementation
+ *
+ * Copyright (c) 2017-19 Cumulus Networks
+ * Copyright (c) 2017-19 David Ahern <dsa@cumulusnetworks.com>
+ */
+
+#include <linux/nexthop.h>
+#include <linux/rtnetlink.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <net/arp.h>
+#include <net/ipv6_stubs.h>
+#include <net/lwtunnel.h>
+#include <net/ndisc.h>
+#include <net/nexthop.h>
+#include <net/route.h>
+#include <net/sock.h>
+
+#define NH_RES_DEFAULT_IDLE_TIMER (120 * HZ)
+#define NH_RES_DEFAULT_UNBALANCED_TIMER 0 /* No forced rebalancing. */
+
+static void remove_nexthop(struct net *net, struct nexthop *nh,
+ struct nl_info *nlinfo);
+
+#define NH_DEV_HASHBITS 8
+#define NH_DEV_HASHSIZE (1U << NH_DEV_HASHBITS)
+
+#define NHA_OP_FLAGS_DUMP_ALL (NHA_OP_FLAG_DUMP_STATS | \
+ NHA_OP_FLAG_DUMP_HW_STATS)
+
+static const struct nla_policy rtm_nh_policy_new[] = {
+ [NHA_ID] = { .type = NLA_U32 },
+ [NHA_GROUP] = { .type = NLA_BINARY },
+ [NHA_GROUP_TYPE] = { .type = NLA_U16 },
+ [NHA_BLACKHOLE] = { .type = NLA_FLAG },
+ [NHA_OIF] = { .type = NLA_U32 },
+ [NHA_GATEWAY] = { .type = NLA_BINARY },
+ [NHA_ENCAP_TYPE] = { .type = NLA_U16 },
+ [NHA_ENCAP] = { .type = NLA_NESTED },
+ [NHA_FDB] = { .type = NLA_FLAG },
+ [NHA_RES_GROUP] = { .type = NLA_NESTED },
+ [NHA_HW_STATS_ENABLE] = NLA_POLICY_MAX(NLA_U32, true),
+};
+
+static const struct nla_policy rtm_nh_policy_get[] = {
+ [NHA_ID] = { .type = NLA_U32 },
+ [NHA_OP_FLAGS] = NLA_POLICY_MASK(NLA_U32,
+ NHA_OP_FLAGS_DUMP_ALL),
+};
+
+static const struct nla_policy rtm_nh_policy_del[] = {
+ [NHA_ID] = { .type = NLA_U32 },
+};
+
+static const struct nla_policy rtm_nh_policy_dump[] = {
+ [NHA_OIF] = { .type = NLA_U32 },
+ [NHA_GROUPS] = { .type = NLA_FLAG },
+ [NHA_MASTER] = { .type = NLA_U32 },
+ [NHA_FDB] = { .type = NLA_FLAG },
+ [NHA_OP_FLAGS] = NLA_POLICY_MASK(NLA_U32,
+ NHA_OP_FLAGS_DUMP_ALL),
+};
+
+static const struct nla_policy rtm_nh_res_policy_new[] = {
+ [NHA_RES_GROUP_BUCKETS] = { .type = NLA_U16 },
+ [NHA_RES_GROUP_IDLE_TIMER] = { .type = NLA_U32 },
+ [NHA_RES_GROUP_UNBALANCED_TIMER] = { .type = NLA_U32 },
+};
+
+static const struct nla_policy rtm_nh_policy_dump_bucket[] = {
+ [NHA_ID] = { .type = NLA_U32 },
+ [NHA_OIF] = { .type = NLA_U32 },
+ [NHA_MASTER] = { .type = NLA_U32 },
+ [NHA_RES_BUCKET] = { .type = NLA_NESTED },
+};
+
+static const struct nla_policy rtm_nh_res_bucket_policy_dump[] = {
+ [NHA_RES_BUCKET_NH_ID] = { .type = NLA_U32 },
+};
+
+static const struct nla_policy rtm_nh_policy_get_bucket[] = {
+ [NHA_ID] = { .type = NLA_U32 },
+ [NHA_RES_BUCKET] = { .type = NLA_NESTED },
+};
+
+static const struct nla_policy rtm_nh_res_bucket_policy_get[] = {
+ [NHA_RES_BUCKET_INDEX] = { .type = NLA_U16 },
+};
+
+static bool nexthop_notifiers_is_empty(struct net *net)
+{
+ return !net->nexthop.notifier_chain.head;
+}
+
+static void
+__nh_notifier_single_info_init(struct nh_notifier_single_info *nh_info,
+ const struct nh_info *nhi)
+{
+ nh_info->dev = nhi->fib_nhc.nhc_dev;
+ nh_info->gw_family = nhi->fib_nhc.nhc_gw_family;
+ if (nh_info->gw_family == AF_INET)
+ nh_info->ipv4 = nhi->fib_nhc.nhc_gw.ipv4;
+ else if (nh_info->gw_family == AF_INET6)
+ nh_info->ipv6 = nhi->fib_nhc.nhc_gw.ipv6;
+
+ nh_info->id = nhi->nh_parent->id;
+ nh_info->is_reject = nhi->reject_nh;
+ nh_info->is_fdb = nhi->fdb_nh;
+ nh_info->has_encap = !!nhi->fib_nhc.nhc_lwtstate;
+}
+
+static int nh_notifier_single_info_init(struct nh_notifier_info *info,
+ const struct nexthop *nh)
+{
+ struct nh_info *nhi = rtnl_dereference(nh->nh_info);
+
+ info->type = NH_NOTIFIER_INFO_TYPE_SINGLE;
+ info->nh = kzalloc(sizeof(*info->nh), GFP_KERNEL);
+ if (!info->nh)
+ return -ENOMEM;
+
+ __nh_notifier_single_info_init(info->nh, nhi);
+
+ return 0;
+}
+
+static void nh_notifier_single_info_fini(struct nh_notifier_info *info)
+{
+ kfree(info->nh);
+}
+
+static int nh_notifier_mpath_info_init(struct nh_notifier_info *info,
+ struct nh_group *nhg)
+{
+ u16 num_nh = nhg->num_nh;
+ int i;
+
+ info->type = NH_NOTIFIER_INFO_TYPE_GRP;
+ info->nh_grp = kzalloc(struct_size(info->nh_grp, nh_entries, num_nh),
+ GFP_KERNEL);
+ if (!info->nh_grp)
+ return -ENOMEM;
+
+ info->nh_grp->num_nh = num_nh;
+ info->nh_grp->is_fdb = nhg->fdb_nh;
+ info->nh_grp->hw_stats = nhg->hw_stats;
+
+ for (i = 0; i < num_nh; i++) {
+ struct nh_grp_entry *nhge = &nhg->nh_entries[i];
+ struct nh_info *nhi;
+
+ nhi = rtnl_dereference(nhge->nh->nh_info);
+ info->nh_grp->nh_entries[i].weight = nhge->weight;
+ __nh_notifier_single_info_init(&info->nh_grp->nh_entries[i].nh,
+ nhi);
+ }
+
+ return 0;
+}
+
+static int nh_notifier_res_table_info_init(struct nh_notifier_info *info,
+ struct nh_group *nhg)
+{
+ struct nh_res_table *res_table = rtnl_dereference(nhg->res_table);
+ u16 num_nh_buckets = res_table->num_nh_buckets;
+ unsigned long size;
+ u16 i;
+
+ info->type = NH_NOTIFIER_INFO_TYPE_RES_TABLE;
+ size = struct_size(info->nh_res_table, nhs, num_nh_buckets);
+ info->nh_res_table = __vmalloc(size, GFP_KERNEL | __GFP_ZERO |
+ __GFP_NOWARN);
+ if (!info->nh_res_table)
+ return -ENOMEM;
+
+ info->nh_res_table->num_nh_buckets = num_nh_buckets;
+ info->nh_res_table->hw_stats = nhg->hw_stats;
+
+ for (i = 0; i < num_nh_buckets; i++) {
+ struct nh_res_bucket *bucket = &res_table->nh_buckets[i];
+ struct nh_grp_entry *nhge;
+ struct nh_info *nhi;
+
+ nhge = rtnl_dereference(bucket->nh_entry);
+ nhi = rtnl_dereference(nhge->nh->nh_info);
+ __nh_notifier_single_info_init(&info->nh_res_table->nhs[i],
+ nhi);
+ }
+
+ return 0;
+}
+
+static int nh_notifier_grp_info_init(struct nh_notifier_info *info,
+ const struct nexthop *nh)
+{
+ struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
+
+ if (nhg->hash_threshold)
+ return nh_notifier_mpath_info_init(info, nhg);
+ else if (nhg->resilient)
+ return nh_notifier_res_table_info_init(info, nhg);
+ return -EINVAL;
+}
+
+static void nh_notifier_grp_info_fini(struct nh_notifier_info *info,
+ const struct nexthop *nh)
+{
+ struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
+
+ if (nhg->hash_threshold)
+ kfree(info->nh_grp);
+ else if (nhg->resilient)
+ vfree(info->nh_res_table);
+}
+
+static int nh_notifier_info_init(struct nh_notifier_info *info,
+ const struct nexthop *nh)
+{
+ info->id = nh->id;
+
+ if (nh->is_group)
+ return nh_notifier_grp_info_init(info, nh);
+ else
+ return nh_notifier_single_info_init(info, nh);
+}
+
+static void nh_notifier_info_fini(struct nh_notifier_info *info,
+ const struct nexthop *nh)
+{
+ if (nh->is_group)
+ nh_notifier_grp_info_fini(info, nh);
+ else
+ nh_notifier_single_info_fini(info);
+}
+
+static int call_nexthop_notifiers(struct net *net,
+ enum nexthop_event_type event_type,
+ struct nexthop *nh,
+ struct netlink_ext_ack *extack)
+{
+ struct nh_notifier_info info = {
+ .net = net,
+ .extack = extack,
+ };
+ int err;
+
+ ASSERT_RTNL();
+
+ if (nexthop_notifiers_is_empty(net))
+ return 0;
+
+ err = nh_notifier_info_init(&info, nh);
+ if (err) {
+ NL_SET_ERR_MSG(extack, "Failed to initialize nexthop notifier info");
+ return err;
+ }
+
+ err = blocking_notifier_call_chain(&net->nexthop.notifier_chain,
+ event_type, &info);
+ nh_notifier_info_fini(&info, nh);
+
+ return notifier_to_errno(err);
+}
+
+static int
+nh_notifier_res_bucket_idle_timer_get(const struct nh_notifier_info *info,
+ bool force, unsigned int *p_idle_timer_ms)
+{
+ struct nh_res_table *res_table;
+ struct nh_group *nhg;
+ struct nexthop *nh;
+ int err = 0;
+
+ /* When 'force' is false, nexthop bucket replacement is performed
+ * because the bucket was deemed to be idle. In this case, capable
+ * listeners can choose to perform an atomic replacement: The bucket is
+ * only replaced if it is inactive. However, if the idle timer interval
+ * is smaller than the interval in which a listener is querying
+ * buckets' activity from the device, then atomic replacement should
+ * not be tried. Pass the idle timer value to listeners, so that they
+ * could determine which type of replacement to perform.
+ */
+ if (force) {
+ *p_idle_timer_ms = 0;
+ return 0;
+ }
+
+ rcu_read_lock();
+
+ nh = nexthop_find_by_id(info->net, info->id);
+ if (!nh) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ nhg = rcu_dereference(nh->nh_grp);
+ res_table = rcu_dereference(nhg->res_table);
+ *p_idle_timer_ms = jiffies_to_msecs(res_table->idle_timer);
+
+out:
+ rcu_read_unlock();
+
+ return err;
+}
+
+static int nh_notifier_res_bucket_info_init(struct nh_notifier_info *info,
+ u16 bucket_index, bool force,
+ struct nh_info *oldi,
+ struct nh_info *newi)
+{
+ unsigned int idle_timer_ms;
+ int err;
+
+ err = nh_notifier_res_bucket_idle_timer_get(info, force,
+ &idle_timer_ms);
+ if (err)
+ return err;
+
+ info->type = NH_NOTIFIER_INFO_TYPE_RES_BUCKET;
+ info->nh_res_bucket = kzalloc(sizeof(*info->nh_res_bucket),
+ GFP_KERNEL);
+ if (!info->nh_res_bucket)
+ return -ENOMEM;
+
+ info->nh_res_bucket->bucket_index = bucket_index;
+ info->nh_res_bucket->idle_timer_ms = idle_timer_ms;
+ info->nh_res_bucket->force = force;
+ __nh_notifier_single_info_init(&info->nh_res_bucket->old_nh, oldi);
+ __nh_notifier_single_info_init(&info->nh_res_bucket->new_nh, newi);
+ return 0;
+}
+
+static void nh_notifier_res_bucket_info_fini(struct nh_notifier_info *info)
+{
+ kfree(info->nh_res_bucket);
+}
+
+static int __call_nexthop_res_bucket_notifiers(struct net *net, u32 nhg_id,
+ u16 bucket_index, bool force,
+ struct nh_info *oldi,
+ struct nh_info *newi,
+ struct netlink_ext_ack *extack)
+{
+ struct nh_notifier_info info = {
+ .net = net,
+ .extack = extack,
+ .id = nhg_id,
+ };
+ int err;
+
+ if (nexthop_notifiers_is_empty(net))
+ return 0;
+
+ err = nh_notifier_res_bucket_info_init(&info, bucket_index, force,
+ oldi, newi);
+ if (err)
+ return err;
+
+ err = blocking_notifier_call_chain(&net->nexthop.notifier_chain,
+ NEXTHOP_EVENT_BUCKET_REPLACE, &info);
+ nh_notifier_res_bucket_info_fini(&info);
+
+ return notifier_to_errno(err);
+}
+
+/* There are three users of RES_TABLE, and NHs etc. referenced from there:
+ *
+ * 1) a collection of callbacks for NH maintenance. This operates under
+ * RTNL,
+ * 2) the delayed work that gradually balances the resilient table,
+ * 3) and nexthop_select_path(), operating under RCU.
+ *
+ * Both the delayed work and the RTNL block are writers, and need to
+ * maintain mutual exclusion. Since there are only two and well-known
+ * writers for each table, the RTNL code can make sure it has exclusive
+ * access thus:
+ *
+ * - Have the DW operate without locking;
+ * - synchronously cancel the DW;
+ * - do the writing;
+ * - if the write was not actually a delete, call upkeep, which schedules
+ * DW again if necessary.
+ *
+ * The functions that are always called from the RTNL context use
+ * rtnl_dereference(). The functions that can also be called from the DW do
+ * a raw dereference and rely on the above mutual exclusion scheme.
+ */
+#define nh_res_dereference(p) (rcu_dereference_raw(p))
+
+static int call_nexthop_res_bucket_notifiers(struct net *net, u32 nhg_id,
+ u16 bucket_index, bool force,
+ struct nexthop *old_nh,
+ struct nexthop *new_nh,
+ struct netlink_ext_ack *extack)
+{
+ struct nh_info *oldi = nh_res_dereference(old_nh->nh_info);
+ struct nh_info *newi = nh_res_dereference(new_nh->nh_info);
+
+ return __call_nexthop_res_bucket_notifiers(net, nhg_id, bucket_index,
+ force, oldi, newi, extack);
+}
+
+static int call_nexthop_res_table_notifiers(struct net *net, struct nexthop *nh,
+ struct netlink_ext_ack *extack)
+{
+ struct nh_notifier_info info = {
+ .net = net,
+ .extack = extack,
+ .id = nh->id,
+ };
+ struct nh_group *nhg;
+ int err;
+
+ ASSERT_RTNL();
+
+ if (nexthop_notifiers_is_empty(net))
+ return 0;
+
+ /* At this point, the nexthop buckets are still not populated. Only
+ * emit a notification with the logical nexthops, so that a listener
+ * could potentially veto it in case of unsupported configuration.
+ */
+ nhg = rtnl_dereference(nh->nh_grp);
+ err = nh_notifier_mpath_info_init(&info, nhg);
+ if (err) {
+ NL_SET_ERR_MSG(extack, "Failed to initialize nexthop notifier info");
+ return err;
+ }
+
+ err = blocking_notifier_call_chain(&net->nexthop.notifier_chain,
+ NEXTHOP_EVENT_RES_TABLE_PRE_REPLACE,
+ &info);
+ kfree(info.nh_grp);
+
+ return notifier_to_errno(err);
+}
+
+static int call_nexthop_notifier(struct notifier_block *nb, struct net *net,
+ enum nexthop_event_type event_type,
+ struct nexthop *nh,
+ struct netlink_ext_ack *extack)
+{
+ struct nh_notifier_info info = {
+ .net = net,
+ .extack = extack,
+ };
+ int err;
+
+ err = nh_notifier_info_init(&info, nh);
+ if (err)
+ return err;
+
+ err = nb->notifier_call(nb, event_type, &info);
+ nh_notifier_info_fini(&info, nh);
+
+ return notifier_to_errno(err);
+}
+
+static unsigned int nh_dev_hashfn(unsigned int val)
+{
+ unsigned int mask = NH_DEV_HASHSIZE - 1;
+
+ return (val ^
+ (val >> NH_DEV_HASHBITS) ^
+ (val >> (NH_DEV_HASHBITS * 2))) & mask;
+}
+
+static void nexthop_devhash_add(struct net *net, struct nh_info *nhi)
+{
+ struct net_device *dev = nhi->fib_nhc.nhc_dev;
+ struct hlist_head *head;
+ unsigned int hash;
+
+ WARN_ON(!dev);
+
+ hash = nh_dev_hashfn(dev->ifindex);
+ head = &net->nexthop.devhash[hash];
+ hlist_add_head(&nhi->dev_hash, head);
+}
+
+static void nexthop_free_group(struct nexthop *nh)
+{
+ struct nh_group *nhg;
+ int i;
+
+ nhg = rcu_dereference_raw(nh->nh_grp);
+ for (i = 0; i < nhg->num_nh; ++i) {
+ struct nh_grp_entry *nhge = &nhg->nh_entries[i];
+
+ WARN_ON(!list_empty(&nhge->nh_list));
+ free_percpu(nhge->stats);
+ nexthop_put(nhge->nh);
+ }
+
+ WARN_ON(nhg->spare == nhg);
+
+ if (nhg->resilient)
+ vfree(rcu_dereference_raw(nhg->res_table));
+
+ kfree(nhg->spare);
+ kfree(nhg);
+}
+
+static void nexthop_free_single(struct nexthop *nh)
+{
+ struct nh_info *nhi;
+
+ nhi = rcu_dereference_raw(nh->nh_info);
+ switch (nhi->family) {
+ case AF_INET:
+ fib_nh_release(nh->net, &nhi->fib_nh);
+ break;
+ case AF_INET6:
+ ipv6_stub->fib6_nh_release(&nhi->fib6_nh);
+ break;
+ }
+ kfree(nhi);
+}
+
+void nexthop_free_rcu(struct rcu_head *head)
+{
+ struct nexthop *nh = container_of(head, struct nexthop, rcu);
+
+ if (nh->is_group)
+ nexthop_free_group(nh);
+ else
+ nexthop_free_single(nh);
+
+ kfree(nh);
+}
+EXPORT_SYMBOL_GPL(nexthop_free_rcu);
+
+static struct nexthop *nexthop_alloc(void)
+{
+ struct nexthop *nh;
+
+ nh = kzalloc(sizeof(struct nexthop), GFP_KERNEL);
+ if (nh) {
+ INIT_LIST_HEAD(&nh->fi_list);
+ INIT_LIST_HEAD(&nh->f6i_list);
+ INIT_LIST_HEAD(&nh->grp_list);
+ INIT_LIST_HEAD(&nh->fdb_list);
+ spin_lock_init(&nh->lock);
+ }
+ return nh;
+}
+
+static struct nh_group *nexthop_grp_alloc(u16 num_nh)
+{
+ struct nh_group *nhg;
+
+ nhg = kzalloc(struct_size(nhg, nh_entries, num_nh), GFP_KERNEL);
+ if (nhg)
+ nhg->num_nh = num_nh;
+
+ return nhg;
+}
+
+static void nh_res_table_upkeep_dw(struct work_struct *work);
+
+static struct nh_res_table *
+nexthop_res_table_alloc(struct net *net, u32 nhg_id, struct nh_config *cfg)
+{
+ const u16 num_nh_buckets = cfg->nh_grp_res_num_buckets;
+ struct nh_res_table *res_table;
+ unsigned long size;
+
+ size = struct_size(res_table, nh_buckets, num_nh_buckets);
+ res_table = __vmalloc(size, GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN);
+ if (!res_table)
+ return NULL;
+
+ res_table->net = net;
+ res_table->nhg_id = nhg_id;
+ INIT_DELAYED_WORK(&res_table->upkeep_dw, &nh_res_table_upkeep_dw);
+ INIT_LIST_HEAD(&res_table->uw_nh_entries);
+ res_table->idle_timer = cfg->nh_grp_res_idle_timer;
+ res_table->unbalanced_timer = cfg->nh_grp_res_unbalanced_timer;
+ res_table->num_nh_buckets = num_nh_buckets;
+ return res_table;
+}
+
+static void nh_base_seq_inc(struct net *net)
+{
+ while (++net->nexthop.seq == 0)
+ ;
+}
+
+/* no reference taken; rcu lock or rtnl must be held */
+struct nexthop *nexthop_find_by_id(struct net *net, u32 id)
+{
+ struct rb_node **pp, *parent = NULL, *next;
+
+ pp = &net->nexthop.rb_root.rb_node;
+ while (1) {
+ struct nexthop *nh;
+
+ next = rcu_dereference_raw(*pp);
+ if (!next)
+ break;
+ parent = next;
+
+ nh = rb_entry(parent, struct nexthop, rb_node);
+ if (id < nh->id)
+ pp = &next->rb_left;
+ else if (id > nh->id)
+ pp = &next->rb_right;
+ else
+ return nh;
+ }
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(nexthop_find_by_id);
+
+/* used for auto id allocation; called with rtnl held */
+static u32 nh_find_unused_id(struct net *net)
+{
+ u32 id_start = net->nexthop.last_id_allocated;
+
+ while (1) {
+ net->nexthop.last_id_allocated++;
+ if (net->nexthop.last_id_allocated == id_start)
+ break;
+
+ if (!nexthop_find_by_id(net, net->nexthop.last_id_allocated))
+ return net->nexthop.last_id_allocated;
+ }
+ return 0;
+}
+
+static void nh_res_time_set_deadline(unsigned long next_time,
+ unsigned long *deadline)
+{
+ if (time_before(next_time, *deadline))
+ *deadline = next_time;
+}
+
+static clock_t nh_res_table_unbalanced_time(struct nh_res_table *res_table)
+{
+ if (list_empty(&res_table->uw_nh_entries))
+ return 0;
+ return jiffies_delta_to_clock_t(jiffies - res_table->unbalanced_since);
+}
+
+static int nla_put_nh_group_res(struct sk_buff *skb, struct nh_group *nhg)
+{
+ struct nh_res_table *res_table = rtnl_dereference(nhg->res_table);
+ struct nlattr *nest;
+
+ nest = nla_nest_start(skb, NHA_RES_GROUP);
+ if (!nest)
+ return -EMSGSIZE;
+
+ if (nla_put_u16(skb, NHA_RES_GROUP_BUCKETS,
+ res_table->num_nh_buckets) ||
+ nla_put_u32(skb, NHA_RES_GROUP_IDLE_TIMER,
+ jiffies_to_clock_t(res_table->idle_timer)) ||
+ nla_put_u32(skb, NHA_RES_GROUP_UNBALANCED_TIMER,
+ jiffies_to_clock_t(res_table->unbalanced_timer)) ||
+ nla_put_u64_64bit(skb, NHA_RES_GROUP_UNBALANCED_TIME,
+ nh_res_table_unbalanced_time(res_table),
+ NHA_RES_GROUP_PAD))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+}
+
+static void nh_grp_entry_stats_inc(struct nh_grp_entry *nhge)
+{
+ struct nh_grp_entry_stats *cpu_stats;
+
+ cpu_stats = get_cpu_ptr(nhge->stats);
+ u64_stats_update_begin(&cpu_stats->syncp);
+ u64_stats_inc(&cpu_stats->packets);
+ u64_stats_update_end(&cpu_stats->syncp);
+ put_cpu_ptr(cpu_stats);
+}
+
+static void nh_grp_entry_stats_read(struct nh_grp_entry *nhge,
+ u64 *ret_packets)
+{
+ int i;
+
+ *ret_packets = 0;
+
+ for_each_possible_cpu(i) {
+ struct nh_grp_entry_stats *cpu_stats;
+ unsigned int start;
+ u64 packets;
+
+ cpu_stats = per_cpu_ptr(nhge->stats, i);
+ do {
+ start = u64_stats_fetch_begin(&cpu_stats->syncp);
+ packets = u64_stats_read(&cpu_stats->packets);
+ } while (u64_stats_fetch_retry(&cpu_stats->syncp, start));
+
+ *ret_packets += packets;
+ }
+}
+
+static int nh_notifier_grp_hw_stats_init(struct nh_notifier_info *info,
+ const struct nexthop *nh)
+{
+ struct nh_group *nhg;
+ int i;
+
+ ASSERT_RTNL();
+ nhg = rtnl_dereference(nh->nh_grp);
+
+ info->id = nh->id;
+ info->type = NH_NOTIFIER_INFO_TYPE_GRP_HW_STATS;
+ info->nh_grp_hw_stats = kzalloc(struct_size(info->nh_grp_hw_stats,
+ stats, nhg->num_nh),
+ GFP_KERNEL);
+ if (!info->nh_grp_hw_stats)
+ return -ENOMEM;
+
+ info->nh_grp_hw_stats->num_nh = nhg->num_nh;
+ for (i = 0; i < nhg->num_nh; i++) {
+ struct nh_grp_entry *nhge = &nhg->nh_entries[i];
+
+ info->nh_grp_hw_stats->stats[i].id = nhge->nh->id;
+ }
+
+ return 0;
+}
+
+static void nh_notifier_grp_hw_stats_fini(struct nh_notifier_info *info)
+{
+ kfree(info->nh_grp_hw_stats);
+}
+
+void nh_grp_hw_stats_report_delta(struct nh_notifier_grp_hw_stats_info *info,
+ unsigned int nh_idx,
+ u64 delta_packets)
+{
+ info->hw_stats_used = true;
+ info->stats[nh_idx].packets += delta_packets;
+}
+EXPORT_SYMBOL(nh_grp_hw_stats_report_delta);
+
+static void nh_grp_hw_stats_apply_update(struct nexthop *nh,
+ struct nh_notifier_info *info)
+{
+ struct nh_group *nhg;
+ int i;
+
+ ASSERT_RTNL();
+ nhg = rtnl_dereference(nh->nh_grp);
+
+ for (i = 0; i < nhg->num_nh; i++) {
+ struct nh_grp_entry *nhge = &nhg->nh_entries[i];
+
+ nhge->packets_hw += info->nh_grp_hw_stats->stats[i].packets;
+ }
+}
+
+static int nh_grp_hw_stats_update(struct nexthop *nh, bool *hw_stats_used)
+{
+ struct nh_notifier_info info = {
+ .net = nh->net,
+ };
+ struct net *net = nh->net;
+ int err;
+
+ if (nexthop_notifiers_is_empty(net)) {
+ *hw_stats_used = false;
+ return 0;
+ }
+
+ err = nh_notifier_grp_hw_stats_init(&info, nh);
+ if (err)
+ return err;
+
+ err = blocking_notifier_call_chain(&net->nexthop.notifier_chain,
+ NEXTHOP_EVENT_HW_STATS_REPORT_DELTA,
+ &info);
+
+ /* Cache whatever we got, even if there was an error, otherwise the
+ * successful stats retrievals would get lost.
+ */
+ nh_grp_hw_stats_apply_update(nh, &info);
+ *hw_stats_used = info.nh_grp_hw_stats->hw_stats_used;
+
+ nh_notifier_grp_hw_stats_fini(&info);
+ return notifier_to_errno(err);
+}
+
+static int nla_put_nh_group_stats_entry(struct sk_buff *skb,
+ struct nh_grp_entry *nhge,
+ u32 op_flags)
+{
+ struct nlattr *nest;
+ u64 packets;
+
+ nh_grp_entry_stats_read(nhge, &packets);
+
+ nest = nla_nest_start(skb, NHA_GROUP_STATS_ENTRY);
+ if (!nest)
+ return -EMSGSIZE;
+
+ if (nla_put_u32(skb, NHA_GROUP_STATS_ENTRY_ID, nhge->nh->id) ||
+ nla_put_uint(skb, NHA_GROUP_STATS_ENTRY_PACKETS,
+ packets + nhge->packets_hw))
+ goto nla_put_failure;
+
+ if (op_flags & NHA_OP_FLAG_DUMP_HW_STATS &&
+ nla_put_uint(skb, NHA_GROUP_STATS_ENTRY_PACKETS_HW,
+ nhge->packets_hw))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+}
+
+static int nla_put_nh_group_stats(struct sk_buff *skb, struct nexthop *nh,
+ u32 op_flags)
+{
+ struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
+ struct nlattr *nest;
+ bool hw_stats_used;
+ int err;
+ int i;
+
+ if (nla_put_u32(skb, NHA_HW_STATS_ENABLE, nhg->hw_stats))
+ goto err_out;
+
+ if (op_flags & NHA_OP_FLAG_DUMP_HW_STATS &&
+ nhg->hw_stats) {
+ err = nh_grp_hw_stats_update(nh, &hw_stats_used);
+ if (err)
+ goto out;
+
+ if (nla_put_u32(skb, NHA_HW_STATS_USED, hw_stats_used))
+ goto err_out;
+ }
+
+ nest = nla_nest_start(skb, NHA_GROUP_STATS);
+ if (!nest)
+ goto err_out;
+
+ for (i = 0; i < nhg->num_nh; i++)
+ if (nla_put_nh_group_stats_entry(skb, &nhg->nh_entries[i],
+ op_flags))
+ goto cancel_out;
+
+ nla_nest_end(skb, nest);
+ return 0;
+
+cancel_out:
+ nla_nest_cancel(skb, nest);
+err_out:
+ err = -EMSGSIZE;
+out:
+ return err;
+}
+
+static int nla_put_nh_group(struct sk_buff *skb, struct nexthop *nh,
+ u32 op_flags, u32 *resp_op_flags)
+{
+ struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
+ struct nexthop_grp *p;
+ size_t len = nhg->num_nh * sizeof(*p);
+ struct nlattr *nla;
+ u16 group_type = 0;
+ u16 weight;
+ int i;
+
+ *resp_op_flags |= NHA_OP_FLAG_RESP_GRP_RESVD_0;
+
+ if (nhg->hash_threshold)
+ group_type = NEXTHOP_GRP_TYPE_MPATH;
+ else if (nhg->resilient)
+ group_type = NEXTHOP_GRP_TYPE_RES;
+
+ if (nla_put_u16(skb, NHA_GROUP_TYPE, group_type))
+ goto nla_put_failure;
+
+ nla = nla_reserve(skb, NHA_GROUP, len);
+ if (!nla)
+ goto nla_put_failure;
+
+ p = nla_data(nla);
+ for (i = 0; i < nhg->num_nh; ++i) {
+ weight = nhg->nh_entries[i].weight - 1;
+
+ *p++ = (struct nexthop_grp) {
+ .id = nhg->nh_entries[i].nh->id,
+ .weight = weight,
+ .weight_high = weight >> 8,
+ };
+ }
+
+ if (nhg->resilient && nla_put_nh_group_res(skb, nhg))
+ goto nla_put_failure;
+
+ if (op_flags & NHA_OP_FLAG_DUMP_STATS &&
+ (nla_put_u32(skb, NHA_HW_STATS_ENABLE, nhg->hw_stats) ||
+ nla_put_nh_group_stats(skb, nh, op_flags)))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh,
+ int event, u32 portid, u32 seq, unsigned int nlflags,
+ u32 op_flags)
+{
+ struct fib6_nh *fib6_nh;
+ struct fib_nh *fib_nh;
+ struct nlmsghdr *nlh;
+ struct nh_info *nhi;
+ struct nhmsg *nhm;
+
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nhm), nlflags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ nhm = nlmsg_data(nlh);
+ nhm->nh_family = AF_UNSPEC;
+ nhm->nh_flags = nh->nh_flags;
+ nhm->nh_protocol = nh->protocol;
+ nhm->nh_scope = 0;
+ nhm->resvd = 0;
+
+ if (nla_put_u32(skb, NHA_ID, nh->id))
+ goto nla_put_failure;
+
+ if (nh->is_group) {
+ struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
+ u32 resp_op_flags = 0;
+
+ if (nhg->fdb_nh && nla_put_flag(skb, NHA_FDB))
+ goto nla_put_failure;
+ if (nla_put_nh_group(skb, nh, op_flags, &resp_op_flags) ||
+ nla_put_u32(skb, NHA_OP_FLAGS, resp_op_flags))
+ goto nla_put_failure;
+ goto out;
+ }
+
+ nhi = rtnl_dereference(nh->nh_info);
+ nhm->nh_family = nhi->family;
+ if (nhi->reject_nh) {
+ if (nla_put_flag(skb, NHA_BLACKHOLE))
+ goto nla_put_failure;
+ goto out;
+ } else if (nhi->fdb_nh) {
+ if (nla_put_flag(skb, NHA_FDB))
+ goto nla_put_failure;
+ } else {
+ const struct net_device *dev;
+
+ dev = nhi->fib_nhc.nhc_dev;
+ if (dev && nla_put_u32(skb, NHA_OIF, dev->ifindex))
+ goto nla_put_failure;
+ }
+
+ nhm->nh_scope = nhi->fib_nhc.nhc_scope;
+ switch (nhi->family) {
+ case AF_INET:
+ fib_nh = &nhi->fib_nh;
+ if (fib_nh->fib_nh_gw_family &&
+ nla_put_be32(skb, NHA_GATEWAY, fib_nh->fib_nh_gw4))
+ goto nla_put_failure;
+ break;
+
+ case AF_INET6:
+ fib6_nh = &nhi->fib6_nh;
+ if (fib6_nh->fib_nh_gw_family &&
+ nla_put_in6_addr(skb, NHA_GATEWAY, &fib6_nh->fib_nh_gw6))
+ goto nla_put_failure;
+ break;
+ }
+
+ if (lwtunnel_fill_encap(skb, nhi->fib_nhc.nhc_lwtstate,
+ NHA_ENCAP, NHA_ENCAP_TYPE) < 0)
+ goto nla_put_failure;
+
+out:
+ nlmsg_end(skb, nlh);
+ return 0;
+
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+static size_t nh_nlmsg_size_grp_res(struct nh_group *nhg)
+{
+ return nla_total_size(0) + /* NHA_RES_GROUP */
+ nla_total_size(2) + /* NHA_RES_GROUP_BUCKETS */
+ nla_total_size(4) + /* NHA_RES_GROUP_IDLE_TIMER */
+ nla_total_size(4) + /* NHA_RES_GROUP_UNBALANCED_TIMER */
+ nla_total_size_64bit(8);/* NHA_RES_GROUP_UNBALANCED_TIME */
+}
+
+static size_t nh_nlmsg_size_grp(struct nexthop *nh)
+{
+ struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
+ size_t sz = sizeof(struct nexthop_grp) * nhg->num_nh;
+ size_t tot = nla_total_size(sz) +
+ nla_total_size(2); /* NHA_GROUP_TYPE */
+
+ if (nhg->resilient)
+ tot += nh_nlmsg_size_grp_res(nhg);
+
+ return tot;
+}
+
+static size_t nh_nlmsg_size_single(struct nexthop *nh)
+{
+ struct nh_info *nhi = rtnl_dereference(nh->nh_info);
+ size_t sz;
+
+ /* covers NHA_BLACKHOLE since NHA_OIF and BLACKHOLE
+ * are mutually exclusive
+ */
+ sz = nla_total_size(4); /* NHA_OIF */
+
+ switch (nhi->family) {
+ case AF_INET:
+ if (nhi->fib_nh.fib_nh_gw_family)
+ sz += nla_total_size(4); /* NHA_GATEWAY */
+ break;
+
+ case AF_INET6:
+ /* NHA_GATEWAY */
+ if (nhi->fib6_nh.fib_nh_gw_family)
+ sz += nla_total_size(sizeof(const struct in6_addr));
+ break;
+ }
+
+ if (nhi->fib_nhc.nhc_lwtstate) {
+ sz += lwtunnel_get_encap_size(nhi->fib_nhc.nhc_lwtstate);
+ sz += nla_total_size(2); /* NHA_ENCAP_TYPE */
+ }
+
+ return sz;
+}
+
+static size_t nh_nlmsg_size(struct nexthop *nh)
+{
+ size_t sz = NLMSG_ALIGN(sizeof(struct nhmsg));
+
+ sz += nla_total_size(4); /* NHA_ID */
+
+ if (nh->is_group)
+ sz += nh_nlmsg_size_grp(nh) +
+ nla_total_size(4) + /* NHA_OP_FLAGS */
+ 0;
+ else
+ sz += nh_nlmsg_size_single(nh);
+
+ return sz;
+}
+
+static void nexthop_notify(int event, struct nexthop *nh, struct nl_info *info)
+{
+ unsigned int nlflags = info->nlh ? info->nlh->nlmsg_flags : 0;
+ u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
+ struct sk_buff *skb;
+ int err = -ENOBUFS;
+
+ skb = nlmsg_new(nh_nlmsg_size(nh), gfp_any());
+ if (!skb)
+ goto errout;
+
+ err = nh_fill_node(skb, nh, event, info->portid, seq, nlflags, 0);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in nh_nlmsg_size() */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto errout;
+ }
+
+ rtnl_notify(skb, info->nl_net, info->portid, RTNLGRP_NEXTHOP,
+ info->nlh, gfp_any());
+ return;
+errout:
+ rtnl_set_sk_err(info->nl_net, RTNLGRP_NEXTHOP, err);
+}
+
+static unsigned long nh_res_bucket_used_time(const struct nh_res_bucket *bucket)
+{
+ return (unsigned long)atomic_long_read(&bucket->used_time);
+}
+
+static unsigned long
+nh_res_bucket_idle_point(const struct nh_res_table *res_table,
+ const struct nh_res_bucket *bucket,
+ unsigned long now)
+{
+ unsigned long time = nh_res_bucket_used_time(bucket);
+
+ /* Bucket was not used since it was migrated. The idle time is now. */
+ if (time == bucket->migrated_time)
+ return now;
+
+ return time + res_table->idle_timer;
+}
+
+static unsigned long
+nh_res_table_unb_point(const struct nh_res_table *res_table)
+{
+ return res_table->unbalanced_since + res_table->unbalanced_timer;
+}
+
+static void nh_res_bucket_set_idle(const struct nh_res_table *res_table,
+ struct nh_res_bucket *bucket)
+{
+ unsigned long now = jiffies;
+
+ atomic_long_set(&bucket->used_time, (long)now);
+ bucket->migrated_time = now;
+}
+
+static void nh_res_bucket_set_busy(struct nh_res_bucket *bucket)
+{
+ atomic_long_set(&bucket->used_time, (long)jiffies);
+}
+
+static clock_t nh_res_bucket_idle_time(const struct nh_res_bucket *bucket)
+{
+ unsigned long used_time = nh_res_bucket_used_time(bucket);
+
+ return jiffies_delta_to_clock_t(jiffies - used_time);
+}
+
+static int nh_fill_res_bucket(struct sk_buff *skb, struct nexthop *nh,
+ struct nh_res_bucket *bucket, u16 bucket_index,
+ int event, u32 portid, u32 seq,
+ unsigned int nlflags,
+ struct netlink_ext_ack *extack)
+{
+ struct nh_grp_entry *nhge = nh_res_dereference(bucket->nh_entry);
+ struct nlmsghdr *nlh;
+ struct nlattr *nest;
+ struct nhmsg *nhm;
+
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nhm), nlflags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ nhm = nlmsg_data(nlh);
+ nhm->nh_family = AF_UNSPEC;
+ nhm->nh_flags = bucket->nh_flags;
+ nhm->nh_protocol = nh->protocol;
+ nhm->nh_scope = 0;
+ nhm->resvd = 0;
+
+ if (nla_put_u32(skb, NHA_ID, nh->id))
+ goto nla_put_failure;
+
+ nest = nla_nest_start(skb, NHA_RES_BUCKET);
+ if (!nest)
+ goto nla_put_failure;
+
+ if (nla_put_u16(skb, NHA_RES_BUCKET_INDEX, bucket_index) ||
+ nla_put_u32(skb, NHA_RES_BUCKET_NH_ID, nhge->nh->id) ||
+ nla_put_u64_64bit(skb, NHA_RES_BUCKET_IDLE_TIME,
+ nh_res_bucket_idle_time(bucket),
+ NHA_RES_BUCKET_PAD))
+ goto nla_put_failure_nest;
+
+ nla_nest_end(skb, nest);
+ nlmsg_end(skb, nlh);
+ return 0;
+
+nla_put_failure_nest:
+ nla_nest_cancel(skb, nest);
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+static void nexthop_bucket_notify(struct nh_res_table *res_table,
+ u16 bucket_index)
+{
+ struct nh_res_bucket *bucket = &res_table->nh_buckets[bucket_index];
+ struct nh_grp_entry *nhge = nh_res_dereference(bucket->nh_entry);
+ struct nexthop *nh = nhge->nh_parent;
+ struct sk_buff *skb;
+ int err = -ENOBUFS;
+
+ skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb)
+ goto errout;
+
+ err = nh_fill_res_bucket(skb, nh, bucket, bucket_index,
+ RTM_NEWNEXTHOPBUCKET, 0, 0, NLM_F_REPLACE,
+ NULL);
+ if (err < 0) {
+ kfree_skb(skb);
+ goto errout;
+ }
+
+ rtnl_notify(skb, nh->net, 0, RTNLGRP_NEXTHOP, NULL, GFP_KERNEL);
+ return;
+errout:
+ rtnl_set_sk_err(nh->net, RTNLGRP_NEXTHOP, err);
+}
+
+static bool valid_group_nh(struct nexthop *nh, unsigned int npaths,
+ bool *is_fdb, struct netlink_ext_ack *extack)
+{
+ if (nh->is_group) {
+ struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
+
+ /* Nesting groups within groups is not supported. */
+ if (nhg->hash_threshold) {
+ NL_SET_ERR_MSG(extack,
+ "Hash-threshold group can not be a nexthop within a group");
+ return false;
+ }
+ if (nhg->resilient) {
+ NL_SET_ERR_MSG(extack,
+ "Resilient group can not be a nexthop within a group");
+ return false;
+ }
+ *is_fdb = nhg->fdb_nh;
+ } else {
+ struct nh_info *nhi = rtnl_dereference(nh->nh_info);
+
+ if (nhi->reject_nh && npaths > 1) {
+ NL_SET_ERR_MSG(extack,
+ "Blackhole nexthop can not be used in a group with more than 1 path");
+ return false;
+ }
+ *is_fdb = nhi->fdb_nh;
+ }
+
+ return true;
+}
+
+static int nh_check_attr_fdb_group(struct nexthop *nh, u8 *nh_family,
+ struct netlink_ext_ack *extack)
+{
+ struct nh_info *nhi;
+
+ nhi = rtnl_dereference(nh->nh_info);
+
+ if (!nhi->fdb_nh) {
+ NL_SET_ERR_MSG(extack, "FDB nexthop group can only have fdb nexthops");
+ return -EINVAL;
+ }
+
+ if (*nh_family == AF_UNSPEC) {
+ *nh_family = nhi->family;
+ } else if (*nh_family != nhi->family) {
+ NL_SET_ERR_MSG(extack, "FDB nexthop group cannot have mixed family nexthops");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int nh_check_attr_group(struct net *net,
+ struct nlattr *tb[], size_t tb_size,
+ u16 nh_grp_type, struct netlink_ext_ack *extack)
+{
+ unsigned int len = nla_len(tb[NHA_GROUP]);
+ struct nexthop_grp *nhg;
+ unsigned int i, j;
+
+ if (!len || len & (sizeof(struct nexthop_grp) - 1)) {
+ NL_SET_ERR_MSG(extack,
+ "Invalid length for nexthop group attribute");
+ return -EINVAL;
+ }
+
+ /* convert len to number of nexthop ids */
+ len /= sizeof(*nhg);
+
+ nhg = nla_data(tb[NHA_GROUP]);
+ for (i = 0; i < len; ++i) {
+ if (nhg[i].resvd2) {
+ NL_SET_ERR_MSG(extack, "Reserved field in nexthop_grp must be 0");
+ return -EINVAL;
+ }
+ if (nexthop_grp_weight(&nhg[i]) == 0) {
+ /* 0xffff got passed in, representing weight of 0x10000,
+ * which is too heavy.
+ */
+ NL_SET_ERR_MSG(extack, "Invalid value for weight");
+ return -EINVAL;
+ }
+ for (j = i + 1; j < len; ++j) {
+ if (nhg[i].id == nhg[j].id) {
+ NL_SET_ERR_MSG(extack, "Nexthop id can not be used twice in a group");
+ return -EINVAL;
+ }
+ }
+ }
+
+ nhg = nla_data(tb[NHA_GROUP]);
+ for (i = NHA_GROUP_TYPE + 1; i < tb_size; ++i) {
+ if (!tb[i])
+ continue;
+ switch (i) {
+ case NHA_HW_STATS_ENABLE:
+ case NHA_FDB:
+ continue;
+ case NHA_RES_GROUP:
+ if (nh_grp_type == NEXTHOP_GRP_TYPE_RES)
+ continue;
+ break;
+ }
+ NL_SET_ERR_MSG(extack,
+ "No other attributes can be set in nexthop groups");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int nh_check_attr_group_rtnl(struct net *net, struct nlattr *tb[],
+ struct netlink_ext_ack *extack)
+{
+ u8 nh_family = AF_UNSPEC;
+ struct nexthop_grp *nhg;
+ unsigned int len;
+ unsigned int i;
+ u8 nhg_fdb;
+
+ len = nla_len(tb[NHA_GROUP]) / sizeof(*nhg);
+ nhg = nla_data(tb[NHA_GROUP]);
+ nhg_fdb = !!tb[NHA_FDB];
+
+ for (i = 0; i < len; i++) {
+ struct nexthop *nh;
+ bool is_fdb_nh;
+
+ nh = nexthop_find_by_id(net, nhg[i].id);
+ if (!nh) {
+ NL_SET_ERR_MSG(extack, "Invalid nexthop id");
+ return -EINVAL;
+ }
+ if (!valid_group_nh(nh, len, &is_fdb_nh, extack))
+ return -EINVAL;
+
+ if (nhg_fdb && nh_check_attr_fdb_group(nh, &nh_family, extack))
+ return -EINVAL;
+
+ if (!nhg_fdb && is_fdb_nh) {
+ NL_SET_ERR_MSG(extack, "Non FDB nexthop group cannot have fdb nexthops");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static bool ipv6_good_nh(const struct fib6_nh *nh)
+{
+ int state = NUD_REACHABLE;
+ struct neighbour *n;
+
+ rcu_read_lock();
+
+ n = __ipv6_neigh_lookup_noref_stub(nh->fib_nh_dev, &nh->fib_nh_gw6);
+ if (n)
+ state = READ_ONCE(n->nud_state);
+
+ rcu_read_unlock();
+
+ return !!(state & NUD_VALID);
+}
+
+static bool ipv4_good_nh(const struct fib_nh *nh)
+{
+ int state = NUD_REACHABLE;
+ struct neighbour *n;
+
+ rcu_read_lock();
+
+ n = __ipv4_neigh_lookup_noref(nh->fib_nh_dev,
+ (__force u32)nh->fib_nh_gw4);
+ if (n)
+ state = READ_ONCE(n->nud_state);
+
+ rcu_read_unlock();
+
+ return !!(state & NUD_VALID);
+}
+
+static bool nexthop_is_good_nh(const struct nexthop *nh)
+{
+ struct nh_info *nhi = rcu_dereference(nh->nh_info);
+
+ switch (nhi->family) {
+ case AF_INET:
+ return ipv4_good_nh(&nhi->fib_nh);
+ case AF_INET6:
+ return ipv6_good_nh(&nhi->fib6_nh);
+ }
+
+ return false;
+}
+
+static struct nexthop *nexthop_select_path_fdb(struct nh_group *nhg, int hash)
+{
+ int i;
+
+ for (i = 0; i < nhg->num_nh; i++) {
+ struct nh_grp_entry *nhge = &nhg->nh_entries[i];
+
+ if (hash > atomic_read(&nhge->hthr.upper_bound))
+ continue;
+
+ nh_grp_entry_stats_inc(nhge);
+ return nhge->nh;
+ }
+
+ WARN_ON_ONCE(1);
+ return NULL;
+}
+
+static struct nexthop *nexthop_select_path_hthr(struct nh_group *nhg, int hash)
+{
+ struct nh_grp_entry *nhge0 = NULL;
+ int i;
+
+ if (nhg->fdb_nh)
+ return nexthop_select_path_fdb(nhg, hash);
+
+ for (i = 0; i < nhg->num_nh; ++i) {
+ struct nh_grp_entry *nhge = &nhg->nh_entries[i];
+
+ /* nexthops always check if it is good and does
+ * not rely on a sysctl for this behavior
+ */
+ if (!nexthop_is_good_nh(nhge->nh))
+ continue;
+
+ if (!nhge0)
+ nhge0 = nhge;
+
+ if (hash > atomic_read(&nhge->hthr.upper_bound))
+ continue;
+
+ nh_grp_entry_stats_inc(nhge);
+ return nhge->nh;
+ }
+
+ if (!nhge0)
+ nhge0 = &nhg->nh_entries[0];
+ nh_grp_entry_stats_inc(nhge0);
+ return nhge0->nh;
+}
+
+static struct nexthop *nexthop_select_path_res(struct nh_group *nhg, int hash)
+{
+ struct nh_res_table *res_table = rcu_dereference(nhg->res_table);
+ u16 bucket_index = hash % res_table->num_nh_buckets;
+ struct nh_res_bucket *bucket;
+ struct nh_grp_entry *nhge;
+
+ /* nexthop_select_path() is expected to return a non-NULL value, so
+ * skip protocol validation and just hand out whatever there is.
+ */
+ bucket = &res_table->nh_buckets[bucket_index];
+ nh_res_bucket_set_busy(bucket);
+ nhge = rcu_dereference(bucket->nh_entry);
+ nh_grp_entry_stats_inc(nhge);
+ return nhge->nh;
+}
+
+struct nexthop *nexthop_select_path(struct nexthop *nh, int hash)
+{
+ struct nh_group *nhg;
+
+ if (!nh->is_group)
+ return nh;
+
+ nhg = rcu_dereference(nh->nh_grp);
+ if (nhg->hash_threshold)
+ return nexthop_select_path_hthr(nhg, hash);
+ else if (nhg->resilient)
+ return nexthop_select_path_res(nhg, hash);
+
+ /* Unreachable. */
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(nexthop_select_path);
+
+int nexthop_for_each_fib6_nh(struct nexthop *nh,
+ int (*cb)(struct fib6_nh *nh, void *arg),
+ void *arg)
+{
+ struct nh_info *nhi;
+ int err;
+
+ if (nh->is_group) {
+ struct nh_group *nhg;
+ int i;
+
+ nhg = rcu_dereference_rtnl(nh->nh_grp);
+ for (i = 0; i < nhg->num_nh; i++) {
+ struct nh_grp_entry *nhge = &nhg->nh_entries[i];
+
+ nhi = rcu_dereference_rtnl(nhge->nh->nh_info);
+ err = cb(&nhi->fib6_nh, arg);
+ if (err)
+ return err;
+ }
+ } else {
+ nhi = rcu_dereference_rtnl(nh->nh_info);
+ err = cb(&nhi->fib6_nh, arg);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nexthop_for_each_fib6_nh);
+
+static int check_src_addr(const struct in6_addr *saddr,
+ struct netlink_ext_ack *extack)
+{
+ if (!ipv6_addr_any(saddr)) {
+ NL_SET_ERR_MSG(extack, "IPv6 routes using source address can not use nexthop objects");
+ return -EINVAL;
+ }
+ return 0;
+}
+
+int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg,
+ struct netlink_ext_ack *extack)
+{
+ struct nh_info *nhi;
+ bool is_fdb_nh;
+
+ /* fib6_src is unique to a fib6_info and limits the ability to cache
+ * routes in fib6_nh within a nexthop that is potentially shared
+ * across multiple fib entries. If the config wants to use source
+ * routing it can not use nexthop objects. mlxsw also does not allow
+ * fib6_src on routes.
+ */
+ if (cfg && check_src_addr(&cfg->fc_src, extack) < 0)
+ return -EINVAL;
+
+ if (nh->is_group) {
+ struct nh_group *nhg;
+
+ nhg = rcu_dereference_rtnl(nh->nh_grp);
+ if (nhg->has_v4)
+ goto no_v4_nh;
+ is_fdb_nh = nhg->fdb_nh;
+ } else {
+ nhi = rcu_dereference_rtnl(nh->nh_info);
+ if (nhi->family == AF_INET)
+ goto no_v4_nh;
+ is_fdb_nh = nhi->fdb_nh;
+ }
+
+ if (is_fdb_nh) {
+ NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop");
+ return -EINVAL;
+ }
+
+ return 0;
+no_v4_nh:
+ NL_SET_ERR_MSG(extack, "IPv6 routes can not use an IPv4 nexthop");
+ return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(fib6_check_nexthop);
+
+/* if existing nexthop has ipv6 routes linked to it, need
+ * to verify this new spec works with ipv6
+ */
+static int fib6_check_nh_list(struct nexthop *old, struct nexthop *new,
+ struct netlink_ext_ack *extack)
+{
+ struct fib6_info *f6i;
+
+ if (list_empty(&old->f6i_list))
+ return 0;
+
+ list_for_each_entry(f6i, &old->f6i_list, nh_list) {
+ if (check_src_addr(&f6i->fib6_src.addr, extack) < 0)
+ return -EINVAL;
+ }
+
+ return fib6_check_nexthop(new, NULL, extack);
+}
+
+static int nexthop_check_scope(struct nh_info *nhi, u8 scope,
+ struct netlink_ext_ack *extack)
+{
+ if (scope == RT_SCOPE_HOST && nhi->fib_nhc.nhc_gw_family) {
+ NL_SET_ERR_MSG(extack,
+ "Route with host scope can not have a gateway");
+ return -EINVAL;
+ }
+
+ if (nhi->fib_nhc.nhc_flags & RTNH_F_ONLINK && scope >= RT_SCOPE_LINK) {
+ NL_SET_ERR_MSG(extack, "Scope mismatch with nexthop");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/* Invoked by fib add code to verify nexthop by id is ok with
+ * config for prefix; parts of fib_check_nh not done when nexthop
+ * object is used.
+ */
+int fib_check_nexthop(struct nexthop *nh, u8 scope,
+ struct netlink_ext_ack *extack)
+{
+ struct nh_info *nhi;
+ int err = 0;
+
+ if (nh->is_group) {
+ struct nh_group *nhg;
+
+ nhg = rtnl_dereference(nh->nh_grp);
+ if (nhg->fdb_nh) {
+ NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop");
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (scope == RT_SCOPE_HOST) {
+ NL_SET_ERR_MSG(extack, "Route with host scope can not have multiple nexthops");
+ err = -EINVAL;
+ goto out;
+ }
+
+ /* all nexthops in a group have the same scope */
+ nhi = rtnl_dereference(nhg->nh_entries[0].nh->nh_info);
+ err = nexthop_check_scope(nhi, scope, extack);
+ } else {
+ nhi = rtnl_dereference(nh->nh_info);
+ if (nhi->fdb_nh) {
+ NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop");
+ err = -EINVAL;
+ goto out;
+ }
+ err = nexthop_check_scope(nhi, scope, extack);
+ }
+
+out:
+ return err;
+}
+
+static int fib_check_nh_list(struct nexthop *old, struct nexthop *new,
+ struct netlink_ext_ack *extack)
+{
+ struct fib_info *fi;
+
+ list_for_each_entry(fi, &old->fi_list, nh_list) {
+ int err;
+
+ err = fib_check_nexthop(new, fi->fib_scope, extack);
+ if (err)
+ return err;
+ }
+ return 0;
+}
+
+static bool nh_res_nhge_is_balanced(const struct nh_grp_entry *nhge)
+{
+ return nhge->res.count_buckets == nhge->res.wants_buckets;
+}
+
+static bool nh_res_nhge_is_ow(const struct nh_grp_entry *nhge)
+{
+ return nhge->res.count_buckets > nhge->res.wants_buckets;
+}
+
+static bool nh_res_nhge_is_uw(const struct nh_grp_entry *nhge)
+{
+ return nhge->res.count_buckets < nhge->res.wants_buckets;
+}
+
+static bool nh_res_table_is_balanced(const struct nh_res_table *res_table)
+{
+ return list_empty(&res_table->uw_nh_entries);
+}
+
+static void nh_res_bucket_unset_nh(struct nh_res_bucket *bucket)
+{
+ struct nh_grp_entry *nhge;
+
+ if (bucket->occupied) {
+ nhge = nh_res_dereference(bucket->nh_entry);
+ nhge->res.count_buckets--;
+ bucket->occupied = false;
+ }
+}
+
+static void nh_res_bucket_set_nh(struct nh_res_bucket *bucket,
+ struct nh_grp_entry *nhge)
+{
+ nh_res_bucket_unset_nh(bucket);
+
+ bucket->occupied = true;
+ rcu_assign_pointer(bucket->nh_entry, nhge);
+ nhge->res.count_buckets++;
+}
+
+static bool nh_res_bucket_should_migrate(struct nh_res_table *res_table,
+ struct nh_res_bucket *bucket,
+ unsigned long *deadline, bool *force)
+{
+ unsigned long now = jiffies;
+ struct nh_grp_entry *nhge;
+ unsigned long idle_point;
+
+ if (!bucket->occupied) {
+ /* The bucket is not occupied, its NHGE pointer is either
+ * NULL or obsolete. We _have to_ migrate: set force.
+ */
+ *force = true;
+ return true;
+ }
+
+ nhge = nh_res_dereference(bucket->nh_entry);
+
+ /* If the bucket is populated by an underweight or balanced
+ * nexthop, do not migrate.
+ */
+ if (!nh_res_nhge_is_ow(nhge))
+ return false;
+
+ /* At this point we know that the bucket is populated with an
+ * overweight nexthop. It needs to be migrated to a new nexthop if
+ * the idle timer of unbalanced timer expired.
+ */
+
+ idle_point = nh_res_bucket_idle_point(res_table, bucket, now);
+ if (time_after_eq(now, idle_point)) {
+ /* The bucket is idle. We _can_ migrate: unset force. */
+ *force = false;
+ return true;
+ }
+
+ /* Unbalanced timer of 0 means "never force". */
+ if (res_table->unbalanced_timer) {
+ unsigned long unb_point;
+
+ unb_point = nh_res_table_unb_point(res_table);
+ if (time_after(now, unb_point)) {
+ /* The bucket is not idle, but the unbalanced timer
+ * expired. We _can_ migrate, but set force anyway,
+ * so that drivers know to ignore activity reports
+ * from the HW.
+ */
+ *force = true;
+ return true;
+ }
+
+ nh_res_time_set_deadline(unb_point, deadline);
+ }
+
+ nh_res_time_set_deadline(idle_point, deadline);
+ return false;
+}
+
+static bool nh_res_bucket_migrate(struct nh_res_table *res_table,
+ u16 bucket_index, bool notify,
+ bool notify_nl, bool force)
+{
+ struct nh_res_bucket *bucket = &res_table->nh_buckets[bucket_index];
+ struct nh_grp_entry *new_nhge;
+ struct netlink_ext_ack extack;
+ int err;
+
+ new_nhge = list_first_entry_or_null(&res_table->uw_nh_entries,
+ struct nh_grp_entry,
+ res.uw_nh_entry);
+ if (WARN_ON_ONCE(!new_nhge))
+ /* If this function is called, "bucket" is either not
+ * occupied, or it belongs to a next hop that is
+ * overweight. In either case, there ought to be a
+ * corresponding underweight next hop.
+ */
+ return false;
+
+ if (notify) {
+ struct nh_grp_entry *old_nhge;
+
+ old_nhge = nh_res_dereference(bucket->nh_entry);
+ err = call_nexthop_res_bucket_notifiers(res_table->net,
+ res_table->nhg_id,
+ bucket_index, force,
+ old_nhge->nh,
+ new_nhge->nh, &extack);
+ if (err) {
+ pr_err_ratelimited("%s\n", extack._msg);
+ if (!force)
+ return false;
+ /* It is not possible to veto a forced replacement, so
+ * just clear the hardware flags from the nexthop
+ * bucket to indicate to user space that this bucket is
+ * not correctly populated in hardware.
+ */
+ bucket->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP);
+ }
+ }
+
+ nh_res_bucket_set_nh(bucket, new_nhge);
+ nh_res_bucket_set_idle(res_table, bucket);
+
+ if (notify_nl)
+ nexthop_bucket_notify(res_table, bucket_index);
+
+ if (nh_res_nhge_is_balanced(new_nhge))
+ list_del(&new_nhge->res.uw_nh_entry);
+ return true;
+}
+
+#define NH_RES_UPKEEP_DW_MINIMUM_INTERVAL (HZ / 2)
+
+static void nh_res_table_upkeep(struct nh_res_table *res_table,
+ bool notify, bool notify_nl)
+{
+ unsigned long now = jiffies;
+ unsigned long deadline;
+ u16 i;
+
+ /* Deadline is the next time that upkeep should be run. It is the
+ * earliest time at which one of the buckets might be migrated.
+ * Start at the most pessimistic estimate: either unbalanced_timer
+ * from now, or if there is none, idle_timer from now. For each
+ * encountered time point, call nh_res_time_set_deadline() to
+ * refine the estimate.
+ */
+ if (res_table->unbalanced_timer)
+ deadline = now + res_table->unbalanced_timer;
+ else
+ deadline = now + res_table->idle_timer;
+
+ for (i = 0; i < res_table->num_nh_buckets; i++) {
+ struct nh_res_bucket *bucket = &res_table->nh_buckets[i];
+ bool force;
+
+ if (nh_res_bucket_should_migrate(res_table, bucket,
+ &deadline, &force)) {
+ if (!nh_res_bucket_migrate(res_table, i, notify,
+ notify_nl, force)) {
+ unsigned long idle_point;
+
+ /* A driver can override the migration
+ * decision if the HW reports that the
+ * bucket is actually not idle. Therefore
+ * remark the bucket as busy again and
+ * update the deadline.
+ */
+ nh_res_bucket_set_busy(bucket);
+ idle_point = nh_res_bucket_idle_point(res_table,
+ bucket,
+ now);
+ nh_res_time_set_deadline(idle_point, &deadline);
+ }
+ }
+ }
+
+ /* If the group is still unbalanced, schedule the next upkeep to
+ * either the deadline computed above, or the minimum deadline,
+ * whichever comes later.
+ */
+ if (!nh_res_table_is_balanced(res_table)) {
+ unsigned long now = jiffies;
+ unsigned long min_deadline;
+
+ min_deadline = now + NH_RES_UPKEEP_DW_MINIMUM_INTERVAL;
+ if (time_before(deadline, min_deadline))
+ deadline = min_deadline;
+
+ queue_delayed_work(system_power_efficient_wq,
+ &res_table->upkeep_dw, deadline - now);
+ }
+}
+
+static void nh_res_table_upkeep_dw(struct work_struct *work)
+{
+ struct delayed_work *dw = to_delayed_work(work);
+ struct nh_res_table *res_table;
+
+ res_table = container_of(dw, struct nh_res_table, upkeep_dw);
+ nh_res_table_upkeep(res_table, true, true);
+}
+
+static void nh_res_table_cancel_upkeep(struct nh_res_table *res_table)
+{
+ cancel_delayed_work_sync(&res_table->upkeep_dw);
+}
+
+static void nh_res_group_rebalance(struct nh_group *nhg,
+ struct nh_res_table *res_table)
+{
+ u16 prev_upper_bound = 0;
+ u32 total = 0;
+ u32 w = 0;
+ int i;
+
+ INIT_LIST_HEAD(&res_table->uw_nh_entries);
+
+ for (i = 0; i < nhg->num_nh; ++i)
+ total += nhg->nh_entries[i].weight;
+
+ for (i = 0; i < nhg->num_nh; ++i) {
+ struct nh_grp_entry *nhge = &nhg->nh_entries[i];
+ u16 upper_bound;
+ u64 btw;
+
+ w += nhge->weight;
+ btw = ((u64)res_table->num_nh_buckets) * w;
+ upper_bound = DIV_ROUND_CLOSEST_ULL(btw, total);
+ nhge->res.wants_buckets = upper_bound - prev_upper_bound;
+ prev_upper_bound = upper_bound;
+
+ if (nh_res_nhge_is_uw(nhge)) {
+ if (list_empty(&res_table->uw_nh_entries))
+ res_table->unbalanced_since = jiffies;
+ list_add(&nhge->res.uw_nh_entry,
+ &res_table->uw_nh_entries);
+ }
+ }
+}
+
+/* Migrate buckets in res_table so that they reference NHGE's from NHG with
+ * the right NH ID. Set those buckets that do not have a corresponding NHGE
+ * entry in NHG as not occupied.
+ */
+static void nh_res_table_migrate_buckets(struct nh_res_table *res_table,
+ struct nh_group *nhg)
+{
+ u16 i;
+
+ for (i = 0; i < res_table->num_nh_buckets; i++) {
+ struct nh_res_bucket *bucket = &res_table->nh_buckets[i];
+ u32 id = rtnl_dereference(bucket->nh_entry)->nh->id;
+ bool found = false;
+ int j;
+
+ for (j = 0; j < nhg->num_nh; j++) {
+ struct nh_grp_entry *nhge = &nhg->nh_entries[j];
+
+ if (nhge->nh->id == id) {
+ nh_res_bucket_set_nh(bucket, nhge);
+ found = true;
+ break;
+ }
+ }
+
+ if (!found)
+ nh_res_bucket_unset_nh(bucket);
+ }
+}
+
+static void replace_nexthop_grp_res(struct nh_group *oldg,
+ struct nh_group *newg)
+{
+ /* For NH group replacement, the new NHG might only have a stub
+ * hash table with 0 buckets, because the number of buckets was not
+ * specified. For NH removal, oldg and newg both reference the same
+ * res_table. So in any case, in the following, we want to work
+ * with oldg->res_table.
+ */
+ struct nh_res_table *old_res_table = rtnl_dereference(oldg->res_table);
+ unsigned long prev_unbalanced_since = old_res_table->unbalanced_since;
+ bool prev_has_uw = !list_empty(&old_res_table->uw_nh_entries);
+
+ nh_res_table_cancel_upkeep(old_res_table);
+ nh_res_table_migrate_buckets(old_res_table, newg);
+ nh_res_group_rebalance(newg, old_res_table);
+ if (prev_has_uw && !list_empty(&old_res_table->uw_nh_entries))
+ old_res_table->unbalanced_since = prev_unbalanced_since;
+ nh_res_table_upkeep(old_res_table, true, false);
+}
+
+static void nh_hthr_group_rebalance(struct nh_group *nhg)
+{
+ u32 total = 0;
+ u32 w = 0;
+ int i;
+
+ for (i = 0; i < nhg->num_nh; ++i)
+ total += nhg->nh_entries[i].weight;
+
+ for (i = 0; i < nhg->num_nh; ++i) {
+ struct nh_grp_entry *nhge = &nhg->nh_entries[i];
+ u32 upper_bound;
+
+ w += nhge->weight;
+ upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31, total) - 1;
+ atomic_set(&nhge->hthr.upper_bound, upper_bound);
+ }
+}
+
+static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge,
+ struct nl_info *nlinfo)
+{
+ struct nh_grp_entry *nhges, *new_nhges;
+ struct nexthop *nhp = nhge->nh_parent;
+ struct netlink_ext_ack extack;
+ struct nexthop *nh = nhge->nh;
+ struct nh_group *nhg, *newg;
+ int i, j, err;
+
+ WARN_ON(!nh);
+
+ nhg = rtnl_dereference(nhp->nh_grp);
+ newg = nhg->spare;
+
+ /* last entry, keep it visible and remove the parent */
+ if (nhg->num_nh == 1) {
+ remove_nexthop(net, nhp, nlinfo);
+ return;
+ }
+
+ newg->has_v4 = false;
+ newg->is_multipath = nhg->is_multipath;
+ newg->hash_threshold = nhg->hash_threshold;
+ newg->resilient = nhg->resilient;
+ newg->fdb_nh = nhg->fdb_nh;
+ newg->num_nh = nhg->num_nh;
+
+ /* copy old entries to new except the one getting removed */
+ nhges = nhg->nh_entries;
+ new_nhges = newg->nh_entries;
+ for (i = 0, j = 0; i < nhg->num_nh; ++i) {
+ struct nh_info *nhi;
+
+ /* current nexthop getting removed */
+ if (nhg->nh_entries[i].nh == nh) {
+ newg->num_nh--;
+ continue;
+ }
+
+ nhi = rtnl_dereference(nhges[i].nh->nh_info);
+ if (nhi->family == AF_INET)
+ newg->has_v4 = true;
+
+ list_del(&nhges[i].nh_list);
+ new_nhges[j].stats = nhges[i].stats;
+ new_nhges[j].nh_parent = nhges[i].nh_parent;
+ new_nhges[j].nh = nhges[i].nh;
+ new_nhges[j].weight = nhges[i].weight;
+ list_add(&new_nhges[j].nh_list, &new_nhges[j].nh->grp_list);
+ j++;
+ }
+
+ if (newg->hash_threshold)
+ nh_hthr_group_rebalance(newg);
+ else if (newg->resilient)
+ replace_nexthop_grp_res(nhg, newg);
+
+ rcu_assign_pointer(nhp->nh_grp, newg);
+
+ list_del(&nhge->nh_list);
+ free_percpu(nhge->stats);
+ nexthop_put(nhge->nh);
+
+ /* Removal of a NH from a resilient group is notified through
+ * bucket notifications.
+ */
+ if (newg->hash_threshold) {
+ err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, nhp,
+ &extack);
+ if (err)
+ pr_err("%s\n", extack._msg);
+ }
+
+ if (nlinfo)
+ nexthop_notify(RTM_NEWNEXTHOP, nhp, nlinfo);
+}
+
+static void remove_nexthop_from_groups(struct net *net, struct nexthop *nh,
+ struct nl_info *nlinfo)
+{
+ struct nh_grp_entry *nhge, *tmp;
+
+ /* If there is nothing to do, let's avoid the costly call to
+ * synchronize_net()
+ */
+ if (list_empty(&nh->grp_list))
+ return;
+
+ list_for_each_entry_safe(nhge, tmp, &nh->grp_list, nh_list)
+ remove_nh_grp_entry(net, nhge, nlinfo);
+
+ /* make sure all see the newly published array before releasing rtnl */
+ synchronize_net();
+}
+
+static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo)
+{
+ struct nh_group *nhg = rcu_dereference_rtnl(nh->nh_grp);
+ struct nh_res_table *res_table;
+ int i, num_nh = nhg->num_nh;
+
+ for (i = 0; i < num_nh; ++i) {
+ struct nh_grp_entry *nhge = &nhg->nh_entries[i];
+
+ if (WARN_ON(!nhge->nh))
+ continue;
+
+ list_del_init(&nhge->nh_list);
+ }
+
+ if (nhg->resilient) {
+ res_table = rtnl_dereference(nhg->res_table);
+ nh_res_table_cancel_upkeep(res_table);
+ }
+}
+
+/* not called for nexthop replace */
+static void __remove_nexthop_fib(struct net *net, struct nexthop *nh)
+{
+ struct fib6_info *f6i;
+ bool do_flush = false;
+ struct fib_info *fi;
+
+ list_for_each_entry(fi, &nh->fi_list, nh_list) {
+ fi->fib_flags |= RTNH_F_DEAD;
+ do_flush = true;
+ }
+ if (do_flush)
+ fib_flush(net);
+
+ spin_lock_bh(&nh->lock);
+
+ nh->dead = true;
+
+ while (!list_empty(&nh->f6i_list)) {
+ f6i = list_first_entry(&nh->f6i_list, typeof(*f6i), nh_list);
+
+ /* __ip6_del_rt does a release, so do a hold here */
+ fib6_info_hold(f6i);
+
+ spin_unlock_bh(&nh->lock);
+ ipv6_stub->ip6_del_rt(net, f6i,
+ !READ_ONCE(net->ipv4.sysctl_nexthop_compat_mode));
+
+ spin_lock_bh(&nh->lock);
+ }
+
+ spin_unlock_bh(&nh->lock);
+}
+
+static void __remove_nexthop(struct net *net, struct nexthop *nh,
+ struct nl_info *nlinfo)
+{
+ __remove_nexthop_fib(net, nh);
+
+ if (nh->is_group) {
+ remove_nexthop_group(nh, nlinfo);
+ } else {
+ struct nh_info *nhi;
+
+ nhi = rtnl_dereference(nh->nh_info);
+ if (nhi->fib_nhc.nhc_dev)
+ hlist_del(&nhi->dev_hash);
+
+ remove_nexthop_from_groups(net, nh, nlinfo);
+ }
+}
+
+static void remove_nexthop(struct net *net, struct nexthop *nh,
+ struct nl_info *nlinfo)
+{
+ call_nexthop_notifiers(net, NEXTHOP_EVENT_DEL, nh, NULL);
+
+ /* remove from the tree */
+ rb_erase(&nh->rb_node, &net->nexthop.rb_root);
+
+ if (nlinfo)
+ nexthop_notify(RTM_DELNEXTHOP, nh, nlinfo);
+
+ __remove_nexthop(net, nh, nlinfo);
+ nh_base_seq_inc(net);
+
+ nexthop_put(nh);
+}
+
+/* if any FIB entries reference this nexthop, any dst entries
+ * need to be regenerated
+ */
+static void nh_rt_cache_flush(struct net *net, struct nexthop *nh,
+ struct nexthop *replaced_nh)
+{
+ struct fib6_info *f6i;
+ struct nh_group *nhg;
+ int i;
+
+ if (!list_empty(&nh->fi_list))
+ rt_cache_flush(net);
+
+ list_for_each_entry(f6i, &nh->f6i_list, nh_list)
+ ipv6_stub->fib6_update_sernum(net, f6i);
+
+ /* if an IPv6 group was replaced, we have to release all old
+ * dsts to make sure all refcounts are released
+ */
+ if (!replaced_nh->is_group)
+ return;
+
+ nhg = rtnl_dereference(replaced_nh->nh_grp);
+ for (i = 0; i < nhg->num_nh; i++) {
+ struct nh_grp_entry *nhge = &nhg->nh_entries[i];
+ struct nh_info *nhi = rtnl_dereference(nhge->nh->nh_info);
+
+ if (nhi->family == AF_INET6)
+ ipv6_stub->fib6_nh_release_dsts(&nhi->fib6_nh);
+ }
+}
+
+static int replace_nexthop_grp(struct net *net, struct nexthop *old,
+ struct nexthop *new, const struct nh_config *cfg,
+ struct netlink_ext_ack *extack)
+{
+ struct nh_res_table *tmp_table = NULL;
+ struct nh_res_table *new_res_table;
+ struct nh_res_table *old_res_table;
+ struct nh_group *oldg, *newg;
+ int i, err;
+
+ if (!new->is_group) {
+ NL_SET_ERR_MSG(extack, "Can not replace a nexthop group with a nexthop.");
+ return -EINVAL;
+ }
+
+ oldg = rtnl_dereference(old->nh_grp);
+ newg = rtnl_dereference(new->nh_grp);
+
+ if (newg->hash_threshold != oldg->hash_threshold) {
+ NL_SET_ERR_MSG(extack, "Can not replace a nexthop group with one of a different type.");
+ return -EINVAL;
+ }
+
+ if (newg->hash_threshold) {
+ err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new,
+ extack);
+ if (err)
+ return err;
+ } else if (newg->resilient) {
+ new_res_table = rtnl_dereference(newg->res_table);
+ old_res_table = rtnl_dereference(oldg->res_table);
+
+ /* Accept if num_nh_buckets was not given, but if it was
+ * given, demand that the value be correct.
+ */
+ if (cfg->nh_grp_res_has_num_buckets &&
+ cfg->nh_grp_res_num_buckets !=
+ old_res_table->num_nh_buckets) {
+ NL_SET_ERR_MSG(extack, "Can not change number of buckets of a resilient nexthop group.");
+ return -EINVAL;
+ }
+
+ /* Emit a pre-replace notification so that listeners could veto
+ * a potentially unsupported configuration. Otherwise,
+ * individual bucket replacement notifications would need to be
+ * vetoed, which is something that should only happen if the
+ * bucket is currently active.
+ */
+ err = call_nexthop_res_table_notifiers(net, new, extack);
+ if (err)
+ return err;
+
+ if (cfg->nh_grp_res_has_idle_timer)
+ old_res_table->idle_timer = cfg->nh_grp_res_idle_timer;
+ if (cfg->nh_grp_res_has_unbalanced_timer)
+ old_res_table->unbalanced_timer =
+ cfg->nh_grp_res_unbalanced_timer;
+
+ replace_nexthop_grp_res(oldg, newg);
+
+ tmp_table = new_res_table;
+ rcu_assign_pointer(newg->res_table, old_res_table);
+ rcu_assign_pointer(newg->spare->res_table, old_res_table);
+ }
+
+ /* update parents - used by nexthop code for cleanup */
+ for (i = 0; i < newg->num_nh; i++)
+ newg->nh_entries[i].nh_parent = old;
+
+ rcu_assign_pointer(old->nh_grp, newg);
+
+ /* Make sure concurrent readers are not using 'oldg' anymore. */
+ synchronize_net();
+
+ if (newg->resilient) {
+ rcu_assign_pointer(oldg->res_table, tmp_table);
+ rcu_assign_pointer(oldg->spare->res_table, tmp_table);
+ }
+
+ for (i = 0; i < oldg->num_nh; i++)
+ oldg->nh_entries[i].nh_parent = new;
+
+ rcu_assign_pointer(new->nh_grp, oldg);
+
+ return 0;
+}
+
+static void nh_group_v4_update(struct nh_group *nhg)
+{
+ struct nh_grp_entry *nhges;
+ bool has_v4 = false;
+ int i;
+
+ nhges = nhg->nh_entries;
+ for (i = 0; i < nhg->num_nh; i++) {
+ struct nh_info *nhi;
+
+ nhi = rtnl_dereference(nhges[i].nh->nh_info);
+ if (nhi->family == AF_INET)
+ has_v4 = true;
+ }
+ nhg->has_v4 = has_v4;
+}
+
+static int replace_nexthop_single_notify_res(struct net *net,
+ struct nh_res_table *res_table,
+ struct nexthop *old,
+ struct nh_info *oldi,
+ struct nh_info *newi,
+ struct netlink_ext_ack *extack)
+{
+ u32 nhg_id = res_table->nhg_id;
+ int err;
+ u16 i;
+
+ for (i = 0; i < res_table->num_nh_buckets; i++) {
+ struct nh_res_bucket *bucket = &res_table->nh_buckets[i];
+ struct nh_grp_entry *nhge;
+
+ nhge = rtnl_dereference(bucket->nh_entry);
+ if (nhge->nh == old) {
+ err = __call_nexthop_res_bucket_notifiers(net, nhg_id,
+ i, true,
+ oldi, newi,
+ extack);
+ if (err)
+ goto err_notify;
+ }
+ }
+
+ return 0;
+
+err_notify:
+ while (i-- > 0) {
+ struct nh_res_bucket *bucket = &res_table->nh_buckets[i];
+ struct nh_grp_entry *nhge;
+
+ nhge = rtnl_dereference(bucket->nh_entry);
+ if (nhge->nh == old)
+ __call_nexthop_res_bucket_notifiers(net, nhg_id, i,
+ true, newi, oldi,
+ extack);
+ }
+ return err;
+}
+
+static int replace_nexthop_single_notify(struct net *net,
+ struct nexthop *group_nh,
+ struct nexthop *old,
+ struct nh_info *oldi,
+ struct nh_info *newi,
+ struct netlink_ext_ack *extack)
+{
+ struct nh_group *nhg = rtnl_dereference(group_nh->nh_grp);
+ struct nh_res_table *res_table;
+
+ if (nhg->hash_threshold) {
+ return call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE,
+ group_nh, extack);
+ } else if (nhg->resilient) {
+ res_table = rtnl_dereference(nhg->res_table);
+ return replace_nexthop_single_notify_res(net, res_table,
+ old, oldi, newi,
+ extack);
+ }
+
+ return -EINVAL;
+}
+
+static int replace_nexthop_single(struct net *net, struct nexthop *old,
+ struct nexthop *new,
+ struct netlink_ext_ack *extack)
+{
+ u8 old_protocol, old_nh_flags;
+ struct nh_info *oldi, *newi;
+ struct nh_grp_entry *nhge;
+ int err;
+
+ if (new->is_group) {
+ NL_SET_ERR_MSG(extack, "Can not replace a nexthop with a nexthop group.");
+ return -EINVAL;
+ }
+
+ if (!list_empty(&old->grp_list) &&
+ rtnl_dereference(new->nh_info)->fdb_nh !=
+ rtnl_dereference(old->nh_info)->fdb_nh) {
+ NL_SET_ERR_MSG(extack, "Cannot change nexthop FDB status while in a group");
+ return -EINVAL;
+ }
+
+ err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new, extack);
+ if (err)
+ return err;
+
+ /* Hardware flags were set on 'old' as 'new' is not in the red-black
+ * tree. Therefore, inherit the flags from 'old' to 'new'.
+ */
+ new->nh_flags |= old->nh_flags & (RTNH_F_OFFLOAD | RTNH_F_TRAP);
+
+ oldi = rtnl_dereference(old->nh_info);
+ newi = rtnl_dereference(new->nh_info);
+
+ newi->nh_parent = old;
+ oldi->nh_parent = new;
+
+ old_protocol = old->protocol;
+ old_nh_flags = old->nh_flags;
+
+ old->protocol = new->protocol;
+ old->nh_flags = new->nh_flags;
+
+ rcu_assign_pointer(old->nh_info, newi);
+ rcu_assign_pointer(new->nh_info, oldi);
+
+ /* Send a replace notification for all the groups using the nexthop. */
+ list_for_each_entry(nhge, &old->grp_list, nh_list) {
+ struct nexthop *nhp = nhge->nh_parent;
+
+ err = replace_nexthop_single_notify(net, nhp, old, oldi, newi,
+ extack);
+ if (err)
+ goto err_notify;
+ }
+
+ /* When replacing an IPv4 nexthop with an IPv6 nexthop, potentially
+ * update IPv4 indication in all the groups using the nexthop.
+ */
+ if (oldi->family == AF_INET && newi->family == AF_INET6) {
+ list_for_each_entry(nhge, &old->grp_list, nh_list) {
+ struct nexthop *nhp = nhge->nh_parent;
+ struct nh_group *nhg;
+
+ nhg = rtnl_dereference(nhp->nh_grp);
+ nh_group_v4_update(nhg);
+ }
+ }
+
+ return 0;
+
+err_notify:
+ rcu_assign_pointer(new->nh_info, newi);
+ rcu_assign_pointer(old->nh_info, oldi);
+ old->nh_flags = old_nh_flags;
+ old->protocol = old_protocol;
+ oldi->nh_parent = old;
+ newi->nh_parent = new;
+ list_for_each_entry_continue_reverse(nhge, &old->grp_list, nh_list) {
+ struct nexthop *nhp = nhge->nh_parent;
+
+ replace_nexthop_single_notify(net, nhp, old, newi, oldi, NULL);
+ }
+ call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, old, extack);
+ return err;
+}
+
+static void __nexthop_replace_notify(struct net *net, struct nexthop *nh,
+ struct nl_info *info)
+{
+ struct fib6_info *f6i;
+
+ if (!list_empty(&nh->fi_list)) {
+ struct fib_info *fi;
+
+ /* expectation is a few fib_info per nexthop and then
+ * a lot of routes per fib_info. So mark the fib_info
+ * and then walk the fib tables once
+ */
+ list_for_each_entry(fi, &nh->fi_list, nh_list)
+ fi->nh_updated = true;
+
+ fib_info_notify_update(net, info);
+
+ list_for_each_entry(fi, &nh->fi_list, nh_list)
+ fi->nh_updated = false;
+ }
+
+ list_for_each_entry(f6i, &nh->f6i_list, nh_list)
+ ipv6_stub->fib6_rt_update(net, f6i, info);
+}
+
+/* send RTM_NEWROUTE with REPLACE flag set for all FIB entries
+ * linked to this nexthop and for all groups that the nexthop
+ * is a member of
+ */
+static void nexthop_replace_notify(struct net *net, struct nexthop *nh,
+ struct nl_info *info)
+{
+ struct nh_grp_entry *nhge;
+
+ __nexthop_replace_notify(net, nh, info);
+
+ list_for_each_entry(nhge, &nh->grp_list, nh_list)
+ __nexthop_replace_notify(net, nhge->nh_parent, info);
+}
+
+static int replace_nexthop(struct net *net, struct nexthop *old,
+ struct nexthop *new, const struct nh_config *cfg,
+ struct netlink_ext_ack *extack)
+{
+ bool new_is_reject = false;
+ struct nh_grp_entry *nhge;
+ int err;
+
+ /* check that existing FIB entries are ok with the
+ * new nexthop definition
+ */
+ err = fib_check_nh_list(old, new, extack);
+ if (err)
+ return err;
+
+ err = fib6_check_nh_list(old, new, extack);
+ if (err)
+ return err;
+
+ if (!new->is_group) {
+ struct nh_info *nhi = rtnl_dereference(new->nh_info);
+
+ new_is_reject = nhi->reject_nh;
+ }
+
+ list_for_each_entry(nhge, &old->grp_list, nh_list) {
+ /* if new nexthop is a blackhole, any groups using this
+ * nexthop cannot have more than 1 path
+ */
+ if (new_is_reject &&
+ nexthop_num_path(nhge->nh_parent) > 1) {
+ NL_SET_ERR_MSG(extack, "Blackhole nexthop can not be a member of a group with more than one path");
+ return -EINVAL;
+ }
+
+ err = fib_check_nh_list(nhge->nh_parent, new, extack);
+ if (err)
+ return err;
+
+ err = fib6_check_nh_list(nhge->nh_parent, new, extack);
+ if (err)
+ return err;
+ }
+
+ if (old->is_group)
+ err = replace_nexthop_grp(net, old, new, cfg, extack);
+ else
+ err = replace_nexthop_single(net, old, new, extack);
+
+ if (!err) {
+ nh_rt_cache_flush(net, old, new);
+
+ __remove_nexthop(net, new, NULL);
+ nexthop_put(new);
+ }
+
+ return err;
+}
+
+/* called with rtnl_lock held */
+static int insert_nexthop(struct net *net, struct nexthop *new_nh,
+ struct nh_config *cfg, struct netlink_ext_ack *extack)
+{
+ struct rb_node **pp, *parent = NULL, *next;
+ struct rb_root *root = &net->nexthop.rb_root;
+ bool replace = !!(cfg->nlflags & NLM_F_REPLACE);
+ bool create = !!(cfg->nlflags & NLM_F_CREATE);
+ u32 new_id = new_nh->id;
+ int replace_notify = 0;
+ int rc = -EEXIST;
+
+ pp = &root->rb_node;
+ while (1) {
+ struct nexthop *nh;
+
+ next = *pp;
+ if (!next)
+ break;
+
+ parent = next;
+
+ nh = rb_entry(parent, struct nexthop, rb_node);
+ if (new_id < nh->id) {
+ pp = &next->rb_left;
+ } else if (new_id > nh->id) {
+ pp = &next->rb_right;
+ } else if (replace) {
+ rc = replace_nexthop(net, nh, new_nh, cfg, extack);
+ if (!rc) {
+ new_nh = nh; /* send notification with old nh */
+ replace_notify = 1;
+ }
+ goto out;
+ } else {
+ /* id already exists and not a replace */
+ goto out;
+ }
+ }
+
+ if (replace && !create) {
+ NL_SET_ERR_MSG(extack, "Replace specified without create and no entry exists");
+ rc = -ENOENT;
+ goto out;
+ }
+
+ if (new_nh->is_group) {
+ struct nh_group *nhg = rtnl_dereference(new_nh->nh_grp);
+ struct nh_res_table *res_table;
+
+ if (nhg->resilient) {
+ res_table = rtnl_dereference(nhg->res_table);
+
+ /* Not passing the number of buckets is OK when
+ * replacing, but not when creating a new group.
+ */
+ if (!cfg->nh_grp_res_has_num_buckets) {
+ NL_SET_ERR_MSG(extack, "Number of buckets not specified for nexthop group insertion");
+ rc = -EINVAL;
+ goto out;
+ }
+
+ nh_res_group_rebalance(nhg, res_table);
+
+ /* Do not send bucket notifications, we do full
+ * notification below.
+ */
+ nh_res_table_upkeep(res_table, false, false);
+ }
+ }
+
+ rb_link_node_rcu(&new_nh->rb_node, parent, pp);
+ rb_insert_color(&new_nh->rb_node, root);
+
+ /* The initial insertion is a full notification for hash-threshold as
+ * well as resilient groups.
+ */
+ rc = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new_nh, extack);
+ if (rc)
+ rb_erase(&new_nh->rb_node, &net->nexthop.rb_root);
+
+out:
+ if (!rc) {
+ nh_base_seq_inc(net);
+ nexthop_notify(RTM_NEWNEXTHOP, new_nh, &cfg->nlinfo);
+ if (replace_notify &&
+ READ_ONCE(net->ipv4.sysctl_nexthop_compat_mode))
+ nexthop_replace_notify(net, new_nh, &cfg->nlinfo);
+ }
+
+ return rc;
+}
+
+/* rtnl */
+/* remove all nexthops tied to a device being deleted */
+static void nexthop_flush_dev(struct net_device *dev, unsigned long event)
+{
+ unsigned int hash = nh_dev_hashfn(dev->ifindex);
+ struct net *net = dev_net(dev);
+ struct hlist_head *head = &net->nexthop.devhash[hash];
+ struct hlist_node *n;
+ struct nh_info *nhi;
+
+ hlist_for_each_entry_safe(nhi, n, head, dev_hash) {
+ if (nhi->fib_nhc.nhc_dev != dev)
+ continue;
+
+ if (nhi->reject_nh &&
+ (event == NETDEV_DOWN || event == NETDEV_CHANGE))
+ continue;
+
+ remove_nexthop(net, nhi->nh_parent, NULL);
+ }
+}
+
+/* rtnl; called when net namespace is deleted */
+static void flush_all_nexthops(struct net *net)
+{
+ struct rb_root *root = &net->nexthop.rb_root;
+ struct rb_node *node;
+ struct nexthop *nh;
+
+ while ((node = rb_first(root))) {
+ nh = rb_entry(node, struct nexthop, rb_node);
+ remove_nexthop(net, nh, NULL);
+ cond_resched();
+ }
+}
+
+static struct nexthop *nexthop_create_group(struct net *net,
+ struct nh_config *cfg)
+{
+ struct nlattr *grps_attr = cfg->nh_grp;
+ struct nexthop_grp *entry = nla_data(grps_attr);
+ u16 num_nh = nla_len(grps_attr) / sizeof(*entry);
+ struct nh_group *nhg;
+ struct nexthop *nh;
+ int err;
+ int i;
+
+ nh = nexthop_alloc();
+ if (!nh)
+ return ERR_PTR(-ENOMEM);
+
+ nh->is_group = 1;
+
+ nhg = nexthop_grp_alloc(num_nh);
+ if (!nhg) {
+ kfree(nh);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ /* spare group used for removals */
+ nhg->spare = nexthop_grp_alloc(num_nh);
+ if (!nhg->spare) {
+ kfree(nhg);
+ kfree(nh);
+ return ERR_PTR(-ENOMEM);
+ }
+ nhg->spare->spare = nhg;
+
+ for (i = 0; i < nhg->num_nh; ++i) {
+ struct nexthop *nhe;
+ struct nh_info *nhi;
+
+ nhe = nexthop_find_by_id(net, entry[i].id);
+ if (!nexthop_get(nhe)) {
+ err = -ENOENT;
+ goto out_no_nh;
+ }
+
+ nhi = rtnl_dereference(nhe->nh_info);
+ if (nhi->family == AF_INET)
+ nhg->has_v4 = true;
+
+ nhg->nh_entries[i].stats =
+ netdev_alloc_pcpu_stats(struct nh_grp_entry_stats);
+ if (!nhg->nh_entries[i].stats) {
+ err = -ENOMEM;
+ nexthop_put(nhe);
+ goto out_no_nh;
+ }
+ nhg->nh_entries[i].nh = nhe;
+ nhg->nh_entries[i].weight = nexthop_grp_weight(&entry[i]);
+
+ list_add(&nhg->nh_entries[i].nh_list, &nhe->grp_list);
+ nhg->nh_entries[i].nh_parent = nh;
+ }
+
+ if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_MPATH) {
+ nhg->hash_threshold = 1;
+ nhg->is_multipath = true;
+ } else if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_RES) {
+ struct nh_res_table *res_table;
+
+ res_table = nexthop_res_table_alloc(net, cfg->nh_id, cfg);
+ if (!res_table) {
+ err = -ENOMEM;
+ goto out_no_nh;
+ }
+
+ rcu_assign_pointer(nhg->spare->res_table, res_table);
+ rcu_assign_pointer(nhg->res_table, res_table);
+ nhg->resilient = true;
+ nhg->is_multipath = true;
+ }
+
+ WARN_ON_ONCE(nhg->hash_threshold + nhg->resilient != 1);
+
+ if (nhg->hash_threshold)
+ nh_hthr_group_rebalance(nhg);
+
+ if (cfg->nh_fdb)
+ nhg->fdb_nh = 1;
+
+ if (cfg->nh_hw_stats)
+ nhg->hw_stats = true;
+
+ rcu_assign_pointer(nh->nh_grp, nhg);
+
+ return nh;
+
+out_no_nh:
+ for (i--; i >= 0; --i) {
+ list_del(&nhg->nh_entries[i].nh_list);
+ free_percpu(nhg->nh_entries[i].stats);
+ nexthop_put(nhg->nh_entries[i].nh);
+ }
+
+ kfree(nhg->spare);
+ kfree(nhg);
+ kfree(nh);
+
+ return ERR_PTR(err);
+}
+
+static int nh_create_ipv4(struct net *net, struct nexthop *nh,
+ struct nh_info *nhi, struct nh_config *cfg,
+ struct netlink_ext_ack *extack)
+{
+ struct fib_nh *fib_nh = &nhi->fib_nh;
+ struct fib_config fib_cfg = {
+ .fc_oif = cfg->nh_ifindex,
+ .fc_gw4 = cfg->gw.ipv4,
+ .fc_gw_family = cfg->gw.ipv4 ? AF_INET : 0,
+ .fc_flags = cfg->nh_flags,
+ .fc_nlinfo = cfg->nlinfo,
+ .fc_encap = cfg->nh_encap,
+ .fc_encap_type = cfg->nh_encap_type,
+ };
+ u32 tb_id = (cfg->dev ? l3mdev_fib_table(cfg->dev) : RT_TABLE_MAIN);
+ int err;
+
+ err = fib_nh_init(net, fib_nh, &fib_cfg, 1, extack);
+ if (err) {
+ fib_nh_release(net, fib_nh);
+ goto out;
+ }
+
+ if (nhi->fdb_nh)
+ goto out;
+
+ /* sets nh_dev if successful */
+ err = fib_check_nh(net, fib_nh, tb_id, 0, extack);
+ if (!err) {
+ nh->nh_flags = fib_nh->fib_nh_flags;
+ fib_info_update_nhc_saddr(net, &fib_nh->nh_common,
+ !fib_nh->fib_nh_scope ? 0 : fib_nh->fib_nh_scope - 1);
+ } else {
+ fib_nh_release(net, fib_nh);
+ }
+out:
+ return err;
+}
+
+static int nh_create_ipv6(struct net *net, struct nexthop *nh,
+ struct nh_info *nhi, struct nh_config *cfg,
+ struct netlink_ext_ack *extack)
+{
+ struct fib6_nh *fib6_nh = &nhi->fib6_nh;
+ struct fib6_config fib6_cfg = {
+ .fc_table = l3mdev_fib_table(cfg->dev),
+ .fc_ifindex = cfg->nh_ifindex,
+ .fc_gateway = cfg->gw.ipv6,
+ .fc_flags = cfg->nh_flags,
+ .fc_nlinfo = cfg->nlinfo,
+ .fc_encap = cfg->nh_encap,
+ .fc_encap_type = cfg->nh_encap_type,
+ .fc_is_fdb = cfg->nh_fdb,
+ };
+ int err;
+
+ if (!ipv6_addr_any(&cfg->gw.ipv6))
+ fib6_cfg.fc_flags |= RTF_GATEWAY;
+
+ /* sets nh_dev if successful */
+ err = ipv6_stub->fib6_nh_init(net, fib6_nh, &fib6_cfg, GFP_KERNEL,
+ extack);
+ if (err) {
+ /* IPv6 is not enabled, don't call fib6_nh_release */
+ if (err == -EAFNOSUPPORT)
+ goto out;
+ ipv6_stub->fib6_nh_release(fib6_nh);
+ } else {
+ nh->nh_flags = fib6_nh->fib_nh_flags;
+ }
+out:
+ return err;
+}
+
+static struct nexthop *nexthop_create(struct net *net, struct nh_config *cfg,
+ struct netlink_ext_ack *extack)
+{
+ struct nh_info *nhi;
+ struct nexthop *nh;
+ int err = 0;
+
+ nh = nexthop_alloc();
+ if (!nh)
+ return ERR_PTR(-ENOMEM);
+
+ nhi = kzalloc(sizeof(*nhi), GFP_KERNEL);
+ if (!nhi) {
+ kfree(nh);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ nh->nh_flags = cfg->nh_flags;
+ nh->net = net;
+
+ nhi->nh_parent = nh;
+ nhi->family = cfg->nh_family;
+ nhi->fib_nhc.nhc_scope = RT_SCOPE_LINK;
+
+ if (cfg->nh_fdb)
+ nhi->fdb_nh = 1;
+
+ if (cfg->nh_blackhole) {
+ nhi->reject_nh = 1;
+ cfg->nh_ifindex = net->loopback_dev->ifindex;
+ }
+
+ switch (cfg->nh_family) {
+ case AF_INET:
+ err = nh_create_ipv4(net, nh, nhi, cfg, extack);
+ break;
+ case AF_INET6:
+ err = nh_create_ipv6(net, nh, nhi, cfg, extack);
+ break;
+ }
+
+ if (err) {
+ kfree(nhi);
+ kfree(nh);
+ return ERR_PTR(err);
+ }
+
+ /* add the entry to the device based hash */
+ if (!nhi->fdb_nh)
+ nexthop_devhash_add(net, nhi);
+
+ rcu_assign_pointer(nh->nh_info, nhi);
+
+ return nh;
+}
+
+/* called with rtnl lock held */
+static struct nexthop *nexthop_add(struct net *net, struct nh_config *cfg,
+ struct netlink_ext_ack *extack)
+{
+ struct nexthop *nh;
+ int err;
+
+ if (!cfg->nh_id) {
+ cfg->nh_id = nh_find_unused_id(net);
+ if (!cfg->nh_id) {
+ NL_SET_ERR_MSG(extack, "No unused id");
+ return ERR_PTR(-EINVAL);
+ }
+ }
+
+ if (cfg->nh_grp)
+ nh = nexthop_create_group(net, cfg);
+ else
+ nh = nexthop_create(net, cfg, extack);
+
+ if (IS_ERR(nh))
+ return nh;
+
+ refcount_set(&nh->refcnt, 1);
+ nh->id = cfg->nh_id;
+ nh->protocol = cfg->nh_protocol;
+ nh->net = net;
+
+ err = insert_nexthop(net, nh, cfg, extack);
+ if (err) {
+ __remove_nexthop(net, nh, NULL);
+ nexthop_put(nh);
+ nh = ERR_PTR(err);
+ }
+
+ return nh;
+}
+
+static int rtm_nh_get_timer(struct nlattr *attr, unsigned long fallback,
+ unsigned long *timer_p, bool *has_p,
+ struct netlink_ext_ack *extack)
+{
+ unsigned long timer;
+ u32 value;
+
+ if (!attr) {
+ *timer_p = fallback;
+ *has_p = false;
+ return 0;
+ }
+
+ value = nla_get_u32(attr);
+ timer = clock_t_to_jiffies(value);
+ if (timer == ~0UL) {
+ NL_SET_ERR_MSG(extack, "Timer value too large");
+ return -EINVAL;
+ }
+
+ *timer_p = timer;
+ *has_p = true;
+ return 0;
+}
+
+static int rtm_to_nh_config_grp_res(struct nlattr *res, struct nh_config *cfg,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[ARRAY_SIZE(rtm_nh_res_policy_new)] = {};
+ int err;
+
+ if (res) {
+ err = nla_parse_nested(tb,
+ ARRAY_SIZE(rtm_nh_res_policy_new) - 1,
+ res, rtm_nh_res_policy_new, extack);
+ if (err < 0)
+ return err;
+ }
+
+ if (tb[NHA_RES_GROUP_BUCKETS]) {
+ cfg->nh_grp_res_num_buckets =
+ nla_get_u16(tb[NHA_RES_GROUP_BUCKETS]);
+ cfg->nh_grp_res_has_num_buckets = true;
+ if (!cfg->nh_grp_res_num_buckets) {
+ NL_SET_ERR_MSG(extack, "Number of buckets needs to be non-0");
+ return -EINVAL;
+ }
+ }
+
+ err = rtm_nh_get_timer(tb[NHA_RES_GROUP_IDLE_TIMER],
+ NH_RES_DEFAULT_IDLE_TIMER,
+ &cfg->nh_grp_res_idle_timer,
+ &cfg->nh_grp_res_has_idle_timer,
+ extack);
+ if (err)
+ return err;
+
+ return rtm_nh_get_timer(tb[NHA_RES_GROUP_UNBALANCED_TIMER],
+ NH_RES_DEFAULT_UNBALANCED_TIMER,
+ &cfg->nh_grp_res_unbalanced_timer,
+ &cfg->nh_grp_res_has_unbalanced_timer,
+ extack);
+}
+
+static int rtm_to_nh_config(struct net *net, struct sk_buff *skb,
+ struct nlmsghdr *nlh, struct nlattr **tb,
+ struct nh_config *cfg,
+ struct netlink_ext_ack *extack)
+{
+ struct nhmsg *nhm = nlmsg_data(nlh);
+ int err;
+
+ err = -EINVAL;
+ if (nhm->resvd || nhm->nh_scope) {
+ NL_SET_ERR_MSG(extack, "Invalid values in ancillary header");
+ goto out;
+ }
+ if (nhm->nh_flags & ~NEXTHOP_VALID_USER_FLAGS) {
+ NL_SET_ERR_MSG(extack, "Invalid nexthop flags in ancillary header");
+ goto out;
+ }
+
+ switch (nhm->nh_family) {
+ case AF_INET:
+ case AF_INET6:
+ break;
+ case AF_UNSPEC:
+ if (tb[NHA_GROUP])
+ break;
+ fallthrough;
+ default:
+ NL_SET_ERR_MSG(extack, "Invalid address family");
+ goto out;
+ }
+
+ memset(cfg, 0, sizeof(*cfg));
+ cfg->nlflags = nlh->nlmsg_flags;
+ cfg->nlinfo.portid = NETLINK_CB(skb).portid;
+ cfg->nlinfo.nlh = nlh;
+ cfg->nlinfo.nl_net = net;
+
+ cfg->nh_family = nhm->nh_family;
+ cfg->nh_protocol = nhm->nh_protocol;
+ cfg->nh_flags = nhm->nh_flags;
+
+ if (tb[NHA_ID])
+ cfg->nh_id = nla_get_u32(tb[NHA_ID]);
+
+ if (tb[NHA_FDB]) {
+ if (tb[NHA_OIF] || tb[NHA_BLACKHOLE] ||
+ tb[NHA_ENCAP] || tb[NHA_ENCAP_TYPE]) {
+ NL_SET_ERR_MSG(extack, "Fdb attribute can not be used with encap, oif or blackhole");
+ goto out;
+ }
+ if (nhm->nh_flags) {
+ NL_SET_ERR_MSG(extack, "Unsupported nexthop flags in ancillary header");
+ goto out;
+ }
+ cfg->nh_fdb = nla_get_flag(tb[NHA_FDB]);
+ }
+
+ if (tb[NHA_GROUP]) {
+ if (nhm->nh_family != AF_UNSPEC) {
+ NL_SET_ERR_MSG(extack, "Invalid family for group");
+ goto out;
+ }
+ cfg->nh_grp = tb[NHA_GROUP];
+
+ cfg->nh_grp_type = NEXTHOP_GRP_TYPE_MPATH;
+ if (tb[NHA_GROUP_TYPE])
+ cfg->nh_grp_type = nla_get_u16(tb[NHA_GROUP_TYPE]);
+
+ if (cfg->nh_grp_type > NEXTHOP_GRP_TYPE_MAX) {
+ NL_SET_ERR_MSG(extack, "Invalid group type");
+ goto out;
+ }
+
+ err = nh_check_attr_group(net, tb, ARRAY_SIZE(rtm_nh_policy_new),
+ cfg->nh_grp_type, extack);
+ if (err)
+ goto out;
+
+ if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_RES)
+ err = rtm_to_nh_config_grp_res(tb[NHA_RES_GROUP],
+ cfg, extack);
+
+ if (tb[NHA_HW_STATS_ENABLE])
+ cfg->nh_hw_stats = nla_get_u32(tb[NHA_HW_STATS_ENABLE]);
+
+ /* no other attributes should be set */
+ goto out;
+ }
+
+ if (tb[NHA_BLACKHOLE]) {
+ if (tb[NHA_GATEWAY] || tb[NHA_OIF] ||
+ tb[NHA_ENCAP] || tb[NHA_ENCAP_TYPE] || tb[NHA_FDB]) {
+ NL_SET_ERR_MSG(extack, "Blackhole attribute can not be used with gateway, oif, encap or fdb");
+ goto out;
+ }
+
+ cfg->nh_blackhole = 1;
+ err = 0;
+ goto out;
+ }
+
+ if (!cfg->nh_fdb && !tb[NHA_OIF]) {
+ NL_SET_ERR_MSG(extack, "Device attribute required for non-blackhole and non-fdb nexthops");
+ goto out;
+ }
+
+ err = -EINVAL;
+ if (tb[NHA_GATEWAY]) {
+ struct nlattr *gwa = tb[NHA_GATEWAY];
+
+ switch (cfg->nh_family) {
+ case AF_INET:
+ if (nla_len(gwa) != sizeof(u32)) {
+ NL_SET_ERR_MSG(extack, "Invalid gateway");
+ goto out;
+ }
+ cfg->gw.ipv4 = nla_get_be32(gwa);
+ break;
+ case AF_INET6:
+ if (nla_len(gwa) != sizeof(struct in6_addr)) {
+ NL_SET_ERR_MSG(extack, "Invalid gateway");
+ goto out;
+ }
+ cfg->gw.ipv6 = nla_get_in6_addr(gwa);
+ break;
+ default:
+ NL_SET_ERR_MSG(extack,
+ "Unknown address family for gateway");
+ goto out;
+ }
+ } else {
+ /* device only nexthop (no gateway) */
+ if (cfg->nh_flags & RTNH_F_ONLINK) {
+ NL_SET_ERR_MSG(extack,
+ "ONLINK flag can not be set for nexthop without a gateway");
+ goto out;
+ }
+ }
+
+ if (tb[NHA_ENCAP]) {
+ cfg->nh_encap = tb[NHA_ENCAP];
+
+ if (!tb[NHA_ENCAP_TYPE]) {
+ NL_SET_ERR_MSG(extack, "LWT encapsulation type is missing");
+ goto out;
+ }
+
+ cfg->nh_encap_type = nla_get_u16(tb[NHA_ENCAP_TYPE]);
+ err = lwtunnel_valid_encap_type(cfg->nh_encap_type, extack);
+ if (err < 0)
+ goto out;
+
+ } else if (tb[NHA_ENCAP_TYPE]) {
+ NL_SET_ERR_MSG(extack, "LWT encapsulation attribute is missing");
+ goto out;
+ }
+
+ if (tb[NHA_HW_STATS_ENABLE]) {
+ NL_SET_ERR_MSG(extack, "Cannot enable nexthop hardware statistics for non-group nexthops");
+ goto out;
+ }
+
+ err = 0;
+out:
+ return err;
+}
+
+static int rtm_to_nh_config_rtnl(struct net *net, struct nlattr **tb,
+ struct nh_config *cfg,
+ struct netlink_ext_ack *extack)
+{
+ if (tb[NHA_GROUP])
+ return nh_check_attr_group_rtnl(net, tb, extack);
+
+ if (tb[NHA_OIF]) {
+ cfg->nh_ifindex = nla_get_u32(tb[NHA_OIF]);
+ if (cfg->nh_ifindex)
+ cfg->dev = __dev_get_by_index(net, cfg->nh_ifindex);
+
+ if (!cfg->dev) {
+ NL_SET_ERR_MSG(extack, "Invalid device index");
+ return -EINVAL;
+ }
+
+ if (!(cfg->dev->flags & IFF_UP)) {
+ NL_SET_ERR_MSG(extack, "Nexthop device is not up");
+ return -ENETDOWN;
+ }
+
+ if (!netif_carrier_ok(cfg->dev)) {
+ NL_SET_ERR_MSG(extack, "Carrier for nexthop device is down");
+ return -ENETDOWN;
+ }
+ }
+
+ return 0;
+}
+
+/* rtnl */
+static int rtm_new_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_new)];
+ struct net *net = sock_net(skb->sk);
+ struct nh_config cfg;
+ struct nexthop *nh;
+ int err;
+
+ err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb,
+ ARRAY_SIZE(rtm_nh_policy_new) - 1,
+ rtm_nh_policy_new, extack);
+ if (err < 0)
+ goto out;
+
+ err = rtm_to_nh_config(net, skb, nlh, tb, &cfg, extack);
+ if (err)
+ goto out;
+
+ if (cfg.nlflags & NLM_F_REPLACE && !cfg.nh_id) {
+ NL_SET_ERR_MSG(extack, "Replace requires nexthop id");
+ err = -EINVAL;
+ goto out;
+ }
+
+ rtnl_net_lock(net);
+
+ err = rtm_to_nh_config_rtnl(net, tb, &cfg, extack);
+ if (err)
+ goto unlock;
+
+ nh = nexthop_add(net, &cfg, extack);
+ if (IS_ERR(nh))
+ err = PTR_ERR(nh);
+
+unlock:
+ rtnl_net_unlock(net);
+out:
+ return err;
+}
+
+static int nh_valid_get_del_req(const struct nlmsghdr *nlh,
+ struct nlattr **tb, u32 *id, u32 *op_flags,
+ struct netlink_ext_ack *extack)
+{
+ struct nhmsg *nhm = nlmsg_data(nlh);
+
+ if (nhm->nh_protocol || nhm->resvd || nhm->nh_scope || nhm->nh_flags) {
+ NL_SET_ERR_MSG(extack, "Invalid values in header");
+ return -EINVAL;
+ }
+
+ if (!tb[NHA_ID]) {
+ NL_SET_ERR_MSG(extack, "Nexthop id is missing");
+ return -EINVAL;
+ }
+
+ *id = nla_get_u32(tb[NHA_ID]);
+ if (!(*id)) {
+ NL_SET_ERR_MSG(extack, "Invalid nexthop id");
+ return -EINVAL;
+ }
+
+ if (op_flags)
+ *op_flags = nla_get_u32_default(tb[NHA_OP_FLAGS], 0);
+
+ return 0;
+}
+
+/* rtnl */
+static int rtm_del_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_del)];
+ struct net *net = sock_net(skb->sk);
+ struct nl_info nlinfo = {
+ .nlh = nlh,
+ .nl_net = net,
+ .portid = NETLINK_CB(skb).portid,
+ };
+ struct nexthop *nh;
+ int err;
+ u32 id;
+
+ err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb,
+ ARRAY_SIZE(rtm_nh_policy_del) - 1, rtm_nh_policy_del,
+ extack);
+ if (err < 0)
+ return err;
+
+ err = nh_valid_get_del_req(nlh, tb, &id, NULL, extack);
+ if (err)
+ return err;
+
+ rtnl_net_lock(net);
+
+ nh = nexthop_find_by_id(net, id);
+ if (nh)
+ remove_nexthop(net, nh, &nlinfo);
+ else
+ err = -ENOENT;
+
+ rtnl_net_unlock(net);
+
+ return err;
+}
+
+/* rtnl */
+static int rtm_get_nexthop(struct sk_buff *in_skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_get)];
+ struct net *net = sock_net(in_skb->sk);
+ struct sk_buff *skb = NULL;
+ struct nexthop *nh;
+ u32 op_flags;
+ int err;
+ u32 id;
+
+ err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb,
+ ARRAY_SIZE(rtm_nh_policy_get) - 1, rtm_nh_policy_get,
+ extack);
+ if (err < 0)
+ return err;
+
+ err = nh_valid_get_del_req(nlh, tb, &id, &op_flags, extack);
+ if (err)
+ return err;
+
+ err = -ENOBUFS;
+ skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb)
+ goto out;
+
+ err = -ENOENT;
+ nh = nexthop_find_by_id(net, id);
+ if (!nh)
+ goto errout_free;
+
+ err = nh_fill_node(skb, nh, RTM_NEWNEXTHOP, NETLINK_CB(in_skb).portid,
+ nlh->nlmsg_seq, 0, op_flags);
+ if (err < 0) {
+ WARN_ON(err == -EMSGSIZE);
+ goto errout_free;
+ }
+
+ err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
+out:
+ return err;
+errout_free:
+ kfree_skb(skb);
+ goto out;
+}
+
+struct nh_dump_filter {
+ u32 nh_id;
+ int dev_idx;
+ int master_idx;
+ bool group_filter;
+ bool fdb_filter;
+ u32 res_bucket_nh_id;
+ u32 op_flags;
+};
+
+static bool nh_dump_filtered(struct nexthop *nh,
+ struct nh_dump_filter *filter, u8 family)
+{
+ const struct net_device *dev;
+ const struct nh_info *nhi;
+
+ if (filter->group_filter && !nh->is_group)
+ return true;
+
+ if (!filter->dev_idx && !filter->master_idx && !family)
+ return false;
+
+ if (nh->is_group)
+ return true;
+
+ nhi = rtnl_dereference(nh->nh_info);
+ if (family && nhi->family != family)
+ return true;
+
+ dev = nhi->fib_nhc.nhc_dev;
+ if (filter->dev_idx && (!dev || dev->ifindex != filter->dev_idx))
+ return true;
+
+ if (filter->master_idx) {
+ struct net_device *master;
+
+ if (!dev)
+ return true;
+
+ master = netdev_master_upper_dev_get((struct net_device *)dev);
+ if (!master || master->ifindex != filter->master_idx)
+ return true;
+ }
+
+ return false;
+}
+
+static int __nh_valid_dump_req(const struct nlmsghdr *nlh, struct nlattr **tb,
+ struct nh_dump_filter *filter,
+ struct netlink_ext_ack *extack)
+{
+ struct nhmsg *nhm;
+ u32 idx;
+
+ if (tb[NHA_OIF]) {
+ idx = nla_get_u32(tb[NHA_OIF]);
+ if (idx > INT_MAX) {
+ NL_SET_ERR_MSG(extack, "Invalid device index");
+ return -EINVAL;
+ }
+ filter->dev_idx = idx;
+ }
+ if (tb[NHA_MASTER]) {
+ idx = nla_get_u32(tb[NHA_MASTER]);
+ if (idx > INT_MAX) {
+ NL_SET_ERR_MSG(extack, "Invalid master device index");
+ return -EINVAL;
+ }
+ filter->master_idx = idx;
+ }
+ filter->group_filter = nla_get_flag(tb[NHA_GROUPS]);
+ filter->fdb_filter = nla_get_flag(tb[NHA_FDB]);
+
+ nhm = nlmsg_data(nlh);
+ if (nhm->nh_protocol || nhm->resvd || nhm->nh_scope || nhm->nh_flags) {
+ NL_SET_ERR_MSG(extack, "Invalid values in header for nexthop dump request");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int nh_valid_dump_req(const struct nlmsghdr *nlh,
+ struct nh_dump_filter *filter,
+ struct netlink_callback *cb)
+{
+ struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_dump)];
+ int err;
+
+ err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb,
+ ARRAY_SIZE(rtm_nh_policy_dump) - 1,
+ rtm_nh_policy_dump, cb->extack);
+ if (err < 0)
+ return err;
+
+ filter->op_flags = nla_get_u32_default(tb[NHA_OP_FLAGS], 0);
+
+ return __nh_valid_dump_req(nlh, tb, filter, cb->extack);
+}
+
+struct rtm_dump_nh_ctx {
+ u32 idx;
+};
+
+static struct rtm_dump_nh_ctx *
+rtm_dump_nh_ctx(struct netlink_callback *cb)
+{
+ struct rtm_dump_nh_ctx *ctx = (void *)cb->ctx;
+
+ BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx));
+ return ctx;
+}
+
+static int rtm_dump_walk_nexthops(struct sk_buff *skb,
+ struct netlink_callback *cb,
+ struct rb_root *root,
+ struct rtm_dump_nh_ctx *ctx,
+ int (*nh_cb)(struct sk_buff *skb,
+ struct netlink_callback *cb,
+ struct nexthop *nh, void *data),
+ void *data)
+{
+ struct rb_node *node;
+ int s_idx;
+ int err;
+
+ s_idx = ctx->idx;
+
+ /* If this is not the first invocation, ctx->idx will contain the id of
+ * the last nexthop we processed. Instead of starting from the very
+ * first element of the red/black tree again and linearly skipping the
+ * (potentially large) set of nodes with an id smaller than s_idx, walk
+ * the tree and find the left-most node whose id is >= s_idx. This
+ * provides an efficient O(log n) starting point for the dump
+ * continuation.
+ */
+ if (s_idx != 0) {
+ struct rb_node *tmp = root->rb_node;
+
+ node = NULL;
+ while (tmp) {
+ struct nexthop *nh;
+
+ nh = rb_entry(tmp, struct nexthop, rb_node);
+ if (nh->id < s_idx) {
+ tmp = tmp->rb_right;
+ } else {
+ /* Track current candidate and keep looking on
+ * the left side to find the left-most
+ * (smallest id) that is still >= s_idx.
+ */
+ node = tmp;
+ tmp = tmp->rb_left;
+ }
+ }
+ } else {
+ node = rb_first(root);
+ }
+
+ for (; node; node = rb_next(node)) {
+ struct nexthop *nh;
+
+ nh = rb_entry(node, struct nexthop, rb_node);
+
+ ctx->idx = nh->id;
+ err = nh_cb(skb, cb, nh, data);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+static int rtm_dump_nexthop_cb(struct sk_buff *skb, struct netlink_callback *cb,
+ struct nexthop *nh, void *data)
+{
+ struct nhmsg *nhm = nlmsg_data(cb->nlh);
+ struct nh_dump_filter *filter = data;
+
+ if (nh_dump_filtered(nh, filter, nhm->nh_family))
+ return 0;
+
+ return nh_fill_node(skb, nh, RTM_NEWNEXTHOP,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI, filter->op_flags);
+}
+
+/* rtnl */
+static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct rtm_dump_nh_ctx *ctx = rtm_dump_nh_ctx(cb);
+ struct net *net = sock_net(skb->sk);
+ struct rb_root *root = &net->nexthop.rb_root;
+ struct nh_dump_filter filter = {};
+ int err;
+
+ err = nh_valid_dump_req(cb->nlh, &filter, cb);
+ if (err < 0)
+ return err;
+
+ err = rtm_dump_walk_nexthops(skb, cb, root, ctx,
+ &rtm_dump_nexthop_cb, &filter);
+
+ cb->seq = net->nexthop.seq;
+ nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+ return err;
+}
+
+static struct nexthop *
+nexthop_find_group_resilient(struct net *net, u32 id,
+ struct netlink_ext_ack *extack)
+{
+ struct nh_group *nhg;
+ struct nexthop *nh;
+
+ nh = nexthop_find_by_id(net, id);
+ if (!nh)
+ return ERR_PTR(-ENOENT);
+
+ if (!nh->is_group) {
+ NL_SET_ERR_MSG(extack, "Not a nexthop group");
+ return ERR_PTR(-EINVAL);
+ }
+
+ nhg = rtnl_dereference(nh->nh_grp);
+ if (!nhg->resilient) {
+ NL_SET_ERR_MSG(extack, "Nexthop group not of type resilient");
+ return ERR_PTR(-EINVAL);
+ }
+
+ return nh;
+}
+
+static int nh_valid_dump_nhid(struct nlattr *attr, u32 *nh_id_p,
+ struct netlink_ext_ack *extack)
+{
+ u32 idx;
+
+ if (attr) {
+ idx = nla_get_u32(attr);
+ if (!idx) {
+ NL_SET_ERR_MSG(extack, "Invalid nexthop id");
+ return -EINVAL;
+ }
+ *nh_id_p = idx;
+ } else {
+ *nh_id_p = 0;
+ }
+
+ return 0;
+}
+
+static int nh_valid_dump_bucket_req(const struct nlmsghdr *nlh,
+ struct nh_dump_filter *filter,
+ struct netlink_callback *cb)
+{
+ struct nlattr *res_tb[ARRAY_SIZE(rtm_nh_res_bucket_policy_dump)];
+ struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_dump_bucket)];
+ int err;
+
+ err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb,
+ ARRAY_SIZE(rtm_nh_policy_dump_bucket) - 1,
+ rtm_nh_policy_dump_bucket, NULL);
+ if (err < 0)
+ return err;
+
+ err = nh_valid_dump_nhid(tb[NHA_ID], &filter->nh_id, cb->extack);
+ if (err)
+ return err;
+
+ if (tb[NHA_RES_BUCKET]) {
+ size_t max = ARRAY_SIZE(rtm_nh_res_bucket_policy_dump) - 1;
+
+ err = nla_parse_nested(res_tb, max,
+ tb[NHA_RES_BUCKET],
+ rtm_nh_res_bucket_policy_dump,
+ cb->extack);
+ if (err < 0)
+ return err;
+
+ err = nh_valid_dump_nhid(res_tb[NHA_RES_BUCKET_NH_ID],
+ &filter->res_bucket_nh_id,
+ cb->extack);
+ if (err)
+ return err;
+ }
+
+ return __nh_valid_dump_req(nlh, tb, filter, cb->extack);
+}
+
+struct rtm_dump_res_bucket_ctx {
+ struct rtm_dump_nh_ctx nh;
+ u16 bucket_index;
+};
+
+static struct rtm_dump_res_bucket_ctx *
+rtm_dump_res_bucket_ctx(struct netlink_callback *cb)
+{
+ struct rtm_dump_res_bucket_ctx *ctx = (void *)cb->ctx;
+
+ BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx));
+ return ctx;
+}
+
+struct rtm_dump_nexthop_bucket_data {
+ struct rtm_dump_res_bucket_ctx *ctx;
+ struct nh_dump_filter filter;
+};
+
+static int rtm_dump_nexthop_bucket_nh(struct sk_buff *skb,
+ struct netlink_callback *cb,
+ struct nexthop *nh,
+ struct rtm_dump_nexthop_bucket_data *dd)
+{
+ u32 portid = NETLINK_CB(cb->skb).portid;
+ struct nhmsg *nhm = nlmsg_data(cb->nlh);
+ struct nh_res_table *res_table;
+ struct nh_group *nhg;
+ u16 bucket_index;
+ int err;
+
+ nhg = rtnl_dereference(nh->nh_grp);
+ res_table = rtnl_dereference(nhg->res_table);
+ for (bucket_index = dd->ctx->bucket_index;
+ bucket_index < res_table->num_nh_buckets;
+ bucket_index++) {
+ struct nh_res_bucket *bucket;
+ struct nh_grp_entry *nhge;
+
+ bucket = &res_table->nh_buckets[bucket_index];
+ nhge = rtnl_dereference(bucket->nh_entry);
+ if (nh_dump_filtered(nhge->nh, &dd->filter, nhm->nh_family))
+ continue;
+
+ if (dd->filter.res_bucket_nh_id &&
+ dd->filter.res_bucket_nh_id != nhge->nh->id)
+ continue;
+
+ dd->ctx->bucket_index = bucket_index;
+ err = nh_fill_res_bucket(skb, nh, bucket, bucket_index,
+ RTM_NEWNEXTHOPBUCKET, portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ cb->extack);
+ if (err)
+ return err;
+ }
+
+ dd->ctx->bucket_index = 0;
+
+ return 0;
+}
+
+static int rtm_dump_nexthop_bucket_cb(struct sk_buff *skb,
+ struct netlink_callback *cb,
+ struct nexthop *nh, void *data)
+{
+ struct rtm_dump_nexthop_bucket_data *dd = data;
+ struct nh_group *nhg;
+
+ if (!nh->is_group)
+ return 0;
+
+ nhg = rtnl_dereference(nh->nh_grp);
+ if (!nhg->resilient)
+ return 0;
+
+ return rtm_dump_nexthop_bucket_nh(skb, cb, nh, dd);
+}
+
+/* rtnl */
+static int rtm_dump_nexthop_bucket(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct rtm_dump_res_bucket_ctx *ctx = rtm_dump_res_bucket_ctx(cb);
+ struct rtm_dump_nexthop_bucket_data dd = { .ctx = ctx };
+ struct net *net = sock_net(skb->sk);
+ struct nexthop *nh;
+ int err;
+
+ err = nh_valid_dump_bucket_req(cb->nlh, &dd.filter, cb);
+ if (err)
+ return err;
+
+ if (dd.filter.nh_id) {
+ nh = nexthop_find_group_resilient(net, dd.filter.nh_id,
+ cb->extack);
+ if (IS_ERR(nh))
+ return PTR_ERR(nh);
+ err = rtm_dump_nexthop_bucket_nh(skb, cb, nh, &dd);
+ } else {
+ struct rb_root *root = &net->nexthop.rb_root;
+
+ err = rtm_dump_walk_nexthops(skb, cb, root, &ctx->nh,
+ &rtm_dump_nexthop_bucket_cb, &dd);
+ }
+
+ cb->seq = net->nexthop.seq;
+ nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+ return err;
+}
+
+static int nh_valid_get_bucket_req_res_bucket(struct nlattr *res,
+ u16 *bucket_index,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[ARRAY_SIZE(rtm_nh_res_bucket_policy_get)];
+ int err;
+
+ err = nla_parse_nested(tb, ARRAY_SIZE(rtm_nh_res_bucket_policy_get) - 1,
+ res, rtm_nh_res_bucket_policy_get, extack);
+ if (err < 0)
+ return err;
+
+ if (!tb[NHA_RES_BUCKET_INDEX]) {
+ NL_SET_ERR_MSG(extack, "Bucket index is missing");
+ return -EINVAL;
+ }
+
+ *bucket_index = nla_get_u16(tb[NHA_RES_BUCKET_INDEX]);
+ return 0;
+}
+
+static int nh_valid_get_bucket_req(const struct nlmsghdr *nlh,
+ u32 *id, u16 *bucket_index,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_get_bucket)];
+ int err;
+
+ err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb,
+ ARRAY_SIZE(rtm_nh_policy_get_bucket) - 1,
+ rtm_nh_policy_get_bucket, extack);
+ if (err < 0)
+ return err;
+
+ err = nh_valid_get_del_req(nlh, tb, id, NULL, extack);
+ if (err)
+ return err;
+
+ if (!tb[NHA_RES_BUCKET]) {
+ NL_SET_ERR_MSG(extack, "Bucket information is missing");
+ return -EINVAL;
+ }
+
+ err = nh_valid_get_bucket_req_res_bucket(tb[NHA_RES_BUCKET],
+ bucket_index, extack);
+ if (err)
+ return err;
+
+ return 0;
+}
+
+/* rtnl */
+static int rtm_get_nexthop_bucket(struct sk_buff *in_skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(in_skb->sk);
+ struct nh_res_table *res_table;
+ struct sk_buff *skb = NULL;
+ struct nh_group *nhg;
+ struct nexthop *nh;
+ u16 bucket_index;
+ int err;
+ u32 id;
+
+ err = nh_valid_get_bucket_req(nlh, &id, &bucket_index, extack);
+ if (err)
+ return err;
+
+ nh = nexthop_find_group_resilient(net, id, extack);
+ if (IS_ERR(nh))
+ return PTR_ERR(nh);
+
+ nhg = rtnl_dereference(nh->nh_grp);
+ res_table = rtnl_dereference(nhg->res_table);
+ if (bucket_index >= res_table->num_nh_buckets) {
+ NL_SET_ERR_MSG(extack, "Bucket index out of bounds");
+ return -ENOENT;
+ }
+
+ skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb)
+ return -ENOBUFS;
+
+ err = nh_fill_res_bucket(skb, nh, &res_table->nh_buckets[bucket_index],
+ bucket_index, RTM_NEWNEXTHOPBUCKET,
+ NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
+ 0, extack);
+ if (err < 0) {
+ WARN_ON(err == -EMSGSIZE);
+ goto errout_free;
+ }
+
+ return rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
+
+errout_free:
+ kfree_skb(skb);
+ return err;
+}
+
+static void nexthop_sync_mtu(struct net_device *dev, u32 orig_mtu)
+{
+ unsigned int hash = nh_dev_hashfn(dev->ifindex);
+ struct net *net = dev_net(dev);
+ struct hlist_head *head = &net->nexthop.devhash[hash];
+ struct hlist_node *n;
+ struct nh_info *nhi;
+
+ hlist_for_each_entry_safe(nhi, n, head, dev_hash) {
+ if (nhi->fib_nhc.nhc_dev == dev) {
+ if (nhi->family == AF_INET)
+ fib_nhc_update_mtu(&nhi->fib_nhc, dev->mtu,
+ orig_mtu);
+ }
+ }
+}
+
+/* rtnl */
+static int nh_netdev_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct netdev_notifier_info_ext *info_ext;
+
+ switch (event) {
+ case NETDEV_DOWN:
+ case NETDEV_UNREGISTER:
+ nexthop_flush_dev(dev, event);
+ break;
+ case NETDEV_CHANGE:
+ if (!(netif_get_flags(dev) & (IFF_RUNNING | IFF_LOWER_UP)))
+ nexthop_flush_dev(dev, event);
+ break;
+ case NETDEV_CHANGEMTU:
+ info_ext = ptr;
+ nexthop_sync_mtu(dev, info_ext->ext.mtu);
+ rt_cache_flush(dev_net(dev));
+ break;
+ }
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block nh_netdev_notifier = {
+ .notifier_call = nh_netdev_event,
+};
+
+static int nexthops_dump(struct net *net, struct notifier_block *nb,
+ enum nexthop_event_type event_type,
+ struct netlink_ext_ack *extack)
+{
+ struct rb_root *root = &net->nexthop.rb_root;
+ struct rb_node *node;
+ int err = 0;
+
+ for (node = rb_first(root); node; node = rb_next(node)) {
+ struct nexthop *nh;
+
+ nh = rb_entry(node, struct nexthop, rb_node);
+ err = call_nexthop_notifier(nb, net, event_type, nh, extack);
+ if (err)
+ break;
+ }
+
+ return err;
+}
+
+int register_nexthop_notifier(struct net *net, struct notifier_block *nb,
+ struct netlink_ext_ack *extack)
+{
+ int err;
+
+ rtnl_lock();
+ err = nexthops_dump(net, nb, NEXTHOP_EVENT_REPLACE, extack);
+ if (err)
+ goto unlock;
+ err = blocking_notifier_chain_register(&net->nexthop.notifier_chain,
+ nb);
+unlock:
+ rtnl_unlock();
+ return err;
+}
+EXPORT_SYMBOL(register_nexthop_notifier);
+
+int __unregister_nexthop_notifier(struct net *net, struct notifier_block *nb)
+{
+ int err;
+
+ err = blocking_notifier_chain_unregister(&net->nexthop.notifier_chain,
+ nb);
+ if (!err)
+ nexthops_dump(net, nb, NEXTHOP_EVENT_DEL, NULL);
+ return err;
+}
+EXPORT_SYMBOL(__unregister_nexthop_notifier);
+
+int unregister_nexthop_notifier(struct net *net, struct notifier_block *nb)
+{
+ int err;
+
+ rtnl_lock();
+ err = __unregister_nexthop_notifier(net, nb);
+ rtnl_unlock();
+ return err;
+}
+EXPORT_SYMBOL(unregister_nexthop_notifier);
+
+void nexthop_set_hw_flags(struct net *net, u32 id, bool offload, bool trap)
+{
+ struct nexthop *nexthop;
+
+ rcu_read_lock();
+
+ nexthop = nexthop_find_by_id(net, id);
+ if (!nexthop)
+ goto out;
+
+ nexthop->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP);
+ if (offload)
+ nexthop->nh_flags |= RTNH_F_OFFLOAD;
+ if (trap)
+ nexthop->nh_flags |= RTNH_F_TRAP;
+
+out:
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL(nexthop_set_hw_flags);
+
+void nexthop_bucket_set_hw_flags(struct net *net, u32 id, u16 bucket_index,
+ bool offload, bool trap)
+{
+ struct nh_res_table *res_table;
+ struct nh_res_bucket *bucket;
+ struct nexthop *nexthop;
+ struct nh_group *nhg;
+
+ rcu_read_lock();
+
+ nexthop = nexthop_find_by_id(net, id);
+ if (!nexthop || !nexthop->is_group)
+ goto out;
+
+ nhg = rcu_dereference(nexthop->nh_grp);
+ if (!nhg->resilient)
+ goto out;
+
+ if (bucket_index >= nhg->res_table->num_nh_buckets)
+ goto out;
+
+ res_table = rcu_dereference(nhg->res_table);
+ bucket = &res_table->nh_buckets[bucket_index];
+ bucket->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP);
+ if (offload)
+ bucket->nh_flags |= RTNH_F_OFFLOAD;
+ if (trap)
+ bucket->nh_flags |= RTNH_F_TRAP;
+
+out:
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL(nexthop_bucket_set_hw_flags);
+
+void nexthop_res_grp_activity_update(struct net *net, u32 id, u16 num_buckets,
+ unsigned long *activity)
+{
+ struct nh_res_table *res_table;
+ struct nexthop *nexthop;
+ struct nh_group *nhg;
+ u16 i;
+
+ rcu_read_lock();
+
+ nexthop = nexthop_find_by_id(net, id);
+ if (!nexthop || !nexthop->is_group)
+ goto out;
+
+ nhg = rcu_dereference(nexthop->nh_grp);
+ if (!nhg->resilient)
+ goto out;
+
+ /* Instead of silently ignoring some buckets, demand that the sizes
+ * be the same.
+ */
+ res_table = rcu_dereference(nhg->res_table);
+ if (num_buckets != res_table->num_nh_buckets)
+ goto out;
+
+ for (i = 0; i < num_buckets; i++) {
+ if (test_bit(i, activity))
+ nh_res_bucket_set_busy(&res_table->nh_buckets[i]);
+ }
+
+out:
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL(nexthop_res_grp_activity_update);
+
+static void __net_exit nexthop_net_exit_rtnl(struct net *net,
+ struct list_head *dev_to_kill)
+{
+ ASSERT_RTNL_NET(net);
+ flush_all_nexthops(net);
+}
+
+static void __net_exit nexthop_net_exit(struct net *net)
+{
+ kfree(net->nexthop.devhash);
+ net->nexthop.devhash = NULL;
+}
+
+static int __net_init nexthop_net_init(struct net *net)
+{
+ size_t sz = sizeof(struct hlist_head) * NH_DEV_HASHSIZE;
+
+ net->nexthop.rb_root = RB_ROOT;
+ net->nexthop.devhash = kzalloc(sz, GFP_KERNEL);
+ if (!net->nexthop.devhash)
+ return -ENOMEM;
+ BLOCKING_INIT_NOTIFIER_HEAD(&net->nexthop.notifier_chain);
+
+ return 0;
+}
+
+static struct pernet_operations nexthop_net_ops = {
+ .init = nexthop_net_init,
+ .exit = nexthop_net_exit,
+ .exit_rtnl = nexthop_net_exit_rtnl,
+};
+
+static const struct rtnl_msg_handler nexthop_rtnl_msg_handlers[] __initconst = {
+ {.msgtype = RTM_NEWNEXTHOP, .doit = rtm_new_nexthop,
+ .flags = RTNL_FLAG_DOIT_PERNET},
+ {.msgtype = RTM_DELNEXTHOP, .doit = rtm_del_nexthop,
+ .flags = RTNL_FLAG_DOIT_PERNET},
+ {.msgtype = RTM_GETNEXTHOP, .doit = rtm_get_nexthop,
+ .dumpit = rtm_dump_nexthop},
+ {.msgtype = RTM_GETNEXTHOPBUCKET, .doit = rtm_get_nexthop_bucket,
+ .dumpit = rtm_dump_nexthop_bucket},
+ {.protocol = PF_INET, .msgtype = RTM_NEWNEXTHOP,
+ .doit = rtm_new_nexthop, .flags = RTNL_FLAG_DOIT_PERNET},
+ {.protocol = PF_INET, .msgtype = RTM_GETNEXTHOP,
+ .dumpit = rtm_dump_nexthop},
+ {.protocol = PF_INET6, .msgtype = RTM_NEWNEXTHOP,
+ .doit = rtm_new_nexthop, .flags = RTNL_FLAG_DOIT_PERNET},
+ {.protocol = PF_INET6, .msgtype = RTM_GETNEXTHOP,
+ .dumpit = rtm_dump_nexthop},
+};
+
+static int __init nexthop_init(void)
+{
+ register_pernet_subsys(&nexthop_net_ops);
+
+ register_netdevice_notifier(&nh_netdev_notifier);
+
+ rtnl_register_many(nexthop_rtnl_msg_handlers);
+
+ return 0;
+}
+subsys_initcall(nexthop_init);
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
new file mode 100644
index 000000000000..ad56588107cc
--- /dev/null
+++ b/net/ipv4/ping.c
@@ -0,0 +1,1181 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * "Ping" sockets
+ *
+ * Based on ipv4/udp.c code.
+ *
+ * Authors: Vasiliy Kulikov / Openwall (for Linux 2.6),
+ * Pavel Kankovsky (for Linux 2.4.32)
+ *
+ * Pavel gave all rights to bugs to Vasiliy,
+ * none of the bugs are Pavel's now.
+ */
+
+#include <linux/uaccess.h>
+#include <linux/types.h>
+#include <linux/fcntl.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/timer.h>
+#include <linux/mm.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <net/snmp.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/protocol.h>
+#include <linux/skbuff.h>
+#include <linux/proc_fs.h>
+#include <linux/export.h>
+#include <linux/bpf-cgroup.h>
+#include <net/sock.h>
+#include <net/ping.h>
+#include <net/udp.h>
+#include <net/route.h>
+#include <net/inet_common.h>
+#include <net/checksum.h>
+
+#if IS_ENABLED(CONFIG_IPV6)
+#include <linux/in6.h>
+#include <linux/icmpv6.h>
+#include <net/addrconf.h>
+#include <net/ipv6.h>
+#include <net/transp_v6.h>
+#endif
+
+struct ping_table {
+ struct hlist_head hash[PING_HTABLE_SIZE];
+ spinlock_t lock;
+};
+
+static struct ping_table ping_table;
+struct pingv6_ops pingv6_ops;
+EXPORT_IPV6_MOD_GPL(pingv6_ops);
+
+static inline u32 ping_hashfn(const struct net *net, u32 num, u32 mask)
+{
+ u32 res = (num + net_hash_mix(net)) & mask;
+
+ pr_debug("hash(%u) = %u\n", num, res);
+ return res;
+}
+
+static inline struct hlist_head *ping_hashslot(struct ping_table *table,
+ struct net *net, unsigned int num)
+{
+ return &table->hash[ping_hashfn(net, num, PING_HTABLE_MASK)];
+}
+
+int ping_get_port(struct sock *sk, unsigned short ident)
+{
+ struct net *net = sock_net(sk);
+ struct inet_sock *isk, *isk2;
+ struct hlist_head *hlist;
+ struct sock *sk2 = NULL;
+
+ isk = inet_sk(sk);
+ spin_lock(&ping_table.lock);
+ if (ident == 0) {
+ u16 result = net->ipv4.ping_port_rover + 1;
+ u32 i;
+
+ for (i = 0; i < (1L << 16); i++, result++) {
+ if (!result)
+ continue; /* avoid zero */
+ hlist = ping_hashslot(&ping_table, net, result);
+ sk_for_each(sk2, hlist) {
+ if (!net_eq(sock_net(sk2), net))
+ continue;
+ isk2 = inet_sk(sk2);
+
+ if (isk2->inet_num == result)
+ goto next_port;
+ }
+
+ /* found */
+ net->ipv4.ping_port_rover = ident = result;
+ break;
+next_port:
+ ;
+ }
+ if (i >= (1L << 16))
+ goto fail;
+ } else {
+ hlist = ping_hashslot(&ping_table, net, ident);
+ sk_for_each(sk2, hlist) {
+ if (!net_eq(sock_net(sk2), net))
+ continue;
+ isk2 = inet_sk(sk2);
+
+ /* BUG? Why is this reuse and not reuseaddr? ping.c
+ * doesn't turn off SO_REUSEADDR, and it doesn't expect
+ * that other ping processes can steal its packets.
+ */
+ if ((isk2->inet_num == ident) &&
+ (sk2 != sk) &&
+ (!sk2->sk_reuse || !sk->sk_reuse))
+ goto fail;
+ }
+ }
+
+ pr_debug("found port/ident = %d\n", ident);
+ isk->inet_num = ident;
+ if (sk_unhashed(sk)) {
+ pr_debug("was not hashed\n");
+ sk_add_node_rcu(sk, hlist);
+ sock_set_flag(sk, SOCK_RCU_FREE);
+ sock_prot_inuse_add(net, sk->sk_prot, 1);
+ }
+ spin_unlock(&ping_table.lock);
+ return 0;
+
+fail:
+ spin_unlock(&ping_table.lock);
+ return -EADDRINUSE;
+}
+EXPORT_IPV6_MOD_GPL(ping_get_port);
+
+void ping_unhash(struct sock *sk)
+{
+ struct inet_sock *isk = inet_sk(sk);
+
+ pr_debug("ping_unhash(isk=%p,isk->num=%u)\n", isk, isk->inet_num);
+ spin_lock(&ping_table.lock);
+ if (sk_del_node_init_rcu(sk)) {
+ isk->inet_num = 0;
+ isk->inet_sport = 0;
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+ }
+ spin_unlock(&ping_table.lock);
+}
+EXPORT_IPV6_MOD_GPL(ping_unhash);
+
+/* Called under rcu_read_lock() */
+static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident)
+{
+ struct hlist_head *hslot = ping_hashslot(&ping_table, net, ident);
+ struct sock *sk = NULL;
+ struct inet_sock *isk;
+ int dif, sdif;
+
+ if (skb->protocol == htons(ETH_P_IP)) {
+ dif = inet_iif(skb);
+ sdif = inet_sdif(skb);
+ pr_debug("try to find: num = %d, daddr = %pI4, dif = %d\n",
+ (int)ident, &ip_hdr(skb)->daddr, dif);
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (skb->protocol == htons(ETH_P_IPV6)) {
+ dif = inet6_iif(skb);
+ sdif = inet6_sdif(skb);
+ pr_debug("try to find: num = %d, daddr = %pI6c, dif = %d\n",
+ (int)ident, &ipv6_hdr(skb)->daddr, dif);
+#endif
+ } else {
+ return NULL;
+ }
+
+ sk_for_each_rcu(sk, hslot) {
+ if (!net_eq(sock_net(sk), net))
+ continue;
+ isk = inet_sk(sk);
+
+ pr_debug("iterate\n");
+ if (isk->inet_num != ident)
+ continue;
+
+ if (skb->protocol == htons(ETH_P_IP) &&
+ sk->sk_family == AF_INET) {
+ pr_debug("found: %p: num=%d, daddr=%pI4, dif=%d\n", sk,
+ (int) isk->inet_num, &isk->inet_rcv_saddr,
+ sk->sk_bound_dev_if);
+
+ if (isk->inet_rcv_saddr &&
+ isk->inet_rcv_saddr != ip_hdr(skb)->daddr)
+ continue;
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (skb->protocol == htons(ETH_P_IPV6) &&
+ sk->sk_family == AF_INET6) {
+
+ pr_debug("found: %p: num=%d, daddr=%pI6c, dif=%d\n", sk,
+ (int) isk->inet_num,
+ &sk->sk_v6_rcv_saddr,
+ sk->sk_bound_dev_if);
+
+ if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr) &&
+ !ipv6_addr_equal(&sk->sk_v6_rcv_saddr,
+ &ipv6_hdr(skb)->daddr))
+ continue;
+#endif
+ } else {
+ continue;
+ }
+
+ if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif &&
+ sk->sk_bound_dev_if != sdif)
+ continue;
+
+ goto exit;
+ }
+
+ sk = NULL;
+exit:
+
+ return sk;
+}
+
+static void inet_get_ping_group_range_net(struct net *net, kgid_t *low,
+ kgid_t *high)
+{
+ kgid_t *data = net->ipv4.ping_group_range.range;
+ unsigned int seq;
+
+ do {
+ seq = read_seqbegin(&net->ipv4.ping_group_range.lock);
+
+ *low = data[0];
+ *high = data[1];
+ } while (read_seqretry(&net->ipv4.ping_group_range.lock, seq));
+}
+
+
+int ping_init_sock(struct sock *sk)
+{
+ struct net *net = sock_net(sk);
+ kgid_t group = current_egid();
+ struct group_info *group_info;
+ int i;
+ kgid_t low, high;
+ int ret = 0;
+
+ if (sk->sk_family == AF_INET6)
+ sk->sk_ipv6only = 1;
+
+ inet_get_ping_group_range_net(net, &low, &high);
+ if (gid_lte(low, group) && gid_lte(group, high))
+ return 0;
+
+ group_info = get_current_groups();
+ for (i = 0; i < group_info->ngroups; i++) {
+ kgid_t gid = group_info->gid[i];
+
+ if (gid_lte(low, gid) && gid_lte(gid, high))
+ goto out_release_group;
+ }
+
+ ret = -EACCES;
+
+out_release_group:
+ put_group_info(group_info);
+ return ret;
+}
+EXPORT_IPV6_MOD_GPL(ping_init_sock);
+
+void ping_close(struct sock *sk, long timeout)
+{
+ pr_debug("ping_close(sk=%p,sk->num=%u)\n",
+ inet_sk(sk), inet_sk(sk)->inet_num);
+ pr_debug("isk->refcnt = %d\n", refcount_read(&sk->sk_refcnt));
+
+ sk_common_release(sk);
+}
+EXPORT_IPV6_MOD_GPL(ping_close);
+
+static int ping_pre_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
+ int addr_len)
+{
+ /* This check is replicated from __ip4_datagram_connect() and
+ * intended to prevent BPF program called below from accessing bytes
+ * that are out of the bound specified by user in addr_len.
+ */
+ if (addr_len < sizeof(struct sockaddr_in))
+ return -EINVAL;
+
+ return BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr, &addr_len);
+}
+
+/* Checks the bind address and possibly modifies sk->sk_bound_dev_if. */
+static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk,
+ struct sockaddr_unsized *uaddr, int addr_len)
+{
+ struct net *net = sock_net(sk);
+ if (sk->sk_family == AF_INET) {
+ struct sockaddr_in *addr = (struct sockaddr_in *) uaddr;
+ u32 tb_id = RT_TABLE_LOCAL;
+ int chk_addr_ret;
+
+ if (addr_len < sizeof(*addr))
+ return -EINVAL;
+
+ if (addr->sin_family != AF_INET &&
+ !(addr->sin_family == AF_UNSPEC &&
+ addr->sin_addr.s_addr == htonl(INADDR_ANY)))
+ return -EAFNOSUPPORT;
+
+ pr_debug("ping_check_bind_addr(sk=%p,addr=%pI4,port=%d)\n",
+ sk, &addr->sin_addr.s_addr, ntohs(addr->sin_port));
+
+ if (addr->sin_addr.s_addr == htonl(INADDR_ANY))
+ return 0;
+
+ tb_id = l3mdev_fib_table_by_index(net, sk->sk_bound_dev_if) ? : tb_id;
+ chk_addr_ret = inet_addr_type_table(net, addr->sin_addr.s_addr, tb_id);
+
+ if (chk_addr_ret == RTN_MULTICAST ||
+ chk_addr_ret == RTN_BROADCAST ||
+ (chk_addr_ret != RTN_LOCAL &&
+ !inet_can_nonlocal_bind(net, isk)))
+ return -EADDRNOTAVAIL;
+
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (sk->sk_family == AF_INET6) {
+ struct sockaddr_in6 *addr = (struct sockaddr_in6 *) uaddr;
+ int addr_type, scoped, has_addr;
+ struct net_device *dev = NULL;
+
+ if (addr_len < sizeof(*addr))
+ return -EINVAL;
+
+ if (addr->sin6_family != AF_INET6)
+ return -EAFNOSUPPORT;
+
+ pr_debug("ping_check_bind_addr(sk=%p,addr=%pI6c,port=%d)\n",
+ sk, addr->sin6_addr.s6_addr, ntohs(addr->sin6_port));
+
+ addr_type = ipv6_addr_type(&addr->sin6_addr);
+ scoped = __ipv6_addr_needs_scope_id(addr_type);
+ if ((addr_type != IPV6_ADDR_ANY &&
+ !(addr_type & IPV6_ADDR_UNICAST)) ||
+ (scoped && !addr->sin6_scope_id))
+ return -EINVAL;
+
+ rcu_read_lock();
+ if (addr->sin6_scope_id) {
+ dev = dev_get_by_index_rcu(net, addr->sin6_scope_id);
+ if (!dev) {
+ rcu_read_unlock();
+ return -ENODEV;
+ }
+ }
+
+ if (!dev && sk->sk_bound_dev_if) {
+ dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if);
+ if (!dev) {
+ rcu_read_unlock();
+ return -ENODEV;
+ }
+ }
+ has_addr = pingv6_ops.ipv6_chk_addr(net, &addr->sin6_addr, dev,
+ scoped);
+ rcu_read_unlock();
+
+ if (!(ipv6_can_nonlocal_bind(net, isk) || has_addr ||
+ addr_type == IPV6_ADDR_ANY))
+ return -EADDRNOTAVAIL;
+
+ if (scoped)
+ sk->sk_bound_dev_if = addr->sin6_scope_id;
+#endif
+ } else {
+ return -EAFNOSUPPORT;
+ }
+ return 0;
+}
+
+static void ping_set_saddr(struct sock *sk, struct sockaddr_unsized *saddr)
+{
+ if (saddr->sa_family == AF_INET) {
+ struct inet_sock *isk = inet_sk(sk);
+ struct sockaddr_in *addr = (struct sockaddr_in *) saddr;
+ isk->inet_rcv_saddr = isk->inet_saddr = addr->sin_addr.s_addr;
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (saddr->sa_family == AF_INET6) {
+ struct sockaddr_in6 *addr = (struct sockaddr_in6 *) saddr;
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ sk->sk_v6_rcv_saddr = np->saddr = addr->sin6_addr;
+#endif
+ }
+}
+
+/*
+ * We need our own bind because there are no privileged id's == local ports.
+ * Moreover, we don't allow binding to multi- and broadcast addresses.
+ */
+
+int ping_bind(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len)
+{
+ struct inet_sock *isk = inet_sk(sk);
+ unsigned short snum;
+ int err;
+ int dif = sk->sk_bound_dev_if;
+
+ err = ping_check_bind_addr(sk, isk, uaddr, addr_len);
+ if (err)
+ return err;
+
+ lock_sock(sk);
+
+ err = -EINVAL;
+ if (isk->inet_num != 0)
+ goto out;
+
+ err = -EADDRINUSE;
+ snum = ntohs(((struct sockaddr_in *)uaddr)->sin_port);
+ if (ping_get_port(sk, snum) != 0) {
+ /* Restore possibly modified sk->sk_bound_dev_if by ping_check_bind_addr(). */
+ sk->sk_bound_dev_if = dif;
+ goto out;
+ }
+ ping_set_saddr(sk, uaddr);
+
+ pr_debug("after bind(): num = %hu, dif = %d\n",
+ isk->inet_num,
+ sk->sk_bound_dev_if);
+
+ err = 0;
+ if (sk->sk_family == AF_INET && isk->inet_rcv_saddr)
+ sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
+#if IS_ENABLED(CONFIG_IPV6)
+ if (sk->sk_family == AF_INET6 && !ipv6_addr_any(&sk->sk_v6_rcv_saddr))
+ sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
+#endif
+
+ if (snum)
+ sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
+ isk->inet_sport = htons(isk->inet_num);
+ isk->inet_daddr = 0;
+ isk->inet_dport = 0;
+
+#if IS_ENABLED(CONFIG_IPV6)
+ if (sk->sk_family == AF_INET6)
+ memset(&sk->sk_v6_daddr, 0, sizeof(sk->sk_v6_daddr));
+#endif
+
+ sk_dst_reset(sk);
+out:
+ release_sock(sk);
+ pr_debug("ping_v4_bind -> %d\n", err);
+ return err;
+}
+EXPORT_IPV6_MOD_GPL(ping_bind);
+
+/*
+ * Is this a supported type of ICMP message?
+ */
+
+static inline int ping_supported(int family, int type, int code)
+{
+ return (family == AF_INET && type == ICMP_ECHO && code == 0) ||
+ (family == AF_INET && type == ICMP_EXT_ECHO && code == 0) ||
+ (family == AF_INET6 && type == ICMPV6_ECHO_REQUEST && code == 0) ||
+ (family == AF_INET6 && type == ICMPV6_EXT_ECHO_REQUEST && code == 0);
+}
+
+/*
+ * This routine is called by the ICMP module when it gets some
+ * sort of error condition.
+ */
+
+void ping_err(struct sk_buff *skb, int offset, u32 info)
+{
+ int family;
+ struct icmphdr *icmph;
+ struct inet_sock *inet_sock;
+ int type;
+ int code;
+ struct net *net = dev_net(skb->dev);
+ struct sock *sk;
+ int harderr;
+ int err;
+
+ if (skb->protocol == htons(ETH_P_IP)) {
+ family = AF_INET;
+ type = icmp_hdr(skb)->type;
+ code = icmp_hdr(skb)->code;
+ icmph = (struct icmphdr *)(skb->data + offset);
+ } else if (skb->protocol == htons(ETH_P_IPV6)) {
+ family = AF_INET6;
+ type = icmp6_hdr(skb)->icmp6_type;
+ code = icmp6_hdr(skb)->icmp6_code;
+ icmph = (struct icmphdr *) (skb->data + offset);
+ } else {
+ BUG();
+ }
+
+ /* We assume the packet has already been checked by icmp_unreach */
+
+ if (!ping_supported(family, icmph->type, icmph->code))
+ return;
+
+ pr_debug("ping_err(proto=0x%x,type=%d,code=%d,id=%04x,seq=%04x)\n",
+ skb->protocol, type, code, ntohs(icmph->un.echo.id),
+ ntohs(icmph->un.echo.sequence));
+
+ sk = ping_lookup(net, skb, ntohs(icmph->un.echo.id));
+ if (!sk) {
+ pr_debug("no socket, dropping\n");
+ return; /* No socket for error */
+ }
+ pr_debug("err on socket %p\n", sk);
+
+ err = 0;
+ harderr = 0;
+ inet_sock = inet_sk(sk);
+
+ if (skb->protocol == htons(ETH_P_IP)) {
+ switch (type) {
+ default:
+ case ICMP_TIME_EXCEEDED:
+ err = EHOSTUNREACH;
+ break;
+ case ICMP_SOURCE_QUENCH:
+ /* This is not a real error but ping wants to see it.
+ * Report it with some fake errno.
+ */
+ err = EREMOTEIO;
+ break;
+ case ICMP_PARAMETERPROB:
+ err = EPROTO;
+ harderr = 1;
+ break;
+ case ICMP_DEST_UNREACH:
+ if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */
+ ipv4_sk_update_pmtu(skb, sk, info);
+ if (READ_ONCE(inet_sock->pmtudisc) != IP_PMTUDISC_DONT) {
+ err = EMSGSIZE;
+ harderr = 1;
+ break;
+ }
+ goto out;
+ }
+ err = EHOSTUNREACH;
+ if (code <= NR_ICMP_UNREACH) {
+ harderr = icmp_err_convert[code].fatal;
+ err = icmp_err_convert[code].errno;
+ }
+ break;
+ case ICMP_REDIRECT:
+ /* See ICMP_SOURCE_QUENCH */
+ ipv4_sk_redirect(skb, sk);
+ err = EREMOTEIO;
+ break;
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (skb->protocol == htons(ETH_P_IPV6)) {
+ harderr = pingv6_ops.icmpv6_err_convert(type, code, &err);
+#endif
+ }
+
+ /*
+ * RFC1122: OK. Passes ICMP errors back to application, as per
+ * 4.1.3.3.
+ */
+ if ((family == AF_INET && !inet_test_bit(RECVERR, sk)) ||
+ (family == AF_INET6 && !inet6_test_bit(RECVERR6, sk))) {
+ if (!harderr || sk->sk_state != TCP_ESTABLISHED)
+ goto out;
+ } else {
+ if (family == AF_INET) {
+ ip_icmp_error(sk, skb, err, 0 /* no remote port */,
+ info, (u8 *)icmph);
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (family == AF_INET6) {
+ pingv6_ops.ipv6_icmp_error(sk, skb, err, 0,
+ info, (u8 *)icmph);
+#endif
+ }
+ }
+ sk->sk_err = err;
+ sk_error_report(sk);
+out:
+ return;
+}
+EXPORT_IPV6_MOD_GPL(ping_err);
+
+/*
+ * Copy and checksum an ICMP Echo packet from user space into a buffer
+ * starting from the payload.
+ */
+
+int ping_getfrag(void *from, char *to,
+ int offset, int fraglen, int odd, struct sk_buff *skb)
+{
+ struct pingfakehdr *pfh = from;
+
+ if (!csum_and_copy_from_iter_full(to, fraglen, &pfh->wcheck,
+ &pfh->msg->msg_iter))
+ return -EFAULT;
+
+#if IS_ENABLED(CONFIG_IPV6)
+ /* For IPv6, checksum each skb as we go along, as expected by
+ * icmpv6_push_pending_frames. For IPv4, accumulate the checksum in
+ * wcheck, it will be finalized in ping_v4_push_pending_frames.
+ */
+ if (pfh->family == AF_INET6) {
+ skb->csum = csum_block_add(skb->csum, pfh->wcheck, odd);
+ skb->ip_summed = CHECKSUM_NONE;
+ pfh->wcheck = 0;
+ }
+#endif
+
+ return 0;
+}
+EXPORT_IPV6_MOD_GPL(ping_getfrag);
+
+static int ping_v4_push_pending_frames(struct sock *sk, struct pingfakehdr *pfh,
+ struct flowi4 *fl4)
+{
+ struct sk_buff *skb = skb_peek(&sk->sk_write_queue);
+
+ if (!skb)
+ return 0;
+ pfh->wcheck = csum_partial((char *)&pfh->icmph,
+ sizeof(struct icmphdr), pfh->wcheck);
+ pfh->icmph.checksum = csum_fold(pfh->wcheck);
+ memcpy(icmp_hdr(skb), &pfh->icmph, sizeof(struct icmphdr));
+ skb->ip_summed = CHECKSUM_NONE;
+ return ip_push_pending_frames(sk, fl4);
+}
+
+int ping_common_sendmsg(int family, struct msghdr *msg, size_t len,
+ void *user_icmph, size_t icmph_len)
+{
+ u8 type, code;
+
+ if (len > 0xFFFF)
+ return -EMSGSIZE;
+
+ /* Must have at least a full ICMP header. */
+ if (len < icmph_len)
+ return -EINVAL;
+
+ /*
+ * Check the flags.
+ */
+
+ /* Mirror BSD error message compatibility */
+ if (msg->msg_flags & MSG_OOB)
+ return -EOPNOTSUPP;
+
+ /*
+ * Fetch the ICMP header provided by the userland.
+ * iovec is modified! The ICMP header is consumed.
+ */
+ if (memcpy_from_msg(user_icmph, msg, icmph_len))
+ return -EFAULT;
+
+ if (family == AF_INET) {
+ type = ((struct icmphdr *) user_icmph)->type;
+ code = ((struct icmphdr *) user_icmph)->code;
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (family == AF_INET6) {
+ type = ((struct icmp6hdr *) user_icmph)->icmp6_type;
+ code = ((struct icmp6hdr *) user_icmph)->icmp6_code;
+#endif
+ } else {
+ BUG();
+ }
+
+ if (!ping_supported(family, type, code))
+ return -EINVAL;
+
+ return 0;
+}
+EXPORT_IPV6_MOD_GPL(ping_common_sendmsg);
+
+static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
+{
+ struct net *net = sock_net(sk);
+ struct flowi4 fl4;
+ struct inet_sock *inet = inet_sk(sk);
+ struct ipcm_cookie ipc;
+ struct icmphdr user_icmph;
+ struct pingfakehdr pfh;
+ struct rtable *rt = NULL;
+ struct ip_options_data opt_copy;
+ int free = 0;
+ __be32 saddr, daddr, faddr;
+ u8 scope;
+ int err;
+
+ pr_debug("ping_v4_sendmsg(sk=%p,sk->num=%u)\n", inet, inet->inet_num);
+
+ err = ping_common_sendmsg(AF_INET, msg, len, &user_icmph,
+ sizeof(user_icmph));
+ if (err)
+ return err;
+
+ /*
+ * Get and verify the address.
+ */
+
+ if (msg->msg_name) {
+ DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
+ if (msg->msg_namelen < sizeof(*usin))
+ return -EINVAL;
+ if (usin->sin_family != AF_INET)
+ return -EAFNOSUPPORT;
+ daddr = usin->sin_addr.s_addr;
+ /* no remote port */
+ } else {
+ if (sk->sk_state != TCP_ESTABLISHED)
+ return -EDESTADDRREQ;
+ daddr = inet->inet_daddr;
+ /* no remote port */
+ }
+
+ ipcm_init_sk(&ipc, inet);
+
+ if (msg->msg_controllen) {
+ err = ip_cmsg_send(sk, msg, &ipc, false);
+ if (unlikely(err)) {
+ kfree(ipc.opt);
+ return err;
+ }
+ if (ipc.opt)
+ free = 1;
+ }
+ if (!ipc.opt) {
+ struct ip_options_rcu *inet_opt;
+
+ rcu_read_lock();
+ inet_opt = rcu_dereference(inet->inet_opt);
+ if (inet_opt) {
+ memcpy(&opt_copy, inet_opt,
+ sizeof(*inet_opt) + inet_opt->opt.optlen);
+ ipc.opt = &opt_copy.opt;
+ }
+ rcu_read_unlock();
+ }
+
+ saddr = ipc.addr;
+ ipc.addr = faddr = daddr;
+
+ if (ipc.opt && ipc.opt->opt.srr) {
+ if (!daddr) {
+ err = -EINVAL;
+ goto out_free;
+ }
+ faddr = ipc.opt->opt.faddr;
+ }
+ scope = ip_sendmsg_scope(inet, &ipc, msg);
+
+ if (ipv4_is_multicast(daddr)) {
+ if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif))
+ ipc.oif = READ_ONCE(inet->mc_index);
+ if (!saddr)
+ saddr = READ_ONCE(inet->mc_addr);
+ } else if (!ipc.oif)
+ ipc.oif = READ_ONCE(inet->uc_index);
+
+ flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark,
+ ipc.tos & INET_DSCP_MASK, scope,
+ sk->sk_protocol, inet_sk_flowi_flags(sk), faddr,
+ saddr, 0, 0, sk_uid(sk));
+
+ fl4.fl4_icmp_type = user_icmph.type;
+ fl4.fl4_icmp_code = user_icmph.code;
+
+ security_sk_classify_flow(sk, flowi4_to_flowi_common(&fl4));
+ rt = ip_route_output_flow(net, &fl4, sk);
+ if (IS_ERR(rt)) {
+ err = PTR_ERR(rt);
+ rt = NULL;
+ if (err == -ENETUNREACH)
+ IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
+ goto out;
+ }
+
+ err = -EACCES;
+ if ((rt->rt_flags & RTCF_BROADCAST) &&
+ !sock_flag(sk, SOCK_BROADCAST))
+ goto out;
+
+ if (msg->msg_flags & MSG_CONFIRM)
+ goto do_confirm;
+back_from_confirm:
+
+ if (!ipc.addr)
+ ipc.addr = fl4.daddr;
+
+ lock_sock(sk);
+
+ pfh.icmph.type = user_icmph.type; /* already checked */
+ pfh.icmph.code = user_icmph.code; /* ditto */
+ pfh.icmph.checksum = 0;
+ pfh.icmph.un.echo.id = inet->inet_sport;
+ pfh.icmph.un.echo.sequence = user_icmph.un.echo.sequence;
+ pfh.msg = msg;
+ pfh.wcheck = 0;
+ pfh.family = AF_INET;
+
+ err = ip_append_data(sk, &fl4, ping_getfrag, &pfh, len,
+ sizeof(struct icmphdr), &ipc, &rt,
+ msg->msg_flags);
+ if (err)
+ ip_flush_pending_frames(sk);
+ else
+ err = ping_v4_push_pending_frames(sk, &pfh, &fl4);
+ release_sock(sk);
+
+out:
+ ip_rt_put(rt);
+out_free:
+ if (free)
+ kfree(ipc.opt);
+ if (!err) {
+ icmp_out_count(sock_net(sk), user_icmph.type);
+ return len;
+ }
+ return err;
+
+do_confirm:
+ if (msg->msg_flags & MSG_PROBE)
+ dst_confirm_neigh(&rt->dst, &fl4.daddr);
+ if (!(msg->msg_flags & MSG_PROBE) || len)
+ goto back_from_confirm;
+ err = 0;
+ goto out;
+}
+
+int ping_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags,
+ int *addr_len)
+{
+ struct inet_sock *isk = inet_sk(sk);
+ int family = sk->sk_family;
+ struct sk_buff *skb;
+ int copied, err;
+
+ pr_debug("ping_recvmsg(sk=%p,sk->num=%u)\n", isk, isk->inet_num);
+
+ err = -EOPNOTSUPP;
+ if (flags & MSG_OOB)
+ goto out;
+
+ if (flags & MSG_ERRQUEUE)
+ return inet_recv_error(sk, msg, len, addr_len);
+
+ skb = skb_recv_datagram(sk, flags, &err);
+ if (!skb)
+ goto out;
+
+ copied = skb->len;
+ if (copied > len) {
+ msg->msg_flags |= MSG_TRUNC;
+ copied = len;
+ }
+
+ /* Don't bother checking the checksum */
+ err = skb_copy_datagram_msg(skb, 0, msg, copied);
+ if (err)
+ goto done;
+
+ sock_recv_timestamp(msg, sk, skb);
+
+ /* Copy the address and add cmsg data. */
+ if (family == AF_INET) {
+ DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
+
+ if (sin) {
+ sin->sin_family = AF_INET;
+ sin->sin_port = 0 /* skb->h.uh->source */;
+ sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
+ memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+ *addr_len = sizeof(*sin);
+ }
+
+ if (inet_cmsg_flags(isk))
+ ip_cmsg_recv(msg, skb);
+
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (family == AF_INET6) {
+ struct ipv6hdr *ip6 = ipv6_hdr(skb);
+ DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name);
+
+ if (sin6) {
+ sin6->sin6_family = AF_INET6;
+ sin6->sin6_port = 0;
+ sin6->sin6_addr = ip6->saddr;
+ sin6->sin6_flowinfo = 0;
+ if (inet6_test_bit(SNDFLOW, sk))
+ sin6->sin6_flowinfo = ip6_flowinfo(ip6);
+ sin6->sin6_scope_id =
+ ipv6_iface_scope_id(&sin6->sin6_addr,
+ inet6_iif(skb));
+ *addr_len = sizeof(*sin6);
+ }
+
+ if (inet6_sk(sk)->rxopt.all)
+ pingv6_ops.ip6_datagram_recv_common_ctl(sk, msg, skb);
+ if (skb->protocol == htons(ETH_P_IPV6) &&
+ inet6_sk(sk)->rxopt.all)
+ pingv6_ops.ip6_datagram_recv_specific_ctl(sk, msg, skb);
+ else if (skb->protocol == htons(ETH_P_IP) &&
+ inet_cmsg_flags(isk))
+ ip_cmsg_recv(msg, skb);
+#endif
+ } else {
+ BUG();
+ }
+
+ err = copied;
+
+done:
+ skb_free_datagram(sk, skb);
+out:
+ pr_debug("ping_recvmsg -> %d\n", err);
+ return err;
+}
+EXPORT_IPV6_MOD_GPL(ping_recvmsg);
+
+static enum skb_drop_reason __ping_queue_rcv_skb(struct sock *sk,
+ struct sk_buff *skb)
+{
+ enum skb_drop_reason reason;
+
+ pr_debug("ping_queue_rcv_skb(sk=%p,sk->num=%d,skb=%p)\n",
+ inet_sk(sk), inet_sk(sk)->inet_num, skb);
+ if (sock_queue_rcv_skb_reason(sk, skb, &reason) < 0) {
+ sk_skb_reason_drop(sk, skb, reason);
+ pr_debug("ping_queue_rcv_skb -> failed\n");
+ return reason;
+ }
+ return SKB_NOT_DROPPED_YET;
+}
+
+int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+ return __ping_queue_rcv_skb(sk, skb) ? -1 : 0;
+}
+EXPORT_IPV6_MOD_GPL(ping_queue_rcv_skb);
+
+
+/*
+ * All we need to do is get the socket.
+ */
+
+enum skb_drop_reason ping_rcv(struct sk_buff *skb)
+{
+ struct net *net = dev_net(skb->dev);
+ struct icmphdr *icmph = icmp_hdr(skb);
+ struct sock *sk;
+
+ /* We assume the packet has already been checked by icmp_rcv */
+
+ pr_debug("ping_rcv(skb=%p,id=%04x,seq=%04x)\n",
+ skb, ntohs(icmph->un.echo.id), ntohs(icmph->un.echo.sequence));
+
+ /* Push ICMP header back */
+ skb_push(skb, skb->data - (u8 *)icmph);
+
+ sk = ping_lookup(net, skb, ntohs(icmph->un.echo.id));
+ if (sk)
+ return __ping_queue_rcv_skb(sk, skb);
+
+ kfree_skb_reason(skb, SKB_DROP_REASON_NO_SOCKET);
+ return SKB_DROP_REASON_NO_SOCKET;
+}
+EXPORT_IPV6_MOD_GPL(ping_rcv);
+
+struct proto ping_prot = {
+ .name = "PING",
+ .owner = THIS_MODULE,
+ .init = ping_init_sock,
+ .close = ping_close,
+ .pre_connect = ping_pre_connect,
+ .connect = ip4_datagram_connect,
+ .disconnect = __udp_disconnect,
+ .setsockopt = ip_setsockopt,
+ .getsockopt = ip_getsockopt,
+ .sendmsg = ping_v4_sendmsg,
+ .recvmsg = ping_recvmsg,
+ .bind = ping_bind,
+ .backlog_rcv = ping_queue_rcv_skb,
+ .release_cb = ip4_datagram_release_cb,
+ .unhash = ping_unhash,
+ .get_port = ping_get_port,
+ .put_port = ping_unhash,
+ .obj_size = sizeof(struct inet_sock),
+};
+EXPORT_IPV6_MOD(ping_prot);
+
+#ifdef CONFIG_PROC_FS
+
+static struct sock *ping_get_first(struct seq_file *seq, int start)
+{
+ struct sock *sk;
+ struct ping_iter_state *state = seq->private;
+ struct net *net = seq_file_net(seq);
+
+ for (state->bucket = start; state->bucket < PING_HTABLE_SIZE;
+ ++state->bucket) {
+ struct hlist_head *hslot;
+
+ hslot = &ping_table.hash[state->bucket];
+
+ if (hlist_empty(hslot))
+ continue;
+
+ sk_for_each(sk, hslot) {
+ if (net_eq(sock_net(sk), net) &&
+ sk->sk_family == state->family)
+ goto found;
+ }
+ }
+ sk = NULL;
+found:
+ return sk;
+}
+
+static struct sock *ping_get_next(struct seq_file *seq, struct sock *sk)
+{
+ struct ping_iter_state *state = seq->private;
+ struct net *net = seq_file_net(seq);
+
+ do {
+ sk = sk_next(sk);
+ } while (sk && (!net_eq(sock_net(sk), net)));
+
+ if (!sk)
+ return ping_get_first(seq, state->bucket + 1);
+ return sk;
+}
+
+static struct sock *ping_get_idx(struct seq_file *seq, loff_t pos)
+{
+ struct sock *sk = ping_get_first(seq, 0);
+
+ if (sk)
+ while (pos && (sk = ping_get_next(seq, sk)) != NULL)
+ --pos;
+ return pos ? NULL : sk;
+}
+
+void *ping_seq_start(struct seq_file *seq, loff_t *pos, sa_family_t family)
+ __acquires(ping_table.lock)
+{
+ struct ping_iter_state *state = seq->private;
+ state->bucket = 0;
+ state->family = family;
+
+ spin_lock(&ping_table.lock);
+
+ return *pos ? ping_get_idx(seq, *pos-1) : SEQ_START_TOKEN;
+}
+EXPORT_IPV6_MOD_GPL(ping_seq_start);
+
+static void *ping_v4_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ return ping_seq_start(seq, pos, AF_INET);
+}
+
+void *ping_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct sock *sk;
+
+ if (v == SEQ_START_TOKEN)
+ sk = ping_get_idx(seq, 0);
+ else
+ sk = ping_get_next(seq, v);
+
+ ++*pos;
+ return sk;
+}
+EXPORT_IPV6_MOD_GPL(ping_seq_next);
+
+void ping_seq_stop(struct seq_file *seq, void *v)
+ __releases(ping_table.lock)
+{
+ spin_unlock(&ping_table.lock);
+}
+EXPORT_IPV6_MOD_GPL(ping_seq_stop);
+
+static void ping_v4_format_sock(struct sock *sp, struct seq_file *f,
+ int bucket)
+{
+ struct inet_sock *inet = inet_sk(sp);
+ __be32 dest = inet->inet_daddr;
+ __be32 src = inet->inet_rcv_saddr;
+ __u16 destp = ntohs(inet->inet_dport);
+ __u16 srcp = ntohs(inet->inet_sport);
+
+ seq_printf(f, "%5d: %08X:%04X %08X:%04X"
+ " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %u",
+ bucket, src, srcp, dest, destp, sp->sk_state,
+ sk_wmem_alloc_get(sp),
+ sk_rmem_alloc_get(sp),
+ 0, 0L, 0,
+ from_kuid_munged(seq_user_ns(f), sk_uid(sp)),
+ 0, sock_i_ino(sp),
+ refcount_read(&sp->sk_refcnt), sp,
+ sk_drops_read(sp));
+}
+
+static int ping_v4_seq_show(struct seq_file *seq, void *v)
+{
+ seq_setwidth(seq, 127);
+ if (v == SEQ_START_TOKEN)
+ seq_puts(seq, " sl local_address rem_address st tx_queue "
+ "rx_queue tr tm->when retrnsmt uid timeout "
+ "inode ref pointer drops");
+ else {
+ struct ping_iter_state *state = seq->private;
+
+ ping_v4_format_sock(v, seq, state->bucket);
+ }
+ seq_pad(seq, '\n');
+ return 0;
+}
+
+static const struct seq_operations ping_v4_seq_ops = {
+ .start = ping_v4_seq_start,
+ .show = ping_v4_seq_show,
+ .next = ping_seq_next,
+ .stop = ping_seq_stop,
+};
+
+static int __net_init ping_v4_proc_init_net(struct net *net)
+{
+ if (!proc_create_net("icmp", 0444, net->proc_net, &ping_v4_seq_ops,
+ sizeof(struct ping_iter_state)))
+ return -ENOMEM;
+
+ net->ipv4.ping_port_rover = get_random_u16();
+ return 0;
+}
+
+static void __net_exit ping_v4_proc_exit_net(struct net *net)
+{
+ remove_proc_entry("icmp", net->proc_net);
+}
+
+static struct pernet_operations ping_v4_net_ops = {
+ .init = ping_v4_proc_init_net,
+ .exit = ping_v4_proc_exit_net,
+};
+
+int __init ping_proc_init(void)
+{
+ return register_pernet_subsys(&ping_v4_net_ops);
+}
+
+void ping_proc_exit(void)
+{
+ unregister_pernet_subsys(&ping_v4_net_ops);
+}
+
+#endif
+
+void __init ping_init(void)
+{
+ int i;
+
+ for (i = 0; i < PING_HTABLE_SIZE; i++)
+ INIT_HLIST_HEAD(&ping_table.hash[i]);
+ spin_lock_init(&ping_table.lock);
+}
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index cd873da54cbe..974afc4ecbe2 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -7,8 +8,6 @@
* PROC file system. It is mainly used for debugging and
* statistics.
*
- * Version: $Id: proc.c,v 1.45 2001/05/16 16:45:35 davem Exp $
- *
* Authors: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
* Gerald J. Heim, <heim@peanuts.informatik.uni-tuebingen.de>
* Fred Baumgarten, <dc6iq@insu1.etec.uni-karlsruhe.de>
@@ -27,82 +26,58 @@
* split functions for more readibility.
* Andi Kleen : Add support for /proc/net/netstat
* Arnaldo C. Melo : Convert to seq_file
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/types.h>
+#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/tcp.h>
+#include <net/mptcp.h>
+#include <net/proto_memory.h>
#include <net/udp.h>
#include <net/udplite.h>
+#include <linux/bottom_half.h>
#include <linux/inetdevice.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
+#include <linux/export.h>
#include <net/sock.h>
#include <net/raw.h>
-static int fold_prot_inuse(struct proto *proto)
-{
- int res = 0;
- int cpu;
-
- for_each_possible_cpu(cpu)
- res += proto->stats[cpu].inuse;
-
- return res;
-}
+#define TCPUDP_MIB_MAX MAX_T(u32, UDP_MIB_MAX, TCP_MIB_MAX)
/*
* Report socket allocation statistics [mea@utu.fi]
*/
static int sockstat_seq_show(struct seq_file *seq, void *v)
{
- socket_seq_show(seq);
- seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %d\n",
- fold_prot_inuse(&tcp_prot), atomic_read(&tcp_orphan_count),
- tcp_death_row.tw_count, atomic_read(&tcp_sockets_allocated),
- atomic_read(&tcp_memory_allocated));
- seq_printf(seq, "UDP: inuse %d\n", fold_prot_inuse(&udp_prot));
- seq_printf(seq, "UDPLITE: inuse %d\n", fold_prot_inuse(&udplite_prot));
- seq_printf(seq, "RAW: inuse %d\n", fold_prot_inuse(&raw_prot));
- seq_printf(seq, "FRAG: inuse %d memory %d\n", ip_frag_nqueues,
- atomic_read(&ip_frag_mem));
- return 0;
-}
-
-static int sockstat_seq_open(struct inode *inode, struct file *file)
-{
- return single_open(file, sockstat_seq_show, NULL);
-}
-
-static struct file_operations sockstat_seq_fops = {
- .owner = THIS_MODULE,
- .open = sockstat_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
+ struct net *net = seq->private;
+ int orphans, sockets;
-static unsigned long
-fold_field(void *mib[], int offt)
-{
- unsigned long res = 0;
- int i;
+ orphans = tcp_orphan_count_sum();
+ sockets = proto_sockets_allocated_sum_positive(&tcp_prot);
- for_each_possible_cpu(i) {
- res += *(((unsigned long *) per_cpu_ptr(mib[0], i)) + offt);
- res += *(((unsigned long *) per_cpu_ptr(mib[1], i)) + offt);
- }
- return res;
+ socket_seq_show(seq);
+ seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n",
+ sock_prot_inuse_get(net, &tcp_prot), orphans,
+ refcount_read(&net->ipv4.tcp_death_row.tw_refcount) - 1,
+ sockets, proto_memory_allocated(&tcp_prot));
+ seq_printf(seq, "UDP: inuse %d mem %ld\n",
+ sock_prot_inuse_get(net, &udp_prot),
+ proto_memory_allocated(&udp_prot));
+ seq_printf(seq, "UDPLITE: inuse %d\n",
+ sock_prot_inuse_get(net, &udplite_prot));
+ seq_printf(seq, "RAW: inuse %d\n",
+ sock_prot_inuse_get(net, &raw_prot));
+ seq_printf(seq, "FRAG: inuse %u memory %lu\n",
+ atomic_read(&net->ipv4.fqdir->rhashtable.nelems),
+ frag_mem_limit(net->ipv4.fqdir));
+ return 0;
}
/* snmp items */
static const struct snmp_mib snmp4_ipstats_list[] = {
- SNMP_MIB_ITEM("InReceives", IPSTATS_MIB_INRECEIVES),
+ SNMP_MIB_ITEM("InReceives", IPSTATS_MIB_INPKTS),
SNMP_MIB_ITEM("InHdrErrors", IPSTATS_MIB_INHDRERRORS),
SNMP_MIB_ITEM("InAddrErrors", IPSTATS_MIB_INADDRERRORS),
SNMP_MIB_ITEM("ForwDatagrams", IPSTATS_MIB_OUTFORWDATAGRAMS),
@@ -119,39 +94,51 @@ static const struct snmp_mib snmp4_ipstats_list[] = {
SNMP_MIB_ITEM("FragOKs", IPSTATS_MIB_FRAGOKS),
SNMP_MIB_ITEM("FragFails", IPSTATS_MIB_FRAGFAILS),
SNMP_MIB_ITEM("FragCreates", IPSTATS_MIB_FRAGCREATES),
- SNMP_MIB_SENTINEL
+ SNMP_MIB_ITEM("OutTransmits", IPSTATS_MIB_OUTPKTS),
};
-static const struct snmp_mib snmp4_icmp_list[] = {
- SNMP_MIB_ITEM("InMsgs", ICMP_MIB_INMSGS),
- SNMP_MIB_ITEM("InErrors", ICMP_MIB_INERRORS),
- SNMP_MIB_ITEM("InDestUnreachs", ICMP_MIB_INDESTUNREACHS),
- SNMP_MIB_ITEM("InTimeExcds", ICMP_MIB_INTIMEEXCDS),
- SNMP_MIB_ITEM("InParmProbs", ICMP_MIB_INPARMPROBS),
- SNMP_MIB_ITEM("InSrcQuenchs", ICMP_MIB_INSRCQUENCHS),
- SNMP_MIB_ITEM("InRedirects", ICMP_MIB_INREDIRECTS),
- SNMP_MIB_ITEM("InEchos", ICMP_MIB_INECHOS),
- SNMP_MIB_ITEM("InEchoReps", ICMP_MIB_INECHOREPS),
- SNMP_MIB_ITEM("InTimestamps", ICMP_MIB_INTIMESTAMPS),
- SNMP_MIB_ITEM("InTimestampReps", ICMP_MIB_INTIMESTAMPREPS),
- SNMP_MIB_ITEM("InAddrMasks", ICMP_MIB_INADDRMASKS),
- SNMP_MIB_ITEM("InAddrMaskReps", ICMP_MIB_INADDRMASKREPS),
- SNMP_MIB_ITEM("OutMsgs", ICMP_MIB_OUTMSGS),
- SNMP_MIB_ITEM("OutErrors", ICMP_MIB_OUTERRORS),
- SNMP_MIB_ITEM("OutDestUnreachs", ICMP_MIB_OUTDESTUNREACHS),
- SNMP_MIB_ITEM("OutTimeExcds", ICMP_MIB_OUTTIMEEXCDS),
- SNMP_MIB_ITEM("OutParmProbs", ICMP_MIB_OUTPARMPROBS),
- SNMP_MIB_ITEM("OutSrcQuenchs", ICMP_MIB_OUTSRCQUENCHS),
- SNMP_MIB_ITEM("OutRedirects", ICMP_MIB_OUTREDIRECTS),
- SNMP_MIB_ITEM("OutEchos", ICMP_MIB_OUTECHOS),
- SNMP_MIB_ITEM("OutEchoReps", ICMP_MIB_OUTECHOREPS),
- SNMP_MIB_ITEM("OutTimestamps", ICMP_MIB_OUTTIMESTAMPS),
- SNMP_MIB_ITEM("OutTimestampReps", ICMP_MIB_OUTTIMESTAMPREPS),
- SNMP_MIB_ITEM("OutAddrMasks", ICMP_MIB_OUTADDRMASKS),
- SNMP_MIB_ITEM("OutAddrMaskReps", ICMP_MIB_OUTADDRMASKREPS),
- SNMP_MIB_SENTINEL
+/* Following items are displayed in /proc/net/netstat */
+static const struct snmp_mib snmp4_ipextstats_list[] = {
+ SNMP_MIB_ITEM("InNoRoutes", IPSTATS_MIB_INNOROUTES),
+ SNMP_MIB_ITEM("InTruncatedPkts", IPSTATS_MIB_INTRUNCATEDPKTS),
+ SNMP_MIB_ITEM("InMcastPkts", IPSTATS_MIB_INMCASTPKTS),
+ SNMP_MIB_ITEM("OutMcastPkts", IPSTATS_MIB_OUTMCASTPKTS),
+ SNMP_MIB_ITEM("InBcastPkts", IPSTATS_MIB_INBCASTPKTS),
+ SNMP_MIB_ITEM("OutBcastPkts", IPSTATS_MIB_OUTBCASTPKTS),
+ SNMP_MIB_ITEM("InOctets", IPSTATS_MIB_INOCTETS),
+ SNMP_MIB_ITEM("OutOctets", IPSTATS_MIB_OUTOCTETS),
+ SNMP_MIB_ITEM("InMcastOctets", IPSTATS_MIB_INMCASTOCTETS),
+ SNMP_MIB_ITEM("OutMcastOctets", IPSTATS_MIB_OUTMCASTOCTETS),
+ SNMP_MIB_ITEM("InBcastOctets", IPSTATS_MIB_INBCASTOCTETS),
+ SNMP_MIB_ITEM("OutBcastOctets", IPSTATS_MIB_OUTBCASTOCTETS),
+ /* Non RFC4293 fields */
+ SNMP_MIB_ITEM("InCsumErrors", IPSTATS_MIB_CSUMERRORS),
+ SNMP_MIB_ITEM("InNoECTPkts", IPSTATS_MIB_NOECTPKTS),
+ SNMP_MIB_ITEM("InECT1Pkts", IPSTATS_MIB_ECT1PKTS),
+ SNMP_MIB_ITEM("InECT0Pkts", IPSTATS_MIB_ECT0PKTS),
+ SNMP_MIB_ITEM("InCEPkts", IPSTATS_MIB_CEPKTS),
+ SNMP_MIB_ITEM("ReasmOverlaps", IPSTATS_MIB_REASM_OVERLAPS),
};
+static const struct {
+ const char *name;
+ int index;
+} icmpmibmap[] = {
+ { "DestUnreachs", ICMP_DEST_UNREACH },
+ { "TimeExcds", ICMP_TIME_EXCEEDED },
+ { "ParmProbs", ICMP_PARAMETERPROB },
+ { "SrcQuenchs", ICMP_SOURCE_QUENCH },
+ { "Redirects", ICMP_REDIRECT },
+ { "Echos", ICMP_ECHO },
+ { "EchoReps", ICMP_ECHOREPLY },
+ { "Timestamps", ICMP_TIMESTAMP },
+ { "TimestampReps", ICMP_TIMESTAMPREPLY },
+ { "AddrMasks", ICMP_ADDRESS },
+ { "AddrMaskReps", ICMP_ADDRESSREPLY },
+ { NULL, 0 }
+};
+
+
static const struct snmp_mib snmp4_tcp_list[] = {
SNMP_MIB_ITEM("RtoAlgorithm", TCP_MIB_RTOALGORITHM),
SNMP_MIB_ITEM("RtoMin", TCP_MIB_RTOMIN),
@@ -167,7 +154,7 @@ static const struct snmp_mib snmp4_tcp_list[] = {
SNMP_MIB_ITEM("RetransSegs", TCP_MIB_RETRANSSEGS),
SNMP_MIB_ITEM("InErrs", TCP_MIB_INERRS),
SNMP_MIB_ITEM("OutRsts", TCP_MIB_OUTRSTS),
- SNMP_MIB_SENTINEL
+ SNMP_MIB_ITEM("InCsumErrors", TCP_MIB_CSUMERRORS),
};
static const struct snmp_mib snmp4_udp_list[] = {
@@ -177,7 +164,9 @@ static const struct snmp_mib snmp4_udp_list[] = {
SNMP_MIB_ITEM("OutDatagrams", UDP_MIB_OUTDATAGRAMS),
SNMP_MIB_ITEM("RcvbufErrors", UDP_MIB_RCVBUFERRORS),
SNMP_MIB_ITEM("SndbufErrors", UDP_MIB_SNDBUFERRORS),
- SNMP_MIB_SENTINEL
+ SNMP_MIB_ITEM("InCsumErrors", UDP_MIB_CSUMERRORS),
+ SNMP_MIB_ITEM("IgnoredMulti", UDP_MIB_IGNOREDMULTI),
+ SNMP_MIB_ITEM("MemErrors", UDP_MIB_MEMERRORS),
};
static const struct snmp_mib snmp4_net_list[] = {
@@ -194,26 +183,23 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TW", LINUX_MIB_TIMEWAITED),
SNMP_MIB_ITEM("TWRecycled", LINUX_MIB_TIMEWAITRECYCLED),
SNMP_MIB_ITEM("TWKilled", LINUX_MIB_TIMEWAITKILLED),
- SNMP_MIB_ITEM("PAWSPassive", LINUX_MIB_PAWSPASSIVEREJECTED),
SNMP_MIB_ITEM("PAWSActive", LINUX_MIB_PAWSACTIVEREJECTED),
SNMP_MIB_ITEM("PAWSEstab", LINUX_MIB_PAWSESTABREJECTED),
+ SNMP_MIB_ITEM("BeyondWindow", LINUX_MIB_BEYOND_WINDOW),
+ SNMP_MIB_ITEM("TSEcrRejected", LINUX_MIB_TSECRREJECTED),
+ SNMP_MIB_ITEM("PAWSOldAck", LINUX_MIB_PAWS_OLD_ACK),
+ SNMP_MIB_ITEM("PAWSTimewait", LINUX_MIB_PAWS_TW_REJECTED),
SNMP_MIB_ITEM("DelayedACKs", LINUX_MIB_DELAYEDACKS),
SNMP_MIB_ITEM("DelayedACKLocked", LINUX_MIB_DELAYEDACKLOCKED),
SNMP_MIB_ITEM("DelayedACKLost", LINUX_MIB_DELAYEDACKLOST),
SNMP_MIB_ITEM("ListenOverflows", LINUX_MIB_LISTENOVERFLOWS),
SNMP_MIB_ITEM("ListenDrops", LINUX_MIB_LISTENDROPS),
- SNMP_MIB_ITEM("TCPPrequeued", LINUX_MIB_TCPPREQUEUED),
- SNMP_MIB_ITEM("TCPDirectCopyFromBacklog", LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG),
- SNMP_MIB_ITEM("TCPDirectCopyFromPrequeue", LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE),
- SNMP_MIB_ITEM("TCPPrequeueDropped", LINUX_MIB_TCPPREQUEUEDROPPED),
SNMP_MIB_ITEM("TCPHPHits", LINUX_MIB_TCPHPHITS),
- SNMP_MIB_ITEM("TCPHPHitsToUser", LINUX_MIB_TCPHPHITSTOUSER),
SNMP_MIB_ITEM("TCPPureAcks", LINUX_MIB_TCPPUREACKS),
SNMP_MIB_ITEM("TCPHPAcks", LINUX_MIB_TCPHPACKS),
SNMP_MIB_ITEM("TCPRenoRecovery", LINUX_MIB_TCPRENORECOVERY),
SNMP_MIB_ITEM("TCPSackRecovery", LINUX_MIB_TCPSACKRECOVERY),
SNMP_MIB_ITEM("TCPSACKReneging", LINUX_MIB_TCPSACKRENEGING),
- SNMP_MIB_ITEM("TCPFACKReorder", LINUX_MIB_TCPFACKREORDER),
SNMP_MIB_ITEM("TCPSACKReorder", LINUX_MIB_TCPSACKREORDER),
SNMP_MIB_ITEM("TCPRenoReorder", LINUX_MIB_TCPRENOREORDER),
SNMP_MIB_ITEM("TCPTSReorder", LINUX_MIB_TCPTSREORDER),
@@ -221,24 +207,23 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TCPPartialUndo", LINUX_MIB_TCPPARTIALUNDO),
SNMP_MIB_ITEM("TCPDSACKUndo", LINUX_MIB_TCPDSACKUNDO),
SNMP_MIB_ITEM("TCPLossUndo", LINUX_MIB_TCPLOSSUNDO),
- SNMP_MIB_ITEM("TCPLoss", LINUX_MIB_TCPLOSS),
SNMP_MIB_ITEM("TCPLostRetransmit", LINUX_MIB_TCPLOSTRETRANSMIT),
SNMP_MIB_ITEM("TCPRenoFailures", LINUX_MIB_TCPRENOFAILURES),
SNMP_MIB_ITEM("TCPSackFailures", LINUX_MIB_TCPSACKFAILURES),
SNMP_MIB_ITEM("TCPLossFailures", LINUX_MIB_TCPLOSSFAILURES),
SNMP_MIB_ITEM("TCPFastRetrans", LINUX_MIB_TCPFASTRETRANS),
- SNMP_MIB_ITEM("TCPForwardRetrans", LINUX_MIB_TCPFORWARDRETRANS),
SNMP_MIB_ITEM("TCPSlowStartRetrans", LINUX_MIB_TCPSLOWSTARTRETRANS),
SNMP_MIB_ITEM("TCPTimeouts", LINUX_MIB_TCPTIMEOUTS),
+ SNMP_MIB_ITEM("TCPLossProbes", LINUX_MIB_TCPLOSSPROBES),
+ SNMP_MIB_ITEM("TCPLossProbeRecovery", LINUX_MIB_TCPLOSSPROBERECOVERY),
SNMP_MIB_ITEM("TCPRenoRecoveryFail", LINUX_MIB_TCPRENORECOVERYFAIL),
SNMP_MIB_ITEM("TCPSackRecoveryFail", LINUX_MIB_TCPSACKRECOVERYFAIL),
- SNMP_MIB_ITEM("TCPSchedulerFailed", LINUX_MIB_TCPSCHEDULERFAILED),
SNMP_MIB_ITEM("TCPRcvCollapsed", LINUX_MIB_TCPRCVCOLLAPSED),
+ SNMP_MIB_ITEM("TCPBacklogCoalesce", LINUX_MIB_TCPBACKLOGCOALESCE),
SNMP_MIB_ITEM("TCPDSACKOldSent", LINUX_MIB_TCPDSACKOLDSENT),
SNMP_MIB_ITEM("TCPDSACKOfoSent", LINUX_MIB_TCPDSACKOFOSENT),
SNMP_MIB_ITEM("TCPDSACKRecv", LINUX_MIB_TCPDSACKRECV),
SNMP_MIB_ITEM("TCPDSACKOfoRecv", LINUX_MIB_TCPDSACKOFORECV),
- SNMP_MIB_ITEM("TCPAbortOnSyn", LINUX_MIB_TCPABORTONSYN),
SNMP_MIB_ITEM("TCPAbortOnData", LINUX_MIB_TCPABORTONDATA),
SNMP_MIB_ITEM("TCPAbortOnClose", LINUX_MIB_TCPABORTONCLOSE),
SNMP_MIB_ITEM("TCPAbortOnMemory", LINUX_MIB_TCPABORTONMEMORY),
@@ -246,148 +231,342 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TCPAbortOnLinger", LINUX_MIB_TCPABORTONLINGER),
SNMP_MIB_ITEM("TCPAbortFailed", LINUX_MIB_TCPABORTFAILED),
SNMP_MIB_ITEM("TCPMemoryPressures", LINUX_MIB_TCPMEMORYPRESSURES),
- SNMP_MIB_SENTINEL
+ SNMP_MIB_ITEM("TCPMemoryPressuresChrono", LINUX_MIB_TCPMEMORYPRESSURESCHRONO),
+ SNMP_MIB_ITEM("TCPSACKDiscard", LINUX_MIB_TCPSACKDISCARD),
+ SNMP_MIB_ITEM("TCPDSACKIgnoredOld", LINUX_MIB_TCPDSACKIGNOREDOLD),
+ SNMP_MIB_ITEM("TCPDSACKIgnoredNoUndo", LINUX_MIB_TCPDSACKIGNOREDNOUNDO),
+ SNMP_MIB_ITEM("TCPSpuriousRTOs", LINUX_MIB_TCPSPURIOUSRTOS),
+ SNMP_MIB_ITEM("TCPMD5NotFound", LINUX_MIB_TCPMD5NOTFOUND),
+ SNMP_MIB_ITEM("TCPMD5Unexpected", LINUX_MIB_TCPMD5UNEXPECTED),
+ SNMP_MIB_ITEM("TCPMD5Failure", LINUX_MIB_TCPMD5FAILURE),
+ SNMP_MIB_ITEM("TCPSackShifted", LINUX_MIB_SACKSHIFTED),
+ SNMP_MIB_ITEM("TCPSackMerged", LINUX_MIB_SACKMERGED),
+ SNMP_MIB_ITEM("TCPSackShiftFallback", LINUX_MIB_SACKSHIFTFALLBACK),
+ SNMP_MIB_ITEM("TCPBacklogDrop", LINUX_MIB_TCPBACKLOGDROP),
+ SNMP_MIB_ITEM("PFMemallocDrop", LINUX_MIB_PFMEMALLOCDROP),
+ SNMP_MIB_ITEM("TCPMinTTLDrop", LINUX_MIB_TCPMINTTLDROP),
+ SNMP_MIB_ITEM("TCPDeferAcceptDrop", LINUX_MIB_TCPDEFERACCEPTDROP),
+ SNMP_MIB_ITEM("IPReversePathFilter", LINUX_MIB_IPRPFILTER),
+ SNMP_MIB_ITEM("TCPTimeWaitOverflow", LINUX_MIB_TCPTIMEWAITOVERFLOW),
+ SNMP_MIB_ITEM("TCPReqQFullDoCookies", LINUX_MIB_TCPREQQFULLDOCOOKIES),
+ SNMP_MIB_ITEM("TCPReqQFullDrop", LINUX_MIB_TCPREQQFULLDROP),
+ SNMP_MIB_ITEM("TCPRetransFail", LINUX_MIB_TCPRETRANSFAIL),
+ SNMP_MIB_ITEM("TCPRcvCoalesce", LINUX_MIB_TCPRCVCOALESCE),
+ SNMP_MIB_ITEM("TCPOFOQueue", LINUX_MIB_TCPOFOQUEUE),
+ SNMP_MIB_ITEM("TCPOFODrop", LINUX_MIB_TCPOFODROP),
+ SNMP_MIB_ITEM("TCPOFOMerge", LINUX_MIB_TCPOFOMERGE),
+ SNMP_MIB_ITEM("TCPChallengeACK", LINUX_MIB_TCPCHALLENGEACK),
+ SNMP_MIB_ITEM("TCPSYNChallenge", LINUX_MIB_TCPSYNCHALLENGE),
+ SNMP_MIB_ITEM("TCPFastOpenActive", LINUX_MIB_TCPFASTOPENACTIVE),
+ SNMP_MIB_ITEM("TCPFastOpenActiveFail", LINUX_MIB_TCPFASTOPENACTIVEFAIL),
+ SNMP_MIB_ITEM("TCPFastOpenPassive", LINUX_MIB_TCPFASTOPENPASSIVE),
+ SNMP_MIB_ITEM("TCPFastOpenPassiveFail", LINUX_MIB_TCPFASTOPENPASSIVEFAIL),
+ SNMP_MIB_ITEM("TCPFastOpenListenOverflow", LINUX_MIB_TCPFASTOPENLISTENOVERFLOW),
+ SNMP_MIB_ITEM("TCPFastOpenCookieReqd", LINUX_MIB_TCPFASTOPENCOOKIEREQD),
+ SNMP_MIB_ITEM("TCPFastOpenBlackhole", LINUX_MIB_TCPFASTOPENBLACKHOLE),
+ SNMP_MIB_ITEM("TCPSpuriousRtxHostQueues", LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES),
+ SNMP_MIB_ITEM("BusyPollRxPackets", LINUX_MIB_BUSYPOLLRXPACKETS),
+ SNMP_MIB_ITEM("TCPAutoCorking", LINUX_MIB_TCPAUTOCORKING),
+ SNMP_MIB_ITEM("TCPFromZeroWindowAdv", LINUX_MIB_TCPFROMZEROWINDOWADV),
+ SNMP_MIB_ITEM("TCPToZeroWindowAdv", LINUX_MIB_TCPTOZEROWINDOWADV),
+ SNMP_MIB_ITEM("TCPWantZeroWindowAdv", LINUX_MIB_TCPWANTZEROWINDOWADV),
+ SNMP_MIB_ITEM("TCPSynRetrans", LINUX_MIB_TCPSYNRETRANS),
+ SNMP_MIB_ITEM("TCPOrigDataSent", LINUX_MIB_TCPORIGDATASENT),
+ SNMP_MIB_ITEM("TCPHystartTrainDetect", LINUX_MIB_TCPHYSTARTTRAINDETECT),
+ SNMP_MIB_ITEM("TCPHystartTrainCwnd", LINUX_MIB_TCPHYSTARTTRAINCWND),
+ SNMP_MIB_ITEM("TCPHystartDelayDetect", LINUX_MIB_TCPHYSTARTDELAYDETECT),
+ SNMP_MIB_ITEM("TCPHystartDelayCwnd", LINUX_MIB_TCPHYSTARTDELAYCWND),
+ SNMP_MIB_ITEM("TCPACKSkippedSynRecv", LINUX_MIB_TCPACKSKIPPEDSYNRECV),
+ SNMP_MIB_ITEM("TCPACKSkippedPAWS", LINUX_MIB_TCPACKSKIPPEDPAWS),
+ SNMP_MIB_ITEM("TCPACKSkippedSeq", LINUX_MIB_TCPACKSKIPPEDSEQ),
+ SNMP_MIB_ITEM("TCPACKSkippedFinWait2", LINUX_MIB_TCPACKSKIPPEDFINWAIT2),
+ SNMP_MIB_ITEM("TCPACKSkippedTimeWait", LINUX_MIB_TCPACKSKIPPEDTIMEWAIT),
+ SNMP_MIB_ITEM("TCPACKSkippedChallenge", LINUX_MIB_TCPACKSKIPPEDCHALLENGE),
+ SNMP_MIB_ITEM("TCPWinProbe", LINUX_MIB_TCPWINPROBE),
+ SNMP_MIB_ITEM("TCPKeepAlive", LINUX_MIB_TCPKEEPALIVE),
+ SNMP_MIB_ITEM("TCPMTUPFail", LINUX_MIB_TCPMTUPFAIL),
+ SNMP_MIB_ITEM("TCPMTUPSuccess", LINUX_MIB_TCPMTUPSUCCESS),
+ SNMP_MIB_ITEM("TCPDelivered", LINUX_MIB_TCPDELIVERED),
+ SNMP_MIB_ITEM("TCPDeliveredCE", LINUX_MIB_TCPDELIVEREDCE),
+ SNMP_MIB_ITEM("TCPAckCompressed", LINUX_MIB_TCPACKCOMPRESSED),
+ SNMP_MIB_ITEM("TCPZeroWindowDrop", LINUX_MIB_TCPZEROWINDOWDROP),
+ SNMP_MIB_ITEM("TCPRcvQDrop", LINUX_MIB_TCPRCVQDROP),
+ SNMP_MIB_ITEM("TCPWqueueTooBig", LINUX_MIB_TCPWQUEUETOOBIG),
+ SNMP_MIB_ITEM("TCPFastOpenPassiveAltKey", LINUX_MIB_TCPFASTOPENPASSIVEALTKEY),
+ SNMP_MIB_ITEM("TcpTimeoutRehash", LINUX_MIB_TCPTIMEOUTREHASH),
+ SNMP_MIB_ITEM("TcpDuplicateDataRehash", LINUX_MIB_TCPDUPLICATEDATAREHASH),
+ SNMP_MIB_ITEM("TCPDSACKRecvSegs", LINUX_MIB_TCPDSACKRECVSEGS),
+ SNMP_MIB_ITEM("TCPDSACKIgnoredDubious", LINUX_MIB_TCPDSACKIGNOREDDUBIOUS),
+ SNMP_MIB_ITEM("TCPMigrateReqSuccess", LINUX_MIB_TCPMIGRATEREQSUCCESS),
+ SNMP_MIB_ITEM("TCPMigrateReqFailure", LINUX_MIB_TCPMIGRATEREQFAILURE),
+ SNMP_MIB_ITEM("TCPPLBRehash", LINUX_MIB_TCPPLBREHASH),
+ SNMP_MIB_ITEM("TCPAORequired", LINUX_MIB_TCPAOREQUIRED),
+ SNMP_MIB_ITEM("TCPAOBad", LINUX_MIB_TCPAOBAD),
+ SNMP_MIB_ITEM("TCPAOKeyNotFound", LINUX_MIB_TCPAOKEYNOTFOUND),
+ SNMP_MIB_ITEM("TCPAOGood", LINUX_MIB_TCPAOGOOD),
+ SNMP_MIB_ITEM("TCPAODroppedIcmps", LINUX_MIB_TCPAODROPPEDICMPS),
};
+static void icmpmsg_put_line(struct seq_file *seq, unsigned long *vals,
+ unsigned short *type, int count)
+{
+ int j;
+
+ if (count) {
+ seq_puts(seq, "\nIcmpMsg:");
+ for (j = 0; j < count; ++j)
+ seq_printf(seq, " %sType%u",
+ type[j] & 0x100 ? "Out" : "In",
+ type[j] & 0xff);
+ seq_puts(seq, "\nIcmpMsg:");
+ for (j = 0; j < count; ++j)
+ seq_printf(seq, " %lu", vals[j]);
+ }
+}
+
+static void icmpmsg_put(struct seq_file *seq)
+{
+#define PERLINE 16
+
+ int i, count;
+ unsigned short type[PERLINE];
+ unsigned long vals[PERLINE], val;
+ struct net *net = seq->private;
+
+ count = 0;
+ for (i = 0; i < ICMPMSG_MIB_MAX; i++) {
+ val = atomic_long_read(&net->mib.icmpmsg_statistics->mibs[i]);
+ if (val) {
+ type[count] = i;
+ vals[count++] = val;
+ }
+ if (count == PERLINE) {
+ icmpmsg_put_line(seq, vals, type, count);
+ count = 0;
+ }
+ }
+ icmpmsg_put_line(seq, vals, type, count);
+
+#undef PERLINE
+}
+
+static void icmp_put(struct seq_file *seq)
+{
+ int i;
+ struct net *net = seq->private;
+ atomic_long_t *ptr = net->mib.icmpmsg_statistics->mibs;
+
+ seq_puts(seq, "\nIcmp: InMsgs InErrors InCsumErrors");
+ for (i = 0; icmpmibmap[i].name; i++)
+ seq_printf(seq, " In%s", icmpmibmap[i].name);
+ seq_puts(seq, " OutMsgs OutErrors OutRateLimitGlobal OutRateLimitHost");
+ for (i = 0; icmpmibmap[i].name; i++)
+ seq_printf(seq, " Out%s", icmpmibmap[i].name);
+ seq_printf(seq, "\nIcmp: %lu %lu %lu",
+ snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_INMSGS),
+ snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_INERRORS),
+ snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_CSUMERRORS));
+ for (i = 0; icmpmibmap[i].name; i++)
+ seq_printf(seq, " %lu",
+ atomic_long_read(ptr + icmpmibmap[i].index));
+ seq_printf(seq, " %lu %lu %lu %lu",
+ snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTMSGS),
+ snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTERRORS),
+ snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_RATELIMITGLOBAL),
+ snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_RATELIMITHOST));
+ for (i = 0; icmpmibmap[i].name; i++)
+ seq_printf(seq, " %lu",
+ atomic_long_read(ptr + (icmpmibmap[i].index | 0x100)));
+}
+
/*
* Called from the PROCfs module. This outputs /proc/net/snmp.
*/
-static int snmp_seq_show(struct seq_file *seq, void *v)
+static int snmp_seq_show_ipstats(struct seq_file *seq, void *v)
{
+ const int cnt = ARRAY_SIZE(snmp4_ipstats_list);
+ u64 buff64[ARRAY_SIZE(snmp4_ipstats_list)];
+ struct net *net = seq->private;
int i;
- seq_puts(seq, "Ip: Forwarding DefaultTTL");
+ memset(buff64, 0, sizeof(buff64));
- for (i = 0; snmp4_ipstats_list[i].name != NULL; i++)
+ seq_puts(seq, "Ip: Forwarding DefaultTTL");
+ for (i = 0; i < cnt; i++)
seq_printf(seq, " %s", snmp4_ipstats_list[i].name);
seq_printf(seq, "\nIp: %d %d",
- ipv4_devconf.forwarding ? 1 : 2, sysctl_ip_default_ttl);
+ IPV4_DEVCONF_ALL_RO(net, FORWARDING) ? 1 : 2,
+ READ_ONCE(net->ipv4.sysctl_ip_default_ttl));
- for (i = 0; snmp4_ipstats_list[i].name != NULL; i++)
- seq_printf(seq, " %lu",
- fold_field((void **) ip_statistics,
- snmp4_ipstats_list[i].entry));
+ BUILD_BUG_ON(offsetof(struct ipstats_mib, mibs) != 0);
+ snmp_get_cpu_field64_batch_cnt(buff64, snmp4_ipstats_list, cnt,
+ net->mib.ip_statistics,
+ offsetof(struct ipstats_mib, syncp));
+ for (i = 0; i < cnt; i++)
+ seq_printf(seq, " %llu", buff64[i]);
- seq_puts(seq, "\nIcmp:");
- for (i = 0; snmp4_icmp_list[i].name != NULL; i++)
- seq_printf(seq, " %s", snmp4_icmp_list[i].name);
+ return 0;
+}
- seq_puts(seq, "\nIcmp:");
- for (i = 0; snmp4_icmp_list[i].name != NULL; i++)
- seq_printf(seq, " %lu",
- fold_field((void **) icmp_statistics,
- snmp4_icmp_list[i].entry));
+static int snmp_seq_show_tcp_udp(struct seq_file *seq, void *v)
+{
+ const int udp_cnt = ARRAY_SIZE(snmp4_udp_list);
+ const int tcp_cnt = ARRAY_SIZE(snmp4_tcp_list);
+ unsigned long buff[TCPUDP_MIB_MAX];
+ struct net *net = seq->private;
+ int i;
+
+ memset(buff, 0, tcp_cnt * sizeof(unsigned long));
seq_puts(seq, "\nTcp:");
- for (i = 0; snmp4_tcp_list[i].name != NULL; i++)
+ for (i = 0; i < tcp_cnt; i++)
seq_printf(seq, " %s", snmp4_tcp_list[i].name);
seq_puts(seq, "\nTcp:");
- for (i = 0; snmp4_tcp_list[i].name != NULL; i++) {
+ snmp_get_cpu_field_batch_cnt(buff, snmp4_tcp_list,
+ tcp_cnt,
+ net->mib.tcp_statistics);
+ for (i = 0; i < tcp_cnt; i++) {
/* MaxConn field is signed, RFC 2012 */
if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN)
- seq_printf(seq, " %ld",
- fold_field((void **) tcp_statistics,
- snmp4_tcp_list[i].entry));
+ seq_printf(seq, " %ld", buff[i]);
else
- seq_printf(seq, " %lu",
- fold_field((void **) tcp_statistics,
- snmp4_tcp_list[i].entry));
+ seq_printf(seq, " %lu", buff[i]);
}
+ memset(buff, 0, udp_cnt * sizeof(unsigned long));
+
+ snmp_get_cpu_field_batch_cnt(buff, snmp4_udp_list,
+ udp_cnt,
+ net->mib.udp_statistics);
seq_puts(seq, "\nUdp:");
- for (i = 0; snmp4_udp_list[i].name != NULL; i++)
+ for (i = 0; i < udp_cnt; i++)
seq_printf(seq, " %s", snmp4_udp_list[i].name);
-
seq_puts(seq, "\nUdp:");
- for (i = 0; snmp4_udp_list[i].name != NULL; i++)
- seq_printf(seq, " %lu",
- fold_field((void **) udp_statistics,
- snmp4_udp_list[i].entry));
+ for (i = 0; i < udp_cnt; i++)
+ seq_printf(seq, " %lu", buff[i]);
+
+ memset(buff, 0, udp_cnt * sizeof(unsigned long));
/* the UDP and UDP-Lite MIBs are the same */
seq_puts(seq, "\nUdpLite:");
- for (i = 0; snmp4_udp_list[i].name != NULL; i++)
+ snmp_get_cpu_field_batch_cnt(buff, snmp4_udp_list,
+ udp_cnt,
+ net->mib.udplite_statistics);
+ for (i = 0; i < udp_cnt; i++)
seq_printf(seq, " %s", snmp4_udp_list[i].name);
-
seq_puts(seq, "\nUdpLite:");
- for (i = 0; snmp4_udp_list[i].name != NULL; i++)
- seq_printf(seq, " %lu",
- fold_field((void **) udplite_statistics,
- snmp4_udp_list[i].entry) );
+ for (i = 0; i < udp_cnt; i++)
+ seq_printf(seq, " %lu", buff[i]);
seq_putc(seq, '\n');
return 0;
}
-static int snmp_seq_open(struct inode *inode, struct file *file)
+static int snmp_seq_show(struct seq_file *seq, void *v)
{
- return single_open(file, snmp_seq_show, NULL);
-}
+ snmp_seq_show_ipstats(seq, v);
-static struct file_operations snmp_seq_fops = {
- .owner = THIS_MODULE,
- .open = snmp_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
+ icmp_put(seq); /* RFC 2011 compatibility */
+ icmpmsg_put(seq);
+
+ snmp_seq_show_tcp_udp(seq, v);
+
+ return 0;
+}
/*
* Output /proc/net/netstat
*/
static int netstat_seq_show(struct seq_file *seq, void *v)
{
+ const int ip_cnt = ARRAY_SIZE(snmp4_ipextstats_list);
+ const int tcp_cnt = ARRAY_SIZE(snmp4_net_list);
+ struct net *net = seq->private;
+ unsigned long *buff;
int i;
seq_puts(seq, "TcpExt:");
- for (i = 0; snmp4_net_list[i].name != NULL; i++)
+ for (i = 0; i < tcp_cnt; i++)
seq_printf(seq, " %s", snmp4_net_list[i].name);
seq_puts(seq, "\nTcpExt:");
- for (i = 0; snmp4_net_list[i].name != NULL; i++)
- seq_printf(seq, " %lu",
- fold_field((void **) net_statistics,
- snmp4_net_list[i].entry));
-
+ buff = kzalloc(max(tcp_cnt * sizeof(long), ip_cnt * sizeof(u64)),
+ GFP_KERNEL);
+ if (buff) {
+ snmp_get_cpu_field_batch_cnt(buff, snmp4_net_list, tcp_cnt,
+ net->mib.net_statistics);
+ for (i = 0; i < tcp_cnt; i++)
+ seq_printf(seq, " %lu", buff[i]);
+ } else {
+ for (i = 0; i < tcp_cnt; i++)
+ seq_printf(seq, " %lu",
+ snmp_fold_field(net->mib.net_statistics,
+ snmp4_net_list[i].entry));
+ }
+ seq_puts(seq, "\nIpExt:");
+ for (i = 0; i < ip_cnt; i++)
+ seq_printf(seq, " %s", snmp4_ipextstats_list[i].name);
+
+ seq_puts(seq, "\nIpExt:");
+ if (buff) {
+ u64 *buff64 = (u64 *)buff;
+
+ memset(buff64, 0, ip_cnt * sizeof(u64));
+ snmp_get_cpu_field64_batch_cnt(buff64, snmp4_ipextstats_list, ip_cnt,
+ net->mib.ip_statistics,
+ offsetof(struct ipstats_mib, syncp));
+ for (i = 0; i < ip_cnt; i++)
+ seq_printf(seq, " %llu", buff64[i]);
+ } else {
+ for (i = 0; i < ip_cnt; i++)
+ seq_printf(seq, " %llu",
+ snmp_fold_field64(net->mib.ip_statistics,
+ snmp4_ipextstats_list[i].entry,
+ offsetof(struct ipstats_mib, syncp)));
+ }
+ kfree(buff);
seq_putc(seq, '\n');
+ mptcp_seq_show(seq);
return 0;
}
-static int netstat_seq_open(struct inode *inode, struct file *file)
+static __net_init int ip_proc_init_net(struct net *net)
{
- return single_open(file, netstat_seq_show, NULL);
-}
-
-static struct file_operations netstat_seq_fops = {
- .owner = THIS_MODULE,
- .open = netstat_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
-int __init ip_misc_proc_init(void)
-{
- int rc = 0;
-
- if (!proc_net_fops_create("netstat", S_IRUGO, &netstat_seq_fops))
+ if (!proc_create_net_single("sockstat", 0444, net->proc_net,
+ sockstat_seq_show, NULL))
+ goto out_sockstat;
+ if (!proc_create_net_single("netstat", 0444, net->proc_net,
+ netstat_seq_show, NULL))
goto out_netstat;
-
- if (!proc_net_fops_create("snmp", S_IRUGO, &snmp_seq_fops))
+ if (!proc_create_net_single("snmp", 0444, net->proc_net, snmp_seq_show,
+ NULL))
goto out_snmp;
- if (!proc_net_fops_create("sockstat", S_IRUGO, &sockstat_seq_fops))
- goto out_sockstat;
-out:
- return rc;
-out_sockstat:
- proc_net_remove("snmp");
+ return 0;
+
out_snmp:
- proc_net_remove("netstat");
+ remove_proc_entry("netstat", net->proc_net);
out_netstat:
- rc = -ENOMEM;
- goto out;
+ remove_proc_entry("sockstat", net->proc_net);
+out_sockstat:
+ return -ENOMEM;
+}
+
+static __net_exit void ip_proc_exit_net(struct net *net)
+{
+ remove_proc_entry("snmp", net->proc_net);
+ remove_proc_entry("netstat", net->proc_net);
+ remove_proc_entry("sockstat", net->proc_net);
}
+static __net_initdata struct pernet_operations ip_proc_ops = {
+ .init = ip_proc_init_net,
+ .exit = ip_proc_exit_net,
+};
+
+int __init ip_misc_proc_init(void)
+{
+ return register_pernet_subsys(&ip_proc_ops);
+}
diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c
index 05f5114828ea..6913979948d7 100644
--- a/net/ipv4/protocol.c
+++ b/net/ipv4/protocol.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -5,8 +6,6 @@
*
* INET protocol dispatch tables.
*
- * Version: $Id: protocol.c,v 1.14 2001/05/18 02:25:49 davem Exp $
- *
* Authors: Ross Biro
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
*
@@ -18,82 +17,54 @@
* Richard Colella : Hang on hash collision
* Vince Laviano : Modified inet_del_protocol() to correctly
* maintain copy bit.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
-
-#include <asm/uaccess.h>
-#include <asm/system.h>
+#include <linux/cache.h>
#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/string.h>
-#include <linux/socket.h>
-#include <linux/in.h>
-#include <linux/inet.h>
#include <linux/netdevice.h>
-#include <linux/timer.h>
-#include <net/ip.h>
+#include <linux/spinlock.h>
#include <net/protocol.h>
-#include <linux/skbuff.h>
-#include <net/sock.h>
-#include <net/icmp.h>
-#include <net/udp.h>
-#include <net/ipip.h>
-#include <linux/igmp.h>
-struct net_protocol *inet_protos[MAX_INET_PROTOS];
-static DEFINE_SPINLOCK(inet_proto_lock);
+struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly;
+EXPORT_SYMBOL(inet_protos);
+const struct net_offload __rcu *inet_offloads[MAX_INET_PROTOS] __read_mostly;
+EXPORT_SYMBOL(inet_offloads);
-/*
- * Add a protocol handler to the hash tables
- */
+int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol)
+{
+ return !cmpxchg((const struct net_protocol **)&inet_protos[protocol],
+ NULL, prot) ? 0 : -1;
+}
+EXPORT_SYMBOL(inet_add_protocol);
-int inet_add_protocol(struct net_protocol *prot, unsigned char protocol)
+int inet_add_offload(const struct net_offload *prot, unsigned char protocol)
{
- int hash, ret;
+ return !cmpxchg((const struct net_offload **)&inet_offloads[protocol],
+ NULL, prot) ? 0 : -1;
+}
+EXPORT_SYMBOL(inet_add_offload);
- hash = protocol & (MAX_INET_PROTOS - 1);
+int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol)
+{
+ int ret;
- spin_lock_bh(&inet_proto_lock);
- if (inet_protos[hash]) {
- ret = -1;
- } else {
- inet_protos[hash] = prot;
- ret = 0;
- }
- spin_unlock_bh(&inet_proto_lock);
+ ret = (cmpxchg((const struct net_protocol **)&inet_protos[protocol],
+ prot, NULL) == prot) ? 0 : -1;
+
+ synchronize_net();
return ret;
}
+EXPORT_SYMBOL(inet_del_protocol);
-/*
- * Remove a protocol from the hash tables.
- */
-
-int inet_del_protocol(struct net_protocol *prot, unsigned char protocol)
+int inet_del_offload(const struct net_offload *prot, unsigned char protocol)
{
- int hash, ret;
+ int ret;
- hash = protocol & (MAX_INET_PROTOS - 1);
-
- spin_lock_bh(&inet_proto_lock);
- if (inet_protos[hash] == prot) {
- inet_protos[hash] = NULL;
- ret = 0;
- } else {
- ret = -1;
- }
- spin_unlock_bh(&inet_proto_lock);
+ ret = (cmpxchg((const struct net_offload **)&inet_offloads[protocol],
+ prot, NULL) == prot) ? 0 : -1;
synchronize_net();
return ret;
}
-
-EXPORT_SYMBOL(inet_add_protocol);
-EXPORT_SYMBOL(inet_del_protocol);
+EXPORT_SYMBOL(inet_del_offload);
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index a6c63bbd9ddb..5998c4cc6f47 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -5,8 +6,6 @@
*
* RAW - implementation of IP "raw" sockets.
*
- * Version: $Id: raw.c,v 1.64 2002/02/01 22:01:04 davem Exp $
- *
* Authors: Ross Biro
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
*
@@ -32,24 +31,19 @@
* Alan Cox : Added IP_HDRINCL option.
* Alan Cox : Skip broadcast check if BSDism set.
* David S. Miller : New socket lookup architecture.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/types.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
#include <asm/byteorder.h>
#include <asm/current.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/ioctls.h>
#include <linux/stddef.h>
#include <linux/slab.h>
#include <linux/errno.h>
-#include <linux/aio.h>
#include <linux/kernel.h>
+#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/sockios.h>
#include <linux/socket.h>
@@ -59,9 +53,10 @@
#include <linux/in_route.h>
#include <linux/route.h>
#include <linux/skbuff.h>
+#include <linux/igmp.h>
+#include <net/net_namespace.h>
#include <net/dst.h>
#include <net/sock.h>
-#include <linux/gfp.h>
#include <linux/ip.h>
#include <linux/net.h>
#include <net/ip.h>
@@ -78,65 +73,81 @@
#include <linux/seq_file.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
+#include <linux/compat.h>
+#include <linux/uio.h>
+
+struct raw_frag_vec {
+ struct msghdr *msg;
+ union {
+ struct icmphdr icmph;
+ char c[1];
+ } hdr;
+ int hlen;
+};
-struct hlist_head raw_v4_htable[RAWV4_HTABLE_SIZE];
-DEFINE_RWLOCK(raw_v4_lock);
+struct raw_hashinfo raw_v4_hashinfo;
+EXPORT_SYMBOL_GPL(raw_v4_hashinfo);
-static void raw_v4_hash(struct sock *sk)
+int raw_hash_sk(struct sock *sk)
{
- struct hlist_head *head = &raw_v4_htable[inet_sk(sk)->num &
- (RAWV4_HTABLE_SIZE - 1)];
+ struct raw_hashinfo *h = sk->sk_prot->h.raw_hash;
+ struct hlist_head *hlist;
- write_lock_bh(&raw_v4_lock);
- sk_add_node(sk, head);
- sock_prot_inc_use(sk->sk_prot);
- write_unlock_bh(&raw_v4_lock);
-}
+ hlist = &h->ht[raw_hashfunc(sock_net(sk), inet_sk(sk)->inet_num)];
-static void raw_v4_unhash(struct sock *sk)
-{
- write_lock_bh(&raw_v4_lock);
- if (sk_del_node_init(sk))
- sock_prot_dec_use(sk->sk_prot);
- write_unlock_bh(&raw_v4_lock);
+ spin_lock(&h->lock);
+ sk_add_node_rcu(sk, hlist);
+ sock_set_flag(sk, SOCK_RCU_FREE);
+ spin_unlock(&h->lock);
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+
+ return 0;
}
+EXPORT_SYMBOL_GPL(raw_hash_sk);
-struct sock *__raw_v4_lookup(struct sock *sk, unsigned short num,
- __be32 raddr, __be32 laddr,
- int dif)
+void raw_unhash_sk(struct sock *sk)
{
- struct hlist_node *node;
+ struct raw_hashinfo *h = sk->sk_prot->h.raw_hash;
- sk_for_each_from(sk, node) {
- struct inet_sock *inet = inet_sk(sk);
+ spin_lock(&h->lock);
+ if (sk_del_node_init_rcu(sk))
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+ spin_unlock(&h->lock);
+}
+EXPORT_SYMBOL_GPL(raw_unhash_sk);
- if (inet->num == num &&
- !(inet->daddr && inet->daddr != raddr) &&
- !(inet->rcv_saddr && inet->rcv_saddr != laddr) &&
- !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif))
- goto found; /* gotcha */
- }
- sk = NULL;
-found:
- return sk;
+bool raw_v4_match(struct net *net, const struct sock *sk, unsigned short num,
+ __be32 raddr, __be32 laddr, int dif, int sdif)
+{
+ const struct inet_sock *inet = inet_sk(sk);
+
+ if (net_eq(sock_net(sk), net) && inet->inet_num == num &&
+ !(inet->inet_daddr && inet->inet_daddr != raddr) &&
+ !(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) &&
+ raw_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif))
+ return true;
+ return false;
}
+EXPORT_SYMBOL_GPL(raw_v4_match);
/*
* 0 - deliver
* 1 - block
*/
-static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb)
+static int icmp_filter(const struct sock *sk, const struct sk_buff *skb)
{
- int type;
+ struct icmphdr _hdr;
+ const struct icmphdr *hdr;
- if (!pskb_may_pull(skb, sizeof(struct icmphdr)))
+ hdr = skb_header_pointer(skb, skb_transport_offset(skb),
+ sizeof(_hdr), &_hdr);
+ if (!hdr)
return 1;
- type = skb->h.icmph->type;
- if (type < 32) {
+ if (hdr->type < 32) {
__u32 data = raw_sk(sk)->filter.data;
- return ((1 << type) & data) != 0;
+ return ((1U << hdr->type) & data) != 0;
}
/* Do not block unknown ICMP types */
@@ -149,52 +160,74 @@ static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb)
* RFC 1122: SHOULD pass TOS value up to the transport layer.
* -> It does. And not only TOS, but all IP header.
*/
-int raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash)
+static int raw_v4_input(struct net *net, struct sk_buff *skb,
+ const struct iphdr *iph, int hash)
{
- struct sock *sk;
- struct hlist_head *head;
+ int sdif = inet_sdif(skb);
+ struct hlist_head *hlist;
+ int dif = inet_iif(skb);
int delivered = 0;
+ struct sock *sk;
- read_lock(&raw_v4_lock);
- head = &raw_v4_htable[hash];
- if (hlist_empty(head))
- goto out;
- sk = __raw_v4_lookup(__sk_head(head), iph->protocol,
- iph->saddr, iph->daddr,
- skb->dev->ifindex);
+ hlist = &raw_v4_hashinfo.ht[hash];
+ rcu_read_lock();
+ sk_for_each_rcu(sk, hlist) {
+ if (!raw_v4_match(net, sk, iph->protocol,
+ iph->saddr, iph->daddr, dif, sdif))
+ continue;
+
+ if (atomic_read(&sk->sk_rmem_alloc) >=
+ READ_ONCE(sk->sk_rcvbuf)) {
+ sk_drops_inc(sk);
+ continue;
+ }
- while (sk) {
delivered = 1;
- if (iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) {
+ if ((iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) &&
+ ip_mc_sf_allow(sk, iph->daddr, iph->saddr,
+ skb->dev->ifindex, sdif)) {
struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC);
/* Not releasing hash table! */
if (clone)
raw_rcv(sk, clone);
}
- sk = __raw_v4_lookup(sk_next(sk), iph->protocol,
- iph->saddr, iph->daddr,
- skb->dev->ifindex);
}
-out:
- read_unlock(&raw_v4_lock);
+ rcu_read_unlock();
return delivered;
}
-void raw_err (struct sock *sk, struct sk_buff *skb, u32 info)
+int raw_local_deliver(struct sk_buff *skb, int protocol)
+{
+ struct net *net = dev_net(skb->dev);
+
+ return raw_v4_input(net, skb, ip_hdr(skb),
+ raw_hashfunc(net, protocol));
+}
+
+static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info)
{
struct inet_sock *inet = inet_sk(sk);
- int type = skb->h.icmph->type;
- int code = skb->h.icmph->code;
- int err = 0;
+ const int type = icmp_hdr(skb)->type;
+ const int code = icmp_hdr(skb)->code;
int harderr = 0;
+ bool recverr;
+ int err = 0;
+
+ if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
+ ipv4_sk_update_pmtu(skb, sk, info);
+ else if (type == ICMP_REDIRECT) {
+ ipv4_sk_redirect(skb, sk);
+ return;
+ }
/* Report error on raw socket, if:
1. User requested ip_recverr.
2. Socket is connected (otherwise the error indication
is useless without ip_recverr and error is hard.
*/
- if (!inet->recverr && sk->sk_state != TCP_ESTABLISHED)
+ recverr = inet_test_bit(RECVERR, sk);
+ if (!recverr && sk->sk_state != TCP_ESTABLISHED)
return;
switch (type) {
@@ -212,36 +245,63 @@ void raw_err (struct sock *sk, struct sk_buff *skb, u32 info)
err = EHOSTUNREACH;
if (code > NR_ICMP_UNREACH)
break;
- err = icmp_err_convert[code].errno;
- harderr = icmp_err_convert[code].fatal;
if (code == ICMP_FRAG_NEEDED) {
- harderr = inet->pmtudisc != IP_PMTUDISC_DONT;
+ harderr = READ_ONCE(inet->pmtudisc) != IP_PMTUDISC_DONT;
err = EMSGSIZE;
+ } else {
+ err = icmp_err_convert[code].errno;
+ harderr = icmp_err_convert[code].fatal;
}
}
- if (inet->recverr) {
- struct iphdr *iph = (struct iphdr*)skb->data;
+ if (recverr) {
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
u8 *payload = skb->data + (iph->ihl << 2);
- if (inet->hdrincl)
+ if (inet_test_bit(HDRINCL, sk))
payload = skb->data;
ip_icmp_error(sk, skb, err, 0, info, payload);
}
- if (inet->recverr || harderr) {
+ if (recverr || harderr) {
sk->sk_err = err;
- sk->sk_error_report(sk);
+ sk_error_report(sk);
+ }
+}
+
+void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info)
+{
+ struct net *net = dev_net(skb->dev);
+ int dif = skb->dev->ifindex;
+ int sdif = inet_sdif(skb);
+ struct hlist_head *hlist;
+ const struct iphdr *iph;
+ struct sock *sk;
+ int hash;
+
+ hash = raw_hashfunc(net, protocol);
+ hlist = &raw_v4_hashinfo.ht[hash];
+
+ rcu_read_lock();
+ sk_for_each_rcu(sk, hlist) {
+ iph = (const struct iphdr *)skb->data;
+ if (!raw_v4_match(net, sk, iph->protocol,
+ iph->daddr, iph->saddr, dif, sdif))
+ continue;
+ raw_err(sk, skb, info);
}
+ rcu_read_unlock();
}
-static int raw_rcv_skb(struct sock * sk, struct sk_buff * skb)
+static int raw_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
+ enum skb_drop_reason reason;
+
/* Charge it to the socket. */
-
- if (sock_queue_rcv_skb(sk, skb) < 0) {
- /* FIXME: increment a raw drops counter here */
- kfree_skb(skb);
+
+ ipv4_pktinfo_prepare(sk, skb, true);
+ if (sock_queue_rcv_skb_reason(sk, skb, &reason) < 0) {
+ sk_skb_reason_drop(sk, skb, reason);
return NET_RX_DROP;
}
@@ -251,147 +311,196 @@ static int raw_rcv_skb(struct sock * sk, struct sk_buff * skb)
int raw_rcv(struct sock *sk, struct sk_buff *skb)
{
if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
- kfree_skb(skb);
+ sk_drops_inc(sk);
+ sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_XFRM_POLICY);
return NET_RX_DROP;
}
- nf_reset(skb);
+ nf_reset_ct(skb);
- skb_push(skb, skb->data - skb->nh.raw);
+ skb_push(skb, -skb_network_offset(skb));
raw_rcv_skb(sk, skb);
return 0;
}
-static int raw_send_hdrinc(struct sock *sk, void *from, size_t length,
- struct rtable *rt,
- unsigned int flags)
+static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
+ struct msghdr *msg, size_t length,
+ struct rtable **rtp, unsigned int flags,
+ const struct sockcm_cookie *sockc)
{
struct inet_sock *inet = inet_sk(sk);
- int hh_len;
+ struct net *net = sock_net(sk);
struct iphdr *iph;
struct sk_buff *skb;
+ unsigned int iphlen;
int err;
+ struct rtable *rt = *rtp;
+ int hlen, tlen;
- if (length > rt->u.dst.dev->mtu) {
- ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport,
- rt->u.dst.dev->mtu);
+ if (length > rt->dst.dev->mtu) {
+ ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
+ rt->dst.dev->mtu);
return -EMSGSIZE;
}
+ if (length < sizeof(struct iphdr))
+ return -EINVAL;
+
if (flags&MSG_PROBE)
goto out;
- hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
-
- skb = sock_alloc_send_skb(sk, length+hh_len+15,
- flags&MSG_DONTWAIT, &err);
- if (skb == NULL)
- goto error;
- skb_reserve(skb, hh_len);
+ hlen = LL_RESERVED_SPACE(rt->dst.dev);
+ tlen = rt->dst.dev->needed_tailroom;
+ skb = sock_alloc_send_skb(sk,
+ length + hlen + tlen + 15,
+ flags & MSG_DONTWAIT, &err);
+ if (!skb)
+ goto error;
+ skb_reserve(skb, hlen);
- skb->priority = sk->sk_priority;
- skb->dst = dst_clone(&rt->u.dst);
+ skb->protocol = htons(ETH_P_IP);
+ skb->priority = sockc->priority;
+ skb->mark = sockc->mark;
+ skb_set_delivery_type_by_clockid(skb, sockc->transmit_time, sk->sk_clockid);
+ skb_dst_set(skb, &rt->dst);
+ *rtp = NULL;
- skb->nh.iph = iph = (struct iphdr *)skb_put(skb, length);
+ skb_reset_network_header(skb);
+ iph = ip_hdr(skb);
+ skb_put(skb, length);
skb->ip_summed = CHECKSUM_NONE;
- skb->h.raw = skb->nh.raw;
- err = memcpy_fromiovecend((void *)iph, from, 0, length);
- if (err)
- goto error_fault;
+ skb_setup_tx_timestamp(skb, sockc);
+
+ if (flags & MSG_CONFIRM)
+ skb_set_dst_pending_confirm(skb, 1);
+
+ skb->transport_header = skb->network_header;
+ err = -EFAULT;
+ if (memcpy_from_msg(iph, msg, length))
+ goto error_free;
+
+ iphlen = iph->ihl * 4;
+
+ /*
+ * We don't want to modify the ip header, but we do need to
+ * be sure that it won't cause problems later along the network
+ * stack. Specifically we want to make sure that iph->ihl is a
+ * sane value. If ihl points beyond the length of the buffer passed
+ * in, reject the frame as invalid
+ */
+ err = -EINVAL;
+ if (iphlen > length)
+ goto error_free;
- /* We don't modify invalid header */
- if (length >= sizeof(*iph) && iph->ihl * 4U <= length) {
+ if (iphlen >= sizeof(*iph)) {
if (!iph->saddr)
- iph->saddr = rt->rt_src;
+ iph->saddr = fl4->saddr;
iph->check = 0;
iph->tot_len = htons(length);
if (!iph->id)
- ip_select_ident(iph, &rt->u.dst, NULL);
+ ip_select_ident(net, skb, NULL);
iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
+ skb->transport_header += iphlen;
+ if (iph->protocol == IPPROTO_ICMP &&
+ length >= iphlen + sizeof(struct icmphdr))
+ icmp_out_count(net, ((struct icmphdr *)
+ skb_transport_header(skb))->type);
}
- err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
+ err = NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_OUT,
+ net, sk, skb, NULL, rt->dst.dev,
dst_output);
if (err > 0)
- err = inet->recverr ? net_xmit_errno(err) : 0;
+ err = net_xmit_errno(err);
if (err)
goto error;
out:
return 0;
-error_fault:
- err = -EFAULT;
+error_free:
kfree_skb(skb);
error:
- IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
- return err;
+ IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
+ if (err == -ENOBUFS && !inet_test_bit(RECVERR, sk))
+ err = 0;
+ return err;
}
-static int raw_probe_proto_opt(struct flowi *fl, struct msghdr *msg)
+static int raw_probe_proto_opt(struct raw_frag_vec *rfv, struct flowi4 *fl4)
{
- struct iovec *iov;
- u8 __user *type = NULL;
- u8 __user *code = NULL;
- int probed = 0;
- unsigned int i;
+ int err;
- if (!msg->msg_iov)
+ if (fl4->flowi4_proto != IPPROTO_ICMP)
return 0;
- for (i = 0; i < msg->msg_iovlen; i++) {
- iov = &msg->msg_iov[i];
- if (!iov)
- continue;
+ /* We only need the first two bytes. */
+ rfv->hlen = 2;
+
+ err = memcpy_from_msg(rfv->hdr.c, rfv->msg, rfv->hlen);
+ if (err)
+ return err;
+
+ fl4->fl4_icmp_type = rfv->hdr.icmph.type;
+ fl4->fl4_icmp_code = rfv->hdr.icmph.code;
- switch (fl->proto) {
- case IPPROTO_ICMP:
- /* check if one-byte field is readable or not. */
- if (iov->iov_base && iov->iov_len < 1)
- break;
-
- if (!type) {
- type = iov->iov_base;
- /* check if code field is readable or not. */
- if (iov->iov_len > 1)
- code = type + 1;
- } else if (!code)
- code = iov->iov_base;
-
- if (type && code) {
- if (get_user(fl->fl_icmp_type, type) ||
- get_user(fl->fl_icmp_code, code))
- return -EFAULT;
- probed = 1;
- }
- break;
- default:
- probed = 1;
- break;
- }
- if (probed)
- break;
- }
return 0;
}
-static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
- size_t len)
+static int raw_getfrag(void *from, char *to, int offset, int len, int odd,
+ struct sk_buff *skb)
+{
+ struct raw_frag_vec *rfv = from;
+
+ if (offset < rfv->hlen) {
+ int copy = min(rfv->hlen - offset, len);
+
+ if (skb->ip_summed == CHECKSUM_PARTIAL)
+ memcpy(to, rfv->hdr.c + offset, copy);
+ else
+ skb->csum = csum_block_add(
+ skb->csum,
+ csum_partial_copy_nocheck(rfv->hdr.c + offset,
+ to, copy),
+ odd);
+
+ odd = 0;
+ offset += copy;
+ to += copy;
+ len -= copy;
+
+ if (!len)
+ return 0;
+ }
+
+ offset -= rfv->hlen;
+
+ return ip_generic_getfrag(rfv->msg, to, offset, len, odd, skb);
+}
+
+static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
struct inet_sock *inet = inet_sk(sk);
+ struct net *net = sock_net(sk);
struct ipcm_cookie ipc;
struct rtable *rt = NULL;
+ struct flowi4 fl4;
+ u8 scope;
int free = 0;
__be32 daddr;
__be32 saddr;
- u8 tos;
- int err;
+ int uc_index, err;
+ struct ip_options_data opt_copy;
+ struct raw_frag_vec rfv;
+ int hdrincl;
err = -EMSGSIZE;
if (len > 0xFFFF)
goto out;
+ hdrincl = inet_test_bit(HDRINCL, sk);
+
/*
* Check the flags.
*/
@@ -399,22 +508,19 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
err = -EOPNOTSUPP;
if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message */
goto out; /* compatibility */
-
+
/*
- * Get and verify the address.
+ * Get and verify the address.
*/
if (msg->msg_namelen) {
- struct sockaddr_in *usin = (struct sockaddr_in*)msg->msg_name;
+ DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
err = -EINVAL;
if (msg->msg_namelen < sizeof(*usin))
goto out;
if (usin->sin_family != AF_INET) {
- static int complained;
- if (!complained++)
- printk(KERN_INFO "%s forgot to set AF_INET in "
- "raw sendmsg. Fix it!\n",
- current->comm);
+ pr_info_once("%s: %s forgot to set AF_INET. Fix it!\n",
+ __func__, current->comm);
err = -EAFNOSUPPORT;
if (usin->sin_family)
goto out;
@@ -426,19 +532,22 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
*/
} else {
err = -EDESTADDRREQ;
- if (sk->sk_state != TCP_ESTABLISHED)
+ if (sk->sk_state != TCP_ESTABLISHED)
goto out;
- daddr = inet->daddr;
+ daddr = inet->inet_daddr;
}
- ipc.addr = inet->saddr;
- ipc.opt = NULL;
- ipc.oif = sk->sk_bound_dev_if;
+ ipcm_init_sk(&ipc, inet);
+ /* Keep backward compat */
+ if (hdrincl)
+ ipc.protocol = IPPROTO_RAW;
if (msg->msg_controllen) {
- err = ip_cmsg_send(msg, &ipc);
- if (err)
+ err = ip_cmsg_send(sk, msg, &ipc, false);
+ if (unlikely(err)) {
+ kfree(ipc.opt);
goto out;
+ }
if (ipc.opt)
free = 1;
}
@@ -446,53 +555,82 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
saddr = ipc.addr;
ipc.addr = daddr;
- if (!ipc.opt)
- ipc.opt = inet->opt;
+ if (!ipc.opt) {
+ struct ip_options_rcu *inet_opt;
+
+ rcu_read_lock();
+ inet_opt = rcu_dereference(inet->inet_opt);
+ if (inet_opt) {
+ memcpy(&opt_copy, inet_opt,
+ sizeof(*inet_opt) + inet_opt->opt.optlen);
+ ipc.opt = &opt_copy.opt;
+ }
+ rcu_read_unlock();
+ }
if (ipc.opt) {
err = -EINVAL;
/* Linux does not mangle headers on raw sockets,
* so that IP options + IP_HDRINCL is non-sense.
*/
- if (inet->hdrincl)
+ if (hdrincl)
goto done;
- if (ipc.opt->srr) {
+ if (ipc.opt->opt.srr) {
if (!daddr)
goto done;
- daddr = ipc.opt->faddr;
+ daddr = ipc.opt->opt.faddr;
}
}
- tos = RT_CONN_FLAGS(sk);
- if (msg->msg_flags & MSG_DONTROUTE)
- tos |= RTO_ONLINK;
+ scope = ip_sendmsg_scope(inet, &ipc, msg);
- if (MULTICAST(daddr)) {
- if (!ipc.oif)
- ipc.oif = inet->mc_index;
+ uc_index = READ_ONCE(inet->uc_index);
+ if (ipv4_is_multicast(daddr)) {
+ if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif))
+ ipc.oif = READ_ONCE(inet->mc_index);
if (!saddr)
- saddr = inet->mc_addr;
+ saddr = READ_ONCE(inet->mc_addr);
+ } else if (!ipc.oif) {
+ ipc.oif = uc_index;
+ } else if (ipv4_is_lbcast(daddr) && uc_index) {
+ /* oif is set, packet is to local broadcast
+ * and uc_index is set. oif is most likely set
+ * by sk_bound_dev_if. If uc_index != oif check if the
+ * oif is an L3 master and uc_index is an L3 slave.
+ * If so, we want to allow the send using the uc_index.
+ */
+ if (ipc.oif != uc_index &&
+ ipc.oif == l3mdev_master_ifindex_by_index(sock_net(sk),
+ uc_index)) {
+ ipc.oif = uc_index;
+ }
}
- {
- struct flowi fl = { .oif = ipc.oif,
- .nl_u = { .ip4_u =
- { .daddr = daddr,
- .saddr = saddr,
- .tos = tos } },
- .proto = inet->hdrincl ? IPPROTO_RAW :
- sk->sk_protocol,
- };
- if (!inet->hdrincl) {
- err = raw_probe_proto_opt(&fl, msg);
- if (err)
- goto done;
- }
+ flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark,
+ ipc.tos & INET_DSCP_MASK, scope,
+ hdrincl ? ipc.protocol : sk->sk_protocol,
+ inet_sk_flowi_flags(sk) |
+ (hdrincl ? FLOWI_FLAG_KNOWN_NH : 0),
+ daddr, saddr, 0, 0, sk_uid(sk));
+
+ fl4.fl4_icmp_type = 0;
+ fl4.fl4_icmp_code = 0;
+
+ if (!hdrincl) {
+ rfv.msg = msg;
+ rfv.hlen = 0;
- security_sk_classify_flow(sk, &fl);
- err = ip_route_output_flow(&rt, &fl, sk, !(msg->msg_flags&MSG_DONTWAIT));
+ err = raw_probe_proto_opt(&rfv, &fl4);
+ if (err)
+ goto done;
}
- if (err)
+
+ security_sk_classify_flow(sk, flowi4_to_flowi_common(&fl4));
+ rt = ip_route_output_flow(net, &fl4, sk);
+ if (IS_ERR(rt)) {
+ err = PTR_ERR(rt);
+ rt = NULL;
goto done;
+ }
err = -EACCES;
if (rt->rt_flags & RTCF_BROADCAST && !sock_flag(sk, SOCK_BROADCAST))
@@ -502,20 +640,24 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
goto do_confirm;
back_from_confirm:
- if (inet->hdrincl)
- err = raw_send_hdrinc(sk, msg->msg_iov, len,
- rt, msg->msg_flags);
-
+ if (hdrincl)
+ err = raw_send_hdrinc(sk, &fl4, msg, len,
+ &rt, msg->msg_flags, &ipc.sockc);
+
else {
if (!ipc.addr)
- ipc.addr = rt->rt_dst;
+ ipc.addr = fl4.daddr;
lock_sock(sk);
- err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov, len, 0,
- &ipc, rt, msg->msg_flags);
+ err = ip_append_data(sk, &fl4, raw_getfrag,
+ &rfv, len, 0,
+ &ipc, &rt, msg->msg_flags);
if (err)
ip_flush_pending_frames(sk);
- else if (!(msg->msg_flags & MSG_MORE))
- err = ip_push_pending_frames(sk);
+ else if (!(msg->msg_flags & MSG_MORE)) {
+ err = ip_push_pending_frames(sk, &fl4);
+ if (err == -ENOBUFS && !inet_test_bit(RECVERR, sk))
+ err = 0;
+ }
release_sock(sk);
}
done:
@@ -529,7 +671,8 @@ out:
return len;
do_confirm:
- dst_confirm(&rt->u.dst);
+ if (msg->msg_flags & MSG_PROBE)
+ dst_confirm_neigh(&rt->dst, &fl4.daddr);
if (!(msg->msg_flags & MSG_PROBE) || len)
goto back_from_confirm;
err = 0;
@@ -538,35 +681,55 @@ do_confirm:
static void raw_close(struct sock *sk, long timeout)
{
- /*
- * Raw sockets may have direct kernel refereneces. Kill them.
+ /*
+ * Raw sockets may have direct kernel references. Kill them.
*/
ip_ra_control(sk, 0, NULL);
sk_common_release(sk);
}
+static void raw_destroy(struct sock *sk)
+{
+ lock_sock(sk);
+ ip_flush_pending_frames(sk);
+ release_sock(sk);
+}
+
/* This gets rid of all the nasties in af_inet. -DaveM */
-static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+static int raw_bind(struct sock *sk, struct sockaddr_unsized *uaddr,
+ int addr_len)
{
struct inet_sock *inet = inet_sk(sk);
struct sockaddr_in *addr = (struct sockaddr_in *) uaddr;
+ struct net *net = sock_net(sk);
+ u32 tb_id = RT_TABLE_LOCAL;
int ret = -EINVAL;
int chk_addr_ret;
+ lock_sock(sk);
if (sk->sk_state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_in))
goto out;
- chk_addr_ret = inet_addr_type(addr->sin_addr.s_addr);
+
+ if (sk->sk_bound_dev_if)
+ tb_id = l3mdev_fib_table_by_index(net,
+ sk->sk_bound_dev_if) ? : tb_id;
+
+ chk_addr_ret = inet_addr_type_table(net, addr->sin_addr.s_addr, tb_id);
+
ret = -EADDRNOTAVAIL;
- if (addr->sin_addr.s_addr && chk_addr_ret != RTN_LOCAL &&
- chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST)
+ if (!inet_addr_valid_or_nonlocal(net, inet, addr->sin_addr.s_addr,
+ chk_addr_ret))
goto out;
- inet->rcv_saddr = inet->saddr = addr->sin_addr.s_addr;
+
+ inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr;
if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
- inet->saddr = 0; /* Use device */
+ inet->inet_saddr = 0; /* Use device */
sk_dst_reset(sk);
ret = 0;
-out: return ret;
+out:
+ release_sock(sk);
+ return ret;
}
/*
@@ -574,27 +737,24 @@ out: return ret;
* we return it, otherwise we block.
*/
-static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
- size_t len, int noblock, int flags, int *addr_len)
+static int raw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+ int flags, int *addr_len)
{
struct inet_sock *inet = inet_sk(sk);
size_t copied = 0;
int err = -EOPNOTSUPP;
- struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name;
+ DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
struct sk_buff *skb;
if (flags & MSG_OOB)
goto out;
- if (addr_len)
- *addr_len = sizeof(*sin);
-
if (flags & MSG_ERRQUEUE) {
- err = ip_recv_error(sk, msg, len);
+ err = ip_recv_error(sk, msg, len, addr_len);
goto out;
}
- skb = skb_recv_datagram(sk, flags, noblock, &err);
+ skb = skb_recv_datagram(sk, flags, &err);
if (!skb)
goto out;
@@ -604,20 +764,21 @@ static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
copied = len;
}
- err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+ err = skb_copy_datagram_msg(skb, 0, msg, copied);
if (err)
goto done;
- sock_recv_timestamp(msg, sk, skb);
+ sock_recv_cmsgs(msg, sk, skb);
/* Copy the address. */
if (sin) {
sin->sin_family = AF_INET;
- sin->sin_addr.s_addr = skb->nh.iph->saddr;
+ sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
sin->sin_port = 0;
memset(&sin->sin_zero, 0, sizeof(sin->sin_zero));
+ *addr_len = sizeof(*sin);
}
- if (inet->cmsg_flags)
+ if (inet_cmsg_flags(inet))
ip_cmsg_recv(msg, skb);
if (flags & MSG_TRUNC)
copied = skb->len;
@@ -629,20 +790,21 @@ out:
return copied;
}
-static int raw_init(struct sock *sk)
+static int raw_sk_init(struct sock *sk)
{
struct raw_sock *rp = raw_sk(sk);
- if (inet_sk(sk)->num == IPPROTO_ICMP)
+ sk->sk_drop_counters = &rp->drop_counters;
+ if (inet_sk(sk)->inet_num == IPPROTO_ICMP)
memset(&rp->filter, 0, sizeof(rp->filter));
return 0;
}
-static int raw_seticmpfilter(struct sock *sk, char __user *optval, int optlen)
+static int raw_seticmpfilter(struct sock *sk, sockptr_t optval, int optlen)
{
if (optlen > sizeof(struct icmp_filter))
optlen = sizeof(struct icmp_filter);
- if (copy_from_user(&raw_sk(sk)->filter, optval, optlen))
+ if (copy_from_sockptr(&raw_sk(sk)->filter, optval, optlen))
return -EFAULT;
return 0;
}
@@ -666,11 +828,11 @@ static int raw_geticmpfilter(struct sock *sk, char __user *optval, int __user *o
out: return ret;
}
-static int do_raw_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int optlen)
+static int do_raw_setsockopt(struct sock *sk, int optname,
+ sockptr_t optval, unsigned int optlen)
{
if (optname == ICMP_FILTER) {
- if (inet_sk(sk)->num != IPPROTO_ICMP)
+ if (inet_sk(sk)->inet_num != IPPROTO_ICMP)
return -EOPNOTSUPP;
else
return raw_seticmpfilter(sk, optval, optlen);
@@ -679,28 +841,18 @@ static int do_raw_setsockopt(struct sock *sk, int level, int optname,
}
static int raw_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int optlen)
+ sockptr_t optval, unsigned int optlen)
{
if (level != SOL_RAW)
return ip_setsockopt(sk, level, optname, optval, optlen);
- return do_raw_setsockopt(sk, level, optname, optval, optlen);
+ return do_raw_setsockopt(sk, optname, optval, optlen);
}
-#ifdef CONFIG_COMPAT
-static int compat_raw_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int optlen)
-{
- if (level != SOL_RAW)
- return compat_ip_setsockopt(sk, level, optname, optval, optlen);
- return do_raw_setsockopt(sk, level, optname, optval, optlen);
-}
-#endif
-
-static int do_raw_getsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int __user *optlen)
+static int do_raw_getsockopt(struct sock *sk, int optname,
+ char __user *optval, int __user *optlen)
{
if (optname == ICMP_FILTER) {
- if (inet_sk(sk)->num != IPPROTO_ICMP)
+ if (inet_sk(sk)->inet_num != IPPROTO_ICMP)
return -EOPNOTSUPP;
else
return raw_geticmpfilter(sk, optval, optlen);
@@ -713,114 +865,132 @@ static int raw_getsockopt(struct sock *sk, int level, int optname,
{
if (level != SOL_RAW)
return ip_getsockopt(sk, level, optname, optval, optlen);
- return do_raw_getsockopt(sk, level, optname, optval, optlen);
+ return do_raw_getsockopt(sk, optname, optval, optlen);
}
-#ifdef CONFIG_COMPAT
-static int compat_raw_getsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int __user *optlen)
+static int raw_ioctl(struct sock *sk, int cmd, int *karg)
{
- if (level != SOL_RAW)
- return compat_ip_getsockopt(sk, level, optname, optval, optlen);
- return do_raw_getsockopt(sk, level, optname, optval, optlen);
-}
+ switch (cmd) {
+ case SIOCOUTQ: {
+ *karg = sk_wmem_alloc_get(sk);
+ return 0;
+ }
+ case SIOCINQ: {
+ struct sk_buff *skb;
+
+ spin_lock_bh(&sk->sk_receive_queue.lock);
+ skb = skb_peek(&sk->sk_receive_queue);
+ if (skb)
+ *karg = skb->len;
+ else
+ *karg = 0;
+ spin_unlock_bh(&sk->sk_receive_queue.lock);
+ return 0;
+ }
+
+ default:
+#ifdef CONFIG_IP_MROUTE
+ return ipmr_ioctl(sk, cmd, karg);
+#else
+ return -ENOIOCTLCMD;
#endif
+ }
+}
-static int raw_ioctl(struct sock *sk, int cmd, unsigned long arg)
+#ifdef CONFIG_COMPAT
+static int compat_raw_ioctl(struct sock *sk, unsigned int cmd, unsigned long arg)
{
switch (cmd) {
- case SIOCOUTQ: {
- int amount = atomic_read(&sk->sk_wmem_alloc);
- return put_user(amount, (int __user *)arg);
- }
- case SIOCINQ: {
- struct sk_buff *skb;
- int amount = 0;
-
- spin_lock_bh(&sk->sk_receive_queue.lock);
- skb = skb_peek(&sk->sk_receive_queue);
- if (skb != NULL)
- amount = skb->len;
- spin_unlock_bh(&sk->sk_receive_queue.lock);
- return put_user(amount, (int __user *)arg);
- }
-
- default:
+ case SIOCOUTQ:
+ case SIOCINQ:
+ return -ENOIOCTLCMD;
+ default:
#ifdef CONFIG_IP_MROUTE
- return ipmr_ioctl(sk, cmd, (void __user *)arg);
+ return ipmr_compat_ioctl(sk, cmd, compat_ptr(arg));
#else
- return -ENOIOCTLCMD;
+ return -ENOIOCTLCMD;
#endif
}
}
+#endif
+
+int raw_abort(struct sock *sk, int err)
+{
+ lock_sock(sk);
+
+ sk->sk_err = err;
+ sk_error_report(sk);
+ __udp_disconnect(sk, 0);
+
+ release_sock(sk);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(raw_abort);
struct proto raw_prot = {
.name = "RAW",
.owner = THIS_MODULE,
.close = raw_close,
+ .destroy = raw_destroy,
.connect = ip4_datagram_connect,
- .disconnect = udp_disconnect,
+ .disconnect = __udp_disconnect,
.ioctl = raw_ioctl,
- .init = raw_init,
+ .init = raw_sk_init,
.setsockopt = raw_setsockopt,
.getsockopt = raw_getsockopt,
.sendmsg = raw_sendmsg,
.recvmsg = raw_recvmsg,
.bind = raw_bind,
.backlog_rcv = raw_rcv_skb,
- .hash = raw_v4_hash,
- .unhash = raw_v4_unhash,
+ .release_cb = ip4_datagram_release_cb,
+ .hash = raw_hash_sk,
+ .unhash = raw_unhash_sk,
.obj_size = sizeof(struct raw_sock),
+ .useroffset = offsetof(struct raw_sock, filter),
+ .usersize = sizeof_field(struct raw_sock, filter),
+ .h.raw_hash = &raw_v4_hashinfo,
#ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_raw_setsockopt,
- .compat_getsockopt = compat_raw_getsockopt,
+ .compat_ioctl = compat_raw_ioctl,
#endif
+ .diag_destroy = raw_abort,
};
#ifdef CONFIG_PROC_FS
-struct raw_iter_state {
- int bucket;
-};
-
-#define raw_seq_private(seq) ((struct raw_iter_state *)(seq)->private)
-
-static struct sock *raw_get_first(struct seq_file *seq)
+static struct sock *raw_get_first(struct seq_file *seq, int bucket)
{
+ struct raw_hashinfo *h = pde_data(file_inode(seq->file));
+ struct raw_iter_state *state = raw_seq_private(seq);
+ struct hlist_head *hlist;
struct sock *sk;
- struct raw_iter_state* state = raw_seq_private(seq);
- for (state->bucket = 0; state->bucket < RAWV4_HTABLE_SIZE; ++state->bucket) {
- struct hlist_node *node;
-
- sk_for_each(sk, node, &raw_v4_htable[state->bucket])
- if (sk->sk_family == PF_INET)
- goto found;
+ for (state->bucket = bucket; state->bucket < RAW_HTABLE_SIZE;
+ ++state->bucket) {
+ hlist = &h->ht[state->bucket];
+ sk_for_each(sk, hlist) {
+ if (sock_net(sk) == seq_file_net(seq))
+ return sk;
+ }
}
- sk = NULL;
-found:
- return sk;
+ return NULL;
}
static struct sock *raw_get_next(struct seq_file *seq, struct sock *sk)
{
- struct raw_iter_state* state = raw_seq_private(seq);
+ struct raw_iter_state *state = raw_seq_private(seq);
do {
sk = sk_next(sk);
-try_again:
- ;
- } while (sk && sk->sk_family != PF_INET);
+ } while (sk && sock_net(sk) != seq_file_net(seq));
- if (!sk && ++state->bucket < RAWV4_HTABLE_SIZE) {
- sk = sk_head(&raw_v4_htable[state->bucket]);
- goto try_again;
- }
+ if (!sk)
+ return raw_get_first(seq, state->bucket + 1);
return sk;
}
static struct sock *raw_get_idx(struct seq_file *seq, loff_t pos)
{
- struct sock *sk = raw_get_first(seq);
+ struct sock *sk = raw_get_first(seq, 0);
if (sk)
while (pos && (sk = raw_get_next(seq, sk)) != NULL)
@@ -828,111 +998,127 @@ static struct sock *raw_get_idx(struct seq_file *seq, loff_t pos)
return pos ? NULL : sk;
}
-static void *raw_seq_start(struct seq_file *seq, loff_t *pos)
+void *raw_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(&h->lock)
{
- read_lock(&raw_v4_lock);
+ struct raw_hashinfo *h = pde_data(file_inode(seq->file));
+
+ spin_lock(&h->lock);
+
return *pos ? raw_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}
+EXPORT_SYMBOL_GPL(raw_seq_start);
-static void *raw_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+void *raw_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct sock *sk;
if (v == SEQ_START_TOKEN)
- sk = raw_get_first(seq);
+ sk = raw_get_first(seq, 0);
else
sk = raw_get_next(seq, v);
++*pos;
return sk;
}
+EXPORT_SYMBOL_GPL(raw_seq_next);
-static void raw_seq_stop(struct seq_file *seq, void *v)
+void raw_seq_stop(struct seq_file *seq, void *v)
+ __releases(&h->lock)
{
- read_unlock(&raw_v4_lock);
+ struct raw_hashinfo *h = pde_data(file_inode(seq->file));
+
+ spin_unlock(&h->lock);
}
+EXPORT_SYMBOL_GPL(raw_seq_stop);
-static __inline__ char *get_raw_sock(struct sock *sp, char *tmpbuf, int i)
+static void raw_sock_seq_show(struct seq_file *seq, struct sock *sp, int i)
{
struct inet_sock *inet = inet_sk(sp);
- __be32 dest = inet->daddr,
- src = inet->rcv_saddr;
+ __be32 dest = inet->inet_daddr,
+ src = inet->inet_rcv_saddr;
__u16 destp = 0,
- srcp = inet->num;
-
- sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
- " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p",
- i, src, srcp, dest, destp, sp->sk_state,
- atomic_read(&sp->sk_wmem_alloc),
- atomic_read(&sp->sk_rmem_alloc),
- 0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp),
- atomic_read(&sp->sk_refcnt), sp);
- return tmpbuf;
+ srcp = inet->inet_num;
+
+ seq_printf(seq, "%4d: %08X:%04X %08X:%04X"
+ " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %u\n",
+ i, src, srcp, dest, destp, sp->sk_state,
+ sk_wmem_alloc_get(sp),
+ sk_rmem_alloc_get(sp),
+ 0, 0L, 0,
+ from_kuid_munged(seq_user_ns(seq), sk_uid(sp)),
+ 0, sock_i_ino(sp),
+ refcount_read(&sp->sk_refcnt), sp, sk_drops_read(sp));
}
static int raw_seq_show(struct seq_file *seq, void *v)
{
- char tmpbuf[129];
-
if (v == SEQ_START_TOKEN)
- seq_printf(seq, "%-127s\n",
- " sl local_address rem_address st tx_queue "
- "rx_queue tr tm->when retrnsmt uid timeout "
- "inode");
- else {
- struct raw_iter_state *state = raw_seq_private(seq);
-
- seq_printf(seq, "%-127s\n",
- get_raw_sock(v, tmpbuf, state->bucket));
- }
+ seq_printf(seq, " sl local_address rem_address st tx_queue "
+ "rx_queue tr tm->when retrnsmt uid timeout "
+ "inode ref pointer drops\n");
+ else
+ raw_sock_seq_show(seq, v, raw_seq_private(seq)->bucket);
return 0;
}
-static struct seq_operations raw_seq_ops = {
+static const struct seq_operations raw_seq_ops = {
.start = raw_seq_start,
.next = raw_seq_next,
.stop = raw_seq_stop,
.show = raw_seq_show,
};
-static int raw_seq_open(struct inode *inode, struct file *file)
+static __net_init int raw_init_net(struct net *net)
{
- struct seq_file *seq;
- int rc = -ENOMEM;
- struct raw_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
+ if (!proc_create_net_data("raw", 0444, net->proc_net, &raw_seq_ops,
+ sizeof(struct raw_iter_state), &raw_v4_hashinfo))
+ return -ENOMEM;
- if (!s)
- goto out;
- rc = seq_open(file, &raw_seq_ops);
- if (rc)
- goto out_kfree;
+ return 0;
+}
- seq = file->private_data;
- seq->private = s;
- memset(s, 0, sizeof(*s));
-out:
- return rc;
-out_kfree:
- kfree(s);
- goto out;
-}
-
-static struct file_operations raw_seq_fops = {
- .owner = THIS_MODULE,
- .open = raw_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release_private,
+static __net_exit void raw_exit_net(struct net *net)
+{
+ remove_proc_entry("raw", net->proc_net);
+}
+
+static __net_initdata struct pernet_operations raw_net_ops = {
+ .init = raw_init_net,
+ .exit = raw_exit_net,
};
int __init raw_proc_init(void)
{
- if (!proc_net_fops_create("raw", S_IRUGO, &raw_seq_fops))
- return -ENOMEM;
- return 0;
+
+ return register_pernet_subsys(&raw_net_ops);
}
void __init raw_proc_exit(void)
{
- proc_net_remove("raw");
+ unregister_pernet_subsys(&raw_net_ops);
}
#endif /* CONFIG_PROC_FS */
+
+static void raw_sysctl_init_net(struct net *net)
+{
+#ifdef CONFIG_NET_L3_MASTER_DEV
+ net->ipv4.sysctl_raw_l3mdev_accept = 1;
+#endif
+}
+
+static int __net_init raw_sysctl_init(struct net *net)
+{
+ raw_sysctl_init_net(net);
+ return 0;
+}
+
+static struct pernet_operations __net_initdata raw_sysctl_ops = {
+ .init = raw_sysctl_init,
+};
+
+void __init raw_init(void)
+{
+ raw_sysctl_init_net(&init_net);
+ if (register_pernet_subsys(&raw_sysctl_ops))
+ panic("RAW: failed to init sysctl parameters.\n");
+}
diff --git a/net/ipv4/raw_diag.c b/net/ipv4/raw_diag.c
new file mode 100644
index 000000000000..943e5998e0ad
--- /dev/null
+++ b/net/ipv4/raw_diag.c
@@ -0,0 +1,259 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/module.h>
+
+#include <linux/inet_diag.h>
+#include <linux/sock_diag.h>
+
+#include <net/inet_sock.h>
+#include <net/raw.h>
+#include <net/rawv6.h>
+
+#ifdef pr_fmt
+# undef pr_fmt
+#endif
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+static struct raw_hashinfo *
+raw_get_hashinfo(const struct inet_diag_req_v2 *r)
+{
+ if (r->sdiag_family == AF_INET) {
+ return &raw_v4_hashinfo;
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (r->sdiag_family == AF_INET6) {
+ return &raw_v6_hashinfo;
+#endif
+ } else {
+ return ERR_PTR(-EINVAL);
+ }
+}
+
+/*
+ * Due to requirement of not breaking user API we can't simply
+ * rename @pad field in inet_diag_req_v2 structure, instead
+ * use helper to figure it out.
+ */
+
+static bool raw_lookup(struct net *net, const struct sock *sk,
+ const struct inet_diag_req_v2 *req)
+{
+ struct inet_diag_req_raw *r = (void *)req;
+
+ if (r->sdiag_family == AF_INET)
+ return raw_v4_match(net, sk, r->sdiag_raw_protocol,
+ r->id.idiag_dst[0],
+ r->id.idiag_src[0],
+ r->id.idiag_if, 0);
+#if IS_ENABLED(CONFIG_IPV6)
+ else
+ return raw_v6_match(net, sk, r->sdiag_raw_protocol,
+ (const struct in6_addr *)r->id.idiag_src,
+ (const struct in6_addr *)r->id.idiag_dst,
+ r->id.idiag_if, 0);
+#endif
+ return false;
+}
+
+static struct sock *raw_sock_get(struct net *net, const struct inet_diag_req_v2 *r)
+{
+ struct raw_hashinfo *hashinfo = raw_get_hashinfo(r);
+ struct hlist_head *hlist;
+ struct sock *sk;
+ int slot;
+
+ if (IS_ERR(hashinfo))
+ return ERR_CAST(hashinfo);
+
+ rcu_read_lock();
+ for (slot = 0; slot < RAW_HTABLE_SIZE; slot++) {
+ hlist = &hashinfo->ht[slot];
+ sk_for_each_rcu(sk, hlist) {
+ if (raw_lookup(net, sk, r)) {
+ /*
+ * Grab it and keep until we fill
+ * diag message to be reported, so
+ * caller should call sock_put then.
+ */
+ if (refcount_inc_not_zero(&sk->sk_refcnt))
+ goto out_unlock;
+ }
+ }
+ }
+ sk = ERR_PTR(-ENOENT);
+out_unlock:
+ rcu_read_unlock();
+
+ return sk;
+}
+
+static int raw_diag_dump_one(struct netlink_callback *cb,
+ const struct inet_diag_req_v2 *r)
+{
+ struct sk_buff *in_skb = cb->skb;
+ struct sk_buff *rep;
+ struct sock *sk;
+ struct net *net;
+ int err;
+
+ net = sock_net(in_skb->sk);
+ sk = raw_sock_get(net, r);
+ if (IS_ERR(sk))
+ return PTR_ERR(sk);
+
+ rep = nlmsg_new(nla_total_size(sizeof(struct inet_diag_msg)) +
+ inet_diag_msg_attrs_size() +
+ nla_total_size(sizeof(struct inet_diag_meminfo)) + 64,
+ GFP_KERNEL);
+ if (!rep) {
+ sock_put(sk);
+ return -ENOMEM;
+ }
+
+ err = inet_sk_diag_fill(sk, NULL, rep, cb, r, 0,
+ netlink_net_capable(in_skb, CAP_NET_ADMIN));
+ sock_put(sk);
+
+ if (err < 0) {
+ kfree_skb(rep);
+ return err;
+ }
+
+ err = nlmsg_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid);
+
+ return err;
+}
+
+static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
+ struct netlink_callback *cb,
+ const struct inet_diag_req_v2 *r,
+ bool net_admin)
+{
+ if (!inet_diag_bc_sk(cb->data, sk))
+ return 0;
+
+ return inet_sk_diag_fill(sk, NULL, skb, cb, r, NLM_F_MULTI, net_admin);
+}
+
+static void raw_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
+ const struct inet_diag_req_v2 *r)
+{
+ bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
+ struct raw_hashinfo *hashinfo = raw_get_hashinfo(r);
+ struct net *net = sock_net(skb->sk);
+ int num, s_num, slot, s_slot;
+ struct hlist_head *hlist;
+ struct sock *sk = NULL;
+
+ if (IS_ERR(hashinfo))
+ return;
+
+ s_slot = cb->args[0];
+ num = s_num = cb->args[1];
+
+ rcu_read_lock();
+ for (slot = s_slot; slot < RAW_HTABLE_SIZE; s_num = 0, slot++) {
+ num = 0;
+
+ hlist = &hashinfo->ht[slot];
+ sk_for_each_rcu(sk, hlist) {
+ struct inet_sock *inet = inet_sk(sk);
+
+ if (!net_eq(sock_net(sk), net))
+ continue;
+ if (num < s_num)
+ goto next;
+ if (sk->sk_family != r->sdiag_family)
+ goto next;
+ if (r->id.idiag_sport != inet->inet_sport &&
+ r->id.idiag_sport)
+ goto next;
+ if (r->id.idiag_dport != inet->inet_dport &&
+ r->id.idiag_dport)
+ goto next;
+ if (sk_diag_dump(sk, skb, cb, r, net_admin) < 0)
+ goto out_unlock;
+next:
+ num++;
+ }
+ }
+
+out_unlock:
+ rcu_read_unlock();
+
+ cb->args[0] = slot;
+ cb->args[1] = num;
+}
+
+static void raw_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
+ void *info)
+{
+ r->idiag_rqueue = sk_rmem_alloc_get(sk);
+ r->idiag_wqueue = sk_wmem_alloc_get(sk);
+}
+
+#ifdef CONFIG_INET_DIAG_DESTROY
+static int raw_diag_destroy(struct sk_buff *in_skb,
+ const struct inet_diag_req_v2 *r)
+{
+ struct net *net = sock_net(in_skb->sk);
+ struct sock *sk;
+ int err;
+
+ sk = raw_sock_get(net, r);
+ if (IS_ERR(sk))
+ return PTR_ERR(sk);
+ err = sock_diag_destroy(sk, ECONNABORTED);
+ sock_put(sk);
+ return err;
+}
+#endif
+
+static const struct inet_diag_handler raw_diag_handler = {
+ .owner = THIS_MODULE,
+ .dump = raw_diag_dump,
+ .dump_one = raw_diag_dump_one,
+ .idiag_get_info = raw_diag_get_info,
+ .idiag_type = IPPROTO_RAW,
+ .idiag_info_size = 0,
+#ifdef CONFIG_INET_DIAG_DESTROY
+ .destroy = raw_diag_destroy,
+#endif
+};
+
+static void __always_unused __check_inet_diag_req_raw(void)
+{
+ /*
+ * Make sure the two structures are identical,
+ * except the @pad field.
+ */
+#define __offset_mismatch(m1, m2) \
+ (offsetof(struct inet_diag_req_v2, m1) != \
+ offsetof(struct inet_diag_req_raw, m2))
+
+ BUILD_BUG_ON(sizeof(struct inet_diag_req_v2) !=
+ sizeof(struct inet_diag_req_raw));
+ BUILD_BUG_ON(__offset_mismatch(sdiag_family, sdiag_family));
+ BUILD_BUG_ON(__offset_mismatch(sdiag_protocol, sdiag_protocol));
+ BUILD_BUG_ON(__offset_mismatch(idiag_ext, idiag_ext));
+ BUILD_BUG_ON(__offset_mismatch(pad, sdiag_raw_protocol));
+ BUILD_BUG_ON(__offset_mismatch(idiag_states, idiag_states));
+ BUILD_BUG_ON(__offset_mismatch(id, id));
+#undef __offset_mismatch
+}
+
+static int __init raw_diag_init(void)
+{
+ return inet_diag_register(&raw_diag_handler);
+}
+
+static void __exit raw_diag_exit(void)
+{
+ inet_diag_unregister(&raw_diag_handler);
+}
+
+module_init(raw_diag_init);
+module_exit(raw_diag_exit);
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("RAW socket monitoring via SOCK_DIAG");
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-255 /* AF_INET - IPPROTO_RAW */);
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 10-255 /* AF_INET6 - IPPROTO_RAW */);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 9f3924c4905e..b549d6a57307 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -5,8 +6,6 @@
*
* ROUTE - implementation of the IP router.
*
- * Version: $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
- *
* Authors: Ross Biro
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
* Alan Cox, <gw4pts@gw4pts.ampr.org>
@@ -20,9 +19,9 @@
* (rco@di.uminho.pt) Routing table insertion and update
* Linus Torvalds : Rewrote bits to be sensible
* Alan Cox : Added BSD route gw semantics
- * Alan Cox : Super /proc >4K
+ * Alan Cox : Super /proc >4K
* Alan Cox : MTU in route table
- * Alan Cox : MSS actually. Also added the window
+ * Alan Cox : MSS actually. Also added the window
* clamper.
* Sam Lantinga : Fixed route matching in rt_del()
* Alan Cox : Routing cache support.
@@ -38,11 +37,11 @@
* Alan Cox : Faster /proc handling
* Alexey Kuznetsov : Massive rework to support tree based routing,
* routing caches and better behaviour.
- *
+ *
* Olaf Erb : irtt wasn't being copied right.
* Bjorn Ekwall : Kerneld route support.
* Alan Cox : Multicast fixed (I hope)
- * Pavel Krauz : Limited broadcast fixed
+ * Pavel Krauz : Limited broadcast fixed
* Mike McLagan : Routing by source
* Alexey Kuznetsov : End of old history. Split to fib.c and
* route.c and rewritten from scratch.
@@ -55,27 +54,18 @@
* Robert Olsson : Added rt_cache statistics
* Arnaldo C. Melo : Convert proc stuff to seq_file
* Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
- * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
- * Ilia Sotnikov : Removed TOS from hash calculations
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
+ * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
+ * Ilia Sotnikov : Removed TOS from hash calculations
*/
+#define pr_fmt(fmt) "IPv4: " fmt
+
#include <linux/module.h>
-#include <asm/uaccess.h>
-#include <asm/system.h>
#include <linux/bitops.h>
-#include <linux/types.h>
#include <linux/kernel.h>
-#include <linux/sched.h>
#include <linux/mm.h>
-#include <linux/bootmem.h>
-#include <linux/string.h>
+#include <linux/memblock.h>
#include <linux/socket.h>
-#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
@@ -83,95 +73,105 @@
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
-#include <linux/rtnetlink.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
-#include <linux/jhash.h>
#include <linux/rcupdate.h>
-#include <linux/times.h>
-#include <net/protocol.h>
+#include <linux/slab.h>
+#include <linux/jhash.h>
+#include <net/dst.h>
+#include <net/dst_metadata.h>
+#include <net/flow.h>
+#include <net/inet_dscp.h>
+#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
-#include <net/arp.h>
+#include <net/nexthop.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
-#include <net/ip_mp_alg.h>
+#include <net/lwtunnel.h>
#include <net/netevent.h>
+#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
+#include <net/secure_seq.h>
+#include <net/ip_tunnels.h>
-#define RT_FL_TOS(oldflp) \
- ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
-
-#define IP_MAX_MTU 0xFFF0
+#include "fib_lookup.h"
#define RT_GC_TIMEOUT (300*HZ)
-static int ip_rt_min_delay = 2 * HZ;
-static int ip_rt_max_delay = 10 * HZ;
+#define DEFAULT_MIN_PMTU (512 + 20 + 20)
+#define DEFAULT_MTU_EXPIRES (10 * 60 * HZ)
+#define DEFAULT_MIN_ADVMSS 256
static int ip_rt_max_size;
-static int ip_rt_gc_timeout = RT_GC_TIMEOUT;
-static int ip_rt_gc_interval = 60 * HZ;
-static int ip_rt_gc_min_interval = HZ / 2;
-static int ip_rt_redirect_number = 9;
-static int ip_rt_redirect_load = HZ / 50;
-static int ip_rt_redirect_silence = ((HZ / 50) << (9 + 1));
-static int ip_rt_error_cost = HZ;
-static int ip_rt_error_burst = 5 * HZ;
-static int ip_rt_gc_elasticity = 8;
-static int ip_rt_mtu_expires = 10 * 60 * HZ;
-static int ip_rt_min_pmtu = 512 + 20 + 20;
-static int ip_rt_min_advmss = 256;
-static int ip_rt_secret_interval = 10 * 60 * HZ;
-static unsigned long rt_deadline;
-
-#define RTprint(a...) printk(KERN_DEBUG a)
-
-static struct timer_list rt_flush_timer;
-static struct timer_list rt_periodic_timer;
-static struct timer_list rt_secret_timer;
+static int ip_rt_redirect_number __read_mostly = 9;
+static int ip_rt_redirect_load __read_mostly = HZ / 50;
+static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
+static int ip_rt_error_cost __read_mostly = HZ;
+static int ip_rt_error_burst __read_mostly = 5 * HZ;
+
+static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
/*
* Interface to generic destination cache.
*/
-static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
-static void ipv4_dst_destroy(struct dst_entry *dst);
-static void ipv4_dst_ifdown(struct dst_entry *dst,
- struct net_device *dev, int how);
-static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
+INDIRECT_CALLABLE_SCOPE
+struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
+static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
+INDIRECT_CALLABLE_SCOPE
+unsigned int ipv4_mtu(const struct dst_entry *dst);
+static void ipv4_negative_advice(struct sock *sk,
+ struct dst_entry *dst);
static void ipv4_link_failure(struct sk_buff *skb);
-static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
-static int rt_garbage_collect(void);
+static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
+ struct sk_buff *skb, u32 mtu,
+ bool confirm_neigh);
+static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
+ struct sk_buff *skb);
+static void ipv4_dst_destroy(struct dst_entry *dst);
+
+static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
+{
+ WARN_ON(1);
+ return NULL;
+}
+static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
+ struct sk_buff *skb,
+ const void *daddr);
+static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
static struct dst_ops ipv4_dst_ops = {
.family = AF_INET,
- .protocol = __constant_htons(ETH_P_IP),
- .gc = rt_garbage_collect,
.check = ipv4_dst_check,
+ .default_advmss = ipv4_default_advmss,
+ .mtu = ipv4_mtu,
+ .cow_metrics = ipv4_cow_metrics,
.destroy = ipv4_dst_destroy,
- .ifdown = ipv4_dst_ifdown,
.negative_advice = ipv4_negative_advice,
.link_failure = ipv4_link_failure,
.update_pmtu = ip_rt_update_pmtu,
- .entry_size = sizeof(struct rtable),
+ .redirect = ip_do_redirect,
+ .local_out = __ip_local_out,
+ .neigh_lookup = ipv4_neigh_lookup,
+ .confirm_neigh = ipv4_confirm_neigh,
};
#define ECN_OR_COST(class) TC_PRIO_##class
-__u8 ip_tos2prio[16] = {
+const __u8 ip_tos2prio[16] = {
TC_PRIO_BESTEFFORT,
- ECN_OR_COST(FILLER),
+ ECN_OR_COST(BESTEFFORT),
TC_PRIO_BESTEFFORT,
ECN_OR_COST(BESTEFFORT),
TC_PRIO_BULK,
@@ -187,150 +187,31 @@ __u8 ip_tos2prio[16] = {
TC_PRIO_INTERACTIVE_BULK,
ECN_OR_COST(INTERACTIVE_BULK)
};
+EXPORT_SYMBOL(ip_tos2prio);
-
-/*
- * Route cache.
- */
-
-/* The locking scheme is rather straight forward:
- *
- * 1) Read-Copy Update protects the buckets of the central route hash.
- * 2) Only writers remove entries, and they hold the lock
- * as they look at rtable reference counts.
- * 3) Only readers acquire references to rtable entries,
- * they do so with atomic increments and with the
- * lock held.
- */
-
-struct rt_hash_bucket {
- struct rtable *chain;
-};
-#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
- defined(CONFIG_PROVE_LOCKING)
-/*
- * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
- * The size of this table is a power of two and depends on the number of CPUS.
- * (on lockdep we have a quite big spinlock_t, so keep the size down there)
- */
-#ifdef CONFIG_LOCKDEP
-# define RT_HASH_LOCK_SZ 256
-#else
-# if NR_CPUS >= 32
-# define RT_HASH_LOCK_SZ 4096
-# elif NR_CPUS >= 16
-# define RT_HASH_LOCK_SZ 2048
-# elif NR_CPUS >= 8
-# define RT_HASH_LOCK_SZ 1024
-# elif NR_CPUS >= 4
-# define RT_HASH_LOCK_SZ 512
-# else
-# define RT_HASH_LOCK_SZ 256
-# endif
-#endif
-
-static spinlock_t *rt_hash_locks;
-# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
-# define rt_hash_lock_init() { \
- int i; \
- rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
- if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
- for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
- spin_lock_init(&rt_hash_locks[i]); \
- }
+static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
+#ifndef CONFIG_PREEMPT_RT
+#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
#else
-# define rt_hash_lock_addr(slot) NULL
-# define rt_hash_lock_init()
+#define RT_CACHE_STAT_INC(field) this_cpu_inc(rt_cache_stat.field)
#endif
-static struct rt_hash_bucket *rt_hash_table;
-static unsigned rt_hash_mask;
-static int rt_hash_log;
-static unsigned int rt_hash_rnd;
-
-static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
-#define RT_CACHE_STAT_INC(field) \
- (__raw_get_cpu_var(rt_cache_stat).field++)
-
-static int rt_intern_hash(unsigned hash, struct rtable *rth,
- struct rtable **res);
-
-static unsigned int rt_hash_code(u32 daddr, u32 saddr)
-{
- return (jhash_2words(daddr, saddr, rt_hash_rnd)
- & rt_hash_mask);
-}
-
-#define rt_hash(daddr, saddr, idx) \
- rt_hash_code((__force u32)(__be32)(daddr),\
- (__force u32)(__be32)(saddr) ^ ((idx) << 5))
-
#ifdef CONFIG_PROC_FS
-struct rt_cache_iter_state {
- int bucket;
-};
-
-static struct rtable *rt_cache_get_first(struct seq_file *seq)
-{
- struct rtable *r = NULL;
- struct rt_cache_iter_state *st = seq->private;
-
- for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
- rcu_read_lock_bh();
- r = rt_hash_table[st->bucket].chain;
- if (r)
- break;
- rcu_read_unlock_bh();
- }
- return r;
-}
-
-static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
-{
- struct rt_cache_iter_state *st = rcu_dereference(seq->private);
-
- r = r->u.rt_next;
- while (!r) {
- rcu_read_unlock_bh();
- if (--st->bucket < 0)
- break;
- rcu_read_lock_bh();
- r = rt_hash_table[st->bucket].chain;
- }
- return r;
-}
-
-static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
-{
- struct rtable *r = rt_cache_get_first(seq);
-
- if (r)
- while (pos && (r = rt_cache_get_next(seq, r)))
- --pos;
- return pos ? NULL : r;
-}
-
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
- return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
+ if (*pos)
+ return NULL;
+ return SEQ_START_TOKEN;
}
static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
- struct rtable *r = NULL;
-
- if (v == SEQ_START_TOKEN)
- r = rt_cache_get_first(seq);
- else
- r = rt_cache_get_next(seq, v);
++*pos;
- return r;
+ return NULL;
}
static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
- if (v && v != SEQ_START_TOKEN)
- rcu_read_unlock_bh();
}
static int rt_cache_seq_show(struct seq_file *seq, void *v)
@@ -340,68 +221,16 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v)
"Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
"Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
"HHUptod\tSpecDst");
- else {
- struct rtable *r = v;
- char temp[256];
-
- sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
- "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
- r->u.dst.dev ? r->u.dst.dev->name : "*",
- (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
- r->rt_flags, atomic_read(&r->u.dst.__refcnt),
- r->u.dst.__use, 0, (unsigned long)r->rt_src,
- (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
- (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
- dst_metric(&r->u.dst, RTAX_WINDOW),
- (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
- dst_metric(&r->u.dst, RTAX_RTTVAR)),
- r->fl.fl4_tos,
- r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
- r->u.dst.hh ? (r->u.dst.hh->hh_output ==
- dev_queue_xmit) : 0,
- r->rt_spec_dst);
- seq_printf(seq, "%-127s\n", temp);
- }
- return 0;
-}
-
-static struct seq_operations rt_cache_seq_ops = {
+ return 0;
+}
+
+static const struct seq_operations rt_cache_seq_ops = {
.start = rt_cache_seq_start,
.next = rt_cache_seq_next,
.stop = rt_cache_seq_stop,
.show = rt_cache_seq_show,
};
-static int rt_cache_seq_open(struct inode *inode, struct file *file)
-{
- struct seq_file *seq;
- int rc = -ENOMEM;
- struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
-
- if (!s)
- goto out;
- rc = seq_open(file, &rt_cache_seq_ops);
- if (rc)
- goto out_kfree;
- seq = file->private_data;
- seq->private = s;
- memset(s, 0, sizeof(*s));
-out:
- return rc;
-out_kfree:
- kfree(s);
- goto out;
-}
-
-static struct file_operations rt_cache_seq_fops = {
- .owner = THIS_MODULE,
- .open = rt_cache_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release_private,
-};
-
-
static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
int cpu;
@@ -409,7 +238,7 @@ static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
if (*pos == 0)
return SEQ_START_TOKEN;
- for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
+ for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
if (!cpu_possible(cpu))
continue;
*pos = cpu+1;
@@ -422,14 +251,15 @@ static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
int cpu;
- for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
+ for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
if (!cpu_possible(cpu))
continue;
*pos = cpu+1;
return &per_cpu(rt_cache_stat, cpu);
}
+ (*pos)++;
return NULL;
-
+
}
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
@@ -442,14 +272,15 @@ static int rt_cpu_seq_show(struct seq_file *seq, void *v)
struct rt_cache_stat *st = v;
if (v == SEQ_START_TOKEN) {
- seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
+ seq_puts(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
return 0;
}
-
- seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
- " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
- atomic_read(&ipv4_dst_ops.entries),
- st->in_hit,
+
+ seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x "
+ "%08x %08x %08x %08x %08x %08x "
+ "%08x %08x %08x %08x\n",
+ dst_entries_get_slow(&ipv4_dst_ops),
+ 0, /* st->in_hit */
st->in_slow_tot,
st->in_slow_mc,
st->in_no_route,
@@ -457,690 +288,492 @@ static int rt_cpu_seq_show(struct seq_file *seq, void *v)
st->in_martian_dst,
st->in_martian_src,
- st->out_hit,
+ 0, /* st->out_hit */
st->out_slow_tot,
- st->out_slow_mc,
-
- st->gc_total,
- st->gc_ignored,
- st->gc_goal_miss,
- st->gc_dst_overflow,
- st->in_hlist_search,
- st->out_hlist_search
+ st->out_slow_mc,
+
+ 0, /* st->gc_total */
+ 0, /* st->gc_ignored */
+ 0, /* st->gc_goal_miss */
+ 0, /* st->gc_dst_overflow */
+ 0, /* st->in_hlist_search */
+ 0 /* st->out_hlist_search */
);
return 0;
}
-static struct seq_operations rt_cpu_seq_ops = {
+static const struct seq_operations rt_cpu_seq_ops = {
.start = rt_cpu_seq_start,
.next = rt_cpu_seq_next,
.stop = rt_cpu_seq_stop,
.show = rt_cpu_seq_show,
};
-
-static int rt_cpu_seq_open(struct inode *inode, struct file *file)
+#ifdef CONFIG_IP_ROUTE_CLASSID
+static int rt_acct_proc_show(struct seq_file *m, void *v)
{
- return seq_open(file, &rt_cpu_seq_ops);
-}
-
-static struct file_operations rt_cpu_seq_fops = {
- .owner = THIS_MODULE,
- .open = rt_cpu_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
+ struct ip_rt_acct *dst, *src;
+ unsigned int i, j;
-#endif /* CONFIG_PROC_FS */
-
-static __inline__ void rt_free(struct rtable *rt)
-{
- multipath_remove(rt);
- call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
-}
+ dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
+ if (!dst)
+ return -ENOMEM;
-static __inline__ void rt_drop(struct rtable *rt)
-{
- multipath_remove(rt);
- ip_rt_put(rt);
- call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
-}
+ for_each_possible_cpu(i) {
+ src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
+ for (j = 0; j < 256; j++) {
+ dst[j].o_bytes += src[j].o_bytes;
+ dst[j].o_packets += src[j].o_packets;
+ dst[j].i_bytes += src[j].i_bytes;
+ dst[j].i_packets += src[j].i_packets;
+ }
+ }
-static __inline__ int rt_fast_clean(struct rtable *rth)
-{
- /* Kill broadcast/multicast entries very aggresively, if they
- collide in hash table with more useful entries */
- return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
- rth->fl.iif && rth->u.rt_next;
+ seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
+ kfree(dst);
+ return 0;
}
+#endif
-static __inline__ int rt_valuable(struct rtable *rth)
+static int __net_init ip_rt_do_proc_init(struct net *net)
{
- return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
- rth->u.dst.expires;
-}
+ struct proc_dir_entry *pde;
-static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
-{
- unsigned long age;
- int ret = 0;
+ pde = proc_create_seq("rt_cache", 0444, net->proc_net,
+ &rt_cache_seq_ops);
+ if (!pde)
+ goto err1;
- if (atomic_read(&rth->u.dst.__refcnt))
- goto out;
+ pde = proc_create_seq("rt_cache", 0444, net->proc_net_stat,
+ &rt_cpu_seq_ops);
+ if (!pde)
+ goto err2;
- ret = 1;
- if (rth->u.dst.expires &&
- time_after_eq(jiffies, rth->u.dst.expires))
- goto out;
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ pde = proc_create_single("rt_acct", 0, net->proc_net,
+ rt_acct_proc_show);
+ if (!pde)
+ goto err3;
+#endif
+ return 0;
- age = jiffies - rth->u.dst.lastuse;
- ret = 0;
- if ((age <= tmo1 && !rt_fast_clean(rth)) ||
- (age <= tmo2 && rt_valuable(rth)))
- goto out;
- ret = 1;
-out: return ret;
+#ifdef CONFIG_IP_ROUTE_CLASSID
+err3:
+ remove_proc_entry("rt_cache", net->proc_net_stat);
+#endif
+err2:
+ remove_proc_entry("rt_cache", net->proc_net);
+err1:
+ return -ENOMEM;
}
-/* Bits of score are:
- * 31: very valuable
- * 30: not quite useless
- * 29..0: usage counter
- */
-static inline u32 rt_score(struct rtable *rt)
+static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
- u32 score = jiffies - rt->u.dst.lastuse;
-
- score = ~score & ~(3<<30);
-
- if (rt_valuable(rt))
- score |= (1<<31);
+ remove_proc_entry("rt_cache", net->proc_net_stat);
+ remove_proc_entry("rt_cache", net->proc_net);
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ remove_proc_entry("rt_acct", net->proc_net);
+#endif
+}
- if (!rt->fl.iif ||
- !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
- score |= (1<<30);
+static struct pernet_operations ip_rt_proc_ops __net_initdata = {
+ .init = ip_rt_do_proc_init,
+ .exit = ip_rt_do_proc_exit,
+};
- return score;
+static int __init ip_rt_proc_init(void)
+{
+ return register_pernet_subsys(&ip_rt_proc_ops);
}
-static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
+#else
+static inline int ip_rt_proc_init(void)
{
- return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
- (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
- (fl1->mark ^ fl2->mark) |
- (*(u16 *)&fl1->nl_u.ip4_u.tos ^
- *(u16 *)&fl2->nl_u.ip4_u.tos) |
- (fl1->oif ^ fl2->oif) |
- (fl1->iif ^ fl2->iif)) == 0;
+ return 0;
}
+#endif /* CONFIG_PROC_FS */
-#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
-static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
- struct rtable *expentry,
- int *removed_count)
+static inline bool rt_is_expired(const struct rtable *rth)
{
- int passedexpired = 0;
- struct rtable **nextstep = NULL;
- struct rtable **rthp = chain_head;
- struct rtable *rth;
-
- if (removed_count)
- *removed_count = 0;
+ bool res;
- while ((rth = *rthp) != NULL) {
- if (rth == expentry)
- passedexpired = 1;
-
- if (((*rthp)->u.dst.flags & DST_BALANCED) != 0 &&
- compare_keys(&(*rthp)->fl, &expentry->fl)) {
- if (*rthp == expentry) {
- *rthp = rth->u.rt_next;
- continue;
- } else {
- *rthp = rth->u.rt_next;
- rt_free(rth);
- if (removed_count)
- ++(*removed_count);
- }
- } else {
- if (!((*rthp)->u.dst.flags & DST_BALANCED) &&
- passedexpired && !nextstep)
- nextstep = &rth->u.rt_next;
-
- rthp = &rth->u.rt_next;
- }
- }
-
- rt_free(expentry);
- if (removed_count)
- ++(*removed_count);
+ rcu_read_lock();
+ res = rth->rt_genid != rt_genid_ipv4(dev_net_rcu(rth->dst.dev));
+ rcu_read_unlock();
- return nextstep;
+ return res;
}
-#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
+void rt_cache_flush(struct net *net)
+{
+ rt_genid_bump_ipv4(net);
+}
-/* This runs via a timer and thus is always in BH context. */
-static void rt_check_expire(unsigned long dummy)
+static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
+ struct sk_buff *skb,
+ const void *daddr)
{
- static unsigned int rover;
- unsigned int i = rover, goal;
- struct rtable *rth, **rthp;
- unsigned long now = jiffies;
- u64 mult;
+ const struct rtable *rt = container_of(dst, struct rtable, dst);
+ struct net_device *dev;
+ struct neighbour *n;
- mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
- if (ip_rt_gc_timeout > 1)
- do_div(mult, ip_rt_gc_timeout);
- goal = (unsigned int)mult;
- if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
- for (; goal > 0; goal--) {
- unsigned long tmo = ip_rt_gc_timeout;
+ rcu_read_lock();
+ dev = dst_dev_rcu(dst);
+ if (likely(rt->rt_gw_family == AF_INET)) {
+ n = ip_neigh_gw4(dev, rt->rt_gw4);
+ } else if (rt->rt_gw_family == AF_INET6) {
+ n = ip_neigh_gw6(dev, &rt->rt_gw6);
+ } else {
+ __be32 pkey;
- i = (i + 1) & rt_hash_mask;
- rthp = &rt_hash_table[i].chain;
+ pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
+ n = ip_neigh_gw4(dev, pkey);
+ }
- if (*rthp == 0)
- continue;
- spin_lock(rt_hash_lock_addr(i));
- while ((rth = *rthp) != NULL) {
- if (rth->u.dst.expires) {
- /* Entry is expired even if it is in use */
- if (time_before_eq(now, rth->u.dst.expires)) {
- tmo >>= 1;
- rthp = &rth->u.rt_next;
- continue;
- }
- } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
- tmo >>= 1;
- rthp = &rth->u.rt_next;
- continue;
- }
+ if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
+ n = NULL;
- /* Cleanup aged off entries. */
-#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
- /* remove all related balanced entries if necessary */
- if (rth->u.dst.flags & DST_BALANCED) {
- rthp = rt_remove_balanced_route(
- &rt_hash_table[i].chain,
- rth, NULL);
- if (!rthp)
- break;
- } else {
- *rthp = rth->u.rt_next;
- rt_free(rth);
- }
-#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
- *rthp = rth->u.rt_next;
- rt_free(rth);
-#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
- }
- spin_unlock(rt_hash_lock_addr(i));
+ rcu_read_unlock();
- /* Fallback loop breaker. */
- if (time_after(jiffies, now))
- break;
- }
- rover = i;
- mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
+ return n;
}
-/* This can run from both BH and non-BH contexts, the latter
- * in the case of a forced flush event.
- */
-static void rt_run_flush(unsigned long dummy)
+static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
- int i;
- struct rtable *rth, *next;
-
- rt_deadline = 0;
+ const struct rtable *rt = container_of(dst, struct rtable, dst);
+ struct net_device *dev = dst_dev(dst);
+ const __be32 *pkey = daddr;
- get_random_bytes(&rt_hash_rnd, 4);
-
- for (i = rt_hash_mask; i >= 0; i--) {
- spin_lock_bh(rt_hash_lock_addr(i));
- rth = rt_hash_table[i].chain;
- if (rth)
- rt_hash_table[i].chain = NULL;
- spin_unlock_bh(rt_hash_lock_addr(i));
-
- for (; rth; rth = next) {
- next = rth->u.rt_next;
- rt_free(rth);
- }
+ if (rt->rt_gw_family == AF_INET) {
+ pkey = (const __be32 *)&rt->rt_gw4;
+ } else if (rt->rt_gw_family == AF_INET6) {
+ return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
+ } else if (!daddr ||
+ (rt->rt_flags &
+ (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
+ return;
}
+ __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
}
-static DEFINE_SPINLOCK(rt_flush_lock);
+/* Hash tables of size 2048..262144 depending on RAM size.
+ * Each bucket uses 8 bytes.
+ */
+static u32 ip_idents_mask __read_mostly;
+static atomic_t *ip_idents __read_mostly;
+static u32 *ip_tstamps __read_mostly;
-void rt_cache_flush(int delay)
+/* In order to protect privacy, we add a perturbation to identifiers
+ * if one generator is seldom used. This makes hard for an attacker
+ * to infer how many packets were sent between two points in time.
+ */
+static u32 ip_idents_reserve(u32 hash, int segs)
{
- unsigned long now = jiffies;
- int user_mode = !in_softirq();
+ u32 bucket, old, now = (u32)jiffies;
+ atomic_t *p_id;
+ u32 *p_tstamp;
+ u32 delta = 0;
- if (delay < 0)
- delay = ip_rt_min_delay;
+ bucket = hash & ip_idents_mask;
+ p_tstamp = ip_tstamps + bucket;
+ p_id = ip_idents + bucket;
+ old = READ_ONCE(*p_tstamp);
- /* flush existing multipath state*/
- multipath_flush();
+ if (old != now && cmpxchg(p_tstamp, old, now) == old)
+ delta = get_random_u32_below(now - old);
- spin_lock_bh(&rt_flush_lock);
+ /* If UBSAN reports an error there, please make sure your compiler
+ * supports -fno-strict-overflow before reporting it that was a bug
+ * in UBSAN, and it has been fixed in GCC-8.
+ */
+ return atomic_add_return(segs + delta, p_id) - segs;
+}
- if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
- long tmo = (long)(rt_deadline - now);
+void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
+{
+ u32 hash, id;
- /* If flush timer is already running
- and flush request is not immediate (delay > 0):
+ /* Note the following code is not safe, but this is okay. */
+ if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
+ get_random_bytes(&net->ipv4.ip_id_key,
+ sizeof(net->ipv4.ip_id_key));
- if deadline is not achieved, prolongate timer to "delay",
- otherwise fire it at deadline time.
- */
+ hash = siphash_3u32((__force u32)iph->daddr,
+ (__force u32)iph->saddr,
+ iph->protocol,
+ &net->ipv4.ip_id_key);
+ id = ip_idents_reserve(hash, segs);
+ iph->id = htons(id);
+}
+EXPORT_SYMBOL(__ip_select_ident);
- if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
- tmo = 0;
-
- if (delay > tmo)
- delay = tmo;
- }
+static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
+ const struct sock *sk, const struct iphdr *iph,
+ int oif, __u8 tos, u8 prot, u32 mark,
+ int flow_flags)
+{
+ __u8 scope = RT_SCOPE_UNIVERSE;
- if (delay <= 0) {
- spin_unlock_bh(&rt_flush_lock);
- rt_run_flush(0);
- return;
+ if (sk) {
+ oif = sk->sk_bound_dev_if;
+ mark = READ_ONCE(sk->sk_mark);
+ tos = ip_sock_rt_tos(sk);
+ scope = ip_sock_rt_scope(sk);
+ prot = inet_test_bit(HDRINCL, sk) ? IPPROTO_RAW :
+ sk->sk_protocol;
}
- if (rt_deadline == 0)
- rt_deadline = now + ip_rt_max_delay;
-
- mod_timer(&rt_flush_timer, now+delay);
- spin_unlock_bh(&rt_flush_lock);
+ flowi4_init_output(fl4, oif, mark, tos & INET_DSCP_MASK, scope,
+ prot, flow_flags, iph->daddr, iph->saddr, 0, 0,
+ sock_net_uid(net, sk));
}
-static void rt_secret_rebuild(unsigned long dummy)
+static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
+ const struct sock *sk)
{
- unsigned long now = jiffies;
+ const struct net *net = dev_net(skb->dev);
+ const struct iphdr *iph = ip_hdr(skb);
+ int oif = skb->dev->ifindex;
+ u8 prot = iph->protocol;
+ u32 mark = skb->mark;
+ __u8 tos = iph->tos;
- rt_cache_flush(0);
- mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
+ __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
}
-/*
- Short description of GC goals.
-
- We want to build algorithm, which will keep routing cache
- at some equilibrium point, when number of aged off entries
- is kept approximately equal to newly generated ones.
+static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
+{
+ const struct inet_sock *inet = inet_sk(sk);
+ const struct ip_options_rcu *inet_opt;
+ __be32 daddr = inet->inet_daddr;
- Current expiration strength is variable "expire".
- We try to adjust it dynamically, so that if networking
- is idle expires is large enough to keep enough of warm entries,
- and when load increases it reduces to limit cache size.
- */
+ rcu_read_lock();
+ inet_opt = rcu_dereference(inet->inet_opt);
+ if (inet_opt && inet_opt->opt.srr)
+ daddr = inet_opt->opt.faddr;
+ flowi4_init_output(fl4, sk->sk_bound_dev_if, READ_ONCE(sk->sk_mark),
+ ip_sock_rt_tos(sk),
+ ip_sock_rt_scope(sk),
+ inet_test_bit(HDRINCL, sk) ?
+ IPPROTO_RAW : sk->sk_protocol,
+ inet_sk_flowi_flags(sk),
+ daddr, inet->inet_saddr, 0, 0,
+ sk_uid(sk));
+ rcu_read_unlock();
+}
-static int rt_garbage_collect(void)
+static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
+ const struct sk_buff *skb)
{
- static unsigned long expire = RT_GC_TIMEOUT;
- static unsigned long last_gc;
- static int rover;
- static int equilibrium;
- struct rtable *rth, **rthp;
- unsigned long now = jiffies;
- int goal;
-
- /*
- * Garbage collection is pretty expensive,
- * do not make it too frequently.
- */
+ if (skb)
+ build_skb_flow_key(fl4, skb, sk);
+ else
+ build_sk_flow_key(fl4, sk);
+}
- RT_CACHE_STAT_INC(gc_total);
+static DEFINE_SPINLOCK(fnhe_lock);
- if (now - last_gc < ip_rt_gc_min_interval &&
- atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
- RT_CACHE_STAT_INC(gc_ignored);
- goto out;
- }
+static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
+{
+ struct rtable *rt;
- /* Calculate number of entries, which we want to expire now. */
- goal = atomic_read(&ipv4_dst_ops.entries) -
- (ip_rt_gc_elasticity << rt_hash_log);
- if (goal <= 0) {
- if (equilibrium < ipv4_dst_ops.gc_thresh)
- equilibrium = ipv4_dst_ops.gc_thresh;
- goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
- if (goal > 0) {
- equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
- goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
- }
- } else {
- /* We are in dangerous area. Try to reduce cache really
- * aggressively.
- */
- goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
- equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
+ rt = rcu_dereference(fnhe->fnhe_rth_input);
+ if (rt) {
+ RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
+ dst_dev_put(&rt->dst);
+ dst_release(&rt->dst);
}
-
- if (now - last_gc >= ip_rt_gc_min_interval)
- last_gc = now;
-
- if (goal <= 0) {
- equilibrium += goal;
- goto work_done;
+ rt = rcu_dereference(fnhe->fnhe_rth_output);
+ if (rt) {
+ RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
+ dst_dev_put(&rt->dst);
+ dst_release(&rt->dst);
}
+}
- do {
- int i, k;
-
- for (i = rt_hash_mask, k = rover; i >= 0; i--) {
- unsigned long tmo = expire;
-
- k = (k + 1) & rt_hash_mask;
- rthp = &rt_hash_table[k].chain;
- spin_lock_bh(rt_hash_lock_addr(k));
- while ((rth = *rthp) != NULL) {
- if (!rt_may_expire(rth, tmo, expire)) {
- tmo >>= 1;
- rthp = &rth->u.rt_next;
- continue;
- }
-#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
- /* remove all related balanced entries
- * if necessary
- */
- if (rth->u.dst.flags & DST_BALANCED) {
- int r;
-
- rthp = rt_remove_balanced_route(
- &rt_hash_table[k].chain,
- rth,
- &r);
- goal -= r;
- if (!rthp)
- break;
- } else {
- *rthp = rth->u.rt_next;
- rt_free(rth);
- goal--;
- }
-#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
- *rthp = rth->u.rt_next;
- rt_free(rth);
- goal--;
-#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
- }
- spin_unlock_bh(rt_hash_lock_addr(k));
- if (goal <= 0)
- break;
- }
- rover = k;
-
- if (goal <= 0)
- goto work_done;
-
- /* Goal is not achieved. We stop process if:
-
- - if expire reduced to zero. Otherwise, expire is halfed.
- - if table is not full.
- - if we are called from interrupt.
- - jiffies check is just fallback/debug loop breaker.
- We will not spin here for long time in any case.
- */
-
- RT_CACHE_STAT_INC(gc_goal_miss);
+static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash)
+{
+ struct fib_nh_exception __rcu **fnhe_p, **oldest_p;
+ struct fib_nh_exception *fnhe, *oldest = NULL;
- if (expire == 0)
+ for (fnhe_p = &hash->chain; ; fnhe_p = &fnhe->fnhe_next) {
+ fnhe = rcu_dereference_protected(*fnhe_p,
+ lockdep_is_held(&fnhe_lock));
+ if (!fnhe)
break;
+ if (!oldest ||
+ time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) {
+ oldest = fnhe;
+ oldest_p = fnhe_p;
+ }
+ }
- expire >>= 1;
-#if RT_CACHE_DEBUG >= 2
- printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
- atomic_read(&ipv4_dst_ops.entries), goal, i);
-#endif
-
- if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
- goto out;
- } while (!in_softirq() && time_before_eq(jiffies, now));
-
- if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
- goto out;
- if (net_ratelimit())
- printk(KERN_WARNING "dst cache overflow\n");
- RT_CACHE_STAT_INC(gc_dst_overflow);
- return 1;
-
-work_done:
- expire += ip_rt_gc_min_interval;
- if (expire > ip_rt_gc_timeout ||
- atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
- expire = ip_rt_gc_timeout;
-#if RT_CACHE_DEBUG >= 2
- printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
- atomic_read(&ipv4_dst_ops.entries), goal, rover);
-#endif
-out: return 0;
+ /* Clear oldest->fnhe_daddr to prevent this fnhe from being
+ * rebound with new dsts in rt_bind_exception().
+ */
+ oldest->fnhe_daddr = 0;
+ fnhe_flush_routes(oldest);
+ *oldest_p = oldest->fnhe_next;
+ kfree_rcu(oldest, rcu);
}
-static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
+static u32 fnhe_hashfun(__be32 daddr)
{
- struct rtable *rth, **rthp;
- unsigned long now;
- struct rtable *cand, **candp;
- u32 min_score;
- int chain_length;
- int attempts = !in_softirq();
+ static siphash_aligned_key_t fnhe_hash_key;
+ u64 hval;
-restart:
- chain_length = 0;
- min_score = ~(u32)0;
- cand = NULL;
- candp = NULL;
- now = jiffies;
+ net_get_random_once(&fnhe_hash_key, sizeof(fnhe_hash_key));
+ hval = siphash_1u32((__force u32)daddr, &fnhe_hash_key);
+ return hash_64(hval, FNHE_HASH_SHIFT);
+}
- rthp = &rt_hash_table[hash].chain;
+static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
+{
+ rt->rt_pmtu = fnhe->fnhe_pmtu;
+ rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
+ rt->dst.expires = fnhe->fnhe_expires;
- spin_lock_bh(rt_hash_lock_addr(hash));
- while ((rth = *rthp) != NULL) {
-#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
- if (!(rth->u.dst.flags & DST_BALANCED) &&
- compare_keys(&rth->fl, &rt->fl)) {
-#else
- if (compare_keys(&rth->fl, &rt->fl)) {
-#endif
- /* Put it first */
- *rthp = rth->u.rt_next;
- /*
- * Since lookup is lockfree, the deletion
- * must be visible to another weakly ordered CPU before
- * the insertion at the start of the hash chain.
- */
- rcu_assign_pointer(rth->u.rt_next,
- rt_hash_table[hash].chain);
- /*
- * Since lookup is lockfree, the update writes
- * must be ordered for consistency on SMP.
- */
- rcu_assign_pointer(rt_hash_table[hash].chain, rth);
+ if (fnhe->fnhe_gw) {
+ rt->rt_flags |= RTCF_REDIRECTED;
+ rt->rt_uses_gateway = 1;
+ rt->rt_gw_family = AF_INET;
+ rt->rt_gw4 = fnhe->fnhe_gw;
+ }
+}
- rth->u.dst.__use++;
- dst_hold(&rth->u.dst);
- rth->u.dst.lastuse = now;
- spin_unlock_bh(rt_hash_lock_addr(hash));
+static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
+ __be32 gw, u32 pmtu, bool lock,
+ unsigned long expires)
+{
+ struct fnhe_hash_bucket *hash;
+ struct fib_nh_exception *fnhe;
+ struct rtable *rt;
+ u32 genid, hval;
+ unsigned int i;
+ int depth;
- rt_drop(rt);
- *rp = rth;
- return 0;
- }
+ genid = fnhe_genid(dev_net(nhc->nhc_dev));
+ hval = fnhe_hashfun(daddr);
- if (!atomic_read(&rth->u.dst.__refcnt)) {
- u32 score = rt_score(rth);
+ spin_lock_bh(&fnhe_lock);
- if (score <= min_score) {
- cand = rth;
- candp = rthp;
- min_score = score;
- }
- }
+ hash = rcu_dereference(nhc->nhc_exceptions);
+ if (!hash) {
+ hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
+ if (!hash)
+ goto out_unlock;
+ rcu_assign_pointer(nhc->nhc_exceptions, hash);
+ }
- chain_length++;
+ hash += hval;
- rthp = &rth->u.rt_next;
+ depth = 0;
+ for (fnhe = rcu_dereference(hash->chain); fnhe;
+ fnhe = rcu_dereference(fnhe->fnhe_next)) {
+ if (fnhe->fnhe_daddr == daddr)
+ break;
+ depth++;
}
- if (cand) {
- /* ip_rt_gc_elasticity used to be average length of chain
- * length, when exceeded gc becomes really aggressive.
- *
- * The second limit is less certain. At the moment it allows
- * only 2 entries per bucket. We will see.
- */
- if (chain_length > ip_rt_gc_elasticity) {
- *candp = cand->u.rt_next;
- rt_free(cand);
+ if (fnhe) {
+ if (fnhe->fnhe_genid != genid)
+ fnhe->fnhe_genid = genid;
+ if (gw)
+ fnhe->fnhe_gw = gw;
+ if (pmtu) {
+ fnhe->fnhe_pmtu = pmtu;
+ fnhe->fnhe_mtu_locked = lock;
}
- }
+ fnhe->fnhe_expires = max(1UL, expires);
+ /* Update all cached dsts too */
+ rt = rcu_dereference(fnhe->fnhe_rth_input);
+ if (rt)
+ fill_route_from_fnhe(rt, fnhe);
+ rt = rcu_dereference(fnhe->fnhe_rth_output);
+ if (rt)
+ fill_route_from_fnhe(rt, fnhe);
+ } else {
+ /* Randomize max depth to avoid some side channels attacks. */
+ int max_depth = FNHE_RECLAIM_DEPTH +
+ get_random_u32_below(FNHE_RECLAIM_DEPTH);
- /* Try to bind route to arp only if it is output
- route or unicast forwarding path.
- */
- if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
- int err = arp_bind_neighbour(&rt->u.dst);
- if (err) {
- spin_unlock_bh(rt_hash_lock_addr(hash));
+ while (depth > max_depth) {
+ fnhe_remove_oldest(hash);
+ depth--;
+ }
- if (err != -ENOBUFS) {
- rt_drop(rt);
- return err;
- }
+ fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
+ if (!fnhe)
+ goto out_unlock;
- /* Neighbour tables are full and nothing
- can be released. Try to shrink route cache,
- it is most likely it holds some neighbour records.
- */
- if (attempts-- > 0) {
- int saved_elasticity = ip_rt_gc_elasticity;
- int saved_int = ip_rt_gc_min_interval;
- ip_rt_gc_elasticity = 1;
- ip_rt_gc_min_interval = 0;
- rt_garbage_collect();
- ip_rt_gc_min_interval = saved_int;
- ip_rt_gc_elasticity = saved_elasticity;
- goto restart;
- }
+ fnhe->fnhe_next = hash->chain;
- if (net_ratelimit())
- printk(KERN_WARNING "Neighbour table overflow.\n");
- rt_drop(rt);
- return -ENOBUFS;
- }
- }
+ fnhe->fnhe_genid = genid;
+ fnhe->fnhe_daddr = daddr;
+ fnhe->fnhe_gw = gw;
+ fnhe->fnhe_pmtu = pmtu;
+ fnhe->fnhe_mtu_locked = lock;
+ fnhe->fnhe_expires = max(1UL, expires);
- rt->u.rt_next = rt_hash_table[hash].chain;
-#if RT_CACHE_DEBUG >= 2
- if (rt->u.rt_next) {
- struct rtable *trt;
- printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
- NIPQUAD(rt->rt_dst));
- for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
- printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
- printk("\n");
- }
-#endif
- rt_hash_table[hash].chain = rt;
- spin_unlock_bh(rt_hash_lock_addr(hash));
- *rp = rt;
- return 0;
-}
+ rcu_assign_pointer(hash->chain, fnhe);
-void rt_bind_peer(struct rtable *rt, int create)
-{
- static DEFINE_SPINLOCK(rt_peer_lock);
- struct inet_peer *peer;
+ /* Exception created; mark the cached routes for the nexthop
+ * stale, so anyone caching it rechecks if this exception
+ * applies to them.
+ */
+ rt = rcu_dereference(nhc->nhc_rth_input);
+ if (rt)
+ WRITE_ONCE(rt->dst.obsolete, DST_OBSOLETE_KILL);
- peer = inet_getpeer(rt->rt_dst, create);
+ for_each_possible_cpu(i) {
+ struct rtable __rcu **prt;
- spin_lock_bh(&rt_peer_lock);
- if (rt->peer == NULL) {
- rt->peer = peer;
- peer = NULL;
+ prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
+ rt = rcu_dereference(*prt);
+ if (rt)
+ WRITE_ONCE(rt->dst.obsolete, DST_OBSOLETE_KILL);
+ }
}
- spin_unlock_bh(&rt_peer_lock);
- if (peer)
- inet_putpeer(peer);
-}
-/*
- * Peer allocation may fail only in serious out-of-memory conditions. However
- * we still can generate some output.
- * Random ID selection looks a bit dangerous because we have no chances to
- * select ID being unique in a reasonable period of time.
- * But broken packet identifier may be better than no packet at all.
- */
-static void ip_select_fb_ident(struct iphdr *iph)
-{
- static DEFINE_SPINLOCK(ip_fb_id_lock);
- static u32 ip_fallback_id;
- u32 salt;
+ fnhe->fnhe_stamp = jiffies;
- spin_lock_bh(&ip_fb_id_lock);
- salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
- iph->id = htons(salt & 0xFFFF);
- ip_fallback_id = salt;
- spin_unlock_bh(&ip_fb_id_lock);
+out_unlock:
+ spin_unlock_bh(&fnhe_lock);
}
-void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
+static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
+ bool kill_route)
{
- struct rtable *rt = (struct rtable *) dst;
-
- if (rt) {
- if (rt->peer == NULL)
- rt_bind_peer(rt, 1);
-
- /* If peer is attached to destination, it is never detached,
- so that we need not to grab a lock to dereference it.
- */
- if (rt->peer) {
- iph->id = htons(inet_getid(rt->peer, more));
- return;
- }
- } else
- printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
- __builtin_return_address(0));
-
- ip_select_fb_ident(iph);
-}
+ __be32 new_gw = icmp_hdr(skb)->un.gateway;
+ __be32 old_gw = ip_hdr(skb)->saddr;
+ struct net_device *dev = skb->dev;
+ struct in_device *in_dev;
+ struct fib_result res;
+ struct neighbour *n;
+ struct net *net;
-static void rt_del(unsigned hash, struct rtable *rt)
-{
- struct rtable **rthp;
+ switch (icmp_hdr(skb)->code & 7) {
+ case ICMP_REDIR_NET:
+ case ICMP_REDIR_NETTOS:
+ case ICMP_REDIR_HOST:
+ case ICMP_REDIR_HOSTTOS:
+ break;
- spin_lock_bh(rt_hash_lock_addr(hash));
- ip_rt_put(rt);
- for (rthp = &rt_hash_table[hash].chain; *rthp;
- rthp = &(*rthp)->u.rt_next)
- if (*rthp == rt) {
- *rthp = rt->u.rt_next;
- rt_free(rt);
- break;
- }
- spin_unlock_bh(rt_hash_lock_addr(hash));
-}
+ default:
+ return;
+ }
-void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
- __be32 saddr, struct net_device *dev)
-{
- int i, k;
- struct in_device *in_dev = in_dev_get(dev);
- struct rtable *rth, **rthp;
- __be32 skeys[2] = { saddr, 0 };
- int ikeys[2] = { dev->ifindex, 0 };
- struct netevent_redirect netevent;
+ if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
+ return;
+ in_dev = __in_dev_get_rcu(dev);
if (!in_dev)
return;
- if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
- || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
+ net = dev_net(dev);
+ if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
+ ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
+ ipv4_is_zeronet(new_gw))
goto reject_redirect;
if (!IN_DEV_SHARED_MEDIA(in_dev)) {
@@ -1149,136 +782,76 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
goto reject_redirect;
} else {
- if (inet_addr_type(new_gw) != RTN_UNICAST)
+ if (inet_addr_type(net, new_gw) != RTN_UNICAST)
goto reject_redirect;
}
- for (i = 0; i < 2; i++) {
- for (k = 0; k < 2; k++) {
- unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
-
- rthp=&rt_hash_table[hash].chain;
-
- rcu_read_lock();
- while ((rth = rcu_dereference(*rthp)) != NULL) {
- struct rtable *rt;
-
- if (rth->fl.fl4_dst != daddr ||
- rth->fl.fl4_src != skeys[i] ||
- rth->fl.oif != ikeys[k] ||
- rth->fl.iif != 0) {
- rthp = &rth->u.rt_next;
- continue;
- }
-
- if (rth->rt_dst != daddr ||
- rth->rt_src != saddr ||
- rth->u.dst.error ||
- rth->rt_gateway != old_gw ||
- rth->u.dst.dev != dev)
- break;
-
- dst_hold(&rth->u.dst);
- rcu_read_unlock();
-
- rt = dst_alloc(&ipv4_dst_ops);
- if (rt == NULL) {
- ip_rt_put(rth);
- in_dev_put(in_dev);
- return;
- }
-
- /* Copy all the information. */
- *rt = *rth;
- INIT_RCU_HEAD(&rt->u.dst.rcu_head);
- rt->u.dst.__use = 1;
- atomic_set(&rt->u.dst.__refcnt, 1);
- rt->u.dst.child = NULL;
- if (rt->u.dst.dev)
- dev_hold(rt->u.dst.dev);
- if (rt->idev)
- in_dev_hold(rt->idev);
- rt->u.dst.obsolete = 0;
- rt->u.dst.lastuse = jiffies;
- rt->u.dst.path = &rt->u.dst;
- rt->u.dst.neighbour = NULL;
- rt->u.dst.hh = NULL;
- rt->u.dst.xfrm = NULL;
-
- rt->rt_flags |= RTCF_REDIRECTED;
-
- /* Gateway is different ... */
- rt->rt_gateway = new_gw;
-
- /* Redirect received -> path was valid */
- dst_confirm(&rth->u.dst);
-
- if (rt->peer)
- atomic_inc(&rt->peer->refcnt);
-
- if (arp_bind_neighbour(&rt->u.dst) ||
- !(rt->u.dst.neighbour->nud_state &
- NUD_VALID)) {
- if (rt->u.dst.neighbour)
- neigh_event_send(rt->u.dst.neighbour, NULL);
- ip_rt_put(rth);
- rt_drop(rt);
- goto do_next;
- }
-
- netevent.old = &rth->u.dst;
- netevent.new = &rt->u.dst;
- call_netevent_notifiers(NETEVENT_REDIRECT,
- &netevent);
-
- rt_del(hash, rth);
- if (!rt_intern_hash(hash, rt, &rt))
- ip_rt_put(rt);
- goto do_next;
+ n = __ipv4_neigh_lookup(rt->dst.dev, (__force u32)new_gw);
+ if (!n)
+ n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
+ if (!IS_ERR(n)) {
+ if (!(READ_ONCE(n->nud_state) & NUD_VALID)) {
+ neigh_event_send(n, NULL);
+ } else {
+ if (fib_lookup(net, fl4, &res, 0) == 0) {
+ struct fib_nh_common *nhc;
+
+ fib_select_path(net, &res, fl4, skb);
+ nhc = FIB_RES_NHC(res);
+ update_or_create_fnhe(nhc, fl4->daddr, new_gw,
+ 0, false,
+ jiffies + ip_rt_gc_timeout);
}
- rcu_read_unlock();
- do_next:
- ;
+ if (kill_route)
+ WRITE_ONCE(rt->dst.obsolete, DST_OBSOLETE_KILL);
+ call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
}
+ neigh_release(n);
}
- in_dev_put(in_dev);
return;
reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
- if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
- printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
- "%u.%u.%u.%u ignored.\n"
- " Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
- NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
- NIPQUAD(saddr), NIPQUAD(daddr));
+ if (IN_DEV_LOG_MARTIANS(in_dev)) {
+ const struct iphdr *iph = (const struct iphdr *) skb->data;
+ __be32 daddr = iph->daddr;
+ __be32 saddr = iph->saddr;
+
+ net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
+ " Advised path = %pI4 -> %pI4\n",
+ &old_gw, dev->name, &new_gw,
+ &saddr, &daddr);
+ }
#endif
- in_dev_put(in_dev);
+ ;
}
-static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
+static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
- struct rtable *rt = (struct rtable*)dst;
- struct dst_entry *ret = dst;
+ struct rtable *rt;
+ struct flowi4 fl4;
+ const struct iphdr *iph = (const struct iphdr *) skb->data;
+ struct net *net = dev_net(skb->dev);
+ int oif = skb->dev->ifindex;
+ u8 prot = iph->protocol;
+ u32 mark = skb->mark;
+ __u8 tos = iph->tos;
- if (rt) {
- if (dst->obsolete) {
- ip_rt_put(rt);
- ret = NULL;
- } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
- rt->u.dst.expires) {
- unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
- rt->fl.oif);
-#if RT_CACHE_DEBUG >= 1
- printk(KERN_DEBUG "ip_rt_advice: redirect to "
- "%u.%u.%u.%u/%02x dropped\n",
- NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
-#endif
- rt_del(hash, rt);
- ret = NULL;
- }
- }
- return ret;
+ rt = dst_rtable(dst);
+
+ __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
+ __ip_do_redirect(rt, skb, &fl4, true);
+}
+
+static void ipv4_negative_advice(struct sock *sk,
+ struct dst_entry *dst)
+{
+ struct rtable *rt = dst_rtable(dst);
+
+ if ((READ_ONCE(dst->obsolete) > 0) ||
+ (rt->rt_flags & RTCF_REDIRECTED) ||
+ READ_ONCE(rt->dst.expires))
+ sk_dst_reset(sk);
}
/*
@@ -1299,386 +872,913 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
void ip_rt_send_redirect(struct sk_buff *skb)
{
- struct rtable *rt = (struct rtable*)skb->dst;
- struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
+ struct rtable *rt = skb_rtable(skb);
+ struct in_device *in_dev;
+ struct inet_peer *peer;
+ struct net *net;
+ int log_martians;
+ int vif;
- if (!in_dev)
+ rcu_read_lock();
+ in_dev = __in_dev_get_rcu(rt->dst.dev);
+ if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
+ rcu_read_unlock();
return;
+ }
+ log_martians = IN_DEV_LOG_MARTIANS(in_dev);
+ vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
- if (!IN_DEV_TX_REDIRECTS(in_dev))
- goto out;
+ net = dev_net(rt->dst.dev);
+ peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif);
+ if (!peer) {
+ rcu_read_unlock();
+ icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
+ rt_nexthop(rt, ip_hdr(skb)->daddr));
+ return;
+ }
/* No redirected packets during ip_rt_redirect_silence;
* reset the algorithm.
*/
- if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
- rt->u.dst.rate_tokens = 0;
+ if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
+ peer->rate_tokens = 0;
+ peer->n_redirects = 0;
+ }
/* Too many ignored redirects; do not send anything
- * set u.dst.rate_last to the last seen redirected packet.
+ * set dst.rate_last to the last seen redirected packet.
*/
- if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
- rt->u.dst.rate_last = jiffies;
- goto out;
+ if (peer->n_redirects >= ip_rt_redirect_number) {
+ peer->rate_last = jiffies;
+ goto out_unlock;
}
/* Check for load limit; set rate_last to the latest sent
* redirect.
*/
- if (time_after(jiffies,
- (rt->u.dst.rate_last +
- (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
- icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
- rt->u.dst.rate_last = jiffies;
- ++rt->u.dst.rate_tokens;
-#ifdef CONFIG_IP_ROUTE_VERBOSE
- if (IN_DEV_LOG_MARTIANS(in_dev) &&
- rt->u.dst.rate_tokens == ip_rt_redirect_number &&
- net_ratelimit())
- printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
- "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
- NIPQUAD(rt->rt_src), rt->rt_iif,
- NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
-#endif
- }
-out:
- in_dev_put(in_dev);
+ if (peer->n_redirects == 0 ||
+ time_after(jiffies,
+ (peer->rate_last +
+ (ip_rt_redirect_load << peer->n_redirects)))) {
+ __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
+
+ icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
+ peer->rate_last = jiffies;
+ ++peer->n_redirects;
+ if (IS_ENABLED(CONFIG_IP_ROUTE_VERBOSE) && log_martians &&
+ peer->n_redirects == ip_rt_redirect_number)
+ net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
+ &ip_hdr(skb)->saddr, inet_iif(skb),
+ &ip_hdr(skb)->daddr, &gw);
+ }
+out_unlock:
+ rcu_read_unlock();
}
static int ip_error(struct sk_buff *skb)
{
- struct rtable *rt = (struct rtable*)skb->dst;
+ struct rtable *rt = skb_rtable(skb);
+ struct net_device *dev = skb->dev;
+ struct in_device *in_dev;
+ struct inet_peer *peer;
unsigned long now;
+ struct net *net;
+ SKB_DR(reason);
+ bool send;
int code;
- switch (rt->u.dst.error) {
- case EINVAL:
- default:
+ if (netif_is_l3_master(skb->dev)) {
+ dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
+ if (!dev)
goto out;
+ }
+
+ in_dev = __in_dev_get_rcu(dev);
+
+ /* IP on this device is disabled. */
+ if (!in_dev)
+ goto out;
+
+ net = dev_net(rt->dst.dev);
+ if (!IN_DEV_FORWARD(in_dev)) {
+ switch (rt->dst.error) {
case EHOSTUNREACH:
- code = ICMP_HOST_UNREACH;
+ SKB_DR_SET(reason, IP_INADDRERRORS);
+ __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
break;
+
case ENETUNREACH:
- code = ICMP_NET_UNREACH;
- break;
- case EACCES:
- code = ICMP_PKT_FILTERED;
+ SKB_DR_SET(reason, IP_INNOROUTES);
+ __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
break;
+ }
+ goto out;
}
- now = jiffies;
- rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
- if (rt->u.dst.rate_tokens > ip_rt_error_burst)
- rt->u.dst.rate_tokens = ip_rt_error_burst;
- rt->u.dst.rate_last = now;
- if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
- rt->u.dst.rate_tokens -= ip_rt_error_cost;
- icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
+ switch (rt->dst.error) {
+ case EINVAL:
+ default:
+ goto out;
+ case EHOSTUNREACH:
+ code = ICMP_HOST_UNREACH;
+ break;
+ case ENETUNREACH:
+ code = ICMP_NET_UNREACH;
+ SKB_DR_SET(reason, IP_INNOROUTES);
+ __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
+ break;
+ case EACCES:
+ code = ICMP_PKT_FILTERED;
+ break;
}
-out: kfree_skb(skb);
- return 0;
-}
-
-/*
- * The last two values are not from the RFC but
- * are needed for AMPRnet AX.25 paths.
- */
+ rcu_read_lock();
+ peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
+ l3mdev_master_ifindex_rcu(skb->dev));
+ send = true;
+ if (peer) {
+ now = jiffies;
+ peer->rate_tokens += now - peer->rate_last;
+ if (peer->rate_tokens > ip_rt_error_burst)
+ peer->rate_tokens = ip_rt_error_burst;
+ peer->rate_last = now;
+ if (peer->rate_tokens >= ip_rt_error_cost)
+ peer->rate_tokens -= ip_rt_error_cost;
+ else
+ send = false;
+ }
+ rcu_read_unlock();
-static const unsigned short mtu_plateau[] =
-{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
+ if (send)
+ icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
-static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
-{
- int i;
-
- for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
- if (old_mtu > mtu_plateau[i])
- return mtu_plateau[i];
- return 68;
+out: kfree_skb_reason(skb, reason);
+ return 0;
}
-unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
+static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
- int i;
- unsigned short old_mtu = ntohs(iph->tot_len);
- struct rtable *rth;
- __be32 skeys[2] = { iph->saddr, 0, };
- __be32 daddr = iph->daddr;
- unsigned short est_mtu = 0;
+ struct dst_entry *dst = &rt->dst;
+ struct fib_result res;
+ bool lock = false;
+ struct net *net;
+ u32 old_mtu;
- if (ipv4_config.no_pmtu_disc)
- return 0;
+ if (ip_mtu_locked(dst))
+ return;
- for (i = 0; i < 2; i++) {
- unsigned hash = rt_hash(daddr, skeys[i], 0);
+ old_mtu = ipv4_mtu(dst);
+ if (old_mtu < mtu)
+ return;
- rcu_read_lock();
- for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
- rth = rcu_dereference(rth->u.rt_next)) {
- if (rth->fl.fl4_dst == daddr &&
- rth->fl.fl4_src == skeys[i] &&
- rth->rt_dst == daddr &&
- rth->rt_src == iph->saddr &&
- rth->fl.iif == 0 &&
- !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
- unsigned short mtu = new_mtu;
-
- if (new_mtu < 68 || new_mtu >= old_mtu) {
-
- /* BSD 4.2 compatibility hack :-( */
- if (mtu == 0 &&
- old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
- old_mtu >= 68 + (iph->ihl << 2))
- old_mtu -= iph->ihl << 2;
-
- mtu = guess_mtu(old_mtu);
- }
- if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
- if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
- dst_confirm(&rth->u.dst);
- if (mtu < ip_rt_min_pmtu) {
- mtu = ip_rt_min_pmtu;
- rth->u.dst.metrics[RTAX_LOCK-1] |=
- (1 << RTAX_MTU);
- }
- rth->u.dst.metrics[RTAX_MTU-1] = mtu;
- dst_set_expires(&rth->u.dst,
- ip_rt_mtu_expires);
- }
- est_mtu = mtu;
- }
+ rcu_read_lock();
+ net = dst_dev_net_rcu(dst);
+ if (mtu < net->ipv4.ip_rt_min_pmtu) {
+ lock = true;
+ mtu = min(old_mtu, net->ipv4.ip_rt_min_pmtu);
+ }
+
+ if (rt->rt_pmtu == mtu && !lock &&
+ time_before(jiffies, READ_ONCE(dst->expires) -
+ net->ipv4.ip_rt_mtu_expires / 2))
+ goto out;
+
+ if (fib_lookup(net, fl4, &res, 0) == 0) {
+ struct fib_nh_common *nhc;
+
+ fib_select_path(net, &res, fl4, NULL);
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ if (fib_info_num_path(res.fi) > 1) {
+ int nhsel;
+
+ for (nhsel = 0; nhsel < fib_info_num_path(res.fi); nhsel++) {
+ nhc = fib_info_nhc(res.fi, nhsel);
+ update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
+ jiffies + net->ipv4.ip_rt_mtu_expires);
}
+ goto out;
}
- rcu_read_unlock();
+#endif /* CONFIG_IP_ROUTE_MULTIPATH */
+ nhc = FIB_RES_NHC(res);
+ update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
+ jiffies + net->ipv4.ip_rt_mtu_expires);
}
- return est_mtu ? : new_mtu;
+out:
+ rcu_read_unlock();
}
-static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
+static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
+ struct sk_buff *skb, u32 mtu,
+ bool confirm_neigh)
{
- if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
- !(dst_metric_locked(dst, RTAX_MTU))) {
- if (mtu < ip_rt_min_pmtu) {
- mtu = ip_rt_min_pmtu;
- dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
- }
- dst->metrics[RTAX_MTU-1] = mtu;
- dst_set_expires(dst, ip_rt_mtu_expires);
- call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
+ struct rtable *rt = dst_rtable(dst);
+ struct flowi4 fl4;
+
+ ip_rt_build_flow_key(&fl4, sk, skb);
+
+ /* Don't make lookup fail for bridged encapsulations */
+ if (skb && netif_is_any_bridge_port(skb->dev))
+ fl4.flowi4_oif = 0;
+
+ __ip_rt_update_pmtu(rt, &fl4, mtu);
+}
+
+void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
+ int oif, u8 protocol)
+{
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
+ struct flowi4 fl4;
+ struct rtable *rt;
+ u32 mark = IP4_REPLY_MARK(net, skb->mark);
+
+ __build_flow_key(net, &fl4, NULL, iph, oif, iph->tos, protocol, mark,
+ 0);
+ rt = __ip_route_output_key(net, &fl4);
+ if (!IS_ERR(rt)) {
+ __ip_rt_update_pmtu(rt, &fl4, mtu);
+ ip_rt_put(rt);
}
}
+EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
-static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
+static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
- return NULL;
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
+ struct flowi4 fl4;
+ struct rtable *rt;
+
+ __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
+
+ if (!fl4.flowi4_mark)
+ fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
+
+ rt = __ip_route_output_key(sock_net(sk), &fl4);
+ if (!IS_ERR(rt)) {
+ __ip_rt_update_pmtu(rt, &fl4, mtu);
+ ip_rt_put(rt);
+ }
}
-static void ipv4_dst_destroy(struct dst_entry *dst)
+void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
- struct rtable *rt = (struct rtable *) dst;
- struct inet_peer *peer = rt->peer;
- struct in_device *idev = rt->idev;
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
+ struct flowi4 fl4;
+ struct rtable *rt;
+ struct dst_entry *odst = NULL;
+ bool new = false;
+ struct net *net = sock_net(sk);
- if (peer) {
- rt->peer = NULL;
- inet_putpeer(peer);
+ bh_lock_sock(sk);
+
+ if (!ip_sk_accept_pmtu(sk))
+ goto out;
+
+ odst = sk_dst_get(sk);
+
+ if (sock_owned_by_user(sk) || !odst) {
+ __ipv4_sk_update_pmtu(skb, sk, mtu);
+ goto out;
+ }
+
+ __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
+
+ rt = dst_rtable(odst);
+ if (READ_ONCE(odst->obsolete) && !odst->ops->check(odst, 0)) {
+ rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
+ if (IS_ERR(rt))
+ goto out;
+
+ new = true;
}
- if (idev) {
- rt->idev = NULL;
- in_dev_put(idev);
+ __ip_rt_update_pmtu(dst_rtable(xfrm_dst_path(&rt->dst)), &fl4, mtu);
+
+ if (!dst_check(&rt->dst, 0)) {
+ if (new)
+ dst_release(&rt->dst);
+
+ rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
+ if (IS_ERR(rt))
+ goto out;
+
+ new = true;
}
+
+ if (new)
+ sk_dst_set(sk, &rt->dst);
+
+out:
+ bh_unlock_sock(sk);
+ dst_release(odst);
}
+EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
-static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
- int how)
+void ipv4_redirect(struct sk_buff *skb, struct net *net,
+ int oif, u8 protocol)
{
- struct rtable *rt = (struct rtable *) dst;
- struct in_device *idev = rt->idev;
- if (dev != &loopback_dev && idev && idev->dev == dev) {
- struct in_device *loopback_idev = in_dev_get(&loopback_dev);
- if (loopback_idev) {
- rt->idev = loopback_idev;
- in_dev_put(idev);
- }
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
+ struct flowi4 fl4;
+ struct rtable *rt;
+
+ __build_flow_key(net, &fl4, NULL, iph, oif, iph->tos, protocol, 0, 0);
+ rt = __ip_route_output_key(net, &fl4);
+ if (!IS_ERR(rt)) {
+ __ip_do_redirect(rt, skb, &fl4, false);
+ ip_rt_put(rt);
+ }
+}
+EXPORT_SYMBOL_GPL(ipv4_redirect);
+
+void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
+{
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
+ struct flowi4 fl4;
+ struct rtable *rt;
+ struct net *net = sock_net(sk);
+
+ __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
+ rt = __ip_route_output_key(net, &fl4);
+ if (!IS_ERR(rt)) {
+ __ip_do_redirect(rt, skb, &fl4, false);
+ ip_rt_put(rt);
+ }
+}
+EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
+
+INDIRECT_CALLABLE_SCOPE struct dst_entry *ipv4_dst_check(struct dst_entry *dst,
+ u32 cookie)
+{
+ struct rtable *rt = dst_rtable(dst);
+
+ /* All IPV4 dsts are created with ->obsolete set to the value
+ * DST_OBSOLETE_FORCE_CHK which forces validation calls down
+ * into this function always.
+ *
+ * When a PMTU/redirect information update invalidates a route,
+ * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
+ * DST_OBSOLETE_DEAD.
+ */
+ if (READ_ONCE(dst->obsolete) != DST_OBSOLETE_FORCE_CHK ||
+ rt_is_expired(rt))
+ return NULL;
+ return dst;
+}
+EXPORT_INDIRECT_CALLABLE(ipv4_dst_check);
+
+static void ipv4_send_dest_unreach(struct sk_buff *skb)
+{
+ struct inet_skb_parm parm;
+ struct net_device *dev;
+ int res;
+
+ /* Recompile ip options since IPCB may not be valid anymore.
+ * Also check we have a reasonable ipv4 header.
+ */
+ if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
+ ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
+ return;
+
+ memset(&parm, 0, sizeof(parm));
+ if (ip_hdr(skb)->ihl > 5) {
+ if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
+ return;
+ parm.opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
+
+ rcu_read_lock();
+ dev = skb->dev ? skb->dev : skb_rtable(skb)->dst.dev;
+ res = __ip_options_compile(dev_net(dev), &parm.opt, skb, NULL);
+ rcu_read_unlock();
+
+ if (res)
+ return;
}
+ __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &parm);
}
static void ipv4_link_failure(struct sk_buff *skb)
{
struct rtable *rt;
- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
+ ipv4_send_dest_unreach(skb);
- rt = (struct rtable *) skb->dst;
+ rt = skb_rtable(skb);
if (rt)
- dst_set_expires(&rt->u.dst, 0);
+ dst_set_expires(&rt->dst, 0);
}
-static int ip_rt_bug(struct sk_buff *skb)
+static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
- NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
- skb->dev ? skb->dev->name : "?");
+ pr_debug("%s: %pI4 -> %pI4, %s\n",
+ __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
+ skb->dev ? skb->dev->name : "?");
kfree_skb(skb);
+ WARN_ON(1);
return 0;
}
/*
- We do not cache source address of outgoing interface,
- because it is used only by IP RR, TS and SRR options,
- so that it out of fast path.
-
- BTW remember: "addr" is allowed to be not aligned
- in IP options!
+ * We do not cache source address of outgoing interface,
+ * because it is used only by IP RR, TS and SRR options,
+ * so that it out of fast path.
+ *
+ * BTW remember: "addr" is allowed to be not aligned
+ * in IP options!
*/
-void ip_rt_get_source(u8 *addr, struct rtable *rt)
+void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
__be32 src;
- struct fib_result res;
- if (rt->fl.iif == 0)
- src = rt->rt_src;
- else if (fib_lookup(&rt->fl, &res) == 0) {
- src = FIB_RES_PREFSRC(res);
- fib_res_put(&res);
- } else
- src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
- RT_SCOPE_UNIVERSE);
+ if (rt_is_output_route(rt))
+ src = ip_hdr(skb)->saddr;
+ else {
+ struct fib_result res;
+ struct iphdr *iph = ip_hdr(skb);
+ struct flowi4 fl4 = {
+ .daddr = iph->daddr,
+ .saddr = iph->saddr,
+ .flowi4_dscp = ip4h_dscp(iph),
+ .flowi4_oif = rt->dst.dev->ifindex,
+ .flowi4_iif = skb->dev->ifindex,
+ .flowi4_mark = skb->mark,
+ };
+
+ rcu_read_lock();
+ if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
+ src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
+ else
+ src = inet_select_addr(rt->dst.dev,
+ rt_nexthop(rt, iph->daddr),
+ RT_SCOPE_UNIVERSE);
+ rcu_read_unlock();
+ }
memcpy(addr, &src, 4);
}
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
- if (!(rt->u.dst.tclassid & 0xFFFF))
- rt->u.dst.tclassid |= tag & 0xFFFF;
- if (!(rt->u.dst.tclassid & 0xFFFF0000))
- rt->u.dst.tclassid |= tag & 0xFFFF0000;
+ if (!(rt->dst.tclassid & 0xFFFF))
+ rt->dst.tclassid |= tag & 0xFFFF;
+ if (!(rt->dst.tclassid & 0xFFFF0000))
+ rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif
-static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
+static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
+ unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
+ unsigned int advmss;
+ struct net *net;
+
+ rcu_read_lock();
+ net = dst_dev_net_rcu(dst);
+ advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
+ net->ipv4.ip_rt_min_advmss);
+ rcu_read_unlock();
+
+ return min(advmss, IPV4_MAX_PMTU - header_size);
+}
+
+INDIRECT_CALLABLE_SCOPE unsigned int ipv4_mtu(const struct dst_entry *dst)
+{
+ return ip_dst_mtu_maybe_forward(dst, false);
+}
+EXPORT_INDIRECT_CALLABLE(ipv4_mtu);
+
+static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
+{
+ struct fnhe_hash_bucket *hash;
+ struct fib_nh_exception *fnhe, __rcu **fnhe_p;
+ u32 hval = fnhe_hashfun(daddr);
+
+ spin_lock_bh(&fnhe_lock);
+
+ hash = rcu_dereference_protected(nhc->nhc_exceptions,
+ lockdep_is_held(&fnhe_lock));
+ hash += hval;
+
+ fnhe_p = &hash->chain;
+ fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
+ while (fnhe) {
+ if (fnhe->fnhe_daddr == daddr) {
+ rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
+ fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
+ /* set fnhe_daddr to 0 to ensure it won't bind with
+ * new dsts in rt_bind_exception().
+ */
+ fnhe->fnhe_daddr = 0;
+ fnhe_flush_routes(fnhe);
+ kfree_rcu(fnhe, rcu);
+ break;
+ }
+ fnhe_p = &fnhe->fnhe_next;
+ fnhe = rcu_dereference_protected(fnhe->fnhe_next,
+ lockdep_is_held(&fnhe_lock));
+ }
+
+ spin_unlock_bh(&fnhe_lock);
+}
+
+static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
+ __be32 daddr)
+{
+ struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
+ struct fib_nh_exception *fnhe;
+ u32 hval;
+
+ if (!hash)
+ return NULL;
+
+ hval = fnhe_hashfun(daddr);
+
+ for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
+ fnhe = rcu_dereference(fnhe->fnhe_next)) {
+ if (fnhe->fnhe_daddr == daddr) {
+ if (fnhe->fnhe_expires &&
+ time_after(jiffies, fnhe->fnhe_expires)) {
+ ip_del_fnhe(nhc, daddr);
+ break;
+ }
+ return fnhe;
+ }
+ }
+ return NULL;
+}
+
+/* MTU selection:
+ * 1. mtu on route is locked - use it
+ * 2. mtu from nexthop exception
+ * 3. mtu from egress device
+ */
+
+u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
+{
+ struct fib_nh_common *nhc = res->nhc;
+ struct net_device *dev = nhc->nhc_dev;
struct fib_info *fi = res->fi;
+ u32 mtu = 0;
+
+ if (READ_ONCE(dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu) ||
+ fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
+ mtu = fi->fib_mtu;
+
+ if (likely(!mtu)) {
+ struct fib_nh_exception *fnhe;
+
+ fnhe = find_exception(nhc, daddr);
+ if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
+ mtu = fnhe->fnhe_pmtu;
+ }
+
+ if (likely(!mtu))
+ mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
+
+ return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
+}
+
+static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
+ __be32 daddr, const bool do_cache)
+{
+ bool ret = false;
+
+ spin_lock_bh(&fnhe_lock);
+
+ if (daddr == fnhe->fnhe_daddr) {
+ struct rtable __rcu **porig;
+ struct rtable *orig;
+ int genid = fnhe_genid(dev_net(rt->dst.dev));
+
+ if (rt_is_input_route(rt))
+ porig = &fnhe->fnhe_rth_input;
+ else
+ porig = &fnhe->fnhe_rth_output;
+ orig = rcu_dereference(*porig);
+
+ if (fnhe->fnhe_genid != genid) {
+ fnhe->fnhe_genid = genid;
+ fnhe->fnhe_gw = 0;
+ fnhe->fnhe_pmtu = 0;
+ fnhe->fnhe_expires = 0;
+ fnhe->fnhe_mtu_locked = false;
+ fnhe_flush_routes(fnhe);
+ orig = NULL;
+ }
+ fill_route_from_fnhe(rt, fnhe);
+ if (!rt->rt_gw4) {
+ rt->rt_gw4 = daddr;
+ rt->rt_gw_family = AF_INET;
+ }
+
+ if (do_cache) {
+ dst_hold(&rt->dst);
+ rcu_assign_pointer(*porig, rt);
+ if (orig) {
+ dst_dev_put(&orig->dst);
+ dst_release(&orig->dst);
+ }
+ ret = true;
+ }
+
+ fnhe->fnhe_stamp = jiffies;
+ }
+ spin_unlock_bh(&fnhe_lock);
+
+ return ret;
+}
+
+static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
+{
+ struct rtable *orig, *prev, **p;
+ bool ret = true;
+
+ if (rt_is_input_route(rt)) {
+ p = (struct rtable **)&nhc->nhc_rth_input;
+ } else {
+ p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
+ }
+ orig = *p;
+
+ /* hold dst before doing cmpxchg() to avoid race condition
+ * on this dst
+ */
+ dst_hold(&rt->dst);
+ prev = cmpxchg(p, orig, rt);
+ if (prev == orig) {
+ if (orig) {
+ rt_add_uncached_list(orig);
+ dst_release(&orig->dst);
+ }
+ } else {
+ dst_release(&rt->dst);
+ ret = false;
+ }
+
+ return ret;
+}
+
+struct uncached_list {
+ spinlock_t lock;
+ struct list_head head;
+};
+
+static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
+
+void rt_add_uncached_list(struct rtable *rt)
+{
+ struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
+
+ rt->dst.rt_uncached_list = ul;
+
+ spin_lock_bh(&ul->lock);
+ list_add_tail(&rt->dst.rt_uncached, &ul->head);
+ spin_unlock_bh(&ul->lock);
+}
+
+void rt_del_uncached_list(struct rtable *rt)
+{
+ if (!list_empty(&rt->dst.rt_uncached)) {
+ struct uncached_list *ul = rt->dst.rt_uncached_list;
+
+ spin_lock_bh(&ul->lock);
+ list_del_init(&rt->dst.rt_uncached);
+ spin_unlock_bh(&ul->lock);
+ }
+}
+
+static void ipv4_dst_destroy(struct dst_entry *dst)
+{
+ ip_dst_metrics_put(dst);
+ rt_del_uncached_list(dst_rtable(dst));
+}
+
+void rt_flush_dev(struct net_device *dev)
+{
+ struct rtable *rt, *safe;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
+
+ if (list_empty(&ul->head))
+ continue;
+
+ spin_lock_bh(&ul->lock);
+ list_for_each_entry_safe(rt, safe, &ul->head, dst.rt_uncached) {
+ if (rt->dst.dev != dev)
+ continue;
+ rt->dst.dev = blackhole_netdev;
+ netdev_ref_replace(dev, blackhole_netdev,
+ &rt->dst.dev_tracker, GFP_ATOMIC);
+ list_del_init(&rt->dst.rt_uncached);
+ }
+ spin_unlock_bh(&ul->lock);
+ }
+}
+
+static bool rt_cache_valid(const struct rtable *rt)
+{
+ return rt &&
+ READ_ONCE(rt->dst.obsolete) == DST_OBSOLETE_FORCE_CHK &&
+ !rt_is_expired(rt);
+}
+
+static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
+ const struct fib_result *res,
+ struct fib_nh_exception *fnhe,
+ struct fib_info *fi, u16 type, u32 itag,
+ const bool do_cache)
+{
+ bool cached = false;
if (fi) {
- if (FIB_RES_GW(*res) &&
- FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
- rt->rt_gateway = FIB_RES_GW(*res);
- memcpy(rt->u.dst.metrics, fi->fib_metrics,
- sizeof(rt->u.dst.metrics));
- if (fi->fib_mtu == 0) {
- rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
- if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
- rt->rt_gateway != rt->rt_dst &&
- rt->u.dst.dev->mtu > 576)
- rt->u.dst.metrics[RTAX_MTU-1] = 576;
+ struct fib_nh_common *nhc = FIB_RES_NHC(*res);
+
+ if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
+ rt->rt_uses_gateway = 1;
+ rt->rt_gw_family = nhc->nhc_gw_family;
+ /* only INET and INET6 are supported */
+ if (likely(nhc->nhc_gw_family == AF_INET))
+ rt->rt_gw4 = nhc->nhc_gw.ipv4;
+ else
+ rt->rt_gw6 = nhc->nhc_gw.ipv6;
+ }
+
+ ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ if (nhc->nhc_family == AF_INET) {
+ struct fib_nh *nh;
+
+ nh = container_of(nhc, struct fib_nh, nh_common);
+ rt->dst.tclassid = nh->nh_tclassid;
}
-#ifdef CONFIG_NET_CLS_ROUTE
- rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
+ rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
+ if (unlikely(fnhe))
+ cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
+ else if (do_cache)
+ cached = rt_cache_route(nhc, rt);
+ if (unlikely(!cached)) {
+ /* Routes we intend to cache in nexthop exception or
+ * FIB nexthop have the DST_NOCACHE bit clear.
+ * However, if we are unsuccessful at storing this
+ * route into the cache we really need to set it.
+ */
+ if (!rt->rt_gw4) {
+ rt->rt_gw_family = AF_INET;
+ rt->rt_gw4 = daddr;
+ }
+ rt_add_uncached_list(rt);
+ }
} else
- rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
-
- if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
- rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
- if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
- rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
- if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
- rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
- ip_rt_min_advmss);
- if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
- rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
-
-#ifdef CONFIG_NET_CLS_ROUTE
+ rt_add_uncached_list(rt);
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
- set_class_tag(rt, fib_rules_tclass(res));
+ set_class_tag(rt, res->tclassid);
#endif
set_class_tag(rt, itag);
#endif
- rt->rt_type = res->type;
}
-static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
- u8 tos, struct net_device *dev, int our)
+struct rtable *rt_dst_alloc(struct net_device *dev,
+ unsigned int flags, u16 type,
+ bool noxfrm)
{
- unsigned hash;
- struct rtable *rth;
- __be32 spec_dst;
- struct in_device *in_dev = in_dev_get(dev);
- u32 itag = 0;
+ struct rtable *rt;
+
+ rt = dst_alloc(&ipv4_dst_ops, dev, DST_OBSOLETE_FORCE_CHK,
+ (noxfrm ? DST_NOXFRM : 0));
+
+ if (rt) {
+ rt->rt_genid = rt_genid_ipv4(dev_net(dev));
+ rt->rt_flags = flags;
+ rt->rt_type = type;
+ rt->rt_is_input = 0;
+ rt->rt_iif = 0;
+ rt->rt_pmtu = 0;
+ rt->rt_mtu_locked = 0;
+ rt->rt_uses_gateway = 0;
+ rt->rt_gw_family = 0;
+ rt->rt_gw4 = 0;
+
+ rt->dst.output = ip_output;
+ if (flags & RTCF_LOCAL)
+ rt->dst.input = ip_local_deliver;
+ }
+
+ return rt;
+}
+EXPORT_SYMBOL(rt_dst_alloc);
+
+struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
+{
+ struct rtable *new_rt;
+
+ new_rt = dst_alloc(&ipv4_dst_ops, dev, DST_OBSOLETE_FORCE_CHK,
+ rt->dst.flags);
+
+ if (new_rt) {
+ new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
+ new_rt->rt_flags = rt->rt_flags;
+ new_rt->rt_type = rt->rt_type;
+ new_rt->rt_is_input = rt->rt_is_input;
+ new_rt->rt_iif = rt->rt_iif;
+ new_rt->rt_pmtu = rt->rt_pmtu;
+ new_rt->rt_mtu_locked = rt->rt_mtu_locked;
+ new_rt->rt_gw_family = rt->rt_gw_family;
+ if (rt->rt_gw_family == AF_INET)
+ new_rt->rt_gw4 = rt->rt_gw4;
+ else if (rt->rt_gw_family == AF_INET6)
+ new_rt->rt_gw6 = rt->rt_gw6;
+
+ new_rt->dst.input = READ_ONCE(rt->dst.input);
+ new_rt->dst.output = READ_ONCE(rt->dst.output);
+ new_rt->dst.error = rt->dst.error;
+ new_rt->dst.lastuse = jiffies;
+ new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
+ }
+ return new_rt;
+}
+EXPORT_SYMBOL(rt_dst_clone);
+
+/* called in rcu_read_lock() section */
+enum skb_drop_reason
+ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+ dscp_t dscp, struct net_device *dev,
+ struct in_device *in_dev, u32 *itag)
+{
+ enum skb_drop_reason reason;
/* Primary sanity checks. */
+ if (!in_dev)
+ return SKB_DROP_REASON_NOT_SPECIFIED;
- if (in_dev == NULL)
- return -EINVAL;
+ if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
+ return SKB_DROP_REASON_IP_INVALID_SOURCE;
+
+ if (skb->protocol != htons(ETH_P_IP))
+ return SKB_DROP_REASON_INVALID_PROTO;
- if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
- skb->protocol != htons(ETH_P_IP))
- goto e_inval;
+ if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
+ return SKB_DROP_REASON_IP_LOCALNET;
- if (ZERONET(saddr)) {
- if (!LOCAL_MCAST(daddr))
- goto e_inval;
- spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
- } else if (fib_validate_source(saddr, 0, tos, 0,
- dev, &spec_dst, &itag) < 0)
- goto e_inval;
+ if (ipv4_is_zeronet(saddr)) {
+ if (!ipv4_is_local_multicast(daddr) &&
+ ip_hdr(skb)->protocol != IPPROTO_IGMP)
+ return SKB_DROP_REASON_IP_INVALID_SOURCE;
+ } else {
+ reason = fib_validate_source_reason(skb, saddr, 0, dscp, 0,
+ dev, in_dev, itag);
+ if (reason)
+ return reason;
+ }
+ return SKB_NOT_DROPPED_YET;
+}
- rth = dst_alloc(&ipv4_dst_ops);
+/* called in rcu_read_lock() section */
+static enum skb_drop_reason
+ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+ dscp_t dscp, struct net_device *dev, int our)
+{
+ struct in_device *in_dev = __in_dev_get_rcu(dev);
+ unsigned int flags = RTCF_MULTICAST;
+ enum skb_drop_reason reason;
+ struct rtable *rth;
+ u32 itag = 0;
+
+ reason = ip_mc_validate_source(skb, daddr, saddr, dscp, dev, in_dev,
+ &itag);
+ if (reason)
+ return reason;
+
+ if (our)
+ flags |= RTCF_LOCAL;
+
+ if (IN_DEV_ORCONF(in_dev, NOPOLICY))
+ IPCB(skb)->flags |= IPSKB_NOPOLICY;
+
+ rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
+ false);
if (!rth)
- goto e_nobufs;
+ return SKB_DROP_REASON_NOMEM;
- rth->u.dst.output= ip_rt_bug;
-
- atomic_set(&rth->u.dst.__refcnt, 1);
- rth->u.dst.flags= DST_HOST;
- if (in_dev->cnf.no_policy)
- rth->u.dst.flags |= DST_NOPOLICY;
- rth->fl.fl4_dst = daddr;
- rth->rt_dst = daddr;
- rth->fl.fl4_tos = tos;
- rth->fl.mark = skb->mark;
- rth->fl.fl4_src = saddr;
- rth->rt_src = saddr;
-#ifdef CONFIG_NET_CLS_ROUTE
- rth->u.dst.tclassid = itag;
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ rth->dst.tclassid = itag;
#endif
- rth->rt_iif =
- rth->fl.iif = dev->ifindex;
- rth->u.dst.dev = &loopback_dev;
- dev_hold(rth->u.dst.dev);
- rth->idev = in_dev_get(rth->u.dst.dev);
- rth->fl.oif = 0;
- rth->rt_gateway = daddr;
- rth->rt_spec_dst= spec_dst;
- rth->rt_type = RTN_MULTICAST;
- rth->rt_flags = RTCF_MULTICAST;
- if (our) {
- rth->u.dst.input= ip_local_deliver;
- rth->rt_flags |= RTCF_LOCAL;
- }
+ rth->dst.output = ip_rt_bug;
+ rth->rt_is_input= 1;
#ifdef CONFIG_IP_MROUTE
- if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
- rth->u.dst.input = ip_mr_input;
+ if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
+ rth->dst.input = ip_mr_input;
#endif
RT_CACHE_STAT_INC(in_slow_mc);
- in_dev_put(in_dev);
- hash = rt_hash(daddr, saddr, dev->ifindex);
- return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
-
-e_nobufs:
- in_dev_put(in_dev);
- return -ENOBUFS;
-
-e_inval:
- in_dev_put(in_dev);
- return -EINVAL;
+ skb_dst_drop(skb);
+ skb_dst_set(skb, &rth->dst);
+ return SKB_NOT_DROPPED_YET;
}
@@ -1695,235 +1795,486 @@ static void ip_handle_martian_source(struct net_device *dev,
* RFC1812 recommendation, if source is martian,
* the only hint is MAC header.
*/
- printk(KERN_WARNING "martian source %u.%u.%u.%u from "
- "%u.%u.%u.%u, on dev %s\n",
- NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
- if (dev->hard_header_len && skb->mac.raw) {
- int i;
- unsigned char *p = skb->mac.raw;
- printk(KERN_WARNING "ll header: ");
- for (i = 0; i < dev->hard_header_len; i++, p++) {
- printk("%02x", *p);
- if (i < (dev->hard_header_len - 1))
- printk(":");
- }
- printk("\n");
+ pr_warn("martian source %pI4 from %pI4, on dev %s\n",
+ &daddr, &saddr, dev->name);
+ if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
+ print_hex_dump(KERN_WARNING, "ll header: ",
+ DUMP_PREFIX_OFFSET, 16, 1,
+ skb_mac_header(skb),
+ dev->hard_header_len, false);
}
}
#endif
}
-static inline int __mkroute_input(struct sk_buff *skb,
- struct fib_result* res,
- struct in_device *in_dev,
- __be32 daddr, __be32 saddr, u32 tos,
- struct rtable **result)
+/* called in rcu_read_lock() section */
+static enum skb_drop_reason
+__mkroute_input(struct sk_buff *skb, const struct fib_result *res,
+ struct in_device *in_dev, __be32 daddr,
+ __be32 saddr, dscp_t dscp)
{
-
+ enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
+ struct fib_nh_common *nhc = FIB_RES_NHC(*res);
+ struct net_device *dev = nhc->nhc_dev;
+ struct fib_nh_exception *fnhe;
struct rtable *rth;
int err;
struct in_device *out_dev;
- unsigned flags = 0;
- __be32 spec_dst;
- u32 itag;
+ bool do_cache;
+ u32 itag = 0;
/* get a working reference to the output device */
- out_dev = in_dev_get(FIB_RES_DEV(*res));
- if (out_dev == NULL) {
- if (net_ratelimit())
- printk(KERN_CRIT "Bug in ip_route_input" \
- "_slow(). Please, report\n");
- return -EINVAL;
+ out_dev = __in_dev_get_rcu(dev);
+ if (!out_dev) {
+ net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
+ return reason;
}
-
- err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
- in_dev->dev, &spec_dst, &itag);
+ err = fib_validate_source(skb, saddr, daddr, dscp, FIB_RES_OIF(*res),
+ in_dev->dev, in_dev, &itag);
if (err < 0) {
- ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
+ reason = -err;
+ ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
saddr);
-
- err = -EINVAL;
+
goto cleanup;
}
- if (err)
- flags |= RTCF_DIRECTSRC;
+ do_cache = res->fi && !itag;
+ if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
+ skb->protocol == htons(ETH_P_IP)) {
+ __be32 gw;
- if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
- (IN_DEV_SHARED_MEDIA(out_dev) ||
- inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
- flags |= RTCF_DOREDIRECT;
+ gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
+ if (IN_DEV_SHARED_MEDIA(out_dev) ||
+ inet_addr_onlink(out_dev, saddr, gw))
+ IPCB(skb)->flags |= IPSKB_DOREDIRECT;
+ }
if (skb->protocol != htons(ETH_P_IP)) {
/* Not IP (i.e. ARP). Do not create route, if it is
* invalid for proxy arp. DNAT routes are always valid.
+ *
+ * Proxy arp feature have been extended to allow, ARP
+ * replies back to the same interface, to support
+ * Private VLAN switch technologies. See arp.c.
*/
- if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
- err = -EINVAL;
+ if (out_dev == in_dev &&
+ IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
+ reason = SKB_DROP_REASON_ARP_PVLAN_DISABLE;
goto cleanup;
}
}
+ if (IN_DEV_ORCONF(in_dev, NOPOLICY))
+ IPCB(skb)->flags |= IPSKB_NOPOLICY;
- rth = dst_alloc(&ipv4_dst_ops);
+ fnhe = find_exception(nhc, daddr);
+ if (do_cache) {
+ if (fnhe)
+ rth = rcu_dereference(fnhe->fnhe_rth_input);
+ else
+ rth = rcu_dereference(nhc->nhc_rth_input);
+ if (rt_cache_valid(rth)) {
+ skb_dst_set_noref(skb, &rth->dst);
+ goto out;
+ }
+ }
+
+ rth = rt_dst_alloc(out_dev->dev, 0, res->type,
+ IN_DEV_ORCONF(out_dev, NOXFRM));
if (!rth) {
- err = -ENOBUFS;
+ reason = SKB_DROP_REASON_NOMEM;
goto cleanup;
}
- atomic_set(&rth->u.dst.__refcnt, 1);
- rth->u.dst.flags= DST_HOST;
-#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
- if (res->fi->fib_nhs > 1)
- rth->u.dst.flags |= DST_BALANCED;
-#endif
- if (in_dev->cnf.no_policy)
- rth->u.dst.flags |= DST_NOPOLICY;
- if (in_dev->cnf.no_xfrm)
- rth->u.dst.flags |= DST_NOXFRM;
- rth->fl.fl4_dst = daddr;
- rth->rt_dst = daddr;
- rth->fl.fl4_tos = tos;
- rth->fl.mark = skb->mark;
- rth->fl.fl4_src = saddr;
- rth->rt_src = saddr;
- rth->rt_gateway = daddr;
- rth->rt_iif =
- rth->fl.iif = in_dev->dev->ifindex;
- rth->u.dst.dev = (out_dev)->dev;
- dev_hold(rth->u.dst.dev);
- rth->idev = in_dev_get(rth->u.dst.dev);
- rth->fl.oif = 0;
- rth->rt_spec_dst= spec_dst;
-
- rth->u.dst.input = ip_forward;
- rth->u.dst.output = ip_output;
-
- rt_set_nexthop(rth, res, itag);
-
- rth->rt_flags = flags;
-
- *result = rth;
- err = 0;
- cleanup:
- /* release the working reference to the output device */
- in_dev_put(out_dev);
- return err;
-}
+ rth->rt_is_input = 1;
+ RT_CACHE_STAT_INC(in_slow_tot);
-static inline int ip_mkroute_input_def(struct sk_buff *skb,
- struct fib_result* res,
- const struct flowi *fl,
- struct in_device *in_dev,
- __be32 daddr, __be32 saddr, u32 tos)
+ rth->dst.input = ip_forward;
+
+ rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
+ do_cache);
+ lwtunnel_set_redirect(&rth->dst);
+ skb_dst_set(skb, &rth->dst);
+out:
+ reason = SKB_NOT_DROPPED_YET;
+cleanup:
+ return reason;
+}
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+/* To make ICMP packets follow the right flow, the multipath hash is
+ * calculated from the inner IP addresses.
+ */
+static void ip_multipath_l3_keys(const struct sk_buff *skb,
+ struct flow_keys *hash_keys)
{
- struct rtable* rth = NULL;
- int err;
- unsigned hash;
+ const struct iphdr *outer_iph = ip_hdr(skb);
+ const struct iphdr *key_iph = outer_iph;
+ const struct iphdr *inner_iph;
+ const struct icmphdr *icmph;
+ struct iphdr _inner_iph;
+ struct icmphdr _icmph;
+
+ if (likely(outer_iph->protocol != IPPROTO_ICMP))
+ goto out;
+
+ if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
+ goto out;
+
+ icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
+ &_icmph);
+ if (!icmph)
+ goto out;
+
+ if (!icmp_is_err(icmph->type))
+ goto out;
+
+ inner_iph = skb_header_pointer(skb,
+ outer_iph->ihl * 4 + sizeof(_icmph),
+ sizeof(_inner_iph), &_inner_iph);
+ if (!inner_iph)
+ goto out;
+
+ key_iph = inner_iph;
+out:
+ hash_keys->addrs.v4addrs.src = key_iph->saddr;
+ hash_keys->addrs.v4addrs.dst = key_iph->daddr;
+}
+
+static u32 fib_multipath_custom_hash_outer(const struct net *net,
+ const struct sk_buff *skb,
+ bool *p_has_inner)
+{
+ u32 hash_fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields);
+ struct flow_keys keys, hash_keys;
+
+ if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK))
+ return 0;
+
+ memset(&hash_keys, 0, sizeof(hash_keys));
+ skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_ENCAP);
+
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP)
+ hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP)
+ hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO)
+ hash_keys.basic.ip_proto = keys.basic.ip_proto;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT)
+ hash_keys.ports.src = keys.ports.src;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT)
+ hash_keys.ports.dst = keys.ports.dst;
+
+ *p_has_inner = !!(keys.control.flags & FLOW_DIS_ENCAPSULATION);
+ return fib_multipath_hash_from_keys(net, &hash_keys);
+}
+
+static u32 fib_multipath_custom_hash_inner(const struct net *net,
+ const struct sk_buff *skb,
+ bool has_inner)
+{
+ u32 hash_fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields);
+ struct flow_keys keys, hash_keys;
+
+ /* We assume the packet carries an encapsulation, but if none was
+ * encountered during dissection of the outer flow, then there is no
+ * point in calling the flow dissector again.
+ */
+ if (!has_inner)
+ return 0;
+
+ if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_MASK))
+ return 0;
+
+ memset(&hash_keys, 0, sizeof(hash_keys));
+ skb_flow_dissect_flow_keys(skb, &keys, 0);
+
+ if (!(keys.control.flags & FLOW_DIS_ENCAPSULATION))
+ return 0;
+
+ if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP)
+ hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP)
+ hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
+ } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP)
+ hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP)
+ hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL)
+ hash_keys.tags.flow_label = keys.tags.flow_label;
+ }
+
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO)
+ hash_keys.basic.ip_proto = keys.basic.ip_proto;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT)
+ hash_keys.ports.src = keys.ports.src;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT)
+ hash_keys.ports.dst = keys.ports.dst;
+
+ return fib_multipath_hash_from_keys(net, &hash_keys);
+}
+
+static u32 fib_multipath_custom_hash_skb(const struct net *net,
+ const struct sk_buff *skb)
+{
+ u32 mhash, mhash_inner;
+ bool has_inner = true;
+ mhash = fib_multipath_custom_hash_outer(net, skb, &has_inner);
+ mhash_inner = fib_multipath_custom_hash_inner(net, skb, has_inner);
+
+ return jhash_2words(mhash, mhash_inner, 0);
+}
+
+static u32 fib_multipath_custom_hash_fl4(const struct net *net,
+ const struct flowi4 *fl4)
+{
+ u32 hash_fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields);
+ struct flow_keys hash_keys;
+
+ if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK))
+ return 0;
+
+ memset(&hash_keys, 0, sizeof(hash_keys));
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP)
+ hash_keys.addrs.v4addrs.src = fl4->saddr;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP)
+ hash_keys.addrs.v4addrs.dst = fl4->daddr;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO)
+ hash_keys.basic.ip_proto = fl4->flowi4_proto;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) {
+ if (fl4->flowi4_flags & FLOWI_FLAG_ANY_SPORT)
+ hash_keys.ports.src = (__force __be16)get_random_u16();
+ else
+ hash_keys.ports.src = fl4->fl4_sport;
+ }
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT)
+ hash_keys.ports.dst = fl4->fl4_dport;
+
+ return fib_multipath_hash_from_keys(net, &hash_keys);
+}
+
+/* if skb is set it will be used and fl4 can be NULL */
+int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
+ const struct sk_buff *skb, struct flow_keys *flkeys)
+{
+ u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
+ struct flow_keys hash_keys;
+ u32 mhash = 0;
+
+ switch (READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_policy)) {
+ case 0:
+ memset(&hash_keys, 0, sizeof(hash_keys));
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+ if (skb) {
+ ip_multipath_l3_keys(skb, &hash_keys);
+ } else {
+ hash_keys.addrs.v4addrs.src = fl4->saddr;
+ hash_keys.addrs.v4addrs.dst = fl4->daddr;
+ }
+ mhash = fib_multipath_hash_from_keys(net, &hash_keys);
+ break;
+ case 1:
+ /* skb is currently provided only when forwarding */
+ if (skb) {
+ unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
+ struct flow_keys keys;
+
+ /* short-circuit if we already have L4 hash present */
+ if (skb->l4_hash)
+ return skb_get_hash_raw(skb) >> 1;
+
+ memset(&hash_keys, 0, sizeof(hash_keys));
+
+ if (!flkeys) {
+ skb_flow_dissect_flow_keys(skb, &keys, flag);
+ flkeys = &keys;
+ }
+
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+ hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
+ hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
+ hash_keys.ports.src = flkeys->ports.src;
+ hash_keys.ports.dst = flkeys->ports.dst;
+ hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
+ } else {
+ memset(&hash_keys, 0, sizeof(hash_keys));
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+ hash_keys.addrs.v4addrs.src = fl4->saddr;
+ hash_keys.addrs.v4addrs.dst = fl4->daddr;
+ if (fl4->flowi4_flags & FLOWI_FLAG_ANY_SPORT)
+ hash_keys.ports.src = (__force __be16)get_random_u16();
+ else
+ hash_keys.ports.src = fl4->fl4_sport;
+ hash_keys.ports.dst = fl4->fl4_dport;
+ hash_keys.basic.ip_proto = fl4->flowi4_proto;
+ }
+ mhash = fib_multipath_hash_from_keys(net, &hash_keys);
+ break;
+ case 2:
+ memset(&hash_keys, 0, sizeof(hash_keys));
+ /* skb is currently provided only when forwarding */
+ if (skb) {
+ struct flow_keys keys;
+
+ skb_flow_dissect_flow_keys(skb, &keys, 0);
+ /* Inner can be v4 or v6 */
+ if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+ hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
+ hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
+ } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+ hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
+ hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
+ hash_keys.tags.flow_label = keys.tags.flow_label;
+ hash_keys.basic.ip_proto = keys.basic.ip_proto;
+ } else {
+ /* Same as case 0 */
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+ ip_multipath_l3_keys(skb, &hash_keys);
+ }
+ } else {
+ /* Same as case 0 */
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+ hash_keys.addrs.v4addrs.src = fl4->saddr;
+ hash_keys.addrs.v4addrs.dst = fl4->daddr;
+ }
+ mhash = fib_multipath_hash_from_keys(net, &hash_keys);
+ break;
+ case 3:
+ if (skb)
+ mhash = fib_multipath_custom_hash_skb(net, skb);
+ else
+ mhash = fib_multipath_custom_hash_fl4(net, fl4);
+ break;
+ }
+
+ if (multipath_hash)
+ mhash = jhash_2words(mhash, multipath_hash, 0);
+
+ return mhash >> 1;
+}
+#endif /* CONFIG_IP_ROUTE_MULTIPATH */
+
+static enum skb_drop_reason
+ip_mkroute_input(struct sk_buff *skb, struct fib_result *res,
+ struct in_device *in_dev, __be32 daddr,
+ __be32 saddr, dscp_t dscp, struct flow_keys *hkeys)
+{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
- if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
- fib_select_multipath(fl, res);
+ if (res->fi && fib_info_num_path(res->fi) > 1) {
+ int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
+
+ fib_select_multipath(res, h, NULL);
+ IPCB(skb)->flags |= IPSKB_MULTIPATH;
+ }
#endif
/* create a routing cache entry */
- err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
- if (err)
- return err;
-
- /* put it into the cache */
- hash = rt_hash(daddr, saddr, fl->iif);
- return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
+ return __mkroute_input(skb, res, in_dev, daddr, saddr, dscp);
}
-static inline int ip_mkroute_input(struct sk_buff *skb,
- struct fib_result* res,
- const struct flowi *fl,
- struct in_device *in_dev,
- __be32 daddr, __be32 saddr, u32 tos)
+/* Implements all the saddr-related checks as ip_route_input_slow(),
+ * assuming daddr is valid and the destination is not a local broadcast one.
+ * Uses the provided hint instead of performing a route lookup.
+ */
+enum skb_drop_reason
+ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+ dscp_t dscp, struct net_device *dev,
+ const struct sk_buff *hint)
{
-#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
- struct rtable* rth = NULL, *rtres;
- unsigned char hop, hopcount;
- int err = -EINVAL;
- unsigned int hash;
+ enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
+ struct in_device *in_dev = __in_dev_get_rcu(dev);
+ struct rtable *rt = skb_rtable(hint);
+ struct net *net = dev_net(dev);
+ u32 tag = 0;
- if (res->fi)
- hopcount = res->fi->fib_nhs;
- else
- hopcount = 1;
-
- /* distinguish between multipath and singlepath */
- if (hopcount < 2)
- return ip_mkroute_input_def(skb, res, fl, in_dev, daddr,
- saddr, tos);
-
- /* add all alternatives to the routing cache */
- for (hop = 0; hop < hopcount; hop++) {
- res->nh_sel = hop;
-
- /* put reference to previous result */
- if (hop)
- ip_rt_put(rtres);
-
- /* create a routing cache entry */
- err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos,
- &rth);
- if (err)
- return err;
+ if (!in_dev)
+ return reason;
- /* put it into the cache */
- hash = rt_hash(daddr, saddr, fl->iif);
- err = rt_intern_hash(hash, rth, &rtres);
- if (err)
- return err;
+ if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) {
+ reason = SKB_DROP_REASON_IP_INVALID_SOURCE;
+ goto martian_source;
+ }
- /* forward hop information to multipath impl. */
- multipath_set_nhinfo(rth,
- FIB_RES_NETWORK(*res),
- FIB_RES_NETMASK(*res),
- res->prefixlen,
- &FIB_RES_NH(*res));
+ if (ipv4_is_zeronet(saddr)) {
+ reason = SKB_DROP_REASON_IP_INVALID_SOURCE;
+ goto martian_source;
}
- skb->dst = &rtres->u.dst;
- return err;
-#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
- return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos);
-#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
+
+ if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) {
+ reason = SKB_DROP_REASON_IP_LOCALNET;
+ goto martian_source;
+ }
+
+ if (!(rt->rt_flags & RTCF_LOCAL))
+ goto skip_validate_source;
+
+ reason = fib_validate_source_reason(skb, saddr, daddr, dscp, 0, dev,
+ in_dev, &tag);
+ if (reason)
+ goto martian_source;
+
+skip_validate_source:
+ skb_dst_copy(skb, hint);
+ return SKB_NOT_DROPPED_YET;
+
+martian_source:
+ ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
+ return reason;
}
+/* get device for dst_alloc with local routes */
+static struct net_device *ip_rt_get_dev(struct net *net,
+ const struct fib_result *res)
+{
+ struct fib_nh_common *nhc = res->fi ? res->nhc : NULL;
+ struct net_device *dev = NULL;
+
+ if (nhc)
+ dev = l3mdev_master_dev_rcu(nhc->nhc_dev);
+
+ return dev ? : net->loopback_dev;
+}
/*
* NOTE. We drop all the packets that has local source
* addresses, because every properly looped back packet
* must have correct destination already attached by output routine.
+ * Changes in the enforced policies must be applied also to
+ * ip_route_use_hint().
*
* Such approach solves two big problems:
* 1. Not simplex devices are handled properly.
* 2. IP spoofing attempts are filtered with 100% of guarantee.
+ * called with rcu_read_lock()
*/
-static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
- u8 tos, struct net_device *dev)
+static enum skb_drop_reason
+ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+ dscp_t dscp, struct net_device *dev,
+ struct fib_result *res)
{
- struct fib_result res;
- struct in_device *in_dev = in_dev_get(dev);
- struct flowi fl = { .nl_u = { .ip4_u =
- { .daddr = daddr,
- .saddr = saddr,
- .tos = tos,
- .scope = RT_SCOPE_UNIVERSE,
- } },
- .mark = skb->mark,
- .iif = dev->ifindex };
- unsigned flags = 0;
- u32 itag = 0;
- struct rtable * rth;
- unsigned hash;
- __be32 spec_dst;
+ enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
+ struct in_device *in_dev = __in_dev_get_rcu(dev);
+ struct flow_keys *flkeys = NULL, _flkeys;
+ struct net *net = dev_net(dev);
+ struct ip_tunnel_info *tun_info;
int err = -EINVAL;
- int free_res = 0;
+ unsigned int flags = 0;
+ u32 itag = 0;
+ struct rtable *rth;
+ struct flowi4 fl4;
+ bool do_cache = true;
/* IP on this device is disabled. */
@@ -1931,130 +2282,190 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
goto out;
/* Check for the most weird martians, which can be not detected
- by fib_lookup.
+ * by fib_lookup.
*/
- if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
+ tun_info = skb_tunnel_info(skb);
+ if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
+ fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
+ else
+ fl4.flowi4_tun_key.tun_id = 0;
+ skb_dst_drop(skb);
+
+ if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) {
+ reason = SKB_DROP_REASON_IP_INVALID_SOURCE;
goto martian_source;
+ }
- if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
+ res->fi = NULL;
+ res->table = NULL;
+ if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
goto brd_input;
/* Accept zero addresses only to limited broadcast;
* I even do not know to fix it or not. Waiting for complains :-)
*/
- if (ZERONET(saddr))
+ if (ipv4_is_zeronet(saddr)) {
+ reason = SKB_DROP_REASON_IP_INVALID_SOURCE;
goto martian_source;
+ }
- if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
+ if (ipv4_is_zeronet(daddr)) {
+ reason = SKB_DROP_REASON_IP_INVALID_DEST;
goto martian_destination;
+ }
+
+ /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
+ * and call it once if daddr or/and saddr are loopback addresses
+ */
+ if (ipv4_is_loopback(daddr)) {
+ if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) {
+ reason = SKB_DROP_REASON_IP_LOCALNET;
+ goto martian_destination;
+ }
+ } else if (ipv4_is_loopback(saddr)) {
+ if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) {
+ reason = SKB_DROP_REASON_IP_LOCALNET;
+ goto martian_source;
+ }
+ }
/*
* Now we are ready to route packet.
*/
- if ((err = fib_lookup(&fl, &res)) != 0) {
+ fl4.flowi4_l3mdev = 0;
+ fl4.flowi4_oif = 0;
+ fl4.flowi4_iif = dev->ifindex;
+ fl4.flowi4_mark = skb->mark;
+ fl4.flowi4_dscp = dscp;
+ fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
+ fl4.flowi4_flags = 0;
+ fl4.daddr = daddr;
+ fl4.saddr = saddr;
+ fl4.flowi4_uid = sock_net_uid(net, NULL);
+ fl4.flowi4_multipath_hash = 0;
+
+ if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
+ flkeys = &_flkeys;
+ } else {
+ fl4.flowi4_proto = 0;
+ fl4.fl4_sport = 0;
+ fl4.fl4_dport = 0;
+ }
+
+ err = fib_lookup(net, &fl4, res, 0);
+ if (err != 0) {
if (!IN_DEV_FORWARD(in_dev))
- goto e_hostunreach;
+ err = -EHOSTUNREACH;
goto no_route;
}
- free_res = 1;
-
- RT_CACHE_STAT_INC(in_slow_tot);
- if (res.type == RTN_BROADCAST)
+ if (res->type == RTN_BROADCAST) {
+ if (IN_DEV_BFORWARD(in_dev))
+ goto make_route;
+ /* not do cache if bc_forwarding is enabled */
+ if (IPV4_DEVCONF_ALL_RO(net, BC_FORWARDING))
+ do_cache = false;
goto brd_input;
+ }
- if (res.type == RTN_LOCAL) {
- int result;
- result = fib_validate_source(saddr, daddr, tos,
- loopback_dev.ifindex,
- dev, &spec_dst, &itag);
- if (result < 0)
+ err = -EINVAL;
+ if (res->type == RTN_LOCAL) {
+ reason = fib_validate_source_reason(skb, saddr, daddr, dscp,
+ 0, dev, in_dev, &itag);
+ if (reason)
goto martian_source;
- if (result)
- flags |= RTCF_DIRECTSRC;
- spec_dst = daddr;
goto local_input;
}
- if (!IN_DEV_FORWARD(in_dev))
- goto e_hostunreach;
- if (res.type != RTN_UNICAST)
+ if (!IN_DEV_FORWARD(in_dev)) {
+ err = -EHOSTUNREACH;
+ goto no_route;
+ }
+ if (res->type != RTN_UNICAST) {
+ reason = SKB_DROP_REASON_IP_INVALID_DEST;
goto martian_destination;
+ }
- err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
- if (err == -ENOBUFS)
- goto e_nobufs;
- if (err == -EINVAL)
- goto e_inval;
-
-done:
- in_dev_put(in_dev);
- if (free_res)
- fib_res_put(&res);
-out: return err;
+make_route:
+ reason = ip_mkroute_input(skb, res, in_dev, daddr, saddr, dscp,
+ flkeys);
+
+out:
+ return reason;
brd_input:
- if (skb->protocol != htons(ETH_P_IP))
- goto e_inval;
+ if (skb->protocol != htons(ETH_P_IP)) {
+ reason = SKB_DROP_REASON_INVALID_PROTO;
+ goto out;
+ }
- if (ZERONET(saddr))
- spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
- else {
- err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
- &itag);
- if (err < 0)
+ if (!ipv4_is_zeronet(saddr)) {
+ reason = fib_validate_source_reason(skb, saddr, 0, dscp, 0,
+ dev, in_dev, &itag);
+ if (reason)
goto martian_source;
- if (err)
- flags |= RTCF_DIRECTSRC;
}
flags |= RTCF_BROADCAST;
- res.type = RTN_BROADCAST;
+ res->type = RTN_BROADCAST;
RT_CACHE_STAT_INC(in_brd);
local_input:
- rth = dst_alloc(&ipv4_dst_ops);
+ if (IN_DEV_ORCONF(in_dev, NOPOLICY))
+ IPCB(skb)->flags |= IPSKB_NOPOLICY;
+
+ do_cache &= res->fi && !itag;
+ if (do_cache) {
+ struct fib_nh_common *nhc = FIB_RES_NHC(*res);
+
+ rth = rcu_dereference(nhc->nhc_rth_input);
+ if (rt_cache_valid(rth)) {
+ skb_dst_set_noref(skb, &rth->dst);
+ reason = SKB_NOT_DROPPED_YET;
+ goto out;
+ }
+ }
+
+ rth = rt_dst_alloc(ip_rt_get_dev(net, res),
+ flags | RTCF_LOCAL, res->type, false);
if (!rth)
goto e_nobufs;
- rth->u.dst.output= ip_rt_bug;
-
- atomic_set(&rth->u.dst.__refcnt, 1);
- rth->u.dst.flags= DST_HOST;
- if (in_dev->cnf.no_policy)
- rth->u.dst.flags |= DST_NOPOLICY;
- rth->fl.fl4_dst = daddr;
- rth->rt_dst = daddr;
- rth->fl.fl4_tos = tos;
- rth->fl.mark = skb->mark;
- rth->fl.fl4_src = saddr;
- rth->rt_src = saddr;
-#ifdef CONFIG_NET_CLS_ROUTE
- rth->u.dst.tclassid = itag;
+ rth->dst.output= ip_rt_bug;
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ rth->dst.tclassid = itag;
#endif
- rth->rt_iif =
- rth->fl.iif = dev->ifindex;
- rth->u.dst.dev = &loopback_dev;
- dev_hold(rth->u.dst.dev);
- rth->idev = in_dev_get(rth->u.dst.dev);
- rth->rt_gateway = daddr;
- rth->rt_spec_dst= spec_dst;
- rth->u.dst.input= ip_local_deliver;
- rth->rt_flags = flags|RTCF_LOCAL;
- if (res.type == RTN_UNREACHABLE) {
- rth->u.dst.input= ip_error;
- rth->u.dst.error= -err;
- rth->rt_flags &= ~RTCF_LOCAL;
- }
- rth->rt_type = res.type;
- hash = rt_hash(daddr, saddr, fl.iif);
- err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
- goto done;
+ rth->rt_is_input = 1;
+
+ RT_CACHE_STAT_INC(in_slow_tot);
+ if (res->type == RTN_UNREACHABLE) {
+ rth->dst.input= ip_error;
+ rth->dst.error= -err;
+ rth->rt_flags &= ~RTCF_LOCAL;
+ }
+
+ if (do_cache) {
+ struct fib_nh_common *nhc = FIB_RES_NHC(*res);
+
+ rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
+ if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
+ WARN_ON(rth->dst.input == lwtunnel_input);
+ rth->dst.lwtstate->orig_input = rth->dst.input;
+ rth->dst.input = lwtunnel_input;
+ }
+
+ if (unlikely(!rt_cache_route(nhc, rth)))
+ rt_add_uncached_list(rth);
+ }
+ skb_dst_set(skb, &rth->dst);
+ reason = SKB_NOT_DROPPED_YET;
+ goto out;
no_route:
RT_CACHE_STAT_INC(in_no_route);
- spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
- res.type = RTN_UNREACHABLE;
+ res->type = RTN_UNREACHABLE;
+ res->fi = NULL;
+ res->table = NULL;
goto local_input;
/*
@@ -2063,1139 +2474,1326 @@ no_route:
martian_destination:
RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
- if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
- printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
- "%u.%u.%u.%u, dev %s\n",
- NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
+ if (IN_DEV_LOG_MARTIANS(in_dev))
+ net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
+ &daddr, &saddr, dev->name);
#endif
-
-e_hostunreach:
- err = -EHOSTUNREACH;
- goto done;
-
-e_inval:
- err = -EINVAL;
- goto done;
+ goto out;
e_nobufs:
- err = -ENOBUFS;
- goto done;
+ reason = SKB_DROP_REASON_NOMEM;
+ goto out;
martian_source:
ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
- goto e_inval;
+ goto out;
}
-int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
- u8 tos, struct net_device *dev)
+/* called with rcu_read_lock held */
+static enum skb_drop_reason
+ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+ dscp_t dscp, struct net_device *dev,
+ struct fib_result *res)
{
- struct rtable * rth;
- unsigned hash;
- int iif = dev->ifindex;
+ /* Multicast recognition logic is moved from route cache to here.
+ * The problem was that too many Ethernet cards have broken/missing
+ * hardware multicast filters :-( As result the host on multicasting
+ * network acquires a lot of useless route cache entries, sort of
+ * SDR messages from all the world. Now we try to get rid of them.
+ * Really, provided software IP multicast filter is organized
+ * reasonably (at least, hashed), it does not result in a slowdown
+ * comparing with route cache reject entries.
+ * Note, that multicast routers are not affected, because
+ * route cache entry is created eventually.
+ */
+ if (ipv4_is_multicast(daddr)) {
+ enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
+ struct in_device *in_dev = __in_dev_get_rcu(dev);
+ int our = 0;
- tos &= IPTOS_RT_MASK;
- hash = rt_hash(daddr, saddr, iif);
+ if (!in_dev)
+ return reason;
- rcu_read_lock();
- for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
- rth = rcu_dereference(rth->u.rt_next)) {
- if (rth->fl.fl4_dst == daddr &&
- rth->fl.fl4_src == saddr &&
- rth->fl.iif == iif &&
- rth->fl.oif == 0 &&
- rth->fl.mark == skb->mark &&
- rth->fl.fl4_tos == tos) {
- rth->u.dst.lastuse = jiffies;
- dst_hold(&rth->u.dst);
- rth->u.dst.__use++;
- RT_CACHE_STAT_INC(in_hit);
- rcu_read_unlock();
- skb->dst = (struct dst_entry*)rth;
- return 0;
- }
- RT_CACHE_STAT_INC(in_hlist_search);
- }
- rcu_read_unlock();
+ our = ip_check_mc_rcu(in_dev, daddr, saddr,
+ ip_hdr(skb)->protocol);
- /* Multicast recognition logic is moved from route cache to here.
- The problem was that too many Ethernet cards have broken/missing
- hardware multicast filters :-( As result the host on multicasting
- network acquires a lot of useless route cache entries, sort of
- SDR messages from all the world. Now we try to get rid of them.
- Really, provided software IP multicast filter is organized
- reasonably (at least, hashed), it does not result in a slowdown
- comparing with route cache reject entries.
- Note, that multicast routers are not affected, because
- route cache entry is created eventually.
- */
- if (MULTICAST(daddr)) {
- struct in_device *in_dev;
+ /* check l3 master if no match yet */
+ if (!our && netif_is_l3_slave(dev)) {
+ struct in_device *l3_in_dev;
- rcu_read_lock();
- if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
- int our = ip_check_mc(in_dev, daddr, saddr,
- skb->nh.iph->protocol);
- if (our
+ l3_in_dev = __in_dev_get_rcu(skb->dev);
+ if (l3_in_dev)
+ our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
+ ip_hdr(skb)->protocol);
+ }
+
+ if (our
#ifdef CONFIG_IP_MROUTE
- || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
+ ||
+ (!ipv4_is_local_multicast(daddr) &&
+ IN_DEV_MFORWARD(in_dev))
#endif
- ) {
- rcu_read_unlock();
- return ip_route_input_mc(skb, daddr, saddr,
- tos, dev, our);
- }
+ ) {
+ reason = ip_route_input_mc(skb, daddr, saddr, dscp,
+ dev, our);
}
- rcu_read_unlock();
- return -EINVAL;
+ return reason;
}
- return ip_route_input_slow(skb, daddr, saddr, tos, dev);
+
+ return ip_route_input_slow(skb, daddr, saddr, dscp, dev, res);
}
-static inline int __mkroute_output(struct rtable **result,
- struct fib_result* res,
- const struct flowi *fl,
- const struct flowi *oldflp,
- struct net_device *dev_out,
- unsigned flags)
+enum skb_drop_reason ip_route_input_noref(struct sk_buff *skb, __be32 daddr,
+ __be32 saddr, dscp_t dscp,
+ struct net_device *dev)
{
- struct rtable *rth;
+ enum skb_drop_reason reason;
+ struct fib_result res;
+
+ rcu_read_lock();
+ reason = ip_route_input_rcu(skb, daddr, saddr, dscp, dev, &res);
+ rcu_read_unlock();
+
+ return reason;
+}
+EXPORT_SYMBOL(ip_route_input_noref);
+
+/* called with rcu_read_lock() */
+static struct rtable *__mkroute_output(const struct fib_result *res,
+ const struct flowi4 *fl4, int orig_oif,
+ struct net_device *dev_out,
+ unsigned int flags)
+{
+ struct fib_info *fi = res->fi;
+ struct fib_nh_exception *fnhe;
struct in_device *in_dev;
- u32 tos = RT_FL_TOS(oldflp);
- int err = 0;
+ u16 type = res->type;
+ struct rtable *rth;
+ bool do_cache;
- if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
- return -EINVAL;
+ in_dev = __in_dev_get_rcu(dev_out);
+ if (!in_dev)
+ return ERR_PTR(-EINVAL);
- if (fl->fl4_dst == htonl(0xFFFFFFFF))
- res->type = RTN_BROADCAST;
- else if (MULTICAST(fl->fl4_dst))
- res->type = RTN_MULTICAST;
- else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
- return -EINVAL;
+ if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
+ if (ipv4_is_loopback(fl4->saddr) &&
+ !(dev_out->flags & IFF_LOOPBACK) &&
+ !netif_is_l3_master(dev_out))
+ return ERR_PTR(-EINVAL);
+
+ if (ipv4_is_lbcast(fl4->daddr)) {
+ type = RTN_BROADCAST;
+
+ /* reset fi to prevent gateway resolution */
+ fi = NULL;
+ } else if (ipv4_is_multicast(fl4->daddr)) {
+ type = RTN_MULTICAST;
+ } else if (ipv4_is_zeronet(fl4->daddr)) {
+ return ERR_PTR(-EINVAL);
+ }
if (dev_out->flags & IFF_LOOPBACK)
flags |= RTCF_LOCAL;
- /* get work reference to inet device */
- in_dev = in_dev_get(dev_out);
- if (!in_dev)
- return -EINVAL;
-
- if (res->type == RTN_BROADCAST) {
+ do_cache = true;
+ if (type == RTN_BROADCAST) {
flags |= RTCF_BROADCAST | RTCF_LOCAL;
- if (res->fi) {
- fib_info_put(res->fi);
- res->fi = NULL;
- }
- } else if (res->type == RTN_MULTICAST) {
- flags |= RTCF_MULTICAST|RTCF_LOCAL;
- if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
- oldflp->proto))
+ } else if (type == RTN_MULTICAST) {
+ flags |= RTCF_MULTICAST | RTCF_LOCAL;
+ if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
+ fl4->flowi4_proto))
flags &= ~RTCF_LOCAL;
+ else
+ do_cache = false;
/* If multicast route do not exist use
- default one, but do not gateway in this case.
- Yes, it is hack.
+ * default one, but do not gateway in this case.
+ * Yes, it is hack.
*/
- if (res->fi && res->prefixlen < 4) {
- fib_info_put(res->fi);
- res->fi = NULL;
- }
+ if (fi && res->prefixlen < 4)
+ fi = NULL;
+ } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
+ (orig_oif != dev_out->ifindex)) {
+ /* For local routes that require a particular output interface
+ * we do not want to cache the result. Caching the result
+ * causes incorrect behaviour when there are multiple source
+ * addresses on the interface, the end result being that if the
+ * intended recipient is waiting on that interface for the
+ * packet he won't receive it because it will be delivered on
+ * the loopback interface and the IP_PKTINFO ipi_ifindex will
+ * be set to the loopback interface as well.
+ */
+ do_cache = false;
}
+ fnhe = NULL;
+ do_cache &= fi != NULL;
+ if (fi) {
+ struct fib_nh_common *nhc = FIB_RES_NHC(*res);
+ struct rtable __rcu **prth;
+
+ fnhe = find_exception(nhc, fl4->daddr);
+ if (!do_cache)
+ goto add;
+ if (fnhe) {
+ prth = &fnhe->fnhe_rth_output;
+ } else {
+ if (unlikely(fl4->flowi4_flags &
+ FLOWI_FLAG_KNOWN_NH &&
+ !(nhc->nhc_gw_family &&
+ nhc->nhc_scope == RT_SCOPE_LINK))) {
+ do_cache = false;
+ goto add;
+ }
+ prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
+ }
+ rth = rcu_dereference(*prth);
+ if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
+ return rth;
+ }
- rth = dst_alloc(&ipv4_dst_ops);
- if (!rth) {
- err = -ENOBUFS;
- goto cleanup;
- }
+add:
+ rth = rt_dst_alloc(dev_out, flags, type,
+ IN_DEV_ORCONF(in_dev, NOXFRM));
+ if (!rth)
+ return ERR_PTR(-ENOBUFS);
- atomic_set(&rth->u.dst.__refcnt, 1);
- rth->u.dst.flags= DST_HOST;
-#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
- if (res->fi) {
- rth->rt_multipath_alg = res->fi->fib_mp_alg;
- if (res->fi->fib_nhs > 1)
- rth->u.dst.flags |= DST_BALANCED;
- }
-#endif
- if (in_dev->cnf.no_xfrm)
- rth->u.dst.flags |= DST_NOXFRM;
- if (in_dev->cnf.no_policy)
- rth->u.dst.flags |= DST_NOPOLICY;
-
- rth->fl.fl4_dst = oldflp->fl4_dst;
- rth->fl.fl4_tos = tos;
- rth->fl.fl4_src = oldflp->fl4_src;
- rth->fl.oif = oldflp->oif;
- rth->fl.mark = oldflp->mark;
- rth->rt_dst = fl->fl4_dst;
- rth->rt_src = fl->fl4_src;
- rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
- /* get references to the devices that are to be hold by the routing
- cache entry */
- rth->u.dst.dev = dev_out;
- dev_hold(dev_out);
- rth->idev = in_dev_get(dev_out);
- rth->rt_gateway = fl->fl4_dst;
- rth->rt_spec_dst= fl->fl4_src;
-
- rth->u.dst.output=ip_output;
+ rth->rt_iif = orig_oif;
RT_CACHE_STAT_INC(out_slow_tot);
- if (flags & RTCF_LOCAL) {
- rth->u.dst.input = ip_local_deliver;
- rth->rt_spec_dst = fl->fl4_dst;
- }
if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
- rth->rt_spec_dst = fl->fl4_src;
- if (flags & RTCF_LOCAL &&
+ if (flags & RTCF_LOCAL &&
!(dev_out->flags & IFF_LOOPBACK)) {
- rth->u.dst.output = ip_mc_output;
+ rth->dst.output = ip_mc_output;
RT_CACHE_STAT_INC(out_slow_mc);
}
#ifdef CONFIG_IP_MROUTE
- if (res->type == RTN_MULTICAST) {
+ if (type == RTN_MULTICAST) {
if (IN_DEV_MFORWARD(in_dev) &&
- !LOCAL_MCAST(oldflp->fl4_dst)) {
- rth->u.dst.input = ip_mr_input;
- rth->u.dst.output = ip_mc_output;
+ !ipv4_is_local_multicast(fl4->daddr)) {
+ rth->dst.input = ip_mr_input;
+ rth->dst.output = ip_mr_output;
}
}
#endif
}
- rt_set_nexthop(rth, res, 0);
-
- rth->rt_flags = flags;
+ rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
+ lwtunnel_set_redirect(&rth->dst);
- *result = rth;
- cleanup:
- /* release work reference to inet device */
- in_dev_put(in_dev);
-
- return err;
+ return rth;
}
-static inline int ip_mkroute_output_def(struct rtable **rp,
- struct fib_result* res,
- const struct flowi *fl,
- const struct flowi *oldflp,
- struct net_device *dev_out,
- unsigned flags)
-{
- struct rtable *rth = NULL;
- int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
- unsigned hash;
- if (err == 0) {
- hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
- err = rt_intern_hash(hash, rth, rp);
- }
-
- return err;
-}
+/*
+ * Major route resolver routine.
+ */
-static inline int ip_mkroute_output(struct rtable** rp,
- struct fib_result* res,
- const struct flowi *fl,
- const struct flowi *oldflp,
- struct net_device *dev_out,
- unsigned flags)
+struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
+ const struct sk_buff *skb)
{
-#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
- unsigned char hop;
- unsigned hash;
- int err = -EINVAL;
- struct rtable *rth = NULL;
-
- if (res->fi && res->fi->fib_nhs > 1) {
- unsigned char hopcount = res->fi->fib_nhs;
-
- for (hop = 0; hop < hopcount; hop++) {
- struct net_device *dev2nexthop;
-
- res->nh_sel = hop;
-
- /* hold a work reference to the output device */
- dev2nexthop = FIB_RES_DEV(*res);
- dev_hold(dev2nexthop);
-
- /* put reference to previous result */
- if (hop)
- ip_rt_put(*rp);
-
- err = __mkroute_output(&rth, res, fl, oldflp,
- dev2nexthop, flags);
-
- if (err != 0)
- goto cleanup;
+ struct fib_result res = {
+ .type = RTN_UNSPEC,
+ .fi = NULL,
+ .table = NULL,
+ .tclassid = 0,
+ };
+ struct rtable *rth;
- hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src,
- oldflp->oif);
- err = rt_intern_hash(hash, rth, rp);
+ fl4->flowi4_iif = LOOPBACK_IFINDEX;
- /* forward hop information to multipath impl. */
- multipath_set_nhinfo(rth,
- FIB_RES_NETWORK(*res),
- FIB_RES_NETMASK(*res),
- res->prefixlen,
- &FIB_RES_NH(*res));
- cleanup:
- /* release work reference to output device */
- dev_put(dev2nexthop);
+ rcu_read_lock();
+ rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
+ rcu_read_unlock();
- if (err != 0)
- return err;
- }
- return err;
- } else {
- return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out,
- flags);
- }
-#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
- return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags);
-#endif
+ return rth;
}
+EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
-/*
- * Major route resolver routine.
- */
-
-static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
-{
- u32 tos = RT_FL_TOS(oldflp);
- struct flowi fl = { .nl_u = { .ip4_u =
- { .daddr = oldflp->fl4_dst,
- .saddr = oldflp->fl4_src,
- .tos = tos & IPTOS_RT_MASK,
- .scope = ((tos & RTO_ONLINK) ?
- RT_SCOPE_LINK :
- RT_SCOPE_UNIVERSE),
- } },
- .mark = oldflp->mark,
- .iif = loopback_dev.ifindex,
- .oif = oldflp->oif };
- struct fib_result res;
- unsigned flags = 0;
+struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
+ struct fib_result *res,
+ const struct sk_buff *skb)
+{
struct net_device *dev_out = NULL;
- int free_res = 0;
+ int orig_oif = fl4->flowi4_oif;
+ unsigned int flags = 0;
+ struct rtable *rth;
int err;
-
- res.fi = NULL;
-#ifdef CONFIG_IP_MULTIPLE_TABLES
- res.r = NULL;
-#endif
-
- if (oldflp->fl4_src) {
- err = -EINVAL;
- if (MULTICAST(oldflp->fl4_src) ||
- BADCLASS(oldflp->fl4_src) ||
- ZERONET(oldflp->fl4_src))
+ if (fl4->saddr) {
+ if (ipv4_is_multicast(fl4->saddr) ||
+ ipv4_is_lbcast(fl4->saddr)) {
+ rth = ERR_PTR(-EINVAL);
goto out;
+ }
- /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
- dev_out = ip_dev_find(oldflp->fl4_src);
- if (dev_out == NULL)
- goto out;
+ rth = ERR_PTR(-ENETUNREACH);
/* I removed check for oif == dev_out->oif here.
- It was wrong for two reasons:
- 1. ip_dev_find(saddr) can return wrong iface, if saddr is
- assigned to multiple interfaces.
- 2. Moreover, we are allowed to send packets with saddr
- of another iface. --ANK
+ * It was wrong for two reasons:
+ * 1. ip_dev_find(net, saddr) can return wrong iface, if saddr
+ * is assigned to multiple interfaces.
+ * 2. Moreover, we are allowed to send packets with saddr
+ * of another iface. --ANK
*/
- if (oldflp->oif == 0
- && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
+ if (fl4->flowi4_oif == 0 &&
+ (ipv4_is_multicast(fl4->daddr) ||
+ ipv4_is_lbcast(fl4->daddr))) {
+ /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
+ dev_out = __ip_dev_find(net, fl4->saddr, false);
+ if (!dev_out)
+ goto out;
+
/* Special hack: user can direct multicasts
- and limited broadcast via necessary interface
- without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
- This hack is not just for fun, it allows
- vic,vat and friends to work.
- They bind socket to loopback, set ttl to zero
- and expect that it will work.
- From the viewpoint of routing cache they are broken,
- because we are not allowed to build multicast path
- with loopback source addr (look, routing cache
- cannot know, that ttl is zero, so that packet
- will not leave this host and route is valid).
- Luckily, this hack is good workaround.
+ * and limited broadcast via necessary interface
+ * without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
+ * This hack is not just for fun, it allows
+ * vic,vat and friends to work.
+ * They bind socket to loopback, set ttl to zero
+ * and expect that it will work.
+ * From the viewpoint of routing cache they are broken,
+ * because we are not allowed to build multicast path
+ * with loopback source addr (look, routing cache
+ * cannot know, that ttl is zero, so that packet
+ * will not leave this host and route is valid).
+ * Luckily, this hack is good workaround.
*/
- fl.oif = dev_out->ifindex;
+ fl4->flowi4_oif = dev_out->ifindex;
goto make_route;
}
- if (dev_out)
- dev_put(dev_out);
- dev_out = NULL;
+
+ if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
+ /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
+ if (!__ip_dev_find(net, fl4->saddr, false))
+ goto out;
+ }
}
- if (oldflp->oif) {
- dev_out = dev_get_by_index(oldflp->oif);
- err = -ENODEV;
- if (dev_out == NULL)
+ if (fl4->flowi4_oif) {
+ dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
+ rth = ERR_PTR(-ENODEV);
+ if (!dev_out)
goto out;
/* RACE: Check return value of inet_select_addr instead. */
- if (__in_dev_get_rtnl(dev_out) == NULL) {
- dev_put(dev_out);
- goto out; /* Wrong error code */
+ if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
+ rth = ERR_PTR(-ENETUNREACH);
+ goto out;
}
-
- if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
- if (!fl.fl4_src)
- fl.fl4_src = inet_select_addr(dev_out, 0,
+ if (ipv4_is_local_multicast(fl4->daddr) ||
+ ipv4_is_lbcast(fl4->daddr) ||
+ fl4->flowi4_proto == IPPROTO_IGMP) {
+ if (!fl4->saddr)
+ fl4->saddr = inet_select_addr(dev_out, 0,
RT_SCOPE_LINK);
goto make_route;
}
- if (!fl.fl4_src) {
- if (MULTICAST(oldflp->fl4_dst))
- fl.fl4_src = inet_select_addr(dev_out, 0,
- fl.fl4_scope);
- else if (!oldflp->fl4_dst)
- fl.fl4_src = inet_select_addr(dev_out, 0,
+ if (!fl4->saddr) {
+ if (ipv4_is_multicast(fl4->daddr))
+ fl4->saddr = inet_select_addr(dev_out, 0,
+ fl4->flowi4_scope);
+ else if (!fl4->daddr)
+ fl4->saddr = inet_select_addr(dev_out, 0,
RT_SCOPE_HOST);
}
}
- if (!fl.fl4_dst) {
- fl.fl4_dst = fl.fl4_src;
- if (!fl.fl4_dst)
- fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
- if (dev_out)
- dev_put(dev_out);
- dev_out = &loopback_dev;
- dev_hold(dev_out);
- fl.oif = loopback_dev.ifindex;
- res.type = RTN_LOCAL;
+ if (!fl4->daddr) {
+ fl4->daddr = fl4->saddr;
+ if (!fl4->daddr)
+ fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
+ dev_out = net->loopback_dev;
+ fl4->flowi4_oif = LOOPBACK_IFINDEX;
+ res->type = RTN_LOCAL;
flags |= RTCF_LOCAL;
goto make_route;
}
- if (fib_lookup(&fl, &res)) {
- res.fi = NULL;
- if (oldflp->oif) {
+ err = fib_lookup(net, fl4, res, 0);
+ if (err) {
+ res->fi = NULL;
+ res->table = NULL;
+ if (fl4->flowi4_oif &&
+ (ipv4_is_multicast(fl4->daddr) || !fl4->flowi4_l3mdev)) {
/* Apparently, routing tables are wrong. Assume,
- that the destination is on link.
-
- WHY? DW.
- Because we are allowed to send to iface
- even if it has NO routes and NO assigned
- addresses. When oif is specified, routing
- tables are looked up with only one purpose:
- to catch if destination is gatewayed, rather than
- direct. Moreover, if MSG_DONTROUTE is set,
- we send packet, ignoring both routing tables
- and ifaddr state. --ANK
-
-
- We could make it even if oif is unknown,
- likely IPv6, but we do not.
+ * that the destination is on link.
+ *
+ * WHY? DW.
+ * Because we are allowed to send to iface
+ * even if it has NO routes and NO assigned
+ * addresses. When oif is specified, routing
+ * tables are looked up with only one purpose:
+ * to catch if destination is gatewayed, rather than
+ * direct. Moreover, if MSG_DONTROUTE is set,
+ * we send packet, ignoring both routing tables
+ * and ifaddr state. --ANK
+ *
+ *
+ * We could make it even if oif is unknown,
+ * likely IPv6, but we do not.
*/
- if (fl.fl4_src == 0)
- fl.fl4_src = inet_select_addr(dev_out, 0,
+ if (fl4->saddr == 0)
+ fl4->saddr = inet_select_addr(dev_out, 0,
RT_SCOPE_LINK);
- res.type = RTN_UNICAST;
+ res->type = RTN_UNICAST;
goto make_route;
}
- if (dev_out)
- dev_put(dev_out);
- err = -ENETUNREACH;
+ rth = ERR_PTR(err);
goto out;
}
- free_res = 1;
-
- if (res.type == RTN_LOCAL) {
- if (!fl.fl4_src)
- fl.fl4_src = fl.fl4_dst;
- if (dev_out)
- dev_put(dev_out);
- dev_out = &loopback_dev;
- dev_hold(dev_out);
- fl.oif = dev_out->ifindex;
- if (res.fi)
- fib_info_put(res.fi);
- res.fi = NULL;
+
+ if (res->type == RTN_LOCAL) {
+ if (!fl4->saddr) {
+ if (res->fi->fib_prefsrc)
+ fl4->saddr = res->fi->fib_prefsrc;
+ else
+ fl4->saddr = fl4->daddr;
+ }
+
+ /* L3 master device is the loopback for that domain */
+ dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
+ net->loopback_dev;
+
+ /* make sure orig_oif points to fib result device even
+ * though packet rx/tx happens over loopback or l3mdev
+ */
+ orig_oif = FIB_RES_OIF(*res);
+
+ fl4->flowi4_oif = dev_out->ifindex;
flags |= RTCF_LOCAL;
goto make_route;
}
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
- if (res.fi->fib_nhs > 1 && fl.oif == 0)
- fib_select_multipath(&fl, &res);
- else
-#endif
- if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
- fib_select_default(&fl, &res);
+ fib_select_path(net, res, fl4, skb);
- if (!fl.fl4_src)
- fl.fl4_src = FIB_RES_PREFSRC(res);
+ dev_out = FIB_RES_DEV(*res);
- if (dev_out)
- dev_put(dev_out);
- dev_out = FIB_RES_DEV(res);
- dev_hold(dev_out);
- fl.oif = dev_out->ifindex;
+make_route:
+ rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
+out:
+ return rth;
+}
+
+static struct dst_ops ipv4_dst_blackhole_ops = {
+ .family = AF_INET,
+ .default_advmss = ipv4_default_advmss,
+ .neigh_lookup = ipv4_neigh_lookup,
+ .check = dst_blackhole_check,
+ .cow_metrics = dst_blackhole_cow_metrics,
+ .update_pmtu = dst_blackhole_update_pmtu,
+ .redirect = dst_blackhole_redirect,
+ .mtu = dst_blackhole_mtu,
+};
-make_route:
- err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
+struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
+{
+ struct rtable *ort = dst_rtable(dst_orig);
+ struct rtable *rt;
+ rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, DST_OBSOLETE_DEAD, 0);
+ if (rt) {
+ struct dst_entry *new = &rt->dst;
- if (free_res)
- fib_res_put(&res);
- if (dev_out)
- dev_put(dev_out);
-out: return err;
-}
+ new->__use = 1;
+ new->input = dst_discard;
+ new->output = dst_discard_out;
-int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
-{
- unsigned hash;
- struct rtable *rth;
+ new->dev = net->loopback_dev;
+ netdev_hold(new->dev, &new->dev_tracker, GFP_ATOMIC);
- hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
-
- rcu_read_lock_bh();
- for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
- rth = rcu_dereference(rth->u.rt_next)) {
- if (rth->fl.fl4_dst == flp->fl4_dst &&
- rth->fl.fl4_src == flp->fl4_src &&
- rth->fl.iif == 0 &&
- rth->fl.oif == flp->oif &&
- rth->fl.mark == flp->mark &&
- !((rth->fl.fl4_tos ^ flp->fl4_tos) &
- (IPTOS_RT_MASK | RTO_ONLINK))) {
-
- /* check for multipath routes and choose one if
- * necessary
- */
- if (multipath_select_route(flp, rth, rp)) {
- dst_hold(&(*rp)->u.dst);
- RT_CACHE_STAT_INC(out_hit);
- rcu_read_unlock_bh();
- return 0;
- }
+ rt->rt_is_input = ort->rt_is_input;
+ rt->rt_iif = ort->rt_iif;
+ rt->rt_pmtu = ort->rt_pmtu;
+ rt->rt_mtu_locked = ort->rt_mtu_locked;
- rth->u.dst.lastuse = jiffies;
- dst_hold(&rth->u.dst);
- rth->u.dst.__use++;
- RT_CACHE_STAT_INC(out_hit);
- rcu_read_unlock_bh();
- *rp = rth;
- return 0;
- }
- RT_CACHE_STAT_INC(out_hlist_search);
+ rt->rt_genid = rt_genid_ipv4(net);
+ rt->rt_flags = ort->rt_flags;
+ rt->rt_type = ort->rt_type;
+ rt->rt_uses_gateway = ort->rt_uses_gateway;
+ rt->rt_gw_family = ort->rt_gw_family;
+ if (rt->rt_gw_family == AF_INET)
+ rt->rt_gw4 = ort->rt_gw4;
+ else if (rt->rt_gw_family == AF_INET6)
+ rt->rt_gw6 = ort->rt_gw6;
}
- rcu_read_unlock_bh();
- return ip_route_output_slow(rp, flp);
-}
+ dst_release(dst_orig);
-EXPORT_SYMBOL_GPL(__ip_route_output_key);
+ return rt ? &rt->dst : ERR_PTR(-ENOMEM);
+}
-int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
+struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
+ const struct sock *sk)
{
- int err;
+ struct rtable *rt = __ip_route_output_key(net, flp4);
- if ((err = __ip_route_output_key(rp, flp)) != 0)
- return err;
+ if (IS_ERR(rt))
+ return rt;
- if (flp->proto) {
- if (!flp->fl4_src)
- flp->fl4_src = (*rp)->rt_src;
- if (!flp->fl4_dst)
- flp->fl4_dst = (*rp)->rt_dst;
- return xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
+ if (flp4->flowi4_proto) {
+ flp4->flowi4_oif = rt->dst.dev->ifindex;
+ rt = dst_rtable(xfrm_lookup_route(net, &rt->dst,
+ flowi4_to_flowi(flp4),
+ sk, 0));
}
- return 0;
+ return rt;
}
-
EXPORT_SYMBOL_GPL(ip_route_output_flow);
-int ip_route_output_key(struct rtable **rp, struct flowi *flp)
-{
- return ip_route_output_flow(rp, flp, NULL, 0);
-}
-
-static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
- int nowait, unsigned int flags)
+/* called with rcu_read_lock held */
+static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
+ struct rtable *rt, u32 table_id, dscp_t dscp,
+ struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
+ u32 seq, unsigned int flags)
{
- struct rtable *rt = (struct rtable*)skb->dst;
struct rtmsg *r;
struct nlmsghdr *nlh;
- long expires;
- u32 id = 0, ts = 0, tsage = 0, error;
+ unsigned long expires = 0;
+ u32 error;
+ u32 metrics[RTAX_MAX];
- nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
- if (nlh == NULL)
- return -ENOBUFS;
+ nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
+ if (!nlh)
+ return -EMSGSIZE;
r = nlmsg_data(nlh);
r->rtm_family = AF_INET;
r->rtm_dst_len = 32;
r->rtm_src_len = 0;
- r->rtm_tos = rt->fl.fl4_tos;
- r->rtm_table = RT_TABLE_MAIN;
- NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
+ r->rtm_tos = inet_dscp_to_dsfield(dscp);
+ r->rtm_table = table_id < 256 ? table_id : RT_TABLE_COMPAT;
+ if (nla_put_u32(skb, RTA_TABLE, table_id))
+ goto nla_put_failure;
r->rtm_type = rt->rt_type;
r->rtm_scope = RT_SCOPE_UNIVERSE;
r->rtm_protocol = RTPROT_UNSPEC;
r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
if (rt->rt_flags & RTCF_NOTIFY)
r->rtm_flags |= RTM_F_NOTIFY;
+ if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
+ r->rtm_flags |= RTCF_DOREDIRECT;
- NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
-
- if (rt->fl.fl4_src) {
+ if (nla_put_in_addr(skb, RTA_DST, dst))
+ goto nla_put_failure;
+ if (src) {
r->rtm_src_len = 32;
- NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
+ if (nla_put_in_addr(skb, RTA_SRC, src))
+ goto nla_put_failure;
}
- if (rt->u.dst.dev)
- NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
-#ifdef CONFIG_NET_CLS_ROUTE
- if (rt->u.dst.tclassid)
- NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
-#endif
-#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
- if (rt->rt_multipath_alg != IP_MP_ALG_NONE)
- NLA_PUT_U32(skb, RTA_MP_ALGO, rt->rt_multipath_alg);
+ if (rt->dst.dev &&
+ nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
+ goto nla_put_failure;
+ if (lwtunnel_fill_encap(skb, rt->dst.lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0)
+ goto nla_put_failure;
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ if (rt->dst.tclassid &&
+ nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
+ goto nla_put_failure;
#endif
- if (rt->fl.iif)
- NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
- else if (rt->rt_src != rt->fl.fl4_src)
- NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
+ if (fl4 && !rt_is_input_route(rt) &&
+ fl4->saddr != src) {
+ if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
+ goto nla_put_failure;
+ }
+ if (rt->rt_uses_gateway) {
+ if (rt->rt_gw_family == AF_INET &&
+ nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
+ goto nla_put_failure;
+ } else if (rt->rt_gw_family == AF_INET6) {
+ int alen = sizeof(struct in6_addr);
+ struct nlattr *nla;
+ struct rtvia *via;
+
+ nla = nla_reserve(skb, RTA_VIA, alen + 2);
+ if (!nla)
+ goto nla_put_failure;
+
+ via = nla_data(nla);
+ via->rtvia_family = AF_INET6;
+ memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
+ }
+ }
- if (rt->rt_dst != rt->rt_gateway)
- NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
+ expires = READ_ONCE(rt->dst.expires);
+ if (expires) {
+ unsigned long now = jiffies;
- if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
+ if (time_before(now, expires))
+ expires -= now;
+ else
+ expires = 0;
+ }
+
+ memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
+ if (rt->rt_pmtu && expires)
+ metrics[RTAX_MTU - 1] = rt->rt_pmtu;
+ if (rt->rt_mtu_locked && expires)
+ metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
+ if (rtnetlink_put_metrics(skb, metrics) < 0)
goto nla_put_failure;
- error = rt->u.dst.error;
- expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
- if (rt->peer) {
- id = rt->peer->ip_id_count;
- if (rt->peer->tcp_ts_stamp) {
- ts = rt->peer->tcp_ts;
- tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
- }
- }
+ if (fl4) {
+ if (fl4->flowi4_mark &&
+ nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
+ goto nla_put_failure;
- if (rt->fl.iif) {
-#ifdef CONFIG_IP_MROUTE
- __be32 dst = rt->rt_dst;
+ if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
+ nla_put_u32(skb, RTA_UID,
+ from_kuid_munged(current_user_ns(),
+ fl4->flowi4_uid)))
+ goto nla_put_failure;
- if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
- ipv4_devconf.mc_forwarding) {
- int err = ipmr_get_route(skb, r, nowait);
- if (err <= 0) {
- if (!nowait) {
+ if (rt_is_input_route(rt)) {
+#ifdef CONFIG_IP_MROUTE
+ if (ipv4_is_multicast(dst) &&
+ !ipv4_is_local_multicast(dst) &&
+ IPV4_DEVCONF_ALL_RO(net, MC_FORWARDING)) {
+ int err = ipmr_get_route(net, skb,
+ fl4->saddr, fl4->daddr,
+ r, portid);
+
+ if (err <= 0) {
if (err == 0)
return 0;
goto nla_put_failure;
- } else {
- if (err == -EMSGSIZE)
- goto nla_put_failure;
- error = err;
}
- }
- } else
+ } else
#endif
- NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
+ if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
+ goto nla_put_failure;
+ }
}
- if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
- expires, error) < 0)
+ error = rt->dst.error;
+
+ if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
goto nla_put_failure;
- return nlmsg_end(skb, nlh);
+ nlmsg_end(skb, nlh);
+ return 0;
nla_put_failure:
- return nlmsg_cancel(skb, nlh);
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
+ struct netlink_callback *cb, u32 table_id,
+ struct fnhe_hash_bucket *bucket, int genid,
+ int *fa_index, int fa_start, unsigned int flags)
+{
+ int i;
+
+ for (i = 0; i < FNHE_HASH_SIZE; i++) {
+ struct fib_nh_exception *fnhe;
+
+ for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
+ fnhe = rcu_dereference(fnhe->fnhe_next)) {
+ struct rtable *rt;
+ int err;
+
+ if (*fa_index < fa_start)
+ goto next;
+
+ if (fnhe->fnhe_genid != genid)
+ goto next;
+
+ if (fnhe->fnhe_expires &&
+ time_after(jiffies, fnhe->fnhe_expires))
+ goto next;
+
+ rt = rcu_dereference(fnhe->fnhe_rth_input);
+ if (!rt)
+ rt = rcu_dereference(fnhe->fnhe_rth_output);
+ if (!rt)
+ goto next;
+
+ err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
+ table_id, 0, NULL, skb,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, flags);
+ if (err)
+ return err;
+next:
+ (*fa_index)++;
+ }
+ }
+
+ return 0;
}
-int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
+int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
+ u32 table_id, struct fib_info *fi,
+ int *fa_index, int fa_start, unsigned int flags)
+{
+ struct net *net = sock_net(cb->skb->sk);
+ int nhsel, genid = fnhe_genid(net);
+
+ for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
+ struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
+ struct fnhe_hash_bucket *bucket;
+ int err;
+
+ if (nhc->nhc_flags & RTNH_F_DEAD)
+ continue;
+
+ rcu_read_lock();
+ bucket = rcu_dereference(nhc->nhc_exceptions);
+ err = 0;
+ if (bucket)
+ err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
+ genid, fa_index, fa_start,
+ flags);
+ rcu_read_unlock();
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
+ u8 ip_proto, __be16 sport,
+ __be16 dport)
+{
+ struct sk_buff *skb;
+ struct iphdr *iph;
+
+ skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb)
+ return NULL;
+
+ /* Reserve room for dummy headers, this skb can pass
+ * through good chunk of routing engine.
+ */
+ skb_reset_mac_header(skb);
+ skb_reset_network_header(skb);
+ skb->protocol = htons(ETH_P_IP);
+ iph = skb_put(skb, sizeof(struct iphdr));
+ iph->protocol = ip_proto;
+ iph->saddr = src;
+ iph->daddr = dst;
+ iph->version = 0x4;
+ iph->frag_off = 0;
+ iph->ihl = 0x5;
+ skb_set_transport_header(skb, skb->len);
+
+ switch (iph->protocol) {
+ case IPPROTO_UDP: {
+ struct udphdr *udph;
+
+ udph = skb_put_zero(skb, sizeof(struct udphdr));
+ udph->source = sport;
+ udph->dest = dport;
+ udph->len = htons(sizeof(struct udphdr));
+ udph->check = 0;
+ break;
+ }
+ case IPPROTO_TCP: {
+ struct tcphdr *tcph;
+
+ tcph = skb_put_zero(skb, sizeof(struct tcphdr));
+ tcph->source = sport;
+ tcph->dest = dport;
+ tcph->doff = sizeof(struct tcphdr) / 4;
+ tcph->rst = 1;
+ tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
+ src, dst, 0);
+ break;
+ }
+ case IPPROTO_ICMP: {
+ struct icmphdr *icmph;
+
+ icmph = skb_put_zero(skb, sizeof(struct icmphdr));
+ icmph->type = ICMP_ECHO;
+ icmph->code = 0;
+ }
+ }
+
+ return skb;
+}
+
+static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ struct nlattr **tb,
+ struct netlink_ext_ack *extack)
{
struct rtmsg *rtm;
+ int i, err;
+
+ rtm = nlmsg_payload(nlh, sizeof(*rtm));
+ if (!rtm) {
+ NL_SET_ERR_MSG(extack,
+ "ipv4: Invalid header for route get request");
+ return -EINVAL;
+ }
+
+ if (!netlink_strict_get_check(skb))
+ return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
+ rtm_ipv4_policy, extack);
+
+ if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
+ (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
+ rtm->rtm_table || rtm->rtm_protocol ||
+ rtm->rtm_scope || rtm->rtm_type) {
+ NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
+ return -EINVAL;
+ }
+
+ if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
+ RTM_F_LOOKUP_TABLE |
+ RTM_F_FIB_MATCH)) {
+ NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
+ return -EINVAL;
+ }
+
+ err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
+ rtm_ipv4_policy, extack);
+ if (err)
+ return err;
+
+ if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
+ (tb[RTA_DST] && !rtm->rtm_dst_len)) {
+ NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
+ return -EINVAL;
+ }
+
+ for (i = 0; i <= RTA_MAX; i++) {
+ if (!tb[i])
+ continue;
+
+ switch (i) {
+ case RTA_IIF:
+ case RTA_OIF:
+ case RTA_SRC:
+ case RTA_DST:
+ case RTA_IP_PROTO:
+ case RTA_SPORT:
+ case RTA_DPORT:
+ case RTA_MARK:
+ case RTA_UID:
+ break;
+ default:
+ NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(in_skb->sk);
struct nlattr *tb[RTA_MAX+1];
+ u32 table_id = RT_TABLE_MAIN;
+ __be16 sport = 0, dport = 0;
+ struct fib_result res = {};
+ u8 ip_proto = IPPROTO_UDP;
struct rtable *rt = NULL;
+ struct sk_buff *skb;
+ struct rtmsg *rtm;
+ struct flowi4 fl4 = {};
__be32 dst = 0;
__be32 src = 0;
+ dscp_t dscp;
+ kuid_t uid;
u32 iif;
int err;
- struct sk_buff *skb;
+ int mark;
- err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
+ err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
if (err < 0)
- goto errout;
+ return err;
rtm = nlmsg_data(nlh);
+ src = nla_get_in_addr_default(tb[RTA_SRC], 0);
+ dst = nla_get_in_addr_default(tb[RTA_DST], 0);
+ iif = nla_get_u32_default(tb[RTA_IIF], 0);
+ mark = nla_get_u32_default(tb[RTA_MARK], 0);
+ dscp = inet_dsfield_to_dscp(rtm->rtm_tos);
+ if (tb[RTA_UID])
+ uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
+ else
+ uid = (iif ? INVALID_UID : current_uid());
- skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
- if (skb == NULL) {
- err = -ENOBUFS;
- goto errout;
+ if (tb[RTA_IP_PROTO]) {
+ err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
+ &ip_proto, AF_INET, extack);
+ if (err)
+ return err;
}
- /* Reserve room for dummy headers, this skb can pass
- through good chunk of routing engine.
- */
- skb->mac.raw = skb->nh.raw = skb->data;
+ if (tb[RTA_SPORT])
+ sport = nla_get_be16(tb[RTA_SPORT]);
- /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
- skb->nh.iph->protocol = IPPROTO_ICMP;
- skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
+ if (tb[RTA_DPORT])
+ dport = nla_get_be16(tb[RTA_DPORT]);
- src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
- dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
- iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
+ skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
+ if (!skb)
+ return -ENOBUFS;
+
+ fl4.daddr = dst;
+ fl4.saddr = src;
+ fl4.flowi4_dscp = dscp;
+ fl4.flowi4_oif = nla_get_u32_default(tb[RTA_OIF], 0);
+ fl4.flowi4_mark = mark;
+ fl4.flowi4_uid = uid;
+ if (sport)
+ fl4.fl4_sport = sport;
+ if (dport)
+ fl4.fl4_dport = dport;
+ fl4.flowi4_proto = ip_proto;
+
+ rcu_read_lock();
if (iif) {
struct net_device *dev;
- dev = __dev_get_by_index(iif);
- if (dev == NULL) {
+ dev = dev_get_by_index_rcu(net, iif);
+ if (!dev) {
err = -ENODEV;
- goto errout_free;
+ goto errout_rcu;
}
- skb->protocol = htons(ETH_P_IP);
+ fl4.flowi4_iif = iif; /* for rt_fill_info */
skb->dev = dev;
- local_bh_disable();
- err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
- local_bh_enable();
+ skb->mark = mark;
+ err = ip_route_input_rcu(skb, dst, src, dscp, dev,
+ &res) ? -EINVAL : 0;
- rt = (struct rtable*) skb->dst;
- if (err == 0 && rt->u.dst.error)
- err = -rt->u.dst.error;
+ rt = skb_rtable(skb);
+ if (err == 0 && rt->dst.error)
+ err = -rt->dst.error;
} else {
- struct flowi fl = {
- .nl_u = {
- .ip4_u = {
- .daddr = dst,
- .saddr = src,
- .tos = rtm->rtm_tos,
- },
- },
- .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
- };
- err = ip_route_output_key(&rt, &fl);
+ fl4.flowi4_iif = LOOPBACK_IFINDEX;
+ skb->dev = net->loopback_dev;
+ rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
+ err = 0;
+ if (IS_ERR(rt))
+ err = PTR_ERR(rt);
+ else
+ skb_dst_set(skb, &rt->dst);
}
if (err)
- goto errout_free;
+ goto errout_rcu;
- skb->dst = &rt->u.dst;
if (rtm->rtm_flags & RTM_F_NOTIFY)
rt->rt_flags |= RTCF_NOTIFY;
- err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
- RTM_NEWROUTE, 0, 0);
- if (err <= 0)
- goto errout_free;
+ if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
+ table_id = res.table ? res.table->tb_id : 0;
- err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
-errout:
- return err;
+ /* reset skb for netlink reply msg */
+ skb_trim(skb, 0);
+ skb_reset_network_header(skb);
+ skb_reset_transport_header(skb);
+ skb_reset_mac_header(skb);
-errout_free:
- kfree_skb(skb);
- goto errout;
-}
+ if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
+ struct fib_rt_info fri;
-int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
-{
- struct rtable *rt;
- int h, s_h;
- int idx, s_idx;
-
- s_h = cb->args[0];
- s_idx = idx = cb->args[1];
- for (h = 0; h <= rt_hash_mask; h++) {
- if (h < s_h) continue;
- if (h > s_h)
- s_idx = 0;
- rcu_read_lock_bh();
- for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
- rt = rcu_dereference(rt->u.rt_next), idx++) {
- if (idx < s_idx)
- continue;
- skb->dst = dst_clone(&rt->u.dst);
- if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
- cb->nlh->nlmsg_seq, RTM_NEWROUTE,
- 1, NLM_F_MULTI) <= 0) {
- dst_release(xchg(&skb->dst, NULL));
- rcu_read_unlock_bh();
- goto done;
+ if (!res.fi) {
+ err = fib_props[res.type].error;
+ if (!err)
+ err = -EHOSTUNREACH;
+ goto errout_rcu;
+ }
+ fri.fi = res.fi;
+ fri.tb_id = table_id;
+ fri.dst = res.prefix;
+ fri.dst_len = res.prefixlen;
+ fri.dscp = res.dscp;
+ fri.type = rt->rt_type;
+ fri.offload = 0;
+ fri.trap = 0;
+ fri.offload_failed = 0;
+ if (res.fa_head) {
+ struct fib_alias *fa;
+
+ hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) {
+ u8 slen = 32 - fri.dst_len;
+
+ if (fa->fa_slen == slen &&
+ fa->tb_id == fri.tb_id &&
+ fa->fa_dscp == fri.dscp &&
+ fa->fa_info == res.fi &&
+ fa->fa_type == fri.type) {
+ fri.offload = READ_ONCE(fa->offload);
+ fri.trap = READ_ONCE(fa->trap);
+ fri.offload_failed =
+ READ_ONCE(fa->offload_failed);
+ break;
+ }
}
- dst_release(xchg(&skb->dst, NULL));
}
- rcu_read_unlock_bh();
+ err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
+ nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0);
+ } else {
+ err = rt_fill_info(net, dst, src, rt, table_id, res.dscp, &fl4,
+ skb, NETLINK_CB(in_skb).portid,
+ nlh->nlmsg_seq, 0);
}
+ if (err < 0)
+ goto errout_rcu;
-done:
- cb->args[0] = h;
- cb->args[1] = idx;
- return skb->len;
+ rcu_read_unlock();
+
+ err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
+
+errout_free:
+ return err;
+errout_rcu:
+ rcu_read_unlock();
+ kfree_skb(skb);
+ goto errout_free;
}
void ip_rt_multicast_event(struct in_device *in_dev)
{
- rt_cache_flush(0);
+ rt_cache_flush(dev_net(in_dev->dev));
}
#ifdef CONFIG_SYSCTL
-static int flush_delay;
+static int ip_rt_gc_interval __read_mostly = 60 * HZ;
+static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
+static int ip_rt_gc_elasticity __read_mostly = 8;
+static int ip_min_valid_pmtu __read_mostly = IPV4_MIN_MTU;
-static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
- struct file *filp, void __user *buffer,
- size_t *lenp, loff_t *ppos)
+static int ipv4_sysctl_rtcache_flush(const struct ctl_table *__ctl, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
+ struct net *net = (struct net *)__ctl->extra1;
+
if (write) {
- proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
- rt_cache_flush(flush_delay);
+ rt_cache_flush(net);
+ fnhe_genid_bump(net);
return 0;
- }
+ }
return -EINVAL;
}
-static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
- int __user *name,
- int nlen,
- void __user *oldval,
- size_t __user *oldlenp,
- void __user *newval,
- size_t newlen,
- void **context)
-{
- int delay;
- if (newlen != sizeof(int))
- return -EINVAL;
- if (get_user(delay, (int __user *)newval))
- return -EFAULT;
- rt_cache_flush(delay);
- return 0;
-}
-
-ctl_table ipv4_route_table[] = {
- {
- .ctl_name = NET_IPV4_ROUTE_FLUSH,
- .procname = "flush",
- .data = &flush_delay,
- .maxlen = sizeof(int),
- .mode = 0200,
- .proc_handler = &ipv4_sysctl_rtcache_flush,
- .strategy = &ipv4_sysctl_rtcache_flush_strategy,
- },
+static struct ctl_table ipv4_route_table[] = {
{
- .ctl_name = NET_IPV4_ROUTE_MIN_DELAY,
- .procname = "min_delay",
- .data = &ip_rt_min_delay,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- .strategy = &sysctl_jiffies,
- },
- {
- .ctl_name = NET_IPV4_ROUTE_MAX_DELAY,
- .procname = "max_delay",
- .data = &ip_rt_max_delay,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- .strategy = &sysctl_jiffies,
- },
- {
- .ctl_name = NET_IPV4_ROUTE_GC_THRESH,
.procname = "gc_thresh",
.data = &ipv4_dst_ops.gc_thresh,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec,
+ .proc_handler = proc_dointvec,
},
{
- .ctl_name = NET_IPV4_ROUTE_MAX_SIZE,
.procname = "max_size",
.data = &ip_rt_max_size,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec,
+ .proc_handler = proc_dointvec,
},
{
/* Deprecated. Use gc_min_interval_ms */
-
- .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
+
.procname = "gc_min_interval",
.data = &ip_rt_gc_min_interval,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- .strategy = &sysctl_jiffies,
+ .proc_handler = proc_dointvec_jiffies,
},
{
- .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
.procname = "gc_min_interval_ms",
.data = &ip_rt_gc_min_interval,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec_ms_jiffies,
- .strategy = &sysctl_ms_jiffies,
+ .proc_handler = proc_dointvec_ms_jiffies,
},
{
- .ctl_name = NET_IPV4_ROUTE_GC_TIMEOUT,
.procname = "gc_timeout",
.data = &ip_rt_gc_timeout,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- .strategy = &sysctl_jiffies,
+ .proc_handler = proc_dointvec_jiffies,
},
{
- .ctl_name = NET_IPV4_ROUTE_GC_INTERVAL,
.procname = "gc_interval",
.data = &ip_rt_gc_interval,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- .strategy = &sysctl_jiffies,
+ .proc_handler = proc_dointvec_jiffies,
},
{
- .ctl_name = NET_IPV4_ROUTE_REDIRECT_LOAD,
.procname = "redirect_load",
.data = &ip_rt_redirect_load,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec,
+ .proc_handler = proc_dointvec,
},
{
- .ctl_name = NET_IPV4_ROUTE_REDIRECT_NUMBER,
.procname = "redirect_number",
.data = &ip_rt_redirect_number,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec,
+ .proc_handler = proc_dointvec,
},
{
- .ctl_name = NET_IPV4_ROUTE_REDIRECT_SILENCE,
.procname = "redirect_silence",
.data = &ip_rt_redirect_silence,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec,
+ .proc_handler = proc_dointvec,
},
{
- .ctl_name = NET_IPV4_ROUTE_ERROR_COST,
.procname = "error_cost",
.data = &ip_rt_error_cost,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec,
+ .proc_handler = proc_dointvec,
},
{
- .ctl_name = NET_IPV4_ROUTE_ERROR_BURST,
.procname = "error_burst",
.data = &ip_rt_error_burst,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec,
+ .proc_handler = proc_dointvec,
},
{
- .ctl_name = NET_IPV4_ROUTE_GC_ELASTICITY,
.procname = "gc_elasticity",
.data = &ip_rt_gc_elasticity,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec,
+ .proc_handler = proc_dointvec,
},
+};
+
+static const char ipv4_route_flush_procname[] = "flush";
+
+static struct ctl_table ipv4_route_netns_table[] = {
{
- .ctl_name = NET_IPV4_ROUTE_MTU_EXPIRES,
- .procname = "mtu_expires",
- .data = &ip_rt_mtu_expires,
+ .procname = ipv4_route_flush_procname,
.maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- .strategy = &sysctl_jiffies,
+ .mode = 0200,
+ .proc_handler = ipv4_sysctl_rtcache_flush,
},
{
- .ctl_name = NET_IPV4_ROUTE_MIN_PMTU,
- .procname = "min_pmtu",
- .data = &ip_rt_min_pmtu,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
+ .procname = "min_pmtu",
+ .data = &init_net.ipv4.ip_rt_min_pmtu,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &ip_min_valid_pmtu,
},
{
- .ctl_name = NET_IPV4_ROUTE_MIN_ADVMSS,
- .procname = "min_adv_mss",
- .data = &ip_rt_min_advmss,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
+ .procname = "mtu_expires",
+ .data = &init_net.ipv4.ip_rt_mtu_expires,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
},
{
- .ctl_name = NET_IPV4_ROUTE_SECRET_INTERVAL,
- .procname = "secret_interval",
- .data = &ip_rt_secret_interval,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- .strategy = &sysctl_jiffies,
+ .procname = "min_adv_mss",
+ .data = &init_net.ipv4.ip_rt_min_advmss,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
},
- { .ctl_name = 0 }
};
-#endif
-
-#ifdef CONFIG_NET_CLS_ROUTE
-struct ip_rt_acct *ip_rt_acct;
-/* This code sucks. But you should have seen it before! --RR */
+static __net_init int sysctl_route_net_init(struct net *net)
+{
+ struct ctl_table *tbl;
+ size_t table_size = ARRAY_SIZE(ipv4_route_netns_table);
-/* IP route accounting ptr for this logical cpu number. */
-#define IP_RT_ACCT_CPU(i) (ip_rt_acct + i * 256)
+ tbl = ipv4_route_netns_table;
+ if (!net_eq(net, &init_net)) {
+ int i;
-#ifdef CONFIG_PROC_FS
-static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
- int length, int *eof, void *data)
-{
- unsigned int i;
+ tbl = kmemdup(tbl, sizeof(ipv4_route_netns_table), GFP_KERNEL);
+ if (!tbl)
+ goto err_dup;
- if ((offset & 3) || (length & 3))
- return -EIO;
+ /* Don't export non-whitelisted sysctls to unprivileged users */
+ if (net->user_ns != &init_user_ns) {
+ if (tbl[0].procname != ipv4_route_flush_procname)
+ table_size = 0;
+ }
- if (offset >= sizeof(struct ip_rt_acct) * 256) {
- *eof = 1;
- return 0;
+ /* Update the variables to point into the current struct net
+ * except for the first element flush
+ */
+ for (i = 1; i < table_size; i++)
+ tbl[i].data += (void *)net - (void *)&init_net;
}
+ tbl[0].extra1 = net;
- if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
- length = sizeof(struct ip_rt_acct) * 256 - offset;
- *eof = 1;
- }
+ net->ipv4.route_hdr = register_net_sysctl_sz(net, "net/ipv4/route",
+ tbl, table_size);
+ if (!net->ipv4.route_hdr)
+ goto err_reg;
+ return 0;
- offset /= sizeof(u32);
+err_reg:
+ if (tbl != ipv4_route_netns_table)
+ kfree(tbl);
+err_dup:
+ return -ENOMEM;
+}
- if (length > 0) {
- u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
- u32 *dst = (u32 *) buffer;
+static __net_exit void sysctl_route_net_exit(struct net *net)
+{
+ const struct ctl_table *tbl;
- /* Copy first cpu. */
- *start = buffer;
- memcpy(dst, src, length);
+ tbl = net->ipv4.route_hdr->ctl_table_arg;
+ unregister_net_sysctl_table(net->ipv4.route_hdr);
+ BUG_ON(tbl == ipv4_route_netns_table);
+ kfree(tbl);
+}
- /* Add the other cpus in, one int at a time */
- for_each_possible_cpu(i) {
- unsigned int j;
+static __net_initdata struct pernet_operations sysctl_route_ops = {
+ .init = sysctl_route_net_init,
+ .exit = sysctl_route_net_exit,
+};
+#endif
- src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
+static __net_init int netns_ip_rt_init(struct net *net)
+{
+ /* Set default value for namespaceified sysctls */
+ net->ipv4.ip_rt_min_pmtu = DEFAULT_MIN_PMTU;
+ net->ipv4.ip_rt_mtu_expires = DEFAULT_MTU_EXPIRES;
+ net->ipv4.ip_rt_min_advmss = DEFAULT_MIN_ADVMSS;
+ return 0;
+}
- for (j = 0; j < length/4; j++)
- dst[j] += src[j];
- }
- }
- return length;
+static struct pernet_operations __net_initdata ip_rt_ops = {
+ .init = netns_ip_rt_init,
+};
+
+static __net_init int rt_genid_init(struct net *net)
+{
+ atomic_set(&net->ipv4.rt_genid, 0);
+ atomic_set(&net->fnhe_genid, 0);
+ atomic_set(&net->ipv4.dev_addr_genid, get_random_u32());
+ return 0;
}
-#endif /* CONFIG_PROC_FS */
-#endif /* CONFIG_NET_CLS_ROUTE */
-static __initdata unsigned long rhash_entries;
-static int __init set_rhash_entries(char *str)
+static __net_initdata struct pernet_operations rt_genid_ops = {
+ .init = rt_genid_init,
+};
+
+static int __net_init ipv4_inetpeer_init(struct net *net)
{
- if (!str)
- return 0;
- rhash_entries = simple_strtoul(str, &str, 0);
- return 1;
+ struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
+
+ if (!bp)
+ return -ENOMEM;
+ inet_peer_base_init(bp);
+ net->ipv4.peers = bp;
+ return 0;
+}
+
+static void __net_exit ipv4_inetpeer_exit(struct net *net)
+{
+ struct inet_peer_base *bp = net->ipv4.peers;
+
+ net->ipv4.peers = NULL;
+ inetpeer_invalidate_tree(bp);
+ kfree(bp);
}
-__setup("rhash_entries=", set_rhash_entries);
+
+static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
+ .init = ipv4_inetpeer_init,
+ .exit = ipv4_inetpeer_exit,
+};
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
+#endif /* CONFIG_IP_ROUTE_CLASSID */
+
+static const struct rtnl_msg_handler ip_rt_rtnl_msg_handlers[] __initconst = {
+ {.protocol = PF_INET, .msgtype = RTM_GETROUTE,
+ .doit = inet_rtm_getroute, .flags = RTNL_FLAG_DOIT_UNLOCKED},
+};
int __init ip_rt_init(void)
{
- int rc = 0;
+ void *idents_hash;
+ int cpu;
- rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
- (jiffies ^ (jiffies >> 7)));
+ /* For modern hosts, this will use 2 MB of memory */
+ idents_hash = alloc_large_system_hash("IP idents",
+ sizeof(*ip_idents) + sizeof(*ip_tstamps),
+ 0,
+ 16, /* one bucket per 64 KB */
+ HASH_ZERO,
+ NULL,
+ &ip_idents_mask,
+ 2048,
+ 256*1024);
-#ifdef CONFIG_NET_CLS_ROUTE
- {
- int order;
- for (order = 0;
- (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
- /* NOTHING */;
- ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
+ ip_idents = idents_hash;
+
+ get_random_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents));
+
+ ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents);
+
+ for_each_possible_cpu(cpu) {
+ struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
+
+ INIT_LIST_HEAD(&ul->head);
+ spin_lock_init(&ul->lock);
+ }
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
if (!ip_rt_acct)
panic("IP: failed to allocate ip_rt_acct\n");
- memset(ip_rt_acct, 0, PAGE_SIZE << order);
- }
#endif
- ipv4_dst_ops.kmem_cachep =
- kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
-
- rt_hash_table = (struct rt_hash_bucket *)
- alloc_large_system_hash("IP route cache",
- sizeof(struct rt_hash_bucket),
- rhash_entries,
- (num_physpages >= 128 * 1024) ?
- 15 : 17,
- 0,
- &rt_hash_log,
- &rt_hash_mask,
- 0);
- memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
- rt_hash_lock_init();
-
- ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
- ip_rt_max_size = (rt_hash_mask + 1) * 16;
+ ipv4_dst_ops.kmem_cachep = KMEM_CACHE(rtable,
+ SLAB_HWCACHE_ALIGN | SLAB_PANIC);
- devinet_init();
- ip_fib_init();
+ ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
- init_timer(&rt_flush_timer);
- rt_flush_timer.function = rt_run_flush;
- init_timer(&rt_periodic_timer);
- rt_periodic_timer.function = rt_check_expire;
- init_timer(&rt_secret_timer);
- rt_secret_timer.function = rt_secret_rebuild;
+ if (dst_entries_init(&ipv4_dst_ops) < 0)
+ panic("IP: failed to allocate ipv4_dst_ops counter\n");
- /* All the timers, started at system startup tend
- to synchronize. Perturb it a bit.
- */
- rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
- ip_rt_gc_interval;
- add_timer(&rt_periodic_timer);
+ if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
+ panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
- rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
- ip_rt_secret_interval;
- add_timer(&rt_secret_timer);
+ ipv4_dst_ops.gc_thresh = ~0;
+ ip_rt_max_size = INT_MAX;
-#ifdef CONFIG_PROC_FS
- {
- struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
- if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
- !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
- proc_net_stat))) {
- return -ENOMEM;
- }
- rtstat_pde->proc_fops = &rt_cpu_seq_fops;
- }
-#ifdef CONFIG_NET_CLS_ROUTE
- create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
-#endif
-#endif
+ devinet_init();
+ ip_fib_init();
+
+ if (ip_rt_proc_init())
+ pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
xfrm_init();
xfrm4_init();
#endif
- return rc;
+ rtnl_register_many(ip_rt_rtnl_msg_handlers);
+
+#ifdef CONFIG_SYSCTL
+ register_pernet_subsys(&sysctl_route_ops);
+#endif
+ register_pernet_subsys(&ip_rt_ops);
+ register_pernet_subsys(&rt_genid_ops);
+ register_pernet_subsys(&ipv4_inetpeer_ops);
+ return 0;
}
-EXPORT_SYMBOL(__ip_select_ident);
-EXPORT_SYMBOL(ip_route_input);
-EXPORT_SYMBOL(ip_route_output_key);
+#ifdef CONFIG_SYSCTL
+/*
+ * We really need to sanitize the damn ipv4 init order, then all
+ * this nonsense will go away.
+ */
+void __init ip_static_sysctl_init(void)
+{
+ register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
+}
+#endif
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 6b19530905af..569befcf021b 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -1,70 +1,101 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Syncookies implementation for the Linux kernel
*
* Copyright (C) 1997 Andi Kleen
- * Based on ideas by D.J.Bernstein and Eric Schenk.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * $Id: syncookies.c,v 1.18 2002/02/01 22:01:04 davem Exp $
- *
- * Missing: IPv6 support.
+ * Based on ideas by D.J.Bernstein and Eric Schenk.
*/
#include <linux/tcp.h>
-#include <linux/slab.h>
-#include <linux/random.h>
-#include <linux/cryptohash.h>
+#include <linux/siphash.h>
#include <linux/kernel.h>
+#include <linux/export.h>
+#include <net/secure_seq.h>
#include <net/tcp.h>
+#include <net/tcp_ecn.h>
+#include <net/route.h>
-extern int sysctl_tcp_syncookies;
-
-static __u32 syncookie_secret[2][16-3+SHA_DIGEST_WORDS];
-
-static __init int init_syncookies(void)
-{
- get_random_bytes(syncookie_secret, sizeof(syncookie_secret));
- return 0;
-}
-module_init(init_syncookies);
+static siphash_aligned_key_t syncookie_secret[2];
#define COOKIEBITS 24 /* Upper bits store count */
#define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1)
+/* TCP Timestamp: 6 lowest bits of timestamp sent in the cookie SYN-ACK
+ * stores TCP options:
+ *
+ * MSB LSB
+ * | 31 ... 6 | 5 | 4 | 3 2 1 0 |
+ * | Timestamp | ECN | SACK | WScale |
+ *
+ * When we receive a valid cookie-ACK, we look at the echoed tsval (if
+ * any) to figure out which TCP options we should use for the rebuilt
+ * connection.
+ *
+ * A WScale setting of '0xf' (which is an invalid scaling value)
+ * means that original syn did not include the TCP window scaling option.
+ */
+#define TS_OPT_WSCALE_MASK 0xf
+#define TS_OPT_SACK BIT(4)
+#define TS_OPT_ECN BIT(5)
+/* There is no TS_OPT_TIMESTAMP:
+ * if ACK contains timestamp option, we already know it was
+ * requested/supported by the syn/synack exchange.
+ */
+#define TSBITS 6
+
static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
u32 count, int c)
{
- __u32 tmp[16 + 5 + SHA_WORKSPACE_WORDS];
-
- memcpy(tmp + 3, syncookie_secret[c], sizeof(syncookie_secret[c]));
- tmp[0] = (__force u32)saddr;
- tmp[1] = (__force u32)daddr;
- tmp[2] = ((__force u32)sport << 16) + (__force u32)dport;
- tmp[3] = count;
- sha_transform(tmp + 16, (__u8 *)tmp, tmp + 16 + 5);
+ net_get_random_once(syncookie_secret, sizeof(syncookie_secret));
+ return siphash_4u32((__force u32)saddr, (__force u32)daddr,
+ (__force u32)sport << 16 | (__force u32)dport,
+ count, &syncookie_secret[c]);
+}
- return tmp[17];
+/*
+ * when syncookies are in effect and tcp timestamps are enabled we encode
+ * tcp options in the lower bits of the timestamp value that will be
+ * sent in the syn-ack.
+ * Since subsequent timestamps use the normal tcp_time_stamp value, we
+ * must make sure that the resulting initial timestamp is <= tcp_time_stamp.
+ */
+u64 cookie_init_timestamp(struct request_sock *req, u64 now)
+{
+ const struct inet_request_sock *ireq = inet_rsk(req);
+ u64 ts, ts_now = tcp_ns_to_ts(false, now);
+ u32 options = 0;
+
+ options = ireq->wscale_ok ? ireq->snd_wscale : TS_OPT_WSCALE_MASK;
+ if (ireq->sack_ok)
+ options |= TS_OPT_SACK;
+ if (ireq->ecn_ok)
+ options |= TS_OPT_ECN;
+
+ ts = (ts_now >> TSBITS) << TSBITS;
+ ts |= options;
+ if (ts > ts_now)
+ ts -= (1UL << TSBITS);
+
+ if (tcp_rsk(req)->req_usec_ts)
+ return ts * NSEC_PER_USEC;
+ return ts * NSEC_PER_MSEC;
}
+
static __u32 secure_tcp_syn_cookie(__be32 saddr, __be32 daddr, __be16 sport,
- __be16 dport, __u32 sseq, __u32 count,
- __u32 data)
+ __be16 dport, __u32 sseq, __u32 data)
{
/*
* Compute the secure sequence number.
* The output should be:
- * HASH(sec1,saddr,sport,daddr,dport,sec1) + sseq + (count * 2^24)
+ * HASH(sec1,saddr,sport,daddr,dport,sec1) + sseq + (count * 2^24)
* + (HASH(sec2,saddr,sport,daddr,dport,count,sec2) % 2^24).
* Where sseq is their sequence number and count increases every
* minute by 1.
* As an extra hack, we add a small "data" value that encodes the
* MSS into the second hash value.
*/
-
+ u32 count = tcp_cookie_time();
return (cookie_hash(saddr, daddr, sport, dport, 0, 0) +
sseq + (count << COOKIEBITS) +
((cookie_hash(saddr, daddr, sport, dport, count, 1) + data)
@@ -76,22 +107,21 @@ static __u32 secure_tcp_syn_cookie(__be32 saddr, __be32 daddr, __be16 sport,
* If the syncookie is bad, the data returned will be out of
* range. This must be checked by the caller.
*
- * The count value used to generate the cookie must be within
- * "maxdiff" if the current (passed-in) "count". The return value
- * is (__u32)-1 if this test fails.
+ * The count value used to generate the cookie must be less than
+ * MAX_SYNCOOKIE_AGE minutes in the past.
+ * The return value (__u32)-1 if this test fails.
*/
static __u32 check_tcp_syn_cookie(__u32 cookie, __be32 saddr, __be32 daddr,
- __be16 sport, __be16 dport, __u32 sseq,
- __u32 count, __u32 maxdiff)
+ __be16 sport, __be16 dport, __u32 sseq)
{
- __u32 diff;
+ u32 diff, count = tcp_cookie_time();
/* Strip away the layers from the cookie */
cookie -= cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq;
/* Cookie is now reduced to (count * 2^24) ^ (hash % 2^24) */
- diff = (count - (cookie >> COOKIEBITS)) & ((__u32) - 1 >> COOKIEBITS);
- if (diff >= maxdiff)
+ diff = (count - (cookie >> COOKIEBITS)) & ((__u32) -1 >> COOKIEBITS);
+ if (diff >= MAX_SYNCOOKIE_AGE)
return (__u32)-1;
return (cookie -
@@ -99,185 +129,379 @@ static __u32 check_tcp_syn_cookie(__u32 cookie, __be32 saddr, __be32 daddr,
& COOKIEMASK; /* Leaving the data behind */
}
-/*
- * This table has to be sorted and terminated with (__u16)-1.
- * XXX generate a better table.
- * Unresolved Issues: HIPPI with a 64k MSS is not well supported.
+/*
+ * MSS Values are chosen based on the 2011 paper
+ * 'An Analysis of TCP Maximum Segement Sizes' by S. Alcock and R. Nelson.
+ * Values ..
+ * .. lower than 536 are rare (< 0.2%)
+ * .. between 537 and 1299 account for less than < 1.5% of observed values
+ * .. in the 1300-1349 range account for about 15 to 20% of observed mss values
+ * .. exceeding 1460 are very rare (< 0.04%)
+ *
+ * 1460 is the single most frequently announced mss value (30 to 46% depending
+ * on monitor location). Table must be sorted.
*/
static __u16 const msstab[] = {
- 64 - 1,
- 256 - 1,
- 512 - 1,
- 536 - 1,
- 1024 - 1,
- 1440 - 1,
- 1460 - 1,
- 4312 - 1,
- (__u16)-1
+ 536,
+ 1300,
+ 1440, /* 1440, 1452: PPPoE */
+ 1460,
};
-/* The number doesn't include the -1 terminator */
-#define NUM_MSS (ARRAY_SIZE(msstab) - 1)
/*
* Generate a syncookie. mssp points to the mss, which is returned
* rounded down to the value encoded in the cookie.
*/
-__u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp)
+u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th,
+ u16 *mssp)
{
- struct tcp_sock *tp = tcp_sk(sk);
int mssind;
const __u16 mss = *mssp;
-
- tp->last_synq_overflow = jiffies;
+ for (mssind = ARRAY_SIZE(msstab) - 1; mssind ; mssind--)
+ if (mss >= msstab[mssind])
+ break;
+ *mssp = msstab[mssind];
- /* XXX sort msstab[] by probability? Binary search? */
- for (mssind = 0; mss > msstab[mssind + 1]; mssind++)
- ;
- *mssp = msstab[mssind] + 1;
+ return secure_tcp_syn_cookie(iph->saddr, iph->daddr,
+ th->source, th->dest, ntohl(th->seq),
+ mssind);
+}
+EXPORT_SYMBOL_GPL(__cookie_v4_init_sequence);
- NET_INC_STATS_BH(LINUX_MIB_SYNCOOKIESSENT);
+__u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mssp)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+ const struct tcphdr *th = tcp_hdr(skb);
- return secure_tcp_syn_cookie(skb->nh.iph->saddr, skb->nh.iph->daddr,
- skb->h.th->source, skb->h.th->dest,
- ntohl(skb->h.th->seq),
- jiffies / (HZ * 60), mssind);
+ return __cookie_v4_init_sequence(iph, th, mssp);
}
-/*
- * This (misnamed) value is the age of syncookie which is permitted.
- * Its ideal value should be dependent on TCP_TIMEOUT_INIT and
- * sysctl_tcp_retries1. It's a rather complicated formula (exponential
- * backoff) to compute at runtime so it's currently hardcoded here.
- */
-#define COUNTER_TRIES 4
-/*
- * Check if a ack sequence number is a valid syncookie.
+/*
+ * Check if a ack sequence number is a valid syncookie.
* Return the decoded mss if it is, or 0 if not.
*/
-static inline int cookie_check(struct sk_buff *skb, __u32 cookie)
+int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th)
{
- __u32 seq;
+ __u32 cookie = ntohl(th->ack_seq) - 1;
+ __u32 seq = ntohl(th->seq) - 1;
__u32 mssind;
- seq = ntohl(skb->h.th->seq)-1;
- mssind = check_tcp_syn_cookie(cookie,
- skb->nh.iph->saddr, skb->nh.iph->daddr,
- skb->h.th->source, skb->h.th->dest,
- seq, jiffies / (HZ * 60), COUNTER_TRIES);
+ mssind = check_tcp_syn_cookie(cookie, iph->saddr, iph->daddr,
+ th->source, th->dest, seq);
- return mssind < NUM_MSS ? msstab[mssind] + 1 : 0;
+ return mssind < ARRAY_SIZE(msstab) ? msstab[mssind] : 0;
}
+EXPORT_SYMBOL_GPL(__cookie_v4_check);
-static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
- struct request_sock *req,
- struct dst_entry *dst)
+struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req,
+ struct dst_entry *dst)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct sock *child;
+ bool own_req;
- child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst);
- if (child)
- inet_csk_reqsk_queue_add(sk, req, child);
- else
+ child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst,
+ NULL, &own_req);
+ if (child) {
+ refcount_set(&req->rsk_refcnt, 1);
+ sock_rps_save_rxhash(child, skb);
+
+ if (rsk_drop_req(req)) {
+ reqsk_put(req);
+ return child;
+ }
+
+ if (inet_csk_reqsk_queue_add(sk, req, child))
+ return child;
+
+ bh_unlock_sock(child);
+ sock_put(child);
+ }
+ __reqsk_free(req);
+
+ return NULL;
+}
+EXPORT_IPV6_MOD(tcp_get_cookie_sock);
+
+/*
+ * when syncookies are in effect and tcp timestamps are enabled we stored
+ * additional tcp options in the timestamp.
+ * This extracts these options from the timestamp echo.
+ *
+ * return false if we decode a tcp option that is disabled
+ * on the host.
+ */
+bool cookie_timestamp_decode(const struct net *net,
+ struct tcp_options_received *tcp_opt)
+{
+ /* echoed timestamp, lowest bits contain options */
+ u32 options = tcp_opt->rcv_tsecr;
+
+ if (!tcp_opt->saw_tstamp) {
+ tcp_clear_options(tcp_opt);
+ return true;
+ }
+
+ if (!READ_ONCE(net->ipv4.sysctl_tcp_timestamps))
+ return false;
+
+ tcp_opt->sack_ok = (options & TS_OPT_SACK) ? TCP_SACK_SEEN : 0;
+
+ if (tcp_opt->sack_ok && !READ_ONCE(net->ipv4.sysctl_tcp_sack))
+ return false;
+
+ if ((options & TS_OPT_WSCALE_MASK) == TS_OPT_WSCALE_MASK)
+ return true; /* no window scaling */
+
+ tcp_opt->wscale_ok = 1;
+ tcp_opt->snd_wscale = options & TS_OPT_WSCALE_MASK;
+
+ return READ_ONCE(net->ipv4.sysctl_tcp_window_scaling) != 0;
+}
+EXPORT_IPV6_MOD(cookie_timestamp_decode);
+
+static int cookie_tcp_reqsk_init(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req)
+{
+ struct inet_request_sock *ireq = inet_rsk(req);
+ struct tcp_request_sock *treq = tcp_rsk(req);
+ const struct tcphdr *th = tcp_hdr(skb);
+
+ req->num_retrans = 0;
+
+ ireq->ir_num = ntohs(th->dest);
+ ireq->ir_rmt_port = th->source;
+ ireq->ir_iif = inet_request_bound_dev_if(sk, skb);
+ ireq->ir_mark = inet_request_mark(sk, skb);
+
+ if (IS_ENABLED(CONFIG_SMC))
+ ireq->smc_ok = 0;
+
+ treq->snt_synack = 0;
+ treq->snt_tsval_first = 0;
+ treq->tfo_listener = false;
+ treq->txhash = net_tx_rndhash();
+ treq->rcv_isn = ntohl(th->seq) - 1;
+ treq->snt_isn = ntohl(th->ack_seq) - 1;
+ treq->syn_tos = TCP_SKB_CB(skb)->ip_dsfield;
+ treq->req_usec_ts = false;
+
+#if IS_ENABLED(CONFIG_MPTCP)
+ treq->is_mptcp = sk_is_mptcp(sk);
+ if (treq->is_mptcp)
+ return mptcp_subflow_init_cookie_req(req, sk, skb);
+#endif
+
+ return 0;
+}
+
+#if IS_ENABLED(CONFIG_BPF)
+struct request_sock *cookie_bpf_check(struct sock *sk, struct sk_buff *skb)
+{
+ struct request_sock *req = inet_reqsk(skb->sk);
+
+ skb->sk = NULL;
+ skb->destructor = NULL;
+
+ if (cookie_tcp_reqsk_init(sk, skb, req)) {
reqsk_free(req);
+ req = NULL;
+ }
- return child;
+ return req;
}
+EXPORT_IPV6_MOD_GPL(cookie_bpf_check);
+#endif
-struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
- struct ip_options *opt)
+struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops,
+ struct sock *sk, struct sk_buff *skb,
+ struct tcp_options_received *tcp_opt,
+ int mss, u32 tsoff)
{
struct inet_request_sock *ireq;
struct tcp_request_sock *treq;
- struct tcp_sock *tp = tcp_sk(sk);
- __u32 cookie = ntohl(skb->h.th->ack_seq) - 1;
- struct sock *ret = sk;
- struct request_sock *req;
- int mss;
- struct rtable *rt;
- __u8 rcv_wscale;
+ struct request_sock *req;
+
+ if (sk_is_mptcp(sk))
+ req = mptcp_subflow_reqsk_alloc(ops, sk, false);
+ else
+ req = inet_reqsk_alloc(ops, sk, false);
- if (!sysctl_tcp_syncookies || !skb->h.th->ack)
+ if (!req)
+ return NULL;
+
+ if (cookie_tcp_reqsk_init(sk, skb, req)) {
+ reqsk_free(req);
+ return NULL;
+ }
+
+ ireq = inet_rsk(req);
+ treq = tcp_rsk(req);
+
+ req->mss = mss;
+ req->ts_recent = tcp_opt->saw_tstamp ? tcp_opt->rcv_tsval : 0;
+
+ ireq->snd_wscale = tcp_opt->snd_wscale;
+ ireq->tstamp_ok = tcp_opt->saw_tstamp;
+ ireq->sack_ok = tcp_opt->sack_ok;
+ ireq->wscale_ok = tcp_opt->wscale_ok;
+ ireq->ecn_ok = !!(tcp_opt->rcv_tsecr & TS_OPT_ECN);
+
+ treq->ts_off = tsoff;
+
+ return req;
+}
+EXPORT_IPV6_MOD_GPL(cookie_tcp_reqsk_alloc);
+
+static struct request_sock *cookie_tcp_check(struct net *net, struct sock *sk,
+ struct sk_buff *skb)
+{
+ struct tcp_options_received tcp_opt;
+ u32 tsoff = 0;
+ int mss;
+
+ if (tcp_synq_no_recent_overflow(sk))
goto out;
- if (time_after(jiffies, tp->last_synq_overflow + TCP_TIMEOUT_INIT) ||
- (mss = cookie_check(skb, cookie)) == 0) {
- NET_INC_STATS_BH(LINUX_MIB_SYNCOOKIESFAILED);
+ mss = __cookie_v4_check(ip_hdr(skb), tcp_hdr(skb));
+ if (!mss) {
+ __NET_INC_STATS(net, LINUX_MIB_SYNCOOKIESFAILED);
goto out;
}
- NET_INC_STATS_BH(LINUX_MIB_SYNCOOKIESRECV);
+ __NET_INC_STATS(net, LINUX_MIB_SYNCOOKIESRECV);
- ret = NULL;
- req = reqsk_alloc(&tcp_request_sock_ops); /* for safety */
- if (!req)
+ /* check for timestamp cookie support */
+ memset(&tcp_opt, 0, sizeof(tcp_opt));
+ tcp_parse_options(net, skb, &tcp_opt, 0, NULL);
+
+ if (tcp_opt.saw_tstamp && tcp_opt.rcv_tsecr) {
+ tsoff = secure_tcp_ts_off(net,
+ ip_hdr(skb)->daddr,
+ ip_hdr(skb)->saddr);
+ tcp_opt.rcv_tsecr -= tsoff;
+ }
+
+ if (!cookie_timestamp_decode(net, &tcp_opt))
goto out;
- if (security_inet_conn_request(sk, skb, req)) {
- reqsk_free(req);
+ return cookie_tcp_reqsk_alloc(&tcp_request_sock_ops, sk, skb,
+ &tcp_opt, mss, tsoff);
+out:
+ return ERR_PTR(-EINVAL);
+}
+
+/* On input, sk is a listener.
+ * Output is listener if incoming packet would not create a child
+ * NULL if memory could not be allocated.
+ */
+struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
+{
+ struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt;
+ const struct tcphdr *th = tcp_hdr(skb);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_request_sock *ireq;
+ struct net *net = sock_net(sk);
+ struct tcp_request_sock *treq;
+ struct request_sock *req;
+ struct sock *ret = sk;
+ struct flowi4 fl4;
+ struct rtable *rt;
+ __u8 rcv_wscale;
+ int full_space;
+ SKB_DR(reason);
+
+ if (!READ_ONCE(net->ipv4.sysctl_tcp_syncookies) ||
+ !th->ack || th->rst)
goto out;
+
+ if (cookie_bpf_ok(skb)) {
+ req = cookie_bpf_check(sk, skb);
+ } else {
+ req = cookie_tcp_check(net, sk, skb);
+ if (IS_ERR(req))
+ goto out;
+ }
+ if (!req) {
+ SKB_DR_SET(reason, NO_SOCKET);
+ goto out_drop;
}
+
ireq = inet_rsk(req);
treq = tcp_rsk(req);
- treq->rcv_isn = ntohl(skb->h.th->seq) - 1;
- treq->snt_isn = cookie;
- req->mss = mss;
- ireq->rmt_port = skb->h.th->source;
- ireq->loc_addr = skb->nh.iph->daddr;
- ireq->rmt_addr = skb->nh.iph->saddr;
- ireq->opt = NULL;
+
+ sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
+ sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
/* We throwed the options of the initial SYN away, so we hope
* the ACK carries the same options again (see RFC1122 4.2.3.8)
*/
- if (opt && opt->optlen) {
- int opt_size = sizeof(struct ip_options) + opt->optlen;
+ RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
- ireq->opt = kmalloc(opt_size, GFP_ATOMIC);
- if (ireq->opt != NULL && ip_options_echo(ireq->opt, skb)) {
- kfree(ireq->opt);
- ireq->opt = NULL;
- }
+ if (security_inet_conn_request(sk, skb, req)) {
+ SKB_DR_SET(reason, SECURITY_HOOK);
+ goto out_free;
}
- ireq->snd_wscale = ireq->rcv_wscale = ireq->tstamp_ok = 0;
- ireq->wscale_ok = ireq->sack_ok = 0;
- req->expires = 0UL;
- req->retrans = 0;
-
+ tcp_ao_syncookie(sk, skb, req, AF_INET);
+
/*
* We need to lookup the route here to get at the correct
* window size. We should better make sure that the window size
* hasn't changed since we received the original syn, but I see
- * no easy way to do this.
+ * no easy way to do this.
*/
- {
- struct flowi fl = { .nl_u = { .ip4_u =
- { .daddr = ((opt && opt->srr) ?
- opt->faddr :
- ireq->rmt_addr),
- .saddr = ireq->loc_addr,
- .tos = RT_CONN_FLAGS(sk) } },
- .proto = IPPROTO_TCP,
- .uli_u = { .ports =
- { .sport = skb->h.th->dest,
- .dport = skb->h.th->source } } };
- security_req_classify_flow(req, &fl);
- if (ip_route_output_key(&rt, &fl)) {
- reqsk_free(req);
- goto out;
- }
+ flowi4_init_output(&fl4, ireq->ir_iif, ireq->ir_mark,
+ ip_sock_rt_tos(sk), ip_sock_rt_scope(sk),
+ IPPROTO_TCP, inet_sk_flowi_flags(sk),
+ opt->srr ? opt->faddr : ireq->ir_rmt_addr,
+ ireq->ir_loc_addr, th->source, th->dest,
+ sk_uid(sk));
+ security_req_classify_flow(req, flowi4_to_flowi_common(&fl4));
+ rt = ip_route_output_key(net, &fl4);
+ if (IS_ERR(rt)) {
+ SKB_DR_SET(reason, IP_OUTNOROUTES);
+ goto out_free;
}
/* Try to redo what tcp_v4_send_synack did. */
- req->window_clamp = dst_metric(&rt->u.dst, RTAX_WINDOW);
- tcp_select_initial_window(tcp_full_space(sk), req->mss,
- &req->rcv_wnd, &req->window_clamp,
- 0, &rcv_wscale);
- /* BTW win scale with syncookies is 0 by definition */
- ireq->rcv_wscale = rcv_wscale;
-
- ret = get_cookie_sock(sk, skb, req, &rt->u.dst);
-out: return ret;
+ req->rsk_window_clamp = READ_ONCE(tp->window_clamp) ? :
+ dst_metric(&rt->dst, RTAX_WINDOW);
+ /* limit the window selection if the user enforce a smaller rx buffer */
+ full_space = tcp_full_space(sk);
+ if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
+ (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0))
+ req->rsk_window_clamp = full_space;
+
+ tcp_select_initial_window(sk, full_space, req->mss,
+ &req->rsk_rcv_wnd, &req->rsk_window_clamp,
+ ireq->wscale_ok, &rcv_wscale,
+ dst_metric(&rt->dst, RTAX_INITRWND));
+
+ /* req->syncookie is set true only if ACK is validated
+ * by BPF kfunc, then, rcv_wscale is already configured.
+ */
+ if (!req->syncookie)
+ ireq->rcv_wscale = rcv_wscale;
+ ireq->ecn_ok &= cookie_ecn_ok(net, &rt->dst);
+ treq->accecn_ok = ireq->ecn_ok && cookie_accecn_ok(th);
+
+ ret = tcp_get_cookie_sock(sk, skb, req, &rt->dst);
+ /* ip_queue_xmit() depends on our flow being setup
+ * Normal sockets get it right from inet_csk_route_child_sock()
+ */
+ if (!ret) {
+ SKB_DR_SET(reason, NO_SOCKET);
+ goto out_drop;
+ }
+ inet_sk(ret)->cork.fl.u.ip4 = fl4;
+out:
+ return ret;
+out_free:
+ reqsk_free(req);
+out_drop:
+ sk_skb_reason_drop(sk, skb, reason);
+ return NULL;
}
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index dfcf47f10f88..a1a50a5c80dc 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -1,157 +1,261 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* sysctl_net_ipv4.c: sysctl interface to net IPV4 subsystem.
*
- * $Id: sysctl_net_ipv4.c,v 1.50 2001/10/20 00:00:11 davem Exp $
- *
* Begun April 1, 1996, Mike Shaver.
* Added /proc/sys/net/ipv4 directory entry (empty =) ). [MS]
*/
-#include <linux/mm.h>
-#include <linux/module.h>
#include <linux/sysctl.h>
-#include <linux/igmp.h>
-#include <linux/inetdevice.h>
-#include <net/snmp.h>
+#include <linux/seqlock.h>
+#include <linux/init.h>
+#include <linux/slab.h>
#include <net/icmp.h>
#include <net/ip.h>
-#include <net/route.h>
+#include <net/ip_fib.h>
#include <net/tcp.h>
+#include <net/udp.h>
#include <net/cipso_ipv4.h>
+#include <net/ping.h>
+#include <net/protocol.h>
+#include <net/netevent.h>
-/* From af_inet.c */
-extern int sysctl_ip_nonlocal_bind;
-
-#ifdef CONFIG_SYSCTL
-static int zero;
-static int tcp_retr1_max = 255;
+static int tcp_retr1_max = 255;
static int ip_local_port_range_min[] = { 1, 1 };
static int ip_local_port_range_max[] = { 65535, 65535 };
-#endif
+static int tcp_adv_win_scale_min = -31;
+static int tcp_adv_win_scale_max = 31;
+static int tcp_app_win_max = 31;
+static int tcp_min_snd_mss_min = TCP_MIN_SND_MSS;
+static int tcp_min_snd_mss_max = 65535;
+static int tcp_rto_max_max = TCP_RTO_MAX_SEC * MSEC_PER_SEC;
+static int ip_privileged_port_min;
+static int ip_privileged_port_max = 65535;
+static int ip_ttl_min = 1;
+static int ip_ttl_max = 255;
+static int tcp_syn_retries_min = 1;
+static int tcp_syn_retries_max = MAX_TCP_SYNCNT;
+static int tcp_syn_linear_timeouts_max = MAX_TCP_SYNCNT;
+static unsigned long ip_ping_group_range_min[] = { 0, 0 };
+static unsigned long ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX };
+static u32 u32_max_div_HZ = UINT_MAX / HZ;
+static int one_day_secs = 24 * 3600;
+static u32 fib_multipath_hash_fields_all_mask __maybe_unused =
+ FIB_MULTIPATH_HASH_FIELD_ALL_MASK;
+static unsigned int tcp_child_ehash_entries_max = 16 * 1024 * 1024;
+static unsigned int udp_child_hash_entries_max = UDP_HTABLE_SIZE_MAX;
+static int tcp_plb_max_rounds = 31;
+static int tcp_plb_max_cong_thresh = 256;
+static unsigned int tcp_tw_reuse_delay_max = TCP_PAWS_MSL * MSEC_PER_SEC;
+static int tcp_ecn_mode_max = 2;
+static u32 icmp_errors_extension_mask_all =
+ GENMASK_U8(ICMP_ERR_EXT_COUNT - 1, 0);
-struct ipv4_config ipv4_config;
+/* obsolete */
+static int sysctl_tcp_low_latency __read_mostly;
-#ifdef CONFIG_SYSCTL
+/* Update system visible IP port range */
+static void set_local_port_range(struct net *net, unsigned int low, unsigned int high)
+{
+ bool same_parity = !((low ^ high) & 1);
-static
-int ipv4_sysctl_forward(ctl_table *ctl, int write, struct file * filp,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ if (same_parity && !net->ipv4.ip_local_ports.warned) {
+ net->ipv4.ip_local_ports.warned = true;
+ pr_err_ratelimited("ip_local_port_range: prefer different parity for start/end values.\n");
+ }
+ WRITE_ONCE(net->ipv4.ip_local_ports.range, high << 16 | low);
+}
+
+/* Validate changes from /proc interface. */
+static int ipv4_local_port_range(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
- int val = ipv4_devconf.forwarding;
+ struct net *net = table->data;
int ret;
+ int range[2];
+ struct ctl_table tmp = {
+ .data = &range,
+ .maxlen = sizeof(range),
+ .mode = table->mode,
+ .extra1 = &ip_local_port_range_min,
+ .extra2 = &ip_local_port_range_max,
+ };
- ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
+ inet_get_local_port_range(net, &range[0], &range[1]);
- if (write && ipv4_devconf.forwarding != val)
- inet_forward_change();
+ ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+
+ if (write && ret == 0) {
+ /* Ensure that the upper limit is not smaller than the lower,
+ * and that the lower does not encroach upon the privileged
+ * port limit.
+ */
+ if ((range[1] < range[0]) ||
+ (range[0] < READ_ONCE(net->ipv4.sysctl_ip_prot_sock)))
+ ret = -EINVAL;
+ else
+ set_local_port_range(net, range[0], range[1]);
+ }
return ret;
}
-static int ipv4_sysctl_forward_strategy(ctl_table *table,
- int __user *name, int nlen,
- void __user *oldval, size_t __user *oldlenp,
- void __user *newval, size_t newlen,
- void **context)
+/* Validate changes from /proc interface. */
+static int ipv4_privileged_ports(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
- int *valp = table->data;
- int new;
+ struct net *net = container_of(table->data, struct net,
+ ipv4.sysctl_ip_prot_sock);
+ int ret;
+ int pports;
+ int range[2];
+ struct ctl_table tmp = {
+ .data = &pports,
+ .maxlen = sizeof(pports),
+ .mode = table->mode,
+ .extra1 = &ip_privileged_port_min,
+ .extra2 = &ip_privileged_port_max,
+ };
+
+ pports = READ_ONCE(net->ipv4.sysctl_ip_prot_sock);
- if (!newval || !newlen)
- return 0;
+ ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+
+ if (write && ret == 0) {
+ inet_get_local_port_range(net, &range[0], &range[1]);
+ /* Ensure that the local port range doesn't overlap with the
+ * privileged port range.
+ */
+ if (range[0] < pports)
+ ret = -EINVAL;
+ else
+ WRITE_ONCE(net->ipv4.sysctl_ip_prot_sock, pports);
+ }
+
+ return ret;
+}
- if (newlen != sizeof(int))
- return -EINVAL;
+static void inet_get_ping_group_range_table(const struct ctl_table *table,
+ kgid_t *low, kgid_t *high)
+{
+ kgid_t *data = table->data;
+ struct net *net =
+ container_of(table->data, struct net, ipv4.ping_group_range.range);
+ unsigned int seq;
+ do {
+ seq = read_seqbegin(&net->ipv4.ping_group_range.lock);
- if (get_user(new, (int __user *)newval))
- return -EFAULT;
+ *low = data[0];
+ *high = data[1];
+ } while (read_seqretry(&net->ipv4.ping_group_range.lock, seq));
+}
- if (new == *valp)
- return 0;
+/* Update system visible IP port range */
+static void set_ping_group_range(const struct ctl_table *table,
+ kgid_t low, kgid_t high)
+{
+ kgid_t *data = table->data;
+ struct net *net =
+ container_of(table->data, struct net, ipv4.ping_group_range.range);
+ write_seqlock(&net->ipv4.ping_group_range.lock);
+ data[0] = low;
+ data[1] = high;
+ write_sequnlock(&net->ipv4.ping_group_range.lock);
+}
- if (oldval && oldlenp) {
- size_t len;
+/* Validate changes from /proc interface. */
+static int ipv4_ping_group_range(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct user_namespace *user_ns = current_user_ns();
+ int ret;
+ unsigned long urange[2];
+ kgid_t low, high;
+ struct ctl_table tmp = {
+ .data = &urange,
+ .maxlen = sizeof(urange),
+ .mode = table->mode,
+ .extra1 = &ip_ping_group_range_min,
+ .extra2 = &ip_ping_group_range_max,
+ };
- if (get_user(len, oldlenp))
- return -EFAULT;
+ inet_get_ping_group_range_table(table, &low, &high);
+ urange[0] = from_kgid_munged(user_ns, low);
+ urange[1] = from_kgid_munged(user_ns, high);
+ ret = proc_doulongvec_minmax(&tmp, write, buffer, lenp, ppos);
- if (len) {
- if (len > table->maxlen)
- len = table->maxlen;
- if (copy_to_user(oldval, valp, len))
- return -EFAULT;
- if (put_user(len, oldlenp))
- return -EFAULT;
+ if (write && ret == 0) {
+ low = make_kgid(user_ns, urange[0]);
+ high = make_kgid(user_ns, urange[1]);
+ if (!gid_valid(low) || !gid_valid(high))
+ return -EINVAL;
+ if (urange[1] < urange[0] || gid_lt(high, low)) {
+ low = make_kgid(&init_user_ns, 1);
+ high = make_kgid(&init_user_ns, 0);
}
+ set_ping_group_range(table, low, high);
}
- *valp = new;
- inet_forward_change();
- return 1;
+ return ret;
}
-static int proc_tcp_congestion_control(ctl_table *ctl, int write, struct file * filp,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+static int ipv4_fwd_update_priority(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
- char val[TCP_CA_NAME_MAX];
- ctl_table tbl = {
- .data = val,
- .maxlen = TCP_CA_NAME_MAX,
- };
+ struct net *net;
int ret;
- tcp_get_default_congestion_control(val);
-
- ret = proc_dostring(&tbl, write, filp, buffer, lenp, ppos);
+ net = container_of(table->data, struct net,
+ ipv4.sysctl_ip_fwd_update_priority);
+ ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos);
if (write && ret == 0)
- ret = tcp_set_default_congestion_control(val);
+ call_netevent_notifiers(NETEVENT_IPV4_FWD_UPDATE_PRIORITY_UPDATE,
+ net);
+
return ret;
}
-static int sysctl_tcp_congestion_control(ctl_table *table, int __user *name,
- int nlen, void __user *oldval,
- size_t __user *oldlenp,
- void __user *newval, size_t newlen,
- void **context)
+static int proc_tcp_congestion_control(const struct ctl_table *ctl, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
+ struct net *net = container_of(ctl->data, struct net,
+ ipv4.tcp_congestion_control);
char val[TCP_CA_NAME_MAX];
- ctl_table tbl = {
+ struct ctl_table tbl = {
.data = val,
.maxlen = TCP_CA_NAME_MAX,
};
int ret;
- tcp_get_default_congestion_control(val);
- ret = sysctl_string(&tbl, name, nlen, oldval, oldlenp, newval, newlen,
- context);
- if (ret == 0 && newval && newlen)
- ret = tcp_set_default_congestion_control(val);
+ tcp_get_default_congestion_control(net, val);
+
+ ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
+ if (write && ret == 0)
+ ret = tcp_set_default_congestion_control(net, val);
return ret;
}
-static int proc_tcp_available_congestion_control(ctl_table *ctl,
- int write, struct file * filp,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+static int proc_tcp_available_congestion_control(const struct ctl_table *ctl,
+ int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
{
- ctl_table tbl = { .maxlen = TCP_CA_BUF_MAX, };
+ struct ctl_table tbl = { .maxlen = TCP_CA_BUF_MAX, };
int ret;
tbl.data = kmalloc(tbl.maxlen, GFP_USER);
if (!tbl.data)
return -ENOMEM;
tcp_get_available_congestion_control(tbl.data, TCP_CA_BUF_MAX);
- ret = proc_dostring(&tbl, write, filp, buffer, lenp, ppos);
+ ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
kfree(tbl.data);
return ret;
}
-static int proc_allowed_congestion_control(ctl_table *ctl,
- int write, struct file * filp,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+static int proc_allowed_congestion_control(const struct ctl_table *ctl,
+ int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
{
- ctl_table tbl = { .maxlen = TCP_CA_BUF_MAX };
+ struct ctl_table tbl = { .maxlen = TCP_CA_BUF_MAX };
int ret;
tbl.data = kmalloc(tbl.maxlen, GFP_USER);
@@ -159,657 +263,1467 @@ static int proc_allowed_congestion_control(ctl_table *ctl,
return -ENOMEM;
tcp_get_allowed_congestion_control(tbl.data, tbl.maxlen);
- ret = proc_dostring(&tbl, write, filp, buffer, lenp, ppos);
+ ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
if (write && ret == 0)
ret = tcp_set_allowed_congestion_control(tbl.data);
kfree(tbl.data);
return ret;
}
-static int strategy_allowed_congestion_control(ctl_table *table, int __user *name,
- int nlen, void __user *oldval,
- size_t __user *oldlenp,
- void __user *newval, size_t newlen,
- void **context)
+static int sscanf_key(char *buf, __le32 *key)
+{
+ u32 user_key[4];
+ int i, ret = 0;
+
+ if (sscanf(buf, "%x-%x-%x-%x", user_key, user_key + 1,
+ user_key + 2, user_key + 3) != 4) {
+ ret = -EINVAL;
+ } else {
+ for (i = 0; i < ARRAY_SIZE(user_key); i++)
+ key[i] = cpu_to_le32(user_key[i]);
+ }
+ pr_debug("proc TFO key set 0x%x-%x-%x-%x <- 0x%s: %u\n",
+ user_key[0], user_key[1], user_key[2], user_key[3], buf, ret);
+
+ return ret;
+}
+
+static int proc_tcp_fastopen_key(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct net *net = container_of(table->data, struct net,
+ ipv4.sysctl_tcp_fastopen);
+ /* maxlen to print the list of keys in hex (*2), with dashes
+ * separating doublewords and a comma in between keys.
+ */
+ struct ctl_table tbl = { .maxlen = ((TCP_FASTOPEN_KEY_LENGTH *
+ 2 * TCP_FASTOPEN_KEY_MAX) +
+ (TCP_FASTOPEN_KEY_MAX * 5)) };
+ u32 user_key[TCP_FASTOPEN_KEY_BUF_LENGTH / sizeof(u32)];
+ __le32 key[TCP_FASTOPEN_KEY_BUF_LENGTH / sizeof(__le32)];
+ char *backup_data;
+ int ret, i = 0, off = 0, n_keys;
+
+ tbl.data = kmalloc(tbl.maxlen, GFP_KERNEL);
+ if (!tbl.data)
+ return -ENOMEM;
+
+ n_keys = tcp_fastopen_get_cipher(net, NULL, (u64 *)key);
+ if (!n_keys) {
+ memset(&key[0], 0, TCP_FASTOPEN_KEY_LENGTH);
+ n_keys = 1;
+ }
+
+ for (i = 0; i < n_keys * 4; i++)
+ user_key[i] = le32_to_cpu(key[i]);
+
+ for (i = 0; i < n_keys; i++) {
+ off += snprintf(tbl.data + off, tbl.maxlen - off,
+ "%08x-%08x-%08x-%08x",
+ user_key[i * 4],
+ user_key[i * 4 + 1],
+ user_key[i * 4 + 2],
+ user_key[i * 4 + 3]);
+
+ if (WARN_ON_ONCE(off >= tbl.maxlen - 1))
+ break;
+
+ if (i + 1 < n_keys)
+ off += snprintf(tbl.data + off, tbl.maxlen - off, ",");
+ }
+
+ ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
+
+ if (write && ret == 0) {
+ backup_data = strchr(tbl.data, ',');
+ if (backup_data) {
+ *backup_data = '\0';
+ backup_data++;
+ }
+ if (sscanf_key(tbl.data, key)) {
+ ret = -EINVAL;
+ goto bad_key;
+ }
+ if (backup_data) {
+ if (sscanf_key(backup_data, key + 4)) {
+ ret = -EINVAL;
+ goto bad_key;
+ }
+ }
+ tcp_fastopen_reset_cipher(net, NULL, key,
+ backup_data ? key + 4 : NULL);
+ }
+
+bad_key:
+ kfree(tbl.data);
+ return ret;
+}
+
+static int proc_tfo_blackhole_detect_timeout(const struct ctl_table *table,
+ int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ struct net *net = container_of(table->data, struct net,
+ ipv4.sysctl_tcp_fastopen_blackhole_timeout);
+ int ret;
+
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ if (write && ret == 0)
+ atomic_set(&net->ipv4.tfo_active_disable_times, 0);
+
+ return ret;
+}
+
+static int proc_tcp_available_ulp(const struct ctl_table *ctl,
+ int write, void *buffer, size_t *lenp,
+ loff_t *ppos)
{
- ctl_table tbl = { .maxlen = TCP_CA_BUF_MAX };
+ struct ctl_table tbl = { .maxlen = TCP_ULP_BUF_MAX, };
int ret;
tbl.data = kmalloc(tbl.maxlen, GFP_USER);
if (!tbl.data)
return -ENOMEM;
-
- tcp_get_available_congestion_control(tbl.data, tbl.maxlen);
- ret = sysctl_string(&tbl, name, nlen, oldval, oldlenp, newval, newlen,
- context);
- if (ret == 0 && newval && newlen)
- ret = tcp_set_allowed_congestion_control(tbl.data);
+ tcp_get_available_ulp(tbl.data, TCP_ULP_BUF_MAX);
+ ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
kfree(tbl.data);
return ret;
+}
+
+static int proc_tcp_ehash_entries(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct net *net = container_of(table->data, struct net,
+ ipv4.sysctl_tcp_child_ehash_entries);
+ struct inet_hashinfo *hinfo = net->ipv4.tcp_death_row.hashinfo;
+ int tcp_ehash_entries;
+ struct ctl_table tbl;
+
+ tcp_ehash_entries = hinfo->ehash_mask + 1;
+ /* A negative number indicates that the child netns
+ * shares the global ehash.
+ */
+ if (!net_eq(net, &init_net) && !hinfo->pernet)
+ tcp_ehash_entries *= -1;
+
+ memset(&tbl, 0, sizeof(tbl));
+ tbl.data = &tcp_ehash_entries;
+ tbl.maxlen = sizeof(int);
+
+ return proc_dointvec(&tbl, write, buffer, lenp, ppos);
}
-ctl_table ipv4_table[] = {
- {
- .ctl_name = NET_IPV4_TCP_TIMESTAMPS,
- .procname = "tcp_timestamps",
- .data = &sysctl_tcp_timestamps,
+static int proc_udp_hash_entries(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct net *net = container_of(table->data, struct net,
+ ipv4.sysctl_udp_child_hash_entries);
+ int udp_hash_entries;
+ struct ctl_table tbl;
+
+ udp_hash_entries = net->ipv4.udp_table->mask + 1;
+
+ /* A negative number indicates that the child netns
+ * shares the global udp_table.
+ */
+ if (!net_eq(net, &init_net) && net->ipv4.udp_table == &udp_table)
+ udp_hash_entries *= -1;
+
+ memset(&tbl, 0, sizeof(tbl));
+ tbl.data = &udp_hash_entries;
+ tbl.maxlen = sizeof(int);
+
+ return proc_dointvec(&tbl, write, buffer, lenp, ppos);
+}
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+static int proc_fib_multipath_hash_policy(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ struct net *net = container_of(table->data, struct net,
+ ipv4.sysctl_fib_multipath_hash_policy);
+ int ret;
+
+ ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos);
+ if (write && ret == 0)
+ call_netevent_notifiers(NETEVENT_IPV4_MPATH_HASH_UPDATE, net);
+
+ return ret;
+}
+
+static int proc_fib_multipath_hash_fields(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ struct net *net;
+ int ret;
+
+ net = container_of(table->data, struct net,
+ ipv4.sysctl_fib_multipath_hash_fields);
+ ret = proc_douintvec_minmax(table, write, buffer, lenp, ppos);
+ if (write && ret == 0)
+ call_netevent_notifiers(NETEVENT_IPV4_MPATH_HASH_UPDATE, net);
+
+ return ret;
+}
+
+static u32 proc_fib_multipath_hash_rand_seed __ro_after_init;
+
+static void proc_fib_multipath_hash_init_rand_seed(void)
+{
+ get_random_bytes(&proc_fib_multipath_hash_rand_seed,
+ sizeof(proc_fib_multipath_hash_rand_seed));
+}
+
+static void proc_fib_multipath_hash_set_seed(struct net *net, u32 user_seed)
+{
+ struct sysctl_fib_multipath_hash_seed new = {
+ .user_seed = user_seed,
+ .mp_seed = (user_seed ? user_seed :
+ proc_fib_multipath_hash_rand_seed),
+ };
+
+ WRITE_ONCE(net->ipv4.sysctl_fib_multipath_hash_seed, new);
+}
+
+static int proc_fib_multipath_hash_seed(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ struct sysctl_fib_multipath_hash_seed *mphs;
+ struct net *net = table->data;
+ struct ctl_table tmp;
+ u32 user_seed;
+ int ret;
+
+ mphs = &net->ipv4.sysctl_fib_multipath_hash_seed;
+ user_seed = mphs->user_seed;
+
+ tmp = *table;
+ tmp.data = &user_seed;
+
+ ret = proc_douintvec_minmax(&tmp, write, buffer, lenp, ppos);
+
+ if (write && ret == 0) {
+ proc_fib_multipath_hash_set_seed(net, user_seed);
+ call_netevent_notifiers(NETEVENT_IPV4_MPATH_HASH_UPDATE, net);
+ }
+
+ return ret;
+}
+#else
+
+static void proc_fib_multipath_hash_init_rand_seed(void)
+{
+}
+
+static void proc_fib_multipath_hash_set_seed(struct net *net, u32 user_seed)
+{
+}
+
+#endif
+
+static struct ctl_table ipv4_table[] = {
+ {
+ .procname = "tcp_max_orphans",
+ .data = &sysctl_tcp_max_orphans,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = proc_dointvec
},
- {
- .ctl_name = NET_IPV4_TCP_WINDOW_SCALING,
- .procname = "tcp_window_scaling",
- .data = &sysctl_tcp_window_scaling,
+ {
+ .procname = "inet_peer_threshold",
+ .data = &inet_peer_threshold,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = proc_dointvec
},
- {
- .ctl_name = NET_IPV4_TCP_SACK,
- .procname = "tcp_sack",
- .data = &sysctl_tcp_sack,
+ {
+ .procname = "inet_peer_minttl",
+ .data = &inet_peer_minttl,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = proc_dointvec_jiffies,
},
- {
- .ctl_name = NET_IPV4_TCP_RETRANS_COLLAPSE,
- .procname = "tcp_retrans_collapse",
- .data = &sysctl_tcp_retrans_collapse,
+ {
+ .procname = "inet_peer_maxttl",
+ .data = &inet_peer_maxttl,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = proc_dointvec_jiffies,
},
- {
- .ctl_name = NET_IPV4_FORWARD,
- .procname = "ip_forward",
- .data = &ipv4_devconf.forwarding,
- .maxlen = sizeof(int),
+ {
+ .procname = "tcp_mem",
+ .maxlen = sizeof(sysctl_tcp_mem),
+ .data = &sysctl_tcp_mem,
.mode = 0644,
- .proc_handler = &ipv4_sysctl_forward,
- .strategy = &ipv4_sysctl_forward_strategy
+ .proc_handler = proc_doulongvec_minmax,
},
- {
- .ctl_name = NET_IPV4_DEFAULT_TTL,
- .procname = "ip_default_ttl",
- .data = &sysctl_ip_default_ttl,
+ {
+ .procname = "tcp_low_latency",
+ .data = &sysctl_tcp_low_latency,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &ipv4_doint_and_flush,
- .strategy = &ipv4_doint_and_flush_strategy,
+ .proc_handler = proc_dointvec
},
- {
- .ctl_name = NET_IPV4_NO_PMTU_DISC,
- .procname = "ip_no_pmtu_disc",
- .data = &ipv4_config.no_pmtu_disc,
+#ifdef CONFIG_NETLABEL
+ {
+ .procname = "cipso_cache_enable",
+ .data = &cipso_v4_cache_enabled,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = proc_dointvec,
},
{
- .ctl_name = NET_IPV4_NONLOCAL_BIND,
- .procname = "ip_nonlocal_bind",
- .data = &sysctl_ip_nonlocal_bind,
+ .procname = "cipso_cache_bucket_size",
+ .data = &cipso_v4_cache_bucketsize,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = proc_dointvec,
},
{
- .ctl_name = NET_IPV4_TCP_SYN_RETRIES,
- .procname = "tcp_syn_retries",
- .data = &sysctl_tcp_syn_retries,
+ .procname = "cipso_rbm_optfmt",
+ .data = &cipso_v4_rbm_optfmt,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = proc_dointvec,
},
{
- .ctl_name = NET_TCP_SYNACK_RETRIES,
- .procname = "tcp_synack_retries",
- .data = &sysctl_tcp_synack_retries,
+ .procname = "cipso_rbm_strictvalid",
+ .data = &cipso_v4_rbm_strictvalid,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = proc_dointvec,
+ },
+#endif /* CONFIG_NETLABEL */
+ {
+ .procname = "tcp_available_ulp",
+ .maxlen = TCP_ULP_BUF_MAX,
+ .mode = 0444,
+ .proc_handler = proc_tcp_available_ulp,
+ },
+ {
+ .procname = "udp_mem",
+ .data = &sysctl_udp_mem,
+ .maxlen = sizeof(sysctl_udp_mem),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
},
{
- .ctl_name = NET_TCP_MAX_ORPHANS,
- .procname = "tcp_max_orphans",
- .data = &sysctl_tcp_max_orphans,
- .maxlen = sizeof(int),
+ .procname = "fib_sync_mem",
+ .data = &sysctl_fib_sync_mem,
+ .maxlen = sizeof(sysctl_fib_sync_mem),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = proc_douintvec_minmax,
+ .extra1 = &sysctl_fib_sync_mem_min,
+ .extra2 = &sysctl_fib_sync_mem_max,
},
+};
+
+static struct ctl_table ipv4_net_table[] = {
{
- .ctl_name = NET_TCP_MAX_TW_BUCKETS,
.procname = "tcp_max_tw_buckets",
- .data = &tcp_death_row.sysctl_max_tw_buckets,
+ .data = &init_net.ipv4.tcp_death_row.sysctl_max_tw_buckets,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = proc_dointvec
},
{
- .ctl_name = NET_IPV4_IPFRAG_HIGH_THRESH,
- .procname = "ipfrag_high_thresh",
- .data = &sysctl_ipfrag_high_thresh,
- .maxlen = sizeof(int),
+ .procname = "icmp_echo_ignore_all",
+ .data = &init_net.ipv4.sysctl_icmp_echo_ignore_all,
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE
},
{
- .ctl_name = NET_IPV4_IPFRAG_LOW_THRESH,
- .procname = "ipfrag_low_thresh",
- .data = &sysctl_ipfrag_low_thresh,
- .maxlen = sizeof(int),
+ .procname = "icmp_echo_enable_probe",
+ .data = &init_net.ipv4.sysctl_icmp_echo_enable_probe,
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE
},
{
- .ctl_name = NET_IPV4_DYNADDR,
- .procname = "ip_dynaddr",
- .data = &sysctl_ip_dynaddr,
- .maxlen = sizeof(int),
+ .procname = "icmp_echo_ignore_broadcasts",
+ .data = &init_net.ipv4.sysctl_icmp_echo_ignore_broadcasts,
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE
},
{
- .ctl_name = NET_IPV4_IPFRAG_TIME,
- .procname = "ipfrag_time",
- .data = &sysctl_ipfrag_time,
- .maxlen = sizeof(int),
+ .procname = "icmp_ignore_bogus_error_responses",
+ .data = &init_net.ipv4.sysctl_icmp_ignore_bogus_error_responses,
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- .strategy = &sysctl_jiffies
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE
},
{
- .ctl_name = NET_IPV4_TCP_KEEPALIVE_TIME,
- .procname = "tcp_keepalive_time",
- .data = &sysctl_tcp_keepalive_time,
- .maxlen = sizeof(int),
+ .procname = "icmp_errors_use_inbound_ifaddr",
+ .data = &init_net.ipv4.sysctl_icmp_errors_use_inbound_ifaddr,
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- .strategy = &sysctl_jiffies
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE
},
{
- .ctl_name = NET_IPV4_TCP_KEEPALIVE_PROBES,
- .procname = "tcp_keepalive_probes",
- .data = &sysctl_tcp_keepalive_probes,
- .maxlen = sizeof(int),
+ .procname = "icmp_errors_extension_mask",
+ .data = &init_net.ipv4.sysctl_icmp_errors_extension_mask,
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = &icmp_errors_extension_mask_all,
},
{
- .ctl_name = NET_IPV4_TCP_KEEPALIVE_INTVL,
- .procname = "tcp_keepalive_intvl",
- .data = &sysctl_tcp_keepalive_intvl,
+ .procname = "icmp_ratelimit",
+ .data = &init_net.ipv4.sysctl_icmp_ratelimit,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- .strategy = &sysctl_jiffies
+ .proc_handler = proc_dointvec_ms_jiffies,
},
{
- .ctl_name = NET_IPV4_TCP_RETRIES1,
- .procname = "tcp_retries1",
- .data = &sysctl_tcp_retries1,
+ .procname = "icmp_ratemask",
+ .data = &init_net.ipv4.sysctl_icmp_ratemask,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec_minmax,
- .strategy = &sysctl_intvec,
- .extra2 = &tcp_retr1_max
+ .proc_handler = proc_dointvec
},
{
- .ctl_name = NET_IPV4_TCP_RETRIES2,
- .procname = "tcp_retries2",
- .data = &sysctl_tcp_retries2,
+ .procname = "icmp_msgs_per_sec",
+ .data = &init_net.ipv4.sysctl_icmp_msgs_per_sec,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
},
{
- .ctl_name = NET_IPV4_TCP_FIN_TIMEOUT,
- .procname = "tcp_fin_timeout",
- .data = &sysctl_tcp_fin_timeout,
+ .procname = "icmp_msgs_burst",
+ .data = &init_net.ipv4.sysctl_icmp_msgs_burst,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- .strategy = &sysctl_jiffies
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
},
-#ifdef CONFIG_SYN_COOKIES
{
- .ctl_name = NET_TCP_SYNCOOKIES,
- .procname = "tcp_syncookies",
- .data = &sysctl_tcp_syncookies,
- .maxlen = sizeof(int),
+ .procname = "ping_group_range",
+ .data = &init_net.ipv4.ping_group_range.range,
+ .maxlen = sizeof(gid_t)*2,
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = ipv4_ping_group_range,
+ },
+#ifdef CONFIG_NET_L3_MASTER_DEV
+ {
+ .procname = "raw_l3mdev_accept",
+ .data = &init_net.ipv4.sysctl_raw_l3mdev_accept,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
#endif
{
- .ctl_name = NET_TCP_TW_RECYCLE,
- .procname = "tcp_tw_recycle",
- .data = &tcp_death_row.sysctl_tw_recycle,
- .maxlen = sizeof(int),
+ .procname = "tcp_ecn",
+ .data = &init_net.ipv4.sysctl_tcp_ecn,
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = &tcp_ecn_mode_max,
},
{
- .ctl_name = NET_TCP_ABORT_ON_OVERFLOW,
- .procname = "tcp_abort_on_overflow",
- .data = &sysctl_tcp_abort_on_overflow,
- .maxlen = sizeof(int),
+ .procname = "tcp_ecn_option",
+ .data = &init_net.ipv4.sysctl_tcp_ecn_option,
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_TWO,
},
{
- .ctl_name = NET_TCP_STDURG,
- .procname = "tcp_stdurg",
- .data = &sysctl_tcp_stdurg,
- .maxlen = sizeof(int),
+ .procname = "tcp_ecn_option_beacon",
+ .data = &init_net.ipv4.sysctl_tcp_ecn_option_beacon,
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_THREE,
},
{
- .ctl_name = NET_TCP_RFC1337,
- .procname = "tcp_rfc1337",
- .data = &sysctl_tcp_rfc1337,
- .maxlen = sizeof(int),
+ .procname = "tcp_ecn_fallback",
+ .data = &init_net.ipv4.sysctl_tcp_ecn_fallback,
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
{
- .ctl_name = NET_TCP_MAX_SYN_BACKLOG,
- .procname = "tcp_max_syn_backlog",
- .data = &sysctl_max_syn_backlog,
- .maxlen = sizeof(int),
+ .procname = "ip_dynaddr",
+ .data = &init_net.ipv4.sysctl_ip_dynaddr,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ },
+ {
+ .procname = "ip_early_demux",
+ .data = &init_net.ipv4.sysctl_ip_early_demux,
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = proc_dou8vec_minmax,
+ },
+ {
+ .procname = "udp_early_demux",
+ .data = &init_net.ipv4.sysctl_udp_early_demux,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ },
+ {
+ .procname = "tcp_early_demux",
+ .data = &init_net.ipv4.sysctl_tcp_early_demux,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ },
+ {
+ .procname = "nexthop_compat_mode",
+ .data = &init_net.ipv4.sysctl_nexthop_compat_mode,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {
+ .procname = "ip_default_ttl",
+ .data = &init_net.ipv4.sysctl_ip_default_ttl,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = &ip_ttl_min,
+ .extra2 = &ip_ttl_max,
},
{
- .ctl_name = NET_IPV4_LOCAL_PORT_RANGE,
.procname = "ip_local_port_range",
- .data = &sysctl_local_port_range,
- .maxlen = sizeof(sysctl_local_port_range),
+ .maxlen = 0,
+ .data = &init_net,
.mode = 0644,
- .proc_handler = &proc_dointvec_minmax,
- .strategy = &sysctl_intvec,
- .extra1 = ip_local_port_range_min,
- .extra2 = ip_local_port_range_max
+ .proc_handler = ipv4_local_port_range,
},
{
- .ctl_name = NET_IPV4_ICMP_ECHO_IGNORE_ALL,
- .procname = "icmp_echo_ignore_all",
- .data = &sysctl_icmp_echo_ignore_all,
+ .procname = "ip_local_reserved_ports",
+ .data = &init_net.ipv4.sysctl_local_reserved_ports,
+ .maxlen = 65536,
+ .mode = 0644,
+ .proc_handler = proc_do_large_bitmap,
+ },
+ {
+ .procname = "ip_no_pmtu_disc",
+ .data = &init_net.ipv4.sysctl_ip_no_pmtu_disc,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ },
+ {
+ .procname = "ip_forward_use_pmtu",
+ .data = &init_net.ipv4.sysctl_ip_fwd_use_pmtu,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ },
+ {
+ .procname = "ip_forward_update_priority",
+ .data = &init_net.ipv4.sysctl_ip_fwd_update_priority,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = ipv4_fwd_update_priority,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {
+ .procname = "ip_nonlocal_bind",
+ .data = &init_net.ipv4.sysctl_ip_nonlocal_bind,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ },
+ {
+ .procname = "ip_autobind_reuse",
+ .data = &init_net.ipv4.sysctl_ip_autobind_reuse,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {
+ .procname = "fwmark_reflect",
+ .data = &init_net.ipv4.sysctl_fwmark_reflect,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ },
+ {
+ .procname = "tcp_fwmark_accept",
+ .data = &init_net.ipv4.sysctl_tcp_fwmark_accept,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ },
+#ifdef CONFIG_NET_L3_MASTER_DEV
+ {
+ .procname = "tcp_l3mdev_accept",
+ .data = &init_net.ipv4.sysctl_tcp_l3mdev_accept,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+#endif
+ {
+ .procname = "tcp_mtu_probing",
+ .data = &init_net.ipv4.sysctl_tcp_mtu_probing,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ },
+ {
+ .procname = "tcp_base_mss",
+ .data = &init_net.ipv4.sysctl_tcp_base_mss,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = proc_dointvec,
},
{
- .ctl_name = NET_IPV4_ICMP_ECHO_IGNORE_BROADCASTS,
- .procname = "icmp_echo_ignore_broadcasts",
- .data = &sysctl_icmp_echo_ignore_broadcasts,
+ .procname = "tcp_min_snd_mss",
+ .data = &init_net.ipv4.sysctl_tcp_min_snd_mss,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &tcp_min_snd_mss_min,
+ .extra2 = &tcp_min_snd_mss_max,
},
{
- .ctl_name = NET_IPV4_ICMP_IGNORE_BOGUS_ERROR_RESPONSES,
- .procname = "icmp_ignore_bogus_error_responses",
- .data = &sysctl_icmp_ignore_bogus_error_responses,
+ .procname = "tcp_mtu_probe_floor",
+ .data = &init_net.ipv4.sysctl_tcp_mtu_probe_floor,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &tcp_min_snd_mss_min,
+ .extra2 = &tcp_min_snd_mss_max,
},
{
- .ctl_name = NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR,
- .procname = "icmp_errors_use_inbound_ifaddr",
- .data = &sysctl_icmp_errors_use_inbound_ifaddr,
+ .procname = "tcp_probe_threshold",
+ .data = &init_net.ipv4.sysctl_tcp_probe_threshold,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = proc_dointvec,
},
{
- .ctl_name = NET_IPV4_ROUTE,
- .procname = "route",
- .maxlen = 0,
- .mode = 0555,
- .child = ipv4_route_table
+ .procname = "tcp_probe_interval",
+ .data = &init_net.ipv4.sysctl_tcp_probe_interval,
+ .maxlen = sizeof(u32),
+ .mode = 0644,
+ .proc_handler = proc_douintvec_minmax,
+ .extra2 = &u32_max_div_HZ,
+ },
+ {
+ .procname = "igmp_link_local_mcast_reports",
+ .data = &init_net.ipv4.sysctl_igmp_llm_reports,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
},
-#ifdef CONFIG_IP_MULTICAST
{
- .ctl_name = NET_IPV4_IGMP_MAX_MEMBERSHIPS,
.procname = "igmp_max_memberships",
- .data = &sysctl_igmp_max_memberships,
+ .data = &init_net.ipv4.sysctl_igmp_max_memberships,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = proc_dointvec
},
-
-#endif
{
- .ctl_name = NET_IPV4_IGMP_MAX_MSF,
.procname = "igmp_max_msf",
- .data = &sysctl_igmp_max_msf,
+ .data = &init_net.ipv4.sysctl_igmp_max_msf,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = proc_dointvec
},
+#ifdef CONFIG_IP_MULTICAST
{
- .ctl_name = NET_IPV4_INET_PEER_THRESHOLD,
- .procname = "inet_peer_threshold",
- .data = &inet_peer_threshold,
+ .procname = "igmp_qrv",
+ .data = &init_net.ipv4.sysctl_igmp_qrv,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ONE
},
+#endif
{
- .ctl_name = NET_IPV4_INET_PEER_MINTTL,
- .procname = "inet_peer_minttl",
- .data = &inet_peer_minttl,
- .maxlen = sizeof(int),
+ .procname = "tcp_congestion_control",
+ .data = &init_net.ipv4.tcp_congestion_control,
.mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- .strategy = &sysctl_jiffies
+ .maxlen = TCP_CA_NAME_MAX,
+ .proc_handler = proc_tcp_congestion_control,
},
{
- .ctl_name = NET_IPV4_INET_PEER_MAXTTL,
- .procname = "inet_peer_maxttl",
- .data = &inet_peer_maxttl,
- .maxlen = sizeof(int),
+ .procname = "tcp_available_congestion_control",
+ .maxlen = TCP_CA_BUF_MAX,
+ .mode = 0444,
+ .proc_handler = proc_tcp_available_congestion_control,
+ },
+ {
+ .procname = "tcp_allowed_congestion_control",
+ .maxlen = TCP_CA_BUF_MAX,
.mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- .strategy = &sysctl_jiffies
+ .proc_handler = proc_allowed_congestion_control,
},
{
- .ctl_name = NET_IPV4_INET_PEER_GC_MINTIME,
- .procname = "inet_peer_gc_mintime",
- .data = &inet_peer_gc_mintime,
+ .procname = "tcp_keepalive_time",
+ .data = &init_net.ipv4.sysctl_tcp_keepalive_time,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- .strategy = &sysctl_jiffies
+ .proc_handler = proc_dointvec_jiffies,
},
{
- .ctl_name = NET_IPV4_INET_PEER_GC_MAXTIME,
- .procname = "inet_peer_gc_maxtime",
- .data = &inet_peer_gc_maxtime,
- .maxlen = sizeof(int),
+ .procname = "tcp_keepalive_probes",
+ .data = &init_net.ipv4.sysctl_tcp_keepalive_probes,
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- .strategy = &sysctl_jiffies
+ .proc_handler = proc_dou8vec_minmax,
},
{
- .ctl_name = NET_TCP_ORPHAN_RETRIES,
- .procname = "tcp_orphan_retries",
- .data = &sysctl_tcp_orphan_retries,
+ .procname = "tcp_keepalive_intvl",
+ .data = &init_net.ipv4.sysctl_tcp_keepalive_intvl,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = proc_dointvec_jiffies,
},
{
- .ctl_name = NET_TCP_FACK,
- .procname = "tcp_fack",
- .data = &sysctl_tcp_fack,
- .maxlen = sizeof(int),
+ .procname = "tcp_syn_retries",
+ .data = &init_net.ipv4.sysctl_tcp_syn_retries,
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = &tcp_syn_retries_min,
+ .extra2 = &tcp_syn_retries_max
+ },
+ {
+ .procname = "tcp_synack_retries",
+ .data = &init_net.ipv4.sysctl_tcp_synack_retries,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ },
+#ifdef CONFIG_SYN_COOKIES
+ {
+ .procname = "tcp_syncookies",
+ .data = &init_net.ipv4.sysctl_tcp_syncookies,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ },
+#endif
+ {
+ .procname = "tcp_migrate_req",
+ .data = &init_net.ipv4.sysctl_tcp_migrate_req,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE
},
{
- .ctl_name = NET_TCP_REORDERING,
.procname = "tcp_reordering",
- .data = &sysctl_tcp_reordering,
+ .data = &init_net.ipv4.sysctl_tcp_reordering,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = proc_dointvec
},
{
- .ctl_name = NET_TCP_ECN,
- .procname = "tcp_ecn",
- .data = &sysctl_tcp_ecn,
- .maxlen = sizeof(int),
+ .procname = "tcp_retries1",
+ .data = &init_net.ipv4.sysctl_tcp_retries1,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra2 = &tcp_retr1_max
+ },
+ {
+ .procname = "tcp_retries2",
+ .data = &init_net.ipv4.sysctl_tcp_retries2,
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = proc_dou8vec_minmax,
},
{
- .ctl_name = NET_TCP_DSACK,
- .procname = "tcp_dsack",
- .data = &sysctl_tcp_dsack,
+ .procname = "tcp_orphan_retries",
+ .data = &init_net.ipv4.sysctl_tcp_orphan_retries,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ },
+ {
+ .procname = "tcp_fin_timeout",
+ .data = &init_net.ipv4.sysctl_tcp_fin_timeout,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = proc_dointvec_jiffies,
},
{
- .ctl_name = NET_TCP_MEM,
- .procname = "tcp_mem",
- .data = &sysctl_tcp_mem,
- .maxlen = sizeof(sysctl_tcp_mem),
+ .procname = "tcp_notsent_lowat",
+ .data = &init_net.ipv4.sysctl_tcp_notsent_lowat,
+ .maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = proc_douintvec,
},
{
- .ctl_name = NET_TCP_WMEM,
- .procname = "tcp_wmem",
- .data = &sysctl_tcp_wmem,
- .maxlen = sizeof(sysctl_tcp_wmem),
+ .procname = "tcp_tw_reuse",
+ .data = &init_net.ipv4.sysctl_tcp_tw_reuse,
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_TWO,
},
{
- .ctl_name = NET_TCP_RMEM,
- .procname = "tcp_rmem",
- .data = &sysctl_tcp_rmem,
- .maxlen = sizeof(sysctl_tcp_rmem),
+ .procname = "tcp_tw_reuse_delay",
+ .data = &init_net.ipv4.sysctl_tcp_tw_reuse_delay,
+ .maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = proc_douintvec_minmax,
+ .extra1 = SYSCTL_ONE,
+ .extra2 = &tcp_tw_reuse_delay_max,
},
{
- .ctl_name = NET_TCP_APP_WIN,
- .procname = "tcp_app_win",
- .data = &sysctl_tcp_app_win,
+ .procname = "tcp_max_syn_backlog",
+ .data = &init_net.ipv4.sysctl_max_syn_backlog,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = proc_dointvec
},
{
- .ctl_name = NET_TCP_ADV_WIN_SCALE,
- .procname = "tcp_adv_win_scale",
- .data = &sysctl_tcp_adv_win_scale,
+ .procname = "tcp_fastopen",
+ .data = &init_net.ipv4.sysctl_tcp_fastopen,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = proc_dointvec,
},
{
- .ctl_name = NET_IPV4_ICMP_RATELIMIT,
- .procname = "icmp_ratelimit",
- .data = &sysctl_icmp_ratelimit,
+ .procname = "tcp_fastopen_key",
+ .mode = 0600,
+ .data = &init_net.ipv4.sysctl_tcp_fastopen,
+ /* maxlen to print the list of keys in hex (*2), with dashes
+ * separating doublewords and a comma in between keys.
+ */
+ .maxlen = ((TCP_FASTOPEN_KEY_LENGTH *
+ 2 * TCP_FASTOPEN_KEY_MAX) +
+ (TCP_FASTOPEN_KEY_MAX * 5)),
+ .proc_handler = proc_tcp_fastopen_key,
+ },
+ {
+ .procname = "tcp_fastopen_blackhole_timeout_sec",
+ .data = &init_net.ipv4.sysctl_tcp_fastopen_blackhole_timeout,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = proc_tfo_blackhole_detect_timeout,
+ .extra1 = SYSCTL_ZERO,
},
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
{
- .ctl_name = NET_IPV4_ICMP_RATEMASK,
- .procname = "icmp_ratemask",
- .data = &sysctl_icmp_ratemask,
- .maxlen = sizeof(int),
+ .procname = "fib_multipath_use_neigh",
+ .data = &init_net.ipv4.sysctl_fib_multipath_use_neigh,
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
{
- .ctl_name = NET_TCP_TW_REUSE,
- .procname = "tcp_tw_reuse",
- .data = &sysctl_tcp_tw_reuse,
- .maxlen = sizeof(int),
+ .procname = "fib_multipath_hash_policy",
+ .data = &init_net.ipv4.sysctl_fib_multipath_hash_policy,
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = proc_fib_multipath_hash_policy,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_THREE,
},
{
- .ctl_name = NET_TCP_FRTO,
- .procname = "tcp_frto",
- .data = &sysctl_tcp_frto,
- .maxlen = sizeof(int),
+ .procname = "fib_multipath_hash_fields",
+ .data = &init_net.ipv4.sysctl_fib_multipath_hash_fields,
+ .maxlen = sizeof(u32),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = proc_fib_multipath_hash_fields,
+ .extra1 = SYSCTL_ONE,
+ .extra2 = &fib_multipath_hash_fields_all_mask,
},
{
- .ctl_name = NET_TCP_LOW_LATENCY,
- .procname = "tcp_low_latency",
- .data = &sysctl_tcp_low_latency,
+ .procname = "fib_multipath_hash_seed",
+ .data = &init_net,
+ .maxlen = sizeof(u32),
+ .mode = 0644,
+ .proc_handler = proc_fib_multipath_hash_seed,
+ },
+#endif
+ {
+ .procname = "ip_unprivileged_port_start",
.maxlen = sizeof(int),
+ .data = &init_net.ipv4.sysctl_ip_prot_sock,
+ .mode = 0644,
+ .proc_handler = ipv4_privileged_ports,
+ },
+#ifdef CONFIG_NET_L3_MASTER_DEV
+ {
+ .procname = "udp_l3mdev_accept",
+ .data = &init_net.ipv4.sysctl_udp_l3mdev_accept,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+#endif
+ {
+ .procname = "tcp_sack",
+ .data = &init_net.ipv4.sysctl_tcp_sack,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ },
+ {
+ .procname = "tcp_window_scaling",
+ .data = &init_net.ipv4.sysctl_tcp_window_scaling,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ },
+ {
+ .procname = "tcp_timestamps",
+ .data = &init_net.ipv4.sysctl_tcp_timestamps,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ },
+ {
+ .procname = "tcp_early_retrans",
+ .data = &init_net.ipv4.sysctl_tcp_early_retrans,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_FOUR,
+ },
+ {
+ .procname = "tcp_recovery",
+ .data = &init_net.ipv4.sysctl_tcp_recovery,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ },
+ {
+ .procname = "tcp_thin_linear_timeouts",
+ .data = &init_net.ipv4.sysctl_tcp_thin_linear_timeouts,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ },
+ {
+ .procname = "tcp_slow_start_after_idle",
+ .data = &init_net.ipv4.sysctl_tcp_slow_start_after_idle,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ },
+ {
+ .procname = "tcp_retrans_collapse",
+ .data = &init_net.ipv4.sysctl_tcp_retrans_collapse,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ },
+ {
+ .procname = "tcp_stdurg",
+ .data = &init_net.ipv4.sysctl_tcp_stdurg,
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = proc_dou8vec_minmax,
},
{
- .ctl_name = NET_IPV4_IPFRAG_SECRET_INTERVAL,
- .procname = "ipfrag_secret_interval",
- .data = &sysctl_ipfrag_secret_interval,
+ .procname = "tcp_rfc1337",
+ .data = &init_net.ipv4.sysctl_tcp_rfc1337,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ },
+ {
+ .procname = "tcp_abort_on_overflow",
+ .data = &init_net.ipv4.sysctl_tcp_abort_on_overflow,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ },
+ {
+ .procname = "tcp_fack",
+ .data = &init_net.ipv4.sysctl_tcp_fack,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ },
+ {
+ .procname = "tcp_max_reordering",
+ .data = &init_net.ipv4.sysctl_tcp_max_reordering,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- .strategy = &sysctl_jiffies
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_dsack",
+ .data = &init_net.ipv4.sysctl_tcp_dsack,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ },
+ {
+ .procname = "tcp_app_win",
+ .data = &init_net.ipv4.sysctl_tcp_app_win,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = &tcp_app_win_max,
},
{
- .ctl_name = NET_IPV4_IPFRAG_MAX_DIST,
- .procname = "ipfrag_max_dist",
- .data = &sysctl_ipfrag_max_dist,
+ .procname = "tcp_adv_win_scale",
+ .data = &init_net.ipv4.sysctl_tcp_adv_win_scale,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec_minmax,
- .extra1 = &zero
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &tcp_adv_win_scale_min,
+ .extra2 = &tcp_adv_win_scale_max,
+ },
+ {
+ .procname = "tcp_frto",
+ .data = &init_net.ipv4.sysctl_tcp_frto,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
},
{
- .ctl_name = NET_TCP_NO_METRICS_SAVE,
.procname = "tcp_no_metrics_save",
- .data = &sysctl_tcp_nometrics_save,
- .maxlen = sizeof(int),
+ .data = &init_net.ipv4.sysctl_tcp_nometrics_save,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ },
+ {
+ .procname = "tcp_no_ssthresh_metrics_save",
+ .data = &init_net.ipv4.sysctl_tcp_no_ssthresh_metrics_save,
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = &proc_dointvec,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
{
- .ctl_name = NET_TCP_MODERATE_RCVBUF,
.procname = "tcp_moderate_rcvbuf",
- .data = &sysctl_tcp_moderate_rcvbuf,
+ .data = &init_net.ipv4.sysctl_tcp_moderate_rcvbuf,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ },
+ {
+ .procname = "tcp_rcvbuf_low_rtt",
+ .data = &init_net.ipv4.sysctl_tcp_rcvbuf_low_rtt,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_INT_MAX,
},
{
- .ctl_name = NET_TCP_TSO_WIN_DIVISOR,
.procname = "tcp_tso_win_divisor",
- .data = &sysctl_tcp_tso_win_divisor,
- .maxlen = sizeof(int),
+ .data = &init_net.ipv4.sysctl_tcp_tso_win_divisor,
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = &proc_dointvec,
+ .proc_handler = proc_dou8vec_minmax,
},
{
- .ctl_name = NET_TCP_CONG_CONTROL,
- .procname = "tcp_congestion_control",
+ .procname = "tcp_workaround_signed_windows",
+ .data = &init_net.ipv4.sysctl_tcp_workaround_signed_windows,
+ .maxlen = sizeof(u8),
.mode = 0644,
- .maxlen = TCP_CA_NAME_MAX,
- .proc_handler = &proc_tcp_congestion_control,
- .strategy = &sysctl_tcp_congestion_control,
+ .proc_handler = proc_dou8vec_minmax,
},
{
- .ctl_name = NET_TCP_ABC,
- .procname = "tcp_abc",
- .data = &sysctl_tcp_abc,
+ .procname = "tcp_limit_output_bytes",
+ .data = &init_net.ipv4.sysctl_tcp_limit_output_bytes,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec,
+ .proc_handler = proc_dointvec
},
{
- .ctl_name = NET_TCP_MTU_PROBING,
- .procname = "tcp_mtu_probing",
- .data = &sysctl_tcp_mtu_probing,
+ .procname = "tcp_challenge_ack_limit",
+ .data = &init_net.ipv4.sysctl_tcp_challenge_ack_limit,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec,
+ .proc_handler = proc_dointvec
},
{
- .ctl_name = NET_TCP_BASE_MSS,
- .procname = "tcp_base_mss",
- .data = &sysctl_tcp_base_mss,
- .maxlen = sizeof(int),
+ .procname = "tcp_min_tso_segs",
+ .data = &init_net.ipv4.sysctl_tcp_min_tso_segs,
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = &proc_dointvec,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ONE,
},
- {
- .ctl_name = NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS,
- .procname = "tcp_workaround_signed_windows",
- .data = &sysctl_tcp_workaround_signed_windows,
- .maxlen = sizeof(int),
+ {
+ .procname = "tcp_tso_rtt_log",
+ .data = &init_net.ipv4.sysctl_tcp_tso_rtt_log,
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = proc_dou8vec_minmax,
},
-#ifdef CONFIG_NET_DMA
{
- .ctl_name = NET_TCP_DMA_COPYBREAK,
- .procname = "tcp_dma_copybreak",
- .data = &sysctl_tcp_dma_copybreak,
+ .procname = "tcp_min_rtt_wlen",
+ .data = &init_net.ipv4.sysctl_tcp_min_rtt_wlen,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = &one_day_secs
},
-#endif
{
- .ctl_name = NET_TCP_SLOW_START_AFTER_IDLE,
- .procname = "tcp_slow_start_after_idle",
- .data = &sysctl_tcp_slow_start_after_idle,
- .maxlen = sizeof(int),
+ .procname = "tcp_autocorking",
+ .data = &init_net.ipv4.sysctl_tcp_autocorking,
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
-#ifdef CONFIG_NETLABEL
{
- .ctl_name = NET_CIPSOV4_CACHE_ENABLE,
- .procname = "cipso_cache_enable",
- .data = &cipso_v4_cache_enabled,
+ .procname = "tcp_invalid_ratelimit",
+ .data = &init_net.ipv4.sysctl_tcp_invalid_ratelimit,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec,
+ .proc_handler = proc_dointvec_ms_jiffies,
},
{
- .ctl_name = NET_CIPSOV4_CACHE_BUCKET_SIZE,
- .procname = "cipso_cache_bucket_size",
- .data = &cipso_v4_cache_bucketsize,
+ .procname = "tcp_pacing_ss_ratio",
+ .data = &init_net.ipv4.sysctl_tcp_pacing_ss_ratio,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE_THOUSAND,
},
{
- .ctl_name = NET_CIPSOV4_RBM_OPTFMT,
- .procname = "cipso_rbm_optfmt",
- .data = &cipso_v4_rbm_optfmt,
+ .procname = "tcp_pacing_ca_ratio",
+ .data = &init_net.ipv4.sysctl_tcp_pacing_ca_ratio,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE_THOUSAND,
},
{
- .ctl_name = NET_CIPSOV4_RBM_STRICTVALID,
- .procname = "cipso_rbm_strictvalid",
- .data = &cipso_v4_rbm_strictvalid,
+ .procname = "tcp_wmem",
+ .data = &init_net.ipv4.sysctl_tcp_wmem,
+ .maxlen = sizeof(init_net.ipv4.sysctl_tcp_wmem),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ONE,
+ },
+ {
+ .procname = "tcp_rmem",
+ .data = &init_net.ipv4.sysctl_tcp_rmem,
+ .maxlen = sizeof(init_net.ipv4.sysctl_tcp_rmem),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ONE,
+ },
+ {
+ .procname = "tcp_comp_sack_delay_ns",
+ .data = &init_net.ipv4.sysctl_tcp_comp_sack_delay_ns,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ },
+ {
+ .procname = "tcp_comp_sack_rtt_percent",
+ .data = &init_net.ipv4.sysctl_tcp_comp_sack_rtt_percent,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ONE,
+ .extra2 = SYSCTL_ONE_THOUSAND,
},
-#endif /* CONFIG_NETLABEL */
{
- .ctl_name = NET_TCP_AVAIL_CONG_CONTROL,
- .procname = "tcp_available_congestion_control",
- .maxlen = TCP_CA_BUF_MAX,
+ .procname = "tcp_comp_sack_slack_ns",
+ .data = &init_net.ipv4.sysctl_tcp_comp_sack_slack_ns,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ },
+ {
+ .procname = "tcp_comp_sack_nr",
+ .data = &init_net.ipv4.sysctl_tcp_comp_sack_nr,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ },
+ {
+ .procname = "tcp_backlog_ack_defer",
+ .data = &init_net.ipv4.sysctl_tcp_backlog_ack_defer,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {
+ .procname = "tcp_reflect_tos",
+ .data = &init_net.ipv4.sysctl_tcp_reflect_tos,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {
+ .procname = "tcp_ehash_entries",
+ .data = &init_net.ipv4.sysctl_tcp_child_ehash_entries,
.mode = 0444,
- .proc_handler = &proc_tcp_available_congestion_control,
+ .proc_handler = proc_tcp_ehash_entries,
},
{
- .ctl_name = NET_TCP_ALLOWED_CONG_CONTROL,
- .procname = "tcp_allowed_congestion_control",
- .maxlen = TCP_CA_BUF_MAX,
+ .procname = "tcp_child_ehash_entries",
+ .data = &init_net.ipv4.sysctl_tcp_child_ehash_entries,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_douintvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = &tcp_child_ehash_entries_max,
+ },
+ {
+ .procname = "udp_hash_entries",
+ .data = &init_net.ipv4.sysctl_udp_child_hash_entries,
+ .mode = 0444,
+ .proc_handler = proc_udp_hash_entries,
+ },
+ {
+ .procname = "udp_child_hash_entries",
+ .data = &init_net.ipv4.sysctl_udp_child_hash_entries,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_douintvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = &udp_child_hash_entries_max,
+ },
+ {
+ .procname = "udp_rmem_min",
+ .data = &init_net.ipv4.sysctl_udp_rmem_min,
+ .maxlen = sizeof(init_net.ipv4.sysctl_udp_rmem_min),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ONE
+ },
+ {
+ .procname = "udp_wmem_min",
+ .data = &init_net.ipv4.sysctl_udp_wmem_min,
+ .maxlen = sizeof(init_net.ipv4.sysctl_udp_wmem_min),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ONE
+ },
+ {
+ .procname = "fib_notify_on_flag_change",
+ .data = &init_net.ipv4.sysctl_fib_notify_on_flag_change,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_TWO,
+ },
+ {
+ .procname = "tcp_plb_enabled",
+ .data = &init_net.ipv4.sysctl_tcp_plb_enabled,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {
+ .procname = "tcp_plb_idle_rehash_rounds",
+ .data = &init_net.ipv4.sysctl_tcp_plb_idle_rehash_rounds,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra2 = &tcp_plb_max_rounds,
+ },
+ {
+ .procname = "tcp_plb_rehash_rounds",
+ .data = &init_net.ipv4.sysctl_tcp_plb_rehash_rounds,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra2 = &tcp_plb_max_rounds,
+ },
+ {
+ .procname = "tcp_plb_suspend_rto_sec",
+ .data = &init_net.ipv4.sysctl_tcp_plb_suspend_rto_sec,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ },
+ {
+ .procname = "tcp_plb_cong_thresh",
+ .data = &init_net.ipv4.sysctl_tcp_plb_cong_thresh,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = &tcp_plb_max_cong_thresh,
+ },
+ {
+ .procname = "tcp_syn_linear_timeouts",
+ .data = &init_net.ipv4.sysctl_tcp_syn_linear_timeouts,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = &tcp_syn_linear_timeouts_max,
+ },
+ {
+ .procname = "tcp_shrink_window",
+ .data = &init_net.ipv4.sysctl_tcp_shrink_window,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {
+ .procname = "tcp_pingpong_thresh",
+ .data = &init_net.ipv4.sysctl_tcp_pingpong_thresh,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ONE,
+ },
+ {
+ .procname = "tcp_rto_min_us",
+ .data = &init_net.ipv4.sysctl_tcp_rto_min_us,
+ .maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_allowed_congestion_control,
- .strategy = &strategy_allowed_congestion_control,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ONE,
},
- { .ctl_name = 0 }
+ {
+ .procname = "tcp_rto_max_ms",
+ .data = &init_net.ipv4.sysctl_tcp_rto_max_ms,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ONE_THOUSAND,
+ .extra2 = &tcp_rto_max_max,
+ },
+};
+
+static __net_init int ipv4_sysctl_init_net(struct net *net)
+{
+ size_t table_size = ARRAY_SIZE(ipv4_net_table);
+ struct ctl_table *table;
+
+ table = ipv4_net_table;
+ if (!net_eq(net, &init_net)) {
+ int i;
+
+ table = kmemdup(table, sizeof(ipv4_net_table), GFP_KERNEL);
+ if (!table)
+ goto err_alloc;
+
+ for (i = 0; i < table_size; i++) {
+ if (table[i].data) {
+ /* Update the variables to point into
+ * the current struct net
+ */
+ table[i].data += (void *)net - (void *)&init_net;
+ } else {
+ /* Entries without data pointer are global;
+ * Make them read-only in non-init_net ns
+ */
+ table[i].mode &= ~0222;
+ }
+ }
+ }
+
+ net->ipv4.ipv4_hdr = register_net_sysctl_sz(net, "net/ipv4", table,
+ table_size);
+ if (!net->ipv4.ipv4_hdr)
+ goto err_reg;
+
+ net->ipv4.sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL);
+ if (!net->ipv4.sysctl_local_reserved_ports)
+ goto err_ports;
+
+ proc_fib_multipath_hash_set_seed(net, 0);
+
+ return 0;
+
+err_ports:
+ unregister_net_sysctl_table(net->ipv4.ipv4_hdr);
+err_reg:
+ if (!net_eq(net, &init_net))
+ kfree(table);
+err_alloc:
+ return -ENOMEM;
+}
+
+static __net_exit void ipv4_sysctl_exit_net(struct net *net)
+{
+ const struct ctl_table *table;
+
+ kfree(net->ipv4.sysctl_local_reserved_ports);
+ table = net->ipv4.ipv4_hdr->ctl_table_arg;
+ unregister_net_sysctl_table(net->ipv4.ipv4_hdr);
+ kfree(table);
+}
+
+static __net_initdata struct pernet_operations ipv4_sysctl_ops = {
+ .init = ipv4_sysctl_init_net,
+ .exit = ipv4_sysctl_exit_net,
};
-#endif /* CONFIG_SYSCTL */
+static __init int sysctl_ipv4_init(void)
+{
+ struct ctl_table_header *hdr;
+
+ hdr = register_net_sysctl(&init_net, "net/ipv4", ipv4_table);
+ if (!hdr)
+ return -ENOMEM;
+
+ proc_fib_multipath_hash_init_rand_seed();
+
+ if (register_pernet_subsys(&ipv4_sysctl_ops)) {
+ unregister_net_sysctl_table(hdr);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
-EXPORT_SYMBOL(ipv4_config);
+__initcall(sysctl_ipv4_init);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index a6b228914b8e..f035440c475a 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -5,8 +6,6 @@
*
* Implementation of the Transmission Control Protocol(TCP).
*
- * Version: $Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $
- *
* Authors: Ross Biro
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
* Mark Evans, <evansmp@uhura.aston.ac.uk>
@@ -207,11 +206,6 @@
* Hirokazu Takahashi : Use copy_from_user() instead of
* csum_and_copy_from_user() if possible.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or(at your option) any later version.
- *
* Description of States:
*
* TCP_SYN_SENT sent a connection request, waiting for ack
@@ -247,69 +241,288 @@
* TCP_CLOSE socket is finished
*/
+#define pr_fmt(fmt) "TCP: " fmt
+
+#include <crypto/md5.h>
+#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/poll.h>
+#include <linux/inet_diag.h>
#include <linux/init.h>
-#include <linux/smp_lock.h>
#include <linux/fs.h>
+#include <linux/skbuff.h>
+#include <linux/splice.h>
+#include <linux/net.h>
+#include <linux/socket.h>
#include <linux/random.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
+#include <linux/highmem.h>
#include <linux/cache.h>
#include <linux/err.h>
-#include <linux/crypto.h>
+#include <linux/time.h>
+#include <linux/slab.h>
+#include <linux/errqueue.h>
+#include <linux/static_key.h>
+#include <linux/btf.h>
#include <net/icmp.h>
+#include <net/inet_common.h>
+#include <net/inet_ecn.h>
#include <net/tcp.h>
+#include <net/tcp_ecn.h>
+#include <net/mptcp.h>
+#include <net/proto_memory.h>
#include <net/xfrm.h>
#include <net/ip.h>
-#include <net/netdma.h>
+#include <net/psp.h>
+#include <net/sock.h>
+#include <net/rstreason.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/ioctls.h>
+#include <net/busy_poll.h>
+#include <net/hotdata.h>
+#include <trace/events/tcp.h>
+#include <net/rps.h>
-int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
+#include "../core/devmem.h"
-DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics) __read_mostly;
+/* Track pending CMSGs. */
+enum {
+ TCP_CMSG_INQ = 1,
+ TCP_CMSG_TS = 2
+};
+
+DEFINE_PER_CPU(unsigned int, tcp_orphan_count);
+EXPORT_PER_CPU_SYMBOL_GPL(tcp_orphan_count);
-atomic_t tcp_orphan_count = ATOMIC_INIT(0);
+DEFINE_PER_CPU(u32, tcp_tw_isn);
+EXPORT_PER_CPU_SYMBOL_GPL(tcp_tw_isn);
-EXPORT_SYMBOL_GPL(tcp_orphan_count);
+long sysctl_tcp_mem[3] __read_mostly;
+EXPORT_IPV6_MOD(sysctl_tcp_mem);
-int sysctl_tcp_mem[3] __read_mostly;
-int sysctl_tcp_wmem[3] __read_mostly;
-int sysctl_tcp_rmem[3] __read_mostly;
+DEFINE_PER_CPU(int, tcp_memory_per_cpu_fw_alloc);
+EXPORT_PER_CPU_SYMBOL_GPL(tcp_memory_per_cpu_fw_alloc);
-EXPORT_SYMBOL(sysctl_tcp_mem);
-EXPORT_SYMBOL(sysctl_tcp_rmem);
-EXPORT_SYMBOL(sysctl_tcp_wmem);
+#if IS_ENABLED(CONFIG_SMC)
+DEFINE_STATIC_KEY_FALSE(tcp_have_smc);
+EXPORT_SYMBOL(tcp_have_smc);
+#endif
-atomic_t tcp_memory_allocated; /* Current allocated memory. */
-atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */
+/*
+ * Current number of TCP sockets.
+ */
+struct percpu_counter tcp_sockets_allocated ____cacheline_aligned_in_smp;
+EXPORT_IPV6_MOD(tcp_sockets_allocated);
-EXPORT_SYMBOL(tcp_memory_allocated);
-EXPORT_SYMBOL(tcp_sockets_allocated);
+/*
+ * TCP splice context
+ */
+struct tcp_splice_state {
+ struct pipe_inode_info *pipe;
+ size_t len;
+ unsigned int flags;
+};
/*
* Pressure flag: try to collapse.
* Technical note: it is used by multiple contexts non atomically.
- * All the sk_stream_mem_schedule() is of this nature: accounting
+ * All the __sk_mem_schedule() is of this nature: accounting
* is strict, actions are advisory and have some latency.
*/
-int tcp_memory_pressure;
+unsigned long tcp_memory_pressure __read_mostly;
+EXPORT_SYMBOL_GPL(tcp_memory_pressure);
+
+void tcp_enter_memory_pressure(struct sock *sk)
+{
+ unsigned long val;
+
+ if (READ_ONCE(tcp_memory_pressure))
+ return;
+ val = jiffies;
+
+ if (!val)
+ val--;
+ if (!cmpxchg(&tcp_memory_pressure, 0, val))
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES);
+}
+EXPORT_IPV6_MOD_GPL(tcp_enter_memory_pressure);
+
+void tcp_leave_memory_pressure(struct sock *sk)
+{
+ unsigned long val;
+
+ if (!READ_ONCE(tcp_memory_pressure))
+ return;
+ val = xchg(&tcp_memory_pressure, 0);
+ if (val)
+ NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURESCHRONO,
+ jiffies_to_msecs(jiffies - val));
+}
+EXPORT_IPV6_MOD_GPL(tcp_leave_memory_pressure);
+
+/* Convert seconds to retransmits based on initial and max timeout */
+static u8 secs_to_retrans(int seconds, int timeout, int rto_max)
+{
+ u8 res = 0;
+
+ if (seconds > 0) {
+ int period = timeout;
+
+ res = 1;
+ while (seconds > period && res < 255) {
+ res++;
+ timeout <<= 1;
+ if (timeout > rto_max)
+ timeout = rto_max;
+ period += timeout;
+ }
+ }
+ return res;
+}
+
+/* Convert retransmits to seconds based on initial and max timeout */
+static int retrans_to_secs(u8 retrans, int timeout, int rto_max)
+{
+ int period = 0;
+
+ if (retrans > 0) {
+ period = timeout;
+ while (--retrans) {
+ timeout <<= 1;
+ if (timeout > rto_max)
+ timeout = rto_max;
+ period += timeout;
+ }
+ }
+ return period;
+}
+
+static u64 tcp_compute_delivery_rate(const struct tcp_sock *tp)
+{
+ u32 rate = READ_ONCE(tp->rate_delivered);
+ u32 intv = READ_ONCE(tp->rate_interval_us);
+ u64 rate64 = 0;
+
+ if (rate && intv) {
+ rate64 = (u64)rate * tp->mss_cache * USEC_PER_SEC;
+ do_div(rate64, intv);
+ }
+ return rate64;
+}
+
+#ifdef CONFIG_TCP_MD5SIG
+void tcp_md5_destruct_sock(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (tp->md5sig_info) {
+
+ tcp_clear_md5_list(sk);
+ kfree(rcu_replace_pointer(tp->md5sig_info, NULL, 1));
+ static_branch_slow_dec_deferred(&tcp_md5_needed);
+ }
+}
+EXPORT_IPV6_MOD_GPL(tcp_md5_destruct_sock);
+#endif
+
+/* Address-family independent initialization for a tcp_sock.
+ *
+ * NOTE: A lot of things set to zero explicitly by call to
+ * sk_alloc() so need not be done here.
+ */
+void tcp_init_sock(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ int rto_min_us, rto_max_ms;
+
+ tp->out_of_order_queue = RB_ROOT;
+ sk->tcp_rtx_queue = RB_ROOT;
+ tcp_init_xmit_timers(sk);
+ INIT_LIST_HEAD(&tp->tsq_node);
+ INIT_LIST_HEAD(&tp->tsorted_sent_queue);
+
+ icsk->icsk_rto = TCP_TIMEOUT_INIT;
-EXPORT_SYMBOL(tcp_memory_pressure);
+ rto_max_ms = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rto_max_ms);
+ icsk->icsk_rto_max = msecs_to_jiffies(rto_max_ms);
+
+ rto_min_us = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rto_min_us);
+ icsk->icsk_rto_min = usecs_to_jiffies(rto_min_us);
+ icsk->icsk_delack_max = TCP_DELACK_MAX;
+ tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
+ minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U);
+
+ /* So many TCP implementations out there (incorrectly) count the
+ * initial SYN frame in their delayed-ACK and congestion control
+ * algorithms that we must have the following bandaid to talk
+ * efficiently to them. -DaveM
+ */
+ tcp_snd_cwnd_set(tp, TCP_INIT_CWND);
-void tcp_enter_memory_pressure(void)
+ /* There's a bubble in the pipe until at least the first ACK. */
+ tp->app_limited = ~0U;
+ tp->rate_app_limited = 1;
+
+ /* See draft-stevens-tcpca-spec-01 for discussion of the
+ * initialization of these values.
+ */
+ tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
+ tp->snd_cwnd_clamp = ~0;
+ tp->mss_cache = TCP_MSS_DEFAULT;
+
+ tp->reordering = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reordering);
+ tcp_assign_congestion_control(sk);
+
+ tp->tsoffset = 0;
+ tp->rack.reo_wnd_steps = 1;
+
+ sk->sk_write_space = sk_stream_write_space;
+ sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
+
+ icsk->icsk_sync_mss = tcp_sync_mss;
+
+ WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[1]));
+ WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[1]));
+ tcp_scaling_ratio_init(sk);
+
+ set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags);
+ sk_sockets_allocated_inc(sk);
+ xa_init_flags(&sk->sk_user_frags, XA_FLAGS_ALLOC1);
+}
+EXPORT_IPV6_MOD(tcp_init_sock);
+
+static void tcp_tx_timestamp(struct sock *sk, struct sockcm_cookie *sockc)
{
- if (!tcp_memory_pressure) {
- NET_INC_STATS(LINUX_MIB_TCPMEMORYPRESSURES);
- tcp_memory_pressure = 1;
+ struct sk_buff *skb = tcp_write_queue_tail(sk);
+ u32 tsflags = sockc->tsflags;
+
+ if (tsflags && skb) {
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
+
+ sock_tx_timestamp(sk, sockc, &shinfo->tx_flags);
+ if (tsflags & SOF_TIMESTAMPING_TX_ACK)
+ tcb->txstamp_ack |= TSTAMP_ACK_SK;
+ if (tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK)
+ shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1;
}
+
+ if (cgroup_bpf_enabled(CGROUP_SOCK_OPS) &&
+ SK_BPF_CB_FLAG_TEST(sk, SK_BPF_CB_TX_TIMESTAMPING) && skb)
+ bpf_skops_tx_timestamping(sk, skb, BPF_SOCK_OPS_TSTAMP_SENDMSG_CB);
}
-EXPORT_SYMBOL(tcp_enter_memory_pressure);
+static bool tcp_stream_is_readable(struct sock *sk, int target)
+{
+ if (tcp_epollin_ready(sk, target))
+ return true;
+ return sk_is_readable(sk);
+}
/*
* Wait for a TCP event.
@@ -318,120 +531,131 @@ EXPORT_SYMBOL(tcp_enter_memory_pressure);
* take care of normal races (between the test and the event) and we don't
* go look at any of the socket buffers directly.
*/
-unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
+__poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
{
- unsigned int mask;
+ __poll_t mask;
struct sock *sk = sock->sk;
- struct tcp_sock *tp = tcp_sk(sk);
+ const struct tcp_sock *tp = tcp_sk(sk);
+ u8 shutdown;
+ int state;
- poll_wait(file, sk->sk_sleep, wait);
- if (sk->sk_state == TCP_LISTEN)
+ sock_poll_wait(file, sock, wait);
+
+ state = inet_sk_state_load(sk);
+ if (state == TCP_LISTEN)
return inet_csk_listen_poll(sk);
/* Socket is not locked. We are protected from async events
- by poll logic and correct handling of state changes
- made by another threads is impossible in any case.
+ * by poll logic and correct handling of state changes
+ * made by other threads is impossible in any case.
*/
mask = 0;
- if (sk->sk_err)
- mask = POLLERR;
/*
- * POLLHUP is certainly not done right. But poll() doesn't
+ * EPOLLHUP is certainly not done right. But poll() doesn't
* have a notion of HUP in just one direction, and for a
* socket the read side is more interesting.
*
- * Some poll() documentation says that POLLHUP is incompatible
- * with the POLLOUT/POLLWR flags, so somebody should check this
+ * Some poll() documentation says that EPOLLHUP is incompatible
+ * with the EPOLLOUT/POLLWR flags, so somebody should check this
* all. But careful, it tends to be safer to return too many
* bits than too few, and you can easily break real applications
* if you don't tell them that something has hung up!
*
* Check-me.
*
- * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and
+ * Check number 1. EPOLLHUP is _UNMASKABLE_ event (see UNIX98 and
* our fs/select.c). It means that after we received EOF,
* poll always returns immediately, making impossible poll() on write()
- * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
+ * in state CLOSE_WAIT. One solution is evident --- to set EPOLLHUP
* if and only if shutdown has been made in both directions.
* Actually, it is interesting to look how Solaris and DUX
- * solve this dilemma. I would prefer, if PULLHUP were maskable,
+ * solve this dilemma. I would prefer, if EPOLLHUP were maskable,
* then we could set it on SND_SHUTDOWN. BTW examples given
* in Stevens' books assume exactly this behaviour, it explains
- * why PULLHUP is incompatible with POLLOUT. --ANK
+ * why EPOLLHUP is incompatible with EPOLLOUT. --ANK
*
* NOTE. Check for TCP_CLOSE is added. The goal is to prevent
* blocking on fresh not-connected or disconnected socket. --ANK
*/
- if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
- mask |= POLLHUP;
- if (sk->sk_shutdown & RCV_SHUTDOWN)
- mask |= POLLIN | POLLRDNORM | POLLRDHUP;
-
- /* Connected? */
- if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
- /* Potential race condition. If read of tp below will
- * escape above sk->sk_state, we can be illegally awaken
- * in SYN_* states. */
- if ((tp->rcv_nxt != tp->copied_seq) &&
- (tp->urg_seq != tp->copied_seq ||
- tp->rcv_nxt != tp->copied_seq + 1 ||
- sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
- mask |= POLLIN | POLLRDNORM;
-
- if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
- if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
- mask |= POLLOUT | POLLWRNORM;
+ shutdown = READ_ONCE(sk->sk_shutdown);
+ if (shutdown == SHUTDOWN_MASK || state == TCP_CLOSE)
+ mask |= EPOLLHUP;
+ if (shutdown & RCV_SHUTDOWN)
+ mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
+
+ /* Connected or passive Fast Open socket? */
+ if (state != TCP_SYN_SENT &&
+ (state != TCP_SYN_RECV || rcu_access_pointer(tp->fastopen_rsk))) {
+ int target = sock_rcvlowat(sk, 0, INT_MAX);
+ u16 urg_data = READ_ONCE(tp->urg_data);
+
+ if (unlikely(urg_data) &&
+ READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq) &&
+ !sock_flag(sk, SOCK_URGINLINE))
+ target++;
+
+ if (tcp_stream_is_readable(sk, target))
+ mask |= EPOLLIN | EPOLLRDNORM;
+
+ if (!(shutdown & SEND_SHUTDOWN)) {
+ if (__sk_stream_is_writeable(sk, 1)) {
+ mask |= EPOLLOUT | EPOLLWRNORM;
} else { /* send SIGIO later */
- set_bit(SOCK_ASYNC_NOSPACE,
- &sk->sk_socket->flags);
+ sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
/* Race breaker. If space is freed after
* wspace test but before the flags are set,
- * IO signal will be lost.
+ * IO signal will be lost. Memory barrier
+ * pairs with the input side.
*/
- if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
- mask |= POLLOUT | POLLWRNORM;
+ smp_mb__after_atomic();
+ if (__sk_stream_is_writeable(sk, 1))
+ mask |= EPOLLOUT | EPOLLWRNORM;
}
- }
-
- if (tp->urg_data & TCP_URG_VALID)
- mask |= POLLPRI;
+ } else
+ mask |= EPOLLOUT | EPOLLWRNORM;
+
+ if (urg_data & TCP_URG_VALID)
+ mask |= EPOLLPRI;
+ } else if (state == TCP_SYN_SENT &&
+ inet_test_bit(DEFER_CONNECT, sk)) {
+ /* Active TCP fastopen socket with defer_connect
+ * Return EPOLLOUT so application can call write()
+ * in order for kernel to generate SYN+data
+ */
+ mask |= EPOLLOUT | EPOLLWRNORM;
}
+ /* This barrier is coupled with smp_wmb() in tcp_done_with_error() */
+ smp_rmb();
+ if (READ_ONCE(sk->sk_err) ||
+ !skb_queue_empty_lockless(&sk->sk_error_queue))
+ mask |= EPOLLERR;
+
return mask;
}
+EXPORT_SYMBOL(tcp_poll);
-int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
+int tcp_ioctl(struct sock *sk, int cmd, int *karg)
{
struct tcp_sock *tp = tcp_sk(sk);
int answ;
+ bool slow;
switch (cmd) {
case SIOCINQ:
if (sk->sk_state == TCP_LISTEN)
return -EINVAL;
- lock_sock(sk);
- if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
- answ = 0;
- else if (sock_flag(sk, SOCK_URGINLINE) ||
- !tp->urg_data ||
- before(tp->urg_seq, tp->copied_seq) ||
- !before(tp->urg_seq, tp->rcv_nxt)) {
- answ = tp->rcv_nxt - tp->copied_seq;
-
- /* Subtract 1, if FIN is in queue. */
- if (answ && !skb_queue_empty(&sk->sk_receive_queue))
- answ -=
- ((struct sk_buff *)sk->sk_receive_queue.prev)->h.th->fin;
- } else
- answ = tp->urg_seq - tp->copied_seq;
- release_sock(sk);
+ slow = lock_sock_fast(sk);
+ answ = tcp_inq(sk);
+ unlock_sock_fast(sk, slow);
break;
case SIOCATMARK:
- answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
+ answ = READ_ONCE(tp->urg_data) &&
+ READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq);
break;
case SIOCOUTQ:
if (sk->sk_state == TCP_LISTEN)
@@ -440,450 +664,780 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
answ = 0;
else
- answ = tp->write_seq - tp->snd_una;
+ answ = READ_ONCE(tp->write_seq) - tp->snd_una;
+ break;
+ case SIOCOUTQNSD:
+ if (sk->sk_state == TCP_LISTEN)
+ return -EINVAL;
+
+ if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
+ answ = 0;
+ else
+ answ = READ_ONCE(tp->write_seq) -
+ READ_ONCE(tp->snd_nxt);
break;
default:
return -ENOIOCTLCMD;
- };
+ }
- return put_user(answ, (int __user *)arg);
+ *karg = answ;
+ return 0;
}
+EXPORT_IPV6_MOD(tcp_ioctl);
-static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
+void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
{
- TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
+ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
tp->pushed_seq = tp->write_seq;
}
-static inline int forced_push(struct tcp_sock *tp)
+static inline bool forced_push(const struct tcp_sock *tp)
{
return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
}
-static inline void skb_entail(struct sock *sk, struct tcp_sock *tp,
- struct sk_buff *skb)
+void tcp_skb_entail(struct sock *sk, struct sk_buff *skb)
{
+ struct tcp_sock *tp = tcp_sk(sk);
struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
- skb->csum = 0;
tcb->seq = tcb->end_seq = tp->write_seq;
- tcb->flags = TCPCB_FLAG_ACK;
- tcb->sacked = 0;
- skb_header_release(skb);
- __skb_queue_tail(&sk->sk_write_queue, skb);
- sk_charge_skb(sk, skb);
- if (!sk->sk_send_head)
- sk->sk_send_head = skb;
+ tcb->tcp_flags = TCPHDR_ACK;
+ __skb_header_release(skb);
+ psp_enqueue_set_decrypted(sk, skb);
+ tcp_add_write_queue_tail(sk, skb);
+ sk_wmem_queued_add(sk, skb->truesize);
+ sk_mem_charge(sk, skb->truesize);
if (tp->nonagle & TCP_NAGLE_PUSH)
- tp->nonagle &= ~TCP_NAGLE_PUSH;
+ tp->nonagle &= ~TCP_NAGLE_PUSH;
+
+ tcp_slow_start_after_idle_check(sk);
}
-static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
- struct sk_buff *skb)
+static inline void tcp_mark_urg(struct tcp_sock *tp, int flags)
{
- if (flags & MSG_OOB) {
- tp->urg_mode = 1;
+ if (flags & MSG_OOB)
tp->snd_up = tp->write_seq;
- TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
- }
}
-static inline void tcp_push(struct sock *sk, struct tcp_sock *tp, int flags,
- int mss_now, int nonagle)
+/* If a not yet filled skb is pushed, do not send it if
+ * we have data packets in Qdisc or NIC queues :
+ * Because TX completion will happen shortly, it gives a chance
+ * to coalesce future sendmsg() payload into this skb, without
+ * need for a timer, and with no latency trade off.
+ * As packets containing data payload have a bigger truesize
+ * than pure acks (dataless) packets, the last checks prevent
+ * autocorking if we only have an ACK in Qdisc/NIC queues,
+ * or if TX completion was delayed after we processed ACK packet.
+ */
+static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb,
+ int size_goal)
{
- if (sk->sk_send_head) {
- struct sk_buff *skb = sk->sk_write_queue.prev;
- if (!(flags & MSG_MORE) || forced_push(tp))
- tcp_mark_push(tp, skb);
- tcp_mark_urg(tp, flags, skb);
- __tcp_push_pending_frames(sk, tp, mss_now,
- (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
- }
+ return skb->len < size_goal &&
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_autocorking) &&
+ !tcp_rtx_queue_empty(sk) &&
+ refcount_read(&sk->sk_wmem_alloc) > skb->truesize &&
+ tcp_skb_can_collapse_to(skb);
}
-static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
- size_t psize, int flags)
+void tcp_push(struct sock *sk, int flags, int mss_now,
+ int nonagle, int size_goal)
{
struct tcp_sock *tp = tcp_sk(sk);
- int mss_now, size_goal;
- int err;
- ssize_t copied;
- long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
+ struct sk_buff *skb;
- /* Wait for a connection to finish. */
- if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
- if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
- goto out_err;
+ skb = tcp_write_queue_tail(sk);
+ if (!skb)
+ return;
+ if (!(flags & MSG_MORE) || forced_push(tp))
+ tcp_mark_push(tp, skb);
- clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+ tcp_mark_urg(tp, flags);
- mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
- size_goal = tp->xmit_size_goal;
- copied = 0;
+ if (tcp_should_autocork(sk, skb, size_goal)) {
- err = -EPIPE;
- if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
- goto do_error;
+ /* avoid atomic op if TSQ_THROTTLED bit is already set */
+ if (!test_bit(TSQ_THROTTLED, &sk->sk_tsq_flags)) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING);
+ set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
+ smp_mb__after_atomic();
+ }
+ /* It is possible TX completion already happened
+ * before we set TSQ_THROTTLED.
+ */
+ if (refcount_read(&sk->sk_wmem_alloc) > skb->truesize)
+ return;
+ }
- while (psize > 0) {
- struct sk_buff *skb = sk->sk_write_queue.prev;
- struct page *page = pages[poffset / PAGE_SIZE];
- int copy, i, can_coalesce;
- int offset = poffset % PAGE_SIZE;
- int size = min_t(size_t, psize, PAGE_SIZE - offset);
+ if (flags & MSG_MORE)
+ nonagle = TCP_NAGLE_CORK;
- if (!sk->sk_send_head || (copy = size_goal - skb->len) <= 0) {
-new_segment:
- if (!sk_stream_memory_free(sk))
- goto wait_for_sndbuf;
+ __tcp_push_pending_frames(sk, mss_now, nonagle);
+}
- skb = sk_stream_alloc_pskb(sk, 0, 0,
- sk->sk_allocation);
- if (!skb)
- goto wait_for_memory;
+static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
+ unsigned int offset, size_t len)
+{
+ struct tcp_splice_state *tss = rd_desc->arg.data;
+ int ret;
+
+ ret = skb_splice_bits(skb, skb->sk, offset, tss->pipe,
+ min(rd_desc->count, len), tss->flags);
+ if (ret > 0)
+ rd_desc->count -= ret;
+ return ret;
+}
- skb_entail(sk, tp, skb);
- copy = size_goal;
- }
+static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss)
+{
+ /* Store TCP splice context information in read_descriptor_t. */
+ read_descriptor_t rd_desc = {
+ .arg.data = tss,
+ .count = tss->len,
+ };
- if (copy > size)
- copy = size;
+ return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv);
+}
- i = skb_shinfo(skb)->nr_frags;
- can_coalesce = skb_can_coalesce(skb, i, page, offset);
- if (!can_coalesce && i >= MAX_SKB_FRAGS) {
- tcp_mark_push(tp, skb);
- goto new_segment;
+/**
+ * tcp_splice_read - splice data from TCP socket to a pipe
+ * @sock: socket to splice from
+ * @ppos: position (not valid)
+ * @pipe: pipe to splice to
+ * @len: number of bytes to splice
+ * @flags: splice modifier flags
+ *
+ * Description:
+ * Will read pages from given socket and fill them into a pipe.
+ *
+ **/
+ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
+ struct pipe_inode_info *pipe, size_t len,
+ unsigned int flags)
+{
+ struct sock *sk = sock->sk;
+ struct tcp_splice_state tss = {
+ .pipe = pipe,
+ .len = len,
+ .flags = flags,
+ };
+ long timeo;
+ ssize_t spliced;
+ int ret;
+
+ sock_rps_record_flow(sk);
+ /*
+ * We can't seek on a socket input
+ */
+ if (unlikely(*ppos))
+ return -ESPIPE;
+
+ ret = spliced = 0;
+
+ lock_sock(sk);
+
+ timeo = sock_rcvtimeo(sk, sock->file->f_flags & O_NONBLOCK);
+ while (tss.len) {
+ ret = __tcp_splice_read(sk, &tss);
+ if (ret < 0)
+ break;
+ else if (!ret) {
+ if (spliced)
+ break;
+ if (sock_flag(sk, SOCK_DONE))
+ break;
+ if (sk->sk_err) {
+ ret = sock_error(sk);
+ break;
+ }
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ break;
+ if (sk->sk_state == TCP_CLOSE) {
+ /*
+ * This occurs when user tries to read
+ * from never connected socket.
+ */
+ ret = -ENOTCONN;
+ break;
+ }
+ if (!timeo) {
+ ret = -EAGAIN;
+ break;
+ }
+ /* if __tcp_splice_read() got nothing while we have
+ * an skb in receive queue, we do not want to loop.
+ * This might happen with URG data.
+ */
+ if (!skb_queue_empty(&sk->sk_receive_queue))
+ break;
+ ret = sk_wait_data(sk, &timeo, NULL);
+ if (ret < 0)
+ break;
+ if (signal_pending(current)) {
+ ret = sock_intr_errno(timeo);
+ break;
+ }
+ continue;
}
- if (!sk_stream_wmem_schedule(sk, copy))
- goto wait_for_memory;
-
- if (can_coalesce) {
- skb_shinfo(skb)->frags[i - 1].size += copy;
+ tss.len -= ret;
+ spliced += ret;
+
+ if (!tss.len || !timeo)
+ break;
+ release_sock(sk);
+ lock_sock(sk);
+
+ if (sk->sk_err || sk->sk_state == TCP_CLOSE ||
+ (sk->sk_shutdown & RCV_SHUTDOWN) ||
+ signal_pending(current))
+ break;
+ }
+
+ release_sock(sk);
+
+ if (spliced)
+ return spliced;
+
+ return ret;
+}
+EXPORT_IPV6_MOD(tcp_splice_read);
+
+struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, gfp_t gfp,
+ bool force_schedule)
+{
+ struct sk_buff *skb;
+
+ skb = alloc_skb_fclone(MAX_TCP_HEADER, gfp);
+ if (likely(skb)) {
+ bool mem_scheduled;
+
+ skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
+ if (force_schedule) {
+ mem_scheduled = true;
+ sk_forced_mem_schedule(sk, skb->truesize);
} else {
- get_page(page);
- skb_fill_page_desc(skb, i, page, offset, copy);
+ mem_scheduled = sk_wmem_schedule(sk, skb->truesize);
+ }
+ if (likely(mem_scheduled)) {
+ skb_reserve(skb, MAX_TCP_HEADER);
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
+ return skb;
}
+ __kfree_skb(skb);
+ } else {
+ if (!sk->sk_bypass_prot_mem)
+ tcp_enter_memory_pressure(sk);
+ sk_stream_moderate_sndbuf(sk);
+ }
+ return NULL;
+}
- skb->len += copy;
- skb->data_len += copy;
- skb->truesize += copy;
- sk->sk_wmem_queued += copy;
- sk->sk_forward_alloc -= copy;
- skb->ip_summed = CHECKSUM_PARTIAL;
- tp->write_seq += copy;
- TCP_SKB_CB(skb)->end_seq += copy;
- skb_shinfo(skb)->gso_segs = 0;
+static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
+ int large_allowed)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 new_size_goal, size_goal;
- if (!copied)
- TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
+ if (!large_allowed)
+ return mss_now;
- copied += copy;
- poffset += copy;
- if (!(psize -= copy))
- goto out;
+ /* Note : tcp_tso_autosize() will eventually split this later */
+ new_size_goal = tcp_bound_to_half_wnd(tp, sk->sk_gso_max_size);
- if (skb->len < mss_now || (flags & MSG_OOB))
- continue;
+ /* We try hard to avoid divides here */
+ size_goal = tp->gso_segs * mss_now;
+ if (unlikely(new_size_goal < size_goal ||
+ new_size_goal >= size_goal + mss_now)) {
+ tp->gso_segs = min_t(u16, new_size_goal / mss_now,
+ sk->sk_gso_max_segs);
+ size_goal = tp->gso_segs * mss_now;
+ }
- if (forced_push(tp)) {
- tcp_mark_push(tp, skb);
- __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
- } else if (skb == sk->sk_send_head)
- tcp_push_one(sk, mss_now);
- continue;
+ return max(size_goal, mss_now);
+}
-wait_for_sndbuf:
- set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
-wait_for_memory:
- if (copied)
- tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
+int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
+{
+ int mss_now;
- if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
- goto do_error;
+ mss_now = tcp_current_mss(sk);
+ *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
- mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
- size_goal = tp->xmit_size_goal;
- }
+ return mss_now;
+}
-out:
- if (copied)
- tcp_push(sk, tp, flags, mss_now, tp->nonagle);
- return copied;
+/* In some cases, sendmsg() could have added an skb to the write queue,
+ * but failed adding payload on it. We need to remove it to consume less
+ * memory, but more importantly be able to generate EPOLLOUT for Edge Trigger
+ * epoll() users. Another reason is that tcp_write_xmit() does not like
+ * finding an empty skb in the write queue.
+ */
+void tcp_remove_empty_skb(struct sock *sk)
+{
+ struct sk_buff *skb = tcp_write_queue_tail(sk);
-do_error:
- if (copied)
- goto out;
-out_err:
- return sk_stream_error(sk, flags, err);
+ if (skb && TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
+ tcp_unlink_write_queue(skb, sk);
+ if (tcp_write_queue_empty(sk))
+ tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
+ tcp_wmem_free_skb(sk, skb);
+ }
}
-ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
- size_t size, int flags)
+/* skb changing from pure zc to mixed, must charge zc */
+static int tcp_downgrade_zcopy_pure(struct sock *sk, struct sk_buff *skb)
{
- ssize_t res;
- struct sock *sk = sock->sk;
+ if (unlikely(skb_zcopy_pure(skb))) {
+ u32 extra = skb->truesize -
+ SKB_TRUESIZE(skb_end_offset(skb));
- if (!(sk->sk_route_caps & NETIF_F_SG) ||
- !(sk->sk_route_caps & NETIF_F_ALL_CSUM))
- return sock_no_sendpage(sock, page, offset, size, flags);
+ if (!sk_wmem_schedule(sk, extra))
+ return -ENOMEM;
- lock_sock(sk);
- TCP_CHECK_TIMER(sk);
- res = do_tcp_sendpages(sk, &page, offset, size, flags);
- TCP_CHECK_TIMER(sk);
- release_sock(sk);
- return res;
+ sk_mem_charge(sk, extra);
+ skb_shinfo(skb)->flags &= ~SKBFL_PURE_ZEROCOPY;
+ }
+ return 0;
}
-#define TCP_PAGE(sk) (sk->sk_sndmsg_page)
-#define TCP_OFF(sk) (sk->sk_sndmsg_off)
-static inline int select_size(struct sock *sk, struct tcp_sock *tp)
+int tcp_wmem_schedule(struct sock *sk, int copy)
{
- int tmp = tp->mss_cache;
+ int left;
- if (sk->sk_route_caps & NETIF_F_SG) {
- if (sk_can_gso(sk))
- tmp = 0;
- else {
- int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
+ if (likely(sk_wmem_schedule(sk, copy)))
+ return copy;
- if (tmp >= pgbreak &&
- tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
- tmp = pgbreak;
- }
+ /* We could be in trouble if we have nothing queued.
+ * Use whatever is left in sk->sk_forward_alloc and tcp_wmem[0]
+ * to guarantee some progress.
+ */
+ left = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[0]) - sk->sk_wmem_queued;
+ if (left > 0)
+ sk_forced_mem_schedule(sk, min(left, copy));
+ return min(copy, sk->sk_forward_alloc);
+}
+
+void tcp_free_fastopen_req(struct tcp_sock *tp)
+{
+ if (tp->fastopen_req) {
+ kfree(tp->fastopen_req);
+ tp->fastopen_req = NULL;
}
+}
- return tmp;
+int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied,
+ size_t size, struct ubuf_info *uarg)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_sock *inet = inet_sk(sk);
+ struct sockaddr *uaddr = msg->msg_name;
+ int err, flags;
+
+ if (!(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen) &
+ TFO_CLIENT_ENABLE) ||
+ (uaddr && msg->msg_namelen >= sizeof(uaddr->sa_family) &&
+ uaddr->sa_family == AF_UNSPEC))
+ return -EOPNOTSUPP;
+ if (tp->fastopen_req)
+ return -EALREADY; /* Another Fast Open is in progress */
+
+ tp->fastopen_req = kzalloc(sizeof(struct tcp_fastopen_request),
+ sk->sk_allocation);
+ if (unlikely(!tp->fastopen_req))
+ return -ENOBUFS;
+ tp->fastopen_req->data = msg;
+ tp->fastopen_req->size = size;
+ tp->fastopen_req->uarg = uarg;
+
+ if (inet_test_bit(DEFER_CONNECT, sk)) {
+ err = tcp_connect(sk);
+ /* Same failure procedure as in tcp_v4/6_connect */
+ if (err) {
+ tcp_set_state(sk, TCP_CLOSE);
+ inet->inet_dport = 0;
+ sk->sk_route_caps = 0;
+ }
+ }
+ flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0;
+ err = __inet_stream_connect(sk->sk_socket, (struct sockaddr_unsized *)uaddr,
+ msg->msg_namelen, flags, 1);
+ /* fastopen_req could already be freed in __inet_stream_connect
+ * if the connection times out or gets rst
+ */
+ if (tp->fastopen_req) {
+ *copied = tp->fastopen_req->copied;
+ tcp_free_fastopen_req(tp);
+ inet_clear_bit(DEFER_CONNECT, sk);
+ }
+ return err;
}
-int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
- size_t size)
+int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
{
- struct iovec *iov;
+ struct net_devmem_dmabuf_binding *binding = NULL;
struct tcp_sock *tp = tcp_sk(sk);
+ struct ubuf_info *uarg = NULL;
struct sk_buff *skb;
- int iovlen, flags;
- int mss_now, size_goal;
- int err, copied;
+ struct sockcm_cookie sockc;
+ int flags, err, copied = 0;
+ int mss_now = 0, size_goal, copied_syn = 0;
+ int process_backlog = 0;
+ int sockc_err = 0;
+ int zc = 0;
long timeo;
- lock_sock(sk);
- TCP_CHECK_TIMER(sk);
-
flags = msg->msg_flags;
+
+ sockc = (struct sockcm_cookie){ .tsflags = READ_ONCE(sk->sk_tsflags) };
+ if (msg->msg_controllen) {
+ sockc_err = sock_cmsg_send(sk, msg, &sockc);
+ /* Don't return error until MSG_FASTOPEN has been processed;
+ * that may succeed even if the cmsg is invalid.
+ */
+ }
+
+ if ((flags & MSG_ZEROCOPY) && size) {
+ if (msg->msg_ubuf) {
+ uarg = msg->msg_ubuf;
+ if (sk->sk_route_caps & NETIF_F_SG)
+ zc = MSG_ZEROCOPY;
+ } else if (sock_flag(sk, SOCK_ZEROCOPY)) {
+ skb = tcp_write_queue_tail(sk);
+ uarg = msg_zerocopy_realloc(sk, size, skb_zcopy(skb),
+ !sockc_err && sockc.dmabuf_id);
+ if (!uarg) {
+ err = -ENOBUFS;
+ goto out_err;
+ }
+ if (sk->sk_route_caps & NETIF_F_SG)
+ zc = MSG_ZEROCOPY;
+ else
+ uarg_to_msgzc(uarg)->zerocopy = 0;
+
+ if (!sockc_err && sockc.dmabuf_id) {
+ binding = net_devmem_get_binding(sk, sockc.dmabuf_id);
+ if (IS_ERR(binding)) {
+ err = PTR_ERR(binding);
+ binding = NULL;
+ goto out_err;
+ }
+ }
+ }
+ } else if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES) && size) {
+ if (sk->sk_route_caps & NETIF_F_SG)
+ zc = MSG_SPLICE_PAGES;
+ }
+
+ if (!sockc_err && sockc.dmabuf_id &&
+ (!(flags & MSG_ZEROCOPY) || !sock_flag(sk, SOCK_ZEROCOPY))) {
+ err = -EINVAL;
+ goto out_err;
+ }
+
+ if (unlikely(flags & MSG_FASTOPEN ||
+ inet_test_bit(DEFER_CONNECT, sk)) &&
+ !tp->repair) {
+ err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size, uarg);
+ if (err == -EINPROGRESS && copied_syn > 0)
+ goto out;
+ else if (err)
+ goto out_err;
+ }
+
timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
- /* Wait for a connection to finish. */
- if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
- if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
+ tcp_rate_check_app_limited(sk); /* is sending application-limited? */
+
+ /* Wait for a connection to finish. One exception is TCP Fast Open
+ * (passive side) where data is allowed to be sent before a connection
+ * is fully established.
+ */
+ if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
+ !tcp_passive_fastopen(sk)) {
+ err = sk_stream_wait_connect(sk, &timeo);
+ if (err != 0)
+ goto do_error;
+ }
+
+ if (unlikely(tp->repair)) {
+ if (tp->repair_queue == TCP_RECV_QUEUE) {
+ copied = tcp_send_rcvq(sk, msg, size);
+ goto out_nopush;
+ }
+
+ err = -EINVAL;
+ if (tp->repair_queue == TCP_NO_QUEUE)
goto out_err;
- /* This should be in poll */
- clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+ /* 'common' sending to sendq */
+ }
- mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
- size_goal = tp->xmit_size_goal;
+ if (sockc_err) {
+ err = sockc_err;
+ goto out_err;
+ }
+
+ /* This should be in poll */
+ sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
/* Ok commence sending. */
- iovlen = msg->msg_iovlen;
- iov = msg->msg_iov;
copied = 0;
+restart:
+ mss_now = tcp_send_mss(sk, &size_goal, flags);
+
err = -EPIPE;
if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
goto do_error;
- while (--iovlen >= 0) {
- int seglen = iov->iov_len;
- unsigned char __user *from = iov->iov_base;
+ while (msg_data_left(msg)) {
+ int copy = 0;
- iov++;
+ skb = tcp_write_queue_tail(sk);
+ if (skb)
+ copy = size_goal - skb->len;
- while (seglen > 0) {
- int copy;
+ trace_tcp_sendmsg_locked(sk, msg, skb, size_goal);
- skb = sk->sk_write_queue.prev;
-
- if (!sk->sk_send_head ||
- (copy = size_goal - skb->len) <= 0) {
+ if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) {
+ bool first_skb;
new_segment:
- /* Allocate new segment. If the interface is SG,
- * allocate skb fitting to single page.
- */
- if (!sk_stream_memory_free(sk))
- goto wait_for_sndbuf;
+ if (!sk_stream_memory_free(sk))
+ goto wait_for_space;
- skb = sk_stream_alloc_pskb(sk, select_size(sk, tp),
- 0, sk->sk_allocation);
- if (!skb)
- goto wait_for_memory;
+ if (unlikely(process_backlog >= 16)) {
+ process_backlog = 0;
+ if (sk_flush_backlog(sk))
+ goto restart;
+ }
+ first_skb = tcp_rtx_and_write_queues_empty(sk);
+ skb = tcp_stream_alloc_skb(sk, sk->sk_allocation,
+ first_skb);
+ if (!skb)
+ goto wait_for_space;
- /*
- * Check whether we can use HW checksum.
- */
- if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
- skb->ip_summed = CHECKSUM_PARTIAL;
+ process_backlog++;
- skb_entail(sk, tp, skb);
- copy = size_goal;
- }
+#ifdef CONFIG_SKB_DECRYPTED
+ skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED);
+#endif
+ tcp_skb_entail(sk, skb);
+ copy = size_goal;
- /* Try to append data to the end of skb. */
- if (copy > seglen)
- copy = seglen;
-
- /* Where to copy to? */
- if (skb_tailroom(skb) > 0) {
- /* We have some space in skb head. Superb! */
- if (copy > skb_tailroom(skb))
- copy = skb_tailroom(skb);
- if ((err = skb_add_data(skb, from, copy)) != 0)
- goto do_fault;
- } else {
- int merge = 0;
- int i = skb_shinfo(skb)->nr_frags;
- struct page *page = TCP_PAGE(sk);
- int off = TCP_OFF(sk);
-
- if (skb_can_coalesce(skb, i, page, off) &&
- off != PAGE_SIZE) {
- /* We can extend the last page
- * fragment. */
- merge = 1;
- } else if (i == MAX_SKB_FRAGS ||
- (!i &&
- !(sk->sk_route_caps & NETIF_F_SG))) {
- /* Need to add new fragment and cannot
- * do this because interface is non-SG,
- * or because all the page slots are
- * busy. */
- tcp_mark_push(tp, skb);
- goto new_segment;
- } else if (page) {
- if (off == PAGE_SIZE) {
- put_page(page);
- TCP_PAGE(sk) = page = NULL;
- off = 0;
- }
- } else
- off = 0;
+ /* All packets are restored as if they have
+ * already been sent. skb_mstamp_ns isn't set to
+ * avoid wrong rtt estimation.
+ */
+ if (tp->repair)
+ TCP_SKB_CB(skb)->sacked |= TCPCB_REPAIRED;
+ }
- if (copy > PAGE_SIZE - off)
- copy = PAGE_SIZE - off;
+ /* Try to append data to the end of skb. */
+ if (copy > msg_data_left(msg))
+ copy = msg_data_left(msg);
- if (!sk_stream_wmem_schedule(sk, copy))
- goto wait_for_memory;
+ if (zc == 0) {
+ bool merge = true;
+ int i = skb_shinfo(skb)->nr_frags;
+ struct page_frag *pfrag = sk_page_frag(sk);
- if (!page) {
- /* Allocate new cache page. */
- if (!(page = sk_stream_alloc_page(sk)))
- goto wait_for_memory;
- }
+ if (!sk_page_frag_refill(sk, pfrag))
+ goto wait_for_space;
- /* Time to copy data. We are close to
- * the end! */
- err = skb_copy_to_page(sk, from, skb, page,
- off, copy);
- if (err) {
- /* If this page was new, give it to the
- * socket so it does not get leaked.
- */
- if (!TCP_PAGE(sk)) {
- TCP_PAGE(sk) = page;
- TCP_OFF(sk) = 0;
- }
- goto do_error;
+ if (!skb_can_coalesce(skb, i, pfrag->page,
+ pfrag->offset)) {
+ if (i >= READ_ONCE(net_hotdata.sysctl_max_skb_frags)) {
+ tcp_mark_push(tp, skb);
+ goto new_segment;
}
+ merge = false;
+ }
- /* Update the skb. */
- if (merge) {
- skb_shinfo(skb)->frags[i - 1].size +=
- copy;
- } else {
- skb_fill_page_desc(skb, i, page, off, copy);
- if (TCP_PAGE(sk)) {
- get_page(page);
- } else if (off + copy < PAGE_SIZE) {
- get_page(page);
- TCP_PAGE(sk) = page;
- }
- }
+ copy = min_t(int, copy, pfrag->size - pfrag->offset);
- TCP_OFF(sk) = off + copy;
+ if (unlikely(skb_zcopy_pure(skb) || skb_zcopy_managed(skb))) {
+ if (tcp_downgrade_zcopy_pure(sk, skb))
+ goto wait_for_space;
+ skb_zcopy_downgrade_managed(skb);
}
- if (!copied)
- TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
+ copy = tcp_wmem_schedule(sk, copy);
+ if (!copy)
+ goto wait_for_space;
- tp->write_seq += copy;
- TCP_SKB_CB(skb)->end_seq += copy;
- skb_shinfo(skb)->gso_segs = 0;
+ err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb,
+ pfrag->page,
+ pfrag->offset,
+ copy);
+ if (err)
+ goto do_error;
- from += copy;
- copied += copy;
- if ((seglen -= copy) == 0 && iovlen == 0)
- goto out;
+ /* Update the skb. */
+ if (merge) {
+ skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
+ } else {
+ skb_fill_page_desc(skb, i, pfrag->page,
+ pfrag->offset, copy);
+ page_ref_inc(pfrag->page);
+ }
+ pfrag->offset += copy;
+ } else if (zc == MSG_ZEROCOPY) {
+ /* First append to a fragless skb builds initial
+ * pure zerocopy skb
+ */
+ if (!skb->len)
+ skb_shinfo(skb)->flags |= SKBFL_PURE_ZEROCOPY;
- if (skb->len < mss_now || (flags & MSG_OOB))
- continue;
+ if (!skb_zcopy_pure(skb)) {
+ copy = tcp_wmem_schedule(sk, copy);
+ if (!copy)
+ goto wait_for_space;
+ }
- if (forced_push(tp)) {
+ err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg,
+ binding);
+ if (err == -EMSGSIZE || err == -EEXIST) {
tcp_mark_push(tp, skb);
- __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
- } else if (skb == sk->sk_send_head)
- tcp_push_one(sk, mss_now);
- continue;
+ goto new_segment;
+ }
+ if (err < 0)
+ goto do_error;
+ copy = err;
+ } else if (zc == MSG_SPLICE_PAGES) {
+ /* Splice in data if we can; copy if we can't. */
+ if (tcp_downgrade_zcopy_pure(sk, skb))
+ goto wait_for_space;
+ copy = tcp_wmem_schedule(sk, copy);
+ if (!copy)
+ goto wait_for_space;
+
+ err = skb_splice_from_iter(skb, &msg->msg_iter, copy);
+ if (err < 0) {
+ if (err == -EMSGSIZE) {
+ tcp_mark_push(tp, skb);
+ goto new_segment;
+ }
+ goto do_error;
+ }
+ copy = err;
-wait_for_sndbuf:
- set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
-wait_for_memory:
- if (copied)
- tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
+ if (!(flags & MSG_NO_SHARED_FRAGS))
+ skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG;
- if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
- goto do_error;
+ sk_wmem_queued_add(sk, copy);
+ sk_mem_charge(sk, copy);
+ }
- mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
- size_goal = tp->xmit_size_goal;
+ if (!copied)
+ TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
+
+ WRITE_ONCE(tp->write_seq, tp->write_seq + copy);
+ TCP_SKB_CB(skb)->end_seq += copy;
+ tcp_skb_pcount_set(skb, 0);
+
+ copied += copy;
+ if (!msg_data_left(msg)) {
+ if (unlikely(flags & MSG_EOR))
+ TCP_SKB_CB(skb)->eor = 1;
+ goto out;
}
+
+ if (skb->len < size_goal || (flags & MSG_OOB) || unlikely(tp->repair))
+ continue;
+
+ if (forced_push(tp)) {
+ tcp_mark_push(tp, skb);
+ __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
+ } else if (skb == tcp_send_head(sk))
+ tcp_push_one(sk, mss_now);
+ continue;
+
+wait_for_space:
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+ tcp_remove_empty_skb(sk);
+ if (copied)
+ tcp_push(sk, flags & ~MSG_MORE, mss_now,
+ TCP_NAGLE_PUSH, size_goal);
+
+ err = sk_stream_wait_memory(sk, &timeo);
+ if (err != 0)
+ goto do_error;
+
+ mss_now = tcp_send_mss(sk, &size_goal, flags);
}
out:
- if (copied)
- tcp_push(sk, tp, flags, mss_now, tp->nonagle);
- TCP_CHECK_TIMER(sk);
- release_sock(sk);
- return copied;
-
-do_fault:
- if (!skb->len) {
- if (sk->sk_send_head == skb)
- sk->sk_send_head = NULL;
- __skb_unlink(skb, &sk->sk_write_queue);
- sk_stream_free_skb(sk, skb);
+ if (copied) {
+ tcp_tx_timestamp(sk, &sockc);
+ tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
}
+out_nopush:
+ /* msg->msg_ubuf is pinned by the caller so we don't take extra refs */
+ if (uarg && !msg->msg_ubuf)
+ net_zcopy_put(uarg);
+ if (binding)
+ net_devmem_dmabuf_binding_put(binding);
+ return copied + copied_syn;
do_error:
- if (copied)
+ tcp_remove_empty_skb(sk);
+
+ if (copied + copied_syn)
goto out;
out_err:
+ /* msg->msg_ubuf is pinned by the caller so we don't take extra refs */
+ if (uarg && !msg->msg_ubuf)
+ net_zcopy_put_abort(uarg, true);
err = sk_stream_error(sk, flags, err);
- TCP_CHECK_TIMER(sk);
- release_sock(sk);
+ /* make sure we wake any epoll edge trigger waiter */
+ if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) {
+ sk->sk_write_space(sk);
+ tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
+ }
+ if (binding)
+ net_devmem_dmabuf_binding_put(binding);
+
return err;
}
+EXPORT_SYMBOL_GPL(tcp_sendmsg_locked);
+
+int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
+{
+ int ret;
+
+ lock_sock(sk);
+ ret = tcp_sendmsg_locked(sk, msg, size);
+ release_sock(sk);
+
+ return ret;
+}
+EXPORT_SYMBOL(tcp_sendmsg);
+
+void tcp_splice_eof(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+ struct tcp_sock *tp = tcp_sk(sk);
+ int mss_now, size_goal;
+
+ if (!tcp_write_queue_tail(sk))
+ return;
+
+ lock_sock(sk);
+ mss_now = tcp_send_mss(sk, &size_goal, 0);
+ tcp_push(sk, 0, mss_now, tp->nonagle, size_goal);
+ release_sock(sk);
+}
+EXPORT_IPV6_MOD_GPL(tcp_splice_eof);
/*
* Handle reading urgent data. BSD has very simple semantics for
* this, no blocking and very strange errors 8)
*/
-static int tcp_recv_urg(struct sock *sk, long timeo,
- struct msghdr *msg, int len, int flags,
- int *addr_len)
+static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -900,14 +1454,14 @@ static int tcp_recv_urg(struct sock *sk, long timeo,
char c = tp->urg_data;
if (!(flags & MSG_PEEK))
- tp->urg_data = TCP_URG_READ;
+ WRITE_ONCE(tp->urg_data, TCP_URG_READ);
/* Read urgent data. */
msg->msg_flags |= MSG_OOB;
if (len > 0) {
if (!(flags & MSG_TRUNC))
- err = memcpy_toiovec(msg->msg_iov, &c, 1);
+ err = memcpy_to_msg(msg, &c, 1);
len = 1;
} else
msg->msg_flags |= MSG_TRUNC;
@@ -927,29 +1481,44 @@ static int tcp_recv_urg(struct sock *sk, long timeo,
return -EAGAIN;
}
+static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
+{
+ struct sk_buff *skb;
+ int copied = 0, err = 0;
+
+ skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
+ err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
+ if (err)
+ return err;
+ copied += skb->len;
+ }
+
+ skb_queue_walk(&sk->sk_write_queue, skb) {
+ err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
+ if (err)
+ break;
+
+ copied += skb->len;
+ }
+
+ return err ?: copied;
+}
+
/* Clean up the receive buffer for full frames taken by the user,
* then send an ACK if necessary. COPIED is the number of bytes
* tcp_recvmsg has given to the user so far, it speeds up the
* calculation of whether or not we must ACK for the sake of
* a window update.
*/
-void tcp_cleanup_rbuf(struct sock *sk, int copied)
+void __tcp_cleanup_rbuf(struct sock *sk, int copied)
{
struct tcp_sock *tp = tcp_sk(sk);
- int time_to_ack = 0;
-
-#if TCP_DEBUG
- struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
-
- BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
-#endif
+ bool time_to_ack = false;
if (inet_csk_ack_scheduled(sk)) {
const struct inet_connection_sock *icsk = inet_csk(sk);
- /* Delayed ACKs frequently hit locked sockets during bulk
- * receive. */
- if (icsk->icsk_ack.blocked ||
- /* Once-per-two-segments ACK was not sent by tcp_input.c */
+
+ if (/* Once-per-two-segments ACK was not sent by tcp_input.c */
tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
/*
* If this read emptied read buffer, we send ACK, if
@@ -960,9 +1529,9 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied)
(copied > 0 &&
((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
- !icsk->icsk_ack.pingpong)) &&
+ !inet_csk_in_pingpong_mode(sk))) &&
!atomic_read(&sk->sk_rmem_alloc)))
- time_to_ack = 1;
+ time_to_ack = true;
}
/* We send an ACK if we can now advertise a non-zero window
@@ -984,47 +1553,62 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied)
* "Lots" means "at least twice" here.
*/
if (new_window && new_window >= 2 * rcv_window_now)
- time_to_ack = 1;
+ time_to_ack = true;
}
}
- if (time_to_ack)
+ if (time_to_ack) {
+ tcp_mstamp_refresh(tp);
tcp_send_ack(sk);
+ }
}
-static void tcp_prequeue_process(struct sock *sk)
+void tcp_cleanup_rbuf(struct sock *sk, int copied)
{
- struct sk_buff *skb;
+ struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
struct tcp_sock *tp = tcp_sk(sk);
- NET_INC_STATS_USER(LINUX_MIB_TCPPREQUEUED);
-
- /* RX process wants to run with disabled BHs, though it is not
- * necessary */
- local_bh_disable();
- while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
- sk->sk_backlog_rcv(sk, skb);
- local_bh_enable();
+ WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
+ "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
+ tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);
+ __tcp_cleanup_rbuf(sk, copied);
+}
- /* Clear memory counter. */
- tp->ucopy.memory = 0;
+static void tcp_eat_recv_skb(struct sock *sk, struct sk_buff *skb)
+{
+ __skb_unlink(skb, &sk->sk_receive_queue);
+ if (likely(skb->destructor == sock_rfree)) {
+ sock_rfree(skb);
+ skb->destructor = NULL;
+ skb->sk = NULL;
+ return skb_attempt_defer_free(skb);
+ }
+ __kfree_skb(skb);
}
-static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
+struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
{
struct sk_buff *skb;
u32 offset;
- skb_queue_walk(&sk->sk_receive_queue, skb) {
+ while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
offset = seq - TCP_SKB_CB(skb)->seq;
- if (skb->h.th->syn)
+ if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
+ pr_err_once("%s: found a SYN, please report !\n", __func__);
offset--;
- if (offset < skb->len || skb->h.th->fin) {
+ }
+ if (offset < skb->len || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) {
*off = offset;
return skb;
}
+ /* This looks weird, but this can happen if TCP collapsing
+ * splitted a fat GRO packet, while we released socket lock
+ * in skb_splice_bits()
+ */
+ tcp_eat_recv_skb(sk, skb);
}
return NULL;
}
+EXPORT_SYMBOL(tcp_recv_skb);
/*
* This routine provides an alternative to tcp_recvmsg() for routines
@@ -1037,12 +1621,13 @@ static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
* or for 'peeking' the socket using this routine
* (although both would be easy to implement).
*/
-int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
- sk_read_actor_t recv_actor)
+static int __tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
+ sk_read_actor_t recv_actor, bool noack,
+ u32 *copied_seq)
{
struct sk_buff *skb;
struct tcp_sock *tp = tcp_sk(sk);
- u32 seq = tp->copied_seq;
+ u32 seq = *copied_seq;
u32 offset;
int copied = 0;
@@ -1050,11 +1635,12 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
return -ENOTCONN;
while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
if (offset < skb->len) {
- size_t used, len;
+ int used;
+ size_t len;
len = skb->len - offset;
/* Stop reading if we hit a patch of urgent data */
- if (tp->urg_data) {
+ if (unlikely(tp->urg_data)) {
u32 urg_offset = tp->urg_seq - seq;
if (urg_offset < len)
len = urg_offset;
@@ -1062,32 +1648,980 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
break;
}
used = recv_actor(desc, skb, offset, len);
- if (used <= len) {
- seq += used;
- copied += used;
- offset += used;
+ if (used <= 0) {
+ if (!copied)
+ copied = used;
+ break;
}
- if (offset != skb->len)
+ if (WARN_ON_ONCE(used > len))
+ used = len;
+ seq += used;
+ copied += used;
+ offset += used;
+
+ /* If recv_actor drops the lock (e.g. TCP splice
+ * receive) the skb pointer might be invalid when
+ * getting here: tcp_collapse might have deleted it
+ * while aggregating skbs from the socket queue.
+ */
+ skb = tcp_recv_skb(sk, seq - 1, &offset);
+ if (!skb)
break;
+ /* TCP coalescing might have appended data to the skb.
+ * Try to splice more frags
+ */
+ if (offset + 1 != skb->len)
+ continue;
}
- if (skb->h.th->fin) {
- sk_eat_skb(sk, skb, 0);
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
+ tcp_eat_recv_skb(sk, skb);
++seq;
break;
}
- sk_eat_skb(sk, skb, 0);
+ tcp_eat_recv_skb(sk, skb);
if (!desc->count)
break;
+ WRITE_ONCE(*copied_seq, seq);
}
- tp->copied_seq = seq;
+ WRITE_ONCE(*copied_seq, seq);
+
+ if (noack)
+ goto out;
tcp_rcv_space_adjust(sk);
/* Clean up data we have read: This will do ACK frames. */
- if (copied)
+ if (copied > 0) {
+ tcp_recv_skb(sk, seq, &offset);
tcp_cleanup_rbuf(sk, copied);
+ }
+out:
+ return copied;
+}
+
+int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
+ sk_read_actor_t recv_actor)
+{
+ return __tcp_read_sock(sk, desc, recv_actor, false,
+ &tcp_sk(sk)->copied_seq);
+}
+EXPORT_SYMBOL(tcp_read_sock);
+
+int tcp_read_sock_noack(struct sock *sk, read_descriptor_t *desc,
+ sk_read_actor_t recv_actor, bool noack,
+ u32 *copied_seq)
+{
+ return __tcp_read_sock(sk, desc, recv_actor, noack, copied_seq);
+}
+
+int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
+{
+ struct sk_buff *skb;
+ int copied = 0;
+
+ if (sk->sk_state == TCP_LISTEN)
+ return -ENOTCONN;
+
+ while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
+ u8 tcp_flags;
+ int used;
+
+ __skb_unlink(skb, &sk->sk_receive_queue);
+ WARN_ON_ONCE(!skb_set_owner_sk_safe(skb, sk));
+ tcp_flags = TCP_SKB_CB(skb)->tcp_flags;
+ used = recv_actor(sk, skb);
+ if (used < 0) {
+ if (!copied)
+ copied = used;
+ break;
+ }
+ copied += used;
+
+ if (tcp_flags & TCPHDR_FIN)
+ break;
+ }
return copied;
}
+EXPORT_IPV6_MOD(tcp_read_skb);
+
+void tcp_read_done(struct sock *sk, size_t len)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 seq = tp->copied_seq;
+ struct sk_buff *skb;
+ size_t left;
+ u32 offset;
+
+ if (sk->sk_state == TCP_LISTEN)
+ return;
+
+ left = len;
+ while (left && (skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
+ int used;
+
+ used = min_t(size_t, skb->len - offset, left);
+ seq += used;
+ left -= used;
+
+ if (skb->len > offset + used)
+ break;
+
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
+ tcp_eat_recv_skb(sk, skb);
+ ++seq;
+ break;
+ }
+ tcp_eat_recv_skb(sk, skb);
+ }
+ WRITE_ONCE(tp->copied_seq, seq);
+
+ tcp_rcv_space_adjust(sk);
+
+ /* Clean up data we have read: This will do ACK frames. */
+ if (left != len)
+ tcp_cleanup_rbuf(sk, len - left);
+}
+EXPORT_SYMBOL(tcp_read_done);
+
+int tcp_peek_len(struct socket *sock)
+{
+ return tcp_inq(sock->sk);
+}
+EXPORT_IPV6_MOD(tcp_peek_len);
+
+/* Make sure sk_rcvbuf is big enough to satisfy SO_RCVLOWAT hint */
+int tcp_set_rcvlowat(struct sock *sk, int val)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ int space, cap;
+
+ if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
+ cap = sk->sk_rcvbuf >> 1;
+ else
+ cap = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1;
+ val = min(val, cap);
+ WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
+
+ /* Check if we need to signal EPOLLIN right now */
+ tcp_data_ready(sk);
+
+ if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
+ return 0;
+
+ space = tcp_space_from_win(sk, val);
+ if (space > sk->sk_rcvbuf) {
+ WRITE_ONCE(sk->sk_rcvbuf, space);
+
+ if (tp->window_clamp && tp->window_clamp < val)
+ WRITE_ONCE(tp->window_clamp, val);
+ }
+ return 0;
+}
+EXPORT_IPV6_MOD(tcp_set_rcvlowat);
+
+void tcp_update_recv_tstamps(struct sk_buff *skb,
+ struct scm_timestamping_internal *tss)
+{
+ if (skb->tstamp)
+ tss->ts[0] = ktime_to_timespec64(skb->tstamp);
+ else
+ tss->ts[0] = (struct timespec64) {0};
+
+ if (skb_hwtstamps(skb)->hwtstamp)
+ tss->ts[2] = ktime_to_timespec64(skb_hwtstamps(skb)->hwtstamp);
+ else
+ tss->ts[2] = (struct timespec64) {0};
+}
+
+#ifdef CONFIG_MMU
+static const struct vm_operations_struct tcp_vm_ops = {
+};
+
+int tcp_mmap(struct file *file, struct socket *sock,
+ struct vm_area_struct *vma)
+{
+ if (vma->vm_flags & (VM_WRITE | VM_EXEC))
+ return -EPERM;
+ vm_flags_clear(vma, VM_MAYWRITE | VM_MAYEXEC);
+
+ /* Instruct vm_insert_page() to not mmap_read_lock(mm) */
+ vm_flags_set(vma, VM_MIXEDMAP);
+
+ vma->vm_ops = &tcp_vm_ops;
+ return 0;
+}
+EXPORT_IPV6_MOD(tcp_mmap);
+
+static skb_frag_t *skb_advance_to_frag(struct sk_buff *skb, u32 offset_skb,
+ u32 *offset_frag)
+{
+ skb_frag_t *frag;
+
+ if (unlikely(offset_skb >= skb->len))
+ return NULL;
+
+ offset_skb -= skb_headlen(skb);
+ if ((int)offset_skb < 0 || skb_has_frag_list(skb))
+ return NULL;
+
+ frag = skb_shinfo(skb)->frags;
+ while (offset_skb) {
+ if (skb_frag_size(frag) > offset_skb) {
+ *offset_frag = offset_skb;
+ return frag;
+ }
+ offset_skb -= skb_frag_size(frag);
+ ++frag;
+ }
+ *offset_frag = 0;
+ return frag;
+}
+
+static bool can_map_frag(const skb_frag_t *frag)
+{
+ struct page *page;
+
+ if (skb_frag_size(frag) != PAGE_SIZE || skb_frag_off(frag))
+ return false;
+
+ page = skb_frag_page(frag);
+
+ if (PageCompound(page) || page->mapping)
+ return false;
+
+ return true;
+}
+
+static int find_next_mappable_frag(const skb_frag_t *frag,
+ int remaining_in_skb)
+{
+ int offset = 0;
+
+ if (likely(can_map_frag(frag)))
+ return 0;
+
+ while (offset < remaining_in_skb && !can_map_frag(frag)) {
+ offset += skb_frag_size(frag);
+ ++frag;
+ }
+ return offset;
+}
+
+static void tcp_zerocopy_set_hint_for_skb(struct sock *sk,
+ struct tcp_zerocopy_receive *zc,
+ struct sk_buff *skb, u32 offset)
+{
+ u32 frag_offset, partial_frag_remainder = 0;
+ int mappable_offset;
+ skb_frag_t *frag;
+
+ /* worst case: skip to next skb. try to improve on this case below */
+ zc->recv_skip_hint = skb->len - offset;
+
+ /* Find the frag containing this offset (and how far into that frag) */
+ frag = skb_advance_to_frag(skb, offset, &frag_offset);
+ if (!frag)
+ return;
+
+ if (frag_offset) {
+ struct skb_shared_info *info = skb_shinfo(skb);
+
+ /* We read part of the last frag, must recvmsg() rest of skb. */
+ if (frag == &info->frags[info->nr_frags - 1])
+ return;
+
+ /* Else, we must at least read the remainder in this frag. */
+ partial_frag_remainder = skb_frag_size(frag) - frag_offset;
+ zc->recv_skip_hint -= partial_frag_remainder;
+ ++frag;
+ }
+
+ /* partial_frag_remainder: If part way through a frag, must read rest.
+ * mappable_offset: Bytes till next mappable frag, *not* counting bytes
+ * in partial_frag_remainder.
+ */
+ mappable_offset = find_next_mappable_frag(frag, zc->recv_skip_hint);
+ zc->recv_skip_hint = mappable_offset + partial_frag_remainder;
+}
+
+static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
+ int flags, struct scm_timestamping_internal *tss,
+ int *cmsg_flags);
+static int receive_fallback_to_copy(struct sock *sk,
+ struct tcp_zerocopy_receive *zc, int inq,
+ struct scm_timestamping_internal *tss)
+{
+ unsigned long copy_address = (unsigned long)zc->copybuf_address;
+ struct msghdr msg = {};
+ int err;
+
+ zc->length = 0;
+ zc->recv_skip_hint = 0;
+
+ if (copy_address != zc->copybuf_address)
+ return -EINVAL;
+
+ err = import_ubuf(ITER_DEST, (void __user *)copy_address, inq,
+ &msg.msg_iter);
+ if (err)
+ return err;
+
+ err = tcp_recvmsg_locked(sk, &msg, inq, MSG_DONTWAIT,
+ tss, &zc->msg_flags);
+ if (err < 0)
+ return err;
+
+ zc->copybuf_len = err;
+ if (likely(zc->copybuf_len)) {
+ struct sk_buff *skb;
+ u32 offset;
+
+ skb = tcp_recv_skb(sk, tcp_sk(sk)->copied_seq, &offset);
+ if (skb)
+ tcp_zerocopy_set_hint_for_skb(sk, zc, skb, offset);
+ }
+ return 0;
+}
+
+static int tcp_copy_straggler_data(struct tcp_zerocopy_receive *zc,
+ struct sk_buff *skb, u32 copylen,
+ u32 *offset, u32 *seq)
+{
+ unsigned long copy_address = (unsigned long)zc->copybuf_address;
+ struct msghdr msg = {};
+ int err;
+
+ if (copy_address != zc->copybuf_address)
+ return -EINVAL;
+
+ err = import_ubuf(ITER_DEST, (void __user *)copy_address, copylen,
+ &msg.msg_iter);
+ if (err)
+ return err;
+ err = skb_copy_datagram_msg(skb, *offset, &msg, copylen);
+ if (err)
+ return err;
+ zc->recv_skip_hint -= copylen;
+ *offset += copylen;
+ *seq += copylen;
+ return (__s32)copylen;
+}
+
+static int tcp_zc_handle_leftover(struct tcp_zerocopy_receive *zc,
+ struct sock *sk,
+ struct sk_buff *skb,
+ u32 *seq,
+ s32 copybuf_len,
+ struct scm_timestamping_internal *tss)
+{
+ u32 offset, copylen = min_t(u32, copybuf_len, zc->recv_skip_hint);
+
+ if (!copylen)
+ return 0;
+ /* skb is null if inq < PAGE_SIZE. */
+ if (skb) {
+ offset = *seq - TCP_SKB_CB(skb)->seq;
+ } else {
+ skb = tcp_recv_skb(sk, *seq, &offset);
+ if (TCP_SKB_CB(skb)->has_rxtstamp) {
+ tcp_update_recv_tstamps(skb, tss);
+ zc->msg_flags |= TCP_CMSG_TS;
+ }
+ }
+
+ zc->copybuf_len = tcp_copy_straggler_data(zc, skb, copylen, &offset,
+ seq);
+ return zc->copybuf_len < 0 ? 0 : copylen;
+}
+
+static int tcp_zerocopy_vm_insert_batch_error(struct vm_area_struct *vma,
+ struct page **pending_pages,
+ unsigned long pages_remaining,
+ unsigned long *address,
+ u32 *length,
+ u32 *seq,
+ struct tcp_zerocopy_receive *zc,
+ u32 total_bytes_to_map,
+ int err)
+{
+ /* At least one page did not map. Try zapping if we skipped earlier. */
+ if (err == -EBUSY &&
+ zc->flags & TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT) {
+ u32 maybe_zap_len;
+
+ maybe_zap_len = total_bytes_to_map - /* All bytes to map */
+ *length + /* Mapped or pending */
+ (pages_remaining * PAGE_SIZE); /* Failed map. */
+ zap_page_range_single(vma, *address, maybe_zap_len, NULL);
+ err = 0;
+ }
+
+ if (!err) {
+ unsigned long leftover_pages = pages_remaining;
+ int bytes_mapped;
+
+ /* We called zap_page_range_single, try to reinsert. */
+ err = vm_insert_pages(vma, *address,
+ pending_pages,
+ &pages_remaining);
+ bytes_mapped = PAGE_SIZE * (leftover_pages - pages_remaining);
+ *seq += bytes_mapped;
+ *address += bytes_mapped;
+ }
+ if (err) {
+ /* Either we were unable to zap, OR we zapped, retried an
+ * insert, and still had an issue. Either ways, pages_remaining
+ * is the number of pages we were unable to map, and we unroll
+ * some state we speculatively touched before.
+ */
+ const int bytes_not_mapped = PAGE_SIZE * pages_remaining;
+
+ *length -= bytes_not_mapped;
+ zc->recv_skip_hint += bytes_not_mapped;
+ }
+ return err;
+}
+
+static int tcp_zerocopy_vm_insert_batch(struct vm_area_struct *vma,
+ struct page **pages,
+ unsigned int pages_to_map,
+ unsigned long *address,
+ u32 *length,
+ u32 *seq,
+ struct tcp_zerocopy_receive *zc,
+ u32 total_bytes_to_map)
+{
+ unsigned long pages_remaining = pages_to_map;
+ unsigned int pages_mapped;
+ unsigned int bytes_mapped;
+ int err;
+
+ err = vm_insert_pages(vma, *address, pages, &pages_remaining);
+ pages_mapped = pages_to_map - (unsigned int)pages_remaining;
+ bytes_mapped = PAGE_SIZE * pages_mapped;
+ /* Even if vm_insert_pages fails, it may have partially succeeded in
+ * mapping (some but not all of the pages).
+ */
+ *seq += bytes_mapped;
+ *address += bytes_mapped;
+
+ if (likely(!err))
+ return 0;
+
+ /* Error: maybe zap and retry + rollback state for failed inserts. */
+ return tcp_zerocopy_vm_insert_batch_error(vma, pages + pages_mapped,
+ pages_remaining, address, length, seq, zc, total_bytes_to_map,
+ err);
+}
+
+#define TCP_VALID_ZC_MSG_FLAGS (TCP_CMSG_TS)
+static void tcp_zc_finalize_rx_tstamp(struct sock *sk,
+ struct tcp_zerocopy_receive *zc,
+ struct scm_timestamping_internal *tss)
+{
+ unsigned long msg_control_addr;
+ struct msghdr cmsg_dummy;
+
+ msg_control_addr = (unsigned long)zc->msg_control;
+ cmsg_dummy.msg_control_user = (void __user *)msg_control_addr;
+ cmsg_dummy.msg_controllen =
+ (__kernel_size_t)zc->msg_controllen;
+ cmsg_dummy.msg_flags = in_compat_syscall()
+ ? MSG_CMSG_COMPAT : 0;
+ cmsg_dummy.msg_control_is_user = true;
+ zc->msg_flags = 0;
+ if (zc->msg_control == msg_control_addr &&
+ zc->msg_controllen == cmsg_dummy.msg_controllen) {
+ tcp_recv_timestamp(&cmsg_dummy, sk, tss);
+ zc->msg_control = (__u64)
+ ((uintptr_t)cmsg_dummy.msg_control_user);
+ zc->msg_controllen =
+ (__u64)cmsg_dummy.msg_controllen;
+ zc->msg_flags = (__u32)cmsg_dummy.msg_flags;
+ }
+}
+
+static struct vm_area_struct *find_tcp_vma(struct mm_struct *mm,
+ unsigned long address,
+ bool *mmap_locked)
+{
+ struct vm_area_struct *vma = lock_vma_under_rcu(mm, address);
+
+ if (vma) {
+ if (vma->vm_ops != &tcp_vm_ops) {
+ vma_end_read(vma);
+ return NULL;
+ }
+ *mmap_locked = false;
+ return vma;
+ }
+
+ mmap_read_lock(mm);
+ vma = vma_lookup(mm, address);
+ if (!vma || vma->vm_ops != &tcp_vm_ops) {
+ mmap_read_unlock(mm);
+ return NULL;
+ }
+ *mmap_locked = true;
+ return vma;
+}
+
+#define TCP_ZEROCOPY_PAGE_BATCH_SIZE 32
+static int tcp_zerocopy_receive(struct sock *sk,
+ struct tcp_zerocopy_receive *zc,
+ struct scm_timestamping_internal *tss)
+{
+ u32 length = 0, offset, vma_len, avail_len, copylen = 0;
+ unsigned long address = (unsigned long)zc->address;
+ struct page *pages[TCP_ZEROCOPY_PAGE_BATCH_SIZE];
+ s32 copybuf_len = zc->copybuf_len;
+ struct tcp_sock *tp = tcp_sk(sk);
+ const skb_frag_t *frags = NULL;
+ unsigned int pages_to_map = 0;
+ struct vm_area_struct *vma;
+ struct sk_buff *skb = NULL;
+ u32 seq = tp->copied_seq;
+ u32 total_bytes_to_map;
+ int inq = tcp_inq(sk);
+ bool mmap_locked;
+ int ret;
+
+ zc->copybuf_len = 0;
+ zc->msg_flags = 0;
+
+ if (address & (PAGE_SIZE - 1) || address != zc->address)
+ return -EINVAL;
+
+ if (sk->sk_state == TCP_LISTEN)
+ return -ENOTCONN;
+
+ sock_rps_record_flow(sk);
+
+ if (inq && inq <= copybuf_len)
+ return receive_fallback_to_copy(sk, zc, inq, tss);
+
+ if (inq < PAGE_SIZE) {
+ zc->length = 0;
+ zc->recv_skip_hint = inq;
+ if (!inq && sock_flag(sk, SOCK_DONE))
+ return -EIO;
+ return 0;
+ }
+
+ vma = find_tcp_vma(current->mm, address, &mmap_locked);
+ if (!vma)
+ return -EINVAL;
+
+ vma_len = min_t(unsigned long, zc->length, vma->vm_end - address);
+ avail_len = min_t(u32, vma_len, inq);
+ total_bytes_to_map = avail_len & ~(PAGE_SIZE - 1);
+ if (total_bytes_to_map) {
+ if (!(zc->flags & TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT))
+ zap_page_range_single(vma, address, total_bytes_to_map,
+ NULL);
+ zc->length = total_bytes_to_map;
+ zc->recv_skip_hint = 0;
+ } else {
+ zc->length = avail_len;
+ zc->recv_skip_hint = avail_len;
+ }
+ ret = 0;
+ while (length + PAGE_SIZE <= zc->length) {
+ int mappable_offset;
+ struct page *page;
+
+ if (zc->recv_skip_hint < PAGE_SIZE) {
+ u32 offset_frag;
+
+ if (skb) {
+ if (zc->recv_skip_hint > 0)
+ break;
+ skb = skb->next;
+ offset = seq - TCP_SKB_CB(skb)->seq;
+ } else {
+ skb = tcp_recv_skb(sk, seq, &offset);
+ }
+
+ if (!skb_frags_readable(skb))
+ break;
+
+ if (TCP_SKB_CB(skb)->has_rxtstamp) {
+ tcp_update_recv_tstamps(skb, tss);
+ zc->msg_flags |= TCP_CMSG_TS;
+ }
+ zc->recv_skip_hint = skb->len - offset;
+ frags = skb_advance_to_frag(skb, offset, &offset_frag);
+ if (!frags || offset_frag)
+ break;
+ }
+
+ mappable_offset = find_next_mappable_frag(frags,
+ zc->recv_skip_hint);
+ if (mappable_offset) {
+ zc->recv_skip_hint = mappable_offset;
+ break;
+ }
+ page = skb_frag_page(frags);
+ if (WARN_ON_ONCE(!page))
+ break;
+
+ prefetchw(page);
+ pages[pages_to_map++] = page;
+ length += PAGE_SIZE;
+ zc->recv_skip_hint -= PAGE_SIZE;
+ frags++;
+ if (pages_to_map == TCP_ZEROCOPY_PAGE_BATCH_SIZE ||
+ zc->recv_skip_hint < PAGE_SIZE) {
+ /* Either full batch, or we're about to go to next skb
+ * (and we cannot unroll failed ops across skbs).
+ */
+ ret = tcp_zerocopy_vm_insert_batch(vma, pages,
+ pages_to_map,
+ &address, &length,
+ &seq, zc,
+ total_bytes_to_map);
+ if (ret)
+ goto out;
+ pages_to_map = 0;
+ }
+ }
+ if (pages_to_map) {
+ ret = tcp_zerocopy_vm_insert_batch(vma, pages, pages_to_map,
+ &address, &length, &seq,
+ zc, total_bytes_to_map);
+ }
+out:
+ if (mmap_locked)
+ mmap_read_unlock(current->mm);
+ else
+ vma_end_read(vma);
+ /* Try to copy straggler data. */
+ if (!ret)
+ copylen = tcp_zc_handle_leftover(zc, sk, skb, &seq, copybuf_len, tss);
+
+ if (length + copylen) {
+ WRITE_ONCE(tp->copied_seq, seq);
+ tcp_rcv_space_adjust(sk);
+
+ /* Clean up data we have read: This will do ACK frames. */
+ tcp_recv_skb(sk, seq, &offset);
+ tcp_cleanup_rbuf(sk, length + copylen);
+ ret = 0;
+ if (length == zc->length)
+ zc->recv_skip_hint = 0;
+ } else {
+ if (!zc->recv_skip_hint && sock_flag(sk, SOCK_DONE))
+ ret = -EIO;
+ }
+ zc->length = length;
+ return ret;
+}
+#endif
+
+/* Similar to __sock_recv_timestamp, but does not require an skb */
+void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
+ struct scm_timestamping_internal *tss)
+{
+ int new_tstamp = sock_flag(sk, SOCK_TSTAMP_NEW);
+ u32 tsflags = READ_ONCE(sk->sk_tsflags);
+ bool has_timestamping = false;
+
+ if (tss->ts[0].tv_sec || tss->ts[0].tv_nsec) {
+ if (sock_flag(sk, SOCK_RCVTSTAMP)) {
+ if (sock_flag(sk, SOCK_RCVTSTAMPNS)) {
+ if (new_tstamp) {
+ struct __kernel_timespec kts = {
+ .tv_sec = tss->ts[0].tv_sec,
+ .tv_nsec = tss->ts[0].tv_nsec,
+ };
+ put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_NEW,
+ sizeof(kts), &kts);
+ } else {
+ struct __kernel_old_timespec ts_old = {
+ .tv_sec = tss->ts[0].tv_sec,
+ .tv_nsec = tss->ts[0].tv_nsec,
+ };
+ put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_OLD,
+ sizeof(ts_old), &ts_old);
+ }
+ } else {
+ if (new_tstamp) {
+ struct __kernel_sock_timeval stv = {
+ .tv_sec = tss->ts[0].tv_sec,
+ .tv_usec = tss->ts[0].tv_nsec / 1000,
+ };
+ put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_NEW,
+ sizeof(stv), &stv);
+ } else {
+ struct __kernel_old_timeval tv = {
+ .tv_sec = tss->ts[0].tv_sec,
+ .tv_usec = tss->ts[0].tv_nsec / 1000,
+ };
+ put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD,
+ sizeof(tv), &tv);
+ }
+ }
+ }
+
+ if (tsflags & SOF_TIMESTAMPING_SOFTWARE &&
+ (tsflags & SOF_TIMESTAMPING_RX_SOFTWARE ||
+ !(tsflags & SOF_TIMESTAMPING_OPT_RX_FILTER)))
+ has_timestamping = true;
+ else
+ tss->ts[0] = (struct timespec64) {0};
+ }
+
+ if (tss->ts[2].tv_sec || tss->ts[2].tv_nsec) {
+ if (tsflags & SOF_TIMESTAMPING_RAW_HARDWARE &&
+ (tsflags & SOF_TIMESTAMPING_RX_HARDWARE ||
+ !(tsflags & SOF_TIMESTAMPING_OPT_RX_FILTER)))
+ has_timestamping = true;
+ else
+ tss->ts[2] = (struct timespec64) {0};
+ }
+
+ if (has_timestamping) {
+ tss->ts[1] = (struct timespec64) {0};
+ if (sock_flag(sk, SOCK_TSTAMP_NEW))
+ put_cmsg_scm_timestamping64(msg, tss);
+ else
+ put_cmsg_scm_timestamping(msg, tss);
+ }
+}
+
+static int tcp_inq_hint(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ u32 copied_seq = READ_ONCE(tp->copied_seq);
+ u32 rcv_nxt = READ_ONCE(tp->rcv_nxt);
+ int inq;
+
+ inq = rcv_nxt - copied_seq;
+ if (unlikely(inq < 0 || copied_seq != READ_ONCE(tp->copied_seq))) {
+ lock_sock(sk);
+ inq = tp->rcv_nxt - tp->copied_seq;
+ release_sock(sk);
+ }
+ /* After receiving a FIN, tell the user-space to continue reading
+ * by returning a non-zero inq.
+ */
+ if (inq == 0 && sock_flag(sk, SOCK_DONE))
+ inq = 1;
+ return inq;
+}
+
+/* batch __xa_alloc() calls and reduce xa_lock()/xa_unlock() overhead. */
+struct tcp_xa_pool {
+ u8 max; /* max <= MAX_SKB_FRAGS */
+ u8 idx; /* idx <= max */
+ __u32 tokens[MAX_SKB_FRAGS];
+ netmem_ref netmems[MAX_SKB_FRAGS];
+};
+
+static void tcp_xa_pool_commit_locked(struct sock *sk, struct tcp_xa_pool *p)
+{
+ int i;
+
+ /* Commit part that has been copied to user space. */
+ for (i = 0; i < p->idx; i++)
+ __xa_cmpxchg(&sk->sk_user_frags, p->tokens[i], XA_ZERO_ENTRY,
+ (__force void *)p->netmems[i], GFP_KERNEL);
+ /* Rollback what has been pre-allocated and is no longer needed. */
+ for (; i < p->max; i++)
+ __xa_erase(&sk->sk_user_frags, p->tokens[i]);
+
+ p->max = 0;
+ p->idx = 0;
+}
+
+static void tcp_xa_pool_commit(struct sock *sk, struct tcp_xa_pool *p)
+{
+ if (!p->max)
+ return;
+
+ xa_lock_bh(&sk->sk_user_frags);
+
+ tcp_xa_pool_commit_locked(sk, p);
+
+ xa_unlock_bh(&sk->sk_user_frags);
+}
+
+static int tcp_xa_pool_refill(struct sock *sk, struct tcp_xa_pool *p,
+ unsigned int max_frags)
+{
+ int err, k;
+
+ if (p->idx < p->max)
+ return 0;
+
+ xa_lock_bh(&sk->sk_user_frags);
+
+ tcp_xa_pool_commit_locked(sk, p);
+
+ for (k = 0; k < max_frags; k++) {
+ err = __xa_alloc(&sk->sk_user_frags, &p->tokens[k],
+ XA_ZERO_ENTRY, xa_limit_31b, GFP_KERNEL);
+ if (err)
+ break;
+ }
+
+ xa_unlock_bh(&sk->sk_user_frags);
+
+ p->max = k;
+ p->idx = 0;
+ return k ? 0 : err;
+}
+
+/* On error, returns the -errno. On success, returns number of bytes sent to the
+ * user. May not consume all of @remaining_len.
+ */
+static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb,
+ unsigned int offset, struct msghdr *msg,
+ int remaining_len)
+{
+ struct dmabuf_cmsg dmabuf_cmsg = { 0 };
+ struct tcp_xa_pool tcp_xa_pool;
+ unsigned int start;
+ int i, copy, n;
+ int sent = 0;
+ int err = 0;
+
+ tcp_xa_pool.max = 0;
+ tcp_xa_pool.idx = 0;
+ do {
+ start = skb_headlen(skb);
+
+ if (skb_frags_readable(skb)) {
+ err = -ENODEV;
+ goto out;
+ }
+
+ /* Copy header. */
+ copy = start - offset;
+ if (copy > 0) {
+ copy = min(copy, remaining_len);
+
+ n = copy_to_iter(skb->data + offset, copy,
+ &msg->msg_iter);
+ if (n != copy) {
+ err = -EFAULT;
+ goto out;
+ }
+
+ offset += copy;
+ remaining_len -= copy;
+
+ /* First a dmabuf_cmsg for # bytes copied to user
+ * buffer.
+ */
+ memset(&dmabuf_cmsg, 0, sizeof(dmabuf_cmsg));
+ dmabuf_cmsg.frag_size = copy;
+ err = put_cmsg_notrunc(msg, SOL_SOCKET,
+ SO_DEVMEM_LINEAR,
+ sizeof(dmabuf_cmsg),
+ &dmabuf_cmsg);
+ if (err)
+ goto out;
+
+ sent += copy;
+
+ if (remaining_len == 0)
+ goto out;
+ }
+
+ /* after that, send information of dmabuf pages through a
+ * sequence of cmsg
+ */
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+ struct net_iov *niov;
+ u64 frag_offset;
+ int end;
+
+ /* !skb_frags_readable() should indicate that ALL the
+ * frags in this skb are dmabuf net_iovs. We're checking
+ * for that flag above, but also check individual frags
+ * here. If the tcp stack is not setting
+ * skb_frags_readable() correctly, we still don't want
+ * to crash here.
+ */
+ if (!skb_frag_net_iov(frag)) {
+ net_err_ratelimited("Found non-dmabuf skb with net_iov");
+ err = -ENODEV;
+ goto out;
+ }
+
+ niov = skb_frag_net_iov(frag);
+ if (!net_is_devmem_iov(niov)) {
+ err = -ENODEV;
+ goto out;
+ }
+
+ end = start + skb_frag_size(frag);
+ copy = end - offset;
+
+ if (copy > 0) {
+ copy = min(copy, remaining_len);
+
+ frag_offset = net_iov_virtual_addr(niov) +
+ skb_frag_off(frag) + offset -
+ start;
+ dmabuf_cmsg.frag_offset = frag_offset;
+ dmabuf_cmsg.frag_size = copy;
+ err = tcp_xa_pool_refill(sk, &tcp_xa_pool,
+ skb_shinfo(skb)->nr_frags - i);
+ if (err)
+ goto out;
+
+ /* Will perform the exchange later */
+ dmabuf_cmsg.frag_token = tcp_xa_pool.tokens[tcp_xa_pool.idx];
+ dmabuf_cmsg.dmabuf_id = net_devmem_iov_binding_id(niov);
+
+ offset += copy;
+ remaining_len -= copy;
+
+ err = put_cmsg_notrunc(msg, SOL_SOCKET,
+ SO_DEVMEM_DMABUF,
+ sizeof(dmabuf_cmsg),
+ &dmabuf_cmsg);
+ if (err)
+ goto out;
+
+ atomic_long_inc(&niov->desc.pp_ref_count);
+ tcp_xa_pool.netmems[tcp_xa_pool.idx++] = skb_frag_netmem(frag);
+
+ sent += copy;
+
+ if (remaining_len == 0)
+ goto out;
+ }
+ start = end;
+ }
+
+ tcp_xa_pool_commit(sk, &tcp_xa_pool);
+ if (!remaining_len)
+ goto out;
+
+ /* if remaining_len is not satisfied yet, we need to go to the
+ * next frag in the frag_list to satisfy remaining_len.
+ */
+ skb = skb_shinfo(skb)->frag_list ?: skb->next;
+
+ offset = offset - start;
+ } while (skb);
+
+ if (remaining_len) {
+ err = -EFAULT;
+ goto out;
+ }
+
+out:
+ tcp_xa_pool_commit(sk, &tcp_xa_pool);
+ if (!sent)
+ sent = err;
+
+ return sent;
+}
/*
* This routine copies from a sock struct into the user buffer.
@@ -1097,10 +2631,12 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
* Probably, code can be easily improved even more.
*/
-int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
- size_t len, int nonblock, int flags, int *addr_len)
+static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
+ int flags, struct scm_timestamping_internal *tss,
+ int *cmsg_flags)
{
struct tcp_sock *tp = tcp_sk(sk);
+ int last_copied_dmabuf = -1; /* uninitialized */
int copied = 0;
u32 peek_seq;
u32 *seq;
@@ -1108,48 +2644,53 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
int err;
int target; /* Read at least this many bytes */
long timeo;
- struct task_struct *user_recv = NULL;
- int copied_early = 0;
-
- lock_sock(sk);
-
- TCP_CHECK_TIMER(sk);
+ struct sk_buff *skb, *last;
+ u32 peek_offset = 0;
+ u32 urg_hole = 0;
err = -ENOTCONN;
if (sk->sk_state == TCP_LISTEN)
goto out;
- timeo = sock_rcvtimeo(sk, nonblock);
+ if (tp->recvmsg_inq) {
+ *cmsg_flags = TCP_CMSG_INQ;
+ msg->msg_get_inq = 1;
+ }
+ timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
/* Urgent data needs to be handled specially. */
if (flags & MSG_OOB)
goto recv_urg;
+ if (unlikely(tp->repair)) {
+ err = -EPERM;
+ if (!(flags & MSG_PEEK))
+ goto out;
+
+ if (tp->repair_queue == TCP_SEND_QUEUE)
+ goto recv_sndq;
+
+ err = -EINVAL;
+ if (tp->repair_queue == TCP_NO_QUEUE)
+ goto out;
+
+ /* 'common' recv queue MSG_PEEK-ing */
+ }
+
seq = &tp->copied_seq;
if (flags & MSG_PEEK) {
- peek_seq = tp->copied_seq;
+ peek_offset = max(sk_peek_offset(sk, flags), 0);
+ peek_seq = tp->copied_seq + peek_offset;
seq = &peek_seq;
}
target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
-#ifdef CONFIG_NET_DMA
- tp->ucopy.dma_chan = NULL;
- preempt_disable();
- if ((len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) &&
- !sysctl_tcp_low_latency && __get_cpu_var(softnet_data).net_dma) {
- preempt_enable_no_resched();
- tp->ucopy.pinned_list = dma_pin_iovec_pages(msg->msg_iov, len);
- } else
- preempt_enable_no_resched();
-#endif
-
do {
- struct sk_buff *skb;
u32 offset;
/* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
- if (tp->urg_data && tp->urg_seq == *seq) {
+ if (unlikely(tp->urg_data) && tp->urg_seq == *seq) {
if (copied)
break;
if (signal_pending(current)) {
@@ -1160,42 +2701,43 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
/* Next get a buffer. */
- skb = skb_peek(&sk->sk_receive_queue);
- do {
- if (!skb)
- break;
-
+ last = skb_peek_tail(&sk->sk_receive_queue);
+ skb_queue_walk(&sk->sk_receive_queue, skb) {
+ last = skb;
/* Now that we have two receive queues this
* shouldn't happen.
*/
- if (before(*seq, TCP_SKB_CB(skb)->seq)) {
- printk(KERN_INFO "recvmsg bug: copied %X "
- "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
+ if (WARN(before(*seq, TCP_SKB_CB(skb)->seq),
+ "TCP recvmsg seq # bug: copied %X, seq %X, rcvnxt %X, fl %X\n",
+ *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt,
+ flags))
break;
- }
+
offset = *seq - TCP_SKB_CB(skb)->seq;
- if (skb->h.th->syn)
+ if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
+ pr_err_once("%s: found a SYN, please report !\n", __func__);
offset--;
+ }
if (offset < skb->len)
goto found_ok_skb;
- if (skb->h.th->fin)
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
goto found_fin_ok;
- BUG_TRAP(flags & MSG_PEEK);
- skb = skb->next;
- } while (skb != (struct sk_buff *)&sk->sk_receive_queue);
+ WARN(!(flags & MSG_PEEK),
+ "TCP recvmsg seq # bug 2: copied %X, seq %X, rcvnxt %X, fl %X\n",
+ *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags);
+ }
/* Well, if we have backlog, try to process it now yet. */
- if (copied >= target && !sk->sk_backlog.tail)
+ if (copied >= target && !READ_ONCE(sk->sk_backlog.tail))
break;
if (copied) {
- if (sk->sk_err ||
+ if (!timeo ||
+ sk->sk_err ||
sk->sk_state == TCP_CLOSE ||
(sk->sk_shutdown & RCV_SHUTDOWN) ||
- !timeo ||
- signal_pending(current) ||
- (flags & MSG_PEEK))
+ signal_pending(current))
break;
} else {
if (sock_flag(sk, SOCK_DONE))
@@ -1210,13 +2752,10 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
break;
if (sk->sk_state == TCP_CLOSE) {
- if (!sock_flag(sk, SOCK_DONE)) {
- /* This occurs when user tries to read
- * from never connected socket.
- */
- copied = -ENOTCONN;
- break;
- }
+ /* This occurs when user tries to read
+ * from never connected socket.
+ */
+ copied = -ENOTCONN;
break;
}
@@ -1231,108 +2770,41 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
}
}
- tcp_cleanup_rbuf(sk, copied);
-
- if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
- /* Install new reader */
- if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
- user_recv = current;
- tp->ucopy.task = user_recv;
- tp->ucopy.iov = msg->msg_iov;
- }
-
- tp->ucopy.len = len;
-
- BUG_TRAP(tp->copied_seq == tp->rcv_nxt ||
- (flags & (MSG_PEEK | MSG_TRUNC)));
-
- /* Ugly... If prequeue is not empty, we have to
- * process it before releasing socket, otherwise
- * order will be broken at second iteration.
- * More elegant solution is required!!!
- *
- * Look: we have the following (pseudo)queues:
- *
- * 1. packets in flight
- * 2. backlog
- * 3. prequeue
- * 4. receive_queue
- *
- * Each queue can be processed only if the next ones
- * are empty. At this point we have empty receive_queue.
- * But prequeue _can_ be not empty after 2nd iteration,
- * when we jumped to start of loop because backlog
- * processing added something to receive_queue.
- * We cannot release_sock(), because backlog contains
- * packets arrived _after_ prequeued ones.
- *
- * Shortly, algorithm is clear --- to process all
- * the queues in order. We could make it more directly,
- * requeueing packets from backlog to prequeue, if
- * is not empty. It is more elegant, but eats cycles,
- * unfortunately.
- */
- if (!skb_queue_empty(&tp->ucopy.prequeue))
- goto do_prequeue;
-
- /* __ Set realtime policy in scheduler __ */
- }
-
if (copied >= target) {
/* Do not sleep, just process backlog. */
- release_sock(sk);
- lock_sock(sk);
- } else
- sk_wait_data(sk, &timeo);
-
-#ifdef CONFIG_NET_DMA
- tp->ucopy.wakeup = 0;
-#endif
-
- if (user_recv) {
- int chunk;
-
- /* __ Restore normal policy in scheduler __ */
-
- if ((chunk = len - tp->ucopy.len) != 0) {
- NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
- len -= chunk;
- copied += chunk;
- }
-
- if (tp->rcv_nxt == tp->copied_seq &&
- !skb_queue_empty(&tp->ucopy.prequeue)) {
-do_prequeue:
- tcp_prequeue_process(sk);
-
- if ((chunk = len - tp->ucopy.len) != 0) {
- NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
- len -= chunk;
- copied += chunk;
- }
+ __sk_flush_backlog(sk);
+ } else {
+ tcp_cleanup_rbuf(sk, copied);
+ err = sk_wait_data(sk, &timeo, last);
+ if (err < 0) {
+ err = copied ? : err;
+ goto out;
}
}
- if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
- if (net_ratelimit())
- printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
- current->comm, current->pid);
- peek_seq = tp->copied_seq;
+
+ if ((flags & MSG_PEEK) &&
+ (peek_seq - peek_offset - copied - urg_hole != tp->copied_seq)) {
+ net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n",
+ current->comm,
+ task_pid_nr(current));
+ peek_seq = tp->copied_seq + peek_offset;
}
continue;
- found_ok_skb:
+found_ok_skb:
/* Ok so how much can we use? */
used = skb->len - offset;
if (len < used)
used = len;
/* Do we have urgent data here? */
- if (tp->urg_data) {
+ if (unlikely(tp->urg_data)) {
u32 urg_offset = tp->urg_seq - *seq;
if (urg_offset < used) {
if (!urg_offset) {
if (!sock_flag(sk, SOCK_URGINLINE)) {
- ++*seq;
+ WRITE_ONCE(*seq, *seq + 1);
+ urg_hole++;
offset++;
used--;
if (!used)
@@ -1344,143 +2816,203 @@ do_prequeue:
}
if (!(flags & MSG_TRUNC)) {
-#ifdef CONFIG_NET_DMA
- if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
- tp->ucopy.dma_chan = get_softnet_dma();
-
- if (tp->ucopy.dma_chan) {
- tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec(
- tp->ucopy.dma_chan, skb, offset,
- msg->msg_iov, used,
- tp->ucopy.pinned_list);
-
- if (tp->ucopy.dma_cookie < 0) {
-
- printk(KERN_ALERT "dma_cookie < 0\n");
+ if (last_copied_dmabuf != -1 &&
+ last_copied_dmabuf != !skb_frags_readable(skb))
+ break;
+ if (skb_frags_readable(skb)) {
+ err = skb_copy_datagram_msg(skb, offset, msg,
+ used);
+ if (err) {
/* Exception. Bailout! */
if (!copied)
copied = -EFAULT;
break;
}
- if ((offset + used) == skb->len)
- copied_early = 1;
-
- } else
-#endif
- {
- err = skb_copy_datagram_iovec(skb, offset,
- msg->msg_iov, used);
- if (err) {
- /* Exception. Bailout! */
+ } else {
+ if (!(flags & MSG_SOCK_DEVMEM)) {
+ /* dmabuf skbs can only be received
+ * with the MSG_SOCK_DEVMEM flag.
+ */
if (!copied)
copied = -EFAULT;
+
break;
}
+
+ err = tcp_recvmsg_dmabuf(sk, skb, offset, msg,
+ used);
+ if (err < 0) {
+ if (!copied)
+ copied = err;
+
+ break;
+ }
+ used = err;
}
}
- *seq += used;
+ last_copied_dmabuf = !skb_frags_readable(skb);
+
+ WRITE_ONCE(*seq, *seq + used);
copied += used;
len -= used;
-
+ if (flags & MSG_PEEK)
+ sk_peek_offset_fwd(sk, used);
+ else
+ sk_peek_offset_bwd(sk, used);
tcp_rcv_space_adjust(sk);
skip_copy:
- if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
- tp->urg_data = 0;
- tcp_fast_path_check(sk, tp);
+ if (unlikely(tp->urg_data) && after(tp->copied_seq, tp->urg_seq)) {
+ WRITE_ONCE(tp->urg_data, 0);
+ tcp_fast_path_check(sk);
+ }
+
+ if (TCP_SKB_CB(skb)->has_rxtstamp) {
+ tcp_update_recv_tstamps(skb, tss);
+ *cmsg_flags |= TCP_CMSG_TS;
}
+
if (used + offset < skb->len)
continue;
- if (skb->h.th->fin)
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
goto found_fin_ok;
- if (!(flags & MSG_PEEK)) {
- sk_eat_skb(sk, skb, copied_early);
- copied_early = 0;
- }
+ if (!(flags & MSG_PEEK))
+ tcp_eat_recv_skb(sk, skb);
continue;
- found_fin_ok:
+found_fin_ok:
/* Process the FIN. */
- ++*seq;
- if (!(flags & MSG_PEEK)) {
- sk_eat_skb(sk, skb, copied_early);
- copied_early = 0;
- }
+ WRITE_ONCE(*seq, *seq + 1);
+ if (!(flags & MSG_PEEK))
+ tcp_eat_recv_skb(sk, skb);
break;
} while (len > 0);
- if (user_recv) {
- if (!skb_queue_empty(&tp->ucopy.prequeue)) {
- int chunk;
-
- tp->ucopy.len = copied > 0 ? len : 0;
-
- tcp_prequeue_process(sk);
-
- if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
- NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
- len -= chunk;
- copied += chunk;
- }
- }
-
- tp->ucopy.task = NULL;
- tp->ucopy.len = 0;
- }
-
-#ifdef CONFIG_NET_DMA
- if (tp->ucopy.dma_chan) {
- struct sk_buff *skb;
- dma_cookie_t done, used;
-
- dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
-
- while (dma_async_memcpy_complete(tp->ucopy.dma_chan,
- tp->ucopy.dma_cookie, &done,
- &used) == DMA_IN_PROGRESS) {
- /* do partial cleanup of sk_async_wait_queue */
- while ((skb = skb_peek(&sk->sk_async_wait_queue)) &&
- (dma_async_is_complete(skb->dma_cookie, done,
- used) == DMA_SUCCESS)) {
- __skb_dequeue(&sk->sk_async_wait_queue);
- kfree_skb(skb);
- }
- }
-
- /* Safe to free early-copied skbs now */
- __skb_queue_purge(&sk->sk_async_wait_queue);
- dma_chan_put(tp->ucopy.dma_chan);
- tp->ucopy.dma_chan = NULL;
- }
- if (tp->ucopy.pinned_list) {
- dma_unpin_iovec_pages(tp->ucopy.pinned_list);
- tp->ucopy.pinned_list = NULL;
- }
-#endif
-
/* According to UNIX98, msg_name/msg_namelen are ignored
* on connected socket. I was just happy when found this 8) --ANK
*/
/* Clean up data we have read: This will do ACK frames. */
tcp_cleanup_rbuf(sk, copied);
-
- TCP_CHECK_TIMER(sk);
- release_sock(sk);
return copied;
out:
- TCP_CHECK_TIMER(sk);
- release_sock(sk);
return err;
recv_urg:
- err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
+ err = tcp_recv_urg(sk, msg, len, flags);
goto out;
+
+recv_sndq:
+ err = tcp_peek_sndq(sk, msg, len);
+ goto out;
+}
+
+int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags,
+ int *addr_len)
+{
+ int cmsg_flags = 0, ret;
+ struct scm_timestamping_internal tss;
+
+ if (unlikely(flags & MSG_ERRQUEUE))
+ return inet_recv_error(sk, msg, len, addr_len);
+
+ if (sk_can_busy_loop(sk) &&
+ skb_queue_empty_lockless(&sk->sk_receive_queue) &&
+ sk->sk_state == TCP_ESTABLISHED)
+ sk_busy_loop(sk, flags & MSG_DONTWAIT);
+
+ lock_sock(sk);
+ ret = tcp_recvmsg_locked(sk, msg, len, flags, &tss, &cmsg_flags);
+ release_sock(sk);
+
+ if ((cmsg_flags || msg->msg_get_inq) && ret >= 0) {
+ if (cmsg_flags & TCP_CMSG_TS)
+ tcp_recv_timestamp(msg, sk, &tss);
+ if (msg->msg_get_inq) {
+ msg->msg_inq = tcp_inq_hint(sk);
+ if (cmsg_flags & TCP_CMSG_INQ)
+ put_cmsg(msg, SOL_TCP, TCP_CM_INQ,
+ sizeof(msg->msg_inq), &msg->msg_inq);
+ }
+ }
+ return ret;
+}
+EXPORT_IPV6_MOD(tcp_recvmsg);
+
+void tcp_set_state(struct sock *sk, int state)
+{
+ int oldstate = sk->sk_state;
+
+ /* We defined a new enum for TCP states that are exported in BPF
+ * so as not force the internal TCP states to be frozen. The
+ * following checks will detect if an internal state value ever
+ * differs from the BPF value. If this ever happens, then we will
+ * need to remap the internal value to the BPF value before calling
+ * tcp_call_bpf_2arg.
+ */
+ BUILD_BUG_ON((int)BPF_TCP_ESTABLISHED != (int)TCP_ESTABLISHED);
+ BUILD_BUG_ON((int)BPF_TCP_SYN_SENT != (int)TCP_SYN_SENT);
+ BUILD_BUG_ON((int)BPF_TCP_SYN_RECV != (int)TCP_SYN_RECV);
+ BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT1 != (int)TCP_FIN_WAIT1);
+ BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT2 != (int)TCP_FIN_WAIT2);
+ BUILD_BUG_ON((int)BPF_TCP_TIME_WAIT != (int)TCP_TIME_WAIT);
+ BUILD_BUG_ON((int)BPF_TCP_CLOSE != (int)TCP_CLOSE);
+ BUILD_BUG_ON((int)BPF_TCP_CLOSE_WAIT != (int)TCP_CLOSE_WAIT);
+ BUILD_BUG_ON((int)BPF_TCP_LAST_ACK != (int)TCP_LAST_ACK);
+ BUILD_BUG_ON((int)BPF_TCP_LISTEN != (int)TCP_LISTEN);
+ BUILD_BUG_ON((int)BPF_TCP_CLOSING != (int)TCP_CLOSING);
+ BUILD_BUG_ON((int)BPF_TCP_NEW_SYN_RECV != (int)TCP_NEW_SYN_RECV);
+ BUILD_BUG_ON((int)BPF_TCP_BOUND_INACTIVE != (int)TCP_BOUND_INACTIVE);
+ BUILD_BUG_ON((int)BPF_TCP_MAX_STATES != (int)TCP_MAX_STATES);
+
+ /* bpf uapi header bpf.h defines an anonymous enum with values
+ * BPF_TCP_* used by bpf programs. Currently gcc built vmlinux
+ * is able to emit this enum in DWARF due to the above BUILD_BUG_ON.
+ * But clang built vmlinux does not have this enum in DWARF
+ * since clang removes the above code before generating IR/debuginfo.
+ * Let us explicitly emit the type debuginfo to ensure the
+ * above-mentioned anonymous enum in the vmlinux DWARF and hence BTF
+ * regardless of which compiler is used.
+ */
+ BTF_TYPE_EMIT_ENUM(BPF_TCP_ESTABLISHED);
+
+ if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_STATE_CB_FLAG))
+ tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_STATE_CB, oldstate, state);
+
+ switch (state) {
+ case TCP_ESTABLISHED:
+ if (oldstate != TCP_ESTABLISHED)
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
+ break;
+ case TCP_CLOSE_WAIT:
+ if (oldstate == TCP_SYN_RECV)
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
+ break;
+
+ case TCP_CLOSE:
+ if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED)
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_ESTABRESETS);
+
+ sk->sk_prot->unhash(sk);
+ if (inet_csk(sk)->icsk_bind_hash &&
+ !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
+ inet_put_port(sk);
+ fallthrough;
+ default:
+ if (oldstate == TCP_ESTABLISHED || oldstate == TCP_CLOSE_WAIT)
+ TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
+ }
+
+ /* Change state AFTER socket is unhashed to avoid closed
+ * socket sitting in hash tables.
+ */
+ inet_sk_state_store(sk, state);
}
+EXPORT_SYMBOL_GPL(tcp_set_state);
/*
* State processing on a close. This implements the state shift for
@@ -1491,18 +3023,19 @@ recv_urg:
static const unsigned char new_state[16] = {
/* current state: new state: action: */
- /* (Invalid) */ TCP_CLOSE,
- /* TCP_ESTABLISHED */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
- /* TCP_SYN_SENT */ TCP_CLOSE,
- /* TCP_SYN_RECV */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
- /* TCP_FIN_WAIT1 */ TCP_FIN_WAIT1,
- /* TCP_FIN_WAIT2 */ TCP_FIN_WAIT2,
- /* TCP_TIME_WAIT */ TCP_CLOSE,
- /* TCP_CLOSE */ TCP_CLOSE,
- /* TCP_CLOSE_WAIT */ TCP_LAST_ACK | TCP_ACTION_FIN,
- /* TCP_LAST_ACK */ TCP_LAST_ACK,
- /* TCP_LISTEN */ TCP_CLOSE,
- /* TCP_CLOSING */ TCP_CLOSING,
+ [0 /* (Invalid) */] = TCP_CLOSE,
+ [TCP_ESTABLISHED] = TCP_FIN_WAIT1 | TCP_ACTION_FIN,
+ [TCP_SYN_SENT] = TCP_CLOSE,
+ [TCP_SYN_RECV] = TCP_FIN_WAIT1 | TCP_ACTION_FIN,
+ [TCP_FIN_WAIT1] = TCP_FIN_WAIT1,
+ [TCP_FIN_WAIT2] = TCP_FIN_WAIT2,
+ [TCP_TIME_WAIT] = TCP_CLOSE,
+ [TCP_CLOSE] = TCP_CLOSE,
+ [TCP_CLOSE_WAIT] = TCP_LAST_ACK | TCP_ACTION_FIN,
+ [TCP_LAST_ACK] = TCP_LAST_ACK,
+ [TCP_LISTEN] = TCP_CLOSE,
+ [TCP_CLOSING] = TCP_CLOSING,
+ [TCP_NEW_SYN_RECV] = TCP_CLOSE, /* should not happen ! */
};
static int tcp_close_state(struct sock *sk)
@@ -1517,7 +3050,7 @@ static int tcp_close_state(struct sock *sk)
/*
* Shutdown the sending side of a connection. Much like close except
- * that we don't receive shut down or set_sock_flag(sk, SOCK_DEAD).
+ * that we don't receive shut down or sock_set_flag(sk, SOCK_DEAD).
*/
void tcp_shutdown(struct sock *sk, int how)
@@ -1532,21 +3065,69 @@ void tcp_shutdown(struct sock *sk, int how)
/* If we've already sent a FIN, or it's a closed state, skip this. */
if ((1 << sk->sk_state) &
(TCPF_ESTABLISHED | TCPF_SYN_SENT |
- TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
+ TCPF_CLOSE_WAIT)) {
/* Clear out any half completed packets. FIN if needed. */
if (tcp_close_state(sk))
tcp_send_fin(sk);
}
}
+EXPORT_IPV6_MOD(tcp_shutdown);
-void tcp_close(struct sock *sk, long timeout)
+int tcp_orphan_count_sum(void)
+{
+ int i, total = 0;
+
+ for_each_possible_cpu(i)
+ total += per_cpu(tcp_orphan_count, i);
+
+ return max(total, 0);
+}
+
+static int tcp_orphan_cache;
+static struct timer_list tcp_orphan_timer;
+#define TCP_ORPHAN_TIMER_PERIOD msecs_to_jiffies(100)
+
+static void tcp_orphan_update(struct timer_list *unused)
{
+ WRITE_ONCE(tcp_orphan_cache, tcp_orphan_count_sum());
+ mod_timer(&tcp_orphan_timer, jiffies + TCP_ORPHAN_TIMER_PERIOD);
+}
+
+static bool tcp_too_many_orphans(int shift)
+{
+ return READ_ONCE(tcp_orphan_cache) << shift >
+ READ_ONCE(sysctl_tcp_max_orphans);
+}
+
+static bool tcp_out_of_memory(const struct sock *sk)
+{
+ if (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
+ sk_memory_allocated(sk) > sk_prot_mem_limits(sk, 2))
+ return true;
+ return false;
+}
+
+bool tcp_check_oom(const struct sock *sk, int shift)
+{
+ bool too_many_orphans, out_of_socket_memory;
+
+ too_many_orphans = tcp_too_many_orphans(shift);
+ out_of_socket_memory = tcp_out_of_memory(sk);
+
+ if (too_many_orphans)
+ net_info_ratelimited("too many orphaned sockets\n");
+ if (out_of_socket_memory)
+ net_info_ratelimited("out of memory -- consider tuning tcp_mem\n");
+ return too_many_orphans || out_of_socket_memory;
+}
+
+void __tcp_close(struct sock *sk, long timeout)
+{
+ bool data_was_unread = false;
struct sk_buff *skb;
- int data_was_unread = 0;
int state;
- lock_sock(sk);
- sk->sk_shutdown = SHUTDOWN_MASK;
+ WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
if (sk->sk_state == TCP_LISTEN) {
tcp_set_state(sk, TCP_CLOSE);
@@ -1561,33 +3142,39 @@ void tcp_close(struct sock *sk, long timeout)
* descriptor close, not protocol-sourced closes, because the
* reader process may not have drained the data yet!
*/
- while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
- u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
- skb->h.th->fin;
- data_was_unread += len;
- __kfree_skb(skb);
+ while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
+ u32 end_seq = TCP_SKB_CB(skb)->end_seq;
+
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
+ end_seq--;
+ if (after(end_seq, tcp_sk(sk)->copied_seq))
+ data_was_unread = true;
+ tcp_eat_recv_skb(sk, skb);
}
- sk_stream_mem_reclaim(sk);
+ /* If socket has been already reset (e.g. in tcp_reset()) - kill it. */
+ if (sk->sk_state == TCP_CLOSE)
+ goto adjudge_to_death;
- /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
- * 3.10, we send a RST here because data was lost. To
- * witness the awful effects of the old behavior of always
- * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
- * a bulk GET in an FTP client, suspend the process, wait
- * for the client to advertise a zero window, then kill -9
- * the FTP client, wheee... Note: timeout is always zero
- * in such a case.
+ /* As outlined in RFC 2525, section 2.17, we send a RST here because
+ * data was lost. To witness the awful effects of the old behavior of
+ * always doing a FIN, run an older 2.1.x kernel or 2.0.x, start a bulk
+ * GET in an FTP client, suspend the process, wait for the client to
+ * advertise a zero window, then kill -9 the FTP client, wheee...
+ * Note: timeout is always zero in such a case.
*/
- if (data_was_unread) {
+ if (unlikely(tcp_sk(sk)->repair)) {
+ sk->sk_prot->disconnect(sk, 0);
+ } else if (data_was_unread) {
/* Unread data was tossed, zap the connection. */
- NET_INC_STATS_USER(LINUX_MIB_TCPABORTONCLOSE);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
tcp_set_state(sk, TCP_CLOSE);
- tcp_send_active_reset(sk, GFP_KERNEL);
+ tcp_send_active_reset(sk, sk->sk_allocation,
+ SK_RST_REASON_TCP_ABORT_ON_CLOSE);
} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
/* Check zero linger _after_ checking for unread data. */
sk->sk_prot->disconnect(sk, 0);
- NET_INC_STATS_USER(LINUX_MIB_TCPABORTONDATA);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
} else if (tcp_close_state(sk)) {
/* We FIN if the application ate all the data before
* zapping the connection.
@@ -1597,7 +3184,7 @@ void tcp_close(struct sock *sk, long timeout)
* machine. State transitions:
*
* TCP_ESTABLISHED -> TCP_FIN_WAIT1
- * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
+ * TCP_SYN_RECV -> TCP_FIN_WAIT1 (it is difficult)
* TCP_CLOSE_WAIT -> TCP_LAST_ACK
*
* are legal only when FIN has been sent (i.e. in window),
@@ -1613,6 +3200,10 @@ void tcp_close(struct sock *sk, long timeout)
* they look as CLOSING or LAST_ACK for Linux)
* Probably, I missed some more holelets.
* --ANK
+ * XXX (TFO) - To start off we don't support SYN+ACK+FIN
+ * in a single packet! (May consider it later but will
+ * probably need API support or TCP_CORK SYN-ACK until
+ * data is written and socket is closed.)
*/
tcp_send_fin(sk);
}
@@ -1623,18 +3214,13 @@ adjudge_to_death:
state = sk->sk_state;
sock_hold(sk);
sock_orphan(sk);
- atomic_inc(sk->sk_prot->orphan_count);
-
- /* It is the last release_sock in its life. It will remove backlog. */
- release_sock(sk);
-
- /* Now socket is owned by kernel and we acquire BH lock
- to finish close. No need to check for user refs.
- */
local_bh_disable();
bh_lock_sock(sk);
- BUG_TRAP(!sock_owned_by_user(sk));
+ /* remove backlog if any, without releasing ownership. */
+ __release_sock(sk);
+
+ tcp_orphan_count_inc();
/* Have we already been destroyed by a softirq or backlog? */
if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
@@ -1643,7 +3229,7 @@ adjudge_to_death:
/* This is a (useful) BSD violating of the RFC. There is a
* problem with TCP as specified in that the other end could
* keep a socket open forever with no application left this end.
- * We use a 3 minute timeout (about the same as BSD) then kill
+ * We use a 1 minute timeout (about the same as BSD) then kill
* our end. If they send after that then tough - BUT: long enough
* that we won't make the old 4*rto = almost no time - whoops
* reset mistake.
@@ -1656,15 +3242,17 @@ adjudge_to_death:
if (sk->sk_state == TCP_FIN_WAIT2) {
struct tcp_sock *tp = tcp_sk(sk);
- if (tp->linger2 < 0) {
+ if (READ_ONCE(tp->linger2) < 0) {
tcp_set_state(sk, TCP_CLOSE);
- tcp_send_active_reset(sk, GFP_ATOMIC);
- NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER);
+ tcp_send_active_reset(sk, GFP_ATOMIC,
+ SK_RST_REASON_TCP_ABORT_ON_LINGER);
+ __NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPABORTONLINGER);
} else {
const int tmo = tcp_fin_time(sk);
if (tmo > TCP_TIMEWAIT_LEN) {
- inet_csk_reset_keepalive_timer(sk,
+ tcp_reset_keepalive_timer(sk,
tmo - TCP_TIMEWAIT_LEN);
} else {
tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
@@ -1673,45 +3261,99 @@ adjudge_to_death:
}
}
if (sk->sk_state != TCP_CLOSE) {
- sk_stream_mem_reclaim(sk);
- if (atomic_read(sk->sk_prot->orphan_count) > sysctl_tcp_max_orphans ||
- (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
- atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
- if (net_ratelimit())
- printk(KERN_INFO "TCP: too many of orphaned "
- "sockets\n");
+ if (tcp_check_oom(sk, 0)) {
+ tcp_set_state(sk, TCP_CLOSE);
+ tcp_send_active_reset(sk, GFP_ATOMIC,
+ SK_RST_REASON_TCP_ABORT_ON_MEMORY);
+ __NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPABORTONMEMORY);
+ } else if (!check_net(sock_net(sk))) {
+ /* Not possible to send reset; just close */
tcp_set_state(sk, TCP_CLOSE);
- tcp_send_active_reset(sk, GFP_ATOMIC);
- NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
}
}
- if (sk->sk_state == TCP_CLOSE)
+ if (sk->sk_state == TCP_CLOSE) {
+ struct request_sock *req;
+
+ req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk,
+ lockdep_sock_is_held(sk));
+ /* We could get here with a non-NULL req if the socket is
+ * aborted (e.g., closed with unread data) before 3WHS
+ * finishes.
+ */
+ if (req)
+ reqsk_fastopen_remove(sk, req, false);
inet_csk_destroy_sock(sk);
+ }
/* Otherwise, socket is reprieved until protocol close. */
out:
bh_unlock_sock(sk);
local_bh_enable();
+}
+
+void tcp_close(struct sock *sk, long timeout)
+{
+ lock_sock(sk);
+ __tcp_close(sk, timeout);
+ release_sock(sk);
+ if (!sk->sk_net_refcnt)
+ inet_csk_clear_xmit_timers_sync(sk);
sock_put(sk);
}
+EXPORT_SYMBOL(tcp_close);
/* These states need RST on ABORT according to RFC793 */
-static inline int tcp_need_reset(int state)
+static inline bool tcp_need_reset(int state)
{
return (1 << state) &
(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
}
+static void tcp_rtx_queue_purge(struct sock *sk)
+{
+ struct rb_node *p = rb_first(&sk->tcp_rtx_queue);
+
+ tcp_sk(sk)->highest_sack = NULL;
+ while (p) {
+ struct sk_buff *skb = rb_to_skb(p);
+
+ p = rb_next(p);
+ /* Since we are deleting whole queue, no need to
+ * list_del(&skb->tcp_tsorted_anchor)
+ */
+ tcp_rtx_queue_unlink(skb, sk);
+ tcp_wmem_free_skb(sk, skb);
+ }
+}
+
+void tcp_write_queue_purge(struct sock *sk)
+{
+ struct sk_buff *skb;
+
+ tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
+ while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
+ tcp_skb_tsorted_anchor_cleanup(skb);
+ tcp_wmem_free_skb(sk, skb);
+ }
+ tcp_rtx_queue_purge(sk);
+ INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue);
+ tcp_clear_all_retrans_hints(tcp_sk(sk));
+ tcp_sk(sk)->packets_out = 0;
+ inet_csk(sk)->icsk_backoff = 0;
+}
+
int tcp_disconnect(struct sock *sk, int flags)
{
struct inet_sock *inet = inet_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
- int err = 0;
int old_state = sk->sk_state;
+ struct request_sock *req;
+ u32 seq;
if (old_state != TCP_CLOSE)
tcp_set_state(sk, TCP_CLOSE);
@@ -1719,280 +3361,876 @@ int tcp_disconnect(struct sock *sk, int flags)
/* ABORT function of RFC793 */
if (old_state == TCP_LISTEN) {
inet_csk_listen_stop(sk);
- } else if (tcp_need_reset(old_state) ||
- (tp->snd_nxt != tp->write_seq &&
- (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
+ } else if (unlikely(tp->repair)) {
+ WRITE_ONCE(sk->sk_err, ECONNABORTED);
+ } else if (tcp_need_reset(old_state)) {
+ tcp_send_active_reset(sk, gfp_any(), SK_RST_REASON_TCP_STATE);
+ WRITE_ONCE(sk->sk_err, ECONNRESET);
+ } else if (tp->snd_nxt != tp->write_seq &&
+ (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK)) {
/* The last check adjusts for discrepancy of Linux wrt. RFC
* states
*/
- tcp_send_active_reset(sk, gfp_any());
- sk->sk_err = ECONNRESET;
+ tcp_send_active_reset(sk, gfp_any(),
+ SK_RST_REASON_TCP_DISCONNECT_WITH_DATA);
+ WRITE_ONCE(sk->sk_err, ECONNRESET);
} else if (old_state == TCP_SYN_SENT)
- sk->sk_err = ECONNRESET;
+ WRITE_ONCE(sk->sk_err, ECONNRESET);
tcp_clear_xmit_timers(sk);
__skb_queue_purge(&sk->sk_receive_queue);
- sk_stream_writequeue_purge(sk);
- __skb_queue_purge(&tp->out_of_order_queue);
-#ifdef CONFIG_NET_DMA
- __skb_queue_purge(&sk->sk_async_wait_queue);
-#endif
+ WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
+ WRITE_ONCE(tp->urg_data, 0);
+ sk_set_peek_off(sk, -1);
+ tcp_write_queue_purge(sk);
+ tcp_fastopen_active_disable_ofo_check(sk);
+ skb_rbtree_purge(&tp->out_of_order_queue);
- inet->dport = 0;
+ inet->inet_dport = 0;
- if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
- inet_reset_saddr(sk);
+ inet_bhash2_reset_saddr(sk);
- sk->sk_shutdown = 0;
+ WRITE_ONCE(sk->sk_shutdown, 0);
sock_reset_flag(sk, SOCK_DONE);
- tp->srtt = 0;
- if ((tp->write_seq += tp->max_window + 2) == 0)
- tp->write_seq = 1;
+ tp->srtt_us = 0;
+ tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
+ tp->rcv_rtt_last_tsecr = 0;
+
+ seq = tp->write_seq + tp->max_window + 2;
+ if (!seq)
+ seq = 1;
+ WRITE_ONCE(tp->write_seq, seq);
+
icsk->icsk_backoff = 0;
- tp->snd_cwnd = 2;
- icsk->icsk_probes_out = 0;
- tp->packets_out = 0;
- tp->snd_ssthresh = 0x7fffffff;
+ WRITE_ONCE(icsk->icsk_probes_out, 0);
+ icsk->icsk_probes_tstamp = 0;
+ icsk->icsk_rto = TCP_TIMEOUT_INIT;
+ WRITE_ONCE(icsk->icsk_rto_min, TCP_RTO_MIN);
+ WRITE_ONCE(icsk->icsk_delack_max, TCP_DELACK_MAX);
+ tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
+ tcp_snd_cwnd_set(tp, TCP_INIT_CWND);
tp->snd_cwnd_cnt = 0;
- tp->bytes_acked = 0;
+ tp->is_cwnd_limited = 0;
+ tp->max_packets_out = 0;
+ tp->window_clamp = 0;
+ tp->delivered = 0;
+ tp->delivered_ce = 0;
+ tp->accecn_fail_mode = 0;
+ tp->saw_accecn_opt = TCP_ACCECN_OPT_NOT_SEEN;
+ tcp_accecn_init_counters(tp);
+ tp->prev_ecnfield = 0;
+ tp->accecn_opt_tstamp = 0;
+ if (icsk->icsk_ca_initialized && icsk->icsk_ca_ops->release)
+ icsk->icsk_ca_ops->release(sk);
+ memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
+ icsk->icsk_ca_initialized = 0;
tcp_set_ca_state(sk, TCP_CA_Open);
+ tp->is_sack_reneg = 0;
tcp_clear_retrans(tp);
+ tp->total_retrans = 0;
inet_csk_delack_init(sk);
- sk->sk_send_head = NULL;
- tp->rx_opt.saw_tstamp = 0;
- tcp_sack_reset(&tp->rx_opt);
+ /* Initialize rcv_mss to TCP_MIN_MSS to avoid division by 0
+ * issue in __tcp_select_window()
+ */
+ icsk->icsk_ack.rcv_mss = TCP_MIN_MSS;
+ memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
__sk_dst_reset(sk);
+ dst_release(unrcu_pointer(xchg(&sk->sk_rx_dst, NULL)));
+ tcp_saved_syn_free(tp);
+ tp->compressed_ack = 0;
+ tp->segs_in = 0;
+ tp->segs_out = 0;
+ tp->bytes_sent = 0;
+ tp->bytes_acked = 0;
+ tp->bytes_received = 0;
+ tp->bytes_retrans = 0;
+ tp->data_segs_in = 0;
+ tp->data_segs_out = 0;
+ tp->duplicate_sack[0].start_seq = 0;
+ tp->duplicate_sack[0].end_seq = 0;
+ tp->dsack_dups = 0;
+ tp->reord_seen = 0;
+ tp->retrans_out = 0;
+ tp->sacked_out = 0;
+ tp->tlp_high_seq = 0;
+ tp->last_oow_ack_time = 0;
+ tp->plb_rehash = 0;
+ /* There's a bubble in the pipe until at least the first ACK. */
+ tp->app_limited = ~0U;
+ tp->rate_app_limited = 1;
+ tp->rack.mstamp = 0;
+ tp->rack.advanced = 0;
+ tp->rack.reo_wnd_steps = 1;
+ tp->rack.last_delivered = 0;
+ tp->rack.reo_wnd_persist = 0;
+ tp->rack.dsack_seen = 0;
+ tp->syn_data_acked = 0;
+ tp->syn_fastopen_child = 0;
+ tp->rx_opt.saw_tstamp = 0;
+ tp->rx_opt.dsack = 0;
+ tp->rx_opt.num_sacks = 0;
+ tp->rcv_ooopack = 0;
+
+
+ /* Clean up fastopen related fields */
+ req = rcu_dereference_protected(tp->fastopen_rsk,
+ lockdep_sock_is_held(sk));
+ if (req)
+ reqsk_fastopen_remove(sk, req, false);
+ tcp_free_fastopen_req(tp);
+ inet_clear_bit(DEFER_CONNECT, sk);
+ tp->fastopen_client_fail = 0;
+
+ WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
+
+ if (sk->sk_frag.page) {
+ put_page(sk->sk_frag.page);
+ sk->sk_frag.page = NULL;
+ sk->sk_frag.offset = 0;
+ }
+ sk_error_report(sk);
+ return 0;
+}
+EXPORT_SYMBOL(tcp_disconnect);
+
+static inline bool tcp_can_repair_sock(const struct sock *sk)
+{
+ return sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) &&
+ (sk->sk_state != TCP_LISTEN);
+}
+
+static int tcp_repair_set_window(struct tcp_sock *tp, sockptr_t optbuf, int len)
+{
+ struct tcp_repair_window opt;
+
+ if (!tp->repair)
+ return -EPERM;
+
+ if (len != sizeof(opt))
+ return -EINVAL;
+
+ if (copy_from_sockptr(&opt, optbuf, sizeof(opt)))
+ return -EFAULT;
+
+ if (opt.max_window < opt.snd_wnd)
+ return -EINVAL;
+
+ if (after(opt.snd_wl1, tp->rcv_nxt + opt.rcv_wnd))
+ return -EINVAL;
+
+ if (after(opt.rcv_wup, tp->rcv_nxt))
+ return -EINVAL;
+
+ tp->snd_wl1 = opt.snd_wl1;
+ tp->snd_wnd = opt.snd_wnd;
+ tp->max_window = opt.max_window;
+
+ tp->rcv_wnd = opt.rcv_wnd;
+ tp->rcv_wup = opt.rcv_wup;
+
+ return 0;
+}
+
+static int tcp_repair_options_est(struct sock *sk, sockptr_t optbuf,
+ unsigned int len)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_repair_opt opt;
+ size_t offset = 0;
+
+ while (len >= sizeof(opt)) {
+ if (copy_from_sockptr_offset(&opt, optbuf, offset, sizeof(opt)))
+ return -EFAULT;
+
+ offset += sizeof(opt);
+ len -= sizeof(opt);
+
+ switch (opt.opt_code) {
+ case TCPOPT_MSS:
+ tp->rx_opt.mss_clamp = opt.opt_val;
+ tcp_mtup_init(sk);
+ break;
+ case TCPOPT_WINDOW:
+ {
+ u16 snd_wscale = opt.opt_val & 0xFFFF;
+ u16 rcv_wscale = opt.opt_val >> 16;
+
+ if (snd_wscale > TCP_MAX_WSCALE || rcv_wscale > TCP_MAX_WSCALE)
+ return -EFBIG;
+
+ tp->rx_opt.snd_wscale = snd_wscale;
+ tp->rx_opt.rcv_wscale = rcv_wscale;
+ tp->rx_opt.wscale_ok = 1;
+ }
+ break;
+ case TCPOPT_SACK_PERM:
+ if (opt.opt_val != 0)
+ return -EINVAL;
+
+ tp->rx_opt.sack_ok |= TCP_SACK_SEEN;
+ break;
+ case TCPOPT_TIMESTAMP:
+ if (opt.opt_val != 0)
+ return -EINVAL;
+
+ tp->rx_opt.tstamp_ok = 1;
+ break;
+ }
+ }
+
+ return 0;
+}
+
+DEFINE_STATIC_KEY_FALSE(tcp_tx_delay_enabled);
+EXPORT_IPV6_MOD(tcp_tx_delay_enabled);
+
+static void tcp_enable_tx_delay(struct sock *sk, int val)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ s32 delta = (val - tp->tcp_tx_delay) << 3;
+
+ if (val && !static_branch_unlikely(&tcp_tx_delay_enabled)) {
+ static int __tcp_tx_delay_enabled = 0;
+
+ if (cmpxchg(&__tcp_tx_delay_enabled, 0, 1) == 0) {
+ static_branch_enable(&tcp_tx_delay_enabled);
+ pr_info("TCP_TX_DELAY enabled\n");
+ }
+ }
+ /* If we change tcp_tx_delay on a live flow, adjust tp->srtt_us,
+ * tp->rtt_min, icsk_rto and sk->sk_pacing_rate.
+ * This is best effort.
+ */
+ if (delta && sk->sk_state == TCP_ESTABLISHED) {
+ s64 srtt = (s64)tp->srtt_us + delta;
+
+ tp->srtt_us = clamp_t(s64, srtt, 1, ~0U);
+
+ /* Note: does not deal with non zero icsk_backoff */
+ tcp_set_rto(sk);
+
+ minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U);
+
+ tcp_update_pacing_rate(sk);
+ }
+}
+
+/* When set indicates to always queue non-full frames. Later the user clears
+ * this option and we transmit any pending partial frames in the queue. This is
+ * meant to be used alongside sendfile() to get properly filled frames when the
+ * user (for example) must write out headers with a write() call first and then
+ * use sendfile to send out the data parts.
+ *
+ * TCP_CORK can be set together with TCP_NODELAY and it is stronger than
+ * TCP_NODELAY.
+ */
+void __tcp_sock_set_cork(struct sock *sk, bool on)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (on) {
+ tp->nonagle |= TCP_NAGLE_CORK;
+ } else {
+ tp->nonagle &= ~TCP_NAGLE_CORK;
+ if (tp->nonagle & TCP_NAGLE_OFF)
+ tp->nonagle |= TCP_NAGLE_PUSH;
+ tcp_push_pending_frames(sk);
+ }
+}
+
+void tcp_sock_set_cork(struct sock *sk, bool on)
+{
+ lock_sock(sk);
+ __tcp_sock_set_cork(sk, on);
+ release_sock(sk);
+}
+EXPORT_SYMBOL(tcp_sock_set_cork);
- BUG_TRAP(!inet->num || icsk->icsk_bind_hash);
+/* TCP_NODELAY is weaker than TCP_CORK, so that this option on corked socket is
+ * remembered, but it is not activated until cork is cleared.
+ *
+ * However, when TCP_NODELAY is set we make an explicit push, which overrides
+ * even TCP_CORK for currently queued segments.
+ */
+void __tcp_sock_set_nodelay(struct sock *sk, bool on)
+{
+ if (on) {
+ tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
+ tcp_push_pending_frames(sk);
+ } else {
+ tcp_sk(sk)->nonagle &= ~TCP_NAGLE_OFF;
+ }
+}
- sk->sk_error_report(sk);
+void tcp_sock_set_nodelay(struct sock *sk)
+{
+ lock_sock(sk);
+ __tcp_sock_set_nodelay(sk, true);
+ release_sock(sk);
+}
+EXPORT_SYMBOL(tcp_sock_set_nodelay);
+
+static void __tcp_sock_set_quickack(struct sock *sk, int val)
+{
+ if (!val) {
+ inet_csk_enter_pingpong_mode(sk);
+ return;
+ }
+
+ inet_csk_exit_pingpong_mode(sk);
+ if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
+ inet_csk_ack_scheduled(sk)) {
+ inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_PUSHED;
+ tcp_cleanup_rbuf(sk, 1);
+ if (!(val & 1))
+ inet_csk_enter_pingpong_mode(sk);
+ }
+}
+
+void tcp_sock_set_quickack(struct sock *sk, int val)
+{
+ lock_sock(sk);
+ __tcp_sock_set_quickack(sk, val);
+ release_sock(sk);
+}
+EXPORT_SYMBOL(tcp_sock_set_quickack);
+
+int tcp_sock_set_syncnt(struct sock *sk, int val)
+{
+ if (val < 1 || val > MAX_TCP_SYNCNT)
+ return -EINVAL;
+
+ WRITE_ONCE(inet_csk(sk)->icsk_syn_retries, val);
+ return 0;
+}
+EXPORT_SYMBOL(tcp_sock_set_syncnt);
+
+int tcp_sock_set_user_timeout(struct sock *sk, int val)
+{
+ /* Cap the max time in ms TCP will retry or probe the window
+ * before giving up and aborting (ETIMEDOUT) a connection.
+ */
+ if (val < 0)
+ return -EINVAL;
+
+ WRITE_ONCE(inet_csk(sk)->icsk_user_timeout, val);
+ return 0;
+}
+EXPORT_SYMBOL(tcp_sock_set_user_timeout);
+
+int tcp_sock_set_keepidle_locked(struct sock *sk, int val)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (val < 1 || val > MAX_TCP_KEEPIDLE)
+ return -EINVAL;
+
+ /* Paired with WRITE_ONCE() in keepalive_time_when() */
+ WRITE_ONCE(tp->keepalive_time, val * HZ);
+ if (sock_flag(sk, SOCK_KEEPOPEN) &&
+ !((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) {
+ u32 elapsed = keepalive_time_elapsed(tp);
+
+ if (tp->keepalive_time > elapsed)
+ elapsed = tp->keepalive_time - elapsed;
+ else
+ elapsed = 0;
+ tcp_reset_keepalive_timer(sk, elapsed);
+ }
+
+ return 0;
+}
+
+int tcp_sock_set_keepidle(struct sock *sk, int val)
+{
+ int err;
+
+ lock_sock(sk);
+ err = tcp_sock_set_keepidle_locked(sk, val);
+ release_sock(sk);
return err;
}
+EXPORT_SYMBOL(tcp_sock_set_keepidle);
+
+int tcp_sock_set_keepintvl(struct sock *sk, int val)
+{
+ if (val < 1 || val > MAX_TCP_KEEPINTVL)
+ return -EINVAL;
+
+ WRITE_ONCE(tcp_sk(sk)->keepalive_intvl, val * HZ);
+ return 0;
+}
+EXPORT_SYMBOL(tcp_sock_set_keepintvl);
+
+int tcp_sock_set_keepcnt(struct sock *sk, int val)
+{
+ if (val < 1 || val > MAX_TCP_KEEPCNT)
+ return -EINVAL;
+
+ /* Paired with READ_ONCE() in keepalive_probes() */
+ WRITE_ONCE(tcp_sk(sk)->keepalive_probes, val);
+ return 0;
+}
+EXPORT_SYMBOL(tcp_sock_set_keepcnt);
+
+int tcp_set_window_clamp(struct sock *sk, int val)
+{
+ u32 old_window_clamp, new_window_clamp, new_rcv_ssthresh;
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (!val) {
+ if (sk->sk_state != TCP_CLOSE)
+ return -EINVAL;
+ WRITE_ONCE(tp->window_clamp, 0);
+ return 0;
+ }
+
+ old_window_clamp = tp->window_clamp;
+ new_window_clamp = max_t(int, SOCK_MIN_RCVBUF / 2, val);
+
+ if (new_window_clamp == old_window_clamp)
+ return 0;
+
+ WRITE_ONCE(tp->window_clamp, new_window_clamp);
+
+ /* Need to apply the reserved mem provisioning only
+ * when shrinking the window clamp.
+ */
+ if (new_window_clamp < old_window_clamp) {
+ __tcp_adjust_rcv_ssthresh(sk, new_window_clamp);
+ } else {
+ new_rcv_ssthresh = min(tp->rcv_wnd, new_window_clamp);
+ tp->rcv_ssthresh = max(new_rcv_ssthresh, tp->rcv_ssthresh);
+ }
+ return 0;
+}
+
+int tcp_sock_set_maxseg(struct sock *sk, int val)
+{
+ /* Values greater than interface MTU won't take effect. However
+ * at the point when this call is done we typically don't yet
+ * know which interface is going to be used
+ */
+ if (val && (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW))
+ return -EINVAL;
+
+ WRITE_ONCE(tcp_sk(sk)->rx_opt.user_mss, val);
+ return 0;
+}
/*
* Socket option code for TCP.
*/
-static int do_tcp_setsockopt(struct sock *sk, int level,
- int optname, char __user *optval, int optlen)
+int do_tcp_setsockopt(struct sock *sk, int level, int optname,
+ sockptr_t optval, unsigned int optlen)
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
+ struct net *net = sock_net(sk);
int val;
int err = 0;
- /* This is a string value all the others are int's */
- if (optname == TCP_CONGESTION) {
+ /* These are data/string values, all the others are ints */
+ switch (optname) {
+ case TCP_CONGESTION: {
char name[TCP_CA_NAME_MAX];
if (optlen < 1)
return -EINVAL;
- val = strncpy_from_user(name, optval,
- min(TCP_CA_NAME_MAX-1, optlen));
+ val = strncpy_from_sockptr(name, optval,
+ min_t(long, TCP_CA_NAME_MAX-1, optlen));
if (val < 0)
return -EFAULT;
name[val] = 0;
- lock_sock(sk);
- err = tcp_set_congestion_control(sk, name);
- release_sock(sk);
+ sockopt_lock_sock(sk);
+ err = tcp_set_congestion_control(sk, name, !has_current_bpf_ctx(),
+ sockopt_ns_capable(sock_net(sk)->user_ns,
+ CAP_NET_ADMIN));
+ sockopt_release_sock(sk);
+ return err;
+ }
+ case TCP_ULP: {
+ char name[TCP_ULP_NAME_MAX];
+
+ if (optlen < 1)
+ return -EINVAL;
+
+ val = strncpy_from_sockptr(name, optval,
+ min_t(long, TCP_ULP_NAME_MAX - 1,
+ optlen));
+ if (val < 0)
+ return -EFAULT;
+ name[val] = 0;
+
+ sockopt_lock_sock(sk);
+ err = tcp_set_ulp(sk, name);
+ sockopt_release_sock(sk);
return err;
}
+ case TCP_FASTOPEN_KEY: {
+ __u8 key[TCP_FASTOPEN_KEY_BUF_LENGTH];
+ __u8 *backup_key = NULL;
+
+ /* Allow a backup key as well to facilitate key rotation
+ * First key is the active one.
+ */
+ if (optlen != TCP_FASTOPEN_KEY_LENGTH &&
+ optlen != TCP_FASTOPEN_KEY_BUF_LENGTH)
+ return -EINVAL;
+
+ if (copy_from_sockptr(key, optval, optlen))
+ return -EFAULT;
+
+ if (optlen == TCP_FASTOPEN_KEY_BUF_LENGTH)
+ backup_key = key + TCP_FASTOPEN_KEY_LENGTH;
+
+ return tcp_fastopen_reset_cipher(net, sk, key, backup_key);
+ }
+ default:
+ /* fallthru */
+ break;
+ }
if (optlen < sizeof(int))
return -EINVAL;
- if (get_user(val, (int __user *)optval))
+ if (copy_from_sockptr(&val, optval, sizeof(val)))
return -EFAULT;
- lock_sock(sk);
-
+ /* Handle options that can be set without locking the socket. */
switch (optname) {
+ case TCP_SYNCNT:
+ return tcp_sock_set_syncnt(sk, val);
+ case TCP_USER_TIMEOUT:
+ return tcp_sock_set_user_timeout(sk, val);
+ case TCP_KEEPINTVL:
+ return tcp_sock_set_keepintvl(sk, val);
+ case TCP_KEEPCNT:
+ return tcp_sock_set_keepcnt(sk, val);
+ case TCP_LINGER2:
+ if (val < 0)
+ WRITE_ONCE(tp->linger2, -1);
+ else if (val > TCP_FIN_TIMEOUT_MAX / HZ)
+ WRITE_ONCE(tp->linger2, TCP_FIN_TIMEOUT_MAX);
+ else
+ WRITE_ONCE(tp->linger2, val * HZ);
+ return 0;
+ case TCP_DEFER_ACCEPT:
+ /* Translate value in seconds to number of retransmits */
+ WRITE_ONCE(icsk->icsk_accept_queue.rskq_defer_accept,
+ secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
+ TCP_RTO_MAX / HZ));
+ return 0;
+ case TCP_RTO_MAX_MS:
+ if (val < MSEC_PER_SEC || val > TCP_RTO_MAX_SEC * MSEC_PER_SEC)
+ return -EINVAL;
+ WRITE_ONCE(inet_csk(sk)->icsk_rto_max, msecs_to_jiffies(val));
+ return 0;
+ case TCP_RTO_MIN_US: {
+ int rto_min = usecs_to_jiffies(val);
+
+ if (rto_min > TCP_RTO_MIN || rto_min < TCP_TIMEOUT_MIN)
+ return -EINVAL;
+ WRITE_ONCE(inet_csk(sk)->icsk_rto_min, rto_min);
+ return 0;
+ }
+ case TCP_DELACK_MAX_US: {
+ int delack_max = usecs_to_jiffies(val);
+
+ if (delack_max > TCP_DELACK_MAX || delack_max < TCP_TIMEOUT_MIN)
+ return -EINVAL;
+ WRITE_ONCE(inet_csk(sk)->icsk_delack_max, delack_max);
+ return 0;
+ }
case TCP_MAXSEG:
- /* Values greater than interface MTU won't take effect. However
- * at the point when this call is done we typically don't yet
- * know which interface is going to be used */
- if (val < 8 || val > MAX_TCP_WINDOW) {
- err = -EINVAL;
- break;
- }
- tp->rx_opt.user_mss = val;
- break;
+ return tcp_sock_set_maxseg(sk, val);
+ }
+
+ sockopt_lock_sock(sk);
+ switch (optname) {
case TCP_NODELAY:
- if (val) {
- /* TCP_NODELAY is weaker than TCP_CORK, so that
- * this option on corked socket is remembered, but
- * it is not activated until cork is cleared.
- *
- * However, when TCP_NODELAY is set we make
- * an explicit push, which overrides even TCP_CORK
- * for currently queued segments.
- */
- tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
- tcp_push_pending_frames(sk, tp);
- } else {
- tp->nonagle &= ~TCP_NAGLE_OFF;
- }
+ __tcp_sock_set_nodelay(sk, val);
break;
- case TCP_CORK:
- /* When set indicates to always queue non-full frames.
- * Later the user clears this option and we transmit
- * any pending partial frames in the queue. This is
- * meant to be used alongside sendfile() to get properly
- * filled frames when the user (for example) must write
- * out headers with a write() call first and then use
- * sendfile to send out the data parts.
- *
- * TCP_CORK can be set together with TCP_NODELAY and it is
- * stronger than TCP_NODELAY.
- */
- if (val) {
- tp->nonagle |= TCP_NAGLE_CORK;
- } else {
- tp->nonagle &= ~TCP_NAGLE_CORK;
- if (tp->nonagle&TCP_NAGLE_OFF)
- tp->nonagle |= TCP_NAGLE_PUSH;
- tcp_push_pending_frames(sk, tp);
- }
+ case TCP_THIN_LINEAR_TIMEOUTS:
+ if (val < 0 || val > 1)
+ err = -EINVAL;
+ else
+ tp->thin_lto = val;
break;
- case TCP_KEEPIDLE:
- if (val < 1 || val > MAX_TCP_KEEPIDLE)
+ case TCP_THIN_DUPACK:
+ if (val < 0 || val > 1)
err = -EINVAL;
- else {
- tp->keepalive_time = val * HZ;
- if (sock_flag(sk, SOCK_KEEPOPEN) &&
- !((1 << sk->sk_state) &
- (TCPF_CLOSE | TCPF_LISTEN))) {
- __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
- if (tp->keepalive_time > elapsed)
- elapsed = tp->keepalive_time - elapsed;
- else
- elapsed = 0;
- inet_csk_reset_keepalive_timer(sk, elapsed);
- }
- }
break;
- case TCP_KEEPINTVL:
- if (val < 1 || val > MAX_TCP_KEEPINTVL)
+
+ case TCP_REPAIR:
+ if (!tcp_can_repair_sock(sk))
+ err = -EPERM;
+ else if (val == TCP_REPAIR_ON) {
+ tp->repair = 1;
+ sk->sk_reuse = SK_FORCE_REUSE;
+ tp->repair_queue = TCP_NO_QUEUE;
+ } else if (val == TCP_REPAIR_OFF) {
+ tp->repair = 0;
+ sk->sk_reuse = SK_NO_REUSE;
+ tcp_send_window_probe(sk);
+ } else if (val == TCP_REPAIR_OFF_NO_WP) {
+ tp->repair = 0;
+ sk->sk_reuse = SK_NO_REUSE;
+ } else
err = -EINVAL;
+
+ break;
+
+ case TCP_REPAIR_QUEUE:
+ if (!tp->repair)
+ err = -EPERM;
+ else if ((unsigned int)val < TCP_QUEUES_NR)
+ tp->repair_queue = val;
else
- tp->keepalive_intvl = val * HZ;
+ err = -EINVAL;
break;
- case TCP_KEEPCNT:
- if (val < 1 || val > MAX_TCP_KEEPCNT)
+
+ case TCP_QUEUE_SEQ:
+ if (sk->sk_state != TCP_CLOSE) {
+ err = -EPERM;
+ } else if (tp->repair_queue == TCP_SEND_QUEUE) {
+ if (!tcp_rtx_queue_empty(sk))
+ err = -EPERM;
+ else
+ WRITE_ONCE(tp->write_seq, val);
+ } else if (tp->repair_queue == TCP_RECV_QUEUE) {
+ if (tp->rcv_nxt != tp->copied_seq) {
+ err = -EPERM;
+ } else {
+ WRITE_ONCE(tp->rcv_nxt, val);
+ WRITE_ONCE(tp->copied_seq, val);
+ }
+ } else {
err = -EINVAL;
- else
- tp->keepalive_probes = val;
+ }
break;
- case TCP_SYNCNT:
- if (val < 1 || val > MAX_TCP_SYNCNT)
+
+ case TCP_REPAIR_OPTIONS:
+ if (!tp->repair)
err = -EINVAL;
+ else if (sk->sk_state == TCP_ESTABLISHED && !tp->bytes_sent)
+ err = tcp_repair_options_est(sk, optval, optlen);
else
- icsk->icsk_syn_retries = val;
+ err = -EPERM;
break;
- case TCP_LINGER2:
- if (val < 0)
- tp->linger2 = -1;
- else if (val > sysctl_tcp_fin_timeout / HZ)
- tp->linger2 = 0;
- else
- tp->linger2 = val * HZ;
+ case TCP_CORK:
+ __tcp_sock_set_cork(sk, val);
break;
- case TCP_DEFER_ACCEPT:
- icsk->icsk_accept_queue.rskq_defer_accept = 0;
- if (val > 0) {
- /* Translate value in seconds to number of
- * retransmits */
- while (icsk->icsk_accept_queue.rskq_defer_accept < 32 &&
- val > ((TCP_TIMEOUT_INIT / HZ) <<
- icsk->icsk_accept_queue.rskq_defer_accept))
- icsk->icsk_accept_queue.rskq_defer_accept++;
- icsk->icsk_accept_queue.rskq_defer_accept++;
- }
+ case TCP_KEEPIDLE:
+ err = tcp_sock_set_keepidle_locked(sk, val);
+ break;
+ case TCP_SAVE_SYN:
+ /* 0: disable, 1: enable, 2: start from ether_header */
+ if (val < 0 || val > 2)
+ err = -EINVAL;
+ else
+ tp->save_syn = val;
break;
case TCP_WINDOW_CLAMP:
- if (!val) {
- if (sk->sk_state != TCP_CLOSE) {
- err = -EINVAL;
- break;
- }
- tp->window_clamp = 0;
- } else
- tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
- SOCK_MIN_RCVBUF / 2 : val;
+ err = tcp_set_window_clamp(sk, val);
break;
case TCP_QUICKACK:
- if (!val) {
- icsk->icsk_ack.pingpong = 1;
- } else {
- icsk->icsk_ack.pingpong = 0;
- if ((1 << sk->sk_state) &
- (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
- inet_csk_ack_scheduled(sk)) {
- icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
- tcp_cleanup_rbuf(sk, 1);
- if (!(val & 1))
- icsk->icsk_ack.pingpong = 1;
- }
- }
+ __tcp_sock_set_quickack(sk, val);
break;
+ case TCP_AO_REPAIR:
+ if (!tcp_can_repair_sock(sk)) {
+ err = -EPERM;
+ break;
+ }
+ err = tcp_ao_set_repair(sk, optval, optlen);
+ break;
+#ifdef CONFIG_TCP_AO
+ case TCP_AO_ADD_KEY:
+ case TCP_AO_DEL_KEY:
+ case TCP_AO_INFO: {
+ /* If this is the first TCP-AO setsockopt() on the socket,
+ * sk_state has to be LISTEN or CLOSE. Allow TCP_REPAIR
+ * in any state.
+ */
+ if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
+ goto ao_parse;
+ if (rcu_dereference_protected(tcp_sk(sk)->ao_info,
+ lockdep_sock_is_held(sk)))
+ goto ao_parse;
+ if (tp->repair)
+ goto ao_parse;
+ err = -EISCONN;
+ break;
+ao_parse:
+ err = tp->af_specific->ao_parse(sk, optname, optval, optlen);
+ break;
+ }
+#endif
#ifdef CONFIG_TCP_MD5SIG
case TCP_MD5SIG:
- /* Read the IP->Key mappings from userspace */
- err = tp->af_specific->md5_parse(sk, optval, optlen);
+ case TCP_MD5SIG_EXT:
+ err = tp->af_specific->md5_parse(sk, optname, optval, optlen);
break;
#endif
+ case TCP_FASTOPEN:
+ if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE |
+ TCPF_LISTEN))) {
+ tcp_fastopen_init_key_once(net);
+ fastopen_queue_tune(sk, val);
+ } else {
+ err = -EINVAL;
+ }
+ break;
+ case TCP_FASTOPEN_CONNECT:
+ if (val > 1 || val < 0) {
+ err = -EINVAL;
+ } else if (READ_ONCE(net->ipv4.sysctl_tcp_fastopen) &
+ TFO_CLIENT_ENABLE) {
+ if (sk->sk_state == TCP_CLOSE)
+ tp->fastopen_connect = val;
+ else
+ err = -EINVAL;
+ } else {
+ err = -EOPNOTSUPP;
+ }
+ break;
+ case TCP_FASTOPEN_NO_COOKIE:
+ if (val > 1 || val < 0)
+ err = -EINVAL;
+ else if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
+ err = -EINVAL;
+ else
+ tp->fastopen_no_cookie = val;
+ break;
+ case TCP_TIMESTAMP:
+ if (!tp->repair) {
+ err = -EPERM;
+ break;
+ }
+ /* val is an opaque field,
+ * and low order bit contains usec_ts enable bit.
+ * Its a best effort, and we do not care if user makes an error.
+ */
+ tp->tcp_usec_ts = val & 1;
+ WRITE_ONCE(tp->tsoffset, val - tcp_clock_ts(tp->tcp_usec_ts));
+ break;
+ case TCP_REPAIR_WINDOW:
+ err = tcp_repair_set_window(tp, optval, optlen);
+ break;
+ case TCP_NOTSENT_LOWAT:
+ WRITE_ONCE(tp->notsent_lowat, val);
+ sk->sk_write_space(sk);
+ break;
+ case TCP_INQ:
+ if (val > 1 || val < 0)
+ err = -EINVAL;
+ else
+ tp->recvmsg_inq = val;
+ break;
+ case TCP_TX_DELAY:
+ /* tp->srtt_us is u32, and is shifted by 3 */
+ if (val < 0 || val >= (1U << (31 - 3))) {
+ err = -EINVAL;
+ break;
+ }
+ tcp_enable_tx_delay(sk, val);
+ WRITE_ONCE(tp->tcp_tx_delay, val);
+ break;
default:
err = -ENOPROTOOPT;
break;
- };
- release_sock(sk);
+ }
+
+ sockopt_release_sock(sk);
return err;
}
-int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
- int optlen)
+int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
+ unsigned int optlen)
{
- struct inet_connection_sock *icsk = inet_csk(sk);
+ const struct inet_connection_sock *icsk = inet_csk(sk);
if (level != SOL_TCP)
- return icsk->icsk_af_ops->setsockopt(sk, level, optname,
- optval, optlen);
+ /* Paired with WRITE_ONCE() in do_ipv6_setsockopt() and tcp_v6_connect() */
+ return READ_ONCE(icsk->icsk_af_ops)->setsockopt(sk, level, optname,
+ optval, optlen);
return do_tcp_setsockopt(sk, level, optname, optval, optlen);
}
+EXPORT_IPV6_MOD(tcp_setsockopt);
-#ifdef CONFIG_COMPAT
-int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int optlen)
+static void tcp_get_info_chrono_stats(const struct tcp_sock *tp,
+ struct tcp_info *info)
{
- if (level != SOL_TCP)
- return inet_csk_compat_setsockopt(sk, level, optname,
- optval, optlen);
- return do_tcp_setsockopt(sk, level, optname, optval, optlen);
-}
+ u64 stats[__TCP_CHRONO_MAX], total = 0;
+ enum tcp_chrono i;
+
+ for (i = TCP_CHRONO_BUSY; i < __TCP_CHRONO_MAX; ++i) {
+ stats[i] = tp->chrono_stat[i - 1];
+ if (i == tp->chrono_type)
+ stats[i] += tcp_jiffies32 - tp->chrono_start;
+ stats[i] *= USEC_PER_SEC / HZ;
+ total += stats[i];
+ }
-EXPORT_SYMBOL(compat_tcp_setsockopt);
-#endif
+ info->tcpi_busy_time = total;
+ info->tcpi_rwnd_limited = stats[TCP_CHRONO_RWND_LIMITED];
+ info->tcpi_sndbuf_limited = stats[TCP_CHRONO_SNDBUF_LIMITED];
+}
/* Return information about state of tcp endpoint in API format. */
void tcp_get_info(struct sock *sk, struct tcp_info *info)
{
- struct tcp_sock *tp = tcp_sk(sk);
+ const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */
const struct inet_connection_sock *icsk = inet_csk(sk);
- u32 now = tcp_time_stamp;
+ const u8 ect1_idx = INET_ECN_ECT_1 - 1;
+ const u8 ect0_idx = INET_ECN_ECT_0 - 1;
+ const u8 ce_idx = INET_ECN_CE - 1;
+ unsigned long rate;
+ u32 now;
+ u64 rate64;
+ bool slow;
memset(info, 0, sizeof(*info));
+ if (sk->sk_type != SOCK_STREAM)
+ return;
+
+ info->tcpi_state = inet_sk_state_load(sk);
+
+ /* Report meaningful fields for all TCP states, including listeners */
+ rate = READ_ONCE(sk->sk_pacing_rate);
+ rate64 = (rate != ~0UL) ? rate : ~0ULL;
+ info->tcpi_pacing_rate = rate64;
+
+ rate = READ_ONCE(sk->sk_max_pacing_rate);
+ rate64 = (rate != ~0UL) ? rate : ~0ULL;
+ info->tcpi_max_pacing_rate = rate64;
+
+ info->tcpi_reordering = tp->reordering;
+ info->tcpi_snd_cwnd = tcp_snd_cwnd(tp);
+
+ if (info->tcpi_state == TCP_LISTEN) {
+ /* listeners aliased fields :
+ * tcpi_unacked -> Number of children ready for accept()
+ * tcpi_sacked -> max backlog
+ */
+ info->tcpi_unacked = READ_ONCE(sk->sk_ack_backlog);
+ info->tcpi_sacked = READ_ONCE(sk->sk_max_ack_backlog);
+ return;
+ }
+
+ slow = lock_sock_fast(sk);
- info->tcpi_state = sk->sk_state;
info->tcpi_ca_state = icsk->icsk_ca_state;
info->tcpi_retransmits = icsk->icsk_retransmits;
info->tcpi_probes = icsk->icsk_probes_out;
@@ -2000,69 +4238,241 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
if (tp->rx_opt.tstamp_ok)
info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
- if (tp->rx_opt.sack_ok)
+ if (tcp_is_sack(tp))
info->tcpi_options |= TCPI_OPT_SACK;
if (tp->rx_opt.wscale_ok) {
info->tcpi_options |= TCPI_OPT_WSCALE;
info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
- }
+ }
- if (tp->ecn_flags&TCP_ECN_OK)
+ if (tcp_ecn_mode_any(tp))
info->tcpi_options |= TCPI_OPT_ECN;
+ if (tp->ecn_flags & TCP_ECN_SEEN)
+ info->tcpi_options |= TCPI_OPT_ECN_SEEN;
+ if (tp->syn_data_acked)
+ info->tcpi_options |= TCPI_OPT_SYN_DATA;
+ if (tp->tcp_usec_ts)
+ info->tcpi_options |= TCPI_OPT_USEC_TS;
+ if (tp->syn_fastopen_child)
+ info->tcpi_options |= TCPI_OPT_TFO_CHILD;
info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
- info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
+ info->tcpi_ato = jiffies_to_usecs(min_t(u32, icsk->icsk_ack.ato,
+ tcp_delack_max(sk)));
info->tcpi_snd_mss = tp->mss_cache;
info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
info->tcpi_unacked = tp->packets_out;
info->tcpi_sacked = tp->sacked_out;
+
info->tcpi_lost = tp->lost_out;
info->tcpi_retrans = tp->retrans_out;
- info->tcpi_fackets = tp->fackets_out;
+ now = tcp_jiffies32;
info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
- info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
- info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
+ info->tcpi_rtt = tp->srtt_us >> 3;
+ info->tcpi_rttvar = tp->mdev_us >> 2;
info->tcpi_snd_ssthresh = tp->snd_ssthresh;
- info->tcpi_snd_cwnd = tp->snd_cwnd;
info->tcpi_advmss = tp->advmss;
- info->tcpi_reordering = tp->reordering;
- info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
+ info->tcpi_rcv_rtt = tp->rcv_rtt_est.rtt_us >> 3;
info->tcpi_rcv_space = tp->rcvq_space.space;
info->tcpi_total_retrans = tp->total_retrans;
-}
+ info->tcpi_bytes_acked = tp->bytes_acked;
+ info->tcpi_bytes_received = tp->bytes_received;
+ info->tcpi_notsent_bytes = max_t(int, 0, tp->write_seq - tp->snd_nxt);
+ tcp_get_info_chrono_stats(tp, info);
+
+ info->tcpi_segs_out = tp->segs_out;
+
+ /* segs_in and data_segs_in can be updated from tcp_segs_in() from BH */
+ info->tcpi_segs_in = READ_ONCE(tp->segs_in);
+ info->tcpi_data_segs_in = READ_ONCE(tp->data_segs_in);
+
+ info->tcpi_min_rtt = tcp_min_rtt(tp);
+ info->tcpi_data_segs_out = tp->data_segs_out;
+
+ info->tcpi_delivery_rate_app_limited = tp->rate_app_limited ? 1 : 0;
+ rate64 = tcp_compute_delivery_rate(tp);
+ if (rate64)
+ info->tcpi_delivery_rate = rate64;
+ info->tcpi_delivered = tp->delivered;
+ info->tcpi_delivered_ce = tp->delivered_ce;
+ info->tcpi_bytes_sent = tp->bytes_sent;
+ info->tcpi_bytes_retrans = tp->bytes_retrans;
+ info->tcpi_dsack_dups = tp->dsack_dups;
+ info->tcpi_reord_seen = tp->reord_seen;
+ info->tcpi_rcv_ooopack = tp->rcv_ooopack;
+ info->tcpi_snd_wnd = tp->snd_wnd;
+ info->tcpi_rcv_wnd = tp->rcv_wnd;
+ info->tcpi_rehash = tp->plb_rehash + tp->timeout_rehash;
+ info->tcpi_fastopen_client_fail = tp->fastopen_client_fail;
+
+ info->tcpi_total_rto = tp->total_rto;
+ info->tcpi_total_rto_recoveries = tp->total_rto_recoveries;
+ info->tcpi_total_rto_time = tp->total_rto_time;
+ if (tp->rto_stamp)
+ info->tcpi_total_rto_time += tcp_clock_ms() - tp->rto_stamp;
+
+ info->tcpi_accecn_fail_mode = tp->accecn_fail_mode;
+ info->tcpi_accecn_opt_seen = tp->saw_accecn_opt;
+ info->tcpi_received_ce = tp->received_ce;
+ info->tcpi_delivered_e1_bytes = tp->delivered_ecn_bytes[ect1_idx];
+ info->tcpi_delivered_e0_bytes = tp->delivered_ecn_bytes[ect0_idx];
+ info->tcpi_delivered_ce_bytes = tp->delivered_ecn_bytes[ce_idx];
+ info->tcpi_received_e1_bytes = tp->received_ecn_bytes[ect1_idx];
+ info->tcpi_received_e0_bytes = tp->received_ecn_bytes[ect0_idx];
+ info->tcpi_received_ce_bytes = tp->received_ecn_bytes[ce_idx];
+
+ unlock_sock_fast(sk, slow);
+}
EXPORT_SYMBOL_GPL(tcp_get_info);
-static int do_tcp_getsockopt(struct sock *sk, int level,
- int optname, char __user *optval, int __user *optlen)
+static size_t tcp_opt_stats_get_size(void)
+{
+ return
+ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BUSY */
+ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_RWND_LIMITED */
+ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_SNDBUF_LIMITED */
+ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_DATA_SEGS_OUT */
+ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_TOTAL_RETRANS */
+ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_PACING_RATE */
+ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_DELIVERY_RATE */
+ nla_total_size(sizeof(u32)) + /* TCP_NLA_SND_CWND */
+ nla_total_size(sizeof(u32)) + /* TCP_NLA_REORDERING */
+ nla_total_size(sizeof(u32)) + /* TCP_NLA_MIN_RTT */
+ nla_total_size(sizeof(u8)) + /* TCP_NLA_RECUR_RETRANS */
+ nla_total_size(sizeof(u8)) + /* TCP_NLA_DELIVERY_RATE_APP_LMT */
+ nla_total_size(sizeof(u32)) + /* TCP_NLA_SNDQ_SIZE */
+ nla_total_size(sizeof(u8)) + /* TCP_NLA_CA_STATE */
+ nla_total_size(sizeof(u32)) + /* TCP_NLA_SND_SSTHRESH */
+ nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED */
+ nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED_CE */
+ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_SENT */
+ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_RETRANS */
+ nla_total_size(sizeof(u32)) + /* TCP_NLA_DSACK_DUPS */
+ nla_total_size(sizeof(u32)) + /* TCP_NLA_REORD_SEEN */
+ nla_total_size(sizeof(u32)) + /* TCP_NLA_SRTT */
+ nla_total_size(sizeof(u16)) + /* TCP_NLA_TIMEOUT_REHASH */
+ nla_total_size(sizeof(u32)) + /* TCP_NLA_BYTES_NOTSENT */
+ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_EDT */
+ nla_total_size(sizeof(u8)) + /* TCP_NLA_TTL */
+ nla_total_size(sizeof(u32)) + /* TCP_NLA_REHASH */
+ 0;
+}
+
+/* Returns TTL or hop limit of an incoming packet from skb. */
+static u8 tcp_skb_ttl_or_hop_limit(const struct sk_buff *skb)
+{
+ if (skb->protocol == htons(ETH_P_IP))
+ return ip_hdr(skb)->ttl;
+ else if (skb->protocol == htons(ETH_P_IPV6))
+ return ipv6_hdr(skb)->hop_limit;
+ else
+ return 0;
+}
+
+struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk,
+ const struct sk_buff *orig_skb,
+ const struct sk_buff *ack_skb)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *stats;
+ struct tcp_info info;
+ unsigned long rate;
+ u64 rate64;
+
+ stats = alloc_skb(tcp_opt_stats_get_size(), GFP_ATOMIC);
+ if (!stats)
+ return NULL;
+
+ tcp_get_info_chrono_stats(tp, &info);
+ nla_put_u64_64bit(stats, TCP_NLA_BUSY,
+ info.tcpi_busy_time, TCP_NLA_PAD);
+ nla_put_u64_64bit(stats, TCP_NLA_RWND_LIMITED,
+ info.tcpi_rwnd_limited, TCP_NLA_PAD);
+ nla_put_u64_64bit(stats, TCP_NLA_SNDBUF_LIMITED,
+ info.tcpi_sndbuf_limited, TCP_NLA_PAD);
+ nla_put_u64_64bit(stats, TCP_NLA_DATA_SEGS_OUT,
+ tp->data_segs_out, TCP_NLA_PAD);
+ nla_put_u64_64bit(stats, TCP_NLA_TOTAL_RETRANS,
+ tp->total_retrans, TCP_NLA_PAD);
+
+ rate = READ_ONCE(sk->sk_pacing_rate);
+ rate64 = (rate != ~0UL) ? rate : ~0ULL;
+ nla_put_u64_64bit(stats, TCP_NLA_PACING_RATE, rate64, TCP_NLA_PAD);
+
+ rate64 = tcp_compute_delivery_rate(tp);
+ nla_put_u64_64bit(stats, TCP_NLA_DELIVERY_RATE, rate64, TCP_NLA_PAD);
+
+ nla_put_u32(stats, TCP_NLA_SND_CWND, tcp_snd_cwnd(tp));
+ nla_put_u32(stats, TCP_NLA_REORDERING, tp->reordering);
+ nla_put_u32(stats, TCP_NLA_MIN_RTT, tcp_min_rtt(tp));
+
+ nla_put_u8(stats, TCP_NLA_RECUR_RETRANS,
+ READ_ONCE(inet_csk(sk)->icsk_retransmits));
+ nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited);
+ nla_put_u32(stats, TCP_NLA_SND_SSTHRESH, tp->snd_ssthresh);
+ nla_put_u32(stats, TCP_NLA_DELIVERED, tp->delivered);
+ nla_put_u32(stats, TCP_NLA_DELIVERED_CE, tp->delivered_ce);
+
+ nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una);
+ nla_put_u8(stats, TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state);
+
+ nla_put_u64_64bit(stats, TCP_NLA_BYTES_SENT, tp->bytes_sent,
+ TCP_NLA_PAD);
+ nla_put_u64_64bit(stats, TCP_NLA_BYTES_RETRANS, tp->bytes_retrans,
+ TCP_NLA_PAD);
+ nla_put_u32(stats, TCP_NLA_DSACK_DUPS, tp->dsack_dups);
+ nla_put_u32(stats, TCP_NLA_REORD_SEEN, tp->reord_seen);
+ nla_put_u32(stats, TCP_NLA_SRTT, tp->srtt_us >> 3);
+ nla_put_u16(stats, TCP_NLA_TIMEOUT_REHASH, tp->timeout_rehash);
+ nla_put_u32(stats, TCP_NLA_BYTES_NOTSENT,
+ max_t(int, 0, tp->write_seq - tp->snd_nxt));
+ nla_put_u64_64bit(stats, TCP_NLA_EDT, orig_skb->skb_mstamp_ns,
+ TCP_NLA_PAD);
+ if (ack_skb)
+ nla_put_u8(stats, TCP_NLA_TTL,
+ tcp_skb_ttl_or_hop_limit(ack_skb));
+
+ nla_put_u32(stats, TCP_NLA_REHASH, tp->plb_rehash + tp->timeout_rehash);
+ return stats;
+}
+
+int do_tcp_getsockopt(struct sock *sk, int level,
+ int optname, sockptr_t optval, sockptr_t optlen)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
+ struct net *net = sock_net(sk);
+ int user_mss;
int val, len;
- if (get_user(len, optlen))
+ if (copy_from_sockptr(&len, optlen, sizeof(int)))
return -EFAULT;
- len = min_t(unsigned int, len, sizeof(int));
-
if (len < 0)
return -EINVAL;
+ len = min_t(unsigned int, len, sizeof(int));
+
switch (optname) {
case TCP_MAXSEG:
val = tp->mss_cache;
- if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
- val = tp->rx_opt.user_mss;
+ user_mss = READ_ONCE(tp->rx_opt.user_mss);
+ if (user_mss &&
+ ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
+ val = user_mss;
+ if (tp->repair)
+ val = tp->rx_opt.mss_clamp;
break;
case TCP_NODELAY:
val = !!(tp->nonagle&TCP_NAGLE_OFF);
@@ -2071,333 +4481,757 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
val = !!(tp->nonagle&TCP_NAGLE_CORK);
break;
case TCP_KEEPIDLE:
- val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ;
+ val = keepalive_time_when(tp) / HZ;
break;
case TCP_KEEPINTVL:
- val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ;
+ val = keepalive_intvl_when(tp) / HZ;
break;
case TCP_KEEPCNT:
- val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
+ val = keepalive_probes(tp);
break;
case TCP_SYNCNT:
- val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
+ val = READ_ONCE(icsk->icsk_syn_retries) ? :
+ READ_ONCE(net->ipv4.sysctl_tcp_syn_retries);
break;
case TCP_LINGER2:
- val = tp->linger2;
+ val = READ_ONCE(tp->linger2);
if (val >= 0)
- val = (val ? : sysctl_tcp_fin_timeout) / HZ;
+ val = (val ? : READ_ONCE(net->ipv4.sysctl_tcp_fin_timeout)) / HZ;
break;
case TCP_DEFER_ACCEPT:
- val = !icsk->icsk_accept_queue.rskq_defer_accept ? 0 :
- ((TCP_TIMEOUT_INIT / HZ) << (icsk->icsk_accept_queue.rskq_defer_accept - 1));
+ val = READ_ONCE(icsk->icsk_accept_queue.rskq_defer_accept);
+ val = retrans_to_secs(val, TCP_TIMEOUT_INIT / HZ,
+ TCP_RTO_MAX / HZ);
break;
case TCP_WINDOW_CLAMP:
- val = tp->window_clamp;
+ val = READ_ONCE(tp->window_clamp);
break;
case TCP_INFO: {
struct tcp_info info;
- if (get_user(len, optlen))
+ if (copy_from_sockptr(&len, optlen, sizeof(int)))
return -EFAULT;
tcp_get_info(sk, &info);
len = min_t(unsigned int, len, sizeof(info));
- if (put_user(len, optlen))
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
return -EFAULT;
- if (copy_to_user(optval, &info, len))
+ if (copy_to_sockptr(optval, &info, len))
+ return -EFAULT;
+ return 0;
+ }
+ case TCP_CC_INFO: {
+ const struct tcp_congestion_ops *ca_ops;
+ union tcp_cc_info info;
+ size_t sz = 0;
+ int attr;
+
+ if (copy_from_sockptr(&len, optlen, sizeof(int)))
+ return -EFAULT;
+
+ ca_ops = icsk->icsk_ca_ops;
+ if (ca_ops && ca_ops->get_info)
+ sz = ca_ops->get_info(sk, ~0U, &attr, &info);
+
+ len = min_t(unsigned int, len, sz);
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
+ return -EFAULT;
+ if (copy_to_sockptr(optval, &info, len))
return -EFAULT;
return 0;
}
case TCP_QUICKACK:
- val = !icsk->icsk_ack.pingpong;
+ val = !inet_csk_in_pingpong_mode(sk);
break;
case TCP_CONGESTION:
- if (get_user(len, optlen))
+ if (copy_from_sockptr(&len, optlen, sizeof(int)))
return -EFAULT;
len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
- if (put_user(len, optlen))
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
+ return -EFAULT;
+ if (copy_to_sockptr(optval, icsk->icsk_ca_ops->name, len))
+ return -EFAULT;
+ return 0;
+
+ case TCP_ULP:
+ if (copy_from_sockptr(&len, optlen, sizeof(int)))
+ return -EFAULT;
+ len = min_t(unsigned int, len, TCP_ULP_NAME_MAX);
+ if (!icsk->icsk_ulp_ops) {
+ len = 0;
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
+ return -EFAULT;
+ return 0;
+ }
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
+ return -EFAULT;
+ if (copy_to_sockptr(optval, icsk->icsk_ulp_ops->name, len))
+ return -EFAULT;
+ return 0;
+
+ case TCP_FASTOPEN_KEY: {
+ u64 key[TCP_FASTOPEN_KEY_BUF_LENGTH / sizeof(u64)];
+ unsigned int key_len;
+
+ if (copy_from_sockptr(&len, optlen, sizeof(int)))
+ return -EFAULT;
+
+ key_len = tcp_fastopen_get_cipher(net, icsk, key) *
+ TCP_FASTOPEN_KEY_LENGTH;
+ len = min_t(unsigned int, len, key_len);
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
+ return -EFAULT;
+ if (copy_to_sockptr(optval, key, len))
+ return -EFAULT;
+ return 0;
+ }
+ case TCP_THIN_LINEAR_TIMEOUTS:
+ val = tp->thin_lto;
+ break;
+
+ case TCP_THIN_DUPACK:
+ val = 0;
+ break;
+
+ case TCP_REPAIR:
+ val = tp->repair;
+ break;
+
+ case TCP_REPAIR_QUEUE:
+ if (tp->repair)
+ val = tp->repair_queue;
+ else
+ return -EINVAL;
+ break;
+
+ case TCP_REPAIR_WINDOW: {
+ struct tcp_repair_window opt;
+
+ if (copy_from_sockptr(&len, optlen, sizeof(int)))
return -EFAULT;
- if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
+
+ if (len != sizeof(opt))
+ return -EINVAL;
+
+ if (!tp->repair)
+ return -EPERM;
+
+ opt.snd_wl1 = tp->snd_wl1;
+ opt.snd_wnd = tp->snd_wnd;
+ opt.max_window = tp->max_window;
+ opt.rcv_wnd = tp->rcv_wnd;
+ opt.rcv_wup = tp->rcv_wup;
+
+ if (copy_to_sockptr(optval, &opt, len))
+ return -EFAULT;
+ return 0;
+ }
+ case TCP_QUEUE_SEQ:
+ if (tp->repair_queue == TCP_SEND_QUEUE)
+ val = tp->write_seq;
+ else if (tp->repair_queue == TCP_RECV_QUEUE)
+ val = tp->rcv_nxt;
+ else
+ return -EINVAL;
+ break;
+
+ case TCP_USER_TIMEOUT:
+ val = READ_ONCE(icsk->icsk_user_timeout);
+ break;
+
+ case TCP_FASTOPEN:
+ val = READ_ONCE(icsk->icsk_accept_queue.fastopenq.max_qlen);
+ break;
+
+ case TCP_FASTOPEN_CONNECT:
+ val = tp->fastopen_connect;
+ break;
+
+ case TCP_FASTOPEN_NO_COOKIE:
+ val = tp->fastopen_no_cookie;
+ break;
+
+ case TCP_TX_DELAY:
+ val = READ_ONCE(tp->tcp_tx_delay);
+ break;
+
+ case TCP_TIMESTAMP:
+ val = tcp_clock_ts(tp->tcp_usec_ts) + READ_ONCE(tp->tsoffset);
+ if (tp->tcp_usec_ts)
+ val |= 1;
+ else
+ val &= ~1;
+ break;
+ case TCP_NOTSENT_LOWAT:
+ val = READ_ONCE(tp->notsent_lowat);
+ break;
+ case TCP_INQ:
+ val = tp->recvmsg_inq;
+ break;
+ case TCP_SAVE_SYN:
+ val = tp->save_syn;
+ break;
+ case TCP_SAVED_SYN: {
+ if (copy_from_sockptr(&len, optlen, sizeof(int)))
return -EFAULT;
+
+ sockopt_lock_sock(sk);
+ if (tp->saved_syn) {
+ if (len < tcp_saved_syn_len(tp->saved_syn)) {
+ len = tcp_saved_syn_len(tp->saved_syn);
+ if (copy_to_sockptr(optlen, &len, sizeof(int))) {
+ sockopt_release_sock(sk);
+ return -EFAULT;
+ }
+ sockopt_release_sock(sk);
+ return -EINVAL;
+ }
+ len = tcp_saved_syn_len(tp->saved_syn);
+ if (copy_to_sockptr(optlen, &len, sizeof(int))) {
+ sockopt_release_sock(sk);
+ return -EFAULT;
+ }
+ if (copy_to_sockptr(optval, tp->saved_syn->data, len)) {
+ sockopt_release_sock(sk);
+ return -EFAULT;
+ }
+ tcp_saved_syn_free(tp);
+ sockopt_release_sock(sk);
+ } else {
+ sockopt_release_sock(sk);
+ len = 0;
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
+ return -EFAULT;
+ }
return 0;
+ }
+#ifdef CONFIG_MMU
+ case TCP_ZEROCOPY_RECEIVE: {
+ struct scm_timestamping_internal tss;
+ struct tcp_zerocopy_receive zc = {};
+ int err;
+
+ if (copy_from_sockptr(&len, optlen, sizeof(int)))
+ return -EFAULT;
+ if (len < 0 ||
+ len < offsetofend(struct tcp_zerocopy_receive, length))
+ return -EINVAL;
+ if (unlikely(len > sizeof(zc))) {
+ err = check_zeroed_sockptr(optval, sizeof(zc),
+ len - sizeof(zc));
+ if (err < 1)
+ return err == 0 ? -EINVAL : err;
+ len = sizeof(zc);
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
+ return -EFAULT;
+ }
+ if (copy_from_sockptr(&zc, optval, len))
+ return -EFAULT;
+ if (zc.reserved)
+ return -EINVAL;
+ if (zc.msg_flags & ~(TCP_VALID_ZC_MSG_FLAGS))
+ return -EINVAL;
+ sockopt_lock_sock(sk);
+ err = tcp_zerocopy_receive(sk, &zc, &tss);
+ err = BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sk, level, optname,
+ &zc, &len, err);
+ sockopt_release_sock(sk);
+ if (len >= offsetofend(struct tcp_zerocopy_receive, msg_flags))
+ goto zerocopy_rcv_cmsg;
+ switch (len) {
+ case offsetofend(struct tcp_zerocopy_receive, msg_flags):
+ goto zerocopy_rcv_cmsg;
+ case offsetofend(struct tcp_zerocopy_receive, msg_controllen):
+ case offsetofend(struct tcp_zerocopy_receive, msg_control):
+ case offsetofend(struct tcp_zerocopy_receive, flags):
+ case offsetofend(struct tcp_zerocopy_receive, copybuf_len):
+ case offsetofend(struct tcp_zerocopy_receive, copybuf_address):
+ case offsetofend(struct tcp_zerocopy_receive, err):
+ goto zerocopy_rcv_sk_err;
+ case offsetofend(struct tcp_zerocopy_receive, inq):
+ goto zerocopy_rcv_inq;
+ case offsetofend(struct tcp_zerocopy_receive, length):
+ default:
+ goto zerocopy_rcv_out;
+ }
+zerocopy_rcv_cmsg:
+ if (zc.msg_flags & TCP_CMSG_TS)
+ tcp_zc_finalize_rx_tstamp(sk, &zc, &tss);
+ else
+ zc.msg_flags = 0;
+zerocopy_rcv_sk_err:
+ if (!err)
+ zc.err = sock_error(sk);
+zerocopy_rcv_inq:
+ zc.inq = tcp_inq_hint(sk);
+zerocopy_rcv_out:
+ if (!err && copy_to_sockptr(optval, &zc, len))
+ err = -EFAULT;
+ return err;
+ }
+#endif
+ case TCP_AO_REPAIR:
+ if (!tcp_can_repair_sock(sk))
+ return -EPERM;
+ return tcp_ao_get_repair(sk, optval, optlen);
+ case TCP_AO_GET_KEYS:
+ case TCP_AO_INFO: {
+ int err;
+
+ sockopt_lock_sock(sk);
+ if (optname == TCP_AO_GET_KEYS)
+ err = tcp_ao_get_mkts(sk, optval, optlen);
+ else
+ err = tcp_ao_get_sock_info(sk, optval, optlen);
+ sockopt_release_sock(sk);
+
+ return err;
+ }
+ case TCP_IS_MPTCP:
+ val = 0;
+ break;
+ case TCP_RTO_MAX_MS:
+ val = jiffies_to_msecs(tcp_rto_max(sk));
+ break;
+ case TCP_RTO_MIN_US:
+ val = jiffies_to_usecs(READ_ONCE(inet_csk(sk)->icsk_rto_min));
+ break;
+ case TCP_DELACK_MAX_US:
+ val = jiffies_to_usecs(READ_ONCE(inet_csk(sk)->icsk_delack_max));
+ break;
default:
return -ENOPROTOOPT;
- };
+ }
- if (put_user(len, optlen))
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
return -EFAULT;
- if (copy_to_user(optval, &val, len))
+ if (copy_to_sockptr(optval, &val, len))
return -EFAULT;
return 0;
}
+bool tcp_bpf_bypass_getsockopt(int level, int optname)
+{
+ /* TCP do_tcp_getsockopt has optimized getsockopt implementation
+ * to avoid extra socket lock for TCP_ZEROCOPY_RECEIVE.
+ */
+ if (level == SOL_TCP && optname == TCP_ZEROCOPY_RECEIVE)
+ return true;
+
+ return false;
+}
+EXPORT_IPV6_MOD(tcp_bpf_bypass_getsockopt);
+
int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
int __user *optlen)
{
struct inet_connection_sock *icsk = inet_csk(sk);
if (level != SOL_TCP)
- return icsk->icsk_af_ops->getsockopt(sk, level, optname,
- optval, optlen);
- return do_tcp_getsockopt(sk, level, optname, optval, optlen);
+ /* Paired with WRITE_ONCE() in do_ipv6_setsockopt() and tcp_v6_connect() */
+ return READ_ONCE(icsk->icsk_af_ops)->getsockopt(sk, level, optname,
+ optval, optlen);
+ return do_tcp_getsockopt(sk, level, optname, USER_SOCKPTR(optval),
+ USER_SOCKPTR(optlen));
}
+EXPORT_IPV6_MOD(tcp_getsockopt);
-#ifdef CONFIG_COMPAT
-int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int __user *optlen)
+#ifdef CONFIG_TCP_MD5SIG
+void tcp_md5_hash_skb_data(struct md5_ctx *ctx, const struct sk_buff *skb,
+ unsigned int header_len)
{
- if (level != SOL_TCP)
- return inet_csk_compat_getsockopt(sk, level, optname,
- optval, optlen);
- return do_tcp_getsockopt(sk, level, optname, optval, optlen);
-}
+ const unsigned int head_data_len = skb_headlen(skb) > header_len ?
+ skb_headlen(skb) - header_len : 0;
+ const struct skb_shared_info *shi = skb_shinfo(skb);
+ struct sk_buff *frag_iter;
+ unsigned int i;
+
+ md5_update(ctx, (const u8 *)tcp_hdr(skb) + header_len, head_data_len);
+
+ for (i = 0; i < shi->nr_frags; ++i) {
+ const skb_frag_t *f = &shi->frags[i];
+ u32 p_off, p_len, copied;
+ const void *vaddr;
+ struct page *p;
+
+ skb_frag_foreach_page(f, skb_frag_off(f), skb_frag_size(f),
+ p, p_off, p_len, copied) {
+ vaddr = kmap_local_page(p);
+ md5_update(ctx, vaddr + p_off, p_len);
+ kunmap_local(vaddr);
+ }
+ }
-EXPORT_SYMBOL(compat_tcp_getsockopt);
-#endif
+ skb_walk_frags(skb, frag_iter)
+ tcp_md5_hash_skb_data(ctx, frag_iter, 0);
+}
+EXPORT_IPV6_MOD(tcp_md5_hash_skb_data);
-struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features)
+void tcp_md5_hash_key(struct md5_ctx *ctx,
+ const struct tcp_md5sig_key *key)
{
- struct sk_buff *segs = ERR_PTR(-EINVAL);
- struct tcphdr *th;
- unsigned thlen;
- unsigned int seq;
- __be32 delta;
- unsigned int oldlen;
- unsigned int len;
-
- if (!pskb_may_pull(skb, sizeof(*th)))
- goto out;
+ u8 keylen = READ_ONCE(key->keylen); /* paired with WRITE_ONCE() in tcp_md5_do_add */
- th = skb->h.th;
- thlen = th->doff * 4;
- if (thlen < sizeof(*th))
- goto out;
+ /* We use data_race() because tcp_md5_do_add() might change
+ * key->key under us
+ */
+ data_race(({ md5_update(ctx, key->key, keylen), 0; }));
+}
+EXPORT_IPV6_MOD(tcp_md5_hash_key);
- if (!pskb_may_pull(skb, thlen))
- goto out;
+/* Called with rcu_read_lock() */
+static enum skb_drop_reason
+tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb,
+ const void *saddr, const void *daddr,
+ int family, int l3index, const __u8 *hash_location)
+{
+ /* This gets called for each TCP segment that has TCP-MD5 option.
+ * We have 2 drop cases:
+ * o An MD5 signature is present, but we're not expecting one.
+ * o The MD5 signature is wrong.
+ */
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_md5sig_key *key;
+ u8 newhash[16];
+
+ key = tcp_md5_do_lookup(sk, l3index, saddr, family);
+ if (!key) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
+ trace_tcp_hash_md5_unexpected(sk, skb);
+ return SKB_DROP_REASON_TCP_MD5UNEXPECTED;
+ }
- oldlen = (u16)~skb->len;
- __skb_pull(skb, thlen);
-
- if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
- /* Packet is from an untrusted source, reset gso_segs. */
- int type = skb_shinfo(skb)->gso_type;
- int mss;
-
- if (unlikely(type &
- ~(SKB_GSO_TCPV4 |
- SKB_GSO_DODGY |
- SKB_GSO_TCP_ECN |
- SKB_GSO_TCPV6 |
- 0) ||
- !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))))
- goto out;
+ /* Check the signature.
+ * To support dual stack listeners, we need to handle
+ * IPv4-mapped case.
+ */
+ if (family == AF_INET)
+ tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
+ else
+ tp->af_specific->calc_md5_hash(newhash, key, NULL, skb);
+ if (memcmp(hash_location, newhash, 16) != 0) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
+ trace_tcp_hash_md5_mismatch(sk, skb);
+ return SKB_DROP_REASON_TCP_MD5FAILURE;
+ }
+ return SKB_NOT_DROPPED_YET;
+}
+#else
+static inline enum skb_drop_reason
+tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb,
+ const void *saddr, const void *daddr,
+ int family, int l3index, const __u8 *hash_location)
+{
+ return SKB_NOT_DROPPED_YET;
+}
- mss = skb_shinfo(skb)->gso_size;
- skb_shinfo(skb)->gso_segs = (skb->len + mss - 1) / mss;
+#endif
- segs = NULL;
- goto out;
+/* Called with rcu_read_lock() */
+enum skb_drop_reason
+tcp_inbound_hash(struct sock *sk, const struct request_sock *req,
+ const struct sk_buff *skb,
+ const void *saddr, const void *daddr,
+ int family, int dif, int sdif)
+{
+ const struct tcphdr *th = tcp_hdr(skb);
+ const struct tcp_ao_hdr *aoh;
+ const __u8 *md5_location;
+ int l3index;
+
+ /* Invalid option or two times meet any of auth options */
+ if (tcp_parse_auth_options(th, &md5_location, &aoh)) {
+ trace_tcp_hash_bad_header(sk, skb);
+ return SKB_DROP_REASON_TCP_AUTH_HDR;
}
- segs = skb_segment(skb, features);
- if (IS_ERR(segs))
- goto out;
+ if (req) {
+ if (tcp_rsk_used_ao(req) != !!aoh) {
+ u8 keyid, rnext, maclen;
- len = skb_shinfo(skb)->gso_size;
- delta = htonl(oldlen + (thlen + len));
+ if (aoh) {
+ keyid = aoh->keyid;
+ rnext = aoh->rnext_keyid;
+ maclen = tcp_ao_hdr_maclen(aoh);
+ } else {
+ keyid = rnext = maclen = 0;
+ }
- skb = segs;
- th = skb->h.th;
- seq = ntohl(th->seq);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAOBAD);
+ trace_tcp_ao_handshake_failure(sk, skb, keyid, rnext, maclen);
+ return SKB_DROP_REASON_TCP_AOFAILURE;
+ }
+ }
- do {
- th->fin = th->psh = 0;
+ /* sdif set, means packet ingressed via a device
+ * in an L3 domain and dif is set to the l3mdev
+ */
+ l3index = sdif ? dif : 0;
+
+ /* Fast path: unsigned segments */
+ if (likely(!md5_location && !aoh)) {
+ /* Drop if there's TCP-MD5 or TCP-AO key with any rcvid/sndid
+ * for the remote peer. On TCP-AO established connection
+ * the last key is impossible to remove, so there's
+ * always at least one current_key.
+ */
+ if (tcp_ao_required(sk, saddr, family, l3index, true)) {
+ trace_tcp_hash_ao_required(sk, skb);
+ return SKB_DROP_REASON_TCP_AONOTFOUND;
+ }
+ if (unlikely(tcp_md5_do_lookup(sk, l3index, saddr, family))) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
+ trace_tcp_hash_md5_required(sk, skb);
+ return SKB_DROP_REASON_TCP_MD5NOTFOUND;
+ }
+ return SKB_NOT_DROPPED_YET;
+ }
- th->check = ~csum_fold((__force __wsum)((__force u32)th->check +
- (__force u32)delta));
- if (skb->ip_summed != CHECKSUM_PARTIAL)
- th->check = csum_fold(csum_partial(skb->h.raw, thlen,
- skb->csum));
+ if (aoh)
+ return tcp_inbound_ao_hash(sk, skb, family, req, l3index, aoh);
- seq += len;
- skb = skb->next;
- th = skb->h.th;
+ return tcp_inbound_md5_hash(sk, skb, saddr, daddr, family,
+ l3index, md5_location);
+}
+EXPORT_IPV6_MOD_GPL(tcp_inbound_hash);
- th->seq = htonl(seq);
- th->cwr = 0;
- } while (skb->next);
+void tcp_done(struct sock *sk)
+{
+ struct request_sock *req;
- delta = htonl(oldlen + (skb->tail - skb->h.raw) + skb->data_len);
- th->check = ~csum_fold((__force __wsum)((__force u32)th->check +
- (__force u32)delta));
- if (skb->ip_summed != CHECKSUM_PARTIAL)
- th->check = csum_fold(csum_partial(skb->h.raw, thlen,
- skb->csum));
+ /* We might be called with a new socket, after
+ * inet_csk_prepare_forced_close() has been called
+ * so we can not use lockdep_sock_is_held(sk)
+ */
+ req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk, 1);
-out:
- return segs;
-}
-EXPORT_SYMBOL(tcp_tso_segment);
+ if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
-#ifdef CONFIG_TCP_MD5SIG
-static unsigned long tcp_md5sig_users;
-static struct tcp_md5sig_pool **tcp_md5sig_pool;
-static DEFINE_SPINLOCK(tcp_md5sig_pool_lock);
-
-static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool **pool)
-{
- int cpu;
- for_each_possible_cpu(cpu) {
- struct tcp_md5sig_pool *p = *per_cpu_ptr(pool, cpu);
- if (p) {
- if (p->md5_desc.tfm)
- crypto_free_hash(p->md5_desc.tfm);
- kfree(p);
- p = NULL;
- }
- }
- free_percpu(pool);
-}
+ tcp_set_state(sk, TCP_CLOSE);
+ tcp_clear_xmit_timers(sk);
+ if (req)
+ reqsk_fastopen_remove(sk, req, false);
-void tcp_free_md5sig_pool(void)
-{
- struct tcp_md5sig_pool **pool = NULL;
+ WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
- spin_lock(&tcp_md5sig_pool_lock);
- if (--tcp_md5sig_users == 0) {
- pool = tcp_md5sig_pool;
- tcp_md5sig_pool = NULL;
- }
- spin_unlock(&tcp_md5sig_pool_lock);
- if (pool)
- __tcp_free_md5sig_pool(pool);
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_state_change(sk);
+ else
+ inet_csk_destroy_sock(sk);
}
+EXPORT_SYMBOL_GPL(tcp_done);
-EXPORT_SYMBOL(tcp_free_md5sig_pool);
-
-struct tcp_md5sig_pool **__tcp_alloc_md5sig_pool(void)
+int tcp_abort(struct sock *sk, int err)
{
- int cpu;
- struct tcp_md5sig_pool **pool;
+ int state = inet_sk_state_load(sk);
- pool = alloc_percpu(struct tcp_md5sig_pool *);
- if (!pool)
- return NULL;
+ if (state == TCP_NEW_SYN_RECV) {
+ struct request_sock *req = inet_reqsk(sk);
- for_each_possible_cpu(cpu) {
- struct tcp_md5sig_pool *p;
- struct crypto_hash *hash;
+ local_bh_disable();
+ inet_csk_reqsk_queue_drop(req->rsk_listener, req);
+ local_bh_enable();
+ return 0;
+ }
+ if (state == TCP_TIME_WAIT) {
+ struct inet_timewait_sock *tw = inet_twsk(sk);
- p = kzalloc(sizeof(*p), GFP_KERNEL);
- if (!p)
- goto out_free;
- *per_cpu_ptr(pool, cpu) = p;
+ refcount_inc(&tw->tw_refcnt);
+ local_bh_disable();
+ inet_twsk_deschedule_put(tw);
+ local_bh_enable();
+ return 0;
+ }
- hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC);
- if (!hash || IS_ERR(hash))
- goto out_free;
+ /* BPF context ensures sock locking. */
+ if (!has_current_bpf_ctx())
+ /* Don't race with userspace socket closes such as tcp_close. */
+ lock_sock(sk);
- p->md5_desc.tfm = hash;
+ /* Avoid closing the same socket twice. */
+ if (sk->sk_state == TCP_CLOSE) {
+ if (!has_current_bpf_ctx())
+ release_sock(sk);
+ return -ENOENT;
}
- return pool;
-out_free:
- __tcp_free_md5sig_pool(pool);
- return NULL;
-}
-struct tcp_md5sig_pool **tcp_alloc_md5sig_pool(void)
-{
- struct tcp_md5sig_pool **pool;
- int alloc = 0;
-
-retry:
- spin_lock(&tcp_md5sig_pool_lock);
- pool = tcp_md5sig_pool;
- if (tcp_md5sig_users++ == 0) {
- alloc = 1;
- spin_unlock(&tcp_md5sig_pool_lock);
- } else if (!pool) {
- tcp_md5sig_users--;
- spin_unlock(&tcp_md5sig_pool_lock);
- cpu_relax();
- goto retry;
- } else
- spin_unlock(&tcp_md5sig_pool_lock);
-
- if (alloc) {
- /* we cannot hold spinlock here because this may sleep. */
- struct tcp_md5sig_pool **p = __tcp_alloc_md5sig_pool();
- spin_lock(&tcp_md5sig_pool_lock);
- if (!p) {
- tcp_md5sig_users--;
- spin_unlock(&tcp_md5sig_pool_lock);
- return NULL;
- }
- pool = tcp_md5sig_pool;
- if (pool) {
- /* oops, it has already been assigned. */
- spin_unlock(&tcp_md5sig_pool_lock);
- __tcp_free_md5sig_pool(p);
- } else {
- tcp_md5sig_pool = pool = p;
- spin_unlock(&tcp_md5sig_pool_lock);
- }
+ if (sk->sk_state == TCP_LISTEN) {
+ tcp_set_state(sk, TCP_CLOSE);
+ inet_csk_listen_stop(sk);
}
- return pool;
-}
-
-EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
-struct tcp_md5sig_pool *__tcp_get_md5sig_pool(int cpu)
-{
- struct tcp_md5sig_pool **p;
- spin_lock(&tcp_md5sig_pool_lock);
- p = tcp_md5sig_pool;
- if (p)
- tcp_md5sig_users++;
- spin_unlock(&tcp_md5sig_pool_lock);
- return (p ? *per_cpu_ptr(p, cpu) : NULL);
-}
+ /* Don't race with BH socket closes such as inet_csk_listen_stop. */
+ local_bh_disable();
+ bh_lock_sock(sk);
-EXPORT_SYMBOL(__tcp_get_md5sig_pool);
+ if (tcp_need_reset(sk->sk_state))
+ tcp_send_active_reset(sk, GFP_ATOMIC,
+ SK_RST_REASON_TCP_STATE);
+ tcp_done_with_error(sk, err);
-void __tcp_put_md5sig_pool(void) {
- __tcp_free_md5sig_pool(tcp_md5sig_pool);
+ bh_unlock_sock(sk);
+ local_bh_enable();
+ if (!has_current_bpf_ctx())
+ release_sock(sk);
+ return 0;
}
+EXPORT_SYMBOL_GPL(tcp_abort);
-EXPORT_SYMBOL(__tcp_put_md5sig_pool);
-#endif
-
-extern void __skb_cb_too_small_for_tcp(int, int);
extern struct tcp_congestion_ops tcp_reno;
static __initdata unsigned long thash_entries;
static int __init set_thash_entries(char *str)
{
+ ssize_t ret;
+
if (!str)
return 0;
- thash_entries = simple_strtoul(str, &str, 0);
+
+ ret = kstrtoul(str, 0, &thash_entries);
+ if (ret)
+ return 0;
+
return 1;
}
__setup("thash_entries=", set_thash_entries);
+static void __init tcp_init_mem(void)
+{
+ unsigned long limit = nr_free_buffer_pages() / 16;
+
+ limit = max(limit, 128UL);
+ sysctl_tcp_mem[0] = limit / 4 * 3; /* 4.68 % */
+ sysctl_tcp_mem[1] = limit; /* 6.25 % */
+ sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2; /* 9.37 % */
+}
+
+static void __init tcp_struct_check(void)
+{
+ /* TX read-mostly hotpath cache lines */
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, max_window);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, rcv_ssthresh);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, reordering);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, notsent_lowat);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, gso_segs);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, retransmit_skb_hint);
+#if IS_ENABLED(CONFIG_TLS_DEVICE)
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, tcp_clean_acked);
+#endif
+
+ /* TXRX read-mostly hotpath cache lines */
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, tsoffset);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, snd_wnd);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, mss_cache);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, snd_cwnd);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, prr_out);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, lost_out);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, sacked_out);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, scaling_ratio);
+
+ /* RX read-mostly hotpath cache lines */
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, copied_seq);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, snd_wl1);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, tlp_high_seq);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, rttvar_us);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, retrans_out);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, advmss);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, urg_data);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, lost);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, rtt_min);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, out_of_order_queue);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, snd_ssthresh);
+
+ /* TX read-write hotpath cache lines */
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, segs_out);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, data_segs_out);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, bytes_sent);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, snd_sml);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, chrono_start);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, chrono_stat);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, write_seq);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, pushed_seq);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, lsndtime);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, mdev_us);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tcp_wstamp_ns);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, accecn_opt_tstamp);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, rtt_seq);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tsorted_sent_queue);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, highest_sack);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, ecn_flags);
+
+ /* TXRX read-write hotpath cache lines */
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, pred_flags);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, tcp_clock_cache);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, tcp_mstamp);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_nxt);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, snd_nxt);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, snd_una);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, window_clamp);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, srtt_us);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, packets_out);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, snd_up);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered_ce);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, received_ce);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, received_ecn_bytes);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, app_limited);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_wnd);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_tstamp);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rx_opt);
+
+ /* RX read-write hotpath cache lines */
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_received);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, segs_in);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, data_segs_in);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_wup);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, max_packets_out);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, cwnd_usage_seq);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rate_delivered);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rate_interval_us);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_rtt_last_tsecr);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, delivered_ecn_bytes);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, first_tx_mstamp);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, delivered_mstamp);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_acked);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_rtt_est);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcvq_space);
+}
+
void __init tcp_init(void)
{
- struct sk_buff *skb = NULL;
+ int max_rshare, max_wshare, cnt;
unsigned long limit;
- int order, i, max_share;
+ unsigned int i;
+
+ BUILD_BUG_ON(TCP_MIN_SND_MSS <= MAX_TCP_OPTION_SPACE);
+ BUILD_BUG_ON(sizeof(struct tcp_skb_cb) >
+ sizeof_field(struct sk_buff, cb));
+
+ tcp_struct_check();
+
+ percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
- if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
- __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
- sizeof(skb->cb));
+ timer_setup(&tcp_orphan_timer, tcp_orphan_update, TIMER_DEFERRABLE);
+ mod_timer(&tcp_orphan_timer, jiffies + TCP_ORPHAN_TIMER_PERIOD);
+ inet_hashinfo2_init(&tcp_hashinfo, "tcp_listen_portaddr_hash",
+ thash_entries, 21, /* one slot per 2 MB*/
+ 0, 64 * 1024);
tcp_hashinfo.bind_bucket_cachep =
kmem_cache_create("tcp_bind_bucket",
sizeof(struct inet_bind_bucket), 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
+ SLAB_HWCACHE_ALIGN | SLAB_PANIC |
+ SLAB_ACCOUNT,
+ NULL);
+ tcp_hashinfo.bind2_bucket_cachep =
+ kmem_cache_create("tcp_bind2_bucket",
+ sizeof(struct inet_bind2_bucket), 0,
+ SLAB_HWCACHE_ALIGN | SLAB_PANIC |
+ SLAB_ACCOUNT,
+ NULL);
/* Size and allocate the main established and bind bucket
* hash tables.
@@ -2408,86 +5242,61 @@ void __init tcp_init(void)
alloc_large_system_hash("TCP established",
sizeof(struct inet_ehash_bucket),
thash_entries,
- (num_physpages >= 128 * 1024) ?
- 13 : 15,
+ 17, /* one slot per 128 KB of memory */
0,
- &tcp_hashinfo.ehash_size,
NULL,
- 0);
- tcp_hashinfo.ehash_size = (1 << tcp_hashinfo.ehash_size) >> 1;
- for (i = 0; i < (tcp_hashinfo.ehash_size << 1); i++) {
- rwlock_init(&tcp_hashinfo.ehash[i].lock);
- INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain);
- }
+ &tcp_hashinfo.ehash_mask,
+ 0,
+ thash_entries ? 0 : 512 * 1024);
+ for (i = 0; i <= tcp_hashinfo.ehash_mask; i++)
+ INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);
+ if (inet_ehash_locks_alloc(&tcp_hashinfo))
+ panic("TCP: failed to alloc ehash_locks");
tcp_hashinfo.bhash =
alloc_large_system_hash("TCP bind",
- sizeof(struct inet_bind_hashbucket),
- tcp_hashinfo.ehash_size,
- (num_physpages >= 128 * 1024) ?
- 13 : 15,
+ 2 * sizeof(struct inet_bind_hashbucket),
+ tcp_hashinfo.ehash_mask + 1,
+ 17, /* one slot per 128 KB of memory */
0,
&tcp_hashinfo.bhash_size,
NULL,
+ 0,
64 * 1024);
- tcp_hashinfo.bhash_size = 1 << tcp_hashinfo.bhash_size;
+ tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size;
+ tcp_hashinfo.bhash2 = tcp_hashinfo.bhash + tcp_hashinfo.bhash_size;
for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
spin_lock_init(&tcp_hashinfo.bhash[i].lock);
INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
+ spin_lock_init(&tcp_hashinfo.bhash2[i].lock);
+ INIT_HLIST_HEAD(&tcp_hashinfo.bhash2[i].chain);
}
- /* Try to be a bit smarter and adjust defaults depending
- * on available memory.
- */
- for (order = 0; ((1 << order) << PAGE_SHIFT) <
- (tcp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket));
- order++)
- ;
- if (order >= 4) {
- sysctl_local_port_range[0] = 32768;
- sysctl_local_port_range[1] = 61000;
- tcp_death_row.sysctl_max_tw_buckets = 180000;
- sysctl_tcp_max_orphans = 4096 << (order - 4);
- sysctl_max_syn_backlog = 1024;
- } else if (order < 3) {
- sysctl_local_port_range[0] = 1024 * (3 - order);
- tcp_death_row.sysctl_max_tw_buckets >>= (3 - order);
- sysctl_tcp_max_orphans >>= (3 - order);
- sysctl_max_syn_backlog = 128;
- }
-
- /* Allow no more than 3/4 kernel memory (usually less) allocated to TCP */
- sysctl_tcp_mem[0] = (1536 / sizeof (struct inet_bind_hashbucket)) << order;
- sysctl_tcp_mem[1] = sysctl_tcp_mem[0] * 4 / 3;
- sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;
-
- limit = ((unsigned long)sysctl_tcp_mem[1]) << (PAGE_SHIFT - 7);
- max_share = min(4UL*1024*1024, limit);
-
- sysctl_tcp_wmem[0] = SK_STREAM_MEM_QUANTUM;
- sysctl_tcp_wmem[1] = 16*1024;
- sysctl_tcp_wmem[2] = max(64*1024, max_share);
-
- sysctl_tcp_rmem[0] = SK_STREAM_MEM_QUANTUM;
- sysctl_tcp_rmem[1] = 87380;
- sysctl_tcp_rmem[2] = max(87380, max_share);
-
- printk(KERN_INFO "TCP: Hash tables configured "
- "(established %d bind %d)\n",
- tcp_hashinfo.ehash_size << 1, tcp_hashinfo.bhash_size);
-
- tcp_register_congestion_control(&tcp_reno);
-}
+ tcp_hashinfo.pernet = false;
-EXPORT_SYMBOL(tcp_close);
-EXPORT_SYMBOL(tcp_disconnect);
-EXPORT_SYMBOL(tcp_getsockopt);
-EXPORT_SYMBOL(tcp_ioctl);
-EXPORT_SYMBOL(tcp_poll);
-EXPORT_SYMBOL(tcp_read_sock);
-EXPORT_SYMBOL(tcp_recvmsg);
-EXPORT_SYMBOL(tcp_sendmsg);
-EXPORT_SYMBOL(tcp_sendpage);
-EXPORT_SYMBOL(tcp_setsockopt);
-EXPORT_SYMBOL(tcp_shutdown);
-EXPORT_SYMBOL(tcp_statistics);
+ cnt = tcp_hashinfo.ehash_mask + 1;
+ sysctl_tcp_max_orphans = cnt / 2;
+
+ tcp_init_mem();
+ /* Set per-socket limits to no more than 1/128 the pressure threshold */
+ limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
+ max_wshare = min(4UL*1024*1024, limit);
+ max_rshare = min(32UL*1024*1024, limit);
+
+ init_net.ipv4.sysctl_tcp_wmem[0] = PAGE_SIZE;
+ init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024;
+ init_net.ipv4.sysctl_tcp_wmem[2] = max(64*1024, max_wshare);
+
+ init_net.ipv4.sysctl_tcp_rmem[0] = PAGE_SIZE;
+ init_net.ipv4.sysctl_tcp_rmem[1] = 131072;
+ init_net.ipv4.sysctl_tcp_rmem[2] = max(131072, max_rshare);
+
+ pr_info("Hash tables configured (established %u bind %u)\n",
+ tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
+
+ tcp_v4_init();
+ tcp_metrics_init();
+ BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0);
+ tcp_tsq_work_init();
+ mptcp_init();
+}
diff --git a/net/ipv4/tcp_ao.c b/net/ipv4/tcp_ao.c
new file mode 100644
index 000000000000..34b8450829d0
--- /dev/null
+++ b/net/ipv4/tcp_ao.c
@@ -0,0 +1,2442 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * INET An implementation of the TCP Authentication Option (TCP-AO).
+ * See RFC5925.
+ *
+ * Authors: Dmitry Safonov <dima@arista.com>
+ * Francesco Ruggeri <fruggeri@arista.com>
+ * Salam Noureddine <noureddine@arista.com>
+ */
+#define pr_fmt(fmt) "TCP: " fmt
+
+#include <crypto/hash.h>
+#include <linux/inetdevice.h>
+#include <linux/tcp.h>
+
+#include <net/tcp.h>
+#include <net/ipv6.h>
+#include <net/icmp.h>
+#include <trace/events/tcp.h>
+
+DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_ao_needed, HZ);
+
+int tcp_ao_calc_traffic_key(struct tcp_ao_key *mkt, u8 *key, void *ctx,
+ unsigned int len, struct tcp_sigpool *hp)
+{
+ struct scatterlist sg;
+ int ret;
+
+ if (crypto_ahash_setkey(crypto_ahash_reqtfm(hp->req),
+ mkt->key, mkt->keylen))
+ goto clear_hash;
+
+ ret = crypto_ahash_init(hp->req);
+ if (ret)
+ goto clear_hash;
+
+ sg_init_one(&sg, ctx, len);
+ ahash_request_set_crypt(hp->req, &sg, key, len);
+ crypto_ahash_update(hp->req);
+
+ ret = crypto_ahash_final(hp->req);
+ if (ret)
+ goto clear_hash;
+
+ return 0;
+clear_hash:
+ memset(key, 0, tcp_ao_digest_size(mkt));
+ return 1;
+}
+
+bool tcp_ao_ignore_icmp(const struct sock *sk, int family, int type, int code)
+{
+ bool ignore_icmp = false;
+ struct tcp_ao_info *ao;
+
+ if (!static_branch_unlikely(&tcp_ao_needed.key))
+ return false;
+
+ /* RFC5925, 7.8:
+ * >> A TCP-AO implementation MUST default to ignore incoming ICMPv4
+ * messages of Type 3 (destination unreachable), Codes 2-4 (protocol
+ * unreachable, port unreachable, and fragmentation needed -- ’hard
+ * errors’), and ICMPv6 Type 1 (destination unreachable), Code 1
+ * (administratively prohibited) and Code 4 (port unreachable) intended
+ * for connections in synchronized states (ESTABLISHED, FIN-WAIT-1, FIN-
+ * WAIT-2, CLOSE-WAIT, CLOSING, LAST-ACK, TIME-WAIT) that match MKTs.
+ */
+ if (family == AF_INET) {
+ if (type != ICMP_DEST_UNREACH)
+ return false;
+ if (code < ICMP_PROT_UNREACH || code > ICMP_FRAG_NEEDED)
+ return false;
+ } else {
+ if (type != ICMPV6_DEST_UNREACH)
+ return false;
+ if (code != ICMPV6_ADM_PROHIBITED && code != ICMPV6_PORT_UNREACH)
+ return false;
+ }
+
+ rcu_read_lock();
+ switch (sk->sk_state) {
+ case TCP_TIME_WAIT:
+ ao = rcu_dereference(tcp_twsk(sk)->ao_info);
+ break;
+ case TCP_SYN_SENT:
+ case TCP_SYN_RECV:
+ case TCP_LISTEN:
+ case TCP_NEW_SYN_RECV:
+ /* RFC5925 specifies to ignore ICMPs *only* on connections
+ * in synchronized states.
+ */
+ rcu_read_unlock();
+ return false;
+ default:
+ ao = rcu_dereference(tcp_sk(sk)->ao_info);
+ }
+
+ if (ao && !ao->accept_icmps) {
+ ignore_icmp = true;
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAODROPPEDICMPS);
+ atomic64_inc(&ao->counters.dropped_icmp);
+ }
+ rcu_read_unlock();
+
+ return ignore_icmp;
+}
+
+/* Optimized version of tcp_ao_do_lookup(): only for sockets for which
+ * it's known that the keys in ao_info are matching peer's
+ * family/address/VRF/etc.
+ */
+struct tcp_ao_key *tcp_ao_established_key(const struct sock *sk,
+ struct tcp_ao_info *ao,
+ int sndid, int rcvid)
+{
+ struct tcp_ao_key *key;
+
+ hlist_for_each_entry_rcu(key, &ao->head, node, lockdep_sock_is_held(sk)) {
+ if ((sndid >= 0 && key->sndid != sndid) ||
+ (rcvid >= 0 && key->rcvid != rcvid))
+ continue;
+ return key;
+ }
+
+ return NULL;
+}
+
+static int ipv4_prefix_cmp(const struct in_addr *addr1,
+ const struct in_addr *addr2,
+ unsigned int prefixlen)
+{
+ __be32 mask = inet_make_mask(prefixlen);
+ __be32 a1 = addr1->s_addr & mask;
+ __be32 a2 = addr2->s_addr & mask;
+
+ if (a1 == a2)
+ return 0;
+ return memcmp(&a1, &a2, sizeof(a1));
+}
+
+static int __tcp_ao_key_cmp(const struct tcp_ao_key *key, int l3index,
+ const union tcp_ao_addr *addr, u8 prefixlen,
+ int family, int sndid, int rcvid)
+{
+ if (sndid >= 0 && key->sndid != sndid)
+ return (key->sndid > sndid) ? 1 : -1;
+ if (rcvid >= 0 && key->rcvid != rcvid)
+ return (key->rcvid > rcvid) ? 1 : -1;
+ if (l3index >= 0 && (key->keyflags & TCP_AO_KEYF_IFINDEX)) {
+ if (key->l3index != l3index)
+ return (key->l3index > l3index) ? 1 : -1;
+ }
+
+ if (family == AF_UNSPEC)
+ return 0;
+ if (key->family != family)
+ return (key->family > family) ? 1 : -1;
+
+ if (family == AF_INET) {
+ if (ntohl(key->addr.a4.s_addr) == INADDR_ANY)
+ return 0;
+ if (ntohl(addr->a4.s_addr) == INADDR_ANY)
+ return 0;
+ return ipv4_prefix_cmp(&key->addr.a4, &addr->a4, prefixlen);
+#if IS_ENABLED(CONFIG_IPV6)
+ } else {
+ if (ipv6_addr_any(&key->addr.a6) || ipv6_addr_any(&addr->a6))
+ return 0;
+ if (ipv6_prefix_equal(&key->addr.a6, &addr->a6, prefixlen))
+ return 0;
+ return memcmp(&key->addr.a6, &addr->a6, sizeof(addr->a6));
+#endif
+ }
+ return -1;
+}
+
+static int tcp_ao_key_cmp(const struct tcp_ao_key *key, int l3index,
+ const union tcp_ao_addr *addr, u8 prefixlen,
+ int family, int sndid, int rcvid)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ if (family == AF_INET6 && ipv6_addr_v4mapped(&addr->a6)) {
+ __be32 addr4 = addr->a6.s6_addr32[3];
+
+ return __tcp_ao_key_cmp(key, l3index,
+ (union tcp_ao_addr *)&addr4,
+ prefixlen, AF_INET, sndid, rcvid);
+ }
+#endif
+ return __tcp_ao_key_cmp(key, l3index, addr,
+ prefixlen, family, sndid, rcvid);
+}
+
+static struct tcp_ao_key *__tcp_ao_do_lookup(const struct sock *sk, int l3index,
+ const union tcp_ao_addr *addr, int family, u8 prefix,
+ int sndid, int rcvid)
+{
+ struct tcp_ao_key *key;
+ struct tcp_ao_info *ao;
+
+ if (!static_branch_unlikely(&tcp_ao_needed.key))
+ return NULL;
+
+ ao = rcu_dereference_check(tcp_sk(sk)->ao_info,
+ lockdep_sock_is_held(sk));
+ if (!ao)
+ return NULL;
+
+ hlist_for_each_entry_rcu(key, &ao->head, node, lockdep_sock_is_held(sk)) {
+ u8 prefixlen = min(prefix, key->prefixlen);
+
+ if (!tcp_ao_key_cmp(key, l3index, addr, prefixlen,
+ family, sndid, rcvid))
+ return key;
+ }
+ return NULL;
+}
+
+struct tcp_ao_key *tcp_ao_do_lookup(const struct sock *sk, int l3index,
+ const union tcp_ao_addr *addr,
+ int family, int sndid, int rcvid)
+{
+ return __tcp_ao_do_lookup(sk, l3index, addr, family, U8_MAX, sndid, rcvid);
+}
+
+static struct tcp_ao_info *tcp_ao_alloc_info(gfp_t flags)
+{
+ struct tcp_ao_info *ao;
+
+ ao = kzalloc(sizeof(*ao), flags);
+ if (!ao)
+ return NULL;
+ INIT_HLIST_HEAD(&ao->head);
+ refcount_set(&ao->refcnt, 1);
+
+ return ao;
+}
+
+static void tcp_ao_link_mkt(struct tcp_ao_info *ao, struct tcp_ao_key *mkt)
+{
+ hlist_add_head_rcu(&mkt->node, &ao->head);
+}
+
+static struct tcp_ao_key *tcp_ao_copy_key(struct sock *sk,
+ struct tcp_ao_key *key)
+{
+ struct tcp_ao_key *new_key;
+
+ new_key = sock_kmalloc(sk, tcp_ao_sizeof_key(key),
+ GFP_ATOMIC);
+ if (!new_key)
+ return NULL;
+
+ *new_key = *key;
+ INIT_HLIST_NODE(&new_key->node);
+ tcp_sigpool_get(new_key->tcp_sigpool_id);
+ atomic64_set(&new_key->pkt_good, 0);
+ atomic64_set(&new_key->pkt_bad, 0);
+
+ return new_key;
+}
+
+static void tcp_ao_key_free_rcu(struct rcu_head *head)
+{
+ struct tcp_ao_key *key = container_of(head, struct tcp_ao_key, rcu);
+
+ tcp_sigpool_release(key->tcp_sigpool_id);
+ kfree_sensitive(key);
+}
+
+static void tcp_ao_info_free(struct tcp_ao_info *ao)
+{
+ struct tcp_ao_key *key;
+ struct hlist_node *n;
+
+ hlist_for_each_entry_safe(key, n, &ao->head, node) {
+ hlist_del(&key->node);
+ tcp_sigpool_release(key->tcp_sigpool_id);
+ kfree_sensitive(key);
+ }
+ kfree(ao);
+ static_branch_slow_dec_deferred(&tcp_ao_needed);
+}
+
+static void tcp_ao_sk_omem_free(struct sock *sk, struct tcp_ao_info *ao)
+{
+ size_t total_ao_sk_mem = 0;
+ struct tcp_ao_key *key;
+
+ hlist_for_each_entry(key, &ao->head, node)
+ total_ao_sk_mem += tcp_ao_sizeof_key(key);
+ atomic_sub(total_ao_sk_mem, &sk->sk_omem_alloc);
+}
+
+void tcp_ao_destroy_sock(struct sock *sk, bool twsk)
+{
+ struct tcp_ao_info *ao;
+
+ if (twsk) {
+ ao = rcu_dereference_protected(tcp_twsk(sk)->ao_info, 1);
+ rcu_assign_pointer(tcp_twsk(sk)->ao_info, NULL);
+ } else {
+ ao = rcu_dereference_protected(tcp_sk(sk)->ao_info, 1);
+ rcu_assign_pointer(tcp_sk(sk)->ao_info, NULL);
+ }
+
+ if (!ao || !refcount_dec_and_test(&ao->refcnt))
+ return;
+
+ if (!twsk)
+ tcp_ao_sk_omem_free(sk, ao);
+ tcp_ao_info_free(ao);
+}
+
+void tcp_ao_time_wait(struct tcp_timewait_sock *tcptw, struct tcp_sock *tp)
+{
+ struct tcp_ao_info *ao_info = rcu_dereference_protected(tp->ao_info, 1);
+
+ if (ao_info) {
+ struct tcp_ao_key *key;
+ struct hlist_node *n;
+ int omem = 0;
+
+ hlist_for_each_entry_safe(key, n, &ao_info->head, node) {
+ omem += tcp_ao_sizeof_key(key);
+ }
+
+ refcount_inc(&ao_info->refcnt);
+ atomic_sub(omem, &(((struct sock *)tp)->sk_omem_alloc));
+ rcu_assign_pointer(tcptw->ao_info, ao_info);
+ } else {
+ tcptw->ao_info = NULL;
+ }
+}
+
+/* 4 tuple and ISNs are expected in NBO */
+static int tcp_v4_ao_calc_key(struct tcp_ao_key *mkt, u8 *key,
+ __be32 saddr, __be32 daddr,
+ __be16 sport, __be16 dport,
+ __be32 sisn, __be32 disn)
+{
+ /* See RFC5926 3.1.1 */
+ struct kdf_input_block {
+ u8 counter;
+ u8 label[6];
+ struct tcp4_ao_context ctx;
+ __be16 outlen;
+ } __packed * tmp;
+ struct tcp_sigpool hp;
+ int err;
+
+ err = tcp_sigpool_start(mkt->tcp_sigpool_id, &hp);
+ if (err)
+ return err;
+
+ tmp = hp.scratch;
+ tmp->counter = 1;
+ memcpy(tmp->label, "TCP-AO", 6);
+ tmp->ctx.saddr = saddr;
+ tmp->ctx.daddr = daddr;
+ tmp->ctx.sport = sport;
+ tmp->ctx.dport = dport;
+ tmp->ctx.sisn = sisn;
+ tmp->ctx.disn = disn;
+ tmp->outlen = htons(tcp_ao_digest_size(mkt) * 8); /* in bits */
+
+ err = tcp_ao_calc_traffic_key(mkt, key, tmp, sizeof(*tmp), &hp);
+ tcp_sigpool_end(&hp);
+
+ return err;
+}
+
+int tcp_v4_ao_calc_key_sk(struct tcp_ao_key *mkt, u8 *key,
+ const struct sock *sk,
+ __be32 sisn, __be32 disn, bool send)
+{
+ if (send)
+ return tcp_v4_ao_calc_key(mkt, key, sk->sk_rcv_saddr,
+ sk->sk_daddr, htons(sk->sk_num),
+ sk->sk_dport, sisn, disn);
+ else
+ return tcp_v4_ao_calc_key(mkt, key, sk->sk_daddr,
+ sk->sk_rcv_saddr, sk->sk_dport,
+ htons(sk->sk_num), disn, sisn);
+}
+
+static int tcp_ao_calc_key_sk(struct tcp_ao_key *mkt, u8 *key,
+ const struct sock *sk,
+ __be32 sisn, __be32 disn, bool send)
+{
+ if (mkt->family == AF_INET)
+ return tcp_v4_ao_calc_key_sk(mkt, key, sk, sisn, disn, send);
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (mkt->family == AF_INET6)
+ return tcp_v6_ao_calc_key_sk(mkt, key, sk, sisn, disn, send);
+#endif
+ else
+ return -EOPNOTSUPP;
+}
+
+int tcp_v4_ao_calc_key_rsk(struct tcp_ao_key *mkt, u8 *key,
+ struct request_sock *req)
+{
+ struct inet_request_sock *ireq = inet_rsk(req);
+
+ return tcp_v4_ao_calc_key(mkt, key,
+ ireq->ir_loc_addr, ireq->ir_rmt_addr,
+ htons(ireq->ir_num), ireq->ir_rmt_port,
+ htonl(tcp_rsk(req)->snt_isn),
+ htonl(tcp_rsk(req)->rcv_isn));
+}
+
+static int tcp_v4_ao_calc_key_skb(struct tcp_ao_key *mkt, u8 *key,
+ const struct sk_buff *skb,
+ __be32 sisn, __be32 disn)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+ const struct tcphdr *th = tcp_hdr(skb);
+
+ return tcp_v4_ao_calc_key(mkt, key, iph->saddr, iph->daddr,
+ th->source, th->dest, sisn, disn);
+}
+
+static int tcp_ao_calc_key_skb(struct tcp_ao_key *mkt, u8 *key,
+ const struct sk_buff *skb,
+ __be32 sisn, __be32 disn, int family)
+{
+ if (family == AF_INET)
+ return tcp_v4_ao_calc_key_skb(mkt, key, skb, sisn, disn);
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (family == AF_INET6)
+ return tcp_v6_ao_calc_key_skb(mkt, key, skb, sisn, disn);
+#endif
+ return -EAFNOSUPPORT;
+}
+
+static int tcp_v4_ao_hash_pseudoheader(struct tcp_sigpool *hp,
+ __be32 daddr, __be32 saddr,
+ int nbytes)
+{
+ struct tcp4_pseudohdr *bp;
+ struct scatterlist sg;
+
+ bp = hp->scratch;
+ bp->saddr = saddr;
+ bp->daddr = daddr;
+ bp->pad = 0;
+ bp->protocol = IPPROTO_TCP;
+ bp->len = cpu_to_be16(nbytes);
+
+ sg_init_one(&sg, bp, sizeof(*bp));
+ ahash_request_set_crypt(hp->req, &sg, NULL, sizeof(*bp));
+ return crypto_ahash_update(hp->req);
+}
+
+static int tcp_ao_hash_pseudoheader(unsigned short int family,
+ const struct sock *sk,
+ const struct sk_buff *skb,
+ struct tcp_sigpool *hp, int nbytes)
+{
+ const struct tcphdr *th = tcp_hdr(skb);
+
+ /* TODO: Can we rely on checksum being zero to mean outbound pkt? */
+ if (!th->check) {
+ if (family == AF_INET)
+ return tcp_v4_ao_hash_pseudoheader(hp, sk->sk_daddr,
+ sk->sk_rcv_saddr, skb->len);
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (family == AF_INET6)
+ return tcp_v6_ao_hash_pseudoheader(hp, &sk->sk_v6_daddr,
+ &sk->sk_v6_rcv_saddr, skb->len);
+#endif
+ else
+ return -EAFNOSUPPORT;
+ }
+
+ if (family == AF_INET) {
+ const struct iphdr *iph = ip_hdr(skb);
+
+ return tcp_v4_ao_hash_pseudoheader(hp, iph->daddr,
+ iph->saddr, skb->len);
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (family == AF_INET6) {
+ const struct ipv6hdr *iph = ipv6_hdr(skb);
+
+ return tcp_v6_ao_hash_pseudoheader(hp, &iph->daddr,
+ &iph->saddr, skb->len);
+#endif
+ }
+ return -EAFNOSUPPORT;
+}
+
+u32 tcp_ao_compute_sne(u32 next_sne, u32 next_seq, u32 seq)
+{
+ u32 sne = next_sne;
+
+ if (before(seq, next_seq)) {
+ if (seq > next_seq)
+ sne--;
+ } else {
+ if (seq < next_seq)
+ sne++;
+ }
+
+ return sne;
+}
+
+/* tcp_ao_hash_sne(struct tcp_sigpool *hp)
+ * @hp - used for hashing
+ * @sne - sne value
+ */
+static int tcp_ao_hash_sne(struct tcp_sigpool *hp, u32 sne)
+{
+ struct scatterlist sg;
+ __be32 *bp;
+
+ bp = (__be32 *)hp->scratch;
+ *bp = htonl(sne);
+
+ sg_init_one(&sg, bp, sizeof(*bp));
+ ahash_request_set_crypt(hp->req, &sg, NULL, sizeof(*bp));
+ return crypto_ahash_update(hp->req);
+}
+
+static int tcp_ao_hash_header(struct tcp_sigpool *hp,
+ const struct tcphdr *th,
+ bool exclude_options, u8 *hash,
+ int hash_offset, int hash_len)
+{
+ struct scatterlist sg;
+ u8 *hdr = hp->scratch;
+ int err, len;
+
+ /* We are not allowed to change tcphdr, make a local copy */
+ if (exclude_options) {
+ len = sizeof(*th) + sizeof(struct tcp_ao_hdr) + hash_len;
+ memcpy(hdr, th, sizeof(*th));
+ memcpy(hdr + sizeof(*th),
+ (u8 *)th + hash_offset - sizeof(struct tcp_ao_hdr),
+ sizeof(struct tcp_ao_hdr));
+ memset(hdr + sizeof(*th) + sizeof(struct tcp_ao_hdr),
+ 0, hash_len);
+ ((struct tcphdr *)hdr)->check = 0;
+ } else {
+ len = th->doff << 2;
+ memcpy(hdr, th, len);
+ /* zero out tcp-ao hash */
+ ((struct tcphdr *)hdr)->check = 0;
+ memset(hdr + hash_offset, 0, hash_len);
+ }
+
+ sg_init_one(&sg, hdr, len);
+ ahash_request_set_crypt(hp->req, &sg, NULL, len);
+ err = crypto_ahash_update(hp->req);
+ WARN_ON_ONCE(err != 0);
+ return err;
+}
+
+int tcp_ao_hash_hdr(unsigned short int family, char *ao_hash,
+ struct tcp_ao_key *key, const u8 *tkey,
+ const union tcp_ao_addr *daddr,
+ const union tcp_ao_addr *saddr,
+ const struct tcphdr *th, u32 sne)
+{
+ int tkey_len = tcp_ao_digest_size(key);
+ int hash_offset = ao_hash - (char *)th;
+ struct tcp_sigpool hp;
+ void *hash_buf = NULL;
+
+ hash_buf = kmalloc(tkey_len, GFP_ATOMIC);
+ if (!hash_buf)
+ goto clear_hash_noput;
+
+ if (tcp_sigpool_start(key->tcp_sigpool_id, &hp))
+ goto clear_hash_noput;
+
+ if (crypto_ahash_setkey(crypto_ahash_reqtfm(hp.req), tkey, tkey_len))
+ goto clear_hash;
+
+ if (crypto_ahash_init(hp.req))
+ goto clear_hash;
+
+ if (tcp_ao_hash_sne(&hp, sne))
+ goto clear_hash;
+ if (family == AF_INET) {
+ if (tcp_v4_ao_hash_pseudoheader(&hp, daddr->a4.s_addr,
+ saddr->a4.s_addr, th->doff * 4))
+ goto clear_hash;
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (family == AF_INET6) {
+ if (tcp_v6_ao_hash_pseudoheader(&hp, &daddr->a6,
+ &saddr->a6, th->doff * 4))
+ goto clear_hash;
+#endif
+ } else {
+ WARN_ON_ONCE(1);
+ goto clear_hash;
+ }
+ if (tcp_ao_hash_header(&hp, th,
+ !!(key->keyflags & TCP_AO_KEYF_EXCLUDE_OPT),
+ ao_hash, hash_offset, tcp_ao_maclen(key)))
+ goto clear_hash;
+ ahash_request_set_crypt(hp.req, NULL, hash_buf, 0);
+ if (crypto_ahash_final(hp.req))
+ goto clear_hash;
+
+ memcpy(ao_hash, hash_buf, tcp_ao_maclen(key));
+ tcp_sigpool_end(&hp);
+ kfree(hash_buf);
+ return 0;
+
+clear_hash:
+ tcp_sigpool_end(&hp);
+clear_hash_noput:
+ memset(ao_hash, 0, tcp_ao_maclen(key));
+ kfree(hash_buf);
+ return 1;
+}
+
+int tcp_ao_hash_skb(unsigned short int family,
+ char *ao_hash, struct tcp_ao_key *key,
+ const struct sock *sk, const struct sk_buff *skb,
+ const u8 *tkey, int hash_offset, u32 sne)
+{
+ const struct tcphdr *th = tcp_hdr(skb);
+ int tkey_len = tcp_ao_digest_size(key);
+ struct tcp_sigpool hp;
+ void *hash_buf = NULL;
+
+ hash_buf = kmalloc(tkey_len, GFP_ATOMIC);
+ if (!hash_buf)
+ goto clear_hash_noput;
+
+ if (tcp_sigpool_start(key->tcp_sigpool_id, &hp))
+ goto clear_hash_noput;
+
+ if (crypto_ahash_setkey(crypto_ahash_reqtfm(hp.req), tkey, tkey_len))
+ goto clear_hash;
+
+ /* For now use sha1 by default. Depends on alg in tcp_ao_key */
+ if (crypto_ahash_init(hp.req))
+ goto clear_hash;
+
+ if (tcp_ao_hash_sne(&hp, sne))
+ goto clear_hash;
+ if (tcp_ao_hash_pseudoheader(family, sk, skb, &hp, skb->len))
+ goto clear_hash;
+ if (tcp_ao_hash_header(&hp, th,
+ !!(key->keyflags & TCP_AO_KEYF_EXCLUDE_OPT),
+ ao_hash, hash_offset, tcp_ao_maclen(key)))
+ goto clear_hash;
+ if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2))
+ goto clear_hash;
+ ahash_request_set_crypt(hp.req, NULL, hash_buf, 0);
+ if (crypto_ahash_final(hp.req))
+ goto clear_hash;
+
+ memcpy(ao_hash, hash_buf, tcp_ao_maclen(key));
+ tcp_sigpool_end(&hp);
+ kfree(hash_buf);
+ return 0;
+
+clear_hash:
+ tcp_sigpool_end(&hp);
+clear_hash_noput:
+ memset(ao_hash, 0, tcp_ao_maclen(key));
+ kfree(hash_buf);
+ return 1;
+}
+
+int tcp_v4_ao_hash_skb(char *ao_hash, struct tcp_ao_key *key,
+ const struct sock *sk, const struct sk_buff *skb,
+ const u8 *tkey, int hash_offset, u32 sne)
+{
+ return tcp_ao_hash_skb(AF_INET, ao_hash, key, sk, skb,
+ tkey, hash_offset, sne);
+}
+
+int tcp_v4_ao_synack_hash(char *ao_hash, struct tcp_ao_key *ao_key,
+ struct request_sock *req, const struct sk_buff *skb,
+ int hash_offset, u32 sne)
+{
+ void *hash_buf = NULL;
+ int err;
+
+ hash_buf = kmalloc(tcp_ao_digest_size(ao_key), GFP_ATOMIC);
+ if (!hash_buf)
+ return -ENOMEM;
+
+ err = tcp_v4_ao_calc_key_rsk(ao_key, hash_buf, req);
+ if (err)
+ goto out;
+
+ err = tcp_ao_hash_skb(AF_INET, ao_hash, ao_key, req_to_sk(req), skb,
+ hash_buf, hash_offset, sne);
+out:
+ kfree(hash_buf);
+ return err;
+}
+
+struct tcp_ao_key *tcp_v4_ao_lookup_rsk(const struct sock *sk,
+ struct request_sock *req,
+ int sndid, int rcvid)
+{
+ struct inet_request_sock *ireq = inet_rsk(req);
+ union tcp_ao_addr *addr = (union tcp_ao_addr *)&ireq->ir_rmt_addr;
+ int l3index;
+
+ l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
+ return tcp_ao_do_lookup(sk, l3index, addr, AF_INET, sndid, rcvid);
+}
+
+struct tcp_ao_key *tcp_v4_ao_lookup(const struct sock *sk, struct sock *addr_sk,
+ int sndid, int rcvid)
+{
+ int l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
+ addr_sk->sk_bound_dev_if);
+ union tcp_ao_addr *addr = (union tcp_ao_addr *)&addr_sk->sk_daddr;
+
+ return tcp_ao_do_lookup(sk, l3index, addr, AF_INET, sndid, rcvid);
+}
+
+int tcp_ao_prepare_reset(const struct sock *sk, struct sk_buff *skb,
+ const struct tcp_ao_hdr *aoh, int l3index, u32 seq,
+ struct tcp_ao_key **key, char **traffic_key,
+ bool *allocated_traffic_key, u8 *keyid, u32 *sne)
+{
+ const struct tcphdr *th = tcp_hdr(skb);
+ struct tcp_ao_info *ao_info;
+
+ *allocated_traffic_key = false;
+ /* If there's no socket - than initial sisn/disn are unknown.
+ * Drop the segment. RFC5925 (7.7) advises to require graceful
+ * restart [RFC4724]. Alternatively, the RFC5925 advises to
+ * save/restore traffic keys before/after reboot.
+ * Linux TCP-AO support provides TCP_AO_ADD_KEY and TCP_AO_REPAIR
+ * options to restore a socket post-reboot.
+ */
+ if (!sk)
+ return -ENOTCONN;
+
+ if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_NEW_SYN_RECV)) {
+ unsigned int family = READ_ONCE(sk->sk_family);
+ union tcp_ao_addr *addr;
+ __be32 disn, sisn;
+
+ if (sk->sk_state == TCP_NEW_SYN_RECV) {
+ struct request_sock *req = inet_reqsk(sk);
+
+ sisn = htonl(tcp_rsk(req)->rcv_isn);
+ disn = htonl(tcp_rsk(req)->snt_isn);
+ *sne = tcp_ao_compute_sne(0, tcp_rsk(req)->snt_isn, seq);
+ } else {
+ sisn = th->seq;
+ disn = 0;
+ }
+ if (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6)
+ addr = (union tcp_md5_addr *)&ipv6_hdr(skb)->saddr;
+ else
+ addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
+#if IS_ENABLED(CONFIG_IPV6)
+ if (family == AF_INET6 && ipv6_addr_v4mapped(&sk->sk_v6_daddr))
+ family = AF_INET;
+#endif
+
+ sk = sk_const_to_full_sk(sk);
+ ao_info = rcu_dereference(tcp_sk(sk)->ao_info);
+ if (!ao_info)
+ return -ENOENT;
+ *key = tcp_ao_do_lookup(sk, l3index, addr, family,
+ -1, aoh->rnext_keyid);
+ if (!*key)
+ return -ENOENT;
+ *traffic_key = kmalloc(tcp_ao_digest_size(*key), GFP_ATOMIC);
+ if (!*traffic_key)
+ return -ENOMEM;
+ *allocated_traffic_key = true;
+ if (tcp_ao_calc_key_skb(*key, *traffic_key, skb,
+ sisn, disn, family))
+ return -1;
+ *keyid = (*key)->rcvid;
+ } else {
+ struct tcp_ao_key *rnext_key;
+ u32 snd_basis;
+
+ if (sk->sk_state == TCP_TIME_WAIT) {
+ ao_info = rcu_dereference(tcp_twsk(sk)->ao_info);
+ snd_basis = tcp_twsk(sk)->tw_snd_nxt;
+ } else {
+ ao_info = rcu_dereference(tcp_sk(sk)->ao_info);
+ snd_basis = tcp_sk(sk)->snd_una;
+ }
+ if (!ao_info)
+ return -ENOENT;
+
+ *key = tcp_ao_established_key(sk, ao_info, aoh->rnext_keyid, -1);
+ if (!*key)
+ return -ENOENT;
+ *traffic_key = snd_other_key(*key);
+ rnext_key = READ_ONCE(ao_info->rnext_key);
+ *keyid = rnext_key->rcvid;
+ *sne = tcp_ao_compute_sne(READ_ONCE(ao_info->snd_sne),
+ snd_basis, seq);
+ }
+ return 0;
+}
+
+int tcp_ao_transmit_skb(struct sock *sk, struct sk_buff *skb,
+ struct tcp_ao_key *key, struct tcphdr *th,
+ __u8 *hash_location)
+{
+ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_ao_info *ao;
+ void *tkey_buf = NULL;
+ u8 *traffic_key;
+ u32 sne;
+
+ ao = rcu_dereference_protected(tcp_sk(sk)->ao_info,
+ lockdep_sock_is_held(sk));
+ traffic_key = snd_other_key(key);
+ if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) {
+ __be32 disn;
+
+ if (!(tcb->tcp_flags & TCPHDR_ACK)) {
+ disn = 0;
+ tkey_buf = kmalloc(tcp_ao_digest_size(key), GFP_ATOMIC);
+ if (!tkey_buf)
+ return -ENOMEM;
+ traffic_key = tkey_buf;
+ } else {
+ disn = ao->risn;
+ }
+ tp->af_specific->ao_calc_key_sk(key, traffic_key,
+ sk, ao->lisn, disn, true);
+ }
+ sne = tcp_ao_compute_sne(READ_ONCE(ao->snd_sne), READ_ONCE(tp->snd_una),
+ ntohl(th->seq));
+ tp->af_specific->calc_ao_hash(hash_location, key, sk, skb, traffic_key,
+ hash_location - (u8 *)th, sne);
+ kfree(tkey_buf);
+ return 0;
+}
+
+static struct tcp_ao_key *tcp_ao_inbound_lookup(unsigned short int family,
+ const struct sock *sk, const struct sk_buff *skb,
+ int sndid, int rcvid, int l3index)
+{
+ if (family == AF_INET) {
+ const struct iphdr *iph = ip_hdr(skb);
+
+ return tcp_ao_do_lookup(sk, l3index,
+ (union tcp_ao_addr *)&iph->saddr,
+ AF_INET, sndid, rcvid);
+ } else {
+ const struct ipv6hdr *iph = ipv6_hdr(skb);
+
+ return tcp_ao_do_lookup(sk, l3index,
+ (union tcp_ao_addr *)&iph->saddr,
+ AF_INET6, sndid, rcvid);
+ }
+}
+
+void tcp_ao_syncookie(struct sock *sk, const struct sk_buff *skb,
+ struct request_sock *req, unsigned short int family)
+{
+ struct tcp_request_sock *treq = tcp_rsk(req);
+ const struct tcphdr *th = tcp_hdr(skb);
+ const struct tcp_ao_hdr *aoh;
+ struct tcp_ao_key *key;
+ int l3index;
+
+ /* treq->af_specific is used to perform TCP_AO lookup
+ * in tcp_create_openreq_child().
+ */
+#if IS_ENABLED(CONFIG_IPV6)
+ if (family == AF_INET6)
+ treq->af_specific = &tcp_request_sock_ipv6_ops;
+ else
+#endif
+ treq->af_specific = &tcp_request_sock_ipv4_ops;
+
+ treq->used_tcp_ao = false;
+
+ if (tcp_parse_auth_options(th, NULL, &aoh) || !aoh)
+ return;
+
+ l3index = l3mdev_master_ifindex_by_index(sock_net(sk), inet_rsk(req)->ir_iif);
+ key = tcp_ao_inbound_lookup(family, sk, skb, -1, aoh->keyid, l3index);
+ if (!key)
+ /* Key not found, continue without TCP-AO */
+ return;
+
+ treq->ao_rcv_next = aoh->keyid;
+ treq->ao_keyid = aoh->rnext_keyid;
+ treq->used_tcp_ao = true;
+}
+
+static enum skb_drop_reason
+tcp_ao_verify_hash(const struct sock *sk, const struct sk_buff *skb,
+ unsigned short int family, struct tcp_ao_info *info,
+ const struct tcp_ao_hdr *aoh, struct tcp_ao_key *key,
+ u8 *traffic_key, u8 *phash, u32 sne, int l3index)
+{
+ const struct tcphdr *th = tcp_hdr(skb);
+ u8 maclen = tcp_ao_hdr_maclen(aoh);
+ void *hash_buf = NULL;
+
+ if (maclen != tcp_ao_maclen(key)) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAOBAD);
+ atomic64_inc(&info->counters.pkt_bad);
+ atomic64_inc(&key->pkt_bad);
+ trace_tcp_ao_wrong_maclen(sk, skb, aoh->keyid,
+ aoh->rnext_keyid, maclen);
+ return SKB_DROP_REASON_TCP_AOFAILURE;
+ }
+
+ hash_buf = kmalloc(tcp_ao_digest_size(key), GFP_ATOMIC);
+ if (!hash_buf)
+ return SKB_DROP_REASON_NOT_SPECIFIED;
+
+ /* XXX: make it per-AF callback? */
+ tcp_ao_hash_skb(family, hash_buf, key, sk, skb, traffic_key,
+ (phash - (u8 *)th), sne);
+ if (memcmp(phash, hash_buf, maclen)) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAOBAD);
+ atomic64_inc(&info->counters.pkt_bad);
+ atomic64_inc(&key->pkt_bad);
+ trace_tcp_ao_mismatch(sk, skb, aoh->keyid,
+ aoh->rnext_keyid, maclen);
+ kfree(hash_buf);
+ return SKB_DROP_REASON_TCP_AOFAILURE;
+ }
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAOGOOD);
+ atomic64_inc(&info->counters.pkt_good);
+ atomic64_inc(&key->pkt_good);
+ kfree(hash_buf);
+ return SKB_NOT_DROPPED_YET;
+}
+
+enum skb_drop_reason
+tcp_inbound_ao_hash(struct sock *sk, const struct sk_buff *skb,
+ unsigned short int family, const struct request_sock *req,
+ int l3index, const struct tcp_ao_hdr *aoh)
+{
+ const struct tcphdr *th = tcp_hdr(skb);
+ u8 maclen = tcp_ao_hdr_maclen(aoh);
+ u8 *phash = (u8 *)(aoh + 1); /* hash goes just after the header */
+ struct tcp_ao_info *info;
+ enum skb_drop_reason ret;
+ struct tcp_ao_key *key;
+ __be32 sisn, disn;
+ u8 *traffic_key;
+ int state;
+ u32 sne = 0;
+
+ info = rcu_dereference(tcp_sk(sk)->ao_info);
+ if (!info) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAOKEYNOTFOUND);
+ trace_tcp_ao_key_not_found(sk, skb, aoh->keyid,
+ aoh->rnext_keyid, maclen);
+ return SKB_DROP_REASON_TCP_AOUNEXPECTED;
+ }
+
+ if (unlikely(th->syn)) {
+ sisn = th->seq;
+ disn = 0;
+ }
+
+ state = READ_ONCE(sk->sk_state);
+ /* Fast-path */
+ if (likely((1 << state) & TCP_AO_ESTABLISHED)) {
+ enum skb_drop_reason err;
+ struct tcp_ao_key *current_key;
+
+ /* Check if this socket's rnext_key matches the keyid in the
+ * packet. If not we lookup the key based on the keyid
+ * matching the rcvid in the mkt.
+ */
+ key = READ_ONCE(info->rnext_key);
+ if (key->rcvid != aoh->keyid) {
+ key = tcp_ao_established_key(sk, info, -1, aoh->keyid);
+ if (!key)
+ goto key_not_found;
+ }
+
+ /* Delayed retransmitted SYN */
+ if (unlikely(th->syn && !th->ack))
+ goto verify_hash;
+
+ sne = tcp_ao_compute_sne(info->rcv_sne, tcp_sk(sk)->rcv_nxt,
+ ntohl(th->seq));
+ /* Established socket, traffic key are cached */
+ traffic_key = rcv_other_key(key);
+ err = tcp_ao_verify_hash(sk, skb, family, info, aoh, key,
+ traffic_key, phash, sne, l3index);
+ if (err)
+ return err;
+ current_key = READ_ONCE(info->current_key);
+ /* Key rotation: the peer asks us to use new key (RNext) */
+ if (unlikely(aoh->rnext_keyid != current_key->sndid)) {
+ trace_tcp_ao_rnext_request(sk, skb, current_key->sndid,
+ aoh->rnext_keyid,
+ tcp_ao_hdr_maclen(aoh));
+ /* If the key is not found we do nothing. */
+ key = tcp_ao_established_key(sk, info, aoh->rnext_keyid, -1);
+ if (key)
+ /* pairs with tcp_ao_del_cmd */
+ WRITE_ONCE(info->current_key, key);
+ }
+ return SKB_NOT_DROPPED_YET;
+ }
+
+ if (unlikely(state == TCP_CLOSE))
+ return SKB_DROP_REASON_TCP_CLOSE;
+
+ /* Lookup key based on peer address and keyid.
+ * current_key and rnext_key must not be used on tcp listen
+ * sockets as otherwise:
+ * - request sockets would race on those key pointers
+ * - tcp_ao_del_cmd() allows async key removal
+ */
+ key = tcp_ao_inbound_lookup(family, sk, skb, -1, aoh->keyid, l3index);
+ if (!key)
+ goto key_not_found;
+
+ if (th->syn && !th->ack)
+ goto verify_hash;
+
+ if ((1 << state) & (TCPF_LISTEN | TCPF_NEW_SYN_RECV)) {
+ /* Make the initial syn the likely case here */
+ if (unlikely(req)) {
+ sne = tcp_ao_compute_sne(0, tcp_rsk(req)->rcv_isn,
+ ntohl(th->seq));
+ sisn = htonl(tcp_rsk(req)->rcv_isn);
+ disn = htonl(tcp_rsk(req)->snt_isn);
+ } else if (unlikely(th->ack && !th->syn)) {
+ /* Possible syncookie packet */
+ sisn = htonl(ntohl(th->seq) - 1);
+ disn = htonl(ntohl(th->ack_seq) - 1);
+ sne = tcp_ao_compute_sne(0, ntohl(sisn),
+ ntohl(th->seq));
+ } else if (unlikely(!th->syn)) {
+ /* no way to figure out initial sisn/disn - drop */
+ return SKB_DROP_REASON_TCP_FLAGS;
+ }
+ } else if ((1 << state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
+ disn = info->lisn;
+ if (th->syn || th->rst)
+ sisn = th->seq;
+ else
+ sisn = info->risn;
+ } else {
+ WARN_ONCE(1, "TCP-AO: Unexpected sk_state %d", state);
+ return SKB_DROP_REASON_TCP_AOFAILURE;
+ }
+verify_hash:
+ traffic_key = kmalloc(tcp_ao_digest_size(key), GFP_ATOMIC);
+ if (!traffic_key)
+ return SKB_DROP_REASON_NOT_SPECIFIED;
+ tcp_ao_calc_key_skb(key, traffic_key, skb, sisn, disn, family);
+ ret = tcp_ao_verify_hash(sk, skb, family, info, aoh, key,
+ traffic_key, phash, sne, l3index);
+ kfree(traffic_key);
+ return ret;
+
+key_not_found:
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAOKEYNOTFOUND);
+ atomic64_inc(&info->counters.key_not_found);
+ trace_tcp_ao_key_not_found(sk, skb, aoh->keyid,
+ aoh->rnext_keyid, maclen);
+ return SKB_DROP_REASON_TCP_AOKEYNOTFOUND;
+}
+
+static int tcp_ao_cache_traffic_keys(const struct sock *sk,
+ struct tcp_ao_info *ao,
+ struct tcp_ao_key *ao_key)
+{
+ u8 *traffic_key = snd_other_key(ao_key);
+ int ret;
+
+ ret = tcp_ao_calc_key_sk(ao_key, traffic_key, sk,
+ ao->lisn, ao->risn, true);
+ if (ret)
+ return ret;
+
+ traffic_key = rcv_other_key(ao_key);
+ ret = tcp_ao_calc_key_sk(ao_key, traffic_key, sk,
+ ao->lisn, ao->risn, false);
+ return ret;
+}
+
+void tcp_ao_connect_init(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_ao_info *ao_info;
+ struct hlist_node *next;
+ union tcp_ao_addr *addr;
+ struct tcp_ao_key *key;
+ int family, l3index;
+
+ ao_info = rcu_dereference_protected(tp->ao_info,
+ lockdep_sock_is_held(sk));
+ if (!ao_info)
+ return;
+
+ /* Remove all keys that don't match the peer */
+ family = sk->sk_family;
+ if (family == AF_INET)
+ addr = (union tcp_ao_addr *)&sk->sk_daddr;
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (family == AF_INET6)
+ addr = (union tcp_ao_addr *)&sk->sk_v6_daddr;
+#endif
+ else
+ return;
+ l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
+ sk->sk_bound_dev_if);
+
+ hlist_for_each_entry_safe(key, next, &ao_info->head, node) {
+ if (!tcp_ao_key_cmp(key, l3index, addr, key->prefixlen, family, -1, -1))
+ continue;
+
+ if (key == ao_info->current_key)
+ ao_info->current_key = NULL;
+ if (key == ao_info->rnext_key)
+ ao_info->rnext_key = NULL;
+ hlist_del_rcu(&key->node);
+ atomic_sub(tcp_ao_sizeof_key(key), &sk->sk_omem_alloc);
+ call_rcu(&key->rcu, tcp_ao_key_free_rcu);
+ }
+
+ key = tp->af_specific->ao_lookup(sk, sk, -1, -1);
+ if (key) {
+ /* if current_key or rnext_key were not provided,
+ * use the first key matching the peer
+ */
+ if (!ao_info->current_key)
+ ao_info->current_key = key;
+ if (!ao_info->rnext_key)
+ ao_info->rnext_key = key;
+ tp->tcp_header_len += tcp_ao_len_aligned(key);
+
+ ao_info->lisn = htonl(tp->write_seq);
+ ao_info->snd_sne = 0;
+ } else {
+ /* Can't happen: tcp_connect() verifies that there's
+ * at least one tcp-ao key that matches the remote peer.
+ */
+ WARN_ON_ONCE(1);
+ rcu_assign_pointer(tp->ao_info, NULL);
+ kfree(ao_info);
+ }
+}
+
+void tcp_ao_established(struct sock *sk)
+{
+ struct tcp_ao_info *ao;
+ struct tcp_ao_key *key;
+
+ ao = rcu_dereference_protected(tcp_sk(sk)->ao_info,
+ lockdep_sock_is_held(sk));
+ if (!ao)
+ return;
+
+ hlist_for_each_entry_rcu(key, &ao->head, node, lockdep_sock_is_held(sk))
+ tcp_ao_cache_traffic_keys(sk, ao, key);
+}
+
+void tcp_ao_finish_connect(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_ao_info *ao;
+ struct tcp_ao_key *key;
+
+ ao = rcu_dereference_protected(tcp_sk(sk)->ao_info,
+ lockdep_sock_is_held(sk));
+ if (!ao)
+ return;
+
+ /* sk with TCP_REPAIR_ON does not have skb in tcp_finish_connect */
+ if (skb)
+ WRITE_ONCE(ao->risn, tcp_hdr(skb)->seq);
+ ao->rcv_sne = 0;
+
+ hlist_for_each_entry_rcu(key, &ao->head, node, lockdep_sock_is_held(sk))
+ tcp_ao_cache_traffic_keys(sk, ao, key);
+}
+
+int tcp_ao_copy_all_matching(const struct sock *sk, struct sock *newsk,
+ struct request_sock *req, struct sk_buff *skb,
+ int family)
+{
+ struct tcp_ao_key *key, *new_key, *first_key;
+ struct tcp_ao_info *new_ao, *ao;
+ struct hlist_node *key_head;
+ int l3index, ret = -ENOMEM;
+ union tcp_ao_addr *addr;
+ bool match = false;
+
+ ao = rcu_dereference(tcp_sk(sk)->ao_info);
+ if (!ao)
+ return 0;
+
+ /* New socket without TCP-AO on it */
+ if (!tcp_rsk_used_ao(req))
+ return 0;
+
+ new_ao = tcp_ao_alloc_info(GFP_ATOMIC);
+ if (!new_ao)
+ return -ENOMEM;
+ new_ao->lisn = htonl(tcp_rsk(req)->snt_isn);
+ new_ao->risn = htonl(tcp_rsk(req)->rcv_isn);
+ new_ao->ao_required = ao->ao_required;
+ new_ao->accept_icmps = ao->accept_icmps;
+
+ if (family == AF_INET) {
+ addr = (union tcp_ao_addr *)&newsk->sk_daddr;
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (family == AF_INET6) {
+ addr = (union tcp_ao_addr *)&newsk->sk_v6_daddr;
+#endif
+ } else {
+ ret = -EAFNOSUPPORT;
+ goto free_ao;
+ }
+ l3index = l3mdev_master_ifindex_by_index(sock_net(newsk),
+ newsk->sk_bound_dev_if);
+
+ hlist_for_each_entry_rcu(key, &ao->head, node) {
+ if (tcp_ao_key_cmp(key, l3index, addr, key->prefixlen, family, -1, -1))
+ continue;
+
+ new_key = tcp_ao_copy_key(newsk, key);
+ if (!new_key)
+ goto free_and_exit;
+
+ tcp_ao_cache_traffic_keys(newsk, new_ao, new_key);
+ tcp_ao_link_mkt(new_ao, new_key);
+ match = true;
+ }
+
+ if (!match) {
+ /* RFC5925 (7.4.1) specifies that the TCP-AO status
+ * of a connection is determined on the initial SYN.
+ * At this point the connection was TCP-AO enabled, so
+ * it can't switch to being unsigned if peer's key
+ * disappears on the listening socket.
+ */
+ ret = -EKEYREJECTED;
+ goto free_and_exit;
+ }
+
+ if (!static_key_fast_inc_not_disabled(&tcp_ao_needed.key.key)) {
+ ret = -EUSERS;
+ goto free_and_exit;
+ }
+
+ key_head = rcu_dereference(hlist_first_rcu(&new_ao->head));
+ first_key = hlist_entry_safe(key_head, struct tcp_ao_key, node);
+
+ key = tcp_ao_established_key(req_to_sk(req), new_ao, tcp_rsk(req)->ao_keyid, -1);
+ if (key)
+ new_ao->current_key = key;
+ else
+ new_ao->current_key = first_key;
+
+ /* set rnext_key */
+ key = tcp_ao_established_key(req_to_sk(req), new_ao, -1, tcp_rsk(req)->ao_rcv_next);
+ if (key)
+ new_ao->rnext_key = key;
+ else
+ new_ao->rnext_key = first_key;
+
+ sk_gso_disable(newsk);
+ rcu_assign_pointer(tcp_sk(newsk)->ao_info, new_ao);
+
+ return 0;
+
+free_and_exit:
+ hlist_for_each_entry_safe(key, key_head, &new_ao->head, node) {
+ hlist_del(&key->node);
+ tcp_sigpool_release(key->tcp_sigpool_id);
+ atomic_sub(tcp_ao_sizeof_key(key), &newsk->sk_omem_alloc);
+ kfree_sensitive(key);
+ }
+free_ao:
+ kfree(new_ao);
+ return ret;
+}
+
+static bool tcp_ao_can_set_current_rnext(struct sock *sk)
+{
+ /* There aren't current/rnext keys on TCP_LISTEN sockets */
+ if (sk->sk_state == TCP_LISTEN)
+ return false;
+ return true;
+}
+
+static int tcp_ao_verify_ipv4(struct sock *sk, struct tcp_ao_add *cmd,
+ union tcp_ao_addr **addr)
+{
+ struct sockaddr_in *sin = (struct sockaddr_in *)&cmd->addr;
+ struct inet_sock *inet = inet_sk(sk);
+
+ if (sin->sin_family != AF_INET)
+ return -EINVAL;
+
+ /* Currently matching is not performed on port (or port ranges) */
+ if (sin->sin_port != 0)
+ return -EINVAL;
+
+ /* Check prefix and trailing 0's in addr */
+ if (cmd->prefix != 0) {
+ __be32 mask;
+
+ if (ntohl(sin->sin_addr.s_addr) == INADDR_ANY)
+ return -EINVAL;
+ if (cmd->prefix > 32)
+ return -EINVAL;
+
+ mask = inet_make_mask(cmd->prefix);
+ if (sin->sin_addr.s_addr & ~mask)
+ return -EINVAL;
+
+ /* Check that MKT address is consistent with socket */
+ if (ntohl(inet->inet_daddr) != INADDR_ANY &&
+ (inet->inet_daddr & mask) != sin->sin_addr.s_addr)
+ return -EINVAL;
+ } else {
+ if (ntohl(sin->sin_addr.s_addr) != INADDR_ANY)
+ return -EINVAL;
+ }
+
+ *addr = (union tcp_ao_addr *)&sin->sin_addr;
+ return 0;
+}
+
+static int tcp_ao_parse_crypto(struct tcp_ao_add *cmd, struct tcp_ao_key *key)
+{
+ unsigned int syn_tcp_option_space;
+ bool is_kdf_aes_128_cmac = false;
+ struct crypto_ahash *tfm;
+ struct tcp_sigpool hp;
+ void *tmp_key = NULL;
+ int err;
+
+ /* RFC5926, 3.1.1.2. KDF_AES_128_CMAC */
+ if (!strcmp("cmac(aes128)", cmd->alg_name)) {
+ strscpy(cmd->alg_name, "cmac(aes)", sizeof(cmd->alg_name));
+ is_kdf_aes_128_cmac = (cmd->keylen != 16);
+ tmp_key = kmalloc(cmd->keylen, GFP_KERNEL);
+ if (!tmp_key)
+ return -ENOMEM;
+ }
+
+ key->maclen = cmd->maclen ?: 12; /* 12 is the default in RFC5925 */
+
+ /* Check: maclen + tcp-ao header <= (MAX_TCP_OPTION_SPACE - mss
+ * - tstamp (including sackperm)
+ * - wscale),
+ * see tcp_syn_options(), tcp_synack_options(), commit 33ad798c924b.
+ *
+ * In order to allow D-SACK with TCP-AO, the header size should be:
+ * (MAX_TCP_OPTION_SPACE - TCPOLEN_TSTAMP_ALIGNED
+ * - TCPOLEN_SACK_BASE_ALIGNED
+ * - 2 * TCPOLEN_SACK_PERBLOCK) = 8 (maclen = 4),
+ * see tcp_established_options().
+ *
+ * RFC5925, 2.2:
+ * Typical MACs are 96-128 bits (12-16 bytes), but any length
+ * that fits in the header of the segment being authenticated
+ * is allowed.
+ *
+ * RFC5925, 7.6:
+ * TCP-AO continues to consume 16 bytes in non-SYN segments,
+ * leaving a total of 24 bytes for other options, of which
+ * the timestamp consumes 10. This leaves 14 bytes, of which 10
+ * are used for a single SACK block. When two SACK blocks are used,
+ * such as to handle D-SACK, a smaller TCP-AO MAC would be required
+ * to make room for the additional SACK block (i.e., to leave 18
+ * bytes for the D-SACK variant of the SACK option) [RFC2883].
+ * Note that D-SACK is not supportable in TCP MD5 in the presence
+ * of timestamps, because TCP MD5’s MAC length is fixed and too
+ * large to leave sufficient option space.
+ */
+ syn_tcp_option_space = MAX_TCP_OPTION_SPACE;
+ syn_tcp_option_space -= TCPOLEN_MSS_ALIGNED;
+ syn_tcp_option_space -= TCPOLEN_TSTAMP_ALIGNED;
+ syn_tcp_option_space -= TCPOLEN_WSCALE_ALIGNED;
+ if (tcp_ao_len_aligned(key) > syn_tcp_option_space) {
+ err = -EMSGSIZE;
+ goto err_kfree;
+ }
+
+ key->keylen = cmd->keylen;
+ memcpy(key->key, cmd->key, cmd->keylen);
+
+ err = tcp_sigpool_start(key->tcp_sigpool_id, &hp);
+ if (err)
+ goto err_kfree;
+
+ tfm = crypto_ahash_reqtfm(hp.req);
+ if (is_kdf_aes_128_cmac) {
+ void *scratch = hp.scratch;
+ struct scatterlist sg;
+
+ memcpy(tmp_key, cmd->key, cmd->keylen);
+ sg_init_one(&sg, tmp_key, cmd->keylen);
+
+ /* Using zero-key of 16 bytes as described in RFC5926 */
+ memset(scratch, 0, 16);
+ err = crypto_ahash_setkey(tfm, scratch, 16);
+ if (err)
+ goto err_pool_end;
+
+ err = crypto_ahash_init(hp.req);
+ if (err)
+ goto err_pool_end;
+
+ ahash_request_set_crypt(hp.req, &sg, key->key, cmd->keylen);
+ err = crypto_ahash_update(hp.req);
+ if (err)
+ goto err_pool_end;
+
+ err |= crypto_ahash_final(hp.req);
+ if (err)
+ goto err_pool_end;
+ key->keylen = 16;
+ }
+
+ err = crypto_ahash_setkey(tfm, key->key, key->keylen);
+ if (err)
+ goto err_pool_end;
+
+ tcp_sigpool_end(&hp);
+ kfree_sensitive(tmp_key);
+
+ if (tcp_ao_maclen(key) > key->digest_size)
+ return -EINVAL;
+
+ return 0;
+
+err_pool_end:
+ tcp_sigpool_end(&hp);
+err_kfree:
+ kfree_sensitive(tmp_key);
+ return err;
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static int tcp_ao_verify_ipv6(struct sock *sk, struct tcp_ao_add *cmd,
+ union tcp_ao_addr **paddr,
+ unsigned short int *family)
+{
+ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&cmd->addr;
+ struct in6_addr *addr = &sin6->sin6_addr;
+ u8 prefix = cmd->prefix;
+
+ if (sin6->sin6_family != AF_INET6)
+ return -EINVAL;
+
+ /* Currently matching is not performed on port (or port ranges) */
+ if (sin6->sin6_port != 0)
+ return -EINVAL;
+
+ /* Check prefix and trailing 0's in addr */
+ if (cmd->prefix != 0 && ipv6_addr_v4mapped(addr)) {
+ __be32 addr4 = addr->s6_addr32[3];
+ __be32 mask;
+
+ if (prefix > 32 || ntohl(addr4) == INADDR_ANY)
+ return -EINVAL;
+
+ mask = inet_make_mask(prefix);
+ if (addr4 & ~mask)
+ return -EINVAL;
+
+ /* Check that MKT address is consistent with socket */
+ if (!ipv6_addr_any(&sk->sk_v6_daddr)) {
+ __be32 daddr4 = sk->sk_v6_daddr.s6_addr32[3];
+
+ if (!ipv6_addr_v4mapped(&sk->sk_v6_daddr))
+ return -EINVAL;
+ if ((daddr4 & mask) != addr4)
+ return -EINVAL;
+ }
+
+ *paddr = (union tcp_ao_addr *)&addr->s6_addr32[3];
+ *family = AF_INET;
+ return 0;
+ } else if (cmd->prefix != 0) {
+ struct in6_addr pfx;
+
+ if (ipv6_addr_any(addr) || prefix > 128)
+ return -EINVAL;
+
+ ipv6_addr_prefix(&pfx, addr, prefix);
+ if (ipv6_addr_cmp(&pfx, addr))
+ return -EINVAL;
+
+ /* Check that MKT address is consistent with socket */
+ if (!ipv6_addr_any(&sk->sk_v6_daddr) &&
+ !ipv6_prefix_equal(&sk->sk_v6_daddr, addr, prefix))
+
+ return -EINVAL;
+ } else {
+ if (!ipv6_addr_any(addr))
+ return -EINVAL;
+ }
+
+ *paddr = (union tcp_ao_addr *)addr;
+ return 0;
+}
+#else
+static int tcp_ao_verify_ipv6(struct sock *sk, struct tcp_ao_add *cmd,
+ union tcp_ao_addr **paddr,
+ unsigned short int *family)
+{
+ return -EOPNOTSUPP;
+}
+#endif
+
+static struct tcp_ao_info *setsockopt_ao_info(struct sock *sk)
+{
+ if (sk_fullsock(sk)) {
+ return rcu_dereference_protected(tcp_sk(sk)->ao_info,
+ lockdep_sock_is_held(sk));
+ } else if (sk->sk_state == TCP_TIME_WAIT) {
+ return rcu_dereference_protected(tcp_twsk(sk)->ao_info,
+ lockdep_sock_is_held(sk));
+ }
+ return ERR_PTR(-ESOCKTNOSUPPORT);
+}
+
+static struct tcp_ao_info *getsockopt_ao_info(struct sock *sk)
+{
+ if (sk_fullsock(sk))
+ return rcu_dereference(tcp_sk(sk)->ao_info);
+ else if (sk->sk_state == TCP_TIME_WAIT)
+ return rcu_dereference(tcp_twsk(sk)->ao_info);
+
+ return ERR_PTR(-ESOCKTNOSUPPORT);
+}
+
+#define TCP_AO_KEYF_ALL (TCP_AO_KEYF_IFINDEX | TCP_AO_KEYF_EXCLUDE_OPT)
+#define TCP_AO_GET_KEYF_VALID (TCP_AO_KEYF_IFINDEX)
+
+static struct tcp_ao_key *tcp_ao_key_alloc(struct sock *sk,
+ struct tcp_ao_add *cmd)
+{
+ const char *algo = cmd->alg_name;
+ unsigned int digest_size;
+ struct crypto_ahash *tfm;
+ struct tcp_ao_key *key;
+ struct tcp_sigpool hp;
+ int err, pool_id;
+ size_t size;
+
+ /* Force null-termination of alg_name */
+ cmd->alg_name[ARRAY_SIZE(cmd->alg_name) - 1] = '\0';
+
+ /* RFC5926, 3.1.1.2. KDF_AES_128_CMAC */
+ if (!strcmp("cmac(aes128)", algo))
+ algo = "cmac(aes)";
+
+ /* Full TCP header (th->doff << 2) should fit into scratch area,
+ * see tcp_ao_hash_header().
+ */
+ pool_id = tcp_sigpool_alloc_ahash(algo, 60);
+ if (pool_id < 0)
+ return ERR_PTR(pool_id);
+
+ err = tcp_sigpool_start(pool_id, &hp);
+ if (err)
+ goto err_free_pool;
+
+ tfm = crypto_ahash_reqtfm(hp.req);
+ digest_size = crypto_ahash_digestsize(tfm);
+ tcp_sigpool_end(&hp);
+
+ size = sizeof(struct tcp_ao_key) + (digest_size << 1);
+ key = sock_kmalloc(sk, size, GFP_KERNEL);
+ if (!key) {
+ err = -ENOMEM;
+ goto err_free_pool;
+ }
+
+ key->tcp_sigpool_id = pool_id;
+ key->digest_size = digest_size;
+ return key;
+
+err_free_pool:
+ tcp_sigpool_release(pool_id);
+ return ERR_PTR(err);
+}
+
+static int tcp_ao_add_cmd(struct sock *sk, unsigned short int family,
+ sockptr_t optval, int optlen)
+{
+ struct tcp_ao_info *ao_info;
+ union tcp_ao_addr *addr;
+ struct tcp_ao_key *key;
+ struct tcp_ao_add cmd;
+ int ret, l3index = 0;
+ bool first = false;
+
+ if (optlen < sizeof(cmd))
+ return -EINVAL;
+
+ ret = copy_struct_from_sockptr(&cmd, sizeof(cmd), optval, optlen);
+ if (ret)
+ return ret;
+
+ if (cmd.keylen > TCP_AO_MAXKEYLEN)
+ return -EINVAL;
+
+ if (cmd.reserved != 0 || cmd.reserved2 != 0)
+ return -EINVAL;
+
+ if (family == AF_INET)
+ ret = tcp_ao_verify_ipv4(sk, &cmd, &addr);
+ else
+ ret = tcp_ao_verify_ipv6(sk, &cmd, &addr, &family);
+ if (ret)
+ return ret;
+
+ if (cmd.keyflags & ~TCP_AO_KEYF_ALL)
+ return -EINVAL;
+
+ if (cmd.set_current || cmd.set_rnext) {
+ if (!tcp_ao_can_set_current_rnext(sk))
+ return -EINVAL;
+ }
+
+ if (cmd.ifindex && !(cmd.keyflags & TCP_AO_KEYF_IFINDEX))
+ return -EINVAL;
+
+ /* For cmd.tcp_ifindex = 0 the key will apply to the default VRF */
+ if (cmd.keyflags & TCP_AO_KEYF_IFINDEX && cmd.ifindex) {
+ int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
+ struct net_device *dev;
+
+ rcu_read_lock();
+ dev = dev_get_by_index_rcu(sock_net(sk), cmd.ifindex);
+ if (dev && netif_is_l3_master(dev))
+ l3index = dev->ifindex;
+ rcu_read_unlock();
+
+ if (!dev || !l3index)
+ return -EINVAL;
+
+ if (!bound_dev_if || bound_dev_if != cmd.ifindex) {
+ /* tcp_ao_established_key() doesn't expect having
+ * non peer-matching key on an established TCP-AO
+ * connection.
+ */
+ if (!((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)))
+ return -EINVAL;
+ }
+
+ /* It's still possible to bind after adding keys or even
+ * re-bind to a different dev (with CAP_NET_RAW).
+ * So, no reason to return error here, rather try to be
+ * nice and warn the user.
+ */
+ if (bound_dev_if && bound_dev_if != cmd.ifindex)
+ net_warn_ratelimited("AO key ifindex %d != sk bound ifindex %d\n",
+ cmd.ifindex, bound_dev_if);
+ }
+
+ /* Don't allow keys for peers that have a matching TCP-MD5 key */
+ if (cmd.keyflags & TCP_AO_KEYF_IFINDEX) {
+ /* Non-_exact version of tcp_md5_do_lookup() will
+ * as well match keys that aren't bound to a specific VRF
+ * (that will make them match AO key with
+ * sysctl_tcp_l3dev_accept = 1
+ */
+ if (tcp_md5_do_lookup(sk, l3index, addr, family))
+ return -EKEYREJECTED;
+ } else {
+ if (tcp_md5_do_lookup_any_l3index(sk, addr, family))
+ return -EKEYREJECTED;
+ }
+
+ ao_info = setsockopt_ao_info(sk);
+ if (IS_ERR(ao_info))
+ return PTR_ERR(ao_info);
+
+ if (!ao_info) {
+ ao_info = tcp_ao_alloc_info(GFP_KERNEL);
+ if (!ao_info)
+ return -ENOMEM;
+ first = true;
+ } else {
+ /* Check that neither RecvID nor SendID match any
+ * existing key for the peer, RFC5925 3.1:
+ * > The IDs of MKTs MUST NOT overlap where their
+ * > TCP connection identifiers overlap.
+ */
+ if (__tcp_ao_do_lookup(sk, l3index, addr, family, cmd.prefix, -1, cmd.rcvid))
+ return -EEXIST;
+ if (__tcp_ao_do_lookup(sk, l3index, addr, family,
+ cmd.prefix, cmd.sndid, -1))
+ return -EEXIST;
+ }
+
+ key = tcp_ao_key_alloc(sk, &cmd);
+ if (IS_ERR(key)) {
+ ret = PTR_ERR(key);
+ goto err_free_ao;
+ }
+
+ INIT_HLIST_NODE(&key->node);
+ memcpy(&key->addr, addr, (family == AF_INET) ? sizeof(struct in_addr) :
+ sizeof(struct in6_addr));
+ key->prefixlen = cmd.prefix;
+ key->family = family;
+ key->keyflags = cmd.keyflags;
+ key->sndid = cmd.sndid;
+ key->rcvid = cmd.rcvid;
+ key->l3index = l3index;
+ atomic64_set(&key->pkt_good, 0);
+ atomic64_set(&key->pkt_bad, 0);
+
+ ret = tcp_ao_parse_crypto(&cmd, key);
+ if (ret < 0)
+ goto err_free_sock;
+
+ if (!((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))) {
+ tcp_ao_cache_traffic_keys(sk, ao_info, key);
+ if (first) {
+ ao_info->current_key = key;
+ ao_info->rnext_key = key;
+ }
+ }
+
+ tcp_ao_link_mkt(ao_info, key);
+ if (first) {
+ if (!static_branch_inc(&tcp_ao_needed.key)) {
+ ret = -EUSERS;
+ goto err_free_sock;
+ }
+ sk_gso_disable(sk);
+ rcu_assign_pointer(tcp_sk(sk)->ao_info, ao_info);
+ }
+
+ if (cmd.set_current)
+ WRITE_ONCE(ao_info->current_key, key);
+ if (cmd.set_rnext)
+ WRITE_ONCE(ao_info->rnext_key, key);
+ return 0;
+
+err_free_sock:
+ atomic_sub(tcp_ao_sizeof_key(key), &sk->sk_omem_alloc);
+ tcp_sigpool_release(key->tcp_sigpool_id);
+ kfree_sensitive(key);
+err_free_ao:
+ if (first)
+ kfree(ao_info);
+ return ret;
+}
+
+static int tcp_ao_delete_key(struct sock *sk, struct tcp_ao_info *ao_info,
+ bool del_async, struct tcp_ao_key *key,
+ struct tcp_ao_key *new_current,
+ struct tcp_ao_key *new_rnext)
+{
+ int err;
+
+ hlist_del_rcu(&key->node);
+
+ /* Support for async delete on listening sockets: as they don't
+ * need current_key/rnext_key maintaining, we don't need to check
+ * them and we can just free all resources in RCU fashion.
+ */
+ if (del_async) {
+ atomic_sub(tcp_ao_sizeof_key(key), &sk->sk_omem_alloc);
+ call_rcu(&key->rcu, tcp_ao_key_free_rcu);
+ return 0;
+ }
+
+ /* At this moment another CPU could have looked this key up
+ * while it was unlinked from the list. Wait for RCU grace period,
+ * after which the key is off-list and can't be looked up again;
+ * the rx path [just before RCU came] might have used it and set it
+ * as current_key (very unlikely).
+ * Free the key with next RCU grace period (in case it was
+ * current_key before tcp_ao_current_rnext() might have
+ * changed it in forced-delete).
+ */
+ synchronize_rcu();
+ if (new_current)
+ WRITE_ONCE(ao_info->current_key, new_current);
+ if (new_rnext)
+ WRITE_ONCE(ao_info->rnext_key, new_rnext);
+
+ if (unlikely(READ_ONCE(ao_info->current_key) == key ||
+ READ_ONCE(ao_info->rnext_key) == key)) {
+ err = -EBUSY;
+ goto add_key;
+ }
+
+ atomic_sub(tcp_ao_sizeof_key(key), &sk->sk_omem_alloc);
+ call_rcu(&key->rcu, tcp_ao_key_free_rcu);
+
+ return 0;
+add_key:
+ hlist_add_head_rcu(&key->node, &ao_info->head);
+ return err;
+}
+
+#define TCP_AO_DEL_KEYF_ALL (TCP_AO_KEYF_IFINDEX)
+static int tcp_ao_del_cmd(struct sock *sk, unsigned short int family,
+ sockptr_t optval, int optlen)
+{
+ struct tcp_ao_key *key, *new_current = NULL, *new_rnext = NULL;
+ int err, addr_len, l3index = 0;
+ struct tcp_ao_info *ao_info;
+ union tcp_ao_addr *addr;
+ struct tcp_ao_del cmd;
+ __u8 prefix;
+ u16 port;
+
+ if (optlen < sizeof(cmd))
+ return -EINVAL;
+
+ err = copy_struct_from_sockptr(&cmd, sizeof(cmd), optval, optlen);
+ if (err)
+ return err;
+
+ if (cmd.reserved != 0 || cmd.reserved2 != 0)
+ return -EINVAL;
+
+ if (cmd.set_current || cmd.set_rnext) {
+ if (!tcp_ao_can_set_current_rnext(sk))
+ return -EINVAL;
+ }
+
+ if (cmd.keyflags & ~TCP_AO_DEL_KEYF_ALL)
+ return -EINVAL;
+
+ /* No sanity check for TCP_AO_KEYF_IFINDEX as if a VRF
+ * was destroyed, there still should be a way to delete keys,
+ * that were bound to that l3intf. So, fail late at lookup stage
+ * if there is no key for that ifindex.
+ */
+ if (cmd.ifindex && !(cmd.keyflags & TCP_AO_KEYF_IFINDEX))
+ return -EINVAL;
+
+ ao_info = setsockopt_ao_info(sk);
+ if (IS_ERR(ao_info))
+ return PTR_ERR(ao_info);
+ if (!ao_info)
+ return -ENOENT;
+
+ /* For sockets in TCP_CLOSED it's possible set keys that aren't
+ * matching the future peer (address/VRF/etc),
+ * tcp_ao_connect_init() will choose a correct matching MKT
+ * if there's any.
+ */
+ if (cmd.set_current) {
+ new_current = tcp_ao_established_key(sk, ao_info, cmd.current_key, -1);
+ if (!new_current)
+ return -ENOENT;
+ }
+ if (cmd.set_rnext) {
+ new_rnext = tcp_ao_established_key(sk, ao_info, -1, cmd.rnext);
+ if (!new_rnext)
+ return -ENOENT;
+ }
+ if (cmd.del_async && sk->sk_state != TCP_LISTEN)
+ return -EINVAL;
+
+ if (family == AF_INET) {
+ struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.addr;
+
+ addr = (union tcp_ao_addr *)&sin->sin_addr;
+ addr_len = sizeof(struct in_addr);
+ port = ntohs(sin->sin_port);
+ } else {
+ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&cmd.addr;
+ struct in6_addr *addr6 = &sin6->sin6_addr;
+
+ if (ipv6_addr_v4mapped(addr6)) {
+ addr = (union tcp_ao_addr *)&addr6->s6_addr32[3];
+ addr_len = sizeof(struct in_addr);
+ family = AF_INET;
+ } else {
+ addr = (union tcp_ao_addr *)addr6;
+ addr_len = sizeof(struct in6_addr);
+ }
+ port = ntohs(sin6->sin6_port);
+ }
+ prefix = cmd.prefix;
+
+ /* Currently matching is not performed on port (or port ranges) */
+ if (port != 0)
+ return -EINVAL;
+
+ /* We could choose random present key here for current/rnext
+ * but that's less predictable. Let's be strict and don't
+ * allow removing a key that's in use. RFC5925 doesn't
+ * specify how-to coordinate key removal, but says:
+ * "It is presumed that an MKT affecting a particular
+ * connection cannot be destroyed during an active connection"
+ */
+ hlist_for_each_entry_rcu(key, &ao_info->head, node,
+ lockdep_sock_is_held(sk)) {
+ if (cmd.sndid != key->sndid ||
+ cmd.rcvid != key->rcvid)
+ continue;
+
+ if (family != key->family ||
+ prefix != key->prefixlen ||
+ memcmp(addr, &key->addr, addr_len))
+ continue;
+
+ if ((cmd.keyflags & TCP_AO_KEYF_IFINDEX) !=
+ (key->keyflags & TCP_AO_KEYF_IFINDEX))
+ continue;
+
+ if (key->l3index != l3index)
+ continue;
+
+ if (key == new_current || key == new_rnext)
+ continue;
+
+ return tcp_ao_delete_key(sk, ao_info, cmd.del_async, key,
+ new_current, new_rnext);
+ }
+ return -ENOENT;
+}
+
+/* cmd.ao_required makes a socket TCP-AO only.
+ * Don't allow any md5 keys for any l3intf on the socket together with it.
+ * Restricting it early in setsockopt() removes a check for
+ * ao_info->ao_required on inbound tcp segment fast-path.
+ */
+static int tcp_ao_required_verify(struct sock *sk)
+{
+#ifdef CONFIG_TCP_MD5SIG
+ const struct tcp_md5sig_info *md5sig;
+
+ if (!static_branch_unlikely(&tcp_md5_needed.key))
+ return 0;
+
+ md5sig = rcu_dereference_check(tcp_sk(sk)->md5sig_info,
+ lockdep_sock_is_held(sk));
+ if (!md5sig)
+ return 0;
+
+ if (rcu_dereference_check(hlist_first_rcu(&md5sig->head),
+ lockdep_sock_is_held(sk)))
+ return 1;
+#endif
+ return 0;
+}
+
+static int tcp_ao_info_cmd(struct sock *sk, unsigned short int family,
+ sockptr_t optval, int optlen)
+{
+ struct tcp_ao_key *new_current = NULL, *new_rnext = NULL;
+ struct tcp_ao_info *ao_info;
+ struct tcp_ao_info_opt cmd;
+ bool first = false;
+ int err;
+
+ if (optlen < sizeof(cmd))
+ return -EINVAL;
+
+ err = copy_struct_from_sockptr(&cmd, sizeof(cmd), optval, optlen);
+ if (err)
+ return err;
+
+ if (cmd.set_current || cmd.set_rnext) {
+ if (!tcp_ao_can_set_current_rnext(sk))
+ return -EINVAL;
+ }
+
+ if (cmd.reserved != 0 || cmd.reserved2 != 0)
+ return -EINVAL;
+
+ ao_info = setsockopt_ao_info(sk);
+ if (IS_ERR(ao_info))
+ return PTR_ERR(ao_info);
+ if (!ao_info) {
+ if (!((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)))
+ return -EINVAL;
+ ao_info = tcp_ao_alloc_info(GFP_KERNEL);
+ if (!ao_info)
+ return -ENOMEM;
+ first = true;
+ }
+
+ if (cmd.ao_required && tcp_ao_required_verify(sk)) {
+ err = -EKEYREJECTED;
+ goto out;
+ }
+
+ /* For sockets in TCP_CLOSED it's possible set keys that aren't
+ * matching the future peer (address/port/VRF/etc),
+ * tcp_ao_connect_init() will choose a correct matching MKT
+ * if there's any.
+ */
+ if (cmd.set_current) {
+ new_current = tcp_ao_established_key(sk, ao_info, cmd.current_key, -1);
+ if (!new_current) {
+ err = -ENOENT;
+ goto out;
+ }
+ }
+ if (cmd.set_rnext) {
+ new_rnext = tcp_ao_established_key(sk, ao_info, -1, cmd.rnext);
+ if (!new_rnext) {
+ err = -ENOENT;
+ goto out;
+ }
+ }
+ if (cmd.set_counters) {
+ atomic64_set(&ao_info->counters.pkt_good, cmd.pkt_good);
+ atomic64_set(&ao_info->counters.pkt_bad, cmd.pkt_bad);
+ atomic64_set(&ao_info->counters.key_not_found, cmd.pkt_key_not_found);
+ atomic64_set(&ao_info->counters.ao_required, cmd.pkt_ao_required);
+ atomic64_set(&ao_info->counters.dropped_icmp, cmd.pkt_dropped_icmp);
+ }
+
+ ao_info->ao_required = cmd.ao_required;
+ ao_info->accept_icmps = cmd.accept_icmps;
+ if (new_current)
+ WRITE_ONCE(ao_info->current_key, new_current);
+ if (new_rnext)
+ WRITE_ONCE(ao_info->rnext_key, new_rnext);
+ if (first) {
+ if (!static_branch_inc(&tcp_ao_needed.key)) {
+ err = -EUSERS;
+ goto out;
+ }
+ sk_gso_disable(sk);
+ rcu_assign_pointer(tcp_sk(sk)->ao_info, ao_info);
+ }
+ return 0;
+out:
+ if (first)
+ kfree(ao_info);
+ return err;
+}
+
+int tcp_parse_ao(struct sock *sk, int cmd, unsigned short int family,
+ sockptr_t optval, int optlen)
+{
+ if (WARN_ON_ONCE(family != AF_INET && family != AF_INET6))
+ return -EAFNOSUPPORT;
+
+ switch (cmd) {
+ case TCP_AO_ADD_KEY:
+ return tcp_ao_add_cmd(sk, family, optval, optlen);
+ case TCP_AO_DEL_KEY:
+ return tcp_ao_del_cmd(sk, family, optval, optlen);
+ case TCP_AO_INFO:
+ return tcp_ao_info_cmd(sk, family, optval, optlen);
+ default:
+ WARN_ON_ONCE(1);
+ return -EINVAL;
+ }
+}
+
+int tcp_v4_parse_ao(struct sock *sk, int cmd, sockptr_t optval, int optlen)
+{
+ return tcp_parse_ao(sk, cmd, AF_INET, optval, optlen);
+}
+
+/* tcp_ao_copy_mkts_to_user(ao_info, optval, optlen)
+ *
+ * @ao_info: struct tcp_ao_info on the socket that
+ * socket getsockopt(TCP_AO_GET_KEYS) is executed on
+ * @optval: pointer to array of tcp_ao_getsockopt structures in user space.
+ * Must be != NULL.
+ * @optlen: pointer to size of tcp_ao_getsockopt structure.
+ * Must be != NULL.
+ *
+ * Return value: 0 on success, a negative error number otherwise.
+ *
+ * optval points to an array of tcp_ao_getsockopt structures in user space.
+ * optval[0] is used as both input and output to getsockopt. It determines
+ * which keys are returned by the kernel.
+ * optval[0].nkeys is the size of the array in user space. On return it contains
+ * the number of keys matching the search criteria.
+ * If tcp_ao_getsockopt::get_all is set, then all keys in the socket are
+ * returned, otherwise only keys matching <addr, prefix, sndid, rcvid>
+ * in optval[0] are returned.
+ * optlen is also used as both input and output. The user provides the size
+ * of struct tcp_ao_getsockopt in user space, and the kernel returns the size
+ * of the structure in kernel space.
+ * The size of struct tcp_ao_getsockopt may differ between user and kernel.
+ * There are three cases to consider:
+ * * If usize == ksize, then keys are copied verbatim.
+ * * If usize < ksize, then the userspace has passed an old struct to a
+ * newer kernel. The rest of the trailing bytes in optval[0]
+ * (ksize - usize) are interpreted as 0 by the kernel.
+ * * If usize > ksize, then the userspace has passed a new struct to an
+ * older kernel. The trailing bytes unknown to the kernel (usize - ksize)
+ * are checked to ensure they are zeroed, otherwise -E2BIG is returned.
+ * On return the kernel fills in min(usize, ksize) in each entry of the array.
+ * The layout of the fields in the user and kernel structures is expected to
+ * be the same (including in the 32bit vs 64bit case).
+ */
+static int tcp_ao_copy_mkts_to_user(const struct sock *sk,
+ struct tcp_ao_info *ao_info,
+ sockptr_t optval, sockptr_t optlen)
+{
+ struct tcp_ao_getsockopt opt_in, opt_out;
+ struct tcp_ao_key *key, *current_key;
+ bool do_address_matching = true;
+ union tcp_ao_addr *addr = NULL;
+ int err, l3index, user_len;
+ unsigned int max_keys; /* maximum number of keys to copy to user */
+ size_t out_offset = 0;
+ size_t bytes_to_write; /* number of bytes to write to user level */
+ u32 matched_keys; /* keys from ao_info matched so far */
+ int optlen_out;
+ __be16 port = 0;
+
+ if (copy_from_sockptr(&user_len, optlen, sizeof(int)))
+ return -EFAULT;
+
+ if (user_len <= 0)
+ return -EINVAL;
+
+ memset(&opt_in, 0, sizeof(struct tcp_ao_getsockopt));
+ err = copy_struct_from_sockptr(&opt_in, sizeof(opt_in),
+ optval, user_len);
+ if (err < 0)
+ return err;
+
+ if (opt_in.pkt_good || opt_in.pkt_bad)
+ return -EINVAL;
+ if (opt_in.keyflags & ~TCP_AO_GET_KEYF_VALID)
+ return -EINVAL;
+ if (opt_in.ifindex && !(opt_in.keyflags & TCP_AO_KEYF_IFINDEX))
+ return -EINVAL;
+
+ if (opt_in.reserved != 0)
+ return -EINVAL;
+
+ max_keys = opt_in.nkeys;
+ l3index = (opt_in.keyflags & TCP_AO_KEYF_IFINDEX) ? opt_in.ifindex : -1;
+
+ if (opt_in.get_all || opt_in.is_current || opt_in.is_rnext) {
+ if (opt_in.get_all && (opt_in.is_current || opt_in.is_rnext))
+ return -EINVAL;
+ do_address_matching = false;
+ }
+
+ switch (opt_in.addr.ss_family) {
+ case AF_INET: {
+ struct sockaddr_in *sin;
+ __be32 mask;
+
+ sin = (struct sockaddr_in *)&opt_in.addr;
+ port = sin->sin_port;
+ addr = (union tcp_ao_addr *)&sin->sin_addr;
+
+ if (opt_in.prefix > 32)
+ return -EINVAL;
+
+ if (ntohl(sin->sin_addr.s_addr) == INADDR_ANY &&
+ opt_in.prefix != 0)
+ return -EINVAL;
+
+ mask = inet_make_mask(opt_in.prefix);
+ if (sin->sin_addr.s_addr & ~mask)
+ return -EINVAL;
+
+ break;
+ }
+ case AF_INET6: {
+ struct sockaddr_in6 *sin6;
+ struct in6_addr *addr6;
+
+ sin6 = (struct sockaddr_in6 *)&opt_in.addr;
+ addr = (union tcp_ao_addr *)&sin6->sin6_addr;
+ addr6 = &sin6->sin6_addr;
+ port = sin6->sin6_port;
+
+ /* We don't have to change family and @addr here if
+ * ipv6_addr_v4mapped() like in key adding:
+ * tcp_ao_key_cmp() does it. Do the sanity checks though.
+ */
+ if (opt_in.prefix != 0) {
+ if (ipv6_addr_v4mapped(addr6)) {
+ __be32 mask, addr4 = addr6->s6_addr32[3];
+
+ if (opt_in.prefix > 32 ||
+ ntohl(addr4) == INADDR_ANY)
+ return -EINVAL;
+ mask = inet_make_mask(opt_in.prefix);
+ if (addr4 & ~mask)
+ return -EINVAL;
+ } else {
+ struct in6_addr pfx;
+
+ if (ipv6_addr_any(addr6) ||
+ opt_in.prefix > 128)
+ return -EINVAL;
+
+ ipv6_addr_prefix(&pfx, addr6, opt_in.prefix);
+ if (ipv6_addr_cmp(&pfx, addr6))
+ return -EINVAL;
+ }
+ } else if (!ipv6_addr_any(addr6)) {
+ return -EINVAL;
+ }
+ break;
+ }
+ case 0:
+ if (!do_address_matching)
+ break;
+ fallthrough;
+ default:
+ return -EAFNOSUPPORT;
+ }
+
+ if (!do_address_matching) {
+ /* We could just ignore those, but let's do stricter checks */
+ if (addr || port)
+ return -EINVAL;
+ if (opt_in.prefix || opt_in.sndid || opt_in.rcvid)
+ return -EINVAL;
+ }
+
+ bytes_to_write = min_t(int, user_len, sizeof(struct tcp_ao_getsockopt));
+ matched_keys = 0;
+ /* May change in RX, while we're dumping, pre-fetch it */
+ current_key = READ_ONCE(ao_info->current_key);
+
+ hlist_for_each_entry_rcu(key, &ao_info->head, node,
+ lockdep_sock_is_held(sk)) {
+ if (opt_in.get_all)
+ goto match;
+
+ if (opt_in.is_current || opt_in.is_rnext) {
+ if (opt_in.is_current && key == current_key)
+ goto match;
+ if (opt_in.is_rnext && key == ao_info->rnext_key)
+ goto match;
+ continue;
+ }
+
+ if (tcp_ao_key_cmp(key, l3index, addr, opt_in.prefix,
+ opt_in.addr.ss_family,
+ opt_in.sndid, opt_in.rcvid) != 0)
+ continue;
+match:
+ matched_keys++;
+ if (matched_keys > max_keys)
+ continue;
+
+ memset(&opt_out, 0, sizeof(struct tcp_ao_getsockopt));
+
+ if (key->family == AF_INET) {
+ struct sockaddr_in *sin_out = (struct sockaddr_in *)&opt_out.addr;
+
+ sin_out->sin_family = key->family;
+ sin_out->sin_port = 0;
+ memcpy(&sin_out->sin_addr, &key->addr, sizeof(struct in_addr));
+ } else {
+ struct sockaddr_in6 *sin6_out = (struct sockaddr_in6 *)&opt_out.addr;
+
+ sin6_out->sin6_family = key->family;
+ sin6_out->sin6_port = 0;
+ memcpy(&sin6_out->sin6_addr, &key->addr, sizeof(struct in6_addr));
+ }
+ opt_out.sndid = key->sndid;
+ opt_out.rcvid = key->rcvid;
+ opt_out.prefix = key->prefixlen;
+ opt_out.keyflags = key->keyflags;
+ opt_out.is_current = (key == current_key);
+ opt_out.is_rnext = (key == ao_info->rnext_key);
+ opt_out.nkeys = 0;
+ opt_out.maclen = key->maclen;
+ opt_out.keylen = key->keylen;
+ opt_out.ifindex = key->l3index;
+ opt_out.pkt_good = atomic64_read(&key->pkt_good);
+ opt_out.pkt_bad = atomic64_read(&key->pkt_bad);
+ memcpy(&opt_out.key, key->key, key->keylen);
+ tcp_sigpool_algo(key->tcp_sigpool_id, opt_out.alg_name, 64);
+
+ /* Copy key to user */
+ if (copy_to_sockptr_offset(optval, out_offset,
+ &opt_out, bytes_to_write))
+ return -EFAULT;
+ out_offset += user_len;
+ }
+
+ optlen_out = (int)sizeof(struct tcp_ao_getsockopt);
+ if (copy_to_sockptr(optlen, &optlen_out, sizeof(int)))
+ return -EFAULT;
+
+ out_offset = offsetof(struct tcp_ao_getsockopt, nkeys);
+ if (copy_to_sockptr_offset(optval, out_offset,
+ &matched_keys, sizeof(u32)))
+ return -EFAULT;
+
+ return 0;
+}
+
+int tcp_ao_get_mkts(struct sock *sk, sockptr_t optval, sockptr_t optlen)
+{
+ struct tcp_ao_info *ao_info;
+
+ ao_info = setsockopt_ao_info(sk);
+ if (IS_ERR(ao_info))
+ return PTR_ERR(ao_info);
+ if (!ao_info)
+ return -ENOENT;
+
+ return tcp_ao_copy_mkts_to_user(sk, ao_info, optval, optlen);
+}
+
+int tcp_ao_get_sock_info(struct sock *sk, sockptr_t optval, sockptr_t optlen)
+{
+ struct tcp_ao_info_opt out, in = {};
+ struct tcp_ao_key *current_key;
+ struct tcp_ao_info *ao;
+ int err, len;
+
+ if (copy_from_sockptr(&len, optlen, sizeof(int)))
+ return -EFAULT;
+
+ if (len <= 0)
+ return -EINVAL;
+
+ /* Copying this "in" only to check ::reserved, ::reserved2,
+ * that may be needed to extend (struct tcp_ao_info_opt) and
+ * what getsockopt() provides in future.
+ */
+ err = copy_struct_from_sockptr(&in, sizeof(in), optval, len);
+ if (err)
+ return err;
+
+ if (in.reserved != 0 || in.reserved2 != 0)
+ return -EINVAL;
+
+ ao = setsockopt_ao_info(sk);
+ if (IS_ERR(ao))
+ return PTR_ERR(ao);
+ if (!ao)
+ return -ENOENT;
+
+ memset(&out, 0, sizeof(out));
+ out.ao_required = ao->ao_required;
+ out.accept_icmps = ao->accept_icmps;
+ out.pkt_good = atomic64_read(&ao->counters.pkt_good);
+ out.pkt_bad = atomic64_read(&ao->counters.pkt_bad);
+ out.pkt_key_not_found = atomic64_read(&ao->counters.key_not_found);
+ out.pkt_ao_required = atomic64_read(&ao->counters.ao_required);
+ out.pkt_dropped_icmp = atomic64_read(&ao->counters.dropped_icmp);
+
+ current_key = READ_ONCE(ao->current_key);
+ if (current_key) {
+ out.set_current = 1;
+ out.current_key = current_key->sndid;
+ }
+ if (ao->rnext_key) {
+ out.set_rnext = 1;
+ out.rnext = ao->rnext_key->rcvid;
+ }
+
+ if (copy_to_sockptr(optval, &out, min_t(int, len, sizeof(out))))
+ return -EFAULT;
+
+ return 0;
+}
+
+int tcp_ao_set_repair(struct sock *sk, sockptr_t optval, unsigned int optlen)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_ao_repair cmd;
+ struct tcp_ao_key *key;
+ struct tcp_ao_info *ao;
+ int err;
+
+ if (optlen < sizeof(cmd))
+ return -EINVAL;
+
+ err = copy_struct_from_sockptr(&cmd, sizeof(cmd), optval, optlen);
+ if (err)
+ return err;
+
+ if (!tp->repair)
+ return -EPERM;
+
+ ao = setsockopt_ao_info(sk);
+ if (IS_ERR(ao))
+ return PTR_ERR(ao);
+ if (!ao)
+ return -ENOENT;
+
+ WRITE_ONCE(ao->lisn, cmd.snt_isn);
+ WRITE_ONCE(ao->risn, cmd.rcv_isn);
+ WRITE_ONCE(ao->snd_sne, cmd.snd_sne);
+ WRITE_ONCE(ao->rcv_sne, cmd.rcv_sne);
+
+ hlist_for_each_entry_rcu(key, &ao->head, node, lockdep_sock_is_held(sk))
+ tcp_ao_cache_traffic_keys(sk, ao, key);
+
+ return 0;
+}
+
+int tcp_ao_get_repair(struct sock *sk, sockptr_t optval, sockptr_t optlen)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_ao_repair opt;
+ struct tcp_ao_info *ao;
+ int len;
+
+ if (copy_from_sockptr(&len, optlen, sizeof(int)))
+ return -EFAULT;
+
+ if (len <= 0)
+ return -EINVAL;
+
+ if (!tp->repair)
+ return -EPERM;
+
+ rcu_read_lock();
+ ao = getsockopt_ao_info(sk);
+ if (IS_ERR_OR_NULL(ao)) {
+ rcu_read_unlock();
+ return ao ? PTR_ERR(ao) : -ENOENT;
+ }
+
+ opt.snt_isn = ao->lisn;
+ opt.rcv_isn = ao->risn;
+ opt.snd_sne = READ_ONCE(ao->snd_sne);
+ opt.rcv_sne = READ_ONCE(ao->rcv_sne);
+ rcu_read_unlock();
+
+ if (copy_to_sockptr(optval, &opt, min_t(int, len, sizeof(opt))))
+ return -EFAULT;
+ return 0;
+}
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
new file mode 100644
index 000000000000..760941e55153
--- /dev/null
+++ b/net/ipv4/tcp_bbr.c
@@ -0,0 +1,1199 @@
+/* Bottleneck Bandwidth and RTT (BBR) congestion control
+ *
+ * BBR congestion control computes the sending rate based on the delivery
+ * rate (throughput) estimated from ACKs. In a nutshell:
+ *
+ * On each ACK, update our model of the network path:
+ * bottleneck_bandwidth = windowed_max(delivered / elapsed, 10 round trips)
+ * min_rtt = windowed_min(rtt, 10 seconds)
+ * pacing_rate = pacing_gain * bottleneck_bandwidth
+ * cwnd = max(cwnd_gain * bottleneck_bandwidth * min_rtt, 4)
+ *
+ * The core algorithm does not react directly to packet losses or delays,
+ * although BBR may adjust the size of next send per ACK when loss is
+ * observed, or adjust the sending rate if it estimates there is a
+ * traffic policer, in order to keep the drop rate reasonable.
+ *
+ * Here is a state transition diagram for BBR:
+ *
+ * |
+ * V
+ * +---> STARTUP ----+
+ * | | |
+ * | V |
+ * | DRAIN ----+
+ * | | |
+ * | V |
+ * +---> PROBE_BW ----+
+ * | ^ | |
+ * | | | |
+ * | +----+ |
+ * | |
+ * +---- PROBE_RTT <--+
+ *
+ * A BBR flow starts in STARTUP, and ramps up its sending rate quickly.
+ * When it estimates the pipe is full, it enters DRAIN to drain the queue.
+ * In steady state a BBR flow only uses PROBE_BW and PROBE_RTT.
+ * A long-lived BBR flow spends the vast majority of its time remaining
+ * (repeatedly) in PROBE_BW, fully probing and utilizing the pipe's bandwidth
+ * in a fair manner, with a small, bounded queue. *If* a flow has been
+ * continuously sending for the entire min_rtt window, and hasn't seen an RTT
+ * sample that matches or decreases its min_rtt estimate for 10 seconds, then
+ * it briefly enters PROBE_RTT to cut inflight to a minimum value to re-probe
+ * the path's two-way propagation delay (min_rtt). When exiting PROBE_RTT, if
+ * we estimated that we reached the full bw of the pipe then we enter PROBE_BW;
+ * otherwise we enter STARTUP to try to fill the pipe.
+ *
+ * BBR is described in detail in:
+ * "BBR: Congestion-Based Congestion Control",
+ * Neal Cardwell, Yuchung Cheng, C. Stephen Gunn, Soheil Hassas Yeganeh,
+ * Van Jacobson. ACM Queue, Vol. 14 No. 5, September-October 2016.
+ *
+ * There is a public e-mail list for discussing BBR development and testing:
+ * https://groups.google.com/forum/#!forum/bbr-dev
+ *
+ * NOTE: BBR might be used with the fq qdisc ("man tc-fq") with pacing enabled,
+ * otherwise TCP stack falls back to an internal pacing using one high
+ * resolution timer per TCP socket and may use more resources.
+ */
+#include <linux/btf.h>
+#include <linux/btf_ids.h>
+#include <linux/module.h>
+#include <net/tcp.h>
+#include <linux/inet_diag.h>
+#include <linux/inet.h>
+#include <linux/random.h>
+#include <linux/win_minmax.h>
+
+/* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth
+ * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps.
+ * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32.
+ * Since the minimum window is >=4 packets, the lower bound isn't
+ * an issue. The upper bound isn't an issue with existing technologies.
+ */
+#define BW_SCALE 24
+#define BW_UNIT (1 << BW_SCALE)
+
+#define BBR_SCALE 8 /* scaling factor for fractions in BBR (e.g. gains) */
+#define BBR_UNIT (1 << BBR_SCALE)
+
+/* BBR has the following modes for deciding how fast to send: */
+enum bbr_mode {
+ BBR_STARTUP, /* ramp up sending rate rapidly to fill pipe */
+ BBR_DRAIN, /* drain any queue created during startup */
+ BBR_PROBE_BW, /* discover, share bw: pace around estimated bw */
+ BBR_PROBE_RTT, /* cut inflight to min to probe min_rtt */
+};
+
+/* BBR congestion control block */
+struct bbr {
+ u32 min_rtt_us; /* min RTT in min_rtt_win_sec window */
+ u32 min_rtt_stamp; /* timestamp of min_rtt_us */
+ u32 probe_rtt_done_stamp; /* end time for BBR_PROBE_RTT mode */
+ struct minmax bw; /* Max recent delivery rate in pkts/uS << 24 */
+ u32 rtt_cnt; /* count of packet-timed rounds elapsed */
+ u32 next_rtt_delivered; /* scb->tx.delivered at end of round */
+ u64 cycle_mstamp; /* time of this cycle phase start */
+ u32 mode:3, /* current bbr_mode in state machine */
+ prev_ca_state:3, /* CA state on previous ACK */
+ packet_conservation:1, /* use packet conservation? */
+ round_start:1, /* start of packet-timed tx->ack round? */
+ idle_restart:1, /* restarting after idle? */
+ probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */
+ unused:13,
+ lt_is_sampling:1, /* taking long-term ("LT") samples now? */
+ lt_rtt_cnt:7, /* round trips in long-term interval */
+ lt_use_bw:1; /* use lt_bw as our bw estimate? */
+ u32 lt_bw; /* LT est delivery rate in pkts/uS << 24 */
+ u32 lt_last_delivered; /* LT intvl start: tp->delivered */
+ u32 lt_last_stamp; /* LT intvl start: tp->delivered_mstamp */
+ u32 lt_last_lost; /* LT intvl start: tp->lost */
+ u32 pacing_gain:10, /* current gain for setting pacing rate */
+ cwnd_gain:10, /* current gain for setting cwnd */
+ full_bw_reached:1, /* reached full bw in Startup? */
+ full_bw_cnt:2, /* number of rounds without large bw gains */
+ cycle_idx:3, /* current index in pacing_gain cycle array */
+ has_seen_rtt:1, /* have we seen an RTT sample yet? */
+ unused_b:5;
+ u32 prior_cwnd; /* prior cwnd upon entering loss recovery */
+ u32 full_bw; /* recent bw, to estimate if pipe is full */
+
+ /* For tracking ACK aggregation: */
+ u64 ack_epoch_mstamp; /* start of ACK sampling epoch */
+ u16 extra_acked[2]; /* max excess data ACKed in epoch */
+ u32 ack_epoch_acked:20, /* packets (S)ACKed in sampling epoch */
+ extra_acked_win_rtts:5, /* age of extra_acked, in round trips */
+ extra_acked_win_idx:1, /* current index in extra_acked array */
+ unused_c:6;
+};
+
+#define CYCLE_LEN 8 /* number of phases in a pacing gain cycle */
+
+/* Window length of bw filter (in rounds): */
+static const int bbr_bw_rtts = CYCLE_LEN + 2;
+/* Window length of min_rtt filter (in sec): */
+static const u32 bbr_min_rtt_win_sec = 10;
+/* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode: */
+static const u32 bbr_probe_rtt_mode_ms = 200;
+/* Skip TSO below the following bandwidth (bits/sec): */
+static const int bbr_min_tso_rate = 1200000;
+
+/* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck.
+ * In order to help drive the network toward lower queues and low latency while
+ * maintaining high utilization, the average pacing rate aims to be slightly
+ * lower than the estimated bandwidth. This is an important aspect of the
+ * design.
+ */
+static const int bbr_pacing_margin_percent = 1;
+
+/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain
+ * that will allow a smoothly increasing pacing rate that will double each RTT
+ * and send the same number of packets per RTT that an un-paced, slow-starting
+ * Reno or CUBIC flow would:
+ */
+static const int bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1;
+/* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain
+ * the queue created in BBR_STARTUP in a single round:
+ */
+static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885;
+/* The gain for deriving steady-state cwnd tolerates delayed/stretched ACKs: */
+static const int bbr_cwnd_gain = BBR_UNIT * 2;
+/* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw: */
+static const int bbr_pacing_gain[] = {
+ BBR_UNIT * 5 / 4, /* probe for more available bw */
+ BBR_UNIT * 3 / 4, /* drain queue and/or yield bw to other flows */
+ BBR_UNIT, BBR_UNIT, BBR_UNIT, /* cruise at 1.0*bw to utilize pipe, */
+ BBR_UNIT, BBR_UNIT, BBR_UNIT /* without creating excess queue... */
+};
+/* Randomize the starting gain cycling phase over N phases: */
+static const u32 bbr_cycle_rand = 7;
+
+/* Try to keep at least this many packets in flight, if things go smoothly. For
+ * smooth functioning, a sliding window protocol ACKing every other packet
+ * needs at least 4 packets in flight:
+ */
+static const u32 bbr_cwnd_min_target = 4;
+
+/* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */
+/* If bw has increased significantly (1.25x), there may be more bw available: */
+static const u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4;
+/* But after 3 rounds w/o significant bw growth, estimate pipe is full: */
+static const u32 bbr_full_bw_cnt = 3;
+
+/* "long-term" ("LT") bandwidth estimator parameters... */
+/* The minimum number of rounds in an LT bw sampling interval: */
+static const u32 bbr_lt_intvl_min_rtts = 4;
+/* If lost/delivered ratio > 20%, interval is "lossy" and we may be policed: */
+static const u32 bbr_lt_loss_thresh = 50;
+/* If 2 intervals have a bw ratio <= 1/8, their bw is "consistent": */
+static const u32 bbr_lt_bw_ratio = BBR_UNIT / 8;
+/* If 2 intervals have a bw diff <= 4 Kbit/sec their bw is "consistent": */
+static const u32 bbr_lt_bw_diff = 4000 / 8;
+/* If we estimate we're policed, use lt_bw for this many round trips: */
+static const u32 bbr_lt_bw_max_rtts = 48;
+
+/* Gain factor for adding extra_acked to target cwnd: */
+static const int bbr_extra_acked_gain = BBR_UNIT;
+/* Window length of extra_acked window. */
+static const u32 bbr_extra_acked_win_rtts = 5;
+/* Max allowed val for ack_epoch_acked, after which sampling epoch is reset */
+static const u32 bbr_ack_epoch_acked_reset_thresh = 1U << 20;
+/* Time period for clamping cwnd increment due to ack aggregation */
+static const u32 bbr_extra_acked_max_us = 100 * 1000;
+
+static void bbr_check_probe_rtt_done(struct sock *sk);
+
+/* Do we estimate that STARTUP filled the pipe? */
+static bool bbr_full_bw_reached(const struct sock *sk)
+{
+ const struct bbr *bbr = inet_csk_ca(sk);
+
+ return bbr->full_bw_reached;
+}
+
+/* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */
+static u32 bbr_max_bw(const struct sock *sk)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ return minmax_get(&bbr->bw);
+}
+
+/* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */
+static u32 bbr_bw(const struct sock *sk)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ return bbr->lt_use_bw ? bbr->lt_bw : bbr_max_bw(sk);
+}
+
+/* Return maximum extra acked in past k-2k round trips,
+ * where k = bbr_extra_acked_win_rtts.
+ */
+static u16 bbr_extra_acked(const struct sock *sk)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ return max(bbr->extra_acked[0], bbr->extra_acked[1]);
+}
+
+/* Return rate in bytes per second, optionally with a gain.
+ * The order here is chosen carefully to avoid overflow of u64. This should
+ * work for input rates of up to 2.9Tbit/sec and gain of 2.89x.
+ */
+static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain)
+{
+ unsigned int mss = tcp_sk(sk)->mss_cache;
+
+ rate *= mss;
+ rate *= gain;
+ rate >>= BBR_SCALE;
+ rate *= USEC_PER_SEC / 100 * (100 - bbr_pacing_margin_percent);
+ return rate >> BW_SCALE;
+}
+
+/* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */
+static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain)
+{
+ u64 rate = bw;
+
+ rate = bbr_rate_bytes_per_sec(sk, rate, gain);
+ rate = min_t(u64, rate, READ_ONCE(sk->sk_max_pacing_rate));
+ return rate;
+}
+
+/* Initialize pacing rate to: high_gain * init_cwnd / RTT. */
+static void bbr_init_pacing_rate_from_rtt(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+ u64 bw;
+ u32 rtt_us;
+
+ if (tp->srtt_us) { /* any RTT sample yet? */
+ rtt_us = max(tp->srtt_us >> 3, 1U);
+ bbr->has_seen_rtt = 1;
+ } else { /* no RTT sample yet */
+ rtt_us = USEC_PER_MSEC; /* use nominal default RTT */
+ }
+ bw = (u64)tcp_snd_cwnd(tp) * BW_UNIT;
+ do_div(bw, rtt_us);
+ WRITE_ONCE(sk->sk_pacing_rate,
+ bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain));
+}
+
+/* Pace using current bw estimate and a gain factor. */
+static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+ unsigned long rate = bbr_bw_to_pacing_rate(sk, bw, gain);
+
+ if (unlikely(!bbr->has_seen_rtt && tp->srtt_us))
+ bbr_init_pacing_rate_from_rtt(sk);
+ if (bbr_full_bw_reached(sk) || rate > READ_ONCE(sk->sk_pacing_rate))
+ WRITE_ONCE(sk->sk_pacing_rate, rate);
+}
+
+/* override sysctl_tcp_min_tso_segs */
+__bpf_kfunc static u32 bbr_min_tso_segs(struct sock *sk)
+{
+ return READ_ONCE(sk->sk_pacing_rate) < (bbr_min_tso_rate >> 3) ? 1 : 2;
+}
+
+static u32 bbr_tso_segs_goal(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 segs, bytes;
+
+ /* Sort of tcp_tso_autosize() but ignoring
+ * driver provided sk_gso_max_size.
+ */
+ bytes = min_t(unsigned long,
+ READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift),
+ GSO_LEGACY_MAX_SIZE - 1 - MAX_TCP_HEADER);
+ segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk));
+
+ return min(segs, 0x7FU);
+}
+
+/* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */
+static void bbr_save_cwnd(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ if (bbr->prev_ca_state < TCP_CA_Recovery && bbr->mode != BBR_PROBE_RTT)
+ bbr->prior_cwnd = tcp_snd_cwnd(tp); /* this cwnd is good enough */
+ else /* loss recovery or BBR_PROBE_RTT have temporarily cut cwnd */
+ bbr->prior_cwnd = max(bbr->prior_cwnd, tcp_snd_cwnd(tp));
+}
+
+__bpf_kfunc static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ if (event == CA_EVENT_TX_START && tp->app_limited) {
+ bbr->idle_restart = 1;
+ bbr->ack_epoch_mstamp = tp->tcp_mstamp;
+ bbr->ack_epoch_acked = 0;
+ /* Avoid pointless buffer overflows: pace at est. bw if we don't
+ * need more speed (we're restarting from idle and app-limited).
+ */
+ if (bbr->mode == BBR_PROBE_BW)
+ bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT);
+ else if (bbr->mode == BBR_PROBE_RTT)
+ bbr_check_probe_rtt_done(sk);
+ }
+}
+
+/* Calculate bdp based on min RTT and the estimated bottleneck bandwidth:
+ *
+ * bdp = ceil(bw * min_rtt * gain)
+ *
+ * The key factor, gain, controls the amount of queue. While a small gain
+ * builds a smaller queue, it becomes more vulnerable to noise in RTT
+ * measurements (e.g., delayed ACKs or other ACK compression effects). This
+ * noise may cause BBR to under-estimate the rate.
+ */
+static u32 bbr_bdp(struct sock *sk, u32 bw, int gain)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+ u32 bdp;
+ u64 w;
+
+ /* If we've never had a valid RTT sample, cap cwnd at the initial
+ * default. This should only happen when the connection is not using TCP
+ * timestamps and has retransmitted all of the SYN/SYNACK/data packets
+ * ACKed so far. In this case, an RTO can cut cwnd to 1, in which
+ * case we need to slow-start up toward something safe: TCP_INIT_CWND.
+ */
+ if (unlikely(bbr->min_rtt_us == ~0U)) /* no valid RTT samples yet? */
+ return TCP_INIT_CWND; /* be safe: cap at default initial cwnd*/
+
+ w = (u64)bw * bbr->min_rtt_us;
+
+ /* Apply a gain to the given value, remove the BW_SCALE shift, and
+ * round the value up to avoid a negative feedback loop.
+ */
+ bdp = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT;
+
+ return bdp;
+}
+
+/* To achieve full performance in high-speed paths, we budget enough cwnd to
+ * fit full-sized skbs in-flight on both end hosts to fully utilize the path:
+ * - one skb in sending host Qdisc,
+ * - one skb in sending host TSO/GSO engine
+ * - one skb being received by receiver host LRO/GRO/delayed-ACK engine
+ * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because
+ * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets,
+ * which allows 2 outstanding 2-packet sequences, to try to keep pipe
+ * full even with ACK-every-other-packet delayed ACKs.
+ */
+static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ /* Allow enough full-sized skbs in flight to utilize end systems. */
+ cwnd += 3 * bbr_tso_segs_goal(sk);
+
+ /* Reduce delayed ACKs by rounding up cwnd to the next even number. */
+ cwnd = (cwnd + 1) & ~1U;
+
+ /* Ensure gain cycling gets inflight above BDP even for small BDPs. */
+ if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == 0)
+ cwnd += 2;
+
+ return cwnd;
+}
+
+/* Find inflight based on min RTT and the estimated bottleneck bandwidth. */
+static u32 bbr_inflight(struct sock *sk, u32 bw, int gain)
+{
+ u32 inflight;
+
+ inflight = bbr_bdp(sk, bw, gain);
+ inflight = bbr_quantization_budget(sk, inflight);
+
+ return inflight;
+}
+
+/* With pacing at lower layers, there's often less data "in the network" than
+ * "in flight". With TSQ and departure time pacing at lower layers (e.g. fq),
+ * we often have several skbs queued in the pacing layer with a pre-scheduled
+ * earliest departure time (EDT). BBR adapts its pacing rate based on the
+ * inflight level that it estimates has already been "baked in" by previous
+ * departure time decisions. We calculate a rough estimate of the number of our
+ * packets that might be in the network at the earliest departure time for the
+ * next skb scheduled:
+ * in_network_at_edt = inflight_at_edt - (EDT - now) * bw
+ * If we're increasing inflight, then we want to know if the transmit of the
+ * EDT skb will push inflight above the target, so inflight_at_edt includes
+ * bbr_tso_segs_goal() from the skb departing at EDT. If decreasing inflight,
+ * then estimate if inflight will sink too low just before the EDT transmit.
+ */
+static u32 bbr_packets_in_net_at_edt(struct sock *sk, u32 inflight_now)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+ u64 now_ns, edt_ns, interval_us;
+ u32 interval_delivered, inflight_at_edt;
+
+ now_ns = tp->tcp_clock_cache;
+ edt_ns = max(tp->tcp_wstamp_ns, now_ns);
+ interval_us = div_u64(edt_ns - now_ns, NSEC_PER_USEC);
+ interval_delivered = (u64)bbr_bw(sk) * interval_us >> BW_SCALE;
+ inflight_at_edt = inflight_now;
+ if (bbr->pacing_gain > BBR_UNIT) /* increasing inflight */
+ inflight_at_edt += bbr_tso_segs_goal(sk); /* include EDT skb */
+ if (interval_delivered >= inflight_at_edt)
+ return 0;
+ return inflight_at_edt - interval_delivered;
+}
+
+/* Find the cwnd increment based on estimate of ack aggregation */
+static u32 bbr_ack_aggregation_cwnd(struct sock *sk)
+{
+ u32 max_aggr_cwnd, aggr_cwnd = 0;
+
+ if (bbr_extra_acked_gain && bbr_full_bw_reached(sk)) {
+ max_aggr_cwnd = ((u64)bbr_bw(sk) * bbr_extra_acked_max_us)
+ / BW_UNIT;
+ aggr_cwnd = (bbr_extra_acked_gain * bbr_extra_acked(sk))
+ >> BBR_SCALE;
+ aggr_cwnd = min(aggr_cwnd, max_aggr_cwnd);
+ }
+
+ return aggr_cwnd;
+}
+
+/* An optimization in BBR to reduce losses: On the first round of recovery, we
+ * follow the packet conservation principle: send P packets per P packets acked.
+ * After that, we slow-start and send at most 2*P packets per P packets acked.
+ * After recovery finishes, or upon undo, we restore the cwnd we had when
+ * recovery started (capped by the target cwnd based on estimated BDP).
+ *
+ * TODO(ycheng/ncardwell): implement a rate-based approach.
+ */
+static bool bbr_set_cwnd_to_recover_or_restore(
+ struct sock *sk, const struct rate_sample *rs, u32 acked, u32 *new_cwnd)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+ u8 prev_state = bbr->prev_ca_state, state = inet_csk(sk)->icsk_ca_state;
+ u32 cwnd = tcp_snd_cwnd(tp);
+
+ /* An ACK for P pkts should release at most 2*P packets. We do this
+ * in two steps. First, here we deduct the number of lost packets.
+ * Then, in bbr_set_cwnd() we slow start up toward the target cwnd.
+ */
+ if (rs->losses > 0)
+ cwnd = max_t(s32, cwnd - rs->losses, 1);
+
+ if (state == TCP_CA_Recovery && prev_state != TCP_CA_Recovery) {
+ /* Starting 1st round of Recovery, so do packet conservation. */
+ bbr->packet_conservation = 1;
+ bbr->next_rtt_delivered = tp->delivered; /* start round now */
+ /* Cut unused cwnd from app behavior, TSQ, or TSO deferral: */
+ cwnd = tcp_packets_in_flight(tp) + acked;
+ } else if (prev_state >= TCP_CA_Recovery && state < TCP_CA_Recovery) {
+ /* Exiting loss recovery; restore cwnd saved before recovery. */
+ cwnd = max(cwnd, bbr->prior_cwnd);
+ bbr->packet_conservation = 0;
+ }
+ bbr->prev_ca_state = state;
+
+ if (bbr->packet_conservation) {
+ *new_cwnd = max(cwnd, tcp_packets_in_flight(tp) + acked);
+ return true; /* yes, using packet conservation */
+ }
+ *new_cwnd = cwnd;
+ return false;
+}
+
+/* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss
+ * has drawn us down below target), or snap down to target if we're above it.
+ */
+static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs,
+ u32 acked, u32 bw, int gain)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+ u32 cwnd = tcp_snd_cwnd(tp), target_cwnd = 0;
+
+ if (!acked)
+ goto done; /* no packet fully ACKed; just apply caps */
+
+ if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd))
+ goto done;
+
+ target_cwnd = bbr_bdp(sk, bw, gain);
+
+ /* Increment the cwnd to account for excess ACKed data that seems
+ * due to aggregation (of data and/or ACKs) visible in the ACK stream.
+ */
+ target_cwnd += bbr_ack_aggregation_cwnd(sk);
+ target_cwnd = bbr_quantization_budget(sk, target_cwnd);
+
+ /* If we're below target cwnd, slow start cwnd toward target cwnd. */
+ if (bbr_full_bw_reached(sk)) /* only cut cwnd if we filled the pipe */
+ cwnd = min(cwnd + acked, target_cwnd);
+ else if (cwnd < target_cwnd || tp->delivered < TCP_INIT_CWND)
+ cwnd = cwnd + acked;
+ cwnd = max(cwnd, bbr_cwnd_min_target);
+
+done:
+ tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); /* apply global cap */
+ if (bbr->mode == BBR_PROBE_RTT) /* drain queue, refresh min_rtt */
+ tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), bbr_cwnd_min_target));
+}
+
+/* End cycle phase if it's time and/or we hit the phase's in-flight target. */
+static bool bbr_is_next_cycle_phase(struct sock *sk,
+ const struct rate_sample *rs)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+ bool is_full_length =
+ tcp_stamp_us_delta(tp->delivered_mstamp, bbr->cycle_mstamp) >
+ bbr->min_rtt_us;
+ u32 inflight, bw;
+
+ /* The pacing_gain of 1.0 paces at the estimated bw to try to fully
+ * use the pipe without increasing the queue.
+ */
+ if (bbr->pacing_gain == BBR_UNIT)
+ return is_full_length; /* just use wall clock time */
+
+ inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight);
+ bw = bbr_max_bw(sk);
+
+ /* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at
+ * least pacing_gain*BDP; this may take more than min_rtt if min_rtt is
+ * small (e.g. on a LAN). We do not persist if packets are lost, since
+ * a path with small buffers may not hold that much.
+ */
+ if (bbr->pacing_gain > BBR_UNIT)
+ return is_full_length &&
+ (rs->losses || /* perhaps pacing_gain*BDP won't fit */
+ inflight >= bbr_inflight(sk, bw, bbr->pacing_gain));
+
+ /* A pacing_gain < 1.0 tries to drain extra queue we added if bw
+ * probing didn't find more bw. If inflight falls to match BDP then we
+ * estimate queue is drained; persisting would underutilize the pipe.
+ */
+ return is_full_length ||
+ inflight <= bbr_inflight(sk, bw, BBR_UNIT);
+}
+
+static void bbr_advance_cycle_phase(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1);
+ bbr->cycle_mstamp = tp->delivered_mstamp;
+}
+
+/* Gain cycling: cycle pacing gain to converge to fair share of available bw. */
+static void bbr_update_cycle_phase(struct sock *sk,
+ const struct rate_sample *rs)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ if (bbr->mode == BBR_PROBE_BW && bbr_is_next_cycle_phase(sk, rs))
+ bbr_advance_cycle_phase(sk);
+}
+
+static void bbr_reset_startup_mode(struct sock *sk)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ bbr->mode = BBR_STARTUP;
+}
+
+static void bbr_reset_probe_bw_mode(struct sock *sk)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ bbr->mode = BBR_PROBE_BW;
+ bbr->cycle_idx = CYCLE_LEN - 1 - get_random_u32_below(bbr_cycle_rand);
+ bbr_advance_cycle_phase(sk); /* flip to next phase of gain cycle */
+}
+
+static void bbr_reset_mode(struct sock *sk)
+{
+ if (!bbr_full_bw_reached(sk))
+ bbr_reset_startup_mode(sk);
+ else
+ bbr_reset_probe_bw_mode(sk);
+}
+
+/* Start a new long-term sampling interval. */
+static void bbr_reset_lt_bw_sampling_interval(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ bbr->lt_last_stamp = div_u64(tp->delivered_mstamp, USEC_PER_MSEC);
+ bbr->lt_last_delivered = tp->delivered;
+ bbr->lt_last_lost = tp->lost;
+ bbr->lt_rtt_cnt = 0;
+}
+
+/* Completely reset long-term bandwidth sampling. */
+static void bbr_reset_lt_bw_sampling(struct sock *sk)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ bbr->lt_bw = 0;
+ bbr->lt_use_bw = 0;
+ bbr->lt_is_sampling = false;
+ bbr_reset_lt_bw_sampling_interval(sk);
+}
+
+/* Long-term bw sampling interval is done. Estimate whether we're policed. */
+static void bbr_lt_bw_interval_done(struct sock *sk, u32 bw)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+ u32 diff;
+
+ if (bbr->lt_bw) { /* do we have bw from a previous interval? */
+ /* Is new bw close to the lt_bw from the previous interval? */
+ diff = abs(bw - bbr->lt_bw);
+ if ((diff * BBR_UNIT <= bbr_lt_bw_ratio * bbr->lt_bw) ||
+ (bbr_rate_bytes_per_sec(sk, diff, BBR_UNIT) <=
+ bbr_lt_bw_diff)) {
+ /* All criteria are met; estimate we're policed. */
+ bbr->lt_bw = (bw + bbr->lt_bw) >> 1; /* avg 2 intvls */
+ bbr->lt_use_bw = 1;
+ bbr->pacing_gain = BBR_UNIT; /* try to avoid drops */
+ bbr->lt_rtt_cnt = 0;
+ return;
+ }
+ }
+ bbr->lt_bw = bw;
+ bbr_reset_lt_bw_sampling_interval(sk);
+}
+
+/* Token-bucket traffic policers are common (see "An Internet-Wide Analysis of
+ * Traffic Policing", SIGCOMM 2016). BBR detects token-bucket policers and
+ * explicitly models their policed rate, to reduce unnecessary losses. We
+ * estimate that we're policed if we see 2 consecutive sampling intervals with
+ * consistent throughput and high packet loss. If we think we're being policed,
+ * set lt_bw to the "long-term" average delivery rate from those 2 intervals.
+ */
+static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+ u32 lost, delivered;
+ u64 bw;
+ u32 t;
+
+ if (bbr->lt_use_bw) { /* already using long-term rate, lt_bw? */
+ if (bbr->mode == BBR_PROBE_BW && bbr->round_start &&
+ ++bbr->lt_rtt_cnt >= bbr_lt_bw_max_rtts) {
+ bbr_reset_lt_bw_sampling(sk); /* stop using lt_bw */
+ bbr_reset_probe_bw_mode(sk); /* restart gain cycling */
+ }
+ return;
+ }
+
+ /* Wait for the first loss before sampling, to let the policer exhaust
+ * its tokens and estimate the steady-state rate allowed by the policer.
+ * Starting samples earlier includes bursts that over-estimate the bw.
+ */
+ if (!bbr->lt_is_sampling) {
+ if (!rs->losses)
+ return;
+ bbr_reset_lt_bw_sampling_interval(sk);
+ bbr->lt_is_sampling = true;
+ }
+
+ /* To avoid underestimates, reset sampling if we run out of data. */
+ if (rs->is_app_limited) {
+ bbr_reset_lt_bw_sampling(sk);
+ return;
+ }
+
+ if (bbr->round_start)
+ bbr->lt_rtt_cnt++; /* count round trips in this interval */
+ if (bbr->lt_rtt_cnt < bbr_lt_intvl_min_rtts)
+ return; /* sampling interval needs to be longer */
+ if (bbr->lt_rtt_cnt > 4 * bbr_lt_intvl_min_rtts) {
+ bbr_reset_lt_bw_sampling(sk); /* interval is too long */
+ return;
+ }
+
+ /* End sampling interval when a packet is lost, so we estimate the
+ * policer tokens were exhausted. Stopping the sampling before the
+ * tokens are exhausted under-estimates the policed rate.
+ */
+ if (!rs->losses)
+ return;
+
+ /* Calculate packets lost and delivered in sampling interval. */
+ lost = tp->lost - bbr->lt_last_lost;
+ delivered = tp->delivered - bbr->lt_last_delivered;
+ /* Is loss rate (lost/delivered) >= lt_loss_thresh? If not, wait. */
+ if (!delivered || (lost << BBR_SCALE) < bbr_lt_loss_thresh * delivered)
+ return;
+
+ /* Find average delivery rate in this sampling interval. */
+ t = div_u64(tp->delivered_mstamp, USEC_PER_MSEC) - bbr->lt_last_stamp;
+ if ((s32)t < 1)
+ return; /* interval is less than one ms, so wait */
+ /* Check if can multiply without overflow */
+ if (t >= ~0U / USEC_PER_MSEC) {
+ bbr_reset_lt_bw_sampling(sk); /* interval too long; reset */
+ return;
+ }
+ t *= USEC_PER_MSEC;
+ bw = (u64)delivered * BW_UNIT;
+ do_div(bw, t);
+ bbr_lt_bw_interval_done(sk, bw);
+}
+
+/* Estimate the bandwidth based on how fast packets are delivered */
+static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+ u64 bw;
+
+ bbr->round_start = 0;
+ if (rs->delivered < 0 || rs->interval_us <= 0)
+ return; /* Not a valid observation */
+
+ /* See if we've reached the next RTT */
+ if (!before(rs->prior_delivered, bbr->next_rtt_delivered)) {
+ bbr->next_rtt_delivered = tp->delivered;
+ bbr->rtt_cnt++;
+ bbr->round_start = 1;
+ bbr->packet_conservation = 0;
+ }
+
+ bbr_lt_bw_sampling(sk, rs);
+
+ /* Divide delivered by the interval to find a (lower bound) bottleneck
+ * bandwidth sample. Delivered is in packets and interval_us in uS and
+ * ratio will be <<1 for most connections. So delivered is first scaled.
+ */
+ bw = div64_long((u64)rs->delivered * BW_UNIT, rs->interval_us);
+
+ /* If this sample is application-limited, it is likely to have a very
+ * low delivered count that represents application behavior rather than
+ * the available network rate. Such a sample could drag down estimated
+ * bw, causing needless slow-down. Thus, to continue to send at the
+ * last measured network rate, we filter out app-limited samples unless
+ * they describe the path bw at least as well as our bw model.
+ *
+ * So the goal during app-limited phase is to proceed with the best
+ * network rate no matter how long. We automatically leave this
+ * phase when app writes faster than the network can deliver :)
+ */
+ if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) {
+ /* Incorporate new sample into our max bw filter. */
+ minmax_running_max(&bbr->bw, bbr_bw_rtts, bbr->rtt_cnt, bw);
+ }
+}
+
+/* Estimates the windowed max degree of ack aggregation.
+ * This is used to provision extra in-flight data to keep sending during
+ * inter-ACK silences.
+ *
+ * Degree of ack aggregation is estimated as extra data acked beyond expected.
+ *
+ * max_extra_acked = "maximum recent excess data ACKed beyond max_bw * interval"
+ * cwnd += max_extra_acked
+ *
+ * Max extra_acked is clamped by cwnd and bw * bbr_extra_acked_max_us (100 ms).
+ * Max filter is an approximate sliding window of 5-10 (packet timed) round
+ * trips.
+ */
+static void bbr_update_ack_aggregation(struct sock *sk,
+ const struct rate_sample *rs)
+{
+ u32 epoch_us, expected_acked, extra_acked;
+ struct bbr *bbr = inet_csk_ca(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (!bbr_extra_acked_gain || rs->acked_sacked <= 0 ||
+ rs->delivered < 0 || rs->interval_us <= 0)
+ return;
+
+ if (bbr->round_start) {
+ bbr->extra_acked_win_rtts = min(0x1F,
+ bbr->extra_acked_win_rtts + 1);
+ if (bbr->extra_acked_win_rtts >= bbr_extra_acked_win_rtts) {
+ bbr->extra_acked_win_rtts = 0;
+ bbr->extra_acked_win_idx = bbr->extra_acked_win_idx ?
+ 0 : 1;
+ bbr->extra_acked[bbr->extra_acked_win_idx] = 0;
+ }
+ }
+
+ /* Compute how many packets we expected to be delivered over epoch. */
+ epoch_us = tcp_stamp_us_delta(tp->delivered_mstamp,
+ bbr->ack_epoch_mstamp);
+ expected_acked = ((u64)bbr_bw(sk) * epoch_us) / BW_UNIT;
+
+ /* Reset the aggregation epoch if ACK rate is below expected rate or
+ * significantly large no. of ack received since epoch (potentially
+ * quite old epoch).
+ */
+ if (bbr->ack_epoch_acked <= expected_acked ||
+ (bbr->ack_epoch_acked + rs->acked_sacked >=
+ bbr_ack_epoch_acked_reset_thresh)) {
+ bbr->ack_epoch_acked = 0;
+ bbr->ack_epoch_mstamp = tp->delivered_mstamp;
+ expected_acked = 0;
+ }
+
+ /* Compute excess data delivered, beyond what was expected. */
+ bbr->ack_epoch_acked = min_t(u32, 0xFFFFF,
+ bbr->ack_epoch_acked + rs->acked_sacked);
+ extra_acked = bbr->ack_epoch_acked - expected_acked;
+ extra_acked = min(extra_acked, tcp_snd_cwnd(tp));
+ if (extra_acked > bbr->extra_acked[bbr->extra_acked_win_idx])
+ bbr->extra_acked[bbr->extra_acked_win_idx] = extra_acked;
+}
+
+/* Estimate when the pipe is full, using the change in delivery rate: BBR
+ * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by
+ * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited
+ * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the
+ * higher rwin, 3: we get higher delivery rate samples. Or transient
+ * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar
+ * design goal, but uses delay and inter-ACK spacing instead of bandwidth.
+ */
+static void bbr_check_full_bw_reached(struct sock *sk,
+ const struct rate_sample *rs)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+ u32 bw_thresh;
+
+ if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited)
+ return;
+
+ bw_thresh = (u64)bbr->full_bw * bbr_full_bw_thresh >> BBR_SCALE;
+ if (bbr_max_bw(sk) >= bw_thresh) {
+ bbr->full_bw = bbr_max_bw(sk);
+ bbr->full_bw_cnt = 0;
+ return;
+ }
+ ++bbr->full_bw_cnt;
+ bbr->full_bw_reached = bbr->full_bw_cnt >= bbr_full_bw_cnt;
+}
+
+/* If pipe is probably full, drain the queue and then enter steady-state. */
+static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) {
+ bbr->mode = BBR_DRAIN; /* drain queue we created */
+ tcp_sk(sk)->snd_ssthresh =
+ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT);
+ } /* fall through to check if in-flight is already small: */
+ if (bbr->mode == BBR_DRAIN &&
+ bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <=
+ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT))
+ bbr_reset_probe_bw_mode(sk); /* we estimate queue is drained */
+}
+
+static void bbr_check_probe_rtt_done(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ if (!(bbr->probe_rtt_done_stamp &&
+ after(tcp_jiffies32, bbr->probe_rtt_done_stamp)))
+ return;
+
+ bbr->min_rtt_stamp = tcp_jiffies32; /* wait a while until PROBE_RTT */
+ tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp), bbr->prior_cwnd));
+ bbr_reset_mode(sk);
+}
+
+/* The goal of PROBE_RTT mode is to have BBR flows cooperatively and
+ * periodically drain the bottleneck queue, to converge to measure the true
+ * min_rtt (unloaded propagation delay). This allows the flows to keep queues
+ * small (reducing queuing delay and packet loss) and achieve fairness among
+ * BBR flows.
+ *
+ * The min_rtt filter window is 10 seconds. When the min_rtt estimate expires,
+ * we enter PROBE_RTT mode and cap the cwnd at bbr_cwnd_min_target=4 packets.
+ * After at least bbr_probe_rtt_mode_ms=200ms and at least one packet-timed
+ * round trip elapsed with that flight size <= 4, we leave PROBE_RTT mode and
+ * re-enter the previous mode. BBR uses 200ms to approximately bound the
+ * performance penalty of PROBE_RTT's cwnd capping to roughly 2% (200ms/10s).
+ *
+ * Note that flows need only pay 2% if they are busy sending over the last 10
+ * seconds. Interactive applications (e.g., Web, RPCs, video chunks) often have
+ * natural silences or low-rate periods within 10 seconds where the rate is low
+ * enough for long enough to drain its queue in the bottleneck. We pick up
+ * these min RTT measurements opportunistically with our min_rtt filter. :-)
+ */
+static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+ bool filter_expired;
+
+ /* Track min RTT seen in the min_rtt_win_sec filter window: */
+ filter_expired = after(tcp_jiffies32,
+ bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ);
+ if (rs->rtt_us >= 0 &&
+ (rs->rtt_us < bbr->min_rtt_us ||
+ (filter_expired && !rs->is_ack_delayed))) {
+ bbr->min_rtt_us = rs->rtt_us;
+ bbr->min_rtt_stamp = tcp_jiffies32;
+ }
+
+ if (bbr_probe_rtt_mode_ms > 0 && filter_expired &&
+ !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) {
+ bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */
+ bbr_save_cwnd(sk); /* note cwnd so we can restore it */
+ bbr->probe_rtt_done_stamp = 0;
+ }
+
+ if (bbr->mode == BBR_PROBE_RTT) {
+ /* Ignore low rate samples during this mode. */
+ tp->app_limited =
+ (tp->delivered + tcp_packets_in_flight(tp)) ? : 1;
+ /* Maintain min packets in flight for max(200 ms, 1 round). */
+ if (!bbr->probe_rtt_done_stamp &&
+ tcp_packets_in_flight(tp) <= bbr_cwnd_min_target) {
+ bbr->probe_rtt_done_stamp = tcp_jiffies32 +
+ msecs_to_jiffies(bbr_probe_rtt_mode_ms);
+ bbr->probe_rtt_round_done = 0;
+ bbr->next_rtt_delivered = tp->delivered;
+ } else if (bbr->probe_rtt_done_stamp) {
+ if (bbr->round_start)
+ bbr->probe_rtt_round_done = 1;
+ if (bbr->probe_rtt_round_done)
+ bbr_check_probe_rtt_done(sk);
+ }
+ }
+ /* Restart after idle ends only once we process a new S/ACK for data */
+ if (rs->delivered > 0)
+ bbr->idle_restart = 0;
+}
+
+static void bbr_update_gains(struct sock *sk)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ switch (bbr->mode) {
+ case BBR_STARTUP:
+ bbr->pacing_gain = bbr_high_gain;
+ bbr->cwnd_gain = bbr_high_gain;
+ break;
+ case BBR_DRAIN:
+ bbr->pacing_gain = bbr_drain_gain; /* slow, to drain */
+ bbr->cwnd_gain = bbr_high_gain; /* keep cwnd */
+ break;
+ case BBR_PROBE_BW:
+ bbr->pacing_gain = (bbr->lt_use_bw ?
+ BBR_UNIT :
+ bbr_pacing_gain[bbr->cycle_idx]);
+ bbr->cwnd_gain = bbr_cwnd_gain;
+ break;
+ case BBR_PROBE_RTT:
+ bbr->pacing_gain = BBR_UNIT;
+ bbr->cwnd_gain = BBR_UNIT;
+ break;
+ default:
+ WARN_ONCE(1, "BBR bad mode: %u\n", bbr->mode);
+ break;
+ }
+}
+
+static void bbr_update_model(struct sock *sk, const struct rate_sample *rs)
+{
+ bbr_update_bw(sk, rs);
+ bbr_update_ack_aggregation(sk, rs);
+ bbr_update_cycle_phase(sk, rs);
+ bbr_check_full_bw_reached(sk, rs);
+ bbr_check_drain(sk, rs);
+ bbr_update_min_rtt(sk, rs);
+ bbr_update_gains(sk);
+}
+
+__bpf_kfunc static void bbr_main(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+ u32 bw;
+
+ bbr_update_model(sk, rs);
+
+ bw = bbr_bw(sk);
+ bbr_set_pacing_rate(sk, bw, bbr->pacing_gain);
+ bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain);
+}
+
+__bpf_kfunc static void bbr_init(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ bbr->prior_cwnd = 0;
+ tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
+ bbr->rtt_cnt = 0;
+ bbr->next_rtt_delivered = tp->delivered;
+ bbr->prev_ca_state = TCP_CA_Open;
+ bbr->packet_conservation = 0;
+
+ bbr->probe_rtt_done_stamp = 0;
+ bbr->probe_rtt_round_done = 0;
+ bbr->min_rtt_us = tcp_min_rtt(tp);
+ bbr->min_rtt_stamp = tcp_jiffies32;
+
+ minmax_reset(&bbr->bw, bbr->rtt_cnt, 0); /* init max bw to 0 */
+
+ bbr->has_seen_rtt = 0;
+ bbr_init_pacing_rate_from_rtt(sk);
+
+ bbr->round_start = 0;
+ bbr->idle_restart = 0;
+ bbr->full_bw_reached = 0;
+ bbr->full_bw = 0;
+ bbr->full_bw_cnt = 0;
+ bbr->cycle_mstamp = 0;
+ bbr->cycle_idx = 0;
+ bbr_reset_lt_bw_sampling(sk);
+ bbr_reset_startup_mode(sk);
+
+ bbr->ack_epoch_mstamp = tp->tcp_mstamp;
+ bbr->ack_epoch_acked = 0;
+ bbr->extra_acked_win_rtts = 0;
+ bbr->extra_acked_win_idx = 0;
+ bbr->extra_acked[0] = 0;
+ bbr->extra_acked[1] = 0;
+
+ cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED);
+}
+
+__bpf_kfunc static u32 bbr_sndbuf_expand(struct sock *sk)
+{
+ /* Provision 3 * cwnd since BBR may slow-start even during recovery. */
+ return 3;
+}
+
+/* In theory BBR does not need to undo the cwnd since it does not
+ * always reduce cwnd on losses (see bbr_main()). Keep it for now.
+ */
+__bpf_kfunc static u32 bbr_undo_cwnd(struct sock *sk)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ bbr->full_bw = 0; /* spurious slow-down; reset full pipe detection */
+ bbr->full_bw_cnt = 0;
+ bbr_reset_lt_bw_sampling(sk);
+ return tcp_snd_cwnd(tcp_sk(sk));
+}
+
+/* Entering loss recovery, so save cwnd for when we exit or undo recovery. */
+__bpf_kfunc static u32 bbr_ssthresh(struct sock *sk)
+{
+ bbr_save_cwnd(sk);
+ return tcp_sk(sk)->snd_ssthresh;
+}
+
+static size_t bbr_get_info(struct sock *sk, u32 ext, int *attr,
+ union tcp_cc_info *info)
+{
+ if (ext & (1 << (INET_DIAG_BBRINFO - 1)) ||
+ ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+ u64 bw = bbr_bw(sk);
+
+ bw = bw * tp->mss_cache * USEC_PER_SEC >> BW_SCALE;
+ memset(&info->bbr, 0, sizeof(info->bbr));
+ info->bbr.bbr_bw_lo = (u32)bw;
+ info->bbr.bbr_bw_hi = (u32)(bw >> 32);
+ info->bbr.bbr_min_rtt = bbr->min_rtt_us;
+ info->bbr.bbr_pacing_gain = bbr->pacing_gain;
+ info->bbr.bbr_cwnd_gain = bbr->cwnd_gain;
+ *attr = INET_DIAG_BBRINFO;
+ return sizeof(info->bbr);
+ }
+ return 0;
+}
+
+__bpf_kfunc static void bbr_set_state(struct sock *sk, u8 new_state)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ if (new_state == TCP_CA_Loss) {
+ struct rate_sample rs = { .losses = 1 };
+
+ bbr->prev_ca_state = TCP_CA_Loss;
+ bbr->full_bw = 0;
+ bbr->round_start = 1; /* treat RTO like end of a round */
+ bbr_lt_bw_sampling(sk, &rs);
+ }
+}
+
+static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = {
+ .flags = TCP_CONG_NON_RESTRICTED,
+ .name = "bbr",
+ .owner = THIS_MODULE,
+ .init = bbr_init,
+ .cong_control = bbr_main,
+ .sndbuf_expand = bbr_sndbuf_expand,
+ .undo_cwnd = bbr_undo_cwnd,
+ .cwnd_event = bbr_cwnd_event,
+ .ssthresh = bbr_ssthresh,
+ .min_tso_segs = bbr_min_tso_segs,
+ .get_info = bbr_get_info,
+ .set_state = bbr_set_state,
+};
+
+BTF_KFUNCS_START(tcp_bbr_check_kfunc_ids)
+BTF_ID_FLAGS(func, bbr_init)
+BTF_ID_FLAGS(func, bbr_main)
+BTF_ID_FLAGS(func, bbr_sndbuf_expand)
+BTF_ID_FLAGS(func, bbr_undo_cwnd)
+BTF_ID_FLAGS(func, bbr_cwnd_event)
+BTF_ID_FLAGS(func, bbr_ssthresh)
+BTF_ID_FLAGS(func, bbr_min_tso_segs)
+BTF_ID_FLAGS(func, bbr_set_state)
+BTF_KFUNCS_END(tcp_bbr_check_kfunc_ids)
+
+static const struct btf_kfunc_id_set tcp_bbr_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &tcp_bbr_check_kfunc_ids,
+};
+
+static int __init bbr_register(void)
+{
+ int ret;
+
+ BUILD_BUG_ON(sizeof(struct bbr) > ICSK_CA_PRIV_SIZE);
+
+ ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &tcp_bbr_kfunc_set);
+ if (ret < 0)
+ return ret;
+ return tcp_register_congestion_control(&tcp_bbr_cong_ops);
+}
+
+static void __exit bbr_unregister(void)
+{
+ tcp_unregister_congestion_control(&tcp_bbr_cong_ops);
+}
+
+module_init(bbr_register);
+module_exit(bbr_unregister);
+
+MODULE_AUTHOR("Van Jacobson <vanj@google.com>");
+MODULE_AUTHOR("Neal Cardwell <ncardwell@google.com>");
+MODULE_AUTHOR("Yuchung Cheng <ycheng@google.com>");
+MODULE_AUTHOR("Soheil Hassas Yeganeh <soheil@google.com>");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)");
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index 5730333cd0ac..58358bf92e1b 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -1,12 +1,14 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Binary Increase Congestion control for TCP
- *
+ * Home page:
+ * http://netsrv.csc.ncsu.edu/twiki/bin/view/Main/BIC
* This is from the implementation of BICTCP in
* Lison-Xu, Kahaled Harfoush, and Injong Rhee.
* "Binary Increase Congestion Control for Fast, Long Distance
* Networks" in InfoComm 2004
* Available from:
- * http://www.csc.ncsu.edu/faculty/rhee/export/bitcp.pdf
+ * http://netsrv.csc.ncsu.edu/export/bitcp.pdf
*
* Unless BIC is enabled and congestion window is large
* this behaves the same as the original Reno.
@@ -16,7 +18,6 @@
#include <linux/module.h>
#include <net/tcp.h>
-
#define BICTCP_BETA_SCALE 1024 /* Scale factor beta calculation
* max_cwnd = snd_cwnd * beta
*/
@@ -29,7 +30,7 @@ static int fast_convergence = 1;
static int max_increment = 16;
static int low_window = 14;
static int beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */
-static int initial_ssthresh = 100;
+static int initial_ssthresh;
static int smooth_part = 20;
module_param(fast_convergence, int, 0644);
@@ -45,12 +46,10 @@ MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold");
module_param(smooth_part, int, 0644);
MODULE_PARM_DESC(smooth_part, "log(B/(B*Smin))/log(B/(B-1))+B, # of RTT from Wmax-B to Wmax");
-
/* BIC TCP Parameters */
struct bictcp {
u32 cnt; /* increase cwnd by 1 after ACKs */
- u32 last_max_cwnd; /* last maximum snd_cwnd */
- u32 loss_cwnd; /* congestion window at last loss */
+ u32 last_max_cwnd; /* last maximum snd_cwnd */
u32 last_cwnd; /* the last snd_cwnd */
u32 last_time; /* time when updated last_cwnd */
u32 epoch_start; /* beginning of an epoch */
@@ -62,7 +61,6 @@ static inline void bictcp_reset(struct bictcp *ca)
{
ca->cnt = 0;
ca->last_max_cwnd = 0;
- ca->loss_cwnd = 0;
ca->last_cwnd = 0;
ca->last_time = 0;
ca->epoch_start = 0;
@@ -71,7 +69,10 @@ static inline void bictcp_reset(struct bictcp *ca)
static void bictcp_init(struct sock *sk)
{
- bictcp_reset(inet_csk_ca(sk));
+ struct bictcp *ca = inet_csk_ca(sk);
+
+ bictcp_reset(ca);
+
if (initial_ssthresh)
tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
}
@@ -82,14 +83,14 @@ static void bictcp_init(struct sock *sk)
static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
{
if (ca->last_cwnd == cwnd &&
- (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32)
+ (s32)(tcp_jiffies32 - ca->last_time) <= HZ / 32)
return;
ca->last_cwnd = cwnd;
- ca->last_time = tcp_time_stamp;
+ ca->last_time = tcp_jiffies32;
if (ca->epoch_start == 0) /* record the beginning of an epoch */
- ca->epoch_start = tcp_time_stamp;
+ ca->epoch_start = tcp_jiffies32;
/* start off normal */
if (cwnd <= low_window) {
@@ -99,7 +100,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
/* binary increase */
if (cwnd < ca->last_max_cwnd) {
- __u32 dist = (ca->last_max_cwnd - cwnd)
+ __u32 dist = (ca->last_max_cwnd - cwnd)
/ BICTCP_B;
if (dist > max_increment)
@@ -126,7 +127,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
}
/* if in slow start or link utilization is very low */
- if (ca->loss_cwnd == 0) {
+ if (ca->last_max_cwnd == 0) {
if (ca->cnt > 20) /* increase cwnd 5% per RTT */
ca->cnt = 20;
}
@@ -136,31 +137,21 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
ca->cnt = 1;
}
-static void bictcp_cong_avoid(struct sock *sk, u32 ack,
- u32 seq_rtt, u32 in_flight, int data_acked)
+static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct tcp_sock *tp = tcp_sk(sk);
struct bictcp *ca = inet_csk_ca(sk);
- if (!tcp_is_cwnd_limited(sk, in_flight))
+ if (!tcp_is_cwnd_limited(sk))
return;
- if (tp->snd_cwnd <= tp->snd_ssthresh)
- tcp_slow_start(tp);
- else {
- bictcp_update(ca, tp->snd_cwnd);
-
- /* In dangerous area, increase slowly.
- * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
- */
- if (tp->snd_cwnd_cnt >= ca->cnt) {
- if (tp->snd_cwnd < tp->snd_cwnd_clamp)
- tp->snd_cwnd++;
- tp->snd_cwnd_cnt = 0;
- } else
- tp->snd_cwnd_cnt++;
+ if (tcp_in_slow_start(tp)) {
+ acked = tcp_slow_start(tp, acked);
+ if (!acked)
+ return;
}
-
+ bictcp_update(ca, tcp_snd_cwnd(tp));
+ tcp_cong_avoid_ai(tp, ca->cnt, acked);
}
/*
@@ -175,26 +166,16 @@ static u32 bictcp_recalc_ssthresh(struct sock *sk)
ca->epoch_start = 0; /* end of epoch */
/* Wmax and fast convergence */
- if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence)
- ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta))
+ if (tcp_snd_cwnd(tp) < ca->last_max_cwnd && fast_convergence)
+ ca->last_max_cwnd = (tcp_snd_cwnd(tp) * (BICTCP_BETA_SCALE + beta))
/ (2 * BICTCP_BETA_SCALE);
else
- ca->last_max_cwnd = tp->snd_cwnd;
-
- ca->loss_cwnd = tp->snd_cwnd;
+ ca->last_max_cwnd = tcp_snd_cwnd(tp);
-
- if (tp->snd_cwnd <= low_window)
- return max(tp->snd_cwnd >> 1U, 2U);
+ if (tcp_snd_cwnd(tp) <= low_window)
+ return max(tcp_snd_cwnd(tp) >> 1U, 2U);
else
- return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U);
-}
-
-static u32 bictcp_undo_cwnd(struct sock *sk)
-{
- const struct tcp_sock *tp = tcp_sk(sk);
- const struct bictcp *ca = inet_csk_ca(sk);
- return max(tp->snd_cwnd, ca->last_max_cwnd);
+ return max((tcp_snd_cwnd(tp) * beta) / BICTCP_BETA_SCALE, 2U);
}
static void bictcp_state(struct sock *sk, u8 new_state)
@@ -206,24 +187,24 @@ static void bictcp_state(struct sock *sk, u8 new_state)
/* Track delayed acknowledgment ratio using sliding window
* ratio = (15*ratio + sample) / 16
*/
-static void bictcp_acked(struct sock *sk, u32 cnt)
+static void bictcp_acked(struct sock *sk, const struct ack_sample *sample)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
- if (cnt > 0 && icsk->icsk_ca_state == TCP_CA_Open) {
+ if (icsk->icsk_ca_state == TCP_CA_Open) {
struct bictcp *ca = inet_csk_ca(sk);
- cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT;
- ca->delayed_ack += cnt;
+
+ ca->delayed_ack += sample->pkts_acked -
+ (ca->delayed_ack >> ACK_RATIO_SHIFT);
}
}
-
-static struct tcp_congestion_ops bictcp = {
+static struct tcp_congestion_ops bictcp __read_mostly = {
.init = bictcp_init,
.ssthresh = bictcp_recalc_ssthresh,
.cong_avoid = bictcp_cong_avoid,
.set_state = bictcp_state,
- .undo_cwnd = bictcp_undo_cwnd,
+ .undo_cwnd = tcp_reno_undo_cwnd,
.pkts_acked = bictcp_acked,
.owner = THIS_MODULE,
.name = "bic",
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
new file mode 100644
index 000000000000..a268e1595b22
--- /dev/null
+++ b/net/ipv4/tcp_bpf.c
@@ -0,0 +1,739 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */
+
+#include <linux/skmsg.h>
+#include <linux/filter.h>
+#include <linux/bpf.h>
+#include <linux/init.h>
+#include <linux/wait.h>
+#include <linux/util_macros.h>
+
+#include <net/inet_common.h>
+#include <net/tls.h>
+
+void tcp_eat_skb(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tcp;
+ int copied;
+
+ if (!skb || !skb->len || !sk_is_tcp(sk))
+ return;
+
+ if (skb_bpf_strparser(skb))
+ return;
+
+ tcp = tcp_sk(sk);
+ copied = tcp->copied_seq + skb->len;
+ WRITE_ONCE(tcp->copied_seq, copied);
+ tcp_rcv_space_adjust(sk);
+ __tcp_cleanup_rbuf(sk, skb->len);
+}
+
+static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock,
+ struct sk_msg *msg, u32 apply_bytes)
+{
+ bool apply = apply_bytes;
+ struct scatterlist *sge;
+ u32 size, copied = 0;
+ struct sk_msg *tmp;
+ int i, ret = 0;
+
+ tmp = kzalloc(sizeof(*tmp), __GFP_NOWARN | GFP_KERNEL);
+ if (unlikely(!tmp))
+ return -ENOMEM;
+
+ lock_sock(sk);
+ tmp->sg.start = msg->sg.start;
+ i = msg->sg.start;
+ do {
+ sge = sk_msg_elem(msg, i);
+ size = (apply && apply_bytes < sge->length) ?
+ apply_bytes : sge->length;
+ if (!__sk_rmem_schedule(sk, size, false)) {
+ if (!copied)
+ ret = -ENOMEM;
+ break;
+ }
+
+ sk_mem_charge(sk, size);
+ atomic_add(size, &sk->sk_rmem_alloc);
+ sk_msg_xfer(tmp, msg, i, size);
+ copied += size;
+ if (sge->length)
+ get_page(sk_msg_page(tmp, i));
+ sk_msg_iter_var_next(i);
+ tmp->sg.end = i;
+ if (apply) {
+ apply_bytes -= size;
+ if (!apply_bytes) {
+ if (sge->length)
+ sk_msg_iter_var_prev(i);
+ break;
+ }
+ }
+ } while (i != msg->sg.end);
+
+ if (!ret) {
+ msg->sg.start = i;
+ if (!sk_psock_queue_msg(psock, tmp))
+ atomic_sub(copied, &sk->sk_rmem_alloc);
+ sk_psock_data_ready(sk, psock);
+ } else {
+ sk_msg_free(sk, tmp);
+ kfree(tmp);
+ }
+
+ release_sock(sk);
+ return ret;
+}
+
+static int tcp_bpf_push(struct sock *sk, struct sk_msg *msg, u32 apply_bytes,
+ int flags, bool uncharge)
+{
+ struct msghdr msghdr = {};
+ bool apply = apply_bytes;
+ struct scatterlist *sge;
+ struct page *page;
+ int size, ret = 0;
+ u32 off;
+
+ while (1) {
+ struct bio_vec bvec;
+ bool has_tx_ulp;
+
+ sge = sk_msg_elem(msg, msg->sg.start);
+ size = (apply && apply_bytes < sge->length) ?
+ apply_bytes : sge->length;
+ off = sge->offset;
+ page = sg_page(sge);
+
+ tcp_rate_check_app_limited(sk);
+retry:
+ msghdr.msg_flags = flags | MSG_SPLICE_PAGES;
+ has_tx_ulp = tls_sw_has_ctx_tx(sk);
+ if (has_tx_ulp)
+ msghdr.msg_flags |= MSG_SENDPAGE_NOPOLICY;
+
+ if (size < sge->length && msg->sg.start != msg->sg.end)
+ msghdr.msg_flags |= MSG_MORE;
+
+ bvec_set_page(&bvec, page, size, off);
+ iov_iter_bvec(&msghdr.msg_iter, ITER_SOURCE, &bvec, 1, size);
+ ret = tcp_sendmsg_locked(sk, &msghdr, size);
+ if (ret <= 0)
+ return ret;
+
+ if (apply)
+ apply_bytes -= ret;
+ msg->sg.size -= ret;
+ sge->offset += ret;
+ sge->length -= ret;
+ if (uncharge)
+ sk_mem_uncharge(sk, ret);
+ if (ret != size) {
+ size -= ret;
+ off += ret;
+ goto retry;
+ }
+ if (!sge->length) {
+ put_page(page);
+ sk_msg_iter_next(msg, start);
+ sg_init_table(sge, 1);
+ if (msg->sg.start == msg->sg.end)
+ break;
+ }
+ if (apply && !apply_bytes)
+ break;
+ }
+
+ return 0;
+}
+
+static int tcp_bpf_push_locked(struct sock *sk, struct sk_msg *msg,
+ u32 apply_bytes, int flags, bool uncharge)
+{
+ int ret;
+
+ lock_sock(sk);
+ ret = tcp_bpf_push(sk, msg, apply_bytes, flags, uncharge);
+ release_sock(sk);
+ return ret;
+}
+
+int tcp_bpf_sendmsg_redir(struct sock *sk, bool ingress,
+ struct sk_msg *msg, u32 bytes, int flags)
+{
+ struct sk_psock *psock = sk_psock_get(sk);
+ int ret;
+
+ if (unlikely(!psock))
+ return -EPIPE;
+
+ ret = ingress ? bpf_tcp_ingress(sk, psock, msg, bytes) :
+ tcp_bpf_push_locked(sk, msg, bytes, flags, false);
+ sk_psock_put(sk, psock);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(tcp_bpf_sendmsg_redir);
+
+#ifdef CONFIG_BPF_SYSCALL
+static int tcp_msg_wait_data(struct sock *sk, struct sk_psock *psock,
+ long timeo)
+{
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+ int ret = 0;
+
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ return 1;
+
+ if (!timeo)
+ return ret;
+
+ add_wait_queue(sk_sleep(sk), &wait);
+ sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+ ret = sk_wait_event(sk, &timeo,
+ !list_empty(&psock->ingress_msg) ||
+ !skb_queue_empty_lockless(&sk->sk_receive_queue), &wait);
+ sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+ remove_wait_queue(sk_sleep(sk), &wait);
+ return ret;
+}
+
+static bool is_next_msg_fin(struct sk_psock *psock)
+{
+ struct scatterlist *sge;
+ struct sk_msg *msg_rx;
+ int i;
+
+ msg_rx = sk_psock_peek_msg(psock);
+ i = msg_rx->sg.start;
+ sge = sk_msg_elem(msg_rx, i);
+ if (!sge->length) {
+ struct sk_buff *skb = msg_rx->skb;
+
+ if (skb && TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
+ return true;
+ }
+ return false;
+}
+
+static int tcp_bpf_recvmsg_parser(struct sock *sk,
+ struct msghdr *msg,
+ size_t len,
+ int flags,
+ int *addr_len)
+{
+ int peek = flags & MSG_PEEK;
+ struct sk_psock *psock;
+ struct tcp_sock *tcp;
+ int copied = 0;
+ u32 seq;
+
+ if (unlikely(flags & MSG_ERRQUEUE))
+ return inet_recv_error(sk, msg, len, addr_len);
+
+ if (!len)
+ return 0;
+
+ psock = sk_psock_get(sk);
+ if (unlikely(!psock))
+ return tcp_recvmsg(sk, msg, len, flags, addr_len);
+
+ lock_sock(sk);
+ tcp = tcp_sk(sk);
+ seq = tcp->copied_seq;
+ /* We may have received data on the sk_receive_queue pre-accept and
+ * then we can not use read_skb in this context because we haven't
+ * assigned a sk_socket yet so have no link to the ops. The work-around
+ * is to check the sk_receive_queue and in these cases read skbs off
+ * queue again. The read_skb hook is not running at this point because
+ * of lock_sock so we avoid having multiple runners in read_skb.
+ */
+ if (unlikely(!skb_queue_empty(&sk->sk_receive_queue))) {
+ tcp_data_ready(sk);
+ /* This handles the ENOMEM errors if we both receive data
+ * pre accept and are already under memory pressure. At least
+ * let user know to retry.
+ */
+ if (unlikely(!skb_queue_empty(&sk->sk_receive_queue))) {
+ copied = -EAGAIN;
+ goto out;
+ }
+ }
+
+msg_bytes_ready:
+ copied = sk_msg_recvmsg(sk, psock, msg, len, flags);
+ /* The typical case for EFAULT is the socket was gracefully
+ * shutdown with a FIN pkt. So check here the other case is
+ * some error on copy_page_to_iter which would be unexpected.
+ * On fin return correct return code to zero.
+ */
+ if (copied == -EFAULT) {
+ bool is_fin = is_next_msg_fin(psock);
+
+ if (is_fin) {
+ copied = 0;
+ seq++;
+ goto out;
+ }
+ }
+ seq += copied;
+ if (!copied) {
+ long timeo;
+ int data;
+
+ if (sock_flag(sk, SOCK_DONE))
+ goto out;
+
+ if (sk->sk_err) {
+ copied = sock_error(sk);
+ goto out;
+ }
+
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ goto out;
+
+ if (sk->sk_state == TCP_CLOSE) {
+ copied = -ENOTCONN;
+ goto out;
+ }
+
+ timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+ if (!timeo) {
+ copied = -EAGAIN;
+ goto out;
+ }
+
+ if (signal_pending(current)) {
+ copied = sock_intr_errno(timeo);
+ goto out;
+ }
+
+ data = tcp_msg_wait_data(sk, psock, timeo);
+ if (data < 0) {
+ copied = data;
+ goto unlock;
+ }
+ if (data && !sk_psock_queue_empty(psock))
+ goto msg_bytes_ready;
+ copied = -EAGAIN;
+ }
+out:
+ if (!peek)
+ WRITE_ONCE(tcp->copied_seq, seq);
+ tcp_rcv_space_adjust(sk);
+ if (copied > 0)
+ __tcp_cleanup_rbuf(sk, copied);
+
+unlock:
+ release_sock(sk);
+ sk_psock_put(sk, psock);
+ return copied;
+}
+
+static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+ int flags, int *addr_len)
+{
+ struct sk_psock *psock;
+ int copied, ret;
+
+ if (unlikely(flags & MSG_ERRQUEUE))
+ return inet_recv_error(sk, msg, len, addr_len);
+
+ if (!len)
+ return 0;
+
+ psock = sk_psock_get(sk);
+ if (unlikely(!psock))
+ return tcp_recvmsg(sk, msg, len, flags, addr_len);
+ if (!skb_queue_empty(&sk->sk_receive_queue) &&
+ sk_psock_queue_empty(psock)) {
+ sk_psock_put(sk, psock);
+ return tcp_recvmsg(sk, msg, len, flags, addr_len);
+ }
+ lock_sock(sk);
+msg_bytes_ready:
+ copied = sk_msg_recvmsg(sk, psock, msg, len, flags);
+ if (!copied) {
+ long timeo;
+ int data;
+
+ timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+ data = tcp_msg_wait_data(sk, psock, timeo);
+ if (data < 0) {
+ ret = data;
+ goto unlock;
+ }
+ if (data) {
+ if (!sk_psock_queue_empty(psock))
+ goto msg_bytes_ready;
+ release_sock(sk);
+ sk_psock_put(sk, psock);
+ return tcp_recvmsg(sk, msg, len, flags, addr_len);
+ }
+ copied = -EAGAIN;
+ }
+ ret = copied;
+
+unlock:
+ release_sock(sk);
+ sk_psock_put(sk, psock);
+ return ret;
+}
+
+static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock,
+ struct sk_msg *msg, int *copied, int flags)
+{
+ bool cork = false, enospc = sk_msg_full(msg), redir_ingress;
+ struct sock *sk_redir;
+ u32 tosend, origsize, sent, delta = 0;
+ u32 eval;
+ int ret;
+
+more_data:
+ if (psock->eval == __SK_NONE) {
+ /* Track delta in msg size to add/subtract it on SK_DROP from
+ * returned to user copied size. This ensures user doesn't
+ * get a positive return code with msg_cut_data and SK_DROP
+ * verdict.
+ */
+ delta = msg->sg.size;
+ psock->eval = sk_psock_msg_verdict(sk, psock, msg);
+ delta -= msg->sg.size;
+ }
+
+ if (msg->cork_bytes &&
+ msg->cork_bytes > msg->sg.size && !enospc) {
+ psock->cork_bytes = msg->cork_bytes - msg->sg.size;
+ if (!psock->cork) {
+ psock->cork = kzalloc(sizeof(*psock->cork),
+ GFP_ATOMIC | __GFP_NOWARN);
+ if (!psock->cork) {
+ sk_msg_free(sk, msg);
+ *copied = 0;
+ return -ENOMEM;
+ }
+ }
+ memcpy(psock->cork, msg, sizeof(*msg));
+ return 0;
+ }
+
+ tosend = msg->sg.size;
+ if (psock->apply_bytes && psock->apply_bytes < tosend)
+ tosend = psock->apply_bytes;
+ eval = __SK_NONE;
+
+ switch (psock->eval) {
+ case __SK_PASS:
+ ret = tcp_bpf_push(sk, msg, tosend, flags, true);
+ if (unlikely(ret)) {
+ *copied -= sk_msg_free(sk, msg);
+ break;
+ }
+ sk_msg_apply_bytes(psock, tosend);
+ break;
+ case __SK_REDIRECT:
+ redir_ingress = psock->redir_ingress;
+ sk_redir = psock->sk_redir;
+ sk_msg_apply_bytes(psock, tosend);
+ if (!psock->apply_bytes) {
+ /* Clean up before releasing the sock lock. */
+ eval = psock->eval;
+ psock->eval = __SK_NONE;
+ psock->sk_redir = NULL;
+ }
+ if (psock->cork) {
+ cork = true;
+ psock->cork = NULL;
+ }
+ release_sock(sk);
+
+ origsize = msg->sg.size;
+ ret = tcp_bpf_sendmsg_redir(sk_redir, redir_ingress,
+ msg, tosend, flags);
+ sent = origsize - msg->sg.size;
+
+ if (eval == __SK_REDIRECT)
+ sock_put(sk_redir);
+
+ lock_sock(sk);
+ sk_mem_uncharge(sk, sent);
+ if (unlikely(ret < 0)) {
+ int free = sk_msg_free(sk, msg);
+
+ if (!cork)
+ *copied -= free;
+ }
+ if (cork) {
+ sk_msg_free(sk, msg);
+ kfree(msg);
+ msg = NULL;
+ ret = 0;
+ }
+ break;
+ case __SK_DROP:
+ default:
+ sk_msg_free(sk, msg);
+ sk_msg_apply_bytes(psock, tosend);
+ *copied -= (tosend + delta);
+ return -EACCES;
+ }
+
+ if (likely(!ret)) {
+ if (!psock->apply_bytes) {
+ psock->eval = __SK_NONE;
+ if (psock->sk_redir) {
+ sock_put(psock->sk_redir);
+ psock->sk_redir = NULL;
+ }
+ }
+ if (msg &&
+ msg->sg.data[msg->sg.start].page_link &&
+ msg->sg.data[msg->sg.start].length)
+ goto more_data;
+ }
+ return ret;
+}
+
+static int tcp_bpf_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
+{
+ struct sk_msg tmp, *msg_tx = NULL;
+ int copied = 0, err = 0, ret = 0;
+ struct sk_psock *psock;
+ long timeo;
+ int flags;
+
+ /* Don't let internal flags through */
+ flags = (msg->msg_flags & ~MSG_SENDPAGE_DECRYPTED);
+ flags |= MSG_NO_SHARED_FRAGS;
+
+ psock = sk_psock_get(sk);
+ if (unlikely(!psock))
+ return tcp_sendmsg(sk, msg, size);
+
+ lock_sock(sk);
+ timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
+ while (msg_data_left(msg)) {
+ bool enospc = false;
+ u32 copy, osize;
+
+ if (sk->sk_err) {
+ err = -sk->sk_err;
+ goto out_err;
+ }
+
+ copy = msg_data_left(msg);
+ if (!sk_stream_memory_free(sk))
+ goto wait_for_sndbuf;
+ if (psock->cork) {
+ msg_tx = psock->cork;
+ } else {
+ msg_tx = &tmp;
+ sk_msg_init(msg_tx);
+ }
+
+ osize = msg_tx->sg.size;
+ err = sk_msg_alloc(sk, msg_tx, msg_tx->sg.size + copy, msg_tx->sg.end - 1);
+ if (err) {
+ if (err != -ENOSPC)
+ goto wait_for_memory;
+ enospc = true;
+ copy = msg_tx->sg.size - osize;
+ }
+
+ ret = sk_msg_memcopy_from_iter(sk, &msg->msg_iter, msg_tx,
+ copy);
+ if (ret < 0) {
+ sk_msg_trim(sk, msg_tx, osize);
+ goto out_err;
+ }
+
+ copied += ret;
+ if (psock->cork_bytes) {
+ if (size > psock->cork_bytes)
+ psock->cork_bytes = 0;
+ else
+ psock->cork_bytes -= size;
+ if (psock->cork_bytes && !enospc)
+ goto out_err;
+ /* All cork bytes are accounted, rerun the prog. */
+ psock->eval = __SK_NONE;
+ psock->cork_bytes = 0;
+ }
+
+ err = tcp_bpf_send_verdict(sk, psock, msg_tx, &copied, flags);
+ if (unlikely(err < 0))
+ goto out_err;
+ continue;
+wait_for_sndbuf:
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+wait_for_memory:
+ err = sk_stream_wait_memory(sk, &timeo);
+ if (err) {
+ if (msg_tx && msg_tx != psock->cork)
+ sk_msg_free(sk, msg_tx);
+ goto out_err;
+ }
+ }
+out_err:
+ if (err < 0)
+ err = sk_stream_error(sk, msg->msg_flags, err);
+ release_sock(sk);
+ sk_psock_put(sk, psock);
+ return copied > 0 ? copied : err;
+}
+
+enum {
+ TCP_BPF_IPV4,
+ TCP_BPF_IPV6,
+ TCP_BPF_NUM_PROTS,
+};
+
+enum {
+ TCP_BPF_BASE,
+ TCP_BPF_TX,
+ TCP_BPF_RX,
+ TCP_BPF_TXRX,
+ TCP_BPF_NUM_CFGS,
+};
+
+static struct proto *tcpv6_prot_saved __read_mostly;
+static DEFINE_SPINLOCK(tcpv6_prot_lock);
+static struct proto tcp_bpf_prots[TCP_BPF_NUM_PROTS][TCP_BPF_NUM_CFGS];
+
+static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS],
+ struct proto *base)
+{
+ prot[TCP_BPF_BASE] = *base;
+ prot[TCP_BPF_BASE].destroy = sock_map_destroy;
+ prot[TCP_BPF_BASE].close = sock_map_close;
+ prot[TCP_BPF_BASE].recvmsg = tcp_bpf_recvmsg;
+ prot[TCP_BPF_BASE].sock_is_readable = sk_msg_is_readable;
+
+ prot[TCP_BPF_TX] = prot[TCP_BPF_BASE];
+ prot[TCP_BPF_TX].sendmsg = tcp_bpf_sendmsg;
+
+ prot[TCP_BPF_RX] = prot[TCP_BPF_BASE];
+ prot[TCP_BPF_RX].recvmsg = tcp_bpf_recvmsg_parser;
+
+ prot[TCP_BPF_TXRX] = prot[TCP_BPF_TX];
+ prot[TCP_BPF_TXRX].recvmsg = tcp_bpf_recvmsg_parser;
+}
+
+static void tcp_bpf_check_v6_needs_rebuild(struct proto *ops)
+{
+ if (unlikely(ops != smp_load_acquire(&tcpv6_prot_saved))) {
+ spin_lock_bh(&tcpv6_prot_lock);
+ if (likely(ops != tcpv6_prot_saved)) {
+ tcp_bpf_rebuild_protos(tcp_bpf_prots[TCP_BPF_IPV6], ops);
+ smp_store_release(&tcpv6_prot_saved, ops);
+ }
+ spin_unlock_bh(&tcpv6_prot_lock);
+ }
+}
+
+static int __init tcp_bpf_v4_build_proto(void)
+{
+ tcp_bpf_rebuild_protos(tcp_bpf_prots[TCP_BPF_IPV4], &tcp_prot);
+ return 0;
+}
+late_initcall(tcp_bpf_v4_build_proto);
+
+static int tcp_bpf_assert_proto_ops(struct proto *ops)
+{
+ /* In order to avoid retpoline, we make assumptions when we call
+ * into ops if e.g. a psock is not present. Make sure they are
+ * indeed valid assumptions.
+ */
+ return ops->recvmsg == tcp_recvmsg &&
+ ops->sendmsg == tcp_sendmsg ? 0 : -ENOTSUPP;
+}
+
+#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
+int tcp_bpf_strp_read_sock(struct strparser *strp, read_descriptor_t *desc,
+ sk_read_actor_t recv_actor)
+{
+ struct sock *sk = strp->sk;
+ struct sk_psock *psock;
+ struct tcp_sock *tp;
+ int copied = 0;
+
+ tp = tcp_sk(sk);
+ rcu_read_lock();
+ psock = sk_psock(sk);
+ if (WARN_ON_ONCE(!psock)) {
+ desc->error = -EINVAL;
+ goto out;
+ }
+
+ psock->ingress_bytes = 0;
+ copied = tcp_read_sock_noack(sk, desc, recv_actor, true,
+ &psock->copied_seq);
+ if (copied < 0)
+ goto out;
+ /* recv_actor may redirect skb to another socket (SK_REDIRECT) or
+ * just put skb into ingress queue of current socket (SK_PASS).
+ * For SK_REDIRECT, we need to ack the frame immediately but for
+ * SK_PASS, we want to delay the ack until tcp_bpf_recvmsg_parser().
+ */
+ tp->copied_seq = psock->copied_seq - psock->ingress_bytes;
+ tcp_rcv_space_adjust(sk);
+ __tcp_cleanup_rbuf(sk, copied - psock->ingress_bytes);
+out:
+ rcu_read_unlock();
+ return copied;
+}
+#endif /* CONFIG_BPF_STREAM_PARSER */
+
+int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore)
+{
+ int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4;
+ int config = psock->progs.msg_parser ? TCP_BPF_TX : TCP_BPF_BASE;
+
+ if (psock->progs.stream_verdict || psock->progs.skb_verdict) {
+ config = (config == TCP_BPF_TX) ? TCP_BPF_TXRX : TCP_BPF_RX;
+ }
+
+ if (restore) {
+ if (inet_csk_has_ulp(sk)) {
+ /* TLS does not have an unhash proto in SW cases,
+ * but we need to ensure we stop using the sock_map
+ * unhash routine because the associated psock is being
+ * removed. So use the original unhash handler.
+ */
+ WRITE_ONCE(sk->sk_prot->unhash, psock->saved_unhash);
+ tcp_update_ulp(sk, psock->sk_proto, psock->saved_write_space);
+ } else {
+ sk->sk_write_space = psock->saved_write_space;
+ /* Pairs with lockless read in sk_clone_lock() */
+ sock_replace_proto(sk, psock->sk_proto);
+ }
+ return 0;
+ }
+
+ if (sk->sk_family == AF_INET6) {
+ if (tcp_bpf_assert_proto_ops(psock->sk_proto))
+ return -EINVAL;
+
+ tcp_bpf_check_v6_needs_rebuild(psock->sk_proto);
+ }
+
+ /* Pairs with lockless read in sk_clone_lock() */
+ sock_replace_proto(sk, &tcp_bpf_prots[family][config]);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(tcp_bpf_update_proto);
+
+/* If a child got cloned from a listening socket that had tcp_bpf
+ * protocol callbacks installed, we need to restore the callbacks to
+ * the default ones because the child does not inherit the psock state
+ * that tcp_bpf callbacks expect.
+ */
+void tcp_bpf_clone(const struct sock *sk, struct sock *newsk)
+{
+ struct proto *prot = newsk->sk_prot;
+
+ if (is_insidevar(prot, tcp_bpf_prots))
+ newsk->sk_prot = sk->sk_prot_creator;
+}
+#endif /* CONFIG_BPF_SYSCALL */
diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c
new file mode 100644
index 000000000000..fbad6c35dee9
--- /dev/null
+++ b/net/ipv4/tcp_cdg.c
@@ -0,0 +1,428 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * CAIA Delay-Gradient (CDG) congestion control
+ *
+ * This implementation is based on the paper:
+ * D.A. Hayes and G. Armitage. "Revisiting TCP congestion control using
+ * delay gradients." In IFIP Networking, pages 328-341. Springer, 2011.
+ *
+ * Scavenger traffic (Less-than-Best-Effort) should disable coexistence
+ * heuristics using parameters use_shadow=0 and use_ineff=0.
+ *
+ * Parameters window, backoff_beta, and backoff_factor are crucial for
+ * throughput and delay. Future work is needed to determine better defaults,
+ * and to provide guidelines for use in different environments/contexts.
+ *
+ * Except for window, knobs are configured via /sys/module/tcp_cdg/parameters/.
+ * Parameter window is only configurable when loading tcp_cdg as a module.
+ *
+ * Notable differences from paper/FreeBSD:
+ * o Using Hybrid Slow start and Proportional Rate Reduction.
+ * o Add toggle for shadow window mechanism. Suggested by David Hayes.
+ * o Add toggle for non-congestion loss tolerance.
+ * o Scaling parameter G is changed to a backoff factor;
+ * conversion is given by: backoff_factor = 1000/(G * window).
+ * o Limit shadow window to 2 * cwnd, or to cwnd when application limited.
+ * o More accurate e^-x.
+ */
+#include <linux/kernel.h>
+#include <linux/random.h>
+#include <linux/module.h>
+#include <linux/sched/clock.h>
+
+#include <net/tcp.h>
+
+#define HYSTART_ACK_TRAIN 1
+#define HYSTART_DELAY 2
+
+static int window __read_mostly = 8;
+static unsigned int backoff_beta __read_mostly = 0.7071 * 1024; /* sqrt 0.5 */
+static unsigned int backoff_factor __read_mostly = 42;
+static unsigned int hystart_detect __read_mostly = 3;
+static unsigned int use_ineff __read_mostly = 5;
+static bool use_shadow __read_mostly = true;
+static bool use_tolerance __read_mostly;
+
+module_param(window, int, 0444);
+MODULE_PARM_DESC(window, "gradient window size (power of two <= 256)");
+module_param(backoff_beta, uint, 0644);
+MODULE_PARM_DESC(backoff_beta, "backoff beta (0-1024)");
+module_param(backoff_factor, uint, 0644);
+MODULE_PARM_DESC(backoff_factor, "backoff probability scale factor");
+module_param(hystart_detect, uint, 0644);
+MODULE_PARM_DESC(hystart_detect, "use Hybrid Slow start "
+ "(0: disabled, 1: ACK train, 2: delay threshold, 3: both)");
+module_param(use_ineff, uint, 0644);
+MODULE_PARM_DESC(use_ineff, "use ineffectual backoff detection (threshold)");
+module_param(use_shadow, bool, 0644);
+MODULE_PARM_DESC(use_shadow, "use shadow window heuristic");
+module_param(use_tolerance, bool, 0644);
+MODULE_PARM_DESC(use_tolerance, "use loss tolerance heuristic");
+
+struct cdg_minmax {
+ union {
+ struct {
+ s32 min;
+ s32 max;
+ };
+ u64 v64;
+ };
+};
+
+enum cdg_state {
+ CDG_UNKNOWN = 0,
+ CDG_NONFULL = 1,
+ CDG_FULL = 2,
+ CDG_BACKOFF = 3,
+};
+
+struct cdg {
+ struct cdg_minmax rtt;
+ struct cdg_minmax rtt_prev;
+ struct cdg_minmax *gradients;
+ struct cdg_minmax gsum;
+ bool gfilled;
+ u8 tail;
+ u8 state;
+ u8 delack;
+ u32 rtt_seq;
+ u32 shadow_wnd;
+ u16 backoff_cnt;
+ u16 sample_cnt;
+ s32 delay_min;
+ u32 last_ack;
+ u32 round_start;
+};
+
+/**
+ * nexp_u32 - negative base-e exponential
+ * @ux: x in units of micro
+ *
+ * Returns exp(ux * -1e-6) * U32_MAX.
+ */
+static u32 __pure nexp_u32(u32 ux)
+{
+ static const u16 v[] = {
+ /* exp(-x)*65536-1 for x = 0, 0.000256, 0.000512, ... */
+ 65535,
+ 65518, 65501, 65468, 65401, 65267, 65001, 64470, 63422,
+ 61378, 57484, 50423, 38795, 22965, 8047, 987, 14,
+ };
+ u32 msb = ux >> 8;
+ u32 res;
+ int i;
+
+ /* Cut off when ux >= 2^24 (actual result is <= 222/U32_MAX). */
+ if (msb > U16_MAX)
+ return 0;
+
+ /* Scale first eight bits linearly: */
+ res = U32_MAX - (ux & 0xff) * (U32_MAX / 1000000);
+
+ /* Obtain e^(x + y + ...) by computing e^x * e^y * ...: */
+ for (i = 1; msb; i++, msb >>= 1) {
+ u32 y = v[i & -(msb & 1)] + U32_C(1);
+
+ res = ((u64)res * y) >> 16;
+ }
+
+ return res;
+}
+
+/* Based on the HyStart algorithm (by Ha et al.) that is implemented in
+ * tcp_cubic. Differences/experimental changes:
+ * o Using Hayes' delayed ACK filter.
+ * o Using a usec clock for the ACK train.
+ * o Reset ACK train when application limited.
+ * o Invoked at any cwnd (i.e. also when cwnd < 16).
+ * o Invoked only when cwnd < ssthresh (i.e. not when cwnd == ssthresh).
+ */
+static void tcp_cdg_hystart_update(struct sock *sk)
+{
+ struct cdg *ca = inet_csk_ca(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ ca->delay_min = min_not_zero(ca->delay_min, ca->rtt.min);
+ if (ca->delay_min == 0)
+ return;
+
+ if (hystart_detect & HYSTART_ACK_TRAIN) {
+ u32 now_us = tp->tcp_mstamp;
+
+ if (ca->last_ack == 0 || !tcp_is_cwnd_limited(sk)) {
+ ca->last_ack = now_us;
+ ca->round_start = now_us;
+ } else if (before(now_us, ca->last_ack + 3000)) {
+ u32 base_owd = max(ca->delay_min / 2U, 125U);
+
+ ca->last_ack = now_us;
+ if (after(now_us, ca->round_start + base_owd)) {
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPHYSTARTTRAINDETECT);
+ NET_ADD_STATS(sock_net(sk),
+ LINUX_MIB_TCPHYSTARTTRAINCWND,
+ tcp_snd_cwnd(tp));
+ tp->snd_ssthresh = tcp_snd_cwnd(tp);
+ return;
+ }
+ }
+ }
+
+ if (hystart_detect & HYSTART_DELAY) {
+ if (ca->sample_cnt < 8) {
+ ca->sample_cnt++;
+ } else {
+ s32 thresh = max(ca->delay_min + ca->delay_min / 8U,
+ 125U);
+
+ if (ca->rtt.min > thresh) {
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPHYSTARTDELAYDETECT);
+ NET_ADD_STATS(sock_net(sk),
+ LINUX_MIB_TCPHYSTARTDELAYCWND,
+ tcp_snd_cwnd(tp));
+ tp->snd_ssthresh = tcp_snd_cwnd(tp);
+ }
+ }
+ }
+}
+
+static s32 tcp_cdg_grad(struct cdg *ca)
+{
+ s32 gmin = ca->rtt.min - ca->rtt_prev.min;
+ s32 gmax = ca->rtt.max - ca->rtt_prev.max;
+ s32 grad;
+
+ if (ca->gradients) {
+ ca->gsum.min += gmin - ca->gradients[ca->tail].min;
+ ca->gsum.max += gmax - ca->gradients[ca->tail].max;
+ ca->gradients[ca->tail].min = gmin;
+ ca->gradients[ca->tail].max = gmax;
+ ca->tail = (ca->tail + 1) & (window - 1);
+ gmin = ca->gsum.min;
+ gmax = ca->gsum.max;
+ }
+
+ /* We keep sums to ignore gradients during cwnd reductions;
+ * the paper's smoothed gradients otherwise simplify to:
+ * (rtt_latest - rtt_oldest) / window.
+ *
+ * We also drop division by window here.
+ */
+ grad = gmin > 0 ? gmin : gmax;
+
+ /* Extrapolate missing values in gradient window: */
+ if (!ca->gfilled) {
+ if (!ca->gradients && window > 1)
+ grad *= window; /* Memory allocation failed. */
+ else if (ca->tail == 0)
+ ca->gfilled = true;
+ else
+ grad = (grad * window) / (int)ca->tail;
+ }
+
+ /* Backoff was effectual: */
+ if (gmin <= -32 || gmax <= -32)
+ ca->backoff_cnt = 0;
+
+ if (use_tolerance) {
+ /* Reduce small variations to zero: */
+ gmin = DIV_ROUND_CLOSEST(gmin, 64);
+ gmax = DIV_ROUND_CLOSEST(gmax, 64);
+
+ if (gmin > 0 && gmax <= 0)
+ ca->state = CDG_FULL;
+ else if ((gmin > 0 && gmax > 0) || gmax < 0)
+ ca->state = CDG_NONFULL;
+ }
+ return grad;
+}
+
+static bool tcp_cdg_backoff(struct sock *sk, u32 grad)
+{
+ struct cdg *ca = inet_csk_ca(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (get_random_u32() <= nexp_u32(grad * backoff_factor))
+ return false;
+
+ if (use_ineff) {
+ ca->backoff_cnt++;
+ if (ca->backoff_cnt > use_ineff)
+ return false;
+ }
+
+ ca->shadow_wnd = max(ca->shadow_wnd, tcp_snd_cwnd(tp));
+ ca->state = CDG_BACKOFF;
+ tcp_enter_cwr(sk);
+ return true;
+}
+
+/* Not called in CWR or Recovery state. */
+static void tcp_cdg_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+{
+ struct cdg *ca = inet_csk_ca(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 prior_snd_cwnd;
+ u32 incr;
+
+ if (tcp_in_slow_start(tp) && hystart_detect)
+ tcp_cdg_hystart_update(sk);
+
+ if (after(ack, ca->rtt_seq) && ca->rtt.v64) {
+ s32 grad = 0;
+
+ if (ca->rtt_prev.v64)
+ grad = tcp_cdg_grad(ca);
+ ca->rtt_seq = tp->snd_nxt;
+ ca->rtt_prev = ca->rtt;
+ ca->rtt.v64 = 0;
+ ca->last_ack = 0;
+ ca->sample_cnt = 0;
+
+ if (grad > 0 && tcp_cdg_backoff(sk, grad))
+ return;
+ }
+
+ if (!tcp_is_cwnd_limited(sk)) {
+ ca->shadow_wnd = min(ca->shadow_wnd, tcp_snd_cwnd(tp));
+ return;
+ }
+
+ prior_snd_cwnd = tcp_snd_cwnd(tp);
+ tcp_reno_cong_avoid(sk, ack, acked);
+
+ incr = tcp_snd_cwnd(tp) - prior_snd_cwnd;
+ ca->shadow_wnd = max(ca->shadow_wnd, ca->shadow_wnd + incr);
+}
+
+static void tcp_cdg_acked(struct sock *sk, const struct ack_sample *sample)
+{
+ struct cdg *ca = inet_csk_ca(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (sample->rtt_us <= 0)
+ return;
+
+ /* A heuristic for filtering delayed ACKs, adapted from:
+ * D.A. Hayes. "Timing enhancements to the FreeBSD kernel to support
+ * delay and rate based TCP mechanisms." TR 100219A. CAIA, 2010.
+ */
+ if (tp->sacked_out == 0) {
+ if (sample->pkts_acked == 1 && ca->delack) {
+ /* A delayed ACK is only used for the minimum if it is
+ * provenly lower than an existing non-zero minimum.
+ */
+ ca->rtt.min = min(ca->rtt.min, sample->rtt_us);
+ ca->delack--;
+ return;
+ } else if (sample->pkts_acked > 1 && ca->delack < 5) {
+ ca->delack++;
+ }
+ }
+
+ ca->rtt.min = min_not_zero(ca->rtt.min, sample->rtt_us);
+ ca->rtt.max = max(ca->rtt.max, sample->rtt_us);
+}
+
+static u32 tcp_cdg_ssthresh(struct sock *sk)
+{
+ struct cdg *ca = inet_csk_ca(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (ca->state == CDG_BACKOFF)
+ return max(2U, (tcp_snd_cwnd(tp) * min(1024U, backoff_beta)) >> 10);
+
+ if (ca->state == CDG_NONFULL && use_tolerance)
+ return tcp_snd_cwnd(tp);
+
+ ca->shadow_wnd = min(ca->shadow_wnd >> 1, tcp_snd_cwnd(tp));
+ if (use_shadow)
+ return max3(2U, ca->shadow_wnd, tcp_snd_cwnd(tp) >> 1);
+ return max(2U, tcp_snd_cwnd(tp) >> 1);
+}
+
+static void tcp_cdg_cwnd_event(struct sock *sk, const enum tcp_ca_event ev)
+{
+ struct cdg *ca = inet_csk_ca(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct cdg_minmax *gradients;
+
+ switch (ev) {
+ case CA_EVENT_CWND_RESTART:
+ gradients = ca->gradients;
+ if (gradients)
+ memset(gradients, 0, window * sizeof(gradients[0]));
+ memset(ca, 0, sizeof(*ca));
+
+ ca->gradients = gradients;
+ ca->rtt_seq = tp->snd_nxt;
+ ca->shadow_wnd = tcp_snd_cwnd(tp);
+ break;
+ case CA_EVENT_COMPLETE_CWR:
+ ca->state = CDG_UNKNOWN;
+ ca->rtt_seq = tp->snd_nxt;
+ ca->rtt_prev = ca->rtt;
+ ca->rtt.v64 = 0;
+ break;
+ default:
+ break;
+ }
+}
+
+static void tcp_cdg_init(struct sock *sk)
+{
+ struct cdg *ca = inet_csk_ca(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ ca->gradients = NULL;
+ /* We silently fall back to window = 1 if allocation fails. */
+ if (window > 1)
+ ca->gradients = kcalloc(window, sizeof(ca->gradients[0]),
+ GFP_NOWAIT);
+ ca->rtt_seq = tp->snd_nxt;
+ ca->shadow_wnd = tcp_snd_cwnd(tp);
+}
+
+static void tcp_cdg_release(struct sock *sk)
+{
+ struct cdg *ca = inet_csk_ca(sk);
+
+ kfree(ca->gradients);
+ ca->gradients = NULL;
+}
+
+static struct tcp_congestion_ops tcp_cdg __read_mostly = {
+ .cong_avoid = tcp_cdg_cong_avoid,
+ .cwnd_event = tcp_cdg_cwnd_event,
+ .pkts_acked = tcp_cdg_acked,
+ .undo_cwnd = tcp_reno_undo_cwnd,
+ .ssthresh = tcp_cdg_ssthresh,
+ .release = tcp_cdg_release,
+ .init = tcp_cdg_init,
+ .owner = THIS_MODULE,
+ .name = "cdg",
+};
+
+static int __init tcp_cdg_register(void)
+{
+ if (backoff_beta > 1024 || window < 1 || window > 256)
+ return -ERANGE;
+ if (!is_power_of_2(window))
+ return -EINVAL;
+
+ BUILD_BUG_ON(sizeof(struct cdg) > ICSK_CA_PRIV_SIZE);
+ tcp_register_congestion_control(&tcp_cdg);
+ return 0;
+}
+
+static void __exit tcp_cdg_unregister(void)
+{
+ tcp_unregister_congestion_control(&tcp_cdg);
+}
+
+module_init(tcp_cdg_register);
+module_exit(tcp_cdg_unregister);
+MODULE_AUTHOR("Kenneth Klette Jonassen");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("TCP CDG");
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 5ca7723d0798..df758adbb445 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -1,22 +1,28 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
- * Plugable TCP congestion control support and newReno
+ * Pluggable TCP congestion control support and newReno
* congestion control.
- * Based on ideas from I/O scheduler suport and Web100.
+ * Based on ideas from I/O scheduler support and Web100.
*
* Copyright (C) 2005 Stephen Hemminger <shemminger@osdl.org>
*/
+#define pr_fmt(fmt) "TCP: " fmt
+
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/types.h>
#include <linux/list.h>
+#include <linux/gfp.h>
+#include <linux/jhash.h>
#include <net/tcp.h>
+#include <trace/events/tcp.h>
static DEFINE_SPINLOCK(tcp_cong_list_lock);
static LIST_HEAD(tcp_cong_list);
/* Simple linear search, don't expect many entries! */
-static struct tcp_congestion_ops *tcp_ca_find(const char *name)
+struct tcp_congestion_ops *tcp_ca_find(const char *name)
{
struct tcp_congestion_ops *e;
@@ -28,28 +34,79 @@ static struct tcp_congestion_ops *tcp_ca_find(const char *name)
return NULL;
}
-/*
- * Attach new congestion control algorthim to the list
+void tcp_set_ca_state(struct sock *sk, const u8 ca_state)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ trace_tcp_cong_state_set(sk, ca_state);
+
+ if (icsk->icsk_ca_ops->set_state)
+ icsk->icsk_ca_ops->set_state(sk, ca_state);
+ icsk->icsk_ca_state = ca_state;
+}
+
+/* Must be called with rcu lock held */
+static struct tcp_congestion_ops *tcp_ca_find_autoload(const char *name)
+{
+ struct tcp_congestion_ops *ca = tcp_ca_find(name);
+
+#ifdef CONFIG_MODULES
+ if (!ca && capable(CAP_NET_ADMIN)) {
+ rcu_read_unlock();
+ request_module("tcp_%s", name);
+ rcu_read_lock();
+ ca = tcp_ca_find(name);
+ }
+#endif
+ return ca;
+}
+
+/* Simple linear search, not much in here. */
+struct tcp_congestion_ops *tcp_ca_find_key(u32 key)
+{
+ struct tcp_congestion_ops *e;
+
+ list_for_each_entry_rcu(e, &tcp_cong_list, list) {
+ if (e->key == key)
+ return e;
+ }
+
+ return NULL;
+}
+
+int tcp_validate_congestion_control(struct tcp_congestion_ops *ca)
+{
+ /* all algorithms must implement these */
+ if (!ca->ssthresh || !ca->undo_cwnd ||
+ !(ca->cong_avoid || ca->cong_control)) {
+ pr_err("%s does not implement required ops\n", ca->name);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/* Attach new congestion control algorithm to the list
* of available options.
*/
int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
{
- int ret = 0;
+ int ret;
- /* all algorithms must implement ssthresh and cong_avoid ops */
- if (!ca->ssthresh || !ca->cong_avoid) {
- printk(KERN_ERR "TCP %s does not implement required ops\n",
- ca->name);
- return -EINVAL;
- }
+ ret = tcp_validate_congestion_control(ca);
+ if (ret)
+ return ret;
+
+ ca->key = jhash(ca->name, sizeof(ca->name), strlen(ca->name));
spin_lock(&tcp_cong_list_lock);
- if (tcp_ca_find(ca->name)) {
- printk(KERN_NOTICE "TCP %s already registered\n", ca->name);
+ if (ca->key == TCP_CA_UNSPEC || tcp_ca_find_key(ca->key)) {
+ pr_notice("%s already registered or non-unique key\n",
+ ca->name);
ret = -EEXIST;
} else {
list_add_tail_rcu(&ca->list, &tcp_cong_list);
- printk(KERN_INFO "TCP %s registered\n", ca->name);
+ pr_debug("%s registered\n", ca->name);
}
spin_unlock(&tcp_cong_list_lock);
@@ -68,30 +125,144 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
spin_lock(&tcp_cong_list_lock);
list_del_rcu(&ca->list);
spin_unlock(&tcp_cong_list_lock);
+
+ /* Wait for outstanding readers to complete before the
+ * module gets removed entirely.
+ *
+ * A try_module_get() should fail by now as our module is
+ * in "going" state since no refs are held anymore and
+ * module_exit() handler being called.
+ */
+ synchronize_rcu();
}
EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);
-/* Assign choice of congestion control. */
-void tcp_init_congestion_control(struct sock *sk)
+/* Replace a registered old ca with a new one.
+ *
+ * The new ca must have the same name as the old one, that has been
+ * registered.
+ */
+int tcp_update_congestion_control(struct tcp_congestion_ops *ca, struct tcp_congestion_ops *old_ca)
{
- struct inet_connection_sock *icsk = inet_csk(sk);
- struct tcp_congestion_ops *ca;
+ struct tcp_congestion_ops *existing;
+ int ret = 0;
- if (icsk->icsk_ca_ops != &tcp_init_congestion_ops)
- return;
+ ca->key = jhash(ca->name, sizeof(ca->name), strlen(ca->name));
+
+ spin_lock(&tcp_cong_list_lock);
+ existing = tcp_ca_find_key(old_ca->key);
+ if (ca->key == TCP_CA_UNSPEC || !existing || strcmp(existing->name, ca->name)) {
+ pr_notice("%s not registered or non-unique key\n",
+ ca->name);
+ ret = -EINVAL;
+ } else if (existing != old_ca) {
+ pr_notice("invalid old congestion control algorithm to replace\n");
+ ret = -EINVAL;
+ } else {
+ /* Add the new one before removing the old one to keep
+ * one implementation available all the time.
+ */
+ list_add_tail_rcu(&ca->list, &tcp_cong_list);
+ list_del_rcu(&existing->list);
+ pr_debug("%s updated\n", ca->name);
+ }
+ spin_unlock(&tcp_cong_list_lock);
+
+ /* Wait for outstanding readers to complete before the
+ * module or struct_ops gets removed entirely.
+ */
+ if (!ret)
+ synchronize_rcu();
+
+ return ret;
+}
+
+u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca)
+{
+ const struct tcp_congestion_ops *ca;
+ u32 key = TCP_CA_UNSPEC;
+
+ might_sleep();
rcu_read_lock();
- list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
- if (try_module_get(ca->owner)) {
- icsk->icsk_ca_ops = ca;
- break;
- }
+ ca = tcp_ca_find_autoload(name);
+ if (ca) {
+ key = ca->key;
+ *ecn_ca = ca->flags & TCP_CONG_NEEDS_ECN;
+ }
+ rcu_read_unlock();
+ return key;
+}
+
+char *tcp_ca_get_name_by_key(u32 key, char *buffer)
+{
+ const struct tcp_congestion_ops *ca;
+ char *ret = NULL;
+
+ rcu_read_lock();
+ ca = tcp_ca_find_key(key);
+ if (ca) {
+ strscpy(buffer, ca->name, TCP_CA_NAME_MAX);
+ ret = buffer;
}
rcu_read_unlock();
+ return ret;
+}
+
+/* Assign choice of congestion control. */
+void tcp_assign_congestion_control(struct sock *sk)
+{
+ struct net *net = sock_net(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ const struct tcp_congestion_ops *ca;
+
+ rcu_read_lock();
+ ca = rcu_dereference(net->ipv4.tcp_congestion_control);
+ if (unlikely(!bpf_try_module_get(ca, ca->owner)))
+ ca = &tcp_reno;
+ icsk->icsk_ca_ops = ca;
+ rcu_read_unlock();
+
+ memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
+ if (ca->flags & TCP_CONG_NEEDS_ECN)
+ INET_ECN_xmit(sk);
+ else
+ INET_ECN_dontxmit(sk);
+}
+
+void tcp_init_congestion_control(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ tcp_sk(sk)->prior_ssthresh = 0;
if (icsk->icsk_ca_ops->init)
icsk->icsk_ca_ops->init(sk);
+ if (tcp_ca_needs_ecn(sk))
+ INET_ECN_xmit(sk);
+ else
+ INET_ECN_dontxmit(sk);
+ icsk->icsk_ca_initialized = 1;
+}
+
+static void tcp_reinit_congestion_control(struct sock *sk,
+ const struct tcp_congestion_ops *ca)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ tcp_cleanup_congestion_control(sk);
+ icsk->icsk_ca_ops = ca;
+ icsk->icsk_ca_setsockopt = 1;
+ memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
+
+ if (ca->flags & TCP_CONG_NEEDS_ECN)
+ INET_ECN_xmit(sk);
+ else
+ INET_ECN_dontxmit(sk);
+
+ if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
+ tcp_init_congestion_control(sk);
}
/* Manage refcounts on socket close. */
@@ -99,35 +270,38 @@ void tcp_cleanup_congestion_control(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
- if (icsk->icsk_ca_ops->release)
+ if (icsk->icsk_ca_initialized && icsk->icsk_ca_ops->release)
icsk->icsk_ca_ops->release(sk);
- module_put(icsk->icsk_ca_ops->owner);
+ icsk->icsk_ca_initialized = 0;
+ bpf_module_put(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner);
}
/* Used by sysctl to change default congestion control */
-int tcp_set_default_congestion_control(const char *name)
+int tcp_set_default_congestion_control(struct net *net, const char *name)
{
struct tcp_congestion_ops *ca;
- int ret = -ENOENT;
-
- spin_lock(&tcp_cong_list_lock);
- ca = tcp_ca_find(name);
-#ifdef CONFIG_KMOD
- if (!ca && capable(CAP_SYS_MODULE)) {
- spin_unlock(&tcp_cong_list_lock);
+ const struct tcp_congestion_ops *prev;
+ int ret;
- request_module("tcp_%s", name);
- spin_lock(&tcp_cong_list_lock);
- ca = tcp_ca_find(name);
- }
-#endif
+ rcu_read_lock();
+ ca = tcp_ca_find_autoload(name);
+ if (!ca) {
+ ret = -ENOENT;
+ } else if (!bpf_try_module_get(ca, ca->owner)) {
+ ret = -EBUSY;
+ } else if (!net_eq(net, &init_net) &&
+ !(ca->flags & TCP_CONG_NON_RESTRICTED)) {
+ /* Only init netns can set default to a restricted algorithm */
+ ret = -EPERM;
+ } else {
+ prev = xchg(&net->ipv4.tcp_congestion_control, ca);
+ if (prev)
+ bpf_module_put(prev, prev->owner);
- if (ca) {
- ca->non_restricted = 1; /* default is always allowed */
- list_move(&ca->list, &tcp_cong_list);
+ ca->flags |= TCP_CONG_NON_RESTRICTED;
ret = 0;
}
- spin_unlock(&tcp_cong_list_lock);
+ rcu_read_unlock();
return ret;
}
@@ -135,11 +309,11 @@ int tcp_set_default_congestion_control(const char *name)
/* Set default value from kernel configuration at bootup */
static int __init tcp_congestion_default(void)
{
- return tcp_set_default_congestion_control(CONFIG_DEFAULT_TCP_CONG);
+ return tcp_set_default_congestion_control(&init_net,
+ CONFIG_DEFAULT_TCP_CONG);
}
late_initcall(tcp_congestion_default);
-
/* Build string with list of available congestion control values */
void tcp_get_available_congestion_control(char *buf, size_t maxlen)
{
@@ -152,20 +326,20 @@ void tcp_get_available_congestion_control(char *buf, size_t maxlen)
"%s%s",
offs == 0 ? "" : " ", ca->name);
+ if (WARN_ON_ONCE(offs >= maxlen))
+ break;
}
rcu_read_unlock();
}
/* Get current default congestion control */
-void tcp_get_default_congestion_control(char *name)
+void tcp_get_default_congestion_control(struct net *net, char *name)
{
- struct tcp_congestion_ops *ca;
- /* We will always have reno... */
- BUG_ON(list_empty(&tcp_cong_list));
+ const struct tcp_congestion_ops *ca;
rcu_read_lock();
- ca = list_entry(tcp_cong_list.next, struct tcp_congestion_ops, list);
- strncpy(name, ca->name, TCP_CA_NAME_MAX);
+ ca = rcu_dereference(net->ipv4.tcp_congestion_control);
+ strscpy(name, ca->name, TCP_CA_NAME_MAX);
rcu_read_unlock();
}
@@ -178,12 +352,14 @@ void tcp_get_allowed_congestion_control(char *buf, size_t maxlen)
*buf = '\0';
rcu_read_lock();
list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
- if (!ca->non_restricted)
+ if (!(ca->flags & TCP_CONG_NON_RESTRICTED))
continue;
offs += snprintf(buf + offs, maxlen - offs,
"%s%s",
offs == 0 ? "" : " ", ca->name);
+ if (WARN_ON_ONCE(offs >= maxlen))
+ break;
}
rcu_read_unlock();
}
@@ -192,10 +368,10 @@ void tcp_get_allowed_congestion_control(char *buf, size_t maxlen)
int tcp_set_allowed_congestion_control(char *val)
{
struct tcp_congestion_ops *ca;
- char *clone, *name;
+ char *saved_clone, *clone, *name;
int ret = 0;
- clone = kstrdup(val, GFP_USER);
+ saved_clone = clone = kstrdup(val, GFP_USER);
if (!clone)
return -ENOMEM;
@@ -209,94 +385,106 @@ int tcp_set_allowed_congestion_control(char *val)
}
}
- /* pass 2 clear */
+ /* pass 2 clear old values */
list_for_each_entry_rcu(ca, &tcp_cong_list, list)
- ca->non_restricted = 0;
+ ca->flags &= ~TCP_CONG_NON_RESTRICTED;
/* pass 3 mark as allowed */
while ((name = strsep(&val, " ")) && *name) {
ca = tcp_ca_find(name);
WARN_ON(!ca);
if (ca)
- ca->non_restricted = 1;
+ ca->flags |= TCP_CONG_NON_RESTRICTED;
}
out:
spin_unlock(&tcp_cong_list_lock);
+ kfree(saved_clone);
return ret;
}
-
-/* Change congestion control for socket */
-int tcp_set_congestion_control(struct sock *sk, const char *name)
+/* Change congestion control for socket. If load is false, then it is the
+ * responsibility of the caller to call tcp_init_congestion_control or
+ * tcp_reinit_congestion_control (if the current congestion control was
+ * already initialized.
+ */
+int tcp_set_congestion_control(struct sock *sk, const char *name, bool load,
+ bool cap_net_admin)
{
struct inet_connection_sock *icsk = inet_csk(sk);
- struct tcp_congestion_ops *ca;
+ const struct tcp_congestion_ops *ca;
int err = 0;
- rcu_read_lock();
- ca = tcp_ca_find(name);
- /* no change asking for existing value */
- if (ca == icsk->icsk_ca_ops)
- goto out;
+ if (icsk->icsk_ca_dst_locked)
+ return -EPERM;
-#ifdef CONFIG_KMOD
- /* not found attempt to autoload module */
- if (!ca && capable(CAP_SYS_MODULE)) {
- rcu_read_unlock();
- request_module("tcp_%s", name);
- rcu_read_lock();
+ rcu_read_lock();
+ if (!load)
ca = tcp_ca_find(name);
+ else
+ ca = tcp_ca_find_autoload(name);
+
+ /* No change asking for existing value */
+ if (ca == icsk->icsk_ca_ops) {
+ icsk->icsk_ca_setsockopt = 1;
+ goto out;
}
-#endif
+
if (!ca)
err = -ENOENT;
-
- else if (!(ca->non_restricted || capable(CAP_NET_ADMIN)))
+ else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || cap_net_admin))
err = -EPERM;
-
- else if (!try_module_get(ca->owner))
+ else if (!bpf_try_module_get(ca, ca->owner))
err = -EBUSY;
-
- else {
- tcp_cleanup_congestion_control(sk);
- icsk->icsk_ca_ops = ca;
- if (icsk->icsk_ca_ops->init)
- icsk->icsk_ca_ops->init(sk);
- }
+ else
+ tcp_reinit_congestion_control(sk, ca);
out:
rcu_read_unlock();
return err;
}
-
-/*
- * Linear increase during slow start
+/* Slow start is used when congestion window is no greater than the slow start
+ * threshold. We base on RFC2581 and also handle stretch ACKs properly.
+ * We do not implement RFC3465 Appropriate Byte Counting (ABC) per se but
+ * something better;) a packet is only considered (s)acked in its entirety to
+ * defend the ACK attacks described in the RFC. Slow start processes a stretch
+ * ACK of degree N as if N acks of degree 1 are received back to back except
+ * ABC caps N to 2. Slow start exits when cwnd grows over ssthresh and
+ * returns the leftover acks to adjust cwnd in congestion avoidance mode.
*/
-void tcp_slow_start(struct tcp_sock *tp)
+__bpf_kfunc u32 tcp_slow_start(struct tcp_sock *tp, u32 acked)
{
- if (sysctl_tcp_abc) {
- /* RFC3465: Slow Start
- * TCP sender SHOULD increase cwnd by the number of
- * previously unacknowledged bytes ACKed by each incoming
- * acknowledgment, provided the increase is not more than L
- */
- if (tp->bytes_acked < tp->mss_cache)
- return;
+ u32 cwnd = min(tcp_snd_cwnd(tp) + acked, tp->snd_ssthresh);
- /* We MAY increase by 2 if discovered delayed ack */
- if (sysctl_tcp_abc > 1 && tp->bytes_acked >= 2*tp->mss_cache) {
- if (tp->snd_cwnd < tp->snd_cwnd_clamp)
- tp->snd_cwnd++;
- }
- }
- tp->bytes_acked = 0;
+ acked -= cwnd - tcp_snd_cwnd(tp);
+ tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp));
- if (tp->snd_cwnd < tp->snd_cwnd_clamp)
- tp->snd_cwnd++;
+ return acked;
}
EXPORT_SYMBOL_GPL(tcp_slow_start);
+/* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w),
+ * for every packet that was ACKed.
+ */
+__bpf_kfunc void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked)
+{
+ /* If credits accumulated at a higher w, apply them gently now. */
+ if (tp->snd_cwnd_cnt >= w) {
+ tp->snd_cwnd_cnt = 0;
+ tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + 1);
+ }
+
+ tp->snd_cwnd_cnt += acked;
+ if (tp->snd_cwnd_cnt >= w) {
+ u32 delta = tp->snd_cwnd_cnt / w;
+
+ tp->snd_cwnd_cnt -= delta * w;
+ tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + delta);
+ }
+ tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), tp->snd_cwnd_clamp));
+}
+EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai);
+
/*
* TCP Reno congestion control
* This is special case used for fallback as well.
@@ -304,74 +492,46 @@ EXPORT_SYMBOL_GPL(tcp_slow_start);
/* This is Jacobson's slow start and congestion avoidance.
* SIGCOMM '88, p. 328.
*/
-void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight,
- int flag)
+__bpf_kfunc void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct tcp_sock *tp = tcp_sk(sk);
- if (!tcp_is_cwnd_limited(sk, in_flight))
+ if (!tcp_is_cwnd_limited(sk))
return;
/* In "safe" area, increase. */
- if (tp->snd_cwnd <= tp->snd_ssthresh)
- tcp_slow_start(tp);
-
- /* In dangerous area, increase slowly. */
- else if (sysctl_tcp_abc) {
- /* RFC3465: Appropriate Byte Count
- * increase once for each full cwnd acked
- */
- if (tp->bytes_acked >= tp->snd_cwnd*tp->mss_cache) {
- tp->bytes_acked -= tp->snd_cwnd*tp->mss_cache;
- if (tp->snd_cwnd < tp->snd_cwnd_clamp)
- tp->snd_cwnd++;
- }
- } else {
- /* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd */
- if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
- if (tp->snd_cwnd < tp->snd_cwnd_clamp)
- tp->snd_cwnd++;
- tp->snd_cwnd_cnt = 0;
- } else
- tp->snd_cwnd_cnt++;
- }
+ if (tcp_in_slow_start(tp)) {
+ acked = tcp_slow_start(tp, acked);
+ if (!acked)
+ return;
+ }
+ /* In dangerous area, increase slowly. */
+ tcp_cong_avoid_ai(tp, tcp_snd_cwnd(tp), acked);
}
EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid);
/* Slow start threshold is half the congestion window (min 2) */
-u32 tcp_reno_ssthresh(struct sock *sk)
+__bpf_kfunc u32 tcp_reno_ssthresh(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
- return max(tp->snd_cwnd >> 1U, 2U);
+
+ return max(tcp_snd_cwnd(tp) >> 1U, 2U);
}
EXPORT_SYMBOL_GPL(tcp_reno_ssthresh);
-/* Lower bound on congestion window with halving. */
-u32 tcp_reno_min_cwnd(const struct sock *sk)
+__bpf_kfunc u32 tcp_reno_undo_cwnd(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
- return tp->snd_ssthresh/2;
+
+ return max(tcp_snd_cwnd(tp), tp->prior_cwnd);
}
-EXPORT_SYMBOL_GPL(tcp_reno_min_cwnd);
+EXPORT_SYMBOL_GPL(tcp_reno_undo_cwnd);
struct tcp_congestion_ops tcp_reno = {
+ .flags = TCP_CONG_NON_RESTRICTED,
.name = "reno",
- .non_restricted = 1,
- .owner = THIS_MODULE,
- .ssthresh = tcp_reno_ssthresh,
- .cong_avoid = tcp_reno_cong_avoid,
- .min_cwnd = tcp_reno_min_cwnd,
-};
-
-/* Initial congestion control used (until SYN)
- * really reno under another name so we can tell difference
- * during tcp_set_default_congestion_control
- */
-struct tcp_congestion_ops tcp_init_congestion_ops = {
- .name = "",
.owner = THIS_MODULE,
.ssthresh = tcp_reno_ssthresh,
.cong_avoid = tcp_reno_cong_avoid,
- .min_cwnd = tcp_reno_min_cwnd,
+ .undo_cwnd = tcp_reno_undo_cwnd,
};
-EXPORT_SYMBOL_GPL(tcp_init_congestion_ops);
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index 6ad184802266..76c23675ae50 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -1,48 +1,70 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
- * TCP CUBIC: Binary Increase Congestion control for TCP v2.0
- *
+ * TCP CUBIC: Binary Increase Congestion control for TCP v2.3
+ * Home page:
+ * http://netsrv.csc.ncsu.edu/twiki/bin/view/Main/BIC
* This is from the implementation of CUBIC TCP in
- * Injong Rhee, Lisong Xu.
- * "CUBIC: A New TCP-Friendly High-Speed TCP Variant
- * in PFLDnet 2005
+ * Sangtae Ha, Injong Rhee and Lisong Xu,
+ * "CUBIC: A New TCP-Friendly High-Speed TCP Variant"
+ * in ACM SIGOPS Operating System Review, July 2008.
+ * Available from:
+ * http://netsrv.csc.ncsu.edu/export/cubic_a_new_tcp_2008.pdf
+ *
+ * CUBIC integrates a new slow start algorithm, called HyStart.
+ * The details of HyStart are presented in
+ * Sangtae Ha and Injong Rhee,
+ * "Taming the Elephants: New TCP Slow Start", NCSU TechReport 2008.
* Available from:
- * http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/cubic-paper.pdf
+ * http://netsrv.csc.ncsu.edu/export/hystart_techreport_2008.pdf
+ *
+ * All testing results are available from:
+ * http://netsrv.csc.ncsu.edu/wiki/index.php/TCP_Testing
*
* Unless CUBIC is enabled and congestion window is large
* this behaves the same as the original Reno.
*/
#include <linux/mm.h>
+#include <linux/btf.h>
+#include <linux/btf_ids.h>
#include <linux/module.h>
+#include <linux/math64.h>
#include <net/tcp.h>
-#include <asm/div64.h>
#define BICTCP_BETA_SCALE 1024 /* Scale factor beta calculation
* max_cwnd = snd_cwnd * beta
*/
-#define BICTCP_B 4 /*
- * In binary search,
- * go to point (max+min)/N
- */
#define BICTCP_HZ 10 /* BIC HZ 2^10 = 1024 */
-static int fast_convergence = 1;
-static int max_increment = 16;
-static int beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */
-static int initial_ssthresh = 100;
-static int bic_scale = 41;
-static int tcp_friendliness = 1;
+/* Two methods of hybrid slow start */
+#define HYSTART_ACK_TRAIN 0x1
+#define HYSTART_DELAY 0x2
+
+/* Number of delay samples for detecting the increase of delay */
+#define HYSTART_MIN_SAMPLES 8
+#define HYSTART_DELAY_MIN (4000U) /* 4 ms */
+#define HYSTART_DELAY_MAX (16000U) /* 16 ms */
+#define HYSTART_DELAY_THRESH(x) clamp(x, HYSTART_DELAY_MIN, HYSTART_DELAY_MAX)
+
+static int fast_convergence __read_mostly = 1;
+static int beta __read_mostly = 717; /* = 717/1024 (BICTCP_BETA_SCALE) */
+static int initial_ssthresh __read_mostly;
+static int bic_scale __read_mostly = 41;
+static int tcp_friendliness __read_mostly = 1;
+
+static int hystart __read_mostly = 1;
+static int hystart_detect __read_mostly = HYSTART_ACK_TRAIN | HYSTART_DELAY;
+static int hystart_low_window __read_mostly = 16;
+static int hystart_ack_delta_us __read_mostly = 2000;
-static u32 cube_rtt_scale;
-static u32 beta_scale;
-static u64 cube_factor;
+static u32 cube_rtt_scale __read_mostly;
+static u32 beta_scale __read_mostly;
+static u64 cube_factor __read_mostly;
/* Note parameters that are used for precomputing scale factors are read-only */
module_param(fast_convergence, int, 0644);
MODULE_PARM_DESC(fast_convergence, "turn on/off fast convergence");
-module_param(max_increment, int, 0644);
-MODULE_PARM_DESC(max_increment, "Limit on increment allowed during binary search");
-module_param(beta, int, 0444);
+module_param(beta, int, 0644);
MODULE_PARM_DESC(beta, "beta for multiplicative increase");
module_param(initial_ssthresh, int, 0644);
MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold");
@@ -50,116 +72,169 @@ module_param(bic_scale, int, 0444);
MODULE_PARM_DESC(bic_scale, "scale (scaled by 1024) value for bic function (bic_scale/1024)");
module_param(tcp_friendliness, int, 0644);
MODULE_PARM_DESC(tcp_friendliness, "turn on/off tcp friendliness");
-
-#include <asm/div64.h>
+module_param(hystart, int, 0644);
+MODULE_PARM_DESC(hystart, "turn on/off hybrid slow start algorithm");
+module_param(hystart_detect, int, 0644);
+MODULE_PARM_DESC(hystart_detect, "hybrid slow start detection mechanisms"
+ " 1: packet-train 2: delay 3: both packet-train and delay");
+module_param(hystart_low_window, int, 0644);
+MODULE_PARM_DESC(hystart_low_window, "lower bound cwnd for hybrid slow start");
+module_param(hystart_ack_delta_us, int, 0644);
+MODULE_PARM_DESC(hystart_ack_delta_us, "spacing between ack's indicating train (usecs)");
/* BIC TCP Parameters */
struct bictcp {
u32 cnt; /* increase cwnd by 1 after ACKs */
- u32 last_max_cwnd; /* last maximum snd_cwnd */
- u32 loss_cwnd; /* congestion window at last loss */
+ u32 last_max_cwnd; /* last maximum snd_cwnd */
u32 last_cwnd; /* the last snd_cwnd */
u32 last_time; /* time when updated last_cwnd */
u32 bic_origin_point;/* origin point of bic function */
- u32 bic_K; /* time to origin point from the beginning of the current epoch */
- u32 delay_min; /* min delay */
+ u32 bic_K; /* time to origin point
+ from the beginning of the current epoch */
+ u32 delay_min; /* min delay (usec) */
u32 epoch_start; /* beginning of an epoch */
u32 ack_cnt; /* number of acks */
u32 tcp_cwnd; /* estimated tcp cwnd */
-#define ACK_RATIO_SHIFT 4
- u32 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */
+ u16 unused;
+ u8 sample_cnt; /* number of samples to decide curr_rtt */
+ u8 found; /* the exit point is found? */
+ u32 round_start; /* beginning of each round */
+ u32 end_seq; /* end_seq of the round */
+ u32 last_ack; /* last time when the ACK spacing is close */
+ u32 curr_rtt; /* the minimum rtt of current round */
};
static inline void bictcp_reset(struct bictcp *ca)
{
- ca->cnt = 0;
- ca->last_max_cwnd = 0;
- ca->loss_cwnd = 0;
- ca->last_cwnd = 0;
- ca->last_time = 0;
- ca->bic_origin_point = 0;
- ca->bic_K = 0;
- ca->delay_min = 0;
- ca->epoch_start = 0;
- ca->delayed_ack = 2 << ACK_RATIO_SHIFT;
- ca->ack_cnt = 0;
- ca->tcp_cwnd = 0;
+ memset(ca, 0, offsetof(struct bictcp, unused));
+ ca->found = 0;
}
-static void bictcp_init(struct sock *sk)
+static inline u32 bictcp_clock_us(const struct sock *sk)
{
- bictcp_reset(inet_csk_ca(sk));
- if (initial_ssthresh)
- tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
+ return tcp_sk(sk)->tcp_mstamp;
}
-/* 64bit divisor, dividend and result. dynamic precision */
-static inline u_int64_t div64_64(u_int64_t dividend, u_int64_t divisor)
+static inline void bictcp_hystart_reset(struct sock *sk)
{
- u_int32_t d = divisor;
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bictcp *ca = inet_csk_ca(sk);
- if (divisor > 0xffffffffULL) {
- unsigned int shift = fls(divisor >> 32);
+ ca->round_start = ca->last_ack = bictcp_clock_us(sk);
+ ca->end_seq = tp->snd_nxt;
+ ca->curr_rtt = ~0U;
+ ca->sample_cnt = 0;
+}
- d = divisor >> shift;
- dividend >>= shift;
- }
+__bpf_kfunc static void cubictcp_init(struct sock *sk)
+{
+ struct bictcp *ca = inet_csk_ca(sk);
- /* avoid 64 bit division if possible */
- if (dividend >> 32)
- do_div(dividend, d);
- else
- dividend = (uint32_t) dividend / d;
+ bictcp_reset(ca);
- return dividend;
+ if (hystart)
+ bictcp_hystart_reset(sk);
+
+ if (!hystart && initial_ssthresh)
+ tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
}
-/*
- * calculate the cubic root of x using Newton-Raphson
+__bpf_kfunc static void cubictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event)
+{
+ if (event == CA_EVENT_TX_START) {
+ struct bictcp *ca = inet_csk_ca(sk);
+ u32 now = tcp_jiffies32;
+ s32 delta;
+
+ delta = now - tcp_sk(sk)->lsndtime;
+
+ /* We were application limited (idle) for a while.
+ * Shift epoch_start to keep cwnd growth to cubic curve.
+ */
+ if (ca->epoch_start && delta > 0) {
+ ca->epoch_start += delta;
+ if (after(ca->epoch_start, now))
+ ca->epoch_start = now;
+ }
+ return;
+ }
+}
+
+/* calculate the cubic root of x using a table lookup followed by one
+ * Newton-Raphson iteration.
+ * Avg err ~= 0.195%
*/
static u32 cubic_root(u64 a)
{
- u32 x, x1;
-
- /* Initial estimate is based on:
- * cbrt(x) = exp(log(x) / 3)
+ u32 x, b, shift;
+ /*
+ * cbrt(x) MSB values for x MSB values in [0..63].
+ * Precomputed then refined by hand - Willy Tarreau
+ *
+ * For x in [0..63],
+ * v = cbrt(x << 18) - 1
+ * cbrt(x) = (v[x] + 10) >> 6
*/
- x = 1u << (fls64(a)/3);
+ static const u8 v[] = {
+ /* 0x00 */ 0, 54, 54, 54, 118, 118, 118, 118,
+ /* 0x08 */ 123, 129, 134, 138, 143, 147, 151, 156,
+ /* 0x10 */ 157, 161, 164, 168, 170, 173, 176, 179,
+ /* 0x18 */ 181, 185, 187, 190, 192, 194, 197, 199,
+ /* 0x20 */ 200, 202, 204, 206, 209, 211, 213, 215,
+ /* 0x28 */ 217, 219, 221, 222, 224, 225, 227, 229,
+ /* 0x30 */ 231, 232, 234, 236, 237, 239, 240, 242,
+ /* 0x38 */ 244, 245, 246, 248, 250, 251, 252, 254,
+ };
+
+ b = fls64(a);
+ if (b < 7) {
+ /* a in [0..63] */
+ return ((u32)v[(u32)a] + 35) >> 6;
+ }
+
+ b = ((b * 84) >> 8) - 1;
+ shift = (a >> (b * 3));
+
+ x = ((u32)(((u32)v[shift] + 10) << b)) >> 6;
/*
- * Iteration based on:
+ * Newton-Raphson iteration
* 2
* x = ( 2 * x + a / x ) / 3
* k+1 k k
*/
- do {
- x1 = x;
- x = (2 * x + (uint32_t) div64_64(a, x*x)) / 3;
- } while (abs(x1 - x) > 1);
-
+ x = (2 * x + (u32)div64_u64(a, (u64)x * (u64)(x - 1)));
+ x = ((x * 341) >> 10);
return x;
}
/*
* Compute congestion window to use.
*/
-static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
+static inline void bictcp_update(struct bictcp *ca, u32 cwnd, u32 acked)
{
- u64 offs;
- u32 delta, t, bic_target, min_cnt, max_cnt;
+ u32 delta, bic_target, max_cnt;
+ u64 offs, t;
- ca->ack_cnt++; /* count the number of ACKs */
+ ca->ack_cnt += acked; /* count the number of ACKed packets */
if (ca->last_cwnd == cwnd &&
- (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32)
+ (s32)(tcp_jiffies32 - ca->last_time) <= HZ / 32)
return;
+ /* The CUBIC function can update ca->cnt at most once per jiffy.
+ * On all cwnd reduction events, ca->epoch_start is set to 0,
+ * which will force a recalculation of ca->cnt.
+ */
+ if (ca->epoch_start && tcp_jiffies32 == ca->last_time)
+ goto tcp_friendliness;
+
ca->last_cwnd = cwnd;
- ca->last_time = tcp_time_stamp;
+ ca->last_time = tcp_jiffies32;
if (ca->epoch_start == 0) {
- ca->epoch_start = tcp_time_stamp; /* record the beginning of an epoch */
- ca->ack_cnt = 1; /* start counting */
+ ca->epoch_start = tcp_jiffies32; /* record beginning */
+ ca->ack_cnt = acked; /* start counting */
ca->tcp_cwnd = cwnd; /* syn with cubic */
if (ca->last_max_cwnd <= cwnd) {
@@ -175,130 +250,95 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
}
}
- /* cubic function - calc*/
- /* calculate c * time^3 / rtt,
- * while considering overflow in calculation of time^3
+ /* cubic function - calc*/
+ /* calculate c * time^3 / rtt,
+ * while considering overflow in calculation of time^3
* (so time^3 is done by using 64 bit)
* and without the support of division of 64bit numbers
* (so all divisions are done by using 32 bit)
- * also NOTE the unit of those veriables
- * time = (t - K) / 2^bictcp_HZ
- * c = bic_scale >> 10
+ * also NOTE the unit of those veriables
+ * time = (t - K) / 2^bictcp_HZ
+ * c = bic_scale >> 10
* rtt = (srtt >> 3) / HZ
* !!! The following code does not have overflow problems,
* if the cwnd < 1 million packets !!!
- */
+ */
+ t = (s32)(tcp_jiffies32 - ca->epoch_start);
+ t += usecs_to_jiffies(ca->delay_min);
/* change the unit from HZ to bictcp_HZ */
- t = ((tcp_time_stamp + (ca->delay_min>>3) - ca->epoch_start)
- << BICTCP_HZ) / HZ;
+ t <<= BICTCP_HZ;
+ do_div(t, HZ);
- if (t < ca->bic_K) /* t - K */
+ if (t < ca->bic_K) /* t - K */
offs = ca->bic_K - t;
- else
- offs = t - ca->bic_K;
+ else
+ offs = t - ca->bic_K;
/* c/rtt * (t-K)^3 */
delta = (cube_rtt_scale * offs * offs * offs) >> (10+3*BICTCP_HZ);
- if (t < ca->bic_K) /* below origin*/
- bic_target = ca->bic_origin_point - delta;
- else /* above origin*/
- bic_target = ca->bic_origin_point + delta;
+ if (t < ca->bic_K) /* below origin*/
+ bic_target = ca->bic_origin_point - delta;
+ else /* above origin*/
+ bic_target = ca->bic_origin_point + delta;
- /* cubic function - calc bictcp_cnt*/
- if (bic_target > cwnd) {
+ /* cubic function - calc bictcp_cnt*/
+ if (bic_target > cwnd) {
ca->cnt = cwnd / (bic_target - cwnd);
- } else {
- ca->cnt = 100 * cwnd; /* very small increment*/
- }
-
- if (ca->delay_min > 0) {
- /* max increment = Smax * rtt / 0.1 */
- min_cnt = (cwnd * HZ * 8)/(10 * max_increment * ca->delay_min);
- if (ca->cnt < min_cnt)
- ca->cnt = min_cnt;
+ } else {
+ ca->cnt = 100 * cwnd; /* very small increment*/
}
- /* slow start and low utilization */
- if (ca->loss_cwnd == 0) /* could be aggressive in slow start */
- ca->cnt = 50;
+ /*
+ * The initial growth of cubic function may be too conservative
+ * when the available bandwidth is still unknown.
+ */
+ if (ca->last_max_cwnd == 0 && ca->cnt > 20)
+ ca->cnt = 20; /* increase cwnd 5% per RTT */
+tcp_friendliness:
/* TCP Friendly */
if (tcp_friendliness) {
u32 scale = beta_scale;
+
delta = (cwnd * scale) >> 3;
- while (ca->ack_cnt > delta) { /* update tcp cwnd */
- ca->ack_cnt -= delta;
- ca->tcp_cwnd++;
+ while (ca->ack_cnt > delta) { /* update tcp cwnd */
+ ca->ack_cnt -= delta;
+ ca->tcp_cwnd++;
}
- if (ca->tcp_cwnd > cwnd){ /* if bic is slower than tcp */
+ if (ca->tcp_cwnd > cwnd) { /* if bic is slower than tcp */
delta = ca->tcp_cwnd - cwnd;
max_cnt = cwnd / delta;
if (ca->cnt > max_cnt)
ca->cnt = max_cnt;
}
- }
-
- ca->cnt = (ca->cnt << ACK_RATIO_SHIFT) / ca->delayed_ack;
- if (ca->cnt == 0) /* cannot be zero */
- ca->cnt = 1;
-}
-
-
-/* Keep track of minimum rtt */
-static inline void measure_delay(struct sock *sk)
-{
- const struct tcp_sock *tp = tcp_sk(sk);
- struct bictcp *ca = inet_csk_ca(sk);
- u32 delay;
-
- /* No time stamp */
- if (!(tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) ||
- /* Discard delay samples right after fast recovery */
- (s32)(tcp_time_stamp - ca->epoch_start) < HZ)
- return;
-
- delay = (tcp_time_stamp - tp->rx_opt.rcv_tsecr)<<3;
- if (delay == 0)
- delay = 1;
+ }
- /* first time call or link delay decreases */
- if (ca->delay_min == 0 || ca->delay_min > delay)
- ca->delay_min = delay;
+ /* The maximum rate of cwnd increase CUBIC allows is 1 packet per
+ * 2 packets ACKed, meaning cwnd grows at 1.5x per RTT.
+ */
+ ca->cnt = max(ca->cnt, 2U);
}
-static void bictcp_cong_avoid(struct sock *sk, u32 ack,
- u32 seq_rtt, u32 in_flight, int data_acked)
+__bpf_kfunc static void cubictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct tcp_sock *tp = tcp_sk(sk);
struct bictcp *ca = inet_csk_ca(sk);
- if (data_acked)
- measure_delay(sk);
-
- if (!tcp_is_cwnd_limited(sk, in_flight))
+ if (!tcp_is_cwnd_limited(sk))
return;
- if (tp->snd_cwnd <= tp->snd_ssthresh)
- tcp_slow_start(tp);
- else {
- bictcp_update(ca, tp->snd_cwnd);
-
- /* In dangerous area, increase slowly.
- * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
- */
- if (tp->snd_cwnd_cnt >= ca->cnt) {
- if (tp->snd_cwnd < tp->snd_cwnd_clamp)
- tp->snd_cwnd++;
- tp->snd_cwnd_cnt = 0;
- } else
- tp->snd_cwnd_cnt++;
+ if (tcp_in_slow_start(tp)) {
+ acked = tcp_slow_start(tp, acked);
+ if (!acked)
+ return;
}
-
+ bictcp_update(ca, tcp_snd_cwnd(tp), acked);
+ tcp_cong_avoid_ai(tp, ca->cnt, acked);
}
-static u32 bictcp_recalc_ssthresh(struct sock *sk)
+__bpf_kfunc static u32 cubictcp_recalc_ssthresh(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
struct bictcp *ca = inet_csk_ca(sk);
@@ -306,65 +346,173 @@ static u32 bictcp_recalc_ssthresh(struct sock *sk)
ca->epoch_start = 0; /* end of epoch */
/* Wmax and fast convergence */
- if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence)
- ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta))
+ if (tcp_snd_cwnd(tp) < ca->last_max_cwnd && fast_convergence)
+ ca->last_max_cwnd = (tcp_snd_cwnd(tp) * (BICTCP_BETA_SCALE + beta))
/ (2 * BICTCP_BETA_SCALE);
else
- ca->last_max_cwnd = tp->snd_cwnd;
-
- ca->loss_cwnd = tp->snd_cwnd;
+ ca->last_max_cwnd = tcp_snd_cwnd(tp);
- return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U);
+ return max((tcp_snd_cwnd(tp) * beta) / BICTCP_BETA_SCALE, 2U);
}
-static u32 bictcp_undo_cwnd(struct sock *sk)
+__bpf_kfunc static void cubictcp_state(struct sock *sk, u8 new_state)
{
- struct bictcp *ca = inet_csk_ca(sk);
-
- return max(tcp_sk(sk)->snd_cwnd, ca->last_max_cwnd);
+ if (new_state == TCP_CA_Loss) {
+ bictcp_reset(inet_csk_ca(sk));
+ bictcp_hystart_reset(sk);
+ }
}
-static void bictcp_state(struct sock *sk, u8 new_state)
+/* Account for TSO/GRO delays.
+ * Otherwise short RTT flows could get too small ssthresh, since during
+ * slow start we begin with small TSO packets and ca->delay_min would
+ * not account for long aggregation delay when TSO packets get bigger.
+ * Ideally even with a very small RTT we would like to have at least one
+ * TSO packet being sent and received by GRO, and another one in qdisc layer.
+ * We apply another 100% factor because @rate is doubled at this point.
+ * We cap the cushion to 1ms.
+ */
+static u32 hystart_ack_delay(const struct sock *sk)
{
- if (new_state == TCP_CA_Loss)
- bictcp_reset(inet_csk_ca(sk));
+ unsigned long rate;
+
+ rate = READ_ONCE(sk->sk_pacing_rate);
+ if (!rate)
+ return 0;
+ return min_t(u64, USEC_PER_MSEC,
+ div64_ul((u64)sk->sk_gso_max_size * 4 * USEC_PER_SEC, rate));
}
-/* Track delayed acknowledgment ratio using sliding window
- * ratio = (15*ratio + sample) / 16
- */
-static void bictcp_acked(struct sock *sk, u32 cnt)
+static void hystart_update(struct sock *sk, u32 delay)
{
- const struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bictcp *ca = inet_csk_ca(sk);
+ u32 threshold;
- if (cnt > 0 && icsk->icsk_ca_state == TCP_CA_Open) {
- struct bictcp *ca = inet_csk_ca(sk);
- cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT;
- ca->delayed_ack += cnt;
+ if (after(tp->snd_una, ca->end_seq))
+ bictcp_hystart_reset(sk);
+
+ /* hystart triggers when cwnd is larger than some threshold */
+ if (tcp_snd_cwnd(tp) < hystart_low_window)
+ return;
+
+ if (hystart_detect & HYSTART_ACK_TRAIN) {
+ u32 now = bictcp_clock_us(sk);
+
+ /* first detection parameter - ack-train detection */
+ if ((s32)(now - ca->last_ack) <= hystart_ack_delta_us) {
+ ca->last_ack = now;
+
+ threshold = ca->delay_min + hystart_ack_delay(sk);
+
+ /* Hystart ack train triggers if we get ack past
+ * ca->delay_min/2.
+ * Pacing might have delayed packets up to RTT/2
+ * during slow start.
+ */
+ if (sk->sk_pacing_status == SK_PACING_NONE)
+ threshold >>= 1;
+
+ if ((s32)(now - ca->round_start) > threshold) {
+ ca->found = 1;
+ pr_debug("hystart_ack_train (%u > %u) delay_min %u (+ ack_delay %u) cwnd %u\n",
+ now - ca->round_start, threshold,
+ ca->delay_min, hystart_ack_delay(sk), tcp_snd_cwnd(tp));
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPHYSTARTTRAINDETECT);
+ NET_ADD_STATS(sock_net(sk),
+ LINUX_MIB_TCPHYSTARTTRAINCWND,
+ tcp_snd_cwnd(tp));
+ tp->snd_ssthresh = tcp_snd_cwnd(tp);
+ }
+ }
+ }
+
+ if (hystart_detect & HYSTART_DELAY) {
+ /* obtain the minimum delay of more than sampling packets */
+ if (ca->curr_rtt > delay)
+ ca->curr_rtt = delay;
+ if (ca->sample_cnt < HYSTART_MIN_SAMPLES) {
+ ca->sample_cnt++;
+ } else {
+ if (ca->curr_rtt > ca->delay_min +
+ HYSTART_DELAY_THRESH(ca->delay_min >> 3)) {
+ ca->found = 1;
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPHYSTARTDELAYDETECT);
+ NET_ADD_STATS(sock_net(sk),
+ LINUX_MIB_TCPHYSTARTDELAYCWND,
+ tcp_snd_cwnd(tp));
+ tp->snd_ssthresh = tcp_snd_cwnd(tp);
+ }
+ }
}
}
+__bpf_kfunc static void cubictcp_acked(struct sock *sk, const struct ack_sample *sample)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct bictcp *ca = inet_csk_ca(sk);
+ u32 delay;
-static struct tcp_congestion_ops cubictcp = {
- .init = bictcp_init,
- .ssthresh = bictcp_recalc_ssthresh,
- .cong_avoid = bictcp_cong_avoid,
- .set_state = bictcp_state,
- .undo_cwnd = bictcp_undo_cwnd,
- .pkts_acked = bictcp_acked,
+ /* Some calls are for duplicates without timetamps */
+ if (sample->rtt_us < 0)
+ return;
+
+ /* Discard delay samples right after fast recovery */
+ if (ca->epoch_start && (s32)(tcp_jiffies32 - ca->epoch_start) < HZ)
+ return;
+
+ delay = sample->rtt_us;
+ if (delay == 0)
+ delay = 1;
+
+ /* first time call or link delay decreases */
+ if (ca->delay_min == 0 || ca->delay_min > delay)
+ ca->delay_min = delay;
+
+ if (!ca->found && tcp_in_slow_start(tp) && hystart)
+ hystart_update(sk, delay);
+}
+
+static struct tcp_congestion_ops cubictcp __read_mostly = {
+ .init = cubictcp_init,
+ .ssthresh = cubictcp_recalc_ssthresh,
+ .cong_avoid = cubictcp_cong_avoid,
+ .set_state = cubictcp_state,
+ .undo_cwnd = tcp_reno_undo_cwnd,
+ .cwnd_event = cubictcp_cwnd_event,
+ .pkts_acked = cubictcp_acked,
.owner = THIS_MODULE,
.name = "cubic",
};
+BTF_KFUNCS_START(tcp_cubic_check_kfunc_ids)
+BTF_ID_FLAGS(func, cubictcp_init)
+BTF_ID_FLAGS(func, cubictcp_recalc_ssthresh)
+BTF_ID_FLAGS(func, cubictcp_cong_avoid)
+BTF_ID_FLAGS(func, cubictcp_state)
+BTF_ID_FLAGS(func, cubictcp_cwnd_event)
+BTF_ID_FLAGS(func, cubictcp_acked)
+BTF_KFUNCS_END(tcp_cubic_check_kfunc_ids)
+
+static const struct btf_kfunc_id_set tcp_cubic_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &tcp_cubic_check_kfunc_ids,
+};
+
static int __init cubictcp_register(void)
{
+ int ret;
+
BUILD_BUG_ON(sizeof(struct bictcp) > ICSK_CA_PRIV_SIZE);
/* Precompute a bunch of the scaling factors that are used per-packet
* based on SRTT of 100ms
*/
- beta_scale = 8*(BICTCP_BETA_SCALE+beta)/ 3 / (BICTCP_BETA_SCALE - beta);
+ beta_scale = 8*(BICTCP_BETA_SCALE+beta) / 3
+ / (BICTCP_BETA_SCALE - beta);
cube_rtt_scale = (bic_scale * 10); /* 1024*c/rtt */
@@ -387,6 +535,9 @@ static int __init cubictcp_register(void)
/* divide by bic_scale and by constant Srtt (100ms) */
do_div(cube_factor, bic_scale * 10);
+ ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &tcp_cubic_kfunc_set);
+ if (ret < 0)
+ return ret;
return tcp_register_congestion_control(&cubictcp);
}
@@ -401,4 +552,4 @@ module_exit(cubictcp_unregister);
MODULE_AUTHOR("Sangtae Ha, Stephen Hemminger");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("CUBIC TCP");
-MODULE_VERSION("2.0");
+MODULE_VERSION("2.3");
diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
new file mode 100644
index 000000000000..03abe0848420
--- /dev/null
+++ b/net/ipv4/tcp_dctcp.c
@@ -0,0 +1,313 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* DataCenter TCP (DCTCP) congestion control.
+ *
+ * http://simula.stanford.edu/~alizade/Site/DCTCP.html
+ *
+ * This is an implementation of DCTCP over Reno, an enhancement to the
+ * TCP congestion control algorithm designed for data centers. DCTCP
+ * leverages Explicit Congestion Notification (ECN) in the network to
+ * provide multi-bit feedback to the end hosts. DCTCP's goal is to meet
+ * the following three data center transport requirements:
+ *
+ * - High burst tolerance (incast due to partition/aggregate)
+ * - Low latency (short flows, queries)
+ * - High throughput (continuous data updates, large file transfers)
+ * with commodity shallow buffered switches
+ *
+ * The algorithm is described in detail in the following two papers:
+ *
+ * 1) Mohammad Alizadeh, Albert Greenberg, David A. Maltz, Jitendra Padhye,
+ * Parveen Patel, Balaji Prabhakar, Sudipta Sengupta, and Murari Sridharan:
+ * "Data Center TCP (DCTCP)", Data Center Networks session
+ * Proc. ACM SIGCOMM, New Delhi, 2010.
+ * http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf
+ *
+ * 2) Mohammad Alizadeh, Adel Javanmard, and Balaji Prabhakar:
+ * "Analysis of DCTCP: Stability, Convergence, and Fairness"
+ * Proc. ACM SIGMETRICS, San Jose, 2011.
+ * http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp_analysis-full.pdf
+ *
+ * Initial prototype from Abdul Kabbani, Masato Yasuda and Mohammad Alizadeh.
+ *
+ * Authors:
+ *
+ * Daniel Borkmann <dborkman@redhat.com>
+ * Florian Westphal <fw@strlen.de>
+ * Glenn Judd <glenn.judd@morganstanley.com>
+ */
+
+#include <linux/btf.h>
+#include <linux/btf_ids.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <net/tcp.h>
+#include <linux/inet_diag.h>
+#include "tcp_dctcp.h"
+
+#define DCTCP_MAX_ALPHA 1024U
+
+struct dctcp {
+ u32 old_delivered;
+ u32 old_delivered_ce;
+ u32 prior_rcv_nxt;
+ u32 dctcp_alpha;
+ u32 next_seq;
+ u32 ce_state;
+ u32 loss_cwnd;
+ struct tcp_plb_state plb;
+};
+
+static unsigned int dctcp_shift_g __read_mostly = 4; /* g = 1/2^4 */
+
+static int dctcp_shift_g_set(const char *val, const struct kernel_param *kp)
+{
+ return param_set_uint_minmax(val, kp, 0, 10);
+}
+
+static const struct kernel_param_ops dctcp_shift_g_ops = {
+ .set = dctcp_shift_g_set,
+ .get = param_get_uint,
+};
+
+module_param_cb(dctcp_shift_g, &dctcp_shift_g_ops, &dctcp_shift_g, 0644);
+MODULE_PARM_DESC(dctcp_shift_g, "parameter g for updating dctcp_alpha");
+
+static unsigned int dctcp_alpha_on_init __read_mostly = DCTCP_MAX_ALPHA;
+module_param(dctcp_alpha_on_init, uint, 0644);
+MODULE_PARM_DESC(dctcp_alpha_on_init, "parameter for initial alpha value");
+
+static struct tcp_congestion_ops dctcp_reno;
+
+static void dctcp_reset(const struct tcp_sock *tp, struct dctcp *ca)
+{
+ ca->next_seq = tp->snd_nxt;
+
+ ca->old_delivered = tp->delivered;
+ ca->old_delivered_ce = tp->delivered_ce;
+}
+
+__bpf_kfunc static void dctcp_init(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+
+ if (tcp_ecn_mode_any(tp) ||
+ (sk->sk_state == TCP_LISTEN ||
+ sk->sk_state == TCP_CLOSE)) {
+ struct dctcp *ca = inet_csk_ca(sk);
+
+ ca->prior_rcv_nxt = tp->rcv_nxt;
+
+ ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA);
+
+ ca->loss_cwnd = 0;
+ ca->ce_state = 0;
+
+ dctcp_reset(tp, ca);
+ tcp_plb_init(sk, &ca->plb);
+
+ return;
+ }
+
+ /* No ECN support? Fall back to Reno. Also need to clear
+ * ECT from sk since it is set during 3WHS for DCTCP.
+ */
+ inet_csk(sk)->icsk_ca_ops = &dctcp_reno;
+ INET_ECN_dontxmit(sk);
+}
+
+__bpf_kfunc static u32 dctcp_ssthresh(struct sock *sk)
+{
+ struct dctcp *ca = inet_csk_ca(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ ca->loss_cwnd = tcp_snd_cwnd(tp);
+ return max(tcp_snd_cwnd(tp) - ((tcp_snd_cwnd(tp) * ca->dctcp_alpha) >> 11U), 2U);
+}
+
+__bpf_kfunc static void dctcp_update_alpha(struct sock *sk, u32 flags)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct dctcp *ca = inet_csk_ca(sk);
+
+ /* Expired RTT */
+ if (!before(tp->snd_una, ca->next_seq)) {
+ u32 delivered = tp->delivered - ca->old_delivered;
+ u32 delivered_ce = tp->delivered_ce - ca->old_delivered_ce;
+ u32 alpha = ca->dctcp_alpha;
+ u32 ce_ratio = 0;
+
+ if (delivered > 0) {
+ /* dctcp_alpha keeps EWMA of fraction of ECN marked
+ * packets. Because of EWMA smoothing, PLB reaction can
+ * be slow so we use ce_ratio which is an instantaneous
+ * measure of congestion. ce_ratio is the fraction of
+ * ECN marked packets in the previous RTT.
+ */
+ if (delivered_ce > 0)
+ ce_ratio = (delivered_ce << TCP_PLB_SCALE) / delivered;
+ tcp_plb_update_state(sk, &ca->plb, (int)ce_ratio);
+ tcp_plb_check_rehash(sk, &ca->plb);
+ }
+
+ /* alpha = (1 - g) * alpha + g * F */
+
+ alpha -= min_not_zero(alpha, alpha >> dctcp_shift_g);
+ if (delivered_ce) {
+
+ /* If dctcp_shift_g == 1, a 32bit value would overflow
+ * after 8 M packets.
+ */
+ delivered_ce <<= (10 - dctcp_shift_g);
+ delivered_ce /= max(1U, delivered);
+
+ alpha = min(alpha + delivered_ce, DCTCP_MAX_ALPHA);
+ }
+ /* dctcp_alpha can be read from dctcp_get_info() without
+ * synchro, so we ask compiler to not use dctcp_alpha
+ * as a temporary variable in prior operations.
+ */
+ WRITE_ONCE(ca->dctcp_alpha, alpha);
+ dctcp_reset(tp, ca);
+ }
+}
+
+static void dctcp_react_to_loss(struct sock *sk)
+{
+ struct dctcp *ca = inet_csk_ca(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ ca->loss_cwnd = tcp_snd_cwnd(tp);
+ tp->snd_ssthresh = max(tcp_snd_cwnd(tp) >> 1U, 2U);
+}
+
+__bpf_kfunc static void dctcp_state(struct sock *sk, u8 new_state)
+{
+ if (new_state == TCP_CA_Recovery &&
+ new_state != inet_csk(sk)->icsk_ca_state)
+ dctcp_react_to_loss(sk);
+ /* We handle RTO in dctcp_cwnd_event to ensure that we perform only
+ * one loss-adjustment per RTT.
+ */
+}
+
+__bpf_kfunc static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev)
+{
+ struct dctcp *ca = inet_csk_ca(sk);
+
+ switch (ev) {
+ case CA_EVENT_ECN_IS_CE:
+ case CA_EVENT_ECN_NO_CE:
+ dctcp_ece_ack_update(sk, ev, &ca->prior_rcv_nxt, &ca->ce_state);
+ break;
+ case CA_EVENT_LOSS:
+ tcp_plb_update_state_upon_rto(sk, &ca->plb);
+ dctcp_react_to_loss(sk);
+ break;
+ case CA_EVENT_TX_START:
+ tcp_plb_check_rehash(sk, &ca->plb); /* Maybe rehash when inflight is 0 */
+ break;
+ default:
+ /* Don't care for the rest. */
+ break;
+ }
+}
+
+static size_t dctcp_get_info(struct sock *sk, u32 ext, int *attr,
+ union tcp_cc_info *info)
+{
+ const struct dctcp *ca = inet_csk_ca(sk);
+ const struct tcp_sock *tp = tcp_sk(sk);
+
+ /* Fill it also in case of VEGASINFO due to req struct limits.
+ * We can still correctly retrieve it later.
+ */
+ if (ext & (1 << (INET_DIAG_DCTCPINFO - 1)) ||
+ ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
+ memset(&info->dctcp, 0, sizeof(info->dctcp));
+ if (inet_csk(sk)->icsk_ca_ops != &dctcp_reno) {
+ info->dctcp.dctcp_enabled = 1;
+ info->dctcp.dctcp_ce_state = (u16) ca->ce_state;
+ info->dctcp.dctcp_alpha = ca->dctcp_alpha;
+ info->dctcp.dctcp_ab_ecn = tp->mss_cache *
+ (tp->delivered_ce - ca->old_delivered_ce);
+ info->dctcp.dctcp_ab_tot = tp->mss_cache *
+ (tp->delivered - ca->old_delivered);
+ }
+
+ *attr = INET_DIAG_DCTCPINFO;
+ return sizeof(info->dctcp);
+ }
+ return 0;
+}
+
+__bpf_kfunc static u32 dctcp_cwnd_undo(struct sock *sk)
+{
+ const struct dctcp *ca = inet_csk_ca(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ return max(tcp_snd_cwnd(tp), ca->loss_cwnd);
+}
+
+static struct tcp_congestion_ops dctcp __read_mostly = {
+ .init = dctcp_init,
+ .in_ack_event = dctcp_update_alpha,
+ .cwnd_event = dctcp_cwnd_event,
+ .ssthresh = dctcp_ssthresh,
+ .cong_avoid = tcp_reno_cong_avoid,
+ .undo_cwnd = dctcp_cwnd_undo,
+ .set_state = dctcp_state,
+ .get_info = dctcp_get_info,
+ .flags = TCP_CONG_NEEDS_ECN,
+ .owner = THIS_MODULE,
+ .name = "dctcp",
+};
+
+static struct tcp_congestion_ops dctcp_reno __read_mostly = {
+ .ssthresh = tcp_reno_ssthresh,
+ .cong_avoid = tcp_reno_cong_avoid,
+ .undo_cwnd = tcp_reno_undo_cwnd,
+ .get_info = dctcp_get_info,
+ .owner = THIS_MODULE,
+ .name = "dctcp-reno",
+};
+
+BTF_KFUNCS_START(tcp_dctcp_check_kfunc_ids)
+BTF_ID_FLAGS(func, dctcp_init)
+BTF_ID_FLAGS(func, dctcp_update_alpha)
+BTF_ID_FLAGS(func, dctcp_cwnd_event)
+BTF_ID_FLAGS(func, dctcp_ssthresh)
+BTF_ID_FLAGS(func, dctcp_cwnd_undo)
+BTF_ID_FLAGS(func, dctcp_state)
+BTF_KFUNCS_END(tcp_dctcp_check_kfunc_ids)
+
+static const struct btf_kfunc_id_set tcp_dctcp_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &tcp_dctcp_check_kfunc_ids,
+};
+
+static int __init dctcp_register(void)
+{
+ int ret;
+
+ BUILD_BUG_ON(sizeof(struct dctcp) > ICSK_CA_PRIV_SIZE);
+
+ ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &tcp_dctcp_kfunc_set);
+ if (ret < 0)
+ return ret;
+ return tcp_register_congestion_control(&dctcp);
+}
+
+static void __exit dctcp_unregister(void)
+{
+ tcp_unregister_congestion_control(&dctcp);
+}
+
+module_init(dctcp_register);
+module_exit(dctcp_unregister);
+
+MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>");
+MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
+MODULE_AUTHOR("Glenn Judd <glenn.judd@morganstanley.com>");
+
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("DataCenter TCP (DCTCP)");
diff --git a/net/ipv4/tcp_dctcp.h b/net/ipv4/tcp_dctcp.h
new file mode 100644
index 000000000000..4b0259111d81
--- /dev/null
+++ b/net/ipv4/tcp_dctcp.h
@@ -0,0 +1,40 @@
+#ifndef _TCP_DCTCP_H
+#define _TCP_DCTCP_H
+
+static inline void dctcp_ece_ack_cwr(struct sock *sk, u32 ce_state)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (ce_state == 1)
+ tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
+ else
+ tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
+}
+
+/* Minimal DCTP CE state machine:
+ *
+ * S: 0 <- last pkt was non-CE
+ * 1 <- last pkt was CE
+ */
+static inline void dctcp_ece_ack_update(struct sock *sk, enum tcp_ca_event evt,
+ u32 *prior_rcv_nxt, u32 *ce_state)
+{
+ u32 new_ce_state = (evt == CA_EVENT_ECN_IS_CE) ? 1 : 0;
+
+ if (*ce_state != new_ce_state) {
+ /* CE state has changed, force an immediate ACK to
+ * reflect the new CE state. If an ACK was delayed,
+ * send that first to reflect the prior CE state.
+ */
+ if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) {
+ dctcp_ece_ack_cwr(sk, *ce_state);
+ __tcp_send_ack(sk, *prior_rcv_nxt, 0);
+ }
+ inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
+ }
+ *prior_rcv_nxt = tcp_sk(sk)->rcv_nxt;
+ *ce_state = new_ce_state;
+ dctcp_ece_ack_cwr(sk, new_ce_state);
+}
+
+#endif
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index 57c5f0b10e6c..d83efd91f461 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -1,44 +1,674 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* tcp_diag.c Module for monitoring TCP transport protocols sockets.
*
- * Version: $Id: tcp_diag.c,v 1.3 2002/02/01 22:01:04 davem Exp $
- *
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
-
#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/sock_diag.h>
#include <linux/inet_diag.h>
#include <linux/tcp.h>
+#include <net/inet_hashtables.h>
+#include <net/inet6_hashtables.h>
+#include <net/inet_timewait_sock.h>
+#include <net/netlink.h>
#include <net/tcp.h>
static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
void *_info)
{
- const struct tcp_sock *tp = tcp_sk(sk);
struct tcp_info *info = _info;
- if (sk->sk_state == TCP_LISTEN)
- r->idiag_rqueue = sk->sk_ack_backlog;
- else
- r->idiag_rqueue = tp->rcv_nxt - tp->copied_seq;
- r->idiag_wqueue = tp->write_seq - tp->snd_una;
- if (info != NULL)
+ if (inet_sk_state_load(sk) == TCP_LISTEN) {
+ r->idiag_rqueue = READ_ONCE(sk->sk_ack_backlog);
+ r->idiag_wqueue = READ_ONCE(sk->sk_max_ack_backlog);
+ } else if (sk->sk_type == SOCK_STREAM) {
+ const struct tcp_sock *tp = tcp_sk(sk);
+
+ r->idiag_rqueue = max_t(int, READ_ONCE(tp->rcv_nxt) -
+ READ_ONCE(tp->copied_seq), 0);
+ r->idiag_wqueue = READ_ONCE(tp->write_seq) - tp->snd_una;
+ }
+ if (info)
tcp_get_info(sk, info);
}
-static struct inet_diag_handler tcp_diag_handler = {
- .idiag_hashinfo = &tcp_hashinfo,
- .idiag_get_info = tcp_diag_get_info,
- .idiag_type = TCPDIAG_GETSOCK,
- .idiag_info_size = sizeof(struct tcp_info),
+#ifdef CONFIG_TCP_MD5SIG
+static void tcp_diag_md5sig_fill(struct tcp_diag_md5sig *info,
+ const struct tcp_md5sig_key *key)
+{
+ info->tcpm_family = key->family;
+ info->tcpm_prefixlen = key->prefixlen;
+ info->tcpm_keylen = key->keylen;
+ memcpy(info->tcpm_key, key->key, key->keylen);
+
+ if (key->family == AF_INET)
+ info->tcpm_addr[0] = key->addr.a4.s_addr;
+ #if IS_ENABLED(CONFIG_IPV6)
+ else if (key->family == AF_INET6)
+ memcpy(&info->tcpm_addr, &key->addr.a6,
+ sizeof(info->tcpm_addr));
+ #endif
+}
+
+static int tcp_diag_put_md5sig(struct sk_buff *skb,
+ const struct tcp_md5sig_info *md5sig)
+{
+ const struct tcp_md5sig_key *key;
+ struct tcp_diag_md5sig *info;
+ struct nlattr *attr;
+ int md5sig_count = 0;
+
+ hlist_for_each_entry_rcu(key, &md5sig->head, node)
+ md5sig_count++;
+ if (md5sig_count == 0)
+ return 0;
+
+ attr = nla_reserve(skb, INET_DIAG_MD5SIG,
+ md5sig_count * sizeof(struct tcp_diag_md5sig));
+ if (!attr)
+ return -EMSGSIZE;
+
+ info = nla_data(attr);
+ memset(info, 0, md5sig_count * sizeof(struct tcp_diag_md5sig));
+ hlist_for_each_entry_rcu(key, &md5sig->head, node) {
+ tcp_diag_md5sig_fill(info++, key);
+ if (--md5sig_count == 0)
+ break;
+ }
+
+ return 0;
+}
+#endif
+
+static int tcp_diag_put_ulp(struct sk_buff *skb, struct sock *sk,
+ const struct tcp_ulp_ops *ulp_ops, bool net_admin)
+{
+ struct nlattr *nest;
+ int err;
+
+ nest = nla_nest_start_noflag(skb, INET_DIAG_ULP_INFO);
+ if (!nest)
+ return -EMSGSIZE;
+
+ err = nla_put_string(skb, INET_ULP_INFO_NAME, ulp_ops->name);
+ if (err)
+ goto nla_failure;
+
+ if (ulp_ops->get_info)
+ err = ulp_ops->get_info(sk, skb, net_admin);
+ if (err)
+ goto nla_failure;
+
+ nla_nest_end(skb, nest);
+ return 0;
+
+nla_failure:
+ nla_nest_cancel(skb, nest);
+ return err;
+}
+
+static int tcp_diag_get_aux(struct sock *sk, bool net_admin,
+ struct sk_buff *skb)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ const struct tcp_ulp_ops *ulp_ops;
+ int err = 0;
+
+#ifdef CONFIG_TCP_MD5SIG
+ if (net_admin) {
+ struct tcp_md5sig_info *md5sig;
+
+ rcu_read_lock();
+ md5sig = rcu_dereference(tcp_sk(sk)->md5sig_info);
+ if (md5sig)
+ err = tcp_diag_put_md5sig(skb, md5sig);
+ rcu_read_unlock();
+ if (err < 0)
+ return err;
+ }
+#endif
+
+ ulp_ops = icsk->icsk_ulp_ops;
+ if (ulp_ops) {
+ err = tcp_diag_put_ulp(skb, sk, ulp_ops, net_admin);
+ if (err < 0)
+ return err;
+ }
+
+ return 0;
+}
+
+static size_t tcp_diag_get_aux_size(struct sock *sk, bool net_admin)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ size_t size = 0;
+
+#ifdef CONFIG_TCP_MD5SIG
+ if (net_admin && sk_fullsock(sk)) {
+ const struct tcp_md5sig_info *md5sig;
+ const struct tcp_md5sig_key *key;
+ size_t md5sig_count = 0;
+
+ rcu_read_lock();
+ md5sig = rcu_dereference(tcp_sk(sk)->md5sig_info);
+ if (md5sig) {
+ hlist_for_each_entry_rcu(key, &md5sig->head, node)
+ md5sig_count++;
+ }
+ rcu_read_unlock();
+ size += nla_total_size(md5sig_count *
+ sizeof(struct tcp_diag_md5sig));
+ }
+#endif
+
+ if (sk_fullsock(sk)) {
+ const struct tcp_ulp_ops *ulp_ops;
+
+ ulp_ops = icsk->icsk_ulp_ops;
+ if (ulp_ops) {
+ size += nla_total_size(0) +
+ nla_total_size(TCP_ULP_NAME_MAX);
+ if (ulp_ops->get_info_size)
+ size += ulp_ops->get_info_size(sk, net_admin);
+ }
+ }
+
+ return size
+ + nla_total_size(sizeof(struct tcp_info))
+ + nla_total_size(sizeof(struct inet_diag_msg))
+ + inet_diag_msg_attrs_size()
+ + nla_total_size(sizeof(struct inet_diag_meminfo))
+ + nla_total_size(SK_MEMINFO_VARS * sizeof(u32))
+ + nla_total_size(TCP_CA_NAME_MAX)
+ + nla_total_size(sizeof(struct tcpvegas_info))
+ + 64;
+}
+
+static int tcp_twsk_diag_fill(struct sock *sk,
+ struct sk_buff *skb,
+ struct netlink_callback *cb,
+ u16 nlmsg_flags, bool net_admin)
+{
+ struct inet_timewait_sock *tw = inet_twsk(sk);
+ struct inet_diag_msg *r;
+ struct nlmsghdr *nlh;
+ long tmo;
+
+ nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, cb->nlh->nlmsg_type,
+ sizeof(*r), nlmsg_flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ r = nlmsg_data(nlh);
+ DEBUG_NET_WARN_ON_ONCE(tw->tw_state != TCP_TIME_WAIT);
+
+ inet_diag_msg_common_fill(r, sk);
+ r->idiag_retrans = 0;
+
+ r->idiag_state = READ_ONCE(tw->tw_substate);
+ r->idiag_timer = 3;
+ tmo = tw->tw_timer.expires - jiffies;
+ r->idiag_expires = jiffies_delta_to_msecs(tmo);
+ r->idiag_rqueue = 0;
+ r->idiag_wqueue = 0;
+ r->idiag_uid = 0;
+ r->idiag_inode = 0;
+
+ if (net_admin && nla_put_u32(skb, INET_DIAG_MARK,
+ tw->tw_mark)) {
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+ }
+
+ nlmsg_end(skb, nlh);
+ return 0;
+}
+
+static int tcp_req_diag_fill(struct sock *sk, struct sk_buff *skb,
+ struct netlink_callback *cb,
+ u16 nlmsg_flags, bool net_admin)
+{
+ struct request_sock *reqsk = inet_reqsk(sk);
+ struct inet_diag_msg *r;
+ struct nlmsghdr *nlh;
+ long tmo;
+
+ nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ cb->nlh->nlmsg_type, sizeof(*r), nlmsg_flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ r = nlmsg_data(nlh);
+ inet_diag_msg_common_fill(r, sk);
+ r->idiag_state = TCP_SYN_RECV;
+ r->idiag_timer = 1;
+ r->idiag_retrans = READ_ONCE(reqsk->num_retrans);
+
+ BUILD_BUG_ON(offsetof(struct inet_request_sock, ir_cookie) !=
+ offsetof(struct sock, sk_cookie));
+
+ tmo = READ_ONCE(inet_reqsk(sk)->rsk_timer.expires) - jiffies;
+ r->idiag_expires = jiffies_delta_to_msecs(tmo);
+ r->idiag_rqueue = 0;
+ r->idiag_wqueue = 0;
+ r->idiag_uid = 0;
+ r->idiag_inode = 0;
+
+ if (net_admin && nla_put_u32(skb, INET_DIAG_MARK,
+ inet_rsk(reqsk)->ir_mark)) {
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+ }
+
+ nlmsg_end(skb, nlh);
+ return 0;
+}
+
+static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
+ struct netlink_callback *cb,
+ const struct inet_diag_req_v2 *r,
+ u16 nlmsg_flags, bool net_admin)
+{
+ if (sk->sk_state == TCP_TIME_WAIT)
+ return tcp_twsk_diag_fill(sk, skb, cb, nlmsg_flags, net_admin);
+
+ if (sk->sk_state == TCP_NEW_SYN_RECV)
+ return tcp_req_diag_fill(sk, skb, cb, nlmsg_flags, net_admin);
+
+ return inet_sk_diag_fill(sk, inet_csk(sk), skb, cb, r, nlmsg_flags,
+ net_admin);
+}
+
+static void twsk_build_assert(void)
+{
+ BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_family) !=
+ offsetof(struct sock, sk_family));
+
+ BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_num) !=
+ offsetof(struct inet_sock, inet_num));
+
+ BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_dport) !=
+ offsetof(struct inet_sock, inet_dport));
+
+ BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_rcv_saddr) !=
+ offsetof(struct inet_sock, inet_rcv_saddr));
+
+ BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_daddr) !=
+ offsetof(struct inet_sock, inet_daddr));
+
+#if IS_ENABLED(CONFIG_IPV6)
+ BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_v6_rcv_saddr) !=
+ offsetof(struct sock, sk_v6_rcv_saddr));
+
+ BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_v6_daddr) !=
+ offsetof(struct sock, sk_v6_daddr));
+#endif
+}
+
+static void tcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
+ const struct inet_diag_req_v2 *r)
+{
+ bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
+ struct inet_diag_dump_data *cb_data = cb->data;
+ struct net *net = sock_net(skb->sk);
+ u32 idiag_states = r->idiag_states;
+ struct inet_hashinfo *hashinfo;
+ int i, num, s_i, s_num;
+ struct sock *sk;
+
+ hashinfo = net->ipv4.tcp_death_row.hashinfo;
+ if (idiag_states & TCPF_SYN_RECV)
+ idiag_states |= TCPF_NEW_SYN_RECV;
+ s_i = cb->args[1];
+ s_num = num = cb->args[2];
+
+ if (cb->args[0] == 0) {
+ if (!(idiag_states & TCPF_LISTEN) || r->id.idiag_dport)
+ goto skip_listen_ht;
+
+ for (i = s_i; i <= hashinfo->lhash2_mask; i++) {
+ struct inet_listen_hashbucket *ilb;
+ struct hlist_nulls_node *node;
+
+ num = 0;
+ ilb = &hashinfo->lhash2[i];
+
+ if (hlist_nulls_empty(&ilb->nulls_head)) {
+ s_num = 0;
+ continue;
+ }
+ spin_lock(&ilb->lock);
+ sk_nulls_for_each(sk, node, &ilb->nulls_head) {
+ struct inet_sock *inet = inet_sk(sk);
+
+ if (!net_eq(sock_net(sk), net))
+ continue;
+
+ if (num < s_num) {
+ num++;
+ continue;
+ }
+
+ if (r->sdiag_family != AF_UNSPEC &&
+ sk->sk_family != r->sdiag_family)
+ goto next_listen;
+
+ if (r->id.idiag_sport != inet->inet_sport &&
+ r->id.idiag_sport)
+ goto next_listen;
+
+ if (!inet_diag_bc_sk(cb_data, sk))
+ goto next_listen;
+
+ if (inet_sk_diag_fill(sk, inet_csk(sk), skb,
+ cb, r, NLM_F_MULTI,
+ net_admin) < 0) {
+ spin_unlock(&ilb->lock);
+ goto done;
+ }
+
+next_listen:
+ ++num;
+ }
+ spin_unlock(&ilb->lock);
+
+ s_num = 0;
+ }
+skip_listen_ht:
+ cb->args[0] = 1;
+ s_i = num = s_num = 0;
+ }
+
+/* Process a maximum of SKARR_SZ sockets at a time when walking hash buckets
+ * with bh disabled.
+ */
+#define SKARR_SZ 16
+
+ /* Dump bound but inactive (not listening, connecting, etc.) sockets */
+ if (cb->args[0] == 1) {
+ if (!(idiag_states & TCPF_BOUND_INACTIVE))
+ goto skip_bind_ht;
+
+ for (i = s_i; i < hashinfo->bhash_size; i++) {
+ struct inet_bind_hashbucket *ibb;
+ struct inet_bind2_bucket *tb2;
+ struct sock *sk_arr[SKARR_SZ];
+ int num_arr[SKARR_SZ];
+ int idx, accum, res;
+
+resume_bind_walk:
+ num = 0;
+ accum = 0;
+ ibb = &hashinfo->bhash2[i];
+
+ if (hlist_empty(&ibb->chain)) {
+ s_num = 0;
+ continue;
+ }
+ spin_lock_bh(&ibb->lock);
+ inet_bind_bucket_for_each(tb2, &ibb->chain) {
+ if (!net_eq(ib2_net(tb2), net))
+ continue;
+
+ sk_for_each_bound(sk, &tb2->owners) {
+ struct inet_sock *inet = inet_sk(sk);
+
+ if (num < s_num)
+ goto next_bind;
+
+ if (sk->sk_state != TCP_CLOSE ||
+ !inet->inet_num)
+ goto next_bind;
+
+ if (r->sdiag_family != AF_UNSPEC &&
+ r->sdiag_family != sk->sk_family)
+ goto next_bind;
+
+ if (!inet_diag_bc_sk(cb_data, sk))
+ goto next_bind;
+
+ sock_hold(sk);
+ num_arr[accum] = num;
+ sk_arr[accum] = sk;
+ if (++accum == SKARR_SZ)
+ goto pause_bind_walk;
+next_bind:
+ num++;
+ }
+ }
+pause_bind_walk:
+ spin_unlock_bh(&ibb->lock);
+
+ res = 0;
+ for (idx = 0; idx < accum; idx++) {
+ if (res >= 0) {
+ res = inet_sk_diag_fill(sk_arr[idx],
+ NULL, skb, cb,
+ r, NLM_F_MULTI,
+ net_admin);
+ if (res < 0)
+ num = num_arr[idx];
+ }
+ sock_put(sk_arr[idx]);
+ }
+ if (res < 0)
+ goto done;
+
+ cond_resched();
+
+ if (accum == SKARR_SZ) {
+ s_num = num + 1;
+ goto resume_bind_walk;
+ }
+
+ s_num = 0;
+ }
+skip_bind_ht:
+ cb->args[0] = 2;
+ s_i = num = s_num = 0;
+ }
+
+ if (!(idiag_states & ~TCPF_LISTEN))
+ goto out;
+
+ for (i = s_i; i <= hashinfo->ehash_mask; i++) {
+ struct inet_ehash_bucket *head = &hashinfo->ehash[i];
+ spinlock_t *lock = inet_ehash_lockp(hashinfo, i);
+ struct hlist_nulls_node *node;
+ struct sock *sk_arr[SKARR_SZ];
+ int num_arr[SKARR_SZ];
+ int idx, accum, res;
+
+ if (hlist_nulls_empty(&head->chain))
+ continue;
+
+ if (i > s_i)
+ s_num = 0;
+
+next_chunk:
+ num = 0;
+ accum = 0;
+ spin_lock_bh(lock);
+ sk_nulls_for_each(sk, node, &head->chain) {
+ int state;
+
+ if (!net_eq(sock_net(sk), net))
+ continue;
+ if (num < s_num)
+ goto next_normal;
+ state = (sk->sk_state == TCP_TIME_WAIT) ?
+ READ_ONCE(inet_twsk(sk)->tw_substate) : sk->sk_state;
+ if (!(idiag_states & (1 << state)))
+ goto next_normal;
+ if (r->sdiag_family != AF_UNSPEC &&
+ sk->sk_family != r->sdiag_family)
+ goto next_normal;
+ if (r->id.idiag_sport != htons(sk->sk_num) &&
+ r->id.idiag_sport)
+ goto next_normal;
+ if (r->id.idiag_dport != sk->sk_dport &&
+ r->id.idiag_dport)
+ goto next_normal;
+ twsk_build_assert();
+
+ if (!inet_diag_bc_sk(cb_data, sk))
+ goto next_normal;
+
+ if (!refcount_inc_not_zero(&sk->sk_refcnt))
+ goto next_normal;
+
+ num_arr[accum] = num;
+ sk_arr[accum] = sk;
+ if (++accum == SKARR_SZ)
+ break;
+next_normal:
+ ++num;
+ }
+ spin_unlock_bh(lock);
+
+ res = 0;
+ for (idx = 0; idx < accum; idx++) {
+ if (res >= 0) {
+ res = sk_diag_fill(sk_arr[idx], skb, cb, r,
+ NLM_F_MULTI, net_admin);
+ if (res < 0)
+ num = num_arr[idx];
+ }
+ sock_gen_put(sk_arr[idx]);
+ }
+ if (res < 0)
+ break;
+
+ cond_resched();
+
+ if (accum == SKARR_SZ) {
+ s_num = num + 1;
+ goto next_chunk;
+ }
+ }
+
+done:
+ cb->args[1] = i;
+ cb->args[2] = num;
+out:
+ ;
+}
+
+static struct sock *tcp_diag_find_one_icsk(struct net *net,
+ const struct inet_diag_req_v2 *req)
+{
+ struct sock *sk;
+
+ rcu_read_lock();
+ if (req->sdiag_family == AF_INET) {
+ sk = inet_lookup(net, NULL, 0, req->id.idiag_dst[0],
+ req->id.idiag_dport, req->id.idiag_src[0],
+ req->id.idiag_sport, req->id.idiag_if);
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (req->sdiag_family == AF_INET6) {
+ if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) &&
+ ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_src))
+ sk = inet_lookup(net, NULL, 0, req->id.idiag_dst[3],
+ req->id.idiag_dport, req->id.idiag_src[3],
+ req->id.idiag_sport, req->id.idiag_if);
+ else
+ sk = inet6_lookup(net, NULL, 0,
+ (struct in6_addr *)req->id.idiag_dst,
+ req->id.idiag_dport,
+ (struct in6_addr *)req->id.idiag_src,
+ req->id.idiag_sport,
+ req->id.idiag_if);
+#endif
+ } else {
+ rcu_read_unlock();
+ return ERR_PTR(-EINVAL);
+ }
+ rcu_read_unlock();
+ if (!sk)
+ return ERR_PTR(-ENOENT);
+
+ if (sock_diag_check_cookie(sk, req->id.idiag_cookie)) {
+ sock_gen_put(sk);
+ return ERR_PTR(-ENOENT);
+ }
+
+ return sk;
+}
+
+static int tcp_diag_dump_one(struct netlink_callback *cb,
+ const struct inet_diag_req_v2 *req)
+{
+ struct sk_buff *in_skb = cb->skb;
+ struct sk_buff *rep;
+ struct sock *sk;
+ struct net *net;
+ bool net_admin;
+ int err;
+
+ net = sock_net(in_skb->sk);
+ sk = tcp_diag_find_one_icsk(net, req);
+ if (IS_ERR(sk))
+ return PTR_ERR(sk);
+
+ net_admin = netlink_net_capable(in_skb, CAP_NET_ADMIN);
+ rep = nlmsg_new(tcp_diag_get_aux_size(sk, net_admin), GFP_KERNEL);
+ if (!rep) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = sk_diag_fill(sk, rep, cb, req, 0, net_admin);
+ if (err < 0) {
+ WARN_ON(err == -EMSGSIZE);
+ nlmsg_free(rep);
+ goto out;
+ }
+ err = nlmsg_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid);
+
+out:
+ if (sk)
+ sock_gen_put(sk);
+
+ return err;
+}
+
+#ifdef CONFIG_INET_DIAG_DESTROY
+static int tcp_diag_destroy(struct sk_buff *in_skb,
+ const struct inet_diag_req_v2 *req)
+{
+ struct net *net = sock_net(in_skb->sk);
+ struct sock *sk;
+ int err;
+
+ sk = tcp_diag_find_one_icsk(net, req);
+ if (IS_ERR(sk))
+ return PTR_ERR(sk);
+
+ err = sock_diag_destroy(sk, ECONNABORTED);
+
+ sock_gen_put(sk);
+
+ return err;
+}
+#endif
+
+static const struct inet_diag_handler tcp_diag_handler = {
+ .owner = THIS_MODULE,
+ .dump = tcp_diag_dump,
+ .dump_one = tcp_diag_dump_one,
+ .idiag_get_info = tcp_diag_get_info,
+ .idiag_get_aux = tcp_diag_get_aux,
+ .idiag_type = IPPROTO_TCP,
+ .idiag_info_size = sizeof(struct tcp_info),
+#ifdef CONFIG_INET_DIAG_DESTROY
+ .destroy = tcp_diag_destroy,
+#endif
};
static int __init tcp_diag_init(void)
@@ -54,3 +684,5 @@ static void __exit tcp_diag_exit(void)
module_init(tcp_diag_init);
module_exit(tcp_diag_exit);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("TCP socket monitoring via SOCK_DIAG");
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-6 /* AF_INET - IPPROTO_TCP */);
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
new file mode 100644
index 000000000000..7d945a527daf
--- /dev/null
+++ b/net/ipv4/tcp_fastopen.c
@@ -0,0 +1,603 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <linux/tcp.h>
+#include <linux/rcupdate.h>
+#include <net/tcp.h>
+#include <net/busy_poll.h>
+
+void tcp_fastopen_init_key_once(struct net *net)
+{
+ u8 key[TCP_FASTOPEN_KEY_LENGTH];
+ struct tcp_fastopen_context *ctxt;
+
+ rcu_read_lock();
+ ctxt = rcu_dereference(net->ipv4.tcp_fastopen_ctx);
+ if (ctxt) {
+ rcu_read_unlock();
+ return;
+ }
+ rcu_read_unlock();
+
+ /* tcp_fastopen_reset_cipher publishes the new context
+ * atomically, so we allow this race happening here.
+ *
+ * All call sites of tcp_fastopen_cookie_gen also check
+ * for a valid cookie, so this is an acceptable risk.
+ */
+ get_random_bytes(key, sizeof(key));
+ tcp_fastopen_reset_cipher(net, NULL, key, NULL);
+}
+
+static void tcp_fastopen_ctx_free(struct rcu_head *head)
+{
+ struct tcp_fastopen_context *ctx =
+ container_of(head, struct tcp_fastopen_context, rcu);
+
+ kfree_sensitive(ctx);
+}
+
+void tcp_fastopen_destroy_cipher(struct sock *sk)
+{
+ struct tcp_fastopen_context *ctx;
+
+ ctx = rcu_dereference_protected(
+ inet_csk(sk)->icsk_accept_queue.fastopenq.ctx, 1);
+ if (ctx)
+ call_rcu(&ctx->rcu, tcp_fastopen_ctx_free);
+}
+
+void tcp_fastopen_ctx_destroy(struct net *net)
+{
+ struct tcp_fastopen_context *ctxt;
+
+ ctxt = unrcu_pointer(xchg(&net->ipv4.tcp_fastopen_ctx, NULL));
+
+ if (ctxt)
+ call_rcu(&ctxt->rcu, tcp_fastopen_ctx_free);
+}
+
+int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk,
+ void *primary_key, void *backup_key)
+{
+ struct tcp_fastopen_context *ctx, *octx;
+ struct fastopen_queue *q;
+ int err = 0;
+
+ ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ ctx->key[0].key[0] = get_unaligned_le64(primary_key);
+ ctx->key[0].key[1] = get_unaligned_le64(primary_key + 8);
+ if (backup_key) {
+ ctx->key[1].key[0] = get_unaligned_le64(backup_key);
+ ctx->key[1].key[1] = get_unaligned_le64(backup_key + 8);
+ ctx->num = 2;
+ } else {
+ ctx->num = 1;
+ }
+
+ if (sk) {
+ q = &inet_csk(sk)->icsk_accept_queue.fastopenq;
+ octx = unrcu_pointer(xchg(&q->ctx, RCU_INITIALIZER(ctx)));
+ } else {
+ octx = unrcu_pointer(xchg(&net->ipv4.tcp_fastopen_ctx,
+ RCU_INITIALIZER(ctx)));
+ }
+
+ if (octx)
+ call_rcu(&octx->rcu, tcp_fastopen_ctx_free);
+out:
+ return err;
+}
+
+int tcp_fastopen_get_cipher(struct net *net, struct inet_connection_sock *icsk,
+ u64 *key)
+{
+ struct tcp_fastopen_context *ctx;
+ int n_keys = 0, i;
+
+ rcu_read_lock();
+ if (icsk)
+ ctx = rcu_dereference(icsk->icsk_accept_queue.fastopenq.ctx);
+ else
+ ctx = rcu_dereference(net->ipv4.tcp_fastopen_ctx);
+ if (ctx) {
+ n_keys = tcp_fastopen_context_len(ctx);
+ for (i = 0; i < n_keys; i++) {
+ put_unaligned_le64(ctx->key[i].key[0], key + (i * 2));
+ put_unaligned_le64(ctx->key[i].key[1], key + (i * 2) + 1);
+ }
+ }
+ rcu_read_unlock();
+
+ return n_keys;
+}
+
+static bool __tcp_fastopen_cookie_gen_cipher(struct request_sock *req,
+ struct sk_buff *syn,
+ const siphash_key_t *key,
+ struct tcp_fastopen_cookie *foc)
+{
+ BUILD_BUG_ON(TCP_FASTOPEN_COOKIE_SIZE != sizeof(u64));
+
+ if (req->rsk_ops->family == AF_INET) {
+ const struct iphdr *iph = ip_hdr(syn);
+
+ foc->val[0] = cpu_to_le64(siphash(&iph->saddr,
+ sizeof(iph->saddr) +
+ sizeof(iph->daddr),
+ key));
+ foc->len = TCP_FASTOPEN_COOKIE_SIZE;
+ return true;
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ if (req->rsk_ops->family == AF_INET6) {
+ const struct ipv6hdr *ip6h = ipv6_hdr(syn);
+
+ foc->val[0] = cpu_to_le64(siphash(&ip6h->saddr,
+ sizeof(ip6h->saddr) +
+ sizeof(ip6h->daddr),
+ key));
+ foc->len = TCP_FASTOPEN_COOKIE_SIZE;
+ return true;
+ }
+#endif
+ return false;
+}
+
+/* Generate the fastopen cookie by applying SipHash to both the source and
+ * destination addresses.
+ */
+static void tcp_fastopen_cookie_gen(struct sock *sk,
+ struct request_sock *req,
+ struct sk_buff *syn,
+ struct tcp_fastopen_cookie *foc)
+{
+ struct tcp_fastopen_context *ctx;
+
+ rcu_read_lock();
+ ctx = tcp_fastopen_get_ctx(sk);
+ if (ctx)
+ __tcp_fastopen_cookie_gen_cipher(req, syn, &ctx->key[0], foc);
+ rcu_read_unlock();
+}
+
+/* If an incoming SYN or SYNACK frame contains a payload and/or FIN,
+ * queue this additional data / FIN.
+ */
+void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (TCP_SKB_CB(skb)->end_seq == tp->rcv_nxt)
+ return;
+
+ skb = skb_clone(skb, GFP_ATOMIC);
+ if (!skb)
+ return;
+
+ tcp_cleanup_skb(skb);
+ /* segs_in has been initialized to 1 in tcp_create_openreq_child().
+ * Hence, reset segs_in to 0 before calling tcp_segs_in()
+ * to avoid double counting. Also, tcp_segs_in() expects
+ * skb->len to include the tcp_hdrlen. Hence, it should
+ * be called before __skb_pull().
+ */
+ tp->segs_in = 0;
+ tcp_segs_in(tp, skb);
+ __skb_pull(skb, tcp_hdrlen(skb));
+ sk_forced_mem_schedule(sk, skb->truesize);
+ skb_set_owner_r(skb, sk);
+
+ TCP_SKB_CB(skb)->seq++;
+ TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_SYN;
+
+ tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+ tcp_add_receive_queue(sk, skb);
+ tp->syn_data_acked = 1;
+
+ /* u64_stats_update_begin(&tp->syncp) not needed here,
+ * as we certainly are not changing upper 32bit value (0)
+ */
+ tp->bytes_received = skb->len;
+
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
+ tcp_fin(sk);
+}
+
+/* returns 0 - no key match, 1 for primary, 2 for backup */
+static int tcp_fastopen_cookie_gen_check(struct sock *sk,
+ struct request_sock *req,
+ struct sk_buff *syn,
+ struct tcp_fastopen_cookie *orig,
+ struct tcp_fastopen_cookie *valid_foc)
+{
+ struct tcp_fastopen_cookie search_foc = { .len = -1 };
+ struct tcp_fastopen_cookie *foc = valid_foc;
+ struct tcp_fastopen_context *ctx;
+ int i, ret = 0;
+
+ rcu_read_lock();
+ ctx = tcp_fastopen_get_ctx(sk);
+ if (!ctx)
+ goto out;
+ for (i = 0; i < tcp_fastopen_context_len(ctx); i++) {
+ __tcp_fastopen_cookie_gen_cipher(req, syn, &ctx->key[i], foc);
+ if (tcp_fastopen_cookie_match(foc, orig)) {
+ ret = i + 1;
+ goto out;
+ }
+ foc = &search_foc;
+ }
+out:
+ rcu_read_unlock();
+ return ret;
+}
+
+static struct sock *tcp_fastopen_create_child(struct sock *sk,
+ struct sk_buff *skb,
+ struct request_sock *req)
+{
+ struct tcp_sock *tp;
+ struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
+ struct sock *child;
+ bool own_req;
+
+ child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL,
+ NULL, &own_req);
+ if (!child)
+ return NULL;
+
+ spin_lock(&queue->fastopenq.lock);
+ queue->fastopenq.qlen++;
+ spin_unlock(&queue->fastopenq.lock);
+
+ /* Initialize the child socket. Have to fix some values to take
+ * into account the child is a Fast Open socket and is created
+ * only out of the bits carried in the SYN packet.
+ */
+ tp = tcp_sk(child);
+
+ rcu_assign_pointer(tp->fastopen_rsk, req);
+ tcp_rsk(req)->tfo_listener = true;
+
+ /* RFC1323: The window in SYN & SYN/ACK segments is never
+ * scaled. So correct it appropriately.
+ */
+ tp->snd_wnd = ntohs(tcp_hdr(skb)->window);
+ tp->max_window = tp->snd_wnd;
+
+ /* Activate the retrans timer so that SYNACK can be retransmitted.
+ * The request socket is not added to the ehash
+ * because it's been added to the accept queue directly.
+ */
+ req->timeout = tcp_timeout_init(child);
+ tcp_reset_xmit_timer(child, ICSK_TIME_RETRANS,
+ req->timeout, false);
+
+ refcount_set(&req->rsk_refcnt, 2);
+
+ sk_mark_napi_id_set(child, skb);
+
+ /* Now finish processing the fastopen child socket. */
+ tcp_init_transfer(child, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB, skb);
+
+ tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
+
+ tcp_fastopen_add_skb(child, skb);
+
+ tcp_rsk(req)->rcv_nxt = tp->rcv_nxt;
+ tp->rcv_wup = tp->rcv_nxt;
+ /* tcp_conn_request() is sending the SYNACK,
+ * and queues the child into listener accept queue.
+ */
+ return child;
+}
+
+static bool tcp_fastopen_queue_check(struct sock *sk)
+{
+ struct fastopen_queue *fastopenq;
+ int max_qlen;
+
+ /* Make sure the listener has enabled fastopen, and we don't
+ * exceed the max # of pending TFO requests allowed before trying
+ * to validating the cookie in order to avoid burning CPU cycles
+ * unnecessarily.
+ *
+ * XXX (TFO) - The implication of checking the max_qlen before
+ * processing a cookie request is that clients can't differentiate
+ * between qlen overflow causing Fast Open to be disabled
+ * temporarily vs a server not supporting Fast Open at all.
+ */
+ fastopenq = &inet_csk(sk)->icsk_accept_queue.fastopenq;
+ max_qlen = READ_ONCE(fastopenq->max_qlen);
+ if (max_qlen == 0)
+ return false;
+
+ if (fastopenq->qlen >= max_qlen) {
+ struct request_sock *req1;
+ spin_lock(&fastopenq->lock);
+ req1 = fastopenq->rskq_rst_head;
+ if (!req1 || time_after(req1->rsk_timer.expires, jiffies)) {
+ __NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPFASTOPENLISTENOVERFLOW);
+ spin_unlock(&fastopenq->lock);
+ return false;
+ }
+ fastopenq->rskq_rst_head = req1->dl_next;
+ fastopenq->qlen--;
+ spin_unlock(&fastopenq->lock);
+ reqsk_put(req1);
+ }
+ return true;
+}
+
+static bool tcp_fastopen_no_cookie(const struct sock *sk,
+ const struct dst_entry *dst,
+ int flag)
+{
+ return (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen) & flag) ||
+ tcp_sk(sk)->fastopen_no_cookie ||
+ (dst && dst_metric(dst, RTAX_FASTOPEN_NO_COOKIE));
+}
+
+/* Returns true if we should perform Fast Open on the SYN. The cookie (foc)
+ * may be updated and return the client in the SYN-ACK later. E.g., Fast Open
+ * cookie request (foc->len == 0).
+ */
+struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req,
+ struct tcp_fastopen_cookie *foc,
+ const struct dst_entry *dst)
+{
+ bool syn_data = TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1;
+ int tcp_fastopen = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen);
+ struct tcp_fastopen_cookie valid_foc = { .len = -1 };
+ struct sock *child;
+ int ret = 0;
+
+ if (foc->len == 0) /* Client requests a cookie */
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENCOOKIEREQD);
+
+ if (!((tcp_fastopen & TFO_SERVER_ENABLE) &&
+ (syn_data || foc->len >= 0) &&
+ tcp_fastopen_queue_check(sk))) {
+ foc->len = -1;
+ return NULL;
+ }
+
+ if (tcp_fastopen_no_cookie(sk, dst, TFO_SERVER_COOKIE_NOT_REQD))
+ goto fastopen;
+
+ if (foc->len == 0) {
+ /* Client requests a cookie. */
+ tcp_fastopen_cookie_gen(sk, req, skb, &valid_foc);
+ } else if (foc->len > 0) {
+ ret = tcp_fastopen_cookie_gen_check(sk, req, skb, foc,
+ &valid_foc);
+ if (!ret) {
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
+ } else {
+ /* Cookie is valid. Create a (full) child socket to
+ * accept the data in SYN before returning a SYN-ACK to
+ * ack the data. If we fail to create the socket, fall
+ * back and ack the ISN only but includes the same
+ * cookie.
+ *
+ * Note: Data-less SYN with valid cookie is allowed to
+ * send data in SYN_RECV state.
+ */
+fastopen:
+ child = tcp_fastopen_create_child(sk, skb, req);
+ if (child) {
+ if (ret == 2) {
+ valid_foc.exp = foc->exp;
+ *foc = valid_foc;
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPFASTOPENPASSIVEALTKEY);
+ } else {
+ foc->len = -1;
+ }
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPFASTOPENPASSIVE);
+ tcp_sk(child)->syn_fastopen_child = 1;
+ return child;
+ }
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
+ }
+ }
+ valid_foc.exp = foc->exp;
+ *foc = valid_foc;
+ return NULL;
+}
+
+bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss,
+ struct tcp_fastopen_cookie *cookie)
+{
+ const struct dst_entry *dst;
+
+ tcp_fastopen_cache_get(sk, mss, cookie);
+
+ /* Firewall blackhole issue check */
+ if (tcp_fastopen_active_should_disable(sk)) {
+ cookie->len = -1;
+ return false;
+ }
+
+ dst = __sk_dst_get(sk);
+
+ if (tcp_fastopen_no_cookie(sk, dst, TFO_CLIENT_NO_COOKIE)) {
+ cookie->len = -1;
+ return true;
+ }
+ if (cookie->len > 0)
+ return true;
+ tcp_sk(sk)->fastopen_client_fail = TFO_COOKIE_UNAVAILABLE;
+ return false;
+}
+
+/* This function checks if we want to defer sending SYN until the first
+ * write(). We defer under the following conditions:
+ * 1. fastopen_connect sockopt is set
+ * 2. we have a valid cookie
+ * Return value: return true if we want to defer until application writes data
+ * return false if we want to send out SYN immediately
+ */
+bool tcp_fastopen_defer_connect(struct sock *sk, int *err)
+{
+ struct tcp_fastopen_cookie cookie = { .len = 0 };
+ struct tcp_sock *tp = tcp_sk(sk);
+ u16 mss;
+
+ if (tp->fastopen_connect && !tp->fastopen_req) {
+ if (tcp_fastopen_cookie_check(sk, &mss, &cookie)) {
+ inet_set_bit(DEFER_CONNECT, sk);
+ return true;
+ }
+
+ /* Alloc fastopen_req in order for FO option to be included
+ * in SYN
+ */
+ tp->fastopen_req = kzalloc(sizeof(*tp->fastopen_req),
+ sk->sk_allocation);
+ if (tp->fastopen_req)
+ tp->fastopen_req->cookie = cookie;
+ else
+ *err = -ENOBUFS;
+ }
+ return false;
+}
+EXPORT_IPV6_MOD(tcp_fastopen_defer_connect);
+
+/*
+ * The following code block is to deal with middle box issues with TFO:
+ * Middlebox firewall issues can potentially cause server's data being
+ * blackholed after a successful 3WHS using TFO.
+ * The proposed solution is to disable active TFO globally under the
+ * following circumstances:
+ * 1. client side TFO socket receives out of order FIN
+ * 2. client side TFO socket receives out of order RST
+ * 3. client side TFO socket has timed out three times consecutively during
+ * or after handshake
+ * We disable active side TFO globally for 1hr at first. Then if it
+ * happens again, we disable it for 2h, then 4h, 8h, ...
+ * And we reset the timeout back to 1hr when we see a successful active
+ * TFO connection with data exchanges.
+ */
+
+/* Disable active TFO and record current jiffies and
+ * tfo_active_disable_times
+ */
+void tcp_fastopen_active_disable(struct sock *sk)
+{
+ struct net *net = sock_net(sk);
+
+ if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen_blackhole_timeout))
+ return;
+
+ /* Paired with READ_ONCE() in tcp_fastopen_active_should_disable() */
+ WRITE_ONCE(net->ipv4.tfo_active_disable_stamp, jiffies);
+
+ /* Paired with smp_rmb() in tcp_fastopen_active_should_disable().
+ * We want net->ipv4.tfo_active_disable_stamp to be updated first.
+ */
+ smp_mb__before_atomic();
+ atomic_inc(&net->ipv4.tfo_active_disable_times);
+
+ NET_INC_STATS(net, LINUX_MIB_TCPFASTOPENBLACKHOLE);
+}
+
+/* Calculate timeout for tfo active disable
+ * Return true if we are still in the active TFO disable period
+ * Return false if timeout already expired and we should use active TFO
+ */
+bool tcp_fastopen_active_should_disable(struct sock *sk)
+{
+ unsigned int tfo_bh_timeout =
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen_blackhole_timeout);
+ unsigned long timeout;
+ int tfo_da_times;
+ int multiplier;
+
+ if (!tfo_bh_timeout)
+ return false;
+
+ tfo_da_times = atomic_read(&sock_net(sk)->ipv4.tfo_active_disable_times);
+ if (!tfo_da_times)
+ return false;
+
+ /* Paired with smp_mb__before_atomic() in tcp_fastopen_active_disable() */
+ smp_rmb();
+
+ /* Limit timeout to max: 2^6 * initial timeout */
+ multiplier = 1 << min(tfo_da_times - 1, 6);
+
+ /* Paired with the WRITE_ONCE() in tcp_fastopen_active_disable(). */
+ timeout = READ_ONCE(sock_net(sk)->ipv4.tfo_active_disable_stamp) +
+ multiplier * tfo_bh_timeout * HZ;
+ if (time_before(jiffies, timeout))
+ return true;
+
+ /* Mark check bit so we can check for successful active TFO
+ * condition and reset tfo_active_disable_times
+ */
+ tcp_sk(sk)->syn_fastopen_ch = 1;
+ return false;
+}
+
+/* Disable active TFO if FIN is the only packet in the ofo queue
+ * and no data is received.
+ * Also check if we can reset tfo_active_disable_times if data is
+ * received successfully on a marked active TFO sockets opened on
+ * a non-loopback interface
+ */
+void tcp_fastopen_active_disable_ofo_check(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct net_device *dev;
+ struct dst_entry *dst;
+ struct sk_buff *skb;
+
+ if (!tp->syn_fastopen)
+ return;
+
+ if (!tp->data_segs_in) {
+ skb = skb_rb_first(&tp->out_of_order_queue);
+ if (skb && !skb_rb_next(skb)) {
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
+ tcp_fastopen_active_disable(sk);
+ return;
+ }
+ }
+ } else if (tp->syn_fastopen_ch &&
+ atomic_read(&sock_net(sk)->ipv4.tfo_active_disable_times)) {
+ rcu_read_lock();
+ dst = __sk_dst_get(sk);
+ dev = dst ? dst_dev_rcu(dst) : NULL;
+ if (!(dev && (dev->flags & IFF_LOOPBACK)))
+ atomic_set(&sock_net(sk)->ipv4.tfo_active_disable_times, 0);
+ rcu_read_unlock();
+ }
+}
+
+void tcp_fastopen_active_detect_blackhole(struct sock *sk, bool expired)
+{
+ u32 timeouts = inet_csk(sk)->icsk_retransmits;
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* Broken middle-boxes may black-hole Fast Open connection during or
+ * even after the handshake. Be extremely conservative and pause
+ * Fast Open globally after hitting the third consecutive timeout or
+ * exceeding the configured timeout limit.
+ */
+ if ((tp->syn_fastopen || tp->syn_data || tp->syn_data_acked) &&
+ (timeouts == 2 || (timeouts < 2 && expired))) {
+ tcp_fastopen_active_disable(sk);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL);
+ }
+}
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c
index c4fc811bf377..c6de5ce79ad3 100644
--- a/net/ipv4/tcp_highspeed.c
+++ b/net/ipv4/tcp_highspeed.c
@@ -1,7 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Sally Floyd's High Speed TCP (RFC 3649) congestion control
*
- * See http://www.icir.org/floyd/hstcp.html
+ * See https://www.icir.org/floyd/hstcp.html
*
* John Heffner <jheffner@psc.edu>
*/
@@ -9,86 +10,85 @@
#include <linux/module.h>
#include <net/tcp.h>
-
/* From AIMD tables from RFC 3649 appendix B,
* with fixed-point MD scaled <<8.
*/
static const struct hstcp_aimd_val {
- unsigned int cwnd;
- unsigned int md;
+ unsigned int cwnd;
+ unsigned int md;
} hstcp_aimd_vals[] = {
- { 38, 128, /* 0.50 */ },
- { 118, 112, /* 0.44 */ },
- { 221, 104, /* 0.41 */ },
- { 347, 98, /* 0.38 */ },
- { 495, 93, /* 0.37 */ },
- { 663, 89, /* 0.35 */ },
- { 851, 86, /* 0.34 */ },
- { 1058, 83, /* 0.33 */ },
- { 1284, 81, /* 0.32 */ },
- { 1529, 78, /* 0.31 */ },
- { 1793, 76, /* 0.30 */ },
- { 2076, 74, /* 0.29 */ },
- { 2378, 72, /* 0.28 */ },
- { 2699, 71, /* 0.28 */ },
- { 3039, 69, /* 0.27 */ },
- { 3399, 68, /* 0.27 */ },
- { 3778, 66, /* 0.26 */ },
- { 4177, 65, /* 0.26 */ },
- { 4596, 64, /* 0.25 */ },
- { 5036, 62, /* 0.25 */ },
- { 5497, 61, /* 0.24 */ },
- { 5979, 60, /* 0.24 */ },
- { 6483, 59, /* 0.23 */ },
- { 7009, 58, /* 0.23 */ },
- { 7558, 57, /* 0.22 */ },
- { 8130, 56, /* 0.22 */ },
- { 8726, 55, /* 0.22 */ },
- { 9346, 54, /* 0.21 */ },
- { 9991, 53, /* 0.21 */ },
- { 10661, 52, /* 0.21 */ },
- { 11358, 52, /* 0.20 */ },
- { 12082, 51, /* 0.20 */ },
- { 12834, 50, /* 0.20 */ },
- { 13614, 49, /* 0.19 */ },
- { 14424, 48, /* 0.19 */ },
- { 15265, 48, /* 0.19 */ },
- { 16137, 47, /* 0.19 */ },
- { 17042, 46, /* 0.18 */ },
- { 17981, 45, /* 0.18 */ },
- { 18955, 45, /* 0.18 */ },
- { 19965, 44, /* 0.17 */ },
- { 21013, 43, /* 0.17 */ },
- { 22101, 43, /* 0.17 */ },
- { 23230, 42, /* 0.17 */ },
- { 24402, 41, /* 0.16 */ },
- { 25618, 41, /* 0.16 */ },
- { 26881, 40, /* 0.16 */ },
- { 28193, 39, /* 0.16 */ },
- { 29557, 39, /* 0.15 */ },
- { 30975, 38, /* 0.15 */ },
- { 32450, 38, /* 0.15 */ },
- { 33986, 37, /* 0.15 */ },
- { 35586, 36, /* 0.14 */ },
- { 37253, 36, /* 0.14 */ },
- { 38992, 35, /* 0.14 */ },
- { 40808, 35, /* 0.14 */ },
- { 42707, 34, /* 0.13 */ },
- { 44694, 33, /* 0.13 */ },
- { 46776, 33, /* 0.13 */ },
- { 48961, 32, /* 0.13 */ },
- { 51258, 32, /* 0.13 */ },
- { 53677, 31, /* 0.12 */ },
- { 56230, 30, /* 0.12 */ },
- { 58932, 30, /* 0.12 */ },
- { 61799, 29, /* 0.12 */ },
- { 64851, 28, /* 0.11 */ },
- { 68113, 28, /* 0.11 */ },
- { 71617, 27, /* 0.11 */ },
- { 75401, 26, /* 0.10 */ },
- { 79517, 26, /* 0.10 */ },
- { 84035, 25, /* 0.10 */ },
- { 89053, 24, /* 0.10 */ },
+ { 38, 128, /* 0.50 */ },
+ { 118, 112, /* 0.44 */ },
+ { 221, 104, /* 0.41 */ },
+ { 347, 98, /* 0.38 */ },
+ { 495, 93, /* 0.37 */ },
+ { 663, 89, /* 0.35 */ },
+ { 851, 86, /* 0.34 */ },
+ { 1058, 83, /* 0.33 */ },
+ { 1284, 81, /* 0.32 */ },
+ { 1529, 78, /* 0.31 */ },
+ { 1793, 76, /* 0.30 */ },
+ { 2076, 74, /* 0.29 */ },
+ { 2378, 72, /* 0.28 */ },
+ { 2699, 71, /* 0.28 */ },
+ { 3039, 69, /* 0.27 */ },
+ { 3399, 68, /* 0.27 */ },
+ { 3778, 66, /* 0.26 */ },
+ { 4177, 65, /* 0.26 */ },
+ { 4596, 64, /* 0.25 */ },
+ { 5036, 62, /* 0.25 */ },
+ { 5497, 61, /* 0.24 */ },
+ { 5979, 60, /* 0.24 */ },
+ { 6483, 59, /* 0.23 */ },
+ { 7009, 58, /* 0.23 */ },
+ { 7558, 57, /* 0.22 */ },
+ { 8130, 56, /* 0.22 */ },
+ { 8726, 55, /* 0.22 */ },
+ { 9346, 54, /* 0.21 */ },
+ { 9991, 53, /* 0.21 */ },
+ { 10661, 52, /* 0.21 */ },
+ { 11358, 52, /* 0.20 */ },
+ { 12082, 51, /* 0.20 */ },
+ { 12834, 50, /* 0.20 */ },
+ { 13614, 49, /* 0.19 */ },
+ { 14424, 48, /* 0.19 */ },
+ { 15265, 48, /* 0.19 */ },
+ { 16137, 47, /* 0.19 */ },
+ { 17042, 46, /* 0.18 */ },
+ { 17981, 45, /* 0.18 */ },
+ { 18955, 45, /* 0.18 */ },
+ { 19965, 44, /* 0.17 */ },
+ { 21013, 43, /* 0.17 */ },
+ { 22101, 43, /* 0.17 */ },
+ { 23230, 42, /* 0.17 */ },
+ { 24402, 41, /* 0.16 */ },
+ { 25618, 41, /* 0.16 */ },
+ { 26881, 40, /* 0.16 */ },
+ { 28193, 39, /* 0.16 */ },
+ { 29557, 39, /* 0.15 */ },
+ { 30975, 38, /* 0.15 */ },
+ { 32450, 38, /* 0.15 */ },
+ { 33986, 37, /* 0.15 */ },
+ { 35586, 36, /* 0.14 */ },
+ { 37253, 36, /* 0.14 */ },
+ { 38992, 35, /* 0.14 */ },
+ { 40808, 35, /* 0.14 */ },
+ { 42707, 34, /* 0.13 */ },
+ { 44694, 33, /* 0.13 */ },
+ { 46776, 33, /* 0.13 */ },
+ { 48961, 32, /* 0.13 */ },
+ { 51258, 32, /* 0.13 */ },
+ { 53677, 31, /* 0.12 */ },
+ { 56230, 30, /* 0.12 */ },
+ { 58932, 30, /* 0.12 */ },
+ { 61799, 29, /* 0.12 */ },
+ { 64851, 28, /* 0.11 */ },
+ { 68113, 28, /* 0.11 */ },
+ { 71617, 27, /* 0.11 */ },
+ { 75401, 26, /* 0.10 */ },
+ { 79517, 26, /* 0.10 */ },
+ { 84035, 25, /* 0.10 */ },
+ { 89053, 24, /* 0.10 */ },
};
#define HSTCP_AIMD_MAX ARRAY_SIZE(hstcp_aimd_vals)
@@ -97,10 +97,6 @@ struct hstcp {
u32 ai;
};
-static int max_ssthresh = 100;
-module_param(max_ssthresh, int, 0644);
-MODULE_PARM_DESC(max_ssthresh, "limited slow start threshold (RFC3742)");
-
static void hstcp_init(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -113,32 +109,17 @@ static void hstcp_init(struct sock *sk)
tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128);
}
-static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 rtt,
- u32 in_flight, int data_acked)
+static void hstcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct tcp_sock *tp = tcp_sk(sk);
struct hstcp *ca = inet_csk_ca(sk);
- if (!tcp_is_cwnd_limited(sk, in_flight))
+ if (!tcp_is_cwnd_limited(sk))
return;
- if (tp->snd_cwnd <= tp->snd_ssthresh) {
- /* RFC3742: limited slow start
- * the window is increased by 1/K MSS for each arriving ACK,
- * for K = int(cwnd/(0.5 max_ssthresh))
- */
- if (max_ssthresh > 0 && tp->snd_cwnd > max_ssthresh) {
- u32 k = max(tp->snd_cwnd / (max_ssthresh >> 1), 1U);
- if (++tp->snd_cwnd_cnt >= k) {
- if (tp->snd_cwnd < tp->snd_cwnd_clamp)
- tp->snd_cwnd++;
- tp->snd_cwnd_cnt = 0;
- }
- } else {
- if (tp->snd_cwnd < tp->snd_cwnd_clamp)
- tp->snd_cwnd++;
- }
- } else {
+ if (tcp_in_slow_start(tp))
+ tcp_slow_start(tp, acked);
+ else {
/* Update AIMD parameters.
*
* We want to guarantee that:
@@ -146,22 +127,22 @@ static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 rtt,
* snd_cwnd <=
* hstcp_aimd_vals[ca->ai].cwnd
*/
- if (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd) {
- while (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd &&
+ if (tcp_snd_cwnd(tp) > hstcp_aimd_vals[ca->ai].cwnd) {
+ while (tcp_snd_cwnd(tp) > hstcp_aimd_vals[ca->ai].cwnd &&
ca->ai < HSTCP_AIMD_MAX - 1)
ca->ai++;
- } else if (ca->ai && tp->snd_cwnd <= hstcp_aimd_vals[ca->ai-1].cwnd) {
- while (ca->ai && tp->snd_cwnd <= hstcp_aimd_vals[ca->ai-1].cwnd)
+ } else if (ca->ai && tcp_snd_cwnd(tp) <= hstcp_aimd_vals[ca->ai-1].cwnd) {
+ while (ca->ai && tcp_snd_cwnd(tp) <= hstcp_aimd_vals[ca->ai-1].cwnd)
ca->ai--;
}
/* Do additive increase */
- if (tp->snd_cwnd < tp->snd_cwnd_clamp) {
+ if (tcp_snd_cwnd(tp) < tp->snd_cwnd_clamp) {
/* cwnd = cwnd + a(w) / cwnd */
tp->snd_cwnd_cnt += ca->ai + 1;
- if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
- tp->snd_cwnd_cnt -= tp->snd_cwnd;
- tp->snd_cwnd++;
+ if (tp->snd_cwnd_cnt >= tcp_snd_cwnd(tp)) {
+ tp->snd_cwnd_cnt -= tcp_snd_cwnd(tp);
+ tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + 1);
}
}
}
@@ -170,18 +151,17 @@ static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 rtt,
static u32 hstcp_ssthresh(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
- const struct hstcp *ca = inet_csk_ca(sk);
+ struct hstcp *ca = inet_csk_ca(sk);
/* Do multiplicative decrease */
- return max(tp->snd_cwnd - ((tp->snd_cwnd * hstcp_aimd_vals[ca->ai].md) >> 8), 2U);
+ return max(tcp_snd_cwnd(tp) - ((tcp_snd_cwnd(tp) * hstcp_aimd_vals[ca->ai].md) >> 8), 2U);
}
-
-static struct tcp_congestion_ops tcp_highspeed = {
+static struct tcp_congestion_ops tcp_highspeed __read_mostly = {
.init = hstcp_init,
.ssthresh = hstcp_ssthresh,
+ .undo_cwnd = tcp_reno_undo_cwnd,
.cong_avoid = hstcp_cong_avoid,
- .min_cwnd = tcp_reno_min_cwnd,
.owner = THIS_MODULE,
.name = "highspeed"
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
index 753987a1048f..81b96331b2bb 100644
--- a/net/ipv4/tcp_htcp.c
+++ b/net/ipv4/tcp_htcp.c
@@ -1,31 +1,33 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* H-TCP congestion control. The algorithm is detailed in:
* R.N.Shorten, D.J.Leith:
* "H-TCP: TCP for high-speed and long-distance networks"
* Proc. PFLDnet, Argonne, 2004.
- * http://www.hamilton.ie/net/htcp3.pdf
+ * https://www.hamilton.ie/net/htcp3.pdf
*/
#include <linux/mm.h>
#include <linux/module.h>
#include <net/tcp.h>
-#define ALPHA_BASE (1<<7) /* 1.0 with shift << 7 */
-#define BETA_MIN (1<<6) /* 0.5 with shift << 7 */
+#define ALPHA_BASE (1<<7) /* 1.0 with shift << 7 */
+#define BETA_MIN (1<<6) /* 0.5 with shift << 7 */
#define BETA_MAX 102 /* 0.8 with shift << 7 */
-static int use_rtt_scaling = 1;
+static int use_rtt_scaling __read_mostly = 1;
module_param(use_rtt_scaling, int, 0644);
MODULE_PARM_DESC(use_rtt_scaling, "turn on/off RTT scaling");
-static int use_bandwidth_switch = 1;
+static int use_bandwidth_switch __read_mostly = 1;
module_param(use_bandwidth_switch, int, 0644);
MODULE_PARM_DESC(use_bandwidth_switch, "turn on/off bandwidth switcher");
struct htcp {
u32 alpha; /* Fixed point arith, << 7 */
u8 beta; /* Fixed point arith, << 7 */
- u8 modeswitch; /* Delay modeswitch until we had at least one congestion event */
+ u8 modeswitch; /* Delay modeswitch
+ until we had at least one congestion event */
u16 pkts_acked;
u32 packetcount;
u32 minRTT;
@@ -44,14 +46,14 @@ struct htcp {
u32 lasttime;
};
-static inline u32 htcp_cong_time(struct htcp *ca)
+static inline u32 htcp_cong_time(const struct htcp *ca)
{
return jiffies - ca->last_cong;
}
-static inline u32 htcp_ccount(struct htcp *ca)
+static inline u32 htcp_ccount(const struct htcp *ca)
{
- return htcp_cong_time(ca)/ca->minRTT;
+ return htcp_cong_time(ca) / ca->minRTT;
}
static inline void htcp_reset(struct htcp *ca)
@@ -65,66 +67,73 @@ static inline void htcp_reset(struct htcp *ca)
static u32 htcp_cwnd_undo(struct sock *sk)
{
- const struct tcp_sock *tp = tcp_sk(sk);
struct htcp *ca = inet_csk_ca(sk);
- ca->last_cong = ca->undo_last_cong;
- ca->maxRTT = ca->undo_maxRTT;
- ca->old_maxB = ca->undo_old_maxB;
- return max(tp->snd_cwnd, (tp->snd_ssthresh<<7)/ca->beta);
+
+ if (ca->undo_last_cong) {
+ ca->last_cong = ca->undo_last_cong;
+ ca->maxRTT = ca->undo_maxRTT;
+ ca->old_maxB = ca->undo_old_maxB;
+ ca->undo_last_cong = 0;
+ }
+
+ return tcp_reno_undo_cwnd(sk);
}
-static inline void measure_rtt(struct sock *sk)
+static inline void measure_rtt(struct sock *sk, u32 srtt)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
- const struct tcp_sock *tp = tcp_sk(sk);
struct htcp *ca = inet_csk_ca(sk);
- u32 srtt = tp->srtt>>3;
/* keep track of minimum RTT seen so far, minRTT is zero at first */
if (ca->minRTT > srtt || !ca->minRTT)
ca->minRTT = srtt;
/* max RTT */
- if (icsk->icsk_ca_state == TCP_CA_Open && tp->snd_ssthresh < 0xFFFF && htcp_ccount(ca) > 3) {
+ if (icsk->icsk_ca_state == TCP_CA_Open) {
if (ca->maxRTT < ca->minRTT)
ca->maxRTT = ca->minRTT;
- if (ca->maxRTT < srtt && srtt <= ca->maxRTT+msecs_to_jiffies(20))
+ if (ca->maxRTT < srtt &&
+ srtt <= ca->maxRTT + msecs_to_jiffies(20))
ca->maxRTT = srtt;
}
}
-static void measure_achieved_throughput(struct sock *sk, u32 pkts_acked)
+static void measure_achieved_throughput(struct sock *sk,
+ const struct ack_sample *sample)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
const struct tcp_sock *tp = tcp_sk(sk);
struct htcp *ca = inet_csk_ca(sk);
- u32 now = tcp_time_stamp;
+ u32 now = tcp_jiffies32;
if (icsk->icsk_ca_state == TCP_CA_Open)
- ca->pkts_acked = pkts_acked;
+ ca->pkts_acked = sample->pkts_acked;
+
+ if (sample->rtt_us > 0)
+ measure_rtt(sk, usecs_to_jiffies(sample->rtt_us));
if (!use_bandwidth_switch)
return;
/* achieved throughput calculations */
- if (icsk->icsk_ca_state != TCP_CA_Open &&
- icsk->icsk_ca_state != TCP_CA_Disorder) {
+ if (!((1 << icsk->icsk_ca_state) & (TCPF_CA_Open | TCPF_CA_Disorder))) {
ca->packetcount = 0;
ca->lasttime = now;
return;
}
- ca->packetcount += pkts_acked;
+ ca->packetcount += sample->pkts_acked;
+
+ if (ca->packetcount >= tcp_snd_cwnd(tp) - (ca->alpha >> 7 ? : 1) &&
+ now - ca->lasttime >= ca->minRTT &&
+ ca->minRTT > 0) {
+ __u32 cur_Bi = ca->packetcount * HZ / (now - ca->lasttime);
- if (ca->packetcount >= tp->snd_cwnd - (ca->alpha>>7? : 1)
- && now - ca->lasttime >= ca->minRTT
- && ca->minRTT > 0) {
- __u32 cur_Bi = ca->packetcount*HZ/(now - ca->lasttime);
if (htcp_ccount(ca) <= 3) {
/* just after backoff */
ca->minB = ca->maxB = ca->Bi = cur_Bi;
} else {
- ca->Bi = (3*ca->Bi + cur_Bi)/4;
+ ca->Bi = (3 * ca->Bi + cur_Bi) / 4;
if (ca->Bi > ca->maxB)
ca->maxB = ca->Bi;
if (ca->minB > ca->maxB)
@@ -140,9 +149,9 @@ static inline void htcp_beta_update(struct htcp *ca, u32 minRTT, u32 maxRTT)
if (use_bandwidth_switch) {
u32 maxB = ca->maxB;
u32 old_maxB = ca->old_maxB;
- ca->old_maxB = ca->maxB;
- if (!between(5*maxB, 4*old_maxB, 6*old_maxB)) {
+ ca->old_maxB = ca->maxB;
+ if (!between(5 * maxB, 4 * old_maxB, 6 * old_maxB)) {
ca->beta = BETA_MIN;
ca->modeswitch = 0;
return;
@@ -150,7 +159,7 @@ static inline void htcp_beta_update(struct htcp *ca, u32 minRTT, u32 maxRTT)
}
if (ca->modeswitch && minRTT > msecs_to_jiffies(10) && maxRTT) {
- ca->beta = (minRTT<<7)/maxRTT;
+ ca->beta = (minRTT << 7) / maxRTT;
if (ca->beta < BETA_MIN)
ca->beta = BETA_MIN;
else if (ca->beta > BETA_MAX)
@@ -169,23 +178,26 @@ static inline void htcp_alpha_update(struct htcp *ca)
if (diff > HZ) {
diff -= HZ;
- factor = 1+ ( 10*diff + ((diff/2)*(diff/2)/HZ) )/HZ;
+ factor = 1 + (10 * diff + ((diff / 2) * (diff / 2) / HZ)) / HZ;
}
if (use_rtt_scaling && minRTT) {
- u32 scale = (HZ<<3)/(10*minRTT);
- scale = min(max(scale, 1U<<2), 10U<<3); /* clamping ratio to interval [0.5,10]<<3 */
- factor = (factor<<3)/scale;
+ u32 scale = (HZ << 3) / (10 * minRTT);
+
+ /* clamping ratio to interval [0.5,10]<<3 */
+ scale = clamp(scale, 1U << 2, 10U << 3);
+ factor = (factor << 3) / scale;
if (!factor)
factor = 1;
}
- ca->alpha = 2*factor*((1<<7)-ca->beta);
+ ca->alpha = 2 * factor * ((1 << 7) - ca->beta);
if (!ca->alpha)
ca->alpha = ALPHA_BASE;
}
-/* After we have the rtt data to calculate beta, we'd still prefer to wait one
+/*
+ * After we have the rtt data to calculate beta, we'd still prefer to wait one
* rtt before we adjust our beta to ensure we are working from a consistent
* data.
*
@@ -202,40 +214,37 @@ static void htcp_param_update(struct sock *sk)
htcp_beta_update(ca, minRTT, maxRTT);
htcp_alpha_update(ca);
- /* add slowly fading memory for maxRTT to accommodate routing changes etc */
+ /* add slowly fading memory for maxRTT to accommodate routing changes */
if (minRTT > 0 && maxRTT > minRTT)
- ca->maxRTT = minRTT + ((maxRTT-minRTT)*95)/100;
+ ca->maxRTT = minRTT + ((maxRTT - minRTT) * 95) / 100;
}
static u32 htcp_recalc_ssthresh(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
const struct htcp *ca = inet_csk_ca(sk);
+
htcp_param_update(sk);
- return max((tp->snd_cwnd * ca->beta) >> 7, 2U);
+ return max((tcp_snd_cwnd(tp) * ca->beta) >> 7, 2U);
}
-static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
- u32 in_flight, int data_acked)
+static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct tcp_sock *tp = tcp_sk(sk);
struct htcp *ca = inet_csk_ca(sk);
- if (!tcp_is_cwnd_limited(sk, in_flight))
+ if (!tcp_is_cwnd_limited(sk))
return;
- if (tp->snd_cwnd <= tp->snd_ssthresh)
- tcp_slow_start(tp);
+ if (tcp_in_slow_start(tp))
+ tcp_slow_start(tp, acked);
else {
-
- measure_rtt(sk);
-
/* In dangerous area, increase slowly.
* In theory this is tp->snd_cwnd += alpha / tp->snd_cwnd
*/
- if ((tp->snd_cwnd_cnt * ca->alpha)>>7 >= tp->snd_cwnd) {
- if (tp->snd_cwnd < tp->snd_cwnd_clamp)
- tp->snd_cwnd++;
+ if ((tp->snd_cwnd_cnt * ca->alpha)>>7 >= tcp_snd_cwnd(tp)) {
+ if (tcp_snd_cwnd(tp) < tp->snd_cwnd_clamp)
+ tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + 1);
tp->snd_cwnd_cnt = 0;
htcp_alpha_update(ca);
} else
@@ -262,7 +271,11 @@ static void htcp_state(struct sock *sk, u8 new_state)
case TCP_CA_Open:
{
struct htcp *ca = inet_csk_ca(sk);
- ca->last_cong = jiffies;
+
+ if (ca->undo_last_cong) {
+ ca->last_cong = jiffies;
+ ca->undo_last_cong = 0;
+ }
}
break;
case TCP_CA_CWR:
@@ -273,7 +286,7 @@ static void htcp_state(struct sock *sk, u8 new_state)
}
}
-static struct tcp_congestion_ops htcp = {
+static struct tcp_congestion_ops htcp __read_mostly = {
.init = htcp_init,
.ssthresh = htcp_recalc_ssthresh,
.cong_avoid = htcp_cong_avoid,
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c
index 59e691d26f64..abd7d91807e5 100644
--- a/net/ipv4/tcp_hybla.c
+++ b/net/ipv4/tcp_hybla.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* TCP HYBLA
*
@@ -15,31 +16,31 @@
/* Tcp Hybla structure. */
struct hybla {
- u8 hybla_en;
+ bool hybla_en;
u32 snd_cwnd_cents; /* Keeps increment values when it is <1, <<7 */
u32 rho; /* Rho parameter, integer part */
u32 rho2; /* Rho * Rho, integer part */
u32 rho_3ls; /* Rho parameter, <<3 */
u32 rho2_7ls; /* Rho^2, <<7 */
- u32 minrtt; /* Minimum smoothed round trip time value seen */
+ u32 minrtt_us; /* Minimum smoothed round trip time value seen */
};
-/* Hybla reference round trip time (default= 1/40 sec = 25 ms),
- expressed in jiffies */
+/* Hybla reference round trip time (default= 1/40 sec = 25 ms), in ms */
static int rtt0 = 25;
module_param(rtt0, int, 0644);
MODULE_PARM_DESC(rtt0, "reference rout trip time (ms)");
-
/* This is called to refresh values for hybla parameters */
static inline void hybla_recalc_param (struct sock *sk)
{
struct hybla *ca = inet_csk_ca(sk);
- ca->rho_3ls = max_t(u32, tcp_sk(sk)->srtt / msecs_to_jiffies(rtt0), 8);
+ ca->rho_3ls = max_t(u32,
+ tcp_sk(sk)->srtt_us / (rtt0 * USEC_PER_MSEC),
+ 8U);
ca->rho = ca->rho_3ls >> 3;
ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1;
- ca->rho2 = ca->rho2_7ls >>7;
+ ca->rho2 = ca->rho2_7ls >> 7;
}
static void hybla_init(struct sock *sk)
@@ -52,21 +53,22 @@ static void hybla_init(struct sock *sk)
ca->rho_3ls = 0;
ca->rho2_7ls = 0;
ca->snd_cwnd_cents = 0;
- ca->hybla_en = 1;
- tp->snd_cwnd = 2;
+ ca->hybla_en = true;
+ tcp_snd_cwnd_set(tp, 2);
tp->snd_cwnd_clamp = 65535;
/* 1st Rho measurement based on initial srtt */
hybla_recalc_param(sk);
/* set minimum rtt as this is the 1st ever seen */
- ca->minrtt = tp->srtt;
- tp->snd_cwnd = ca->rho;
+ ca->minrtt_us = tp->srtt_us;
+ tcp_snd_cwnd_set(tp, ca->rho);
}
static void hybla_state(struct sock *sk, u8 ca_state)
{
struct hybla *ca = inet_csk_ca(sk);
+
ca->hybla_en = (ca_state == TCP_CA_Open);
}
@@ -85,8 +87,7 @@ static inline u32 hybla_fraction(u32 odds)
* o Give cwnd a new value based on the model proposed
* o remember increments <1
*/
-static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
- u32 in_flight, int flag)
+static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct tcp_sock *tp = tcp_sk(sk);
struct hybla *ca = inet_csk_ca(sk);
@@ -94,23 +95,25 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
int is_slowstart = 0;
/* Recalculate rho only if this srtt is the lowest */
- if (tp->srtt < ca->minrtt){
+ if (tp->srtt_us < ca->minrtt_us) {
hybla_recalc_param(sk);
- ca->minrtt = tp->srtt;
+ ca->minrtt_us = tp->srtt_us;
}
- if (!tcp_is_cwnd_limited(sk, in_flight))
+ if (!tcp_is_cwnd_limited(sk))
return;
- if (!ca->hybla_en)
- return tcp_reno_cong_avoid(sk, ack, rtt, in_flight, flag);
+ if (!ca->hybla_en) {
+ tcp_reno_cong_avoid(sk, ack, acked);
+ return;
+ }
if (ca->rho == 0)
hybla_recalc_param(sk);
rho_fractions = ca->rho_3ls - (ca->rho << 3);
- if (tp->snd_cwnd < tp->snd_ssthresh) {
+ if (tcp_in_slow_start(tp)) {
/*
* slow start
* INC = 2^RHO - 1
@@ -125,8 +128,8 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
* calculate 2^fract in a <<7 value.
*/
is_slowstart = 1;
- increment = ((1 << ca->rho) * hybla_fraction(rho_fractions))
- - 128;
+ increment = ((1 << min(ca->rho, 16U)) *
+ hybla_fraction(rho_fractions)) - 128;
} else {
/*
* congestion avoidance
@@ -134,33 +137,37 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
* as long as increment is estimated as (rho<<7)/window
* it already is <<7 and we can easily count its fractions.
*/
- increment = ca->rho2_7ls / tp->snd_cwnd;
+ increment = ca->rho2_7ls / tcp_snd_cwnd(tp);
if (increment < 128)
tp->snd_cwnd_cnt++;
}
odd = increment % 128;
- tp->snd_cwnd += increment >> 7;
+ tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + (increment >> 7));
ca->snd_cwnd_cents += odd;
/* check when fractions goes >=128 and increase cwnd by 1. */
- while(ca->snd_cwnd_cents >= 128) {
- tp->snd_cwnd++;
+ while (ca->snd_cwnd_cents >= 128) {
+ tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + 1);
ca->snd_cwnd_cents -= 128;
tp->snd_cwnd_cnt = 0;
}
-
+ /* check when cwnd has not been incremented for a while */
+ if (increment == 0 && odd == 0 && tp->snd_cwnd_cnt >= tcp_snd_cwnd(tp)) {
+ tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + 1);
+ tp->snd_cwnd_cnt = 0;
+ }
/* clamp down slowstart cwnd to ssthresh value. */
if (is_slowstart)
- tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
+ tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), tp->snd_ssthresh));
- tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
+ tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), tp->snd_cwnd_clamp));
}
-static struct tcp_congestion_ops tcp_hybla = {
+static struct tcp_congestion_ops tcp_hybla __read_mostly = {
.init = hybla_init,
.ssthresh = tcp_reno_ssthresh,
- .min_cwnd = tcp_reno_min_cwnd,
+ .undo_cwnd = tcp_reno_undo_cwnd,
.cong_avoid = hybla_cong_avoid,
.set_state = hybla_state,
diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c
new file mode 100644
index 000000000000..c0c81a2c77fa
--- /dev/null
+++ b/net/ipv4/tcp_illinois.c
@@ -0,0 +1,360 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * TCP Illinois congestion control.
+ * Home page:
+ * http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html
+ *
+ * The algorithm is described in:
+ * "TCP-Illinois: A Loss and Delay-Based Congestion Control Algorithm
+ * for High-Speed Networks"
+ * http://tamerbasar.csl.illinois.edu/LiuBasarSrikantPerfEvalArtJun2008.pdf
+ *
+ * Implemented from description in paper and ns-2 simulation.
+ * Copyright (C) 2007 Stephen Hemminger <shemminger@linux-foundation.org>
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/inet_diag.h>
+#include <asm/div64.h>
+#include <net/tcp.h>
+
+#define ALPHA_SHIFT 7
+#define ALPHA_SCALE (1u<<ALPHA_SHIFT)
+#define ALPHA_MIN ((3*ALPHA_SCALE)/10) /* ~0.3 */
+#define ALPHA_MAX (10*ALPHA_SCALE) /* 10.0 */
+#define ALPHA_BASE ALPHA_SCALE /* 1.0 */
+#define RTT_MAX (U32_MAX / ALPHA_MAX) /* 3.3 secs */
+
+#define BETA_SHIFT 6
+#define BETA_SCALE (1u<<BETA_SHIFT)
+#define BETA_MIN (BETA_SCALE/8) /* 0.125 */
+#define BETA_MAX (BETA_SCALE/2) /* 0.5 */
+#define BETA_BASE BETA_MAX
+
+static int win_thresh __read_mostly = 15;
+module_param(win_thresh, int, 0);
+MODULE_PARM_DESC(win_thresh, "Window threshold for starting adaptive sizing");
+
+static int theta __read_mostly = 5;
+module_param(theta, int, 0);
+MODULE_PARM_DESC(theta, "# of fast RTT's before full growth");
+
+/* TCP Illinois Parameters */
+struct illinois {
+ u64 sum_rtt; /* sum of rtt's measured within last rtt */
+ u16 cnt_rtt; /* # of rtts measured within last rtt */
+ u32 base_rtt; /* min of all rtt in usec */
+ u32 max_rtt; /* max of all rtt in usec */
+ u32 end_seq; /* right edge of current RTT */
+ u32 alpha; /* Additive increase */
+ u32 beta; /* Muliplicative decrease */
+ u16 acked; /* # packets acked by current ACK */
+ u8 rtt_above; /* average rtt has gone above threshold */
+ u8 rtt_low; /* # of rtts measurements below threshold */
+};
+
+static void rtt_reset(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct illinois *ca = inet_csk_ca(sk);
+
+ ca->end_seq = tp->snd_nxt;
+ ca->cnt_rtt = 0;
+ ca->sum_rtt = 0;
+
+ /* TODO: age max_rtt? */
+}
+
+static void tcp_illinois_init(struct sock *sk)
+{
+ struct illinois *ca = inet_csk_ca(sk);
+
+ ca->alpha = ALPHA_MAX;
+ ca->beta = BETA_BASE;
+ ca->base_rtt = 0x7fffffff;
+ ca->max_rtt = 0;
+
+ ca->acked = 0;
+ ca->rtt_low = 0;
+ ca->rtt_above = 0;
+
+ rtt_reset(sk);
+}
+
+/* Measure RTT for each ack. */
+static void tcp_illinois_acked(struct sock *sk, const struct ack_sample *sample)
+{
+ struct illinois *ca = inet_csk_ca(sk);
+ s32 rtt_us = sample->rtt_us;
+
+ ca->acked = sample->pkts_acked;
+
+ /* dup ack, no rtt sample */
+ if (rtt_us < 0)
+ return;
+
+ /* ignore bogus values, this prevents wraparound in alpha math */
+ if (rtt_us > RTT_MAX)
+ rtt_us = RTT_MAX;
+
+ /* keep track of minimum RTT seen so far */
+ if (ca->base_rtt > rtt_us)
+ ca->base_rtt = rtt_us;
+
+ /* and max */
+ if (ca->max_rtt < rtt_us)
+ ca->max_rtt = rtt_us;
+
+ ++ca->cnt_rtt;
+ ca->sum_rtt += rtt_us;
+}
+
+/* Maximum queuing delay */
+static inline u32 max_delay(const struct illinois *ca)
+{
+ return ca->max_rtt - ca->base_rtt;
+}
+
+/* Average queuing delay */
+static inline u32 avg_delay(const struct illinois *ca)
+{
+ u64 t = ca->sum_rtt;
+
+ do_div(t, ca->cnt_rtt);
+ return t - ca->base_rtt;
+}
+
+/*
+ * Compute value of alpha used for additive increase.
+ * If small window then use 1.0, equivalent to Reno.
+ *
+ * For larger windows, adjust based on average delay.
+ * A. If average delay is at minimum (we are uncongested),
+ * then use large alpha (10.0) to increase faster.
+ * B. If average delay is at maximum (getting congested)
+ * then use small alpha (0.3)
+ *
+ * The result is a convex window growth curve.
+ */
+static u32 alpha(struct illinois *ca, u32 da, u32 dm)
+{
+ u32 d1 = dm / 100; /* Low threshold */
+
+ if (da <= d1) {
+ /* If never got out of low delay zone, then use max */
+ if (!ca->rtt_above)
+ return ALPHA_MAX;
+
+ /* Wait for 5 good RTT's before allowing alpha to go alpha max.
+ * This prevents one good RTT from causing sudden window increase.
+ */
+ if (++ca->rtt_low < theta)
+ return ca->alpha;
+
+ ca->rtt_low = 0;
+ ca->rtt_above = 0;
+ return ALPHA_MAX;
+ }
+
+ ca->rtt_above = 1;
+
+ /*
+ * Based on:
+ *
+ * (dm - d1) amin amax
+ * k1 = -------------------
+ * amax - amin
+ *
+ * (dm - d1) amin
+ * k2 = ---------------- - d1
+ * amax - amin
+ *
+ * k1
+ * alpha = ----------
+ * k2 + da
+ */
+
+ dm -= d1;
+ da -= d1;
+ return (dm * ALPHA_MAX) /
+ (dm + (da * (ALPHA_MAX - ALPHA_MIN)) / ALPHA_MIN);
+}
+
+/*
+ * Beta used for multiplicative decrease.
+ * For small window sizes returns same value as Reno (0.5)
+ *
+ * If delay is small (10% of max) then beta = 1/8
+ * If delay is up to 80% of max then beta = 1/2
+ * In between is a linear function
+ */
+static u32 beta(u32 da, u32 dm)
+{
+ u32 d2, d3;
+
+ d2 = dm / 10;
+ if (da <= d2)
+ return BETA_MIN;
+
+ d3 = (8 * dm) / 10;
+ if (da >= d3 || d3 <= d2)
+ return BETA_MAX;
+
+ /*
+ * Based on:
+ *
+ * bmin d3 - bmax d2
+ * k3 = -------------------
+ * d3 - d2
+ *
+ * bmax - bmin
+ * k4 = -------------
+ * d3 - d2
+ *
+ * b = k3 + k4 da
+ */
+ return (BETA_MIN * d3 - BETA_MAX * d2 + (BETA_MAX - BETA_MIN) * da)
+ / (d3 - d2);
+}
+
+/* Update alpha and beta values once per RTT */
+static void update_params(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct illinois *ca = inet_csk_ca(sk);
+
+ if (tcp_snd_cwnd(tp) < win_thresh) {
+ ca->alpha = ALPHA_BASE;
+ ca->beta = BETA_BASE;
+ } else if (ca->cnt_rtt > 0) {
+ u32 dm = max_delay(ca);
+ u32 da = avg_delay(ca);
+
+ ca->alpha = alpha(ca, da, dm);
+ ca->beta = beta(da, dm);
+ }
+
+ rtt_reset(sk);
+}
+
+/*
+ * In case of loss, reset to default values
+ */
+static void tcp_illinois_state(struct sock *sk, u8 new_state)
+{
+ struct illinois *ca = inet_csk_ca(sk);
+
+ if (new_state == TCP_CA_Loss) {
+ ca->alpha = ALPHA_BASE;
+ ca->beta = BETA_BASE;
+ ca->rtt_low = 0;
+ ca->rtt_above = 0;
+ rtt_reset(sk);
+ }
+}
+
+/*
+ * Increase window in response to successful acknowledgment.
+ */
+static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct illinois *ca = inet_csk_ca(sk);
+
+ if (after(ack, ca->end_seq))
+ update_params(sk);
+
+ /* RFC2861 only increase cwnd if fully utilized */
+ if (!tcp_is_cwnd_limited(sk))
+ return;
+
+ /* In slow start */
+ if (tcp_in_slow_start(tp))
+ tcp_slow_start(tp, acked);
+
+ else {
+ u32 delta;
+
+ /* snd_cwnd_cnt is # of packets since last cwnd increment */
+ tp->snd_cwnd_cnt += ca->acked;
+ ca->acked = 1;
+
+ /* This is close approximation of:
+ * tp->snd_cwnd += alpha/tp->snd_cwnd
+ */
+ delta = (tp->snd_cwnd_cnt * ca->alpha) >> ALPHA_SHIFT;
+ if (delta >= tcp_snd_cwnd(tp)) {
+ tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp) + delta / tcp_snd_cwnd(tp),
+ (u32)tp->snd_cwnd_clamp));
+ tp->snd_cwnd_cnt = 0;
+ }
+ }
+}
+
+static u32 tcp_illinois_ssthresh(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct illinois *ca = inet_csk_ca(sk);
+ u32 decr;
+
+ /* Multiplicative decrease */
+ decr = (tcp_snd_cwnd(tp) * ca->beta) >> BETA_SHIFT;
+ return max(tcp_snd_cwnd(tp) - decr, 2U);
+}
+
+/* Extract info for Tcp socket info provided via netlink. */
+static size_t tcp_illinois_info(struct sock *sk, u32 ext, int *attr,
+ union tcp_cc_info *info)
+{
+ const struct illinois *ca = inet_csk_ca(sk);
+
+ if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
+ info->vegas.tcpv_enabled = 1;
+ info->vegas.tcpv_rttcnt = ca->cnt_rtt;
+ info->vegas.tcpv_minrtt = ca->base_rtt;
+ info->vegas.tcpv_rtt = 0;
+
+ if (info->vegas.tcpv_rttcnt > 0) {
+ u64 t = ca->sum_rtt;
+
+ do_div(t, info->vegas.tcpv_rttcnt);
+ info->vegas.tcpv_rtt = t;
+ }
+ *attr = INET_DIAG_VEGASINFO;
+ return sizeof(struct tcpvegas_info);
+ }
+ return 0;
+}
+
+static struct tcp_congestion_ops tcp_illinois __read_mostly = {
+ .init = tcp_illinois_init,
+ .ssthresh = tcp_illinois_ssthresh,
+ .undo_cwnd = tcp_reno_undo_cwnd,
+ .cong_avoid = tcp_illinois_cong_avoid,
+ .set_state = tcp_illinois_state,
+ .get_info = tcp_illinois_info,
+ .pkts_acked = tcp_illinois_acked,
+
+ .owner = THIS_MODULE,
+ .name = "illinois",
+};
+
+static int __init tcp_illinois_register(void)
+{
+ BUILD_BUG_ON(sizeof(struct illinois) > ICSK_CA_PRIV_SIZE);
+ return tcp_register_congestion_control(&tcp_illinois);
+}
+
+static void __exit tcp_illinois_unregister(void)
+{
+ tcp_unregister_congestion_control(&tcp_illinois);
+}
+
+module_init(tcp_illinois_register);
+module_exit(tcp_illinois_unregister);
+
+MODULE_AUTHOR("Stephen Hemminger, Shao Liu");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("TCP Illinois");
+MODULE_VERSION("1.0");
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 9304034c0c47..198f8a0d37be 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -5,8 +6,6 @@
*
* Implementation of the Transmission Control Protocol(TCP).
*
- * Version: $Id: tcp_input.c,v 1.243 2002/02/01 22:01:04 davem Exp $
- *
* Authors: Ross Biro
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
* Mark Evans, <evansmp@uhura.aston.ac.uk>
@@ -50,9 +49,9 @@
* Andi Kleen: Make sure we never ack data there is not
* enough room for. Also make this condition
* a fatal error if it might still happen.
- * Andi Kleen: Add tcp_measure_rcv_mss to make
+ * Andi Kleen: Add tcp_measure_rcv_mss to make
* connections with MSS<min(MTU,ann. MSS)
- * work without delayed acks.
+ * work without delayed acks.
* Andi Kleen: Process packets with PSH set in the
* fast path.
* J Hadi Salim: ECN support
@@ -63,33 +62,29 @@
* Pasi Sarolahti: F-RTO for dealing with spurious RTOs
*/
+#define pr_fmt(fmt) "TCP: " fmt
+
#include <linux/mm.h>
+#include <linux/slab.h>
#include <linux/module.h>
#include <linux/sysctl.h>
+#include <linux/kernel.h>
+#include <linux/prefetch.h>
+#include <linux/bitops.h>
+#include <net/dst.h>
#include <net/tcp.h>
+#include <net/tcp_ecn.h>
+#include <net/proto_memory.h>
#include <net/inet_common.h>
#include <linux/ipsec.h>
-#include <asm/unaligned.h>
-#include <net/netdma.h>
-
-int sysctl_tcp_timestamps __read_mostly = 1;
-int sysctl_tcp_window_scaling __read_mostly = 1;
-int sysctl_tcp_sack __read_mostly = 1;
-int sysctl_tcp_fack __read_mostly = 1;
-int sysctl_tcp_reordering __read_mostly = TCP_FASTRETRANS_THRESH;
-int sysctl_tcp_ecn __read_mostly;
-int sysctl_tcp_dsack __read_mostly = 1;
-int sysctl_tcp_app_win __read_mostly = 31;
-int sysctl_tcp_adv_win_scale __read_mostly = 2;
-
-int sysctl_tcp_stdurg __read_mostly;
-int sysctl_tcp_rfc1337 __read_mostly;
-int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
-int sysctl_tcp_frto __read_mostly;
-int sysctl_tcp_nometrics_save __read_mostly;
+#include <linux/unaligned.h>
+#include <linux/errqueue.h>
+#include <trace/events/tcp.h>
+#include <linux/jump_label_ratelimit.h>
+#include <net/busy_poll.h>
+#include <net/mptcp.h>
-int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
-int sysctl_tcp_abc __read_mostly;
+int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
#define FLAG_DATA 0x01 /* Incoming frame contained data. */
#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
@@ -98,53 +93,204 @@ int sysctl_tcp_abc __read_mostly;
#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */
#define FLAG_DATA_SACKED 0x20 /* New SACK. */
#define FLAG_ECE 0x40 /* ECE in this ACK */
-#define FLAG_DATA_LOST 0x80 /* SACK detected data lossage. */
+#define FLAG_LOST_RETRANS 0x80 /* This ACK marks some retransmission lost */
#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/
+#define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */
+#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
+#define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */
+#define FLAG_SET_XMIT_TIMER 0x1000 /* Set TLP or RTO timer */
+#define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */
+#define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */
+#define FLAG_NO_CHALLENGE_ACK 0x8000 /* do not call tcp_send_challenge_ack() */
+#define FLAG_ACK_MAYBE_DELAYED 0x10000 /* Likely a delayed ACK */
+#define FLAG_DSACK_TLP 0x20000 /* DSACK for tail loss probe */
+#define FLAG_TS_PROGRESS 0x40000 /* Positive timestamp delta */
#define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED)
#define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
-#define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE)
+#define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE|FLAG_DSACKING_ACK)
#define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED)
-#define IsReno(tp) ((tp)->rx_opt.sack_ok == 0)
-#define IsFack(tp) ((tp)->rx_opt.sack_ok & 2)
-#define IsDSack(tp) ((tp)->rx_opt.sack_ok & 4)
-
#define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)
+#define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH))
+
+#define REXMIT_NONE 0 /* no loss recovery to do */
+#define REXMIT_LOST 1 /* retransmit packets marked lost */
+#define REXMIT_NEW 2 /* FRTO-style transmit of unsent/new packets */
+
+#if IS_ENABLED(CONFIG_TLS_DEVICE)
+static DEFINE_STATIC_KEY_DEFERRED_FALSE(clean_acked_data_enabled, HZ);
+
+void clean_acked_data_enable(struct tcp_sock *tp,
+ void (*cad)(struct sock *sk, u32 ack_seq))
+{
+ tp->tcp_clean_acked = cad;
+ static_branch_deferred_inc(&clean_acked_data_enabled);
+}
+EXPORT_SYMBOL_GPL(clean_acked_data_enable);
+
+void clean_acked_data_disable(struct tcp_sock *tp)
+{
+ static_branch_slow_dec_deferred(&clean_acked_data_enabled);
+ tp->tcp_clean_acked = NULL;
+}
+EXPORT_SYMBOL_GPL(clean_acked_data_disable);
+
+void clean_acked_data_flush(void)
+{
+ static_key_deferred_flush(&clean_acked_data_enabled);
+}
+EXPORT_SYMBOL_GPL(clean_acked_data_flush);
+#endif
+
+#ifdef CONFIG_CGROUP_BPF
+static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb)
+{
+ bool unknown_opt = tcp_sk(sk)->rx_opt.saw_unknown &&
+ BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
+ BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG);
+ bool parse_all_opt = BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
+ BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG);
+ struct bpf_sock_ops_kern sock_ops;
+
+ if (likely(!unknown_opt && !parse_all_opt))
+ return;
+
+ /* The skb will be handled in the
+ * bpf_skops_established() or
+ * bpf_skops_write_hdr_opt().
+ */
+ switch (sk->sk_state) {
+ case TCP_SYN_RECV:
+ case TCP_SYN_SENT:
+ case TCP_LISTEN:
+ return;
+ }
+
+ sock_owned_by_me(sk);
+
+ memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
+ sock_ops.op = BPF_SOCK_OPS_PARSE_HDR_OPT_CB;
+ sock_ops.is_fullsock = 1;
+ sock_ops.is_locked_tcp_sock = 1;
+ sock_ops.sk = sk;
+ bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb));
+
+ BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops);
+}
+
+static void bpf_skops_established(struct sock *sk, int bpf_op,
+ struct sk_buff *skb)
+{
+ struct bpf_sock_ops_kern sock_ops;
-/* Adapt the MSS value used to make delayed ack decision to the
+ sock_owned_by_me(sk);
+
+ memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
+ sock_ops.op = bpf_op;
+ sock_ops.is_fullsock = 1;
+ sock_ops.is_locked_tcp_sock = 1;
+ sock_ops.sk = sk;
+ /* sk with TCP_REPAIR_ON does not have skb in tcp_finish_connect */
+ if (skb)
+ bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb));
+
+ BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops);
+}
+#else
+static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb)
+{
+}
+
+static void bpf_skops_established(struct sock *sk, int bpf_op,
+ struct sk_buff *skb)
+{
+}
+#endif
+
+static __cold void tcp_gro_dev_warn(const struct sock *sk, const struct sk_buff *skb,
+ unsigned int len)
+{
+ struct net_device *dev;
+
+ rcu_read_lock();
+ dev = dev_get_by_index_rcu(sock_net(sk), skb->skb_iif);
+ if (!dev || len >= READ_ONCE(dev->mtu))
+ pr_warn("%s: Driver has suspect GRO implementation, TCP performance may be compromised.\n",
+ dev ? dev->name : "Unknown driver");
+ rcu_read_unlock();
+}
+
+/* Adapt the MSS value used to make delayed ack decision to the
* real world.
- */
-static void tcp_measure_rcv_mss(struct sock *sk,
- const struct sk_buff *skb)
+ */
+static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
{
struct inet_connection_sock *icsk = inet_csk(sk);
- const unsigned int lss = icsk->icsk_ack.last_seg_size;
+ const unsigned int lss = icsk->icsk_ack.last_seg_size;
unsigned int len;
- icsk->icsk_ack.last_seg_size = 0;
+ icsk->icsk_ack.last_seg_size = 0;
/* skb->len may jitter because of SACKs, even if peer
* sends good full-sized frames.
*/
- len = skb_shinfo(skb)->gso_size ?: skb->len;
+ len = skb_shinfo(skb)->gso_size ? : skb->len;
if (len >= icsk->icsk_ack.rcv_mss) {
- icsk->icsk_ack.rcv_mss = len;
+ /* Note: divides are still a bit expensive.
+ * For the moment, only adjust scaling_ratio
+ * when we update icsk_ack.rcv_mss.
+ */
+ if (unlikely(len != icsk->icsk_ack.rcv_mss)) {
+ u64 val = (u64)skb->len << TCP_RMEM_TO_WIN_SCALE;
+ u8 old_ratio = tcp_sk(sk)->scaling_ratio;
+
+ do_div(val, skb->truesize);
+ tcp_sk(sk)->scaling_ratio = val ? val : 1;
+
+ if (old_ratio != tcp_sk(sk)->scaling_ratio) {
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ val = tcp_win_from_space(sk, sk->sk_rcvbuf);
+ tcp_set_window_clamp(sk, val);
+
+ if (tp->window_clamp < tp->rcvq_space.space)
+ tp->rcvq_space.space = tp->window_clamp;
+ }
+ }
+ icsk->icsk_ack.rcv_mss = min_t(unsigned int, len,
+ tcp_sk(sk)->advmss);
+ /* Account for possibly-removed options */
+ DO_ONCE_LITE_IF(len > icsk->icsk_ack.rcv_mss + MAX_TCP_OPTION_SPACE,
+ tcp_gro_dev_warn, sk, skb, len);
+ /* If the skb has a len of exactly 1*MSS and has the PSH bit
+ * set then it is likely the end of an application write. So
+ * more data may not be arriving soon, and yet the data sender
+ * may be waiting for an ACK if cwnd-bound or using TX zero
+ * copy. So we set ICSK_ACK_PUSHED here so that
+ * tcp_cleanup_rbuf() will send an ACK immediately if the app
+ * reads all of the data and is not ping-pong. If len > MSS
+ * then this logic does not matter (and does not hurt) because
+ * tcp_cleanup_rbuf() will always ACK immediately if the app
+ * reads data and there is more than an MSS of unACKed data.
+ */
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_PSH)
+ icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
} else {
/* Otherwise, we make more careful check taking into account,
* that SACKs block is variable.
*
* "len" is invariant segment length, including TCP header.
*/
- len += skb->data - skb->h.raw;
- if (len >= TCP_MIN_RCVMSS + sizeof(struct tcphdr) ||
+ len += skb->data - skb_transport_header(skb);
+ if (len >= TCP_MSS_DEFAULT + sizeof(struct tcphdr) ||
/* If PSH is not set, packet should be
* full sized, provided peer TCP is not badly broken.
* This observation (if it is correct 8)) allows
* to handle super-low mtu links fairly.
*/
(len >= TCP_MIN_MSS + sizeof(struct tcphdr) &&
- !(tcp_flag_word(skb->h.th)&TCP_REMNANT))) {
+ !(tcp_flag_word(tcp_hdr(skb)) & TCP_REMNANT))) {
/* Subtract also invariant (if peer is RFC compliant),
* tcp header plus fixed timestamp option length.
* Resulting "len" is MSS free of SACK jitter.
@@ -162,22 +308,24 @@ static void tcp_measure_rcv_mss(struct sock *sk,
}
}
-static void tcp_incr_quickack(struct sock *sk)
+static void tcp_incr_quickack(struct sock *sk, unsigned int max_quickacks)
{
struct inet_connection_sock *icsk = inet_csk(sk);
- unsigned quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss);
+ unsigned int quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss);
- if (quickacks==0)
- quickacks=2;
+ if (quickacks == 0)
+ quickacks = 2;
+ quickacks = min(quickacks, max_quickacks);
if (quickacks > icsk->icsk_ack.quick)
- icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS);
+ icsk->icsk_ack.quick = quickacks;
}
-void tcp_enter_quickack_mode(struct sock *sk)
+static void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks)
{
struct inet_connection_sock *icsk = inet_csk(sk);
- tcp_incr_quickack(sk);
- icsk->icsk_ack.pingpong = 0;
+
+ tcp_incr_quickack(sk, max_quickacks);
+ inet_csk_exit_pingpong_mode(sk);
icsk->icsk_ack.ato = TCP_ATO_MIN;
}
@@ -185,10 +333,250 @@ void tcp_enter_quickack_mode(struct sock *sk)
* and the session is not interactive.
*/
-static inline int tcp_in_quickack_mode(const struct sock *sk)
+static bool tcp_in_quickack_mode(struct sock *sk)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
- return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong;
+
+ return icsk->icsk_ack.dst_quick_ack ||
+ (icsk->icsk_ack.quick && !inet_csk_in_pingpong_mode(sk));
+}
+
+static void tcp_data_ecn_check(struct sock *sk, const struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (tcp_ecn_disabled(tp))
+ return;
+
+ switch (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK) {
+ case INET_ECN_NOT_ECT:
+ /* Funny extension: if ECT is not set on a segment,
+ * and we already seen ECT on a previous segment,
+ * it is probably a retransmit.
+ */
+ if (tp->ecn_flags & TCP_ECN_SEEN)
+ tcp_enter_quickack_mode(sk, 2);
+ break;
+ case INET_ECN_CE:
+ if (tcp_ca_needs_ecn(sk))
+ tcp_ca_event(sk, CA_EVENT_ECN_IS_CE);
+
+ if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR) &&
+ tcp_ecn_mode_rfc3168(tp)) {
+ /* Better not delay acks, sender can have a very low cwnd */
+ tcp_enter_quickack_mode(sk, 2);
+ tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
+ }
+ /* As for RFC3168 ECN, the TCP_ECN_SEEN flag is set by
+ * tcp_data_ecn_check() when the ECN codepoint of
+ * received TCP data contains ECT(0), ECT(1), or CE.
+ */
+ if (!tcp_ecn_mode_rfc3168(tp))
+ break;
+ tp->ecn_flags |= TCP_ECN_SEEN;
+ break;
+ default:
+ if (tcp_ca_needs_ecn(sk))
+ tcp_ca_event(sk, CA_EVENT_ECN_NO_CE);
+ if (!tcp_ecn_mode_rfc3168(tp))
+ break;
+ tp->ecn_flags |= TCP_ECN_SEEN;
+ break;
+ }
+}
+
+/* Returns true if the byte counters can be used */
+static bool tcp_accecn_process_option(struct tcp_sock *tp,
+ const struct sk_buff *skb,
+ u32 delivered_bytes, int flag)
+{
+ u8 estimate_ecnfield = tp->est_ecnfield;
+ bool ambiguous_ecn_bytes_incr = false;
+ bool first_changed = false;
+ unsigned int optlen;
+ bool order1, res;
+ unsigned int i;
+ u8 *ptr;
+
+ if (tcp_accecn_opt_fail_recv(tp))
+ return false;
+
+ if (!(flag & FLAG_SLOWPATH) || !tp->rx_opt.accecn) {
+ if (!tp->saw_accecn_opt) {
+ /* Too late to enable after this point due to
+ * potential counter wraps
+ */
+ if (tp->bytes_sent >= (1 << 23) - 1) {
+ u8 saw_opt = TCP_ACCECN_OPT_FAIL_SEEN;
+
+ tcp_accecn_saw_opt_fail_recv(tp, saw_opt);
+ }
+ return false;
+ }
+
+ if (estimate_ecnfield) {
+ u8 ecnfield = estimate_ecnfield - 1;
+
+ tp->delivered_ecn_bytes[ecnfield] += delivered_bytes;
+ return true;
+ }
+ return false;
+ }
+
+ ptr = skb_transport_header(skb) + tp->rx_opt.accecn;
+ optlen = ptr[1] - 2;
+ if (WARN_ON_ONCE(ptr[0] != TCPOPT_ACCECN0 && ptr[0] != TCPOPT_ACCECN1))
+ return false;
+ order1 = (ptr[0] == TCPOPT_ACCECN1);
+ ptr += 2;
+
+ if (tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) {
+ tp->saw_accecn_opt = tcp_accecn_option_init(skb,
+ tp->rx_opt.accecn);
+ if (tp->saw_accecn_opt == TCP_ACCECN_OPT_FAIL_SEEN)
+ tcp_accecn_fail_mode_set(tp, TCP_ACCECN_OPT_FAIL_RECV);
+ }
+
+ res = !!estimate_ecnfield;
+ for (i = 0; i < 3; i++) {
+ u32 init_offset;
+ u8 ecnfield;
+ s32 delta;
+ u32 *cnt;
+
+ if (optlen < TCPOLEN_ACCECN_PERFIELD)
+ break;
+
+ ecnfield = tcp_accecn_optfield_to_ecnfield(i, order1);
+ init_offset = tcp_accecn_field_init_offset(ecnfield);
+ cnt = &tp->delivered_ecn_bytes[ecnfield - 1];
+ delta = tcp_update_ecn_bytes(cnt, ptr, init_offset);
+ if (delta && delta < 0) {
+ res = false;
+ ambiguous_ecn_bytes_incr = true;
+ }
+ if (delta && ecnfield != estimate_ecnfield) {
+ if (!first_changed) {
+ tp->est_ecnfield = ecnfield;
+ first_changed = true;
+ } else {
+ res = false;
+ ambiguous_ecn_bytes_incr = true;
+ }
+ }
+
+ optlen -= TCPOLEN_ACCECN_PERFIELD;
+ ptr += TCPOLEN_ACCECN_PERFIELD;
+ }
+ if (ambiguous_ecn_bytes_incr)
+ tp->est_ecnfield = 0;
+
+ return res;
+}
+
+static void tcp_count_delivered_ce(struct tcp_sock *tp, u32 ecn_count)
+{
+ tp->delivered_ce += ecn_count;
+}
+
+/* Updates the delivered and delivered_ce counts */
+static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered,
+ bool ece_ack)
+{
+ tp->delivered += delivered;
+ if (tcp_ecn_mode_rfc3168(tp) && ece_ack)
+ tcp_count_delivered_ce(tp, delivered);
+}
+
+/* Returns the ECN CE delta */
+static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb,
+ u32 delivered_pkts, u32 delivered_bytes,
+ int flag)
+{
+ u32 old_ceb = tcp_sk(sk)->delivered_ecn_bytes[INET_ECN_CE - 1];
+ const struct tcphdr *th = tcp_hdr(skb);
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 delta, safe_delta, d_ceb;
+ bool opt_deltas_valid;
+ u32 corrected_ace;
+
+ /* Reordered ACK or uncertain due to lack of data to send and ts */
+ if (!(flag & (FLAG_FORWARD_PROGRESS | FLAG_TS_PROGRESS)))
+ return 0;
+
+ opt_deltas_valid = tcp_accecn_process_option(tp, skb,
+ delivered_bytes, flag);
+
+ if (!(flag & FLAG_SLOWPATH)) {
+ /* AccECN counter might overflow on large ACKs */
+ if (delivered_pkts <= TCP_ACCECN_CEP_ACE_MASK)
+ return 0;
+ }
+
+ /* ACE field is not available during handshake */
+ if (flag & FLAG_SYN_ACKED)
+ return 0;
+
+ if (tp->received_ce_pending >= TCP_ACCECN_ACE_MAX_DELTA)
+ inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
+
+ corrected_ace = tcp_accecn_ace(th) - TCP_ACCECN_CEP_INIT_OFFSET;
+ delta = (corrected_ace - tp->delivered_ce) & TCP_ACCECN_CEP_ACE_MASK;
+ if (delivered_pkts <= TCP_ACCECN_CEP_ACE_MASK)
+ return delta;
+
+ safe_delta = delivered_pkts -
+ ((delivered_pkts - delta) & TCP_ACCECN_CEP_ACE_MASK);
+
+ if (opt_deltas_valid) {
+ d_ceb = tp->delivered_ecn_bytes[INET_ECN_CE - 1] - old_ceb;
+ if (!d_ceb)
+ return delta;
+
+ if ((delivered_pkts >= (TCP_ACCECN_CEP_ACE_MASK + 1) * 2) &&
+ (tcp_is_sack(tp) ||
+ ((1 << inet_csk(sk)->icsk_ca_state) &
+ (TCPF_CA_Open | TCPF_CA_CWR)))) {
+ u32 est_d_cep;
+
+ if (delivered_bytes <= d_ceb)
+ return safe_delta;
+
+ est_d_cep = DIV_ROUND_UP_ULL((u64)d_ceb *
+ delivered_pkts,
+ delivered_bytes);
+ return min(safe_delta,
+ delta +
+ (est_d_cep & ~TCP_ACCECN_CEP_ACE_MASK));
+ }
+
+ if (d_ceb > delta * tp->mss_cache)
+ return safe_delta;
+ if (d_ceb <
+ safe_delta * tp->mss_cache >> TCP_ACCECN_SAFETY_SHIFT)
+ return delta;
+ }
+
+ return safe_delta;
+}
+
+static u32 tcp_accecn_process(struct sock *sk, const struct sk_buff *skb,
+ u32 delivered_pkts, u32 delivered_bytes,
+ int *flag)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 delta;
+
+ delta = __tcp_accecn_process(sk, skb, delivered_pkts,
+ delivered_bytes, *flag);
+ if (delta > 0) {
+ tcp_count_delivered_ce(tp, delta);
+ *flag |= FLAG_ECE;
+ /* Recalculate header predictor */
+ if (tp->pred_flags)
+ tcp_fast_path_on(tp);
+ }
+ return delta;
}
/* Buffer size and advertised window tuning.
@@ -196,13 +584,36 @@ static inline int tcp_in_quickack_mode(const struct sock *sk)
* 1. Tuning sk->sk_sndbuf, when connection enters established state.
*/
-static void tcp_fixup_sndbuf(struct sock *sk)
+static void tcp_sndbuf_expand(struct sock *sk)
{
- int sndmem = tcp_sk(sk)->rx_opt.mss_clamp + MAX_TCP_HEADER + 16 +
- sizeof(struct sk_buff);
+ const struct tcp_sock *tp = tcp_sk(sk);
+ const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
+ int sndmem, per_mss;
+ u32 nr_segs;
- if (sk->sk_sndbuf < 3 * sndmem)
- sk->sk_sndbuf = min(3 * sndmem, sysctl_tcp_wmem[2]);
+ /* Worst case is non GSO/TSO : each frame consumes one skb
+ * and skb->head is kmalloced using power of two area of memory
+ */
+ per_mss = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) +
+ MAX_TCP_HEADER +
+ SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+
+ per_mss = roundup_pow_of_two(per_mss) +
+ SKB_DATA_ALIGN(sizeof(struct sk_buff));
+
+ nr_segs = max_t(u32, TCP_INIT_CWND, tcp_snd_cwnd(tp));
+ nr_segs = max_t(u32, nr_segs, tp->reordering + 1);
+
+ /* Fast Recovery (RFC 5681 3.2) :
+ * Cubic needs 1.7 factor, rounded to 2 to include
+ * extra cushion (application might react slowly to EPOLLOUT)
+ */
+ sndmem = ca_ops->sndbuf_expand ? ca_ops->sndbuf_expand(sk) : 2;
+ sndmem *= nr_segs * per_mss;
+
+ if (sk->sk_sndbuf < sndmem)
+ WRITE_ONCE(sk->sk_sndbuf,
+ min(sndmem, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[2])));
}
/* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
@@ -231,12 +642,13 @@ static void tcp_fixup_sndbuf(struct sock *sk)
*/
/* Slow part of check#2. */
-static int __tcp_grow_window(const struct sock *sk, struct tcp_sock *tp,
- const struct sk_buff *skb)
+static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb,
+ unsigned int skbtruesize)
{
+ const struct tcp_sock *tp = tcp_sk(sk);
/* Optimize this! */
- int truesize = tcp_win_from_space(skb->truesize)/2;
- int window = tcp_win_from_space(sysctl_tcp_rmem[2])/2;
+ int truesize = tcp_win_from_space(sk, skbtruesize) >> 1;
+ int window = tcp_win_from_space(sk, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])) >> 1;
while (tp->rcv_ssthresh <= window) {
if (truesize <= skb->len)
@@ -248,102 +660,124 @@ static int __tcp_grow_window(const struct sock *sk, struct tcp_sock *tp,
return 0;
}
-static void tcp_grow_window(struct sock *sk, struct tcp_sock *tp,
- struct sk_buff *skb)
+/* Even if skb appears to have a bad len/truesize ratio, TCP coalescing
+ * can play nice with us, as sk_buff and skb->head might be either
+ * freed or shared with up to MAX_SKB_FRAGS segments.
+ * Only give a boost to drivers using page frag(s) to hold the frame(s),
+ * and if no payload was pulled in skb->head before reaching us.
+ */
+static u32 truesize_adjust(bool adjust, const struct sk_buff *skb)
+{
+ u32 truesize = skb->truesize;
+
+ if (adjust && !skb_headlen(skb)) {
+ truesize -= SKB_TRUESIZE(skb_end_offset(skb));
+ /* paranoid check, some drivers might be buggy */
+ if (unlikely((int)truesize < (int)skb->len))
+ truesize = skb->truesize;
+ }
+ return truesize;
+}
+
+static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb,
+ bool adjust)
{
+ struct tcp_sock *tp = tcp_sk(sk);
+ int room;
+
+ room = min_t(int, tp->window_clamp, tcp_space(sk)) - tp->rcv_ssthresh;
+
+ if (room <= 0)
+ return;
+
/* Check #1 */
- if (tp->rcv_ssthresh < tp->window_clamp &&
- (int)tp->rcv_ssthresh < tcp_space(sk) &&
- !tcp_memory_pressure) {
+ if (!tcp_under_memory_pressure(sk)) {
+ unsigned int truesize = truesize_adjust(adjust, skb);
int incr;
/* Check #2. Increase window, if skb with such overhead
* will fit to rcvbuf in future.
*/
- if (tcp_win_from_space(skb->truesize) <= skb->len)
- incr = 2*tp->advmss;
+ if (tcp_win_from_space(sk, truesize) <= skb->len)
+ incr = 2 * tp->advmss;
else
- incr = __tcp_grow_window(sk, tp, skb);
+ incr = __tcp_grow_window(sk, skb, truesize);
if (incr) {
- tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr, tp->window_clamp);
+ incr = max_t(int, incr, 2 * skb->len);
+ tp->rcv_ssthresh += min(room, incr);
inet_csk(sk)->icsk_ack.quick |= 1;
}
+ } else {
+ /* Under pressure:
+ * Adjust rcv_ssthresh according to reserved mem
+ */
+ tcp_adjust_rcv_ssthresh(sk);
}
}
-/* 3. Tuning rcvbuf, when connection enters established state. */
-
-static void tcp_fixup_rcvbuf(struct sock *sk)
-{
- struct tcp_sock *tp = tcp_sk(sk);
- int rcvmem = tp->advmss + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff);
-
- /* Try to select rcvbuf so that 4 mss-sized segments
- * will fit to window and corresponding skbs will fit to our rcvbuf.
- * (was 3; 4 is minimum to allow fast retransmit to work.)
- */
- while (tcp_win_from_space(rcvmem) < tp->advmss)
- rcvmem += 128;
- if (sk->sk_rcvbuf < 4 * rcvmem)
- sk->sk_rcvbuf = min(4 * rcvmem, sysctl_tcp_rmem[2]);
-}
-
-/* 4. Try to fixup all. It is made immediately after connection enters
+/* 3. Try to fixup all. It is made immediately after connection enters
* established state.
*/
static void tcp_init_buffer_space(struct sock *sk)
{
+ int tcp_app_win = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_app_win);
struct tcp_sock *tp = tcp_sk(sk);
int maxwin;
- if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK))
- tcp_fixup_rcvbuf(sk);
if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK))
- tcp_fixup_sndbuf(sk);
+ tcp_sndbuf_expand(sk);
- tp->rcvq_space.space = tp->rcv_wnd;
+ tcp_mstamp_refresh(tp);
+ tp->rcvq_space.time = tp->tcp_mstamp;
+ tp->rcvq_space.seq = tp->copied_seq;
maxwin = tcp_full_space(sk);
if (tp->window_clamp >= maxwin) {
- tp->window_clamp = maxwin;
+ WRITE_ONCE(tp->window_clamp, maxwin);
- if (sysctl_tcp_app_win && maxwin > 4 * tp->advmss)
- tp->window_clamp = max(maxwin -
- (maxwin >> sysctl_tcp_app_win),
- 4 * tp->advmss);
+ if (tcp_app_win && maxwin > 4 * tp->advmss)
+ WRITE_ONCE(tp->window_clamp,
+ max(maxwin - (maxwin >> tcp_app_win),
+ 4 * tp->advmss));
}
/* Force reservation of one segment. */
- if (sysctl_tcp_app_win &&
+ if (tcp_app_win &&
tp->window_clamp > 2 * tp->advmss &&
tp->window_clamp + tp->advmss > maxwin)
- tp->window_clamp = max(2 * tp->advmss, maxwin - tp->advmss);
+ WRITE_ONCE(tp->window_clamp,
+ max(2 * tp->advmss, maxwin - tp->advmss));
tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp);
- tp->snd_cwnd_stamp = tcp_time_stamp;
+ tp->snd_cwnd_stamp = tcp_jiffies32;
+ tp->rcvq_space.space = min3(tp->rcv_ssthresh, tp->rcv_wnd,
+ (u32)TCP_INIT_CWND * tp->advmss);
}
-/* 5. Recalculate window clamp after socket hit its memory bounds. */
-static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp)
+/* 4. Recalculate window clamp after socket hit its memory bounds. */
+static void tcp_clamp_window(struct sock *sk)
{
+ struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
+ struct net *net = sock_net(sk);
+ int rmem2;
icsk->icsk_ack.quick = 0;
+ rmem2 = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]);
- if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
+ if (sk->sk_rcvbuf < rmem2 &&
!(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
- !tcp_memory_pressure &&
- atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
- sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
- sysctl_tcp_rmem[2]);
+ !tcp_under_memory_pressure(sk) &&
+ sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) {
+ WRITE_ONCE(sk->sk_rcvbuf,
+ min(atomic_read(&sk->sk_rmem_alloc), rmem2));
}
if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
- tp->rcv_ssthresh = min(tp->window_clamp, 2U*tp->advmss);
+ tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss);
}
-
/* Initialize RCV_MSS value.
* RCV_MSS is an our guess about MSS used by the peer.
* We haven't any direct information about the MSS.
@@ -353,36 +787,36 @@ static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp)
*/
void tcp_initialize_rcv_mss(struct sock *sk)
{
- struct tcp_sock *tp = tcp_sk(sk);
+ const struct tcp_sock *tp = tcp_sk(sk);
unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache);
- hint = min(hint, tp->rcv_wnd/2);
- hint = min(hint, TCP_MIN_RCVMSS);
+ hint = min(hint, tp->rcv_wnd / 2);
+ hint = min(hint, TCP_MSS_DEFAULT);
hint = max(hint, TCP_MIN_MSS);
inet_csk(sk)->icsk_ack.rcv_mss = hint;
}
+EXPORT_IPV6_MOD(tcp_initialize_rcv_mss);
/* Receiver "autotuning" code.
*
* The algorithm for RTT estimation w/o timestamps is based on
* Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL.
- * <http://www.lanl.gov/radiant/website/pubs/drs/lacsi2001.ps>
+ * <https://public.lanl.gov/radiant/pubs.html#DRS>
*
* More detail on this code can be found at
- * <http://www.psc.edu/~jheffner/senior_thesis.ps>,
+ * <http://staff.psc.edu/jheffner/>,
* though this reference is out of date. A new paper
* is pending.
*/
static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
{
- u32 new_sample = tp->rcv_rtt_est.rtt;
- long m = sample;
+ u32 new_sample, old_sample = tp->rcv_rtt_est.rtt_us;
+ long m = sample << 3;
- if (m == 0)
- m = 1;
-
- if (new_sample != 0) {
+ if (old_sample == 0 || m < old_sample) {
+ new_sample = m;
+ } else {
/* If we sample in larger samples in the non-timestamp
* case, we could grossly overestimate the RTT especially
* with chatty applications or bulk transfer apps which
@@ -393,44 +827,115 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
* else with timestamps disabled convergence takes too
* long.
*/
- if (!win_dep) {
- m -= (new_sample >> 3);
- new_sample += m;
- } else if (m < new_sample)
- new_sample = m << 3;
- } else {
- /* No previous measure. */
- new_sample = m << 3;
+ if (win_dep)
+ return;
+ /* Do not use this sample if receive queue is not empty. */
+ if (tp->rcv_nxt != tp->copied_seq)
+ return;
+ new_sample = old_sample - (old_sample >> 3) + sample;
}
- if (tp->rcv_rtt_est.rtt != new_sample)
- tp->rcv_rtt_est.rtt = new_sample;
+ tp->rcv_rtt_est.rtt_us = new_sample;
}
static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp)
{
+ u32 delta_us;
+
if (tp->rcv_rtt_est.time == 0)
goto new_measure;
if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq))
return;
- tcp_rcv_rtt_update(tp,
- jiffies - tp->rcv_rtt_est.time,
- 1);
+ delta_us = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcv_rtt_est.time);
+ if (!delta_us)
+ delta_us = 1;
+ tcp_rcv_rtt_update(tp, delta_us, 1);
new_measure:
tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd;
- tp->rcv_rtt_est.time = tcp_time_stamp;
+ tp->rcv_rtt_est.time = tp->tcp_mstamp;
+}
+
+static s32 tcp_rtt_tsopt_us(const struct tcp_sock *tp, u32 min_delta)
+{
+ u32 delta, delta_us;
+
+ delta = tcp_time_stamp_ts(tp) - tp->rx_opt.rcv_tsecr;
+ if (tp->tcp_usec_ts)
+ return delta;
+
+ if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) {
+ if (!delta)
+ delta = min_delta;
+ delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ);
+ return delta_us;
+ }
+ return -1;
}
-static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, const struct sk_buff *skb)
+static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
+ const struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
- if (tp->rx_opt.rcv_tsecr &&
- (TCP_SKB_CB(skb)->end_seq -
- TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss))
- tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rx_opt.rcv_tsecr, 0);
+
+ if (tp->rx_opt.rcv_tsecr == tp->rcv_rtt_last_tsecr)
+ return;
+ tp->rcv_rtt_last_tsecr = tp->rx_opt.rcv_tsecr;
+
+ if (TCP_SKB_CB(skb)->end_seq -
+ TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss) {
+ s32 delta = tcp_rtt_tsopt_us(tp, 0);
+
+ if (delta > 0)
+ tcp_rcv_rtt_update(tp, delta, 0);
+ }
}
+void tcp_rcvbuf_grow(struct sock *sk, u32 newval)
+{
+ const struct net *net = sock_net(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 rcvwin, rcvbuf, cap, oldval;
+ u32 rtt_threshold, rtt_us;
+ u64 grow;
+
+ oldval = tp->rcvq_space.space;
+ tp->rcvq_space.space = newval;
+
+ if (!READ_ONCE(net->ipv4.sysctl_tcp_moderate_rcvbuf) ||
+ (sk->sk_userlocks & SOCK_RCVBUF_LOCK))
+ return;
+
+ /* DRS is always one RTT late. */
+ rcvwin = newval << 1;
+
+ rtt_us = tp->rcv_rtt_est.rtt_us >> 3;
+ rtt_threshold = READ_ONCE(net->ipv4.sysctl_tcp_rcvbuf_low_rtt);
+ if (rtt_us < rtt_threshold) {
+ /* For small RTT, we set @grow to rcvwin * rtt_us/rtt_threshold.
+ * It might take few additional ms to reach 'line rate',
+ * but will avoid sk_rcvbuf inflation and poor cache use.
+ */
+ grow = div_u64((u64)rcvwin * rtt_us, rtt_threshold);
+ } else {
+ /* slow start: allow the sender to double its rate. */
+ grow = div_u64(((u64)rcvwin << 1) * (newval - oldval), oldval);
+ }
+ rcvwin += grow;
+
+ if (!RB_EMPTY_ROOT(&tp->out_of_order_queue))
+ rcvwin += TCP_SKB_CB(tp->ooo_last_skb)->end_seq - tp->rcv_nxt;
+
+ cap = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]);
+
+ rcvbuf = min_t(u32, tcp_space_from_win(sk, rcvwin), cap);
+ if (rcvbuf > sk->sk_rcvbuf) {
+ WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
+ /* Make the window clamp follow along. */
+ WRITE_ONCE(tp->window_clamp,
+ tcp_win_from_space(sk, rcvbuf));
+ }
+}
/*
* This function should be called every time data is copied to user space.
* It calculates the appropriate TCP receive buffer space.
@@ -438,55 +943,46 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, const struct sk_buff
void tcp_rcv_space_adjust(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
- int time;
- int space;
-
- if (tp->rcvq_space.time == 0)
- goto new_measure;
-
- time = tcp_time_stamp - tp->rcvq_space.time;
- if (time < (tp->rcv_rtt_est.rtt >> 3) ||
- tp->rcv_rtt_est.rtt == 0)
+ int time, inq, copied;
+
+ trace_tcp_rcv_space_adjust(sk);
+
+ if (unlikely(!tp->rcv_rtt_est.rtt_us))
return;
-
- space = 2 * (tp->copied_seq - tp->rcvq_space.seq);
- space = max(tp->rcvq_space.space, space);
+ /* We do not refresh tp->tcp_mstamp here.
+ * Some platforms have expensive ktime_get() implementations.
+ * Using the last cached value is enough for DRS.
+ */
+ time = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcvq_space.time);
+ if (time < (tp->rcv_rtt_est.rtt_us >> 3))
+ return;
- if (tp->rcvq_space.space != space) {
- int rcvmem;
+ /* Number of bytes copied to user in last RTT */
+ copied = tp->copied_seq - tp->rcvq_space.seq;
+ /* Number of bytes in receive queue. */
+ inq = tp->rcv_nxt - tp->copied_seq;
+ copied -= inq;
+ if (copied <= tp->rcvq_space.space)
+ goto new_measure;
- tp->rcvq_space.space = space;
+ trace_tcp_rcvbuf_grow(sk, time);
- if (sysctl_tcp_moderate_rcvbuf &&
- !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
- int new_clamp = space;
+ tcp_rcvbuf_grow(sk, copied);
- /* Receive space grows, normalize in order to
- * take into account packet headers and sk_buff
- * structure overhead.
- */
- space /= tp->advmss;
- if (!space)
- space = 1;
- rcvmem = (tp->advmss + MAX_TCP_HEADER +
- 16 + sizeof(struct sk_buff));
- while (tcp_win_from_space(rcvmem) < tp->advmss)
- rcvmem += 128;
- space *= rcvmem;
- space = min(space, sysctl_tcp_rmem[2]);
- if (space > sk->sk_rcvbuf) {
- sk->sk_rcvbuf = space;
-
- /* Make the window clamp follow along. */
- tp->window_clamp = new_clamp;
- }
- }
- }
-
new_measure:
tp->rcvq_space.seq = tp->copied_seq;
- tp->rcvq_space.time = tcp_time_stamp;
+ tp->rcvq_space.time = tp->tcp_mstamp;
+}
+
+static void tcp_save_lrcv_flowlabel(struct sock *sk, const struct sk_buff *skb)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ if (skb->protocol == htons(ETH_P_IPV6))
+ icsk->icsk_ack.lrcv_flowlabel = ntohl(ip6_flowlabel(ipv6_hdr(skb)));
+#endif
}
/* There is something which you must keep in mind when you analyze the
@@ -499,8 +995,9 @@ new_measure:
* each ACK we send, he increments snd_cwnd and transmits more of his
* queue. -DaveM
*/
-static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb)
+static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
{
+ struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
u32 now;
@@ -509,19 +1006,19 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_
tcp_measure_rcv_mss(sk, skb);
tcp_rcv_rtt_measure(tp);
-
- now = tcp_time_stamp;
+
+ now = tcp_jiffies32;
if (!icsk->icsk_ack.ato) {
/* The _first_ data packet received, initialize
* delayed ACK engine.
*/
- tcp_incr_quickack(sk);
+ tcp_incr_quickack(sk, TCP_MAX_QUICKACKS);
icsk->icsk_ack.ato = TCP_ATO_MIN;
} else {
int m = now - icsk->icsk_ack.lrcvtime;
- if (m <= TCP_ATO_MIN/2) {
+ if (m <= TCP_ATO_MIN / 2) {
/* The fastest case is the first. */
icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + TCP_ATO_MIN / 2;
} else if (m < icsk->icsk_ack.ato) {
@@ -532,16 +1029,16 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_
/* Too long gap. Apparently sender failed to
* restart window, so that we send ACKs quickly.
*/
- tcp_incr_quickack(sk);
- sk_stream_mem_reclaim(sk);
+ tcp_incr_quickack(sk, TCP_MAX_QUICKACKS);
}
}
icsk->icsk_ack.lrcvtime = now;
+ tcp_save_lrcv_flowlabel(sk, skb);
- TCP_ECN_check_ce(tp, skb);
+ tcp_data_ecn_check(sk, skb);
if (skb->len >= 128)
- tcp_grow_window(sk, tp, skb);
+ tcp_grow_window(sk, skb, true);
}
/* Called to compute a smoothed rtt estimate. The data fed to this
@@ -553,15 +1050,16 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_
* To save cycles in the RFC 1323 implementation it was better to break
* it up into three procedures. -- erics
*/
-static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
+static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)
{
struct tcp_sock *tp = tcp_sk(sk);
- long m = mrtt; /* RTT */
+ long m = mrtt_us; /* RTT */
+ u32 srtt = tp->srtt_us;
/* The following amusing code comes from Jacobson's
* article in SIGCOMM '88. Note that rtt and mdev
* are scaled versions of rtt and mean deviation.
- * This is designed to be as fast as possible
+ * This is designed to be as fast as possible
* m stands for "measurement".
*
* On a 1990 paper the rto value is changed to:
@@ -574,14 +1072,12 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
* does not matter how to _calculate_ it. Seems, it was trap
* that VJ failed to avoid. 8)
*/
- if(m == 0)
- m = 1;
- if (tp->srtt != 0) {
- m -= (tp->srtt >> 3); /* m is now error in rtt est */
- tp->srtt += m; /* rtt = 7/8 rtt + 1/8 new */
+ if (srtt != 0) {
+ m -= (srtt >> 3); /* m is now error in rtt est */
+ srtt += m; /* rtt = 7/8 rtt + 1/8 new */
if (m < 0) {
m = -m; /* m is now abs(error) */
- m -= (tp->mdev >> 2); /* similar update on mdev */
+ m -= (tp->mdev_us >> 2); /* similar update on mdev */
/* This is similar to one of Eifel findings.
* Eifel blocks mdev updates when rtt decreases.
* This solution is a bit different: we use finer gain
@@ -593,33 +1089,73 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
if (m > 0)
m >>= 3;
} else {
- m -= (tp->mdev >> 2); /* similar update on mdev */
+ m -= (tp->mdev_us >> 2); /* similar update on mdev */
}
- tp->mdev += m; /* mdev = 3/4 mdev + 1/4 new */
- if (tp->mdev > tp->mdev_max) {
- tp->mdev_max = tp->mdev;
- if (tp->mdev_max > tp->rttvar)
- tp->rttvar = tp->mdev_max;
+ tp->mdev_us += m; /* mdev = 3/4 mdev + 1/4 new */
+ if (tp->mdev_us > tp->mdev_max_us) {
+ tp->mdev_max_us = tp->mdev_us;
+ if (tp->mdev_max_us > tp->rttvar_us)
+ tp->rttvar_us = tp->mdev_max_us;
}
if (after(tp->snd_una, tp->rtt_seq)) {
- if (tp->mdev_max < tp->rttvar)
- tp->rttvar -= (tp->rttvar-tp->mdev_max)>>2;
+ if (tp->mdev_max_us < tp->rttvar_us)
+ tp->rttvar_us -= (tp->rttvar_us - tp->mdev_max_us) >> 2;
tp->rtt_seq = tp->snd_nxt;
- tp->mdev_max = TCP_RTO_MIN;
+ tp->mdev_max_us = tcp_rto_min_us(sk);
+
+ tcp_bpf_rtt(sk, mrtt_us, srtt);
}
} else {
/* no previous measure. */
- tp->srtt = m<<3; /* take the measured time to be rtt */
- tp->mdev = m<<1; /* make sure rto = 3*rtt */
- tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN);
+ srtt = m << 3; /* take the measured time to be rtt */
+ tp->mdev_us = m << 1; /* make sure rto = 3*rtt */
+ tp->rttvar_us = max(tp->mdev_us, tcp_rto_min_us(sk));
+ tp->mdev_max_us = tp->rttvar_us;
tp->rtt_seq = tp->snd_nxt;
+
+ tcp_bpf_rtt(sk, mrtt_us, srtt);
}
+ tp->srtt_us = max(1U, srtt);
+}
+
+void tcp_update_pacing_rate(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ u64 rate;
+
+ /* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */
+ rate = (u64)tp->mss_cache * ((USEC_PER_SEC / 100) << 3);
+
+ /* current rate is (cwnd * mss) / srtt
+ * In Slow Start [1], set sk_pacing_rate to 200 % the current rate.
+ * In Congestion Avoidance phase, set it to 120 % the current rate.
+ *
+ * [1] : Normal Slow Start condition is (tp->snd_cwnd < tp->snd_ssthresh)
+ * If snd_cwnd >= (tp->snd_ssthresh / 2), we are approaching
+ * end of slow start and should slow down.
+ */
+ if (tcp_snd_cwnd(tp) < tp->snd_ssthresh / 2)
+ rate *= READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_pacing_ss_ratio);
+ else
+ rate *= READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_pacing_ca_ratio);
+
+ rate *= max(tcp_snd_cwnd(tp), tp->packets_out);
+
+ if (likely(tp->srtt_us))
+ do_div(rate, tp->srtt_us);
+
+ /* WRITE_ONCE() is needed because sch_fq fetches sk_pacing_rate
+ * without any lock. We want to make sure compiler wont store
+ * intermediate values in this location.
+ */
+ WRITE_ONCE(sk->sk_pacing_rate,
+ min_t(u64, rate, READ_ONCE(sk->sk_max_pacing_rate)));
}
/* Calculate rto without backoff. This is the second half of Van Jacobson's
* routine referred to above.
*/
-static inline void tcp_set_rto(struct sock *sk)
+void tcp_set_rto(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
/* Old crap is replaced with new one. 8)
@@ -632,252 +1168,168 @@ static inline void tcp_set_rto(struct sock *sk)
* is invisible. Actually, Linux-2.4 also generates erratic
* ACKs in some circumstances.
*/
- inet_csk(sk)->icsk_rto = (tp->srtt >> 3) + tp->rttvar;
+ inet_csk(sk)->icsk_rto = __tcp_set_rto(tp);
/* 2. Fixups made earlier cannot be right.
* If we do not estimate RTO correctly without them,
* all the algo is pure shit and should be replaced
* with correct one. It is exactly, which we pretend to do.
*/
+
+ /* NOTE: clamping at TCP_RTO_MIN is not required, current algo
+ * guarantees that rto is higher.
+ */
+ tcp_bound_rto(sk);
}
-/* NOTE: clamping at TCP_RTO_MIN is not required, current algo
- * guarantees that rto is higher.
- */
-static inline void tcp_bound_rto(struct sock *sk)
+__u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst)
{
- if (inet_csk(sk)->icsk_rto > TCP_RTO_MAX)
- inet_csk(sk)->icsk_rto = TCP_RTO_MAX;
+ __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
+
+ if (!cwnd)
+ cwnd = TCP_INIT_CWND;
+ return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
}
-/* Save metrics learned by this TCP session.
- This function is called only, when TCP finishes successfully
- i.e. when it enters TIME-WAIT or goes from LAST-ACK to CLOSE.
+struct tcp_sacktag_state {
+ /* Timestamps for earliest and latest never-retransmitted segment
+ * that was SACKed. RTO needs the earliest RTT to stay conservative,
+ * but congestion control should still get an accurate delay signal.
+ */
+ u64 first_sackt;
+ u64 last_sackt;
+ u32 reord;
+ u32 sack_delivered;
+ u32 delivered_bytes;
+ int flag;
+ unsigned int mss_now;
+ struct rate_sample *rate;
+};
+
+/* Take a notice that peer is sending D-SACKs. Skip update of data delivery
+ * and spurious retransmission information if this DSACK is unlikely caused by
+ * sender's action:
+ * - DSACKed sequence range is larger than maximum receiver's window.
+ * - Total no. of DSACKed segments exceed the total no. of retransmitted segs.
*/
-void tcp_update_metrics(struct sock *sk)
+static u32 tcp_dsack_seen(struct tcp_sock *tp, u32 start_seq,
+ u32 end_seq, struct tcp_sacktag_state *state)
{
- struct tcp_sock *tp = tcp_sk(sk);
- struct dst_entry *dst = __sk_dst_get(sk);
-
- if (sysctl_tcp_nometrics_save)
- return;
-
- dst_confirm(dst);
-
- if (dst && (dst->flags&DST_HOST)) {
- const struct inet_connection_sock *icsk = inet_csk(sk);
- int m;
+ u32 seq_len, dup_segs = 1;
- if (icsk->icsk_backoff || !tp->srtt) {
- /* This session failed to estimate rtt. Why?
- * Probably, no packets returned in time.
- * Reset our results.
- */
- if (!(dst_metric_locked(dst, RTAX_RTT)))
- dst->metrics[RTAX_RTT-1] = 0;
- return;
- }
+ if (!before(start_seq, end_seq))
+ return 0;
- m = dst_metric(dst, RTAX_RTT) - tp->srtt;
+ seq_len = end_seq - start_seq;
+ /* Dubious DSACK: DSACKed range greater than maximum advertised rwnd */
+ if (seq_len > tp->max_window)
+ return 0;
+ if (seq_len > tp->mss_cache)
+ dup_segs = DIV_ROUND_UP(seq_len, tp->mss_cache);
+ else if (tp->tlp_high_seq && tp->tlp_high_seq == end_seq)
+ state->flag |= FLAG_DSACK_TLP;
+
+ tp->dsack_dups += dup_segs;
+ /* Skip the DSACK if dup segs weren't retransmitted by sender */
+ if (tp->dsack_dups > tp->total_retrans)
+ return 0;
- /* If newly calculated rtt larger than stored one,
- * store new one. Otherwise, use EWMA. Remember,
- * rtt overestimation is always better than underestimation.
- */
- if (!(dst_metric_locked(dst, RTAX_RTT))) {
- if (m <= 0)
- dst->metrics[RTAX_RTT-1] = tp->srtt;
- else
- dst->metrics[RTAX_RTT-1] -= (m>>3);
- }
+ tp->rx_opt.sack_ok |= TCP_DSACK_SEEN;
+ /* We increase the RACK ordering window in rounds where we receive
+ * DSACKs that may have been due to reordering causing RACK to trigger
+ * a spurious fast recovery. Thus RACK ignores DSACKs that happen
+ * without having seen reordering, or that match TLP probes (TLP
+ * is timer-driven, not triggered by RACK).
+ */
+ if (tp->reord_seen && !(state->flag & FLAG_DSACK_TLP))
+ tp->rack.dsack_seen = 1;
- if (!(dst_metric_locked(dst, RTAX_RTTVAR))) {
- if (m < 0)
- m = -m;
+ state->flag |= FLAG_DSACKING_ACK;
+ /* A spurious retransmission is delivered */
+ state->sack_delivered += dup_segs;
- /* Scale deviation to rttvar fixed point */
- m >>= 1;
- if (m < tp->mdev)
- m = tp->mdev;
+ return dup_segs;
+}
- if (m >= dst_metric(dst, RTAX_RTTVAR))
- dst->metrics[RTAX_RTTVAR-1] = m;
- else
- dst->metrics[RTAX_RTTVAR-1] -=
- (dst->metrics[RTAX_RTTVAR-1] - m)>>2;
- }
+/* It's reordering when higher sequence was delivered (i.e. sacked) before
+ * some lower never-retransmitted sequence ("low_seq"). The maximum reordering
+ * distance is approximated in full-mss packet distance ("reordering").
+ */
+static void tcp_check_sack_reordering(struct sock *sk, const u32 low_seq,
+ const int ts)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ const u32 mss = tp->mss_cache;
+ u32 fack, metric;
- if (tp->snd_ssthresh >= 0xFFFF) {
- /* Slow start still did not finish. */
- if (dst_metric(dst, RTAX_SSTHRESH) &&
- !dst_metric_locked(dst, RTAX_SSTHRESH) &&
- (tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH))
- dst->metrics[RTAX_SSTHRESH-1] = tp->snd_cwnd >> 1;
- if (!dst_metric_locked(dst, RTAX_CWND) &&
- tp->snd_cwnd > dst_metric(dst, RTAX_CWND))
- dst->metrics[RTAX_CWND-1] = tp->snd_cwnd;
- } else if (tp->snd_cwnd > tp->snd_ssthresh &&
- icsk->icsk_ca_state == TCP_CA_Open) {
- /* Cong. avoidance phase, cwnd is reliable. */
- if (!dst_metric_locked(dst, RTAX_SSTHRESH))
- dst->metrics[RTAX_SSTHRESH-1] =
- max(tp->snd_cwnd >> 1, tp->snd_ssthresh);
- if (!dst_metric_locked(dst, RTAX_CWND))
- dst->metrics[RTAX_CWND-1] = (dst->metrics[RTAX_CWND-1] + tp->snd_cwnd) >> 1;
- } else {
- /* Else slow start did not finish, cwnd is non-sense,
- ssthresh may be also invalid.
- */
- if (!dst_metric_locked(dst, RTAX_CWND))
- dst->metrics[RTAX_CWND-1] = (dst->metrics[RTAX_CWND-1] + tp->snd_ssthresh) >> 1;
- if (dst->metrics[RTAX_SSTHRESH-1] &&
- !dst_metric_locked(dst, RTAX_SSTHRESH) &&
- tp->snd_ssthresh > dst->metrics[RTAX_SSTHRESH-1])
- dst->metrics[RTAX_SSTHRESH-1] = tp->snd_ssthresh;
- }
+ fack = tcp_highest_sack_seq(tp);
+ if (!before(low_seq, fack))
+ return;
- if (!dst_metric_locked(dst, RTAX_REORDERING)) {
- if (dst->metrics[RTAX_REORDERING-1] < tp->reordering &&
- tp->reordering != sysctl_tcp_reordering)
- dst->metrics[RTAX_REORDERING-1] = tp->reordering;
- }
+ metric = fack - low_seq;
+ if ((metric > tp->reordering * mss) && mss) {
+#if FASTRETRANS_DEBUG > 1
+ pr_debug("Disorder%d %d %u f%u s%u rr%d\n",
+ tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state,
+ tp->reordering,
+ 0,
+ tp->sacked_out,
+ tp->undo_marker ? tp->undo_retrans : 0);
+#endif
+ tp->reordering = min_t(u32, (metric + mss - 1) / mss,
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_max_reordering));
}
+
+ /* This exciting event is worth to be remembered. 8) */
+ tp->reord_seen++;
+ NET_INC_STATS(sock_net(sk),
+ ts ? LINUX_MIB_TCPTSREORDER : LINUX_MIB_TCPSACKREORDER);
}
-/* Numbers are taken from RFC2414. */
-__u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
+ /* This must be called before lost_out or retrans_out are updated
+ * on a new loss, because we want to know if all skbs previously
+ * known to be lost have already been retransmitted, indicating
+ * that this newly lost skb is our next skb to retransmit.
+ */
+static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
{
- __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
-
- if (!cwnd) {
- if (tp->mss_cache > 1460)
- cwnd = 2;
- else
- cwnd = (tp->mss_cache > 1095) ? 3 : 4;
- }
- return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
+ if ((!tp->retransmit_skb_hint && tp->retrans_out >= tp->lost_out) ||
+ (tp->retransmit_skb_hint &&
+ before(TCP_SKB_CB(skb)->seq,
+ TCP_SKB_CB(tp->retransmit_skb_hint)->seq)))
+ tp->retransmit_skb_hint = skb;
}
-/* Set slow start threshold and cwnd not falling to slow start */
-void tcp_enter_cwr(struct sock *sk)
+/* Sum the number of packets on the wire we have marked as lost, and
+ * notify the congestion control module that the given skb was marked lost.
+ */
+static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb)
{
- struct tcp_sock *tp = tcp_sk(sk);
-
- tp->prior_ssthresh = 0;
- tp->bytes_acked = 0;
- if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
- tp->undo_marker = 0;
- tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
- tp->snd_cwnd = min(tp->snd_cwnd,
- tcp_packets_in_flight(tp) + 1U);
- tp->snd_cwnd_cnt = 0;
- tp->high_seq = tp->snd_nxt;
- tp->snd_cwnd_stamp = tcp_time_stamp;
- TCP_ECN_queue_cwr(tp);
-
- tcp_set_ca_state(sk, TCP_CA_CWR);
- }
+ tp->lost += tcp_skb_pcount(skb);
}
-/* Initialize metrics on socket. */
-
-static void tcp_init_metrics(struct sock *sk)
+void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
{
+ __u8 sacked = TCP_SKB_CB(skb)->sacked;
struct tcp_sock *tp = tcp_sk(sk);
- struct dst_entry *dst = __sk_dst_get(sk);
-
- if (dst == NULL)
- goto reset;
-
- dst_confirm(dst);
-
- if (dst_metric_locked(dst, RTAX_CWND))
- tp->snd_cwnd_clamp = dst_metric(dst, RTAX_CWND);
- if (dst_metric(dst, RTAX_SSTHRESH)) {
- tp->snd_ssthresh = dst_metric(dst, RTAX_SSTHRESH);
- if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
- tp->snd_ssthresh = tp->snd_cwnd_clamp;
- }
- if (dst_metric(dst, RTAX_REORDERING) &&
- tp->reordering != dst_metric(dst, RTAX_REORDERING)) {
- tp->rx_opt.sack_ok &= ~2;
- tp->reordering = dst_metric(dst, RTAX_REORDERING);
- }
-
- if (dst_metric(dst, RTAX_RTT) == 0)
- goto reset;
-
- if (!tp->srtt && dst_metric(dst, RTAX_RTT) < (TCP_TIMEOUT_INIT << 3))
- goto reset;
-
- /* Initial rtt is determined from SYN,SYN-ACK.
- * The segment is small and rtt may appear much
- * less than real one. Use per-dst memory
- * to make it more realistic.
- *
- * A bit of theory. RTT is time passed after "normal" sized packet
- * is sent until it is ACKed. In normal circumstances sending small
- * packets force peer to delay ACKs and calculation is correct too.
- * The algorithm is adaptive and, provided we follow specs, it
- * NEVER underestimate RTT. BUT! If peer tries to make some clever
- * tricks sort of "quick acks" for time long enough to decrease RTT
- * to low value, and then abruptly stops to do it and starts to delay
- * ACKs, wait for troubles.
- */
- if (dst_metric(dst, RTAX_RTT) > tp->srtt) {
- tp->srtt = dst_metric(dst, RTAX_RTT);
- tp->rtt_seq = tp->snd_nxt;
- }
- if (dst_metric(dst, RTAX_RTTVAR) > tp->mdev) {
- tp->mdev = dst_metric(dst, RTAX_RTTVAR);
- tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN);
- }
- tcp_set_rto(sk);
- tcp_bound_rto(sk);
- if (inet_csk(sk)->icsk_rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp)
- goto reset;
- tp->snd_cwnd = tcp_init_cwnd(tp, dst);
- tp->snd_cwnd_stamp = tcp_time_stamp;
- return;
-reset:
- /* Play conservative. If timestamps are not
- * supported, TCP will fail to recalculate correct
- * rtt, if initial rto is too small. FORGET ALL AND RESET!
- */
- if (!tp->rx_opt.saw_tstamp && tp->srtt) {
- tp->srtt = 0;
- tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_INIT;
- inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
- }
-}
+ if (sacked & TCPCB_SACKED_ACKED)
+ return;
-static void tcp_update_reordering(struct sock *sk, const int metric,
- const int ts)
-{
- struct tcp_sock *tp = tcp_sk(sk);
- if (metric > tp->reordering) {
- tp->reordering = min(TCP_MAX_REORDERING, metric);
-
- /* This exciting event is worth to be remembered. 8) */
- if (ts)
- NET_INC_STATS_BH(LINUX_MIB_TCPTSREORDER);
- else if (IsReno(tp))
- NET_INC_STATS_BH(LINUX_MIB_TCPRENOREORDER);
- else if (IsFack(tp))
- NET_INC_STATS_BH(LINUX_MIB_TCPFACKREORDER);
- else
- NET_INC_STATS_BH(LINUX_MIB_TCPSACKREORDER);
-#if FASTRETRANS_DEBUG > 1
- printk(KERN_DEBUG "Disorder%d %d %u f%u s%u rr%d\n",
- tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state,
- tp->reordering,
- tp->fackets_out,
- tp->sacked_out,
- tp->undo_marker ? tp->undo_retrans : 0);
-#endif
- /* Disable FACK yet. */
- tp->rx_opt.sack_ok &= ~2;
+ tcp_verify_retransmit_hint(tp, skb);
+ if (sacked & TCPCB_LOST) {
+ if (sacked & TCPCB_SACKED_RETRANS) {
+ /* Account for retransmits that are lost again */
+ TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
+ tp->retrans_out -= tcp_skb_pcount(skb);
+ NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT,
+ tcp_skb_pcount(skb));
+ tcp_notify_skb_loss_event(tp, skb);
+ }
+ } else {
+ tp->lost_out += tcp_skb_pcount(skb);
+ TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
+ tcp_notify_skb_loss_event(tp, skb);
}
}
@@ -896,19 +1348,16 @@ static void tcp_update_reordering(struct sock *sk, const int metric,
* L|R 1 - orig is lost, retransmit is in flight.
* S|R 1 - orig reached receiver, retrans is still in flight.
* (L|S|R is logically valid, it could occur when L|R is sacked,
- * but it is equivalent to plain S and code short-curcuits it to S.
+ * but it is equivalent to plain S and code short-circuits it to S.
* L|S is logically invalid, it would mean -1 packet in flight 8))
*
* These 6 states form finite state machine, controlled by the following events:
* 1. New ACK (+SACK) arrives. (tcp_sacktag_write_queue())
* 2. Retransmission. (tcp_retransmit_skb(), tcp_xmit_retransmit_queue())
- * 3. Loss detection event of one of three flavors:
+ * 3. Loss detection event of two flavors:
* A. Scoreboard estimator decided the packet is lost.
* A'. Reno "three dupacks" marks head of queue lost.
- * A''. Its FACK modfication, head until snd.fack is lost.
- * B. SACK arrives sacking data transmitted after never retransmitted
- * hole was sent out.
- * C. SACK arrives sacking SND.NXT at the moment, when the
+ * B. SACK arrives sacking SND.NXT at the moment, when the
* segment was retransmitted.
* 4. D-SACK added new rule: D-SACK changes any tag to S.
*
@@ -928,495 +1377,1048 @@ static void tcp_update_reordering(struct sock *sk, const int metric,
* for retransmitted and already SACKed segment -> reordering..
* Both of these heuristics are not used in Loss state, when we cannot
* account for retransmits accurately.
+ *
+ * SACK block validation.
+ * ----------------------
+ *
+ * SACK block range validation checks that the received SACK block fits to
+ * the expected sequence limits, i.e., it is between SND.UNA and SND.NXT.
+ * Note that SND.UNA is not included to the range though being valid because
+ * it means that the receiver is rather inconsistent with itself reporting
+ * SACK reneging when it should advance SND.UNA. Such SACK block this is
+ * perfectly valid, however, in light of RFC2018 which explicitly states
+ * that "SACK block MUST reflect the newest segment. Even if the newest
+ * segment is going to be discarded ...", not that it looks very clever
+ * in case of head skb. Due to potentional receiver driven attacks, we
+ * choose to avoid immediate execution of a walk in write queue due to
+ * reneging and defer head skb's loss recovery to standard loss recovery
+ * procedure that will eventually trigger (nothing forbids us doing this).
+ *
+ * Implements also blockage to start_seq wrap-around. Problem lies in the
+ * fact that though start_seq (s) is before end_seq (i.e., not reversed),
+ * there's no guarantee that it will be before snd_nxt (n). The problem
+ * happens when start_seq resides between end_seq wrap (e_w) and snd_nxt
+ * wrap (s_w):
+ *
+ * <- outs wnd -> <- wrapzone ->
+ * u e n u_w e_w s n_w
+ * | | | | | | |
+ * |<------------+------+----- TCP seqno space --------------+---------->|
+ * ...-- <2^31 ->| |<--------...
+ * ...---- >2^31 ------>| |<--------...
+ *
+ * Current code wouldn't be vulnerable but it's better still to discard such
+ * crazy SACK blocks. Doing this check for start_seq alone closes somewhat
+ * similar case (end_seq after snd_nxt wrap) as earlier reversed check in
+ * snd_nxt wrap -> snd_una region will then become "well defined", i.e.,
+ * equal to the ideal case (infinite seqno space without wrap caused issues).
+ *
+ * With D-SACK the lower bound is extended to cover sequence space below
+ * SND.UNA down to undo_marker, which is the last point of interest. Yet
+ * again, D-SACK block must not to go across snd_una (for the same reason as
+ * for the normal SACK blocks, explained above). But there all simplicity
+ * ends, TCP might receive valid D-SACKs below that. As long as they reside
+ * fully below undo_marker they do not affect behavior in anyway and can
+ * therefore be safely ignored. In rare cases (which are more or less
+ * theoretical ones), the D-SACK will nicely cross that boundary due to skb
+ * fragmentation and packet reordering past skb's retransmission. To consider
+ * them correctly, the acceptable range must be extended even more though
+ * the exact amount is rather hard to quantify. However, tp->max_window can
+ * be used as an exaggerated estimate.
*/
-static int
-tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_una)
+static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack,
+ u32 start_seq, u32 end_seq)
+{
+ /* Too far in future, or reversed (interpretation is ambiguous) */
+ if (after(end_seq, tp->snd_nxt) || !before(start_seq, end_seq))
+ return false;
+
+ /* Nasty start_seq wrap-around check (see comments above) */
+ if (!before(start_seq, tp->snd_nxt))
+ return false;
+
+ /* In outstanding window? ...This is valid exit for D-SACKs too.
+ * start_seq == snd_una is non-sensical (see comments above)
+ */
+ if (after(start_seq, tp->snd_una))
+ return true;
+
+ if (!is_dsack || !tp->undo_marker)
+ return false;
+
+ /* ...Then it's D-SACK, and must reside below snd_una completely */
+ if (after(end_seq, tp->snd_una))
+ return false;
+
+ if (!before(start_seq, tp->undo_marker))
+ return true;
+
+ /* Too old */
+ if (!after(end_seq, tp->undo_marker))
+ return false;
+
+ /* Undo_marker boundary crossing (overestimates a lot). Known already:
+ * start_seq < undo_marker and end_seq >= undo_marker.
+ */
+ return !before(start_seq, end_seq - tp->max_window);
+}
+
+static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
+ struct tcp_sack_block_wire *sp, int num_sacks,
+ u32 prior_snd_una, struct tcp_sacktag_state *state)
{
- const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
- unsigned char *ptr = ack_skb->h.raw + TCP_SKB_CB(ack_skb)->sacked;
- struct tcp_sack_block_wire *sp = (struct tcp_sack_block_wire *)(ptr+2);
- int num_sacks = (ptr[1] - TCPOLEN_SACK_BASE)>>3;
- int reord = tp->packets_out;
- int prior_fackets;
- u32 lost_retrans = 0;
- int flag = 0;
- int dup_sack = 0;
- int i;
+ u32 start_seq_0 = get_unaligned_be32(&sp[0].start_seq);
+ u32 end_seq_0 = get_unaligned_be32(&sp[0].end_seq);
+ u32 dup_segs;
+
+ if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECV);
+ } else if (num_sacks > 1) {
+ u32 end_seq_1 = get_unaligned_be32(&sp[1].end_seq);
+ u32 start_seq_1 = get_unaligned_be32(&sp[1].start_seq);
+
+ if (after(end_seq_0, end_seq_1) || before(start_seq_0, start_seq_1))
+ return false;
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKOFORECV);
+ } else {
+ return false;
+ }
- if (!tp->sacked_out)
- tp->fackets_out = 0;
- prior_fackets = tp->fackets_out;
-
- /* SACK fastpath:
- * if the only SACK change is the increase of the end_seq of
- * the first block then only apply that SACK block
- * and use retrans queue hinting otherwise slowpath */
- flag = 1;
- for (i = 0; i< num_sacks; i++) {
- __u32 start_seq = ntohl(sp[i].start_seq);
- __u32 end_seq = ntohl(sp[i].end_seq);
-
- if (i == 0){
- if (tp->recv_sack_cache[i].start_seq != start_seq)
- flag = 0;
+ dup_segs = tcp_dsack_seen(tp, start_seq_0, end_seq_0, state);
+ if (!dup_segs) { /* Skip dubious DSACK */
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKIGNOREDDUBIOUS);
+ return false;
+ }
+
+ NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECVSEGS, dup_segs);
+
+ /* D-SACK for already forgotten data... Do dumb counting. */
+ if (tp->undo_marker && tp->undo_retrans > 0 &&
+ !after(end_seq_0, prior_snd_una) &&
+ after(end_seq_0, tp->undo_marker))
+ tp->undo_retrans = max_t(int, 0, tp->undo_retrans - dup_segs);
+
+ return true;
+}
+
+/* Check if skb is fully within the SACK block. In presence of GSO skbs,
+ * the incoming SACK may not exactly match but we can find smaller MSS
+ * aligned portion of it that matches. Therefore we might need to fragment
+ * which may fail and creates some hassle (caller must handle error case
+ * returns).
+ *
+ * FIXME: this could be merged to shift decision code
+ */
+static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
+ u32 start_seq, u32 end_seq)
+{
+ int err;
+ bool in_sack;
+ unsigned int pkt_len;
+ unsigned int mss;
+
+ in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
+ !before(end_seq, TCP_SKB_CB(skb)->end_seq);
+
+ if (tcp_skb_pcount(skb) > 1 && !in_sack &&
+ after(TCP_SKB_CB(skb)->end_seq, start_seq)) {
+ mss = tcp_skb_mss(skb);
+ in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq);
+
+ if (!in_sack) {
+ pkt_len = start_seq - TCP_SKB_CB(skb)->seq;
+ if (pkt_len < mss)
+ pkt_len = mss;
} else {
- if ((tp->recv_sack_cache[i].start_seq != start_seq) ||
- (tp->recv_sack_cache[i].end_seq != end_seq))
- flag = 0;
+ pkt_len = end_seq - TCP_SKB_CB(skb)->seq;
+ if (pkt_len < mss)
+ return -EINVAL;
+ }
+
+ /* Round if necessary so that SACKs cover only full MSSes
+ * and/or the remaining small portion (if present)
+ */
+ if (pkt_len > mss) {
+ unsigned int new_len = (pkt_len / mss) * mss;
+ if (!in_sack && new_len < pkt_len)
+ new_len += mss;
+ pkt_len = new_len;
}
- tp->recv_sack_cache[i].start_seq = start_seq;
- tp->recv_sack_cache[i].end_seq = end_seq;
-
- /* Check for D-SACK. */
- if (i == 0) {
- u32 ack = TCP_SKB_CB(ack_skb)->ack_seq;
-
- if (before(start_seq, ack)) {
- dup_sack = 1;
- tp->rx_opt.sack_ok |= 4;
- NET_INC_STATS_BH(LINUX_MIB_TCPDSACKRECV);
- } else if (num_sacks > 1 &&
- !after(end_seq, ntohl(sp[1].end_seq)) &&
- !before(start_seq, ntohl(sp[1].start_seq))) {
- dup_sack = 1;
- tp->rx_opt.sack_ok |= 4;
- NET_INC_STATS_BH(LINUX_MIB_TCPDSACKOFORECV);
+
+ if (pkt_len >= skb->len && !in_sack)
+ return 0;
+
+ err = tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
+ pkt_len, mss, GFP_ATOMIC);
+ if (err < 0)
+ return err;
+ }
+
+ return in_sack;
+}
+
+/* Mark the given newly-SACKed range as such, adjusting counters and hints. */
+static u8 tcp_sacktag_one(struct sock *sk,
+ struct tcp_sacktag_state *state, u8 sacked,
+ u32 start_seq, u32 end_seq,
+ int dup_sack, int pcount, u32 plen,
+ u64 xmit_time)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* Account D-SACK for retransmitted packet. */
+ if (dup_sack && (sacked & TCPCB_RETRANS)) {
+ if (tp->undo_marker && tp->undo_retrans > 0 &&
+ after(end_seq, tp->undo_marker))
+ tp->undo_retrans = max_t(int, 0, tp->undo_retrans - pcount);
+ if ((sacked & TCPCB_SACKED_ACKED) &&
+ before(start_seq, state->reord))
+ state->reord = start_seq;
+ }
+
+ /* Nothing to do; acked frame is about to be dropped (was ACKed). */
+ if (!after(end_seq, tp->snd_una))
+ return sacked;
+
+ if (!(sacked & TCPCB_SACKED_ACKED)) {
+ tcp_rack_advance(tp, sacked, end_seq, xmit_time);
+
+ if (sacked & TCPCB_SACKED_RETRANS) {
+ /* If the segment is not tagged as lost,
+ * we do not clear RETRANS, believing
+ * that retransmission is still in flight.
+ */
+ if (sacked & TCPCB_LOST) {
+ sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS);
+ tp->lost_out -= pcount;
+ tp->retrans_out -= pcount;
+ }
+ } else {
+ if (!(sacked & TCPCB_RETRANS)) {
+ /* New sack for not retransmitted frame,
+ * which was in hole. It is reordering.
+ */
+ if (before(start_seq,
+ tcp_highest_sack_seq(tp)) &&
+ before(start_seq, state->reord))
+ state->reord = start_seq;
+
+ if (!after(end_seq, tp->high_seq))
+ state->flag |= FLAG_ORIG_SACK_ACKED;
+ if (state->first_sackt == 0)
+ state->first_sackt = xmit_time;
+ state->last_sackt = xmit_time;
+ }
+
+ if (sacked & TCPCB_LOST) {
+ sacked &= ~TCPCB_LOST;
+ tp->lost_out -= pcount;
}
+ }
+
+ sacked |= TCPCB_SACKED_ACKED;
+ state->flag |= FLAG_DATA_SACKED;
+ tp->sacked_out += pcount;
+ /* Out-of-order packets delivered */
+ state->sack_delivered += pcount;
+ state->delivered_bytes += plen;
+ }
+
+ /* D-SACK. We can detect redundant retransmission in S|R and plain R
+ * frames and clear it. undo_retrans is decreased above, L|R frames
+ * are accounted above as well.
+ */
+ if (dup_sack && (sacked & TCPCB_SACKED_RETRANS)) {
+ sacked &= ~TCPCB_SACKED_RETRANS;
+ tp->retrans_out -= pcount;
+ }
+
+ return sacked;
+}
+
+/* Shift newly-SACKed bytes from this skb to the immediately previous
+ * already-SACKed sk_buff. Mark the newly-SACKed bytes as such.
+ */
+static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev,
+ struct sk_buff *skb,
+ struct tcp_sacktag_state *state,
+ unsigned int pcount, int shifted, int mss,
+ bool dup_sack)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 start_seq = TCP_SKB_CB(skb)->seq; /* start of newly-SACKed */
+ u32 end_seq = start_seq + shifted; /* end of newly-SACKed */
+
+ BUG_ON(!pcount);
+
+ /* Adjust counters and hints for the newly sacked sequence
+ * range but discard the return value since prev is already
+ * marked. We must tag the range first because the seq
+ * advancement below implicitly advances
+ * tcp_highest_sack_seq() when skb is highest_sack.
+ */
+ tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
+ start_seq, end_seq, dup_sack, pcount, skb->len,
+ tcp_skb_timestamp_us(skb));
+ tcp_rate_skb_delivered(sk, skb, state->rate);
+
+ TCP_SKB_CB(prev)->end_seq += shifted;
+ TCP_SKB_CB(skb)->seq += shifted;
+
+ tcp_skb_pcount_add(prev, pcount);
+ WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount);
+ tcp_skb_pcount_add(skb, -pcount);
+
+ /* When we're adding to gso_segs == 1, gso_size will be zero,
+ * in theory this shouldn't be necessary but as long as DSACK
+ * code can come after this skb later on it's better to keep
+ * setting gso_size to something.
+ */
+ if (!TCP_SKB_CB(prev)->tcp_gso_size)
+ TCP_SKB_CB(prev)->tcp_gso_size = mss;
+
+ /* CHECKME: To clear or not to clear? Mimics normal skb currently */
+ if (tcp_skb_pcount(skb) <= 1)
+ TCP_SKB_CB(skb)->tcp_gso_size = 0;
+
+ /* Difference in this won't matter, both ACKed by the same cumul. ACK */
+ TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS);
+
+ if (skb->len > 0) {
+ BUG_ON(!tcp_skb_pcount(skb));
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKSHIFTED);
+ return false;
+ }
- /* D-SACK for already forgotten data...
- * Do dumb counting. */
- if (dup_sack &&
- !after(end_seq, prior_snd_una) &&
- after(end_seq, tp->undo_marker))
- tp->undo_retrans--;
+ /* Whole SKB was eaten :-) */
- /* Eliminate too old ACKs, but take into
- * account more or less fresh ones, they can
- * contain valid SACK info.
+ if (skb == tp->retransmit_skb_hint)
+ tp->retransmit_skb_hint = prev;
+
+ TCP_SKB_CB(prev)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
+ TCP_SKB_CB(prev)->eor = TCP_SKB_CB(skb)->eor;
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
+ TCP_SKB_CB(prev)->end_seq++;
+
+ if (skb == tcp_highest_sack(sk))
+ tcp_advance_highest_sack(sk, skb);
+
+ tcp_skb_collapse_tstamp(prev, skb);
+ if (unlikely(TCP_SKB_CB(prev)->tx.delivered_mstamp))
+ TCP_SKB_CB(prev)->tx.delivered_mstamp = 0;
+
+ tcp_rtx_queue_unlink_and_free(skb, sk);
+
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKMERGED);
+
+ return true;
+}
+
+/* I wish gso_size would have a bit more sane initialization than
+ * something-or-zero which complicates things
+ */
+static int tcp_skb_seglen(const struct sk_buff *skb)
+{
+ return tcp_skb_pcount(skb) == 1 ? skb->len : tcp_skb_mss(skb);
+}
+
+/* Shifting pages past head area doesn't work */
+static int skb_can_shift(const struct sk_buff *skb)
+{
+ return !skb_headlen(skb) && skb_is_nonlinear(skb);
+}
+
+int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from,
+ int pcount, int shiftlen)
+{
+ /* TCP min gso_size is 8 bytes (TCP_MIN_GSO_SIZE)
+ * Since TCP_SKB_CB(skb)->tcp_gso_segs is 16 bits, we need
+ * to make sure not storing more than 65535 * 8 bytes per skb,
+ * even if current MSS is bigger.
+ */
+ if (unlikely(to->len + shiftlen >= 65535 * TCP_MIN_GSO_SIZE))
+ return 0;
+ if (unlikely(tcp_skb_pcount(to) + pcount > 65535))
+ return 0;
+ return skb_shift(to, from, shiftlen);
+}
+
+/* Try collapsing SACK blocks spanning across multiple skbs to a single
+ * skb.
+ */
+static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
+ struct tcp_sacktag_state *state,
+ u32 start_seq, u32 end_seq,
+ bool dup_sack)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *prev;
+ int mss;
+ int pcount = 0;
+ int len;
+ int in_sack;
+
+ /* Normally R but no L won't result in plain S */
+ if (!dup_sack &&
+ (TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_RETRANS)) == TCPCB_SACKED_RETRANS)
+ goto fallback;
+ if (!skb_can_shift(skb))
+ goto fallback;
+ /* This frame is about to be dropped (was ACKed). */
+ if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
+ goto fallback;
+
+ /* Can only happen with delayed DSACK + discard craziness */
+ prev = skb_rb_prev(skb);
+ if (!prev)
+ goto fallback;
+
+ if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
+ goto fallback;
+
+ if (!tcp_skb_can_collapse(prev, skb))
+ goto fallback;
+
+ in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
+ !before(end_seq, TCP_SKB_CB(skb)->end_seq);
+
+ if (in_sack) {
+ len = skb->len;
+ pcount = tcp_skb_pcount(skb);
+ mss = tcp_skb_seglen(skb);
+
+ /* TODO: Fix DSACKs to not fragment already SACKed and we can
+ * drop this restriction as unnecessary
+ */
+ if (mss != tcp_skb_seglen(prev))
+ goto fallback;
+ } else {
+ if (!after(TCP_SKB_CB(skb)->end_seq, start_seq))
+ goto noop;
+ /* CHECKME: This is non-MSS split case only?, this will
+ * cause skipped skbs due to advancing loop btw, original
+ * has that feature too
+ */
+ if (tcp_skb_pcount(skb) <= 1)
+ goto noop;
+
+ in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq);
+ if (!in_sack) {
+ /* TODO: head merge to next could be attempted here
+ * if (!after(TCP_SKB_CB(skb)->end_seq, end_seq)),
+ * though it might not be worth of the additional hassle
+ *
+ * ...we can probably just fallback to what was done
+ * previously. We could try merging non-SACKed ones
+ * as well but it probably isn't going to buy off
+ * because later SACKs might again split them, and
+ * it would make skb timestamp tracking considerably
+ * harder problem.
*/
- if (before(ack, prior_snd_una - tp->max_window))
- return 0;
+ goto fallback;
+ }
+
+ len = end_seq - TCP_SKB_CB(skb)->seq;
+ BUG_ON(len < 0);
+ BUG_ON(len > skb->len);
+
+ /* MSS boundaries should be honoured or else pcount will
+ * severely break even though it makes things bit trickier.
+ * Optimize common case to avoid most of the divides
+ */
+ mss = tcp_skb_mss(skb);
+
+ /* TODO: Fix DSACKs to not fragment already SACKed and we can
+ * drop this restriction as unnecessary
+ */
+ if (mss != tcp_skb_seglen(prev))
+ goto fallback;
+
+ if (len == mss) {
+ pcount = 1;
+ } else if (len < mss) {
+ goto noop;
+ } else {
+ pcount = len / mss;
+ len = pcount * mss;
}
}
- if (flag)
- num_sacks = 1;
- else {
- int j;
- tp->fastpath_skb_hint = NULL;
-
- /* order SACK blocks to allow in order walk of the retrans queue */
- for (i = num_sacks-1; i > 0; i--) {
- for (j = 0; j < i; j++){
- if (after(ntohl(sp[j].start_seq),
- ntohl(sp[j+1].start_seq))){
- sp[j].start_seq = htonl(tp->recv_sack_cache[j+1].start_seq);
- sp[j].end_seq = htonl(tp->recv_sack_cache[j+1].end_seq);
- sp[j+1].start_seq = htonl(tp->recv_sack_cache[j].start_seq);
- sp[j+1].end_seq = htonl(tp->recv_sack_cache[j].end_seq);
+ /* tcp_sacktag_one() won't SACK-tag ranges below snd_una */
+ if (!after(TCP_SKB_CB(skb)->seq + len, tp->snd_una))
+ goto fallback;
+
+ if (!tcp_skb_shift(prev, skb, pcount, len))
+ goto fallback;
+ if (!tcp_shifted_skb(sk, prev, skb, state, pcount, len, mss, dup_sack))
+ goto out;
+
+ /* Hole filled allows collapsing with the next as well, this is very
+ * useful when hole on every nth skb pattern happens
+ */
+ skb = skb_rb_next(prev);
+ if (!skb)
+ goto out;
+
+ if (!skb_can_shift(skb) ||
+ ((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) ||
+ (mss != tcp_skb_seglen(skb)))
+ goto out;
+
+ if (!tcp_skb_can_collapse(prev, skb))
+ goto out;
+ len = skb->len;
+ pcount = tcp_skb_pcount(skb);
+ if (tcp_skb_shift(prev, skb, pcount, len))
+ tcp_shifted_skb(sk, prev, skb, state, pcount,
+ len, mss, 0);
+
+out:
+ return prev;
+
+noop:
+ return skb;
+
+fallback:
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKSHIFTFALLBACK);
+ return NULL;
+}
+
+static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
+ struct tcp_sack_block *next_dup,
+ struct tcp_sacktag_state *state,
+ u32 start_seq, u32 end_seq,
+ bool dup_sack_in)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *tmp;
+
+ skb_rbtree_walk_from(skb) {
+ int in_sack = 0;
+ bool dup_sack = dup_sack_in;
+
+ /* queue is in-order => we can short-circuit the walk early */
+ if (!before(TCP_SKB_CB(skb)->seq, end_seq))
+ break;
+
+ if (next_dup &&
+ before(TCP_SKB_CB(skb)->seq, next_dup->end_seq)) {
+ in_sack = tcp_match_skb_to_sack(sk, skb,
+ next_dup->start_seq,
+ next_dup->end_seq);
+ if (in_sack > 0)
+ dup_sack = true;
+ }
+
+ /* skb reference here is a bit tricky to get right, since
+ * shifting can eat and free both this skb and the next,
+ * so not even _safe variant of the loop is enough.
+ */
+ if (in_sack <= 0) {
+ tmp = tcp_shift_skb_data(sk, skb, state,
+ start_seq, end_seq, dup_sack);
+ if (tmp) {
+ if (tmp != skb) {
+ skb = tmp;
+ continue;
}
+ in_sack = 0;
+ } else {
+ in_sack = tcp_match_skb_to_sack(sk, skb,
+ start_seq,
+ end_seq);
}
}
+
+ if (unlikely(in_sack < 0))
+ break;
+
+ if (in_sack) {
+ TCP_SKB_CB(skb)->sacked =
+ tcp_sacktag_one(sk,
+ state,
+ TCP_SKB_CB(skb)->sacked,
+ TCP_SKB_CB(skb)->seq,
+ TCP_SKB_CB(skb)->end_seq,
+ dup_sack,
+ tcp_skb_pcount(skb),
+ skb->len,
+ tcp_skb_timestamp_us(skb));
+ tcp_rate_skb_delivered(sk, skb, state->rate);
+ if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
+ list_del_init(&skb->tcp_tsorted_anchor);
+
+ if (!before(TCP_SKB_CB(skb)->seq,
+ tcp_highest_sack_seq(tp)))
+ tcp_advance_highest_sack(sk, skb);
+ }
}
+ return skb;
+}
- /* clear flag as used for different purpose in following code */
- flag = 0;
+static struct sk_buff *tcp_sacktag_bsearch(struct sock *sk, u32 seq)
+{
+ struct rb_node *parent, **p = &sk->tcp_rtx_queue.rb_node;
+ struct sk_buff *skb;
- for (i=0; i<num_sacks; i++, sp++) {
- struct sk_buff *skb;
- __u32 start_seq = ntohl(sp->start_seq);
- __u32 end_seq = ntohl(sp->end_seq);
- int fack_count;
-
- /* Use SACK fastpath hint if valid */
- if (tp->fastpath_skb_hint) {
- skb = tp->fastpath_skb_hint;
- fack_count = tp->fastpath_cnt_hint;
- } else {
- skb = sk->sk_write_queue.next;
- fack_count = 0;
+ while (*p) {
+ parent = *p;
+ skb = rb_to_skb(parent);
+ if (before(seq, TCP_SKB_CB(skb)->seq)) {
+ p = &parent->rb_left;
+ continue;
+ }
+ if (!before(seq, TCP_SKB_CB(skb)->end_seq)) {
+ p = &parent->rb_right;
+ continue;
}
+ return skb;
+ }
+ return NULL;
+}
+
+static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk,
+ u32 skip_to_seq)
+{
+ if (skb && after(TCP_SKB_CB(skb)->seq, skip_to_seq))
+ return skb;
- /* Event "B" in the comment above. */
- if (after(end_seq, tp->high_seq))
- flag |= FLAG_DATA_LOST;
+ return tcp_sacktag_bsearch(sk, skip_to_seq);
+}
- sk_stream_for_retrans_queue_from(skb, sk) {
- int in_sack, pcount;
- u8 sacked;
+static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb,
+ struct sock *sk,
+ struct tcp_sack_block *next_dup,
+ struct tcp_sacktag_state *state,
+ u32 skip_to_seq)
+{
+ if (!next_dup)
+ return skb;
+
+ if (before(next_dup->start_seq, skip_to_seq)) {
+ skb = tcp_sacktag_skip(skb, sk, next_dup->start_seq);
+ skb = tcp_sacktag_walk(skb, sk, NULL, state,
+ next_dup->start_seq, next_dup->end_seq,
+ 1);
+ }
- tp->fastpath_skb_hint = skb;
- tp->fastpath_cnt_hint = fack_count;
+ return skb;
+}
- /* The retransmission queue is always in order, so
- * we can short-circuit the walk early.
- */
- if (!before(TCP_SKB_CB(skb)->seq, end_seq))
- break;
+static int tcp_sack_cache_ok(const struct tcp_sock *tp, const struct tcp_sack_block *cache)
+{
+ return cache < tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache);
+}
- in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
- !before(end_seq, TCP_SKB_CB(skb)->end_seq);
+static int
+tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
+ u32 prior_snd_una, struct tcp_sacktag_state *state)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ const unsigned char *ptr = (skb_transport_header(ack_skb) +
+ TCP_SKB_CB(ack_skb)->sacked);
+ struct tcp_sack_block_wire *sp_wire = (struct tcp_sack_block_wire *)(ptr+2);
+ struct tcp_sack_block sp[TCP_NUM_SACKS];
+ struct tcp_sack_block *cache;
+ struct sk_buff *skb;
+ int num_sacks = min(TCP_NUM_SACKS, (ptr[1] - TCPOLEN_SACK_BASE) >> 3);
+ int used_sacks;
+ bool found_dup_sack = false;
+ int i, j;
+ int first_sack_index;
- pcount = tcp_skb_pcount(skb);
+ state->flag = 0;
+ state->reord = tp->snd_nxt;
- if (pcount > 1 && !in_sack &&
- after(TCP_SKB_CB(skb)->end_seq, start_seq)) {
- unsigned int pkt_len;
+ if (!tp->sacked_out)
+ tcp_highest_sack_reset(sk);
- in_sack = !after(start_seq,
- TCP_SKB_CB(skb)->seq);
+ found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire,
+ num_sacks, prior_snd_una, state);
- if (!in_sack)
- pkt_len = (start_seq -
- TCP_SKB_CB(skb)->seq);
- else
- pkt_len = (end_seq -
- TCP_SKB_CB(skb)->seq);
- if (tcp_fragment(sk, skb, pkt_len, skb_shinfo(skb)->gso_size))
- break;
- pcount = tcp_skb_pcount(skb);
- }
+ /* Eliminate too old ACKs, but take into
+ * account more or less fresh ones, they can
+ * contain valid SACK info.
+ */
+ if (before(TCP_SKB_CB(ack_skb)->ack_seq, prior_snd_una - tp->max_window))
+ return 0;
- fack_count += pcount;
-
- sacked = TCP_SKB_CB(skb)->sacked;
-
- /* Account D-SACK for retransmitted packet. */
- if ((dup_sack && in_sack) &&
- (sacked & TCPCB_RETRANS) &&
- after(TCP_SKB_CB(skb)->end_seq, tp->undo_marker))
- tp->undo_retrans--;
-
- /* The frame is ACKed. */
- if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) {
- if (sacked&TCPCB_RETRANS) {
- if ((dup_sack && in_sack) &&
- (sacked&TCPCB_SACKED_ACKED))
- reord = min(fack_count, reord);
- } else {
- /* If it was in a hole, we detected reordering. */
- if (fack_count < prior_fackets &&
- !(sacked&TCPCB_SACKED_ACKED))
- reord = min(fack_count, reord);
- }
+ if (!tp->packets_out)
+ goto out;
- /* Nothing to do; acked frame is about to be dropped. */
- continue;
- }
+ used_sacks = 0;
+ first_sack_index = 0;
+ for (i = 0; i < num_sacks; i++) {
+ bool dup_sack = !i && found_dup_sack;
- if ((sacked&TCPCB_SACKED_RETRANS) &&
- after(end_seq, TCP_SKB_CB(skb)->ack_seq) &&
- (!lost_retrans || after(end_seq, lost_retrans)))
- lost_retrans = end_seq;
-
- if (!in_sack)
- continue;
-
- if (!(sacked&TCPCB_SACKED_ACKED)) {
- if (sacked & TCPCB_SACKED_RETRANS) {
- /* If the segment is not tagged as lost,
- * we do not clear RETRANS, believing
- * that retransmission is still in flight.
- */
- if (sacked & TCPCB_LOST) {
- TCP_SKB_CB(skb)->sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS);
- tp->lost_out -= tcp_skb_pcount(skb);
- tp->retrans_out -= tcp_skb_pcount(skb);
-
- /* clear lost hint */
- tp->retransmit_skb_hint = NULL;
- }
- } else {
- /* New sack for not retransmitted frame,
- * which was in hole. It is reordering.
- */
- if (!(sacked & TCPCB_RETRANS) &&
- fack_count < prior_fackets)
- reord = min(fack_count, reord);
-
- if (sacked & TCPCB_LOST) {
- TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
- tp->lost_out -= tcp_skb_pcount(skb);
-
- /* clear lost hint */
- tp->retransmit_skb_hint = NULL;
- }
- }
+ sp[used_sacks].start_seq = get_unaligned_be32(&sp_wire[i].start_seq);
+ sp[used_sacks].end_seq = get_unaligned_be32(&sp_wire[i].end_seq);
- TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED;
- flag |= FLAG_DATA_SACKED;
- tp->sacked_out += tcp_skb_pcount(skb);
+ if (!tcp_is_sackblock_valid(tp, dup_sack,
+ sp[used_sacks].start_seq,
+ sp[used_sacks].end_seq)) {
+ int mib_idx;
- if (fack_count > tp->fackets_out)
- tp->fackets_out = fack_count;
+ if (dup_sack) {
+ if (!tp->undo_marker)
+ mib_idx = LINUX_MIB_TCPDSACKIGNOREDNOUNDO;
+ else
+ mib_idx = LINUX_MIB_TCPDSACKIGNOREDOLD;
} else {
- if (dup_sack && (sacked&TCPCB_RETRANS))
- reord = min(fack_count, reord);
+ /* Don't count olds caused by ACK reordering */
+ if ((TCP_SKB_CB(ack_skb)->ack_seq != tp->snd_una) &&
+ !after(sp[used_sacks].end_seq, tp->snd_una))
+ continue;
+ mib_idx = LINUX_MIB_TCPSACKDISCARD;
}
- /* D-SACK. We can detect redundant retransmission
- * in S|R and plain R frames and clear it.
- * undo_retrans is decreased above, L|R frames
- * are accounted above as well.
- */
- if (dup_sack &&
- (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS)) {
- TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
- tp->retrans_out -= tcp_skb_pcount(skb);
- tp->retransmit_skb_hint = NULL;
+ NET_INC_STATS(sock_net(sk), mib_idx);
+ if (i == 0)
+ first_sack_index = -1;
+ continue;
+ }
+
+ /* Ignore very old stuff early */
+ if (!after(sp[used_sacks].end_seq, prior_snd_una)) {
+ if (i == 0)
+ first_sack_index = -1;
+ continue;
+ }
+
+ used_sacks++;
+ }
+
+ /* order SACK blocks to allow in order walk of the retrans queue */
+ for (i = used_sacks - 1; i > 0; i--) {
+ for (j = 0; j < i; j++) {
+ if (after(sp[j].start_seq, sp[j + 1].start_seq)) {
+ swap(sp[j], sp[j + 1]);
+
+ /* Track where the first SACK block goes to */
+ if (j == first_sack_index)
+ first_sack_index = j + 1;
}
}
}
- /* Check for lost retransmit. This superb idea is
- * borrowed from "ratehalving". Event "C".
- * Later note: FACK people cheated me again 8),
- * we have to account for reordering! Ugly,
- * but should help.
- */
- if (lost_retrans && icsk->icsk_ca_state == TCP_CA_Recovery) {
- struct sk_buff *skb;
+ state->mss_now = tcp_current_mss(sk);
+ skb = NULL;
+ i = 0;
- sk_stream_for_retrans_queue(skb, sk) {
- if (after(TCP_SKB_CB(skb)->seq, lost_retrans))
- break;
- if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
- continue;
- if ((TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) &&
- after(lost_retrans, TCP_SKB_CB(skb)->ack_seq) &&
- (IsFack(tp) ||
- !before(lost_retrans,
- TCP_SKB_CB(skb)->ack_seq + tp->reordering *
- tp->mss_cache))) {
- TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
- tp->retrans_out -= tcp_skb_pcount(skb);
-
- /* clear lost hint */
- tp->retransmit_skb_hint = NULL;
-
- if (!(TCP_SKB_CB(skb)->sacked&(TCPCB_LOST|TCPCB_SACKED_ACKED))) {
- tp->lost_out += tcp_skb_pcount(skb);
- TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
- flag |= FLAG_DATA_SACKED;
- NET_INC_STATS_BH(LINUX_MIB_TCPLOSTRETRANSMIT);
- }
+ if (!tp->sacked_out) {
+ /* It's already past, so skip checking against it */
+ cache = tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache);
+ } else {
+ cache = tp->recv_sack_cache;
+ /* Skip empty blocks in at head of the cache */
+ while (tcp_sack_cache_ok(tp, cache) && !cache->start_seq &&
+ !cache->end_seq)
+ cache++;
+ }
+
+ while (i < used_sacks) {
+ u32 start_seq = sp[i].start_seq;
+ u32 end_seq = sp[i].end_seq;
+ bool dup_sack = (found_dup_sack && (i == first_sack_index));
+ struct tcp_sack_block *next_dup = NULL;
+
+ if (found_dup_sack && ((i + 1) == first_sack_index))
+ next_dup = &sp[i + 1];
+
+ /* Skip too early cached blocks */
+ while (tcp_sack_cache_ok(tp, cache) &&
+ !before(start_seq, cache->end_seq))
+ cache++;
+
+ /* Can skip some work by looking recv_sack_cache? */
+ if (tcp_sack_cache_ok(tp, cache) && !dup_sack &&
+ after(end_seq, cache->start_seq)) {
+
+ /* Head todo? */
+ if (before(start_seq, cache->start_seq)) {
+ skb = tcp_sacktag_skip(skb, sk, start_seq);
+ skb = tcp_sacktag_walk(skb, sk, next_dup,
+ state,
+ start_seq,
+ cache->start_seq,
+ dup_sack);
+ }
+
+ /* Rest of the block already fully processed? */
+ if (!after(end_seq, cache->end_seq))
+ goto advance_sp;
+
+ skb = tcp_maybe_skipping_dsack(skb, sk, next_dup,
+ state,
+ cache->end_seq);
+
+ /* ...tail remains todo... */
+ if (tcp_highest_sack_seq(tp) == cache->end_seq) {
+ /* ...but better entrypoint exists! */
+ skb = tcp_highest_sack(sk);
+ if (!skb)
+ break;
+ cache++;
+ goto walk;
}
+
+ skb = tcp_sacktag_skip(skb, sk, cache->end_seq);
+ /* Check overlap against next cached too (past this one already) */
+ cache++;
+ continue;
}
+
+ if (!before(start_seq, tcp_highest_sack_seq(tp))) {
+ skb = tcp_highest_sack(sk);
+ if (!skb)
+ break;
+ }
+ skb = tcp_sacktag_skip(skb, sk, start_seq);
+
+walk:
+ skb = tcp_sacktag_walk(skb, sk, next_dup, state,
+ start_seq, end_seq, dup_sack);
+
+advance_sp:
+ i++;
+ }
+
+ /* Clear the head of the cache sack blocks so we can skip it next time */
+ for (i = 0; i < ARRAY_SIZE(tp->recv_sack_cache) - used_sacks; i++) {
+ tp->recv_sack_cache[i].start_seq = 0;
+ tp->recv_sack_cache[i].end_seq = 0;
}
+ for (j = 0; j < used_sacks; j++)
+ tp->recv_sack_cache[i++] = sp[j];
- tp->left_out = tp->sacked_out + tp->lost_out;
+ if (inet_csk(sk)->icsk_ca_state != TCP_CA_Loss || tp->undo_marker)
+ tcp_check_sack_reordering(sk, state->reord, 0);
- if ((reord < tp->fackets_out) && icsk->icsk_ca_state != TCP_CA_Loss)
- tcp_update_reordering(sk, ((tp->fackets_out + 1) - reord), 0);
+ tcp_verify_left_out(tp);
+out:
#if FASTRETRANS_DEBUG > 0
- BUG_TRAP((int)tp->sacked_out >= 0);
- BUG_TRAP((int)tp->lost_out >= 0);
- BUG_TRAP((int)tp->retrans_out >= 0);
- BUG_TRAP((int)tcp_packets_in_flight(tp) >= 0);
+ WARN_ON((int)tp->sacked_out < 0);
+ WARN_ON((int)tp->lost_out < 0);
+ WARN_ON((int)tp->retrans_out < 0);
+ WARN_ON((int)tcp_packets_in_flight(tp) < 0);
#endif
- return flag;
+ return state->flag;
}
-/* RTO occurred, but do not yet enter loss state. Instead, transmit two new
- * segments to see from the next ACKs whether any data was really missing.
- * If the RTO was spurious, new ACKs should arrive.
+/* Limits sacked_out so that sum with lost_out isn't ever larger than
+ * packets_out. Returns false if sacked_out adjustement wasn't necessary.
*/
-void tcp_enter_frto(struct sock *sk)
+static bool tcp_limit_reno_sacked(struct tcp_sock *tp)
{
- const struct inet_connection_sock *icsk = inet_csk(sk);
- struct tcp_sock *tp = tcp_sk(sk);
- struct sk_buff *skb;
-
- tp->frto_counter = 1;
-
- if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
- tp->snd_una == tp->high_seq ||
- (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
- tp->prior_ssthresh = tcp_current_ssthresh(sk);
- tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
- tcp_ca_event(sk, CA_EVENT_FRTO);
- }
+ u32 holes;
- /* Have to clear retransmission markers here to keep the bookkeeping
- * in shape, even though we are not yet in Loss state.
- * If something was really lost, it is eventually caught up
- * in tcp_enter_frto_loss.
- */
- tp->retrans_out = 0;
- tp->undo_marker = tp->snd_una;
- tp->undo_retrans = 0;
+ holes = max(tp->lost_out, 1U);
+ holes = min(holes, tp->packets_out);
- sk_stream_for_retrans_queue(skb, sk) {
- TCP_SKB_CB(skb)->sacked &= ~TCPCB_RETRANS;
+ if ((tp->sacked_out + holes) > tp->packets_out) {
+ tp->sacked_out = tp->packets_out - holes;
+ return true;
}
- tcp_sync_left_out(tp);
-
- tcp_set_ca_state(sk, TCP_CA_Open);
- tp->frto_highmark = tp->snd_nxt;
+ return false;
}
-/* Enter Loss state after F-RTO was applied. Dupack arrived after RTO,
- * which indicates that we should follow the traditional RTO recovery,
- * i.e. mark everything lost and do go-back-N retransmission.
+/* If we receive more dupacks than we expected counting segments
+ * in assumption of absent reordering, interpret this as reordering.
+ * The only another reason could be bug in receiver TCP.
*/
-static void tcp_enter_frto_loss(struct sock *sk)
+static void tcp_check_reno_reordering(struct sock *sk, const int addend)
{
struct tcp_sock *tp = tcp_sk(sk);
- struct sk_buff *skb;
- int cnt = 0;
- tp->sacked_out = 0;
- tp->lost_out = 0;
- tp->fackets_out = 0;
+ if (!tcp_limit_reno_sacked(tp))
+ return;
- sk_stream_for_retrans_queue(skb, sk) {
- cnt += tcp_skb_pcount(skb);
- TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
- if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) {
+ tp->reordering = min_t(u32, tp->packets_out + addend,
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_max_reordering));
+ tp->reord_seen++;
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRENOREORDER);
+}
- /* Do not mark those segments lost that were
- * forward transmitted after RTO
- */
- if (!after(TCP_SKB_CB(skb)->end_seq,
- tp->frto_highmark)) {
- TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
- tp->lost_out += tcp_skb_pcount(skb);
- }
- } else {
- tp->sacked_out += tcp_skb_pcount(skb);
- tp->fackets_out = cnt;
- }
+/* Emulate SACKs for SACKless connection: account for a new dupack. */
+
+static void tcp_add_reno_sack(struct sock *sk, int num_dupack, bool ece_ack)
+{
+ if (num_dupack) {
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 prior_sacked = tp->sacked_out;
+ s32 delivered;
+
+ tp->sacked_out += num_dupack;
+ tcp_check_reno_reordering(sk, 0);
+ delivered = tp->sacked_out - prior_sacked;
+ if (delivered > 0)
+ tcp_count_delivered(tp, delivered, ece_ack);
+ tcp_verify_left_out(tp);
}
- tcp_sync_left_out(tp);
+}
- tp->snd_cwnd = tp->frto_counter + tcp_packets_in_flight(tp)+1;
- tp->snd_cwnd_cnt = 0;
- tp->snd_cwnd_stamp = tcp_time_stamp;
- tp->undo_marker = 0;
- tp->frto_counter = 0;
+/* Account for ACK, ACKing some data in Reno Recovery phase. */
- tp->reordering = min_t(unsigned int, tp->reordering,
- sysctl_tcp_reordering);
- tcp_set_ca_state(sk, TCP_CA_Loss);
- tp->high_seq = tp->frto_highmark;
- TCP_ECN_queue_cwr(tp);
+static void tcp_remove_reno_sacks(struct sock *sk, int acked, bool ece_ack)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (acked > 0) {
+ /* One ACK acked hole. The rest eat duplicate ACKs. */
+ tcp_count_delivered(tp, max_t(int, acked - tp->sacked_out, 1),
+ ece_ack);
+ if (acked - 1 >= tp->sacked_out)
+ tp->sacked_out = 0;
+ else
+ tp->sacked_out -= acked - 1;
+ }
+ tcp_check_reno_reordering(sk, acked);
+ tcp_verify_left_out(tp);
+}
- clear_all_retrans_hints(tp);
+static inline void tcp_reset_reno_sack(struct tcp_sock *tp)
+{
+ tp->sacked_out = 0;
}
void tcp_clear_retrans(struct tcp_sock *tp)
{
- tp->left_out = 0;
tp->retrans_out = 0;
-
- tp->fackets_out = 0;
- tp->sacked_out = 0;
tp->lost_out = 0;
-
tp->undo_marker = 0;
- tp->undo_retrans = 0;
+ tp->undo_retrans = -1;
+ tp->sacked_out = 0;
+ tp->rto_stamp = 0;
+ tp->total_rto = 0;
+ tp->total_rto_recoveries = 0;
+ tp->total_rto_time = 0;
+}
+
+static inline void tcp_init_undo(struct tcp_sock *tp)
+{
+ tp->undo_marker = tp->snd_una;
+
+ /* Retransmission still in flight may cause DSACKs later. */
+ /* First, account for regular retransmits in flight: */
+ tp->undo_retrans = tp->retrans_out;
+ /* Next, account for TLP retransmits in flight: */
+ if (tp->tlp_high_seq && tp->tlp_retrans)
+ tp->undo_retrans++;
+ /* Finally, avoid 0, because undo_retrans==0 means "can undo now": */
+ if (!tp->undo_retrans)
+ tp->undo_retrans = -1;
}
-/* Enter Loss state. If "how" is not zero, forget all SACK information
+/* If we detect SACK reneging, forget all SACK information
* and reset tags completely, otherwise preserve SACKs. If receiver
* dropped its ofo queue, we will know this due to reneging detection.
*/
-void tcp_enter_loss(struct sock *sk, int how)
+static void tcp_timeout_mark_lost(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb, *head;
+ bool is_reneg; /* is receiver reneging on SACKs? */
+
+ head = tcp_rtx_queue_head(sk);
+ is_reneg = head && (TCP_SKB_CB(head)->sacked & TCPCB_SACKED_ACKED);
+ if (is_reneg) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
+ tp->sacked_out = 0;
+ /* Mark SACK reneging until we recover from this loss event. */
+ tp->is_sack_reneg = 1;
+ } else if (tcp_is_reno(tp)) {
+ tcp_reset_reno_sack(tp);
+ }
+
+ skb = head;
+ skb_rbtree_walk_from(skb) {
+ if (is_reneg)
+ TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
+ else if (skb != head && tcp_rack_skb_timeout(tp, skb, 0) > 0)
+ continue; /* Don't mark recently sent ones lost yet */
+ tcp_mark_skb_lost(sk, skb);
+ }
+ tcp_verify_left_out(tp);
+ tcp_clear_all_retrans_hints(tp);
+}
+
+/* Enter Loss state. */
+void tcp_enter_loss(struct sock *sk)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
- struct sk_buff *skb;
- int cnt = 0;
+ struct net *net = sock_net(sk);
+ bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
+ u8 reordering;
+
+ tcp_timeout_mark_lost(sk);
/* Reduce ssthresh if it has not yet been made inside this window. */
- if (icsk->icsk_ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq ||
+ if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
+ !after(tp->high_seq, tp->snd_una) ||
(icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
tp->prior_ssthresh = tcp_current_ssthresh(sk);
+ tp->prior_cwnd = tcp_snd_cwnd(tp);
tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
tcp_ca_event(sk, CA_EVENT_LOSS);
+ tcp_init_undo(tp);
}
- tp->snd_cwnd = 1;
+ tcp_snd_cwnd_set(tp, tcp_packets_in_flight(tp) + 1);
tp->snd_cwnd_cnt = 0;
- tp->snd_cwnd_stamp = tcp_time_stamp;
-
- tp->bytes_acked = 0;
- tcp_clear_retrans(tp);
-
- /* Push undo marker, if it was plain RTO and nothing
- * was retransmitted. */
- if (!how)
- tp->undo_marker = tp->snd_una;
-
- sk_stream_for_retrans_queue(skb, sk) {
- cnt += tcp_skb_pcount(skb);
- if (TCP_SKB_CB(skb)->sacked&TCPCB_RETRANS)
- tp->undo_marker = 0;
- TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED;
- if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || how) {
- TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
- TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
- tp->lost_out += tcp_skb_pcount(skb);
- } else {
- tp->sacked_out += tcp_skb_pcount(skb);
- tp->fackets_out = cnt;
- }
- }
- tcp_sync_left_out(tp);
+ tp->snd_cwnd_stamp = tcp_jiffies32;
+
+ /* Timeout in disordered state after receiving substantial DUPACKs
+ * suggests that the degree of reordering is over-estimated.
+ */
+ reordering = READ_ONCE(net->ipv4.sysctl_tcp_reordering);
+ if (icsk->icsk_ca_state <= TCP_CA_Disorder &&
+ tp->sacked_out >= reordering)
+ tp->reordering = min_t(unsigned int, tp->reordering,
+ reordering);
- tp->reordering = min_t(unsigned int, tp->reordering,
- sysctl_tcp_reordering);
tcp_set_ca_state(sk, TCP_CA_Loss);
tp->high_seq = tp->snd_nxt;
- TCP_ECN_queue_cwr(tp);
-
- clear_all_retrans_hints(tp);
-}
+ tp->tlp_high_seq = 0;
+ tcp_ecn_queue_cwr(tp);
-static int tcp_check_sack_reneging(struct sock *sk)
-{
- struct sk_buff *skb;
-
- /* If ACK arrived pointing to a remembered SACK,
- * it means that our remembered SACKs do not reflect
- * real state of receiver i.e.
- * receiver _host_ is heavily congested (or buggy).
- * Do processing similar to RTO timeout.
+ /* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous
+ * loss recovery is underway except recurring timeout(s) on
+ * the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing
*/
- if ((skb = skb_peek(&sk->sk_write_queue)) != NULL &&
- (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
- struct inet_connection_sock *icsk = inet_csk(sk);
- NET_INC_STATS_BH(LINUX_MIB_TCPSACKRENEGING);
-
- tcp_enter_loss(sk, 1);
- icsk->icsk_retransmits++;
- tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue));
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
- icsk->icsk_rto, TCP_RTO_MAX);
- return 1;
- }
- return 0;
-}
-
-static inline int tcp_fackets_out(struct tcp_sock *tp)
-{
- return IsReno(tp) ? tp->sacked_out+1 : tp->fackets_out;
+ tp->frto = READ_ONCE(net->ipv4.sysctl_tcp_frto) &&
+ (new_recovery || icsk->icsk_retransmits) &&
+ !inet_csk(sk)->icsk_mtup.probe_size;
}
-static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb)
-{
- return (tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto);
-}
-
-static inline int tcp_head_timedout(struct sock *sk, struct tcp_sock *tp)
+/* If ACK arrived pointing to a remembered SACK, it means that our
+ * remembered SACKs do not reflect real state of receiver i.e.
+ * receiver _host_ is heavily congested (or buggy).
+ *
+ * To avoid big spurious retransmission bursts due to transient SACK
+ * scoreboard oddities that look like reneging, we give the receiver a
+ * little time (max(RTT/2, 10ms)) to send us some more ACKs that will
+ * restore sanity to the SACK scoreboard. If the apparent reneging
+ * persists until this RTO then we'll clear the SACK scoreboard.
+ */
+static bool tcp_check_sack_reneging(struct sock *sk, int *ack_flag)
{
- return tp->packets_out &&
- tcp_skb_timedout(sk, skb_peek(&sk->sk_write_queue));
+ if (*ack_flag & FLAG_SACK_RENEGING &&
+ *ack_flag & FLAG_SND_UNA_ADVANCED) {
+ struct tcp_sock *tp = tcp_sk(sk);
+ unsigned long delay = max(usecs_to_jiffies(tp->srtt_us >> 4),
+ msecs_to_jiffies(10));
+
+ tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, delay, false);
+ *ack_flag &= ~FLAG_SET_XMIT_TIMER;
+ return true;
+ }
+ return false;
}
-/* Linux NewReno/SACK/FACK/ECN state machine.
+/* Linux NewReno/SACK/ECN state machine.
* --------------------------------------
*
* "Open" Normal state, no dubious events, fast path.
@@ -1463,30 +2465,28 @@ static inline int tcp_head_timedout(struct sock *sk, struct tcp_sock *tp)
* F.e. after RTO, when all the queue is considered as lost,
* lost_out = packets_out and in_flight = retrans_out.
*
- * Essentially, we have now two algorithms counting
+ * Essentially, we have now a few algorithms detecting
* lost packets.
*
- * FACK: It is the simplest heuristics. As soon as we decided
- * that something is lost, we decide that _all_ not SACKed
- * packets until the most forward SACK are lost. I.e.
- * lost_out = fackets_out - sacked_out and left_out = fackets_out.
- * It is absolutely correct estimate, if network does not reorder
- * packets. And it loses any connection to reality when reordering
- * takes place. We use FACK by default until reordering
- * is suspected on the path to this destination.
+ * If the receiver supports SACK:
+ *
+ * RACK (RFC8985): RACK is a newer loss detection algorithm
+ * (2017-) that checks timing instead of counting DUPACKs.
+ * Essentially a packet is considered lost if it's not S/ACKed
+ * after RTT + reordering_window, where both metrics are
+ * dynamically measured and adjusted. This is implemented in
+ * tcp_rack_mark_lost.
+ *
+ * If the receiver does not support SACK:
*
- * NewReno: when Recovery is entered, we assume that one segment
+ * NewReno (RFC6582): in Recovery we assume that one segment
* is lost (classic Reno). While we are in Recovery and
* a partial ACK arrives, we assume that one more packet
* is lost (NewReno). This heuristics are the same in NewReno
* and SACK.
*
- * Imagine, that's all! Forget about all this shamanism about CWND inflation
- * deflation etc. CWND is real congestion window, never inflated, changes
- * only according to classic VJ rules.
- *
- * Really tricky (and requiring careful tuning) part of algorithm
- * is hidden in functions tcp_time_to_recover() and tcp_xmit_retransmit_queue().
+ * The really tricky (and requiring careful tuning) part of the algorithm
+ * is hidden in the RACK code in tcp_recovery.c and tcp_xmit_retransmit_queue().
* The first determines the moment _when_ we should reduce CWND and,
* hence, slow down forward transmission. In fact, it determines the moment
* when we decide that hole is caused by loss, rather than by a reorder.
@@ -1509,556 +2509,712 @@ static inline int tcp_head_timedout(struct sock *sk, struct tcp_sock *tp)
* Main question: may we further continue forward transmission
* with the same cwnd?
*/
-static int tcp_time_to_recover(struct sock *sk, struct tcp_sock *tp)
+static bool tcp_time_to_recover(const struct tcp_sock *tp)
{
- __u32 packets_out;
+ /* Has loss detection marked at least one packet lost? */
+ return tp->lost_out != 0;
+}
- /* Trick#1: The loss is proven. */
- if (tp->lost_out)
- return 1;
+static bool tcp_tsopt_ecr_before(const struct tcp_sock *tp, u32 when)
+{
+ return tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
+ before(tp->rx_opt.rcv_tsecr, when);
+}
- /* Not-A-Trick#2 : Classic rule... */
- if (tcp_fackets_out(tp) > tp->reordering)
- return 1;
+/* skb is spurious retransmitted if the returned timestamp echo
+ * reply is prior to the skb transmission time
+ */
+static bool tcp_skb_spurious_retrans(const struct tcp_sock *tp,
+ const struct sk_buff *skb)
+{
+ return (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) &&
+ tcp_tsopt_ecr_before(tp, tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb));
+}
- /* Trick#3 : when we use RFC2988 timer restart, fast
- * retransmit can be triggered by timeout of queue head.
+/* Nothing was retransmitted or returned timestamp is less
+ * than timestamp of the first retransmission.
+ */
+static inline bool tcp_packet_delayed(const struct tcp_sock *tp)
+{
+ const struct sock *sk = (const struct sock *)tp;
+
+ /* Received an echoed timestamp before the first retransmission? */
+ if (tp->retrans_stamp)
+ return tcp_tsopt_ecr_before(tp, tp->retrans_stamp);
+
+ /* We set tp->retrans_stamp upon the first retransmission of a loss
+ * recovery episode, so normally if tp->retrans_stamp is 0 then no
+ * retransmission has happened yet (likely due to TSQ, which can cause
+ * fast retransmits to be delayed). So if snd_una advanced while
+ * (tp->retrans_stamp is 0 then apparently a packet was merely delayed,
+ * not lost. But there are exceptions where we retransmit but then
+ * clear tp->retrans_stamp, so we check for those exceptions.
*/
- if (tcp_head_timedout(sk, tp))
- return 1;
- /* Trick#4: It is still not OK... But will it be useful to delay
- * recovery more?
+ /* (1) For non-SACK connections, tcp_is_non_sack_preventing_reopen()
+ * clears tp->retrans_stamp when snd_una == high_seq.
*/
- packets_out = tp->packets_out;
- if (packets_out <= tp->reordering &&
- tp->sacked_out >= max_t(__u32, packets_out/2, sysctl_tcp_reordering) &&
- !tcp_may_send_now(sk, tp)) {
- /* We have nothing to send. This connection is limited
- * either by receiver window or by application.
- */
- return 1;
- }
+ if (!tcp_is_sack(tp) && !before(tp->snd_una, tp->high_seq))
+ return false;
- return 0;
+ /* (2) In TCP_SYN_SENT tcp_clean_rtx_queue() clears tp->retrans_stamp
+ * when setting FLAG_SYN_ACKED is set, even if the SYN was
+ * retransmitted.
+ */
+ if (sk->sk_state == TCP_SYN_SENT)
+ return false;
+
+ return true; /* tp->retrans_stamp is zero; no retransmit yet */
}
-/* If we receive more dupacks than we expected counting segments
- * in assumption of absent reordering, interpret this as reordering.
- * The only another reason could be bug in receiver TCP.
+/* Undo procedures. */
+
+/* We can clear retrans_stamp when there are no retransmissions in the
+ * window. It would seem that it is trivially available for us in
+ * tp->retrans_out, however, that kind of assumptions doesn't consider
+ * what will happen if errors occur when sending retransmission for the
+ * second time. ...It could the that such segment has only
+ * TCPCB_EVER_RETRANS set at the present time. It seems that checking
+ * the head skb is enough except for some reneging corner cases that
+ * are not worth the effort.
+ *
+ * Main reason for all this complexity is the fact that connection dying
+ * time now depends on the validity of the retrans_stamp, in particular,
+ * that successive retransmissions of a segment must not advance
+ * retrans_stamp under any conditions.
*/
-static void tcp_check_reno_reordering(struct sock *sk, const int addend)
+static bool tcp_any_retrans_done(const struct sock *sk)
{
- struct tcp_sock *tp = tcp_sk(sk);
- u32 holes;
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb;
- holes = max(tp->lost_out, 1U);
- holes = min(holes, tp->packets_out);
+ if (tp->retrans_out)
+ return true;
- if ((tp->sacked_out + holes) > tp->packets_out) {
- tp->sacked_out = tp->packets_out - holes;
- tcp_update_reordering(sk, tp->packets_out + addend, 0);
- }
-}
+ skb = tcp_rtx_queue_head(sk);
+ if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS))
+ return true;
-/* Emulate SACKs for SACKless connection: account for a new dupack. */
+ return false;
+}
-static void tcp_add_reno_sack(struct sock *sk)
+/* If loss recovery is finished and there are no retransmits out in the
+ * network, then we clear retrans_stamp so that upon the next loss recovery
+ * retransmits_timed_out() and timestamp-undo are using the correct value.
+ */
+static void tcp_retrans_stamp_cleanup(struct sock *sk)
{
- struct tcp_sock *tp = tcp_sk(sk);
- tp->sacked_out++;
- tcp_check_reno_reordering(sk, 0);
- tcp_sync_left_out(tp);
+ if (!tcp_any_retrans_done(sk))
+ tcp_sk(sk)->retrans_stamp = 0;
}
-/* Account for ACK, ACKing some data in Reno Recovery phase. */
-
-static void tcp_remove_reno_sacks(struct sock *sk, struct tcp_sock *tp, int acked)
+static void DBGUNDO(struct sock *sk, const char *msg)
{
- if (acked > 0) {
- /* One ACK acked hole. The rest eat duplicate ACKs. */
- if (acked-1 >= tp->sacked_out)
- tp->sacked_out = 0;
- else
- tp->sacked_out -= acked-1;
+#if FASTRETRANS_DEBUG > 1
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_sock *inet = inet_sk(sk);
+
+ if (sk->sk_family == AF_INET) {
+ pr_debug("Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n",
+ msg,
+ &inet->inet_daddr, ntohs(inet->inet_dport),
+ tcp_snd_cwnd(tp), tcp_left_out(tp),
+ tp->snd_ssthresh, tp->prior_ssthresh,
+ tp->packets_out);
}
- tcp_check_reno_reordering(sk, acked);
- tcp_sync_left_out(tp);
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (sk->sk_family == AF_INET6) {
+ pr_debug("Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n",
+ msg,
+ &sk->sk_v6_daddr, ntohs(inet->inet_dport),
+ tcp_snd_cwnd(tp), tcp_left_out(tp),
+ tp->snd_ssthresh, tp->prior_ssthresh,
+ tp->packets_out);
+ }
+#endif
+#endif
}
-static inline void tcp_reset_reno_sack(struct tcp_sock *tp)
+static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss)
{
- tp->sacked_out = 0;
- tp->left_out = tp->lost_out;
-}
+ struct tcp_sock *tp = tcp_sk(sk);
-/* Mark head of queue up as lost. */
-static void tcp_mark_head_lost(struct sock *sk, struct tcp_sock *tp,
- int packets, u32 high_seq)
-{
- struct sk_buff *skb;
- int cnt;
+ if (unmark_loss) {
+ struct sk_buff *skb;
- BUG_TRAP(packets <= tp->packets_out);
- if (tp->lost_skb_hint) {
- skb = tp->lost_skb_hint;
- cnt = tp->lost_cnt_hint;
- } else {
- skb = sk->sk_write_queue.next;
- cnt = 0;
+ skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
+ TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
+ }
+ tp->lost_out = 0;
+ tcp_clear_all_retrans_hints(tp);
}
- sk_stream_for_retrans_queue_from(skb, sk) {
- /* TODO: do this better */
- /* this is not the most efficient way to do this... */
- tp->lost_skb_hint = skb;
- tp->lost_cnt_hint = cnt;
- cnt += tcp_skb_pcount(skb);
- if (cnt > packets || after(TCP_SKB_CB(skb)->end_seq, high_seq))
- break;
- if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) {
- TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
- tp->lost_out += tcp_skb_pcount(skb);
+ if (tp->prior_ssthresh) {
+ const struct inet_connection_sock *icsk = inet_csk(sk);
- /* clear xmit_retransmit_queue hints
- * if this is beyond hint */
- if(tp->retransmit_skb_hint != NULL &&
- before(TCP_SKB_CB(skb)->seq,
- TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) {
+ tcp_snd_cwnd_set(tp, icsk->icsk_ca_ops->undo_cwnd(sk));
- tp->retransmit_skb_hint = NULL;
- }
+ if (tp->prior_ssthresh > tp->snd_ssthresh) {
+ tp->snd_ssthresh = tp->prior_ssthresh;
+ tcp_ecn_withdraw_cwr(tp);
}
}
- tcp_sync_left_out(tp);
+ tp->snd_cwnd_stamp = tcp_jiffies32;
+ tp->undo_marker = 0;
+ tp->rack.advanced = 1; /* Force RACK to re-exam losses */
}
-/* Account newly detected lost packet(s) */
-
-static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp)
+static inline bool tcp_may_undo(const struct tcp_sock *tp)
{
- if (IsFack(tp)) {
- int lost = tp->fackets_out - tp->reordering;
- if (lost <= 0)
- lost = 1;
- tcp_mark_head_lost(sk, tp, lost, tp->high_seq);
- } else {
- tcp_mark_head_lost(sk, tp, 1, tp->high_seq);
- }
+ return tp->undo_marker && (!tp->undo_retrans || tcp_packet_delayed(tp));
+}
- /* New heuristics: it is possible only after we switched
- * to restart timer each time when something is ACKed.
- * Hence, we can detect timed out packets during fast
- * retransmit without falling to slow start.
- */
- if (!IsReno(tp) && tcp_head_timedout(sk, tp)) {
- struct sk_buff *skb;
+static bool tcp_is_non_sack_preventing_reopen(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
- skb = tp->scoreboard_skb_hint ? tp->scoreboard_skb_hint
- : sk->sk_write_queue.next;
+ if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) {
+ /* Hold old state until something *above* high_seq
+ * is ACKed. For Reno it is MUST to prevent false
+ * fast retransmits (RFC2582). SACK TCP is safe. */
+ if (!tcp_any_retrans_done(sk))
+ tp->retrans_stamp = 0;
+ return true;
+ }
+ return false;
+}
- sk_stream_for_retrans_queue_from(skb, sk) {
- if (!tcp_skb_timedout(sk, skb))
- break;
+/* People celebrate: "We love our President!" */
+static bool tcp_try_undo_recovery(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
- if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) {
- TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
- tp->lost_out += tcp_skb_pcount(skb);
+ if (tcp_may_undo(tp)) {
+ int mib_idx;
- /* clear xmit_retrans hint */
- if (tp->retransmit_skb_hint &&
- before(TCP_SKB_CB(skb)->seq,
- TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
+ /* Happy end! We did not retransmit anything
+ * or our original transmission succeeded.
+ */
+ DBGUNDO(sk, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans");
+ tcp_undo_cwnd_reduction(sk, false);
+ if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss)
+ mib_idx = LINUX_MIB_TCPLOSSUNDO;
+ else
+ mib_idx = LINUX_MIB_TCPFULLUNDO;
- tp->retransmit_skb_hint = NULL;
- }
- }
+ NET_INC_STATS(sock_net(sk), mib_idx);
+ } else if (tp->rack.reo_wnd_persist) {
+ tp->rack.reo_wnd_persist--;
+ }
+ if (tcp_is_non_sack_preventing_reopen(sk))
+ return true;
+ tcp_set_ca_state(sk, TCP_CA_Open);
+ tp->is_sack_reneg = 0;
+ return false;
+}
- tp->scoreboard_skb_hint = skb;
+/* Try to undo cwnd reduction, because D-SACKs acked all retransmitted data */
+static bool tcp_try_undo_dsack(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
- tcp_sync_left_out(tp);
+ if (tp->undo_marker && !tp->undo_retrans) {
+ tp->rack.reo_wnd_persist = min(TCP_RACK_RECOVERY_THRESH,
+ tp->rack.reo_wnd_persist + 1);
+ DBGUNDO(sk, "D-SACK");
+ tcp_undo_cwnd_reduction(sk, false);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKUNDO);
+ return true;
}
+ return false;
}
-/* CWND moderation, preventing bursts due to too big ACKs
- * in dubious situations.
- */
-static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
+/* Undo during loss recovery after partial ACK or using F-RTO. */
+static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo)
{
- tp->snd_cwnd = min(tp->snd_cwnd,
- tcp_packets_in_flight(tp)+tcp_max_burst(tp));
- tp->snd_cwnd_stamp = tcp_time_stamp;
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (frto_undo || tcp_may_undo(tp)) {
+ tcp_undo_cwnd_reduction(sk, true);
+
+ DBGUNDO(sk, "partial loss");
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSUNDO);
+ if (frto_undo)
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPSPURIOUSRTOS);
+ WRITE_ONCE(inet_csk(sk)->icsk_retransmits, 0);
+ if (tcp_is_non_sack_preventing_reopen(sk))
+ return true;
+ if (frto_undo || tcp_is_sack(tp)) {
+ tcp_set_ca_state(sk, TCP_CA_Open);
+ tp->is_sack_reneg = 0;
+ }
+ return true;
+ }
+ return false;
}
-/* Lower bound on congestion window is slow start threshold
- * unless congestion avoidance choice decides to overide it.
+/* The cwnd reduction in CWR and Recovery uses the PRR algorithm in RFC 6937.
+ * It computes the number of packets to send (sndcnt) based on packets newly
+ * delivered:
+ * 1) If the packets in flight is larger than ssthresh, PRR spreads the
+ * cwnd reductions across a full RTT.
+ * 2) Otherwise PRR uses packet conservation to send as much as delivered.
+ * But when SND_UNA is acked without further losses,
+ * slow starts cwnd up to ssthresh to speed up the recovery.
*/
-static inline u32 tcp_cwnd_min(const struct sock *sk)
+static void tcp_init_cwnd_reduction(struct sock *sk)
{
- const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
+ struct tcp_sock *tp = tcp_sk(sk);
- return ca_ops->min_cwnd ? ca_ops->min_cwnd(sk) : tcp_sk(sk)->snd_ssthresh;
+ tp->high_seq = tp->snd_nxt;
+ tp->tlp_high_seq = 0;
+ tp->snd_cwnd_cnt = 0;
+ tp->prior_cwnd = tcp_snd_cwnd(tp);
+ tp->prr_delivered = 0;
+ tp->prr_out = 0;
+ tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
+ tcp_ecn_queue_cwr(tp);
}
-/* Decrease cwnd each second ack. */
-static void tcp_cwnd_down(struct sock *sk)
+void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int newly_lost, int flag)
{
struct tcp_sock *tp = tcp_sk(sk);
- int decr = tp->snd_cwnd_cnt + 1;
+ int sndcnt = 0;
+ int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp);
- tp->snd_cwnd_cnt = decr&1;
- decr >>= 1;
+ if (newly_acked_sacked <= 0 || WARN_ON_ONCE(!tp->prior_cwnd))
+ return;
- if (decr && tp->snd_cwnd > tcp_cwnd_min(sk))
- tp->snd_cwnd -= decr;
+ trace_tcp_cwnd_reduction_tp(sk, newly_acked_sacked, newly_lost, flag);
- tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1);
- tp->snd_cwnd_stamp = tcp_time_stamp;
+ tp->prr_delivered += newly_acked_sacked;
+ if (delta < 0) {
+ u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
+ tp->prior_cwnd - 1;
+ sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out;
+ } else {
+ sndcnt = max_t(int, tp->prr_delivered - tp->prr_out,
+ newly_acked_sacked);
+ if (flag & FLAG_SND_UNA_ADVANCED && !newly_lost)
+ sndcnt++;
+ sndcnt = min(delta, sndcnt);
+ }
+ /* Force a fast retransmit upon entering fast recovery */
+ sndcnt = max(sndcnt, (tp->prr_out ? 0 : 1));
+ tcp_snd_cwnd_set(tp, tcp_packets_in_flight(tp) + sndcnt);
}
-/* Nothing was retransmitted or returned timestamp is less
- * than timestamp of the first retransmission.
- */
-static inline int tcp_packet_delayed(struct tcp_sock *tp)
+static inline void tcp_end_cwnd_reduction(struct sock *sk)
{
- return !tp->retrans_stamp ||
- (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
- (__s32)(tp->rx_opt.rcv_tsecr - tp->retrans_stamp) < 0);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (inet_csk(sk)->icsk_ca_ops->cong_control)
+ return;
+
+ /* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */
+ if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH &&
+ (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR || tp->undo_marker)) {
+ tcp_snd_cwnd_set(tp, tp->snd_ssthresh);
+ tp->snd_cwnd_stamp = tcp_jiffies32;
+ }
+ tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR);
}
-/* Undo procedures. */
+/* Enter CWR state. Disable cwnd undo since congestion is proven with ECN */
+void tcp_enter_cwr(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
-#if FASTRETRANS_DEBUG > 1
-static void DBGUNDO(struct sock *sk, struct tcp_sock *tp, const char *msg)
+ tp->prior_ssthresh = 0;
+ if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
+ tp->undo_marker = 0;
+ tcp_init_cwnd_reduction(sk);
+ tcp_set_ca_state(sk, TCP_CA_CWR);
+ }
+}
+EXPORT_SYMBOL(tcp_enter_cwr);
+
+static void tcp_try_keep_open(struct sock *sk)
{
- struct inet_sock *inet = inet_sk(sk);
- printk(KERN_DEBUG "Undo %s %u.%u.%u.%u/%u c%u l%u ss%u/%u p%u\n",
- msg,
- NIPQUAD(inet->daddr), ntohs(inet->dport),
- tp->snd_cwnd, tp->left_out,
- tp->snd_ssthresh, tp->prior_ssthresh,
- tp->packets_out);
+ struct tcp_sock *tp = tcp_sk(sk);
+ int state = TCP_CA_Open;
+
+ if (tcp_left_out(tp) || tcp_any_retrans_done(sk))
+ state = TCP_CA_Disorder;
+
+ if (inet_csk(sk)->icsk_ca_state != state) {
+ tcp_set_ca_state(sk, state);
+ tp->high_seq = tp->snd_nxt;
+ }
}
-#else
-#define DBGUNDO(x...) do { } while (0)
-#endif
-static void tcp_undo_cwr(struct sock *sk, const int undo)
+static void tcp_try_to_open(struct sock *sk, int flag)
{
struct tcp_sock *tp = tcp_sk(sk);
- if (tp->prior_ssthresh) {
- const struct inet_connection_sock *icsk = inet_csk(sk);
+ tcp_verify_left_out(tp);
- if (icsk->icsk_ca_ops->undo_cwnd)
- tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk);
- else
- tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1);
+ if (!tcp_any_retrans_done(sk))
+ tp->retrans_stamp = 0;
- if (undo && tp->prior_ssthresh > tp->snd_ssthresh) {
- tp->snd_ssthresh = tp->prior_ssthresh;
- TCP_ECN_withdraw_cwr(tp);
- }
- } else {
- tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh);
- }
- tcp_moderate_cwnd(tp);
- tp->snd_cwnd_stamp = tcp_time_stamp;
+ if (flag & FLAG_ECE)
+ tcp_enter_cwr(sk);
- /* There is something screwy going on with the retrans hints after
- an undo */
- clear_all_retrans_hints(tp);
+ if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
+ tcp_try_keep_open(sk);
+ }
}
-static inline int tcp_may_undo(struct tcp_sock *tp)
+static void tcp_mtup_probe_failed(struct sock *sk)
{
- return tp->undo_marker &&
- (!tp->undo_retrans || tcp_packet_delayed(tp));
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ icsk->icsk_mtup.search_high = icsk->icsk_mtup.probe_size - 1;
+ icsk->icsk_mtup.probe_size = 0;
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMTUPFAIL);
}
-/* People celebrate: "We love our President!" */
-static int tcp_try_undo_recovery(struct sock *sk, struct tcp_sock *tp)
+static void tcp_mtup_probe_success(struct sock *sk)
{
- if (tcp_may_undo(tp)) {
- /* Happy end! We did not retransmit anything
- * or our original transmission succeeded.
- */
- DBGUNDO(sk, tp, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans");
- tcp_undo_cwr(sk, 1);
- if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss)
- NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO);
- else
- NET_INC_STATS_BH(LINUX_MIB_TCPFULLUNDO);
- tp->undo_marker = 0;
- }
- if (tp->snd_una == tp->high_seq && IsReno(tp)) {
- /* Hold old state until something *above* high_seq
- * is ACKed. For Reno it is MUST to prevent false
- * fast retransmits (RFC2582). SACK TCP is safe. */
- tcp_moderate_cwnd(tp);
- return 1;
- }
- tcp_set_ca_state(sk, TCP_CA_Open);
- return 0;
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ u64 val;
+
+ tp->prior_ssthresh = tcp_current_ssthresh(sk);
+
+ val = (u64)tcp_snd_cwnd(tp) * tcp_mss_to_mtu(sk, tp->mss_cache);
+ do_div(val, icsk->icsk_mtup.probe_size);
+ DEBUG_NET_WARN_ON_ONCE((u32)val != val);
+ tcp_snd_cwnd_set(tp, max_t(u32, 1U, val));
+
+ tp->snd_cwnd_cnt = 0;
+ tp->snd_cwnd_stamp = tcp_jiffies32;
+ tp->snd_ssthresh = tcp_current_ssthresh(sk);
+
+ icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size;
+ icsk->icsk_mtup.probe_size = 0;
+ tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMTUPSUCCESS);
}
-/* Try to undo cwnd reduction, because D-SACKs acked all retransmitted data */
-static void tcp_try_undo_dsack(struct sock *sk, struct tcp_sock *tp)
+/* Sometimes we deduce that packets have been dropped due to reasons other than
+ * congestion, like path MTU reductions or failed client TFO attempts. In these
+ * cases we call this function to retransmit as many packets as cwnd allows,
+ * without reducing cwnd. Given that retransmits will set retrans_stamp to a
+ * non-zero value (and may do so in a later calling context due to TSQ), we
+ * also enter CA_Loss so that we track when all retransmitted packets are ACKed
+ * and clear retrans_stamp when that happens (to ensure later recurring RTOs
+ * are using the correct retrans_stamp and don't declare ETIMEDOUT
+ * prematurely).
+ */
+static void tcp_non_congestion_loss_retransmit(struct sock *sk)
{
- if (tp->undo_marker && !tp->undo_retrans) {
- DBGUNDO(sk, tp, "D-SACK");
- tcp_undo_cwr(sk, 1);
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (icsk->icsk_ca_state != TCP_CA_Loss) {
+ tp->high_seq = tp->snd_nxt;
+ tp->snd_ssthresh = tcp_current_ssthresh(sk);
+ tp->prior_ssthresh = 0;
tp->undo_marker = 0;
- NET_INC_STATS_BH(LINUX_MIB_TCPDSACKUNDO);
+ tcp_set_ca_state(sk, TCP_CA_Loss);
}
+ tcp_xmit_retransmit_queue(sk);
}
-/* Undo during fast recovery after partial ACK. */
-
-static int tcp_try_undo_partial(struct sock *sk, struct tcp_sock *tp,
- int acked)
+/* Do a simple retransmit without using the backoff mechanisms in
+ * tcp_timer. This is used for path mtu discovery.
+ * The socket is already locked here.
+ */
+void tcp_simple_retransmit(struct sock *sk)
{
- /* Partial ACK arrived. Force Hoe's retransmit. */
- int failed = IsReno(tp) || tp->fackets_out>tp->reordering;
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb;
+ int mss;
- if (tcp_may_undo(tp)) {
- /* Plain luck! Hole if filled with delayed
- * packet, rather than with a retransmit.
- */
- if (tp->retrans_out == 0)
- tp->retrans_stamp = 0;
+ /* A fastopen SYN request is stored as two separate packets within
+ * the retransmit queue, this is done by tcp_send_syn_data().
+ * As a result simply checking the MSS of the frames in the queue
+ * will not work for the SYN packet.
+ *
+ * Us being here is an indication of a path MTU issue so we can
+ * assume that the fastopen SYN was lost and just mark all the
+ * frames in the retransmit queue as lost. We will use an MSS of
+ * -1 to mark all frames as lost, otherwise compute the current MSS.
+ */
+ if (tp->syn_data && sk->sk_state == TCP_SYN_SENT)
+ mss = -1;
+ else
+ mss = tcp_current_mss(sk);
+
+ skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
+ if (tcp_skb_seglen(skb) > mss)
+ tcp_mark_skb_lost(sk, skb);
+ }
- tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1);
+ if (!tp->lost_out)
+ return;
- DBGUNDO(sk, tp, "Hoe");
- tcp_undo_cwr(sk, 0);
- NET_INC_STATS_BH(LINUX_MIB_TCPPARTIALUNDO);
+ if (tcp_is_reno(tp))
+ tcp_limit_reno_sacked(tp);
- /* So... Do not make Hoe's retransmit yet.
- * If the first packet was delayed, the rest
- * ones are most probably delayed as well.
- */
- failed = 0;
- }
- return failed;
+ tcp_verify_left_out(tp);
+
+ /* Don't muck with the congestion window here.
+ * Reason is that we do not increase amount of _data_
+ * in network, but units changed and effective
+ * cwnd/ssthresh really reduced now.
+ */
+ tcp_non_congestion_loss_retransmit(sk);
}
+EXPORT_IPV6_MOD(tcp_simple_retransmit);
-/* Undo during loss recovery after partial ACK. */
-static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp)
+void tcp_enter_recovery(struct sock *sk, bool ece_ack)
{
- if (tcp_may_undo(tp)) {
- struct sk_buff *skb;
- sk_stream_for_retrans_queue(skb, sk) {
- TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
- }
+ struct tcp_sock *tp = tcp_sk(sk);
+ int mib_idx;
- clear_all_retrans_hints(tp);
+ /* Start the clock with our fast retransmit, for undo and ETIMEDOUT. */
+ tcp_retrans_stamp_cleanup(sk);
- DBGUNDO(sk, tp, "partial loss");
- tp->lost_out = 0;
- tp->left_out = tp->sacked_out;
- tcp_undo_cwr(sk, 1);
- NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO);
- inet_csk(sk)->icsk_retransmits = 0;
- tp->undo_marker = 0;
- if (!IsReno(tp))
- tcp_set_ca_state(sk, TCP_CA_Open);
- return 1;
+ if (tcp_is_reno(tp))
+ mib_idx = LINUX_MIB_TCPRENORECOVERY;
+ else
+ mib_idx = LINUX_MIB_TCPSACKRECOVERY;
+
+ NET_INC_STATS(sock_net(sk), mib_idx);
+
+ tp->prior_ssthresh = 0;
+ tcp_init_undo(tp);
+
+ if (!tcp_in_cwnd_reduction(sk)) {
+ if (!ece_ack)
+ tp->prior_ssthresh = tcp_current_ssthresh(sk);
+ tcp_init_cwnd_reduction(sk);
}
- return 0;
+ tcp_set_ca_state(sk, TCP_CA_Recovery);
}
-static inline void tcp_complete_cwr(struct sock *sk)
+static void tcp_update_rto_time(struct tcp_sock *tp)
{
- struct tcp_sock *tp = tcp_sk(sk);
- tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
- tp->snd_cwnd_stamp = tcp_time_stamp;
- tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR);
+ if (tp->rto_stamp) {
+ tp->total_rto_time += tcp_time_stamp_ms(tp) - tp->rto_stamp;
+ tp->rto_stamp = 0;
+ }
}
-static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag)
+/* Process an ACK in CA_Loss state. Move to CA_Open if lost data are
+ * recovered or spurious. Otherwise retransmits more on partial ACKs.
+ */
+static void tcp_process_loss(struct sock *sk, int flag, int num_dupack,
+ int *rexmit)
{
- tp->left_out = tp->sacked_out;
-
- if (tp->retrans_out == 0)
- tp->retrans_stamp = 0;
-
- if (flag&FLAG_ECE)
- tcp_enter_cwr(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ bool recovered = !before(tp->snd_una, tp->high_seq);
- if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
- int state = TCP_CA_Open;
+ if ((flag & FLAG_SND_UNA_ADVANCED || rcu_access_pointer(tp->fastopen_rsk)) &&
+ tcp_try_undo_loss(sk, false))
+ return;
- if (tp->left_out || tp->retrans_out || tp->undo_marker)
- state = TCP_CA_Disorder;
+ if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */
+ /* Step 3.b. A timeout is spurious if not all data are
+ * lost, i.e., never-retransmitted data are (s)acked.
+ */
+ if ((flag & FLAG_ORIG_SACK_ACKED) &&
+ tcp_try_undo_loss(sk, true))
+ return;
- if (inet_csk(sk)->icsk_ca_state != state) {
- tcp_set_ca_state(sk, state);
+ if (after(tp->snd_nxt, tp->high_seq)) {
+ if (flag & FLAG_DATA_SACKED || num_dupack)
+ tp->frto = 0; /* Step 3.a. loss was real */
+ } else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) {
tp->high_seq = tp->snd_nxt;
+ /* Step 2.b. Try send new data (but deferred until cwnd
+ * is updated in tcp_ack()). Otherwise fall back to
+ * the conventional recovery.
+ */
+ if (!tcp_write_queue_empty(sk) &&
+ after(tcp_wnd_end(tp), tp->snd_nxt)) {
+ *rexmit = REXMIT_NEW;
+ return;
+ }
+ tp->frto = 0;
}
- tcp_moderate_cwnd(tp);
- } else {
- tcp_cwnd_down(sk);
}
+
+ if (recovered) {
+ /* F-RTO RFC5682 sec 3.1 step 2.a and 1st part of step 3.a */
+ tcp_try_undo_recovery(sk);
+ return;
+ }
+ if (tcp_is_reno(tp)) {
+ /* A Reno DUPACK means new data in F-RTO step 2.b above are
+ * delivered. Lower inflight to clock out (re)transmissions.
+ */
+ if (after(tp->snd_nxt, tp->high_seq) && num_dupack)
+ tcp_add_reno_sack(sk, num_dupack, flag & FLAG_ECE);
+ else if (flag & FLAG_SND_UNA_ADVANCED)
+ tcp_reset_reno_sack(tp);
+ }
+ *rexmit = REXMIT_LOST;
}
-static void tcp_mtup_probe_failed(struct sock *sk)
+/* Undo during fast recovery after partial ACK. */
+static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una)
{
- struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
- icsk->icsk_mtup.search_high = icsk->icsk_mtup.probe_size - 1;
- icsk->icsk_mtup.probe_size = 0;
+ if (tp->undo_marker && tcp_packet_delayed(tp)) {
+ /* Plain luck! Hole if filled with delayed
+ * packet, rather than with a retransmit. Check reordering.
+ */
+ tcp_check_sack_reordering(sk, prior_snd_una, 1);
+
+ /* We are getting evidence that the reordering degree is higher
+ * than we realized. If there are no retransmits out then we
+ * can undo. Otherwise we clock out new packets but do not
+ * mark more packets lost or retransmit more.
+ */
+ if (tp->retrans_out)
+ return true;
+
+ if (!tcp_any_retrans_done(sk))
+ tp->retrans_stamp = 0;
+
+ DBGUNDO(sk, "partial recovery");
+ tcp_undo_cwnd_reduction(sk, true);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO);
+ tcp_try_keep_open(sk);
+ }
+ return false;
}
-static void tcp_mtup_probe_success(struct sock *sk, struct sk_buff *skb)
+static void tcp_identify_packet_loss(struct sock *sk, int *ack_flag)
{
struct tcp_sock *tp = tcp_sk(sk);
- struct inet_connection_sock *icsk = inet_csk(sk);
- /* FIXME: breaks with very large cwnd */
- tp->prior_ssthresh = tcp_current_ssthresh(sk);
- tp->snd_cwnd = tp->snd_cwnd *
- tcp_mss_to_mtu(sk, tp->mss_cache) /
- icsk->icsk_mtup.probe_size;
- tp->snd_cwnd_cnt = 0;
- tp->snd_cwnd_stamp = tcp_time_stamp;
- tp->rcv_ssthresh = tcp_current_ssthresh(sk);
+ if (tcp_rtx_queue_empty(sk))
+ return;
- icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size;
- icsk->icsk_mtup.probe_size = 0;
- tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
-}
+ if (unlikely(tcp_is_reno(tp))) {
+ tcp_newreno_mark_lost(sk, *ack_flag & FLAG_SND_UNA_ADVANCED);
+ } else {
+ u32 prior_retrans = tp->retrans_out;
+ if (tcp_rack_mark_lost(sk))
+ *ack_flag &= ~FLAG_SET_XMIT_TIMER;
+ if (prior_retrans > tp->retrans_out)
+ *ack_flag |= FLAG_LOST_RETRANS;
+ }
+}
/* Process an event, which can update packets-in-flight not trivially.
* Main goal of this function is to calculate new estimate for left_out,
* taking into account both packets sitting in receiver's buffer and
* packets lost by network.
*
- * Besides that it does CWND reduction, when packet loss is detected
- * and changes state of machine.
+ * Besides that it updates the congestion state when packet loss or ECN
+ * is detected. But it does not reduce the cwnd, it is done by the
+ * congestion control later.
*
* It does _not_ decide what to send, it is made in function
* tcp_xmit_retransmit_queue().
*/
-static void
-tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
- int prior_packets, int flag)
+static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
+ int num_dupack, int *ack_flag, int *rexmit)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
- int is_dupack = (tp->snd_una == prior_snd_una && !(flag&FLAG_NOT_DUP));
+ int flag = *ack_flag;
+ bool ece_ack = flag & FLAG_ECE;
- /* Some technical things:
- * 1. Reno does not count dupacks (sacked_out) automatically. */
- if (!tp->packets_out)
+ if (!tp->packets_out && tp->sacked_out)
tp->sacked_out = 0;
- /* 2. SACK counts snd_fack in packets inaccurately. */
- if (tp->sacked_out == 0)
- tp->fackets_out = 0;
- /* Now state machine starts.
+ /* Now state machine starts.
* A. ECE, hence prohibit cwnd undoing, the reduction is required. */
- if (flag&FLAG_ECE)
+ if (ece_ack)
tp->prior_ssthresh = 0;
/* B. In all the states check for reneging SACKs. */
- if (tp->sacked_out && tcp_check_sack_reneging(sk))
+ if (tcp_check_sack_reneging(sk, ack_flag))
return;
- /* C. Process data loss notification, provided it is valid. */
- if ((flag&FLAG_DATA_LOST) &&
- before(tp->snd_una, tp->high_seq) &&
- icsk->icsk_ca_state != TCP_CA_Open &&
- tp->fackets_out > tp->reordering) {
- tcp_mark_head_lost(sk, tp, tp->fackets_out-tp->reordering, tp->high_seq);
- NET_INC_STATS_BH(LINUX_MIB_TCPLOSS);
- }
-
- /* D. Synchronize left_out to current state. */
- tcp_sync_left_out(tp);
+ /* C. Check consistency of the current state. */
+ tcp_verify_left_out(tp);
- /* E. Check state exit conditions. State can be terminated
+ /* D. Check state exit conditions. State can be terminated
* when high_seq is ACKed. */
if (icsk->icsk_ca_state == TCP_CA_Open) {
- if (!sysctl_tcp_frto)
- BUG_TRAP(tp->retrans_out == 0);
+ WARN_ON(tp->retrans_out != 0 && !tp->syn_data);
tp->retrans_stamp = 0;
} else if (!before(tp->snd_una, tp->high_seq)) {
switch (icsk->icsk_ca_state) {
- case TCP_CA_Loss:
- icsk->icsk_retransmits = 0;
- if (tcp_try_undo_recovery(sk, tp))
- return;
- break;
-
case TCP_CA_CWR:
/* CWR is to be held something *above* high_seq
* is ACKed for CWR bit to reach receiver. */
if (tp->snd_una != tp->high_seq) {
- tcp_complete_cwr(sk);
- tcp_set_ca_state(sk, TCP_CA_Open);
- }
- break;
-
- case TCP_CA_Disorder:
- tcp_try_undo_dsack(sk, tp);
- if (!tp->undo_marker ||
- /* For SACK case do not Open to allow to undo
- * catching for all duplicate ACKs. */
- IsReno(tp) || tp->snd_una != tp->high_seq) {
- tp->undo_marker = 0;
+ tcp_end_cwnd_reduction(sk);
tcp_set_ca_state(sk, TCP_CA_Open);
}
break;
case TCP_CA_Recovery:
- if (IsReno(tp))
+ if (tcp_is_reno(tp))
tcp_reset_reno_sack(tp);
- if (tcp_try_undo_recovery(sk, tp))
+ if (tcp_try_undo_recovery(sk))
return;
- tcp_complete_cwr(sk);
+ tcp_end_cwnd_reduction(sk);
break;
}
}
- /* F. Process state. */
+ /* E. Process state. */
switch (icsk->icsk_ca_state) {
case TCP_CA_Recovery:
- if (prior_snd_una == tp->snd_una) {
- if (IsReno(tp) && is_dupack)
- tcp_add_reno_sack(sk);
- } else {
- int acked = prior_packets - tp->packets_out;
- if (IsReno(tp))
- tcp_remove_reno_sacks(sk, tp, acked);
- is_dupack = tcp_try_undo_partial(sk, tp, acked);
+ if (!(flag & FLAG_SND_UNA_ADVANCED)) {
+ if (tcp_is_reno(tp))
+ tcp_add_reno_sack(sk, num_dupack, ece_ack);
+ } else if (tcp_try_undo_partial(sk, prior_snd_una))
+ return;
+
+ if (tcp_try_undo_dsack(sk))
+ tcp_try_to_open(sk, flag);
+
+ tcp_identify_packet_loss(sk, ack_flag);
+ if (icsk->icsk_ca_state != TCP_CA_Recovery) {
+ if (!tcp_time_to_recover(tp))
+ return;
+ /* Undo reverts the recovery state. If loss is evident,
+ * starts a new recovery (e.g. reordering then loss);
+ */
+ tcp_enter_recovery(sk, ece_ack);
}
break;
case TCP_CA_Loss:
- if (flag&FLAG_DATA_ACKED)
- icsk->icsk_retransmits = 0;
- if (!tcp_try_undo_loss(sk, tp)) {
- tcp_moderate_cwnd(tp);
- tcp_xmit_retransmit_queue(sk);
- return;
- }
- if (icsk->icsk_ca_state != TCP_CA_Open)
+ tcp_process_loss(sk, flag, num_dupack, rexmit);
+ if (icsk->icsk_ca_state != TCP_CA_Loss)
+ tcp_update_rto_time(tp);
+ tcp_identify_packet_loss(sk, ack_flag);
+ if (!(icsk->icsk_ca_state == TCP_CA_Open ||
+ (*ack_flag & FLAG_LOST_RETRANS)))
return;
- /* Loss is undone; fall through to processing in Open state. */
+ /* Change state if cwnd is undone or retransmits are lost */
+ fallthrough;
default:
- if (IsReno(tp)) {
- if (tp->snd_una != prior_snd_una)
+ if (tcp_is_reno(tp)) {
+ if (flag & FLAG_SND_UNA_ADVANCED)
tcp_reset_reno_sack(tp);
- if (is_dupack)
- tcp_add_reno_sack(sk);
+ tcp_add_reno_sack(sk, num_dupack, ece_ack);
}
- if (icsk->icsk_ca_state == TCP_CA_Disorder)
- tcp_try_undo_dsack(sk, tp);
+ if (icsk->icsk_ca_state <= TCP_CA_Disorder)
+ tcp_try_undo_dsack(sk);
- if (!tcp_time_to_recover(sk, tp)) {
- tcp_try_to_open(sk, tp, flag);
+ tcp_identify_packet_loss(sk, ack_flag);
+ if (!tcp_time_to_recover(tp)) {
+ tcp_try_to_open(sk, flag);
return;
}
@@ -2068,216 +3224,250 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
tp->snd_una == tp->mtu_probe.probe_seq_start) {
tcp_mtup_probe_failed(sk);
/* Restores the reduction we did in tcp_mtup_probe() */
- tp->snd_cwnd++;
+ tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + 1);
tcp_simple_retransmit(sk);
return;
}
/* Otherwise enter Recovery state */
+ tcp_enter_recovery(sk, ece_ack);
+ }
- if (IsReno(tp))
- NET_INC_STATS_BH(LINUX_MIB_TCPRENORECOVERY);
- else
- NET_INC_STATS_BH(LINUX_MIB_TCPSACKRECOVERY);
+ *rexmit = REXMIT_LOST;
+}
- tp->high_seq = tp->snd_nxt;
- tp->prior_ssthresh = 0;
- tp->undo_marker = tp->snd_una;
- tp->undo_retrans = tp->retrans_out;
-
- if (icsk->icsk_ca_state < TCP_CA_CWR) {
- if (!(flag&FLAG_ECE))
- tp->prior_ssthresh = tcp_current_ssthresh(sk);
- tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
- TCP_ECN_queue_cwr(tp);
- }
+static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us, const int flag)
+{
+ u32 wlen = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_rtt_wlen) * HZ;
+ struct tcp_sock *tp = tcp_sk(sk);
- tp->bytes_acked = 0;
- tp->snd_cwnd_cnt = 0;
- tcp_set_ca_state(sk, TCP_CA_Recovery);
+ if ((flag & FLAG_ACK_MAYBE_DELAYED) && rtt_us > tcp_min_rtt(tp)) {
+ /* If the remote keeps returning delayed ACKs, eventually
+ * the min filter would pick it up and overestimate the
+ * prop. delay when it expires. Skip suspected delayed ACKs.
+ */
+ return;
}
-
- if (is_dupack || tcp_head_timedout(sk, tp))
- tcp_update_scoreboard(sk, tp);
- tcp_cwnd_down(sk);
- tcp_xmit_retransmit_queue(sk);
+ minmax_running_min(&tp->rtt_min, wlen, tcp_jiffies32,
+ rtt_us ? : jiffies_to_usecs(1));
}
-/* Read draft-ietf-tcplw-high-performance before mucking
- * with this code. (Supersedes RFC1323)
- */
-static void tcp_ack_saw_tstamp(struct sock *sk, int flag)
+static bool tcp_ack_update_rtt(struct sock *sk, const int flag,
+ long seq_rtt_us, long sack_rtt_us,
+ long ca_rtt_us, struct rate_sample *rs)
{
+ const struct tcp_sock *tp = tcp_sk(sk);
+
+ /* Prefer RTT measured from ACK's timing to TS-ECR. This is because
+ * broken middle-boxes or peers may corrupt TS-ECR fields. But
+ * Karn's algorithm forbids taking RTT if some retransmitted data
+ * is acked (RFC6298).
+ */
+ if (seq_rtt_us < 0)
+ seq_rtt_us = sack_rtt_us;
+
/* RTTM Rule: A TSecr value received in a segment is used to
* update the averaged RTT measurement only if the segment
* acknowledges some new data, i.e., only if it advances the
* left edge of the send window.
- *
* See draft-ietf-tcplw-high-performance-00, section 3.3.
- * 1998/04/10 Andrey V. Savochkin <saw@msu.ru>
- *
- * Changed: reset backoff as soon as we see the first valid sample.
- * If we do not, we get strongly overestimated rto. With timestamps
- * samples are accepted even from very old segments: f.e., when rtt=1
- * increases to 8, we retransmit 5 times and after 8 seconds delayed
- * answer arrives rto becomes 120 seconds! If at least one of segments
- * in window is lost... Voila. --ANK (010210)
*/
- struct tcp_sock *tp = tcp_sk(sk);
- const __u32 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
- tcp_rtt_estimator(sk, seq_rtt);
+ if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp &&
+ tp->rx_opt.rcv_tsecr && flag & FLAG_ACKED)
+ seq_rtt_us = ca_rtt_us = tcp_rtt_tsopt_us(tp, 1);
+
+ rs->rtt_us = ca_rtt_us; /* RTT of last (S)ACKed packet (or -1) */
+ if (seq_rtt_us < 0)
+ return false;
+
+ /* ca_rtt_us >= 0 is counting on the invariant that ca_rtt_us is
+ * always taken together with ACK, SACK, or TS-opts. Any negative
+ * values will be skipped with the seq_rtt_us < 0 check above.
+ */
+ tcp_update_rtt_min(sk, ca_rtt_us, flag);
+ tcp_rtt_estimator(sk, seq_rtt_us);
tcp_set_rto(sk);
+
+ /* RFC6298: only reset backoff on valid RTT measurement. */
inet_csk(sk)->icsk_backoff = 0;
- tcp_bound_rto(sk);
+ return true;
}
-static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, int flag)
+/* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. */
+void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req)
{
- /* We don't have a timestamp. Can only use
- * packets that are not retransmitted to determine
- * rtt estimates. Also, we must not reset the
- * backoff for rto until we get a non-retransmitted
- * packet. This allows us to deal with a situation
- * where the network delay has increased suddenly.
- * I.e. Karn's algorithm. (SIGCOMM '87, p5.)
- */
+ struct rate_sample rs;
+ long rtt_us = -1L;
- if (flag & FLAG_RETRANS_DATA_ACKED)
- return;
+ if (req && !req->num_retrans && tcp_rsk(req)->snt_synack)
+ rtt_us = tcp_stamp_us_delta(tcp_clock_us(), tcp_rsk(req)->snt_synack);
- tcp_rtt_estimator(sk, seq_rtt);
- tcp_set_rto(sk);
- inet_csk(sk)->icsk_backoff = 0;
- tcp_bound_rto(sk);
+ tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, rtt_us, -1L, rtt_us, &rs);
}
-static inline void tcp_ack_update_rtt(struct sock *sk, const int flag,
- const s32 seq_rtt)
-{
- const struct tcp_sock *tp = tcp_sk(sk);
- /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */
- if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
- tcp_ack_saw_tstamp(sk, flag);
- else if (seq_rtt >= 0)
- tcp_ack_no_tstamp(sk, seq_rtt, flag);
-}
-static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
- u32 in_flight, int good)
+static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
- icsk->icsk_ca_ops->cong_avoid(sk, ack, rtt, in_flight, good);
- tcp_sk(sk)->snd_cwnd_stamp = tcp_time_stamp;
+
+ icsk->icsk_ca_ops->cong_avoid(sk, ack, acked);
+ tcp_sk(sk)->snd_cwnd_stamp = tcp_jiffies32;
}
/* Restart timer after forward progress on connection.
* RFC2988 recommends to restart timer to now+rto.
*/
-
-static void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp)
+void tcp_rearm_rto(struct sock *sk)
{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* If the retrans timer is currently being used by Fast Open
+ * for SYN-ACK retrans purpose, stay put.
+ */
+ if (rcu_access_pointer(tp->fastopen_rsk))
+ return;
+
if (!tp->packets_out) {
inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
} else {
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
+ u32 rto = inet_csk(sk)->icsk_rto;
+ /* Offset the time elapsed after installing regular RTO */
+ if (icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
+ icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
+ s64 delta_us = tcp_rto_delta_us(sk);
+ /* delta_us may not be positive if the socket is locked
+ * when the retrans timer fires and is rescheduled.
+ */
+ rto = usecs_to_jiffies(max_t(int, delta_us, 1));
+ }
+ tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto, true);
}
}
-static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb,
- __u32 now, __s32 *seq_rtt)
+/* Try to schedule a loss probe; if that doesn't work, then schedule an RTO. */
+static void tcp_set_xmit_timer(struct sock *sk)
+{
+ if (!tcp_schedule_loss_probe(sk, true))
+ tcp_rearm_rto(sk);
+}
+
+/* If we get here, the whole TSO packet has not been acked. */
+static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
- struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
- __u32 seq = tp->snd_una;
- __u32 packets_acked;
- int acked = 0;
+ u32 packets_acked;
- /* If we get here, the whole TSO packet has not been
- * acked.
- */
- BUG_ON(!after(scb->end_seq, seq));
+ BUG_ON(!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una));
packets_acked = tcp_skb_pcount(skb);
- if (tcp_trim_head(sk, skb, seq - scb->seq))
+ if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
return 0;
packets_acked -= tcp_skb_pcount(skb);
if (packets_acked) {
- __u8 sacked = scb->sacked;
-
- acked |= FLAG_DATA_ACKED;
- if (sacked) {
- if (sacked & TCPCB_RETRANS) {
- if (sacked & TCPCB_SACKED_RETRANS)
- tp->retrans_out -= packets_acked;
- acked |= FLAG_RETRANS_DATA_ACKED;
- *seq_rtt = -1;
- } else if (*seq_rtt < 0)
- *seq_rtt = now - scb->when;
- if (sacked & TCPCB_SACKED_ACKED)
- tp->sacked_out -= packets_acked;
- if (sacked & TCPCB_LOST)
- tp->lost_out -= packets_acked;
- if (sacked & TCPCB_URG) {
- if (tp->urg_mode &&
- !before(seq, tp->snd_up))
- tp->urg_mode = 0;
- }
- } else if (*seq_rtt < 0)
- *seq_rtt = now - scb->when;
-
- if (tp->fackets_out) {
- __u32 dval = min(tp->fackets_out, packets_acked);
- tp->fackets_out -= dval;
- }
- tp->packets_out -= packets_acked;
-
BUG_ON(tcp_skb_pcount(skb) == 0);
- BUG_ON(!before(scb->seq, scb->end_seq));
+ BUG_ON(!before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq));
}
- return acked;
+ return packets_acked;
}
-static u32 tcp_usrtt(struct timeval *tv)
+static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
+ const struct sk_buff *ack_skb, u32 prior_snd_una)
{
- struct timeval now;
+ const struct skb_shared_info *shinfo;
+
+ /* Avoid cache line misses to get skb_shinfo() and shinfo->tx_flags */
+ if (likely(!TCP_SKB_CB(skb)->txstamp_ack))
+ return;
- do_gettimeofday(&now);
- return (now.tv_sec - tv->tv_sec) * 1000000 + (now.tv_usec - tv->tv_usec);
+ shinfo = skb_shinfo(skb);
+ if (!before(shinfo->tskey, prior_snd_una) &&
+ before(shinfo->tskey, tcp_sk(sk)->snd_una)) {
+ tcp_skb_tsorted_save(skb) {
+ __skb_tstamp_tx(skb, ack_skb, NULL, sk, SCM_TSTAMP_ACK);
+ } tcp_skb_tsorted_restore(skb);
+ }
}
-/* Remove acknowledged frames from the retransmission queue. */
-static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
+/* Remove acknowledged frames from the retransmission queue. If our packet
+ * is before the ack sequence we can discard it as it's confirmed to have
+ * arrived at the other end.
+ */
+static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb,
+ u32 prior_fack, u32 prior_snd_una,
+ struct tcp_sacktag_state *sack, bool ece_ack)
{
- struct tcp_sock *tp = tcp_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk);
- struct sk_buff *skb;
- __u32 now = tcp_time_stamp;
- int acked = 0;
- __s32 seq_rtt = -1;
+ u64 first_ackt, last_ackt;
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 prior_sacked = tp->sacked_out;
+ u32 reord = tp->snd_nxt; /* lowest acked un-retx un-sacked seq */
+ struct sk_buff *skb, *next;
+ bool fully_acked = true;
+ long sack_rtt_us = -1L;
+ long seq_rtt_us = -1L;
+ long ca_rtt_us = -1L;
u32 pkts_acked = 0;
- void (*rtt_sample)(struct sock *sk, u32 usrtt)
- = icsk->icsk_ca_ops->rtt_sample;
- struct timeval tv = { .tv_sec = 0, .tv_usec = 0 };
-
- while ((skb = skb_peek(&sk->sk_write_queue)) &&
- skb != sk->sk_send_head) {
- struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
- __u8 sacked = scb->sacked;
-
- /* If our packet is before the ack sequence we can
- * discard it as it's confirmed to have arrived at
- * the other end.
- */
+ bool rtt_update;
+ int flag = 0;
+
+ first_ackt = 0;
+
+ for (skb = skb_rb_first(&sk->tcp_rtx_queue); skb; skb = next) {
+ struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
+ const u32 start_seq = scb->seq;
+ u8 sacked = scb->sacked;
+ u32 acked_pcount;
+
+ /* Determine how many packets and what bytes were acked, tso and else */
if (after(scb->end_seq, tp->snd_una)) {
- if (tcp_skb_pcount(skb) > 1 &&
- after(tp->snd_una, scb->seq))
- acked |= tcp_tso_acked(sk, skb,
- now, &seq_rtt);
- break;
+ if (tcp_skb_pcount(skb) == 1 ||
+ !after(tp->snd_una, scb->seq))
+ break;
+
+ acked_pcount = tcp_tso_acked(sk, skb);
+ if (!acked_pcount)
+ break;
+ fully_acked = false;
+ } else {
+ acked_pcount = tcp_skb_pcount(skb);
+ }
+
+ if (unlikely(sacked & TCPCB_RETRANS)) {
+ if (sacked & TCPCB_SACKED_RETRANS)
+ tp->retrans_out -= acked_pcount;
+ flag |= FLAG_RETRANS_DATA_ACKED;
+ } else if (!(sacked & TCPCB_SACKED_ACKED)) {
+ last_ackt = tcp_skb_timestamp_us(skb);
+ WARN_ON_ONCE(last_ackt == 0);
+ if (!first_ackt)
+ first_ackt = last_ackt;
+
+ if (before(start_seq, reord))
+ reord = start_seq;
+ if (!after(scb->end_seq, tp->high_seq))
+ flag |= FLAG_ORIG_SACK_ACKED;
}
+ if (sacked & TCPCB_SACKED_ACKED) {
+ tp->sacked_out -= acked_pcount;
+ /* snd_una delta covers these skbs */
+ sack->delivered_bytes -= skb->len;
+ } else if (tcp_is_sack(tp)) {
+ tcp_count_delivered(tp, acked_pcount, ece_ack);
+ if (!tcp_skb_spurious_retrans(tp, skb))
+ tcp_rack_advance(tp, sacked, scb->end_seq,
+ tcp_skb_timestamp_us(skb));
+ }
+ if (sacked & TCPCB_LOST)
+ tp->lost_out -= acked_pcount;
+
+ tp->packets_out -= acked_pcount;
+ pkts_acked += acked_pcount;
+ tcp_rate_skb_delivered(sk, skb, sack->rate);
+
/* Initial outgoing SYN's get put onto the write_queue
* just like anything else we transmit. It is not
* true data, and if we misinform our callers that
@@ -2285,131 +3475,268 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
* connection startup slow start one packet too
* quickly. This is severely frowned upon behavior.
*/
- if (!(scb->flags & TCPCB_FLAG_SYN)) {
- acked |= FLAG_DATA_ACKED;
- ++pkts_acked;
+ if (likely(!(scb->tcp_flags & TCPHDR_SYN))) {
+ flag |= FLAG_DATA_ACKED;
} else {
- acked |= FLAG_SYN_ACKED;
+ flag |= FLAG_SYN_ACKED;
tp->retrans_stamp = 0;
}
- /* MTU probing checks */
- if (icsk->icsk_mtup.probe_size) {
- if (!after(tp->mtu_probe.probe_seq_end, TCP_SKB_CB(skb)->end_seq)) {
- tcp_mtup_probe_success(sk, skb);
- }
+ if (!fully_acked)
+ break;
+
+ tcp_ack_tstamp(sk, skb, ack_skb, prior_snd_una);
+
+ next = skb_rb_next(skb);
+ if (unlikely(skb == tp->retransmit_skb_hint))
+ tp->retransmit_skb_hint = NULL;
+ tcp_highest_sack_replace(sk, skb, next);
+ tcp_rtx_queue_unlink_and_free(skb, sk);
+ }
+
+ if (!skb)
+ tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
+
+ if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una)))
+ tp->snd_up = tp->snd_una;
+
+ if (skb) {
+ tcp_ack_tstamp(sk, skb, ack_skb, prior_snd_una);
+ if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
+ flag |= FLAG_SACK_RENEGING;
+ }
+
+ if (likely(first_ackt) && !(flag & FLAG_RETRANS_DATA_ACKED)) {
+ seq_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, first_ackt);
+ ca_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, last_ackt);
+
+ if (pkts_acked == 1 && fully_acked && !prior_sacked &&
+ (tp->snd_una - prior_snd_una) < tp->mss_cache &&
+ sack->rate->prior_delivered + 1 == tp->delivered &&
+ !(flag & (FLAG_CA_ALERT | FLAG_SYN_ACKED))) {
+ /* Conservatively mark a delayed ACK. It's typically
+ * from a lone runt packet over the round trip to
+ * a receiver w/o out-of-order or CE events.
+ */
+ flag |= FLAG_ACK_MAYBE_DELAYED;
+ }
+ }
+ if (sack->first_sackt) {
+ sack_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, sack->first_sackt);
+ ca_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, sack->last_sackt);
+ }
+ rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us,
+ ca_rtt_us, sack->rate);
+
+ if (flag & FLAG_ACKED) {
+ flag |= FLAG_SET_XMIT_TIMER; /* set TLP or RTO timer */
+ if (unlikely(icsk->icsk_mtup.probe_size &&
+ !after(tp->mtu_probe.probe_seq_end, tp->snd_una))) {
+ tcp_mtup_probe_success(sk);
}
- if (sacked) {
- if (sacked & TCPCB_RETRANS) {
- if(sacked & TCPCB_SACKED_RETRANS)
- tp->retrans_out -= tcp_skb_pcount(skb);
- acked |= FLAG_RETRANS_DATA_ACKED;
- seq_rtt = -1;
- } else if (seq_rtt < 0) {
- seq_rtt = now - scb->when;
- skb_get_timestamp(skb, &tv);
- }
- if (sacked & TCPCB_SACKED_ACKED)
- tp->sacked_out -= tcp_skb_pcount(skb);
- if (sacked & TCPCB_LOST)
- tp->lost_out -= tcp_skb_pcount(skb);
- if (sacked & TCPCB_URG) {
- if (tp->urg_mode &&
- !before(scb->end_seq, tp->snd_up))
- tp->urg_mode = 0;
- }
- } else if (seq_rtt < 0) {
- seq_rtt = now - scb->when;
- skb_get_timestamp(skb, &tv);
+ if (tcp_is_reno(tp)) {
+ tcp_remove_reno_sacks(sk, pkts_acked, ece_ack);
+
+ /* If any of the cumulatively ACKed segments was
+ * retransmitted, non-SACK case cannot confirm that
+ * progress was due to original transmission due to
+ * lack of TCPCB_SACKED_ACKED bits even if some of
+ * the packets may have been never retransmitted.
+ */
+ if (flag & FLAG_RETRANS_DATA_ACKED)
+ flag &= ~FLAG_ORIG_SACK_ACKED;
+ } else {
+ /* Non-retransmitted hole got filled? That's reordering */
+ if (before(reord, prior_fack))
+ tcp_check_sack_reordering(sk, reord, 0);
}
- tcp_dec_pcount_approx(&tp->fackets_out, skb);
- tcp_packets_out_dec(tp, skb);
- __skb_unlink(skb, &sk->sk_write_queue);
- sk_stream_free_skb(sk, skb);
- clear_all_retrans_hints(tp);
+
+ sack->delivered_bytes = (skb ?
+ TCP_SKB_CB(skb)->seq : tp->snd_una) -
+ prior_snd_una;
+ } else if (skb && rtt_update && sack_rtt_us >= 0 &&
+ sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp,
+ tcp_skb_timestamp_us(skb))) {
+ /* Do not re-arm RTO if the sack RTT is measured from data sent
+ * after when the head was last (re)transmitted. Otherwise the
+ * timeout may continue to extend in loss recovery.
+ */
+ flag |= FLAG_SET_XMIT_TIMER; /* set TLP or RTO timer */
}
- if (acked&FLAG_ACKED) {
- tcp_ack_update_rtt(sk, acked, seq_rtt);
- tcp_ack_packets_out(sk, tp);
- if (rtt_sample && !(acked & FLAG_RETRANS_DATA_ACKED))
- (*rtt_sample)(sk, tcp_usrtt(&tv));
+ if (icsk->icsk_ca_ops->pkts_acked) {
+ struct ack_sample sample = { .pkts_acked = pkts_acked,
+ .rtt_us = sack->rate->rtt_us };
- if (icsk->icsk_ca_ops->pkts_acked)
- icsk->icsk_ca_ops->pkts_acked(sk, pkts_acked);
+ sample.in_flight = tp->mss_cache *
+ (tp->delivered - sack->rate->prior_delivered);
+ icsk->icsk_ca_ops->pkts_acked(sk, &sample);
}
#if FASTRETRANS_DEBUG > 0
- BUG_TRAP((int)tp->sacked_out >= 0);
- BUG_TRAP((int)tp->lost_out >= 0);
- BUG_TRAP((int)tp->retrans_out >= 0);
- if (!tp->packets_out && tp->rx_opt.sack_ok) {
- const struct inet_connection_sock *icsk = inet_csk(sk);
+ WARN_ON((int)tp->sacked_out < 0);
+ WARN_ON((int)tp->lost_out < 0);
+ WARN_ON((int)tp->retrans_out < 0);
+ if (!tp->packets_out && tcp_is_sack(tp)) {
+ icsk = inet_csk(sk);
if (tp->lost_out) {
- printk(KERN_DEBUG "Leak l=%u %d\n",
- tp->lost_out, icsk->icsk_ca_state);
+ pr_debug("Leak l=%u %d\n",
+ tp->lost_out, icsk->icsk_ca_state);
tp->lost_out = 0;
}
if (tp->sacked_out) {
- printk(KERN_DEBUG "Leak s=%u %d\n",
- tp->sacked_out, icsk->icsk_ca_state);
+ pr_debug("Leak s=%u %d\n",
+ tp->sacked_out, icsk->icsk_ca_state);
tp->sacked_out = 0;
}
if (tp->retrans_out) {
- printk(KERN_DEBUG "Leak r=%u %d\n",
- tp->retrans_out, icsk->icsk_ca_state);
+ pr_debug("Leak r=%u %d\n",
+ tp->retrans_out, icsk->icsk_ca_state);
tp->retrans_out = 0;
}
}
#endif
- *seq_rtt_p = seq_rtt;
- return acked;
+ return flag;
}
static void tcp_ack_probe(struct sock *sk)
{
- const struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
+ struct sk_buff *head = tcp_send_head(sk);
+ const struct tcp_sock *tp = tcp_sk(sk);
/* Was it a usable window open? */
-
- if (!after(TCP_SKB_CB(sk->sk_send_head)->end_seq,
- tp->snd_una + tp->snd_wnd)) {
+ if (!head)
+ return;
+ if (!after(TCP_SKB_CB(head)->end_seq, tcp_wnd_end(tp))) {
icsk->icsk_backoff = 0;
+ icsk->icsk_probes_tstamp = 0;
inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0);
/* Socket must be waked up by subsequent tcp_data_snd_check().
* This function is not for random using!
*/
} else {
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
- min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX),
- TCP_RTO_MAX);
+ unsigned long when = tcp_probe0_when(sk, tcp_rto_max(sk));
+
+ when = tcp_clamp_probe0_to_user_timeout(sk, when);
+ tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, when, true);
}
}
-static inline int tcp_ack_is_dubious(const struct sock *sk, const int flag)
+static inline bool tcp_ack_is_dubious(const struct sock *sk, const int flag)
{
- return (!(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) ||
- inet_csk(sk)->icsk_ca_state != TCP_CA_Open);
+ return !(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) ||
+ inet_csk(sk)->icsk_ca_state != TCP_CA_Open;
}
-static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag)
+/* Decide wheather to run the increase function of congestion control. */
+static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
{
- const struct tcp_sock *tp = tcp_sk(sk);
- return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) &&
- !((1 << inet_csk(sk)->icsk_ca_state) & (TCPF_CA_Recovery | TCPF_CA_CWR));
+ /* If reordering is high then always grow cwnd whenever data is
+ * delivered regardless of its ordering. Otherwise stay conservative
+ * and only grow cwnd on in-order delivery (RFC5681). A stretched ACK w/
+ * new SACK or ECE mark may first advance cwnd here and later reduce
+ * cwnd in tcp_fastretrans_alert() based on more states.
+ */
+ if (tcp_sk(sk)->reordering >
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reordering))
+ return flag & FLAG_FORWARD_PROGRESS;
+
+ return flag & FLAG_DATA_ACKED;
+}
+
+/* The "ultimate" congestion control function that aims to replace the rigid
+ * cwnd increase and decrease control (tcp_cong_avoid,tcp_*cwnd_reduction).
+ * It's called toward the end of processing an ACK with precise rate
+ * information. All transmission or retransmission are delayed afterwards.
+ */
+static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked,
+ int flag, const struct rate_sample *rs)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+
+ if (icsk->icsk_ca_ops->cong_control) {
+ icsk->icsk_ca_ops->cong_control(sk, ack, flag, rs);
+ return;
+ }
+
+ if (tcp_in_cwnd_reduction(sk)) {
+ /* Reduce cwnd if state mandates */
+ tcp_cwnd_reduction(sk, acked_sacked, rs->losses, flag);
+ } else if (tcp_may_raise_cwnd(sk, flag)) {
+ /* Advance cwnd if state allows */
+ tcp_cong_avoid(sk, ack, acked_sacked);
+ }
+ tcp_update_pacing_rate(sk);
}
/* Check that window update is acceptable.
* The function assumes that snd_una<=ack<=snd_next.
*/
-static inline int tcp_may_update_window(const struct tcp_sock *tp, const u32 ack,
- const u32 ack_seq, const u32 nwin)
+static inline bool tcp_may_update_window(const struct tcp_sock *tp,
+ const u32 ack, const u32 ack_seq,
+ const u32 nwin)
{
- return (after(ack, tp->snd_una) ||
+ return after(ack, tp->snd_una) ||
after(ack_seq, tp->snd_wl1) ||
- (ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd));
+ (ack_seq == tp->snd_wl1 && (nwin > tp->snd_wnd || !nwin));
+}
+
+static void tcp_snd_sne_update(struct tcp_sock *tp, u32 ack)
+{
+#ifdef CONFIG_TCP_AO
+ struct tcp_ao_info *ao;
+
+ if (!static_branch_unlikely(&tcp_ao_needed.key))
+ return;
+
+ ao = rcu_dereference_protected(tp->ao_info,
+ lockdep_sock_is_held((struct sock *)tp));
+ if (ao && ack < tp->snd_una) {
+ ao->snd_sne++;
+ trace_tcp_ao_snd_sne_update((struct sock *)tp, ao->snd_sne);
+ }
+#endif
+}
+
+/* If we update tp->snd_una, also update tp->bytes_acked */
+static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack)
+{
+ u32 delta = ack - tp->snd_una;
+
+ sock_owned_by_me((struct sock *)tp);
+ tp->bytes_acked += delta;
+ tcp_snd_sne_update(tp, ack);
+ tp->snd_una = ack;
+}
+
+static void tcp_rcv_sne_update(struct tcp_sock *tp, u32 seq)
+{
+#ifdef CONFIG_TCP_AO
+ struct tcp_ao_info *ao;
+
+ if (!static_branch_unlikely(&tcp_ao_needed.key))
+ return;
+
+ ao = rcu_dereference_protected(tp->ao_info,
+ lockdep_sock_is_held((struct sock *)tp));
+ if (ao && seq < tp->rcv_nxt) {
+ ao->rcv_sne++;
+ trace_tcp_ao_rcv_sne_update((struct sock *)tp, ao->rcv_sne);
+ }
+#endif
+}
+
+/* If we update tp->rcv_nxt, also update tp->bytes_received */
+static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq)
+{
+ u32 delta = seq - tp->rcv_nxt;
+
+ sock_owned_by_me((struct sock *)tp);
+ tp->bytes_received += delta;
+ tcp_rcv_sne_update(tp, seq);
+ WRITE_ONCE(tp->rcv_nxt, seq);
}
/* Update our send window.
@@ -2417,18 +3744,19 @@ static inline int tcp_may_update_window(const struct tcp_sock *tp, const u32 ack
* Window update algorithm, described in RFC793/RFC1122 (used in linux-2.2
* and in FreeBSD. NetBSD's one is even worse.) is wrong.
*/
-static int tcp_ack_update_window(struct sock *sk, struct tcp_sock *tp,
- struct sk_buff *skb, u32 ack, u32 ack_seq)
+static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 ack,
+ u32 ack_seq)
{
+ struct tcp_sock *tp = tcp_sk(sk);
int flag = 0;
- u32 nwin = ntohs(skb->h.th->window);
+ u32 nwin = ntohs(tcp_hdr(skb)->window);
- if (likely(!skb->h.th->syn))
+ if (likely(!tcp_hdr(skb)->syn))
nwin <<= tp->rx_opt.snd_wscale;
if (tcp_may_update_window(tp, ack, ack_seq, nwin)) {
flag |= FLAG_WIN_UPDATE;
- tcp_update_wl(tp, ack, ack_seq);
+ tcp_update_wl(tp, ack_seq);
if (tp->snd_wnd != nwin) {
tp->snd_wnd = nwin;
@@ -2437,7 +3765,10 @@ static int tcp_ack_update_window(struct sock *sk, struct tcp_sock *tp,
* fast path is recovered for sending TCP.
*/
tp->pred_flags = 0;
- tcp_fast_path_check(sk, tp);
+ tcp_fast_path_check(sk);
+
+ if (!tcp_write_queue_empty(sk))
+ tcp_slow_start_after_idle_check(sk);
if (nwin > tp->max_window) {
tp->max_window = nwin;
@@ -2446,299 +3777,749 @@ static int tcp_ack_update_window(struct sock *sk, struct tcp_sock *tp,
}
}
- tp->snd_una = ack;
+ tcp_snd_una_update(tp, ack);
return flag;
}
-static void tcp_process_frto(struct sock *sk, u32 prior_snd_una)
+static bool __tcp_oow_rate_limited(struct net *net, int mib_idx,
+ u32 *last_oow_ack_time)
+{
+ /* Paired with the WRITE_ONCE() in this function. */
+ u32 val = READ_ONCE(*last_oow_ack_time);
+
+ if (val) {
+ s32 elapsed = (s32)(tcp_jiffies32 - val);
+
+ if (0 <= elapsed &&
+ elapsed < READ_ONCE(net->ipv4.sysctl_tcp_invalid_ratelimit)) {
+ NET_INC_STATS(net, mib_idx);
+ return true; /* rate-limited: don't send yet! */
+ }
+ }
+
+ /* Paired with the prior READ_ONCE() and with itself,
+ * as we might be lockless.
+ */
+ WRITE_ONCE(*last_oow_ack_time, tcp_jiffies32);
+
+ return false; /* not rate-limited: go ahead, send dupack now! */
+}
+
+/* Return true if we're currently rate-limiting out-of-window ACKs and
+ * thus shouldn't send a dupack right now. We rate-limit dupacks in
+ * response to out-of-window SYNs or ACKs to mitigate ACK loops or DoS
+ * attacks that send repeated SYNs or ACKs for the same connection. To
+ * do this, we do not send a duplicate SYNACK or ACK if the remote
+ * endpoint is sending out-of-window SYNs or pure ACKs at a high rate.
+ */
+bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb,
+ int mib_idx, u32 *last_oow_ack_time)
+{
+ /* Data packets without SYNs are not likely part of an ACK loop. */
+ if ((TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq) &&
+ !tcp_hdr(skb)->syn)
+ return false;
+
+ return __tcp_oow_rate_limited(net, mib_idx, last_oow_ack_time);
+}
+
+static void tcp_send_ack_reflect_ect(struct sock *sk, bool accecn_reflector)
{
struct tcp_sock *tp = tcp_sk(sk);
-
- tcp_sync_left_out(tp);
-
- if (tp->snd_una == prior_snd_una ||
- !before(tp->snd_una, tp->frto_highmark)) {
- /* RTO was caused by loss, start retransmitting in
- * go-back-N slow start
- */
- tcp_enter_frto_loss(sk);
+ u16 flags = 0;
+
+ if (accecn_reflector)
+ flags = tcp_accecn_reflector_flags(tp->syn_ect_rcv);
+ __tcp_send_ack(sk, tp->rcv_nxt, flags);
+}
+
+/* RFC 5961 7 [ACK Throttling] */
+static void tcp_send_challenge_ack(struct sock *sk, bool accecn_reflector)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct net *net = sock_net(sk);
+ u32 count, now, ack_limit;
+
+ /* First check our per-socket dupack rate limit. */
+ if (__tcp_oow_rate_limited(net,
+ LINUX_MIB_TCPACKSKIPPEDCHALLENGE,
+ &tp->last_oow_ack_time))
return;
+
+ ack_limit = READ_ONCE(net->ipv4.sysctl_tcp_challenge_ack_limit);
+ if (ack_limit == INT_MAX)
+ goto send_ack;
+
+ /* Then check host-wide RFC 5961 rate limit. */
+ now = jiffies / HZ;
+ if (now != READ_ONCE(net->ipv4.tcp_challenge_timestamp)) {
+ u32 half = (ack_limit + 1) >> 1;
+
+ WRITE_ONCE(net->ipv4.tcp_challenge_timestamp, now);
+ WRITE_ONCE(net->ipv4.tcp_challenge_count,
+ get_random_u32_inclusive(half, ack_limit + half - 1));
}
+ count = READ_ONCE(net->ipv4.tcp_challenge_count);
+ if (count > 0) {
+ WRITE_ONCE(net->ipv4.tcp_challenge_count, count - 1);
+send_ack:
+ NET_INC_STATS(net, LINUX_MIB_TCPCHALLENGEACK);
+ tcp_send_ack_reflect_ect(sk, accecn_reflector);
+ }
+}
+
+static void tcp_store_ts_recent(struct tcp_sock *tp)
+{
+ tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval;
+ tp->rx_opt.ts_recent_stamp = ktime_get_seconds();
+}
+
+static int __tcp_replace_ts_recent(struct tcp_sock *tp, s32 tstamp_delta)
+{
+ tcp_store_ts_recent(tp);
+ return tstamp_delta > 0 ? FLAG_TS_PROGRESS : 0;
+}
+
+static int tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
+{
+ s32 delta;
- if (tp->frto_counter == 1) {
- /* First ACK after RTO advances the window: allow two new
- * segments out.
+ if (tp->rx_opt.saw_tstamp && !after(seq, tp->rcv_wup)) {
+ /* PAWS bug workaround wrt. ACK frames, the PAWS discard
+ * extra check below makes sure this can only happen
+ * for pure ACK frames. -DaveM
+ *
+ * Not only, also it occurs for expired timestamps.
*/
- tp->snd_cwnd = tcp_packets_in_flight(tp) + 2;
- } else {
- /* Also the second ACK after RTO advances the window.
- * The RTO was likely spurious. Reduce cwnd and continue
- * in congestion avoidance
+
+ if (tcp_paws_check(&tp->rx_opt, 0)) {
+ delta = tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent;
+ return __tcp_replace_ts_recent(tp, delta);
+ }
+ }
+
+ return 0;
+}
+
+/* This routine deals with acks during a TLP episode and ends an episode by
+ * resetting tlp_high_seq. Ref: TLP algorithm in RFC8985
+ */
+static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (before(ack, tp->tlp_high_seq))
+ return;
+
+ if (!tp->tlp_retrans) {
+ /* TLP of new data has been acknowledged */
+ tp->tlp_high_seq = 0;
+ } else if (flag & FLAG_DSACK_TLP) {
+ /* This DSACK means original and TLP probe arrived; no loss */
+ tp->tlp_high_seq = 0;
+ } else if (after(ack, tp->tlp_high_seq)) {
+ /* ACK advances: there was a loss, so reduce cwnd. Reset
+ * tlp_high_seq in tcp_init_cwnd_reduction()
*/
- tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
- tcp_moderate_cwnd(tp);
+ tcp_init_cwnd_reduction(sk);
+ tcp_set_ca_state(sk, TCP_CA_CWR);
+ tcp_end_cwnd_reduction(sk);
+ tcp_try_keep_open(sk);
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPLOSSPROBERECOVERY);
+ } else if (!(flag & (FLAG_SND_UNA_ADVANCED |
+ FLAG_NOT_DUP | FLAG_DATA_SACKED))) {
+ /* Pure dupack: original and TLP probe arrived; no loss */
+ tp->tlp_high_seq = 0;
}
+}
- /* F-RTO affects on two new ACKs following RTO.
- * At latest on third ACK the TCP behavior is back to normal.
- */
- tp->frto_counter = (tp->frto_counter + 1) % 3;
+static void tcp_in_ack_event(struct sock *sk, int flag)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+
+ if (icsk->icsk_ca_ops->in_ack_event) {
+ u32 ack_ev_flags = 0;
+
+ if (flag & FLAG_WIN_UPDATE)
+ ack_ev_flags |= CA_ACK_WIN_UPDATE;
+ if (flag & FLAG_SLOWPATH) {
+ ack_ev_flags |= CA_ACK_SLOWPATH;
+ if (flag & FLAG_ECE)
+ ack_ev_flags |= CA_ACK_ECE;
+ }
+
+ icsk->icsk_ca_ops->in_ack_event(sk, ack_ev_flags);
+ }
+}
+
+/* Congestion control has updated the cwnd already. So if we're in
+ * loss recovery then now we do any new sends (for FRTO) or
+ * retransmits (for CA_Loss or CA_recovery) that make sense.
+ */
+static void tcp_xmit_recovery(struct sock *sk, int rexmit)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (rexmit == REXMIT_NONE || sk->sk_state == TCP_SYN_SENT)
+ return;
+
+ if (unlikely(rexmit == REXMIT_NEW)) {
+ __tcp_push_pending_frames(sk, tcp_current_mss(sk),
+ TCP_NAGLE_OFF);
+ if (after(tp->snd_nxt, tp->high_seq))
+ return;
+ tp->frto = 0;
+ }
+ tcp_xmit_retransmit_queue(sk);
+}
+
+/* Returns the number of packets newly acked or sacked by the current ACK */
+static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered,
+ u32 ecn_count, int flag)
+{
+ const struct net *net = sock_net(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 delivered;
+
+ delivered = tp->delivered - prior_delivered;
+ NET_ADD_STATS(net, LINUX_MIB_TCPDELIVERED, delivered);
+
+ if (flag & FLAG_ECE) {
+ if (tcp_ecn_mode_rfc3168(tp))
+ ecn_count = delivered;
+ NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, ecn_count);
+ }
+
+ return delivered;
}
/* This routine deals with incoming acks, but not outgoing ones. */
-static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
+static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_sacktag_state sack_state;
+ struct rate_sample rs = { .prior_delivered = 0 };
u32 prior_snd_una = tp->snd_una;
+ bool is_sack_reneg = tp->is_sack_reneg;
u32 ack_seq = TCP_SKB_CB(skb)->seq;
u32 ack = TCP_SKB_CB(skb)->ack_seq;
- u32 prior_in_flight;
- s32 seq_rtt;
- int prior_packets;
-
- /* If the ack is newer than sent or older than previous acks
+ int num_dupack = 0;
+ int prior_packets = tp->packets_out;
+ u32 delivered = tp->delivered;
+ u32 lost = tp->lost;
+ int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */
+ u32 ecn_count = 0; /* Did we receive ECE/an AccECN ACE update? */
+ u32 prior_fack;
+
+ sack_state.first_sackt = 0;
+ sack_state.rate = &rs;
+ sack_state.sack_delivered = 0;
+ sack_state.delivered_bytes = 0;
+
+ /* We very likely will need to access rtx queue. */
+ prefetch(sk->tcp_rtx_queue.rb_node);
+
+ /* If the ack is older than previous acks
* then we can probably ignore it.
*/
+ if (before(ack, prior_snd_una)) {
+ u32 max_window;
+
+ /* do not accept ACK for bytes we never sent. */
+ max_window = min_t(u64, tp->max_window, tp->bytes_acked);
+ /* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */
+ if (before(ack, prior_snd_una - max_window)) {
+ if (!(flag & FLAG_NO_CHALLENGE_ACK))
+ tcp_send_challenge_ack(sk, false);
+ return -SKB_DROP_REASON_TCP_TOO_OLD_ACK;
+ }
+ goto old_ack;
+ }
+
+ /* If the ack includes data we haven't sent yet, discard
+ * this segment (RFC793 Section 3.9).
+ */
if (after(ack, tp->snd_nxt))
- goto uninteresting_ack;
+ return -SKB_DROP_REASON_TCP_ACK_UNSENT_DATA;
- if (before(ack, prior_snd_una))
- goto old_ack;
+ if (after(ack, prior_snd_una)) {
+ flag |= FLAG_SND_UNA_ADVANCED;
+ WRITE_ONCE(icsk->icsk_retransmits, 0);
- if (sysctl_tcp_abc) {
- if (icsk->icsk_ca_state < TCP_CA_CWR)
- tp->bytes_acked += ack - prior_snd_una;
- else if (icsk->icsk_ca_state == TCP_CA_Loss)
- /* we assume just one segment left network */
- tp->bytes_acked += min(ack - prior_snd_una, tp->mss_cache);
+#if IS_ENABLED(CONFIG_TLS_DEVICE)
+ if (static_branch_unlikely(&clean_acked_data_enabled.key))
+ if (tp->tcp_clean_acked)
+ tp->tcp_clean_acked(sk, ack);
+#endif
}
- if (!(flag&FLAG_SLOWPATH) && after(ack, prior_snd_una)) {
+ prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una;
+ rs.prior_in_flight = tcp_packets_in_flight(tp);
+
+ /* ts_recent update must be made after we are sure that the packet
+ * is in window.
+ */
+ if (flag & FLAG_UPDATE_TS_RECENT)
+ flag |= tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
+
+ if ((flag & (FLAG_SLOWPATH | FLAG_SND_UNA_ADVANCED)) ==
+ FLAG_SND_UNA_ADVANCED) {
/* Window is constant, pure forward advance.
* No more checks are required.
* Note, we use the fact that SND.UNA>=SND.WL2.
*/
- tcp_update_wl(tp, ack, ack_seq);
- tp->snd_una = ack;
+ tcp_update_wl(tp, ack_seq);
+ tcp_snd_una_update(tp, ack);
flag |= FLAG_WIN_UPDATE;
- tcp_ca_event(sk, CA_EVENT_FAST_ACK);
-
- NET_INC_STATS_BH(LINUX_MIB_TCPHPACKS);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPACKS);
} else {
if (ack_seq != TCP_SKB_CB(skb)->end_seq)
flag |= FLAG_DATA;
else
- NET_INC_STATS_BH(LINUX_MIB_TCPPUREACKS);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPUREACKS);
- flag |= tcp_ack_update_window(sk, tp, skb, ack, ack_seq);
+ flag |= tcp_ack_update_window(sk, skb, ack, ack_seq);
if (TCP_SKB_CB(skb)->sacked)
- flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una);
+ flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
+ &sack_state);
- if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th))
+ if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb)))
flag |= FLAG_ECE;
- tcp_ca_event(sk, CA_EVENT_SLOW_ACK);
+ if (sack_state.sack_delivered)
+ tcp_count_delivered(tp, sack_state.sack_delivered,
+ flag & FLAG_ECE);
}
+ /* This is a deviation from RFC3168 since it states that:
+ * "When the TCP data sender is ready to set the CWR bit after reducing
+ * the congestion window, it SHOULD set the CWR bit only on the first
+ * new data packet that it transmits."
+ * We accept CWR on pure ACKs to be more robust
+ * with widely-deployed TCP implementations that do this.
+ */
+ tcp_ecn_accept_cwr(sk, skb);
+
/* We passed data and got it acked, remove any soft error
* log. Something worked...
*/
- sk->sk_err_soft = 0;
- tp->rcv_tstamp = tcp_time_stamp;
- prior_packets = tp->packets_out;
+ if (READ_ONCE(sk->sk_err_soft))
+ WRITE_ONCE(sk->sk_err_soft, 0);
+ WRITE_ONCE(icsk->icsk_probes_out, 0);
+ tp->rcv_tstamp = tcp_jiffies32;
if (!prior_packets)
goto no_queue;
- prior_in_flight = tcp_packets_in_flight(tp);
-
/* See if we can take anything off of the retransmit queue. */
- flag |= tcp_clean_rtx_queue(sk, &seq_rtt);
+ flag |= tcp_clean_rtx_queue(sk, skb, prior_fack, prior_snd_una,
+ &sack_state, flag & FLAG_ECE);
+
+ tcp_rack_update_reo_wnd(sk, &rs);
- if (tp->frto_counter)
- tcp_process_frto(sk, prior_snd_una);
+ if (tcp_ecn_mode_accecn(tp))
+ ecn_count = tcp_accecn_process(sk, skb,
+ tp->delivered - delivered,
+ sack_state.delivered_bytes,
+ &flag);
+
+ tcp_in_ack_event(sk, flag);
+
+ if (tp->tlp_high_seq)
+ tcp_process_tlp_ack(sk, ack, flag);
if (tcp_ack_is_dubious(sk, flag)) {
- /* Advance CWND, if state allows this. */
- if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(sk, flag))
- tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 0);
- tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag);
- } else {
- if ((flag & FLAG_DATA_ACKED))
- tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 1);
+ if (!(flag & (FLAG_SND_UNA_ADVANCED |
+ FLAG_NOT_DUP | FLAG_DSACKING_ACK))) {
+ num_dupack = 1;
+ /* Consider if pure acks were aggregated in tcp_add_backlog() */
+ if (!(flag & FLAG_DATA))
+ num_dupack = max_t(u16, 1, skb_shinfo(skb)->gso_segs);
+ }
+ tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
+ &rexmit);
}
- if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP))
- dst_confirm(sk->sk_dst_cache);
+ /* If needed, reset TLP/RTO timer when RACK doesn't set. */
+ if (flag & FLAG_SET_XMIT_TIMER)
+ tcp_set_xmit_timer(sk);
+ if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP))
+ sk_dst_confirm(sk);
+
+ delivered = tcp_newly_delivered(sk, delivered, ecn_count, flag);
+
+ lost = tp->lost - lost; /* freshly marked lost */
+ rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED);
+ tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate);
+ tcp_cong_control(sk, ack, delivered, flag, sack_state.rate);
+ tcp_xmit_recovery(sk, rexmit);
return 1;
no_queue:
- icsk->icsk_probes_out = 0;
-
+ if (tcp_ecn_mode_accecn(tp))
+ ecn_count = tcp_accecn_process(sk, skb,
+ tp->delivered - delivered,
+ sack_state.delivered_bytes,
+ &flag);
+ tcp_in_ack_event(sk, flag);
+ /* If data was DSACKed, see if we can undo a cwnd reduction. */
+ if (flag & FLAG_DSACKING_ACK) {
+ tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
+ &rexmit);
+ tcp_newly_delivered(sk, delivered, ecn_count, flag);
+ }
/* If this ack opens up a zero window, clear backoff. It was
* being used to time the probes, and is probably far higher than
* it needs to be for normal retransmission.
*/
- if (sk->sk_send_head)
- tcp_ack_probe(sk);
+ tcp_ack_probe(sk);
+
+ if (tp->tlp_high_seq)
+ tcp_process_tlp_ack(sk, ack, flag);
return 1;
old_ack:
- if (TCP_SKB_CB(skb)->sacked)
- tcp_sacktag_write_queue(sk, skb, prior_snd_una);
+ /* If data was SACKed, tag it and see if we should send more data.
+ * If data was DSACKed, see if we can undo a cwnd reduction.
+ */
+ if (TCP_SKB_CB(skb)->sacked) {
+ flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
+ &sack_state);
+ tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
+ &rexmit);
+ tcp_newly_delivered(sk, delivered, ecn_count, flag);
+ tcp_xmit_recovery(sk, rexmit);
+ }
-uninteresting_ack:
- SOCK_DEBUG(sk, "Ack %u out of %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
return 0;
}
+static void tcp_parse_fastopen_option(int len, const unsigned char *cookie,
+ bool syn, struct tcp_fastopen_cookie *foc,
+ bool exp_opt)
+{
+ /* Valid only in SYN or SYN-ACK with an even length. */
+ if (!foc || !syn || len < 0 || (len & 1))
+ return;
+
+ if (len >= TCP_FASTOPEN_COOKIE_MIN &&
+ len <= TCP_FASTOPEN_COOKIE_MAX)
+ memcpy(foc->val, cookie, len);
+ else if (len != 0)
+ len = -1;
+ foc->len = len;
+ foc->exp = exp_opt;
+}
+
+static bool smc_parse_options(const struct tcphdr *th,
+ struct tcp_options_received *opt_rx,
+ const unsigned char *ptr,
+ int opsize)
+{
+#if IS_ENABLED(CONFIG_SMC)
+ if (static_branch_unlikely(&tcp_have_smc)) {
+ if (th->syn && !(opsize & 1) &&
+ opsize >= TCPOLEN_EXP_SMC_BASE &&
+ get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC) {
+ opt_rx->smc_ok = 1;
+ return true;
+ }
+ }
+#endif
+ return false;
+}
+
+/* Try to parse the MSS option from the TCP header. Return 0 on failure, clamped
+ * value on success.
+ */
+u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss)
+{
+ const unsigned char *ptr = (const unsigned char *)(th + 1);
+ int length = (th->doff * 4) - sizeof(struct tcphdr);
+ u16 mss = 0;
+
+ while (length > 0) {
+ int opcode = *ptr++;
+ int opsize;
+
+ switch (opcode) {
+ case TCPOPT_EOL:
+ return mss;
+ case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
+ length--;
+ continue;
+ default:
+ if (length < 2)
+ return mss;
+ opsize = *ptr++;
+ if (opsize < 2) /* "silly options" */
+ return mss;
+ if (opsize > length)
+ return mss; /* fail on partial options */
+ if (opcode == TCPOPT_MSS && opsize == TCPOLEN_MSS) {
+ u16 in_mss = get_unaligned_be16(ptr);
+
+ if (in_mss) {
+ if (user_mss && user_mss < in_mss)
+ in_mss = user_mss;
+ mss = in_mss;
+ }
+ }
+ ptr += opsize - 2;
+ length -= opsize;
+ }
+ }
+ return mss;
+}
/* Look for tcp options. Normally only called on SYN and SYNACK packets.
* But, this can also be called on packets in the established flow when
* the fast version below fails.
*/
-void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, int estab)
+void tcp_parse_options(const struct net *net,
+ const struct sk_buff *skb,
+ struct tcp_options_received *opt_rx, int estab,
+ struct tcp_fastopen_cookie *foc)
{
- unsigned char *ptr;
- struct tcphdr *th = skb->h.th;
- int length=(th->doff*4)-sizeof(struct tcphdr);
+ const unsigned char *ptr;
+ const struct tcphdr *th = tcp_hdr(skb);
+ int length = (th->doff * 4) - sizeof(struct tcphdr);
- ptr = (unsigned char *)(th + 1);
+ ptr = (const unsigned char *)(th + 1);
opt_rx->saw_tstamp = 0;
+ opt_rx->accecn = 0;
+ opt_rx->saw_unknown = 0;
- while(length>0) {
- int opcode=*ptr++;
+ while (length > 0) {
+ int opcode = *ptr++;
int opsize;
switch (opcode) {
- case TCPOPT_EOL:
+ case TCPOPT_EOL:
+ return;
+ case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
+ length--;
+ continue;
+ default:
+ if (length < 2)
return;
- case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
- length--;
- continue;
- default:
- opsize=*ptr++;
- if (opsize < 2) /* "silly options" */
- return;
- if (opsize > length)
- return; /* don't parse partial options */
- switch(opcode) {
- case TCPOPT_MSS:
- if(opsize==TCPOLEN_MSS && th->syn && !estab) {
- u16 in_mss = ntohs(get_unaligned((__be16 *)ptr));
- if (in_mss) {
- if (opt_rx->user_mss && opt_rx->user_mss < in_mss)
- in_mss = opt_rx->user_mss;
- opt_rx->mss_clamp = in_mss;
- }
- }
- break;
- case TCPOPT_WINDOW:
- if(opsize==TCPOLEN_WINDOW && th->syn && !estab)
- if (sysctl_tcp_window_scaling) {
- __u8 snd_wscale = *(__u8 *) ptr;
- opt_rx->wscale_ok = 1;
- if (snd_wscale > 14) {
- if(net_ratelimit())
- printk(KERN_INFO "tcp_parse_options: Illegal window "
- "scaling value %d >14 received.\n",
- snd_wscale);
- snd_wscale = 14;
- }
- opt_rx->snd_wscale = snd_wscale;
- }
- break;
- case TCPOPT_TIMESTAMP:
- if(opsize==TCPOLEN_TIMESTAMP) {
- if ((estab && opt_rx->tstamp_ok) ||
- (!estab && sysctl_tcp_timestamps)) {
- opt_rx->saw_tstamp = 1;
- opt_rx->rcv_tsval = ntohl(get_unaligned((__be32 *)ptr));
- opt_rx->rcv_tsecr = ntohl(get_unaligned((__be32 *)(ptr+4)));
- }
+ opsize = *ptr++;
+ if (opsize < 2) /* "silly options" */
+ return;
+ if (opsize > length)
+ return; /* don't parse partial options */
+ switch (opcode) {
+ case TCPOPT_MSS:
+ if (opsize == TCPOLEN_MSS && th->syn && !estab) {
+ u16 in_mss = get_unaligned_be16(ptr);
+ if (in_mss) {
+ if (opt_rx->user_mss &&
+ opt_rx->user_mss < in_mss)
+ in_mss = opt_rx->user_mss;
+ opt_rx->mss_clamp = in_mss;
}
- break;
- case TCPOPT_SACK_PERM:
- if(opsize==TCPOLEN_SACK_PERM && th->syn && !estab) {
- if (sysctl_tcp_sack) {
- opt_rx->sack_ok = 1;
- tcp_sack_reset(opt_rx);
- }
+ }
+ break;
+ case TCPOPT_WINDOW:
+ if (opsize == TCPOLEN_WINDOW && th->syn &&
+ !estab && READ_ONCE(net->ipv4.sysctl_tcp_window_scaling)) {
+ __u8 snd_wscale = *(__u8 *)ptr;
+ opt_rx->wscale_ok = 1;
+ if (snd_wscale > TCP_MAX_WSCALE) {
+ net_info_ratelimited("%s: Illegal window scaling value %d > %u received\n",
+ __func__,
+ snd_wscale,
+ TCP_MAX_WSCALE);
+ snd_wscale = TCP_MAX_WSCALE;
}
- break;
+ opt_rx->snd_wscale = snd_wscale;
+ }
+ break;
+ case TCPOPT_TIMESTAMP:
+ if ((opsize == TCPOLEN_TIMESTAMP) &&
+ ((estab && opt_rx->tstamp_ok) ||
+ (!estab && READ_ONCE(net->ipv4.sysctl_tcp_timestamps)))) {
+ opt_rx->saw_tstamp = 1;
+ opt_rx->rcv_tsval = get_unaligned_be32(ptr);
+ opt_rx->rcv_tsecr = get_unaligned_be32(ptr + 4);
+ }
+ break;
+ case TCPOPT_SACK_PERM:
+ if (opsize == TCPOLEN_SACK_PERM && th->syn &&
+ !estab && READ_ONCE(net->ipv4.sysctl_tcp_sack)) {
+ opt_rx->sack_ok = TCP_SACK_SEEN;
+ tcp_sack_reset(opt_rx);
+ }
+ break;
- case TCPOPT_SACK:
- if((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) &&
- !((opsize - TCPOLEN_SACK_BASE) % TCPOLEN_SACK_PERBLOCK) &&
- opt_rx->sack_ok) {
- TCP_SKB_CB(skb)->sacked = (ptr - 2) - (unsigned char *)th;
- }
+ case TCPOPT_SACK:
+ if ((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) &&
+ !((opsize - TCPOLEN_SACK_BASE) % TCPOLEN_SACK_PERBLOCK) &&
+ opt_rx->sack_ok) {
+ TCP_SKB_CB(skb)->sacked = (ptr - 2) - (unsigned char *)th;
+ }
+ break;
#ifdef CONFIG_TCP_MD5SIG
- case TCPOPT_MD5SIG:
- /*
- * The MD5 Hash has already been
- * checked (see tcp_v{4,6}_do_rcv()).
- */
- break;
+ case TCPOPT_MD5SIG:
+ /* The MD5 Hash has already been
+ * checked (see tcp_v{4,6}_rcv()).
+ */
+ break;
#endif
- };
- ptr+=opsize-2;
- length-=opsize;
- };
+#ifdef CONFIG_TCP_AO
+ case TCPOPT_AO:
+ /* TCP AO has already been checked
+ * (see tcp_inbound_ao_hash()).
+ */
+ break;
+#endif
+ case TCPOPT_FASTOPEN:
+ tcp_parse_fastopen_option(
+ opsize - TCPOLEN_FASTOPEN_BASE,
+ ptr, th->syn, foc, false);
+ break;
+
+ case TCPOPT_ACCECN0:
+ case TCPOPT_ACCECN1:
+ /* Save offset of AccECN option in TCP header */
+ opt_rx->accecn = (ptr - 2) - (__u8 *)th;
+ break;
+
+ case TCPOPT_EXP:
+ /* Fast Open option shares code 254 using a
+ * 16 bits magic number.
+ */
+ if (opsize >= TCPOLEN_EXP_FASTOPEN_BASE &&
+ get_unaligned_be16(ptr) ==
+ TCPOPT_FASTOPEN_MAGIC) {
+ tcp_parse_fastopen_option(opsize -
+ TCPOLEN_EXP_FASTOPEN_BASE,
+ ptr + 2, th->syn, foc, true);
+ break;
+ }
+
+ if (smc_parse_options(th, opt_rx, ptr, opsize))
+ break;
+
+ opt_rx->saw_unknown = 1;
+ break;
+
+ default:
+ opt_rx->saw_unknown = 1;
+ }
+ ptr += opsize-2;
+ length -= opsize;
+ }
+ }
+}
+EXPORT_SYMBOL(tcp_parse_options);
+
+static bool tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr *th)
+{
+ const __be32 *ptr = (const __be32 *)(th + 1);
+
+ if (*ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
+ | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) {
+ tp->rx_opt.saw_tstamp = 1;
+ ++ptr;
+ tp->rx_opt.rcv_tsval = ntohl(*ptr);
+ ++ptr;
+ if (*ptr)
+ tp->rx_opt.rcv_tsecr = ntohl(*ptr) - tp->tsoffset;
+ else
+ tp->rx_opt.rcv_tsecr = 0;
+ return true;
}
+ return false;
}
/* Fast parse options. This hopes to only see timestamps.
* If it is wrong it falls back on tcp_parse_options().
*/
-static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th,
- struct tcp_sock *tp)
+static bool tcp_fast_parse_options(const struct net *net,
+ const struct sk_buff *skb,
+ const struct tcphdr *th, struct tcp_sock *tp)
{
- if (th->doff == sizeof(struct tcphdr)>>2) {
+ /* In the spirit of fast parsing, compare doff directly to constant
+ * values. Because equality is used, short doff can be ignored here.
+ */
+ if (th->doff == (sizeof(*th) / 4)) {
tp->rx_opt.saw_tstamp = 0;
- return 0;
+ tp->rx_opt.accecn = 0;
+ return false;
} else if (tp->rx_opt.tstamp_ok &&
- th->doff == (sizeof(struct tcphdr)>>2)+(TCPOLEN_TSTAMP_ALIGNED>>2)) {
- __be32 *ptr = (__be32 *)(th + 1);
- if (*ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
- | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) {
- tp->rx_opt.saw_tstamp = 1;
- ++ptr;
- tp->rx_opt.rcv_tsval = ntohl(*ptr);
- ++ptr;
- tp->rx_opt.rcv_tsecr = ntohl(*ptr);
- return 1;
+ th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) {
+ if (tcp_parse_aligned_timestamp(tp, th)) {
+ tp->rx_opt.accecn = 0;
+ return true;
}
}
- tcp_parse_options(skb, &tp->rx_opt, 1);
- return 1;
-}
-static inline void tcp_store_ts_recent(struct tcp_sock *tp)
-{
- tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval;
- tp->rx_opt.ts_recent_stamp = xtime.tv_sec;
+ tcp_parse_options(net, skb, &tp->rx_opt, 1, NULL);
+ if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
+ tp->rx_opt.rcv_tsecr -= tp->tsoffset;
+
+ return true;
}
-static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
+#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
+/*
+ * Parse Signature options
+ */
+int tcp_do_parse_auth_options(const struct tcphdr *th,
+ const u8 **md5_hash, const u8 **ao_hash)
{
- if (tp->rx_opt.saw_tstamp && !after(seq, tp->rcv_wup)) {
- /* PAWS bug workaround wrt. ACK frames, the PAWS discard
- * extra check below makes sure this can only happen
- * for pure ACK frames. -DaveM
- *
- * Not only, also it occurs for expired timestamps.
- */
+ int length = (th->doff << 2) - sizeof(*th);
+ const u8 *ptr = (const u8 *)(th + 1);
+ unsigned int minlen = TCPOLEN_MD5SIG;
- if((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) >= 0 ||
- xtime.tv_sec >= tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS)
- tcp_store_ts_recent(tp);
+ if (IS_ENABLED(CONFIG_TCP_AO))
+ minlen = sizeof(struct tcp_ao_hdr) + 1;
+
+ *md5_hash = NULL;
+ *ao_hash = NULL;
+
+ /* If not enough data remaining, we can short cut */
+ while (length >= minlen) {
+ int opcode = *ptr++;
+ int opsize;
+
+ switch (opcode) {
+ case TCPOPT_EOL:
+ return 0;
+ case TCPOPT_NOP:
+ length--;
+ continue;
+ default:
+ opsize = *ptr++;
+ if (opsize < 2 || opsize > length)
+ return -EINVAL;
+ if (opcode == TCPOPT_MD5SIG) {
+ if (opsize != TCPOLEN_MD5SIG)
+ return -EINVAL;
+ if (unlikely(*md5_hash || *ao_hash))
+ return -EEXIST;
+ *md5_hash = ptr;
+ } else if (opcode == TCPOPT_AO) {
+ if (opsize <= sizeof(struct tcp_ao_hdr))
+ return -EINVAL;
+ if (unlikely(*md5_hash || *ao_hash))
+ return -EEXIST;
+ *ao_hash = ptr;
+ }
+ }
+ ptr += opsize - 2;
+ length -= opsize;
}
+ return 0;
}
+EXPORT_SYMBOL(tcp_do_parse_auth_options);
+#endif
/* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM
*
@@ -2763,32 +4544,57 @@ static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
* up to bandwidth of 18Gigabit/sec. 8) ]
*/
-static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb)
+/* Estimates max number of increments of remote peer TSval in
+ * a replay window (based on our current RTO estimation).
+ */
+static u32 tcp_tsval_replay(const struct sock *sk)
{
- struct tcp_sock *tp = tcp_sk(sk);
- struct tcphdr *th = skb->h.th;
- u32 seq = TCP_SKB_CB(skb)->seq;
+ /* If we use usec TS resolution,
+ * then expect the remote peer to use the same resolution.
+ */
+ if (tcp_sk(sk)->tcp_usec_ts)
+ return inet_csk(sk)->icsk_rto * (USEC_PER_SEC / HZ);
+
+ /* RFC 7323 recommends a TSval clock between 1ms and 1sec.
+ * We know that some OS (including old linux) can use 1200 Hz.
+ */
+ return inet_csk(sk)->icsk_rto * 1200 / HZ;
+}
+
+static enum skb_drop_reason tcp_disordered_ack_check(const struct sock *sk,
+ const struct sk_buff *skb)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ const struct tcphdr *th = tcp_hdr(skb);
+ SKB_DR_INIT(reason, TCP_RFC7323_PAWS);
u32 ack = TCP_SKB_CB(skb)->ack_seq;
+ u32 seq = TCP_SKB_CB(skb)->seq;
- return (/* 1. Pure ACK with correct sequence number. */
- (th->ack && seq == TCP_SKB_CB(skb)->end_seq && seq == tp->rcv_nxt) &&
+ /* 1. Is this not a pure ACK ? */
+ if (!th->ack || seq != TCP_SKB_CB(skb)->end_seq)
+ return reason;
- /* 2. ... and duplicate ACK. */
- ack == tp->snd_una &&
+ /* 2. Is its sequence not the expected one ? */
+ if (seq != tp->rcv_nxt)
+ return before(seq, tp->rcv_nxt) ?
+ SKB_DROP_REASON_TCP_RFC7323_PAWS_ACK :
+ reason;
- /* 3. ... and does not update window. */
- !tcp_may_update_window(tp, ack, seq, ntohs(th->window) << tp->rx_opt.snd_wscale) &&
+ /* 3. Is this not a duplicate ACK ? */
+ if (ack != tp->snd_una)
+ return reason;
- /* 4. ... and sits in replay window. */
- (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ);
-}
+ /* 4. Is this updating the window ? */
+ if (tcp_may_update_window(tp, ack, seq, ntohs(th->window) <<
+ tp->rx_opt.snd_wscale))
+ return reason;
-static inline int tcp_paws_discard(const struct sock *sk, const struct sk_buff *skb)
-{
- const struct tcp_sock *tp = tcp_sk(sk);
- return ((s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) > TCP_PAWS_WINDOW &&
- xtime.tv_sec < tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS &&
- !tcp_disordered_ack(sk, skb));
+ /* 5. Is this not in the replay window ? */
+ if ((s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) >
+ tcp_tsval_replay(sk))
+ return reason;
+
+ return 0;
}
/* Check segment sequence number for validity.
@@ -2804,33 +4610,68 @@ static inline int tcp_paws_discard(const struct sock *sk, const struct sk_buff *
* (borrowed from freebsd)
*/
-static inline int tcp_sequence(struct tcp_sock *tp, u32 seq, u32 end_seq)
+static enum skb_drop_reason tcp_sequence(const struct sock *sk,
+ u32 seq, u32 end_seq)
{
- return !before(end_seq, tp->rcv_wup) &&
- !after(seq, tp->rcv_nxt + tcp_receive_window(tp));
+ const struct tcp_sock *tp = tcp_sk(sk);
+
+ if (before(end_seq, tp->rcv_wup))
+ return SKB_DROP_REASON_TCP_OLD_SEQUENCE;
+
+ if (after(end_seq, tp->rcv_nxt + tcp_receive_window(tp))) {
+ if (after(seq, tp->rcv_nxt + tcp_receive_window(tp)))
+ return SKB_DROP_REASON_TCP_INVALID_SEQUENCE;
+
+ /* Only accept this packet if receive queue is empty. */
+ if (skb_queue_len(&sk->sk_receive_queue))
+ return SKB_DROP_REASON_TCP_INVALID_END_SEQUENCE;
+ }
+
+ return SKB_NOT_DROPPED_YET;
+}
+
+
+void tcp_done_with_error(struct sock *sk, int err)
+{
+ /* This barrier is coupled with smp_rmb() in tcp_poll() */
+ WRITE_ONCE(sk->sk_err, err);
+ smp_wmb();
+
+ tcp_write_queue_purge(sk);
+ tcp_done(sk);
+
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk_error_report(sk);
}
+EXPORT_IPV6_MOD(tcp_done_with_error);
/* When we get a reset we do this. */
-static void tcp_reset(struct sock *sk)
+void tcp_reset(struct sock *sk, struct sk_buff *skb)
{
+ int err;
+
+ trace_tcp_receive_reset(sk);
+
+ /* mptcp can't tell us to ignore reset pkts,
+ * so just ignore the return value of mptcp_incoming_options().
+ */
+ if (sk_is_mptcp(sk))
+ mptcp_incoming_options(sk, skb);
+
/* We want the right error as BSD sees it (and indeed as we do). */
switch (sk->sk_state) {
- case TCP_SYN_SENT:
- sk->sk_err = ECONNREFUSED;
- break;
- case TCP_CLOSE_WAIT:
- sk->sk_err = EPIPE;
- break;
- case TCP_CLOSE:
- return;
- default:
- sk->sk_err = ECONNRESET;
+ case TCP_SYN_SENT:
+ err = ECONNREFUSED;
+ break;
+ case TCP_CLOSE_WAIT:
+ err = EPIPE;
+ break;
+ case TCP_CLOSE:
+ return;
+ default:
+ err = ECONNRESET;
}
-
- if (!sock_flag(sk, SOCK_DEAD))
- sk->sk_error_report(sk);
-
- tcp_done(sk);
+ tcp_done_with_error(sk, err);
}
/*
@@ -2847,62 +4688,61 @@ static void tcp_reset(struct sock *sk)
*
* If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
*/
-static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
+void tcp_fin(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
inet_csk_schedule_ack(sk);
- sk->sk_shutdown |= RCV_SHUTDOWN;
+ WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | RCV_SHUTDOWN);
sock_set_flag(sk, SOCK_DONE);
switch (sk->sk_state) {
- case TCP_SYN_RECV:
- case TCP_ESTABLISHED:
- /* Move to CLOSE_WAIT */
- tcp_set_state(sk, TCP_CLOSE_WAIT);
- inet_csk(sk)->icsk_ack.pingpong = 1;
- break;
+ case TCP_SYN_RECV:
+ case TCP_ESTABLISHED:
+ /* Move to CLOSE_WAIT */
+ tcp_set_state(sk, TCP_CLOSE_WAIT);
+ inet_csk_enter_pingpong_mode(sk);
+ break;
- case TCP_CLOSE_WAIT:
- case TCP_CLOSING:
- /* Received a retransmission of the FIN, do
- * nothing.
- */
- break;
- case TCP_LAST_ACK:
- /* RFC793: Remain in the LAST-ACK state. */
- break;
+ case TCP_CLOSE_WAIT:
+ case TCP_CLOSING:
+ /* Received a retransmission of the FIN, do
+ * nothing.
+ */
+ break;
+ case TCP_LAST_ACK:
+ /* RFC793: Remain in the LAST-ACK state. */
+ break;
- case TCP_FIN_WAIT1:
- /* This case occurs when a simultaneous close
- * happens, we must ack the received FIN and
- * enter the CLOSING state.
- */
- tcp_send_ack(sk);
- tcp_set_state(sk, TCP_CLOSING);
- break;
- case TCP_FIN_WAIT2:
- /* Received a FIN -- send ACK and enter TIME_WAIT. */
- tcp_send_ack(sk);
- tcp_time_wait(sk, TCP_TIME_WAIT, 0);
- break;
- default:
- /* Only TCP_LISTEN and TCP_CLOSE are left, in these
- * cases we should never reach this piece of code.
- */
- printk(KERN_ERR "%s: Impossible, sk->sk_state=%d\n",
- __FUNCTION__, sk->sk_state);
- break;
- };
+ case TCP_FIN_WAIT1:
+ /* This case occurs when a simultaneous close
+ * happens, we must ack the received FIN and
+ * enter the CLOSING state.
+ */
+ tcp_send_ack(sk);
+ tcp_set_state(sk, TCP_CLOSING);
+ break;
+ case TCP_FIN_WAIT2:
+ /* Received a FIN -- send ACK and enter TIME_WAIT. */
+ tcp_send_ack(sk);
+ tcp_time_wait(sk, TCP_TIME_WAIT, 0);
+ break;
+ default:
+ /* Only TCP_LISTEN and TCP_CLOSE are left, in these
+ * cases we should never reach this piece of code.
+ */
+ pr_err("%s: Impossible, sk->sk_state=%d\n",
+ __func__, sk->sk_state);
+ break;
+ }
/* It _is_ possible, that we have something out-of-order _after_ FIN.
* Probably, we should reset in this case. For now drop them.
*/
- __skb_queue_purge(&tp->out_of_order_queue);
- if (tp->rx_opt.sack_ok)
+ skb_rbtree_purge(&tp->out_of_order_queue);
+ if (tcp_is_sack(tp))
tcp_sack_reset(&tp->rx_opt);
- sk_stream_mem_reclaim(sk);
if (!sock_flag(sk, SOCK_DEAD)) {
sk->sk_state_change(sk);
@@ -2910,62 +4750,94 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
/* Do not send POLL_HUP for half duplex close. */
if (sk->sk_shutdown == SHUTDOWN_MASK ||
sk->sk_state == TCP_CLOSE)
- sk_wake_async(sk, 1, POLL_HUP);
+ sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
else
- sk_wake_async(sk, 1, POLL_IN);
+ sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
}
}
-static inline int tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, u32 end_seq)
+static inline bool tcp_sack_extend(struct tcp_sack_block *sp, u32 seq,
+ u32 end_seq)
{
if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) {
if (before(seq, sp->start_seq))
sp->start_seq = seq;
if (after(end_seq, sp->end_seq))
sp->end_seq = end_seq;
- return 1;
+ return true;
}
- return 0;
+ return false;
}
-static void tcp_dsack_set(struct tcp_sock *tp, u32 seq, u32 end_seq)
+static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq)
{
- if (tp->rx_opt.sack_ok && sysctl_tcp_dsack) {
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (tcp_is_sack(tp) && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_dsack)) {
+ int mib_idx;
+
if (before(seq, tp->rcv_nxt))
- NET_INC_STATS_BH(LINUX_MIB_TCPDSACKOLDSENT);
+ mib_idx = LINUX_MIB_TCPDSACKOLDSENT;
else
- NET_INC_STATS_BH(LINUX_MIB_TCPDSACKOFOSENT);
+ mib_idx = LINUX_MIB_TCPDSACKOFOSENT;
+
+ NET_INC_STATS(sock_net(sk), mib_idx);
tp->rx_opt.dsack = 1;
tp->duplicate_sack[0].start_seq = seq;
tp->duplicate_sack[0].end_seq = end_seq;
- tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + 1, 4 - tp->rx_opt.tstamp_ok);
}
}
-static void tcp_dsack_extend(struct tcp_sock *tp, u32 seq, u32 end_seq)
+static void tcp_dsack_extend(struct sock *sk, u32 seq, u32 end_seq)
{
+ struct tcp_sock *tp = tcp_sk(sk);
+
if (!tp->rx_opt.dsack)
- tcp_dsack_set(tp, seq, end_seq);
+ tcp_dsack_set(sk, seq, end_seq);
else
tcp_sack_extend(tp->duplicate_sack, seq, end_seq);
}
-static void tcp_send_dupack(struct sock *sk, struct sk_buff *skb)
+static void tcp_rcv_spurious_retrans(struct sock *sk, const struct sk_buff *skb)
+{
+ /* When the ACK path fails or drops most ACKs, the sender would
+ * timeout and spuriously retransmit the same segment repeatedly.
+ * If it seems our ACKs are not reaching the other side,
+ * based on receiving a duplicate data segment with new flowlabel
+ * (suggesting the sender suffered an RTO), and we are not already
+ * repathing due to our own RTO, then rehash the socket to repath our
+ * packets.
+ */
+#if IS_ENABLED(CONFIG_IPV6)
+ if (inet_csk(sk)->icsk_ca_state != TCP_CA_Loss &&
+ skb->protocol == htons(ETH_P_IPV6) &&
+ (tcp_sk(sk)->inet_conn.icsk_ack.lrcv_flowlabel !=
+ ntohl(ip6_flowlabel(ipv6_hdr(skb)))) &&
+ sk_rethink_txhash(sk))
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDUPLICATEDATAREHASH);
+
+ /* Save last flowlabel after a spurious retrans. */
+ tcp_save_lrcv_flowlabel(sk, skb);
+#endif
+}
+
+static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
- NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOST);
- tcp_enter_quickack_mode(sk);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
+ tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
- if (tp->rx_opt.sack_ok && sysctl_tcp_dsack) {
+ if (tcp_is_sack(tp) && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_dsack)) {
u32 end_seq = TCP_SKB_CB(skb)->end_seq;
+ tcp_rcv_spurious_retrans(sk, skb);
if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))
end_seq = tp->rcv_nxt;
- tcp_dsack_set(tp, TCP_SKB_CB(skb)->seq, end_seq);
+ tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, end_seq);
}
}
@@ -2979,12 +4851,12 @@ static void tcp_sack_maybe_coalesce(struct tcp_sock *tp)
{
int this_sack;
struct tcp_sack_block *sp = &tp->selective_acks[0];
- struct tcp_sack_block *swalk = sp+1;
+ struct tcp_sack_block *swalk = sp + 1;
/* See if the recent change to the first SACK eats into
* or hits the sequence space of other SACK blocks, if so coalesce.
*/
- for (this_sack = 1; this_sack < tp->rx_opt.num_sacks; ) {
+ for (this_sack = 1; this_sack < tp->rx_opt.num_sacks;) {
if (tcp_sack_extend(sp, swalk->start_seq, swalk->end_seq)) {
int i;
@@ -2992,28 +4864,42 @@ static void tcp_sack_maybe_coalesce(struct tcp_sock *tp)
* Decrease num_sacks.
*/
tp->rx_opt.num_sacks--;
- tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + tp->rx_opt.dsack, 4 - tp->rx_opt.tstamp_ok);
- for(i=this_sack; i < tp->rx_opt.num_sacks; i++)
- sp[i] = sp[i+1];
+ for (i = this_sack; i < tp->rx_opt.num_sacks; i++)
+ sp[i] = sp[i + 1];
continue;
}
- this_sack++, swalk++;
+ this_sack++;
+ swalk++;
}
}
-static inline void tcp_sack_swap(struct tcp_sack_block *sack1, struct tcp_sack_block *sack2)
+void tcp_sack_compress_send_ack(struct sock *sk)
{
- __u32 tmp;
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (!tp->compressed_ack)
+ return;
+
+ if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1)
+ __sock_put(sk);
- tmp = sack1->start_seq;
- sack1->start_seq = sack2->start_seq;
- sack2->start_seq = tmp;
+ /* Since we have to send one ack finally,
+ * substract one from tp->compressed_ack to keep
+ * LINUX_MIB_TCPACKCOMPRESSED accurate.
+ */
+ NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED,
+ tp->compressed_ack - 1);
- tmp = sack1->end_seq;
- sack1->end_seq = sack2->end_seq;
- sack2->end_seq = tmp;
+ tp->compressed_ack = 0;
+ tcp_send_ack(sk);
}
+/* Reasonable amount of sack blocks included in TCP SACK option
+ * The max is 4, but this becomes 3 if TCP timestamps are there.
+ * Given that SACK packets might be lost, be conservative and use 2.
+ */
+#define TCP_SACK_BLOCKS_EXPECTED 2
+
static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -3024,37 +4910,41 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
if (!cur_sacks)
goto new_sack;
- for (this_sack=0; this_sack<cur_sacks; this_sack++, sp++) {
+ for (this_sack = 0; this_sack < cur_sacks; this_sack++, sp++) {
if (tcp_sack_extend(sp, seq, end_seq)) {
+ if (this_sack >= TCP_SACK_BLOCKS_EXPECTED)
+ tcp_sack_compress_send_ack(sk);
/* Rotate this_sack to the first one. */
- for (; this_sack>0; this_sack--, sp--)
- tcp_sack_swap(sp, sp-1);
+ for (; this_sack > 0; this_sack--, sp--)
+ swap(*sp, *(sp - 1));
if (cur_sacks > 1)
tcp_sack_maybe_coalesce(tp);
return;
}
}
+ if (this_sack >= TCP_SACK_BLOCKS_EXPECTED)
+ tcp_sack_compress_send_ack(sk);
+
/* Could not find an adjacent existing SACK, build a new one,
* put it at the front, and shift everyone else down. We
* always know there is at least one SACK present already here.
*
* If the sack array is full, forget about the last one.
*/
- if (this_sack >= 4) {
+ if (this_sack >= TCP_NUM_SACKS) {
this_sack--;
tp->rx_opt.num_sacks--;
sp--;
}
- for(; this_sack > 0; this_sack--, sp--)
- *sp = *(sp-1);
+ for (; this_sack > 0; this_sack--, sp--)
+ *sp = *(sp - 1);
new_sack:
/* Build the new head SACK, and we're done. */
sp->start_seq = seq;
sp->end_seq = end_seq;
tp->rx_opt.num_sacks++;
- tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + tp->rx_opt.dsack, 4 - tp->rx_opt.tstamp_ok);
}
/* RCV.NXT advances, some SACKs should be eaten. */
@@ -3066,22 +4956,21 @@ static void tcp_sack_remove(struct tcp_sock *tp)
int this_sack;
/* Empty ofo queue, hence, all the SACKs are eaten. Clear. */
- if (skb_queue_empty(&tp->out_of_order_queue)) {
+ if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
tp->rx_opt.num_sacks = 0;
- tp->rx_opt.eff_sacks = tp->rx_opt.dsack;
return;
}
- for(this_sack = 0; this_sack < num_sacks; ) {
+ for (this_sack = 0; this_sack < num_sacks;) {
/* Check if the start of the sack is covered by RCV.NXT. */
if (!before(tp->rcv_nxt, sp->start_seq)) {
int i;
/* RCV.NXT must cover all the block! */
- BUG_TRAP(!before(tp->rcv_nxt, sp->end_seq));
+ WARN_ON(before(tp->rcv_nxt, sp->end_seq));
/* Zap this SACK, by moving forward any other SACKS. */
- for (i=this_sack+1; i < num_sacks; i++)
+ for (i = this_sack+1; i < num_sacks; i++)
tp->selective_acks[i-1] = tp->selective_acks[i];
num_sacks--;
continue;
@@ -3089,10 +4978,79 @@ static void tcp_sack_remove(struct tcp_sock *tp)
this_sack++;
sp++;
}
- if (num_sacks != tp->rx_opt.num_sacks) {
- tp->rx_opt.num_sacks = num_sacks;
- tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + tp->rx_opt.dsack, 4 - tp->rx_opt.tstamp_ok);
+ tp->rx_opt.num_sacks = num_sacks;
+}
+
+/**
+ * tcp_try_coalesce - try to merge skb to prior one
+ * @sk: socket
+ * @to: prior buffer
+ * @from: buffer to add in queue
+ * @fragstolen: pointer to boolean
+ *
+ * Before queueing skb @from after @to, try to merge them
+ * to reduce overall memory use and queue lengths, if cost is small.
+ * Packets in ofo or receive queues can stay a long time.
+ * Better try to coalesce them right now to avoid future collapses.
+ * Returns true if caller should free @from instead of queueing it
+ */
+static bool tcp_try_coalesce(struct sock *sk,
+ struct sk_buff *to,
+ struct sk_buff *from,
+ bool *fragstolen)
+{
+ int delta;
+
+ *fragstolen = false;
+
+ /* Its possible this segment overlaps with prior segment in queue */
+ if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq)
+ return false;
+
+ if (!tcp_skb_can_collapse_rx(to, from))
+ return false;
+
+ if (!skb_try_coalesce(to, from, fragstolen, &delta))
+ return false;
+
+ atomic_add(delta, &sk->sk_rmem_alloc);
+ sk_mem_charge(sk, delta);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE);
+ TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq;
+ TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq;
+ TCP_SKB_CB(to)->tcp_flags |= TCP_SKB_CB(from)->tcp_flags;
+
+ if (TCP_SKB_CB(from)->has_rxtstamp) {
+ TCP_SKB_CB(to)->has_rxtstamp = true;
+ to->tstamp = from->tstamp;
+ skb_hwtstamps(to)->hwtstamp = skb_hwtstamps(from)->hwtstamp;
+ }
+
+ return true;
+}
+
+static bool tcp_ooo_try_coalesce(struct sock *sk,
+ struct sk_buff *to,
+ struct sk_buff *from,
+ bool *fragstolen)
+{
+ bool res = tcp_try_coalesce(sk, to, from, fragstolen);
+
+ /* In case tcp_drop_reason() is called later, update to->gso_segs */
+ if (res) {
+ u32 gso_segs = max_t(u16, 1, skb_shinfo(to)->gso_segs) +
+ max_t(u16, 1, skb_shinfo(from)->gso_segs);
+
+ skb_shinfo(to)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
}
+ return res;
+}
+
+noinline_for_tracing static void
+tcp_drop_reason(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason reason)
+{
+ sk_drops_skbadd(sk, skb);
+ sk_skb_reason_drop(sk, skb, reason);
}
/* This one checks to see if we can put data from the
@@ -3102,316 +5060,573 @@ static void tcp_ofo_queue(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
__u32 dsack_high = tp->rcv_nxt;
- struct sk_buff *skb;
+ bool fin, fragstolen, eaten;
+ struct sk_buff *skb, *tail;
+ struct rb_node *p;
- while ((skb = skb_peek(&tp->out_of_order_queue)) != NULL) {
+ p = rb_first(&tp->out_of_order_queue);
+ while (p) {
+ skb = rb_to_skb(p);
if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
break;
if (before(TCP_SKB_CB(skb)->seq, dsack_high)) {
__u32 dsack = dsack_high;
+
if (before(TCP_SKB_CB(skb)->end_seq, dsack_high))
- dsack_high = TCP_SKB_CB(skb)->end_seq;
- tcp_dsack_extend(tp, TCP_SKB_CB(skb)->seq, dsack);
+ dsack = TCP_SKB_CB(skb)->end_seq;
+ tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack);
}
+ p = rb_next(p);
+ rb_erase(&skb->rbnode, &tp->out_of_order_queue);
- if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
- SOCK_DEBUG(sk, "ofo packet was already received \n");
- __skb_unlink(skb, &tp->out_of_order_queue);
- __kfree_skb(skb);
+ if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) {
+ tcp_drop_reason(sk, skb, SKB_DROP_REASON_TCP_OFO_DROP);
continue;
}
- SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n",
- tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
- TCP_SKB_CB(skb)->end_seq);
- __skb_unlink(skb, &tp->out_of_order_queue);
- __skb_queue_tail(&sk->sk_receive_queue, skb);
- tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
- if(skb->h.th->fin)
- tcp_fin(skb, sk, skb->h.th);
+ tail = skb_peek_tail(&sk->sk_receive_queue);
+ eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen);
+ tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
+ fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
+ if (!eaten)
+ tcp_add_receive_queue(sk, skb);
+ else
+ kfree_skb_partial(skb, fragstolen);
+
+ if (unlikely(fin)) {
+ tcp_fin(sk);
+ /* tcp_fin() purges tp->out_of_order_queue,
+ * so we must end this loop right now.
+ */
+ break;
+ }
}
}
-static int tcp_prune_queue(struct sock *sk);
+static bool tcp_prune_ofo_queue(struct sock *sk, const struct sk_buff *in_skb);
+static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb);
-static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
+/* Check if this incoming skb can be added to socket receive queues
+ * while satisfying sk->sk_rcvbuf limit.
+ *
+ * In theory we should use skb->truesize, but this can cause problems
+ * when applications use too small SO_RCVBUF values.
+ * When LRO / hw gro is used, the socket might have a high tp->scaling_ratio,
+ * allowing RWIN to be close to available space.
+ * Whenever the receive queue gets full, we can receive a small packet
+ * filling RWIN, but with a high skb->truesize, because most NIC use 4K page
+ * plus sk_buff metadata even when receiving less than 1500 bytes of payload.
+ *
+ * Note that we use skb->len to decide to accept or drop this packet,
+ * but sk->sk_rmem_alloc is the sum of all skb->truesize.
+ */
+static bool tcp_can_ingest(const struct sock *sk, const struct sk_buff *skb)
+{
+ unsigned int rmem = atomic_read(&sk->sk_rmem_alloc);
+
+ return rmem + skb->len <= sk->sk_rcvbuf;
+}
+
+static int tcp_try_rmem_schedule(struct sock *sk, const struct sk_buff *skb,
+ unsigned int size)
+{
+ if (!tcp_can_ingest(sk, skb) ||
+ !sk_rmem_schedule(sk, skb, size)) {
+
+ if (tcp_prune_queue(sk, skb) < 0)
+ return -1;
+
+ while (!sk_rmem_schedule(sk, skb, size)) {
+ if (!tcp_prune_ofo_queue(sk, skb))
+ return -1;
+ }
+ }
+ return 0;
+}
+
+static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
{
- struct tcphdr *th = skb->h.th;
struct tcp_sock *tp = tcp_sk(sk);
- int eaten = -1;
+ struct rb_node **p, *parent;
+ struct sk_buff *skb1;
+ u32 seq, end_seq;
+ bool fragstolen;
+
+ tcp_save_lrcv_flowlabel(sk, skb);
+ tcp_data_ecn_check(sk, skb);
+
+ if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFODROP);
+ sk->sk_data_ready(sk);
+ tcp_drop_reason(sk, skb, SKB_DROP_REASON_PROTO_MEM);
+ return;
+ }
- if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq)
- goto drop;
+ tcp_measure_rcv_mss(sk, skb);
+ /* Disable header prediction. */
+ tp->pred_flags = 0;
+ inet_csk_schedule_ack(sk);
+
+ tp->rcv_ooopack += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
+ seq = TCP_SKB_CB(skb)->seq;
+ end_seq = TCP_SKB_CB(skb)->end_seq;
+
+ p = &tp->out_of_order_queue.rb_node;
+ if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
+ /* Initial out of order segment, build 1 SACK. */
+ if (tcp_is_sack(tp)) {
+ tp->rx_opt.num_sacks = 1;
+ tp->selective_acks[0].start_seq = seq;
+ tp->selective_acks[0].end_seq = end_seq;
+ }
+ rb_link_node(&skb->rbnode, NULL, p);
+ rb_insert_color(&skb->rbnode, &tp->out_of_order_queue);
+ tp->ooo_last_skb = skb;
+ goto end;
+ }
+
+ /* In the typical case, we are adding an skb to the end of the list.
+ * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup.
+ */
+ if (tcp_ooo_try_coalesce(sk, tp->ooo_last_skb,
+ skb, &fragstolen)) {
+coalesce_done:
+ /* For non sack flows, do not grow window to force DUPACK
+ * and trigger fast retransmit.
+ */
+ if (tcp_is_sack(tp))
+ tcp_grow_window(sk, skb, true);
+ kfree_skb_partial(skb, fragstolen);
+ skb = NULL;
+ goto add_sack;
+ }
+ /* Can avoid an rbtree lookup if we are adding skb after ooo_last_skb */
+ if (!before(seq, TCP_SKB_CB(tp->ooo_last_skb)->end_seq)) {
+ parent = &tp->ooo_last_skb->rbnode;
+ p = &parent->rb_right;
+ goto insert;
+ }
+
+ /* Find place to insert this segment. Handle overlaps on the way. */
+ parent = NULL;
+ while (*p) {
+ parent = *p;
+ skb1 = rb_to_skb(parent);
+ if (before(seq, TCP_SKB_CB(skb1)->seq)) {
+ p = &parent->rb_left;
+ continue;
+ }
+ if (before(seq, TCP_SKB_CB(skb1)->end_seq)) {
+ if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
+ /* All the bits are present. Drop. */
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPOFOMERGE);
+ tcp_drop_reason(sk, skb,
+ SKB_DROP_REASON_TCP_OFOMERGE);
+ skb = NULL;
+ tcp_dsack_set(sk, seq, end_seq);
+ goto add_sack;
+ }
+ if (after(seq, TCP_SKB_CB(skb1)->seq)) {
+ /* Partial overlap. */
+ tcp_dsack_set(sk, seq, TCP_SKB_CB(skb1)->end_seq);
+ } else {
+ /* skb's seq == skb1's seq and skb covers skb1.
+ * Replace skb1 with skb.
+ */
+ rb_replace_node(&skb1->rbnode, &skb->rbnode,
+ &tp->out_of_order_queue);
+ tcp_dsack_extend(sk,
+ TCP_SKB_CB(skb1)->seq,
+ TCP_SKB_CB(skb1)->end_seq);
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPOFOMERGE);
+ tcp_drop_reason(sk, skb1,
+ SKB_DROP_REASON_TCP_OFOMERGE);
+ goto merge_right;
+ }
+ } else if (tcp_ooo_try_coalesce(sk, skb1,
+ skb, &fragstolen)) {
+ goto coalesce_done;
+ }
+ p = &parent->rb_right;
+ }
+insert:
+ /* Insert segment into RB tree. */
+ rb_link_node(&skb->rbnode, parent, p);
+ rb_insert_color(&skb->rbnode, &tp->out_of_order_queue);
+
+merge_right:
+ /* Remove other segments covered by skb. */
+ while ((skb1 = skb_rb_next(skb)) != NULL) {
+ if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
+ break;
+ if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
+ tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
+ end_seq);
+ break;
+ }
+ rb_erase(&skb1->rbnode, &tp->out_of_order_queue);
+ tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
+ TCP_SKB_CB(skb1)->end_seq);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
+ tcp_drop_reason(sk, skb1, SKB_DROP_REASON_TCP_OFOMERGE);
+ }
+ /* If there is no skb after us, we are the last_skb ! */
+ if (!skb1)
+ tp->ooo_last_skb = skb;
+
+add_sack:
+ if (tcp_is_sack(tp))
+ tcp_sack_new_ofo_skb(sk, seq, end_seq);
+end:
+ if (skb) {
+ /* For non sack flows, do not grow window to force DUPACK
+ * and trigger fast retransmit.
+ */
+ if (tcp_is_sack(tp))
+ tcp_grow_window(sk, skb, false);
+ skb_condense(skb);
+ skb_set_owner_r(skb, sk);
+ }
+ /* do not grow rcvbuf for not-yet-accepted or orphaned sockets. */
+ if (sk->sk_socket)
+ tcp_rcvbuf_grow(sk, tp->rcvq_space.space);
+}
+
+static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb,
+ bool *fragstolen)
+{
+ int eaten;
+ struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue);
+
+ eaten = (tail &&
+ tcp_try_coalesce(sk, tail,
+ skb, fragstolen)) ? 1 : 0;
+ tcp_rcv_nxt_update(tcp_sk(sk), TCP_SKB_CB(skb)->end_seq);
+ if (!eaten) {
+ tcp_add_receive_queue(sk, skb);
+ skb_set_owner_r(skb, sk);
+ }
+ return eaten;
+}
+
+int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
+{
+ struct sk_buff *skb;
+ int err = -ENOMEM;
+ int data_len = 0;
+ bool fragstolen;
+
+ if (size == 0)
+ return 0;
+
+ if (size > PAGE_SIZE) {
+ int npages = min_t(size_t, size >> PAGE_SHIFT, MAX_SKB_FRAGS);
+
+ data_len = npages << PAGE_SHIFT;
+ size = data_len + (size & ~PAGE_MASK);
+ }
+ skb = alloc_skb_with_frags(size - data_len, data_len,
+ PAGE_ALLOC_COSTLY_ORDER,
+ &err, sk->sk_allocation);
+ if (!skb)
+ goto err;
+
+ skb_put(skb, size - data_len);
+ skb->data_len = data_len;
+ skb->len = size;
+
+ if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP);
+ goto err_free;
+ }
+
+ err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
+ if (err)
+ goto err_free;
+
+ TCP_SKB_CB(skb)->seq = tcp_sk(sk)->rcv_nxt;
+ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size;
+ TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1;
+
+ if (tcp_queue_rcv(sk, skb, &fragstolen)) {
+ WARN_ON_ONCE(fragstolen); /* should not happen */
+ __kfree_skb(skb);
+ }
+ return size;
+
+err_free:
+ kfree_skb(skb);
+err:
+ return err;
+
+}
- __skb_pull(skb, th->doff*4);
+void tcp_data_ready(struct sock *sk)
+{
+ if (tcp_epollin_ready(sk, sk->sk_rcvlowat) || sock_flag(sk, SOCK_DONE))
+ sk->sk_data_ready(sk);
+}
+
+static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ enum skb_drop_reason reason;
+ bool fragstolen;
+ int eaten;
- TCP_ECN_accept_cwr(tp, skb);
+ /* If a subflow has been reset, the packet should not continue
+ * to be processed, drop the packet.
+ */
+ if (sk_is_mptcp(sk) && !mptcp_incoming_options(sk, skb)) {
+ __kfree_skb(skb);
+ return;
+ }
- if (tp->rx_opt.dsack) {
- tp->rx_opt.dsack = 0;
- tp->rx_opt.eff_sacks = min_t(unsigned int, tp->rx_opt.num_sacks,
- 4 - tp->rx_opt.tstamp_ok);
+ if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
+ __kfree_skb(skb);
+ return;
}
+ tcp_cleanup_skb(skb);
+ __skb_pull(skb, tcp_hdr(skb)->doff * 4);
+
+ reason = SKB_DROP_REASON_NOT_SPECIFIED;
+ tp->rx_opt.dsack = 0;
/* Queue data for delivery to the user.
* Packets in sequence go to the receive queue.
* Out of sequence packets to the out_of_order_queue.
*/
if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
- if (tcp_receive_window(tp) == 0)
+ if (tcp_receive_window(tp) == 0) {
+ /* Some stacks are known to send bare FIN packets
+ * in a loop even if we send RWIN 0 in our ACK.
+ * Accepting this FIN does not hurt memory pressure
+ * because the FIN flag will simply be merged to the
+ * receive queue tail skb in most cases.
+ */
+ if (!skb->len &&
+ (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
+ goto queue_and_out;
+
+ reason = SKB_DROP_REASON_TCP_ZEROWINDOW;
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP);
goto out_of_window;
+ }
/* Ok. In sequence. In window. */
- if (tp->ucopy.task == current &&
- tp->copied_seq == tp->rcv_nxt && tp->ucopy.len &&
- sock_owned_by_user(sk) && !tp->urg_data) {
- int chunk = min_t(unsigned int, skb->len,
- tp->ucopy.len);
-
- __set_current_state(TASK_RUNNING);
+queue_and_out:
+ if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) {
+ /* TODO: maybe ratelimit these WIN 0 ACK ? */
+ inet_csk(sk)->icsk_ack.pending |=
+ (ICSK_ACK_NOMEM | ICSK_ACK_NOW);
+ inet_csk_schedule_ack(sk);
+ sk->sk_data_ready(sk);
- local_bh_enable();
- if (!skb_copy_datagram_iovec(skb, 0, tp->ucopy.iov, chunk)) {
- tp->ucopy.len -= chunk;
- tp->copied_seq += chunk;
- eaten = (chunk == skb->len && !th->fin);
- tcp_rcv_space_adjust(sk);
+ if (skb_queue_len(&sk->sk_receive_queue) && skb->len) {
+ reason = SKB_DROP_REASON_PROTO_MEM;
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP);
+ goto drop;
}
- local_bh_disable();
+ sk_forced_mem_schedule(sk, skb->truesize);
}
- if (eaten <= 0) {
-queue_and_out:
- if (eaten < 0 &&
- (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
- !sk_stream_rmem_schedule(sk, skb))) {
- if (tcp_prune_queue(sk) < 0 ||
- !sk_stream_rmem_schedule(sk, skb))
- goto drop;
- }
- sk_stream_set_owner_r(skb, sk);
- __skb_queue_tail(&sk->sk_receive_queue, skb);
- }
- tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
- if(skb->len)
- tcp_event_data_recv(sk, tp, skb);
- if(th->fin)
- tcp_fin(skb, sk, th);
+ eaten = tcp_queue_rcv(sk, skb, &fragstolen);
+ if (skb->len)
+ tcp_event_data_recv(sk, skb);
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
+ tcp_fin(sk);
- if (!skb_queue_empty(&tp->out_of_order_queue)) {
+ if (!RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
tcp_ofo_queue(sk);
- /* RFC2581. 4.2. SHOULD send immediate ACK, when
+ /* RFC5681. 4.2. SHOULD send immediate ACK, when
* gap in queue is filled.
*/
- if (skb_queue_empty(&tp->out_of_order_queue))
- inet_csk(sk)->icsk_ack.pingpong = 0;
+ if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
+ inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
}
if (tp->rx_opt.num_sacks)
tcp_sack_remove(tp);
- tcp_fast_path_check(sk, tp);
+ tcp_fast_path_check(sk);
if (eaten > 0)
- __kfree_skb(skb);
- else if (!sock_flag(sk, SOCK_DEAD))
- sk->sk_data_ready(sk, 0);
+ kfree_skb_partial(skb, fragstolen);
+ if (!sock_flag(sk, SOCK_DEAD))
+ tcp_data_ready(sk);
return;
}
if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
+ tcp_rcv_spurious_retrans(sk, skb);
/* A retransmit, 2nd most common case. Force an immediate ack. */
- NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOST);
- tcp_dsack_set(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
+ reason = SKB_DROP_REASON_TCP_OLD_DATA;
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
+ tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
out_of_window:
- tcp_enter_quickack_mode(sk);
+ tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
inet_csk_schedule_ack(sk);
drop:
- __kfree_skb(skb);
+ tcp_drop_reason(sk, skb, reason);
return;
}
/* Out of window. F.e. zero window probe. */
- if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp)))
+ if (!before(TCP_SKB_CB(skb)->seq,
+ tp->rcv_nxt + tcp_receive_window(tp))) {
+ reason = SKB_DROP_REASON_TCP_OVERWINDOW;
goto out_of_window;
-
- tcp_enter_quickack_mode(sk);
+ }
if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
/* Partial packet, seq < rcv_next < end_seq */
- SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n",
- tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
- TCP_SKB_CB(skb)->end_seq);
+ tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, tp->rcv_nxt);
- tcp_dsack_set(tp, TCP_SKB_CB(skb)->seq, tp->rcv_nxt);
-
/* If window is closed, drop tail of packet. But after
* remembering D-SACK for its head made in previous line.
*/
- if (!tcp_receive_window(tp))
+ if (!tcp_receive_window(tp)) {
+ reason = SKB_DROP_REASON_TCP_ZEROWINDOW;
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP);
goto out_of_window;
+ }
goto queue_and_out;
}
- TCP_ECN_check_ce(tp, skb);
-
- if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
- !sk_stream_rmem_schedule(sk, skb)) {
- if (tcp_prune_queue(sk) < 0 ||
- !sk_stream_rmem_schedule(sk, skb))
- goto drop;
- }
-
- /* Disable header prediction. */
- tp->pred_flags = 0;
- inet_csk_schedule_ack(sk);
-
- SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
- tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
-
- sk_stream_set_owner_r(skb, sk);
+ tcp_data_queue_ofo(sk, skb);
+}
- if (!skb_peek(&tp->out_of_order_queue)) {
- /* Initial out of order segment, build 1 SACK. */
- if (tp->rx_opt.sack_ok) {
- tp->rx_opt.num_sacks = 1;
- tp->rx_opt.dsack = 0;
- tp->rx_opt.eff_sacks = 1;
- tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq;
- tp->selective_acks[0].end_seq =
- TCP_SKB_CB(skb)->end_seq;
- }
- __skb_queue_head(&tp->out_of_order_queue,skb);
- } else {
- struct sk_buff *skb1 = tp->out_of_order_queue.prev;
- u32 seq = TCP_SKB_CB(skb)->seq;
- u32 end_seq = TCP_SKB_CB(skb)->end_seq;
+static struct sk_buff *tcp_skb_next(struct sk_buff *skb, struct sk_buff_head *list)
+{
+ if (list)
+ return !skb_queue_is_last(list, skb) ? skb->next : NULL;
- if (seq == TCP_SKB_CB(skb1)->end_seq) {
- __skb_append(skb1, skb, &tp->out_of_order_queue);
+ return skb_rb_next(skb);
+}
- if (!tp->rx_opt.num_sacks ||
- tp->selective_acks[0].end_seq != seq)
- goto add_sack;
+static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
+ struct sk_buff_head *list,
+ struct rb_root *root)
+{
+ struct sk_buff *next = tcp_skb_next(skb, list);
- /* Common case: data arrive in order after hole. */
- tp->selective_acks[0].end_seq = end_seq;
- return;
- }
+ if (list)
+ __skb_unlink(skb, list);
+ else
+ rb_erase(&skb->rbnode, root);
- /* Find place to insert this segment. */
- do {
- if (!after(TCP_SKB_CB(skb1)->seq, seq))
- break;
- } while ((skb1 = skb1->prev) !=
- (struct sk_buff*)&tp->out_of_order_queue);
+ __kfree_skb(skb);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED);
- /* Do skb overlap to previous one? */
- if (skb1 != (struct sk_buff*)&tp->out_of_order_queue &&
- before(seq, TCP_SKB_CB(skb1)->end_seq)) {
- if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
- /* All the bits are present. Drop. */
- __kfree_skb(skb);
- tcp_dsack_set(tp, seq, end_seq);
- goto add_sack;
- }
- if (after(seq, TCP_SKB_CB(skb1)->seq)) {
- /* Partial overlap. */
- tcp_dsack_set(tp, seq, TCP_SKB_CB(skb1)->end_seq);
- } else {
- skb1 = skb1->prev;
- }
- }
- __skb_insert(skb, skb1, skb1->next, &tp->out_of_order_queue);
-
- /* And clean segments covered by new one as whole. */
- while ((skb1 = skb->next) !=
- (struct sk_buff*)&tp->out_of_order_queue &&
- after(end_seq, TCP_SKB_CB(skb1)->seq)) {
- if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
- tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, end_seq);
- break;
- }
- __skb_unlink(skb1, &tp->out_of_order_queue);
- tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, TCP_SKB_CB(skb1)->end_seq);
- __kfree_skb(skb1);
- }
+ return next;
+}
-add_sack:
- if (tp->rx_opt.sack_ok)
- tcp_sack_new_ofo_skb(sk, seq, end_seq);
+/* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq */
+void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent = NULL;
+ struct sk_buff *skb1;
+
+ while (*p) {
+ parent = *p;
+ skb1 = rb_to_skb(parent);
+ if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq))
+ p = &parent->rb_left;
+ else
+ p = &parent->rb_right;
}
+ rb_link_node(&skb->rbnode, parent, p);
+ rb_insert_color(&skb->rbnode, root);
}
/* Collapse contiguous sequence of skbs head..tail with
* sequence numbers start..end.
+ *
+ * If tail is NULL, this means until the end of the queue.
+ *
* Segments with FIN/SYN are not collapsed (only because this
* simplifies code)
*/
static void
-tcp_collapse(struct sock *sk, struct sk_buff_head *list,
- struct sk_buff *head, struct sk_buff *tail,
- u32 start, u32 end)
+tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root,
+ struct sk_buff *head, struct sk_buff *tail, u32 start, u32 end)
{
- struct sk_buff *skb;
+ struct sk_buff *skb = head, *n;
+ struct sk_buff_head tmp;
+ bool end_of_skbs;
/* First, check that queue is collapsible and find
- * the point where collapsing can be useful. */
- for (skb = head; skb != tail; ) {
+ * the point where collapsing can be useful.
+ */
+restart:
+ for (end_of_skbs = true; skb != NULL && skb != tail; skb = n) {
+ n = tcp_skb_next(skb, list);
+
+ if (!skb_frags_readable(skb))
+ goto skip_this;
+
/* No new bits? It is possible on ofo queue. */
if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
- struct sk_buff *next = skb->next;
- __skb_unlink(skb, list);
- __kfree_skb(skb);
- NET_INC_STATS_BH(LINUX_MIB_TCPRCVCOLLAPSED);
- skb = next;
- continue;
+ skb = tcp_collapse_one(sk, skb, list, root);
+ if (!skb)
+ break;
+ goto restart;
}
/* The first skb to collapse is:
* - not SYN/FIN and
* - bloated or contains data before "start" or
- * overlaps to the next one.
+ * overlaps to the next one and mptcp allow collapsing.
*/
- if (!skb->h.th->syn && !skb->h.th->fin &&
- (tcp_win_from_space(skb->truesize) > skb->len ||
- before(TCP_SKB_CB(skb)->seq, start) ||
- (skb->next != tail &&
- TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb->next)->seq)))
+ if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) &&
+ (tcp_win_from_space(sk, skb->truesize) > skb->len ||
+ before(TCP_SKB_CB(skb)->seq, start))) {
+ end_of_skbs = false;
+ break;
+ }
+
+ if (n && n != tail && skb_frags_readable(n) &&
+ tcp_skb_can_collapse_rx(skb, n) &&
+ TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) {
+ end_of_skbs = false;
break;
+ }
+skip_this:
/* Decided to skip this, advance start seq. */
start = TCP_SKB_CB(skb)->end_seq;
- skb = skb->next;
}
- if (skb == tail || skb->h.th->syn || skb->h.th->fin)
+ if (end_of_skbs ||
+ (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) ||
+ !skb_frags_readable(skb))
return;
+ __skb_queue_head_init(&tmp);
+
while (before(start, end)) {
+ int copy = min_t(int, SKB_MAX_ORDER(0, 0), end - start);
struct sk_buff *nskb;
- int header = skb_headroom(skb);
- int copy = SKB_MAX_ORDER(header, 0);
- /* Too big header? This can happen with IPv6. */
- if (copy < 0)
- return;
- if (end-start < copy)
- copy = end-start;
- nskb = alloc_skb(copy+header, GFP_ATOMIC);
+ nskb = alloc_skb(copy, GFP_ATOMIC);
if (!nskb)
- return;
- skb_reserve(nskb, header);
- memcpy(nskb->head, skb->head, header);
- nskb->nh.raw = nskb->head + (skb->nh.raw-skb->head);
- nskb->h.raw = nskb->head + (skb->h.raw-skb->head);
- nskb->mac.raw = nskb->head + (skb->mac.raw-skb->head);
+ break;
+
memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
+ skb_copy_decrypted(nskb, skb);
TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
- __skb_insert(nskb, skb->prev, skb, list);
- sk_stream_set_owner_r(nskb, sk);
+ if (list)
+ __skb_queue_before(list, skb, nskb);
+ else
+ __skb_queue_tail(&tmp, nskb); /* defer rbtree insertion */
+ skb_set_owner_r(nskb, sk);
+ mptcp_skb_ext_move(nskb, skb);
/* Copy data, releasing collapsed skbs. */
while (copy > 0) {
@@ -3428,16 +5643,19 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list,
start += size;
}
if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
- struct sk_buff *next = skb->next;
- __skb_unlink(skb, list);
- __kfree_skb(skb);
- NET_INC_STATS_BH(LINUX_MIB_TCPRCVCOLLAPSED);
- skb = next;
- if (skb == tail || skb->h.th->syn || skb->h.th->fin)
- return;
+ skb = tcp_collapse_one(sk, skb, list, root);
+ if (!skb ||
+ skb == tail ||
+ !tcp_skb_can_collapse_rx(nskb, skb) ||
+ (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) ||
+ !skb_frags_readable(skb))
+ goto end;
}
}
}
+end:
+ skb_queue_walk_safe(&tmp, skb, n)
+ tcp_rbtree_insert(root, skb);
}
/* Collapse ofo queue. Algorithm: select contiguous sequence of skbs
@@ -3446,40 +5664,108 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list,
static void tcp_collapse_ofo_queue(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
- struct sk_buff *skb = skb_peek(&tp->out_of_order_queue);
- struct sk_buff *head;
+ u32 range_truesize, sum_tiny = 0;
+ struct sk_buff *skb, *head;
u32 start, end;
- if (skb == NULL)
+ skb = skb_rb_first(&tp->out_of_order_queue);
+new_range:
+ if (!skb) {
+ tp->ooo_last_skb = skb_rb_last(&tp->out_of_order_queue);
return;
-
+ }
start = TCP_SKB_CB(skb)->seq;
end = TCP_SKB_CB(skb)->end_seq;
- head = skb;
+ range_truesize = skb->truesize;
- for (;;) {
- skb = skb->next;
+ for (head = skb;;) {
+ skb = skb_rb_next(skb);
- /* Segment is terminated when we see gap or when
- * we are at the end of all the queue. */
- if (skb == (struct sk_buff *)&tp->out_of_order_queue ||
+ /* Range is terminated when we see a gap or when
+ * we are at the queue end.
+ */
+ if (!skb ||
after(TCP_SKB_CB(skb)->seq, end) ||
before(TCP_SKB_CB(skb)->end_seq, start)) {
- tcp_collapse(sk, &tp->out_of_order_queue,
- head, skb, start, end);
- head = skb;
- if (skb == (struct sk_buff *)&tp->out_of_order_queue)
- break;
- /* Start new segment */
+ /* Do not attempt collapsing tiny skbs */
+ if (range_truesize != head->truesize ||
+ end - start >= SKB_WITH_OVERHEAD(PAGE_SIZE)) {
+ tcp_collapse(sk, NULL, &tp->out_of_order_queue,
+ head, skb, start, end);
+ } else {
+ sum_tiny += range_truesize;
+ if (sum_tiny > sk->sk_rcvbuf >> 3)
+ return;
+ }
+ goto new_range;
+ }
+
+ range_truesize += skb->truesize;
+ if (unlikely(before(TCP_SKB_CB(skb)->seq, start)))
start = TCP_SKB_CB(skb)->seq;
+ if (after(TCP_SKB_CB(skb)->end_seq, end))
end = TCP_SKB_CB(skb)->end_seq;
- } else {
- if (before(TCP_SKB_CB(skb)->seq, start))
- start = TCP_SKB_CB(skb)->seq;
- if (after(TCP_SKB_CB(skb)->end_seq, end))
- end = TCP_SKB_CB(skb)->end_seq;
+ }
+}
+
+/*
+ * Clean the out-of-order queue to make room.
+ * We drop high sequences packets to :
+ * 1) Let a chance for holes to be filled.
+ * This means we do not drop packets from ooo queue if their sequence
+ * is before incoming packet sequence.
+ * 2) not add too big latencies if thousands of packets sit there.
+ * (But if application shrinks SO_RCVBUF, we could still end up
+ * freeing whole queue here)
+ * 3) Drop at least 12.5 % of sk_rcvbuf to avoid malicious attacks.
+ *
+ * Return true if queue has shrunk.
+ */
+static bool tcp_prune_ofo_queue(struct sock *sk, const struct sk_buff *in_skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct rb_node *node, *prev;
+ bool pruned = false;
+ int goal;
+
+ if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
+ return false;
+
+ goal = sk->sk_rcvbuf >> 3;
+ node = &tp->ooo_last_skb->rbnode;
+
+ do {
+ struct sk_buff *skb = rb_to_skb(node);
+
+ /* If incoming skb would land last in ofo queue, stop pruning. */
+ if (after(TCP_SKB_CB(in_skb)->seq, TCP_SKB_CB(skb)->seq))
+ break;
+ pruned = true;
+ prev = rb_prev(node);
+ rb_erase(node, &tp->out_of_order_queue);
+ goal -= skb->truesize;
+ tcp_drop_reason(sk, skb, SKB_DROP_REASON_TCP_OFO_QUEUE_PRUNE);
+ tp->ooo_last_skb = rb_to_skb(prev);
+ if (!prev || goal <= 0) {
+ if (tcp_can_ingest(sk, in_skb) &&
+ !tcp_under_memory_pressure(sk))
+ break;
+ goal = sk->sk_rcvbuf >> 3;
}
+ node = prev;
+ } while (node);
+
+ if (pruned) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
+ /* Reset SACK state. A conforming SACK implementation will
+ * do the same at a timeout based retransmit. When a connection
+ * is in a sad state like this, we care only about integrity
+ * of the connection not performance.
+ */
+ if (tp->rx_opt.sack_ok)
+ tcp_sack_reset(&tp->rx_opt);
}
+ return pruned;
}
/* Reduce allocated memory if we can, trying to get
@@ -3489,144 +5775,125 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
* until the socket owning process reads some of the data
* to stabilize the situation.
*/
-static int tcp_prune_queue(struct sock *sk)
+static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb)
{
- struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* Do nothing if our queues are empty. */
+ if (!atomic_read(&sk->sk_rmem_alloc))
+ return -1;
- SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_PRUNECALLED);
- NET_INC_STATS_BH(LINUX_MIB_PRUNECALLED);
+ if (!tcp_can_ingest(sk, in_skb))
+ tcp_clamp_window(sk);
+ else if (tcp_under_memory_pressure(sk))
+ tcp_adjust_rcv_ssthresh(sk);
- if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
- tcp_clamp_window(sk, tp);
- else if (tcp_memory_pressure)
- tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
+ if (tcp_can_ingest(sk, in_skb))
+ return 0;
tcp_collapse_ofo_queue(sk);
- tcp_collapse(sk, &sk->sk_receive_queue,
- sk->sk_receive_queue.next,
- (struct sk_buff*)&sk->sk_receive_queue,
- tp->copied_seq, tp->rcv_nxt);
- sk_stream_mem_reclaim(sk);
+ if (!skb_queue_empty(&sk->sk_receive_queue))
+ tcp_collapse(sk, &sk->sk_receive_queue, NULL,
+ skb_peek(&sk->sk_receive_queue),
+ NULL,
+ tp->copied_seq, tp->rcv_nxt);
- if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
+ if (tcp_can_ingest(sk, in_skb))
return 0;
/* Collapsing did not help, destructive actions follow.
* This must not ever occur. */
- /* First, purge the out_of_order queue. */
- if (!skb_queue_empty(&tp->out_of_order_queue)) {
- NET_INC_STATS_BH(LINUX_MIB_OFOPRUNED);
- __skb_queue_purge(&tp->out_of_order_queue);
-
- /* Reset SACK state. A conforming SACK implementation will
- * do the same at a timeout based retransmit. When a connection
- * is in a sad state like this, we care only about integrity
- * of the connection not performance.
- */
- if (tp->rx_opt.sack_ok)
- tcp_sack_reset(&tp->rx_opt);
- sk_stream_mem_reclaim(sk);
- }
+ tcp_prune_ofo_queue(sk, in_skb);
- if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
+ if (tcp_can_ingest(sk, in_skb))
return 0;
/* If we are really being abused, tell the caller to silently
* drop receive data on the floor. It will get retransmitted
* and hopefully then we'll have sufficient space.
*/
- NET_INC_STATS_BH(LINUX_MIB_RCVPRUNED);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_RCVPRUNED);
/* Massive buffer overcommit. */
tp->pred_flags = 0;
return -1;
}
-
-/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto.
- * As additional protections, we do not touch cwnd in retransmission phases,
- * and if application hit its sndbuf limit recently.
- */
-void tcp_cwnd_application_limited(struct sock *sk)
+static bool tcp_should_expand_sndbuf(struct sock *sk)
{
- struct tcp_sock *tp = tcp_sk(sk);
-
- if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
- sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
- /* Limited by application or receiver window. */
- u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk));
- u32 win_used = max(tp->snd_cwnd_used, init_win);
- if (win_used < tp->snd_cwnd) {
- tp->snd_ssthresh = tcp_current_ssthresh(sk);
- tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1;
- }
- tp->snd_cwnd_used = 0;
- }
- tp->snd_cwnd_stamp = tcp_time_stamp;
-}
+ const struct tcp_sock *tp = tcp_sk(sk);
-static int tcp_should_expand_sndbuf(struct sock *sk, struct tcp_sock *tp)
-{
/* If the user specified a specific send buffer setting, do
* not modify it.
*/
if (sk->sk_userlocks & SOCK_SNDBUF_LOCK)
- return 0;
+ return false;
/* If we are under global TCP memory pressure, do not expand. */
- if (tcp_memory_pressure)
- return 0;
+ if (tcp_under_memory_pressure(sk)) {
+ int unused_mem = sk_unused_reserved_mem(sk);
+
+ /* Adjust sndbuf according to reserved mem. But make sure
+ * it never goes below SOCK_MIN_SNDBUF.
+ * See sk_stream_moderate_sndbuf() for more details.
+ */
+ if (unused_mem > SOCK_MIN_SNDBUF)
+ WRITE_ONCE(sk->sk_sndbuf, unused_mem);
+
+ return false;
+ }
/* If we are under soft global TCP memory pressure, do not expand. */
- if (atomic_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0])
- return 0;
+ if (sk_memory_allocated(sk) >= sk_prot_mem_limits(sk, 0))
+ return false;
/* If we filled the congestion window, do not expand. */
- if (tp->packets_out >= tp->snd_cwnd)
- return 0;
+ if (tcp_packets_in_flight(tp) >= tcp_snd_cwnd(tp))
+ return false;
- return 1;
+ return true;
}
-/* When incoming ACK allowed to free some skb from write_queue,
- * we remember this event in flag SOCK_QUEUE_SHRUNK and wake up socket
- * on the exit from tcp input handler.
- *
- * PROBLEM: sndbuf expansion does not work well with largesend.
- */
static void tcp_new_space(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
- if (tcp_should_expand_sndbuf(sk, tp)) {
- int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) +
- MAX_TCP_HEADER + 16 + sizeof(struct sk_buff),
- demanded = max_t(unsigned int, tp->snd_cwnd,
- tp->reordering + 1);
- sndmem *= 2*demanded;
- if (sndmem > sk->sk_sndbuf)
- sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
- tp->snd_cwnd_stamp = tcp_time_stamp;
+ if (tcp_should_expand_sndbuf(sk)) {
+ tcp_sndbuf_expand(sk);
+ tp->snd_cwnd_stamp = tcp_jiffies32;
}
- sk->sk_write_space(sk);
+ INDIRECT_CALL_1(sk->sk_write_space, sk_stream_write_space, sk);
}
-static void tcp_check_space(struct sock *sk)
+/* Caller made space either from:
+ * 1) Freeing skbs in rtx queues (after tp->snd_una has advanced)
+ * 2) Sent skbs from output queue (and thus advancing tp->snd_nxt)
+ *
+ * We might be able to generate EPOLLOUT to the application if:
+ * 1) Space consumed in output/rtx queues is below sk->sk_sndbuf/2
+ * 2) notsent amount (tp->write_seq - tp->snd_nxt) became
+ * small enough that tcp_stream_memory_free() decides it
+ * is time to generate EPOLLOUT.
+ */
+void tcp_check_space(struct sock *sk)
{
- if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) {
- sock_reset_flag(sk, SOCK_QUEUE_SHRUNK);
- if (sk->sk_socket &&
- test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
- tcp_new_space(sk);
+ /* pairs with tcp_poll() */
+ smp_mb();
+ if (sk->sk_socket &&
+ test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
+ tcp_new_space(sk);
+ if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
+ tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
}
}
-static inline void tcp_data_snd_check(struct sock *sk, struct tcp_sock *tp)
+static inline void tcp_data_snd_check(struct sock *sk)
{
- tcp_push_pending_frames(sk, tp);
+ tcp_push_pending_frames(sk);
tcp_check_space(sk);
}
@@ -3636,24 +5903,78 @@ static inline void tcp_data_snd_check(struct sock *sk, struct tcp_sock *tp)
static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
{
struct tcp_sock *tp = tcp_sk(sk);
+ struct net *net = sock_net(sk);
+ unsigned long rtt;
+ u64 delay;
/* More than one full frame received... */
- if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss
+ if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&
/* ... and right edge of window advances far enough.
- * (tcp_recvmsg() will send ACK otherwise). Or...
+ * (tcp_recvmsg() will send ACK otherwise).
+ * If application uses SO_RCVLOWAT, we want send ack now if
+ * we have not received enough bytes to satisfy the condition.
*/
- && __tcp_select_window(sk) >= tp->rcv_wnd) ||
+ (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat ||
+ __tcp_select_window(sk) >= tp->rcv_wnd)) ||
/* We ACK each frame or... */
tcp_in_quickack_mode(sk) ||
- /* We have out of order data. */
- (ofo_possible &&
- skb_peek(&tp->out_of_order_queue))) {
- /* Then ack it now */
+ /* Protocol state mandates a one-time immediate ACK */
+ inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOW) {
+ /* If we are running from __release_sock() in user context,
+ * Defer the ack until tcp_release_cb().
+ */
+ if (sock_owned_by_user_nocheck(sk) &&
+ READ_ONCE(net->ipv4.sysctl_tcp_backlog_ack_defer)) {
+ set_bit(TCP_ACK_DEFERRED, &sk->sk_tsq_flags);
+ return;
+ }
+send_now:
tcp_send_ack(sk);
- } else {
- /* Else, send delayed ack. */
+ return;
+ }
+
+ if (!ofo_possible || RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
tcp_send_delayed_ack(sk);
+ return;
+ }
+
+ if (!tcp_is_sack(tp) ||
+ tp->compressed_ack >= READ_ONCE(net->ipv4.sysctl_tcp_comp_sack_nr))
+ goto send_now;
+
+ if (tp->compressed_ack_rcv_nxt != tp->rcv_nxt) {
+ tp->compressed_ack_rcv_nxt = tp->rcv_nxt;
+ tp->dup_ack_counter = 0;
+ }
+ if (tp->dup_ack_counter < TCP_FASTRETRANS_THRESH) {
+ tp->dup_ack_counter++;
+ goto send_now;
}
+ tp->compressed_ack++;
+ if (hrtimer_is_queued(&tp->compressed_ack_timer))
+ return;
+
+ /* compress ack timer : comp_sack_rtt_percent of rtt,
+ * but no more than tcp_comp_sack_delay_ns.
+ */
+
+ rtt = tp->rcv_rtt_est.rtt_us;
+ if (tp->srtt_us && tp->srtt_us < rtt)
+ rtt = tp->srtt_us;
+
+ /* delay = (rtt >> 3) * NSEC_PER_USEC * comp_sack_rtt_percent / 100
+ * ->
+ * delay = rtt * 1.25 * comp_sack_rtt_percent
+ */
+ delay = (u64)(rtt + (rtt >> 2)) *
+ READ_ONCE(net->ipv4.sysctl_tcp_comp_sack_rtt_percent);
+
+ delay = min(delay, READ_ONCE(net->ipv4.sysctl_tcp_comp_sack_delay_ns));
+
+ sock_hold(sk);
+ hrtimer_start_range_ns(&tp->compressed_ack_timer, ns_to_ktime(delay),
+ READ_ONCE(net->ipv4.sysctl_tcp_comp_sack_slack_ns),
+ HRTIMER_MODE_REL_PINNED_SOFT);
}
static inline void tcp_ack_snd_check(struct sock *sk)
@@ -3674,13 +5995,13 @@ static inline void tcp_ack_snd_check(struct sock *sk)
* For 1003.1g we should support a new option TCP_STDURG to permit
* either form (or just set the sysctl tcp_stdurg).
*/
-
-static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
+
+static void tcp_check_urg(struct sock *sk, const struct tcphdr *th)
{
struct tcp_sock *tp = tcp_sk(sk);
u32 ptr = ntohs(th->urg_ptr);
- if (ptr && !sysctl_tcp_stdurg)
+ if (ptr && !READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_stdurg))
ptr--;
ptr += ntohl(th->seq);
@@ -3724,8 +6045,7 @@ static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
* buggy users.
*/
if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
- !sock_flag(sk, SOCK_URGINLINE) &&
- tp->copied_seq != tp->rcv_nxt) {
+ !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) {
struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
tp->copied_seq++;
if (skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)) {
@@ -3734,183 +6054,280 @@ static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
}
}
- tp->urg_data = TCP_URG_NOTYET;
- tp->urg_seq = ptr;
+ WRITE_ONCE(tp->urg_data, TCP_URG_NOTYET);
+ WRITE_ONCE(tp->urg_seq, ptr);
/* Disable header prediction. */
tp->pred_flags = 0;
}
/* This is the 'fast' part of urgent handling. */
-static void tcp_urg(struct sock *sk, struct sk_buff *skb, struct tcphdr *th)
+static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th)
{
struct tcp_sock *tp = tcp_sk(sk);
/* Check if we get a new urgent pointer - normally not. */
- if (th->urg)
- tcp_check_urg(sk,th);
+ if (unlikely(th->urg))
+ tcp_check_urg(sk, th);
/* Do we wait for any urgent data? - normally not... */
- if (tp->urg_data == TCP_URG_NOTYET) {
+ if (unlikely(tp->urg_data == TCP_URG_NOTYET)) {
u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff * 4) -
th->syn;
- /* Is the urgent pointer pointing into this packet? */
+ /* Is the urgent pointer pointing into this packet? */
if (ptr < skb->len) {
u8 tmp;
if (skb_copy_bits(skb, ptr, &tmp, 1))
BUG();
- tp->urg_data = TCP_URG_VALID | tmp;
+ WRITE_ONCE(tp->urg_data, TCP_URG_VALID | tmp);
if (!sock_flag(sk, SOCK_DEAD))
- sk->sk_data_ready(sk, 0);
+ sk->sk_data_ready(sk);
}
}
}
-static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen)
+/* Accept RST for rcv_nxt - 1 after a FIN.
+ * When tcp connections are abruptly terminated from Mac OSX (via ^C), a
+ * FIN is sent followed by a RST packet. The RST is sent with the same
+ * sequence number as the FIN, and thus according to RFC 5961 a challenge
+ * ACK should be sent. However, Mac OSX rate limits replies to challenge
+ * ACKs on the closed socket. In addition middleboxes can drop either the
+ * challenge ACK or a subsequent RST.
+ */
+static bool tcp_reset_check(const struct sock *sk, const struct sk_buff *skb)
{
- struct tcp_sock *tp = tcp_sk(sk);
- int chunk = skb->len - hlen;
- int err;
-
- local_bh_enable();
- if (skb->ip_summed==CHECKSUM_UNNECESSARY)
- err = skb_copy_datagram_iovec(skb, hlen, tp->ucopy.iov, chunk);
- else
- err = skb_copy_and_csum_datagram_iovec(skb, hlen,
- tp->ucopy.iov);
-
- if (!err) {
- tp->ucopy.len -= chunk;
- tp->copied_seq += chunk;
- tcp_rcv_space_adjust(sk);
- }
+ const struct tcp_sock *tp = tcp_sk(sk);
- local_bh_disable();
- return err;
+ return unlikely(TCP_SKB_CB(skb)->seq == (tp->rcv_nxt - 1) &&
+ (1 << sk->sk_state) & (TCPF_CLOSE_WAIT | TCPF_LAST_ACK |
+ TCPF_CLOSING));
}
-static __sum16 __tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb)
+/* Does PAWS and seqno based validation of an incoming segment, flags will
+ * play significant role here.
+ */
+static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
+ const struct tcphdr *th, int syn_inerr)
{
- __sum16 result;
-
- if (sock_owned_by_user(sk)) {
- local_bh_enable();
- result = __tcp_checksum_complete(skb);
- local_bh_disable();
- } else {
- result = __tcp_checksum_complete(skb);
+ struct tcp_sock *tp = tcp_sk(sk);
+ bool accecn_reflector = false;
+ SKB_DR(reason);
+
+ /* RFC1323: H1. Apply PAWS check first. */
+ if (!tcp_fast_parse_options(sock_net(sk), skb, th, tp) ||
+ !tp->rx_opt.saw_tstamp ||
+ tcp_paws_check(&tp->rx_opt, TCP_PAWS_WINDOW))
+ goto step1;
+
+ reason = tcp_disordered_ack_check(sk, skb);
+ if (!reason)
+ goto step1;
+ /* Reset is accepted even if it did not pass PAWS. */
+ if (th->rst)
+ goto step1;
+ if (unlikely(th->syn))
+ goto syn_challenge;
+
+ /* Old ACK are common, increment PAWS_OLD_ACK
+ * and do not send a dupack.
+ */
+ if (reason == SKB_DROP_REASON_TCP_RFC7323_PAWS_ACK) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWS_OLD_ACK);
+ goto discard;
}
- return result;
-}
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
+ if (!tcp_oow_rate_limited(sock_net(sk), skb,
+ LINUX_MIB_TCPACKSKIPPEDPAWS,
+ &tp->last_oow_ack_time))
+ tcp_send_dupack(sk, skb);
+ goto discard;
-static inline int tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb)
-{
- return skb->ip_summed != CHECKSUM_UNNECESSARY &&
- __tcp_checksum_complete_user(sk, skb);
-}
+step1:
+ /* Step 1: check sequence number */
+ reason = tcp_sequence(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
+ if (reason) {
+ /* RFC793, page 37: "In all states except SYN-SENT, all reset
+ * (RST) segments are validated by checking their SEQ-fields."
+ * And page 69: "If an incoming segment is not acceptable,
+ * an acknowledgment should be sent in reply (unless the RST
+ * bit is set, if so drop the segment and return)".
+ */
+ if (!th->rst) {
+ if (th->syn)
+ goto syn_challenge;
+
+ if (reason == SKB_DROP_REASON_TCP_INVALID_SEQUENCE ||
+ reason == SKB_DROP_REASON_TCP_INVALID_END_SEQUENCE)
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_BEYOND_WINDOW);
+ if (!tcp_oow_rate_limited(sock_net(sk), skb,
+ LINUX_MIB_TCPACKSKIPPEDSEQ,
+ &tp->last_oow_ack_time))
+ tcp_send_dupack(sk, skb);
+ } else if (tcp_reset_check(sk, skb)) {
+ goto reset;
+ }
+ goto discard;
+ }
-#ifdef CONFIG_NET_DMA
-static int tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, int hlen)
-{
- struct tcp_sock *tp = tcp_sk(sk);
- int chunk = skb->len - hlen;
- int dma_cookie;
- int copied_early = 0;
+ /* Step 2: check RST bit */
+ if (th->rst) {
+ /* RFC 5961 3.2 (extend to match against (RCV.NXT - 1) after a
+ * FIN and SACK too if available):
+ * If seq num matches RCV.NXT or (RCV.NXT - 1) after a FIN, or
+ * the right-most SACK block,
+ * then
+ * RESET the connection
+ * else
+ * Send a challenge ACK
+ */
+ if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt ||
+ tcp_reset_check(sk, skb))
+ goto reset;
+
+ if (tcp_is_sack(tp) && tp->rx_opt.num_sacks > 0) {
+ struct tcp_sack_block *sp = &tp->selective_acks[0];
+ int max_sack = sp[0].end_seq;
+ int this_sack;
+
+ for (this_sack = 1; this_sack < tp->rx_opt.num_sacks;
+ ++this_sack) {
+ max_sack = after(sp[this_sack].end_seq,
+ max_sack) ?
+ sp[this_sack].end_seq : max_sack;
+ }
- if (tp->ucopy.wakeup)
- return 0;
+ if (TCP_SKB_CB(skb)->seq == max_sack)
+ goto reset;
+ }
- if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
- tp->ucopy.dma_chan = get_softnet_dma();
+ /* Disable TFO if RST is out-of-order
+ * and no data has been received
+ * for current active TFO socket
+ */
+ if (tp->syn_fastopen && !tp->data_segs_in &&
+ sk->sk_state == TCP_ESTABLISHED)
+ tcp_fastopen_active_disable(sk);
+ tcp_send_challenge_ack(sk, false);
+ SKB_DR_SET(reason, TCP_RESET);
+ goto discard;
+ }
- if (tp->ucopy.dma_chan && skb->ip_summed == CHECKSUM_UNNECESSARY) {
+ /* step 3: check security and precedence [ignored] */
- dma_cookie = dma_skb_copy_datagram_iovec(tp->ucopy.dma_chan,
- skb, hlen, tp->ucopy.iov, chunk, tp->ucopy.pinned_list);
+ /* step 4: Check for a SYN
+ * RFC 5961 4.2 : Send a challenge ack
+ */
+ if (th->syn) {
+ if (tcp_ecn_mode_accecn(tp)) {
+ accecn_reflector = true;
+ if (tp->rx_opt.accecn &&
+ tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) {
+ u8 saw_opt = tcp_accecn_option_init(skb, tp->rx_opt.accecn);
+
+ tcp_accecn_saw_opt_fail_recv(tp, saw_opt);
+ tcp_accecn_opt_demand_min(sk, 1);
+ }
+ }
+ if (sk->sk_state == TCP_SYN_RECV && sk->sk_socket && th->ack &&
+ TCP_SKB_CB(skb)->seq + 1 == TCP_SKB_CB(skb)->end_seq &&
+ TCP_SKB_CB(skb)->seq + 1 == tp->rcv_nxt &&
+ TCP_SKB_CB(skb)->ack_seq == tp->snd_nxt)
+ goto pass;
+syn_challenge:
+ if (syn_inerr)
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE);
+ tcp_send_challenge_ack(sk, accecn_reflector);
+ SKB_DR_SET(reason, TCP_INVALID_SYN);
+ goto discard;
+ }
- if (dma_cookie < 0)
- goto out;
+pass:
+ bpf_skops_parse_hdr(sk, skb);
- tp->ucopy.dma_cookie = dma_cookie;
- copied_early = 1;
+ return true;
- tp->ucopy.len -= chunk;
- tp->copied_seq += chunk;
- tcp_rcv_space_adjust(sk);
+discard:
+ tcp_drop_reason(sk, skb, reason);
+ return false;
- if ((tp->ucopy.len == 0) ||
- (tcp_flag_word(skb->h.th) & TCP_FLAG_PSH) ||
- (atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1))) {
- tp->ucopy.wakeup = 1;
- sk->sk_data_ready(sk, 0);
- }
- } else if (chunk > 0) {
- tp->ucopy.wakeup = 1;
- sk->sk_data_ready(sk, 0);
- }
-out:
- return copied_early;
+reset:
+ tcp_reset(sk, skb);
+ __kfree_skb(skb);
+ return false;
}
-#endif /* CONFIG_NET_DMA */
/*
- * TCP receive function for the ESTABLISHED state.
+ * TCP receive function for the ESTABLISHED state.
*
- * It is split into a fast path and a slow path. The fast path is
+ * It is split into a fast path and a slow path. The fast path is
* disabled when:
* - A zero window was announced from us - zero window probing
- * is only handled properly in the slow path.
+ * is only handled properly in the slow path.
* - Out of order segments arrived.
* - Urgent data is expected.
* - There is no buffer space left
* - Unexpected TCP flags/window values/header lengths are received
- * (detected by checking the TCP header against pred_flags)
+ * (detected by checking the TCP header against pred_flags)
* - Data is sent in both directions. Fast path only supports pure senders
* or pure receivers (this means either the sequence number or the ack
* value must stay constant)
* - Unexpected TCP option.
*
- * When these conditions are not satisfied it drops into a standard
+ * When these conditions are not satisfied it drops into a standard
* receive procedure patterned after RFC793 to handle all cases.
* The first three cases are guaranteed by proper pred_flags setting,
- * the rest is checked inline. Fast processing is turned on in
+ * the rest is checked inline. Fast processing is turned on in
* tcp_data_queue when everything is OK.
*/
-int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
- struct tcphdr *th, unsigned len)
+void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
{
+ enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
+ const struct tcphdr *th = (const struct tcphdr *)skb->data;
struct tcp_sock *tp = tcp_sk(sk);
+ unsigned int len = skb->len;
+ /* TCP congestion window tracking */
+ trace_tcp_probe(sk, skb);
+
+ tcp_mstamp_refresh(tp);
+ if (unlikely(!rcu_access_pointer(sk->sk_rx_dst)))
+ inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb);
/*
* Header prediction.
- * The code loosely follows the one in the famous
+ * The code loosely follows the one in the famous
* "30 instruction TCP receive" Van Jacobson mail.
- *
- * Van's trick is to deposit buffers into socket queue
+ *
+ * Van's trick is to deposit buffers into socket queue
* on a device interrupt, to call tcp_recv function
* on the receive process context and checksum and copy
* the buffer to user space. smart...
*
- * Our current scheme is not silly either but we take the
+ * Our current scheme is not silly either but we take the
* extra cost of the net_bh soft interrupt processing...
* We do checksum and copy also but from device to kernel.
*/
tp->rx_opt.saw_tstamp = 0;
+ tp->rx_opt.accecn = 0;
/* pred_flags is 0xS?10 << 16 + snd_wnd
* if header_prediction is to be made
* 'S' will always be tp->tcp_header_len >> 2
* '?' will be 0 for the fast path, otherwise pred_flags is 0 to
- * turn it off (when there are holes in the receive
+ * turn it off (when there are holes in the receive
* space for instance)
* PSH flag is ignored.
*/
if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags &&
- TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
+ TCP_SKB_CB(skb)->seq == tp->rcv_nxt &&
+ !after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
int tcp_header_len = tp->tcp_header_len;
+ s32 delta = 0;
+ int flag = 0;
/* Timestamp header prediction: tcp_header_len
* is automatically equal to th->doff*4 due to pred_flags
@@ -3919,21 +6336,14 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
/* Check timestamp */
if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) {
- __be32 *ptr = (__be32 *)(th + 1);
-
/* No? Slow path! */
- if (*ptr != htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
- | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP))
+ if (!tcp_parse_aligned_timestamp(tp, th))
goto slow_path;
- tp->rx_opt.saw_tstamp = 1;
- ++ptr;
- tp->rx_opt.rcv_tsval = ntohl(*ptr);
- ++ptr;
- tp->rx_opt.rcv_tsecr = ntohl(*ptr);
-
+ delta = tp->rx_opt.rcv_tsval -
+ tp->rx_opt.ts_recent;
/* If PAWS failed, check it more carefully in slow path */
- if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0)
+ if (delta < 0)
goto slow_path;
/* DO NOT update ts_recent here, if checksum fails
@@ -3953,164 +6363,108 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
if (tcp_header_len ==
(sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
tp->rcv_nxt == tp->rcv_wup)
- tcp_store_ts_recent(tp);
+ flag |= __tcp_replace_ts_recent(tp,
+ delta);
+
+ tcp_ecn_received_counters(sk, skb, 0);
/* We know that such packets are checksummed
* on entry.
*/
- tcp_ack(sk, skb, 0);
- __kfree_skb(skb);
- tcp_data_snd_check(sk, tp);
- return 0;
+ tcp_ack(sk, skb, flag);
+ __kfree_skb(skb);
+ tcp_data_snd_check(sk);
+ /* When receiving pure ack in fast path, update
+ * last ts ecr directly instead of calling
+ * tcp_rcv_rtt_measure_ts()
+ */
+ tp->rcv_rtt_last_tsecr = tp->rx_opt.rcv_tsecr;
+ return;
} else { /* Header too small */
- TCP_INC_STATS_BH(TCP_MIB_INERRS);
+ reason = SKB_DROP_REASON_PKT_TOO_SMALL;
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
goto discard;
}
} else {
int eaten = 0;
- int copied_early = 0;
-
- if (tp->copied_seq == tp->rcv_nxt &&
- len - tcp_header_len <= tp->ucopy.len) {
-#ifdef CONFIG_NET_DMA
- if (tcp_dma_try_early_copy(sk, skb, tcp_header_len)) {
- copied_early = 1;
- eaten = 1;
- }
-#endif
- if (tp->ucopy.task == current && sock_owned_by_user(sk) && !copied_early) {
- __set_current_state(TASK_RUNNING);
+ bool fragstolen = false;
- if (!tcp_copy_to_iovec(sk, skb, tcp_header_len))
- eaten = 1;
- }
- if (eaten) {
- /* Predicted packet is in window by definition.
- * seq == rcv_nxt and rcv_wup <= rcv_nxt.
- * Hence, check seq<=rcv_wup reduces to:
- */
- if (tcp_header_len ==
- (sizeof(struct tcphdr) +
- TCPOLEN_TSTAMP_ALIGNED) &&
- tp->rcv_nxt == tp->rcv_wup)
- tcp_store_ts_recent(tp);
-
- tcp_rcv_rtt_measure_ts(sk, skb);
-
- __skb_pull(skb, tcp_header_len);
- tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
- NET_INC_STATS_BH(LINUX_MIB_TCPHPHITSTOUSER);
- }
- if (copied_early)
- tcp_cleanup_rbuf(sk, skb->len);
- }
- if (!eaten) {
- if (tcp_checksum_complete_user(sk, skb))
- goto csum_error;
+ if (tcp_checksum_complete(skb))
+ goto csum_error;
- /* Predicted packet is in window by definition.
- * seq == rcv_nxt and rcv_wup <= rcv_nxt.
- * Hence, check seq<=rcv_wup reduces to:
- */
- if (tcp_header_len ==
- (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
- tp->rcv_nxt == tp->rcv_wup)
- tcp_store_ts_recent(tp);
+ if (after(TCP_SKB_CB(skb)->end_seq,
+ tp->rcv_nxt + tcp_receive_window(tp)))
+ goto validate;
- tcp_rcv_rtt_measure_ts(sk, skb);
+ if ((int)skb->truesize > sk->sk_forward_alloc)
+ goto step5;
- if ((int)skb->truesize > sk->sk_forward_alloc)
- goto step5;
+ /* Predicted packet is in window by definition.
+ * seq == rcv_nxt and rcv_wup <= rcv_nxt.
+ * Hence, check seq<=rcv_wup reduces to:
+ */
+ if (tcp_header_len ==
+ (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
+ tp->rcv_nxt == tp->rcv_wup)
+ flag |= __tcp_replace_ts_recent(tp,
+ delta);
- NET_INC_STATS_BH(LINUX_MIB_TCPHPHITS);
+ tcp_rcv_rtt_measure_ts(sk, skb);
- /* Bulk data transfer: receiver */
- __skb_pull(skb,tcp_header_len);
- __skb_queue_tail(&sk->sk_receive_queue, skb);
- sk_stream_set_owner_r(skb, sk);
- tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
- }
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS);
- tcp_event_data_recv(sk, tp, skb);
+ /* Bulk data transfer: receiver */
+ tcp_cleanup_skb(skb);
+ __skb_pull(skb, tcp_header_len);
+ tcp_ecn_received_counters(sk, skb,
+ len - tcp_header_len);
+ eaten = tcp_queue_rcv(sk, skb, &fragstolen);
+
+ tcp_event_data_recv(sk, skb);
if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {
/* Well, only one small jumplet in fast path... */
- tcp_ack(sk, skb, FLAG_DATA);
- tcp_data_snd_check(sk, tp);
+ tcp_ack(sk, skb, flag | FLAG_DATA);
+ tcp_data_snd_check(sk);
if (!inet_csk_ack_scheduled(sk))
goto no_ack;
+ } else {
+ tcp_update_wl(tp, TCP_SKB_CB(skb)->seq);
}
__tcp_ack_snd_check(sk, 0);
no_ack:
-#ifdef CONFIG_NET_DMA
- if (copied_early)
- __skb_queue_tail(&sk->sk_async_wait_queue, skb);
- else
-#endif
if (eaten)
- __kfree_skb(skb);
- else
- sk->sk_data_ready(sk, 0);
- return 0;
+ kfree_skb_partial(skb, fragstolen);
+ tcp_data_ready(sk);
+ return;
}
}
slow_path:
- if (len < (th->doff<<2) || tcp_checksum_complete_user(sk, skb))
+ if (len < (th->doff << 2) || tcp_checksum_complete(skb))
goto csum_error;
- /*
- * RFC1323: H1. Apply PAWS check first.
- */
- if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp &&
- tcp_paws_discard(sk, skb)) {
- if (!th->rst) {
- NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED);
- tcp_send_dupack(sk, skb);
- goto discard;
- }
- /* Resets are accepted even if PAWS failed.
-
- ts_recent update must be made after we are sure
- that the packet is in window.
- */
+ if (!th->ack && !th->rst && !th->syn) {
+ reason = SKB_DROP_REASON_TCP_FLAGS;
+ goto discard;
}
/*
* Standard slow path.
*/
+validate:
+ if (!tcp_validate_incoming(sk, skb, th, 1))
+ return;
- if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
- /* RFC793, page 37: "In all states except SYN-SENT, all reset
- * (RST) segments are validated by checking their SEQ-fields."
- * And page 69: "If an incoming segment is not acceptable,
- * an acknowledgment should be sent in reply (unless the RST bit
- * is set, if so drop the segment and return)".
- */
- if (!th->rst)
- tcp_send_dupack(sk, skb);
- goto discard;
- }
+step5:
+ tcp_ecn_received_counters_payload(sk, skb);
- if(th->rst) {
- tcp_reset(sk);
+ reason = tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT);
+ if ((int)reason < 0) {
+ reason = -reason;
goto discard;
}
-
- tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
-
- if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
- TCP_INC_STATS_BH(TCP_MIB_INERRS);
- NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN);
- tcp_reset(sk);
- return 1;
- }
-
-step5:
- if(th->ack)
- tcp_ack(sk, skb, FLAG_SLOWPATH);
-
tcp_rcv_rtt_measure_ts(sk, skb);
/* Process urgent data. */
@@ -4119,26 +6473,181 @@ step5:
/* step 7: process the segment text */
tcp_data_queue(sk, skb);
- tcp_data_snd_check(sk, tp);
+ tcp_data_snd_check(sk);
tcp_ack_snd_check(sk);
- return 0;
+ return;
csum_error:
- TCP_INC_STATS_BH(TCP_MIB_INERRS);
+ reason = SKB_DROP_REASON_TCP_CSUM;
+ trace_tcp_bad_csum(skb);
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
discard:
- __kfree_skb(skb);
- return 0;
+ tcp_drop_reason(sk, skb, reason);
}
+EXPORT_IPV6_MOD(tcp_rcv_established);
-static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
- struct tcphdr *th, unsigned len)
+void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ tcp_mtup_init(sk);
+ icsk->icsk_af_ops->rebuild_header(sk);
+ tcp_init_metrics(sk);
+
+ /* Initialize the congestion window to start the transfer.
+ * Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been
+ * retransmitted. In light of RFC6298 more aggressive 1sec
+ * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK
+ * retransmission has occurred.
+ */
+ if (tp->total_retrans > 1 && tp->undo_marker)
+ tcp_snd_cwnd_set(tp, 1);
+ else
+ tcp_snd_cwnd_set(tp, tcp_init_cwnd(tp, __sk_dst_get(sk)));
+ tp->snd_cwnd_stamp = tcp_jiffies32;
+
+ bpf_skops_established(sk, bpf_op, skb);
+ /* Initialize congestion control unless BPF initialized it already: */
+ if (!icsk->icsk_ca_initialized)
+ tcp_init_congestion_control(sk);
+ tcp_init_buffer_space(sk);
+}
+
+void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ tcp_ao_finish_connect(sk, skb);
+ tcp_set_state(sk, TCP_ESTABLISHED);
+ icsk->icsk_ack.lrcvtime = tcp_jiffies32;
+
+ if (skb) {
+ icsk->icsk_af_ops->sk_rx_dst_set(sk, skb);
+ security_inet_conn_established(sk, skb);
+ sk_mark_napi_id(sk, skb);
+ }
+
+ tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB, skb);
+
+ /* Prevent spurious tcp_cwnd_restart() on first data
+ * packet.
+ */
+ tp->lsndtime = tcp_jiffies32;
+
+ if (sock_flag(sk, SOCK_KEEPOPEN))
+ tcp_reset_keepalive_timer(sk, keepalive_time_when(tp));
+
+ if (!tp->rx_opt.snd_wscale)
+ __tcp_fast_path_on(tp, tp->snd_wnd);
+ else
+ tp->pred_flags = 0;
+}
+
+static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
+ struct tcp_fastopen_cookie *cookie)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *data = tp->syn_data ? tcp_rtx_queue_head(sk) : NULL;
+ u16 mss = tp->rx_opt.mss_clamp, try_exp = 0;
+ bool syn_drop = false;
+
+ if (mss == READ_ONCE(tp->rx_opt.user_mss)) {
+ struct tcp_options_received opt;
+
+ /* Get original SYNACK MSS value if user MSS sets mss_clamp */
+ tcp_clear_options(&opt);
+ opt.user_mss = opt.mss_clamp = 0;
+ tcp_parse_options(sock_net(sk), synack, &opt, 0, NULL);
+ mss = opt.mss_clamp;
+ }
+
+ if (!tp->syn_fastopen) {
+ /* Ignore an unsolicited cookie */
+ cookie->len = -1;
+ } else if (tp->total_retrans) {
+ /* SYN timed out and the SYN-ACK neither has a cookie nor
+ * acknowledges data. Presumably the remote received only
+ * the retransmitted (regular) SYNs: either the original
+ * SYN-data or the corresponding SYN-ACK was dropped.
+ */
+ syn_drop = (cookie->len < 0 && data);
+ } else if (cookie->len < 0 && !tp->syn_data) {
+ /* We requested a cookie but didn't get it. If we did not use
+ * the (old) exp opt format then try so next time (try_exp=1).
+ * Otherwise we go back to use the RFC7413 opt (try_exp=2).
+ */
+ try_exp = tp->syn_fastopen_exp ? 2 : 1;
+ }
+
+ tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp);
+
+ if (data) { /* Retransmit unacked data in SYN */
+ if (tp->total_retrans)
+ tp->fastopen_client_fail = TFO_SYN_RETRANSMITTED;
+ else
+ tp->fastopen_client_fail = TFO_DATA_NOT_ACKED;
+ skb_rbtree_walk_from(data)
+ tcp_mark_skb_lost(sk, data);
+ tcp_non_congestion_loss_retransmit(sk);
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPFASTOPENACTIVEFAIL);
+ return true;
+ }
+ tp->syn_data_acked = tp->syn_data;
+ if (tp->syn_data_acked) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE);
+ /* SYN-data is counted as two separate packets in tcp_ack() */
+ if (tp->delivered > 1)
+ --tp->delivered;
+ }
+
+ tcp_fastopen_add_skb(sk, synack);
+
+ return false;
+}
+
+static void smc_check_reset_syn(struct tcp_sock *tp)
+{
+#if IS_ENABLED(CONFIG_SMC)
+ if (static_branch_unlikely(&tcp_have_smc)) {
+ if (tp->syn_smc && !tp->rx_opt.smc_ok)
+ tp->syn_smc = 0;
+ }
+#endif
+}
+
+static void tcp_try_undo_spurious_syn(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
+ u32 syn_stamp;
+
+ /* undo_marker is set when SYN or SYNACK times out. The timeout is
+ * spurious if the ACK's timestamp option echo value matches the
+ * original SYN timestamp.
+ */
+ syn_stamp = tp->retrans_stamp;
+ if (tp->undo_marker && syn_stamp && tp->rx_opt.saw_tstamp &&
+ syn_stamp == tp->rx_opt.rcv_tsecr)
+ tp->undo_marker = 0;
+}
+
+static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
+ const struct tcphdr *th)
+{
struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_fastopen_cookie foc = { .len = -1 };
int saved_clamp = tp->rx_opt.mss_clamp;
+ bool fastopen_fail;
+ SKB_DR(reason);
- tcp_parse_options(skb, &tp->rx_opt, 0);
+ tcp_parse_options(sock_net(sk), skb, &tp->rx_opt, 0, &foc);
+ if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
+ tp->rx_opt.rcv_tsecr -= tp->tsoffset;
if (th->ack) {
/* rfc793:
@@ -4148,17 +6657,23 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
* If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send
* a reset (unless the RST bit is set, if so drop
* the segment and return)"
- *
- * We do not send data with SYN, so that RFC-correct
- * test reduces to:
*/
- if (TCP_SKB_CB(skb)->ack_seq != tp->snd_nxt)
+ if (!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_una) ||
+ after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
+ /* Previous FIN/ACK or RST/ACK might be ignored. */
+ if (icsk->icsk_retransmits == 0)
+ tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+ TCP_TIMEOUT_MIN, false);
+ SKB_DR_SET(reason, TCP_INVALID_ACK_SEQUENCE);
goto reset_and_undo;
+ }
if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
!between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp,
- tcp_time_stamp)) {
- NET_INC_STATS_BH(LINUX_MIB_PAWSACTIVEREJECTED);
+ tcp_time_stamp_ts(tp))) {
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_PAWSACTIVEREJECTED);
+ SKB_DR_SET(reason, TCP_RFC7323_PAWS);
goto reset_and_undo;
}
@@ -4171,8 +6686,10 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
*/
if (th->rst) {
- tcp_reset(sk);
- goto discard;
+ tcp_reset(sk, skb);
+consume:
+ __kfree_skb(skb);
+ return 0;
}
/* rfc793:
@@ -4182,9 +6699,10 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
* See note below!
* --ANK(990513)
*/
- if (!th->syn)
+ if (!th->syn) {
+ SKB_DR_SET(reason, TCP_FLAGS);
goto discard_and_undo;
-
+ }
/* rfc793:
* "If the SYN bit is on ...
* are acceptable then ...
@@ -4192,26 +6710,29 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
* state to ESTABLISHED..."
*/
- TCP_ECN_rcv_synack(tp, th);
+ if (tcp_ecn_mode_any(tp))
+ tcp_ecn_rcv_synack(sk, skb, th,
+ TCP_SKB_CB(skb)->ip_dsfield);
- tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
+ tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
+ tcp_try_undo_spurious_syn(sk);
tcp_ack(sk, skb, FLAG_SLOWPATH);
/* Ok.. it's good. Set up sequence numbers and
* move to established.
*/
- tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
+ WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1);
tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
/* RFC1323: The window in SYN & SYN/ACK segments is
* never scaled.
*/
tp->snd_wnd = ntohs(th->window);
- tcp_init_wl(tp, TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(skb)->seq);
if (!tp->rx_opt.wscale_ok) {
tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0;
- tp->window_clamp = min(tp->window_clamp, 65535U);
+ WRITE_ONCE(tp->window_clamp,
+ min(tp->window_clamp, 65535U));
}
if (tp->rx_opt.saw_tstamp) {
@@ -4224,52 +6745,32 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
tp->tcp_header_len = sizeof(struct tcphdr);
}
- if (tp->rx_opt.sack_ok && sysctl_tcp_fack)
- tp->rx_opt.sack_ok |= 2;
-
- tcp_mtup_init(sk);
tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
tcp_initialize_rcv_mss(sk);
/* Remember, tcp_poll() does not lock socket!
* Change state from SYN-SENT only after copied_seq
* is initialized. */
- tp->copied_seq = tp->rcv_nxt;
- mb();
- tcp_set_state(sk, TCP_ESTABLISHED);
-
- security_inet_conn_established(sk, skb);
-
- /* Make sure socket is routed, for correct metrics. */
- icsk->icsk_af_ops->rebuild_header(sk);
-
- tcp_init_metrics(sk);
+ WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
- tcp_init_congestion_control(sk);
-
- /* Prevent spurious tcp_cwnd_restart() on first data
- * packet.
- */
- tp->lsndtime = tcp_time_stamp;
+ smc_check_reset_syn(tp);
- tcp_init_buffer_space(sk);
+ smp_mb();
- if (sock_flag(sk, SOCK_KEEPOPEN))
- inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));
+ tcp_finish_connect(sk, skb);
- if (!tp->rx_opt.snd_wscale)
- __tcp_fast_path_on(tp, tp->snd_wnd);
- else
- tp->pred_flags = 0;
+ fastopen_fail = (tp->syn_fastopen || tp->syn_data) &&
+ tcp_rcv_fastopen_synack(sk, skb, &foc);
if (!sock_flag(sk, SOCK_DEAD)) {
sk->sk_state_change(sk);
- sk_wake_async(sk, 0, POLL_OUT);
+ sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
}
-
+ if (fastopen_fail)
+ return -1;
if (sk->sk_write_pending ||
- icsk->icsk_accept_queue.rskq_defer_accept ||
- icsk->icsk_ack.pingpong) {
+ READ_ONCE(icsk->icsk_accept_queue.rskq_defer_accept) ||
+ inet_csk_in_pingpong_mode(sk)) {
/* Save one ACK. Data will be ready after
* several ticks, if write_pending is set.
*
@@ -4278,19 +6779,12 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
* to stand against the temptation 8) --ANK
*/
inet_csk_schedule_ack(sk);
- icsk->icsk_ack.lrcvtime = tcp_time_stamp;
- icsk->icsk_ack.ato = TCP_ATO_MIN;
- tcp_incr_quickack(sk);
- tcp_enter_quickack_mode(sk);
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
- TCP_DELACK_MAX, TCP_RTO_MAX);
-
-discard:
- __kfree_skb(skb);
- return 0;
- } else {
- tcp_send_ack(sk);
+ tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
+ tcp_reset_xmit_timer(sk, ICSK_TIME_DACK,
+ TCP_DELACK_MAX, false);
+ goto consume;
}
+ tcp_send_ack_reflect_ect(sk, tcp_ecn_mode_accecn(tp));
return -1;
}
@@ -4302,19 +6796,31 @@ discard:
*
* Otherwise (no ACK) drop the segment and return."
*/
-
+ SKB_DR_SET(reason, TCP_RESET);
goto discard_and_undo;
}
/* PAWS check. */
- if (tp->rx_opt.ts_recent_stamp && tp->rx_opt.saw_tstamp && tcp_paws_check(&tp->rx_opt, 0))
+ if (tp->rx_opt.ts_recent_stamp && tp->rx_opt.saw_tstamp &&
+ tcp_paws_reject(&tp->rx_opt, 0)) {
+ SKB_DR_SET(reason, TCP_RFC7323_PAWS);
goto discard_and_undo;
-
+ }
if (th->syn) {
/* We see SYN without ACK. It is attempt of
* simultaneous connect with crossed SYNs.
* Particularly, it can be connect to self.
*/
+#ifdef CONFIG_TCP_AO
+ struct tcp_ao_info *ao;
+
+ ao = rcu_dereference_protected(tp->ao_info,
+ lockdep_sock_is_held(sk));
+ if (ao) {
+ WRITE_ONCE(ao->risn, th->seq);
+ ao->rcv_sne = 0;
+ }
+#endif
tcp_set_state(sk, TCP_SYN_RECV);
if (tp->rx_opt.saw_tstamp) {
@@ -4326,7 +6832,8 @@ discard:
tp->tcp_header_len = sizeof(struct tcphdr);
}
- tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
+ WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1);
+ WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
/* RFC1323: The window in SYN & SYN/ACK segments is
@@ -4336,17 +6843,18 @@ discard:
tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
tp->max_window = tp->snd_wnd;
- TCP_ECN_rcv_syn(tp, th);
+ tcp_ecn_rcv_syn(tp, th, skb);
tcp_mtup_init(sk);
tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
tcp_initialize_rcv_mss(sk);
-
tcp_send_synack(sk);
#if 0
/* Note, we could accept data and URG from this segment.
- * There are no obstacles to make this.
+ * There are no obstacles to make this (except that we must
+ * either change tcp_recvmsg() to prevent it from returning data
+ * before 3WHS completes per RFC793, or employ TCP Fast Open).
*
* However, if we ignore data in ACKless segments sometimes,
* we have no reasons to accept it sometimes.
@@ -4356,7 +6864,7 @@ discard:
*/
return -1;
#else
- goto discard;
+ goto consume;
#endif
}
/* "fifth, if neither of the SYN or RST bits is set then
@@ -4366,232 +6874,280 @@ discard:
discard_and_undo:
tcp_clear_options(&tp->rx_opt);
tp->rx_opt.mss_clamp = saved_clamp;
- goto discard;
+ tcp_drop_reason(sk, skb, reason);
+ return 0;
reset_and_undo:
tcp_clear_options(&tp->rx_opt);
tp->rx_opt.mss_clamp = saved_clamp;
- return 1;
+ /* we can reuse/return @reason to its caller to handle the exception */
+ return reason;
}
+static void tcp_rcv_synrecv_state_fastopen(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct request_sock *req;
+
+ /* If we are still handling the SYNACK RTO, see if timestamp ECR allows
+ * undo. If peer SACKs triggered fast recovery, we can't undo here.
+ */
+ if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss && !tp->packets_out)
+ tcp_try_undo_recovery(sk);
+
+ tcp_update_rto_time(tp);
+ WRITE_ONCE(inet_csk(sk)->icsk_retransmits, 0);
+ /* In tcp_fastopen_synack_timer() on the first SYNACK RTO we set
+ * retrans_stamp but don't enter CA_Loss, so in case that happened we
+ * need to zero retrans_stamp here to prevent spurious
+ * retransmits_timed_out(). However, if the ACK of our SYNACK caused us
+ * to enter CA_Recovery then we need to leave retrans_stamp as it was
+ * set entering CA_Recovery, for correct retransmits_timed_out() and
+ * undo behavior.
+ */
+ tcp_retrans_stamp_cleanup(sk);
+
+ /* Once we leave TCP_SYN_RECV or TCP_FIN_WAIT_1,
+ * we no longer need req so release it.
+ */
+ req = rcu_dereference_protected(tp->fastopen_rsk,
+ lockdep_sock_is_held(sk));
+ reqsk_fastopen_remove(sk, req, false);
+
+ /* Re-arm the timer because data may have been sent out.
+ * This is similar to the regular data transmission case
+ * when new data has just been ack'ed.
+ *
+ * (TFO) - we could try to be more aggressive and
+ * retransmitting any data sooner based on when they
+ * are sent out.
+ */
+ tcp_rearm_rto(sk);
+}
/*
* This function implements the receiving procedure of RFC 793 for
- * all states except ESTABLISHED and TIME_WAIT.
+ * all states except ESTABLISHED and TIME_WAIT.
* It's called from both tcp_v4_rcv and tcp_v6_rcv and should be
* address independent.
*/
-
-int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
- struct tcphdr *th, unsigned len)
+
+enum skb_drop_reason
+tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
+ const struct tcphdr *th = tcp_hdr(skb);
+ struct request_sock *req;
int queued = 0;
-
- tp->rx_opt.saw_tstamp = 0;
+ SKB_DR(reason);
switch (sk->sk_state) {
case TCP_CLOSE:
+ SKB_DR_SET(reason, TCP_CLOSE);
goto discard;
case TCP_LISTEN:
- if(th->ack)
- return 1;
+ if (th->ack)
+ return SKB_DROP_REASON_TCP_FLAGS;
- if(th->rst)
+ if (th->rst) {
+ SKB_DR_SET(reason, TCP_RESET);
goto discard;
-
- if(th->syn) {
- if (icsk->icsk_af_ops->conn_request(sk, skb) < 0)
- return 1;
-
- /* Now we have several options: In theory there is
- * nothing else in the frame. KA9Q has an option to
- * send data with the syn, BSD accepts data with the
- * syn up to the [to be] advertised window and
- * Solaris 2.1 gives you a protocol error. For now
- * we just ignore it, that fits the spec precisely
- * and avoids incompatibilities. It would be nice in
- * future to drop through and process the data.
- *
- * Now that TTCP is starting to be used we ought to
- * queue this data.
- * But, this leaves one open to an easy denial of
- * service attack, and SYN cookies can't defend
- * against this problem. So, we drop the data
- * in the interest of security over speed.
+ }
+ if (th->syn) {
+ if (th->fin) {
+ SKB_DR_SET(reason, TCP_FLAGS);
+ goto discard;
+ }
+ /* It is possible that we process SYN packets from backlog,
+ * so we need to make sure to disable BH and RCU right there.
*/
- goto discard;
+ rcu_read_lock();
+ local_bh_disable();
+ icsk->icsk_af_ops->conn_request(sk, skb);
+ local_bh_enable();
+ rcu_read_unlock();
+
+ consume_skb(skb);
+ return 0;
}
+ SKB_DR_SET(reason, TCP_FLAGS);
goto discard;
case TCP_SYN_SENT:
- queued = tcp_rcv_synsent_state_process(sk, skb, th, len);
+ tp->rx_opt.saw_tstamp = 0;
+ tcp_mstamp_refresh(tp);
+ queued = tcp_rcv_synsent_state_process(sk, skb, th);
if (queued >= 0)
return queued;
/* Do step6 onward by hand. */
tcp_urg(sk, skb, th);
__kfree_skb(skb);
- tcp_data_snd_check(sk, tp);
+ tcp_data_snd_check(sk);
return 0;
}
- if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp &&
- tcp_paws_discard(sk, skb)) {
- if (!th->rst) {
- NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED);
- tcp_send_dupack(sk, skb);
+ tcp_mstamp_refresh(tp);
+ tp->rx_opt.saw_tstamp = 0;
+ req = rcu_dereference_protected(tp->fastopen_rsk,
+ lockdep_sock_is_held(sk));
+ if (req) {
+ bool req_stolen;
+
+ WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV &&
+ sk->sk_state != TCP_FIN_WAIT1);
+
+ SKB_DR_SET(reason, TCP_FASTOPEN);
+ if (!tcp_check_req(sk, skb, req, true, &req_stolen, &reason))
goto discard;
- }
- /* Reset is accepted even if it did not pass PAWS. */
}
- /* step 1: check sequence number */
- if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
- if (!th->rst)
- tcp_send_dupack(sk, skb);
+ if (!th->ack && !th->rst && !th->syn) {
+ SKB_DR_SET(reason, TCP_FLAGS);
goto discard;
}
+ if (!tcp_validate_incoming(sk, skb, th, 0))
+ return 0;
- /* step 2: check RST bit */
- if(th->rst) {
- tcp_reset(sk);
- goto discard;
+ /* step 5: check the ACK field */
+ reason = tcp_ack(sk, skb, FLAG_SLOWPATH |
+ FLAG_UPDATE_TS_RECENT |
+ FLAG_NO_CHALLENGE_ACK);
+
+ if ((int)reason <= 0) {
+ if (sk->sk_state == TCP_SYN_RECV) {
+ /* send one RST */
+ if (!reason)
+ return SKB_DROP_REASON_TCP_OLD_ACK;
+ return -reason;
+ }
+ /* accept old ack during closing */
+ if ((int)reason < 0) {
+ tcp_send_challenge_ack(sk, false);
+ reason = -reason;
+ goto discard;
+ }
}
+ SKB_DR_SET(reason, NOT_SPECIFIED);
+ switch (sk->sk_state) {
+ case TCP_SYN_RECV:
+ tp->delivered++; /* SYN-ACK delivery isn't tracked in tcp_ack */
+ if (!tp->srtt_us)
+ tcp_synack_rtt_meas(sk, req);
- tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
-
- /* step 3: check security and precedence [ignored] */
+ if (tp->rx_opt.tstamp_ok)
+ tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
- /* step 4:
- *
- * Check for a SYN in window.
- */
- if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
- NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN);
- tcp_reset(sk);
- return 1;
- }
+ if (req) {
+ tcp_rcv_synrecv_state_fastopen(sk);
+ } else {
+ tcp_try_undo_spurious_syn(sk);
+ tp->retrans_stamp = 0;
+ tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB,
+ skb);
+ WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
+ }
+ tcp_ao_established(sk);
+ smp_mb();
+ tcp_set_state(sk, TCP_ESTABLISHED);
+ sk->sk_state_change(sk);
- /* step 5: check the ACK field */
- if (th->ack) {
- int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH);
-
- switch(sk->sk_state) {
- case TCP_SYN_RECV:
- if (acceptable) {
- tp->copied_seq = tp->rcv_nxt;
- mb();
- tcp_set_state(sk, TCP_ESTABLISHED);
- sk->sk_state_change(sk);
-
- /* Note, that this wakeup is only for marginal
- * crossed SYN case. Passively open sockets
- * are not waked up, because sk->sk_sleep ==
- * NULL and sk->sk_socket == NULL.
- */
- if (sk->sk_socket) {
- sk_wake_async(sk,0,POLL_OUT);
- }
+ /* Note, that this wakeup is only for marginal crossed SYN case.
+ * Passively open sockets are not waked up, because
+ * sk->sk_sleep == NULL and sk->sk_socket == NULL.
+ */
+ if (sk->sk_socket)
+ sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
- tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
- tp->snd_wnd = ntohs(th->window) <<
- tp->rx_opt.snd_wscale;
- tcp_init_wl(tp, TCP_SKB_CB(skb)->ack_seq,
- TCP_SKB_CB(skb)->seq);
+ tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
+ tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale;
+ tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
- /* tcp_ack considers this ACK as duplicate
- * and does not calculate rtt.
- * Fix it at least with timestamps.
- */
- if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
- !tp->srtt)
- tcp_ack_saw_tstamp(sk, 0);
+ if (!inet_csk(sk)->icsk_ca_ops->cong_control)
+ tcp_update_pacing_rate(sk);
- if (tp->rx_opt.tstamp_ok)
- tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
+ /* Prevent spurious tcp_cwnd_restart() on first data packet */
+ tp->lsndtime = tcp_jiffies32;
- /* Make sure socket is routed, for
- * correct metrics.
- */
- icsk->icsk_af_ops->rebuild_header(sk);
+ tcp_initialize_rcv_mss(sk);
+ if (tcp_ecn_mode_accecn(tp))
+ tcp_accecn_third_ack(sk, skb, tp->syn_ect_snt);
+ tcp_fast_path_on(tp);
+ if (sk->sk_shutdown & SEND_SHUTDOWN)
+ tcp_shutdown(sk, SEND_SHUTDOWN);
- tcp_init_metrics(sk);
+ break;
- tcp_init_congestion_control(sk);
+ case TCP_FIN_WAIT1: {
+ int tmo;
- /* Prevent spurious tcp_cwnd_restart() on
- * first data packet.
- */
- tp->lsndtime = tcp_time_stamp;
+ if (req)
+ tcp_rcv_synrecv_state_fastopen(sk);
- tcp_mtup_init(sk);
- tcp_initialize_rcv_mss(sk);
- tcp_init_buffer_space(sk);
- tcp_fast_path_on(tp);
- } else {
- return 1;
- }
+ if (tp->snd_una != tp->write_seq)
break;
- case TCP_FIN_WAIT1:
- if (tp->snd_una == tp->write_seq) {
- tcp_set_state(sk, TCP_FIN_WAIT2);
- sk->sk_shutdown |= SEND_SHUTDOWN;
- dst_confirm(sk->sk_dst_cache);
-
- if (!sock_flag(sk, SOCK_DEAD))
- /* Wake up lingering close() */
- sk->sk_state_change(sk);
- else {
- int tmo;
-
- if (tp->linger2 < 0 ||
- (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
- after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt))) {
- tcp_done(sk);
- NET_INC_STATS_BH(LINUX_MIB_TCPABORTONDATA);
- return 1;
- }
+ tcp_set_state(sk, TCP_FIN_WAIT2);
+ WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | SEND_SHUTDOWN);
- tmo = tcp_fin_time(sk);
- if (tmo > TCP_TIMEWAIT_LEN) {
- inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
- } else if (th->fin || sock_owned_by_user(sk)) {
- /* Bad case. We could lose such FIN otherwise.
- * It is not a big problem, but it looks confusing
- * and not so rare event. We still can lose it now,
- * if it spins in bh_lock_sock(), but it is really
- * marginal case.
- */
- inet_csk_reset_keepalive_timer(sk, tmo);
- } else {
- tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
- goto discard;
- }
- }
- }
- break;
+ sk_dst_confirm(sk);
- case TCP_CLOSING:
- if (tp->snd_una == tp->write_seq) {
- tcp_time_wait(sk, TCP_TIME_WAIT, 0);
- goto discard;
- }
+ if (!sock_flag(sk, SOCK_DEAD)) {
+ /* Wake up lingering close() */
+ sk->sk_state_change(sk);
break;
+ }
- case TCP_LAST_ACK:
- if (tp->snd_una == tp->write_seq) {
- tcp_update_metrics(sk);
- tcp_done(sk);
- goto discard;
- }
- break;
+ if (READ_ONCE(tp->linger2) < 0) {
+ tcp_done(sk);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
+ return SKB_DROP_REASON_TCP_ABORT_ON_DATA;
}
- } else
- goto discard;
+ if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
+ after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
+ /* Receive out of order FIN after close() */
+ if (tp->syn_fastopen && th->fin)
+ tcp_fastopen_active_disable(sk);
+ tcp_done(sk);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
+ return SKB_DROP_REASON_TCP_ABORT_ON_DATA;
+ }
+
+ tmo = tcp_fin_time(sk);
+ if (tmo > TCP_TIMEWAIT_LEN) {
+ tcp_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
+ } else if (th->fin || sock_owned_by_user(sk)) {
+ /* Bad case. We could lose such FIN otherwise.
+ * It is not a big problem, but it looks confusing
+ * and not so rare event. We still can lose it now,
+ * if it spins in bh_lock_sock(), but it is really
+ * marginal case.
+ */
+ tcp_reset_keepalive_timer(sk, tmo);
+ } else {
+ tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
+ goto consume;
+ }
+ break;
+ }
+
+ case TCP_CLOSING:
+ if (tp->snd_una == tp->write_seq) {
+ tcp_time_wait(sk, TCP_TIME_WAIT, 0);
+ goto consume;
+ }
+ break;
+
+ case TCP_LAST_ACK:
+ if (tp->snd_una == tp->write_seq) {
+ tcp_update_metrics(sk);
+ tcp_done(sk);
+ goto consume;
+ }
+ break;
+ }
/* step 6: check the URG bit */
tcp_urg(sk, skb, th);
@@ -4601,24 +7157,31 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
case TCP_CLOSE_WAIT:
case TCP_CLOSING:
case TCP_LAST_ACK:
- if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
+ if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
+ /* If a subflow has been reset, the packet should not
+ * continue to be processed, drop the packet.
+ */
+ if (sk_is_mptcp(sk) && !mptcp_incoming_options(sk, skb))
+ goto discard;
break;
+ }
+ fallthrough;
case TCP_FIN_WAIT1:
case TCP_FIN_WAIT2:
/* RFC 793 says to queue data in these states,
- * RFC 1122 says we MUST send a reset.
+ * RFC 1122 says we MUST send a reset.
* BSD 4.4 also does reset.
*/
if (sk->sk_shutdown & RCV_SHUTDOWN) {
if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
- NET_INC_STATS_BH(LINUX_MIB_TCPABORTONDATA);
- tcp_reset(sk);
- return 1;
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
+ tcp_reset(sk, skb);
+ return SKB_DROP_REASON_TCP_ABORT_ON_DATA;
}
}
- /* Fall through */
- case TCP_ESTABLISHED:
+ fallthrough;
+ case TCP_ESTABLISHED:
tcp_data_queue(sk, skb);
queued = 1;
break;
@@ -4626,20 +7189,392 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
/* tcp_data could move socket to TIME-WAIT */
if (sk->sk_state != TCP_CLOSE) {
- tcp_data_snd_check(sk, tp);
+ tcp_data_snd_check(sk);
tcp_ack_snd_check(sk);
}
- if (!queued) {
+ if (!queued) {
discard:
- __kfree_skb(skb);
+ tcp_drop_reason(sk, skb, reason);
}
return 0;
+
+consume:
+ __kfree_skb(skb);
+ return 0;
}
+EXPORT_IPV6_MOD(tcp_rcv_state_process);
-EXPORT_SYMBOL(sysctl_tcp_ecn);
-EXPORT_SYMBOL(sysctl_tcp_reordering);
-EXPORT_SYMBOL(tcp_parse_options);
-EXPORT_SYMBOL(tcp_rcv_established);
-EXPORT_SYMBOL(tcp_rcv_state_process);
-EXPORT_SYMBOL(tcp_initialize_rcv_mss);
+static inline void pr_drop_req(struct request_sock *req, __u16 port, int family)
+{
+ struct inet_request_sock *ireq = inet_rsk(req);
+
+ if (family == AF_INET)
+ net_dbg_ratelimited("drop open request from %pI4/%u\n",
+ &ireq->ir_rmt_addr, port);
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (family == AF_INET6)
+ net_dbg_ratelimited("drop open request from %pI6/%u\n",
+ &ireq->ir_v6_rmt_addr, port);
+#endif
+}
+
+/* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set
+ *
+ * If we receive a SYN packet with these bits set, it means a
+ * network is playing bad games with TOS bits. In order to
+ * avoid possible false congestion notifications, we disable
+ * TCP ECN negotiation.
+ *
+ * Exception: tcp_ca wants ECN. This is required for DCTCP
+ * congestion control: Linux DCTCP asserts ECT on all packets,
+ * including SYN, which is most optimal solution; however,
+ * others, such as FreeBSD do not.
+ *
+ * Exception: At least one of the reserved bits of the TCP header (th->res1) is
+ * set, indicating the use of a future TCP extension (such as AccECN). See
+ * RFC8311 §4.3 which updates RFC3168 to allow the development of such
+ * extensions.
+ */
+static void tcp_ecn_create_request(struct request_sock *req,
+ const struct sk_buff *skb,
+ const struct sock *listen_sk,
+ const struct dst_entry *dst)
+{
+ const struct tcphdr *th = tcp_hdr(skb);
+ const struct net *net = sock_net(listen_sk);
+ bool th_ecn = th->ece && th->cwr;
+ bool ect, ecn_ok;
+ u32 ecn_ok_dst;
+
+ if (tcp_accecn_syn_requested(th) &&
+ READ_ONCE(net->ipv4.sysctl_tcp_ecn) >= 3) {
+ inet_rsk(req)->ecn_ok = 1;
+ tcp_rsk(req)->accecn_ok = 1;
+ tcp_rsk(req)->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield &
+ INET_ECN_MASK;
+ return;
+ }
+
+ if (!th_ecn)
+ return;
+
+ ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield);
+ ecn_ok_dst = dst_feature(dst, DST_FEATURE_ECN_MASK);
+ ecn_ok = READ_ONCE(net->ipv4.sysctl_tcp_ecn) || ecn_ok_dst;
+
+ if (((!ect || th->res1 || th->ae) && ecn_ok) ||
+ tcp_ca_needs_ecn(listen_sk) ||
+ (ecn_ok_dst & DST_FEATURE_ECN_CA) ||
+ tcp_bpf_ca_needs_ecn((struct sock *)req))
+ inet_rsk(req)->ecn_ok = 1;
+}
+
+static void tcp_openreq_init(struct request_sock *req,
+ const struct tcp_options_received *rx_opt,
+ struct sk_buff *skb, const struct sock *sk)
+{
+ struct inet_request_sock *ireq = inet_rsk(req);
+
+ req->rsk_rcv_wnd = 0; /* So that tcp_send_synack() knows! */
+ tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
+ tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
+ tcp_rsk(req)->snt_synack = 0;
+ tcp_rsk(req)->snt_tsval_first = 0;
+ tcp_rsk(req)->last_oow_ack_time = 0;
+ tcp_rsk(req)->accecn_ok = 0;
+ tcp_rsk(req)->saw_accecn_opt = TCP_ACCECN_OPT_NOT_SEEN;
+ tcp_rsk(req)->accecn_fail_mode = 0;
+ tcp_rsk(req)->syn_ect_rcv = 0;
+ tcp_rsk(req)->syn_ect_snt = 0;
+ req->mss = rx_opt->mss_clamp;
+ req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
+ ireq->tstamp_ok = rx_opt->tstamp_ok;
+ ireq->sack_ok = rx_opt->sack_ok;
+ ireq->snd_wscale = rx_opt->snd_wscale;
+ ireq->wscale_ok = rx_opt->wscale_ok;
+ ireq->acked = 0;
+ ireq->ecn_ok = 0;
+ ireq->ir_rmt_port = tcp_hdr(skb)->source;
+ ireq->ir_num = ntohs(tcp_hdr(skb)->dest);
+ ireq->ir_mark = inet_request_mark(sk, skb);
+#if IS_ENABLED(CONFIG_SMC)
+ ireq->smc_ok = rx_opt->smc_ok && !(tcp_sk(sk)->smc_hs_congested &&
+ tcp_sk(sk)->smc_hs_congested(sk));
+#endif
+}
+
+/*
+ * Return true if a syncookie should be sent
+ */
+static bool tcp_syn_flood_action(struct sock *sk, const char *proto)
+{
+ struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
+ const char *msg = "Dropping request";
+ struct net *net = sock_net(sk);
+ bool want_cookie = false;
+ u8 syncookies;
+
+ syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies);
+
+#ifdef CONFIG_SYN_COOKIES
+ if (syncookies) {
+ msg = "Sending cookies";
+ want_cookie = true;
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
+ } else
+#endif
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
+
+ if (syncookies != 2 && !READ_ONCE(queue->synflood_warned)) {
+ WRITE_ONCE(queue->synflood_warned, 1);
+ if (IS_ENABLED(CONFIG_IPV6) && sk->sk_family == AF_INET6) {
+ net_info_ratelimited("%s: Possible SYN flooding on port [%pI6c]:%u. %s.\n",
+ proto, inet6_rcv_saddr(sk),
+ sk->sk_num, msg);
+ } else {
+ net_info_ratelimited("%s: Possible SYN flooding on port %pI4:%u. %s.\n",
+ proto, &sk->sk_rcv_saddr,
+ sk->sk_num, msg);
+ }
+ }
+
+ return want_cookie;
+}
+
+static void tcp_reqsk_record_syn(const struct sock *sk,
+ struct request_sock *req,
+ const struct sk_buff *skb)
+{
+ if (tcp_sk(sk)->save_syn) {
+ u32 len = skb_network_header_len(skb) + tcp_hdrlen(skb);
+ struct saved_syn *saved_syn;
+ u32 mac_hdrlen;
+ void *base;
+
+ if (tcp_sk(sk)->save_syn == 2) { /* Save full header. */
+ base = skb_mac_header(skb);
+ mac_hdrlen = skb_mac_header_len(skb);
+ len += mac_hdrlen;
+ } else {
+ base = skb_network_header(skb);
+ mac_hdrlen = 0;
+ }
+
+ saved_syn = kmalloc(struct_size(saved_syn, data, len),
+ GFP_ATOMIC);
+ if (saved_syn) {
+ saved_syn->mac_hdrlen = mac_hdrlen;
+ saved_syn->network_hdrlen = skb_network_header_len(skb);
+ saved_syn->tcp_hdrlen = tcp_hdrlen(skb);
+ memcpy(saved_syn->data, base, len);
+ req->saved_syn = saved_syn;
+ }
+ }
+}
+
+/* If a SYN cookie is required and supported, returns a clamped MSS value to be
+ * used for SYN cookie generation.
+ */
+u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops,
+ const struct tcp_request_sock_ops *af_ops,
+ struct sock *sk, struct tcphdr *th)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u16 mss;
+
+ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies) != 2 &&
+ !inet_csk_reqsk_queue_is_full(sk))
+ return 0;
+
+ if (!tcp_syn_flood_action(sk, rsk_ops->slab_name))
+ return 0;
+
+ if (sk_acceptq_is_full(sk)) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
+ return 0;
+ }
+
+ mss = tcp_parse_mss_option(th, READ_ONCE(tp->rx_opt.user_mss));
+ if (!mss)
+ mss = af_ops->mss_clamp;
+
+ return mss;
+}
+EXPORT_IPV6_MOD_GPL(tcp_get_syncookie_mss);
+
+int tcp_conn_request(struct request_sock_ops *rsk_ops,
+ const struct tcp_request_sock_ops *af_ops,
+ struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_fastopen_cookie foc = { .len = -1 };
+ struct tcp_options_received tmp_opt;
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct net *net = sock_net(sk);
+ struct sock *fastopen_sk = NULL;
+ struct request_sock *req;
+ bool want_cookie = false;
+ struct dst_entry *dst;
+ struct flowi fl;
+ u8 syncookies;
+ u32 isn;
+
+#ifdef CONFIG_TCP_AO
+ const struct tcp_ao_hdr *aoh;
+#endif
+
+ isn = __this_cpu_read(tcp_tw_isn);
+ if (isn) {
+ /* TW buckets are converted to open requests without
+ * limitations, they conserve resources and peer is
+ * evidently real one.
+ */
+ __this_cpu_write(tcp_tw_isn, 0);
+ } else {
+ syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies);
+
+ if (syncookies == 2 || inet_csk_reqsk_queue_is_full(sk)) {
+ want_cookie = tcp_syn_flood_action(sk,
+ rsk_ops->slab_name);
+ if (!want_cookie)
+ goto drop;
+ }
+ }
+
+ if (sk_acceptq_is_full(sk)) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
+ goto drop;
+ }
+
+ req = inet_reqsk_alloc(rsk_ops, sk, !want_cookie);
+ if (!req)
+ goto drop;
+
+ req->syncookie = want_cookie;
+ tcp_rsk(req)->af_specific = af_ops;
+ tcp_rsk(req)->ts_off = 0;
+ tcp_rsk(req)->req_usec_ts = false;
+#if IS_ENABLED(CONFIG_MPTCP)
+ tcp_rsk(req)->is_mptcp = 0;
+#endif
+
+ tcp_clear_options(&tmp_opt);
+ tmp_opt.mss_clamp = af_ops->mss_clamp;
+ tmp_opt.user_mss = READ_ONCE(tp->rx_opt.user_mss);
+ tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0,
+ want_cookie ? NULL : &foc);
+
+ if (want_cookie && !tmp_opt.saw_tstamp)
+ tcp_clear_options(&tmp_opt);
+
+ if (IS_ENABLED(CONFIG_SMC) && want_cookie)
+ tmp_opt.smc_ok = 0;
+
+ tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
+ tcp_openreq_init(req, &tmp_opt, skb, sk);
+ inet_rsk(req)->no_srccheck = inet_test_bit(TRANSPARENT, sk);
+
+ /* Note: tcp_v6_init_req() might override ir_iif for link locals */
+ inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb);
+
+ dst = af_ops->route_req(sk, skb, &fl, req, isn);
+ if (!dst)
+ goto drop_and_free;
+
+ if (tmp_opt.tstamp_ok) {
+ tcp_rsk(req)->req_usec_ts = dst_tcp_usec_ts(dst);
+ tcp_rsk(req)->ts_off = af_ops->init_ts_off(net, skb);
+ }
+ if (!want_cookie && !isn) {
+ int max_syn_backlog = READ_ONCE(net->ipv4.sysctl_max_syn_backlog);
+
+ /* Kill the following clause, if you dislike this way. */
+ if (!syncookies &&
+ (max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
+ (max_syn_backlog >> 2)) &&
+ !tcp_peer_is_proven(req, dst)) {
+ /* Without syncookies last quarter of
+ * backlog is filled with destinations,
+ * proven to be alive.
+ * It means that we continue to communicate
+ * to destinations, already remembered
+ * to the moment of synflood.
+ */
+ pr_drop_req(req, ntohs(tcp_hdr(skb)->source),
+ rsk_ops->family);
+ goto drop_and_release;
+ }
+
+ isn = af_ops->init_seq(skb);
+ }
+
+ tcp_ecn_create_request(req, skb, sk, dst);
+
+ if (want_cookie) {
+ isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
+ if (!tmp_opt.tstamp_ok)
+ inet_rsk(req)->ecn_ok = 0;
+ }
+
+#ifdef CONFIG_TCP_AO
+ if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
+ goto drop_and_release; /* Invalid TCP options */
+ if (aoh) {
+ tcp_rsk(req)->used_tcp_ao = true;
+ tcp_rsk(req)->ao_rcv_next = aoh->keyid;
+ tcp_rsk(req)->ao_keyid = aoh->rnext_keyid;
+
+ } else {
+ tcp_rsk(req)->used_tcp_ao = false;
+ }
+#endif
+ tcp_rsk(req)->snt_isn = isn;
+ tcp_rsk(req)->txhash = net_tx_rndhash();
+ tcp_rsk(req)->syn_tos = TCP_SKB_CB(skb)->ip_dsfield;
+ tcp_openreq_init_rwin(req, sk, dst);
+ sk_rx_queue_set(req_to_sk(req), skb);
+ if (!want_cookie) {
+ tcp_reqsk_record_syn(sk, req, skb);
+ fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst);
+ }
+ if (fastopen_sk) {
+ af_ops->send_synack(fastopen_sk, dst, &fl, req,
+ &foc, TCP_SYNACK_FASTOPEN, skb);
+ /* Add the child socket directly into the accept queue */
+ if (!inet_csk_reqsk_queue_add(sk, req, fastopen_sk)) {
+ bh_unlock_sock(fastopen_sk);
+ sock_put(fastopen_sk);
+ goto drop_and_free;
+ }
+ sk->sk_data_ready(sk);
+ bh_unlock_sock(fastopen_sk);
+ sock_put(fastopen_sk);
+ } else {
+ tcp_rsk(req)->tfo_listener = false;
+ if (!want_cookie &&
+ unlikely(!inet_csk_reqsk_queue_hash_add(sk, req))) {
+ reqsk_free(req);
+ dst_release(dst);
+ return 0;
+ }
+ af_ops->send_synack(sk, dst, &fl, req, &foc,
+ !want_cookie ? TCP_SYNACK_NORMAL :
+ TCP_SYNACK_COOKIE,
+ skb);
+ if (want_cookie) {
+ reqsk_free(req);
+ return 0;
+ }
+ }
+ reqsk_put(req);
+ return 0;
+
+drop_and_release:
+ dst_release(dst);
+drop_and_free:
+ __reqsk_free(req);
+drop:
+ tcp_listendrop(sk);
+ return 0;
+}
+EXPORT_IPV6_MOD(tcp_conn_request);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 4913f25e5ad5..f8a9596e8f4d 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -5,22 +6,14 @@
*
* Implementation of the Transmission Control Protocol(TCP).
*
- * Version: $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
- *
* IPv4 specific functions
*
- *
* code split from:
* linux/ipv4/tcp.c
* linux/ipv4/tcp_input.c
* linux/ipv4/tcp_output.c
*
* See tcp.c for author information
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
/*
@@ -52,89 +45,115 @@
* a single port at the same time.
*/
+#define pr_fmt(fmt) "TCP: " fmt
+#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
+#include <linux/fips.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/sock_diag.h>
+#include <net/aligned_data.h>
+#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
+#include <net/tcp_ecn.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
+#include <net/inet_ecn.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
-#include <net/netdma.h>
+#include <net/secure_seq.h>
+#include <net/busy_poll.h>
+#include <net/rstreason.h>
+#include <net/psp.h>
#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
+#include <linux/inetdevice.h>
+#include <linux/btf_ids.h>
+#include <linux/skbuff_ref.h>
-#include <linux/crypto.h>
-#include <linux/scatterlist.h>
-
-int sysctl_tcp_tw_reuse __read_mostly;
-int sysctl_tcp_low_latency __read_mostly;
-
-/* Check TCP sequence numbers in ICMP packets. */
-#define ICMP_MIN_LENGTH 8
+#include <crypto/md5.h>
-/* Socket used for sending RSTs */
-static struct socket *tcp_socket;
-
-void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb);
+#include <trace/events/tcp.h>
#ifdef CONFIG_TCP_MD5SIG
-static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
- __be32 addr);
-static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
- __be32 saddr, __be32 daddr,
- struct tcphdr *th, int protocol,
- int tcplen);
+static void tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
+ __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif
-struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
- .lhash_lock = __RW_LOCK_UNLOCKED(tcp_hashinfo.lhash_lock),
- .lhash_users = ATOMIC_INIT(0),
- .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
-};
+struct inet_hashinfo tcp_hashinfo;
-static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
-{
- return inet_csk_get_port(&tcp_hashinfo, sk, snum,
- inet_csk_bind_conflict);
-}
+static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = {
+ .bh_lock = INIT_LOCAL_LOCK(bh_lock),
+};
-static void tcp_v4_hash(struct sock *sk)
-{
- inet_hash(&tcp_hashinfo, sk);
-}
+static DEFINE_MUTEX(tcp_exit_batch_mutex);
-void tcp_unhash(struct sock *sk)
+static u32 tcp_v4_init_seq(const struct sk_buff *skb)
{
- inet_unhash(&tcp_hashinfo, sk);
+ return secure_tcp_seq(ip_hdr(skb)->daddr,
+ ip_hdr(skb)->saddr,
+ tcp_hdr(skb)->dest,
+ tcp_hdr(skb)->source);
}
-static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
+static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
{
- return secure_tcp_sequence_number(skb->nh.iph->daddr,
- skb->nh.iph->saddr,
- skb->h.th->dest,
- skb->h.th->source);
+ return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
}
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
+ int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
+ const struct inet_timewait_sock *tw = inet_twsk(sktw);
const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
struct tcp_sock *tp = tcp_sk(sk);
+ int ts_recent_stamp;
+ u32 reuse_thresh;
+
+ if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2)
+ reuse = 0;
+
+ if (reuse == 2) {
+ /* Still does not detect *everything* that goes through
+ * lo, since we require a loopback src or dst address
+ * or direct binding to 'lo' interface.
+ */
+ bool loopback = false;
+ if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
+ loopback = true;
+#if IS_ENABLED(CONFIG_IPV6)
+ if (tw->tw_family == AF_INET6) {
+ if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
+ ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
+ ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
+ ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
+ loopback = true;
+ } else
+#endif
+ {
+ if (ipv4_is_loopback(tw->tw_daddr) ||
+ ipv4_is_loopback(tw->tw_rcv_saddr))
+ loopback = true;
+ }
+ if (!loopback)
+ reuse = 0;
+ }
/* With PAWS, it is safe from the viewpoint
of data integrity. Even without PAWS it is safe provided sequence
@@ -147,32 +166,73 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
If TW bucket has been already destroyed we fall back to VJ's scheme
and use initial timestamp retrieved from peer table.
*/
- if (tcptw->tw_ts_recent_stamp &&
- (twp == NULL || (sysctl_tcp_tw_reuse &&
- xtime.tv_sec - tcptw->tw_ts_recent_stamp > 1))) {
- tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
- if (tp->write_seq == 0)
- tp->write_seq = 1;
- tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
- tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
- sock_hold(sktw);
+ ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp);
+ reuse_thresh = READ_ONCE(tw->tw_entry_stamp) +
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse_delay);
+ if (ts_recent_stamp &&
+ (!twp || (reuse && time_after32(tcp_clock_ms(), reuse_thresh)))) {
+ /* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk
+ * and releasing the bucket lock.
+ */
+ if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt)))
+ return 0;
+
+ /* In case of repair and re-using TIME-WAIT sockets we still
+ * want to be sure that it is safe as above but honor the
+ * sequence numbers and time stamps set as part of the repair
+ * process.
+ *
+ * Without this check re-using a TIME-WAIT socket with TCP
+ * repair would accumulate a -1 on the repair assigned
+ * sequence number. The first time it is reused the sequence
+ * is -1, the second time -2, etc. This fixes that issue
+ * without appearing to create any others.
+ */
+ if (likely(!tp->repair)) {
+ u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
+
+ if (!seq)
+ seq = 1;
+ WRITE_ONCE(tp->write_seq, seq);
+ tp->rx_opt.ts_recent = READ_ONCE(tcptw->tw_ts_recent);
+ tp->rx_opt.ts_recent_stamp = ts_recent_stamp;
+ }
+
return 1;
}
return 0;
}
+EXPORT_IPV6_MOD_GPL(tcp_twsk_unique);
+
+static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
+ int addr_len)
+{
+ /* This check is replicated from tcp_v4_connect() and intended to
+ * prevent BPF program called below from accessing bytes that are out
+ * of the bound specified by user in addr_len.
+ */
+ if (addr_len < sizeof(struct sockaddr_in))
+ return -EINVAL;
-EXPORT_SYMBOL_GPL(tcp_twsk_unique);
+ sock_owned_by_me(sk);
+
+ return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
+}
/* This will initiate an outgoing connection. */
-int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+int tcp_v4_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len)
{
+ struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
+ struct inet_timewait_death_row *tcp_death_row;
struct inet_sock *inet = inet_sk(sk);
struct tcp_sock *tp = tcp_sk(sk);
- struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
- struct rtable *rt;
+ struct ip_options_rcu *inet_opt;
+ struct net *net = sock_net(sk);
+ __be16 orig_sport, orig_dport;
__be32 daddr, nexthop;
- int tmp;
+ struct flowi4 *fl4;
+ struct rtable *rt;
int err;
if (addr_len < sizeof(struct sockaddr_in))
@@ -182,62 +242,63 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
return -EAFNOSUPPORT;
nexthop = daddr = usin->sin_addr.s_addr;
- if (inet->opt && inet->opt->srr) {
+ inet_opt = rcu_dereference_protected(inet->inet_opt,
+ lockdep_sock_is_held(sk));
+ if (inet_opt && inet_opt->opt.srr) {
if (!daddr)
return -EINVAL;
- nexthop = inet->opt->faddr;
+ nexthop = inet_opt->opt.faddr;
}
- tmp = ip_route_connect(&rt, nexthop, inet->saddr,
- RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
- IPPROTO_TCP,
- inet->sport, usin->sin_port, sk);
- if (tmp < 0)
- return tmp;
+ orig_sport = inet->inet_sport;
+ orig_dport = usin->sin_port;
+ fl4 = &inet->cork.fl.u.ip4;
+ rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
+ sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
+ orig_dport, sk);
+ if (IS_ERR(rt)) {
+ err = PTR_ERR(rt);
+ if (err == -ENETUNREACH)
+ IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
+ return err;
+ }
if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
ip_rt_put(rt);
return -ENETUNREACH;
}
- if (!inet->opt || !inet->opt->srr)
- daddr = rt->rt_dst;
+ if (!inet_opt || !inet_opt->opt.srr)
+ daddr = fl4->daddr;
- if (!inet->saddr)
- inet->saddr = rt->rt_src;
- inet->rcv_saddr = inet->saddr;
+ tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
+
+ if (!inet->inet_saddr) {
+ err = inet_bhash2_update_saddr(sk, &fl4->saddr, AF_INET);
+ if (err) {
+ ip_rt_put(rt);
+ return err;
+ }
+ } else {
+ sk_rcv_saddr_set(sk, inet->inet_saddr);
+ }
- if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
+ if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
/* Reset inherited state */
tp->rx_opt.ts_recent = 0;
tp->rx_opt.ts_recent_stamp = 0;
- tp->write_seq = 0;
+ if (likely(!tp->repair))
+ WRITE_ONCE(tp->write_seq, 0);
}
- if (tcp_death_row.sysctl_tw_recycle &&
- !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
- struct inet_peer *peer = rt_get_peer(rt);
- /*
- * VJ's idea. We save last timestamp seen from
- * the destination in peer table, when entering state
- * TIME-WAIT * and initialize rx_opt.ts_recent from it,
- * when trying new connection.
- */
- if (peer != NULL &&
- peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
- tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
- tp->rx_opt.ts_recent = peer->tcp_ts;
- }
- }
+ inet->inet_dport = usin->sin_port;
+ sk_daddr_set(sk, daddr);
- inet->dport = usin->sin_port;
- inet->daddr = daddr;
+ inet_csk(sk)->icsk_ext_hdr_len = psp_sk_overhead(sk);
+ if (inet_opt)
+ inet_csk(sk)->icsk_ext_hdr_len += inet_opt->opt.optlen;
- inet_csk(sk)->icsk_ext_hdr_len = 0;
- if (inet->opt)
- inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
-
- tp->rx_opt.mss_clamp = 536;
+ tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
/* Socket identity is still unknown (sport may be zero).
* However we set state to SYN-SENT and not releasing socket
@@ -245,29 +306,46 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
* complete initialization after this.
*/
tcp_set_state(sk, TCP_SYN_SENT);
- err = inet_hash_connect(&tcp_death_row, sk);
+ err = inet_hash_connect(tcp_death_row, sk);
if (err)
goto failure;
- err = ip_route_newports(&rt, IPPROTO_TCP,
- inet->sport, inet->dport, sk);
- if (err)
- goto failure;
+ sk_set_txhash(sk);
+ rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
+ inet->inet_sport, inet->inet_dport, sk);
+ if (IS_ERR(rt)) {
+ err = PTR_ERR(rt);
+ rt = NULL;
+ goto failure;
+ }
+ tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst);
/* OK, now commit destination to socket. */
sk->sk_gso_type = SKB_GSO_TCPV4;
- sk_setup_caps(sk, &rt->u.dst);
+ sk_setup_caps(sk, &rt->dst);
+ rt = NULL;
+
+ if (likely(!tp->repair)) {
+ if (!tp->write_seq)
+ WRITE_ONCE(tp->write_seq,
+ secure_tcp_seq(inet->inet_saddr,
+ inet->inet_daddr,
+ inet->inet_sport,
+ usin->sin_port));
+ WRITE_ONCE(tp->tsoffset,
+ secure_tcp_ts_off(net, inet->inet_saddr,
+ inet->inet_daddr));
+ }
- if (!tp->write_seq)
- tp->write_seq = secure_tcp_sequence_number(inet->saddr,
- inet->daddr,
- inet->sport,
- usin->sin_port);
+ atomic_set(&inet->inet_id, get_random_u16());
- inet->id = tp->write_seq ^ jiffies;
+ if (tcp_fastopen_defer_connect(sk, &err))
+ return err;
+ if (err)
+ goto failure;
err = tcp_connect(sk);
- rt = NULL;
+
if (err)
goto failure;
@@ -279,47 +357,42 @@ failure:
* if necessary.
*/
tcp_set_state(sk, TCP_CLOSE);
+ inet_bhash2_reset_saddr(sk);
ip_rt_put(rt);
sk->sk_route_caps = 0;
- inet->dport = 0;
+ inet->inet_dport = 0;
return err;
}
+EXPORT_IPV6_MOD(tcp_v4_connect);
/*
- * This routine does path mtu discovery as defined in RFC1191.
+ * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
+ * It can be called through tcp_release_cb() if socket was owned by user
+ * at the time tcp_v4_err() was called to handle ICMP message.
*/
-static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
+void tcp_v4_mtu_reduced(struct sock *sk)
{
- struct dst_entry *dst;
struct inet_sock *inet = inet_sk(sk);
+ struct dst_entry *dst;
+ u32 mtu;
- /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
- * send out by Linux are always <576bytes so they should go through
- * unfragmented).
- */
- if (sk->sk_state == TCP_LISTEN)
+ if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
return;
-
- /* We don't check in the destentry if pmtu discovery is forbidden
- * on this route. We just assume that no packet_to_big packets
- * are send back when pmtu discovery is not active.
- * There is a small race when the user changes this flag in the
- * route, but I think that's acceptable.
- */
- if ((dst = __sk_dst_check(sk, 0)) == NULL)
+ mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
+ dst = inet_csk_update_pmtu(sk, mtu);
+ if (!dst)
return;
- dst->ops->update_pmtu(dst, mtu);
-
/* Something is about to be wrong... Remember soft error
* for the case, if this connection will not able to recover.
*/
if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
- sk->sk_err_soft = EMSGSIZE;
+ WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);
mtu = dst_mtu(dst);
if (inet->pmtudisc != IP_PMTUDISC_DONT &&
+ ip_sk_accept_pmtu(sk) &&
inet_csk(sk)->icsk_pmtu_cookie > mtu) {
tcp_sync_mss(sk, mtu);
@@ -331,6 +404,80 @@ static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
tcp_simple_retransmit(sk);
} /* else let the usual retransmit timer handle it */
}
+EXPORT_IPV6_MOD(tcp_v4_mtu_reduced);
+
+static void do_redirect(struct sk_buff *skb, struct sock *sk)
+{
+ struct dst_entry *dst = __sk_dst_check(sk, 0);
+
+ if (dst)
+ dst->ops->redirect(dst, sk, skb);
+}
+
+
+/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
+void tcp_req_err(struct sock *sk, u32 seq, bool abort)
+{
+ struct request_sock *req = inet_reqsk(sk);
+ struct net *net = sock_net(sk);
+
+ /* ICMPs are not backlogged, hence we cannot get
+ * an established socket here.
+ */
+ if (seq != tcp_rsk(req)->snt_isn) {
+ __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
+ } else if (abort) {
+ /*
+ * Still in SYN_RECV, just remove it silently.
+ * There is no good way to pass the error to the newly
+ * created socket, and POSIX does not want network
+ * errors returned from accept().
+ */
+ inet_csk_reqsk_queue_drop(req->rsk_listener, req);
+ tcp_listendrop(req->rsk_listener);
+ }
+ reqsk_put(req);
+}
+EXPORT_IPV6_MOD(tcp_req_err);
+
+/* TCP-LD (RFC 6069) logic */
+void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb;
+ s32 remaining;
+ u32 delta_us;
+
+ if (sock_owned_by_user(sk))
+ return;
+
+ if (seq != tp->snd_una || !icsk->icsk_retransmits ||
+ !icsk->icsk_backoff)
+ return;
+
+ skb = tcp_rtx_queue_head(sk);
+ if (WARN_ON_ONCE(!skb))
+ return;
+
+ icsk->icsk_backoff--;
+ icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
+ icsk->icsk_rto = inet_csk_rto_backoff(icsk, tcp_rto_max(sk));
+
+ tcp_mstamp_refresh(tp);
+ delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
+ remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
+
+ if (remaining > 0) {
+ tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, remaining, false);
+ } else {
+ /* RTO revert clocked out retransmission.
+ * Will retransmit now.
+ */
+ tcp_retransmit_timer(sk);
+ }
+}
+EXPORT_IPV6_MOD(tcp_ld_RTO_revert);
/*
* This routine is called by the ICMP module when it gets some
@@ -348,53 +495,82 @@ static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
*
*/
-void tcp_v4_err(struct sk_buff *skb, u32 info)
+int tcp_v4_err(struct sk_buff *skb, u32 info)
{
- struct iphdr *iph = (struct iphdr *)skb->data;
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
+ struct net *net = dev_net_rcu(skb->dev);
+ const int type = icmp_hdr(skb)->type;
+ const int code = icmp_hdr(skb)->code;
+ struct request_sock *fastopen;
struct tcp_sock *tp;
- struct inet_sock *inet;
- int type = skb->h.icmph->type;
- int code = skb->h.icmph->code;
+ u32 seq, snd_una;
struct sock *sk;
- __u32 seq;
int err;
- if (skb->len < (iph->ihl << 2) + 8) {
- ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
- return;
- }
-
- sk = inet_lookup(&tcp_hashinfo, iph->daddr, th->dest, iph->saddr,
- th->source, inet_iif(skb));
+ sk = __inet_lookup_established(net, iph->daddr, th->dest, iph->saddr,
+ ntohs(th->source), inet_iif(skb), 0);
if (!sk) {
- ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
- return;
+ __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
+ return -ENOENT;
}
if (sk->sk_state == TCP_TIME_WAIT) {
+ /* To increase the counter of ignored icmps for TCP-AO */
+ tcp_ao_ignore_icmp(sk, AF_INET, type, code);
inet_twsk_put(inet_twsk(sk));
- return;
+ return 0;
+ }
+ seq = ntohl(th->seq);
+ if (sk->sk_state == TCP_NEW_SYN_RECV) {
+ tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
+ type == ICMP_TIME_EXCEEDED ||
+ (type == ICMP_DEST_UNREACH &&
+ (code == ICMP_NET_UNREACH ||
+ code == ICMP_HOST_UNREACH)));
+ return 0;
+ }
+
+ if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) {
+ sock_put(sk);
+ return 0;
}
bh_lock_sock(sk);
/* If too many ICMPs get dropped on busy
* servers this needs to be solved differently.
+ * We do take care of PMTU discovery (RFC1191) special case :
+ * we can receive locally generated ICMP messages while socket is held.
*/
- if (sock_owned_by_user(sk))
- NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
-
+ if (sock_owned_by_user(sk)) {
+ if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
+ __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
+ }
if (sk->sk_state == TCP_CLOSE)
goto out;
+ if (static_branch_unlikely(&ip4_min_ttl)) {
+ /* min_ttl can be changed concurrently from do_ip_setsockopt() */
+ if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
+ __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
+ goto out;
+ }
+ }
+
tp = tcp_sk(sk);
- seq = ntohl(th->seq);
+ /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
+ fastopen = rcu_dereference(tp->fastopen_rsk);
+ snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
if (sk->sk_state != TCP_LISTEN &&
- !between(seq, tp->snd_una, tp->snd_nxt)) {
- NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
+ !between(seq, snd_una, tp->snd_nxt)) {
+ __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
goto out;
}
switch (type) {
+ case ICMP_REDIRECT:
+ if (!sock_owned_by_user(sk))
+ do_redirect(skb, sk);
+ goto out;
case ICMP_SOURCE_QUENCH:
/* Just silently ignore these. */
goto out;
@@ -406,12 +582,30 @@ void tcp_v4_err(struct sk_buff *skb, u32 info)
goto out;
if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
- if (!sock_owned_by_user(sk))
- do_pmtu_discovery(sk, iph, info);
+ /* We are not interested in TCP_LISTEN and open_requests
+ * (SYN-ACKs send out by Linux are always <576bytes so
+ * they should go through unfragmented).
+ */
+ if (sk->sk_state == TCP_LISTEN)
+ goto out;
+
+ WRITE_ONCE(tp->mtu_info, info);
+ if (!sock_owned_by_user(sk)) {
+ tcp_v4_mtu_reduced(sk);
+ } else {
+ if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
+ sock_hold(sk);
+ }
goto out;
}
err = icmp_err_convert[code].errno;
+ /* check if this ICMP message allows revert of backoff.
+ * (see RFC 6069)
+ */
+ if (!fastopen &&
+ (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
+ tcp_ld_RTO_revert(sk, seq);
break;
case ICMP_TIME_EXCEEDED:
err = EHOSTUNREACH;
@@ -421,48 +615,20 @@ void tcp_v4_err(struct sk_buff *skb, u32 info)
}
switch (sk->sk_state) {
- struct request_sock *req, **prev;
- case TCP_LISTEN:
- if (sock_owned_by_user(sk))
- goto out;
-
- req = inet_csk_search_req(sk, &prev, th->dest,
- iph->daddr, iph->saddr);
- if (!req)
- goto out;
-
- /* ICMPs are not backlogged, hence we cannot get
- an established socket here.
- */
- BUG_TRAP(!req->sk);
-
- if (seq != tcp_rsk(req)->snt_isn) {
- NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
- goto out;
- }
-
- /*
- * Still in SYN_RECV, just remove it silently.
- * There is no good way to pass the error to the newly
- * created socket, and POSIX does not want network
- * errors returned from accept().
- */
- inet_csk_reqsk_queue_drop(sk, req, prev);
- goto out;
-
case TCP_SYN_SENT:
- case TCP_SYN_RECV: /* Cannot happen.
- It can f.e. if SYNs crossed.
- */
- if (!sock_owned_by_user(sk)) {
- sk->sk_err = err;
+ case TCP_SYN_RECV:
+ /* Only in fast or simultaneous open. If a fast open socket is
+ * already accepted it is treated as a connected one below.
+ */
+ if (fastopen && !fastopen->sk)
+ break;
- sk->sk_error_report(sk);
+ ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
- tcp_done(sk);
- } else {
- sk->sk_err_soft = err;
- }
+ if (!sock_owned_by_user(sk))
+ tcp_done_with_error(sk, err);
+ else
+ WRITE_ONCE(sk->sk_err_soft, err);
goto out;
}
@@ -482,53 +648,82 @@ void tcp_v4_err(struct sk_buff *skb, u32 info)
* --ANK (980905)
*/
- inet = inet_sk(sk);
- if (!sock_owned_by_user(sk) && inet->recverr) {
- sk->sk_err = err;
- sk->sk_error_report(sk);
+ if (!sock_owned_by_user(sk) &&
+ inet_test_bit(RECVERR, sk)) {
+ WRITE_ONCE(sk->sk_err, err);
+ sk_error_report(sk);
} else { /* Only an error on timeout */
- sk->sk_err_soft = err;
+ WRITE_ONCE(sk->sk_err_soft, err);
}
out:
bh_unlock_sock(sk);
sock_put(sk);
+ return 0;
}
-/* This routine computes an IPv4 TCP checksum. */
-void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
+void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
- struct inet_sock *inet = inet_sk(sk);
- struct tcphdr *th = skb->h.th;
+ struct tcphdr *th = tcp_hdr(skb);
- if (skb->ip_summed == CHECKSUM_PARTIAL) {
- th->check = ~tcp_v4_check(th, len,
- inet->saddr, inet->daddr, 0);
- skb->csum_offset = offsetof(struct tcphdr, check);
- } else {
- th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
- csum_partial((char *)th,
- th->doff << 2,
- skb->csum));
- }
+ th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
+ skb->csum_start = skb_transport_header(skb) - skb->head;
+ skb->csum_offset = offsetof(struct tcphdr, check);
}
-int tcp_v4_gso_send_check(struct sk_buff *skb)
+/* This routine computes an IPv4 TCP checksum. */
+void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
- struct iphdr *iph;
- struct tcphdr *th;
+ const struct inet_sock *inet = inet_sk(sk);
- if (!pskb_may_pull(skb, sizeof(*th)))
- return -EINVAL;
+ __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
+}
+EXPORT_IPV6_MOD(tcp_v4_send_check);
- iph = skb->nh.iph;
- th = skb->h.th;
+#define REPLY_OPTIONS_LEN (MAX_TCP_OPTION_SPACE / sizeof(__be32))
- th->check = 0;
- th->check = ~tcp_v4_check(th, skb->len, iph->saddr, iph->daddr, 0);
- skb->csum_offset = offsetof(struct tcphdr, check);
- skb->ip_summed = CHECKSUM_PARTIAL;
- return 0;
+static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb,
+ const struct tcp_ao_hdr *aoh,
+ struct ip_reply_arg *arg, struct tcphdr *reply,
+ __be32 reply_options[REPLY_OPTIONS_LEN])
+{
+#ifdef CONFIG_TCP_AO
+ int sdif = tcp_v4_sdif(skb);
+ int dif = inet_iif(skb);
+ int l3index = sdif ? dif : 0;
+ bool allocated_traffic_key;
+ struct tcp_ao_key *key;
+ char *traffic_key;
+ bool drop = true;
+ u32 ao_sne = 0;
+ u8 keyid;
+
+ rcu_read_lock();
+ if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq),
+ &key, &traffic_key, &allocated_traffic_key,
+ &keyid, &ao_sne))
+ goto out;
+
+ reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) |
+ (aoh->rnext_keyid << 8) | keyid);
+ arg->iov[0].iov_len += tcp_ao_len_aligned(key);
+ reply->doff = arg->iov[0].iov_len / 4;
+
+ if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1],
+ key, traffic_key,
+ (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
+ (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
+ reply, ao_sne))
+ goto out;
+ drop = false;
+out:
+ rcu_read_unlock();
+ if (allocated_traffic_key)
+ kfree(traffic_key);
+ return drop;
+#else
+ return true;
+#endif
}
/*
@@ -544,25 +739,35 @@ int tcp_v4_gso_send_check(struct sk_buff *skb)
* Exception: precedence violation. We do not implement it in any case.
*/
-static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
+static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
+ enum sk_rst_reason reason)
{
- struct tcphdr *th = skb->h.th;
+ const struct tcphdr *th = tcp_hdr(skb);
struct {
struct tcphdr th;
-#ifdef CONFIG_TCP_MD5SIG
- __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
-#endif
+ __be32 opt[REPLY_OPTIONS_LEN];
} rep;
+ const __u8 *md5_hash_location = NULL;
+ const struct tcp_ao_hdr *aoh;
struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
- struct tcp_md5sig_key *key;
+ struct tcp_md5sig_key *key = NULL;
+ unsigned char newhash[16];
+ struct sock *sk1 = NULL;
#endif
+ u64 transmit_time = 0;
+ struct sock *ctl_sk;
+ struct net *net;
+ u32 txhash = 0;
/* Never send a reset in response to a reset. */
if (th->rst)
return;
- if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
+ /* If sk not NULL, it means we did a successful lookup and incoming
+ * route had to be correct. prequeue might have dropped our dst.
+ */
+ if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
return;
/* Swap the send and the receive. */
@@ -584,8 +789,61 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
arg.iov[0].iov_base = (unsigned char *)&rep;
arg.iov[0].iov_len = sizeof(rep.th);
+ net = sk ? sock_net(sk) : skb_dst_dev_net_rcu(skb);
+
+ /* Invalid TCP option size or twice included auth */
+ if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh))
+ return;
+
+ if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt))
+ return;
+
#ifdef CONFIG_TCP_MD5SIG
- key = sk ? tcp_v4_md5_do_lookup(sk, skb->nh.iph->daddr) : NULL;
+ rcu_read_lock();
+ if (sk && sk_fullsock(sk)) {
+ const union tcp_md5_addr *addr;
+ int l3index;
+
+ /* sdif set, means packet ingressed via a device
+ * in an L3 domain and inet_iif is set to it.
+ */
+ l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
+ addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
+ key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
+ } else if (md5_hash_location) {
+ const union tcp_md5_addr *addr;
+ int sdif = tcp_v4_sdif(skb);
+ int dif = inet_iif(skb);
+ int l3index;
+
+ /*
+ * active side is lost. Try to find listening socket through
+ * source port, and then find md5 key through listening socket.
+ * we are not loose security here:
+ * Incoming packet is checked with md5 hash with finding key,
+ * no RST generated if md5 hash doesn't match.
+ */
+ sk1 = __inet_lookup_listener(net, NULL, 0, ip_hdr(skb)->saddr,
+ th->source, ip_hdr(skb)->daddr,
+ ntohs(th->source), dif, sdif);
+ /* don't send rst if it can't find key */
+ if (!sk1)
+ goto out;
+
+ /* sdif set, means packet ingressed via a device
+ * in an L3 domain and dif is set to it.
+ */
+ l3index = sdif ? dif : 0;
+ addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
+ key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
+ if (!key)
+ goto out;
+
+ tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
+ if (memcmp(md5_hash_location, newhash, 16) != 0)
+ goto out;
+ }
+
if (key) {
rep.opt[0] = htonl((TCPOPT_NOP << 24) |
(TCPOPT_NOP << 16) |
@@ -595,60 +853,112 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
rep.th.doff = arg.iov[0].iov_len / 4;
- tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[1],
- key,
- skb->nh.iph->daddr,
- skb->nh.iph->saddr,
- &rep.th, IPPROTO_TCP,
- arg.iov[0].iov_len);
+ tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
+ key, ip_hdr(skb)->saddr,
+ ip_hdr(skb)->daddr, &rep.th);
}
#endif
- arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
- skb->nh.iph->saddr, /* XXX */
- sizeof(struct tcphdr), IPPROTO_TCP, 0);
+ /* Can't co-exist with TCPMD5, hence check rep.opt[0] */
+ if (rep.opt[0] == 0) {
+ __be32 mrst = mptcp_reset_option(skb);
+
+ if (mrst) {
+ rep.opt[0] = mrst;
+ arg.iov[0].iov_len += sizeof(mrst);
+ rep.th.doff = arg.iov[0].iov_len / 4;
+ }
+ }
+
+ arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
+ ip_hdr(skb)->saddr, /* XXX */
+ arg.iov[0].iov_len, IPPROTO_TCP, 0);
arg.csumoffset = offsetof(struct tcphdr, check) / 2;
+ arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
+
+ /* When socket is gone, all binding information is lost.
+ * routing might fail in this case. No choice here, if we choose to force
+ * input interface, we will misroute in case of asymmetric route.
+ */
+ if (sk)
+ arg.bound_dev_if = sk->sk_bound_dev_if;
+
+ trace_tcp_send_reset(sk, skb, reason);
+
+ BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
+ offsetof(struct inet_timewait_sock, tw_bound_dev_if));
+
+ /* ECN bits of TW reset are cleared */
+ arg.tos = ip_hdr(skb)->tos & ~INET_ECN_MASK;
+ arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
+ local_bh_disable();
+ local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
+ ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
- ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
+ sock_net_set(ctl_sk, net);
+ if (sk) {
+ ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
+ inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
+ ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
+ inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
+ transmit_time = tcp_transmit_time(sk);
+ xfrm_sk_clone_policy(ctl_sk, sk);
+ txhash = (sk->sk_state == TCP_TIME_WAIT) ?
+ inet_twsk(sk)->tw_txhash : sk->sk_txhash;
+ } else {
+ ctl_sk->sk_mark = 0;
+ ctl_sk->sk_priority = 0;
+ }
+ ip_send_unicast_reply(ctl_sk, sk,
+ skb, &TCP_SKB_CB(skb)->header.h4.opt,
+ ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
+ &arg, arg.iov[0].iov_len,
+ transmit_time, txhash);
+
+ xfrm_sk_free_policy(ctl_sk);
+ sock_net_set(ctl_sk, &init_net);
+ __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
+ __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
+ local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
+ local_bh_enable();
- TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
- TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
+#ifdef CONFIG_TCP_MD5SIG
+out:
+ rcu_read_unlock();
+#endif
}
/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
outside socket context is ugly, certainly. What can I do?
*/
-static void tcp_v4_send_ack(struct tcp_timewait_sock *twsk,
+static void tcp_v4_send_ack(const struct sock *sk,
struct sk_buff *skb, u32 seq, u32 ack,
- u32 win, u32 ts)
+ u32 win, u32 tsval, u32 tsecr, int oif,
+ struct tcp_key *key,
+ int reply_flags, u8 tos, u32 txhash)
{
- struct tcphdr *th = skb->h.th;
+ const struct tcphdr *th = tcp_hdr(skb);
struct {
struct tcphdr th;
- __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
-#ifdef CONFIG_TCP_MD5SIG
- + (TCPOLEN_MD5SIG_ALIGNED >> 2)
-#endif
- ];
+ __be32 opt[(MAX_TCP_OPTION_SPACE >> 2)];
} rep;
+ struct net *net = sock_net(sk);
struct ip_reply_arg arg;
-#ifdef CONFIG_TCP_MD5SIG
- struct tcp_md5sig_key *key;
- struct tcp_md5sig_key tw_key;
-#endif
+ struct sock *ctl_sk;
+ u64 transmit_time;
memset(&rep.th, 0, sizeof(struct tcphdr));
memset(&arg, 0, sizeof(arg));
arg.iov[0].iov_base = (unsigned char *)&rep;
arg.iov[0].iov_len = sizeof(rep.th);
- if (ts) {
+ if (tsecr) {
rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
(TCPOPT_TIMESTAMP << 8) |
TCPOLEN_TIMESTAMP);
- rep.opt[1] = htonl(tcp_time_stamp);
- rep.opt[2] = htonl(ts);
- arg.iov[0].iov_len = TCPOLEN_TSTAMP_ALIGNED;
+ rep.opt[1] = htonl(tsval);
+ rep.opt[2] = htonl(tsecr);
+ arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
}
/* Swap the send and the receive. */
@@ -661,25 +971,8 @@ static void tcp_v4_send_ack(struct tcp_timewait_sock *twsk,
rep.th.window = htons(win);
#ifdef CONFIG_TCP_MD5SIG
- /*
- * The SKB holds an imcoming packet, but may not have a valid ->sk
- * pointer. This is especially the case when we're dealing with a
- * TIME_WAIT ack, because the sk structure is long gone, and only
- * the tcp_timewait_sock remains. So the md5 key is stashed in that
- * structure, and we use it in preference. I believe that (twsk ||
- * skb->sk) holds true, but we program defensively.
- */
- if (!twsk && skb->sk) {
- key = tcp_v4_md5_do_lookup(skb->sk, skb->nh.iph->daddr);
- } else if (twsk && twsk->tw_md5_keylen) {
- tw_key.key = twsk->tw_md5_key;
- tw_key.keylen = twsk->tw_md5_keylen;
- key = &tw_key;
- } else
- key = NULL;
-
- if (key) {
- int offset = (ts) ? 3 : 0;
+ if (tcp_key_is_md5(key)) {
+ int offset = (tsecr) ? 3 : 0;
rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
(TCPOPT_NOP << 16) |
@@ -688,79 +981,248 @@ static void tcp_v4_send_ack(struct tcp_timewait_sock *twsk,
arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
rep.th.doff = arg.iov[0].iov_len/4;
- tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[offset],
- key,
- skb->nh.iph->daddr,
- skb->nh.iph->saddr,
- &rep.th, IPPROTO_TCP,
- arg.iov[0].iov_len);
+ tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
+ key->md5_key, ip_hdr(skb)->saddr,
+ ip_hdr(skb)->daddr, &rep.th);
+ }
+#endif
+#ifdef CONFIG_TCP_AO
+ if (tcp_key_is_ao(key)) {
+ int offset = (tsecr) ? 3 : 0;
+
+ rep.opt[offset++] = htonl((TCPOPT_AO << 24) |
+ (tcp_ao_len(key->ao_key) << 16) |
+ (key->ao_key->sndid << 8) |
+ key->rcv_next);
+ arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key);
+ rep.th.doff = arg.iov[0].iov_len / 4;
+
+ tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset],
+ key->ao_key, key->traffic_key,
+ (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
+ (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
+ &rep.th, key->sne);
}
#endif
- arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
- skb->nh.iph->saddr, /* XXX */
+ arg.flags = reply_flags;
+ arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
+ ip_hdr(skb)->saddr, /* XXX */
arg.iov[0].iov_len, IPPROTO_TCP, 0);
arg.csumoffset = offsetof(struct tcphdr, check) / 2;
-
- ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
-
- TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
+ if (oif)
+ arg.bound_dev_if = oif;
+ arg.tos = tos;
+ arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
+ local_bh_disable();
+ local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
+ ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
+ sock_net_set(ctl_sk, net);
+ ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
+ inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
+ ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
+ inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
+ transmit_time = tcp_transmit_time(sk);
+ ip_send_unicast_reply(ctl_sk, sk,
+ skb, &TCP_SKB_CB(skb)->header.h4.opt,
+ ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
+ &arg, arg.iov[0].iov_len,
+ transmit_time, txhash);
+
+ sock_net_set(ctl_sk, &init_net);
+ __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
+ local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
+ local_bh_enable();
}
-static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
+static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb,
+ enum tcp_tw_status tw_status)
{
struct inet_timewait_sock *tw = inet_twsk(sk);
struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
+ struct tcp_key key = {};
+ u8 tos = tw->tw_tos;
- tcp_v4_send_ack(tcptw, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
+ /* Cleaning only ECN bits of TW ACKs of oow data or is paws_reject,
+ * while not cleaning ECN bits of other TW ACKs to avoid these ACKs
+ * being placed in a different service queues (Classic rather than L4S)
+ */
+ if (tw_status == TCP_TW_ACK_OOW)
+ tos &= ~INET_ECN_MASK;
+
+#ifdef CONFIG_TCP_AO
+ struct tcp_ao_info *ao_info;
+
+ if (static_branch_unlikely(&tcp_ao_needed.key)) {
+ /* FIXME: the segment to-be-acked is not verified yet */
+ ao_info = rcu_dereference(tcptw->ao_info);
+ if (ao_info) {
+ const struct tcp_ao_hdr *aoh;
+
+ if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) {
+ inet_twsk_put(tw);
+ return;
+ }
+
+ if (aoh)
+ key.ao_key = tcp_ao_established_key(sk, ao_info,
+ aoh->rnext_keyid, -1);
+ }
+ }
+ if (key.ao_key) {
+ struct tcp_ao_key *rnext_key;
+
+ key.traffic_key = snd_other_key(key.ao_key);
+ key.sne = READ_ONCE(ao_info->snd_sne);
+ rnext_key = READ_ONCE(ao_info->rnext_key);
+ key.rcv_next = rnext_key->rcvid;
+ key.type = TCP_KEY_AO;
+#else
+ if (0) {
+#endif
+ } else if (static_branch_tcp_md5()) {
+ key.md5_key = tcp_twsk_md5_key(tcptw);
+ if (key.md5_key)
+ key.type = TCP_KEY_MD5;
+ }
+
+ tcp_v4_send_ack(sk, skb,
+ tcptw->tw_snd_nxt, READ_ONCE(tcptw->tw_rcv_nxt),
tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
- tcptw->tw_ts_recent);
+ tcp_tw_tsval(tcptw),
+ READ_ONCE(tcptw->tw_ts_recent),
+ tw->tw_bound_dev_if, &key,
+ tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
+ tos,
+ tw->tw_txhash);
inet_twsk_put(tw);
}
-static void tcp_v4_reqsk_send_ack(struct sk_buff *skb,
+static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
struct request_sock *req)
{
- tcp_v4_send_ack(NULL, skb, tcp_rsk(req)->snt_isn + 1,
- tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
- req->ts_recent);
+ struct tcp_key key = {};
+
+ /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
+ * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
+ */
+ u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
+ tcp_sk(sk)->snd_nxt;
+
+#ifdef CONFIG_TCP_AO
+ if (static_branch_unlikely(&tcp_ao_needed.key) &&
+ tcp_rsk_used_ao(req)) {
+ const union tcp_md5_addr *addr;
+ const struct tcp_ao_hdr *aoh;
+ int l3index;
+
+ /* Invalid TCP option size or twice included auth */
+ if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
+ return;
+ if (!aoh)
+ return;
+
+ addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
+ l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
+ key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET,
+ aoh->rnext_keyid, -1);
+ if (unlikely(!key.ao_key)) {
+ /* Send ACK with any matching MKT for the peer */
+ key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1);
+ /* Matching key disappeared (user removed the key?)
+ * let the handshake timeout.
+ */
+ if (!key.ao_key) {
+ net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n",
+ addr,
+ ntohs(tcp_hdr(skb)->source),
+ &ip_hdr(skb)->daddr,
+ ntohs(tcp_hdr(skb)->dest));
+ return;
+ }
+ }
+ key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC);
+ if (!key.traffic_key)
+ return;
+
+ key.type = TCP_KEY_AO;
+ key.rcv_next = aoh->keyid;
+ tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req);
+#else
+ if (0) {
+#endif
+ } else if (static_branch_tcp_md5()) {
+ const union tcp_md5_addr *addr;
+ int l3index;
+
+ addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
+ l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
+ key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
+ if (key.md5_key)
+ key.type = TCP_KEY_MD5;
+ }
+
+ /* Cleaning ECN bits of TW ACKs of oow data or is paws_reject */
+ tcp_v4_send_ack(sk, skb, seq,
+ tcp_rsk(req)->rcv_nxt,
+ tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale,
+ tcp_rsk_tsval(tcp_rsk(req)),
+ req->ts_recent,
+ 0, &key,
+ inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
+ ip_hdr(skb)->tos & ~INET_ECN_MASK,
+ READ_ONCE(tcp_rsk(req)->txhash));
+ if (tcp_key_is_ao(&key))
+ kfree(key.traffic_key);
}
/*
- * Send a SYN-ACK after having received an ACK.
+ * Send a SYN-ACK after having received a SYN.
* This still operates on a request_sock only, not on a big
* socket.
*/
-static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
- struct dst_entry *dst)
+static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
+ struct flowi *fl,
+ struct request_sock *req,
+ struct tcp_fastopen_cookie *foc,
+ enum tcp_synack_type synack_type,
+ struct sk_buff *syn_skb)
{
- const struct inet_request_sock *ireq = inet_rsk(req);
+ struct inet_request_sock *ireq = inet_rsk(req);
+ struct flowi4 fl4;
int err = -1;
- struct sk_buff * skb;
+ struct sk_buff *skb;
+ u8 tos;
/* First, grab a route. */
- if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
- goto out;
+ if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
+ return -1;
- skb = tcp_make_synack(sk, dst, req);
+ skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
if (skb) {
- struct tcphdr *th = skb->h.th;
+ tcp_rsk(req)->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK;
+ __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
+
+ tos = READ_ONCE(inet_sk(sk)->tos);
+
+ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
+ tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
+ (tos & INET_ECN_MASK);
- th->check = tcp_v4_check(th, skb->len,
- ireq->loc_addr,
- ireq->rmt_addr,
- csum_partial((char *)th, skb->len,
- skb->csum));
+ if (!INET_ECN_is_capable(tos) &&
+ tcp_bpf_ca_needs_ecn((struct sock *)req))
+ tos |= INET_ECN_ECT_0;
- err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
- ireq->rmt_addr,
- ireq->opt);
+ rcu_read_lock();
+ err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
+ ireq->ir_rmt_addr,
+ rcu_dereference(ireq->ireq_opt),
+ tos);
+ rcu_read_unlock();
err = net_xmit_eval(err);
}
-out:
- dst_release(dst);
return err;
}
@@ -769,43 +1231,7 @@ out:
*/
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
- kfree(inet_rsk(req)->opt);
-}
-
-#ifdef CONFIG_SYN_COOKIES
-static void syn_flood_warning(struct sk_buff *skb)
-{
- static unsigned long warntime;
-
- if (time_after(jiffies, (warntime + HZ * 60))) {
- warntime = jiffies;
- printk(KERN_INFO
- "possible SYN flooding on port %d. Sending cookies.\n",
- ntohs(skb->h.th->dest));
- }
-}
-#endif
-
-/*
- * Save and compile IPv4 options into the request_sock if needed.
- */
-static struct ip_options *tcp_v4_save_options(struct sock *sk,
- struct sk_buff *skb)
-{
- struct ip_options *opt = &(IPCB(skb)->opt);
- struct ip_options *dopt = NULL;
-
- if (opt && opt->optlen) {
- int opt_size = optlength(opt);
- dopt = kmalloc(opt_size, GFP_ATOMIC);
- if (dopt) {
- if (ip_options_echo(dopt, skb)) {
- kfree(dopt);
- dopt = NULL;
- }
- }
- }
- return dopt;
+ kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
}
#ifdef CONFIG_TCP_MD5SIG
@@ -815,762 +1241,612 @@ static struct ip_options *tcp_v4_save_options(struct sock *sk,
* We need to maintain these in the sk structure.
*/
+DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
+EXPORT_IPV6_MOD(tcp_md5_needed);
+
+static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
+{
+ if (!old)
+ return true;
+
+ /* l3index always overrides non-l3index */
+ if (old->l3index && new->l3index == 0)
+ return false;
+ if (old->l3index == 0 && new->l3index)
+ return true;
+
+ return old->prefixlen < new->prefixlen;
+}
+
/* Find the Key structure for an address. */
-static struct tcp_md5sig_key *
- tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
+struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
+ const union tcp_md5_addr *addr,
+ int family, bool any_l3index)
{
- struct tcp_sock *tp = tcp_sk(sk);
- int i;
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_md5sig_key *key;
+ const struct tcp_md5sig_info *md5sig;
+ __be32 mask;
+ struct tcp_md5sig_key *best_match = NULL;
+ bool match;
+
+ /* caller either holds rcu_read_lock() or socket lock */
+ md5sig = rcu_dereference_check(tp->md5sig_info,
+ lockdep_sock_is_held(sk));
+ if (!md5sig)
+ return NULL;
- if (!tp->md5sig_info || !tp->md5sig_info->entries4)
+ hlist_for_each_entry_rcu(key, &md5sig->head, node,
+ lockdep_sock_is_held(sk)) {
+ if (key->family != family)
+ continue;
+ if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX &&
+ key->l3index != l3index)
+ continue;
+ if (family == AF_INET) {
+ mask = inet_make_mask(key->prefixlen);
+ match = (key->addr.a4.s_addr & mask) ==
+ (addr->a4.s_addr & mask);
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (family == AF_INET6) {
+ match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
+ key->prefixlen);
+#endif
+ } else {
+ match = false;
+ }
+
+ if (match && better_md5_match(best_match, key))
+ best_match = key;
+ }
+ return best_match;
+}
+EXPORT_IPV6_MOD(__tcp_md5_do_lookup);
+
+static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
+ const union tcp_md5_addr *addr,
+ int family, u8 prefixlen,
+ int l3index, u8 flags)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_md5sig_key *key;
+ unsigned int size = sizeof(struct in_addr);
+ const struct tcp_md5sig_info *md5sig;
+
+ /* caller either holds rcu_read_lock() or socket lock */
+ md5sig = rcu_dereference_check(tp->md5sig_info,
+ lockdep_sock_is_held(sk));
+ if (!md5sig)
return NULL;
- for (i = 0; i < tp->md5sig_info->entries4; i++) {
- if (tp->md5sig_info->keys4[i].addr == addr)
- return (struct tcp_md5sig_key *)
- &tp->md5sig_info->keys4[i];
+#if IS_ENABLED(CONFIG_IPV6)
+ if (family == AF_INET6)
+ size = sizeof(struct in6_addr);
+#endif
+ hlist_for_each_entry_rcu(key, &md5sig->head, node,
+ lockdep_sock_is_held(sk)) {
+ if (key->family != family)
+ continue;
+ if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
+ continue;
+ if (key->l3index != l3index)
+ continue;
+ if (!memcmp(&key->addr, addr, size) &&
+ key->prefixlen == prefixlen)
+ return key;
}
return NULL;
}
-struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
- struct sock *addr_sk)
+struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
+ const struct sock *addr_sk)
{
- return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->daddr);
-}
+ const union tcp_md5_addr *addr;
+ int l3index;
-EXPORT_SYMBOL(tcp_v4_md5_lookup);
+ l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
+ addr_sk->sk_bound_dev_if);
+ addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
+ return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
+}
+EXPORT_IPV6_MOD(tcp_v4_md5_lookup);
-struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
- struct request_sock *req)
+static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
{
- return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_md5sig_info *md5sig;
+
+ md5sig = kmalloc(sizeof(*md5sig), gfp);
+ if (!md5sig)
+ return -ENOMEM;
+
+ sk_gso_disable(sk);
+ INIT_HLIST_HEAD(&md5sig->head);
+ rcu_assign_pointer(tp->md5sig_info, md5sig);
+ return 0;
}
/* This can be called on a newly created socket, from other files */
-int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
- u8 *newkey, u8 newkeylen)
+static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
+ int family, u8 prefixlen, int l3index, u8 flags,
+ const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
/* Add Key to the list */
- struct tcp4_md5sig_key *key;
+ struct tcp_md5sig_key *key;
struct tcp_sock *tp = tcp_sk(sk);
- struct tcp4_md5sig_key *keys;
+ struct tcp_md5sig_info *md5sig;
- key = (struct tcp4_md5sig_key *)tcp_v4_md5_do_lookup(sk, addr);
+ key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
if (key) {
- /* Pre-existing entry - just update that one. */
- kfree(key->key);
- key->key = newkey;
- key->keylen = newkeylen;
- } else {
- struct tcp_md5sig_info *md5sig;
-
- if (!tp->md5sig_info) {
- tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
- GFP_ATOMIC);
- if (!tp->md5sig_info) {
- kfree(newkey);
- return -ENOMEM;
- }
- }
- if (tcp_alloc_md5sig_pool() == NULL) {
- kfree(newkey);
- return -ENOMEM;
- }
- md5sig = tp->md5sig_info;
-
- if (md5sig->alloced4 == md5sig->entries4) {
- keys = kmalloc((sizeof(*keys) *
- (md5sig->entries4 + 1)), GFP_ATOMIC);
- if (!keys) {
- kfree(newkey);
- tcp_free_md5sig_pool();
- return -ENOMEM;
- }
+ /* Pre-existing entry - just update that one.
+ * Note that the key might be used concurrently.
+ * data_race() is telling kcsan that we do not care of
+ * key mismatches, since changing MD5 key on live flows
+ * can lead to packet drops.
+ */
+ data_race(memcpy(key->key, newkey, newkeylen));
- if (md5sig->entries4)
- memcpy(keys, md5sig->keys4,
- sizeof(*keys) * md5sig->entries4);
+ /* Pairs with READ_ONCE() in tcp_md5_hash_key().
+ * Also note that a reader could catch new key->keylen value
+ * but old key->key[], this is the reason we use __GFP_ZERO
+ * at sock_kmalloc() time below these lines.
+ */
+ WRITE_ONCE(key->keylen, newkeylen);
- /* Free old key list, and reference new one */
- if (md5sig->keys4)
- kfree(md5sig->keys4);
- md5sig->keys4 = keys;
- md5sig->alloced4++;
- }
- md5sig->entries4++;
- md5sig->keys4[md5sig->entries4 - 1].addr = addr;
- md5sig->keys4[md5sig->entries4 - 1].key = newkey;
- md5sig->keys4[md5sig->entries4 - 1].keylen = newkeylen;
+ return 0;
}
+
+ md5sig = rcu_dereference_protected(tp->md5sig_info,
+ lockdep_sock_is_held(sk));
+
+ key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
+ if (!key)
+ return -ENOMEM;
+
+ memcpy(key->key, newkey, newkeylen);
+ key->keylen = newkeylen;
+ key->family = family;
+ key->prefixlen = prefixlen;
+ key->l3index = l3index;
+ key->flags = flags;
+ memcpy(&key->addr, addr,
+ (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
+ sizeof(struct in_addr));
+ hlist_add_head_rcu(&key->node, &md5sig->head);
return 0;
}
-EXPORT_SYMBOL(tcp_v4_md5_do_add);
-
-static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
- u8 *newkey, u8 newkeylen)
+int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
+ int family, u8 prefixlen, int l3index, u8 flags,
+ const u8 *newkey, u8 newkeylen)
{
- return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->daddr,
- newkey, newkeylen);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
+ if (fips_enabled) {
+ pr_warn_once("TCP-MD5 support is disabled due to FIPS\n");
+ return -EOPNOTSUPP;
+ }
+
+ if (tcp_md5sig_info_add(sk, GFP_KERNEL))
+ return -ENOMEM;
+
+ if (!static_branch_inc(&tcp_md5_needed.key)) {
+ struct tcp_md5sig_info *md5sig;
+
+ md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
+ rcu_assign_pointer(tp->md5sig_info, NULL);
+ kfree_rcu(md5sig, rcu);
+ return -EUSERS;
+ }
+ }
+
+ return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
+ newkey, newkeylen, GFP_KERNEL);
}
+EXPORT_IPV6_MOD(tcp_md5_do_add);
-int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
+int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
+ int family, u8 prefixlen, int l3index,
+ struct tcp_md5sig_key *key)
{
struct tcp_sock *tp = tcp_sk(sk);
- int i;
- for (i = 0; i < tp->md5sig_info->entries4; i++) {
- if (tp->md5sig_info->keys4[i].addr == addr) {
- /* Free the key */
- kfree(tp->md5sig_info->keys4[i].key);
- tp->md5sig_info->entries4--;
-
- if (tp->md5sig_info->entries4 == 0) {
- kfree(tp->md5sig_info->keys4);
- tp->md5sig_info->keys4 = NULL;
- } else if (tp->md5sig_info->entries4 != i) {
- /* Need to do some manipulation */
- memcpy(&tp->md5sig_info->keys4[i],
- &tp->md5sig_info->keys4[i+1],
- (tp->md5sig_info->entries4 - i) *
- sizeof(struct tcp4_md5sig_key));
- }
- tcp_free_md5sig_pool();
- return 0;
+ if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
+
+ if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC)))
+ return -ENOMEM;
+
+ if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
+ struct tcp_md5sig_info *md5sig;
+
+ md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
+ net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
+ rcu_assign_pointer(tp->md5sig_info, NULL);
+ kfree_rcu(md5sig, rcu);
+ return -EUSERS;
}
}
- return -ENOENT;
+
+ return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
+ key->flags, key->key, key->keylen,
+ sk_gfp_mask(sk, GFP_ATOMIC));
}
+EXPORT_IPV6_MOD(tcp_md5_key_copy);
-EXPORT_SYMBOL(tcp_v4_md5_do_del);
+int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
+ u8 prefixlen, int l3index, u8 flags)
+{
+ struct tcp_md5sig_key *key;
+
+ key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
+ if (!key)
+ return -ENOENT;
+ hlist_del_rcu(&key->node);
+ atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
+ kfree_rcu(key, rcu);
+ return 0;
+}
+EXPORT_IPV6_MOD(tcp_md5_do_del);
-static void tcp_v4_clear_md5_list(struct sock *sk)
+void tcp_clear_md5_list(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_md5sig_key *key;
+ struct hlist_node *n;
+ struct tcp_md5sig_info *md5sig;
- /* Free each key, then the set of key keys,
- * the crypto element, and then decrement our
- * hold on the last resort crypto.
- */
- if (tp->md5sig_info->entries4) {
- int i;
- for (i = 0; i < tp->md5sig_info->entries4; i++)
- kfree(tp->md5sig_info->keys4[i].key);
- tp->md5sig_info->entries4 = 0;
- tcp_free_md5sig_pool();
- }
- if (tp->md5sig_info->keys4) {
- kfree(tp->md5sig_info->keys4);
- tp->md5sig_info->keys4 = NULL;
- tp->md5sig_info->alloced4 = 0;
+ md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
+
+ hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
+ hlist_del(&key->node);
+ atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
+ kfree(key);
}
}
-static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
- int optlen)
+static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
+ sockptr_t optval, int optlen)
{
struct tcp_md5sig cmd;
struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
- u8 *newkey;
+ const union tcp_md5_addr *addr;
+ u8 prefixlen = 32;
+ int l3index = 0;
+ bool l3flag;
+ u8 flags;
if (optlen < sizeof(cmd))
return -EINVAL;
- if (copy_from_user(&cmd, optval, sizeof(cmd)))
+ if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
return -EFAULT;
if (sin->sin_family != AF_INET)
return -EINVAL;
- if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
- if (!tcp_sk(sk)->md5sig_info)
- return -ENOENT;
- return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
- }
-
- if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
- return -EINVAL;
+ flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
+ l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
- if (!tcp_sk(sk)->md5sig_info) {
- struct tcp_sock *tp = tcp_sk(sk);
- struct tcp_md5sig_info *p = kzalloc(sizeof(*p), GFP_KERNEL);
-
- if (!p)
+ if (optname == TCP_MD5SIG_EXT &&
+ cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
+ prefixlen = cmd.tcpm_prefixlen;
+ if (prefixlen > 32)
return -EINVAL;
-
- tp->md5sig_info = p;
-
}
- newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
- if (!newkey)
- return -ENOMEM;
- return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
- newkey, cmd.tcpm_keylen);
-}
+ if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
+ cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
+ struct net_device *dev;
-static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
- __be32 saddr, __be32 daddr,
- struct tcphdr *th, int protocol,
- int tcplen)
-{
- struct scatterlist sg[4];
- __u16 data_len;
- int block = 0;
-#ifdef CONFIG_TCP_MD5SIG_DEBUG
- int i;
-#endif
- __sum16 old_checksum;
- struct tcp_md5sig_pool *hp;
- struct tcp4_pseudohdr *bp;
- struct hash_desc *desc;
- int err;
- unsigned int nbytes = 0;
+ rcu_read_lock();
+ dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
+ if (dev && netif_is_l3_master(dev))
+ l3index = dev->ifindex;
- /*
- * Okay, so RFC2385 is turned on for this connection,
- * so we need to generate the MD5 hash for the packet now.
- */
+ rcu_read_unlock();
- hp = tcp_get_md5sig_pool();
- if (!hp)
- goto clear_hash_noput;
+ /* ok to reference set/not set outside of rcu;
+ * right now device MUST be an L3 master
+ */
+ if (!dev || !l3index)
+ return -EINVAL;
+ }
- bp = &hp->md5_blk.ip4;
- desc = &hp->md5_desc;
+ addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
- /*
- * 1. the TCP pseudo-header (in the order: source IP address,
- * destination IP address, zero-padded protocol number, and
- * segment length)
- */
- bp->saddr = saddr;
- bp->daddr = daddr;
- bp->pad = 0;
- bp->protocol = protocol;
- bp->len = htons(tcplen);
- sg_set_buf(&sg[block++], bp, sizeof(*bp));
- nbytes += sizeof(*bp);
-
-#ifdef CONFIG_TCP_MD5SIG_DEBUG
- printk("Calcuating hash for: ");
- for (i = 0; i < sizeof(*bp); i++)
- printk("%02x ", (unsigned int)((unsigned char *)bp)[i]);
- printk(" ");
-#endif
+ if (!cmd.tcpm_keylen)
+ return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
- /* 2. the TCP header, excluding options, and assuming a
- * checksum of zero/
- */
- old_checksum = th->check;
- th->check = 0;
- sg_set_buf(&sg[block++], th, sizeof(struct tcphdr));
- nbytes += sizeof(struct tcphdr);
-#ifdef CONFIG_TCP_MD5SIG_DEBUG
- for (i = 0; i < sizeof(struct tcphdr); i++)
- printk(" %02x", (unsigned int)((unsigned char *)th)[i]);
-#endif
- /* 3. the TCP segment data (if any) */
- data_len = tcplen - (th->doff << 2);
- if (data_len > 0) {
- unsigned char *data = (unsigned char *)th + (th->doff << 2);
- sg_set_buf(&sg[block++], data, data_len);
- nbytes += data_len;
- }
+ if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
+ return -EINVAL;
- /* 4. an independently-specified key or password, known to both
- * TCPs and presumably connection-specific
+ /* Don't allow keys for peers that have a matching TCP-AO key.
+ * See the comment in tcp_ao_add_cmd()
*/
- sg_set_buf(&sg[block++], key->key, key->keylen);
- nbytes += key->keylen;
+ if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false))
+ return -EKEYREJECTED;
-#ifdef CONFIG_TCP_MD5SIG_DEBUG
- printk(" and password: ");
- for (i = 0; i < key->keylen; i++)
- printk("%02x ", (unsigned int)key->key[i]);
-#endif
+ return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
+ cmd.tcpm_key, cmd.tcpm_keylen);
+}
- /* Now store the Hash into the packet */
- err = crypto_hash_init(desc);
- if (err)
- goto clear_hash;
- err = crypto_hash_update(desc, sg, nbytes);
- if (err)
- goto clear_hash;
- err = crypto_hash_final(desc, md5_hash);
- if (err)
- goto clear_hash;
+static void tcp_v4_md5_hash_headers(struct md5_ctx *ctx,
+ __be32 daddr, __be32 saddr,
+ const struct tcphdr *th, int nbytes)
+{
+ struct {
+ struct tcp4_pseudohdr ip;
+ struct tcphdr tcp;
+ } h;
+
+ h.ip.saddr = saddr;
+ h.ip.daddr = daddr;
+ h.ip.pad = 0;
+ h.ip.protocol = IPPROTO_TCP;
+ h.ip.len = cpu_to_be16(nbytes);
+ h.tcp = *th;
+ h.tcp.check = 0;
+ md5_update(ctx, (const u8 *)&h, sizeof(h.ip) + sizeof(h.tcp));
+}
- /* Reset header, and free up the crypto */
- tcp_put_md5sig_pool();
- th->check = old_checksum;
+static noinline_for_stack void
+tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
+ __be32 daddr, __be32 saddr, const struct tcphdr *th)
+{
+ struct md5_ctx ctx;
-out:
-#ifdef CONFIG_TCP_MD5SIG_DEBUG
- printk(" result:");
- for (i = 0; i < 16; i++)
- printk(" %02x", (unsigned int)(((u8*)md5_hash)[i]));
- printk("\n");
-#endif
- return 0;
-clear_hash:
- tcp_put_md5sig_pool();
-clear_hash_noput:
- memset(md5_hash, 0, 16);
- goto out;
+ md5_init(&ctx);
+ tcp_v4_md5_hash_headers(&ctx, daddr, saddr, th, th->doff << 2);
+ tcp_md5_hash_key(&ctx, key);
+ md5_final(&ctx, md5_hash);
}
-int tcp_v4_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
- struct sock *sk,
- struct dst_entry *dst,
- struct request_sock *req,
- struct tcphdr *th, int protocol,
- int tcplen)
+noinline_for_stack void
+tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
+ const struct sock *sk, const struct sk_buff *skb)
{
+ const struct tcphdr *th = tcp_hdr(skb);
__be32 saddr, daddr;
+ struct md5_ctx ctx;
- if (sk) {
- saddr = inet_sk(sk)->saddr;
- daddr = inet_sk(sk)->daddr;
+ if (sk) { /* valid for establish/request sockets */
+ saddr = sk->sk_rcv_saddr;
+ daddr = sk->sk_daddr;
} else {
- struct rtable *rt = (struct rtable *)dst;
- BUG_ON(!rt);
- saddr = rt->rt_src;
- daddr = rt->rt_dst;
+ const struct iphdr *iph = ip_hdr(skb);
+ saddr = iph->saddr;
+ daddr = iph->daddr;
}
- return tcp_v4_do_calc_md5_hash(md5_hash, key,
- saddr, daddr,
- th, protocol, tcplen);
+
+ md5_init(&ctx);
+ tcp_v4_md5_hash_headers(&ctx, daddr, saddr, th, skb->len);
+ tcp_md5_hash_skb_data(&ctx, skb, th->doff << 2);
+ tcp_md5_hash_key(&ctx, key);
+ md5_final(&ctx, md5_hash);
}
+EXPORT_IPV6_MOD(tcp_v4_md5_hash_skb);
-EXPORT_SYMBOL(tcp_v4_calc_md5_hash);
+#endif
-static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
+static void tcp_v4_init_req(struct request_sock *req,
+ const struct sock *sk_listener,
+ struct sk_buff *skb)
{
- /*
- * This gets called for each TCP segment that arrives
- * so we want to be efficient.
- * We have 3 drop cases:
- * o No MD5 hash and one expected.
- * o MD5 hash and we're not expecting one.
- * o MD5 hash and its wrong.
- */
- __u8 *hash_location = NULL;
- struct tcp_md5sig_key *hash_expected;
- struct iphdr *iph = skb->nh.iph;
- struct tcphdr *th = skb->h.th;
- int length = (th->doff << 2) - sizeof(struct tcphdr);
- int genhash;
- unsigned char *ptr;
- unsigned char newhash[16];
-
- hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
+ struct inet_request_sock *ireq = inet_rsk(req);
+ struct net *net = sock_net(sk_listener);
- /*
- * If the TCP option length is less than the TCP_MD5SIG
- * option length, then we can shortcut
- */
- if (length < TCPOLEN_MD5SIG) {
- if (hash_expected)
- return 1;
- else
- return 0;
- }
-
- /* Okay, we can't shortcut - we have to grub through the options */
- ptr = (unsigned char *)(th + 1);
- while (length > 0) {
- int opcode = *ptr++;
- int opsize;
-
- switch (opcode) {
- case TCPOPT_EOL:
- goto done_opts;
- case TCPOPT_NOP:
- length--;
- continue;
- default:
- opsize = *ptr++;
- if (opsize < 2)
- goto done_opts;
- if (opsize > length)
- goto done_opts;
-
- if (opcode == TCPOPT_MD5SIG) {
- hash_location = ptr;
- goto done_opts;
- }
- }
- ptr += opsize-2;
- length -= opsize;
- }
-done_opts:
- /* We've parsed the options - do we have a hash? */
- if (!hash_expected && !hash_location)
- return 0;
+ sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
+ sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
+ RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
+}
- if (hash_expected && !hash_location) {
- LIMIT_NETDEBUG(KERN_INFO "MD5 Hash NOT expected but found "
- "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
- NIPQUAD(iph->saddr), ntohs(th->source),
- NIPQUAD(iph->daddr), ntohs(th->dest));
- return 1;
- }
+static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
+ struct sk_buff *skb,
+ struct flowi *fl,
+ struct request_sock *req,
+ u32 tw_isn)
+{
+ tcp_v4_init_req(req, sk, skb);
- if (!hash_expected && hash_location) {
- LIMIT_NETDEBUG(KERN_INFO "MD5 Hash NOT expected but found "
- "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
- NIPQUAD(iph->saddr), ntohs(th->source),
- NIPQUAD(iph->daddr), ntohs(th->dest));
- return 1;
- }
+ if (security_inet_conn_request(sk, skb, req))
+ return NULL;
- /* Okay, so this is hash_expected and hash_location -
- * so we need to calculate the checksum.
- */
- genhash = tcp_v4_do_calc_md5_hash(newhash,
- hash_expected,
- iph->saddr, iph->daddr,
- th, sk->sk_protocol,
- skb->len);
-
- if (genhash || memcmp(hash_location, newhash, 16) != 0) {
- if (net_ratelimit()) {
- printk(KERN_INFO "MD5 Hash failed for "
- "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)%s\n",
- NIPQUAD(iph->saddr), ntohs(th->source),
- NIPQUAD(iph->daddr), ntohs(th->dest),
- genhash ? " tcp_v4_calc_md5_hash failed" : "");
-#ifdef CONFIG_TCP_MD5SIG_DEBUG
- do {
- int i;
- printk("Received: ");
- for (i = 0; i < 16; i++)
- printk("%02x ",
- 0xff & (int)hash_location[i]);
- printk("\n");
- printk("Calculated: ");
- for (i = 0; i < 16; i++)
- printk("%02x ", 0xff & (int)newhash[i]);
- printk("\n");
- } while(0);
-#endif
- }
- return 1;
- }
- return 0;
+ return inet_csk_route_req(sk, &fl->u.ip4, req);
}
-#endif
-
struct request_sock_ops tcp_request_sock_ops __read_mostly = {
.family = PF_INET,
.obj_size = sizeof(struct tcp_request_sock),
- .rtx_syn_ack = tcp_v4_send_synack,
.send_ack = tcp_v4_reqsk_send_ack,
.destructor = tcp_v4_reqsk_destructor,
.send_reset = tcp_v4_send_reset,
};
-struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
+const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
+ .mss_clamp = TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
- .md5_lookup = tcp_v4_reqsk_md5_lookup,
+ .req_md5_lookup = tcp_v4_md5_lookup,
+ .calc_md5_hash = tcp_v4_md5_hash_skb,
#endif
-};
-
-static struct timewait_sock_ops tcp_timewait_sock_ops = {
- .twsk_obj_size = sizeof(struct tcp_timewait_sock),
- .twsk_unique = tcp_twsk_unique,
- .twsk_destructor= tcp_twsk_destructor,
+#ifdef CONFIG_TCP_AO
+ .ao_lookup = tcp_v4_ao_lookup_rsk,
+ .ao_calc_key = tcp_v4_ao_calc_key_rsk,
+ .ao_synack_hash = tcp_v4_ao_synack_hash,
+#endif
+#ifdef CONFIG_SYN_COOKIES
+ .cookie_init_seq = cookie_v4_init_sequence,
+#endif
+ .route_req = tcp_v4_route_req,
+ .init_seq = tcp_v4_init_seq,
+ .init_ts_off = tcp_v4_init_ts_off,
+ .send_synack = tcp_v4_send_synack,
};
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
- struct inet_request_sock *ireq;
- struct tcp_options_received tmp_opt;
- struct request_sock *req;
- __be32 saddr = skb->nh.iph->saddr;
- __be32 daddr = skb->nh.iph->daddr;
- __u32 isn = TCP_SKB_CB(skb)->when;
- struct dst_entry *dst = NULL;
-#ifdef CONFIG_SYN_COOKIES
- int want_cookie = 0;
-#else
-#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
-#endif
-
/* Never answer to SYNs send to broadcast or multicast */
- if (((struct rtable *)skb->dst)->rt_flags &
- (RTCF_BROADCAST | RTCF_MULTICAST))
+ if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
goto drop;
- /* TW buckets are converted to open requests without
- * limitations, they conserve resources and peer is
- * evidently real one.
- */
- if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
-#ifdef CONFIG_SYN_COOKIES
- if (sysctl_tcp_syncookies) {
- want_cookie = 1;
- } else
-#endif
- goto drop;
- }
-
- /* Accept backlog is full. If we have already queued enough
- * of warm entries in syn queue, drop request. It is better than
- * clogging syn queue with openreqs with exponentially increasing
- * timeout.
- */
- if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
- goto drop;
-
- req = reqsk_alloc(&tcp_request_sock_ops);
- if (!req)
- goto drop;
-
-#ifdef CONFIG_TCP_MD5SIG
- tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
-#endif
+ return tcp_conn_request(&tcp_request_sock_ops,
+ &tcp_request_sock_ipv4_ops, sk, skb);
- tcp_clear_options(&tmp_opt);
- tmp_opt.mss_clamp = 536;
- tmp_opt.user_mss = tcp_sk(sk)->rx_opt.user_mss;
-
- tcp_parse_options(skb, &tmp_opt, 0);
-
- if (want_cookie) {
- tcp_clear_options(&tmp_opt);
- tmp_opt.saw_tstamp = 0;
- }
-
- if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
- /* Some OSes (unknown ones, but I see them on web server, which
- * contains information interesting only for windows'
- * users) do not send their stamp in SYN. It is easy case.
- * We simply do not advertise TS support.
- */
- tmp_opt.saw_tstamp = 0;
- tmp_opt.tstamp_ok = 0;
- }
- tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
-
- tcp_openreq_init(req, &tmp_opt, skb);
-
- if (security_inet_conn_request(sk, skb, req))
- goto drop_and_free;
-
- ireq = inet_rsk(req);
- ireq->loc_addr = daddr;
- ireq->rmt_addr = saddr;
- ireq->opt = tcp_v4_save_options(sk, skb);
- if (!want_cookie)
- TCP_ECN_create_request(req, skb->h.th);
-
- if (want_cookie) {
-#ifdef CONFIG_SYN_COOKIES
- syn_flood_warning(skb);
-#endif
- isn = cookie_v4_init_sequence(sk, skb, &req->mss);
- } else if (!isn) {
- struct inet_peer *peer = NULL;
-
- /* VJ's idea. We save last timestamp seen
- * from the destination in peer table, when entering
- * state TIME-WAIT, and check against it before
- * accepting new connection request.
- *
- * If "isn" is not zero, this request hit alive
- * timewait bucket, so that all the necessary checks
- * are made in the function processing timewait state.
- */
- if (tmp_opt.saw_tstamp &&
- tcp_death_row.sysctl_tw_recycle &&
- (dst = inet_csk_route_req(sk, req)) != NULL &&
- (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
- peer->v4daddr == saddr) {
- if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
- (s32)(peer->tcp_ts - req->ts_recent) >
- TCP_PAWS_WINDOW) {
- NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
- dst_release(dst);
- goto drop_and_free;
- }
- }
- /* Kill the following clause, if you dislike this way. */
- else if (!sysctl_tcp_syncookies &&
- (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
- (sysctl_max_syn_backlog >> 2)) &&
- (!peer || !peer->tcp_ts_stamp) &&
- (!dst || !dst_metric(dst, RTAX_RTT))) {
- /* Without syncookies last quarter of
- * backlog is filled with destinations,
- * proven to be alive.
- * It means that we continue to communicate
- * to destinations, already remembered
- * to the moment of synflood.
- */
- LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
- "request from %u.%u.%u.%u/%u\n",
- NIPQUAD(saddr),
- ntohs(skb->h.th->source));
- dst_release(dst);
- goto drop_and_free;
- }
-
- isn = tcp_v4_init_sequence(skb);
- }
- tcp_rsk(req)->snt_isn = isn;
-
- if (tcp_v4_send_synack(sk, req, dst))
- goto drop_and_free;
-
- if (want_cookie) {
- reqsk_free(req);
- } else {
- inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
- }
- return 0;
-
-drop_and_free:
- reqsk_free(req);
drop:
+ tcp_listendrop(sk);
return 0;
}
+EXPORT_IPV6_MOD(tcp_v4_conn_request);
/*
* The three way handshake has completed - we got a valid synack -
* now create the new socket.
*/
-struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
+struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
struct request_sock *req,
- struct dst_entry *dst)
+ struct dst_entry *dst,
+ struct request_sock *req_unhash,
+ bool *own_req)
{
struct inet_request_sock *ireq;
+ bool found_dup_sk = false;
struct inet_sock *newinet;
struct tcp_sock *newtp;
struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
+ const union tcp_md5_addr *addr;
struct tcp_md5sig_key *key;
+ int l3index;
#endif
+ struct ip_options_rcu *inet_opt;
if (sk_acceptq_is_full(sk))
goto exit_overflow;
- if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
- goto exit;
-
newsk = tcp_create_openreq_child(sk, req, skb);
if (!newsk)
- goto exit;
+ goto exit_nonewsk;
newsk->sk_gso_type = SKB_GSO_TCPV4;
- sk_setup_caps(newsk, dst);
+ inet_sk_rx_dst_set(newsk, skb);
newtp = tcp_sk(newsk);
newinet = inet_sk(newsk);
ireq = inet_rsk(req);
- newinet->daddr = ireq->rmt_addr;
- newinet->rcv_saddr = ireq->loc_addr;
- newinet->saddr = ireq->loc_addr;
- newinet->opt = ireq->opt;
- ireq->opt = NULL;
+ inet_opt = rcu_dereference(ireq->ireq_opt);
+ RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
newinet->mc_index = inet_iif(skb);
- newinet->mc_ttl = skb->nh.iph->ttl;
+ newinet->mc_ttl = ip_hdr(skb)->ttl;
+ newinet->rcv_tos = ip_hdr(skb)->tos;
inet_csk(newsk)->icsk_ext_hdr_len = 0;
- if (newinet->opt)
- inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
- newinet->id = newtp->write_seq ^ jiffies;
+ if (inet_opt)
+ inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
+ atomic_set(&newinet->inet_id, get_random_u16());
+
+ /* Set ToS of the new socket based upon the value of incoming SYN.
+ * ECT bits are set later in tcp_init_transfer().
+ */
+ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
+ newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
+
+ if (!dst) {
+ dst = inet_csk_route_child_sock(sk, newsk, req);
+ if (!dst)
+ goto put_and_exit;
+ } else {
+ /* syncookie case : see end of cookie_v4_check() */
+ }
+ sk_setup_caps(newsk, dst);
+
+ tcp_ca_openreq_child(newsk, dst);
- tcp_mtup_init(newsk);
tcp_sync_mss(newsk, dst_mtu(dst));
- newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
+ newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
+
tcp_initialize_rcv_mss(newsk);
#ifdef CONFIG_TCP_MD5SIG
+ l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
/* Copy over the MD5 key from the original socket */
- if ((key = tcp_v4_md5_do_lookup(sk, newinet->daddr)) != NULL) {
- /*
- * We're using one, so create a matching key
- * on the newsk structure. If we fail to get
- * memory, then we end up not copying the key
- * across. Shucks.
- */
- char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
- if (newkey != NULL)
- tcp_v4_md5_do_add(newsk, inet_sk(sk)->daddr,
- newkey, key->keylen);
+ addr = (union tcp_md5_addr *)&newinet->inet_daddr;
+ key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
+ if (key && !tcp_rsk_used_ao(req)) {
+ if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
+ goto put_and_exit;
+ sk_gso_disable(newsk);
}
#endif
+#ifdef CONFIG_TCP_AO
+ if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET))
+ goto put_and_exit; /* OOM, release back memory */
+#endif
- __inet_hash(&tcp_hashinfo, newsk, 0);
- __inet_inherit_port(&tcp_hashinfo, sk, newsk);
+ if (__inet_inherit_port(sk, newsk) < 0)
+ goto put_and_exit;
+ *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
+ &found_dup_sk);
+ if (likely(*own_req)) {
+ tcp_move_syn(newtp, req);
+ ireq->ireq_opt = NULL;
+ } else {
+ newinet->inet_opt = NULL;
+ if (!req_unhash && found_dup_sk) {
+ /* This code path should only be executed in the
+ * syncookie case only
+ */
+ bh_unlock_sock(newsk);
+ sock_put(newsk);
+ newsk = NULL;
+ }
+ }
return newsk;
exit_overflow:
- NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
-exit:
- NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
+exit_nonewsk:
dst_release(dst);
+exit:
+ tcp_listendrop(sk);
return NULL;
+put_and_exit:
+ newinet->inet_opt = NULL;
+ inet_csk_prepare_forced_close(newsk);
+ tcp_done(newsk);
+ goto exit;
}
+EXPORT_IPV6_MOD(tcp_v4_syn_recv_sock);
-static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
+static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
- struct tcphdr *th = skb->h.th;
- struct iphdr *iph = skb->nh.iph;
- struct sock *nsk;
- struct request_sock **prev;
- /* Find possible connection requests. */
- struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
- iph->saddr, iph->daddr);
- if (req)
- return tcp_check_req(sk, skb, req, prev);
-
- nsk = inet_lookup_established(&tcp_hashinfo, skb->nh.iph->saddr,
- th->source, skb->nh.iph->daddr,
- th->dest, inet_iif(skb));
-
- if (nsk) {
- if (nsk->sk_state != TCP_TIME_WAIT) {
- bh_lock_sock(nsk);
- return nsk;
- }
- inet_twsk_put(inet_twsk(nsk));
- return NULL;
- }
-
#ifdef CONFIG_SYN_COOKIES
- if (!th->rst && !th->syn && th->ack)
- sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
+ const struct tcphdr *th = tcp_hdr(skb);
+
+ if (!th->syn)
+ sk = cookie_v4_check(sk, skb);
#endif
return sk;
}
-static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
+u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
+ struct tcphdr *th, u32 *cookie)
{
- if (skb->ip_summed == CHECKSUM_COMPLETE) {
- if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
- skb->nh.iph->daddr, skb->csum)) {
- skb->ip_summed = CHECKSUM_UNNECESSARY;
- return 0;
- }
- }
-
- skb->csum = csum_tcpudp_nofold(skb->nh.iph->saddr, skb->nh.iph->daddr,
- skb->len, IPPROTO_TCP, 0);
-
- if (skb->len <= 76) {
- return __skb_checksum_complete(skb);
+ u16 mss = 0;
+#ifdef CONFIG_SYN_COOKIES
+ mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
+ &tcp_request_sock_ipv4_ops, sk, th);
+ if (mss) {
+ *cookie = __cookie_v4_init_sequence(iph, th, &mss);
+ tcp_synq_overflow(sk);
}
- return 0;
+#endif
+ return mss;
}
-
+INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
+ u32));
/* The socket must have it's spinlock held when we get
- * here.
+ * here, unless it is a TCP_LISTEN socket.
*
* We have a potential double-lock case here, so even when
* doing backlog processing we use the BH locking scheme.
@@ -1579,57 +1855,63 @@ static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
*/
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
+ enum skb_drop_reason reason;
struct sock *rsk;
-#ifdef CONFIG_TCP_MD5SIG
- /*
- * We really want to reject the packet as early as possible
- * if:
- * o We're expecting an MD5'd packet and this is no MD5 tcp option
- * o There is an MD5 option and we're not expecting one
- */
- if (tcp_v4_inbound_md5_hash(sk, skb))
- goto discard;
-#endif
+
+ reason = psp_sk_rx_policy_check(sk, skb);
+ if (reason)
+ goto err_discard;
if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
- TCP_CHECK_TIMER(sk);
- if (tcp_rcv_established(sk, skb, skb->h.th, skb->len)) {
- rsk = sk;
- goto reset;
+ struct dst_entry *dst;
+
+ dst = rcu_dereference_protected(sk->sk_rx_dst,
+ lockdep_sock_is_held(sk));
+
+ sock_rps_save_rxhash(sk, skb);
+ sk_mark_napi_id(sk, skb);
+ if (dst) {
+ if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
+ !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
+ dst, 0)) {
+ RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
+ dst_release(dst);
+ }
}
- TCP_CHECK_TIMER(sk);
+ tcp_rcv_established(sk, skb);
return 0;
}
- if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
+ if (tcp_checksum_complete(skb))
goto csum_err;
if (sk->sk_state == TCP_LISTEN) {
- struct sock *nsk = tcp_v4_hnd_req(sk, skb);
- if (!nsk)
- goto discard;
+ struct sock *nsk = tcp_v4_cookie_check(sk, skb);
+ if (!nsk)
+ return 0;
if (nsk != sk) {
- if (tcp_child_process(sk, nsk, skb)) {
+ reason = tcp_child_process(sk, nsk, skb);
+ if (reason) {
rsk = nsk;
goto reset;
}
return 0;
}
- }
+ } else
+ sock_rps_save_rxhash(sk, skb);
- TCP_CHECK_TIMER(sk);
- if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len)) {
+ reason = tcp_rcv_state_process(sk, skb);
+ if (reason) {
rsk = sk;
goto reset;
}
- TCP_CHECK_TIMER(sk);
return 0;
reset:
- tcp_v4_send_reset(rsk, skb);
+ tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason));
discard:
- kfree_skb(skb);
+ sk_skb_reason_drop(sk, skb, reason);
/* Be careful here. If this function gets more complicated and
* gcc suffers from register pressure on the x86, sk (in %ebx)
* might be destroyed here. This current version compiles correctly,
@@ -1638,9 +1920,230 @@ discard:
return 0;
csum_err:
- TCP_INC_STATS_BH(TCP_MIB_INERRS);
+ reason = SKB_DROP_REASON_TCP_CSUM;
+ trace_tcp_bad_csum(skb);
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
+err_discard:
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
goto discard;
}
+EXPORT_SYMBOL(tcp_v4_do_rcv);
+
+int tcp_v4_early_demux(struct sk_buff *skb)
+{
+ struct net *net = dev_net_rcu(skb->dev);
+ const struct iphdr *iph;
+ const struct tcphdr *th;
+ struct sock *sk;
+
+ if (skb->pkt_type != PACKET_HOST)
+ return 0;
+
+ if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
+ return 0;
+
+ iph = ip_hdr(skb);
+ th = tcp_hdr(skb);
+
+ if (th->doff < sizeof(struct tcphdr) / 4)
+ return 0;
+
+ sk = __inet_lookup_established(net, iph->saddr, th->source,
+ iph->daddr, ntohs(th->dest),
+ skb->skb_iif, inet_sdif(skb));
+ if (sk) {
+ skb->sk = sk;
+ skb->destructor = sock_edemux;
+ if (sk_fullsock(sk)) {
+ struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
+
+ if (dst)
+ dst = dst_check(dst, 0);
+ if (dst &&
+ sk->sk_rx_dst_ifindex == skb->skb_iif)
+ skb_dst_set_noref(skb, dst);
+ }
+ }
+ return 0;
+}
+
+bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
+ enum skb_drop_reason *reason)
+{
+ u32 tail_gso_size, tail_gso_segs;
+ struct skb_shared_info *shinfo;
+ const struct tcphdr *th;
+ struct tcphdr *thtail;
+ struct sk_buff *tail;
+ unsigned int hdrlen;
+ bool fragstolen;
+ u32 gso_segs;
+ u32 gso_size;
+ u64 limit;
+ int delta;
+ int err;
+
+ /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
+ * we can fix skb->truesize to its real value to avoid future drops.
+ * This is valid because skb is not yet charged to the socket.
+ * It has been noticed pure SACK packets were sometimes dropped
+ * (if cooked by drivers without copybreak feature).
+ */
+ skb_condense(skb);
+
+ tcp_cleanup_skb(skb);
+
+ if (unlikely(tcp_checksum_complete(skb))) {
+ bh_unlock_sock(sk);
+ trace_tcp_bad_csum(skb);
+ *reason = SKB_DROP_REASON_TCP_CSUM;
+ __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
+ __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
+ return true;
+ }
+
+ /* Attempt coalescing to last skb in backlog, even if we are
+ * above the limits.
+ * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
+ */
+ th = (const struct tcphdr *)skb->data;
+ hdrlen = th->doff * 4;
+
+ tail = sk->sk_backlog.tail;
+ if (!tail)
+ goto no_coalesce;
+ thtail = (struct tcphdr *)tail->data;
+
+ if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
+ TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
+ ((TCP_SKB_CB(tail)->tcp_flags |
+ TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
+ !((TCP_SKB_CB(tail)->tcp_flags &
+ TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
+ ((TCP_SKB_CB(tail)->tcp_flags ^
+ TCP_SKB_CB(skb)->tcp_flags) &
+ (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE)) ||
+ !tcp_skb_can_collapse_rx(tail, skb) ||
+ thtail->doff != th->doff ||
+ memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)) ||
+ /* prior to PSP Rx policy check, retain exact PSP metadata */
+ psp_skb_coalesce_diff(tail, skb))
+ goto no_coalesce;
+
+ __skb_pull(skb, hdrlen);
+
+ shinfo = skb_shinfo(skb);
+ gso_size = shinfo->gso_size ?: skb->len;
+ gso_segs = shinfo->gso_segs ?: 1;
+
+ shinfo = skb_shinfo(tail);
+ tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
+ tail_gso_segs = shinfo->gso_segs ?: 1;
+
+ if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
+ TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
+
+ if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
+ TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
+ thtail->window = th->window;
+ }
+
+ /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
+ * thtail->fin, so that the fast path in tcp_rcv_established()
+ * is not entered if we append a packet with a FIN.
+ * SYN, RST, URG are not present.
+ * ACK is set on both packets.
+ * PSH : we do not really care in TCP stack,
+ * at least for 'GRO' packets.
+ */
+ thtail->fin |= th->fin;
+ TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
+
+ if (TCP_SKB_CB(skb)->has_rxtstamp) {
+ TCP_SKB_CB(tail)->has_rxtstamp = true;
+ tail->tstamp = skb->tstamp;
+ skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
+ }
+
+ /* Not as strict as GRO. We only need to carry mss max value */
+ shinfo->gso_size = max(gso_size, tail_gso_size);
+ shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
+
+ sk->sk_backlog.len += delta;
+ __NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPBACKLOGCOALESCE);
+ kfree_skb_partial(skb, fragstolen);
+ return false;
+ }
+ __skb_push(skb, hdrlen);
+
+no_coalesce:
+ /* sk->sk_backlog.len is reset only at the end of __release_sock().
+ * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach
+ * sk_rcvbuf in normal conditions.
+ */
+ limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1;
+
+ limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1;
+
+ /* Only socket owner can try to collapse/prune rx queues
+ * to reduce memory overhead, so add a little headroom here.
+ * Few sockets backlog are possibly concurrently non empty.
+ */
+ limit += 64 * 1024;
+
+ limit = min_t(u64, limit, UINT_MAX);
+
+ err = sk_add_backlog(sk, skb, limit);
+ if (unlikely(err)) {
+ bh_unlock_sock(sk);
+ if (err == -ENOMEM) {
+ *reason = SKB_DROP_REASON_PFMEMALLOC;
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP);
+ } else {
+ *reason = SKB_DROP_REASON_SOCKET_BACKLOG;
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
+ }
+ return true;
+ }
+ return false;
+}
+EXPORT_IPV6_MOD(tcp_add_backlog);
+
+int tcp_filter(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason *reason)
+{
+ struct tcphdr *th = (struct tcphdr *)skb->data;
+
+ return sk_filter_trim_cap(sk, skb, th->doff * 4, reason);
+}
+EXPORT_IPV6_MOD(tcp_filter);
+
+static void tcp_v4_restore_cb(struct sk_buff *skb)
+{
+ memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
+ sizeof(struct inet_skb_parm));
+}
+
+static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
+ const struct tcphdr *th)
+{
+ /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
+ * barrier() makes sure compiler wont play fool^Waliasing games.
+ */
+ memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
+ sizeof(struct inet_skb_parm));
+ barrier();
+
+ TCP_SKB_CB(skb)->seq = ntohl(th->seq);
+ TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
+ skb->len - th->doff * 4);
+ TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
+ TCP_SKB_CB(skb)->tcp_flags = tcp_flags_ntohs(th);
+ TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
+ TCP_SKB_CB(skb)->sacked = 0;
+ TCP_SKB_CB(skb)->has_rxtstamp =
+ skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
+}
/*
* From tcp_input.c
@@ -1648,23 +2151,34 @@ csum_err:
int tcp_v4_rcv(struct sk_buff *skb)
{
- struct tcphdr *th;
- struct sock *sk;
+ struct net *net = dev_net_rcu(skb->dev);
+ enum skb_drop_reason drop_reason;
+ enum tcp_tw_status tw_status;
+ int sdif = inet_sdif(skb);
+ int dif = inet_iif(skb);
+ const struct iphdr *iph;
+ const struct tcphdr *th;
+ struct sock *sk = NULL;
+ bool refcounted;
int ret;
+ u32 isn;
+ drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
if (skb->pkt_type != PACKET_HOST)
goto discard_it;
/* Count it even if it's bad */
- TCP_INC_STATS_BH(TCP_MIB_INSEGS);
+ __TCP_INC_STATS(net, TCP_MIB_INSEGS);
if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
goto discard_it;
- th = skb->h.th;
+ th = (const struct tcphdr *)skb->data;
- if (th->doff < sizeof(struct tcphdr) / 4)
+ if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
+ drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
goto bad_packet;
+ }
if (!pskb_may_pull(skb, th->doff * 4))
goto discard_it;
@@ -1672,392 +2186,437 @@ int tcp_v4_rcv(struct sk_buff *skb)
* Packet length and doff are validated by header prediction,
* provided case of th->doff==0 is eliminated.
* So, we defer the checks. */
- if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
- tcp_v4_checksum_init(skb)))
- goto bad_packet;
-
- th = skb->h.th;
- TCP_SKB_CB(skb)->seq = ntohl(th->seq);
- TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
- skb->len - th->doff * 4);
- TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
- TCP_SKB_CB(skb)->when = 0;
- TCP_SKB_CB(skb)->flags = skb->nh.iph->tos;
- TCP_SKB_CB(skb)->sacked = 0;
- sk = __inet_lookup(&tcp_hashinfo, skb->nh.iph->saddr, th->source,
- skb->nh.iph->daddr, th->dest,
- inet_iif(skb));
+ if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
+ goto csum_error;
+ th = (const struct tcphdr *)skb->data;
+ iph = ip_hdr(skb);
+lookup:
+ sk = __inet_lookup_skb(skb, __tcp_hdrlen(th), th->source,
+ th->dest, sdif, &refcounted);
if (!sk)
goto no_tcp_socket;
-process:
if (sk->sk_state == TCP_TIME_WAIT)
goto do_time_wait;
- if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
+ if (sk->sk_state == TCP_NEW_SYN_RECV) {
+ struct request_sock *req = inet_reqsk(sk);
+ bool req_stolen = false;
+ struct sock *nsk;
+
+ sk = req->rsk_listener;
+ if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
+ drop_reason = SKB_DROP_REASON_XFRM_POLICY;
+ else
+ drop_reason = tcp_inbound_hash(sk, req, skb,
+ &iph->saddr, &iph->daddr,
+ AF_INET, dif, sdif);
+ if (unlikely(drop_reason)) {
+ sk_drops_skbadd(sk, skb);
+ reqsk_put(req);
+ goto discard_it;
+ }
+ if (tcp_checksum_complete(skb)) {
+ reqsk_put(req);
+ goto csum_error;
+ }
+ if (unlikely(sk->sk_state != TCP_LISTEN)) {
+ nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
+ if (!nsk) {
+ inet_csk_reqsk_queue_drop_and_put(sk, req);
+ goto lookup;
+ }
+ sk = nsk;
+ /* reuseport_migrate_sock() has already held one sk_refcnt
+ * before returning.
+ */
+ } else {
+ /* We own a reference on the listener, increase it again
+ * as we might lose it too soon.
+ */
+ sock_hold(sk);
+ }
+ refcounted = true;
+ nsk = NULL;
+ if (!tcp_filter(sk, skb, &drop_reason)) {
+ th = (const struct tcphdr *)skb->data;
+ iph = ip_hdr(skb);
+ tcp_v4_fill_cb(skb, iph, th);
+ nsk = tcp_check_req(sk, skb, req, false, &req_stolen,
+ &drop_reason);
+ }
+ if (!nsk) {
+ reqsk_put(req);
+ if (req_stolen) {
+ /* Another cpu got exclusive access to req
+ * and created a full blown socket.
+ * Try to feed this packet to this socket
+ * instead of discarding it.
+ */
+ tcp_v4_restore_cb(skb);
+ sock_put(sk);
+ goto lookup;
+ }
+ goto discard_and_relse;
+ }
+ nf_reset_ct(skb);
+ if (nsk == sk) {
+ reqsk_put(req);
+ tcp_v4_restore_cb(skb);
+ } else {
+ drop_reason = tcp_child_process(sk, nsk, skb);
+ if (drop_reason) {
+ enum sk_rst_reason rst_reason;
+
+ rst_reason = sk_rst_convert_drop_reason(drop_reason);
+ tcp_v4_send_reset(nsk, skb, rst_reason);
+ goto discard_and_relse;
+ }
+ sock_put(sk);
+ return 0;
+ }
+ }
+
+process:
+ if (static_branch_unlikely(&ip4_min_ttl)) {
+ /* min_ttl can be changed concurrently from do_ip_setsockopt() */
+ if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
+ __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
+ drop_reason = SKB_DROP_REASON_TCP_MINTTL;
+ goto discard_and_relse;
+ }
+ }
+
+ if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
+ drop_reason = SKB_DROP_REASON_XFRM_POLICY;
goto discard_and_relse;
- nf_reset(skb);
+ }
- if (sk_filter(sk, skb))
+ drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr,
+ AF_INET, dif, sdif);
+ if (drop_reason)
goto discard_and_relse;
+ nf_reset_ct(skb);
+
+ if (tcp_filter(sk, skb, &drop_reason))
+ goto discard_and_relse;
+
+ th = (const struct tcphdr *)skb->data;
+ iph = ip_hdr(skb);
+ tcp_v4_fill_cb(skb, iph, th);
+
skb->dev = NULL;
+ if (sk->sk_state == TCP_LISTEN) {
+ ret = tcp_v4_do_rcv(sk, skb);
+ goto put_and_return;
+ }
+
+ sk_incoming_cpu_update(sk);
+
bh_lock_sock_nested(sk);
+ tcp_segs_in(tcp_sk(sk), skb);
ret = 0;
if (!sock_owned_by_user(sk)) {
-#ifdef CONFIG_NET_DMA
- struct tcp_sock *tp = tcp_sk(sk);
- if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
- tp->ucopy.dma_chan = get_softnet_dma();
- if (tp->ucopy.dma_chan)
- ret = tcp_v4_do_rcv(sk, skb);
- else
-#endif
- {
- if (!tcp_prequeue(sk, skb))
- ret = tcp_v4_do_rcv(sk, skb);
- }
- } else
- sk_add_backlog(sk, skb);
+ ret = tcp_v4_do_rcv(sk, skb);
+ } else {
+ if (tcp_add_backlog(sk, skb, &drop_reason))
+ goto discard_and_relse;
+ }
bh_unlock_sock(sk);
- sock_put(sk);
+put_and_return:
+ if (refcounted)
+ sock_put(sk);
return ret;
no_tcp_socket:
+ drop_reason = SKB_DROP_REASON_NO_SOCKET;
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
goto discard_it;
- if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
+ tcp_v4_fill_cb(skb, iph, th);
+
+ if (tcp_checksum_complete(skb)) {
+csum_error:
+ drop_reason = SKB_DROP_REASON_TCP_CSUM;
+ trace_tcp_bad_csum(skb);
+ __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
bad_packet:
- TCP_INC_STATS_BH(TCP_MIB_INERRS);
+ __TCP_INC_STATS(net, TCP_MIB_INERRS);
} else {
- tcp_v4_send_reset(NULL, skb);
+ tcp_v4_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason));
}
discard_it:
+ SKB_DR_OR(drop_reason, NOT_SPECIFIED);
/* Discard frame. */
- kfree_skb(skb);
- return 0;
+ sk_skb_reason_drop(sk, skb, drop_reason);
+ return 0;
discard_and_relse:
- sock_put(sk);
+ sk_drops_skbadd(sk, skb);
+ if (refcounted)
+ sock_put(sk);
goto discard_it;
do_time_wait:
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+ drop_reason = SKB_DROP_REASON_XFRM_POLICY;
inet_twsk_put(inet_twsk(sk));
goto discard_it;
}
- if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
- TCP_INC_STATS_BH(TCP_MIB_INERRS);
+ tcp_v4_fill_cb(skb, iph, th);
+
+ if (tcp_checksum_complete(skb)) {
inet_twsk_put(inet_twsk(sk));
- goto discard_it;
+ goto csum_error;
}
- switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
+
+ tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn,
+ &drop_reason);
+ switch (tw_status) {
case TCP_TW_SYN: {
- struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo,
- skb->nh.iph->daddr,
- th->dest,
- inet_iif(skb));
+ struct sock *sk2 = inet_lookup_listener(net, skb, __tcp_hdrlen(th),
+ iph->saddr, th->source,
+ iph->daddr, th->dest,
+ inet_iif(skb),
+ sdif);
if (sk2) {
- inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
- inet_twsk_put(inet_twsk(sk));
+ inet_twsk_deschedule_put(inet_twsk(sk));
sk = sk2;
+ tcp_v4_restore_cb(skb);
+ refcounted = false;
+ __this_cpu_write(tcp_tw_isn, isn);
goto process;
}
- /* Fall through to ACK */
+
+ drop_reason = psp_twsk_rx_policy_check(inet_twsk(sk), skb);
+ if (drop_reason)
+ break;
}
+ /* to ACK */
+ fallthrough;
case TCP_TW_ACK:
- tcp_v4_timewait_ack(sk, skb);
+ case TCP_TW_ACK_OOW:
+ tcp_v4_timewait_ack(sk, skb, tw_status);
break;
case TCP_TW_RST:
- goto no_tcp_socket;
+ tcp_v4_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET);
+ inet_twsk_deschedule_put(inet_twsk(sk));
+ goto discard_it;
case TCP_TW_SUCCESS:;
}
goto discard_it;
}
-/* VJ's idea. Save last timestamp seen from this destination
- * and hold it at least for normal timewait interval to use for duplicate
- * segment detection in subsequent connections, before they enter synchronized
- * state.
- */
-
-int tcp_v4_remember_stamp(struct sock *sk)
-{
- struct inet_sock *inet = inet_sk(sk);
- struct tcp_sock *tp = tcp_sk(sk);
- struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
- struct inet_peer *peer = NULL;
- int release_it = 0;
-
- if (!rt || rt->rt_dst != inet->daddr) {
- peer = inet_getpeer(inet->daddr, 1);
- release_it = 1;
- } else {
- if (!rt->peer)
- rt_bind_peer(rt, 1);
- peer = rt->peer;
- }
-
- if (peer) {
- if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
- (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
- peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
- peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
- peer->tcp_ts = tp->rx_opt.ts_recent;
- }
- if (release_it)
- inet_putpeer(peer);
- return 1;
- }
-
- return 0;
-}
+static struct timewait_sock_ops tcp_timewait_sock_ops = {
+ .twsk_obj_size = sizeof(struct tcp_timewait_sock),
+};
-int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
+void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
{
- struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
+ struct dst_entry *dst = skb_dst(skb);
- if (peer) {
- const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
-
- if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
- (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
- peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
- peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
- peer->tcp_ts = tcptw->tw_ts_recent;
- }
- inet_putpeer(peer);
- return 1;
+ if (dst && dst_hold_safe(dst)) {
+ rcu_assign_pointer(sk->sk_rx_dst, dst);
+ sk->sk_rx_dst_ifindex = skb->skb_iif;
}
-
- return 0;
}
+EXPORT_IPV6_MOD(inet_sk_rx_dst_set);
-struct inet_connection_sock_af_ops ipv4_specific = {
+const struct inet_connection_sock_af_ops ipv4_specific = {
.queue_xmit = ip_queue_xmit,
.send_check = tcp_v4_send_check,
.rebuild_header = inet_sk_rebuild_header,
+ .sk_rx_dst_set = inet_sk_rx_dst_set,
.conn_request = tcp_v4_conn_request,
.syn_recv_sock = tcp_v4_syn_recv_sock,
- .remember_stamp = tcp_v4_remember_stamp,
.net_header_len = sizeof(struct iphdr),
.setsockopt = ip_setsockopt,
.getsockopt = ip_getsockopt,
- .addr2sockaddr = inet_csk_addr2sockaddr,
- .sockaddr_len = sizeof(struct sockaddr_in),
-#ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_ip_setsockopt,
- .compat_getsockopt = compat_ip_getsockopt,
-#endif
+ .mtu_reduced = tcp_v4_mtu_reduced,
};
+EXPORT_IPV6_MOD(ipv4_specific);
-struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
+#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
+static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
#ifdef CONFIG_TCP_MD5SIG
.md5_lookup = tcp_v4_md5_lookup,
- .calc_md5_hash = tcp_v4_calc_md5_hash,
- .md5_add = tcp_v4_md5_add_func,
+ .calc_md5_hash = tcp_v4_md5_hash_skb,
.md5_parse = tcp_v4_parse_md5_keys,
#endif
+#ifdef CONFIG_TCP_AO
+ .ao_lookup = tcp_v4_ao_lookup,
+ .calc_ao_hash = tcp_v4_ao_hash_skb,
+ .ao_parse = tcp_v4_parse_ao,
+ .ao_calc_key_sk = tcp_v4_ao_calc_key_sk,
+#endif
};
+static void tcp4_destruct_sock(struct sock *sk)
+{
+ tcp_md5_destruct_sock(sk);
+ tcp_ao_destroy_sock(sk, false);
+ inet_sock_destruct(sk);
+}
+#endif
+
/* NOTE: A lot of things set to zero explicitly by call to
* sk_alloc() so need not be done here.
*/
static int tcp_v4_init_sock(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
- struct tcp_sock *tp = tcp_sk(sk);
-
- skb_queue_head_init(&tp->out_of_order_queue);
- tcp_init_xmit_timers(sk);
- tcp_prequeue_init(tp);
-
- icsk->icsk_rto = TCP_TIMEOUT_INIT;
- tp->mdev = TCP_TIMEOUT_INIT;
- /* So many TCP implementations out there (incorrectly) count the
- * initial SYN frame in their delayed-ACK and congestion control
- * algorithms that we must have the following bandaid to talk
- * efficiently to them. -DaveM
- */
- tp->snd_cwnd = 2;
-
- /* See draft-stevens-tcpca-spec-01 for discussion of the
- * initialization of these values.
- */
- tp->snd_ssthresh = 0x7fffffff; /* Infinity */
- tp->snd_cwnd_clamp = ~0;
- tp->mss_cache = 536;
-
- tp->reordering = sysctl_tcp_reordering;
- icsk->icsk_ca_ops = &tcp_init_congestion_ops;
-
- sk->sk_state = TCP_CLOSE;
-
- sk->sk_write_space = sk_stream_write_space;
- sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
+ tcp_init_sock(sk);
icsk->icsk_af_ops = &ipv4_specific;
- icsk->icsk_sync_mss = tcp_sync_mss;
-#ifdef CONFIG_TCP_MD5SIG
- tp->af_specific = &tcp_sock_ipv4_specific;
+
+#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
+ tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
+ sk->sk_destruct = tcp4_destruct_sock;
#endif
- sk->sk_sndbuf = sysctl_tcp_wmem[1];
- sk->sk_rcvbuf = sysctl_tcp_rmem[1];
+ return 0;
+}
- atomic_inc(&tcp_sockets_allocated);
+static void tcp_release_user_frags(struct sock *sk)
+{
+#ifdef CONFIG_PAGE_POOL
+ unsigned long index;
+ void *netmem;
- return 0;
+ xa_for_each(&sk->sk_user_frags, index, netmem)
+ WARN_ON_ONCE(!napi_pp_put_page((__force netmem_ref)netmem));
+#endif
}
-int tcp_v4_destroy_sock(struct sock *sk)
+void tcp_v4_destroy_sock(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
+ tcp_release_user_frags(sk);
+
+ xa_destroy(&sk->sk_user_frags);
+
+ trace_tcp_destroy_sock(sk);
+
tcp_clear_xmit_timers(sk);
tcp_cleanup_congestion_control(sk);
- /* Cleanup up the write buffer. */
- sk_stream_writequeue_purge(sk);
-
- /* Cleans up our, hopefully empty, out_of_order_queue. */
- __skb_queue_purge(&tp->out_of_order_queue);
+ tcp_cleanup_ulp(sk);
-#ifdef CONFIG_TCP_MD5SIG
- /* Clean up the MD5 key list, if any */
- if (tp->md5sig_info) {
- tcp_v4_clear_md5_list(sk);
- kfree(tp->md5sig_info);
- tp->md5sig_info = NULL;
- }
-#endif
+ /* Cleanup up the write buffer. */
+ tcp_write_queue_purge(sk);
-#ifdef CONFIG_NET_DMA
- /* Cleans up our sk_async_wait_queue */
- __skb_queue_purge(&sk->sk_async_wait_queue);
-#endif
+ /* Check if we want to disable active TFO */
+ tcp_fastopen_active_disable_ofo_check(sk);
- /* Clean prequeue, it must be empty really */
- __skb_queue_purge(&tp->ucopy.prequeue);
+ /* Cleans up our, hopefully empty, out_of_order_queue. */
+ skb_rbtree_purge(&tp->out_of_order_queue);
/* Clean up a referenced TCP bind bucket. */
if (inet_csk(sk)->icsk_bind_hash)
- inet_put_port(&tcp_hashinfo, sk);
+ inet_put_port(sk);
- /*
- * If sendmsg cached page exists, toss it.
- */
- if (sk->sk_sndmsg_page) {
- __free_page(sk->sk_sndmsg_page);
- sk->sk_sndmsg_page = NULL;
- }
+ BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
- atomic_dec(&tcp_sockets_allocated);
+ /* If socket is aborted during connect operation */
+ tcp_free_fastopen_req(tp);
+ tcp_fastopen_destroy_cipher(sk);
+ tcp_saved_syn_free(tp);
- return 0;
+ sk_sockets_allocated_dec(sk);
}
-
-EXPORT_SYMBOL(tcp_v4_destroy_sock);
+EXPORT_IPV6_MOD(tcp_v4_destroy_sock);
#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */
-static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
+static unsigned short seq_file_family(const struct seq_file *seq);
+
+static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
{
- return hlist_empty(head) ? NULL :
- list_entry(head->first, struct inet_timewait_sock, tw_node);
+ unsigned short family = seq_file_family(seq);
+
+ /* AF_UNSPEC is used as a match all */
+ return ((family == AF_UNSPEC || family == sk->sk_family) &&
+ net_eq(sock_net(sk), seq_file_net(seq)));
}
-static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
+/* Find a non empty bucket (starting from st->bucket)
+ * and return the first sk from it.
+ */
+static void *listening_get_first(struct seq_file *seq)
{
- return tw->tw_node.next ?
- hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
+ struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
+ struct tcp_iter_state *st = seq->private;
+
+ st->offset = 0;
+ for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
+ struct inet_listen_hashbucket *ilb2;
+ struct hlist_nulls_node *node;
+ struct sock *sk;
+
+ ilb2 = &hinfo->lhash2[st->bucket];
+ if (hlist_nulls_empty(&ilb2->nulls_head))
+ continue;
+
+ spin_lock(&ilb2->lock);
+ sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
+ if (seq_sk_match(seq, sk))
+ return sk;
+ }
+ spin_unlock(&ilb2->lock);
+ }
+
+ return NULL;
}
+/* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
+ * If "cur" is the last one in the st->bucket,
+ * call listening_get_first() to return the first sk of the next
+ * non empty bucket.
+ */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
- struct inet_connection_sock *icsk;
- struct hlist_node *node;
+ struct tcp_iter_state *st = seq->private;
+ struct inet_listen_hashbucket *ilb2;
+ struct hlist_nulls_node *node;
+ struct inet_hashinfo *hinfo;
struct sock *sk = cur;
- struct tcp_iter_state* st = seq->private;
-
- if (!sk) {
- st->bucket = 0;
- sk = sk_head(&tcp_hashinfo.listening_hash[0]);
- goto get_sk;
- }
++st->num;
+ ++st->offset;
- if (st->state == TCP_SEQ_STATE_OPENREQ) {
- struct request_sock *req = cur;
-
- icsk = inet_csk(st->syn_wait_sk);
- req = req->dl_next;
- while (1) {
- while (req) {
- if (req->rsk_ops->family == st->family) {
- cur = req;
- goto out;
- }
- req = req->dl_next;
- }
- if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
- break;
-get_req:
- req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
- }
- sk = sk_next(st->syn_wait_sk);
- st->state = TCP_SEQ_STATE_LISTENING;
- read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
- } else {
- icsk = inet_csk(sk);
- read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
- if (reqsk_queue_len(&icsk->icsk_accept_queue))
- goto start_req;
- read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
- sk = sk_next(sk);
- }
-get_sk:
- sk_for_each_from(sk, node) {
- if (sk->sk_family == st->family) {
- cur = sk;
- goto out;
- }
- icsk = inet_csk(sk);
- read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
- if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
-start_req:
- st->uid = sock_i_uid(sk);
- st->syn_wait_sk = sk;
- st->state = TCP_SEQ_STATE_OPENREQ;
- st->sbucket = 0;
- goto get_req;
- }
- read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+ sk = sk_nulls_next(sk);
+ sk_nulls_for_each_from(sk, node) {
+ if (seq_sk_match(seq, sk))
+ return sk;
}
- if (++st->bucket < INET_LHTABLE_SIZE) {
- sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
- goto get_sk;
- }
- cur = NULL;
-out:
- return cur;
+
+ hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
+ ilb2 = &hinfo->lhash2[st->bucket];
+ spin_unlock(&ilb2->lock);
+ ++st->bucket;
+ return listening_get_first(seq);
}
static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
- void *rc = listening_get_next(seq, NULL);
+ struct tcp_iter_state *st = seq->private;
+ void *rc;
+
+ st->bucket = 0;
+ st->offset = 0;
+ rc = listening_get_first(seq);
while (rc && *pos) {
rc = listening_get_next(seq, rc);
@@ -2066,96 +2625,73 @@ static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
return rc;
}
+static inline bool empty_bucket(struct inet_hashinfo *hinfo,
+ const struct tcp_iter_state *st)
+{
+ return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
+}
+
+/*
+ * Get first established socket starting from bucket given in st->bucket.
+ * If st->bucket is zero, the very first socket in the hash is returned.
+ */
static void *established_get_first(struct seq_file *seq)
{
- struct tcp_iter_state* st = seq->private;
- void *rc = NULL;
+ struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
+ struct tcp_iter_state *st = seq->private;
- for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
+ st->offset = 0;
+ for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
struct sock *sk;
- struct hlist_node *node;
- struct inet_timewait_sock *tw;
+ struct hlist_nulls_node *node;
+ spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);
- /* We can reschedule _before_ having picked the target: */
- cond_resched_softirq();
+ cond_resched();
- read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
- sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
- if (sk->sk_family != st->family) {
- continue;
- }
- rc = sk;
- goto out;
- }
- st->state = TCP_SEQ_STATE_TIME_WAIT;
- inet_twsk_for_each(tw, node,
- &tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain) {
- if (tw->tw_family != st->family) {
- continue;
- }
- rc = tw;
- goto out;
+ /* Lockless fast path for the common case of empty buckets */
+ if (empty_bucket(hinfo, st))
+ continue;
+
+ spin_lock_bh(lock);
+ sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
+ if (seq_sk_match(seq, sk))
+ return sk;
}
- read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
- st->state = TCP_SEQ_STATE_ESTABLISHED;
+ spin_unlock_bh(lock);
}
-out:
- return rc;
+
+ return NULL;
}
static void *established_get_next(struct seq_file *seq, void *cur)
{
+ struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
+ struct tcp_iter_state *st = seq->private;
+ struct hlist_nulls_node *node;
struct sock *sk = cur;
- struct inet_timewait_sock *tw;
- struct hlist_node *node;
- struct tcp_iter_state* st = seq->private;
++st->num;
+ ++st->offset;
- if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
- tw = cur;
- tw = tw_next(tw);
-get_tw:
- while (tw && tw->tw_family != st->family) {
- tw = tw_next(tw);
- }
- if (tw) {
- cur = tw;
- goto out;
- }
- read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
- st->state = TCP_SEQ_STATE_ESTABLISHED;
+ sk = sk_nulls_next(sk);
- /* We can reschedule between buckets: */
- cond_resched_softirq();
-
- if (++st->bucket < tcp_hashinfo.ehash_size) {
- read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
- sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
- } else {
- cur = NULL;
- goto out;
- }
- } else
- sk = sk_next(sk);
-
- sk_for_each_from(sk, node) {
- if (sk->sk_family == st->family)
- goto found;
+ sk_nulls_for_each_from(sk, node) {
+ if (seq_sk_match(seq, sk))
+ return sk;
}
- st->state = TCP_SEQ_STATE_TIME_WAIT;
- tw = tw_head(&tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain);
- goto get_tw;
-found:
- cur = sk;
-out:
- return cur;
+ spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
+ ++st->bucket;
+ return established_get_first(seq);
}
static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
- void *rc = established_get_first(seq);
+ struct tcp_iter_state *st = seq->private;
+ void *rc;
+
+ st->bucket = 0;
+ rc = established_get_first(seq);
while (rc && pos) {
rc = established_get_next(seq, rc);
@@ -2167,15 +2703,12 @@ static void *established_get_idx(struct seq_file *seq, loff_t pos)
static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
void *rc;
- struct tcp_iter_state* st = seq->private;
+ struct tcp_iter_state *st = seq->private;
- inet_listen_lock(&tcp_hashinfo);
st->state = TCP_SEQ_STATE_LISTENING;
rc = listening_get_idx(seq, &pos);
if (!rc) {
- inet_listen_unlock(&tcp_hashinfo);
- local_bh_disable();
st->state = TCP_SEQ_STATE_ESTABLISHED;
rc = established_get_idx(seq, pos);
}
@@ -2183,279 +2716,727 @@ static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
return rc;
}
-static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
+static void *tcp_seek_last_pos(struct seq_file *seq)
+{
+ struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
+ struct tcp_iter_state *st = seq->private;
+ int bucket = st->bucket;
+ int offset = st->offset;
+ int orig_num = st->num;
+ void *rc = NULL;
+
+ switch (st->state) {
+ case TCP_SEQ_STATE_LISTENING:
+ if (st->bucket > hinfo->lhash2_mask)
+ break;
+ rc = listening_get_first(seq);
+ while (offset-- && rc && bucket == st->bucket)
+ rc = listening_get_next(seq, rc);
+ if (rc)
+ break;
+ st->bucket = 0;
+ st->state = TCP_SEQ_STATE_ESTABLISHED;
+ fallthrough;
+ case TCP_SEQ_STATE_ESTABLISHED:
+ if (st->bucket > hinfo->ehash_mask)
+ break;
+ rc = established_get_first(seq);
+ while (offset-- && rc && bucket == st->bucket)
+ rc = established_get_next(seq, rc);
+ }
+
+ st->num = orig_num;
+
+ return rc;
+}
+
+void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
- struct tcp_iter_state* st = seq->private;
+ struct tcp_iter_state *st = seq->private;
+ void *rc;
+
+ if (*pos && *pos == st->last_pos) {
+ rc = tcp_seek_last_pos(seq);
+ if (rc)
+ goto out;
+ }
+
st->state = TCP_SEQ_STATE_LISTENING;
st->num = 0;
- return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
+ st->bucket = 0;
+ st->offset = 0;
+ rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
+
+out:
+ st->last_pos = *pos;
+ return rc;
}
+EXPORT_IPV6_MOD(tcp_seq_start);
-static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
+ struct tcp_iter_state *st = seq->private;
void *rc = NULL;
- struct tcp_iter_state* st;
if (v == SEQ_START_TOKEN) {
rc = tcp_get_idx(seq, 0);
goto out;
}
- st = seq->private;
switch (st->state) {
- case TCP_SEQ_STATE_OPENREQ:
case TCP_SEQ_STATE_LISTENING:
rc = listening_get_next(seq, v);
if (!rc) {
- inet_listen_unlock(&tcp_hashinfo);
- local_bh_disable();
st->state = TCP_SEQ_STATE_ESTABLISHED;
+ st->bucket = 0;
+ st->offset = 0;
rc = established_get_first(seq);
}
break;
case TCP_SEQ_STATE_ESTABLISHED:
- case TCP_SEQ_STATE_TIME_WAIT:
rc = established_get_next(seq, v);
break;
}
out:
++*pos;
+ st->last_pos = *pos;
return rc;
}
+EXPORT_IPV6_MOD(tcp_seq_next);
-static void tcp_seq_stop(struct seq_file *seq, void *v)
+void tcp_seq_stop(struct seq_file *seq, void *v)
{
- struct tcp_iter_state* st = seq->private;
+ struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
+ struct tcp_iter_state *st = seq->private;
switch (st->state) {
- case TCP_SEQ_STATE_OPENREQ:
- if (v) {
- struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
- read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
- }
case TCP_SEQ_STATE_LISTENING:
if (v != SEQ_START_TOKEN)
- inet_listen_unlock(&tcp_hashinfo);
+ spin_unlock(&hinfo->lhash2[st->bucket].lock);
break;
- case TCP_SEQ_STATE_TIME_WAIT:
case TCP_SEQ_STATE_ESTABLISHED:
if (v)
- read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
- local_bh_enable();
+ spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
break;
}
}
+EXPORT_IPV6_MOD(tcp_seq_stop);
-static int tcp_seq_open(struct inode *inode, struct file *file)
-{
- struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
- struct seq_file *seq;
- struct tcp_iter_state *s;
- int rc;
-
- if (unlikely(afinfo == NULL))
- return -EINVAL;
-
- s = kzalloc(sizeof(*s), GFP_KERNEL);
- if (!s)
- return -ENOMEM;
- s->family = afinfo->family;
- s->seq_ops.start = tcp_seq_start;
- s->seq_ops.next = tcp_seq_next;
- s->seq_ops.show = afinfo->seq_show;
- s->seq_ops.stop = tcp_seq_stop;
-
- rc = seq_open(file, &s->seq_ops);
- if (rc)
- goto out_kfree;
- seq = file->private_data;
- seq->private = s;
-out:
- return rc;
-out_kfree:
- kfree(s);
- goto out;
-}
-
-int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
-{
- int rc = 0;
- struct proc_dir_entry *p;
-
- if (!afinfo)
- return -EINVAL;
- afinfo->seq_fops->owner = afinfo->owner;
- afinfo->seq_fops->open = tcp_seq_open;
- afinfo->seq_fops->read = seq_read;
- afinfo->seq_fops->llseek = seq_lseek;
- afinfo->seq_fops->release = seq_release_private;
-
- p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
- if (p)
- p->data = afinfo;
- else
- rc = -ENOMEM;
- return rc;
-}
-
-void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
-{
- if (!afinfo)
- return;
- proc_net_remove(afinfo->name);
- memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
-}
-
-static void get_openreq4(struct sock *sk, struct request_sock *req,
- char *tmpbuf, int i, int uid)
+static void get_openreq4(const struct request_sock *req,
+ struct seq_file *f, int i)
{
const struct inet_request_sock *ireq = inet_rsk(req);
- int ttd = req->expires - jiffies;
+ long delta = req->rsk_timer.expires - jiffies;
- sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
- " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
+ seq_printf(f, "%4d: %08X:%04X %08X:%04X"
+ " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
i,
- ireq->loc_addr,
- ntohs(inet_sk(sk)->sport),
- ireq->rmt_addr,
- ntohs(ireq->rmt_port),
+ ireq->ir_loc_addr,
+ ireq->ir_num,
+ ireq->ir_rmt_addr,
+ ntohs(ireq->ir_rmt_port),
TCP_SYN_RECV,
0, 0, /* could print option size, but that is af dependent. */
1, /* timers active (only the expire timer) */
- jiffies_to_clock_t(ttd),
- req->retrans,
- uid,
+ jiffies_delta_to_clock_t(delta),
+ req->num_timeout,
+ from_kuid_munged(seq_user_ns(f),
+ sk_uid(req->rsk_listener)),
0, /* non standard timer */
0, /* open_requests have no inode */
- atomic_read(&sk->sk_refcnt),
+ 0,
req);
}
-static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
+static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
{
int timer_active;
unsigned long timer_expires;
- struct tcp_sock *tp = tcp_sk(sp);
- const struct inet_connection_sock *icsk = inet_csk(sp);
- struct inet_sock *inet = inet_sk(sp);
- __be32 dest = inet->daddr;
- __be32 src = inet->rcv_saddr;
- __u16 destp = ntohs(inet->dport);
- __u16 srcp = ntohs(inet->sport);
-
- if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
+ const struct tcp_sock *tp = tcp_sk(sk);
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ const struct inet_sock *inet = inet_sk(sk);
+ const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
+ __be32 dest = inet->inet_daddr;
+ __be32 src = inet->inet_rcv_saddr;
+ __u16 destp = ntohs(inet->inet_dport);
+ __u16 srcp = ntohs(inet->inet_sport);
+ u8 icsk_pending;
+ int rx_queue;
+ int state;
+
+ icsk_pending = smp_load_acquire(&icsk->icsk_pending);
+ if (icsk_pending == ICSK_TIME_RETRANS ||
+ icsk_pending == ICSK_TIME_REO_TIMEOUT ||
+ icsk_pending == ICSK_TIME_LOSS_PROBE) {
timer_active = 1;
- timer_expires = icsk->icsk_timeout;
- } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
+ timer_expires = tcp_timeout_expires(sk);
+ } else if (icsk_pending == ICSK_TIME_PROBE0) {
timer_active = 4;
- timer_expires = icsk->icsk_timeout;
- } else if (timer_pending(&sp->sk_timer)) {
+ timer_expires = tcp_timeout_expires(sk);
+ } else if (timer_pending(&icsk->icsk_keepalive_timer)) {
timer_active = 2;
- timer_expires = sp->sk_timer.expires;
+ timer_expires = icsk->icsk_keepalive_timer.expires;
} else {
timer_active = 0;
timer_expires = jiffies;
}
- sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
- "%08X %5d %8d %lu %d %p %u %u %u %u %d",
- i, src, srcp, dest, destp, sp->sk_state,
- tp->write_seq - tp->snd_una,
- sp->sk_state == TCP_LISTEN ? sp->sk_ack_backlog :
- (tp->rcv_nxt - tp->copied_seq),
+ state = inet_sk_state_load(sk);
+ if (state == TCP_LISTEN)
+ rx_queue = READ_ONCE(sk->sk_ack_backlog);
+ else
+ /* Because we don't lock the socket,
+ * we might find a transient negative value.
+ */
+ rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
+ READ_ONCE(tp->copied_seq), 0);
+
+ seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
+ "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
+ i, src, srcp, dest, destp, state,
+ READ_ONCE(tp->write_seq) - tp->snd_una,
+ rx_queue,
timer_active,
- jiffies_to_clock_t(timer_expires - jiffies),
- icsk->icsk_retransmits,
- sock_i_uid(sp),
- icsk->icsk_probes_out,
- sock_i_ino(sp),
- atomic_read(&sp->sk_refcnt), sp,
- icsk->icsk_rto,
- icsk->icsk_ack.ato,
- (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
- tp->snd_cwnd,
- tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
-}
-
-static void get_timewait4_sock(struct inet_timewait_sock *tw,
- char *tmpbuf, int i)
+ jiffies_delta_to_clock_t(timer_expires - jiffies),
+ READ_ONCE(icsk->icsk_retransmits),
+ from_kuid_munged(seq_user_ns(f), sk_uid(sk)),
+ READ_ONCE(icsk->icsk_probes_out),
+ sock_i_ino(sk),
+ refcount_read(&sk->sk_refcnt), sk,
+ jiffies_to_clock_t(icsk->icsk_rto),
+ jiffies_to_clock_t(icsk->icsk_ack.ato),
+ (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
+ tcp_snd_cwnd(tp),
+ state == TCP_LISTEN ?
+ fastopenq->max_qlen :
+ (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
+}
+
+static void get_timewait4_sock(const struct inet_timewait_sock *tw,
+ struct seq_file *f, int i)
{
+ long delta = tw->tw_timer.expires - jiffies;
__be32 dest, src;
__u16 destp, srcp;
- int ttd = tw->tw_ttd - jiffies;
-
- if (ttd < 0)
- ttd = 0;
dest = tw->tw_daddr;
src = tw->tw_rcv_saddr;
destp = ntohs(tw->tw_dport);
srcp = ntohs(tw->tw_sport);
- sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
- " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
- i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
- 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
- atomic_read(&tw->tw_refcnt), tw);
+ seq_printf(f, "%4d: %08X:%04X %08X:%04X"
+ " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
+ i, src, srcp, dest, destp, READ_ONCE(tw->tw_substate), 0, 0,
+ 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
+ refcount_read(&tw->tw_refcnt), tw);
}
#define TMPSZ 150
static int tcp4_seq_show(struct seq_file *seq, void *v)
{
- struct tcp_iter_state* st;
- char tmpbuf[TMPSZ + 1];
+ struct tcp_iter_state *st;
+ struct sock *sk = v;
+ seq_setwidth(seq, TMPSZ - 1);
if (v == SEQ_START_TOKEN) {
- seq_printf(seq, "%-*s\n", TMPSZ - 1,
- " sl local_address rem_address st tx_queue "
+ seq_puts(seq, " sl local_address rem_address st tx_queue "
"rx_queue tr tm->when retrnsmt uid timeout "
"inode");
goto out;
}
st = seq->private;
+ if (sk->sk_state == TCP_TIME_WAIT)
+ get_timewait4_sock(v, seq, st->num);
+ else if (sk->sk_state == TCP_NEW_SYN_RECV)
+ get_openreq4(v, seq, st->num);
+ else
+ get_tcp4_sock(v, seq, st->num);
+out:
+ seq_pad(seq, '\n');
+ return 0;
+}
+
+#ifdef CONFIG_BPF_SYSCALL
+union bpf_tcp_iter_batch_item {
+ struct sock *sk;
+ __u64 cookie;
+};
+
+struct bpf_tcp_iter_state {
+ struct tcp_iter_state state;
+ unsigned int cur_sk;
+ unsigned int end_sk;
+ unsigned int max_sk;
+ union bpf_tcp_iter_batch_item *batch;
+};
+
+struct bpf_iter__tcp {
+ __bpf_md_ptr(struct bpf_iter_meta *, meta);
+ __bpf_md_ptr(struct sock_common *, sk_common);
+ uid_t uid __aligned(8);
+};
+
+static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
+ struct sock_common *sk_common, uid_t uid)
+{
+ struct bpf_iter__tcp ctx;
+
+ meta->seq_num--; /* skip SEQ_START_TOKEN */
+ ctx.meta = meta;
+ ctx.sk_common = sk_common;
+ ctx.uid = uid;
+ return bpf_iter_run_prog(prog, &ctx);
+}
+
+static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
+{
+ union bpf_tcp_iter_batch_item *item;
+ unsigned int cur_sk = iter->cur_sk;
+ __u64 cookie;
+
+ /* Remember the cookies of the sockets we haven't seen yet, so we can
+ * pick up where we left off next time around.
+ */
+ while (cur_sk < iter->end_sk) {
+ item = &iter->batch[cur_sk++];
+ cookie = sock_gen_cookie(item->sk);
+ sock_gen_put(item->sk);
+ item->cookie = cookie;
+ }
+}
+
+static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
+ unsigned int new_batch_sz, gfp_t flags)
+{
+ union bpf_tcp_iter_batch_item *new_batch;
+
+ new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
+ flags | __GFP_NOWARN);
+ if (!new_batch)
+ return -ENOMEM;
+
+ memcpy(new_batch, iter->batch, sizeof(*iter->batch) * iter->end_sk);
+ kvfree(iter->batch);
+ iter->batch = new_batch;
+ iter->max_sk = new_batch_sz;
+
+ return 0;
+}
+
+static struct sock *bpf_iter_tcp_resume_bucket(struct sock *first_sk,
+ union bpf_tcp_iter_batch_item *cookies,
+ int n_cookies)
+{
+ struct hlist_nulls_node *node;
+ struct sock *sk;
+ int i;
+
+ for (i = 0; i < n_cookies; i++) {
+ sk = first_sk;
+ sk_nulls_for_each_from(sk, node)
+ if (cookies[i].cookie == atomic64_read(&sk->sk_cookie))
+ return sk;
+ }
+
+ return NULL;
+}
+
+static struct sock *bpf_iter_tcp_resume_listening(struct seq_file *seq)
+{
+ struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
+ struct bpf_tcp_iter_state *iter = seq->private;
+ struct tcp_iter_state *st = &iter->state;
+ unsigned int find_cookie = iter->cur_sk;
+ unsigned int end_cookie = iter->end_sk;
+ int resume_bucket = st->bucket;
+ struct sock *sk;
+
+ if (end_cookie && find_cookie == end_cookie)
+ ++st->bucket;
+
+ sk = listening_get_first(seq);
+ iter->cur_sk = 0;
+ iter->end_sk = 0;
+
+ if (sk && st->bucket == resume_bucket && end_cookie) {
+ sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie],
+ end_cookie - find_cookie);
+ if (!sk) {
+ spin_unlock(&hinfo->lhash2[st->bucket].lock);
+ ++st->bucket;
+ sk = listening_get_first(seq);
+ }
+ }
+
+ return sk;
+}
+
+static struct sock *bpf_iter_tcp_resume_established(struct seq_file *seq)
+{
+ struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
+ struct bpf_tcp_iter_state *iter = seq->private;
+ struct tcp_iter_state *st = &iter->state;
+ unsigned int find_cookie = iter->cur_sk;
+ unsigned int end_cookie = iter->end_sk;
+ int resume_bucket = st->bucket;
+ struct sock *sk;
+
+ if (end_cookie && find_cookie == end_cookie)
+ ++st->bucket;
+
+ sk = established_get_first(seq);
+ iter->cur_sk = 0;
+ iter->end_sk = 0;
+
+ if (sk && st->bucket == resume_bucket && end_cookie) {
+ sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie],
+ end_cookie - find_cookie);
+ if (!sk) {
+ spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
+ ++st->bucket;
+ sk = established_get_first(seq);
+ }
+ }
+
+ return sk;
+}
+
+static struct sock *bpf_iter_tcp_resume(struct seq_file *seq)
+{
+ struct bpf_tcp_iter_state *iter = seq->private;
+ struct tcp_iter_state *st = &iter->state;
+ struct sock *sk = NULL;
+
switch (st->state) {
case TCP_SEQ_STATE_LISTENING:
+ sk = bpf_iter_tcp_resume_listening(seq);
+ if (sk)
+ break;
+ st->bucket = 0;
+ st->state = TCP_SEQ_STATE_ESTABLISHED;
+ fallthrough;
case TCP_SEQ_STATE_ESTABLISHED:
- get_tcp4_sock(v, tmpbuf, st->num);
- break;
- case TCP_SEQ_STATE_OPENREQ:
- get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
- break;
- case TCP_SEQ_STATE_TIME_WAIT:
- get_timewait4_sock(v, tmpbuf, st->num);
+ sk = bpf_iter_tcp_resume_established(seq);
break;
}
- seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
-out:
- return 0;
+
+ return sk;
+}
+
+static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
+ struct sock **start_sk)
+{
+ struct bpf_tcp_iter_state *iter = seq->private;
+ struct hlist_nulls_node *node;
+ unsigned int expected = 1;
+ struct sock *sk;
+
+ sock_hold(*start_sk);
+ iter->batch[iter->end_sk++].sk = *start_sk;
+
+ sk = sk_nulls_next(*start_sk);
+ *start_sk = NULL;
+ sk_nulls_for_each_from(sk, node) {
+ if (seq_sk_match(seq, sk)) {
+ if (iter->end_sk < iter->max_sk) {
+ sock_hold(sk);
+ iter->batch[iter->end_sk++].sk = sk;
+ } else if (!*start_sk) {
+ /* Remember where we left off. */
+ *start_sk = sk;
+ }
+ expected++;
+ }
+ }
+
+ return expected;
+}
+
+static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
+ struct sock **start_sk)
+{
+ struct bpf_tcp_iter_state *iter = seq->private;
+ struct hlist_nulls_node *node;
+ unsigned int expected = 1;
+ struct sock *sk;
+
+ sock_hold(*start_sk);
+ iter->batch[iter->end_sk++].sk = *start_sk;
+
+ sk = sk_nulls_next(*start_sk);
+ *start_sk = NULL;
+ sk_nulls_for_each_from(sk, node) {
+ if (seq_sk_match(seq, sk)) {
+ if (iter->end_sk < iter->max_sk) {
+ sock_hold(sk);
+ iter->batch[iter->end_sk++].sk = sk;
+ } else if (!*start_sk) {
+ /* Remember where we left off. */
+ *start_sk = sk;
+ }
+ expected++;
+ }
+ }
+
+ return expected;
+}
+
+static unsigned int bpf_iter_fill_batch(struct seq_file *seq,
+ struct sock **start_sk)
+{
+ struct bpf_tcp_iter_state *iter = seq->private;
+ struct tcp_iter_state *st = &iter->state;
+
+ if (st->state == TCP_SEQ_STATE_LISTENING)
+ return bpf_iter_tcp_listening_batch(seq, start_sk);
+ else
+ return bpf_iter_tcp_established_batch(seq, start_sk);
+}
+
+static void bpf_iter_tcp_unlock_bucket(struct seq_file *seq)
+{
+ struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
+ struct bpf_tcp_iter_state *iter = seq->private;
+ struct tcp_iter_state *st = &iter->state;
+
+ if (st->state == TCP_SEQ_STATE_LISTENING)
+ spin_unlock(&hinfo->lhash2[st->bucket].lock);
+ else
+ spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
+}
+
+static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
+{
+ struct bpf_tcp_iter_state *iter = seq->private;
+ unsigned int expected;
+ struct sock *sk;
+ int err;
+
+ sk = bpf_iter_tcp_resume(seq);
+ if (!sk)
+ return NULL; /* Done */
+
+ expected = bpf_iter_fill_batch(seq, &sk);
+ if (likely(iter->end_sk == expected))
+ goto done;
+
+ /* Batch size was too small. */
+ bpf_iter_tcp_unlock_bucket(seq);
+ bpf_iter_tcp_put_batch(iter);
+ err = bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2,
+ GFP_USER);
+ if (err)
+ return ERR_PTR(err);
+
+ sk = bpf_iter_tcp_resume(seq);
+ if (!sk)
+ return NULL; /* Done */
+
+ expected = bpf_iter_fill_batch(seq, &sk);
+ if (likely(iter->end_sk == expected))
+ goto done;
+
+ /* Batch size was still too small. Hold onto the lock while we try
+ * again with a larger batch to make sure the current bucket's size
+ * does not change in the meantime.
+ */
+ err = bpf_iter_tcp_realloc_batch(iter, expected, GFP_NOWAIT);
+ if (err) {
+ bpf_iter_tcp_unlock_bucket(seq);
+ return ERR_PTR(err);
+ }
+
+ expected = bpf_iter_fill_batch(seq, &sk);
+ WARN_ON_ONCE(iter->end_sk != expected);
+done:
+ bpf_iter_tcp_unlock_bucket(seq);
+ return iter->batch[0].sk;
+}
+
+static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ /* bpf iter does not support lseek, so it always
+ * continue from where it was stop()-ped.
+ */
+ if (*pos)
+ return bpf_iter_tcp_batch(seq);
+
+ return SEQ_START_TOKEN;
+}
+
+static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct bpf_tcp_iter_state *iter = seq->private;
+ struct tcp_iter_state *st = &iter->state;
+ struct sock *sk;
+
+ /* Whenever seq_next() is called, the iter->cur_sk is
+ * done with seq_show(), so advance to the next sk in
+ * the batch.
+ */
+ if (iter->cur_sk < iter->end_sk) {
+ /* Keeping st->num consistent in tcp_iter_state.
+ * bpf_iter_tcp does not use st->num.
+ * meta.seq_num is used instead.
+ */
+ st->num++;
+ sock_gen_put(iter->batch[iter->cur_sk++].sk);
+ }
+
+ if (iter->cur_sk < iter->end_sk)
+ sk = iter->batch[iter->cur_sk].sk;
+ else
+ sk = bpf_iter_tcp_batch(seq);
+
+ ++*pos;
+ /* Keeping st->last_pos consistent in tcp_iter_state.
+ * bpf iter does not do lseek, so st->last_pos always equals to *pos.
+ */
+ st->last_pos = *pos;
+ return sk;
+}
+
+static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
+{
+ struct bpf_iter_meta meta;
+ struct bpf_prog *prog;
+ struct sock *sk = v;
+ uid_t uid;
+ int ret;
+
+ if (v == SEQ_START_TOKEN)
+ return 0;
+
+ if (sk_fullsock(sk))
+ lock_sock(sk);
+
+ if (unlikely(sk_unhashed(sk))) {
+ ret = SEQ_SKIP;
+ goto unlock;
+ }
+
+ if (sk->sk_state == TCP_TIME_WAIT) {
+ uid = 0;
+ } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
+ const struct request_sock *req = v;
+
+ uid = from_kuid_munged(seq_user_ns(seq),
+ sk_uid(req->rsk_listener));
+ } else {
+ uid = from_kuid_munged(seq_user_ns(seq), sk_uid(sk));
+ }
+
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, false);
+ ret = tcp_prog_seq_show(prog, &meta, v, uid);
+
+unlock:
+ if (sk_fullsock(sk))
+ release_sock(sk);
+ return ret;
+
}
-static struct file_operations tcp4_seq_fops;
+static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
+{
+ struct bpf_tcp_iter_state *iter = seq->private;
+ struct bpf_iter_meta meta;
+ struct bpf_prog *prog;
+
+ if (!v) {
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, true);
+ if (prog)
+ (void)tcp_prog_seq_show(prog, &meta, v, 0);
+ }
+
+ if (iter->cur_sk < iter->end_sk)
+ bpf_iter_tcp_put_batch(iter);
+}
+
+static const struct seq_operations bpf_iter_tcp_seq_ops = {
+ .show = bpf_iter_tcp_seq_show,
+ .start = bpf_iter_tcp_seq_start,
+ .next = bpf_iter_tcp_seq_next,
+ .stop = bpf_iter_tcp_seq_stop,
+};
+#endif
+static unsigned short seq_file_family(const struct seq_file *seq)
+{
+ const struct tcp_seq_afinfo *afinfo;
+
+#ifdef CONFIG_BPF_SYSCALL
+ /* Iterated from bpf_iter. Let the bpf prog to filter instead. */
+ if (seq->op == &bpf_iter_tcp_seq_ops)
+ return AF_UNSPEC;
+#endif
+
+ /* Iterated from proc fs */
+ afinfo = pde_data(file_inode(seq->file));
+ return afinfo->family;
+}
+
+static const struct seq_operations tcp4_seq_ops = {
+ .show = tcp4_seq_show,
+ .start = tcp_seq_start,
+ .next = tcp_seq_next,
+ .stop = tcp_seq_stop,
+};
+
static struct tcp_seq_afinfo tcp4_seq_afinfo = {
- .owner = THIS_MODULE,
- .name = "tcp",
.family = AF_INET,
- .seq_show = tcp4_seq_show,
- .seq_fops = &tcp4_seq_fops,
+};
+
+static int __net_init tcp4_proc_init_net(struct net *net)
+{
+ if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
+ sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
+ return -ENOMEM;
+ return 0;
+}
+
+static void __net_exit tcp4_proc_exit_net(struct net *net)
+{
+ remove_proc_entry("tcp", net->proc_net);
+}
+
+static struct pernet_operations tcp4_net_ops = {
+ .init = tcp4_proc_init_net,
+ .exit = tcp4_proc_exit_net,
};
int __init tcp4_proc_init(void)
{
- return tcp_proc_register(&tcp4_seq_afinfo);
+ return register_pernet_subsys(&tcp4_net_ops);
}
void tcp4_proc_exit(void)
{
- tcp_proc_unregister(&tcp4_seq_afinfo);
+ unregister_pernet_subsys(&tcp4_net_ops);
}
#endif /* CONFIG_PROC_FS */
+/* @wake is one when sk_stream_write_space() calls us.
+ * This sends EPOLLOUT only if notsent_bytes is half the limit.
+ * This mimics the strategy used in sock_def_write_space().
+ */
+bool tcp_stream_memory_free(const struct sock *sk, int wake)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ u32 notsent_bytes = READ_ONCE(tp->write_seq) -
+ READ_ONCE(tp->snd_nxt);
+
+ return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
+}
+EXPORT_SYMBOL(tcp_stream_memory_free);
+
struct proto tcp_prot = {
.name = "TCP",
.owner = THIS_MODULE,
.close = tcp_close,
+ .pre_connect = tcp_v4_pre_connect,
.connect = tcp_v4_connect,
.disconnect = tcp_disconnect,
.accept = inet_csk_accept,
@@ -2465,52 +3446,318 @@ struct proto tcp_prot = {
.shutdown = tcp_shutdown,
.setsockopt = tcp_setsockopt,
.getsockopt = tcp_getsockopt,
- .sendmsg = tcp_sendmsg,
+ .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt,
+ .keepalive = tcp_set_keepalive,
.recvmsg = tcp_recvmsg,
+ .sendmsg = tcp_sendmsg,
+ .splice_eof = tcp_splice_eof,
.backlog_rcv = tcp_v4_do_rcv,
- .hash = tcp_v4_hash,
- .unhash = tcp_unhash,
- .get_port = tcp_v4_get_port,
+ .release_cb = tcp_release_cb,
+ .hash = inet_hash,
+ .unhash = inet_unhash,
+ .get_port = inet_csk_get_port,
+ .put_port = inet_put_port,
+#ifdef CONFIG_BPF_SYSCALL
+ .psock_update_sk_prot = tcp_bpf_update_proto,
+#endif
.enter_memory_pressure = tcp_enter_memory_pressure,
+ .leave_memory_pressure = tcp_leave_memory_pressure,
+ .stream_memory_free = tcp_stream_memory_free,
.sockets_allocated = &tcp_sockets_allocated,
- .orphan_count = &tcp_orphan_count,
- .memory_allocated = &tcp_memory_allocated,
+
+ .memory_allocated = &net_aligned_data.tcp_memory_allocated,
+ .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc,
+
.memory_pressure = &tcp_memory_pressure,
.sysctl_mem = sysctl_tcp_mem,
- .sysctl_wmem = sysctl_tcp_wmem,
- .sysctl_rmem = sysctl_tcp_rmem,
+ .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
+ .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
.max_header = MAX_TCP_HEADER,
.obj_size = sizeof(struct tcp_sock),
+ .slab_flags = SLAB_TYPESAFE_BY_RCU,
.twsk_prot = &tcp_timewait_sock_ops,
.rsk_prot = &tcp_request_sock_ops,
-#ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_tcp_setsockopt,
- .compat_getsockopt = compat_tcp_getsockopt,
-#endif
+ .h.hashinfo = NULL,
+ .no_autobind = true,
+ .diag_destroy = tcp_abort,
};
+EXPORT_SYMBOL(tcp_prot);
-void __init tcp_v4_init(struct net_proto_family *ops)
+static void __net_exit tcp_sk_exit(struct net *net)
{
- if (inet_csk_ctl_sock_create(&tcp_socket, PF_INET, SOCK_RAW,
- IPPROTO_TCP) < 0)
- panic("Failed to create the TCP control socket.\n");
+ if (net->ipv4.tcp_congestion_control)
+ bpf_module_put(net->ipv4.tcp_congestion_control,
+ net->ipv4.tcp_congestion_control->owner);
}
-EXPORT_SYMBOL(ipv4_specific);
-EXPORT_SYMBOL(tcp_hashinfo);
-EXPORT_SYMBOL(tcp_prot);
-EXPORT_SYMBOL(tcp_unhash);
-EXPORT_SYMBOL(tcp_v4_conn_request);
-EXPORT_SYMBOL(tcp_v4_connect);
-EXPORT_SYMBOL(tcp_v4_do_rcv);
-EXPORT_SYMBOL(tcp_v4_remember_stamp);
-EXPORT_SYMBOL(tcp_v4_send_check);
-EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
+static void __net_init tcp_set_hashinfo(struct net *net)
+{
+ struct inet_hashinfo *hinfo;
+ unsigned int ehash_entries;
+ struct net *old_net;
+
+ if (net_eq(net, &init_net))
+ goto fallback;
+
+ old_net = current->nsproxy->net_ns;
+ ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
+ if (!ehash_entries)
+ goto fallback;
+
+ ehash_entries = roundup_pow_of_two(ehash_entries);
+ hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
+ if (!hinfo) {
+ pr_warn("Failed to allocate TCP ehash (entries: %u) "
+ "for a netns, fallback to the global one\n",
+ ehash_entries);
+fallback:
+ hinfo = &tcp_hashinfo;
+ ehash_entries = tcp_hashinfo.ehash_mask + 1;
+ }
+
+ net->ipv4.tcp_death_row.hashinfo = hinfo;
+ net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
+ net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
+}
+
+static int __net_init tcp_sk_init(struct net *net)
+{
+ net->ipv4.sysctl_tcp_ecn = TCP_ECN_IN_ECN_OUT_NOECN;
+ net->ipv4.sysctl_tcp_ecn_option = TCP_ACCECN_OPTION_FULL;
+ net->ipv4.sysctl_tcp_ecn_option_beacon = TCP_ACCECN_OPTION_BEACON;
+ net->ipv4.sysctl_tcp_ecn_fallback = 1;
+
+ net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
+ net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
+ net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
+ net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
+ net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
+
+ net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
+ net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
+ net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
+
+ net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
+ net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
+ net->ipv4.sysctl_tcp_syncookies = 1;
+ net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
+ net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
+ net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
+ net->ipv4.sysctl_tcp_orphan_retries = 0;
+ net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
+ net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
+ net->ipv4.sysctl_tcp_tw_reuse = 2;
+ net->ipv4.sysctl_tcp_tw_reuse_delay = 1 * MSEC_PER_SEC;
+ net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
+
+ refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
+ tcp_set_hashinfo(net);
+
+ net->ipv4.sysctl_tcp_sack = 1;
+ net->ipv4.sysctl_tcp_window_scaling = 1;
+ net->ipv4.sysctl_tcp_timestamps = 1;
+ net->ipv4.sysctl_tcp_early_retrans = 3;
+ net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
+ net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
+ net->ipv4.sysctl_tcp_retrans_collapse = 1;
+ net->ipv4.sysctl_tcp_max_reordering = 300;
+ net->ipv4.sysctl_tcp_dsack = 1;
+ net->ipv4.sysctl_tcp_app_win = 31;
+ net->ipv4.sysctl_tcp_adv_win_scale = 1;
+ net->ipv4.sysctl_tcp_frto = 2;
+ net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
+ net->ipv4.sysctl_tcp_rcvbuf_low_rtt = USEC_PER_MSEC;
+ /* This limits the percentage of the congestion window which we
+ * will allow a single TSO frame to consume. Building TSO frames
+ * which are too large can cause TCP streams to be bursty.
+ */
+ net->ipv4.sysctl_tcp_tso_win_divisor = 3;
+ /* Default TSQ limit of 4 MB */
+ net->ipv4.sysctl_tcp_limit_output_bytes = 4 << 20;
+
+ /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
+ net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
+
+ net->ipv4.sysctl_tcp_min_tso_segs = 2;
+ net->ipv4.sysctl_tcp_tso_rtt_log = 9; /* 2^9 = 512 usec */
+ net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
+ net->ipv4.sysctl_tcp_autocorking = 1;
+ net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
+ net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
+ net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
+ if (net != &init_net) {
+ memcpy(net->ipv4.sysctl_tcp_rmem,
+ init_net.ipv4.sysctl_tcp_rmem,
+ sizeof(init_net.ipv4.sysctl_tcp_rmem));
+ memcpy(net->ipv4.sysctl_tcp_wmem,
+ init_net.ipv4.sysctl_tcp_wmem,
+ sizeof(init_net.ipv4.sysctl_tcp_wmem));
+ }
+ net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
+ net->ipv4.sysctl_tcp_comp_sack_slack_ns = 10 * NSEC_PER_USEC;
+ net->ipv4.sysctl_tcp_comp_sack_nr = 44;
+ net->ipv4.sysctl_tcp_comp_sack_rtt_percent = 33;
+ net->ipv4.sysctl_tcp_backlog_ack_defer = 1;
+ net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
+ net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
+ atomic_set(&net->ipv4.tfo_active_disable_times, 0);
+
+ /* Set default values for PLB */
+ net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */
+ net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3;
+ net->ipv4.sysctl_tcp_plb_rehash_rounds = 12;
+ net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60;
+ /* Default congestion threshold for PLB to mark a round is 50% */
+ net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2;
+
+ /* Reno is always built in */
+ if (!net_eq(net, &init_net) &&
+ bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
+ init_net.ipv4.tcp_congestion_control->owner))
+ net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
+ else
+ net->ipv4.tcp_congestion_control = &tcp_reno;
+
+ net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
+ net->ipv4.sysctl_tcp_shrink_window = 0;
+
+ net->ipv4.sysctl_tcp_pingpong_thresh = 1;
+ net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN);
+ net->ipv4.sysctl_tcp_rto_max_ms = TCP_RTO_MAX_SEC * MSEC_PER_SEC;
+
+ return 0;
+}
+
+static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
+{
+ struct net *net;
+
+ /* make sure concurrent calls to tcp_sk_exit_batch from net_cleanup_work
+ * and failed setup_net error unwinding path are serialized.
+ *
+ * tcp_twsk_purge() handles twsk in any dead netns, not just those in
+ * net_exit_list, the thread that dismantles a particular twsk must
+ * do so without other thread progressing to refcount_dec_and_test() of
+ * tcp_death_row.tw_refcount.
+ */
+ mutex_lock(&tcp_exit_batch_mutex);
+
+ tcp_twsk_purge(net_exit_list);
+
+ list_for_each_entry(net, net_exit_list, exit_list) {
+ inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
+ WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
+ tcp_fastopen_ctx_destroy(net);
+ }
+
+ mutex_unlock(&tcp_exit_batch_mutex);
+}
+
+static struct pernet_operations __net_initdata tcp_sk_ops = {
+ .init = tcp_sk_init,
+ .exit = tcp_sk_exit,
+ .exit_batch = tcp_sk_exit_batch,
+};
+
+#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
+DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
+ struct sock_common *sk_common, uid_t uid)
+
+#define INIT_BATCH_SZ 16
+
+static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
+{
+ struct bpf_tcp_iter_state *iter = priv_data;
+ int err;
+
+ err = bpf_iter_init_seq_net(priv_data, aux);
+ if (err)
+ return err;
+
+ err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ, GFP_USER);
+ if (err) {
+ bpf_iter_fini_seq_net(priv_data);
+ return err;
+ }
+
+ return 0;
+}
+
+static void bpf_iter_fini_tcp(void *priv_data)
+{
+ struct bpf_tcp_iter_state *iter = priv_data;
+
+ bpf_iter_fini_seq_net(priv_data);
+ kvfree(iter->batch);
+}
+
+static const struct bpf_iter_seq_info tcp_seq_info = {
+ .seq_ops = &bpf_iter_tcp_seq_ops,
+ .init_seq_private = bpf_iter_init_tcp,
+ .fini_seq_private = bpf_iter_fini_tcp,
+ .seq_priv_size = sizeof(struct bpf_tcp_iter_state),
+};
+
+static const struct bpf_func_proto *
+bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
+ const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ case BPF_FUNC_setsockopt:
+ return &bpf_sk_setsockopt_proto;
+ case BPF_FUNC_getsockopt:
+ return &bpf_sk_getsockopt_proto;
+ default:
+ return NULL;
+ }
+}
+
+static struct bpf_iter_reg tcp_reg_info = {
+ .target = "tcp",
+ .ctx_arg_info_size = 1,
+ .ctx_arg_info = {
+ { offsetof(struct bpf_iter__tcp, sk_common),
+ PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
+ },
+ .get_func_proto = bpf_iter_tcp_get_func_proto,
+ .seq_info = &tcp_seq_info,
+};
+
+static void __init bpf_iter_register(void)
+{
+ tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
+ if (bpf_iter_reg_target(&tcp_reg_info))
+ pr_warn("Warning: could not register bpf iterator tcp\n");
+}
-#ifdef CONFIG_PROC_FS
-EXPORT_SYMBOL(tcp_proc_register);
-EXPORT_SYMBOL(tcp_proc_unregister);
#endif
-EXPORT_SYMBOL(sysctl_local_port_range);
-EXPORT_SYMBOL(sysctl_tcp_low_latency);
+void __init tcp_v4_init(void)
+{
+ int cpu, res;
+
+ for_each_possible_cpu(cpu) {
+ struct sock *sk;
+
+ res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
+ IPPROTO_TCP, &init_net);
+ if (res)
+ panic("Failed to create the TCP control socket.\n");
+ sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
+
+ /* Please enforce IP_DF and IPID==0 for RST and
+ * ACK sent in SYN-RECV and TIME-WAIT state.
+ */
+ inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
+
+ sk->sk_clockid = CLOCK_MONOTONIC;
+
+ per_cpu(ipv4_tcp_sk.sock, cpu) = sk;
+ }
+ if (register_pernet_subsys(&tcp_sk_ops))
+ panic("Failed to create the TCP control socket.\n");
+
+#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
+ bpf_iter_register();
+#endif
+}
diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c
index f0ebaf0e21cb..976b56644a8a 100644
--- a/net/ipv4/tcp_lp.c
+++ b/net/ipv4/tcp_lp.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* TCP Low Priority (TCP-LP)
*
@@ -12,7 +13,7 @@
* within cong_avoid.
* o Error correcting in remote HZ, therefore remote HZ will be keeped
* on checking and updating.
- * o Handling calculation of One-Way-Delay (OWD) within rtt_sample, sicne
+ * o Handling calculation of One-Way-Delay (OWD) within rtt_sample, since
* OWD have a similar meaning as RTT. Also correct the buggy formular.
* o Handle reaction for Early Congestion Indication (ECI) within
* pkts_acked, as mentioned within pseudo code.
@@ -22,9 +23,9 @@
* Original Author:
* Aleksandar Kuzmanovic <akuzma@northwestern.edu>
* Available from:
- * http://www.ece.rice.edu/~akuzma/Doc/akuzma/TCP-LP.pdf
+ * https://users.cs.northwestern.edu/~akuzma/doc/TCP-LP-ToN.pdf
* Original implementation for 2.4.19:
- * http://www-ece.rice.edu/networks/TCP-LP/
+ * https://users.cs.northwestern.edu/~akuzma/rice/TCP-LP/linux/tcp-lp-linux.htm
*
* 2.6.x module Authors:
* Wong Hoi Sing, Edison <hswong3i@gmail.com>
@@ -37,7 +38,7 @@
#include <net/tcp.h>
/* resolution of owd */
-#define LP_RESOL 1000
+#define LP_RESOL TCP_TS_HZ
/**
* enum tcp_lp_state
@@ -62,7 +63,7 @@ enum tcp_lp_state {
* @sowd: smoothed OWD << 3
* @owd_min: min OWD
* @owd_max: max OWD
- * @owd_max_rsv: resrved max owd
+ * @owd_max_rsv: reserved max owd
* @remote_hz: estimated remote HZ
* @remote_ref_time: remote reference time
* @local_ref_time: local reference time
@@ -88,6 +89,7 @@ struct lp {
/**
* tcp_lp_init
+ * @sk: socket to initialize congestion control algorithm for
*
* Init all required variables.
* Clone the handling from Vegas module implementation.
@@ -110,22 +112,25 @@ static void tcp_lp_init(struct sock *sk)
/**
* tcp_lp_cong_avoid
+ * @sk: socket to avoid congesting
+ * @ack: current ack sequence number
+ * @acked: number of ACKed packets
*
* Implementation of cong_avoid.
* Will only call newReno CA when away from inference.
* From TCP-LP's paper, this will be handled in additive increasement.
*/
-static void tcp_lp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight,
- int flag)
+static void tcp_lp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct lp *lp = inet_csk_ca(sk);
if (!(lp->flag & LP_WITHIN_INF))
- tcp_reno_cong_avoid(sk, ack, rtt, in_flight, flag);
+ tcp_reno_cong_avoid(sk, ack, acked);
}
/**
* tcp_lp_remote_hz_estimator
+ * @sk: socket which needs an estimate for the remote HZs
*
* Estimate remote HZ.
* We keep on updating the estimated value, where original TCP-LP
@@ -144,13 +149,13 @@ static u32 tcp_lp_remote_hz_estimator(struct sock *sk)
goto out;
/* we can't calc remote HZ with no different!! */
- if (tp->rx_opt.rcv_tsval == lp->remote_ref_time
- || tp->rx_opt.rcv_tsecr == lp->local_ref_time)
+ if (tp->rx_opt.rcv_tsval == lp->remote_ref_time ||
+ tp->rx_opt.rcv_tsecr == lp->local_ref_time)
goto out;
- m = HZ * (tp->rx_opt.rcv_tsval -
- lp->remote_ref_time) / (tp->rx_opt.rcv_tsecr -
- lp->local_ref_time);
+ m = TCP_TS_HZ *
+ (tp->rx_opt.rcv_tsval - lp->remote_ref_time) /
+ (tp->rx_opt.rcv_tsecr - lp->local_ref_time);
if (m < 0)
m = -m;
@@ -176,6 +181,7 @@ static u32 tcp_lp_remote_hz_estimator(struct sock *sk)
/**
* tcp_lp_owd_calculator
+ * @sk: socket to calculate one way delay for
*
* Calculate one way delay (in relative format).
* Original implement OWD as minus of remote time difference to local time
@@ -195,7 +201,7 @@ static u32 tcp_lp_owd_calculator(struct sock *sk)
if (lp->flag & LP_VALID_RHZ) {
owd =
tp->rx_opt.rcv_tsval * (LP_RESOL / lp->remote_hz) -
- tp->rx_opt.rcv_tsecr * (LP_RESOL / HZ);
+ tp->rx_opt.rcv_tsecr * (LP_RESOL / TCP_TS_HZ);
if (owd < 0)
owd = -owd;
}
@@ -210,6 +216,8 @@ static u32 tcp_lp_owd_calculator(struct sock *sk)
/**
* tcp_lp_rtt_sample
+ * @sk: socket to add a rtt sample to
+ * @rtt: round trip time, which is ignored!
*
* Implementation or rtt_sample.
* Will take the following action,
@@ -218,7 +226,7 @@ static u32 tcp_lp_owd_calculator(struct sock *sk)
* 3. calc smoothed OWD (SOWD).
* Most ideas come from the original TCP-LP implementation.
*/
-static void tcp_lp_rtt_sample(struct sock *sk, u32 usrtt)
+static void tcp_lp_rtt_sample(struct sock *sk, u32 rtt)
{
struct lp *lp = inet_csk_ca(sk);
s64 mowd = tcp_lp_owd_calculator(sk);
@@ -254,6 +262,8 @@ static void tcp_lp_rtt_sample(struct sock *sk, u32 usrtt)
/**
* tcp_lp_pkts_acked
+ * @sk: socket requiring congestion avoidance calculations
+ * @sample: ACK sample containing timing and rate information
*
* Implementation of pkts_acked.
* Deal with active drop under Early Congestion Indication.
@@ -261,17 +271,23 @@ static void tcp_lp_rtt_sample(struct sock *sk, u32 usrtt)
* newReno in increase case.
* We work it out by following the idea from TCP-LP's paper directly
*/
-static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked)
+static void tcp_lp_pkts_acked(struct sock *sk, const struct ack_sample *sample)
{
struct tcp_sock *tp = tcp_sk(sk);
struct lp *lp = inet_csk_ca(sk);
+ u32 now = tcp_time_stamp_ts(tp);
+ u32 delta;
+
+ if (sample->rtt_us > 0)
+ tcp_lp_rtt_sample(sk, sample->rtt_us);
/* calc inference */
- if (tcp_time_stamp > tp->rx_opt.rcv_tsecr)
- lp->inference = 3 * (tcp_time_stamp - tp->rx_opt.rcv_tsecr);
+ delta = now - tp->rx_opt.rcv_tsecr;
+ if ((s32)delta > 0)
+ lp->inference = 3 * delta;
/* test if within inference */
- if (lp->last_drop && (tcp_time_stamp - lp->last_drop < lp->inference))
+ if (lp->last_drop && (now - lp->last_drop < lp->inference))
lp->flag |= LP_WITHIN_INF;
else
lp->flag &= ~LP_WITHIN_INF;
@@ -284,7 +300,7 @@ static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked)
lp->flag &= ~LP_WITHIN_THR;
pr_debug("TCP-LP: %05o|%5u|%5u|%15u|%15u|%15u\n", lp->flag,
- tp->snd_cwnd, lp->remote_hz, lp->owd_min, lp->owd_max,
+ tcp_snd_cwnd(tp), lp->remote_hz, lp->owd_min, lp->owd_max,
lp->sowd >> 3);
if (lp->flag & LP_WITHIN_THR)
@@ -292,7 +308,7 @@ static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked)
/* FIXME: try to reset owd_min and owd_max here
* so decrease the chance the min/max is no longer suitable
- * and will usually within threshold when whithin inference */
+ * and will usually within threshold when within inference */
lp->owd_min = lp->sowd >> 3;
lp->owd_max = lp->sowd >> 2;
lp->owd_max_rsv = lp->sowd >> 2;
@@ -300,23 +316,22 @@ static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked)
/* happened within inference
* drop snd_cwnd into 1 */
if (lp->flag & LP_WITHIN_INF)
- tp->snd_cwnd = 1U;
+ tcp_snd_cwnd_set(tp, 1U);
/* happened after inference
* cut snd_cwnd into half */
else
- tp->snd_cwnd = max(tp->snd_cwnd >> 1U, 1U);
+ tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp) >> 1U, 1U));
/* record this drop time */
- lp->last_drop = tcp_time_stamp;
+ lp->last_drop = now;
}
-static struct tcp_congestion_ops tcp_lp = {
+static struct tcp_congestion_ops tcp_lp __read_mostly = {
.init = tcp_lp_init,
.ssthresh = tcp_reno_ssthresh,
+ .undo_cwnd = tcp_reno_undo_cwnd,
.cong_avoid = tcp_lp_cong_avoid,
- .min_cwnd = tcp_reno_min_cwnd,
- .rtt_sample = tcp_lp_rtt_sample,
.pkts_acked = tcp_lp_pkts_acked,
.owner = THIS_MODULE,
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
new file mode 100644
index 000000000000..45b6ecd16412
--- /dev/null
+++ b/net/ipv4/tcp_metrics.c
@@ -0,0 +1,1059 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/rcupdate.h>
+#include <linux/spinlock.h>
+#include <linux/jiffies.h>
+#include <linux/module.h>
+#include <linux/cache.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/tcp.h>
+#include <linux/hash.h>
+#include <linux/tcp_metrics.h>
+#include <linux/vmalloc.h>
+
+#include <net/inet_connection_sock.h>
+#include <net/net_namespace.h>
+#include <net/request_sock.h>
+#include <net/inetpeer.h>
+#include <net/sock.h>
+#include <net/ipv6.h>
+#include <net/dst.h>
+#include <net/tcp.h>
+#include <net/genetlink.h>
+
+static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *saddr,
+ const struct inetpeer_addr *daddr,
+ struct net *net, unsigned int hash);
+
+struct tcp_fastopen_metrics {
+ u16 mss;
+ u16 syn_loss:10, /* Recurring Fast Open SYN losses */
+ try_exp:2; /* Request w/ exp. option (once) */
+ unsigned long last_syn_loss; /* Last Fast Open SYN loss */
+ struct tcp_fastopen_cookie cookie;
+};
+
+/* TCP_METRIC_MAX includes 2 extra fields for userspace compatibility
+ * Kernel only stores RTT and RTTVAR in usec resolution
+ */
+#define TCP_METRIC_MAX_KERNEL (TCP_METRIC_MAX - 2)
+
+struct tcp_metrics_block {
+ struct tcp_metrics_block __rcu *tcpm_next;
+ struct net *tcpm_net;
+ struct inetpeer_addr tcpm_saddr;
+ struct inetpeer_addr tcpm_daddr;
+ unsigned long tcpm_stamp;
+ u32 tcpm_lock;
+ u32 tcpm_vals[TCP_METRIC_MAX_KERNEL + 1];
+ struct tcp_fastopen_metrics tcpm_fastopen;
+
+ struct rcu_head rcu_head;
+};
+
+static inline struct net *tm_net(const struct tcp_metrics_block *tm)
+{
+ /* Paired with the WRITE_ONCE() in tcpm_new() */
+ return READ_ONCE(tm->tcpm_net);
+}
+
+static bool tcp_metric_locked(struct tcp_metrics_block *tm,
+ enum tcp_metric_index idx)
+{
+ /* Paired with WRITE_ONCE() in tcpm_suck_dst() */
+ return READ_ONCE(tm->tcpm_lock) & (1 << idx);
+}
+
+static u32 tcp_metric_get(const struct tcp_metrics_block *tm,
+ enum tcp_metric_index idx)
+{
+ /* Paired with WRITE_ONCE() in tcp_metric_set() */
+ return READ_ONCE(tm->tcpm_vals[idx]);
+}
+
+static void tcp_metric_set(struct tcp_metrics_block *tm,
+ enum tcp_metric_index idx,
+ u32 val)
+{
+ /* Paired with READ_ONCE() in tcp_metric_get() */
+ WRITE_ONCE(tm->tcpm_vals[idx], val);
+}
+
+static bool addr_same(const struct inetpeer_addr *a,
+ const struct inetpeer_addr *b)
+{
+ return (a->family == b->family) && !inetpeer_addr_cmp(a, b);
+}
+
+struct tcpm_hash_bucket {
+ struct tcp_metrics_block __rcu *chain;
+};
+
+static struct tcpm_hash_bucket *tcp_metrics_hash __read_mostly;
+static unsigned int tcp_metrics_hash_log __read_mostly;
+
+static DEFINE_SPINLOCK(tcp_metrics_lock);
+static DEFINE_SEQLOCK(fastopen_seqlock);
+
+static void tcpm_suck_dst(struct tcp_metrics_block *tm,
+ const struct dst_entry *dst,
+ bool fastopen_clear)
+{
+ u32 msval;
+ u32 val;
+
+ WRITE_ONCE(tm->tcpm_stamp, jiffies);
+
+ val = 0;
+ if (dst_metric_locked(dst, RTAX_RTT))
+ val |= 1 << TCP_METRIC_RTT;
+ if (dst_metric_locked(dst, RTAX_RTTVAR))
+ val |= 1 << TCP_METRIC_RTTVAR;
+ if (dst_metric_locked(dst, RTAX_SSTHRESH))
+ val |= 1 << TCP_METRIC_SSTHRESH;
+ if (dst_metric_locked(dst, RTAX_CWND))
+ val |= 1 << TCP_METRIC_CWND;
+ if (dst_metric_locked(dst, RTAX_REORDERING))
+ val |= 1 << TCP_METRIC_REORDERING;
+ /* Paired with READ_ONCE() in tcp_metric_locked() */
+ WRITE_ONCE(tm->tcpm_lock, val);
+
+ msval = dst_metric_raw(dst, RTAX_RTT);
+ tcp_metric_set(tm, TCP_METRIC_RTT, msval * USEC_PER_MSEC);
+
+ msval = dst_metric_raw(dst, RTAX_RTTVAR);
+ tcp_metric_set(tm, TCP_METRIC_RTTVAR, msval * USEC_PER_MSEC);
+ tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
+ dst_metric_raw(dst, RTAX_SSTHRESH));
+ tcp_metric_set(tm, TCP_METRIC_CWND,
+ dst_metric_raw(dst, RTAX_CWND));
+ tcp_metric_set(tm, TCP_METRIC_REORDERING,
+ dst_metric_raw(dst, RTAX_REORDERING));
+ if (fastopen_clear) {
+ write_seqlock(&fastopen_seqlock);
+ tm->tcpm_fastopen.mss = 0;
+ tm->tcpm_fastopen.syn_loss = 0;
+ tm->tcpm_fastopen.try_exp = 0;
+ tm->tcpm_fastopen.cookie.exp = false;
+ tm->tcpm_fastopen.cookie.len = 0;
+ write_sequnlock(&fastopen_seqlock);
+ }
+}
+
+#define TCP_METRICS_TIMEOUT (60 * 60 * HZ)
+
+static void tcpm_check_stamp(struct tcp_metrics_block *tm,
+ const struct dst_entry *dst)
+{
+ unsigned long limit;
+
+ if (!tm)
+ return;
+ limit = READ_ONCE(tm->tcpm_stamp) + TCP_METRICS_TIMEOUT;
+ if (unlikely(time_after(jiffies, limit)))
+ tcpm_suck_dst(tm, dst, false);
+}
+
+#define TCP_METRICS_RECLAIM_DEPTH 5
+#define TCP_METRICS_RECLAIM_PTR (struct tcp_metrics_block *) 0x1UL
+
+#define deref_locked(p) \
+ rcu_dereference_protected(p, lockdep_is_held(&tcp_metrics_lock))
+
+static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst,
+ struct inetpeer_addr *saddr,
+ struct inetpeer_addr *daddr,
+ unsigned int hash)
+{
+ struct tcp_metrics_block *tm;
+ bool reclaim = false;
+ struct net *net;
+
+ spin_lock_bh(&tcp_metrics_lock);
+ net = dst_dev_net_rcu(dst);
+
+ /* While waiting for the spin-lock the cache might have been populated
+ * with this entry and so we have to check again.
+ */
+ tm = __tcp_get_metrics(saddr, daddr, net, hash);
+ if (tm == TCP_METRICS_RECLAIM_PTR) {
+ reclaim = true;
+ tm = NULL;
+ }
+ if (tm) {
+ tcpm_check_stamp(tm, dst);
+ goto out_unlock;
+ }
+
+ if (unlikely(reclaim)) {
+ struct tcp_metrics_block *oldest;
+
+ oldest = deref_locked(tcp_metrics_hash[hash].chain);
+ for (tm = deref_locked(oldest->tcpm_next); tm;
+ tm = deref_locked(tm->tcpm_next)) {
+ if (time_before(READ_ONCE(tm->tcpm_stamp),
+ READ_ONCE(oldest->tcpm_stamp)))
+ oldest = tm;
+ }
+ tm = oldest;
+ } else {
+ tm = kzalloc(sizeof(*tm), GFP_ATOMIC);
+ if (!tm)
+ goto out_unlock;
+ }
+ /* Paired with the READ_ONCE() in tm_net() */
+ WRITE_ONCE(tm->tcpm_net, net);
+
+ tm->tcpm_saddr = *saddr;
+ tm->tcpm_daddr = *daddr;
+
+ tcpm_suck_dst(tm, dst, reclaim);
+
+ if (likely(!reclaim)) {
+ tm->tcpm_next = tcp_metrics_hash[hash].chain;
+ rcu_assign_pointer(tcp_metrics_hash[hash].chain, tm);
+ }
+
+out_unlock:
+ spin_unlock_bh(&tcp_metrics_lock);
+ return tm;
+}
+
+static struct tcp_metrics_block *tcp_get_encode(struct tcp_metrics_block *tm, int depth)
+{
+ if (tm)
+ return tm;
+ if (depth > TCP_METRICS_RECLAIM_DEPTH)
+ return TCP_METRICS_RECLAIM_PTR;
+ return NULL;
+}
+
+static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *saddr,
+ const struct inetpeer_addr *daddr,
+ struct net *net, unsigned int hash)
+{
+ struct tcp_metrics_block *tm;
+ int depth = 0;
+
+ for (tm = rcu_dereference(tcp_metrics_hash[hash].chain); tm;
+ tm = rcu_dereference(tm->tcpm_next)) {
+ if (addr_same(&tm->tcpm_saddr, saddr) &&
+ addr_same(&tm->tcpm_daddr, daddr) &&
+ net_eq(tm_net(tm), net))
+ break;
+ depth++;
+ }
+ return tcp_get_encode(tm, depth);
+}
+
+static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req,
+ struct dst_entry *dst)
+{
+ struct tcp_metrics_block *tm;
+ struct inetpeer_addr saddr, daddr;
+ unsigned int hash;
+ struct net *net;
+
+ saddr.family = req->rsk_ops->family;
+ daddr.family = req->rsk_ops->family;
+ switch (daddr.family) {
+ case AF_INET:
+ inetpeer_set_addr_v4(&saddr, inet_rsk(req)->ir_loc_addr);
+ inetpeer_set_addr_v4(&daddr, inet_rsk(req)->ir_rmt_addr);
+ hash = ipv4_addr_hash(inet_rsk(req)->ir_rmt_addr);
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6:
+ inetpeer_set_addr_v6(&saddr, &inet_rsk(req)->ir_v6_loc_addr);
+ inetpeer_set_addr_v6(&daddr, &inet_rsk(req)->ir_v6_rmt_addr);
+ hash = ipv6_addr_hash(&inet_rsk(req)->ir_v6_rmt_addr);
+ break;
+#endif
+ default:
+ return NULL;
+ }
+
+ net = dst_dev_net_rcu(dst);
+ hash ^= net_hash_mix(net);
+ hash = hash_32(hash, tcp_metrics_hash_log);
+
+ for (tm = rcu_dereference(tcp_metrics_hash[hash].chain); tm;
+ tm = rcu_dereference(tm->tcpm_next)) {
+ if (addr_same(&tm->tcpm_saddr, &saddr) &&
+ addr_same(&tm->tcpm_daddr, &daddr) &&
+ net_eq(tm_net(tm), net))
+ break;
+ }
+ tcpm_check_stamp(tm, dst);
+ return tm;
+}
+
+static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk,
+ struct dst_entry *dst,
+ bool create)
+{
+ struct tcp_metrics_block *tm;
+ struct inetpeer_addr saddr, daddr;
+ unsigned int hash;
+ struct net *net;
+
+ if (sk->sk_family == AF_INET) {
+ inetpeer_set_addr_v4(&saddr, inet_sk(sk)->inet_saddr);
+ inetpeer_set_addr_v4(&daddr, inet_sk(sk)->inet_daddr);
+ hash = ipv4_addr_hash(inet_sk(sk)->inet_daddr);
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (sk->sk_family == AF_INET6) {
+ if (ipv6_addr_v4mapped(&sk->sk_v6_daddr)) {
+ inetpeer_set_addr_v4(&saddr, inet_sk(sk)->inet_saddr);
+ inetpeer_set_addr_v4(&daddr, inet_sk(sk)->inet_daddr);
+ hash = ipv4_addr_hash(inet_sk(sk)->inet_daddr);
+ } else {
+ inetpeer_set_addr_v6(&saddr, &sk->sk_v6_rcv_saddr);
+ inetpeer_set_addr_v6(&daddr, &sk->sk_v6_daddr);
+ hash = ipv6_addr_hash(&sk->sk_v6_daddr);
+ }
+ }
+#endif
+ else
+ return NULL;
+
+ net = dst_dev_net_rcu(dst);
+ hash ^= net_hash_mix(net);
+ hash = hash_32(hash, tcp_metrics_hash_log);
+
+ tm = __tcp_get_metrics(&saddr, &daddr, net, hash);
+ if (tm == TCP_METRICS_RECLAIM_PTR)
+ tm = NULL;
+ if (!tm && create)
+ tm = tcpm_new(dst, &saddr, &daddr, hash);
+ else
+ tcpm_check_stamp(tm, dst);
+
+ return tm;
+}
+
+/* Save metrics learned by this TCP session. This function is called
+ * only, when TCP finishes successfully i.e. when it enters TIME-WAIT
+ * or goes from LAST-ACK to CLOSE.
+ */
+void tcp_update_metrics(struct sock *sk)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ struct dst_entry *dst = __sk_dst_get(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct net *net = sock_net(sk);
+ struct tcp_metrics_block *tm;
+ unsigned long rtt;
+ u32 val;
+ int m;
+
+ sk_dst_confirm(sk);
+ if (READ_ONCE(net->ipv4.sysctl_tcp_nometrics_save) || !dst)
+ return;
+
+ rcu_read_lock();
+ if (icsk->icsk_backoff || !tp->srtt_us) {
+ /* This session failed to estimate rtt. Why?
+ * Probably, no packets returned in time. Reset our
+ * results.
+ */
+ tm = tcp_get_metrics(sk, dst, false);
+ if (tm && !tcp_metric_locked(tm, TCP_METRIC_RTT))
+ tcp_metric_set(tm, TCP_METRIC_RTT, 0);
+ goto out_unlock;
+ } else
+ tm = tcp_get_metrics(sk, dst, true);
+
+ if (!tm)
+ goto out_unlock;
+
+ rtt = tcp_metric_get(tm, TCP_METRIC_RTT);
+ m = rtt - tp->srtt_us;
+
+ /* If newly calculated rtt larger than stored one, store new
+ * one. Otherwise, use EWMA. Remember, rtt overestimation is
+ * always better than underestimation.
+ */
+ if (!tcp_metric_locked(tm, TCP_METRIC_RTT)) {
+ if (m <= 0)
+ rtt = tp->srtt_us;
+ else
+ rtt -= (m >> 3);
+ tcp_metric_set(tm, TCP_METRIC_RTT, rtt);
+ }
+
+ if (!tcp_metric_locked(tm, TCP_METRIC_RTTVAR)) {
+ unsigned long var;
+
+ if (m < 0)
+ m = -m;
+
+ /* Scale deviation to rttvar fixed point */
+ m >>= 1;
+ if (m < tp->mdev_us)
+ m = tp->mdev_us;
+
+ var = tcp_metric_get(tm, TCP_METRIC_RTTVAR);
+ if (m >= var)
+ var = m;
+ else
+ var -= (var - m) >> 2;
+
+ tcp_metric_set(tm, TCP_METRIC_RTTVAR, var);
+ }
+
+ if (tcp_in_initial_slowstart(tp)) {
+ /* Slow start still did not finish. */
+ if (!READ_ONCE(net->ipv4.sysctl_tcp_no_ssthresh_metrics_save) &&
+ !tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
+ val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
+ if (val && (tcp_snd_cwnd(tp) >> 1) > val)
+ tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
+ tcp_snd_cwnd(tp) >> 1);
+ }
+ if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
+ val = tcp_metric_get(tm, TCP_METRIC_CWND);
+ if (tcp_snd_cwnd(tp) > val)
+ tcp_metric_set(tm, TCP_METRIC_CWND,
+ tcp_snd_cwnd(tp));
+ }
+ } else if (!tcp_in_slow_start(tp) &&
+ icsk->icsk_ca_state == TCP_CA_Open) {
+ /* Cong. avoidance phase, cwnd is reliable. */
+ if (!READ_ONCE(net->ipv4.sysctl_tcp_no_ssthresh_metrics_save) &&
+ !tcp_metric_locked(tm, TCP_METRIC_SSTHRESH))
+ tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
+ max(tcp_snd_cwnd(tp) >> 1, tp->snd_ssthresh));
+ if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
+ val = tcp_metric_get(tm, TCP_METRIC_CWND);
+ tcp_metric_set(tm, TCP_METRIC_CWND, (val + tcp_snd_cwnd(tp)) >> 1);
+ }
+ } else {
+ /* Else slow start did not finish, cwnd is non-sense,
+ * ssthresh may be also invalid.
+ */
+ if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
+ val = tcp_metric_get(tm, TCP_METRIC_CWND);
+ tcp_metric_set(tm, TCP_METRIC_CWND,
+ (val + tp->snd_ssthresh) >> 1);
+ }
+ if (!READ_ONCE(net->ipv4.sysctl_tcp_no_ssthresh_metrics_save) &&
+ !tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
+ val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
+ if (val && tp->snd_ssthresh > val)
+ tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
+ tp->snd_ssthresh);
+ }
+ if (!tcp_metric_locked(tm, TCP_METRIC_REORDERING)) {
+ val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
+ if (val < tp->reordering &&
+ tp->reordering !=
+ READ_ONCE(net->ipv4.sysctl_tcp_reordering))
+ tcp_metric_set(tm, TCP_METRIC_REORDERING,
+ tp->reordering);
+ }
+ }
+ WRITE_ONCE(tm->tcpm_stamp, jiffies);
+out_unlock:
+ rcu_read_unlock();
+}
+
+/* Initialize metrics on socket. */
+
+void tcp_init_metrics(struct sock *sk)
+{
+ struct dst_entry *dst = __sk_dst_get(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct net *net = sock_net(sk);
+ struct tcp_metrics_block *tm;
+ u32 val, crtt = 0; /* cached RTT scaled by 8 */
+
+ sk_dst_confirm(sk);
+ /* ssthresh may have been reduced unnecessarily during.
+ * 3WHS. Restore it back to its initial default.
+ */
+ tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
+ if (!dst)
+ goto reset;
+
+ rcu_read_lock();
+ tm = tcp_get_metrics(sk, dst, false);
+ if (!tm) {
+ rcu_read_unlock();
+ goto reset;
+ }
+
+ if (tcp_metric_locked(tm, TCP_METRIC_CWND))
+ tp->snd_cwnd_clamp = tcp_metric_get(tm, TCP_METRIC_CWND);
+
+ val = READ_ONCE(net->ipv4.sysctl_tcp_no_ssthresh_metrics_save) ?
+ 0 : tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
+ if (val) {
+ tp->snd_ssthresh = val;
+ if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
+ tp->snd_ssthresh = tp->snd_cwnd_clamp;
+ }
+ val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
+ if (val && tp->reordering != val)
+ tp->reordering = val;
+
+ crtt = tcp_metric_get(tm, TCP_METRIC_RTT);
+ rcu_read_unlock();
+reset:
+ /* The initial RTT measurement from the SYN/SYN-ACK is not ideal
+ * to seed the RTO for later data packets because SYN packets are
+ * small. Use the per-dst cached values to seed the RTO but keep
+ * the RTT estimator variables intact (e.g., srtt, mdev, rttvar).
+ * Later the RTO will be updated immediately upon obtaining the first
+ * data RTT sample (tcp_rtt_estimator()). Hence the cached RTT only
+ * influences the first RTO but not later RTT estimation.
+ *
+ * But if RTT is not available from the SYN (due to retransmits or
+ * syn cookies) or the cache, force a conservative 3secs timeout.
+ *
+ * A bit of theory. RTT is time passed after "normal" sized packet
+ * is sent until it is ACKed. In normal circumstances sending small
+ * packets force peer to delay ACKs and calculation is correct too.
+ * The algorithm is adaptive and, provided we follow specs, it
+ * NEVER underestimate RTT. BUT! If peer tries to make some clever
+ * tricks sort of "quick acks" for time long enough to decrease RTT
+ * to low value, and then abruptly stops to do it and starts to delay
+ * ACKs, wait for troubles.
+ */
+ if (crtt > tp->srtt_us) {
+ /* Set RTO like tcp_rtt_estimator(), but from cached RTT. */
+ crtt /= 8 * USEC_PER_SEC / HZ;
+ inet_csk(sk)->icsk_rto = crtt + max(2 * crtt, tcp_rto_min(sk));
+ } else if (tp->srtt_us == 0) {
+ /* RFC6298: 5.7 We've failed to get a valid RTT sample from
+ * 3WHS. This is most likely due to retransmission,
+ * including spurious one. Reset the RTO back to 3secs
+ * from the more aggressive 1sec to avoid more spurious
+ * retransmission.
+ */
+ tp->rttvar_us = jiffies_to_usecs(TCP_TIMEOUT_FALLBACK);
+ tp->mdev_us = tp->mdev_max_us = tp->rttvar_us;
+
+ inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK;
+ }
+}
+
+bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst)
+{
+ struct tcp_metrics_block *tm;
+ bool ret;
+
+ if (!dst)
+ return false;
+
+ rcu_read_lock();
+ tm = __tcp_get_metrics_req(req, dst);
+ if (tm && tcp_metric_get(tm, TCP_METRIC_RTT))
+ ret = true;
+ else
+ ret = false;
+ rcu_read_unlock();
+
+ return ret;
+}
+
+void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
+ struct tcp_fastopen_cookie *cookie)
+{
+ struct tcp_metrics_block *tm;
+
+ rcu_read_lock();
+ tm = tcp_get_metrics(sk, __sk_dst_get(sk), false);
+ if (tm) {
+ struct tcp_fastopen_metrics *tfom = &tm->tcpm_fastopen;
+ unsigned int seq;
+
+ do {
+ seq = read_seqbegin(&fastopen_seqlock);
+ if (tfom->mss)
+ *mss = tfom->mss;
+ *cookie = tfom->cookie;
+ if (cookie->len <= 0 && tfom->try_exp == 1)
+ cookie->exp = true;
+ } while (read_seqretry(&fastopen_seqlock, seq));
+ }
+ rcu_read_unlock();
+}
+
+void tcp_fastopen_cache_set(struct sock *sk, u16 mss,
+ struct tcp_fastopen_cookie *cookie, bool syn_lost,
+ u16 try_exp)
+{
+ struct dst_entry *dst = __sk_dst_get(sk);
+ struct tcp_metrics_block *tm;
+
+ if (!dst)
+ return;
+ rcu_read_lock();
+ tm = tcp_get_metrics(sk, dst, true);
+ if (tm) {
+ struct tcp_fastopen_metrics *tfom = &tm->tcpm_fastopen;
+
+ write_seqlock_bh(&fastopen_seqlock);
+ if (mss)
+ tfom->mss = mss;
+ if (cookie && cookie->len > 0)
+ tfom->cookie = *cookie;
+ else if (try_exp > tfom->try_exp &&
+ tfom->cookie.len <= 0 && !tfom->cookie.exp)
+ tfom->try_exp = try_exp;
+ if (syn_lost) {
+ ++tfom->syn_loss;
+ tfom->last_syn_loss = jiffies;
+ } else
+ tfom->syn_loss = 0;
+ write_sequnlock_bh(&fastopen_seqlock);
+ }
+ rcu_read_unlock();
+}
+
+static struct genl_family tcp_metrics_nl_family;
+
+static const struct nla_policy tcp_metrics_nl_policy[TCP_METRICS_ATTR_MAX + 1] = {
+ [TCP_METRICS_ATTR_ADDR_IPV4] = { .type = NLA_U32, },
+ [TCP_METRICS_ATTR_ADDR_IPV6] =
+ NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)),
+
+ [TCP_METRICS_ATTR_SADDR_IPV4] = { .type = NLA_U32, },
+ [TCP_METRICS_ATTR_SADDR_IPV6] =
+ NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)),
+
+ /* Following attributes are not received for GET/DEL,
+ * we keep them for reference
+ */
+#if 0
+ [TCP_METRICS_ATTR_AGE] = { .type = NLA_MSECS, },
+ [TCP_METRICS_ATTR_TW_TSVAL] = { .type = NLA_U32, },
+ [TCP_METRICS_ATTR_TW_TS_STAMP] = { .type = NLA_S32, },
+ [TCP_METRICS_ATTR_VALS] = { .type = NLA_NESTED, },
+ [TCP_METRICS_ATTR_FOPEN_MSS] = { .type = NLA_U16, },
+ [TCP_METRICS_ATTR_FOPEN_SYN_DROPS] = { .type = NLA_U16, },
+ [TCP_METRICS_ATTR_FOPEN_SYN_DROP_TS] = { .type = NLA_MSECS, },
+ [TCP_METRICS_ATTR_FOPEN_COOKIE] = { .type = NLA_BINARY,
+ .len = TCP_FASTOPEN_COOKIE_MAX, },
+#endif
+};
+
+/* Add attributes, caller cancels its header on failure */
+static int tcp_metrics_fill_info(struct sk_buff *msg,
+ struct tcp_metrics_block *tm)
+{
+ struct nlattr *nest;
+ int i;
+
+ switch (tm->tcpm_daddr.family) {
+ case AF_INET:
+ if (nla_put_in_addr(msg, TCP_METRICS_ATTR_ADDR_IPV4,
+ inetpeer_get_addr_v4(&tm->tcpm_daddr)) < 0)
+ goto nla_put_failure;
+ if (nla_put_in_addr(msg, TCP_METRICS_ATTR_SADDR_IPV4,
+ inetpeer_get_addr_v4(&tm->tcpm_saddr)) < 0)
+ goto nla_put_failure;
+ break;
+ case AF_INET6:
+ if (nla_put_in6_addr(msg, TCP_METRICS_ATTR_ADDR_IPV6,
+ inetpeer_get_addr_v6(&tm->tcpm_daddr)) < 0)
+ goto nla_put_failure;
+ if (nla_put_in6_addr(msg, TCP_METRICS_ATTR_SADDR_IPV6,
+ inetpeer_get_addr_v6(&tm->tcpm_saddr)) < 0)
+ goto nla_put_failure;
+ break;
+ default:
+ return -EAFNOSUPPORT;
+ }
+
+ if (nla_put_msecs(msg, TCP_METRICS_ATTR_AGE,
+ jiffies - READ_ONCE(tm->tcpm_stamp),
+ TCP_METRICS_ATTR_PAD) < 0)
+ goto nla_put_failure;
+
+ {
+ int n = 0;
+
+ nest = nla_nest_start_noflag(msg, TCP_METRICS_ATTR_VALS);
+ if (!nest)
+ goto nla_put_failure;
+ for (i = 0; i < TCP_METRIC_MAX_KERNEL + 1; i++) {
+ u32 val = tcp_metric_get(tm, i);
+
+ if (!val)
+ continue;
+ if (i == TCP_METRIC_RTT) {
+ if (nla_put_u32(msg, TCP_METRIC_RTT_US + 1,
+ val) < 0)
+ goto nla_put_failure;
+ n++;
+ val = max(val / 1000, 1U);
+ }
+ if (i == TCP_METRIC_RTTVAR) {
+ if (nla_put_u32(msg, TCP_METRIC_RTTVAR_US + 1,
+ val) < 0)
+ goto nla_put_failure;
+ n++;
+ val = max(val / 1000, 1U);
+ }
+ if (nla_put_u32(msg, i + 1, val) < 0)
+ goto nla_put_failure;
+ n++;
+ }
+ if (n)
+ nla_nest_end(msg, nest);
+ else
+ nla_nest_cancel(msg, nest);
+ }
+
+ {
+ struct tcp_fastopen_metrics tfom_copy[1], *tfom;
+ unsigned int seq;
+
+ do {
+ seq = read_seqbegin(&fastopen_seqlock);
+ tfom_copy[0] = tm->tcpm_fastopen;
+ } while (read_seqretry(&fastopen_seqlock, seq));
+
+ tfom = tfom_copy;
+ if (tfom->mss &&
+ nla_put_u16(msg, TCP_METRICS_ATTR_FOPEN_MSS,
+ tfom->mss) < 0)
+ goto nla_put_failure;
+ if (tfom->syn_loss &&
+ (nla_put_u16(msg, TCP_METRICS_ATTR_FOPEN_SYN_DROPS,
+ tfom->syn_loss) < 0 ||
+ nla_put_msecs(msg, TCP_METRICS_ATTR_FOPEN_SYN_DROP_TS,
+ jiffies - tfom->last_syn_loss,
+ TCP_METRICS_ATTR_PAD) < 0))
+ goto nla_put_failure;
+ if (tfom->cookie.len > 0 &&
+ nla_put(msg, TCP_METRICS_ATTR_FOPEN_COOKIE,
+ tfom->cookie.len, tfom->cookie.val) < 0)
+ goto nla_put_failure;
+ }
+
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+static int tcp_metrics_dump_info(struct sk_buff *skb,
+ struct netlink_callback *cb,
+ struct tcp_metrics_block *tm)
+{
+ void *hdr;
+
+ hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ &tcp_metrics_nl_family, NLM_F_MULTI,
+ TCP_METRICS_CMD_GET);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (tcp_metrics_fill_info(skb, tm) < 0)
+ goto nla_put_failure;
+
+ genlmsg_end(skb, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(skb, hdr);
+ return -EMSGSIZE;
+}
+
+static int tcp_metrics_nl_dump(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ unsigned int max_rows = 1U << tcp_metrics_hash_log;
+ unsigned int row, s_row = cb->args[0];
+ int s_col = cb->args[1], col = s_col;
+ int res = 0;
+
+ for (row = s_row; row < max_rows; row++, s_col = 0) {
+ struct tcp_metrics_block *tm;
+ struct tcpm_hash_bucket *hb = tcp_metrics_hash + row;
+
+ rcu_read_lock();
+ for (col = 0, tm = rcu_dereference(hb->chain); tm;
+ tm = rcu_dereference(tm->tcpm_next), col++) {
+ if (!net_eq(tm_net(tm), net))
+ continue;
+ if (col < s_col)
+ continue;
+ res = tcp_metrics_dump_info(skb, cb, tm);
+ if (res < 0) {
+ rcu_read_unlock();
+ goto done;
+ }
+ }
+ rcu_read_unlock();
+ }
+
+done:
+ cb->args[0] = row;
+ cb->args[1] = col;
+ return res;
+}
+
+static int __parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr,
+ unsigned int *hash, int optional, int v4, int v6)
+{
+ struct nlattr *a;
+
+ a = info->attrs[v4];
+ if (a) {
+ inetpeer_set_addr_v4(addr, nla_get_in_addr(a));
+ if (hash)
+ *hash = ipv4_addr_hash(inetpeer_get_addr_v4(addr));
+ return 0;
+ }
+ a = info->attrs[v6];
+ if (a) {
+ struct in6_addr in6;
+
+ in6 = nla_get_in6_addr(a);
+ inetpeer_set_addr_v6(addr, &in6);
+ if (hash)
+ *hash = ipv6_addr_hash(inetpeer_get_addr_v6(addr));
+ return 0;
+ }
+ return optional ? 1 : -EAFNOSUPPORT;
+}
+
+static int parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr,
+ unsigned int *hash, int optional)
+{
+ return __parse_nl_addr(info, addr, hash, optional,
+ TCP_METRICS_ATTR_ADDR_IPV4,
+ TCP_METRICS_ATTR_ADDR_IPV6);
+}
+
+static int parse_nl_saddr(struct genl_info *info, struct inetpeer_addr *addr)
+{
+ return __parse_nl_addr(info, addr, NULL, 0,
+ TCP_METRICS_ATTR_SADDR_IPV4,
+ TCP_METRICS_ATTR_SADDR_IPV6);
+}
+
+static int tcp_metrics_nl_cmd_get(struct sk_buff *skb, struct genl_info *info)
+{
+ struct tcp_metrics_block *tm;
+ struct inetpeer_addr saddr, daddr;
+ unsigned int hash;
+ struct sk_buff *msg;
+ struct net *net = genl_info_net(info);
+ void *reply;
+ int ret;
+ bool src = true;
+
+ ret = parse_nl_addr(info, &daddr, &hash, 0);
+ if (ret < 0)
+ return ret;
+
+ ret = parse_nl_saddr(info, &saddr);
+ if (ret < 0)
+ src = false;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ reply = genlmsg_put_reply(msg, info, &tcp_metrics_nl_family, 0,
+ info->genlhdr->cmd);
+ if (!reply)
+ goto nla_put_failure;
+
+ hash ^= net_hash_mix(net);
+ hash = hash_32(hash, tcp_metrics_hash_log);
+ ret = -ESRCH;
+ rcu_read_lock();
+ for (tm = rcu_dereference(tcp_metrics_hash[hash].chain); tm;
+ tm = rcu_dereference(tm->tcpm_next)) {
+ if (addr_same(&tm->tcpm_daddr, &daddr) &&
+ (!src || addr_same(&tm->tcpm_saddr, &saddr)) &&
+ net_eq(tm_net(tm), net)) {
+ ret = tcp_metrics_fill_info(msg, tm);
+ break;
+ }
+ }
+ rcu_read_unlock();
+ if (ret < 0)
+ goto out_free;
+
+ genlmsg_end(msg, reply);
+ return genlmsg_reply(msg, info);
+
+nla_put_failure:
+ ret = -EMSGSIZE;
+
+out_free:
+ nlmsg_free(msg);
+ return ret;
+}
+
+static void tcp_metrics_flush_all(struct net *net)
+{
+ unsigned int max_rows = 1U << tcp_metrics_hash_log;
+ struct tcpm_hash_bucket *hb = tcp_metrics_hash;
+ struct tcp_metrics_block *tm;
+ unsigned int row;
+
+ for (row = 0; row < max_rows; row++, hb++) {
+ struct tcp_metrics_block __rcu **pp = &hb->chain;
+ bool match;
+
+ if (!rcu_access_pointer(*pp))
+ continue;
+
+ spin_lock_bh(&tcp_metrics_lock);
+ for (tm = deref_locked(*pp); tm; tm = deref_locked(*pp)) {
+ match = net ? net_eq(tm_net(tm), net) :
+ !check_net(tm_net(tm));
+ if (match) {
+ rcu_assign_pointer(*pp, tm->tcpm_next);
+ kfree_rcu(tm, rcu_head);
+ } else {
+ pp = &tm->tcpm_next;
+ }
+ }
+ spin_unlock_bh(&tcp_metrics_lock);
+ cond_resched();
+ }
+}
+
+static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info)
+{
+ struct tcpm_hash_bucket *hb;
+ struct tcp_metrics_block *tm;
+ struct tcp_metrics_block __rcu **pp;
+ struct inetpeer_addr saddr, daddr;
+ unsigned int hash;
+ struct net *net = genl_info_net(info);
+ int ret;
+ bool src = true, found = false;
+
+ ret = parse_nl_addr(info, &daddr, &hash, 1);
+ if (ret < 0)
+ return ret;
+ if (ret > 0) {
+ tcp_metrics_flush_all(net);
+ return 0;
+ }
+ ret = parse_nl_saddr(info, &saddr);
+ if (ret < 0)
+ src = false;
+
+ hash ^= net_hash_mix(net);
+ hash = hash_32(hash, tcp_metrics_hash_log);
+ hb = tcp_metrics_hash + hash;
+ pp = &hb->chain;
+ spin_lock_bh(&tcp_metrics_lock);
+ for (tm = deref_locked(*pp); tm; tm = deref_locked(*pp)) {
+ if (addr_same(&tm->tcpm_daddr, &daddr) &&
+ (!src || addr_same(&tm->tcpm_saddr, &saddr)) &&
+ net_eq(tm_net(tm), net)) {
+ rcu_assign_pointer(*pp, tm->tcpm_next);
+ kfree_rcu(tm, rcu_head);
+ found = true;
+ } else {
+ pp = &tm->tcpm_next;
+ }
+ }
+ spin_unlock_bh(&tcp_metrics_lock);
+ if (!found)
+ return -ESRCH;
+ return 0;
+}
+
+static const struct genl_small_ops tcp_metrics_nl_ops[] = {
+ {
+ .cmd = TCP_METRICS_CMD_GET,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = tcp_metrics_nl_cmd_get,
+ .dumpit = tcp_metrics_nl_dump,
+ },
+ {
+ .cmd = TCP_METRICS_CMD_DEL,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = tcp_metrics_nl_cmd_del,
+ .flags = GENL_ADMIN_PERM,
+ },
+};
+
+static struct genl_family tcp_metrics_nl_family __ro_after_init = {
+ .hdrsize = 0,
+ .name = TCP_METRICS_GENL_NAME,
+ .version = TCP_METRICS_GENL_VERSION,
+ .maxattr = TCP_METRICS_ATTR_MAX,
+ .policy = tcp_metrics_nl_policy,
+ .netnsok = true,
+ .parallel_ops = true,
+ .module = THIS_MODULE,
+ .small_ops = tcp_metrics_nl_ops,
+ .n_small_ops = ARRAY_SIZE(tcp_metrics_nl_ops),
+ .resv_start_op = TCP_METRICS_CMD_DEL + 1,
+};
+
+static unsigned int tcpmhash_entries __initdata;
+static int __init set_tcpmhash_entries(char *str)
+{
+ ssize_t ret;
+
+ if (!str)
+ return 0;
+
+ ret = kstrtouint(str, 0, &tcpmhash_entries);
+ if (ret)
+ return 0;
+
+ return 1;
+}
+__setup("tcpmhash_entries=", set_tcpmhash_entries);
+
+static void __init tcp_metrics_hash_alloc(void)
+{
+ unsigned int slots = tcpmhash_entries;
+ size_t size;
+
+ if (!slots) {
+ if (totalram_pages() >= 128 * 1024)
+ slots = 16 * 1024;
+ else
+ slots = 8 * 1024;
+ }
+
+ tcp_metrics_hash_log = order_base_2(slots);
+ size = sizeof(struct tcpm_hash_bucket) << tcp_metrics_hash_log;
+
+ tcp_metrics_hash = kvzalloc(size, GFP_KERNEL);
+ if (!tcp_metrics_hash)
+ panic("Could not allocate the tcp_metrics hash table\n");
+}
+
+static void __net_exit tcp_net_metrics_exit_batch(struct list_head *net_exit_list)
+{
+ tcp_metrics_flush_all(NULL);
+}
+
+static __net_initdata struct pernet_operations tcp_net_metrics_ops = {
+ .exit_batch = tcp_net_metrics_exit_batch,
+};
+
+void __init tcp_metrics_init(void)
+{
+ int ret;
+
+ tcp_metrics_hash_alloc();
+
+ ret = register_pernet_subsys(&tcp_net_metrics_ops);
+ if (ret < 0)
+ panic("Could not register tcp_net_metrics_ops\n");
+
+ ret = genl_register_family(&tcp_metrics_nl_family);
+ if (ret < 0)
+ panic("Could not register tcp_metrics generic netlink\n");
+}
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 6dddf59c1fb9..bd5462154f97 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -5,8 +6,6 @@
*
* Implementation of the Transmission Control Protocol(TCP).
*
- * Version: $Id: tcp_minisocks.c,v 1.15 2002/02/01 22:01:04 davem Exp $
- *
* Authors: Ross Biro
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
* Mark Evans, <evansmp@uhura.aston.ac.uk>
@@ -20,52 +19,55 @@
* Jorge Cwik, <jorge@laser.satlink.net>
*/
-#include <linux/mm.h>
-#include <linux/module.h>
-#include <linux/sysctl.h>
-#include <linux/workqueue.h>
#include <net/tcp.h>
-#include <net/inet_common.h>
+#include <net/tcp_ecn.h>
#include <net/xfrm.h>
+#include <net/busy_poll.h>
+#include <net/rstreason.h>
+#include <net/psp.h>
-#ifdef CONFIG_SYSCTL
-#define SYNC_INIT 0 /* let the user enable it */
-#else
-#define SYNC_INIT 1
-#endif
-
-int sysctl_tcp_syncookies __read_mostly = SYNC_INIT;
-int sysctl_tcp_abort_on_overflow __read_mostly;
-
-struct inet_timewait_death_row tcp_death_row = {
- .sysctl_max_tw_buckets = NR_FILE * 2,
- .period = TCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS,
- .death_lock = __SPIN_LOCK_UNLOCKED(tcp_death_row.death_lock),
- .hashinfo = &tcp_hashinfo,
- .tw_timer = TIMER_INITIALIZER(inet_twdr_hangman, 0,
- (unsigned long)&tcp_death_row),
- .twkill_work = __WORK_INITIALIZER(tcp_death_row.twkill_work,
- inet_twdr_twkill_work,
- &tcp_death_row),
-/* Short-time timewait calendar */
-
- .twcal_hand = -1,
- .twcal_timer = TIMER_INITIALIZER(inet_twdr_twcal_tick, 0,
- (unsigned long)&tcp_death_row),
-};
-
-EXPORT_SYMBOL_GPL(tcp_death_row);
-
-static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
+static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
{
if (seq == s_win)
- return 1;
+ return true;
if (after(end_seq, s_win) && before(seq, e_win))
- return 1;
- return (seq == e_win && seq == end_seq);
+ return true;
+ return seq == e_win && seq == end_seq;
}
-/*
+static enum tcp_tw_status
+tcp_timewait_check_oow_rate_limit(struct inet_timewait_sock *tw,
+ const struct sk_buff *skb, int mib_idx)
+{
+ struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
+
+ if (!tcp_oow_rate_limited(twsk_net(tw), skb, mib_idx,
+ &tcptw->tw_last_oow_ack_time)) {
+ /* Send ACK. Note, we do not put the bucket,
+ * it will be released by caller.
+ */
+ return TCP_TW_ACK_OOW;
+ }
+
+ /* We are rate-limiting, so just release the tw sock and drop skb. */
+ inet_twsk_put(tw);
+ return TCP_TW_SUCCESS;
+}
+
+static void twsk_rcv_nxt_update(struct tcp_timewait_sock *tcptw, u32 seq,
+ u32 rcv_nxt)
+{
+#ifdef CONFIG_TCP_AO
+ struct tcp_ao_info *ao;
+
+ ao = rcu_dereference(tcptw->ao_info);
+ if (unlikely(ao && seq < rcv_nxt))
+ WRITE_ONCE(ao->rcv_sne, ao->rcv_sne + 1);
+#endif
+ WRITE_ONCE(tcptw->tw_rcv_nxt, seq);
+}
+
+/*
* * Main purpose of TIME-WAIT state is to close connection gracefully,
* when one of ends sits in LAST-ACK or CLOSING retransmitting FIN
* (and, probably, tail of data) and one or more our ACKs are lost.
@@ -92,44 +94,64 @@ static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
* spinlock it. I do not want! Well, probability of misbehaviour
* is ridiculously low and, seems, we could use some mb() tricks
* to avoid misread sequence numbers, states etc. --ANK
+ *
+ * We don't need to initialize tmp_out.sack_ok as we don't use the results
*/
enum tcp_tw_status
tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
- const struct tcphdr *th)
+ const struct tcphdr *th, u32 *tw_isn,
+ enum skb_drop_reason *drop_reason)
{
struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
+ u32 rcv_nxt = READ_ONCE(tcptw->tw_rcv_nxt);
struct tcp_options_received tmp_opt;
- int paws_reject = 0;
+ enum skb_drop_reason psp_drop;
+ bool paws_reject = false;
+ int ts_recent_stamp;
+
+ /* Instead of dropping immediately, wait to see what value is
+ * returned. We will accept a non psp-encapsulated syn in the
+ * case where TCP_TW_SYN is returned.
+ */
+ psp_drop = psp_twsk_rx_policy_check(tw, skb);
tmp_opt.saw_tstamp = 0;
- if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
- tcp_parse_options(skb, &tmp_opt, 0);
+ ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp);
+ if (th->doff > (sizeof(*th) >> 2) && ts_recent_stamp) {
+ tcp_parse_options(twsk_net(tw), skb, &tmp_opt, 0, NULL);
if (tmp_opt.saw_tstamp) {
- tmp_opt.ts_recent = tcptw->tw_ts_recent;
- tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
- paws_reject = tcp_paws_check(&tmp_opt, th->rst);
+ if (tmp_opt.rcv_tsecr)
+ tmp_opt.rcv_tsecr -= tcptw->tw_ts_offset;
+ tmp_opt.ts_recent = READ_ONCE(tcptw->tw_ts_recent);
+ tmp_opt.ts_recent_stamp = ts_recent_stamp;
+ paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
}
}
- if (tw->tw_substate == TCP_FIN_WAIT2) {
+ if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2) {
/* Just repeat all the checks of tcp_rcv_state_process() */
+ if (psp_drop)
+ goto out_put;
+
/* Out of window, send ACK */
if (paws_reject ||
!tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
- tcptw->tw_rcv_nxt,
- tcptw->tw_rcv_nxt + tcptw->tw_rcv_wnd))
- return TCP_TW_ACK;
+ rcv_nxt,
+ rcv_nxt + tcptw->tw_rcv_wnd))
+ return tcp_timewait_check_oow_rate_limit(
+ tw, skb, LINUX_MIB_TCPACKSKIPPEDFINWAIT2);
if (th->rst)
goto kill;
- if (th->syn && !before(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt))
- goto kill_with_rst;
+ if (th->syn && !before(TCP_SKB_CB(skb)->seq, rcv_nxt))
+ return TCP_TW_RST;
/* Dup ACK? */
- if (!after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) ||
+ if (!th->ack ||
+ !after(TCP_SKB_CB(skb)->end_seq, rcv_nxt) ||
TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) {
inet_twsk_put(tw);
return TCP_TW_SUCCESS;
@@ -139,34 +161,25 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
* reset.
*/
if (!th->fin ||
- TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) {
-kill_with_rst:
- inet_twsk_deschedule(tw, &tcp_death_row);
- inet_twsk_put(tw);
+ TCP_SKB_CB(skb)->end_seq != rcv_nxt + 1)
return TCP_TW_RST;
- }
/* FIN arrived, enter true time-wait state. */
- tw->tw_substate = TCP_TIME_WAIT;
- tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+ WRITE_ONCE(tw->tw_substate, TCP_TIME_WAIT);
+ twsk_rcv_nxt_update(tcptw, TCP_SKB_CB(skb)->end_seq,
+ rcv_nxt);
+
if (tmp_opt.saw_tstamp) {
- tcptw->tw_ts_recent_stamp = xtime.tv_sec;
- tcptw->tw_ts_recent = tmp_opt.rcv_tsval;
+ u64 ts = tcp_clock_ms();
+
+ WRITE_ONCE(tw->tw_entry_stamp, ts);
+ WRITE_ONCE(tcptw->tw_ts_recent_stamp,
+ div_u64(ts, MSEC_PER_SEC));
+ WRITE_ONCE(tcptw->tw_ts_recent,
+ tmp_opt.rcv_tsval);
}
- /* I am shamed, but failed to make it more elegant.
- * Yes, it is direct reference to IP, which is impossible
- * to generalize to IPv6. Taking into account that IPv6
- * do not understand recycling in any case, it not
- * a big problem in practice. --ANK */
- if (tw->tw_family == AF_INET &&
- tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp &&
- tcp_v4_tw_remember_stamp(tw))
- inet_twsk_schedule(tw, &tcp_death_row, tw->tw_timeout,
- TCP_TIMEWAIT_LEN);
- else
- inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
- TCP_TIMEWAIT_LEN);
+ inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN);
return TCP_TW_ACK;
}
@@ -177,39 +190,43 @@ kill_with_rst:
* "When a connection is [...] on TIME-WAIT state [...]
* [a TCP] MAY accept a new SYN from the remote TCP to
* reopen the connection directly, if it:
- *
+ *
* (1) assigns its initial sequence number for the new
* connection to be larger than the largest sequence
* number it used on the previous connection incarnation,
* and
*
- * (2) returns to TIME-WAIT state if the SYN turns out
+ * (2) returns to TIME-WAIT state if the SYN turns out
* to be an old duplicate".
*/
if (!paws_reject &&
- (TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt &&
+ (TCP_SKB_CB(skb)->seq == rcv_nxt &&
(TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) {
/* In window segment, it may be only reset or bare ack. */
+ if (psp_drop)
+ goto out_put;
+
if (th->rst) {
/* This is TIME_WAIT assassination, in two flavors.
* Oh well... nobody has a sufficient solution to this
* protocol bug yet.
*/
- if (sysctl_tcp_rfc1337 == 0) {
+ if (!READ_ONCE(twsk_net(tw)->ipv4.sysctl_tcp_rfc1337)) {
kill:
- inet_twsk_deschedule(tw, &tcp_death_row);
- inet_twsk_put(tw);
+ inet_twsk_deschedule_put(tw);
return TCP_TW_SUCCESS;
}
+ } else {
+ inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN);
}
- inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
- TCP_TIMEWAIT_LEN);
if (tmp_opt.saw_tstamp) {
- tcptw->tw_ts_recent = tmp_opt.rcv_tsval;
- tcptw->tw_ts_recent_stamp = xtime.tv_sec;
+ WRITE_ONCE(tcptw->tw_ts_recent,
+ tmp_opt.rcv_tsval);
+ WRITE_ONCE(tcptw->tw_ts_recent_stamp,
+ ktime_get_seconds());
}
inet_twsk_put(tw);
@@ -234,20 +251,25 @@ kill:
*/
if (th->syn && !th->rst && !th->ack && !paws_reject &&
- (after(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt) ||
+ (after(TCP_SKB_CB(skb)->seq, rcv_nxt) ||
(tmp_opt.saw_tstamp &&
- (s32)(tcptw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) {
+ (s32)(READ_ONCE(tcptw->tw_ts_recent) - tmp_opt.rcv_tsval) < 0))) {
u32 isn = tcptw->tw_snd_nxt + 65535 + 2;
if (isn == 0)
isn++;
- TCP_SKB_CB(skb)->when = isn;
+ *tw_isn = isn;
return TCP_TW_SYN;
}
- if (paws_reject)
- NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED);
+ if (psp_drop)
+ goto out_put;
- if(!th->rst) {
+ if (paws_reject) {
+ *drop_reason = SKB_DROP_REASON_TCP_RFC7323_TW_PAWS;
+ __NET_INC_STATS(twsk_net(tw), LINUX_MIB_PAWS_TW_REJECTED);
+ }
+
+ if (!th->rst) {
/* In this case we must reset the TIMEWAIT timer.
*
* If it is ACKless SYN it may be both old duplicate
@@ -255,119 +277,263 @@ kill:
* Do not reschedule in the last case.
*/
if (paws_reject || th->ack)
- inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
- TCP_TIMEWAIT_LEN);
+ inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN);
- /* Send ACK. Note, we do not put the bucket,
- * it will be released by caller.
- */
- return TCP_TW_ACK;
+ return tcp_timewait_check_oow_rate_limit(
+ tw, skb, LINUX_MIB_TCPACKSKIPPEDTIMEWAIT);
}
+
+out_put:
inet_twsk_put(tw);
return TCP_TW_SUCCESS;
}
+EXPORT_IPV6_MOD(tcp_timewait_state_process);
+
+static void tcp_time_wait_init(struct sock *sk, struct tcp_timewait_sock *tcptw)
+{
+#ifdef CONFIG_TCP_MD5SIG
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_md5sig_key *key;
+
+ /*
+ * The timewait bucket does not have the key DB from the
+ * sock structure. We just make a quick copy of the
+ * md5 key being used (if indeed we are using one)
+ * so the timewait ack generating code has the key.
+ */
+ tcptw->tw_md5_key = NULL;
+ if (!static_branch_unlikely(&tcp_md5_needed.key))
+ return;
+
+ key = tp->af_specific->md5_lookup(sk, sk);
+ if (key) {
+ tcptw->tw_md5_key = kmemdup(key, sizeof(*key), GFP_ATOMIC);
+ if (!tcptw->tw_md5_key)
+ return;
+ if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key))
+ goto out_free;
+ }
+ return;
+out_free:
+ WARN_ON_ONCE(1);
+ kfree(tcptw->tw_md5_key);
+ tcptw->tw_md5_key = NULL;
+#endif
+}
-/*
+/*
* Move a socket to time-wait or dead fin-wait-2 state.
- */
+ */
void tcp_time_wait(struct sock *sk, int state, int timeo)
{
- struct inet_timewait_sock *tw = NULL;
const struct inet_connection_sock *icsk = inet_csk(sk);
- const struct tcp_sock *tp = tcp_sk(sk);
- int recycle_ok = 0;
-
- if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
- recycle_ok = icsk->icsk_af_ops->remember_stamp(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct net *net = sock_net(sk);
+ struct inet_timewait_sock *tw;
- if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets)
- tw = inet_twsk_alloc(sk, state);
+ tw = inet_twsk_alloc(sk, &net->ipv4.tcp_death_row, state);
- if (tw != NULL) {
+ if (tw) {
struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);
+ tw->tw_mark = sk->sk_mark;
+ tw->tw_priority = READ_ONCE(sk->sk_priority);
tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale;
+ /* refreshed when we enter true TIME-WAIT state */
+ tw->tw_entry_stamp = tcp_time_stamp_ms(tp);
tcptw->tw_rcv_nxt = tp->rcv_nxt;
tcptw->tw_snd_nxt = tp->snd_nxt;
tcptw->tw_rcv_wnd = tcp_receive_window(tp);
tcptw->tw_ts_recent = tp->rx_opt.ts_recent;
tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
-
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+ tcptw->tw_ts_offset = tp->tsoffset;
+ tw->tw_usec_ts = tp->tcp_usec_ts;
+ tcptw->tw_last_oow_ack_time = 0;
+ tcptw->tw_tx_delay = tp->tcp_tx_delay;
+ tw->tw_txhash = sk->sk_txhash;
+ tw->tw_tx_queue_mapping = sk->sk_tx_queue_mapping;
+#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
+ tw->tw_rx_queue_mapping = sk->sk_rx_queue_mapping;
+#endif
+#if IS_ENABLED(CONFIG_IPV6)
if (tw->tw_family == PF_INET6) {
struct ipv6_pinfo *np = inet6_sk(sk);
- struct inet6_timewait_sock *tw6;
- tw->tw_ipv6_offset = inet6_tw_offset(sk->sk_prot);
- tw6 = inet6_twsk((struct sock *)tw);
- ipv6_addr_copy(&tw6->tw_v6_daddr, &np->daddr);
- ipv6_addr_copy(&tw6->tw_v6_rcv_saddr, &np->rcv_saddr);
- tw->tw_ipv6only = np->ipv6only;
+ tw->tw_v6_daddr = sk->sk_v6_daddr;
+ tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
+ tw->tw_tclass = np->tclass;
+ tw->tw_flowlabel = be32_to_cpu(np->flow_label & IPV6_FLOWLABEL_MASK);
+ tw->tw_ipv6only = sk->sk_ipv6only;
}
#endif
-#ifdef CONFIG_TCP_MD5SIG
- /*
- * The timewait bucket does not have the key DB from the
- * sock structure. We just make a quick copy of the
- * md5 key being used (if indeed we are using one)
- * so the timewait ack generating code has the key.
- */
- do {
- struct tcp_md5sig_key *key;
- memset(tcptw->tw_md5_key, 0, sizeof(tcptw->tw_md5_key));
- tcptw->tw_md5_keylen = 0;
- key = tp->af_specific->md5_lookup(sk, sk);
- if (key != NULL) {
- memcpy(&tcptw->tw_md5_key, key->key, key->keylen);
- tcptw->tw_md5_keylen = key->keylen;
- if (tcp_alloc_md5sig_pool() == NULL)
- BUG();
- }
- } while(0);
-#endif
-
- /* Linkage updates. */
- __inet_twsk_hashdance(tw, sk, &tcp_hashinfo);
+ tcp_time_wait_init(sk, tcptw);
+ tcp_ao_time_wait(tcptw, tp);
/* Get the TIME_WAIT timeout firing. */
if (timeo < rto)
timeo = rto;
- if (recycle_ok) {
- tw->tw_timeout = rto;
- } else {
- tw->tw_timeout = TCP_TIMEWAIT_LEN;
- if (state == TCP_TIME_WAIT)
- timeo = TCP_TIMEWAIT_LEN;
- }
+ if (state == TCP_TIME_WAIT)
+ timeo = TCP_TIMEWAIT_LEN;
- inet_twsk_schedule(tw, &tcp_death_row, timeo,
- TCP_TIMEWAIT_LEN);
- inet_twsk_put(tw);
+ /* Linkage updates.
+ * Note that access to tw after this point is illegal.
+ */
+ inet_twsk_hashdance_schedule(tw, sk, net->ipv4.tcp_death_row.hashinfo, timeo);
} else {
/* Sorry, if we're out of memory, just CLOSE this
* socket up. We've got bigger problems than
* non-graceful socket closings.
*/
- LIMIT_NETDEBUG(KERN_INFO "TCP: time wait bucket table overflow\n");
+ NET_INC_STATS(net, LINUX_MIB_TCPTIMEWAITOVERFLOW);
}
tcp_update_metrics(sk);
tcp_done(sk);
}
+EXPORT_SYMBOL(tcp_time_wait);
void tcp_twsk_destructor(struct sock *sk)
{
#ifdef CONFIG_TCP_MD5SIG
- struct tcp_timewait_sock *twsk = tcp_twsk(sk);
- if (twsk->tw_md5_keylen)
- tcp_put_md5sig_pool();
+ if (static_branch_unlikely(&tcp_md5_needed.key)) {
+ struct tcp_timewait_sock *twsk = tcp_twsk(sk);
+
+ if (twsk->tw_md5_key) {
+ kfree(twsk->tw_md5_key);
+ static_branch_slow_dec_deferred(&tcp_md5_needed);
+ }
+ }
#endif
+ tcp_ao_destroy_sock(sk, true);
+ psp_twsk_assoc_free(inet_twsk(sk));
+}
+
+void tcp_twsk_purge(struct list_head *net_exit_list)
+{
+ bool purged_once = false;
+ struct net *net;
+
+ list_for_each_entry(net, net_exit_list, exit_list) {
+ if (net->ipv4.tcp_death_row.hashinfo->pernet) {
+ /* Even if tw_refcount == 1, we must clean up kernel reqsk */
+ inet_twsk_purge(net->ipv4.tcp_death_row.hashinfo);
+ } else if (!purged_once) {
+ inet_twsk_purge(&tcp_hashinfo);
+ purged_once = true;
+ }
+ }
+}
+
+/* Warning : This function is called without sk_listener being locked.
+ * Be sure to read socket fields once, as their value could change under us.
+ */
+void tcp_openreq_init_rwin(struct request_sock *req,
+ const struct sock *sk_listener,
+ const struct dst_entry *dst)
+{
+ struct inet_request_sock *ireq = inet_rsk(req);
+ const struct tcp_sock *tp = tcp_sk(sk_listener);
+ int full_space = tcp_full_space(sk_listener);
+ u32 window_clamp;
+ __u8 rcv_wscale;
+ u32 rcv_wnd;
+ int mss;
+
+ mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
+ window_clamp = READ_ONCE(tp->window_clamp);
+ /* Set this up on the first call only */
+ req->rsk_window_clamp = window_clamp ? : dst_metric(dst, RTAX_WINDOW);
+
+ /* limit the window selection if the user enforce a smaller rx buffer */
+ if (sk_listener->sk_userlocks & SOCK_RCVBUF_LOCK &&
+ (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0))
+ req->rsk_window_clamp = full_space;
+
+ rcv_wnd = tcp_rwnd_init_bpf((struct sock *)req);
+ if (rcv_wnd == 0)
+ rcv_wnd = dst_metric(dst, RTAX_INITRWND);
+ else if (full_space < rcv_wnd * mss)
+ full_space = rcv_wnd * mss;
+
+ /* tcp_full_space because it is guaranteed to be the first packet */
+ tcp_select_initial_window(sk_listener, full_space,
+ mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
+ &req->rsk_rcv_wnd,
+ &req->rsk_window_clamp,
+ ireq->wscale_ok,
+ &rcv_wscale,
+ rcv_wnd);
+ ireq->rcv_wscale = rcv_wscale;
+}
+
+static void tcp_ecn_openreq_child(struct sock *sk,
+ const struct request_sock *req,
+ const struct sk_buff *skb)
+{
+ const struct tcp_request_sock *treq = tcp_rsk(req);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (treq->accecn_ok) {
+ tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN);
+ tp->syn_ect_snt = treq->syn_ect_snt;
+ tcp_accecn_third_ack(sk, skb, treq->syn_ect_snt);
+ tp->saw_accecn_opt = treq->saw_accecn_opt;
+ tp->prev_ecnfield = treq->syn_ect_rcv;
+ tp->accecn_opt_demand = 1;
+ tcp_ecn_received_counters_payload(sk, skb);
+ } else {
+ tcp_ecn_mode_set(tp, inet_rsk(req)->ecn_ok ?
+ TCP_ECN_MODE_RFC3168 :
+ TCP_ECN_DISABLED);
+ }
}
-EXPORT_SYMBOL_GPL(tcp_twsk_destructor);
+void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ u32 ca_key = dst_metric(dst, RTAX_CC_ALGO);
+ bool ca_got_dst = false;
+
+ if (ca_key != TCP_CA_UNSPEC) {
+ const struct tcp_congestion_ops *ca;
+
+ rcu_read_lock();
+ ca = tcp_ca_find_key(ca_key);
+ if (likely(ca && bpf_try_module_get(ca, ca->owner))) {
+ icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst);
+ icsk->icsk_ca_ops = ca;
+ ca_got_dst = true;
+ }
+ rcu_read_unlock();
+ }
+
+ /* If no valid choice made yet, assign current system default ca. */
+ if (!ca_got_dst &&
+ (!icsk->icsk_ca_setsockopt ||
+ !bpf_try_module_get(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner)))
+ tcp_assign_congestion_control(sk);
+
+ tcp_set_ca_state(sk, TCP_CA_Open);
+}
+EXPORT_IPV6_MOD_GPL(tcp_ca_openreq_child);
+
+static void smc_check_reset_syn_req(const struct tcp_sock *oldtp,
+ struct request_sock *req,
+ struct tcp_sock *newtp)
+{
+#if IS_ENABLED(CONFIG_SMC)
+ struct inet_request_sock *ireq;
+
+ if (static_branch_unlikely(&tcp_have_smc)) {
+ ireq = inet_rsk(req);
+ if (oldtp->syn_smc && !ireq->smc_ok)
+ newtp->syn_smc = 0;
+ }
+#endif
+}
/* This is not only more efficient than what we used to do, it eliminates
* a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
@@ -375,140 +541,181 @@ EXPORT_SYMBOL_GPL(tcp_twsk_destructor);
* Actually, we could lots of memory writes here. tp of listening
* socket contains all necessary default parameters.
*/
-struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb)
+struct sock *tcp_create_openreq_child(const struct sock *sk,
+ struct request_sock *req,
+ struct sk_buff *skb)
{
- struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC);
-
- if (newsk != NULL) {
- const struct inet_request_sock *ireq = inet_rsk(req);
- struct tcp_request_sock *treq = tcp_rsk(req);
- struct inet_connection_sock *newicsk = inet_csk(sk);
- struct tcp_sock *newtp;
-
- /* Now setup tcp_sock */
- newtp = tcp_sk(newsk);
- newtp->pred_flags = 0;
- newtp->rcv_nxt = treq->rcv_isn + 1;
- newtp->snd_nxt = newtp->snd_una = newtp->snd_sml = treq->snt_isn + 1;
-
- tcp_prequeue_init(newtp);
-
- tcp_init_wl(newtp, treq->snt_isn, treq->rcv_isn);
-
- newtp->srtt = 0;
- newtp->mdev = TCP_TIMEOUT_INIT;
- newicsk->icsk_rto = TCP_TIMEOUT_INIT;
-
- newtp->packets_out = 0;
- newtp->left_out = 0;
- newtp->retrans_out = 0;
- newtp->sacked_out = 0;
- newtp->fackets_out = 0;
- newtp->snd_ssthresh = 0x7fffffff;
-
- /* So many TCP implementations out there (incorrectly) count the
- * initial SYN frame in their delayed-ACK and congestion control
- * algorithms that we must have the following bandaid to talk
- * efficiently to them. -DaveM
- */
- newtp->snd_cwnd = 2;
- newtp->snd_cwnd_cnt = 0;
- newtp->bytes_acked = 0;
+ struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC);
+ const struct inet_request_sock *ireq = inet_rsk(req);
+ struct tcp_request_sock *treq = tcp_rsk(req);
+ struct inet_connection_sock *newicsk;
+ const struct tcp_sock *oldtp;
+ struct tcp_sock *newtp;
+ u32 seq;
+
+ if (!newsk)
+ return NULL;
- newtp->frto_counter = 0;
- newtp->frto_highmark = 0;
+ newicsk = inet_csk(newsk);
+ newtp = tcp_sk(newsk);
+ oldtp = tcp_sk(sk);
- newicsk->icsk_ca_ops = &tcp_init_congestion_ops;
+ smc_check_reset_syn_req(oldtp, req, newtp);
- tcp_set_ca_state(newsk, TCP_CA_Open);
- tcp_init_xmit_timers(newsk);
- skb_queue_head_init(&newtp->out_of_order_queue);
- newtp->rcv_wup = treq->rcv_isn + 1;
- newtp->write_seq = treq->snt_isn + 1;
- newtp->pushed_seq = newtp->write_seq;
- newtp->copied_seq = treq->rcv_isn + 1;
+ /* Now setup tcp_sock */
+ newtp->pred_flags = 0;
- newtp->rx_opt.saw_tstamp = 0;
+ seq = treq->rcv_isn + 1;
+ newtp->rcv_wup = seq;
+ WRITE_ONCE(newtp->copied_seq, seq);
+ WRITE_ONCE(newtp->rcv_nxt, seq);
+ newtp->segs_in = 1;
- newtp->rx_opt.dsack = 0;
- newtp->rx_opt.eff_sacks = 0;
+ seq = treq->snt_isn + 1;
+ newtp->snd_sml = newtp->snd_una = seq;
+ WRITE_ONCE(newtp->snd_nxt, seq);
+ newtp->snd_up = seq;
- newtp->rx_opt.num_sacks = 0;
- newtp->urg_data = 0;
+ INIT_LIST_HEAD(&newtp->tsq_node);
+ INIT_LIST_HEAD(&newtp->tsorted_sent_queue);
- if (sock_flag(newsk, SOCK_KEEPOPEN))
- inet_csk_reset_keepalive_timer(newsk,
- keepalive_time_when(newtp));
+ tcp_init_wl(newtp, treq->rcv_isn);
- newtp->rx_opt.tstamp_ok = ireq->tstamp_ok;
- if((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) {
- if (sysctl_tcp_fack)
- newtp->rx_opt.sack_ok |= 2;
- }
- newtp->window_clamp = req->window_clamp;
- newtp->rcv_ssthresh = req->rcv_wnd;
- newtp->rcv_wnd = req->rcv_wnd;
- newtp->rx_opt.wscale_ok = ireq->wscale_ok;
- if (newtp->rx_opt.wscale_ok) {
- newtp->rx_opt.snd_wscale = ireq->snd_wscale;
- newtp->rx_opt.rcv_wscale = ireq->rcv_wscale;
- } else {
- newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0;
- newtp->window_clamp = min(newtp->window_clamp, 65535U);
- }
- newtp->snd_wnd = ntohs(skb->h.th->window) << newtp->rx_opt.snd_wscale;
- newtp->max_window = newtp->snd_wnd;
+ minmax_reset(&newtp->rtt_min, tcp_jiffies32, ~0U);
+ newicsk->icsk_ack.lrcvtime = tcp_jiffies32;
+
+ newtp->lsndtime = tcp_jiffies32;
+ newsk->sk_txhash = READ_ONCE(treq->txhash);
+ newtp->total_retrans = req->num_retrans;
+
+ tcp_init_xmit_timers(newsk);
+ WRITE_ONCE(newtp->write_seq, newtp->pushed_seq = treq->snt_isn + 1);
+
+ if (sock_flag(newsk, SOCK_KEEPOPEN))
+ tcp_reset_keepalive_timer(newsk, keepalive_time_when(newtp));
- if (newtp->rx_opt.tstamp_ok) {
- newtp->rx_opt.ts_recent = req->ts_recent;
- newtp->rx_opt.ts_recent_stamp = xtime.tv_sec;
- newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
+ newtp->rx_opt.tstamp_ok = ireq->tstamp_ok;
+ newtp->rx_opt.sack_ok = ireq->sack_ok;
+ newtp->window_clamp = req->rsk_window_clamp;
+ newtp->rcv_ssthresh = req->rsk_rcv_wnd;
+ newtp->rcv_wnd = req->rsk_rcv_wnd;
+ newtp->rx_opt.wscale_ok = ireq->wscale_ok;
+ if (newtp->rx_opt.wscale_ok) {
+ newtp->rx_opt.snd_wscale = ireq->snd_wscale;
+ newtp->rx_opt.rcv_wscale = ireq->rcv_wscale;
+ } else {
+ newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0;
+ newtp->window_clamp = min(newtp->window_clamp, 65535U);
+ }
+ newtp->snd_wnd = ntohs(tcp_hdr(skb)->window) << newtp->rx_opt.snd_wscale;
+ newtp->max_window = newtp->snd_wnd;
+
+ if (newtp->rx_opt.tstamp_ok) {
+ newtp->tcp_usec_ts = treq->req_usec_ts;
+ newtp->rx_opt.ts_recent = req->ts_recent;
+ newtp->rx_opt.ts_recent_stamp = ktime_get_seconds();
+ newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
+ } else {
+ newtp->tcp_usec_ts = 0;
+ newtp->rx_opt.ts_recent_stamp = 0;
+ newtp->tcp_header_len = sizeof(struct tcphdr);
+ }
+ if (req->num_timeout) {
+ newtp->total_rto = req->num_timeout;
+ newtp->undo_marker = treq->snt_isn;
+ if (newtp->tcp_usec_ts) {
+ newtp->retrans_stamp = treq->snt_synack;
+ newtp->total_rto_time = (u32)(tcp_clock_us() -
+ newtp->retrans_stamp) / USEC_PER_MSEC;
} else {
- newtp->rx_opt.ts_recent_stamp = 0;
- newtp->tcp_header_len = sizeof(struct tcphdr);
+ newtp->retrans_stamp = div_u64(treq->snt_synack,
+ USEC_PER_SEC / TCP_TS_HZ);
+ newtp->total_rto_time = tcp_clock_ms() -
+ newtp->retrans_stamp;
}
+ newtp->total_rto_recoveries = 1;
+ }
+ newtp->tsoffset = treq->ts_off;
#ifdef CONFIG_TCP_MD5SIG
- newtp->md5sig_info = NULL; /*XXX*/
- if (newtp->af_specific->md5_lookup(sk, newsk))
- newtp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
+ newtp->md5sig_info = NULL; /*XXX*/
#endif
- if (skb->len >= TCP_MIN_RCVMSS+newtp->tcp_header_len)
- newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
- newtp->rx_opt.mss_clamp = req->mss;
- TCP_ECN_openreq_child(newtp, req);
+#ifdef CONFIG_TCP_AO
+ newtp->ao_info = NULL;
- TCP_INC_STATS_BH(TCP_MIB_PASSIVEOPENS);
+ if (tcp_rsk_used_ao(req)) {
+ struct tcp_ao_key *ao_key;
+
+ ao_key = treq->af_specific->ao_lookup(sk, req, tcp_rsk(req)->ao_keyid, -1);
+ if (ao_key)
+ newtp->tcp_header_len += tcp_ao_len_aligned(ao_key);
}
+ #endif
+ if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len)
+ newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
+ newtp->rx_opt.mss_clamp = req->mss;
+ tcp_ecn_openreq_child(newsk, req, skb);
+ newtp->fastopen_req = NULL;
+ RCU_INIT_POINTER(newtp->fastopen_rsk, NULL);
+
+ newtp->bpf_chg_cc_inprogress = 0;
+ tcp_bpf_clone(sk, newsk);
+
+ __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS);
+
+ xa_init_flags(&newsk->sk_user_frags, XA_FLAGS_ALLOC1);
+
return newsk;
}
+EXPORT_SYMBOL(tcp_create_openreq_child);
-/*
- * Process an incoming packet for SYN_RECV sockets represented
- * as a request_sock.
+/*
+ * Process an incoming packet for SYN_RECV sockets represented as a
+ * request_sock. Normally sk is the listener socket but for TFO it
+ * points to the child socket.
+ *
+ * XXX (TFO) - The current impl contains a special check for ack
+ * validation and inside tcp_v4_reqsk_send_ack(). Can we do better?
+ *
+ * We don't need to initialize tmp_opt.sack_ok as we don't use the results
+ *
+ * Note: If @fastopen is true, this can be called from process context.
+ * Otherwise, this is from BH context.
*/
-struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
+struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
struct request_sock *req,
- struct request_sock **prev)
+ bool fastopen, bool *req_stolen,
+ enum skb_drop_reason *drop_reason)
{
- struct tcphdr *th = skb->h.th;
- __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
- int paws_reject = 0;
struct tcp_options_received tmp_opt;
struct sock *child;
+ const struct tcphdr *th = tcp_hdr(skb);
+ __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
+ bool tsecr_reject = false;
+ bool paws_reject = false;
+ bool own_req;
tmp_opt.saw_tstamp = 0;
+ tmp_opt.accecn = 0;
if (th->doff > (sizeof(struct tcphdr)>>2)) {
- tcp_parse_options(skb, &tmp_opt, 0);
+ tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0, NULL);
if (tmp_opt.saw_tstamp) {
tmp_opt.ts_recent = req->ts_recent;
+ if (tmp_opt.rcv_tsecr) {
+ if (inet_rsk(req)->tstamp_ok && !fastopen)
+ tsecr_reject = !between(tmp_opt.rcv_tsecr,
+ tcp_rsk(req)->snt_tsval_first,
+ READ_ONCE(tcp_rsk(req)->snt_tsval_last));
+ tmp_opt.rcv_tsecr -= tcp_rsk(req)->ts_off;
+ }
/* We do not store true stamp, but it is not required,
* it can be estimated (approximately)
* from another data.
*/
- tmp_opt.ts_recent_stamp = xtime.tv_sec - ((TCP_TIMEOUT_INIT/HZ)<<req->retrans);
- paws_reject = tcp_paws_check(&tmp_opt, th->rst);
+ tmp_opt.ts_recent_stamp = ktime_get_seconds() -
+ tcp_reqsk_timeout(req) / HZ;
+ paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
}
}
@@ -532,8 +739,26 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
*
* Enforce "SYN-ACK" according to figure 8, figure 6
* of RFC793, fixed by RFC1122.
+ *
+ * Note that even if there is new data in the SYN packet
+ * they will be thrown away too.
+ *
+ * Reset timer after retransmitting SYNACK, similar to
+ * the idea of fast retransmit in recovery.
*/
- req->rsk_ops->rtx_syn_ack(sk, req, NULL);
+ if (!tcp_oow_rate_limited(sock_net(sk), skb,
+ LINUX_MIB_TCPACKSKIPPEDSYNRECV,
+ &tcp_rsk(req)->last_oow_ack_time) &&
+
+ !tcp_rtx_synack(sk, req)) {
+ unsigned long expires = jiffies;
+
+ expires += tcp_reqsk_timeout(req);
+ if (!fastopen)
+ mod_timer_pending(&req->rsk_timer, expires);
+ else
+ req->rsk_timer.expires = expires;
+ }
return NULL;
}
@@ -589,149 +814,190 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
* sent (the segment carries an unacceptable ACK) ...
* a reset is sent."
*
- * Invalid ACK: reset will be sent by listening socket
+ * Invalid ACK: reset will be sent by listening socket.
+ * Note that the ACK validity check for a Fast Open socket is done
+ * elsewhere and is checked directly against the child socket rather
+ * than req because user data may have been sent out.
*/
- if ((flg & TCP_FLAG_ACK) &&
- (TCP_SKB_CB(skb)->ack_seq != tcp_rsk(req)->snt_isn + 1))
+ if ((flg & TCP_FLAG_ACK) && !fastopen &&
+ (TCP_SKB_CB(skb)->ack_seq !=
+ tcp_rsk(req)->snt_isn + 1))
return sk;
- /* Also, it would be not so bad idea to check rcv_tsecr, which
- * is essentially ACK extension and too early or too late values
- * should cause reset in unsynchronized states.
- */
-
/* RFC793: "first check sequence number". */
- if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
- tcp_rsk(req)->rcv_isn + 1, tcp_rsk(req)->rcv_isn + 1 + req->rcv_wnd)) {
+ if (paws_reject || tsecr_reject ||
+ !tcp_in_window(TCP_SKB_CB(skb)->seq,
+ TCP_SKB_CB(skb)->end_seq,
+ tcp_rsk(req)->rcv_nxt,
+ tcp_rsk(req)->rcv_nxt +
+ tcp_synack_window(req))) {
/* Out of window: send ACK and drop. */
- if (!(flg & TCP_FLAG_RST))
- req->rsk_ops->send_ack(skb, req);
- if (paws_reject)
- NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED);
+ if (!(flg & TCP_FLAG_RST) &&
+ !tcp_oow_rate_limited(sock_net(sk), skb,
+ LINUX_MIB_TCPACKSKIPPEDSYNRECV,
+ &tcp_rsk(req)->last_oow_ack_time))
+ req->rsk_ops->send_ack(sk, skb, req);
+ if (paws_reject) {
+ SKB_DR_SET(*drop_reason, TCP_RFC7323_PAWS);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
+ } else if (tsecr_reject) {
+ SKB_DR_SET(*drop_reason, TCP_RFC7323_TSECR);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TSECRREJECTED);
+ } else {
+ SKB_DR_SET(*drop_reason, TCP_OVERWINDOW);
+ }
return NULL;
}
/* In sequence, PAWS is OK. */
- if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_isn + 1))
- req->ts_recent = tmp_opt.rcv_tsval;
+ if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) {
+ /* Truncate SYN, it is out of window starting
+ at tcp_rsk(req)->rcv_isn + 1. */
+ flg &= ~TCP_FLAG_SYN;
+ }
- if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) {
- /* Truncate SYN, it is out of window starting
- at tcp_rsk(req)->rcv_isn + 1. */
- flg &= ~TCP_FLAG_SYN;
- }
+ /* RFC793: "second check the RST bit" and
+ * "fourth, check the SYN bit"
+ */
+ if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN)) {
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
+ goto embryonic_reset;
+ }
- /* RFC793: "second check the RST bit" and
- * "fourth, check the SYN bit"
- */
- if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN)) {
- TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
- goto embryonic_reset;
- }
+ /* ACK sequence verified above, just make sure ACK is
+ * set. If ACK not set, just silently drop the packet.
+ *
+ * XXX (TFO) - if we ever allow "data after SYN", the
+ * following check needs to be removed.
+ */
+ if (!(flg & TCP_FLAG_ACK))
+ return NULL;
- /* ACK sequence verified above, just make sure ACK is
- * set. If ACK not set, just silently drop the packet.
- */
- if (!(flg & TCP_FLAG_ACK))
- return NULL;
-
- /* If TCP_DEFER_ACCEPT is set, drop bare ACK. */
- if (inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
- TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
- inet_rsk(req)->acked = 1;
- return NULL;
- }
+ if (tcp_rsk(req)->accecn_ok && tmp_opt.accecn &&
+ tcp_rsk(req)->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) {
+ u8 saw_opt = tcp_accecn_option_init(skb, tmp_opt.accecn);
- /* OK, ACK is valid, create big socket and
- * feed this segment to it. It will repeat all
- * the tests. THIS SEGMENT MUST MOVE SOCKET TO
- * ESTABLISHED STATE. If it will be dropped after
- * socket is created, wait for troubles.
- */
- child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb,
- req, NULL);
- if (child == NULL)
- goto listen_overflow;
-#ifdef CONFIG_TCP_MD5SIG
- else {
- /* Copy over the MD5 key from the original socket */
- struct tcp_md5sig_key *key;
- struct tcp_sock *tp = tcp_sk(sk);
- key = tp->af_specific->md5_lookup(sk, child);
- if (key != NULL) {
- /*
- * We're using one, so create a matching key on the
- * newsk structure. If we fail to get memory then we
- * end up not copying the key across. Shucks.
- */
- char *newkey = kmemdup(key->key, key->keylen,
- GFP_ATOMIC);
- if (newkey) {
- if (!tcp_alloc_md5sig_pool())
- BUG();
- tp->af_specific->md5_add(child, child,
- newkey,
- key->keylen);
- }
- }
+ tcp_rsk(req)->saw_accecn_opt = saw_opt;
+ if (tcp_rsk(req)->saw_accecn_opt == TCP_ACCECN_OPT_FAIL_SEEN) {
+ u8 fail_mode = TCP_ACCECN_OPT_FAIL_RECV;
+
+ tcp_rsk(req)->accecn_fail_mode |= fail_mode;
}
-#endif
+ }
- inet_csk_reqsk_queue_unlink(sk, req, prev);
- inet_csk_reqsk_queue_removed(sk, req);
+ /* For Fast Open no more processing is needed (sk is the
+ * child socket).
+ */
+ if (fastopen)
+ return sk;
+
+ /* While TCP_DEFER_ACCEPT is active, drop bare ACK. */
+ if (req->num_timeout < READ_ONCE(inet_csk(sk)->icsk_accept_queue.rskq_defer_accept) &&
+ TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
+ inet_rsk(req)->acked = 1;
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP);
+ return NULL;
+ }
- inet_csk_reqsk_queue_add(sk, req, child);
+ /* OK, ACK is valid, create big socket and
+ * feed this segment to it. It will repeat all
+ * the tests. THIS SEGMENT MUST MOVE SOCKET TO
+ * ESTABLISHED STATE. If it will be dropped after
+ * socket is created, wait for troubles.
+ */
+ child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL,
+ req, &own_req);
+ if (!child)
+ goto listen_overflow;
+
+ if (own_req && tmp_opt.saw_tstamp &&
+ !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_nxt))
+ tcp_sk(child)->rx_opt.ts_recent = tmp_opt.rcv_tsval;
+
+ if (own_req && rsk_drop_req(req)) {
+ reqsk_queue_removed(&inet_csk(req->rsk_listener)->icsk_accept_queue, req);
+ inet_csk_reqsk_queue_drop_and_put(req->rsk_listener, req);
return child;
+ }
- listen_overflow:
- if (!sysctl_tcp_abort_on_overflow) {
- inet_rsk(req)->acked = 1;
- return NULL;
- }
+ sock_rps_save_rxhash(child, skb);
+ tcp_synack_rtt_meas(child, req);
+ *req_stolen = !own_req;
+ return inet_csk_complete_hashdance(sk, child, req, own_req);
- embryonic_reset:
- NET_INC_STATS_BH(LINUX_MIB_EMBRYONICRSTS);
- if (!(flg & TCP_FLAG_RST))
- req->rsk_ops->send_reset(sk, skb);
+listen_overflow:
+ SKB_DR_SET(*drop_reason, TCP_LISTEN_OVERFLOW);
+ if (sk != req->rsk_listener)
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE);
- inet_csk_reqsk_queue_drop(sk, req, prev);
+ if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_abort_on_overflow)) {
+ inet_rsk(req)->acked = 1;
return NULL;
+ }
+
+embryonic_reset:
+ if (!(flg & TCP_FLAG_RST)) {
+ /* Received a bad SYN pkt - for TFO We try not to reset
+ * the local connection unless it's really necessary to
+ * avoid becoming vulnerable to outside attack aiming at
+ * resetting legit local connections.
+ */
+ req->rsk_ops->send_reset(sk, skb, SK_RST_REASON_INVALID_SYN);
+ } else if (fastopen) { /* received a valid RST pkt */
+ reqsk_fastopen_remove(sk, req, true);
+ tcp_reset(sk, skb);
+ }
+ if (!fastopen) {
+ bool unlinked = inet_csk_reqsk_queue_drop(sk, req);
+
+ if (unlinked)
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_EMBRYONICRSTS);
+ *req_stolen = !unlinked;
+ }
+ return NULL;
}
+EXPORT_IPV6_MOD(tcp_check_req);
/*
* Queue segment on the new socket if the new socket is active,
* otherwise we just shortcircuit this and continue with
* the new socket.
+ *
+ * For the vast majority of cases child->sk_state will be TCP_SYN_RECV
+ * when entering. But other states are possible due to a race condition
+ * where after __inet_lookup_established() fails but before the listener
+ * locked is obtained, other packets cause the same connection to
+ * be created.
*/
-int tcp_child_process(struct sock *parent, struct sock *child,
- struct sk_buff *skb)
+enum skb_drop_reason tcp_child_process(struct sock *parent, struct sock *child,
+ struct sk_buff *skb)
+ __releases(&((child)->sk_lock.slock))
{
- int ret = 0;
+ enum skb_drop_reason reason = SKB_NOT_DROPPED_YET;
int state = child->sk_state;
- if (!sock_owned_by_user(child)) {
- ret = tcp_rcv_state_process(child, skb, skb->h.th, skb->len);
+ /* record sk_napi_id and sk_rx_queue_mapping of child. */
+ sk_mark_napi_id_set(child, skb);
+ tcp_segs_in(tcp_sk(child), skb);
+ if (!sock_owned_by_user(child)) {
+ reason = tcp_rcv_state_process(child, skb);
/* Wakeup parent, send SIGIO */
if (state == TCP_SYN_RECV && child->sk_state != state)
- parent->sk_data_ready(parent, 0);
+ parent->sk_data_ready(parent);
} else {
/* Alas, it is possible again, because we do lookup
* in main socket hash table and lock on listening
* socket does not protect us more.
*/
- sk_add_backlog(child, skb);
+ __sk_add_backlog(child, skb);
}
bh_unlock_sock(child);
sock_put(child);
- return ret;
+ return reason;
}
-
-EXPORT_SYMBOL(tcp_check_req);
-EXPORT_SYMBOL(tcp_child_process);
-EXPORT_SYMBOL(tcp_create_openreq_child);
-EXPORT_SYMBOL(tcp_timewait_state_process);
+EXPORT_IPV6_MOD(tcp_child_process);
diff --git a/net/ipv4/tcp_nv.c b/net/ipv4/tcp_nv.c
new file mode 100644
index 000000000000..a60662f4bdf9
--- /dev/null
+++ b/net/ipv4/tcp_nv.c
@@ -0,0 +1,501 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * TCP NV: TCP with Congestion Avoidance
+ *
+ * TCP-NV is a successor of TCP-Vegas that has been developed to
+ * deal with the issues that occur in modern networks.
+ * Like TCP-Vegas, TCP-NV supports true congestion avoidance,
+ * the ability to detect congestion before packet losses occur.
+ * When congestion (queue buildup) starts to occur, TCP-NV
+ * predicts what the cwnd size should be for the current
+ * throughput and it reduces the cwnd proportionally to
+ * the difference between the current cwnd and the predicted cwnd.
+ *
+ * NV is only recommeneded for traffic within a data center, and when
+ * all the flows are NV (at least those within the data center). This
+ * is due to the inherent unfairness between flows using losses to
+ * detect congestion (congestion control) and those that use queue
+ * buildup to detect congestion (congestion avoidance).
+ *
+ * Note: High NIC coalescence values may lower the performance of NV
+ * due to the increased noise in RTT values. In particular, we have
+ * seen issues with rx-frames values greater than 8.
+ *
+ * TODO:
+ * 1) Add mechanism to deal with reverse congestion.
+ */
+
+#include <linux/module.h>
+#include <linux/math64.h>
+#include <net/tcp.h>
+#include <linux/inet_diag.h>
+
+/* TCP NV parameters
+ *
+ * nv_pad Max number of queued packets allowed in network
+ * nv_pad_buffer Do not grow cwnd if this closed to nv_pad
+ * nv_reset_period How often (in) seconds)to reset min_rtt
+ * nv_min_cwnd Don't decrease cwnd below this if there are no losses
+ * nv_cong_dec_mult Decrease cwnd by X% (30%) of congestion when detected
+ * nv_ssthresh_factor On congestion set ssthresh to this * <desired cwnd> / 8
+ * nv_rtt_factor RTT averaging factor
+ * nv_loss_dec_factor Decrease cwnd to this (80%) when losses occur
+ * nv_dec_eval_min_calls Wait this many RTT measurements before dec cwnd
+ * nv_inc_eval_min_calls Wait this many RTT measurements before inc cwnd
+ * nv_ssthresh_eval_min_calls Wait this many RTT measurements before stopping
+ * slow-start due to congestion
+ * nv_stop_rtt_cnt Only grow cwnd for this many RTTs after non-congestion
+ * nv_rtt_min_cnt Wait these many RTTs before making congesion decision
+ * nv_cwnd_growth_rate_neg
+ * nv_cwnd_growth_rate_pos
+ * How quickly to double growth rate (not rate) of cwnd when not
+ * congested. One value (nv_cwnd_growth_rate_neg) for when
+ * rate < 1 pkt/RTT (after losses). The other (nv_cwnd_growth_rate_pos)
+ * otherwise.
+ */
+
+static int nv_pad __read_mostly = 10;
+static int nv_pad_buffer __read_mostly = 2;
+static int nv_reset_period __read_mostly = 5; /* in seconds */
+static int nv_min_cwnd __read_mostly = 2;
+static int nv_cong_dec_mult __read_mostly = 30 * 128 / 100; /* = 30% */
+static int nv_ssthresh_factor __read_mostly = 8; /* = 1 */
+static int nv_rtt_factor __read_mostly = 128; /* = 1/2*old + 1/2*new */
+static int nv_loss_dec_factor __read_mostly = 819; /* => 80% */
+static int nv_cwnd_growth_rate_neg __read_mostly = 8;
+static int nv_cwnd_growth_rate_pos __read_mostly; /* 0 => fixed like Reno */
+static int nv_dec_eval_min_calls __read_mostly = 60;
+static int nv_inc_eval_min_calls __read_mostly = 20;
+static int nv_ssthresh_eval_min_calls __read_mostly = 30;
+static int nv_stop_rtt_cnt __read_mostly = 10;
+static int nv_rtt_min_cnt __read_mostly = 2;
+
+module_param(nv_pad, int, 0644);
+MODULE_PARM_DESC(nv_pad, "max queued packets allowed in network");
+module_param(nv_reset_period, int, 0644);
+MODULE_PARM_DESC(nv_reset_period, "nv_min_rtt reset period (secs)");
+module_param(nv_min_cwnd, int, 0644);
+MODULE_PARM_DESC(nv_min_cwnd, "NV will not decrease cwnd below this value"
+ " without losses");
+
+/* TCP NV Parameters */
+struct tcpnv {
+ unsigned long nv_min_rtt_reset_jiffies; /* when to switch to
+ * nv_min_rtt_new */
+ s8 cwnd_growth_factor; /* Current cwnd growth factor,
+ * < 0 => less than 1 packet/RTT */
+ u8 available8;
+ u16 available16;
+ u8 nv_allow_cwnd_growth:1, /* whether cwnd can grow */
+ nv_reset:1, /* whether to reset values */
+ nv_catchup:1; /* whether we are growing because
+ * of temporary cwnd decrease */
+ u8 nv_eval_call_cnt; /* call count since last eval */
+ u8 nv_min_cwnd; /* nv won't make a ca decision if cwnd is
+ * smaller than this. It may grow to handle
+ * TSO, LRO and interrupt coalescence because
+ * with these a small cwnd cannot saturate
+ * the link. Note that this is different from
+ * the file local nv_min_cwnd */
+ u8 nv_rtt_cnt; /* RTTs without making ca decision */;
+ u32 nv_last_rtt; /* last rtt */
+ u32 nv_min_rtt; /* active min rtt. Used to determine slope */
+ u32 nv_min_rtt_new; /* min rtt for future use */
+ u32 nv_base_rtt; /* If non-zero it represents the threshold for
+ * congestion */
+ u32 nv_lower_bound_rtt; /* Used in conjunction with nv_base_rtt. It is
+ * set to 80% of nv_base_rtt. It helps reduce
+ * unfairness between flows */
+ u32 nv_rtt_max_rate; /* max rate seen during current RTT */
+ u32 nv_rtt_start_seq; /* current RTT ends when packet arrives
+ * acking beyond nv_rtt_start_seq */
+ u32 nv_last_snd_una; /* Previous value of tp->snd_una. It is
+ * used to determine bytes acked since last
+ * call to bictcp_acked */
+ u32 nv_no_cong_cnt; /* Consecutive no congestion decisions */
+};
+
+#define NV_INIT_RTT U32_MAX
+#define NV_MIN_CWND 4
+#define NV_MIN_CWND_GROW 2
+#define NV_TSO_CWND_BOUND 80
+
+static inline void tcpnv_reset(struct tcpnv *ca, struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ ca->nv_reset = 0;
+ ca->nv_no_cong_cnt = 0;
+ ca->nv_rtt_cnt = 0;
+ ca->nv_last_rtt = 0;
+ ca->nv_rtt_max_rate = 0;
+ ca->nv_rtt_start_seq = tp->snd_una;
+ ca->nv_eval_call_cnt = 0;
+ ca->nv_last_snd_una = tp->snd_una;
+}
+
+static void tcpnv_init(struct sock *sk)
+{
+ struct tcpnv *ca = inet_csk_ca(sk);
+ int base_rtt;
+
+ tcpnv_reset(ca, sk);
+
+ /* See if base_rtt is available from socket_ops bpf program.
+ * It is meant to be used in environments, such as communication
+ * within a datacenter, where we have reasonable estimates of
+ * RTTs
+ */
+ base_rtt = tcp_call_bpf(sk, BPF_SOCK_OPS_BASE_RTT, 0, NULL);
+ if (base_rtt > 0) {
+ ca->nv_base_rtt = base_rtt;
+ ca->nv_lower_bound_rtt = (base_rtt * 205) >> 8; /* 80% */
+ } else {
+ ca->nv_base_rtt = 0;
+ ca->nv_lower_bound_rtt = 0;
+ }
+
+ ca->nv_allow_cwnd_growth = 1;
+ ca->nv_min_rtt_reset_jiffies = jiffies + 2 * HZ;
+ ca->nv_min_rtt = NV_INIT_RTT;
+ ca->nv_min_rtt_new = NV_INIT_RTT;
+ ca->nv_min_cwnd = NV_MIN_CWND;
+ ca->nv_catchup = 0;
+ ca->cwnd_growth_factor = 0;
+}
+
+/* If provided, apply upper (base_rtt) and lower (lower_bound_rtt)
+ * bounds to RTT.
+ */
+inline u32 nv_get_bounded_rtt(struct tcpnv *ca, u32 val)
+{
+ if (ca->nv_lower_bound_rtt > 0 && val < ca->nv_lower_bound_rtt)
+ return ca->nv_lower_bound_rtt;
+ else if (ca->nv_base_rtt > 0 && val > ca->nv_base_rtt)
+ return ca->nv_base_rtt;
+ else
+ return val;
+}
+
+static void tcpnv_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcpnv *ca = inet_csk_ca(sk);
+ u32 cnt;
+
+ if (!tcp_is_cwnd_limited(sk))
+ return;
+
+ /* Only grow cwnd if NV has not detected congestion */
+ if (!ca->nv_allow_cwnd_growth)
+ return;
+
+ if (tcp_in_slow_start(tp)) {
+ acked = tcp_slow_start(tp, acked);
+ if (!acked)
+ return;
+ }
+
+ if (ca->cwnd_growth_factor < 0) {
+ cnt = tcp_snd_cwnd(tp) << -ca->cwnd_growth_factor;
+ tcp_cong_avoid_ai(tp, cnt, acked);
+ } else {
+ cnt = max(4U, tcp_snd_cwnd(tp) >> ca->cwnd_growth_factor);
+ tcp_cong_avoid_ai(tp, cnt, acked);
+ }
+}
+
+static u32 tcpnv_recalc_ssthresh(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+
+ return max((tcp_snd_cwnd(tp) * nv_loss_dec_factor) >> 10, 2U);
+}
+
+static void tcpnv_state(struct sock *sk, u8 new_state)
+{
+ struct tcpnv *ca = inet_csk_ca(sk);
+
+ if (new_state == TCP_CA_Open && ca->nv_reset) {
+ tcpnv_reset(ca, sk);
+ } else if (new_state == TCP_CA_Loss || new_state == TCP_CA_CWR ||
+ new_state == TCP_CA_Recovery) {
+ ca->nv_reset = 1;
+ ca->nv_allow_cwnd_growth = 0;
+ if (new_state == TCP_CA_Loss) {
+ /* Reset cwnd growth factor to Reno value */
+ if (ca->cwnd_growth_factor > 0)
+ ca->cwnd_growth_factor = 0;
+ /* Decrease growth rate if allowed */
+ if (nv_cwnd_growth_rate_neg > 0 &&
+ ca->cwnd_growth_factor > -8)
+ ca->cwnd_growth_factor--;
+ }
+ }
+}
+
+/* Do congestion avoidance calculations for TCP-NV
+ */
+static void tcpnv_acked(struct sock *sk, const struct ack_sample *sample)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcpnv *ca = inet_csk_ca(sk);
+ unsigned long now = jiffies;
+ u64 rate64;
+ u32 rate, max_win, cwnd_by_slope;
+ u32 avg_rtt;
+ u32 bytes_acked = 0;
+
+ /* Some calls are for duplicates without timetamps */
+ if (sample->rtt_us < 0)
+ return;
+
+ /* If not in TCP_CA_Open or TCP_CA_Disorder states, skip. */
+ if (icsk->icsk_ca_state != TCP_CA_Open &&
+ icsk->icsk_ca_state != TCP_CA_Disorder)
+ return;
+
+ /* Stop cwnd growth if we were in catch up mode */
+ if (ca->nv_catchup && tcp_snd_cwnd(tp) >= nv_min_cwnd) {
+ ca->nv_catchup = 0;
+ ca->nv_allow_cwnd_growth = 0;
+ }
+
+ bytes_acked = tp->snd_una - ca->nv_last_snd_una;
+ ca->nv_last_snd_una = tp->snd_una;
+
+ if (sample->in_flight == 0)
+ return;
+
+ /* Calculate moving average of RTT */
+ if (nv_rtt_factor > 0) {
+ if (ca->nv_last_rtt > 0) {
+ avg_rtt = (((u64)sample->rtt_us) * nv_rtt_factor +
+ ((u64)ca->nv_last_rtt)
+ * (256 - nv_rtt_factor)) >> 8;
+ } else {
+ avg_rtt = sample->rtt_us;
+ ca->nv_min_rtt = avg_rtt << 1;
+ }
+ ca->nv_last_rtt = avg_rtt;
+ } else {
+ avg_rtt = sample->rtt_us;
+ }
+
+ /* rate in 100's bits per second */
+ rate64 = ((u64)sample->in_flight) * 80000;
+ do_div(rate64, avg_rtt ?: 1);
+ rate = (u32)rate64;
+
+ /* Remember the maximum rate seen during this RTT
+ * Note: It may be more than one RTT. This function should be
+ * called at least nv_dec_eval_min_calls times.
+ */
+ if (ca->nv_rtt_max_rate < rate)
+ ca->nv_rtt_max_rate = rate;
+
+ /* We have valid information, increment counter */
+ if (ca->nv_eval_call_cnt < 255)
+ ca->nv_eval_call_cnt++;
+
+ /* Apply bounds to rtt. Only used to update min_rtt */
+ avg_rtt = nv_get_bounded_rtt(ca, avg_rtt);
+
+ /* update min rtt if necessary */
+ if (avg_rtt < ca->nv_min_rtt)
+ ca->nv_min_rtt = avg_rtt;
+
+ /* update future min_rtt if necessary */
+ if (avg_rtt < ca->nv_min_rtt_new)
+ ca->nv_min_rtt_new = avg_rtt;
+
+ /* nv_min_rtt is updated with the minimum (possibley averaged) rtt
+ * seen in the last sysctl_tcp_nv_reset_period seconds (i.e. a
+ * warm reset). This new nv_min_rtt will be continued to be updated
+ * and be used for another sysctl_tcp_nv_reset_period seconds,
+ * when it will be updated again.
+ * In practice we introduce some randomness, so the actual period used
+ * is chosen randomly from the range:
+ * [sysctl_tcp_nv_reset_period*3/4, sysctl_tcp_nv_reset_period*5/4)
+ */
+ if (time_after_eq(now, ca->nv_min_rtt_reset_jiffies)) {
+ unsigned char rand;
+
+ ca->nv_min_rtt = ca->nv_min_rtt_new;
+ ca->nv_min_rtt_new = NV_INIT_RTT;
+ get_random_bytes(&rand, 1);
+ ca->nv_min_rtt_reset_jiffies =
+ now + ((nv_reset_period * (384 + rand) * HZ) >> 9);
+ /* Every so often we decrease ca->nv_min_cwnd in case previous
+ * value is no longer accurate.
+ */
+ ca->nv_min_cwnd = max(ca->nv_min_cwnd / 2, NV_MIN_CWND);
+ }
+
+ /* Once per RTT check if we need to do congestion avoidance */
+ if (before(ca->nv_rtt_start_seq, tp->snd_una)) {
+ ca->nv_rtt_start_seq = tp->snd_nxt;
+ if (ca->nv_rtt_cnt < 0xff)
+ /* Increase counter for RTTs without CA decision */
+ ca->nv_rtt_cnt++;
+
+ /* If this function is only called once within an RTT
+ * the cwnd is probably too small (in some cases due to
+ * tso, lro or interrupt coalescence), so we increase
+ * ca->nv_min_cwnd.
+ */
+ if (ca->nv_eval_call_cnt == 1 &&
+ bytes_acked >= (ca->nv_min_cwnd - 1) * tp->mss_cache &&
+ ca->nv_min_cwnd < (NV_TSO_CWND_BOUND + 1)) {
+ ca->nv_min_cwnd = min(ca->nv_min_cwnd
+ + NV_MIN_CWND_GROW,
+ NV_TSO_CWND_BOUND + 1);
+ ca->nv_rtt_start_seq = tp->snd_nxt +
+ ca->nv_min_cwnd * tp->mss_cache;
+ ca->nv_eval_call_cnt = 0;
+ ca->nv_allow_cwnd_growth = 1;
+ return;
+ }
+
+ /* Find the ideal cwnd for current rate from slope
+ * slope = 80000.0 * mss / nv_min_rtt
+ * cwnd_by_slope = nv_rtt_max_rate / slope
+ */
+ cwnd_by_slope = (u32)
+ div64_u64(((u64)ca->nv_rtt_max_rate) * ca->nv_min_rtt,
+ 80000ULL * tp->mss_cache);
+ max_win = cwnd_by_slope + nv_pad;
+
+ /* If cwnd > max_win, decrease cwnd
+ * if cwnd < max_win, grow cwnd
+ * else leave the same
+ */
+ if (tcp_snd_cwnd(tp) > max_win) {
+ /* there is congestion, check that it is ok
+ * to make a CA decision
+ * 1. We should have at least nv_dec_eval_min_calls
+ * data points before making a CA decision
+ * 2. We only make a congesion decision after
+ * nv_rtt_min_cnt RTTs
+ */
+ if (ca->nv_rtt_cnt < nv_rtt_min_cnt) {
+ return;
+ } else if (tp->snd_ssthresh == TCP_INFINITE_SSTHRESH) {
+ if (ca->nv_eval_call_cnt <
+ nv_ssthresh_eval_min_calls)
+ return;
+ /* otherwise we will decrease cwnd */
+ } else if (ca->nv_eval_call_cnt <
+ nv_dec_eval_min_calls) {
+ if (ca->nv_allow_cwnd_growth &&
+ ca->nv_rtt_cnt > nv_stop_rtt_cnt)
+ ca->nv_allow_cwnd_growth = 0;
+ return;
+ }
+
+ /* We have enough data to determine we are congested */
+ ca->nv_allow_cwnd_growth = 0;
+ tp->snd_ssthresh =
+ (nv_ssthresh_factor * max_win) >> 3;
+ if (tcp_snd_cwnd(tp) - max_win > 2) {
+ /* gap > 2, we do exponential cwnd decrease */
+ int dec;
+
+ dec = max(2U, ((tcp_snd_cwnd(tp) - max_win) *
+ nv_cong_dec_mult) >> 7);
+ tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) - dec);
+ } else if (nv_cong_dec_mult > 0) {
+ tcp_snd_cwnd_set(tp, max_win);
+ }
+ if (ca->cwnd_growth_factor > 0)
+ ca->cwnd_growth_factor = 0;
+ ca->nv_no_cong_cnt = 0;
+ } else if (tcp_snd_cwnd(tp) <= max_win - nv_pad_buffer) {
+ /* There is no congestion, grow cwnd if allowed*/
+ if (ca->nv_eval_call_cnt < nv_inc_eval_min_calls)
+ return;
+
+ ca->nv_allow_cwnd_growth = 1;
+ ca->nv_no_cong_cnt++;
+ if (ca->cwnd_growth_factor < 0 &&
+ nv_cwnd_growth_rate_neg > 0 &&
+ ca->nv_no_cong_cnt > nv_cwnd_growth_rate_neg) {
+ ca->cwnd_growth_factor++;
+ ca->nv_no_cong_cnt = 0;
+ } else if (ca->cwnd_growth_factor >= 0 &&
+ nv_cwnd_growth_rate_pos > 0 &&
+ ca->nv_no_cong_cnt >
+ nv_cwnd_growth_rate_pos) {
+ ca->cwnd_growth_factor++;
+ ca->nv_no_cong_cnt = 0;
+ }
+ } else {
+ /* cwnd is in-between, so do nothing */
+ return;
+ }
+
+ /* update state */
+ ca->nv_eval_call_cnt = 0;
+ ca->nv_rtt_cnt = 0;
+ ca->nv_rtt_max_rate = 0;
+
+ /* Don't want to make cwnd < nv_min_cwnd
+ * (it wasn't before, if it is now is because nv
+ * decreased it).
+ */
+ if (tcp_snd_cwnd(tp) < nv_min_cwnd)
+ tcp_snd_cwnd_set(tp, nv_min_cwnd);
+ }
+}
+
+/* Extract info for Tcp socket info provided via netlink */
+static size_t tcpnv_get_info(struct sock *sk, u32 ext, int *attr,
+ union tcp_cc_info *info)
+{
+ const struct tcpnv *ca = inet_csk_ca(sk);
+
+ if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
+ info->vegas.tcpv_enabled = 1;
+ info->vegas.tcpv_rttcnt = ca->nv_rtt_cnt;
+ info->vegas.tcpv_rtt = ca->nv_last_rtt;
+ info->vegas.tcpv_minrtt = ca->nv_min_rtt;
+
+ *attr = INET_DIAG_VEGASINFO;
+ return sizeof(struct tcpvegas_info);
+ }
+ return 0;
+}
+
+static struct tcp_congestion_ops tcpnv __read_mostly = {
+ .init = tcpnv_init,
+ .ssthresh = tcpnv_recalc_ssthresh,
+ .cong_avoid = tcpnv_cong_avoid,
+ .set_state = tcpnv_state,
+ .undo_cwnd = tcp_reno_undo_cwnd,
+ .pkts_acked = tcpnv_acked,
+ .get_info = tcpnv_get_info,
+
+ .owner = THIS_MODULE,
+ .name = "nv",
+};
+
+static int __init tcpnv_register(void)
+{
+ BUILD_BUG_ON(sizeof(struct tcpnv) > ICSK_CA_PRIV_SIZE);
+
+ return tcp_register_congestion_control(&tcpnv);
+}
+
+static void __exit tcpnv_unregister(void)
+{
+ tcp_unregister_congestion_control(&tcpnv);
+}
+
+module_init(tcpnv_register);
+module_exit(tcpnv_unregister);
+
+MODULE_AUTHOR("Lawrence Brakmo");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("TCP NV");
+MODULE_VERSION("1.0");
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
new file mode 100644
index 000000000000..fdda18b1abda
--- /dev/null
+++ b/net/ipv4/tcp_offload.c
@@ -0,0 +1,478 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * IPV4 GSO/GRO offload support
+ * Linux INET implementation
+ *
+ * TCPv4 GSO/GRO support
+ */
+
+#include <linux/indirect_call_wrapper.h>
+#include <linux/skbuff.h>
+#include <net/gro.h>
+#include <net/gso.h>
+#include <net/tcp.h>
+#include <net/protocol.h>
+
+static void tcp_gso_tstamp(struct sk_buff *skb, struct sk_buff *gso_skb,
+ unsigned int seq, unsigned int mss)
+{
+ u32 flags = skb_shinfo(gso_skb)->tx_flags & SKBTX_ANY_TSTAMP;
+ u32 ts_seq = skb_shinfo(gso_skb)->tskey;
+
+ while (skb) {
+ if (before(ts_seq, seq + mss)) {
+ skb_shinfo(skb)->tx_flags |= flags;
+ skb_shinfo(skb)->tskey = ts_seq;
+ return;
+ }
+
+ skb = skb->next;
+ seq += mss;
+ }
+}
+
+static void __tcpv4_gso_segment_csum(struct sk_buff *seg,
+ __be32 *oldip, __be32 newip,
+ __be16 *oldport, __be16 newport)
+{
+ struct tcphdr *th;
+ struct iphdr *iph;
+
+ if (*oldip == newip && *oldport == newport)
+ return;
+
+ th = tcp_hdr(seg);
+ iph = ip_hdr(seg);
+
+ inet_proto_csum_replace4(&th->check, seg, *oldip, newip, true);
+ inet_proto_csum_replace2(&th->check, seg, *oldport, newport, false);
+ *oldport = newport;
+
+ csum_replace4(&iph->check, *oldip, newip);
+ *oldip = newip;
+}
+
+static struct sk_buff *__tcpv4_gso_segment_list_csum(struct sk_buff *segs)
+{
+ const struct tcphdr *th;
+ const struct iphdr *iph;
+ struct sk_buff *seg;
+ struct tcphdr *th2;
+ struct iphdr *iph2;
+
+ seg = segs;
+ th = tcp_hdr(seg);
+ iph = ip_hdr(seg);
+ th2 = tcp_hdr(seg->next);
+ iph2 = ip_hdr(seg->next);
+
+ if (!(*(const u32 *)&th->source ^ *(const u32 *)&th2->source) &&
+ iph->daddr == iph2->daddr && iph->saddr == iph2->saddr)
+ return segs;
+
+ while ((seg = seg->next)) {
+ th2 = tcp_hdr(seg);
+ iph2 = ip_hdr(seg);
+
+ __tcpv4_gso_segment_csum(seg,
+ &iph2->saddr, iph->saddr,
+ &th2->source, th->source);
+ __tcpv4_gso_segment_csum(seg,
+ &iph2->daddr, iph->daddr,
+ &th2->dest, th->dest);
+ }
+
+ return segs;
+}
+
+static struct sk_buff *__tcp4_gso_segment_list(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ skb = skb_segment_list(skb, features, skb_mac_header_len(skb));
+ if (IS_ERR(skb))
+ return skb;
+
+ return __tcpv4_gso_segment_list_csum(skb);
+}
+
+static struct sk_buff *tcp4_gso_segment(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ if (!(skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4))
+ return ERR_PTR(-EINVAL);
+
+ if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
+ return ERR_PTR(-EINVAL);
+
+ if (skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST) {
+ struct tcphdr *th = tcp_hdr(skb);
+
+ if (skb_pagelen(skb) - th->doff * 4 == skb_shinfo(skb)->gso_size)
+ return __tcp4_gso_segment_list(skb, features);
+
+ skb->ip_summed = CHECKSUM_NONE;
+ }
+
+ if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
+ const struct iphdr *iph = ip_hdr(skb);
+ struct tcphdr *th = tcp_hdr(skb);
+
+ /* Set up checksum pseudo header, usually expect stack to
+ * have done this already.
+ */
+
+ th->check = 0;
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ __tcp_v4_send_check(skb, iph->saddr, iph->daddr);
+ }
+
+ return tcp_gso_segment(skb, features);
+}
+
+struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ struct sk_buff *segs = ERR_PTR(-EINVAL);
+ unsigned int sum_truesize = 0;
+ struct tcphdr *th;
+ unsigned int thlen;
+ unsigned int seq;
+ unsigned int oldlen;
+ unsigned int mss;
+ struct sk_buff *gso_skb = skb;
+ __sum16 newcheck;
+ bool ooo_okay, copy_destructor;
+ bool ecn_cwr_mask;
+ __wsum delta;
+
+ th = tcp_hdr(skb);
+ thlen = th->doff * 4;
+ if (thlen < sizeof(*th))
+ goto out;
+
+ if (unlikely(skb_checksum_start(skb) != skb_transport_header(skb)))
+ goto out;
+
+ if (!pskb_may_pull(skb, thlen))
+ goto out;
+
+ oldlen = ~skb->len;
+ __skb_pull(skb, thlen);
+
+ mss = skb_shinfo(skb)->gso_size;
+ if (unlikely(skb->len <= mss))
+ goto out;
+
+ if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
+ /* Packet is from an untrusted source, reset gso_segs. */
+
+ skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss);
+
+ segs = NULL;
+ goto out;
+ }
+
+ copy_destructor = gso_skb->destructor == tcp_wfree;
+ ooo_okay = gso_skb->ooo_okay;
+ /* All segments but the first should have ooo_okay cleared */
+ skb->ooo_okay = 0;
+
+ segs = skb_segment(skb, features);
+ if (IS_ERR(segs))
+ goto out;
+
+ /* Only first segment might have ooo_okay set */
+ segs->ooo_okay = ooo_okay;
+
+ /* GSO partial and frag_list segmentation only requires splitting
+ * the frame into an MSS multiple and possibly a remainder, both
+ * cases return a GSO skb. So update the mss now.
+ */
+ if (skb_is_gso(segs))
+ mss *= skb_shinfo(segs)->gso_segs;
+
+ delta = (__force __wsum)htonl(oldlen + thlen + mss);
+
+ skb = segs;
+ th = tcp_hdr(skb);
+ seq = ntohl(th->seq);
+
+ if (unlikely(skb_shinfo(gso_skb)->tx_flags & SKBTX_ANY_TSTAMP))
+ tcp_gso_tstamp(segs, gso_skb, seq, mss);
+
+ newcheck = ~csum_fold(csum_add(csum_unfold(th->check), delta));
+
+ ecn_cwr_mask = !!(skb_shinfo(gso_skb)->gso_type & SKB_GSO_TCP_ACCECN);
+
+ while (skb->next) {
+ th->fin = th->psh = 0;
+ th->check = newcheck;
+
+ if (skb->ip_summed == CHECKSUM_PARTIAL)
+ gso_reset_checksum(skb, ~th->check);
+ else
+ th->check = gso_make_checksum(skb, ~th->check);
+
+ seq += mss;
+ if (copy_destructor) {
+ skb->destructor = gso_skb->destructor;
+ skb->sk = gso_skb->sk;
+ sum_truesize += skb->truesize;
+ }
+ skb = skb->next;
+ th = tcp_hdr(skb);
+
+ th->seq = htonl(seq);
+
+ th->cwr &= ecn_cwr_mask;
+ }
+
+ /* Following permits TCP Small Queues to work well with GSO :
+ * The callback to TCP stack will be called at the time last frag
+ * is freed at TX completion, and not right now when gso_skb
+ * is freed by GSO engine
+ */
+ if (copy_destructor) {
+ int delta;
+
+ swap(gso_skb->sk, skb->sk);
+ swap(gso_skb->destructor, skb->destructor);
+ sum_truesize += skb->truesize;
+ delta = sum_truesize - gso_skb->truesize;
+ /* In some pathological cases, delta can be negative.
+ * We need to either use refcount_add() or refcount_sub_and_test()
+ */
+ if (likely(delta >= 0))
+ refcount_add(delta, &skb->sk->sk_wmem_alloc);
+ else
+ WARN_ON_ONCE(refcount_sub_and_test(-delta, &skb->sk->sk_wmem_alloc));
+ }
+
+ delta = (__force __wsum)htonl(oldlen +
+ (skb_tail_pointer(skb) -
+ skb_transport_header(skb)) +
+ skb->data_len);
+ th->check = ~csum_fold(csum_add(csum_unfold(th->check), delta));
+ if (skb->ip_summed == CHECKSUM_PARTIAL)
+ gso_reset_checksum(skb, ~th->check);
+ else
+ th->check = gso_make_checksum(skb, ~th->check);
+out:
+ return segs;
+}
+
+struct sk_buff *tcp_gro_lookup(struct list_head *head, struct tcphdr *th)
+{
+ struct tcphdr *th2;
+ struct sk_buff *p;
+
+ list_for_each_entry(p, head, list) {
+ if (!NAPI_GRO_CB(p)->same_flow)
+ continue;
+
+ th2 = tcp_hdr(p);
+ if (*(u32 *)&th->source ^ *(u32 *)&th2->source) {
+ NAPI_GRO_CB(p)->same_flow = 0;
+ continue;
+ }
+
+ return p;
+ }
+
+ return NULL;
+}
+
+struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb,
+ struct tcphdr *th)
+{
+ unsigned int thlen = th->doff * 4;
+ struct sk_buff *pp = NULL;
+ struct sk_buff *p;
+ struct tcphdr *th2;
+ unsigned int len;
+ __be32 flags;
+ unsigned int mss = 1;
+ int flush = 1;
+ int i;
+
+ len = skb_gro_len(skb);
+ flags = tcp_flag_word(th);
+
+ p = tcp_gro_lookup(head, th);
+ if (!p)
+ goto out_check_final;
+
+ th2 = tcp_hdr(p);
+ flush = (__force int)(flags & TCP_FLAG_CWR);
+ flush |= (__force int)((flags ^ tcp_flag_word(th2)) &
+ ~(TCP_FLAG_FIN | TCP_FLAG_PSH));
+ flush |= (__force int)(th->ack_seq ^ th2->ack_seq);
+ for (i = sizeof(*th); i < thlen; i += 4)
+ flush |= *(u32 *)((u8 *)th + i) ^
+ *(u32 *)((u8 *)th2 + i);
+
+ flush |= gro_receive_network_flush(th, th2, p);
+
+ mss = skb_shinfo(p)->gso_size;
+
+ /* If skb is a GRO packet, make sure its gso_size matches prior packet mss.
+ * If it is a single frame, do not aggregate it if its length
+ * is bigger than our mss.
+ */
+ if (unlikely(skb_is_gso(skb)))
+ flush |= (mss != skb_shinfo(skb)->gso_size);
+ else
+ flush |= (len - 1) >= mss;
+
+ flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq);
+ flush |= skb_cmp_decrypted(p, skb);
+
+ if (unlikely(NAPI_GRO_CB(p)->is_flist)) {
+ flush |= (__force int)(flags ^ tcp_flag_word(th2));
+ flush |= skb->ip_summed != p->ip_summed;
+ flush |= skb->csum_level != p->csum_level;
+ flush |= NAPI_GRO_CB(p)->count >= 64;
+ skb_set_network_header(skb, skb_gro_receive_network_offset(skb));
+
+ if (flush || skb_gro_receive_list(p, skb))
+ mss = 1;
+
+ goto out_check_final;
+ }
+
+ if (flush || skb_gro_receive(p, skb)) {
+ mss = 1;
+ goto out_check_final;
+ }
+
+ tcp_flag_word(th2) |= flags & (TCP_FLAG_FIN | TCP_FLAG_PSH);
+
+out_check_final:
+ /* Force a flush if last segment is smaller than mss. */
+ if (unlikely(skb_is_gso(skb)))
+ flush = len != NAPI_GRO_CB(skb)->count * skb_shinfo(skb)->gso_size;
+ else
+ flush = len < mss;
+
+ flush |= (__force int)(flags & (TCP_FLAG_URG | TCP_FLAG_PSH |
+ TCP_FLAG_RST | TCP_FLAG_SYN |
+ TCP_FLAG_FIN));
+
+ if (p && (!NAPI_GRO_CB(skb)->same_flow || flush))
+ pp = p;
+
+ NAPI_GRO_CB(skb)->flush |= (flush != 0);
+
+ return pp;
+}
+
+void tcp_gro_complete(struct sk_buff *skb)
+{
+ struct tcphdr *th = tcp_hdr(skb);
+ struct skb_shared_info *shinfo;
+
+ if (skb->encapsulation)
+ skb->inner_transport_header = skb->transport_header;
+
+ skb->csum_start = (unsigned char *)th - skb->head;
+ skb->csum_offset = offsetof(struct tcphdr, check);
+ skb->ip_summed = CHECKSUM_PARTIAL;
+
+ shinfo = skb_shinfo(skb);
+ shinfo->gso_segs = NAPI_GRO_CB(skb)->count;
+
+ if (th->cwr)
+ shinfo->gso_type |= SKB_GSO_TCP_ACCECN;
+}
+EXPORT_SYMBOL(tcp_gro_complete);
+
+static void tcp4_check_fraglist_gro(struct list_head *head, struct sk_buff *skb,
+ struct tcphdr *th)
+{
+ const struct iphdr *iph;
+ struct sk_buff *p;
+ struct sock *sk;
+ struct net *net;
+ int iif, sdif;
+
+ if (likely(!(skb->dev->features & NETIF_F_GRO_FRAGLIST)))
+ return;
+
+ p = tcp_gro_lookup(head, th);
+ if (p) {
+ NAPI_GRO_CB(skb)->is_flist = NAPI_GRO_CB(p)->is_flist;
+ return;
+ }
+
+ inet_get_iif_sdif(skb, &iif, &sdif);
+ iph = skb_gro_network_header(skb);
+ net = dev_net_rcu(skb->dev);
+ sk = __inet_lookup_established(net, iph->saddr, th->source,
+ iph->daddr, ntohs(th->dest),
+ iif, sdif);
+ NAPI_GRO_CB(skb)->is_flist = !sk;
+ if (sk)
+ sock_gen_put(sk);
+}
+
+INDIRECT_CALLABLE_SCOPE
+struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb)
+{
+ struct tcphdr *th;
+
+ /* Don't bother verifying checksum if we're going to flush anyway. */
+ if (!NAPI_GRO_CB(skb)->flush &&
+ skb_gro_checksum_validate(skb, IPPROTO_TCP,
+ inet_gro_compute_pseudo))
+ goto flush;
+
+ th = tcp_gro_pull_header(skb);
+ if (!th)
+ goto flush;
+
+ tcp4_check_fraglist_gro(head, skb, th);
+
+ return tcp_gro_receive(head, skb, th);
+
+flush:
+ NAPI_GRO_CB(skb)->flush = 1;
+ return NULL;
+}
+
+INDIRECT_CALLABLE_SCOPE int tcp4_gro_complete(struct sk_buff *skb, int thoff)
+{
+ const u16 offset = NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation];
+ const struct iphdr *iph = (struct iphdr *)(skb->data + offset);
+ struct tcphdr *th = tcp_hdr(skb);
+
+ if (unlikely(NAPI_GRO_CB(skb)->is_flist)) {
+ skb_shinfo(skb)->gso_type |= SKB_GSO_FRAGLIST | SKB_GSO_TCPV4;
+ skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;
+
+ __skb_incr_checksum_unnecessary(skb);
+
+ return 0;
+ }
+
+ th->check = ~tcp_v4_check(skb->len - thoff, iph->saddr,
+ iph->daddr, 0);
+
+ BUILD_BUG_ON(SKB_GSO_TCP_FIXEDID << 1 != SKB_GSO_TCP_FIXEDID_INNER);
+ skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4 |
+ (NAPI_GRO_CB(skb)->ip_fixedid * SKB_GSO_TCP_FIXEDID);
+
+ tcp_gro_complete(skb);
+ return 0;
+}
+
+int __init tcpv4_offload_init(void)
+{
+ net_hotdata.tcpv4_offload = (struct net_offload) {
+ .callbacks = {
+ .gso_segment = tcp4_gso_segment,
+ .gro_receive = tcp4_gro_receive,
+ .gro_complete = tcp4_gro_complete,
+ },
+ };
+ return inet_add_offload(&net_hotdata.tcpv4_offload, IPPROTO_TCP);
+}
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 32c1a972fa31..479afb714bdf 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -5,8 +6,6 @@
*
* Implementation of the Transmission Control Protocol(TCP).
*
- * Version: $Id: tcp_output.c,v 1.146 2002/02/01 22:01:04 davem Exp $
- *
* Authors: Ross Biro
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
* Mark Evans, <evansmp@uhura.aston.ac.uk>
@@ -36,54 +35,78 @@
*
*/
+#define pr_fmt(fmt) "TCP: " fmt
+
#include <net/tcp.h>
+#include <net/tcp_ecn.h>
+#include <net/mptcp.h>
+#include <net/smc.h>
+#include <net/proto_memory.h>
+#include <net/psp.h>
#include <linux/compiler.h>
+#include <linux/gfp.h>
#include <linux/module.h>
-#include <linux/smp_lock.h>
+#include <linux/static_key.h>
+#include <linux/skbuff_ref.h>
-/* People can turn this off for buggy TCP's found in printers etc. */
-int sysctl_tcp_retrans_collapse __read_mostly = 1;
+#include <trace/events/tcp.h>
-/* People can turn this on to work with those rare, broken TCPs that
- * interpret the window field as a signed quantity.
+/* Refresh clocks of a TCP socket,
+ * ensuring monotically increasing values.
*/
-int sysctl_tcp_workaround_signed_windows __read_mostly = 0;
-
-/* This limits the percentage of the congestion window which we
- * will allow a single TSO frame to consume. Building TSO frames
- * which are too large can cause TCP streams to be bursty.
- */
-int sysctl_tcp_tso_win_divisor __read_mostly = 3;
+void tcp_mstamp_refresh(struct tcp_sock *tp)
+{
+ u64 val = tcp_clock_ns();
-int sysctl_tcp_mtu_probing __read_mostly = 0;
-int sysctl_tcp_base_mss __read_mostly = 512;
+ tp->tcp_clock_cache = val;
+ tp->tcp_mstamp = div_u64(val, NSEC_PER_USEC);
+}
-/* By default, RFC2861 behavior. */
-int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
+static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
+ int push_one, gfp_t gfp);
-static void update_send_head(struct sock *sk, struct tcp_sock *tp,
- struct sk_buff *skb)
+/* Account for new data that has been sent to the network. */
+static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb)
{
- sk->sk_send_head = skb->next;
- if (sk->sk_send_head == (struct sk_buff *)&sk->sk_write_queue)
- sk->sk_send_head = NULL;
- tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
- tcp_packets_out_inc(sk, tp, skb);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ unsigned int prior_packets = tp->packets_out;
+
+ WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(skb)->end_seq);
+
+ __skb_unlink(skb, &sk->sk_write_queue);
+ tcp_rbtree_insert(&sk->tcp_rtx_queue, skb);
+
+ if (tp->highest_sack == NULL)
+ tp->highest_sack = skb;
+
+ tp->packets_out += tcp_skb_pcount(skb);
+ if (!prior_packets || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
+ tcp_rearm_rto(sk);
+
+ NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT,
+ tcp_skb_pcount(skb));
+ tcp_check_space(sk);
}
-/* SND.NXT, if window was not shrunk.
+/* SND.NXT, if window was not shrunk or the amount of shrunk was less than one
+ * window scaling factor due to loss of precision.
* If window has been shrunk, what should we make? It is not clear at all.
* Using SND.UNA we will fail to open window, SND.NXT is out of window. :-(
* Anything in between SND.UNA...SND.UNA+SND.WND also can be already
* invalid. OK, let's make this for now:
*/
-static inline __u32 tcp_acceptable_seq(struct sock *sk, struct tcp_sock *tp)
+static inline __u32 tcp_acceptable_seq(const struct sock *sk)
{
- if (!before(tp->snd_una+tp->snd_wnd, tp->snd_nxt))
+ const struct tcp_sock *tp = tcp_sk(sk);
+
+ if (!before(tcp_wnd_end(tp), tp->snd_nxt) ||
+ (tp->rx_opt.wscale_ok &&
+ ((tp->snd_nxt - tcp_wnd_end(tp)) < (1 << tp->rx_opt.rcv_wscale))))
return tp->snd_nxt;
else
- return tp->snd_una+tp->snd_wnd;
+ return tcp_wnd_end(tp);
}
/* Calculate mss to advertise in SYN segment.
@@ -103,25 +126,29 @@ static inline __u32 tcp_acceptable_seq(struct sock *sk, struct tcp_sock *tp)
static __u16 tcp_advertise_mss(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
- struct dst_entry *dst = __sk_dst_get(sk);
+ const struct dst_entry *dst = __sk_dst_get(sk);
int mss = tp->advmss;
- if (dst && dst_metric(dst, RTAX_ADVMSS) < mss) {
- mss = dst_metric(dst, RTAX_ADVMSS);
- tp->advmss = mss;
+ if (dst) {
+ unsigned int metric = dst_metric_advmss(dst);
+
+ if (metric < mss) {
+ mss = metric;
+ tp->advmss = mss;
+ }
}
return (__u16)mss;
}
/* RFC2861. Reset CWND after idle period longer RTO to "restart window".
- * This is the first part of cwnd validation mechanism. */
-static void tcp_cwnd_restart(struct sock *sk, struct dst_entry *dst)
+ * This is the first part of cwnd validation mechanism.
+ */
+void tcp_cwnd_restart(struct sock *sk, s32 delta)
{
struct tcp_sock *tp = tcp_sk(sk);
- s32 delta = tcp_time_stamp - tp->lsndtime;
- u32 restart_cwnd = tcp_init_cwnd(tp, dst);
- u32 cwnd = tp->snd_cwnd;
+ u32 restart_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk));
+ u32 cwnd = tcp_snd_cwnd(tp);
tcp_ca_event(sk, CA_EVENT_CWND_RESTART);
@@ -130,33 +157,46 @@ static void tcp_cwnd_restart(struct sock *sk, struct dst_entry *dst)
while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd)
cwnd >>= 1;
- tp->snd_cwnd = max(cwnd, restart_cwnd);
- tp->snd_cwnd_stamp = tcp_time_stamp;
+ tcp_snd_cwnd_set(tp, max(cwnd, restart_cwnd));
+ tp->snd_cwnd_stamp = tcp_jiffies32;
tp->snd_cwnd_used = 0;
}
+/* Congestion state accounting after a packet has been sent. */
static void tcp_event_data_sent(struct tcp_sock *tp,
- struct sk_buff *skb, struct sock *sk)
+ struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
- const u32 now = tcp_time_stamp;
+ const u32 now = tcp_jiffies32;
- if (sysctl_tcp_slow_start_after_idle &&
- (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto))
- tcp_cwnd_restart(sk, __sk_dst_get(sk));
+ if (tcp_packets_in_flight(tp) == 0)
+ tcp_ca_event(sk, CA_EVENT_TX_START);
tp->lsndtime = now;
/* If it is a reply for ato after last received
- * packet, enter pingpong mode.
+ * packet, increase pingpong count.
*/
if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato)
- icsk->icsk_ack.pingpong = 1;
+ inet_csk_inc_pingpong_cnt(sk);
}
-static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
+/* Account for an ACK we sent. */
+static inline void tcp_event_ack_sent(struct sock *sk, u32 rcv_nxt)
{
- tcp_dec_quickack_mode(sk, pkts);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (unlikely(tp->compressed_ack)) {
+ NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED,
+ tp->compressed_ack);
+ tp->compressed_ack = 0;
+ if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1)
+ __sock_put(sk);
+ }
+
+ if (unlikely(rcv_nxt != tp->rcv_nxt))
+ return; /* Special ACK sent by DCTCP to reflect ECN */
+ tcp_dec_quickack_mode(sk);
inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
}
@@ -167,20 +207,22 @@ static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
* be a multiple of mss if possible. We assume here that mss >= 1.
* This MUST be enforced by all callers.
*/
-void tcp_select_initial_window(int __space, __u32 mss,
- __u32 *rcv_wnd, __u32 *window_clamp,
- int wscale_ok, __u8 *rcv_wscale)
+void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,
+ __u32 *rcv_wnd, __u32 *__window_clamp,
+ int wscale_ok, __u8 *rcv_wscale,
+ __u32 init_rcv_wnd)
{
unsigned int space = (__space < 0 ? 0 : __space);
+ u32 window_clamp = READ_ONCE(*__window_clamp);
/* If no clamp set the clamp to the max possible scaled window */
- if (*window_clamp == 0)
- (*window_clamp) = (65535 << 14);
- space = min(*window_clamp, space);
+ if (window_clamp == 0)
+ window_clamp = (U16_MAX << TCP_MAX_WSCALE);
+ space = min(window_clamp, space);
/* Quantize space offering to a multiple of mss if possible. */
if (space > mss)
- space = (space / mss) * mss;
+ space = rounddown(space, mss);
/* NOTE: offering an initial window larger than 32767
* will break some buggy TCP stacks. If the admin tells us
@@ -190,41 +232,28 @@ void tcp_select_initial_window(int __space, __u32 mss,
* which we interpret as a sign the remote TCP is not
* misinterpreting the window field as a signed quantity.
*/
- if (sysctl_tcp_workaround_signed_windows)
+ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows))
(*rcv_wnd) = min(space, MAX_TCP_WINDOW);
else
(*rcv_wnd) = space;
- (*rcv_wscale) = 0;
- if (wscale_ok) {
- /* Set window scaling on max possible window
- * See RFC1323 for an explanation of the limit to 14
- */
- space = max_t(u32, sysctl_tcp_rmem[2], sysctl_rmem_max);
- space = min_t(u32, space, *window_clamp);
- while (space > 65535 && (*rcv_wscale) < 14) {
- space >>= 1;
- (*rcv_wscale)++;
- }
- }
+ if (init_rcv_wnd)
+ *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);
- /* Set initial window to value enough for senders,
- * following RFC2414. Senders, not following this RFC,
- * will be satisfied with 2.
- */
- if (mss > (1<<*rcv_wscale)) {
- int init_cwnd = 4;
- if (mss > 1460*3)
- init_cwnd = 2;
- else if (mss > 1460)
- init_cwnd = 3;
- if (*rcv_wnd > init_cwnd*mss)
- *rcv_wnd = init_cwnd*mss;
+ *rcv_wscale = 0;
+ if (wscale_ok) {
+ /* Set window scaling on max possible window */
+ space = max_t(u32, space, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
+ space = max_t(u32, space, READ_ONCE(sysctl_rmem_max));
+ space = min_t(u32, space, window_clamp);
+ *rcv_wscale = clamp_t(int, ilog2(space) - 15,
+ 0, TCP_MAX_WSCALE);
}
-
/* Set the clamp no higher than max representable value */
- (*window_clamp) = min(65535U << (*rcv_wscale), *window_clamp);
+ WRITE_ONCE(*__window_clamp,
+ min_t(__u32, U16_MAX << (*rcv_wscale), window_clamp));
}
+EXPORT_IPV6_MOD(tcp_select_initial_window);
/* Chose a new window to advertise, update state in tcp_sock for the
* socket, and return result with RFC1323 scaling applied. The return
@@ -234,11 +263,23 @@ void tcp_select_initial_window(int __space, __u32 mss,
static u16 tcp_select_window(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
- u32 cur_win = tcp_receive_window(tp);
- u32 new_win = __tcp_select_window(sk);
+ struct net *net = sock_net(sk);
+ u32 old_win = tp->rcv_wnd;
+ u32 cur_win, new_win;
+
+ /* Make the window 0 if we failed to queue the data because we
+ * are out of memory.
+ */
+ if (unlikely(inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOMEM)) {
+ tp->pred_flags = 0;
+ tp->rcv_wnd = 0;
+ tp->rcv_wup = tp->rcv_nxt;
+ return 0;
+ }
- /* Never shrink the offered window */
- if(new_win < cur_win) {
+ cur_win = tcp_receive_window(tp);
+ new_win = __tcp_select_window(sk);
+ if (new_win < cur_win) {
/* Danger Will Robinson!
* Don't update rcv_wup/rcv_wnd here or else
* we will not be able to advertise a zero
@@ -246,15 +287,22 @@ static u16 tcp_select_window(struct sock *sk)
*
* Relax Will Robinson.
*/
- new_win = cur_win;
+ if (!READ_ONCE(net->ipv4.sysctl_tcp_shrink_window) || !tp->rx_opt.rcv_wscale) {
+ /* Never shrink the offered window */
+ if (new_win == 0)
+ NET_INC_STATS(net, LINUX_MIB_TCPWANTZEROWINDOWADV);
+ new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale);
+ }
}
+
tp->rcv_wnd = new_win;
tp->rcv_wup = tp->rcv_nxt;
/* Make sure we do not exceed the maximum possible
* scaled window.
*/
- if (!tp->rx_opt.rcv_wscale && sysctl_tcp_workaround_signed_windows)
+ if (!tp->rx_opt.rcv_wscale &&
+ READ_ONCE(net->ipv4.sysctl_tcp_workaround_signed_windows))
new_win = min(new_win, MAX_TCP_WINDOW);
else
new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale));
@@ -263,118 +311,1131 @@ static u16 tcp_select_window(struct sock *sk)
new_win >>= tp->rx_opt.rcv_wscale;
/* If we advertise zero window, disable fast path. */
- if (new_win == 0)
+ if (new_win == 0) {
tp->pred_flags = 0;
+ if (old_win)
+ NET_INC_STATS(net, LINUX_MIB_TCPTOZEROWINDOWADV);
+ } else if (old_win == 0) {
+ NET_INC_STATS(net, LINUX_MIB_TCPFROMZEROWINDOWADV);
+ }
return new_win;
}
-static void tcp_build_and_update_options(__be32 *ptr, struct tcp_sock *tp,
- __u32 tstamp, __u8 **md5_hash)
+/* Set up ECN state for a packet on a ESTABLISHED socket that is about to
+ * be sent.
+ */
+static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb,
+ struct tcphdr *th, int tcp_header_len)
{
- if (tp->rx_opt.tstamp_ok) {
- *ptr++ = htonl((TCPOPT_NOP << 24) |
- (TCPOPT_NOP << 16) |
- (TCPOPT_TIMESTAMP << 8) |
- TCPOLEN_TIMESTAMP);
- *ptr++ = htonl(tstamp);
- *ptr++ = htonl(tp->rx_opt.ts_recent);
- }
- if (tp->rx_opt.eff_sacks) {
- struct tcp_sack_block *sp = tp->rx_opt.dsack ? tp->duplicate_sack : tp->selective_acks;
- int this_sack;
+ struct tcp_sock *tp = tcp_sk(sk);
- *ptr++ = htonl((TCPOPT_NOP << 24) |
- (TCPOPT_NOP << 16) |
- (TCPOPT_SACK << 8) |
- (TCPOLEN_SACK_BASE + (tp->rx_opt.eff_sacks *
- TCPOLEN_SACK_PERBLOCK)));
- for(this_sack = 0; this_sack < tp->rx_opt.eff_sacks; this_sack++) {
- *ptr++ = htonl(sp[this_sack].start_seq);
- *ptr++ = htonl(sp[this_sack].end_seq);
+ if (!tcp_ecn_mode_any(tp))
+ return;
+
+ if (tcp_ecn_mode_accecn(tp)) {
+ if (!tcp_accecn_ace_fail_recv(tp))
+ INET_ECN_xmit(sk);
+ tcp_accecn_set_ace(tp, skb, th);
+ skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ACCECN;
+ } else {
+ /* Not-retransmitted data segment: set ECT and inject CWR. */
+ if (skb->len != tcp_header_len &&
+ !before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) {
+ INET_ECN_xmit(sk);
+ if (tp->ecn_flags & TCP_ECN_QUEUE_CWR) {
+ tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
+ th->cwr = 1;
+ skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
+ }
+ } else if (!tcp_ca_needs_ecn(sk)) {
+ /* ACK or retransmitted segment: clear ECT|CE */
+ INET_ECN_dontxmit(sk);
}
- if (tp->rx_opt.dsack) {
- tp->rx_opt.dsack = 0;
- tp->rx_opt.eff_sacks--;
+ if (tp->ecn_flags & TCP_ECN_DEMAND_CWR)
+ th->ece = 1;
+ }
+}
+
+/* Constructs common control bits of non-data skb. If SYN/FIN is present,
+ * auto increment end seqno.
+ */
+static void tcp_init_nondata_skb(struct sk_buff *skb, struct sock *sk,
+ u32 seq, u16 flags)
+{
+ skb->ip_summed = CHECKSUM_PARTIAL;
+
+ TCP_SKB_CB(skb)->tcp_flags = flags;
+
+ tcp_skb_pcount_set(skb, 1);
+ psp_enqueue_set_decrypted(sk, skb);
+
+ TCP_SKB_CB(skb)->seq = seq;
+ if (flags & (TCPHDR_SYN | TCPHDR_FIN))
+ seq++;
+ TCP_SKB_CB(skb)->end_seq = seq;
+}
+
+static inline bool tcp_urg_mode(const struct tcp_sock *tp)
+{
+ return tp->snd_una != tp->snd_up;
+}
+
+#define OPTION_SACK_ADVERTISE BIT(0)
+#define OPTION_TS BIT(1)
+#define OPTION_MD5 BIT(2)
+#define OPTION_WSCALE BIT(3)
+#define OPTION_FAST_OPEN_COOKIE BIT(8)
+#define OPTION_SMC BIT(9)
+#define OPTION_MPTCP BIT(10)
+#define OPTION_AO BIT(11)
+#define OPTION_ACCECN BIT(12)
+
+static void smc_options_write(__be32 *ptr, u16 *options)
+{
+#if IS_ENABLED(CONFIG_SMC)
+ if (static_branch_unlikely(&tcp_have_smc)) {
+ if (unlikely(OPTION_SMC & *options)) {
+ *ptr++ = htonl((TCPOPT_NOP << 24) |
+ (TCPOPT_NOP << 16) |
+ (TCPOPT_EXP << 8) |
+ (TCPOLEN_EXP_SMC_BASE));
+ *ptr++ = htonl(TCPOPT_SMC_MAGIC);
}
}
-#ifdef CONFIG_TCP_MD5SIG
- if (md5_hash) {
- *ptr++ = htonl((TCPOPT_NOP << 24) |
- (TCPOPT_NOP << 16) |
- (TCPOPT_MD5SIG << 8) |
- TCPOLEN_MD5SIG);
- *md5_hash = (__u8 *)ptr;
+#endif
+}
+
+struct tcp_out_options {
+ u16 options; /* bit field of OPTION_* */
+ u16 mss; /* 0 to disable */
+ u8 ws; /* window scale, 0 to disable */
+ u8 num_sack_blocks; /* number of SACK blocks to include */
+ u8 num_accecn_fields:7, /* number of AccECN fields needed */
+ use_synack_ecn_bytes:1; /* Use synack_ecn_bytes or not */
+ u8 hash_size; /* bytes in hash_location */
+ u8 bpf_opt_len; /* length of BPF hdr option */
+ __u8 *hash_location; /* temporary pointer, overloaded */
+ __u32 tsval, tsecr; /* need to include OPTION_TS */
+ struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */
+ struct mptcp_out_options mptcp;
+};
+
+static void mptcp_options_write(struct tcphdr *th, __be32 *ptr,
+ struct tcp_sock *tp,
+ struct tcp_out_options *opts)
+{
+#if IS_ENABLED(CONFIG_MPTCP)
+ if (unlikely(OPTION_MPTCP & opts->options))
+ mptcp_write_options(th, ptr, tp, &opts->mptcp);
+#endif
+}
+
+#ifdef CONFIG_CGROUP_BPF
+static int bpf_skops_write_hdr_opt_arg0(struct sk_buff *skb,
+ enum tcp_synack_type synack_type)
+{
+ if (unlikely(!skb))
+ return BPF_WRITE_HDR_TCP_CURRENT_MSS;
+
+ if (unlikely(synack_type == TCP_SYNACK_COOKIE))
+ return BPF_WRITE_HDR_TCP_SYNACK_COOKIE;
+
+ return 0;
+}
+
+/* req, syn_skb and synack_type are used when writing synack */
+static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req,
+ struct sk_buff *syn_skb,
+ enum tcp_synack_type synack_type,
+ struct tcp_out_options *opts,
+ unsigned int *remaining)
+{
+ struct bpf_sock_ops_kern sock_ops;
+ int err;
+
+ if (likely(!BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
+ BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG)) ||
+ !*remaining)
+ return;
+
+ /* *remaining has already been aligned to 4 bytes, so *remaining >= 4 */
+
+ /* init sock_ops */
+ memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
+
+ sock_ops.op = BPF_SOCK_OPS_HDR_OPT_LEN_CB;
+
+ if (req) {
+ /* The listen "sk" cannot be passed here because
+ * it is not locked. It would not make too much
+ * sense to do bpf_setsockopt(listen_sk) based
+ * on individual connection request also.
+ *
+ * Thus, "req" is passed here and the cgroup-bpf-progs
+ * of the listen "sk" will be run.
+ *
+ * "req" is also used here for fastopen even the "sk" here is
+ * a fullsock "child" sk. It is to keep the behavior
+ * consistent between fastopen and non-fastopen on
+ * the bpf programming side.
+ */
+ sock_ops.sk = (struct sock *)req;
+ sock_ops.syn_skb = syn_skb;
+ } else {
+ sock_owned_by_me(sk);
+
+ sock_ops.is_fullsock = 1;
+ sock_ops.is_locked_tcp_sock = 1;
+ sock_ops.sk = sk;
}
+
+ sock_ops.args[0] = bpf_skops_write_hdr_opt_arg0(skb, synack_type);
+ sock_ops.remaining_opt_len = *remaining;
+ /* tcp_current_mss() does not pass a skb */
+ if (skb)
+ bpf_skops_init_skb(&sock_ops, skb, 0);
+
+ err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk);
+
+ if (err || sock_ops.remaining_opt_len == *remaining)
+ return;
+
+ opts->bpf_opt_len = *remaining - sock_ops.remaining_opt_len;
+ /* round up to 4 bytes */
+ opts->bpf_opt_len = (opts->bpf_opt_len + 3) & ~3;
+
+ *remaining -= opts->bpf_opt_len;
+}
+
+static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req,
+ struct sk_buff *syn_skb,
+ enum tcp_synack_type synack_type,
+ struct tcp_out_options *opts)
+{
+ u8 first_opt_off, nr_written, max_opt_len = opts->bpf_opt_len;
+ struct bpf_sock_ops_kern sock_ops;
+ int err;
+
+ if (likely(!max_opt_len))
+ return;
+
+ memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
+
+ sock_ops.op = BPF_SOCK_OPS_WRITE_HDR_OPT_CB;
+
+ if (req) {
+ sock_ops.sk = (struct sock *)req;
+ sock_ops.syn_skb = syn_skb;
+ } else {
+ sock_owned_by_me(sk);
+
+ sock_ops.is_fullsock = 1;
+ sock_ops.is_locked_tcp_sock = 1;
+ sock_ops.sk = sk;
+ }
+
+ sock_ops.args[0] = bpf_skops_write_hdr_opt_arg0(skb, synack_type);
+ sock_ops.remaining_opt_len = max_opt_len;
+ first_opt_off = tcp_hdrlen(skb) - max_opt_len;
+ bpf_skops_init_skb(&sock_ops, skb, first_opt_off);
+
+ err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk);
+
+ if (err)
+ nr_written = 0;
+ else
+ nr_written = max_opt_len - sock_ops.remaining_opt_len;
+
+ if (nr_written < max_opt_len)
+ memset(skb->data + first_opt_off + nr_written, TCPOPT_NOP,
+ max_opt_len - nr_written);
+}
+#else
+static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req,
+ struct sk_buff *syn_skb,
+ enum tcp_synack_type synack_type,
+ struct tcp_out_options *opts,
+ unsigned int *remaining)
+{
+}
+
+static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req,
+ struct sk_buff *syn_skb,
+ enum tcp_synack_type synack_type,
+ struct tcp_out_options *opts)
+{
+}
#endif
+
+static __be32 *process_tcp_ao_options(struct tcp_sock *tp,
+ const struct tcp_request_sock *tcprsk,
+ struct tcp_out_options *opts,
+ struct tcp_key *key, __be32 *ptr)
+{
+#ifdef CONFIG_TCP_AO
+ u8 maclen = tcp_ao_maclen(key->ao_key);
+
+ if (tcprsk) {
+ u8 aolen = maclen + sizeof(struct tcp_ao_hdr);
+
+ *ptr++ = htonl((TCPOPT_AO << 24) | (aolen << 16) |
+ (tcprsk->ao_keyid << 8) |
+ (tcprsk->ao_rcv_next));
+ } else {
+ struct tcp_ao_key *rnext_key;
+ struct tcp_ao_info *ao_info;
+
+ ao_info = rcu_dereference_check(tp->ao_info,
+ lockdep_sock_is_held(&tp->inet_conn.icsk_inet.sk));
+ rnext_key = READ_ONCE(ao_info->rnext_key);
+ if (WARN_ON_ONCE(!rnext_key))
+ return ptr;
+ *ptr++ = htonl((TCPOPT_AO << 24) |
+ (tcp_ao_len(key->ao_key) << 16) |
+ (key->ao_key->sndid << 8) |
+ (rnext_key->rcvid));
+ }
+ opts->hash_location = (__u8 *)ptr;
+ ptr += maclen / sizeof(*ptr);
+ if (unlikely(maclen % sizeof(*ptr))) {
+ memset(ptr, TCPOPT_NOP, sizeof(*ptr));
+ ptr++;
+ }
+#endif
+ return ptr;
}
-/* Construct a tcp options header for a SYN or SYN_ACK packet.
- * If this is every changed make sure to change the definition of
- * MAX_SYN_SIZE to match the new maximum number of options that you
- * can generate.
+/* Initial values for AccECN option, ordered is based on ECN field bits
+ * similar to received_ecn_bytes. Used for SYN/ACK AccECN option.
+ */
+static const u32 synack_ecn_bytes[3] = { 0, 0, 0 };
+
+/* Write previously computed TCP options to the packet.
*
- * Note - that with the RFC2385 TCP option, we make room for the
- * 16 byte MD5 hash. This will be filled in later, so the pointer for the
- * location to be filled is passed back up.
+ * Beware: Something in the Internet is very sensitive to the ordering of
+ * TCP options, we learned this through the hard way, so be careful here.
+ * Luckily we can at least blame others for their non-compliance but from
+ * inter-operability perspective it seems that we're somewhat stuck with
+ * the ordering which we have been using if we want to keep working with
+ * those broken things (not that it currently hurts anybody as there isn't
+ * particular reason why the ordering would need to be changed).
+ *
+ * At least SACK_PERM as the first option is known to lead to a disaster
+ * (but it may well be that other scenarios fail similarly).
*/
-static void tcp_syn_build_options(__be32 *ptr, int mss, int ts, int sack,
- int offer_wscale, int wscale, __u32 tstamp,
- __u32 ts_recent, __u8 **md5_hash)
-{
- /* We always get an MSS option.
- * The option bytes which will be seen in normal data
- * packets should timestamps be used, must be in the MSS
- * advertised. But we subtract them from tp->mss_cache so
- * that calculations in tcp_sendmsg are simpler etc.
- * So account for this fact here if necessary. If we
- * don't do this correctly, as a receiver we won't
- * recognize data packets as being full sized when we
- * should, and thus we won't abide by the delayed ACK
- * rules correctly.
- * SACKs don't matter, we never delay an ACK when we
- * have any of those going out.
- */
- *ptr++ = htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss);
- if (ts) {
- if(sack)
+static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp,
+ const struct tcp_request_sock *tcprsk,
+ struct tcp_out_options *opts,
+ struct tcp_key *key)
+{
+ u8 leftover_highbyte = TCPOPT_NOP; /* replace 1st NOP if avail */
+ u8 leftover_lowbyte = TCPOPT_NOP; /* replace 2nd NOP in succession */
+ __be32 *ptr = (__be32 *)(th + 1);
+ u16 options = opts->options; /* mungable copy */
+
+ if (tcp_key_is_md5(key)) {
+ *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
+ (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG);
+ /* overload cookie hash location */
+ opts->hash_location = (__u8 *)ptr;
+ ptr += 4;
+ } else if (tcp_key_is_ao(key)) {
+ ptr = process_tcp_ao_options(tp, tcprsk, opts, key, ptr);
+ }
+ if (unlikely(opts->mss)) {
+ *ptr++ = htonl((TCPOPT_MSS << 24) |
+ (TCPOLEN_MSS << 16) |
+ opts->mss);
+ }
+
+ if (likely(OPTION_TS & options)) {
+ if (unlikely(OPTION_SACK_ADVERTISE & options)) {
*ptr++ = htonl((TCPOPT_SACK_PERM << 24) |
(TCPOLEN_SACK_PERM << 16) |
(TCPOPT_TIMESTAMP << 8) |
TCPOLEN_TIMESTAMP);
- else
+ options &= ~OPTION_SACK_ADVERTISE;
+ } else {
*ptr++ = htonl((TCPOPT_NOP << 24) |
(TCPOPT_NOP << 16) |
(TCPOPT_TIMESTAMP << 8) |
TCPOLEN_TIMESTAMP);
- *ptr++ = htonl(tstamp); /* TSVAL */
- *ptr++ = htonl(ts_recent); /* TSECR */
- } else if(sack)
- *ptr++ = htonl((TCPOPT_NOP << 24) |
- (TCPOPT_NOP << 16) |
+ }
+ *ptr++ = htonl(opts->tsval);
+ *ptr++ = htonl(opts->tsecr);
+ }
+
+ if (OPTION_ACCECN & options) {
+ const u32 *ecn_bytes = opts->use_synack_ecn_bytes ?
+ synack_ecn_bytes :
+ tp->received_ecn_bytes;
+ const u8 ect0_idx = INET_ECN_ECT_0 - 1;
+ const u8 ect1_idx = INET_ECN_ECT_1 - 1;
+ const u8 ce_idx = INET_ECN_CE - 1;
+ u32 e0b;
+ u32 e1b;
+ u32 ceb;
+ u8 len;
+
+ e0b = ecn_bytes[ect0_idx] + TCP_ACCECN_E0B_INIT_OFFSET;
+ e1b = ecn_bytes[ect1_idx] + TCP_ACCECN_E1B_INIT_OFFSET;
+ ceb = ecn_bytes[ce_idx] + TCP_ACCECN_CEB_INIT_OFFSET;
+ len = TCPOLEN_ACCECN_BASE +
+ opts->num_accecn_fields * TCPOLEN_ACCECN_PERFIELD;
+
+ if (opts->num_accecn_fields == 2) {
+ *ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) |
+ ((e1b >> 8) & 0xffff));
+ *ptr++ = htonl(((e1b & 0xff) << 24) |
+ (ceb & 0xffffff));
+ } else if (opts->num_accecn_fields == 1) {
+ *ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) |
+ ((e1b >> 8) & 0xffff));
+ leftover_highbyte = e1b & 0xff;
+ leftover_lowbyte = TCPOPT_NOP;
+ } else if (opts->num_accecn_fields == 0) {
+ leftover_highbyte = TCPOPT_ACCECN1;
+ leftover_lowbyte = len;
+ } else if (opts->num_accecn_fields == 3) {
+ *ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) |
+ ((e1b >> 8) & 0xffff));
+ *ptr++ = htonl(((e1b & 0xff) << 24) |
+ (ceb & 0xffffff));
+ *ptr++ = htonl(((e0b & 0xffffff) << 8) |
+ TCPOPT_NOP);
+ }
+ if (tp) {
+ tp->accecn_minlen = 0;
+ tp->accecn_opt_tstamp = tp->tcp_mstamp;
+ if (tp->accecn_opt_demand)
+ tp->accecn_opt_demand--;
+ }
+ }
+
+ if (unlikely(OPTION_SACK_ADVERTISE & options)) {
+ *ptr++ = htonl((leftover_highbyte << 24) |
+ (leftover_lowbyte << 16) |
(TCPOPT_SACK_PERM << 8) |
TCPOLEN_SACK_PERM);
- if (offer_wscale)
- *ptr++ = htonl((TCPOPT_NOP << 24) |
+ leftover_highbyte = TCPOPT_NOP;
+ leftover_lowbyte = TCPOPT_NOP;
+ }
+
+ if (unlikely(OPTION_WSCALE & options)) {
+ u8 highbyte = TCPOPT_NOP;
+
+ /* Do not split the leftover 2-byte to fit into a single
+ * NOP, i.e., replace this NOP only when 1 byte is leftover
+ * within leftover_highbyte.
+ */
+ if (unlikely(leftover_highbyte != TCPOPT_NOP &&
+ leftover_lowbyte == TCPOPT_NOP)) {
+ highbyte = leftover_highbyte;
+ leftover_highbyte = TCPOPT_NOP;
+ }
+ *ptr++ = htonl((highbyte << 24) |
(TCPOPT_WINDOW << 16) |
(TCPOLEN_WINDOW << 8) |
- (wscale));
-#ifdef CONFIG_TCP_MD5SIG
- /*
- * If MD5 is enabled, then we set the option, and include the size
- * (always 18). The actual MD5 hash is added just before the
- * packet is sent.
- */
- if (md5_hash) {
- *ptr++ = htonl((TCPOPT_NOP << 24) |
- (TCPOPT_NOP << 16) |
- (TCPOPT_MD5SIG << 8) |
- TCPOLEN_MD5SIG);
- *md5_hash = (__u8 *) ptr;
+ opts->ws);
+ }
+
+ if (unlikely(opts->num_sack_blocks)) {
+ struct tcp_sack_block *sp = tp->rx_opt.dsack ?
+ tp->duplicate_sack : tp->selective_acks;
+ int this_sack;
+
+ *ptr++ = htonl((leftover_highbyte << 24) |
+ (leftover_lowbyte << 16) |
+ (TCPOPT_SACK << 8) |
+ (TCPOLEN_SACK_BASE + (opts->num_sack_blocks *
+ TCPOLEN_SACK_PERBLOCK)));
+ leftover_highbyte = TCPOPT_NOP;
+ leftover_lowbyte = TCPOPT_NOP;
+
+ for (this_sack = 0; this_sack < opts->num_sack_blocks;
+ ++this_sack) {
+ *ptr++ = htonl(sp[this_sack].start_seq);
+ *ptr++ = htonl(sp[this_sack].end_seq);
+ }
+
+ tp->rx_opt.dsack = 0;
+ } else if (unlikely(leftover_highbyte != TCPOPT_NOP ||
+ leftover_lowbyte != TCPOPT_NOP)) {
+ *ptr++ = htonl((leftover_highbyte << 24) |
+ (leftover_lowbyte << 16) |
+ (TCPOPT_NOP << 8) |
+ TCPOPT_NOP);
+ leftover_highbyte = TCPOPT_NOP;
+ leftover_lowbyte = TCPOPT_NOP;
+ }
+
+ if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) {
+ struct tcp_fastopen_cookie *foc = opts->fastopen_cookie;
+ u8 *p = (u8 *)ptr;
+ u32 len; /* Fast Open option length */
+
+ if (foc->exp) {
+ len = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
+ *ptr = htonl((TCPOPT_EXP << 24) | (len << 16) |
+ TCPOPT_FASTOPEN_MAGIC);
+ p += TCPOLEN_EXP_FASTOPEN_BASE;
+ } else {
+ len = TCPOLEN_FASTOPEN_BASE + foc->len;
+ *p++ = TCPOPT_FASTOPEN;
+ *p++ = len;
+ }
+
+ memcpy(p, foc->val, foc->len);
+ if ((len & 3) == 2) {
+ p[foc->len] = TCPOPT_NOP;
+ p[foc->len + 1] = TCPOPT_NOP;
+ }
+ ptr += (len + 3) >> 2;
+ }
+
+ smc_options_write(ptr, &options);
+
+ mptcp_options_write(th, ptr, tp, opts);
+}
+
+static void smc_set_option(struct tcp_sock *tp,
+ struct tcp_out_options *opts,
+ unsigned int *remaining)
+{
+#if IS_ENABLED(CONFIG_SMC)
+ if (static_branch_unlikely(&tcp_have_smc) && tp->syn_smc) {
+ tp->syn_smc = !!smc_call_hsbpf(1, tp, syn_option);
+ /* re-check syn_smc */
+ if (tp->syn_smc &&
+ *remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
+ opts->options |= OPTION_SMC;
+ *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
+ }
}
#endif
}
+static void smc_set_option_cond(const struct tcp_sock *tp,
+ struct inet_request_sock *ireq,
+ struct tcp_out_options *opts,
+ unsigned int *remaining)
+{
+#if IS_ENABLED(CONFIG_SMC)
+ if (static_branch_unlikely(&tcp_have_smc) && tp->syn_smc && ireq->smc_ok) {
+ ireq->smc_ok = !!smc_call_hsbpf(1, tp, synack_option, ireq);
+ /* re-check smc_ok */
+ if (ireq->smc_ok &&
+ *remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
+ opts->options |= OPTION_SMC;
+ *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
+ }
+ }
+#endif
+}
+
+static void mptcp_set_option_cond(const struct request_sock *req,
+ struct tcp_out_options *opts,
+ unsigned int *remaining)
+{
+ if (rsk_is_mptcp(req)) {
+ unsigned int size;
+
+ if (mptcp_synack_options(req, &size, &opts->mptcp)) {
+ if (*remaining >= size) {
+ opts->options |= OPTION_MPTCP;
+ *remaining -= size;
+ }
+ }
+ }
+}
+
+static u32 tcp_synack_options_combine_saving(struct tcp_out_options *opts)
+{
+ /* How much there's room for combining with the alignment padding? */
+ if ((opts->options & (OPTION_SACK_ADVERTISE | OPTION_TS)) ==
+ OPTION_SACK_ADVERTISE)
+ return 2;
+ else if (opts->options & OPTION_WSCALE)
+ return 1;
+ return 0;
+}
+
+/* Calculates how long AccECN option will fit to @remaining option space.
+ *
+ * AccECN option can sometimes replace NOPs used for alignment of other
+ * TCP options (up to @max_combine_saving available).
+ *
+ * Only solutions with at least @required AccECN fields are accepted.
+ *
+ * Returns: The size of the AccECN option excluding space repurposed from
+ * the alignment of the other options.
+ */
+static int tcp_options_fit_accecn(struct tcp_out_options *opts, int required,
+ int remaining)
+{
+ int size = TCP_ACCECN_MAXSIZE;
+ int sack_blocks_reduce = 0;
+ int max_combine_saving;
+ int rem = remaining;
+ int align_size;
+
+ if (opts->use_synack_ecn_bytes)
+ max_combine_saving = tcp_synack_options_combine_saving(opts);
+ else
+ max_combine_saving = opts->num_sack_blocks > 0 ? 2 : 0;
+ opts->num_accecn_fields = TCP_ACCECN_NUMFIELDS;
+ while (opts->num_accecn_fields >= required) {
+ /* Pad to dword if cannot combine */
+ if ((size & 0x3) > max_combine_saving)
+ align_size = ALIGN(size, 4);
+ else
+ align_size = ALIGN_DOWN(size, 4);
+
+ if (rem >= align_size) {
+ size = align_size;
+ break;
+ } else if (opts->num_accecn_fields == required &&
+ opts->num_sack_blocks > 2 &&
+ required > 0) {
+ /* Try to fit the option by removing one SACK block */
+ opts->num_sack_blocks--;
+ sack_blocks_reduce++;
+ rem = rem + TCPOLEN_SACK_PERBLOCK;
+
+ opts->num_accecn_fields = TCP_ACCECN_NUMFIELDS;
+ size = TCP_ACCECN_MAXSIZE;
+ continue;
+ }
+
+ opts->num_accecn_fields--;
+ size -= TCPOLEN_ACCECN_PERFIELD;
+ }
+ if (sack_blocks_reduce > 0) {
+ if (opts->num_accecn_fields >= required)
+ size -= sack_blocks_reduce * TCPOLEN_SACK_PERBLOCK;
+ else
+ opts->num_sack_blocks += sack_blocks_reduce;
+ }
+ if (opts->num_accecn_fields < required)
+ return 0;
+
+ opts->options |= OPTION_ACCECN;
+ return size;
+}
+
+/* Compute TCP options for SYN packets. This is not the final
+ * network wire format yet.
+ */
+static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
+ struct tcp_out_options *opts,
+ struct tcp_key *key)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ unsigned int remaining = MAX_TCP_OPTION_SPACE;
+ struct tcp_fastopen_request *fastopen = tp->fastopen_req;
+ bool timestamps;
+
+ /* Better than switch (key.type) as it has static branches */
+ if (tcp_key_is_md5(key)) {
+ timestamps = false;
+ opts->options |= OPTION_MD5;
+ remaining -= TCPOLEN_MD5SIG_ALIGNED;
+ } else {
+ timestamps = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_timestamps);
+ if (tcp_key_is_ao(key)) {
+ opts->options |= OPTION_AO;
+ remaining -= tcp_ao_len_aligned(key->ao_key);
+ }
+ }
+
+ /* We always get an MSS option. The option bytes which will be seen in
+ * normal data packets should timestamps be used, must be in the MSS
+ * advertised. But we subtract them from tp->mss_cache so that
+ * calculations in tcp_sendmsg are simpler etc. So account for this
+ * fact here if necessary. If we don't do this correctly, as a
+ * receiver we won't recognize data packets as being full sized when we
+ * should, and thus we won't abide by the delayed ACK rules correctly.
+ * SACKs don't matter, we never delay an ACK when we have any of those
+ * going out. */
+ opts->mss = tcp_advertise_mss(sk);
+ remaining -= TCPOLEN_MSS_ALIGNED;
+
+ if (likely(timestamps)) {
+ opts->options |= OPTION_TS;
+ opts->tsval = tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb) + tp->tsoffset;
+ opts->tsecr = tp->rx_opt.ts_recent;
+ remaining -= TCPOLEN_TSTAMP_ALIGNED;
+ }
+ if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_window_scaling))) {
+ opts->ws = tp->rx_opt.rcv_wscale;
+ opts->options |= OPTION_WSCALE;
+ remaining -= TCPOLEN_WSCALE_ALIGNED;
+ }
+ if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_sack))) {
+ opts->options |= OPTION_SACK_ADVERTISE;
+ if (unlikely(!(OPTION_TS & opts->options)))
+ remaining -= TCPOLEN_SACKPERM_ALIGNED;
+ }
+
+ if (fastopen && fastopen->cookie.len >= 0) {
+ u32 need = fastopen->cookie.len;
+
+ need += fastopen->cookie.exp ? TCPOLEN_EXP_FASTOPEN_BASE :
+ TCPOLEN_FASTOPEN_BASE;
+ need = (need + 3) & ~3U; /* Align to 32 bits */
+ if (remaining >= need) {
+ opts->options |= OPTION_FAST_OPEN_COOKIE;
+ opts->fastopen_cookie = &fastopen->cookie;
+ remaining -= need;
+ tp->syn_fastopen = 1;
+ tp->syn_fastopen_exp = fastopen->cookie.exp ? 1 : 0;
+ }
+ }
+
+ smc_set_option(tp, opts, &remaining);
+
+ if (sk_is_mptcp(sk)) {
+ unsigned int size;
+
+ if (mptcp_syn_options(sk, skb, &size, &opts->mptcp)) {
+ if (remaining >= size) {
+ opts->options |= OPTION_MPTCP;
+ remaining -= size;
+ }
+ }
+ }
+
+ /* Simultaneous open SYN/ACK needs AccECN option but not SYN.
+ * It is attempted to negotiate the use of AccECN also on the first
+ * retransmitted SYN, as mentioned in "3.1.4.1. Retransmitted SYNs"
+ * of AccECN draft.
+ */
+ if (unlikely((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK) &&
+ tcp_ecn_mode_accecn(tp) &&
+ inet_csk(sk)->icsk_retransmits < 2 &&
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option) &&
+ remaining >= TCPOLEN_ACCECN_BASE)) {
+ opts->use_synack_ecn_bytes = 1;
+ remaining -= tcp_options_fit_accecn(opts, 0, remaining);
+ }
+
+ bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining);
+
+ return MAX_TCP_OPTION_SPACE - remaining;
+}
+
+/* Set up TCP options for SYN-ACKs. */
+static unsigned int tcp_synack_options(const struct sock *sk,
+ struct request_sock *req,
+ unsigned int mss, struct sk_buff *skb,
+ struct tcp_out_options *opts,
+ const struct tcp_key *key,
+ struct tcp_fastopen_cookie *foc,
+ enum tcp_synack_type synack_type,
+ struct sk_buff *syn_skb)
+{
+ struct inet_request_sock *ireq = inet_rsk(req);
+ unsigned int remaining = MAX_TCP_OPTION_SPACE;
+ struct tcp_request_sock *treq = tcp_rsk(req);
+
+ if (tcp_key_is_md5(key)) {
+ opts->options |= OPTION_MD5;
+ remaining -= TCPOLEN_MD5SIG_ALIGNED;
+
+ /* We can't fit any SACK blocks in a packet with MD5 + TS
+ * options. There was discussion about disabling SACK
+ * rather than TS in order to fit in better with old,
+ * buggy kernels, but that was deemed to be unnecessary.
+ */
+ if (synack_type != TCP_SYNACK_COOKIE)
+ ireq->tstamp_ok &= !ireq->sack_ok;
+ } else if (tcp_key_is_ao(key)) {
+ opts->options |= OPTION_AO;
+ remaining -= tcp_ao_len_aligned(key->ao_key);
+ ireq->tstamp_ok &= !ireq->sack_ok;
+ }
+
+ /* We always send an MSS option. */
+ opts->mss = mss;
+ remaining -= TCPOLEN_MSS_ALIGNED;
+
+ if (likely(ireq->wscale_ok)) {
+ opts->ws = ireq->rcv_wscale;
+ opts->options |= OPTION_WSCALE;
+ remaining -= TCPOLEN_WSCALE_ALIGNED;
+ }
+ if (likely(ireq->tstamp_ok)) {
+ opts->options |= OPTION_TS;
+ opts->tsval = tcp_skb_timestamp_ts(tcp_rsk(req)->req_usec_ts, skb) +
+ tcp_rsk(req)->ts_off;
+ if (!tcp_rsk(req)->snt_tsval_first) {
+ if (!opts->tsval)
+ opts->tsval = ~0U;
+ tcp_rsk(req)->snt_tsval_first = opts->tsval;
+ }
+ WRITE_ONCE(tcp_rsk(req)->snt_tsval_last, opts->tsval);
+ opts->tsecr = req->ts_recent;
+ remaining -= TCPOLEN_TSTAMP_ALIGNED;
+ }
+ if (likely(ireq->sack_ok)) {
+ opts->options |= OPTION_SACK_ADVERTISE;
+ if (unlikely(!ireq->tstamp_ok))
+ remaining -= TCPOLEN_SACKPERM_ALIGNED;
+ }
+ if (foc != NULL && foc->len >= 0) {
+ u32 need = foc->len;
+
+ need += foc->exp ? TCPOLEN_EXP_FASTOPEN_BASE :
+ TCPOLEN_FASTOPEN_BASE;
+ need = (need + 3) & ~3U; /* Align to 32 bits */
+ if (remaining >= need) {
+ opts->options |= OPTION_FAST_OPEN_COOKIE;
+ opts->fastopen_cookie = foc;
+ remaining -= need;
+ }
+ }
+
+ mptcp_set_option_cond(req, opts, &remaining);
+
+ smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining);
+
+ if (treq->accecn_ok &&
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option) &&
+ req->num_timeout < 1 && remaining >= TCPOLEN_ACCECN_BASE) {
+ opts->use_synack_ecn_bytes = 1;
+ remaining -= tcp_options_fit_accecn(opts, 0, remaining);
+ }
+
+ bpf_skops_hdr_opt_len((struct sock *)sk, skb, req, syn_skb,
+ synack_type, opts, &remaining);
+
+ return MAX_TCP_OPTION_SPACE - remaining;
+}
+
+/* Compute TCP options for ESTABLISHED sockets. This is not the
+ * final wire format yet.
+ */
+static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb,
+ struct tcp_out_options *opts,
+ struct tcp_key *key)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ unsigned int size = 0;
+ unsigned int eff_sacks;
+
+ opts->options = 0;
+
+ /* Better than switch (key.type) as it has static branches */
+ if (tcp_key_is_md5(key)) {
+ opts->options |= OPTION_MD5;
+ size += TCPOLEN_MD5SIG_ALIGNED;
+ } else if (tcp_key_is_ao(key)) {
+ opts->options |= OPTION_AO;
+ size += tcp_ao_len_aligned(key->ao_key);
+ }
+
+ if (likely(tp->rx_opt.tstamp_ok)) {
+ opts->options |= OPTION_TS;
+ opts->tsval = skb ? tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb) +
+ tp->tsoffset : 0;
+ opts->tsecr = tp->rx_opt.ts_recent;
+ size += TCPOLEN_TSTAMP_ALIGNED;
+ }
+
+ /* MPTCP options have precedence over SACK for the limited TCP
+ * option space because a MPTCP connection would be forced to
+ * fall back to regular TCP if a required multipath option is
+ * missing. SACK still gets a chance to use whatever space is
+ * left.
+ */
+ if (sk_is_mptcp(sk)) {
+ unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
+ unsigned int opt_size = 0;
+
+ if (mptcp_established_options(sk, skb, &opt_size, remaining,
+ &opts->mptcp)) {
+ opts->options |= OPTION_MPTCP;
+ size += opt_size;
+ }
+ }
+
+ eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
+ if (unlikely(eff_sacks)) {
+ const unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
+ if (likely(remaining >= TCPOLEN_SACK_BASE_ALIGNED +
+ TCPOLEN_SACK_PERBLOCK)) {
+ opts->num_sack_blocks =
+ min_t(unsigned int, eff_sacks,
+ (remaining - TCPOLEN_SACK_BASE_ALIGNED) /
+ TCPOLEN_SACK_PERBLOCK);
+
+ size += TCPOLEN_SACK_BASE_ALIGNED +
+ opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
+ } else {
+ opts->num_sack_blocks = 0;
+ }
+ } else {
+ opts->num_sack_blocks = 0;
+ }
+
+ if (tcp_ecn_mode_accecn(tp)) {
+ int ecn_opt = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option);
+
+ if (ecn_opt && tp->saw_accecn_opt && !tcp_accecn_opt_fail_send(tp) &&
+ (ecn_opt >= TCP_ACCECN_OPTION_FULL || tp->accecn_opt_demand ||
+ tcp_accecn_option_beacon_check(sk))) {
+ opts->use_synack_ecn_bytes = 0;
+ size += tcp_options_fit_accecn(opts, tp->accecn_minlen,
+ MAX_TCP_OPTION_SPACE - size);
+ }
+ }
+
+ if (unlikely(BPF_SOCK_OPS_TEST_FLAG(tp,
+ BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG))) {
+ unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
+
+ bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining);
+
+ size = MAX_TCP_OPTION_SPACE - remaining;
+ }
+
+ return size;
+}
+
+
+/* TCP SMALL QUEUES (TSQ)
+ *
+ * TSQ goal is to keep small amount of skbs per tcp flow in tx queues (qdisc+dev)
+ * to reduce RTT and bufferbloat.
+ * We do this using a special skb destructor (tcp_wfree).
+ *
+ * Its important tcp_wfree() can be replaced by sock_wfree() in the event skb
+ * needs to be reallocated in a driver.
+ * The invariant being skb->truesize subtracted from sk->sk_wmem_alloc
+ *
+ * Since transmit from skb destructor is forbidden, we use a BH work item
+ * to process all sockets that eventually need to send more skbs.
+ * We use one work item per cpu, with its own queue of sockets.
+ */
+struct tsq_work {
+ struct work_struct work;
+ struct list_head head; /* queue of tcp sockets */
+};
+static DEFINE_PER_CPU(struct tsq_work, tsq_work);
+
+static void tcp_tsq_write(struct sock *sk)
+{
+ if ((1 << sk->sk_state) &
+ (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING |
+ TCPF_CLOSE_WAIT | TCPF_LAST_ACK)) {
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (tp->lost_out > tp->retrans_out &&
+ tcp_snd_cwnd(tp) > tcp_packets_in_flight(tp)) {
+ tcp_mstamp_refresh(tp);
+ tcp_xmit_retransmit_queue(sk);
+ }
+
+ tcp_write_xmit(sk, tcp_current_mss(sk), tp->nonagle,
+ 0, GFP_ATOMIC);
+ }
+}
+
+static void tcp_tsq_handler(struct sock *sk)
+{
+ bh_lock_sock(sk);
+ if (!sock_owned_by_user(sk))
+ tcp_tsq_write(sk);
+ else if (!test_and_set_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags))
+ sock_hold(sk);
+ bh_unlock_sock(sk);
+}
+/*
+ * One work item per cpu tries to send more skbs.
+ * We run in BH context but need to disable irqs when
+ * transferring tsq->head because tcp_wfree() might
+ * interrupt us (non NAPI drivers)
+ */
+static void tcp_tsq_workfn(struct work_struct *work)
+{
+ struct tsq_work *tsq = container_of(work, struct tsq_work, work);
+ LIST_HEAD(list);
+ unsigned long flags;
+ struct list_head *q, *n;
+ struct tcp_sock *tp;
+ struct sock *sk;
+
+ local_irq_save(flags);
+ list_splice_init(&tsq->head, &list);
+ local_irq_restore(flags);
+
+ list_for_each_safe(q, n, &list) {
+ tp = list_entry(q, struct tcp_sock, tsq_node);
+ list_del(&tp->tsq_node);
+
+ sk = (struct sock *)tp;
+ smp_mb__before_atomic();
+ clear_bit(TSQ_QUEUED, &sk->sk_tsq_flags);
+
+ tcp_tsq_handler(sk);
+ sk_free(sk);
+ }
+}
+
+#define TCP_DEFERRED_ALL (TCPF_TSQ_DEFERRED | \
+ TCPF_WRITE_TIMER_DEFERRED | \
+ TCPF_DELACK_TIMER_DEFERRED | \
+ TCPF_MTU_REDUCED_DEFERRED | \
+ TCPF_ACK_DEFERRED)
+/**
+ * tcp_release_cb - tcp release_sock() callback
+ * @sk: socket
+ *
+ * called from release_sock() to perform protocol dependent
+ * actions before socket release.
+ */
+void tcp_release_cb(struct sock *sk)
+{
+ unsigned long flags = smp_load_acquire(&sk->sk_tsq_flags);
+ unsigned long nflags;
+
+ /* perform an atomic operation only if at least one flag is set */
+ do {
+ if (!(flags & TCP_DEFERRED_ALL))
+ return;
+ nflags = flags & ~TCP_DEFERRED_ALL;
+ } while (!try_cmpxchg(&sk->sk_tsq_flags, &flags, nflags));
+
+ if (flags & TCPF_TSQ_DEFERRED) {
+ tcp_tsq_write(sk);
+ __sock_put(sk);
+ }
+
+ if (flags & TCPF_WRITE_TIMER_DEFERRED) {
+ tcp_write_timer_handler(sk);
+ __sock_put(sk);
+ }
+ if (flags & TCPF_DELACK_TIMER_DEFERRED) {
+ tcp_delack_timer_handler(sk);
+ __sock_put(sk);
+ }
+ if (flags & TCPF_MTU_REDUCED_DEFERRED) {
+ inet_csk(sk)->icsk_af_ops->mtu_reduced(sk);
+ __sock_put(sk);
+ }
+ if ((flags & TCPF_ACK_DEFERRED) && inet_csk_ack_scheduled(sk))
+ tcp_send_ack(sk);
+}
+EXPORT_IPV6_MOD(tcp_release_cb);
+
+void __init tcp_tsq_work_init(void)
+{
+ int i;
+
+ for_each_possible_cpu(i) {
+ struct tsq_work *tsq = &per_cpu(tsq_work, i);
+
+ INIT_LIST_HEAD(&tsq->head);
+ INIT_WORK(&tsq->work, tcp_tsq_workfn);
+ }
+}
+
+/*
+ * Write buffer destructor automatically called from kfree_skb.
+ * We can't xmit new skbs from this context, as we might already
+ * hold qdisc lock.
+ */
+void tcp_wfree(struct sk_buff *skb)
+{
+ struct sock *sk = skb->sk;
+ struct tcp_sock *tp = tcp_sk(sk);
+ unsigned long flags, nval, oval;
+ struct tsq_work *tsq;
+ bool empty;
+
+ /* Keep one reference on sk_wmem_alloc.
+ * Will be released by sk_free() from here or tcp_tsq_workfn()
+ */
+ WARN_ON(refcount_sub_and_test(skb->truesize - 1, &sk->sk_wmem_alloc));
+
+ /* If this softirq is serviced by ksoftirqd, we are likely under stress.
+ * Wait until our queues (qdisc + devices) are drained.
+ * This gives :
+ * - less callbacks to tcp_write_xmit(), reducing stress (batches)
+ * - chance for incoming ACK (processed by another cpu maybe)
+ * to migrate this flow (skb->ooo_okay will be eventually set)
+ */
+ if (refcount_read(&sk->sk_wmem_alloc) >= SKB_TRUESIZE(1) && this_cpu_ksoftirqd() == current)
+ goto out;
+
+ oval = smp_load_acquire(&sk->sk_tsq_flags);
+ do {
+ if (!(oval & TSQF_THROTTLED) || (oval & TSQF_QUEUED))
+ goto out;
+
+ nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED;
+ } while (!try_cmpxchg(&sk->sk_tsq_flags, &oval, nval));
+
+ /* queue this socket to BH workqueue */
+ local_irq_save(flags);
+ tsq = this_cpu_ptr(&tsq_work);
+ empty = list_empty(&tsq->head);
+ list_add(&tp->tsq_node, &tsq->head);
+ if (empty)
+ queue_work(system_bh_wq, &tsq->work);
+ local_irq_restore(flags);
+ return;
+out:
+ sk_free(sk);
+}
+
+/* Note: Called under soft irq.
+ * We can call TCP stack right away, unless socket is owned by user.
+ */
+enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer)
+{
+ struct tcp_sock *tp = container_of(timer, struct tcp_sock, pacing_timer);
+ struct sock *sk = (struct sock *)tp;
+
+ tcp_tsq_handler(sk);
+ sock_put(sk);
+
+ return HRTIMER_NORESTART;
+}
+
+static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb,
+ u64 prior_wstamp)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (sk->sk_pacing_status != SK_PACING_NONE) {
+ unsigned long rate = READ_ONCE(sk->sk_pacing_rate);
+
+ /* Original sch_fq does not pace first 10 MSS
+ * Note that tp->data_segs_out overflows after 2^32 packets,
+ * this is a minor annoyance.
+ */
+ if (rate != ~0UL && rate && tp->data_segs_out >= 10) {
+ u64 len_ns = div64_ul((u64)skb->len * NSEC_PER_SEC, rate);
+ u64 credit = tp->tcp_wstamp_ns - prior_wstamp;
+
+ /* take into account OS jitter */
+ len_ns -= min_t(u64, len_ns / 2, credit);
+ tp->tcp_wstamp_ns += len_ns;
+ }
+ }
+ list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
+}
+
+INDIRECT_CALLABLE_DECLARE(int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl));
+INDIRECT_CALLABLE_DECLARE(int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl));
+INDIRECT_CALLABLE_DECLARE(void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb));
+
/* This routine actually transmits TCP packets queued in by
* tcp_do_sendmsg(). This is used by both the initial
* transmission and possible later retransmissions.
@@ -386,175 +1447,210 @@ static void tcp_syn_build_options(__be32 *ptr, int mss, int ts, int sack,
* We are working here with either a clone of the original
* SKB, or a fresh unique copy made by the retransmit engine.
*/
-static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, gfp_t gfp_mask)
+static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
+ int clone_it, gfp_t gfp_mask, u32 rcv_nxt)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
struct inet_sock *inet;
struct tcp_sock *tp;
struct tcp_skb_cb *tcb;
- int tcp_header_size;
-#ifdef CONFIG_TCP_MD5SIG
- struct tcp_md5sig_key *md5;
- __u8 *md5_hash_location;
-#endif
+ struct tcp_out_options opts;
+ unsigned int tcp_options_size, tcp_header_size;
+ struct sk_buff *oskb = NULL;
+ struct tcp_key key;
struct tcphdr *th;
- int sysctl_flags;
+ u64 prior_wstamp;
int err;
BUG_ON(!skb || !tcp_skb_pcount(skb));
+ tp = tcp_sk(sk);
+ prior_wstamp = tp->tcp_wstamp_ns;
+ tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache);
+ skb_set_delivery_time(skb, tp->tcp_wstamp_ns, SKB_CLOCK_MONOTONIC);
+ if (clone_it) {
+ oskb = skb;
+
+ tcp_skb_tsorted_save(oskb) {
+ if (unlikely(skb_cloned(oskb)))
+ skb = pskb_copy(oskb, gfp_mask);
+ else
+ skb = skb_clone(oskb, gfp_mask);
+ } tcp_skb_tsorted_restore(oskb);
- /* If congestion control is doing timestamping, we must
- * take such a timestamp before we potentially clone/copy.
- */
- if (icsk->icsk_ca_ops->rtt_sample)
- __net_timestamp(skb);
-
- if (likely(clone_it)) {
- if (unlikely(skb_cloned(skb)))
- skb = pskb_copy(skb, gfp_mask);
- else
- skb = skb_clone(skb, gfp_mask);
if (unlikely(!skb))
return -ENOBUFS;
+ /* retransmit skbs might have a non zero value in skb->dev
+ * because skb->dev is aliased with skb->rbnode.rb_left
+ */
+ skb->dev = NULL;
}
inet = inet_sk(sk);
- tp = tcp_sk(sk);
tcb = TCP_SKB_CB(skb);
- tcp_header_size = tp->tcp_header_len;
-
-#define SYSCTL_FLAG_TSTAMPS 0x1
-#define SYSCTL_FLAG_WSCALE 0x2
-#define SYSCTL_FLAG_SACK 0x4
-
- sysctl_flags = 0;
- if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
- tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
- if(sysctl_tcp_timestamps) {
- tcp_header_size += TCPOLEN_TSTAMP_ALIGNED;
- sysctl_flags |= SYSCTL_FLAG_TSTAMPS;
- }
- if (sysctl_tcp_window_scaling) {
- tcp_header_size += TCPOLEN_WSCALE_ALIGNED;
- sysctl_flags |= SYSCTL_FLAG_WSCALE;
- }
- if (sysctl_tcp_sack) {
- sysctl_flags |= SYSCTL_FLAG_SACK;
- if (!(sysctl_flags & SYSCTL_FLAG_TSTAMPS))
- tcp_header_size += TCPOLEN_SACKPERM_ALIGNED;
- }
- } else if (unlikely(tp->rx_opt.eff_sacks)) {
- /* A SACK is 2 pad bytes, a 2 byte header, plus
- * 2 32-bit sequence numbers for each SACK block.
+ memset(&opts, 0, sizeof(opts));
+
+ tcp_get_current_key(sk, &key);
+ if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) {
+ tcp_options_size = tcp_syn_options(sk, skb, &opts, &key);
+ } else {
+ tcp_options_size = tcp_established_options(sk, skb, &opts, &key);
+ /* Force a PSH flag on all (GSO) packets to expedite GRO flush
+ * at receiver : This slightly improve GRO performance.
+ * Note that we do not force the PSH flag for non GSO packets,
+ * because they might be sent under high congestion events,
+ * and in this case it is better to delay the delivery of 1-MSS
+ * packets and thus the corresponding ACK packet that would
+ * release the following packet.
*/
- tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED +
- (tp->rx_opt.eff_sacks *
- TCPOLEN_SACK_PERBLOCK));
+ if (tcp_skb_pcount(skb) > 1)
+ tcb->tcp_flags |= TCPHDR_PSH;
}
-
- if (tcp_packets_in_flight(tp) == 0)
- tcp_ca_event(sk, CA_EVENT_TX_START);
+ tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
+
+ /* We set skb->ooo_okay to one if this packet can select
+ * a different TX queue than prior packets of this flow,
+ * to avoid self inflicted reorders.
+ * The 'other' queue decision is based on current cpu number
+ * if XPS is enabled, or sk->sk_txhash otherwise.
+ * We can switch to another (and better) queue if:
+ * 1) No packet with payload is in qdisc/device queues.
+ * Delays in TX completion can defeat the test
+ * even if packets were already sent.
+ * 2) Or rtx queue is empty.
+ * This mitigates above case if ACK packets for
+ * all prior packets were already processed.
+ */
+ skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1) ||
+ tcp_rtx_queue_empty(sk);
-#ifdef CONFIG_TCP_MD5SIG
- /*
- * Are we doing MD5 on this segment? If so - make
- * room for it.
+ /* If we had to use memory reserve to allocate this skb,
+ * this might cause drops if packet is looped back :
+ * Other socket might not have SOCK_MEMALLOC.
+ * Packets not looped back do not care about pfmemalloc.
*/
- md5 = tp->af_specific->md5_lookup(sk, sk);
- if (md5)
- tcp_header_size += TCPOLEN_MD5SIG_ALIGNED;
-#endif
+ skb->pfmemalloc = 0;
+
+ skb_push(skb, tcp_header_size);
+ skb_reset_transport_header(skb);
- th = (struct tcphdr *) skb_push(skb, tcp_header_size);
- skb->h.th = th;
+ skb_orphan(skb);
+ skb->sk = sk;
+ skb->destructor = skb_is_tcp_pure_ack(skb) ? __sock_wfree : tcp_wfree;
+ refcount_add(skb->truesize, &sk->sk_wmem_alloc);
+
+ skb_set_dst_pending_confirm(skb, READ_ONCE(sk->sk_dst_pending_confirm));
/* Build TCP header and checksum it. */
- th->source = inet->sport;
- th->dest = inet->dport;
+ th = (struct tcphdr *)skb->data;
+ th->source = inet->inet_sport;
+ th->dest = inet->inet_dport;
th->seq = htonl(tcb->seq);
- th->ack_seq = htonl(tp->rcv_nxt);
+ th->ack_seq = htonl(rcv_nxt);
*(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) |
- tcb->flags);
+ (tcb->tcp_flags & TCPHDR_FLAGS_MASK));
- if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
- /* RFC1323: The window in SYN & SYN/ACK segments
- * is never scaled.
- */
- th->window = htons(tp->rcv_wnd);
- } else {
- th->window = htons(tcp_select_window(sk));
- }
th->check = 0;
th->urg_ptr = 0;
- if (unlikely(tp->urg_mode &&
- between(tp->snd_up, tcb->seq+1, tcb->seq+0xFFFF))) {
- th->urg_ptr = htons(tp->snd_up-tcb->seq);
- th->urg = 1;
+ /* The urg_mode check is necessary during a below snd_una win probe */
+ if (unlikely(tcp_urg_mode(tp) && before(tcb->seq, tp->snd_up))) {
+ if (before(tp->snd_up, tcb->seq + 0x10000)) {
+ th->urg_ptr = htons(tp->snd_up - tcb->seq);
+ th->urg = 1;
+ } else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) {
+ th->urg_ptr = htons(0xFFFF);
+ th->urg = 1;
+ }
}
- if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
- tcp_syn_build_options((__be32 *)(th + 1),
- tcp_advertise_mss(sk),
- (sysctl_flags & SYSCTL_FLAG_TSTAMPS),
- (sysctl_flags & SYSCTL_FLAG_SACK),
- (sysctl_flags & SYSCTL_FLAG_WSCALE),
- tp->rx_opt.rcv_wscale,
- tcb->when,
- tp->rx_opt.ts_recent,
-
-#ifdef CONFIG_TCP_MD5SIG
- md5 ? &md5_hash_location :
-#endif
- NULL);
+ skb_shinfo(skb)->gso_type = sk->sk_gso_type;
+ if (likely(!(tcb->tcp_flags & TCPHDR_SYN))) {
+ th->window = htons(tcp_select_window(sk));
+ tcp_ecn_send(sk, skb, th, tcp_header_size);
} else {
- tcp_build_and_update_options((__be32 *)(th + 1),
- tp, tcb->when,
-#ifdef CONFIG_TCP_MD5SIG
- md5 ? &md5_hash_location :
-#endif
- NULL);
- TCP_ECN_send(sk, tp, skb, tcp_header_size);
+ /* RFC1323: The window in SYN & SYN/ACK segments
+ * is never scaled.
+ */
+ th->window = htons(min(tp->rcv_wnd, 65535U));
}
+ tcp_options_write(th, tp, NULL, &opts, &key);
+
+ if (tcp_key_is_md5(&key)) {
#ifdef CONFIG_TCP_MD5SIG
- /* Calculate the MD5 hash, as we have all we need now */
- if (md5) {
- tp->af_specific->calc_md5_hash(md5_hash_location,
- md5,
- sk, NULL, NULL,
- skb->h.th,
- sk->sk_protocol,
- skb->len);
- }
+ /* Calculate the MD5 hash, as we have all we need now */
+ sk_gso_disable(sk);
+ tp->af_specific->calc_md5_hash(opts.hash_location,
+ key.md5_key, sk, skb);
#endif
+ } else if (tcp_key_is_ao(&key)) {
+ int err;
+
+ err = tcp_ao_transmit_skb(sk, skb, key.ao_key, th,
+ opts.hash_location);
+ if (err) {
+ sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_NOT_SPECIFIED);
+ return -ENOMEM;
+ }
+ }
- icsk->icsk_af_ops->send_check(sk, skb->len, skb);
+ /* BPF prog is the last one writing header option */
+ bpf_skops_write_hdr_opt(sk, skb, NULL, NULL, 0, &opts);
- if (likely(tcb->flags & TCPCB_FLAG_ACK))
- tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
+ INDIRECT_CALL_INET(icsk->icsk_af_ops->send_check,
+ tcp_v6_send_check, tcp_v4_send_check,
+ sk, skb);
- if (skb->len != tcp_header_size)
- tcp_event_data_sent(tp, skb, sk);
+ if (likely(tcb->tcp_flags & TCPHDR_ACK))
+ tcp_event_ack_sent(sk, rcv_nxt);
+
+ if (skb->len != tcp_header_size) {
+ tcp_event_data_sent(tp, sk);
+ tp->data_segs_out += tcp_skb_pcount(skb);
+ tp->bytes_sent += skb->len - tcp_header_size;
+ }
if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
- TCP_INC_STATS(TCP_MIB_OUTSEGS);
+ TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS,
+ tcp_skb_pcount(skb));
- err = icsk->icsk_af_ops->queue_xmit(skb, sk, 0);
- if (likely(err <= 0))
- return err;
+ tp->segs_out += tcp_skb_pcount(skb);
+ skb_set_hash_from_sk(skb, sk);
+ /* OK, its time to fill skb_shinfo(skb)->gso_{segs|size} */
+ skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb);
+ skb_shinfo(skb)->gso_size = tcp_skb_mss(skb);
+
+ /* Leave earliest departure time in skb->tstamp (skb->skb_mstamp_ns) */
- tcp_enter_cwr(sk);
+ /* Cleanup our debris for IP stacks */
+ memset(skb->cb, 0, max(sizeof(struct inet_skb_parm),
+ sizeof(struct inet6_skb_parm)));
- return net_xmit_eval(err);
+ tcp_add_tx_delay(skb, tp);
-#undef SYSCTL_FLAG_TSTAMPS
-#undef SYSCTL_FLAG_WSCALE
-#undef SYSCTL_FLAG_SACK
+ err = INDIRECT_CALL_INET(icsk->icsk_af_ops->queue_xmit,
+ inet6_csk_xmit, ip_queue_xmit,
+ sk, skb, &inet->cork.fl);
+
+ if (unlikely(err > 0)) {
+ tcp_enter_cwr(sk);
+ err = net_xmit_eval(err);
+ }
+ if (!err && oskb) {
+ tcp_update_skb_after_send(sk, oskb, prior_wstamp);
+ tcp_rate_skb_sent(sk, oskb);
+ }
+ return err;
}
+static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
+ gfp_t gfp_mask)
+{
+ return __tcp_transmit_skb(sk, skb, clone_it, gfp_mask,
+ tcp_sk(sk)->rcv_nxt);
+}
-/* This routine just queue's the buffer
+/* This routine just queues the buffer for sending.
*
* NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
* otherwise socket can stall.
@@ -564,68 +1660,145 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
struct tcp_sock *tp = tcp_sk(sk);
/* Advance write_seq and place onto the write_queue. */
- tp->write_seq = TCP_SKB_CB(skb)->end_seq;
- skb_header_release(skb);
- __skb_queue_tail(&sk->sk_write_queue, skb);
- sk_charge_skb(sk, skb);
-
- /* Queue it, remembering where we must start sending. */
- if (sk->sk_send_head == NULL)
- sk->sk_send_head = skb;
+ WRITE_ONCE(tp->write_seq, TCP_SKB_CB(skb)->end_seq);
+ __skb_header_release(skb);
+ psp_enqueue_set_decrypted(sk, skb);
+ tcp_add_write_queue_tail(sk, skb);
+ sk_wmem_queued_add(sk, skb->truesize);
+ sk_mem_charge(sk, skb->truesize);
}
-static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now)
+/* Initialize TSO segments for a packet. */
+static int tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now)
{
- if (skb->len <= mss_now || !sk_can_gso(sk)) {
+ int tso_segs;
+
+ if (skb->len <= mss_now) {
/* Avoid the costly divide in the normal
* non-TSO case.
*/
- skb_shinfo(skb)->gso_segs = 1;
- skb_shinfo(skb)->gso_size = 0;
- skb_shinfo(skb)->gso_type = 0;
- } else {
- unsigned int factor;
+ TCP_SKB_CB(skb)->tcp_gso_size = 0;
+ tcp_skb_pcount_set(skb, 1);
+ return 1;
+ }
+ TCP_SKB_CB(skb)->tcp_gso_size = mss_now;
+ tso_segs = DIV_ROUND_UP(skb->len, mss_now);
+ tcp_skb_pcount_set(skb, tso_segs);
+ return tso_segs;
+}
+
+/* Pcount in the middle of the write queue got changed, we need to do various
+ * tweaks to fix counters
+ */
+static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ tp->packets_out -= decr;
+
+ if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
+ tp->sacked_out -= decr;
+ if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
+ tp->retrans_out -= decr;
+ if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST)
+ tp->lost_out -= decr;
- factor = skb->len + (mss_now - 1);
- factor /= mss_now;
- skb_shinfo(skb)->gso_segs = factor;
- skb_shinfo(skb)->gso_size = mss_now;
- skb_shinfo(skb)->gso_type = sk->sk_gso_type;
+ /* Reno case is special. Sigh... */
+ if (tcp_is_reno(tp) && decr > 0)
+ tp->sacked_out -= min_t(u32, tp->sacked_out, decr);
+
+ tcp_verify_left_out(tp);
+}
+
+static bool tcp_has_tx_tstamp(const struct sk_buff *skb)
+{
+ return TCP_SKB_CB(skb)->txstamp_ack ||
+ (skb_shinfo(skb)->tx_flags & SKBTX_ANY_TSTAMP);
+}
+
+static void tcp_fragment_tstamp(struct sk_buff *skb, struct sk_buff *skb2)
+{
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+
+ if (unlikely(tcp_has_tx_tstamp(skb)) &&
+ !before(shinfo->tskey, TCP_SKB_CB(skb2)->seq)) {
+ struct skb_shared_info *shinfo2 = skb_shinfo(skb2);
+ u8 tsflags = shinfo->tx_flags & SKBTX_ANY_TSTAMP;
+
+ shinfo->tx_flags &= ~tsflags;
+ shinfo2->tx_flags |= tsflags;
+ swap(shinfo->tskey, shinfo2->tskey);
+ TCP_SKB_CB(skb2)->txstamp_ack = TCP_SKB_CB(skb)->txstamp_ack;
+ TCP_SKB_CB(skb)->txstamp_ack = 0;
}
}
+static void tcp_skb_fragment_eor(struct sk_buff *skb, struct sk_buff *skb2)
+{
+ TCP_SKB_CB(skb2)->eor = TCP_SKB_CB(skb)->eor;
+ TCP_SKB_CB(skb)->eor = 0;
+}
+
+/* Insert buff after skb on the write or rtx queue of sk. */
+static void tcp_insert_write_queue_after(struct sk_buff *skb,
+ struct sk_buff *buff,
+ struct sock *sk,
+ enum tcp_queue tcp_queue)
+{
+ if (tcp_queue == TCP_FRAG_IN_WRITE_QUEUE)
+ __skb_queue_after(&sk->sk_write_queue, skb, buff);
+ else
+ tcp_rbtree_insert(&sk->tcp_rtx_queue, buff);
+}
+
/* Function to create two new TCP segments. Shrinks the given segment
* to the specified size and appends a new segment with the rest of the
- * packet to the list. This won't be called frequently, I hope.
+ * packet to the list. This won't be called frequently, I hope.
* Remember, these are still headerless SKBs at this point.
*/
-int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss_now)
+int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
+ struct sk_buff *skb, u32 len,
+ unsigned int mss_now, gfp_t gfp)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *buff;
- int nsize, old_factor;
- int nlen;
+ int old_factor;
+ long limit;
u16 flags;
+ int nlen;
- BUG_ON(len > skb->len);
+ if (WARN_ON(len > skb->len))
+ return -EINVAL;
- clear_all_retrans_hints(tp);
- nsize = skb_headlen(skb) - len;
- if (nsize < 0)
- nsize = 0;
+ DEBUG_NET_WARN_ON_ONCE(skb_headlen(skb));
+
+ /* tcp_sendmsg() can overshoot sk_wmem_queued by one full size skb.
+ * We need some allowance to not penalize applications setting small
+ * SO_SNDBUF values.
+ * Also allow first and last skb in retransmit queue to be split.
+ */
+ limit = sk->sk_sndbuf + 2 * SKB_TRUESIZE(GSO_LEGACY_MAX_SIZE);
+ if (unlikely((sk->sk_wmem_queued >> 1) > limit &&
+ tcp_queue != TCP_FRAG_IN_WRITE_QUEUE &&
+ skb != tcp_rtx_queue_head(sk) &&
+ skb != tcp_rtx_queue_tail(sk))) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPWQUEUETOOBIG);
+ return -ENOMEM;
+ }
- if (skb_cloned(skb) &&
- skb_is_nonlinear(skb) &&
- pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
+ if (skb_unclone_keeptruesize(skb, gfp))
return -ENOMEM;
/* Get a new skb... force flag on. */
- buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC);
- if (buff == NULL)
+ buff = tcp_stream_alloc_skb(sk, gfp, true);
+ if (!buff)
return -ENOMEM; /* We'll just try again later. */
+ skb_copy_decrypted(buff, skb);
+ mptcp_skb_ext_copy(buff, skb);
- sk_charge_skb(sk, buff);
- nlen = skb->len - len - nsize;
+ sk_wmem_queued_add(sk, buff->truesize);
+ sk_mem_charge(sk, buff->truesize);
+ nlen = skb->len - len;
buff->truesize += nlen;
skb->truesize -= nlen;
@@ -635,38 +1808,25 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss
TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
/* PSH and FIN should only be set in the second packet. */
- flags = TCP_SKB_CB(skb)->flags;
- TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
- TCP_SKB_CB(buff)->flags = flags;
+ flags = TCP_SKB_CB(skb)->tcp_flags;
+ TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
+ TCP_SKB_CB(buff)->tcp_flags = flags;
TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked;
- TCP_SKB_CB(skb)->sacked &= ~TCPCB_AT_TAIL;
-
- if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_PARTIAL) {
- /* Copy and checksum data tail into the new buffer. */
- buff->csum = csum_partial_copy_nocheck(skb->data + len, skb_put(buff, nsize),
- nsize, 0);
+ tcp_skb_fragment_eor(skb, buff);
- skb_trim(skb, len);
-
- skb->csum = csum_block_sub(skb->csum, buff->csum, len);
- } else {
- skb->ip_summed = CHECKSUM_PARTIAL;
- skb_split(skb, buff, len);
- }
-
- buff->ip_summed = skb->ip_summed;
+ skb_split(skb, buff, len);
- /* Looks stupid, but our code really uses when of
- * skbs, which it never sent before. --ANK
- */
- TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
- buff->tstamp = skb->tstamp;
+ skb_set_delivery_time(buff, skb->tstamp, SKB_CLOCK_MONOTONIC);
+ tcp_fragment_tstamp(skb, buff);
old_factor = tcp_skb_pcount(skb);
/* Fix up tso_factor for both original and new SKB. */
- tcp_set_skb_tso_segs(sk, skb, mss_now);
- tcp_set_skb_tso_segs(sk, buff, mss_now);
+ tcp_set_skb_tso_segs(skb, mss_now);
+ tcp_set_skb_tso_segs(buff, mss_now);
+
+ /* Update delivered info for the new segment */
+ TCP_SKB_CB(buff)->tx = TCP_SKB_CB(skb)->tx;
/* If this packet has been sent out already, we must
* adjust the various packet counters.
@@ -675,105 +1835,83 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss
int diff = old_factor - tcp_skb_pcount(skb) -
tcp_skb_pcount(buff);
- tp->packets_out -= diff;
-
- if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
- tp->sacked_out -= diff;
- if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
- tp->retrans_out -= diff;
-
- if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
- tp->lost_out -= diff;
- tp->left_out -= diff;
- }
-
- if (diff > 0) {
- /* Adjust Reno SACK estimate. */
- if (!tp->rx_opt.sack_ok) {
- tp->sacked_out -= diff;
- if ((int)tp->sacked_out < 0)
- tp->sacked_out = 0;
- tcp_sync_left_out(tp);
- }
-
- tp->fackets_out -= diff;
- if ((int)tp->fackets_out < 0)
- tp->fackets_out = 0;
- }
+ if (diff)
+ tcp_adjust_pcount(sk, skb, diff);
}
/* Link BUFF into the send queue. */
- skb_header_release(buff);
- __skb_append(skb, buff, &sk->sk_write_queue);
+ __skb_header_release(buff);
+ tcp_insert_write_queue_after(skb, buff, sk, tcp_queue);
+ if (tcp_queue == TCP_FRAG_IN_RTX_QUEUE)
+ list_add(&buff->tcp_tsorted_anchor, &skb->tcp_tsorted_anchor);
return 0;
}
-/* This is similar to __pskb_pull_head() (it will go to core/skbuff.c
- * eventually). The difference is that pulled data not copied, but
- * immediately discarded.
+/* This is similar to __pskb_pull_tail(). The difference is that pulled
+ * data is not copied, but immediately discarded.
*/
-static void __pskb_trim_head(struct sk_buff *skb, int len)
+static int __pskb_trim_head(struct sk_buff *skb, int len)
{
+ struct skb_shared_info *shinfo;
int i, k, eat;
+ DEBUG_NET_WARN_ON_ONCE(skb_headlen(skb));
eat = len;
k = 0;
- for (i=0; i<skb_shinfo(skb)->nr_frags; i++) {
- if (skb_shinfo(skb)->frags[i].size <= eat) {
- put_page(skb_shinfo(skb)->frags[i].page);
- eat -= skb_shinfo(skb)->frags[i].size;
+ shinfo = skb_shinfo(skb);
+ for (i = 0; i < shinfo->nr_frags; i++) {
+ int size = skb_frag_size(&shinfo->frags[i]);
+
+ if (size <= eat) {
+ skb_frag_unref(skb, i);
+ eat -= size;
} else {
- skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
+ shinfo->frags[k] = shinfo->frags[i];
if (eat) {
- skb_shinfo(skb)->frags[k].page_offset += eat;
- skb_shinfo(skb)->frags[k].size -= eat;
+ skb_frag_off_add(&shinfo->frags[k], eat);
+ skb_frag_size_sub(&shinfo->frags[k], eat);
eat = 0;
}
k++;
}
}
- skb_shinfo(skb)->nr_frags = k;
+ shinfo->nr_frags = k;
- skb->tail = skb->data;
skb->data_len -= len;
skb->len = skb->data_len;
+ return len;
}
+/* Remove acked data from a packet in the transmit queue. */
int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
{
- if (skb_cloned(skb) &&
- pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
+ u32 delta_truesize;
+
+ if (skb_unclone_keeptruesize(skb, GFP_ATOMIC))
return -ENOMEM;
- /* If len == headlen, we avoid __skb_pull to preserve alignment. */
- if (unlikely(len < skb_headlen(skb)))
- __skb_pull(skb, len);
- else
- __pskb_trim_head(skb, len - skb_headlen(skb));
+ delta_truesize = __pskb_trim_head(skb, len);
TCP_SKB_CB(skb)->seq += len;
- skb->ip_summed = CHECKSUM_PARTIAL;
- skb->truesize -= len;
- sk->sk_wmem_queued -= len;
- sk->sk_forward_alloc += len;
- sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
+ skb->truesize -= delta_truesize;
+ sk_wmem_queued_add(sk, -delta_truesize);
+ if (!skb_zcopy_pure(skb))
+ sk_mem_uncharge(sk, delta_truesize);
- /* Any change of skb->len requires recalculation of tso
- * factor and mss.
- */
+ /* Any change of skb->len requires recalculation of tso factor. */
if (tcp_skb_pcount(skb) > 1)
- tcp_set_skb_tso_segs(sk, skb, tcp_current_mss(sk, 1));
+ tcp_set_skb_tso_segs(skb, tcp_skb_mss(skb));
return 0;
}
-/* Not accounting for SACKs here. */
-int tcp_mtu_to_mss(struct sock *sk, int pmtu)
+/* Calculate MSS not accounting any TCP options. */
+static inline int __tcp_mtu_to_mss(struct sock *sk, int pmtu)
{
- struct tcp_sock *tp = tcp_sk(sk);
- struct inet_connection_sock *icsk = inet_csk(sk);
+ const struct tcp_sock *tp = tcp_sk(sk);
+ const struct inet_connection_sock *icsk = inet_csk(sk);
int mss_now;
/* Calculate base mss without TCP options:
@@ -789,40 +1927,47 @@ int tcp_mtu_to_mss(struct sock *sk, int pmtu)
mss_now -= icsk->icsk_ext_hdr_len;
/* Then reserve room for full set of TCP options and 8 bytes of data */
- if (mss_now < 48)
- mss_now = 48;
-
- /* Now subtract TCP options size, not including SACKs */
- mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);
-
+ mss_now = max(mss_now,
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_snd_mss));
return mss_now;
}
+/* Calculate MSS. Not accounting for SACKs here. */
+int tcp_mtu_to_mss(struct sock *sk, int pmtu)
+{
+ /* Subtract TCP options size, not including SACKs */
+ return __tcp_mtu_to_mss(sk, pmtu) -
+ (tcp_sk(sk)->tcp_header_len - sizeof(struct tcphdr));
+}
+EXPORT_IPV6_MOD(tcp_mtu_to_mss);
+
/* Inverse of above */
int tcp_mss_to_mtu(struct sock *sk, int mss)
{
- struct tcp_sock *tp = tcp_sk(sk);
- struct inet_connection_sock *icsk = inet_csk(sk);
- int mtu;
+ const struct tcp_sock *tp = tcp_sk(sk);
+ const struct inet_connection_sock *icsk = inet_csk(sk);
- mtu = mss +
+ return mss +
tp->tcp_header_len +
icsk->icsk_ext_hdr_len +
icsk->icsk_af_ops->net_header_len;
-
- return mtu;
}
+EXPORT_SYMBOL(tcp_mss_to_mtu);
+/* MTU probing init per socket */
void tcp_mtup_init(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
+ struct net *net = sock_net(sk);
- icsk->icsk_mtup.enabled = sysctl_tcp_mtu_probing > 1;
+ icsk->icsk_mtup.enabled = READ_ONCE(net->ipv4.sysctl_tcp_mtu_probing) > 1;
icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) +
- icsk->icsk_af_ops->net_header_len;
- icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, sysctl_tcp_base_mss);
+ icsk->icsk_af_ops->net_header_len;
+ icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, READ_ONCE(net->ipv4.sysctl_tcp_base_mss));
icsk->icsk_mtup.probe_size = 0;
+ if (icsk->icsk_mtup.enabled)
+ icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
}
/* This function synchronize snd mss to current pmtu/exthdr set.
@@ -847,7 +1992,6 @@ void tcp_mtup_init(struct sock *sk)
NOTE2. inet_csk(sk)->icsk_pmtu_cookie and tp->mss_cache
are READ ONLY outside this function. --ANK (980731)
*/
-
unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -858,10 +2002,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
icsk->icsk_mtup.search_high = pmtu;
mss_now = tcp_mtu_to_mss(sk, pmtu);
-
- /* Bound mss with half of window */
- if (tp->max_window && mss_now > (tp->max_window>>1))
- mss_now = max((tp->max_window>>1), 68U - tp->tcp_header_len);
+ mss_now = tcp_bound_to_half_wnd(tp, mss_now);
/* And store cached results */
icsk->icsk_pmtu_cookie = pmtu;
@@ -871,156 +2012,268 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
return mss_now;
}
+EXPORT_IPV6_MOD(tcp_sync_mss);
/* Compute the current effective MSS, taking SACKs and IP options,
* and even PMTU discovery events into account.
- *
- * LARGESEND note: !urg_mode is overkill, only frames up to snd_up
- * cannot be large. However, taking into account rare use of URG, this
- * is not a big flaw.
*/
-unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
+unsigned int tcp_current_mss(struct sock *sk)
{
- struct tcp_sock *tp = tcp_sk(sk);
- struct dst_entry *dst = __sk_dst_get(sk);
+ const struct tcp_sock *tp = tcp_sk(sk);
+ const struct dst_entry *dst = __sk_dst_get(sk);
u32 mss_now;
- u16 xmit_size_goal;
- int doing_tso = 0;
+ unsigned int header_len;
+ struct tcp_out_options opts;
+ struct tcp_key key;
mss_now = tp->mss_cache;
- if (large_allowed && sk_can_gso(sk) && !tp->urg_mode)
- doing_tso = 1;
-
if (dst) {
u32 mtu = dst_mtu(dst);
if (mtu != inet_csk(sk)->icsk_pmtu_cookie)
mss_now = tcp_sync_mss(sk, mtu);
}
-
- if (tp->rx_opt.eff_sacks)
- mss_now -= (TCPOLEN_SACK_BASE_ALIGNED +
- (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK));
-
-#ifdef CONFIG_TCP_MD5SIG
- if (tp->af_specific->md5_lookup(sk, sk))
- mss_now -= TCPOLEN_MD5SIG_ALIGNED;
-#endif
-
- xmit_size_goal = mss_now;
-
- if (doing_tso) {
- xmit_size_goal = (65535 -
- inet_csk(sk)->icsk_af_ops->net_header_len -
- inet_csk(sk)->icsk_ext_hdr_len -
- tp->tcp_header_len);
-
- if (tp->max_window &&
- (xmit_size_goal > (tp->max_window >> 1)))
- xmit_size_goal = max((tp->max_window >> 1),
- 68U - tp->tcp_header_len);
-
- xmit_size_goal -= (xmit_size_goal % mss_now);
+ tcp_get_current_key(sk, &key);
+ header_len = tcp_established_options(sk, NULL, &opts, &key) +
+ sizeof(struct tcphdr);
+ /* The mss_cache is sized based on tp->tcp_header_len, which assumes
+ * some common options. If this is an odd packet (because we have SACK
+ * blocks etc) then our calculated header_len will be different, and
+ * we have to adjust mss_now correspondingly */
+ if (header_len != tp->tcp_header_len) {
+ int delta = (int) header_len - tp->tcp_header_len;
+ mss_now -= delta;
}
- tp->xmit_size_goal = xmit_size_goal;
return mss_now;
}
-/* Congestion window validation. (RFC2861) */
+/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto.
+ * As additional protections, we do not touch cwnd in retransmission phases,
+ * and if application hit its sndbuf limit recently.
+ */
+static void tcp_cwnd_application_limited(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
-static void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp)
+ if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
+ sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
+ /* Limited by application or receiver window. */
+ u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk));
+ u32 win_used = max(tp->snd_cwnd_used, init_win);
+ if (win_used < tcp_snd_cwnd(tp)) {
+ tp->snd_ssthresh = tcp_current_ssthresh(sk);
+ tcp_snd_cwnd_set(tp, (tcp_snd_cwnd(tp) + win_used) >> 1);
+ }
+ tp->snd_cwnd_used = 0;
+ }
+ tp->snd_cwnd_stamp = tcp_jiffies32;
+}
+
+static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
{
- __u32 packets_out = tp->packets_out;
+ const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* Track the strongest available signal of the degree to which the cwnd
+ * is fully utilized. If cwnd-limited then remember that fact for the
+ * current window. If not cwnd-limited then track the maximum number of
+ * outstanding packets in the current window. (If cwnd-limited then we
+ * chose to not update tp->max_packets_out to avoid an extra else
+ * clause with no functional impact.)
+ */
+ if (!before(tp->snd_una, tp->cwnd_usage_seq) ||
+ is_cwnd_limited ||
+ (!tp->is_cwnd_limited &&
+ tp->packets_out > tp->max_packets_out)) {
+ tp->is_cwnd_limited = is_cwnd_limited;
+ tp->max_packets_out = tp->packets_out;
+ tp->cwnd_usage_seq = tp->snd_nxt;
+ }
- if (packets_out >= tp->snd_cwnd) {
+ if (tcp_is_cwnd_limited(sk)) {
/* Network is feed fully. */
tp->snd_cwnd_used = 0;
- tp->snd_cwnd_stamp = tcp_time_stamp;
+ tp->snd_cwnd_stamp = tcp_jiffies32;
} else {
/* Network starves. */
if (tp->packets_out > tp->snd_cwnd_used)
tp->snd_cwnd_used = tp->packets_out;
- if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto)
+ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle) &&
+ (s32)(tcp_jiffies32 - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto &&
+ !ca_ops->cong_control)
tcp_cwnd_application_limited(sk);
+
+ /* The following conditions together indicate the starvation
+ * is caused by insufficient sender buffer:
+ * 1) just sent some data (see tcp_write_xmit)
+ * 2) not cwnd limited (this else condition)
+ * 3) no more data to send (tcp_write_queue_empty())
+ * 4) application is hitting buffer limit (SOCK_NOSPACE)
+ */
+ if (tcp_write_queue_empty(sk) && sk->sk_socket &&
+ test_bit(SOCK_NOSPACE, &sk->sk_socket->flags) &&
+ (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
+ tcp_chrono_start(sk, TCP_CHRONO_SNDBUF_LIMITED);
}
}
-static unsigned int tcp_window_allows(struct tcp_sock *tp, struct sk_buff *skb, unsigned int mss_now, unsigned int cwnd)
+/* Minshall's variant of the Nagle send check. */
+static bool tcp_minshall_check(const struct tcp_sock *tp)
{
- u32 window, cwnd_len;
+ return after(tp->snd_sml, tp->snd_una) &&
+ !after(tp->snd_sml, tp->snd_nxt);
+}
- window = (tp->snd_una + tp->snd_wnd - TCP_SKB_CB(skb)->seq);
- cwnd_len = mss_now * cwnd;
- return min(window, cwnd_len);
+/* Update snd_sml if this skb is under mss
+ * Note that a TSO packet might end with a sub-mss segment
+ * The test is really :
+ * if ((skb->len % mss) != 0)
+ * tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
+ * But we can avoid doing the divide again given we already have
+ * skb_pcount = skb->len / mss_now
+ */
+static void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now,
+ const struct sk_buff *skb)
+{
+ if (skb->len < tcp_skb_pcount(skb) * mss_now)
+ tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
}
-/* Can at least one segment of SKB be sent right now, according to the
- * congestion window rules? If so, return how many segments are allowed.
+/* Return false, if packet can be sent now without violation Nagle's rules:
+ * 1. It is full sized. (provided by caller in %partial bool)
+ * 2. Or it contains FIN. (already checked by caller)
+ * 3. Or TCP_CORK is not set, and TCP_NODELAY is set.
+ * 4. Or TCP_CORK is not set, and all sent packets are ACKed.
+ * With Minshall's modification: all sent small packets are ACKed.
*/
-static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *skb)
+static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp,
+ int nonagle)
{
- u32 in_flight, cwnd;
+ return partial &&
+ ((nonagle & TCP_NAGLE_CORK) ||
+ (!nonagle && tp->packets_out && tcp_minshall_check(tp)));
+}
- /* Don't be strict about the congestion window for the final FIN. */
- if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
- return 1;
+/* Return how many segs we'd like on a TSO packet,
+ * depending on current pacing rate, and how close the peer is.
+ *
+ * Rationale is:
+ * - For close peers, we rather send bigger packets to reduce
+ * cpu costs, because occasional losses will be repaired fast.
+ * - For long distance/rtt flows, we would like to get ACK clocking
+ * with 1 ACK per ms.
+ *
+ * Use min_rtt to help adapt TSO burst size, with smaller min_rtt resulting
+ * in bigger TSO bursts. We we cut the RTT-based allowance in half
+ * for every 2^9 usec (aka 512 us) of RTT, so that the RTT-based allowance
+ * is below 1500 bytes after 6 * ~500 usec = 3ms.
+ */
+static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
+ int min_tso_segs)
+{
+ unsigned long bytes;
+ u32 r;
- in_flight = tcp_packets_in_flight(tp);
- cwnd = tp->snd_cwnd;
- if (in_flight < cwnd)
- return (cwnd - in_flight);
+ bytes = READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift);
- return 0;
+ r = tcp_min_rtt(tcp_sk(sk)) >> READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_rtt_log);
+ if (r < BITS_PER_TYPE(sk->sk_gso_max_size))
+ bytes += sk->sk_gso_max_size >> r;
+
+ bytes = min_t(unsigned long, bytes, sk->sk_gso_max_size);
+
+ return max_t(u32, bytes / mss_now, min_tso_segs);
}
-/* This must be invoked the first time we consider transmitting
- * SKB onto the wire.
+/* Return the number of segments we want in the skb we are transmitting.
+ * See if congestion control module wants to decide; otherwise, autosize.
*/
-static int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now)
+static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now)
{
- int tso_segs = tcp_skb_pcount(skb);
+ const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
+ u32 min_tso, tso_segs;
- if (!tso_segs ||
- (tso_segs > 1 &&
- tcp_skb_mss(skb) != mss_now)) {
- tcp_set_skb_tso_segs(sk, skb, mss_now);
- tso_segs = tcp_skb_pcount(skb);
- }
- return tso_segs;
+ min_tso = ca_ops->min_tso_segs ?
+ ca_ops->min_tso_segs(sk) :
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs);
+
+ tso_segs = tcp_tso_autosize(sk, mss_now, min_tso);
+ return min_t(u32, tso_segs, sk->sk_gso_max_segs);
}
-static inline int tcp_minshall_check(const struct tcp_sock *tp)
+/* Returns the portion of skb which can be sent right away */
+static unsigned int tcp_mss_split_point(const struct sock *sk,
+ const struct sk_buff *skb,
+ unsigned int mss_now,
+ unsigned int max_segs,
+ int nonagle)
{
- return after(tp->snd_sml,tp->snd_una) &&
- !after(tp->snd_sml, tp->snd_nxt);
+ const struct tcp_sock *tp = tcp_sk(sk);
+ u32 partial, needed, window, max_len;
+
+ window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
+ max_len = mss_now * max_segs;
+
+ if (likely(max_len <= window && skb != tcp_write_queue_tail(sk)))
+ return max_len;
+
+ needed = min(skb->len, window);
+
+ if (max_len <= needed)
+ return max_len;
+
+ partial = needed % mss_now;
+ /* If last segment is not a full MSS, check if Nagle rules allow us
+ * to include this last segment in this skb.
+ * Otherwise, we'll split the skb at last MSS boundary
+ */
+ if (tcp_nagle_check(partial != 0, tp, nonagle))
+ return needed - partial;
+
+ return needed;
}
-/* Return 0, if packet can be sent now without violation Nagle's rules:
- * 1. It is full sized.
- * 2. Or it contains FIN. (already checked by caller)
- * 3. Or TCP_NODELAY was set.
- * 4. Or TCP_CORK is not set, and all sent packets are ACKed.
- * With Minshall's modification: all sent small packets are ACKed.
+/* Can at least one segment of SKB be sent right now, according to the
+ * congestion window rules? If so, return how many segments are allowed.
*/
+static u32 tcp_cwnd_test(const struct tcp_sock *tp)
+{
+ u32 in_flight, cwnd, halfcwnd;
-static inline int tcp_nagle_check(const struct tcp_sock *tp,
- const struct sk_buff *skb,
- unsigned mss_now, int nonagle)
+ in_flight = tcp_packets_in_flight(tp);
+ cwnd = tcp_snd_cwnd(tp);
+ if (in_flight >= cwnd)
+ return 0;
+
+ /* For better scheduling, ensure we have at least
+ * 2 GSO packets in flight.
+ */
+ halfcwnd = max(cwnd >> 1, 1U);
+ return min(halfcwnd, cwnd - in_flight);
+}
+
+/* Initialize TSO state of a skb.
+ * This must be invoked the first time we consider transmitting
+ * SKB onto the wire.
+ */
+static int tcp_init_tso_segs(struct sk_buff *skb, unsigned int mss_now)
{
- return (skb->len < mss_now &&
- ((nonagle&TCP_NAGLE_CORK) ||
- (!nonagle &&
- tp->packets_out &&
- tcp_minshall_check(tp))));
+ int tso_segs = tcp_skb_pcount(skb);
+
+ if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now))
+ return tcp_set_skb_tso_segs(skb, mss_now);
+
+ return tso_segs;
}
-/* Return non-zero if the Nagle test allows this packet to be
+
+/* Return true if the Nagle test allows this packet to be
* sent now.
*/
-static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb,
- unsigned int cur_mss, int nonagle)
+static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb,
+ unsigned int cur_mss, int nonagle)
{
/* Nagle rule does not apply to frames, which sit in the middle of the
* write_queue (they have no chances to get new data).
@@ -1029,68 +2282,29 @@ static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb,
* argument based upon the location of SKB in the send queue.
*/
if (nonagle & TCP_NAGLE_PUSH)
- return 1;
+ return true;
- /* Don't use the nagle rule for urgent data (or for the final FIN). */
- if (tp->urg_mode ||
- (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN))
- return 1;
+ /* Don't use the nagle rule for urgent data (or for the final FIN). */
+ if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
+ return true;
- if (!tcp_nagle_check(tp, skb, cur_mss, nonagle))
- return 1;
+ if (!tcp_nagle_check(skb->len < cur_mss, tp, nonagle))
+ return true;
- return 0;
+ return false;
}
/* Does at least the first segment of SKB fit into the send window? */
-static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb, unsigned int cur_mss)
+static bool tcp_snd_wnd_test(const struct tcp_sock *tp,
+ const struct sk_buff *skb,
+ unsigned int cur_mss)
{
u32 end_seq = TCP_SKB_CB(skb)->end_seq;
if (skb->len > cur_mss)
end_seq = TCP_SKB_CB(skb)->seq + cur_mss;
- return !after(end_seq, tp->snd_una + tp->snd_wnd);
-}
-
-/* This checks if the data bearing packet SKB (usually sk->sk_send_head)
- * should be put on the wire right now. If so, it returns the number of
- * packets allowed by the congestion window.
- */
-static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb,
- unsigned int cur_mss, int nonagle)
-{
- struct tcp_sock *tp = tcp_sk(sk);
- unsigned int cwnd_quota;
-
- tcp_init_tso_segs(sk, skb, cur_mss);
-
- if (!tcp_nagle_test(tp, skb, cur_mss, nonagle))
- return 0;
-
- cwnd_quota = tcp_cwnd_test(tp, skb);
- if (cwnd_quota &&
- !tcp_snd_wnd_test(tp, skb, cur_mss))
- cwnd_quota = 0;
-
- return cwnd_quota;
-}
-
-static inline int tcp_skb_is_last(const struct sock *sk,
- const struct sk_buff *skb)
-{
- return skb->next == (struct sk_buff *)&sk->sk_write_queue;
-}
-
-int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp)
-{
- struct sk_buff *skb = sk->sk_send_head;
-
- return (skb &&
- tcp_snd_test(sk, skb, tcp_current_mss(sk, 1),
- (tcp_skb_is_last(sk, skb) ?
- TCP_NAGLE_PUSH :
- tp->nonagle)));
+ return !after(end_seq, tcp_wnd_end(tp));
}
/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet
@@ -1100,21 +2314,24 @@ int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp)
* know that all the data is in scatter-gather pages, and that the
* packet has never been sent out before (and thus is not cloned).
*/
-static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, unsigned int mss_now)
+static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
+ unsigned int mss_now, gfp_t gfp)
{
- struct sk_buff *buff;
int nlen = skb->len - len;
+ struct sk_buff *buff;
u16 flags;
/* All of a TSO frame must be composed of paged data. */
- if (skb->len != skb->data_len)
- return tcp_fragment(sk, skb, len, mss_now);
+ DEBUG_NET_WARN_ON_ONCE(skb->len != skb->data_len);
- buff = sk_stream_alloc_pskb(sk, 0, 0, GFP_ATOMIC);
- if (unlikely(buff == NULL))
+ buff = tcp_stream_alloc_skb(sk, gfp, true);
+ if (unlikely(!buff))
return -ENOMEM;
+ skb_copy_decrypted(buff, skb);
+ mptcp_skb_ext_copy(buff, skb);
- sk_charge_skb(sk, buff);
+ sk_wmem_queued_add(sk, buff->truesize);
+ sk_mem_charge(sk, buff->truesize);
buff->truesize += nlen;
skb->truesize -= nlen;
@@ -1124,23 +2341,22 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
/* PSH and FIN should only be set in the second packet. */
- flags = TCP_SKB_CB(skb)->flags;
- TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
- TCP_SKB_CB(buff)->flags = flags;
+ flags = TCP_SKB_CB(skb)->tcp_flags;
+ TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
+ TCP_SKB_CB(buff)->tcp_flags = flags;
- /* This packet was never sent out yet, so no SACK bits. */
- TCP_SKB_CB(buff)->sacked = 0;
+ tcp_skb_fragment_eor(skb, buff);
- buff->ip_summed = skb->ip_summed = CHECKSUM_PARTIAL;
skb_split(skb, buff, len);
+ tcp_fragment_tstamp(skb, buff);
/* Fix up tso_factor for both original and new SKB. */
- tcp_set_skb_tso_segs(sk, skb, mss_now);
- tcp_set_skb_tso_segs(sk, buff, mss_now);
+ tcp_set_skb_tso_segs(skb, mss_now);
+ tcp_set_skb_tso_segs(buff, mss_now);
/* Link BUFF into the send queue. */
- skb_header_release(buff);
- __skb_append(skb, buff, &sk->sk_write_queue);
+ __skb_header_release(buff);
+ tcp_insert_write_queue_after(skb, buff, sk, TCP_FRAG_IN_WRITE_QUEUE);
return 0;
}
@@ -1150,44 +2366,59 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
*
* This algorithm is from John Heffner.
*/
-static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb)
+static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
+ bool *is_cwnd_limited,
+ bool *is_rwnd_limited,
+ u32 max_segs)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
- u32 send_win, cong_win, limit, in_flight;
-
- if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
- goto send_now;
+ u32 send_win, cong_win, limit, in_flight, threshold;
+ u64 srtt_in_ns, expected_ack, how_far_is_the_ack;
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *head;
+ int win_divisor;
+ s64 delta;
- if (icsk->icsk_ca_state != TCP_CA_Open)
+ if (icsk->icsk_ca_state >= TCP_CA_Recovery)
goto send_now;
- /* Defer for less than two clock ticks. */
- if (!tp->tso_deferred && ((jiffies<<1)>>1) - (tp->tso_deferred>>1) > 1)
+ /* Avoid bursty behavior by allowing defer
+ * only if the last write was recent (1 ms).
+ * Note that tp->tcp_wstamp_ns can be in the future if we have
+ * packets waiting in a qdisc or device for EDT delivery.
+ */
+ delta = tp->tcp_clock_cache - tp->tcp_wstamp_ns - NSEC_PER_MSEC;
+ if (delta > 0)
goto send_now;
in_flight = tcp_packets_in_flight(tp);
- BUG_ON(tcp_skb_pcount(skb) <= 1 ||
- (tp->snd_cwnd <= in_flight));
+ BUG_ON(tcp_skb_pcount(skb) <= 1);
+ BUG_ON(tcp_snd_cwnd(tp) <= in_flight);
- send_win = (tp->snd_una + tp->snd_wnd) - TCP_SKB_CB(skb)->seq;
+ send_win = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
/* From in_flight test above, we know that cwnd > in_flight. */
- cong_win = (tp->snd_cwnd - in_flight) * tp->mss_cache;
+ cong_win = (tcp_snd_cwnd(tp) - in_flight) * tp->mss_cache;
limit = min(send_win, cong_win);
/* If a full-sized TSO skb can be sent, do it. */
- if (limit >= 65536)
+ if (limit >= max_segs * tp->mss_cache)
+ goto send_now;
+
+ /* Middle in queue won't get any more data, full sendable already? */
+ if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len))
goto send_now;
- if (sysctl_tcp_tso_win_divisor) {
- u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
+ win_divisor = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_win_divisor);
+ if (win_divisor) {
+ u32 chunk = min(tp->snd_wnd, tcp_snd_cwnd(tp) * tp->mss_cache);
/* If at least some fraction of a window is available,
* just use it.
*/
- chunk /= sysctl_tcp_tso_win_divisor;
+ chunk /= win_divisor;
if (limit >= chunk)
goto send_now;
} else {
@@ -1196,141 +2427,296 @@ static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_
* frame, so if we have space for more than 3 frames
* then send now.
*/
- if (limit > tcp_max_burst(tp) * tp->mss_cache)
+ if (limit > tcp_max_tso_deferred_mss(tp) * tp->mss_cache)
goto send_now;
}
- /* Ok, it looks like it is advisable to defer. */
- tp->tso_deferred = 1 | (jiffies<<1);
+ /* TODO : use tsorted_sent_queue ? */
+ head = tcp_rtx_queue_head(sk);
+ if (!head)
+ goto send_now;
+
+ srtt_in_ns = (u64)(NSEC_PER_USEC >> 3) * tp->srtt_us;
+ /* When is the ACK expected ? */
+ expected_ack = head->tstamp + srtt_in_ns;
+ /* How far from now is the ACK expected ? */
+ how_far_is_the_ack = expected_ack - tp->tcp_clock_cache;
+
+ /* If next ACK is likely to come too late,
+ * ie in more than min(1ms, half srtt), do not defer.
+ */
+ threshold = min(srtt_in_ns >> 1, NSEC_PER_MSEC);
+
+ if ((s64)(how_far_is_the_ack - threshold) > 0)
+ goto send_now;
+
+ /* Ok, it looks like it is advisable to defer.
+ * Three cases are tracked :
+ * 1) We are cwnd-limited
+ * 2) We are rwnd-limited
+ * 3) We are application limited.
+ */
+ if (cong_win < send_win) {
+ if (cong_win <= skb->len) {
+ *is_cwnd_limited = true;
+ return true;
+ }
+ } else {
+ if (send_win <= skb->len) {
+ *is_rwnd_limited = true;
+ return true;
+ }
+ }
+
+ /* If this packet won't get more data, do not wait. */
+ if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) ||
+ TCP_SKB_CB(skb)->eor)
+ goto send_now;
- return 1;
+ return true;
send_now:
- tp->tso_deferred = 0;
+ return false;
+}
+
+static inline void tcp_mtu_check_reprobe(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct net *net = sock_net(sk);
+ u32 interval;
+ s32 delta;
+
+ interval = READ_ONCE(net->ipv4.sysctl_tcp_probe_interval);
+ delta = tcp_jiffies32 - icsk->icsk_mtup.probe_timestamp;
+ if (unlikely(delta >= interval * HZ)) {
+ int mss = tcp_current_mss(sk);
+
+ /* Update current search range */
+ icsk->icsk_mtup.probe_size = 0;
+ icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp +
+ sizeof(struct tcphdr) +
+ icsk->icsk_af_ops->net_header_len;
+ icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
+
+ /* Update probe time stamp */
+ icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
+ }
+}
+
+static bool tcp_can_coalesce_send_queue_head(struct sock *sk, int len)
+{
+ struct sk_buff *skb, *next;
+
+ skb = tcp_send_head(sk);
+ tcp_for_write_queue_from_safe(skb, next, sk) {
+ if (len <= skb->len)
+ break;
+
+ if (tcp_has_tx_tstamp(skb) || !tcp_skb_can_collapse(skb, next))
+ return false;
+
+ len -= skb->len;
+ }
+
+ return true;
+}
+
+static int tcp_clone_payload(struct sock *sk, struct sk_buff *to,
+ int probe_size)
+{
+ skb_frag_t *lastfrag = NULL, *fragto = skb_shinfo(to)->frags;
+ int i, todo, len = 0, nr_frags = 0;
+ const struct sk_buff *skb;
+
+ if (!sk_wmem_schedule(sk, to->truesize + probe_size))
+ return -ENOMEM;
+
+ skb_queue_walk(&sk->sk_write_queue, skb) {
+ const skb_frag_t *fragfrom = skb_shinfo(skb)->frags;
+
+ if (skb_headlen(skb))
+ return -EINVAL;
+
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++, fragfrom++) {
+ if (len >= probe_size)
+ goto commit;
+ todo = min_t(int, skb_frag_size(fragfrom),
+ probe_size - len);
+ len += todo;
+ if (lastfrag &&
+ skb_frag_page(fragfrom) == skb_frag_page(lastfrag) &&
+ skb_frag_off(fragfrom) == skb_frag_off(lastfrag) +
+ skb_frag_size(lastfrag)) {
+ skb_frag_size_add(lastfrag, todo);
+ continue;
+ }
+ if (unlikely(nr_frags == MAX_SKB_FRAGS))
+ return -E2BIG;
+ skb_frag_page_copy(fragto, fragfrom);
+ skb_frag_off_copy(fragto, fragfrom);
+ skb_frag_size_set(fragto, todo);
+ nr_frags++;
+ lastfrag = fragto++;
+ }
+ }
+commit:
+ WARN_ON_ONCE(len != probe_size);
+ for (i = 0; i < nr_frags; i++)
+ skb_frag_ref(to, i);
+
+ skb_shinfo(to)->nr_frags = nr_frags;
+ to->truesize += probe_size;
+ to->len += probe_size;
+ to->data_len += probe_size;
+ __skb_header_release(to);
return 0;
}
+/* tcp_mtu_probe() and tcp_grow_skb() can both eat an skb (src) if
+ * all its payload was moved to another one (dst).
+ * Make sure to transfer tcp_flags, eor, and tstamp.
+ */
+static void tcp_eat_one_skb(struct sock *sk,
+ struct sk_buff *dst,
+ struct sk_buff *src)
+{
+ TCP_SKB_CB(dst)->tcp_flags |= TCP_SKB_CB(src)->tcp_flags;
+ TCP_SKB_CB(dst)->eor = TCP_SKB_CB(src)->eor;
+ tcp_skb_collapse_tstamp(dst, src);
+ tcp_unlink_write_queue(src, sk);
+ tcp_wmem_free_skb(sk, src);
+}
+
/* Create a new MTU probe if we are ready.
+ * MTU probe is regularly attempting to increase the path MTU by
+ * deliberately sending larger packets. This discovers routing
+ * changes resulting in larger path MTUs.
+ *
* Returns 0 if we should wait to probe (no cwnd available),
* 1 if a probe was sent,
- * -1 otherwise */
+ * -1 otherwise
+ */
static int tcp_mtu_probe(struct sock *sk)
{
- struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb, *nskb, *next;
- int len;
+ struct net *net = sock_net(sk);
int probe_size;
- unsigned int pif;
- int copy;
+ int size_needed;
+ int copy, len;
int mss_now;
+ int interval;
/* Not currently probing/verifying,
* not in recovery,
* have enough cwnd, and
- * not SACKing (the variable headers throw things off) */
- if (!icsk->icsk_mtup.enabled ||
- icsk->icsk_mtup.probe_size ||
- inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
- tp->snd_cwnd < 11 ||
- tp->rx_opt.eff_sacks)
+ * not SACKing (the variable headers throw things off)
+ */
+ if (likely(!icsk->icsk_mtup.enabled ||
+ icsk->icsk_mtup.probe_size ||
+ inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
+ tcp_snd_cwnd(tp) < 11 ||
+ tp->rx_opt.num_sacks || tp->rx_opt.dsack))
return -1;
- /* Very simple search strategy: just double the MSS. */
- mss_now = tcp_current_mss(sk, 0);
- probe_size = 2*tp->mss_cache;
- if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high)) {
- /* TODO: set timer for probe_converge_event */
+ /* Use binary search for probe_size between tcp_mss_base,
+ * and current mss_clamp. if (search_high - search_low)
+ * smaller than a threshold, backoff from probing.
+ */
+ mss_now = tcp_current_mss(sk);
+ probe_size = tcp_mtu_to_mss(sk, (icsk->icsk_mtup.search_high +
+ icsk->icsk_mtup.search_low) >> 1);
+ size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache;
+ interval = icsk->icsk_mtup.search_high - icsk->icsk_mtup.search_low;
+ /* When misfortune happens, we are reprobing actively,
+ * and then reprobe timer has expired. We stick with current
+ * probing process by not resetting search range to its orignal.
+ */
+ if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high) ||
+ interval < READ_ONCE(net->ipv4.sysctl_tcp_probe_threshold)) {
+ /* Check whether enough time has elaplased for
+ * another round of probing.
+ */
+ tcp_mtu_check_reprobe(sk);
return -1;
}
/* Have enough data in the send queue to probe? */
- len = 0;
- if ((skb = sk->sk_send_head) == NULL)
+ if (tp->write_seq - tp->snd_nxt < size_needed)
return -1;
- while ((len += skb->len) < probe_size && !tcp_skb_is_last(sk, skb))
- skb = skb->next;
- if (len < probe_size)
+
+ if (tp->snd_wnd < size_needed)
return -1;
+ if (after(tp->snd_nxt + size_needed, tcp_wnd_end(tp)))
+ return 0;
- /* Receive window check. */
- if (after(TCP_SKB_CB(skb)->seq + probe_size, tp->snd_una + tp->snd_wnd)) {
- if (tp->snd_wnd < probe_size)
+ /* Do we need to wait to drain cwnd? With none in flight, don't stall */
+ if (tcp_packets_in_flight(tp) + 2 > tcp_snd_cwnd(tp)) {
+ if (!tcp_packets_in_flight(tp))
return -1;
else
return 0;
}
- /* Do we need to wait to drain cwnd? */
- pif = tcp_packets_in_flight(tp);
- if (pif + 2 > tp->snd_cwnd) {
- /* With no packets in flight, don't stall. */
- if (pif == 0)
- return -1;
- else
- return 0;
- }
+ if (!tcp_can_coalesce_send_queue_head(sk, probe_size))
+ return -1;
/* We're allowed to probe. Build it now. */
- if ((nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC)) == NULL)
+ nskb = tcp_stream_alloc_skb(sk, GFP_ATOMIC, false);
+ if (!nskb)
return -1;
- sk_charge_skb(sk, nskb);
- skb = sk->sk_send_head;
- __skb_insert(nskb, skb->prev, skb, &sk->sk_write_queue);
- sk->sk_send_head = nskb;
+ /* build the payload, and be prepared to abort if this fails. */
+ if (tcp_clone_payload(sk, nskb, probe_size)) {
+ tcp_skb_tsorted_anchor_cleanup(nskb);
+ consume_skb(nskb);
+ return -1;
+ }
+ sk_wmem_queued_add(sk, nskb->truesize);
+ sk_mem_charge(sk, nskb->truesize);
+
+ skb = tcp_send_head(sk);
+ skb_copy_decrypted(nskb, skb);
+ mptcp_skb_ext_copy(nskb, skb);
TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
- TCP_SKB_CB(nskb)->flags = TCPCB_FLAG_ACK;
- TCP_SKB_CB(nskb)->sacked = 0;
- nskb->csum = 0;
- nskb->ip_summed = skb->ip_summed;
+ TCP_SKB_CB(nskb)->tcp_flags = TCPHDR_ACK;
- len = 0;
- while (len < probe_size) {
- next = skb->next;
+ tcp_insert_write_queue_before(nskb, skb, sk);
+ tcp_highest_sack_replace(sk, skb, nskb);
+ len = 0;
+ tcp_for_write_queue_from_safe(skb, next, sk) {
copy = min_t(int, skb->len, probe_size - len);
- if (nskb->ip_summed)
- skb_copy_bits(skb, 0, skb_put(nskb, copy), copy);
- else
- nskb->csum = skb_copy_and_csum_bits(skb, 0,
- skb_put(nskb, copy), copy, nskb->csum);
if (skb->len <= copy) {
- /* We've eaten all the data from this skb.
- * Throw it away. */
- TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags;
- __skb_unlink(skb, &sk->sk_write_queue);
- sk_stream_free_skb(sk, skb);
+ tcp_eat_one_skb(sk, nskb, skb);
} else {
- TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags &
- ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
- if (!skb_shinfo(skb)->nr_frags) {
- skb_pull(skb, copy);
- if (skb->ip_summed != CHECKSUM_PARTIAL)
- skb->csum = csum_partial(skb->data, skb->len, 0);
- } else {
- __pskb_trim_head(skb, copy);
- tcp_set_skb_tso_segs(sk, skb, mss_now);
- }
+ TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags &
+ ~(TCPHDR_FIN|TCPHDR_PSH);
+ __pskb_trim_head(skb, copy);
+ tcp_set_skb_tso_segs(skb, mss_now);
TCP_SKB_CB(skb)->seq += copy;
}
len += copy;
- skb = next;
+
+ if (len >= probe_size)
+ break;
}
- tcp_init_tso_segs(sk, nskb, nskb->len);
+ tcp_init_tso_segs(nskb, nskb->len);
/* We're ready to send. If this fails, the probe will
- * be resegmented into mss-sized pieces by tcp_write_xmit(). */
- TCP_SKB_CB(nskb)->when = tcp_time_stamp;
+ * be resegmented into mss-sized pieces by tcp_write_xmit().
+ */
if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) {
/* Decrement cwnd here because we are sending
- * effectively two packets. */
- tp->snd_cwnd--;
- update_send_head(sk, tp, nskb);
+ * effectively two packets. */
+ tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) - 1);
+ tcp_event_new_data_sent(sk, nskb);
icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len);
tp->mtu_probe.probe_seq_start = TCP_SKB_CB(nskb)->seq;
@@ -1342,50 +2728,243 @@ static int tcp_mtu_probe(struct sock *sk)
return -1;
}
+static bool tcp_pacing_check(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (!tcp_needs_internal_pacing(sk))
+ return false;
+
+ if (tp->tcp_wstamp_ns <= tp->tcp_clock_cache)
+ return false;
+
+ if (!hrtimer_is_queued(&tp->pacing_timer)) {
+ hrtimer_start(&tp->pacing_timer,
+ ns_to_ktime(tp->tcp_wstamp_ns),
+ HRTIMER_MODE_ABS_PINNED_SOFT);
+ sock_hold(sk);
+ }
+ return true;
+}
+
+static bool tcp_rtx_queue_empty_or_single_skb(const struct sock *sk)
+{
+ const struct rb_node *node = sk->tcp_rtx_queue.rb_node;
+
+ /* No skb in the rtx queue. */
+ if (!node)
+ return true;
+
+ /* Only one skb in rtx queue. */
+ return !node->rb_left && !node->rb_right;
+}
+
+/* TCP Small Queues :
+ * Control number of packets in qdisc/devices to two packets / or ~1 ms.
+ * (These limits are doubled for retransmits)
+ * This allows for :
+ * - better RTT estimation and ACK scheduling
+ * - faster recovery
+ * - high rates
+ * Alas, some drivers / subsystems require a fair amount
+ * of queued bytes to ensure line rate.
+ * One example is wifi aggregation (802.11 AMPDU)
+ */
+static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
+ unsigned int factor)
+{
+ unsigned long limit;
+
+ limit = max_t(unsigned long,
+ 2 * skb->truesize,
+ READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift));
+ limit = min_t(unsigned long, limit,
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes));
+ limit <<= factor;
+
+ if (static_branch_unlikely(&tcp_tx_delay_enabled) &&
+ tcp_sk(sk)->tcp_tx_delay) {
+ u64 extra_bytes = (u64)READ_ONCE(sk->sk_pacing_rate) *
+ tcp_sk(sk)->tcp_tx_delay;
+
+ /* TSQ is based on skb truesize sum (sk_wmem_alloc), so we
+ * approximate our needs assuming an ~100% skb->truesize overhead.
+ * USEC_PER_SEC is approximated by 2^20.
+ * do_div(extra_bytes, USEC_PER_SEC/2) is replaced by a right shift.
+ */
+ extra_bytes >>= (20 - 1);
+ limit += extra_bytes;
+ }
+ if (refcount_read(&sk->sk_wmem_alloc) > limit) {
+ /* Always send skb if rtx queue is empty or has one skb.
+ * No need to wait for TX completion to call us back,
+ * after softirq schedule.
+ * This helps when TX completions are delayed too much.
+ */
+ if (tcp_rtx_queue_empty_or_single_skb(sk))
+ return false;
+
+ set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
+ /* It is possible TX completion already happened
+ * before we set TSQ_THROTTLED, so we must
+ * test again the condition.
+ */
+ smp_mb__after_atomic();
+ if (refcount_read(&sk->sk_wmem_alloc) > limit)
+ return true;
+ }
+ return false;
+}
+
+static void tcp_chrono_set(struct tcp_sock *tp, const enum tcp_chrono new)
+{
+ const u32 now = tcp_jiffies32;
+ enum tcp_chrono old = tp->chrono_type;
+
+ if (old > TCP_CHRONO_UNSPEC)
+ tp->chrono_stat[old - 1] += now - tp->chrono_start;
+ tp->chrono_start = now;
+ tp->chrono_type = new;
+}
+
+void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* If there are multiple conditions worthy of tracking in a
+ * chronograph then the highest priority enum takes precedence
+ * over the other conditions. So that if something "more interesting"
+ * starts happening, stop the previous chrono and start a new one.
+ */
+ if (type > tp->chrono_type)
+ tcp_chrono_set(tp, type);
+}
+
+void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+
+ /* There are multiple conditions worthy of tracking in a
+ * chronograph, so that the highest priority enum takes
+ * precedence over the other conditions (see tcp_chrono_start).
+ * If a condition stops, we only stop chrono tracking if
+ * it's the "most interesting" or current chrono we are
+ * tracking and starts busy chrono if we have pending data.
+ */
+ if (tcp_rtx_and_write_queues_empty(sk))
+ tcp_chrono_set(tp, TCP_CHRONO_UNSPEC);
+ else if (type == tp->chrono_type)
+ tcp_chrono_set(tp, TCP_CHRONO_BUSY);
+}
+
+/* First skb in the write queue is smaller than ideal packet size.
+ * Check if we can move payload from the second skb in the queue.
+ */
+static void tcp_grow_skb(struct sock *sk, struct sk_buff *skb, int amount)
+{
+ struct sk_buff *next_skb = skb->next;
+ unsigned int nlen;
+
+ if (tcp_skb_is_last(sk, skb))
+ return;
+
+ if (!tcp_skb_can_collapse(skb, next_skb))
+ return;
+
+ nlen = min_t(u32, amount, next_skb->len);
+ if (!nlen || !skb_shift(skb, next_skb, nlen))
+ return;
+
+ TCP_SKB_CB(skb)->end_seq += nlen;
+ TCP_SKB_CB(next_skb)->seq += nlen;
+
+ if (!next_skb->len) {
+ /* In case FIN is set, we need to update end_seq */
+ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;
+
+ tcp_eat_one_skb(sk, skb, next_skb);
+ }
+}
/* This routine writes packets to the network. It advances the
* send_head. This happens as incoming acks open up the remote
* window for us.
*
- * Returns 1, if no segments are in flight and we have queued segments, but
- * cannot send anything now because of SWS or another problem.
+ * LARGESEND note: !tcp_urg_mode is overkill, only frames between
+ * snd_up-64k-mss .. snd_up cannot be large. However, taking into
+ * account rare use of URG, this is not a big flaw.
+ *
+ * Send at most one packet when push_one > 0. Temporarily ignore
+ * cwnd limit to force at most one packet out when push_one == 2.
+
+ * Returns true, if no segments are in flight and we have queued segments,
+ * but cannot send anything now because of SWS or another problem.
*/
-static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
+static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
+ int push_one, gfp_t gfp)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
unsigned int tso_segs, sent_pkts;
- int cwnd_quota;
+ u32 cwnd_quota, max_segs;
int result;
-
- /* If we are closed, the bytes will have to remain here.
- * In time closedown will finish, we empty the write queue and all
- * will be happy.
- */
- if (unlikely(sk->sk_state == TCP_CLOSE))
- return 0;
+ bool is_cwnd_limited = false, is_rwnd_limited = false;
sent_pkts = 0;
- /* Do MTU probing. */
- if ((result = tcp_mtu_probe(sk)) == 0) {
- return 0;
- } else if (result > 0) {
- sent_pkts = 1;
+ tcp_mstamp_refresh(tp);
+
+ /* AccECN option beacon depends on mstamp, it may change mss */
+ if (tcp_ecn_mode_accecn(tp) && tcp_accecn_option_beacon_check(sk))
+ mss_now = tcp_current_mss(sk);
+
+ if (!push_one) {
+ /* Do MTU probing. */
+ result = tcp_mtu_probe(sk);
+ if (!result) {
+ return false;
+ } else if (result > 0) {
+ sent_pkts = 1;
+ }
}
- while ((skb = sk->sk_send_head)) {
+ max_segs = tcp_tso_segs(sk, mss_now);
+ while ((skb = tcp_send_head(sk))) {
unsigned int limit;
+ int missing_bytes;
+
+ if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
+ /* "skb_mstamp_ns" is used as a start point for the retransmit timer */
+ tp->tcp_wstamp_ns = tp->tcp_clock_cache;
+ skb_set_delivery_time(skb, tp->tcp_wstamp_ns, SKB_CLOCK_MONOTONIC);
+ list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
+ tcp_init_tso_segs(skb, mss_now);
+ goto repair; /* Skip network transmission */
+ }
- tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
- BUG_ON(!tso_segs);
-
- cwnd_quota = tcp_cwnd_test(tp, skb);
- if (!cwnd_quota)
+ if (tcp_pacing_check(sk))
break;
- if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now)))
+ cwnd_quota = tcp_cwnd_test(tp);
+ if (!cwnd_quota) {
+ if (push_one == 2)
+ /* Force out a loss probe pkt. */
+ cwnd_quota = 1;
+ else
+ break;
+ }
+ cwnd_quota = min(cwnd_quota, max_segs);
+ missing_bytes = cwnd_quota * mss_now - skb->len;
+ if (missing_bytes > 0)
+ tcp_grow_skb(sk, skb, missing_bytes);
+
+ tso_segs = tcp_set_skb_tso_segs(skb, mss_now);
+
+ if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) {
+ is_rwnd_limited = true;
break;
+ }
if (tso_segs == 1) {
if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
@@ -1393,113 +2972,236 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
nonagle : TCP_NAGLE_PUSH))))
break;
} else {
- if (tcp_tso_should_defer(sk, tp, skb))
+ if (!push_one &&
+ tcp_tso_should_defer(sk, skb, &is_cwnd_limited,
+ &is_rwnd_limited, max_segs))
break;
}
limit = mss_now;
- if (tso_segs > 1) {
- limit = tcp_window_allows(tp, skb,
- mss_now, cwnd_quota);
-
- if (skb->len < limit) {
- unsigned int trim = skb->len % mss_now;
-
- if (trim)
- limit = skb->len - trim;
- }
- }
+ if (tso_segs > 1 && !tcp_urg_mode(tp))
+ limit = tcp_mss_split_point(sk, skb, mss_now,
+ cwnd_quota,
+ nonagle);
if (skb->len > limit &&
- unlikely(tso_fragment(sk, skb, limit, mss_now)))
+ unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
break;
- TCP_SKB_CB(skb)->when = tcp_time_stamp;
+ if (tcp_small_queue_check(sk, skb, 0))
+ break;
- if (unlikely(tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC)))
+ /* Argh, we hit an empty skb(), presumably a thread
+ * is sleeping in sendmsg()/sk_stream_wait_memory().
+ * We do not want to send a pure-ack packet and have
+ * a strange looking rtx queue with empty packet(s).
+ */
+ if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq)
break;
+ if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
+ break;
+
+repair:
/* Advance the send_head. This one is sent out.
* This call will increment packets_out.
*/
- update_send_head(sk, tp, skb);
+ tcp_event_new_data_sent(sk, skb);
tcp_minshall_update(tp, mss_now, skb);
- sent_pkts++;
+ sent_pkts += tcp_skb_pcount(skb);
+
+ if (push_one)
+ break;
}
+ if (is_rwnd_limited)
+ tcp_chrono_start(sk, TCP_CHRONO_RWND_LIMITED);
+ else
+ tcp_chrono_stop(sk, TCP_CHRONO_RWND_LIMITED);
+
+ is_cwnd_limited |= (tcp_packets_in_flight(tp) >= tcp_snd_cwnd(tp));
+ if (likely(sent_pkts || is_cwnd_limited))
+ tcp_cwnd_validate(sk, is_cwnd_limited);
+
if (likely(sent_pkts)) {
- tcp_cwnd_validate(sk, tp);
- return 0;
+ if (tcp_in_cwnd_reduction(sk))
+ tp->prr_out += sent_pkts;
+
+ /* Send one loss probe per tail loss episode. */
+ if (push_one != 2)
+ tcp_schedule_loss_probe(sk, false);
+ return false;
}
- return !tp->packets_out && sk->sk_send_head;
+ return !tp->packets_out && !tcp_write_queue_empty(sk);
}
-/* Push out any pending frames which were held back due to
- * TCP_CORK or attempt at coalescing tiny packets.
- * The socket must be locked by the caller.
- */
-void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp,
- unsigned int cur_mss, int nonagle)
+bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto)
{
- struct sk_buff *skb = sk->sk_send_head;
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 timeout, timeout_us, rto_delta_us;
+ int early_retrans;
+
+ /* Don't do any loss probe on a Fast Open connection before 3WHS
+ * finishes.
+ */
+ if (rcu_access_pointer(tp->fastopen_rsk))
+ return false;
- if (skb) {
- if (tcp_write_xmit(sk, cur_mss, nonagle))
- tcp_check_probe_timer(sk, tp);
+ early_retrans = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_early_retrans);
+ /* Schedule a loss probe in 2*RTT for SACK capable connections
+ * not in loss recovery, that are either limited by cwnd or application.
+ */
+ if ((early_retrans != 3 && early_retrans != 4) ||
+ !tp->packets_out || !tcp_is_sack(tp) ||
+ (icsk->icsk_ca_state != TCP_CA_Open &&
+ icsk->icsk_ca_state != TCP_CA_CWR))
+ return false;
+
+ /* Probe timeout is 2*rtt. Add minimum RTO to account
+ * for delayed ack when there's one outstanding packet. If no RTT
+ * sample is available then probe after TCP_TIMEOUT_INIT.
+ */
+ if (tp->srtt_us) {
+ timeout_us = tp->srtt_us >> 2;
+ if (tp->packets_out == 1)
+ timeout_us += tcp_rto_min_us(sk);
+ else
+ timeout_us += TCP_TIMEOUT_MIN_US;
+ timeout = usecs_to_jiffies(timeout_us);
+ } else {
+ timeout = TCP_TIMEOUT_INIT;
}
+
+ /* If the RTO formula yields an earlier time, then use that time. */
+ rto_delta_us = advancing_rto ?
+ jiffies_to_usecs(inet_csk(sk)->icsk_rto) :
+ tcp_rto_delta_us(sk); /* How far in future is RTO? */
+ if (rto_delta_us > 0)
+ timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us));
+
+ tcp_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout, true);
+ return true;
}
-/* Send _single_ skb sitting at the send head. This function requires
- * true push pending frames to setup probe timer etc.
+/* Thanks to skb fast clones, we can detect if a prior transmit of
+ * a packet is still in a qdisc or driver queue.
+ * In this case, there is very little point doing a retransmit !
*/
-void tcp_push_one(struct sock *sk, unsigned int mss_now)
+static bool skb_still_in_host_queue(struct sock *sk,
+ const struct sk_buff *skb)
+{
+ if (unlikely(skb_fclone_busy(sk, skb))) {
+ set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
+ smp_mb__after_atomic();
+ if (skb_fclone_busy(sk, skb)) {
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
+ return true;
+ }
+ }
+ return false;
+}
+
+/* When probe timeout (PTO) fires, try send a new segment if possible, else
+ * retransmit the last segment.
+ */
+void tcp_send_loss_probe(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
- struct sk_buff *skb = sk->sk_send_head;
- unsigned int tso_segs, cwnd_quota;
+ struct sk_buff *skb;
+ int pcount;
+ int mss = tcp_current_mss(sk);
+
+ /* At most one outstanding TLP */
+ if (tp->tlp_high_seq)
+ goto rearm_timer;
+
+ tp->tlp_retrans = 0;
+ skb = tcp_send_head(sk);
+ if (skb && tcp_snd_wnd_test(tp, skb, mss)) {
+ pcount = tp->packets_out;
+ tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
+ if (tp->packets_out > pcount)
+ goto probe_sent;
+ goto rearm_timer;
+ }
+ skb = skb_rb_last(&sk->tcp_rtx_queue);
+ if (unlikely(!skb)) {
+ tcp_warn_once(sk, tp->packets_out, "invalid inflight: ");
+ smp_store_release(&inet_csk(sk)->icsk_pending, 0);
+ return;
+ }
- BUG_ON(!skb || skb->len < mss_now);
+ if (skb_still_in_host_queue(sk, skb))
+ goto rearm_timer;
- tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
- cwnd_quota = tcp_snd_test(sk, skb, mss_now, TCP_NAGLE_PUSH);
+ pcount = tcp_skb_pcount(skb);
+ if (WARN_ON(!pcount))
+ goto rearm_timer;
- if (likely(cwnd_quota)) {
- unsigned int limit;
+ if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) {
+ if (unlikely(tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
+ (pcount - 1) * mss, mss,
+ GFP_ATOMIC)))
+ goto rearm_timer;
+ skb = skb_rb_next(skb);
+ }
- BUG_ON(!tso_segs);
+ if (WARN_ON(!skb || !tcp_skb_pcount(skb)))
+ goto rearm_timer;
- limit = mss_now;
- if (tso_segs > 1) {
- limit = tcp_window_allows(tp, skb,
- mss_now, cwnd_quota);
+ if (__tcp_retransmit_skb(sk, skb, 1))
+ goto rearm_timer;
- if (skb->len < limit) {
- unsigned int trim = skb->len % mss_now;
+ tp->tlp_retrans = 1;
- if (trim)
- limit = skb->len - trim;
- }
- }
+probe_sent:
+ /* Record snd_nxt for loss detection. */
+ tp->tlp_high_seq = tp->snd_nxt;
- if (skb->len > limit &&
- unlikely(tso_fragment(sk, skb, limit, mss_now)))
- return;
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSPROBES);
+ /* Reset s.t. tcp_rearm_rto will restart timer from now */
+ smp_store_release(&inet_csk(sk)->icsk_pending, 0);
+rearm_timer:
+ tcp_rearm_rto(sk);
+}
- /* Send it out now. */
- TCP_SKB_CB(skb)->when = tcp_time_stamp;
+/* Push out any pending frames which were held back due to
+ * TCP_CORK or attempt at coalescing tiny packets.
+ * The socket must be locked by the caller.
+ */
+void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
+ int nonagle)
+{
+ /* If we are closed, the bytes will have to remain here.
+ * In time closedown will finish, we empty the write queue and
+ * all will be happy.
+ */
+ if (unlikely(sk->sk_state == TCP_CLOSE))
+ return;
- if (likely(!tcp_transmit_skb(sk, skb, 1, sk->sk_allocation))) {
- update_send_head(sk, tp, skb);
- tcp_cwnd_validate(sk, tp);
- return;
- }
- }
+ if (tcp_write_xmit(sk, cur_mss, nonagle, 0,
+ sk_gfp_mask(sk, GFP_ATOMIC)))
+ tcp_check_probe_timer(sk);
+}
+
+/* Send _single_ skb sitting at the send head. This function requires
+ * true push pending frames to setup probe timer etc.
+ */
+void tcp_push_one(struct sock *sk, unsigned int mss_now)
+{
+ struct sk_buff *skb = tcp_send_head(sk);
+
+ BUG_ON(!skb || skb->len < mss_now);
+
+ tcp_write_xmit(sk, mss_now, TCP_NAGLE_PUSH, 1, sk->sk_allocation);
}
/* This function returns the amount that we can raise the
* usable window based on the following constraints
- *
+ *
* 1. The window can never be shrunk once it is offered (RFC 793)
* 2. We limit memory per socket
*
@@ -1518,12 +3220,12 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now)
* side SWS prevention criteria. The problem is that under this rule
* a stream of single byte packets will cause the right side of the
* window to always advance by a single byte.
- *
+ *
* Of course, if the sender implements sender side SWS prevention
* then this will not be a problem.
- *
+ *
* BSD seems to make the following compromise:
- *
+ *
* If the free space is less than the 1/4 of the maximum
* space available and the free space is less than 1/2 mss,
* then set the window to 0.
@@ -1553,6 +3255,7 @@ u32 __tcp_select_window(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
+ struct net *net = sock_net(sk);
/* MSS for the peer's data. Previous versions used mss_clamp
* here. I don't know if the value based on our guesses
* of peer's MSS is better for the performance. It's more correct
@@ -1561,19 +3264,47 @@ u32 __tcp_select_window(struct sock *sk)
*/
int mss = icsk->icsk_ack.rcv_mss;
int free_space = tcp_space(sk);
- int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk));
- int window;
+ int allowed_space = tcp_full_space(sk);
+ int full_space, window;
+
+ if (sk_is_mptcp(sk))
+ mptcp_space(sk, &free_space, &allowed_space);
+
+ full_space = min_t(int, tp->window_clamp, allowed_space);
+
+ if (unlikely(mss > full_space)) {
+ mss = full_space;
+ if (mss <= 0)
+ return 0;
+ }
+
+ /* Only allow window shrink if the sysctl is enabled and we have
+ * a non-zero scaling factor in effect.
+ */
+ if (READ_ONCE(net->ipv4.sysctl_tcp_shrink_window) && tp->rx_opt.rcv_wscale)
+ goto shrink_window_allowed;
- if (mss > full_space)
- mss = full_space;
+ /* do not allow window to shrink */
- if (free_space < full_space/2) {
+ if (free_space < (full_space >> 1)) {
icsk->icsk_ack.quick = 0;
- if (tcp_memory_pressure)
- tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U*tp->advmss);
+ if (tcp_under_memory_pressure(sk))
+ tcp_adjust_rcv_ssthresh(sk);
- if (free_space < mss)
+ /* free_space might become our new window, make sure we don't
+ * increase it due to wscale.
+ */
+ free_space = round_down(free_space, 1 << tp->rx_opt.rcv_wscale);
+
+ /* if free space is less than mss estimate, or is below 1/16th
+ * of the maximum allowed, try to move to zero-window, else
+ * tcp_clamp_window() will grow rcv buf up to tcp_rmem[2], and
+ * new incoming data is dropped due to memory limits.
+ * With large window, mss test triggers way too late in order
+ * to announce zero window in time before rmem limit kicks in.
+ */
+ if (free_space < (allowed_space >> 4) || free_space < mss)
return 0;
}
@@ -1583,7 +3314,6 @@ u32 __tcp_select_window(struct sock *sk)
/* Don't do rounding if we are using window scaling, since the
* scaled window will not line up with the MSS boundary anyway.
*/
- window = tp->rcv_wnd;
if (tp->rx_opt.rcv_wscale) {
window = free_space;
@@ -1591,10 +3321,9 @@ u32 __tcp_select_window(struct sock *sk)
* Import case: prevent zero window announcement if
* 1<<rcv_wscale > mss.
*/
- if (((window >> tp->rx_opt.rcv_wscale) << tp->rx_opt.rcv_wscale) != window)
- window = (((window >> tp->rx_opt.rcv_wscale) + 1)
- << tp->rx_opt.rcv_wscale);
+ window = ALIGN(window, (1 << tp->rx_opt.rcv_wscale));
} else {
+ window = tp->rcv_wnd;
/* Get the largest window that is a nice multiple of mss.
* Window clamp already applied above.
* If our current window offering is within 1 mss of the
@@ -1604,245 +3333,324 @@ u32 __tcp_select_window(struct sock *sk)
* is too small.
*/
if (window <= free_space - mss || window > free_space)
- window = (free_space/mss)*mss;
+ window = rounddown(free_space, mss);
+ else if (mss == full_space &&
+ free_space > window + (full_space >> 1))
+ window = free_space;
}
return window;
-}
-/* Attempt to collapse two adjacent SKB's during retransmission. */
-static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int mss_now)
-{
- struct tcp_sock *tp = tcp_sk(sk);
- struct sk_buff *next_skb = skb->next;
+shrink_window_allowed:
+ /* new window should always be an exact multiple of scaling factor */
+ free_space = round_down(free_space, 1 << tp->rx_opt.rcv_wscale);
- /* The first test we must make is that neither of these two
- * SKB's are still referenced by someone else.
- */
- if (!skb_cloned(skb) && !skb_cloned(next_skb)) {
- int skb_size = skb->len, next_skb_size = next_skb->len;
- u16 flags = TCP_SKB_CB(skb)->flags;
+ if (free_space < (full_space >> 1)) {
+ icsk->icsk_ack.quick = 0;
- /* Also punt if next skb has been SACK'd. */
- if(TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_ACKED)
- return;
+ if (tcp_under_memory_pressure(sk))
+ tcp_adjust_rcv_ssthresh(sk);
- /* Next skb is out of window. */
- if (after(TCP_SKB_CB(next_skb)->end_seq, tp->snd_una+tp->snd_wnd))
- return;
+ /* if free space is too low, return a zero window */
+ if (free_space < (allowed_space >> 4) || free_space < mss ||
+ free_space < (1 << tp->rx_opt.rcv_wscale))
+ return 0;
+ }
- /* Punt if not enough space exists in the first SKB for
- * the data in the second, or the total combined payload
- * would exceed the MSS.
+ if (free_space > tp->rcv_ssthresh) {
+ free_space = tp->rcv_ssthresh;
+ /* new window should always be an exact multiple of scaling factor
+ *
+ * For this case, we ALIGN "up" (increase free_space) because
+ * we know free_space is not zero here, it has been reduced from
+ * the memory-based limit, and rcv_ssthresh is not a hard limit
+ * (unlike sk_rcvbuf).
*/
- if ((next_skb_size > skb_tailroom(skb)) ||
- ((skb_size + next_skb_size) > mss_now))
- return;
+ free_space = ALIGN(free_space, (1 << tp->rx_opt.rcv_wscale));
+ }
- BUG_ON(tcp_skb_pcount(skb) != 1 ||
- tcp_skb_pcount(next_skb) != 1);
+ return free_space;
+}
- /* changing transmit queue under us so clear hints */
- clear_all_retrans_hints(tp);
+void tcp_skb_collapse_tstamp(struct sk_buff *skb,
+ const struct sk_buff *next_skb)
+{
+ if (unlikely(tcp_has_tx_tstamp(next_skb))) {
+ const struct skb_shared_info *next_shinfo =
+ skb_shinfo(next_skb);
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+
+ shinfo->tx_flags |= next_shinfo->tx_flags & SKBTX_ANY_TSTAMP;
+ shinfo->tskey = next_shinfo->tskey;
+ TCP_SKB_CB(skb)->txstamp_ack |=
+ TCP_SKB_CB(next_skb)->txstamp_ack;
+ }
+}
- /* Ok. We will be able to collapse the packet. */
- __skb_unlink(next_skb, &sk->sk_write_queue);
+/* Collapses two adjacent SKB's during retransmission. */
+static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *next_skb = skb_rb_next(skb);
+ int next_skb_size;
- memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size);
+ next_skb_size = next_skb->len;
- skb->ip_summed = next_skb->ip_summed;
+ BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1);
- if (skb->ip_summed != CHECKSUM_PARTIAL)
- skb->csum = csum_block_add(skb->csum, next_skb->csum, skb_size);
+ if (next_skb_size && !tcp_skb_shift(skb, next_skb, 1, next_skb_size))
+ return false;
- /* Update sequence range on original skb. */
- TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;
+ tcp_highest_sack_replace(sk, next_skb, skb);
- /* Merge over control information. */
- flags |= TCP_SKB_CB(next_skb)->flags; /* This moves PSH/FIN etc. over */
- TCP_SKB_CB(skb)->flags = flags;
+ /* Update sequence range on original skb. */
+ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;
- /* All done, get rid of second SKB and account for it so
- * packet counting does not break.
- */
- TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked&(TCPCB_EVER_RETRANS|TCPCB_AT_TAIL);
- if (TCP_SKB_CB(next_skb)->sacked&TCPCB_SACKED_RETRANS)
- tp->retrans_out -= tcp_skb_pcount(next_skb);
- if (TCP_SKB_CB(next_skb)->sacked&TCPCB_LOST) {
- tp->lost_out -= tcp_skb_pcount(next_skb);
- tp->left_out -= tcp_skb_pcount(next_skb);
- }
- /* Reno case is special. Sigh... */
- if (!tp->rx_opt.sack_ok && tp->sacked_out) {
- tcp_dec_pcount_approx(&tp->sacked_out, next_skb);
- tp->left_out -= tcp_skb_pcount(next_skb);
- }
+ /* Merge over control information. This moves PSH/FIN etc. over */
+ TCP_SKB_CB(skb)->tcp_flags |= TCP_SKB_CB(next_skb)->tcp_flags;
- /* Not quite right: it can be > snd.fack, but
- * it is better to underestimate fackets.
- */
- tcp_dec_pcount_approx(&tp->fackets_out, next_skb);
- tcp_packets_out_dec(tp, next_skb);
- sk_stream_free_skb(sk, next_skb);
- }
+ /* All done, get rid of second SKB and account for it so
+ * packet counting does not break.
+ */
+ TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked & TCPCB_EVER_RETRANS;
+ TCP_SKB_CB(skb)->eor = TCP_SKB_CB(next_skb)->eor;
+
+ /* changed transmit queue under us so clear hints */
+ if (next_skb == tp->retransmit_skb_hint)
+ tp->retransmit_skb_hint = skb;
+
+ tcp_adjust_pcount(sk, next_skb, tcp_skb_pcount(next_skb));
+
+ tcp_skb_collapse_tstamp(skb, next_skb);
+
+ tcp_rtx_queue_unlink_and_free(next_skb, sk);
+ return true;
}
-/* Do a simple retransmit without using the backoff mechanisms in
- * tcp_timer. This is used for path mtu discovery.
- * The socket is already locked here.
- */
-void tcp_simple_retransmit(struct sock *sk)
+/* Check if coalescing SKBs is legal. */
+static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb)
{
- const struct inet_connection_sock *icsk = inet_csk(sk);
- struct tcp_sock *tp = tcp_sk(sk);
- struct sk_buff *skb;
- unsigned int mss = tcp_current_mss(sk, 0);
- int lost = 0;
-
- sk_stream_for_retrans_queue(skb, sk) {
- if (skb->len > mss &&
- !(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) {
- if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) {
- TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
- tp->retrans_out -= tcp_skb_pcount(skb);
- }
- if (!(TCP_SKB_CB(skb)->sacked&TCPCB_LOST)) {
- TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
- tp->lost_out += tcp_skb_pcount(skb);
- lost = 1;
- }
- }
- }
+ if (tcp_skb_pcount(skb) > 1)
+ return false;
+ if (skb_cloned(skb))
+ return false;
+ if (!skb_frags_readable(skb))
+ return false;
+ /* Some heuristics for collapsing over SACK'd could be invented */
+ if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
+ return false;
+
+ return true;
+}
- clear_all_retrans_hints(tp);
+/* Collapse packets in the retransmit queue to make to create
+ * less packets on the wire. This is only done on retransmission.
+ */
+static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
+ int space)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb = to, *tmp;
+ bool first = true;
- if (!lost)
+ if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_retrans_collapse))
+ return;
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
return;
- tcp_sync_left_out(tp);
+ skb_rbtree_walk_from_safe(skb, tmp) {
+ if (!tcp_can_collapse(sk, skb))
+ break;
- /* Don't muck with the congestion window here.
- * Reason is that we do not increase amount of _data_
- * in network, but units changed and effective
- * cwnd/ssthresh really reduced now.
- */
- if (icsk->icsk_ca_state != TCP_CA_Loss) {
- tp->high_seq = tp->snd_nxt;
- tp->snd_ssthresh = tcp_current_ssthresh(sk);
- tp->prior_ssthresh = 0;
- tp->undo_marker = 0;
- tcp_set_ca_state(sk, TCP_CA_Loss);
+ if (!tcp_skb_can_collapse(to, skb))
+ break;
+
+ space -= skb->len;
+
+ if (first) {
+ first = false;
+ continue;
+ }
+
+ if (space < 0)
+ break;
+
+ if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp)))
+ break;
+
+ if (!tcp_collapse_retrans(sk, to))
+ break;
}
- tcp_xmit_retransmit_queue(sk);
}
/* This retransmits one SKB. Policy decisions and retransmit queue
* state updates are done by the caller. Returns non-zero if an
* error occurred which prevented the send.
*/
-int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
+int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
{
- struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
- unsigned int cur_mss = tcp_current_mss(sk, 0);
- int err;
+ struct tcp_sock *tp = tcp_sk(sk);
+ unsigned int cur_mss;
+ int diff, len, err;
+ int avail_wnd;
- /* Inconslusive MTU probe */
- if (icsk->icsk_mtup.probe_size) {
+ /* Inconclusive MTU probe */
+ if (icsk->icsk_mtup.probe_size)
icsk->icsk_mtup.probe_size = 0;
- }
- /* Do not sent more than we queued. 1/4 is reserved for possible
- * copying overhead: fragmentation, tunneling, mangling etc.
- */
- if (atomic_read(&sk->sk_wmem_alloc) >
- min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf))
- return -EAGAIN;
+ if (skb_still_in_host_queue(sk, skb)) {
+ err = -EBUSY;
+ goto out;
+ }
+start:
if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
- if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
- BUG();
- if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
- return -ENOMEM;
+ if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
+ TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_SYN;
+ TCP_SKB_CB(skb)->seq++;
+ goto start;
+ }
+ if (unlikely(before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))) {
+ WARN_ON_ONCE(1);
+ err = -EINVAL;
+ goto out;
+ }
+ if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq)) {
+ err = -ENOMEM;
+ goto out;
+ }
+ }
+
+ if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) {
+ err = -EHOSTUNREACH; /* Routing failure or similar. */
+ goto out;
}
+ cur_mss = tcp_current_mss(sk);
+ avail_wnd = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
+
/* If receiver has shrunk his window, and skb is out of
* new window, do not retransmit it. The exception is the
* case, when window is shrunk to zero. In this case
- * our retransmit serves as a zero window probe.
+ * our retransmit of one segment serves as a zero window probe.
*/
- if (!before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd)
- && TCP_SKB_CB(skb)->seq != tp->snd_una)
- return -EAGAIN;
-
- if (skb->len > cur_mss) {
- if (tcp_fragment(sk, skb, cur_mss, cur_mss))
- return -ENOMEM; /* We'll try again later. */
+ if (avail_wnd <= 0) {
+ if (TCP_SKB_CB(skb)->seq != tp->snd_una) {
+ err = -EAGAIN;
+ goto out;
+ }
+ avail_wnd = cur_mss;
}
- /* Collapse two adjacent packets if worthwhile and we can. */
- if(!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) &&
- (skb->len < (cur_mss >> 1)) &&
- (skb->next != sk->sk_send_head) &&
- (skb->next != (struct sk_buff *)&sk->sk_write_queue) &&
- (skb_shinfo(skb)->nr_frags == 0 && skb_shinfo(skb->next)->nr_frags == 0) &&
- (tcp_skb_pcount(skb) == 1 && tcp_skb_pcount(skb->next) == 1) &&
- (sysctl_tcp_retrans_collapse != 0))
- tcp_retrans_try_collapse(sk, skb, cur_mss);
+ len = cur_mss * segs;
+ if (len > avail_wnd) {
+ len = rounddown(avail_wnd, cur_mss);
+ if (!len)
+ len = avail_wnd;
+ }
+ if (skb->len > len) {
+ if (tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, len,
+ cur_mss, GFP_ATOMIC)) {
+ err = -ENOMEM; /* We'll try again later. */
+ goto out;
+ }
+ } else {
+ if (skb_unclone_keeptruesize(skb, GFP_ATOMIC)) {
+ err = -ENOMEM;
+ goto out;
+ }
- if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
- return -EHOSTUNREACH; /* Routing failure or similar. */
+ diff = tcp_skb_pcount(skb);
+ tcp_set_skb_tso_segs(skb, cur_mss);
+ diff -= tcp_skb_pcount(skb);
+ if (diff)
+ tcp_adjust_pcount(sk, skb, diff);
+ avail_wnd = min_t(int, avail_wnd, cur_mss);
+ if (skb->len < avail_wnd)
+ tcp_retrans_try_collapse(sk, skb, avail_wnd);
+ }
- /* Some Solaris stacks overoptimize and ignore the FIN on a
- * retransmit when old data is attached. So strip it off
- * since it is cheap to do so and saves bytes on the network.
+ /* RFC3168, section 6.1.1.1. ECN fallback
+ * As AccECN uses the same SYN flags (+ AE), this check covers both
+ * cases.
+ */
+ if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN)
+ tcp_ecn_clear_syn(sk, skb);
+
+ /* Update global and local TCP statistics. */
+ segs = tcp_skb_pcount(skb);
+ TCP_ADD_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS, segs);
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
+ tp->total_retrans += segs;
+ tp->bytes_retrans += skb->len;
+
+ /* make sure skb->data is aligned on arches that require it
+ * and check if ack-trimming & collapsing extended the headroom
+ * beyond what csum_start can cover.
*/
- if(skb->len > 0 &&
- (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) &&
- tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) {
- if (!pskb_trim(skb, 0)) {
- TCP_SKB_CB(skb)->seq = TCP_SKB_CB(skb)->end_seq - 1;
- skb_shinfo(skb)->gso_segs = 1;
- skb_shinfo(skb)->gso_size = 0;
- skb_shinfo(skb)->gso_type = 0;
- skb->ip_summed = CHECKSUM_NONE;
- skb->csum = 0;
+ if (unlikely((NET_IP_ALIGN && ((unsigned long)skb->data & 3)) ||
+ skb_headroom(skb) >= 0xFFFF)) {
+ struct sk_buff *nskb;
+
+ tcp_skb_tsorted_save(skb) {
+ nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC);
+ if (nskb) {
+ nskb->dev = NULL;
+ err = tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC);
+ } else {
+ err = -ENOBUFS;
+ }
+ } tcp_skb_tsorted_restore(skb);
+
+ if (!err) {
+ tcp_update_skb_after_send(sk, skb, tp->tcp_wstamp_ns);
+ tcp_rate_skb_sent(sk, skb);
}
+ } else {
+ err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
}
- /* Make a copy, if the first transmission SKB clone we made
- * is still in somebody's hands, else make a clone.
- */
- TCP_SKB_CB(skb)->when = tcp_time_stamp;
+ if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RETRANS_CB_FLAG))
+ tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RETRANS_CB,
+ TCP_SKB_CB(skb)->seq, segs, err);
- err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
+ if (unlikely(err) && err != -EBUSY)
+ NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL, segs);
- if (err == 0) {
- /* Update global TCP statistics. */
- TCP_INC_STATS(TCP_MIB_RETRANSSEGS);
+ /* To avoid taking spuriously low RTT samples based on a timestamp
+ * for a transmit that never happened, always mark EVER_RETRANS
+ */
+ TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
- tp->total_retrans++;
+out:
+ trace_tcp_retransmit_skb(sk, skb, err);
+ return err;
+}
+int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ int err = __tcp_retransmit_skb(sk, skb, segs);
+
+ if (err == 0) {
#if FASTRETRANS_DEBUG > 0
- if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) {
- if (net_ratelimit())
- printk(KERN_DEBUG "retrans_out leaked.\n");
+ if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
+ net_dbg_ratelimited("retrans_out leaked\n");
}
#endif
TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
tp->retrans_out += tcp_skb_pcount(skb);
+ }
- /* Save stamp of the first retransmit. */
- if (!tp->retrans_stamp)
- tp->retrans_stamp = TCP_SKB_CB(skb)->when;
-
- tp->undo_retrans++;
+ /* Save stamp of the first (attempted) retransmit. */
+ if (!tp->retrans_stamp)
+ tp->retrans_stamp = tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb);
- /* snd_nxt is stored to detect loss of retransmitted segment,
- * see tcp_input.c tcp_sacktag_write_queue().
- */
- TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt;
- }
+ if (tp->undo_retrans < 0)
+ tp->undo_retrans = 0;
+ tp->undo_retrans += tcp_skb_pcount(skb);
return err;
}
@@ -1850,451 +3658,712 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
* retransmitted data is acknowledged. It tries to continue
* resending the rest of the retransmit queue, until either
* we've sent it all or the congestion window limit is reached.
- * If doing SACK, the first ACK which comes back for a timeout
- * based retransmit packet might feed us FACK information again.
- * If so, we use it to avoid unnecessarily retransmissions.
*/
void tcp_xmit_retransmit_queue(struct sock *sk)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
+ struct sk_buff *skb, *rtx_head, *hole = NULL;
struct tcp_sock *tp = tcp_sk(sk);
- struct sk_buff *skb;
- int packet_cnt;
-
- if (tp->retransmit_skb_hint) {
- skb = tp->retransmit_skb_hint;
- packet_cnt = tp->retransmit_cnt_hint;
- }else{
- skb = sk->sk_write_queue.next;
- packet_cnt = 0;
- }
-
- /* First pass: retransmit lost packets. */
- if (tp->lost_out) {
- sk_stream_for_retrans_queue_from(skb, sk) {
- __u8 sacked = TCP_SKB_CB(skb)->sacked;
+ bool rearm_timer = false;
+ u32 max_segs;
+ int mib_idx;
- /* we could do better than to assign each time */
- tp->retransmit_skb_hint = skb;
- tp->retransmit_cnt_hint = packet_cnt;
-
- /* Assume this retransmit will generate
- * only one packet for congestion window
- * calculation purposes. This works because
- * tcp_retransmit_skb() will chop up the
- * packet to be MSS sized and all the
- * packet counting works out.
- */
- if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
- return;
-
- if (sacked & TCPCB_LOST) {
- if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) {
- if (tcp_retransmit_skb(sk, skb)) {
- tp->retransmit_skb_hint = NULL;
- return;
- }
- if (icsk->icsk_ca_state != TCP_CA_Loss)
- NET_INC_STATS_BH(LINUX_MIB_TCPFASTRETRANS);
- else
- NET_INC_STATS_BH(LINUX_MIB_TCPSLOWSTARTRETRANS);
-
- if (skb ==
- skb_peek(&sk->sk_write_queue))
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
- inet_csk(sk)->icsk_rto,
- TCP_RTO_MAX);
- }
-
- packet_cnt += tcp_skb_pcount(skb);
- if (packet_cnt >= tp->lost_out)
- break;
- }
- }
- }
+ if (!tp->packets_out)
+ return;
- /* OK, demanded retransmission is finished. */
+ rtx_head = tcp_rtx_queue_head(sk);
+ skb = tp->retransmit_skb_hint ?: rtx_head;
+ max_segs = tcp_tso_segs(sk, tcp_current_mss(sk));
+ skb_rbtree_walk_from(skb) {
+ __u8 sacked;
+ int segs;
- /* Forward retransmissions are possible only during Recovery. */
- if (icsk->icsk_ca_state != TCP_CA_Recovery)
- return;
+ if (tcp_pacing_check(sk))
+ break;
- /* No forward retransmissions in Reno are possible. */
- if (!tp->rx_opt.sack_ok)
- return;
+ /* we could do better than to assign each time */
+ if (!hole)
+ tp->retransmit_skb_hint = skb;
- /* Yeah, we have to make difficult choice between forward transmission
- * and retransmission... Both ways have their merits...
- *
- * For now we do not retransmit anything, while we have some new
- * segments to send.
- */
+ segs = tcp_snd_cwnd(tp) - tcp_packets_in_flight(tp);
+ if (segs <= 0)
+ break;
+ sacked = TCP_SKB_CB(skb)->sacked;
+ /* In case tcp_shift_skb_data() have aggregated large skbs,
+ * we need to make sure not sending too bigs TSO packets
+ */
+ segs = min_t(int, segs, max_segs);
- if (tcp_may_send_now(sk, tp))
- return;
+ if (tp->retrans_out >= tp->lost_out) {
+ break;
+ } else if (!(sacked & TCPCB_LOST)) {
+ if (!hole && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED)))
+ hole = skb;
+ continue;
- if (tp->forward_skb_hint) {
- skb = tp->forward_skb_hint;
- packet_cnt = tp->forward_cnt_hint;
- } else{
- skb = sk->sk_write_queue.next;
- packet_cnt = 0;
- }
+ } else {
+ if (icsk->icsk_ca_state != TCP_CA_Loss)
+ mib_idx = LINUX_MIB_TCPFASTRETRANS;
+ else
+ mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS;
+ }
- sk_stream_for_retrans_queue_from(skb, sk) {
- tp->forward_cnt_hint = packet_cnt;
- tp->forward_skb_hint = skb;
+ if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))
+ continue;
- /* Similar to the retransmit loop above we
- * can pretend that the retransmitted SKB
- * we send out here will be composed of one
- * real MSS sized packet because tcp_retransmit_skb()
- * will fragment it if necessary.
- */
- if (++packet_cnt > tp->fackets_out)
+ if (tcp_small_queue_check(sk, skb, 1))
break;
- if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
+ if (tcp_retransmit_skb(sk, skb, segs))
break;
- if (TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS)
- continue;
+ NET_ADD_STATS(sock_net(sk), mib_idx, tcp_skb_pcount(skb));
- /* Ok, retransmit it. */
- if (tcp_retransmit_skb(sk, skb)) {
- tp->forward_skb_hint = NULL;
- break;
- }
+ if (tcp_in_cwnd_reduction(sk))
+ tp->prr_out += tcp_skb_pcount(skb);
- if (skb == skb_peek(&sk->sk_write_queue))
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
- inet_csk(sk)->icsk_rto,
- TCP_RTO_MAX);
+ if (skb == rtx_head &&
+ icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT)
+ rearm_timer = true;
- NET_INC_STATS_BH(LINUX_MIB_TCPFORWARDRETRANS);
}
+ if (rearm_timer)
+ tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+ inet_csk(sk)->icsk_rto, true);
}
+/* We allow to exceed memory limits for FIN packets to expedite
+ * connection tear down and (memory) recovery.
+ * Otherwise tcp_send_fin() could be tempted to either delay FIN
+ * or even be forced to close flow without any FIN.
+ * In general, we want to allow one skb per socket to avoid hangs
+ * with edge trigger epoll()
+ */
+void sk_forced_mem_schedule(struct sock *sk, int size)
+{
+ int delta, amt;
+
+ delta = size - sk->sk_forward_alloc;
+ if (delta <= 0)
+ return;
+
+ amt = sk_mem_pages(delta);
+ sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
-/* Send a fin. The caller locks the socket for us. This cannot be
- * allowed to fail queueing a FIN frame under any circumstances.
+ if (mem_cgroup_sk_enabled(sk))
+ mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge() | __GFP_NOFAIL);
+
+ if (sk->sk_bypass_prot_mem)
+ return;
+
+ sk_memory_allocated_add(sk, amt);
+}
+
+/* Send a FIN. The caller locks the socket for us.
+ * We should try to send a FIN packet really hard, but eventually give up.
*/
void tcp_send_fin(struct sock *sk)
{
- struct tcp_sock *tp = tcp_sk(sk);
- struct sk_buff *skb = skb_peek_tail(&sk->sk_write_queue);
- int mss_now;
-
- /* Optimization, tack on the FIN if we have a queue of
- * unsent frames. But be careful about outgoing SACKS
- * and IP options.
+ struct sk_buff *skb, *tskb, *tail = tcp_write_queue_tail(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* Optimization, tack on the FIN if we have one skb in write queue and
+ * this skb was not yet sent, or we are under memory pressure.
+ * Note: in the latter case, FIN packet will be sent after a timeout,
+ * as TCP stack thinks it has already been transmitted.
*/
- mss_now = tcp_current_mss(sk, 1);
+ tskb = tail;
+ if (!tskb && tcp_under_memory_pressure(sk))
+ tskb = skb_rb_last(&sk->tcp_rtx_queue);
- if (sk->sk_send_head != NULL) {
- TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN;
- TCP_SKB_CB(skb)->end_seq++;
+ if (tskb) {
+ TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN;
+ TCP_SKB_CB(tskb)->end_seq++;
tp->write_seq++;
- } else {
- /* Socket is locked, keep trying until memory is available. */
- for (;;) {
- skb = alloc_skb_fclone(MAX_TCP_HEADER, GFP_KERNEL);
- if (skb)
- break;
- yield();
+ if (!tail) {
+ /* This means tskb was already sent.
+ * Pretend we included the FIN on previous transmit.
+ * We need to set tp->snd_nxt to the value it would have
+ * if FIN had been sent. This is because retransmit path
+ * does not change tp->snd_nxt.
+ */
+ WRITE_ONCE(tp->snd_nxt, tp->snd_nxt + 1);
+ return;
}
+ } else {
+ skb = alloc_skb_fclone(MAX_TCP_HEADER,
+ sk_gfp_mask(sk, GFP_ATOMIC |
+ __GFP_NOWARN));
+ if (unlikely(!skb))
+ return;
- /* Reserve space for headers and prepare control bits. */
+ INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
skb_reserve(skb, MAX_TCP_HEADER);
- skb->csum = 0;
- TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_FIN);
- TCP_SKB_CB(skb)->sacked = 0;
- skb_shinfo(skb)->gso_segs = 1;
- skb_shinfo(skb)->gso_size = 0;
- skb_shinfo(skb)->gso_type = 0;
-
+ sk_forced_mem_schedule(sk, skb->truesize);
/* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
- TCP_SKB_CB(skb)->seq = tp->write_seq;
- TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
+ tcp_init_nondata_skb(skb, sk, tp->write_seq,
+ TCPHDR_ACK | TCPHDR_FIN);
tcp_queue_skb(sk, skb);
}
- __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_OFF);
+ __tcp_push_pending_frames(sk, tcp_current_mss(sk), TCP_NAGLE_OFF);
}
/* We get here when a process closes a file descriptor (either due to
* an explicit close() or as a byproduct of exit()'ing) and there
* was unread data in the receive queue. This behavior is recommended
- * by draft-ietf-tcpimpl-prob-03.txt section 3.10. -DaveM
+ * by RFC 2525, section 2.17. -DaveM
*/
-void tcp_send_active_reset(struct sock *sk, gfp_t priority)
+void tcp_send_active_reset(struct sock *sk, gfp_t priority,
+ enum sk_rst_reason reason)
{
- struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTRSTS);
+
/* NOTE: No TCP options attached and we never retransmit this. */
skb = alloc_skb(MAX_TCP_HEADER, priority);
if (!skb) {
- NET_INC_STATS(LINUX_MIB_TCPABORTFAILED);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
return;
}
/* Reserve space for headers and prepare control bits. */
skb_reserve(skb, MAX_TCP_HEADER);
- skb->csum = 0;
- TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_RST);
- TCP_SKB_CB(skb)->sacked = 0;
- skb_shinfo(skb)->gso_segs = 1;
- skb_shinfo(skb)->gso_size = 0;
- skb_shinfo(skb)->gso_type = 0;
-
+ tcp_init_nondata_skb(skb, sk, tcp_acceptable_seq(sk),
+ TCPHDR_ACK | TCPHDR_RST);
+ tcp_mstamp_refresh(tcp_sk(sk));
/* Send it off. */
- TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk, tp);
- TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
- TCP_SKB_CB(skb)->when = tcp_time_stamp;
if (tcp_transmit_skb(sk, skb, 0, priority))
- NET_INC_STATS(LINUX_MIB_TCPABORTFAILED);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
+
+ /* skb of trace_tcp_send_reset() keeps the skb that caused RST,
+ * skb here is different to the troublesome skb, so use NULL
+ */
+ trace_tcp_send_reset(sk, NULL, reason);
}
-/* WARNING: This routine must only be called when we have already sent
+/* Send a crossed SYN-ACK during socket establishment.
+ * WARNING: This routine must only be called when we have already sent
* a SYN packet that crossed the incoming SYN that caused this routine
* to get called. If this assumption fails then the initial rcv_wnd
* and rcv_wscale values will not be correct.
*/
int tcp_send_synack(struct sock *sk)
{
- struct sk_buff* skb;
+ struct sk_buff *skb;
- skb = skb_peek(&sk->sk_write_queue);
- if (skb == NULL || !(TCP_SKB_CB(skb)->flags&TCPCB_FLAG_SYN)) {
- printk(KERN_DEBUG "tcp_send_synack: wrong queue state\n");
+ skb = tcp_rtx_queue_head(sk);
+ if (!skb || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
+ pr_err("%s: wrong queue state\n", __func__);
return -EFAULT;
}
- if (!(TCP_SKB_CB(skb)->flags&TCPCB_FLAG_ACK)) {
+ if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) {
if (skb_cloned(skb)) {
- struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
- if (nskb == NULL)
+ struct sk_buff *nskb;
+
+ tcp_skb_tsorted_save(skb) {
+ nskb = skb_copy(skb, GFP_ATOMIC);
+ } tcp_skb_tsorted_restore(skb);
+ if (!nskb)
return -ENOMEM;
- __skb_unlink(skb, &sk->sk_write_queue);
- skb_header_release(nskb);
- __skb_queue_head(&sk->sk_write_queue, nskb);
- sk_stream_free_skb(sk, skb);
- sk_charge_skb(sk, nskb);
+ INIT_LIST_HEAD(&nskb->tcp_tsorted_anchor);
+ tcp_highest_sack_replace(sk, skb, nskb);
+ tcp_rtx_queue_unlink_and_free(skb, sk);
+ __skb_header_release(nskb);
+ tcp_rbtree_insert(&sk->tcp_rtx_queue, nskb);
+ sk_wmem_queued_add(sk, nskb->truesize);
+ sk_mem_charge(sk, nskb->truesize);
skb = nskb;
}
- TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ACK;
- TCP_ECN_send_synack(tcp_sk(sk), skb);
+ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK;
+ tcp_ecn_send_synack(sk, skb);
}
- TCP_SKB_CB(skb)->when = tcp_time_stamp;
return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
}
-/*
- * Prepare a SYN-ACK.
+/**
+ * tcp_make_synack - Allocate one skb and build a SYNACK packet.
+ * @sk: listener socket
+ * @dst: dst entry attached to the SYNACK. It is consumed and caller
+ * should not use it again.
+ * @req: request_sock pointer
+ * @foc: cookie for tcp fast open
+ * @synack_type: Type of synack to prepare
+ * @syn_skb: SYN packet just received. It could be NULL for rtx case.
*/
-struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
- struct request_sock *req)
+struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
+ struct request_sock *req,
+ struct tcp_fastopen_cookie *foc,
+ enum tcp_synack_type synack_type,
+ struct sk_buff *syn_skb)
{
struct inet_request_sock *ireq = inet_rsk(req);
- struct tcp_sock *tp = tcp_sk(sk);
- struct tcphdr *th;
- int tcp_header_size;
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_out_options opts;
+ struct tcp_key key = {};
struct sk_buff *skb;
-#ifdef CONFIG_TCP_MD5SIG
- struct tcp_md5sig_key *md5;
- __u8 *md5_hash_location;
-#endif
+ int tcp_header_size;
+ struct tcphdr *th;
+ int mss;
+ u64 now;
- skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC);
- if (skb == NULL)
+ skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
+ if (unlikely(!skb)) {
+ dst_release(dst);
return NULL;
-
+ }
/* Reserve space for headers. */
skb_reserve(skb, MAX_TCP_HEADER);
- skb->dst = dst_clone(dst);
+ switch (synack_type) {
+ case TCP_SYNACK_NORMAL:
+ skb_set_owner_edemux(skb, req_to_sk(req));
+ break;
+ case TCP_SYNACK_COOKIE:
+ /* Under synflood, we do not attach skb to a socket,
+ * to avoid false sharing.
+ */
+ break;
+ case TCP_SYNACK_FASTOPEN:
+ /* sk is a const pointer, because we want to express multiple
+ * cpu might call us concurrently.
+ * sk->sk_wmem_alloc in an atomic, we can promote to rw.
+ */
+ skb_set_owner_w(skb, (struct sock *)sk);
+ break;
+ }
+ skb_dst_set(skb, dst);
+
+ mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
- tcp_header_size = (sizeof(struct tcphdr) + TCPOLEN_MSS +
- (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) +
- (ireq->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) +
- /* SACK_PERM is in the place of NOP NOP of TS */
- ((ireq->sack_ok && !ireq->tstamp_ok) ? TCPOLEN_SACKPERM_ALIGNED : 0));
+ memset(&opts, 0, sizeof(opts));
+ now = tcp_clock_ns();
+#ifdef CONFIG_SYN_COOKIES
+ if (unlikely(synack_type == TCP_SYNACK_COOKIE && ireq->tstamp_ok))
+ skb_set_delivery_time(skb, cookie_init_timestamp(req, now),
+ SKB_CLOCK_MONOTONIC);
+ else
+#endif
+ {
+ skb_set_delivery_time(skb, now, SKB_CLOCK_MONOTONIC);
+ if (!tcp_rsk(req)->snt_synack) /* Timestamp first SYNACK */
+ tcp_rsk(req)->snt_synack = tcp_skb_timestamp_us(skb);
+ }
+#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
+ rcu_read_lock();
+#endif
+ if (tcp_rsk_used_ao(req)) {
+#ifdef CONFIG_TCP_AO
+ struct tcp_ao_key *ao_key = NULL;
+ u8 keyid = tcp_rsk(req)->ao_keyid;
+ u8 rnext = tcp_rsk(req)->ao_rcv_next;
+
+ ao_key = tcp_sk(sk)->af_specific->ao_lookup(sk, req_to_sk(req),
+ keyid, -1);
+ /* If there is no matching key - avoid sending anything,
+ * especially usigned segments. It could try harder and lookup
+ * for another peer-matching key, but the peer has requested
+ * ao_keyid (RFC5925 RNextKeyID), so let's keep it simple here.
+ */
+ if (unlikely(!ao_key)) {
+ trace_tcp_ao_synack_no_key(sk, keyid, rnext);
+ rcu_read_unlock();
+ kfree_skb(skb);
+ net_warn_ratelimited("TCP-AO: the keyid %u from SYN packet is not present - not sending SYNACK\n",
+ keyid);
+ return NULL;
+ }
+ key.ao_key = ao_key;
+ key.type = TCP_KEY_AO;
+#endif
+ } else {
#ifdef CONFIG_TCP_MD5SIG
- /* Are we doing MD5 on this segment? If so - make room for it */
- md5 = tcp_rsk(req)->af_specific->md5_lookup(sk, req);
- if (md5)
- tcp_header_size += TCPOLEN_MD5SIG_ALIGNED;
+ key.md5_key = tcp_rsk(req)->af_specific->req_md5_lookup(sk,
+ req_to_sk(req));
+ if (key.md5_key)
+ key.type = TCP_KEY_MD5;
#endif
- skb->h.th = th = (struct tcphdr *) skb_push(skb, tcp_header_size);
+ }
+ skb_set_hash(skb, READ_ONCE(tcp_rsk(req)->txhash), PKT_HASH_TYPE_L4);
+ /* bpf program will be interested in the tcp_flags */
+ TCP_SKB_CB(skb)->tcp_flags = TCPHDR_SYN | TCPHDR_ACK;
+ tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts,
+ &key, foc, synack_type, syn_skb)
+ + sizeof(*th);
+
+ skb_push(skb, tcp_header_size);
+ skb_reset_transport_header(skb);
+ th = (struct tcphdr *)skb->data;
memset(th, 0, sizeof(struct tcphdr));
th->syn = 1;
th->ack = 1;
- TCP_ECN_make_synack(req, th);
- th->source = inet_sk(sk)->sport;
- th->dest = ireq->rmt_port;
- TCP_SKB_CB(skb)->seq = tcp_rsk(req)->snt_isn;
- TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
- TCP_SKB_CB(skb)->sacked = 0;
- skb_shinfo(skb)->gso_segs = 1;
- skb_shinfo(skb)->gso_size = 0;
- skb_shinfo(skb)->gso_type = 0;
- th->seq = htonl(TCP_SKB_CB(skb)->seq);
- th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1);
- if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
- __u8 rcv_wscale;
- /* Set this up on the first call only */
- req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
- /* tcp_full_space because it is guaranteed to be the first packet */
- tcp_select_initial_window(tcp_full_space(sk),
- dst_metric(dst, RTAX_ADVMSS) - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
- &req->rcv_wnd,
- &req->window_clamp,
- ireq->wscale_ok,
- &rcv_wscale);
- ireq->rcv_wscale = rcv_wscale;
- }
+ tcp_ecn_make_synack(req, th);
+ th->source = htons(ireq->ir_num);
+ th->dest = ireq->ir_rmt_port;
+ skb->mark = ireq->ir_mark;
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ th->seq = htonl(tcp_rsk(req)->snt_isn);
+ /* XXX data is queued and acked as is. No buffer/window check */
+ th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt);
/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
- th->window = htons(req->rcv_wnd);
-
- TCP_SKB_CB(skb)->when = tcp_time_stamp;
- tcp_syn_build_options((__be32 *)(th + 1), dst_metric(dst, RTAX_ADVMSS), ireq->tstamp_ok,
- ireq->sack_ok, ireq->wscale_ok, ireq->rcv_wscale,
- TCP_SKB_CB(skb)->when,
- req->ts_recent,
- (
-#ifdef CONFIG_TCP_MD5SIG
- md5 ? &md5_hash_location :
-#endif
- NULL)
- );
-
- skb->csum = 0;
+ th->window = htons(min(req->rsk_rcv_wnd, 65535U));
+ tcp_options_write(th, NULL, tcp_rsk(req), &opts, &key);
th->doff = (tcp_header_size >> 2);
- TCP_INC_STATS(TCP_MIB_OUTSEGS);
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);
-#ifdef CONFIG_TCP_MD5SIG
/* Okay, we have all we need - do the md5 hash if needed */
- if (md5) {
- tp->af_specific->calc_md5_hash(md5_hash_location,
- md5,
- NULL, dst, req,
- skb->h.th, sk->sk_protocol,
- skb->len);
+ if (tcp_key_is_md5(&key)) {
+#ifdef CONFIG_TCP_MD5SIG
+ tcp_rsk(req)->af_specific->calc_md5_hash(opts.hash_location,
+ key.md5_key, req_to_sk(req), skb);
+#endif
+ } else if (tcp_key_is_ao(&key)) {
+#ifdef CONFIG_TCP_AO
+ tcp_rsk(req)->af_specific->ao_synack_hash(opts.hash_location,
+ key.ao_key, req, skb,
+ opts.hash_location - (u8 *)th, 0);
+#endif
}
+#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
+ rcu_read_unlock();
#endif
+ bpf_skops_write_hdr_opt((struct sock *)sk, skb, req, syn_skb,
+ synack_type, &opts);
+
+ skb_set_delivery_time(skb, now, SKB_CLOCK_MONOTONIC);
+ tcp_add_tx_delay(skb, tp);
+
return skb;
}
+EXPORT_IPV6_MOD(tcp_make_synack);
-/*
- * Do all connect socket setups that can be done AF independent.
- */
+static void tcp_ca_dst_init(struct sock *sk, const struct dst_entry *dst)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ const struct tcp_congestion_ops *ca;
+ u32 ca_key = dst_metric(dst, RTAX_CC_ALGO);
+
+ if (ca_key == TCP_CA_UNSPEC)
+ return;
+
+ rcu_read_lock();
+ ca = tcp_ca_find_key(ca_key);
+ if (likely(ca && bpf_try_module_get(ca, ca->owner))) {
+ bpf_module_put(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner);
+ icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst);
+ icsk->icsk_ca_ops = ca;
+ }
+ rcu_read_unlock();
+}
+
+/* Do all connect socket setups that can be done AF independent. */
static void tcp_connect_init(struct sock *sk)
{
- struct dst_entry *dst = __sk_dst_get(sk);
+ const struct dst_entry *dst = __sk_dst_get(sk);
struct tcp_sock *tp = tcp_sk(sk);
__u8 rcv_wscale;
+ u16 user_mss;
+ u32 rcv_wnd;
/* We'll fix this up when we get a response from the other end.
* See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
*/
- tp->tcp_header_len = sizeof(struct tcphdr) +
- (sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);
+ tp->tcp_header_len = sizeof(struct tcphdr);
+ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_timestamps))
+ tp->tcp_header_len += TCPOLEN_TSTAMP_ALIGNED;
-#ifdef CONFIG_TCP_MD5SIG
- if (tp->af_specific->md5_lookup(sk, sk) != NULL)
- tp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
-#endif
+ tcp_ao_connect_init(sk);
/* If user gave his TCP_MAXSEG, record it to clamp */
- if (tp->rx_opt.user_mss)
- tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
+ user_mss = READ_ONCE(tp->rx_opt.user_mss);
+ if (user_mss)
+ tp->rx_opt.mss_clamp = user_mss;
tp->max_window = 0;
tcp_mtup_init(sk);
tcp_sync_mss(sk, dst_mtu(dst));
+ tcp_ca_dst_init(sk, dst);
+
if (!tp->window_clamp)
- tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
- tp->advmss = dst_metric(dst, RTAX_ADVMSS);
+ WRITE_ONCE(tp->window_clamp, dst_metric(dst, RTAX_WINDOW));
+ tp->advmss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
+
tcp_initialize_rcv_mss(sk);
- tcp_select_initial_window(tcp_full_space(sk),
+ /* limit the window selection if the user enforce a smaller rx buffer */
+ if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
+ (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0))
+ WRITE_ONCE(tp->window_clamp, tcp_full_space(sk));
+
+ rcv_wnd = tcp_rwnd_init_bpf(sk);
+ if (rcv_wnd == 0)
+ rcv_wnd = dst_metric(dst, RTAX_INITRWND);
+
+ tcp_select_initial_window(sk, tcp_full_space(sk),
tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
&tp->rcv_wnd,
&tp->window_clamp,
- sysctl_tcp_window_scaling,
- &rcv_wscale);
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_window_scaling),
+ &rcv_wscale,
+ rcv_wnd);
tp->rx_opt.rcv_wscale = rcv_wscale;
tp->rcv_ssthresh = tp->rcv_wnd;
- sk->sk_err = 0;
+ WRITE_ONCE(sk->sk_err, 0);
sock_reset_flag(sk, SOCK_DONE);
tp->snd_wnd = 0;
- tcp_init_wl(tp, tp->write_seq, 0);
+ tcp_init_wl(tp, 0);
+ tcp_write_queue_purge(sk);
tp->snd_una = tp->write_seq;
tp->snd_sml = tp->write_seq;
- tp->rcv_nxt = 0;
- tp->rcv_wup = 0;
- tp->copied_seq = 0;
+ tp->snd_up = tp->write_seq;
+ WRITE_ONCE(tp->snd_nxt, tp->write_seq);
- inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
- inet_csk(sk)->icsk_retransmits = 0;
+ if (likely(!tp->repair))
+ tp->rcv_nxt = 0;
+ else
+ tp->rcv_tstamp = tcp_jiffies32;
+ tp->rcv_wup = tp->rcv_nxt;
+ WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
+
+ inet_csk(sk)->icsk_rto = tcp_timeout_init(sk);
+ WRITE_ONCE(inet_csk(sk)->icsk_retransmits, 0);
tcp_clear_retrans(tp);
}
-/*
- * Build a SYN and send it off.
- */
+static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
+
+ tcb->end_seq += skb->len;
+ __skb_header_release(skb);
+ sk_wmem_queued_add(sk, skb->truesize);
+ sk_mem_charge(sk, skb->truesize);
+ WRITE_ONCE(tp->write_seq, tcb->end_seq);
+ tp->packets_out += tcp_skb_pcount(skb);
+}
+
+/* Build and send a SYN with data and (cached) Fast Open cookie. However,
+ * queue a data-only packet after the regular SYN, such that regular SYNs
+ * are retransmitted on timeouts. Also if the remote SYN-ACK acknowledges
+ * only the SYN sequence, the data are retransmitted in the first ACK.
+ * If cookie is not cached or other error occurs, falls back to send a
+ * regular SYN with Fast Open cookie request option.
+ */
+static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_fastopen_request *fo = tp->fastopen_req;
+ struct page_frag *pfrag = sk_page_frag(sk);
+ struct sk_buff *syn_data;
+ int space, err = 0;
+
+ tp->rx_opt.mss_clamp = tp->advmss; /* If MSS is not cached */
+ if (!tcp_fastopen_cookie_check(sk, &tp->rx_opt.mss_clamp, &fo->cookie))
+ goto fallback;
+
+ /* MSS for SYN-data is based on cached MSS and bounded by PMTU and
+ * user-MSS. Reserve maximum option space for middleboxes that add
+ * private TCP options. The cost is reduced data space in SYN :(
+ */
+ tp->rx_opt.mss_clamp = tcp_mss_clamp(tp, tp->rx_opt.mss_clamp);
+ /* Sync mss_cache after updating the mss_clamp */
+ tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
+
+ space = __tcp_mtu_to_mss(sk, icsk->icsk_pmtu_cookie) -
+ MAX_TCP_OPTION_SPACE;
+
+ space = min_t(size_t, space, fo->size);
+
+ if (space &&
+ !skb_page_frag_refill(min_t(size_t, space, PAGE_SIZE),
+ pfrag, sk->sk_allocation))
+ goto fallback;
+ syn_data = tcp_stream_alloc_skb(sk, sk->sk_allocation, false);
+ if (!syn_data)
+ goto fallback;
+ memcpy(syn_data->cb, syn->cb, sizeof(syn->cb));
+ if (space) {
+ space = min_t(size_t, space, pfrag->size - pfrag->offset);
+ space = tcp_wmem_schedule(sk, space);
+ }
+ if (space) {
+ space = copy_page_from_iter(pfrag->page, pfrag->offset,
+ space, &fo->data->msg_iter);
+ if (unlikely(!space)) {
+ tcp_skb_tsorted_anchor_cleanup(syn_data);
+ kfree_skb(syn_data);
+ goto fallback;
+ }
+ skb_fill_page_desc(syn_data, 0, pfrag->page,
+ pfrag->offset, space);
+ page_ref_inc(pfrag->page);
+ pfrag->offset += space;
+ skb_len_add(syn_data, space);
+ skb_zcopy_set(syn_data, fo->uarg, NULL);
+ }
+ /* No more data pending in inet_wait_for_connect() */
+ if (space == fo->size)
+ fo->data = NULL;
+ fo->copied = space;
+
+ tcp_connect_queue_skb(sk, syn_data);
+ if (syn_data->len)
+ tcp_chrono_start(sk, TCP_CHRONO_BUSY);
+
+ err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation);
+
+ skb_set_delivery_time(syn, syn_data->skb_mstamp_ns, SKB_CLOCK_MONOTONIC);
+
+ /* Now full SYN+DATA was cloned and sent (or not),
+ * remove the SYN from the original skb (syn_data)
+ * we keep in write queue in case of a retransmit, as we
+ * also have the SYN packet (with no data) in the same queue.
+ */
+ TCP_SKB_CB(syn_data)->seq++;
+ TCP_SKB_CB(syn_data)->tcp_flags = TCPHDR_ACK | TCPHDR_PSH;
+ if (!err) {
+ tp->syn_data = (fo->copied > 0);
+ tcp_rbtree_insert(&sk->tcp_rtx_queue, syn_data);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT);
+ goto done;
+ }
+
+ /* data was not sent, put it in write_queue */
+ __skb_queue_tail(&sk->sk_write_queue, syn_data);
+ tp->packets_out -= tcp_skb_pcount(syn_data);
+
+fallback:
+ /* Send a regular SYN with Fast Open cookie request option */
+ if (fo->cookie.len > 0)
+ fo->cookie.len = 0;
+ err = tcp_transmit_skb(sk, syn, 1, sk->sk_allocation);
+ if (err)
+ tp->syn_fastopen = 0;
+done:
+ fo->cookie.len = -1; /* Exclude Fast Open option for SYN retries */
+ return err;
+}
+
+/* Build a SYN and send it off. */
int tcp_connect(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *buff;
+ int err;
- tcp_connect_init(sk);
+ tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB, 0, NULL);
- buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation);
- if (unlikely(buff == NULL))
- return -ENOBUFS;
+#if defined(CONFIG_TCP_MD5SIG) && defined(CONFIG_TCP_AO)
+ /* Has to be checked late, after setting daddr/saddr/ops.
+ * Return error if the peer has both a md5 and a tcp-ao key
+ * configured as this is ambiguous.
+ */
+ if (unlikely(rcu_dereference_protected(tp->md5sig_info,
+ lockdep_sock_is_held(sk)))) {
+ bool needs_ao = !!tp->af_specific->ao_lookup(sk, sk, -1, -1);
+ bool needs_md5 = !!tp->af_specific->md5_lookup(sk, sk);
+ struct tcp_ao_info *ao_info;
+
+ ao_info = rcu_dereference_check(tp->ao_info,
+ lockdep_sock_is_held(sk));
+ if (ao_info) {
+ /* This is an extra check: tcp_ao_required() in
+ * tcp_v{4,6}_parse_md5_keys() should prevent adding
+ * md5 keys on ao_required socket.
+ */
+ needs_ao |= ao_info->ao_required;
+ WARN_ON_ONCE(ao_info->ao_required && needs_md5);
+ }
+ if (needs_md5 && needs_ao)
+ return -EKEYREJECTED;
- /* Reserve space for headers. */
- skb_reserve(buff, MAX_TCP_HEADER);
+ /* If we have a matching md5 key and no matching tcp-ao key
+ * then free up ao_info if allocated.
+ */
+ if (needs_md5) {
+ tcp_ao_destroy_sock(sk, false);
+ } else if (needs_ao) {
+ tcp_clear_md5_list(sk);
+ kfree(rcu_replace_pointer(tp->md5sig_info, NULL,
+ lockdep_sock_is_held(sk)));
+ }
+ }
+#endif
+#ifdef CONFIG_TCP_AO
+ if (unlikely(rcu_dereference_protected(tp->ao_info,
+ lockdep_sock_is_held(sk)))) {
+ /* Don't allow connecting if ao is configured but no
+ * matching key is found.
+ */
+ if (!tp->af_specific->ao_lookup(sk, sk, -1, -1))
+ return -EKEYREJECTED;
+ }
+#endif
- TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN;
- TCP_ECN_send_syn(sk, tp, buff);
- TCP_SKB_CB(buff)->sacked = 0;
- skb_shinfo(buff)->gso_segs = 1;
- skb_shinfo(buff)->gso_size = 0;
- skb_shinfo(buff)->gso_type = 0;
- buff->csum = 0;
- tp->snd_nxt = tp->write_seq;
- TCP_SKB_CB(buff)->seq = tp->write_seq++;
- TCP_SKB_CB(buff)->end_seq = tp->write_seq;
+ if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
+ return -EHOSTUNREACH; /* Routing failure or similar. */
- /* Send it off. */
- TCP_SKB_CB(buff)->when = tcp_time_stamp;
- tp->retrans_stamp = TCP_SKB_CB(buff)->when;
- skb_header_release(buff);
- __skb_queue_tail(&sk->sk_write_queue, buff);
- sk_charge_skb(sk, buff);
- tp->packets_out += tcp_skb_pcount(buff);
- tcp_transmit_skb(sk, buff, 1, GFP_KERNEL);
+ tcp_connect_init(sk);
+
+ if (unlikely(tp->repair)) {
+ tcp_finish_connect(sk, NULL);
+ return 0;
+ }
+
+ buff = tcp_stream_alloc_skb(sk, sk->sk_allocation, true);
+ if (unlikely(!buff))
+ return -ENOBUFS;
+
+ /* SYN eats a sequence byte, write_seq updated by
+ * tcp_connect_queue_skb().
+ */
+ tcp_init_nondata_skb(buff, sk, tp->write_seq, TCPHDR_SYN);
+ tcp_mstamp_refresh(tp);
+ tp->retrans_stamp = tcp_time_stamp_ts(tp);
+ tcp_connect_queue_skb(sk, buff);
+ tcp_ecn_send_syn(sk, buff);
+ tcp_rbtree_insert(&sk->tcp_rtx_queue, buff);
+
+ /* Send off SYN; include data in Fast Open. */
+ err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) :
+ tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);
+ if (err == -ECONNREFUSED)
+ return err;
/* We change tp->snd_nxt after the tcp_transmit_skb() call
* in order to make this packet get counted in tcpOutSegs.
*/
- tp->snd_nxt = tp->write_seq;
+ WRITE_ONCE(tp->snd_nxt, tp->write_seq);
tp->pushed_seq = tp->write_seq;
- TCP_INC_STATS(TCP_MIB_ACTIVEOPENS);
+ buff = tcp_send_head(sk);
+ if (unlikely(buff)) {
+ WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(buff)->seq);
+ tp->pushed_seq = TCP_SKB_CB(buff)->seq;
+ }
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS);
/* Timer for repeating the SYN until an answer. */
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
- inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
+ tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+ inet_csk(sk)->icsk_rto, false);
return 0;
}
+EXPORT_SYMBOL(tcp_connect);
+
+u32 tcp_delack_max(const struct sock *sk)
+{
+ u32 delack_from_rto_min = max(tcp_rto_min(sk), 2) - 1;
+
+ return min(READ_ONCE(inet_csk(sk)->icsk_delack_max), delack_from_rto_min);
+}
/* Send out a delayed ack, the caller does the policy checking
* to see if we should even be here. See tcp_input.c:tcp_ack_snd_check()
@@ -2308,9 +4377,10 @@ void tcp_send_delayed_ack(struct sock *sk)
if (ato > TCP_DELACK_MIN) {
const struct tcp_sock *tp = tcp_sk(sk);
- int max_ato = HZ/2;
+ int max_ato = HZ / 2;
- if (icsk->icsk_ack.pingpong || (icsk->icsk_ack.pending & ICSK_ACK_PUSHED))
+ if (inet_csk_in_pingpong_mode(sk) ||
+ (icsk->icsk_ack.pending & ICSK_ACK_PUSHED))
max_ato = TCP_DELACK_MAX;
/* Slow path, intersegment interval is "high". */
@@ -2319,8 +4389,9 @@ void tcp_send_delayed_ack(struct sock *sk)
* Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements
* directly.
*/
- if (tp->srtt) {
- int rtt = max(tp->srtt>>3, TCP_DELACK_MIN);
+ if (tp->srtt_us) {
+ int rtt = max_t(int, usecs_to_jiffies(tp->srtt_us >> 3),
+ TCP_DELACK_MIN);
if (rtt < max_ato)
max_ato = rtt;
@@ -2329,63 +4400,74 @@ void tcp_send_delayed_ack(struct sock *sk)
ato = min(ato, max_ato);
}
+ ato = min_t(u32, ato, tcp_delack_max(sk));
+
/* Stay within the limit we were given */
timeout = jiffies + ato;
/* Use new timeout only if there wasn't a older one earlier. */
if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) {
- /* If delack timer was blocked or is about to expire,
- * send ACK now.
- */
- if (icsk->icsk_ack.blocked ||
- time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) {
+ /* If delack timer is about to expire, send ACK now. */
+ if (time_before_eq(icsk_delack_timeout(icsk), jiffies + (ato >> 2))) {
tcp_send_ack(sk);
return;
}
- if (!time_before(timeout, icsk->icsk_ack.timeout))
- timeout = icsk->icsk_ack.timeout;
+ if (!time_before(timeout, icsk_delack_timeout(icsk)))
+ timeout = icsk_delack_timeout(icsk);
}
- icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER;
- icsk->icsk_ack.timeout = timeout;
+ smp_store_release(&icsk->icsk_ack.pending,
+ icsk->icsk_ack.pending | ICSK_ACK_SCHED | ICSK_ACK_TIMER);
sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
}
/* This routine sends an ack and also updates the window. */
-void tcp_send_ack(struct sock *sk)
+void __tcp_send_ack(struct sock *sk, u32 rcv_nxt, u16 flags)
{
+ struct sk_buff *buff;
+
/* If we have been reset, we may not send again. */
- if (sk->sk_state != TCP_CLOSE) {
- struct tcp_sock *tp = tcp_sk(sk);
- struct sk_buff *buff;
+ if (sk->sk_state == TCP_CLOSE)
+ return;
- /* We are not putting this on the write queue, so
- * tcp_transmit_skb() will set the ownership to this
- * sock.
- */
- buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
- if (buff == NULL) {
- inet_csk_schedule_ack(sk);
- inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
- TCP_DELACK_MAX, TCP_RTO_MAX);
- return;
- }
+ /* We are not putting this on the write queue, so
+ * tcp_transmit_skb() will set the ownership to this
+ * sock.
+ */
+ buff = alloc_skb(MAX_TCP_HEADER,
+ sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN));
+ if (unlikely(!buff)) {
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ unsigned long delay;
+
+ delay = TCP_DELACK_MAX << icsk->icsk_ack.retry;
+ if (delay < tcp_rto_max(sk))
+ icsk->icsk_ack.retry++;
+ inet_csk_schedule_ack(sk);
+ icsk->icsk_ack.ato = TCP_ATO_MIN;
+ tcp_reset_xmit_timer(sk, ICSK_TIME_DACK, delay, false);
+ return;
+ }
+
+ /* Reserve space for headers and prepare control bits. */
+ skb_reserve(buff, MAX_TCP_HEADER);
+ tcp_init_nondata_skb(buff, sk,
+ tcp_acceptable_seq(sk), TCPHDR_ACK | flags);
- /* Reserve space for headers and prepare control bits. */
- skb_reserve(buff, MAX_TCP_HEADER);
- buff->csum = 0;
- TCP_SKB_CB(buff)->flags = TCPCB_FLAG_ACK;
- TCP_SKB_CB(buff)->sacked = 0;
- skb_shinfo(buff)->gso_segs = 1;
- skb_shinfo(buff)->gso_size = 0;
- skb_shinfo(buff)->gso_type = 0;
+ /* We do not want pure acks influencing TCP Small Queues or fq/pacing
+ * too much.
+ * SKB_TRUESIZE(max(1 .. 66, MAX_TCP_HEADER)) is unfortunately ~784
+ */
+ skb_set_tcp_pure_ack(buff);
- /* Send it off, this clears delayed acks for us. */
- TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk, tp);
- TCP_SKB_CB(buff)->when = tcp_time_stamp;
- tcp_transmit_skb(sk, buff, 0, GFP_ATOMIC);
- }
+ /* Send it off, this clears delayed acks for us. */
+ __tcp_transmit_skb(sk, buff, 0, (__force gfp_t)0, rcv_nxt);
+}
+EXPORT_SYMBOL_GPL(__tcp_send_ack);
+
+void tcp_send_ack(struct sock *sk)
+{
+ __tcp_send_ack(sk, tcp_sk(sk)->rcv_nxt, 0);
}
/* This routine sends a packet with an out of date sequence
@@ -2399,78 +4481,80 @@ void tcp_send_ack(struct sock *sk)
* one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is
* out-of-date with SND.UNA-1 to probe window.
*/
-static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
+static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
/* We don't queue it, tcp_transmit_skb() sets ownership. */
- skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
- if (skb == NULL)
+ skb = alloc_skb(MAX_TCP_HEADER,
+ sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN));
+ if (!skb)
return -1;
/* Reserve space for headers and set control bits. */
skb_reserve(skb, MAX_TCP_HEADER);
- skb->csum = 0;
- TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
- TCP_SKB_CB(skb)->sacked = urgent;
- skb_shinfo(skb)->gso_segs = 1;
- skb_shinfo(skb)->gso_size = 0;
- skb_shinfo(skb)->gso_type = 0;
-
/* Use a previous sequence. This should cause the other
* end to send an ack. Don't queue or clone SKB, just
* send it.
*/
- TCP_SKB_CB(skb)->seq = urgent ? tp->snd_una : tp->snd_una - 1;
- TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
- TCP_SKB_CB(skb)->when = tcp_time_stamp;
- return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC);
+ tcp_init_nondata_skb(skb, sk, tp->snd_una - !urgent, TCPHDR_ACK);
+ NET_INC_STATS(sock_net(sk), mib);
+ return tcp_transmit_skb(sk, skb, 0, (__force gfp_t)0);
}
-int tcp_write_wakeup(struct sock *sk)
+/* Called from setsockopt( ... TCP_REPAIR ) */
+void tcp_send_window_probe(struct sock *sk)
{
- if (sk->sk_state != TCP_CLOSE) {
- struct tcp_sock *tp = tcp_sk(sk);
- struct sk_buff *skb;
+ if (sk->sk_state == TCP_ESTABLISHED) {
+ tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1;
+ tcp_mstamp_refresh(tcp_sk(sk));
+ tcp_xmit_probe_skb(sk, 0, LINUX_MIB_TCPWINPROBE);
+ }
+}
- if ((skb = sk->sk_send_head) != NULL &&
- before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd)) {
- int err;
- unsigned int mss = tcp_current_mss(sk, 0);
- unsigned int seg_size = tp->snd_una+tp->snd_wnd-TCP_SKB_CB(skb)->seq;
+/* Initiate keepalive or window probe from timer. */
+int tcp_write_wakeup(struct sock *sk, int mib)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb;
- if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
- tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;
+ if (sk->sk_state == TCP_CLOSE)
+ return -1;
- /* We are probing the opening of a window
- * but the window size is != 0
- * must have been a result SWS avoidance ( sender )
- */
- if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
- skb->len > mss) {
- seg_size = min(seg_size, mss);
- TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
- if (tcp_fragment(sk, skb, seg_size, mss))
- return -1;
- } else if (!tcp_skb_pcount(skb))
- tcp_set_skb_tso_segs(sk, skb, mss);
-
- TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
- TCP_SKB_CB(skb)->when = tcp_time_stamp;
- err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
- if (!err) {
- update_send_head(sk, tp, skb);
- }
- return err;
- } else {
- if (tp->urg_mode &&
- between(tp->snd_up, tp->snd_una+1, tp->snd_una+0xFFFF))
- tcp_xmit_probe_skb(sk, TCPCB_URG);
- return tcp_xmit_probe_skb(sk, 0);
- }
+ skb = tcp_send_head(sk);
+ if (skb && before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) {
+ int err;
+ unsigned int mss = tcp_current_mss(sk);
+ unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
+
+ if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
+ tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;
+
+ /* We are probing the opening of a window
+ * but the window size is != 0
+ * must have been a result SWS avoidance ( sender )
+ */
+ if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
+ skb->len > mss) {
+ seg_size = min(seg_size, mss);
+ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
+ if (tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE,
+ skb, seg_size, mss, GFP_ATOMIC))
+ return -1;
+ } else if (!tcp_skb_pcount(skb))
+ tcp_set_skb_tso_segs(skb, mss);
+
+ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
+ err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
+ if (!err)
+ tcp_event_new_data_sent(sk, skb);
+ return err;
+ } else {
+ if (between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF))
+ tcp_xmit_probe_skb(sk, 1, mib);
+ return tcp_xmit_probe_skb(sk, 0, mib);
}
- return -1;
}
/* A window probe timeout has occurred. If window is not closed send
@@ -2480,43 +4564,60 @@ void tcp_send_probe0(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
+ struct net *net = sock_net(sk);
+ unsigned long timeout;
int err;
- err = tcp_write_wakeup(sk);
+ err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE);
- if (tp->packets_out || !sk->sk_send_head) {
+ if (tp->packets_out || tcp_write_queue_empty(sk)) {
/* Cancel probe timer, if it is not required. */
- icsk->icsk_probes_out = 0;
+ WRITE_ONCE(icsk->icsk_probes_out, 0);
icsk->icsk_backoff = 0;
+ icsk->icsk_probes_tstamp = 0;
return;
}
+ WRITE_ONCE(icsk->icsk_probes_out, icsk->icsk_probes_out + 1);
if (err <= 0) {
- if (icsk->icsk_backoff < sysctl_tcp_retries2)
+ if (icsk->icsk_backoff < READ_ONCE(net->ipv4.sysctl_tcp_retries2))
icsk->icsk_backoff++;
- icsk->icsk_probes_out++;
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
- min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX),
- TCP_RTO_MAX);
+ timeout = tcp_probe0_when(sk, tcp_rto_max(sk));
} else {
/* If packet was not sent due to local congestion,
- * do not backoff and do not remember icsk_probes_out.
- * Let local senders to fight for local resources.
- *
- * Use accumulated backoff yet.
+ * Let senders fight for local resources conservatively.
*/
- if (!icsk->icsk_probes_out)
- icsk->icsk_probes_out = 1;
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
- min(icsk->icsk_rto << icsk->icsk_backoff,
- TCP_RESOURCE_PROBE_INTERVAL),
- TCP_RTO_MAX);
+ timeout = TCP_RESOURCE_PROBE_INTERVAL;
}
+
+ timeout = tcp_clamp_probe0_to_user_timeout(sk, timeout);
+ tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, timeout, true);
}
-EXPORT_SYMBOL(tcp_connect);
-EXPORT_SYMBOL(tcp_make_synack);
-EXPORT_SYMBOL(tcp_simple_retransmit);
-EXPORT_SYMBOL(tcp_sync_mss);
-EXPORT_SYMBOL(sysctl_tcp_tso_win_divisor);
-EXPORT_SYMBOL(tcp_mtup_init);
+int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
+{
+ const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific;
+ struct flowi fl;
+ int res;
+
+ /* Paired with WRITE_ONCE() in sock_setsockopt() */
+ if (READ_ONCE(sk->sk_txrehash) == SOCK_TXREHASH_ENABLED)
+ WRITE_ONCE(tcp_rsk(req)->txhash, net_tx_rndhash());
+ res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL,
+ NULL);
+ if (!res) {
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
+ if (unlikely(tcp_passive_fastopen(sk))) {
+ /* sk has const attribute because listeners are lockless.
+ * However in this case, we are dealing with a passive fastopen
+ * socket thus we can change total_retrans value.
+ */
+ tcp_sk_rw(sk)->total_retrans++;
+ }
+ trace_tcp_retransmit_synack(sk, req);
+ WRITE_ONCE(req->num_retrans, req->num_retrans + 1);
+ }
+ return res;
+}
+EXPORT_IPV6_MOD(tcp_rtx_synack);
diff --git a/net/ipv4/tcp_plb.c b/net/ipv4/tcp_plb.c
new file mode 100644
index 000000000000..4bcf7eff95e3
--- /dev/null
+++ b/net/ipv4/tcp_plb.c
@@ -0,0 +1,109 @@
+/* Protective Load Balancing (PLB)
+ *
+ * PLB was designed to reduce link load imbalance across datacenter
+ * switches. PLB is a host-based optimization; it leverages congestion
+ * signals from the transport layer to randomly change the path of the
+ * connection experiencing sustained congestion. PLB prefers to repath
+ * after idle periods to minimize packet reordering. It repaths by
+ * changing the IPv6 Flow Label on the packets of a connection, which
+ * datacenter switches include as part of ECMP/WCMP hashing.
+ *
+ * PLB is described in detail in:
+ *
+ * Mubashir Adnan Qureshi, Yuchung Cheng, Qianwen Yin, Qiaobin Fu,
+ * Gautam Kumar, Masoud Moshref, Junhua Yan, Van Jacobson,
+ * David Wetherall,Abdul Kabbani:
+ * "PLB: Congestion Signals are Simple and Effective for
+ * Network Load Balancing"
+ * In ACM SIGCOMM 2022, Amsterdam Netherlands.
+ *
+ */
+
+#include <net/tcp.h>
+
+/* Called once per round-trip to update PLB state for a connection. */
+void tcp_plb_update_state(const struct sock *sk, struct tcp_plb_state *plb,
+ const int cong_ratio)
+{
+ struct net *net = sock_net(sk);
+
+ if (!READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled))
+ return;
+
+ if (cong_ratio >= 0) {
+ if (cong_ratio < READ_ONCE(net->ipv4.sysctl_tcp_plb_cong_thresh))
+ plb->consec_cong_rounds = 0;
+ else if (plb->consec_cong_rounds <
+ READ_ONCE(net->ipv4.sysctl_tcp_plb_rehash_rounds))
+ plb->consec_cong_rounds++;
+ }
+}
+EXPORT_SYMBOL_GPL(tcp_plb_update_state);
+
+/* Check whether recent congestion has been persistent enough to warrant
+ * a load balancing decision that switches the connection to another path.
+ */
+void tcp_plb_check_rehash(struct sock *sk, struct tcp_plb_state *plb)
+{
+ struct net *net = sock_net(sk);
+ u32 max_suspend;
+ bool forced_rehash = false, idle_rehash = false;
+
+ if (!READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled))
+ return;
+
+ forced_rehash = plb->consec_cong_rounds >=
+ READ_ONCE(net->ipv4.sysctl_tcp_plb_rehash_rounds);
+ /* If sender goes idle then we check whether to rehash. */
+ idle_rehash = READ_ONCE(net->ipv4.sysctl_tcp_plb_idle_rehash_rounds) &&
+ !tcp_sk(sk)->packets_out &&
+ plb->consec_cong_rounds >=
+ READ_ONCE(net->ipv4.sysctl_tcp_plb_idle_rehash_rounds);
+
+ if (!forced_rehash && !idle_rehash)
+ return;
+
+ /* Note that tcp_jiffies32 can wrap; we detect wraps by checking for
+ * cases where the max suspension end is before the actual suspension
+ * end. We clear pause_until to 0 to indicate there is no recent
+ * RTO event that constrains PLB rehashing.
+ */
+ max_suspend = 2 * READ_ONCE(net->ipv4.sysctl_tcp_plb_suspend_rto_sec) * HZ;
+ if (plb->pause_until &&
+ (!before(tcp_jiffies32, plb->pause_until) ||
+ before(tcp_jiffies32 + max_suspend, plb->pause_until)))
+ plb->pause_until = 0;
+
+ if (plb->pause_until)
+ return;
+
+ sk_rethink_txhash(sk);
+ plb->consec_cong_rounds = 0;
+ tcp_sk(sk)->plb_rehash++;
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPLBREHASH);
+}
+EXPORT_SYMBOL_GPL(tcp_plb_check_rehash);
+
+/* Upon RTO, disallow load balancing for a while, to avoid having load
+ * balancing decisions switch traffic to a black-holed path that was
+ * previously avoided with a sk_rethink_txhash() call at RTO time.
+ */
+void tcp_plb_update_state_upon_rto(struct sock *sk, struct tcp_plb_state *plb)
+{
+ struct net *net = sock_net(sk);
+ u32 pause;
+
+ if (!READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled))
+ return;
+
+ pause = READ_ONCE(net->ipv4.sysctl_tcp_plb_suspend_rto_sec) * HZ;
+ pause += get_random_u32_below(pause);
+ plb->pause_until = tcp_jiffies32 + pause;
+
+ /* Reset PLB state upon RTO, since an RTO causes a sk_rethink_txhash() call
+ * that may switch this connection to a path with completely different
+ * congestion characteristics.
+ */
+ plb->consec_cong_rounds = 0;
+}
+EXPORT_SYMBOL_GPL(tcp_plb_update_state_upon_rto);
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
deleted file mode 100644
index f230eeecf092..000000000000
--- a/net/ipv4/tcp_probe.c
+++ /dev/null
@@ -1,186 +0,0 @@
-/*
- * tcpprobe - Observe the TCP flow with kprobes.
- *
- * The idea for this came from Werner Almesberger's umlsim
- * Copyright (C) 2004, Stephen Hemminger <shemminger@osdl.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#include <linux/kernel.h>
-#include <linux/kprobes.h>
-#include <linux/socket.h>
-#include <linux/tcp.h>
-#include <linux/proc_fs.h>
-#include <linux/module.h>
-#include <linux/kfifo.h>
-#include <linux/vmalloc.h>
-
-#include <net/tcp.h>
-
-MODULE_AUTHOR("Stephen Hemminger <shemminger@osdl.org>");
-MODULE_DESCRIPTION("TCP cwnd snooper");
-MODULE_LICENSE("GPL");
-
-static int port = 0;
-MODULE_PARM_DESC(port, "Port to match (0=all)");
-module_param(port, int, 0);
-
-static int bufsize = 64*1024;
-MODULE_PARM_DESC(bufsize, "Log buffer size (default 64k)");
-module_param(bufsize, int, 0);
-
-static const char procname[] = "tcpprobe";
-
-struct {
- struct kfifo *fifo;
- spinlock_t lock;
- wait_queue_head_t wait;
- struct timeval tstart;
-} tcpw;
-
-static void printl(const char *fmt, ...)
-{
- va_list args;
- int len;
- struct timeval now;
- char tbuf[256];
-
- va_start(args, fmt);
- do_gettimeofday(&now);
-
- now.tv_sec -= tcpw.tstart.tv_sec;
- now.tv_usec -= tcpw.tstart.tv_usec;
- if (now.tv_usec < 0) {
- --now.tv_sec;
- now.tv_usec += 1000000;
- }
-
- len = sprintf(tbuf, "%lu.%06lu ",
- (unsigned long) now.tv_sec,
- (unsigned long) now.tv_usec);
- len += vscnprintf(tbuf+len, sizeof(tbuf)-len, fmt, args);
- va_end(args);
-
- kfifo_put(tcpw.fifo, tbuf, len);
- wake_up(&tcpw.wait);
-}
-
-static int jtcp_sendmsg(struct kiocb *iocb, struct sock *sk,
- struct msghdr *msg, size_t size)
-{
- const struct tcp_sock *tp = tcp_sk(sk);
- const struct inet_sock *inet = inet_sk(sk);
-
- if (port == 0 || ntohs(inet->dport) == port ||
- ntohs(inet->sport) == port) {
- printl("%d.%d.%d.%d:%u %d.%d.%d.%d:%u %d %#x %#x %u %u %u\n",
- NIPQUAD(inet->saddr), ntohs(inet->sport),
- NIPQUAD(inet->daddr), ntohs(inet->dport),
- size, tp->snd_nxt, tp->snd_una,
- tp->snd_cwnd, tcp_current_ssthresh(sk),
- tp->snd_wnd);
- }
-
- jprobe_return();
- return 0;
-}
-
-static struct jprobe tcp_send_probe = {
- .kp = {
- .symbol_name = "tcp_sendmsg",
- },
- .entry = JPROBE_ENTRY(jtcp_sendmsg),
-};
-
-
-static int tcpprobe_open(struct inode * inode, struct file * file)
-{
- kfifo_reset(tcpw.fifo);
- do_gettimeofday(&tcpw.tstart);
- return 0;
-}
-
-static ssize_t tcpprobe_read(struct file *file, char __user *buf,
- size_t len, loff_t *ppos)
-{
- int error = 0, cnt = 0;
- unsigned char *tbuf;
-
- if (!buf || len < 0)
- return -EINVAL;
-
- if (len == 0)
- return 0;
-
- tbuf = vmalloc(len);
- if (!tbuf)
- return -ENOMEM;
-
- error = wait_event_interruptible(tcpw.wait,
- __kfifo_len(tcpw.fifo) != 0);
- if (error)
- goto out_free;
-
- cnt = kfifo_get(tcpw.fifo, tbuf, len);
- error = copy_to_user(buf, tbuf, cnt);
-
-out_free:
- vfree(tbuf);
-
- return error ? error : cnt;
-}
-
-static struct file_operations tcpprobe_fops = {
- .owner = THIS_MODULE,
- .open = tcpprobe_open,
- .read = tcpprobe_read,
-};
-
-static __init int tcpprobe_init(void)
-{
- int ret = -ENOMEM;
-
- init_waitqueue_head(&tcpw.wait);
- spin_lock_init(&tcpw.lock);
- tcpw.fifo = kfifo_alloc(bufsize, GFP_KERNEL, &tcpw.lock);
- if (IS_ERR(tcpw.fifo))
- return PTR_ERR(tcpw.fifo);
-
- if (!proc_net_fops_create(procname, S_IRUSR, &tcpprobe_fops))
- goto err0;
-
- ret = register_jprobe(&tcp_send_probe);
- if (ret)
- goto err1;
-
- pr_info("TCP watch registered (port=%d)\n", port);
- return 0;
- err1:
- proc_net_remove(procname);
- err0:
- kfifo_free(tcpw.fifo);
- return ret;
-}
-module_init(tcpprobe_init);
-
-static __exit void tcpprobe_exit(void)
-{
- kfifo_free(tcpw.fifo);
- proc_net_remove(procname);
- unregister_jprobe(&tcp_send_probe);
-
-}
-module_exit(tcpprobe_exit);
diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c
new file mode 100644
index 000000000000..a8f6d9d06f2e
--- /dev/null
+++ b/net/ipv4/tcp_rate.c
@@ -0,0 +1,209 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <net/tcp.h>
+
+/* The bandwidth estimator estimates the rate at which the network
+ * can currently deliver outbound data packets for this flow. At a high
+ * level, it operates by taking a delivery rate sample for each ACK.
+ *
+ * A rate sample records the rate at which the network delivered packets
+ * for this flow, calculated over the time interval between the transmission
+ * of a data packet and the acknowledgment of that packet.
+ *
+ * Specifically, over the interval between each transmit and corresponding ACK,
+ * the estimator generates a delivery rate sample. Typically it uses the rate
+ * at which packets were acknowledged. However, the approach of using only the
+ * acknowledgment rate faces a challenge under the prevalent ACK decimation or
+ * compression: packets can temporarily appear to be delivered much quicker
+ * than the bottleneck rate. Since it is physically impossible to do that in a
+ * sustained fashion, when the estimator notices that the ACK rate is faster
+ * than the transmit rate, it uses the latter:
+ *
+ * send_rate = #pkts_delivered/(last_snd_time - first_snd_time)
+ * ack_rate = #pkts_delivered/(last_ack_time - first_ack_time)
+ * bw = min(send_rate, ack_rate)
+ *
+ * Notice the estimator essentially estimates the goodput, not always the
+ * network bottleneck link rate when the sending or receiving is limited by
+ * other factors like applications or receiver window limits. The estimator
+ * deliberately avoids using the inter-packet spacing approach because that
+ * approach requires a large number of samples and sophisticated filtering.
+ *
+ * TCP flows can often be application-limited in request/response workloads.
+ * The estimator marks a bandwidth sample as application-limited if there
+ * was some moment during the sampled window of packets when there was no data
+ * ready to send in the write queue.
+ */
+
+/* Snapshot the current delivery information in the skb, to generate
+ * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered().
+ */
+void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* In general we need to start delivery rate samples from the
+ * time we received the most recent ACK, to ensure we include
+ * the full time the network needs to deliver all in-flight
+ * packets. If there are no packets in flight yet, then we
+ * know that any ACKs after now indicate that the network was
+ * able to deliver those packets completely in the sampling
+ * interval between now and the next ACK.
+ *
+ * Note that we use packets_out instead of tcp_packets_in_flight(tp)
+ * because the latter is a guess based on RTO and loss-marking
+ * heuristics. We don't want spurious RTOs or loss markings to cause
+ * a spuriously small time interval, causing a spuriously high
+ * bandwidth estimate.
+ */
+ if (!tp->packets_out) {
+ u64 tstamp_us = tcp_skb_timestamp_us(skb);
+
+ tp->first_tx_mstamp = tstamp_us;
+ tp->delivered_mstamp = tstamp_us;
+ }
+
+ TCP_SKB_CB(skb)->tx.first_tx_mstamp = tp->first_tx_mstamp;
+ TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp;
+ TCP_SKB_CB(skb)->tx.delivered = tp->delivered;
+ TCP_SKB_CB(skb)->tx.delivered_ce = tp->delivered_ce;
+ TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 1 : 0;
+}
+
+/* When an skb is sacked or acked, we fill in the rate sample with the (prior)
+ * delivery information when the skb was last transmitted.
+ *
+ * If an ACK (s)acks multiple skbs (e.g., stretched-acks), this function is
+ * called multiple times. We favor the information from the most recently
+ * sent skb, i.e., the skb with the most recently sent time and the highest
+ * sequence.
+ */
+void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
+ struct rate_sample *rs)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
+ u64 tx_tstamp;
+
+ if (!scb->tx.delivered_mstamp)
+ return;
+
+ tx_tstamp = tcp_skb_timestamp_us(skb);
+ if (!rs->prior_delivered ||
+ tcp_skb_sent_after(tx_tstamp, tp->first_tx_mstamp,
+ scb->end_seq, rs->last_end_seq)) {
+ rs->prior_delivered_ce = scb->tx.delivered_ce;
+ rs->prior_delivered = scb->tx.delivered;
+ rs->prior_mstamp = scb->tx.delivered_mstamp;
+ rs->is_app_limited = scb->tx.is_app_limited;
+ rs->is_retrans = scb->sacked & TCPCB_RETRANS;
+ rs->last_end_seq = scb->end_seq;
+
+ /* Record send time of most recently ACKed packet: */
+ tp->first_tx_mstamp = tx_tstamp;
+ /* Find the duration of the "send phase" of this window: */
+ rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp,
+ scb->tx.first_tx_mstamp);
+
+ }
+ /* Mark off the skb delivered once it's sacked to avoid being
+ * used again when it's cumulatively acked. For acked packets
+ * we don't need to reset since it'll be freed soon.
+ */
+ if (scb->sacked & TCPCB_SACKED_ACKED)
+ scb->tx.delivered_mstamp = 0;
+}
+
+/* Update the connection delivery information and generate a rate sample. */
+void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
+ bool is_sack_reneg, struct rate_sample *rs)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 snd_us, ack_us;
+
+ /* Clear app limited if bubble is acked and gone. */
+ if (tp->app_limited && after(tp->delivered, tp->app_limited))
+ tp->app_limited = 0;
+
+ /* TODO: there are multiple places throughout tcp_ack() to get
+ * current time. Refactor the code using a new "tcp_acktag_state"
+ * to carry current time, flags, stats like "tcp_sacktag_state".
+ */
+ if (delivered)
+ tp->delivered_mstamp = tp->tcp_mstamp;
+
+ rs->acked_sacked = delivered; /* freshly ACKed or SACKed */
+ rs->losses = lost; /* freshly marked lost */
+ /* Return an invalid sample if no timing information is available or
+ * in recovery from loss with SACK reneging. Rate samples taken during
+ * a SACK reneging event may overestimate bw by including packets that
+ * were SACKed before the reneg.
+ */
+ if (!rs->prior_mstamp || is_sack_reneg) {
+ rs->delivered = -1;
+ rs->interval_us = -1;
+ return;
+ }
+ rs->delivered = tp->delivered - rs->prior_delivered;
+
+ rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce;
+ /* delivered_ce occupies less than 32 bits in the skb control block */
+ rs->delivered_ce &= TCPCB_DELIVERED_CE_MASK;
+
+ /* Model sending data and receiving ACKs as separate pipeline phases
+ * for a window. Usually the ACK phase is longer, but with ACK
+ * compression the send phase can be longer. To be safe we use the
+ * longer phase.
+ */
+ snd_us = rs->interval_us; /* send phase */
+ ack_us = tcp_stamp_us_delta(tp->tcp_mstamp,
+ rs->prior_mstamp); /* ack phase */
+ rs->interval_us = max(snd_us, ack_us);
+
+ /* Record both segment send and ack receive intervals */
+ rs->snd_interval_us = snd_us;
+ rs->rcv_interval_us = ack_us;
+
+ /* Normally we expect interval_us >= min-rtt.
+ * Note that rate may still be over-estimated when a spuriously
+ * retransmistted skb was first (s)acked because "interval_us"
+ * is under-estimated (up to an RTT). However continuously
+ * measuring the delivery rate during loss recovery is crucial
+ * for connections suffer heavy or prolonged losses.
+ */
+ if (unlikely(rs->interval_us < tcp_min_rtt(tp))) {
+ if (!rs->is_retrans)
+ pr_debug("tcp rate: %ld %d %u %u %u\n",
+ rs->interval_us, rs->delivered,
+ inet_csk(sk)->icsk_ca_state,
+ tp->rx_opt.sack_ok, tcp_min_rtt(tp));
+ rs->interval_us = -1;
+ return;
+ }
+
+ /* Record the last non-app-limited or the highest app-limited bw */
+ if (!rs->is_app_limited ||
+ ((u64)rs->delivered * tp->rate_interval_us >=
+ (u64)tp->rate_delivered * rs->interval_us)) {
+ tp->rate_delivered = rs->delivered;
+ tp->rate_interval_us = rs->interval_us;
+ tp->rate_app_limited = rs->is_app_limited;
+ }
+}
+
+/* If a gap is detected between sends, mark the socket application-limited. */
+void tcp_rate_check_app_limited(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (/* We have less than one packet to send. */
+ tp->write_seq - tp->snd_nxt < tp->mss_cache &&
+ /* Nothing in sending host's qdisc queues or NIC tx queue. */
+ sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1) &&
+ /* We are not limited by CWND. */
+ tcp_packets_in_flight(tp) < tcp_snd_cwnd(tp) &&
+ /* All lost packets have been retransmitted. */
+ tp->lost_out <= tp->retrans_out)
+ tp->app_limited =
+ (tp->delivered + tcp_packets_in_flight(tp)) ? : 1;
+}
+EXPORT_SYMBOL_GPL(tcp_rate_check_app_limited);
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
new file mode 100644
index 000000000000..c52fd3254b6e
--- /dev/null
+++ b/net/ipv4/tcp_recovery.c
@@ -0,0 +1,237 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/tcp.h>
+#include <net/tcp.h>
+
+static u32 tcp_rack_reo_wnd(const struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+
+ if (!tp->reord_seen) {
+ /* If reordering has not been observed, be aggressive during
+ * the recovery or starting the recovery by DUPACK threshold.
+ */
+ if (inet_csk(sk)->icsk_ca_state >= TCP_CA_Recovery)
+ return 0;
+
+ if (tp->sacked_out >= tp->reordering &&
+ !(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_recovery) &
+ TCP_RACK_NO_DUPTHRESH))
+ return 0;
+ }
+
+ /* To be more reordering resilient, allow min_rtt/4 settling delay.
+ * Use min_rtt instead of the smoothed RTT because reordering is
+ * often a path property and less related to queuing or delayed ACKs.
+ * Upon receiving DSACKs, linearly increase the window up to the
+ * smoothed RTT.
+ */
+ return min((tcp_min_rtt(tp) >> 2) * tp->rack.reo_wnd_steps,
+ tp->srtt_us >> 3);
+}
+
+s32 tcp_rack_skb_timeout(struct tcp_sock *tp, struct sk_buff *skb, u32 reo_wnd)
+{
+ return tp->rack.rtt_us + reo_wnd -
+ tcp_stamp_us_delta(tp->tcp_mstamp, tcp_skb_timestamp_us(skb));
+}
+
+/* RACK loss detection (IETF RFC8985):
+ *
+ * Marks a packet lost, if some packet sent later has been (s)acked.
+ * The underlying idea is similar to the traditional dupthresh and FACK
+ * but they look at different metrics:
+ *
+ * dupthresh: 3 OOO packets delivered (packet count)
+ * FACK: sequence delta to highest sacked sequence (sequence space)
+ * RACK: sent time delta to the latest delivered packet (time domain)
+ *
+ * The advantage of RACK is it applies to both original and retransmitted
+ * packet and therefore is robust against tail losses. Another advantage
+ * is being more resilient to reordering by simply allowing some
+ * "settling delay", instead of tweaking the dupthresh.
+ *
+ * When tcp_rack_detect_loss() detects some packets are lost and we
+ * are not already in the CA_Recovery state, either tcp_rack_reo_timeout()
+ * or tcp_time_to_recover()'s "Trick#1: the loss is proven" code path will
+ * make us enter the CA_Recovery state.
+ */
+static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb, *n;
+ u32 reo_wnd;
+
+ *reo_timeout = 0;
+ reo_wnd = tcp_rack_reo_wnd(sk);
+ list_for_each_entry_safe(skb, n, &tp->tsorted_sent_queue,
+ tcp_tsorted_anchor) {
+ struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
+ s32 remaining;
+
+ /* Skip ones marked lost but not yet retransmitted */
+ if ((scb->sacked & TCPCB_LOST) &&
+ !(scb->sacked & TCPCB_SACKED_RETRANS))
+ continue;
+
+ if (!tcp_skb_sent_after(tp->rack.mstamp,
+ tcp_skb_timestamp_us(skb),
+ tp->rack.end_seq, scb->end_seq))
+ break;
+
+ /* A packet is lost if it has not been s/acked beyond
+ * the recent RTT plus the reordering window.
+ */
+ remaining = tcp_rack_skb_timeout(tp, skb, reo_wnd);
+ if (remaining <= 0) {
+ tcp_mark_skb_lost(sk, skb);
+ list_del_init(&skb->tcp_tsorted_anchor);
+ } else {
+ /* Record maximum wait time */
+ *reo_timeout = max_t(u32, *reo_timeout, remaining);
+ }
+ }
+}
+
+bool tcp_rack_mark_lost(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 timeout;
+
+ if (!tp->rack.advanced)
+ return false;
+
+ /* Reset the advanced flag to avoid unnecessary queue scanning */
+ tp->rack.advanced = 0;
+ tcp_rack_detect_loss(sk, &timeout);
+ if (timeout) {
+ timeout = usecs_to_jiffies(timeout + TCP_TIMEOUT_MIN_US);
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_REO_TIMEOUT,
+ timeout, inet_csk(sk)->icsk_rto);
+ }
+ return !!timeout;
+}
+
+/* Record the most recently (re)sent time among the (s)acked packets
+ * This is "Step 3: Advance RACK.xmit_time and update RACK.RTT" from
+ * draft-cheng-tcpm-rack-00.txt
+ */
+void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
+ u64 xmit_time)
+{
+ u32 rtt_us;
+
+ rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, xmit_time);
+ if (rtt_us < tcp_min_rtt(tp) && (sacked & TCPCB_RETRANS)) {
+ /* If the sacked packet was retransmitted, it's ambiguous
+ * whether the retransmission or the original (or the prior
+ * retransmission) was sacked.
+ *
+ * If the original is lost, there is no ambiguity. Otherwise
+ * we assume the original can be delayed up to aRTT + min_rtt.
+ * the aRTT term is bounded by the fast recovery or timeout,
+ * so it's at least one RTT (i.e., retransmission is at least
+ * an RTT later).
+ */
+ return;
+ }
+ tp->rack.advanced = 1;
+ tp->rack.rtt_us = rtt_us;
+ if (tcp_skb_sent_after(xmit_time, tp->rack.mstamp,
+ end_seq, tp->rack.end_seq)) {
+ tp->rack.mstamp = xmit_time;
+ tp->rack.end_seq = end_seq;
+ }
+}
+
+/* We have waited long enough to accommodate reordering. Mark the expired
+ * packets lost and retransmit them.
+ */
+void tcp_rack_reo_timeout(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 timeout, prior_inflight;
+ u32 lost = tp->lost;
+
+ prior_inflight = tcp_packets_in_flight(tp);
+ tcp_rack_detect_loss(sk, &timeout);
+ if (prior_inflight != tcp_packets_in_flight(tp)) {
+ if (inet_csk(sk)->icsk_ca_state != TCP_CA_Recovery) {
+ tcp_enter_recovery(sk, false);
+ if (!inet_csk(sk)->icsk_ca_ops->cong_control)
+ tcp_cwnd_reduction(sk, 1, tp->lost - lost, 0);
+ }
+ tcp_xmit_retransmit_queue(sk);
+ }
+ if (inet_csk(sk)->icsk_pending != ICSK_TIME_RETRANS)
+ tcp_rearm_rto(sk);
+}
+
+/* Updates the RACK's reo_wnd based on DSACK and no. of recoveries.
+ *
+ * If a DSACK is received that seems like it may have been due to reordering
+ * triggering fast recovery, increment reo_wnd by min_rtt/4 (upper bounded
+ * by srtt), since there is possibility that spurious retransmission was
+ * due to reordering delay longer than reo_wnd.
+ *
+ * Persist the current reo_wnd value for TCP_RACK_RECOVERY_THRESH (16)
+ * no. of successful recoveries (accounts for full DSACK-based loss
+ * recovery undo). After that, reset it to default (min_rtt/4).
+ *
+ * At max, reo_wnd is incremented only once per rtt. So that the new
+ * DSACK on which we are reacting, is due to the spurious retx (approx)
+ * after the reo_wnd has been updated last time.
+ *
+ * reo_wnd is tracked in terms of steps (of min_rtt/4), rather than
+ * absolute value to account for change in rtt.
+ */
+void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if ((READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_recovery) &
+ TCP_RACK_STATIC_REO_WND) ||
+ !rs->prior_delivered)
+ return;
+
+ /* Disregard DSACK if a rtt has not passed since we adjusted reo_wnd */
+ if (before(rs->prior_delivered, tp->rack.last_delivered))
+ tp->rack.dsack_seen = 0;
+
+ /* Adjust the reo_wnd if update is pending */
+ if (tp->rack.dsack_seen) {
+ tp->rack.reo_wnd_steps = min_t(u32, 0xFF,
+ tp->rack.reo_wnd_steps + 1);
+ tp->rack.dsack_seen = 0;
+ tp->rack.last_delivered = tp->delivered;
+ tp->rack.reo_wnd_persist = TCP_RACK_RECOVERY_THRESH;
+ } else if (!tp->rack.reo_wnd_persist) {
+ tp->rack.reo_wnd_steps = 1;
+ }
+}
+
+/* RFC6582 NewReno recovery for non-SACK connection. It simply retransmits
+ * the next unacked packet upon receiving
+ * a) three or more DUPACKs to start the fast recovery
+ * b) an ACK acknowledging new data during the fast recovery.
+ */
+void tcp_newreno_mark_lost(struct sock *sk, bool snd_una_advanced)
+{
+ const u8 state = inet_csk(sk)->icsk_ca_state;
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if ((state < TCP_CA_Recovery && tp->sacked_out >= tp->reordering) ||
+ (state == TCP_CA_Recovery && snd_una_advanced)) {
+ struct sk_buff *skb = tcp_rtx_queue_head(sk);
+ u32 mss;
+
+ if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST)
+ return;
+
+ mss = tcp_skb_mss(skb);
+ if (tcp_skb_pcount(skb) > 1 && skb->len > mss)
+ tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
+ mss, mss, GFP_ATOMIC);
+
+ tcp_mark_skb_lost(sk, skb);
+ }
+}
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
index 4624501e9680..862b96248a92 100644
--- a/net/ipv4/tcp_scalable.c
+++ b/net/ipv4/tcp_scalable.c
@@ -1,6 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Tom Kelly's Scalable TCP
*
- * See htt://www-lce.eng.cam.ac.uk/~ctk21/scalable/
+ * See http://www.deneholme.net/tom/scalable/
*
* John Heffner <jheffner@sc.edu>
*/
@@ -9,43 +10,38 @@
#include <net/tcp.h>
/* These factors derived from the recommended values in the aer:
- * .01 and and 7/8. We use 50 instead of 100 to account for
- * delayed ack.
+ * .01 and 7/8.
*/
-#define TCP_SCALABLE_AI_CNT 50U
+#define TCP_SCALABLE_AI_CNT 100U
#define TCP_SCALABLE_MD_SCALE 3
-static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
- u32 in_flight, int flag)
+static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct tcp_sock *tp = tcp_sk(sk);
- if (!tcp_is_cwnd_limited(sk, in_flight))
+ if (!tcp_is_cwnd_limited(sk))
return;
- if (tp->snd_cwnd <= tp->snd_ssthresh)
- tcp_slow_start(tp);
- else {
- tp->snd_cwnd_cnt++;
- if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){
- if (tp->snd_cwnd < tp->snd_cwnd_clamp)
- tp->snd_cwnd++;
- tp->snd_cwnd_cnt = 0;
- }
+ if (tcp_in_slow_start(tp)) {
+ acked = tcp_slow_start(tp, acked);
+ if (!acked)
+ return;
}
+ tcp_cong_avoid_ai(tp, min(tcp_snd_cwnd(tp), TCP_SCALABLE_AI_CNT),
+ acked);
}
static u32 tcp_scalable_ssthresh(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
- return max(tp->snd_cwnd - (tp->snd_cwnd>>TCP_SCALABLE_MD_SCALE), 2U);
-}
+ return max(tcp_snd_cwnd(tp) - (tcp_snd_cwnd(tp)>>TCP_SCALABLE_MD_SCALE), 2U);
+}
-static struct tcp_congestion_ops tcp_scalable = {
+static struct tcp_congestion_ops tcp_scalable __read_mostly = {
.ssthresh = tcp_scalable_ssthresh,
+ .undo_cwnd = tcp_reno_undo_cwnd,
.cong_avoid = tcp_scalable_cong_avoid,
- .min_cwnd = tcp_reno_min_cwnd,
.owner = THIS_MODULE,
.name = "scalable",
diff --git a/net/ipv4/tcp_sigpool.c b/net/ipv4/tcp_sigpool.c
new file mode 100644
index 000000000000..d8a4f192873a
--- /dev/null
+++ b/net/ipv4/tcp_sigpool.c
@@ -0,0 +1,366 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <crypto/hash.h>
+#include <linux/cpu.h>
+#include <linux/kref.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/percpu.h>
+#include <linux/workqueue.h>
+#include <net/tcp.h>
+
+static size_t __scratch_size;
+struct sigpool_scratch {
+ local_lock_t bh_lock;
+ void __rcu *pad;
+};
+
+static DEFINE_PER_CPU(struct sigpool_scratch, sigpool_scratch) = {
+ .bh_lock = INIT_LOCAL_LOCK(bh_lock),
+};
+
+struct sigpool_entry {
+ struct crypto_ahash *hash;
+ const char *alg;
+ struct kref kref;
+ uint16_t needs_key:1,
+ reserved:15;
+};
+
+#define CPOOL_SIZE (PAGE_SIZE / sizeof(struct sigpool_entry))
+static struct sigpool_entry cpool[CPOOL_SIZE];
+static unsigned int cpool_populated;
+static DEFINE_MUTEX(cpool_mutex);
+
+/* Slow-path */
+struct scratches_to_free {
+ struct rcu_head rcu;
+ unsigned int cnt;
+ void *scratches[];
+};
+
+static void free_old_scratches(struct rcu_head *head)
+{
+ struct scratches_to_free *stf;
+
+ stf = container_of(head, struct scratches_to_free, rcu);
+ while (stf->cnt--)
+ kfree(stf->scratches[stf->cnt]);
+ kfree(stf);
+}
+
+/**
+ * sigpool_reserve_scratch - re-allocates scratch buffer, slow-path
+ * @size: request size for the scratch/temp buffer
+ */
+static int sigpool_reserve_scratch(size_t size)
+{
+ struct scratches_to_free *stf;
+ size_t stf_sz = struct_size(stf, scratches, num_possible_cpus());
+ int cpu, err = 0;
+
+ lockdep_assert_held(&cpool_mutex);
+ if (__scratch_size >= size)
+ return 0;
+
+ stf = kmalloc(stf_sz, GFP_KERNEL);
+ if (!stf)
+ return -ENOMEM;
+ stf->cnt = 0;
+
+ size = max(size, __scratch_size);
+ cpus_read_lock();
+ for_each_possible_cpu(cpu) {
+ void *scratch, *old_scratch;
+
+ scratch = kmalloc_node(size, GFP_KERNEL, cpu_to_node(cpu));
+ if (!scratch) {
+ err = -ENOMEM;
+ break;
+ }
+
+ old_scratch = rcu_replace_pointer(per_cpu(sigpool_scratch.pad, cpu),
+ scratch, lockdep_is_held(&cpool_mutex));
+ if (!cpu_online(cpu) || !old_scratch) {
+ kfree(old_scratch);
+ continue;
+ }
+ stf->scratches[stf->cnt++] = old_scratch;
+ }
+ cpus_read_unlock();
+ if (!err)
+ __scratch_size = size;
+
+ call_rcu(&stf->rcu, free_old_scratches);
+ return err;
+}
+
+static void sigpool_scratch_free(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ kfree(rcu_replace_pointer(per_cpu(sigpool_scratch.pad, cpu),
+ NULL, lockdep_is_held(&cpool_mutex)));
+ __scratch_size = 0;
+}
+
+static int __cpool_try_clone(struct crypto_ahash *hash)
+{
+ struct crypto_ahash *tmp;
+
+ tmp = crypto_clone_ahash(hash);
+ if (IS_ERR(tmp))
+ return PTR_ERR(tmp);
+
+ crypto_free_ahash(tmp);
+ return 0;
+}
+
+static int __cpool_alloc_ahash(struct sigpool_entry *e, const char *alg)
+{
+ struct crypto_ahash *cpu0_hash;
+ int ret;
+
+ e->alg = kstrdup(alg, GFP_KERNEL);
+ if (!e->alg)
+ return -ENOMEM;
+
+ cpu0_hash = crypto_alloc_ahash(alg, 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(cpu0_hash)) {
+ ret = PTR_ERR(cpu0_hash);
+ goto out_free_alg;
+ }
+
+ e->needs_key = crypto_ahash_get_flags(cpu0_hash) & CRYPTO_TFM_NEED_KEY;
+
+ ret = __cpool_try_clone(cpu0_hash);
+ if (ret)
+ goto out_free_cpu0_hash;
+ e->hash = cpu0_hash;
+ kref_init(&e->kref);
+ return 0;
+
+out_free_cpu0_hash:
+ crypto_free_ahash(cpu0_hash);
+out_free_alg:
+ kfree(e->alg);
+ e->alg = NULL;
+ return ret;
+}
+
+/**
+ * tcp_sigpool_alloc_ahash - allocates pool for ahash requests
+ * @alg: name of async hash algorithm
+ * @scratch_size: reserve a tcp_sigpool::scratch buffer of this size
+ */
+int tcp_sigpool_alloc_ahash(const char *alg, size_t scratch_size)
+{
+ int i, ret;
+
+ /* slow-path */
+ mutex_lock(&cpool_mutex);
+ ret = sigpool_reserve_scratch(scratch_size);
+ if (ret)
+ goto out;
+ for (i = 0; i < cpool_populated; i++) {
+ if (!cpool[i].alg)
+ continue;
+ if (strcmp(cpool[i].alg, alg))
+ continue;
+
+ /* pairs with tcp_sigpool_release() */
+ if (!kref_get_unless_zero(&cpool[i].kref))
+ kref_init(&cpool[i].kref);
+ ret = i;
+ goto out;
+ }
+
+ for (i = 0; i < cpool_populated; i++) {
+ if (!cpool[i].alg)
+ break;
+ }
+ if (i >= CPOOL_SIZE) {
+ ret = -ENOSPC;
+ goto out;
+ }
+
+ ret = __cpool_alloc_ahash(&cpool[i], alg);
+ if (!ret) {
+ ret = i;
+ if (i == cpool_populated)
+ cpool_populated++;
+ }
+out:
+ mutex_unlock(&cpool_mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(tcp_sigpool_alloc_ahash);
+
+static void __cpool_free_entry(struct sigpool_entry *e)
+{
+ crypto_free_ahash(e->hash);
+ kfree(e->alg);
+ memset(e, 0, sizeof(*e));
+}
+
+static void cpool_cleanup_work_cb(struct work_struct *work)
+{
+ bool free_scratch = true;
+ unsigned int i;
+
+ mutex_lock(&cpool_mutex);
+ for (i = 0; i < cpool_populated; i++) {
+ if (kref_read(&cpool[i].kref) > 0) {
+ free_scratch = false;
+ continue;
+ }
+ if (!cpool[i].alg)
+ continue;
+ __cpool_free_entry(&cpool[i]);
+ }
+ if (free_scratch)
+ sigpool_scratch_free();
+ mutex_unlock(&cpool_mutex);
+}
+
+static DECLARE_WORK(cpool_cleanup_work, cpool_cleanup_work_cb);
+static void cpool_schedule_cleanup(struct kref *kref)
+{
+ schedule_work(&cpool_cleanup_work);
+}
+
+/**
+ * tcp_sigpool_release - decreases number of users for a pool. If it was
+ * the last user of the pool, releases any memory that was consumed.
+ * @id: tcp_sigpool that was previously allocated by tcp_sigpool_alloc_ahash()
+ */
+void tcp_sigpool_release(unsigned int id)
+{
+ if (WARN_ON_ONCE(id >= cpool_populated || !cpool[id].alg))
+ return;
+
+ /* slow-path */
+ kref_put(&cpool[id].kref, cpool_schedule_cleanup);
+}
+EXPORT_SYMBOL_GPL(tcp_sigpool_release);
+
+/**
+ * tcp_sigpool_get - increases number of users (refcounter) for a pool
+ * @id: tcp_sigpool that was previously allocated by tcp_sigpool_alloc_ahash()
+ */
+void tcp_sigpool_get(unsigned int id)
+{
+ if (WARN_ON_ONCE(id >= cpool_populated || !cpool[id].alg))
+ return;
+ kref_get(&cpool[id].kref);
+}
+EXPORT_SYMBOL_GPL(tcp_sigpool_get);
+
+int tcp_sigpool_start(unsigned int id, struct tcp_sigpool *c) __cond_acquires(RCU_BH)
+{
+ struct crypto_ahash *hash;
+
+ rcu_read_lock_bh();
+ if (WARN_ON_ONCE(id >= cpool_populated || !cpool[id].alg)) {
+ rcu_read_unlock_bh();
+ return -EINVAL;
+ }
+
+ hash = crypto_clone_ahash(cpool[id].hash);
+ if (IS_ERR(hash)) {
+ rcu_read_unlock_bh();
+ return PTR_ERR(hash);
+ }
+
+ c->req = ahash_request_alloc(hash, GFP_ATOMIC);
+ if (!c->req) {
+ crypto_free_ahash(hash);
+ rcu_read_unlock_bh();
+ return -ENOMEM;
+ }
+ ahash_request_set_callback(c->req, 0, NULL, NULL);
+
+ /* Pairs with tcp_sigpool_reserve_scratch(), scratch area is
+ * valid (allocated) until tcp_sigpool_end().
+ */
+ local_lock_nested_bh(&sigpool_scratch.bh_lock);
+ c->scratch = rcu_dereference_bh(*this_cpu_ptr(&sigpool_scratch.pad));
+ return 0;
+}
+EXPORT_SYMBOL_GPL(tcp_sigpool_start);
+
+void tcp_sigpool_end(struct tcp_sigpool *c) __releases(RCU_BH)
+{
+ struct crypto_ahash *hash = crypto_ahash_reqtfm(c->req);
+
+ local_unlock_nested_bh(&sigpool_scratch.bh_lock);
+ rcu_read_unlock_bh();
+ ahash_request_free(c->req);
+ crypto_free_ahash(hash);
+}
+EXPORT_SYMBOL_GPL(tcp_sigpool_end);
+
+/**
+ * tcp_sigpool_algo - return algorithm of tcp_sigpool
+ * @id: tcp_sigpool that was previously allocated by tcp_sigpool_alloc_ahash()
+ * @buf: buffer to return name of algorithm
+ * @buf_len: size of @buf
+ */
+size_t tcp_sigpool_algo(unsigned int id, char *buf, size_t buf_len)
+{
+ if (WARN_ON_ONCE(id >= cpool_populated || !cpool[id].alg))
+ return -EINVAL;
+
+ return strscpy(buf, cpool[id].alg, buf_len);
+}
+EXPORT_SYMBOL_GPL(tcp_sigpool_algo);
+
+/**
+ * tcp_sigpool_hash_skb_data - hash data in skb with initialized tcp_sigpool
+ * @hp: tcp_sigpool pointer
+ * @skb: buffer to add sign for
+ * @header_len: TCP header length for this segment
+ */
+int tcp_sigpool_hash_skb_data(struct tcp_sigpool *hp,
+ const struct sk_buff *skb,
+ unsigned int header_len)
+{
+ const unsigned int head_data_len = skb_headlen(skb) > header_len ?
+ skb_headlen(skb) - header_len : 0;
+ const struct skb_shared_info *shi = skb_shinfo(skb);
+ const struct tcphdr *tp = tcp_hdr(skb);
+ struct ahash_request *req = hp->req;
+ struct sk_buff *frag_iter;
+ struct scatterlist sg;
+ unsigned int i;
+
+ sg_init_table(&sg, 1);
+
+ sg_set_buf(&sg, ((u8 *)tp) + header_len, head_data_len);
+ ahash_request_set_crypt(req, &sg, NULL, head_data_len);
+ if (crypto_ahash_update(req))
+ return 1;
+
+ for (i = 0; i < shi->nr_frags; ++i) {
+ const skb_frag_t *f = &shi->frags[i];
+ unsigned int offset = skb_frag_off(f);
+ struct page *page;
+
+ page = skb_frag_page(f) + (offset >> PAGE_SHIFT);
+ sg_set_page(&sg, page, skb_frag_size(f), offset_in_page(offset));
+ ahash_request_set_crypt(req, &sg, NULL, skb_frag_size(f));
+ if (crypto_ahash_update(req))
+ return 1;
+ }
+
+ skb_walk_frags(skb, frag_iter)
+ if (tcp_sigpool_hash_skb_data(hp, frag_iter, 0))
+ return 1;
+
+ return 0;
+}
+EXPORT_SYMBOL(tcp_sigpool_hash_skb_data);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Per-CPU pool of crypto requests");
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 3355c276b611..160080c9021d 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -5,8 +6,6 @@
*
* Implementation of the Transmission Control Protocol(TCP).
*
- * Version: $Id: tcp_timer.c,v 1.88 2002/02/01 22:01:04 davem Exp $
- *
* Authors: Ross Biro
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
* Mark Evans, <evansmp@uhura.aston.ac.uk>
@@ -21,91 +20,137 @@
*/
#include <linux/module.h>
+#include <linux/gfp.h>
#include <net/tcp.h>
+#include <net/rstreason.h>
+
+static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ const struct tcp_sock *tp = tcp_sk(sk);
+ u32 elapsed, user_timeout;
+ s32 remaining;
-int sysctl_tcp_syn_retries __read_mostly = TCP_SYN_RETRIES;
-int sysctl_tcp_synack_retries __read_mostly = TCP_SYNACK_RETRIES;
-int sysctl_tcp_keepalive_time __read_mostly = TCP_KEEPALIVE_TIME;
-int sysctl_tcp_keepalive_probes __read_mostly = TCP_KEEPALIVE_PROBES;
-int sysctl_tcp_keepalive_intvl __read_mostly = TCP_KEEPALIVE_INTVL;
-int sysctl_tcp_retries1 __read_mostly = TCP_RETR1;
-int sysctl_tcp_retries2 __read_mostly = TCP_RETR2;
-int sysctl_tcp_orphan_retries __read_mostly;
+ user_timeout = READ_ONCE(icsk->icsk_user_timeout);
+ if (!user_timeout)
+ return icsk->icsk_rto;
-static void tcp_write_timer(unsigned long);
-static void tcp_delack_timer(unsigned long);
-static void tcp_keepalive_timer (unsigned long data);
+ elapsed = tcp_time_stamp_ts(tp) - tp->retrans_stamp;
+ if (tp->tcp_usec_ts)
+ elapsed /= USEC_PER_MSEC;
-void tcp_init_xmit_timers(struct sock *sk)
+ remaining = user_timeout - elapsed;
+ if (remaining <= 0)
+ return 1; /* user timeout has passed; fire ASAP */
+
+ return min_t(u32, icsk->icsk_rto, msecs_to_jiffies(remaining));
+}
+
+u32 tcp_clamp_probe0_to_user_timeout(const struct sock *sk, u32 when)
{
- inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
- &tcp_keepalive_timer);
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ u32 remaining, user_timeout;
+ s32 elapsed;
+
+ user_timeout = READ_ONCE(icsk->icsk_user_timeout);
+ if (!user_timeout || !icsk->icsk_probes_tstamp)
+ return when;
+
+ elapsed = tcp_jiffies32 - icsk->icsk_probes_tstamp;
+ if (unlikely(elapsed < 0))
+ elapsed = 0;
+ remaining = msecs_to_jiffies(user_timeout) - elapsed;
+ remaining = max_t(u32, remaining, TCP_TIMEOUT_MIN);
+
+ return min_t(u32, remaining, when);
}
-EXPORT_SYMBOL(tcp_init_xmit_timers);
+/**
+ * tcp_write_err() - close socket and save error info
+ * @sk: The socket the error has appeared on.
+ *
+ * Returns: Nothing (void)
+ */
static void tcp_write_err(struct sock *sk)
{
- sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT;
- sk->sk_error_report(sk);
-
- tcp_done(sk);
- NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT);
+ tcp_done_with_error(sk, READ_ONCE(sk->sk_err_soft) ? : ETIMEDOUT);
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONTIMEOUT);
}
-/* Do not allow orphaned sockets to eat all our resources.
- * This is direct violation of TCP specs, but it is required
- * to prevent DoS attacks. It is called when a retransmission timeout
- * or zero probe timeout occurs on orphaned socket.
+/**
+ * tcp_out_of_resources() - Close socket if out of resources
+ * @sk: pointer to current socket
+ * @do_reset: send a last packet with reset flag
*
- * Criteria is still not confirmed experimentally and may change.
- * We kill the socket, if:
- * 1. If number of orphaned sockets exceeds an administratively configured
- * limit.
- * 2. If we have strong memory pressure.
+ * Do not allow orphaned sockets to eat all our resources.
+ * This is direct violation of TCP specs, but it is required
+ * to prevent DoS attacks. It is called when a retransmission timeout
+ * or zero probe timeout occurs on orphaned socket.
+ *
+ * Also close if our net namespace is exiting; in that case there is no
+ * hope of ever communicating again since all netns interfaces are already
+ * down (or about to be down), and we need to release our dst references,
+ * which have been moved to the netns loopback interface, so the namespace
+ * can finish exiting. This condition is only possible if we are a kernel
+ * socket, as those do not hold references to the namespace.
+ *
+ * Criteria is still not confirmed experimentally and may change.
+ * We kill the socket, if:
+ * 1. If number of orphaned sockets exceeds an administratively configured
+ * limit.
+ * 2. If we have strong memory pressure.
+ * 3. If our net namespace is exiting.
*/
-static int tcp_out_of_resources(struct sock *sk, int do_reset)
+static int tcp_out_of_resources(struct sock *sk, bool do_reset)
{
struct tcp_sock *tp = tcp_sk(sk);
- int orphans = atomic_read(&tcp_orphan_count);
+ int shift = 0;
- /* If peer does not open window for long time, or did not transmit
+ /* If peer does not open window for long time, or did not transmit
* anything for long time, penalize it. */
- if ((s32)(tcp_time_stamp - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset)
- orphans <<= 1;
+ if ((s32)(tcp_jiffies32 - tp->lsndtime) > 2*tcp_rto_max(sk) || !do_reset)
+ shift++;
/* If some dubious ICMP arrived, penalize even more. */
- if (sk->sk_err_soft)
- orphans <<= 1;
-
- if (orphans >= sysctl_tcp_max_orphans ||
- (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
- atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
- if (net_ratelimit())
- printk(KERN_INFO "Out of socket memory\n");
+ if (READ_ONCE(sk->sk_err_soft))
+ shift++;
+ if (tcp_check_oom(sk, shift)) {
/* Catch exceptional cases, when connection requires reset.
* 1. Last segment was sent recently. */
- if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN ||
+ if ((s32)(tcp_jiffies32 - tp->lsndtime) <= TCP_TIMEWAIT_LEN ||
/* 2. Window is closed. */
(!tp->snd_wnd && !tp->packets_out))
- do_reset = 1;
+ do_reset = true;
if (do_reset)
- tcp_send_active_reset(sk, GFP_ATOMIC);
+ tcp_send_active_reset(sk, GFP_ATOMIC,
+ SK_RST_REASON_TCP_ABORT_ON_MEMORY);
tcp_done(sk);
- NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY);
return 1;
}
+
+ if (!check_net(sock_net(sk))) {
+ /* Not possible to send reset; just close */
+ tcp_done(sk);
+ return 1;
+ }
+
return 0;
}
-/* Calculate maximal number or retries on an orphaned socket. */
-static int tcp_orphan_retries(struct sock *sk, int alive)
+/**
+ * tcp_orphan_retries() - Returns maximal number of retries on an orphaned socket
+ * @sk: Pointer to the current socket.
+ * @alive: bool, socket alive state
+ */
+static int tcp_orphan_retries(struct sock *sk, bool alive)
{
- int retries = sysctl_tcp_orphan_retries; /* May be zero. */
+ int retries = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_orphan_retries); /* May be zero. */
/* We know from an ICMP that something is wrong. */
- if (sk->sk_err_soft && !alive)
+ if (READ_ONCE(sk->sk_err_soft) && !alive)
retries = 0;
/* However, if socket sent something recently, select some safe
@@ -116,175 +161,399 @@ static int tcp_orphan_retries(struct sock *sk, int alive)
return retries;
}
+static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
+{
+ const struct net *net = sock_net(sk);
+ int mss;
+
+ /* Black hole detection */
+ if (!READ_ONCE(net->ipv4.sysctl_tcp_mtu_probing))
+ return;
+
+ if (!icsk->icsk_mtup.enabled) {
+ icsk->icsk_mtup.enabled = 1;
+ icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
+ } else {
+ mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low) >> 1;
+ mss = min(READ_ONCE(net->ipv4.sysctl_tcp_base_mss), mss);
+ mss = max(mss, READ_ONCE(net->ipv4.sysctl_tcp_mtu_probe_floor));
+ mss = max(mss, READ_ONCE(net->ipv4.sysctl_tcp_min_snd_mss));
+ icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
+ }
+ tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
+}
+
+static unsigned int tcp_model_timeout(struct sock *sk,
+ unsigned int boundary,
+ unsigned int rto_base)
+{
+ unsigned int linear_backoff_thresh, timeout;
+
+ linear_backoff_thresh = ilog2(tcp_rto_max(sk) / rto_base);
+ if (boundary <= linear_backoff_thresh)
+ timeout = ((2 << boundary) - 1) * rto_base;
+ else
+ timeout = ((2 << linear_backoff_thresh) - 1) * rto_base +
+ (boundary - linear_backoff_thresh) * tcp_rto_max(sk);
+ return jiffies_to_msecs(timeout);
+}
+/**
+ * retransmits_timed_out() - returns true if this connection has timed out
+ * @sk: The current socket
+ * @boundary: max number of retransmissions
+ * @timeout: A custom timeout value.
+ * If set to 0 the default timeout is calculated and used.
+ * Using TCP_RTO_MIN and the number of unsuccessful retransmits.
+ *
+ * The default "timeout" value this function can calculate and use
+ * is equivalent to the timeout of a TCP Connection
+ * after "boundary" unsuccessful, exponentially backed-off
+ * retransmissions with an initial RTO of TCP_RTO_MIN.
+ */
+static bool retransmits_timed_out(struct sock *sk,
+ unsigned int boundary,
+ unsigned int timeout)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ unsigned int start_ts, delta;
+
+ if (!inet_csk(sk)->icsk_retransmits)
+ return false;
+
+ start_ts = tp->retrans_stamp;
+ if (likely(timeout == 0)) {
+ unsigned int rto_base = TCP_RTO_MIN;
+
+ if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
+ rto_base = tcp_timeout_init(sk);
+ timeout = tcp_model_timeout(sk, boundary, rto_base);
+ }
+
+ if (tp->tcp_usec_ts) {
+ /* delta maybe off up to a jiffy due to timer granularity. */
+ delta = tp->tcp_mstamp - start_ts + jiffies_to_usecs(1);
+ return (s32)(delta - timeout * USEC_PER_MSEC) >= 0;
+ }
+ return (s32)(tcp_time_stamp_ts(tp) - start_ts - timeout) >= 0;
+}
+
/* A write timeout has occurred. Process the after effects. */
static int tcp_write_timeout(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
- int retry_until;
- int mss;
+ struct net *net = sock_net(sk);
+ bool expired = false, do_reset;
+ int retry_until, max_retransmits;
if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
if (icsk->icsk_retransmits)
- dst_negative_advice(&sk->sk_dst_cache);
- retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
+ __dst_negative_advice(sk);
+ /* Paired with WRITE_ONCE() in tcp_sock_set_syncnt() */
+ retry_until = READ_ONCE(icsk->icsk_syn_retries) ? :
+ READ_ONCE(net->ipv4.sysctl_tcp_syn_retries);
+
+ max_retransmits = retry_until;
+ if (sk->sk_state == TCP_SYN_SENT)
+ max_retransmits += READ_ONCE(net->ipv4.sysctl_tcp_syn_linear_timeouts);
+
+ expired = icsk->icsk_retransmits >= max_retransmits;
} else {
- if (icsk->icsk_retransmits >= sysctl_tcp_retries1) {
+ if (retransmits_timed_out(sk, READ_ONCE(net->ipv4.sysctl_tcp_retries1), 0)) {
/* Black hole detection */
- if (sysctl_tcp_mtu_probing) {
- if (!icsk->icsk_mtup.enabled) {
- icsk->icsk_mtup.enabled = 1;
- tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
- } else {
- mss = min(sysctl_tcp_base_mss,
- tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low)/2);
- mss = max(mss, 68 - tp->tcp_header_len);
- icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
- tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
- }
- }
+ tcp_mtu_probing(icsk, sk);
- dst_negative_advice(&sk->sk_dst_cache);
+ __dst_negative_advice(sk);
}
- retry_until = sysctl_tcp_retries2;
+ retry_until = READ_ONCE(net->ipv4.sysctl_tcp_retries2);
if (sock_flag(sk, SOCK_DEAD)) {
- const int alive = (icsk->icsk_rto < TCP_RTO_MAX);
-
+ const bool alive = icsk->icsk_rto < tcp_rto_max(sk);
+
retry_until = tcp_orphan_retries(sk, alive);
+ do_reset = alive ||
+ !retransmits_timed_out(sk, retry_until, 0);
- if (tcp_out_of_resources(sk, alive || icsk->icsk_retransmits < retry_until))
+ if (tcp_out_of_resources(sk, do_reset))
return 1;
}
}
-
- if (icsk->icsk_retransmits >= retry_until) {
+ if (!expired)
+ expired = retransmits_timed_out(sk, retry_until,
+ READ_ONCE(icsk->icsk_user_timeout));
+ tcp_fastopen_active_detect_blackhole(sk, expired);
+ mptcp_active_detect_blackhole(sk, expired);
+
+ if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RTO_CB_FLAG))
+ tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RTO_CB,
+ icsk->icsk_retransmits,
+ icsk->icsk_rto, (int)expired);
+
+ if (expired) {
/* Has it gone just too far? */
tcp_write_err(sk);
return 1;
}
+
+ if (sk_rethink_txhash(sk)) {
+ tp->timeout_rehash++;
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPTIMEOUTREHASH);
+ }
+
return 0;
}
-static void tcp_delack_timer(unsigned long data)
+/* Called with BH disabled */
+void tcp_delack_timer_handler(struct sock *sk)
{
- struct sock *sk = (struct sock*)data;
- struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
- bh_lock_sock(sk);
- if (sock_owned_by_user(sk)) {
- /* Try again later. */
- icsk->icsk_ack.blocked = 1;
- NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOCKED);
- sk_reset_timer(sk, &icsk->icsk_delack_timer, jiffies + TCP_DELACK_MIN);
- goto out_unlock;
- }
-
- sk_stream_mem_reclaim(sk);
-
- if (sk->sk_state == TCP_CLOSE || !(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
- goto out;
+ if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))
+ return;
- if (time_after(icsk->icsk_ack.timeout, jiffies)) {
- sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout);
- goto out;
+ /* Handling the sack compression case */
+ if (tp->compressed_ack) {
+ tcp_mstamp_refresh(tp);
+ tcp_sack_compress_send_ack(sk);
+ return;
}
- icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER;
-
- if (!skb_queue_empty(&tp->ucopy.prequeue)) {
- struct sk_buff *skb;
- NET_INC_STATS_BH(LINUX_MIB_TCPSCHEDULERFAILED);
-
- while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
- sk->sk_backlog_rcv(sk, skb);
+ if (!(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
+ return;
- tp->ucopy.memory = 0;
+ if (time_after(icsk_delack_timeout(icsk), jiffies)) {
+ sk_reset_timer(sk, &icsk->icsk_delack_timer,
+ icsk_delack_timeout(icsk));
+ return;
}
+ icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER;
if (inet_csk_ack_scheduled(sk)) {
- if (!icsk->icsk_ack.pingpong) {
+ if (!inet_csk_in_pingpong_mode(sk)) {
/* Delayed ACK missed: inflate ATO. */
- icsk->icsk_ack.ato = min(icsk->icsk_ack.ato << 1, icsk->icsk_rto);
+ icsk->icsk_ack.ato = min_t(u32, icsk->icsk_ack.ato << 1, icsk->icsk_rto);
} else {
/* Delayed ACK missed: leave pingpong mode and
* deflate ATO.
*/
- icsk->icsk_ack.pingpong = 0;
+ inet_csk_exit_pingpong_mode(sk);
icsk->icsk_ack.ato = TCP_ATO_MIN;
}
+ tcp_mstamp_refresh(tp);
tcp_send_ack(sk);
- NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKS);
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKS);
}
- TCP_CHECK_TIMER(sk);
+}
-out:
- if (tcp_memory_pressure)
- sk_stream_mem_reclaim(sk);
-out_unlock:
+
+/**
+ * tcp_delack_timer() - The TCP delayed ACK timeout handler
+ * @t: Pointer to the timer. (gets casted to struct sock *)
+ *
+ * This function gets (indirectly) called when the kernel timer for a TCP packet
+ * of this socket expires. Calls tcp_delack_timer_handler() to do the actual work.
+ *
+ * Returns: Nothing (void)
+ */
+static void tcp_delack_timer(struct timer_list *t)
+{
+ struct inet_connection_sock *icsk =
+ timer_container_of(icsk, t, icsk_delack_timer);
+ struct sock *sk = &icsk->icsk_inet.sk;
+
+ /* Avoid taking socket spinlock if there is no ACK to send.
+ * The compressed_ack check is racy, but a separate hrtimer
+ * will take care of it eventually.
+ */
+ if (!(smp_load_acquire(&icsk->icsk_ack.pending) & ICSK_ACK_TIMER) &&
+ !READ_ONCE(tcp_sk(sk)->compressed_ack))
+ goto out;
+
+ bh_lock_sock(sk);
+ if (!sock_owned_by_user(sk)) {
+ tcp_delack_timer_handler(sk);
+ } else {
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
+ /* deleguate our work to tcp_release_cb() */
+ if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &sk->sk_tsq_flags))
+ sock_hold(sk);
+ }
bh_unlock_sock(sk);
+out:
sock_put(sk);
}
static void tcp_probe_timer(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
+ struct sk_buff *skb = tcp_send_head(sk);
struct tcp_sock *tp = tcp_sk(sk);
int max_probes;
- if (tp->packets_out || !sk->sk_send_head) {
- icsk->icsk_probes_out = 0;
+ if (tp->packets_out || !skb) {
+ WRITE_ONCE(icsk->icsk_probes_out, 0);
+ icsk->icsk_probes_tstamp = 0;
return;
}
- /* *WARNING* RFC 1122 forbids this
- *
- * It doesn't AFAIK, because we kill the retransmit timer -AK
- *
- * FIXME: We ought not to do it, Solaris 2.5 actually has fixing
- * this behaviour in Solaris down as a bug fix. [AC]
- *
- * Let me to explain. icsk_probes_out is zeroed by incoming ACKs
- * even if they advertise zero window. Hence, connection is killed only
- * if we received no ACKs for normal connection timeout. It is not killed
- * only because window stays zero for some time, window may be zero
- * until armageddon and even later. We are in full accordance
- * with RFCs, only probe timer combines both retransmission timeout
- * and probe timeout in one bottle. --ANK
+ /* RFC 1122 4.2.2.17 requires the sender to stay open indefinitely as
+ * long as the receiver continues to respond probes. We support this by
+ * default and reset icsk_probes_out with incoming ACKs. But if the
+ * socket is orphaned or the user specifies TCP_USER_TIMEOUT, we
+ * kill the socket when the retry count and the time exceeds the
+ * corresponding system limit. We also implement similar policy when
+ * we use RTO to probe window in tcp_retransmit_timer().
*/
- max_probes = sysctl_tcp_retries2;
+ if (!icsk->icsk_probes_tstamp) {
+ icsk->icsk_probes_tstamp = tcp_jiffies32;
+ } else {
+ u32 user_timeout = READ_ONCE(icsk->icsk_user_timeout);
+ if (user_timeout &&
+ (s32)(tcp_jiffies32 - icsk->icsk_probes_tstamp) >=
+ msecs_to_jiffies(user_timeout))
+ goto abort;
+ }
+ max_probes = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_retries2);
if (sock_flag(sk, SOCK_DEAD)) {
- const int alive = ((icsk->icsk_rto << icsk->icsk_backoff) < TCP_RTO_MAX);
-
- max_probes = tcp_orphan_retries(sk, alive);
+ unsigned int rto_max = tcp_rto_max(sk);
+ const bool alive = inet_csk_rto_backoff(icsk, rto_max) < rto_max;
- if (tcp_out_of_resources(sk, alive || icsk->icsk_probes_out <= max_probes))
+ max_probes = tcp_orphan_retries(sk, alive);
+ if (!alive && icsk->icsk_backoff >= max_probes)
+ goto abort;
+ if (tcp_out_of_resources(sk, true))
return;
}
- if (icsk->icsk_probes_out > max_probes) {
- tcp_write_err(sk);
+ if (icsk->icsk_probes_out >= max_probes) {
+abort: tcp_write_err(sk);
} else {
/* Only send another probe if we didn't close things up. */
tcp_send_probe0(sk);
}
}
+static void tcp_update_rto_stats(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (!icsk->icsk_retransmits) {
+ tp->total_rto_recoveries++;
+ tp->rto_stamp = tcp_time_stamp_ms(tp);
+ }
+ WRITE_ONCE(icsk->icsk_retransmits, icsk->icsk_retransmits + 1);
+ tp->total_rto++;
+}
+
/*
- * The TCP retransmit timer.
+ * Timer for Fast Open socket to retransmit SYNACK. Note that the
+ * sk here is the child socket, not the parent (listener) socket.
*/
+static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ int max_retries;
+
+ tcp_syn_ack_timeout(req);
+
+ /* Add one more retry for fastopen.
+ * Paired with WRITE_ONCE() in tcp_sock_set_syncnt()
+ */
+ max_retries = READ_ONCE(icsk->icsk_syn_retries) ? :
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_synack_retries) + 1;
+
+ if (req->num_timeout >= max_retries) {
+ tcp_write_err(sk);
+ return;
+ }
+ /* Lower cwnd after certain SYNACK timeout like tcp_init_transfer() */
+ if (icsk->icsk_retransmits == 1)
+ tcp_enter_loss(sk);
+ /* XXX (TFO) - Unlike regular SYN-ACK retransmit, we ignore error
+ * returned from rtx_syn_ack() to make it more persistent like
+ * regular retransmit because if the child socket has been accepted
+ * it's not good to give up too easily.
+ */
+ tcp_rtx_synack(sk, req);
+ req->num_timeout++;
+ tcp_update_rto_stats(sk);
+ if (!tp->retrans_stamp)
+ tp->retrans_stamp = tcp_time_stamp_ts(tp);
+ tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+ req->timeout << req->num_timeout, false);
+}
+
+static bool tcp_rtx_probe0_timed_out(const struct sock *sk,
+ const struct sk_buff *skb,
+ u32 rtx_delta)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ u32 user_timeout = READ_ONCE(icsk->icsk_user_timeout);
+ const struct tcp_sock *tp = tcp_sk(sk);
+ int timeout = tcp_rto_max(sk) * 2;
+ s32 rcv_delta;
+
+ if (user_timeout) {
+ /* If user application specified a TCP_USER_TIMEOUT,
+ * it does not want win 0 packets to 'reset the timer'
+ * while retransmits are not making progress.
+ */
+ if (rtx_delta > user_timeout)
+ return true;
+ timeout = min_t(u32, timeout, msecs_to_jiffies(user_timeout));
+ }
+ /* Note: timer interrupt might have been delayed by at least one jiffy,
+ * and tp->rcv_tstamp might very well have been written recently.
+ * rcv_delta can thus be negative.
+ */
+ rcv_delta = tcp_timeout_expires(sk) - tp->rcv_tstamp;
+ if (rcv_delta <= timeout)
+ return false;
+
+ return msecs_to_jiffies(rtx_delta) > timeout;
+}
-static void tcp_retransmit_timer(struct sock *sk)
+/**
+ * tcp_retransmit_timer() - The TCP retransmit timeout handler
+ * @sk: Pointer to the current socket.
+ *
+ * This function gets called when the kernel timer for a TCP packet
+ * of this socket expires.
+ *
+ * It handles retransmission, timer adjustment and other necessary measures.
+ *
+ * Returns: Nothing (void)
+ */
+void tcp_retransmit_timer(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
+ struct net *net = sock_net(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
+ struct request_sock *req;
+ struct sk_buff *skb;
+
+ req = rcu_dereference_protected(tp->fastopen_rsk,
+ lockdep_sock_is_held(sk));
+ if (req) {
+ WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV &&
+ sk->sk_state != TCP_FIN_WAIT1);
+ tcp_fastopen_synack_timer(sk, req);
+ /* Before we receive ACK to our SYN-ACK don't retransmit
+ * anything else (e.g., data or FIN segments).
+ */
+ return;
+ }
if (!tp->packets_out)
- goto out;
+ return;
- BUG_TRAP(!skb_queue_empty(&sk->sk_write_queue));
+ skb = tcp_rtx_queue_head(sk);
+ if (WARN_ON_ONCE(!skb))
+ return;
if (!tp->snd_wnd && !sock_flag(sk, SOCK_DEAD) &&
!((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) {
@@ -293,63 +562,75 @@ static void tcp_retransmit_timer(struct sock *sk)
* connection. If the socket is an orphan, time it out,
* we cannot allow such beasts to hang infinitely.
*/
-#ifdef TCP_DEBUG
- if (net_ratelimit()) {
- struct inet_sock *inet = inet_sk(sk);
- printk(KERN_DEBUG "TCP: Treason uncloaked! Peer %u.%u.%u.%u:%u/%u shrinks window %u:%u. Repaired.\n",
- NIPQUAD(inet->daddr), ntohs(inet->dport),
- inet->num, tp->snd_una, tp->snd_nxt);
+ struct inet_sock *inet = inet_sk(sk);
+ u32 rtx_delta;
+
+ rtx_delta = tcp_time_stamp_ts(tp) - (tp->retrans_stamp ?:
+ tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb));
+ if (tp->tcp_usec_ts)
+ rtx_delta /= USEC_PER_MSEC;
+
+ if (sk->sk_family == AF_INET) {
+ net_dbg_ratelimited("Probing zero-window on %pI4:%u/%u, seq=%u:%u, recv %ums ago, lasting %ums\n",
+ &inet->inet_daddr, ntohs(inet->inet_dport),
+ inet->inet_num, tp->snd_una, tp->snd_nxt,
+ jiffies_to_msecs(jiffies - tp->rcv_tstamp),
+ rtx_delta);
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (sk->sk_family == AF_INET6) {
+ net_dbg_ratelimited("Probing zero-window on %pI6:%u/%u, seq=%u:%u, recv %ums ago, lasting %ums\n",
+ &sk->sk_v6_daddr, ntohs(inet->inet_dport),
+ inet->inet_num, tp->snd_una, tp->snd_nxt,
+ jiffies_to_msecs(jiffies - tp->rcv_tstamp),
+ rtx_delta);
}
#endif
- if (tcp_time_stamp - tp->rcv_tstamp > TCP_RTO_MAX) {
+ if (tcp_rtx_probe0_timed_out(sk, skb, rtx_delta)) {
tcp_write_err(sk);
goto out;
}
- tcp_enter_loss(sk, 0);
- tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue));
+ tcp_enter_loss(sk);
+ tcp_retransmit_skb(sk, skb, 1);
__sk_dst_reset(sk);
goto out_reset_timer;
}
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPTIMEOUTS);
if (tcp_write_timeout(sk))
goto out;
if (icsk->icsk_retransmits == 0) {
- if (icsk->icsk_ca_state == TCP_CA_Disorder ||
- icsk->icsk_ca_state == TCP_CA_Recovery) {
- if (tp->rx_opt.sack_ok) {
- if (icsk->icsk_ca_state == TCP_CA_Recovery)
- NET_INC_STATS_BH(LINUX_MIB_TCPSACKRECOVERYFAIL);
- else
- NET_INC_STATS_BH(LINUX_MIB_TCPSACKFAILURES);
- } else {
- if (icsk->icsk_ca_state == TCP_CA_Recovery)
- NET_INC_STATS_BH(LINUX_MIB_TCPRENORECOVERYFAIL);
- else
- NET_INC_STATS_BH(LINUX_MIB_TCPRENOFAILURES);
- }
+ int mib_idx = 0;
+
+ if (icsk->icsk_ca_state == TCP_CA_Recovery) {
+ if (tcp_is_sack(tp))
+ mib_idx = LINUX_MIB_TCPSACKRECOVERYFAIL;
+ else
+ mib_idx = LINUX_MIB_TCPRENORECOVERYFAIL;
} else if (icsk->icsk_ca_state == TCP_CA_Loss) {
- NET_INC_STATS_BH(LINUX_MIB_TCPLOSSFAILURES);
- } else {
- NET_INC_STATS_BH(LINUX_MIB_TCPTIMEOUTS);
+ mib_idx = LINUX_MIB_TCPLOSSFAILURES;
+ } else if ((icsk->icsk_ca_state == TCP_CA_Disorder) ||
+ tp->sacked_out) {
+ if (tcp_is_sack(tp))
+ mib_idx = LINUX_MIB_TCPSACKFAILURES;
+ else
+ mib_idx = LINUX_MIB_TCPRENOFAILURES;
}
+ if (mib_idx)
+ __NET_INC_STATS(sock_net(sk), mib_idx);
}
- if (tcp_use_frto(sk)) {
- tcp_enter_frto(sk);
- } else {
- tcp_enter_loss(sk, 0);
- }
+ tcp_enter_loss(sk);
- if (tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue)) > 0) {
+ tcp_update_rto_stats(sk);
+ if (tcp_retransmit_skb(sk, tcp_rtx_queue_head(sk), 1) > 0) {
/* Retransmission failed because of local congestion,
- * do not backoff.
+ * Let senders fight for local resources conservatively.
*/
- if (!icsk->icsk_retransmits)
- icsk->icsk_retransmits = 1;
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
- min(icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL),
- TCP_RTO_MAX);
+ tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+ TCP_RESOURCE_PROBE_INTERVAL,
+ false);
goto out;
}
@@ -368,67 +649,116 @@ static void tcp_retransmit_timer(struct sock *sk)
* implemented ftp to mars will work nicely. We will have to fix
* the 120 second clamps though!
*/
- icsk->icsk_backoff++;
- icsk->icsk_retransmits++;
out_reset_timer:
- icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX);
- if (icsk->icsk_retransmits > sysctl_tcp_retries1)
+ /* If stream is thin, use linear timeouts. Since 'icsk_backoff' is
+ * used to reset timer, set to 0. Recalculate 'icsk_rto' as this
+ * might be increased if the stream oscillates between thin and thick,
+ * thus the old value might already be too high compared to the value
+ * set by 'tcp_set_rto' in tcp_input.c which resets the rto without
+ * backoff. Limit to TCP_THIN_LINEAR_RETRIES before initiating
+ * exponential backoff behaviour to avoid continue hammering
+ * linear-timeout retransmissions into a black hole
+ */
+ if (sk->sk_state == TCP_ESTABLISHED &&
+ (tp->thin_lto || READ_ONCE(net->ipv4.sysctl_tcp_thin_linear_timeouts)) &&
+ tcp_stream_is_thin(tp) &&
+ icsk->icsk_retransmits <= TCP_THIN_LINEAR_RETRIES) {
+ icsk->icsk_backoff = 0;
+ icsk->icsk_rto = clamp(__tcp_set_rto(tp),
+ tcp_rto_min(sk),
+ tcp_rto_max(sk));
+ } else if (sk->sk_state != TCP_SYN_SENT ||
+ tp->total_rto >
+ READ_ONCE(net->ipv4.sysctl_tcp_syn_linear_timeouts)) {
+ /* Use normal (exponential) backoff unless linear timeouts are
+ * activated.
+ */
+ icsk->icsk_backoff++;
+ icsk->icsk_rto = min(icsk->icsk_rto << 1, tcp_rto_max(sk));
+ }
+ tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+ tcp_clamp_rto_to_user_timeout(sk), false);
+ if (retransmits_timed_out(sk, READ_ONCE(net->ipv4.sysctl_tcp_retries1) + 1, 0))
__sk_dst_reset(sk);
out:;
}
-static void tcp_write_timer(unsigned long data)
+/* Called with bottom-half processing disabled.
+ * Called by tcp_write_timer() and tcp_release_cb().
+ */
+void tcp_write_timer_handler(struct sock *sk)
{
- struct sock *sk = (struct sock*)data;
struct inet_connection_sock *icsk = inet_csk(sk);
int event;
- bh_lock_sock(sk);
- if (sock_owned_by_user(sk)) {
- /* Try again later */
- sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + (HZ / 20));
- goto out_unlock;
- }
-
- if (sk->sk_state == TCP_CLOSE || !icsk->icsk_pending)
- goto out;
+ if (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) ||
+ !icsk->icsk_pending)
+ return;
- if (time_after(icsk->icsk_timeout, jiffies)) {
- sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout);
- goto out;
+ if (time_after(tcp_timeout_expires(sk), jiffies)) {
+ sk_reset_timer(sk, &sk->tcp_retransmit_timer,
+ tcp_timeout_expires(sk));
+ return;
}
-
+ tcp_mstamp_refresh(tcp_sk(sk));
event = icsk->icsk_pending;
- icsk->icsk_pending = 0;
switch (event) {
+ case ICSK_TIME_REO_TIMEOUT:
+ tcp_rack_reo_timeout(sk);
+ break;
+ case ICSK_TIME_LOSS_PROBE:
+ tcp_send_loss_probe(sk);
+ break;
case ICSK_TIME_RETRANS:
+ smp_store_release(&icsk->icsk_pending, 0);
tcp_retransmit_timer(sk);
break;
case ICSK_TIME_PROBE0:
+ smp_store_release(&icsk->icsk_pending, 0);
tcp_probe_timer(sk);
break;
}
- TCP_CHECK_TIMER(sk);
+}
-out:
- sk_stream_mem_reclaim(sk);
-out_unlock:
+static void tcp_write_timer(struct timer_list *t)
+{
+ struct sock *sk = timer_container_of(sk, t, tcp_retransmit_timer);
+
+ /* Avoid locking the socket when there is no pending event. */
+ if (!smp_load_acquire(&inet_csk(sk)->icsk_pending))
+ goto out;
+
+ bh_lock_sock(sk);
+ if (!sock_owned_by_user(sk)) {
+ tcp_write_timer_handler(sk);
+ } else {
+ /* delegate our work to tcp_release_cb() */
+ if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &sk->sk_tsq_flags))
+ sock_hold(sk);
+ }
bh_unlock_sock(sk);
+out:
sock_put(sk);
}
-/*
- * Timer for listening sockets
- */
+void tcp_syn_ack_timeout(const struct request_sock *req)
+{
+ struct net *net = read_pnet(&inet_rsk(req)->ireq_net);
-static void tcp_synack_timer(struct sock *sk)
+ __NET_INC_STATS(net, LINUX_MIB_TCPTIMEOUTS);
+}
+
+void tcp_reset_keepalive_timer(struct sock *sk, unsigned long len)
+{
+ sk_reset_timer(sk, &inet_csk(sk)->icsk_keepalive_timer, jiffies + len);
+}
+
+static void tcp_delete_keepalive_timer(struct sock *sk)
{
- inet_csk_reqsk_queue_prune(sk, TCP_SYNQ_INTERVAL,
- TCP_TIMEOUT_INIT, TCP_RTO_MAX);
+ sk_stop_timer(sk, &inet_csk(sk)->icsk_keepalive_timer);
}
void tcp_set_keepalive(struct sock *sk, int val)
@@ -437,34 +767,36 @@ void tcp_set_keepalive(struct sock *sk, int val)
return;
if (val && !sock_flag(sk, SOCK_KEEPOPEN))
- inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk)));
+ tcp_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk)));
else if (!val)
- inet_csk_delete_keepalive_timer(sk);
+ tcp_delete_keepalive_timer(sk);
}
+EXPORT_IPV6_MOD_GPL(tcp_set_keepalive);
-
-static void tcp_keepalive_timer (unsigned long data)
+static void tcp_keepalive_timer(struct timer_list *t)
{
- struct sock *sk = (struct sock *) data;
- struct inet_connection_sock *icsk = inet_csk(sk);
+ struct inet_connection_sock *icsk =
+ timer_container_of(icsk, t, icsk_keepalive_timer);
+ struct sock *sk = &icsk->icsk_inet.sk;
struct tcp_sock *tp = tcp_sk(sk);
- __u32 elapsed;
+ u32 elapsed;
/* Only process if socket is not in use. */
bh_lock_sock(sk);
if (sock_owned_by_user(sk)) {
- /* Try again later. */
- inet_csk_reset_keepalive_timer (sk, HZ/20);
+ /* Try again later. */
+ tcp_reset_keepalive_timer(sk, HZ/20);
goto out;
}
if (sk->sk_state == TCP_LISTEN) {
- tcp_synack_timer(sk);
+ pr_err("Hmm... keepalive on a LISTEN ???\n");
goto out;
}
+ tcp_mstamp_refresh(tp);
if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
- if (tp->linger2 >= 0) {
+ if (READ_ONCE(tp->linger2) >= 0) {
const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN;
if (tmo > 0) {
@@ -472,30 +804,40 @@ static void tcp_keepalive_timer (unsigned long data)
goto out;
}
}
- tcp_send_active_reset(sk, GFP_ATOMIC);
+ tcp_send_active_reset(sk, GFP_ATOMIC, SK_RST_REASON_TCP_STATE);
goto death;
}
- if (!sock_flag(sk, SOCK_KEEPOPEN) || sk->sk_state == TCP_CLOSE)
+ if (!sock_flag(sk, SOCK_KEEPOPEN) ||
+ ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)))
goto out;
elapsed = keepalive_time_when(tp);
/* It is alive without keepalive 8) */
- if (tp->packets_out || sk->sk_send_head)
+ if (tp->packets_out || !tcp_write_queue_empty(sk))
goto resched;
- elapsed = tcp_time_stamp - tp->rcv_tstamp;
+ elapsed = keepalive_time_elapsed(tp);
if (elapsed >= keepalive_time_when(tp)) {
- if ((!tp->keepalive_probes && icsk->icsk_probes_out >= sysctl_tcp_keepalive_probes) ||
- (tp->keepalive_probes && icsk->icsk_probes_out >= tp->keepalive_probes)) {
- tcp_send_active_reset(sk, GFP_ATOMIC);
+ u32 user_timeout = READ_ONCE(icsk->icsk_user_timeout);
+
+ /* If the TCP_USER_TIMEOUT option is enabled, use that
+ * to determine when to timeout instead.
+ */
+ if ((user_timeout != 0 &&
+ elapsed >= msecs_to_jiffies(user_timeout) &&
+ icsk->icsk_probes_out > 0) ||
+ (user_timeout == 0 &&
+ icsk->icsk_probes_out >= keepalive_probes(tp))) {
+ tcp_send_active_reset(sk, GFP_ATOMIC,
+ SK_RST_REASON_TCP_KEEPALIVE_TIMEOUT);
tcp_write_err(sk);
goto out;
}
- if (tcp_write_wakeup(sk) <= 0) {
- icsk->icsk_probes_out++;
+ if (tcp_write_wakeup(sk, LINUX_MIB_TCPKEEPALIVE) <= 0) {
+ WRITE_ONCE(icsk->icsk_probes_out, icsk->icsk_probes_out + 1);
elapsed = keepalive_intvl_when(tp);
} else {
/* If keepalive was lost due to local congestion,
@@ -508,17 +850,53 @@ static void tcp_keepalive_timer (unsigned long data)
elapsed = keepalive_time_when(tp) - elapsed;
}
- TCP_CHECK_TIMER(sk);
- sk_stream_mem_reclaim(sk);
-
resched:
- inet_csk_reset_keepalive_timer (sk, elapsed);
+ tcp_reset_keepalive_timer(sk, elapsed);
goto out;
-death:
+death:
tcp_done(sk);
out:
bh_unlock_sock(sk);
sock_put(sk);
}
+
+static enum hrtimer_restart tcp_compressed_ack_kick(struct hrtimer *timer)
+{
+ struct tcp_sock *tp = container_of(timer, struct tcp_sock, compressed_ack_timer);
+ struct sock *sk = (struct sock *)tp;
+
+ bh_lock_sock(sk);
+ if (!sock_owned_by_user(sk)) {
+ if (tp->compressed_ack) {
+ /* Since we have to send one ack finally,
+ * subtract one from tp->compressed_ack to keep
+ * LINUX_MIB_TCPACKCOMPRESSED accurate.
+ */
+ tp->compressed_ack--;
+ tcp_mstamp_refresh(tp);
+ tcp_send_ack(sk);
+ }
+ } else {
+ if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED,
+ &sk->sk_tsq_flags))
+ sock_hold(sk);
+ }
+ bh_unlock_sock(sk);
+
+ sock_put(sk);
+
+ return HRTIMER_NORESTART;
+}
+
+void tcp_init_xmit_timers(struct sock *sk)
+{
+ inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
+ &tcp_keepalive_timer);
+ hrtimer_setup(&tcp_sk(sk)->pacing_timer, tcp_pace_kick, CLOCK_MONOTONIC,
+ HRTIMER_MODE_ABS_PINNED_SOFT);
+
+ hrtimer_setup(&tcp_sk(sk)->compressed_ack_timer, tcp_compressed_ack_kick, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL_PINNED_SOFT);
+}
diff --git a/net/ipv4/tcp_ulp.c b/net/ipv4/tcp_ulp.c
new file mode 100644
index 000000000000..2aa442128630
--- /dev/null
+++ b/net/ipv4/tcp_ulp.c
@@ -0,0 +1,168 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Pluggable TCP upper layer protocol support.
+ *
+ * Copyright (c) 2016-2017, Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2016-2017, Dave Watson <davejwatson@fb.com>. All rights reserved.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/gfp.h>
+#include <net/tcp.h>
+
+static DEFINE_SPINLOCK(tcp_ulp_list_lock);
+static LIST_HEAD(tcp_ulp_list);
+
+/* Simple linear search, don't expect many entries! */
+static struct tcp_ulp_ops *tcp_ulp_find(const char *name)
+{
+ struct tcp_ulp_ops *e;
+
+ list_for_each_entry_rcu(e, &tcp_ulp_list, list,
+ lockdep_is_held(&tcp_ulp_list_lock)) {
+ if (strcmp(e->name, name) == 0)
+ return e;
+ }
+
+ return NULL;
+}
+
+static const struct tcp_ulp_ops *__tcp_ulp_find_autoload(const char *name)
+{
+ const struct tcp_ulp_ops *ulp = NULL;
+
+ rcu_read_lock();
+ ulp = tcp_ulp_find(name);
+
+#ifdef CONFIG_MODULES
+ if (!ulp && capable(CAP_NET_ADMIN)) {
+ rcu_read_unlock();
+ request_module("tcp-ulp-%s", name);
+ rcu_read_lock();
+ ulp = tcp_ulp_find(name);
+ }
+#endif
+ if (!ulp || !try_module_get(ulp->owner))
+ ulp = NULL;
+
+ rcu_read_unlock();
+ return ulp;
+}
+
+/* Attach new upper layer protocol to the list
+ * of available protocols.
+ */
+int tcp_register_ulp(struct tcp_ulp_ops *ulp)
+{
+ int ret = 0;
+
+ spin_lock(&tcp_ulp_list_lock);
+ if (tcp_ulp_find(ulp->name))
+ ret = -EEXIST;
+ else
+ list_add_tail_rcu(&ulp->list, &tcp_ulp_list);
+ spin_unlock(&tcp_ulp_list_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(tcp_register_ulp);
+
+void tcp_unregister_ulp(struct tcp_ulp_ops *ulp)
+{
+ spin_lock(&tcp_ulp_list_lock);
+ list_del_rcu(&ulp->list);
+ spin_unlock(&tcp_ulp_list_lock);
+
+ synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(tcp_unregister_ulp);
+
+/* Build string with list of available upper layer protocl values */
+void tcp_get_available_ulp(char *buf, size_t maxlen)
+{
+ struct tcp_ulp_ops *ulp_ops;
+ size_t offs = 0;
+
+ *buf = '\0';
+ rcu_read_lock();
+ list_for_each_entry_rcu(ulp_ops, &tcp_ulp_list, list) {
+ offs += snprintf(buf + offs, maxlen - offs,
+ "%s%s",
+ offs == 0 ? "" : " ", ulp_ops->name);
+
+ if (WARN_ON_ONCE(offs >= maxlen))
+ break;
+ }
+ rcu_read_unlock();
+}
+
+void tcp_update_ulp(struct sock *sk, struct proto *proto,
+ void (*write_space)(struct sock *sk))
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ if (icsk->icsk_ulp_ops->update)
+ icsk->icsk_ulp_ops->update(sk, proto, write_space);
+}
+
+void tcp_cleanup_ulp(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ /* No sock_owned_by_me() check here as at the time the
+ * stack calls this function, the socket is dead and
+ * about to be destroyed.
+ */
+ if (!icsk->icsk_ulp_ops)
+ return;
+
+ if (icsk->icsk_ulp_ops->release)
+ icsk->icsk_ulp_ops->release(sk);
+ module_put(icsk->icsk_ulp_ops->owner);
+
+ icsk->icsk_ulp_ops = NULL;
+}
+
+static int __tcp_set_ulp(struct sock *sk, const struct tcp_ulp_ops *ulp_ops)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ int err;
+
+ err = -EEXIST;
+ if (icsk->icsk_ulp_ops)
+ goto out_err;
+
+ if (sk->sk_socket)
+ clear_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags);
+
+ err = -ENOTCONN;
+ if (!ulp_ops->clone && sk->sk_state == TCP_LISTEN)
+ goto out_err;
+
+ err = ulp_ops->init(sk);
+ if (err)
+ goto out_err;
+
+ icsk->icsk_ulp_ops = ulp_ops;
+ return 0;
+out_err:
+ module_put(ulp_ops->owner);
+ return err;
+}
+
+int tcp_set_ulp(struct sock *sk, const char *name)
+{
+ const struct tcp_ulp_ops *ulp_ops;
+
+ sock_owned_by_me(sk);
+
+ ulp_ops = __tcp_ulp_find_autoload(name);
+ if (!ulp_ops)
+ return -ENOENT;
+
+ return __tcp_set_ulp(sk, ulp_ops);
+}
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index ddc4bcc5785e..786848ad37ea 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* TCP Vegas congestion control
*
@@ -38,33 +39,19 @@
#include <net/tcp.h>
-/* Default values of the Vegas variables, in fixed-point representation
- * with V_PARAM_SHIFT bits to the right of the binary point.
- */
-#define V_PARAM_SHIFT 1
-static int alpha = 2<<V_PARAM_SHIFT;
-static int beta = 4<<V_PARAM_SHIFT;
-static int gamma = 1<<V_PARAM_SHIFT;
+#include "tcp_vegas.h"
+
+static int alpha = 2;
+static int beta = 4;
+static int gamma = 1;
module_param(alpha, int, 0644);
-MODULE_PARM_DESC(alpha, "lower bound of packets in network (scale by 2)");
+MODULE_PARM_DESC(alpha, "lower bound of packets in network");
module_param(beta, int, 0644);
-MODULE_PARM_DESC(beta, "upper bound of packets in network (scale by 2)");
+MODULE_PARM_DESC(beta, "upper bound of packets in network");
module_param(gamma, int, 0644);
MODULE_PARM_DESC(gamma, "limit on increase (scale by 2)");
-
-/* Vegas variables */
-struct vegas {
- u32 beg_snd_nxt; /* right edge during last RTT */
- u32 beg_snd_una; /* left edge during last RTT */
- u32 beg_snd_cwnd; /* saves the size of the cwnd */
- u8 doing_vegas_now;/* if true, do vegas for this RTT */
- u16 cntRTT; /* # of RTTs measured within last RTT */
- u32 minRTT; /* min of RTTs measured within last RTT (in usec) */
- u32 baseRTT; /* the min of all Vegas RTT measurements seen (in usec) */
-};
-
/* There are several situations when we must "re-start" Vegas:
*
* o when a connection is established
@@ -81,7 +68,7 @@ struct vegas {
* Instead we must wait until the completion of an RTT during
* which we actually receive ACKs.
*/
-static inline void vegas_enable(struct sock *sk)
+static void vegas_enable(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
struct vegas *vegas = inet_csk_ca(sk);
@@ -104,13 +91,14 @@ static inline void vegas_disable(struct sock *sk)
vegas->doing_vegas_now = 0;
}
-static void tcp_vegas_init(struct sock *sk)
+void tcp_vegas_init(struct sock *sk)
{
struct vegas *vegas = inet_csk_ca(sk);
vegas->baseRTT = 0x7fffffff;
vegas_enable(sk);
}
+EXPORT_SYMBOL_GPL(tcp_vegas_init);
/* Do RTT sampling needed for Vegas.
* Basically we:
@@ -120,10 +108,16 @@ static void tcp_vegas_init(struct sock *sk)
* o min-filter RTT samples from a much longer window (forever for now)
* to find the propagation delay (baseRTT)
*/
-static void tcp_vegas_rtt_calc(struct sock *sk, u32 usrtt)
+void tcp_vegas_pkts_acked(struct sock *sk, const struct ack_sample *sample)
{
struct vegas *vegas = inet_csk_ca(sk);
- u32 vrtt = usrtt + 1; /* Never allow zero rtt or baseRTT */
+ u32 vrtt;
+
+ if (sample->rtt_us < 0)
+ return;
+
+ /* Never allow zero rtt or baseRTT */
+ vrtt = sample->rtt_us + 1;
/* Filter to find propagation delay: */
if (vrtt < vegas->baseRTT)
@@ -135,15 +129,16 @@ static void tcp_vegas_rtt_calc(struct sock *sk, u32 usrtt)
vegas->minRTT = min(vegas->minRTT, vrtt);
vegas->cntRTT++;
}
+EXPORT_SYMBOL_GPL(tcp_vegas_pkts_acked);
-static void tcp_vegas_state(struct sock *sk, u8 ca_state)
+void tcp_vegas_state(struct sock *sk, u8 ca_state)
{
-
if (ca_state == TCP_CA_Open)
vegas_enable(sk);
else
vegas_disable(sk);
}
+EXPORT_SYMBOL_GPL(tcp_vegas_state);
/*
* If the connection is idle and we are restarting,
@@ -154,65 +149,36 @@ static void tcp_vegas_state(struct sock *sk, u8 ca_state)
* packets, _then_ we can make Vegas calculations
* again.
*/
-static void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event)
+void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event)
{
if (event == CA_EVENT_CWND_RESTART ||
event == CA_EVENT_TX_START)
tcp_vegas_init(sk);
}
+EXPORT_SYMBOL_GPL(tcp_vegas_cwnd_event);
-static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack,
- u32 seq_rtt, u32 in_flight, int flag)
+static inline u32 tcp_vegas_ssthresh(struct tcp_sock *tp)
+{
+ return min(tp->snd_ssthresh, tcp_snd_cwnd(tp));
+}
+
+static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct tcp_sock *tp = tcp_sk(sk);
struct vegas *vegas = inet_csk_ca(sk);
- if (!vegas->doing_vegas_now)
- return tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, flag);
-
- /* The key players are v_beg_snd_una and v_beg_snd_nxt.
- *
- * These are so named because they represent the approximate values
- * of snd_una and snd_nxt at the beginning of the current RTT. More
- * precisely, they represent the amount of data sent during the RTT.
- * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt,
- * we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding
- * bytes of data have been ACKed during the course of the RTT, giving
- * an "actual" rate of:
- *
- * (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration)
- *
- * Unfortunately, v_beg_snd_una is not exactly equal to snd_una,
- * because delayed ACKs can cover more than one segment, so they
- * don't line up nicely with the boundaries of RTTs.
- *
- * Another unfortunate fact of life is that delayed ACKs delay the
- * advance of the left edge of our send window, so that the number
- * of bytes we send in an RTT is often less than our cwnd will allow.
- * So we keep track of our cwnd separately, in v_beg_snd_cwnd.
- */
+ if (!vegas->doing_vegas_now) {
+ tcp_reno_cong_avoid(sk, ack, acked);
+ return;
+ }
if (after(ack, vegas->beg_snd_nxt)) {
/* Do the Vegas once-per-RTT cwnd adjustment. */
- u32 old_wnd, old_snd_cwnd;
-
-
- /* Here old_wnd is essentially the window of data that was
- * sent during the previous RTT, and has all
- * been acknowledged in the course of the RTT that ended
- * with the ACK we just received. Likewise, old_snd_cwnd
- * is the cwnd during the previous RTT.
- */
- old_wnd = (vegas->beg_snd_nxt - vegas->beg_snd_una) /
- tp->mss_cache;
- old_snd_cwnd = vegas->beg_snd_cwnd;
/* Save the extent of the current window so we can use this
* at the end of the next RTT.
*/
- vegas->beg_snd_una = vegas->beg_snd_nxt;
vegas->beg_snd_nxt = tp->snd_nxt;
- vegas->beg_snd_cwnd = tp->snd_cwnd;
/* We do the Vegas calculations only if we got enough RTT
* samples that we can be reasonably sure that we got
@@ -227,9 +193,10 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack,
/* We don't have enough RTT samples to do the Vegas
* calculation, so we'll behave like Reno.
*/
- tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, flag);
+ tcp_reno_cong_avoid(sk, ack, acked);
} else {
- u32 rtt, target_cwnd, diff;
+ u32 rtt, diff;
+ u64 target_cwnd;
/* We have enough RTT samples, so, using the Vegas
* algorithm, we determine if we should increase or
@@ -249,46 +216,37 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack,
*
* This is:
* (actual rate in segments) * baseRTT
- * We keep it as a fixed point number with
- * V_PARAM_SHIFT bits to the right of the binary point.
*/
- target_cwnd = ((old_wnd * vegas->baseRTT)
- << V_PARAM_SHIFT) / rtt;
+ target_cwnd = (u64)tcp_snd_cwnd(tp) * vegas->baseRTT;
+ do_div(target_cwnd, rtt);
/* Calculate the difference between the window we had,
* and the window we would like to have. This quantity
* is the "Diff" from the Arizona Vegas papers.
- *
- * Again, this is a fixed point number with
- * V_PARAM_SHIFT bits to the right of the binary
- * point.
*/
- diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd;
+ diff = tcp_snd_cwnd(tp) * (rtt-vegas->baseRTT) / vegas->baseRTT;
- if (tp->snd_cwnd <= tp->snd_ssthresh) {
- /* Slow start. */
- if (diff > gamma) {
- /* Going too fast. Time to slow down
- * and switch to congestion avoidance.
- */
- tp->snd_ssthresh = 2;
-
- /* Set cwnd to match the actual rate
- * exactly:
- * cwnd = (actual rate) * baseRTT
- * Then we add 1 because the integer
- * truncation robs us of full link
- * utilization.
- */
- tp->snd_cwnd = min(tp->snd_cwnd,
- (target_cwnd >>
- V_PARAM_SHIFT)+1);
+ if (diff > gamma && tcp_in_slow_start(tp)) {
+ /* Going too fast. Time to slow down
+ * and switch to congestion avoidance.
+ */
- }
- tcp_slow_start(tp);
+ /* Set cwnd to match the actual rate
+ * exactly:
+ * cwnd = (actual rate) * baseRTT
+ * Then we add 1 because the integer
+ * truncation robs us of full link
+ * utilization.
+ */
+ tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp),
+ (u32)target_cwnd + 1));
+ tp->snd_ssthresh = tcp_vegas_ssthresh(tp);
+
+ } else if (tcp_in_slow_start(tp)) {
+ /* Slow start. */
+ tcp_slow_start(tp, acked);
} else {
/* Congestion avoidance. */
- u32 next_snd_cwnd;
/* Figure out where we would like cwnd
* to be.
@@ -297,32 +255,27 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack,
/* The old window was too fast, so
* we slow down.
*/
- next_snd_cwnd = old_snd_cwnd - 1;
+ tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) - 1);
+ tp->snd_ssthresh
+ = tcp_vegas_ssthresh(tp);
} else if (diff < alpha) {
/* We don't have enough extra packets
* in the network, so speed up.
*/
- next_snd_cwnd = old_snd_cwnd + 1;
+ tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + 1);
} else {
/* Sending just as fast as we
* should be.
*/
- next_snd_cwnd = old_snd_cwnd;
}
-
- /* Adjust cwnd upward or downward, toward the
- * desired value.
- */
- if (next_snd_cwnd > tp->snd_cwnd)
- tp->snd_cwnd++;
- else if (next_snd_cwnd < tp->snd_cwnd)
- tp->snd_cwnd--;
}
- if (tp->snd_cwnd < 2)
- tp->snd_cwnd = 2;
- else if (tp->snd_cwnd > tp->snd_cwnd_clamp)
- tp->snd_cwnd = tp->snd_cwnd_clamp;
+ if (tcp_snd_cwnd(tp) < 2)
+ tcp_snd_cwnd_set(tp, 2);
+ else if (tcp_snd_cwnd(tp) > tp->snd_cwnd_clamp)
+ tcp_snd_cwnd_set(tp, tp->snd_cwnd_clamp);
+
+ tp->snd_ssthresh = tcp_current_ssthresh(sk);
}
/* Wipe the slate clean for the next RTT. */
@@ -330,36 +283,35 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack,
vegas->minRTT = 0x7fffffff;
}
/* Use normal slow start */
- else if (tp->snd_cwnd <= tp->snd_ssthresh)
- tcp_slow_start(tp);
-
+ else if (tcp_in_slow_start(tp))
+ tcp_slow_start(tp, acked);
}
/* Extract info for Tcp socket info provided via netlink. */
-static void tcp_vegas_get_info(struct sock *sk, u32 ext,
- struct sk_buff *skb)
+size_t tcp_vegas_get_info(struct sock *sk, u32 ext, int *attr,
+ union tcp_cc_info *info)
{
const struct vegas *ca = inet_csk_ca(sk);
- if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
- struct tcpvegas_info *info;
- info = RTA_DATA(__RTA_PUT(skb, INET_DIAG_VEGASINFO,
- sizeof(*info)));
+ if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
+ info->vegas.tcpv_enabled = ca->doing_vegas_now;
+ info->vegas.tcpv_rttcnt = ca->cntRTT;
+ info->vegas.tcpv_rtt = ca->baseRTT;
+ info->vegas.tcpv_minrtt = ca->minRTT;
- info->tcpv_enabled = ca->doing_vegas_now;
- info->tcpv_rttcnt = ca->cntRTT;
- info->tcpv_rtt = ca->baseRTT;
- info->tcpv_minrtt = ca->minRTT;
- rtattr_failure: ;
+ *attr = INET_DIAG_VEGASINFO;
+ return sizeof(struct tcpvegas_info);
}
+ return 0;
}
+EXPORT_SYMBOL_GPL(tcp_vegas_get_info);
-static struct tcp_congestion_ops tcp_vegas = {
+static struct tcp_congestion_ops tcp_vegas __read_mostly = {
.init = tcp_vegas_init,
.ssthresh = tcp_reno_ssthresh,
+ .undo_cwnd = tcp_reno_undo_cwnd,
.cong_avoid = tcp_vegas_cong_avoid,
- .min_cwnd = tcp_reno_min_cwnd,
- .rtt_sample = tcp_vegas_rtt_calc,
+ .pkts_acked = tcp_vegas_pkts_acked,
.set_state = tcp_vegas_state,
.cwnd_event = tcp_vegas_cwnd_event,
.get_info = tcp_vegas_get_info,
diff --git a/net/ipv4/tcp_vegas.h b/net/ipv4/tcp_vegas.h
new file mode 100644
index 000000000000..4f24d0e37d9c
--- /dev/null
+++ b/net/ipv4/tcp_vegas.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * TCP Vegas congestion control interface
+ */
+#ifndef __TCP_VEGAS_H
+#define __TCP_VEGAS_H 1
+
+/* Vegas variables */
+struct vegas {
+ u32 beg_snd_nxt; /* right edge during last RTT */
+ u32 beg_snd_una; /* left edge during last RTT */
+ u32 beg_snd_cwnd; /* saves the size of the cwnd */
+ u8 doing_vegas_now;/* if true, do vegas for this RTT */
+ u16 cntRTT; /* # of RTTs measured within last RTT */
+ u32 minRTT; /* min of RTTs measured within last RTT (in usec) */
+ u32 baseRTT; /* the min of all Vegas RTT measurements seen (in usec) */
+};
+
+void tcp_vegas_init(struct sock *sk);
+void tcp_vegas_state(struct sock *sk, u8 ca_state);
+void tcp_vegas_pkts_acked(struct sock *sk, const struct ack_sample *sample);
+void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event);
+size_t tcp_vegas_get_info(struct sock *sk, u32 ext, int *attr,
+ union tcp_cc_info *info);
+
+#endif /* __TCP_VEGAS_H */
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
index ce57bf302f6c..366ff6f214b2 100644
--- a/net/ipv4/tcp_veno.c
+++ b/net/ipv4/tcp_veno.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* TCP Veno congestion control
*
@@ -6,7 +7,7 @@
* "TCP Veno: TCP Enhancement for Transmission over Wireless Access Networks."
* IEEE Journal on Selected Areas in Communication,
* Feb. 2003.
- * See http://www.ntu.edu.sg/home5/ZHOU0022/papers/CPFu03a.pdf
+ * See https://www.ie.cuhk.edu.hk/fileadmin/staff_upload/soung/Journal/J3.pdf
*/
#include <linux/mm.h>
@@ -69,10 +70,17 @@ static void tcp_veno_init(struct sock *sk)
}
/* Do rtt sampling needed for Veno. */
-static void tcp_veno_rtt_calc(struct sock *sk, u32 usrtt)
+static void tcp_veno_pkts_acked(struct sock *sk,
+ const struct ack_sample *sample)
{
struct veno *veno = inet_csk_ca(sk);
- u32 vrtt = usrtt + 1; /* Never allow zero rtt or basertt */
+ u32 vrtt;
+
+ if (sample->rtt_us < 0)
+ return;
+
+ /* Never allow zero rtt or baseRTT */
+ vrtt = sample->rtt_us + 1;
/* Filter to find propagation delay: */
if (vrtt < veno->basertt)
@@ -108,17 +116,18 @@ static void tcp_veno_cwnd_event(struct sock *sk, enum tcp_ca_event event)
tcp_veno_init(sk);
}
-static void tcp_veno_cong_avoid(struct sock *sk, u32 ack,
- u32 seq_rtt, u32 in_flight, int flag)
+static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct tcp_sock *tp = tcp_sk(sk);
struct veno *veno = inet_csk_ca(sk);
- if (!veno->doing_veno_now)
- return tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, flag);
+ if (!veno->doing_veno_now) {
+ tcp_reno_cong_avoid(sk, ack, acked);
+ return;
+ }
/* limited by applications */
- if (!tcp_is_cwnd_limited(sk, in_flight))
+ if (!tcp_is_cwnd_limited(sk))
return;
/* We do the Veno calculations only if we got enough rtt samples */
@@ -126,9 +135,10 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack,
/* We don't have enough rtt samples to do the Veno
* calculation, so we'll behave like Reno.
*/
- tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, flag);
+ tcp_reno_cong_avoid(sk, ack, acked);
} else {
- u32 rtt, target_cwnd;
+ u64 target_cwnd;
+ u32 rtt;
/* We have enough rtt samples, so, using the Veno
* algorithm, we determine the state of the network.
@@ -136,48 +146,45 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack,
rtt = veno->minrtt;
- target_cwnd = ((tp->snd_cwnd * veno->basertt)
- << V_PARAM_SHIFT) / rtt;
+ target_cwnd = (u64)tcp_snd_cwnd(tp) * veno->basertt;
+ target_cwnd <<= V_PARAM_SHIFT;
+ do_div(target_cwnd, rtt);
- veno->diff = (tp->snd_cwnd << V_PARAM_SHIFT) - target_cwnd;
+ veno->diff = (tcp_snd_cwnd(tp) << V_PARAM_SHIFT) - target_cwnd;
- if (tp->snd_cwnd <= tp->snd_ssthresh) {
- /* Slow start. */
- tcp_slow_start(tp);
+ if (tcp_in_slow_start(tp)) {
+ /* Slow start. */
+ acked = tcp_slow_start(tp, acked);
+ if (!acked)
+ goto done;
+ }
+
+ /* Congestion avoidance. */
+ if (veno->diff < beta) {
+ /* In the "non-congestive state", increase cwnd
+ * every rtt.
+ */
+ tcp_cong_avoid_ai(tp, tcp_snd_cwnd(tp), acked);
} else {
- /* Congestion avoidance. */
- if (veno->diff < beta) {
- /* In the "non-congestive state", increase cwnd
- * every rtt.
- */
- if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
- if (tp->snd_cwnd < tp->snd_cwnd_clamp)
- tp->snd_cwnd++;
- tp->snd_cwnd_cnt = 0;
+ /* In the "congestive state", increase cwnd
+ * every other rtt.
+ */
+ if (tp->snd_cwnd_cnt >= tcp_snd_cwnd(tp)) {
+ if (veno->inc &&
+ tcp_snd_cwnd(tp) < tp->snd_cwnd_clamp) {
+ tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + 1);
+ veno->inc = 0;
} else
- tp->snd_cwnd_cnt++;
- } else {
- /* In the "congestive state", increase cwnd
- * every other rtt.
- */
- if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
- if (veno->inc
- && tp->snd_cwnd <
- tp->snd_cwnd_clamp) {
- tp->snd_cwnd++;
- veno->inc = 0;
- } else
- veno->inc = 1;
- tp->snd_cwnd_cnt = 0;
- } else
- tp->snd_cwnd_cnt++;
- }
-
+ veno->inc = 1;
+ tp->snd_cwnd_cnt = 0;
+ } else
+ tp->snd_cwnd_cnt += acked;
}
- if (tp->snd_cwnd < 2)
- tp->snd_cwnd = 2;
- else if (tp->snd_cwnd > tp->snd_cwnd_clamp)
- tp->snd_cwnd = tp->snd_cwnd_clamp;
+done:
+ if (tcp_snd_cwnd(tp) < 2)
+ tcp_snd_cwnd_set(tp, 2);
+ else if (tcp_snd_cwnd(tp) > tp->snd_cwnd_clamp)
+ tcp_snd_cwnd_set(tp, tp->snd_cwnd_clamp);
}
/* Wipe the slate clean for the next rtt. */
/* veno->cntrtt = 0; */
@@ -192,17 +199,18 @@ static u32 tcp_veno_ssthresh(struct sock *sk)
if (veno->diff < beta)
/* in "non-congestive state", cut cwnd by 1/5 */
- return max(tp->snd_cwnd * 4 / 5, 2U);
+ return max(tcp_snd_cwnd(tp) * 4 / 5, 2U);
else
/* in "congestive state", cut cwnd by 1/2 */
- return max(tp->snd_cwnd >> 1U, 2U);
+ return max(tcp_snd_cwnd(tp) >> 1U, 2U);
}
-static struct tcp_congestion_ops tcp_veno = {
+static struct tcp_congestion_ops tcp_veno __read_mostly = {
.init = tcp_veno_init,
.ssthresh = tcp_veno_ssthresh,
+ .undo_cwnd = tcp_reno_undo_cwnd,
.cong_avoid = tcp_veno_cong_avoid,
- .rtt_sample = tcp_veno_rtt_calc,
+ .pkts_acked = tcp_veno_pkts_acked,
.set_state = tcp_veno_state,
.cwnd_event = tcp_veno_cwnd_event,
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
index 4f42a86c77f3..c6e97141eef2 100644
--- a/net/ipv4/tcp_westwood.c
+++ b/net/ipv4/tcp_westwood.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* TCP Westwood+: end-to-end bandwidth estimation for TCP
*
@@ -42,7 +43,6 @@ struct westwood {
u8 reset_rtt_min; /* Reset RTT min to next RTT sample*/
};
-
/* TCP Westwood functions and constants */
#define TCP_WESTWOOD_RTT_MIN (HZ/20) /* 50ms */
#define TCP_WESTWOOD_INIT_RTT (20*HZ) /* maybe too conservative?! */
@@ -63,13 +63,13 @@ static void tcp_westwood_init(struct sock *sk)
struct westwood *w = inet_csk_ca(sk);
w->bk = 0;
- w->bw_ns_est = 0;
- w->bw_est = 0;
- w->accounted = 0;
- w->cumul_ack = 0;
+ w->bw_ns_est = 0;
+ w->bw_est = 0;
+ w->accounted = 0;
+ w->cumul_ack = 0;
w->reset_rtt_min = 1;
w->rtt_min = w->rtt = TCP_WESTWOOD_INIT_RTT;
- w->rtt_win_sx = tcp_time_stamp;
+ w->rtt_win_sx = tcp_jiffies32;
w->snd_una = tcp_sk(sk)->snd_una;
w->first_ack = 1;
}
@@ -80,7 +80,7 @@ static void tcp_westwood_init(struct sock *sk)
*/
static inline u32 westwood_do_filter(u32 a, u32 b)
{
- return (((7 * a) + b) >> 3);
+ return ((7 * a) + b) >> 3;
}
static void westwood_filter(struct westwood *w, u32 delta)
@@ -100,11 +100,13 @@ static void westwood_filter(struct westwood *w, u32 delta)
* Called after processing group of packets.
* but all westwood needs is the last sample of srtt.
*/
-static void tcp_westwood_pkts_acked(struct sock *sk, u32 cnt)
+static void tcp_westwood_pkts_acked(struct sock *sk,
+ const struct ack_sample *sample)
{
struct westwood *w = inet_csk_ca(sk);
- if (cnt > 0)
- w->rtt = tcp_sk(sk)->srtt >> 3;
+
+ if (sample->rtt_us > 0)
+ w->rtt = usecs_to_jiffies(sample->rtt_us);
}
/*
@@ -115,13 +117,13 @@ static void tcp_westwood_pkts_acked(struct sock *sk, u32 cnt)
static void westwood_update_window(struct sock *sk)
{
struct westwood *w = inet_csk_ca(sk);
- s32 delta = tcp_time_stamp - w->rtt_win_sx;
+ s32 delta = tcp_jiffies32 - w->rtt_win_sx;
/* Initialize w->snd_una with the first acked sequence number in order
* to fix mismatch between tp->snd_una and w->snd_una for the first
* bandwidth sample
*/
- if (w->first_ack) {
+ if (w->first_ack) {
w->snd_una = tcp_sk(sk)->snd_una;
w->first_ack = 0;
}
@@ -139,7 +141,7 @@ static void westwood_update_window(struct sock *sk)
westwood_filter(w, delta);
w->bk = 0;
- w->rtt_win_sx = tcp_time_stamp;
+ w->rtt_win_sx = tcp_jiffies32;
}
}
@@ -147,12 +149,11 @@ static inline void update_rtt_min(struct westwood *w)
{
if (w->reset_rtt_min) {
w->rtt_min = w->rtt;
- w->reset_rtt_min = 0;
+ w->reset_rtt_min = 0;
} else
w->rtt_min = min(w->rtt, w->rtt_min);
}
-
/*
* @westwood_fast_bw
* It is called when we are in fast path. In particular it is called when
@@ -183,15 +184,15 @@ static inline u32 westwood_acked_count(struct sock *sk)
w->cumul_ack = tp->snd_una - w->snd_una;
- /* If cumul_ack is 0 this is a dupack since it's not moving
- * tp->snd_una.
- */
- if (!w->cumul_ack) {
+ /* If cumul_ack is 0 this is a dupack since it's not moving
+ * tp->snd_una.
+ */
+ if (!w->cumul_ack) {
w->accounted += tp->mss_cache;
w->cumul_ack = tp->mss_cache;
}
- if (w->cumul_ack > tp->mss_cache) {
+ if (w->cumul_ack > tp->mss_cache) {
/* Partial or delayed ack */
if (w->accounted >= w->cumul_ack) {
w->accounted -= w->cumul_ack;
@@ -207,7 +208,6 @@ static inline u32 westwood_acked_count(struct sock *sk)
return w->cumul_ack;
}
-
/*
* TCP Westwood
* Here limit is evaluated as Bw estimation*RTTmin (for obtaining it
@@ -218,68 +218,71 @@ static u32 tcp_westwood_bw_rttmin(const struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
const struct westwood *w = inet_csk_ca(sk);
+
return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2);
}
+static void tcp_westwood_ack(struct sock *sk, u32 ack_flags)
+{
+ if (ack_flags & CA_ACK_SLOWPATH) {
+ struct westwood *w = inet_csk_ca(sk);
+
+ westwood_update_window(sk);
+ w->bk += westwood_acked_count(sk);
+
+ update_rtt_min(w);
+ return;
+ }
+
+ westwood_fast_bw(sk);
+}
+
static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event)
{
struct tcp_sock *tp = tcp_sk(sk);
struct westwood *w = inet_csk_ca(sk);
- switch(event) {
- case CA_EVENT_FAST_ACK:
- westwood_fast_bw(sk);
- break;
-
+ switch (event) {
case CA_EVENT_COMPLETE_CWR:
- tp->snd_cwnd = tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk);
+ tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk);
+ tcp_snd_cwnd_set(tp, tp->snd_ssthresh);
break;
-
- case CA_EVENT_FRTO:
+ case CA_EVENT_LOSS:
tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk);
- /* Update RTT_min when next ack arrives */
+ /* Update RTT_min when next ack arrives */
w->reset_rtt_min = 1;
break;
-
- case CA_EVENT_SLOW_ACK:
- westwood_update_window(sk);
- w->bk += westwood_acked_count(sk);
- update_rtt_min(w);
- break;
-
default:
/* don't care */
break;
}
}
-
/* Extract info for Tcp socket info provided via netlink. */
-static void tcp_westwood_info(struct sock *sk, u32 ext,
- struct sk_buff *skb)
+static size_t tcp_westwood_info(struct sock *sk, u32 ext, int *attr,
+ union tcp_cc_info *info)
{
const struct westwood *ca = inet_csk_ca(sk);
+
if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
- struct rtattr *rta;
- struct tcpvegas_info *info;
-
- rta = __RTA_PUT(skb, INET_DIAG_VEGASINFO, sizeof(*info));
- info = RTA_DATA(rta);
- info->tcpv_enabled = 1;
- info->tcpv_rttcnt = 0;
- info->tcpv_rtt = jiffies_to_usecs(ca->rtt);
- info->tcpv_minrtt = jiffies_to_usecs(ca->rtt_min);
- rtattr_failure: ;
+ info->vegas.tcpv_enabled = 1;
+ info->vegas.tcpv_rttcnt = 0;
+ info->vegas.tcpv_rtt = jiffies_to_usecs(ca->rtt);
+ info->vegas.tcpv_minrtt = jiffies_to_usecs(ca->rtt_min);
+
+ *attr = INET_DIAG_VEGASINFO;
+ return sizeof(struct tcpvegas_info);
}
+ return 0;
}
-
-static struct tcp_congestion_ops tcp_westwood = {
+static struct tcp_congestion_ops tcp_westwood __read_mostly = {
.init = tcp_westwood_init,
.ssthresh = tcp_reno_ssthresh,
.cong_avoid = tcp_reno_cong_avoid,
- .min_cwnd = tcp_westwood_bw_rttmin,
+ .undo_cwnd = tcp_reno_undo_cwnd,
.cwnd_event = tcp_westwood_event,
+ .in_ack_event = tcp_westwood_ack,
.get_info = tcp_westwood_info,
.pkts_acked = tcp_westwood_pkts_acked,
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c
new file mode 100644
index 000000000000..18b07ff5d20e
--- /dev/null
+++ b/net/ipv4/tcp_yeah.c
@@ -0,0 +1,239 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *
+ * YeAH TCP
+ *
+ * For further details look at:
+ * https://web.archive.org/web/20080316215752/http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf
+ *
+ */
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/inet_diag.h>
+
+#include <net/tcp.h>
+
+#include "tcp_vegas.h"
+
+#define TCP_YEAH_ALPHA 80 /* number of packets queued at the bottleneck */
+#define TCP_YEAH_GAMMA 1 /* fraction of queue to be removed per rtt */
+#define TCP_YEAH_DELTA 3 /* log minimum fraction of cwnd to be removed on loss */
+#define TCP_YEAH_EPSILON 1 /* log maximum fraction to be removed on early decongestion */
+#define TCP_YEAH_PHY 8 /* maximum delta from base */
+#define TCP_YEAH_RHO 16 /* minimum number of consecutive rtt to consider competition on loss */
+#define TCP_YEAH_ZETA 50 /* minimum number of state switches to reset reno_count */
+
+#define TCP_SCALABLE_AI_CNT 100U
+
+/* YeAH variables */
+struct yeah {
+ struct vegas vegas; /* must be first */
+
+ /* YeAH */
+ u32 lastQ;
+ u32 doing_reno_now;
+
+ u32 reno_count;
+ u32 fast_count;
+};
+
+static void tcp_yeah_init(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct yeah *yeah = inet_csk_ca(sk);
+
+ tcp_vegas_init(sk);
+
+ yeah->doing_reno_now = 0;
+ yeah->lastQ = 0;
+
+ yeah->reno_count = 2;
+
+ /* Ensure the MD arithmetic works. This is somewhat pedantic,
+ * since I don't think we will see a cwnd this large. :) */
+ tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128);
+}
+
+static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct yeah *yeah = inet_csk_ca(sk);
+
+ if (!tcp_is_cwnd_limited(sk))
+ return;
+
+ if (tcp_in_slow_start(tp)) {
+ acked = tcp_slow_start(tp, acked);
+ if (!acked)
+ goto do_vegas;
+ }
+
+ if (!yeah->doing_reno_now) {
+ /* Scalable */
+ tcp_cong_avoid_ai(tp, min(tcp_snd_cwnd(tp), TCP_SCALABLE_AI_CNT),
+ acked);
+ } else {
+ /* Reno */
+ tcp_cong_avoid_ai(tp, tcp_snd_cwnd(tp), acked);
+ }
+
+ /* The key players are v_vegas.beg_snd_una and v_beg_snd_nxt.
+ *
+ * These are so named because they represent the approximate values
+ * of snd_una and snd_nxt at the beginning of the current RTT. More
+ * precisely, they represent the amount of data sent during the RTT.
+ * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt,
+ * we will calculate that (v_beg_snd_nxt - v_vegas.beg_snd_una) outstanding
+ * bytes of data have been ACKed during the course of the RTT, giving
+ * an "actual" rate of:
+ *
+ * (v_beg_snd_nxt - v_vegas.beg_snd_una) / (rtt duration)
+ *
+ * Unfortunately, v_vegas.beg_snd_una is not exactly equal to snd_una,
+ * because delayed ACKs can cover more than one segment, so they
+ * don't line up yeahly with the boundaries of RTTs.
+ *
+ * Another unfortunate fact of life is that delayed ACKs delay the
+ * advance of the left edge of our send window, so that the number
+ * of bytes we send in an RTT is often less than our cwnd will allow.
+ * So we keep track of our cwnd separately, in v_beg_snd_cwnd.
+ */
+do_vegas:
+ if (after(ack, yeah->vegas.beg_snd_nxt)) {
+ /* We do the Vegas calculations only if we got enough RTT
+ * samples that we can be reasonably sure that we got
+ * at least one RTT sample that wasn't from a delayed ACK.
+ * If we only had 2 samples total,
+ * then that means we're getting only 1 ACK per RTT, which
+ * means they're almost certainly delayed ACKs.
+ * If we have 3 samples, we should be OK.
+ */
+
+ if (yeah->vegas.cntRTT > 2) {
+ u32 rtt, queue;
+ u64 bw;
+
+ /* We have enough RTT samples, so, using the Vegas
+ * algorithm, we determine if we should increase or
+ * decrease cwnd, and by how much.
+ */
+
+ /* Pluck out the RTT we are using for the Vegas
+ * calculations. This is the min RTT seen during the
+ * last RTT. Taking the min filters out the effects
+ * of delayed ACKs, at the cost of noticing congestion
+ * a bit later.
+ */
+ rtt = yeah->vegas.minRTT;
+
+ /* Compute excess number of packets above bandwidth
+ * Avoid doing full 64 bit divide.
+ */
+ bw = tcp_snd_cwnd(tp);
+ bw *= rtt - yeah->vegas.baseRTT;
+ do_div(bw, rtt);
+ queue = bw;
+
+ if (queue > TCP_YEAH_ALPHA ||
+ rtt - yeah->vegas.baseRTT > (yeah->vegas.baseRTT / TCP_YEAH_PHY)) {
+ if (queue > TCP_YEAH_ALPHA &&
+ tcp_snd_cwnd(tp) > yeah->reno_count) {
+ u32 reduction = min(queue / TCP_YEAH_GAMMA ,
+ tcp_snd_cwnd(tp) >> TCP_YEAH_EPSILON);
+
+ tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) - reduction);
+
+ tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp),
+ yeah->reno_count));
+
+ tp->snd_ssthresh = tcp_snd_cwnd(tp);
+ }
+
+ if (yeah->reno_count <= 2)
+ yeah->reno_count = max(tcp_snd_cwnd(tp)>>1, 2U);
+ else
+ yeah->reno_count++;
+
+ yeah->doing_reno_now = min(yeah->doing_reno_now + 1,
+ 0xffffffU);
+ } else {
+ yeah->fast_count++;
+
+ if (yeah->fast_count > TCP_YEAH_ZETA) {
+ yeah->reno_count = 2;
+ yeah->fast_count = 0;
+ }
+
+ yeah->doing_reno_now = 0;
+ }
+
+ yeah->lastQ = queue;
+ }
+
+ /* Save the extent of the current window so we can use this
+ * at the end of the next RTT.
+ */
+ yeah->vegas.beg_snd_una = yeah->vegas.beg_snd_nxt;
+ yeah->vegas.beg_snd_nxt = tp->snd_nxt;
+ yeah->vegas.beg_snd_cwnd = tcp_snd_cwnd(tp);
+
+ /* Wipe the slate clean for the next RTT. */
+ yeah->vegas.cntRTT = 0;
+ yeah->vegas.minRTT = 0x7fffffff;
+ }
+}
+
+static u32 tcp_yeah_ssthresh(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct yeah *yeah = inet_csk_ca(sk);
+ u32 reduction;
+
+ if (yeah->doing_reno_now < TCP_YEAH_RHO) {
+ reduction = yeah->lastQ;
+
+ reduction = min(reduction, max(tcp_snd_cwnd(tp)>>1, 2U));
+
+ reduction = max(reduction, tcp_snd_cwnd(tp) >> TCP_YEAH_DELTA);
+ } else
+ reduction = max(tcp_snd_cwnd(tp)>>1, 2U);
+
+ yeah->fast_count = 0;
+ yeah->reno_count = max(yeah->reno_count>>1, 2U);
+
+ return max_t(int, tcp_snd_cwnd(tp) - reduction, 2);
+}
+
+static struct tcp_congestion_ops tcp_yeah __read_mostly = {
+ .init = tcp_yeah_init,
+ .ssthresh = tcp_yeah_ssthresh,
+ .undo_cwnd = tcp_reno_undo_cwnd,
+ .cong_avoid = tcp_yeah_cong_avoid,
+ .set_state = tcp_vegas_state,
+ .cwnd_event = tcp_vegas_cwnd_event,
+ .get_info = tcp_vegas_get_info,
+ .pkts_acked = tcp_vegas_pkts_acked,
+
+ .owner = THIS_MODULE,
+ .name = "yeah",
+};
+
+static int __init tcp_yeah_register(void)
+{
+ BUILD_BUG_ON(sizeof(struct yeah) > ICSK_CA_PRIV_SIZE);
+ tcp_register_congestion_control(&tcp_yeah);
+ return 0;
+}
+
+static void __exit tcp_yeah_unregister(void)
+{
+ tcp_unregister_congestion_control(&tcp_yeah);
+}
+
+module_init(tcp_yeah_register);
+module_exit(tcp_yeah_unregister);
+
+MODULE_AUTHOR("Angelo P. Castellani");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("YeAH TCP");
diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c
index 8d30c48f090e..4c1f836aae38 100644
--- a/net/ipv4/tunnel4.c
+++ b/net/ipv4/tunnel4.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* tunnel4.c: Generic IP tunnel transformer.
*
* Copyright (C) 2003 David S. Miller (davem@redhat.com)
@@ -6,33 +7,49 @@
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mutex.h>
+#include <linux/mpls.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
+#include <linux/slab.h>
#include <net/icmp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/xfrm.h>
-static struct xfrm_tunnel *tunnel4_handlers;
+static struct xfrm_tunnel __rcu *tunnel4_handlers __read_mostly;
+static struct xfrm_tunnel __rcu *tunnel64_handlers __read_mostly;
+static struct xfrm_tunnel __rcu *tunnelmpls4_handlers __read_mostly;
static DEFINE_MUTEX(tunnel4_mutex);
-int xfrm4_tunnel_register(struct xfrm_tunnel *handler)
+static inline struct xfrm_tunnel __rcu **fam_handlers(unsigned short family)
{
- struct xfrm_tunnel **pprev;
+ return (family == AF_INET) ? &tunnel4_handlers :
+ (family == AF_INET6) ? &tunnel64_handlers :
+ &tunnelmpls4_handlers;
+}
+
+int xfrm4_tunnel_register(struct xfrm_tunnel *handler, unsigned short family)
+{
+ struct xfrm_tunnel __rcu **pprev;
+ struct xfrm_tunnel *t;
+
int ret = -EEXIST;
int priority = handler->priority;
mutex_lock(&tunnel4_mutex);
- for (pprev = &tunnel4_handlers; *pprev; pprev = &(*pprev)->next) {
- if ((*pprev)->priority > priority)
+ for (pprev = fam_handlers(family);
+ (t = rcu_dereference_protected(*pprev,
+ lockdep_is_held(&tunnel4_mutex))) != NULL;
+ pprev = &t->next) {
+ if (t->priority > priority)
break;
- if ((*pprev)->priority == priority)
+ if (t->priority == priority)
goto err;
}
handler->next = *pprev;
- *pprev = handler;
+ rcu_assign_pointer(*pprev, handler);
ret = 0;
@@ -41,18 +58,21 @@ err:
return ret;
}
-
EXPORT_SYMBOL(xfrm4_tunnel_register);
-int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler)
+int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family)
{
- struct xfrm_tunnel **pprev;
+ struct xfrm_tunnel __rcu **pprev;
+ struct xfrm_tunnel *t;
int ret = -ENOENT;
mutex_lock(&tunnel4_mutex);
- for (pprev = &tunnel4_handlers; *pprev; pprev = &(*pprev)->next) {
- if (*pprev == handler) {
+ for (pprev = fam_handlers(family);
+ (t = rcu_dereference_protected(*pprev,
+ lockdep_is_held(&tunnel4_mutex))) != NULL;
+ pprev = &t->next) {
+ if (t == handler) {
*pprev = handler->next;
ret = 0;
break;
@@ -65,9 +85,13 @@ int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler)
return ret;
}
-
EXPORT_SYMBOL(xfrm4_tunnel_deregister);
+#define for_each_tunnel_rcu(head, handler) \
+ for (handler = rcu_dereference(head); \
+ handler != NULL; \
+ handler = rcu_dereference(handler->next)) \
+
static int tunnel4_rcv(struct sk_buff *skb)
{
struct xfrm_tunnel *handler;
@@ -75,7 +99,73 @@ static int tunnel4_rcv(struct sk_buff *skb)
if (!pskb_may_pull(skb, sizeof(struct iphdr)))
goto drop;
- for (handler = tunnel4_handlers; handler; handler = handler->next)
+ for_each_tunnel_rcu(tunnel4_handlers, handler)
+ if (!handler->handler(skb))
+ return 0;
+
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+
+drop:
+ kfree_skb(skb);
+ return 0;
+}
+
+#if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL)
+static int tunnel4_rcv_cb(struct sk_buff *skb, u8 proto, int err)
+{
+ struct xfrm_tunnel __rcu *head;
+ struct xfrm_tunnel *handler;
+ int ret;
+
+ head = (proto == IPPROTO_IPIP) ? tunnel4_handlers : tunnel64_handlers;
+
+ for_each_tunnel_rcu(head, handler) {
+ if (handler->cb_handler) {
+ ret = handler->cb_handler(skb, err);
+ if (ret <= 0)
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+static const struct xfrm_input_afinfo tunnel4_input_afinfo = {
+ .family = AF_INET,
+ .is_ipip = true,
+ .callback = tunnel4_rcv_cb,
+};
+#endif
+
+#if IS_ENABLED(CONFIG_IPV6)
+static int tunnel64_rcv(struct sk_buff *skb)
+{
+ struct xfrm_tunnel *handler;
+
+ if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
+ goto drop;
+
+ for_each_tunnel_rcu(tunnel64_handlers, handler)
+ if (!handler->handler(skb))
+ return 0;
+
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+
+drop:
+ kfree_skb(skb);
+ return 0;
+}
+#endif
+
+#if IS_ENABLED(CONFIG_MPLS)
+static int tunnelmpls4_rcv(struct sk_buff *skb)
+{
+ struct xfrm_tunnel *handler;
+
+ if (!pskb_may_pull(skb, sizeof(struct mpls_label)))
+ goto drop;
+
+ for_each_tunnel_rcu(tunnelmpls4_handlers, handler)
if (!handler->handler(skb))
return 0;
@@ -85,37 +175,124 @@ drop:
kfree_skb(skb);
return 0;
}
+#endif
-static void tunnel4_err(struct sk_buff *skb, u32 info)
+static int tunnel4_err(struct sk_buff *skb, u32 info)
{
struct xfrm_tunnel *handler;
- for (handler = tunnel4_handlers; handler; handler = handler->next)
+ for_each_tunnel_rcu(tunnel4_handlers, handler)
if (!handler->err_handler(skb, info))
- break;
+ return 0;
+
+ return -ENOENT;
}
-static struct net_protocol tunnel4_protocol = {
+#if IS_ENABLED(CONFIG_IPV6)
+static int tunnel64_err(struct sk_buff *skb, u32 info)
+{
+ struct xfrm_tunnel *handler;
+
+ for_each_tunnel_rcu(tunnel64_handlers, handler)
+ if (!handler->err_handler(skb, info))
+ return 0;
+
+ return -ENOENT;
+}
+#endif
+
+#if IS_ENABLED(CONFIG_MPLS)
+static int tunnelmpls4_err(struct sk_buff *skb, u32 info)
+{
+ struct xfrm_tunnel *handler;
+
+ for_each_tunnel_rcu(tunnelmpls4_handlers, handler)
+ if (!handler->err_handler(skb, info))
+ return 0;
+
+ return -ENOENT;
+}
+#endif
+
+static const struct net_protocol tunnel4_protocol = {
.handler = tunnel4_rcv,
.err_handler = tunnel4_err,
.no_policy = 1,
};
+#if IS_ENABLED(CONFIG_IPV6)
+static const struct net_protocol tunnel64_protocol = {
+ .handler = tunnel64_rcv,
+ .err_handler = tunnel64_err,
+ .no_policy = 1,
+};
+#endif
+
+#if IS_ENABLED(CONFIG_MPLS)
+static const struct net_protocol tunnelmpls4_protocol = {
+ .handler = tunnelmpls4_rcv,
+ .err_handler = tunnelmpls4_err,
+ .no_policy = 1,
+};
+#endif
+
static int __init tunnel4_init(void)
{
- if (inet_add_protocol(&tunnel4_protocol, IPPROTO_IPIP)) {
- printk(KERN_ERR "tunnel4 init: can't add protocol\n");
- return -EAGAIN;
+ if (inet_add_protocol(&tunnel4_protocol, IPPROTO_IPIP))
+ goto err;
+#if IS_ENABLED(CONFIG_IPV6)
+ if (inet_add_protocol(&tunnel64_protocol, IPPROTO_IPV6)) {
+ inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP);
+ goto err;
}
+#endif
+#if IS_ENABLED(CONFIG_MPLS)
+ if (inet_add_protocol(&tunnelmpls4_protocol, IPPROTO_MPLS)) {
+ inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP);
+#if IS_ENABLED(CONFIG_IPV6)
+ inet_del_protocol(&tunnel64_protocol, IPPROTO_IPV6);
+#endif
+ goto err;
+ }
+#endif
+#if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL)
+ if (xfrm_input_register_afinfo(&tunnel4_input_afinfo)) {
+ inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP);
+#if IS_ENABLED(CONFIG_IPV6)
+ inet_del_protocol(&tunnel64_protocol, IPPROTO_IPV6);
+#endif
+#if IS_ENABLED(CONFIG_MPLS)
+ inet_del_protocol(&tunnelmpls4_protocol, IPPROTO_MPLS);
+#endif
+ goto err;
+ }
+#endif
return 0;
+
+err:
+ pr_err("%s: can't add protocol\n", __func__);
+ return -EAGAIN;
}
static void __exit tunnel4_fini(void)
{
+#if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL)
+ if (xfrm_input_unregister_afinfo(&tunnel4_input_afinfo))
+ pr_err("tunnel4 close: can't remove input afinfo\n");
+#endif
+#if IS_ENABLED(CONFIG_MPLS)
+ if (inet_del_protocol(&tunnelmpls4_protocol, IPPROTO_MPLS))
+ pr_err("tunnelmpls4 close: can't remove protocol\n");
+#endif
+#if IS_ENABLED(CONFIG_IPV6)
+ if (inet_del_protocol(&tunnel64_protocol, IPPROTO_IPV6))
+ pr_err("tunnel64 close: can't remove protocol\n");
+#endif
if (inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP))
- printk(KERN_ERR "tunnel4 close: can't remove protocol\n");
+ pr_err("tunnel4 close: can't remove protocol\n");
}
module_init(tunnel4_init);
module_exit(tunnel4_fini);
+MODULE_DESCRIPTION("IPv4 XFRM tunnel library");
MODULE_LICENSE("GPL");
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 035915fc9ed3..ffe074cb5865 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -5,12 +6,10 @@
*
* The User Datagram Protocol (UDP).
*
- * Version: $Id: udp.c,v 1.102 2002/02/01 22:01:04 davem Exp $
- *
* Authors: Ross Biro
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
* Arnt Gulbrandsen, <agulbra@nvg.unit.no>
- * Alan Cox, <Alan.Cox@linux.org>
+ * Alan Cox, <alan@lxorguk.ukuu.org.uk>
* Hirokazu Takahashi, <taka@valinux.co.jp>
*
* Fixes:
@@ -20,8 +19,8 @@
* for udp at least is 'valid'.
* Alan Cox : Fixed icmp handling properly
* Alan Cox : Correct error for oversized datagrams
- * Alan Cox : Tidied select() semantics.
- * Alan Cox : udp_err() fixed properly, also now
+ * Alan Cox : Tidied select() semantics.
+ * Alan Cox : udp_err() fixed properly, also now
* select and read wake correctly on errors
* Alan Cox : udp_send verify_area moved to avoid mem leak
* Alan Cox : UDP can count its memory
@@ -56,7 +55,7 @@
* does have a high hit rate.
* Olaf Kirch : Don't linearise iovec on sendmsg.
* Andi Kleen : Some cleanups, cache destination entry
- * for connect.
+ * for connect.
* Vitaly E. Lavrov : Transparent proxy revived after year coma.
* Melvin Smith : Check msg_name not msg_namelen in sendto(),
* return ENOTCONN for unconnected sockets (POSIX)
@@ -69,273 +68,899 @@
* YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
* Alexey Kuznetsov: allow both IPv4 and IPv6 sockets to bind
* a single port at the same time.
- * Derek Atkins <derek@ihtfp.com>: Add Encapulation Support
- *
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
+ * Derek Atkins <derek@ihtfp.com>: Add Encapsulation Support
+ * James Chapman : Add L2TP encapsulation type.
*/
-
-#include <asm/system.h>
-#include <asm/uaccess.h>
+
+#define pr_fmt(fmt) "UDP: " fmt
+
+#include <linux/bpf-cgroup.h>
+#include <linux/uaccess.h>
#include <asm/ioctls.h>
+#include <linux/memblock.h>
+#include <linux/highmem.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/igmp.h>
+#include <linux/inetdevice.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/mm.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
+#include <linux/slab.h>
+#include <linux/sock_diag.h>
#include <net/tcp_states.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
+#include <net/net_namespace.h>
#include <net/icmp.h>
+#include <net/inet_hashtables.h>
+#include <net/ip.h>
+#include <net/ip_tunnels.h>
#include <net/route.h>
#include <net/checksum.h>
+#include <net/gso.h>
#include <net/xfrm.h>
+#include <trace/events/udp.h>
+#include <linux/static_key.h>
+#include <linux/btf_ids.h>
+#include <trace/events/skb.h>
+#include <net/busy_poll.h>
#include "udp_impl.h"
+#include <net/sock_reuseport.h>
+#include <net/addrconf.h>
+#include <net/udp_tunnel.h>
+#include <net/gro.h>
+#if IS_ENABLED(CONFIG_IPV6)
+#include <net/ipv6_stubs.h>
+#endif
+#include <net/rps.h>
-/*
- * Snmp MIB for the UDP layer
- */
+struct udp_table udp_table __read_mostly;
-DEFINE_SNMP_STAT(struct udp_mib, udp_statistics) __read_mostly;
+long sysctl_udp_mem[3] __read_mostly;
+EXPORT_IPV6_MOD(sysctl_udp_mem);
-struct hlist_head udp_hash[UDP_HTABLE_SIZE];
-DEFINE_RWLOCK(udp_hash_lock);
+DEFINE_PER_CPU(int, udp_memory_per_cpu_fw_alloc);
+EXPORT_PER_CPU_SYMBOL_GPL(udp_memory_per_cpu_fw_alloc);
-static int udp_port_rover;
+#define MAX_UDP_PORTS 65536
+#define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN_PERNET)
-static inline int __udp_lib_lport_inuse(__u16 num, struct hlist_head udptable[])
+static struct udp_table *udp_get_table_prot(struct sock *sk)
{
- struct sock *sk;
- struct hlist_node *node;
+ return sk->sk_prot->h.udp_table ? : sock_net(sk)->ipv4.udp_table;
+}
- sk_for_each(sk, node, &udptable[num & (UDP_HTABLE_SIZE - 1)])
- if (inet_sk(sk)->num == num)
- return 1;
+static int udp_lib_lport_inuse(struct net *net, __u16 num,
+ const struct udp_hslot *hslot,
+ unsigned long *bitmap,
+ struct sock *sk, unsigned int log)
+{
+ kuid_t uid = sk_uid(sk);
+ struct sock *sk2;
+
+ sk_for_each(sk2, &hslot->head) {
+ if (net_eq(sock_net(sk2), net) &&
+ sk2 != sk &&
+ (bitmap || udp_sk(sk2)->udp_port_hash == num) &&
+ (!sk2->sk_reuse || !sk->sk_reuse) &&
+ (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
+ sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
+ inet_rcv_saddr_equal(sk, sk2, true)) {
+ if (sk2->sk_reuseport && sk->sk_reuseport &&
+ !rcu_access_pointer(sk->sk_reuseport_cb) &&
+ uid_eq(uid, sk_uid(sk2))) {
+ if (!bitmap)
+ return 0;
+ } else {
+ if (!bitmap)
+ return 1;
+ __set_bit(udp_sk(sk2)->udp_port_hash >> log,
+ bitmap);
+ }
+ }
+ }
return 0;
}
+/*
+ * Note: we still hold spinlock of primary hash chain, so no other writer
+ * can insert/delete a socket with local_port == num
+ */
+static int udp_lib_lport_inuse2(struct net *net, __u16 num,
+ struct udp_hslot *hslot2,
+ struct sock *sk)
+{
+ kuid_t uid = sk_uid(sk);
+ struct sock *sk2;
+ int res = 0;
+
+ spin_lock(&hslot2->lock);
+ udp_portaddr_for_each_entry(sk2, &hslot2->head) {
+ if (net_eq(sock_net(sk2), net) &&
+ sk2 != sk &&
+ (udp_sk(sk2)->udp_port_hash == num) &&
+ (!sk2->sk_reuse || !sk->sk_reuse) &&
+ (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
+ sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
+ inet_rcv_saddr_equal(sk, sk2, true)) {
+ if (sk2->sk_reuseport && sk->sk_reuseport &&
+ !rcu_access_pointer(sk->sk_reuseport_cb) &&
+ uid_eq(uid, sk_uid(sk2))) {
+ res = 0;
+ } else {
+ res = 1;
+ }
+ break;
+ }
+ }
+ spin_unlock(&hslot2->lock);
+ return res;
+}
+
+static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot)
+{
+ struct net *net = sock_net(sk);
+ kuid_t uid = sk_uid(sk);
+ struct sock *sk2;
+
+ sk_for_each(sk2, &hslot->head) {
+ if (net_eq(sock_net(sk2), net) &&
+ sk2 != sk &&
+ sk2->sk_family == sk->sk_family &&
+ ipv6_only_sock(sk2) == ipv6_only_sock(sk) &&
+ (udp_sk(sk2)->udp_port_hash == udp_sk(sk)->udp_port_hash) &&
+ (sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
+ sk2->sk_reuseport && uid_eq(uid, sk_uid(sk2)) &&
+ inet_rcv_saddr_equal(sk, sk2, false)) {
+ return reuseport_add_sock(sk, sk2,
+ inet_rcv_saddr_any(sk));
+ }
+ }
+
+ return reuseport_alloc(sk, inet_rcv_saddr_any(sk));
+}
+
/**
- * __udp_lib_get_port - UDP/-Lite port lookup for IPv4 and IPv6
+ * udp_lib_get_port - UDP/-Lite port lookup for IPv4 and IPv6
*
* @sk: socket struct in question
* @snum: port number to look up
- * @udptable: hash list table, must be of UDP_HTABLE_SIZE
- * @port_rover: pointer to record of last unallocated port
- * @saddr_comp: AF-dependent comparison of bound local IP addresses
+ * @hash2_nulladdr: AF-dependent hash value in secondary hash chains,
+ * with NULL address
*/
-int __udp_lib_get_port(struct sock *sk, unsigned short snum,
- struct hlist_head udptable[], int *port_rover,
- int (*saddr_comp)(const struct sock *sk1,
- const struct sock *sk2 ) )
+int udp_lib_get_port(struct sock *sk, unsigned short snum,
+ unsigned int hash2_nulladdr)
{
- struct hlist_node *node;
- struct hlist_head *head;
- struct sock *sk2;
- int error = 1;
-
- write_lock_bh(&udp_hash_lock);
- if (snum == 0) {
- int best_size_so_far, best, result, i;
-
- if (*port_rover > sysctl_local_port_range[1] ||
- *port_rover < sysctl_local_port_range[0])
- *port_rover = sysctl_local_port_range[0];
- best_size_so_far = 32767;
- best = result = *port_rover;
- for (i = 0; i < UDP_HTABLE_SIZE; i++, result++) {
- int size;
-
- head = &udptable[result & (UDP_HTABLE_SIZE - 1)];
- if (hlist_empty(head)) {
- if (result > sysctl_local_port_range[1])
- result = sysctl_local_port_range[0] +
- ((result - sysctl_local_port_range[0]) &
- (UDP_HTABLE_SIZE - 1));
- goto gotit;
+ struct udp_table *udptable = udp_get_table_prot(sk);
+ struct udp_hslot *hslot, *hslot2;
+ struct net *net = sock_net(sk);
+ int error = -EADDRINUSE;
+
+ if (!snum) {
+ DECLARE_BITMAP(bitmap, PORTS_PER_CHAIN);
+ unsigned short first, last;
+ int low, high, remaining;
+ unsigned int rand;
+
+ inet_sk_get_local_port_range(sk, &low, &high);
+ remaining = (high - low) + 1;
+
+ rand = get_random_u32();
+ first = reciprocal_scale(rand, remaining) + low;
+ /*
+ * force rand to be an odd multiple of UDP_HTABLE_SIZE
+ */
+ rand = (rand | 1) * (udptable->mask + 1);
+ last = first + udptable->mask + 1;
+ do {
+ hslot = udp_hashslot(udptable, net, first);
+ bitmap_zero(bitmap, PORTS_PER_CHAIN);
+ spin_lock_bh(&hslot->lock);
+ udp_lib_lport_inuse(net, snum, hslot, bitmap, sk,
+ udptable->log);
+
+ snum = first;
+ /*
+ * Iterate on all possible values of snum for this hash.
+ * Using steps of an odd multiple of UDP_HTABLE_SIZE
+ * give us randomization and full range coverage.
+ */
+ do {
+ if (low <= snum && snum <= high &&
+ !test_bit(snum >> udptable->log, bitmap) &&
+ !inet_is_local_reserved_port(net, snum))
+ goto found;
+ snum += rand;
+ } while (snum != first);
+ spin_unlock_bh(&hslot->lock);
+ cond_resched();
+ } while (++first != last);
+ goto fail;
+ } else {
+ hslot = udp_hashslot(udptable, net, snum);
+ spin_lock_bh(&hslot->lock);
+ if (hslot->count > 10) {
+ int exist;
+ unsigned int slot2 = udp_sk(sk)->udp_portaddr_hash ^ snum;
+
+ slot2 &= udptable->mask;
+ hash2_nulladdr &= udptable->mask;
+
+ hslot2 = udp_hashslot2(udptable, slot2);
+ if (hslot->count < hslot2->count)
+ goto scan_primary_hash;
+
+ exist = udp_lib_lport_inuse2(net, snum, hslot2, sk);
+ if (!exist && (hash2_nulladdr != slot2)) {
+ hslot2 = udp_hashslot2(udptable, hash2_nulladdr);
+ exist = udp_lib_lport_inuse2(net, snum, hslot2,
+ sk);
}
- size = 0;
- sk_for_each(sk2, node, head)
- if (++size < best_size_so_far) {
- best_size_so_far = size;
- best = result;
- }
- }
- result = best;
- for(i = 0; i < (1 << 16) / UDP_HTABLE_SIZE; i++, result += UDP_HTABLE_SIZE) {
- if (result > sysctl_local_port_range[1])
- result = sysctl_local_port_range[0]
- + ((result - sysctl_local_port_range[0]) &
- (UDP_HTABLE_SIZE - 1));
- if (! __udp_lib_lport_inuse(result, udptable))
- break;
+ if (exist)
+ goto fail_unlock;
+ else
+ goto found;
}
- if (i >= (1 << 16) / UDP_HTABLE_SIZE)
- goto fail;
-gotit:
- *port_rover = snum = result;
- } else {
- head = &udptable[snum & (UDP_HTABLE_SIZE - 1)];
-
- sk_for_each(sk2, node, head)
- if (inet_sk(sk2)->num == snum &&
- sk2 != sk &&
- (!sk2->sk_reuse || !sk->sk_reuse) &&
- (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if
- || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
- (*saddr_comp)(sk, sk2) )
- goto fail;
- }
- inet_sk(sk)->num = snum;
+scan_primary_hash:
+ if (udp_lib_lport_inuse(net, snum, hslot, NULL, sk, 0))
+ goto fail_unlock;
+ }
+found:
+ inet_sk(sk)->inet_num = snum;
+ udp_sk(sk)->udp_port_hash = snum;
+ udp_sk(sk)->udp_portaddr_hash ^= snum;
if (sk_unhashed(sk)) {
- head = &udptable[snum & (UDP_HTABLE_SIZE - 1)];
- sk_add_node(sk, head);
- sock_prot_inc_use(sk->sk_prot);
+ if (sk->sk_reuseport &&
+ udp_reuseport_add_sock(sk, hslot)) {
+ inet_sk(sk)->inet_num = 0;
+ udp_sk(sk)->udp_port_hash = 0;
+ udp_sk(sk)->udp_portaddr_hash ^= snum;
+ goto fail_unlock;
+ }
+
+ sock_set_flag(sk, SOCK_RCU_FREE);
+
+ sk_add_node_rcu(sk, &hslot->head);
+ hslot->count++;
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+
+ hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
+ spin_lock(&hslot2->lock);
+ if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport &&
+ sk->sk_family == AF_INET6)
+ hlist_add_tail_rcu(&udp_sk(sk)->udp_portaddr_node,
+ &hslot2->head);
+ else
+ hlist_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
+ &hslot2->head);
+ hslot2->count++;
+ spin_unlock(&hslot2->lock);
}
+
error = 0;
+fail_unlock:
+ spin_unlock_bh(&hslot->lock);
fail:
- write_unlock_bh(&udp_hash_lock);
return error;
}
+EXPORT_IPV6_MOD(udp_lib_get_port);
-__inline__ int udp_get_port(struct sock *sk, unsigned short snum,
- int (*scmp)(const struct sock *, const struct sock *))
+int udp_v4_get_port(struct sock *sk, unsigned short snum)
{
- return __udp_lib_get_port(sk, snum, udp_hash, &udp_port_rover, scmp);
+ unsigned int hash2_nulladdr =
+ ipv4_portaddr_hash(sock_net(sk), htonl(INADDR_ANY), snum);
+ unsigned int hash2_partial =
+ ipv4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, 0);
+
+ /* precompute partial secondary hash */
+ udp_sk(sk)->udp_portaddr_hash = hash2_partial;
+ return udp_lib_get_port(sk, snum, hash2_nulladdr);
}
-inline int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2)
+static int compute_score(struct sock *sk, const struct net *net,
+ __be32 saddr, __be16 sport,
+ __be32 daddr, unsigned short hnum,
+ int dif, int sdif)
{
- struct inet_sock *inet1 = inet_sk(sk1), *inet2 = inet_sk(sk2);
+ int score;
+ struct inet_sock *inet;
+ bool dev_match;
+
+ if (!net_eq(sock_net(sk), net) ||
+ udp_sk(sk)->udp_port_hash != hnum ||
+ ipv6_only_sock(sk))
+ return -1;
+
+ if (sk->sk_rcv_saddr != daddr)
+ return -1;
+
+ score = (sk->sk_family == PF_INET) ? 2 : 1;
- return ( !ipv6_only_sock(sk2) &&
- (!inet1->rcv_saddr || !inet2->rcv_saddr ||
- inet1->rcv_saddr == inet2->rcv_saddr ));
+ inet = inet_sk(sk);
+ if (inet->inet_daddr) {
+ if (inet->inet_daddr != saddr)
+ return -1;
+ score += 4;
+ }
+
+ if (inet->inet_dport) {
+ if (inet->inet_dport != sport)
+ return -1;
+ score += 4;
+ }
+
+ dev_match = udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if,
+ dif, sdif);
+ if (!dev_match)
+ return -1;
+ if (sk->sk_bound_dev_if)
+ score += 4;
+
+ if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id())
+ score++;
+ return score;
}
-static inline int udp_v4_get_port(struct sock *sk, unsigned short snum)
+u32 udp_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport,
+ const __be32 faddr, const __be16 fport)
{
- return udp_get_port(sk, snum, ipv4_rcv_saddr_equal);
+ net_get_random_once(&udp_ehash_secret, sizeof(udp_ehash_secret));
+
+ return __inet_ehashfn(laddr, lport, faddr, fport,
+ udp_ehash_secret + net_hash_mix(net));
}
+EXPORT_IPV6_MOD(udp_ehashfn);
-/* UDP is nearly always wildcards out the wazoo, it makes no sense to try
- * harder than this. -DaveM
+/**
+ * udp4_lib_lookup1() - Simplified lookup using primary hash (destination port)
+ * @net: Network namespace
+ * @saddr: Source address, network order
+ * @sport: Source port, network order
+ * @daddr: Destination address, network order
+ * @hnum: Destination port, host order
+ * @dif: Destination interface index
+ * @sdif: Destination bridge port index, if relevant
+ * @udptable: Set of UDP hash tables
+ *
+ * Simplified lookup to be used as fallback if no sockets are found due to a
+ * potential race between (receive) address change, and lookup happening before
+ * the rehash operation. This function ignores SO_REUSEPORT groups while scoring
+ * result sockets, because if we have one, we don't need the fallback at all.
+ *
+ * Called under rcu_read_lock().
+ *
+ * Return: socket with highest matching score if any, NULL if none
*/
-static struct sock *__udp4_lib_lookup(__be32 saddr, __be16 sport,
- __be32 daddr, __be16 dport,
- int dif, struct hlist_head udptable[])
+static struct sock *udp4_lib_lookup1(const struct net *net,
+ __be32 saddr, __be16 sport,
+ __be32 daddr, unsigned int hnum,
+ int dif, int sdif,
+ const struct udp_table *udptable)
{
+ unsigned int slot = udp_hashfn(net, hnum, udptable->mask);
+ struct udp_hslot *hslot = &udptable->hash[slot];
struct sock *sk, *result = NULL;
- struct hlist_node *node;
- unsigned short hnum = ntohs(dport);
- int badness = -1;
-
- read_lock(&udp_hash_lock);
- sk_for_each(sk, node, &udptable[hnum & (UDP_HTABLE_SIZE - 1)]) {
- struct inet_sock *inet = inet_sk(sk);
-
- if (inet->num == hnum && !ipv6_only_sock(sk)) {
- int score = (sk->sk_family == PF_INET ? 1 : 0);
- if (inet->rcv_saddr) {
- if (inet->rcv_saddr != daddr)
- continue;
- score+=2;
- }
- if (inet->daddr) {
- if (inet->daddr != saddr)
- continue;
- score+=2;
- }
- if (inet->dport) {
- if (inet->dport != sport)
- continue;
- score+=2;
- }
- if (sk->sk_bound_dev_if) {
- if (sk->sk_bound_dev_if != dif)
- continue;
- score+=2;
- }
- if(score == 9) {
+ int score, badness = 0;
+
+ sk_for_each_rcu(sk, &hslot->head) {
+ score = compute_score(sk, net,
+ saddr, sport, daddr, hnum, dif, sdif);
+ if (score > badness) {
+ result = sk;
+ badness = score;
+ }
+ }
+
+ return result;
+}
+
+/* called with rcu_read_lock() */
+static struct sock *udp4_lib_lookup2(const struct net *net,
+ __be32 saddr, __be16 sport,
+ __be32 daddr, unsigned int hnum,
+ int dif, int sdif,
+ struct udp_hslot *hslot2,
+ struct sk_buff *skb)
+{
+ struct sock *sk, *result;
+ int score, badness;
+ bool need_rescore;
+
+ result = NULL;
+ badness = 0;
+ udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
+ need_rescore = false;
+rescore:
+ score = compute_score(need_rescore ? result : sk, net, saddr,
+ sport, daddr, hnum, dif, sdif);
+ if (score > badness) {
+ badness = score;
+
+ if (need_rescore)
+ continue;
+
+ if (sk->sk_state == TCP_ESTABLISHED) {
result = sk;
- break;
- } else if(score > badness) {
+ continue;
+ }
+
+ result = inet_lookup_reuseport(net, sk, skb, sizeof(struct udphdr),
+ saddr, sport, daddr, hnum, udp_ehashfn);
+ if (!result) {
result = sk;
- badness = score;
+ continue;
}
+
+ /* Fall back to scoring if group has connections */
+ if (!reuseport_has_conns(sk))
+ return result;
+
+ /* Reuseport logic returned an error, keep original score. */
+ if (IS_ERR(result))
+ continue;
+
+ /* compute_score is too long of a function to be
+ * inlined, and calling it again here yields
+ * measurable overhead for some
+ * workloads. Work around it by jumping
+ * backwards to rescore 'result'.
+ */
+ need_rescore = true;
+ goto rescore;
}
}
+ return result;
+}
+
+#if IS_ENABLED(CONFIG_BASE_SMALL)
+static struct sock *udp4_lib_lookup4(const struct net *net,
+ __be32 saddr, __be16 sport,
+ __be32 daddr, unsigned int hnum,
+ int dif, int sdif,
+ struct udp_table *udptable)
+{
+ return NULL;
+}
+
+static void udp_rehash4(struct udp_table *udptable, struct sock *sk,
+ u16 newhash4)
+{
+}
+
+static void udp_unhash4(struct udp_table *udptable, struct sock *sk)
+{
+}
+#else /* !CONFIG_BASE_SMALL */
+static struct sock *udp4_lib_lookup4(const struct net *net,
+ __be32 saddr, __be16 sport,
+ __be32 daddr, unsigned int hnum,
+ int dif, int sdif,
+ struct udp_table *udptable)
+{
+ const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
+ const struct hlist_nulls_node *node;
+ struct udp_hslot *hslot4;
+ unsigned int hash4, slot;
+ struct udp_sock *up;
+ struct sock *sk;
+
+ hash4 = udp_ehashfn(net, daddr, hnum, saddr, sport);
+ slot = hash4 & udptable->mask;
+ hslot4 = &udptable->hash4[slot];
+ INET_ADDR_COOKIE(acookie, saddr, daddr);
+
+begin:
+ /* SLAB_TYPESAFE_BY_RCU not used, so we don't need to touch sk_refcnt */
+ udp_lrpa_for_each_entry_rcu(up, node, &hslot4->nulls_head) {
+ sk = (struct sock *)up;
+ if (inet_match(net, sk, acookie, ports, dif, sdif))
+ return sk;
+ }
+
+ /* if the nulls value we got at the end of this lookup is not the
+ * expected one, we must restart lookup. We probably met an item that
+ * was moved to another chain due to rehash.
+ */
+ if (get_nulls_value(node) != slot)
+ goto begin;
+
+ return NULL;
+}
+
+/* udp_rehash4() only checks hslot4, and hash4_cnt is not processed. */
+static void udp_rehash4(struct udp_table *udptable, struct sock *sk,
+ u16 newhash4)
+{
+ struct udp_hslot *hslot4, *nhslot4;
+
+ hslot4 = udp_hashslot4(udptable, udp_sk(sk)->udp_lrpa_hash);
+ nhslot4 = udp_hashslot4(udptable, newhash4);
+ udp_sk(sk)->udp_lrpa_hash = newhash4;
+
+ if (hslot4 != nhslot4) {
+ spin_lock_bh(&hslot4->lock);
+ hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_lrpa_node);
+ hslot4->count--;
+ spin_unlock_bh(&hslot4->lock);
+
+ spin_lock_bh(&nhslot4->lock);
+ hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_lrpa_node,
+ &nhslot4->nulls_head);
+ nhslot4->count++;
+ spin_unlock_bh(&nhslot4->lock);
+ }
+}
+
+static void udp_unhash4(struct udp_table *udptable, struct sock *sk)
+{
+ struct udp_hslot *hslot2, *hslot4;
+
+ if (udp_hashed4(sk)) {
+ hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
+ hslot4 = udp_hashslot4(udptable, udp_sk(sk)->udp_lrpa_hash);
+
+ spin_lock(&hslot4->lock);
+ hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_lrpa_node);
+ hslot4->count--;
+ spin_unlock(&hslot4->lock);
+
+ spin_lock(&hslot2->lock);
+ udp_hash4_dec(hslot2);
+ spin_unlock(&hslot2->lock);
+ }
+}
+
+void udp_lib_hash4(struct sock *sk, u16 hash)
+{
+ struct udp_hslot *hslot, *hslot2, *hslot4;
+ struct net *net = sock_net(sk);
+ struct udp_table *udptable;
+
+ /* Connected udp socket can re-connect to another remote address, which
+ * will be handled by rehash. Thus no need to redo hash4 here.
+ */
+ if (udp_hashed4(sk))
+ return;
+
+ udptable = net->ipv4.udp_table;
+ hslot = udp_hashslot(udptable, net, udp_sk(sk)->udp_port_hash);
+ hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
+ hslot4 = udp_hashslot4(udptable, hash);
+ udp_sk(sk)->udp_lrpa_hash = hash;
+
+ spin_lock_bh(&hslot->lock);
+ if (rcu_access_pointer(sk->sk_reuseport_cb))
+ reuseport_detach_sock(sk);
+
+ spin_lock(&hslot4->lock);
+ hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_lrpa_node,
+ &hslot4->nulls_head);
+ hslot4->count++;
+ spin_unlock(&hslot4->lock);
+
+ spin_lock(&hslot2->lock);
+ udp_hash4_inc(hslot2);
+ spin_unlock(&hslot2->lock);
+
+ spin_unlock_bh(&hslot->lock);
+}
+EXPORT_IPV6_MOD(udp_lib_hash4);
+
+/* call with sock lock */
+void udp4_hash4(struct sock *sk)
+{
+ struct net *net = sock_net(sk);
+ unsigned int hash;
+
+ if (sk_unhashed(sk) || sk->sk_rcv_saddr == htonl(INADDR_ANY))
+ return;
+
+ hash = udp_ehashfn(net, sk->sk_rcv_saddr, sk->sk_num,
+ sk->sk_daddr, sk->sk_dport);
+
+ udp_lib_hash4(sk, hash);
+}
+EXPORT_IPV6_MOD(udp4_hash4);
+#endif /* CONFIG_BASE_SMALL */
+
+/* UDP is nearly always wildcards out the wazoo, it makes no sense to try
+ * harder than this. -DaveM
+ */
+struct sock *__udp4_lib_lookup(const struct net *net, __be32 saddr,
+ __be16 sport, __be32 daddr, __be16 dport, int dif,
+ int sdif, struct udp_table *udptable, struct sk_buff *skb)
+{
+ unsigned short hnum = ntohs(dport);
+ struct udp_hslot *hslot2;
+ struct sock *result, *sk;
+ unsigned int hash2;
+
+ hash2 = ipv4_portaddr_hash(net, daddr, hnum);
+ hslot2 = udp_hashslot2(udptable, hash2);
+
+ if (udp_has_hash4(hslot2)) {
+ result = udp4_lib_lookup4(net, saddr, sport, daddr, hnum,
+ dif, sdif, udptable);
+ if (result) /* udp4_lib_lookup4 return sk or NULL */
+ return result;
+ }
+
+ /* Lookup connected or non-wildcard socket */
+ result = udp4_lib_lookup2(net, saddr, sport,
+ daddr, hnum, dif, sdif,
+ hslot2, skb);
+ if (!IS_ERR_OR_NULL(result) && result->sk_state == TCP_ESTABLISHED)
+ goto done;
+
+ /* Lookup redirect from BPF */
+ if (static_branch_unlikely(&bpf_sk_lookup_enabled) &&
+ udptable == net->ipv4.udp_table) {
+ sk = inet_lookup_run_sk_lookup(net, IPPROTO_UDP, skb, sizeof(struct udphdr),
+ saddr, sport, daddr, hnum, dif,
+ udp_ehashfn);
+ if (sk) {
+ result = sk;
+ goto done;
+ }
+ }
+
+ /* Got non-wildcard socket or error on first lookup */
if (result)
- sock_hold(result);
- read_unlock(&udp_hash_lock);
+ goto done;
+
+ /* Lookup wildcard sockets */
+ hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
+ hslot2 = udp_hashslot2(udptable, hash2);
+
+ result = udp4_lib_lookup2(net, saddr, sport,
+ htonl(INADDR_ANY), hnum, dif, sdif,
+ hslot2, skb);
+ if (!IS_ERR_OR_NULL(result))
+ goto done;
+
+ /* Primary hash (destination port) lookup as fallback for this race:
+ * 1. __ip4_datagram_connect() sets sk_rcv_saddr
+ * 2. lookup (this function): new sk_rcv_saddr, hashes not updated yet
+ * 3. rehash operation updating _secondary and four-tuple_ hashes
+ * The primary hash doesn't need an update after 1., so, thanks to this
+ * further step, 1. and 3. don't need to be atomic against the lookup.
+ */
+ result = udp4_lib_lookup1(net, saddr, sport, daddr, hnum, dif, sdif,
+ udptable);
+
+done:
+ if (IS_ERR(result))
+ return NULL;
return result;
}
+EXPORT_SYMBOL_GPL(__udp4_lib_lookup);
-static inline struct sock *udp_v4_mcast_next(struct sock *sk,
- __be16 loc_port, __be32 loc_addr,
- __be16 rmt_port, __be32 rmt_addr,
- int dif)
+static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb,
+ __be16 sport, __be16 dport,
+ struct udp_table *udptable)
{
- struct hlist_node *node;
- struct sock *s = sk;
- unsigned short hnum = ntohs(loc_port);
+ const struct iphdr *iph = ip_hdr(skb);
- sk_for_each_from(s, node) {
- struct inet_sock *inet = inet_sk(s);
+ return __udp4_lib_lookup(dev_net(skb->dev), iph->saddr, sport,
+ iph->daddr, dport, inet_iif(skb),
+ inet_sdif(skb), udptable, skb);
+}
- if (inet->num != hnum ||
- (inet->daddr && inet->daddr != rmt_addr) ||
- (inet->dport != rmt_port && inet->dport) ||
- (inet->rcv_saddr && inet->rcv_saddr != loc_addr) ||
- ipv6_only_sock(s) ||
- (s->sk_bound_dev_if && s->sk_bound_dev_if != dif))
- continue;
- if (!ip_mc_sf_allow(s, loc_addr, rmt_addr, dif))
+struct sock *udp4_lib_lookup_skb(const struct sk_buff *skb,
+ __be16 sport, __be16 dport)
+{
+ const u16 offset = NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation];
+ const struct iphdr *iph = (struct iphdr *)(skb->data + offset);
+ struct net *net = dev_net(skb->dev);
+ int iif, sdif;
+
+ inet_get_iif_sdif(skb, &iif, &sdif);
+
+ return __udp4_lib_lookup(net, iph->saddr, sport,
+ iph->daddr, dport, iif,
+ sdif, net->ipv4.udp_table, NULL);
+}
+
+/* Must be called under rcu_read_lock().
+ * Does increment socket refcount.
+ */
+#if IS_ENABLED(CONFIG_NF_TPROXY_IPV4) || IS_ENABLED(CONFIG_NF_SOCKET_IPV4)
+struct sock *udp4_lib_lookup(const struct net *net, __be32 saddr, __be16 sport,
+ __be32 daddr, __be16 dport, int dif)
+{
+ struct sock *sk;
+
+ sk = __udp4_lib_lookup(net, saddr, sport, daddr, dport,
+ dif, 0, net->ipv4.udp_table, NULL);
+ if (sk && !refcount_inc_not_zero(&sk->sk_refcnt))
+ sk = NULL;
+ return sk;
+}
+EXPORT_SYMBOL_GPL(udp4_lib_lookup);
+#endif
+
+static inline bool __udp_is_mcast_sock(struct net *net, const struct sock *sk,
+ __be16 loc_port, __be32 loc_addr,
+ __be16 rmt_port, __be32 rmt_addr,
+ int dif, int sdif, unsigned short hnum)
+{
+ const struct inet_sock *inet = inet_sk(sk);
+
+ if (!net_eq(sock_net(sk), net) ||
+ udp_sk(sk)->udp_port_hash != hnum ||
+ (inet->inet_daddr && inet->inet_daddr != rmt_addr) ||
+ (inet->inet_dport != rmt_port && inet->inet_dport) ||
+ (inet->inet_rcv_saddr && inet->inet_rcv_saddr != loc_addr) ||
+ ipv6_only_sock(sk) ||
+ !udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif))
+ return false;
+ if (!ip_mc_sf_allow(sk, loc_addr, rmt_addr, dif, sdif))
+ return false;
+ return true;
+}
+
+DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key);
+EXPORT_IPV6_MOD(udp_encap_needed_key);
+
+#if IS_ENABLED(CONFIG_IPV6)
+DEFINE_STATIC_KEY_FALSE(udpv6_encap_needed_key);
+EXPORT_IPV6_MOD(udpv6_encap_needed_key);
+#endif
+
+void udp_encap_enable(void)
+{
+ static_branch_inc(&udp_encap_needed_key);
+}
+EXPORT_SYMBOL(udp_encap_enable);
+
+void udp_encap_disable(void)
+{
+ static_branch_dec(&udp_encap_needed_key);
+}
+EXPORT_SYMBOL(udp_encap_disable);
+
+/* Handler for tunnels with arbitrary destination ports: no socket lookup, go
+ * through error handlers in encapsulations looking for a match.
+ */
+static int __udp4_lib_err_encap_no_sk(struct sk_buff *skb, u32 info)
+{
+ int i;
+
+ for (i = 0; i < MAX_IPTUN_ENCAP_OPS; i++) {
+ int (*handler)(struct sk_buff *skb, u32 info);
+ const struct ip_tunnel_encap_ops *encap;
+
+ encap = rcu_dereference(iptun_encaps[i]);
+ if (!encap)
continue;
- goto found;
- }
- s = NULL;
-found:
- return s;
+ handler = encap->err_handler;
+ if (handler && !handler(skb, info))
+ return 0;
+ }
+
+ return -ENOENT;
+}
+
+/* Try to match ICMP errors to UDP tunnels by looking up a socket without
+ * reversing source and destination port: this will match tunnels that force the
+ * same destination port on both endpoints (e.g. VXLAN, GENEVE). Note that
+ * lwtunnels might actually break this assumption by being configured with
+ * different destination ports on endpoints, in this case we won't be able to
+ * trace ICMP messages back to them.
+ *
+ * If this doesn't match any socket, probe tunnels with arbitrary destination
+ * ports (e.g. FoU, GUE): there, the receiving socket is useless, as the port
+ * we've sent packets to won't necessarily match the local destination port.
+ *
+ * Then ask the tunnel implementation to match the error against a valid
+ * association.
+ *
+ * Return an error if we can't find a match, the socket if we need further
+ * processing, zero otherwise.
+ */
+static struct sock *__udp4_lib_err_encap(struct net *net,
+ const struct iphdr *iph,
+ struct udphdr *uh,
+ struct udp_table *udptable,
+ struct sock *sk,
+ struct sk_buff *skb, u32 info)
+{
+ int (*lookup)(struct sock *sk, struct sk_buff *skb);
+ int network_offset, transport_offset;
+ struct udp_sock *up;
+
+ network_offset = skb_network_offset(skb);
+ transport_offset = skb_transport_offset(skb);
+
+ /* Network header needs to point to the outer IPv4 header inside ICMP */
+ skb_reset_network_header(skb);
+
+ /* Transport header needs to point to the UDP header */
+ skb_set_transport_header(skb, iph->ihl << 2);
+
+ if (sk) {
+ up = udp_sk(sk);
+
+ lookup = READ_ONCE(up->encap_err_lookup);
+ if (lookup && lookup(sk, skb))
+ sk = NULL;
+
+ goto out;
+ }
+
+ sk = __udp4_lib_lookup(net, iph->daddr, uh->source,
+ iph->saddr, uh->dest, skb->dev->ifindex, 0,
+ udptable, NULL);
+ if (sk) {
+ up = udp_sk(sk);
+
+ lookup = READ_ONCE(up->encap_err_lookup);
+ if (!lookup || lookup(sk, skb))
+ sk = NULL;
+ }
+
+out:
+ if (!sk)
+ sk = ERR_PTR(__udp4_lib_err_encap_no_sk(skb, info));
+
+ skb_set_transport_header(skb, transport_offset);
+ skb_set_network_header(skb, network_offset);
+
+ return sk;
}
/*
* This routine is called by the ICMP module when it gets some
* sort of error condition. If err < 0 then the socket should
* be closed and the error returned to the user. If err > 0
- * it's just the icmp type << 8 | icmp code.
+ * it's just the icmp type << 8 | icmp code.
* Header points to the ip header of the error packet. We move
* on past this. Then (as it used to claim before adjustment)
* header points to the first 8 bytes of the udp header. We need
* to find the appropriate port.
*/
-void __udp4_lib_err(struct sk_buff *skb, u32 info, struct hlist_head udptable[])
+int __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
{
struct inet_sock *inet;
- struct iphdr *iph = (struct iphdr*)skb->data;
- struct udphdr *uh = (struct udphdr*)(skb->data+(iph->ihl<<2));
- int type = skb->h.icmph->type;
- int code = skb->h.icmph->code;
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
+ struct udphdr *uh = (struct udphdr *)(skb->data+(iph->ihl<<2));
+ const int type = icmp_hdr(skb)->type;
+ const int code = icmp_hdr(skb)->code;
+ bool tunnel = false;
struct sock *sk;
int harderr;
int err;
+ struct net *net = dev_net(skb->dev);
+
+ sk = __udp4_lib_lookup(net, iph->daddr, uh->dest,
+ iph->saddr, uh->source, skb->dev->ifindex,
+ inet_sdif(skb), udptable, NULL);
+
+ if (!sk || READ_ONCE(udp_sk(sk)->encap_type)) {
+ /* No socket for error: try tunnels before discarding */
+ if (static_branch_unlikely(&udp_encap_needed_key)) {
+ sk = __udp4_lib_err_encap(net, iph, uh, udptable, sk, skb,
+ info);
+ if (!sk)
+ return 0;
+ } else
+ sk = ERR_PTR(-ENOENT);
+
+ if (IS_ERR(sk)) {
+ __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
+ return PTR_ERR(sk);
+ }
- sk = __udp4_lib_lookup(iph->daddr, uh->dest, iph->saddr, uh->source,
- skb->dev->ifindex, udptable );
- if (sk == NULL) {
- ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
- return; /* No socket for error */
+ tunnel = true;
}
err = 0;
@@ -355,7 +980,8 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct hlist_head udptable[])
break;
case ICMP_DEST_UNREACH:
if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */
- if (inet->pmtudisc != IP_PMTUDISC_DONT) {
+ ipv4_sk_update_pmtu(skb, sk, info);
+ if (READ_ONCE(inet->pmtudisc) != IP_PMTUDISC_DONT) {
err = EMSGSIZE;
harderr = 1;
break;
@@ -368,172 +994,318 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct hlist_head udptable[])
err = icmp_err_convert[code].errno;
}
break;
+ case ICMP_REDIRECT:
+ ipv4_sk_redirect(skb, sk);
+ goto out;
}
/*
- * RFC1122: OK. Passes ICMP errors back to application, as per
+ * RFC1122: OK. Passes ICMP errors back to application, as per
* 4.1.3.3.
*/
- if (!inet->recverr) {
+ if (tunnel) {
+ /* ...not for tunnels though: we don't have a sending socket */
+ if (udp_sk(sk)->encap_err_rcv)
+ udp_sk(sk)->encap_err_rcv(sk, skb, err, uh->dest, info,
+ (u8 *)(uh+1));
+ goto out;
+ }
+ if (!inet_test_bit(RECVERR, sk)) {
if (!harderr || sk->sk_state != TCP_ESTABLISHED)
goto out;
- } else {
- ip_icmp_error(sk, skb, err, uh->dest, info, (u8*)(uh+1));
- }
+ } else
+ ip_icmp_error(sk, skb, err, uh->dest, info, (u8 *)(uh+1));
+
sk->sk_err = err;
- sk->sk_error_report(sk);
+ sk_error_report(sk);
out:
- sock_put(sk);
+ return 0;
}
-__inline__ void udp_err(struct sk_buff *skb, u32 info)
+int udp_err(struct sk_buff *skb, u32 info)
{
- return __udp4_lib_err(skb, info, udp_hash);
+ return __udp4_lib_err(skb, info, dev_net(skb->dev)->ipv4.udp_table);
}
/*
* Throw away all pending data and cancel the corking. Socket is locked.
*/
-static void udp_flush_pending_frames(struct sock *sk)
+void udp_flush_pending_frames(struct sock *sk)
{
struct udp_sock *up = udp_sk(sk);
if (up->pending) {
up->len = 0;
- up->pending = 0;
+ WRITE_ONCE(up->pending, 0);
ip_flush_pending_frames(sk);
}
}
+EXPORT_IPV6_MOD(udp_flush_pending_frames);
/**
- * udp4_hwcsum_outgoing - handle outgoing HW checksumming
- * @sk: socket we are sending on
+ * udp4_hwcsum - handle outgoing HW checksumming
* @skb: sk_buff containing the filled-in UDP header
* (checksum field must be zeroed out)
+ * @src: source IP address
+ * @dst: destination IP address
*/
-static void udp4_hwcsum_outgoing(struct sock *sk, struct sk_buff *skb,
- __be32 src, __be32 dst, int len )
+void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst)
{
- unsigned int offset;
- struct udphdr *uh = skb->h.uh;
+ struct udphdr *uh = udp_hdr(skb);
+ int offset = skb_transport_offset(skb);
+ int len = skb->len - offset;
+ int hlen = len;
__wsum csum = 0;
- if (skb_queue_len(&sk->sk_write_queue) == 1) {
+ if (!skb_has_frag_list(skb)) {
/*
* Only one fragment on the socket.
*/
+ skb->csum_start = skb_transport_header(skb) - skb->head;
skb->csum_offset = offsetof(struct udphdr, check);
- uh->check = ~csum_tcpudp_magic(src, dst, len, IPPROTO_UDP, 0);
+ uh->check = ~csum_tcpudp_magic(src, dst, len,
+ IPPROTO_UDP, 0);
} else {
+ struct sk_buff *frags;
+
/*
* HW-checksum won't work as there are two or more
* fragments on the socket so that all csums of sk_buffs
* should be together
*/
- offset = skb->h.raw - skb->data;
- skb->csum = skb_checksum(skb, offset, skb->len - offset, 0);
+ skb_walk_frags(skb, frags) {
+ csum = csum_add(csum, frags->csum);
+ hlen -= frags->len;
+ }
+ csum = skb_checksum(skb, offset, hlen, csum);
skb->ip_summed = CHECKSUM_NONE;
- skb_queue_walk(&sk->sk_write_queue, skb) {
- csum = csum_add(csum, skb->csum);
- }
-
uh->check = csum_tcpudp_magic(src, dst, len, IPPROTO_UDP, csum);
if (uh->check == 0)
uh->check = CSUM_MANGLED_0;
}
}
+EXPORT_SYMBOL_GPL(udp4_hwcsum);
-/*
- * Push out all pending data as one UDP datagram. Socket is locked.
+/* Function to set UDP checksum for an IPv4 UDP packet. This is intended
+ * for the simple case like when setting the checksum for a UDP tunnel.
*/
-static int udp_push_pending_frames(struct sock *sk)
+void udp_set_csum(bool nocheck, struct sk_buff *skb,
+ __be32 saddr, __be32 daddr, int len)
{
- struct udp_sock *up = udp_sk(sk);
+ struct udphdr *uh = udp_hdr(skb);
+
+ if (nocheck) {
+ uh->check = 0;
+ } else if (skb_is_gso(skb)) {
+ uh->check = ~udp_v4_check(len, saddr, daddr, 0);
+ } else if (skb->ip_summed == CHECKSUM_PARTIAL) {
+ uh->check = 0;
+ uh->check = udp_v4_check(len, saddr, daddr, lco_csum(skb));
+ if (uh->check == 0)
+ uh->check = CSUM_MANGLED_0;
+ } else {
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ skb->csum_start = skb_transport_header(skb) - skb->head;
+ skb->csum_offset = offsetof(struct udphdr, check);
+ uh->check = ~udp_v4_check(len, saddr, daddr, 0);
+ }
+}
+EXPORT_SYMBOL(udp_set_csum);
+
+static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4,
+ struct inet_cork *cork)
+{
+ struct sock *sk = skb->sk;
struct inet_sock *inet = inet_sk(sk);
- struct flowi *fl = &inet->cork.fl;
- struct sk_buff *skb;
struct udphdr *uh;
- int err = 0;
+ int err;
+ int is_udplite = IS_UDPLITE(sk);
+ int offset = skb_transport_offset(skb);
+ int len = skb->len - offset;
+ int datalen = len - sizeof(*uh);
__wsum csum = 0;
- /* Grab the skbuff where UDP header space exists. */
- if ((skb = skb_peek(&sk->sk_write_queue)) == NULL)
- goto out;
-
/*
* Create a UDP header
*/
- uh = skb->h.uh;
- uh->source = fl->fl_ip_sport;
- uh->dest = fl->fl_ip_dport;
- uh->len = htons(up->len);
+ uh = udp_hdr(skb);
+ uh->source = inet->inet_sport;
+ uh->dest = fl4->fl4_dport;
+ uh->len = htons(len);
uh->check = 0;
- if (up->pcflag) /* UDP-Lite */
- csum = udplite_csum_outgoing(sk, skb);
+ if (cork->gso_size) {
+ const int hlen = skb_network_header_len(skb) +
+ sizeof(struct udphdr);
+
+ if (hlen + min(datalen, cork->gso_size) > cork->fragsize) {
+ kfree_skb(skb);
+ return -EMSGSIZE;
+ }
+ if (datalen > cork->gso_size * UDP_MAX_SEGMENTS) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+ if (sk->sk_no_check_tx) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+ if (is_udplite || dst_xfrm(skb_dst(skb))) {
+ kfree_skb(skb);
+ return -EIO;
+ }
+
+ if (datalen > cork->gso_size) {
+ skb_shinfo(skb)->gso_size = cork->gso_size;
+ skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4;
+ skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(datalen,
+ cork->gso_size);
+
+ /* Don't checksum the payload, skb will get segmented */
+ goto csum_partial;
+ }
+ }
- else if (sk->sk_no_check == UDP_CSUM_NOXMIT) { /* UDP csum disabled */
+ if (is_udplite) /* UDP-Lite */
+ csum = udplite_csum(skb);
+
+ else if (sk->sk_no_check_tx) { /* UDP csum off */
skb->ip_summed = CHECKSUM_NONE;
goto send;
} else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */
+csum_partial:
- udp4_hwcsum_outgoing(sk, skb, fl->fl4_src,fl->fl4_dst, up->len);
+ udp4_hwcsum(skb, fl4->saddr, fl4->daddr);
goto send;
- } else /* `normal' UDP */
- csum = udp_csum_outgoing(sk, skb);
+ } else
+ csum = udp_csum(skb);
/* add protocol-dependent pseudo-header */
- uh->check = csum_tcpudp_magic(fl->fl4_src, fl->fl4_dst, up->len,
- sk->sk_protocol, csum );
+ uh->check = csum_tcpudp_magic(fl4->saddr, fl4->daddr, len,
+ sk->sk_protocol, csum);
if (uh->check == 0)
uh->check = CSUM_MANGLED_0;
send:
- err = ip_push_pending_frames(sk);
+ err = ip_send_skb(sock_net(sk), skb);
+ if (err) {
+ if (err == -ENOBUFS &&
+ !inet_test_bit(RECVERR, sk)) {
+ UDP_INC_STATS(sock_net(sk),
+ UDP_MIB_SNDBUFERRORS, is_udplite);
+ err = 0;
+ }
+ } else
+ UDP_INC_STATS(sock_net(sk),
+ UDP_MIB_OUTDATAGRAMS, is_udplite);
+ return err;
+}
+
+/*
+ * Push out all pending data as one UDP datagram. Socket is locked.
+ */
+int udp_push_pending_frames(struct sock *sk)
+{
+ struct udp_sock *up = udp_sk(sk);
+ struct inet_sock *inet = inet_sk(sk);
+ struct flowi4 *fl4 = &inet->cork.fl.u.ip4;
+ struct sk_buff *skb;
+ int err = 0;
+
+ skb = ip_finish_skb(sk, fl4);
+ if (!skb)
+ goto out;
+
+ err = udp_send_skb(skb, fl4, &inet->cork.base);
+
out:
up->len = 0;
- up->pending = 0;
+ WRITE_ONCE(up->pending, 0);
return err;
}
+EXPORT_IPV6_MOD(udp_push_pending_frames);
+
+static int __udp_cmsg_send(struct cmsghdr *cmsg, u16 *gso_size)
+{
+ switch (cmsg->cmsg_type) {
+ case UDP_SEGMENT:
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(__u16)))
+ return -EINVAL;
+ *gso_size = *(__u16 *)CMSG_DATA(cmsg);
+ return 0;
+ default:
+ return -EINVAL;
+ }
+}
+
+int udp_cmsg_send(struct sock *sk, struct msghdr *msg, u16 *gso_size)
+{
+ struct cmsghdr *cmsg;
+ bool need_ip = false;
+ int err;
+
+ for_each_cmsghdr(cmsg, msg) {
+ if (!CMSG_OK(msg, cmsg))
+ return -EINVAL;
+
+ if (cmsg->cmsg_level != SOL_UDP) {
+ need_ip = true;
+ continue;
+ }
+
+ err = __udp_cmsg_send(cmsg, gso_size);
+ if (err)
+ return err;
+ }
+
+ return need_ip;
+}
+EXPORT_IPV6_MOD_GPL(udp_cmsg_send);
-int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
- size_t len)
+int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
struct inet_sock *inet = inet_sk(sk);
struct udp_sock *up = udp_sk(sk);
+ DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
+ struct flowi4 fl4_stack;
+ struct flowi4 *fl4;
int ulen = len;
struct ipcm_cookie ipc;
struct rtable *rt = NULL;
int free = 0;
int connected = 0;
__be32 daddr, faddr, saddr;
+ u8 scope;
__be16 dport;
- u8 tos;
- int err, is_udplite = up->pcflag;
- int corkreq = up->corkflag || msg->msg_flags&MSG_MORE;
+ int err, is_udplite = IS_UDPLITE(sk);
+ int corkreq = udp_test_bit(CORK, sk) || msg->msg_flags & MSG_MORE;
int (*getfrag)(void *, char *, int, int, int, struct sk_buff *);
+ struct sk_buff *skb;
+ struct ip_options_data opt_copy;
+ int uc_index;
if (len > 0xFFFF)
return -EMSGSIZE;
- /*
+ /*
* Check the flags.
*/
- if (msg->msg_flags&MSG_OOB) /* Mirror BSD error message compatibility */
+ if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message compatibility */
return -EOPNOTSUPP;
- ipc.opt = NULL;
+ getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag;
- if (up->pending) {
+ fl4 = &inet->cork.fl.u.ip4;
+ if (READ_ONCE(up->pending)) {
/*
* There are pending frames.
- * The socket lock must be held while it's corked.
+ * The socket lock must be held while it's corked.
*/
lock_sock(sk);
if (likely(up->pending)) {
@@ -541,17 +1313,16 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
release_sock(sk);
return -EINVAL;
}
- goto do_append_data;
+ goto do_append_data;
}
release_sock(sk);
}
ulen += sizeof(struct udphdr);
/*
- * Get and verify the address.
+ * Get and verify the address.
*/
- if (msg->msg_name) {
- struct sockaddr_in * usin = (struct sockaddr_in*)msg->msg_name;
+ if (usin) {
if (msg->msg_namelen < sizeof(*usin))
return -EINVAL;
if (usin->sin_family != AF_INET) {
@@ -566,85 +1337,153 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
} else {
if (sk->sk_state != TCP_ESTABLISHED)
return -EDESTADDRREQ;
- daddr = inet->daddr;
- dport = inet->dport;
+ daddr = inet->inet_daddr;
+ dport = inet->inet_dport;
/* Open fast path for connected socket.
Route will not be used, if at least one option is set.
*/
connected = 1;
- }
- ipc.addr = inet->saddr;
+ }
+
+ ipcm_init_sk(&ipc, inet);
+ ipc.gso_size = READ_ONCE(up->gso_size);
- ipc.oif = sk->sk_bound_dev_if;
if (msg->msg_controllen) {
- err = ip_cmsg_send(msg, &ipc);
- if (err)
+ err = udp_cmsg_send(sk, msg, &ipc.gso_size);
+ if (err > 0) {
+ err = ip_cmsg_send(sk, msg, &ipc,
+ sk->sk_family == AF_INET6);
+ connected = 0;
+ }
+ if (unlikely(err < 0)) {
+ kfree(ipc.opt);
return err;
+ }
if (ipc.opt)
free = 1;
- connected = 0;
}
- if (!ipc.opt)
- ipc.opt = inet->opt;
+ if (!ipc.opt) {
+ struct ip_options_rcu *inet_opt;
+
+ rcu_read_lock();
+ inet_opt = rcu_dereference(inet->inet_opt);
+ if (inet_opt) {
+ memcpy(&opt_copy, inet_opt,
+ sizeof(*inet_opt) + inet_opt->opt.optlen);
+ ipc.opt = &opt_copy.opt;
+ }
+ rcu_read_unlock();
+ }
+
+ if (cgroup_bpf_enabled(CGROUP_UDP4_SENDMSG) && !connected) {
+ err = BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk,
+ (struct sockaddr *)usin,
+ &msg->msg_namelen,
+ &ipc.addr);
+ if (err)
+ goto out_free;
+ if (usin) {
+ if (usin->sin_port == 0) {
+ /* BPF program set invalid port. Reject it. */
+ err = -EINVAL;
+ goto out_free;
+ }
+ daddr = usin->sin_addr.s_addr;
+ dport = usin->sin_port;
+ }
+ }
saddr = ipc.addr;
ipc.addr = faddr = daddr;
- if (ipc.opt && ipc.opt->srr) {
- if (!daddr)
- return -EINVAL;
- faddr = ipc.opt->faddr;
+ if (ipc.opt && ipc.opt->opt.srr) {
+ if (!daddr) {
+ err = -EINVAL;
+ goto out_free;
+ }
+ faddr = ipc.opt->opt.faddr;
connected = 0;
}
- tos = RT_TOS(inet->tos);
- if (sock_flag(sk, SOCK_LOCALROUTE) ||
- (msg->msg_flags & MSG_DONTROUTE) ||
- (ipc.opt && ipc.opt->is_strictroute)) {
- tos |= RTO_ONLINK;
+ scope = ip_sendmsg_scope(inet, &ipc, msg);
+ if (scope == RT_SCOPE_LINK)
connected = 0;
- }
- if (MULTICAST(daddr)) {
- if (!ipc.oif)
- ipc.oif = inet->mc_index;
+ uc_index = READ_ONCE(inet->uc_index);
+ if (ipv4_is_multicast(daddr)) {
+ if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif))
+ ipc.oif = READ_ONCE(inet->mc_index);
if (!saddr)
- saddr = inet->mc_addr;
+ saddr = READ_ONCE(inet->mc_addr);
connected = 0;
+ } else if (!ipc.oif) {
+ ipc.oif = uc_index;
+ } else if (ipv4_is_lbcast(daddr) && uc_index) {
+ /* oif is set, packet is to local broadcast and
+ * uc_index is set. oif is most likely set
+ * by sk_bound_dev_if. If uc_index != oif check if the
+ * oif is an L3 master and uc_index is an L3 slave.
+ * If so, we want to allow the send using the uc_index.
+ */
+ if (ipc.oif != uc_index &&
+ ipc.oif == l3mdev_master_ifindex_by_index(sock_net(sk),
+ uc_index)) {
+ ipc.oif = uc_index;
+ }
}
if (connected)
- rt = (struct rtable*)sk_dst_check(sk, 0);
-
- if (rt == NULL) {
- struct flowi fl = { .oif = ipc.oif,
- .nl_u = { .ip4_u =
- { .daddr = faddr,
- .saddr = saddr,
- .tos = tos } },
- .proto = sk->sk_protocol,
- .uli_u = { .ports =
- { .sport = inet->sport,
- .dport = dport } } };
- security_sk_classify_flow(sk, &fl);
- err = ip_route_output_flow(&rt, &fl, sk, !(msg->msg_flags&MSG_DONTWAIT));
- if (err)
+ rt = dst_rtable(sk_dst_check(sk, 0));
+
+ if (!rt) {
+ struct net *net = sock_net(sk);
+ __u8 flow_flags = inet_sk_flowi_flags(sk);
+
+ fl4 = &fl4_stack;
+
+ flowi4_init_output(fl4, ipc.oif, ipc.sockc.mark,
+ ipc.tos & INET_DSCP_MASK, scope,
+ sk->sk_protocol, flow_flags, faddr, saddr,
+ dport, inet->inet_sport,
+ sk_uid(sk));
+
+ security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4));
+ rt = ip_route_output_flow(net, fl4, sk);
+ if (IS_ERR(rt)) {
+ err = PTR_ERR(rt);
+ rt = NULL;
+ if (err == -ENETUNREACH)
+ IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
goto out;
+ }
err = -EACCES;
if ((rt->rt_flags & RTCF_BROADCAST) &&
!sock_flag(sk, SOCK_BROADCAST))
goto out;
if (connected)
- sk_dst_set(sk, dst_clone(&rt->u.dst));
+ sk_dst_set(sk, dst_clone(&rt->dst));
}
if (msg->msg_flags&MSG_CONFIRM)
goto do_confirm;
back_from_confirm:
- saddr = rt->rt_src;
+ saddr = fl4->saddr;
if (!ipc.addr)
- daddr = ipc.addr = rt->rt_dst;
+ daddr = ipc.addr = fl4->daddr;
+
+ /* Lockless fast path for the non-corking case. */
+ if (!corkreq) {
+ struct inet_cork cork;
+
+ skb = ip_make_skb(sk, fl4, getfrag, msg, ulen,
+ sizeof(struct udphdr), &ipc, &rt,
+ &cork, msg->msg_flags);
+ err = PTR_ERR(skb);
+ if (!IS_ERR_OR_NULL(skb))
+ err = udp_send_skb(skb, fl4, &cork);
+ goto out;
+ }
lock_sock(sk);
if (unlikely(up->pending)) {
@@ -652,41 +1491,40 @@ back_from_confirm:
/* ... which is an evident application bug. --ANK */
release_sock(sk);
- LIMIT_NETDEBUG(KERN_DEBUG "udp cork app bug 2\n");
+ net_dbg_ratelimited("socket already corked\n");
err = -EINVAL;
goto out;
}
/*
* Now cork the socket to pend data.
*/
- inet->cork.fl.fl4_dst = daddr;
- inet->cork.fl.fl_ip_dport = dport;
- inet->cork.fl.fl4_src = saddr;
- inet->cork.fl.fl_ip_sport = inet->sport;
- up->pending = AF_INET;
+ fl4 = &inet->cork.fl.u.ip4;
+ fl4->daddr = daddr;
+ fl4->saddr = saddr;
+ fl4->fl4_dport = dport;
+ fl4->fl4_sport = inet->inet_sport;
+ WRITE_ONCE(up->pending, AF_INET);
do_append_data:
up->len += ulen;
- getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag;
- err = ip_append_data(sk, getfrag, msg->msg_iov, ulen,
- sizeof(struct udphdr), &ipc, rt,
- corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags);
+ err = ip_append_data(sk, fl4, getfrag, msg, ulen,
+ sizeof(struct udphdr), &ipc, &rt,
+ corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags);
if (err)
udp_flush_pending_frames(sk);
else if (!corkreq)
err = udp_push_pending_frames(sk);
else if (unlikely(skb_queue_empty(&sk->sk_write_queue)))
- up->pending = 0;
+ WRITE_ONCE(up->pending, 0);
release_sock(sk);
out:
ip_rt_put(rt);
+out_free:
if (free)
kfree(ipc.opt);
- if (!err) {
- UDP_INC_STATS_USER(UDP_MIB_OUTDATAGRAMS, is_udplite);
+ if (!err)
return len;
- }
/*
* ENOBUFS = no kernel mem, SOCK_NOSPACE = no sndbuf space. Reporting
* ENOBUFS might not be good (it's not tunable per se), but otherwise
@@ -695,317 +1533,837 @@ out:
* seems like overkill.
*/
if (err == -ENOBUFS || test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
- UDP_INC_STATS_USER(UDP_MIB_SNDBUFERRORS, is_udplite);
+ UDP_INC_STATS(sock_net(sk),
+ UDP_MIB_SNDBUFERRORS, is_udplite);
}
return err;
do_confirm:
- dst_confirm(&rt->u.dst);
+ if (msg->msg_flags & MSG_PROBE)
+ dst_confirm_neigh(&rt->dst, &fl4->daddr);
if (!(msg->msg_flags&MSG_PROBE) || len)
goto back_from_confirm;
err = 0;
goto out;
}
+EXPORT_SYMBOL(udp_sendmsg);
-int udp_sendpage(struct sock *sk, struct page *page, int offset,
- size_t size, int flags)
+void udp_splice_eof(struct socket *sock)
{
+ struct sock *sk = sock->sk;
struct udp_sock *up = udp_sk(sk);
- int ret;
- if (!up->pending) {
- struct msghdr msg = { .msg_flags = flags|MSG_MORE };
+ if (!READ_ONCE(up->pending) || udp_test_bit(CORK, sk))
+ return;
+
+ lock_sock(sk);
+ if (up->pending && !udp_test_bit(CORK, sk))
+ udp_push_pending_frames(sk);
+ release_sock(sk);
+}
+EXPORT_IPV6_MOD_GPL(udp_splice_eof);
+
+#define UDP_SKB_IS_STATELESS 0x80000000
+
+/* all head states (dst, sk, nf conntrack) except skb extensions are
+ * cleared by udp_rcv().
+ *
+ * We need to preserve secpath, if present, to eventually process
+ * IP_CMSG_PASSSEC at recvmsg() time.
+ *
+ * Other extensions can be cleared.
+ */
+static bool udp_try_make_stateless(struct sk_buff *skb)
+{
+ if (!skb_has_extensions(skb))
+ return true;
+
+ if (!secpath_exists(skb)) {
+ skb_ext_reset(skb);
+ return true;
+ }
+
+ return false;
+}
+
+static void udp_set_dev_scratch(struct sk_buff *skb)
+{
+ struct udp_dev_scratch *scratch = udp_skb_scratch(skb);
+
+ BUILD_BUG_ON(sizeof(struct udp_dev_scratch) > sizeof(long));
+ scratch->_tsize_state = skb->truesize;
+#if BITS_PER_LONG == 64
+ scratch->len = skb->len;
+ scratch->csum_unnecessary = !!skb_csum_unnecessary(skb);
+ scratch->is_linear = !skb_is_nonlinear(skb);
+#endif
+ if (udp_try_make_stateless(skb))
+ scratch->_tsize_state |= UDP_SKB_IS_STATELESS;
+}
+
+static void udp_skb_csum_unnecessary_set(struct sk_buff *skb)
+{
+ /* We come here after udp_lib_checksum_complete() returned 0.
+ * This means that __skb_checksum_complete() might have
+ * set skb->csum_valid to 1.
+ * On 64bit platforms, we can set csum_unnecessary
+ * to true, but only if the skb is not shared.
+ */
+#if BITS_PER_LONG == 64
+ if (!skb_shared(skb))
+ udp_skb_scratch(skb)->csum_unnecessary = true;
+#endif
+}
+
+static int udp_skb_truesize(struct sk_buff *skb)
+{
+ return udp_skb_scratch(skb)->_tsize_state & ~UDP_SKB_IS_STATELESS;
+}
+
+static bool udp_skb_has_head_state(struct sk_buff *skb)
+{
+ return !(udp_skb_scratch(skb)->_tsize_state & UDP_SKB_IS_STATELESS);
+}
+
+/* fully reclaim rmem/fwd memory allocated for skb */
+static void udp_rmem_release(struct sock *sk, unsigned int size,
+ int partial, bool rx_queue_lock_held)
+{
+ struct udp_sock *up = udp_sk(sk);
+ struct sk_buff_head *sk_queue;
+ unsigned int amt;
+
+ if (likely(partial)) {
+ up->forward_deficit += size;
+ size = up->forward_deficit;
+ if (size < READ_ONCE(up->forward_threshold) &&
+ !skb_queue_empty(&up->reader_queue))
+ return;
+ } else {
+ size += up->forward_deficit;
+ }
+ up->forward_deficit = 0;
+
+ /* acquire the sk_receive_queue for fwd allocated memory scheduling,
+ * if the called don't held it already
+ */
+ sk_queue = &sk->sk_receive_queue;
+ if (!rx_queue_lock_held)
+ spin_lock(&sk_queue->lock);
+
+ amt = (size + sk->sk_forward_alloc - partial) & ~(PAGE_SIZE - 1);
+ sk_forward_alloc_add(sk, size - amt);
+
+ if (amt)
+ __sk_mem_reduce_allocated(sk, amt >> PAGE_SHIFT);
+
+ atomic_sub(size, &sk->sk_rmem_alloc);
+
+ /* this can save us from acquiring the rx queue lock on next receive */
+ skb_queue_splice_tail_init(sk_queue, &up->reader_queue);
+
+ if (!rx_queue_lock_held)
+ spin_unlock(&sk_queue->lock);
+}
+
+/* Note: called with reader_queue.lock held.
+ * Instead of using skb->truesize here, find a copy of it in skb->dev_scratch
+ * This avoids a cache line miss while receive_queue lock is held.
+ * Look at __udp_enqueue_schedule_skb() to find where this copy is done.
+ */
+void udp_skb_destructor(struct sock *sk, struct sk_buff *skb)
+{
+ prefetch(&skb->data);
+ udp_rmem_release(sk, udp_skb_truesize(skb), 1, false);
+}
+EXPORT_IPV6_MOD(udp_skb_destructor);
+
+/* as above, but the caller held the rx queue lock, too */
+static void udp_skb_dtor_locked(struct sock *sk, struct sk_buff *skb)
+{
+ prefetch(&skb->data);
+ udp_rmem_release(sk, udp_skb_truesize(skb), 1, true);
+}
+
+static int udp_rmem_schedule(struct sock *sk, int size)
+{
+ int delta;
+
+ delta = size - sk->sk_forward_alloc;
+ if (delta > 0 && !__sk_mem_schedule(sk, delta, SK_MEM_RECV))
+ return -ENOBUFS;
+
+ return 0;
+}
+
+int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
+{
+ struct sk_buff_head *list = &sk->sk_receive_queue;
+ struct udp_prod_queue *udp_prod_queue;
+ struct sk_buff *next, *to_drop = NULL;
+ struct llist_node *ll_list;
+ unsigned int rmem, rcvbuf;
+ int size, err = -ENOMEM;
+ int total_size = 0;
+ int q_size = 0;
+ int dropcount;
+ int nb = 0;
+
+ rmem = atomic_read(&sk->sk_rmem_alloc);
+ rcvbuf = READ_ONCE(sk->sk_rcvbuf);
+ size = skb->truesize;
+
+ udp_prod_queue = &udp_sk(sk)->udp_prod_queue[numa_node_id()];
+
+ rmem += atomic_read(&udp_prod_queue->rmem_alloc);
+
+ /* Immediately drop when the receive queue is full.
+ * Cast to unsigned int performs the boundary check for INT_MAX.
+ */
+ if (rmem + size > rcvbuf) {
+ if (rcvbuf > INT_MAX >> 1)
+ goto drop;
+
+ /* Accept the packet if queue is empty. */
+ if (rmem)
+ goto drop;
+ }
+
+ /* Under mem pressure, it might be helpful to help udp_recvmsg()
+ * having linear skbs :
+ * - Reduce memory overhead and thus increase receive queue capacity
+ * - Less cache line misses at copyout() time
+ * - Less work at consume_skb() (less alien page frag freeing)
+ */
+ if (rmem > (rcvbuf >> 1)) {
+ skb_condense(skb);
+ size = skb->truesize;
+ }
- /* Call udp_sendmsg to specify destination address which
- * sendpage interface can't pass.
- * This will succeed only when the socket is connected.
+ udp_set_dev_scratch(skb);
+
+ atomic_add(size, &udp_prod_queue->rmem_alloc);
+
+ if (!llist_add(&skb->ll_node, &udp_prod_queue->ll_root))
+ return 0;
+
+ dropcount = sock_flag(sk, SOCK_RXQ_OVFL) ? sk_drops_read(sk) : 0;
+
+ spin_lock(&list->lock);
+
+ ll_list = llist_del_all(&udp_prod_queue->ll_root);
+
+ ll_list = llist_reverse_order(ll_list);
+
+ llist_for_each_entry_safe(skb, next, ll_list, ll_node) {
+ size = udp_skb_truesize(skb);
+ total_size += size;
+ err = udp_rmem_schedule(sk, size);
+ if (unlikely(err)) {
+ /* Free the skbs outside of locked section. */
+ skb->next = to_drop;
+ to_drop = skb;
+ continue;
+ }
+
+ q_size += size;
+ sk_forward_alloc_add(sk, -size);
+
+ /* no need to setup a destructor, we will explicitly release the
+ * forward allocated memory on dequeue
*/
- ret = udp_sendmsg(NULL, sk, &msg, 0);
- if (ret < 0)
- return ret;
+ SOCK_SKB_CB(skb)->dropcount = dropcount;
+ nb++;
+ __skb_queue_tail(list, skb);
}
- lock_sock(sk);
+ atomic_add(q_size, &sk->sk_rmem_alloc);
- if (unlikely(!up->pending)) {
- release_sock(sk);
+ spin_unlock(&list->lock);
- LIMIT_NETDEBUG(KERN_DEBUG "udp cork app bug 3\n");
- return -EINVAL;
+ if (!sock_flag(sk, SOCK_DEAD)) {
+ /* Multiple threads might be blocked in recvmsg(),
+ * using prepare_to_wait_exclusive().
+ */
+ while (nb) {
+ INDIRECT_CALL_1(sk->sk_data_ready,
+ sock_def_readable, sk);
+ nb--;
+ }
}
- ret = ip_append_page(sk, page, offset, size, flags);
- if (ret == -EOPNOTSUPP) {
- release_sock(sk);
- return sock_no_sendpage(sk->sk_socket, page, offset,
- size, flags);
+ if (unlikely(to_drop)) {
+ for (nb = 0; to_drop != NULL; nb++) {
+ skb = to_drop;
+ to_drop = skb->next;
+ skb_mark_not_on_list(skb);
+ /* TODO: update SNMP values. */
+ sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_PROTO_MEM);
+ }
+ numa_drop_add(&udp_sk(sk)->drop_counters, nb);
}
- if (ret < 0) {
- udp_flush_pending_frames(sk);
- goto out;
+
+ atomic_sub(total_size, &udp_prod_queue->rmem_alloc);
+
+ return 0;
+
+drop:
+ udp_drops_inc(sk);
+ return err;
+}
+EXPORT_IPV6_MOD_GPL(__udp_enqueue_schedule_skb);
+
+void udp_destruct_common(struct sock *sk)
+{
+ /* reclaim completely the forward allocated memory */
+ struct udp_sock *up = udp_sk(sk);
+ unsigned int total = 0;
+ struct sk_buff *skb;
+
+ skb_queue_splice_tail_init(&sk->sk_receive_queue, &up->reader_queue);
+ while ((skb = __skb_dequeue(&up->reader_queue)) != NULL) {
+ total += skb->truesize;
+ kfree_skb(skb);
}
+ udp_rmem_release(sk, total, 0, true);
+ kfree(up->udp_prod_queue);
+}
+EXPORT_IPV6_MOD_GPL(udp_destruct_common);
- up->len += size;
- if (!(up->corkflag || (flags&MSG_MORE)))
- ret = udp_push_pending_frames(sk);
- if (!ret)
- ret = size;
-out:
- release_sock(sk);
- return ret;
+static void udp_destruct_sock(struct sock *sk)
+{
+ udp_destruct_common(sk);
+ inet_sock_destruct(sk);
+}
+
+int udp_init_sock(struct sock *sk)
+{
+ int res = udp_lib_init_sock(sk);
+
+ sk->sk_destruct = udp_destruct_sock;
+ set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags);
+ return res;
+}
+
+void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len)
+{
+ if (unlikely(READ_ONCE(udp_sk(sk)->peeking_with_offset)))
+ sk_peek_offset_bwd(sk, len);
+
+ if (!skb_shared(skb)) {
+ skb_attempt_defer_free(skb);
+ return;
+ }
+
+ if (!skb_unref(skb))
+ return;
+
+ /* In the more common cases we cleared the head states previously,
+ * see __udp_queue_rcv_skb().
+ */
+ if (unlikely(udp_skb_has_head_state(skb)))
+ skb_release_head_state(skb);
+ __consume_stateless_skb(skb);
+}
+EXPORT_IPV6_MOD_GPL(skb_consume_udp);
+
+static struct sk_buff *__first_packet_length(struct sock *sk,
+ struct sk_buff_head *rcvq,
+ unsigned int *total)
+{
+ struct sk_buff *skb;
+
+ while ((skb = skb_peek(rcvq)) != NULL) {
+ if (udp_lib_checksum_complete(skb)) {
+ __UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS,
+ IS_UDPLITE(sk));
+ __UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS,
+ IS_UDPLITE(sk));
+ udp_drops_inc(sk);
+ __skb_unlink(skb, rcvq);
+ *total += skb->truesize;
+ kfree_skb_reason(skb, SKB_DROP_REASON_UDP_CSUM);
+ } else {
+ udp_skb_csum_unnecessary_set(skb);
+ break;
+ }
+ }
+ return skb;
+}
+
+/**
+ * first_packet_length - return length of first packet in receive queue
+ * @sk: socket
+ *
+ * Drops all bad checksum frames, until a valid one is found.
+ * Returns the length of found skb, or -1 if none is found.
+ */
+static int first_packet_length(struct sock *sk)
+{
+ struct sk_buff_head *rcvq = &udp_sk(sk)->reader_queue;
+ struct sk_buff_head *sk_queue = &sk->sk_receive_queue;
+ unsigned int total = 0;
+ struct sk_buff *skb;
+ int res;
+
+ spin_lock_bh(&rcvq->lock);
+ skb = __first_packet_length(sk, rcvq, &total);
+ if (!skb && !skb_queue_empty_lockless(sk_queue)) {
+ spin_lock(&sk_queue->lock);
+ skb_queue_splice_tail_init(sk_queue, rcvq);
+ spin_unlock(&sk_queue->lock);
+
+ skb = __first_packet_length(sk, rcvq, &total);
+ }
+ res = skb ? skb->len : -1;
+ if (total)
+ udp_rmem_release(sk, total, 1, false);
+ spin_unlock_bh(&rcvq->lock);
+ return res;
}
/*
* IOCTL requests applicable to the UDP protocol
*/
-
-int udp_ioctl(struct sock *sk, int cmd, unsigned long arg)
+
+int udp_ioctl(struct sock *sk, int cmd, int *karg)
{
- switch(cmd)
+ switch (cmd) {
+ case SIOCOUTQ:
{
- case SIOCOUTQ:
- {
- int amount = atomic_read(&sk->sk_wmem_alloc);
- return put_user(amount, (int __user *)arg);
- }
+ *karg = sk_wmem_alloc_get(sk);
+ return 0;
+ }
+
+ case SIOCINQ:
+ {
+ *karg = max_t(int, 0, first_packet_length(sk));
+ return 0;
+ }
+
+ default:
+ return -ENOIOCTLCMD;
+ }
- case SIOCINQ:
- {
- struct sk_buff *skb;
- unsigned long amount;
-
- amount = 0;
- spin_lock_bh(&sk->sk_receive_queue.lock);
- skb = skb_peek(&sk->sk_receive_queue);
- if (skb != NULL) {
- /*
- * We will only return the amount
- * of this packet since that is all
- * that will be read.
- */
- amount = skb->len - sizeof(struct udphdr);
+ return 0;
+}
+EXPORT_IPV6_MOD(udp_ioctl);
+
+struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags,
+ int *off, int *err)
+{
+ struct sk_buff_head *sk_queue = &sk->sk_receive_queue;
+ struct sk_buff_head *queue;
+ struct sk_buff *last;
+ long timeo;
+ int error;
+
+ queue = &udp_sk(sk)->reader_queue;
+ timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+ do {
+ struct sk_buff *skb;
+
+ error = sock_error(sk);
+ if (error)
+ break;
+
+ error = -EAGAIN;
+ do {
+ spin_lock_bh(&queue->lock);
+ skb = __skb_try_recv_from_queue(queue, flags, off, err,
+ &last);
+ if (skb) {
+ if (!(flags & MSG_PEEK))
+ udp_skb_destructor(sk, skb);
+ spin_unlock_bh(&queue->lock);
+ return skb;
}
- spin_unlock_bh(&sk->sk_receive_queue.lock);
- return put_user(amount, (int __user *)arg);
- }
- default:
- return -ENOIOCTLCMD;
+ if (skb_queue_empty_lockless(sk_queue)) {
+ spin_unlock_bh(&queue->lock);
+ goto busy_check;
+ }
+
+ /* refill the reader queue and walk it again
+ * keep both queues locked to avoid re-acquiring
+ * the sk_receive_queue lock if fwd memory scheduling
+ * is needed.
+ */
+ spin_lock(&sk_queue->lock);
+ skb_queue_splice_tail_init(sk_queue, queue);
+
+ skb = __skb_try_recv_from_queue(queue, flags, off, err,
+ &last);
+ if (skb && !(flags & MSG_PEEK))
+ udp_skb_dtor_locked(sk, skb);
+ spin_unlock(&sk_queue->lock);
+ spin_unlock_bh(&queue->lock);
+ if (skb)
+ return skb;
+
+busy_check:
+ if (!sk_can_busy_loop(sk))
+ break;
+
+ sk_busy_loop(sk, flags & MSG_DONTWAIT);
+ } while (!skb_queue_empty_lockless(sk_queue));
+
+ /* sk_queue is empty, reader_queue may contain peeked packets */
+ } while (timeo &&
+ !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
+ &error, &timeo,
+ (struct sk_buff *)sk_queue));
+
+ *err = error;
+ return NULL;
+}
+EXPORT_SYMBOL(__skb_recv_udp);
+
+int udp_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
+{
+ struct sk_buff *skb;
+ int err;
+
+try_again:
+ skb = skb_recv_udp(sk, MSG_DONTWAIT, &err);
+ if (!skb)
+ return err;
+
+ if (udp_lib_checksum_complete(skb)) {
+ int is_udplite = IS_UDPLITE(sk);
+ struct net *net = sock_net(sk);
+
+ __UDP_INC_STATS(net, UDP_MIB_CSUMERRORS, is_udplite);
+ __UDP_INC_STATS(net, UDP_MIB_INERRORS, is_udplite);
+ udp_drops_inc(sk);
+ kfree_skb_reason(skb, SKB_DROP_REASON_UDP_CSUM);
+ goto try_again;
}
- return(0);
+
+ WARN_ON_ONCE(!skb_set_owner_sk_safe(skb, sk));
+ return recv_actor(sk, skb);
}
+EXPORT_IPV6_MOD(udp_read_skb);
/*
* This should be easy, if there is something there we
* return it, otherwise we block.
*/
-int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
- size_t len, int noblock, int flags, int *addr_len)
+int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags,
+ int *addr_len)
{
struct inet_sock *inet = inet_sk(sk);
- struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name;
- struct sk_buff *skb;
- int copied, err, copy_only, is_udplite = IS_UDPLITE(sk);
-
- /*
- * Check any passed addresses
- */
- if (addr_len)
- *addr_len=sizeof(*sin);
+ DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
+ struct sk_buff *skb;
+ unsigned int ulen, copied;
+ int off, err, peeking = flags & MSG_PEEK;
+ int is_udplite = IS_UDPLITE(sk);
+ bool checksum_valid = false;
if (flags & MSG_ERRQUEUE)
- return ip_recv_error(sk, msg, len);
+ return ip_recv_error(sk, msg, len, addr_len);
try_again:
- skb = skb_recv_datagram(sk, flags, noblock, &err);
+ off = sk_peek_offset(sk, flags);
+ skb = __skb_recv_udp(sk, flags, &off, &err);
if (!skb)
- goto out;
-
- copied = skb->len - sizeof(struct udphdr);
- if (copied > len) {
- copied = len;
+ return err;
+
+ ulen = udp_skb_len(skb);
+ copied = len;
+ if (copied > ulen - off)
+ copied = ulen - off;
+ else if (copied < ulen)
msg->msg_flags |= MSG_TRUNC;
- }
/*
- * Decide whether to checksum and/or copy data.
- *
- * UDP: checksum may have been computed in HW,
- * (re-)compute it if message is truncated.
- * UDP-Lite: always needs to checksum, no HW support.
+ * If checksum is needed at all, try to do it while copying the
+ * data. If the data is truncated, or if we only want a partial
+ * coverage checksum (UDP-Lite), do it before the copy.
*/
- copy_only = (skb->ip_summed==CHECKSUM_UNNECESSARY);
- if (is_udplite || (!copy_only && msg->msg_flags&MSG_TRUNC)) {
- if (__udp_lib_checksum_complete(skb))
+ if (copied < ulen || peeking ||
+ (is_udplite && UDP_SKB_CB(skb)->partial_cov)) {
+ checksum_valid = udp_skb_csum_unnecessary(skb) ||
+ !__udp_lib_checksum_complete(skb);
+ if (!checksum_valid)
goto csum_copy_err;
- copy_only = 1;
}
- if (copy_only)
- err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr),
- msg->msg_iov, copied );
- else {
- err = skb_copy_and_csum_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov);
+ if (checksum_valid || udp_skb_csum_unnecessary(skb)) {
+ if (udp_skb_is_linear(skb))
+ err = copy_linear_skb(skb, copied, off, &msg->msg_iter);
+ else
+ err = skb_copy_datagram_msg(skb, off, msg, copied);
+ } else {
+ err = skb_copy_and_csum_datagram_msg(skb, off, msg);
if (err == -EINVAL)
goto csum_copy_err;
}
- if (err)
- goto out_free;
+ if (unlikely(err)) {
+ if (!peeking) {
+ udp_drops_inc(sk);
+ UDP_INC_STATS(sock_net(sk),
+ UDP_MIB_INERRORS, is_udplite);
+ }
+ kfree_skb(skb);
+ return err;
+ }
+
+ if (!peeking)
+ UDP_INC_STATS(sock_net(sk),
+ UDP_MIB_INDATAGRAMS, is_udplite);
- sock_recv_timestamp(msg, sk, skb);
+ sock_recv_cmsgs(msg, sk, skb);
/* Copy the address. */
- if (sin)
- {
+ if (sin) {
sin->sin_family = AF_INET;
- sin->sin_port = skb->h.uh->source;
- sin->sin_addr.s_addr = skb->nh.iph->saddr;
+ sin->sin_port = udp_hdr(skb)->source;
+ sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
- }
- if (inet->cmsg_flags)
- ip_cmsg_recv(msg, skb);
+ *addr_len = sizeof(*sin);
+
+ BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk,
+ (struct sockaddr *)sin,
+ addr_len);
+ }
+
+ if (udp_test_bit(GRO_ENABLED, sk))
+ udp_cmsg_recv(msg, sk, skb);
+
+ if (inet_cmsg_flags(inet))
+ ip_cmsg_recv_offset(msg, sk, skb, sizeof(struct udphdr), off);
err = copied;
if (flags & MSG_TRUNC)
- err = skb->len - sizeof(struct udphdr);
-
-out_free:
- skb_free_datagram(sk, skb);
-out:
- return err;
+ err = ulen;
-csum_copy_err:
- UDP_INC_STATS_BH(UDP_MIB_INERRORS, is_udplite);
+ skb_consume_udp(sk, skb, peeking ? -err : err);
+ return err;
- skb_kill_datagram(sk, skb, flags);
+csum_copy_err:
+ if (!__sk_queue_drop_skb(sk, &udp_sk(sk)->reader_queue, skb, flags,
+ udp_skb_destructor)) {
+ UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
+ UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
+ }
+ kfree_skb_reason(skb, SKB_DROP_REASON_UDP_CSUM);
- if (noblock)
- return -EAGAIN;
+ /* starting over for a new packet, but check if we need to yield */
+ cond_resched();
+ msg->msg_flags &= ~MSG_TRUNC;
goto try_again;
}
+int udp_pre_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
+ int addr_len)
+{
+ /* This check is replicated from __ip4_datagram_connect() and
+ * intended to prevent BPF program called below from accessing bytes
+ * that are out of the bound specified by user in addr_len.
+ */
+ if (addr_len < sizeof(struct sockaddr_in))
+ return -EINVAL;
-int udp_disconnect(struct sock *sk, int flags)
+ return BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr, &addr_len);
+}
+EXPORT_IPV6_MOD(udp_pre_connect);
+
+static int udp_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
+ int addr_len)
+{
+ int res;
+
+ lock_sock(sk);
+ res = __ip4_datagram_connect(sk, uaddr, addr_len);
+ if (!res)
+ udp4_hash4(sk);
+ release_sock(sk);
+ return res;
+}
+
+int __udp_disconnect(struct sock *sk, int flags)
{
struct inet_sock *inet = inet_sk(sk);
/*
* 1003.1g - break association.
*/
-
+
sk->sk_state = TCP_CLOSE;
- inet->daddr = 0;
- inet->dport = 0;
+ inet->inet_daddr = 0;
+ inet->inet_dport = 0;
+ sock_rps_reset_rxhash(sk);
sk->sk_bound_dev_if = 0;
- if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
+ if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) {
inet_reset_saddr(sk);
+ if (sk->sk_prot->rehash &&
+ (sk->sk_userlocks & SOCK_BINDPORT_LOCK))
+ sk->sk_prot->rehash(sk);
+ }
if (!(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) {
sk->sk_prot->unhash(sk);
- inet->sport = 0;
+ inet->inet_sport = 0;
}
sk_dst_reset(sk);
return 0;
}
+EXPORT_SYMBOL(__udp_disconnect);
+
+int udp_disconnect(struct sock *sk, int flags)
+{
+ lock_sock(sk);
+ __udp_disconnect(sk, flags);
+ release_sock(sk);
+ return 0;
+}
+EXPORT_IPV6_MOD(udp_disconnect);
+
+void udp_lib_unhash(struct sock *sk)
+{
+ if (sk_hashed(sk)) {
+ struct udp_table *udptable = udp_get_table_prot(sk);
+ struct udp_hslot *hslot, *hslot2;
+
+ sock_rps_delete_flow(sk);
+ hslot = udp_hashslot(udptable, sock_net(sk),
+ udp_sk(sk)->udp_port_hash);
+ hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
+
+ spin_lock_bh(&hslot->lock);
+ if (rcu_access_pointer(sk->sk_reuseport_cb))
+ reuseport_detach_sock(sk);
+ if (sk_del_node_init_rcu(sk)) {
+ hslot->count--;
+ inet_sk(sk)->inet_num = 0;
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+
+ spin_lock(&hslot2->lock);
+ hlist_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
+ hslot2->count--;
+ spin_unlock(&hslot2->lock);
+
+ udp_unhash4(udptable, sk);
+ }
+ spin_unlock_bh(&hslot->lock);
+ }
+}
+EXPORT_IPV6_MOD(udp_lib_unhash);
-/* return:
- * 1 if the the UDP system should process it
- * 0 if we should drop this packet
- * -1 if it should get processed by xfrm4_rcv_encap
+/*
+ * inet_rcv_saddr was changed, we must rehash secondary hash
*/
-static int udp_encap_rcv(struct sock * sk, struct sk_buff *skb)
+void udp_lib_rehash(struct sock *sk, u16 newhash, u16 newhash4)
{
-#ifndef CONFIG_XFRM
- return 1;
-#else
- struct udp_sock *up = udp_sk(sk);
- struct udphdr *uh;
- struct iphdr *iph;
- int iphlen, len;
-
- __u8 *udpdata;
- __be32 *udpdata32;
- __u16 encap_type = up->encap_type;
-
- /* if we're overly short, let UDP handle it */
- len = skb->len - sizeof(struct udphdr);
- if (len <= 0)
- return 1;
-
- /* if this is not encapsulated socket, then just return now */
- if (!encap_type)
- return 1;
-
- /* If this is a paged skb, make sure we pull up
- * whatever data we need to look at. */
- if (!pskb_may_pull(skb, sizeof(struct udphdr) + min(len, 8)))
- return 1;
-
- /* Now we can get the pointers */
- uh = skb->h.uh;
- udpdata = (__u8 *)uh + sizeof(struct udphdr);
- udpdata32 = (__be32 *)udpdata;
-
- switch (encap_type) {
- default:
- case UDP_ENCAP_ESPINUDP:
- /* Check if this is a keepalive packet. If so, eat it. */
- if (len == 1 && udpdata[0] == 0xff) {
- return 0;
- } else if (len > sizeof(struct ip_esp_hdr) && udpdata32[0] != 0 ) {
- /* ESP Packet without Non-ESP header */
- len = sizeof(struct udphdr);
- } else
- /* Must be an IKE packet.. pass it through */
- return 1;
- break;
- case UDP_ENCAP_ESPINUDP_NON_IKE:
- /* Check if this is a keepalive packet. If so, eat it. */
- if (len == 1 && udpdata[0] == 0xff) {
- return 0;
- } else if (len > 2 * sizeof(u32) + sizeof(struct ip_esp_hdr) &&
- udpdata32[0] == 0 && udpdata32[1] == 0) {
-
- /* ESP Packet with Non-IKE marker */
- len = sizeof(struct udphdr) + 2 * sizeof(u32);
- } else
- /* Must be an IKE packet.. pass it through */
- return 1;
- break;
+ if (sk_hashed(sk)) {
+ struct udp_table *udptable = udp_get_table_prot(sk);
+ struct udp_hslot *hslot, *hslot2, *nhslot2;
+
+ hslot = udp_hashslot(udptable, sock_net(sk),
+ udp_sk(sk)->udp_port_hash);
+ hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
+ nhslot2 = udp_hashslot2(udptable, newhash);
+ udp_sk(sk)->udp_portaddr_hash = newhash;
+
+ if (hslot2 != nhslot2 ||
+ rcu_access_pointer(sk->sk_reuseport_cb)) {
+ /* we must lock primary chain too */
+ spin_lock_bh(&hslot->lock);
+ if (rcu_access_pointer(sk->sk_reuseport_cb))
+ reuseport_detach_sock(sk);
+
+ if (hslot2 != nhslot2) {
+ spin_lock(&hslot2->lock);
+ hlist_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
+ hslot2->count--;
+ spin_unlock(&hslot2->lock);
+
+ spin_lock(&nhslot2->lock);
+ hlist_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
+ &nhslot2->head);
+ nhslot2->count++;
+ spin_unlock(&nhslot2->lock);
+ }
+
+ spin_unlock_bh(&hslot->lock);
+ }
+
+ /* Now process hash4 if necessary:
+ * (1) update hslot4;
+ * (2) update hslot2->hash4_cnt.
+ * Note that hslot2/hslot4 should be checked separately, as
+ * either of them may change with the other unchanged.
+ */
+ if (udp_hashed4(sk)) {
+ spin_lock_bh(&hslot->lock);
+
+ udp_rehash4(udptable, sk, newhash4);
+ if (hslot2 != nhslot2) {
+ spin_lock(&hslot2->lock);
+ udp_hash4_dec(hslot2);
+ spin_unlock(&hslot2->lock);
+
+ spin_lock(&nhslot2->lock);
+ udp_hash4_inc(nhslot2);
+ spin_unlock(&nhslot2->lock);
+ }
+
+ spin_unlock_bh(&hslot->lock);
+ }
}
+}
+EXPORT_IPV6_MOD(udp_lib_rehash);
- /* At this point we are sure that this is an ESPinUDP packet,
- * so we need to remove 'len' bytes from the packet (the UDP
- * header and optional ESP marker bytes) and then modify the
- * protocol to ESP, and then call into the transform receiver.
- */
- if (skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
- return 0;
+void udp_v4_rehash(struct sock *sk)
+{
+ u16 new_hash = ipv4_portaddr_hash(sock_net(sk),
+ inet_sk(sk)->inet_rcv_saddr,
+ inet_sk(sk)->inet_num);
+ u16 new_hash4 = udp_ehashfn(sock_net(sk),
+ sk->sk_rcv_saddr, sk->sk_num,
+ sk->sk_daddr, sk->sk_dport);
+
+ udp_lib_rehash(sk, new_hash, new_hash4);
+}
- /* Now we can update and verify the packet length... */
- iph = skb->nh.iph;
- iphlen = iph->ihl << 2;
- iph->tot_len = htons(ntohs(iph->tot_len) - len);
- if (skb->len < iphlen + len) {
- /* packet is too small!?! */
- return 0;
+static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+ int rc;
+
+ if (inet_sk(sk)->inet_daddr) {
+ sock_rps_save_rxhash(sk, skb);
+ sk_mark_napi_id(sk, skb);
+ sk_incoming_cpu_update(sk);
+ } else {
+ sk_mark_napi_id_once(sk, skb);
}
- /* pull the data buffer up to the ESP header and set the
- * transport header to point to ESP. Keep UDP on the stack
- * for later.
- */
- skb->h.raw = skb_pull(skb, len);
+ rc = __udp_enqueue_schedule_skb(sk, skb);
+ if (rc < 0) {
+ int is_udplite = IS_UDPLITE(sk);
+ int drop_reason;
- /* modify the protocol (it's ESP!) */
- iph->protocol = IPPROTO_ESP;
+ /* Note that an ENOMEM error is charged twice */
+ if (rc == -ENOMEM) {
+ UDP_INC_STATS(sock_net(sk), UDP_MIB_RCVBUFERRORS,
+ is_udplite);
+ drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
+ } else {
+ UDP_INC_STATS(sock_net(sk), UDP_MIB_MEMERRORS,
+ is_udplite);
+ drop_reason = SKB_DROP_REASON_PROTO_MEM;
+ }
+ UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
+ trace_udp_fail_queue_rcv_skb(rc, sk, skb);
+ sk_skb_reason_drop(sk, skb, drop_reason);
+ return -1;
+ }
- /* and let the caller know to send this into the ESP processor... */
- return -1;
-#endif
+ return 0;
}
/* returns:
@@ -1016,49 +2374,62 @@ static int udp_encap_rcv(struct sock * sk, struct sk_buff *skb)
* Note that in the success and error cases, the skb is assumed to
* have either been requeued or freed.
*/
-int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb)
+static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
{
+ enum skb_drop_reason drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
struct udp_sock *up = udp_sk(sk);
- int rc;
+ int is_udplite = IS_UDPLITE(sk);
/*
* Charge it to the socket, dropping if the queue is full.
*/
- if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
+ if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
+ drop_reason = SKB_DROP_REASON_XFRM_POLICY;
goto drop;
- nf_reset(skb);
+ }
+ nf_reset_ct(skb);
+
+ if (static_branch_unlikely(&udp_encap_needed_key) &&
+ READ_ONCE(up->encap_type)) {
+ int (*encap_rcv)(struct sock *sk, struct sk_buff *skb);
- if (up->encap_type) {
/*
- * This is an encapsulation socket, so let's see if this is
- * an encapsulated packet.
- * If it's a keepalive packet, then just eat it.
- * If it's an encapsulateed packet, then pass it to the
- * IPsec xfrm input and return the response
- * appropriately. Otherwise, just fall through and
- * pass this up the UDP socket.
+ * This is an encapsulation socket so pass the skb to
+ * the socket's udp_encap_rcv() hook. Otherwise, just
+ * fall through and pass this up the UDP socket.
+ * up->encap_rcv() returns the following value:
+ * =0 if skb was successfully passed to the encap
+ * handler or was discarded by it.
+ * >0 if skb should be passed on to UDP.
+ * <0 if skb should be resubmitted as proto -N
*/
- int ret;
- ret = udp_encap_rcv(sk, skb);
- if (ret == 0) {
- /* Eat the packet .. */
- kfree_skb(skb);
- return 0;
- }
- if (ret < 0) {
- /* process the ESP packet */
- ret = xfrm4_rcv_encap(skb, up->encap_type);
- UDP_INC_STATS_BH(UDP_MIB_INDATAGRAMS, up->pcflag);
- return -ret;
+ /* if we're overly short, let UDP handle it */
+ encap_rcv = READ_ONCE(up->encap_rcv);
+ if (encap_rcv) {
+ int ret;
+
+ /* Verify checksum before giving to encap */
+ if (udp_lib_checksum_complete(skb))
+ goto csum_error;
+
+ ret = encap_rcv(sk, skb);
+ if (ret <= 0) {
+ __UDP_INC_STATS(sock_net(sk),
+ UDP_MIB_INDATAGRAMS,
+ is_udplite);
+ return -ret;
+ }
}
+
/* FALLTHROUGH -- it's a UDP Packet */
}
/*
* UDP-Lite specific tests, ignored on UDP sockets
*/
- if ((up->pcflag & UDPLITE_RECV_CC) && UDP_SKB_CB(skb)->partial_cov) {
+ if (udp_test_bit(UDPLITE_RECV_CC, sk) && UDP_SKB_CB(skb)->partial_cov) {
+ u16 pcrlen = READ_ONCE(up->pcrlen);
/*
* MIB statistics other than incrementing the error count are
@@ -1071,10 +2442,9 @@ int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb)
* delivery of packets with coverage values less than a value
* provided by the application."
*/
- if (up->pcrlen == 0) { /* full coverage was set */
- LIMIT_NETDEBUG(KERN_WARNING "UDPLITE: partial coverage "
- "%d while full coverage %d requested\n",
- UDP_SKB_CB(skb)->cscov, skb->len);
+ if (pcrlen == 0) { /* full coverage was set */
+ net_dbg_ratelimited("UDPLite: partial coverage %d while full coverage %d requested\n",
+ UDP_SKB_CB(skb)->cscov, skb->len);
goto drop;
}
/* The next case involves violating the min. coverage requested
@@ -1083,119 +2453,229 @@ int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb)
* that it wants x while sender emits packets of smaller size y.
* Therefore the above ...()->partial_cov statement is essential.
*/
- if (UDP_SKB_CB(skb)->cscov < up->pcrlen) {
- LIMIT_NETDEBUG(KERN_WARNING
- "UDPLITE: coverage %d too small, need min %d\n",
- UDP_SKB_CB(skb)->cscov, up->pcrlen);
+ if (UDP_SKB_CB(skb)->cscov < pcrlen) {
+ net_dbg_ratelimited("UDPLite: coverage %d too small, need min %d\n",
+ UDP_SKB_CB(skb)->cscov, pcrlen);
goto drop;
}
}
- if (sk->sk_filter && skb->ip_summed != CHECKSUM_UNNECESSARY) {
- if (__udp_lib_checksum_complete(skb))
- goto drop;
- skb->ip_summed = CHECKSUM_UNNECESSARY;
- }
+ prefetch(&sk->sk_rmem_alloc);
+ if (rcu_access_pointer(sk->sk_filter) &&
+ udp_lib_checksum_complete(skb))
+ goto csum_error;
- if ((rc = sock_queue_rcv_skb(sk,skb)) < 0) {
- /* Note that an ENOMEM error is charged twice */
- if (rc == -ENOMEM)
- UDP_INC_STATS_BH(UDP_MIB_RCVBUFERRORS, up->pcflag);
+ if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr), &drop_reason))
goto drop;
- }
- UDP_INC_STATS_BH(UDP_MIB_INDATAGRAMS, up->pcflag);
- return 0;
+ udp_csum_pull_header(skb);
+ ipv4_pktinfo_prepare(sk, skb, true);
+ return __udp_queue_rcv_skb(sk, skb);
+
+csum_error:
+ drop_reason = SKB_DROP_REASON_UDP_CSUM;
+ __UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
drop:
- UDP_INC_STATS_BH(UDP_MIB_INERRORS, up->pcflag);
- kfree_skb(skb);
+ __UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
+ udp_drops_inc(sk);
+ sk_skb_reason_drop(sk, skb, drop_reason);
return -1;
}
+static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+ struct sk_buff *next, *segs;
+ int ret;
+
+ if (likely(!udp_unexpected_gso(sk, skb)))
+ return udp_queue_rcv_one_skb(sk, skb);
+
+ BUILD_BUG_ON(sizeof(struct udp_skb_cb) > SKB_GSO_CB_OFFSET);
+ __skb_push(skb, -skb_mac_offset(skb));
+ segs = udp_rcv_segment(sk, skb, true);
+ skb_list_walk_safe(segs, skb, next) {
+ __skb_pull(skb, skb_transport_offset(skb));
+
+ udp_post_segment_fix_csum(skb);
+ ret = udp_queue_rcv_one_skb(sk, skb);
+ if (ret > 0)
+ ip_protocol_deliver_rcu(dev_net(skb->dev), skb, ret);
+ }
+ return 0;
+}
+
+/* For TCP sockets, sk_rx_dst is protected by socket lock
+ * For UDP, we use xchg() to guard against concurrent changes.
+ */
+bool udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst)
+{
+ struct dst_entry *old;
+
+ if (dst_hold_safe(dst)) {
+ old = unrcu_pointer(xchg(&sk->sk_rx_dst, RCU_INITIALIZER(dst)));
+ dst_release(old);
+ return old != dst;
+ }
+ return false;
+}
+EXPORT_IPV6_MOD(udp_sk_rx_dst_set);
+
/*
* Multicasts and broadcasts go to each listener.
*
- * Note: called only from the BH handler context,
- * so we don't need to lock the hashes.
+ * Note: called only from the BH handler context.
*/
-static int __udp4_lib_mcast_deliver(struct sk_buff *skb,
+static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
struct udphdr *uh,
__be32 saddr, __be32 daddr,
- struct hlist_head udptable[])
+ struct udp_table *udptable,
+ int proto)
{
- struct sock *sk;
- int dif;
+ struct sock *sk, *first = NULL;
+ unsigned short hnum = ntohs(uh->dest);
+ struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum);
+ unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10);
+ unsigned int offset = offsetof(typeof(*sk), sk_node);
+ int dif = skb->dev->ifindex;
+ int sdif = inet_sdif(skb);
+ struct hlist_node *node;
+ struct sk_buff *nskb;
+
+ if (use_hash2) {
+ hash2_any = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum) &
+ udptable->mask;
+ hash2 = ipv4_portaddr_hash(net, daddr, hnum) & udptable->mask;
+start_lookup:
+ hslot = &udptable->hash2[hash2].hslot;
+ offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
+ }
- read_lock(&udp_hash_lock);
- sk = sk_head(&udptable[ntohs(uh->dest) & (UDP_HTABLE_SIZE - 1)]);
- dif = skb->dev->ifindex;
- sk = udp_v4_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif);
- if (sk) {
- struct sock *sknext = NULL;
+ sk_for_each_entry_offset_rcu(sk, node, &hslot->head, offset) {
+ if (!__udp_is_mcast_sock(net, sk, uh->dest, daddr,
+ uh->source, saddr, dif, sdif, hnum))
+ continue;
- do {
- struct sk_buff *skb1 = skb;
-
- sknext = udp_v4_mcast_next(sk_next(sk), uh->dest, daddr,
- uh->source, saddr, dif);
- if(sknext)
- skb1 = skb_clone(skb, GFP_ATOMIC);
-
- if(skb1) {
- int ret = udp_queue_rcv_skb(sk, skb1);
- if (ret > 0)
- /* we should probably re-process instead
- * of dropping packets here. */
- kfree_skb(skb1);
- }
- sk = sknext;
- } while(sknext);
- } else
+ if (!first) {
+ first = sk;
+ continue;
+ }
+ nskb = skb_clone(skb, GFP_ATOMIC);
+
+ if (unlikely(!nskb)) {
+ udp_drops_inc(sk);
+ __UDP_INC_STATS(net, UDP_MIB_RCVBUFERRORS,
+ IS_UDPLITE(sk));
+ __UDP_INC_STATS(net, UDP_MIB_INERRORS,
+ IS_UDPLITE(sk));
+ continue;
+ }
+ if (udp_queue_rcv_skb(sk, nskb) > 0)
+ consume_skb(nskb);
+ }
+
+ /* Also lookup *:port if we are using hash2 and haven't done so yet. */
+ if (use_hash2 && hash2 != hash2_any) {
+ hash2 = hash2_any;
+ goto start_lookup;
+ }
+
+ if (first) {
+ if (udp_queue_rcv_skb(first, skb) > 0)
+ consume_skb(skb);
+ } else {
kfree_skb(skb);
- read_unlock(&udp_hash_lock);
+ __UDP_INC_STATS(net, UDP_MIB_IGNOREDMULTI,
+ proto == IPPROTO_UDPLITE);
+ }
return 0;
}
/* Initialize UDP checksum. If exited with zero value (success),
* CHECKSUM_UNNECESSARY means, that no more checks are required.
- * Otherwise, csum completion requires chacksumming packet body,
+ * Otherwise, csum completion requires checksumming packet body,
* including udp header and folding it to skb->csum.
*/
-static inline void udp4_csum_init(struct sk_buff *skb, struct udphdr *uh)
-{
- if (uh->check == 0) {
- skb->ip_summed = CHECKSUM_UNNECESSARY;
- } else if (skb->ip_summed == CHECKSUM_COMPLETE) {
- if (!csum_tcpudp_magic(skb->nh.iph->saddr, skb->nh.iph->daddr,
- skb->len, IPPROTO_UDP, skb->csum ))
- skb->ip_summed = CHECKSUM_UNNECESSARY;
- }
- if (skb->ip_summed != CHECKSUM_UNNECESSARY)
- skb->csum = csum_tcpudp_nofold(skb->nh.iph->saddr,
- skb->nh.iph->daddr,
- skb->len, IPPROTO_UDP, 0);
- /* Probably, we should checksum udp header (it should be in cache
- * in any case) and data in tiny packets (< rx copybreak).
- */
+static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh,
+ int proto)
+{
+ int err;
- /* UDP = UDP-Lite with a non-partial checksum coverage */
UDP_SKB_CB(skb)->partial_cov = 0;
+ UDP_SKB_CB(skb)->cscov = skb->len;
+
+ if (proto == IPPROTO_UDPLITE) {
+ err = udplite_checksum_init(skb, uh);
+ if (err)
+ return err;
+
+ if (UDP_SKB_CB(skb)->partial_cov) {
+ skb->csum = inet_compute_pseudo(skb, proto);
+ return 0;
+ }
+ }
+
+ /* Note, we are only interested in != 0 or == 0, thus the
+ * force to int.
+ */
+ err = (__force int)skb_checksum_init_zero_check(skb, proto, uh->check,
+ inet_compute_pseudo);
+ if (err)
+ return err;
+
+ if (skb->ip_summed == CHECKSUM_COMPLETE && !skb->csum_valid) {
+ /* If SW calculated the value, we know it's bad */
+ if (skb->csum_complete_sw)
+ return 1;
+
+ /* HW says the value is bad. Let's validate that.
+ * skb->csum is no longer the full packet checksum,
+ * so don't treat it as such.
+ */
+ skb_checksum_complete_unset(skb);
+ }
+
+ return 0;
+}
+
+/* wrapper for udp_queue_rcv_skb taking care of csum conversion and
+ * return code conversion for ip layer consumption
+ */
+static int udp_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb,
+ struct udphdr *uh)
+{
+ int ret;
+
+ if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk))
+ skb_checksum_try_convert(skb, IPPROTO_UDP, inet_compute_pseudo);
+
+ ret = udp_queue_rcv_skb(sk, skb);
+
+ /* a return value > 0 means to resubmit the input, but
+ * it wants the return to be -protocol, or 0
+ */
+ if (ret > 0)
+ return -ret;
+ return 0;
}
/*
- * All we need to do is get the socket, and then do a checksum.
+ * All we need to do is get the socket, and then do a checksum.
*/
-
-int __udp4_lib_rcv(struct sk_buff *skb, struct hlist_head udptable[],
- int is_udplite)
+
+int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
+ int proto)
{
- struct sock *sk;
- struct udphdr *uh = skb->h.uh;
+ struct sock *sk = NULL;
+ struct udphdr *uh;
unsigned short ulen;
- struct rtable *rt = (struct rtable*)skb->dst;
- __be32 saddr = skb->nh.iph->saddr;
- __be32 daddr = skb->nh.iph->daddr;
+ struct rtable *rt = skb_rtable(skb);
+ __be32 saddr, daddr;
+ struct net *net = dev_net(skb->dev);
+ bool refcounted;
+ int drop_reason;
+
+ drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
/*
* Validate the packet.
@@ -1203,140 +2683,382 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct hlist_head udptable[],
if (!pskb_may_pull(skb, sizeof(struct udphdr)))
goto drop; /* No space for header. */
+ uh = udp_hdr(skb);
ulen = ntohs(uh->len);
+ saddr = ip_hdr(skb)->saddr;
+ daddr = ip_hdr(skb)->daddr;
+
if (ulen > skb->len)
goto short_packet;
- if(! is_udplite ) { /* UDP validates ulen. */
-
+ if (proto == IPPROTO_UDP) {
+ /* UDP validates ulen. */
if (ulen < sizeof(*uh) || pskb_trim_rcsum(skb, ulen))
goto short_packet;
-
- udp4_csum_init(skb, uh);
-
- } else { /* UDP-Lite validates cscov. */
- if (udplite4_csum_init(skb, uh))
- goto csum_error;
+ uh = udp_hdr(skb);
}
- if(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST))
- return __udp4_lib_mcast_deliver(skb, uh, saddr, daddr, udptable);
+ if (udp4_csum_init(skb, uh, proto))
+ goto csum_error;
- sk = __udp4_lib_lookup(saddr, uh->source, daddr, uh->dest,
- skb->dev->ifindex, udptable );
+ sk = inet_steal_sock(net, skb, sizeof(struct udphdr), saddr, uh->source, daddr, uh->dest,
+ &refcounted, udp_ehashfn);
+ if (IS_ERR(sk))
+ goto no_sk;
- if (sk != NULL) {
- int ret = udp_queue_rcv_skb(sk, skb);
- sock_put(sk);
+ if (sk) {
+ struct dst_entry *dst = skb_dst(skb);
+ int ret;
- /* a return value > 0 means to resubmit the input, but
- * it wants the return to be -protocol, or 0
- */
- if (ret > 0)
- return -ret;
- return 0;
+ if (unlikely(rcu_dereference(sk->sk_rx_dst) != dst))
+ udp_sk_rx_dst_set(sk, dst);
+
+ ret = udp_unicast_rcv_skb(sk, skb, uh);
+ if (refcounted)
+ sock_put(sk);
+ return ret;
}
+ if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST))
+ return __udp4_lib_mcast_deliver(net, skb, uh,
+ saddr, daddr, udptable, proto);
+
+ sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable);
+ if (sk)
+ return udp_unicast_rcv_skb(sk, skb, uh);
+no_sk:
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
goto drop;
- nf_reset(skb);
+ nf_reset_ct(skb);
/* No socket. Drop packet silently, if checksum is wrong */
if (udp_lib_checksum_complete(skb))
goto csum_error;
- UDP_INC_STATS_BH(UDP_MIB_NOPORTS, is_udplite);
+ drop_reason = SKB_DROP_REASON_NO_SOCKET;
+ __UDP_INC_STATS(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE);
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
/*
* Hmm. We got an UDP packet to a port to which we
* don't wanna listen. Ignore it.
*/
- kfree_skb(skb);
- return(0);
+ sk_skb_reason_drop(sk, skb, drop_reason);
+ return 0;
short_packet:
- LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: short packet: From %u.%u.%u.%u:%u %d/%d to %u.%u.%u.%u:%u\n",
- is_udplite? "-Lite" : "",
- NIPQUAD(saddr),
- ntohs(uh->source),
- ulen,
- skb->len,
- NIPQUAD(daddr),
- ntohs(uh->dest));
+ drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
+ net_dbg_ratelimited("UDP%s: short packet: From %pI4:%u %d/%d to %pI4:%u\n",
+ proto == IPPROTO_UDPLITE ? "Lite" : "",
+ &saddr, ntohs(uh->source),
+ ulen, skb->len,
+ &daddr, ntohs(uh->dest));
goto drop;
csum_error:
- /*
- * RFC1122: OK. Discards the bad packet silently (as far as
- * the network is concerned, anyway) as per 4.1.3.4 (MUST).
+ /*
+ * RFC1122: OK. Discards the bad packet silently (as far as
+ * the network is concerned, anyway) as per 4.1.3.4 (MUST).
*/
- LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: bad checksum. From %d.%d.%d.%d:%d to %d.%d.%d.%d:%d ulen %d\n",
- is_udplite? "-Lite" : "",
- NIPQUAD(saddr),
- ntohs(uh->source),
- NIPQUAD(daddr),
- ntohs(uh->dest),
- ulen);
+ drop_reason = SKB_DROP_REASON_UDP_CSUM;
+ net_dbg_ratelimited("UDP%s: bad checksum. From %pI4:%u to %pI4:%u ulen %d\n",
+ proto == IPPROTO_UDPLITE ? "Lite" : "",
+ &saddr, ntohs(uh->source), &daddr, ntohs(uh->dest),
+ ulen);
+ __UDP_INC_STATS(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE);
drop:
- UDP_INC_STATS_BH(UDP_MIB_INERRORS, is_udplite);
- kfree_skb(skb);
- return(0);
+ __UDP_INC_STATS(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE);
+ sk_skb_reason_drop(sk, skb, drop_reason);
+ return 0;
}
-__inline__ int udp_rcv(struct sk_buff *skb)
+/* We can only early demux multicast if there is a single matching socket.
+ * If more than one socket found returns NULL
+ */
+static struct sock *__udp4_lib_mcast_demux_lookup(struct net *net,
+ __be16 loc_port, __be32 loc_addr,
+ __be16 rmt_port, __be32 rmt_addr,
+ int dif, int sdif)
{
- return __udp4_lib_rcv(skb, udp_hash, 0);
+ struct udp_table *udptable = net->ipv4.udp_table;
+ unsigned short hnum = ntohs(loc_port);
+ struct sock *sk, *result;
+ struct udp_hslot *hslot;
+ unsigned int slot;
+
+ slot = udp_hashfn(net, hnum, udptable->mask);
+ hslot = &udptable->hash[slot];
+
+ /* Do not bother scanning a too big list */
+ if (hslot->count > 10)
+ return NULL;
+
+ result = NULL;
+ sk_for_each_rcu(sk, &hslot->head) {
+ if (__udp_is_mcast_sock(net, sk, loc_port, loc_addr,
+ rmt_port, rmt_addr, dif, sdif, hnum)) {
+ if (result)
+ return NULL;
+ result = sk;
+ }
+ }
+
+ return result;
}
-int udp_destroy_sock(struct sock *sk)
+/* For unicast we should only early demux connected sockets or we can
+ * break forwarding setups. The chains here can be long so only check
+ * if the first socket is an exact match and if not move on.
+ */
+static struct sock *__udp4_lib_demux_lookup(struct net *net,
+ __be16 loc_port, __be32 loc_addr,
+ __be16 rmt_port, __be32 rmt_addr,
+ int dif, int sdif)
{
- lock_sock(sk);
+ struct udp_table *udptable = net->ipv4.udp_table;
+ INET_ADDR_COOKIE(acookie, rmt_addr, loc_addr);
+ unsigned short hnum = ntohs(loc_port);
+ struct udp_hslot *hslot2;
+ unsigned int hash2;
+ __portpair ports;
+ struct sock *sk;
+
+ hash2 = ipv4_portaddr_hash(net, loc_addr, hnum);
+ hslot2 = udp_hashslot2(udptable, hash2);
+ ports = INET_COMBINED_PORTS(rmt_port, hnum);
+
+ udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
+ if (inet_match(net, sk, acookie, ports, dif, sdif))
+ return sk;
+ /* Only check first socket in chain */
+ break;
+ }
+ return NULL;
+}
+
+enum skb_drop_reason udp_v4_early_demux(struct sk_buff *skb)
+{
+ struct net *net = dev_net(skb->dev);
+ struct in_device *in_dev = NULL;
+ const struct iphdr *iph;
+ const struct udphdr *uh;
+ struct sock *sk = NULL;
+ struct dst_entry *dst;
+ int dif = skb->dev->ifindex;
+ int sdif = inet_sdif(skb);
+ int ours;
+
+ /* validate the packet */
+ if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct udphdr)))
+ return SKB_NOT_DROPPED_YET;
+
+ iph = ip_hdr(skb);
+ uh = udp_hdr(skb);
+
+ if (skb->pkt_type == PACKET_MULTICAST) {
+ in_dev = __in_dev_get_rcu(skb->dev);
+
+ if (!in_dev)
+ return SKB_NOT_DROPPED_YET;
+
+ ours = ip_check_mc_rcu(in_dev, iph->daddr, iph->saddr,
+ iph->protocol);
+ if (!ours)
+ return SKB_NOT_DROPPED_YET;
+
+ sk = __udp4_lib_mcast_demux_lookup(net, uh->dest, iph->daddr,
+ uh->source, iph->saddr,
+ dif, sdif);
+ } else if (skb->pkt_type == PACKET_HOST) {
+ sk = __udp4_lib_demux_lookup(net, uh->dest, iph->daddr,
+ uh->source, iph->saddr, dif, sdif);
+ }
+
+ if (!sk)
+ return SKB_NOT_DROPPED_YET;
+
+ skb->sk = sk;
+ DEBUG_NET_WARN_ON_ONCE(sk_is_refcounted(sk));
+ skb->destructor = sock_pfree;
+ dst = rcu_dereference(sk->sk_rx_dst);
+
+ if (dst)
+ dst = dst_check(dst, 0);
+ if (dst) {
+ u32 itag = 0;
+
+ /* set noref for now.
+ * any place which wants to hold dst has to call
+ * dst_hold_safe()
+ */
+ skb_dst_set_noref(skb, dst);
+
+ /* for unconnected multicast sockets we need to validate
+ * the source on each packet
+ */
+ if (!inet_sk(sk)->inet_daddr && in_dev)
+ return ip_mc_validate_source(skb, iph->daddr,
+ iph->saddr,
+ ip4h_dscp(iph),
+ skb->dev, in_dev, &itag);
+ }
+ return SKB_NOT_DROPPED_YET;
+}
+
+int udp_rcv(struct sk_buff *skb)
+{
+ return __udp4_lib_rcv(skb, dev_net(skb->dev)->ipv4.udp_table, IPPROTO_UDP);
+}
+
+void udp_destroy_sock(struct sock *sk)
+{
+ struct udp_sock *up = udp_sk(sk);
+ bool slow = lock_sock_fast(sk);
+
+ /* protects from races with udp_abort() */
+ sock_set_flag(sk, SOCK_DEAD);
udp_flush_pending_frames(sk);
- release_sock(sk);
- return 0;
+ unlock_sock_fast(sk, slow);
+ if (static_branch_unlikely(&udp_encap_needed_key)) {
+ if (up->encap_type) {
+ void (*encap_destroy)(struct sock *sk);
+ encap_destroy = READ_ONCE(up->encap_destroy);
+ if (encap_destroy)
+ encap_destroy(sk);
+ }
+ if (udp_test_bit(ENCAP_ENABLED, sk)) {
+ static_branch_dec(&udp_encap_needed_key);
+ udp_tunnel_cleanup_gro(sk);
+ }
+ }
+}
+
+typedef struct sk_buff *(*udp_gro_receive_t)(struct sock *sk,
+ struct list_head *head,
+ struct sk_buff *skb);
+
+static void set_xfrm_gro_udp_encap_rcv(__u16 encap_type, unsigned short family,
+ struct sock *sk)
+{
+#ifdef CONFIG_XFRM
+ udp_gro_receive_t new_gro_receive;
+
+ if (udp_test_bit(GRO_ENABLED, sk) && encap_type == UDP_ENCAP_ESPINUDP) {
+ if (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6)
+ new_gro_receive = ipv6_stub->xfrm6_gro_udp_encap_rcv;
+ else
+ new_gro_receive = xfrm4_gro_udp_encap_rcv;
+
+ if (udp_sk(sk)->gro_receive != new_gro_receive) {
+ /*
+ * With IPV6_ADDRFORM the gro callback could change
+ * after being set, unregister the old one, if valid.
+ */
+ if (udp_sk(sk)->gro_receive)
+ udp_tunnel_update_gro_rcv(sk, false);
+
+ WRITE_ONCE(udp_sk(sk)->gro_receive, new_gro_receive);
+ udp_tunnel_update_gro_rcv(sk, true);
+ }
+ }
+#endif
}
/*
* Socket option code for UDP
*/
int udp_lib_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int optlen,
+ sockptr_t optval, unsigned int optlen,
int (*push_pending_frames)(struct sock *))
{
struct udp_sock *up = udp_sk(sk);
- int val;
+ int val, valbool;
int err = 0;
+ int is_udplite = IS_UDPLITE(sk);
+
+ if (level == SOL_SOCKET) {
+ err = sk_setsockopt(sk, level, optname, optval, optlen);
+
+ if (optname == SO_RCVBUF || optname == SO_RCVBUFFORCE) {
+ sockopt_lock_sock(sk);
+ /* paired with READ_ONCE in udp_rmem_release() */
+ WRITE_ONCE(up->forward_threshold, sk->sk_rcvbuf >> 2);
+ sockopt_release_sock(sk);
+ }
+ return err;
+ }
- if(optlen<sizeof(int))
+ if (optlen < sizeof(int))
return -EINVAL;
- if (get_user(val, (int __user *)optval))
+ if (copy_from_sockptr(&val, optval, sizeof(val)))
return -EFAULT;
- switch(optname) {
+ valbool = val ? 1 : 0;
+
+ switch (optname) {
case UDP_CORK:
if (val != 0) {
- up->corkflag = 1;
+ udp_set_bit(CORK, sk);
} else {
- up->corkflag = 0;
+ udp_clear_bit(CORK, sk);
lock_sock(sk);
- (*push_pending_frames)(sk);
+ push_pending_frames(sk);
release_sock(sk);
}
break;
-
+
case UDP_ENCAP:
+ sockopt_lock_sock(sk);
switch (val) {
case 0:
+#ifdef CONFIG_XFRM
case UDP_ENCAP_ESPINUDP:
- case UDP_ENCAP_ESPINUDP_NON_IKE:
- up->encap_type = val;
+ set_xfrm_gro_udp_encap_rcv(val, sk->sk_family, sk);
+#if IS_ENABLED(CONFIG_IPV6)
+ if (sk->sk_family == AF_INET6)
+ WRITE_ONCE(up->encap_rcv,
+ ipv6_stub->xfrm6_udp_encap_rcv);
+ else
+#endif
+ WRITE_ONCE(up->encap_rcv,
+ xfrm4_udp_encap_rcv);
+#endif
+ fallthrough;
+ case UDP_ENCAP_L2TPINUDP:
+ WRITE_ONCE(up->encap_type, val);
+ udp_tunnel_encap_enable(sk);
break;
default:
err = -ENOPROTOOPT;
break;
}
+ sockopt_release_sock(sk);
+ break;
+
+ case UDP_NO_CHECK6_TX:
+ udp_set_no_check6_tx(sk, valbool);
+ break;
+
+ case UDP_NO_CHECK6_RX:
+ udp_set_no_check6_rx(sk, valbool);
+ break;
+
+ case UDP_SEGMENT:
+ if (val < 0 || val > USHRT_MAX)
+ return -EINVAL;
+ WRITE_ONCE(up->gso_size, val);
+ break;
+
+ case UDP_GRO:
+ sockopt_lock_sock(sk);
+ /* when enabling GRO, accept the related GSO packet type */
+ if (valbool)
+ udp_tunnel_encap_enable(sk);
+ udp_assign_bit(GRO_ENABLED, sk, valbool);
+ udp_assign_bit(ACCEPT_L4, sk, valbool);
+ set_xfrm_gro_udp_encap_rcv(up->encap_type, sk->sk_family, sk);
+ sockopt_release_sock(sk);
break;
/*
@@ -1345,97 +3067,109 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
/* The sender sets actual checksum coverage length via this option.
* The case coverage > packet length is handled by send module. */
case UDPLITE_SEND_CSCOV:
- if (!up->pcflag) /* Disable the option on UDP sockets */
+ if (!is_udplite) /* Disable the option on UDP sockets */
return -ENOPROTOOPT;
if (val != 0 && val < 8) /* Illegal coverage: use default (8) */
val = 8;
- up->pcslen = val;
- up->pcflag |= UDPLITE_SEND_CC;
+ else if (val > USHRT_MAX)
+ val = USHRT_MAX;
+ WRITE_ONCE(up->pcslen, val);
+ udp_set_bit(UDPLITE_SEND_CC, sk);
break;
- /* The receiver specifies a minimum checksum coverage value. To make
- * sense, this should be set to at least 8 (as done below). If zero is
+ /* The receiver specifies a minimum checksum coverage value. To make
+ * sense, this should be set to at least 8 (as done below). If zero is
* used, this again means full checksum coverage. */
case UDPLITE_RECV_CSCOV:
- if (!up->pcflag) /* Disable the option on UDP sockets */
+ if (!is_udplite) /* Disable the option on UDP sockets */
return -ENOPROTOOPT;
if (val != 0 && val < 8) /* Avoid silly minimal values. */
val = 8;
- up->pcrlen = val;
- up->pcflag |= UDPLITE_RECV_CC;
+ else if (val > USHRT_MAX)
+ val = USHRT_MAX;
+ WRITE_ONCE(up->pcrlen, val);
+ udp_set_bit(UDPLITE_RECV_CC, sk);
break;
default:
err = -ENOPROTOOPT;
break;
- };
+ }
return err;
}
+EXPORT_IPV6_MOD(udp_lib_setsockopt);
-int udp_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int optlen)
+int udp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
+ unsigned int optlen)
{
- if (level == SOL_UDP || level == SOL_UDPLITE)
- return udp_lib_setsockopt(sk, level, optname, optval, optlen,
+ if (level == SOL_UDP || level == SOL_UDPLITE || level == SOL_SOCKET)
+ return udp_lib_setsockopt(sk, level, optname,
+ optval, optlen,
udp_push_pending_frames);
return ip_setsockopt(sk, level, optname, optval, optlen);
}
-#ifdef CONFIG_COMPAT
-int compat_udp_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int optlen)
-{
- if (level == SOL_UDP || level == SOL_UDPLITE)
- return udp_lib_setsockopt(sk, level, optname, optval, optlen,
- udp_push_pending_frames);
- return compat_ip_setsockopt(sk, level, optname, optval, optlen);
-}
-#endif
-
int udp_lib_getsockopt(struct sock *sk, int level, int optname,
char __user *optval, int __user *optlen)
{
struct udp_sock *up = udp_sk(sk);
int val, len;
- if(get_user(len,optlen))
+ if (get_user(len, optlen))
return -EFAULT;
- len = min_t(unsigned int, len, sizeof(int));
-
- if(len < 0)
+ if (len < 0)
return -EINVAL;
- switch(optname) {
+ len = min_t(unsigned int, len, sizeof(int));
+
+ switch (optname) {
case UDP_CORK:
- val = up->corkflag;
+ val = udp_test_bit(CORK, sk);
break;
case UDP_ENCAP:
- val = up->encap_type;
+ val = READ_ONCE(up->encap_type);
+ break;
+
+ case UDP_NO_CHECK6_TX:
+ val = udp_get_no_check6_tx(sk);
+ break;
+
+ case UDP_NO_CHECK6_RX:
+ val = udp_get_no_check6_rx(sk);
+ break;
+
+ case UDP_SEGMENT:
+ val = READ_ONCE(up->gso_size);
+ break;
+
+ case UDP_GRO:
+ val = udp_test_bit(GRO_ENABLED, sk);
break;
/* The following two cannot be changed on UDP sockets, the return is
* always 0 (which corresponds to the full checksum coverage of UDP). */
case UDPLITE_SEND_CSCOV:
- val = up->pcslen;
+ val = READ_ONCE(up->pcslen);
break;
case UDPLITE_RECV_CSCOV:
- val = up->pcrlen;
+ val = READ_ONCE(up->pcrlen);
break;
default:
return -ENOPROTOOPT;
- };
+ }
- if(put_user(len, optlen))
- return -EFAULT;
- if(copy_to_user(optval, &val,len))
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, &val, len))
return -EFAULT;
- return 0;
+ return 0;
}
+EXPORT_IPV6_MOD(udp_lib_getsockopt);
int udp_getsockopt(struct sock *sk, int level, int optname,
char __user *optval, int __user *optlen)
@@ -1445,101 +3179,151 @@ int udp_getsockopt(struct sock *sk, int level, int optname,
return ip_getsockopt(sk, level, optname, optval, optlen);
}
-#ifdef CONFIG_COMPAT
-int compat_udp_getsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int __user *optlen)
-{
- if (level == SOL_UDP || level == SOL_UDPLITE)
- return udp_lib_getsockopt(sk, level, optname, optval, optlen);
- return compat_ip_getsockopt(sk, level, optname, optval, optlen);
-}
-#endif
/**
* udp_poll - wait for a UDP event.
- * @file - file struct
- * @sock - socket
- * @wait - poll table
+ * @file: - file struct
+ * @sock: - socket
+ * @wait: - poll table
*
- * This is same as datagram poll, except for the special case of
+ * This is same as datagram poll, except for the special case of
* blocking sockets. If application is using a blocking fd
* and a packet with checksum error is in the queue;
* then it could get return from select indicating data available
* but then block when reading it. Add special case code
* to work around these arguably broken applications.
*/
-unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait)
+__poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait)
{
- unsigned int mask = datagram_poll(file, sock, wait);
+ __poll_t mask = datagram_poll(file, sock, wait);
struct sock *sk = sock->sk;
- int is_lite = IS_UDPLITE(sk);
+
+ if (!skb_queue_empty_lockless(&udp_sk(sk)->reader_queue))
+ mask |= EPOLLIN | EPOLLRDNORM;
/* Check for false positives due to checksum errors */
- if ( (mask & POLLRDNORM) &&
- !(file->f_flags & O_NONBLOCK) &&
- !(sk->sk_shutdown & RCV_SHUTDOWN)){
- struct sk_buff_head *rcvq = &sk->sk_receive_queue;
- struct sk_buff *skb;
+ if ((mask & EPOLLRDNORM) && !(file->f_flags & O_NONBLOCK) &&
+ !(sk->sk_shutdown & RCV_SHUTDOWN) && first_packet_length(sk) == -1)
+ mask &= ~(EPOLLIN | EPOLLRDNORM);
- spin_lock_bh(&rcvq->lock);
- while ((skb = skb_peek(rcvq)) != NULL) {
- if (udp_lib_checksum_complete(skb)) {
- UDP_INC_STATS_BH(UDP_MIB_INERRORS, is_lite);
- __skb_unlink(skb, rcvq);
- kfree_skb(skb);
- } else {
- skb->ip_summed = CHECKSUM_UNNECESSARY;
- break;
- }
- }
- spin_unlock_bh(&rcvq->lock);
+ /* psock ingress_msg queue should not contain any bad checksum frames */
+ if (sk_is_readable(sk))
+ mask |= EPOLLIN | EPOLLRDNORM;
+ return mask;
- /* nothing to see, move along */
- if (skb == NULL)
- mask &= ~(POLLIN | POLLRDNORM);
- }
+}
+EXPORT_IPV6_MOD(udp_poll);
- return mask;
-
+int udp_abort(struct sock *sk, int err)
+{
+ if (!has_current_bpf_ctx())
+ lock_sock(sk);
+
+ /* udp{v6}_destroy_sock() sets it under the sk lock, avoid racing
+ * with close()
+ */
+ if (sock_flag(sk, SOCK_DEAD))
+ goto out;
+
+ sk->sk_err = err;
+ sk_error_report(sk);
+ __udp_disconnect(sk, 0);
+
+out:
+ if (!has_current_bpf_ctx())
+ release_sock(sk);
+
+ return 0;
}
+EXPORT_IPV6_MOD_GPL(udp_abort);
struct proto udp_prot = {
- .name = "UDP",
- .owner = THIS_MODULE,
- .close = udp_lib_close,
- .connect = ip4_datagram_connect,
- .disconnect = udp_disconnect,
- .ioctl = udp_ioctl,
- .destroy = udp_destroy_sock,
- .setsockopt = udp_setsockopt,
- .getsockopt = udp_getsockopt,
- .sendmsg = udp_sendmsg,
- .recvmsg = udp_recvmsg,
- .sendpage = udp_sendpage,
- .backlog_rcv = udp_queue_rcv_skb,
- .hash = udp_lib_hash,
- .unhash = udp_lib_unhash,
- .get_port = udp_v4_get_port,
- .obj_size = sizeof(struct udp_sock),
-#ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_udp_setsockopt,
- .compat_getsockopt = compat_udp_getsockopt,
+ .name = "UDP",
+ .owner = THIS_MODULE,
+ .close = udp_lib_close,
+ .pre_connect = udp_pre_connect,
+ .connect = udp_connect,
+ .disconnect = udp_disconnect,
+ .ioctl = udp_ioctl,
+ .init = udp_init_sock,
+ .destroy = udp_destroy_sock,
+ .setsockopt = udp_setsockopt,
+ .getsockopt = udp_getsockopt,
+ .sendmsg = udp_sendmsg,
+ .recvmsg = udp_recvmsg,
+ .splice_eof = udp_splice_eof,
+ .release_cb = ip4_datagram_release_cb,
+ .hash = udp_lib_hash,
+ .unhash = udp_lib_unhash,
+ .rehash = udp_v4_rehash,
+ .get_port = udp_v4_get_port,
+ .put_port = udp_lib_unhash,
+#ifdef CONFIG_BPF_SYSCALL
+ .psock_update_sk_prot = udp_bpf_update_proto,
#endif
+ .memory_allocated = &net_aligned_data.udp_memory_allocated,
+ .per_cpu_fw_alloc = &udp_memory_per_cpu_fw_alloc,
+
+ .sysctl_mem = sysctl_udp_mem,
+ .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min),
+ .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_udp_rmem_min),
+ .obj_size = sizeof(struct udp_sock),
+ .h.udp_table = NULL,
+ .diag_destroy = udp_abort,
};
+EXPORT_SYMBOL(udp_prot);
/* ------------------------------------------------------------------------ */
#ifdef CONFIG_PROC_FS
-static struct sock *udp_get_first(struct seq_file *seq)
+static unsigned short seq_file_family(const struct seq_file *seq);
+static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
+{
+ unsigned short family = seq_file_family(seq);
+
+ /* AF_UNSPEC is used as a match all */
+ return ((family == AF_UNSPEC || family == sk->sk_family) &&
+ net_eq(sock_net(sk), seq_file_net(seq)));
+}
+
+#ifdef CONFIG_BPF_SYSCALL
+static const struct seq_operations bpf_iter_udp_seq_ops;
+#endif
+static struct udp_table *udp_get_table_seq(struct seq_file *seq,
+ struct net *net)
+{
+ const struct udp_seq_afinfo *afinfo;
+
+#ifdef CONFIG_BPF_SYSCALL
+ if (seq->op == &bpf_iter_udp_seq_ops)
+ return net->ipv4.udp_table;
+#endif
+
+ afinfo = pde_data(file_inode(seq->file));
+ return afinfo->udp_table ? : net->ipv4.udp_table;
+}
+
+static struct sock *udp_get_first(struct seq_file *seq, int start)
{
- struct sock *sk;
struct udp_iter_state *state = seq->private;
+ struct net *net = seq_file_net(seq);
+ struct udp_table *udptable;
+ struct sock *sk;
+
+ udptable = udp_get_table_seq(seq, net);
+
+ for (state->bucket = start; state->bucket <= udptable->mask;
+ ++state->bucket) {
+ struct udp_hslot *hslot = &udptable->hash[state->bucket];
+
+ if (hlist_empty(&hslot->head))
+ continue;
- for (state->bucket = 0; state->bucket < UDP_HTABLE_SIZE; ++state->bucket) {
- struct hlist_node *node;
- sk_for_each(sk, node, state->hashtable + state->bucket) {
- if (sk->sk_family == state->family)
+ spin_lock_bh(&hslot->lock);
+ sk_for_each(sk, &hslot->head) {
+ if (seq_sk_match(seq, sk))
goto found;
}
+ spin_unlock_bh(&hslot->lock);
}
sk = NULL;
found:
@@ -1549,41 +3333,48 @@ found:
static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk)
{
struct udp_iter_state *state = seq->private;
+ struct net *net = seq_file_net(seq);
+ struct udp_table *udptable;
do {
sk = sk_next(sk);
-try_again:
- ;
- } while (sk && sk->sk_family != state->family);
+ } while (sk && !seq_sk_match(seq, sk));
- if (!sk && ++state->bucket < UDP_HTABLE_SIZE) {
- sk = sk_head(state->hashtable + state->bucket);
- goto try_again;
+ if (!sk) {
+ udptable = udp_get_table_seq(seq, net);
+
+ if (state->bucket <= udptable->mask)
+ spin_unlock_bh(&udptable->hash[state->bucket].lock);
+
+ return udp_get_first(seq, state->bucket + 1);
}
return sk;
}
static struct sock *udp_get_idx(struct seq_file *seq, loff_t pos)
{
- struct sock *sk = udp_get_first(seq);
+ struct sock *sk = udp_get_first(seq, 0);
if (sk)
- while(pos && (sk = udp_get_next(seq, sk)) != NULL)
+ while (pos && (sk = udp_get_next(seq, sk)) != NULL)
--pos;
return pos ? NULL : sk;
}
-static void *udp_seq_start(struct seq_file *seq, loff_t *pos)
+void *udp_seq_start(struct seq_file *seq, loff_t *pos)
{
- read_lock(&udp_hash_lock);
- return *pos ? udp_get_idx(seq, *pos-1) : (void *)1;
+ struct udp_iter_state *state = seq->private;
+ state->bucket = MAX_UDP_PORTS;
+
+ return *pos ? udp_get_idx(seq, *pos-1) : SEQ_START_TOKEN;
}
+EXPORT_IPV6_MOD(udp_seq_start);
-static void *udp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+void *udp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct sock *sk;
- if (v == (void *)1)
+ if (v == SEQ_START_TOKEN)
sk = udp_get_idx(seq, 0);
else
sk = udp_get_next(seq, v);
@@ -1591,140 +3382,659 @@ static void *udp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
++*pos;
return sk;
}
+EXPORT_IPV6_MOD(udp_seq_next);
-static void udp_seq_stop(struct seq_file *seq, void *v)
+void udp_seq_stop(struct seq_file *seq, void *v)
{
- read_unlock(&udp_hash_lock);
+ struct udp_iter_state *state = seq->private;
+ struct udp_table *udptable;
+
+ udptable = udp_get_table_seq(seq, seq_file_net(seq));
+
+ if (state->bucket <= udptable->mask)
+ spin_unlock_bh(&udptable->hash[state->bucket].lock);
}
+EXPORT_IPV6_MOD(udp_seq_stop);
-static int udp_seq_open(struct inode *inode, struct file *file)
+/* ------------------------------------------------------------------------ */
+static void udp4_format_sock(struct sock *sp, struct seq_file *f,
+ int bucket)
{
- struct udp_seq_afinfo *afinfo = PDE(inode)->data;
- struct seq_file *seq;
- int rc = -ENOMEM;
- struct udp_iter_state *s = kzalloc(sizeof(*s), GFP_KERNEL);
+ struct inet_sock *inet = inet_sk(sp);
+ __be32 dest = inet->inet_daddr;
+ __be32 src = inet->inet_rcv_saddr;
+ __u16 destp = ntohs(inet->inet_dport);
+ __u16 srcp = ntohs(inet->inet_sport);
+
+ seq_printf(f, "%5d: %08X:%04X %08X:%04X"
+ " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %u",
+ bucket, src, srcp, dest, destp, sp->sk_state,
+ sk_wmem_alloc_get(sp),
+ udp_rqueue_get(sp),
+ 0, 0L, 0,
+ from_kuid_munged(seq_user_ns(f), sk_uid(sp)),
+ 0, sock_i_ino(sp),
+ refcount_read(&sp->sk_refcnt), sp,
+ sk_drops_read(sp));
+}
- if (!s)
- goto out;
- s->family = afinfo->family;
- s->hashtable = afinfo->hashtable;
- s->seq_ops.start = udp_seq_start;
- s->seq_ops.next = udp_seq_next;
- s->seq_ops.show = afinfo->seq_show;
- s->seq_ops.stop = udp_seq_stop;
-
- rc = seq_open(file, &s->seq_ops);
- if (rc)
- goto out_kfree;
-
- seq = file->private_data;
- seq->private = s;
-out:
- return rc;
-out_kfree:
- kfree(s);
- goto out;
+int udp4_seq_show(struct seq_file *seq, void *v)
+{
+ seq_setwidth(seq, 127);
+ if (v == SEQ_START_TOKEN)
+ seq_puts(seq, " sl local_address rem_address st tx_queue "
+ "rx_queue tr tm->when retrnsmt uid timeout "
+ "inode ref pointer drops");
+ else {
+ struct udp_iter_state *state = seq->private;
+
+ udp4_format_sock(v, seq, state->bucket);
+ }
+ seq_pad(seq, '\n');
+ return 0;
}
-/* ------------------------------------------------------------------------ */
-int udp_proc_register(struct udp_seq_afinfo *afinfo)
+#ifdef CONFIG_BPF_SYSCALL
+struct bpf_iter__udp {
+ __bpf_md_ptr(struct bpf_iter_meta *, meta);
+ __bpf_md_ptr(struct udp_sock *, udp_sk);
+ uid_t uid __aligned(8);
+ int bucket __aligned(8);
+};
+
+union bpf_udp_iter_batch_item {
+ struct sock *sk;
+ __u64 cookie;
+};
+
+struct bpf_udp_iter_state {
+ struct udp_iter_state state;
+ unsigned int cur_sk;
+ unsigned int end_sk;
+ unsigned int max_sk;
+ union bpf_udp_iter_batch_item *batch;
+};
+
+static int bpf_iter_udp_realloc_batch(struct bpf_udp_iter_state *iter,
+ unsigned int new_batch_sz, gfp_t flags);
+static struct sock *bpf_iter_udp_resume(struct sock *first_sk,
+ union bpf_udp_iter_batch_item *cookies,
+ int n_cookies)
{
- struct proc_dir_entry *p;
- int rc = 0;
+ struct sock *sk = NULL;
+ int i;
+
+ for (i = 0; i < n_cookies; i++) {
+ sk = first_sk;
+ udp_portaddr_for_each_entry_from(sk)
+ if (cookies[i].cookie == atomic64_read(&sk->sk_cookie))
+ goto done;
+ }
+done:
+ return sk;
+}
- if (!afinfo)
- return -EINVAL;
- afinfo->seq_fops->owner = afinfo->owner;
- afinfo->seq_fops->open = udp_seq_open;
- afinfo->seq_fops->read = seq_read;
- afinfo->seq_fops->llseek = seq_lseek;
- afinfo->seq_fops->release = seq_release_private;
-
- p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
- if (p)
- p->data = afinfo;
- else
- rc = -ENOMEM;
- return rc;
+static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
+{
+ struct bpf_udp_iter_state *iter = seq->private;
+ struct udp_iter_state *state = &iter->state;
+ unsigned int find_cookie, end_cookie;
+ struct net *net = seq_file_net(seq);
+ struct udp_table *udptable;
+ unsigned int batch_sks = 0;
+ int resume_bucket;
+ int resizes = 0;
+ struct sock *sk;
+ int err = 0;
+
+ resume_bucket = state->bucket;
+
+ /* The current batch is done, so advance the bucket. */
+ if (iter->cur_sk == iter->end_sk)
+ state->bucket++;
+
+ udptable = udp_get_table_seq(seq, net);
+
+again:
+ /* New batch for the next bucket.
+ * Iterate over the hash table to find a bucket with sockets matching
+ * the iterator attributes, and return the first matching socket from
+ * the bucket. The remaining matched sockets from the bucket are batched
+ * before releasing the bucket lock. This allows BPF programs that are
+ * called in seq_show to acquire the bucket lock if needed.
+ */
+ find_cookie = iter->cur_sk;
+ end_cookie = iter->end_sk;
+ iter->cur_sk = 0;
+ iter->end_sk = 0;
+ batch_sks = 0;
+
+ for (; state->bucket <= udptable->mask; state->bucket++) {
+ struct udp_hslot *hslot2 = &udptable->hash2[state->bucket].hslot;
+
+ if (hlist_empty(&hslot2->head))
+ goto next_bucket;
+
+ spin_lock_bh(&hslot2->lock);
+ sk = hlist_entry_safe(hslot2->head.first, struct sock,
+ __sk_common.skc_portaddr_node);
+ /* Resume from the first (in iteration order) unseen socket from
+ * the last batch that still exists in resume_bucket. Most of
+ * the time this will just be where the last iteration left off
+ * in resume_bucket unless that socket disappeared between
+ * reads.
+ */
+ if (state->bucket == resume_bucket)
+ sk = bpf_iter_udp_resume(sk, &iter->batch[find_cookie],
+ end_cookie - find_cookie);
+fill_batch:
+ udp_portaddr_for_each_entry_from(sk) {
+ if (seq_sk_match(seq, sk)) {
+ if (iter->end_sk < iter->max_sk) {
+ sock_hold(sk);
+ iter->batch[iter->end_sk++].sk = sk;
+ }
+ batch_sks++;
+ }
+ }
+
+ /* Allocate a larger batch and try again. */
+ if (unlikely(resizes <= 1 && iter->end_sk &&
+ iter->end_sk != batch_sks)) {
+ resizes++;
+
+ /* First, try with GFP_USER to maximize the chances of
+ * grabbing more memory.
+ */
+ if (resizes == 1) {
+ spin_unlock_bh(&hslot2->lock);
+ err = bpf_iter_udp_realloc_batch(iter,
+ batch_sks * 3 / 2,
+ GFP_USER);
+ if (err)
+ return ERR_PTR(err);
+ /* Start over. */
+ goto again;
+ }
+
+ /* Next, hold onto the lock, so the bucket doesn't
+ * change while we get the rest of the sockets.
+ */
+ err = bpf_iter_udp_realloc_batch(iter, batch_sks,
+ GFP_NOWAIT);
+ if (err) {
+ spin_unlock_bh(&hslot2->lock);
+ return ERR_PTR(err);
+ }
+
+ /* Pick up where we left off. */
+ sk = iter->batch[iter->end_sk - 1].sk;
+ sk = hlist_entry_safe(sk->__sk_common.skc_portaddr_node.next,
+ struct sock,
+ __sk_common.skc_portaddr_node);
+ batch_sks = iter->end_sk;
+ goto fill_batch;
+ }
+
+ spin_unlock_bh(&hslot2->lock);
+
+ if (iter->end_sk)
+ break;
+next_bucket:
+ resizes = 0;
+ }
+
+ WARN_ON_ONCE(iter->end_sk != batch_sks);
+ return iter->end_sk ? iter->batch[0].sk : NULL;
}
-void udp_proc_unregister(struct udp_seq_afinfo *afinfo)
+static void *bpf_iter_udp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
- if (!afinfo)
- return;
- proc_net_remove(afinfo->name);
- memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
+ struct bpf_udp_iter_state *iter = seq->private;
+ struct sock *sk;
+
+ /* Whenever seq_next() is called, the iter->cur_sk is
+ * done with seq_show(), so unref the iter->cur_sk.
+ */
+ if (iter->cur_sk < iter->end_sk)
+ sock_put(iter->batch[iter->cur_sk++].sk);
+
+ /* After updating iter->cur_sk, check if there are more sockets
+ * available in the current bucket batch.
+ */
+ if (iter->cur_sk < iter->end_sk)
+ sk = iter->batch[iter->cur_sk].sk;
+ else
+ /* Prepare a new batch. */
+ sk = bpf_iter_udp_batch(seq);
+
+ ++*pos;
+ return sk;
}
-/* ------------------------------------------------------------------------ */
-static void udp4_format_sock(struct sock *sp, char *tmpbuf, int bucket)
+static void *bpf_iter_udp_seq_start(struct seq_file *seq, loff_t *pos)
{
- struct inet_sock *inet = inet_sk(sp);
- __be32 dest = inet->daddr;
- __be32 src = inet->rcv_saddr;
- __u16 destp = ntohs(inet->dport);
- __u16 srcp = ntohs(inet->sport);
+ /* bpf iter does not support lseek, so it always
+ * continue from where it was stop()-ped.
+ */
+ if (*pos)
+ return bpf_iter_udp_batch(seq);
- sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
- " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p",
- bucket, src, srcp, dest, destp, sp->sk_state,
- atomic_read(&sp->sk_wmem_alloc),
- atomic_read(&sp->sk_rmem_alloc),
- 0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp),
- atomic_read(&sp->sk_refcnt), sp);
+ return SEQ_START_TOKEN;
}
-int udp4_seq_show(struct seq_file *seq, void *v)
+static int udp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
+ struct udp_sock *udp_sk, uid_t uid, int bucket)
+{
+ struct bpf_iter__udp ctx;
+
+ meta->seq_num--; /* skip SEQ_START_TOKEN */
+ ctx.meta = meta;
+ ctx.udp_sk = udp_sk;
+ ctx.uid = uid;
+ ctx.bucket = bucket;
+ return bpf_iter_run_prog(prog, &ctx);
+}
+
+static int bpf_iter_udp_seq_show(struct seq_file *seq, void *v)
{
+ struct udp_iter_state *state = seq->private;
+ struct bpf_iter_meta meta;
+ struct bpf_prog *prog;
+ struct sock *sk = v;
+ uid_t uid;
+ int ret;
+
if (v == SEQ_START_TOKEN)
- seq_printf(seq, "%-127s\n",
- " sl local_address rem_address st tx_queue "
- "rx_queue tr tm->when retrnsmt uid timeout "
- "inode");
- else {
- char tmpbuf[129];
- struct udp_iter_state *state = seq->private;
+ return 0;
- udp4_format_sock(v, tmpbuf, state->bucket);
- seq_printf(seq, "%-127s\n", tmpbuf);
+ lock_sock(sk);
+
+ if (unlikely(sk_unhashed(sk))) {
+ ret = SEQ_SKIP;
+ goto unlock;
}
- return 0;
+
+ uid = from_kuid_munged(seq_user_ns(seq), sk_uid(sk));
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, false);
+ ret = udp_prog_seq_show(prog, &meta, v, uid, state->bucket);
+
+unlock:
+ release_sock(sk);
+ return ret;
}
-/* ------------------------------------------------------------------------ */
-static struct file_operations udp4_seq_fops;
+static void bpf_iter_udp_put_batch(struct bpf_udp_iter_state *iter)
+{
+ union bpf_udp_iter_batch_item *item;
+ unsigned int cur_sk = iter->cur_sk;
+ __u64 cookie;
+
+ /* Remember the cookies of the sockets we haven't seen yet, so we can
+ * pick up where we left off next time around.
+ */
+ while (cur_sk < iter->end_sk) {
+ item = &iter->batch[cur_sk++];
+ cookie = sock_gen_cookie(item->sk);
+ sock_put(item->sk);
+ item->cookie = cookie;
+ }
+}
+
+static void bpf_iter_udp_seq_stop(struct seq_file *seq, void *v)
+{
+ struct bpf_udp_iter_state *iter = seq->private;
+ struct bpf_iter_meta meta;
+ struct bpf_prog *prog;
+
+ if (!v) {
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, true);
+ if (prog)
+ (void)udp_prog_seq_show(prog, &meta, v, 0, 0);
+ }
+
+ if (iter->cur_sk < iter->end_sk)
+ bpf_iter_udp_put_batch(iter);
+}
+
+static const struct seq_operations bpf_iter_udp_seq_ops = {
+ .start = bpf_iter_udp_seq_start,
+ .next = bpf_iter_udp_seq_next,
+ .stop = bpf_iter_udp_seq_stop,
+ .show = bpf_iter_udp_seq_show,
+};
+#endif
+
+static unsigned short seq_file_family(const struct seq_file *seq)
+{
+ const struct udp_seq_afinfo *afinfo;
+
+#ifdef CONFIG_BPF_SYSCALL
+ /* BPF iterator: bpf programs to filter sockets. */
+ if (seq->op == &bpf_iter_udp_seq_ops)
+ return AF_UNSPEC;
+#endif
+
+ /* Proc fs iterator */
+ afinfo = pde_data(file_inode(seq->file));
+ return afinfo->family;
+}
+
+const struct seq_operations udp_seq_ops = {
+ .start = udp_seq_start,
+ .next = udp_seq_next,
+ .stop = udp_seq_stop,
+ .show = udp4_seq_show,
+};
+EXPORT_IPV6_MOD(udp_seq_ops);
+
static struct udp_seq_afinfo udp4_seq_afinfo = {
- .owner = THIS_MODULE,
- .name = "udp",
.family = AF_INET,
- .hashtable = udp_hash,
- .seq_show = udp4_seq_show,
- .seq_fops = &udp4_seq_fops,
+ .udp_table = NULL,
+};
+
+static int __net_init udp4_proc_init_net(struct net *net)
+{
+ if (!proc_create_net_data("udp", 0444, net->proc_net, &udp_seq_ops,
+ sizeof(struct udp_iter_state), &udp4_seq_afinfo))
+ return -ENOMEM;
+ return 0;
+}
+
+static void __net_exit udp4_proc_exit_net(struct net *net)
+{
+ remove_proc_entry("udp", net->proc_net);
+}
+
+static struct pernet_operations udp4_net_ops = {
+ .init = udp4_proc_init_net,
+ .exit = udp4_proc_exit_net,
};
int __init udp4_proc_init(void)
{
- return udp_proc_register(&udp4_seq_afinfo);
+ return register_pernet_subsys(&udp4_net_ops);
}
void udp4_proc_exit(void)
{
- udp_proc_unregister(&udp4_seq_afinfo);
+ unregister_pernet_subsys(&udp4_net_ops);
}
#endif /* CONFIG_PROC_FS */
-EXPORT_SYMBOL(udp_disconnect);
-EXPORT_SYMBOL(udp_hash);
-EXPORT_SYMBOL(udp_hash_lock);
-EXPORT_SYMBOL(udp_ioctl);
-EXPORT_SYMBOL(udp_get_port);
-EXPORT_SYMBOL(udp_prot);
-EXPORT_SYMBOL(udp_sendmsg);
-EXPORT_SYMBOL(udp_lib_getsockopt);
-EXPORT_SYMBOL(udp_lib_setsockopt);
-EXPORT_SYMBOL(udp_poll);
+static __initdata unsigned long uhash_entries;
+static int __init set_uhash_entries(char *str)
+{
+ ssize_t ret;
-#ifdef CONFIG_PROC_FS
-EXPORT_SYMBOL(udp_proc_register);
-EXPORT_SYMBOL(udp_proc_unregister);
+ if (!str)
+ return 0;
+
+ ret = kstrtoul(str, 0, &uhash_entries);
+ if (ret)
+ return 0;
+
+ if (uhash_entries && uhash_entries < UDP_HTABLE_SIZE_MIN)
+ uhash_entries = UDP_HTABLE_SIZE_MIN;
+ return 1;
+}
+__setup("uhash_entries=", set_uhash_entries);
+
+void __init udp_table_init(struct udp_table *table, const char *name)
+{
+ unsigned int i, slot_size;
+
+ slot_size = sizeof(struct udp_hslot) + sizeof(struct udp_hslot_main) +
+ udp_hash4_slot_size();
+ table->hash = alloc_large_system_hash(name,
+ slot_size,
+ uhash_entries,
+ 21, /* one slot per 2 MB */
+ 0,
+ &table->log,
+ &table->mask,
+ UDP_HTABLE_SIZE_MIN,
+ UDP_HTABLE_SIZE_MAX);
+
+ table->hash2 = (void *)(table->hash + (table->mask + 1));
+ for (i = 0; i <= table->mask; i++) {
+ INIT_HLIST_HEAD(&table->hash[i].head);
+ table->hash[i].count = 0;
+ spin_lock_init(&table->hash[i].lock);
+ }
+ for (i = 0; i <= table->mask; i++) {
+ INIT_HLIST_HEAD(&table->hash2[i].hslot.head);
+ table->hash2[i].hslot.count = 0;
+ spin_lock_init(&table->hash2[i].hslot.lock);
+ }
+ udp_table_hash4_init(table);
+}
+
+u32 udp_flow_hashrnd(void)
+{
+ static u32 hashrnd __read_mostly;
+
+ net_get_random_once(&hashrnd, sizeof(hashrnd));
+
+ return hashrnd;
+}
+EXPORT_SYMBOL(udp_flow_hashrnd);
+
+static void __net_init udp_sysctl_init(struct net *net)
+{
+ net->ipv4.sysctl_udp_rmem_min = PAGE_SIZE;
+ net->ipv4.sysctl_udp_wmem_min = PAGE_SIZE;
+
+#ifdef CONFIG_NET_L3_MASTER_DEV
+ net->ipv4.sysctl_udp_l3mdev_accept = 0;
#endif
+}
+
+static struct udp_table __net_init *udp_pernet_table_alloc(unsigned int hash_entries)
+{
+ struct udp_table *udptable;
+ unsigned int slot_size;
+ int i;
+
+ udptable = kmalloc(sizeof(*udptable), GFP_KERNEL);
+ if (!udptable)
+ goto out;
+
+ slot_size = sizeof(struct udp_hslot) + sizeof(struct udp_hslot_main) +
+ udp_hash4_slot_size();
+ udptable->hash = vmalloc_huge(hash_entries * slot_size,
+ GFP_KERNEL_ACCOUNT);
+ if (!udptable->hash)
+ goto free_table;
+
+ udptable->hash2 = (void *)(udptable->hash + hash_entries);
+ udptable->mask = hash_entries - 1;
+ udptable->log = ilog2(hash_entries);
+
+ for (i = 0; i < hash_entries; i++) {
+ INIT_HLIST_HEAD(&udptable->hash[i].head);
+ udptable->hash[i].count = 0;
+ spin_lock_init(&udptable->hash[i].lock);
+
+ INIT_HLIST_HEAD(&udptable->hash2[i].hslot.head);
+ udptable->hash2[i].hslot.count = 0;
+ spin_lock_init(&udptable->hash2[i].hslot.lock);
+ }
+ udp_table_hash4_init(udptable);
+
+ return udptable;
+
+free_table:
+ kfree(udptable);
+out:
+ return NULL;
+}
+
+static void __net_exit udp_pernet_table_free(struct net *net)
+{
+ struct udp_table *udptable = net->ipv4.udp_table;
+
+ if (udptable == &udp_table)
+ return;
+
+ kvfree(udptable->hash);
+ kfree(udptable);
+}
+
+static void __net_init udp_set_table(struct net *net)
+{
+ struct udp_table *udptable;
+ unsigned int hash_entries;
+ struct net *old_net;
+
+ if (net_eq(net, &init_net))
+ goto fallback;
+
+ old_net = current->nsproxy->net_ns;
+ hash_entries = READ_ONCE(old_net->ipv4.sysctl_udp_child_hash_entries);
+ if (!hash_entries)
+ goto fallback;
+
+ /* Set min to keep the bitmap on stack in udp_lib_get_port() */
+ if (hash_entries < UDP_HTABLE_SIZE_MIN_PERNET)
+ hash_entries = UDP_HTABLE_SIZE_MIN_PERNET;
+ else
+ hash_entries = roundup_pow_of_two(hash_entries);
+
+ udptable = udp_pernet_table_alloc(hash_entries);
+ if (udptable) {
+ net->ipv4.udp_table = udptable;
+ } else {
+ pr_warn("Failed to allocate UDP hash table (entries: %u) "
+ "for a netns, fallback to the global one\n",
+ hash_entries);
+fallback:
+ net->ipv4.udp_table = &udp_table;
+ }
+}
+
+static int __net_init udp_pernet_init(struct net *net)
+{
+#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL)
+ int i;
+
+ /* No tunnel is configured */
+ for (i = 0; i < ARRAY_SIZE(net->ipv4.udp_tunnel_gro); ++i) {
+ INIT_HLIST_HEAD(&net->ipv4.udp_tunnel_gro[i].list);
+ RCU_INIT_POINTER(net->ipv4.udp_tunnel_gro[i].sk, NULL);
+ }
+#endif
+ udp_sysctl_init(net);
+ udp_set_table(net);
+
+ return 0;
+}
+
+static void __net_exit udp_pernet_exit(struct net *net)
+{
+ udp_pernet_table_free(net);
+}
+
+static struct pernet_operations __net_initdata udp_sysctl_ops = {
+ .init = udp_pernet_init,
+ .exit = udp_pernet_exit,
+};
+
+#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
+DEFINE_BPF_ITER_FUNC(udp, struct bpf_iter_meta *meta,
+ struct udp_sock *udp_sk, uid_t uid, int bucket)
+
+static int bpf_iter_udp_realloc_batch(struct bpf_udp_iter_state *iter,
+ unsigned int new_batch_sz, gfp_t flags)
+{
+ union bpf_udp_iter_batch_item *new_batch;
+
+ new_batch = kvmalloc_array(new_batch_sz, sizeof(*new_batch),
+ flags | __GFP_NOWARN);
+ if (!new_batch)
+ return -ENOMEM;
+
+ if (flags != GFP_NOWAIT)
+ bpf_iter_udp_put_batch(iter);
+
+ memcpy(new_batch, iter->batch, sizeof(*iter->batch) * iter->end_sk);
+ kvfree(iter->batch);
+ iter->batch = new_batch;
+ iter->max_sk = new_batch_sz;
+
+ return 0;
+}
+
+#define INIT_BATCH_SZ 16
+
+static int bpf_iter_init_udp(void *priv_data, struct bpf_iter_aux_info *aux)
+{
+ struct bpf_udp_iter_state *iter = priv_data;
+ int ret;
+
+ ret = bpf_iter_init_seq_net(priv_data, aux);
+ if (ret)
+ return ret;
+
+ ret = bpf_iter_udp_realloc_batch(iter, INIT_BATCH_SZ, GFP_USER);
+ if (ret)
+ bpf_iter_fini_seq_net(priv_data);
+
+ iter->state.bucket = -1;
+
+ return ret;
+}
+
+static void bpf_iter_fini_udp(void *priv_data)
+{
+ struct bpf_udp_iter_state *iter = priv_data;
+
+ bpf_iter_fini_seq_net(priv_data);
+ kvfree(iter->batch);
+}
+
+static const struct bpf_iter_seq_info udp_seq_info = {
+ .seq_ops = &bpf_iter_udp_seq_ops,
+ .init_seq_private = bpf_iter_init_udp,
+ .fini_seq_private = bpf_iter_fini_udp,
+ .seq_priv_size = sizeof(struct bpf_udp_iter_state),
+};
+
+static struct bpf_iter_reg udp_reg_info = {
+ .target = "udp",
+ .ctx_arg_info_size = 1,
+ .ctx_arg_info = {
+ { offsetof(struct bpf_iter__udp, udp_sk),
+ PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
+ },
+ .seq_info = &udp_seq_info,
+};
+
+static void __init bpf_iter_register(void)
+{
+ udp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UDP];
+ if (bpf_iter_reg_target(&udp_reg_info))
+ pr_warn("Warning: could not register bpf iterator udp\n");
+}
+#endif
+
+void __init udp_init(void)
+{
+ unsigned long limit;
+
+ udp_table_init(&udp_table, "UDP");
+ limit = nr_free_buffer_pages() / 8;
+ limit = max(limit, 128UL);
+ sysctl_udp_mem[0] = limit / 4 * 3;
+ sysctl_udp_mem[1] = limit;
+ sysctl_udp_mem[2] = sysctl_udp_mem[0] * 2;
+
+ if (register_pernet_subsys(&udp_sysctl_ops))
+ panic("UDP: failed to init sysctl parameters.\n");
+
+#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
+ bpf_iter_register();
+#endif
+}
diff --git a/net/ipv4/udp_bpf.c b/net/ipv4/udp_bpf.c
new file mode 100644
index 000000000000..0735d820e413
--- /dev/null
+++ b/net/ipv4/udp_bpf.c
@@ -0,0 +1,157 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Cloudflare Ltd https://cloudflare.com */
+
+#include <linux/skmsg.h>
+#include <net/sock.h>
+#include <net/udp.h>
+#include <net/inet_common.h>
+
+#include "udp_impl.h"
+
+static struct proto *udpv6_prot_saved __read_mostly;
+
+static int sk_udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+ int flags, int *addr_len)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ if (sk->sk_family == AF_INET6)
+ return udpv6_prot_saved->recvmsg(sk, msg, len, flags, addr_len);
+#endif
+ return udp_prot.recvmsg(sk, msg, len, flags, addr_len);
+}
+
+static bool udp_sk_has_data(struct sock *sk)
+{
+ return !skb_queue_empty(&udp_sk(sk)->reader_queue) ||
+ !skb_queue_empty(&sk->sk_receive_queue);
+}
+
+static bool psock_has_data(struct sk_psock *psock)
+{
+ return !skb_queue_empty(&psock->ingress_skb) ||
+ !sk_psock_queue_empty(psock);
+}
+
+#define udp_msg_has_data(__sk, __psock) \
+ ({ udp_sk_has_data(__sk) || psock_has_data(__psock); })
+
+static int udp_msg_wait_data(struct sock *sk, struct sk_psock *psock,
+ long timeo)
+{
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+ int ret = 0;
+
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ return 1;
+
+ if (!timeo)
+ return ret;
+
+ add_wait_queue(sk_sleep(sk), &wait);
+ sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+ ret = udp_msg_has_data(sk, psock);
+ if (!ret) {
+ wait_woken(&wait, TASK_INTERRUPTIBLE, timeo);
+ ret = udp_msg_has_data(sk, psock);
+ }
+ sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+ remove_wait_queue(sk_sleep(sk), &wait);
+ return ret;
+}
+
+static int udp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+ int flags, int *addr_len)
+{
+ struct sk_psock *psock;
+ int copied, ret;
+
+ if (unlikely(flags & MSG_ERRQUEUE))
+ return inet_recv_error(sk, msg, len, addr_len);
+
+ if (!len)
+ return 0;
+
+ psock = sk_psock_get(sk);
+ if (unlikely(!psock))
+ return sk_udp_recvmsg(sk, msg, len, flags, addr_len);
+
+ if (!psock_has_data(psock)) {
+ ret = sk_udp_recvmsg(sk, msg, len, flags, addr_len);
+ goto out;
+ }
+
+msg_bytes_ready:
+ copied = sk_msg_recvmsg(sk, psock, msg, len, flags);
+ if (!copied) {
+ long timeo;
+ int data;
+
+ timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+ data = udp_msg_wait_data(sk, psock, timeo);
+ if (data) {
+ if (psock_has_data(psock))
+ goto msg_bytes_ready;
+ ret = sk_udp_recvmsg(sk, msg, len, flags, addr_len);
+ goto out;
+ }
+ copied = -EAGAIN;
+ }
+ ret = copied;
+out:
+ sk_psock_put(sk, psock);
+ return ret;
+}
+
+enum {
+ UDP_BPF_IPV4,
+ UDP_BPF_IPV6,
+ UDP_BPF_NUM_PROTS,
+};
+
+static DEFINE_SPINLOCK(udpv6_prot_lock);
+static struct proto udp_bpf_prots[UDP_BPF_NUM_PROTS];
+
+static void udp_bpf_rebuild_protos(struct proto *prot, const struct proto *base)
+{
+ *prot = *base;
+ prot->close = sock_map_close;
+ prot->recvmsg = udp_bpf_recvmsg;
+ prot->sock_is_readable = sk_msg_is_readable;
+}
+
+static void udp_bpf_check_v6_needs_rebuild(struct proto *ops)
+{
+ if (unlikely(ops != smp_load_acquire(&udpv6_prot_saved))) {
+ spin_lock_bh(&udpv6_prot_lock);
+ if (likely(ops != udpv6_prot_saved)) {
+ udp_bpf_rebuild_protos(&udp_bpf_prots[UDP_BPF_IPV6], ops);
+ smp_store_release(&udpv6_prot_saved, ops);
+ }
+ spin_unlock_bh(&udpv6_prot_lock);
+ }
+}
+
+static int __init udp_bpf_v4_build_proto(void)
+{
+ udp_bpf_rebuild_protos(&udp_bpf_prots[UDP_BPF_IPV4], &udp_prot);
+ return 0;
+}
+late_initcall(udp_bpf_v4_build_proto);
+
+int udp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore)
+{
+ int family = sk->sk_family == AF_INET ? UDP_BPF_IPV4 : UDP_BPF_IPV6;
+
+ if (restore) {
+ sk->sk_write_space = psock->saved_write_space;
+ sock_replace_proto(sk, psock->sk_proto);
+ return 0;
+ }
+
+ if (sk->sk_family == AF_INET6)
+ udp_bpf_check_v6_needs_rebuild(psock->sk_proto);
+
+ sock_replace_proto(sk, &udp_bpf_prots[family]);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(udp_bpf_update_proto);
diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c
new file mode 100644
index 000000000000..6e491c720c90
--- /dev/null
+++ b/net/ipv4/udp_diag.c
@@ -0,0 +1,299 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * udp_diag.c Module for monitoring UDP transport protocols sockets.
+ *
+ * Authors: Pavel Emelyanov, <xemul@parallels.com>
+ */
+
+
+#include <linux/module.h>
+#include <linux/inet_diag.h>
+#include <linux/udp.h>
+#include <net/udp.h>
+#include <net/udplite.h>
+#include <linux/sock_diag.h>
+
+static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
+ struct netlink_callback *cb,
+ const struct inet_diag_req_v2 *req,
+ bool net_admin)
+{
+ if (!inet_diag_bc_sk(cb->data, sk))
+ return 0;
+
+ return inet_sk_diag_fill(sk, NULL, skb, cb, req, NLM_F_MULTI,
+ net_admin);
+}
+
+static int udp_dump_one(struct udp_table *tbl,
+ struct netlink_callback *cb,
+ const struct inet_diag_req_v2 *req)
+{
+ struct sk_buff *in_skb = cb->skb;
+ int err;
+ struct sock *sk = NULL;
+ struct sk_buff *rep;
+ struct net *net = sock_net(in_skb->sk);
+
+ rcu_read_lock();
+ if (req->sdiag_family == AF_INET)
+ /* src and dst are swapped for historical reasons */
+ sk = __udp4_lib_lookup(net,
+ req->id.idiag_src[0], req->id.idiag_sport,
+ req->id.idiag_dst[0], req->id.idiag_dport,
+ req->id.idiag_if, 0, tbl, NULL);
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (req->sdiag_family == AF_INET6)
+ sk = __udp6_lib_lookup(net,
+ (struct in6_addr *)req->id.idiag_src,
+ req->id.idiag_sport,
+ (struct in6_addr *)req->id.idiag_dst,
+ req->id.idiag_dport,
+ req->id.idiag_if, 0, tbl, NULL);
+#endif
+ if (sk && !refcount_inc_not_zero(&sk->sk_refcnt))
+ sk = NULL;
+ rcu_read_unlock();
+ err = -ENOENT;
+ if (!sk)
+ goto out_nosk;
+
+ err = sock_diag_check_cookie(sk, req->id.idiag_cookie);
+ if (err)
+ goto out;
+
+ err = -ENOMEM;
+ rep = nlmsg_new(nla_total_size(sizeof(struct inet_diag_msg)) +
+ inet_diag_msg_attrs_size() +
+ nla_total_size(sizeof(struct inet_diag_meminfo)) + 64,
+ GFP_KERNEL);
+ if (!rep)
+ goto out;
+
+ err = inet_sk_diag_fill(sk, NULL, rep, cb, req, 0,
+ netlink_net_capable(in_skb, CAP_NET_ADMIN));
+ if (err < 0) {
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(rep);
+ goto out;
+ }
+ err = nlmsg_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid);
+
+out:
+ if (sk)
+ sock_put(sk);
+out_nosk:
+ return err;
+}
+
+static void udp_dump(struct udp_table *table, struct sk_buff *skb,
+ struct netlink_callback *cb,
+ const struct inet_diag_req_v2 *r)
+{
+ bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
+ struct net *net = sock_net(skb->sk);
+ int num, s_num, slot, s_slot;
+
+ s_slot = cb->args[0];
+ num = s_num = cb->args[1];
+
+ for (slot = s_slot; slot <= table->mask; s_num = 0, slot++) {
+ struct udp_hslot *hslot = &table->hash[slot];
+ struct sock *sk;
+
+ num = 0;
+
+ if (hlist_empty(&hslot->head))
+ continue;
+
+ spin_lock_bh(&hslot->lock);
+ sk_for_each(sk, &hslot->head) {
+ struct inet_sock *inet = inet_sk(sk);
+
+ if (!net_eq(sock_net(sk), net))
+ continue;
+ if (num < s_num)
+ goto next;
+ if (!(r->idiag_states & (1 << sk->sk_state)))
+ goto next;
+ if (r->sdiag_family != AF_UNSPEC &&
+ sk->sk_family != r->sdiag_family)
+ goto next;
+ if (r->id.idiag_sport != inet->inet_sport &&
+ r->id.idiag_sport)
+ goto next;
+ if (r->id.idiag_dport != inet->inet_dport &&
+ r->id.idiag_dport)
+ goto next;
+
+ if (sk_diag_dump(sk, skb, cb, r, net_admin) < 0) {
+ spin_unlock_bh(&hslot->lock);
+ goto done;
+ }
+next:
+ num++;
+ }
+ spin_unlock_bh(&hslot->lock);
+ }
+done:
+ cb->args[0] = slot;
+ cb->args[1] = num;
+}
+
+static void udp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
+ const struct inet_diag_req_v2 *r)
+{
+ udp_dump(sock_net(cb->skb->sk)->ipv4.udp_table, skb, cb, r);
+}
+
+static int udp_diag_dump_one(struct netlink_callback *cb,
+ const struct inet_diag_req_v2 *req)
+{
+ return udp_dump_one(sock_net(cb->skb->sk)->ipv4.udp_table, cb, req);
+}
+
+static void udp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
+ void *info)
+{
+ r->idiag_rqueue = udp_rqueue_get(sk);
+ r->idiag_wqueue = sk_wmem_alloc_get(sk);
+}
+
+#ifdef CONFIG_INET_DIAG_DESTROY
+static int __udp_diag_destroy(struct sk_buff *in_skb,
+ const struct inet_diag_req_v2 *req,
+ struct udp_table *tbl)
+{
+ struct net *net = sock_net(in_skb->sk);
+ struct sock *sk;
+ int err;
+
+ rcu_read_lock();
+
+ if (req->sdiag_family == AF_INET)
+ sk = __udp4_lib_lookup(net,
+ req->id.idiag_dst[0], req->id.idiag_dport,
+ req->id.idiag_src[0], req->id.idiag_sport,
+ req->id.idiag_if, 0, tbl, NULL);
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (req->sdiag_family == AF_INET6) {
+ if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) &&
+ ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_src))
+ sk = __udp4_lib_lookup(net,
+ req->id.idiag_dst[3], req->id.idiag_dport,
+ req->id.idiag_src[3], req->id.idiag_sport,
+ req->id.idiag_if, 0, tbl, NULL);
+
+ else
+ sk = __udp6_lib_lookup(net,
+ (struct in6_addr *)req->id.idiag_dst,
+ req->id.idiag_dport,
+ (struct in6_addr *)req->id.idiag_src,
+ req->id.idiag_sport,
+ req->id.idiag_if, 0, tbl, NULL);
+ }
+#endif
+ else {
+ rcu_read_unlock();
+ return -EINVAL;
+ }
+
+ if (sk && !refcount_inc_not_zero(&sk->sk_refcnt))
+ sk = NULL;
+
+ rcu_read_unlock();
+
+ if (!sk)
+ return -ENOENT;
+
+ if (sock_diag_check_cookie(sk, req->id.idiag_cookie)) {
+ sock_put(sk);
+ return -ENOENT;
+ }
+
+ err = sock_diag_destroy(sk, ECONNABORTED);
+
+ sock_put(sk);
+
+ return err;
+}
+
+static int udp_diag_destroy(struct sk_buff *in_skb,
+ const struct inet_diag_req_v2 *req)
+{
+ return __udp_diag_destroy(in_skb, req, sock_net(in_skb->sk)->ipv4.udp_table);
+}
+
+static int udplite_diag_destroy(struct sk_buff *in_skb,
+ const struct inet_diag_req_v2 *req)
+{
+ return __udp_diag_destroy(in_skb, req, &udplite_table);
+}
+
+#endif
+
+static const struct inet_diag_handler udp_diag_handler = {
+ .owner = THIS_MODULE,
+ .dump = udp_diag_dump,
+ .dump_one = udp_diag_dump_one,
+ .idiag_get_info = udp_diag_get_info,
+ .idiag_type = IPPROTO_UDP,
+ .idiag_info_size = 0,
+#ifdef CONFIG_INET_DIAG_DESTROY
+ .destroy = udp_diag_destroy,
+#endif
+};
+
+static void udplite_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
+ const struct inet_diag_req_v2 *r)
+{
+ udp_dump(&udplite_table, skb, cb, r);
+}
+
+static int udplite_diag_dump_one(struct netlink_callback *cb,
+ const struct inet_diag_req_v2 *req)
+{
+ return udp_dump_one(&udplite_table, cb, req);
+}
+
+static const struct inet_diag_handler udplite_diag_handler = {
+ .owner = THIS_MODULE,
+ .dump = udplite_diag_dump,
+ .dump_one = udplite_diag_dump_one,
+ .idiag_get_info = udp_diag_get_info,
+ .idiag_type = IPPROTO_UDPLITE,
+ .idiag_info_size = 0,
+#ifdef CONFIG_INET_DIAG_DESTROY
+ .destroy = udplite_diag_destroy,
+#endif
+};
+
+static int __init udp_diag_init(void)
+{
+ int err;
+
+ err = inet_diag_register(&udp_diag_handler);
+ if (err)
+ goto out;
+ err = inet_diag_register(&udplite_diag_handler);
+ if (err)
+ goto out_lite;
+out:
+ return err;
+out_lite:
+ inet_diag_unregister(&udp_diag_handler);
+ goto out;
+}
+
+static void __exit udp_diag_exit(void)
+{
+ inet_diag_unregister(&udplite_diag_handler);
+ inet_diag_unregister(&udp_diag_handler);
+}
+
+module_init(udp_diag_init);
+module_exit(udp_diag_exit);
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("UDP socket monitoring via SOCK_DIAG");
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-17 /* AF_INET - IPPROTO_UDP */);
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-136 /* AF_INET - IPPROTO_UDPLITE */);
diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h
index f6f4277ba6dc..c7142213fc21 100644
--- a/net/ipv4/udp_impl.h
+++ b/net/ipv4/udp_impl.h
@@ -1,38 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _UDP4_IMPL_H
#define _UDP4_IMPL_H
+#include <net/aligned_data.h>
#include <net/udp.h>
#include <net/udplite.h>
#include <net/protocol.h>
#include <net/inet_common.h>
-extern int __udp4_lib_rcv(struct sk_buff *, struct hlist_head [], int );
-extern void __udp4_lib_err(struct sk_buff *, u32, struct hlist_head []);
+int __udp4_lib_rcv(struct sk_buff *, struct udp_table *, int);
+int __udp4_lib_err(struct sk_buff *, u32, struct udp_table *);
-extern int __udp_lib_get_port(struct sock *sk, unsigned short snum,
- struct hlist_head udptable[], int *port_rover,
- int (*)(const struct sock*,const struct sock*));
-extern int ipv4_rcv_saddr_equal(const struct sock *, const struct sock *);
+int udp_v4_get_port(struct sock *sk, unsigned short snum);
+void udp_v4_rehash(struct sock *sk);
+int udp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
+ unsigned int optlen);
+int udp_getsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *optlen);
-extern int udp_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int optlen);
-extern int udp_getsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int __user *optlen);
-
-#ifdef CONFIG_COMPAT
-extern int compat_udp_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int optlen);
-extern int compat_udp_getsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int __user *optlen);
-#endif
-extern int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
- size_t len, int noblock, int flags, int *addr_len);
-extern int udp_sendpage(struct sock *sk, struct page *page, int offset,
- size_t size, int flags);
-extern int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb);
-extern int udp_destroy_sock(struct sock *sk);
+int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags,
+ int *addr_len);
+void udp_destroy_sock(struct sock *sk);
#ifdef CONFIG_PROC_FS
-extern int udp4_seq_show(struct seq_file *seq, void *v);
+int udp4_seq_show(struct seq_file *seq, void *v);
#endif
#endif /* _UDP4_IMPL_H */
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
new file mode 100644
index 000000000000..19d0b5b09ffa
--- /dev/null
+++ b/net/ipv4/udp_offload.c
@@ -0,0 +1,996 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * IPV4 GSO/GRO offload support
+ * Linux INET implementation
+ *
+ * UDPv4 GSO support
+ */
+
+#include <linux/skbuff.h>
+#include <net/gro.h>
+#include <net/gso.h>
+#include <net/udp.h>
+#include <net/protocol.h>
+#include <net/inet_common.h>
+#include <net/udp_tunnel.h>
+
+#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL)
+
+/*
+ * Dummy GRO tunnel callback, exists mainly to avoid dangling/NULL
+ * values for the udp tunnel static call.
+ */
+static struct sk_buff *dummy_gro_rcv(struct sock *sk,
+ struct list_head *head,
+ struct sk_buff *skb)
+{
+ NAPI_GRO_CB(skb)->flush = 1;
+ return NULL;
+}
+
+typedef struct sk_buff *(*udp_tunnel_gro_rcv_t)(struct sock *sk,
+ struct list_head *head,
+ struct sk_buff *skb);
+
+struct udp_tunnel_type_entry {
+ udp_tunnel_gro_rcv_t gro_receive;
+ refcount_t count;
+};
+
+#define UDP_MAX_TUNNEL_TYPES (IS_ENABLED(CONFIG_GENEVE) + \
+ IS_ENABLED(CONFIG_VXLAN) * 2 + \
+ IS_ENABLED(CONFIG_NET_FOU) * 2 + \
+ IS_ENABLED(CONFIG_XFRM) * 2)
+
+DEFINE_STATIC_CALL(udp_tunnel_gro_rcv, dummy_gro_rcv);
+static DEFINE_STATIC_KEY_FALSE(udp_tunnel_static_call);
+static DEFINE_MUTEX(udp_tunnel_gro_type_lock);
+static struct udp_tunnel_type_entry udp_tunnel_gro_types[UDP_MAX_TUNNEL_TYPES];
+static unsigned int udp_tunnel_gro_type_nr;
+static DEFINE_SPINLOCK(udp_tunnel_gro_lock);
+
+void udp_tunnel_update_gro_lookup(struct net *net, struct sock *sk, bool add)
+{
+ bool is_ipv6 = sk->sk_family == AF_INET6;
+ struct udp_sock *tup, *up = udp_sk(sk);
+ struct udp_tunnel_gro *udp_tunnel_gro;
+
+ spin_lock(&udp_tunnel_gro_lock);
+ udp_tunnel_gro = &net->ipv4.udp_tunnel_gro[is_ipv6];
+ if (add)
+ hlist_add_head(&up->tunnel_list, &udp_tunnel_gro->list);
+ else if (up->tunnel_list.pprev)
+ hlist_del_init(&up->tunnel_list);
+
+ if (udp_tunnel_gro->list.first &&
+ !udp_tunnel_gro->list.first->next) {
+ tup = hlist_entry(udp_tunnel_gro->list.first, struct udp_sock,
+ tunnel_list);
+
+ rcu_assign_pointer(udp_tunnel_gro->sk, (struct sock *)tup);
+ } else {
+ RCU_INIT_POINTER(udp_tunnel_gro->sk, NULL);
+ }
+
+ spin_unlock(&udp_tunnel_gro_lock);
+}
+EXPORT_SYMBOL_GPL(udp_tunnel_update_gro_lookup);
+
+void udp_tunnel_update_gro_rcv(struct sock *sk, bool add)
+{
+ struct udp_tunnel_type_entry *cur = NULL;
+ struct udp_sock *up = udp_sk(sk);
+ int i, old_gro_type_nr;
+
+ if (!UDP_MAX_TUNNEL_TYPES || !up->gro_receive)
+ return;
+
+ mutex_lock(&udp_tunnel_gro_type_lock);
+
+ /* Check if the static call is permanently disabled. */
+ if (udp_tunnel_gro_type_nr > UDP_MAX_TUNNEL_TYPES)
+ goto out;
+
+ for (i = 0; i < udp_tunnel_gro_type_nr; i++)
+ if (udp_tunnel_gro_types[i].gro_receive == up->gro_receive)
+ cur = &udp_tunnel_gro_types[i];
+
+ old_gro_type_nr = udp_tunnel_gro_type_nr;
+ if (add) {
+ /*
+ * Update the matching entry, if found, or add a new one
+ * if needed
+ */
+ if (cur) {
+ refcount_inc(&cur->count);
+ goto out;
+ }
+
+ if (unlikely(udp_tunnel_gro_type_nr == UDP_MAX_TUNNEL_TYPES)) {
+ pr_err_once("Too many UDP tunnel types, please increase UDP_MAX_TUNNEL_TYPES\n");
+ /* Ensure static call will never be enabled */
+ udp_tunnel_gro_type_nr = UDP_MAX_TUNNEL_TYPES + 1;
+ } else {
+ cur = &udp_tunnel_gro_types[udp_tunnel_gro_type_nr++];
+ refcount_set(&cur->count, 1);
+ cur->gro_receive = up->gro_receive;
+ }
+ } else {
+ /*
+ * The stack cleanups only successfully added tunnel, the
+ * lookup on removal should never fail.
+ */
+ if (WARN_ON_ONCE(!cur))
+ goto out;
+
+ if (!refcount_dec_and_test(&cur->count))
+ goto out;
+
+ /* Avoid gaps, so that the enable tunnel has always id 0 */
+ *cur = udp_tunnel_gro_types[--udp_tunnel_gro_type_nr];
+ }
+
+ if (udp_tunnel_gro_type_nr == 1) {
+ static_call_update(udp_tunnel_gro_rcv,
+ udp_tunnel_gro_types[0].gro_receive);
+ static_branch_enable(&udp_tunnel_static_call);
+ } else if (old_gro_type_nr == 1) {
+ static_branch_disable(&udp_tunnel_static_call);
+ static_call_update(udp_tunnel_gro_rcv, dummy_gro_rcv);
+ }
+
+out:
+ mutex_unlock(&udp_tunnel_gro_type_lock);
+}
+EXPORT_SYMBOL_GPL(udp_tunnel_update_gro_rcv);
+
+static struct sk_buff *udp_tunnel_gro_rcv(struct sock *sk,
+ struct list_head *head,
+ struct sk_buff *skb)
+{
+ if (static_branch_likely(&udp_tunnel_static_call)) {
+ if (unlikely(gro_recursion_inc_test(skb))) {
+ NAPI_GRO_CB(skb)->flush |= 1;
+ return NULL;
+ }
+ return static_call(udp_tunnel_gro_rcv)(sk, head, skb);
+ }
+ return call_gro_receive_sk(udp_sk(sk)->gro_receive, sk, head, skb);
+}
+
+#else
+
+static struct sk_buff *udp_tunnel_gro_rcv(struct sock *sk,
+ struct list_head *head,
+ struct sk_buff *skb)
+{
+ return call_gro_receive_sk(udp_sk(sk)->gro_receive, sk, head, skb);
+}
+
+#endif
+
+static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
+ netdev_features_t features,
+ struct sk_buff *(*gso_inner_segment)(struct sk_buff *skb,
+ netdev_features_t features),
+ __be16 new_protocol, bool is_ipv6)
+{
+ int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb);
+ bool remcsum, need_csum, offload_csum, gso_partial;
+ struct sk_buff *segs = ERR_PTR(-EINVAL);
+ struct udphdr *uh = udp_hdr(skb);
+ u16 mac_offset = skb->mac_header;
+ __be16 protocol = skb->protocol;
+ u16 mac_len = skb->mac_len;
+ int udp_offset, outer_hlen;
+ __wsum partial;
+ bool need_ipsec;
+
+ if (unlikely(!pskb_may_pull(skb, tnl_hlen)))
+ goto out;
+
+ /* Adjust partial header checksum to negate old length.
+ * We cannot rely on the value contained in uh->len as it is
+ * possible that the actual value exceeds the boundaries of the
+ * 16 bit length field due to the header being added outside of an
+ * IP or IPv6 frame that was already limited to 64K - 1.
+ */
+ if (skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL)
+ partial = (__force __wsum)uh->len;
+ else
+ partial = (__force __wsum)htonl(skb->len);
+ partial = csum_sub(csum_unfold(uh->check), partial);
+
+ /* setup inner skb. */
+ skb->encapsulation = 0;
+ SKB_GSO_CB(skb)->encap_level = 0;
+ __skb_pull(skb, tnl_hlen);
+ skb_reset_mac_header(skb);
+ skb_set_network_header(skb, skb_inner_network_offset(skb));
+ skb_set_transport_header(skb, skb_inner_transport_offset(skb));
+ skb->mac_len = skb_inner_network_offset(skb);
+ skb->protocol = new_protocol;
+
+ need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM);
+ skb->encap_hdr_csum = need_csum;
+
+ remcsum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TUNNEL_REMCSUM);
+ skb->remcsum_offload = remcsum;
+
+ need_ipsec = (skb_dst(skb) && dst_xfrm(skb_dst(skb))) || skb_sec_path(skb);
+ /* Try to offload checksum if possible */
+ offload_csum = !!(need_csum &&
+ !need_ipsec &&
+ (skb->dev->features &
+ (is_ipv6 ? (NETIF_F_HW_CSUM | NETIF_F_IPV6_CSUM) :
+ (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM))));
+
+ features &= skb->dev->hw_enc_features;
+ if (need_csum)
+ features &= ~NETIF_F_SCTP_CRC;
+
+ /* The only checksum offload we care about from here on out is the
+ * outer one so strip the existing checksum feature flags and
+ * instead set the flag based on our outer checksum offload value.
+ */
+ if (remcsum) {
+ features &= ~NETIF_F_CSUM_MASK;
+ if (!need_csum || offload_csum)
+ features |= NETIF_F_HW_CSUM;
+ }
+
+ /* segment inner packet. */
+ segs = gso_inner_segment(skb, features);
+ if (IS_ERR_OR_NULL(segs)) {
+ skb_gso_error_unwind(skb, protocol, tnl_hlen, mac_offset,
+ mac_len);
+ goto out;
+ }
+
+ gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL);
+
+ outer_hlen = skb_tnl_header_len(skb);
+ udp_offset = outer_hlen - tnl_hlen;
+ skb = segs;
+ do {
+ unsigned int len;
+
+ if (remcsum)
+ skb->ip_summed = CHECKSUM_NONE;
+
+ /* Set up inner headers if we are offloading inner checksum */
+ if (skb->ip_summed == CHECKSUM_PARTIAL) {
+ skb_reset_inner_headers(skb);
+ skb->encapsulation = 1;
+ }
+
+ skb->mac_len = mac_len;
+ skb->protocol = protocol;
+
+ __skb_push(skb, outer_hlen);
+ skb_reset_mac_header(skb);
+ skb_set_network_header(skb, mac_len);
+ skb_set_transport_header(skb, udp_offset);
+ len = skb->len - udp_offset;
+ uh = udp_hdr(skb);
+
+ /* If we are only performing partial GSO the inner header
+ * will be using a length value equal to only one MSS sized
+ * segment instead of the entire frame.
+ */
+ if (gso_partial && skb_is_gso(skb)) {
+ uh->len = htons(skb_shinfo(skb)->gso_size +
+ SKB_GSO_CB(skb)->data_offset +
+ skb->head - (unsigned char *)uh);
+ } else {
+ uh->len = htons(len);
+ }
+
+ if (!need_csum)
+ continue;
+
+ uh->check = ~csum_fold(csum_add(partial,
+ (__force __wsum)htonl(len)));
+
+ if (skb->encapsulation || !offload_csum) {
+ uh->check = gso_make_checksum(skb, ~uh->check);
+ if (uh->check == 0)
+ uh->check = CSUM_MANGLED_0;
+ } else {
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ skb->csum_start = skb_transport_header(skb) - skb->head;
+ skb->csum_offset = offsetof(struct udphdr, check);
+ }
+ } while ((skb = skb->next));
+out:
+ return segs;
+}
+
+struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
+ netdev_features_t features,
+ bool is_ipv6)
+{
+ const struct net_offload __rcu **offloads;
+ __be16 protocol = skb->protocol;
+ const struct net_offload *ops;
+ struct sk_buff *segs = ERR_PTR(-EINVAL);
+ struct sk_buff *(*gso_inner_segment)(struct sk_buff *skb,
+ netdev_features_t features);
+
+ rcu_read_lock();
+
+ switch (skb->inner_protocol_type) {
+ case ENCAP_TYPE_ETHER:
+ protocol = skb->inner_protocol;
+ gso_inner_segment = skb_mac_gso_segment;
+ break;
+ case ENCAP_TYPE_IPPROTO:
+ offloads = is_ipv6 ? inet6_offloads : inet_offloads;
+ ops = rcu_dereference(offloads[skb->inner_ipproto]);
+ if (!ops || !ops->callbacks.gso_segment)
+ goto out_unlock;
+ gso_inner_segment = ops->callbacks.gso_segment;
+ break;
+ default:
+ goto out_unlock;
+ }
+
+ segs = __skb_udp_tunnel_segment(skb, features, gso_inner_segment,
+ protocol, is_ipv6);
+
+out_unlock:
+ rcu_read_unlock();
+
+ return segs;
+}
+EXPORT_SYMBOL(skb_udp_tunnel_segment);
+
+static void __udpv4_gso_segment_csum(struct sk_buff *seg,
+ __be32 *oldip, __be32 *newip,
+ __be16 *oldport, __be16 *newport)
+{
+ struct udphdr *uh;
+ struct iphdr *iph;
+
+ if (*oldip == *newip && *oldport == *newport)
+ return;
+
+ uh = udp_hdr(seg);
+ iph = ip_hdr(seg);
+
+ if (uh->check) {
+ inet_proto_csum_replace4(&uh->check, seg, *oldip, *newip,
+ true);
+ inet_proto_csum_replace2(&uh->check, seg, *oldport, *newport,
+ false);
+ if (!uh->check)
+ uh->check = CSUM_MANGLED_0;
+ }
+ *oldport = *newport;
+
+ csum_replace4(&iph->check, *oldip, *newip);
+ *oldip = *newip;
+}
+
+static struct sk_buff *__udpv4_gso_segment_list_csum(struct sk_buff *segs)
+{
+ struct sk_buff *seg;
+ struct udphdr *uh, *uh2;
+ struct iphdr *iph, *iph2;
+
+ seg = segs;
+ uh = udp_hdr(seg);
+ iph = ip_hdr(seg);
+
+ if ((udp_hdr(seg)->dest == udp_hdr(seg->next)->dest) &&
+ (udp_hdr(seg)->source == udp_hdr(seg->next)->source) &&
+ (ip_hdr(seg)->daddr == ip_hdr(seg->next)->daddr) &&
+ (ip_hdr(seg)->saddr == ip_hdr(seg->next)->saddr))
+ return segs;
+
+ while ((seg = seg->next)) {
+ uh2 = udp_hdr(seg);
+ iph2 = ip_hdr(seg);
+
+ __udpv4_gso_segment_csum(seg,
+ &iph2->saddr, &iph->saddr,
+ &uh2->source, &uh->source);
+ __udpv4_gso_segment_csum(seg,
+ &iph2->daddr, &iph->daddr,
+ &uh2->dest, &uh->dest);
+ }
+
+ return segs;
+}
+
+static void __udpv6_gso_segment_csum(struct sk_buff *seg,
+ struct in6_addr *oldip,
+ const struct in6_addr *newip,
+ __be16 *oldport, __be16 newport)
+{
+ struct udphdr *uh = udp_hdr(seg);
+
+ if (ipv6_addr_equal(oldip, newip) && *oldport == newport)
+ return;
+
+ if (uh->check) {
+ inet_proto_csum_replace16(&uh->check, seg, oldip->s6_addr32,
+ newip->s6_addr32, true);
+
+ inet_proto_csum_replace2(&uh->check, seg, *oldport, newport,
+ false);
+ if (!uh->check)
+ uh->check = CSUM_MANGLED_0;
+ }
+
+ *oldip = *newip;
+ *oldport = newport;
+}
+
+static struct sk_buff *__udpv6_gso_segment_list_csum(struct sk_buff *segs)
+{
+ const struct ipv6hdr *iph;
+ const struct udphdr *uh;
+ struct ipv6hdr *iph2;
+ struct sk_buff *seg;
+ struct udphdr *uh2;
+
+ seg = segs;
+ uh = udp_hdr(seg);
+ iph = ipv6_hdr(seg);
+ uh2 = udp_hdr(seg->next);
+ iph2 = ipv6_hdr(seg->next);
+
+ if (!(*(const u32 *)&uh->source ^ *(const u32 *)&uh2->source) &&
+ ipv6_addr_equal(&iph->saddr, &iph2->saddr) &&
+ ipv6_addr_equal(&iph->daddr, &iph2->daddr))
+ return segs;
+
+ while ((seg = seg->next)) {
+ uh2 = udp_hdr(seg);
+ iph2 = ipv6_hdr(seg);
+
+ __udpv6_gso_segment_csum(seg, &iph2->saddr, &iph->saddr,
+ &uh2->source, uh->source);
+ __udpv6_gso_segment_csum(seg, &iph2->daddr, &iph->daddr,
+ &uh2->dest, uh->dest);
+ }
+
+ return segs;
+}
+
+static struct sk_buff *__udp_gso_segment_list(struct sk_buff *skb,
+ netdev_features_t features,
+ bool is_ipv6)
+{
+ unsigned int mss = skb_shinfo(skb)->gso_size;
+
+ skb = skb_segment_list(skb, features, skb_mac_header_len(skb));
+ if (IS_ERR(skb))
+ return skb;
+
+ udp_hdr(skb)->len = htons(sizeof(struct udphdr) + mss);
+
+ if (is_ipv6)
+ return __udpv6_gso_segment_list_csum(skb);
+ else
+ return __udpv4_gso_segment_list_csum(skb);
+}
+
+struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
+ netdev_features_t features, bool is_ipv6)
+{
+ struct sock *sk = gso_skb->sk;
+ unsigned int sum_truesize = 0;
+ struct sk_buff *segs, *seg;
+ struct udphdr *uh;
+ unsigned int mss;
+ bool copy_dtor;
+ __sum16 check;
+ __be16 newlen;
+ int ret = 0;
+
+ mss = skb_shinfo(gso_skb)->gso_size;
+ if (gso_skb->len <= sizeof(*uh) + mss)
+ return ERR_PTR(-EINVAL);
+
+ if (unlikely(skb_checksum_start(gso_skb) !=
+ skb_transport_header(gso_skb) &&
+ !(skb_shinfo(gso_skb)->gso_type & SKB_GSO_FRAGLIST)))
+ return ERR_PTR(-EINVAL);
+
+ /* We don't know if egress device can segment and checksum the packet
+ * when IPv6 extension headers are present. Fall back to software GSO.
+ */
+ if (gso_skb->ip_summed != CHECKSUM_PARTIAL)
+ features &= ~(NETIF_F_GSO_UDP_L4 | NETIF_F_CSUM_MASK);
+
+ if (skb_gso_ok(gso_skb, features | NETIF_F_GSO_ROBUST)) {
+ /* Packet is from an untrusted source, reset gso_segs. */
+ skb_shinfo(gso_skb)->gso_segs = DIV_ROUND_UP(gso_skb->len - sizeof(*uh),
+ mss);
+ return NULL;
+ }
+
+ if (skb_shinfo(gso_skb)->gso_type & SKB_GSO_FRAGLIST) {
+ /* Detect modified geometry and pass those to skb_segment. */
+ if (skb_pagelen(gso_skb) - sizeof(*uh) == skb_shinfo(gso_skb)->gso_size)
+ return __udp_gso_segment_list(gso_skb, features, is_ipv6);
+
+ ret = __skb_linearize(gso_skb);
+ if (ret)
+ return ERR_PTR(ret);
+
+ /* Setup csum, as fraglist skips this in udp4_gro_receive. */
+ gso_skb->csum_start = skb_transport_header(gso_skb) - gso_skb->head;
+ gso_skb->csum_offset = offsetof(struct udphdr, check);
+ gso_skb->ip_summed = CHECKSUM_PARTIAL;
+
+ uh = udp_hdr(gso_skb);
+ if (is_ipv6)
+ uh->check = ~udp_v6_check(gso_skb->len,
+ &ipv6_hdr(gso_skb)->saddr,
+ &ipv6_hdr(gso_skb)->daddr, 0);
+ else
+ uh->check = ~udp_v4_check(gso_skb->len,
+ ip_hdr(gso_skb)->saddr,
+ ip_hdr(gso_skb)->daddr, 0);
+ }
+
+ skb_pull(gso_skb, sizeof(*uh));
+
+ /* clear destructor to avoid skb_segment assigning it to tail */
+ copy_dtor = gso_skb->destructor == sock_wfree;
+ if (copy_dtor) {
+ gso_skb->destructor = NULL;
+ gso_skb->sk = NULL;
+ }
+
+ segs = skb_segment(gso_skb, features);
+ if (IS_ERR_OR_NULL(segs)) {
+ if (copy_dtor) {
+ gso_skb->destructor = sock_wfree;
+ gso_skb->sk = sk;
+ }
+ return segs;
+ }
+
+ /* GSO partial and frag_list segmentation only requires splitting
+ * the frame into an MSS multiple and possibly a remainder, both
+ * cases return a GSO skb. So update the mss now.
+ */
+ if (skb_is_gso(segs))
+ mss *= skb_shinfo(segs)->gso_segs;
+
+ seg = segs;
+ uh = udp_hdr(seg);
+
+ /* preserve TX timestamp flags and TS key for first segment */
+ skb_shinfo(seg)->tskey = skb_shinfo(gso_skb)->tskey;
+ skb_shinfo(seg)->tx_flags |=
+ (skb_shinfo(gso_skb)->tx_flags & SKBTX_ANY_TSTAMP);
+
+ /* compute checksum adjustment based on old length versus new */
+ newlen = htons(sizeof(*uh) + mss);
+ check = csum16_add(csum16_sub(uh->check, uh->len), newlen);
+
+ for (;;) {
+ if (copy_dtor) {
+ seg->destructor = sock_wfree;
+ seg->sk = sk;
+ sum_truesize += seg->truesize;
+ }
+
+ if (!seg->next)
+ break;
+
+ uh->len = newlen;
+ uh->check = check;
+
+ if (seg->ip_summed == CHECKSUM_PARTIAL)
+ gso_reset_checksum(seg, ~check);
+ else
+ uh->check = gso_make_checksum(seg, ~check) ? :
+ CSUM_MANGLED_0;
+
+ seg = seg->next;
+ uh = udp_hdr(seg);
+ }
+
+ /* last packet can be partial gso_size, account for that in checksum */
+ newlen = htons(skb_tail_pointer(seg) - skb_transport_header(seg) +
+ seg->data_len);
+ check = csum16_add(csum16_sub(uh->check, uh->len), newlen);
+
+ uh->len = newlen;
+ uh->check = check;
+
+ if (seg->ip_summed == CHECKSUM_PARTIAL)
+ gso_reset_checksum(seg, ~check);
+ else
+ uh->check = gso_make_checksum(seg, ~check) ? : CSUM_MANGLED_0;
+
+ /* On the TX path, CHECKSUM_NONE and CHECKSUM_UNNECESSARY have the same
+ * meaning. However, check for bad offloads in the GSO stack expects the
+ * latter, if the checksum was calculated in software. To vouch for the
+ * segment skbs we actually need to set it on the gso_skb.
+ */
+ if (gso_skb->ip_summed == CHECKSUM_NONE)
+ gso_skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+ /* update refcount for the packet */
+ if (copy_dtor) {
+ int delta = sum_truesize - gso_skb->truesize;
+
+ /* In some pathological cases, delta can be negative.
+ * We need to either use refcount_add() or refcount_sub_and_test()
+ */
+ if (likely(delta >= 0))
+ refcount_add(delta, &sk->sk_wmem_alloc);
+ else
+ WARN_ON_ONCE(refcount_sub_and_test(-delta, &sk->sk_wmem_alloc));
+ }
+ return segs;
+}
+EXPORT_SYMBOL_GPL(__udp_gso_segment);
+
+static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ struct sk_buff *segs = ERR_PTR(-EINVAL);
+ unsigned int mss;
+ __wsum csum;
+ struct udphdr *uh;
+ struct iphdr *iph;
+
+ if (skb->encapsulation &&
+ (skb_shinfo(skb)->gso_type &
+ (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM))) {
+ segs = skb_udp_tunnel_segment(skb, features, false);
+ goto out;
+ }
+
+ if (!(skb_shinfo(skb)->gso_type & (SKB_GSO_UDP | SKB_GSO_UDP_L4)))
+ goto out;
+
+ if (!pskb_may_pull(skb, sizeof(struct udphdr)))
+ goto out;
+
+ if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4)
+ return __udp_gso_segment(skb, features, false);
+
+ mss = skb_shinfo(skb)->gso_size;
+ if (unlikely(skb->len <= mss))
+ goto out;
+
+ /* Do software UFO. Complete and fill in the UDP checksum as
+ * HW cannot do checksum of UDP packets sent as multiple
+ * IP fragments.
+ */
+
+ uh = udp_hdr(skb);
+ iph = ip_hdr(skb);
+
+ uh->check = 0;
+ csum = skb_checksum(skb, 0, skb->len, 0);
+ uh->check = udp_v4_check(skb->len, iph->saddr, iph->daddr, csum);
+ if (uh->check == 0)
+ uh->check = CSUM_MANGLED_0;
+
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+ /* If there is no outer header we can fake a checksum offload
+ * due to the fact that we have already done the checksum in
+ * software prior to segmenting the frame.
+ */
+ if (!skb->encap_hdr_csum)
+ features |= NETIF_F_HW_CSUM;
+
+ /* Fragment the skb. IP headers of the fragments are updated in
+ * inet_gso_segment()
+ */
+ segs = skb_segment(skb, features);
+out:
+ return segs;
+}
+
+
+#define UDP_GRO_CNT_MAX 64
+static struct sk_buff *udp_gro_receive_segment(struct list_head *head,
+ struct sk_buff *skb)
+{
+ struct udphdr *uh = udp_gro_udphdr(skb);
+ struct sk_buff *pp = NULL;
+ struct udphdr *uh2;
+ struct sk_buff *p;
+ unsigned int ulen;
+ int ret = 0;
+ int flush;
+
+ /* requires non zero csum, for symmetry with GSO */
+ if (!uh->check) {
+ NAPI_GRO_CB(skb)->flush = 1;
+ return NULL;
+ }
+
+ /* Do not deal with padded or malicious packets, sorry ! */
+ ulen = ntohs(uh->len);
+ if (ulen <= sizeof(*uh) || ulen != skb_gro_len(skb)) {
+ NAPI_GRO_CB(skb)->flush = 1;
+ return NULL;
+ }
+ /* pull encapsulating udp header */
+ skb_gro_pull(skb, sizeof(struct udphdr));
+
+ list_for_each_entry(p, head, list) {
+ if (!NAPI_GRO_CB(p)->same_flow)
+ continue;
+
+ uh2 = udp_hdr(p);
+
+ /* Match ports only, as csum is always non zero */
+ if ((*(u32 *)&uh->source != *(u32 *)&uh2->source)) {
+ NAPI_GRO_CB(p)->same_flow = 0;
+ continue;
+ }
+
+ if (NAPI_GRO_CB(skb)->is_flist != NAPI_GRO_CB(p)->is_flist) {
+ NAPI_GRO_CB(skb)->flush = 1;
+ return p;
+ }
+
+ flush = gro_receive_network_flush(uh, uh2, p);
+
+ /* Terminate the flow on len mismatch or if it grow "too much".
+ * Under small packet flood GRO count could elsewhere grow a lot
+ * leading to excessive truesize values.
+ * On len mismatch merge the first packet shorter than gso_size,
+ * otherwise complete the GRO packet.
+ */
+ if (ulen > ntohs(uh2->len) || flush) {
+ pp = p;
+ } else {
+ if (NAPI_GRO_CB(skb)->is_flist) {
+ if (!pskb_may_pull(skb, skb_gro_offset(skb))) {
+ NAPI_GRO_CB(skb)->flush = 1;
+ return NULL;
+ }
+ if ((skb->ip_summed != p->ip_summed) ||
+ (skb->csum_level != p->csum_level)) {
+ NAPI_GRO_CB(skb)->flush = 1;
+ return NULL;
+ }
+ skb_set_network_header(skb, skb_gro_receive_network_offset(skb));
+ ret = skb_gro_receive_list(p, skb);
+ } else {
+ skb_gro_postpull_rcsum(skb, uh,
+ sizeof(struct udphdr));
+
+ ret = skb_gro_receive(p, skb);
+ }
+ }
+
+ if (ret || ulen != ntohs(uh2->len) ||
+ NAPI_GRO_CB(p)->count >= UDP_GRO_CNT_MAX)
+ pp = p;
+
+ return pp;
+ }
+
+ /* mismatch, but we never need to flush */
+ return NULL;
+}
+
+struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb,
+ struct udphdr *uh, struct sock *sk)
+{
+ struct sk_buff *pp = NULL;
+ struct sk_buff *p;
+ struct udphdr *uh2;
+ unsigned int off = skb_gro_offset(skb);
+ int flush = 1;
+
+ /* We can do L4 aggregation only if the packet can't land in a tunnel
+ * otherwise we could corrupt the inner stream. Detecting such packets
+ * cannot be foolproof and the aggregation might still happen in some
+ * cases. Such packets should be caught in udp_unexpected_gso later.
+ */
+ NAPI_GRO_CB(skb)->is_flist = 0;
+ if (!sk || !udp_sk(sk)->gro_receive) {
+ /* If the packet was locally encapsulated in a UDP tunnel that
+ * wasn't detected above, do not GRO.
+ */
+ if (skb->encapsulation)
+ goto out;
+
+ if (skb->dev->features & NETIF_F_GRO_FRAGLIST)
+ NAPI_GRO_CB(skb)->is_flist = sk ? !udp_test_bit(GRO_ENABLED, sk) : 1;
+
+ if ((!sk && (skb->dev->features & NETIF_F_GRO_UDP_FWD)) ||
+ (sk && udp_test_bit(GRO_ENABLED, sk)) || NAPI_GRO_CB(skb)->is_flist)
+ return call_gro_receive(udp_gro_receive_segment, head, skb);
+
+ /* no GRO, be sure flush the current packet */
+ goto out;
+ }
+
+ if (NAPI_GRO_CB(skb)->encap_mark ||
+ (uh->check && skb->ip_summed != CHECKSUM_PARTIAL &&
+ NAPI_GRO_CB(skb)->csum_cnt == 0 &&
+ !NAPI_GRO_CB(skb)->csum_valid))
+ goto out;
+
+ /* mark that this skb passed once through the tunnel gro layer */
+ NAPI_GRO_CB(skb)->encap_mark = 1;
+
+ flush = 0;
+
+ list_for_each_entry(p, head, list) {
+ if (!NAPI_GRO_CB(p)->same_flow)
+ continue;
+
+ uh2 = (struct udphdr *)(p->data + off);
+
+ /* Match ports and either checksums are either both zero
+ * or nonzero.
+ */
+ if ((*(u32 *)&uh->source != *(u32 *)&uh2->source) ||
+ (!uh->check ^ !uh2->check)) {
+ NAPI_GRO_CB(p)->same_flow = 0;
+ continue;
+ }
+ }
+
+ skb_gro_pull(skb, sizeof(struct udphdr)); /* pull encapsulating udp header */
+ skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr));
+ pp = udp_tunnel_gro_rcv(sk, head, skb);
+
+out:
+ skb_gro_flush_final(skb, pp, flush);
+ return pp;
+}
+EXPORT_SYMBOL(udp_gro_receive);
+
+static struct sock *udp4_gro_lookup_skb(struct sk_buff *skb, __be16 sport,
+ __be16 dport)
+{
+ const struct iphdr *iph = skb_gro_network_header(skb);
+ struct net *net = dev_net_rcu(skb->dev);
+ struct sock *sk;
+ int iif, sdif;
+
+ sk = udp_tunnel_sk(net, false);
+ if (sk && dport == htons(sk->sk_num))
+ return sk;
+
+ inet_get_iif_sdif(skb, &iif, &sdif);
+
+ return __udp4_lib_lookup(net, iph->saddr, sport,
+ iph->daddr, dport, iif,
+ sdif, net->ipv4.udp_table, NULL);
+}
+
+INDIRECT_CALLABLE_SCOPE
+struct sk_buff *udp4_gro_receive(struct list_head *head, struct sk_buff *skb)
+{
+ struct udphdr *uh = udp_gro_udphdr(skb);
+ struct sock *sk = NULL;
+ struct sk_buff *pp;
+
+ if (unlikely(!uh))
+ goto flush;
+
+ /* Don't bother verifying checksum if we're going to flush anyway. */
+ if (NAPI_GRO_CB(skb)->flush)
+ goto skip;
+
+ if (skb_gro_checksum_validate_zero_check(skb, IPPROTO_UDP, uh->check,
+ inet_gro_compute_pseudo))
+ goto flush;
+ else if (uh->check)
+ skb_gro_checksum_try_convert(skb, IPPROTO_UDP,
+ inet_gro_compute_pseudo);
+skip:
+ if (static_branch_unlikely(&udp_encap_needed_key))
+ sk = udp4_gro_lookup_skb(skb, uh->source, uh->dest);
+
+ pp = udp_gro_receive(head, skb, uh, sk);
+ return pp;
+
+flush:
+ NAPI_GRO_CB(skb)->flush = 1;
+ return NULL;
+}
+
+static int udp_gro_complete_segment(struct sk_buff *skb)
+{
+ struct udphdr *uh = udp_hdr(skb);
+
+ skb->csum_start = (unsigned char *)uh - skb->head;
+ skb->csum_offset = offsetof(struct udphdr, check);
+ skb->ip_summed = CHECKSUM_PARTIAL;
+
+ skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;
+ skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_L4;
+
+ if (skb->encapsulation)
+ skb->inner_transport_header = skb->transport_header;
+
+ return 0;
+}
+
+int udp_gro_complete(struct sk_buff *skb, int nhoff,
+ udp_lookup_t lookup)
+{
+ __be16 newlen = htons(skb->len - nhoff);
+ struct udphdr *uh = (struct udphdr *)(skb->data + nhoff);
+ struct sock *sk;
+ int err;
+
+ uh->len = newlen;
+
+ sk = INDIRECT_CALL_INET(lookup, udp6_lib_lookup_skb,
+ udp4_lib_lookup_skb, skb, uh->source, uh->dest);
+ if (sk && udp_sk(sk)->gro_complete) {
+ skb_shinfo(skb)->gso_type = uh->check ? SKB_GSO_UDP_TUNNEL_CSUM
+ : SKB_GSO_UDP_TUNNEL;
+
+ /* clear the encap mark, so that inner frag_list gro_complete
+ * can take place
+ */
+ NAPI_GRO_CB(skb)->encap_mark = 0;
+
+ /* Set encapsulation before calling into inner gro_complete()
+ * functions to make them set up the inner offsets.
+ */
+ skb->encapsulation = 1;
+ err = udp_sk(sk)->gro_complete(sk, skb,
+ nhoff + sizeof(struct udphdr));
+ } else {
+ err = udp_gro_complete_segment(skb);
+ }
+
+ if (skb->remcsum_offload)
+ skb_shinfo(skb)->gso_type |= SKB_GSO_TUNNEL_REMCSUM;
+
+ return err;
+}
+EXPORT_SYMBOL(udp_gro_complete);
+
+INDIRECT_CALLABLE_SCOPE int udp4_gro_complete(struct sk_buff *skb, int nhoff)
+{
+ const u16 offset = NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation];
+ const struct iphdr *iph = (struct iphdr *)(skb->data + offset);
+ struct udphdr *uh = (struct udphdr *)(skb->data + nhoff);
+
+ /* do fraglist only if there is no outer UDP encap (or we already processed it) */
+ if (NAPI_GRO_CB(skb)->is_flist && !NAPI_GRO_CB(skb)->encap_mark) {
+ uh->len = htons(skb->len - nhoff);
+
+ skb_shinfo(skb)->gso_type |= (SKB_GSO_FRAGLIST|SKB_GSO_UDP_L4);
+ skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;
+
+ __skb_incr_checksum_unnecessary(skb);
+
+ return 0;
+ }
+
+ if (uh->check)
+ uh->check = ~udp_v4_check(skb->len - nhoff, iph->saddr,
+ iph->daddr, 0);
+
+ return udp_gro_complete(skb, nhoff, udp4_lib_lookup_skb);
+}
+
+int __init udpv4_offload_init(void)
+{
+ net_hotdata.udpv4_offload = (struct net_offload) {
+ .callbacks = {
+ .gso_segment = udp4_ufo_fragment,
+ .gro_receive = udp4_gro_receive,
+ .gro_complete = udp4_gro_complete,
+ },
+ };
+
+ return inet_add_offload(&net_hotdata.udpv4_offload, IPPROTO_UDP);
+}
diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c
new file mode 100644
index 000000000000..b1f667c52cb2
--- /dev/null
+++ b/net/ipv4/udp_tunnel_core.c
@@ -0,0 +1,280 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/socket.h>
+#include <linux/kernel.h>
+#include <net/dst_metadata.h>
+#include <net/flow.h>
+#include <net/udp.h>
+#include <net/udp_tunnel.h>
+#include <net/inet_dscp.h>
+
+int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg,
+ struct socket **sockp)
+{
+ int err;
+ struct socket *sock = NULL;
+ struct sockaddr_in udp_addr;
+
+ err = sock_create_kern(net, AF_INET, SOCK_DGRAM, 0, &sock);
+ if (err < 0)
+ goto error;
+
+ if (cfg->bind_ifindex) {
+ err = sock_bindtoindex(sock->sk, cfg->bind_ifindex, true);
+ if (err < 0)
+ goto error;
+ }
+
+ udp_addr.sin_family = AF_INET;
+ udp_addr.sin_addr = cfg->local_ip;
+ udp_addr.sin_port = cfg->local_udp_port;
+ err = kernel_bind(sock, (struct sockaddr_unsized *)&udp_addr,
+ sizeof(udp_addr));
+ if (err < 0)
+ goto error;
+
+ if (cfg->peer_udp_port) {
+ udp_addr.sin_family = AF_INET;
+ udp_addr.sin_addr = cfg->peer_ip;
+ udp_addr.sin_port = cfg->peer_udp_port;
+ err = kernel_connect(sock, (struct sockaddr_unsized *)&udp_addr,
+ sizeof(udp_addr), 0);
+ if (err < 0)
+ goto error;
+ }
+
+ sock->sk->sk_no_check_tx = !cfg->use_udp_checksums;
+
+ *sockp = sock;
+ return 0;
+
+error:
+ if (sock) {
+ kernel_sock_shutdown(sock, SHUT_RDWR);
+ sock_release(sock);
+ }
+ *sockp = NULL;
+ return err;
+}
+EXPORT_SYMBOL(udp_sock_create4);
+
+static bool sk_saddr_any(struct sock *sk)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ return ipv6_addr_any(&sk->sk_v6_rcv_saddr);
+#else
+ return !sk->sk_rcv_saddr;
+#endif
+}
+
+void setup_udp_tunnel_sock(struct net *net, struct socket *sock,
+ struct udp_tunnel_sock_cfg *cfg)
+{
+ struct sock *sk = sock->sk;
+
+ /* Disable multicast loopback */
+ inet_clear_bit(MC_LOOP, sk);
+
+ /* Enable CHECKSUM_UNNECESSARY to CHECKSUM_COMPLETE conversion */
+ inet_inc_convert_csum(sk);
+
+ rcu_assign_sk_user_data(sk, cfg->sk_user_data);
+
+ udp_sk(sk)->encap_type = cfg->encap_type;
+ udp_sk(sk)->encap_rcv = cfg->encap_rcv;
+ udp_sk(sk)->encap_err_rcv = cfg->encap_err_rcv;
+ udp_sk(sk)->encap_err_lookup = cfg->encap_err_lookup;
+ udp_sk(sk)->encap_destroy = cfg->encap_destroy;
+ udp_sk(sk)->gro_receive = cfg->gro_receive;
+ udp_sk(sk)->gro_complete = cfg->gro_complete;
+
+ udp_tunnel_encap_enable(sk);
+
+ udp_tunnel_update_gro_rcv(sk, true);
+
+ if (!sk->sk_dport && !sk->sk_bound_dev_if && sk_saddr_any(sk) &&
+ sk->sk_kern_sock)
+ udp_tunnel_update_gro_lookup(net, sk, true);
+}
+EXPORT_SYMBOL_GPL(setup_udp_tunnel_sock);
+
+void udp_tunnel_push_rx_port(struct net_device *dev, struct socket *sock,
+ unsigned short type)
+{
+ struct sock *sk = sock->sk;
+ struct udp_tunnel_info ti;
+
+ ti.type = type;
+ ti.sa_family = sk->sk_family;
+ ti.port = inet_sk(sk)->inet_sport;
+
+ udp_tunnel_nic_add_port(dev, &ti);
+}
+EXPORT_SYMBOL_GPL(udp_tunnel_push_rx_port);
+
+void udp_tunnel_drop_rx_port(struct net_device *dev, struct socket *sock,
+ unsigned short type)
+{
+ struct sock *sk = sock->sk;
+ struct udp_tunnel_info ti;
+
+ ti.type = type;
+ ti.sa_family = sk->sk_family;
+ ti.port = inet_sk(sk)->inet_sport;
+
+ udp_tunnel_nic_del_port(dev, &ti);
+}
+EXPORT_SYMBOL_GPL(udp_tunnel_drop_rx_port);
+
+/* Notify netdevs that UDP port started listening */
+void udp_tunnel_notify_add_rx_port(struct socket *sock, unsigned short type)
+{
+ struct sock *sk = sock->sk;
+ struct net *net = sock_net(sk);
+ struct udp_tunnel_info ti;
+ struct net_device *dev;
+
+ ASSERT_RTNL();
+
+ ti.type = type;
+ ti.sa_family = sk->sk_family;
+ ti.port = inet_sk(sk)->inet_sport;
+
+ for_each_netdev(net, dev) {
+ udp_tunnel_nic_lock(dev);
+ udp_tunnel_nic_add_port(dev, &ti);
+ udp_tunnel_nic_unlock(dev);
+ }
+}
+EXPORT_SYMBOL_GPL(udp_tunnel_notify_add_rx_port);
+
+/* Notify netdevs that UDP port is no more listening */
+void udp_tunnel_notify_del_rx_port(struct socket *sock, unsigned short type)
+{
+ struct sock *sk = sock->sk;
+ struct net *net = sock_net(sk);
+ struct udp_tunnel_info ti;
+ struct net_device *dev;
+
+ ASSERT_RTNL();
+
+ ti.type = type;
+ ti.sa_family = sk->sk_family;
+ ti.port = inet_sk(sk)->inet_sport;
+
+ for_each_netdev(net, dev) {
+ udp_tunnel_nic_lock(dev);
+ udp_tunnel_nic_del_port(dev, &ti);
+ udp_tunnel_nic_unlock(dev);
+ }
+}
+EXPORT_SYMBOL_GPL(udp_tunnel_notify_del_rx_port);
+
+void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb,
+ __be32 src, __be32 dst, __u8 tos, __u8 ttl,
+ __be16 df, __be16 src_port, __be16 dst_port,
+ bool xnet, bool nocheck, u16 ipcb_flags)
+{
+ struct udphdr *uh;
+
+ __skb_push(skb, sizeof(*uh));
+ skb_reset_transport_header(skb);
+ uh = udp_hdr(skb);
+
+ uh->dest = dst_port;
+ uh->source = src_port;
+ uh->len = htons(skb->len);
+
+ memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+
+ udp_set_csum(nocheck, skb, src, dst, skb->len);
+
+ iptunnel_xmit(sk, rt, skb, src, dst, IPPROTO_UDP, tos, ttl, df, xnet,
+ ipcb_flags);
+}
+EXPORT_SYMBOL_GPL(udp_tunnel_xmit_skb);
+
+void udp_tunnel_sock_release(struct socket *sock)
+{
+ rcu_assign_sk_user_data(sock->sk, NULL);
+ synchronize_rcu();
+ kernel_sock_shutdown(sock, SHUT_RDWR);
+ sock_release(sock);
+}
+EXPORT_SYMBOL_GPL(udp_tunnel_sock_release);
+
+struct metadata_dst *udp_tun_rx_dst(struct sk_buff *skb, unsigned short family,
+ const unsigned long *flags,
+ __be64 tunnel_id, int md_size)
+{
+ struct metadata_dst *tun_dst;
+ struct ip_tunnel_info *info;
+
+ if (family == AF_INET)
+ tun_dst = ip_tun_rx_dst(skb, flags, tunnel_id, md_size);
+ else
+ tun_dst = ipv6_tun_rx_dst(skb, flags, tunnel_id, md_size);
+ if (!tun_dst)
+ return NULL;
+
+ info = &tun_dst->u.tun_info;
+ info->key.tp_src = udp_hdr(skb)->source;
+ info->key.tp_dst = udp_hdr(skb)->dest;
+ if (udp_hdr(skb)->check)
+ __set_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags);
+ return tun_dst;
+}
+EXPORT_SYMBOL_GPL(udp_tun_rx_dst);
+
+struct rtable *udp_tunnel_dst_lookup(struct sk_buff *skb,
+ struct net_device *dev,
+ struct net *net, int oif,
+ __be32 *saddr,
+ const struct ip_tunnel_key *key,
+ __be16 sport, __be16 dport, u8 tos,
+ struct dst_cache *dst_cache)
+{
+ struct rtable *rt = NULL;
+ struct flowi4 fl4;
+
+#ifdef CONFIG_DST_CACHE
+ if (dst_cache) {
+ rt = dst_cache_get_ip4(dst_cache, saddr);
+ if (rt)
+ return rt;
+ }
+#endif
+
+ memset(&fl4, 0, sizeof(fl4));
+ fl4.flowi4_mark = skb->mark;
+ fl4.flowi4_proto = IPPROTO_UDP;
+ fl4.flowi4_oif = oif;
+ fl4.daddr = key->u.ipv4.dst;
+ fl4.saddr = key->u.ipv4.src;
+ fl4.fl4_dport = dport;
+ fl4.fl4_sport = sport;
+ fl4.flowi4_dscp = inet_dsfield_to_dscp(tos);
+ fl4.flowi4_flags = key->flow_flags;
+
+ rt = ip_route_output_key(net, &fl4);
+ if (IS_ERR(rt)) {
+ netdev_dbg(dev, "no route to %pI4\n", &fl4.daddr);
+ return ERR_PTR(-ENETUNREACH);
+ }
+ if (rt->dst.dev == dev) { /* is this necessary? */
+ netdev_dbg(dev, "circular route to %pI4\n", &fl4.daddr);
+ ip_rt_put(rt);
+ return ERR_PTR(-ELOOP);
+ }
+#ifdef CONFIG_DST_CACHE
+ if (dst_cache)
+ dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr);
+#endif
+ *saddr = fl4.saddr;
+ return rt;
+}
+EXPORT_SYMBOL_GPL(udp_tunnel_dst_lookup);
+
+MODULE_DESCRIPTION("IPv4 Foo over UDP tunnel driver");
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/udp_tunnel_nic.c b/net/ipv4/udp_tunnel_nic.c
new file mode 100644
index 000000000000..944b3cf25468
--- /dev/null
+++ b/net/ipv4/udp_tunnel_nic.c
@@ -0,0 +1,1010 @@
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright (c) 2020 Facebook Inc.
+
+#include <linux/ethtool_netlink.h>
+#include <linux/netdevice.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+#include <net/udp_tunnel.h>
+#include <net/vxlan.h>
+
+enum udp_tunnel_nic_table_entry_flags {
+ UDP_TUNNEL_NIC_ENTRY_ADD = BIT(0),
+ UDP_TUNNEL_NIC_ENTRY_DEL = BIT(1),
+ UDP_TUNNEL_NIC_ENTRY_OP_FAIL = BIT(2),
+ UDP_TUNNEL_NIC_ENTRY_FROZEN = BIT(3),
+};
+
+struct udp_tunnel_nic_table_entry {
+ __be16 port;
+ u8 type;
+ u8 flags;
+ u16 use_cnt;
+#define UDP_TUNNEL_NIC_USE_CNT_MAX U16_MAX
+ u8 hw_priv;
+};
+
+/**
+ * struct udp_tunnel_nic - UDP tunnel port offload state
+ * @work: async work for talking to hardware from process context
+ * @dev: netdev pointer
+ * @lock: protects all fields
+ * @need_sync: at least one port start changed
+ * @need_replay: space was freed, we need a replay of all ports
+ * @work_pending: @work is currently scheduled
+ * @n_tables: number of tables under @entries
+ * @missed: bitmap of tables which overflown
+ * @entries: table of tables of ports currently offloaded
+ */
+struct udp_tunnel_nic {
+ struct work_struct work;
+
+ struct net_device *dev;
+
+ struct mutex lock;
+
+ u8 need_sync:1;
+ u8 need_replay:1;
+ u8 work_pending:1;
+
+ unsigned int n_tables;
+ unsigned long missed;
+ struct udp_tunnel_nic_table_entry *entries[] __counted_by(n_tables);
+};
+
+/* We ensure all work structs are done using driver state, but not the code.
+ * We need a workqueue we can flush before module gets removed.
+ */
+static struct workqueue_struct *udp_tunnel_nic_workqueue;
+
+static const char *udp_tunnel_nic_tunnel_type_name(unsigned int type)
+{
+ switch (type) {
+ case UDP_TUNNEL_TYPE_VXLAN:
+ return "vxlan";
+ case UDP_TUNNEL_TYPE_GENEVE:
+ return "geneve";
+ case UDP_TUNNEL_TYPE_VXLAN_GPE:
+ return "vxlan-gpe";
+ default:
+ return "unknown";
+ }
+}
+
+static bool
+udp_tunnel_nic_entry_is_free(struct udp_tunnel_nic_table_entry *entry)
+{
+ return entry->use_cnt == 0 && !entry->flags;
+}
+
+static bool
+udp_tunnel_nic_entry_is_present(struct udp_tunnel_nic_table_entry *entry)
+{
+ return entry->use_cnt && !(entry->flags & ~UDP_TUNNEL_NIC_ENTRY_FROZEN);
+}
+
+static bool
+udp_tunnel_nic_entry_is_frozen(struct udp_tunnel_nic_table_entry *entry)
+{
+ return entry->flags & UDP_TUNNEL_NIC_ENTRY_FROZEN;
+}
+
+static void
+udp_tunnel_nic_entry_freeze_used(struct udp_tunnel_nic_table_entry *entry)
+{
+ if (!udp_tunnel_nic_entry_is_free(entry))
+ entry->flags |= UDP_TUNNEL_NIC_ENTRY_FROZEN;
+}
+
+static void
+udp_tunnel_nic_entry_unfreeze(struct udp_tunnel_nic_table_entry *entry)
+{
+ entry->flags &= ~UDP_TUNNEL_NIC_ENTRY_FROZEN;
+}
+
+static bool
+udp_tunnel_nic_entry_is_queued(struct udp_tunnel_nic_table_entry *entry)
+{
+ return entry->flags & (UDP_TUNNEL_NIC_ENTRY_ADD |
+ UDP_TUNNEL_NIC_ENTRY_DEL);
+}
+
+static void
+udp_tunnel_nic_entry_queue(struct udp_tunnel_nic *utn,
+ struct udp_tunnel_nic_table_entry *entry,
+ unsigned int flag)
+{
+ entry->flags |= flag;
+ utn->need_sync = 1;
+}
+
+static void
+udp_tunnel_nic_ti_from_entry(struct udp_tunnel_nic_table_entry *entry,
+ struct udp_tunnel_info *ti)
+{
+ memset(ti, 0, sizeof(*ti));
+ ti->port = entry->port;
+ ti->type = entry->type;
+ ti->hw_priv = entry->hw_priv;
+}
+
+static bool
+udp_tunnel_nic_is_empty(struct net_device *dev, struct udp_tunnel_nic *utn)
+{
+ const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info;
+ unsigned int i, j;
+
+ for (i = 0; i < utn->n_tables; i++)
+ for (j = 0; j < info->tables[i].n_entries; j++)
+ if (!udp_tunnel_nic_entry_is_free(&utn->entries[i][j]))
+ return false;
+ return true;
+}
+
+static bool
+udp_tunnel_nic_should_replay(struct net_device *dev, struct udp_tunnel_nic *utn)
+{
+ const struct udp_tunnel_nic_table_info *table;
+ unsigned int i, j;
+
+ if (!utn->missed)
+ return false;
+
+ for (i = 0; i < utn->n_tables; i++) {
+ table = &dev->udp_tunnel_nic_info->tables[i];
+ if (!test_bit(i, &utn->missed))
+ continue;
+
+ for (j = 0; j < table->n_entries; j++)
+ if (udp_tunnel_nic_entry_is_free(&utn->entries[i][j]))
+ return true;
+ }
+
+ return false;
+}
+
+static void
+__udp_tunnel_nic_get_port(struct net_device *dev, unsigned int table,
+ unsigned int idx, struct udp_tunnel_info *ti)
+{
+ struct udp_tunnel_nic_table_entry *entry;
+ struct udp_tunnel_nic *utn;
+
+ utn = dev->udp_tunnel_nic;
+ entry = &utn->entries[table][idx];
+
+ if (entry->use_cnt)
+ udp_tunnel_nic_ti_from_entry(entry, ti);
+}
+
+static void
+__udp_tunnel_nic_set_port_priv(struct net_device *dev, unsigned int table,
+ unsigned int idx, u8 priv)
+{
+ dev->udp_tunnel_nic->entries[table][idx].hw_priv = priv;
+}
+
+static void
+udp_tunnel_nic_entry_update_done(struct udp_tunnel_nic_table_entry *entry,
+ int err)
+{
+ bool dodgy = entry->flags & UDP_TUNNEL_NIC_ENTRY_OP_FAIL;
+
+ WARN_ON_ONCE(entry->flags & UDP_TUNNEL_NIC_ENTRY_ADD &&
+ entry->flags & UDP_TUNNEL_NIC_ENTRY_DEL);
+
+ if (entry->flags & UDP_TUNNEL_NIC_ENTRY_ADD &&
+ (!err || (err == -EEXIST && dodgy)))
+ entry->flags &= ~UDP_TUNNEL_NIC_ENTRY_ADD;
+
+ if (entry->flags & UDP_TUNNEL_NIC_ENTRY_DEL &&
+ (!err || (err == -ENOENT && dodgy)))
+ entry->flags &= ~UDP_TUNNEL_NIC_ENTRY_DEL;
+
+ if (!err)
+ entry->flags &= ~UDP_TUNNEL_NIC_ENTRY_OP_FAIL;
+ else
+ entry->flags |= UDP_TUNNEL_NIC_ENTRY_OP_FAIL;
+}
+
+static void
+udp_tunnel_nic_device_sync_one(struct net_device *dev,
+ struct udp_tunnel_nic *utn,
+ unsigned int table, unsigned int idx)
+{
+ struct udp_tunnel_nic_table_entry *entry;
+ struct udp_tunnel_info ti;
+ int err;
+
+ entry = &utn->entries[table][idx];
+ if (!udp_tunnel_nic_entry_is_queued(entry))
+ return;
+
+ udp_tunnel_nic_ti_from_entry(entry, &ti);
+ if (entry->flags & UDP_TUNNEL_NIC_ENTRY_ADD)
+ err = dev->udp_tunnel_nic_info->set_port(dev, table, idx, &ti);
+ else
+ err = dev->udp_tunnel_nic_info->unset_port(dev, table, idx,
+ &ti);
+ udp_tunnel_nic_entry_update_done(entry, err);
+
+ if (err)
+ netdev_warn(dev,
+ "UDP tunnel port sync failed port %d type %s: %d\n",
+ be16_to_cpu(entry->port),
+ udp_tunnel_nic_tunnel_type_name(entry->type),
+ err);
+}
+
+static void
+udp_tunnel_nic_device_sync_by_port(struct net_device *dev,
+ struct udp_tunnel_nic *utn)
+{
+ const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info;
+ unsigned int i, j;
+
+ for (i = 0; i < utn->n_tables; i++)
+ for (j = 0; j < info->tables[i].n_entries; j++)
+ udp_tunnel_nic_device_sync_one(dev, utn, i, j);
+}
+
+static void
+udp_tunnel_nic_device_sync_by_table(struct net_device *dev,
+ struct udp_tunnel_nic *utn)
+{
+ const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info;
+ unsigned int i, j;
+ int err;
+
+ for (i = 0; i < utn->n_tables; i++) {
+ /* Find something that needs sync in this table */
+ for (j = 0; j < info->tables[i].n_entries; j++)
+ if (udp_tunnel_nic_entry_is_queued(&utn->entries[i][j]))
+ break;
+ if (j == info->tables[i].n_entries)
+ continue;
+
+ err = info->sync_table(dev, i);
+ if (err)
+ netdev_warn(dev, "UDP tunnel port sync failed for table %d: %d\n",
+ i, err);
+
+ for (j = 0; j < info->tables[i].n_entries; j++) {
+ struct udp_tunnel_nic_table_entry *entry;
+
+ entry = &utn->entries[i][j];
+ if (udp_tunnel_nic_entry_is_queued(entry))
+ udp_tunnel_nic_entry_update_done(entry, err);
+ }
+ }
+}
+
+static void
+__udp_tunnel_nic_device_sync(struct net_device *dev, struct udp_tunnel_nic *utn)
+{
+ if (!utn->need_sync)
+ return;
+
+ if (dev->udp_tunnel_nic_info->sync_table)
+ udp_tunnel_nic_device_sync_by_table(dev, utn);
+ else
+ udp_tunnel_nic_device_sync_by_port(dev, utn);
+
+ utn->need_sync = 0;
+ /* Can't replay directly here, in case we come from the tunnel driver's
+ * notification - trying to replay may deadlock inside tunnel driver.
+ */
+ utn->need_replay = udp_tunnel_nic_should_replay(dev, utn);
+}
+
+static void
+udp_tunnel_nic_device_sync(struct net_device *dev, struct udp_tunnel_nic *utn)
+{
+ if (!utn->need_sync)
+ return;
+
+ queue_work(udp_tunnel_nic_workqueue, &utn->work);
+ utn->work_pending = 1;
+}
+
+static bool
+udp_tunnel_nic_table_is_capable(const struct udp_tunnel_nic_table_info *table,
+ struct udp_tunnel_info *ti)
+{
+ return table->tunnel_types & ti->type;
+}
+
+static bool
+udp_tunnel_nic_is_capable(struct net_device *dev, struct udp_tunnel_nic *utn,
+ struct udp_tunnel_info *ti)
+{
+ const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info;
+ unsigned int i;
+
+ /* Special case IPv4-only NICs */
+ if (info->flags & UDP_TUNNEL_NIC_INFO_IPV4_ONLY &&
+ ti->sa_family != AF_INET)
+ return false;
+
+ for (i = 0; i < utn->n_tables; i++)
+ if (udp_tunnel_nic_table_is_capable(&info->tables[i], ti))
+ return true;
+ return false;
+}
+
+static int
+udp_tunnel_nic_has_collision(struct net_device *dev, struct udp_tunnel_nic *utn,
+ struct udp_tunnel_info *ti)
+{
+ const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info;
+ struct udp_tunnel_nic_table_entry *entry;
+ unsigned int i, j;
+
+ for (i = 0; i < utn->n_tables; i++)
+ for (j = 0; j < info->tables[i].n_entries; j++) {
+ entry = &utn->entries[i][j];
+
+ if (!udp_tunnel_nic_entry_is_free(entry) &&
+ entry->port == ti->port &&
+ entry->type != ti->type) {
+ __set_bit(i, &utn->missed);
+ return true;
+ }
+ }
+ return false;
+}
+
+static void
+udp_tunnel_nic_entry_adj(struct udp_tunnel_nic *utn,
+ unsigned int table, unsigned int idx, int use_cnt_adj)
+{
+ struct udp_tunnel_nic_table_entry *entry = &utn->entries[table][idx];
+ bool dodgy = entry->flags & UDP_TUNNEL_NIC_ENTRY_OP_FAIL;
+ unsigned int from, to;
+
+ WARN_ON(entry->use_cnt + (u32)use_cnt_adj > U16_MAX);
+
+ /* If not going from used to unused or vice versa - all done.
+ * For dodgy entries make sure we try to sync again (queue the entry).
+ */
+ entry->use_cnt += use_cnt_adj;
+ if (!dodgy && !entry->use_cnt == !(entry->use_cnt - use_cnt_adj))
+ return;
+
+ /* Cancel the op before it was sent to the device, if possible,
+ * otherwise we'd need to take special care to issue commands
+ * in the same order the ports arrived.
+ */
+ if (use_cnt_adj < 0) {
+ from = UDP_TUNNEL_NIC_ENTRY_ADD;
+ to = UDP_TUNNEL_NIC_ENTRY_DEL;
+ } else {
+ from = UDP_TUNNEL_NIC_ENTRY_DEL;
+ to = UDP_TUNNEL_NIC_ENTRY_ADD;
+ }
+
+ if (entry->flags & from) {
+ entry->flags &= ~from;
+ if (!dodgy)
+ return;
+ }
+
+ udp_tunnel_nic_entry_queue(utn, entry, to);
+}
+
+static bool
+udp_tunnel_nic_entry_try_adj(struct udp_tunnel_nic *utn,
+ unsigned int table, unsigned int idx,
+ struct udp_tunnel_info *ti, int use_cnt_adj)
+{
+ struct udp_tunnel_nic_table_entry *entry = &utn->entries[table][idx];
+
+ if (udp_tunnel_nic_entry_is_free(entry) ||
+ entry->port != ti->port ||
+ entry->type != ti->type)
+ return false;
+
+ if (udp_tunnel_nic_entry_is_frozen(entry))
+ return true;
+
+ udp_tunnel_nic_entry_adj(utn, table, idx, use_cnt_adj);
+ return true;
+}
+
+/* Try to find existing matching entry and adjust its use count, instead of
+ * adding a new one. Returns true if entry was found. In case of delete the
+ * entry may have gotten removed in the process, in which case it will be
+ * queued for removal.
+ */
+static bool
+udp_tunnel_nic_try_existing(struct net_device *dev, struct udp_tunnel_nic *utn,
+ struct udp_tunnel_info *ti, int use_cnt_adj)
+{
+ const struct udp_tunnel_nic_table_info *table;
+ unsigned int i, j;
+
+ for (i = 0; i < utn->n_tables; i++) {
+ table = &dev->udp_tunnel_nic_info->tables[i];
+ if (!udp_tunnel_nic_table_is_capable(table, ti))
+ continue;
+
+ for (j = 0; j < table->n_entries; j++)
+ if (udp_tunnel_nic_entry_try_adj(utn, i, j, ti,
+ use_cnt_adj))
+ return true;
+ }
+
+ return false;
+}
+
+static bool
+udp_tunnel_nic_add_existing(struct net_device *dev, struct udp_tunnel_nic *utn,
+ struct udp_tunnel_info *ti)
+{
+ return udp_tunnel_nic_try_existing(dev, utn, ti, +1);
+}
+
+static bool
+udp_tunnel_nic_del_existing(struct net_device *dev, struct udp_tunnel_nic *utn,
+ struct udp_tunnel_info *ti)
+{
+ return udp_tunnel_nic_try_existing(dev, utn, ti, -1);
+}
+
+static bool
+udp_tunnel_nic_add_new(struct net_device *dev, struct udp_tunnel_nic *utn,
+ struct udp_tunnel_info *ti)
+{
+ const struct udp_tunnel_nic_table_info *table;
+ unsigned int i, j;
+
+ for (i = 0; i < utn->n_tables; i++) {
+ table = &dev->udp_tunnel_nic_info->tables[i];
+ if (!udp_tunnel_nic_table_is_capable(table, ti))
+ continue;
+
+ for (j = 0; j < table->n_entries; j++) {
+ struct udp_tunnel_nic_table_entry *entry;
+
+ entry = &utn->entries[i][j];
+ if (!udp_tunnel_nic_entry_is_free(entry))
+ continue;
+
+ entry->port = ti->port;
+ entry->type = ti->type;
+ entry->use_cnt = 1;
+ udp_tunnel_nic_entry_queue(utn, entry,
+ UDP_TUNNEL_NIC_ENTRY_ADD);
+ return true;
+ }
+
+ /* The different table may still fit this port in, but there
+ * are no devices currently which have multiple tables accepting
+ * the same tunnel type, and false positives are okay.
+ */
+ __set_bit(i, &utn->missed);
+ }
+
+ return false;
+}
+
+static void
+__udp_tunnel_nic_add_port(struct net_device *dev, struct udp_tunnel_info *ti)
+{
+ const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info;
+ struct udp_tunnel_nic *utn;
+
+ utn = dev->udp_tunnel_nic;
+ if (!utn)
+ return;
+ if (!netif_running(dev) && info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY)
+ return;
+ if (info->flags & UDP_TUNNEL_NIC_INFO_STATIC_IANA_VXLAN &&
+ ti->port == htons(IANA_VXLAN_UDP_PORT)) {
+ if (ti->type != UDP_TUNNEL_TYPE_VXLAN)
+ netdev_warn(dev, "device assumes port 4789 will be used by vxlan tunnels\n");
+ return;
+ }
+
+ if (!udp_tunnel_nic_is_capable(dev, utn, ti))
+ return;
+
+ /* It may happen that a tunnel of one type is removed and different
+ * tunnel type tries to reuse its port before the device was informed.
+ * Rely on utn->missed to re-add this port later.
+ */
+ if (udp_tunnel_nic_has_collision(dev, utn, ti))
+ return;
+
+ if (!udp_tunnel_nic_add_existing(dev, utn, ti))
+ udp_tunnel_nic_add_new(dev, utn, ti);
+
+ udp_tunnel_nic_device_sync(dev, utn);
+}
+
+static void
+__udp_tunnel_nic_del_port(struct net_device *dev, struct udp_tunnel_info *ti)
+{
+ struct udp_tunnel_nic *utn;
+
+ utn = dev->udp_tunnel_nic;
+ if (!utn)
+ return;
+
+ if (!udp_tunnel_nic_is_capable(dev, utn, ti))
+ return;
+
+ udp_tunnel_nic_del_existing(dev, utn, ti);
+
+ udp_tunnel_nic_device_sync(dev, utn);
+}
+
+static void __udp_tunnel_nic_reset_ntf(struct net_device *dev)
+{
+ const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info;
+ struct udp_tunnel_nic *utn;
+ unsigned int i, j;
+
+ utn = dev->udp_tunnel_nic;
+ if (!utn)
+ return;
+
+ mutex_lock(&utn->lock);
+
+ utn->need_sync = false;
+ for (i = 0; i < utn->n_tables; i++)
+ for (j = 0; j < info->tables[i].n_entries; j++) {
+ struct udp_tunnel_nic_table_entry *entry;
+
+ entry = &utn->entries[i][j];
+
+ entry->flags &= ~(UDP_TUNNEL_NIC_ENTRY_DEL |
+ UDP_TUNNEL_NIC_ENTRY_OP_FAIL);
+ /* We don't release utn lock across ops */
+ WARN_ON(entry->flags & UDP_TUNNEL_NIC_ENTRY_FROZEN);
+ if (!entry->use_cnt)
+ continue;
+
+ udp_tunnel_nic_entry_queue(utn, entry,
+ UDP_TUNNEL_NIC_ENTRY_ADD);
+ }
+
+ __udp_tunnel_nic_device_sync(dev, utn);
+
+ mutex_unlock(&utn->lock);
+}
+
+static size_t
+__udp_tunnel_nic_dump_size(struct net_device *dev, unsigned int table)
+{
+ const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info;
+ struct udp_tunnel_nic *utn;
+ unsigned int j;
+ size_t size;
+
+ utn = dev->udp_tunnel_nic;
+ if (!utn)
+ return 0;
+
+ size = 0;
+ for (j = 0; j < info->tables[table].n_entries; j++) {
+ if (!udp_tunnel_nic_entry_is_present(&utn->entries[table][j]))
+ continue;
+
+ size += nla_total_size(0) + /* _TABLE_ENTRY */
+ nla_total_size(sizeof(__be16)) + /* _ENTRY_PORT */
+ nla_total_size(sizeof(u32)); /* _ENTRY_TYPE */
+ }
+
+ return size;
+}
+
+static int
+__udp_tunnel_nic_dump_write(struct net_device *dev, unsigned int table,
+ struct sk_buff *skb)
+{
+ const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info;
+ struct udp_tunnel_nic *utn;
+ struct nlattr *nest;
+ unsigned int j;
+
+ utn = dev->udp_tunnel_nic;
+ if (!utn)
+ return 0;
+
+ for (j = 0; j < info->tables[table].n_entries; j++) {
+ if (!udp_tunnel_nic_entry_is_present(&utn->entries[table][j]))
+ continue;
+
+ nest = nla_nest_start(skb, ETHTOOL_A_TUNNEL_UDP_TABLE_ENTRY);
+ if (!nest)
+ return -EMSGSIZE;
+
+ if (nla_put_be16(skb, ETHTOOL_A_TUNNEL_UDP_ENTRY_PORT,
+ utn->entries[table][j].port) ||
+ nla_put_u32(skb, ETHTOOL_A_TUNNEL_UDP_ENTRY_TYPE,
+ ilog2(utn->entries[table][j].type)))
+ goto err_cancel;
+
+ nla_nest_end(skb, nest);
+ }
+
+ return 0;
+
+err_cancel:
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+}
+
+static void __udp_tunnel_nic_assert_locked(struct net_device *dev)
+{
+ struct udp_tunnel_nic *utn;
+
+ utn = dev->udp_tunnel_nic;
+ if (utn)
+ lockdep_assert_held(&utn->lock);
+}
+
+static void __udp_tunnel_nic_lock(struct net_device *dev)
+{
+ struct udp_tunnel_nic *utn;
+
+ utn = dev->udp_tunnel_nic;
+ if (utn)
+ mutex_lock(&utn->lock);
+}
+
+static void __udp_tunnel_nic_unlock(struct net_device *dev)
+{
+ struct udp_tunnel_nic *utn;
+
+ utn = dev->udp_tunnel_nic;
+ if (utn)
+ mutex_unlock(&utn->lock);
+}
+
+static const struct udp_tunnel_nic_ops __udp_tunnel_nic_ops = {
+ .get_port = __udp_tunnel_nic_get_port,
+ .set_port_priv = __udp_tunnel_nic_set_port_priv,
+ .add_port = __udp_tunnel_nic_add_port,
+ .del_port = __udp_tunnel_nic_del_port,
+ .reset_ntf = __udp_tunnel_nic_reset_ntf,
+ .dump_size = __udp_tunnel_nic_dump_size,
+ .dump_write = __udp_tunnel_nic_dump_write,
+ .assert_locked = __udp_tunnel_nic_assert_locked,
+ .lock = __udp_tunnel_nic_lock,
+ .unlock = __udp_tunnel_nic_unlock,
+};
+
+static void
+udp_tunnel_nic_flush(struct net_device *dev, struct udp_tunnel_nic *utn)
+{
+ const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info;
+ unsigned int i, j;
+
+ for (i = 0; i < utn->n_tables; i++)
+ for (j = 0; j < info->tables[i].n_entries; j++) {
+ int adj_cnt = -utn->entries[i][j].use_cnt;
+
+ if (adj_cnt)
+ udp_tunnel_nic_entry_adj(utn, i, j, adj_cnt);
+ }
+
+ __udp_tunnel_nic_device_sync(dev, utn);
+
+ for (i = 0; i < utn->n_tables; i++)
+ memset(utn->entries[i], 0, array_size(info->tables[i].n_entries,
+ sizeof(**utn->entries)));
+ WARN_ON(utn->need_sync);
+ utn->need_replay = 0;
+}
+
+static void
+udp_tunnel_nic_replay(struct net_device *dev, struct udp_tunnel_nic *utn)
+{
+ const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info;
+ struct udp_tunnel_nic_shared_node *node;
+ unsigned int i, j;
+
+ /* Freeze all the ports we are already tracking so that the replay
+ * does not double up the refcount.
+ */
+ for (i = 0; i < utn->n_tables; i++)
+ for (j = 0; j < info->tables[i].n_entries; j++)
+ udp_tunnel_nic_entry_freeze_used(&utn->entries[i][j]);
+ utn->missed = 0;
+ utn->need_replay = 0;
+
+ if (!info->shared) {
+ udp_tunnel_get_rx_info(dev);
+ } else {
+ list_for_each_entry(node, &info->shared->devices, list)
+ udp_tunnel_get_rx_info(node->dev);
+ }
+
+ for (i = 0; i < utn->n_tables; i++)
+ for (j = 0; j < info->tables[i].n_entries; j++)
+ udp_tunnel_nic_entry_unfreeze(&utn->entries[i][j]);
+}
+
+static void udp_tunnel_nic_device_sync_work(struct work_struct *work)
+{
+ struct udp_tunnel_nic *utn =
+ container_of(work, struct udp_tunnel_nic, work);
+
+ rtnl_lock();
+ mutex_lock(&utn->lock);
+
+ utn->work_pending = 0;
+ __udp_tunnel_nic_device_sync(utn->dev, utn);
+
+ if (utn->need_replay)
+ udp_tunnel_nic_replay(utn->dev, utn);
+
+ mutex_unlock(&utn->lock);
+ rtnl_unlock();
+}
+
+static struct udp_tunnel_nic *
+udp_tunnel_nic_alloc(const struct udp_tunnel_nic_info *info,
+ unsigned int n_tables)
+{
+ struct udp_tunnel_nic *utn;
+ unsigned int i;
+
+ utn = kzalloc(struct_size(utn, entries, n_tables), GFP_KERNEL);
+ if (!utn)
+ return NULL;
+ utn->n_tables = n_tables;
+ INIT_WORK(&utn->work, udp_tunnel_nic_device_sync_work);
+ mutex_init(&utn->lock);
+
+ for (i = 0; i < n_tables; i++) {
+ utn->entries[i] = kcalloc(info->tables[i].n_entries,
+ sizeof(*utn->entries[i]), GFP_KERNEL);
+ if (!utn->entries[i])
+ goto err_free_prev_entries;
+ }
+
+ return utn;
+
+err_free_prev_entries:
+ while (i--)
+ kfree(utn->entries[i]);
+ kfree(utn);
+ return NULL;
+}
+
+static void udp_tunnel_nic_free(struct udp_tunnel_nic *utn)
+{
+ unsigned int i;
+
+ for (i = 0; i < utn->n_tables; i++)
+ kfree(utn->entries[i]);
+ kfree(utn);
+}
+
+static int udp_tunnel_nic_register(struct net_device *dev)
+{
+ const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info;
+ struct udp_tunnel_nic_shared_node *node = NULL;
+ struct udp_tunnel_nic *utn;
+ unsigned int n_tables, i;
+
+ BUILD_BUG_ON(sizeof(utn->missed) * BITS_PER_BYTE <
+ UDP_TUNNEL_NIC_MAX_TABLES);
+ /* Expect use count of at most 2 (IPv4, IPv6) per device */
+ BUILD_BUG_ON(UDP_TUNNEL_NIC_USE_CNT_MAX <
+ UDP_TUNNEL_NIC_MAX_SHARING_DEVICES * 2);
+
+ /* Check that the driver info is sane */
+ if (WARN_ON(!info->set_port != !info->unset_port) ||
+ WARN_ON(!info->set_port == !info->sync_table) ||
+ WARN_ON(!info->tables[0].n_entries))
+ return -EINVAL;
+
+ if (WARN_ON(info->shared &&
+ info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY))
+ return -EINVAL;
+
+ n_tables = 1;
+ for (i = 1; i < UDP_TUNNEL_NIC_MAX_TABLES; i++) {
+ if (!info->tables[i].n_entries)
+ continue;
+
+ n_tables++;
+ if (WARN_ON(!info->tables[i - 1].n_entries))
+ return -EINVAL;
+ }
+
+ /* Create UDP tunnel state structures */
+ if (info->shared) {
+ node = kzalloc(sizeof(*node), GFP_KERNEL);
+ if (!node)
+ return -ENOMEM;
+
+ node->dev = dev;
+ }
+
+ if (info->shared && info->shared->udp_tunnel_nic_info) {
+ utn = info->shared->udp_tunnel_nic_info;
+ } else {
+ utn = udp_tunnel_nic_alloc(info, n_tables);
+ if (!utn) {
+ kfree(node);
+ return -ENOMEM;
+ }
+ }
+
+ if (info->shared) {
+ if (!info->shared->udp_tunnel_nic_info) {
+ INIT_LIST_HEAD(&info->shared->devices);
+ info->shared->udp_tunnel_nic_info = utn;
+ }
+
+ list_add_tail(&node->list, &info->shared->devices);
+ }
+
+ utn->dev = dev;
+ dev_hold(dev);
+ dev->udp_tunnel_nic = utn;
+
+ if (!(info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY)) {
+ udp_tunnel_nic_lock(dev);
+ udp_tunnel_get_rx_info(dev);
+ udp_tunnel_nic_unlock(dev);
+ }
+
+ return 0;
+}
+
+static void
+udp_tunnel_nic_unregister(struct net_device *dev, struct udp_tunnel_nic *utn)
+{
+ const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info;
+
+ udp_tunnel_nic_lock(dev);
+
+ /* For a shared table remove this dev from the list of sharing devices
+ * and if there are other devices just detach.
+ */
+ if (info->shared) {
+ struct udp_tunnel_nic_shared_node *node, *first;
+
+ list_for_each_entry(node, &info->shared->devices, list)
+ if (node->dev == dev)
+ break;
+ if (list_entry_is_head(node, &info->shared->devices, list)) {
+ udp_tunnel_nic_unlock(dev);
+ return;
+ }
+
+ list_del(&node->list);
+ kfree(node);
+
+ first = list_first_entry_or_null(&info->shared->devices,
+ typeof(*first), list);
+ if (first) {
+ udp_tunnel_drop_rx_info(dev);
+ utn->dev = first->dev;
+ udp_tunnel_nic_unlock(dev);
+ goto release_dev;
+ }
+
+ info->shared->udp_tunnel_nic_info = NULL;
+ }
+
+ /* Flush before we check work, so we don't waste time adding entries
+ * from the work which we will boot immediately.
+ */
+ udp_tunnel_nic_flush(dev, utn);
+ udp_tunnel_nic_unlock(dev);
+
+ /* Wait for the work to be done using the state, netdev core will
+ * retry unregister until we give up our reference on this device.
+ */
+ if (utn->work_pending)
+ return;
+
+ udp_tunnel_nic_free(utn);
+release_dev:
+ dev->udp_tunnel_nic = NULL;
+ dev_put(dev);
+}
+
+static int
+udp_tunnel_nic_netdevice_event(struct notifier_block *unused,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ const struct udp_tunnel_nic_info *info;
+ struct udp_tunnel_nic *utn;
+
+ info = dev->udp_tunnel_nic_info;
+ if (!info)
+ return NOTIFY_DONE;
+
+ if (event == NETDEV_REGISTER) {
+ int err;
+
+ err = udp_tunnel_nic_register(dev);
+ if (err)
+ netdev_warn(dev, "failed to register for UDP tunnel offloads: %d", err);
+ return notifier_from_errno(err);
+ }
+ /* All other events will need the udp_tunnel_nic state */
+ utn = dev->udp_tunnel_nic;
+ if (!utn)
+ return NOTIFY_DONE;
+
+ if (event == NETDEV_UNREGISTER) {
+ udp_tunnel_nic_unregister(dev, utn);
+ return NOTIFY_OK;
+ }
+
+ /* All other events only matter if NIC has to be programmed open */
+ if (!(info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY))
+ return NOTIFY_DONE;
+
+ if (event == NETDEV_UP) {
+ udp_tunnel_nic_lock(dev);
+ WARN_ON(!udp_tunnel_nic_is_empty(dev, utn));
+ udp_tunnel_get_rx_info(dev);
+ udp_tunnel_nic_unlock(dev);
+ return NOTIFY_OK;
+ }
+ if (event == NETDEV_GOING_DOWN) {
+ udp_tunnel_nic_lock(dev);
+ udp_tunnel_nic_flush(dev, utn);
+ udp_tunnel_nic_unlock(dev);
+ return NOTIFY_OK;
+ }
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block udp_tunnel_nic_notifier_block __read_mostly = {
+ .notifier_call = udp_tunnel_nic_netdevice_event,
+};
+
+static int __init udp_tunnel_nic_init_module(void)
+{
+ int err;
+
+ udp_tunnel_nic_workqueue = alloc_ordered_workqueue("udp_tunnel_nic", 0);
+ if (!udp_tunnel_nic_workqueue)
+ return -ENOMEM;
+
+ rtnl_lock();
+ udp_tunnel_nic_ops = &__udp_tunnel_nic_ops;
+ rtnl_unlock();
+
+ err = register_netdevice_notifier(&udp_tunnel_nic_notifier_block);
+ if (err)
+ goto err_unset_ops;
+
+ return 0;
+
+err_unset_ops:
+ rtnl_lock();
+ udp_tunnel_nic_ops = NULL;
+ rtnl_unlock();
+ destroy_workqueue(udp_tunnel_nic_workqueue);
+ return err;
+}
+late_initcall(udp_tunnel_nic_init_module);
+
+static void __exit udp_tunnel_nic_cleanup_module(void)
+{
+ unregister_netdevice_notifier(&udp_tunnel_nic_notifier_block);
+
+ rtnl_lock();
+ udp_tunnel_nic_ops = NULL;
+ rtnl_unlock();
+
+ destroy_workqueue(udp_tunnel_nic_workqueue);
+}
+module_exit(udp_tunnel_nic_cleanup_module);
+
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/udp_tunnel_stub.c b/net/ipv4/udp_tunnel_stub.c
new file mode 100644
index 000000000000..c4b2888f5fef
--- /dev/null
+++ b/net/ipv4/udp_tunnel_stub.c
@@ -0,0 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright (c) 2020 Facebook Inc.
+
+#include <net/udp_tunnel.h>
+
+const struct udp_tunnel_nic_ops *udp_tunnel_nic_ops;
+EXPORT_SYMBOL_GPL(udp_tunnel_nic_ops);
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
index 561de6d8c734..d3e621a11a1a 100644
--- a/net/ipv4/udplite.c
+++ b/net/ipv4/udplite.c
@@ -1,45 +1,42 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* UDPLITE An implementation of the UDP-Lite protocol (RFC 3828).
*
- * Version: $Id: udplite.c,v 1.25 2006/10/19 07:22:36 gerrit Exp $
- *
* Authors: Gerrit Renker <gerrit@erg.abdn.ac.uk>
*
* Changes:
* Fixes:
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
-#include "udp_impl.h"
-DEFINE_SNMP_STAT(struct udp_mib, udplite_statistics) __read_mostly;
-struct hlist_head udplite_hash[UDP_HTABLE_SIZE];
-static int udplite_port_rover;
+#define pr_fmt(fmt) "UDPLite: " fmt
-__inline__ int udplite_get_port(struct sock *sk, unsigned short p,
- int (*c)(const struct sock *, const struct sock *))
-{
- return __udp_lib_get_port(sk, p, udplite_hash, &udplite_port_rover, c);
-}
+#include <linux/export.h>
+#include <linux/proc_fs.h>
+#include "udp_impl.h"
+
+struct udp_table udplite_table __read_mostly;
+EXPORT_SYMBOL(udplite_table);
-static __inline__ int udplite_v4_get_port(struct sock *sk, unsigned short snum)
+/* Designate sk as UDP-Lite socket */
+static int udplite_sk_init(struct sock *sk)
{
- return udplite_get_port(sk, snum, ipv4_rcv_saddr_equal);
+ udp_init_sock(sk);
+ pr_warn_once("UDP-Lite is deprecated and scheduled to be removed in 2025, "
+ "please contact the netdev mailing list\n");
+ return 0;
}
-__inline__ int udplite_rcv(struct sk_buff *skb)
+static int udplite_rcv(struct sk_buff *skb)
{
- return __udp4_lib_rcv(skb, udplite_hash, 1);
+ return __udp4_lib_rcv(skb, &udplite_table, IPPROTO_UDPLITE);
}
-__inline__ void udplite_err(struct sk_buff *skb, u32 info)
+static int udplite_err(struct sk_buff *skb, u32 info)
{
- return __udp4_lib_err(skb, info, udplite_hash);
+ return __udp4_lib_err(skb, info, &udplite_table);
}
-static struct net_protocol udplite_protocol = {
+static const struct net_protocol udplite_protocol = {
.handler = udplite_rcv,
.err_handler = udplite_err,
.no_policy = 1,
@@ -58,42 +55,68 @@ struct proto udplite_prot = {
.getsockopt = udp_getsockopt,
.sendmsg = udp_sendmsg,
.recvmsg = udp_recvmsg,
- .sendpage = udp_sendpage,
- .backlog_rcv = udp_queue_rcv_skb,
.hash = udp_lib_hash,
.unhash = udp_lib_unhash,
- .get_port = udplite_v4_get_port,
+ .rehash = udp_v4_rehash,
+ .get_port = udp_v4_get_port,
+
+ .memory_allocated = &net_aligned_data.udp_memory_allocated,
+ .per_cpu_fw_alloc = &udp_memory_per_cpu_fw_alloc,
+
+ .sysctl_mem = sysctl_udp_mem,
+ .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min),
+ .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_udp_rmem_min),
.obj_size = sizeof(struct udp_sock),
-#ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_udp_setsockopt,
- .compat_getsockopt = compat_udp_getsockopt,
-#endif
+ .h.udp_table = &udplite_table,
};
+EXPORT_SYMBOL(udplite_prot);
static struct inet_protosw udplite4_protosw = {
.type = SOCK_DGRAM,
.protocol = IPPROTO_UDPLITE,
.prot = &udplite_prot,
.ops = &inet_dgram_ops,
- .capability = -1,
- .no_check = 0, /* must checksum (RFC 3828) */
.flags = INET_PROTOSW_PERMANENT,
};
#ifdef CONFIG_PROC_FS
-static struct file_operations udplite4_seq_fops;
static struct udp_seq_afinfo udplite4_seq_afinfo = {
- .owner = THIS_MODULE,
- .name = "udplite",
.family = AF_INET,
- .hashtable = udplite_hash,
- .seq_show = udp4_seq_show,
- .seq_fops = &udplite4_seq_fops,
+ .udp_table = &udplite_table,
};
+
+static int __net_init udplite4_proc_init_net(struct net *net)
+{
+ if (!proc_create_net_data("udplite", 0444, net->proc_net, &udp_seq_ops,
+ sizeof(struct udp_iter_state), &udplite4_seq_afinfo))
+ return -ENOMEM;
+ return 0;
+}
+
+static void __net_exit udplite4_proc_exit_net(struct net *net)
+{
+ remove_proc_entry("udplite", net->proc_net);
+}
+
+static struct pernet_operations udplite4_net_ops = {
+ .init = udplite4_proc_init_net,
+ .exit = udplite4_proc_exit_net,
+};
+
+static __init int udplite4_proc_init(void)
+{
+ return register_pernet_subsys(&udplite4_net_ops);
+}
+#else
+static inline int udplite4_proc_init(void)
+{
+ return 0;
+}
#endif
void __init udplite4_register(void)
{
+ udp_table_init(&udplite_table, "UDP-Lite");
if (proto_register(&udplite_prot, 1))
goto out_register_err;
@@ -102,18 +125,12 @@ void __init udplite4_register(void)
inet_register_protosw(&udplite4_protosw);
-#ifdef CONFIG_PROC_FS
- if (udp_proc_register(&udplite4_seq_afinfo)) /* udplite4_proc_init() */
- printk(KERN_ERR "%s: Cannot register /proc!\n", __FUNCTION__);
-#endif
+ if (udplite4_proc_init())
+ pr_err("%s: Cannot register /proc!\n", __func__);
return;
out_unregister_proto:
proto_unregister(&udplite_prot);
out_register_err:
- printk(KERN_CRIT "%s: Cannot add UDP-Lite protocol.\n", __FUNCTION__);
+ pr_crit("%s: Cannot add UDP-Lite protocol\n", __func__);
}
-
-EXPORT_SYMBOL(udplite_hash);
-EXPORT_SYMBOL(udplite_prot);
-EXPORT_SYMBOL(udplite_get_port);
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index 8655d038364c..f28cfd88eaf5 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* xfrm4_input.c
*
@@ -6,163 +7,222 @@
* Split up af-specific portion
* Derek Atkins <derek@ihtfp.com>
* Add Encapsulation support
- *
+ *
*/
+#include <linux/slab.h>
#include <linux/module.h>
#include <linux/string.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <net/ip.h>
#include <net/xfrm.h>
+#include <net/protocol.h>
+#include <net/gro.h>
-int xfrm4_rcv(struct sk_buff *skb)
-{
- return xfrm4_rcv_encap(skb, 0);
-}
-
-EXPORT_SYMBOL(xfrm4_rcv);
-
-static int xfrm4_parse_spi(struct sk_buff *skb, u8 nexthdr, __be32 *spi, __be32 *seq)
+static int xfrm4_rcv_encap_finish2(struct net *net, struct sock *sk,
+ struct sk_buff *skb)
{
- switch (nexthdr) {
- case IPPROTO_IPIP:
- *spi = skb->nh.iph->saddr;
- *seq = 0;
- return 0;
- }
-
- return xfrm_parse_spi(skb, nexthdr, spi, seq);
+ return dst_input(skb);
}
-#ifdef CONFIG_NETFILTER
-static inline int xfrm4_rcv_encap_finish(struct sk_buff *skb)
+static inline int xfrm4_rcv_encap_finish(struct net *net, struct sock *sk,
+ struct sk_buff *skb)
{
- struct iphdr *iph = skb->nh.iph;
+ if (!skb_dst(skb)) {
+ const struct iphdr *iph = ip_hdr(skb);
- if (skb->dst == NULL) {
- if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos,
- skb->dev))
+ if (ip_route_input_noref(skb, iph->daddr, iph->saddr,
+ ip4h_dscp(iph), skb->dev))
goto drop;
}
- return dst_input(skb);
+
+ if (xfrm_trans_queue(skb, xfrm4_rcv_encap_finish2))
+ goto drop;
+
+ return 0;
drop:
kfree_skb(skb);
return NET_RX_DROP;
}
-#endif
-int xfrm4_rcv_encap(struct sk_buff *skb, __u16 encap_type)
+int xfrm4_transport_finish(struct sk_buff *skb, int async)
{
- int err;
- __be32 spi, seq;
- struct xfrm_state *xfrm_vec[XFRM_MAX_DEPTH];
- struct xfrm_state *x;
- int xfrm_nr = 0;
- int decaps = 0;
-
- if ((err = xfrm4_parse_spi(skb, skb->nh.iph->protocol, &spi, &seq)) != 0)
- goto drop;
+ struct xfrm_offload *xo = xfrm_offload(skb);
+ struct iphdr *iph = ip_hdr(skb);
- do {
- struct iphdr *iph = skb->nh.iph;
+ iph->protocol = XFRM_MODE_SKB_CB(skb)->protocol;
- if (xfrm_nr == XFRM_MAX_DEPTH)
- goto drop;
-
- x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, spi, iph->protocol, AF_INET);
- if (x == NULL)
- goto drop;
+#ifndef CONFIG_NETFILTER
+ if (!async)
+ return -iph->protocol;
+#endif
- spin_lock(&x->lock);
- if (unlikely(x->km.state != XFRM_STATE_VALID))
- goto drop_unlock;
+ __skb_push(skb, -skb_network_offset(skb));
+ iph->tot_len = htons(skb->len);
+ ip_send_check(iph);
+
+ if (xo && (xo->flags & XFRM_GRO)) {
+ /* The full l2 header needs to be preserved so that re-injecting the packet at l2
+ * works correctly in the presence of vlan tags.
+ */
+ skb_mac_header_rebuild_full(skb, xo->orig_mac_len);
+ skb_reset_network_header(skb);
+ skb_reset_transport_header(skb);
+ return 0;
+ }
- if ((x->encap ? x->encap->encap_type : 0) != encap_type)
- goto drop_unlock;
+ NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING,
+ dev_net(skb->dev), NULL, skb, skb->dev, NULL,
+ xfrm4_rcv_encap_finish);
+ return 0;
+}
- if (x->props.replay_window && xfrm_replay_check(x, seq))
- goto drop_unlock;
+static int __xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb, bool pull)
+{
+ struct udp_sock *up = udp_sk(sk);
+ struct udphdr *uh;
+ struct iphdr *iph;
+ int iphlen, len;
+ __u8 *udpdata;
+ __be32 *udpdata32;
+ u16 encap_type;
+
+ encap_type = READ_ONCE(up->encap_type);
+ /* if this is not encapsulated socket, then just return now */
+ if (!encap_type)
+ return 1;
+
+ /* If this is a paged skb, make sure we pull up
+ * whatever data we need to look at. */
+ len = skb->len - sizeof(struct udphdr);
+ if (!pskb_may_pull(skb, sizeof(struct udphdr) + min(len, 8)))
+ return 1;
+
+ /* Now we can get the pointers */
+ uh = udp_hdr(skb);
+ udpdata = (__u8 *)uh + sizeof(struct udphdr);
+ udpdata32 = (__be32 *)udpdata;
+
+ switch (encap_type) {
+ default:
+ case UDP_ENCAP_ESPINUDP:
+ /* Check if this is a keepalive packet. If so, eat it. */
+ if (len == 1 && udpdata[0] == 0xff) {
+ return -EINVAL;
+ } else if (len > sizeof(struct ip_esp_hdr) && udpdata32[0] != 0) {
+ /* ESP Packet without Non-ESP header */
+ len = sizeof(struct udphdr);
+ } else
+ /* Must be an IKE packet.. pass it through */
+ return 1;
+ break;
+ }
- if (xfrm_state_check_expire(x))
- goto drop_unlock;
+ /* At this point we are sure that this is an ESPinUDP packet,
+ * so we need to remove 'len' bytes from the packet (the UDP
+ * header and optional ESP marker bytes) and then modify the
+ * protocol to ESP, and then call into the transform receiver.
+ */
+ if (skb_unclone(skb, GFP_ATOMIC))
+ return -EINVAL;
+
+ /* Now we can update and verify the packet length... */
+ iph = ip_hdr(skb);
+ iphlen = iph->ihl << 2;
+ iph->tot_len = htons(ntohs(iph->tot_len) - len);
+ if (skb->len < iphlen + len) {
+ /* packet is too small!?! */
+ return -EINVAL;
+ }
- if (x->type->input(x, skb))
- goto drop_unlock;
+ /* pull the data buffer up to the ESP header and set the
+ * transport header to point to ESP. Keep UDP on the stack
+ * for later.
+ */
+ if (pull) {
+ __skb_pull(skb, len);
+ skb_reset_transport_header(skb);
+ } else {
+ skb_set_transport_header(skb, len);
+ }
- /* only the first xfrm gets the encap type */
- encap_type = 0;
+ /* process ESP */
+ return 0;
+}
- if (x->props.replay_window)
- xfrm_replay_advance(x, seq);
+/* If it's a keepalive packet, then just eat it.
+ * If it's an encapsulated packet, then pass it to the
+ * IPsec xfrm input.
+ * Returns 0 if skb passed to xfrm or was dropped.
+ * Returns >0 if skb should be passed to UDP.
+ * Returns <0 if skb should be resubmitted (-ret is protocol)
+ */
+int xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb)
+{
+ int ret;
- x->curlft.bytes += skb->len;
- x->curlft.packets++;
+ ret = __xfrm4_udp_encap_rcv(sk, skb, true);
+ if (!ret)
+ return xfrm4_rcv_encap(skb, IPPROTO_ESP, 0,
+ udp_sk(sk)->encap_type);
- spin_unlock(&x->lock);
+ if (ret < 0) {
+ kfree_skb(skb);
+ return 0;
+ }
- xfrm_vec[xfrm_nr++] = x;
+ return ret;
+}
+EXPORT_SYMBOL(xfrm4_udp_encap_rcv);
- if (x->mode->input(x, skb))
- goto drop;
+struct sk_buff *xfrm4_gro_udp_encap_rcv(struct sock *sk, struct list_head *head,
+ struct sk_buff *skb)
+{
+ int offset = skb_gro_offset(skb);
+ const struct net_offload *ops;
+ struct sk_buff *pp = NULL;
+ int len, dlen;
+ __u8 *udpdata;
+ __be32 *udpdata32;
- if (x->props.mode == XFRM_MODE_TUNNEL) {
- decaps = 1;
- break;
- }
+ len = skb->len - offset;
+ dlen = offset + min(len, 8);
+ udpdata = skb_gro_header(skb, dlen, offset);
+ udpdata32 = (__be32 *)udpdata;
+ if (unlikely(!udpdata))
+ return NULL;
- if ((err = xfrm_parse_spi(skb, skb->nh.iph->protocol, &spi, &seq)) < 0)
- goto drop;
- } while (!err);
+ rcu_read_lock();
+ ops = rcu_dereference(inet_offloads[IPPROTO_ESP]);
+ if (!ops || !ops->callbacks.gro_receive)
+ goto out;
- /* Allocate new secpath or COW existing one. */
+ /* check if it is a keepalive or IKE packet */
+ if (len <= sizeof(struct ip_esp_hdr) || udpdata32[0] == 0)
+ goto out;
- if (!skb->sp || atomic_read(&skb->sp->refcnt) != 1) {
- struct sec_path *sp;
- sp = secpath_dup(skb->sp);
- if (!sp)
- goto drop;
- if (skb->sp)
- secpath_put(skb->sp);
- skb->sp = sp;
- }
- if (xfrm_nr + skb->sp->len > XFRM_MAX_DEPTH)
- goto drop;
+ /* set the transport header to ESP */
+ skb_set_transport_header(skb, offset);
- memcpy(skb->sp->xvec + skb->sp->len, xfrm_vec,
- xfrm_nr * sizeof(xfrm_vec[0]));
- skb->sp->len += xfrm_nr;
+ NAPI_GRO_CB(skb)->proto = IPPROTO_UDP;
- nf_reset(skb);
+ pp = call_gro_receive(ops->callbacks.gro_receive, head, skb);
+ rcu_read_unlock();
- if (decaps) {
- if (!(skb->dev->flags&IFF_LOOPBACK)) {
- dst_release(skb->dst);
- skb->dst = NULL;
- }
- netif_rx(skb);
- return 0;
- } else {
-#ifdef CONFIG_NETFILTER
- __skb_push(skb, skb->data - skb->nh.raw);
- skb->nh.iph->tot_len = htons(skb->len);
- ip_send_check(skb->nh.iph);
+ return pp;
- NF_HOOK(PF_INET, NF_IP_PRE_ROUTING, skb, skb->dev, NULL,
- xfrm4_rcv_encap_finish);
- return 0;
-#else
- return -skb->nh.iph->protocol;
-#endif
- }
+out:
+ rcu_read_unlock();
+ NAPI_GRO_CB(skb)->same_flow = 0;
+ NAPI_GRO_CB(skb)->flush = 1;
-drop_unlock:
- spin_unlock(&x->lock);
- xfrm_state_put(x);
-drop:
- while (--xfrm_nr >= 0)
- xfrm_state_put(xfrm_vec[xfrm_nr]);
+ return NULL;
+}
+EXPORT_SYMBOL(xfrm4_gro_udp_encap_rcv);
- kfree_skb(skb);
- return 0;
+int xfrm4_rcv(struct sk_buff *skb)
+{
+ return xfrm4_rcv_spi(skb, ip_hdr(skb)->protocol, 0);
}
+EXPORT_SYMBOL(xfrm4_rcv);
diff --git a/net/ipv4/xfrm4_mode_beet.c b/net/ipv4/xfrm4_mode_beet.c
deleted file mode 100644
index 89cf59ea7bbe..000000000000
--- a/net/ipv4/xfrm4_mode_beet.c
+++ /dev/null
@@ -1,139 +0,0 @@
-/*
- * xfrm4_mode_beet.c - BEET mode encapsulation for IPv4.
- *
- * Copyright (c) 2006 Diego Beltrami <diego.beltrami@gmail.com>
- * Miika Komu <miika@iki.fi>
- * Herbert Xu <herbert@gondor.apana.org.au>
- * Abhinav Pathak <abhinav.pathak@hiit.fi>
- * Jeff Ahrenholz <ahrenholz@gmail.com>
- */
-
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/stringify.h>
-#include <net/dst.h>
-#include <net/ip.h>
-#include <net/xfrm.h>
-
-/* Add encapsulation header.
- *
- * The top IP header will be constructed per draft-nikander-esp-beet-mode-06.txt.
- * The following fields in it shall be filled in by x->type->output:
- * tot_len
- * check
- *
- * On exit, skb->h will be set to the start of the payload to be processed
- * by x->type->output and skb->nh will be set to the top IP header.
- */
-static int xfrm4_beet_output(struct xfrm_state *x, struct sk_buff *skb)
-{
- struct iphdr *iph, *top_iph = NULL;
- int hdrlen, optlen;
-
- iph = skb->nh.iph;
- skb->h.ipiph = iph;
-
- hdrlen = 0;
- optlen = iph->ihl * 4 - sizeof(*iph);
- if (unlikely(optlen))
- hdrlen += IPV4_BEET_PHMAXLEN - (optlen & 4);
-
- skb->nh.raw = skb_push(skb, x->props.header_len + hdrlen);
- top_iph = skb->nh.iph;
- hdrlen = iph->ihl * 4 - optlen;
- skb->h.raw += hdrlen;
-
- memmove(top_iph, iph, hdrlen);
- if (unlikely(optlen)) {
- struct ip_beet_phdr *ph;
-
- BUG_ON(optlen < 0);
-
- ph = (struct ip_beet_phdr *)skb->h.raw;
- ph->padlen = 4 - (optlen & 4);
- ph->hdrlen = (optlen + ph->padlen + sizeof(*ph)) / 8;
- ph->nexthdr = top_iph->protocol;
-
- top_iph->protocol = IPPROTO_BEETPH;
- top_iph->ihl = sizeof(struct iphdr) / 4;
- }
-
- top_iph->saddr = x->props.saddr.a4;
- top_iph->daddr = x->id.daddr.a4;
-
- return 0;
-}
-
-static int xfrm4_beet_input(struct xfrm_state *x, struct sk_buff *skb)
-{
- struct iphdr *iph = skb->nh.iph;
- int phlen = 0;
- int optlen = 0;
- __u8 ph_nexthdr = 0, protocol = 0;
- int err = -EINVAL;
-
- protocol = iph->protocol;
-
- if (unlikely(iph->protocol == IPPROTO_BEETPH)) {
- struct ip_beet_phdr *ph = (struct ip_beet_phdr*)(iph + 1);
-
- if (!pskb_may_pull(skb, sizeof(*ph)))
- goto out;
-
- phlen = ph->hdrlen * 8;
- optlen = phlen - ph->padlen - sizeof(*ph);
- if (optlen < 0 || optlen & 3 || optlen > 250)
- goto out;
-
- if (!pskb_may_pull(skb, phlen))
- goto out;
-
- ph_nexthdr = ph->nexthdr;
- }
-
- skb_push(skb, sizeof(*iph) - phlen + optlen);
- memmove(skb->data, skb->nh.raw, sizeof(*iph));
- skb->nh.raw = skb->data;
-
- iph = skb->nh.iph;
- iph->ihl = (sizeof(*iph) + optlen) / 4;
- iph->tot_len = htons(skb->len);
- iph->daddr = x->sel.daddr.a4;
- iph->saddr = x->sel.saddr.a4;
- if (ph_nexthdr)
- iph->protocol = ph_nexthdr;
- else
- iph->protocol = protocol;
- iph->check = 0;
- iph->check = ip_fast_csum(skb->nh.raw, iph->ihl);
- err = 0;
-out:
- return err;
-}
-
-static struct xfrm_mode xfrm4_beet_mode = {
- .input = xfrm4_beet_input,
- .output = xfrm4_beet_output,
- .owner = THIS_MODULE,
- .encap = XFRM_MODE_BEET,
-};
-
-static int __init xfrm4_beet_init(void)
-{
- return xfrm_register_mode(&xfrm4_beet_mode, AF_INET);
-}
-
-static void __exit xfrm4_beet_exit(void)
-{
- int err;
-
- err = xfrm_unregister_mode(&xfrm4_beet_mode, AF_INET);
- BUG_ON(err);
-}
-
-module_init(xfrm4_beet_init);
-module_exit(xfrm4_beet_exit);
-MODULE_LICENSE("GPL");
-MODULE_ALIAS_XFRM_MODE(AF_INET, XFRM_MODE_BEET);
diff --git a/net/ipv4/xfrm4_mode_transport.c b/net/ipv4/xfrm4_mode_transport.c
deleted file mode 100644
index 92676b7e4034..000000000000
--- a/net/ipv4/xfrm4_mode_transport.c
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * xfrm4_mode_transport.c - Transport mode encapsulation for IPv4.
- *
- * Copyright (c) 2004-2006 Herbert Xu <herbert@gondor.apana.org.au>
- */
-
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/stringify.h>
-#include <net/dst.h>
-#include <net/ip.h>
-#include <net/xfrm.h>
-
-/* Add encapsulation header.
- *
- * The IP header will be moved forward to make space for the encapsulation
- * header.
- *
- * On exit, skb->h will be set to the start of the payload to be processed
- * by x->type->output and skb->nh will be set to the top IP header.
- */
-static int xfrm4_transport_output(struct xfrm_state *x, struct sk_buff *skb)
-{
- struct iphdr *iph;
- int ihl;
-
- iph = skb->nh.iph;
- skb->h.ipiph = iph;
-
- ihl = iph->ihl * 4;
- skb->h.raw += ihl;
-
- skb->nh.raw = memmove(skb_push(skb, x->props.header_len), iph, ihl);
- return 0;
-}
-
-/* Remove encapsulation header.
- *
- * The IP header will be moved over the top of the encapsulation header.
- *
- * On entry, skb->h shall point to where the IP header should be and skb->nh
- * shall be set to where the IP header currently is. skb->data shall point
- * to the start of the payload.
- */
-static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb)
-{
- int ihl = skb->data - skb->h.raw;
-
- if (skb->h.raw != skb->nh.raw)
- skb->nh.raw = memmove(skb->h.raw, skb->nh.raw, ihl);
- skb->nh.iph->tot_len = htons(skb->len + ihl);
- skb->h.raw = skb->data;
- return 0;
-}
-
-static struct xfrm_mode xfrm4_transport_mode = {
- .input = xfrm4_transport_input,
- .output = xfrm4_transport_output,
- .owner = THIS_MODULE,
- .encap = XFRM_MODE_TRANSPORT,
-};
-
-static int __init xfrm4_transport_init(void)
-{
- return xfrm_register_mode(&xfrm4_transport_mode, AF_INET);
-}
-
-static void __exit xfrm4_transport_exit(void)
-{
- int err;
-
- err = xfrm_unregister_mode(&xfrm4_transport_mode, AF_INET);
- BUG_ON(err);
-}
-
-module_init(xfrm4_transport_init);
-module_exit(xfrm4_transport_exit);
-MODULE_LICENSE("GPL");
-MODULE_ALIAS_XFRM_MODE(AF_INET, XFRM_MODE_TRANSPORT);
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
deleted file mode 100644
index e23c21d31a53..000000000000
--- a/net/ipv4/xfrm4_mode_tunnel.c
+++ /dev/null
@@ -1,123 +0,0 @@
-/*
- * xfrm4_mode_tunnel.c - Tunnel mode encapsulation for IPv4.
- *
- * Copyright (c) 2004-2006 Herbert Xu <herbert@gondor.apana.org.au>
- */
-
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/stringify.h>
-#include <net/dst.h>
-#include <net/inet_ecn.h>
-#include <net/ip.h>
-#include <net/xfrm.h>
-
-static inline void ipip_ecn_decapsulate(struct sk_buff *skb)
-{
- struct iphdr *outer_iph = skb->nh.iph;
- struct iphdr *inner_iph = skb->h.ipiph;
-
- if (INET_ECN_is_ce(outer_iph->tos))
- IP_ECN_set_ce(inner_iph);
-}
-
-/* Add encapsulation header.
- *
- * The top IP header will be constructed per RFC 2401. The following fields
- * in it shall be filled in by x->type->output:
- * tot_len
- * check
- *
- * On exit, skb->h will be set to the start of the payload to be processed
- * by x->type->output and skb->nh will be set to the top IP header.
- */
-static int xfrm4_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)
-{
- struct dst_entry *dst = skb->dst;
- struct iphdr *iph, *top_iph;
- int flags;
-
- iph = skb->nh.iph;
- skb->h.ipiph = iph;
-
- skb->nh.raw = skb_push(skb, x->props.header_len);
- top_iph = skb->nh.iph;
-
- top_iph->ihl = 5;
- top_iph->version = 4;
-
- /* DS disclosed */
- top_iph->tos = INET_ECN_encapsulate(iph->tos, iph->tos);
-
- flags = x->props.flags;
- if (flags & XFRM_STATE_NOECN)
- IP_ECN_clear(top_iph);
-
- top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ?
- 0 : (iph->frag_off & htons(IP_DF));
- if (!top_iph->frag_off)
- __ip_select_ident(top_iph, dst->child, 0);
-
- top_iph->ttl = dst_metric(dst->child, RTAX_HOPLIMIT);
-
- top_iph->saddr = x->props.saddr.a4;
- top_iph->daddr = x->id.daddr.a4;
- top_iph->protocol = IPPROTO_IPIP;
-
- memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
- return 0;
-}
-
-static int xfrm4_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)
-{
- struct iphdr *iph = skb->nh.iph;
- int err = -EINVAL;
-
- if (iph->protocol != IPPROTO_IPIP)
- goto out;
- if (!pskb_may_pull(skb, sizeof(struct iphdr)))
- goto out;
-
- if (skb_cloned(skb) &&
- (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
- goto out;
-
- if (x->props.flags & XFRM_STATE_DECAP_DSCP)
- ipv4_copy_dscp(iph, skb->h.ipiph);
- if (!(x->props.flags & XFRM_STATE_NOECN))
- ipip_ecn_decapsulate(skb);
- skb->mac.raw = memmove(skb->data - skb->mac_len,
- skb->mac.raw, skb->mac_len);
- skb->nh.raw = skb->data;
- err = 0;
-
-out:
- return err;
-}
-
-static struct xfrm_mode xfrm4_tunnel_mode = {
- .input = xfrm4_tunnel_input,
- .output = xfrm4_tunnel_output,
- .owner = THIS_MODULE,
- .encap = XFRM_MODE_TUNNEL,
-};
-
-static int __init xfrm4_tunnel_init(void)
-{
- return xfrm_register_mode(&xfrm4_tunnel_mode, AF_INET);
-}
-
-static void __exit xfrm4_tunnel_exit(void)
-{
- int err;
-
- err = xfrm_unregister_mode(&xfrm4_tunnel_mode, AF_INET);
- BUG_ON(err);
-}
-
-module_init(xfrm4_tunnel_init);
-module_exit(xfrm4_tunnel_exit);
-MODULE_LICENSE("GPL");
-MODULE_ALIAS_XFRM_MODE(AF_INET, XFRM_MODE_TUNNEL);
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
index 04403fb01a58..0ae67d537499 100644
--- a/net/ipv4/xfrm4_output.c
+++ b/net/ipv4/xfrm4_output.c
@@ -1,173 +1,46 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* xfrm4_output.c - Common IPsec encapsulation code for IPv4.
* Copyright (c) 2004 Herbert Xu <herbert@gondor.apana.org.au>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
-#include <linux/compiler.h>
#include <linux/if_ether.h>
#include <linux/kernel.h>
+#include <linux/module.h>
#include <linux/skbuff.h>
-#include <linux/spinlock.h>
#include <linux/netfilter_ipv4.h>
+#include <net/dst.h>
#include <net/ip.h>
#include <net/xfrm.h>
#include <net/icmp.h>
-static int xfrm4_tunnel_check_size(struct sk_buff *skb)
+static int __xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- int mtu, ret = 0;
- struct dst_entry *dst;
- struct iphdr *iph = skb->nh.iph;
-
- if (IPCB(skb)->flags & IPSKB_XFRM_TUNNEL_SIZE)
- goto out;
-
- IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE;
-
- if (!(iph->frag_off & htons(IP_DF)) || skb->local_df)
- goto out;
-
- dst = skb->dst;
- mtu = dst_mtu(dst);
- if (skb->len > mtu) {
- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
- ret = -EMSGSIZE;
- }
-out:
- return ret;
-}
-
-static int xfrm4_output_one(struct sk_buff *skb)
-{
- struct dst_entry *dst = skb->dst;
- struct xfrm_state *x = dst->xfrm;
- int err;
-
- if (skb->ip_summed == CHECKSUM_PARTIAL) {
- err = skb_checksum_help(skb);
- if (err)
- goto error_nolock;
- }
+#ifdef CONFIG_NETFILTER
+ struct xfrm_state *x = skb_dst(skb)->xfrm;
- if (x->props.mode == XFRM_MODE_TUNNEL) {
- err = xfrm4_tunnel_check_size(skb);
- if (err)
- goto error_nolock;
+ if (!x) {
+ IPCB(skb)->flags |= IPSKB_REROUTED;
+ return dst_output(net, sk, skb);
}
+#endif
- do {
- spin_lock_bh(&x->lock);
- err = xfrm_state_check(x, skb);
- if (err)
- goto error;
-
- err = x->mode->output(x, skb);
- if (err)
- goto error;
-
- err = x->type->output(x, skb);
- if (err)
- goto error;
-
- x->curlft.bytes += skb->len;
- x->curlft.packets++;
-
- spin_unlock_bh(&x->lock);
-
- if (!(skb->dst = dst_pop(dst))) {
- err = -EHOSTUNREACH;
- goto error_nolock;
- }
- dst = skb->dst;
- x = dst->xfrm;
- } while (x && (x->props.mode != XFRM_MODE_TUNNEL));
-
- IPCB(skb)->flags |= IPSKB_XFRM_TRANSFORMED;
- err = 0;
-
-out_exit:
- return err;
-error:
- spin_unlock_bh(&x->lock);
-error_nolock:
- kfree_skb(skb);
- goto out_exit;
+ return xfrm_output(sk, skb);
}
-static int xfrm4_output_finish2(struct sk_buff *skb)
+int xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- int err;
-
- while (likely((err = xfrm4_output_one(skb)) == 0)) {
- nf_reset(skb);
-
- err = nf_hook(PF_INET, NF_IP_LOCAL_OUT, &skb, NULL,
- skb->dst->dev, dst_output);
- if (unlikely(err != 1))
- break;
-
- if (!skb->dst->xfrm)
- return dst_output(skb);
-
- err = nf_hook(PF_INET, NF_IP_POST_ROUTING, &skb, NULL,
- skb->dst->dev, xfrm4_output_finish2);
- if (unlikely(err != 1))
- break;
- }
-
- return err;
+ return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
+ net, sk, skb, skb->dev, skb_dst_dev(skb),
+ __xfrm4_output,
+ !(IPCB(skb)->flags & IPSKB_REROUTED));
}
-static int xfrm4_output_finish(struct sk_buff *skb)
+void xfrm4_local_error(struct sk_buff *skb, u32 mtu)
{
- struct sk_buff *segs;
-
-#ifdef CONFIG_NETFILTER
- if (!skb->dst->xfrm) {
- IPCB(skb)->flags |= IPSKB_REROUTED;
- return dst_output(skb);
- }
-#endif
-
- if (!skb_is_gso(skb))
- return xfrm4_output_finish2(skb);
+ struct iphdr *hdr;
- skb->protocol = htons(ETH_P_IP);
- segs = skb_gso_segment(skb, 0);
- kfree_skb(skb);
- if (unlikely(IS_ERR(segs)))
- return PTR_ERR(segs);
-
- do {
- struct sk_buff *nskb = segs->next;
- int err;
-
- segs->next = NULL;
- err = xfrm4_output_finish2(segs);
-
- if (unlikely(err)) {
- while ((segs = nskb)) {
- nskb = segs->next;
- segs->next = NULL;
- kfree_skb(segs);
- }
- return err;
- }
-
- segs = nskb;
- } while (segs);
-
- return 0;
-}
-
-int xfrm4_output(struct sk_buff *skb)
-{
- return NF_HOOK_COND(PF_INET, NF_IP_POST_ROUTING, skb, NULL, skb->dst->dev,
- xfrm4_output_finish,
- !(IPCB(skb)->flags & IPSKB_REROUTED));
+ hdr = skb->encapsulation ? inner_ip_hdr(skb) : ip_hdr(skb);
+ ip_local_error(skb->sk, EMSGSIZE, hdr->daddr,
+ inet_sk(skb->sk)->inet_dport, mtu);
}
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index d4107bb701b5..58faf1ddd2b1 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -1,342 +1,245 @@
-/*
+// SPDX-License-Identifier: GPL-2.0
+/*
* xfrm4_policy.c
*
* Changes:
* Kazunori MIYAZAWA @USAGI
* YOSHIFUJI Hideaki @USAGI
* Split up af-specific portion
- *
+ *
*/
-#include <linux/compiler.h>
+#include <linux/err.h>
+#include <linux/kernel.h>
#include <linux/inetdevice.h>
+#include <net/dst.h>
#include <net/xfrm.h>
+#include <net/flow.h>
#include <net/ip.h>
+#include <net/l3mdev.h>
-static struct dst_ops xfrm4_dst_ops;
-static struct xfrm_policy_afinfo xfrm4_policy_afinfo;
-
-static int xfrm4_dst_lookup(struct xfrm_dst **dst, struct flowi *fl)
-{
- return __ip_route_output_key((struct rtable**)dst, fl);
-}
-
-static int xfrm4_get_saddr(xfrm_address_t *saddr, xfrm_address_t *daddr)
+static struct dst_entry *__xfrm4_dst_lookup(struct flowi4 *fl4,
+ const struct xfrm_dst_lookup_params *params)
{
struct rtable *rt;
- struct flowi fl_tunnel = {
- .nl_u = {
- .ip4_u = {
- .daddr = daddr->a4,
- },
- },
- };
-
- if (!xfrm4_dst_lookup((struct xfrm_dst **)&rt, &fl_tunnel)) {
- saddr->a4 = rt->rt_src;
- dst_release(&rt->u.dst);
- return 0;
- }
- return -EHOSTUNREACH;
+
+ memset(fl4, 0, sizeof(*fl4));
+ fl4->daddr = params->daddr->a4;
+ fl4->flowi4_dscp = params->dscp;
+ fl4->flowi4_l3mdev = l3mdev_master_ifindex_by_index(params->net,
+ params->oif);
+ fl4->flowi4_mark = params->mark;
+ if (params->saddr)
+ fl4->saddr = params->saddr->a4;
+ fl4->flowi4_proto = params->ipproto;
+ fl4->uli = params->uli;
+
+ rt = __ip_route_output_key(params->net, fl4);
+ if (!IS_ERR(rt))
+ return &rt->dst;
+
+ return ERR_CAST(rt);
}
-static struct dst_entry *
-__xfrm4_find_bundle(struct flowi *fl, struct xfrm_policy *policy)
+static struct dst_entry *xfrm4_dst_lookup(const struct xfrm_dst_lookup_params *params)
{
- struct dst_entry *dst;
+ struct flowi4 fl4;
- read_lock_bh(&policy->lock);
- for (dst = policy->bundles; dst; dst = dst->next) {
- struct xfrm_dst *xdst = (struct xfrm_dst*)dst;
- if (xdst->u.rt.fl.oif == fl->oif && /*XXX*/
- xdst->u.rt.fl.fl4_dst == fl->fl4_dst &&
- xdst->u.rt.fl.fl4_src == fl->fl4_src &&
- xdst->u.rt.fl.fl4_tos == fl->fl4_tos &&
- xfrm_bundle_ok(policy, xdst, fl, AF_INET, 0)) {
- dst_clone(dst);
- break;
- }
- }
- read_unlock_bh(&policy->lock);
- return dst;
+ return __xfrm4_dst_lookup(&fl4, params);
}
-/* Allocate chain of dst_entry's, attach known xfrm's, calculate
- * all the metrics... Shortly, bundle a bundle.
- */
-
-static int
-__xfrm4_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, int nx,
- struct flowi *fl, struct dst_entry **dst_p)
+static int xfrm4_get_saddr(xfrm_address_t *saddr,
+ const struct xfrm_dst_lookup_params *params)
{
- struct dst_entry *dst, *dst_prev;
- struct rtable *rt0 = (struct rtable*)(*dst_p);
- struct rtable *rt = rt0;
- __be32 remote = fl->fl4_dst;
- __be32 local = fl->fl4_src;
- struct flowi fl_tunnel = {
- .nl_u = {
- .ip4_u = {
- .saddr = local,
- .daddr = remote,
- .tos = fl->fl4_tos
- }
- }
- };
- int i;
- int err;
- int header_len = 0;
- int trailer_len = 0;
-
- dst = dst_prev = NULL;
- dst_hold(&rt->u.dst);
-
- for (i = 0; i < nx; i++) {
- struct dst_entry *dst1 = dst_alloc(&xfrm4_dst_ops);
- struct xfrm_dst *xdst;
- int tunnel = 0;
-
- if (unlikely(dst1 == NULL)) {
- err = -ENOBUFS;
- dst_release(&rt->u.dst);
- goto error;
- }
-
- if (!dst)
- dst = dst1;
- else {
- dst_prev->child = dst1;
- dst1->flags |= DST_NOHASH;
- dst_clone(dst1);
- }
-
- xdst = (struct xfrm_dst *)dst1;
- xdst->route = &rt->u.dst;
- xdst->genid = xfrm[i]->genid;
-
- dst1->next = dst_prev;
- dst_prev = dst1;
- if (xfrm[i]->props.mode != XFRM_MODE_TRANSPORT) {
- remote = xfrm[i]->id.daddr.a4;
- local = xfrm[i]->props.saddr.a4;
- tunnel = 1;
- }
- header_len += xfrm[i]->props.header_len;
- trailer_len += xfrm[i]->props.trailer_len;
-
- if (tunnel) {
- fl_tunnel.fl4_src = local;
- fl_tunnel.fl4_dst = remote;
- err = xfrm_dst_lookup((struct xfrm_dst **)&rt,
- &fl_tunnel, AF_INET);
- if (err)
- goto error;
- } else
- dst_hold(&rt->u.dst);
- }
+ struct dst_entry *dst;
+ struct flowi4 fl4;
- dst_prev->child = &rt->u.dst;
- dst->path = &rt->u.dst;
-
- *dst_p = dst;
- dst = dst_prev;
-
- dst_prev = *dst_p;
- i = 0;
- for (; dst_prev != &rt->u.dst; dst_prev = dst_prev->child) {
- struct xfrm_dst *x = (struct xfrm_dst*)dst_prev;
- x->u.rt.fl = *fl;
-
- dst_prev->xfrm = xfrm[i++];
- dst_prev->dev = rt->u.dst.dev;
- if (rt->u.dst.dev)
- dev_hold(rt->u.dst.dev);
- dst_prev->obsolete = -1;
- dst_prev->flags |= DST_HOST;
- dst_prev->lastuse = jiffies;
- dst_prev->header_len = header_len;
- dst_prev->nfheader_len = 0;
- dst_prev->trailer_len = trailer_len;
- memcpy(&dst_prev->metrics, &x->route->metrics, sizeof(dst_prev->metrics));
-
- /* Copy neighbout for reachability confirmation */
- dst_prev->neighbour = neigh_clone(rt->u.dst.neighbour);
- dst_prev->input = rt->u.dst.input;
- dst_prev->output = xfrm4_output;
- if (rt->peer)
- atomic_inc(&rt->peer->refcnt);
- x->u.rt.peer = rt->peer;
- /* Sheit... I remember I did this right. Apparently,
- * it was magically lost, so this code needs audit */
- x->u.rt.rt_flags = rt0->rt_flags&(RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL);
- x->u.rt.rt_type = rt->rt_type;
- x->u.rt.rt_src = rt0->rt_src;
- x->u.rt.rt_dst = rt0->rt_dst;
- x->u.rt.rt_gateway = rt->rt_gateway;
- x->u.rt.rt_spec_dst = rt0->rt_spec_dst;
- x->u.rt.idev = rt0->idev;
- in_dev_hold(rt0->idev);
- header_len -= x->u.dst.xfrm->props.header_len;
- trailer_len -= x->u.dst.xfrm->props.trailer_len;
- }
+ dst = __xfrm4_dst_lookup(&fl4, params);
+ if (IS_ERR(dst))
+ return -EHOSTUNREACH;
- xfrm_init_pmtu(dst);
+ saddr->a4 = fl4.saddr;
+ dst_release(dst);
return 0;
-
-error:
- if (dst)
- dst_free(dst);
- return err;
}
-static void
-_decode_session4(struct sk_buff *skb, struct flowi *fl)
+static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
+ const struct flowi *fl)
{
- struct iphdr *iph = skb->nh.iph;
- u8 *xprth = skb->nh.raw + iph->ihl*4;
-
- memset(fl, 0, sizeof(struct flowi));
- if (!(iph->frag_off & htons(IP_MF | IP_OFFSET))) {
- switch (iph->protocol) {
- case IPPROTO_UDP:
- case IPPROTO_UDPLITE:
- case IPPROTO_TCP:
- case IPPROTO_SCTP:
- case IPPROTO_DCCP:
- if (pskb_may_pull(skb, xprth + 4 - skb->data)) {
- __be16 *ports = (__be16 *)xprth;
-
- fl->fl_ip_sport = ports[0];
- fl->fl_ip_dport = ports[1];
- }
- break;
-
- case IPPROTO_ICMP:
- if (pskb_may_pull(skb, xprth + 2 - skb->data)) {
- u8 *icmp = xprth;
-
- fl->fl_icmp_type = icmp[0];
- fl->fl_icmp_code = icmp[1];
- }
- break;
-
- case IPPROTO_ESP:
- if (pskb_may_pull(skb, xprth + 4 - skb->data)) {
- __be32 *ehdr = (__be32 *)xprth;
-
- fl->fl_ipsec_spi = ehdr[0];
- }
- break;
-
- case IPPROTO_AH:
- if (pskb_may_pull(skb, xprth + 8 - skb->data)) {
- __be32 *ah_hdr = (__be32*)xprth;
-
- fl->fl_ipsec_spi = ah_hdr[1];
- }
- break;
-
- case IPPROTO_COMP:
- if (pskb_may_pull(skb, xprth + 4 - skb->data)) {
- __be16 *ipcomp_hdr = (__be16 *)xprth;
-
- fl->fl_ipsec_spi = htonl(ntohs(ipcomp_hdr[1]));
- }
- break;
- default:
- fl->fl_ipsec_spi = 0;
- break;
- };
- }
- fl->proto = iph->protocol;
- fl->fl4_dst = iph->daddr;
- fl->fl4_src = iph->saddr;
- fl->fl4_tos = iph->tos;
+ struct rtable *rt = dst_rtable(xdst->route);
+ const struct flowi4 *fl4 = &fl->u.ip4;
+
+ xdst->u.rt.rt_iif = fl4->flowi4_iif;
+
+ xdst->u.dst.dev = dev;
+ netdev_hold(dev, &xdst->u.dst.dev_tracker, GFP_ATOMIC);
+
+ /* Sheit... I remember I did this right. Apparently,
+ * it was magically lost, so this code needs audit */
+ xdst->u.rt.rt_is_input = rt->rt_is_input;
+ xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST |
+ RTCF_LOCAL);
+ xdst->u.rt.rt_type = rt->rt_type;
+ xdst->u.rt.rt_uses_gateway = rt->rt_uses_gateway;
+ xdst->u.rt.rt_gw_family = rt->rt_gw_family;
+ if (rt->rt_gw_family == AF_INET)
+ xdst->u.rt.rt_gw4 = rt->rt_gw4;
+ else if (rt->rt_gw_family == AF_INET6)
+ xdst->u.rt.rt_gw6 = rt->rt_gw6;
+ xdst->u.rt.rt_pmtu = rt->rt_pmtu;
+ xdst->u.rt.rt_mtu_locked = rt->rt_mtu_locked;
+ rt_add_uncached_list(&xdst->u.rt);
+
+ return 0;
}
-static inline int xfrm4_garbage_collect(void)
+static void xfrm4_update_pmtu(struct dst_entry *dst, struct sock *sk,
+ struct sk_buff *skb, u32 mtu,
+ bool confirm_neigh)
{
- xfrm4_policy_afinfo.garbage_collect();
- return (atomic_read(&xfrm4_dst_ops.entries) > xfrm4_dst_ops.gc_thresh*2);
+ struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
+ struct dst_entry *path = xdst->route;
+
+ path->ops->update_pmtu(path, sk, skb, mtu, confirm_neigh);
}
-static void xfrm4_update_pmtu(struct dst_entry *dst, u32 mtu)
+static void xfrm4_redirect(struct dst_entry *dst, struct sock *sk,
+ struct sk_buff *skb)
{
struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
struct dst_entry *path = xdst->route;
- path->ops->update_pmtu(path, mtu);
+ path->ops->redirect(path, sk, skb);
}
static void xfrm4_dst_destroy(struct dst_entry *dst)
{
struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
- if (likely(xdst->u.rt.idev))
- in_dev_put(xdst->u.rt.idev);
+ dst_destroy_metrics_generic(dst);
+ rt_del_uncached_list(&xdst->u.rt);
xfrm_dst_destroy(xdst);
}
-static void xfrm4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
- int unregister)
-{
- struct xfrm_dst *xdst;
+static struct dst_ops xfrm4_dst_ops_template = {
+ .family = AF_INET,
+ .update_pmtu = xfrm4_update_pmtu,
+ .redirect = xfrm4_redirect,
+ .cow_metrics = dst_cow_metrics_generic,
+ .destroy = xfrm4_dst_destroy,
+ .ifdown = xfrm_dst_ifdown,
+ .local_out = __ip_local_out,
+ .gc_thresh = 32768,
+};
- if (!unregister)
- return;
+static const struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
+ .dst_ops = &xfrm4_dst_ops_template,
+ .dst_lookup = xfrm4_dst_lookup,
+ .get_saddr = xfrm4_get_saddr,
+ .fill_dst = xfrm4_fill_dst,
+ .blackhole_route = ipv4_blackhole_route,
+};
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table xfrm4_policy_table[] = {
+ {
+ .procname = "xfrm4_gc_thresh",
+ .data = &init_net.xfrm.xfrm4_dst_ops.gc_thresh,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+};
- xdst = (struct xfrm_dst *)dst;
- if (xdst->u.rt.idev->dev == dev) {
- struct in_device *loopback_idev = in_dev_get(&loopback_dev);
- BUG_ON(!loopback_idev);
+static __net_init int xfrm4_net_sysctl_init(struct net *net)
+{
+ struct ctl_table *table;
+ struct ctl_table_header *hdr;
- do {
- in_dev_put(xdst->u.rt.idev);
- xdst->u.rt.idev = loopback_idev;
- in_dev_hold(loopback_idev);
- xdst = (struct xfrm_dst *)xdst->u.dst.child;
- } while (xdst->u.dst.xfrm);
+ table = xfrm4_policy_table;
+ if (!net_eq(net, &init_net)) {
+ table = kmemdup(table, sizeof(xfrm4_policy_table), GFP_KERNEL);
+ if (!table)
+ goto err_alloc;
- __in_dev_put(loopback_idev);
+ table[0].data = &net->xfrm.xfrm4_dst_ops.gc_thresh;
}
- xfrm_dst_ifdown(dst, dev);
+ hdr = register_net_sysctl_sz(net, "net/ipv4", table,
+ ARRAY_SIZE(xfrm4_policy_table));
+ if (!hdr)
+ goto err_reg;
+
+ net->ipv4.xfrm4_hdr = hdr;
+ return 0;
+
+err_reg:
+ if (!net_eq(net, &init_net))
+ kfree(table);
+err_alloc:
+ return -ENOMEM;
}
-static struct dst_ops xfrm4_dst_ops = {
- .family = AF_INET,
- .protocol = __constant_htons(ETH_P_IP),
- .gc = xfrm4_garbage_collect,
- .update_pmtu = xfrm4_update_pmtu,
- .destroy = xfrm4_dst_destroy,
- .ifdown = xfrm4_dst_ifdown,
- .gc_thresh = 1024,
- .entry_size = sizeof(struct xfrm_dst),
-};
+static __net_exit void xfrm4_net_sysctl_exit(struct net *net)
+{
+ const struct ctl_table *table;
-static struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
- .family = AF_INET,
- .dst_ops = &xfrm4_dst_ops,
- .dst_lookup = xfrm4_dst_lookup,
- .get_saddr = xfrm4_get_saddr,
- .find_bundle = __xfrm4_find_bundle,
- .bundle_create = __xfrm4_bundle_create,
- .decode_session = _decode_session4,
-};
+ if (!net->ipv4.xfrm4_hdr)
+ return;
-static void __init xfrm4_policy_init(void)
+ table = net->ipv4.xfrm4_hdr->ctl_table_arg;
+ unregister_net_sysctl_table(net->ipv4.xfrm4_hdr);
+ if (!net_eq(net, &init_net))
+ kfree(table);
+}
+#else /* CONFIG_SYSCTL */
+static inline int xfrm4_net_sysctl_init(struct net *net)
+{
+ return 0;
+}
+
+static inline void xfrm4_net_sysctl_exit(struct net *net)
+{
+}
+#endif
+
+static int __net_init xfrm4_net_init(struct net *net)
+{
+ int ret;
+
+ memcpy(&net->xfrm.xfrm4_dst_ops, &xfrm4_dst_ops_template,
+ sizeof(xfrm4_dst_ops_template));
+ ret = dst_entries_init(&net->xfrm.xfrm4_dst_ops);
+ if (ret)
+ return ret;
+
+ ret = xfrm4_net_sysctl_init(net);
+ if (ret)
+ dst_entries_destroy(&net->xfrm.xfrm4_dst_ops);
+
+ return ret;
+}
+
+static void __net_exit xfrm4_net_exit(struct net *net)
{
- xfrm_policy_register_afinfo(&xfrm4_policy_afinfo);
+ xfrm4_net_sysctl_exit(net);
+ dst_entries_destroy(&net->xfrm.xfrm4_dst_ops);
}
-static void __exit xfrm4_policy_fini(void)
+static struct pernet_operations __net_initdata xfrm4_net_ops = {
+ .init = xfrm4_net_init,
+ .exit = xfrm4_net_exit,
+};
+
+static void __init xfrm4_policy_init(void)
{
- xfrm_policy_unregister_afinfo(&xfrm4_policy_afinfo);
+ xfrm_policy_register_afinfo(&xfrm4_policy_afinfo, AF_INET);
}
void __init xfrm4_init(void)
{
xfrm4_state_init();
xfrm4_policy_init();
+ xfrm4_protocol_init();
+ register_pernet_subsys(&xfrm4_net_ops);
}
-
diff --git a/net/ipv4/xfrm4_protocol.c b/net/ipv4/xfrm4_protocol.c
new file mode 100644
index 000000000000..4ee624d8e66f
--- /dev/null
+++ b/net/ipv4/xfrm4_protocol.c
@@ -0,0 +1,306 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* xfrm4_protocol.c - Generic xfrm protocol multiplexer.
+ *
+ * Copyright (C) 2013 secunet Security Networks AG
+ *
+ * Author:
+ * Steffen Klassert <steffen.klassert@secunet.com>
+ *
+ * Based on:
+ * net/ipv4/tunnel4.c
+ */
+
+#include <linux/init.h>
+#include <linux/mutex.h>
+#include <linux/skbuff.h>
+#include <net/icmp.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/xfrm.h>
+
+static struct xfrm4_protocol __rcu *esp4_handlers __read_mostly;
+static struct xfrm4_protocol __rcu *ah4_handlers __read_mostly;
+static struct xfrm4_protocol __rcu *ipcomp4_handlers __read_mostly;
+static DEFINE_MUTEX(xfrm4_protocol_mutex);
+
+static inline struct xfrm4_protocol __rcu **proto_handlers(u8 protocol)
+{
+ switch (protocol) {
+ case IPPROTO_ESP:
+ return &esp4_handlers;
+ case IPPROTO_AH:
+ return &ah4_handlers;
+ case IPPROTO_COMP:
+ return &ipcomp4_handlers;
+ }
+
+ return NULL;
+}
+
+#define for_each_protocol_rcu(head, handler) \
+ for (handler = rcu_dereference(head); \
+ handler != NULL; \
+ handler = rcu_dereference(handler->next)) \
+
+static int xfrm4_rcv_cb(struct sk_buff *skb, u8 protocol, int err)
+{
+ int ret;
+ struct xfrm4_protocol *handler;
+ struct xfrm4_protocol __rcu **head = proto_handlers(protocol);
+
+ if (!head)
+ return 0;
+
+ for_each_protocol_rcu(*head, handler)
+ if ((ret = handler->cb_handler(skb, err)) <= 0)
+ return ret;
+
+ return 0;
+}
+
+int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi,
+ int encap_type)
+{
+ int ret;
+ struct xfrm4_protocol *handler;
+ struct xfrm4_protocol __rcu **head = proto_handlers(nexthdr);
+
+ XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL;
+ XFRM_SPI_SKB_CB(skb)->family = AF_INET;
+ XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr);
+
+ if (!head)
+ goto out;
+
+ if (!skb_dst(skb)) {
+ const struct iphdr *iph = ip_hdr(skb);
+
+ if (ip_route_input_noref(skb, iph->daddr, iph->saddr,
+ ip4h_dscp(iph), skb->dev))
+ goto drop;
+ }
+
+ for_each_protocol_rcu(*head, handler)
+ if ((ret = handler->input_handler(skb, nexthdr, spi, encap_type)) != -EINVAL)
+ return ret;
+
+out:
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+
+drop:
+ kfree_skb(skb);
+ return 0;
+}
+EXPORT_SYMBOL(xfrm4_rcv_encap);
+
+static int xfrm4_esp_rcv(struct sk_buff *skb)
+{
+ int ret;
+ struct xfrm4_protocol *handler;
+
+ XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL;
+
+ for_each_protocol_rcu(esp4_handlers, handler)
+ if ((ret = handler->handler(skb)) != -EINVAL)
+ return ret;
+
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+
+ kfree_skb(skb);
+ return 0;
+}
+
+static int xfrm4_esp_err(struct sk_buff *skb, u32 info)
+{
+ struct xfrm4_protocol *handler;
+
+ for_each_protocol_rcu(esp4_handlers, handler)
+ if (!handler->err_handler(skb, info))
+ return 0;
+
+ return -ENOENT;
+}
+
+static int xfrm4_ah_rcv(struct sk_buff *skb)
+{
+ int ret;
+ struct xfrm4_protocol *handler;
+
+ XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL;
+
+ for_each_protocol_rcu(ah4_handlers, handler)
+ if ((ret = handler->handler(skb)) != -EINVAL)
+ return ret;
+
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+
+ kfree_skb(skb);
+ return 0;
+}
+
+static int xfrm4_ah_err(struct sk_buff *skb, u32 info)
+{
+ struct xfrm4_protocol *handler;
+
+ for_each_protocol_rcu(ah4_handlers, handler)
+ if (!handler->err_handler(skb, info))
+ return 0;
+
+ return -ENOENT;
+}
+
+static int xfrm4_ipcomp_rcv(struct sk_buff *skb)
+{
+ int ret;
+ struct xfrm4_protocol *handler;
+
+ XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL;
+
+ for_each_protocol_rcu(ipcomp4_handlers, handler)
+ if ((ret = handler->handler(skb)) != -EINVAL)
+ return ret;
+
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+
+ kfree_skb(skb);
+ return 0;
+}
+
+static int xfrm4_ipcomp_err(struct sk_buff *skb, u32 info)
+{
+ struct xfrm4_protocol *handler;
+
+ for_each_protocol_rcu(ipcomp4_handlers, handler)
+ if (!handler->err_handler(skb, info))
+ return 0;
+
+ return -ENOENT;
+}
+
+static const struct net_protocol esp4_protocol = {
+ .handler = xfrm4_esp_rcv,
+ .err_handler = xfrm4_esp_err,
+ .no_policy = 1,
+};
+
+static const struct net_protocol ah4_protocol = {
+ .handler = xfrm4_ah_rcv,
+ .err_handler = xfrm4_ah_err,
+ .no_policy = 1,
+};
+
+static const struct net_protocol ipcomp4_protocol = {
+ .handler = xfrm4_ipcomp_rcv,
+ .err_handler = xfrm4_ipcomp_err,
+ .no_policy = 1,
+};
+
+static const struct xfrm_input_afinfo xfrm4_input_afinfo = {
+ .family = AF_INET,
+ .callback = xfrm4_rcv_cb,
+};
+
+static inline const struct net_protocol *netproto(unsigned char protocol)
+{
+ switch (protocol) {
+ case IPPROTO_ESP:
+ return &esp4_protocol;
+ case IPPROTO_AH:
+ return &ah4_protocol;
+ case IPPROTO_COMP:
+ return &ipcomp4_protocol;
+ }
+
+ return NULL;
+}
+
+int xfrm4_protocol_register(struct xfrm4_protocol *handler,
+ unsigned char protocol)
+{
+ struct xfrm4_protocol __rcu **pprev;
+ struct xfrm4_protocol *t;
+ bool add_netproto = false;
+ int ret = -EEXIST;
+ int priority = handler->priority;
+
+ if (!proto_handlers(protocol) || !netproto(protocol))
+ return -EINVAL;
+
+ mutex_lock(&xfrm4_protocol_mutex);
+
+ if (!rcu_dereference_protected(*proto_handlers(protocol),
+ lockdep_is_held(&xfrm4_protocol_mutex)))
+ add_netproto = true;
+
+ for (pprev = proto_handlers(protocol);
+ (t = rcu_dereference_protected(*pprev,
+ lockdep_is_held(&xfrm4_protocol_mutex))) != NULL;
+ pprev = &t->next) {
+ if (t->priority < priority)
+ break;
+ if (t->priority == priority)
+ goto err;
+ }
+
+ handler->next = *pprev;
+ rcu_assign_pointer(*pprev, handler);
+
+ ret = 0;
+
+err:
+ mutex_unlock(&xfrm4_protocol_mutex);
+
+ if (add_netproto) {
+ if (inet_add_protocol(netproto(protocol), protocol)) {
+ pr_err("%s: can't add protocol\n", __func__);
+ ret = -EAGAIN;
+ }
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(xfrm4_protocol_register);
+
+int xfrm4_protocol_deregister(struct xfrm4_protocol *handler,
+ unsigned char protocol)
+{
+ struct xfrm4_protocol __rcu **pprev;
+ struct xfrm4_protocol *t;
+ int ret = -ENOENT;
+
+ if (!proto_handlers(protocol) || !netproto(protocol))
+ return -EINVAL;
+
+ mutex_lock(&xfrm4_protocol_mutex);
+
+ for (pprev = proto_handlers(protocol);
+ (t = rcu_dereference_protected(*pprev,
+ lockdep_is_held(&xfrm4_protocol_mutex))) != NULL;
+ pprev = &t->next) {
+ if (t == handler) {
+ *pprev = handler->next;
+ ret = 0;
+ break;
+ }
+ }
+
+ if (!rcu_dereference_protected(*proto_handlers(protocol),
+ lockdep_is_held(&xfrm4_protocol_mutex))) {
+ if (inet_del_protocol(netproto(protocol), protocol) < 0) {
+ pr_err("%s: can't remove protocol\n", __func__);
+ ret = -EAGAIN;
+ }
+ }
+
+ mutex_unlock(&xfrm4_protocol_mutex);
+
+ synchronize_net();
+
+ return ret;
+}
+EXPORT_SYMBOL(xfrm4_protocol_deregister);
+
+void __init xfrm4_protocol_init(void)
+{
+ xfrm_input_register_afinfo(&xfrm4_input_afinfo);
+}
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index 3cc3df0c6ece..87d4db591488 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* xfrm4_state.c
*
@@ -7,61 +8,17 @@
*
*/
-#include <net/ip.h>
#include <net/xfrm.h>
-#include <linux/pfkeyv2.h>
-#include <linux/ipsec.h>
-
-static struct xfrm_state_afinfo xfrm4_state_afinfo;
-
-static int xfrm4_init_flags(struct xfrm_state *x)
-{
- if (ipv4_config.no_pmtu_disc)
- x->props.flags |= XFRM_STATE_NOPMTUDISC;
- return 0;
-}
-
-static void
-__xfrm4_init_tempsel(struct xfrm_state *x, struct flowi *fl,
- struct xfrm_tmpl *tmpl,
- xfrm_address_t *daddr, xfrm_address_t *saddr)
-{
- x->sel.daddr.a4 = fl->fl4_dst;
- x->sel.saddr.a4 = fl->fl4_src;
- x->sel.dport = xfrm_flowi_dport(fl);
- x->sel.dport_mask = htons(0xffff);
- x->sel.sport = xfrm_flowi_sport(fl);
- x->sel.sport_mask = htons(0xffff);
- x->sel.prefixlen_d = 32;
- x->sel.prefixlen_s = 32;
- x->sel.proto = fl->proto;
- x->sel.ifindex = fl->oif;
- x->id = tmpl->id;
- if (x->id.daddr.a4 == 0)
- x->id.daddr.a4 = daddr->a4;
- x->props.saddr = tmpl->saddr;
- if (x->props.saddr.a4 == 0)
- x->props.saddr.a4 = saddr->a4;
- x->props.mode = tmpl->mode;
- x->props.reqid = tmpl->reqid;
- x->props.family = AF_INET;
-}
static struct xfrm_state_afinfo xfrm4_state_afinfo = {
.family = AF_INET,
- .init_flags = xfrm4_init_flags,
- .init_tempsel = __xfrm4_init_tempsel,
+ .proto = IPPROTO_IPIP,
+ .output = xfrm4_output,
+ .transport_finish = xfrm4_transport_finish,
+ .local_error = xfrm4_local_error,
};
void __init xfrm4_state_init(void)
{
xfrm_state_register_afinfo(&xfrm4_state_afinfo);
}
-
-#if 0
-void __exit xfrm4_state_fini(void)
-{
- xfrm_state_unregister_afinfo(&xfrm4_state_afinfo);
-}
-#endif /* 0 */
-
diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c
index f110af5b1319..8cb266af1393 100644
--- a/net/ipv4/xfrm4_tunnel.c
+++ b/net/ipv4/xfrm4_tunnel.c
@@ -1,38 +1,38 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* xfrm4_tunnel.c: Generic IP tunnel transformer.
*
* Copyright (C) 2003 David S. Miller (davem@redhat.com)
*/
+#define pr_fmt(fmt) "IPsec: " fmt
+
#include <linux/skbuff.h>
#include <linux/module.h>
-#include <linux/mutex.h>
#include <net/xfrm.h>
-#include <net/ip.h>
#include <net/protocol.h>
static int ipip_output(struct xfrm_state *x, struct sk_buff *skb)
{
- struct iphdr *iph;
-
- iph = skb->nh.iph;
- iph->tot_len = htons(skb->len);
- ip_send_check(iph);
-
+ skb_push(skb, -skb_network_offset(skb));
return 0;
}
static int ipip_xfrm_rcv(struct xfrm_state *x, struct sk_buff *skb)
{
- return 0;
+ return ip_hdr(skb)->protocol;
}
-static int ipip_init_state(struct xfrm_state *x)
+static int ipip_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack)
{
- if (x->props.mode != XFRM_MODE_TUNNEL)
+ if (x->props.mode != XFRM_MODE_TUNNEL) {
+ NL_SET_ERR_MSG(extack, "IPv4 tunnel can only be used with tunnel mode");
return -EINVAL;
+ }
- if (x->encap)
+ if (x->encap) {
+ NL_SET_ERR_MSG(extack, "IPv4 tunnel is not compatible with encapsulation");
return -EINVAL;
+ }
x->props.header_len = sizeof(struct iphdr);
@@ -43,8 +43,7 @@ static void ipip_destroy(struct xfrm_state *x)
{
}
-static struct xfrm_type ipip_type = {
- .description = "IPIP",
+static const struct xfrm_type ipip_type = {
.owner = THIS_MODULE,
.proto = IPPROTO_IPIP,
.init_state = ipip_init_state,
@@ -53,39 +52,68 @@ static struct xfrm_type ipip_type = {
.output = ipip_output
};
+static int xfrm_tunnel_rcv(struct sk_buff *skb)
+{
+ return xfrm4_rcv_spi(skb, IPPROTO_IPIP, ip_hdr(skb)->saddr);
+}
+
static int xfrm_tunnel_err(struct sk_buff *skb, u32 info)
{
return -ENOENT;
}
-static struct xfrm_tunnel xfrm_tunnel_handler = {
- .handler = xfrm4_rcv,
+static struct xfrm_tunnel xfrm_tunnel_handler __read_mostly = {
+ .handler = xfrm_tunnel_rcv,
.err_handler = xfrm_tunnel_err,
- .priority = 2,
+ .priority = 4,
};
+#if IS_ENABLED(CONFIG_IPV6)
+static struct xfrm_tunnel xfrm64_tunnel_handler __read_mostly = {
+ .handler = xfrm_tunnel_rcv,
+ .err_handler = xfrm_tunnel_err,
+ .priority = 3,
+};
+#endif
+
static int __init ipip_init(void)
{
if (xfrm_register_type(&ipip_type, AF_INET) < 0) {
- printk(KERN_INFO "ipip init: can't add xfrm type\n");
+ pr_info("%s: can't add xfrm type\n", __func__);
+ return -EAGAIN;
+ }
+
+ if (xfrm4_tunnel_register(&xfrm_tunnel_handler, AF_INET)) {
+ pr_info("%s: can't add xfrm handler for AF_INET\n", __func__);
+ xfrm_unregister_type(&ipip_type, AF_INET);
return -EAGAIN;
}
- if (xfrm4_tunnel_register(&xfrm_tunnel_handler)) {
- printk(KERN_INFO "ipip init: can't add xfrm handler\n");
+#if IS_ENABLED(CONFIG_IPV6)
+ if (xfrm4_tunnel_register(&xfrm64_tunnel_handler, AF_INET6)) {
+ pr_info("%s: can't add xfrm handler for AF_INET6\n", __func__);
+ xfrm4_tunnel_deregister(&xfrm_tunnel_handler, AF_INET);
xfrm_unregister_type(&ipip_type, AF_INET);
return -EAGAIN;
}
+#endif
return 0;
}
static void __exit ipip_fini(void)
{
- if (xfrm4_tunnel_deregister(&xfrm_tunnel_handler))
- printk(KERN_INFO "ipip close: can't remove xfrm handler\n");
- if (xfrm_unregister_type(&ipip_type, AF_INET) < 0)
- printk(KERN_INFO "ipip close: can't remove xfrm type\n");
+#if IS_ENABLED(CONFIG_IPV6)
+ if (xfrm4_tunnel_deregister(&xfrm64_tunnel_handler, AF_INET6))
+ pr_info("%s: can't remove xfrm handler for AF_INET6\n",
+ __func__);
+#endif
+ if (xfrm4_tunnel_deregister(&xfrm_tunnel_handler, AF_INET))
+ pr_info("%s: can't remove xfrm handler for AF_INET\n",
+ __func__);
+ xfrm_unregister_type(&ipip_type, AF_INET);
}
module_init(ipip_init);
module_exit(ipip_fini);
+MODULE_DESCRIPTION("IPv4 XFRM tunnel driver");
MODULE_LICENSE("GPL");
+MODULE_ALIAS_XFRM_TYPE(AF_INET, XFRM_PROTO_IPIP);
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index deb4101a2a81..b8f9a8c0302e 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -1,112 +1,147 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# IPv6 configuration
#
# IPv6 as module will cause a CRASH if you try to unload it
-config IPV6
+menuconfig IPV6
tristate "The IPv6 protocol"
- default m
- ---help---
- This is complemental support for the IP version 6.
- You will still be able to do traditional IPv4 networking as well.
+ default y
+ select CRYPTO_LIB_SHA1
+ help
+ Support for IP version 6 (IPv6).
For general information about IPv6, see
- <http://playground.sun.com/pub/ipng/html/ipng-main.html>.
- For Linux IPv6 development information, see <http://www.linux-ipv6.org>.
- For specific information about IPv6 under Linux, read the HOWTO at
- <http://www.bieringer.de/linux/IPv6/>.
+ <https://en.wikipedia.org/wiki/IPv6>.
+ For specific information about IPv6 under Linux, see
+ Documentation/networking/ipv6.rst and read the HOWTO at
+ <https://www.tldp.org/HOWTO/Linux+IPv6-HOWTO/>
- To compile this protocol support as a module, choose M here: the
+ To compile this protocol support as a module, choose M here: the
module will be called ipv6.
-config IPV6_PRIVACY
- bool "IPv6: Privacy Extensions support"
- depends on IPV6
- ---help---
- Privacy Extensions for Stateless Address Autoconfiguration in IPv6
- support. With this option, additional periodically-alter
- pseudo-random global-scope unicast address(es) will assigned to
- your interface(s).
-
- We use our standard pseudo random algorithm to generate randomized
- interface identifier, instead of one described in RFC 3041.
-
- By default, kernel do not generate temporary addresses.
- To use temporary addresses, do
-
- echo 2 >/proc/sys/net/ipv6/conf/all/use_tempaddr
-
- See <file:Documentation/networking/ip-sysctl.txt> for details.
+if IPV6
config IPV6_ROUTER_PREF
bool "IPv6: Router Preference (RFC 4191) support"
- depends on IPV6
- ---help---
+ help
Router Preference is an optional extension to the Router
- Advertisement message to improve the ability of hosts
- to pick more appropriate router, especially when the hosts
- is placed in a multi-homed network.
+ Advertisement message which improves the ability of hosts
+ to pick an appropriate router, especially when the hosts
+ are placed in a multi-homed network.
If unsure, say N.
config IPV6_ROUTE_INFO
- bool "IPv6: Route Information (RFC 4191) support (EXPERIMENTAL)"
- depends on IPV6_ROUTER_PREF && EXPERIMENTAL
- ---help---
- This is experimental support of Route Information.
+ bool "IPv6: Route Information (RFC 4191) support"
+ depends on IPV6_ROUTER_PREF
+ help
+ Support of Route Information.
+
+ If unsure, say N.
+
+config IPV6_OPTIMISTIC_DAD
+ bool "IPv6: Enable RFC 4429 Optimistic DAD"
+ help
+ Support for optimistic Duplicate Address Detection. It allows for
+ autoconfigured addresses to be used more quickly.
If unsure, say N.
config INET6_AH
tristate "IPv6: AH transformation"
- depends on IPV6
- select XFRM
- select CRYPTO
- select CRYPTO_HMAC
- select CRYPTO_MD5
- select CRYPTO_SHA1
- ---help---
- Support for IPsec AH.
+ select XFRM_AH
+ help
+ Support for IPsec AH (Authentication Header).
+
+ AH can be used with various authentication algorithms. Besides
+ enabling AH support itself, this option enables the generic
+ implementations of the algorithms that RFC 8221 lists as MUST be
+ implemented. If you need any other algorithms, you'll need to enable
+ them in the crypto API. You should also enable accelerated
+ implementations of any needed algorithms when available.
If unsure, say Y.
config INET6_ESP
tristate "IPv6: ESP transformation"
- depends on IPV6
- select XFRM
- select CRYPTO
- select CRYPTO_HMAC
- select CRYPTO_MD5
- select CRYPTO_CBC
- select CRYPTO_SHA1
- select CRYPTO_DES
- ---help---
- Support for IPsec ESP.
+ select XFRM_ESP
+ help
+ Support for IPsec ESP (Encapsulating Security Payload).
+
+ ESP can be used with various encryption and authentication algorithms.
+ Besides enabling ESP support itself, this option enables the generic
+ implementations of the algorithms that RFC 8221 lists as MUST be
+ implemented. If you need any other algorithms, you'll need to enable
+ them in the crypto API. You should also enable accelerated
+ implementations of any needed algorithms when available.
If unsure, say Y.
+config INET6_ESP_OFFLOAD
+ tristate "IPv6: ESP transformation offload"
+ depends on INET6_ESP
+ select XFRM_OFFLOAD
+ default n
+ help
+ Support for ESP transformation offload. This makes sense
+ only if this system really does IPsec and want to do it
+ with high throughput. A typical desktop system does not
+ need it, even if it does IPsec.
+
+ If unsure, say N.
+
+config INET6_ESPINTCP
+ bool "IPv6: ESP in TCP encapsulation (RFC 8229)"
+ depends on XFRM && INET6_ESP
+ select STREAM_PARSER
+ select NET_SOCK_MSG
+ select XFRM_ESPINTCP
+ help
+ Support for RFC 8229 encapsulation of ESP and IKE over
+ TCP/IPv6 sockets.
+
+ If unsure, say N.
+
config INET6_IPCOMP
tristate "IPv6: IPComp transformation"
- depends on IPV6
- select XFRM
select INET6_XFRM_TUNNEL
- select CRYPTO
- select CRYPTO_DEFLATE
- ---help---
+ select XFRM_IPCOMP
+ help
Support for IP Payload Compression Protocol (IPComp) (RFC3173),
typically needed for IPsec.
If unsure, say Y.
config IPV6_MIP6
- bool "IPv6: Mobility (EXPERIMENTAL)"
- depends on IPV6 && EXPERIMENTAL
+ tristate "IPv6: Mobility"
select XFRM
- ---help---
+ help
Support for IPv6 Mobility described in RFC 3775.
If unsure, say N.
+config IPV6_ILA
+ tristate "IPv6: Identifier Locator Addressing (ILA)"
+ depends on NETFILTER
+ select DST_CACHE
+ select LWTUNNEL
+ help
+ Support for IPv6 Identifier Locator Addressing (ILA).
+
+ ILA is a mechanism to do network virtualization without
+ encapsulation. The basic concept of ILA is that we split an
+ IPv6 address into a 64 bit locator and 64 bit identifier. The
+ identifier is the identity of an entity in communication
+ ("who") and the locator expresses the location of the
+ entity ("where").
+
+ ILA can be configured using the "encap ila" option with
+ "ip -6 route" command. ILA is described in
+ https://tools.ietf.org/html/draft-herbert-nvo3-ila-00.
+
+ If unsure, say N.
+
config INET6_XFRM_TUNNEL
tristate
select INET6_TUNNEL
@@ -116,76 +151,103 @@ config INET6_TUNNEL
tristate
default n
-config INET6_XFRM_MODE_TRANSPORT
- tristate "IPv6: IPsec transport mode"
- depends on IPV6
- default IPV6
- select XFRM
- ---help---
- Support for IPsec transport mode.
-
- If unsure, say Y.
-
-config INET6_XFRM_MODE_TUNNEL
- tristate "IPv6: IPsec tunnel mode"
- depends on IPV6
- default IPV6
- select XFRM
- ---help---
- Support for IPsec tunnel mode.
-
- If unsure, say Y.
-
-config INET6_XFRM_MODE_BEET
- tristate "IPv6: IPsec BEET mode"
- depends on IPV6
- default IPV6
- select XFRM
- ---help---
- Support for IPsec BEET mode.
-
- If unsure, say Y.
-
-config INET6_XFRM_MODE_ROUTEOPTIMIZATION
- tristate "IPv6: MIPv6 route optimization mode (EXPERIMENTAL)"
- depends on IPV6 && EXPERIMENTAL
+config IPV6_VTI
+ tristate "Virtual (secure) IPv6: tunneling"
+ select IPV6_TUNNEL
+ select NET_IP_TUNNEL
select XFRM
- ---help---
- Support for MIPv6 route optimization mode.
+ help
+ Tunneling means encapsulating data of one protocol type within
+ another protocol and sending it over a channel that understands the
+ encapsulating protocol. This can be used with xfrm mode tunnel to give
+ the notion of a secure tunnel for IPSEC and then use routing protocol
+ on top.
config IPV6_SIT
tristate "IPv6: IPv6-in-IPv4 tunnel (SIT driver)"
- depends on IPV6
+ select INET_TUNNEL
+ select NET_IP_TUNNEL
+ select IPV6_NDISC_NODETYPE
default y
- ---help---
+ help
Tunneling means encapsulating data of one protocol type within
another protocol and sending it over a channel that understands the
encapsulating protocol. This driver implements encapsulation of IPv6
into IPv4 packets. This is useful if you want to connect two IPv6
networks over an IPv4-only path.
- Saying M here will produce a module called sit.ko. If unsure, say Y.
+ Saying M here will produce a module called sit. If unsure, say Y.
+
+config IPV6_SIT_6RD
+ bool "IPv6: IPv6 Rapid Deployment (6RD)"
+ depends on IPV6_SIT
+ default n
+ help
+ IPv6 Rapid Deployment (6rd; draft-ietf-softwire-ipv6-6rd) builds upon
+ mechanisms of 6to4 (RFC3056) to enable a service provider to rapidly
+ deploy IPv6 unicast service to IPv4 sites to which it provides
+ customer premise equipment. Like 6to4, it utilizes stateless IPv6 in
+ IPv4 encapsulation in order to transit IPv4-only network
+ infrastructure. Unlike 6to4, a 6rd service provider uses an IPv6
+ prefix of its own in place of the fixed 6to4 prefix.
+
+ With this option enabled, the SIT driver offers 6rd functionality by
+ providing additional ioctl API to configure the IPv6 Prefix for in
+ stead of static 2002::/16 for 6to4.
+
+ If unsure, say N.
+
+config IPV6_NDISC_NODETYPE
+ bool
config IPV6_TUNNEL
- tristate "IPv6: IPv6-in-IPv6 tunnel"
+ tristate "IPv6: IP-in-IPv6 tunnel (RFC2473)"
select INET6_TUNNEL
- depends on IPV6
- ---help---
- Support for IPv6-in-IPv6 tunnels described in RFC 2473.
+ select DST_CACHE
+ select GRO_CELLS
+ help
+ Support for IPv6-in-IPv6 and IPv4-in-IPv6 tunnels described in
+ RFC 2473.
If unsure, say N.
+config IPV6_GRE
+ tristate "IPv6: GRE tunnel"
+ select IPV6_TUNNEL
+ select NET_IP_TUNNEL
+ depends on NET_IPGRE_DEMUX
+ help
+ Tunneling means encapsulating data of one protocol type within
+ another protocol and sending it over a channel that understands the
+ encapsulating protocol. This particular tunneling driver implements
+ GRE (Generic Routing Encapsulation) and at this time allows
+ encapsulating of IPv4 or IPv6 over existing IPv6 infrastructure.
+ This driver is useful if the other endpoint is a Cisco router: Cisco
+ likes GRE much better than the other Linux tunneling driver ("IP
+ tunneling" above). In addition, GRE allows multicast redistribution
+ through the tunnel.
+
+ Saying M here will produce a module called ip6_gre. If unsure, say N.
+
+config IPV6_FOU
+ tristate
+ default NET_FOU && IPV6
+
+config IPV6_FOU_TUNNEL
+ tristate
+ default NET_FOU_IP_TUNNELS && IPV6_FOU
+ select IPV6_TUNNEL
+
config IPV6_MULTIPLE_TABLES
bool "IPv6: Multiple Routing Tables"
- depends on IPV6 && EXPERIMENTAL
select FIB_RULES
- ---help---
+ help
Support multiple routing tables.
config IPV6_SUBTREES
bool "IPv6: source address based routing"
depends on IPV6_MULTIPLE_TABLES
- ---help---
+ help
Enable routing by source address or prefix.
The destination address is still the primary routing key, so mixing
@@ -196,3 +258,86 @@ config IPV6_SUBTREES
If unsure, say N.
+config IPV6_MROUTE
+ bool "IPv6: multicast routing"
+ depends on IPV6
+ select IP_MROUTE_COMMON
+ help
+ Support for IPv6 multicast forwarding.
+ If unsure, say N.
+
+config IPV6_MROUTE_MULTIPLE_TABLES
+ bool "IPv6: multicast policy routing"
+ depends on IPV6_MROUTE
+ select FIB_RULES
+ help
+ Normally, a multicast router runs a userspace daemon and decides
+ what to do with a multicast packet based on the source and
+ destination addresses. If you say Y here, the multicast router
+ will also be able to take interfaces and packet marks into
+ account and run multiple instances of userspace daemons
+ simultaneously, each one handling a single table.
+
+ If unsure, say N.
+
+config IPV6_PIMSM_V2
+ bool "IPv6: PIM-SM version 2 support"
+ depends on IPV6_MROUTE
+ help
+ Support for IPv6 PIM multicast routing protocol PIM-SMv2.
+ If unsure, say N.
+
+config IPV6_SEG6_LWTUNNEL
+ bool "IPv6: Segment Routing Header encapsulation support"
+ depends on IPV6
+ select LWTUNNEL
+ select DST_CACHE
+ select IPV6_MULTIPLE_TABLES
+ help
+ Support for encapsulation of packets within an outer IPv6
+ header and a Segment Routing Header using the lightweight
+ tunnels mechanism. Also enable support for advanced local
+ processing of SRv6 packets based on their active segment.
+
+ If unsure, say N.
+
+config IPV6_SEG6_HMAC
+ bool "IPv6: Segment Routing HMAC support"
+ depends on IPV6
+ select CRYPTO_LIB_SHA1
+ select CRYPTO_LIB_SHA256
+ select CRYPTO_LIB_UTILS
+ help
+ Support for HMAC signature generation and verification
+ of SR-enabled packets.
+
+ If unsure, say N.
+
+config IPV6_SEG6_BPF
+ def_bool y
+ depends on IPV6_SEG6_LWTUNNEL
+ depends on IPV6 = y
+
+config IPV6_RPL_LWTUNNEL
+ bool "IPv6: RPL Source Routing Header support"
+ depends on IPV6
+ select LWTUNNEL
+ select DST_CACHE
+ help
+ Support for RFC6554 RPL Source Routing Header using the lightweight
+ tunnels mechanism.
+
+ If unsure, say N.
+
+config IPV6_IOAM6_LWTUNNEL
+ bool "IPv6: IOAM Pre-allocated Trace insertion support"
+ depends on IPV6
+ select LWTUNNEL
+ select DST_CACHE
+ help
+ Support for the insertion of IOAM Pre-allocated Trace
+ Header using the lightweight tunnels mechanism.
+
+ If unsure, say N.
+
+endif # IPV6
diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile
index 8bacda109b7f..d283c59df4c1 100644
--- a/net/ipv6/Makefile
+++ b/net/ipv6/Makefile
@@ -1,37 +1,56 @@
+# SPDX-License-Identifier: GPL-2.0
#
# Makefile for the Linux TCP/IP (INET6) layer.
#
obj-$(CONFIG_IPV6) += ipv6.o
-ipv6-objs := af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o \
+ipv6-y := af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o \
+ addrlabel.o \
route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o udplite.o \
- raw.o protocol.o icmp.o mcast.o reassembly.o tcp_ipv6.o \
- exthdrs.o sysctl_net_ipv6.o datagram.o proc.o \
- ip6_flowlabel.o ipv6_syms.o inet6_connection_sock.o
+ raw.o icmp.o mcast.o reassembly.o tcp_ipv6.o ping.o \
+ exthdrs.o datagram.o ip6_flowlabel.o inet6_connection_sock.o \
+ udp_offload.o seg6.o fib6_notifier.o rpl.o ioam6.o
+
+ipv6-$(CONFIG_SYSCTL) += sysctl_net_ipv6.o
+ipv6-$(CONFIG_IPV6_MROUTE) += ip6mr.o
ipv6-$(CONFIG_XFRM) += xfrm6_policy.o xfrm6_state.o xfrm6_input.o \
- xfrm6_output.o
+ xfrm6_output.o xfrm6_protocol.o
ipv6-$(CONFIG_NETFILTER) += netfilter.o
ipv6-$(CONFIG_IPV6_MULTIPLE_TABLES) += fib6_rules.o
-ipv6-$(CONFIG_IPV6_MIP6) += mip6.o
-
-ipv6-objs += $(ipv6-y)
+ipv6-$(CONFIG_PROC_FS) += proc.o
+ipv6-$(CONFIG_SYN_COOKIES) += syncookies.o
+ipv6-$(CONFIG_NETLABEL) += calipso.o
+ipv6-$(CONFIG_IPV6_SEG6_LWTUNNEL) += seg6_iptunnel.o seg6_local.o
+ipv6-$(CONFIG_IPV6_SEG6_HMAC) += seg6_hmac.o
+ipv6-$(CONFIG_IPV6_RPL_LWTUNNEL) += rpl_iptunnel.o
+ipv6-$(CONFIG_IPV6_IOAM6_LWTUNNEL) += ioam6_iptunnel.o
obj-$(CONFIG_INET6_AH) += ah6.o
obj-$(CONFIG_INET6_ESP) += esp6.o
+obj-$(CONFIG_INET6_ESP_OFFLOAD) += esp6_offload.o
obj-$(CONFIG_INET6_IPCOMP) += ipcomp6.o
obj-$(CONFIG_INET6_XFRM_TUNNEL) += xfrm6_tunnel.o
obj-$(CONFIG_INET6_TUNNEL) += tunnel6.o
-obj-$(CONFIG_INET6_XFRM_MODE_TRANSPORT) += xfrm6_mode_transport.o
-obj-$(CONFIG_INET6_XFRM_MODE_TUNNEL) += xfrm6_mode_tunnel.o
-obj-$(CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION) += xfrm6_mode_ro.o
-obj-$(CONFIG_INET6_XFRM_MODE_BEET) += xfrm6_mode_beet.o
+obj-$(CONFIG_IPV6_MIP6) += mip6.o
+obj-$(CONFIG_IPV6_ILA) += ila/
obj-$(CONFIG_NETFILTER) += netfilter/
+obj-$(CONFIG_IPV6_VTI) += ip6_vti.o
obj-$(CONFIG_IPV6_SIT) += sit.o
obj-$(CONFIG_IPV6_TUNNEL) += ip6_tunnel.o
+obj-$(CONFIG_IPV6_GRE) += ip6_gre.o
+obj-$(CONFIG_IPV6_FOU) += fou6.o
-obj-y += exthdrs_core.o
+obj-y += addrconf_core.o exthdrs_core.o ip6_checksum.o ip6_icmp.o
+obj-$(CONFIG_INET) += output_core.o protocol.o \
+ ip6_offload.o tcpv6_offload.o exthdrs_offload.o
obj-$(subst m,y,$(CONFIG_IPV6)) += inet6_hashtables.o
+
+ifneq ($(CONFIG_IPV6),)
+obj-$(CONFIG_NET_UDP_TUNNEL) += ip6_udp_tunnel.o
+obj-y += mcast_snoop.o
+obj-$(CONFIG_TCP_AO) += tcp_ao.o
+endif
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index a5e8d207a51b..b66217d1b2f8 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1,17 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* IPv6 Address [auto]configuration
* Linux INET6 implementation
*
* Authors:
- * Pedro Roque <roque@di.fc.ul.pt>
+ * Pedro Roque <roque@di.fc.ul.pt>
* Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
- *
- * $Id: addrconf.c,v 1.69 2001/10/31 21:55:54 davem Exp $
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
/*
@@ -40,12 +34,16 @@
* status etc.
*/
+#define pr_fmt(fmt) "IPv6: " fmt
+
#include <linux/errno.h>
#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched/signal.h>
#include <linux/socket.h>
#include <linux/sockios.h>
-#include <linux/sched.h>
#include <linux/net.h>
+#include <linux/inet.h>
#include <linux/in6.h>
#include <linux/netdevice.h>
#include <linux/if_addr.h>
@@ -55,6 +53,7 @@
#include <linux/route.h>
#include <linux/inetdevice.h>
#include <linux/init.h>
+#include <linux/slab.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
@@ -62,10 +61,15 @@
#include <linux/delay.h>
#include <linux/notifier.h>
#include <linux/string.h>
+#include <linux/hash.h>
+#include <net/ip_tunnels.h>
+#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/snmp.h>
+#include <net/6lowpan.h>
+#include <net/firewire.h>
#include <net/ipv6.h>
#include <net/protocol.h>
#include <net/ndisc.h>
@@ -74,76 +78,106 @@
#include <net/tcp.h>
#include <net/ip.h>
#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/l3mdev.h>
+#include <net/netdev_lock.h>
#include <linux/if_tunnel.h>
#include <linux/rtnetlink.h>
-
-#ifdef CONFIG_IPV6_PRIVACY
+#include <linux/netconf.h>
#include <linux/random.h>
-#endif
-
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
+#include <linux/unaligned.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
+#include <linux/export.h>
+#include <linux/ioam6.h>
-/* Set to 3 to get tracing... */
-#define ACONF_DEBUG 2
+#define IPV6_MAX_STRLEN \
+ sizeof("ffff:ffff:ffff:ffff:ffff:ffff:255.255.255.255")
-#if ACONF_DEBUG >= 3
-#define ADBG(x) printk x
-#else
-#define ADBG(x)
-#endif
+static inline u32 cstamp_delta(unsigned long cstamp)
+{
+ return (cstamp - INITIAL_JIFFIES) * 100UL / HZ;
+}
-#define INFINITY_LIFE_TIME 0xFFFFFFFF
-#define TIME_DELTA(a,b) ((unsigned long)((long)(a) - (long)(b)))
+static inline s32 rfc3315_s14_backoff_init(s32 irt)
+{
+ /* multiply 'initial retransmission time' by 0.9 .. 1.1 */
+ u64 tmp = get_random_u32_inclusive(900000, 1100000) * (u64)irt;
+ do_div(tmp, 1000000);
+ return (s32)tmp;
+}
-#ifdef CONFIG_SYSCTL
-static void addrconf_sysctl_register(struct inet6_dev *idev, struct ipv6_devconf *p);
-static void addrconf_sysctl_unregister(struct ipv6_devconf *p);
-#endif
+static inline s32 rfc3315_s14_backoff_update(s32 rt, s32 mrt)
+{
+ /* multiply 'retransmission timeout' by 1.9 .. 2.1 */
+ u64 tmp = get_random_u32_inclusive(1900000, 2100000) * (u64)rt;
+ do_div(tmp, 1000000);
+ if ((s32)tmp > mrt) {
+ /* multiply 'maximum retransmission time' by 0.9 .. 1.1 */
+ tmp = get_random_u32_inclusive(900000, 1100000) * (u64)mrt;
+ do_div(tmp, 1000000);
+ }
+ return (s32)tmp;
+}
-#ifdef CONFIG_IPV6_PRIVACY
-static int __ipv6_regen_rndid(struct inet6_dev *idev);
-static int __ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpaddr);
-static void ipv6_regen_rndid(unsigned long data);
+#ifdef CONFIG_SYSCTL
+static int addrconf_sysctl_register(struct inet6_dev *idev);
+static void addrconf_sysctl_unregister(struct inet6_dev *idev);
+#else
+static inline int addrconf_sysctl_register(struct inet6_dev *idev)
+{
+ return 0;
+}
-static int desync_factor = MAX_DESYNC_FACTOR * HZ;
+static inline void addrconf_sysctl_unregister(struct inet6_dev *idev)
+{
+}
#endif
-static int ipv6_count_addresses(struct inet6_dev *idev);
+static void ipv6_gen_rnd_iid(struct in6_addr *addr);
-/*
- * Configured unicast address hash table
- */
-static struct inet6_ifaddr *inet6_addr_lst[IN6_ADDR_HSIZE];
-static DEFINE_RWLOCK(addrconf_hash_lock);
+static int ipv6_generate_eui64(u8 *eui, struct net_device *dev);
+static int ipv6_count_addresses(const struct inet6_dev *idev);
+static int ipv6_generate_stable_address(struct in6_addr *addr,
+ u8 dad_count,
+ const struct inet6_dev *idev);
+
+#define IN6_ADDR_HSIZE_SHIFT 8
+#define IN6_ADDR_HSIZE (1 << IN6_ADDR_HSIZE_SHIFT)
-static void addrconf_verify(unsigned long);
+static void addrconf_verify(struct net *net);
+static void addrconf_verify_rtnl(struct net *net);
-static DEFINE_TIMER(addr_chk_timer, addrconf_verify, 0, 0);
-static DEFINE_SPINLOCK(addrconf_verify_lock);
+static struct workqueue_struct *addrconf_wq;
static void addrconf_join_anycast(struct inet6_ifaddr *ifp);
static void addrconf_leave_anycast(struct inet6_ifaddr *ifp);
-static int addrconf_ifdown(struct net_device *dev, int how);
-
-static void addrconf_dad_start(struct inet6_ifaddr *ifp, u32 flags);
-static void addrconf_dad_timer(unsigned long data);
-static void addrconf_dad_completed(struct inet6_ifaddr *ifp);
-static void addrconf_dad_run(struct inet6_dev *idev);
-static void addrconf_rs_timer(unsigned long data);
+static void addrconf_type_change(struct net_device *dev,
+ unsigned long event);
+static int addrconf_ifdown(struct net_device *dev, bool unregister);
+
+static struct fib6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
+ int plen,
+ const struct net_device *dev,
+ u32 flags, u32 noflags,
+ bool no_gw);
+
+static void addrconf_dad_start(struct inet6_ifaddr *ifp);
+static void addrconf_dad_work(struct work_struct *w);
+static void addrconf_dad_completed(struct inet6_ifaddr *ifp, bool bump_id,
+ bool send_na);
+static void addrconf_dad_run(struct inet6_dev *idev, bool restart);
+static void addrconf_rs_timer(struct timer_list *t);
static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifa);
static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifa);
-static void inet6_prefix_notify(int event, struct inet6_dev *idev,
+static void inet6_prefix_notify(int event, struct inet6_dev *idev,
struct prefix_info *pinfo);
-static int ipv6_chk_same_addr(const struct in6_addr *addr, struct net_device *dev);
-
-static ATOMIC_NOTIFIER_HEAD(inet6addr_chain);
-struct ipv6_devconf ipv6_devconf __read_mostly = {
+static struct ipv6_devconf ipv6_devconf __read_mostly = {
.forwarding = 0,
.hop_limit = IPV6_DEFAULT_HOPLIMIT,
.mtu6 = IPV6_MIN_MTU,
@@ -151,28 +185,61 @@ struct ipv6_devconf ipv6_devconf __read_mostly = {
.accept_redirects = 1,
.autoconf = 1,
.force_mld_version = 0,
+ .mldv1_unsolicited_report_interval = 10 * HZ,
+ .mldv2_unsolicited_report_interval = HZ,
.dad_transmits = 1,
.rtr_solicits = MAX_RTR_SOLICITATIONS,
.rtr_solicit_interval = RTR_SOLICITATION_INTERVAL,
+ .rtr_solicit_max_interval = RTR_SOLICITATION_MAX_INTERVAL,
.rtr_solicit_delay = MAX_RTR_SOLICITATION_DELAY,
-#ifdef CONFIG_IPV6_PRIVACY
- .use_tempaddr = 0,
+ .use_tempaddr = 0,
.temp_valid_lft = TEMP_VALID_LIFETIME,
.temp_prefered_lft = TEMP_PREFERRED_LIFETIME,
+ .regen_min_advance = REGEN_MIN_ADVANCE,
.regen_max_retry = REGEN_MAX_RETRY,
.max_desync_factor = MAX_DESYNC_FACTOR,
-#endif
.max_addresses = IPV6_MAX_ADDRESSES,
.accept_ra_defrtr = 1,
+ .ra_defrtr_metric = IP6_RT_PRIO_USER,
+ .accept_ra_from_local = 0,
+ .accept_ra_min_hop_limit= 1,
+ .accept_ra_min_lft = 0,
.accept_ra_pinfo = 1,
#ifdef CONFIG_IPV6_ROUTER_PREF
.accept_ra_rtr_pref = 1,
.rtr_probe_interval = 60 * HZ,
#ifdef CONFIG_IPV6_ROUTE_INFO
+ .accept_ra_rt_info_min_plen = 0,
.accept_ra_rt_info_max_plen = 0,
#endif
#endif
.proxy_ndp = 0,
+ .accept_source_route = 0, /* we do not accept RH0 by default. */
+ .disable_ipv6 = 0,
+ .accept_dad = 0,
+ .suppress_frag_ndisc = 1,
+ .accept_ra_mtu = 1,
+ .stable_secret = {
+ .initialized = false,
+ },
+ .use_oif_addrs_only = 0,
+ .ignore_routes_with_linkdown = 0,
+ .keep_addr_on_down = 0,
+ .seg6_enabled = 0,
+#ifdef CONFIG_IPV6_SEG6_HMAC
+ .seg6_require_hmac = 0,
+#endif
+ .enhanced_dad = 1,
+ .addr_gen_mode = IN6_ADDR_GEN_MODE_EUI64,
+ .disable_policy = 0,
+ .rpl_seg_enabled = 0,
+ .ioam6_enabled = 0,
+ .ioam6_id = IOAM6_DEFAULT_IF_ID,
+ .ioam6_id_wide = IOAM6_DEFAULT_IF_ID_WIDE,
+ .ndisc_evict_nocarrier = 1,
+ .ra_honor_pio_life = 0,
+ .ra_honor_pio_pflag = 0,
+ .force_forwarding = 0,
};
static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
@@ -182,261 +249,255 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
.accept_ra = 1,
.accept_redirects = 1,
.autoconf = 1,
+ .force_mld_version = 0,
+ .mldv1_unsolicited_report_interval = 10 * HZ,
+ .mldv2_unsolicited_report_interval = HZ,
.dad_transmits = 1,
.rtr_solicits = MAX_RTR_SOLICITATIONS,
.rtr_solicit_interval = RTR_SOLICITATION_INTERVAL,
+ .rtr_solicit_max_interval = RTR_SOLICITATION_MAX_INTERVAL,
.rtr_solicit_delay = MAX_RTR_SOLICITATION_DELAY,
-#ifdef CONFIG_IPV6_PRIVACY
.use_tempaddr = 0,
.temp_valid_lft = TEMP_VALID_LIFETIME,
.temp_prefered_lft = TEMP_PREFERRED_LIFETIME,
+ .regen_min_advance = REGEN_MIN_ADVANCE,
.regen_max_retry = REGEN_MAX_RETRY,
.max_desync_factor = MAX_DESYNC_FACTOR,
-#endif
.max_addresses = IPV6_MAX_ADDRESSES,
.accept_ra_defrtr = 1,
+ .ra_defrtr_metric = IP6_RT_PRIO_USER,
+ .accept_ra_from_local = 0,
+ .accept_ra_min_hop_limit= 1,
+ .accept_ra_min_lft = 0,
.accept_ra_pinfo = 1,
#ifdef CONFIG_IPV6_ROUTER_PREF
.accept_ra_rtr_pref = 1,
.rtr_probe_interval = 60 * HZ,
#ifdef CONFIG_IPV6_ROUTE_INFO
+ .accept_ra_rt_info_min_plen = 0,
.accept_ra_rt_info_max_plen = 0,
#endif
#endif
.proxy_ndp = 0,
-};
-
-/* IPv6 Wildcard Address and Loopback Address defined by RFC2553 */
-#if 0
-const struct in6_addr in6addr_any = IN6ADDR_ANY_INIT;
+ .accept_source_route = 0, /* we do not accept RH0 by default. */
+ .disable_ipv6 = 0,
+ .accept_dad = 1,
+ .suppress_frag_ndisc = 1,
+ .accept_ra_mtu = 1,
+ .stable_secret = {
+ .initialized = false,
+ },
+ .use_oif_addrs_only = 0,
+ .ignore_routes_with_linkdown = 0,
+ .keep_addr_on_down = 0,
+ .seg6_enabled = 0,
+#ifdef CONFIG_IPV6_SEG6_HMAC
+ .seg6_require_hmac = 0,
#endif
-const struct in6_addr in6addr_loopback = IN6ADDR_LOOPBACK_INIT;
-
-#define IPV6_ADDR_SCOPE_TYPE(scope) ((scope) << 16)
+ .enhanced_dad = 1,
+ .addr_gen_mode = IN6_ADDR_GEN_MODE_EUI64,
+ .disable_policy = 0,
+ .rpl_seg_enabled = 0,
+ .ioam6_enabled = 0,
+ .ioam6_id = IOAM6_DEFAULT_IF_ID,
+ .ioam6_id_wide = IOAM6_DEFAULT_IF_ID_WIDE,
+ .ndisc_evict_nocarrier = 1,
+ .ra_honor_pio_life = 0,
+ .ra_honor_pio_pflag = 0,
+ .force_forwarding = 0,
+};
-static inline unsigned ipv6_addr_scope2type(unsigned scope)
+/* Check if link is ready: is it up and is a valid qdisc available */
+static inline bool addrconf_link_ready(const struct net_device *dev)
{
- switch(scope) {
- case IPV6_ADDR_SCOPE_NODELOCAL:
- return (IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_NODELOCAL) |
- IPV6_ADDR_LOOPBACK);
- case IPV6_ADDR_SCOPE_LINKLOCAL:
- return (IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_LINKLOCAL) |
- IPV6_ADDR_LINKLOCAL);
- case IPV6_ADDR_SCOPE_SITELOCAL:
- return (IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_SITELOCAL) |
- IPV6_ADDR_SITELOCAL);
- }
- return IPV6_ADDR_SCOPE_TYPE(scope);
+ return netif_oper_up(dev) && !qdisc_tx_is_noop(dev);
}
-int __ipv6_addr_type(const struct in6_addr *addr)
+static void addrconf_del_rs_timer(struct inet6_dev *idev)
{
- __be32 st;
-
- st = addr->s6_addr32[0];
-
- /* Consider all addresses with the first three bits different of
- 000 and 111 as unicasts.
- */
- if ((st & htonl(0xE0000000)) != htonl(0x00000000) &&
- (st & htonl(0xE0000000)) != htonl(0xE0000000))
- return (IPV6_ADDR_UNICAST |
- IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL));
-
- if ((st & htonl(0xFF000000)) == htonl(0xFF000000)) {
- /* multicast */
- /* addr-select 3.1 */
- return (IPV6_ADDR_MULTICAST |
- ipv6_addr_scope2type(IPV6_ADDR_MC_SCOPE(addr)));
- }
-
- if ((st & htonl(0xFFC00000)) == htonl(0xFE800000))
- return (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST |
- IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_LINKLOCAL)); /* addr-select 3.1 */
- if ((st & htonl(0xFFC00000)) == htonl(0xFEC00000))
- return (IPV6_ADDR_SITELOCAL | IPV6_ADDR_UNICAST |
- IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_SITELOCAL)); /* addr-select 3.1 */
-
- if ((addr->s6_addr32[0] | addr->s6_addr32[1]) == 0) {
- if (addr->s6_addr32[2] == 0) {
- if (addr->s6_addr32[3] == 0)
- return IPV6_ADDR_ANY;
-
- if (addr->s6_addr32[3] == htonl(0x00000001))
- return (IPV6_ADDR_LOOPBACK | IPV6_ADDR_UNICAST |
- IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_LINKLOCAL)); /* addr-select 3.4 */
-
- return (IPV6_ADDR_COMPATv4 | IPV6_ADDR_UNICAST |
- IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); /* addr-select 3.3 */
- }
-
- if (addr->s6_addr32[2] == htonl(0x0000ffff))
- return (IPV6_ADDR_MAPPED |
- IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); /* addr-select 3.3 */
- }
-
- return (IPV6_ADDR_RESERVED |
- IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); /* addr-select 3.4 */
+ if (timer_delete(&idev->rs_timer))
+ __in6_dev_put(idev);
}
-static void addrconf_del_timer(struct inet6_ifaddr *ifp)
+static void addrconf_del_dad_work(struct inet6_ifaddr *ifp)
{
- if (del_timer(&ifp->timer))
+ if (cancel_delayed_work(&ifp->dad_work))
__in6_ifa_put(ifp);
}
-enum addrconf_timer_t
+static void addrconf_mod_rs_timer(struct inet6_dev *idev,
+ unsigned long when)
{
- AC_NONE,
- AC_DAD,
- AC_RS,
-};
-
-static void addrconf_mod_timer(struct inet6_ifaddr *ifp,
- enum addrconf_timer_t what,
- unsigned long when)
-{
- if (!del_timer(&ifp->timer))
- in6_ifa_hold(ifp);
-
- switch (what) {
- case AC_DAD:
- ifp->timer.function = addrconf_dad_timer;
- break;
- case AC_RS:
- ifp->timer.function = addrconf_rs_timer;
- break;
- default:;
- }
- ifp->timer.expires = jiffies + when;
- add_timer(&ifp->timer);
+ if (!mod_timer(&idev->rs_timer, jiffies + when))
+ in6_dev_hold(idev);
}
-/* Nobody refers to this device, we may destroy it. */
-
-static void in6_dev_finish_destroy_rcu(struct rcu_head *head)
+static void addrconf_mod_dad_work(struct inet6_ifaddr *ifp,
+ unsigned long delay)
{
- struct inet6_dev *idev = container_of(head, struct inet6_dev, rcu);
- kfree(idev);
+ in6_ifa_hold(ifp);
+ if (mod_delayed_work(addrconf_wq, &ifp->dad_work, delay))
+ in6_ifa_put(ifp);
}
-void in6_dev_finish_destroy(struct inet6_dev *idev)
+static int snmp6_alloc_dev(struct inet6_dev *idev)
{
- struct net_device *dev = idev->dev;
- BUG_TRAP(idev->addr_list==NULL);
- BUG_TRAP(idev->mc_list==NULL);
-#ifdef NET_REFCNT_DEBUG
- printk(KERN_DEBUG "in6_dev_finish_destroy: %s\n", dev ? dev->name : "NIL");
-#endif
- dev_put(dev);
- if (!idev->dead) {
- printk("Freeing alive inet6 device %p\n", idev);
- return;
+ int i;
+
+ idev->stats.ipv6 = alloc_percpu_gfp(struct ipstats_mib, GFP_KERNEL_ACCOUNT);
+ if (!idev->stats.ipv6)
+ goto err_ip;
+
+ for_each_possible_cpu(i) {
+ struct ipstats_mib *addrconf_stats;
+ addrconf_stats = per_cpu_ptr(idev->stats.ipv6, i);
+ u64_stats_init(&addrconf_stats->syncp);
}
- snmp6_free_dev(idev);
- call_rcu(&idev->rcu, in6_dev_finish_destroy_rcu);
+
+
+ idev->stats.icmpv6dev = kzalloc(sizeof(struct icmpv6_mib_device),
+ GFP_KERNEL);
+ if (!idev->stats.icmpv6dev)
+ goto err_icmp;
+ idev->stats.icmpv6msgdev = kzalloc(sizeof(struct icmpv6msg_mib_device),
+ GFP_KERNEL_ACCOUNT);
+ if (!idev->stats.icmpv6msgdev)
+ goto err_icmpmsg;
+
+ return 0;
+
+err_icmpmsg:
+ kfree(idev->stats.icmpv6dev);
+err_icmp:
+ free_percpu(idev->stats.ipv6);
+err_ip:
+ return -ENOMEM;
}
-static struct inet6_dev * ipv6_add_dev(struct net_device *dev)
+static struct inet6_dev *ipv6_add_dev(struct net_device *dev)
{
struct inet6_dev *ndev;
+ int err = -ENOMEM;
ASSERT_RTNL();
+ netdev_ops_assert_locked(dev);
- if (dev->mtu < IPV6_MIN_MTU)
- return NULL;
-
- ndev = kzalloc(sizeof(struct inet6_dev), GFP_KERNEL);
+ if (dev->mtu < IPV6_MIN_MTU && dev != blackhole_netdev)
+ return ERR_PTR(-EINVAL);
- if (ndev == NULL)
- return NULL;
+ ndev = kzalloc(sizeof(*ndev), GFP_KERNEL_ACCOUNT);
+ if (!ndev)
+ return ERR_PTR(err);
rwlock_init(&ndev->lock);
ndev->dev = dev;
- memcpy(&ndev->cnf, &ipv6_devconf_dflt, sizeof(ndev->cnf));
+ INIT_LIST_HEAD(&ndev->addr_list);
+ timer_setup(&ndev->rs_timer, addrconf_rs_timer, 0);
+ memcpy(&ndev->cnf, dev_net(dev)->ipv6.devconf_dflt, sizeof(ndev->cnf));
+
+ if (ndev->cnf.stable_secret.initialized)
+ ndev->cnf.addr_gen_mode = IN6_ADDR_GEN_MODE_STABLE_PRIVACY;
+
ndev->cnf.mtu6 = dev->mtu;
- ndev->cnf.sysctl = NULL;
+ ndev->ra_mtu = 0;
ndev->nd_parms = neigh_parms_alloc(dev, &nd_tbl);
- if (ndev->nd_parms == NULL) {
+ if (!ndev->nd_parms) {
kfree(ndev);
- return NULL;
+ return ERR_PTR(err);
}
+ if (ndev->cnf.forwarding)
+ netif_disable_lro(dev);
/* We refer to the device */
- dev_hold(dev);
+ netdev_hold(dev, &ndev->dev_tracker, GFP_KERNEL);
if (snmp6_alloc_dev(ndev) < 0) {
- ADBG((KERN_WARNING
- "%s(): cannot allocate memory for statistics; dev=%s.\n",
- __FUNCTION__, dev->name));
+ netdev_dbg(dev, "%s: cannot allocate memory for statistics\n",
+ __func__);
neigh_parms_release(&nd_tbl, ndev->nd_parms);
- ndev->dead = 1;
- in6_dev_finish_destroy(ndev);
- return NULL;
+ netdev_put(dev, &ndev->dev_tracker);
+ kfree(ndev);
+ return ERR_PTR(err);
}
- if (snmp6_register_dev(ndev) < 0) {
- ADBG((KERN_WARNING
- "%s(): cannot create /proc/net/dev_snmp6/%s\n",
- __FUNCTION__, dev->name));
- neigh_parms_release(&nd_tbl, ndev->nd_parms);
- ndev->dead = 1;
- in6_dev_finish_destroy(ndev);
- return NULL;
+ if (dev != blackhole_netdev) {
+ if (snmp6_register_dev(ndev) < 0) {
+ netdev_dbg(dev, "%s: cannot create /proc/net/dev_snmp6/%s\n",
+ __func__, dev->name);
+ goto err_release;
+ }
}
+ /* One reference from device. */
+ refcount_set(&ndev->refcnt, 1);
- /* One reference from device. We must do this before
- * we invoke __ipv6_regen_rndid().
- */
- in6_dev_hold(ndev);
+ if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
+ ndev->cnf.accept_dad = -1;
-#ifdef CONFIG_IPV6_PRIVACY
- init_timer(&ndev->regen_timer);
- ndev->regen_timer.function = ipv6_regen_rndid;
- ndev->regen_timer.data = (unsigned long) ndev;
+#if IS_ENABLED(CONFIG_IPV6_SIT)
+ if (dev->type == ARPHRD_SIT && (dev->priv_flags & IFF_ISATAP)) {
+ pr_info("%s: Disabled Multicast RS\n", dev->name);
+ ndev->cnf.rtr_solicits = 0;
+ }
+#endif
+
+ INIT_LIST_HEAD(&ndev->tempaddr_list);
+ ndev->desync_factor = U32_MAX;
if ((dev->flags&IFF_LOOPBACK) ||
dev->type == ARPHRD_TUNNEL ||
-#if defined(CONFIG_IPV6_SIT) || defined(CONFIG_IPV6_SIT_MODULE)
+ dev->type == ARPHRD_TUNNEL6 ||
dev->type == ARPHRD_SIT ||
-#endif
dev->type == ARPHRD_NONE) {
- printk(KERN_INFO
- "%s: Disabled Privacy Extensions\n",
- dev->name);
ndev->cnf.use_tempaddr = -1;
- } else {
- in6_dev_hold(ndev);
- ipv6_regen_rndid((unsigned long) ndev);
}
-#endif
- if (netif_carrier_ok(dev))
+ ndev->token = in6addr_any;
+
+ if (netif_running(dev) && addrconf_link_ready(dev))
ndev->if_flags |= IF_READY;
+ ipv6_mc_init_dev(ndev);
+ ndev->tstamp = jiffies;
+ if (dev != blackhole_netdev) {
+ err = addrconf_sysctl_register(ndev);
+ if (err) {
+ ipv6_mc_destroy_dev(ndev);
+ snmp6_unregister_dev(ndev);
+ goto err_release;
+ }
+ }
/* protected by rtnl_lock */
rcu_assign_pointer(dev->ip6_ptr, ndev);
- ipv6_mc_init_dev(ndev);
- ndev->tstamp = jiffies;
-#ifdef CONFIG_SYSCTL
- neigh_sysctl_register(dev, ndev->nd_parms, NET_IPV6,
- NET_IPV6_NEIGH, "ipv6",
- &ndisc_ifinfo_sysctl_change,
- NULL);
- addrconf_sysctl_register(ndev, &ndev->cnf);
-#endif
+ if (dev != blackhole_netdev) {
+ /* Join interface-local all-node multicast group */
+ ipv6_dev_mc_inc(dev, &in6addr_interfacelocal_allnodes);
+
+ /* Join all-node multicast group */
+ ipv6_dev_mc_inc(dev, &in6addr_linklocal_allnodes);
+
+ /* Join all-router multicast group if forwarding is set */
+ if (ndev->cnf.forwarding && (dev->flags & IFF_MULTICAST))
+ ipv6_dev_mc_inc(dev, &in6addr_linklocal_allrouters);
+ }
return ndev;
+
+err_release:
+ neigh_parms_release(&nd_tbl, ndev->nd_parms);
+ ndev->dead = 1;
+ in6_dev_finish_destroy(ndev);
+ return ERR_PTR(err);
}
-static struct inet6_dev * ipv6_find_idev(struct net_device *dev)
+static struct inet6_dev *ipv6_find_idev(struct net_device *dev)
{
struct inet6_dev *idev;
ASSERT_RTNL();
- if ((idev = __in6_dev_get(dev)) == NULL) {
- if ((idev = ipv6_add_dev(dev)) == NULL)
- return NULL;
+ idev = __in6_dev_get(dev);
+ if (!idev) {
+ idev = ipv6_add_dev(dev);
+ if (IS_ERR(idev))
+ return idev;
}
if (dev->flags&IFF_UP)
@@ -444,906 +505,1752 @@ static struct inet6_dev * ipv6_find_idev(struct net_device *dev)
return idev;
}
+static int inet6_netconf_msgsize_devconf(int type)
+{
+ int size = NLMSG_ALIGN(sizeof(struct netconfmsg))
+ + nla_total_size(4); /* NETCONFA_IFINDEX */
+ bool all = false;
+
+ if (type == NETCONFA_ALL)
+ all = true;
+
+ if (all || type == NETCONFA_FORWARDING)
+ size += nla_total_size(4);
+#ifdef CONFIG_IPV6_MROUTE
+ if (all || type == NETCONFA_MC_FORWARDING)
+ size += nla_total_size(4);
+#endif
+ if (all || type == NETCONFA_PROXY_NEIGH)
+ size += nla_total_size(4);
+
+ if (all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN)
+ size += nla_total_size(4);
+
+ return size;
+}
+
+static int inet6_netconf_fill_devconf(struct sk_buff *skb, int ifindex,
+ struct ipv6_devconf *devconf, u32 portid,
+ u32 seq, int event, unsigned int flags,
+ int type)
+{
+ struct nlmsghdr *nlh;
+ struct netconfmsg *ncm;
+ bool all = false;
+
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct netconfmsg),
+ flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ if (type == NETCONFA_ALL)
+ all = true;
+
+ ncm = nlmsg_data(nlh);
+ ncm->ncm_family = AF_INET6;
+
+ if (nla_put_s32(skb, NETCONFA_IFINDEX, ifindex) < 0)
+ goto nla_put_failure;
+
+ if (!devconf)
+ goto out;
+
+ if ((all || type == NETCONFA_FORWARDING) &&
+ nla_put_s32(skb, NETCONFA_FORWARDING,
+ READ_ONCE(devconf->forwarding)) < 0)
+ goto nla_put_failure;
+#ifdef CONFIG_IPV6_MROUTE
+ if ((all || type == NETCONFA_MC_FORWARDING) &&
+ nla_put_s32(skb, NETCONFA_MC_FORWARDING,
+ atomic_read(&devconf->mc_forwarding)) < 0)
+ goto nla_put_failure;
+#endif
+ if ((all || type == NETCONFA_PROXY_NEIGH) &&
+ nla_put_s32(skb, NETCONFA_PROXY_NEIGH,
+ READ_ONCE(devconf->proxy_ndp)) < 0)
+ goto nla_put_failure;
+
+ if ((all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) &&
+ nla_put_s32(skb, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
+ READ_ONCE(devconf->ignore_routes_with_linkdown)) < 0)
+ goto nla_put_failure;
+
+out:
+ nlmsg_end(skb, nlh);
+ return 0;
+
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+void inet6_netconf_notify_devconf(struct net *net, int event, int type,
+ int ifindex, struct ipv6_devconf *devconf)
+{
+ struct sk_buff *skb;
+ int err = -ENOBUFS;
+
+ skb = nlmsg_new(inet6_netconf_msgsize_devconf(type), GFP_KERNEL);
+ if (!skb)
+ goto errout;
+
+ err = inet6_netconf_fill_devconf(skb, ifindex, devconf, 0, 0,
+ event, 0, type);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in inet6_netconf_msgsize_devconf() */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto errout;
+ }
+ rtnl_notify(skb, net, 0, RTNLGRP_IPV6_NETCONF, NULL, GFP_KERNEL);
+ return;
+errout:
+ rtnl_set_sk_err(net, RTNLGRP_IPV6_NETCONF, err);
+}
+
+static const struct nla_policy devconf_ipv6_policy[NETCONFA_MAX+1] = {
+ [NETCONFA_IFINDEX] = { .len = sizeof(int) },
+ [NETCONFA_FORWARDING] = { .len = sizeof(int) },
+ [NETCONFA_PROXY_NEIGH] = { .len = sizeof(int) },
+ [NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN] = { .len = sizeof(int) },
+};
+
+static int inet6_netconf_valid_get_req(struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ struct nlattr **tb,
+ struct netlink_ext_ack *extack)
+{
+ int i, err;
+
+ if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(struct netconfmsg))) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid header for netconf get request");
+ return -EINVAL;
+ }
+
+ if (!netlink_strict_get_check(skb))
+ return nlmsg_parse_deprecated(nlh, sizeof(struct netconfmsg),
+ tb, NETCONFA_MAX,
+ devconf_ipv6_policy, extack);
+
+ err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct netconfmsg),
+ tb, NETCONFA_MAX,
+ devconf_ipv6_policy, extack);
+ if (err)
+ return err;
+
+ for (i = 0; i <= NETCONFA_MAX; i++) {
+ if (!tb[i])
+ continue;
+
+ switch (i) {
+ case NETCONFA_IFINDEX:
+ break;
+ default:
+ NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in netconf get request");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static int inet6_netconf_get_devconf(struct sk_buff *in_skb,
+ struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(in_skb->sk);
+ struct nlattr *tb[NETCONFA_MAX+1];
+ struct inet6_dev *in6_dev = NULL;
+ struct net_device *dev = NULL;
+ struct sk_buff *skb;
+ struct ipv6_devconf *devconf;
+ int ifindex;
+ int err;
+
+ err = inet6_netconf_valid_get_req(in_skb, nlh, tb, extack);
+ if (err < 0)
+ return err;
+
+ if (!tb[NETCONFA_IFINDEX])
+ return -EINVAL;
+
+ err = -EINVAL;
+ ifindex = nla_get_s32(tb[NETCONFA_IFINDEX]);
+ switch (ifindex) {
+ case NETCONFA_IFINDEX_ALL:
+ devconf = net->ipv6.devconf_all;
+ break;
+ case NETCONFA_IFINDEX_DEFAULT:
+ devconf = net->ipv6.devconf_dflt;
+ break;
+ default:
+ dev = dev_get_by_index(net, ifindex);
+ if (!dev)
+ return -EINVAL;
+ in6_dev = in6_dev_get(dev);
+ if (!in6_dev)
+ goto errout;
+ devconf = &in6_dev->cnf;
+ break;
+ }
+
+ err = -ENOBUFS;
+ skb = nlmsg_new(inet6_netconf_msgsize_devconf(NETCONFA_ALL), GFP_KERNEL);
+ if (!skb)
+ goto errout;
+
+ err = inet6_netconf_fill_devconf(skb, ifindex, devconf,
+ NETLINK_CB(in_skb).portid,
+ nlh->nlmsg_seq, RTM_NEWNETCONF, 0,
+ NETCONFA_ALL);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in inet6_netconf_msgsize_devconf() */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto errout;
+ }
+ err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
+errout:
+ if (in6_dev)
+ in6_dev_put(in6_dev);
+ dev_put(dev);
+ return err;
+}
+
+/* Combine dev_addr_genid and dev_base_seq to detect changes.
+ */
+static u32 inet6_base_seq(const struct net *net)
+{
+ u32 res = atomic_read(&net->ipv6.dev_addr_genid) +
+ READ_ONCE(net->dev_base_seq);
+
+ /* Must not return 0 (see nl_dump_check_consistent()).
+ * Chose a value far away from 0.
+ */
+ if (!res)
+ res = 0x80000000;
+ return res;
+}
+
+static int inet6_netconf_dump_devconf(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ const struct nlmsghdr *nlh = cb->nlh;
+ struct net *net = sock_net(skb->sk);
+ struct {
+ unsigned long ifindex;
+ unsigned int all_default;
+ } *ctx = (void *)cb->ctx;
+ struct net_device *dev;
+ struct inet6_dev *idev;
+ int err = 0;
+
+ if (cb->strict_check) {
+ struct netlink_ext_ack *extack = cb->extack;
+ struct netconfmsg *ncm;
+
+ if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ncm))) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid header for netconf dump request");
+ return -EINVAL;
+ }
+
+ if (nlmsg_attrlen(nlh, sizeof(*ncm))) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid data after header in netconf dump request");
+ return -EINVAL;
+ }
+ }
+
+ rcu_read_lock();
+ for_each_netdev_dump(net, dev, ctx->ifindex) {
+ idev = __in6_dev_get(dev);
+ if (!idev)
+ continue;
+ err = inet6_netconf_fill_devconf(skb, dev->ifindex,
+ &idev->cnf,
+ NETLINK_CB(cb->skb).portid,
+ nlh->nlmsg_seq,
+ RTM_NEWNETCONF,
+ NLM_F_MULTI,
+ NETCONFA_ALL);
+ if (err < 0)
+ goto done;
+ }
+ if (ctx->all_default == 0) {
+ err = inet6_netconf_fill_devconf(skb, NETCONFA_IFINDEX_ALL,
+ net->ipv6.devconf_all,
+ NETLINK_CB(cb->skb).portid,
+ nlh->nlmsg_seq,
+ RTM_NEWNETCONF, NLM_F_MULTI,
+ NETCONFA_ALL);
+ if (err < 0)
+ goto done;
+ ctx->all_default++;
+ }
+ if (ctx->all_default == 1) {
+ err = inet6_netconf_fill_devconf(skb, NETCONFA_IFINDEX_DEFAULT,
+ net->ipv6.devconf_dflt,
+ NETLINK_CB(cb->skb).portid,
+ nlh->nlmsg_seq,
+ RTM_NEWNETCONF, NLM_F_MULTI,
+ NETCONFA_ALL);
+ if (err < 0)
+ goto done;
+ ctx->all_default++;
+ }
+done:
+ rcu_read_unlock();
+ return err;
+}
+
#ifdef CONFIG_SYSCTL
static void dev_forward_change(struct inet6_dev *idev)
{
struct net_device *dev;
struct inet6_ifaddr *ifa;
- struct in6_addr addr;
+ LIST_HEAD(tmp_addr_list);
if (!idev)
return;
dev = idev->dev;
- if (dev && (dev->flags & IFF_MULTICAST)) {
- ipv6_addr_all_routers(&addr);
-
- if (idev->cnf.forwarding)
- ipv6_dev_mc_inc(dev, &addr);
- else
- ipv6_dev_mc_dec(dev, &addr);
+ if (idev->cnf.forwarding)
+ dev_disable_lro(dev);
+ if (dev->flags & IFF_MULTICAST) {
+ if (idev->cnf.forwarding) {
+ ipv6_dev_mc_inc(dev, &in6addr_linklocal_allrouters);
+ ipv6_dev_mc_inc(dev, &in6addr_interfacelocal_allrouters);
+ ipv6_dev_mc_inc(dev, &in6addr_sitelocal_allrouters);
+ } else {
+ ipv6_dev_mc_dec(dev, &in6addr_linklocal_allrouters);
+ ipv6_dev_mc_dec(dev, &in6addr_interfacelocal_allrouters);
+ ipv6_dev_mc_dec(dev, &in6addr_sitelocal_allrouters);
+ }
}
- for (ifa=idev->addr_list; ifa; ifa=ifa->if_next) {
+
+ read_lock_bh(&idev->lock);
+ list_for_each_entry(ifa, &idev->addr_list, if_list) {
+ if (ifa->flags&IFA_F_TENTATIVE)
+ continue;
+ list_add_tail(&ifa->if_list_aux, &tmp_addr_list);
+ }
+ read_unlock_bh(&idev->lock);
+
+ while (!list_empty(&tmp_addr_list)) {
+ ifa = list_first_entry(&tmp_addr_list,
+ struct inet6_ifaddr, if_list_aux);
+ list_del(&ifa->if_list_aux);
if (idev->cnf.forwarding)
addrconf_join_anycast(ifa);
else
addrconf_leave_anycast(ifa);
}
+
+ inet6_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF,
+ NETCONFA_FORWARDING,
+ dev->ifindex, &idev->cnf);
}
-static void addrconf_forward_change(void)
+static void addrconf_forward_change(struct net *net, __s32 newf)
{
struct net_device *dev;
struct inet6_dev *idev;
- read_lock(&dev_base_lock);
- for (dev=dev_base; dev; dev=dev->next) {
- rcu_read_lock();
- idev = __in6_dev_get(dev);
+ for_each_netdev(net, dev) {
+ idev = __in6_dev_get_rtnl_net(dev);
if (idev) {
- int changed = (!idev->cnf.forwarding) ^ (!ipv6_devconf.forwarding);
- idev->cnf.forwarding = ipv6_devconf.forwarding;
+ int changed = (!idev->cnf.forwarding) ^ (!newf);
+ /* Disabling all.forwarding sets 0 to force_forwarding for all interfaces */
+ if (newf == 0)
+ WRITE_ONCE(idev->cnf.force_forwarding, 0);
+
+ WRITE_ONCE(idev->cnf.forwarding, newf);
if (changed)
dev_forward_change(idev);
}
- rcu_read_unlock();
}
- read_unlock(&dev_base_lock);
}
+
+static int addrconf_fixup_forwarding(const struct ctl_table *table, int *p, int newf)
+{
+ struct net *net = (struct net *)table->extra2;
+ int old;
+
+ if (!rtnl_net_trylock(net))
+ return restart_syscall();
+
+ old = *p;
+ WRITE_ONCE(*p, newf);
+
+ if (p == &net->ipv6.devconf_dflt->forwarding) {
+ if ((!newf) ^ (!old))
+ inet6_netconf_notify_devconf(net, RTM_NEWNETCONF,
+ NETCONFA_FORWARDING,
+ NETCONFA_IFINDEX_DEFAULT,
+ net->ipv6.devconf_dflt);
+ rtnl_net_unlock(net);
+ return 0;
+ }
+
+ if (p == &net->ipv6.devconf_all->forwarding) {
+ int old_dflt = net->ipv6.devconf_dflt->forwarding;
+
+ WRITE_ONCE(net->ipv6.devconf_dflt->forwarding, newf);
+ if ((!newf) ^ (!old_dflt))
+ inet6_netconf_notify_devconf(net, RTM_NEWNETCONF,
+ NETCONFA_FORWARDING,
+ NETCONFA_IFINDEX_DEFAULT,
+ net->ipv6.devconf_dflt);
+
+ addrconf_forward_change(net, newf);
+ if ((!newf) ^ (!old))
+ inet6_netconf_notify_devconf(net, RTM_NEWNETCONF,
+ NETCONFA_FORWARDING,
+ NETCONFA_IFINDEX_ALL,
+ net->ipv6.devconf_all);
+ } else if ((!newf) ^ (!old))
+ dev_forward_change((struct inet6_dev *)table->extra1);
+ rtnl_net_unlock(net);
+
+ if (newf)
+ rt6_purge_dflt_routers(net);
+ return 1;
+}
+
+static void addrconf_linkdown_change(struct net *net, __s32 newf)
+{
+ struct net_device *dev;
+ struct inet6_dev *idev;
+
+ for_each_netdev(net, dev) {
+ idev = __in6_dev_get_rtnl_net(dev);
+ if (idev) {
+ int changed = (!idev->cnf.ignore_routes_with_linkdown) ^ (!newf);
+
+ WRITE_ONCE(idev->cnf.ignore_routes_with_linkdown, newf);
+ if (changed)
+ inet6_netconf_notify_devconf(dev_net(dev),
+ RTM_NEWNETCONF,
+ NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
+ dev->ifindex,
+ &idev->cnf);
+ }
+ }
+}
+
+static int addrconf_fixup_linkdown(const struct ctl_table *table, int *p, int newf)
+{
+ struct net *net = (struct net *)table->extra2;
+ int old;
+
+ if (!rtnl_net_trylock(net))
+ return restart_syscall();
+
+ old = *p;
+ WRITE_ONCE(*p, newf);
+
+ if (p == &net->ipv6.devconf_dflt->ignore_routes_with_linkdown) {
+ if ((!newf) ^ (!old))
+ inet6_netconf_notify_devconf(net,
+ RTM_NEWNETCONF,
+ NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
+ NETCONFA_IFINDEX_DEFAULT,
+ net->ipv6.devconf_dflt);
+ rtnl_net_unlock(net);
+ return 0;
+ }
+
+ if (p == &net->ipv6.devconf_all->ignore_routes_with_linkdown) {
+ WRITE_ONCE(net->ipv6.devconf_dflt->ignore_routes_with_linkdown, newf);
+ addrconf_linkdown_change(net, newf);
+ if ((!newf) ^ (!old))
+ inet6_netconf_notify_devconf(net,
+ RTM_NEWNETCONF,
+ NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
+ NETCONFA_IFINDEX_ALL,
+ net->ipv6.devconf_all);
+ }
+
+ rtnl_net_unlock(net);
+
+ return 1;
+}
+
#endif
/* Nobody refers to this ifaddr, destroy it */
-
void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp)
{
- BUG_TRAP(ifp->if_next==NULL);
- BUG_TRAP(ifp->lst_next==NULL);
+ WARN_ON(!hlist_unhashed(&ifp->addr_lst));
+
#ifdef NET_REFCNT_DEBUG
- printk(KERN_DEBUG "inet6_ifa_finish_destroy\n");
+ pr_debug("%s\n", __func__);
#endif
in6_dev_put(ifp->idev);
- if (del_timer(&ifp->timer))
- printk("Timer is still running, when freeing ifa=%p\n", ifp);
+ if (cancel_delayed_work(&ifp->dad_work))
+ pr_notice("delayed DAD work was pending while freeing ifa=%p\n",
+ ifp);
- if (!ifp->dead) {
- printk("Freeing alive inet6 address %p\n", ifp);
+ if (ifp->state != INET6_IFADDR_STATE_DEAD) {
+ pr_warn("Freeing alive inet6 address %p\n", ifp);
return;
}
- dst_release(&ifp->rt->u.dst);
- kfree(ifp);
+ kfree_rcu(ifp, rcu);
}
static void
ipv6_link_dev_addr(struct inet6_dev *idev, struct inet6_ifaddr *ifp)
{
- struct inet6_ifaddr *ifa, **ifap;
+ struct list_head *p;
int ifp_scope = ipv6_addr_src_scope(&ifp->addr);
/*
* Each device address list is sorted in order of scope -
* global before linklocal.
*/
- for (ifap = &idev->addr_list; (ifa = *ifap) != NULL;
- ifap = &ifa->if_next) {
+ list_for_each(p, &idev->addr_list) {
+ struct inet6_ifaddr *ifa
+ = list_entry(p, struct inet6_ifaddr, if_list);
if (ifp_scope >= ipv6_addr_src_scope(&ifa->addr))
break;
}
- ifp->if_next = *ifap;
- *ifap = ifp;
+ list_add_tail_rcu(&ifp->if_list, p);
+}
+
+static u32 inet6_addr_hash(const struct net *net, const struct in6_addr *addr)
+{
+ u32 val = __ipv6_addr_jhash(addr, net_hash_mix(net));
+
+ return hash_32(val, IN6_ADDR_HSIZE_SHIFT);
+}
+
+static bool ipv6_chk_same_addr(struct net *net, const struct in6_addr *addr,
+ struct net_device *dev, unsigned int hash)
+{
+ struct inet6_ifaddr *ifp;
+
+ hlist_for_each_entry(ifp, &net->ipv6.inet6_addr_lst[hash], addr_lst) {
+ if (ipv6_addr_equal(&ifp->addr, addr)) {
+ if (!dev || ifp->idev->dev == dev)
+ return true;
+ }
+ }
+ return false;
+}
+
+static int ipv6_add_addr_hash(struct net_device *dev, struct inet6_ifaddr *ifa)
+{
+ struct net *net = dev_net(dev);
+ unsigned int hash = inet6_addr_hash(net, &ifa->addr);
+ int err = 0;
+
+ spin_lock_bh(&net->ipv6.addrconf_hash_lock);
+
+ /* Ignore adding duplicate addresses on an interface */
+ if (ipv6_chk_same_addr(net, &ifa->addr, dev, hash)) {
+ netdev_dbg(dev, "ipv6_add_addr: already assigned\n");
+ err = -EEXIST;
+ } else {
+ hlist_add_head_rcu(&ifa->addr_lst, &net->ipv6.inet6_addr_lst[hash]);
+ }
+
+ spin_unlock_bh(&net->ipv6.addrconf_hash_lock);
+
+ return err;
}
/* On success it returns ifp with increased reference count */
static struct inet6_ifaddr *
-ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, int pfxlen,
- int scope, u32 flags)
+ipv6_add_addr(struct inet6_dev *idev, struct ifa6_config *cfg,
+ bool can_block, struct netlink_ext_ack *extack)
{
+ gfp_t gfp_flags = can_block ? GFP_KERNEL : GFP_ATOMIC;
+ int addr_type = ipv6_addr_type(cfg->pfx);
+ struct net *net = dev_net(idev->dev);
struct inet6_ifaddr *ifa = NULL;
- struct rt6_info *rt;
- int hash;
+ struct fib6_info *f6i = NULL;
int err = 0;
- rcu_read_lock_bh();
- if (idev->dead) {
- err = -ENODEV; /*XXX*/
- goto out2;
+ if (addr_type == IPV6_ADDR_ANY) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid address");
+ return ERR_PTR(-EADDRNOTAVAIL);
+ } else if (addr_type & IPV6_ADDR_MULTICAST &&
+ !(cfg->ifa_flags & IFA_F_MCAUTOJOIN)) {
+ NL_SET_ERR_MSG_MOD(extack, "Cannot assign multicast address without \"IFA_F_MCAUTOJOIN\" flag");
+ return ERR_PTR(-EADDRNOTAVAIL);
+ } else if (!(idev->dev->flags & IFF_LOOPBACK) &&
+ !netif_is_l3_master(idev->dev) &&
+ addr_type & IPV6_ADDR_LOOPBACK) {
+ NL_SET_ERR_MSG_MOD(extack, "Cannot assign loopback address on this device");
+ return ERR_PTR(-EADDRNOTAVAIL);
}
- write_lock(&addrconf_hash_lock);
+ if (idev->dead) {
+ NL_SET_ERR_MSG_MOD(extack, "device is going away");
+ err = -ENODEV;
+ goto out;
+ }
- /* Ignore adding duplicate addresses on an interface */
- if (ipv6_chk_same_addr(addr, idev->dev)) {
- ADBG(("ipv6_add_addr: already assigned\n"));
- err = -EEXIST;
+ if (idev->cnf.disable_ipv6) {
+ NL_SET_ERR_MSG_MOD(extack, "IPv6 is disabled on this device");
+ err = -EACCES;
goto out;
}
- ifa = kzalloc(sizeof(struct inet6_ifaddr), GFP_ATOMIC);
+ /* validator notifier needs to be blocking;
+ * do not call in atomic context
+ */
+ if (can_block) {
+ struct in6_validator_info i6vi = {
+ .i6vi_addr = *cfg->pfx,
+ .i6vi_dev = idev,
+ .extack = extack,
+ };
- if (ifa == NULL) {
- ADBG(("ipv6_add_addr: malloc failed\n"));
+ err = inet6addr_validator_notifier_call_chain(NETDEV_UP, &i6vi);
+ err = notifier_to_errno(err);
+ if (err < 0)
+ goto out;
+ }
+
+ ifa = kzalloc(sizeof(*ifa), gfp_flags | __GFP_ACCOUNT);
+ if (!ifa) {
err = -ENOBUFS;
goto out;
}
- rt = addrconf_dst_alloc(idev, addr, 0);
- if (IS_ERR(rt)) {
- err = PTR_ERR(rt);
+ f6i = addrconf_f6i_alloc(net, idev, cfg->pfx, false, gfp_flags, extack);
+ if (IS_ERR(f6i)) {
+ err = PTR_ERR(f6i);
+ f6i = NULL;
goto out;
}
- ipv6_addr_copy(&ifa->addr, addr);
+ neigh_parms_data_state_setall(idev->nd_parms);
+
+ ifa->addr = *cfg->pfx;
+ if (cfg->peer_pfx)
+ ifa->peer_addr = *cfg->peer_pfx;
spin_lock_init(&ifa->lock);
- init_timer(&ifa->timer);
- ifa->timer.data = (unsigned long) ifa;
- ifa->scope = scope;
- ifa->prefix_len = pfxlen;
- ifa->flags = flags | IFA_F_TENTATIVE;
+ INIT_DELAYED_WORK(&ifa->dad_work, addrconf_dad_work);
+ INIT_HLIST_NODE(&ifa->addr_lst);
+ ifa->scope = cfg->scope;
+ ifa->prefix_len = cfg->plen;
+ ifa->rt_priority = cfg->rt_priority;
+ ifa->flags = cfg->ifa_flags;
+ ifa->ifa_proto = cfg->ifa_proto;
+ /* No need to add the TENTATIVE flag for addresses with NODAD */
+ if (!(cfg->ifa_flags & IFA_F_NODAD))
+ ifa->flags |= IFA_F_TENTATIVE;
+ ifa->valid_lft = cfg->valid_lft;
+ ifa->prefered_lft = cfg->preferred_lft;
ifa->cstamp = ifa->tstamp = jiffies;
+ ifa->tokenized = false;
- ifa->rt = rt;
+ ifa->rt = f6i;
ifa->idev = idev;
in6_dev_hold(idev);
+
/* For caller */
- in6_ifa_hold(ifa);
+ refcount_set(&ifa->refcnt, 1);
- /* Add to big hash table */
- hash = ipv6_addr_hash(addr);
+ rcu_read_lock();
- ifa->lst_next = inet6_addr_lst[hash];
- inet6_addr_lst[hash] = ifa;
- in6_ifa_hold(ifa);
- write_unlock(&addrconf_hash_lock);
+ err = ipv6_add_addr_hash(idev->dev, ifa);
+ if (err < 0) {
+ rcu_read_unlock();
+ goto out;
+ }
+
+ write_lock_bh(&idev->lock);
- write_lock(&idev->lock);
/* Add to inet6_dev unicast addr list. */
ipv6_link_dev_addr(idev, ifa);
-#ifdef CONFIG_IPV6_PRIVACY
if (ifa->flags&IFA_F_TEMPORARY) {
- ifa->tmp_next = idev->tempaddr_list;
- idev->tempaddr_list = ifa;
+ list_add(&ifa->tmp_list, &idev->tempaddr_list);
in6_ifa_hold(ifa);
}
-#endif
in6_ifa_hold(ifa);
- write_unlock(&idev->lock);
-out2:
- rcu_read_unlock_bh();
+ write_unlock_bh(&idev->lock);
+
+ rcu_read_unlock();
- if (likely(err == 0))
- atomic_notifier_call_chain(&inet6addr_chain, NETDEV_UP, ifa);
- else {
- kfree(ifa);
+ inet6addr_notifier_call_chain(NETDEV_UP, ifa);
+out:
+ if (unlikely(err < 0)) {
+ fib6_info_release(f6i);
+
+ if (ifa) {
+ if (ifa->idev)
+ in6_dev_put(ifa->idev);
+ kfree(ifa);
+ }
ifa = ERR_PTR(err);
}
return ifa;
-out:
- write_unlock(&addrconf_hash_lock);
- goto out2;
}
-/* This function wants to get referenced ifp and releases it before return */
+enum cleanup_prefix_rt_t {
+ CLEANUP_PREFIX_RT_NOP, /* no cleanup action for prefix route */
+ CLEANUP_PREFIX_RT_DEL, /* delete the prefix route */
+ CLEANUP_PREFIX_RT_EXPIRE, /* update the lifetime of the prefix route */
+};
-static void ipv6_del_addr(struct inet6_ifaddr *ifp)
+/*
+ * Check, whether the prefix for ifp would still need a prefix route
+ * after deleting ifp. The function returns one of the CLEANUP_PREFIX_RT_*
+ * constants.
+ *
+ * 1) we don't purge prefix if address was not permanent.
+ * prefix is managed by its own lifetime.
+ * 2) we also don't purge, if the address was IFA_F_NOPREFIXROUTE.
+ * 3) if there are no addresses, delete prefix.
+ * 4) if there are still other permanent address(es),
+ * corresponding prefix is still permanent.
+ * 5) if there are still other addresses with IFA_F_NOPREFIXROUTE,
+ * don't purge the prefix, assume user space is managing it.
+ * 6) otherwise, update prefix lifetime to the
+ * longest valid lifetime among the corresponding
+ * addresses on the device.
+ * Note: subsequent RA will update lifetime.
+ **/
+static enum cleanup_prefix_rt_t
+check_cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long *expires)
{
- struct inet6_ifaddr *ifa, **ifap;
+ struct inet6_ifaddr *ifa;
struct inet6_dev *idev = ifp->idev;
- int hash;
- int deleted = 0, onlink = 0;
- unsigned long expires = jiffies;
+ unsigned long lifetime;
+ enum cleanup_prefix_rt_t action = CLEANUP_PREFIX_RT_DEL;
- hash = ipv6_addr_hash(&ifp->addr);
+ *expires = jiffies;
- ifp->dead = 1;
+ list_for_each_entry(ifa, &idev->addr_list, if_list) {
+ if (ifa == ifp)
+ continue;
+ if (ifa->prefix_len != ifp->prefix_len ||
+ !ipv6_prefix_equal(&ifa->addr, &ifp->addr,
+ ifp->prefix_len))
+ continue;
+ if (ifa->flags & (IFA_F_PERMANENT | IFA_F_NOPREFIXROUTE))
+ return CLEANUP_PREFIX_RT_NOP;
- write_lock_bh(&addrconf_hash_lock);
- for (ifap = &inet6_addr_lst[hash]; (ifa=*ifap) != NULL;
- ifap = &ifa->lst_next) {
- if (ifa == ifp) {
- *ifap = ifa->lst_next;
- __in6_ifa_put(ifp);
- ifa->lst_next = NULL;
- break;
- }
+ action = CLEANUP_PREFIX_RT_EXPIRE;
+
+ spin_lock(&ifa->lock);
+
+ lifetime = addrconf_timeout_fixup(ifa->valid_lft, HZ);
+ /*
+ * Note: Because this address is
+ * not permanent, lifetime <
+ * LONG_MAX / HZ here.
+ */
+ if (time_before(*expires, ifa->tstamp + lifetime * HZ))
+ *expires = ifa->tstamp + lifetime * HZ;
+ spin_unlock(&ifa->lock);
}
- write_unlock_bh(&addrconf_hash_lock);
- write_lock_bh(&idev->lock);
-#ifdef CONFIG_IPV6_PRIVACY
- if (ifp->flags&IFA_F_TEMPORARY) {
- for (ifap = &idev->tempaddr_list; (ifa=*ifap) != NULL;
- ifap = &ifa->tmp_next) {
- if (ifa == ifp) {
- *ifap = ifa->tmp_next;
- if (ifp->ifpub) {
- in6_ifa_put(ifp->ifpub);
- ifp->ifpub = NULL;
- }
- __in6_ifa_put(ifp);
- ifa->tmp_next = NULL;
- break;
+ return action;
+}
+
+static void
+cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires,
+ bool del_rt, bool del_peer)
+{
+ struct fib6_table *table;
+ struct fib6_info *f6i;
+
+ f6i = addrconf_get_prefix_route(del_peer ? &ifp->peer_addr : &ifp->addr,
+ ifp->prefix_len,
+ ifp->idev->dev, 0, RTF_DEFAULT, true);
+ if (f6i) {
+ if (del_rt)
+ ip6_del_rt(dev_net(ifp->idev->dev), f6i, false);
+ else {
+ if (!(f6i->fib6_flags & RTF_EXPIRES)) {
+ table = f6i->fib6_table;
+ spin_lock_bh(&table->tb6_lock);
+
+ fib6_set_expires(f6i, expires);
+ fib6_add_gc_list(f6i);
+
+ spin_unlock_bh(&table->tb6_lock);
}
+ fib6_info_release(f6i);
}
}
-#endif
+}
- for (ifap = &idev->addr_list; (ifa=*ifap) != NULL;) {
- if (ifa == ifp) {
- *ifap = ifa->if_next;
- __in6_ifa_put(ifp);
- ifa->if_next = NULL;
- if (!(ifp->flags & IFA_F_PERMANENT) || onlink > 0)
- break;
- deleted = 1;
- continue;
- } else if (ifp->flags & IFA_F_PERMANENT) {
- if (ipv6_prefix_equal(&ifa->addr, &ifp->addr,
- ifp->prefix_len)) {
- if (ifa->flags & IFA_F_PERMANENT) {
- onlink = 1;
- if (deleted)
- break;
- } else {
- unsigned long lifetime;
-
- if (!onlink)
- onlink = -1;
-
- spin_lock(&ifa->lock);
- lifetime = min_t(unsigned long,
- ifa->valid_lft, 0x7fffffffUL/HZ);
- if (time_before(expires,
- ifa->tstamp + lifetime * HZ))
- expires = ifa->tstamp + lifetime * HZ;
- spin_unlock(&ifa->lock);
- }
- }
+
+/* This function wants to get referenced ifp and releases it before return */
+
+static void ipv6_del_addr(struct inet6_ifaddr *ifp)
+{
+ enum cleanup_prefix_rt_t action = CLEANUP_PREFIX_RT_NOP;
+ struct net *net = dev_net(ifp->idev->dev);
+ unsigned long expires;
+ int state;
+
+ ASSERT_RTNL();
+
+ spin_lock_bh(&ifp->lock);
+ state = ifp->state;
+ ifp->state = INET6_IFADDR_STATE_DEAD;
+ spin_unlock_bh(&ifp->lock);
+
+ if (state == INET6_IFADDR_STATE_DEAD)
+ goto out;
+
+ spin_lock_bh(&net->ipv6.addrconf_hash_lock);
+ hlist_del_init_rcu(&ifp->addr_lst);
+ spin_unlock_bh(&net->ipv6.addrconf_hash_lock);
+
+ write_lock_bh(&ifp->idev->lock);
+
+ if (ifp->flags&IFA_F_TEMPORARY) {
+ list_del(&ifp->tmp_list);
+ if (ifp->ifpub) {
+ in6_ifa_put(ifp->ifpub);
+ ifp->ifpub = NULL;
}
- ifap = &ifa->if_next;
+ __in6_ifa_put(ifp);
}
- write_unlock_bh(&idev->lock);
- ipv6_ifa_notify(RTM_DELADDR, ifp);
+ if (!(ifp->flags & IFA_F_NOPREFIXROUTE))
+ action = check_cleanup_prefix_route(ifp, &expires);
- atomic_notifier_call_chain(&inet6addr_chain, NETDEV_DOWN, ifp);
+ list_del_rcu(&ifp->if_list);
+ __in6_ifa_put(ifp);
- addrconf_del_timer(ifp);
+ write_unlock_bh(&ifp->idev->lock);
- /*
- * Purge or update corresponding prefix
- *
- * 1) we don't purge prefix here if address was not permanent.
- * prefix is managed by its own lifetime.
- * 2) if there're no addresses, delete prefix.
- * 3) if there're still other permanent address(es),
- * corresponding prefix is still permanent.
- * 4) otherwise, update prefix lifetime to the
- * longest valid lifetime among the corresponding
- * addresses on the device.
- * Note: subsequent RA will update lifetime.
- *
- * --yoshfuji
- */
- if ((ifp->flags & IFA_F_PERMANENT) && onlink < 1) {
- struct in6_addr prefix;
- struct rt6_info *rt;
+ addrconf_del_dad_work(ifp);
+
+ ipv6_ifa_notify(RTM_DELADDR, ifp);
- ipv6_addr_prefix(&prefix, &ifp->addr, ifp->prefix_len);
- rt = rt6_lookup(&prefix, NULL, ifp->idev->dev->ifindex, 1);
+ inet6addr_notifier_call_chain(NETDEV_DOWN, ifp);
- if (rt && ((rt->rt6i_flags & (RTF_GATEWAY | RTF_DEFAULT)) == 0)) {
- if (onlink == 0) {
- ip6_del_rt(rt);
- rt = NULL;
- } else if (!(rt->rt6i_flags & RTF_EXPIRES)) {
- rt->rt6i_expires = expires;
- rt->rt6i_flags |= RTF_EXPIRES;
- }
- }
- dst_release(&rt->u.dst);
+ if (action != CLEANUP_PREFIX_RT_NOP) {
+ cleanup_prefix_route(ifp, expires,
+ action == CLEANUP_PREFIX_RT_DEL, false);
}
+ /* clean up prefsrc entries */
+ rt6_remove_prefsrc(ifp);
+out:
in6_ifa_put(ifp);
}
-#ifdef CONFIG_IPV6_PRIVACY
-static int ipv6_create_tempaddr(struct inet6_ifaddr *ifp, struct inet6_ifaddr *ift)
+static unsigned long ipv6_get_regen_advance(const struct inet6_dev *idev)
+{
+ return READ_ONCE(idev->cnf.regen_min_advance) +
+ READ_ONCE(idev->cnf.regen_max_retry) *
+ READ_ONCE(idev->cnf.dad_transmits) *
+ max(NEIGH_VAR(idev->nd_parms, RETRANS_TIME), HZ/100) / HZ;
+}
+
+static int ipv6_create_tempaddr(struct inet6_ifaddr *ifp, bool block)
{
struct inet6_dev *idev = ifp->idev;
- struct in6_addr addr, *tmpaddr;
- unsigned long tmp_prefered_lft, tmp_valid_lft, tmp_cstamp, tmp_tstamp;
- int tmp_plen;
+ unsigned long tmp_tstamp, age;
+ unsigned long regen_advance;
+ unsigned long now = jiffies;
+ u32 if_public_preferred_lft;
+ s32 cnf_temp_preferred_lft;
+ struct inet6_ifaddr *ift;
+ struct ifa6_config cfg;
+ long max_desync_factor;
+ struct in6_addr addr;
int ret = 0;
- int max_addresses;
- write_lock(&idev->lock);
- if (ift) {
- spin_lock_bh(&ift->lock);
- memcpy(&addr.s6_addr[8], &ift->addr.s6_addr[8], 8);
- spin_unlock_bh(&ift->lock);
- tmpaddr = &addr;
- } else {
- tmpaddr = NULL;
- }
+ write_lock_bh(&idev->lock);
+
retry:
in6_dev_hold(idev);
- if (idev->cnf.use_tempaddr <= 0) {
- write_unlock(&idev->lock);
- printk(KERN_INFO
- "ipv6_create_tempaddr(): use_tempaddr is disabled.\n");
+ if (READ_ONCE(idev->cnf.use_tempaddr) <= 0) {
+ write_unlock_bh(&idev->lock);
+ pr_info("%s: use_tempaddr is disabled\n", __func__);
in6_dev_put(idev);
ret = -1;
goto out;
}
spin_lock_bh(&ifp->lock);
- if (ifp->regen_count++ >= idev->cnf.regen_max_retry) {
- idev->cnf.use_tempaddr = -1; /*XXX*/
+ if (ifp->regen_count++ >= READ_ONCE(idev->cnf.regen_max_retry)) {
+ WRITE_ONCE(idev->cnf.use_tempaddr, -1); /*XXX*/
spin_unlock_bh(&ifp->lock);
- write_unlock(&idev->lock);
- printk(KERN_WARNING
- "ipv6_create_tempaddr(): regeneration time exceeded. disabled temporary address support.\n");
+ write_unlock_bh(&idev->lock);
+ pr_warn("%s: regeneration time exceeded - disabled temporary address support\n",
+ __func__);
in6_dev_put(idev);
ret = -1;
goto out;
}
in6_ifa_hold(ifp);
memcpy(addr.s6_addr, ifp->addr.s6_addr, 8);
- if (__ipv6_try_regen_rndid(idev, tmpaddr) < 0) {
- spin_unlock_bh(&ifp->lock);
- write_unlock(&idev->lock);
- printk(KERN_WARNING
- "ipv6_create_tempaddr(): regeneration of randomized interface id failed.\n");
- in6_ifa_put(ifp);
- in6_dev_put(idev);
- ret = -1;
- goto out;
+ ipv6_gen_rnd_iid(&addr);
+
+ age = (now - ifp->tstamp) / HZ;
+
+ regen_advance = ipv6_get_regen_advance(idev);
+
+ /* recalculate max_desync_factor each time and update
+ * idev->desync_factor if it's larger
+ */
+ cnf_temp_preferred_lft = READ_ONCE(idev->cnf.temp_prefered_lft);
+ max_desync_factor = min_t(long,
+ READ_ONCE(idev->cnf.max_desync_factor),
+ cnf_temp_preferred_lft - regen_advance);
+
+ if (unlikely(idev->desync_factor > max_desync_factor)) {
+ if (max_desync_factor > 0) {
+ get_random_bytes(&idev->desync_factor,
+ sizeof(idev->desync_factor));
+ idev->desync_factor %= max_desync_factor;
+ } else {
+ idev->desync_factor = 0;
+ }
}
- memcpy(&addr.s6_addr[8], idev->rndid, 8);
- tmp_valid_lft = min_t(__u32,
- ifp->valid_lft,
- idev->cnf.temp_valid_lft);
- tmp_prefered_lft = min_t(__u32,
- ifp->prefered_lft,
- idev->cnf.temp_prefered_lft - desync_factor / HZ);
- tmp_plen = ifp->prefix_len;
- max_addresses = idev->cnf.max_addresses;
- tmp_cstamp = ifp->cstamp;
+
+ if_public_preferred_lft = ifp->prefered_lft;
+
+ memset(&cfg, 0, sizeof(cfg));
+ cfg.valid_lft = min_t(__u32, ifp->valid_lft,
+ READ_ONCE(idev->cnf.temp_valid_lft) + age);
+ cfg.preferred_lft = cnf_temp_preferred_lft + age - idev->desync_factor;
+ cfg.preferred_lft = min_t(__u32, if_public_preferred_lft, cfg.preferred_lft);
+ cfg.preferred_lft = min_t(__u32, cfg.valid_lft, cfg.preferred_lft);
+
+ cfg.plen = ifp->prefix_len;
tmp_tstamp = ifp->tstamp;
spin_unlock_bh(&ifp->lock);
- write_unlock(&idev->lock);
- ift = !max_addresses ||
- ipv6_count_addresses(idev) < max_addresses ?
- ipv6_add_addr(idev, &addr, tmp_plen,
- ipv6_addr_type(&addr)&IPV6_ADDR_SCOPE_MASK, IFA_F_TEMPORARY) : NULL;
- if (!ift || IS_ERR(ift)) {
+ write_unlock_bh(&idev->lock);
+
+ /* From RFC 4941:
+ *
+ * A temporary address is created only if this calculated Preferred
+ * Lifetime is greater than REGEN_ADVANCE time units. In
+ * particular, an implementation must not create a temporary address
+ * with a zero Preferred Lifetime.
+ *
+ * ...
+ *
+ * When creating a temporary address, the lifetime values MUST be
+ * derived from the corresponding prefix as follows:
+ *
+ * ...
+ *
+ * * Its Preferred Lifetime is the lower of the Preferred Lifetime
+ * of the public address or TEMP_PREFERRED_LIFETIME -
+ * DESYNC_FACTOR.
+ *
+ * To comply with the RFC's requirements, clamp the preferred lifetime
+ * to a minimum of regen_advance, unless that would exceed valid_lft or
+ * ifp->prefered_lft.
+ *
+ * Use age calculation as in addrconf_verify to avoid unnecessary
+ * temporary addresses being generated.
+ */
+ age = (now - tmp_tstamp + ADDRCONF_TIMER_FUZZ_MINUS) / HZ;
+ if (cfg.preferred_lft <= regen_advance + age) {
+ cfg.preferred_lft = regen_advance + age + 1;
+ if (cfg.preferred_lft > cfg.valid_lft ||
+ cfg.preferred_lft > if_public_preferred_lft) {
+ in6_ifa_put(ifp);
+ in6_dev_put(idev);
+ ret = -1;
+ goto out;
+ }
+ }
+
+ cfg.ifa_flags = IFA_F_TEMPORARY;
+ /* set in addrconf_prefix_rcv() */
+ if (ifp->flags & IFA_F_OPTIMISTIC)
+ cfg.ifa_flags |= IFA_F_OPTIMISTIC;
+
+ cfg.pfx = &addr;
+ cfg.scope = ipv6_addr_scope(cfg.pfx);
+
+ ift = ipv6_add_addr(idev, &cfg, block, NULL);
+ if (IS_ERR(ift)) {
in6_ifa_put(ifp);
in6_dev_put(idev);
- printk(KERN_INFO
- "ipv6_create_tempaddr(): retry temporary address regeneration.\n");
- tmpaddr = &addr;
- write_lock(&idev->lock);
+ pr_info("%s: retry temporary address regeneration\n", __func__);
+ write_lock_bh(&idev->lock);
goto retry;
}
spin_lock_bh(&ift->lock);
ift->ifpub = ifp;
- ift->valid_lft = tmp_valid_lft;
- ift->prefered_lft = tmp_prefered_lft;
- ift->cstamp = tmp_cstamp;
+ ift->cstamp = now;
ift->tstamp = tmp_tstamp;
spin_unlock_bh(&ift->lock);
- addrconf_dad_start(ift, 0);
+ addrconf_dad_start(ift);
in6_ifa_put(ift);
in6_dev_put(idev);
out:
return ret;
}
-#endif
/*
* Choose an appropriate source address (RFC3484)
*/
+enum {
+ IPV6_SADDR_RULE_INIT = 0,
+ IPV6_SADDR_RULE_LOCAL,
+ IPV6_SADDR_RULE_SCOPE,
+ IPV6_SADDR_RULE_PREFERRED,
+#ifdef CONFIG_IPV6_MIP6
+ IPV6_SADDR_RULE_HOA,
+#endif
+ IPV6_SADDR_RULE_OIF,
+ IPV6_SADDR_RULE_LABEL,
+ IPV6_SADDR_RULE_PRIVACY,
+ IPV6_SADDR_RULE_ORCHID,
+ IPV6_SADDR_RULE_PREFIX,
+#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
+ IPV6_SADDR_RULE_NOT_OPTIMISTIC,
+#endif
+ IPV6_SADDR_RULE_MAX
+};
+
struct ipv6_saddr_score {
- int addr_type;
- unsigned int attrs;
- int matchlen;
- int scope;
- unsigned int rule;
+ int rule;
+ int addr_type;
+ struct inet6_ifaddr *ifa;
+ DECLARE_BITMAP(scorebits, IPV6_SADDR_RULE_MAX);
+ int scopedist;
+ int matchlen;
};
-#define IPV6_SADDR_SCORE_LOCAL 0x0001
-#define IPV6_SADDR_SCORE_PREFERRED 0x0004
-#define IPV6_SADDR_SCORE_HOA 0x0008
-#define IPV6_SADDR_SCORE_OIF 0x0010
-#define IPV6_SADDR_SCORE_LABEL 0x0020
-#define IPV6_SADDR_SCORE_PRIVACY 0x0040
+struct ipv6_saddr_dst {
+ const struct in6_addr *addr;
+ int ifindex;
+ int scope;
+ int label;
+ unsigned int prefs;
+};
-static int inline ipv6_saddr_preferred(int type)
+static inline int ipv6_saddr_preferred(int type)
{
- if (type & (IPV6_ADDR_MAPPED|IPV6_ADDR_COMPATv4|
- IPV6_ADDR_LOOPBACK|IPV6_ADDR_RESERVED))
+ if (type & (IPV6_ADDR_MAPPED|IPV6_ADDR_COMPATv4|IPV6_ADDR_LOOPBACK))
return 1;
return 0;
}
-/* static matching label */
-static int inline ipv6_saddr_label(const struct in6_addr *addr, int type)
-{
- /*
- * prefix (longest match) label
- * -----------------------------
- * ::1/128 0
- * ::/0 1
- * 2002::/16 2
- * ::/96 3
- * ::ffff:0:0/96 4
- * fc00::/7 5
- * 2001::/32 6
- */
- if (type & IPV6_ADDR_LOOPBACK)
- return 0;
- else if (type & IPV6_ADDR_COMPATv4)
- return 3;
- else if (type & IPV6_ADDR_MAPPED)
- return 4;
- else if (addr->s6_addr32[0] == htonl(0x20010000))
- return 6;
- else if (addr->s6_addr16[0] == htons(0x2002))
- return 2;
- else if ((addr->s6_addr[0] & 0xfe) == 0xfc)
- return 5;
- return 1;
+static bool ipv6_use_optimistic_addr(const struct net *net,
+ const struct inet6_dev *idev)
+{
+#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
+ if (!idev)
+ return false;
+ if (!READ_ONCE(net->ipv6.devconf_all->optimistic_dad) &&
+ !READ_ONCE(idev->cnf.optimistic_dad))
+ return false;
+ if (!READ_ONCE(net->ipv6.devconf_all->use_optimistic) &&
+ !READ_ONCE(idev->cnf.use_optimistic))
+ return false;
+
+ return true;
+#else
+ return false;
+#endif
}
-int ipv6_dev_get_saddr(struct net_device *daddr_dev,
- struct in6_addr *daddr, struct in6_addr *saddr)
+static bool ipv6_allow_optimistic_dad(const struct net *net,
+ const struct inet6_dev *idev)
{
- struct ipv6_saddr_score hiscore;
- struct inet6_ifaddr *ifa_result = NULL;
- int daddr_type = __ipv6_addr_type(daddr);
- int daddr_scope = __ipv6_addr_src_scope(daddr_type);
- u32 daddr_label = ipv6_saddr_label(daddr, daddr_type);
- struct net_device *dev;
+#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
+ if (!idev)
+ return false;
+ if (!READ_ONCE(net->ipv6.devconf_all->optimistic_dad) &&
+ !READ_ONCE(idev->cnf.optimistic_dad))
+ return false;
+
+ return true;
+#else
+ return false;
+#endif
+}
- memset(&hiscore, 0, sizeof(hiscore));
+static int ipv6_get_saddr_eval(struct net *net,
+ struct ipv6_saddr_score *score,
+ struct ipv6_saddr_dst *dst,
+ int i)
+{
+ int ret;
- read_lock(&dev_base_lock);
- rcu_read_lock();
+ if (i <= score->rule) {
+ switch (i) {
+ case IPV6_SADDR_RULE_SCOPE:
+ ret = score->scopedist;
+ break;
+ case IPV6_SADDR_RULE_PREFIX:
+ ret = score->matchlen;
+ break;
+ default:
+ ret = !!test_bit(i, score->scorebits);
+ }
+ goto out;
+ }
+
+ switch (i) {
+ case IPV6_SADDR_RULE_INIT:
+ /* Rule 0: remember if hiscore is not ready yet */
+ ret = !!score->ifa;
+ break;
+ case IPV6_SADDR_RULE_LOCAL:
+ /* Rule 1: Prefer same address */
+ ret = ipv6_addr_equal(&score->ifa->addr, dst->addr);
+ break;
+ case IPV6_SADDR_RULE_SCOPE:
+ /* Rule 2: Prefer appropriate scope
+ *
+ * ret
+ * ^
+ * -1 | d 15
+ * ---+--+-+---> scope
+ * |
+ * | d is scope of the destination.
+ * B-d | \
+ * | \ <- smaller scope is better if
+ * B-15 | \ if scope is enough for destination.
+ * | ret = B - scope (-1 <= scope >= d <= 15).
+ * d-C-1 | /
+ * |/ <- greater is better
+ * -C / if scope is not enough for destination.
+ * /| ret = scope - C (-1 <= d < scope <= 15).
+ *
+ * d - C - 1 < B -15 (for all -1 <= d <= 15).
+ * C > d + 14 - B >= 15 + 14 - B = 29 - B.
+ * Assume B = 0 and we get C > 29.
+ */
+ ret = __ipv6_addr_src_scope(score->addr_type);
+ if (ret >= dst->scope)
+ ret = -ret;
+ else
+ ret -= 128; /* 30 is enough */
+ score->scopedist = ret;
+ break;
+ case IPV6_SADDR_RULE_PREFERRED:
+ {
+ /* Rule 3: Avoid deprecated and optimistic addresses */
+ u8 avoid = IFA_F_DEPRECATED;
+
+ if (!ipv6_use_optimistic_addr(net, score->ifa->idev))
+ avoid |= IFA_F_OPTIMISTIC;
+ ret = ipv6_saddr_preferred(score->addr_type) ||
+ !(score->ifa->flags & avoid);
+ break;
+ }
+#ifdef CONFIG_IPV6_MIP6
+ case IPV6_SADDR_RULE_HOA:
+ {
+ /* Rule 4: Prefer home address */
+ int prefhome = !(dst->prefs & IPV6_PREFER_SRC_COA);
+ ret = !(score->ifa->flags & IFA_F_HOMEADDRESS) ^ prefhome;
+ break;
+ }
+#endif
+ case IPV6_SADDR_RULE_OIF:
+ /* Rule 5: Prefer outgoing interface */
+ ret = (!dst->ifindex ||
+ dst->ifindex == score->ifa->idev->dev->ifindex);
+ break;
+ case IPV6_SADDR_RULE_LABEL:
+ /* Rule 6: Prefer matching label */
+ ret = ipv6_addr_label(net,
+ &score->ifa->addr, score->addr_type,
+ score->ifa->idev->dev->ifindex) == dst->label;
+ break;
+ case IPV6_SADDR_RULE_PRIVACY:
+ {
+ /* Rule 7: Prefer public address
+ * Note: prefer temporary address if use_tempaddr >= 2
+ */
+ int preftmp = dst->prefs & (IPV6_PREFER_SRC_PUBLIC|IPV6_PREFER_SRC_TMP) ?
+ !!(dst->prefs & IPV6_PREFER_SRC_TMP) :
+ READ_ONCE(score->ifa->idev->cnf.use_tempaddr) >= 2;
+ ret = (!(score->ifa->flags & IFA_F_TEMPORARY)) ^ preftmp;
+ break;
+ }
+ case IPV6_SADDR_RULE_ORCHID:
+ /* Rule 8-: Prefer ORCHID vs ORCHID or
+ * non-ORCHID vs non-ORCHID
+ */
+ ret = !(ipv6_addr_orchid(&score->ifa->addr) ^
+ ipv6_addr_orchid(dst->addr));
+ break;
+ case IPV6_SADDR_RULE_PREFIX:
+ /* Rule 8: Use longest matching prefix */
+ ret = ipv6_addr_diff(&score->ifa->addr, dst->addr);
+ if (ret > score->ifa->prefix_len)
+ ret = score->ifa->prefix_len;
+ score->matchlen = ret;
+ break;
+#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
+ case IPV6_SADDR_RULE_NOT_OPTIMISTIC:
+ /* Optimistic addresses still have lower precedence than other
+ * preferred addresses.
+ */
+ ret = !(score->ifa->flags & IFA_F_OPTIMISTIC);
+ break;
+#endif
+ default:
+ ret = 0;
+ }
+
+ if (ret)
+ __set_bit(i, score->scorebits);
+ score->rule = i;
+out:
+ return ret;
+}
+
+static int __ipv6_dev_get_saddr(struct net *net,
+ struct ipv6_saddr_dst *dst,
+ struct inet6_dev *idev,
+ struct ipv6_saddr_score *scores,
+ int hiscore_idx)
+{
+ struct ipv6_saddr_score *score = &scores[1 - hiscore_idx], *hiscore = &scores[hiscore_idx];
+
+ list_for_each_entry_rcu(score->ifa, &idev->addr_list, if_list) {
+ int i;
- for (dev = dev_base; dev; dev=dev->next) {
- struct inet6_dev *idev;
- struct inet6_ifaddr *ifa;
-
- /* Rule 0: Candidate Source Address (section 4)
- * - multicast and link-local destination address,
- * the set of candidate source address MUST only
- * include addresses assigned to interfaces
- * belonging to the same link as the outgoing
- * interface.
- * (- For site-local destination addresses, the
- * set of candidate source addresses MUST only
- * include addresses assigned to interfaces
- * belonging to the same site as the outgoing
- * interface.)
+ /*
+ * - Tentative Address (RFC2462 section 5.4)
+ * - A tentative address is not considered
+ * "assigned to an interface" in the traditional
+ * sense, unless it is also flagged as optimistic.
+ * - Candidate Source Address (section 4)
+ * - In any case, anycast addresses, multicast
+ * addresses, and the unspecified address MUST
+ * NOT be included in a candidate set.
*/
- if ((daddr_type & IPV6_ADDR_MULTICAST ||
- daddr_scope <= IPV6_ADDR_SCOPE_LINKLOCAL) &&
- daddr_dev && dev != daddr_dev)
+ if ((score->ifa->flags & IFA_F_TENTATIVE) &&
+ (!(score->ifa->flags & IFA_F_OPTIMISTIC)))
continue;
- idev = __in6_dev_get(dev);
- if (!idev)
+ score->addr_type = __ipv6_addr_type(&score->ifa->addr);
+
+ if (unlikely(score->addr_type == IPV6_ADDR_ANY ||
+ score->addr_type & IPV6_ADDR_MULTICAST)) {
+ net_dbg_ratelimited("ADDRCONF: unspecified / multicast address assigned as unicast address on %s",
+ idev->dev->name);
continue;
+ }
- read_lock_bh(&idev->lock);
- for (ifa = idev->addr_list; ifa; ifa = ifa->if_next) {
- struct ipv6_saddr_score score;
-
- score.addr_type = __ipv6_addr_type(&ifa->addr);
-
- /* Rule 0:
- * - Tentative Address (RFC2462 section 5.4)
- * - A tentative address is not considered
- * "assigned to an interface" in the traditional
- * sense.
- * - Candidate Source Address (section 4)
- * - In any case, anycast addresses, multicast
- * addresses, and the unspecified address MUST
- * NOT be included in a candidate set.
- */
- if (ifa->flags & IFA_F_TENTATIVE)
- continue;
- if (unlikely(score.addr_type == IPV6_ADDR_ANY ||
- score.addr_type & IPV6_ADDR_MULTICAST)) {
- LIMIT_NETDEBUG(KERN_DEBUG
- "ADDRCONF: unspecified / multicast address"
- "assigned as unicast address on %s",
- dev->name);
- continue;
- }
+ score->rule = -1;
+ bitmap_zero(score->scorebits, IPV6_SADDR_RULE_MAX);
+
+ for (i = 0; i < IPV6_SADDR_RULE_MAX; i++) {
+ int minihiscore, miniscore;
+
+ minihiscore = ipv6_get_saddr_eval(net, hiscore, dst, i);
+ miniscore = ipv6_get_saddr_eval(net, score, dst, i);
+
+ if (minihiscore > miniscore) {
+ if (i == IPV6_SADDR_RULE_SCOPE &&
+ score->scopedist > 0) {
+ /*
+ * special case:
+ * each remaining entry
+ * has too small (not enough)
+ * scope, because ifa entries
+ * are sorted by their scope
+ * values.
+ */
+ goto out;
+ }
+ break;
+ } else if (minihiscore < miniscore) {
+ swap(hiscore, score);
+ hiscore_idx = 1 - hiscore_idx;
- score.attrs = 0;
- score.matchlen = 0;
- score.scope = 0;
- score.rule = 0;
+ /* restore our iterator */
+ score->ifa = hiscore->ifa;
- if (ifa_result == NULL) {
- /* record it if the first available entry */
- goto record_it;
+ break;
}
+ }
+ }
+out:
+ return hiscore_idx;
+}
- /* Rule 1: Prefer same address */
- if (hiscore.rule < 1) {
- if (ipv6_addr_equal(&ifa_result->addr, daddr))
- hiscore.attrs |= IPV6_SADDR_SCORE_LOCAL;
- hiscore.rule++;
- }
- if (ipv6_addr_equal(&ifa->addr, daddr)) {
- score.attrs |= IPV6_SADDR_SCORE_LOCAL;
- if (!(hiscore.attrs & IPV6_SADDR_SCORE_LOCAL)) {
- score.rule = 1;
- goto record_it;
- }
- } else {
- if (hiscore.attrs & IPV6_SADDR_SCORE_LOCAL)
- continue;
- }
+static int ipv6_get_saddr_master(struct net *net,
+ const struct net_device *dst_dev,
+ const struct net_device *master,
+ struct ipv6_saddr_dst *dst,
+ struct ipv6_saddr_score *scores,
+ int hiscore_idx)
+{
+ struct inet6_dev *idev;
- /* Rule 2: Prefer appropriate scope */
- if (hiscore.rule < 2) {
- hiscore.scope = __ipv6_addr_src_scope(hiscore.addr_type);
- hiscore.rule++;
- }
- score.scope = __ipv6_addr_src_scope(score.addr_type);
- if (hiscore.scope < score.scope) {
- if (hiscore.scope < daddr_scope) {
- score.rule = 2;
- goto record_it;
- } else
- continue;
- } else if (score.scope < hiscore.scope) {
- if (score.scope < daddr_scope)
- break; /* addresses sorted by scope */
- else {
- score.rule = 2;
- goto record_it;
- }
- }
+ idev = __in6_dev_get(dst_dev);
+ if (idev)
+ hiscore_idx = __ipv6_dev_get_saddr(net, dst, idev,
+ scores, hiscore_idx);
- /* Rule 3: Avoid deprecated address */
- if (hiscore.rule < 3) {
- if (ipv6_saddr_preferred(hiscore.addr_type) ||
- !(ifa_result->flags & IFA_F_DEPRECATED))
- hiscore.attrs |= IPV6_SADDR_SCORE_PREFERRED;
- hiscore.rule++;
- }
- if (ipv6_saddr_preferred(score.addr_type) ||
- !(ifa->flags & IFA_F_DEPRECATED)) {
- score.attrs |= IPV6_SADDR_SCORE_PREFERRED;
- if (!(hiscore.attrs & IPV6_SADDR_SCORE_PREFERRED)) {
- score.rule = 3;
- goto record_it;
- }
- } else {
- if (hiscore.attrs & IPV6_SADDR_SCORE_PREFERRED)
- continue;
- }
+ idev = __in6_dev_get(master);
+ if (idev)
+ hiscore_idx = __ipv6_dev_get_saddr(net, dst, idev,
+ scores, hiscore_idx);
- /* Rule 4: Prefer home address */
-#ifdef CONFIG_IPV6_MIP6
- if (hiscore.rule < 4) {
- if (ifa_result->flags & IFA_F_HOMEADDRESS)
- hiscore.attrs |= IPV6_SADDR_SCORE_HOA;
- hiscore.rule++;
- }
- if (ifa->flags & IFA_F_HOMEADDRESS) {
- score.attrs |= IPV6_SADDR_SCORE_HOA;
- if (!(ifa_result->flags & IFA_F_HOMEADDRESS)) {
- score.rule = 4;
- goto record_it;
- }
- } else {
- if (hiscore.attrs & IPV6_SADDR_SCORE_HOA)
- continue;
- }
-#else
- if (hiscore.rule < 4)
- hiscore.rule++;
-#endif
+ return hiscore_idx;
+}
- /* Rule 5: Prefer outgoing interface */
- if (hiscore.rule < 5) {
- if (daddr_dev == NULL ||
- daddr_dev == ifa_result->idev->dev)
- hiscore.attrs |= IPV6_SADDR_SCORE_OIF;
- hiscore.rule++;
- }
- if (daddr_dev == NULL ||
- daddr_dev == ifa->idev->dev) {
- score.attrs |= IPV6_SADDR_SCORE_OIF;
- if (!(hiscore.attrs & IPV6_SADDR_SCORE_OIF)) {
- score.rule = 5;
- goto record_it;
- }
- } else {
- if (hiscore.attrs & IPV6_SADDR_SCORE_OIF)
- continue;
- }
+int ipv6_dev_get_saddr(struct net *net, const struct net_device *dst_dev,
+ const struct in6_addr *daddr, unsigned int prefs,
+ struct in6_addr *saddr)
+{
+ struct ipv6_saddr_score scores[2], *hiscore;
+ struct ipv6_saddr_dst dst;
+ struct inet6_dev *idev;
+ struct net_device *dev;
+ int dst_type;
+ bool use_oif_addr = false;
+ int hiscore_idx = 0;
+ int ret = 0;
- /* Rule 6: Prefer matching label */
- if (hiscore.rule < 6) {
- if (ipv6_saddr_label(&ifa_result->addr, hiscore.addr_type) == daddr_label)
- hiscore.attrs |= IPV6_SADDR_SCORE_LABEL;
- hiscore.rule++;
- }
- if (ipv6_saddr_label(&ifa->addr, score.addr_type) == daddr_label) {
- score.attrs |= IPV6_SADDR_SCORE_LABEL;
- if (!(hiscore.attrs & IPV6_SADDR_SCORE_LABEL)) {
- score.rule = 6;
- goto record_it;
- }
- } else {
- if (hiscore.attrs & IPV6_SADDR_SCORE_LABEL)
- continue;
- }
+ dst_type = __ipv6_addr_type(daddr);
+ dst.addr = daddr;
+ dst.ifindex = dst_dev ? dst_dev->ifindex : 0;
+ dst.scope = __ipv6_addr_src_scope(dst_type);
+ dst.label = ipv6_addr_label(net, daddr, dst_type, dst.ifindex);
+ dst.prefs = prefs;
+
+ scores[hiscore_idx].rule = -1;
+ scores[hiscore_idx].ifa = NULL;
+
+ rcu_read_lock();
+
+ /* Candidate Source Address (section 4)
+ * - multicast and link-local destination address,
+ * the set of candidate source address MUST only
+ * include addresses assigned to interfaces
+ * belonging to the same link as the outgoing
+ * interface.
+ * (- For site-local destination addresses, the
+ * set of candidate source addresses MUST only
+ * include addresses assigned to interfaces
+ * belonging to the same site as the outgoing
+ * interface.)
+ * - "It is RECOMMENDED that the candidate source addresses
+ * be the set of unicast addresses assigned to the
+ * interface that will be used to send to the destination
+ * (the 'outgoing' interface)." (RFC 6724)
+ */
+ if (dst_dev) {
+ idev = __in6_dev_get(dst_dev);
+ if ((dst_type & IPV6_ADDR_MULTICAST) ||
+ dst.scope <= IPV6_ADDR_SCOPE_LINKLOCAL ||
+ (idev && READ_ONCE(idev->cnf.use_oif_addrs_only))) {
+ use_oif_addr = true;
+ }
+ }
+
+ if (use_oif_addr) {
+ if (idev)
+ hiscore_idx = __ipv6_dev_get_saddr(net, &dst, idev, scores, hiscore_idx);
+ } else {
+ const struct net_device *master;
+ int master_idx = 0;
+
+ /* if dst_dev exists and is enslaved to an L3 device, then
+ * prefer addresses from dst_dev and then the master over
+ * any other enslaved devices in the L3 domain.
+ */
+ master = l3mdev_master_dev_rcu(dst_dev);
+ if (master) {
+ master_idx = master->ifindex;
+
+ hiscore_idx = ipv6_get_saddr_master(net, dst_dev,
+ master, &dst,
+ scores, hiscore_idx);
+
+ if (scores[hiscore_idx].ifa &&
+ scores[hiscore_idx].scopedist >= 0)
+ goto out;
+ }
-#ifdef CONFIG_IPV6_PRIVACY
- /* Rule 7: Prefer public address
- * Note: prefer temprary address if use_tempaddr >= 2
+ for_each_netdev_rcu(net, dev) {
+ /* only consider addresses on devices in the
+ * same L3 domain
*/
- if (hiscore.rule < 7) {
- if ((!(ifa_result->flags & IFA_F_TEMPORARY)) ^
- (ifa_result->idev->cnf.use_tempaddr >= 2))
- hiscore.attrs |= IPV6_SADDR_SCORE_PRIVACY;
- hiscore.rule++;
- }
- if ((!(ifa->flags & IFA_F_TEMPORARY)) ^
- (ifa->idev->cnf.use_tempaddr >= 2)) {
- score.attrs |= IPV6_SADDR_SCORE_PRIVACY;
- if (!(hiscore.attrs & IPV6_SADDR_SCORE_PRIVACY)) {
- score.rule = 7;
- goto record_it;
- }
- } else {
- if (hiscore.attrs & IPV6_SADDR_SCORE_PRIVACY)
- continue;
- }
-#else
- if (hiscore.rule < 7)
- hiscore.rule++;
-#endif
- /* Rule 8: Use longest matching prefix */
- if (hiscore.rule < 8) {
- hiscore.matchlen = ipv6_addr_diff(&ifa_result->addr, daddr);
- hiscore.rule++;
- }
- score.matchlen = ipv6_addr_diff(&ifa->addr, daddr);
- if (score.matchlen > hiscore.matchlen) {
- score.rule = 8;
- goto record_it;
- }
-#if 0
- else if (score.matchlen < hiscore.matchlen)
+ if (l3mdev_master_ifindex_rcu(dev) != master_idx)
continue;
-#endif
-
- /* Final Rule: choose first available one */
- continue;
-record_it:
- if (ifa_result)
- in6_ifa_put(ifa_result);
- in6_ifa_hold(ifa);
- ifa_result = ifa;
- hiscore = score;
+ idev = __in6_dev_get(dev);
+ if (!idev)
+ continue;
+ hiscore_idx = __ipv6_dev_get_saddr(net, &dst, idev, scores, hiscore_idx);
}
- read_unlock_bh(&idev->lock);
}
- rcu_read_unlock();
- read_unlock(&dev_base_lock);
- if (!ifa_result)
- return -EADDRNOTAVAIL;
-
- ipv6_addr_copy(saddr, &ifa_result->addr);
- in6_ifa_put(ifa_result);
- return 0;
-}
+out:
+ hiscore = &scores[hiscore_idx];
+ if (!hiscore->ifa)
+ ret = -EADDRNOTAVAIL;
+ else
+ *saddr = hiscore->ifa->addr;
+ rcu_read_unlock();
+ return ret;
+}
+EXPORT_SYMBOL(ipv6_dev_get_saddr);
-int ipv6_get_saddr(struct dst_entry *dst,
- struct in6_addr *daddr, struct in6_addr *saddr)
+static int __ipv6_get_lladdr(struct inet6_dev *idev, struct in6_addr *addr,
+ u32 banned_flags)
{
- return ipv6_dev_get_saddr(dst ? ip6_dst_idev(dst)->dev : NULL, daddr, saddr);
-}
+ struct inet6_ifaddr *ifp;
+ int err = -EADDRNOTAVAIL;
+ list_for_each_entry_reverse(ifp, &idev->addr_list, if_list) {
+ if (ifp->scope > IFA_LINK)
+ break;
+ if (ifp->scope == IFA_LINK &&
+ !(ifp->flags & banned_flags)) {
+ *addr = ifp->addr;
+ err = 0;
+ break;
+ }
+ }
+ return err;
+}
-int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr)
+int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr,
+ u32 banned_flags)
{
struct inet6_dev *idev;
int err = -EADDRNOTAVAIL;
rcu_read_lock();
- if ((idev = __in6_dev_get(dev)) != NULL) {
- struct inet6_ifaddr *ifp;
-
+ idev = __in6_dev_get(dev);
+ if (idev) {
read_lock_bh(&idev->lock);
- for (ifp=idev->addr_list; ifp; ifp=ifp->if_next) {
- if (ifp->scope == IFA_LINK && !(ifp->flags&IFA_F_TENTATIVE)) {
- ipv6_addr_copy(addr, &ifp->addr);
- err = 0;
- break;
- }
- }
+ err = __ipv6_get_lladdr(idev, addr, banned_flags);
read_unlock_bh(&idev->lock);
}
rcu_read_unlock();
return err;
}
-static int ipv6_count_addresses(struct inet6_dev *idev)
+static int ipv6_count_addresses(const struct inet6_dev *idev)
{
+ const struct inet6_ifaddr *ifp;
int cnt = 0;
- struct inet6_ifaddr *ifp;
- read_lock_bh(&idev->lock);
- for (ifp=idev->addr_list; ifp; ifp=ifp->if_next)
+ rcu_read_lock();
+ list_for_each_entry_rcu(ifp, &idev->addr_list, if_list)
cnt++;
- read_unlock_bh(&idev->lock);
+ rcu_read_unlock();
return cnt;
}
-int ipv6_chk_addr(struct in6_addr *addr, struct net_device *dev, int strict)
+int ipv6_chk_addr(struct net *net, const struct in6_addr *addr,
+ const struct net_device *dev, int strict)
+{
+ return ipv6_chk_addr_and_flags(net, addr, dev, !dev,
+ strict, IFA_F_TENTATIVE);
+}
+EXPORT_SYMBOL(ipv6_chk_addr);
+
+/* device argument is used to find the L3 domain of interest. If
+ * skip_dev_check is set, then the ifp device is not checked against
+ * the passed in dev argument. So the 2 cases for addresses checks are:
+ * 1. does the address exist in the L3 domain that dev is part of
+ * (skip_dev_check = true), or
+ *
+ * 2. does the address exist on the specific device
+ * (skip_dev_check = false)
+ */
+static struct net_device *
+__ipv6_chk_addr_and_flags(struct net *net, const struct in6_addr *addr,
+ const struct net_device *dev, bool skip_dev_check,
+ int strict, u32 banned_flags)
{
- struct inet6_ifaddr * ifp;
- u8 hash = ipv6_addr_hash(addr);
+ unsigned int hash = inet6_addr_hash(net, addr);
+ struct net_device *l3mdev, *ndev;
+ struct inet6_ifaddr *ifp;
+ u32 ifp_flags;
+
+ rcu_read_lock();
+
+ l3mdev = l3mdev_master_dev_rcu(dev);
+ if (skip_dev_check)
+ dev = NULL;
+
+ hlist_for_each_entry_rcu(ifp, &net->ipv6.inet6_addr_lst[hash], addr_lst) {
+ ndev = ifp->idev->dev;
- read_lock_bh(&addrconf_hash_lock);
- for(ifp = inet6_addr_lst[hash]; ifp; ifp=ifp->lst_next) {
+ if (l3mdev_master_dev_rcu(ndev) != l3mdev)
+ continue;
+
+ /* Decouple optimistic from tentative for evaluation here.
+ * Ban optimistic addresses explicitly, when required.
+ */
+ ifp_flags = (ifp->flags&IFA_F_OPTIMISTIC)
+ ? (ifp->flags&~IFA_F_TENTATIVE)
+ : ifp->flags;
if (ipv6_addr_equal(&ifp->addr, addr) &&
- !(ifp->flags&IFA_F_TENTATIVE)) {
- if (dev == NULL || ifp->idev->dev == dev ||
- !(ifp->scope&(IFA_LINK|IFA_HOST) || strict))
- break;
+ !(ifp_flags&banned_flags) &&
+ (!dev || ndev == dev ||
+ !(ifp->scope&(IFA_LINK|IFA_HOST) || strict))) {
+ rcu_read_unlock();
+ return ndev;
}
}
- read_unlock_bh(&addrconf_hash_lock);
- return ifp != NULL;
+
+ rcu_read_unlock();
+ return NULL;
}
-static
-int ipv6_chk_same_addr(const struct in6_addr *addr, struct net_device *dev)
+int ipv6_chk_addr_and_flags(struct net *net, const struct in6_addr *addr,
+ const struct net_device *dev, bool skip_dev_check,
+ int strict, u32 banned_flags)
{
- struct inet6_ifaddr * ifp;
- u8 hash = ipv6_addr_hash(addr);
+ return __ipv6_chk_addr_and_flags(net, addr, dev, skip_dev_check,
+ strict, banned_flags) ? 1 : 0;
+}
+EXPORT_SYMBOL(ipv6_chk_addr_and_flags);
- for(ifp = inet6_addr_lst[hash]; ifp; ifp=ifp->lst_next) {
- if (ipv6_addr_equal(&ifp->addr, addr)) {
- if (dev == NULL || ifp->idev->dev == dev)
+
+/* Compares an address/prefix_len with addresses on device @dev.
+ * If one is found it returns true.
+ */
+bool ipv6_chk_custom_prefix(const struct in6_addr *addr,
+ const unsigned int prefix_len, struct net_device *dev)
+{
+ const struct inet6_ifaddr *ifa;
+ const struct inet6_dev *idev;
+ bool ret = false;
+
+ rcu_read_lock();
+ idev = __in6_dev_get(dev);
+ if (idev) {
+ list_for_each_entry_rcu(ifa, &idev->addr_list, if_list) {
+ ret = ipv6_prefix_equal(addr, &ifa->addr, prefix_len);
+ if (ret)
break;
}
}
- return ifp != NULL;
+ rcu_read_unlock();
+
+ return ret;
}
+EXPORT_SYMBOL(ipv6_chk_custom_prefix);
-struct inet6_ifaddr * ipv6_get_ifaddr(struct in6_addr *addr, struct net_device *dev, int strict)
+int ipv6_chk_prefix(const struct in6_addr *addr, struct net_device *dev)
{
- struct inet6_ifaddr * ifp;
- u8 hash = ipv6_addr_hash(addr);
+ const struct inet6_ifaddr *ifa;
+ const struct inet6_dev *idev;
+ int onlink;
- read_lock_bh(&addrconf_hash_lock);
- for(ifp = inet6_addr_lst[hash]; ifp; ifp=ifp->lst_next) {
- if (ipv6_addr_equal(&ifp->addr, addr)) {
- if (dev == NULL || ifp->idev->dev == dev ||
- !(ifp->scope&(IFA_LINK|IFA_HOST) || strict)) {
- in6_ifa_hold(ifp);
+ onlink = 0;
+ rcu_read_lock();
+ idev = __in6_dev_get(dev);
+ if (idev) {
+ list_for_each_entry_rcu(ifa, &idev->addr_list, if_list) {
+ onlink = ipv6_prefix_equal(addr, &ifa->addr,
+ ifa->prefix_len);
+ if (onlink)
break;
- }
}
}
- read_unlock_bh(&addrconf_hash_lock);
-
- return ifp;
+ rcu_read_unlock();
+ return onlink;
}
+EXPORT_SYMBOL(ipv6_chk_prefix);
-int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2)
+/**
+ * ipv6_dev_find - find the first device with a given source address.
+ * @net: the net namespace
+ * @addr: the source address
+ * @dev: used to find the L3 domain of interest
+ *
+ * The caller should be protected by RCU, or RTNL.
+ */
+struct net_device *ipv6_dev_find(struct net *net, const struct in6_addr *addr,
+ struct net_device *dev)
{
- const struct in6_addr *sk_rcv_saddr6 = &inet6_sk(sk)->rcv_saddr;
- const struct in6_addr *sk2_rcv_saddr6 = inet6_rcv_saddr(sk2);
- __be32 sk_rcv_saddr = inet_sk(sk)->rcv_saddr;
- __be32 sk2_rcv_saddr = inet_rcv_saddr(sk2);
- int sk_ipv6only = ipv6_only_sock(sk);
- int sk2_ipv6only = inet_v6_ipv6only(sk2);
- int addr_type = ipv6_addr_type(sk_rcv_saddr6);
- int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED;
-
- if (!sk2_rcv_saddr && !sk_ipv6only)
- return 1;
-
- if (addr_type2 == IPV6_ADDR_ANY &&
- !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED))
- return 1;
-
- if (addr_type == IPV6_ADDR_ANY &&
- !(sk_ipv6only && addr_type2 == IPV6_ADDR_MAPPED))
- return 1;
+ return __ipv6_chk_addr_and_flags(net, addr, dev, !dev, 1,
+ IFA_F_TENTATIVE);
+}
+EXPORT_SYMBOL(ipv6_dev_find);
- if (sk2_rcv_saddr6 &&
- ipv6_addr_equal(sk_rcv_saddr6, sk2_rcv_saddr6))
- return 1;
+struct inet6_ifaddr *ipv6_get_ifaddr(struct net *net, const struct in6_addr *addr,
+ struct net_device *dev, int strict)
+{
+ unsigned int hash = inet6_addr_hash(net, addr);
+ struct inet6_ifaddr *ifp, *result = NULL;
- if (addr_type == IPV6_ADDR_MAPPED &&
- !sk2_ipv6only &&
- (!sk2_rcv_saddr || !sk_rcv_saddr || sk_rcv_saddr == sk2_rcv_saddr))
- return 1;
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(ifp, &net->ipv6.inet6_addr_lst[hash], addr_lst) {
+ if (ipv6_addr_equal(&ifp->addr, addr)) {
+ if (!dev || ifp->idev->dev == dev ||
+ !(ifp->scope&(IFA_LINK|IFA_HOST) || strict)) {
+ if (in6_ifa_hold_safe(ifp)) {
+ result = ifp;
+ break;
+ }
+ }
+ }
+ }
+ rcu_read_unlock();
- return 0;
+ return result;
}
/* Gets referenced address, destroys ifaddr */
-static void addrconf_dad_stop(struct inet6_ifaddr *ifp)
+static void addrconf_dad_stop(struct inet6_ifaddr *ifp, int dad_failed)
{
- if (ifp->flags&IFA_F_PERMANENT) {
- spin_lock_bh(&ifp->lock);
- addrconf_del_timer(ifp);
- ifp->flags |= IFA_F_TENTATIVE;
- spin_unlock_bh(&ifp->lock);
- in6_ifa_put(ifp);
-#ifdef CONFIG_IPV6_PRIVACY
- } else if (ifp->flags&IFA_F_TEMPORARY) {
+ if (dad_failed)
+ ifp->flags |= IFA_F_DADFAILED;
+
+ if (ifp->flags&IFA_F_TEMPORARY) {
struct inet6_ifaddr *ifpub;
spin_lock_bh(&ifp->lock);
ifpub = ifp->ifpub;
if (ifpub) {
in6_ifa_hold(ifpub);
spin_unlock_bh(&ifp->lock);
- ipv6_create_tempaddr(ifpub, ifp);
+ ipv6_create_tempaddr(ifpub, true);
in6_ifa_put(ifpub);
} else {
spin_unlock_bh(&ifp->lock);
}
ipv6_del_addr(ifp);
-#endif
- } else
+ } else if (ifp->flags&IFA_F_PERMANENT || !dad_failed) {
+ spin_lock_bh(&ifp->lock);
+ addrconf_del_dad_work(ifp);
+ ifp->flags |= IFA_F_TENTATIVE;
+ if (dad_failed)
+ ifp->flags &= ~IFA_F_OPTIMISTIC;
+ spin_unlock_bh(&ifp->lock);
+ if (dad_failed)
+ ipv6_ifa_notify(0, ifp);
+ in6_ifa_put(ifp);
+ } else {
ipv6_del_addr(ifp);
+ }
}
-void addrconf_dad_failure(struct inet6_ifaddr *ifp)
+static int addrconf_dad_end(struct inet6_ifaddr *ifp)
{
- if (net_ratelimit())
- printk(KERN_INFO "%s: duplicate address detected!\n", ifp->idev->dev->name);
- addrconf_dad_stop(ifp);
+ int err = -ENOENT;
+
+ spin_lock_bh(&ifp->lock);
+ if (ifp->state == INET6_IFADDR_STATE_DAD) {
+ ifp->state = INET6_IFADDR_STATE_POSTDAD;
+ err = 0;
+ }
+ spin_unlock_bh(&ifp->lock);
+
+ return err;
}
-/* Join to solicited addr multicast group. */
+void addrconf_dad_failure(struct sk_buff *skb, struct inet6_ifaddr *ifp)
+{
+ struct inet6_dev *idev = ifp->idev;
+ struct net *net = dev_net(idev->dev);
+ int max_addresses;
+
+ if (addrconf_dad_end(ifp)) {
+ in6_ifa_put(ifp);
+ return;
+ }
+
+ net_info_ratelimited("%s: IPv6 duplicate address %pI6c used by %pM detected!\n",
+ ifp->idev->dev->name, &ifp->addr, eth_hdr(skb)->h_source);
+
+ spin_lock_bh(&ifp->lock);
+
+ if (ifp->flags & IFA_F_STABLE_PRIVACY) {
+ struct in6_addr new_addr;
+ struct inet6_ifaddr *ifp2;
+ int retries = ifp->stable_privacy_retry + 1;
+ struct ifa6_config cfg = {
+ .pfx = &new_addr,
+ .plen = ifp->prefix_len,
+ .ifa_flags = ifp->flags,
+ .valid_lft = ifp->valid_lft,
+ .preferred_lft = ifp->prefered_lft,
+ .scope = ifp->scope,
+ };
+
+ if (retries > net->ipv6.sysctl.idgen_retries) {
+ net_info_ratelimited("%s: privacy stable address generation failed because of DAD conflicts!\n",
+ ifp->idev->dev->name);
+ goto errdad;
+ }
+
+ new_addr = ifp->addr;
+ if (ipv6_generate_stable_address(&new_addr, retries,
+ idev))
+ goto errdad;
+
+ spin_unlock_bh(&ifp->lock);
+
+ max_addresses = READ_ONCE(idev->cnf.max_addresses);
+ if (max_addresses &&
+ ipv6_count_addresses(idev) >= max_addresses)
+ goto lock_errdad;
+
+ net_info_ratelimited("%s: generating new stable privacy address because of DAD conflict\n",
+ ifp->idev->dev->name);
+
+ ifp2 = ipv6_add_addr(idev, &cfg, false, NULL);
+ if (IS_ERR(ifp2))
+ goto lock_errdad;
+
+ spin_lock_bh(&ifp2->lock);
+ ifp2->stable_privacy_retry = retries;
+ ifp2->state = INET6_IFADDR_STATE_PREDAD;
+ spin_unlock_bh(&ifp2->lock);
+
+ addrconf_mod_dad_work(ifp2, net->ipv6.sysctl.idgen_delay);
+ in6_ifa_put(ifp2);
+lock_errdad:
+ spin_lock_bh(&ifp->lock);
+ }
-void addrconf_join_solict(struct net_device *dev, struct in6_addr *addr)
+errdad:
+ /* transition from _POSTDAD to _ERRDAD */
+ ifp->state = INET6_IFADDR_STATE_ERRDAD;
+ spin_unlock_bh(&ifp->lock);
+
+ addrconf_mod_dad_work(ifp, 0);
+ in6_ifa_put(ifp);
+}
+
+/* Join to solicited addr multicast group. */
+void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr)
{
struct in6_addr maddr;
- if (dev->flags&(IFF_LOOPBACK|IFF_NOARP))
+ if (READ_ONCE(dev->flags) & (IFF_LOOPBACK | IFF_NOARP))
return;
addrconf_addr_solict_mult(addr, &maddr);
ipv6_dev_mc_inc(dev, &maddr);
}
-void addrconf_leave_solict(struct inet6_dev *idev, struct in6_addr *addr)
+void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr)
{
struct in6_addr maddr;
- if (idev->dev->flags&(IFF_LOOPBACK|IFF_NOARP))
+ if (READ_ONCE(idev->dev->flags) & (IFF_LOOPBACK | IFF_NOARP))
return;
addrconf_addr_solict_mult(addr, &maddr);
@@ -1353,49 +2260,58 @@ void addrconf_leave_solict(struct inet6_dev *idev, struct in6_addr *addr)
static void addrconf_join_anycast(struct inet6_ifaddr *ifp)
{
struct in6_addr addr;
+
+ if (ifp->prefix_len >= 127) /* RFC 6164 */
+ return;
ipv6_addr_prefix(&addr, &ifp->addr, ifp->prefix_len);
if (ipv6_addr_any(&addr))
return;
- ipv6_dev_ac_inc(ifp->idev->dev, &addr);
+ __ipv6_dev_ac_inc(ifp->idev, &addr);
}
static void addrconf_leave_anycast(struct inet6_ifaddr *ifp)
{
struct in6_addr addr;
+
+ if (ifp->prefix_len >= 127) /* RFC 6164 */
+ return;
ipv6_addr_prefix(&addr, &ifp->addr, ifp->prefix_len);
if (ipv6_addr_any(&addr))
return;
__ipv6_dev_ac_dec(ifp->idev, &addr);
}
-static int addrconf_ifid_eui48(u8 *eui, struct net_device *dev)
+static int addrconf_ifid_6lowpan(u8 *eui, struct net_device *dev)
{
- if (dev->addr_len != ETH_ALEN)
- return -1;
- memcpy(eui, dev->dev_addr, 3);
- memcpy(eui + 5, dev->dev_addr + 3, 3);
-
- /*
- * The zSeries OSA network cards can be shared among various
- * OS instances, but the OSA cards have only one MAC address.
- * This leads to duplicate address conflicts in conjunction
- * with IPv6 if more than one instance uses the same card.
- *
- * The driver for these cards can deliver a unique 16-bit
- * identifier for each instance sharing the same card. It is
- * placed instead of 0xFFFE in the interface identifier. The
- * "u" bit of the interface identifier is not inverted in this
- * case. Hence the resulting interface identifier has local
- * scope according to RFC2373.
- */
- if (dev->dev_id) {
- eui[3] = (dev->dev_id >> 8) & 0xFF;
- eui[4] = dev->dev_id & 0xFF;
- } else {
+ switch (dev->addr_len) {
+ case ETH_ALEN:
+ memcpy(eui, dev->dev_addr, 3);
eui[3] = 0xFF;
eui[4] = 0xFE;
+ memcpy(eui + 5, dev->dev_addr + 3, 3);
+ break;
+ case EUI64_ADDR_LEN:
+ memcpy(eui, dev->dev_addr, EUI64_ADDR_LEN);
eui[0] ^= 2;
+ break;
+ default:
+ return -1;
}
+
+ return 0;
+}
+
+static int addrconf_ifid_ieee1394(u8 *eui, struct net_device *dev)
+{
+ const union fwnet_hwaddr *ha;
+
+ if (dev->addr_len != FWNET_ALEN)
+ return -1;
+
+ ha = (const union fwnet_hwaddr *)dev->dev_addr;
+
+ memcpy(eui, &ha->uc.uniq_id, sizeof(ha->uc.uniq_id));
+ eui[0] ^= 2;
return 0;
}
@@ -1405,7 +2321,7 @@ static int addrconf_ifid_arcnet(u8 *eui, struct net_device *dev)
if (dev->addr_len != ARCNET_ALEN)
return -1;
memset(eui, 0, 7);
- eui[7] = *(u8*)dev->dev_addr;
+ eui[7] = *(u8 *)dev->dev_addr;
return 0;
}
@@ -1418,17 +2334,68 @@ static int addrconf_ifid_infiniband(u8 *eui, struct net_device *dev)
return 0;
}
+static int __ipv6_isatap_ifid(u8 *eui, __be32 addr)
+{
+ if (addr == 0)
+ return -1;
+ eui[0] = (ipv4_is_zeronet(addr) || ipv4_is_private_10(addr) ||
+ ipv4_is_loopback(addr) || ipv4_is_linklocal_169(addr) ||
+ ipv4_is_private_172(addr) || ipv4_is_test_192(addr) ||
+ ipv4_is_anycast_6to4(addr) || ipv4_is_private_192(addr) ||
+ ipv4_is_test_198(addr) || ipv4_is_multicast(addr) ||
+ ipv4_is_lbcast(addr)) ? 0x00 : 0x02;
+ eui[1] = 0;
+ eui[2] = 0x5E;
+ eui[3] = 0xFE;
+ memcpy(eui + 4, &addr, 4);
+ return 0;
+}
+
+static int addrconf_ifid_sit(u8 *eui, struct net_device *dev)
+{
+ if (dev->priv_flags & IFF_ISATAP)
+ return __ipv6_isatap_ifid(eui, *(__be32 *)dev->dev_addr);
+ return -1;
+}
+
+static int addrconf_ifid_gre(u8 *eui, struct net_device *dev)
+{
+ return __ipv6_isatap_ifid(eui, *(__be32 *)dev->dev_addr);
+}
+
+static int addrconf_ifid_ip6tnl(u8 *eui, struct net_device *dev)
+{
+ memcpy(eui, dev->perm_addr, 3);
+ memcpy(eui + 5, dev->perm_addr + 3, 3);
+ eui[3] = 0xFF;
+ eui[4] = 0xFE;
+ eui[0] ^= 2;
+ return 0;
+}
+
static int ipv6_generate_eui64(u8 *eui, struct net_device *dev)
{
switch (dev->type) {
case ARPHRD_ETHER:
case ARPHRD_FDDI:
- case ARPHRD_IEEE802_TR:
return addrconf_ifid_eui48(eui, dev);
case ARPHRD_ARCNET:
return addrconf_ifid_arcnet(eui, dev);
case ARPHRD_INFINIBAND:
return addrconf_ifid_infiniband(eui, dev);
+ case ARPHRD_SIT:
+ return addrconf_ifid_sit(eui, dev);
+ case ARPHRD_IPGRE:
+ case ARPHRD_TUNNEL:
+ return addrconf_ifid_gre(eui, dev);
+ case ARPHRD_6LOWPAN:
+ return addrconf_ifid_6lowpan(eui, dev);
+ case ARPHRD_IEEE1394:
+ return addrconf_ifid_ieee1394(eui, dev);
+ case ARPHRD_TUNNEL6:
+ case ARPHRD_IP6GRE:
+ case ARPHRD_RAWIP:
+ return addrconf_ifid_ip6tnl(eui, dev);
}
return -1;
}
@@ -1439,7 +2406,9 @@ static int ipv6_inherit_eui64(u8 *eui, struct inet6_dev *idev)
struct inet6_ifaddr *ifp;
read_lock_bh(&idev->lock);
- for (ifp=idev->addr_list; ifp; ifp=ifp->if_next) {
+ list_for_each_entry_reverse(ifp, &idev->addr_list, if_list) {
+ if (ifp->scope > IFA_LINK)
+ break;
if (ifp->scope == IFA_LINK && !(ifp->flags&IFA_F_TENTATIVE)) {
memcpy(eui, ifp->addr.s6_addr+8, 8);
err = 0;
@@ -1450,186 +2419,378 @@ static int ipv6_inherit_eui64(u8 *eui, struct inet6_dev *idev)
return err;
}
-#ifdef CONFIG_IPV6_PRIVACY
-/* (re)generation of randomized interface identifier (RFC 3041 3.2, 3.5) */
-static int __ipv6_regen_rndid(struct inet6_dev *idev)
+/* Generation of a randomized Interface Identifier
+ * draft-ietf-6man-rfc4941bis, Section 3.3.1
+ */
+
+static void ipv6_gen_rnd_iid(struct in6_addr *addr)
{
regen:
- get_random_bytes(idev->rndid, sizeof(idev->rndid));
- idev->rndid[0] &= ~0x02;
+ get_random_bytes(&addr->s6_addr[8], 8);
- /*
- * <draft-ietf-ipngwg-temp-addresses-v2-00.txt>:
- * check if generated address is not inappropriate
+ /* <draft-ietf-6man-rfc4941bis-08.txt>, Section 3.3.1:
+ * check if generated address is not inappropriate:
*
- * - Reserved subnet anycast (RFC 2526)
- * 11111101 11....11 1xxxxxxx
- * - ISATAP (draft-ietf-ngtrans-isatap-13.txt) 5.1
- * 00-00-5E-FE-xx-xx-xx-xx
- * - value 0
- * - XXX: already assigned to an address on the device
+ * - Reserved IPv6 Interface Identifiers
+ * - XXX: already assigned to an address on the device
*/
- if (idev->rndid[0] == 0xfd &&
- (idev->rndid[1]&idev->rndid[2]&idev->rndid[3]&idev->rndid[4]&idev->rndid[5]&idev->rndid[6]) == 0xff &&
- (idev->rndid[7]&0x80))
- goto regen;
- if ((idev->rndid[0]|idev->rndid[1]) == 0) {
- if (idev->rndid[2] == 0x5e && idev->rndid[3] == 0xfe)
- goto regen;
- if ((idev->rndid[2]|idev->rndid[3]|idev->rndid[4]|idev->rndid[5]|idev->rndid[6]|idev->rndid[7]) == 0x00)
- goto regen;
- }
-
- return 0;
-}
-
-static void ipv6_regen_rndid(unsigned long data)
-{
- struct inet6_dev *idev = (struct inet6_dev *) data;
- unsigned long expires;
-
- rcu_read_lock_bh();
- write_lock_bh(&idev->lock);
-
- if (idev->dead)
- goto out;
- if (__ipv6_regen_rndid(idev) < 0)
- goto out;
-
- expires = jiffies +
- idev->cnf.temp_prefered_lft * HZ -
- idev->cnf.regen_max_retry * idev->cnf.dad_transmits * idev->nd_parms->retrans_time - desync_factor;
- if (time_before(expires, jiffies)) {
- printk(KERN_WARNING
- "ipv6_regen_rndid(): too short regeneration interval; timer disabled for %s.\n",
- idev->dev->name);
- goto out;
- }
-
- if (!mod_timer(&idev->regen_timer, expires))
- in6_dev_hold(idev);
-
-out:
- write_unlock_bh(&idev->lock);
- rcu_read_unlock_bh();
- in6_dev_put(idev);
-}
+ /* Subnet-router anycast: 0000:0000:0000:0000 */
+ if (!(addr->s6_addr32[2] | addr->s6_addr32[3]))
+ goto regen;
-static int __ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpaddr) {
- int ret = 0;
+ /* IANA Ethernet block: 0200:5EFF:FE00:0000-0200:5EFF:FE00:5212
+ * Proxy Mobile IPv6: 0200:5EFF:FE00:5213
+ * IANA Ethernet block: 0200:5EFF:FE00:5214-0200:5EFF:FEFF:FFFF
+ */
+ if (ntohl(addr->s6_addr32[2]) == 0x02005eff &&
+ (ntohl(addr->s6_addr32[3]) & 0Xff000000) == 0xfe000000)
+ goto regen;
- if (tmpaddr && memcmp(idev->rndid, &tmpaddr->s6_addr[8], 8) == 0)
- ret = __ipv6_regen_rndid(idev);
- return ret;
+ /* Reserved subnet anycast addresses */
+ if (ntohl(addr->s6_addr32[2]) == 0xfdffffff &&
+ ntohl(addr->s6_addr32[3]) >= 0Xffffff80)
+ goto regen;
}
-#endif
/*
* Add prefix route.
*/
static void
-addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev,
- unsigned long expires, u32 flags)
+addrconf_prefix_route(struct in6_addr *pfx, int plen, u32 metric,
+ struct net_device *dev, unsigned long expires,
+ u32 flags, gfp_t gfp_flags)
{
struct fib6_config cfg = {
- .fc_table = RT6_TABLE_PREFIX,
- .fc_metric = IP6_RT_PRIO_ADDRCONF,
+ .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_PREFIX,
+ .fc_metric = metric ? : IP6_RT_PRIO_ADDRCONF,
.fc_ifindex = dev->ifindex,
.fc_expires = expires,
.fc_dst_len = plen,
.fc_flags = RTF_UP | flags,
+ .fc_nlinfo.nl_net = dev_net(dev),
+ .fc_protocol = RTPROT_KERNEL,
+ .fc_type = RTN_UNICAST,
};
- ipv6_addr_copy(&cfg.fc_dst, pfx);
+ cfg.fc_dst = *pfx;
/* Prevent useless cloning on PtP SIT.
This thing is done here expecting that the whole
class of non-broadcast devices need not cloning.
*/
-#if defined(CONFIG_IPV6_SIT) || defined(CONFIG_IPV6_SIT_MODULE)
+#if IS_ENABLED(CONFIG_IPV6_SIT)
if (dev->type == ARPHRD_SIT && (dev->flags & IFF_POINTOPOINT))
cfg.fc_flags |= RTF_NONEXTHOP;
#endif
- ip6_route_add(&cfg);
+ ip6_route_add(&cfg, gfp_flags, NULL);
}
+
+static struct fib6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
+ int plen,
+ const struct net_device *dev,
+ u32 flags, u32 noflags,
+ bool no_gw)
+{
+ struct fib6_node *fn;
+ struct fib6_info *rt = NULL;
+ struct fib6_table *table;
+ u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_PREFIX;
+
+ table = fib6_get_table(dev_net(dev), tb_id);
+ if (!table)
+ return NULL;
+
+ rcu_read_lock();
+ fn = fib6_locate(&table->tb6_root, pfx, plen, NULL, 0, true);
+ if (!fn)
+ goto out;
+
+ for_each_fib6_node_rt_rcu(fn) {
+ /* prefix routes only use builtin fib6_nh */
+ if (rt->nh)
+ continue;
+
+ if (rt->fib6_nh->fib_nh_dev->ifindex != dev->ifindex)
+ continue;
+ if (no_gw && rt->fib6_nh->fib_nh_gw_family)
+ continue;
+ if ((rt->fib6_flags & flags) != flags)
+ continue;
+ if ((rt->fib6_flags & noflags) != 0)
+ continue;
+ if (!fib6_info_hold_safe(rt))
+ continue;
+ break;
+ }
+out:
+ rcu_read_unlock();
+ return rt;
+}
+
+
/* Create "default" multicast route to the interface */
static void addrconf_add_mroute(struct net_device *dev)
{
struct fib6_config cfg = {
- .fc_table = RT6_TABLE_LOCAL,
+ .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_LOCAL,
.fc_metric = IP6_RT_PRIO_ADDRCONF,
.fc_ifindex = dev->ifindex,
.fc_dst_len = 8,
.fc_flags = RTF_UP,
+ .fc_type = RTN_MULTICAST,
+ .fc_nlinfo.nl_net = dev_net(dev),
+ .fc_protocol = RTPROT_KERNEL,
};
ipv6_addr_set(&cfg.fc_dst, htonl(0xFF000000), 0, 0, 0);
- ip6_route_add(&cfg);
+ ip6_route_add(&cfg, GFP_KERNEL, NULL);
}
-#if defined(CONFIG_IPV6_SIT) || defined(CONFIG_IPV6_SIT_MODULE)
-static void sit_route_add(struct net_device *dev)
+static struct inet6_dev *addrconf_add_dev(struct net_device *dev)
{
- struct fib6_config cfg = {
- .fc_table = RT6_TABLE_MAIN,
- .fc_metric = IP6_RT_PRIO_ADDRCONF,
- .fc_ifindex = dev->ifindex,
- .fc_dst_len = 96,
- .fc_flags = RTF_UP | RTF_NONEXTHOP,
- };
+ struct inet6_dev *idev;
+
+ ASSERT_RTNL();
+
+ idev = ipv6_find_idev(dev);
+ if (IS_ERR(idev))
+ return idev;
+
+ if (idev->cnf.disable_ipv6)
+ return ERR_PTR(-EACCES);
+
+ /* Add default multicast route */
+ if (!(dev->flags & IFF_LOOPBACK) && !netif_is_l3_master(dev))
+ addrconf_add_mroute(dev);
- /* prefix length - 96 bits "::d.d.d.d" */
- ip6_route_add(&cfg);
+ return idev;
}
-#endif
-static void addrconf_add_lroute(struct net_device *dev)
+static void delete_tempaddrs(struct inet6_dev *idev,
+ struct inet6_ifaddr *ifp)
{
- struct in6_addr addr;
+ struct inet6_ifaddr *ift, *tmp;
- ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0);
- addrconf_prefix_route(&addr, 64, dev, 0, 0);
+ write_lock_bh(&idev->lock);
+ list_for_each_entry_safe(ift, tmp, &idev->tempaddr_list, tmp_list) {
+ if (ift->ifpub != ifp)
+ continue;
+
+ in6_ifa_hold(ift);
+ write_unlock_bh(&idev->lock);
+ ipv6_del_addr(ift);
+ write_lock_bh(&idev->lock);
+ }
+ write_unlock_bh(&idev->lock);
}
-static struct inet6_dev *addrconf_add_dev(struct net_device *dev)
+static void manage_tempaddrs(struct inet6_dev *idev,
+ struct inet6_ifaddr *ifp,
+ __u32 valid_lft, __u32 prefered_lft,
+ bool create, unsigned long now)
{
- struct inet6_dev *idev;
+ u32 flags;
+ struct inet6_ifaddr *ift;
- ASSERT_RTNL();
+ read_lock_bh(&idev->lock);
+ /* update all temporary addresses in the list */
+ list_for_each_entry(ift, &idev->tempaddr_list, tmp_list) {
+ int age, max_valid, max_prefered;
- if ((idev = ipv6_find_idev(dev)) == NULL)
- return NULL;
+ if (ifp != ift->ifpub)
+ continue;
- /* Add default multicast route */
- addrconf_add_mroute(dev);
+ /* RFC 4941 section 3.3:
+ * If a received option will extend the lifetime of a public
+ * address, the lifetimes of temporary addresses should
+ * be extended, subject to the overall constraint that no
+ * temporary addresses should ever remain "valid" or "preferred"
+ * for a time longer than (TEMP_VALID_LIFETIME) or
+ * (TEMP_PREFERRED_LIFETIME - DESYNC_FACTOR), respectively.
+ */
+ age = (now - ift->cstamp) / HZ;
+ max_valid = READ_ONCE(idev->cnf.temp_valid_lft) - age;
+ if (max_valid < 0)
+ max_valid = 0;
+
+ max_prefered = READ_ONCE(idev->cnf.temp_prefered_lft) -
+ idev->desync_factor - age;
+ if (max_prefered < 0)
+ max_prefered = 0;
+
+ if (valid_lft > max_valid)
+ valid_lft = max_valid;
+
+ if (prefered_lft > max_prefered)
+ prefered_lft = max_prefered;
+
+ spin_lock(&ift->lock);
+ flags = ift->flags;
+ ift->valid_lft = valid_lft;
+ ift->prefered_lft = prefered_lft;
+ ift->tstamp = now;
+ if (prefered_lft > 0)
+ ift->flags &= ~IFA_F_DEPRECATED;
+
+ spin_unlock(&ift->lock);
+ if (!(flags&IFA_F_TENTATIVE))
+ ipv6_ifa_notify(0, ift);
+ }
- /* Add link local route */
- addrconf_add_lroute(dev);
- return idev;
+ /* Also create a temporary address if it's enabled but no temporary
+ * address currently exists.
+ * However, we get called with valid_lft == 0, prefered_lft == 0, create == false
+ * as part of cleanup (ie. deleting the mngtmpaddr).
+ * We don't want that to result in creating a new temporary ip address.
+ */
+ if (list_empty(&idev->tempaddr_list) && (valid_lft || prefered_lft))
+ create = true;
+
+ if (create && READ_ONCE(idev->cnf.use_tempaddr) > 0) {
+ /* When a new public address is created as described
+ * in [ADDRCONF], also create a new temporary address.
+ */
+ read_unlock_bh(&idev->lock);
+ ipv6_create_tempaddr(ifp, false);
+ } else {
+ read_unlock_bh(&idev->lock);
+ }
}
-void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len)
+static bool is_addr_mode_generate_stable(struct inet6_dev *idev)
+{
+ return idev->cnf.addr_gen_mode == IN6_ADDR_GEN_MODE_STABLE_PRIVACY ||
+ idev->cnf.addr_gen_mode == IN6_ADDR_GEN_MODE_RANDOM;
+}
+
+int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev,
+ const struct prefix_info *pinfo,
+ struct inet6_dev *in6_dev,
+ const struct in6_addr *addr, int addr_type,
+ u32 addr_flags, bool sllao, bool tokenized,
+ __u32 valid_lft, u32 prefered_lft)
+{
+ struct inet6_ifaddr *ifp = ipv6_get_ifaddr(net, addr, dev, 1);
+ int create = 0, update_lft = 0;
+
+ if (!ifp && valid_lft) {
+ int max_addresses = READ_ONCE(in6_dev->cnf.max_addresses);
+ struct ifa6_config cfg = {
+ .pfx = addr,
+ .plen = pinfo->prefix_len,
+ .ifa_flags = addr_flags,
+ .valid_lft = valid_lft,
+ .preferred_lft = prefered_lft,
+ .scope = addr_type & IPV6_ADDR_SCOPE_MASK,
+ .ifa_proto = IFAPROT_KERNEL_RA
+ };
+
+#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
+ if ((READ_ONCE(net->ipv6.devconf_all->optimistic_dad) ||
+ READ_ONCE(in6_dev->cnf.optimistic_dad)) &&
+ !net->ipv6.devconf_all->forwarding && sllao)
+ cfg.ifa_flags |= IFA_F_OPTIMISTIC;
+#endif
+
+ /* Do not allow to create too much of autoconfigured
+ * addresses; this would be too easy way to crash kernel.
+ */
+ if (!max_addresses ||
+ ipv6_count_addresses(in6_dev) < max_addresses)
+ ifp = ipv6_add_addr(in6_dev, &cfg, false, NULL);
+
+ if (IS_ERR_OR_NULL(ifp))
+ return -1;
+
+ create = 1;
+ spin_lock_bh(&ifp->lock);
+ ifp->flags |= IFA_F_MANAGETEMPADDR;
+ ifp->cstamp = jiffies;
+ ifp->tokenized = tokenized;
+ spin_unlock_bh(&ifp->lock);
+ addrconf_dad_start(ifp);
+ }
+
+ if (ifp) {
+ u32 flags;
+ unsigned long now;
+ u32 stored_lft;
+
+ /* update lifetime (RFC2462 5.5.3 e) */
+ spin_lock_bh(&ifp->lock);
+ now = jiffies;
+ if (ifp->valid_lft > (now - ifp->tstamp) / HZ)
+ stored_lft = ifp->valid_lft - (now - ifp->tstamp) / HZ;
+ else
+ stored_lft = 0;
+
+ /* RFC4862 Section 5.5.3e:
+ * "Note that the preferred lifetime of the
+ * corresponding address is always reset to
+ * the Preferred Lifetime in the received
+ * Prefix Information option, regardless of
+ * whether the valid lifetime is also reset or
+ * ignored."
+ *
+ * So we should always update prefered_lft here.
+ */
+ update_lft = !create && stored_lft;
+
+ if (update_lft && !READ_ONCE(in6_dev->cnf.ra_honor_pio_life)) {
+ const u32 minimum_lft = min_t(u32,
+ stored_lft, MIN_VALID_LIFETIME);
+ valid_lft = max(valid_lft, minimum_lft);
+ }
+
+ if (update_lft) {
+ ifp->valid_lft = valid_lft;
+ ifp->prefered_lft = prefered_lft;
+ WRITE_ONCE(ifp->tstamp, now);
+ flags = ifp->flags;
+ ifp->flags &= ~IFA_F_DEPRECATED;
+ spin_unlock_bh(&ifp->lock);
+
+ if (!(flags&IFA_F_TENTATIVE))
+ ipv6_ifa_notify(0, ifp);
+ } else
+ spin_unlock_bh(&ifp->lock);
+
+ manage_tempaddrs(in6_dev, ifp, valid_lft, prefered_lft,
+ create, now);
+
+ in6_ifa_put(ifp);
+ addrconf_verify(net);
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(addrconf_prefix_rcv_add_addr);
+
+void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao)
{
struct prefix_info *pinfo;
+ struct fib6_table *table;
__u32 valid_lft;
__u32 prefered_lft;
- int addr_type;
- unsigned long rt_expires;
+ int addr_type, err;
+ u32 addr_flags = 0;
struct inet6_dev *in6_dev;
+ struct net *net = dev_net(dev);
+ bool ignore_autoconf = false;
pinfo = (struct prefix_info *) opt;
-
+
if (len < sizeof(struct prefix_info)) {
- ADBG(("addrconf: prefix option too short\n"));
+ netdev_dbg(dev, "addrconf: prefix option too short\n");
return;
}
-
+
/*
* Validation checks ([ADDRCONF], page 19)
*/
@@ -1643,196 +2804,166 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len)
prefered_lft = ntohl(pinfo->prefered);
if (prefered_lft > valid_lft) {
- if (net_ratelimit())
- printk(KERN_WARNING "addrconf: prefix option has invalid lifetime\n");
+ net_warn_ratelimited("addrconf: prefix option has invalid lifetime\n");
return;
}
in6_dev = in6_dev_get(dev);
- if (in6_dev == NULL) {
- if (net_ratelimit())
- printk(KERN_DEBUG "addrconf: device %s not configured\n", dev->name);
+ if (!in6_dev) {
+ net_dbg_ratelimited("addrconf: device %s not configured\n",
+ dev->name);
return;
}
+ if (valid_lft != 0 && valid_lft < in6_dev->cnf.accept_ra_min_lft)
+ goto put;
+
/*
* Two things going on here:
* 1) Add routes for on-link prefixes
* 2) Configure prefixes with the auto flag set
*/
- /* Avoid arithmetic overflow. Really, we could
- save rt_expires in seconds, likely valid_lft,
- but it would require division in fib gc, that it
- not good.
- */
- if (valid_lft >= 0x7FFFFFFF/HZ)
- rt_expires = 0x7FFFFFFF - (0x7FFFFFFF % HZ);
- else
- rt_expires = valid_lft * HZ;
+ if (pinfo->onlink) {
+ struct fib6_info *rt;
+ unsigned long rt_expires;
- /*
- * We convert this (in jiffies) to clock_t later.
- * Avoid arithmetic overflow there as well.
- * Overflow can happen only if HZ < USER_HZ.
- */
- if (HZ < USER_HZ && rt_expires > 0x7FFFFFFF / USER_HZ)
- rt_expires = 0x7FFFFFFF / USER_HZ;
+ /* Avoid arithmetic overflow. Really, we could
+ * save rt_expires in seconds, likely valid_lft,
+ * but it would require division in fib gc, that it
+ * not good.
+ */
+ if (HZ > USER_HZ)
+ rt_expires = addrconf_timeout_fixup(valid_lft, HZ);
+ else
+ rt_expires = addrconf_timeout_fixup(valid_lft, USER_HZ);
- if (pinfo->onlink) {
- struct rt6_info *rt;
- rt = rt6_lookup(&pinfo->prefix, NULL, dev->ifindex, 1);
-
- if (rt && ((rt->rt6i_flags & (RTF_GATEWAY | RTF_DEFAULT)) == 0)) {
- if (rt->rt6i_flags&RTF_EXPIRES) {
- if (valid_lft == 0) {
- ip6_del_rt(rt);
- rt = NULL;
+ if (addrconf_finite_timeout(rt_expires))
+ rt_expires *= HZ;
+
+ rt = addrconf_get_prefix_route(&pinfo->prefix,
+ pinfo->prefix_len,
+ dev,
+ RTF_ADDRCONF | RTF_PREFIX_RT,
+ RTF_DEFAULT, true);
+
+ if (rt) {
+ /* Autoconf prefix route */
+ if (valid_lft == 0) {
+ ip6_del_rt(net, rt, false);
+ rt = NULL;
+ } else {
+ table = rt->fib6_table;
+ spin_lock_bh(&table->tb6_lock);
+
+ if (addrconf_finite_timeout(rt_expires)) {
+ /* not infinity */
+ fib6_set_expires(rt, jiffies + rt_expires);
+ fib6_add_gc_list(rt);
} else {
- rt->rt6i_expires = jiffies + rt_expires;
+ fib6_clean_expires(rt);
+ fib6_remove_gc_list(rt);
}
+
+ spin_unlock_bh(&table->tb6_lock);
}
} else if (valid_lft) {
+ clock_t expires = 0;
+ int flags = RTF_ADDRCONF | RTF_PREFIX_RT;
+ if (addrconf_finite_timeout(rt_expires)) {
+ /* not infinity */
+ flags |= RTF_EXPIRES;
+ expires = jiffies_to_clock_t(rt_expires);
+ }
addrconf_prefix_route(&pinfo->prefix, pinfo->prefix_len,
- dev, jiffies_to_clock_t(rt_expires), RTF_ADDRCONF|RTF_EXPIRES|RTF_PREFIX_RT);
+ 0, dev, expires, flags,
+ GFP_ATOMIC);
}
- if (rt)
- dst_release(&rt->u.dst);
+ fib6_info_release(rt);
}
/* Try to figure out our local address for this prefix */
- if (pinfo->autoconf && in6_dev->cnf.autoconf) {
- struct inet6_ifaddr * ifp;
+ ignore_autoconf = READ_ONCE(in6_dev->cnf.ra_honor_pio_pflag) && pinfo->preferpd;
+ if (pinfo->autoconf && in6_dev->cnf.autoconf && !ignore_autoconf) {
struct in6_addr addr;
- int create = 0, update_lft = 0;
+ bool tokenized = false, dev_addr_generated = false;
if (pinfo->prefix_len == 64) {
memcpy(&addr, &pinfo->prefix, 8);
- if (ipv6_generate_eui64(addr.s6_addr + 8, dev) &&
- ipv6_inherit_eui64(addr.s6_addr + 8, in6_dev)) {
- in6_dev_put(in6_dev);
- return;
+
+ if (!ipv6_addr_any(&in6_dev->token)) {
+ read_lock_bh(&in6_dev->lock);
+ memcpy(addr.s6_addr + 8,
+ in6_dev->token.s6_addr + 8, 8);
+ read_unlock_bh(&in6_dev->lock);
+ tokenized = true;
+ } else if (is_addr_mode_generate_stable(in6_dev) &&
+ !ipv6_generate_stable_address(&addr, 0,
+ in6_dev)) {
+ addr_flags |= IFA_F_STABLE_PRIVACY;
+ goto ok;
+ } else if (ipv6_generate_eui64(addr.s6_addr + 8, dev) &&
+ ipv6_inherit_eui64(addr.s6_addr + 8, in6_dev)) {
+ goto put;
+ } else {
+ dev_addr_generated = true;
}
goto ok;
}
- if (net_ratelimit())
- printk(KERN_DEBUG "IPv6 addrconf: prefix with wrong length %d\n",
- pinfo->prefix_len);
- in6_dev_put(in6_dev);
- return;
+ net_dbg_ratelimited("IPv6 addrconf: prefix with wrong length %d\n",
+ pinfo->prefix_len);
+ goto put;
ok:
+ err = addrconf_prefix_rcv_add_addr(net, dev, pinfo, in6_dev,
+ &addr, addr_type,
+ addr_flags, sllao,
+ tokenized, valid_lft,
+ prefered_lft);
+ if (err)
+ goto put;
+
+ /* Ignore error case here because previous prefix add addr was
+ * successful which will be notified.
+ */
+ ndisc_ops_prefix_rcv_add_addr(net, dev, pinfo, in6_dev, &addr,
+ addr_type, addr_flags, sllao,
+ tokenized, valid_lft,
+ prefered_lft,
+ dev_addr_generated);
+ }
+ inet6_prefix_notify(RTM_NEWPREFIX, in6_dev, pinfo);
+put:
+ in6_dev_put(in6_dev);
+}
- ifp = ipv6_get_ifaddr(&addr, dev, 1);
-
- if (ifp == NULL && valid_lft) {
- int max_addresses = in6_dev->cnf.max_addresses;
-
- /* Do not allow to create too much of autoconfigured
- * addresses; this would be too easy way to crash kernel.
- */
- if (!max_addresses ||
- ipv6_count_addresses(in6_dev) < max_addresses)
- ifp = ipv6_add_addr(in6_dev, &addr, pinfo->prefix_len,
- addr_type&IPV6_ADDR_SCOPE_MASK, 0);
-
- if (!ifp || IS_ERR(ifp)) {
- in6_dev_put(in6_dev);
- return;
- }
-
- update_lft = create = 1;
- ifp->cstamp = jiffies;
- addrconf_dad_start(ifp, RTF_ADDRCONF|RTF_PREFIX_RT);
- }
-
- if (ifp) {
- int flags;
- unsigned long now;
-#ifdef CONFIG_IPV6_PRIVACY
- struct inet6_ifaddr *ift;
-#endif
- u32 stored_lft;
-
- /* update lifetime (RFC2462 5.5.3 e) */
- spin_lock(&ifp->lock);
- now = jiffies;
- if (ifp->valid_lft > (now - ifp->tstamp) / HZ)
- stored_lft = ifp->valid_lft - (now - ifp->tstamp) / HZ;
- else
- stored_lft = 0;
- if (!update_lft && stored_lft) {
- if (valid_lft > MIN_VALID_LIFETIME ||
- valid_lft > stored_lft)
- update_lft = 1;
- else if (stored_lft <= MIN_VALID_LIFETIME) {
- /* valid_lft <= stored_lft is always true */
- /* XXX: IPsec */
- update_lft = 0;
- } else {
- valid_lft = MIN_VALID_LIFETIME;
- if (valid_lft < prefered_lft)
- prefered_lft = valid_lft;
- update_lft = 1;
- }
- }
+static int addrconf_set_sit_dstaddr(struct net *net, struct net_device *dev,
+ struct in6_ifreq *ireq)
+{
+ struct ip_tunnel_parm_kern p = { };
+ int err;
- if (update_lft) {
- ifp->valid_lft = valid_lft;
- ifp->prefered_lft = prefered_lft;
- ifp->tstamp = now;
- flags = ifp->flags;
- ifp->flags &= ~IFA_F_DEPRECATED;
- spin_unlock(&ifp->lock);
+ if (!(ipv6_addr_type(&ireq->ifr6_addr) & IPV6_ADDR_COMPATv4))
+ return -EADDRNOTAVAIL;
- if (!(flags&IFA_F_TENTATIVE))
- ipv6_ifa_notify(0, ifp);
- } else
- spin_unlock(&ifp->lock);
+ p.iph.daddr = ireq->ifr6_addr.s6_addr32[3];
+ p.iph.version = 4;
+ p.iph.ihl = 5;
+ p.iph.protocol = IPPROTO_IPV6;
+ p.iph.ttl = 64;
-#ifdef CONFIG_IPV6_PRIVACY
- read_lock_bh(&in6_dev->lock);
- /* update all temporary addresses in the list */
- for (ift=in6_dev->tempaddr_list; ift; ift=ift->tmp_next) {
- /*
- * When adjusting the lifetimes of an existing
- * temporary address, only lower the lifetimes.
- * Implementations must not increase the
- * lifetimes of an existing temporary address
- * when processing a Prefix Information Option.
- */
- spin_lock(&ift->lock);
- flags = ift->flags;
- if (ift->valid_lft > valid_lft &&
- ift->valid_lft - valid_lft > (jiffies - ift->tstamp) / HZ)
- ift->valid_lft = valid_lft + (jiffies - ift->tstamp) / HZ;
- if (ift->prefered_lft > prefered_lft &&
- ift->prefered_lft - prefered_lft > (jiffies - ift->tstamp) / HZ)
- ift->prefered_lft = prefered_lft + (jiffies - ift->tstamp) / HZ;
- spin_unlock(&ift->lock);
- if (!(flags&IFA_F_TENTATIVE))
- ipv6_ifa_notify(0, ift);
- }
+ if (!dev->netdev_ops->ndo_tunnel_ctl)
+ return -EOPNOTSUPP;
+ err = dev->netdev_ops->ndo_tunnel_ctl(dev, &p, SIOCADDTUNNEL);
+ if (err)
+ return err;
- if (create && in6_dev->cnf.use_tempaddr > 0) {
- /*
- * When a new public address is created as described in [ADDRCONF],
- * also create a new temporary address.
- */
- read_unlock_bh(&in6_dev->lock);
- ipv6_create_tempaddr(ifp, NULL);
- } else {
- read_unlock_bh(&in6_dev->lock);
- }
-#endif
- in6_ifa_put(ifp);
- addrconf_verify(0);
- }
- }
- inet6_prefix_notify(RTM_NEWPREFIX, in6_dev, pinfo);
- in6_dev_put(in6_dev);
+ dev = __dev_get_by_name(net, p.name);
+ if (!dev)
+ return -ENOBUFS;
+ return dev_open(dev, NULL);
}
/*
@@ -1840,226 +2971,285 @@ ok:
* Special case for SIT interfaces where we create a new "virtual"
* device.
*/
-int addrconf_set_dstaddr(void __user *arg)
+int addrconf_set_dstaddr(struct net *net, void __user *arg)
{
- struct in6_ifreq ireq;
struct net_device *dev;
- int err = -EINVAL;
-
- rtnl_lock();
+ struct in6_ifreq ireq;
+ int err = -ENODEV;
- err = -EFAULT;
+ if (!IS_ENABLED(CONFIG_IPV6_SIT))
+ return -ENODEV;
if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq)))
- goto err_exit;
+ return -EFAULT;
- dev = __dev_get_by_index(ireq.ifr6_ifindex);
+ rtnl_net_lock(net);
+ dev = __dev_get_by_index(net, ireq.ifr6_ifindex);
+ if (dev && dev->type == ARPHRD_SIT)
+ err = addrconf_set_sit_dstaddr(net, dev, &ireq);
+ rtnl_net_unlock(net);
+ return err;
+}
- err = -ENODEV;
- if (dev == NULL)
- goto err_exit;
+static int ipv6_mc_config(struct sock *sk, bool join,
+ const struct in6_addr *addr, int ifindex)
+{
+ int ret;
-#if defined(CONFIG_IPV6_SIT) || defined(CONFIG_IPV6_SIT_MODULE)
- if (dev->type == ARPHRD_SIT) {
- struct ifreq ifr;
- mm_segment_t oldfs;
- struct ip_tunnel_parm p;
+ ASSERT_RTNL();
- err = -EADDRNOTAVAIL;
- if (!(ipv6_addr_type(&ireq.ifr6_addr) & IPV6_ADDR_COMPATv4))
- goto err_exit;
-
- memset(&p, 0, sizeof(p));
- p.iph.daddr = ireq.ifr6_addr.s6_addr32[3];
- p.iph.saddr = 0;
- p.iph.version = 4;
- p.iph.ihl = 5;
- p.iph.protocol = IPPROTO_IPV6;
- p.iph.ttl = 64;
- ifr.ifr_ifru.ifru_data = (void __user *)&p;
-
- oldfs = get_fs(); set_fs(KERNEL_DS);
- err = dev->do_ioctl(dev, &ifr, SIOCADDTUNNEL);
- set_fs(oldfs);
-
- if (err == 0) {
- err = -ENOBUFS;
- if ((dev = __dev_get_by_name(p.name)) == NULL)
- goto err_exit;
- err = dev_open(dev);
- }
- }
-#endif
+ lock_sock(sk);
+ if (join)
+ ret = ipv6_sock_mc_join(sk, ifindex, addr);
+ else
+ ret = ipv6_sock_mc_drop(sk, ifindex, addr);
+ release_sock(sk);
-err_exit:
- rtnl_unlock();
- return err;
+ return ret;
}
/*
* Manual configuration of address on an interface
*/
-static int inet6_addr_add(int ifindex, struct in6_addr *pfx, int plen,
- __u8 ifa_flags, __u32 prefered_lft, __u32 valid_lft)
+static int inet6_addr_add(struct net *net, struct net_device *dev,
+ struct ifa6_config *cfg, clock_t expires, u32 flags,
+ struct netlink_ext_ack *extack)
{
struct inet6_ifaddr *ifp;
struct inet6_dev *idev;
- struct net_device *dev;
- int scope;
- ASSERT_RTNL();
-
- /* check the lifetime */
- if (!valid_lft || prefered_lft > valid_lft)
+ ASSERT_RTNL_NET(net);
+
+ if (cfg->plen > 128) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid prefix length");
return -EINVAL;
+ }
- if ((dev = __dev_get_by_index(ifindex)) == NULL)
- return -ENODEV;
-
- if ((idev = addrconf_add_dev(dev)) == NULL)
- return -ENOBUFS;
+ if (cfg->ifa_flags & IFA_F_MANAGETEMPADDR && cfg->plen != 64) {
+ NL_SET_ERR_MSG_MOD(extack, "address with \"mngtmpaddr\" flag must have a prefix length of 64");
+ return -EINVAL;
+ }
- scope = ipv6_addr_scope(pfx);
+ idev = addrconf_add_dev(dev);
+ if (IS_ERR(idev)) {
+ NL_SET_ERR_MSG_MOD(extack, "IPv6 is disabled on this device");
+ return PTR_ERR(idev);
+ }
- if (valid_lft == INFINITY_LIFE_TIME)
- ifa_flags |= IFA_F_PERMANENT;
- else if (valid_lft >= 0x7FFFFFFF/HZ)
- valid_lft = 0x7FFFFFFF/HZ;
+ if (cfg->ifa_flags & IFA_F_MCAUTOJOIN) {
+ int ret = ipv6_mc_config(net->ipv6.mc_autojoin_sk,
+ true, cfg->pfx, dev->ifindex);
- if (prefered_lft == 0)
- ifa_flags |= IFA_F_DEPRECATED;
- else if ((prefered_lft >= 0x7FFFFFFF/HZ) &&
- (prefered_lft != INFINITY_LIFE_TIME))
- prefered_lft = 0x7FFFFFFF/HZ;
+ if (ret < 0) {
+ NL_SET_ERR_MSG_MOD(extack, "Multicast auto join failed");
+ return ret;
+ }
+ }
- ifp = ipv6_add_addr(idev, pfx, plen, scope, ifa_flags);
+ cfg->scope = ipv6_addr_scope(cfg->pfx);
+ ifp = ipv6_add_addr(idev, cfg, true, extack);
if (!IS_ERR(ifp)) {
- spin_lock_bh(&ifp->lock);
- ifp->valid_lft = valid_lft;
- ifp->prefered_lft = prefered_lft;
- ifp->tstamp = jiffies;
- spin_unlock_bh(&ifp->lock);
+ if (!(cfg->ifa_flags & IFA_F_NOPREFIXROUTE)) {
+ addrconf_prefix_route(&ifp->addr, ifp->prefix_len,
+ ifp->rt_priority, dev, expires,
+ flags, GFP_KERNEL);
+ }
- addrconf_dad_start(ifp, 0);
+ /* Send a netlink notification if DAD is enabled and
+ * optimistic flag is not set
+ */
+ if (!(ifp->flags & (IFA_F_OPTIMISTIC | IFA_F_NODAD)))
+ ipv6_ifa_notify(0, ifp);
+ /*
+ * Note that section 3.1 of RFC 4429 indicates
+ * that the Optimistic flag should not be set for
+ * manually configured addresses
+ */
+ addrconf_dad_start(ifp);
+ if (cfg->ifa_flags & IFA_F_MANAGETEMPADDR)
+ manage_tempaddrs(idev, ifp, cfg->valid_lft,
+ cfg->preferred_lft, true, jiffies);
in6_ifa_put(ifp);
- addrconf_verify(0);
+ addrconf_verify_rtnl(net);
return 0;
+ } else if (cfg->ifa_flags & IFA_F_MCAUTOJOIN) {
+ ipv6_mc_config(net->ipv6.mc_autojoin_sk, false,
+ cfg->pfx, dev->ifindex);
}
return PTR_ERR(ifp);
}
-static int inet6_addr_del(int ifindex, struct in6_addr *pfx, int plen)
+static int inet6_addr_del(struct net *net, int ifindex, u32 ifa_flags,
+ const struct in6_addr *pfx, unsigned int plen,
+ struct netlink_ext_ack *extack)
{
struct inet6_ifaddr *ifp;
struct inet6_dev *idev;
struct net_device *dev;
-
- if ((dev = __dev_get_by_index(ifindex)) == NULL)
+
+ if (plen > 128) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid prefix length");
+ return -EINVAL;
+ }
+
+ dev = __dev_get_by_index(net, ifindex);
+ if (!dev) {
+ NL_SET_ERR_MSG_MOD(extack, "Unable to find the interface");
return -ENODEV;
+ }
- if ((idev = __in6_dev_get(dev)) == NULL)
+ idev = __in6_dev_get_rtnl_net(dev);
+ if (!idev) {
+ NL_SET_ERR_MSG_MOD(extack, "IPv6 is disabled on this device");
return -ENXIO;
+ }
read_lock_bh(&idev->lock);
- for (ifp = idev->addr_list; ifp; ifp=ifp->if_next) {
+ list_for_each_entry(ifp, &idev->addr_list, if_list) {
if (ifp->prefix_len == plen &&
ipv6_addr_equal(pfx, &ifp->addr)) {
in6_ifa_hold(ifp);
read_unlock_bh(&idev->lock);
-
+
ipv6_del_addr(ifp);
- /* If the last address is deleted administratively,
- disable IPv6 on this interface.
- */
- if (idev->addr_list == NULL)
- addrconf_ifdown(idev->dev, 1);
+ if (!(ifp->flags & IFA_F_TEMPORARY) &&
+ (ifp->flags & IFA_F_MANAGETEMPADDR))
+ delete_tempaddrs(idev, ifp);
+
+ addrconf_verify_rtnl(net);
+ if (ipv6_addr_is_multicast(pfx)) {
+ ipv6_mc_config(net->ipv6.mc_autojoin_sk,
+ false, pfx, dev->ifindex);
+ }
return 0;
}
}
read_unlock_bh(&idev->lock);
+
+ NL_SET_ERR_MSG_MOD(extack, "address not found");
return -EADDRNOTAVAIL;
}
-int addrconf_add_ifaddr(void __user *arg)
+int addrconf_add_ifaddr(struct net *net, void __user *arg)
{
+ struct ifa6_config cfg = {
+ .ifa_flags = IFA_F_PERMANENT,
+ .preferred_lft = INFINITY_LIFE_TIME,
+ .valid_lft = INFINITY_LIFE_TIME,
+ };
+ struct net_device *dev;
struct in6_ifreq ireq;
int err;
-
- if (!capable(CAP_NET_ADMIN))
+
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
return -EPERM;
-
+
if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq)))
return -EFAULT;
- rtnl_lock();
- err = inet6_addr_add(ireq.ifr6_ifindex, &ireq.ifr6_addr, ireq.ifr6_prefixlen,
- IFA_F_PERMANENT, INFINITY_LIFE_TIME, INFINITY_LIFE_TIME);
- rtnl_unlock();
+ cfg.pfx = &ireq.ifr6_addr;
+ cfg.plen = ireq.ifr6_prefixlen;
+
+ rtnl_net_lock(net);
+ dev = __dev_get_by_index(net, ireq.ifr6_ifindex);
+ if (dev) {
+ netdev_lock_ops(dev);
+ err = inet6_addr_add(net, dev, &cfg, 0, 0, NULL);
+ netdev_unlock_ops(dev);
+ } else {
+ err = -ENODEV;
+ }
+ rtnl_net_unlock(net);
return err;
}
-int addrconf_del_ifaddr(void __user *arg)
+int addrconf_del_ifaddr(struct net *net, void __user *arg)
{
struct in6_ifreq ireq;
int err;
-
- if (!capable(CAP_NET_ADMIN))
+
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
return -EPERM;
if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq)))
return -EFAULT;
- rtnl_lock();
- err = inet6_addr_del(ireq.ifr6_ifindex, &ireq.ifr6_addr, ireq.ifr6_prefixlen);
- rtnl_unlock();
+ rtnl_net_lock(net);
+ err = inet6_addr_del(net, ireq.ifr6_ifindex, 0, &ireq.ifr6_addr,
+ ireq.ifr6_prefixlen, NULL);
+ rtnl_net_unlock(net);
return err;
}
-#if defined(CONFIG_IPV6_SIT) || defined(CONFIG_IPV6_SIT_MODULE)
-static void sit_add_v4_addrs(struct inet6_dev *idev)
+static void add_addr(struct inet6_dev *idev, const struct in6_addr *addr,
+ int plen, int scope, u8 proto)
+{
+ struct inet6_ifaddr *ifp;
+ struct ifa6_config cfg = {
+ .pfx = addr,
+ .plen = plen,
+ .ifa_flags = IFA_F_PERMANENT,
+ .valid_lft = INFINITY_LIFE_TIME,
+ .preferred_lft = INFINITY_LIFE_TIME,
+ .scope = scope,
+ .ifa_proto = proto
+ };
+
+ ifp = ipv6_add_addr(idev, &cfg, true, NULL);
+ if (!IS_ERR(ifp)) {
+ spin_lock_bh(&ifp->lock);
+ ifp->flags &= ~IFA_F_TENTATIVE;
+ spin_unlock_bh(&ifp->lock);
+ rt_genid_bump_ipv6(dev_net(idev->dev));
+ ipv6_ifa_notify(RTM_NEWADDR, ifp);
+ in6_ifa_put(ifp);
+ }
+}
+
+#if IS_ENABLED(CONFIG_IPV6_SIT) || IS_ENABLED(CONFIG_NET_IPGRE)
+static void add_v4_addrs(struct inet6_dev *idev)
{
- struct inet6_ifaddr * ifp;
struct in6_addr addr;
struct net_device *dev;
- int scope;
+ struct net *net = dev_net(idev->dev);
+ int scope, plen;
+ u32 pflags = 0;
ASSERT_RTNL();
memset(&addr, 0, sizeof(struct in6_addr));
memcpy(&addr.s6_addr32[3], idev->dev->dev_addr, 4);
- if (idev->dev->flags&IFF_POINTOPOINT) {
+ if (!(idev->dev->flags & IFF_POINTOPOINT) && idev->dev->type == ARPHRD_SIT) {
+ scope = IPV6_ADDR_COMPATv4;
+ plen = 96;
+ pflags |= RTF_NONEXTHOP;
+ } else {
+ if (idev->cnf.addr_gen_mode == IN6_ADDR_GEN_MODE_NONE)
+ return;
+
addr.s6_addr32[0] = htonl(0xfe800000);
scope = IFA_LINK;
- } else {
- scope = IPV6_ADDR_COMPATv4;
+ plen = 64;
}
if (addr.s6_addr32[3]) {
- ifp = ipv6_add_addr(idev, &addr, 128, scope, IFA_F_PERMANENT);
- if (!IS_ERR(ifp)) {
- spin_lock_bh(&ifp->lock);
- ifp->flags &= ~IFA_F_TENTATIVE;
- spin_unlock_bh(&ifp->lock);
- ipv6_ifa_notify(RTM_NEWADDR, ifp);
- in6_ifa_put(ifp);
- }
+ add_addr(idev, &addr, plen, scope, IFAPROT_UNSPEC);
+ addrconf_prefix_route(&addr, plen, 0, idev->dev, 0, pflags,
+ GFP_KERNEL);
return;
}
- for (dev = dev_base; dev != NULL; dev = dev->next) {
- struct in_device * in_dev = __in_dev_get_rtnl(dev);
+ for_each_netdev(net, dev) {
+ struct in_device *in_dev = __in_dev_get_rtnl(dev);
if (in_dev && (dev->flags & IFF_UP)) {
- struct in_ifaddr * ifa;
-
+ struct in_ifaddr *ifa;
int flag = scope;
- for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
- int plen;
-
+ in_dev_for_each_ifa_rtnl(ifa, in_dev) {
addr.s6_addr32[3] = ifa->ifa_local;
if (ifa->ifa_scope == RT_SCOPE_LINK)
@@ -2069,256 +3259,515 @@ static void sit_add_v4_addrs(struct inet6_dev *idev)
continue;
flag |= IFA_HOST;
}
- if (idev->dev->flags&IFF_POINTOPOINT)
- plen = 64;
- else
- plen = 96;
-
- ifp = ipv6_add_addr(idev, &addr, plen, flag,
- IFA_F_PERMANENT);
- if (!IS_ERR(ifp)) {
- spin_lock_bh(&ifp->lock);
- ifp->flags &= ~IFA_F_TENTATIVE;
- spin_unlock_bh(&ifp->lock);
- ipv6_ifa_notify(RTM_NEWADDR, ifp);
- in6_ifa_put(ifp);
- }
+
+ add_addr(idev, &addr, plen, flag,
+ IFAPROT_UNSPEC);
+ addrconf_prefix_route(&addr, plen, 0, idev->dev,
+ 0, pflags, GFP_KERNEL);
}
}
- }
+ }
}
#endif
static void init_loopback(struct net_device *dev)
{
struct inet6_dev *idev;
- struct inet6_ifaddr * ifp;
/* ::1 */
ASSERT_RTNL();
- if ((idev = ipv6_find_idev(dev)) == NULL) {
- printk(KERN_DEBUG "init loopback: add_dev failed\n");
+ idev = ipv6_find_idev(dev);
+ if (IS_ERR(idev)) {
+ pr_debug("%s: add_dev failed\n", __func__);
return;
}
- ifp = ipv6_add_addr(idev, &in6addr_loopback, 128, IFA_HOST, IFA_F_PERMANENT);
+ add_addr(idev, &in6addr_loopback, 128, IFA_HOST, IFAPROT_KERNEL_LO);
+}
+
+void addrconf_add_linklocal(struct inet6_dev *idev,
+ const struct in6_addr *addr, u32 flags)
+{
+ struct ifa6_config cfg = {
+ .pfx = addr,
+ .plen = 64,
+ .ifa_flags = flags | IFA_F_PERMANENT,
+ .valid_lft = INFINITY_LIFE_TIME,
+ .preferred_lft = INFINITY_LIFE_TIME,
+ .scope = IFA_LINK,
+ .ifa_proto = IFAPROT_KERNEL_LL
+ };
+ struct inet6_ifaddr *ifp;
+
+#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
+ if ((READ_ONCE(dev_net(idev->dev)->ipv6.devconf_all->optimistic_dad) ||
+ READ_ONCE(idev->cnf.optimistic_dad)) &&
+ !dev_net(idev->dev)->ipv6.devconf_all->forwarding)
+ cfg.ifa_flags |= IFA_F_OPTIMISTIC;
+#endif
+
+ ifp = ipv6_add_addr(idev, &cfg, true, NULL);
if (!IS_ERR(ifp)) {
- spin_lock_bh(&ifp->lock);
- ifp->flags &= ~IFA_F_TENTATIVE;
- spin_unlock_bh(&ifp->lock);
- ipv6_ifa_notify(RTM_NEWADDR, ifp);
+ addrconf_prefix_route(&ifp->addr, ifp->prefix_len, 0, idev->dev,
+ 0, 0, GFP_ATOMIC);
+ addrconf_dad_start(ifp);
in6_ifa_put(ifp);
}
}
+EXPORT_SYMBOL_GPL(addrconf_add_linklocal);
-static void addrconf_add_linklocal(struct inet6_dev *idev, struct in6_addr *addr)
+static bool ipv6_reserved_interfaceid(struct in6_addr address)
{
- struct inet6_ifaddr * ifp;
+ if ((address.s6_addr32[2] | address.s6_addr32[3]) == 0)
+ return true;
- ifp = ipv6_add_addr(idev, addr, 64, IFA_LINK, IFA_F_PERMANENT);
- if (!IS_ERR(ifp)) {
- addrconf_dad_start(ifp, 0);
- in6_ifa_put(ifp);
+ if (address.s6_addr32[2] == htonl(0x02005eff) &&
+ ((address.s6_addr32[3] & htonl(0xfe000000)) == htonl(0xfe000000)))
+ return true;
+
+ if (address.s6_addr32[2] == htonl(0xfdffffff) &&
+ ((address.s6_addr32[3] & htonl(0xffffff80)) == htonl(0xffffff80)))
+ return true;
+
+ return false;
+}
+
+static int ipv6_generate_stable_address(struct in6_addr *address,
+ u8 dad_count,
+ const struct inet6_dev *idev)
+{
+ static DEFINE_SPINLOCK(lock);
+ static __u32 digest[SHA1_DIGEST_WORDS];
+ static __u32 workspace[SHA1_WORKSPACE_WORDS];
+
+ static union {
+ char __data[SHA1_BLOCK_SIZE];
+ struct {
+ struct in6_addr secret;
+ __be32 prefix[2];
+ unsigned char hwaddr[MAX_ADDR_LEN];
+ u8 dad_count;
+ } __packed;
+ } data;
+
+ struct in6_addr secret;
+ struct in6_addr temp;
+ struct net *net = dev_net(idev->dev);
+
+ BUILD_BUG_ON(sizeof(data.__data) != sizeof(data));
+
+ if (idev->cnf.stable_secret.initialized)
+ secret = idev->cnf.stable_secret.secret;
+ else if (net->ipv6.devconf_dflt->stable_secret.initialized)
+ secret = net->ipv6.devconf_dflt->stable_secret.secret;
+ else
+ return -1;
+
+retry:
+ spin_lock_bh(&lock);
+
+ sha1_init_raw(digest);
+ memset(&data, 0, sizeof(data));
+ memset(workspace, 0, sizeof(workspace));
+ memcpy(data.hwaddr, idev->dev->perm_addr, idev->dev->addr_len);
+ data.prefix[0] = address->s6_addr32[0];
+ data.prefix[1] = address->s6_addr32[1];
+ data.secret = secret;
+ data.dad_count = dad_count;
+
+ sha1_transform(digest, data.__data, workspace);
+
+ temp = *address;
+ temp.s6_addr32[2] = (__force __be32)digest[0];
+ temp.s6_addr32[3] = (__force __be32)digest[1];
+
+ spin_unlock_bh(&lock);
+
+ if (ipv6_reserved_interfaceid(temp)) {
+ dad_count++;
+ if (dad_count > dev_net(idev->dev)->ipv6.sysctl.idgen_retries)
+ return -1;
+ goto retry;
}
+
+ *address = temp;
+ return 0;
}
-static void addrconf_dev_config(struct net_device *dev)
+static void ipv6_gen_mode_random_init(struct inet6_dev *idev)
+{
+ struct ipv6_stable_secret *s = &idev->cnf.stable_secret;
+
+ if (s->initialized)
+ return;
+ s = &idev->cnf.stable_secret;
+ get_random_bytes(&s->secret, sizeof(s->secret));
+ s->initialized = true;
+}
+
+static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route)
{
struct in6_addr addr;
- struct inet6_dev * idev;
+
+ /* no link local addresses on L3 master devices */
+ if (netif_is_l3_master(idev->dev))
+ return;
+
+ /* no link local addresses on devices flagged as slaves */
+ if (idev->dev->priv_flags & IFF_NO_ADDRCONF)
+ return;
+
+ ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0);
+
+ switch (idev->cnf.addr_gen_mode) {
+ case IN6_ADDR_GEN_MODE_RANDOM:
+ ipv6_gen_mode_random_init(idev);
+ fallthrough;
+ case IN6_ADDR_GEN_MODE_STABLE_PRIVACY:
+ if (!ipv6_generate_stable_address(&addr, 0, idev))
+ addrconf_add_linklocal(idev, &addr,
+ IFA_F_STABLE_PRIVACY);
+ else if (prefix_route)
+ addrconf_prefix_route(&addr, 64, 0, idev->dev,
+ 0, 0, GFP_KERNEL);
+ break;
+ case IN6_ADDR_GEN_MODE_EUI64:
+ /* addrconf_add_linklocal also adds a prefix_route and we
+ * only need to care about prefix routes if ipv6_generate_eui64
+ * couldn't generate one.
+ */
+ if (ipv6_generate_eui64(addr.s6_addr + 8, idev->dev) == 0)
+ addrconf_add_linklocal(idev, &addr, 0);
+ else if (prefix_route)
+ addrconf_prefix_route(&addr, 64, 0, idev->dev,
+ 0, 0, GFP_KERNEL);
+ break;
+ case IN6_ADDR_GEN_MODE_NONE:
+ default:
+ /* will not add any link local address */
+ break;
+ }
+}
+
+static void addrconf_dev_config(struct net_device *dev)
+{
+ struct inet6_dev *idev;
ASSERT_RTNL();
- if ((dev->type != ARPHRD_ETHER) &&
+ if ((dev->type != ARPHRD_ETHER) &&
(dev->type != ARPHRD_FDDI) &&
- (dev->type != ARPHRD_IEEE802_TR) &&
(dev->type != ARPHRD_ARCNET) &&
- (dev->type != ARPHRD_INFINIBAND)) {
+ (dev->type != ARPHRD_INFINIBAND) &&
+ (dev->type != ARPHRD_IEEE1394) &&
+ (dev->type != ARPHRD_TUNNEL6) &&
+ (dev->type != ARPHRD_6LOWPAN) &&
+ (dev->type != ARPHRD_IP6GRE) &&
+ (dev->type != ARPHRD_TUNNEL) &&
+ (dev->type != ARPHRD_NONE) &&
+ (dev->type != ARPHRD_RAWIP)) {
/* Alas, we support only Ethernet autoconfiguration. */
+ idev = __in6_dev_get(dev);
+ if (!IS_ERR_OR_NULL(idev) && dev->flags & IFF_UP &&
+ dev->flags & IFF_MULTICAST)
+ ipv6_mc_up(idev);
return;
}
idev = addrconf_add_dev(dev);
- if (idev == NULL)
+ if (IS_ERR(idev))
return;
- memset(&addr, 0, sizeof(struct in6_addr));
- addr.s6_addr32[0] = htonl(0xFE800000);
+ /* this device type has no EUI support */
+ if (dev->type == ARPHRD_NONE &&
+ idev->cnf.addr_gen_mode == IN6_ADDR_GEN_MODE_EUI64)
+ WRITE_ONCE(idev->cnf.addr_gen_mode,
+ IN6_ADDR_GEN_MODE_RANDOM);
- if (ipv6_generate_eui64(addr.s6_addr + 8, dev) == 0)
- addrconf_add_linklocal(idev, &addr);
+ addrconf_addr_gen(idev, false);
}
-#if defined(CONFIG_IPV6_SIT) || defined(CONFIG_IPV6_SIT_MODULE)
+#if IS_ENABLED(CONFIG_IPV6_SIT)
static void addrconf_sit_config(struct net_device *dev)
{
struct inet6_dev *idev;
ASSERT_RTNL();
- /*
- * Configure the tunnel with one of our IPv4
- * addresses... we should configure all of
+ /*
+ * Configure the tunnel with one of our IPv4
+ * addresses... we should configure all of
* our v4 addrs in the tunnel
*/
- if ((idev = ipv6_find_idev(dev)) == NULL) {
- printk(KERN_DEBUG "init sit: add_dev failed\n");
+ idev = ipv6_find_idev(dev);
+ if (IS_ERR(idev)) {
+ pr_debug("%s: add_dev failed\n", __func__);
+ return;
+ }
+
+ if (dev->priv_flags & IFF_ISATAP) {
+ addrconf_addr_gen(idev, false);
return;
}
- sit_add_v4_addrs(idev);
+ add_v4_addrs(idev);
- if (dev->flags&IFF_POINTOPOINT) {
+ if (dev->flags&IFF_POINTOPOINT)
addrconf_add_mroute(dev);
- addrconf_add_lroute(dev);
- } else
- sit_route_add(dev);
}
#endif
-static inline int
-ipv6_inherit_linklocal(struct inet6_dev *idev, struct net_device *link_dev)
+#if IS_ENABLED(CONFIG_NET_IPGRE)
+static void addrconf_gre_config(struct net_device *dev)
{
- struct in6_addr lladdr;
+ struct inet6_dev *idev;
- if (!ipv6_get_lladdr(link_dev, &lladdr)) {
- addrconf_add_linklocal(idev, &lladdr);
- return 0;
+ ASSERT_RTNL();
+
+ idev = addrconf_add_dev(dev);
+ if (IS_ERR(idev))
+ return;
+
+ /* Generate the IPv6 link-local address using addrconf_addr_gen(),
+ * unless we have an IPv4 GRE device not bound to an IP address and
+ * which is in EUI64 mode (as __ipv6_isatap_ifid() would fail in this
+ * case). Such devices fall back to add_v4_addrs() instead.
+ */
+ if (!(*(__be32 *)dev->dev_addr == 0 &&
+ idev->cnf.addr_gen_mode == IN6_ADDR_GEN_MODE_EUI64)) {
+ addrconf_addr_gen(idev, true);
+ return;
}
- return -1;
+
+ add_v4_addrs(idev);
}
+#endif
-static void ip6_tnl_add_linklocal(struct inet6_dev *idev)
+static void addrconf_init_auto_addrs(struct net_device *dev)
{
- struct net_device *link_dev;
+ switch (dev->type) {
+#if IS_ENABLED(CONFIG_IPV6_SIT)
+ case ARPHRD_SIT:
+ addrconf_sit_config(dev);
+ break;
+#endif
+#if IS_ENABLED(CONFIG_NET_IPGRE)
+ case ARPHRD_IPGRE:
+ addrconf_gre_config(dev);
+ break;
+#endif
+ case ARPHRD_LOOPBACK:
+ init_loopback(dev);
+ break;
- /* first try to inherit the link-local address from the link device */
- if (idev->dev->iflink &&
- (link_dev = __dev_get_by_index(idev->dev->iflink))) {
- if (!ipv6_inherit_linklocal(idev, link_dev))
- return;
- }
- /* then try to inherit it from any device */
- for (link_dev = dev_base; link_dev; link_dev = link_dev->next) {
- if (!ipv6_inherit_linklocal(idev, link_dev))
- return;
+ default:
+ addrconf_dev_config(dev);
+ break;
}
- printk(KERN_DEBUG "init ip6-ip6: add_linklocal failed\n");
}
-/*
- * Autoconfigure tunnel with a link-local address so routing protocols,
- * DHCPv6, MLD etc. can be run over the virtual link
- */
+static int fixup_permanent_addr(struct net *net,
+ struct inet6_dev *idev,
+ struct inet6_ifaddr *ifp)
+{
+ /* !fib6_node means the host route was removed from the
+ * FIB, for example, if 'lo' device is taken down. In that
+ * case regenerate the host route.
+ */
+ if (!ifp->rt || !ifp->rt->fib6_node) {
+ struct fib6_info *f6i, *prev;
+
+ f6i = addrconf_f6i_alloc(net, idev, &ifp->addr, false,
+ GFP_ATOMIC, NULL);
+ if (IS_ERR(f6i))
+ return PTR_ERR(f6i);
+
+ /* ifp->rt can be accessed outside of rtnl */
+ spin_lock(&ifp->lock);
+ prev = ifp->rt;
+ ifp->rt = f6i;
+ spin_unlock(&ifp->lock);
+
+ fib6_info_release(prev);
+ }
+
+ if (!(ifp->flags & IFA_F_NOPREFIXROUTE)) {
+ addrconf_prefix_route(&ifp->addr, ifp->prefix_len,
+ ifp->rt_priority, idev->dev, 0, 0,
+ GFP_ATOMIC);
+ }
+
+ if (ifp->state == INET6_IFADDR_STATE_PREDAD)
+ addrconf_dad_start(ifp);
+
+ return 0;
+}
-static void addrconf_ip6_tnl_config(struct net_device *dev)
+static void addrconf_permanent_addr(struct net *net, struct net_device *dev)
{
+ struct inet6_ifaddr *ifp, *tmp;
struct inet6_dev *idev;
- ASSERT_RTNL();
-
- if ((idev = addrconf_add_dev(dev)) == NULL) {
- printk(KERN_DEBUG "init ip6-ip6: add_dev failed\n");
+ idev = __in6_dev_get(dev);
+ if (!idev)
return;
+
+ write_lock_bh(&idev->lock);
+
+ list_for_each_entry_safe(ifp, tmp, &idev->addr_list, if_list) {
+ if ((ifp->flags & IFA_F_PERMANENT) &&
+ fixup_permanent_addr(net, idev, ifp) < 0) {
+ write_unlock_bh(&idev->lock);
+ in6_ifa_hold(ifp);
+ ipv6_del_addr(ifp);
+ write_lock_bh(&idev->lock);
+
+ net_info_ratelimited("%s: Failed to add prefix route for address %pI6c; dropping\n",
+ idev->dev->name, &ifp->addr);
+ }
}
- ip6_tnl_add_linklocal(idev);
+
+ write_unlock_bh(&idev->lock);
}
-static int addrconf_notify(struct notifier_block *this, unsigned long event,
- void * data)
+static int addrconf_notify(struct notifier_block *this, unsigned long event,
+ void *ptr)
{
- struct net_device *dev = (struct net_device *) data;
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct netdev_notifier_change_info *change_info;
+ struct netdev_notifier_changeupper_info *info;
struct inet6_dev *idev = __in6_dev_get(dev);
+ struct net *net = dev_net(dev);
int run_pending = 0;
+ int err;
+
+ switch (event) {
+ case NETDEV_REGISTER:
+ if (!idev && dev->mtu >= IPV6_MIN_MTU) {
+ idev = ipv6_add_dev(dev);
+ if (IS_ERR(idev))
+ return notifier_from_errno(PTR_ERR(idev));
+ }
+ break;
- switch(event) {
+ case NETDEV_CHANGEMTU:
+ /* if MTU under IPV6_MIN_MTU stop IPv6 on this interface. */
+ if (dev->mtu < IPV6_MIN_MTU) {
+ addrconf_ifdown(dev, dev != net->loopback_dev);
+ break;
+ }
+
+ if (idev) {
+ rt6_mtu_change(dev, dev->mtu);
+ WRITE_ONCE(idev->cnf.mtu6, dev->mtu);
+ break;
+ }
+
+ /* allocate new idev */
+ idev = ipv6_add_dev(dev);
+ if (IS_ERR(idev))
+ break;
+
+ /* device is still not ready */
+ if (!(idev->if_flags & IF_READY))
+ break;
+
+ run_pending = 1;
+ fallthrough;
case NETDEV_UP:
case NETDEV_CHANGE:
+ if (idev && idev->cnf.disable_ipv6)
+ break;
+
+ if (dev->priv_flags & IFF_NO_ADDRCONF) {
+ if (event == NETDEV_UP && !IS_ERR_OR_NULL(idev) &&
+ dev->flags & IFF_UP && dev->flags & IFF_MULTICAST)
+ ipv6_mc_up(idev);
+ break;
+ }
+
if (event == NETDEV_UP) {
- if (!netif_carrier_ok(dev)) {
+ /* restore routes for permanent addresses */
+ addrconf_permanent_addr(net, dev);
+
+ if (!addrconf_link_ready(dev)) {
/* device is not ready yet. */
- printk(KERN_INFO
- "ADDRCONF(NETDEV_UP): %s: "
- "link is not ready\n",
- dev->name);
+ pr_debug("ADDRCONF(NETDEV_UP): %s: link is not ready\n",
+ dev->name);
break;
}
- if (idev)
+ if (!idev && dev->mtu >= IPV6_MIN_MTU)
+ idev = ipv6_add_dev(dev);
+
+ if (!IS_ERR_OR_NULL(idev)) {
idev->if_flags |= IF_READY;
- } else {
- if (!netif_carrier_ok(dev)) {
+ run_pending = 1;
+ }
+ } else if (event == NETDEV_CHANGE) {
+ if (!addrconf_link_ready(dev)) {
/* device is still not ready. */
+ rt6_sync_down_dev(dev, event);
break;
}
- if (idev) {
+ if (!IS_ERR_OR_NULL(idev)) {
if (idev->if_flags & IF_READY) {
- /* device is already configured. */
+ /* device is already configured -
+ * but resend MLD reports, we might
+ * have roamed and need to update
+ * multicast snooping switches
+ */
+ ipv6_mc_up(idev);
+ change_info = ptr;
+ if (change_info->flags_changed & IFF_NOARP)
+ addrconf_dad_run(idev, true);
+ rt6_sync_up(dev, RTNH_F_LINKDOWN);
break;
}
idev->if_flags |= IF_READY;
}
- printk(KERN_INFO
- "ADDRCONF(NETDEV_CHANGE): %s: "
- "link becomes ready\n",
- dev->name);
+ pr_debug("ADDRCONF(NETDEV_CHANGE): %s: link becomes ready\n",
+ dev->name);
run_pending = 1;
}
- switch(dev->type) {
-#if defined(CONFIG_IPV6_SIT) || defined(CONFIG_IPV6_SIT_MODULE)
- case ARPHRD_SIT:
- addrconf_sit_config(dev);
- break;
-#endif
- case ARPHRD_TUNNEL6:
- addrconf_ip6_tnl_config(dev);
- break;
- case ARPHRD_LOOPBACK:
- init_loopback(dev);
- break;
+ addrconf_init_auto_addrs(dev);
- default:
- addrconf_dev_config(dev);
- break;
- };
- if (idev) {
+ if (!IS_ERR_OR_NULL(idev)) {
if (run_pending)
- addrconf_dad_run(idev);
+ addrconf_dad_run(idev, false);
+
+ /* Device has an address by now */
+ rt6_sync_up(dev, RTNH_F_DEAD);
- /* If the MTU changed during the interface down, when the
- interface up, the changed MTU must be reflected in the
- idev as well as routers.
+ /*
+ * If the MTU changed during the interface down,
+ * when the interface up, the changed MTU must be
+ * reflected in the idev as well as routers.
*/
- if (idev->cnf.mtu6 != dev->mtu && dev->mtu >= IPV6_MIN_MTU) {
+ if (idev->cnf.mtu6 != dev->mtu &&
+ dev->mtu >= IPV6_MIN_MTU) {
rt6_mtu_change(dev, dev->mtu);
- idev->cnf.mtu6 = dev->mtu;
+ WRITE_ONCE(idev->cnf.mtu6, dev->mtu);
}
- idev->tstamp = jiffies;
+ WRITE_ONCE(idev->tstamp, jiffies);
inet6_ifinfo_notify(RTM_NEWLINK, idev);
- /* If the changed mtu during down is lower than IPV6_MIN_MTU
- stop IPv6 on this interface.
+
+ /*
+ * If the changed mtu during down is lower than
+ * IPV6_MIN_MTU stop IPv6 on this interface.
*/
if (dev->mtu < IPV6_MIN_MTU)
- addrconf_ifdown(dev, event != NETDEV_DOWN);
+ addrconf_ifdown(dev, dev != net->loopback_dev);
}
break;
- case NETDEV_CHANGEMTU:
- if ( idev && dev->mtu >= IPV6_MIN_MTU) {
- rt6_mtu_change(dev, dev->mtu);
- idev->cnf.mtu6 = dev->mtu;
- break;
- }
-
- /* MTU falled under IPV6_MIN_MTU. Stop IPv6 on this interface. */
-
case NETDEV_DOWN:
case NETDEV_UNREGISTER:
/*
@@ -2328,19 +3777,35 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
break;
case NETDEV_CHANGENAME:
-#ifdef CONFIG_SYSCTL
if (idev) {
- addrconf_sysctl_unregister(&idev->cnf);
- neigh_sysctl_unregister(idev->nd_parms);
- neigh_sysctl_register(dev, idev->nd_parms,
- NET_IPV6, NET_IPV6_NEIGH, "ipv6",
- &ndisc_ifinfo_sysctl_change,
- NULL);
- addrconf_sysctl_register(idev, &idev->cnf);
+ snmp6_unregister_dev(idev);
+ addrconf_sysctl_unregister(idev);
+ err = addrconf_sysctl_register(idev);
+ if (err)
+ return notifier_from_errno(err);
+ err = snmp6_register_dev(idev);
+ if (err) {
+ addrconf_sysctl_unregister(idev);
+ return notifier_from_errno(err);
+ }
}
-#endif
break;
- };
+
+ case NETDEV_PRE_TYPE_CHANGE:
+ case NETDEV_POST_TYPE_CHANGE:
+ if (idev)
+ addrconf_type_change(dev, event);
+ break;
+
+ case NETDEV_CHANGEUPPER:
+ info = ptr;
+
+ /* flush all routes if dev is linked to or unlinked from
+ * an L3 master device (e.g., VRF)
+ */
+ if (info->upper_dev && netif_is_l3_master(info->upper_dev))
+ addrconf_ifdown(dev, false);
+ }
return NOTIFY_OK;
}
@@ -2350,75 +3815,114 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
*/
static struct notifier_block ipv6_dev_notf = {
.notifier_call = addrconf_notify,
- .priority = 0
+ .priority = ADDRCONF_NOTIFY_PRIORITY,
};
-static int addrconf_ifdown(struct net_device *dev, int how)
+static void addrconf_type_change(struct net_device *dev, unsigned long event)
{
struct inet6_dev *idev;
- struct inet6_ifaddr *ifa, **bifa;
- int i;
-
ASSERT_RTNL();
- if (dev == &loopback_dev && how == 1)
- how = 0;
+ idev = __in6_dev_get(dev);
+
+ if (event == NETDEV_POST_TYPE_CHANGE)
+ ipv6_mc_remap(idev);
+ else if (event == NETDEV_PRE_TYPE_CHANGE)
+ ipv6_mc_unmap(idev);
+}
+
+static bool addr_is_local(const struct in6_addr *addr)
+{
+ return ipv6_addr_type(addr) &
+ (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
+}
+
+static int addrconf_ifdown(struct net_device *dev, bool unregister)
+{
+ unsigned long event = unregister ? NETDEV_UNREGISTER : NETDEV_DOWN;
+ struct net *net = dev_net(dev);
+ struct inet6_dev *idev;
+ struct inet6_ifaddr *ifa;
+ LIST_HEAD(tmp_addr_list);
+ bool keep_addr = false;
+ bool was_ready;
+ int state, i;
+
+ ASSERT_RTNL();
- rt6_ifdown(dev);
- neigh_ifdown(&nd_tbl, dev);
+ rt6_disable_ip(dev, event);
idev = __in6_dev_get(dev);
- if (idev == NULL)
+ if (!idev)
return -ENODEV;
- /* Step 1: remove reference to ipv6 device from parent device.
- Do not dev_put!
+ /*
+ * Step 1: remove reference to ipv6 device from parent device.
+ * Do not dev_put!
*/
- if (how == 1) {
- idev->dead = 1;
+ if (unregister) {
+ WRITE_ONCE(idev->dead, 1);
/* protected by rtnl_lock */
- rcu_assign_pointer(dev->ip6_ptr, NULL);
+ RCU_INIT_POINTER(dev->ip6_ptr, NULL);
/* Step 1.5: remove snmp6 entry */
snmp6_unregister_dev(idev);
}
+ /* combine the user config with event to determine if permanent
+ * addresses are to be removed from address hash table
+ */
+ if (!unregister && !idev->cnf.disable_ipv6) {
+ /* aggregate the system setting and interface setting */
+ int _keep_addr = READ_ONCE(net->ipv6.devconf_all->keep_addr_on_down);
+
+ if (!_keep_addr)
+ _keep_addr = READ_ONCE(idev->cnf.keep_addr_on_down);
+
+ keep_addr = (_keep_addr > 0);
+ }
+
/* Step 2: clear hash table */
- for (i=0; i<IN6_ADDR_HSIZE; i++) {
- bifa = &inet6_addr_lst[i];
+ for (i = 0; i < IN6_ADDR_HSIZE; i++) {
+ struct hlist_head *h = &net->ipv6.inet6_addr_lst[i];
- write_lock_bh(&addrconf_hash_lock);
- while ((ifa = *bifa) != NULL) {
+ spin_lock_bh(&net->ipv6.addrconf_hash_lock);
+restart:
+ hlist_for_each_entry_rcu(ifa, h, addr_lst) {
if (ifa->idev == idev) {
- *bifa = ifa->lst_next;
- ifa->lst_next = NULL;
- addrconf_del_timer(ifa);
- in6_ifa_put(ifa);
- continue;
+ addrconf_del_dad_work(ifa);
+ /* combined flag + permanent flag decide if
+ * address is retained on a down event
+ */
+ if (!keep_addr ||
+ !(ifa->flags & IFA_F_PERMANENT) ||
+ addr_is_local(&ifa->addr)) {
+ hlist_del_init_rcu(&ifa->addr_lst);
+ goto restart;
+ }
}
- bifa = &ifa->lst_next;
}
- write_unlock_bh(&addrconf_hash_lock);
+ spin_unlock_bh(&net->ipv6.addrconf_hash_lock);
}
write_lock_bh(&idev->lock);
- /* Step 3: clear flags for stateless addrconf */
- if (how != 1)
- idev->if_flags &= ~(IF_RS_SENT|IF_RA_RCVD|IF_READY);
+ addrconf_del_rs_timer(idev);
- /* Step 4: clear address list */
-#ifdef CONFIG_IPV6_PRIVACY
- if (how == 1 && del_timer(&idev->regen_timer))
- in6_dev_put(idev);
+ /* Step 2: clear flags for stateless addrconf, repeated down
+ * detection
+ */
+ was_ready = idev->if_flags & IF_READY;
+ if (!unregister)
+ idev->if_flags &= ~(IF_RS_SENT|IF_RA_RCVD|IF_READY);
- /* clear tempaddr list */
- while ((ifa = idev->tempaddr_list) != NULL) {
- idev->tempaddr_list = ifa->tmp_next;
- ifa->tmp_next = NULL;
- ifa->dead = 1;
+ /* Step 3: clear tempaddr list */
+ while (!list_empty(&idev->tempaddr_list)) {
+ ifa = list_first_entry(&idev->tempaddr_list,
+ struct inet6_ifaddr, tmp_list);
+ list_del(&ifa->tmp_list);
write_unlock_bh(&idev->lock);
spin_lock_bh(&ifa->lock);
@@ -2430,39 +3934,76 @@ static int addrconf_ifdown(struct net_device *dev, int how)
in6_ifa_put(ifa);
write_lock_bh(&idev->lock);
}
-#endif
- while ((ifa = idev->addr_list) != NULL) {
- idev->addr_list = ifa->if_next;
- ifa->if_next = NULL;
- ifa->dead = 1;
- addrconf_del_timer(ifa);
- write_unlock_bh(&idev->lock);
- __ipv6_ifa_notify(RTM_DELADDR, ifa);
- in6_ifa_put(ifa);
-
- write_lock_bh(&idev->lock);
- }
+ list_for_each_entry(ifa, &idev->addr_list, if_list)
+ list_add_tail(&ifa->if_list_aux, &tmp_addr_list);
write_unlock_bh(&idev->lock);
- /* Step 5: Discard multicast list */
+ while (!list_empty(&tmp_addr_list)) {
+ struct fib6_info *rt = NULL;
+ bool keep;
+
+ ifa = list_first_entry(&tmp_addr_list,
+ struct inet6_ifaddr, if_list_aux);
+ list_del(&ifa->if_list_aux);
+
+ addrconf_del_dad_work(ifa);
+
+ keep = keep_addr && (ifa->flags & IFA_F_PERMANENT) &&
+ !addr_is_local(&ifa->addr);
+
+ spin_lock_bh(&ifa->lock);
+
+ if (keep) {
+ /* set state to skip the notifier below */
+ state = INET6_IFADDR_STATE_DEAD;
+ ifa->state = INET6_IFADDR_STATE_PREDAD;
+ if (!(ifa->flags & IFA_F_NODAD))
+ ifa->flags |= IFA_F_TENTATIVE;
+
+ rt = ifa->rt;
+ ifa->rt = NULL;
+ } else {
+ state = ifa->state;
+ ifa->state = INET6_IFADDR_STATE_DEAD;
+ }
+
+ spin_unlock_bh(&ifa->lock);
+
+ if (rt)
+ ip6_del_rt(net, rt, false);
- if (how == 1)
+ if (state != INET6_IFADDR_STATE_DEAD) {
+ __ipv6_ifa_notify(RTM_DELADDR, ifa);
+ inet6addr_notifier_call_chain(NETDEV_DOWN, ifa);
+ } else {
+ if (idev->cnf.forwarding)
+ addrconf_leave_anycast(ifa);
+ addrconf_leave_solict(ifa->idev, &ifa->addr);
+ }
+
+ if (!keep) {
+ write_lock_bh(&idev->lock);
+ list_del_rcu(&ifa->if_list);
+ write_unlock_bh(&idev->lock);
+ in6_ifa_put(ifa);
+ }
+ }
+
+ /* Step 5: Discard anycast and multicast list */
+ if (unregister) {
+ ipv6_ac_destroy_dev(idev);
ipv6_mc_destroy_dev(idev);
- else
+ } else if (was_ready) {
ipv6_mc_down(idev);
+ }
- /* Step 5: netlink notification of this interface */
- idev->tstamp = jiffies;
- inet6_ifinfo_notify(RTM_DELLINK, idev);
-
- /* Shot the device (if unregistered) */
+ WRITE_ONCE(idev->tstamp, jiffies);
+ idev->ra_mtu = 0;
- if (how == 1) {
-#ifdef CONFIG_SYSCTL
- addrconf_sysctl_unregister(&idev->cnf);
- neigh_sysctl_unregister(idev->nd_parms);
-#endif
+ /* Last: Shot the device (if unregistered) */
+ if (unregister) {
+ addrconf_sysctl_unregister(idev);
neigh_parms_release(&nd_tbl, idev->nd_parms);
neigh_ifdown(&nd_tbl, dev);
in6_dev_put(idev);
@@ -2470,47 +4011,55 @@ static int addrconf_ifdown(struct net_device *dev, int how)
return 0;
}
-static void addrconf_rs_timer(unsigned long data)
+static void addrconf_rs_timer(struct timer_list *t)
{
- struct inet6_ifaddr *ifp = (struct inet6_ifaddr *) data;
+ struct inet6_dev *idev = timer_container_of(idev, t, rs_timer);
+ struct net_device *dev = idev->dev;
+ struct in6_addr lladdr;
+ int rtr_solicits;
- if (ifp->idev->cnf.forwarding)
+ write_lock(&idev->lock);
+ if (idev->dead || !(idev->if_flags & IF_READY))
goto out;
- if (ifp->idev->if_flags & IF_RA_RCVD) {
- /*
- * Announcement received after solicitation
- * was sent
- */
+ if (!ipv6_accept_ra(idev))
goto out;
- }
- spin_lock(&ifp->lock);
- if (ifp->probes++ < ifp->idev->cnf.rtr_solicits) {
- struct in6_addr all_routers;
+ /* Announcement received after solicitation was sent */
+ if (idev->if_flags & IF_RA_RCVD)
+ goto out;
- /* The wait after the last probe can be shorter */
- addrconf_mod_timer(ifp, AC_RS,
- (ifp->probes == ifp->idev->cnf.rtr_solicits) ?
- ifp->idev->cnf.rtr_solicit_delay :
- ifp->idev->cnf.rtr_solicit_interval);
- spin_unlock(&ifp->lock);
+ rtr_solicits = READ_ONCE(idev->cnf.rtr_solicits);
- ipv6_addr_all_routers(&all_routers);
+ if (idev->rs_probes++ < rtr_solicits || rtr_solicits < 0) {
+ write_unlock(&idev->lock);
+ if (!ipv6_get_lladdr(dev, &lladdr, IFA_F_TENTATIVE))
+ ndisc_send_rs(dev, &lladdr,
+ &in6addr_linklocal_allrouters);
+ else
+ goto put;
- ndisc_send_rs(ifp->idev->dev, &ifp->addr, &all_routers);
+ write_lock(&idev->lock);
+ idev->rs_interval = rfc3315_s14_backoff_update(
+ idev->rs_interval,
+ READ_ONCE(idev->cnf.rtr_solicit_max_interval));
+ /* The wait after the last probe can be shorter */
+ addrconf_mod_rs_timer(idev, (idev->rs_probes ==
+ READ_ONCE(idev->cnf.rtr_solicits)) ?
+ READ_ONCE(idev->cnf.rtr_solicit_delay) :
+ idev->rs_interval);
} else {
- spin_unlock(&ifp->lock);
/*
* Note: we do not support deprecated "all on-link"
* assumption any longer.
*/
- printk(KERN_DEBUG "%s: no IPv6 routers present\n",
- ifp->idev->dev->name);
+ pr_debug("%s: no IPv6 routers present\n", idev->dev->name);
}
out:
- in6_ifa_put(ifp);
+ write_unlock(&idev->lock);
+put:
+ in6_dev_put(idev);
}
/*
@@ -2518,104 +4067,251 @@ out:
*/
static void addrconf_dad_kick(struct inet6_ifaddr *ifp)
{
- unsigned long rand_num;
struct inet6_dev *idev = ifp->idev;
+ unsigned long rand_num;
+ u64 nonce;
- rand_num = net_random() % (idev->cnf.rtr_solicit_delay ? : 1);
- ifp->probes = idev->cnf.dad_transmits;
- addrconf_mod_timer(ifp, AC_DAD, rand_num);
+ if (ifp->flags & IFA_F_OPTIMISTIC)
+ rand_num = 0;
+ else
+ rand_num = get_random_u32_below(
+ READ_ONCE(idev->cnf.rtr_solicit_delay) ? : 1);
+
+ nonce = 0;
+ if (READ_ONCE(idev->cnf.enhanced_dad) ||
+ READ_ONCE(dev_net(idev->dev)->ipv6.devconf_all->enhanced_dad)) {
+ do
+ get_random_bytes(&nonce, 6);
+ while (nonce == 0);
+ }
+ ifp->dad_nonce = nonce;
+ ifp->dad_probes = READ_ONCE(idev->cnf.dad_transmits);
+ addrconf_mod_dad_work(ifp, rand_num);
}
-static void addrconf_dad_start(struct inet6_ifaddr *ifp, u32 flags)
+static void addrconf_dad_begin(struct inet6_ifaddr *ifp)
{
struct inet6_dev *idev = ifp->idev;
struct net_device *dev = idev->dev;
+ bool bump_id, notify = false;
+ struct net *net;
addrconf_join_solict(dev, &ifp->addr);
- if (ifp->prefix_len != 128 && (ifp->flags&IFA_F_PERMANENT))
- addrconf_prefix_route(&ifp->addr, ifp->prefix_len, dev, 0,
- flags);
-
- net_srandom(ifp->addr.s6_addr32[3]);
-
read_lock_bh(&idev->lock);
- if (ifp->dead)
+ spin_lock(&ifp->lock);
+ if (ifp->state == INET6_IFADDR_STATE_DEAD)
goto out;
- spin_lock_bh(&ifp->lock);
+ net = dev_net(dev);
if (dev->flags&(IFF_NOARP|IFF_LOOPBACK) ||
+ (READ_ONCE(net->ipv6.devconf_all->accept_dad) < 1 &&
+ READ_ONCE(idev->cnf.accept_dad) < 1) ||
!(ifp->flags&IFA_F_TENTATIVE) ||
ifp->flags & IFA_F_NODAD) {
- ifp->flags &= ~IFA_F_TENTATIVE;
- spin_unlock_bh(&ifp->lock);
+ bool send_na = false;
+
+ if (ifp->flags & IFA_F_TENTATIVE &&
+ !(ifp->flags & IFA_F_OPTIMISTIC))
+ send_na = true;
+ bump_id = ifp->flags & IFA_F_TENTATIVE;
+ ifp->flags &= ~(IFA_F_TENTATIVE|IFA_F_OPTIMISTIC|IFA_F_DADFAILED);
+ spin_unlock(&ifp->lock);
read_unlock_bh(&idev->lock);
- addrconf_dad_completed(ifp);
+ addrconf_dad_completed(ifp, bump_id, send_na);
return;
}
if (!(idev->if_flags & IF_READY)) {
- spin_unlock_bh(&ifp->lock);
+ spin_unlock(&ifp->lock);
read_unlock_bh(&idev->lock);
/*
- * If the defice is not ready:
+ * If the device is not ready:
* - keep it tentative if it is a permanent address.
* - otherwise, kill it.
*/
in6_ifa_hold(ifp);
- addrconf_dad_stop(ifp);
+ addrconf_dad_stop(ifp, 0);
return;
}
+
+ /*
+ * Optimistic nodes can start receiving
+ * Frames right away
+ */
+ if (ifp->flags & IFA_F_OPTIMISTIC) {
+ ip6_ins_rt(net, ifp->rt);
+ if (ipv6_use_optimistic_addr(net, idev)) {
+ /* Because optimistic nodes can use this address,
+ * notify listeners. If DAD fails, RTM_DELADDR is sent.
+ */
+ notify = true;
+ }
+ }
+
addrconf_dad_kick(ifp);
- spin_unlock_bh(&ifp->lock);
out:
+ spin_unlock(&ifp->lock);
read_unlock_bh(&idev->lock);
+ if (notify)
+ ipv6_ifa_notify(RTM_NEWADDR, ifp);
}
-static void addrconf_dad_timer(unsigned long data)
+static void addrconf_dad_start(struct inet6_ifaddr *ifp)
{
- struct inet6_ifaddr *ifp = (struct inet6_ifaddr *) data;
+ bool begin_dad = false;
+
+ spin_lock_bh(&ifp->lock);
+ if (ifp->state != INET6_IFADDR_STATE_DEAD) {
+ ifp->state = INET6_IFADDR_STATE_PREDAD;
+ begin_dad = true;
+ }
+ spin_unlock_bh(&ifp->lock);
+
+ if (begin_dad)
+ addrconf_mod_dad_work(ifp, 0);
+}
+
+static void addrconf_dad_work(struct work_struct *w)
+{
+ struct inet6_ifaddr *ifp = container_of(to_delayed_work(w),
+ struct inet6_ifaddr,
+ dad_work);
struct inet6_dev *idev = ifp->idev;
- struct in6_addr unspec;
+ bool bump_id, disable_ipv6 = false;
struct in6_addr mcaddr;
+ struct net *net;
- read_lock_bh(&idev->lock);
- if (idev->dead) {
- read_unlock_bh(&idev->lock);
+ enum {
+ DAD_PROCESS,
+ DAD_BEGIN,
+ DAD_ABORT,
+ } action = DAD_PROCESS;
+
+ net = dev_net(idev->dev);
+
+ rtnl_net_lock(net);
+
+ spin_lock_bh(&ifp->lock);
+ if (ifp->state == INET6_IFADDR_STATE_PREDAD) {
+ action = DAD_BEGIN;
+ ifp->state = INET6_IFADDR_STATE_DAD;
+ } else if (ifp->state == INET6_IFADDR_STATE_ERRDAD) {
+ action = DAD_ABORT;
+ ifp->state = INET6_IFADDR_STATE_POSTDAD;
+
+ if ((READ_ONCE(net->ipv6.devconf_all->accept_dad) > 1 ||
+ READ_ONCE(idev->cnf.accept_dad) > 1) &&
+ !idev->cnf.disable_ipv6 &&
+ !(ifp->flags & IFA_F_STABLE_PRIVACY)) {
+ struct in6_addr addr;
+
+ addr.s6_addr32[0] = htonl(0xfe800000);
+ addr.s6_addr32[1] = 0;
+
+ if (!ipv6_generate_eui64(addr.s6_addr + 8, idev->dev) &&
+ ipv6_addr_equal(&ifp->addr, &addr)) {
+ /* DAD failed for link-local based on MAC */
+ WRITE_ONCE(idev->cnf.disable_ipv6, 1);
+
+ pr_info("%s: IPv6 being disabled!\n",
+ ifp->idev->dev->name);
+ disable_ipv6 = true;
+ }
+ }
+ }
+ spin_unlock_bh(&ifp->lock);
+
+ if (action == DAD_BEGIN) {
+ addrconf_dad_begin(ifp);
+ goto out;
+ } else if (action == DAD_ABORT) {
+ in6_ifa_hold(ifp);
+ addrconf_dad_stop(ifp, 1);
+ if (disable_ipv6)
+ addrconf_ifdown(idev->dev, false);
goto out;
}
- spin_lock_bh(&ifp->lock);
- if (ifp->probes == 0) {
+
+ if (!ifp->dad_probes && addrconf_dad_end(ifp))
+ goto out;
+
+ write_lock_bh(&idev->lock);
+ if (idev->dead || !(idev->if_flags & IF_READY)) {
+ write_unlock_bh(&idev->lock);
+ goto out;
+ }
+
+ spin_lock(&ifp->lock);
+ if (ifp->state == INET6_IFADDR_STATE_DEAD) {
+ spin_unlock(&ifp->lock);
+ write_unlock_bh(&idev->lock);
+ goto out;
+ }
+
+ if (ifp->dad_probes == 0) {
+ bool send_na = false;
+
/*
* DAD was successful
*/
- ifp->flags &= ~IFA_F_TENTATIVE;
- spin_unlock_bh(&ifp->lock);
- read_unlock_bh(&idev->lock);
+ if (ifp->flags & IFA_F_TENTATIVE &&
+ !(ifp->flags & IFA_F_OPTIMISTIC))
+ send_na = true;
+ bump_id = ifp->flags & IFA_F_TENTATIVE;
+ ifp->flags &= ~(IFA_F_TENTATIVE|IFA_F_OPTIMISTIC|IFA_F_DADFAILED);
+ spin_unlock(&ifp->lock);
+ write_unlock_bh(&idev->lock);
- addrconf_dad_completed(ifp);
+ addrconf_dad_completed(ifp, bump_id, send_na);
goto out;
}
- ifp->probes--;
- addrconf_mod_timer(ifp, AC_DAD, ifp->idev->nd_parms->retrans_time);
- spin_unlock_bh(&ifp->lock);
- read_unlock_bh(&idev->lock);
+ ifp->dad_probes--;
+ addrconf_mod_dad_work(ifp,
+ max(NEIGH_VAR(ifp->idev->nd_parms, RETRANS_TIME),
+ HZ/100));
+ spin_unlock(&ifp->lock);
+ write_unlock_bh(&idev->lock);
/* send a neighbour solicitation for our addr */
- memset(&unspec, 0, sizeof(unspec));
addrconf_addr_solict_mult(&ifp->addr, &mcaddr);
- ndisc_send_ns(ifp->idev->dev, NULL, &ifp->addr, &mcaddr, &unspec);
+ ndisc_send_ns(ifp->idev->dev, &ifp->addr, &mcaddr, &in6addr_any,
+ ifp->dad_nonce);
out:
in6_ifa_put(ifp);
+ rtnl_net_unlock(net);
}
-static void addrconf_dad_completed(struct inet6_ifaddr *ifp)
+/* ifp->idev must be at least read locked */
+static bool ipv6_lonely_lladdr(struct inet6_ifaddr *ifp)
{
- struct net_device * dev = ifp->idev->dev;
+ struct inet6_ifaddr *ifpiter;
+ struct inet6_dev *idev = ifp->idev;
+
+ list_for_each_entry_reverse(ifpiter, &idev->addr_list, if_list) {
+ if (ifpiter->scope > IFA_LINK)
+ break;
+ if (ifp != ifpiter && ifpiter->scope == IFA_LINK &&
+ (ifpiter->flags & (IFA_F_PERMANENT|IFA_F_TENTATIVE|
+ IFA_F_OPTIMISTIC|IFA_F_DADFAILED)) ==
+ IFA_F_PERMANENT)
+ return false;
+ }
+ return true;
+}
+
+static void addrconf_dad_completed(struct inet6_ifaddr *ifp, bool bump_id,
+ bool send_na)
+{
+ struct net_device *dev = ifp->idev->dev;
+ struct in6_addr lladdr;
+ bool send_rs, send_mld;
+
+ addrconf_del_dad_work(ifp);
/*
* Configure the address for reception. Now it is valid.
@@ -2623,94 +4319,150 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp)
ipv6_ifa_notify(RTM_NEWADDR, ifp);
- /* If added prefix is link local and forwarding is off,
- start sending router solicitations.
+ /* If added prefix is link local and we are prepared to process
+ router advertisements, start sending router solicitations.
*/
- if (ifp->idev->cnf.forwarding == 0 &&
- ifp->idev->cnf.rtr_solicits > 0 &&
- (dev->flags&IFF_LOOPBACK) == 0 &&
- (ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL)) {
- struct in6_addr all_routers;
-
- ipv6_addr_all_routers(&all_routers);
+ read_lock_bh(&ifp->idev->lock);
+ send_mld = ifp->scope == IFA_LINK && ipv6_lonely_lladdr(ifp);
+ send_rs = send_mld &&
+ ipv6_accept_ra(ifp->idev) &&
+ READ_ONCE(ifp->idev->cnf.rtr_solicits) != 0 &&
+ (dev->flags & IFF_LOOPBACK) == 0 &&
+ (dev->type != ARPHRD_TUNNEL) &&
+ !netif_is_team_port(dev);
+ read_unlock_bh(&ifp->idev->lock);
+
+ /* While dad is in progress mld report's source address is in6_addrany.
+ * Resend with proper ll now.
+ */
+ if (send_mld)
+ ipv6_mc_dad_complete(ifp->idev);
+
+ /* send unsolicited NA if enabled */
+ if (send_na &&
+ (READ_ONCE(ifp->idev->cnf.ndisc_notify) ||
+ READ_ONCE(dev_net(dev)->ipv6.devconf_all->ndisc_notify))) {
+ ndisc_send_na(dev, &in6addr_linklocal_allnodes, &ifp->addr,
+ /*router=*/ !!ifp->idev->cnf.forwarding,
+ /*solicited=*/ false, /*override=*/ true,
+ /*inc_opt=*/ true);
+ }
+ if (send_rs) {
/*
* If a host as already performed a random delay
* [...] as part of DAD [...] there is no need
* to delay again before sending the first RS
*/
- ndisc_send_rs(ifp->idev->dev, &ifp->addr, &all_routers);
+ if (ipv6_get_lladdr(dev, &lladdr, IFA_F_TENTATIVE))
+ return;
+ ndisc_send_rs(dev, &lladdr, &in6addr_linklocal_allrouters);
- spin_lock_bh(&ifp->lock);
- ifp->probes = 1;
+ write_lock_bh(&ifp->idev->lock);
+ spin_lock(&ifp->lock);
+ ifp->idev->rs_interval = rfc3315_s14_backoff_init(
+ READ_ONCE(ifp->idev->cnf.rtr_solicit_interval));
+ ifp->idev->rs_probes = 1;
ifp->idev->if_flags |= IF_RS_SENT;
- addrconf_mod_timer(ifp, AC_RS, ifp->idev->cnf.rtr_solicit_interval);
- spin_unlock_bh(&ifp->lock);
+ addrconf_mod_rs_timer(ifp->idev, ifp->idev->rs_interval);
+ spin_unlock(&ifp->lock);
+ write_unlock_bh(&ifp->idev->lock);
}
+
+ if (bump_id)
+ rt_genid_bump_ipv6(dev_net(dev));
+
+ /* Make sure that a new temporary address will be created
+ * before this temporary address becomes deprecated.
+ */
+ if (ifp->flags & IFA_F_TEMPORARY)
+ addrconf_verify_rtnl(dev_net(dev));
}
-static void addrconf_dad_run(struct inet6_dev *idev) {
+static void addrconf_dad_run(struct inet6_dev *idev, bool restart)
+{
struct inet6_ifaddr *ifp;
read_lock_bh(&idev->lock);
- for (ifp = idev->addr_list; ifp; ifp = ifp->if_next) {
- spin_lock_bh(&ifp->lock);
- if (!(ifp->flags & IFA_F_TENTATIVE)) {
- spin_unlock_bh(&ifp->lock);
- continue;
+ list_for_each_entry(ifp, &idev->addr_list, if_list) {
+ spin_lock(&ifp->lock);
+ if ((ifp->flags & IFA_F_TENTATIVE &&
+ ifp->state == INET6_IFADDR_STATE_DAD) || restart) {
+ if (restart)
+ ifp->state = INET6_IFADDR_STATE_PREDAD;
+ addrconf_dad_kick(ifp);
}
- spin_unlock_bh(&ifp->lock);
- addrconf_dad_kick(ifp);
+ spin_unlock(&ifp->lock);
}
read_unlock_bh(&idev->lock);
}
#ifdef CONFIG_PROC_FS
struct if6_iter_state {
+ struct seq_net_private p;
int bucket;
+ int offset;
};
-static struct inet6_ifaddr *if6_get_first(struct seq_file *seq)
+static struct inet6_ifaddr *if6_get_first(struct seq_file *seq, loff_t pos)
{
- struct inet6_ifaddr *ifa = NULL;
struct if6_iter_state *state = seq->private;
+ struct net *net = seq_file_net(seq);
+ struct inet6_ifaddr *ifa = NULL;
+ int p = 0;
- for (state->bucket = 0; state->bucket < IN6_ADDR_HSIZE; ++state->bucket) {
- ifa = inet6_addr_lst[state->bucket];
- if (ifa)
- break;
+ /* initial bucket if pos is 0 */
+ if (pos == 0) {
+ state->bucket = 0;
+ state->offset = 0;
}
- return ifa;
+
+ for (; state->bucket < IN6_ADDR_HSIZE; ++state->bucket) {
+ hlist_for_each_entry_rcu(ifa, &net->ipv6.inet6_addr_lst[state->bucket],
+ addr_lst) {
+ /* sync with offset */
+ if (p < state->offset) {
+ p++;
+ continue;
+ }
+ return ifa;
+ }
+
+ /* prepare for next bucket */
+ state->offset = 0;
+ p = 0;
+ }
+ return NULL;
}
-static struct inet6_ifaddr *if6_get_next(struct seq_file *seq, struct inet6_ifaddr *ifa)
+static struct inet6_ifaddr *if6_get_next(struct seq_file *seq,
+ struct inet6_ifaddr *ifa)
{
struct if6_iter_state *state = seq->private;
+ struct net *net = seq_file_net(seq);
- ifa = ifa->lst_next;
-try_again:
- if (!ifa && ++state->bucket < IN6_ADDR_HSIZE) {
- ifa = inet6_addr_lst[state->bucket];
- goto try_again;
+ hlist_for_each_entry_continue_rcu(ifa, addr_lst) {
+ state->offset++;
+ return ifa;
}
- return ifa;
-}
-static struct inet6_ifaddr *if6_get_idx(struct seq_file *seq, loff_t pos)
-{
- struct inet6_ifaddr *ifa = if6_get_first(seq);
+ state->offset = 0;
+ while (++state->bucket < IN6_ADDR_HSIZE) {
+ hlist_for_each_entry_rcu(ifa,
+ &net->ipv6.inet6_addr_lst[state->bucket], addr_lst) {
+ return ifa;
+ }
+ }
- if (ifa)
- while(pos && (ifa = if6_get_next(seq, ifa)) != NULL)
- --pos;
- return pos ? NULL : ifa;
+ return NULL;
}
static void *if6_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(rcu)
{
- read_lock_bh(&addrconf_hash_lock);
- return if6_get_idx(seq, *pos);
+ rcu_read_lock();
+ return if6_get_first(seq, *pos);
}
static void *if6_seq_next(struct seq_file *seq, void *v, loff_t *pos)
@@ -2723,144 +4475,211 @@ static void *if6_seq_next(struct seq_file *seq, void *v, loff_t *pos)
}
static void if6_seq_stop(struct seq_file *seq, void *v)
+ __releases(rcu)
{
- read_unlock_bh(&addrconf_hash_lock);
+ rcu_read_unlock();
}
static int if6_seq_show(struct seq_file *seq, void *v)
{
struct inet6_ifaddr *ifp = (struct inet6_ifaddr *)v;
- seq_printf(seq,
- NIP6_SEQFMT " %02x %02x %02x %02x %8s\n",
- NIP6(ifp->addr),
+ seq_printf(seq, "%pi6 %02x %02x %02x %02x %8s\n",
+ &ifp->addr,
ifp->idev->dev->ifindex,
ifp->prefix_len,
ifp->scope,
- ifp->flags,
+ (u8) ifp->flags,
ifp->idev->dev->name);
return 0;
}
-static struct seq_operations if6_seq_ops = {
+static const struct seq_operations if6_seq_ops = {
.start = if6_seq_start,
.next = if6_seq_next,
.show = if6_seq_show,
.stop = if6_seq_stop,
};
-static int if6_seq_open(struct inode *inode, struct file *file)
+static int __net_init if6_proc_net_init(struct net *net)
{
- struct seq_file *seq;
- int rc = -ENOMEM;
- struct if6_iter_state *s = kzalloc(sizeof(*s), GFP_KERNEL);
-
- if (!s)
- goto out;
+ if (!proc_create_net("if_inet6", 0444, net->proc_net, &if6_seq_ops,
+ sizeof(struct if6_iter_state)))
+ return -ENOMEM;
+ return 0;
+}
- rc = seq_open(file, &if6_seq_ops);
- if (rc)
- goto out_kfree;
+static void __net_exit if6_proc_net_exit(struct net *net)
+{
+ remove_proc_entry("if_inet6", net->proc_net);
+}
- seq = file->private_data;
- seq->private = s;
-out:
- return rc;
-out_kfree:
- kfree(s);
- goto out;
-}
-
-static struct file_operations if6_fops = {
- .owner = THIS_MODULE,
- .open = if6_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release_private,
+static struct pernet_operations if6_proc_net_ops = {
+ .init = if6_proc_net_init,
+ .exit = if6_proc_net_exit,
};
int __init if6_proc_init(void)
{
- if (!proc_net_fops_create("if_inet6", S_IRUGO, &if6_fops))
- return -ENOMEM;
- return 0;
+ return register_pernet_subsys(&if6_proc_net_ops);
}
void if6_proc_exit(void)
{
- proc_net_remove("if_inet6");
+ unregister_pernet_subsys(&if6_proc_net_ops);
}
#endif /* CONFIG_PROC_FS */
-#ifdef CONFIG_IPV6_MIP6
+#if IS_ENABLED(CONFIG_IPV6_MIP6)
/* Check if address is a home address configured on any interface. */
-int ipv6_chk_home_addr(struct in6_addr *addr)
+int ipv6_chk_home_addr(struct net *net, const struct in6_addr *addr)
{
+ unsigned int hash = inet6_addr_hash(net, addr);
+ struct inet6_ifaddr *ifp = NULL;
int ret = 0;
- struct inet6_ifaddr * ifp;
- u8 hash = ipv6_addr_hash(addr);
- read_lock_bh(&addrconf_hash_lock);
- for (ifp = inet6_addr_lst[hash]; ifp; ifp = ifp->lst_next) {
- if (ipv6_addr_cmp(&ifp->addr, addr) == 0 &&
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(ifp, &net->ipv6.inet6_addr_lst[hash], addr_lst) {
+ if (ipv6_addr_equal(&ifp->addr, addr) &&
(ifp->flags & IFA_F_HOMEADDRESS)) {
ret = 1;
break;
}
}
- read_unlock_bh(&addrconf_hash_lock);
+ rcu_read_unlock();
return ret;
}
#endif
+/* RFC6554 has some algorithm to avoid loops in segment routing by
+ * checking if the segments contains any of a local interface address.
+ *
+ * Quote:
+ *
+ * To detect loops in the SRH, a router MUST determine if the SRH
+ * includes multiple addresses assigned to any interface on that router.
+ * If such addresses appear more than once and are separated by at least
+ * one address not assigned to that router.
+ */
+int ipv6_chk_rpl_srh_loop(struct net *net, const struct in6_addr *segs,
+ unsigned char nsegs)
+{
+ const struct in6_addr *addr;
+ int i, ret = 0, found = 0;
+ struct inet6_ifaddr *ifp;
+ bool separated = false;
+ unsigned int hash;
+ bool hash_found;
+
+ rcu_read_lock();
+ for (i = 0; i < nsegs; i++) {
+ addr = &segs[i];
+ hash = inet6_addr_hash(net, addr);
+
+ hash_found = false;
+ hlist_for_each_entry_rcu(ifp, &net->ipv6.inet6_addr_lst[hash], addr_lst) {
+
+ if (ipv6_addr_equal(&ifp->addr, addr)) {
+ hash_found = true;
+ break;
+ }
+ }
+
+ if (hash_found) {
+ if (found > 1 && separated) {
+ ret = 1;
+ break;
+ }
+
+ separated = false;
+ found++;
+ } else {
+ separated = true;
+ }
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+
/*
* Periodic address status verification
*/
-static void addrconf_verify(unsigned long foo)
+static void addrconf_verify_rtnl(struct net *net)
{
+ unsigned long now, next, next_sec, next_sched;
struct inet6_ifaddr *ifp;
- unsigned long now, next;
int i;
- spin_lock_bh(&addrconf_verify_lock);
- now = jiffies;
- next = now + ADDR_CHECK_FREQUENCY;
+ ASSERT_RTNL();
- del_timer(&addr_chk_timer);
+ rcu_read_lock_bh();
+ now = jiffies;
+ next = round_jiffies_up(now + ADDR_CHECK_FREQUENCY);
- for (i=0; i < IN6_ADDR_HSIZE; i++) {
+ cancel_delayed_work(&net->ipv6.addr_chk_work);
+ for (i = 0; i < IN6_ADDR_HSIZE; i++) {
restart:
- read_lock(&addrconf_hash_lock);
- for (ifp=inet6_addr_lst[i]; ifp; ifp=ifp->lst_next) {
+ hlist_for_each_entry_rcu_bh(ifp, &net->ipv6.inet6_addr_lst[i], addr_lst) {
unsigned long age;
-#ifdef CONFIG_IPV6_PRIVACY
- unsigned long regen_advance;
-#endif
- if (ifp->flags & IFA_F_PERMANENT)
+ /* When setting preferred_lft to a value not zero or
+ * infinity, while valid_lft is infinity
+ * IFA_F_PERMANENT has a non-infinity life time.
+ */
+ if ((ifp->flags & IFA_F_PERMANENT) &&
+ (ifp->prefered_lft == INFINITY_LIFE_TIME))
continue;
spin_lock(&ifp->lock);
- age = (now - ifp->tstamp) / HZ;
+ /* We try to batch several events at once. */
+ age = (now - ifp->tstamp + ADDRCONF_TIMER_FUZZ_MINUS) / HZ;
-#ifdef CONFIG_IPV6_PRIVACY
- regen_advance = ifp->idev->cnf.regen_max_retry *
- ifp->idev->cnf.dad_transmits *
- ifp->idev->nd_parms->retrans_time / HZ;
-#endif
+ if ((ifp->flags&IFA_F_TEMPORARY) &&
+ !(ifp->flags&IFA_F_TENTATIVE) &&
+ ifp->prefered_lft != INFINITY_LIFE_TIME &&
+ !ifp->regen_count && ifp->ifpub) {
+ /* This is a non-regenerated temporary addr. */
+
+ unsigned long regen_advance = ipv6_get_regen_advance(ifp->idev);
+
+ if (age + regen_advance >= ifp->prefered_lft) {
+ struct inet6_ifaddr *ifpub = ifp->ifpub;
+ if (time_before(ifp->tstamp + ifp->prefered_lft * HZ, next))
+ next = ifp->tstamp + ifp->prefered_lft * HZ;
+
+ ifp->regen_count++;
+ in6_ifa_hold(ifp);
+ in6_ifa_hold(ifpub);
+ spin_unlock(&ifp->lock);
+
+ spin_lock(&ifpub->lock);
+ ifpub->regen_count = 0;
+ spin_unlock(&ifpub->lock);
+ rcu_read_unlock_bh();
+ ipv6_create_tempaddr(ifpub, true);
+ in6_ifa_put(ifpub);
+ in6_ifa_put(ifp);
+ rcu_read_lock_bh();
+ goto restart;
+ } else if (time_before(ifp->tstamp + ifp->prefered_lft * HZ - regen_advance * HZ, next))
+ next = ifp->tstamp + ifp->prefered_lft * HZ - regen_advance * HZ;
+ }
if (ifp->valid_lft != INFINITY_LIFE_TIME &&
age >= ifp->valid_lft) {
spin_unlock(&ifp->lock);
in6_ifa_hold(ifp);
- read_unlock(&addrconf_hash_lock);
+ rcu_read_unlock_bh();
ipv6_del_addr(ifp);
+ rcu_read_lock_bh();
goto restart;
} else if (ifp->prefered_lft == INFINITY_LIFE_TIME) {
spin_unlock(&ifp->lock);
continue;
} else if (age >= ifp->prefered_lft) {
- /* jiffies - ifp->tsamp > age >= ifp->prefered_lft */
+ /* jiffies - ifp->tstamp > age >= ifp->prefered_lft */
int deprecate = 0;
if (!(ifp->flags&IFA_F_DEPRECATED)) {
@@ -2868,44 +4687,19 @@ restart:
ifp->flags |= IFA_F_DEPRECATED;
}
- if (time_before(ifp->tstamp + ifp->valid_lft * HZ, next))
+ if ((ifp->valid_lft != INFINITY_LIFE_TIME) &&
+ (time_before(ifp->tstamp + ifp->valid_lft * HZ, next)))
next = ifp->tstamp + ifp->valid_lft * HZ;
spin_unlock(&ifp->lock);
if (deprecate) {
in6_ifa_hold(ifp);
- read_unlock(&addrconf_hash_lock);
ipv6_ifa_notify(0, ifp);
in6_ifa_put(ifp);
goto restart;
}
-#ifdef CONFIG_IPV6_PRIVACY
- } else if ((ifp->flags&IFA_F_TEMPORARY) &&
- !(ifp->flags&IFA_F_TENTATIVE)) {
- if (age >= ifp->prefered_lft - regen_advance) {
- struct inet6_ifaddr *ifpub = ifp->ifpub;
- if (time_before(ifp->tstamp + ifp->prefered_lft * HZ, next))
- next = ifp->tstamp + ifp->prefered_lft * HZ;
- if (!ifp->regen_count && ifpub) {
- ifp->regen_count++;
- in6_ifa_hold(ifp);
- in6_ifa_hold(ifpub);
- spin_unlock(&ifp->lock);
- read_unlock(&addrconf_hash_lock);
- spin_lock(&ifpub->lock);
- ifpub->regen_count = 0;
- spin_unlock(&ifpub->lock);
- ipv6_create_tempaddr(ifpub, ifp);
- in6_ifa_put(ifpub);
- in6_ifa_put(ifp);
- goto restart;
- }
- } else if (time_before(ifp->tstamp + ifp->prefered_lft * HZ - regen_advance * HZ, next))
- next = ifp->tstamp + ifp->prefered_lft * HZ - regen_advance * HZ;
- spin_unlock(&ifp->lock);
-#endif
} else {
/* ifp->prefered_lft <= ifp->valid_lft */
if (time_before(ifp->tstamp + ifp->prefered_lft * HZ, next))
@@ -2913,150 +4707,377 @@ restart:
spin_unlock(&ifp->lock);
}
}
- read_unlock(&addrconf_hash_lock);
}
- addr_chk_timer.expires = time_before(next, jiffies + HZ) ? jiffies + HZ : next;
- add_timer(&addr_chk_timer);
- spin_unlock_bh(&addrconf_verify_lock);
+ next_sec = round_jiffies_up(next);
+ next_sched = next;
+
+ /* If rounded timeout is accurate enough, accept it. */
+ if (time_before(next_sec, next + ADDRCONF_TIMER_FUZZ))
+ next_sched = next_sec;
+
+ /* And minimum interval is ADDRCONF_TIMER_FUZZ_MAX. */
+ if (time_before(next_sched, jiffies + ADDRCONF_TIMER_FUZZ_MAX))
+ next_sched = jiffies + ADDRCONF_TIMER_FUZZ_MAX;
+
+ pr_debug("now = %lu, schedule = %lu, rounded schedule = %lu => %lu\n",
+ now, next, next_sec, next_sched);
+ mod_delayed_work(addrconf_wq, &net->ipv6.addr_chk_work, next_sched - now);
+ rcu_read_unlock_bh();
+}
+
+static void addrconf_verify_work(struct work_struct *w)
+{
+ struct net *net = container_of(to_delayed_work(w), struct net,
+ ipv6.addr_chk_work);
+
+ rtnl_net_lock(net);
+ addrconf_verify_rtnl(net);
+ rtnl_net_unlock(net);
+}
+
+static void addrconf_verify(struct net *net)
+{
+ mod_delayed_work(addrconf_wq, &net->ipv6.addr_chk_work, 0);
}
-static struct in6_addr *extract_addr(struct nlattr *addr, struct nlattr *local)
+static struct in6_addr *extract_addr(struct nlattr *addr, struct nlattr *local,
+ struct in6_addr **peer_pfx)
{
struct in6_addr *pfx = NULL;
+ *peer_pfx = NULL;
+
if (addr)
pfx = nla_data(addr);
if (local) {
if (pfx && nla_memcmp(local, pfx, sizeof(*pfx)))
- pfx = NULL;
- else
- pfx = nla_data(local);
+ *peer_pfx = pfx;
+ pfx = nla_data(local);
}
return pfx;
}
-static struct nla_policy ifa_ipv6_policy[IFA_MAX+1] __read_mostly = {
+static const struct nla_policy ifa_ipv6_policy[IFA_MAX+1] = {
[IFA_ADDRESS] = { .len = sizeof(struct in6_addr) },
[IFA_LOCAL] = { .len = sizeof(struct in6_addr) },
[IFA_CACHEINFO] = { .len = sizeof(struct ifa_cacheinfo) },
+ [IFA_FLAGS] = { .len = sizeof(u32) },
+ [IFA_RT_PRIORITY] = { .len = sizeof(u32) },
+ [IFA_TARGET_NETNSID] = { .type = NLA_S32 },
+ [IFA_PROTO] = { .type = NLA_U8 },
};
static int
-inet6_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+inet6_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
{
+ struct net *net = sock_net(skb->sk);
struct ifaddrmsg *ifm;
struct nlattr *tb[IFA_MAX+1];
- struct in6_addr *pfx;
+ struct in6_addr *pfx, *peer_pfx;
+ u32 ifa_flags;
int err;
- err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv6_policy);
+ err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX,
+ ifa_ipv6_policy, extack);
if (err < 0)
return err;
ifm = nlmsg_data(nlh);
- pfx = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL]);
- if (pfx == NULL)
+ pfx = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL], &peer_pfx);
+ if (!pfx)
return -EINVAL;
- return inet6_addr_del(ifm->ifa_index, pfx, ifm->ifa_prefixlen);
+ ifa_flags = nla_get_u32_default(tb[IFA_FLAGS], ifm->ifa_flags);
+
+ /* We ignore other flags so far. */
+ ifa_flags &= IFA_F_MANAGETEMPADDR;
+
+ rtnl_net_lock(net);
+ err = inet6_addr_del(net, ifm->ifa_index, ifa_flags, pfx,
+ ifm->ifa_prefixlen, extack);
+ rtnl_net_unlock(net);
+
+ return err;
+}
+
+static int modify_prefix_route(struct net *net, struct inet6_ifaddr *ifp,
+ unsigned long expires, u32 flags,
+ bool modify_peer)
+{
+ struct fib6_table *table;
+ struct fib6_info *f6i;
+ u32 prio;
+
+ f6i = addrconf_get_prefix_route(modify_peer ? &ifp->peer_addr : &ifp->addr,
+ ifp->prefix_len,
+ ifp->idev->dev, 0, RTF_DEFAULT, true);
+ if (!f6i)
+ return -ENOENT;
+
+ prio = ifp->rt_priority ? : IP6_RT_PRIO_ADDRCONF;
+ if (f6i->fib6_metric != prio) {
+ /* delete old one */
+ ip6_del_rt(dev_net(ifp->idev->dev), f6i, false);
+
+ /* add new one */
+ addrconf_prefix_route(modify_peer ? &ifp->peer_addr : &ifp->addr,
+ ifp->prefix_len,
+ ifp->rt_priority, ifp->idev->dev,
+ expires, flags, GFP_KERNEL);
+ return 0;
+ }
+ if (f6i != net->ipv6.fib6_null_entry) {
+ table = f6i->fib6_table;
+ spin_lock_bh(&table->tb6_lock);
+
+ if (!(flags & RTF_EXPIRES)) {
+ fib6_clean_expires(f6i);
+ fib6_remove_gc_list(f6i);
+ } else {
+ fib6_set_expires(f6i, expires);
+ fib6_add_gc_list(f6i);
+ }
+
+ spin_unlock_bh(&table->tb6_lock);
+ }
+ fib6_info_release(f6i);
+
+ return 0;
}
-static int inet6_addr_modify(struct inet6_ifaddr *ifp, u8 ifa_flags,
- u32 prefered_lft, u32 valid_lft)
+static int inet6_addr_modify(struct net *net, struct inet6_ifaddr *ifp,
+ struct ifa6_config *cfg, clock_t expires,
+ u32 flags)
{
- if (!valid_lft || (prefered_lft > valid_lft))
+ bool was_managetempaddr;
+ bool new_peer = false;
+ bool had_prefixroute;
+
+ ASSERT_RTNL_NET(net);
+
+ if (cfg->ifa_flags & IFA_F_MANAGETEMPADDR &&
+ (ifp->flags & IFA_F_TEMPORARY || ifp->prefix_len != 64))
return -EINVAL;
- if (valid_lft == INFINITY_LIFE_TIME)
- ifa_flags |= IFA_F_PERMANENT;
- else if (valid_lft >= 0x7FFFFFFF/HZ)
- valid_lft = 0x7FFFFFFF/HZ;
+ if (!(ifp->flags & IFA_F_TENTATIVE) || ifp->flags & IFA_F_DADFAILED)
+ cfg->ifa_flags &= ~IFA_F_OPTIMISTIC;
- if (prefered_lft == 0)
- ifa_flags |= IFA_F_DEPRECATED;
- else if ((prefered_lft >= 0x7FFFFFFF/HZ) &&
- (prefered_lft != INFINITY_LIFE_TIME))
- prefered_lft = 0x7FFFFFFF/HZ;
+ if (cfg->peer_pfx &&
+ memcmp(&ifp->peer_addr, cfg->peer_pfx, sizeof(struct in6_addr))) {
+ if (!ipv6_addr_any(&ifp->peer_addr))
+ cleanup_prefix_route(ifp, expires, true, true);
+ new_peer = true;
+ }
spin_lock_bh(&ifp->lock);
- ifp->flags = (ifp->flags & ~(IFA_F_DEPRECATED | IFA_F_PERMANENT | IFA_F_NODAD | IFA_F_HOMEADDRESS)) | ifa_flags;
- ifp->tstamp = jiffies;
- ifp->valid_lft = valid_lft;
- ifp->prefered_lft = prefered_lft;
+ was_managetempaddr = ifp->flags & IFA_F_MANAGETEMPADDR;
+ had_prefixroute = ifp->flags & IFA_F_PERMANENT &&
+ !(ifp->flags & IFA_F_NOPREFIXROUTE);
+ ifp->flags &= ~(IFA_F_DEPRECATED | IFA_F_PERMANENT | IFA_F_NODAD |
+ IFA_F_HOMEADDRESS | IFA_F_MANAGETEMPADDR |
+ IFA_F_NOPREFIXROUTE);
+ ifp->flags |= cfg->ifa_flags;
+ WRITE_ONCE(ifp->tstamp, jiffies);
+ WRITE_ONCE(ifp->valid_lft, cfg->valid_lft);
+ WRITE_ONCE(ifp->prefered_lft, cfg->preferred_lft);
+ WRITE_ONCE(ifp->ifa_proto, cfg->ifa_proto);
+
+ if (cfg->rt_priority && cfg->rt_priority != ifp->rt_priority)
+ WRITE_ONCE(ifp->rt_priority, cfg->rt_priority);
+
+ if (new_peer)
+ ifp->peer_addr = *cfg->peer_pfx;
spin_unlock_bh(&ifp->lock);
if (!(ifp->flags&IFA_F_TENTATIVE))
ipv6_ifa_notify(0, ifp);
- addrconf_verify(0);
+ if (!(cfg->ifa_flags & IFA_F_NOPREFIXROUTE)) {
+ int rc = -ENOENT;
+
+ if (had_prefixroute)
+ rc = modify_prefix_route(net, ifp, expires, flags, false);
+
+ /* prefix route could have been deleted; if so restore it */
+ if (rc == -ENOENT) {
+ addrconf_prefix_route(&ifp->addr, ifp->prefix_len,
+ ifp->rt_priority, ifp->idev->dev,
+ expires, flags, GFP_KERNEL);
+ }
+
+ if (had_prefixroute && !ipv6_addr_any(&ifp->peer_addr))
+ rc = modify_prefix_route(net, ifp, expires, flags, true);
+
+ if (rc == -ENOENT && !ipv6_addr_any(&ifp->peer_addr)) {
+ addrconf_prefix_route(&ifp->peer_addr, ifp->prefix_len,
+ ifp->rt_priority, ifp->idev->dev,
+ expires, flags, GFP_KERNEL);
+ }
+ } else if (had_prefixroute) {
+ enum cleanup_prefix_rt_t action;
+ unsigned long rt_expires;
+
+ write_lock_bh(&ifp->idev->lock);
+ action = check_cleanup_prefix_route(ifp, &rt_expires);
+ write_unlock_bh(&ifp->idev->lock);
+
+ if (action != CLEANUP_PREFIX_RT_NOP) {
+ cleanup_prefix_route(ifp, rt_expires,
+ action == CLEANUP_PREFIX_RT_DEL, false);
+ }
+ }
+
+ if (was_managetempaddr || ifp->flags & IFA_F_MANAGETEMPADDR) {
+ if (was_managetempaddr && !(ifp->flags & IFA_F_MANAGETEMPADDR))
+ delete_tempaddrs(ifp->idev, ifp);
+ else
+ manage_tempaddrs(ifp->idev, ifp, cfg->valid_lft,
+ cfg->preferred_lft, !was_managetempaddr,
+ jiffies);
+ }
+
+ addrconf_verify_rtnl(net);
return 0;
}
static int
-inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
{
- struct ifaddrmsg *ifm;
+ struct net *net = sock_net(skb->sk);
struct nlattr *tb[IFA_MAX+1];
- struct in6_addr *pfx;
+ struct in6_addr *peer_pfx;
struct inet6_ifaddr *ifa;
struct net_device *dev;
- u32 valid_lft = INFINITY_LIFE_TIME, preferred_lft = INFINITY_LIFE_TIME;
- u8 ifa_flags;
+ struct inet6_dev *idev;
+ struct ifa6_config cfg;
+ struct ifaddrmsg *ifm;
+ unsigned long timeout;
+ clock_t expires;
+ u32 flags;
int err;
- err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv6_policy);
+ err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX,
+ ifa_ipv6_policy, extack);
if (err < 0)
return err;
+ memset(&cfg, 0, sizeof(cfg));
+
ifm = nlmsg_data(nlh);
- pfx = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL]);
- if (pfx == NULL)
+ cfg.pfx = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL], &peer_pfx);
+ if (!cfg.pfx)
return -EINVAL;
+ cfg.peer_pfx = peer_pfx;
+ cfg.plen = ifm->ifa_prefixlen;
+ if (tb[IFA_RT_PRIORITY])
+ cfg.rt_priority = nla_get_u32(tb[IFA_RT_PRIORITY]);
+
+ if (tb[IFA_PROTO])
+ cfg.ifa_proto = nla_get_u8(tb[IFA_PROTO]);
+
+ cfg.ifa_flags = nla_get_u32_default(tb[IFA_FLAGS], ifm->ifa_flags);
+
+ /* We ignore other flags so far. */
+ cfg.ifa_flags &= IFA_F_NODAD | IFA_F_HOMEADDRESS |
+ IFA_F_MANAGETEMPADDR | IFA_F_NOPREFIXROUTE |
+ IFA_F_MCAUTOJOIN | IFA_F_OPTIMISTIC;
+
+ cfg.ifa_flags |= IFA_F_PERMANENT;
+ cfg.valid_lft = INFINITY_LIFE_TIME;
+ cfg.preferred_lft = INFINITY_LIFE_TIME;
+ expires = 0;
+ flags = 0;
+
if (tb[IFA_CACHEINFO]) {
struct ifa_cacheinfo *ci;
ci = nla_data(tb[IFA_CACHEINFO]);
- valid_lft = ci->ifa_valid;
- preferred_lft = ci->ifa_prefered;
- } else {
- preferred_lft = INFINITY_LIFE_TIME;
- valid_lft = INFINITY_LIFE_TIME;
+ cfg.valid_lft = ci->ifa_valid;
+ cfg.preferred_lft = ci->ifa_prefered;
+
+ if (!cfg.valid_lft || cfg.preferred_lft > cfg.valid_lft) {
+ NL_SET_ERR_MSG_MOD(extack, "address lifetime invalid");
+ return -EINVAL;
+ }
+
+ timeout = addrconf_timeout_fixup(cfg.valid_lft, HZ);
+ if (addrconf_finite_timeout(timeout)) {
+ cfg.ifa_flags &= ~IFA_F_PERMANENT;
+ cfg.valid_lft = timeout;
+ expires = jiffies_to_clock_t(timeout * HZ);
+ flags = RTF_EXPIRES;
+ }
+
+ timeout = addrconf_timeout_fixup(cfg.preferred_lft, HZ);
+ if (addrconf_finite_timeout(timeout)) {
+ if (timeout == 0)
+ cfg.ifa_flags |= IFA_F_DEPRECATED;
+
+ cfg.preferred_lft = timeout;
+ }
}
- dev = __dev_get_by_index(ifm->ifa_index);
- if (dev == NULL)
- return -ENODEV;
+ rtnl_net_lock(net);
- /* We ignore other flags so far. */
- ifa_flags = ifm->ifa_flags & (IFA_F_NODAD | IFA_F_HOMEADDRESS);
+ dev = __dev_get_by_index(net, ifm->ifa_index);
+ if (!dev) {
+ NL_SET_ERR_MSG_MOD(extack, "Unable to find the interface");
+ err = -ENODEV;
+ goto unlock_rtnl;
+ }
+
+ netdev_lock_ops(dev);
+ idev = ipv6_find_idev(dev);
+ if (IS_ERR(idev)) {
+ err = PTR_ERR(idev);
+ goto unlock;
+ }
+
+ if (!ipv6_allow_optimistic_dad(net, idev))
+ cfg.ifa_flags &= ~IFA_F_OPTIMISTIC;
+
+ if (cfg.ifa_flags & IFA_F_NODAD &&
+ cfg.ifa_flags & IFA_F_OPTIMISTIC) {
+ NL_SET_ERR_MSG(extack, "IFA_F_NODAD and IFA_F_OPTIMISTIC are mutually exclusive");
+ err = -EINVAL;
+ goto unlock;
+ }
- ifa = ipv6_get_ifaddr(pfx, dev, 1);
- if (ifa == NULL) {
+ ifa = ipv6_get_ifaddr(net, cfg.pfx, dev, 1);
+ if (!ifa) {
/*
* It would be best to check for !NLM_F_CREATE here but
- * userspace alreay relies on not having to provide this.
+ * userspace already relies on not having to provide this.
*/
- return inet6_addr_add(ifm->ifa_index, pfx, ifm->ifa_prefixlen,
- ifa_flags, preferred_lft, valid_lft);
+ err = inet6_addr_add(net, dev, &cfg, expires, flags, extack);
+ goto unlock;
}
if (nlh->nlmsg_flags & NLM_F_EXCL ||
- !(nlh->nlmsg_flags & NLM_F_REPLACE))
+ !(nlh->nlmsg_flags & NLM_F_REPLACE)) {
+ NL_SET_ERR_MSG_MOD(extack, "address already assigned");
err = -EEXIST;
- else
- err = inet6_addr_modify(ifa, ifa_flags, preferred_lft, valid_lft);
+ } else {
+ err = inet6_addr_modify(net, ifa, &cfg, expires, flags);
+ }
in6_ifa_put(ifa);
+unlock:
+ netdev_unlock_ops(dev);
+unlock_rtnl:
+ rtnl_net_unlock(net);
return err;
}
-static void put_ifaddrmsg(struct nlmsghdr *nlh, u8 prefixlen, u8 flags,
+static void put_ifaddrmsg(struct nlmsghdr *nlh, u8 prefixlen, u32 flags,
u8 scope, int ifindex)
{
struct ifaddrmsg *ifm;
@@ -3074,10 +5095,8 @@ static int put_cacheinfo(struct sk_buff *skb, unsigned long cstamp,
{
struct ifa_cacheinfo ci;
- ci.cstamp = (u32)(TIME_DELTA(cstamp, INITIAL_JIFFIES) / HZ * 100
- + TIME_DELTA(cstamp, INITIAL_JIFFIES) % HZ * 100 / HZ);
- ci.tstamp = (u32)(TIME_DELTA(tstamp, INITIAL_JIFFIES) / HZ * 100
- + TIME_DELTA(tstamp, INITIAL_JIFFIES) % HZ * 100 / HZ);
+ ci.cstamp = cstamp_delta(cstamp);
+ ci.tstamp = cstamp_delta(tstamp);
ci.ifa_prefered = preferred;
ci.ifa_valid = valid;
@@ -3099,188 +5118,352 @@ static inline int rt_scope(int ifa_scope)
static inline int inet6_ifaddr_msgsize(void)
{
return NLMSG_ALIGN(sizeof(struct ifaddrmsg))
+ + nla_total_size(16) /* IFA_LOCAL */
+ nla_total_size(16) /* IFA_ADDRESS */
- + nla_total_size(sizeof(struct ifa_cacheinfo));
+ + nla_total_size(sizeof(struct ifa_cacheinfo))
+ + nla_total_size(4) /* IFA_FLAGS */
+ + nla_total_size(1) /* IFA_PROTO */
+ + nla_total_size(4) /* IFA_RT_PRIORITY */;
}
-static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa,
- u32 pid, u32 seq, int event, unsigned int flags)
+static int inet6_fill_ifaddr(struct sk_buff *skb,
+ const struct inet6_ifaddr *ifa,
+ struct inet6_fill_args *args)
{
- struct nlmsghdr *nlh;
+ struct nlmsghdr *nlh;
u32 preferred, valid;
+ u32 flags, priority;
+ u8 proto;
- nlh = nlmsg_put(skb, pid, seq, event, sizeof(struct ifaddrmsg), flags);
- if (nlh == NULL)
- return -ENOBUFS;
+ nlh = nlmsg_put(skb, args->portid, args->seq, args->event,
+ sizeof(struct ifaddrmsg), args->flags);
+ if (!nlh)
+ return -EMSGSIZE;
+ flags = READ_ONCE(ifa->flags);
put_ifaddrmsg(nlh, ifa->prefix_len, ifa->flags, rt_scope(ifa->scope),
ifa->idev->dev->ifindex);
- if (!(ifa->flags&IFA_F_PERMANENT)) {
- preferred = ifa->prefered_lft;
- valid = ifa->valid_lft;
+ if (args->netnsid >= 0 &&
+ nla_put_s32(skb, IFA_TARGET_NETNSID, args->netnsid))
+ goto error;
+
+ preferred = READ_ONCE(ifa->prefered_lft);
+ valid = READ_ONCE(ifa->valid_lft);
+
+ if (!((flags & IFA_F_PERMANENT) &&
+ (preferred == INFINITY_LIFE_TIME))) {
if (preferred != INFINITY_LIFE_TIME) {
- long tval = (jiffies - ifa->tstamp)/HZ;
- preferred -= tval;
- if (valid != INFINITY_LIFE_TIME)
- valid -= tval;
+ long tval = (jiffies - READ_ONCE(ifa->tstamp)) / HZ;
+
+ if (preferred > tval)
+ preferred -= tval;
+ else
+ preferred = 0;
+ if (valid != INFINITY_LIFE_TIME) {
+ if (valid > tval)
+ valid -= tval;
+ else
+ valid = 0;
+ }
}
} else {
preferred = INFINITY_LIFE_TIME;
valid = INFINITY_LIFE_TIME;
}
- if (nla_put(skb, IFA_ADDRESS, 16, &ifa->addr) < 0 ||
- put_cacheinfo(skb, ifa->cstamp, ifa->tstamp, preferred, valid) < 0)
- return nlmsg_cancel(skb, nlh);
+ if (!ipv6_addr_any(&ifa->peer_addr)) {
+ if (nla_put_in6_addr(skb, IFA_LOCAL, &ifa->addr) < 0 ||
+ nla_put_in6_addr(skb, IFA_ADDRESS, &ifa->peer_addr) < 0)
+ goto error;
+ } else {
+ if (nla_put_in6_addr(skb, IFA_ADDRESS, &ifa->addr) < 0)
+ goto error;
+ }
+
+ priority = READ_ONCE(ifa->rt_priority);
+ if (priority && nla_put_u32(skb, IFA_RT_PRIORITY, priority))
+ goto error;
+
+ if (put_cacheinfo(skb, ifa->cstamp, READ_ONCE(ifa->tstamp),
+ preferred, valid) < 0)
+ goto error;
- return nlmsg_end(skb, nlh);
+ if (nla_put_u32(skb, IFA_FLAGS, flags) < 0)
+ goto error;
+
+ proto = READ_ONCE(ifa->ifa_proto);
+ if (proto && nla_put_u8(skb, IFA_PROTO, proto))
+ goto error;
+
+ nlmsg_end(skb, nlh);
+ return 0;
+
+error:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
}
-static int inet6_fill_ifmcaddr(struct sk_buff *skb, struct ifmcaddr6 *ifmca,
- u32 pid, u32 seq, int event, u16 flags)
+int inet6_fill_ifmcaddr(struct sk_buff *skb,
+ const struct ifmcaddr6 *ifmca,
+ struct inet6_fill_args *args)
{
- struct nlmsghdr *nlh;
- u8 scope = RT_SCOPE_UNIVERSE;
int ifindex = ifmca->idev->dev->ifindex;
+ u8 scope = RT_SCOPE_UNIVERSE;
+ struct nlmsghdr *nlh;
- if (ipv6_addr_scope(&ifmca->mca_addr) & IFA_SITE)
+ if (!args->force_rt_scope_universe &&
+ ipv6_addr_scope(&ifmca->mca_addr) & IFA_SITE)
scope = RT_SCOPE_SITE;
- nlh = nlmsg_put(skb, pid, seq, event, sizeof(struct ifaddrmsg), flags);
- if (nlh == NULL)
- return -ENOBUFS;
+ nlh = nlmsg_put(skb, args->portid, args->seq, args->event,
+ sizeof(struct ifaddrmsg), args->flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ if (args->netnsid >= 0 &&
+ nla_put_s32(skb, IFA_TARGET_NETNSID, args->netnsid)) {
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+ }
put_ifaddrmsg(nlh, 128, IFA_F_PERMANENT, scope, ifindex);
- if (nla_put(skb, IFA_MULTICAST, 16, &ifmca->mca_addr) < 0 ||
- put_cacheinfo(skb, ifmca->mca_cstamp, ifmca->mca_tstamp,
- INFINITY_LIFE_TIME, INFINITY_LIFE_TIME) < 0)
- return nlmsg_cancel(skb, nlh);
+ if (nla_put_in6_addr(skb, IFA_MULTICAST, &ifmca->mca_addr) < 0 ||
+ put_cacheinfo(skb, ifmca->mca_cstamp, READ_ONCE(ifmca->mca_tstamp),
+ INFINITY_LIFE_TIME, INFINITY_LIFE_TIME) < 0) {
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+ }
- return nlmsg_end(skb, nlh);
+ nlmsg_end(skb, nlh);
+ return 0;
}
-static int inet6_fill_ifacaddr(struct sk_buff *skb, struct ifacaddr6 *ifaca,
- u32 pid, u32 seq, int event, unsigned int flags)
+int inet6_fill_ifacaddr(struct sk_buff *skb,
+ const struct ifacaddr6 *ifaca,
+ struct inet6_fill_args *args)
{
- struct nlmsghdr *nlh;
+ struct net_device *dev = fib6_info_nh_dev(ifaca->aca_rt);
+ int ifindex = dev ? dev->ifindex : 1;
u8 scope = RT_SCOPE_UNIVERSE;
- int ifindex = ifaca->aca_idev->dev->ifindex;
+ struct nlmsghdr *nlh;
if (ipv6_addr_scope(&ifaca->aca_addr) & IFA_SITE)
scope = RT_SCOPE_SITE;
- nlh = nlmsg_put(skb, pid, seq, event, sizeof(struct ifaddrmsg), flags);
- if (nlh == NULL)
- return -ENOBUFS;
+ nlh = nlmsg_put(skb, args->portid, args->seq, args->event,
+ sizeof(struct ifaddrmsg), args->flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ if (args->netnsid >= 0 &&
+ nla_put_s32(skb, IFA_TARGET_NETNSID, args->netnsid)) {
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+ }
put_ifaddrmsg(nlh, 128, IFA_F_PERMANENT, scope, ifindex);
- if (nla_put(skb, IFA_ANYCAST, 16, &ifaca->aca_addr) < 0 ||
- put_cacheinfo(skb, ifaca->aca_cstamp, ifaca->aca_tstamp,
- INFINITY_LIFE_TIME, INFINITY_LIFE_TIME) < 0)
- return nlmsg_cancel(skb, nlh);
+ if (nla_put_in6_addr(skb, IFA_ANYCAST, &ifaca->aca_addr) < 0 ||
+ put_cacheinfo(skb, ifaca->aca_cstamp, READ_ONCE(ifaca->aca_tstamp),
+ INFINITY_LIFE_TIME, INFINITY_LIFE_TIME) < 0) {
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+ }
- return nlmsg_end(skb, nlh);
+ nlmsg_end(skb, nlh);
+ return 0;
}
-enum addr_type_t
+/* called with rcu_read_lock() */
+static int in6_dump_addrs(const struct inet6_dev *idev, struct sk_buff *skb,
+ struct netlink_callback *cb, int *s_ip_idx,
+ struct inet6_fill_args *fillargs)
{
- UNICAST_ADDR,
- MULTICAST_ADDR,
- ANYCAST_ADDR,
-};
+ const struct ifmcaddr6 *ifmca;
+ const struct ifacaddr6 *ifaca;
+ int ip_idx = 0;
+ int err = 0;
+
+ switch (fillargs->type) {
+ case UNICAST_ADDR: {
+ const struct inet6_ifaddr *ifa;
+ fillargs->event = RTM_NEWADDR;
+
+ /* unicast address incl. temp addr */
+ list_for_each_entry_rcu(ifa, &idev->addr_list, if_list) {
+ if (ip_idx < *s_ip_idx)
+ goto next;
+ err = inet6_fill_ifaddr(skb, ifa, fillargs);
+ if (err < 0)
+ break;
+ nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+next:
+ ip_idx++;
+ }
+ break;
+ }
+ case MULTICAST_ADDR:
+ fillargs->event = RTM_GETMULTICAST;
+
+ /* multicast address */
+ for (ifmca = rcu_dereference(idev->mc_list);
+ ifmca;
+ ifmca = rcu_dereference(ifmca->next), ip_idx++) {
+ if (ip_idx < *s_ip_idx)
+ continue;
+ err = inet6_fill_ifmcaddr(skb, ifmca, fillargs);
+ if (err < 0)
+ break;
+ }
+ break;
+ case ANYCAST_ADDR:
+ fillargs->event = RTM_GETANYCAST;
+ /* anycast address */
+ for (ifaca = rcu_dereference(idev->ac_list); ifaca;
+ ifaca = rcu_dereference(ifaca->aca_next), ip_idx++) {
+ if (ip_idx < *s_ip_idx)
+ continue;
+ err = inet6_fill_ifacaddr(skb, ifaca, fillargs);
+ if (err < 0)
+ break;
+ }
+ break;
+ default:
+ break;
+ }
+ *s_ip_idx = err ? ip_idx : 0;
+ return err;
+}
+
+static int inet6_valid_dump_ifaddr_req(const struct nlmsghdr *nlh,
+ struct inet6_fill_args *fillargs,
+ struct net **tgt_net, struct sock *sk,
+ struct netlink_callback *cb)
+{
+ struct netlink_ext_ack *extack = cb->extack;
+ struct nlattr *tb[IFA_MAX+1];
+ struct ifaddrmsg *ifm;
+ int err, i;
+
+ ifm = nlmsg_payload(nlh, sizeof(*ifm));
+ if (!ifm) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid header for address dump request");
+ return -EINVAL;
+ }
+
+ if (ifm->ifa_prefixlen || ifm->ifa_flags || ifm->ifa_scope) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for address dump request");
+ return -EINVAL;
+ }
+
+ fillargs->ifindex = ifm->ifa_index;
+ if (fillargs->ifindex) {
+ cb->answer_flags |= NLM_F_DUMP_FILTERED;
+ fillargs->flags |= NLM_F_DUMP_FILTERED;
+ }
+
+ err = nlmsg_parse_deprecated_strict(nlh, sizeof(*ifm), tb, IFA_MAX,
+ ifa_ipv6_policy, extack);
+ if (err < 0)
+ return err;
+
+ for (i = 0; i <= IFA_MAX; ++i) {
+ if (!tb[i])
+ continue;
+
+ if (i == IFA_TARGET_NETNSID) {
+ struct net *net;
+
+ fillargs->netnsid = nla_get_s32(tb[i]);
+ net = rtnl_get_net_ns_capable(sk, fillargs->netnsid);
+ if (IS_ERR(net)) {
+ fillargs->netnsid = -1;
+ NL_SET_ERR_MSG_MOD(extack, "Invalid target network namespace id");
+ return PTR_ERR(net);
+ }
+ *tgt_net = net;
+ } else {
+ NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in dump request");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb,
enum addr_type_t type)
{
- int idx, ip_idx;
- int s_idx, s_ip_idx;
- int err = 1;
+ struct net *tgt_net = sock_net(skb->sk);
+ const struct nlmsghdr *nlh = cb->nlh;
+ struct inet6_fill_args fillargs = {
+ .portid = NETLINK_CB(cb->skb).portid,
+ .seq = cb->nlh->nlmsg_seq,
+ .flags = NLM_F_MULTI,
+ .netnsid = -1,
+ .type = type,
+ .force_rt_scope_universe = false,
+ };
+ struct {
+ unsigned long ifindex;
+ int ip_idx;
+ } *ctx = (void *)cb->ctx;
struct net_device *dev;
- struct inet6_dev *idev = NULL;
- struct inet6_ifaddr *ifa;
- struct ifmcaddr6 *ifmca;
- struct ifacaddr6 *ifaca;
-
- s_idx = cb->args[0];
- s_ip_idx = ip_idx = cb->args[1];
- read_lock(&dev_base_lock);
-
- for (dev = dev_base, idx = 0; dev; dev = dev->next, idx++) {
- if (idx < s_idx)
- continue;
- if (idx > s_idx)
- s_ip_idx = 0;
- ip_idx = 0;
- if ((idev = in6_dev_get(dev)) == NULL)
- continue;
- read_lock_bh(&idev->lock);
- switch (type) {
- case UNICAST_ADDR:
- /* unicast address incl. temp addr */
- for (ifa = idev->addr_list; ifa;
- ifa = ifa->if_next, ip_idx++) {
- if (ip_idx < s_ip_idx)
- continue;
- if ((err = inet6_fill_ifaddr(skb, ifa,
- NETLINK_CB(cb->skb).pid,
- cb->nlh->nlmsg_seq, RTM_NEWADDR,
- NLM_F_MULTI)) <= 0)
- goto done;
- }
- break;
- case MULTICAST_ADDR:
- /* multicast address */
- for (ifmca = idev->mc_list; ifmca;
- ifmca = ifmca->next, ip_idx++) {
- if (ip_idx < s_ip_idx)
- continue;
- if ((err = inet6_fill_ifmcaddr(skb, ifmca,
- NETLINK_CB(cb->skb).pid,
- cb->nlh->nlmsg_seq, RTM_GETMULTICAST,
- NLM_F_MULTI)) <= 0)
- goto done;
- }
- break;
- case ANYCAST_ADDR:
- /* anycast address */
- for (ifaca = idev->ac_list; ifaca;
- ifaca = ifaca->aca_next, ip_idx++) {
- if (ip_idx < s_ip_idx)
- continue;
- if ((err = inet6_fill_ifacaddr(skb, ifaca,
- NETLINK_CB(cb->skb).pid,
- cb->nlh->nlmsg_seq, RTM_GETANYCAST,
- NLM_F_MULTI)) <= 0)
- goto done;
+ struct inet6_dev *idev;
+ int err = 0;
+
+ rcu_read_lock();
+ if (cb->strict_check) {
+ err = inet6_valid_dump_ifaddr_req(nlh, &fillargs, &tgt_net,
+ skb->sk, cb);
+ if (err < 0)
+ goto done;
+
+ err = 0;
+ if (fillargs.ifindex) {
+ dev = dev_get_by_index_rcu(tgt_net, fillargs.ifindex);
+ if (!dev) {
+ err = -ENODEV;
+ goto done;
}
- break;
- default:
- break;
+ idev = __in6_dev_get(dev);
+ if (idev)
+ err = in6_dump_addrs(idev, skb, cb,
+ &ctx->ip_idx,
+ &fillargs);
+ goto done;
}
- read_unlock_bh(&idev->lock);
- in6_dev_put(idev);
}
-done:
- if (err <= 0) {
- read_unlock_bh(&idev->lock);
- in6_dev_put(idev);
+
+ cb->seq = inet6_base_seq(tgt_net);
+ for_each_netdev_dump(tgt_net, dev, ctx->ifindex) {
+ idev = __in6_dev_get(dev);
+ if (!idev)
+ continue;
+ err = in6_dump_addrs(idev, skb, cb, &ctx->ip_idx,
+ &fillargs);
+ if (err < 0)
+ goto done;
}
- read_unlock(&dev_base_lock);
- cb->args[0] = idx;
- cb->args[1] = ip_idx;
- return skb->len;
+done:
+ rcu_read_unlock();
+ if (fillargs.netnsid >= 0)
+ put_net(tgt_net);
+
+ return err;
}
static int inet6_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
{
enum addr_type_t type = UNICAST_ADDR;
+
return inet6_dump_addr(skb, cb, type);
}
static int inet6_dump_ifmcaddr(struct sk_buff *skb, struct netlink_callback *cb)
{
enum addr_type_t type = MULTICAST_ADDR;
+
return inet6_dump_addr(skb, cb, type);
}
@@ -3288,110 +5471,264 @@ static int inet6_dump_ifmcaddr(struct sk_buff *skb, struct netlink_callback *cb)
static int inet6_dump_ifacaddr(struct sk_buff *skb, struct netlink_callback *cb)
{
enum addr_type_t type = ANYCAST_ADDR;
+
return inet6_dump_addr(skb, cb, type);
}
-static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr* nlh,
- void *arg)
+static int inet6_rtm_valid_getaddr_req(struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ struct nlattr **tb,
+ struct netlink_ext_ack *extack)
{
struct ifaddrmsg *ifm;
+ int i, err;
+
+ ifm = nlmsg_payload(nlh, sizeof(*ifm));
+ if (!ifm) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid header for get address request");
+ return -EINVAL;
+ }
+
+ if (!netlink_strict_get_check(skb))
+ return nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX,
+ ifa_ipv6_policy, extack);
+
+ if (ifm->ifa_prefixlen || ifm->ifa_flags || ifm->ifa_scope) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get address request");
+ return -EINVAL;
+ }
+
+ err = nlmsg_parse_deprecated_strict(nlh, sizeof(*ifm), tb, IFA_MAX,
+ ifa_ipv6_policy, extack);
+ if (err)
+ return err;
+
+ for (i = 0; i <= IFA_MAX; i++) {
+ if (!tb[i])
+ continue;
+
+ switch (i) {
+ case IFA_TARGET_NETNSID:
+ case IFA_ADDRESS:
+ case IFA_LOCAL:
+ break;
+ default:
+ NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get address request");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct net *tgt_net = sock_net(in_skb->sk);
+ struct inet6_fill_args fillargs = {
+ .portid = NETLINK_CB(in_skb).portid,
+ .seq = nlh->nlmsg_seq,
+ .event = RTM_NEWADDR,
+ .flags = 0,
+ .netnsid = -1,
+ .force_rt_scope_universe = false,
+ };
+ struct ifaddrmsg *ifm;
struct nlattr *tb[IFA_MAX+1];
- struct in6_addr *addr = NULL;
+ struct in6_addr *addr = NULL, *peer;
struct net_device *dev = NULL;
struct inet6_ifaddr *ifa;
struct sk_buff *skb;
int err;
- err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv6_policy);
+ err = inet6_rtm_valid_getaddr_req(in_skb, nlh, tb, extack);
if (err < 0)
- goto errout;
+ return err;
- addr = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL]);
- if (addr == NULL) {
+ if (tb[IFA_TARGET_NETNSID]) {
+ fillargs.netnsid = nla_get_s32(tb[IFA_TARGET_NETNSID]);
+
+ tgt_net = rtnl_get_net_ns_capable(NETLINK_CB(in_skb).sk,
+ fillargs.netnsid);
+ if (IS_ERR(tgt_net))
+ return PTR_ERR(tgt_net);
+ }
+
+ addr = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL], &peer);
+ if (!addr) {
err = -EINVAL;
goto errout;
}
-
ifm = nlmsg_data(nlh);
if (ifm->ifa_index)
- dev = __dev_get_by_index(ifm->ifa_index);
+ dev = dev_get_by_index(tgt_net, ifm->ifa_index);
- if ((ifa = ipv6_get_ifaddr(addr, dev, 1)) == NULL) {
+ ifa = ipv6_get_ifaddr(tgt_net, addr, dev, 1);
+ if (!ifa) {
err = -EADDRNOTAVAIL;
goto errout;
}
- if ((skb = nlmsg_new(inet6_ifaddr_msgsize(), GFP_KERNEL)) == NULL) {
+ skb = nlmsg_new(inet6_ifaddr_msgsize(), GFP_KERNEL);
+ if (!skb) {
err = -ENOBUFS;
goto errout_ifa;
}
- err = inet6_fill_ifaddr(skb, ifa, NETLINK_CB(in_skb).pid,
- nlh->nlmsg_seq, RTM_NEWADDR, 0);
- /* failure implies BUG in inet6_ifaddr_msgsize() */
- BUG_ON(err < 0);
-
- err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
+ err = inet6_fill_ifaddr(skb, ifa, &fillargs);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in inet6_ifaddr_msgsize() */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto errout_ifa;
+ }
+ err = rtnl_unicast(skb, tgt_net, NETLINK_CB(in_skb).portid);
errout_ifa:
in6_ifa_put(ifa);
errout:
+ dev_put(dev);
+ if (fillargs.netnsid >= 0)
+ put_net(tgt_net);
+
return err;
}
static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa)
{
struct sk_buff *skb;
+ struct net *net = dev_net(ifa->idev->dev);
+ struct inet6_fill_args fillargs = {
+ .portid = 0,
+ .seq = 0,
+ .event = event,
+ .flags = 0,
+ .netnsid = -1,
+ .force_rt_scope_universe = false,
+ };
int err = -ENOBUFS;
skb = nlmsg_new(inet6_ifaddr_msgsize(), GFP_ATOMIC);
- if (skb == NULL)
+ if (!skb)
goto errout;
- err = inet6_fill_ifaddr(skb, ifa, 0, 0, event, 0);
- /* failure implies BUG in inet6_ifaddr_msgsize() */
- BUG_ON(err < 0);
-
- err = rtnl_notify(skb, 0, RTNLGRP_IPV6_IFADDR, NULL, GFP_ATOMIC);
+ err = inet6_fill_ifaddr(skb, ifa, &fillargs);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in inet6_ifaddr_msgsize() */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto errout;
+ }
+ rtnl_notify(skb, net, 0, RTNLGRP_IPV6_IFADDR, NULL, GFP_ATOMIC);
+ return;
errout:
- if (err < 0)
- rtnl_set_sk_err(RTNLGRP_IPV6_IFADDR, err);
+ rtnl_set_sk_err(net, RTNLGRP_IPV6_IFADDR, err);
}
-static void inline ipv6_store_devconf(struct ipv6_devconf *cnf,
- __s32 *array, int bytes)
+static void ipv6_store_devconf(const struct ipv6_devconf *cnf,
+ __s32 *array, int bytes)
{
BUG_ON(bytes < (DEVCONF_MAX * 4));
memset(array, 0, bytes);
- array[DEVCONF_FORWARDING] = cnf->forwarding;
- array[DEVCONF_HOPLIMIT] = cnf->hop_limit;
- array[DEVCONF_MTU6] = cnf->mtu6;
- array[DEVCONF_ACCEPT_RA] = cnf->accept_ra;
- array[DEVCONF_ACCEPT_REDIRECTS] = cnf->accept_redirects;
- array[DEVCONF_AUTOCONF] = cnf->autoconf;
- array[DEVCONF_DAD_TRANSMITS] = cnf->dad_transmits;
- array[DEVCONF_RTR_SOLICITS] = cnf->rtr_solicits;
- array[DEVCONF_RTR_SOLICIT_INTERVAL] = cnf->rtr_solicit_interval;
- array[DEVCONF_RTR_SOLICIT_DELAY] = cnf->rtr_solicit_delay;
- array[DEVCONF_FORCE_MLD_VERSION] = cnf->force_mld_version;
-#ifdef CONFIG_IPV6_PRIVACY
- array[DEVCONF_USE_TEMPADDR] = cnf->use_tempaddr;
- array[DEVCONF_TEMP_VALID_LFT] = cnf->temp_valid_lft;
- array[DEVCONF_TEMP_PREFERED_LFT] = cnf->temp_prefered_lft;
- array[DEVCONF_REGEN_MAX_RETRY] = cnf->regen_max_retry;
- array[DEVCONF_MAX_DESYNC_FACTOR] = cnf->max_desync_factor;
-#endif
- array[DEVCONF_MAX_ADDRESSES] = cnf->max_addresses;
- array[DEVCONF_ACCEPT_RA_DEFRTR] = cnf->accept_ra_defrtr;
- array[DEVCONF_ACCEPT_RA_PINFO] = cnf->accept_ra_pinfo;
+ array[DEVCONF_FORWARDING] = READ_ONCE(cnf->forwarding);
+ array[DEVCONF_HOPLIMIT] = READ_ONCE(cnf->hop_limit);
+ array[DEVCONF_MTU6] = READ_ONCE(cnf->mtu6);
+ array[DEVCONF_ACCEPT_RA] = READ_ONCE(cnf->accept_ra);
+ array[DEVCONF_ACCEPT_REDIRECTS] = READ_ONCE(cnf->accept_redirects);
+ array[DEVCONF_AUTOCONF] = READ_ONCE(cnf->autoconf);
+ array[DEVCONF_DAD_TRANSMITS] = READ_ONCE(cnf->dad_transmits);
+ array[DEVCONF_RTR_SOLICITS] = READ_ONCE(cnf->rtr_solicits);
+ array[DEVCONF_RTR_SOLICIT_INTERVAL] =
+ jiffies_to_msecs(READ_ONCE(cnf->rtr_solicit_interval));
+ array[DEVCONF_RTR_SOLICIT_MAX_INTERVAL] =
+ jiffies_to_msecs(READ_ONCE(cnf->rtr_solicit_max_interval));
+ array[DEVCONF_RTR_SOLICIT_DELAY] =
+ jiffies_to_msecs(READ_ONCE(cnf->rtr_solicit_delay));
+ array[DEVCONF_FORCE_MLD_VERSION] = READ_ONCE(cnf->force_mld_version);
+ array[DEVCONF_MLDV1_UNSOLICITED_REPORT_INTERVAL] =
+ jiffies_to_msecs(READ_ONCE(cnf->mldv1_unsolicited_report_interval));
+ array[DEVCONF_MLDV2_UNSOLICITED_REPORT_INTERVAL] =
+ jiffies_to_msecs(READ_ONCE(cnf->mldv2_unsolicited_report_interval));
+ array[DEVCONF_USE_TEMPADDR] = READ_ONCE(cnf->use_tempaddr);
+ array[DEVCONF_TEMP_VALID_LFT] = READ_ONCE(cnf->temp_valid_lft);
+ array[DEVCONF_TEMP_PREFERED_LFT] = READ_ONCE(cnf->temp_prefered_lft);
+ array[DEVCONF_REGEN_MAX_RETRY] = READ_ONCE(cnf->regen_max_retry);
+ array[DEVCONF_MAX_DESYNC_FACTOR] = READ_ONCE(cnf->max_desync_factor);
+ array[DEVCONF_MAX_ADDRESSES] = READ_ONCE(cnf->max_addresses);
+ array[DEVCONF_ACCEPT_RA_DEFRTR] = READ_ONCE(cnf->accept_ra_defrtr);
+ array[DEVCONF_RA_DEFRTR_METRIC] = READ_ONCE(cnf->ra_defrtr_metric);
+ array[DEVCONF_ACCEPT_RA_MIN_HOP_LIMIT] =
+ READ_ONCE(cnf->accept_ra_min_hop_limit);
+ array[DEVCONF_ACCEPT_RA_PINFO] = READ_ONCE(cnf->accept_ra_pinfo);
#ifdef CONFIG_IPV6_ROUTER_PREF
- array[DEVCONF_ACCEPT_RA_RTR_PREF] = cnf->accept_ra_rtr_pref;
- array[DEVCONF_RTR_PROBE_INTERVAL] = cnf->rtr_probe_interval;
-#ifdef CONFIV_IPV6_ROUTE_INFO
- array[DEVCONF_ACCEPT_RA_RT_INFO_MAX_PLEN] = cnf->accept_ra_rt_info_max_plen;
+ array[DEVCONF_ACCEPT_RA_RTR_PREF] = READ_ONCE(cnf->accept_ra_rtr_pref);
+ array[DEVCONF_RTR_PROBE_INTERVAL] =
+ jiffies_to_msecs(READ_ONCE(cnf->rtr_probe_interval));
+#ifdef CONFIG_IPV6_ROUTE_INFO
+ array[DEVCONF_ACCEPT_RA_RT_INFO_MIN_PLEN] =
+ READ_ONCE(cnf->accept_ra_rt_info_min_plen);
+ array[DEVCONF_ACCEPT_RA_RT_INFO_MAX_PLEN] =
+ READ_ONCE(cnf->accept_ra_rt_info_max_plen);
+#endif
+#endif
+ array[DEVCONF_PROXY_NDP] = READ_ONCE(cnf->proxy_ndp);
+ array[DEVCONF_ACCEPT_SOURCE_ROUTE] =
+ READ_ONCE(cnf->accept_source_route);
+#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
+ array[DEVCONF_OPTIMISTIC_DAD] = READ_ONCE(cnf->optimistic_dad);
+ array[DEVCONF_USE_OPTIMISTIC] = READ_ONCE(cnf->use_optimistic);
+#endif
+#ifdef CONFIG_IPV6_MROUTE
+ array[DEVCONF_MC_FORWARDING] = atomic_read(&cnf->mc_forwarding);
#endif
+ array[DEVCONF_DISABLE_IPV6] = READ_ONCE(cnf->disable_ipv6);
+ array[DEVCONF_ACCEPT_DAD] = READ_ONCE(cnf->accept_dad);
+ array[DEVCONF_FORCE_TLLAO] = READ_ONCE(cnf->force_tllao);
+ array[DEVCONF_NDISC_NOTIFY] = READ_ONCE(cnf->ndisc_notify);
+ array[DEVCONF_SUPPRESS_FRAG_NDISC] =
+ READ_ONCE(cnf->suppress_frag_ndisc);
+ array[DEVCONF_ACCEPT_RA_FROM_LOCAL] =
+ READ_ONCE(cnf->accept_ra_from_local);
+ array[DEVCONF_ACCEPT_RA_MTU] = READ_ONCE(cnf->accept_ra_mtu);
+ array[DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN] =
+ READ_ONCE(cnf->ignore_routes_with_linkdown);
+ /* we omit DEVCONF_STABLE_SECRET for now */
+ array[DEVCONF_USE_OIF_ADDRS_ONLY] = READ_ONCE(cnf->use_oif_addrs_only);
+ array[DEVCONF_DROP_UNICAST_IN_L2_MULTICAST] =
+ READ_ONCE(cnf->drop_unicast_in_l2_multicast);
+ array[DEVCONF_DROP_UNSOLICITED_NA] = READ_ONCE(cnf->drop_unsolicited_na);
+ array[DEVCONF_KEEP_ADDR_ON_DOWN] = READ_ONCE(cnf->keep_addr_on_down);
+ array[DEVCONF_SEG6_ENABLED] = READ_ONCE(cnf->seg6_enabled);
+#ifdef CONFIG_IPV6_SEG6_HMAC
+ array[DEVCONF_SEG6_REQUIRE_HMAC] = READ_ONCE(cnf->seg6_require_hmac);
#endif
- array[DEVCONF_PROXY_NDP] = cnf->proxy_ndp;
+ array[DEVCONF_ENHANCED_DAD] = READ_ONCE(cnf->enhanced_dad);
+ array[DEVCONF_ADDR_GEN_MODE] = READ_ONCE(cnf->addr_gen_mode);
+ array[DEVCONF_DISABLE_POLICY] = READ_ONCE(cnf->disable_policy);
+ array[DEVCONF_NDISC_TCLASS] = READ_ONCE(cnf->ndisc_tclass);
+ array[DEVCONF_RPL_SEG_ENABLED] = READ_ONCE(cnf->rpl_seg_enabled);
+ array[DEVCONF_IOAM6_ENABLED] = READ_ONCE(cnf->ioam6_enabled);
+ array[DEVCONF_IOAM6_ID] = READ_ONCE(cnf->ioam6_id);
+ array[DEVCONF_IOAM6_ID_WIDE] = READ_ONCE(cnf->ioam6_id_wide);
+ array[DEVCONF_NDISC_EVICT_NOCARRIER] =
+ READ_ONCE(cnf->ndisc_evict_nocarrier);
+ array[DEVCONF_ACCEPT_UNTRACKED_NA] =
+ READ_ONCE(cnf->accept_untracked_na);
+ array[DEVCONF_ACCEPT_RA_MIN_LFT] = READ_ONCE(cnf->accept_ra_min_lft);
+ array[DEVCONF_FORCE_FORWARDING] = READ_ONCE(cnf->force_forwarding);
+}
+
+static inline size_t inet6_ifla6_size(void)
+{
+ return nla_total_size(4) /* IFLA_INET6_FLAGS */
+ + nla_total_size(sizeof(struct ifla_cacheinfo))
+ + nla_total_size(DEVCONF_MAX * 4) /* IFLA_INET6_CONF */
+ + nla_total_size(IPSTATS_MIB_MAX * 8) /* IFLA_INET6_STATS */
+ + nla_total_size(ICMP6_MIB_MAX * 8) /* IFLA_INET6_ICMP6STATS */
+ + nla_total_size(sizeof(struct in6_addr)) /* IFLA_INET6_TOKEN */
+ + nla_total_size(1) /* IFLA_INET6_ADDR_GEN_MODE */
+ + nla_total_size(4) /* IFLA_INET6_RA_MTU */
+ + 0;
}
static inline size_t inet6_if_nlmsg_size(void)
@@ -3401,113 +5738,459 @@ static inline size_t inet6_if_nlmsg_size(void)
+ nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */
+ nla_total_size(4) /* IFLA_MTU */
+ nla_total_size(4) /* IFLA_LINK */
- + nla_total_size( /* IFLA_PROTINFO */
- nla_total_size(4) /* IFLA_INET6_FLAGS */
- + nla_total_size(sizeof(struct ifla_cacheinfo))
- + nla_total_size(DEVCONF_MAX * 4) /* IFLA_INET6_CONF */
- );
+ + nla_total_size(1) /* IFLA_OPERSTATE */
+ + nla_total_size(inet6_ifla6_size()); /* IFLA_PROTINFO */
+}
+
+static inline void __snmp6_fill_statsdev(u64 *stats, atomic_long_t *mib,
+ int bytes)
+{
+ int i;
+ int pad = bytes - sizeof(u64) * ICMP6_MIB_MAX;
+ BUG_ON(pad < 0);
+
+ /* Use put_unaligned() because stats may not be aligned for u64. */
+ put_unaligned(ICMP6_MIB_MAX, &stats[0]);
+ for (i = 1; i < ICMP6_MIB_MAX; i++)
+ put_unaligned(atomic_long_read(&mib[i]), &stats[i]);
+
+ memset(&stats[ICMP6_MIB_MAX], 0, pad);
+}
+
+static inline void __snmp6_fill_stats64(u64 *stats, void __percpu *mib,
+ int bytes, size_t syncpoff)
+{
+ int i, c;
+ u64 buff[IPSTATS_MIB_MAX];
+ int pad = bytes - sizeof(u64) * IPSTATS_MIB_MAX;
+
+ BUG_ON(pad < 0);
+
+ memset(buff, 0, sizeof(buff));
+ buff[0] = IPSTATS_MIB_MAX;
+
+ for_each_possible_cpu(c) {
+ for (i = 1; i < IPSTATS_MIB_MAX; i++)
+ buff[i] += snmp_get_cpu_field64(mib, c, i, syncpoff);
+ }
+
+ memcpy(stats, buff, IPSTATS_MIB_MAX * sizeof(u64));
+ memset(&stats[IPSTATS_MIB_MAX], 0, pad);
+}
+
+static void snmp6_fill_stats(u64 *stats, struct inet6_dev *idev, int attrtype,
+ int bytes)
+{
+ switch (attrtype) {
+ case IFLA_INET6_STATS:
+ __snmp6_fill_stats64(stats, idev->stats.ipv6, bytes,
+ offsetof(struct ipstats_mib, syncp));
+ break;
+ case IFLA_INET6_ICMP6STATS:
+ __snmp6_fill_statsdev(stats, idev->stats.icmpv6dev->mibs, bytes);
+ break;
+ }
+}
+
+static int inet6_fill_ifla6_stats_attrs(struct sk_buff *skb,
+ struct inet6_dev *idev)
+{
+ struct nlattr *nla;
+
+ nla = nla_reserve(skb, IFLA_INET6_STATS, IPSTATS_MIB_MAX * sizeof(u64));
+ if (!nla)
+ goto nla_put_failure;
+ snmp6_fill_stats(nla_data(nla), idev, IFLA_INET6_STATS, nla_len(nla));
+
+ nla = nla_reserve(skb, IFLA_INET6_ICMP6STATS, ICMP6_MIB_MAX * sizeof(u64));
+ if (!nla)
+ goto nla_put_failure;
+ snmp6_fill_stats(nla_data(nla), idev, IFLA_INET6_ICMP6STATS, nla_len(nla));
+
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev,
+ u32 ext_filter_mask)
+{
+ struct ifla_cacheinfo ci;
+ struct nlattr *nla;
+ u32 ra_mtu;
+
+ if (nla_put_u32(skb, IFLA_INET6_FLAGS, READ_ONCE(idev->if_flags)))
+ goto nla_put_failure;
+ ci.max_reasm_len = IPV6_MAXPLEN;
+ ci.tstamp = cstamp_delta(READ_ONCE(idev->tstamp));
+ ci.reachable_time = jiffies_to_msecs(idev->nd_parms->reachable_time);
+ ci.retrans_time = jiffies_to_msecs(NEIGH_VAR(idev->nd_parms, RETRANS_TIME));
+ if (nla_put(skb, IFLA_INET6_CACHEINFO, sizeof(ci), &ci))
+ goto nla_put_failure;
+ nla = nla_reserve(skb, IFLA_INET6_CONF, DEVCONF_MAX * sizeof(s32));
+ if (!nla)
+ goto nla_put_failure;
+ ipv6_store_devconf(&idev->cnf, nla_data(nla), nla_len(nla));
+
+ /* XXX - MC not implemented */
+
+ if (!(ext_filter_mask & RTEXT_FILTER_SKIP_STATS)) {
+ if (inet6_fill_ifla6_stats_attrs(skb, idev) < 0)
+ goto nla_put_failure;
+ }
+
+ nla = nla_reserve(skb, IFLA_INET6_TOKEN, sizeof(struct in6_addr));
+ if (!nla)
+ goto nla_put_failure;
+ read_lock_bh(&idev->lock);
+ memcpy(nla_data(nla), idev->token.s6_addr, nla_len(nla));
+ read_unlock_bh(&idev->lock);
+
+ if (nla_put_u8(skb, IFLA_INET6_ADDR_GEN_MODE,
+ READ_ONCE(idev->cnf.addr_gen_mode)))
+ goto nla_put_failure;
+
+ ra_mtu = READ_ONCE(idev->ra_mtu);
+ if (ra_mtu && nla_put_u32(skb, IFLA_INET6_RA_MTU, ra_mtu))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+static size_t inet6_get_link_af_size(const struct net_device *dev,
+ u32 ext_filter_mask)
+{
+ if (!__in6_dev_get(dev))
+ return 0;
+
+ return inet6_ifla6_size();
+}
+
+static int inet6_fill_link_af(struct sk_buff *skb, const struct net_device *dev,
+ u32 ext_filter_mask)
+{
+ struct inet6_dev *idev = __in6_dev_get(dev);
+
+ if (!idev)
+ return -ENODATA;
+
+ if (inet6_fill_ifla6_attrs(skb, idev, ext_filter_mask) < 0)
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static int inet6_set_iftoken(struct inet6_dev *idev, struct in6_addr *token,
+ struct netlink_ext_ack *extack)
+{
+ struct inet6_ifaddr *ifp;
+ struct net_device *dev = idev->dev;
+ bool clear_token, update_rs = false;
+ struct in6_addr ll_addr;
+
+ ASSERT_RTNL();
+
+ if (!token)
+ return -EINVAL;
+
+ if (dev->flags & IFF_LOOPBACK) {
+ NL_SET_ERR_MSG_MOD(extack, "Device is loopback");
+ return -EINVAL;
+ }
+
+ if (dev->flags & IFF_NOARP) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Device does not do neighbour discovery");
+ return -EINVAL;
+ }
+
+ if (!ipv6_accept_ra(idev)) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Router advertisement is disabled on device");
+ return -EINVAL;
+ }
+
+ if (READ_ONCE(idev->cnf.rtr_solicits) == 0) {
+ NL_SET_ERR_MSG(extack,
+ "Router solicitation is disabled on device");
+ return -EINVAL;
+ }
+
+ write_lock_bh(&idev->lock);
+
+ BUILD_BUG_ON(sizeof(token->s6_addr) != 16);
+ memcpy(idev->token.s6_addr + 8, token->s6_addr + 8, 8);
+
+ write_unlock_bh(&idev->lock);
+
+ clear_token = ipv6_addr_any(token);
+ if (clear_token)
+ goto update_lft;
+
+ if (!idev->dead && (idev->if_flags & IF_READY) &&
+ !ipv6_get_lladdr(dev, &ll_addr, IFA_F_TENTATIVE |
+ IFA_F_OPTIMISTIC)) {
+ /* If we're not ready, then normal ifup will take care
+ * of this. Otherwise, we need to request our rs here.
+ */
+ ndisc_send_rs(dev, &ll_addr, &in6addr_linklocal_allrouters);
+ update_rs = true;
+ }
+
+update_lft:
+ write_lock_bh(&idev->lock);
+
+ if (update_rs) {
+ idev->if_flags |= IF_RS_SENT;
+ idev->rs_interval = rfc3315_s14_backoff_init(
+ READ_ONCE(idev->cnf.rtr_solicit_interval));
+ idev->rs_probes = 1;
+ addrconf_mod_rs_timer(idev, idev->rs_interval);
+ }
+
+ /* Well, that's kinda nasty ... */
+ list_for_each_entry(ifp, &idev->addr_list, if_list) {
+ spin_lock(&ifp->lock);
+ if (ifp->tokenized) {
+ ifp->valid_lft = 0;
+ ifp->prefered_lft = 0;
+ }
+ spin_unlock(&ifp->lock);
+ }
+
+ write_unlock_bh(&idev->lock);
+ inet6_ifinfo_notify(RTM_NEWLINK, idev);
+ addrconf_verify_rtnl(dev_net(dev));
+ return 0;
+}
+
+static const struct nla_policy inet6_af_policy[IFLA_INET6_MAX + 1] = {
+ [IFLA_INET6_ADDR_GEN_MODE] = { .type = NLA_U8 },
+ [IFLA_INET6_TOKEN] = { .len = sizeof(struct in6_addr) },
+ [IFLA_INET6_RA_MTU] = { .type = NLA_REJECT,
+ .reject_message =
+ "IFLA_INET6_RA_MTU can not be set" },
+};
+
+static int check_addr_gen_mode(int mode)
+{
+ if (mode != IN6_ADDR_GEN_MODE_EUI64 &&
+ mode != IN6_ADDR_GEN_MODE_NONE &&
+ mode != IN6_ADDR_GEN_MODE_STABLE_PRIVACY &&
+ mode != IN6_ADDR_GEN_MODE_RANDOM)
+ return -EINVAL;
+ return 1;
+}
+
+static int check_stable_privacy(struct inet6_dev *idev, struct net *net,
+ int mode)
+{
+ if (mode == IN6_ADDR_GEN_MODE_STABLE_PRIVACY &&
+ !idev->cnf.stable_secret.initialized &&
+ !net->ipv6.devconf_dflt->stable_secret.initialized)
+ return -EINVAL;
+ return 1;
+}
+
+static int inet6_validate_link_af(const struct net_device *dev,
+ const struct nlattr *nla,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[IFLA_INET6_MAX + 1];
+ struct inet6_dev *idev = NULL;
+ int err;
+
+ if (dev) {
+ idev = __in6_dev_get(dev);
+ if (!idev)
+ return -EAFNOSUPPORT;
+ }
+
+ err = nla_parse_nested_deprecated(tb, IFLA_INET6_MAX, nla,
+ inet6_af_policy, extack);
+ if (err)
+ return err;
+
+ if (!tb[IFLA_INET6_TOKEN] && !tb[IFLA_INET6_ADDR_GEN_MODE])
+ return -EINVAL;
+
+ if (tb[IFLA_INET6_ADDR_GEN_MODE]) {
+ u8 mode = nla_get_u8(tb[IFLA_INET6_ADDR_GEN_MODE]);
+
+ if (check_addr_gen_mode(mode) < 0)
+ return -EINVAL;
+ if (dev && check_stable_privacy(idev, dev_net(dev), mode) < 0)
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int inet6_set_link_af(struct net_device *dev, const struct nlattr *nla,
+ struct netlink_ext_ack *extack)
+{
+ struct inet6_dev *idev = __in6_dev_get(dev);
+ struct nlattr *tb[IFLA_INET6_MAX + 1];
+ int err;
+
+ if (!idev)
+ return -EAFNOSUPPORT;
+
+ if (nla_parse_nested_deprecated(tb, IFLA_INET6_MAX, nla, NULL, NULL) < 0)
+ return -EINVAL;
+
+ if (tb[IFLA_INET6_TOKEN]) {
+ err = inet6_set_iftoken(idev, nla_data(tb[IFLA_INET6_TOKEN]),
+ extack);
+ if (err)
+ return err;
+ }
+
+ if (tb[IFLA_INET6_ADDR_GEN_MODE]) {
+ u8 mode = nla_get_u8(tb[IFLA_INET6_ADDR_GEN_MODE]);
+
+ WRITE_ONCE(idev->cnf.addr_gen_mode, mode);
+ }
+
+ return 0;
}
-static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev,
- u32 pid, u32 seq, int event, unsigned int flags)
+static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev,
+ u32 portid, u32 seq, int event, unsigned int flags)
{
struct net_device *dev = idev->dev;
- struct nlattr *conf;
struct ifinfomsg *hdr;
struct nlmsghdr *nlh;
+ int ifindex, iflink;
void *protoinfo;
- struct ifla_cacheinfo ci;
- nlh = nlmsg_put(skb, pid, seq, event, sizeof(*hdr), flags);
- if (nlh == NULL)
- return -ENOBUFS;
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*hdr), flags);
+ if (!nlh)
+ return -EMSGSIZE;
hdr = nlmsg_data(nlh);
hdr->ifi_family = AF_INET6;
hdr->__ifi_pad = 0;
hdr->ifi_type = dev->type;
- hdr->ifi_index = dev->ifindex;
- hdr->ifi_flags = dev_get_flags(dev);
+ ifindex = READ_ONCE(dev->ifindex);
+ hdr->ifi_index = ifindex;
+ hdr->ifi_flags = netif_get_flags(dev);
hdr->ifi_change = 0;
- NLA_PUT_STRING(skb, IFLA_IFNAME, dev->name);
+ iflink = dev_get_iflink(dev);
+ if (nla_put_string(skb, IFLA_IFNAME, dev->name) ||
+ (dev->addr_len &&
+ nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr)) ||
+ nla_put_u32(skb, IFLA_MTU, READ_ONCE(dev->mtu)) ||
+ (ifindex != iflink &&
+ nla_put_u32(skb, IFLA_LINK, iflink)) ||
+ nla_put_u8(skb, IFLA_OPERSTATE,
+ netif_running(dev) ? READ_ONCE(dev->operstate) : IF_OPER_DOWN))
+ goto nla_put_failure;
+ protoinfo = nla_nest_start_noflag(skb, IFLA_PROTINFO);
+ if (!protoinfo)
+ goto nla_put_failure;
- if (dev->addr_len)
- NLA_PUT(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr);
+ if (inet6_fill_ifla6_attrs(skb, idev, 0) < 0)
+ goto nla_put_failure;
- NLA_PUT_U32(skb, IFLA_MTU, dev->mtu);
- if (dev->ifindex != dev->iflink)
- NLA_PUT_U32(skb, IFLA_LINK, dev->iflink);
+ nla_nest_end(skb, protoinfo);
+ nlmsg_end(skb, nlh);
+ return 0;
- protoinfo = nla_nest_start(skb, IFLA_PROTINFO);
- if (protoinfo == NULL)
- goto nla_put_failure;
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
- NLA_PUT_U32(skb, IFLA_INET6_FLAGS, idev->if_flags);
+static int inet6_valid_dump_ifinfo(const struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct ifinfomsg *ifm;
- ci.max_reasm_len = IPV6_MAXPLEN;
- ci.tstamp = (__u32)(TIME_DELTA(idev->tstamp, INITIAL_JIFFIES) / HZ * 100
- + TIME_DELTA(idev->tstamp, INITIAL_JIFFIES) % HZ * 100 / HZ);
- ci.reachable_time = idev->nd_parms->reachable_time;
- ci.retrans_time = idev->nd_parms->retrans_time;
- NLA_PUT(skb, IFLA_INET6_CACHEINFO, sizeof(ci), &ci);
-
- conf = nla_reserve(skb, IFLA_INET6_CONF, DEVCONF_MAX * sizeof(s32));
- if (conf == NULL)
- goto nla_put_failure;
- ipv6_store_devconf(&idev->cnf, nla_data(conf), nla_len(conf));
+ ifm = nlmsg_payload(nlh, sizeof(*ifm));
+ if (!ifm) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid header for link dump request");
+ return -EINVAL;
+ }
- /* XXX - Statistics/MC not implemented */
+ if (nlmsg_attrlen(nlh, sizeof(*ifm))) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid data after header");
+ return -EINVAL;
+ }
- nla_nest_end(skb, protoinfo);
- return nlmsg_end(skb, nlh);
+ if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags ||
+ ifm->ifi_change || ifm->ifi_index) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for dump request");
+ return -EINVAL;
+ }
-nla_put_failure:
- return nlmsg_cancel(skb, nlh);
+ return 0;
}
static int inet6_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
{
- int idx, err;
- int s_idx = cb->args[0];
+ struct net *net = sock_net(skb->sk);
+ struct {
+ unsigned long ifindex;
+ } *ctx = (void *)cb->ctx;
struct net_device *dev;
struct inet6_dev *idev;
+ int err;
- read_lock(&dev_base_lock);
- for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) {
- if (idx < s_idx)
- continue;
- if ((idev = in6_dev_get(dev)) == NULL)
+ /* only requests using strict checking can pass data to
+ * influence the dump
+ */
+ if (cb->strict_check) {
+ err = inet6_valid_dump_ifinfo(cb->nlh, cb->extack);
+
+ if (err < 0)
+ return err;
+ }
+
+ err = 0;
+ rcu_read_lock();
+ for_each_netdev_dump(net, dev, ctx->ifindex) {
+ idev = __in6_dev_get(dev);
+ if (!idev)
continue;
- err = inet6_fill_ifinfo(skb, idev, NETLINK_CB(cb->skb).pid,
- cb->nlh->nlmsg_seq, RTM_NEWLINK, NLM_F_MULTI);
- in6_dev_put(idev);
- if (err <= 0)
+ err = inet6_fill_ifinfo(skb, idev,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWLINK, NLM_F_MULTI);
+ if (err < 0)
break;
}
- read_unlock(&dev_base_lock);
- cb->args[0] = idx;
+ rcu_read_unlock();
- return skb->len;
+ return err;
}
void inet6_ifinfo_notify(int event, struct inet6_dev *idev)
{
struct sk_buff *skb;
+ struct net *net = dev_net(idev->dev);
int err = -ENOBUFS;
-
+
skb = nlmsg_new(inet6_if_nlmsg_size(), GFP_ATOMIC);
- if (skb == NULL)
+ if (!skb)
goto errout;
err = inet6_fill_ifinfo(skb, idev, 0, 0, event, 0);
- /* failure implies BUG in inet6_if_nlmsg_size() */
- BUG_ON(err < 0);
-
- err = rtnl_notify(skb, 0, RTNLGRP_IPV6_IFADDR, NULL, GFP_ATOMIC);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in inet6_if_nlmsg_size() */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto errout;
+ }
+ rtnl_notify(skb, net, 0, RTNLGRP_IPV6_IFINFO, NULL, GFP_ATOMIC);
+ return;
errout:
- if (err < 0)
- rtnl_set_sk_err(RTNLGRP_IPV6_IFADDR, err);
+ rtnl_set_sk_err(net, RTNLGRP_IPV6_IFINFO, err);
}
static inline size_t inet6_prefix_nlmsg_size(void)
@@ -3518,16 +6201,16 @@ static inline size_t inet6_prefix_nlmsg_size(void)
}
static int inet6_fill_prefix(struct sk_buff *skb, struct inet6_dev *idev,
- struct prefix_info *pinfo, u32 pid, u32 seq,
+ struct prefix_info *pinfo, u32 portid, u32 seq,
int event, unsigned int flags)
{
struct prefixmsg *pmsg;
struct nlmsghdr *nlh;
struct prefix_cacheinfo ci;
- nlh = nlmsg_put(skb, pid, seq, event, sizeof(*pmsg), flags);
- if (nlh == NULL)
- return -ENOBUFS;
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*pmsg), flags);
+ if (!nlh)
+ return -EMSGSIZE;
pmsg = nlmsg_data(nlh);
pmsg->prefix_family = AF_INET6;
@@ -3537,618 +6220,1402 @@ static int inet6_fill_prefix(struct sk_buff *skb, struct inet6_dev *idev,
pmsg->prefix_len = pinfo->prefix_len;
pmsg->prefix_type = pinfo->type;
pmsg->prefix_pad3 = 0;
- pmsg->prefix_flags = 0;
- if (pinfo->onlink)
- pmsg->prefix_flags |= IF_PREFIX_ONLINK;
- if (pinfo->autoconf)
- pmsg->prefix_flags |= IF_PREFIX_AUTOCONF;
-
- NLA_PUT(skb, PREFIX_ADDRESS, sizeof(pinfo->prefix), &pinfo->prefix);
+ pmsg->prefix_flags = pinfo->flags;
+ if (nla_put(skb, PREFIX_ADDRESS, sizeof(pinfo->prefix), &pinfo->prefix))
+ goto nla_put_failure;
ci.preferred_time = ntohl(pinfo->prefered);
ci.valid_time = ntohl(pinfo->valid);
- NLA_PUT(skb, PREFIX_CACHEINFO, sizeof(ci), &ci);
-
- return nlmsg_end(skb, nlh);
+ if (nla_put(skb, PREFIX_CACHEINFO, sizeof(ci), &ci))
+ goto nla_put_failure;
+ nlmsg_end(skb, nlh);
+ return 0;
nla_put_failure:
- return nlmsg_cancel(skb, nlh);
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
}
-static void inet6_prefix_notify(int event, struct inet6_dev *idev,
+static void inet6_prefix_notify(int event, struct inet6_dev *idev,
struct prefix_info *pinfo)
{
struct sk_buff *skb;
+ struct net *net = dev_net(idev->dev);
int err = -ENOBUFS;
skb = nlmsg_new(inet6_prefix_nlmsg_size(), GFP_ATOMIC);
- if (skb == NULL)
+ if (!skb)
goto errout;
err = inet6_fill_prefix(skb, idev, pinfo, 0, 0, event, 0);
- /* failure implies BUG in inet6_prefix_nlmsg_size() */
- BUG_ON(err < 0);
-
- err = rtnl_notify(skb, 0, RTNLGRP_IPV6_PREFIX, NULL, GFP_ATOMIC);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in inet6_prefix_nlmsg_size() */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto errout;
+ }
+ rtnl_notify(skb, net, 0, RTNLGRP_IPV6_PREFIX, NULL, GFP_ATOMIC);
+ return;
errout:
- if (err < 0)
- rtnl_set_sk_err(RTNLGRP_IPV6_PREFIX, err);
-}
-
-static struct rtnetlink_link inet6_rtnetlink_table[RTM_NR_MSGTYPES] = {
- [RTM_GETLINK - RTM_BASE] = { .dumpit = inet6_dump_ifinfo, },
- [RTM_NEWADDR - RTM_BASE] = { .doit = inet6_rtm_newaddr, },
- [RTM_DELADDR - RTM_BASE] = { .doit = inet6_rtm_deladdr, },
- [RTM_GETADDR - RTM_BASE] = { .doit = inet6_rtm_getaddr,
- .dumpit = inet6_dump_ifaddr, },
- [RTM_GETMULTICAST - RTM_BASE] = { .dumpit = inet6_dump_ifmcaddr, },
- [RTM_GETANYCAST - RTM_BASE] = { .dumpit = inet6_dump_ifacaddr, },
- [RTM_NEWROUTE - RTM_BASE] = { .doit = inet6_rtm_newroute, },
- [RTM_DELROUTE - RTM_BASE] = { .doit = inet6_rtm_delroute, },
- [RTM_GETROUTE - RTM_BASE] = { .doit = inet6_rtm_getroute,
- .dumpit = inet6_dump_fib, },
-#ifdef CONFIG_IPV6_MULTIPLE_TABLES
- [RTM_GETRULE - RTM_BASE] = { .dumpit = fib6_rules_dump, },
-#endif
-};
+ rtnl_set_sk_err(net, RTNLGRP_IPV6_PREFIX, err);
+}
static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
{
+ struct net *net = dev_net(ifp->idev->dev);
+
+ if (event)
+ ASSERT_RTNL();
+
inet6_ifa_notify(event ? : RTM_NEWADDR, ifp);
switch (event) {
case RTM_NEWADDR:
- ip6_ins_rt(ifp->rt);
+ /*
+ * If the address was optimistic we inserted the route at the
+ * start of our DAD process, so we don't need to do it again.
+ * If the device was taken down in the middle of the DAD
+ * cycle there is a race where we could get here without a
+ * host route, so nothing to insert. That will be fixed when
+ * the device is brought up.
+ */
+ if (ifp->rt && !rcu_access_pointer(ifp->rt->fib6_node)) {
+ ip6_ins_rt(net, ifp->rt);
+ } else if (!ifp->rt && (ifp->idev->dev->flags & IFF_UP)) {
+ pr_warn("BUG: Address %pI6c on device %s is missing its host route.\n",
+ &ifp->addr, ifp->idev->dev->name);
+ }
+
if (ifp->idev->cnf.forwarding)
addrconf_join_anycast(ifp);
+ if (!ipv6_addr_any(&ifp->peer_addr))
+ addrconf_prefix_route(&ifp->peer_addr, 128,
+ ifp->rt_priority, ifp->idev->dev,
+ 0, 0, GFP_ATOMIC);
break;
case RTM_DELADDR:
if (ifp->idev->cnf.forwarding)
addrconf_leave_anycast(ifp);
addrconf_leave_solict(ifp->idev, &ifp->addr);
- dst_hold(&ifp->rt->u.dst);
- if (ip6_del_rt(ifp->rt))
- dst_free(&ifp->rt->u.dst);
+ if (!ipv6_addr_any(&ifp->peer_addr)) {
+ struct fib6_info *rt;
+
+ rt = addrconf_get_prefix_route(&ifp->peer_addr, 128,
+ ifp->idev->dev, 0, 0,
+ false);
+ if (rt)
+ ip6_del_rt(net, rt, false);
+ }
+ if (ifp->rt) {
+ ip6_del_rt(net, ifp->rt, false);
+ ifp->rt = NULL;
+ }
+ rt_genid_bump_ipv6(net);
break;
}
+ atomic_inc(&net->ipv6.dev_addr_genid);
}
static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
{
- rcu_read_lock_bh();
if (likely(ifp->idev->dead == 0))
__ipv6_ifa_notify(event, ifp);
- rcu_read_unlock_bh();
}
#ifdef CONFIG_SYSCTL
-static
-int addrconf_sysctl_forward(ctl_table *ctl, int write, struct file * filp,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+static int addrconf_sysctl_forward(const struct ctl_table *ctl, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int *valp = ctl->data;
int val = *valp;
+ loff_t pos = *ppos;
+ struct ctl_table lctl;
int ret;
- ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
+ /*
+ * ctl->data points to idev->cnf.forwarding, we should
+ * not modify it until we get the rtnl lock.
+ */
+ lctl = *ctl;
+ lctl.data = &val;
+
+ ret = proc_dointvec(&lctl, write, buffer, lenp, ppos);
- if (write && valp != &ipv6_devconf_dflt.forwarding) {
- if (valp != &ipv6_devconf.forwarding) {
- if ((!*valp) ^ (!val)) {
- struct inet6_dev *idev = (struct inet6_dev *)ctl->extra1;
- if (idev == NULL)
- return ret;
- dev_forward_change(idev);
- }
+ if (write)
+ ret = addrconf_fixup_forwarding(ctl, valp, val);
+ if (ret)
+ *ppos = pos;
+ return ret;
+}
+
+static int addrconf_sysctl_mtu(const struct ctl_table *ctl, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct inet6_dev *idev = ctl->extra1;
+ int min_mtu = IPV6_MIN_MTU;
+ struct ctl_table lctl;
+
+ lctl = *ctl;
+ lctl.extra1 = &min_mtu;
+ lctl.extra2 = idev ? &idev->dev->mtu : NULL;
+
+ return proc_dointvec_minmax(&lctl, write, buffer, lenp, ppos);
+}
+
+static void dev_disable_change(struct inet6_dev *idev)
+{
+ struct netdev_notifier_info info;
+
+ if (!idev || !idev->dev)
+ return;
+
+ netdev_notifier_info_init(&info, idev->dev);
+ if (idev->cnf.disable_ipv6)
+ addrconf_notify(NULL, NETDEV_DOWN, &info);
+ else
+ addrconf_notify(NULL, NETDEV_UP, &info);
+}
+
+static void addrconf_disable_change(struct net *net, __s32 newf)
+{
+ struct net_device *dev;
+ struct inet6_dev *idev;
+
+ for_each_netdev(net, dev) {
+ idev = __in6_dev_get_rtnl_net(dev);
+ if (idev) {
+ int changed = (!idev->cnf.disable_ipv6) ^ (!newf);
+
+ WRITE_ONCE(idev->cnf.disable_ipv6, newf);
+ if (changed)
+ dev_disable_change(idev);
+ }
+ }
+}
+
+static int addrconf_disable_ipv6(const struct ctl_table *table, int *p, int newf)
+{
+ struct net *net = (struct net *)table->extra2;
+ int old;
+
+ if (p == &net->ipv6.devconf_dflt->disable_ipv6) {
+ WRITE_ONCE(*p, newf);
+ return 0;
+ }
+
+ if (!rtnl_net_trylock(net))
+ return restart_syscall();
+
+ old = *p;
+ WRITE_ONCE(*p, newf);
+
+ if (p == &net->ipv6.devconf_all->disable_ipv6) {
+ WRITE_ONCE(net->ipv6.devconf_dflt->disable_ipv6, newf);
+ addrconf_disable_change(net, newf);
+ } else if ((!newf) ^ (!old)) {
+ dev_disable_change((struct inet6_dev *)table->extra1);
+ }
+
+ rtnl_net_unlock(net);
+ return 0;
+}
+
+static int addrconf_sysctl_disable(const struct ctl_table *ctl, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ int *valp = ctl->data;
+ int val = *valp;
+ loff_t pos = *ppos;
+ struct ctl_table lctl;
+ int ret;
+
+ /*
+ * ctl->data points to idev->cnf.disable_ipv6, we should
+ * not modify it until we get the rtnl lock.
+ */
+ lctl = *ctl;
+ lctl.data = &val;
+
+ ret = proc_dointvec(&lctl, write, buffer, lenp, ppos);
+
+ if (write)
+ ret = addrconf_disable_ipv6(ctl, valp, val);
+ if (ret)
+ *ppos = pos;
+ return ret;
+}
+
+static int addrconf_sysctl_proxy_ndp(const struct ctl_table *ctl, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ int *valp = ctl->data;
+ int ret;
+ int old, new;
+
+ old = *valp;
+ ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
+ new = *valp;
+
+ if (write && old != new) {
+ struct net *net = ctl->extra2;
+
+ if (!rtnl_net_trylock(net))
+ return restart_syscall();
+
+ if (valp == &net->ipv6.devconf_dflt->proxy_ndp) {
+ inet6_netconf_notify_devconf(net, RTM_NEWNETCONF,
+ NETCONFA_PROXY_NEIGH,
+ NETCONFA_IFINDEX_DEFAULT,
+ net->ipv6.devconf_dflt);
+ } else if (valp == &net->ipv6.devconf_all->proxy_ndp) {
+ inet6_netconf_notify_devconf(net, RTM_NEWNETCONF,
+ NETCONFA_PROXY_NEIGH,
+ NETCONFA_IFINDEX_ALL,
+ net->ipv6.devconf_all);
} else {
- ipv6_devconf_dflt.forwarding = ipv6_devconf.forwarding;
- addrconf_forward_change();
+ struct inet6_dev *idev = ctl->extra1;
+
+ inet6_netconf_notify_devconf(net, RTM_NEWNETCONF,
+ NETCONFA_PROXY_NEIGH,
+ idev->dev->ifindex,
+ &idev->cnf);
}
- if (*valp)
- rt6_purge_dflt_routers();
+ rtnl_net_unlock(net);
}
- return ret;
+ return ret;
}
-static int addrconf_sysctl_forward_strategy(ctl_table *table,
- int __user *name, int nlen,
- void __user *oldval,
- size_t __user *oldlenp,
- void __user *newval, size_t newlen,
- void **context)
+static int addrconf_sysctl_addr_gen_mode(const struct ctl_table *ctl, int write,
+ void *buffer, size_t *lenp,
+ loff_t *ppos)
{
- int *valp = table->data;
- int new;
+ int ret = 0;
+ u32 new_val;
+ struct inet6_dev *idev = (struct inet6_dev *)ctl->extra1;
+ struct net *net = (struct net *)ctl->extra2;
+ struct ctl_table tmp = {
+ .data = &new_val,
+ .maxlen = sizeof(new_val),
+ .mode = ctl->mode,
+ };
- if (!newval || !newlen)
- return 0;
- if (newlen != sizeof(int))
- return -EINVAL;
- if (get_user(new, (int __user *)newval))
- return -EFAULT;
- if (new == *valp)
+ if (!rtnl_net_trylock(net))
+ return restart_syscall();
+
+ new_val = *((u32 *)ctl->data);
+
+ ret = proc_douintvec(&tmp, write, buffer, lenp, ppos);
+ if (ret != 0)
+ goto out;
+
+ if (write) {
+ if (check_addr_gen_mode(new_val) < 0) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (idev) {
+ if (check_stable_privacy(idev, net, new_val) < 0) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (idev->cnf.addr_gen_mode != new_val) {
+ WRITE_ONCE(idev->cnf.addr_gen_mode, new_val);
+ netdev_lock_ops(idev->dev);
+ addrconf_init_auto_addrs(idev->dev);
+ netdev_unlock_ops(idev->dev);
+ }
+ } else if (&net->ipv6.devconf_all->addr_gen_mode == ctl->data) {
+ struct net_device *dev;
+
+ WRITE_ONCE(net->ipv6.devconf_dflt->addr_gen_mode, new_val);
+ for_each_netdev(net, dev) {
+ idev = __in6_dev_get_rtnl_net(dev);
+ if (idev &&
+ idev->cnf.addr_gen_mode != new_val) {
+ WRITE_ONCE(idev->cnf.addr_gen_mode,
+ new_val);
+ netdev_lock_ops(idev->dev);
+ addrconf_init_auto_addrs(idev->dev);
+ netdev_unlock_ops(idev->dev);
+ }
+ }
+ }
+
+ WRITE_ONCE(*((u32 *)ctl->data), new_val);
+ }
+
+out:
+ rtnl_net_unlock(net);
+
+ return ret;
+}
+
+static int addrconf_sysctl_stable_secret(const struct ctl_table *ctl, int write,
+ void *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int err;
+ struct in6_addr addr;
+ char str[IPV6_MAX_STRLEN];
+ struct ctl_table lctl = *ctl;
+ struct net *net = ctl->extra2;
+ struct ipv6_stable_secret *secret = ctl->data;
+
+ if (&net->ipv6.devconf_all->stable_secret == ctl->data)
+ return -EIO;
+
+ lctl.maxlen = IPV6_MAX_STRLEN;
+ lctl.data = str;
+
+ if (!rtnl_net_trylock(net))
+ return restart_syscall();
+
+ if (!write && !secret->initialized) {
+ err = -EIO;
+ goto out;
+ }
+
+ err = snprintf(str, sizeof(str), "%pI6", &secret->secret);
+ if (err >= sizeof(str)) {
+ err = -EIO;
+ goto out;
+ }
+
+ err = proc_dostring(&lctl, write, buffer, lenp, ppos);
+ if (err || !write)
+ goto out;
+
+ if (in6_pton(str, -1, addr.in6_u.u6_addr8, -1, NULL) != 1) {
+ err = -EIO;
+ goto out;
+ }
+
+ secret->initialized = true;
+ secret->secret = addr;
+
+ if (&net->ipv6.devconf_dflt->stable_secret == ctl->data) {
+ struct net_device *dev;
+
+ for_each_netdev(net, dev) {
+ struct inet6_dev *idev = __in6_dev_get_rtnl_net(dev);
+
+ if (idev) {
+ WRITE_ONCE(idev->cnf.addr_gen_mode,
+ IN6_ADDR_GEN_MODE_STABLE_PRIVACY);
+ }
+ }
+ } else {
+ struct inet6_dev *idev = ctl->extra1;
+
+ WRITE_ONCE(idev->cnf.addr_gen_mode,
+ IN6_ADDR_GEN_MODE_STABLE_PRIVACY);
+ }
+
+out:
+ rtnl_net_unlock(net);
+
+ return err;
+}
+
+static
+int addrconf_sysctl_ignore_routes_with_linkdown(const struct ctl_table *ctl,
+ int write, void *buffer,
+ size_t *lenp,
+ loff_t *ppos)
+{
+ int *valp = ctl->data;
+ int val = *valp;
+ loff_t pos = *ppos;
+ struct ctl_table lctl;
+ int ret;
+
+ /* ctl->data points to idev->cnf.ignore_routes_when_linkdown
+ * we should not modify it until we get the rtnl lock.
+ */
+ lctl = *ctl;
+ lctl.data = &val;
+
+ ret = proc_dointvec(&lctl, write, buffer, lenp, ppos);
+
+ if (write)
+ ret = addrconf_fixup_linkdown(ctl, valp, val);
+ if (ret)
+ *ppos = pos;
+ return ret;
+}
+
+static
+void addrconf_set_nopolicy(struct rt6_info *rt, int action)
+{
+ if (rt) {
+ if (action)
+ rt->dst.flags |= DST_NOPOLICY;
+ else
+ rt->dst.flags &= ~DST_NOPOLICY;
+ }
+}
+
+static
+void addrconf_disable_policy_idev(struct inet6_dev *idev, int val)
+{
+ struct inet6_ifaddr *ifa;
+
+ read_lock_bh(&idev->lock);
+ list_for_each_entry(ifa, &idev->addr_list, if_list) {
+ spin_lock(&ifa->lock);
+ if (ifa->rt) {
+ /* host routes only use builtin fib6_nh */
+ struct fib6_nh *nh = ifa->rt->fib6_nh;
+ int cpu;
+
+ rcu_read_lock();
+ ifa->rt->dst_nopolicy = val ? true : false;
+ if (nh->rt6i_pcpu) {
+ for_each_possible_cpu(cpu) {
+ struct rt6_info **rtp;
+
+ rtp = per_cpu_ptr(nh->rt6i_pcpu, cpu);
+ addrconf_set_nopolicy(*rtp, val);
+ }
+ }
+ rcu_read_unlock();
+ }
+ spin_unlock(&ifa->lock);
+ }
+ read_unlock_bh(&idev->lock);
+}
+
+static
+int addrconf_disable_policy(const struct ctl_table *ctl, int *valp, int val)
+{
+ struct net *net = (struct net *)ctl->extra2;
+ struct inet6_dev *idev;
+
+ if (valp == &net->ipv6.devconf_dflt->disable_policy) {
+ WRITE_ONCE(*valp, val);
return 0;
- if (oldval && oldlenp) {
- size_t len;
- if (get_user(len, oldlenp))
- return -EFAULT;
- if (len) {
- if (len > table->maxlen)
- len = table->maxlen;
- if (copy_to_user(oldval, valp, len))
- return -EFAULT;
- if (put_user(len, oldlenp))
- return -EFAULT;
+ }
+
+ if (!rtnl_net_trylock(net))
+ return restart_syscall();
+
+ WRITE_ONCE(*valp, val);
+
+ if (valp == &net->ipv6.devconf_all->disable_policy) {
+ struct net_device *dev;
+
+ for_each_netdev(net, dev) {
+ idev = __in6_dev_get_rtnl_net(dev);
+ if (idev)
+ addrconf_disable_policy_idev(idev, val);
}
+ } else {
+ idev = (struct inet6_dev *)ctl->extra1;
+ addrconf_disable_policy_idev(idev, val);
}
- if (valp != &ipv6_devconf_dflt.forwarding) {
- if (valp != &ipv6_devconf.forwarding) {
- struct inet6_dev *idev = (struct inet6_dev *)table->extra1;
- int changed;
- if (unlikely(idev == NULL))
- return -ENODEV;
- changed = (!*valp) ^ (!new);
- *valp = new;
+ rtnl_net_unlock(net);
+ return 0;
+}
+
+static int addrconf_sysctl_disable_policy(const struct ctl_table *ctl, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ int *valp = ctl->data;
+ int val = *valp;
+ loff_t pos = *ppos;
+ struct ctl_table lctl;
+ int ret;
+
+ lctl = *ctl;
+ lctl.data = &val;
+ ret = proc_dointvec(&lctl, write, buffer, lenp, ppos);
+
+ if (write && (*valp != val))
+ ret = addrconf_disable_policy(ctl, valp, val);
+
+ if (ret)
+ *ppos = pos;
+
+ return ret;
+}
+
+static void addrconf_force_forward_change(struct net *net, __s32 newf)
+{
+ struct net_device *dev;
+ struct inet6_dev *idev;
+
+ for_each_netdev(net, dev) {
+ idev = __in6_dev_get_rtnl_net(dev);
+ if (idev) {
+ int changed = (!idev->cnf.force_forwarding) ^ (!newf);
+
+ WRITE_ONCE(idev->cnf.force_forwarding, newf);
if (changed)
- dev_forward_change(idev);
- } else {
- *valp = new;
- addrconf_forward_change();
+ inet6_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF,
+ NETCONFA_FORCE_FORWARDING,
+ dev->ifindex, &idev->cnf);
}
+ }
+}
- if (*valp)
- rt6_purge_dflt_routers();
- } else
- *valp = new;
+static int addrconf_sysctl_force_forwarding(const struct ctl_table *ctl, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct inet6_dev *idev = ctl->extra1;
+ struct ctl_table tmp_ctl = *ctl;
+ struct net *net = ctl->extra2;
+ int *valp = ctl->data;
+ int new_val = *valp;
+ int old_val = *valp;
+ loff_t pos = *ppos;
+ int ret;
- return 1;
+ tmp_ctl.extra1 = SYSCTL_ZERO;
+ tmp_ctl.extra2 = SYSCTL_ONE;
+ tmp_ctl.data = &new_val;
+
+ ret = proc_douintvec_minmax(&tmp_ctl, write, buffer, lenp, ppos);
+
+ if (write && old_val != new_val) {
+ if (!rtnl_net_trylock(net))
+ return restart_syscall();
+
+ WRITE_ONCE(*valp, new_val);
+
+ if (valp == &net->ipv6.devconf_dflt->force_forwarding) {
+ inet6_netconf_notify_devconf(net, RTM_NEWNETCONF,
+ NETCONFA_FORCE_FORWARDING,
+ NETCONFA_IFINDEX_DEFAULT,
+ net->ipv6.devconf_dflt);
+ } else if (valp == &net->ipv6.devconf_all->force_forwarding) {
+ inet6_netconf_notify_devconf(net, RTM_NEWNETCONF,
+ NETCONFA_FORCE_FORWARDING,
+ NETCONFA_IFINDEX_ALL,
+ net->ipv6.devconf_all);
+
+ addrconf_force_forward_change(net, new_val);
+ } else {
+ inet6_netconf_notify_devconf(net, RTM_NEWNETCONF,
+ NETCONFA_FORCE_FORWARDING,
+ idev->dev->ifindex,
+ &idev->cnf);
+ }
+ rtnl_net_unlock(net);
+ }
+
+ if (ret)
+ *ppos = pos;
+ return ret;
}
-static struct addrconf_sysctl_table
-{
- struct ctl_table_header *sysctl_header;
- ctl_table addrconf_vars[__NET_IPV6_MAX];
- ctl_table addrconf_dev[2];
- ctl_table addrconf_conf_dir[2];
- ctl_table addrconf_proto_dir[2];
- ctl_table addrconf_root_dir[2];
-} addrconf_sysctl __read_mostly = {
- .sysctl_header = NULL,
- .addrconf_vars = {
- {
- .ctl_name = NET_IPV6_FORWARDING,
- .procname = "forwarding",
- .data = &ipv6_devconf.forwarding,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &addrconf_sysctl_forward,
- .strategy = &addrconf_sysctl_forward_strategy,
- },
- {
- .ctl_name = NET_IPV6_HOP_LIMIT,
- .procname = "hop_limit",
- .data = &ipv6_devconf.hop_limit,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .ctl_name = NET_IPV6_MTU,
- .procname = "mtu",
- .data = &ipv6_devconf.mtu6,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_IPV6_ACCEPT_RA,
- .procname = "accept_ra",
- .data = &ipv6_devconf.accept_ra,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_IPV6_ACCEPT_REDIRECTS,
- .procname = "accept_redirects",
- .data = &ipv6_devconf.accept_redirects,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_IPV6_AUTOCONF,
- .procname = "autoconf",
- .data = &ipv6_devconf.autoconf,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_IPV6_DAD_TRANSMITS,
- .procname = "dad_transmits",
- .data = &ipv6_devconf.dad_transmits,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_IPV6_RTR_SOLICITS,
- .procname = "router_solicitations",
- .data = &ipv6_devconf.rtr_solicits,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_IPV6_RTR_SOLICIT_INTERVAL,
- .procname = "router_solicitation_interval",
- .data = &ipv6_devconf.rtr_solicit_interval,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- .strategy = &sysctl_jiffies,
- },
- {
- .ctl_name = NET_IPV6_RTR_SOLICIT_DELAY,
- .procname = "router_solicitation_delay",
- .data = &ipv6_devconf.rtr_solicit_delay,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- .strategy = &sysctl_jiffies,
- },
- {
- .ctl_name = NET_IPV6_FORCE_MLD_VERSION,
- .procname = "force_mld_version",
- .data = &ipv6_devconf.force_mld_version,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
-#ifdef CONFIG_IPV6_PRIVACY
- {
- .ctl_name = NET_IPV6_USE_TEMPADDR,
- .procname = "use_tempaddr",
- .data = &ipv6_devconf.use_tempaddr,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_IPV6_TEMP_VALID_LFT,
- .procname = "temp_valid_lft",
- .data = &ipv6_devconf.temp_valid_lft,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_IPV6_TEMP_PREFERED_LFT,
- .procname = "temp_prefered_lft",
- .data = &ipv6_devconf.temp_prefered_lft,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_IPV6_REGEN_MAX_RETRY,
- .procname = "regen_max_retry",
- .data = &ipv6_devconf.regen_max_retry,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_IPV6_MAX_DESYNC_FACTOR,
- .procname = "max_desync_factor",
- .data = &ipv6_devconf.max_desync_factor,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
-#endif
- {
- .ctl_name = NET_IPV6_MAX_ADDRESSES,
- .procname = "max_addresses",
- .data = &ipv6_devconf.max_addresses,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_IPV6_ACCEPT_RA_DEFRTR,
- .procname = "accept_ra_defrtr",
- .data = &ipv6_devconf.accept_ra_defrtr,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_IPV6_ACCEPT_RA_PINFO,
- .procname = "accept_ra_pinfo",
- .data = &ipv6_devconf.accept_ra_pinfo,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
+static int minus_one = -1;
+static const int two_five_five = 255;
+static u32 ioam6_if_id_max = U16_MAX;
+
+static const struct ctl_table addrconf_sysctl[] = {
+ {
+ .procname = "forwarding",
+ .data = &ipv6_devconf.forwarding,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = addrconf_sysctl_forward,
+ },
+ {
+ .procname = "hop_limit",
+ .data = &ipv6_devconf.hop_limit,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = (void *)SYSCTL_ONE,
+ .extra2 = (void *)&two_five_five,
+ },
+ {
+ .procname = "mtu",
+ .data = &ipv6_devconf.mtu6,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = addrconf_sysctl_mtu,
+ },
+ {
+ .procname = "accept_ra",
+ .data = &ipv6_devconf.accept_ra,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "accept_redirects",
+ .data = &ipv6_devconf.accept_redirects,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "autoconf",
+ .data = &ipv6_devconf.autoconf,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "dad_transmits",
+ .data = &ipv6_devconf.dad_transmits,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "router_solicitations",
+ .data = &ipv6_devconf.rtr_solicits,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &minus_one,
+ },
+ {
+ .procname = "router_solicitation_interval",
+ .data = &ipv6_devconf.rtr_solicit_interval,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "router_solicitation_max_interval",
+ .data = &ipv6_devconf.rtr_solicit_max_interval,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "router_solicitation_delay",
+ .data = &ipv6_devconf.rtr_solicit_delay,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "force_mld_version",
+ .data = &ipv6_devconf.force_mld_version,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "mldv1_unsolicited_report_interval",
+ .data =
+ &ipv6_devconf.mldv1_unsolicited_report_interval,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_ms_jiffies,
+ },
+ {
+ .procname = "mldv2_unsolicited_report_interval",
+ .data =
+ &ipv6_devconf.mldv2_unsolicited_report_interval,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_ms_jiffies,
+ },
+ {
+ .procname = "use_tempaddr",
+ .data = &ipv6_devconf.use_tempaddr,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "temp_valid_lft",
+ .data = &ipv6_devconf.temp_valid_lft,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "temp_prefered_lft",
+ .data = &ipv6_devconf.temp_prefered_lft,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "regen_min_advance",
+ .data = &ipv6_devconf.regen_min_advance,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "regen_max_retry",
+ .data = &ipv6_devconf.regen_max_retry,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "max_desync_factor",
+ .data = &ipv6_devconf.max_desync_factor,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "max_addresses",
+ .data = &ipv6_devconf.max_addresses,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "accept_ra_defrtr",
+ .data = &ipv6_devconf.accept_ra_defrtr,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "ra_defrtr_metric",
+ .data = &ipv6_devconf.ra_defrtr_metric,
+ .maxlen = sizeof(u32),
+ .mode = 0644,
+ .proc_handler = proc_douintvec_minmax,
+ .extra1 = (void *)SYSCTL_ONE,
+ },
+ {
+ .procname = "accept_ra_min_hop_limit",
+ .data = &ipv6_devconf.accept_ra_min_hop_limit,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "accept_ra_min_lft",
+ .data = &ipv6_devconf.accept_ra_min_lft,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "accept_ra_pinfo",
+ .data = &ipv6_devconf.accept_ra_pinfo,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "ra_honor_pio_life",
+ .data = &ipv6_devconf.ra_honor_pio_life,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {
+ .procname = "ra_honor_pio_pflag",
+ .data = &ipv6_devconf.ra_honor_pio_pflag,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
#ifdef CONFIG_IPV6_ROUTER_PREF
- {
- .ctl_name = NET_IPV6_ACCEPT_RA_RTR_PREF,
- .procname = "accept_ra_rtr_pref",
- .data = &ipv6_devconf.accept_ra_rtr_pref,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_IPV6_RTR_PROBE_INTERVAL,
- .procname = "router_probe_interval",
- .data = &ipv6_devconf.rtr_probe_interval,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- .strategy = &sysctl_jiffies,
- },
-#ifdef CONFIV_IPV6_ROUTE_INFO
- {
- .ctl_name = NET_IPV6_ACCEPT_RA_RT_INFO_MAX_PLEN,
- .procname = "accept_ra_rt_info_max_plen",
- .data = &ipv6_devconf.accept_ra_rt_info_max_plen,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
+ {
+ .procname = "accept_ra_rtr_pref",
+ .data = &ipv6_devconf.accept_ra_rtr_pref,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "router_probe_interval",
+ .data = &ipv6_devconf.rtr_probe_interval,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+#ifdef CONFIG_IPV6_ROUTE_INFO
+ {
+ .procname = "accept_ra_rt_info_min_plen",
+ .data = &ipv6_devconf.accept_ra_rt_info_min_plen,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "accept_ra_rt_info_max_plen",
+ .data = &ipv6_devconf.accept_ra_rt_info_max_plen,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
#endif
#endif
- {
- .ctl_name = NET_IPV6_PROXY_NDP,
- .procname = "proxy_ndp",
- .data = &ipv6_devconf.proxy_ndp,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = 0, /* sentinel */
- }
+ {
+ .procname = "proxy_ndp",
+ .data = &ipv6_devconf.proxy_ndp,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = addrconf_sysctl_proxy_ndp,
},
- .addrconf_dev = {
- {
- .ctl_name = NET_PROTO_CONF_ALL,
- .procname = "all",
- .mode = 0555,
- .child = addrconf_sysctl.addrconf_vars,
- },
- {
- .ctl_name = 0, /* sentinel */
- }
+ {
+ .procname = "accept_source_route",
+ .data = &ipv6_devconf.accept_source_route,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
},
- .addrconf_conf_dir = {
- {
- .ctl_name = NET_IPV6_CONF,
- .procname = "conf",
- .mode = 0555,
- .child = addrconf_sysctl.addrconf_dev,
- },
- {
- .ctl_name = 0, /* sentinel */
- }
+#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
+ {
+ .procname = "optimistic_dad",
+ .data = &ipv6_devconf.optimistic_dad,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
},
- .addrconf_proto_dir = {
- {
- .ctl_name = NET_IPV6,
- .procname = "ipv6",
- .mode = 0555,
- .child = addrconf_sysctl.addrconf_conf_dir,
- },
- {
- .ctl_name = 0, /* sentinel */
- }
+ {
+ .procname = "use_optimistic",
+ .data = &ipv6_devconf.use_optimistic,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
},
- .addrconf_root_dir = {
- {
- .ctl_name = CTL_NET,
- .procname = "net",
- .mode = 0555,
- .child = addrconf_sysctl.addrconf_proto_dir,
- },
- {
- .ctl_name = 0, /* sentinel */
- }
+#endif
+#ifdef CONFIG_IPV6_MROUTE
+ {
+ .procname = "mc_forwarding",
+ .data = &ipv6_devconf.mc_forwarding,
+ .maxlen = sizeof(int),
+ .mode = 0444,
+ .proc_handler = proc_dointvec,
+ },
+#endif
+ {
+ .procname = "disable_ipv6",
+ .data = &ipv6_devconf.disable_ipv6,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = addrconf_sysctl_disable,
+ },
+ {
+ .procname = "accept_dad",
+ .data = &ipv6_devconf.accept_dad,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "force_tllao",
+ .data = &ipv6_devconf.force_tllao,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "ndisc_notify",
+ .data = &ipv6_devconf.ndisc_notify,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "suppress_frag_ndisc",
+ .data = &ipv6_devconf.suppress_frag_ndisc,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "accept_ra_from_local",
+ .data = &ipv6_devconf.accept_ra_from_local,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "accept_ra_mtu",
+ .data = &ipv6_devconf.accept_ra_mtu,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "stable_secret",
+ .data = &ipv6_devconf.stable_secret,
+ .maxlen = IPV6_MAX_STRLEN,
+ .mode = 0600,
+ .proc_handler = addrconf_sysctl_stable_secret,
+ },
+ {
+ .procname = "use_oif_addrs_only",
+ .data = &ipv6_devconf.use_oif_addrs_only,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "ignore_routes_with_linkdown",
+ .data = &ipv6_devconf.ignore_routes_with_linkdown,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = addrconf_sysctl_ignore_routes_with_linkdown,
+ },
+ {
+ .procname = "drop_unicast_in_l2_multicast",
+ .data = &ipv6_devconf.drop_unicast_in_l2_multicast,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "drop_unsolicited_na",
+ .data = &ipv6_devconf.drop_unsolicited_na,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "keep_addr_on_down",
+ .data = &ipv6_devconf.keep_addr_on_down,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+
+ },
+ {
+ .procname = "seg6_enabled",
+ .data = &ipv6_devconf.seg6_enabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#ifdef CONFIG_IPV6_SEG6_HMAC
+ {
+ .procname = "seg6_require_hmac",
+ .data = &ipv6_devconf.seg6_require_hmac,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif
+ {
+ .procname = "enhanced_dad",
+ .data = &ipv6_devconf.enhanced_dad,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "addr_gen_mode",
+ .data = &ipv6_devconf.addr_gen_mode,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = addrconf_sysctl_addr_gen_mode,
+ },
+ {
+ .procname = "disable_policy",
+ .data = &ipv6_devconf.disable_policy,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = addrconf_sysctl_disable_policy,
+ },
+ {
+ .procname = "ndisc_tclass",
+ .data = &ipv6_devconf.ndisc_tclass,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = (void *)SYSCTL_ZERO,
+ .extra2 = (void *)&two_five_five,
+ },
+ {
+ .procname = "rpl_seg_enabled",
+ .data = &ipv6_devconf.rpl_seg_enabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {
+ .procname = "ioam6_enabled",
+ .data = &ipv6_devconf.ioam6_enabled,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = (void *)SYSCTL_ZERO,
+ .extra2 = (void *)SYSCTL_ONE,
+ },
+ {
+ .procname = "ioam6_id",
+ .data = &ipv6_devconf.ioam6_id,
+ .maxlen = sizeof(u32),
+ .mode = 0644,
+ .proc_handler = proc_douintvec_minmax,
+ .extra1 = (void *)SYSCTL_ZERO,
+ .extra2 = (void *)&ioam6_if_id_max,
+ },
+ {
+ .procname = "ioam6_id_wide",
+ .data = &ipv6_devconf.ioam6_id_wide,
+ .maxlen = sizeof(u32),
+ .mode = 0644,
+ .proc_handler = proc_douintvec,
+ },
+ {
+ .procname = "ndisc_evict_nocarrier",
+ .data = &ipv6_devconf.ndisc_evict_nocarrier,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = (void *)SYSCTL_ZERO,
+ .extra2 = (void *)SYSCTL_ONE,
+ },
+ {
+ .procname = "accept_untracked_na",
+ .data = &ipv6_devconf.accept_untracked_na,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_TWO,
+ },
+ {
+ .procname = "force_forwarding",
+ .data = &ipv6_devconf.force_forwarding,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = addrconf_sysctl_force_forwarding,
},
};
-static void addrconf_sysctl_register(struct inet6_dev *idev, struct ipv6_devconf *p)
+static int __addrconf_sysctl_register(struct net *net, char *dev_name,
+ struct inet6_dev *idev, struct ipv6_devconf *p)
{
- int i;
- struct net_device *dev = idev ? idev->dev : NULL;
- struct addrconf_sysctl_table *t;
- char *dev_name = NULL;
+ size_t table_size = ARRAY_SIZE(addrconf_sysctl);
+ int i, ifindex;
+ struct ctl_table *table;
+ char path[sizeof("net/ipv6/conf/") + IFNAMSIZ];
- t = kmemdup(&addrconf_sysctl, sizeof(*t), GFP_KERNEL);
- if (t == NULL)
- return;
- for (i=0; t->addrconf_vars[i].data; i++) {
- t->addrconf_vars[i].data += (char*)p - (char*)&ipv6_devconf;
- t->addrconf_vars[i].de = NULL;
- t->addrconf_vars[i].extra1 = idev; /* embedded; no ref */
+ table = kmemdup(addrconf_sysctl, sizeof(addrconf_sysctl), GFP_KERNEL_ACCOUNT);
+ if (!table)
+ goto out;
+
+ for (i = 0; i < table_size; i++) {
+ table[i].data += (char *)p - (char *)&ipv6_devconf;
+ /* If one of these is already set, then it is not safe to
+ * overwrite either of them: this makes proc_dointvec_minmax
+ * usable.
+ */
+ if (!table[i].extra1 && !table[i].extra2) {
+ table[i].extra1 = idev; /* embedded; no ref */
+ table[i].extra2 = net;
+ }
}
- if (dev) {
- dev_name = dev->name;
- t->addrconf_dev[0].ctl_name = dev->ifindex;
- } else {
- dev_name = "default";
- t->addrconf_dev[0].ctl_name = NET_PROTO_CONF_DEFAULT;
- }
-
- /*
- * Make a copy of dev_name, because '.procname' is regarded as const
- * by sysctl and we wouldn't want anyone to change it under our feet
- * (see SIOCSIFNAME).
- */
- dev_name = kstrdup(dev_name, GFP_KERNEL);
- if (!dev_name)
- goto free;
-
- t->addrconf_dev[0].procname = dev_name;
-
- t->addrconf_dev[0].child = t->addrconf_vars;
- t->addrconf_dev[0].de = NULL;
- t->addrconf_conf_dir[0].child = t->addrconf_dev;
- t->addrconf_conf_dir[0].de = NULL;
- t->addrconf_proto_dir[0].child = t->addrconf_conf_dir;
- t->addrconf_proto_dir[0].de = NULL;
- t->addrconf_root_dir[0].child = t->addrconf_proto_dir;
- t->addrconf_root_dir[0].de = NULL;
-
- t->sysctl_header = register_sysctl_table(t->addrconf_root_dir, 0);
- if (t->sysctl_header == NULL)
- goto free_procname;
- else
- p->sysctl = t;
- return;
- /* error path */
- free_procname:
- kfree(dev_name);
- free:
- kfree(t);
+ snprintf(path, sizeof(path), "net/ipv6/conf/%s", dev_name);
- return;
+ p->sysctl_header = register_net_sysctl_sz(net, path, table,
+ table_size);
+ if (!p->sysctl_header)
+ goto free;
+
+ if (!strcmp(dev_name, "all"))
+ ifindex = NETCONFA_IFINDEX_ALL;
+ else if (!strcmp(dev_name, "default"))
+ ifindex = NETCONFA_IFINDEX_DEFAULT;
+ else
+ ifindex = idev->dev->ifindex;
+ inet6_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_ALL,
+ ifindex, p);
+ return 0;
+
+free:
+ kfree(table);
+out:
+ return -ENOBUFS;
}
-static void addrconf_sysctl_unregister(struct ipv6_devconf *p)
+static void __addrconf_sysctl_unregister(struct net *net,
+ struct ipv6_devconf *p, int ifindex)
{
- if (p->sysctl) {
- struct addrconf_sysctl_table *t = p->sysctl;
- p->sysctl = NULL;
- unregister_sysctl_table(t->sysctl_header);
- kfree(t->addrconf_dev[0].procname);
- kfree(t);
- }
-}
+ const struct ctl_table *table;
+ if (!p->sysctl_header)
+ return;
-#endif
+ table = p->sysctl_header->ctl_table_arg;
+ unregister_net_sysctl_table(p->sysctl_header);
+ p->sysctl_header = NULL;
+ kfree(table);
-/*
- * Device notifier
- */
+ inet6_netconf_notify_devconf(net, RTM_DELNETCONF, 0, ifindex, NULL);
+}
-int register_inet6addr_notifier(struct notifier_block *nb)
+static int addrconf_sysctl_register(struct inet6_dev *idev)
{
- return atomic_notifier_chain_register(&inet6addr_chain, nb);
+ int err;
+
+ if (!sysctl_dev_name_is_allowed(idev->dev->name))
+ return -EINVAL;
+
+ err = neigh_sysctl_register(idev->dev, idev->nd_parms,
+ &ndisc_ifinfo_sysctl_change);
+ if (err)
+ return err;
+ err = __addrconf_sysctl_register(dev_net(idev->dev), idev->dev->name,
+ idev, &idev->cnf);
+ if (err)
+ neigh_sysctl_unregister(idev->nd_parms);
+
+ return err;
}
-int unregister_inet6addr_notifier(struct notifier_block *nb)
+static void addrconf_sysctl_unregister(struct inet6_dev *idev)
{
- return atomic_notifier_chain_unregister(&inet6addr_chain,nb);
+ __addrconf_sysctl_unregister(dev_net(idev->dev), &idev->cnf,
+ idev->dev->ifindex);
+ neigh_sysctl_unregister(idev->nd_parms);
}
-/*
- * Init / cleanup code
- */
-int __init addrconf_init(void)
+#endif
+
+static int __net_init addrconf_init_net(struct net *net)
{
- int err = 0;
+ int err = -ENOMEM;
+ struct ipv6_devconf *all, *dflt;
+
+ spin_lock_init(&net->ipv6.addrconf_hash_lock);
+ INIT_DEFERRABLE_WORK(&net->ipv6.addr_chk_work, addrconf_verify_work);
+ net->ipv6.inet6_addr_lst = kcalloc(IN6_ADDR_HSIZE,
+ sizeof(struct hlist_head),
+ GFP_KERNEL);
+ if (!net->ipv6.inet6_addr_lst)
+ goto err_alloc_addr;
+
+ all = kmemdup(&ipv6_devconf, sizeof(ipv6_devconf), GFP_KERNEL);
+ if (!all)
+ goto err_alloc_all;
+
+ dflt = kmemdup(&ipv6_devconf_dflt, sizeof(ipv6_devconf_dflt), GFP_KERNEL);
+ if (!dflt)
+ goto err_alloc_dflt;
+
+ if (!net_eq(net, &init_net)) {
+ switch (net_inherit_devconf()) {
+ case 1: /* copy from init_net */
+ memcpy(all, init_net.ipv6.devconf_all,
+ sizeof(ipv6_devconf));
+ memcpy(dflt, init_net.ipv6.devconf_dflt,
+ sizeof(ipv6_devconf_dflt));
+ break;
+ case 3: /* copy from the current netns */
+ memcpy(all, current->nsproxy->net_ns->ipv6.devconf_all,
+ sizeof(ipv6_devconf));
+ memcpy(dflt,
+ current->nsproxy->net_ns->ipv6.devconf_dflt,
+ sizeof(ipv6_devconf_dflt));
+ break;
+ case 0:
+ case 2:
+ /* use compiled values */
+ break;
+ }
+ }
- /* The addrconf netdev notifier requires that loopback_dev
- * has it's ipv6 private information allocated and setup
- * before it can bring up and give link-local addresses
- * to other devices which are up.
- *
- * Unfortunately, loopback_dev is not necessarily the first
- * entry in the global dev_base list of net devices. In fact,
- * it is likely to be the very last entry on that list.
- * So this causes the notifier registry below to try and
- * give link-local addresses to all devices besides loopback_dev
- * first, then loopback_dev, which cases all the non-loopback_dev
- * devices to fail to get a link-local address.
- *
- * So, as a temporary fix, allocate the ipv6 structure for
- * loopback_dev first by hand.
- * Longer term, all of the dependencies ipv6 has upon the loopback
- * device and it being up should be removed.
- */
- rtnl_lock();
- if (!ipv6_add_dev(&loopback_dev))
- err = -ENOMEM;
- rtnl_unlock();
- if (err)
- return err;
+ /* these will be inherited by all namespaces */
+ dflt->autoconf = ipv6_defaults.autoconf;
+ dflt->disable_ipv6 = ipv6_defaults.disable_ipv6;
- ip6_null_entry.rt6i_idev = in6_dev_get(&loopback_dev);
+ dflt->stable_secret.initialized = false;
+ all->stable_secret.initialized = false;
- register_netdevice_notifier(&ipv6_dev_notf);
+ net->ipv6.devconf_all = all;
+ net->ipv6.devconf_dflt = dflt;
- addrconf_verify(0);
- rtnetlink_links[PF_INET6] = inet6_rtnetlink_table;
#ifdef CONFIG_SYSCTL
- addrconf_sysctl.sysctl_header =
- register_sysctl_table(addrconf_sysctl.addrconf_root_dir, 0);
- addrconf_sysctl_register(NULL, &ipv6_devconf_dflt);
-#endif
+ err = __addrconf_sysctl_register(net, "all", NULL, all);
+ if (err < 0)
+ goto err_reg_all;
+ err = __addrconf_sysctl_register(net, "default", NULL, dflt);
+ if (err < 0)
+ goto err_reg_dflt;
+#endif
return 0;
+
+#ifdef CONFIG_SYSCTL
+err_reg_dflt:
+ __addrconf_sysctl_unregister(net, all, NETCONFA_IFINDEX_ALL);
+err_reg_all:
+ kfree(dflt);
+ net->ipv6.devconf_dflt = NULL;
+#endif
+err_alloc_dflt:
+ kfree(all);
+ net->ipv6.devconf_all = NULL;
+err_alloc_all:
+ kfree(net->ipv6.inet6_addr_lst);
+err_alloc_addr:
+ return err;
}
-void __exit addrconf_cleanup(void)
+static void __net_exit addrconf_exit_net(struct net *net)
{
- struct net_device *dev;
- struct inet6_dev *idev;
- struct inet6_ifaddr *ifa;
int i;
- unregister_netdevice_notifier(&ipv6_dev_notf);
-
- rtnetlink_links[PF_INET6] = NULL;
#ifdef CONFIG_SYSCTL
- addrconf_sysctl_unregister(&ipv6_devconf_dflt);
- addrconf_sysctl_unregister(&ipv6_devconf);
+ __addrconf_sysctl_unregister(net, net->ipv6.devconf_dflt,
+ NETCONFA_IFINDEX_DEFAULT);
+ __addrconf_sysctl_unregister(net, net->ipv6.devconf_all,
+ NETCONFA_IFINDEX_ALL);
#endif
+ kfree(net->ipv6.devconf_dflt);
+ net->ipv6.devconf_dflt = NULL;
+ kfree(net->ipv6.devconf_all);
+ net->ipv6.devconf_all = NULL;
- rtnl_lock();
-
+ cancel_delayed_work_sync(&net->ipv6.addr_chk_work);
/*
- * clean dev list.
+ * Check hash table, then free it.
*/
+ for (i = 0; i < IN6_ADDR_HSIZE; i++)
+ WARN_ON_ONCE(!hlist_empty(&net->ipv6.inet6_addr_lst[i]));
- for (dev=dev_base; dev; dev=dev->next) {
- if ((idev = __in6_dev_get(dev)) == NULL)
- continue;
- addrconf_ifdown(dev, 1);
+ kfree(net->ipv6.inet6_addr_lst);
+ net->ipv6.inet6_addr_lst = NULL;
+}
+
+static struct pernet_operations addrconf_ops = {
+ .init = addrconf_init_net,
+ .exit = addrconf_exit_net,
+};
+
+static struct rtnl_af_ops inet6_ops __read_mostly = {
+ .family = AF_INET6,
+ .fill_link_af = inet6_fill_link_af,
+ .get_link_af_size = inet6_get_link_af_size,
+ .validate_link_af = inet6_validate_link_af,
+ .set_link_af = inet6_set_link_af,
+};
+
+static const struct rtnl_msg_handler addrconf_rtnl_msg_handlers[] __initconst_or_module = {
+ {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETLINK,
+ .dumpit = inet6_dump_ifinfo, .flags = RTNL_FLAG_DUMP_UNLOCKED},
+ {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_NEWADDR,
+ .doit = inet6_rtm_newaddr, .flags = RTNL_FLAG_DOIT_PERNET},
+ {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_DELADDR,
+ .doit = inet6_rtm_deladdr, .flags = RTNL_FLAG_DOIT_PERNET},
+ {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETADDR,
+ .doit = inet6_rtm_getaddr, .dumpit = inet6_dump_ifaddr,
+ .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED},
+ {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETMULTICAST,
+ .dumpit = inet6_dump_ifmcaddr,
+ .flags = RTNL_FLAG_DUMP_UNLOCKED},
+ {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETANYCAST,
+ .dumpit = inet6_dump_ifacaddr,
+ .flags = RTNL_FLAG_DUMP_UNLOCKED},
+ {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETNETCONF,
+ .doit = inet6_netconf_get_devconf, .dumpit = inet6_netconf_dump_devconf,
+ .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED},
+};
+
+/*
+ * Init / cleanup code
+ */
+
+int __init addrconf_init(void)
+{
+ struct inet6_dev *idev;
+ int err;
+
+ err = ipv6_addr_label_init();
+ if (err < 0) {
+ pr_crit("%s: cannot initialize default policy table: %d\n",
+ __func__, err);
+ goto out;
}
- addrconf_ifdown(&loopback_dev, 2);
- /*
- * Check hash table.
- */
+ err = register_pernet_subsys(&addrconf_ops);
+ if (err < 0)
+ goto out_addrlabel;
- write_lock_bh(&addrconf_hash_lock);
- for (i=0; i < IN6_ADDR_HSIZE; i++) {
- for (ifa=inet6_addr_lst[i]; ifa; ) {
- struct inet6_ifaddr *bifa;
+ /* All works using addrconf_wq need to lock rtnl. */
+ addrconf_wq = create_singlethread_workqueue("ipv6_addrconf");
+ if (!addrconf_wq) {
+ err = -ENOMEM;
+ goto out_nowq;
+ }
- bifa = ifa;
- ifa = ifa->lst_next;
- printk(KERN_DEBUG "bug: IPv6 address leakage detected: ifa=%p\n", bifa);
- /* Do not free it; something is wrong.
- Now we can investigate it with debugger.
- */
- }
+ rtnl_net_lock(&init_net);
+ idev = ipv6_add_dev(blackhole_netdev);
+ rtnl_net_unlock(&init_net);
+ if (IS_ERR(idev)) {
+ err = PTR_ERR(idev);
+ goto errlo;
}
- write_unlock_bh(&addrconf_hash_lock);
- del_timer(&addr_chk_timer);
+ ip6_route_init_special_entries();
- rtnl_unlock();
+ register_netdevice_notifier(&ipv6_dev_notf);
-#ifdef CONFIG_PROC_FS
- proc_net_remove("if_inet6");
-#endif
+ addrconf_verify(&init_net);
+
+ err = rtnl_af_register(&inet6_ops);
+ if (err)
+ goto erraf;
+
+ err = rtnl_register_many(addrconf_rtnl_msg_handlers);
+ if (err)
+ goto errout;
+
+ err = ipv6_addr_label_rtnl_register();
+ if (err < 0)
+ goto errout;
+
+ return 0;
+errout:
+ rtnl_unregister_all(PF_INET6);
+ rtnl_af_unregister(&inet6_ops);
+erraf:
+ unregister_netdevice_notifier(&ipv6_dev_notf);
+errlo:
+ destroy_workqueue(addrconf_wq);
+out_nowq:
+ unregister_pernet_subsys(&addrconf_ops);
+out_addrlabel:
+ ipv6_addr_label_cleanup();
+out:
+ return err;
+}
+
+void addrconf_cleanup(void)
+{
+ struct net_device *dev;
+
+ unregister_netdevice_notifier(&ipv6_dev_notf);
+ unregister_pernet_subsys(&addrconf_ops);
+ ipv6_addr_label_cleanup();
+
+ rtnl_af_unregister(&inet6_ops);
+
+ rtnl_net_lock(&init_net);
+
+ /* clean dev list */
+ for_each_netdev(&init_net, dev) {
+ if (!__in6_dev_get_rtnl_net(dev))
+ continue;
+ addrconf_ifdown(dev, true);
+ }
+ addrconf_ifdown(init_net.loopback_dev, true);
+
+ rtnl_net_unlock(&init_net);
+
+ destroy_workqueue(addrconf_wq);
}
diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c
new file mode 100644
index 000000000000..c008d21925d7
--- /dev/null
+++ b/net/ipv6/addrconf_core.c
@@ -0,0 +1,280 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * IPv6 library code, needed by static components when full IPv6 support is
+ * not configured or static.
+ */
+
+#include <linux/export.h>
+#include <net/ipv6.h>
+#include <net/ipv6_stubs.h>
+#include <net/addrconf.h>
+#include <net/ip.h>
+
+/* if ipv6 module registers this function is used by xfrm to force all
+ * sockets to relookup their nodes - this is fairly expensive, be
+ * careful
+ */
+void (*__fib6_flush_trees)(struct net *);
+EXPORT_SYMBOL(__fib6_flush_trees);
+
+#define IPV6_ADDR_SCOPE_TYPE(scope) ((scope) << 16)
+
+static inline unsigned int ipv6_addr_scope2type(unsigned int scope)
+{
+ switch (scope) {
+ case IPV6_ADDR_SCOPE_NODELOCAL:
+ return (IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_NODELOCAL) |
+ IPV6_ADDR_LOOPBACK);
+ case IPV6_ADDR_SCOPE_LINKLOCAL:
+ return (IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_LINKLOCAL) |
+ IPV6_ADDR_LINKLOCAL);
+ case IPV6_ADDR_SCOPE_SITELOCAL:
+ return (IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_SITELOCAL) |
+ IPV6_ADDR_SITELOCAL);
+ }
+ return IPV6_ADDR_SCOPE_TYPE(scope);
+}
+
+int __ipv6_addr_type(const struct in6_addr *addr)
+{
+ __be32 st;
+
+ st = addr->s6_addr32[0];
+
+ /* Consider all addresses with the first three bits different of
+ 000 and 111 as unicasts.
+ */
+ if ((st & htonl(0xE0000000)) != htonl(0x00000000) &&
+ (st & htonl(0xE0000000)) != htonl(0xE0000000))
+ return (IPV6_ADDR_UNICAST |
+ IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL));
+
+ if ((st & htonl(0xFF000000)) == htonl(0xFF000000)) {
+ /* multicast */
+ /* addr-select 3.1 */
+ return (IPV6_ADDR_MULTICAST |
+ ipv6_addr_scope2type(IPV6_ADDR_MC_SCOPE(addr)));
+ }
+
+ if ((st & htonl(0xFFC00000)) == htonl(0xFE800000))
+ return (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST |
+ IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_LINKLOCAL)); /* addr-select 3.1 */
+ if ((st & htonl(0xFFC00000)) == htonl(0xFEC00000))
+ return (IPV6_ADDR_SITELOCAL | IPV6_ADDR_UNICAST |
+ IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_SITELOCAL)); /* addr-select 3.1 */
+ if ((st & htonl(0xFE000000)) == htonl(0xFC000000))
+ return (IPV6_ADDR_UNICAST |
+ IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); /* RFC 4193 */
+
+ if ((addr->s6_addr32[0] | addr->s6_addr32[1]) == 0) {
+ if (addr->s6_addr32[2] == 0) {
+ if (addr->s6_addr32[3] == 0)
+ return IPV6_ADDR_ANY;
+
+ if (addr->s6_addr32[3] == htonl(0x00000001))
+ return (IPV6_ADDR_LOOPBACK | IPV6_ADDR_UNICAST |
+ IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_LINKLOCAL)); /* addr-select 3.4 */
+
+ return (IPV6_ADDR_COMPATv4 | IPV6_ADDR_UNICAST |
+ IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); /* addr-select 3.3 */
+ }
+
+ if (addr->s6_addr32[2] == htonl(0x0000ffff))
+ return (IPV6_ADDR_MAPPED |
+ IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); /* addr-select 3.3 */
+ }
+
+ return (IPV6_ADDR_UNICAST |
+ IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); /* addr-select 3.4 */
+}
+EXPORT_SYMBOL(__ipv6_addr_type);
+
+static ATOMIC_NOTIFIER_HEAD(inet6addr_chain);
+static BLOCKING_NOTIFIER_HEAD(inet6addr_validator_chain);
+
+int register_inet6addr_notifier(struct notifier_block *nb)
+{
+ return atomic_notifier_chain_register(&inet6addr_chain, nb);
+}
+EXPORT_SYMBOL(register_inet6addr_notifier);
+
+int unregister_inet6addr_notifier(struct notifier_block *nb)
+{
+ return atomic_notifier_chain_unregister(&inet6addr_chain, nb);
+}
+EXPORT_SYMBOL(unregister_inet6addr_notifier);
+
+int inet6addr_notifier_call_chain(unsigned long val, void *v)
+{
+ return atomic_notifier_call_chain(&inet6addr_chain, val, v);
+}
+EXPORT_SYMBOL(inet6addr_notifier_call_chain);
+
+int register_inet6addr_validator_notifier(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_register(&inet6addr_validator_chain, nb);
+}
+EXPORT_SYMBOL(register_inet6addr_validator_notifier);
+
+int unregister_inet6addr_validator_notifier(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_unregister(&inet6addr_validator_chain,
+ nb);
+}
+EXPORT_SYMBOL(unregister_inet6addr_validator_notifier);
+
+int inet6addr_validator_notifier_call_chain(unsigned long val, void *v)
+{
+ return blocking_notifier_call_chain(&inet6addr_validator_chain, val, v);
+}
+EXPORT_SYMBOL(inet6addr_validator_notifier_call_chain);
+
+static struct dst_entry *eafnosupport_ipv6_dst_lookup_flow(struct net *net,
+ const struct sock *sk,
+ struct flowi6 *fl6,
+ const struct in6_addr *final_dst)
+{
+ return ERR_PTR(-EAFNOSUPPORT);
+}
+
+static int eafnosupport_ipv6_route_input(struct sk_buff *skb)
+{
+ return -EAFNOSUPPORT;
+}
+
+static struct fib6_table *eafnosupport_fib6_get_table(struct net *net, u32 id)
+{
+ return NULL;
+}
+
+static int
+eafnosupport_fib6_table_lookup(struct net *net, struct fib6_table *table,
+ int oif, struct flowi6 *fl6,
+ struct fib6_result *res, int flags)
+{
+ return -EAFNOSUPPORT;
+}
+
+static int
+eafnosupport_fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
+ struct fib6_result *res, int flags)
+{
+ return -EAFNOSUPPORT;
+}
+
+static void
+eafnosupport_fib6_select_path(const struct net *net, struct fib6_result *res,
+ struct flowi6 *fl6, int oif, bool have_oif_match,
+ const struct sk_buff *skb, int strict)
+{
+}
+
+static u32
+eafnosupport_ip6_mtu_from_fib6(const struct fib6_result *res,
+ const struct in6_addr *daddr,
+ const struct in6_addr *saddr)
+{
+ return 0;
+}
+
+static int eafnosupport_fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
+ struct fib6_config *cfg, gfp_t gfp_flags,
+ struct netlink_ext_ack *extack)
+{
+ NL_SET_ERR_MSG(extack, "IPv6 support not enabled in kernel");
+ return -EAFNOSUPPORT;
+}
+
+static int eafnosupport_ip6_del_rt(struct net *net, struct fib6_info *rt,
+ bool skip_notify)
+{
+ return -EAFNOSUPPORT;
+}
+
+static int eafnosupport_ipv6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
+ int (*output)(struct net *, struct sock *, struct sk_buff *))
+{
+ kfree_skb(skb);
+ return -EAFNOSUPPORT;
+}
+
+static struct net_device *eafnosupport_ipv6_dev_find(struct net *net, const struct in6_addr *addr,
+ struct net_device *dev)
+{
+ return ERR_PTR(-EAFNOSUPPORT);
+}
+
+const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) {
+ .ipv6_dst_lookup_flow = eafnosupport_ipv6_dst_lookup_flow,
+ .ipv6_route_input = eafnosupport_ipv6_route_input,
+ .fib6_get_table = eafnosupport_fib6_get_table,
+ .fib6_table_lookup = eafnosupport_fib6_table_lookup,
+ .fib6_lookup = eafnosupport_fib6_lookup,
+ .fib6_select_path = eafnosupport_fib6_select_path,
+ .ip6_mtu_from_fib6 = eafnosupport_ip6_mtu_from_fib6,
+ .fib6_nh_init = eafnosupport_fib6_nh_init,
+ .ip6_del_rt = eafnosupport_ip6_del_rt,
+ .ipv6_fragment = eafnosupport_ipv6_fragment,
+ .ipv6_dev_find = eafnosupport_ipv6_dev_find,
+};
+EXPORT_SYMBOL_GPL(ipv6_stub);
+
+/* IPv6 Wildcard Address and Loopback Address defined by RFC2553 */
+const struct in6_addr in6addr_loopback __aligned(BITS_PER_LONG/8)
+ = IN6ADDR_LOOPBACK_INIT;
+EXPORT_SYMBOL(in6addr_loopback);
+const struct in6_addr in6addr_any __aligned(BITS_PER_LONG/8)
+ = IN6ADDR_ANY_INIT;
+EXPORT_SYMBOL(in6addr_any);
+const struct in6_addr in6addr_linklocal_allnodes __aligned(BITS_PER_LONG/8)
+ = IN6ADDR_LINKLOCAL_ALLNODES_INIT;
+EXPORT_SYMBOL(in6addr_linklocal_allnodes);
+const struct in6_addr in6addr_linklocal_allrouters __aligned(BITS_PER_LONG/8)
+ = IN6ADDR_LINKLOCAL_ALLROUTERS_INIT;
+EXPORT_SYMBOL(in6addr_linklocal_allrouters);
+const struct in6_addr in6addr_interfacelocal_allnodes __aligned(BITS_PER_LONG/8)
+ = IN6ADDR_INTERFACELOCAL_ALLNODES_INIT;
+EXPORT_SYMBOL(in6addr_interfacelocal_allnodes);
+const struct in6_addr in6addr_interfacelocal_allrouters __aligned(BITS_PER_LONG/8)
+ = IN6ADDR_INTERFACELOCAL_ALLROUTERS_INIT;
+EXPORT_SYMBOL(in6addr_interfacelocal_allrouters);
+const struct in6_addr in6addr_sitelocal_allrouters __aligned(BITS_PER_LONG/8)
+ = IN6ADDR_SITELOCAL_ALLROUTERS_INIT;
+EXPORT_SYMBOL(in6addr_sitelocal_allrouters);
+
+static void snmp6_free_dev(struct inet6_dev *idev)
+{
+ kfree(idev->stats.icmpv6msgdev);
+ kfree(idev->stats.icmpv6dev);
+ free_percpu(idev->stats.ipv6);
+}
+
+static void in6_dev_finish_destroy_rcu(struct rcu_head *head)
+{
+ struct inet6_dev *idev = container_of(head, struct inet6_dev, rcu);
+
+ snmp6_free_dev(idev);
+ kfree(idev);
+}
+
+/* Nobody refers to this device, we may destroy it. */
+
+void in6_dev_finish_destroy(struct inet6_dev *idev)
+{
+ struct net_device *dev = idev->dev;
+
+ WARN_ON(!list_empty(&idev->addr_list));
+ WARN_ON(rcu_access_pointer(idev->mc_list));
+ WARN_ON(timer_pending(&idev->rs_timer));
+
+#ifdef NET_REFCNT_DEBUG
+ pr_debug("%s: %s\n", __func__, dev ? dev->name : "NIL");
+#endif
+ netdev_put(dev, &idev->dev_tracker);
+ if (!idev->dead) {
+ pr_warn("Freeing alive inet6 device %p\n", idev);
+ return;
+ }
+ call_rcu(&idev->rcu, in6_dev_finish_destroy_rcu);
+}
+EXPORT_SYMBOL(in6_dev_finish_destroy);
diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c
new file mode 100644
index 000000000000..567efd626ab4
--- /dev/null
+++ b/net/ipv6/addrlabel.c
@@ -0,0 +1,640 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * IPv6 Address Label subsystem
+ * for the IPv6 "Default" Source Address Selection
+ *
+ * Copyright (C)2007 USAGI/WIDE Project
+ */
+/*
+ * Author:
+ * YOSHIFUJI Hideaki @ USAGI/WIDE Project <yoshfuji@linux-ipv6.org>
+ */
+
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/rcupdate.h>
+#include <linux/in6.h>
+#include <linux/slab.h>
+#include <net/addrconf.h>
+#include <linux/if_addrlabel.h>
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+
+/*
+ * Policy Table
+ */
+struct ip6addrlbl_entry {
+ struct in6_addr prefix;
+ int prefixlen;
+ int ifindex;
+ int addrtype;
+ u32 label;
+ struct hlist_node list;
+ struct rcu_head rcu;
+};
+
+/*
+ * Default policy table (RFC6724 + extensions)
+ *
+ * prefix addr_type label
+ * -------------------------------------------------------------------------
+ * ::1/128 LOOPBACK 0
+ * ::/0 N/A 1
+ * 2002::/16 N/A 2
+ * ::/96 COMPATv4 3
+ * ::ffff:0:0/96 V4MAPPED 4
+ * fc00::/7 N/A 5 ULA (RFC 4193)
+ * 2001::/32 N/A 6 Teredo (RFC 4380)
+ * 2001:10::/28 N/A 7 ORCHID (RFC 4843)
+ * fec0::/10 N/A 11 Site-local
+ * (deprecated by RFC3879)
+ * 3ffe::/16 N/A 12 6bone
+ *
+ * Note: 0xffffffff is used if we do not have any policies.
+ * Note: Labels for ULA and 6to4 are different from labels listed in RFC6724.
+ */
+
+#define IPV6_ADDR_LABEL_DEFAULT 0xffffffffUL
+
+static const __net_initconst struct ip6addrlbl_init_table
+{
+ const struct in6_addr *prefix;
+ int prefixlen;
+ u32 label;
+} ip6addrlbl_init_table[] = {
+ { /* ::/0 */
+ .prefix = &in6addr_any,
+ .label = 1,
+ }, { /* fc00::/7 */
+ .prefix = &(struct in6_addr){ { { 0xfc } } } ,
+ .prefixlen = 7,
+ .label = 5,
+ }, { /* fec0::/10 */
+ .prefix = &(struct in6_addr){ { { 0xfe, 0xc0 } } },
+ .prefixlen = 10,
+ .label = 11,
+ }, { /* 2002::/16 */
+ .prefix = &(struct in6_addr){ { { 0x20, 0x02 } } },
+ .prefixlen = 16,
+ .label = 2,
+ }, { /* 3ffe::/16 */
+ .prefix = &(struct in6_addr){ { { 0x3f, 0xfe } } },
+ .prefixlen = 16,
+ .label = 12,
+ }, { /* 2001::/32 */
+ .prefix = &(struct in6_addr){ { { 0x20, 0x01 } } },
+ .prefixlen = 32,
+ .label = 6,
+ }, { /* 2001:10::/28 */
+ .prefix = &(struct in6_addr){ { { 0x20, 0x01, 0x00, 0x10 } } },
+ .prefixlen = 28,
+ .label = 7,
+ }, { /* ::ffff:0:0 */
+ .prefix = &(struct in6_addr){ { { [10] = 0xff, [11] = 0xff } } },
+ .prefixlen = 96,
+ .label = 4,
+ }, { /* ::/96 */
+ .prefix = &in6addr_any,
+ .prefixlen = 96,
+ .label = 3,
+ }, { /* ::1/128 */
+ .prefix = &in6addr_loopback,
+ .prefixlen = 128,
+ .label = 0,
+ }
+};
+
+/* Find label */
+static bool __ip6addrlbl_match(const struct ip6addrlbl_entry *p,
+ const struct in6_addr *addr,
+ int addrtype, int ifindex)
+{
+ if (p->ifindex && p->ifindex != ifindex)
+ return false;
+ if (p->addrtype && p->addrtype != addrtype)
+ return false;
+ if (!ipv6_prefix_equal(addr, &p->prefix, p->prefixlen))
+ return false;
+ return true;
+}
+
+static struct ip6addrlbl_entry *__ipv6_addr_label(struct net *net,
+ const struct in6_addr *addr,
+ int type, int ifindex)
+{
+ struct ip6addrlbl_entry *p;
+
+ hlist_for_each_entry_rcu(p, &net->ipv6.ip6addrlbl_table.head, list) {
+ if (__ip6addrlbl_match(p, addr, type, ifindex))
+ return p;
+ }
+ return NULL;
+}
+
+u32 ipv6_addr_label(struct net *net,
+ const struct in6_addr *addr, int type, int ifindex)
+{
+ u32 label;
+ struct ip6addrlbl_entry *p;
+
+ type &= IPV6_ADDR_MAPPED | IPV6_ADDR_COMPATv4 | IPV6_ADDR_LOOPBACK;
+
+ rcu_read_lock();
+ p = __ipv6_addr_label(net, addr, type, ifindex);
+ label = p ? p->label : IPV6_ADDR_LABEL_DEFAULT;
+ rcu_read_unlock();
+
+ net_dbg_ratelimited("%s(addr=%pI6, type=%d, ifindex=%d) => %08x\n", __func__, addr, type,
+ ifindex, label);
+
+ return label;
+}
+
+/* allocate one entry */
+static struct ip6addrlbl_entry *ip6addrlbl_alloc(const struct in6_addr *prefix,
+ int prefixlen, int ifindex,
+ u32 label)
+{
+ struct ip6addrlbl_entry *newp;
+ int addrtype;
+
+ net_dbg_ratelimited("%s(prefix=%pI6, prefixlen=%d, ifindex=%d, label=%u)\n", __func__,
+ prefix, prefixlen, ifindex, (unsigned int)label);
+
+ addrtype = ipv6_addr_type(prefix) & (IPV6_ADDR_MAPPED | IPV6_ADDR_COMPATv4 | IPV6_ADDR_LOOPBACK);
+
+ switch (addrtype) {
+ case IPV6_ADDR_MAPPED:
+ if (prefixlen > 96)
+ return ERR_PTR(-EINVAL);
+ if (prefixlen < 96)
+ addrtype = 0;
+ break;
+ case IPV6_ADDR_COMPATv4:
+ if (prefixlen != 96)
+ addrtype = 0;
+ break;
+ case IPV6_ADDR_LOOPBACK:
+ if (prefixlen != 128)
+ addrtype = 0;
+ break;
+ }
+
+ newp = kmalloc(sizeof(*newp), GFP_KERNEL);
+ if (!newp)
+ return ERR_PTR(-ENOMEM);
+
+ ipv6_addr_prefix(&newp->prefix, prefix, prefixlen);
+ newp->prefixlen = prefixlen;
+ newp->ifindex = ifindex;
+ newp->addrtype = addrtype;
+ newp->label = label;
+ INIT_HLIST_NODE(&newp->list);
+ return newp;
+}
+
+/* add a label */
+static int __ip6addrlbl_add(struct net *net, struct ip6addrlbl_entry *newp,
+ int replace)
+{
+ struct ip6addrlbl_entry *last = NULL, *p = NULL;
+ struct hlist_node *n;
+ int ret = 0;
+
+ net_dbg_ratelimited("%s(newp=%p, replace=%d)\n", __func__, newp, replace);
+
+ hlist_for_each_entry_safe(p, n, &net->ipv6.ip6addrlbl_table.head, list) {
+ if (p->prefixlen == newp->prefixlen &&
+ p->ifindex == newp->ifindex &&
+ ipv6_addr_equal(&p->prefix, &newp->prefix)) {
+ if (!replace) {
+ ret = -EEXIST;
+ goto out;
+ }
+ hlist_replace_rcu(&p->list, &newp->list);
+ kfree_rcu(p, rcu);
+ goto out;
+ } else if ((p->prefixlen == newp->prefixlen && !p->ifindex) ||
+ (p->prefixlen < newp->prefixlen)) {
+ hlist_add_before_rcu(&newp->list, &p->list);
+ goto out;
+ }
+ last = p;
+ }
+ if (last)
+ hlist_add_behind_rcu(&newp->list, &last->list);
+ else
+ hlist_add_head_rcu(&newp->list, &net->ipv6.ip6addrlbl_table.head);
+out:
+ if (!ret)
+ WRITE_ONCE(net->ipv6.ip6addrlbl_table.seq,
+ net->ipv6.ip6addrlbl_table.seq + 1);
+ return ret;
+}
+
+/* add a label */
+static int ip6addrlbl_add(struct net *net,
+ const struct in6_addr *prefix, int prefixlen,
+ int ifindex, u32 label, int replace)
+{
+ struct ip6addrlbl_entry *newp;
+ int ret = 0;
+
+ net_dbg_ratelimited("%s(prefix=%pI6, prefixlen=%d, ifindex=%d, label=%u, replace=%d)\n",
+ __func__, prefix, prefixlen, ifindex, (unsigned int)label, replace);
+
+ newp = ip6addrlbl_alloc(prefix, prefixlen, ifindex, label);
+ if (IS_ERR(newp))
+ return PTR_ERR(newp);
+ spin_lock(&net->ipv6.ip6addrlbl_table.lock);
+ ret = __ip6addrlbl_add(net, newp, replace);
+ spin_unlock(&net->ipv6.ip6addrlbl_table.lock);
+ if (ret)
+ kfree(newp);
+ return ret;
+}
+
+/* remove a label */
+static int __ip6addrlbl_del(struct net *net,
+ const struct in6_addr *prefix, int prefixlen,
+ int ifindex)
+{
+ struct ip6addrlbl_entry *p = NULL;
+ struct hlist_node *n;
+ int ret = -ESRCH;
+
+ net_dbg_ratelimited("%s(prefix=%pI6, prefixlen=%d, ifindex=%d)\n", __func__, prefix,
+ prefixlen, ifindex);
+
+ hlist_for_each_entry_safe(p, n, &net->ipv6.ip6addrlbl_table.head, list) {
+ if (p->prefixlen == prefixlen &&
+ p->ifindex == ifindex &&
+ ipv6_addr_equal(&p->prefix, prefix)) {
+ hlist_del_rcu(&p->list);
+ kfree_rcu(p, rcu);
+ ret = 0;
+ break;
+ }
+ }
+ return ret;
+}
+
+static int ip6addrlbl_del(struct net *net,
+ const struct in6_addr *prefix, int prefixlen,
+ int ifindex)
+{
+ struct in6_addr prefix_buf;
+ int ret;
+
+ net_dbg_ratelimited("%s(prefix=%pI6, prefixlen=%d, ifindex=%d)\n", __func__, prefix,
+ prefixlen, ifindex);
+
+ ipv6_addr_prefix(&prefix_buf, prefix, prefixlen);
+ spin_lock(&net->ipv6.ip6addrlbl_table.lock);
+ ret = __ip6addrlbl_del(net, &prefix_buf, prefixlen, ifindex);
+ spin_unlock(&net->ipv6.ip6addrlbl_table.lock);
+ return ret;
+}
+
+/* add default label */
+static int __net_init ip6addrlbl_net_init(struct net *net)
+{
+ struct ip6addrlbl_entry *p = NULL;
+ struct hlist_node *n;
+ int err;
+ int i;
+
+ spin_lock_init(&net->ipv6.ip6addrlbl_table.lock);
+ INIT_HLIST_HEAD(&net->ipv6.ip6addrlbl_table.head);
+
+ for (i = 0; i < ARRAY_SIZE(ip6addrlbl_init_table); i++) {
+ err = ip6addrlbl_add(net,
+ ip6addrlbl_init_table[i].prefix,
+ ip6addrlbl_init_table[i].prefixlen,
+ 0,
+ ip6addrlbl_init_table[i].label, 0);
+ if (err)
+ goto err_ip6addrlbl_add;
+ }
+ return 0;
+
+err_ip6addrlbl_add:
+ hlist_for_each_entry_safe(p, n, &net->ipv6.ip6addrlbl_table.head, list) {
+ hlist_del_rcu(&p->list);
+ kfree_rcu(p, rcu);
+ }
+ return err;
+}
+
+static void __net_exit ip6addrlbl_net_exit(struct net *net)
+{
+ struct ip6addrlbl_entry *p = NULL;
+ struct hlist_node *n;
+
+ /* Remove all labels belonging to the exiting net */
+ spin_lock(&net->ipv6.ip6addrlbl_table.lock);
+ hlist_for_each_entry_safe(p, n, &net->ipv6.ip6addrlbl_table.head, list) {
+ hlist_del_rcu(&p->list);
+ kfree_rcu(p, rcu);
+ }
+ spin_unlock(&net->ipv6.ip6addrlbl_table.lock);
+}
+
+static struct pernet_operations ipv6_addr_label_ops = {
+ .init = ip6addrlbl_net_init,
+ .exit = ip6addrlbl_net_exit,
+};
+
+int __init ipv6_addr_label_init(void)
+{
+ return register_pernet_subsys(&ipv6_addr_label_ops);
+}
+
+void ipv6_addr_label_cleanup(void)
+{
+ unregister_pernet_subsys(&ipv6_addr_label_ops);
+}
+
+static const struct nla_policy ifal_policy[IFAL_MAX+1] = {
+ [IFAL_ADDRESS] = { .len = sizeof(struct in6_addr), },
+ [IFAL_LABEL] = { .len = sizeof(u32), },
+};
+
+static bool addrlbl_ifindex_exists(struct net *net, int ifindex)
+{
+
+ struct net_device *dev;
+
+ rcu_read_lock();
+ dev = dev_get_by_index_rcu(net, ifindex);
+ rcu_read_unlock();
+
+ return dev != NULL;
+}
+
+static int ip6addrlbl_newdel(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(skb->sk);
+ struct ifaddrlblmsg *ifal;
+ struct nlattr *tb[IFAL_MAX+1];
+ struct in6_addr *pfx;
+ u32 label;
+ int err = 0;
+
+ err = nlmsg_parse_deprecated(nlh, sizeof(*ifal), tb, IFAL_MAX,
+ ifal_policy, extack);
+ if (err < 0)
+ return err;
+
+ ifal = nlmsg_data(nlh);
+
+ if (ifal->ifal_family != AF_INET6 ||
+ ifal->ifal_prefixlen > 128)
+ return -EINVAL;
+
+ if (!tb[IFAL_ADDRESS])
+ return -EINVAL;
+ pfx = nla_data(tb[IFAL_ADDRESS]);
+
+ if (!tb[IFAL_LABEL])
+ return -EINVAL;
+ label = nla_get_u32(tb[IFAL_LABEL]);
+ if (label == IPV6_ADDR_LABEL_DEFAULT)
+ return -EINVAL;
+
+ switch (nlh->nlmsg_type) {
+ case RTM_NEWADDRLABEL:
+ if (ifal->ifal_index &&
+ !addrlbl_ifindex_exists(net, ifal->ifal_index))
+ return -EINVAL;
+
+ err = ip6addrlbl_add(net, pfx, ifal->ifal_prefixlen,
+ ifal->ifal_index, label,
+ nlh->nlmsg_flags & NLM_F_REPLACE);
+ break;
+ case RTM_DELADDRLABEL:
+ err = ip6addrlbl_del(net, pfx, ifal->ifal_prefixlen,
+ ifal->ifal_index);
+ break;
+ default:
+ err = -EOPNOTSUPP;
+ }
+ return err;
+}
+
+static void ip6addrlbl_putmsg(struct nlmsghdr *nlh,
+ int prefixlen, int ifindex, u32 lseq)
+{
+ struct ifaddrlblmsg *ifal = nlmsg_data(nlh);
+ ifal->ifal_family = AF_INET6;
+ ifal->__ifal_reserved = 0;
+ ifal->ifal_prefixlen = prefixlen;
+ ifal->ifal_flags = 0;
+ ifal->ifal_index = ifindex;
+ ifal->ifal_seq = lseq;
+};
+
+static int ip6addrlbl_fill(struct sk_buff *skb,
+ const struct ip6addrlbl_entry *p,
+ u32 lseq,
+ u32 portid, u32 seq, int event,
+ unsigned int flags)
+{
+ struct nlmsghdr *nlh = nlmsg_put(skb, portid, seq, event,
+ sizeof(struct ifaddrlblmsg), flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ ip6addrlbl_putmsg(nlh, p->prefixlen, p->ifindex, lseq);
+
+ if (nla_put_in6_addr(skb, IFAL_ADDRESS, &p->prefix) < 0 ||
+ nla_put_u32(skb, IFAL_LABEL, p->label) < 0) {
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+ }
+
+ nlmsg_end(skb, nlh);
+ return 0;
+}
+
+static int ip6addrlbl_valid_dump_req(const struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct ifaddrlblmsg *ifal;
+
+ ifal = nlmsg_payload(nlh, sizeof(*ifal));
+ if (!ifal) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid header for address label dump request");
+ return -EINVAL;
+ }
+
+ if (ifal->__ifal_reserved || ifal->ifal_prefixlen ||
+ ifal->ifal_flags || ifal->ifal_index || ifal->ifal_seq) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for address label dump request");
+ return -EINVAL;
+ }
+
+ if (nlmsg_attrlen(nlh, sizeof(*ifal))) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid data after header for address label dump request");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int ip6addrlbl_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ const struct nlmsghdr *nlh = cb->nlh;
+ struct net *net = sock_net(skb->sk);
+ struct ip6addrlbl_entry *p;
+ int idx = 0, s_idx = cb->args[0];
+ int err = 0;
+ u32 lseq;
+
+ if (cb->strict_check) {
+ err = ip6addrlbl_valid_dump_req(nlh, cb->extack);
+ if (err < 0)
+ return err;
+ }
+
+ rcu_read_lock();
+ lseq = READ_ONCE(net->ipv6.ip6addrlbl_table.seq);
+ hlist_for_each_entry_rcu(p, &net->ipv6.ip6addrlbl_table.head, list) {
+ if (idx >= s_idx) {
+ err = ip6addrlbl_fill(skb, p,
+ lseq,
+ NETLINK_CB(cb->skb).portid,
+ nlh->nlmsg_seq,
+ RTM_NEWADDRLABEL,
+ NLM_F_MULTI);
+ if (err < 0)
+ break;
+ }
+ idx++;
+ }
+ rcu_read_unlock();
+ cb->args[0] = idx;
+ return err;
+}
+
+static inline int ip6addrlbl_msgsize(void)
+{
+ return NLMSG_ALIGN(sizeof(struct ifaddrlblmsg))
+ + nla_total_size(16) /* IFAL_ADDRESS */
+ + nla_total_size(4); /* IFAL_LABEL */
+}
+
+static int ip6addrlbl_valid_get_req(struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ struct nlattr **tb,
+ struct netlink_ext_ack *extack)
+{
+ struct ifaddrlblmsg *ifal;
+ int i, err;
+
+ ifal = nlmsg_payload(nlh, sizeof(*ifal));
+ if (!ifal) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid header for addrlabel get request");
+ return -EINVAL;
+ }
+
+ if (!netlink_strict_get_check(skb))
+ return nlmsg_parse_deprecated(nlh, sizeof(*ifal), tb,
+ IFAL_MAX, ifal_policy, extack);
+
+ if (ifal->__ifal_reserved || ifal->ifal_flags || ifal->ifal_seq) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for addrlabel get request");
+ return -EINVAL;
+ }
+
+ err = nlmsg_parse_deprecated_strict(nlh, sizeof(*ifal), tb, IFAL_MAX,
+ ifal_policy, extack);
+ if (err)
+ return err;
+
+ for (i = 0; i <= IFAL_MAX; i++) {
+ if (!tb[i])
+ continue;
+
+ switch (i) {
+ case IFAL_ADDRESS:
+ break;
+ default:
+ NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in addrlabel get request");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(in_skb->sk);
+ struct ifaddrlblmsg *ifal;
+ struct nlattr *tb[IFAL_MAX+1];
+ struct in6_addr *addr;
+ u32 lseq;
+ int err = 0;
+ struct ip6addrlbl_entry *p;
+ struct sk_buff *skb;
+
+ err = ip6addrlbl_valid_get_req(in_skb, nlh, tb, extack);
+ if (err < 0)
+ return err;
+
+ ifal = nlmsg_data(nlh);
+
+ if (ifal->ifal_family != AF_INET6 ||
+ ifal->ifal_prefixlen != 128)
+ return -EINVAL;
+
+ if (ifal->ifal_index &&
+ !addrlbl_ifindex_exists(net, ifal->ifal_index))
+ return -EINVAL;
+
+ if (!tb[IFAL_ADDRESS])
+ return -EINVAL;
+ addr = nla_data(tb[IFAL_ADDRESS]);
+
+ skb = nlmsg_new(ip6addrlbl_msgsize(), GFP_KERNEL);
+ if (!skb)
+ return -ENOBUFS;
+
+ err = -ESRCH;
+
+ rcu_read_lock();
+ p = __ipv6_addr_label(net, addr, ipv6_addr_type(addr), ifal->ifal_index);
+ lseq = READ_ONCE(net->ipv6.ip6addrlbl_table.seq);
+ if (p)
+ err = ip6addrlbl_fill(skb, p, lseq,
+ NETLINK_CB(in_skb).portid,
+ nlh->nlmsg_seq,
+ RTM_NEWADDRLABEL, 0);
+ rcu_read_unlock();
+
+ if (err < 0) {
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ } else {
+ err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
+ }
+ return err;
+}
+
+static const struct rtnl_msg_handler ipv6_adddr_label_rtnl_msg_handlers[] __initconst_or_module = {
+ {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_NEWADDRLABEL,
+ .doit = ip6addrlbl_newdel, .flags = RTNL_FLAG_DOIT_UNLOCKED},
+ {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_DELADDRLABEL,
+ .doit = ip6addrlbl_newdel, .flags = RTNL_FLAG_DOIT_UNLOCKED},
+ {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETADDRLABEL,
+ .doit = ip6addrlbl_get, .dumpit = ip6addrlbl_dump,
+ .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED},
+};
+
+int __init ipv6_addr_label_rtnl_register(void)
+{
+ return rtnl_register_many(ipv6_adddr_label_rtnl_msg_handlers);
+}
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 87c8f54872b7..b705751eb73c 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -1,25 +1,20 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* PF_INET6 socket protocol family
- * Linux INET6 implementation
+ * Linux INET6 implementation
*
* Authors:
- * Pedro Roque <roque@di.fc.ul.pt>
+ * Pedro Roque <roque@di.fc.ul.pt>
*
* Adapted from linux/net/ipv4/af_inet.c
*
- * $Id: af_inet6.c,v 1.66 2002/02/01 22:01:04 davem Exp $
- *
- * Fixes:
+ * Fixes:
* piggy, Karl Knutson : Socket protocol table
- * Hideaki YOSHIFUJI : sin6_scope_id support
- * Arnaldo Melo : check proc_net_create return, cleanups
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
+ * Hideaki YOSHIFUJI : sin6_scope_id support
+ * Arnaldo Melo : check proc_net_create return, cleanups
*/
+#define pr_fmt(fmt) "IPv6: " fmt
#include <linux/module.h>
#include <linux/capability.h>
@@ -28,7 +23,6 @@
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
-#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
@@ -39,11 +33,11 @@
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/init.h>
+#include <linux/slab.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/icmpv6.h>
-#include <linux/smp_lock.h>
#include <linux/netfilter_ipv6.h>
#include <net/ip.h>
@@ -51,62 +45,100 @@
#include <net/udp.h>
#include <net/udplite.h>
#include <net/tcp.h>
-#include <net/ipip.h>
+#include <net/ping.h>
#include <net/protocol.h>
#include <net/inet_common.h>
+#include <net/route.h>
#include <net/transp_v6.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
+#include <net/ipv6_stubs.h>
+#include <net/ndisc.h>
#ifdef CONFIG_IPV6_TUNNEL
#include <net/ip6_tunnel.h>
#endif
-#ifdef CONFIG_IPV6_MIP6
-#include <net/mip6.h>
-#endif
+#include <net/calipso.h>
+#include <net/seg6.h>
+#include <net/rpl.h>
+#include <net/compat.h>
+#include <net/xfrm.h>
+#include <net/ioam6.h>
+#include <net/rawv6.h>
+#include <net/rps.h>
-#include <asm/uaccess.h>
-#include <asm/system.h>
+#include <linux/uaccess.h>
+#include <linux/mroute6.h>
+
+#include "ip6_offload.h"
MODULE_AUTHOR("Cast of dozens");
MODULE_DESCRIPTION("IPv6 protocol stack for Linux");
MODULE_LICENSE("GPL");
-int sysctl_ipv6_bindv6only __read_mostly;
-
-/* The inetsw table contains everything that inet_create needs to
+/* The inetsw6 table contains everything that inet6_create needs to
* build a new socket.
*/
static struct list_head inetsw6[SOCK_MAX];
static DEFINE_SPINLOCK(inetsw6_lock);
-static __inline__ struct ipv6_pinfo *inet6_sk_generic(struct sock *sk)
+struct ipv6_params ipv6_defaults = {
+ .disable_ipv6 = 0,
+ .autoconf = 1,
+};
+
+static int disable_ipv6_mod;
+
+module_param_named(disable, disable_ipv6_mod, int, 0444);
+MODULE_PARM_DESC(disable, "Disable IPv6 module such that it is non-functional");
+
+module_param_named(disable_ipv6, ipv6_defaults.disable_ipv6, int, 0444);
+MODULE_PARM_DESC(disable_ipv6, "Disable IPv6 on all interfaces");
+
+module_param_named(autoconf, ipv6_defaults.autoconf, int, 0444);
+MODULE_PARM_DESC(autoconf, "Enable IPv6 address autoconfiguration on all interfaces");
+
+bool ipv6_mod_enabled(void)
{
- const int offset = sk->sk_prot->obj_size - sizeof(struct ipv6_pinfo);
+ return disable_ipv6_mod == 0;
+}
+EXPORT_SYMBOL_GPL(ipv6_mod_enabled);
+
+static struct ipv6_pinfo *inet6_sk_generic(struct sock *sk)
+{
+ const int offset = sk->sk_prot->ipv6_pinfo_offset;
return (struct ipv6_pinfo *)(((u8 *)sk) + offset);
}
-static int inet6_create(struct socket *sock, int protocol)
+void inet6_sock_destruct(struct sock *sk)
+{
+ inet6_cleanup_sock(sk);
+ inet_sock_destruct(sk);
+}
+EXPORT_SYMBOL_GPL(inet6_sock_destruct);
+
+static int inet6_create(struct net *net, struct socket *sock, int protocol,
+ int kern)
{
struct inet_sock *inet;
struct ipv6_pinfo *np;
struct sock *sk;
- struct list_head *p;
struct inet_protosw *answer;
struct proto *answer_prot;
unsigned char answer_flags;
- char answer_no_check;
int try_loading_module = 0;
int err;
+ if (protocol < 0 || protocol >= IPPROTO_MAX)
+ return -EINVAL;
+
/* Look for the requested type/protocol pair. */
- answer = NULL;
lookup_protocol:
err = -ESOCKTNOSUPPORT;
rcu_read_lock();
- list_for_each_rcu(p, &inetsw6[sock->type]) {
- answer = list_entry(p, struct inet_protosw, list);
+ list_for_each_entry_rcu(answer, &inetsw6[sock->type], list) {
+ err = 0;
/* Check the non-wild match. */
if (protocol == answer->protocol) {
if (protocol != IPPROTO_IP)
@@ -121,10 +153,9 @@ lookup_protocol:
break;
}
err = -EPROTONOSUPPORT;
- answer = NULL;
}
- if (!answer) {
+ if (err) {
if (try_loading_module < 2) {
rcu_read_unlock();
/*
@@ -147,39 +178,41 @@ lookup_protocol:
}
err = -EPERM;
- if (answer->capability > 0 && !capable(answer->capability))
+ if (sock->type == SOCK_RAW && !kern &&
+ !ns_capable(net->user_ns, CAP_NET_RAW))
goto out_rcu_unlock;
sock->ops = answer->ops;
answer_prot = answer->prot;
- answer_no_check = answer->no_check;
answer_flags = answer->flags;
rcu_read_unlock();
- BUG_TRAP(answer_prot->slab != NULL);
+ WARN_ON(!answer_prot->slab);
err = -ENOBUFS;
- sk = sk_alloc(PF_INET6, GFP_KERNEL, answer_prot, 1);
- if (sk == NULL)
+ sk = sk_alloc(net, PF_INET6, GFP_KERNEL, answer_prot, kern);
+ if (!sk)
goto out;
sock_init_data(sock, sk);
err = 0;
- sk->sk_no_check = answer_no_check;
if (INET_PROTOSW_REUSE & answer_flags)
- sk->sk_reuse = 1;
+ sk->sk_reuse = SK_CAN_REUSE;
+
+ if (INET_PROTOSW_ICSK & answer_flags)
+ inet_init_csk_locks(sk);
inet = inet_sk(sk);
- inet->is_icsk = INET_PROTOSW_ICSK & answer_flags;
+ inet_assign_bit(IS_ICSK, sk, INET_PROTOSW_ICSK & answer_flags);
if (SOCK_RAW == sock->type) {
- inet->num = protocol;
+ inet->inet_num = protocol;
if (IPPROTO_RAW == protocol)
- inet->hdrincl = 1;
+ inet_set_bit(HDRINCL, sk);
}
- sk->sk_destruct = inet_sock_destruct;
+ sk->sk_destruct = inet6_sock_destruct;
sk->sk_family = PF_INET6;
sk->sk_protocol = protocol;
@@ -187,97 +220,127 @@ lookup_protocol:
inet_sk(sk)->pinet6 = np = inet6_sk_generic(sk);
np->hop_limit = -1;
- np->mcast_hops = -1;
- np->mc_loop = 1;
+ np->mcast_hops = IPV6_DEFAULT_MCASTHOPS;
+ inet6_set_bit(MC6_LOOP, sk);
+ inet6_set_bit(MC6_ALL, sk);
np->pmtudisc = IPV6_PMTUDISC_WANT;
- np->ipv6only = sysctl_ipv6_bindv6only;
-
+ inet6_assign_bit(REPFLOW, sk, net->ipv6.sysctl.flowlabel_reflect &
+ FLOWLABEL_REFLECT_ESTABLISHED);
+ sk->sk_ipv6only = net->ipv6.sysctl.bindv6only;
+ sk->sk_txrehash = READ_ONCE(net->core.sysctl_txrehash);
+
/* Init the ipv4 part of the socket since we can have sockets
* using v6 API for ipv4.
*/
inet->uc_ttl = -1;
- inet->mc_loop = 1;
+ inet_set_bit(MC_LOOP, sk);
inet->mc_ttl = 1;
inet->mc_index = 0;
- inet->mc_list = NULL;
+ RCU_INIT_POINTER(inet->mc_list, NULL);
+ inet->rcv_tos = 0;
- if (ipv4_config.no_pmtu_disc)
+ if (READ_ONCE(net->ipv4.sysctl_ip_no_pmtu_disc))
inet->pmtudisc = IP_PMTUDISC_DONT;
else
inet->pmtudisc = IP_PMTUDISC_WANT;
- /*
- * Increment only the relevant sk_prot->socks debug field, this changes
- * the previous behaviour of incrementing both the equivalent to
- * answer->prot->socks (inet6_sock_nr) and inet_sock_nr.
- *
- * This allows better debug granularity as we'll know exactly how many
- * UDPv6, TCPv6, etc socks were allocated, not the sum of all IPv6
- * transport protocol socks. -acme
- */
- sk_refcnt_debug_inc(sk);
- if (inet->num) {
+ if (inet->inet_num) {
/* It assumes that any protocol which allows
* the user to assign a number at socket
* creation time automatically shares.
*/
- inet->sport = htons(inet->num);
- sk->sk_prot->hash(sk);
+ inet->inet_sport = htons(inet->inet_num);
+ err = sk->sk_prot->hash(sk);
+ if (err)
+ goto out_sk_release;
}
if (sk->sk_prot->init) {
err = sk->sk_prot->init(sk);
- if (err) {
- sk_common_release(sk);
- goto out;
- }
+ if (err)
+ goto out_sk_release;
+ }
+
+ if (!kern) {
+ err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk);
+ if (err)
+ goto out_sk_release;
}
out:
return err;
out_rcu_unlock:
rcu_read_unlock();
goto out;
+out_sk_release:
+ sk_common_release(sk);
+ sock->sk = NULL;
+ goto out;
}
-
-/* bind for INET6 API */
-int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+static int __inet6_bind(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len,
+ u32 flags)
{
- struct sockaddr_in6 *addr=(struct sockaddr_in6 *)uaddr;
- struct sock *sk = sock->sk;
+ struct sockaddr_in6 *addr = (struct sockaddr_in6 *)uaddr;
struct inet_sock *inet = inet_sk(sk);
struct ipv6_pinfo *np = inet6_sk(sk);
+ struct net *net = sock_net(sk);
__be32 v4addr = 0;
unsigned short snum;
+ bool saved_ipv6only;
int addr_type = 0;
int err = 0;
- /* If the socket has its own bind function then use it. */
- if (sk->sk_prot->bind)
- return sk->sk_prot->bind(sk, uaddr, addr_len);
+ if (addr->sin6_family != AF_INET6)
+ return -EAFNOSUPPORT;
- if (addr_len < SIN6_LEN_RFC2133)
- return -EINVAL;
addr_type = ipv6_addr_type(&addr->sin6_addr);
- if ((addr_type & IPV6_ADDR_MULTICAST) && sock->type == SOCK_STREAM)
+ if ((addr_type & IPV6_ADDR_MULTICAST) && sk->sk_type == SOCK_STREAM)
return -EINVAL;
snum = ntohs(addr->sin6_port);
- if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))
+ if (!(flags & BIND_NO_CAP_NET_BIND_SERVICE) &&
+ snum && inet_port_requires_bind_service(net, snum) &&
+ !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
return -EACCES;
- lock_sock(sk);
+ if (flags & BIND_WITH_LOCK)
+ lock_sock(sk);
/* Check these errors (active socket, double bind). */
- if (sk->sk_state != TCP_CLOSE || inet->num) {
+ if (sk->sk_state != TCP_CLOSE || inet->inet_num) {
err = -EINVAL;
goto out;
}
/* Check if the address belongs to the host. */
if (addr_type == IPV6_ADDR_MAPPED) {
+ struct net_device *dev = NULL;
+ int chk_addr_ret;
+
+ /* Binding to v4-mapped address on a v6-only socket
+ * makes no sense
+ */
+ if (ipv6_only_sock(sk)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ rcu_read_lock();
+ if (sk->sk_bound_dev_if) {
+ dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if);
+ if (!dev) {
+ err = -ENODEV;
+ goto out_unlock;
+ }
+ }
+
+ /* Reproduce AF_INET checks to make the bindings consistent */
v4addr = addr->sin6_addr.s6_addr32[3];
- if (inet_addr_type(v4addr) != RTN_LOCAL) {
+ chk_addr_ret = inet_addr_type_dev_table(net, dev, v4addr);
+ rcu_read_unlock();
+
+ if (!inet_addr_valid_or_nonlocal(net, inet, v4addr,
+ chk_addr_ret)) {
err = -EADDRNOTAVAIL;
goto out;
}
@@ -285,7 +348,8 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
if (addr_type != IPV6_ADDR_ANY) {
struct net_device *dev = NULL;
- if (addr_type & IPV6_ADDR_LINKLOCAL) {
+ rcu_read_lock();
+ if (__ipv6_addr_needs_scope_id(addr_type)) {
if (addr_len >= sizeof(struct sockaddr_in6) &&
addr->sin6_scope_id) {
/* Override any existing binding, if another one
@@ -293,16 +357,19 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
*/
sk->sk_bound_dev_if = addr->sin6_scope_id;
}
-
+
/* Binding to link-local address requires an interface */
if (!sk->sk_bound_dev_if) {
err = -EINVAL;
- goto out;
+ goto out_unlock;
}
- dev = dev_get_by_index(sk->sk_bound_dev_if);
+ }
+
+ if (sk->sk_bound_dev_if) {
+ dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if);
if (!dev) {
err = -ENODEV;
- goto out;
+ goto out_unlock;
}
}
@@ -311,50 +378,104 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
*/
v4addr = LOOPBACK4_IPV6;
if (!(addr_type & IPV6_ADDR_MULTICAST)) {
- if (!ipv6_chk_addr(&addr->sin6_addr, dev, 0)) {
- if (dev)
- dev_put(dev);
+ if (!ipv6_can_nonlocal_bind(net, inet) &&
+ !ipv6_chk_addr(net, &addr->sin6_addr,
+ dev, 0)) {
err = -EADDRNOTAVAIL;
- goto out;
+ goto out_unlock;
}
}
- if (dev)
- dev_put(dev);
+ rcu_read_unlock();
}
}
- inet->rcv_saddr = v4addr;
- inet->saddr = v4addr;
+ inet->inet_rcv_saddr = v4addr;
+ inet->inet_saddr = v4addr;
+
+ sk->sk_v6_rcv_saddr = addr->sin6_addr;
- ipv6_addr_copy(&np->rcv_saddr, &addr->sin6_addr);
-
if (!(addr_type & IPV6_ADDR_MULTICAST))
- ipv6_addr_copy(&np->saddr, &addr->sin6_addr);
+ np->saddr = addr->sin6_addr;
+
+ saved_ipv6only = sk->sk_ipv6only;
+ if (addr_type != IPV6_ADDR_ANY && addr_type != IPV6_ADDR_MAPPED)
+ sk->sk_ipv6only = 1;
/* Make sure we are allowed to bind here. */
- if (sk->sk_prot->get_port(sk, snum)) {
- inet_reset_saddr(sk);
- err = -EADDRINUSE;
- goto out;
+ if (snum || !(inet_test_bit(BIND_ADDRESS_NO_PORT, sk) ||
+ (flags & BIND_FORCE_ADDRESS_NO_PORT))) {
+ err = sk->sk_prot->get_port(sk, snum);
+ if (err) {
+ sk->sk_ipv6only = saved_ipv6only;
+ inet_reset_saddr(sk);
+ goto out;
+ }
+ if (!(flags & BIND_FROM_BPF)) {
+ err = BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk);
+ if (err) {
+ sk->sk_ipv6only = saved_ipv6only;
+ inet_reset_saddr(sk);
+ if (sk->sk_prot->put_port)
+ sk->sk_prot->put_port(sk);
+ goto out;
+ }
+ }
}
if (addr_type != IPV6_ADDR_ANY)
sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
if (snum)
sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
- inet->sport = htons(inet->num);
- inet->dport = 0;
- inet->daddr = 0;
+ inet->inet_sport = htons(inet->inet_num);
+ inet->inet_dport = 0;
+ inet->inet_daddr = 0;
out:
- release_sock(sk);
+ if (flags & BIND_WITH_LOCK)
+ release_sock(sk);
return err;
+out_unlock:
+ rcu_read_unlock();
+ goto out;
+}
+
+int inet6_bind_sk(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len)
+{
+ u32 flags = BIND_WITH_LOCK;
+ const struct proto *prot;
+ int err = 0;
+
+ /* IPV6_ADDRFORM can change sk->sk_prot under us. */
+ prot = READ_ONCE(sk->sk_prot);
+ /* If the socket has its own bind function then use it. */
+ if (prot->bind)
+ return prot->bind(sk, uaddr, addr_len);
+
+ if (addr_len < SIN6_LEN_RFC2133)
+ return -EINVAL;
+
+ /* BPF prog is run before any checks are done so that if the prog
+ * changes context in a wrong way it will be caught.
+ */
+ err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, &addr_len,
+ CGROUP_INET6_BIND, &flags);
+ if (err)
+ return err;
+
+ return __inet6_bind(sk, uaddr, addr_len, flags);
}
+/* bind for INET6 API */
+int inet6_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len)
+{
+ return inet6_bind_sk(sock->sk, uaddr, addr_len);
+}
+EXPORT_SYMBOL(inet6_bind);
+
int inet6_release(struct socket *sock)
{
struct sock *sk = sock->sk;
- if (sk == NULL)
+ if (!sk)
return -EINVAL;
/* Free mc lists */
@@ -365,8 +486,9 @@ int inet6_release(struct socket *sock)
return inet_release(sock);
}
+EXPORT_SYMBOL(inet6_release);
-int inet6_destroy_sock(struct sock *sk)
+void inet6_cleanup_sock(struct sock *sk)
{
struct ipv6_pinfo *np = inet6_sk(sk);
struct sk_buff *skb;
@@ -374,88 +496,190 @@ int inet6_destroy_sock(struct sock *sk)
/* Release rx options */
- if ((skb = xchg(&np->pktoptions, NULL)) != NULL)
- kfree_skb(skb);
+ skb = xchg(&np->pktoptions, NULL);
+ kfree_skb(skb);
+
+ skb = xchg(&np->rxpmtu, NULL);
+ kfree_skb(skb);
/* Free flowlabels */
fl6_free_socklist(sk);
/* Free tx options */
- if ((opt = xchg(&np->opt, NULL)) != NULL)
- sock_kfree_s(sk, opt, opt->tot_len);
-
- return 0;
+ opt = unrcu_pointer(xchg(&np->opt, NULL));
+ if (opt) {
+ atomic_sub(opt->tot_len, &sk->sk_omem_alloc);
+ txopt_put(opt);
+ }
}
-
-EXPORT_SYMBOL_GPL(inet6_destroy_sock);
+EXPORT_SYMBOL_GPL(inet6_cleanup_sock);
/*
* This does both peername and sockname.
*/
-
int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
- int *uaddr_len, int peer)
+ int peer)
{
- struct sockaddr_in6 *sin=(struct sockaddr_in6 *)uaddr;
+ struct sockaddr_in6 *sin = (struct sockaddr_in6 *)uaddr;
+ int sin_addr_len = sizeof(*sin);
struct sock *sk = sock->sk;
struct inet_sock *inet = inet_sk(sk);
struct ipv6_pinfo *np = inet6_sk(sk);
-
+
sin->sin6_family = AF_INET6;
sin->sin6_flowinfo = 0;
sin->sin6_scope_id = 0;
+ lock_sock(sk);
if (peer) {
- if (!inet->dport)
+ if (!inet->inet_dport ||
+ (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) &&
+ peer == 1)) {
+ release_sock(sk);
return -ENOTCONN;
- if (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) &&
- peer == 1)
- return -ENOTCONN;
- sin->sin6_port = inet->dport;
- ipv6_addr_copy(&sin->sin6_addr, &np->daddr);
- if (np->sndflow)
+ }
+ sin->sin6_port = inet->inet_dport;
+ sin->sin6_addr = sk->sk_v6_daddr;
+ if (inet6_test_bit(SNDFLOW, sk))
sin->sin6_flowinfo = np->flow_label;
+ BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, &sin_addr_len,
+ CGROUP_INET6_GETPEERNAME);
} else {
- if (ipv6_addr_any(&np->rcv_saddr))
- ipv6_addr_copy(&sin->sin6_addr, &np->saddr);
+ if (ipv6_addr_any(&sk->sk_v6_rcv_saddr))
+ sin->sin6_addr = np->saddr;
else
- ipv6_addr_copy(&sin->sin6_addr, &np->rcv_saddr);
-
- sin->sin6_port = inet->sport;
+ sin->sin6_addr = sk->sk_v6_rcv_saddr;
+ sin->sin6_port = inet->inet_sport;
+ BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, &sin_addr_len,
+ CGROUP_INET6_GETSOCKNAME);
}
- if (ipv6_addr_type(&sin->sin6_addr) & IPV6_ADDR_LINKLOCAL)
- sin->sin6_scope_id = sk->sk_bound_dev_if;
- *uaddr_len = sizeof(*sin);
- return(0);
+ sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr,
+ sk->sk_bound_dev_if);
+ release_sock(sk);
+ return sin_addr_len;
}
+EXPORT_SYMBOL(inet6_getname);
int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
+ void __user *argp = (void __user *)arg;
struct sock *sk = sock->sk;
+ struct net *net = sock_net(sk);
+ const struct proto *prot;
- switch(cmd)
- {
- case SIOCGSTAMP:
- return sock_get_timestamp(sk, (struct timeval __user *)arg);
-
+ switch (cmd) {
case SIOCADDRT:
- case SIOCDELRT:
-
- return(ipv6_route_ioctl(cmd,(void __user *)arg));
+ case SIOCDELRT: {
+ struct in6_rtmsg rtmsg;
+ if (copy_from_user(&rtmsg, argp, sizeof(rtmsg)))
+ return -EFAULT;
+ return ipv6_route_ioctl(net, cmd, &rtmsg);
+ }
case SIOCSIFADDR:
- return addrconf_add_ifaddr((void __user *) arg);
+ return addrconf_add_ifaddr(net, argp);
case SIOCDIFADDR:
- return addrconf_del_ifaddr((void __user *) arg);
+ return addrconf_del_ifaddr(net, argp);
case SIOCSIFDSTADDR:
- return addrconf_set_dstaddr((void __user *) arg);
+ return addrconf_set_dstaddr(net, argp);
default:
- if (!sk->sk_prot->ioctl)
+ /* IPV6_ADDRFORM can change sk->sk_prot under us. */
+ prot = READ_ONCE(sk->sk_prot);
+ if (!prot->ioctl)
return -ENOIOCTLCMD;
- return sk->sk_prot->ioctl(sk, cmd, arg);
+ return sk_ioctl(sk, cmd, (void __user *)arg);
}
/*NOTREACHED*/
- return(0);
+ return 0;
+}
+EXPORT_SYMBOL(inet6_ioctl);
+
+#ifdef CONFIG_COMPAT
+struct compat_in6_rtmsg {
+ struct in6_addr rtmsg_dst;
+ struct in6_addr rtmsg_src;
+ struct in6_addr rtmsg_gateway;
+ u32 rtmsg_type;
+ u16 rtmsg_dst_len;
+ u16 rtmsg_src_len;
+ u32 rtmsg_metric;
+ u32 rtmsg_info;
+ u32 rtmsg_flags;
+ s32 rtmsg_ifindex;
+};
+
+static int inet6_compat_routing_ioctl(struct sock *sk, unsigned int cmd,
+ struct compat_in6_rtmsg __user *ur)
+{
+ struct in6_rtmsg rt;
+
+ if (copy_from_user(&rt.rtmsg_dst, &ur->rtmsg_dst,
+ 3 * sizeof(struct in6_addr)) ||
+ get_user(rt.rtmsg_type, &ur->rtmsg_type) ||
+ get_user(rt.rtmsg_dst_len, &ur->rtmsg_dst_len) ||
+ get_user(rt.rtmsg_src_len, &ur->rtmsg_src_len) ||
+ get_user(rt.rtmsg_metric, &ur->rtmsg_metric) ||
+ get_user(rt.rtmsg_info, &ur->rtmsg_info) ||
+ get_user(rt.rtmsg_flags, &ur->rtmsg_flags) ||
+ get_user(rt.rtmsg_ifindex, &ur->rtmsg_ifindex))
+ return -EFAULT;
+
+
+ return ipv6_route_ioctl(sock_net(sk), cmd, &rt);
+}
+
+int inet6_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+ void __user *argp = compat_ptr(arg);
+ struct sock *sk = sock->sk;
+
+ switch (cmd) {
+ case SIOCADDRT:
+ case SIOCDELRT:
+ return inet6_compat_routing_ioctl(sk, cmd, argp);
+ default:
+ return -ENOIOCTLCMD;
+ }
+}
+EXPORT_SYMBOL_GPL(inet6_compat_ioctl);
+#endif /* CONFIG_COMPAT */
+
+INDIRECT_CALLABLE_DECLARE(int udpv6_sendmsg(struct sock *, struct msghdr *,
+ size_t));
+int inet6_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
+{
+ struct sock *sk = sock->sk;
+ const struct proto *prot;
+
+ if (unlikely(inet_send_prepare(sk)))
+ return -EAGAIN;
+
+ /* IPV6_ADDRFORM can change sk->sk_prot under us. */
+ prot = READ_ONCE(sk->sk_prot);
+ return INDIRECT_CALL_2(prot->sendmsg, tcp_sendmsg, udpv6_sendmsg,
+ sk, msg, size);
+}
+
+INDIRECT_CALLABLE_DECLARE(int udpv6_recvmsg(struct sock *, struct msghdr *,
+ size_t, int, int *));
+int inet6_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
+ int flags)
+{
+ struct sock *sk = sock->sk;
+ const struct proto *prot;
+ int addr_len = 0;
+ int err;
+
+ if (likely(!(flags & MSG_ERRQUEUE)))
+ sock_rps_record_flow(sk);
+
+ /* IPV6_ADDRFORM can change sk->sk_prot under us. */
+ prot = READ_ONCE(sk->sk_prot);
+ err = INDIRECT_CALL_2(prot->recvmsg, tcp_recvmsg, udpv6_recvmsg,
+ sk, msg, size, flags, &addr_len);
+ if (err >= 0)
+ msg->msg_namelen = addr_len;
+ return err;
}
const struct proto_ops inet6_stream_ops = {
@@ -469,19 +693,29 @@ const struct proto_ops inet6_stream_ops = {
.getname = inet6_getname,
.poll = tcp_poll, /* ok */
.ioctl = inet6_ioctl, /* must change */
+ .gettstamp = sock_gettstamp,
.listen = inet_listen, /* ok */
.shutdown = inet_shutdown, /* ok */
.setsockopt = sock_common_setsockopt, /* ok */
.getsockopt = sock_common_getsockopt, /* ok */
- .sendmsg = inet_sendmsg, /* ok */
- .recvmsg = sock_common_recvmsg, /* ok */
- .mmap = sock_no_mmap,
- .sendpage = tcp_sendpage,
+ .sendmsg = inet6_sendmsg, /* retpoline's sake */
+ .recvmsg = inet6_recvmsg, /* retpoline's sake */
+#ifdef CONFIG_MMU
+ .mmap = tcp_mmap,
+#endif
+ .splice_eof = inet_splice_eof,
+ .sendmsg_locked = tcp_sendmsg_locked,
+ .splice_read = tcp_splice_read,
+ .set_peek_off = sk_set_peek_off,
+ .read_sock = tcp_read_sock,
+ .read_skb = tcp_read_skb,
+ .peek_len = tcp_peek_len,
#ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_sock_common_setsockopt,
- .compat_getsockopt = compat_sock_common_getsockopt,
+ .compat_ioctl = inet6_compat_ioctl,
#endif
+ .set_rcvlowat = tcp_set_rcvlowat,
};
+EXPORT_SYMBOL_GPL(inet6_stream_ops);
const struct proto_ops inet6_dgram_ops = {
.family = PF_INET6,
@@ -494,77 +728,44 @@ const struct proto_ops inet6_dgram_ops = {
.getname = inet6_getname,
.poll = udp_poll, /* ok */
.ioctl = inet6_ioctl, /* must change */
+ .gettstamp = sock_gettstamp,
.listen = sock_no_listen, /* ok */
.shutdown = inet_shutdown, /* ok */
.setsockopt = sock_common_setsockopt, /* ok */
.getsockopt = sock_common_getsockopt, /* ok */
- .sendmsg = inet_sendmsg, /* ok */
- .recvmsg = sock_common_recvmsg, /* ok */
+ .sendmsg = inet6_sendmsg, /* retpoline's sake */
+ .recvmsg = inet6_recvmsg, /* retpoline's sake */
+ .read_skb = udp_read_skb,
.mmap = sock_no_mmap,
- .sendpage = sock_no_sendpage,
+ .set_peek_off = udp_set_peek_off,
#ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_sock_common_setsockopt,
- .compat_getsockopt = compat_sock_common_getsockopt,
+ .compat_ioctl = inet6_compat_ioctl,
#endif
};
-static struct net_proto_family inet6_family_ops = {
+static const struct net_proto_family inet6_family_ops = {
.family = PF_INET6,
.create = inet6_create,
.owner = THIS_MODULE,
};
-/* Same as inet6_dgram_ops, sans udp_poll. */
-static const struct proto_ops inet6_sockraw_ops = {
- .family = PF_INET6,
- .owner = THIS_MODULE,
- .release = inet6_release,
- .bind = inet6_bind,
- .connect = inet_dgram_connect, /* ok */
- .socketpair = sock_no_socketpair, /* a do nothing */
- .accept = sock_no_accept, /* a do nothing */
- .getname = inet6_getname,
- .poll = datagram_poll, /* ok */
- .ioctl = inet6_ioctl, /* must change */
- .listen = sock_no_listen, /* ok */
- .shutdown = inet_shutdown, /* ok */
- .setsockopt = sock_common_setsockopt, /* ok */
- .getsockopt = sock_common_getsockopt, /* ok */
- .sendmsg = inet_sendmsg, /* ok */
- .recvmsg = sock_common_recvmsg, /* ok */
- .mmap = sock_no_mmap,
- .sendpage = sock_no_sendpage,
-#ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_sock_common_setsockopt,
- .compat_getsockopt = compat_sock_common_getsockopt,
-#endif
-};
-
-static struct inet_protosw rawv6_protosw = {
- .type = SOCK_RAW,
- .protocol = IPPROTO_IP, /* wild card */
- .prot = &rawv6_prot,
- .ops = &inet6_sockraw_ops,
- .capability = CAP_NET_RAW,
- .no_check = UDP_CSUM_DEFAULT,
- .flags = INET_PROTOSW_REUSE,
-};
-
-void
-inet6_register_protosw(struct inet_protosw *p)
+int inet6_register_protosw(struct inet_protosw *p)
{
struct list_head *lh;
struct inet_protosw *answer;
- int protocol = p->protocol;
struct list_head *last_perm;
+ int protocol = p->protocol;
+ int ret;
spin_lock_bh(&inetsw6_lock);
+ ret = -EINVAL;
if (p->type >= SOCK_MAX)
goto out_illegal;
/* If we are trying to override a permanent protocol, bail. */
answer = NULL;
+ ret = -EPERM;
last_perm = &inetsw6[p->type];
list_for_each(lh, &inetsw6[p->type]) {
answer = list_entry(lh, struct inet_protosw, list);
@@ -584,32 +785,31 @@ inet6_register_protosw(struct inet_protosw *p)
/* Add the new entry after the last permanent entry if any, so that
* the new entry does not override a permanent entry when matched with
* a wild-card protocol. But it is allowed to override any existing
- * non-permanent entry. This means that when we remove this entry, the
+ * non-permanent entry. This means that when we remove this entry, the
* system automatically returns to the old behavior.
*/
list_add_rcu(&p->list, last_perm);
+ ret = 0;
out:
spin_unlock_bh(&inetsw6_lock);
- return;
+ return ret;
out_permanent:
- printk(KERN_ERR "Attempt to override permanent protocol %d.\n",
- protocol);
+ pr_err("Attempt to override permanent protocol %d\n", protocol);
goto out;
out_illegal:
- printk(KERN_ERR
- "Ignoring attempt to register invalid socket type %d.\n",
+ pr_err("Ignoring attempt to register invalid socket type %d\n",
p->type);
goto out;
}
+EXPORT_SYMBOL(inet6_register_protosw);
void
inet6_unregister_protosw(struct inet_protosw *p)
{
if (INET_PROTOSW_PERMANENT & p->flags) {
- printk(KERN_ERR
- "Attempt to unregister permanent protocol %d.\n",
+ pr_err("Attempt to unregister permanent protocol %d\n",
p->protocol);
} else {
spin_lock_bh(&inetsw6_lock);
@@ -619,165 +819,275 @@ inet6_unregister_protosw(struct inet_protosw *p)
synchronize_net();
}
}
+EXPORT_SYMBOL(inet6_unregister_protosw);
int inet6_sk_rebuild_header(struct sock *sk)
{
- int err;
- struct dst_entry *dst;
struct ipv6_pinfo *np = inet6_sk(sk);
+ struct dst_entry *dst;
dst = __sk_dst_check(sk, np->dst_cookie);
- if (dst == NULL) {
+ if (!dst) {
struct inet_sock *inet = inet_sk(sk);
- struct in6_addr *final_p = NULL, final;
- struct flowi fl;
-
- memset(&fl, 0, sizeof(fl));
- fl.proto = sk->sk_protocol;
- ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
- ipv6_addr_copy(&fl.fl6_src, &np->saddr);
- fl.fl6_flowlabel = np->flow_label;
- fl.oif = sk->sk_bound_dev_if;
- fl.fl_ip_dport = inet->dport;
- fl.fl_ip_sport = inet->sport;
- security_sk_classify_flow(sk, &fl);
-
- if (np->opt && np->opt->srcrt) {
- struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt;
- ipv6_addr_copy(&final, &fl.fl6_dst);
- ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
- final_p = &final;
- }
-
- err = ip6_dst_lookup(sk, &dst, &fl);
- if (err) {
+ struct in6_addr *final_p, final;
+ struct flowi6 fl6;
+
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.flowi6_proto = sk->sk_protocol;
+ fl6.daddr = sk->sk_v6_daddr;
+ fl6.saddr = np->saddr;
+ fl6.flowlabel = np->flow_label;
+ fl6.flowi6_oif = sk->sk_bound_dev_if;
+ fl6.flowi6_mark = sk->sk_mark;
+ fl6.fl6_dport = inet->inet_dport;
+ fl6.fl6_sport = inet->inet_sport;
+ fl6.flowi6_uid = sk_uid(sk);
+ security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6));
+
+ rcu_read_lock();
+ final_p = fl6_update_dst(&fl6, rcu_dereference(np->opt),
+ &final);
+ rcu_read_unlock();
+
+ dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p);
+ if (IS_ERR(dst)) {
sk->sk_route_caps = 0;
- return err;
+ WRITE_ONCE(sk->sk_err_soft, -PTR_ERR(dst));
+ return PTR_ERR(dst);
}
- if (final_p)
- ipv6_addr_copy(&fl.fl6_dst, final_p);
- if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) {
- sk->sk_err_soft = -err;
- return err;
- }
-
- __ip6_dst_store(sk, dst, NULL, NULL);
+ ip6_dst_store(sk, dst, false, false);
}
return 0;
}
-
EXPORT_SYMBOL_GPL(inet6_sk_rebuild_header);
-int ipv6_opt_accepted(struct sock *sk, struct sk_buff *skb)
+bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb,
+ const struct inet6_skb_parm *opt)
{
- struct ipv6_pinfo *np = inet6_sk(sk);
- struct inet6_skb_parm *opt = IP6CB(skb);
+ const struct ipv6_pinfo *np = inet6_sk(sk);
if (np->rxopt.all) {
- if ((opt->hop && (np->rxopt.bits.hopopts ||
- np->rxopt.bits.ohopopts)) ||
- ((IPV6_FLOWINFO_MASK & *(__be32*)skb->nh.raw) &&
+ if (((opt->flags & IP6SKB_HOPBYHOP) &&
+ (np->rxopt.bits.hopopts || np->rxopt.bits.ohopopts)) ||
+ (ip6_flowinfo((struct ipv6hdr *) skb_network_header(skb)) &&
np->rxopt.bits.rxflow) ||
(opt->srcrt && (np->rxopt.bits.srcrt ||
np->rxopt.bits.osrcrt)) ||
((opt->dst1 || opt->dst0) &&
(np->rxopt.bits.dstopts || np->rxopt.bits.odstopts)))
- return 1;
+ return true;
}
- return 0;
+ return false;
}
-EXPORT_SYMBOL_GPL(ipv6_opt_accepted);
+static struct packet_type ipv6_packet_type __read_mostly = {
+ .type = cpu_to_be16(ETH_P_IPV6),
+ .func = ipv6_rcv,
+ .list_func = ipv6_list_rcv,
+};
-int
-snmp6_mib_init(void *ptr[2], size_t mibsize, size_t mibalign)
+static int __init ipv6_packet_init(void)
{
- if (ptr == NULL)
- return -EINVAL;
-
- ptr[0] = __alloc_percpu(mibsize);
- if (!ptr[0])
- goto err0;
-
- ptr[1] = __alloc_percpu(mibsize);
- if (!ptr[1])
- goto err1;
-
+ dev_add_pack(&ipv6_packet_type);
return 0;
-
-err1:
- free_percpu(ptr[0]);
- ptr[0] = NULL;
-err0:
- return -ENOMEM;
}
-void
-snmp6_mib_free(void *ptr[2])
+static void ipv6_packet_cleanup(void)
{
- if (ptr == NULL)
- return;
- if (ptr[0])
- free_percpu(ptr[0]);
- if (ptr[1])
- free_percpu(ptr[1]);
- ptr[0] = ptr[1] = NULL;
+ dev_remove_pack(&ipv6_packet_type);
}
-static int __init init_ipv6_mibs(void)
+static int __net_init ipv6_init_mibs(struct net *net)
{
- if (snmp6_mib_init((void **)ipv6_statistics, sizeof (struct ipstats_mib),
- __alignof__(struct ipstats_mib)) < 0)
+ int i;
+
+ net->mib.udp_stats_in6 = alloc_percpu(struct udp_mib);
+ if (!net->mib.udp_stats_in6)
+ return -ENOMEM;
+ net->mib.udplite_stats_in6 = alloc_percpu(struct udp_mib);
+ if (!net->mib.udplite_stats_in6)
+ goto err_udplite_mib;
+ net->mib.ipv6_statistics = alloc_percpu(struct ipstats_mib);
+ if (!net->mib.ipv6_statistics)
goto err_ip_mib;
- if (snmp6_mib_init((void **)icmpv6_statistics, sizeof (struct icmpv6_mib),
- __alignof__(struct icmpv6_mib)) < 0)
+
+ for_each_possible_cpu(i) {
+ struct ipstats_mib *af_inet6_stats;
+ af_inet6_stats = per_cpu_ptr(net->mib.ipv6_statistics, i);
+ u64_stats_init(&af_inet6_stats->syncp);
+ }
+
+
+ net->mib.icmpv6_statistics = alloc_percpu(struct icmpv6_mib);
+ if (!net->mib.icmpv6_statistics)
goto err_icmp_mib;
- if (snmp6_mib_init((void **)udp_stats_in6, sizeof (struct udp_mib),
- __alignof__(struct udp_mib)) < 0)
- goto err_udp_mib;
- if (snmp6_mib_init((void **)udplite_stats_in6, sizeof (struct udp_mib),
- __alignof__(struct udp_mib)) < 0)
- goto err_udplite_mib;
+ net->mib.icmpv6msg_statistics = kzalloc(sizeof(struct icmpv6msg_mib),
+ GFP_KERNEL);
+ if (!net->mib.icmpv6msg_statistics)
+ goto err_icmpmsg_mib;
return 0;
-err_udplite_mib:
- snmp6_mib_free((void **)udp_stats_in6);
-err_udp_mib:
- snmp6_mib_free((void **)icmpv6_statistics);
+err_icmpmsg_mib:
+ free_percpu(net->mib.icmpv6_statistics);
err_icmp_mib:
- snmp6_mib_free((void **)ipv6_statistics);
+ free_percpu(net->mib.ipv6_statistics);
err_ip_mib:
+ free_percpu(net->mib.udplite_stats_in6);
+err_udplite_mib:
+ free_percpu(net->mib.udp_stats_in6);
return -ENOMEM;
-
}
-static void cleanup_ipv6_mibs(void)
+static void ipv6_cleanup_mibs(struct net *net)
{
- snmp6_mib_free((void **)ipv6_statistics);
- snmp6_mib_free((void **)icmpv6_statistics);
- snmp6_mib_free((void **)udp_stats_in6);
- snmp6_mib_free((void **)udplite_stats_in6);
+ free_percpu(net->mib.udp_stats_in6);
+ free_percpu(net->mib.udplite_stats_in6);
+ free_percpu(net->mib.ipv6_statistics);
+ free_percpu(net->mib.icmpv6_statistics);
+ kfree(net->mib.icmpv6msg_statistics);
}
-static int __init inet6_init(void)
+static int __net_init inet6_net_init(struct net *net)
{
- struct sk_buff *dummy_skb;
- struct list_head *r;
- int err;
+ int err = 0;
- BUILD_BUG_ON(sizeof(struct inet6_skb_parm) > sizeof(dummy_skb->cb));
+ net->ipv6.sysctl.bindv6only = 0;
+ net->ipv6.sysctl.icmpv6_time = 1*HZ;
+ net->ipv6.sysctl.icmpv6_echo_ignore_all = 0;
+ net->ipv6.sysctl.icmpv6_echo_ignore_multicast = 0;
+ net->ipv6.sysctl.icmpv6_echo_ignore_anycast = 0;
+ net->ipv6.sysctl.icmpv6_error_anycast_as_unicast = 0;
+ net->ipv6.sysctl.icmpv6_errors_extension_mask = 0;
+
+ /* By default, rate limit error messages.
+ * Except for pmtu discovery, it would break it.
+ * proc_do_large_bitmap needs pointer to the bitmap.
+ */
+ bitmap_set(net->ipv6.sysctl.icmpv6_ratemask, 0, ICMPV6_ERRMSG_MAX + 1);
+ bitmap_clear(net->ipv6.sysctl.icmpv6_ratemask, ICMPV6_PKT_TOOBIG, 1);
+ net->ipv6.sysctl.icmpv6_ratemask_ptr = net->ipv6.sysctl.icmpv6_ratemask;
+
+ net->ipv6.sysctl.flowlabel_consistency = 1;
+ net->ipv6.sysctl.auto_flowlabels = IP6_DEFAULT_AUTO_FLOW_LABELS;
+ net->ipv6.sysctl.idgen_retries = 3;
+ net->ipv6.sysctl.idgen_delay = 1 * HZ;
+ net->ipv6.sysctl.flowlabel_state_ranges = 0;
+ net->ipv6.sysctl.max_dst_opts_cnt = IP6_DEFAULT_MAX_DST_OPTS_CNT;
+ net->ipv6.sysctl.max_hbh_opts_cnt = IP6_DEFAULT_MAX_HBH_OPTS_CNT;
+ net->ipv6.sysctl.max_dst_opts_len = IP6_DEFAULT_MAX_DST_OPTS_LEN;
+ net->ipv6.sysctl.max_hbh_opts_len = IP6_DEFAULT_MAX_HBH_OPTS_LEN;
+ net->ipv6.sysctl.fib_notify_on_flag_change = 0;
+ atomic_set(&net->ipv6.fib6_sernum, 1);
+
+ net->ipv6.sysctl.ioam6_id = IOAM6_DEFAULT_ID;
+ net->ipv6.sysctl.ioam6_id_wide = IOAM6_DEFAULT_ID_WIDE;
+
+ err = ipv6_init_mibs(net);
+ if (err)
+ return err;
+#ifdef CONFIG_PROC_FS
+ err = udp6_proc_init(net);
+ if (err)
+ goto out;
+ err = tcp6_proc_init(net);
+ if (err)
+ goto proc_tcp6_fail;
+ err = ac6_proc_init(net);
+ if (err)
+ goto proc_ac6_fail;
+#endif
+ return err;
-#ifdef MODULE
-#if 0 /* FIXME --RR */
- if (!mod_member_present(&__this_module, can_unload))
- return -EINVAL;
+#ifdef CONFIG_PROC_FS
+proc_ac6_fail:
+ tcp6_proc_exit(net);
+proc_tcp6_fail:
+ udp6_proc_exit(net);
+out:
+ ipv6_cleanup_mibs(net);
+ return err;
+#endif
+}
- __this_module.can_unload = &ipv6_unload;
+static void __net_exit inet6_net_exit(struct net *net)
+{
+#ifdef CONFIG_PROC_FS
+ udp6_proc_exit(net);
+ tcp6_proc_exit(net);
+ ac6_proc_exit(net);
#endif
+ ipv6_cleanup_mibs(net);
+}
+
+static struct pernet_operations inet6_net_ops = {
+ .init = inet6_net_init,
+ .exit = inet6_net_exit,
+};
+
+static int ipv6_route_input(struct sk_buff *skb)
+{
+ ip6_route_input(skb);
+ return skb_dst(skb)->error;
+}
+
+static const struct ipv6_stub ipv6_stub_impl = {
+ .ipv6_sock_mc_join = ipv6_sock_mc_join,
+ .ipv6_sock_mc_drop = ipv6_sock_mc_drop,
+ .ipv6_dst_lookup_flow = ip6_dst_lookup_flow,
+ .ipv6_route_input = ipv6_route_input,
+ .fib6_get_table = fib6_get_table,
+ .fib6_table_lookup = fib6_table_lookup,
+ .fib6_lookup = fib6_lookup,
+ .fib6_select_path = fib6_select_path,
+ .ip6_mtu_from_fib6 = ip6_mtu_from_fib6,
+ .fib6_nh_init = fib6_nh_init,
+ .fib6_nh_release = fib6_nh_release,
+ .fib6_nh_release_dsts = fib6_nh_release_dsts,
+ .fib6_update_sernum = fib6_update_sernum_stub,
+ .fib6_rt_update = fib6_rt_update,
+ .ip6_del_rt = ip6_del_rt,
+ .udpv6_encap_enable = udpv6_encap_enable,
+ .ndisc_send_na = ndisc_send_na,
+#if IS_ENABLED(CONFIG_XFRM)
+ .xfrm6_local_rxpmtu = xfrm6_local_rxpmtu,
+ .xfrm6_udp_encap_rcv = xfrm6_udp_encap_rcv,
+ .xfrm6_gro_udp_encap_rcv = xfrm6_gro_udp_encap_rcv,
+ .xfrm6_rcv_encap = xfrm6_rcv_encap,
#endif
+ .nd_tbl = &nd_tbl,
+ .ipv6_fragment = ip6_fragment,
+ .ipv6_dev_find = ipv6_dev_find,
+ .ip6_xmit = ip6_xmit,
+};
+
+static const struct ipv6_bpf_stub ipv6_bpf_stub_impl = {
+ .inet6_bind = __inet6_bind,
+ .udp6_lib_lookup = __udp6_lib_lookup,
+ .ipv6_setsockopt = do_ipv6_setsockopt,
+ .ipv6_getsockopt = do_ipv6_getsockopt,
+ .ipv6_dev_get_saddr = ipv6_dev_get_saddr,
+};
+
+static int __init inet6_init(void)
+{
+ struct list_head *r;
+ int err = 0;
+
+ sock_skb_cb_check_size(sizeof(struct inet6_skb_parm));
+
+ /* Register the socket-side information for inet6_create. */
+ for (r = &inetsw6[0]; r < &inetsw6[SOCK_MAX]; ++r)
+ INIT_LIST_HEAD(r);
+
+ raw_hashinfo_init(&raw_v6_hashinfo);
+
+ if (disable_ipv6_mod) {
+ pr_info("Loaded, but administratively disabled, reboot required to enable\n");
+ goto out;
+ }
err = proto_register(&tcpv6_prot, 1);
if (err)
@@ -795,47 +1105,47 @@ static int __init inet6_init(void)
if (err)
goto out_unregister_udplite_proto;
-
- /* Register the socket-side information for inet6_create. */
- for(r = &inetsw6[0]; r < &inetsw6[SOCK_MAX]; ++r)
- INIT_LIST_HEAD(r);
+ err = proto_register(&pingv6_prot, 1);
+ if (err)
+ goto out_unregister_raw_proto;
/* We MUST register RAW sockets before we create the ICMP6,
* IGMP6, or NDISC control sockets.
*/
- inet6_register_protosw(&rawv6_protosw);
+ err = rawv6_init();
+ if (err)
+ goto out_unregister_ping_proto;
/* Register the family here so that the init calls below will
* be able to create sockets. (?? is this dangerous ??)
*/
err = sock_register(&inet6_family_ops);
if (err)
- goto out_unregister_raw_proto;
+ goto out_sock_register_fail;
- /* Initialise ipv6 mibs */
- err = init_ipv6_mibs();
- if (err)
- goto out_unregister_sock;
-
/*
* ipngwg API draft makes clear that the correct semantics
* for TCP and UDP is to consider one TCP and UDP instance
- * in a host availiable by both INET and INET6 APIs and
+ * in a host available by both INET and INET6 APIs and
* able to communicate via both network protocols.
*/
-#ifdef CONFIG_SYSCTL
- ipv6_sysctl_register();
-#endif
- err = icmpv6_init(&inet6_family_ops);
+ err = register_pernet_subsys(&inet6_net_ops);
+ if (err)
+ goto register_pernet_fail;
+ err = ip6_mr_init();
+ if (err)
+ goto ipmr_fail;
+ err = icmpv6_init();
if (err)
goto icmp_fail;
- err = ndisc_init(&inet6_family_ops);
+ err = ndisc_init();
if (err)
goto ndisc_fail;
- err = igmp6_init(&inet6_family_ops);
+ err = igmp6_init();
if (err)
goto igmp_fail;
+
err = ipv6_netfilter_init();
if (err)
goto netfilter_fail;
@@ -844,61 +1154,142 @@ static int __init inet6_init(void)
err = -ENOMEM;
if (raw6_proc_init())
goto proc_raw6_fail;
- if (tcp6_proc_init())
- goto proc_tcp6_fail;
- if (udp6_proc_init())
- goto proc_udp6_fail;
if (udplite6_proc_init())
goto proc_udplite6_fail;
if (ipv6_misc_proc_init())
goto proc_misc6_fail;
-
- if (ac6_proc_init())
- goto proc_anycast6_fail;
if (if6_proc_init())
goto proc_if6_fail;
#endif
- ip6_route_init();
- ip6_flowlabel_init();
+ err = ip6_route_init();
+ if (err)
+ goto ip6_route_fail;
+ err = ndisc_late_init();
+ if (err)
+ goto ndisc_late_fail;
+ err = ip6_flowlabel_init();
+ if (err)
+ goto ip6_flowlabel_fail;
+ err = ipv6_anycast_init();
+ if (err)
+ goto ipv6_anycast_fail;
err = addrconf_init();
if (err)
goto addrconf_fail;
/* Init v6 extension headers. */
- ipv6_rthdr_init();
- ipv6_frag_init();
- ipv6_nodata_init();
- ipv6_destopt_init();
-#ifdef CONFIG_IPV6_MIP6
- mip6_init();
-#endif
+ err = ipv6_exthdrs_init();
+ if (err)
+ goto ipv6_exthdrs_fail;
+
+ err = ipv6_frag_init();
+ if (err)
+ goto ipv6_frag_fail;
/* Init v6 transport protocols. */
- udpv6_init();
- udplitev6_init();
- tcpv6_init();
+ err = udpv6_init();
+ if (err)
+ goto udpv6_fail;
- ipv6_packet_init();
- err = 0;
+ err = udplitev6_init();
+ if (err)
+ goto udplitev6_fail;
+
+ err = udpv6_offload_init();
+ if (err)
+ goto udpv6_offload_fail;
+
+ err = tcpv6_init();
+ if (err)
+ goto tcpv6_fail;
+
+ err = ipv6_packet_init();
+ if (err)
+ goto ipv6_packet_fail;
+
+ err = pingv6_init();
+ if (err)
+ goto pingv6_fail;
+
+ err = calipso_init();
+ if (err)
+ goto calipso_fail;
+
+ err = seg6_init();
+ if (err)
+ goto seg6_fail;
+
+ err = rpl_init();
+ if (err)
+ goto rpl_fail;
+
+ err = ioam6_init();
+ if (err)
+ goto ioam6_fail;
+
+ err = igmp6_late_init();
+ if (err)
+ goto igmp6_late_err;
+
+#ifdef CONFIG_SYSCTL
+ err = ipv6_sysctl_register();
+ if (err)
+ goto sysctl_fail;
+#endif
+
+ /* ensure that ipv6 stubs are visible only after ipv6 is ready */
+ wmb();
+ ipv6_stub = &ipv6_stub_impl;
+ ipv6_bpf_stub = &ipv6_bpf_stub_impl;
out:
return err;
+#ifdef CONFIG_SYSCTL
+sysctl_fail:
+ igmp6_late_cleanup();
+#endif
+igmp6_late_err:
+ ioam6_exit();
+ioam6_fail:
+ rpl_exit();
+rpl_fail:
+ seg6_exit();
+seg6_fail:
+ calipso_exit();
+calipso_fail:
+ pingv6_exit();
+pingv6_fail:
+ ipv6_packet_cleanup();
+ipv6_packet_fail:
+ tcpv6_exit();
+tcpv6_fail:
+ udpv6_offload_exit();
+udpv6_offload_fail:
+ udplitev6_exit();
+udplitev6_fail:
+ udpv6_exit();
+udpv6_fail:
+ ipv6_frag_exit();
+ipv6_frag_fail:
+ ipv6_exthdrs_exit();
+ipv6_exthdrs_fail:
+ addrconf_cleanup();
addrconf_fail:
+ ipv6_anycast_cleanup();
+ipv6_anycast_fail:
ip6_flowlabel_cleanup();
+ip6_flowlabel_fail:
+ ndisc_late_cleanup();
+ndisc_late_fail:
ip6_route_cleanup();
+ip6_route_fail:
#ifdef CONFIG_PROC_FS
if6_proc_exit();
proc_if6_fail:
- ac6_proc_exit();
-proc_anycast6_fail:
ipv6_misc_proc_exit();
proc_misc6_fail:
udplite6_proc_exit();
proc_udplite6_fail:
- udp6_proc_exit();
-proc_udp6_fail:
- tcp6_proc_exit();
-proc_tcp6_fail:
raw6_proc_exit();
proc_raw6_fail:
#endif
@@ -910,12 +1301,16 @@ igmp_fail:
ndisc_fail:
icmpv6_cleanup();
icmp_fail:
-#ifdef CONFIG_SYSCTL
- ipv6_sysctl_unregister();
-#endif
- cleanup_ipv6_mibs();
-out_unregister_sock:
+ ip6_mr_cleanup();
+ipmr_fail:
+ unregister_pernet_subsys(&inet6_net_ops);
+register_pernet_fail:
sock_unregister(PF_INET6);
+ rtnl_unregister_all(PF_INET6);
+out_sock_register_fail:
+ rawv6_exit();
+out_unregister_ping_proto:
+ proto_unregister(&pingv6_prot);
out_unregister_raw_proto:
proto_unregister(&rawv6_prot);
out_unregister_udplite_proto:
@@ -928,39 +1323,4 @@ out_unregister_tcp_proto:
}
module_init(inet6_init);
-static void __exit inet6_exit(void)
-{
- /* First of all disallow new sockets creation. */
- sock_unregister(PF_INET6);
-#ifdef CONFIG_PROC_FS
- if6_proc_exit();
- ac6_proc_exit();
- ipv6_misc_proc_exit();
- udp6_proc_exit();
- udplite6_proc_exit();
- tcp6_proc_exit();
- raw6_proc_exit();
-#endif
-#ifdef CONFIG_IPV6_MIP6
- mip6_fini();
-#endif
- /* Cleanup code parts. */
- ip6_flowlabel_cleanup();
- addrconf_cleanup();
- ip6_route_cleanup();
- ipv6_packet_cleanup();
- igmp6_cleanup();
- ipv6_netfilter_fini();
- ndisc_cleanup();
- icmpv6_cleanup();
-#ifdef CONFIG_SYSCTL
- ipv6_sysctl_unregister();
-#endif
- cleanup_ipv6_mibs();
- proto_unregister(&rawv6_prot);
- proto_unregister(&udpv6_prot);
- proto_unregister(&tcpv6_prot);
-}
-module_exit(inet6_exit);
-
MODULE_ALIAS_NETPROTO(PF_INET6);
diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c
index 12c5a4dec09e..95372e0f1d21 100644
--- a/net/ipv6/ah6.c
+++ b/net/ipv6/ah6.c
@@ -1,42 +1,133 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright (C)2002 USAGI/WIDE Project
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
* Authors
*
- * Mitsuru KANDA @USAGI : IPv6 Support
- * Kazunori MIYAZAWA @USAGI :
- * Kunihiro Ishiguro <kunihiro@ipinfusion.com>
- *
- * This file is derived from net/ipv4/ah.c.
+ * Mitsuru KANDA @USAGI : IPv6 Support
+ * Kazunori MIYAZAWA @USAGI :
+ * Kunihiro Ishiguro <kunihiro@ipinfusion.com>
+ *
+ * This file is derived from net/ipv4/ah.c.
*/
+#define pr_fmt(fmt) "IPv6: " fmt
+
+#include <crypto/hash.h>
+#include <crypto/utils.h>
#include <linux/module.h>
+#include <linux/slab.h>
#include <net/ip.h>
#include <net/ah.h>
#include <linux/crypto.h>
#include <linux/pfkeyv2.h>
#include <linux/string.h>
+#include <linux/scatterlist.h>
+#include <net/ip6_route.h>
#include <net/icmp.h>
#include <net/ipv6.h>
#include <net/protocol.h>
#include <net/xfrm.h>
-#include <asm/scatterlist.h>
-static int zero_out_mutable_opts(struct ipv6_opt_hdr *opthdr)
+#define IPV6HDR_BASELEN 8
+
+struct tmp_ext {
+#if IS_ENABLED(CONFIG_IPV6_MIP6)
+ struct in6_addr saddr;
+#endif
+ struct in6_addr daddr;
+ char hdrs[];
+};
+
+struct ah_skb_cb {
+ struct xfrm_skb_cb xfrm;
+ void *tmp;
+};
+
+#define AH_SKB_CB(__skb) ((struct ah_skb_cb *)&((__skb)->cb[0]))
+
+/* Helper to save IPv6 addresses and extension headers to temporary storage */
+static inline void ah6_save_hdrs(struct tmp_ext *iph_ext,
+ struct ipv6hdr *top_iph, int extlen)
+{
+ if (!extlen)
+ return;
+
+#if IS_ENABLED(CONFIG_IPV6_MIP6)
+ iph_ext->saddr = top_iph->saddr;
+#endif
+ iph_ext->daddr = top_iph->daddr;
+ memcpy(&iph_ext->hdrs, top_iph + 1, extlen - sizeof(*iph_ext));
+}
+
+/* Helper to restore IPv6 addresses and extension headers from temporary storage */
+static inline void ah6_restore_hdrs(struct ipv6hdr *top_iph,
+ struct tmp_ext *iph_ext, int extlen)
+{
+ if (!extlen)
+ return;
+
+#if IS_ENABLED(CONFIG_IPV6_MIP6)
+ top_iph->saddr = iph_ext->saddr;
+#endif
+ top_iph->daddr = iph_ext->daddr;
+ memcpy(top_iph + 1, &iph_ext->hdrs, extlen - sizeof(*iph_ext));
+}
+
+static void *ah_alloc_tmp(struct crypto_ahash *ahash, int nfrags,
+ unsigned int size)
+{
+ unsigned int len;
+
+ len = size + crypto_ahash_digestsize(ahash);
+
+ len = ALIGN(len, crypto_tfm_ctx_alignment());
+
+ len += sizeof(struct ahash_request) + crypto_ahash_reqsize(ahash);
+ len = ALIGN(len, __alignof__(struct scatterlist));
+
+ len += sizeof(struct scatterlist) * nfrags;
+
+ return kmalloc(len, GFP_ATOMIC);
+}
+
+static inline struct tmp_ext *ah_tmp_ext(void *base)
+{
+ return base + IPV6HDR_BASELEN;
+}
+
+static inline u8 *ah_tmp_auth(u8 *tmp, unsigned int offset)
+{
+ return tmp + offset;
+}
+
+static inline u8 *ah_tmp_icv(void *tmp, unsigned int offset)
+{
+ return tmp + offset;
+}
+
+static inline struct ahash_request *ah_tmp_req(struct crypto_ahash *ahash,
+ u8 *icv)
+{
+ struct ahash_request *req;
+
+ req = (void *)PTR_ALIGN(icv + crypto_ahash_digestsize(ahash),
+ crypto_tfm_ctx_alignment());
+
+ ahash_request_set_tfm(req, ahash);
+
+ return req;
+}
+
+static inline struct scatterlist *ah_req_sg(struct crypto_ahash *ahash,
+ struct ahash_request *req)
+{
+ return (void *)ALIGN((unsigned long)(req + 1) +
+ crypto_ahash_reqsize(ahash),
+ __alignof__(struct scatterlist));
+}
+
+static bool zero_out_mutable_opts(struct ipv6_opt_hdr *opthdr)
{
u8 *opt = (u8 *)opthdr;
int len = ipv6_optlen(opthdr);
@@ -50,11 +141,11 @@ static int zero_out_mutable_opts(struct ipv6_opt_hdr *opthdr)
switch (opt[off]) {
- case IPV6_TLV_PAD0:
+ case IPV6_TLV_PAD1:
optlen = 1;
break;
default:
- if (len < 2)
+ if (len < 2)
goto bad;
optlen = opt[off+1]+2;
if (len < optlen)
@@ -68,13 +159,13 @@ static int zero_out_mutable_opts(struct ipv6_opt_hdr *opthdr)
len -= optlen;
}
if (len == 0)
- return 1;
+ return true;
bad:
- return 0;
+ return false;
}
-#ifdef CONFIG_IPV6_MIP6
+#if IS_ENABLED(CONFIG_IPV6_MIP6)
/**
* ipv6_rearrange_destopt - rearrange IPv6 destination options header
* @iph: IPv6 header
@@ -94,7 +185,7 @@ static void ipv6_rearrange_destopt(struct ipv6hdr *iph, struct ipv6_opt_hdr *des
switch (opt[off]) {
- case IPV6_TLV_PAD0:
+ case IPV6_TLV_PAD1:
optlen = 1;
break;
default:
@@ -109,18 +200,15 @@ static void ipv6_rearrange_destopt(struct ipv6hdr *iph, struct ipv6_opt_hdr *des
* See 11.3.2 of RFC 3775 for details.
*/
if (opt[off] == IPV6_TLV_HAO) {
- struct in6_addr final_addr;
struct ipv6_destopt_hao *hao;
hao = (struct ipv6_destopt_hao *)&opt[off];
if (hao->length != sizeof(hao->addr)) {
- if (net_ratelimit())
- printk(KERN_WARNING "destopt hao: invalid header length: %u\n", hao->length);
+ net_warn_ratelimited("destopt hao: invalid header length: %u\n",
+ hao->length);
goto bad;
}
- ipv6_addr_copy(&final_addr, &hao->addr);
- ipv6_addr_copy(&hao->addr, &iph->saddr);
- ipv6_addr_copy(&iph->saddr, &final_addr);
+ swap(hao->addr, iph->saddr);
}
break;
}
@@ -132,6 +220,8 @@ static void ipv6_rearrange_destopt(struct ipv6hdr *iph, struct ipv6_opt_hdr *des
bad:
return;
}
+#else
+static void ipv6_rearrange_destopt(struct ipv6hdr *iph, struct ipv6_opt_hdr *destopt) {}
#endif
/**
@@ -152,7 +242,7 @@ static void ipv6_rearrange_rthdr(struct ipv6hdr *iph, struct ipv6_rt_hdr *rthdr)
segments_left = rthdr->segments_left;
if (segments_left == 0)
return;
- rthdr->segments_left = 0;
+ rthdr->segments_left = 0;
/* The value of rthdr->hdrlen has been verified either by the system
* call if it is locally generated, or by ipv6_rthdr_rcv() for incoming
@@ -164,13 +254,13 @@ static void ipv6_rearrange_rthdr(struct ipv6hdr *iph, struct ipv6_rt_hdr *rthdr)
segments = rthdr->hdrlen >> 1;
addrs = ((struct rt0_hdr *)rthdr)->addr;
- ipv6_addr_copy(&final_addr, addrs + segments - 1);
+ final_addr = addrs[segments - 1];
addrs += segments - segments_left;
memmove(addrs + 1, addrs, (segments_left - 1) * sizeof(*addrs));
- ipv6_addr_copy(addrs, &iph->daddr);
- ipv6_addr_copy(&iph->daddr, &final_addr);
+ addrs[0] = iph->daddr;
+ iph->daddr = final_addr;
}
static int ipv6_clear_mutable_options(struct ipv6hdr *iph, int len, int dir)
@@ -189,16 +279,14 @@ static int ipv6_clear_mutable_options(struct ipv6hdr *iph, int len, int dir)
while (exthdr.raw < end) {
switch (nexthdr) {
case NEXTHDR_DEST:
-#ifdef CONFIG_IPV6_MIP6
if (dir == XFRM_POLICY_OUT)
ipv6_rearrange_destopt(iph, exthdr.opth);
-#endif
+ fallthrough;
case NEXTHDR_HOP:
if (!zero_out_mutable_opts(exthdr.opth)) {
- LIMIT_NETDEBUG(
- KERN_WARNING "overrun %sopts\n",
- nexthdr == NEXTHDR_HOP ?
- "hop" : "dest");
+ net_dbg_ratelimited("overrun %sopts\n",
+ nexthdr == NEXTHDR_HOP ?
+ "hop" : "dest");
return -EINVAL;
}
break;
@@ -207,7 +295,7 @@ static int ipv6_clear_mutable_options(struct ipv6hdr *iph, int len, int dir)
ipv6_rearrange_rthdr(iph, exthdr.rth);
break;
- default :
+ default:
return 0;
}
@@ -218,57 +306,110 @@ static int ipv6_clear_mutable_options(struct ipv6hdr *iph, int len, int dir)
return 0;
}
+static void ah6_output_done(void *data, int err)
+{
+ int extlen;
+ u8 *iph_base;
+ u8 *icv;
+ struct sk_buff *skb = data;
+ struct xfrm_state *x = skb_dst(skb)->xfrm;
+ struct ah_data *ahp = x->data;
+ struct ipv6hdr *top_iph = ipv6_hdr(skb);
+ struct ip_auth_hdr *ah = ip_auth_hdr(skb);
+ struct tmp_ext *iph_ext;
+
+ extlen = skb_network_header_len(skb) - sizeof(struct ipv6hdr);
+ if (extlen)
+ extlen += sizeof(*iph_ext);
+
+ iph_base = AH_SKB_CB(skb)->tmp;
+ iph_ext = ah_tmp_ext(iph_base);
+ icv = ah_tmp_icv(iph_ext, extlen);
+
+ memcpy(ah->auth_data, icv, ahp->icv_trunc_len);
+ memcpy(top_iph, iph_base, IPV6HDR_BASELEN);
+
+ ah6_restore_hdrs(top_iph, iph_ext, extlen);
+
+ kfree(AH_SKB_CB(skb)->tmp);
+ xfrm_output_resume(skb->sk, skb, err);
+}
+
static int ah6_output(struct xfrm_state *x, struct sk_buff *skb)
{
int err;
+ int nfrags;
int extlen;
+ u8 *iph_base;
+ u8 *icv;
+ u8 nexthdr;
+ struct sk_buff *trailer;
+ struct crypto_ahash *ahash;
+ struct ahash_request *req;
+ struct scatterlist *sg;
struct ipv6hdr *top_iph;
struct ip_auth_hdr *ah;
struct ah_data *ahp;
- u8 nexthdr;
- char tmp_base[8];
- struct {
-#ifdef CONFIG_IPV6_MIP6
- struct in6_addr saddr;
-#endif
- struct in6_addr daddr;
- char hdrs[0];
- } *tmp_ext;
+ struct tmp_ext *iph_ext;
+ int seqhi_len = 0;
+ __be32 *seqhi;
+ int sglists = 0;
+ struct scatterlist *seqhisg;
+
+ ahp = x->data;
+ ahash = ahp->ahash;
+
+ err = skb_cow_data(skb, 0, &trailer);
+ if (err < 0)
+ goto out;
+ nfrags = err;
+
+ skb_push(skb, -skb_network_offset(skb));
+ extlen = skb_network_header_len(skb) - sizeof(struct ipv6hdr);
+ if (extlen)
+ extlen += sizeof(*iph_ext);
+
+ if (x->props.flags & XFRM_STATE_ESN) {
+ sglists = 1;
+ seqhi_len = sizeof(*seqhi);
+ }
+ err = -ENOMEM;
+ iph_base = ah_alloc_tmp(ahash, nfrags + sglists, IPV6HDR_BASELEN +
+ extlen + seqhi_len);
+ if (!iph_base)
+ goto out;
+
+ iph_ext = ah_tmp_ext(iph_base);
+ seqhi = (__be32 *)((char *)iph_ext + extlen);
+ icv = ah_tmp_icv(seqhi, seqhi_len);
+ req = ah_tmp_req(ahash, icv);
+ sg = ah_req_sg(ahash, req);
+ seqhisg = sg + nfrags;
- top_iph = (struct ipv6hdr *)skb->data;
+ ah = ip_auth_hdr(skb);
+ memset(ah->auth_data, 0, ahp->icv_trunc_len);
+
+ top_iph = ipv6_hdr(skb);
top_iph->payload_len = htons(skb->len - sizeof(*top_iph));
- nexthdr = *skb->nh.raw;
- *skb->nh.raw = IPPROTO_AH;
+ nexthdr = *skb_mac_header(skb);
+ *skb_mac_header(skb) = IPPROTO_AH;
/* When there are no extension headers, we only need to save the first
* 8 bytes of the base IP header.
*/
- memcpy(tmp_base, top_iph, sizeof(tmp_base));
+ memcpy(iph_base, top_iph, IPV6HDR_BASELEN);
- tmp_ext = NULL;
- extlen = skb->h.raw - (unsigned char *)(top_iph + 1);
+ ah6_save_hdrs(iph_ext, top_iph, extlen);
if (extlen) {
- extlen += sizeof(*tmp_ext);
- tmp_ext = kmalloc(extlen, GFP_ATOMIC);
- if (!tmp_ext) {
- err = -ENOMEM;
- goto error;
- }
-#ifdef CONFIG_IPV6_MIP6
- memcpy(tmp_ext, &top_iph->saddr, extlen);
-#else
- memcpy(tmp_ext, &top_iph->daddr, extlen);
-#endif
err = ipv6_clear_mutable_options(top_iph,
- extlen - sizeof(*tmp_ext) +
+ extlen - sizeof(*iph_ext) +
sizeof(*top_iph),
XFRM_POLICY_OUT);
if (err)
- goto error_free_iph;
+ goto out_free;
}
- ah = (struct ip_auth_hdr *)skb->h.raw;
ah->nexthdr = nexthdr;
top_iph->priority = 0;
@@ -277,36 +418,87 @@ static int ah6_output(struct xfrm_state *x, struct sk_buff *skb)
top_iph->flow_lbl[2] = 0;
top_iph->hop_limit = 0;
- ahp = x->data;
- ah->hdrlen = (XFRM_ALIGN8(sizeof(struct ipv6_auth_hdr) +
- ahp->icv_trunc_len) >> 2) - 2;
+ ah->hdrlen = (XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2;
ah->reserved = 0;
ah->spi = x->id.spi;
- ah->seq_no = htonl(++x->replay.oseq);
- xfrm_aevent_doreplay(x);
- err = ah_mac_digest(ahp, skb, ah->auth_data);
- if (err)
- goto error_free_iph;
- memcpy(ah->auth_data, ahp->work_icv, ahp->icv_trunc_len);
+ ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
- err = 0;
+ sg_init_table(sg, nfrags + sglists);
+ err = skb_to_sgvec_nomark(skb, sg, 0, skb->len);
+ if (unlikely(err < 0))
+ goto out_free;
- memcpy(top_iph, tmp_base, sizeof(tmp_base));
- if (tmp_ext) {
-#ifdef CONFIG_IPV6_MIP6
- memcpy(&top_iph->saddr, tmp_ext, extlen);
-#else
- memcpy(&top_iph->daddr, tmp_ext, extlen);
-#endif
-error_free_iph:
- kfree(tmp_ext);
+ if (x->props.flags & XFRM_STATE_ESN) {
+ /* Attach seqhi sg right after packet payload */
+ *seqhi = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
+ sg_set_buf(seqhisg, seqhi, seqhi_len);
}
+ ahash_request_set_crypt(req, sg, icv, skb->len + seqhi_len);
+ ahash_request_set_callback(req, 0, ah6_output_done, skb);
-error:
+ AH_SKB_CB(skb)->tmp = iph_base;
+
+ err = crypto_ahash_digest(req);
+ if (err) {
+ if (err == -EINPROGRESS)
+ goto out;
+
+ if (err == -ENOSPC)
+ err = NET_XMIT_DROP;
+ goto out_free;
+ }
+
+ memcpy(ah->auth_data, icv, ahp->icv_trunc_len);
+ memcpy(top_iph, iph_base, IPV6HDR_BASELEN);
+
+ ah6_restore_hdrs(top_iph, iph_ext, extlen);
+
+out_free:
+ kfree(iph_base);
+out:
return err;
}
+static void ah6_input_done(void *data, int err)
+{
+ u8 *auth_data;
+ u8 *icv;
+ u8 *work_iph;
+ struct sk_buff *skb = data;
+ struct xfrm_state *x = xfrm_input_state(skb);
+ struct ah_data *ahp = x->data;
+ struct ip_auth_hdr *ah = ip_auth_hdr(skb);
+ int hdr_len = skb_network_header_len(skb);
+ int ah_hlen = ipv6_authlen(ah);
+
+ if (err)
+ goto out;
+
+ work_iph = AH_SKB_CB(skb)->tmp;
+ auth_data = ah_tmp_auth(work_iph, hdr_len);
+ icv = ah_tmp_icv(auth_data, ahp->icv_trunc_len);
+
+ err = crypto_memneq(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG : 0;
+ if (err)
+ goto out;
+
+ err = ah->nexthdr;
+
+ skb->network_header += ah_hlen;
+ memcpy(skb_network_header(skb), work_iph, hdr_len);
+ __skb_pull(skb, ah_hlen + hdr_len);
+ if (x->props.mode == XFRM_MODE_TUNNEL)
+ skb_reset_transport_header(skb);
+ else
+ skb_set_transport_header(skb, -hdr_len);
+out:
+ kfree(AH_SKB_CB(skb)->tmp);
+ xfrm_input_resume(skb, err);
+}
+
+
+
static int ah6_input(struct xfrm_state *x, struct sk_buff *skb)
{
/*
@@ -316,136 +508,206 @@ static int ah6_input(struct xfrm_state *x, struct sk_buff *skb)
*
* To erase AH:
* Keeping copy of cleared headers. After AH processing,
- * Moving the pointer of skb->nh.raw by using skb_pull as long as AH
- * header length. Then copy back the copy as long as hdr_len
+ * Moving the pointer of skb->network_header by using skb_pull as long
+ * as AH header length. Then copy back the copy as long as hdr_len
* If destination header following AH exists, copy it into after [Ext2].
- *
+ *
* |<>|[IPv6][Ext1][Ext2][Dest][Payload]
* There is offset of AH before IPv6 header after the process.
*/
- struct ipv6_auth_hdr *ah;
+ u8 *auth_data;
+ u8 *icv;
+ u8 *work_iph;
+ struct sk_buff *trailer;
+ struct crypto_ahash *ahash;
+ struct ahash_request *req;
+ struct scatterlist *sg;
+ struct ip_auth_hdr *ah;
+ struct ipv6hdr *ip6h;
struct ah_data *ahp;
- unsigned char *tmp_hdr = NULL;
u16 hdr_len;
u16 ah_hlen;
int nexthdr;
- int err = -EINVAL;
+ int nfrags;
+ int err = -ENOMEM;
+ int seqhi_len = 0;
+ __be32 *seqhi;
+ int sglists = 0;
+ struct scatterlist *seqhisg;
if (!pskb_may_pull(skb, sizeof(struct ip_auth_hdr)))
goto out;
/* We are going to _remove_ AH header to keep sockets happy,
* so... Later this can change. */
- if (skb_cloned(skb) &&
- pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
+ if (skb_unclone(skb, GFP_ATOMIC))
goto out;
- hdr_len = skb->data - skb->nh.raw;
- ah = (struct ipv6_auth_hdr*)skb->data;
+ skb->ip_summed = CHECKSUM_NONE;
+
+ hdr_len = skb_network_header_len(skb);
+ ah = (struct ip_auth_hdr *)skb->data;
ahp = x->data;
+ ahash = ahp->ahash;
+
nexthdr = ah->nexthdr;
- ah_hlen = (ah->hdrlen + 2) << 2;
+ ah_hlen = ipv6_authlen(ah);
- if (ah_hlen != XFRM_ALIGN8(sizeof(struct ipv6_auth_hdr) + ahp->icv_full_len) &&
- ah_hlen != XFRM_ALIGN8(sizeof(struct ipv6_auth_hdr) + ahp->icv_trunc_len))
- goto out;
+ if (ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_full_len) &&
+ ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len))
+ goto out;
if (!pskb_may_pull(skb, ah_hlen))
goto out;
- tmp_hdr = kmemdup(skb->nh.raw, hdr_len, GFP_ATOMIC);
- if (!tmp_hdr)
+ err = skb_cow_data(skb, 0, &trailer);
+ if (err < 0)
goto out;
- if (ipv6_clear_mutable_options(skb->nh.ipv6h, hdr_len, XFRM_POLICY_IN))
- goto free_out;
- skb->nh.ipv6h->priority = 0;
- skb->nh.ipv6h->flow_lbl[0] = 0;
- skb->nh.ipv6h->flow_lbl[1] = 0;
- skb->nh.ipv6h->flow_lbl[2] = 0;
- skb->nh.ipv6h->hop_limit = 0;
-
- {
- u8 auth_data[MAX_AH_AUTH_LEN];
-
- memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len);
- memset(ah->auth_data, 0, ahp->icv_trunc_len);
- skb_push(skb, hdr_len);
- err = ah_mac_digest(ahp, skb, ah->auth_data);
- if (err)
- goto free_out;
- err = -EINVAL;
- if (memcmp(ahp->work_icv, auth_data, ahp->icv_trunc_len)) {
- LIMIT_NETDEBUG(KERN_WARNING "ipsec ah authentication error\n");
- x->stats.integrity_failed++;
- goto free_out;
- }
+ nfrags = err;
+
+ ah = (struct ip_auth_hdr *)skb->data;
+ ip6h = ipv6_hdr(skb);
+
+ skb_push(skb, hdr_len);
+
+ if (x->props.flags & XFRM_STATE_ESN) {
+ sglists = 1;
+ seqhi_len = sizeof(*seqhi);
+ }
+
+ work_iph = ah_alloc_tmp(ahash, nfrags + sglists, hdr_len +
+ ahp->icv_trunc_len + seqhi_len);
+ if (!work_iph) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ auth_data = ah_tmp_auth((u8 *)work_iph, hdr_len);
+ seqhi = (__be32 *)(auth_data + ahp->icv_trunc_len);
+ icv = ah_tmp_icv(seqhi, seqhi_len);
+ req = ah_tmp_req(ahash, icv);
+ sg = ah_req_sg(ahash, req);
+ seqhisg = sg + nfrags;
+
+ memcpy(work_iph, ip6h, hdr_len);
+ memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len);
+ memset(ah->auth_data, 0, ahp->icv_trunc_len);
+
+ err = ipv6_clear_mutable_options(ip6h, hdr_len, XFRM_POLICY_IN);
+ if (err)
+ goto out_free;
+
+ ip6h->priority = 0;
+ ip6h->flow_lbl[0] = 0;
+ ip6h->flow_lbl[1] = 0;
+ ip6h->flow_lbl[2] = 0;
+ ip6h->hop_limit = 0;
+
+ sg_init_table(sg, nfrags + sglists);
+ err = skb_to_sgvec_nomark(skb, sg, 0, skb->len);
+ if (unlikely(err < 0))
+ goto out_free;
+
+ if (x->props.flags & XFRM_STATE_ESN) {
+ /* Attach seqhi sg right after packet payload */
+ *seqhi = XFRM_SKB_CB(skb)->seq.input.hi;
+ sg_set_buf(seqhisg, seqhi, seqhi_len);
}
- skb->h.raw = memcpy(skb->nh.raw += ah_hlen, tmp_hdr, hdr_len);
+ ahash_request_set_crypt(req, sg, icv, skb->len + seqhi_len);
+ ahash_request_set_callback(req, 0, ah6_input_done, skb);
+
+ AH_SKB_CB(skb)->tmp = work_iph;
+
+ err = crypto_ahash_digest(req);
+ if (err) {
+ if (err == -EINPROGRESS)
+ goto out;
+
+ goto out_free;
+ }
+
+ err = crypto_memneq(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG : 0;
+ if (err)
+ goto out_free;
+
+ skb->network_header += ah_hlen;
+ memcpy(skb_network_header(skb), work_iph, hdr_len);
__skb_pull(skb, ah_hlen + hdr_len);
- kfree(tmp_hdr);
+ if (x->props.mode == XFRM_MODE_TUNNEL)
+ skb_reset_transport_header(skb);
+ else
+ skb_set_transport_header(skb, -hdr_len);
- return nexthdr;
+ err = nexthdr;
-free_out:
- kfree(tmp_hdr);
+out_free:
+ kfree(work_iph);
out:
return err;
}
-static void ah6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
- int type, int code, int offset, __be32 info)
+static int ah6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+ u8 type, u8 code, int offset, __be32 info)
{
- struct ipv6hdr *iph = (struct ipv6hdr*)skb->data;
- struct ip_auth_hdr *ah = (struct ip_auth_hdr*)(skb->data+offset);
+ struct net *net = dev_net(skb->dev);
+ struct ipv6hdr *iph = (struct ipv6hdr *)skb->data;
+ struct ip_auth_hdr *ah = (struct ip_auth_hdr *)(skb->data+offset);
struct xfrm_state *x;
- if (type != ICMPV6_DEST_UNREACH &&
- type != ICMPV6_PKT_TOOBIG)
- return;
+ if (type != ICMPV6_PKT_TOOBIG &&
+ type != NDISC_REDIRECT)
+ return 0;
- x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, ah->spi, IPPROTO_AH, AF_INET6);
+ x = xfrm_state_lookup(net, skb->mark, (xfrm_address_t *)&iph->daddr, ah->spi, IPPROTO_AH, AF_INET6);
if (!x)
- return;
-
- NETDEBUG(KERN_DEBUG "pmtu discovery on SA AH/%08x/" NIP6_FMT "\n",
- ntohl(ah->spi), NIP6(iph->daddr));
+ return 0;
+ if (type == NDISC_REDIRECT)
+ ip6_redirect(skb, net, skb->dev->ifindex, 0,
+ sock_net_uid(net, NULL));
+ else
+ ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL));
xfrm_state_put(x);
+
+ return 0;
}
-static int ah6_init_state(struct xfrm_state *x)
+static int ah6_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack)
{
struct ah_data *ahp = NULL;
struct xfrm_algo_desc *aalg_desc;
- struct crypto_hash *tfm;
-
- if (!x->aalg)
- goto error;
+ struct crypto_ahash *ahash;
- /* null auth can use a zero length key */
- if (x->aalg->alg_key_len > 512)
+ if (!x->aalg) {
+ NL_SET_ERR_MSG(extack, "AH requires a state with an AUTH algorithm");
goto error;
+ }
- if (x->encap)
+ if (x->encap) {
+ NL_SET_ERR_MSG(extack, "AH is not compatible with encapsulation");
goto error;
+ }
ahp = kzalloc(sizeof(*ahp), GFP_KERNEL);
- if (ahp == NULL)
+ if (!ahp)
return -ENOMEM;
- ahp->key = x->aalg->alg_key;
- ahp->key_len = (x->aalg->alg_key_len+7)/8;
- tfm = crypto_alloc_hash(x->aalg->alg_name, 0, CRYPTO_ALG_ASYNC);
- if (IS_ERR(tfm))
+ ahash = crypto_alloc_ahash(x->aalg->alg_name, 0, 0);
+ if (IS_ERR(ahash)) {
+ NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
goto error;
+ }
- ahp->tfm = tfm;
- if (crypto_hash_setkey(tfm, ahp->key, ahp->key_len))
+ ahp->ahash = ahash;
+ if (crypto_ahash_setkey(ahash, x->aalg->alg_key,
+ (x->aalg->alg_key_len + 7) / 8)) {
+ NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
goto error;
-
+ }
+
/*
* Lookup the algorithm description maintained by xfrm_algo,
* verify crypto transform properties, and store information
@@ -456,33 +718,34 @@ static int ah6_init_state(struct xfrm_state *x)
BUG_ON(!aalg_desc);
if (aalg_desc->uinfo.auth.icv_fullbits/8 !=
- crypto_hash_digestsize(tfm)) {
- printk(KERN_INFO "AH: %s digestsize %u != %hu\n",
- x->aalg->alg_name, crypto_hash_digestsize(tfm),
- aalg_desc->uinfo.auth.icv_fullbits/8);
+ crypto_ahash_digestsize(ahash)) {
+ NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
goto error;
}
-
+
ahp->icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8;
- ahp->icv_trunc_len = aalg_desc->uinfo.auth.icv_truncbits/8;
-
- BUG_ON(ahp->icv_trunc_len > MAX_AH_AUTH_LEN);
-
- ahp->work_icv = kmalloc(ahp->icv_full_len, GFP_KERNEL);
- if (!ahp->work_icv)
- goto error;
-
- x->props.header_len = XFRM_ALIGN8(sizeof(struct ipv6_auth_hdr) + ahp->icv_trunc_len);
- if (x->props.mode == XFRM_MODE_TUNNEL)
+ ahp->icv_trunc_len = x->aalg->alg_trunc_len/8;
+
+ x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) +
+ ahp->icv_trunc_len);
+ switch (x->props.mode) {
+ case XFRM_MODE_BEET:
+ case XFRM_MODE_TRANSPORT:
+ break;
+ case XFRM_MODE_TUNNEL:
x->props.header_len += sizeof(struct ipv6hdr);
+ break;
+ default:
+ NL_SET_ERR_MSG(extack, "Invalid mode requested for AH, must be one of TRANSPORT, TUNNEL, BEET");
+ goto error;
+ }
x->data = ahp;
return 0;
error:
if (ahp) {
- kfree(ahp->work_icv);
- crypto_free_hash(ahp->tfm);
+ crypto_free_ahash(ahp->ahash);
kfree(ahp);
}
return -EINVAL;
@@ -495,40 +758,42 @@ static void ah6_destroy(struct xfrm_state *x)
if (!ahp)
return;
- kfree(ahp->work_icv);
- ahp->work_icv = NULL;
- crypto_free_hash(ahp->tfm);
- ahp->tfm = NULL;
+ crypto_free_ahash(ahp->ahash);
kfree(ahp);
}
-static struct xfrm_type ah6_type =
+static int ah6_rcv_cb(struct sk_buff *skb, int err)
{
- .description = "AH6",
+ return 0;
+}
+
+static const struct xfrm_type ah6_type = {
.owner = THIS_MODULE,
- .proto = IPPROTO_AH,
+ .proto = IPPROTO_AH,
+ .flags = XFRM_TYPE_REPLAY_PROT,
.init_state = ah6_init_state,
.destructor = ah6_destroy,
.input = ah6_input,
.output = ah6_output,
- .hdr_offset = xfrm6_find_1stfragopt,
};
-static struct inet6_protocol ah6_protocol = {
+static struct xfrm6_protocol ah6_protocol = {
.handler = xfrm6_rcv,
+ .input_handler = xfrm_input,
+ .cb_handler = ah6_rcv_cb,
.err_handler = ah6_err,
- .flags = INET6_PROTO_NOPOLICY,
+ .priority = 0,
};
static int __init ah6_init(void)
{
if (xfrm_register_type(&ah6_type, AF_INET6) < 0) {
- printk(KERN_INFO "ipv6 ah init: can't add xfrm type\n");
+ pr_info("%s: can't add xfrm type\n", __func__);
return -EAGAIN;
}
- if (inet6_add_protocol(&ah6_protocol, IPPROTO_AH) < 0) {
- printk(KERN_INFO "ipv6 ah init: can't add protocol\n");
+ if (xfrm6_protocol_register(&ah6_protocol, IPPROTO_AH) < 0) {
+ pr_info("%s: can't add protocol\n", __func__);
xfrm_unregister_type(&ah6_type, AF_INET6);
return -EAGAIN;
}
@@ -538,15 +803,15 @@ static int __init ah6_init(void)
static void __exit ah6_fini(void)
{
- if (inet6_del_protocol(&ah6_protocol, IPPROTO_AH) < 0)
- printk(KERN_INFO "ipv6 ah close: can't remove protocol\n");
-
- if (xfrm_unregister_type(&ah6_type, AF_INET6) < 0)
- printk(KERN_INFO "ipv6 ah close: can't remove xfrm type\n");
+ if (xfrm6_protocol_deregister(&ah6_protocol, IPPROTO_AH) < 0)
+ pr_info("%s: can't remove protocol\n", __func__);
+ xfrm_unregister_type(&ah6_type, AF_INET6);
}
module_init(ah6_init);
module_exit(ah6_fini);
+MODULE_DESCRIPTION("IPv6 AH transformation helpers");
MODULE_LICENSE("GPL");
+MODULE_ALIAS_XFRM_TYPE(AF_INET6, XFRM_PROTO_AH);
diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
index a9604764e015..52599584422b 100644
--- a/net/ipv6/anycast.c
+++ b/net/ipv6/anycast.c
@@ -1,16 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Anycast support for IPv6
- * Linux INET6 implementation
+ * Linux INET6 implementation
*
* Authors:
* David L Stevens (dlstevens@us.ibm.com)
*
* based heavily on net/ipv6/mcast.c
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/capability.h>
@@ -21,7 +17,6 @@
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
-#include <linux/sched.h>
#include <linux/net.h>
#include <linux/in6.h>
#include <linux/netdevice.h>
@@ -30,7 +25,9 @@
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/snmp.h>
@@ -43,83 +40,88 @@
#include <net/checksum.h>
-static int ipv6_dev_ac_dec(struct net_device *dev, struct in6_addr *addr);
+#define IN6_ADDR_HSIZE_SHIFT 8
+#define IN6_ADDR_HSIZE BIT(IN6_ADDR_HSIZE_SHIFT)
+/* anycast address hash table
+ */
+static struct hlist_head inet6_acaddr_lst[IN6_ADDR_HSIZE];
+static DEFINE_SPINLOCK(acaddr_hash_lock);
+
+#define ac_dereference(a, idev) \
+ rcu_dereference_protected(a, lockdep_is_held(&(idev)->lock))
-/* Big ac list lock for all the sockets */
-static DEFINE_RWLOCK(ipv6_sk_ac_lock);
+static int ipv6_dev_ac_dec(struct net_device *dev, const struct in6_addr *addr);
-static int
-ip6_onlink(struct in6_addr *addr, struct net_device *dev)
+static u32 inet6_acaddr_hash(const struct net *net,
+ const struct in6_addr *addr)
{
- struct inet6_dev *idev;
- struct inet6_ifaddr *ifa;
- int onlink;
+ u32 val = __ipv6_addr_jhash(addr, net_hash_mix(net));
- onlink = 0;
- rcu_read_lock();
- idev = __in6_dev_get(dev);
- if (idev) {
- read_lock_bh(&idev->lock);
- for (ifa=idev->addr_list; ifa; ifa=ifa->if_next) {
- onlink = ipv6_prefix_equal(addr, &ifa->addr,
- ifa->prefix_len);
- if (onlink)
- break;
- }
- read_unlock_bh(&idev->lock);
- }
- rcu_read_unlock();
- return onlink;
+ return hash_32(val, IN6_ADDR_HSIZE_SHIFT);
}
/*
* socket join an anycast group
*/
-int ipv6_sock_ac_join(struct sock *sk, int ifindex, struct in6_addr *addr)
+int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr)
{
struct ipv6_pinfo *np = inet6_sk(sk);
+ struct ipv6_ac_socklist *pac = NULL;
+ struct net *net = sock_net(sk);
+ netdevice_tracker dev_tracker;
struct net_device *dev = NULL;
struct inet6_dev *idev;
- struct ipv6_ac_socklist *pac;
- int ishost = !ipv6_devconf.forwarding;
- int err = 0;
+ int err = 0, ishost;
- if (!capable(CAP_NET_ADMIN))
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
return -EPERM;
if (ipv6_addr_is_multicast(addr))
return -EINVAL;
- if (ipv6_chk_addr(addr, NULL, 0))
- return -EINVAL;
+
+ if (ifindex)
+ dev = netdev_get_by_index(net, ifindex, &dev_tracker, GFP_KERNEL);
+
+ if (ipv6_chk_addr_and_flags(net, addr, dev, true, 0, IFA_F_TENTATIVE)) {
+ err = -EINVAL;
+ goto error;
+ }
pac = sock_kmalloc(sk, sizeof(struct ipv6_ac_socklist), GFP_KERNEL);
- if (pac == NULL)
- return -ENOMEM;
+ if (!pac) {
+ err = -ENOMEM;
+ goto error;
+ }
+
pac->acl_next = NULL;
- ipv6_addr_copy(&pac->acl_addr, addr);
+ pac->acl_addr = *addr;
+
+ ishost = !READ_ONCE(net->ipv6.devconf_all->forwarding);
if (ifindex == 0) {
struct rt6_info *rt;
- rt = rt6_lookup(addr, NULL, 0, 0);
+ rcu_read_lock();
+ rt = rt6_lookup(net, addr, NULL, 0, NULL, 0);
if (rt) {
- dev = rt->rt6i_dev;
- dev_hold(dev);
- dst_release(&rt->u.dst);
+ dev = dst_dev_rcu(&rt->dst);
+ netdev_hold(dev, &dev_tracker, GFP_ATOMIC);
+ ip6_rt_put(rt);
} else if (ishost) {
+ rcu_read_unlock();
err = -EADDRNOTAVAIL;
- goto out_free_pac;
+ goto error;
} else {
/* router, no matching interface: just pick one */
-
- dev = dev_get_by_flags(IFF_UP, IFF_UP|IFF_LOOPBACK);
+ dev = netdev_get_by_flags_rcu(net, &dev_tracker, IFF_UP,
+ IFF_UP | IFF_LOOPBACK);
}
- } else
- dev = dev_get_by_index(ifindex);
+ rcu_read_unlock();
+ }
- if (dev == NULL) {
+ if (!dev) {
err = -ENODEV;
- goto out_free_pac;
+ goto error;
}
idev = in6_dev_get(dev);
@@ -128,11 +130,11 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, struct in6_addr *addr)
err = -ENODEV;
else
err = -EADDRNOTAVAIL;
- goto out_dev_put;
+ goto error;
}
+
/* reset ishost, now that we have a specific device */
- ishost = !idev->cnf.forwarding;
- in6_dev_put(idev);
+ ishost = !READ_ONCE(idev->cnf.forwarding);
pac->acl_ifindex = dev->ifindex;
@@ -141,43 +143,40 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, struct in6_addr *addr)
* This obviates the need for propagating anycast routes while
* still allowing some non-router anycast participation.
*/
- if (!ip6_onlink(addr, dev)) {
+ if (!ipv6_chk_prefix(addr, dev)) {
if (ishost)
err = -EADDRNOTAVAIL;
if (err)
- goto out_dev_put;
+ goto error_idev;
}
- err = ipv6_dev_ac_inc(dev, addr);
- if (err)
- goto out_dev_put;
-
- write_lock_bh(&ipv6_sk_ac_lock);
- pac->acl_next = np->ipv6_ac_list;
- np->ipv6_ac_list = pac;
- write_unlock_bh(&ipv6_sk_ac_lock);
-
- dev_put(dev);
+ err = __ipv6_dev_ac_inc(idev, addr);
+ if (!err) {
+ pac->acl_next = np->ipv6_ac_list;
+ np->ipv6_ac_list = pac;
+ pac = NULL;
+ }
- return 0;
+error_idev:
+ in6_dev_put(idev);
+error:
+ netdev_put(dev, &dev_tracker);
-out_dev_put:
- dev_put(dev);
-out_free_pac:
- sock_kfree_s(sk, pac, sizeof(*pac));
+ if (pac)
+ sock_kfree_s(sk, pac, sizeof(*pac));
return err;
}
/*
* socket leave an anycast group
*/
-int ipv6_sock_ac_drop(struct sock *sk, int ifindex, struct in6_addr *addr)
+int ipv6_sock_ac_drop(struct sock *sk, int ifindex, const struct in6_addr *addr)
{
+ struct ipv6_ac_socklist *pac, *prev_pac;
struct ipv6_pinfo *np = inet6_sk(sk);
+ struct net *net = sock_net(sk);
struct net_device *dev;
- struct ipv6_ac_socklist *pac, *prev_pac;
- write_lock_bh(&ipv6_sk_ac_lock);
prev_pac = NULL;
for (pac = np->ipv6_ac_list; pac; pac = pac->acl_next) {
if ((ifindex == 0 || pac->acl_ifindex == ifindex) &&
@@ -185,46 +184,40 @@ int ipv6_sock_ac_drop(struct sock *sk, int ifindex, struct in6_addr *addr)
break;
prev_pac = pac;
}
- if (!pac) {
- write_unlock_bh(&ipv6_sk_ac_lock);
+ if (!pac)
return -ENOENT;
- }
if (prev_pac)
prev_pac->acl_next = pac->acl_next;
else
np->ipv6_ac_list = pac->acl_next;
- write_unlock_bh(&ipv6_sk_ac_lock);
-
- dev = dev_get_by_index(pac->acl_ifindex);
+ dev = dev_get_by_index(net, pac->acl_ifindex);
if (dev) {
ipv6_dev_ac_dec(dev, &pac->acl_addr);
dev_put(dev);
}
+
sock_kfree_s(sk, pac, sizeof(*pac));
return 0;
}
-void ipv6_sock_ac_close(struct sock *sk)
+void __ipv6_sock_ac_close(struct sock *sk)
{
struct ipv6_pinfo *np = inet6_sk(sk);
+ struct net *net = sock_net(sk);
struct net_device *dev = NULL;
struct ipv6_ac_socklist *pac;
- int prev_index;
+ int prev_index = 0;
- write_lock_bh(&ipv6_sk_ac_lock);
pac = np->ipv6_ac_list;
np->ipv6_ac_list = NULL;
- write_unlock_bh(&ipv6_sk_ac_lock);
- prev_index = 0;
while (pac) {
struct ipv6_ac_socklist *next = pac->acl_next;
if (pac->acl_ifindex != prev_index) {
- if (dev)
- dev_put(dev);
- dev = dev_get_by_index(pac->acl_ifindex);
+ dev_put(dev);
+ dev = dev_get_by_index(net, pac->acl_ifindex);
prev_index = pac->acl_ifindex;
}
if (dev)
@@ -232,70 +225,125 @@ void ipv6_sock_ac_close(struct sock *sk)
sock_kfree_s(sk, pac, sizeof(*pac));
pac = next;
}
- if (dev)
- dev_put(dev);
-}
-#if 0
-/* The function is not used, which is funny. Apparently, author
- * supposed to use it to filter out datagrams inside udp/raw but forgot.
- *
- * It is OK, anycasts are not special comparing to delivery to unicasts.
- */
+ dev_put(dev);
+}
-int inet6_ac_check(struct sock *sk, struct in6_addr *addr, int ifindex)
+void ipv6_sock_ac_close(struct sock *sk)
{
- struct ipv6_ac_socklist *pac;
struct ipv6_pinfo *np = inet6_sk(sk);
- int found;
- found = 0;
- read_lock(&ipv6_sk_ac_lock);
- for (pac=np->ipv6_ac_list; pac; pac=pac->acl_next) {
- if (ifindex && pac->acl_ifindex != ifindex)
- continue;
- found = ipv6_addr_equal(&pac->acl_addr, addr);
- if (found)
- break;
- }
- read_unlock(&ipv6_sk_ac_lock);
+ if (!np->ipv6_ac_list)
+ return;
- return found;
+ __ipv6_sock_ac_close(sk);
}
-#endif
+static void ipv6_add_acaddr_hash(struct net *net, struct ifacaddr6 *aca)
+{
+ unsigned int hash = inet6_acaddr_hash(net, &aca->aca_addr);
+
+ spin_lock(&acaddr_hash_lock);
+ hlist_add_head_rcu(&aca->aca_addr_lst, &inet6_acaddr_lst[hash]);
+ spin_unlock(&acaddr_hash_lock);
+}
+
+static void ipv6_del_acaddr_hash(struct ifacaddr6 *aca)
+{
+ spin_lock(&acaddr_hash_lock);
+ hlist_del_init_rcu(&aca->aca_addr_lst);
+ spin_unlock(&acaddr_hash_lock);
+}
+
+static void aca_get(struct ifacaddr6 *aca)
+{
+ refcount_inc(&aca->aca_refcnt);
+}
+
+static void aca_free_rcu(struct rcu_head *h)
+{
+ struct ifacaddr6 *aca = container_of(h, struct ifacaddr6, rcu);
+
+ fib6_info_release(aca->aca_rt);
+ kfree(aca);
+}
static void aca_put(struct ifacaddr6 *ac)
{
- if (atomic_dec_and_test(&ac->aca_refcnt)) {
- in6_dev_put(ac->aca_idev);
- dst_release(&ac->aca_rt->u.dst);
- kfree(ac);
+ if (refcount_dec_and_test(&ac->aca_refcnt))
+ call_rcu_hurry(&ac->rcu, aca_free_rcu);
+}
+
+static struct ifacaddr6 *aca_alloc(struct fib6_info *f6i,
+ const struct in6_addr *addr)
+{
+ struct ifacaddr6 *aca;
+
+ aca = kzalloc(sizeof(*aca), GFP_ATOMIC);
+ if (!aca)
+ return NULL;
+
+ aca->aca_addr = *addr;
+ fib6_info_hold(f6i);
+ aca->aca_rt = f6i;
+ INIT_HLIST_NODE(&aca->aca_addr_lst);
+ aca->aca_users = 1;
+ /* aca_tstamp should be updated upon changes */
+ aca->aca_cstamp = aca->aca_tstamp = jiffies;
+ refcount_set(&aca->aca_refcnt, 1);
+
+ return aca;
+}
+
+static void inet6_ifacaddr_notify(struct net_device *dev,
+ const struct ifacaddr6 *ifaca, int event)
+{
+ struct inet6_fill_args fillargs = {
+ .event = event,
+ .netnsid = -1,
+ };
+ struct net *net = dev_net(dev);
+ struct sk_buff *skb;
+ int err = -ENOMEM;
+
+ skb = nlmsg_new(NLMSG_ALIGN(sizeof(struct ifaddrmsg)) +
+ nla_total_size(sizeof(struct in6_addr)) +
+ nla_total_size(sizeof(struct ifa_cacheinfo)),
+ GFP_KERNEL);
+ if (!skb)
+ goto error;
+
+ err = inet6_fill_ifacaddr(skb, ifaca, &fillargs);
+ if (err < 0) {
+ pr_err("Failed to fill in anycast addresses (err %d)\n", err);
+ nlmsg_free(skb);
+ goto error;
}
+
+ rtnl_notify(skb, net, 0, RTNLGRP_IPV6_ACADDR, NULL, GFP_KERNEL);
+ return;
+error:
+ rtnl_set_sk_err(net, RTNLGRP_IPV6_ACADDR, err);
}
/*
* device anycast group inc (add if not found)
*/
-int ipv6_dev_ac_inc(struct net_device *dev, struct in6_addr *addr)
+int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr)
{
struct ifacaddr6 *aca;
- struct inet6_dev *idev;
- struct rt6_info *rt;
+ struct fib6_info *f6i;
+ struct net *net;
int err;
- idev = in6_dev_get(dev);
-
- if (idev == NULL)
- return -EINVAL;
-
write_lock_bh(&idev->lock);
if (idev->dead) {
err = -ENODEV;
goto out;
}
- for (aca = idev->ac_list; aca; aca = aca->aca_next) {
+ for (aca = ac_dereference(idev->ac_list, idev); aca;
+ aca = ac_dereference(aca->aca_next, idev)) {
if (ipv6_addr_equal(&aca->aca_addr, addr)) {
aca->aca_users++;
err = 0;
@@ -303,61 +351,54 @@ int ipv6_dev_ac_inc(struct net_device *dev, struct in6_addr *addr)
}
}
- /*
- * not found: create a new one.
- */
-
- aca = kzalloc(sizeof(struct ifacaddr6), GFP_ATOMIC);
-
- if (aca == NULL) {
- err = -ENOMEM;
+ net = dev_net(idev->dev);
+ f6i = addrconf_f6i_alloc(net, idev, addr, true, GFP_ATOMIC, NULL);
+ if (IS_ERR(f6i)) {
+ err = PTR_ERR(f6i);
goto out;
}
-
- rt = addrconf_dst_alloc(idev, addr, 1);
- if (IS_ERR(rt)) {
- kfree(aca);
- err = PTR_ERR(rt);
+ aca = aca_alloc(f6i, addr);
+ if (!aca) {
+ fib6_info_release(f6i);
+ err = -ENOMEM;
goto out;
}
- ipv6_addr_copy(&aca->aca_addr, addr);
- aca->aca_idev = idev;
- aca->aca_rt = rt;
- aca->aca_users = 1;
- /* aca_tstamp should be updated upon changes */
- aca->aca_cstamp = aca->aca_tstamp = jiffies;
- atomic_set(&aca->aca_refcnt, 2);
- spin_lock_init(&aca->aca_lock);
-
+ /* Hold this for addrconf_join_solict() below before we unlock,
+ * it is already exposed via idev->ac_list.
+ */
+ aca_get(aca);
aca->aca_next = idev->ac_list;
- idev->ac_list = aca;
+ rcu_assign_pointer(idev->ac_list, aca);
+
write_unlock_bh(&idev->lock);
- dst_hold(&rt->u.dst);
- if (ip6_ins_rt(rt))
- dst_release(&rt->u.dst);
+ ipv6_add_acaddr_hash(net, aca);
+
+ ip6_ins_rt(net, f6i);
- addrconf_join_solict(dev, &aca->aca_addr);
+ addrconf_join_solict(idev->dev, &aca->aca_addr);
+
+ inet6_ifacaddr_notify(idev->dev, aca, RTM_NEWANYCAST);
aca_put(aca);
return 0;
out:
write_unlock_bh(&idev->lock);
- in6_dev_put(idev);
return err;
}
/*
* device anycast group decrement
*/
-int __ipv6_dev_ac_dec(struct inet6_dev *idev, struct in6_addr *addr)
+int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr)
{
struct ifacaddr6 *aca, *prev_aca;
write_lock_bh(&idev->lock);
prev_aca = NULL;
- for (aca = idev->ac_list; aca; aca = aca->aca_next) {
+ for (aca = ac_dereference(idev->ac_list, idev); aca;
+ aca = ac_dereference(aca->aca_next, idev)) {
if (ipv6_addr_equal(&aca->aca_addr, addr))
break;
prev_aca = aca;
@@ -371,97 +412,143 @@ int __ipv6_dev_ac_dec(struct inet6_dev *idev, struct in6_addr *addr)
return 0;
}
if (prev_aca)
- prev_aca->aca_next = aca->aca_next;
+ rcu_assign_pointer(prev_aca->aca_next, aca->aca_next);
else
- idev->ac_list = aca->aca_next;
+ rcu_assign_pointer(idev->ac_list, aca->aca_next);
write_unlock_bh(&idev->lock);
+ ipv6_del_acaddr_hash(aca);
addrconf_leave_solict(idev, &aca->aca_addr);
- dst_hold(&aca->aca_rt->u.dst);
- if (ip6_del_rt(aca->aca_rt))
- dst_free(&aca->aca_rt->u.dst);
- else
- dst_release(&aca->aca_rt->u.dst);
+ ip6_del_rt(dev_net(idev->dev), aca->aca_rt, false);
+
+ inet6_ifacaddr_notify(idev->dev, aca, RTM_DELANYCAST);
aca_put(aca);
return 0;
}
-static int ipv6_dev_ac_dec(struct net_device *dev, struct in6_addr *addr)
+static int ipv6_dev_ac_dec(struct net_device *dev, const struct in6_addr *addr)
{
- int ret;
struct inet6_dev *idev = in6_dev_get(dev);
- if (idev == NULL)
+ int err;
+
+ if (!idev)
return -ENODEV;
- ret = __ipv6_dev_ac_dec(idev, addr);
+
+ err = __ipv6_dev_ac_dec(idev, addr);
in6_dev_put(idev);
- return ret;
+
+ return err;
}
-
+
+void ipv6_ac_destroy_dev(struct inet6_dev *idev)
+{
+ struct ifacaddr6 *aca;
+
+ write_lock_bh(&idev->lock);
+ while ((aca = ac_dereference(idev->ac_list, idev)) != NULL) {
+ rcu_assign_pointer(idev->ac_list, aca->aca_next);
+ write_unlock_bh(&idev->lock);
+
+ ipv6_del_acaddr_hash(aca);
+
+ addrconf_leave_solict(idev, &aca->aca_addr);
+
+ ip6_del_rt(dev_net(idev->dev), aca->aca_rt, false);
+
+ aca_put(aca);
+
+ write_lock_bh(&idev->lock);
+ }
+ write_unlock_bh(&idev->lock);
+}
+
/*
* check if the interface has this anycast address
+ * called with rcu_read_lock()
*/
-static int ipv6_chk_acast_dev(struct net_device *dev, struct in6_addr *addr)
+static bool ipv6_chk_acast_dev(struct net_device *dev, const struct in6_addr *addr)
{
struct inet6_dev *idev;
struct ifacaddr6 *aca;
- idev = in6_dev_get(dev);
+ idev = __in6_dev_get(dev);
if (idev) {
- read_lock_bh(&idev->lock);
- for (aca = idev->ac_list; aca; aca = aca->aca_next)
+ for (aca = rcu_dereference(idev->ac_list); aca;
+ aca = rcu_dereference(aca->aca_next))
if (ipv6_addr_equal(&aca->aca_addr, addr))
break;
- read_unlock_bh(&idev->lock);
- in6_dev_put(idev);
- return aca != 0;
+ return aca != NULL;
}
- return 0;
+ return false;
}
/*
* check if given interface (or any, if dev==0) has this anycast address
*/
-int ipv6_chk_acast_addr(struct net_device *dev, struct in6_addr *addr)
+bool ipv6_chk_acast_addr(struct net *net, struct net_device *dev,
+ const struct in6_addr *addr)
{
+ struct net_device *nh_dev;
+ struct ifacaddr6 *aca;
+ bool found = false;
+
+ rcu_read_lock();
if (dev)
- return ipv6_chk_acast_dev(dev, addr);
- read_lock(&dev_base_lock);
- for (dev=dev_base; dev; dev=dev->next)
- if (ipv6_chk_acast_dev(dev, addr))
- break;
- read_unlock(&dev_base_lock);
- return dev != 0;
+ found = ipv6_chk_acast_dev(dev, addr);
+ else {
+ unsigned int hash = inet6_acaddr_hash(net, addr);
+
+ hlist_for_each_entry_rcu(aca, &inet6_acaddr_lst[hash],
+ aca_addr_lst) {
+ nh_dev = fib6_info_nh_dev(aca->aca_rt);
+ if (!nh_dev || !net_eq(dev_net(nh_dev), net))
+ continue;
+ if (ipv6_addr_equal(&aca->aca_addr, addr)) {
+ found = true;
+ break;
+ }
+ }
+ }
+ rcu_read_unlock();
+ return found;
}
+/* check if this anycast address is link-local on given interface or
+ * is global
+ */
+bool ipv6_chk_acast_addr_src(struct net *net, struct net_device *dev,
+ const struct in6_addr *addr)
+{
+ return ipv6_chk_acast_addr(net,
+ (ipv6_addr_type(addr) & IPV6_ADDR_LINKLOCAL ?
+ dev : NULL),
+ addr);
+}
#ifdef CONFIG_PROC_FS
struct ac6_iter_state {
+ struct seq_net_private p;
struct net_device *dev;
- struct inet6_dev *idev;
};
#define ac6_seq_private(seq) ((struct ac6_iter_state *)(seq)->private)
static inline struct ifacaddr6 *ac6_get_first(struct seq_file *seq)
{
- struct ifacaddr6 *im = NULL;
struct ac6_iter_state *state = ac6_seq_private(seq);
+ struct net *net = seq_file_net(seq);
+ struct ifacaddr6 *im = NULL;
- for (state->dev = dev_base, state->idev = NULL;
- state->dev;
- state->dev = state->dev->next) {
+ for_each_netdev_rcu(net, state->dev) {
struct inet6_dev *idev;
- idev = in6_dev_get(state->dev);
+
+ idev = __in6_dev_get(state->dev);
if (!idev)
continue;
- read_lock_bh(&idev->lock);
- im = idev->ac_list;
- if (im) {
- state->idev = idev;
+ im = rcu_dereference(idev->ac_list);
+ if (im)
break;
- }
- read_unlock_bh(&idev->lock);
}
return im;
}
@@ -469,23 +556,17 @@ static inline struct ifacaddr6 *ac6_get_first(struct seq_file *seq)
static struct ifacaddr6 *ac6_get_next(struct seq_file *seq, struct ifacaddr6 *im)
{
struct ac6_iter_state *state = ac6_seq_private(seq);
+ struct inet6_dev *idev;
- im = im->aca_next;
+ im = rcu_dereference(im->aca_next);
while (!im) {
- if (likely(state->idev != NULL)) {
- read_unlock_bh(&state->idev->lock);
- in6_dev_put(state->idev);
- }
- state->dev = state->dev->next;
- if (!state->dev) {
- state->idev = NULL;
+ state->dev = next_net_device_rcu(state->dev);
+ if (!state->dev)
break;
- }
- state->idev = in6_dev_get(state->dev);
- if (!state->idev)
+ idev = __in6_dev_get(state->dev);
+ if (!idev)
continue;
- read_lock_bh(&state->idev->lock);
- im = state->idev->ac_list;
+ im = rcu_dereference(idev->ac_list);
}
return im;
}
@@ -500,27 +581,24 @@ static struct ifacaddr6 *ac6_get_idx(struct seq_file *seq, loff_t pos)
}
static void *ac6_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(RCU)
{
- read_lock(&dev_base_lock);
+ rcu_read_lock();
return ac6_get_idx(seq, *pos);
}
static void *ac6_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
- struct ifacaddr6 *im;
- im = ac6_get_next(seq, v);
+ struct ifacaddr6 *im = ac6_get_next(seq, v);
+
++*pos;
return im;
}
static void ac6_seq_stop(struct seq_file *seq, void *v)
+ __releases(RCU)
{
- struct ac6_iter_state *state = ac6_seq_private(seq);
- if (likely(state->idev != NULL)) {
- read_unlock_bh(&state->idev->lock);
- in6_dev_put(state->idev);
- }
- read_unlock(&dev_base_lock);
+ rcu_read_unlock();
}
static int ac6_seq_show(struct seq_file *seq, void *v)
@@ -528,62 +606,51 @@ static int ac6_seq_show(struct seq_file *seq, void *v)
struct ifacaddr6 *im = (struct ifacaddr6 *)v;
struct ac6_iter_state *state = ac6_seq_private(seq);
- seq_printf(seq,
- "%-4d %-15s " NIP6_SEQFMT " %5d\n",
+ seq_printf(seq, "%-4d %-15s %pi6 %5d\n",
state->dev->ifindex, state->dev->name,
- NIP6(im->aca_addr),
- im->aca_users);
+ &im->aca_addr, im->aca_users);
return 0;
}
-static struct seq_operations ac6_seq_ops = {
+static const struct seq_operations ac6_seq_ops = {
.start = ac6_seq_start,
.next = ac6_seq_next,
.stop = ac6_seq_stop,
.show = ac6_seq_show,
};
-static int ac6_seq_open(struct inode *inode, struct file *file)
+int __net_init ac6_proc_init(struct net *net)
{
- struct seq_file *seq;
- int rc = -ENOMEM;
- struct ac6_iter_state *s = kzalloc(sizeof(*s), GFP_KERNEL);
-
- if (!s)
- goto out;
-
- rc = seq_open(file, &ac6_seq_ops);
- if (rc)
- goto out_kfree;
+ if (!proc_create_net("anycast6", 0444, net->proc_net, &ac6_seq_ops,
+ sizeof(struct ac6_iter_state)))
+ return -ENOMEM;
- seq = file->private_data;
- seq->private = s;
-out:
- return rc;
-out_kfree:
- kfree(s);
- goto out;
+ return 0;
}
-static struct file_operations ac6_seq_fops = {
- .owner = THIS_MODULE,
- .open = ac6_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release_private,
-};
+void ac6_proc_exit(struct net *net)
+{
+ remove_proc_entry("anycast6", net->proc_net);
+}
+#endif
-int __init ac6_proc_init(void)
+/* Init / cleanup code
+ */
+int __init ipv6_anycast_init(void)
{
- if (!proc_net_fops_create("anycast6", S_IRUGO, &ac6_seq_fops))
- return -ENOMEM;
+ int i;
+ for (i = 0; i < IN6_ADDR_HSIZE; i++)
+ INIT_HLIST_HEAD(&inet6_acaddr_lst[i]);
return 0;
}
-void ac6_proc_exit(void)
+void ipv6_anycast_cleanup(void)
{
- proc_net_remove("anycast6");
-}
-#endif
+ int i;
+ spin_lock(&acaddr_hash_lock);
+ for (i = 0; i < IN6_ADDR_HSIZE; i++)
+ WARN_ON(!hlist_empty(&inet6_acaddr_lst[i]));
+ spin_unlock(&acaddr_hash_lock);
+}
diff --git a/net/ipv6/calipso.c b/net/ipv6/calipso.c
new file mode 100644
index 000000000000..df1986973430
--- /dev/null
+++ b/net/ipv6/calipso.c
@@ -0,0 +1,1479 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * CALIPSO - Common Architecture Label IPv6 Security Option
+ *
+ * This is an implementation of the CALIPSO protocol as specified in
+ * RFC 5570.
+ *
+ * Authors: Paul Moore <paul.moore@hp.com>
+ * Huw Davies <huw@codeweavers.com>
+ */
+
+/* (c) Copyright Hewlett-Packard Development Company, L.P., 2006, 2008
+ * (c) Copyright Huw Davies <huw@codeweavers.com>, 2015
+ */
+
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/rcupdate.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/jhash.h>
+#include <linux/audit.h>
+#include <linux/slab.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/tcp.h>
+#include <net/netlabel.h>
+#include <net/calipso.h>
+#include <linux/atomic.h>
+#include <linux/bug.h>
+#include <linux/unaligned.h>
+#include <linux/crc-ccitt.h>
+
+/* Maximum size of the calipso option including
+ * the two-byte TLV header.
+ */
+#define CALIPSO_OPT_LEN_MAX (2 + 252)
+
+/* Size of the minimum calipso option including
+ * the two-byte TLV header.
+ */
+#define CALIPSO_HDR_LEN (2 + 8)
+
+/* Maximum size of the calipso option including
+ * the two-byte TLV header and upto 3 bytes of
+ * leading pad and 7 bytes of trailing pad.
+ */
+#define CALIPSO_OPT_LEN_MAX_WITH_PAD (3 + CALIPSO_OPT_LEN_MAX + 7)
+
+ /* Maximum size of u32 aligned buffer required to hold calipso
+ * option. Max of 3 initial pad bytes starting from buffer + 3.
+ * i.e. the worst case is when the previous tlv finishes on 4n + 3.
+ */
+#define CALIPSO_MAX_BUFFER (6 + CALIPSO_OPT_LEN_MAX)
+
+/* List of available DOI definitions */
+static DEFINE_SPINLOCK(calipso_doi_list_lock);
+static LIST_HEAD(calipso_doi_list);
+
+/* Label mapping cache */
+int calipso_cache_enabled = 1;
+int calipso_cache_bucketsize = 10;
+#define CALIPSO_CACHE_BUCKETBITS 7
+#define CALIPSO_CACHE_BUCKETS BIT(CALIPSO_CACHE_BUCKETBITS)
+#define CALIPSO_CACHE_REORDERLIMIT 10
+struct calipso_map_cache_bkt {
+ spinlock_t lock;
+ u32 size;
+ struct list_head list;
+};
+
+struct calipso_map_cache_entry {
+ u32 hash;
+ unsigned char *key;
+ size_t key_len;
+
+ struct netlbl_lsm_cache *lsm_data;
+
+ u32 activity;
+ struct list_head list;
+};
+
+static struct calipso_map_cache_bkt *calipso_cache;
+
+static void calipso_cache_invalidate(void);
+static void calipso_doi_putdef(struct calipso_doi *doi_def);
+
+/* Label Mapping Cache Functions
+ */
+
+/**
+ * calipso_cache_entry_free - Frees a cache entry
+ * @entry: the entry to free
+ *
+ * Description:
+ * This function frees the memory associated with a cache entry including the
+ * LSM cache data if there are no longer any users, i.e. reference count == 0.
+ *
+ */
+static void calipso_cache_entry_free(struct calipso_map_cache_entry *entry)
+{
+ if (entry->lsm_data)
+ netlbl_secattr_cache_free(entry->lsm_data);
+ kfree(entry->key);
+ kfree(entry);
+}
+
+/**
+ * calipso_map_cache_hash - Hashing function for the CALIPSO cache
+ * @key: the hash key
+ * @key_len: the length of the key in bytes
+ *
+ * Description:
+ * The CALIPSO tag hashing function. Returns a 32-bit hash value.
+ *
+ */
+static u32 calipso_map_cache_hash(const unsigned char *key, u32 key_len)
+{
+ return jhash(key, key_len, 0);
+}
+
+/**
+ * calipso_cache_init - Initialize the CALIPSO cache
+ *
+ * Description:
+ * Initializes the CALIPSO label mapping cache, this function should be called
+ * before any of the other functions defined in this file. Returns zero on
+ * success, negative values on error.
+ *
+ */
+static int __init calipso_cache_init(void)
+{
+ u32 iter;
+
+ calipso_cache = kcalloc(CALIPSO_CACHE_BUCKETS,
+ sizeof(struct calipso_map_cache_bkt),
+ GFP_KERNEL);
+ if (!calipso_cache)
+ return -ENOMEM;
+
+ for (iter = 0; iter < CALIPSO_CACHE_BUCKETS; iter++) {
+ spin_lock_init(&calipso_cache[iter].lock);
+ calipso_cache[iter].size = 0;
+ INIT_LIST_HEAD(&calipso_cache[iter].list);
+ }
+
+ return 0;
+}
+
+/**
+ * calipso_cache_invalidate - Invalidates the current CALIPSO cache
+ *
+ * Description:
+ * Invalidates and frees any entries in the CALIPSO cache. Returns zero on
+ * success and negative values on failure.
+ *
+ */
+static void calipso_cache_invalidate(void)
+{
+ struct calipso_map_cache_entry *entry, *tmp_entry;
+ u32 iter;
+
+ for (iter = 0; iter < CALIPSO_CACHE_BUCKETS; iter++) {
+ spin_lock_bh(&calipso_cache[iter].lock);
+ list_for_each_entry_safe(entry,
+ tmp_entry,
+ &calipso_cache[iter].list, list) {
+ list_del(&entry->list);
+ calipso_cache_entry_free(entry);
+ }
+ calipso_cache[iter].size = 0;
+ spin_unlock_bh(&calipso_cache[iter].lock);
+ }
+}
+
+/**
+ * calipso_cache_check - Check the CALIPSO cache for a label mapping
+ * @key: the buffer to check
+ * @key_len: buffer length in bytes
+ * @secattr: the security attribute struct to use
+ *
+ * Description:
+ * This function checks the cache to see if a label mapping already exists for
+ * the given key. If there is a match then the cache is adjusted and the
+ * @secattr struct is populated with the correct LSM security attributes. The
+ * cache is adjusted in the following manner if the entry is not already the
+ * first in the cache bucket:
+ *
+ * 1. The cache entry's activity counter is incremented
+ * 2. The previous (higher ranking) entry's activity counter is decremented
+ * 3. If the difference between the two activity counters is geater than
+ * CALIPSO_CACHE_REORDERLIMIT the two entries are swapped
+ *
+ * Returns zero on success, -ENOENT for a cache miss, and other negative values
+ * on error.
+ *
+ */
+static int calipso_cache_check(const unsigned char *key,
+ u32 key_len,
+ struct netlbl_lsm_secattr *secattr)
+{
+ u32 bkt;
+ struct calipso_map_cache_entry *entry;
+ struct calipso_map_cache_entry *prev_entry = NULL;
+ u32 hash;
+
+ if (!calipso_cache_enabled)
+ return -ENOENT;
+
+ hash = calipso_map_cache_hash(key, key_len);
+ bkt = hash & (CALIPSO_CACHE_BUCKETS - 1);
+ spin_lock_bh(&calipso_cache[bkt].lock);
+ list_for_each_entry(entry, &calipso_cache[bkt].list, list) {
+ if (entry->hash == hash &&
+ entry->key_len == key_len &&
+ memcmp(entry->key, key, key_len) == 0) {
+ entry->activity += 1;
+ refcount_inc(&entry->lsm_data->refcount);
+ secattr->cache = entry->lsm_data;
+ secattr->flags |= NETLBL_SECATTR_CACHE;
+ secattr->type = NETLBL_NLTYPE_CALIPSO;
+ if (!prev_entry) {
+ spin_unlock_bh(&calipso_cache[bkt].lock);
+ return 0;
+ }
+
+ if (prev_entry->activity > 0)
+ prev_entry->activity -= 1;
+ if (entry->activity > prev_entry->activity &&
+ entry->activity - prev_entry->activity >
+ CALIPSO_CACHE_REORDERLIMIT) {
+ __list_del(entry->list.prev, entry->list.next);
+ __list_add(&entry->list,
+ prev_entry->list.prev,
+ &prev_entry->list);
+ }
+
+ spin_unlock_bh(&calipso_cache[bkt].lock);
+ return 0;
+ }
+ prev_entry = entry;
+ }
+ spin_unlock_bh(&calipso_cache[bkt].lock);
+
+ return -ENOENT;
+}
+
+/**
+ * calipso_cache_add - Add an entry to the CALIPSO cache
+ * @calipso_ptr: the CALIPSO option
+ * @secattr: the packet's security attributes
+ *
+ * Description:
+ * Add a new entry into the CALIPSO label mapping cache. Add the new entry to
+ * head of the cache bucket's list, if the cache bucket is out of room remove
+ * the last entry in the list first. It is important to note that there is
+ * currently no checking for duplicate keys. Returns zero on success,
+ * negative values on failure. The key stored starts at calipso_ptr + 2,
+ * i.e. the type and length bytes are not stored, this corresponds to
+ * calipso_ptr[1] bytes of data.
+ *
+ */
+static int calipso_cache_add(const unsigned char *calipso_ptr,
+ const struct netlbl_lsm_secattr *secattr)
+{
+ int ret_val = -EPERM;
+ u32 bkt;
+ struct calipso_map_cache_entry *entry = NULL;
+ struct calipso_map_cache_entry *old_entry = NULL;
+ u32 calipso_ptr_len;
+
+ if (!calipso_cache_enabled || calipso_cache_bucketsize <= 0)
+ return 0;
+
+ calipso_ptr_len = calipso_ptr[1];
+
+ entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+ if (!entry)
+ return -ENOMEM;
+ entry->key = kmemdup(calipso_ptr + 2, calipso_ptr_len, GFP_ATOMIC);
+ if (!entry->key) {
+ ret_val = -ENOMEM;
+ goto cache_add_failure;
+ }
+ entry->key_len = calipso_ptr_len;
+ entry->hash = calipso_map_cache_hash(calipso_ptr, calipso_ptr_len);
+ refcount_inc(&secattr->cache->refcount);
+ entry->lsm_data = secattr->cache;
+
+ bkt = entry->hash & (CALIPSO_CACHE_BUCKETS - 1);
+ spin_lock_bh(&calipso_cache[bkt].lock);
+ if (calipso_cache[bkt].size < calipso_cache_bucketsize) {
+ list_add(&entry->list, &calipso_cache[bkt].list);
+ calipso_cache[bkt].size += 1;
+ } else {
+ old_entry = list_entry(calipso_cache[bkt].list.prev,
+ struct calipso_map_cache_entry, list);
+ list_del(&old_entry->list);
+ list_add(&entry->list, &calipso_cache[bkt].list);
+ calipso_cache_entry_free(old_entry);
+ }
+ spin_unlock_bh(&calipso_cache[bkt].lock);
+
+ return 0;
+
+cache_add_failure:
+ if (entry)
+ calipso_cache_entry_free(entry);
+ return ret_val;
+}
+
+/* DOI List Functions
+ */
+
+/**
+ * calipso_doi_search - Searches for a DOI definition
+ * @doi: the DOI to search for
+ *
+ * Description:
+ * Search the DOI definition list for a DOI definition with a DOI value that
+ * matches @doi. The caller is responsible for calling rcu_read_[un]lock().
+ * Returns a pointer to the DOI definition on success and NULL on failure.
+ */
+static struct calipso_doi *calipso_doi_search(u32 doi)
+{
+ struct calipso_doi *iter;
+
+ list_for_each_entry_rcu(iter, &calipso_doi_list, list)
+ if (iter->doi == doi && refcount_read(&iter->refcount))
+ return iter;
+ return NULL;
+}
+
+/**
+ * calipso_doi_add - Add a new DOI to the CALIPSO protocol engine
+ * @doi_def: the DOI structure
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * The caller defines a new DOI for use by the CALIPSO engine and calls this
+ * function to add it to the list of acceptable domains. The caller must
+ * ensure that the mapping table specified in @doi_def->map meets all of the
+ * requirements of the mapping type (see calipso.h for details). Returns
+ * zero on success and non-zero on failure.
+ *
+ */
+static int calipso_doi_add(struct calipso_doi *doi_def,
+ struct netlbl_audit *audit_info)
+{
+ int ret_val = -EINVAL;
+ u32 doi;
+ u32 doi_type;
+ struct audit_buffer *audit_buf;
+
+ doi = doi_def->doi;
+ doi_type = doi_def->type;
+
+ if (doi_def->doi == CALIPSO_DOI_UNKNOWN)
+ goto doi_add_return;
+
+ refcount_set(&doi_def->refcount, 1);
+
+ spin_lock(&calipso_doi_list_lock);
+ if (calipso_doi_search(doi_def->doi)) {
+ spin_unlock(&calipso_doi_list_lock);
+ ret_val = -EEXIST;
+ goto doi_add_return;
+ }
+ list_add_tail_rcu(&doi_def->list, &calipso_doi_list);
+ spin_unlock(&calipso_doi_list_lock);
+ ret_val = 0;
+
+doi_add_return:
+ audit_buf = netlbl_audit_start(AUDIT_MAC_CALIPSO_ADD, audit_info);
+ if (audit_buf) {
+ const char *type_str;
+
+ switch (doi_type) {
+ case CALIPSO_MAP_PASS:
+ type_str = "pass";
+ break;
+ default:
+ type_str = "(unknown)";
+ }
+ audit_log_format(audit_buf,
+ " calipso_doi=%u calipso_type=%s res=%u",
+ doi, type_str, ret_val == 0 ? 1 : 0);
+ audit_log_end(audit_buf);
+ }
+
+ return ret_val;
+}
+
+/**
+ * calipso_doi_free - Frees a DOI definition
+ * @doi_def: the DOI definition
+ *
+ * Description:
+ * This function frees all of the memory associated with a DOI definition.
+ *
+ */
+static void calipso_doi_free(struct calipso_doi *doi_def)
+{
+ kfree(doi_def);
+}
+
+/**
+ * calipso_doi_free_rcu - Frees a DOI definition via the RCU pointer
+ * @entry: the entry's RCU field
+ *
+ * Description:
+ * This function is designed to be used as a callback to the call_rcu()
+ * function so that the memory allocated to the DOI definition can be released
+ * safely.
+ *
+ */
+static void calipso_doi_free_rcu(struct rcu_head *entry)
+{
+ struct calipso_doi *doi_def;
+
+ doi_def = container_of(entry, struct calipso_doi, rcu);
+ calipso_doi_free(doi_def);
+}
+
+/**
+ * calipso_doi_remove - Remove an existing DOI from the CALIPSO protocol engine
+ * @doi: the DOI value
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Removes a DOI definition from the CALIPSO engine. The NetLabel routines will
+ * be called to release their own LSM domain mappings as well as our own
+ * domain list. Returns zero on success and negative values on failure.
+ *
+ */
+static int calipso_doi_remove(u32 doi, struct netlbl_audit *audit_info)
+{
+ int ret_val;
+ struct calipso_doi *doi_def;
+ struct audit_buffer *audit_buf;
+
+ spin_lock(&calipso_doi_list_lock);
+ doi_def = calipso_doi_search(doi);
+ if (!doi_def) {
+ spin_unlock(&calipso_doi_list_lock);
+ ret_val = -ENOENT;
+ goto doi_remove_return;
+ }
+ list_del_rcu(&doi_def->list);
+ spin_unlock(&calipso_doi_list_lock);
+
+ calipso_doi_putdef(doi_def);
+ ret_val = 0;
+
+doi_remove_return:
+ audit_buf = netlbl_audit_start(AUDIT_MAC_CALIPSO_DEL, audit_info);
+ if (audit_buf) {
+ audit_log_format(audit_buf,
+ " calipso_doi=%u res=%u",
+ doi, ret_val == 0 ? 1 : 0);
+ audit_log_end(audit_buf);
+ }
+
+ return ret_val;
+}
+
+/**
+ * calipso_doi_getdef - Returns a reference to a valid DOI definition
+ * @doi: the DOI value
+ *
+ * Description:
+ * Searches for a valid DOI definition and if one is found it is returned to
+ * the caller. Otherwise NULL is returned. The caller must ensure that
+ * calipso_doi_putdef() is called when the caller is done.
+ *
+ */
+static struct calipso_doi *calipso_doi_getdef(u32 doi)
+{
+ struct calipso_doi *doi_def;
+
+ rcu_read_lock();
+ doi_def = calipso_doi_search(doi);
+ if (!doi_def)
+ goto doi_getdef_return;
+ if (!refcount_inc_not_zero(&doi_def->refcount))
+ doi_def = NULL;
+
+doi_getdef_return:
+ rcu_read_unlock();
+ return doi_def;
+}
+
+/**
+ * calipso_doi_putdef - Releases a reference for the given DOI definition
+ * @doi_def: the DOI definition
+ *
+ * Description:
+ * Releases a DOI definition reference obtained from calipso_doi_getdef().
+ *
+ */
+static void calipso_doi_putdef(struct calipso_doi *doi_def)
+{
+ if (!doi_def)
+ return;
+
+ if (!refcount_dec_and_test(&doi_def->refcount))
+ return;
+
+ calipso_cache_invalidate();
+ call_rcu(&doi_def->rcu, calipso_doi_free_rcu);
+}
+
+/**
+ * calipso_doi_walk - Iterate through the DOI definitions
+ * @skip_cnt: skip past this number of DOI definitions, updated
+ * @callback: callback for each DOI definition
+ * @cb_arg: argument for the callback function
+ *
+ * Description:
+ * Iterate over the DOI definition list, skipping the first @skip_cnt entries.
+ * For each entry call @callback, if @callback returns a negative value stop
+ * 'walking' through the list and return. Updates the value in @skip_cnt upon
+ * return. Returns zero on success, negative values on failure.
+ *
+ */
+static int calipso_doi_walk(u32 *skip_cnt,
+ int (*callback)(struct calipso_doi *doi_def,
+ void *arg),
+ void *cb_arg)
+{
+ int ret_val = -ENOENT;
+ u32 doi_cnt = 0;
+ struct calipso_doi *iter_doi;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(iter_doi, &calipso_doi_list, list)
+ if (refcount_read(&iter_doi->refcount) > 0) {
+ if (doi_cnt++ < *skip_cnt)
+ continue;
+ ret_val = callback(iter_doi, cb_arg);
+ if (ret_val < 0) {
+ doi_cnt--;
+ goto doi_walk_return;
+ }
+ }
+
+doi_walk_return:
+ rcu_read_unlock();
+ *skip_cnt = doi_cnt;
+ return ret_val;
+}
+
+/**
+ * calipso_validate - Validate a CALIPSO option
+ * @skb: the packet
+ * @option: the start of the option
+ *
+ * Description:
+ * This routine is called to validate a CALIPSO option.
+ * If the option is valid then %true is returned, otherwise
+ * %false is returned.
+ *
+ * The caller should have already checked that the length of the
+ * option (including the TLV header) is >= 10 and that the catmap
+ * length is consistent with the option length.
+ *
+ * We leave checks on the level and categories to the socket layer.
+ */
+bool calipso_validate(const struct sk_buff *skb, const unsigned char *option)
+{
+ struct calipso_doi *doi_def;
+ bool ret_val;
+ u16 crc, len = option[1] + 2;
+ static const u8 zero[2];
+
+ /* The original CRC runs over the option including the TLV header
+ * with the CRC-16 field (at offset 8) zeroed out. */
+ crc = crc_ccitt(0xffff, option, 8);
+ crc = crc_ccitt(crc, zero, sizeof(zero));
+ if (len > 10)
+ crc = crc_ccitt(crc, option + 10, len - 10);
+ crc = ~crc;
+ if (option[8] != (crc & 0xff) || option[9] != ((crc >> 8) & 0xff))
+ return false;
+
+ rcu_read_lock();
+ doi_def = calipso_doi_search(get_unaligned_be32(option + 2));
+ ret_val = !!doi_def;
+ rcu_read_unlock();
+
+ return ret_val;
+}
+
+/**
+ * calipso_map_cat_hton - Perform a category mapping from host to network
+ * @doi_def: the DOI definition
+ * @secattr: the security attributes
+ * @net_cat: the zero'd out category bitmap in network/CALIPSO format
+ * @net_cat_len: the length of the CALIPSO bitmap in bytes
+ *
+ * Description:
+ * Perform a label mapping to translate a local MLS category bitmap to the
+ * correct CALIPSO bitmap using the given DOI definition. Returns the minimum
+ * size in bytes of the network bitmap on success, negative values otherwise.
+ *
+ */
+static int calipso_map_cat_hton(const struct calipso_doi *doi_def,
+ const struct netlbl_lsm_secattr *secattr,
+ unsigned char *net_cat,
+ u32 net_cat_len)
+{
+ int spot = -1;
+ u32 net_spot_max = 0;
+ u32 net_clen_bits = net_cat_len * 8;
+
+ for (;;) {
+ spot = netlbl_catmap_walk(secattr->attr.mls.cat,
+ spot + 1);
+ if (spot < 0)
+ break;
+ if (spot >= net_clen_bits)
+ return -ENOSPC;
+ netlbl_bitmap_setbit(net_cat, spot, 1);
+
+ if (spot > net_spot_max)
+ net_spot_max = spot;
+ }
+
+ return (net_spot_max / 32 + 1) * 4;
+}
+
+/**
+ * calipso_map_cat_ntoh - Perform a category mapping from network to host
+ * @doi_def: the DOI definition
+ * @net_cat: the category bitmap in network/CALIPSO format
+ * @net_cat_len: the length of the CALIPSO bitmap in bytes
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Perform a label mapping to translate a CALIPSO bitmap to the correct local
+ * MLS category bitmap using the given DOI definition. Returns zero on
+ * success, negative values on failure.
+ *
+ */
+static int calipso_map_cat_ntoh(const struct calipso_doi *doi_def,
+ const unsigned char *net_cat,
+ u32 net_cat_len,
+ struct netlbl_lsm_secattr *secattr)
+{
+ int ret_val;
+ int spot = -1;
+ u32 net_clen_bits = net_cat_len * 8;
+
+ for (;;) {
+ spot = netlbl_bitmap_walk(net_cat,
+ net_clen_bits,
+ spot + 1,
+ 1);
+ if (spot < 0)
+ return 0;
+
+ ret_val = netlbl_catmap_setbit(&secattr->attr.mls.cat,
+ spot,
+ GFP_ATOMIC);
+ if (ret_val != 0)
+ return ret_val;
+ }
+
+ return -EINVAL;
+}
+
+/**
+ * calipso_pad_write - Writes pad bytes in TLV format
+ * @buf: the buffer
+ * @offset: offset from start of buffer to write padding
+ * @count: number of pad bytes to write
+ *
+ * Description:
+ * Write @count bytes of TLV padding into @buffer starting at offset @offset.
+ * @count should be less than 8 - see RFC 4942.
+ *
+ */
+static int calipso_pad_write(unsigned char *buf, unsigned int offset,
+ unsigned int count)
+{
+ if (WARN_ON_ONCE(count >= 8))
+ return -EINVAL;
+
+ switch (count) {
+ case 0:
+ break;
+ case 1:
+ buf[offset] = IPV6_TLV_PAD1;
+ break;
+ default:
+ buf[offset] = IPV6_TLV_PADN;
+ buf[offset + 1] = count - 2;
+ if (count > 2)
+ memset(buf + offset + 2, 0, count - 2);
+ break;
+ }
+ return 0;
+}
+
+/**
+ * calipso_genopt - Generate a CALIPSO option
+ * @buf: the option buffer
+ * @start: offset from which to write
+ * @buf_len: the size of opt_buf
+ * @doi_def: the CALIPSO DOI to use
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Generate a CALIPSO option using the DOI definition and security attributes
+ * passed to the function. This also generates upto three bytes of leading
+ * padding that ensures that the option is 4n + 2 aligned. It returns the
+ * number of bytes written (including any initial padding).
+ */
+static int calipso_genopt(unsigned char *buf, u32 start, u32 buf_len,
+ const struct calipso_doi *doi_def,
+ const struct netlbl_lsm_secattr *secattr)
+{
+ int ret_val;
+ u32 len, pad;
+ u16 crc;
+ static const unsigned char padding[4] = {2, 1, 0, 3};
+ unsigned char *calipso;
+
+ /* CALIPSO has 4n + 2 alignment */
+ pad = padding[start & 3];
+ if (buf_len <= start + pad + CALIPSO_HDR_LEN)
+ return -ENOSPC;
+
+ if ((secattr->flags & NETLBL_SECATTR_MLS_LVL) == 0)
+ return -EPERM;
+
+ len = CALIPSO_HDR_LEN;
+
+ if (secattr->flags & NETLBL_SECATTR_MLS_CAT) {
+ ret_val = calipso_map_cat_hton(doi_def,
+ secattr,
+ buf + start + pad + len,
+ buf_len - start - pad - len);
+ if (ret_val < 0)
+ return ret_val;
+ len += ret_val;
+ }
+
+ calipso_pad_write(buf, start, pad);
+ calipso = buf + start + pad;
+
+ calipso[0] = IPV6_TLV_CALIPSO;
+ calipso[1] = len - 2;
+ *(__be32 *)(calipso + 2) = htonl(doi_def->doi);
+ calipso[6] = (len - CALIPSO_HDR_LEN) / 4;
+ calipso[7] = secattr->attr.mls.lvl;
+ crc = ~crc_ccitt(0xffff, calipso, len);
+ calipso[8] = crc & 0xff;
+ calipso[9] = (crc >> 8) & 0xff;
+ return pad + len;
+}
+
+/* Hop-by-hop hdr helper functions
+ */
+
+/**
+ * calipso_opt_update - Replaces socket's hop options with a new set
+ * @sk: the socket
+ * @hop: new hop options
+ *
+ * Description:
+ * Replaces @sk's hop options with @hop. @hop may be NULL to leave
+ * the socket with no hop options.
+ *
+ */
+static int calipso_opt_update(struct sock *sk, struct ipv6_opt_hdr *hop)
+{
+ struct ipv6_txoptions *old = txopt_get(inet6_sk(sk)), *txopts;
+
+ txopts = ipv6_renew_options(sk, old, IPV6_HOPOPTS, hop);
+ txopt_put(old);
+ if (IS_ERR(txopts))
+ return PTR_ERR(txopts);
+
+ txopts = ipv6_update_options(sk, txopts);
+ if (txopts) {
+ atomic_sub(txopts->tot_len, &sk->sk_omem_alloc);
+ txopt_put(txopts);
+ }
+
+ return 0;
+}
+
+/**
+ * calipso_tlv_len - Returns the length of the TLV
+ * @opt: the option header
+ * @offset: offset of the TLV within the header
+ *
+ * Description:
+ * Returns the length of the TLV option at offset @offset within
+ * the option header @opt. Checks that the entire TLV fits inside
+ * the option header, returns a negative value if this is not the case.
+ */
+static int calipso_tlv_len(struct ipv6_opt_hdr *opt, unsigned int offset)
+{
+ unsigned char *tlv = (unsigned char *)opt;
+ unsigned int opt_len = ipv6_optlen(opt), tlv_len;
+
+ if (offset < sizeof(*opt) || offset >= opt_len)
+ return -EINVAL;
+ if (tlv[offset] == IPV6_TLV_PAD1)
+ return 1;
+ if (offset + 1 >= opt_len)
+ return -EINVAL;
+ tlv_len = tlv[offset + 1] + 2;
+ if (offset + tlv_len > opt_len)
+ return -EINVAL;
+ return tlv_len;
+}
+
+/**
+ * calipso_opt_find - Finds the CALIPSO option in an IPv6 hop options header
+ * @hop: the hop options header
+ * @start: on return holds the offset of any leading padding
+ * @end: on return holds the offset of the first non-pad TLV after CALIPSO
+ *
+ * Description:
+ * Finds the space occupied by a CALIPSO option (including any leading and
+ * trailing padding).
+ *
+ * If a CALIPSO option exists set @start and @end to the
+ * offsets within @hop of the start of padding before the first
+ * CALIPSO option and the end of padding after the first CALIPSO
+ * option. In this case the function returns 0.
+ *
+ * In the absence of a CALIPSO option, @start and @end will be
+ * set to the start and end of any trailing padding in the header.
+ * This is useful when appending a new option, as the caller may want
+ * to overwrite some of this padding. In this case the function will
+ * return -ENOENT.
+ */
+static int calipso_opt_find(struct ipv6_opt_hdr *hop, unsigned int *start,
+ unsigned int *end)
+{
+ int ret_val = -ENOENT, tlv_len;
+ unsigned int opt_len, offset, offset_s = 0, offset_e = 0;
+ unsigned char *opt = (unsigned char *)hop;
+
+ opt_len = ipv6_optlen(hop);
+ offset = sizeof(*hop);
+
+ while (offset < opt_len) {
+ tlv_len = calipso_tlv_len(hop, offset);
+ if (tlv_len < 0)
+ return tlv_len;
+
+ switch (opt[offset]) {
+ case IPV6_TLV_PAD1:
+ case IPV6_TLV_PADN:
+ if (offset_e)
+ offset_e = offset;
+ break;
+ case IPV6_TLV_CALIPSO:
+ ret_val = 0;
+ offset_e = offset;
+ break;
+ default:
+ if (offset_e == 0)
+ offset_s = offset;
+ else
+ goto out;
+ }
+ offset += tlv_len;
+ }
+
+out:
+ if (offset_s)
+ *start = offset_s + calipso_tlv_len(hop, offset_s);
+ else
+ *start = sizeof(*hop);
+ if (offset_e)
+ *end = offset_e + calipso_tlv_len(hop, offset_e);
+ else
+ *end = opt_len;
+
+ return ret_val;
+}
+
+/**
+ * calipso_opt_insert - Inserts a CALIPSO option into an IPv6 hop opt hdr
+ * @hop: the original hop options header
+ * @doi_def: the CALIPSO DOI to use
+ * @secattr: the specific security attributes of the socket
+ *
+ * Description:
+ * Creates a new hop options header based on @hop with a
+ * CALIPSO option added to it. If @hop already contains a CALIPSO
+ * option this is overwritten, otherwise the new option is appended
+ * after any existing options. If @hop is NULL then the new header
+ * will contain just the CALIPSO option and any needed padding.
+ *
+ */
+static struct ipv6_opt_hdr *
+calipso_opt_insert(struct ipv6_opt_hdr *hop,
+ const struct calipso_doi *doi_def,
+ const struct netlbl_lsm_secattr *secattr)
+{
+ unsigned int start, end, buf_len, pad, hop_len;
+ struct ipv6_opt_hdr *new;
+ int ret_val;
+
+ if (hop) {
+ hop_len = ipv6_optlen(hop);
+ ret_val = calipso_opt_find(hop, &start, &end);
+ if (ret_val && ret_val != -ENOENT)
+ return ERR_PTR(ret_val);
+ } else {
+ hop_len = 0;
+ start = sizeof(*hop);
+ end = 0;
+ }
+
+ buf_len = hop_len + start - end + CALIPSO_OPT_LEN_MAX_WITH_PAD;
+ new = kzalloc(buf_len, GFP_ATOMIC);
+ if (!new)
+ return ERR_PTR(-ENOMEM);
+
+ if (start > sizeof(*hop))
+ memcpy(new, hop, start);
+ ret_val = calipso_genopt((unsigned char *)new, start, buf_len, doi_def,
+ secattr);
+ if (ret_val < 0) {
+ kfree(new);
+ return ERR_PTR(ret_val);
+ }
+
+ buf_len = start + ret_val;
+ /* At this point buf_len aligns to 4n, so (buf_len & 4) pads to 8n */
+ pad = ((buf_len & 4) + (end & 7)) & 7;
+ calipso_pad_write((unsigned char *)new, buf_len, pad);
+ buf_len += pad;
+
+ if (end != hop_len) {
+ memcpy((char *)new + buf_len, (char *)hop + end, hop_len - end);
+ buf_len += hop_len - end;
+ }
+ new->nexthdr = 0;
+ new->hdrlen = buf_len / 8 - 1;
+
+ return new;
+}
+
+/**
+ * calipso_opt_del - Removes the CALIPSO option from an option header
+ * @hop: the original header
+ * @new: the new header
+ *
+ * Description:
+ * Creates a new header based on @hop without any CALIPSO option. If @hop
+ * doesn't contain a CALIPSO option it returns -ENOENT. If @hop contains
+ * no other non-padding options, it returns zero with @new set to NULL.
+ * Otherwise it returns zero, creates a new header without the CALIPSO
+ * option (and removing as much padding as possible) and returns with
+ * @new set to that header.
+ *
+ */
+static int calipso_opt_del(struct ipv6_opt_hdr *hop,
+ struct ipv6_opt_hdr **new)
+{
+ int ret_val;
+ unsigned int start, end, delta, pad, hop_len;
+
+ ret_val = calipso_opt_find(hop, &start, &end);
+ if (ret_val)
+ return ret_val;
+
+ hop_len = ipv6_optlen(hop);
+ if (start == sizeof(*hop) && end == hop_len) {
+ /* There's no other option in the header so return NULL */
+ *new = NULL;
+ return 0;
+ }
+
+ delta = (end - start) & ~7;
+ *new = kzalloc(hop_len - delta, GFP_ATOMIC);
+ if (!*new)
+ return -ENOMEM;
+
+ memcpy(*new, hop, start);
+ (*new)->hdrlen -= delta / 8;
+ pad = (end - start) & 7;
+ calipso_pad_write((unsigned char *)*new, start, pad);
+ if (end != hop_len)
+ memcpy((char *)*new + start + pad, (char *)hop + end,
+ hop_len - end);
+
+ return 0;
+}
+
+/**
+ * calipso_opt_getattr - Get the security attributes from a memory block
+ * @calipso: the CALIPSO option
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Inspect @calipso and return the security attributes in @secattr.
+ * Returns zero on success and negative values on failure.
+ *
+ */
+static int calipso_opt_getattr(const unsigned char *calipso,
+ struct netlbl_lsm_secattr *secattr)
+{
+ int ret_val = -ENOMSG;
+ u32 doi, len = calipso[1], cat_len = calipso[6] * 4;
+ struct calipso_doi *doi_def;
+
+ if (cat_len + 8 > len)
+ return -EINVAL;
+
+ if (calipso_cache_check(calipso + 2, calipso[1], secattr) == 0)
+ return 0;
+
+ doi = get_unaligned_be32(calipso + 2);
+ rcu_read_lock();
+ doi_def = calipso_doi_search(doi);
+ if (!doi_def)
+ goto getattr_return;
+
+ secattr->attr.mls.lvl = calipso[7];
+ secattr->flags |= NETLBL_SECATTR_MLS_LVL;
+
+ if (cat_len) {
+ ret_val = calipso_map_cat_ntoh(doi_def,
+ calipso + 10,
+ cat_len,
+ secattr);
+ if (ret_val != 0) {
+ netlbl_catmap_free(secattr->attr.mls.cat);
+ goto getattr_return;
+ }
+
+ if (secattr->attr.mls.cat)
+ secattr->flags |= NETLBL_SECATTR_MLS_CAT;
+ }
+
+ secattr->type = NETLBL_NLTYPE_CALIPSO;
+
+getattr_return:
+ rcu_read_unlock();
+ return ret_val;
+}
+
+/* sock functions.
+ */
+
+/**
+ * calipso_sock_getattr - Get the security attributes from a sock
+ * @sk: the sock
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Query @sk to see if there is a CALIPSO option attached to the sock and if
+ * there is return the CALIPSO security attributes in @secattr. This function
+ * requires that @sk be locked, or privately held, but it does not do any
+ * locking itself. Returns zero on success and negative values on failure.
+ *
+ */
+static int calipso_sock_getattr(struct sock *sk,
+ struct netlbl_lsm_secattr *secattr)
+{
+ struct ipv6_opt_hdr *hop;
+ int opt_len, len, ret_val = -ENOMSG, offset;
+ unsigned char *opt;
+ struct ipv6_pinfo *pinfo = inet6_sk(sk);
+ struct ipv6_txoptions *txopts;
+
+ if (!pinfo)
+ return -EAFNOSUPPORT;
+
+ txopts = txopt_get(pinfo);
+ if (!txopts || !txopts->hopopt)
+ goto done;
+
+ hop = txopts->hopopt;
+ opt = (unsigned char *)hop;
+ opt_len = ipv6_optlen(hop);
+ offset = sizeof(*hop);
+ while (offset < opt_len) {
+ len = calipso_tlv_len(hop, offset);
+ if (len < 0) {
+ ret_val = len;
+ goto done;
+ }
+ switch (opt[offset]) {
+ case IPV6_TLV_CALIPSO:
+ if (len < CALIPSO_HDR_LEN)
+ ret_val = -EINVAL;
+ else
+ ret_val = calipso_opt_getattr(&opt[offset],
+ secattr);
+ goto done;
+ default:
+ offset += len;
+ break;
+ }
+ }
+done:
+ txopt_put(txopts);
+ return ret_val;
+}
+
+/**
+ * calipso_sock_setattr - Add a CALIPSO option to a socket
+ * @sk: the socket
+ * @doi_def: the CALIPSO DOI to use
+ * @secattr: the specific security attributes of the socket
+ *
+ * Description:
+ * Set the CALIPSO option on the given socket using the DOI definition and
+ * security attributes passed to the function. This function requires
+ * exclusive access to @sk, which means it either needs to be in the
+ * process of being created or locked. Returns zero on success and negative
+ * values on failure.
+ *
+ */
+static int calipso_sock_setattr(struct sock *sk,
+ const struct calipso_doi *doi_def,
+ const struct netlbl_lsm_secattr *secattr)
+{
+ int ret_val;
+ struct ipv6_opt_hdr *old, *new;
+ struct ipv6_pinfo *pinfo = inet6_sk(sk);
+ struct ipv6_txoptions *txopts;
+
+ if (!pinfo)
+ return -EAFNOSUPPORT;
+
+ txopts = txopt_get(pinfo);
+ old = NULL;
+ if (txopts)
+ old = txopts->hopopt;
+
+ new = calipso_opt_insert(old, doi_def, secattr);
+ txopt_put(txopts);
+ if (IS_ERR(new))
+ return PTR_ERR(new);
+
+ ret_val = calipso_opt_update(sk, new);
+
+ kfree(new);
+ return ret_val;
+}
+
+/**
+ * calipso_sock_delattr - Delete the CALIPSO option from a socket
+ * @sk: the socket
+ *
+ * Description:
+ * Removes the CALIPSO option from a socket, if present.
+ *
+ */
+static void calipso_sock_delattr(struct sock *sk)
+{
+ struct ipv6_opt_hdr *new_hop;
+ struct ipv6_pinfo *pinfo = inet6_sk(sk);
+ struct ipv6_txoptions *txopts;
+
+ if (!pinfo)
+ return;
+
+ txopts = txopt_get(pinfo);
+ if (!txopts || !txopts->hopopt)
+ goto done;
+
+ if (calipso_opt_del(txopts->hopopt, &new_hop))
+ goto done;
+
+ calipso_opt_update(sk, new_hop);
+ kfree(new_hop);
+
+done:
+ txopt_put(txopts);
+}
+
+/* request sock functions.
+ */
+
+/**
+ * calipso_req_setattr - Add a CALIPSO option to a connection request socket
+ * @req: the connection request socket
+ * @doi_def: the CALIPSO DOI to use
+ * @secattr: the specific security attributes of the socket
+ *
+ * Description:
+ * Set the CALIPSO option on the given socket using the DOI definition and
+ * security attributes passed to the function. Returns zero on success and
+ * negative values on failure.
+ *
+ */
+static int calipso_req_setattr(struct request_sock *req,
+ const struct calipso_doi *doi_def,
+ const struct netlbl_lsm_secattr *secattr)
+{
+ struct ipv6_txoptions *txopts;
+ struct inet_request_sock *req_inet = inet_rsk(req);
+ struct ipv6_opt_hdr *old, *new;
+ struct sock *sk = sk_to_full_sk(req_to_sk(req));
+
+ /* sk is NULL for SYN+ACK w/ SYN Cookie */
+ if (!sk)
+ return -ENOMEM;
+
+ if (req_inet->ipv6_opt && req_inet->ipv6_opt->hopopt)
+ old = req_inet->ipv6_opt->hopopt;
+ else
+ old = NULL;
+
+ new = calipso_opt_insert(old, doi_def, secattr);
+ if (IS_ERR(new))
+ return PTR_ERR(new);
+
+ txopts = ipv6_renew_options(sk, req_inet->ipv6_opt, IPV6_HOPOPTS, new);
+
+ kfree(new);
+
+ if (IS_ERR(txopts))
+ return PTR_ERR(txopts);
+
+ txopts = xchg(&req_inet->ipv6_opt, txopts);
+ if (txopts) {
+ atomic_sub(txopts->tot_len, &sk->sk_omem_alloc);
+ txopt_put(txopts);
+ }
+
+ return 0;
+}
+
+/**
+ * calipso_req_delattr - Delete the CALIPSO option from a request socket
+ * @req: the request socket
+ *
+ * Description:
+ * Removes the CALIPSO option from a request socket, if present.
+ *
+ */
+static void calipso_req_delattr(struct request_sock *req)
+{
+ struct inet_request_sock *req_inet = inet_rsk(req);
+ struct ipv6_opt_hdr *new;
+ struct ipv6_txoptions *txopts;
+ struct sock *sk = sk_to_full_sk(req_to_sk(req));
+
+ /* sk is NULL for SYN+ACK w/ SYN Cookie */
+ if (!sk)
+ return;
+
+ if (!req_inet->ipv6_opt || !req_inet->ipv6_opt->hopopt)
+ return;
+
+ if (calipso_opt_del(req_inet->ipv6_opt->hopopt, &new))
+ return; /* Nothing to do */
+
+ txopts = ipv6_renew_options(sk, req_inet->ipv6_opt, IPV6_HOPOPTS, new);
+
+ if (!IS_ERR(txopts)) {
+ txopts = xchg(&req_inet->ipv6_opt, txopts);
+ if (txopts) {
+ atomic_sub(txopts->tot_len, &sk->sk_omem_alloc);
+ txopt_put(txopts);
+ }
+ }
+ kfree(new);
+}
+
+/* skbuff functions.
+ */
+
+/**
+ * calipso_skbuff_optptr - Find the CALIPSO option in the packet
+ * @skb: the packet
+ *
+ * Description:
+ * Parse the packet's IP header looking for a CALIPSO option. Returns a pointer
+ * to the start of the CALIPSO option on success, NULL if one if not found.
+ *
+ */
+static unsigned char *calipso_skbuff_optptr(const struct sk_buff *skb)
+{
+ const struct ipv6hdr *ip6_hdr = ipv6_hdr(skb);
+ int offset;
+
+ if (ip6_hdr->nexthdr != NEXTHDR_HOP)
+ return NULL;
+
+ offset = ipv6_find_tlv(skb, sizeof(*ip6_hdr), IPV6_TLV_CALIPSO);
+ if (offset >= 0)
+ return (unsigned char *)ip6_hdr + offset;
+
+ return NULL;
+}
+
+/**
+ * calipso_skbuff_setattr - Set the CALIPSO option on a packet
+ * @skb: the packet
+ * @doi_def: the CALIPSO DOI to use
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Set the CALIPSO option on the given packet based on the security attributes.
+ * Returns a pointer to the IP header on success and NULL on failure.
+ *
+ */
+static int calipso_skbuff_setattr(struct sk_buff *skb,
+ const struct calipso_doi *doi_def,
+ const struct netlbl_lsm_secattr *secattr)
+{
+ int ret_val;
+ struct ipv6hdr *ip6_hdr;
+ struct ipv6_opt_hdr *hop;
+ unsigned char buf[CALIPSO_MAX_BUFFER];
+ int len_delta, new_end, pad, payload;
+ unsigned int start, end;
+
+ ip6_hdr = ipv6_hdr(skb);
+ if (ip6_hdr->nexthdr == NEXTHDR_HOP) {
+ hop = (struct ipv6_opt_hdr *)(ip6_hdr + 1);
+ ret_val = calipso_opt_find(hop, &start, &end);
+ if (ret_val && ret_val != -ENOENT)
+ return ret_val;
+ } else {
+ start = 0;
+ end = 0;
+ }
+
+ memset(buf, 0, sizeof(buf));
+ ret_val = calipso_genopt(buf, start & 3, sizeof(buf), doi_def, secattr);
+ if (ret_val < 0)
+ return ret_val;
+
+ new_end = start + ret_val;
+ /* At this point new_end aligns to 4n, so (new_end & 4) pads to 8n */
+ pad = ((new_end & 4) + (end & 7)) & 7;
+ len_delta = new_end - (int)end + pad;
+ ret_val = skb_cow(skb, skb_headroom(skb) + len_delta);
+ if (ret_val < 0)
+ return ret_val;
+
+ ip6_hdr = ipv6_hdr(skb); /* Reset as skb_cow() may have moved it */
+
+ if (len_delta) {
+ if (len_delta > 0)
+ skb_push(skb, len_delta);
+ else
+ skb_pull(skb, -len_delta);
+ memmove((char *)ip6_hdr - len_delta, ip6_hdr,
+ sizeof(*ip6_hdr) + start);
+ skb_reset_network_header(skb);
+ ip6_hdr = ipv6_hdr(skb);
+ payload = ntohs(ip6_hdr->payload_len);
+ ip6_hdr->payload_len = htons(payload + len_delta);
+ }
+
+ hop = (struct ipv6_opt_hdr *)(ip6_hdr + 1);
+ if (start == 0) {
+ struct ipv6_opt_hdr *new_hop = (struct ipv6_opt_hdr *)buf;
+
+ new_hop->nexthdr = ip6_hdr->nexthdr;
+ new_hop->hdrlen = len_delta / 8 - 1;
+ ip6_hdr->nexthdr = NEXTHDR_HOP;
+ } else {
+ hop->hdrlen += len_delta / 8;
+ }
+ memcpy((char *)hop + start, buf + (start & 3), new_end - start);
+ calipso_pad_write((unsigned char *)hop, new_end, pad);
+
+ return 0;
+}
+
+/**
+ * calipso_skbuff_delattr - Delete any CALIPSO options from a packet
+ * @skb: the packet
+ *
+ * Description:
+ * Removes any and all CALIPSO options from the given packet. Returns zero on
+ * success, negative values on failure.
+ *
+ */
+static int calipso_skbuff_delattr(struct sk_buff *skb)
+{
+ int ret_val;
+ struct ipv6hdr *ip6_hdr;
+ struct ipv6_opt_hdr *old_hop;
+ u32 old_hop_len, start = 0, end = 0, delta, size, pad;
+
+ if (!calipso_skbuff_optptr(skb))
+ return 0;
+
+ /* since we are changing the packet we should make a copy */
+ ret_val = skb_cow(skb, skb_headroom(skb));
+ if (ret_val < 0)
+ return ret_val;
+
+ ip6_hdr = ipv6_hdr(skb);
+ old_hop = (struct ipv6_opt_hdr *)(ip6_hdr + 1);
+ old_hop_len = ipv6_optlen(old_hop);
+
+ ret_val = calipso_opt_find(old_hop, &start, &end);
+ if (ret_val)
+ return ret_val;
+
+ if (start == sizeof(*old_hop) && end == old_hop_len) {
+ /* There's no other option in the header so we delete
+ * the whole thing. */
+ delta = old_hop_len;
+ size = sizeof(*ip6_hdr);
+ ip6_hdr->nexthdr = old_hop->nexthdr;
+ } else {
+ delta = (end - start) & ~7;
+ if (delta)
+ old_hop->hdrlen -= delta / 8;
+ pad = (end - start) & 7;
+ size = sizeof(*ip6_hdr) + start + pad;
+ calipso_pad_write((unsigned char *)old_hop, start, pad);
+ }
+
+ if (delta) {
+ skb_pull(skb, delta);
+ memmove((char *)ip6_hdr + delta, ip6_hdr, size);
+ skb_reset_network_header(skb);
+ }
+
+ return 0;
+}
+
+static const struct netlbl_calipso_ops ops = {
+ .doi_add = calipso_doi_add,
+ .doi_free = calipso_doi_free,
+ .doi_remove = calipso_doi_remove,
+ .doi_getdef = calipso_doi_getdef,
+ .doi_putdef = calipso_doi_putdef,
+ .doi_walk = calipso_doi_walk,
+ .sock_getattr = calipso_sock_getattr,
+ .sock_setattr = calipso_sock_setattr,
+ .sock_delattr = calipso_sock_delattr,
+ .req_setattr = calipso_req_setattr,
+ .req_delattr = calipso_req_delattr,
+ .opt_getattr = calipso_opt_getattr,
+ .skbuff_optptr = calipso_skbuff_optptr,
+ .skbuff_setattr = calipso_skbuff_setattr,
+ .skbuff_delattr = calipso_skbuff_delattr,
+ .cache_invalidate = calipso_cache_invalidate,
+ .cache_add = calipso_cache_add
+};
+
+/**
+ * calipso_init - Initialize the CALIPSO module
+ *
+ * Description:
+ * Initialize the CALIPSO module and prepare it for use. Returns zero on
+ * success and negative values on failure.
+ *
+ */
+int __init calipso_init(void)
+{
+ int ret_val;
+
+ ret_val = calipso_cache_init();
+ if (!ret_val)
+ netlbl_calipso_ops_register(&ops);
+ return ret_val;
+}
+
+void calipso_exit(void)
+{
+ netlbl_calipso_ops_register(NULL);
+ calipso_cache_invalidate();
+ kfree(calipso_cache);
+}
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 5c94fea90e97..83e03176819c 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -1,29 +1,25 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* common UDP/RAW code
- * Linux INET6 implementation
+ * Linux INET6 implementation
*
* Authors:
- * Pedro Roque <roque@di.fc.ul.pt>
- *
- * $Id: datagram.c,v 1.24 2002/02/01 22:01:04 davem Exp $
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
+ * Pedro Roque <roque@di.fc.ul.pt>
*/
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/kernel.h>
-#include <linux/sched.h>
#include <linux/interrupt.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in6.h>
#include <linux/ipv6.h>
#include <linux/route.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/icmp.h>
#include <net/ipv6.h>
#include <net/ndisc.h>
@@ -31,61 +27,165 @@
#include <net/transp_v6.h>
#include <net/ip6_route.h>
#include <net/tcp_states.h>
+#include <net/dsfield.h>
+#include <net/sock_reuseport.h>
#include <linux/errqueue.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
+
+static bool ipv6_mapped_addr_any(const struct in6_addr *a)
+{
+ return ipv6_addr_v4mapped(a) && (a->s6_addr32[3] == 0);
+}
+
+static void ip6_datagram_flow_key_init(struct flowi6 *fl6,
+ const struct sock *sk)
+{
+ const struct inet_sock *inet = inet_sk(sk);
+ const struct ipv6_pinfo *np = inet6_sk(sk);
+ int oif = sk->sk_bound_dev_if;
+
+ memset(fl6, 0, sizeof(*fl6));
+ fl6->flowi6_proto = sk->sk_protocol;
+ fl6->daddr = sk->sk_v6_daddr;
+ fl6->saddr = np->saddr;
+ fl6->flowi6_mark = sk->sk_mark;
+ fl6->fl6_dport = inet->inet_dport;
+ fl6->fl6_sport = inet->inet_sport;
+ fl6->flowlabel = ip6_make_flowinfo(np->tclass, np->flow_label);
+ fl6->flowi6_uid = sk_uid(sk);
+
+ if (!oif)
+ oif = np->sticky_pktinfo.ipi6_ifindex;
+
+ if (!oif) {
+ if (ipv6_addr_is_multicast(&fl6->daddr))
+ oif = READ_ONCE(np->mcast_oif);
+ else
+ oif = READ_ONCE(np->ucast_oif);
+ }
+
+ fl6->flowi6_oif = oif;
+ security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6));
+}
+
+int ip6_datagram_dst_update(struct sock *sk, bool fix_sk_saddr)
+{
+ struct ip6_flowlabel *flowlabel = NULL;
+ struct in6_addr *final_p, final;
+ struct ipv6_txoptions *opt;
+ struct dst_entry *dst;
+ struct inet_sock *inet = inet_sk(sk);
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct flowi6 fl6;
+ int err = 0;
+
+ if (inet6_test_bit(SNDFLOW, sk) &&
+ (np->flow_label & IPV6_FLOWLABEL_MASK)) {
+ flowlabel = fl6_sock_lookup(sk, np->flow_label);
+ if (IS_ERR(flowlabel))
+ return -EINVAL;
+ }
+ ip6_datagram_flow_key_init(&fl6, sk);
+
+ rcu_read_lock();
+ opt = flowlabel ? flowlabel->opt : rcu_dereference(np->opt);
+ final_p = fl6_update_dst(&fl6, opt, &final);
+ rcu_read_unlock();
+
+ dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p);
+ if (IS_ERR(dst)) {
+ err = PTR_ERR(dst);
+ goto out;
+ }
+
+ if (fix_sk_saddr) {
+ if (ipv6_addr_any(&np->saddr))
+ np->saddr = fl6.saddr;
+
+ if (ipv6_addr_any(&sk->sk_v6_rcv_saddr)) {
+ sk->sk_v6_rcv_saddr = fl6.saddr;
+ inet->inet_rcv_saddr = LOOPBACK4_IPV6;
+ if (sk->sk_prot->rehash)
+ sk->sk_prot->rehash(sk);
+ }
+ }
+
+ ip6_sk_dst_store_flow(sk, dst, &fl6);
+
+out:
+ fl6_sock_release(flowlabel);
+ return err;
+}
+
+void ip6_datagram_release_cb(struct sock *sk)
+{
+ struct dst_entry *dst;
+
+ if (ipv6_addr_v4mapped(&sk->sk_v6_daddr))
+ return;
+
+ rcu_read_lock();
+ dst = __sk_dst_get(sk);
+ if (!dst || !READ_ONCE(dst->obsolete) ||
+ dst->ops->check(dst, inet6_sk(sk)->dst_cookie)) {
+ rcu_read_unlock();
+ return;
+ }
+ rcu_read_unlock();
+
+ ip6_datagram_dst_update(sk, false);
+}
+EXPORT_SYMBOL_GPL(ip6_datagram_release_cb);
-int ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+int __ip6_datagram_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
+ int addr_len)
{
struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr;
- struct inet_sock *inet = inet_sk(sk);
- struct ipv6_pinfo *np = inet6_sk(sk);
- struct in6_addr *daddr, *final_p = NULL, final;
- struct dst_entry *dst;
- struct flowi fl;
- struct ip6_flowlabel *flowlabel = NULL;
+ struct inet_sock *inet = inet_sk(sk);
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct in6_addr *daddr, old_daddr;
+ __be32 fl6_flowlabel = 0;
+ __be32 old_fl6_flowlabel;
+ __be16 old_dport;
int addr_type;
int err;
if (usin->sin6_family == AF_INET) {
- if (__ipv6_only_sock(sk))
+ if (ipv6_only_sock(sk))
return -EAFNOSUPPORT;
- err = ip4_datagram_connect(sk, uaddr, addr_len);
+ err = __ip4_datagram_connect(sk, uaddr, addr_len);
goto ipv4_connected;
}
if (addr_len < SIN6_LEN_RFC2133)
- return -EINVAL;
+ return -EINVAL;
- if (usin->sin6_family != AF_INET6)
- return -EAFNOSUPPORT;
-
- memset(&fl, 0, sizeof(fl));
- if (np->sndflow) {
- fl.fl6_flowlabel = usin->sin6_flowinfo&IPV6_FLOWINFO_MASK;
- if (fl.fl6_flowlabel&IPV6_FLOWLABEL_MASK) {
- flowlabel = fl6_sock_lookup(sk, fl.fl6_flowlabel);
- if (flowlabel == NULL)
- return -EINVAL;
- ipv6_addr_copy(&usin->sin6_addr, &flowlabel->dst);
- }
- }
+ if (usin->sin6_family != AF_INET6)
+ return -EAFNOSUPPORT;
- addr_type = ipv6_addr_type(&usin->sin6_addr);
+ if (inet6_test_bit(SNDFLOW, sk))
+ fl6_flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK;
- if (addr_type == IPV6_ADDR_ANY) {
+ if (ipv6_addr_any(&usin->sin6_addr)) {
/*
* connect to self
*/
- usin->sin6_addr.s6_addr[15] = 0x01;
+ if (ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr))
+ ipv6_addr_set_v4mapped(htonl(INADDR_LOOPBACK),
+ &usin->sin6_addr);
+ else
+ usin->sin6_addr = in6addr_loopback;
}
+ addr_type = ipv6_addr_type(&usin->sin6_addr);
+
daddr = &usin->sin6_addr;
- if (addr_type == IPV6_ADDR_MAPPED) {
+ if (addr_type & IPV6_ADDR_MAPPED) {
struct sockaddr_in sin;
- if (__ipv6_only_sock(sk)) {
+ if (ipv6_only_sock(sk)) {
err = -ENETUNREACH;
goto out;
}
@@ -93,42 +193,44 @@ int ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
sin.sin_addr.s_addr = daddr->s6_addr32[3];
sin.sin_port = usin->sin6_port;
- err = ip4_datagram_connect(sk,
- (struct sockaddr*) &sin,
- sizeof(sin));
+ err = __ip4_datagram_connect(sk,
+ (struct sockaddr_unsized *)&sin,
+ sizeof(sin));
ipv4_connected:
if (err)
goto out;
-
- ipv6_addr_set(&np->daddr, 0, 0, htonl(0x0000ffff), inet->daddr);
- if (ipv6_addr_any(&np->saddr)) {
- ipv6_addr_set(&np->saddr, 0, 0, htonl(0x0000ffff),
- inet->saddr);
- }
+ ipv6_addr_set_v4mapped(inet->inet_daddr, &sk->sk_v6_daddr);
+
+ if (ipv6_addr_any(&np->saddr) ||
+ ipv6_mapped_addr_any(&np->saddr))
+ ipv6_addr_set_v4mapped(inet->inet_saddr, &np->saddr);
- if (ipv6_addr_any(&np->rcv_saddr)) {
- ipv6_addr_set(&np->rcv_saddr, 0, 0, htonl(0x0000ffff),
- inet->rcv_saddr);
+ if (ipv6_addr_any(&sk->sk_v6_rcv_saddr) ||
+ ipv6_mapped_addr_any(&sk->sk_v6_rcv_saddr)) {
+ ipv6_addr_set_v4mapped(inet->inet_rcv_saddr,
+ &sk->sk_v6_rcv_saddr);
+ if (sk->sk_prot->rehash)
+ sk->sk_prot->rehash(sk);
}
+
goto out;
}
- if (addr_type&IPV6_ADDR_LINKLOCAL) {
+ if (__ipv6_addr_needs_scope_id(addr_type)) {
if (addr_len >= sizeof(struct sockaddr_in6) &&
usin->sin6_scope_id) {
- if (sk->sk_bound_dev_if &&
- sk->sk_bound_dev_if != usin->sin6_scope_id) {
+ if (!sk_dev_equal_l3scope(sk, usin->sin6_scope_id)) {
err = -EINVAL;
goto out;
}
- sk->sk_bound_dev_if = usin->sin6_scope_id;
- if (!sk->sk_bound_dev_if &&
- (addr_type & IPV6_ADDR_MULTICAST))
- fl.oif = np->mcast_oif;
+ WRITE_ONCE(sk->sk_bound_dev_if, usin->sin6_scope_id);
}
+ if (!sk->sk_bound_dev_if && (addr_type & IPV6_ADDR_MULTICAST))
+ WRITE_ONCE(sk->sk_bound_dev_if, READ_ONCE(np->mcast_oif));
+
/* Connect to link-local address requires an interface */
if (!sk->sk_bound_dev_if) {
err = -EINVAL;
@@ -136,153 +238,225 @@ ipv4_connected:
}
}
- ipv6_addr_copy(&np->daddr, daddr);
- np->flow_label = fl.fl6_flowlabel;
+ /* save the current peer information before updating it */
+ old_daddr = sk->sk_v6_daddr;
+ old_fl6_flowlabel = np->flow_label;
+ old_dport = inet->inet_dport;
- inet->dport = usin->sin6_port;
+ sk->sk_v6_daddr = *daddr;
+ np->flow_label = fl6_flowlabel;
+ inet->inet_dport = usin->sin6_port;
/*
* Check for a route to destination an obtain the
* destination cache for it.
*/
- fl.proto = sk->sk_protocol;
- ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
- ipv6_addr_copy(&fl.fl6_src, &np->saddr);
- fl.oif = sk->sk_bound_dev_if;
- fl.fl_ip_dport = inet->dport;
- fl.fl_ip_sport = inet->sport;
-
- if (!fl.oif && (addr_type&IPV6_ADDR_MULTICAST))
- fl.oif = np->mcast_oif;
-
- security_sk_classify_flow(sk, &fl);
-
- if (flowlabel) {
- if (flowlabel->opt && flowlabel->opt->srcrt) {
- struct rt0_hdr *rt0 = (struct rt0_hdr *) flowlabel->opt->srcrt;
- ipv6_addr_copy(&final, &fl.fl6_dst);
- ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
- final_p = &final;
- }
- } else if (np->opt && np->opt->srcrt) {
- struct rt0_hdr *rt0 = (struct rt0_hdr *)np->opt->srcrt;
- ipv6_addr_copy(&final, &fl.fl6_dst);
- ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
- final_p = &final;
+ err = ip6_datagram_dst_update(sk, true);
+ if (err) {
+ /* Restore the socket peer info, to keep it consistent with
+ * the old socket state
+ */
+ sk->sk_v6_daddr = old_daddr;
+ np->flow_label = old_fl6_flowlabel;
+ inet->inet_dport = old_dport;
+ goto out;
}
- err = ip6_dst_lookup(sk, &dst, &fl);
- if (err)
- goto out;
- if (final_p)
- ipv6_addr_copy(&fl.fl6_dst, final_p);
+ reuseport_has_conns_set(sk);
+ sk->sk_state = TCP_ESTABLISHED;
+ sk_set_txhash(sk);
+out:
+ return err;
+}
+EXPORT_SYMBOL_GPL(__ip6_datagram_connect);
- if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0)
- goto out;
+int ip6_datagram_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len)
+{
+ int res;
- /* source address lookup done in ip6_dst_lookup */
+ lock_sock(sk);
+ res = __ip6_datagram_connect(sk, uaddr, addr_len);
+ release_sock(sk);
+ return res;
+}
+EXPORT_SYMBOL_GPL(ip6_datagram_connect);
- if (ipv6_addr_any(&np->saddr))
- ipv6_addr_copy(&np->saddr, &fl.fl6_src);
+int ip6_datagram_connect_v6_only(struct sock *sk, struct sockaddr_unsized *uaddr,
+ int addr_len)
+{
+ DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, uaddr);
+ if (sin6->sin6_family != AF_INET6)
+ return -EAFNOSUPPORT;
+ return ip6_datagram_connect(sk, uaddr, addr_len);
+}
+EXPORT_SYMBOL_GPL(ip6_datagram_connect_v6_only);
- if (ipv6_addr_any(&np->rcv_saddr)) {
- ipv6_addr_copy(&np->rcv_saddr, &fl.fl6_src);
- inet->rcv_saddr = LOOPBACK4_IPV6;
+static void ipv6_icmp_error_rfc4884(const struct sk_buff *skb,
+ struct sock_ee_data_rfc4884 *out)
+{
+ switch (icmp6_hdr(skb)->icmp6_type) {
+ case ICMPV6_TIME_EXCEED:
+ case ICMPV6_DEST_UNREACH:
+ ip_icmp_error_rfc4884(skb, out, sizeof(struct icmp6hdr),
+ icmp6_hdr(skb)->icmp6_datagram_len * 8);
}
-
- ip6_dst_store(sk, dst,
- ipv6_addr_equal(&fl.fl6_dst, &np->daddr) ?
- &np->daddr : NULL,
-#ifdef CONFIG_IPV6_SUBTREES
- ipv6_addr_equal(&fl.fl6_src, &np->saddr) ?
- &np->saddr :
-#endif
- NULL);
-
- sk->sk_state = TCP_ESTABLISHED;
-out:
- fl6_sock_release(flowlabel);
- return err;
}
-void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err,
+void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err,
__be16 port, u32 info, u8 *payload)
{
- struct ipv6_pinfo *np = inet6_sk(sk);
- struct icmp6hdr *icmph = (struct icmp6hdr *)skb->h.raw;
+ struct icmp6hdr *icmph = icmp6_hdr(skb);
struct sock_exterr_skb *serr;
- if (!np->recverr)
+ if (!inet6_test_bit(RECVERR6, sk))
return;
skb = skb_clone(skb, GFP_ATOMIC);
if (!skb)
return;
+ skb->protocol = htons(ETH_P_IPV6);
+
serr = SKB_EXT_ERR(skb);
serr->ee.ee_errno = err;
serr->ee.ee_origin = SO_EE_ORIGIN_ICMP6;
- serr->ee.ee_type = icmph->icmp6_type;
+ serr->ee.ee_type = icmph->icmp6_type;
serr->ee.ee_code = icmph->icmp6_code;
serr->ee.ee_pad = 0;
serr->ee.ee_info = info;
serr->ee.ee_data = 0;
- serr->addr_offset = (u8*)&(((struct ipv6hdr*)(icmph+1))->daddr) - skb->nh.raw;
+ serr->addr_offset = (u8 *)&(((struct ipv6hdr *)(icmph + 1))->daddr) -
+ skb_network_header(skb);
serr->port = port;
- skb->h.raw = payload;
__skb_pull(skb, payload - skb->data);
+ if (inet6_test_bit(RECVERR6_RFC4884, sk))
+ ipv6_icmp_error_rfc4884(skb, &serr->ee.ee_rfc4884);
+
+ skb_reset_transport_header(skb);
+
if (sock_queue_err_skb(sk, skb))
kfree_skb(skb);
}
+EXPORT_SYMBOL_GPL(ipv6_icmp_error);
-void ipv6_local_error(struct sock *sk, int err, struct flowi *fl, u32 info)
+void ipv6_local_error(struct sock *sk, int err, struct flowi6 *fl6, u32 info)
{
- struct ipv6_pinfo *np = inet6_sk(sk);
struct sock_exterr_skb *serr;
struct ipv6hdr *iph;
struct sk_buff *skb;
- if (!np->recverr)
+ if (!inet6_test_bit(RECVERR6, sk))
return;
skb = alloc_skb(sizeof(struct ipv6hdr), GFP_ATOMIC);
if (!skb)
return;
- iph = (struct ipv6hdr*)skb_put(skb, sizeof(struct ipv6hdr));
- skb->nh.ipv6h = iph;
- ipv6_addr_copy(&iph->daddr, &fl->fl6_dst);
+ skb->protocol = htons(ETH_P_IPV6);
+
+ skb_put(skb, sizeof(struct ipv6hdr));
+ skb_reset_network_header(skb);
+ iph = ipv6_hdr(skb);
+ iph->daddr = fl6->daddr;
+ ip6_flow_hdr(iph, 0, 0);
serr = SKB_EXT_ERR(skb);
serr->ee.ee_errno = err;
serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL;
- serr->ee.ee_type = 0;
+ serr->ee.ee_type = 0;
serr->ee.ee_code = 0;
serr->ee.ee_pad = 0;
serr->ee.ee_info = info;
serr->ee.ee_data = 0;
- serr->addr_offset = (u8*)&iph->daddr - skb->nh.raw;
- serr->port = fl->fl_ip_dport;
+ serr->addr_offset = (u8 *)&iph->daddr - skb_network_header(skb);
+ serr->port = fl6->fl6_dport;
- skb->h.raw = skb->tail;
- __skb_pull(skb, skb->tail - skb->data);
+ __skb_pull(skb, skb_tail_pointer(skb) - skb->data);
+ skb_reset_transport_header(skb);
if (sock_queue_err_skb(sk, skb))
kfree_skb(skb);
}
-/*
+void ipv6_local_rxpmtu(struct sock *sk, struct flowi6 *fl6, u32 mtu)
+{
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct ipv6hdr *iph;
+ struct sk_buff *skb;
+ struct ip6_mtuinfo *mtu_info;
+
+ if (!np->rxopt.bits.rxpmtu)
+ return;
+
+ skb = alloc_skb(sizeof(struct ipv6hdr), GFP_ATOMIC);
+ if (!skb)
+ return;
+
+ skb_put(skb, sizeof(struct ipv6hdr));
+ skb_reset_network_header(skb);
+ iph = ipv6_hdr(skb);
+ iph->daddr = fl6->daddr;
+
+ mtu_info = IP6CBMTU(skb);
+
+ mtu_info->ip6m_mtu = mtu;
+ mtu_info->ip6m_addr.sin6_family = AF_INET6;
+ mtu_info->ip6m_addr.sin6_port = 0;
+ mtu_info->ip6m_addr.sin6_flowinfo = 0;
+ mtu_info->ip6m_addr.sin6_scope_id = fl6->flowi6_oif;
+ mtu_info->ip6m_addr.sin6_addr = ipv6_hdr(skb)->daddr;
+
+ __skb_pull(skb, skb_tail_pointer(skb) - skb->data);
+ skb_reset_transport_header(skb);
+
+ skb = xchg(&np->rxpmtu, skb);
+ kfree_skb(skb);
+}
+
+/* For some errors we have valid addr_offset even with zero payload and
+ * zero port. Also, addr_offset should be supported if port is set.
+ */
+static inline bool ipv6_datagram_support_addr(struct sock_exterr_skb *serr)
+{
+ return serr->ee.ee_origin == SO_EE_ORIGIN_ICMP6 ||
+ serr->ee.ee_origin == SO_EE_ORIGIN_ICMP ||
+ serr->ee.ee_origin == SO_EE_ORIGIN_LOCAL || serr->port;
+}
+
+/* IPv6 supports cmsg on all origins aside from SO_EE_ORIGIN_LOCAL.
+ *
+ * At one point, excluding local errors was a quick test to identify icmp/icmp6
+ * errors. This is no longer true, but the test remained, so the v6 stack,
+ * unlike v4, also honors cmsg requests on all wifi and timestamp errors.
+ */
+static bool ip6_datagram_support_cmsg(struct sk_buff *skb,
+ struct sock_exterr_skb *serr)
+{
+ if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP ||
+ serr->ee.ee_origin == SO_EE_ORIGIN_ICMP6)
+ return true;
+
+ if (serr->ee.ee_origin == SO_EE_ORIGIN_LOCAL)
+ return false;
+
+ if (!IP6CB(skb)->iif)
+ return false;
+
+ return true;
+}
+
+/*
* Handle MSG_ERRQUEUE
*/
-int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len)
+int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
{
struct ipv6_pinfo *np = inet6_sk(sk);
struct sock_exterr_skb *serr;
- struct sk_buff *skb, *skb2;
- struct sockaddr_in6 *sin;
+ struct sk_buff *skb;
+ DECLARE_SOCKADDR(struct sockaddr_in6 *, sin, msg->msg_name);
struct {
struct sock_extended_err ee;
struct sockaddr_in6 offender;
@@ -291,8 +465,8 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len)
int copied;
err = -EAGAIN;
- skb = skb_dequeue(&sk->sk_error_queue);
- if (skb == NULL)
+ skb = sock_dequeue_err_skb(sk);
+ if (!skb)
goto out;
copied = skb->len;
@@ -300,54 +474,56 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len)
msg->msg_flags |= MSG_TRUNC;
copied = len;
}
- err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
- if (err)
- goto out_free_skb;
-
+ err = skb_copy_datagram_msg(skb, 0, msg, copied);
+ if (unlikely(err)) {
+ kfree_skb(skb);
+ return err;
+ }
sock_recv_timestamp(msg, sk, skb);
serr = SKB_EXT_ERR(skb);
- sin = (struct sockaddr_in6 *)msg->msg_name;
- if (sin) {
+ if (sin && ipv6_datagram_support_addr(serr)) {
+ const unsigned char *nh = skb_network_header(skb);
sin->sin6_family = AF_INET6;
sin->sin6_flowinfo = 0;
- sin->sin6_port = serr->port;
- sin->sin6_scope_id = 0;
- if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP6) {
- ipv6_addr_copy(&sin->sin6_addr,
- (struct in6_addr *)(skb->nh.raw + serr->addr_offset));
- if (np->sndflow)
- sin->sin6_flowinfo = *(__be32*)(skb->nh.raw + serr->addr_offset - 24) & IPV6_FLOWINFO_MASK;
- if (ipv6_addr_type(&sin->sin6_addr) & IPV6_ADDR_LINKLOCAL)
- sin->sin6_scope_id = IP6CB(skb)->iif;
+ sin->sin6_port = serr->port;
+ if (skb->protocol == htons(ETH_P_IPV6)) {
+ const struct ipv6hdr *ip6h = container_of((struct in6_addr *)(nh + serr->addr_offset),
+ struct ipv6hdr, daddr);
+ sin->sin6_addr = ip6h->daddr;
+ if (inet6_test_bit(SNDFLOW, sk))
+ sin->sin6_flowinfo = ip6_flowinfo(ip6h);
+ sin->sin6_scope_id =
+ ipv6_iface_scope_id(&sin->sin6_addr,
+ IP6CB(skb)->iif);
} else {
- ipv6_addr_set(&sin->sin6_addr, 0, 0,
- htonl(0xffff),
- *(__be32*)(skb->nh.raw + serr->addr_offset));
+ ipv6_addr_set_v4mapped(*(__be32 *)(nh + serr->addr_offset),
+ &sin->sin6_addr);
+ sin->sin6_scope_id = 0;
}
+ *addr_len = sizeof(*sin);
}
memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err));
sin = &errhdr.offender;
- sin->sin6_family = AF_UNSPEC;
- if (serr->ee.ee_origin != SO_EE_ORIGIN_LOCAL) {
+ memset(sin, 0, sizeof(*sin));
+
+ if (ip6_datagram_support_cmsg(skb, serr)) {
sin->sin6_family = AF_INET6;
- sin->sin6_flowinfo = 0;
- sin->sin6_scope_id = 0;
- if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP6) {
- ipv6_addr_copy(&sin->sin6_addr, &skb->nh.ipv6h->saddr);
+ if (np->rxopt.all)
+ ip6_datagram_recv_common_ctl(sk, msg, skb);
+ if (skb->protocol == htons(ETH_P_IPV6)) {
+ sin->sin6_addr = ipv6_hdr(skb)->saddr;
if (np->rxopt.all)
- datagram_recv_ctl(sk, msg, skb);
- if (ipv6_addr_type(&sin->sin6_addr) & IPV6_ADDR_LINKLOCAL)
- sin->sin6_scope_id = IP6CB(skb)->iif;
+ ip6_datagram_recv_specific_ctl(sk, msg, skb);
+ sin->sin6_scope_id =
+ ipv6_iface_scope_id(&sin->sin6_addr,
+ IP6CB(skb)->iif);
} else {
- struct inet_sock *inet = inet_sk(sk);
-
- ipv6_addr_set(&sin->sin6_addr, 0, 0,
- htonl(0xffff),
- skb->nh.iph->saddr);
- if (inet->cmsg_flags)
+ ipv6_addr_set_v4mapped(ip_hdr(skb)->saddr,
+ &sin->sin6_addr);
+ if (inet_cmsg_flags(inet_sk(sk)))
ip_cmsg_recv(msg, skb);
}
}
@@ -359,56 +535,114 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len)
msg->msg_flags |= MSG_ERRQUEUE;
err = copied;
- /* Reset and regenerate socket error */
- spin_lock_bh(&sk->sk_error_queue.lock);
- sk->sk_err = 0;
- if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) {
- sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
- spin_unlock_bh(&sk->sk_error_queue.lock);
- sk->sk_error_report(sk);
- } else {
- spin_unlock_bh(&sk->sk_error_queue.lock);
+ consume_skb(skb);
+out:
+ return err;
+}
+EXPORT_SYMBOL_GPL(ipv6_recv_error);
+
+/*
+ * Handle IPV6_RECVPATHMTU
+ */
+int ipv6_recv_rxpmtu(struct sock *sk, struct msghdr *msg, int len,
+ int *addr_len)
+{
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct sk_buff *skb;
+ struct ip6_mtuinfo mtu_info;
+ DECLARE_SOCKADDR(struct sockaddr_in6 *, sin, msg->msg_name);
+ int err;
+ int copied;
+
+ err = -EAGAIN;
+ skb = xchg(&np->rxpmtu, NULL);
+ if (!skb)
+ goto out;
+
+ copied = skb->len;
+ if (copied > len) {
+ msg->msg_flags |= MSG_TRUNC;
+ copied = len;
+ }
+ err = skb_copy_datagram_msg(skb, 0, msg, copied);
+ if (err)
+ goto out_free_skb;
+
+ sock_recv_timestamp(msg, sk, skb);
+
+ memcpy(&mtu_info, IP6CBMTU(skb), sizeof(mtu_info));
+
+ if (sin) {
+ sin->sin6_family = AF_INET6;
+ sin->sin6_flowinfo = 0;
+ sin->sin6_port = 0;
+ sin->sin6_scope_id = mtu_info.ip6m_addr.sin6_scope_id;
+ sin->sin6_addr = mtu_info.ip6m_addr.sin6_addr;
+ *addr_len = sizeof(*sin);
}
-out_free_skb:
+ put_cmsg(msg, SOL_IPV6, IPV6_PATHMTU, sizeof(mtu_info), &mtu_info);
+
+ err = copied;
+
+out_free_skb:
kfree_skb(skb);
out:
return err;
}
-
-int datagram_recv_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb)
+void ip6_datagram_recv_common_ctl(struct sock *sk, struct msghdr *msg,
+ struct sk_buff *skb)
{
struct ipv6_pinfo *np = inet6_sk(sk);
- struct inet6_skb_parm *opt = IP6CB(skb);
+ bool is_ipv6 = skb->protocol == htons(ETH_P_IPV6);
if (np->rxopt.bits.rxinfo) {
struct in6_pktinfo src_info;
- src_info.ipi6_ifindex = opt->iif;
- ipv6_addr_copy(&src_info.ipi6_addr, &skb->nh.ipv6h->daddr);
- put_cmsg(msg, SOL_IPV6, IPV6_PKTINFO, sizeof(src_info), &src_info);
+ if (is_ipv6) {
+ src_info.ipi6_ifindex = IP6CB(skb)->iif;
+ src_info.ipi6_addr = ipv6_hdr(skb)->daddr;
+ } else {
+ src_info.ipi6_ifindex =
+ PKTINFO_SKB_CB(skb)->ipi_ifindex;
+ ipv6_addr_set_v4mapped(ip_hdr(skb)->daddr,
+ &src_info.ipi6_addr);
+ }
+
+ if (src_info.ipi6_ifindex >= 0)
+ put_cmsg(msg, SOL_IPV6, IPV6_PKTINFO,
+ sizeof(src_info), &src_info);
}
+}
+
+void ip6_datagram_recv_specific_ctl(struct sock *sk, struct msghdr *msg,
+ struct sk_buff *skb)
+{
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct inet6_skb_parm *opt = IP6CB(skb);
+ unsigned char *nh = skb_network_header(skb);
if (np->rxopt.bits.rxhlim) {
- int hlim = skb->nh.ipv6h->hop_limit;
+ int hlim = ipv6_hdr(skb)->hop_limit;
put_cmsg(msg, SOL_IPV6, IPV6_HOPLIMIT, sizeof(hlim), &hlim);
}
if (np->rxopt.bits.rxtclass) {
- int tclass = (ntohl(*(__be32 *)skb->nh.ipv6h) >> 20) & 0xff;
+ int tclass = ipv6_get_dsfield(ipv6_hdr(skb));
put_cmsg(msg, SOL_IPV6, IPV6_TCLASS, sizeof(tclass), &tclass);
}
- if (np->rxopt.bits.rxflow && (*(__be32*)skb->nh.raw & IPV6_FLOWINFO_MASK)) {
- __be32 flowinfo = *(__be32*)skb->nh.raw & IPV6_FLOWINFO_MASK;
- put_cmsg(msg, SOL_IPV6, IPV6_FLOWINFO, sizeof(flowinfo), &flowinfo);
+ if (np->rxopt.bits.rxflow) {
+ __be32 flowinfo = ip6_flowinfo((struct ipv6hdr *)nh);
+ if (flowinfo)
+ put_cmsg(msg, SOL_IPV6, IPV6_FLOWINFO, sizeof(flowinfo), &flowinfo);
}
/* HbH is allowed only once */
- if (np->rxopt.bits.hopopts && opt->hop) {
- u8 *ptr = skb->nh.raw + opt->hop;
+ if (np->rxopt.bits.hopopts && (opt->flags & IP6SKB_HOPBYHOP)) {
+ u8 *ptr = nh + sizeof(struct ipv6hdr);
put_cmsg(msg, SOL_IPV6, IPV6_HOPOPTS, (ptr[1]+1)<<3, ptr);
}
@@ -419,18 +653,18 @@ int datagram_recv_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb)
* report extension headers (except for HbH)
* in order.
*
- * Also note that IPV6_RECVRTHDRDSTOPTS is NOT
+ * Also note that IPV6_RECVRTHDRDSTOPTS is NOT
* (and WILL NOT be) defined because
* IPV6_RECVDSTOPTS is more generic. --yoshfuji
*/
unsigned int off = sizeof(struct ipv6hdr);
- u8 nexthdr = skb->nh.ipv6h->nexthdr;
+ u8 nexthdr = ipv6_hdr(skb)->nexthdr;
while (off <= opt->lastopt) {
- unsigned len;
- u8 *ptr = skb->nh.raw + off;
+ unsigned int len;
+ u8 *ptr = nh + off;
- switch(nexthdr) {
+ switch (nexthdr) {
case IPPROTO_DSTOPTS:
nexthdr = ptr[0];
len = (ptr[1] + 1) << 3;
@@ -462,115 +696,173 @@ int datagram_recv_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb)
struct in6_pktinfo src_info;
src_info.ipi6_ifindex = opt->iif;
- ipv6_addr_copy(&src_info.ipi6_addr, &skb->nh.ipv6h->daddr);
+ src_info.ipi6_addr = ipv6_hdr(skb)->daddr;
put_cmsg(msg, SOL_IPV6, IPV6_2292PKTINFO, sizeof(src_info), &src_info);
}
if (np->rxopt.bits.rxohlim) {
- int hlim = skb->nh.ipv6h->hop_limit;
+ int hlim = ipv6_hdr(skb)->hop_limit;
put_cmsg(msg, SOL_IPV6, IPV6_2292HOPLIMIT, sizeof(hlim), &hlim);
}
- if (np->rxopt.bits.ohopopts && opt->hop) {
- u8 *ptr = skb->nh.raw + opt->hop;
+ if (np->rxopt.bits.ohopopts && (opt->flags & IP6SKB_HOPBYHOP)) {
+ u8 *ptr = nh + sizeof(struct ipv6hdr);
put_cmsg(msg, SOL_IPV6, IPV6_2292HOPOPTS, (ptr[1]+1)<<3, ptr);
}
if (np->rxopt.bits.odstopts && opt->dst0) {
- u8 *ptr = skb->nh.raw + opt->dst0;
+ u8 *ptr = nh + opt->dst0;
put_cmsg(msg, SOL_IPV6, IPV6_2292DSTOPTS, (ptr[1]+1)<<3, ptr);
}
if (np->rxopt.bits.osrcrt && opt->srcrt) {
- struct ipv6_rt_hdr *rthdr = (struct ipv6_rt_hdr *)(skb->nh.raw + opt->srcrt);
+ struct ipv6_rt_hdr *rthdr = (struct ipv6_rt_hdr *)(nh + opt->srcrt);
put_cmsg(msg, SOL_IPV6, IPV6_2292RTHDR, (rthdr->hdrlen+1) << 3, rthdr);
}
if (np->rxopt.bits.odstopts && opt->dst1) {
- u8 *ptr = skb->nh.raw + opt->dst1;
+ u8 *ptr = nh + opt->dst1;
put_cmsg(msg, SOL_IPV6, IPV6_2292DSTOPTS, (ptr[1]+1)<<3, ptr);
}
- return 0;
+ if (np->rxopt.bits.rxorigdstaddr) {
+ struct sockaddr_in6 sin6;
+ __be16 _ports[2], *ports;
+
+ ports = skb_header_pointer(skb, skb_transport_offset(skb),
+ sizeof(_ports), &_ports);
+ if (ports) {
+ /* All current transport protocols have the port numbers in the
+ * first four bytes of the transport header and this function is
+ * written with this assumption in mind.
+ */
+ sin6.sin6_family = AF_INET6;
+ sin6.sin6_addr = ipv6_hdr(skb)->daddr;
+ sin6.sin6_port = ports[1];
+ sin6.sin6_flowinfo = 0;
+ sin6.sin6_scope_id =
+ ipv6_iface_scope_id(&ipv6_hdr(skb)->daddr,
+ opt->iif);
+
+ put_cmsg(msg, SOL_IPV6, IPV6_ORIGDSTADDR, sizeof(sin6), &sin6);
+ }
+ }
+ if (np->rxopt.bits.recvfragsize && opt->frag_max_size) {
+ int val = opt->frag_max_size;
+
+ put_cmsg(msg, SOL_IPV6, IPV6_RECVFRAGSIZE, sizeof(val), &val);
+ }
+}
+
+void ip6_datagram_recv_ctl(struct sock *sk, struct msghdr *msg,
+ struct sk_buff *skb)
+{
+ ip6_datagram_recv_common_ctl(sk, msg, skb);
+ ip6_datagram_recv_specific_ctl(sk, msg, skb);
}
+EXPORT_SYMBOL_GPL(ip6_datagram_recv_ctl);
-int datagram_send_ctl(struct msghdr *msg, struct flowi *fl,
- struct ipv6_txoptions *opt,
- int *hlimit, int *tclass)
+int ip6_datagram_send_ctl(struct net *net, struct sock *sk,
+ struct msghdr *msg, struct flowi6 *fl6,
+ struct ipcm6_cookie *ipc6)
{
struct in6_pktinfo *src_info;
struct cmsghdr *cmsg;
struct ipv6_rt_hdr *rthdr;
struct ipv6_opt_hdr *hdr;
+ struct ipv6_txoptions *opt = ipc6->opt;
int len;
int err = 0;
- for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
+ for_each_cmsghdr(cmsg, msg) {
int addr_type;
- struct net_device *dev = NULL;
if (!CMSG_OK(msg, cmsg)) {
err = -EINVAL;
goto exit_f;
}
+ if (cmsg->cmsg_level == SOL_SOCKET) {
+ err = __sock_cmsg_send(sk, cmsg, &ipc6->sockc);
+ if (err)
+ return err;
+ continue;
+ }
+
if (cmsg->cmsg_level != SOL_IPV6)
continue;
switch (cmsg->cmsg_type) {
- case IPV6_PKTINFO:
- case IPV6_2292PKTINFO:
- if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct in6_pktinfo))) {
+ case IPV6_PKTINFO:
+ case IPV6_2292PKTINFO:
+ {
+ struct net_device *dev = NULL;
+ int src_idx;
+
+ if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct in6_pktinfo))) {
err = -EINVAL;
goto exit_f;
}
src_info = (struct in6_pktinfo *)CMSG_DATA(cmsg);
-
- if (src_info->ipi6_ifindex) {
- if (fl->oif && src_info->ipi6_ifindex != fl->oif)
+ src_idx = src_info->ipi6_ifindex;
+
+ if (src_idx) {
+ if (fl6->flowi6_oif &&
+ src_idx != fl6->flowi6_oif &&
+ (READ_ONCE(sk->sk_bound_dev_if) != fl6->flowi6_oif ||
+ !sk_dev_equal_l3scope(sk, src_idx)))
return -EINVAL;
- fl->oif = src_info->ipi6_ifindex;
+ fl6->flowi6_oif = src_idx;
}
- addr_type = ipv6_addr_type(&src_info->ipi6_addr);
+ addr_type = __ipv6_addr_type(&src_info->ipi6_addr);
- if (addr_type == IPV6_ADDR_ANY)
- break;
-
- if (addr_type & IPV6_ADDR_LINKLOCAL) {
- if (!src_info->ipi6_ifindex)
- return -EINVAL;
- else {
- dev = dev_get_by_index(src_info->ipi6_ifindex);
- if (!dev)
- return -ENODEV;
+ rcu_read_lock();
+ if (fl6->flowi6_oif) {
+ dev = dev_get_by_index_rcu(net, fl6->flowi6_oif);
+ if (!dev) {
+ rcu_read_unlock();
+ return -ENODEV;
}
+ } else if (addr_type & IPV6_ADDR_LINKLOCAL) {
+ rcu_read_unlock();
+ return -EINVAL;
}
- if (!ipv6_chk_addr(&src_info->ipi6_addr, dev, 0)) {
- if (dev)
- dev_put(dev);
- err = -EINVAL;
- goto exit_f;
+
+ if (addr_type != IPV6_ADDR_ANY) {
+ int strict = __ipv6_addr_src_scope(addr_type) <= IPV6_ADDR_SCOPE_LINKLOCAL;
+ if (!ipv6_can_nonlocal_bind(net, inet_sk(sk)) &&
+ !ipv6_chk_addr_and_flags(net, &src_info->ipi6_addr,
+ dev, !strict, 0,
+ IFA_F_TENTATIVE) &&
+ !ipv6_chk_acast_addr_src(net, dev,
+ &src_info->ipi6_addr))
+ err = -EINVAL;
+ else
+ fl6->saddr = src_info->ipi6_addr;
}
- if (dev)
- dev_put(dev);
- ipv6_addr_copy(&fl->fl6_src, &src_info->ipi6_addr);
+ rcu_read_unlock();
+
+ if (err)
+ goto exit_f;
+
break;
+ }
case IPV6_FLOWINFO:
- if (cmsg->cmsg_len < CMSG_LEN(4)) {
+ if (cmsg->cmsg_len < CMSG_LEN(4)) {
err = -EINVAL;
goto exit_f;
}
- if (fl->fl6_flowlabel&IPV6_FLOWINFO_MASK) {
- if ((fl->fl6_flowlabel^*(__be32 *)CMSG_DATA(cmsg))&~IPV6_FLOWINFO_MASK) {
+ if (fl6->flowlabel&IPV6_FLOWINFO_MASK) {
+ if ((fl6->flowlabel^*(__be32 *)CMSG_DATA(cmsg))&~IPV6_FLOWINFO_MASK) {
err = -EINVAL;
goto exit_f;
}
}
- fl->fl6_flowlabel = IPV6_FLOWINFO_MASK & *(__be32 *)CMSG_DATA(cmsg);
+ fl6->flowlabel = IPV6_FLOWINFO_MASK & *(__be32 *)CMSG_DATA(cmsg);
break;
case IPV6_2292HOPOPTS:
case IPV6_HOPOPTS:
- if (opt->hopopt || cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_opt_hdr))) {
+ if (opt->hopopt || cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_opt_hdr))) {
err = -EINVAL;
goto exit_f;
}
@@ -581,7 +873,7 @@ int datagram_send_ctl(struct msghdr *msg, struct flowi *fl,
err = -EINVAL;
goto exit_f;
}
- if (!capable(CAP_NET_RAW)) {
+ if (!ns_capable(net->user_ns, CAP_NET_RAW)) {
err = -EPERM;
goto exit_f;
}
@@ -590,7 +882,7 @@ int datagram_send_ctl(struct msghdr *msg, struct flowi *fl,
break;
case IPV6_2292DSTOPTS:
- if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_opt_hdr))) {
+ if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_opt_hdr))) {
err = -EINVAL;
goto exit_f;
}
@@ -601,7 +893,7 @@ int datagram_send_ctl(struct msghdr *msg, struct flowi *fl,
err = -EINVAL;
goto exit_f;
}
- if (!capable(CAP_NET_RAW)) {
+ if (!ns_capable(net->user_ns, CAP_NET_RAW)) {
err = -EPERM;
goto exit_f;
}
@@ -626,7 +918,7 @@ int datagram_send_ctl(struct msghdr *msg, struct flowi *fl,
err = -EINVAL;
goto exit_f;
}
- if (!capable(CAP_NET_RAW)) {
+ if (!ns_capable(net->user_ns, CAP_NET_RAW)) {
err = -EPERM;
goto exit_f;
}
@@ -641,7 +933,7 @@ int datagram_send_ctl(struct msghdr *msg, struct flowi *fl,
case IPV6_2292RTHDR:
case IPV6_RTHDR:
- if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_rt_hdr))) {
+ if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_rt_hdr))) {
err = -EINVAL;
goto exit_f;
}
@@ -649,11 +941,15 @@ int datagram_send_ctl(struct msghdr *msg, struct flowi *fl,
rthdr = (struct ipv6_rt_hdr *)CMSG_DATA(cmsg);
switch (rthdr->type) {
- case IPV6_SRCRT_TYPE_0:
-#ifdef CONFIG_IPV6_MIP6
+#if IS_ENABLED(CONFIG_IPV6_MIP6)
case IPV6_SRCRT_TYPE_2:
-#endif
+ if (rthdr->hdrlen != 2 ||
+ rthdr->segments_left != 1) {
+ err = -EINVAL;
+ goto exit_f;
+ }
break;
+#endif
default:
err = -EINVAL;
goto exit_f;
@@ -661,7 +957,7 @@ int datagram_send_ctl(struct msghdr *msg, struct flowi *fl,
len = ((rthdr->hdrlen + 1) << 3);
- if (cmsg->cmsg_len < CMSG_LEN(len)) {
+ if (cmsg->cmsg_len < CMSG_LEN(len)) {
err = -EINVAL;
goto exit_f;
}
@@ -693,7 +989,12 @@ int datagram_send_ctl(struct msghdr *msg, struct flowi *fl,
goto exit_f;
}
- *hlimit = *(int *)CMSG_DATA(cmsg);
+ ipc6->hlimit = *(int *)CMSG_DATA(cmsg);
+ if (ipc6->hlimit < -1 || ipc6->hlimit > 0xff) {
+ err = -EINVAL;
+ goto exit_f;
+ }
+
break;
case IPV6_TCLASS:
@@ -701,27 +1002,71 @@ int datagram_send_ctl(struct msghdr *msg, struct flowi *fl,
int tc;
err = -EINVAL;
- if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) {
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(int)))
goto exit_f;
- }
tc = *(int *)CMSG_DATA(cmsg);
if (tc < -1 || tc > 0xff)
goto exit_f;
err = 0;
- *tclass = tc;
+ ipc6->tclass = tc;
break;
}
- default:
- LIMIT_NETDEBUG(KERN_DEBUG "invalid cmsg type: %d\n",
- cmsg->cmsg_type);
+
+ case IPV6_DONTFRAG:
+ {
+ int df;
+
err = -EINVAL;
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(int)))
+ goto exit_f;
+
+ df = *(int *)CMSG_DATA(cmsg);
+ if (df < 0 || df > 1)
+ goto exit_f;
+
+ err = 0;
+ ipc6->dontfrag = df;
+
break;
- };
+ }
+ default:
+ net_dbg_ratelimited("invalid cmsg type: %d\n",
+ cmsg->cmsg_type);
+ err = -EINVAL;
+ goto exit_f;
+ }
}
exit_f:
return err;
}
+EXPORT_SYMBOL_GPL(ip6_datagram_send_ctl);
+
+void __ip6_dgram_sock_seq_show(struct seq_file *seq, struct sock *sp,
+ __u16 srcp, __u16 destp, int rqueue, int bucket)
+{
+ const struct in6_addr *dest, *src;
+
+ dest = &sp->sk_v6_daddr;
+ src = &sp->sk_v6_rcv_saddr;
+ seq_printf(seq,
+ "%5d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X "
+ "%02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %u\n",
+ bucket,
+ src->s6_addr32[0], src->s6_addr32[1],
+ src->s6_addr32[2], src->s6_addr32[3], srcp,
+ dest->s6_addr32[0], dest->s6_addr32[1],
+ dest->s6_addr32[2], dest->s6_addr32[3], destp,
+ sp->sk_state,
+ sk_wmem_alloc_get(sp),
+ rqueue,
+ 0, 0L, 0,
+ from_kuid_munged(seq_user_ns(seq), sk_uid(sp)),
+ 0,
+ sock_i_ino(sp),
+ refcount_read(&sp->sk_refcnt), sp,
+ sk_drops_read(sp));
+}
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index 25dcf69cd807..e75da98f5283 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -1,407 +1,1240 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright (C)2002 USAGI/WIDE Project
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
* Authors
*
- * Mitsuru KANDA @USAGI : IPv6 Support
- * Kazunori MIYAZAWA @USAGI :
- * Kunihiro Ishiguro <kunihiro@ipinfusion.com>
- *
- * This file is derived from net/ipv4/esp.c
+ * Mitsuru KANDA @USAGI : IPv6 Support
+ * Kazunori MIYAZAWA @USAGI :
+ * Kunihiro Ishiguro <kunihiro@ipinfusion.com>
+ *
+ * This file is derived from net/ipv4/esp.c
*/
+#define pr_fmt(fmt) "IPv6: " fmt
+
+#include <crypto/aead.h>
+#include <crypto/authenc.h>
#include <linux/err.h>
#include <linux/module.h>
#include <net/ip.h>
#include <net/xfrm.h>
#include <net/esp.h>
-#include <asm/scatterlist.h>
-#include <linux/crypto.h>
+#include <linux/scatterlist.h>
#include <linux/kernel.h>
#include <linux/pfkeyv2.h>
#include <linux/random.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <net/ip6_checksum.h>
+#include <net/ip6_route.h>
#include <net/icmp.h>
#include <net/ipv6.h>
#include <net/protocol.h>
+#include <net/udp.h>
#include <linux/icmpv6.h>
+#include <net/tcp.h>
+#include <net/espintcp.h>
+#include <net/inet6_hashtables.h>
+#include <linux/skbuff_ref.h>
-static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
+#include <linux/highmem.h>
+
+struct esp_skb_cb {
+ struct xfrm_skb_cb xfrm;
+ void *tmp;
+};
+
+struct esp_output_extra {
+ __be32 seqhi;
+ u32 esphoff;
+};
+
+#define ESP_SKB_CB(__skb) ((struct esp_skb_cb *)&((__skb)->cb[0]))
+
+/*
+ * Allocate an AEAD request structure with extra space for SG and IV.
+ *
+ * For alignment considerations the upper 32 bits of the sequence number are
+ * placed at the front, if present. Followed by the IV, the request and finally
+ * the SG list.
+ *
+ * TODO: Use spare space in skb for this where possible.
+ */
+static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags, int seqihlen)
+{
+ unsigned int len;
+
+ len = seqihlen;
+
+ len += crypto_aead_ivsize(aead);
+
+ if (len) {
+ len += crypto_aead_alignmask(aead) &
+ ~(crypto_tfm_ctx_alignment() - 1);
+ len = ALIGN(len, crypto_tfm_ctx_alignment());
+ }
+
+ len += sizeof(struct aead_request) + crypto_aead_reqsize(aead);
+ len = ALIGN(len, __alignof__(struct scatterlist));
+
+ len += sizeof(struct scatterlist) * nfrags;
+
+ return kmalloc(len, GFP_ATOMIC);
+}
+
+static inline void *esp_tmp_extra(void *tmp)
{
+ return PTR_ALIGN(tmp, __alignof__(struct esp_output_extra));
+}
+
+static inline u8 *esp_tmp_iv(struct crypto_aead *aead, void *tmp, int seqhilen)
+{
+ return crypto_aead_ivsize(aead) ?
+ PTR_ALIGN((u8 *)tmp + seqhilen,
+ crypto_aead_alignmask(aead) + 1) : tmp + seqhilen;
+}
+
+static inline struct aead_request *esp_tmp_req(struct crypto_aead *aead, u8 *iv)
+{
+ struct aead_request *req;
+
+ req = (void *)PTR_ALIGN(iv + crypto_aead_ivsize(aead),
+ crypto_tfm_ctx_alignment());
+ aead_request_set_tfm(req, aead);
+ return req;
+}
+
+static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead,
+ struct aead_request *req)
+{
+ return (void *)ALIGN((unsigned long)(req + 1) +
+ crypto_aead_reqsize(aead),
+ __alignof__(struct scatterlist));
+}
+
+static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb)
+{
+ struct crypto_aead *aead = x->data;
+ int extralen = 0;
+ u8 *iv;
+ struct aead_request *req;
+ struct scatterlist *sg;
+
+ if (x->props.flags & XFRM_STATE_ESN)
+ extralen += sizeof(struct esp_output_extra);
+
+ iv = esp_tmp_iv(aead, tmp, extralen);
+ req = esp_tmp_req(aead, iv);
+
+ /* Unref skb_frag_pages in the src scatterlist if necessary.
+ * Skip the first sg which comes from skb->data.
+ */
+ if (req->src != req->dst)
+ for (sg = sg_next(req->src); sg; sg = sg_next(sg))
+ skb_page_unref(page_to_netmem(sg_page(sg)),
+ skb->pp_recycle);
+}
+
+#ifdef CONFIG_INET6_ESPINTCP
+static struct sock *esp6_find_tcp_sk(struct xfrm_state *x)
+{
+ struct xfrm_encap_tmpl *encap = x->encap;
+ struct net *net = xs_net(x);
+ __be16 sport, dport;
+ struct sock *sk;
+
+ spin_lock_bh(&x->lock);
+ sport = encap->encap_sport;
+ dport = encap->encap_dport;
+ spin_unlock_bh(&x->lock);
+
+ sk = __inet6_lookup_established(net, &x->id.daddr.in6, dport,
+ &x->props.saddr.in6, ntohs(sport), 0, 0);
+ if (!sk)
+ return ERR_PTR(-ENOENT);
+
+ if (!tcp_is_ulp_esp(sk)) {
+ sock_put(sk);
+ return ERR_PTR(-EINVAL);
+ }
+
+ return sk;
+}
+
+static int esp_output_tcp_finish(struct xfrm_state *x, struct sk_buff *skb)
+{
+ struct sock *sk;
int err;
- int hdr_len;
- struct ipv6hdr *top_iph;
- struct ipv6_esp_hdr *esph;
- struct crypto_blkcipher *tfm;
- struct blkcipher_desc desc;
- struct esp_data *esp;
- struct sk_buff *trailer;
- int blksize;
- int clen;
- int alen;
- int nfrags;
- esp = x->data;
- hdr_len = skb->h.raw - skb->data +
- sizeof(*esph) + esp->conf.ivlen;
+ rcu_read_lock();
- /* Strip IP+ESP header. */
- __skb_pull(skb, hdr_len);
+ sk = esp6_find_tcp_sk(x);
+ err = PTR_ERR_OR_ZERO(sk);
+ if (err) {
+ kfree_skb(skb);
+ goto out;
+ }
- /* Now skb is pure payload to encrypt */
- err = -ENOMEM;
+ bh_lock_sock(sk);
+ if (sock_owned_by_user(sk))
+ err = espintcp_queue_out(sk, skb);
+ else
+ err = espintcp_push_skb(sk, skb);
+ bh_unlock_sock(sk);
- /* Round to block size */
- clen = skb->len;
+ sock_put(sk);
- alen = esp->auth.icv_trunc_len;
- tfm = esp->conf.tfm;
- desc.tfm = tfm;
- desc.flags = 0;
- blksize = ALIGN(crypto_blkcipher_blocksize(tfm), 4);
- clen = ALIGN(clen + 2, blksize);
- if (esp->conf.padlen)
- clen = ALIGN(clen, esp->conf.padlen);
+out:
+ rcu_read_unlock();
+ return err;
+}
- if ((nfrags = skb_cow_data(skb, clen-skb->len+alen, &trailer)) < 0) {
- goto error;
+static int esp_output_tcp_encap_cb(struct net *net, struct sock *sk,
+ struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ struct xfrm_state *x = dst->xfrm;
+
+ return esp_output_tcp_finish(x, skb);
+}
+
+static int esp_output_tail_tcp(struct xfrm_state *x, struct sk_buff *skb)
+{
+ int err;
+
+ local_bh_disable();
+ err = xfrm_trans_queue_net(xs_net(x), skb, esp_output_tcp_encap_cb);
+ local_bh_enable();
+
+ /* EINPROGRESS just happens to do the right thing. It
+ * actually means that the skb has been consumed and
+ * isn't coming back.
+ */
+ return err ?: -EINPROGRESS;
+}
+#else
+static int esp_output_tail_tcp(struct xfrm_state *x, struct sk_buff *skb)
+{
+ WARN_ON(1);
+ return -EOPNOTSUPP;
+}
+#endif
+
+static void esp_output_encap_csum(struct sk_buff *skb)
+{
+ /* UDP encap with IPv6 requires a valid checksum */
+ if (*skb_mac_header(skb) == IPPROTO_UDP) {
+ struct udphdr *uh = udp_hdr(skb);
+ struct ipv6hdr *ip6h = ipv6_hdr(skb);
+ int len = ntohs(uh->len);
+ unsigned int offset = skb_transport_offset(skb);
+ __wsum csum = skb_checksum(skb, offset, skb->len - offset, 0);
+
+ uh->check = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
+ len, IPPROTO_UDP, csum);
+ if (uh->check == 0)
+ uh->check = CSUM_MANGLED_0;
}
+}
- /* Fill padding... */
- do {
- int i;
- for (i=0; i<clen-skb->len - 2; i++)
- *(u8*)(trailer->tail + i) = i+1;
- } while (0);
- *(u8*)(trailer->tail + clen-skb->len - 2) = (clen - skb->len)-2;
- pskb_put(skb, trailer, clen - skb->len);
+static void esp_output_done(void *data, int err)
+{
+ struct sk_buff *skb = data;
+ struct xfrm_offload *xo = xfrm_offload(skb);
+ void *tmp;
+ struct xfrm_state *x;
- top_iph = (struct ipv6hdr *)__skb_push(skb, hdr_len);
- esph = (struct ipv6_esp_hdr *)skb->h.raw;
- top_iph->payload_len = htons(skb->len + alen - sizeof(*top_iph));
- *(u8*)(trailer->tail - 1) = *skb->nh.raw;
- *skb->nh.raw = IPPROTO_ESP;
+ if (xo && (xo->flags & XFRM_DEV_RESUME)) {
+ struct sec_path *sp = skb_sec_path(skb);
- esph->spi = x->id.spi;
- esph->seq_no = htonl(++x->replay.oseq);
- xfrm_aevent_doreplay(x);
+ x = sp->xvec[sp->len - 1];
+ } else {
+ x = skb_dst(skb)->xfrm;
+ }
+
+ tmp = ESP_SKB_CB(skb)->tmp;
+ esp_ssg_unref(x, tmp, skb);
+ kfree(tmp);
- if (esp->conf.ivlen) {
- if (unlikely(!esp->conf.ivinitted)) {
- get_random_bytes(esp->conf.ivec, esp->conf.ivlen);
- esp->conf.ivinitted = 1;
+ esp_output_encap_csum(skb);
+
+ if (xo && (xo->flags & XFRM_DEV_RESUME)) {
+ if (err) {
+ XFRM_INC_STATS(xs_net(x), LINUX_MIB_XFRMOUTSTATEPROTOERROR);
+ kfree_skb(skb);
+ return;
}
- crypto_blkcipher_set_iv(tfm, esp->conf.ivec, esp->conf.ivlen);
+
+ skb_push(skb, skb->data - skb_mac_header(skb));
+ secpath_reset(skb);
+ xfrm_dev_resume(skb);
+ } else {
+ if (!err &&
+ x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP)
+ esp_output_tail_tcp(x, skb);
+ else
+ xfrm_output_resume(skb_to_full_sk(skb), skb, err);
}
+}
+
+/* Move ESP header back into place. */
+static void esp_restore_header(struct sk_buff *skb, unsigned int offset)
+{
+ struct ip_esp_hdr *esph = (void *)(skb->data + offset);
+ void *tmp = ESP_SKB_CB(skb)->tmp;
+ __be32 *seqhi = esp_tmp_extra(tmp);
- do {
- struct scatterlist *sg = &esp->sgbuf[0];
+ esph->seq_no = esph->spi;
+ esph->spi = *seqhi;
+}
+
+static void esp_output_restore_header(struct sk_buff *skb)
+{
+ void *tmp = ESP_SKB_CB(skb)->tmp;
+ struct esp_output_extra *extra = esp_tmp_extra(tmp);
+
+ esp_restore_header(skb, skb_transport_offset(skb) + extra->esphoff -
+ sizeof(__be32));
+}
- if (unlikely(nfrags > ESP_NUM_FAST_SG)) {
- sg = kmalloc(sizeof(struct scatterlist)*nfrags, GFP_ATOMIC);
- if (!sg)
- goto error;
+static struct ip_esp_hdr *esp_output_set_esn(struct sk_buff *skb,
+ struct xfrm_state *x,
+ struct ip_esp_hdr *esph,
+ struct esp_output_extra *extra)
+{
+ /* For ESN we move the header forward by 4 bytes to
+ * accommodate the high bits. We will move it back after
+ * encryption.
+ */
+ if ((x->props.flags & XFRM_STATE_ESN)) {
+ __u32 seqhi;
+ struct xfrm_offload *xo = xfrm_offload(skb);
+
+ if (xo)
+ seqhi = xo->seq.hi;
+ else
+ seqhi = XFRM_SKB_CB(skb)->seq.output.hi;
+
+ extra->esphoff = (unsigned char *)esph -
+ skb_transport_header(skb);
+ esph = (struct ip_esp_hdr *)((unsigned char *)esph - 4);
+ extra->seqhi = esph->spi;
+ esph->seq_no = htonl(seqhi);
+ }
+
+ esph->spi = x->id.spi;
+
+ return esph;
+}
+
+static void esp_output_done_esn(void *data, int err)
+{
+ struct sk_buff *skb = data;
+
+ esp_output_restore_header(skb);
+ esp_output_done(data, err);
+}
+
+static struct ip_esp_hdr *esp6_output_udp_encap(struct sk_buff *skb,
+ int encap_type,
+ struct esp_info *esp,
+ __be16 sport,
+ __be16 dport)
+{
+ struct udphdr *uh;
+ unsigned int len;
+
+ len = skb->len + esp->tailen - skb_transport_offset(skb);
+ if (len > U16_MAX)
+ return ERR_PTR(-EMSGSIZE);
+
+ uh = (struct udphdr *)esp->esph;
+ uh->source = sport;
+ uh->dest = dport;
+ uh->len = htons(len);
+ uh->check = 0;
+
+ *skb_mac_header(skb) = IPPROTO_UDP;
+
+ return (struct ip_esp_hdr *)(uh + 1);
+}
+
+#ifdef CONFIG_INET6_ESPINTCP
+static struct ip_esp_hdr *esp6_output_tcp_encap(struct xfrm_state *x,
+ struct sk_buff *skb,
+ struct esp_info *esp)
+{
+ __be16 *lenp = (void *)esp->esph;
+ struct ip_esp_hdr *esph;
+ unsigned int len;
+ struct sock *sk;
+
+ len = skb->len + esp->tailen - skb_transport_offset(skb);
+ if (len > IP_MAX_MTU)
+ return ERR_PTR(-EMSGSIZE);
+
+ rcu_read_lock();
+ sk = esp6_find_tcp_sk(x);
+ rcu_read_unlock();
+
+ if (IS_ERR(sk))
+ return ERR_CAST(sk);
+
+ sock_put(sk);
+
+ *lenp = htons(len);
+ esph = (struct ip_esp_hdr *)(lenp + 1);
+
+ return esph;
+}
+#else
+static struct ip_esp_hdr *esp6_output_tcp_encap(struct xfrm_state *x,
+ struct sk_buff *skb,
+ struct esp_info *esp)
+{
+ return ERR_PTR(-EOPNOTSUPP);
+}
+#endif
+
+static int esp6_output_encap(struct xfrm_state *x, struct sk_buff *skb,
+ struct esp_info *esp)
+{
+ struct xfrm_encap_tmpl *encap = x->encap;
+ struct ip_esp_hdr *esph;
+ __be16 sport, dport;
+ int encap_type;
+
+ spin_lock_bh(&x->lock);
+ sport = encap->encap_sport;
+ dport = encap->encap_dport;
+ encap_type = encap->encap_type;
+ spin_unlock_bh(&x->lock);
+
+ switch (encap_type) {
+ default:
+ case UDP_ENCAP_ESPINUDP:
+ esph = esp6_output_udp_encap(skb, encap_type, esp, sport, dport);
+ break;
+ case TCP_ENCAP_ESPINTCP:
+ esph = esp6_output_tcp_encap(x, skb, esp);
+ break;
+ }
+
+ if (IS_ERR(esph))
+ return PTR_ERR(esph);
+
+ esp->esph = esph;
+
+ return 0;
+}
+
+int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp)
+{
+ u8 *tail;
+ int nfrags;
+ int esph_offset;
+ struct page *page;
+ struct sk_buff *trailer;
+ int tailen = esp->tailen;
+
+ if (x->encap) {
+ int err = esp6_output_encap(x, skb, esp);
+
+ if (err < 0)
+ return err;
+ }
+
+ if (ALIGN(tailen, L1_CACHE_BYTES) > PAGE_SIZE ||
+ ALIGN(skb->data_len, L1_CACHE_BYTES) > PAGE_SIZE)
+ goto cow;
+
+ if (!skb_cloned(skb)) {
+ if (tailen <= skb_tailroom(skb)) {
+ nfrags = 1;
+ trailer = skb;
+ tail = skb_tail_pointer(trailer);
+
+ goto skip_cow;
+ } else if ((skb_shinfo(skb)->nr_frags < MAX_SKB_FRAGS)
+ && !skb_has_frag_list(skb)) {
+ int allocsize;
+ struct sock *sk = skb->sk;
+ struct page_frag *pfrag = &x->xfrag;
+
+ esp->inplace = false;
+
+ allocsize = ALIGN(tailen, L1_CACHE_BYTES);
+
+ spin_lock_bh(&x->lock);
+
+ if (unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) {
+ spin_unlock_bh(&x->lock);
+ goto cow;
+ }
+
+ page = pfrag->page;
+ get_page(page);
+
+ tail = page_address(page) + pfrag->offset;
+
+ esp_output_fill_trailer(tail, esp->tfclen, esp->plen, esp->proto);
+
+ nfrags = skb_shinfo(skb)->nr_frags;
+
+ __skb_fill_page_desc(skb, nfrags, page, pfrag->offset,
+ tailen);
+ skb_shinfo(skb)->nr_frags = ++nfrags;
+
+ pfrag->offset = pfrag->offset + allocsize;
+
+ spin_unlock_bh(&x->lock);
+
+ nfrags++;
+
+ skb->len += tailen;
+ skb->data_len += tailen;
+ skb->truesize += tailen;
+ if (sk && sk_fullsock(sk))
+ refcount_add(tailen, &sk->sk_wmem_alloc);
+
+ goto out;
}
- skb_to_sgvec(skb, sg, esph->enc_data+esp->conf.ivlen-skb->data, clen);
- err = crypto_blkcipher_encrypt(&desc, sg, sg, clen);
- if (unlikely(sg != &esp->sgbuf[0]))
- kfree(sg);
- } while (0);
+ }
- if (unlikely(err))
+cow:
+ esph_offset = (unsigned char *)esp->esph - skb_transport_header(skb);
+
+ nfrags = skb_cow_data(skb, tailen, &trailer);
+ if (nfrags < 0)
+ goto out;
+ tail = skb_tail_pointer(trailer);
+ esp->esph = (struct ip_esp_hdr *)(skb_transport_header(skb) + esph_offset);
+
+skip_cow:
+ esp_output_fill_trailer(tail, esp->tfclen, esp->plen, esp->proto);
+ pskb_put(skb, trailer, tailen);
+
+out:
+ return nfrags;
+}
+EXPORT_SYMBOL_GPL(esp6_output_head);
+
+int esp6_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp)
+{
+ u8 *iv;
+ int alen;
+ void *tmp;
+ int ivlen;
+ int assoclen;
+ int extralen;
+ struct page *page;
+ struct ip_esp_hdr *esph;
+ struct aead_request *req;
+ struct crypto_aead *aead;
+ struct scatterlist *sg, *dsg;
+ struct esp_output_extra *extra;
+ int err = -ENOMEM;
+
+ assoclen = sizeof(struct ip_esp_hdr);
+ extralen = 0;
+
+ if (x->props.flags & XFRM_STATE_ESN) {
+ extralen += sizeof(*extra);
+ assoclen += sizeof(__be32);
+ }
+
+ aead = x->data;
+ alen = crypto_aead_authsize(aead);
+ ivlen = crypto_aead_ivsize(aead);
+
+ tmp = esp_alloc_tmp(aead, esp->nfrags + 2, extralen);
+ if (!tmp)
goto error;
- if (esp->conf.ivlen) {
- memcpy(esph->enc_data, esp->conf.ivec, esp->conf.ivlen);
- crypto_blkcipher_get_iv(tfm, esp->conf.ivec, esp->conf.ivlen);
+ extra = esp_tmp_extra(tmp);
+ iv = esp_tmp_iv(aead, tmp, extralen);
+ req = esp_tmp_req(aead, iv);
+ sg = esp_req_sg(aead, req);
+
+ if (esp->inplace)
+ dsg = sg;
+ else
+ dsg = &sg[esp->nfrags];
+
+ esph = esp_output_set_esn(skb, x, esp->esph, extra);
+ esp->esph = esph;
+
+ sg_init_table(sg, esp->nfrags);
+ err = skb_to_sgvec(skb, sg,
+ (unsigned char *)esph - skb->data,
+ assoclen + ivlen + esp->clen + alen);
+ if (unlikely(err < 0))
+ goto error_free;
+
+ if (!esp->inplace) {
+ int allocsize;
+ struct page_frag *pfrag = &x->xfrag;
+
+ allocsize = ALIGN(skb->data_len, L1_CACHE_BYTES);
+
+ spin_lock_bh(&x->lock);
+ if (unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) {
+ spin_unlock_bh(&x->lock);
+ goto error_free;
+ }
+
+ skb_shinfo(skb)->nr_frags = 1;
+
+ page = pfrag->page;
+ get_page(page);
+ /* replace page frags in skb with new page */
+ __skb_fill_page_desc(skb, 0, page, pfrag->offset, skb->data_len);
+ pfrag->offset = pfrag->offset + allocsize;
+ spin_unlock_bh(&x->lock);
+
+ sg_init_table(dsg, skb_shinfo(skb)->nr_frags + 1);
+ err = skb_to_sgvec(skb, dsg,
+ (unsigned char *)esph - skb->data,
+ assoclen + ivlen + esp->clen + alen);
+ if (unlikely(err < 0))
+ goto error_free;
}
- if (esp->auth.icv_full_len) {
- err = esp_mac_digest(esp, skb, (u8 *)esph - skb->data,
- sizeof(*esph) + esp->conf.ivlen + clen);
- memcpy(pskb_put(skb, trailer, alen), esp->auth.work_icv, alen);
+ if ((x->props.flags & XFRM_STATE_ESN))
+ aead_request_set_callback(req, 0, esp_output_done_esn, skb);
+ else
+ aead_request_set_callback(req, 0, esp_output_done, skb);
+
+ aead_request_set_crypt(req, sg, dsg, ivlen + esp->clen, iv);
+ aead_request_set_ad(req, assoclen);
+
+ memset(iv, 0, ivlen);
+ memcpy(iv + ivlen - min(ivlen, 8), (u8 *)&esp->seqno + 8 - min(ivlen, 8),
+ min(ivlen, 8));
+
+ ESP_SKB_CB(skb)->tmp = tmp;
+ err = crypto_aead_encrypt(req);
+
+ switch (err) {
+ case -EINPROGRESS:
+ goto error;
+
+ case -ENOSPC:
+ err = NET_XMIT_DROP;
+ break;
+
+ case 0:
+ if ((x->props.flags & XFRM_STATE_ESN))
+ esp_output_restore_header(skb);
+ esp_output_encap_csum(skb);
}
+ if (sg != dsg)
+ esp_ssg_unref(x, tmp, skb);
+
+ if (!err && x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP)
+ err = esp_output_tail_tcp(x, skb);
+
+error_free:
+ kfree(tmp);
error:
return err;
}
+EXPORT_SYMBOL_GPL(esp6_output_tail);
+
+static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+ int alen;
+ int blksize;
+ struct ip_esp_hdr *esph;
+ struct crypto_aead *aead;
+ struct esp_info esp;
+
+ esp.inplace = true;
+
+ esp.proto = *skb_mac_header(skb);
+ *skb_mac_header(skb) = IPPROTO_ESP;
+
+ /* skb is pure payload to encrypt */
+
+ aead = x->data;
+ alen = crypto_aead_authsize(aead);
+
+ esp.tfclen = 0;
+ if (x->tfcpad) {
+ struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb);
+ u32 padto;
+
+ padto = min(x->tfcpad, xfrm_state_mtu(x, dst->child_mtu_cached));
+ if (skb->len < padto)
+ esp.tfclen = padto - skb->len;
+ }
+ blksize = ALIGN(crypto_aead_blocksize(aead), 4);
+ esp.clen = ALIGN(skb->len + 2 + esp.tfclen, blksize);
+ esp.plen = esp.clen - skb->len - esp.tfclen;
+ esp.tailen = esp.tfclen + esp.plen + alen;
+
+ esp.esph = ip_esp_hdr(skb);
+
+ esp.nfrags = esp6_output_head(x, skb, &esp);
+ if (esp.nfrags < 0)
+ return esp.nfrags;
+
+ esph = esp.esph;
+ esph->spi = x->id.spi;
+
+ esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
+ esp.seqno = cpu_to_be64(XFRM_SKB_CB(skb)->seq.output.low +
+ ((u64)XFRM_SKB_CB(skb)->seq.output.hi << 32));
+
+ skb_push(skb, -skb_network_offset(skb));
+
+ return esp6_output_tail(x, skb, &esp);
+}
+
+static inline int esp_remove_trailer(struct sk_buff *skb)
+{
+ struct xfrm_state *x = xfrm_input_state(skb);
+ struct crypto_aead *aead = x->data;
+ int alen, hlen, elen;
+ int padlen, trimlen;
+ __wsum csumdiff;
+ u8 nexthdr[2];
+ int ret;
+
+ alen = crypto_aead_authsize(aead);
+ hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead);
+ elen = skb->len - hlen;
+
+ ret = skb_copy_bits(skb, skb->len - alen - 2, nexthdr, 2);
+ BUG_ON(ret);
+
+ ret = -EINVAL;
+ padlen = nexthdr[0];
+ if (padlen + 2 + alen >= elen) {
+ net_dbg_ratelimited("ipsec esp packet is garbage padlen=%d, elen=%d\n",
+ padlen + 2, elen - alen);
+ goto out;
+ }
+
+ trimlen = alen + padlen + 2;
+ if (skb->ip_summed == CHECKSUM_COMPLETE) {
+ csumdiff = skb_checksum(skb, skb->len - trimlen, trimlen, 0);
+ skb->csum = csum_block_sub(skb->csum, csumdiff,
+ skb->len - trimlen);
+ }
+ ret = pskb_trim(skb, skb->len - trimlen);
+ if (unlikely(ret))
+ return ret;
+
+ ret = nexthdr[1];
+
+out:
+ return ret;
+}
+
+int esp6_input_done2(struct sk_buff *skb, int err)
+{
+ struct xfrm_state *x = xfrm_input_state(skb);
+ struct xfrm_offload *xo = xfrm_offload(skb);
+ struct crypto_aead *aead = x->data;
+ int hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead);
+ int hdr_len = skb_network_header_len(skb);
+
+ if (!xo || !(xo->flags & CRYPTO_DONE))
+ kfree(ESP_SKB_CB(skb)->tmp);
+
+ if (unlikely(err))
+ goto out;
+
+ err = esp_remove_trailer(skb);
+ if (unlikely(err < 0))
+ goto out;
+
+ if (x->encap) {
+ const struct ipv6hdr *ip6h = ipv6_hdr(skb);
+ int offset = skb_network_offset(skb) + sizeof(*ip6h);
+ struct xfrm_encap_tmpl *encap = x->encap;
+ u8 nexthdr = ip6h->nexthdr;
+ __be16 frag_off, source;
+ struct udphdr *uh;
+ struct tcphdr *th;
+
+ offset = ipv6_skip_exthdr(skb, offset, &nexthdr, &frag_off);
+ if (offset == -1) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ uh = (void *)(skb->data + offset);
+ th = (void *)(skb->data + offset);
+ hdr_len += offset;
+
+ switch (x->encap->encap_type) {
+ case TCP_ENCAP_ESPINTCP:
+ source = th->source;
+ break;
+ case UDP_ENCAP_ESPINUDP:
+ source = uh->source;
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ err = -EINVAL;
+ goto out;
+ }
+
+ /*
+ * 1) if the NAT-T peer's IP or port changed then
+ * advertise the change to the keying daemon.
+ * This is an inbound SA, so just compare
+ * SRC ports.
+ */
+ if (!ipv6_addr_equal(&ip6h->saddr, &x->props.saddr.in6) ||
+ source != encap->encap_sport) {
+ xfrm_address_t ipaddr;
+
+ memcpy(&ipaddr.a6, &ip6h->saddr.s6_addr, sizeof(ipaddr.a6));
+ km_new_mapping(x, &ipaddr, source);
+
+ /* XXX: perhaps add an extra
+ * policy check here, to see
+ * if we should allow or
+ * reject a packet from a
+ * different source
+ * address/port.
+ */
+ }
+
+ /*
+ * 2) ignore UDP/TCP checksums in case
+ * of NAT-T in Transport Mode, or
+ * perform other post-processing fixes
+ * as per draft-ietf-ipsec-udp-encaps-06,
+ * section 3.1.2
+ */
+ if (x->props.mode == XFRM_MODE_TRANSPORT)
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ }
+
+ skb_postpull_rcsum(skb, skb_network_header(skb),
+ skb_network_header_len(skb));
+ skb_pull_rcsum(skb, hlen);
+ if (x->props.mode == XFRM_MODE_TUNNEL ||
+ x->props.mode == XFRM_MODE_IPTFS)
+ skb_reset_transport_header(skb);
+ else
+ skb_set_transport_header(skb, -hdr_len);
+
+ /* RFC4303: Drop dummy packets without any error */
+ if (err == IPPROTO_NONE)
+ err = -EINVAL;
+
+out:
+ return err;
+}
+EXPORT_SYMBOL_GPL(esp6_input_done2);
+
+static void esp_input_done(void *data, int err)
+{
+ struct sk_buff *skb = data;
+
+ xfrm_input_resume(skb, esp6_input_done2(skb, err));
+}
+
+static void esp_input_restore_header(struct sk_buff *skb)
+{
+ esp_restore_header(skb, 0);
+ __skb_pull(skb, 4);
+}
+
+static void esp_input_set_header(struct sk_buff *skb, __be32 *seqhi)
+{
+ struct xfrm_state *x = xfrm_input_state(skb);
+
+ /* For ESN we move the header forward by 4 bytes to
+ * accommodate the high bits. We will move it back after
+ * decryption.
+ */
+ if ((x->props.flags & XFRM_STATE_ESN)) {
+ struct ip_esp_hdr *esph = skb_push(skb, 4);
+
+ *seqhi = esph->spi;
+ esph->spi = esph->seq_no;
+ esph->seq_no = XFRM_SKB_CB(skb)->seq.input.hi;
+ }
+}
+
+static void esp_input_done_esn(void *data, int err)
+{
+ struct sk_buff *skb = data;
+
+ esp_input_restore_header(skb);
+ esp_input_done(data, err);
+}
static int esp6_input(struct xfrm_state *x, struct sk_buff *skb)
{
- struct ipv6hdr *iph;
- struct ipv6_esp_hdr *esph;
- struct esp_data *esp = x->data;
- struct crypto_blkcipher *tfm = esp->conf.tfm;
- struct blkcipher_desc desc = { .tfm = tfm };
+ struct crypto_aead *aead = x->data;
+ struct aead_request *req;
struct sk_buff *trailer;
- int blksize = ALIGN(crypto_blkcipher_blocksize(tfm), 4);
- int alen = esp->auth.icv_trunc_len;
- int elen = skb->len - sizeof(struct ipv6_esp_hdr) - esp->conf.ivlen - alen;
-
- int hdr_len = skb->h.raw - skb->nh.raw;
+ int ivlen = crypto_aead_ivsize(aead);
+ int elen = skb->len - sizeof(struct ip_esp_hdr) - ivlen;
int nfrags;
+ int assoclen;
+ int seqhilen;
int ret = 0;
+ void *tmp;
+ __be32 *seqhi;
+ u8 *iv;
+ struct scatterlist *sg;
- if (!pskb_may_pull(skb, sizeof(struct ipv6_esp_hdr))) {
+ if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr) + ivlen)) {
ret = -EINVAL;
goto out;
}
- if (elen <= 0 || (elen & (blksize-1))) {
+ if (elen <= 0) {
ret = -EINVAL;
goto out;
}
- /* If integrity check is required, do this. */
- if (esp->auth.icv_full_len) {
- u8 sum[alen];
+ assoclen = sizeof(struct ip_esp_hdr);
+ seqhilen = 0;
- ret = esp_mac_digest(esp, skb, 0, skb->len - alen);
- if (ret)
- goto out;
+ if (x->props.flags & XFRM_STATE_ESN) {
+ seqhilen += sizeof(__be32);
+ assoclen += seqhilen;
+ }
- if (skb_copy_bits(skb, skb->len - alen, sum, alen))
- BUG();
+ if (!skb_cloned(skb)) {
+ if (!skb_is_nonlinear(skb)) {
+ nfrags = 1;
- if (unlikely(memcmp(esp->auth.work_icv, sum, alen))) {
- x->stats.integrity_failed++;
- ret = -EINVAL;
- goto out;
+ goto skip_cow;
+ } else if (!skb_has_frag_list(skb)) {
+ nfrags = skb_shinfo(skb)->nr_frags;
+ nfrags++;
+
+ goto skip_cow;
}
}
- if ((nfrags = skb_cow_data(skb, 0, &trailer)) < 0) {
+ nfrags = skb_cow_data(skb, 0, &trailer);
+ if (nfrags < 0) {
ret = -EINVAL;
goto out;
}
- skb->ip_summed = CHECKSUM_NONE;
+skip_cow:
+ ret = -ENOMEM;
+ tmp = esp_alloc_tmp(aead, nfrags, seqhilen);
+ if (!tmp)
+ goto out;
- esph = (struct ipv6_esp_hdr*)skb->data;
- iph = skb->nh.ipv6h;
+ ESP_SKB_CB(skb)->tmp = tmp;
+ seqhi = esp_tmp_extra(tmp);
+ iv = esp_tmp_iv(aead, tmp, seqhilen);
+ req = esp_tmp_req(aead, iv);
+ sg = esp_req_sg(aead, req);
- /* Get ivec. This can be wrong, check against another impls. */
- if (esp->conf.ivlen)
- crypto_blkcipher_set_iv(tfm, esph->enc_data, esp->conf.ivlen);
+ esp_input_set_header(skb, seqhi);
- {
- u8 nexthdr[2];
- struct scatterlist *sg = &esp->sgbuf[0];
- u8 padlen;
+ sg_init_table(sg, nfrags);
+ ret = skb_to_sgvec(skb, sg, 0, skb->len);
+ if (unlikely(ret < 0)) {
+ kfree(tmp);
+ goto out;
+ }
- if (unlikely(nfrags > ESP_NUM_FAST_SG)) {
- sg = kmalloc(sizeof(struct scatterlist)*nfrags, GFP_ATOMIC);
- if (!sg) {
- ret = -ENOMEM;
- goto out;
- }
- }
- skb_to_sgvec(skb, sg, sizeof(struct ipv6_esp_hdr) + esp->conf.ivlen, elen);
- ret = crypto_blkcipher_decrypt(&desc, sg, sg, elen);
- if (unlikely(sg != &esp->sgbuf[0]))
- kfree(sg);
- if (unlikely(ret))
- goto out;
+ skb->ip_summed = CHECKSUM_NONE;
- if (skb_copy_bits(skb, skb->len-alen-2, nexthdr, 2))
- BUG();
+ if ((x->props.flags & XFRM_STATE_ESN))
+ aead_request_set_callback(req, 0, esp_input_done_esn, skb);
+ else
+ aead_request_set_callback(req, 0, esp_input_done, skb);
- padlen = nexthdr[0];
- if (padlen+2 >= elen) {
- LIMIT_NETDEBUG(KERN_WARNING "ipsec esp packet is garbage padlen=%d, elen=%d\n", padlen+2, elen);
- ret = -EINVAL;
- goto out;
- }
- /* ... check padding bits here. Silly. :-) */
+ aead_request_set_crypt(req, sg, sg, elen + ivlen, iv);
+ aead_request_set_ad(req, assoclen);
- pskb_trim(skb, skb->len - alen - padlen - 2);
- ret = nexthdr[1];
- }
+ ret = crypto_aead_decrypt(req);
+ if (ret == -EINPROGRESS)
+ goto out;
+
+ if ((x->props.flags & XFRM_STATE_ESN))
+ esp_input_restore_header(skb);
- skb->h.raw = __skb_pull(skb, sizeof(*esph) + esp->conf.ivlen) - hdr_len;
+ ret = esp6_input_done2(skb, ret);
out:
return ret;
}
-static u32 esp6_get_max_size(struct xfrm_state *x, int mtu)
+static int esp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+ u8 type, u8 code, int offset, __be32 info)
{
- struct esp_data *esp = x->data;
- u32 blksize = ALIGN(crypto_blkcipher_blocksize(esp->conf.tfm), 4);
+ struct net *net = dev_net(skb->dev);
+ const struct ipv6hdr *iph = (const struct ipv6hdr *)skb->data;
+ struct ip_esp_hdr *esph = (struct ip_esp_hdr *)(skb->data + offset);
+ struct xfrm_state *x;
- if (x->props.mode == XFRM_MODE_TUNNEL) {
- mtu = ALIGN(mtu + 2, blksize);
- } else {
- /* The worst case. */
- u32 padsize = ((blksize - 1) & 7) + 1;
- mtu = ALIGN(mtu + 2, padsize) + blksize - padsize;
- }
- if (esp->conf.padlen)
- mtu = ALIGN(mtu, esp->conf.padlen);
+ if (type != ICMPV6_PKT_TOOBIG &&
+ type != NDISC_REDIRECT)
+ return 0;
- return mtu + x->props.header_len + esp->auth.icv_trunc_len;
+ x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
+ esph->spi, IPPROTO_ESP, AF_INET6);
+ if (!x)
+ return 0;
+
+ if (type == NDISC_REDIRECT)
+ ip6_redirect(skb, net, skb->dev->ifindex, 0,
+ sock_net_uid(net, NULL));
+ else
+ ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL));
+ xfrm_state_put(x);
+
+ return 0;
}
-static void esp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
- int type, int code, int offset, __be32 info)
+static void esp6_destroy(struct xfrm_state *x)
{
- struct ipv6hdr *iph = (struct ipv6hdr*)skb->data;
- struct ipv6_esp_hdr *esph = (struct ipv6_esp_hdr*)(skb->data+offset);
- struct xfrm_state *x;
+ struct crypto_aead *aead = x->data;
- if (type != ICMPV6_DEST_UNREACH &&
- type != ICMPV6_PKT_TOOBIG)
+ if (!aead)
return;
- x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET6);
- if (!x)
- return;
- printk(KERN_DEBUG "pmtu discovery on SA ESP/%08x/" NIP6_FMT "\n",
- ntohl(esph->spi), NIP6(iph->daddr));
- xfrm_state_put(x);
+ crypto_free_aead(aead);
}
-static void esp6_destroy(struct xfrm_state *x)
+static int esp_init_aead(struct xfrm_state *x, struct netlink_ext_ack *extack)
{
- struct esp_data *esp = x->data;
+ char aead_name[CRYPTO_MAX_ALG_NAME];
+ struct crypto_aead *aead;
+ int err;
- if (!esp)
- return;
+ if (snprintf(aead_name, CRYPTO_MAX_ALG_NAME, "%s(%s)",
+ x->geniv, x->aead->alg_name) >= CRYPTO_MAX_ALG_NAME) {
+ NL_SET_ERR_MSG(extack, "Algorithm name is too long");
+ return -ENAMETOOLONG;
+ }
+
+ aead = crypto_alloc_aead(aead_name, 0, 0);
+ err = PTR_ERR(aead);
+ if (IS_ERR(aead))
+ goto error;
+
+ x->data = aead;
- crypto_free_blkcipher(esp->conf.tfm);
- esp->conf.tfm = NULL;
- kfree(esp->conf.ivec);
- esp->conf.ivec = NULL;
- crypto_free_hash(esp->auth.tfm);
- esp->auth.tfm = NULL;
- kfree(esp->auth.work_icv);
- esp->auth.work_icv = NULL;
- kfree(esp);
+ err = crypto_aead_setkey(aead, x->aead->alg_key,
+ (x->aead->alg_key_len + 7) / 8);
+ if (err)
+ goto error;
+
+ err = crypto_aead_setauthsize(aead, x->aead->alg_icv_len / 8);
+ if (err)
+ goto error;
+
+ return 0;
+
+error:
+ NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
+ return err;
}
-static int esp6_init_state(struct xfrm_state *x)
+static int esp_init_authenc(struct xfrm_state *x,
+ struct netlink_ext_ack *extack)
{
- struct esp_data *esp = NULL;
- struct crypto_blkcipher *tfm;
+ struct crypto_aead *aead;
+ struct crypto_authenc_key_param *param;
+ struct rtattr *rta;
+ char *key;
+ char *p;
+ char authenc_name[CRYPTO_MAX_ALG_NAME];
+ unsigned int keylen;
+ int err;
- /* null auth and encryption can have zero length keys */
- if (x->aalg) {
- if (x->aalg->alg_key_len > 512)
+ err = -ENAMETOOLONG;
+
+ if ((x->props.flags & XFRM_STATE_ESN)) {
+ if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME,
+ "%s%sauthencesn(%s,%s)%s",
+ x->geniv ?: "", x->geniv ? "(" : "",
+ x->aalg ? x->aalg->alg_name : "digest_null",
+ x->ealg->alg_name,
+ x->geniv ? ")" : "") >= CRYPTO_MAX_ALG_NAME) {
+ NL_SET_ERR_MSG(extack, "Algorithm name is too long");
+ goto error;
+ }
+ } else {
+ if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME,
+ "%s%sauthenc(%s,%s)%s",
+ x->geniv ?: "", x->geniv ? "(" : "",
+ x->aalg ? x->aalg->alg_name : "digest_null",
+ x->ealg->alg_name,
+ x->geniv ? ")" : "") >= CRYPTO_MAX_ALG_NAME) {
+ NL_SET_ERR_MSG(extack, "Algorithm name is too long");
goto error;
+ }
}
- if (x->ealg == NULL)
+
+ aead = crypto_alloc_aead(authenc_name, 0, 0);
+ err = PTR_ERR(aead);
+ if (IS_ERR(aead)) {
+ NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
goto error;
+ }
+
+ x->data = aead;
- if (x->encap)
+ keylen = (x->aalg ? (x->aalg->alg_key_len + 7) / 8 : 0) +
+ (x->ealg->alg_key_len + 7) / 8 + RTA_SPACE(sizeof(*param));
+ err = -ENOMEM;
+ key = kmalloc(keylen, GFP_KERNEL);
+ if (!key)
goto error;
- esp = kzalloc(sizeof(*esp), GFP_KERNEL);
- if (esp == NULL)
- return -ENOMEM;
+ p = key;
+ rta = (void *)p;
+ rta->rta_type = CRYPTO_AUTHENC_KEYA_PARAM;
+ rta->rta_len = RTA_LENGTH(sizeof(*param));
+ param = RTA_DATA(rta);
+ p += RTA_SPACE(sizeof(*param));
if (x->aalg) {
struct xfrm_algo_desc *aalg_desc;
- struct crypto_hash *hash;
- esp->auth.key = x->aalg->alg_key;
- esp->auth.key_len = (x->aalg->alg_key_len+7)/8;
- hash = crypto_alloc_hash(x->aalg->alg_name, 0,
- CRYPTO_ALG_ASYNC);
- if (IS_ERR(hash))
- goto error;
+ memcpy(p, x->aalg->alg_key, (x->aalg->alg_key_len + 7) / 8);
+ p += (x->aalg->alg_key_len + 7) / 8;
- esp->auth.tfm = hash;
- if (crypto_hash_setkey(hash, esp->auth.key, esp->auth.key_len))
- goto error;
-
aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name, 0);
BUG_ON(!aalg_desc);
-
- if (aalg_desc->uinfo.auth.icv_fullbits/8 !=
- crypto_hash_digestsize(hash)) {
- NETDEBUG(KERN_INFO "ESP: %s digestsize %u != %hu\n",
- x->aalg->alg_name,
- crypto_hash_digestsize(hash),
- aalg_desc->uinfo.auth.icv_fullbits/8);
- goto error;
+
+ err = -EINVAL;
+ if (aalg_desc->uinfo.auth.icv_fullbits / 8 !=
+ crypto_aead_authsize(aead)) {
+ NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
+ goto free_key;
+ }
+
+ err = crypto_aead_setauthsize(
+ aead, x->aalg->alg_trunc_len / 8);
+ if (err) {
+ NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
+ goto free_key;
}
-
- esp->auth.icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8;
- esp->auth.icv_trunc_len = aalg_desc->uinfo.auth.icv_truncbits/8;
-
- esp->auth.work_icv = kmalloc(esp->auth.icv_full_len, GFP_KERNEL);
- if (!esp->auth.work_icv)
- goto error;
}
- esp->conf.key = x->ealg->alg_key;
- esp->conf.key_len = (x->ealg->alg_key_len+7)/8;
- tfm = crypto_alloc_blkcipher(x->ealg->alg_name, 0, CRYPTO_ALG_ASYNC);
- if (IS_ERR(tfm))
- goto error;
- esp->conf.tfm = tfm;
- esp->conf.ivlen = crypto_blkcipher_ivsize(tfm);
- esp->conf.padlen = 0;
- if (esp->conf.ivlen) {
- esp->conf.ivec = kmalloc(esp->conf.ivlen, GFP_KERNEL);
- if (unlikely(esp->conf.ivec == NULL))
- goto error;
- esp->conf.ivinitted = 0;
+
+ param->enckeylen = cpu_to_be32((x->ealg->alg_key_len + 7) / 8);
+ memcpy(p, x->ealg->alg_key, (x->ealg->alg_key_len + 7) / 8);
+
+ err = crypto_aead_setkey(aead, key, keylen);
+
+free_key:
+ kfree(key);
+
+error:
+ return err;
+}
+
+static int esp6_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack)
+{
+ struct crypto_aead *aead;
+ u32 align;
+ int err;
+
+ x->data = NULL;
+
+ if (x->aead) {
+ err = esp_init_aead(x, extack);
+ } else if (x->ealg) {
+ err = esp_init_authenc(x, extack);
+ } else {
+ NL_SET_ERR_MSG(extack, "ESP: AEAD or CRYPT must be provided");
+ err = -EINVAL;
}
- if (crypto_blkcipher_setkey(tfm, esp->conf.key, esp->conf.key_len))
+
+ if (err)
goto error;
- x->props.header_len = sizeof(struct ipv6_esp_hdr) + esp->conf.ivlen;
- if (x->props.mode == XFRM_MODE_TUNNEL)
+
+ aead = x->data;
+
+ x->props.header_len = sizeof(struct ip_esp_hdr) +
+ crypto_aead_ivsize(aead);
+ switch (x->props.mode) {
+ case XFRM_MODE_BEET:
+ if (x->sel.family != AF_INET6)
+ x->props.header_len += IPV4_BEET_PHMAXLEN +
+ (sizeof(struct ipv6hdr) - sizeof(struct iphdr));
+ break;
+ default:
+ case XFRM_MODE_TRANSPORT:
+ break;
+ case XFRM_MODE_TUNNEL:
x->props.header_len += sizeof(struct ipv6hdr);
- x->data = esp;
- return 0;
+ break;
+ }
+
+ if (x->encap) {
+ struct xfrm_encap_tmpl *encap = x->encap;
+
+ switch (encap->encap_type) {
+ default:
+ NL_SET_ERR_MSG(extack, "Unsupported encapsulation type for ESP");
+ err = -EINVAL;
+ goto error;
+ case UDP_ENCAP_ESPINUDP:
+ x->props.header_len += sizeof(struct udphdr);
+ break;
+#ifdef CONFIG_INET6_ESPINTCP
+ case TCP_ENCAP_ESPINTCP:
+ /* only the length field, TCP encap is done by
+ * the socket
+ */
+ x->props.header_len += 2;
+ break;
+#endif
+ }
+ }
+
+ align = ALIGN(crypto_aead_blocksize(aead), 4);
+ x->props.trailer_len = align + 1 + crypto_aead_authsize(aead);
error:
- x->data = esp;
- esp6_destroy(x);
- x->data = NULL;
- return -EINVAL;
+ return err;
}
-static struct xfrm_type esp6_type =
+static int esp6_rcv_cb(struct sk_buff *skb, int err)
{
- .description = "ESP6",
- .owner = THIS_MODULE,
- .proto = IPPROTO_ESP,
+ return 0;
+}
+
+static const struct xfrm_type esp6_type = {
+ .owner = THIS_MODULE,
+ .proto = IPPROTO_ESP,
+ .flags = XFRM_TYPE_REPLAY_PROT,
.init_state = esp6_init_state,
.destructor = esp6_destroy,
- .get_max_size = esp6_get_max_size,
.input = esp6_input,
.output = esp6_output,
- .hdr_offset = xfrm6_find_1stfragopt,
};
-static struct inet6_protocol esp6_protocol = {
- .handler = xfrm6_rcv,
+static struct xfrm6_protocol esp6_protocol = {
+ .handler = xfrm6_rcv,
+ .input_handler = xfrm_input,
+ .cb_handler = esp6_rcv_cb,
.err_handler = esp6_err,
- .flags = INET6_PROTO_NOPOLICY,
+ .priority = 0,
};
static int __init esp6_init(void)
{
if (xfrm_register_type(&esp6_type, AF_INET6) < 0) {
- printk(KERN_INFO "ipv6 esp init: can't add xfrm type\n");
+ pr_info("%s: can't add xfrm type\n", __func__);
return -EAGAIN;
}
- if (inet6_add_protocol(&esp6_protocol, IPPROTO_ESP) < 0) {
- printk(KERN_INFO "ipv6 esp init: can't add protocol\n");
+ if (xfrm6_protocol_register(&esp6_protocol, IPPROTO_ESP) < 0) {
+ pr_info("%s: can't add protocol\n", __func__);
xfrm_unregister_type(&esp6_type, AF_INET6);
return -EAGAIN;
}
@@ -411,13 +1244,14 @@ static int __init esp6_init(void)
static void __exit esp6_fini(void)
{
- if (inet6_del_protocol(&esp6_protocol, IPPROTO_ESP) < 0)
- printk(KERN_INFO "ipv6 esp close: can't remove protocol\n");
- if (xfrm_unregister_type(&esp6_type, AF_INET6) < 0)
- printk(KERN_INFO "ipv6 esp close: can't remove xfrm type\n");
+ if (xfrm6_protocol_deregister(&esp6_protocol, IPPROTO_ESP) < 0)
+ pr_info("%s: can't remove protocol\n", __func__);
+ xfrm_unregister_type(&esp6_type, AF_INET6);
}
module_init(esp6_init);
module_exit(esp6_fini);
+MODULE_DESCRIPTION("IPv6 ESP transformation helpers");
MODULE_LICENSE("GPL");
+MODULE_ALIAS_XFRM_TYPE(AF_INET6, XFRM_PROTO_ESP);
diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c
new file mode 100644
index 000000000000..22410243ebe8
--- /dev/null
+++ b/net/ipv6/esp6_offload.c
@@ -0,0 +1,435 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * IPV6 GSO/GRO offload support
+ * Linux INET implementation
+ *
+ * Copyright (C) 2016 secunet Security Networks AG
+ * Author: Steffen Klassert <steffen.klassert@secunet.com>
+ *
+ * ESP GRO support
+ */
+
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <net/protocol.h>
+#include <crypto/aead.h>
+#include <crypto/authenc.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <net/gro.h>
+#include <net/gso.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+#include <net/esp.h>
+#include <linux/scatterlist.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <net/ip6_route.h>
+#include <net/ipv6.h>
+#include <linux/icmpv6.h>
+
+static __u16 esp6_nexthdr_esp_offset(struct ipv6hdr *ipv6_hdr, int nhlen)
+{
+ int off = sizeof(struct ipv6hdr);
+ struct ipv6_opt_hdr *exthdr;
+
+ /* ESP or ESPINUDP */
+ if (likely(ipv6_hdr->nexthdr == NEXTHDR_ESP ||
+ ipv6_hdr->nexthdr == NEXTHDR_UDP))
+ return offsetof(struct ipv6hdr, nexthdr);
+
+ while (off < nhlen) {
+ exthdr = (void *)ipv6_hdr + off;
+ if (exthdr->nexthdr == NEXTHDR_ESP)
+ return off;
+
+ off += ipv6_optlen(exthdr);
+ }
+
+ return 0;
+}
+
+static struct sk_buff *esp6_gro_receive(struct list_head *head,
+ struct sk_buff *skb)
+{
+ int offset = skb_gro_offset(skb);
+ struct xfrm_offload *xo;
+ struct xfrm_state *x;
+ int encap_type = 0;
+ __be32 seq;
+ __be32 spi;
+ int nhoff;
+
+ if (NAPI_GRO_CB(skb)->proto == IPPROTO_UDP)
+ encap_type = UDP_ENCAP_ESPINUDP;
+
+ if (!pskb_pull(skb, offset))
+ return NULL;
+
+ if (xfrm_parse_spi(skb, IPPROTO_ESP, &spi, &seq) != 0)
+ goto out;
+
+ xo = xfrm_offload(skb);
+ if (!xo || !(xo->flags & CRYPTO_DONE)) {
+ struct sec_path *sp = secpath_set(skb);
+
+ if (!sp)
+ goto out;
+
+ if (sp->len == XFRM_MAX_DEPTH)
+ goto out_reset;
+
+ x = xfrm_input_state_lookup(dev_net(skb->dev), skb->mark,
+ (xfrm_address_t *)&ipv6_hdr(skb)->daddr,
+ spi, IPPROTO_ESP, AF_INET6);
+
+ if (unlikely(x && x->dir && x->dir != XFRM_SA_DIR_IN)) {
+ /* non-offload path will record the error and audit log */
+ xfrm_state_put(x);
+ x = NULL;
+ }
+
+ if (!x)
+ goto out_reset;
+
+ skb->mark = xfrm_smark_get(skb->mark, x);
+
+ sp->xvec[sp->len++] = x;
+ sp->olen++;
+
+ xo = xfrm_offload(skb);
+ if (!xo)
+ goto out_reset;
+ }
+
+ xo->flags |= XFRM_GRO;
+
+ nhoff = esp6_nexthdr_esp_offset(ipv6_hdr(skb), offset);
+ if (!nhoff)
+ goto out;
+
+ IP6CB(skb)->nhoff = nhoff;
+ XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL;
+ XFRM_SPI_SKB_CB(skb)->family = AF_INET6;
+ XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr);
+ XFRM_SPI_SKB_CB(skb)->seq = seq;
+
+ /* We don't need to handle errors from xfrm_input, it does all
+ * the error handling and frees the resources on error. */
+ xfrm_input(skb, IPPROTO_ESP, spi, encap_type);
+
+ return ERR_PTR(-EINPROGRESS);
+out_reset:
+ secpath_reset(skb);
+out:
+ skb_push(skb, offset);
+ NAPI_GRO_CB(skb)->same_flow = 0;
+ NAPI_GRO_CB(skb)->flush = 1;
+
+ return NULL;
+}
+
+static void esp6_gso_encap(struct xfrm_state *x, struct sk_buff *skb)
+{
+ struct ip_esp_hdr *esph;
+ struct ipv6hdr *iph = ipv6_hdr(skb);
+ struct xfrm_offload *xo = xfrm_offload(skb);
+ u8 proto = iph->nexthdr;
+
+ skb_push(skb, -skb_network_offset(skb));
+
+ if (x->outer_mode.encap == XFRM_MODE_TRANSPORT) {
+ __be16 frag;
+
+ ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &proto, &frag);
+ }
+
+ esph = ip_esp_hdr(skb);
+ *skb_mac_header(skb) = IPPROTO_ESP;
+
+ esph->spi = x->id.spi;
+ esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
+
+ xo->proto = proto;
+}
+
+static struct sk_buff *xfrm6_tunnel_gso_segment(struct xfrm_state *x,
+ struct sk_buff *skb,
+ netdev_features_t features)
+{
+ const struct xfrm_mode *inner_mode = xfrm_ip2inner_mode(x,
+ XFRM_MODE_SKB_CB(skb)->protocol);
+ __be16 type = inner_mode->family == AF_INET ? htons(ETH_P_IP)
+ : htons(ETH_P_IPV6);
+
+ return skb_eth_gso_segment(skb, features, type);
+}
+
+static struct sk_buff *xfrm6_transport_gso_segment(struct xfrm_state *x,
+ struct sk_buff *skb,
+ netdev_features_t features)
+{
+ const struct net_offload *ops;
+ struct sk_buff *segs = ERR_PTR(-EINVAL);
+ struct xfrm_offload *xo = xfrm_offload(skb);
+
+ skb->transport_header += x->props.header_len;
+ ops = rcu_dereference(inet6_offloads[xo->proto]);
+ if (likely(ops && ops->callbacks.gso_segment))
+ segs = ops->callbacks.gso_segment(skb, features);
+
+ return segs;
+}
+
+static struct sk_buff *xfrm6_beet_gso_segment(struct xfrm_state *x,
+ struct sk_buff *skb,
+ netdev_features_t features)
+{
+ struct xfrm_offload *xo = xfrm_offload(skb);
+ struct sk_buff *segs = ERR_PTR(-EINVAL);
+ const struct net_offload *ops;
+ u8 proto = xo->proto;
+
+ skb->transport_header += x->props.header_len;
+
+ if (x->sel.family != AF_INET6) {
+ skb->transport_header -=
+ (sizeof(struct ipv6hdr) - sizeof(struct iphdr));
+
+ if (proto == IPPROTO_BEETPH) {
+ struct ip_beet_phdr *ph =
+ (struct ip_beet_phdr *)skb->data;
+
+ skb->transport_header += ph->hdrlen * 8;
+ proto = ph->nexthdr;
+ } else {
+ skb->transport_header -= IPV4_BEET_PHMAXLEN;
+ }
+
+ if (proto == IPPROTO_TCP)
+ skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV6;
+ } else {
+ __be16 frag;
+
+ skb->transport_header +=
+ ipv6_skip_exthdr(skb, 0, &proto, &frag);
+ }
+
+ if (proto == IPPROTO_IPIP)
+ skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP6;
+
+ __skb_pull(skb, skb_transport_offset(skb));
+ ops = rcu_dereference(inet6_offloads[proto]);
+ if (likely(ops && ops->callbacks.gso_segment))
+ segs = ops->callbacks.gso_segment(skb, features);
+
+ return segs;
+}
+
+static struct sk_buff *xfrm6_outer_mode_gso_segment(struct xfrm_state *x,
+ struct sk_buff *skb,
+ netdev_features_t features)
+{
+ switch (x->outer_mode.encap) {
+ case XFRM_MODE_TUNNEL:
+ return xfrm6_tunnel_gso_segment(x, skb, features);
+ case XFRM_MODE_TRANSPORT:
+ return xfrm6_transport_gso_segment(x, skb, features);
+ case XFRM_MODE_BEET:
+ return xfrm6_beet_gso_segment(x, skb, features);
+ }
+
+ return ERR_PTR(-EOPNOTSUPP);
+}
+
+static struct sk_buff *esp6_gso_segment(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ struct xfrm_state *x;
+ struct ip_esp_hdr *esph;
+ struct crypto_aead *aead;
+ netdev_features_t esp_features = features;
+ struct xfrm_offload *xo = xfrm_offload(skb);
+ struct sec_path *sp;
+
+ if (!xo)
+ return ERR_PTR(-EINVAL);
+
+ if (!(skb_shinfo(skb)->gso_type & SKB_GSO_ESP))
+ return ERR_PTR(-EINVAL);
+
+ sp = skb_sec_path(skb);
+ x = sp->xvec[sp->len - 1];
+ aead = x->data;
+ esph = ip_esp_hdr(skb);
+
+ if (esph->spi != x->id.spi)
+ return ERR_PTR(-EINVAL);
+
+ if (!pskb_may_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead)))
+ return ERR_PTR(-EINVAL);
+
+ __skb_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead));
+
+ skb->encap_hdr_csum = 1;
+
+ if (!(features & NETIF_F_HW_ESP) || x->xso.dev != skb->dev)
+ esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK |
+ NETIF_F_SCTP_CRC);
+ else if (!(features & NETIF_F_HW_ESP_TX_CSUM))
+ esp_features = features & ~(NETIF_F_CSUM_MASK |
+ NETIF_F_SCTP_CRC);
+
+ xo->flags |= XFRM_GSO_SEGMENT;
+
+ return xfrm6_outer_mode_gso_segment(x, skb, esp_features);
+}
+
+static int esp6_input_tail(struct xfrm_state *x, struct sk_buff *skb)
+{
+ struct crypto_aead *aead = x->data;
+ struct xfrm_offload *xo = xfrm_offload(skb);
+
+ if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead)))
+ return -EINVAL;
+
+ if (!(xo->flags & CRYPTO_DONE))
+ skb->ip_summed = CHECKSUM_NONE;
+
+ return esp6_input_done2(skb, 0);
+}
+
+static int esp6_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features)
+{
+ int len;
+ int err;
+ int alen;
+ int blksize;
+ struct xfrm_offload *xo;
+ struct crypto_aead *aead;
+ struct esp_info esp;
+ bool hw_offload = true;
+ __u32 seq;
+
+ esp.inplace = true;
+
+ xo = xfrm_offload(skb);
+
+ if (!xo)
+ return -EINVAL;
+
+ if (!(features & NETIF_F_HW_ESP) || x->xso.dev != skb->dev) {
+ xo->flags |= CRYPTO_FALLBACK;
+ hw_offload = false;
+ }
+
+ esp.proto = xo->proto;
+
+ /* skb is pure payload to encrypt */
+
+ aead = x->data;
+ alen = crypto_aead_authsize(aead);
+
+ esp.tfclen = 0;
+ /* XXX: Add support for tfc padding here. */
+
+ blksize = ALIGN(crypto_aead_blocksize(aead), 4);
+ esp.clen = ALIGN(skb->len + 2 + esp.tfclen, blksize);
+ esp.plen = esp.clen - skb->len - esp.tfclen;
+ esp.tailen = esp.tfclen + esp.plen + alen;
+
+ if (!hw_offload || !skb_is_gso(skb)) {
+ esp.nfrags = esp6_output_head(x, skb, &esp);
+ if (esp.nfrags < 0)
+ return esp.nfrags;
+ }
+
+ seq = xo->seq.low;
+
+ esp.esph = ip_esp_hdr(skb);
+ esp.esph->spi = x->id.spi;
+
+ skb_push(skb, -skb_network_offset(skb));
+
+ if (xo->flags & XFRM_GSO_SEGMENT) {
+ esp.esph->seq_no = htonl(seq);
+
+ if (!skb_is_gso(skb))
+ xo->seq.low++;
+ else
+ xo->seq.low += skb_shinfo(skb)->gso_segs;
+ }
+
+ if (xo->seq.low < seq)
+ xo->seq.hi++;
+
+ esp.seqno = cpu_to_be64(xo->seq.low + ((u64)xo->seq.hi << 32));
+
+ len = skb->len - sizeof(struct ipv6hdr);
+ if (len > IPV6_MAXPLEN)
+ len = 0;
+
+ ipv6_hdr(skb)->payload_len = htons(len);
+
+ if (hw_offload) {
+ if (!skb_ext_add(skb, SKB_EXT_SEC_PATH))
+ return -ENOMEM;
+
+ xo = xfrm_offload(skb);
+ if (!xo)
+ return -EINVAL;
+
+ xo->flags |= XFRM_XMIT;
+ return 0;
+ }
+
+ err = esp6_output_tail(x, skb, &esp);
+ if (err)
+ return err;
+
+ secpath_reset(skb);
+
+ if (skb_needs_linearize(skb, skb->dev->features) &&
+ __skb_linearize(skb))
+ return -ENOMEM;
+ return 0;
+}
+
+static const struct net_offload esp6_offload = {
+ .callbacks = {
+ .gro_receive = esp6_gro_receive,
+ .gso_segment = esp6_gso_segment,
+ },
+};
+
+static const struct xfrm_type_offload esp6_type_offload = {
+ .owner = THIS_MODULE,
+ .proto = IPPROTO_ESP,
+ .input_tail = esp6_input_tail,
+ .xmit = esp6_xmit,
+ .encap = esp6_gso_encap,
+};
+
+static int __init esp6_offload_init(void)
+{
+ if (xfrm_register_type_offload(&esp6_type_offload, AF_INET6) < 0) {
+ pr_info("%s: can't add xfrm type offload\n", __func__);
+ return -EAGAIN;
+ }
+
+ return inet6_add_offload(&esp6_offload, IPPROTO_ESP);
+}
+
+static void __exit esp6_offload_exit(void)
+{
+ xfrm_unregister_type_offload(&esp6_type_offload, AF_INET6);
+ inet6_del_offload(&esp6_offload, IPPROTO_ESP);
+}
+
+module_init(esp6_offload_init);
+module_exit(esp6_offload_exit);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Steffen Klassert <steffen.klassert@secunet.com>");
+MODULE_ALIAS_XFRM_OFFLOAD_TYPE(AF_INET6, XFRM_PROTO_ESP);
+MODULE_DESCRIPTION("IPV6 GSO/GRO offload support");
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index 0711f92d6a12..a23eb8734e15 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Extension Header handling for IPv6
* Linux INET6 implementation
@@ -6,17 +7,10 @@
* Pedro Roque <roque@di.fc.ul.pt>
* Andi Kleen <ak@muc.de>
* Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
- *
- * $Id: exthdrs.c,v 1.13 2001/06/19 15:58:56 davem Exp $
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
/* Changes:
- * yoshfuji : ensure not to overrun while parsing
+ * yoshfuji : ensure not to overrun while parsing
* tlv options.
* Mitsuru KANDA @USAGI and: Remove ipv6_parse_exthdrs().
* YOSHIFUJI Hideaki @USAGI Register inbound extension header
@@ -27,12 +21,14 @@
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/sockios.h>
-#include <linux/sched.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/icmpv6.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <net/dst.h>
#include <net/sock.h>
#include <net/snmp.h>
@@ -43,66 +39,22 @@
#include <net/ndisc.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
-#ifdef CONFIG_IPV6_MIP6
+#include <net/calipso.h>
+#if IS_ENABLED(CONFIG_IPV6_MIP6)
#include <net/xfrm.h>
#endif
+#include <linux/seg6.h>
+#include <net/seg6.h>
+#ifdef CONFIG_IPV6_SEG6_HMAC
+#include <net/seg6_hmac.h>
+#endif
+#include <net/rpl.h>
+#include <linux/ioam6.h>
+#include <linux/ioam6_genl.h>
+#include <net/ioam6.h>
+#include <net/dst_metadata.h>
-#include <asm/uaccess.h>
-
-int ipv6_find_tlv(struct sk_buff *skb, int offset, int type)
-{
- int packet_len = skb->tail - skb->nh.raw;
- struct ipv6_opt_hdr *hdr;
- int len;
-
- if (offset + 2 > packet_len)
- goto bad;
- hdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset);
- len = ((hdr->hdrlen + 1) << 3);
-
- if (offset + len > packet_len)
- goto bad;
-
- offset += 2;
- len -= 2;
-
- while (len > 0) {
- int opttype = skb->nh.raw[offset];
- int optlen;
-
- if (opttype == type)
- return offset;
-
- switch (opttype) {
- case IPV6_TLV_PAD0:
- optlen = 1;
- break;
- default:
- optlen = skb->nh.raw[offset + 1] + 2;
- if (optlen > len)
- goto bad;
- break;
- }
- offset += optlen;
- len -= optlen;
- }
- /* not_found */
- bad:
- return -1;
-}
-
-/*
- * Parsing tlv encoded headers.
- *
- * Parsing function "func" returns 1, if parsing succeed
- * and 0, if it failed.
- * It MUST NOT touch skb->h.
- */
-
-struct tlvtype_proc {
- int type;
- int (*func)(struct sk_buff **skbp, int offset);
-};
+#include <linux/uaccess.h>
/*********************
Generic functions
@@ -110,13 +62,23 @@ struct tlvtype_proc {
/* An unknown option is detected, decide what to do */
-static int ip6_tlvopt_unknown(struct sk_buff **skbp, int optoff)
+static bool ip6_tlvopt_unknown(struct sk_buff *skb, int optoff,
+ bool disallow_unknowns)
{
- struct sk_buff *skb = *skbp;
+ if (disallow_unknowns) {
+ /* If unknown TLVs are disallowed by configuration
+ * then always silently drop packet. Note this also
+ * means no ICMP parameter problem is sent which
+ * could be a good property to mitigate a reflection DOS
+ * attack.
+ */
- switch ((skb->nh.raw[optoff] & 0xC0) >> 6) {
+ goto drop;
+ }
+
+ switch ((skb_network_header(skb)[optoff] & 0xC0) >> 6) {
case 0: /* ignore */
- return 1;
+ return true;
case 1: /* drop packet */
break;
@@ -125,188 +87,245 @@ static int ip6_tlvopt_unknown(struct sk_buff **skbp, int optoff)
/* Actually, it is redundant check. icmp_send
will recheck in any case.
*/
- if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr))
+ if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr))
break;
+ fallthrough;
case 2: /* send ICMP PARM PROB regardless and drop packet */
- icmpv6_param_prob(skb, ICMPV6_UNK_OPTION, optoff);
- return 0;
- };
+ icmpv6_param_prob_reason(skb, ICMPV6_UNK_OPTION, optoff,
+ SKB_DROP_REASON_UNHANDLED_PROTO);
+ return false;
+ }
- kfree_skb(skb);
- return 0;
+drop:
+ kfree_skb_reason(skb, SKB_DROP_REASON_UNHANDLED_PROTO);
+ return false;
}
+static bool ipv6_hop_ra(struct sk_buff *skb, int optoff);
+static bool ipv6_hop_ioam(struct sk_buff *skb, int optoff);
+static bool ipv6_hop_jumbo(struct sk_buff *skb, int optoff);
+static bool ipv6_hop_calipso(struct sk_buff *skb, int optoff);
+#if IS_ENABLED(CONFIG_IPV6_MIP6)
+static bool ipv6_dest_hao(struct sk_buff *skb, int optoff);
+#endif
+
/* Parse tlv encoded option header (hop-by-hop or destination) */
-static int ip6_parse_tlv(struct tlvtype_proc *procs, struct sk_buff **skbp)
+static bool ip6_parse_tlv(bool hopbyhop,
+ struct sk_buff *skb,
+ int max_count)
{
- struct sk_buff *skb = *skbp;
- struct tlvtype_proc *curr;
- int off = skb->h.raw - skb->nh.raw;
- int len = ((skb->h.raw[1]+1)<<3);
-
- if ((skb->h.raw + len) - skb->data > skb_headlen(skb))
- goto bad;
+ int len = (skb_transport_header(skb)[1] + 1) << 3;
+ const unsigned char *nh = skb_network_header(skb);
+ int off = skb_network_header_len(skb);
+ bool disallow_unknowns = false;
+ int tlv_count = 0;
+ int padlen = 0;
+
+ if (unlikely(max_count < 0)) {
+ disallow_unknowns = true;
+ max_count = -max_count;
+ }
off += 2;
len -= 2;
while (len > 0) {
- int optlen = skb->nh.raw[off+1]+2;
+ int optlen, i;
- switch (skb->nh.raw[off]) {
- case IPV6_TLV_PAD0:
- optlen = 1;
- break;
+ if (nh[off] == IPV6_TLV_PAD1) {
+ padlen++;
+ if (padlen > 7)
+ goto bad;
+ off++;
+ len--;
+ continue;
+ }
+ if (len < 2)
+ goto bad;
+ optlen = nh[off + 1] + 2;
+ if (optlen > len)
+ goto bad;
+
+ if (nh[off] == IPV6_TLV_PADN) {
+ /* RFC 2460 states that the purpose of PadN is
+ * to align the containing header to multiples
+ * of 8. 7 is therefore the highest valid value.
+ * See also RFC 4942, Section 2.1.9.5.
+ */
+ padlen += optlen;
+ if (padlen > 7)
+ goto bad;
+ /* RFC 4942 recommends receiving hosts to
+ * actively check PadN payload to contain
+ * only zeroes.
+ */
+ for (i = 2; i < optlen; i++) {
+ if (nh[off + i] != 0)
+ goto bad;
+ }
+ } else {
+ tlv_count++;
+ if (tlv_count > max_count)
+ goto bad;
- case IPV6_TLV_PADN:
- break;
+ if (hopbyhop) {
+ switch (nh[off]) {
+ case IPV6_TLV_ROUTERALERT:
+ if (!ipv6_hop_ra(skb, off))
+ return false;
+ break;
+ case IPV6_TLV_IOAM:
+ if (!ipv6_hop_ioam(skb, off))
+ return false;
- default: /* Other TLV code so scan list */
- if (optlen > len)
- goto bad;
- for (curr=procs; curr->type >= 0; curr++) {
- if (curr->type == skb->nh.raw[off]) {
- /* type specific length/alignment
- checks will be performed in the
- func(). */
- if (curr->func(skbp, off) == 0)
- return 0;
+ nh = skb_network_header(skb);
+ break;
+ case IPV6_TLV_JUMBO:
+ if (!ipv6_hop_jumbo(skb, off))
+ return false;
+ break;
+ case IPV6_TLV_CALIPSO:
+ if (!ipv6_hop_calipso(skb, off))
+ return false;
+ break;
+ default:
+ if (!ip6_tlvopt_unknown(skb, off,
+ disallow_unknowns))
+ return false;
+ break;
+ }
+ } else {
+ switch (nh[off]) {
+#if IS_ENABLED(CONFIG_IPV6_MIP6)
+ case IPV6_TLV_HAO:
+ if (!ipv6_dest_hao(skb, off))
+ return false;
+ break;
+#endif
+ default:
+ if (!ip6_tlvopt_unknown(skb, off,
+ disallow_unknowns))
+ return false;
break;
}
}
- if (curr->type < 0) {
- if (ip6_tlvopt_unknown(skbp, off) == 0)
- return 0;
- }
- break;
+ padlen = 0;
}
off += optlen;
len -= optlen;
}
+
if (len == 0)
- return 1;
+ return true;
bad:
- kfree_skb(skb);
- return 0;
+ kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
+ return false;
}
/*****************************
Destination options header.
*****************************/
-#ifdef CONFIG_IPV6_MIP6
-static int ipv6_dest_hao(struct sk_buff **skbp, int optoff)
+#if IS_ENABLED(CONFIG_IPV6_MIP6)
+static bool ipv6_dest_hao(struct sk_buff *skb, int optoff)
{
- struct sk_buff *skb = *skbp;
struct ipv6_destopt_hao *hao;
struct inet6_skb_parm *opt = IP6CB(skb);
- struct ipv6hdr *ipv6h = (struct ipv6hdr *)skb->nh.raw;
- struct in6_addr tmp_addr;
+ struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+ SKB_DR(reason);
int ret;
if (opt->dsthao) {
- LIMIT_NETDEBUG(KERN_DEBUG "hao duplicated\n");
+ net_dbg_ratelimited("hao duplicated\n");
goto discard;
}
opt->dsthao = opt->dst1;
opt->dst1 = 0;
- hao = (struct ipv6_destopt_hao *)(skb->nh.raw + optoff);
+ hao = (struct ipv6_destopt_hao *)(skb_network_header(skb) + optoff);
if (hao->length != 16) {
- LIMIT_NETDEBUG(
- KERN_DEBUG "hao invalid option length = %d\n", hao->length);
+ net_dbg_ratelimited("hao invalid option length = %d\n",
+ hao->length);
+ SKB_DR_SET(reason, IP_INHDR);
goto discard;
}
if (!(ipv6_addr_type(&hao->addr) & IPV6_ADDR_UNICAST)) {
- LIMIT_NETDEBUG(
- KERN_DEBUG "hao is not an unicast addr: " NIP6_FMT "\n", NIP6(hao->addr));
+ net_dbg_ratelimited("hao is not an unicast addr: %pI6\n",
+ &hao->addr);
+ SKB_DR_SET(reason, INVALID_PROTO);
goto discard;
}
ret = xfrm6_input_addr(skb, (xfrm_address_t *)&ipv6h->daddr,
(xfrm_address_t *)&hao->addr, IPPROTO_DSTOPTS);
- if (unlikely(ret < 0))
+ if (unlikely(ret < 0)) {
+ SKB_DR_SET(reason, XFRM_POLICY);
goto discard;
+ }
if (skb_cloned(skb)) {
- struct sk_buff *skb2 = skb_copy(skb, GFP_ATOMIC);
- struct inet6_skb_parm *opt2;
-
- if (skb2 == NULL)
+ if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
goto discard;
- opt2 = IP6CB(skb2);
- memcpy(opt2, opt, sizeof(*opt2));
-
- kfree_skb(skb);
-
/* update all variable using below by copied skbuff */
- *skbp = skb = skb2;
- hao = (struct ipv6_destopt_hao *)(skb2->nh.raw + optoff);
- ipv6h = (struct ipv6hdr *)skb2->nh.raw;
+ hao = (struct ipv6_destopt_hao *)(skb_network_header(skb) +
+ optoff);
+ ipv6h = ipv6_hdr(skb);
}
if (skb->ip_summed == CHECKSUM_COMPLETE)
skb->ip_summed = CHECKSUM_NONE;
- ipv6_addr_copy(&tmp_addr, &ipv6h->saddr);
- ipv6_addr_copy(&ipv6h->saddr, &hao->addr);
- ipv6_addr_copy(&hao->addr, &tmp_addr);
+ swap(ipv6h->saddr, hao->addr);
- if (skb->tstamp.off_sec == 0)
+ if (skb->tstamp == 0)
__net_timestamp(skb);
- return 1;
+ return true;
discard:
- kfree_skb(skb);
- return 0;
+ kfree_skb_reason(skb, reason);
+ return false;
}
#endif
-static struct tlvtype_proc tlvprocdestopt_lst[] = {
-#ifdef CONFIG_IPV6_MIP6
- {
- .type = IPV6_TLV_HAO,
- .func = ipv6_dest_hao,
- },
-#endif
- {-1, NULL}
-};
-
-static int ipv6_destopt_rcv(struct sk_buff **skbp)
+static int ipv6_destopt_rcv(struct sk_buff *skb)
{
- struct sk_buff *skb = *skbp;
+ struct inet6_dev *idev = __in6_dev_get(skb->dev);
struct inet6_skb_parm *opt = IP6CB(skb);
-#ifdef CONFIG_IPV6_MIP6
+#if IS_ENABLED(CONFIG_IPV6_MIP6)
__u16 dstbuf;
#endif
- struct dst_entry *dst;
-
- if (!pskb_may_pull(skb, (skb->h.raw-skb->data)+8) ||
- !pskb_may_pull(skb, (skb->h.raw-skb->data)+((skb->h.raw[1]+1)<<3))) {
- IP6_INC_STATS_BH(ip6_dst_idev(skb->dst),
- IPSTATS_MIB_INHDRERRORS);
+ struct dst_entry *dst = skb_dst(skb);
+ struct net *net = dev_net(skb->dev);
+ int extlen;
+
+ if (!pskb_may_pull(skb, skb_transport_offset(skb) + 8) ||
+ !pskb_may_pull(skb, (skb_transport_offset(skb) +
+ ((skb_transport_header(skb)[1] + 1) << 3)))) {
+ __IP6_INC_STATS(dev_net(dst_dev(dst)), idev,
+ IPSTATS_MIB_INHDRERRORS);
+fail_and_free:
kfree_skb(skb);
return -1;
}
- opt->lastopt = skb->h.raw - skb->nh.raw;
- opt->dst1 = skb->h.raw - skb->nh.raw;
-#ifdef CONFIG_IPV6_MIP6
+ extlen = (skb_transport_header(skb)[1] + 1) << 3;
+ if (extlen > net->ipv6.sysctl.max_dst_opts_len)
+ goto fail_and_free;
+
+ opt->lastopt = opt->dst1 = skb_network_header_len(skb);
+#if IS_ENABLED(CONFIG_IPV6_MIP6)
dstbuf = opt->dst1;
#endif
- dst = dst_clone(skb->dst);
- if (ip6_parse_tlv(tlvprocdestopt_lst, skbp)) {
- dst_release(dst);
- skb = *skbp;
- skb->h.raw += ((skb->h.raw[1]+1)<<3);
+ if (ip6_parse_tlv(false, skb, net->ipv6.sysctl.max_dst_opts_cnt)) {
+ skb->transport_header += extlen;
opt = IP6CB(skb);
-#ifdef CONFIG_IPV6_MIP6
+#if IS_ENABLED(CONFIG_IPV6_MIP6)
opt->nhoff = dstbuf;
#else
opt->nhoff = opt->dst1;
@@ -314,89 +333,372 @@ static int ipv6_destopt_rcv(struct sk_buff **skbp)
return 1;
}
- IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
- dst_release(dst);
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
return -1;
}
-static struct inet6_protocol destopt_protocol = {
- .handler = ipv6_destopt_rcv,
- .flags = INET6_PROTO_NOPOLICY | INET6_PROTO_GSO_EXTHDR,
-};
-
-void __init ipv6_destopt_init(void)
+static void seg6_update_csum(struct sk_buff *skb)
{
- if (inet6_add_protocol(&destopt_protocol, IPPROTO_DSTOPTS) < 0)
- printk(KERN_ERR "ipv6_destopt_init: Could not register protocol\n");
-}
+ struct ipv6_sr_hdr *hdr;
+ struct in6_addr *addr;
+ __be32 from, to;
-/********************************
- NONE header. No data in packet.
- ********************************/
+ /* srh is at transport offset and seg_left is already decremented
+ * but daddr is not yet updated with next segment
+ */
-static int ipv6_nodata_rcv(struct sk_buff **skbp)
-{
- struct sk_buff *skb = *skbp;
+ hdr = (struct ipv6_sr_hdr *)skb_transport_header(skb);
+ addr = hdr->segments + hdr->segments_left;
- kfree_skb(skb);
- return 0;
+ hdr->segments_left++;
+ from = *(__be32 *)hdr;
+
+ hdr->segments_left--;
+ to = *(__be32 *)hdr;
+
+ /* update skb csum with diff resulting from seg_left decrement */
+
+ update_csum_diff4(skb, from, to);
+
+ /* compute csum diff between current and next segment and update */
+
+ update_csum_diff16(skb, (__be32 *)(&ipv6_hdr(skb)->daddr),
+ (__be32 *)addr);
}
-static struct inet6_protocol nodata_protocol = {
- .handler = ipv6_nodata_rcv,
- .flags = INET6_PROTO_NOPOLICY,
-};
+static int ipv6_srh_rcv(struct sk_buff *skb)
+{
+ struct inet6_skb_parm *opt = IP6CB(skb);
+ struct net *net = dev_net(skb->dev);
+ struct ipv6_sr_hdr *hdr;
+ struct inet6_dev *idev;
+ struct in6_addr *addr;
+ int accept_seg6;
+
+ hdr = (struct ipv6_sr_hdr *)skb_transport_header(skb);
+
+ idev = __in6_dev_get(skb->dev);
-void __init ipv6_nodata_init(void)
+ accept_seg6 = min(READ_ONCE(net->ipv6.devconf_all->seg6_enabled),
+ READ_ONCE(idev->cnf.seg6_enabled));
+
+ if (!accept_seg6) {
+ kfree_skb(skb);
+ return -1;
+ }
+
+#ifdef CONFIG_IPV6_SEG6_HMAC
+ if (!seg6_hmac_validate_skb(skb)) {
+ kfree_skb(skb);
+ return -1;
+ }
+#endif
+
+looped_back:
+ if (hdr->segments_left == 0) {
+ if (hdr->nexthdr == NEXTHDR_IPV6 || hdr->nexthdr == NEXTHDR_IPV4) {
+ int offset = (hdr->hdrlen + 1) << 3;
+
+ skb_postpull_rcsum(skb, skb_network_header(skb),
+ skb_network_header_len(skb));
+ skb_pull(skb, offset);
+ skb_postpull_rcsum(skb, skb_transport_header(skb),
+ offset);
+
+ skb_reset_network_header(skb);
+ skb_reset_transport_header(skb);
+ skb->encapsulation = 0;
+ if (hdr->nexthdr == NEXTHDR_IPV4)
+ skb->protocol = htons(ETH_P_IP);
+ __skb_tunnel_rx(skb, skb->dev, net);
+
+ netif_rx(skb);
+ return -1;
+ }
+
+ opt->srcrt = skb_network_header_len(skb);
+ opt->lastopt = opt->srcrt;
+ skb->transport_header += (hdr->hdrlen + 1) << 3;
+ opt->nhoff = (&hdr->nexthdr) - skb_network_header(skb);
+
+ return 1;
+ }
+
+ if (hdr->segments_left >= (hdr->hdrlen >> 1)) {
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
+ icmpv6_param_prob(skb, ICMPV6_HDR_FIELD,
+ ((&hdr->segments_left) -
+ skb_network_header(skb)));
+ return -1;
+ }
+
+ if (skb_cloned(skb)) {
+ if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) {
+ __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
+ IPSTATS_MIB_OUTDISCARDS);
+ kfree_skb(skb);
+ return -1;
+ }
+
+ hdr = (struct ipv6_sr_hdr *)skb_transport_header(skb);
+ }
+
+ hdr->segments_left--;
+ addr = hdr->segments + hdr->segments_left;
+
+ skb_push(skb, sizeof(struct ipv6hdr));
+
+ if (skb->ip_summed == CHECKSUM_COMPLETE)
+ seg6_update_csum(skb);
+
+ ipv6_hdr(skb)->daddr = *addr;
+
+ ip6_route_input(skb);
+
+ if (skb_dst(skb)->error) {
+ dst_input(skb);
+ return -1;
+ }
+
+ if (skb_dst_dev(skb)->flags & IFF_LOOPBACK) {
+ if (ipv6_hdr(skb)->hop_limit <= 1) {
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
+ icmpv6_send(skb, ICMPV6_TIME_EXCEED,
+ ICMPV6_EXC_HOPLIMIT, 0);
+ kfree_skb(skb);
+ return -1;
+ }
+ ipv6_hdr(skb)->hop_limit--;
+
+ skb_pull(skb, sizeof(struct ipv6hdr));
+ goto looped_back;
+ }
+
+ dst_input(skb);
+
+ return -1;
+}
+
+static int ipv6_rpl_srh_rcv(struct sk_buff *skb)
{
- if (inet6_add_protocol(&nodata_protocol, IPPROTO_NONE) < 0)
- printk(KERN_ERR "ipv6_nodata_init: Could not register protocol\n");
+ struct ipv6_rpl_sr_hdr *hdr, *ohdr, *chdr;
+ struct inet6_skb_parm *opt = IP6CB(skb);
+ struct net *net = dev_net(skb->dev);
+ struct inet6_dev *idev;
+ struct ipv6hdr *oldhdr;
+ unsigned char *buf;
+ int accept_rpl_seg;
+ int i, err;
+ u64 n = 0;
+ u32 r;
+
+ idev = __in6_dev_get(skb->dev);
+
+ accept_rpl_seg = min(READ_ONCE(net->ipv6.devconf_all->rpl_seg_enabled),
+ READ_ONCE(idev->cnf.rpl_seg_enabled));
+ if (!accept_rpl_seg) {
+ kfree_skb(skb);
+ return -1;
+ }
+
+looped_back:
+ hdr = (struct ipv6_rpl_sr_hdr *)skb_transport_header(skb);
+
+ if (hdr->segments_left == 0) {
+ if (hdr->nexthdr == NEXTHDR_IPV6) {
+ int offset = (hdr->hdrlen + 1) << 3;
+
+ skb_postpull_rcsum(skb, skb_network_header(skb),
+ skb_network_header_len(skb));
+ skb_pull(skb, offset);
+ skb_postpull_rcsum(skb, skb_transport_header(skb),
+ offset);
+
+ skb_reset_network_header(skb);
+ skb_reset_transport_header(skb);
+ skb->encapsulation = 0;
+
+ __skb_tunnel_rx(skb, skb->dev, net);
+
+ netif_rx(skb);
+ return -1;
+ }
+
+ opt->srcrt = skb_network_header_len(skb);
+ opt->lastopt = opt->srcrt;
+ skb->transport_header += (hdr->hdrlen + 1) << 3;
+ opt->nhoff = (&hdr->nexthdr) - skb_network_header(skb);
+
+ return 1;
+ }
+
+ n = (hdr->hdrlen << 3) - hdr->pad - (16 - hdr->cmpre);
+ r = do_div(n, (16 - hdr->cmpri));
+ /* checks if calculation was without remainder and n fits into
+ * unsigned char which is segments_left field. Should not be
+ * higher than that.
+ */
+ if (r || (n + 1) > 255) {
+ kfree_skb(skb);
+ return -1;
+ }
+
+ if (hdr->segments_left > n + 1) {
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
+ icmpv6_param_prob(skb, ICMPV6_HDR_FIELD,
+ ((&hdr->segments_left) -
+ skb_network_header(skb)));
+ return -1;
+ }
+
+ hdr->segments_left--;
+ i = n - hdr->segments_left;
+
+ buf = kcalloc(struct_size(hdr, segments.addr, n + 2), 2, GFP_ATOMIC);
+ if (unlikely(!buf)) {
+ kfree_skb(skb);
+ return -1;
+ }
+
+ ohdr = (struct ipv6_rpl_sr_hdr *)buf;
+ ipv6_rpl_srh_decompress(ohdr, hdr, &ipv6_hdr(skb)->daddr, n);
+ chdr = (struct ipv6_rpl_sr_hdr *)(buf + ((ohdr->hdrlen + 1) << 3));
+
+ if (ipv6_addr_is_multicast(&ohdr->rpl_segaddr[i])) {
+ kfree_skb(skb);
+ kfree(buf);
+ return -1;
+ }
+
+ err = ipv6_chk_rpl_srh_loop(net, ohdr->rpl_segaddr, n + 1);
+ if (err) {
+ icmpv6_send(skb, ICMPV6_PARAMPROB, 0, 0);
+ kfree_skb(skb);
+ kfree(buf);
+ return -1;
+ }
+
+ swap(ipv6_hdr(skb)->daddr, ohdr->rpl_segaddr[i]);
+
+ ipv6_rpl_srh_compress(chdr, ohdr, &ipv6_hdr(skb)->daddr, n);
+
+ oldhdr = ipv6_hdr(skb);
+
+ skb_pull(skb, ((hdr->hdrlen + 1) << 3));
+ skb_postpull_rcsum(skb, oldhdr,
+ sizeof(struct ipv6hdr) + ((hdr->hdrlen + 1) << 3));
+ if (unlikely(!hdr->segments_left)) {
+ if (pskb_expand_head(skb, sizeof(struct ipv6hdr) + ((chdr->hdrlen + 1) << 3), 0,
+ GFP_ATOMIC)) {
+ __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_OUTDISCARDS);
+ kfree_skb(skb);
+ kfree(buf);
+ return -1;
+ }
+
+ oldhdr = ipv6_hdr(skb);
+ }
+ skb_push(skb, ((chdr->hdrlen + 1) << 3) + sizeof(struct ipv6hdr));
+ skb_reset_network_header(skb);
+ skb_mac_header_rebuild(skb);
+ skb_set_transport_header(skb, sizeof(struct ipv6hdr));
+
+ memmove(ipv6_hdr(skb), oldhdr, sizeof(struct ipv6hdr));
+ memcpy(skb_transport_header(skb), chdr, (chdr->hdrlen + 1) << 3);
+
+ ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
+ skb_postpush_rcsum(skb, ipv6_hdr(skb),
+ sizeof(struct ipv6hdr) + ((chdr->hdrlen + 1) << 3));
+
+ kfree(buf);
+
+ ip6_route_input(skb);
+
+ if (skb_dst(skb)->error) {
+ dst_input(skb);
+ return -1;
+ }
+
+ if (skb_dst_dev(skb)->flags & IFF_LOOPBACK) {
+ if (ipv6_hdr(skb)->hop_limit <= 1) {
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
+ icmpv6_send(skb, ICMPV6_TIME_EXCEED,
+ ICMPV6_EXC_HOPLIMIT, 0);
+ kfree_skb(skb);
+ return -1;
+ }
+ ipv6_hdr(skb)->hop_limit--;
+
+ skb_pull(skb, sizeof(struct ipv6hdr));
+ goto looped_back;
+ }
+
+ dst_input(skb);
+
+ return -1;
}
/********************************
Routing header.
********************************/
-static int ipv6_rthdr_rcv(struct sk_buff **skbp)
+/* called with rcu_read_lock() */
+static int ipv6_rthdr_rcv(struct sk_buff *skb)
{
- struct sk_buff *skb = *skbp;
+ struct inet6_dev *idev = __in6_dev_get(skb->dev);
struct inet6_skb_parm *opt = IP6CB(skb);
struct in6_addr *addr = NULL;
- struct in6_addr daddr;
int n, i;
-
struct ipv6_rt_hdr *hdr;
struct rt0_hdr *rthdr;
+ struct net *net = dev_net(skb->dev);
+ int accept_source_route;
+
+ accept_source_route = READ_ONCE(net->ipv6.devconf_all->accept_source_route);
- if (!pskb_may_pull(skb, (skb->h.raw-skb->data)+8) ||
- !pskb_may_pull(skb, (skb->h.raw-skb->data)+((skb->h.raw[1]+1)<<3))) {
- IP6_INC_STATS_BH(ip6_dst_idev(skb->dst),
- IPSTATS_MIB_INHDRERRORS);
+ if (idev)
+ accept_source_route = min(accept_source_route,
+ READ_ONCE(idev->cnf.accept_source_route));
+
+ if (!pskb_may_pull(skb, skb_transport_offset(skb) + 8) ||
+ !pskb_may_pull(skb, (skb_transport_offset(skb) +
+ ((skb_transport_header(skb)[1] + 1) << 3)))) {
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
kfree_skb(skb);
return -1;
}
- hdr = (struct ipv6_rt_hdr *) skb->h.raw;
+ hdr = (struct ipv6_rt_hdr *)skb_transport_header(skb);
- if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr) ||
+ if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr) ||
skb->pkt_type != PACKET_HOST) {
- IP6_INC_STATS_BH(ip6_dst_idev(skb->dst),
- IPSTATS_MIB_INADDRERRORS);
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
kfree_skb(skb);
return -1;
}
+ switch (hdr->type) {
+ case IPV6_SRCRT_TYPE_4:
+ /* segment routing */
+ return ipv6_srh_rcv(skb);
+ case IPV6_SRCRT_TYPE_3:
+ /* rpl segment routing */
+ return ipv6_rpl_srh_rcv(skb);
+ default:
+ break;
+ }
+
looped_back:
if (hdr->segments_left == 0) {
switch (hdr->type) {
-#ifdef CONFIG_IPV6_MIP6
+#if IS_ENABLED(CONFIG_IPV6_MIP6)
case IPV6_SRCRT_TYPE_2:
/* Silently discard type 2 header unless it was
* processed by own
*/
if (!addr) {
- IP6_INC_STATS_BH(ip6_dst_idev(skb->dst),
- IPSTATS_MIB_INADDRERRORS);
+ __IP6_INC_STATS(net, idev,
+ IPSTATS_MIB_INADDRERRORS);
kfree_skb(skb);
return -1;
}
@@ -406,40 +708,29 @@ looped_back:
break;
}
- opt->lastopt = skb->h.raw - skb->nh.raw;
- opt->srcrt = skb->h.raw - skb->nh.raw;
- skb->h.raw += (hdr->hdrlen + 1) << 3;
+ opt->lastopt = opt->srcrt = skb_network_header_len(skb);
+ skb->transport_header += (hdr->hdrlen + 1) << 3;
opt->dst0 = opt->dst1;
opt->dst1 = 0;
- opt->nhoff = (&hdr->nexthdr) - skb->nh.raw;
+ opt->nhoff = (&hdr->nexthdr) - skb_network_header(skb);
return 1;
}
switch (hdr->type) {
- case IPV6_SRCRT_TYPE_0:
- if (hdr->hdrlen & 0x01) {
- IP6_INC_STATS_BH(ip6_dst_idev(skb->dst),
- IPSTATS_MIB_INHDRERRORS);
- icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, (&hdr->hdrlen) - skb->nh.raw);
- return -1;
- }
- break;
-#ifdef CONFIG_IPV6_MIP6
+#if IS_ENABLED(CONFIG_IPV6_MIP6)
case IPV6_SRCRT_TYPE_2:
+ if (accept_source_route < 0)
+ goto unknown_rh;
/* Silently discard invalid RTH type 2 */
if (hdr->hdrlen != 2 || hdr->segments_left != 1) {
- IP6_INC_STATS_BH(ip6_dst_idev(skb->dst),
- IPSTATS_MIB_INHDRERRORS);
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
kfree_skb(skb);
return -1;
}
break;
#endif
default:
- IP6_INC_STATS_BH(ip6_dst_idev(skb->dst),
- IPSTATS_MIB_INHDRERRORS);
- icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, (&hdr->type) - skb->nh.raw);
- return -1;
+ goto unknown_rh;
}
/*
@@ -450,9 +741,10 @@ looped_back:
n = hdr->hdrlen >> 1;
if (hdr->segments_left > n) {
- IP6_INC_STATS_BH(ip6_dst_idev(skb->dst),
- IPSTATS_MIB_INHDRERRORS);
- icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, (&hdr->segments_left) - skb->nh.raw);
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
+ icmpv6_param_prob(skb, ICMPV6_HDR_FIELD,
+ ((&hdr->segments_left) -
+ skb_network_header(skb)));
return -1;
}
@@ -460,18 +752,14 @@ looped_back:
Do not damage packets queued somewhere.
*/
if (skb_cloned(skb)) {
- struct sk_buff *skb2 = skb_copy(skb, GFP_ATOMIC);
/* the copy is a forwarded packet */
- if (skb2 == NULL) {
- IP6_INC_STATS_BH(ip6_dst_idev(skb->dst),
- IPSTATS_MIB_OUTDISCARDS);
+ if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) {
+ __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
+ IPSTATS_MIB_OUTDISCARDS);
kfree_skb(skb);
return -1;
}
- kfree_skb(skb);
- *skbp = skb = skb2;
- opt = IP6CB(skb2);
- hdr = (struct ipv6_rt_hdr *) skb2->h.raw;
+ hdr = (struct ipv6_rt_hdr *)skb_transport_header(skb);
}
if (skb->ip_summed == CHECKSUM_COMPLETE)
@@ -484,19 +772,17 @@ looped_back:
addr += i - 1;
switch (hdr->type) {
-#ifdef CONFIG_IPV6_MIP6
+#if IS_ENABLED(CONFIG_IPV6_MIP6)
case IPV6_SRCRT_TYPE_2:
if (xfrm6_input_addr(skb, (xfrm_address_t *)addr,
- (xfrm_address_t *)&skb->nh.ipv6h->saddr,
+ (xfrm_address_t *)&ipv6_hdr(skb)->saddr,
IPPROTO_ROUTING) < 0) {
- IP6_INC_STATS_BH(ip6_dst_idev(skb->dst),
- IPSTATS_MIB_INADDRERRORS);
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
kfree_skb(skb);
return -1;
}
- if (!ipv6_chk_home_addr(addr)) {
- IP6_INC_STATS_BH(ip6_dst_idev(skb->dst),
- IPSTATS_MIB_INADDRERRORS);
+ if (!ipv6_chk_home_addr(skb_dst_dev_net(skb), addr)) {
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
kfree_skb(skb);
return -1;
}
@@ -507,214 +793,268 @@ looped_back:
}
if (ipv6_addr_is_multicast(addr)) {
- IP6_INC_STATS_BH(ip6_dst_idev(skb->dst),
- IPSTATS_MIB_INADDRERRORS);
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
kfree_skb(skb);
return -1;
}
- ipv6_addr_copy(&daddr, addr);
- ipv6_addr_copy(addr, &skb->nh.ipv6h->daddr);
- ipv6_addr_copy(&skb->nh.ipv6h->daddr, &daddr);
+ swap(*addr, ipv6_hdr(skb)->daddr);
- dst_release(xchg(&skb->dst, NULL));
ip6_route_input(skb);
- if (skb->dst->error) {
- skb_push(skb, skb->data - skb->nh.raw);
+ if (skb_dst(skb)->error) {
+ skb_push(skb, -skb_network_offset(skb));
dst_input(skb);
return -1;
}
- if (skb->dst->dev->flags&IFF_LOOPBACK) {
- if (skb->nh.ipv6h->hop_limit <= 1) {
- IP6_INC_STATS_BH(ip6_dst_idev(skb->dst),
- IPSTATS_MIB_INHDRERRORS);
+ if (skb_dst_dev(skb)->flags & IFF_LOOPBACK) {
+ if (ipv6_hdr(skb)->hop_limit <= 1) {
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
- 0, skb->dev);
+ 0);
kfree_skb(skb);
return -1;
}
- skb->nh.ipv6h->hop_limit--;
+ ipv6_hdr(skb)->hop_limit--;
goto looped_back;
}
- skb_push(skb, skb->data - skb->nh.raw);
+ skb_push(skb, -skb_network_offset(skb));
dst_input(skb);
return -1;
+
+unknown_rh:
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
+ icmpv6_param_prob(skb, ICMPV6_HDR_FIELD,
+ (&hdr->type) - skb_network_header(skb));
+ return -1;
}
-static struct inet6_protocol rthdr_protocol = {
+static const struct inet6_protocol rthdr_protocol = {
.handler = ipv6_rthdr_rcv,
- .flags = INET6_PROTO_NOPOLICY | INET6_PROTO_GSO_EXTHDR,
+ .flags = INET6_PROTO_NOPOLICY,
};
-void __init ipv6_rthdr_init(void)
-{
- if (inet6_add_protocol(&rthdr_protocol, IPPROTO_ROUTING) < 0)
- printk(KERN_ERR "ipv6_rthdr_init: Could not register protocol\n");
+static const struct inet6_protocol destopt_protocol = {
+ .handler = ipv6_destopt_rcv,
+ .flags = INET6_PROTO_NOPOLICY,
};
-/*
- This function inverts received rthdr.
- NOTE: specs allow to make it automatically only if
- packet authenticated.
-
- I will not discuss it here (though, I am really pissed off at
- this stupid requirement making rthdr idea useless)
-
- Actually, it creates severe problems for us.
- Embryonic requests has no associated sockets,
- so that user have no control over it and
- cannot not only to set reply options, but
- even to know, that someone wants to connect
- without success. :-(
-
- For now we need to test the engine, so that I created
- temporary (or permanent) backdoor.
- If listening socket set IPV6_RTHDR to 2, then we invert header.
- --ANK (980729)
- */
+static const struct inet6_protocol nodata_protocol = {
+ .handler = dst_discard,
+ .flags = INET6_PROTO_NOPOLICY,
+};
-struct ipv6_txoptions *
-ipv6_invert_rthdr(struct sock *sk, struct ipv6_rt_hdr *hdr)
+int __init ipv6_exthdrs_init(void)
{
- /* Received rthdr:
+ int ret;
- [ H1 -> H2 -> ... H_prev ] daddr=ME
+ ret = inet6_add_protocol(&rthdr_protocol, IPPROTO_ROUTING);
+ if (ret)
+ goto out;
- Inverted result:
- [ H_prev -> ... -> H1 ] daddr =sender
+ ret = inet6_add_protocol(&destopt_protocol, IPPROTO_DSTOPTS);
+ if (ret)
+ goto out_rthdr;
- Note, that IP output engine will rewrite this rthdr
- by rotating it left by one addr.
- */
+ ret = inet6_add_protocol(&nodata_protocol, IPPROTO_NONE);
+ if (ret)
+ goto out_destopt;
- int n, i;
- struct rt0_hdr *rthdr = (struct rt0_hdr*)hdr;
- struct rt0_hdr *irthdr;
- struct ipv6_txoptions *opt;
- int hdrlen = ipv6_optlen(hdr);
-
- if (hdr->segments_left ||
- hdr->type != IPV6_SRCRT_TYPE_0 ||
- hdr->hdrlen & 0x01)
- return NULL;
+out:
+ return ret;
+out_destopt:
+ inet6_del_protocol(&destopt_protocol, IPPROTO_DSTOPTS);
+out_rthdr:
+ inet6_del_protocol(&rthdr_protocol, IPPROTO_ROUTING);
+ goto out;
+};
- n = hdr->hdrlen >> 1;
- opt = sock_kmalloc(sk, sizeof(*opt) + hdrlen, GFP_ATOMIC);
- if (opt == NULL)
- return NULL;
- memset(opt, 0, sizeof(*opt));
- opt->tot_len = sizeof(*opt) + hdrlen;
- opt->srcrt = (void*)(opt+1);
- opt->opt_nflen = hdrlen;
-
- memcpy(opt->srcrt, hdr, sizeof(*hdr));
- irthdr = (struct rt0_hdr*)opt->srcrt;
- irthdr->reserved = 0;
- opt->srcrt->segments_left = n;
- for (i=0; i<n; i++)
- memcpy(irthdr->addr+i, rthdr->addr+(n-1-i), 16);
- return opt;
+void ipv6_exthdrs_exit(void)
+{
+ inet6_del_protocol(&nodata_protocol, IPPROTO_NONE);
+ inet6_del_protocol(&destopt_protocol, IPPROTO_DSTOPTS);
+ inet6_del_protocol(&rthdr_protocol, IPPROTO_ROUTING);
}
-EXPORT_SYMBOL_GPL(ipv6_invert_rthdr);
-
/**********************************
Hop-by-hop options.
**********************************/
/* Router Alert as of RFC 2711 */
-static int ipv6_hop_ra(struct sk_buff **skbp, int optoff)
+static bool ipv6_hop_ra(struct sk_buff *skb, int optoff)
{
- struct sk_buff *skb = *skbp;
+ const unsigned char *nh = skb_network_header(skb);
- if (skb->nh.raw[optoff+1] == 2) {
- IP6CB(skb)->ra = optoff;
- return 1;
+ if (nh[optoff + 1] == 2) {
+ IP6CB(skb)->flags |= IP6SKB_ROUTERALERT;
+ memcpy(&IP6CB(skb)->ra, nh + optoff + 2, sizeof(IP6CB(skb)->ra));
+ return true;
}
- LIMIT_NETDEBUG(KERN_DEBUG "ipv6_hop_ra: wrong RA length %d\n",
- skb->nh.raw[optoff+1]);
- kfree_skb(skb);
- return 0;
+ net_dbg_ratelimited("ipv6_hop_ra: wrong RA length %d\n",
+ nh[optoff + 1]);
+ kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
+ return false;
+}
+
+/* IOAM */
+
+static bool ipv6_hop_ioam(struct sk_buff *skb, int optoff)
+{
+ struct ioam6_trace_hdr *trace;
+ struct ioam6_namespace *ns;
+ struct ioam6_hdr *hdr;
+
+ /* Bad alignment (must be 4n-aligned) */
+ if (optoff & 3)
+ goto drop;
+
+ /* Ignore if IOAM is not enabled on ingress */
+ if (!READ_ONCE(__in6_dev_get(skb->dev)->cnf.ioam6_enabled))
+ goto ignore;
+
+ /* Truncated Option header */
+ hdr = (struct ioam6_hdr *)(skb_network_header(skb) + optoff);
+ if (hdr->opt_len < 2)
+ goto drop;
+
+ switch (hdr->type) {
+ case IOAM6_TYPE_PREALLOC:
+ /* Truncated Pre-allocated Trace header */
+ if (hdr->opt_len < 2 + sizeof(*trace))
+ goto drop;
+
+ /* Malformed Pre-allocated Trace header */
+ trace = (struct ioam6_trace_hdr *)((u8 *)hdr + sizeof(*hdr));
+ if (hdr->opt_len < 2 + sizeof(*trace) + trace->remlen * 4)
+ goto drop;
+
+ /* Ignore if the IOAM namespace is unknown */
+ ns = ioam6_namespace(dev_net(skb->dev), trace->namespace_id);
+ if (!ns)
+ goto ignore;
+
+ if (!skb_valid_dst(skb))
+ ip6_route_input(skb);
+
+ /* About to mangle packet header */
+ if (skb_ensure_writable(skb, optoff + 2 + hdr->opt_len))
+ goto drop;
+
+ /* Trace pointer may have changed */
+ trace = (struct ioam6_trace_hdr *)(skb_network_header(skb)
+ + optoff + sizeof(*hdr));
+
+ ioam6_fill_trace_data(skb, ns, trace, true);
+
+ ioam6_event(IOAM6_EVENT_TRACE, dev_net(skb->dev),
+ GFP_ATOMIC, (void *)trace, hdr->opt_len - 2);
+ break;
+ default:
+ break;
+ }
+
+ignore:
+ return true;
+
+drop:
+ kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
+ return false;
}
/* Jumbo payload */
-static int ipv6_hop_jumbo(struct sk_buff **skbp, int optoff)
+static bool ipv6_hop_jumbo(struct sk_buff *skb, int optoff)
{
- struct sk_buff *skb = *skbp;
+ const unsigned char *nh = skb_network_header(skb);
+ SKB_DR(reason);
u32 pkt_len;
- if (skb->nh.raw[optoff+1] != 4 || (optoff&3) != 2) {
- LIMIT_NETDEBUG(KERN_DEBUG "ipv6_hop_jumbo: wrong jumbo opt length/alignment %d\n",
- skb->nh.raw[optoff+1]);
- IP6_INC_STATS_BH(ip6_dst_idev(skb->dst),
- IPSTATS_MIB_INHDRERRORS);
+ if (nh[optoff + 1] != 4 || (optoff & 3) != 2) {
+ net_dbg_ratelimited("ipv6_hop_jumbo: wrong jumbo opt length/alignment %d\n",
+ nh[optoff+1]);
+ SKB_DR_SET(reason, IP_INHDR);
goto drop;
}
- pkt_len = ntohl(*(__be32*)(skb->nh.raw+optoff+2));
+ pkt_len = ntohl(*(__be32 *)(nh + optoff + 2));
if (pkt_len <= IPV6_MAXPLEN) {
- IP6_INC_STATS_BH(ip6_dst_idev(skb->dst), IPSTATS_MIB_INHDRERRORS);
- icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, optoff+2);
- return 0;
+ icmpv6_param_prob_reason(skb, ICMPV6_HDR_FIELD, optoff + 2,
+ SKB_DROP_REASON_IP_INHDR);
+ return false;
}
- if (skb->nh.ipv6h->payload_len) {
- IP6_INC_STATS_BH(ip6_dst_idev(skb->dst), IPSTATS_MIB_INHDRERRORS);
- icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, optoff);
- return 0;
+ if (ipv6_hdr(skb)->payload_len) {
+ icmpv6_param_prob_reason(skb, ICMPV6_HDR_FIELD, optoff,
+ SKB_DROP_REASON_IP_INHDR);
+ return false;
}
if (pkt_len > skb->len - sizeof(struct ipv6hdr)) {
- IP6_INC_STATS_BH(ip6_dst_idev(skb->dst), IPSTATS_MIB_INTRUNCATEDPKTS);
+ SKB_DR_SET(reason, PKT_TOO_SMALL);
goto drop;
}
if (pskb_trim_rcsum(skb, pkt_len + sizeof(struct ipv6hdr)))
goto drop;
- return 1;
+ IP6CB(skb)->flags |= IP6SKB_JUMBOGRAM;
+ return true;
drop:
- kfree_skb(skb);
- return 0;
+ kfree_skb_reason(skb, reason);
+ return false;
}
-static struct tlvtype_proc tlvprochopopt_lst[] = {
- {
- .type = IPV6_TLV_ROUTERALERT,
- .func = ipv6_hop_ra,
- },
- {
- .type = IPV6_TLV_JUMBO,
- .func = ipv6_hop_jumbo,
- },
- { -1, }
-};
+/* CALIPSO RFC 5570 */
-int ipv6_parse_hopopts(struct sk_buff **skbp)
+static bool ipv6_hop_calipso(struct sk_buff *skb, int optoff)
+{
+ const unsigned char *nh = skb_network_header(skb);
+
+ if (nh[optoff + 1] < 8)
+ goto drop;
+
+ if (nh[optoff + 6] * 4 + 8 > nh[optoff + 1])
+ goto drop;
+
+ if (!calipso_validate(skb, nh + optoff))
+ goto drop;
+
+ return true;
+
+drop:
+ kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
+ return false;
+}
+
+int ipv6_parse_hopopts(struct sk_buff *skb)
{
- struct sk_buff *skb = *skbp;
struct inet6_skb_parm *opt = IP6CB(skb);
+ struct net *net = dev_net(skb->dev);
+ int extlen;
/*
- * skb->nh.raw is equal to skb->data, and
- * skb->h.raw - skb->nh.raw is always equal to
+ * skb_network_header(skb) is equal to skb->data, and
+ * skb_network_header_len(skb) is always equal to
* sizeof(struct ipv6hdr) by definition of
* hop-by-hop options.
*/
if (!pskb_may_pull(skb, sizeof(struct ipv6hdr) + 8) ||
- !pskb_may_pull(skb, sizeof(struct ipv6hdr) + ((skb->h.raw[1] + 1) << 3))) {
+ !pskb_may_pull(skb, (sizeof(struct ipv6hdr) +
+ ((skb_transport_header(skb)[1] + 1) << 3)))) {
+fail_and_free:
kfree_skb(skb);
return -1;
}
- opt->hop = sizeof(struct ipv6hdr);
- if (ip6_parse_tlv(tlvprochopopt_lst, skbp)) {
- skb = *skbp;
- skb->h.raw += (skb->h.raw[1]+1)<<3;
+ extlen = (skb_transport_header(skb)[1] + 1) << 3;
+ if (extlen > net->ipv6.sysctl.max_hbh_opts_len)
+ goto fail_and_free;
+
+ opt->flags |= IP6SKB_HOPBYHOP;
+ if (ip6_parse_tlv(true, skb, net->ipv6.sysctl.max_hbh_opts_cnt)) {
+ skb->transport_header += extlen;
opt = IP6CB(skb);
opt->nhoff = sizeof(struct ipv6hdr);
return 1;
@@ -732,16 +1072,16 @@ int ipv6_parse_hopopts(struct sk_buff **skbp)
* for headers.
*/
-static void ipv6_push_rthdr(struct sk_buff *skb, u8 *proto,
- struct ipv6_rt_hdr *opt,
- struct in6_addr **addr_p)
+static void ipv6_push_rthdr0(struct sk_buff *skb, u8 *proto,
+ struct ipv6_rt_hdr *opt,
+ struct in6_addr **addr_p, struct in6_addr *saddr)
{
struct rt0_hdr *phdr, *ihdr;
int hops;
ihdr = (struct rt0_hdr *) opt;
-
- phdr = (struct rt0_hdr *) skb_push(skb, (ihdr->rt_hdr.hdrlen + 1) << 3);
+
+ phdr = skb_push(skb, (ihdr->rt_hdr.hdrlen + 1) << 3);
memcpy(phdr, ihdr, sizeof(struct rt0_hdr));
hops = ihdr->rt_hdr.hdrlen >> 1;
@@ -750,16 +1090,83 @@ static void ipv6_push_rthdr(struct sk_buff *skb, u8 *proto,
memcpy(phdr->addr, ihdr->addr + 1,
(hops - 1) * sizeof(struct in6_addr));
- ipv6_addr_copy(phdr->addr + (hops - 1), *addr_p);
+ phdr->addr[hops - 1] = **addr_p;
*addr_p = ihdr->addr;
phdr->rt_hdr.nexthdr = *proto;
*proto = NEXTHDR_ROUTING;
}
+static void ipv6_push_rthdr4(struct sk_buff *skb, u8 *proto,
+ struct ipv6_rt_hdr *opt,
+ struct in6_addr **addr_p, struct in6_addr *saddr)
+{
+ struct ipv6_sr_hdr *sr_phdr, *sr_ihdr;
+ int plen, hops;
+
+ sr_ihdr = (struct ipv6_sr_hdr *)opt;
+ plen = (sr_ihdr->hdrlen + 1) << 3;
+
+ sr_phdr = skb_push(skb, plen);
+ memcpy(sr_phdr, sr_ihdr, sizeof(struct ipv6_sr_hdr));
+
+ hops = sr_ihdr->first_segment + 1;
+ memcpy(sr_phdr->segments + 1, sr_ihdr->segments + 1,
+ (hops - 1) * sizeof(struct in6_addr));
+
+ sr_phdr->segments[0] = **addr_p;
+ *addr_p = &sr_ihdr->segments[sr_ihdr->segments_left];
+
+ if (sr_ihdr->hdrlen > hops * 2) {
+ int tlvs_offset, tlvs_length;
+
+ tlvs_offset = (1 + hops * 2) << 3;
+ tlvs_length = (sr_ihdr->hdrlen - hops * 2) << 3;
+ memcpy((char *)sr_phdr + tlvs_offset,
+ (char *)sr_ihdr + tlvs_offset, tlvs_length);
+ }
+
+#ifdef CONFIG_IPV6_SEG6_HMAC
+ if (sr_has_hmac(sr_phdr)) {
+ struct net *net = NULL;
+
+ if (skb->dev)
+ net = dev_net(skb->dev);
+ else if (skb->sk)
+ net = sock_net(skb->sk);
+
+ WARN_ON(!net);
+
+ if (net)
+ seg6_push_hmac(net, saddr, sr_phdr);
+ }
+#endif
+
+ sr_phdr->nexthdr = *proto;
+ *proto = NEXTHDR_ROUTING;
+}
+
+static void ipv6_push_rthdr(struct sk_buff *skb, u8 *proto,
+ struct ipv6_rt_hdr *opt,
+ struct in6_addr **addr_p, struct in6_addr *saddr)
+{
+ switch (opt->type) {
+ case IPV6_SRCRT_TYPE_0:
+ case IPV6_SRCRT_STRICT:
+ case IPV6_SRCRT_TYPE_2:
+ ipv6_push_rthdr0(skb, proto, opt, addr_p, saddr);
+ break;
+ case IPV6_SRCRT_TYPE_4:
+ ipv6_push_rthdr4(skb, proto, opt, addr_p, saddr);
+ break;
+ default:
+ break;
+ }
+}
+
static void ipv6_push_exthdr(struct sk_buff *skb, u8 *proto, u8 type, struct ipv6_opt_hdr *opt)
{
- struct ipv6_opt_hdr *h = (struct ipv6_opt_hdr *)skb_push(skb, ipv6_optlen(opt));
+ struct ipv6_opt_hdr *h = skb_push(skb, ipv6_optlen(opt));
memcpy(h, opt, ipv6_optlen(opt));
h->nexthdr = *proto;
@@ -768,10 +1175,10 @@ static void ipv6_push_exthdr(struct sk_buff *skb, u8 *proto, u8 type, struct ipv
void ipv6_push_nfrag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt,
u8 *proto,
- struct in6_addr **daddr)
+ struct in6_addr **daddr, struct in6_addr *saddr)
{
if (opt->srcrt) {
- ipv6_push_rthdr(skb, proto, opt->srcrt, daddr);
+ ipv6_push_rthdr(skb, proto, opt->srcrt, daddr, saddr);
/*
* IPV6_RTHDRDSTOPTS is ignored
* unless IPV6_RTHDR is set (RFC3542).
@@ -788,64 +1195,74 @@ void ipv6_push_frag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt, u8 *pr
if (opt->dst1opt)
ipv6_push_exthdr(skb, proto, NEXTHDR_DEST, opt->dst1opt);
}
+EXPORT_SYMBOL(ipv6_push_frag_opts);
struct ipv6_txoptions *
ipv6_dup_options(struct sock *sk, struct ipv6_txoptions *opt)
{
struct ipv6_txoptions *opt2;
- opt2 = sock_kmalloc(sk, opt->tot_len, GFP_ATOMIC);
+ opt2 = sock_kmemdup(sk, opt, opt->tot_len, GFP_ATOMIC);
if (opt2) {
- long dif = (char*)opt2 - (char*)opt;
- memcpy(opt2, opt, opt->tot_len);
+ long dif = (char *)opt2 - (char *)opt;
if (opt2->hopopt)
- *((char**)&opt2->hopopt) += dif;
+ *((char **)&opt2->hopopt) += dif;
if (opt2->dst0opt)
- *((char**)&opt2->dst0opt) += dif;
+ *((char **)&opt2->dst0opt) += dif;
if (opt2->dst1opt)
- *((char**)&opt2->dst1opt) += dif;
+ *((char **)&opt2->dst1opt) += dif;
if (opt2->srcrt)
- *((char**)&opt2->srcrt) += dif;
+ *((char **)&opt2->srcrt) += dif;
+ refcount_set(&opt2->refcnt, 1);
}
return opt2;
}
-
EXPORT_SYMBOL_GPL(ipv6_dup_options);
-static int ipv6_renew_option(void *ohdr,
- struct ipv6_opt_hdr __user *newopt, int newoptlen,
- int inherit,
- struct ipv6_opt_hdr **hdr,
- char **p)
+static void ipv6_renew_option(int renewtype,
+ struct ipv6_opt_hdr **dest,
+ struct ipv6_opt_hdr *old,
+ struct ipv6_opt_hdr *new,
+ int newtype, char **p)
{
- if (inherit) {
- if (ohdr) {
- memcpy(*p, ohdr, ipv6_optlen((struct ipv6_opt_hdr *)ohdr));
- *hdr = (struct ipv6_opt_hdr *)*p;
- *p += CMSG_ALIGN(ipv6_optlen(*(struct ipv6_opt_hdr **)hdr));
- }
- } else {
- if (newopt) {
- if (copy_from_user(*p, newopt, newoptlen))
- return -EFAULT;
- *hdr = (struct ipv6_opt_hdr *)*p;
- if (ipv6_optlen(*(struct ipv6_opt_hdr **)hdr) > newoptlen)
- return -EINVAL;
- *p += CMSG_ALIGN(newoptlen);
- }
- }
- return 0;
+ struct ipv6_opt_hdr *src;
+
+ src = (renewtype == newtype ? new : old);
+ if (!src)
+ return;
+
+ memcpy(*p, src, ipv6_optlen(src));
+ *dest = (struct ipv6_opt_hdr *)*p;
+ *p += CMSG_ALIGN(ipv6_optlen(*dest));
}
+/**
+ * ipv6_renew_options - replace a specific ext hdr with a new one.
+ *
+ * @sk: sock from which to allocate memory
+ * @opt: original options
+ * @newtype: option type to replace in @opt
+ * @newopt: new option of type @newtype to replace (user-mem)
+ *
+ * Returns a new set of options which is a copy of @opt with the
+ * option type @newtype replaced with @newopt.
+ *
+ * @opt may be NULL, in which case a new set of options is returned
+ * containing just @newopt.
+ *
+ * @newopt may be NULL, in which case the specified option type is
+ * not copied into the new set of options.
+ *
+ * The new set of options is allocated from the socket option memory
+ * buffer of @sk.
+ */
struct ipv6_txoptions *
ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt,
- int newtype,
- struct ipv6_opt_hdr __user *newopt, int newoptlen)
+ int newtype, struct ipv6_opt_hdr *newopt)
{
int tot_len = 0;
char *p;
struct ipv6_txoptions *opt2;
- int err;
if (opt) {
if (newtype != IPV6_HOPOPTS && opt->hopopt)
@@ -858,8 +1275,8 @@ ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt,
tot_len += CMSG_ALIGN(ipv6_optlen(opt->dst1opt));
}
- if (newopt && newoptlen)
- tot_len += CMSG_ALIGN(newoptlen);
+ if (newopt)
+ tot_len += CMSG_ALIGN(ipv6_optlen(newopt));
if (!tot_len)
return NULL;
@@ -870,33 +1287,23 @@ ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt,
return ERR_PTR(-ENOBUFS);
memset(opt2, 0, tot_len);
-
+ refcount_set(&opt2->refcnt, 1);
opt2->tot_len = tot_len;
p = (char *)(opt2 + 1);
- err = ipv6_renew_option(opt ? opt->hopopt : NULL, newopt, newoptlen,
- newtype != IPV6_HOPOPTS,
- &opt2->hopopt, &p);
- if (err)
- goto out;
-
- err = ipv6_renew_option(opt ? opt->dst0opt : NULL, newopt, newoptlen,
- newtype != IPV6_RTHDRDSTOPTS,
- &opt2->dst0opt, &p);
- if (err)
- goto out;
-
- err = ipv6_renew_option(opt ? opt->srcrt : NULL, newopt, newoptlen,
- newtype != IPV6_RTHDR,
- (struct ipv6_opt_hdr **)&opt2->srcrt, &p);
- if (err)
- goto out;
-
- err = ipv6_renew_option(opt ? opt->dst1opt : NULL, newopt, newoptlen,
- newtype != IPV6_DSTOPTS,
- &opt2->dst1opt, &p);
- if (err)
- goto out;
+ ipv6_renew_option(IPV6_HOPOPTS, &opt2->hopopt,
+ (opt ? opt->hopopt : NULL),
+ newopt, newtype, &p);
+ ipv6_renew_option(IPV6_RTHDRDSTOPTS, &opt2->dst0opt,
+ (opt ? opt->dst0opt : NULL),
+ newopt, newtype, &p);
+ ipv6_renew_option(IPV6_RTHDR,
+ (struct ipv6_opt_hdr **)&opt2->srcrt,
+ (opt ? (struct ipv6_opt_hdr *)opt->srcrt : NULL),
+ newopt, newtype, &p);
+ ipv6_renew_option(IPV6_DSTOPTS, &opt2->dst1opt,
+ (opt ? opt->dst1opt : NULL),
+ newopt, newtype, &p);
opt2->opt_nflen = (opt2->hopopt ? ipv6_optlen(opt2->hopopt) : 0) +
(opt2->dst0opt ? ipv6_optlen(opt2->dst0opt) : 0) +
@@ -904,19 +1311,16 @@ ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt,
opt2->opt_flen = (opt2->dst1opt ? ipv6_optlen(opt2->dst1opt) : 0);
return opt2;
-out:
- sock_kfree_s(sk, opt2, opt2->tot_len);
- return ERR_PTR(err);
}
-struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space,
- struct ipv6_txoptions *opt)
+struct ipv6_txoptions *__ipv6_fixup_options(struct ipv6_txoptions *opt_space,
+ struct ipv6_txoptions *opt)
{
/*
* ignore the dest before srcrt unless srcrt is being included.
* --yoshfuji
*/
- if (opt && opt->dst0opt && !opt->srcrt) {
+ if (opt->dst0opt && !opt->srcrt) {
if (opt_space != opt) {
memcpy(opt_space, opt, sizeof(*opt_space));
opt = opt_space;
@@ -927,4 +1331,45 @@ struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space,
return opt;
}
+EXPORT_SYMBOL_GPL(__ipv6_fixup_options);
+
+/**
+ * fl6_update_dst - update flowi destination address with info given
+ * by srcrt option, if any.
+ *
+ * @fl6: flowi6 for which daddr is to be updated
+ * @opt: struct ipv6_txoptions in which to look for srcrt opt
+ * @orig: copy of original daddr address if modified
+ *
+ * Returns NULL if no txoptions or no srcrt, otherwise returns orig
+ * and initial value of fl6->daddr set in orig
+ */
+struct in6_addr *fl6_update_dst(struct flowi6 *fl6,
+ const struct ipv6_txoptions *opt,
+ struct in6_addr *orig)
+{
+ if (!opt || !opt->srcrt)
+ return NULL;
+
+ *orig = fl6->daddr;
+
+ switch (opt->srcrt->type) {
+ case IPV6_SRCRT_TYPE_0:
+ case IPV6_SRCRT_STRICT:
+ case IPV6_SRCRT_TYPE_2:
+ fl6->daddr = *((struct rt0_hdr *)opt->srcrt)->addr;
+ break;
+ case IPV6_SRCRT_TYPE_4:
+ {
+ struct ipv6_sr_hdr *srh = (struct ipv6_sr_hdr *)opt->srcrt;
+ fl6->daddr = srh->segments[srh->segments_left];
+ break;
+ }
+ default:
+ return NULL;
+ }
+
+ return orig;
+}
+EXPORT_SYMBOL_GPL(fl6_update_dst);
diff --git a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c
index 21cbbbddaf4d..49e31e4ae7b7 100644
--- a/net/ipv6/exthdrs_core.c
+++ b/net/ipv6/exthdrs_core.c
@@ -1,42 +1,45 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* IPv6 library code, needed by static components when full IPv6 support is
* not configured or static.
*/
+#include <linux/export.h>
#include <net/ipv6.h>
-/*
+/*
* find out if nexthdr is a well-known extension header or a protocol
*/
-int ipv6_ext_hdr(u8 nexthdr)
+bool ipv6_ext_hdr(u8 nexthdr)
{
- /*
+ /*
* find out if nexthdr is an extension header or a protocol
*/
- return ( (nexthdr == NEXTHDR_HOP) ||
+ return (nexthdr == NEXTHDR_HOP) ||
(nexthdr == NEXTHDR_ROUTING) ||
(nexthdr == NEXTHDR_FRAGMENT) ||
(nexthdr == NEXTHDR_AUTH) ||
(nexthdr == NEXTHDR_NONE) ||
- (nexthdr == NEXTHDR_DEST) );
+ (nexthdr == NEXTHDR_DEST);
}
+EXPORT_SYMBOL(ipv6_ext_hdr);
/*
* Skip any extension headers. This is used by the ICMP module.
*
* Note that strictly speaking this conflicts with RFC 2460 4.0:
- * ...The contents and semantics of each extension header determine whether
+ * ...The contents and semantics of each extension header determine whether
* or not to proceed to the next header. Therefore, extension headers must
* be processed strictly in the order they appear in the packet; a
* receiver must not, for example, scan through a packet looking for a
* particular kind of extension header and process that header prior to
* processing all preceding ones.
- *
+ *
* We do exactly this. This is a protocol bug. We can't decide after a
- * seeing an unknown discard-with-error flavour TLV option if it's a
+ * seeing an unknown discard-with-error flavour TLV option if it's a
* ICMP error message or not (errors should never be send in reply to
* ICMP error messages).
- *
+ *
* But I see no other way to do this. This might need to be reexamined
* when Linux implements ESP (and maybe AUTH) headers.
* --AK
@@ -56,6 +59,9 @@ int ipv6_ext_hdr(u8 nexthdr)
* it returns NULL.
* - First fragment header is skipped, not-first ones
* are considered as unparsable.
+ * - Reports the offset field of the final fragment header so it is
+ * possible to tell whether this is a first fragment, later fragment,
+ * or not fragmented.
* - ESP is unparsable for now and considered like
* normal payload protocol.
* - Note also special handling of AUTH header. Thanks to IPsec wizards.
@@ -63,10 +69,13 @@ int ipv6_ext_hdr(u8 nexthdr)
* --ANK (980726)
*/
-int ipv6_skip_exthdr(const struct sk_buff *skb, int start, u8 *nexthdrp)
+int ipv6_skip_exthdr(const struct sk_buff *skb, int start, u8 *nexthdrp,
+ __be16 *frag_offp)
{
u8 nexthdr = *nexthdrp;
+ *frag_offp = 0;
+
while (ipv6_ext_hdr(nexthdr)) {
struct ipv6_opt_hdr _hdr, *hp;
int hdrlen;
@@ -74,7 +83,7 @@ int ipv6_skip_exthdr(const struct sk_buff *skb, int start, u8 *nexthdrp)
if (nexthdr == NEXTHDR_NONE)
return -1;
hp = skb_header_pointer(skb, start, sizeof(_hdr), &_hdr);
- if (hp == NULL)
+ if (!hp)
return -1;
if (nexthdr == NEXTHDR_FRAGMENT) {
__be16 _frag_off, *fp;
@@ -83,16 +92,17 @@ int ipv6_skip_exthdr(const struct sk_buff *skb, int start, u8 *nexthdrp)
frag_off),
sizeof(_frag_off),
&_frag_off);
- if (fp == NULL)
+ if (!fp)
return -1;
- if (ntohs(*fp) & ~0x7)
+ *frag_offp = *fp;
+ if (ntohs(*frag_offp) & ~0x7)
break;
hdrlen = 8;
} else if (nexthdr == NEXTHDR_AUTH)
- hdrlen = (hp->hdrlen+2)<<2;
+ hdrlen = ipv6_authlen(hp);
else
- hdrlen = ipv6_optlen(hp);
+ hdrlen = ipv6_optlen(hp);
nexthdr = hp->nexthdr;
start += hdrlen;
@@ -101,6 +111,172 @@ int ipv6_skip_exthdr(const struct sk_buff *skb, int start, u8 *nexthdrp)
*nexthdrp = nexthdr;
return start;
}
-
-EXPORT_SYMBOL(ipv6_ext_hdr);
EXPORT_SYMBOL(ipv6_skip_exthdr);
+
+int ipv6_find_tlv(const struct sk_buff *skb, int offset, int type)
+{
+ const unsigned char *nh = skb_network_header(skb);
+ int packet_len = skb_tail_pointer(skb) - skb_network_header(skb);
+ struct ipv6_opt_hdr *hdr;
+ int len;
+
+ if (offset + 2 > packet_len)
+ goto bad;
+ hdr = (struct ipv6_opt_hdr *)(nh + offset);
+ len = ((hdr->hdrlen + 1) << 3);
+
+ if (offset + len > packet_len)
+ goto bad;
+
+ offset += 2;
+ len -= 2;
+
+ while (len > 0) {
+ int opttype = nh[offset];
+ int optlen;
+
+ if (opttype == type)
+ return offset;
+
+ switch (opttype) {
+ case IPV6_TLV_PAD1:
+ optlen = 1;
+ break;
+ default:
+ if (len < 2)
+ goto bad;
+ optlen = nh[offset + 1] + 2;
+ if (optlen > len)
+ goto bad;
+ break;
+ }
+ offset += optlen;
+ len -= optlen;
+ }
+ /* not_found */
+ bad:
+ return -1;
+}
+EXPORT_SYMBOL_GPL(ipv6_find_tlv);
+
+/*
+ * find the offset to specified header or the protocol number of last header
+ * if target < 0. "last header" is transport protocol header, ESP, or
+ * "No next header".
+ *
+ * Note that *offset is used as input/output parameter, and if it is not zero,
+ * then it must be a valid offset to an inner IPv6 header. This can be used
+ * to explore inner IPv6 header, eg. ICMPv6 error messages.
+ *
+ * If target header is found, its offset is set in *offset and return protocol
+ * number. Otherwise, return -1.
+ *
+ * If the first fragment doesn't contain the final protocol header or
+ * NEXTHDR_NONE it is considered invalid.
+ *
+ * Note that non-1st fragment is special case that "the protocol number
+ * of last header" is "next header" field in Fragment header. In this case,
+ * *offset is meaningless and fragment offset is stored in *fragoff if fragoff
+ * isn't NULL.
+ *
+ * if flags is not NULL and it's a fragment, then the frag flag
+ * IP6_FH_F_FRAG will be set. If it's an AH header, the
+ * IP6_FH_F_AUTH flag is set and target < 0, then this function will
+ * stop at the AH header. If IP6_FH_F_SKIP_RH flag was passed, then this
+ * function will skip all those routing headers, where segements_left was 0.
+ */
+int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset,
+ int target, unsigned short *fragoff, int *flags)
+{
+ unsigned int start = skb_network_offset(skb) + sizeof(struct ipv6hdr);
+ u8 nexthdr = ipv6_hdr(skb)->nexthdr;
+ bool found;
+
+ if (fragoff)
+ *fragoff = 0;
+
+ if (*offset) {
+ struct ipv6hdr _ip6, *ip6;
+
+ ip6 = skb_header_pointer(skb, *offset, sizeof(_ip6), &_ip6);
+ if (!ip6 || (ip6->version != 6))
+ return -EBADMSG;
+ start = *offset + sizeof(struct ipv6hdr);
+ nexthdr = ip6->nexthdr;
+ }
+
+ do {
+ struct ipv6_opt_hdr _hdr, *hp;
+ unsigned int hdrlen;
+ found = (nexthdr == target);
+
+ if ((!ipv6_ext_hdr(nexthdr)) || nexthdr == NEXTHDR_NONE) {
+ if (target < 0 || found)
+ break;
+ return -ENOENT;
+ }
+
+ hp = skb_header_pointer(skb, start, sizeof(_hdr), &_hdr);
+ if (!hp)
+ return -EBADMSG;
+
+ if (nexthdr == NEXTHDR_ROUTING) {
+ struct ipv6_rt_hdr _rh, *rh;
+
+ rh = skb_header_pointer(skb, start, sizeof(_rh),
+ &_rh);
+ if (!rh)
+ return -EBADMSG;
+
+ if (flags && (*flags & IP6_FH_F_SKIP_RH) &&
+ rh->segments_left == 0)
+ found = false;
+ }
+
+ if (nexthdr == NEXTHDR_FRAGMENT) {
+ unsigned short _frag_off;
+ __be16 *fp;
+
+ if (flags) /* Indicate that this is a fragment */
+ *flags |= IP6_FH_F_FRAG;
+ fp = skb_header_pointer(skb,
+ start+offsetof(struct frag_hdr,
+ frag_off),
+ sizeof(_frag_off),
+ &_frag_off);
+ if (!fp)
+ return -EBADMSG;
+
+ _frag_off = ntohs(*fp) & ~0x7;
+ if (_frag_off) {
+ if (target < 0 &&
+ ((!ipv6_ext_hdr(hp->nexthdr)) ||
+ hp->nexthdr == NEXTHDR_NONE)) {
+ if (fragoff)
+ *fragoff = _frag_off;
+ return hp->nexthdr;
+ }
+ if (!found)
+ return -ENOENT;
+ if (fragoff)
+ *fragoff = _frag_off;
+ break;
+ }
+ hdrlen = 8;
+ } else if (nexthdr == NEXTHDR_AUTH) {
+ if (flags && (*flags & IP6_FH_F_AUTH) && (target < 0))
+ break;
+ hdrlen = ipv6_authlen(hp);
+ } else
+ hdrlen = ipv6_optlen(hp);
+
+ if (!found) {
+ nexthdr = hp->nexthdr;
+ start += hdrlen;
+ }
+ } while (!found);
+
+ *offset = start;
+ return nexthdr;
+}
+EXPORT_SYMBOL(ipv6_find_hdr);
diff --git a/net/ipv6/exthdrs_offload.c b/net/ipv6/exthdrs_offload.c
new file mode 100644
index 000000000000..4c00398f4dca
--- /dev/null
+++ b/net/ipv6/exthdrs_offload.c
@@ -0,0 +1,48 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * IPV6 GSO/GRO offload support
+ * Linux INET6 implementation
+ *
+ * IPV6 Extension Header GSO/GRO support
+ */
+#include <net/protocol.h>
+#include "ip6_offload.h"
+
+static const struct net_offload rthdr_offload = {
+ .flags = INET6_PROTO_GSO_EXTHDR,
+};
+
+static const struct net_offload dstopt_offload = {
+ .flags = INET6_PROTO_GSO_EXTHDR,
+};
+
+static const struct net_offload hbh_offload = {
+ .flags = INET6_PROTO_GSO_EXTHDR,
+};
+
+int __init ipv6_exthdrs_offload_init(void)
+{
+ int ret;
+
+ ret = inet6_add_offload(&rthdr_offload, IPPROTO_ROUTING);
+ if (ret)
+ goto out;
+
+ ret = inet6_add_offload(&dstopt_offload, IPPROTO_DSTOPTS);
+ if (ret)
+ goto out_rt;
+
+ ret = inet6_add_offload(&hbh_offload, IPPROTO_HOPOPTS);
+ if (ret)
+ goto out_dstopts;
+
+out:
+ return ret;
+
+out_dstopts:
+ inet6_del_offload(&dstopt_offload, IPPROTO_DSTOPTS);
+
+out_rt:
+ inet6_del_offload(&rthdr_offload, IPPROTO_ROUTING);
+ goto out;
+}
diff --git a/net/ipv6/fib6_notifier.c b/net/ipv6/fib6_notifier.c
new file mode 100644
index 000000000000..949b72610df7
--- /dev/null
+++ b/net/ipv6/fib6_notifier.c
@@ -0,0 +1,64 @@
+#include <linux/notifier.h>
+#include <linux/socket.h>
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <net/net_namespace.h>
+#include <net/fib_notifier.h>
+#include <net/netns/ipv6.h>
+#include <net/ip6_fib.h>
+
+int call_fib6_notifier(struct notifier_block *nb,
+ enum fib_event_type event_type,
+ struct fib_notifier_info *info)
+{
+ info->family = AF_INET6;
+ return call_fib_notifier(nb, event_type, info);
+}
+
+int call_fib6_notifiers(struct net *net, enum fib_event_type event_type,
+ struct fib_notifier_info *info)
+{
+ info->family = AF_INET6;
+ return call_fib_notifiers(net, event_type, info);
+}
+
+static unsigned int fib6_seq_read(const struct net *net)
+{
+ return fib6_tables_seq_read(net) + fib6_rules_seq_read(net);
+}
+
+static int fib6_dump(struct net *net, struct notifier_block *nb,
+ struct netlink_ext_ack *extack)
+{
+ int err;
+
+ err = fib6_rules_dump(net, nb, extack);
+ if (err)
+ return err;
+
+ return fib6_tables_dump(net, nb, extack);
+}
+
+static const struct fib_notifier_ops fib6_notifier_ops_template = {
+ .family = AF_INET6,
+ .fib_seq_read = fib6_seq_read,
+ .fib_dump = fib6_dump,
+ .owner = THIS_MODULE,
+};
+
+int __net_init fib6_notifier_init(struct net *net)
+{
+ struct fib_notifier_ops *ops;
+
+ ops = fib_notifier_ops_register(&fib6_notifier_ops_template, net);
+ if (IS_ERR(ops))
+ return PTR_ERR(ops);
+ net->ipv6.notifier_ops = ops;
+
+ return 0;
+}
+
+void __net_exit fib6_notifier_exit(struct net *net)
+{
+ fib_notifier_ops_unregister(net->ipv6.notifier_ops);
+}
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index 0862809ffcf7..fd5f7112a51f 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -1,177 +1,493 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* net/ipv6/fib6_rules.c IPv6 Routing Policy Rules
*
* Copyright (C)2003-2006 Helsinki University of Technology
* Copyright (C)2003-2006 USAGI/WIDE Project
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation, version 2.
- *
* Authors
* Thomas Graf <tgraf@suug.ch>
* Ville Nuorvala <vnuorval@tcs.hut.fi>
*/
#include <linux/netdevice.h>
+#include <linux/notifier.h>
+#include <linux/export.h>
+#include <linux/indirect_call_wrapper.h>
#include <net/fib_rules.h>
+#include <net/inet_dscp.h>
#include <net/ipv6.h>
+#include <net/addrconf.h>
#include <net/ip6_route.h>
#include <net/netlink.h>
-struct fib6_rule
-{
+struct fib6_rule {
struct fib_rule common;
struct rt6key src;
struct rt6key dst;
- u8 tclass;
+ __be32 flowlabel;
+ __be32 flowlabel_mask;
+ dscp_t dscp;
+ dscp_t dscp_mask;
+ u8 dscp_full:1; /* DSCP or TOS selector */
};
-static struct fib_rules_ops fib6_rules_ops;
+static bool fib6_rule_matchall(const struct fib_rule *rule)
+{
+ struct fib6_rule *r = container_of(rule, struct fib6_rule, common);
-static struct fib6_rule main_rule = {
- .common = {
- .refcnt = ATOMIC_INIT(2),
- .pref = 0x7FFE,
- .action = FR_ACT_TO_TBL,
- .table = RT6_TABLE_MAIN,
- },
-};
+ if (r->dst.plen || r->src.plen || r->dscp || r->flowlabel_mask)
+ return false;
+ return fib_rule_matchall(rule);
+}
-static struct fib6_rule local_rule = {
- .common = {
- .refcnt = ATOMIC_INIT(2),
- .pref = 0,
- .action = FR_ACT_TO_TBL,
- .table = RT6_TABLE_LOCAL,
- .flags = FIB_RULE_PERMANENT,
- },
-};
+bool fib6_rule_default(const struct fib_rule *rule)
+{
+ if (!fib6_rule_matchall(rule) || rule->action != FR_ACT_TO_TBL ||
+ rule->l3mdev)
+ return false;
+ if (rule->table != RT6_TABLE_LOCAL && rule->table != RT6_TABLE_MAIN)
+ return false;
+ return true;
+}
+EXPORT_SYMBOL_GPL(fib6_rule_default);
+
+int fib6_rules_dump(struct net *net, struct notifier_block *nb,
+ struct netlink_ext_ack *extack)
+{
+ return fib_rules_dump(net, nb, AF_INET6, extack);
+}
+
+unsigned int fib6_rules_seq_read(const struct net *net)
+{
+ return fib_rules_seq_read(net, AF_INET6);
+}
+
+/* called with rcu lock held; no reference taken on fib6_info */
+int fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
+ struct fib6_result *res, int flags)
+{
+ int err;
+
+ if (net->ipv6.fib6_has_custom_rules) {
+ struct fib_lookup_arg arg = {
+ .lookup_ptr = fib6_table_lookup,
+ .lookup_data = &oif,
+ .result = res,
+ .flags = FIB_LOOKUP_NOREF,
+ };
+
+ l3mdev_update_flow(net, flowi6_to_flowi(fl6));
+
+ err = fib_rules_lookup(net->ipv6.fib6_rules_ops,
+ flowi6_to_flowi(fl6), flags, &arg);
+ } else {
+ err = fib6_table_lookup(net, net->ipv6.fib6_local_tbl, oif,
+ fl6, res, flags);
+ if (err || res->f6i == net->ipv6.fib6_null_entry)
+ err = fib6_table_lookup(net, net->ipv6.fib6_main_tbl,
+ oif, fl6, res, flags);
+ }
-static LIST_HEAD(fib6_rules);
+ return err;
+}
-struct dst_entry *fib6_rule_lookup(struct flowi *fl, int flags,
- pol_lookup_t lookup)
+struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
+ const struct sk_buff *skb,
+ int flags, pol_lookup_t lookup)
{
- struct fib_lookup_arg arg = {
- .lookup_ptr = lookup,
- };
+ if (net->ipv6.fib6_has_custom_rules) {
+ struct fib6_result res = {};
+ struct fib_lookup_arg arg = {
+ .lookup_ptr = lookup,
+ .lookup_data = skb,
+ .result = &res,
+ .flags = FIB_LOOKUP_NOREF,
+ };
+
+ /* update flow if oif or iif point to device enslaved to l3mdev */
+ l3mdev_update_flow(net, flowi6_to_flowi(fl6));
+
+ fib_rules_lookup(net->ipv6.fib6_rules_ops,
+ flowi6_to_flowi(fl6), flags, &arg);
+
+ if (res.rt6)
+ return &res.rt6->dst;
+ } else {
+ struct rt6_info *rt;
+
+ rt = pol_lookup_func(lookup,
+ net, net->ipv6.fib6_local_tbl, fl6, skb, flags);
+ if (rt != net->ipv6.ip6_null_entry && rt->dst.error != -EAGAIN)
+ return &rt->dst;
+ ip6_rt_put_flags(rt, flags);
+ rt = pol_lookup_func(lookup,
+ net, net->ipv6.fib6_main_tbl, fl6, skb, flags);
+ if (rt->dst.error != -EAGAIN)
+ return &rt->dst;
+ ip6_rt_put_flags(rt, flags);
+ }
+
+ if (!(flags & RT6_LOOKUP_F_DST_NOREF))
+ dst_hold(&net->ipv6.ip6_null_entry->dst);
+ return &net->ipv6.ip6_null_entry->dst;
+}
+
+static int fib6_rule_saddr(struct net *net, struct fib_rule *rule, int flags,
+ struct flowi6 *flp6, const struct net_device *dev)
+{
+ struct fib6_rule *r = (struct fib6_rule *)rule;
+
+ /* If we need to find a source address for this traffic,
+ * we check the result if it meets requirement of the rule.
+ */
+ if ((rule->flags & FIB_RULE_FIND_SADDR) &&
+ r->src.plen && !(flags & RT6_LOOKUP_F_HAS_SADDR)) {
+ struct in6_addr saddr;
+
+ if (ipv6_dev_get_saddr(net, dev, &flp6->daddr,
+ rt6_flags2srcprefs(flags), &saddr))
+ return -EAGAIN;
+
+ if (!ipv6_prefix_equal(&saddr, &r->src.addr, r->src.plen))
+ return -EAGAIN;
+
+ flp6->saddr = saddr;
+ }
+
+ return 0;
+}
+
+static int fib6_rule_action_alt(struct fib_rule *rule, struct flowi *flp,
+ int flags, struct fib_lookup_arg *arg)
+{
+ struct fib6_result *res = arg->result;
+ struct flowi6 *flp6 = &flp->u.ip6;
+ struct net *net = rule->fr_net;
+ struct fib6_table *table;
+ int err, *oif;
+ u32 tb_id;
+
+ switch (rule->action) {
+ case FR_ACT_TO_TBL:
+ break;
+ case FR_ACT_UNREACHABLE:
+ return -ENETUNREACH;
+ case FR_ACT_PROHIBIT:
+ return -EACCES;
+ case FR_ACT_BLACKHOLE:
+ default:
+ return -EINVAL;
+ }
- fib_rules_lookup(&fib6_rules_ops, fl, flags, &arg);
- if (arg.rule)
- fib_rule_put(arg.rule);
+ tb_id = fib_rule_get_table(rule, arg);
+ table = fib6_get_table(net, tb_id);
+ if (!table)
+ return -EAGAIN;
- if (arg.result)
- return arg.result;
+ oif = (int *)arg->lookup_data;
+ err = fib6_table_lookup(net, table, *oif, flp6, res, flags);
+ if (!err && res->f6i != net->ipv6.fib6_null_entry)
+ err = fib6_rule_saddr(net, rule, flags, flp6,
+ res->nh->fib_nh_dev);
+ else
+ err = -EAGAIN;
- dst_hold(&ip6_null_entry.u.dst);
- return &ip6_null_entry.u.dst;
+ return err;
}
-static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
- int flags, struct fib_lookup_arg *arg)
+static int __fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
+ int flags, struct fib_lookup_arg *arg)
{
+ struct fib6_result *res = arg->result;
+ struct flowi6 *flp6 = &flp->u.ip6;
struct rt6_info *rt = NULL;
struct fib6_table *table;
+ struct net *net = rule->fr_net;
pol_lookup_t lookup = arg->lookup_ptr;
+ int err = 0;
+ u32 tb_id;
switch (rule->action) {
case FR_ACT_TO_TBL:
break;
case FR_ACT_UNREACHABLE:
- rt = &ip6_null_entry;
+ err = -ENETUNREACH;
+ rt = net->ipv6.ip6_null_entry;
goto discard_pkt;
default:
case FR_ACT_BLACKHOLE:
- rt = &ip6_blk_hole_entry;
+ err = -EINVAL;
+ rt = net->ipv6.ip6_blk_hole_entry;
goto discard_pkt;
case FR_ACT_PROHIBIT:
- rt = &ip6_prohibit_entry;
+ err = -EACCES;
+ rt = net->ipv6.ip6_prohibit_entry;
goto discard_pkt;
}
- table = fib6_get_table(rule->table);
- if (table)
- rt = lookup(table, flp, flags);
-
- if (rt != &ip6_null_entry)
+ tb_id = fib_rule_get_table(rule, arg);
+ table = fib6_get_table(net, tb_id);
+ if (!table) {
+ err = -EAGAIN;
goto out;
- dst_release(&rt->u.dst);
+ }
+
+ rt = pol_lookup_func(lookup,
+ net, table, flp6, arg->lookup_data, flags);
+ if (rt != net->ipv6.ip6_null_entry) {
+ struct inet6_dev *idev = ip6_dst_idev(&rt->dst);
+
+ if (!idev)
+ goto again;
+ err = fib6_rule_saddr(net, rule, flags, flp6,
+ idev->dev);
+
+ if (err == -EAGAIN)
+ goto again;
+
+ err = rt->dst.error;
+ if (err != -EAGAIN)
+ goto out;
+ }
+again:
+ ip6_rt_put_flags(rt, flags);
+ err = -EAGAIN;
rt = NULL;
goto out;
discard_pkt:
- dst_hold(&rt->u.dst);
+ if (!(flags & RT6_LOOKUP_F_DST_NOREF))
+ dst_hold(&rt->dst);
out:
- arg->result = rt;
- return rt == NULL ? -EAGAIN : 0;
+ res->rt6 = rt;
+ return err;
}
+INDIRECT_CALLABLE_SCOPE int fib6_rule_action(struct fib_rule *rule,
+ struct flowi *flp, int flags,
+ struct fib_lookup_arg *arg)
+{
+ if (arg->lookup_ptr == fib6_table_lookup)
+ return fib6_rule_action_alt(rule, flp, flags, arg);
+
+ return __fib6_rule_action(rule, flp, flags, arg);
+}
-static int fib6_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
+INDIRECT_CALLABLE_SCOPE bool fib6_rule_suppress(struct fib_rule *rule,
+ int flags,
+ struct fib_lookup_arg *arg)
+{
+ struct fib6_result *res = arg->result;
+ struct rt6_info *rt = res->rt6;
+ struct net_device *dev = NULL;
+
+ if (!rt)
+ return false;
+
+ if (rt->rt6i_idev)
+ dev = rt->rt6i_idev->dev;
+
+ /* do not accept result if the route does
+ * not meet the required prefix length
+ */
+ if (rt->rt6i_dst.plen <= rule->suppress_prefixlen)
+ goto suppress_route;
+
+ /* do not accept result if the route uses a device
+ * belonging to a forbidden interface group
+ */
+ if (rule->suppress_ifgroup != -1 && dev && dev->group == rule->suppress_ifgroup)
+ goto suppress_route;
+
+ return false;
+
+suppress_route:
+ ip6_rt_put_flags(rt, flags);
+ return true;
+}
+
+INDIRECT_CALLABLE_SCOPE int fib6_rule_match(struct fib_rule *rule,
+ struct flowi *fl, int flags)
{
struct fib6_rule *r = (struct fib6_rule *) rule;
+ struct flowi6 *fl6 = &fl->u.ip6;
if (r->dst.plen &&
- !ipv6_prefix_equal(&fl->fl6_dst, &r->dst.addr, r->dst.plen))
+ !ipv6_prefix_equal(&fl6->daddr, &r->dst.addr, r->dst.plen))
return 0;
+ /*
+ * If FIB_RULE_FIND_SADDR is set and we do not have a
+ * source address for the traffic, we defer check for
+ * source address.
+ */
if (r->src.plen) {
- if (!(flags & RT6_LOOKUP_F_HAS_SADDR) ||
- !ipv6_prefix_equal(&fl->fl6_src, &r->src.addr, r->src.plen))
+ if (flags & RT6_LOOKUP_F_HAS_SADDR) {
+ if (!ipv6_prefix_equal(&fl6->saddr, &r->src.addr,
+ r->src.plen))
+ return 0;
+ } else if (!(r->common.flags & FIB_RULE_FIND_SADDR))
return 0;
}
- if (r->tclass && r->tclass != ((ntohl(fl->fl6_flowlabel) >> 20) & 0xff))
+ if ((r->dscp ^ ip6_dscp(fl6->flowlabel)) & r->dscp_mask)
+ return 0;
+
+ if ((r->flowlabel ^ flowi6_get_flowlabel(fl6)) & r->flowlabel_mask)
+ return 0;
+
+ if (rule->ip_proto && (rule->ip_proto != fl6->flowi6_proto))
+ return 0;
+
+ if (!fib_rule_port_match(&rule->sport_range, rule->sport_mask,
+ fl6->fl6_sport))
+ return 0;
+
+ if (!fib_rule_port_match(&rule->dport_range, rule->dport_mask,
+ fl6->fl6_dport))
return 0;
return 1;
}
-static struct nla_policy fib6_rule_policy[FRA_MAX+1] __read_mostly = {
- FRA_GENERIC_POLICY,
- [FRA_SRC] = { .len = sizeof(struct in6_addr) },
- [FRA_DST] = { .len = sizeof(struct in6_addr) },
-};
+static int fib6_nl2rule_dscp(const struct nlattr *nla, struct fib6_rule *rule6,
+ struct netlink_ext_ack *extack)
+{
+ if (rule6->dscp) {
+ NL_SET_ERR_MSG(extack, "Cannot specify both TOS and DSCP");
+ return -EINVAL;
+ }
+
+ rule6->dscp = inet_dsfield_to_dscp(nla_get_u8(nla) << 2);
+ rule6->dscp_mask = inet_dsfield_to_dscp(INET_DSCP_MASK);
+ rule6->dscp_full = true;
+
+ return 0;
+}
+
+static int fib6_nl2rule_dscp_mask(const struct nlattr *nla,
+ struct fib6_rule *rule6,
+ struct netlink_ext_ack *extack)
+{
+ dscp_t dscp_mask;
+
+ if (!rule6->dscp_full) {
+ NL_SET_ERR_MSG_ATTR(extack, nla,
+ "Cannot specify DSCP mask without DSCP value");
+ return -EINVAL;
+ }
+
+ dscp_mask = inet_dsfield_to_dscp(nla_get_u8(nla) << 2);
+ if (rule6->dscp & ~dscp_mask) {
+ NL_SET_ERR_MSG_ATTR(extack, nla, "Invalid DSCP mask");
+ return -EINVAL;
+ }
+
+ rule6->dscp_mask = dscp_mask;
+
+ return 0;
+}
+
+static int fib6_nl2rule_flowlabel(struct nlattr **tb, struct fib6_rule *rule6,
+ struct netlink_ext_ack *extack)
+{
+ __be32 flowlabel, flowlabel_mask;
+
+ if (NL_REQ_ATTR_CHECK(extack, NULL, tb, FRA_FLOWLABEL) ||
+ NL_REQ_ATTR_CHECK(extack, NULL, tb, FRA_FLOWLABEL_MASK))
+ return -EINVAL;
+
+ flowlabel = nla_get_be32(tb[FRA_FLOWLABEL]);
+ flowlabel_mask = nla_get_be32(tb[FRA_FLOWLABEL_MASK]);
+
+ if (flowlabel_mask & ~IPV6_FLOWLABEL_MASK) {
+ NL_SET_ERR_MSG_ATTR(extack, tb[FRA_FLOWLABEL_MASK],
+ "Invalid flow label mask");
+ return -EINVAL;
+ }
+
+ if (flowlabel & ~flowlabel_mask) {
+ NL_SET_ERR_MSG(extack, "Flow label and mask do not match");
+ return -EINVAL;
+ }
+
+ rule6->flowlabel = flowlabel;
+ rule6->flowlabel_mask = flowlabel_mask;
+
+ return 0;
+}
static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
- struct nlmsghdr *nlh, struct fib_rule_hdr *frh,
- struct nlattr **tb)
+ struct fib_rule_hdr *frh,
+ struct nlattr **tb,
+ struct netlink_ext_ack *extack)
{
+ struct fib6_rule *rule6 = (struct fib6_rule *)rule;
+ struct net *net = rule->fr_net;
int err = -EINVAL;
- struct fib6_rule *rule6 = (struct fib6_rule *) rule;
- if (frh->src_len > 128 || frh->dst_len > 128)
+ if (!inet_validate_dscp(frh->tos)) {
+ NL_SET_ERR_MSG(extack,
+ "Invalid dsfield (tos): ECN bits must be 0");
+ goto errout;
+ }
+ rule6->dscp = inet_dsfield_to_dscp(frh->tos);
+ rule6->dscp_mask = frh->tos ? inet_dsfield_to_dscp(INET_DSCP_MASK) : 0;
+
+ if (tb[FRA_DSCP] && fib6_nl2rule_dscp(tb[FRA_DSCP], rule6, extack) < 0)
+ goto errout;
+
+ if (tb[FRA_DSCP_MASK] &&
+ fib6_nl2rule_dscp_mask(tb[FRA_DSCP_MASK], rule6, extack) < 0)
goto errout;
- if (rule->action == FR_ACT_TO_TBL) {
- if (rule->table == RT6_TABLE_UNSPEC)
+ if ((tb[FRA_FLOWLABEL] || tb[FRA_FLOWLABEL_MASK]) &&
+ fib6_nl2rule_flowlabel(tb, rule6, extack) < 0)
+ goto errout;
+
+ if (rule->action == FR_ACT_TO_TBL && !rule->l3mdev) {
+ if (rule->table == RT6_TABLE_UNSPEC) {
+ NL_SET_ERR_MSG(extack, "Invalid table");
goto errout;
+ }
- if (fib6_new_table(rule->table) == NULL) {
+ if (fib6_new_table(net, rule->table) == NULL) {
err = -ENOBUFS;
goto errout;
}
}
- if (tb[FRA_SRC])
- nla_memcpy(&rule6->src.addr, tb[FRA_SRC],
- sizeof(struct in6_addr));
+ if (frh->src_len)
+ rule6->src.addr = nla_get_in6_addr(tb[FRA_SRC]);
- if (tb[FRA_DST])
- nla_memcpy(&rule6->dst.addr, tb[FRA_DST],
- sizeof(struct in6_addr));
+ if (frh->dst_len)
+ rule6->dst.addr = nla_get_in6_addr(tb[FRA_DST]);
rule6->src.plen = frh->src_len;
rule6->dst.plen = frh->dst_len;
- rule6->tclass = frh->tos;
+ if (fib_rule_requires_fldissect(rule))
+ net->ipv6.fib6_rules_require_fldissect++;
+
+ net->ipv6.fib6_has_custom_rules = true;
err = 0;
errout:
return err;
}
+static int fib6_rule_delete(struct fib_rule *rule)
+{
+ struct net *net = rule->fr_net;
+
+ if (net->ipv6.fib6_rules_require_fldissect &&
+ fib_rule_requires_fldissect(rule))
+ net->ipv6.fib6_rules_require_fldissect--;
+
+ return 0;
+}
+
static int fib6_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
struct nlattr **tb)
{
@@ -183,14 +499,40 @@ static int fib6_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
if (frh->dst_len && (rule6->dst.plen != frh->dst_len))
return 0;
- if (frh->tos && (rule6->tclass != frh->tos))
+ if (frh->tos &&
+ (rule6->dscp_full ||
+ inet_dscp_to_dsfield(rule6->dscp) != frh->tos))
+ return 0;
+
+ if (tb[FRA_DSCP]) {
+ dscp_t dscp;
+
+ dscp = inet_dsfield_to_dscp(nla_get_u8(tb[FRA_DSCP]) << 2);
+ if (!rule6->dscp_full || rule6->dscp != dscp)
+ return 0;
+ }
+
+ if (tb[FRA_DSCP_MASK]) {
+ dscp_t dscp_mask;
+
+ dscp_mask = inet_dsfield_to_dscp(nla_get_u8(tb[FRA_DSCP_MASK]) << 2);
+ if (!rule6->dscp_full || rule6->dscp_mask != dscp_mask)
+ return 0;
+ }
+
+ if (tb[FRA_FLOWLABEL] &&
+ nla_get_be32(tb[FRA_FLOWLABEL]) != rule6->flowlabel)
+ return 0;
+
+ if (tb[FRA_FLOWLABEL_MASK] &&
+ nla_get_be32(tb[FRA_FLOWLABEL_MASK]) != rule6->flowlabel_mask)
return 0;
- if (tb[FRA_SRC] &&
+ if (frh->src_len &&
nla_memcmp(tb[FRA_SRC], &rule6->src.addr, sizeof(struct in6_addr)))
return 0;
- if (tb[FRA_DST] &&
+ if (frh->dst_len &&
nla_memcmp(tb[FRA_DST], &rule6->dst.addr, sizeof(struct in6_addr)))
return 0;
@@ -198,70 +540,124 @@ static int fib6_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
}
static int fib6_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
- struct nlmsghdr *nlh, struct fib_rule_hdr *frh)
+ struct fib_rule_hdr *frh)
{
struct fib6_rule *rule6 = (struct fib6_rule *) rule;
- frh->family = AF_INET6;
frh->dst_len = rule6->dst.plen;
frh->src_len = rule6->src.plen;
- frh->tos = rule6->tclass;
- if (rule6->dst.plen)
- NLA_PUT(skb, FRA_DST, sizeof(struct in6_addr),
- &rule6->dst.addr);
+ if (rule6->dscp_full) {
+ frh->tos = 0;
+ if (nla_put_u8(skb, FRA_DSCP,
+ inet_dscp_to_dsfield(rule6->dscp) >> 2) ||
+ nla_put_u8(skb, FRA_DSCP_MASK,
+ inet_dscp_to_dsfield(rule6->dscp_mask) >> 2))
+ goto nla_put_failure;
+ } else {
+ frh->tos = inet_dscp_to_dsfield(rule6->dscp);
+ }
- if (rule6->src.plen)
- NLA_PUT(skb, FRA_SRC, sizeof(struct in6_addr),
- &rule6->src.addr);
+ if (rule6->flowlabel_mask &&
+ (nla_put_be32(skb, FRA_FLOWLABEL, rule6->flowlabel) ||
+ nla_put_be32(skb, FRA_FLOWLABEL_MASK, rule6->flowlabel_mask)))
+ goto nla_put_failure;
+ if ((rule6->dst.plen &&
+ nla_put_in6_addr(skb, FRA_DST, &rule6->dst.addr)) ||
+ (rule6->src.plen &&
+ nla_put_in6_addr(skb, FRA_SRC, &rule6->src.addr)))
+ goto nla_put_failure;
return 0;
nla_put_failure:
return -ENOBUFS;
}
-int fib6_rules_dump(struct sk_buff *skb, struct netlink_callback *cb)
+static size_t fib6_rule_nlmsg_payload(struct fib_rule *rule)
{
- return fib_rules_dump(skb, cb, AF_INET6);
+ return nla_total_size(16) /* dst */
+ + nla_total_size(16) /* src */
+ + nla_total_size(1) /* dscp */
+ + nla_total_size(1) /* dscp mask */
+ + nla_total_size(4) /* flowlabel */
+ + nla_total_size(4); /* flowlabel mask */
}
-static u32 fib6_rule_default_pref(void)
+static void fib6_rule_flush_cache(struct fib_rules_ops *ops)
{
- return 0x3FFF;
+ rt_genid_bump_ipv6(ops->fro_net);
}
-static size_t fib6_rule_nlmsg_payload(struct fib_rule *rule)
-{
- return nla_total_size(16) /* dst */
- + nla_total_size(16); /* src */
-}
-
-static struct fib_rules_ops fib6_rules_ops = {
+static const struct fib_rules_ops __net_initconst fib6_rules_ops_template = {
.family = AF_INET6,
.rule_size = sizeof(struct fib6_rule),
+ .addr_size = sizeof(struct in6_addr),
.action = fib6_rule_action,
.match = fib6_rule_match,
+ .suppress = fib6_rule_suppress,
.configure = fib6_rule_configure,
+ .delete = fib6_rule_delete,
.compare = fib6_rule_compare,
.fill = fib6_rule_fill,
- .default_pref = fib6_rule_default_pref,
.nlmsg_payload = fib6_rule_nlmsg_payload,
+ .flush_cache = fib6_rule_flush_cache,
.nlgroup = RTNLGRP_IPV6_RULE,
- .policy = fib6_rule_policy,
- .rules_list = &fib6_rules,
.owner = THIS_MODULE,
+ .fro_net = &init_net,
};
-void __init fib6_rules_init(void)
+static int __net_init fib6_rules_net_init(struct net *net)
{
- list_add_tail(&local_rule.common.list, &fib6_rules);
- list_add_tail(&main_rule.common.list, &fib6_rules);
+ struct fib_rules_ops *ops;
+ int err;
+
+ ops = fib_rules_register(&fib6_rules_ops_template, net);
+ if (IS_ERR(ops))
+ return PTR_ERR(ops);
+
+ err = fib_default_rule_add(ops, 0, RT6_TABLE_LOCAL);
+ if (err)
+ goto out_fib6_rules_ops;
+
+ err = fib_default_rule_add(ops, 0x7FFE, RT6_TABLE_MAIN);
+ if (err)
+ goto out_fib6_rules_ops;
+
+ net->ipv6.fib6_rules_ops = ops;
+ net->ipv6.fib6_rules_require_fldissect = 0;
+out:
+ return err;
- fib_rules_register(&fib6_rules_ops);
+out_fib6_rules_ops:
+ fib_rules_unregister(ops);
+ goto out;
}
+static void __net_exit fib6_rules_net_exit_batch(struct list_head *net_list)
+{
+ struct net *net;
+
+ rtnl_lock();
+ list_for_each_entry(net, net_list, exit_list) {
+ fib_rules_unregister(net->ipv6.fib6_rules_ops);
+ cond_resched();
+ }
+ rtnl_unlock();
+}
+
+static struct pernet_operations fib6_rules_net_ops = {
+ .init = fib6_rules_net_init,
+ .exit_batch = fib6_rules_net_exit_batch,
+};
+
+int __init fib6_rules_init(void)
+{
+ return register_pernet_subsys(&fib6_rules_net_ops);
+}
+
+
void fib6_rules_cleanup(void)
{
- fib_rules_unregister(&fib6_rules_ops);
+ unregister_pernet_subsys(&fib6_rules_net_ops);
}
diff --git a/net/ipv6/fou6.c b/net/ipv6/fou6.c
new file mode 100644
index 000000000000..430518ae26fa
--- /dev/null
+++ b/net/ipv6/fou6.c
@@ -0,0 +1,227 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/socket.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+#include <linux/icmpv6.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <net/fou.h>
+#include <net/ip.h>
+#include <net/ip6_tunnel.h>
+#include <net/ip6_checksum.h>
+#include <net/protocol.h>
+#include <net/udp.h>
+#include <net/udp_tunnel.h>
+
+#if IS_ENABLED(CONFIG_IPV6_FOU_TUNNEL)
+
+static void fou6_build_udp(struct sk_buff *skb, struct ip_tunnel_encap *e,
+ struct flowi6 *fl6, u8 *protocol, __be16 sport)
+{
+ struct udphdr *uh;
+
+ skb_push(skb, sizeof(struct udphdr));
+ skb_reset_transport_header(skb);
+
+ uh = udp_hdr(skb);
+
+ uh->dest = e->dport;
+ uh->source = sport;
+ uh->len = htons(skb->len);
+ udp6_set_csum(!(e->flags & TUNNEL_ENCAP_FLAG_CSUM6), skb,
+ &fl6->saddr, &fl6->daddr, skb->len);
+
+ *protocol = IPPROTO_UDP;
+}
+
+static int fou6_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
+ u8 *protocol, struct flowi6 *fl6)
+{
+ __be16 sport;
+ int err;
+ int type = e->flags & TUNNEL_ENCAP_FLAG_CSUM6 ?
+ SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL;
+
+ err = __fou_build_header(skb, e, protocol, &sport, type);
+ if (err)
+ return err;
+
+ fou6_build_udp(skb, e, fl6, protocol, sport);
+
+ return 0;
+}
+
+static int gue6_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
+ u8 *protocol, struct flowi6 *fl6)
+{
+ __be16 sport;
+ int err;
+ int type = e->flags & TUNNEL_ENCAP_FLAG_CSUM6 ?
+ SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL;
+
+ err = __gue_build_header(skb, e, protocol, &sport, type);
+ if (err)
+ return err;
+
+ fou6_build_udp(skb, e, fl6, protocol, sport);
+
+ return 0;
+}
+
+static int gue6_err_proto_handler(int proto, struct sk_buff *skb,
+ struct inet6_skb_parm *opt,
+ u8 type, u8 code, int offset, __be32 info)
+{
+ const struct inet6_protocol *ipprot;
+
+ ipprot = rcu_dereference(inet6_protos[proto]);
+ if (ipprot && ipprot->err_handler) {
+ if (!ipprot->err_handler(skb, opt, type, code, offset, info))
+ return 0;
+ }
+
+ return -ENOENT;
+}
+
+static int gue6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+ u8 type, u8 code, int offset, __be32 info)
+{
+ int transport_offset = skb_transport_offset(skb);
+ struct guehdr *guehdr;
+ size_t len, optlen;
+ int ret;
+
+ len = sizeof(struct udphdr) + sizeof(struct guehdr);
+ if (!pskb_may_pull(skb, transport_offset + len))
+ return -EINVAL;
+
+ guehdr = (struct guehdr *)&udp_hdr(skb)[1];
+
+ switch (guehdr->version) {
+ case 0: /* Full GUE header present */
+ break;
+ case 1: {
+ /* Direct encasulation of IPv4 or IPv6 */
+ skb_set_transport_header(skb, -(int)sizeof(struct icmp6hdr));
+
+ switch (((struct iphdr *)guehdr)->version) {
+ case 4:
+ ret = gue6_err_proto_handler(IPPROTO_IPIP, skb, opt,
+ type, code, offset, info);
+ goto out;
+ case 6:
+ ret = gue6_err_proto_handler(IPPROTO_IPV6, skb, opt,
+ type, code, offset, info);
+ goto out;
+ default:
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+ }
+ default: /* Undefined version */
+ return -EOPNOTSUPP;
+ }
+
+ if (guehdr->control)
+ return -ENOENT;
+
+ optlen = guehdr->hlen << 2;
+
+ if (!pskb_may_pull(skb, transport_offset + len + optlen))
+ return -EINVAL;
+
+ guehdr = (struct guehdr *)&udp_hdr(skb)[1];
+ if (validate_gue_flags(guehdr, optlen))
+ return -EINVAL;
+
+ /* Handling exceptions for direct UDP encapsulation in GUE would lead to
+ * recursion. Besides, this kind of encapsulation can't even be
+ * configured currently. Discard this.
+ */
+ if (guehdr->proto_ctype == IPPROTO_UDP ||
+ guehdr->proto_ctype == IPPROTO_UDPLITE)
+ return -EOPNOTSUPP;
+
+ skb_set_transport_header(skb, -(int)sizeof(struct icmp6hdr));
+ ret = gue6_err_proto_handler(guehdr->proto_ctype, skb,
+ opt, type, code, offset, info);
+
+out:
+ skb_set_transport_header(skb, transport_offset);
+ return ret;
+}
+
+
+static const struct ip6_tnl_encap_ops fou_ip6tun_ops = {
+ .encap_hlen = fou_encap_hlen,
+ .build_header = fou6_build_header,
+ .err_handler = gue6_err,
+};
+
+static const struct ip6_tnl_encap_ops gue_ip6tun_ops = {
+ .encap_hlen = gue_encap_hlen,
+ .build_header = gue6_build_header,
+ .err_handler = gue6_err,
+};
+
+static int ip6_tnl_encap_add_fou_ops(void)
+{
+ int ret;
+
+ ret = ip6_tnl_encap_add_ops(&fou_ip6tun_ops, TUNNEL_ENCAP_FOU);
+ if (ret < 0) {
+ pr_err("can't add fou6 ops\n");
+ return ret;
+ }
+
+ ret = ip6_tnl_encap_add_ops(&gue_ip6tun_ops, TUNNEL_ENCAP_GUE);
+ if (ret < 0) {
+ pr_err("can't add gue6 ops\n");
+ ip6_tnl_encap_del_ops(&fou_ip6tun_ops, TUNNEL_ENCAP_FOU);
+ return ret;
+ }
+
+ return 0;
+}
+
+static void ip6_tnl_encap_del_fou_ops(void)
+{
+ ip6_tnl_encap_del_ops(&fou_ip6tun_ops, TUNNEL_ENCAP_FOU);
+ ip6_tnl_encap_del_ops(&gue_ip6tun_ops, TUNNEL_ENCAP_GUE);
+}
+
+#else
+
+static int ip6_tnl_encap_add_fou_ops(void)
+{
+ return 0;
+}
+
+static void ip6_tnl_encap_del_fou_ops(void)
+{
+}
+
+#endif
+
+static int __init fou6_init(void)
+{
+ int ret;
+
+ ret = ip6_tnl_encap_add_fou_ops();
+
+ return ret;
+}
+
+static void __exit fou6_fini(void)
+{
+ ip6_tnl_encap_del_fou_ops();
+}
+
+module_init(fou6_init);
+module_exit(fou6_fini);
+MODULE_AUTHOR("Tom Herbert <therbert@google.com>");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Foo over UDP (IPv6)");
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 3dcc4b7f41b4..5d2f90babaa5 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Internet Control Message Protocol (ICMPv6)
* Linux INET6 implementation
@@ -5,16 +6,9 @@
* Authors:
* Pedro Roque <roque@di.fc.ul.pt>
*
- * $Id: icmp.c,v 1.38 2002/02/08 03:57:19 davem Exp $
- *
* Based on net/ipv4/icmp.c
*
* RFC 1885
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
/*
@@ -31,18 +25,20 @@
* Kazunori MIYAZAWA @USAGI: change output process to use ip6_append_data
*/
+#define pr_fmt(fmt) "IPv6: " fmt
+
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
-#include <linux/sched.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/netfilter.h>
+#include <linux/slab.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
@@ -57,63 +53,73 @@
#include <net/ipv6.h>
#include <net/ip6_checksum.h>
+#include <net/ping.h>
#include <net/protocol.h>
#include <net/raw.h>
#include <net/rawv6.h>
+#include <net/seg6.h>
#include <net/transp_v6.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/icmp.h>
+#include <net/xfrm.h>
+#include <net/inet_common.h>
+#include <net/dsfield.h>
+#include <net/l3mdev.h>
-#include <asm/uaccess.h>
-#include <asm/system.h>
+#include <linux/uaccess.h>
-DEFINE_SNMP_STAT(struct icmpv6_mib, icmpv6_statistics) __read_mostly;
+static DEFINE_PER_CPU(struct sock *, ipv6_icmp_sk);
-/*
- * The ICMP socket(s). This is the most convenient way to flow control
- * our ICMP output as well as maintain a clean interface throughout
- * all layers. All Socketless IP sends will soon be gone.
- *
- * On SMP we have one ICMP socket per-cpu.
- */
-static DEFINE_PER_CPU(struct socket *, __icmpv6_socket) = NULL;
-#define icmpv6_socket __get_cpu_var(__icmpv6_socket)
+static int icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+ u8 type, u8 code, int offset, __be32 info)
+{
+ /* icmpv6_notify checks 8 bytes can be pulled, icmp6hdr is 8 bytes */
+ struct icmp6hdr *icmp6 = (struct icmp6hdr *) (skb->data + offset);
+ struct net *net = dev_net_rcu(skb->dev);
+
+ if (type == ICMPV6_PKT_TOOBIG)
+ ip6_update_pmtu(skb, net, info, skb->dev->ifindex, 0, sock_net_uid(net, NULL));
+ else if (type == NDISC_REDIRECT)
+ ip6_redirect(skb, net, skb->dev->ifindex, 0,
+ sock_net_uid(net, NULL));
+
+ if (!(type & ICMPV6_INFOMSG_MASK))
+ if (icmp6->icmp6_type == ICMPV6_ECHO_REQUEST)
+ ping_err(skb, offset, ntohl(info));
+
+ return 0;
+}
-static int icmpv6_rcv(struct sk_buff **pskb);
+static int icmpv6_rcv(struct sk_buff *skb);
-static struct inet6_protocol icmpv6_protocol = {
+static const struct inet6_protocol icmpv6_protocol = {
.handler = icmpv6_rcv,
- .flags = INET6_PROTO_FINAL,
+ .err_handler = icmpv6_err,
+ .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
};
-static __inline__ int icmpv6_xmit_lock(void)
+/* Called with BH disabled */
+static struct sock *icmpv6_xmit_lock(struct net *net)
{
- local_bh_disable();
+ struct sock *sk;
- if (unlikely(!spin_trylock(&icmpv6_socket->sk->sk_lock.slock))) {
+ sk = this_cpu_read(ipv6_icmp_sk);
+ if (unlikely(!spin_trylock(&sk->sk_lock.slock))) {
/* This can happen if the output path (f.e. SIT or
* ip6ip6 tunnel) signals dst_link_failure() for an
* outgoing ICMP6 packet.
*/
- local_bh_enable();
- return 1;
+ return NULL;
}
- return 0;
+ sock_net_set(sk, net);
+ return sk;
}
-static __inline__ void icmpv6_xmit_unlock(void)
+static void icmpv6_xmit_unlock(struct sock *sk)
{
- spin_unlock_bh(&icmpv6_socket->sk->sk_lock.slock);
-}
-
-/*
- * Slightly more convenient version of icmpv6_send.
- */
-void icmpv6_param_prob(struct sk_buff *skb, int code, int pos)
-{
- icmpv6_send(skb, ICMPV6_PARAMPROB, code, pos, skb->dev);
- kfree_skb(skb);
+ sock_net_set(sk, &init_net);
+ spin_unlock(&sk->sk_lock.slock);
}
/*
@@ -127,69 +133,124 @@ void icmpv6_param_prob(struct sk_buff *skb, int code, int pos)
* --ANK (980726)
*/
-static int is_ineligible(struct sk_buff *skb)
+static bool is_ineligible(const struct sk_buff *skb)
{
- int ptr = (u8*)(skb->nh.ipv6h+1) - skb->data;
+ int ptr = (u8 *)(ipv6_hdr(skb) + 1) - skb->data;
int len = skb->len - ptr;
- __u8 nexthdr = skb->nh.ipv6h->nexthdr;
+ __u8 nexthdr = ipv6_hdr(skb)->nexthdr;
+ __be16 frag_off;
if (len < 0)
- return 1;
+ return true;
- ptr = ipv6_skip_exthdr(skb, ptr, &nexthdr);
+ ptr = ipv6_skip_exthdr(skb, ptr, &nexthdr, &frag_off);
if (ptr < 0)
- return 0;
+ return false;
if (nexthdr == IPPROTO_ICMPV6) {
u8 _type, *tp;
tp = skb_header_pointer(skb,
ptr+offsetof(struct icmp6hdr, icmp6_type),
sizeof(_type), &_type);
- if (tp == NULL ||
- !(*tp & ICMPV6_INFOMSG_MASK))
- return 1;
+
+ /* Based on RFC 8200, Section 4.5 Fragment Header, return
+ * false if this is a fragment packet with no icmp header info.
+ */
+ if (!tp && frag_off != 0)
+ return false;
+ else if (!tp || !(*tp & ICMPV6_INFOMSG_MASK))
+ return true;
}
- return 0;
+ return false;
}
-static int sysctl_icmpv6_time __read_mostly = 1*HZ;
+static bool icmpv6_mask_allow(struct net *net, int type)
+{
+ if (type > ICMPV6_MSG_MAX)
+ return true;
+
+ /* Limit if icmp type is set in ratemask. */
+ if (!test_bit(type, net->ipv6.sysctl.icmpv6_ratemask))
+ return true;
+
+ return false;
+}
+
+static bool icmpv6_global_allow(struct net *net, int type,
+ bool *apply_ratelimit)
+{
+ if (icmpv6_mask_allow(net, type))
+ return true;
+
+ if (icmp_global_allow(net)) {
+ *apply_ratelimit = true;
+ return true;
+ }
+ __ICMP_INC_STATS(net, ICMP_MIB_RATELIMITGLOBAL);
+ return false;
+}
-/*
- * Check the ICMP output rate limit
+/*
+ * Check the ICMP output rate limit
*/
-static inline int icmpv6_xrlim_allow(struct sock *sk, int type,
- struct flowi *fl)
+static bool icmpv6_xrlim_allow(struct sock *sk, u8 type,
+ struct flowi6 *fl6, bool apply_ratelimit)
{
+ struct net *net = sock_net(sk);
+ struct net_device *dev;
struct dst_entry *dst;
- int res = 0;
+ bool res = false;
- /* Informational messages are not limited. */
- if (type & ICMPV6_INFOMSG_MASK)
- return 1;
+ if (!apply_ratelimit)
+ return true;
- /* Do not limit pmtu discovery, it would break it. */
- if (type == ICMPV6_PKT_TOOBIG)
- return 1;
-
- /*
+ /*
* Look up the output route.
* XXX: perhaps the expire for routing entries cloned by
* this lookup should be more aggressive (not longer than timeout).
*/
- dst = ip6_route_output(sk, fl);
+ dst = ip6_route_output(net, sk, fl6);
+ rcu_read_lock();
+ dev = dst_dev_rcu(dst);
if (dst->error) {
- IP6_INC_STATS(ip6_dst_idev(dst),
+ IP6_INC_STATS(net, ip6_dst_idev(dst),
IPSTATS_MIB_OUTNOROUTES);
- } else if (dst->dev && (dst->dev->flags&IFF_LOOPBACK)) {
- res = 1;
+ } else if (dev && (dev->flags & IFF_LOOPBACK)) {
+ res = true;
} else {
- struct rt6_info *rt = (struct rt6_info *)dst;
- int tmo = sysctl_icmpv6_time;
+ struct rt6_info *rt = dst_rt6_info(dst);
+ int tmo = net->ipv6.sysctl.icmpv6_time;
+ struct inet_peer *peer;
/* Give more bandwidth to wider prefixes. */
if (rt->rt6i_dst.plen < 128)
tmo >>= ((128 - rt->rt6i_dst.plen)>>5);
- res = xrlim_allow(dst, tmo);
+ peer = inet_getpeer_v6(net->ipv6.peers, &fl6->daddr);
+ res = inet_peer_xrlim_allow(peer, tmo);
+ }
+ rcu_read_unlock();
+ if (!res)
+ __ICMP6_INC_STATS(net, NULL, ICMP6_MIB_RATELIMITHOST);
+ else
+ icmp_global_consume(net);
+ dst_release(dst);
+ return res;
+}
+
+static bool icmpv6_rt_has_prefsrc(struct sock *sk, u8 type,
+ struct flowi6 *fl6)
+{
+ struct net *net = sock_net(sk);
+ struct dst_entry *dst;
+ bool res = false;
+
+ dst = ip6_route_output(net, sk, fl6);
+ if (!dst->error) {
+ struct rt6_info *rt = dst_rt6_info(dst);
+ struct in6_addr prefsrc;
+
+ rt6_get_prefsrc(rt, &prefsrc);
+ res = !ipv6_addr_any(&prefsrc);
}
dst_release(dst);
return res;
@@ -198,40 +259,41 @@ static inline int icmpv6_xrlim_allow(struct sock *sk, int type,
/*
* an inline helper for the "simple" if statement below
* checks if parameter problem report is caused by an
- * unrecognized IPv6 option that has the Option Type
+ * unrecognized IPv6 option that has the Option Type
* highest-order two bits set to 10
*/
-static __inline__ int opt_unrec(struct sk_buff *skb, __u32 offset)
+static bool opt_unrec(struct sk_buff *skb, __u32 offset)
{
u8 _optval, *op;
- offset += skb->nh.raw - skb->data;
+ offset += skb_network_offset(skb);
op = skb_header_pointer(skb, offset, sizeof(_optval), &_optval);
- if (op == NULL)
- return 1;
+ if (!op)
+ return true;
return (*op & 0xC0) == 0x80;
}
-static int icmpv6_push_pending_frames(struct sock *sk, struct flowi *fl, struct icmp6hdr *thdr, int len)
+void icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6,
+ struct icmp6hdr *thdr, int len)
{
struct sk_buff *skb;
struct icmp6hdr *icmp6h;
- int err = 0;
- if ((skb = skb_peek(&sk->sk_write_queue)) == NULL)
- goto out;
+ skb = skb_peek(&sk->sk_write_queue);
+ if (!skb)
+ return;
- icmp6h = (struct icmp6hdr*) skb->h.raw;
+ icmp6h = icmp6_hdr(skb);
memcpy(icmp6h, thdr, sizeof(struct icmp6hdr));
icmp6h->icmp6_cksum = 0;
if (skb_queue_len(&sk->sk_write_queue) == 1) {
- skb->csum = csum_partial((char *)icmp6h,
+ skb->csum = csum_partial(icmp6h,
sizeof(struct icmp6hdr), skb->csum);
- icmp6h->icmp6_cksum = csum_ipv6_magic(&fl->fl6_src,
- &fl->fl6_dst,
- len, fl->proto,
+ icmp6h->icmp6_cksum = csum_ipv6_magic(&fl6->saddr,
+ &fl6->daddr,
+ len, fl6->flowi6_proto,
skb->csum);
} else {
__wsum tmp_csum = 0;
@@ -240,16 +302,14 @@ static int icmpv6_push_pending_frames(struct sock *sk, struct flowi *fl, struct
tmp_csum = csum_add(tmp_csum, skb->csum);
}
- tmp_csum = csum_partial((char *)icmp6h,
+ tmp_csum = csum_partial(icmp6h,
sizeof(struct icmp6hdr), tmp_csum);
- icmp6h->icmp6_cksum = csum_ipv6_magic(&fl->fl6_src,
- &fl->fl6_dst,
- len, fl->proto,
+ icmp6h->icmp6_cksum = csum_ipv6_magic(&fl6->saddr,
+ &fl6->daddr,
+ len, fl6->flowi6_proto,
tmp_csum);
}
ip6_push_pending_frames(sk);
-out:
- return err;
}
struct icmpv6_msg {
@@ -262,84 +322,374 @@ static int icmpv6_getfrag(void *from, char *to, int offset, int len, int odd, st
{
struct icmpv6_msg *msg = (struct icmpv6_msg *) from;
struct sk_buff *org_skb = msg->skb;
- __wsum csum = 0;
+ __wsum csum;
csum = skb_copy_and_csum_bits(org_skb, msg->offset + offset,
- to, len, csum);
+ to, len);
skb->csum = csum_block_add(skb->csum, csum, odd);
if (!(msg->type & ICMPV6_INFOMSG_MASK))
nf_ct_attach(skb, org_skb);
return 0;
}
-#ifdef CONFIG_IPV6_MIP6
-static void mip6_addr_swap(struct sk_buff *skb)
+#if IS_ENABLED(CONFIG_IPV6_MIP6)
+static void mip6_addr_swap(struct sk_buff *skb, const struct inet6_skb_parm *opt)
{
- struct ipv6hdr *iph = skb->nh.ipv6h;
- struct inet6_skb_parm *opt = IP6CB(skb);
+ struct ipv6hdr *iph = ipv6_hdr(skb);
struct ipv6_destopt_hao *hao;
- struct in6_addr tmp;
int off;
if (opt->dsthao) {
off = ipv6_find_tlv(skb, opt->dsthao, IPV6_TLV_HAO);
if (likely(off >= 0)) {
- hao = (struct ipv6_destopt_hao *)(skb->nh.raw + off);
- ipv6_addr_copy(&tmp, &iph->saddr);
- ipv6_addr_copy(&iph->saddr, &hao->addr);
- ipv6_addr_copy(&hao->addr, &tmp);
+ hao = (struct ipv6_destopt_hao *)
+ (skb_network_header(skb) + off);
+ swap(iph->saddr, hao->addr);
}
}
}
#else
-static inline void mip6_addr_swap(struct sk_buff *skb) {}
+static inline void mip6_addr_swap(struct sk_buff *skb, const struct inet6_skb_parm *opt) {}
#endif
+static struct dst_entry *icmpv6_route_lookup(struct net *net,
+ struct sk_buff *skb,
+ struct sock *sk,
+ struct flowi6 *fl6)
+{
+ struct dst_entry *dst, *dst2;
+ struct flowi6 fl2;
+ int err;
+
+ err = ip6_dst_lookup(net, sk, &dst, fl6);
+ if (err)
+ return ERR_PTR(err);
+
+ /*
+ * We won't send icmp if the destination is known
+ * anycast unless we need to treat anycast as unicast.
+ */
+ if (!READ_ONCE(net->ipv6.sysctl.icmpv6_error_anycast_as_unicast) &&
+ ipv6_anycast_destination(dst, &fl6->daddr)) {
+ net_dbg_ratelimited("icmp6_send: acast source\n");
+ dst_release(dst);
+ return ERR_PTR(-EINVAL);
+ }
+
+ /* No need to clone since we're just using its address. */
+ dst2 = dst;
+
+ dst = xfrm_lookup(net, dst, flowi6_to_flowi(fl6), sk, 0);
+ if (!IS_ERR(dst)) {
+ if (dst != dst2)
+ return dst;
+ } else {
+ if (PTR_ERR(dst) == -EPERM)
+ dst = NULL;
+ else
+ return dst;
+ }
+
+ err = xfrm_decode_session_reverse(net, skb, flowi6_to_flowi(&fl2), AF_INET6);
+ if (err)
+ goto relookup_failed;
+
+ err = ip6_dst_lookup(net, sk, &dst2, &fl2);
+ if (err)
+ goto relookup_failed;
+
+ dst2 = xfrm_lookup(net, dst2, flowi6_to_flowi(&fl2), sk, XFRM_LOOKUP_ICMP);
+ if (!IS_ERR(dst2)) {
+ dst_release(dst);
+ dst = dst2;
+ } else {
+ err = PTR_ERR(dst2);
+ if (err == -EPERM) {
+ dst_release(dst);
+ return dst2;
+ } else
+ goto relookup_failed;
+ }
+
+relookup_failed:
+ if (dst)
+ return dst;
+ return ERR_PTR(err);
+}
+
+static struct net_device *icmp6_dev(const struct sk_buff *skb)
+{
+ struct net_device *dev = skb->dev;
+
+ /* for local traffic to local address, skb dev is the loopback
+ * device. Check if there is a dst attached to the skb and if so
+ * get the real device index. Same is needed for replies to a link
+ * local address on a device enslaved to an L3 master device
+ */
+ if (unlikely(dev->ifindex == LOOPBACK_IFINDEX || netif_is_l3_master(skb->dev))) {
+ const struct rt6_info *rt6 = skb_rt6_info(skb);
+
+ /* The destination could be an external IP in Ext Hdr (SRv6, RPL, etc.),
+ * and ip6_null_entry could be set to skb if no route is found.
+ */
+ if (rt6 && rt6->rt6i_idev)
+ dev = rt6->rt6i_idev->dev;
+ }
+
+ return dev;
+}
+
+static int icmp6_iif(const struct sk_buff *skb)
+{
+ return icmp6_dev(skb)->ifindex;
+}
+
+struct icmp6_ext_iio_addr6_subobj {
+ __be16 afi;
+ __be16 reserved;
+ struct in6_addr addr6;
+};
+
+static unsigned int icmp6_ext_iio_len(void)
+{
+ return sizeof(struct icmp_extobj_hdr) +
+ /* ifIndex */
+ sizeof(__be32) +
+ /* Interface Address Sub-Object */
+ sizeof(struct icmp6_ext_iio_addr6_subobj) +
+ /* Interface Name Sub-Object. Length must be a multiple of 4
+ * bytes.
+ */
+ ALIGN(sizeof(struct icmp_ext_iio_name_subobj), 4) +
+ /* MTU */
+ sizeof(__be32);
+}
+
+static unsigned int icmp6_ext_max_len(u8 ext_objs)
+{
+ unsigned int ext_max_len;
+
+ ext_max_len = sizeof(struct icmp_ext_hdr);
+
+ if (ext_objs & BIT(ICMP_ERR_EXT_IIO_IIF))
+ ext_max_len += icmp6_ext_iio_len();
+
+ return ext_max_len;
+}
+
+static struct in6_addr *icmp6_ext_iio_addr6_find(const struct net_device *dev)
+{
+ struct inet6_dev *in6_dev;
+ struct inet6_ifaddr *ifa;
+
+ in6_dev = __in6_dev_get(dev);
+ if (!in6_dev)
+ return NULL;
+
+ /* It is unclear from RFC 5837 which IP address should be chosen, but
+ * it makes sense to choose a global unicast address.
+ */
+ list_for_each_entry_rcu(ifa, &in6_dev->addr_list, if_list) {
+ if (ifa->flags & (IFA_F_TENTATIVE | IFA_F_DADFAILED))
+ continue;
+ if (ipv6_addr_type(&ifa->addr) != IPV6_ADDR_UNICAST ||
+ ipv6_addr_src_scope(&ifa->addr) != IPV6_ADDR_SCOPE_GLOBAL)
+ continue;
+ return &ifa->addr;
+ }
+
+ return NULL;
+}
+
+static void icmp6_ext_iio_iif_append(struct net *net, struct sk_buff *skb,
+ int iif)
+{
+ struct icmp_ext_iio_name_subobj *name_subobj;
+ struct icmp_extobj_hdr *objh;
+ struct net_device *dev;
+ struct in6_addr *addr6;
+ __be32 data;
+
+ if (!iif)
+ return;
+
+ /* Add the fields in the order specified by RFC 5837. */
+ objh = skb_put(skb, sizeof(*objh));
+ objh->class_num = ICMP_EXT_OBJ_CLASS_IIO;
+ objh->class_type = ICMP_EXT_CTYPE_IIO_ROLE(ICMP_EXT_CTYPE_IIO_ROLE_IIF);
+
+ data = htonl(iif);
+ skb_put_data(skb, &data, sizeof(__be32));
+ objh->class_type |= ICMP_EXT_CTYPE_IIO_IFINDEX;
+
+ rcu_read_lock();
+
+ dev = dev_get_by_index_rcu(net, iif);
+ if (!dev)
+ goto out;
+
+ addr6 = icmp6_ext_iio_addr6_find(dev);
+ if (addr6) {
+ struct icmp6_ext_iio_addr6_subobj *addr6_subobj;
+
+ addr6_subobj = skb_put_zero(skb, sizeof(*addr6_subobj));
+ addr6_subobj->afi = htons(ICMP_AFI_IP6);
+ addr6_subobj->addr6 = *addr6;
+ objh->class_type |= ICMP_EXT_CTYPE_IIO_IPADDR;
+ }
+
+ name_subobj = skb_put_zero(skb, ALIGN(sizeof(*name_subobj), 4));
+ name_subobj->len = ALIGN(sizeof(*name_subobj), 4);
+ netdev_copy_name(dev, name_subobj->name);
+ objh->class_type |= ICMP_EXT_CTYPE_IIO_NAME;
+
+ data = htonl(READ_ONCE(dev->mtu));
+ skb_put_data(skb, &data, sizeof(__be32));
+ objh->class_type |= ICMP_EXT_CTYPE_IIO_MTU;
+
+out:
+ rcu_read_unlock();
+ objh->length = htons(skb_tail_pointer(skb) - (unsigned char *)objh);
+}
+
+static void icmp6_ext_objs_append(struct net *net, struct sk_buff *skb,
+ u8 ext_objs, int iif)
+{
+ if (ext_objs & BIT(ICMP_ERR_EXT_IIO_IIF))
+ icmp6_ext_iio_iif_append(net, skb, iif);
+}
+
+static struct sk_buff *
+icmp6_ext_append(struct net *net, struct sk_buff *skb_in,
+ struct icmp6hdr *icmp6h, unsigned int room, int iif)
+{
+ unsigned int payload_len, ext_max_len, ext_len;
+ struct icmp_ext_hdr *ext_hdr;
+ struct sk_buff *skb;
+ u8 ext_objs;
+ int nhoff;
+
+ switch (icmp6h->icmp6_type) {
+ case ICMPV6_DEST_UNREACH:
+ case ICMPV6_TIME_EXCEED:
+ break;
+ default:
+ return NULL;
+ }
+
+ /* Do not overwrite existing extensions. This can happen when we
+ * receive an ICMPv4 message with extensions from a tunnel and
+ * translate it to an ICMPv6 message towards an IPv6 host in the
+ * overlay network.
+ */
+ if (icmp6h->icmp6_datagram_len)
+ return NULL;
+
+ ext_objs = READ_ONCE(net->ipv6.sysctl.icmpv6_errors_extension_mask);
+ if (!ext_objs)
+ return NULL;
+
+ ext_max_len = icmp6_ext_max_len(ext_objs);
+ if (ICMP_EXT_ORIG_DGRAM_MIN_LEN + ext_max_len > room)
+ return NULL;
+
+ skb = skb_clone(skb_in, GFP_ATOMIC);
+ if (!skb)
+ return NULL;
+
+ nhoff = skb_network_offset(skb);
+ payload_len = min(skb->len - nhoff, ICMP_EXT_ORIG_DGRAM_MIN_LEN);
+
+ if (!pskb_network_may_pull(skb, payload_len))
+ goto free_skb;
+
+ if (pskb_trim(skb, nhoff + ICMP_EXT_ORIG_DGRAM_MIN_LEN) ||
+ __skb_put_padto(skb, nhoff + ICMP_EXT_ORIG_DGRAM_MIN_LEN, false))
+ goto free_skb;
+
+ if (pskb_expand_head(skb, 0, ext_max_len, GFP_ATOMIC))
+ goto free_skb;
+
+ ext_hdr = skb_put_zero(skb, sizeof(*ext_hdr));
+ ext_hdr->version = ICMP_EXT_VERSION_2;
+
+ icmp6_ext_objs_append(net, skb, ext_objs, iif);
+
+ /* Do not send an empty extension structure. */
+ ext_len = skb_tail_pointer(skb) - (unsigned char *)ext_hdr;
+ if (ext_len == sizeof(*ext_hdr))
+ goto free_skb;
+
+ ext_hdr->checksum = ip_compute_csum(ext_hdr, ext_len);
+ /* The length of the original datagram in 64-bit words (RFC 4884). */
+ icmp6h->icmp6_datagram_len = ICMP_EXT_ORIG_DGRAM_MIN_LEN / sizeof(u64);
+
+ return skb;
+
+free_skb:
+ consume_skb(skb);
+ return NULL;
+}
+
/*
* Send an ICMP message in response to a packet in error
*/
-void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info,
- struct net_device *dev)
+void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
+ const struct in6_addr *force_saddr,
+ const struct inet6_skb_parm *parm)
{
struct inet6_dev *idev = NULL;
- struct ipv6hdr *hdr = skb->nh.ipv6h;
+ struct ipv6hdr *hdr = ipv6_hdr(skb);
struct sock *sk;
+ struct net *net;
struct ipv6_pinfo *np;
- struct in6_addr *saddr = NULL;
+ const struct in6_addr *saddr = NULL;
+ bool apply_ratelimit = false;
+ struct sk_buff *ext_skb;
struct dst_entry *dst;
+ unsigned int room;
struct icmp6hdr tmp_hdr;
- struct flowi fl;
+ struct flowi6 fl6;
struct icmpv6_msg msg;
+ struct ipcm6_cookie ipc6;
int iif = 0;
int addr_type = 0;
int len;
- int hlimit, tclass;
- int err = 0;
+ u32 mark;
+
+ if ((u8 *)hdr < skb->head ||
+ (skb_network_header(skb) + sizeof(*hdr)) > skb_tail_pointer(skb))
+ return;
- if ((u8*)hdr < skb->head || (u8*)(hdr+1) > skb->tail)
+ if (!skb->dev)
return;
+ rcu_read_lock();
+
+ net = dev_net_rcu(skb->dev);
+ mark = IP6_REPLY_MARK(net, skb->mark);
/*
- * Make sure we respect the rules
+ * Make sure we respect the rules
* i.e. RFC 1885 2.4(e)
- * Rule (e.1) is enforced by not using icmpv6_send
+ * Rule (e.1) is enforced by not using icmp6_send
* in any code that processes icmp errors.
*/
addr_type = ipv6_addr_type(&hdr->daddr);
- if (ipv6_chk_addr(&hdr->daddr, skb->dev, 0))
+ if (ipv6_chk_addr(net, &hdr->daddr, skb->dev, 0) ||
+ ipv6_chk_acast_addr_src(net, skb->dev, &hdr->daddr))
saddr = &hdr->daddr;
/*
* Dest addr check
*/
- if ((addr_type & IPV6_ADDR_MULTICAST || skb->pkt_type != PACKET_HOST)) {
+ if (addr_type & IPV6_ADDR_MULTICAST || skb->pkt_type != PACKET_HOST) {
if (type != ICMPV6_PKT_TOOBIG &&
- !(type == ICMPV6_PARAMPROB &&
- code == ICMPV6_UNK_OPTION &&
+ !(type == ICMPV6_PARAMPROB &&
+ code == ICMPV6_UNK_OPTION &&
(opt_unrec(skb, info))))
- return;
+ goto out;
saddr = NULL;
}
@@ -350,8 +700,15 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info,
* Source addr check
*/
- if (addr_type & IPV6_ADDR_LINKLOCAL)
- iif = skb->dev->ifindex;
+ if (__ipv6_addr_needs_scope_id(addr_type)) {
+ iif = icmp6_iif(skb);
+ } else {
+ /*
+ * The source device is used for looking up which routing table
+ * to use for sending an ICMP error.
+ */
+ iif = l3mdev_master_ifindex(skb->dev);
+ }
/*
* Must not send error if the source does not uniquely
@@ -360,228 +717,352 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info,
* and anycast addresses will be checked later.
*/
if ((addr_type == IPV6_ADDR_ANY) || (addr_type & IPV6_ADDR_MULTICAST)) {
- LIMIT_NETDEBUG(KERN_DEBUG "icmpv6_send: addr_any/mcast source\n");
- return;
+ net_dbg_ratelimited("icmp6_send: addr_any/mcast source [%pI6c > %pI6c]\n",
+ &hdr->saddr, &hdr->daddr);
+ goto out;
}
- /*
+ /*
* Never answer to a ICMP packet.
*/
if (is_ineligible(skb)) {
- LIMIT_NETDEBUG(KERN_DEBUG "icmpv6_send: no reply to icmp error\n");
- return;
+ net_dbg_ratelimited("icmp6_send: no reply to icmp error [%pI6c > %pI6c]\n",
+ &hdr->saddr, &hdr->daddr);
+ goto out;
}
- mip6_addr_swap(skb);
-
- memset(&fl, 0, sizeof(fl));
- fl.proto = IPPROTO_ICMPV6;
- ipv6_addr_copy(&fl.fl6_dst, &hdr->saddr);
- if (saddr)
- ipv6_addr_copy(&fl.fl6_src, saddr);
- fl.oif = iif;
- fl.fl_icmp_type = type;
- fl.fl_icmp_code = code;
- security_skb_classify_flow(skb, &fl);
+ /* Needed by both icmpv6_global_allow and icmpv6_xmit_lock */
+ local_bh_disable();
- if (icmpv6_xmit_lock())
- return;
+ /* Check global sysctl_icmp_msgs_per_sec ratelimit */
+ if (!(skb->dev->flags & IFF_LOOPBACK) &&
+ !icmpv6_global_allow(net, type, &apply_ratelimit))
+ goto out_bh_enable;
+
+ mip6_addr_swap(skb, parm);
+
+ sk = icmpv6_xmit_lock(net);
+ if (!sk)
+ goto out_bh_enable;
+
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.flowi6_proto = IPPROTO_ICMPV6;
+ fl6.daddr = hdr->saddr;
+ if (force_saddr)
+ saddr = force_saddr;
+ if (saddr) {
+ fl6.saddr = *saddr;
+ } else if (!icmpv6_rt_has_prefsrc(sk, type, &fl6)) {
+ /* select a more meaningful saddr from input if */
+ struct net_device *in_netdev;
+
+ in_netdev = dev_get_by_index(net, parm->iif);
+ if (in_netdev) {
+ ipv6_dev_get_saddr(net, in_netdev, &fl6.daddr,
+ inet6_sk(sk)->srcprefs,
+ &fl6.saddr);
+ dev_put(in_netdev);
+ }
+ }
+ fl6.flowi6_mark = mark;
+ fl6.flowi6_oif = iif;
+ fl6.fl6_icmp_type = type;
+ fl6.fl6_icmp_code = code;
+ fl6.flowi6_uid = sock_net_uid(net, NULL);
+ fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, NULL);
+ security_skb_classify_flow(skb, flowi6_to_flowi_common(&fl6));
- sk = icmpv6_socket->sk;
np = inet6_sk(sk);
- if (!icmpv6_xrlim_allow(sk, type, &fl))
- goto out;
+ if (!icmpv6_xrlim_allow(sk, type, &fl6, apply_ratelimit))
+ goto out_unlock;
tmp_hdr.icmp6_type = type;
tmp_hdr.icmp6_code = code;
tmp_hdr.icmp6_cksum = 0;
tmp_hdr.icmp6_pointer = htonl(info);
- if (!fl.oif && ipv6_addr_is_multicast(&fl.fl6_dst))
- fl.oif = np->mcast_oif;
+ if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr))
+ fl6.flowi6_oif = READ_ONCE(np->mcast_oif);
+ else if (!fl6.flowi6_oif)
+ fl6.flowi6_oif = READ_ONCE(np->ucast_oif);
- err = ip6_dst_lookup(sk, &dst, &fl);
- if (err)
- goto out;
+ ipcm6_init_sk(&ipc6, sk);
+ ipc6.sockc.mark = mark;
+ fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel);
- /*
- * We won't send icmp if the destination is known
- * anycast.
- */
- if (((struct rt6_info *)dst)->rt6i_flags & RTF_ANYCAST) {
- LIMIT_NETDEBUG(KERN_DEBUG "icmpv6_send: acast source\n");
- goto out_dst_release;
- }
-
- if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0)
- goto out;
-
- if (ipv6_addr_is_multicast(&fl.fl6_dst))
- hlimit = np->mcast_hops;
- else
- hlimit = np->hop_limit;
- if (hlimit < 0)
- hlimit = dst_metric(dst, RTAX_HOPLIMIT);
- if (hlimit < 0)
- hlimit = ipv6_get_hoplimit(dst->dev);
+ dst = icmpv6_route_lookup(net, skb, sk, &fl6);
+ if (IS_ERR(dst))
+ goto out_unlock;
- tclass = np->tclass;
- if (tclass < 0)
- tclass = 0;
+ ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst);
msg.skb = skb;
- msg.offset = skb->nh.raw - skb->data;
+ msg.offset = skb_network_offset(skb);
msg.type = type;
- len = skb->len - msg.offset;
- len = min_t(unsigned int, len, IPV6_MIN_MTU - sizeof(struct ipv6hdr) -sizeof(struct icmp6hdr));
+ room = IPV6_MIN_MTU - sizeof(struct ipv6hdr) - sizeof(struct icmp6hdr);
+ ext_skb = icmp6_ext_append(net, skb, &tmp_hdr, room, parm->iif);
+ if (ext_skb)
+ msg.skb = ext_skb;
+
+ len = msg.skb->len - msg.offset;
+ len = min_t(unsigned int, len, room);
if (len < 0) {
- LIMIT_NETDEBUG(KERN_DEBUG "icmp: len problem\n");
+ net_dbg_ratelimited("icmp: len problem [%pI6c > %pI6c]\n",
+ &hdr->saddr, &hdr->daddr);
goto out_dst_release;
}
- idev = in6_dev_get(skb->dev);
+ idev = __in6_dev_get(skb->dev);
- err = ip6_append_data(sk, icmpv6_getfrag, &msg,
- len + sizeof(struct icmp6hdr),
- sizeof(struct icmp6hdr),
- hlimit, tclass, NULL, &fl, (struct rt6_info*)dst,
- MSG_DONTWAIT);
- if (err) {
+ if (ip6_append_data(sk, icmpv6_getfrag, &msg,
+ len + sizeof(struct icmp6hdr),
+ sizeof(struct icmp6hdr),
+ &ipc6, &fl6, dst_rt6_info(dst),
+ MSG_DONTWAIT)) {
+ ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTERRORS);
ip6_flush_pending_frames(sk);
- goto out_put;
+ } else {
+ icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr,
+ len + sizeof(struct icmp6hdr));
}
- err = icmpv6_push_pending_frames(sk, &fl, &tmp_hdr, len + sizeof(struct icmp6hdr));
-
- if (type >= ICMPV6_DEST_UNREACH && type <= ICMPV6_PARAMPROB)
- ICMP6_INC_STATS_OFFSET_BH(idev, ICMP6_MIB_OUTDESTUNREACHS, type - ICMPV6_DEST_UNREACH);
- ICMP6_INC_STATS_BH(idev, ICMP6_MIB_OUTMSGS);
-out_put:
- if (likely(idev != NULL))
- in6_dev_put(idev);
out_dst_release:
+ if (ext_skb)
+ consume_skb(ext_skb);
dst_release(dst);
+out_unlock:
+ icmpv6_xmit_unlock(sk);
+out_bh_enable:
+ local_bh_enable();
out:
- icmpv6_xmit_unlock();
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL(icmp6_send);
+
+/* Slightly more convenient version of icmp6_send with drop reasons.
+ */
+void icmpv6_param_prob_reason(struct sk_buff *skb, u8 code, int pos,
+ enum skb_drop_reason reason)
+{
+ icmp6_send(skb, ICMPV6_PARAMPROB, code, pos, NULL, IP6CB(skb));
+ kfree_skb_reason(skb, reason);
+}
+
+/* Generate icmpv6 with type/code ICMPV6_DEST_UNREACH/ICMPV6_ADDR_UNREACH
+ * if sufficient data bytes are available
+ * @nhs is the size of the tunnel header(s) :
+ * Either an IPv4 header for SIT encap
+ * an IPv4 header + GRE header for GRE encap
+ */
+int ip6_err_gen_icmpv6_unreach(struct sk_buff *skb, int nhs, int type,
+ unsigned int data_len)
+{
+ struct in6_addr temp_saddr;
+ struct rt6_info *rt;
+ struct sk_buff *skb2;
+ u32 info = 0;
+
+ if (!pskb_may_pull(skb, nhs + sizeof(struct ipv6hdr) + 8))
+ return 1;
+
+ /* RFC 4884 (partial) support for ICMP extensions */
+ if (data_len < 128 || (data_len & 7) || skb->len < data_len)
+ data_len = 0;
+
+ skb2 = data_len ? skb_copy(skb, GFP_ATOMIC) : skb_clone(skb, GFP_ATOMIC);
+
+ if (!skb2)
+ return 1;
+
+ skb_dst_drop(skb2);
+ skb_pull(skb2, nhs);
+ skb_reset_network_header(skb2);
+
+ rt = rt6_lookup(dev_net_rcu(skb->dev), &ipv6_hdr(skb2)->saddr,
+ NULL, 0, skb, 0);
+
+ if (rt && rt->dst.dev)
+ skb2->dev = rt->dst.dev;
+
+ ipv6_addr_set_v4mapped(ip_hdr(skb)->saddr, &temp_saddr);
+
+ if (data_len) {
+ /* RFC 4884 (partial) support :
+ * insert 0 padding at the end, before the extensions
+ */
+ __skb_push(skb2, nhs);
+ skb_reset_network_header(skb2);
+ memmove(skb2->data, skb2->data + nhs, data_len - nhs);
+ memset(skb2->data + data_len - nhs, 0, nhs);
+ /* RFC 4884 4.5 : Length is measured in 64-bit words,
+ * and stored in reserved[0]
+ */
+ info = (data_len/8) << 24;
+ }
+ if (type == ICMP_TIME_EXCEEDED)
+ icmp6_send(skb2, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
+ info, &temp_saddr, IP6CB(skb2));
+ else
+ icmp6_send(skb2, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH,
+ info, &temp_saddr, IP6CB(skb2));
+ if (rt)
+ ip6_rt_put(rt);
+
+ kfree_skb(skb2);
+
+ return 0;
}
+EXPORT_SYMBOL(ip6_err_gen_icmpv6_unreach);
-static void icmpv6_echo_reply(struct sk_buff *skb)
+static enum skb_drop_reason icmpv6_echo_reply(struct sk_buff *skb)
{
+ struct net *net = dev_net_rcu(skb->dev);
struct sock *sk;
struct inet6_dev *idev;
struct ipv6_pinfo *np;
- struct in6_addr *saddr = NULL;
- struct icmp6hdr *icmph = (struct icmp6hdr *) skb->h.raw;
+ const struct in6_addr *saddr = NULL;
+ struct icmp6hdr *icmph = icmp6_hdr(skb);
+ bool apply_ratelimit = false;
struct icmp6hdr tmp_hdr;
- struct flowi fl;
+ struct flowi6 fl6;
struct icmpv6_msg msg;
struct dst_entry *dst;
- int err = 0;
- int hlimit;
- int tclass;
+ struct ipcm6_cookie ipc6;
+ u32 mark = IP6_REPLY_MARK(net, skb->mark);
+ SKB_DR(reason);
+ bool acast;
+ u8 type;
+
+ if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr) &&
+ net->ipv6.sysctl.icmpv6_echo_ignore_multicast)
+ return reason;
+
+ saddr = &ipv6_hdr(skb)->daddr;
- saddr = &skb->nh.ipv6h->daddr;
+ acast = ipv6_anycast_destination(skb_dst(skb), saddr);
+ if (acast && net->ipv6.sysctl.icmpv6_echo_ignore_anycast)
+ return reason;
- if (!ipv6_unicast_destination(skb))
+ if (!ipv6_unicast_destination(skb) &&
+ !(net->ipv6.sysctl.anycast_src_echo_reply && acast))
saddr = NULL;
+ if (icmph->icmp6_type == ICMPV6_EXT_ECHO_REQUEST)
+ type = ICMPV6_EXT_ECHO_REPLY;
+ else
+ type = ICMPV6_ECHO_REPLY;
+
memcpy(&tmp_hdr, icmph, sizeof(tmp_hdr));
- tmp_hdr.icmp6_type = ICMPV6_ECHO_REPLY;
+ tmp_hdr.icmp6_type = type;
- memset(&fl, 0, sizeof(fl));
- fl.proto = IPPROTO_ICMPV6;
- ipv6_addr_copy(&fl.fl6_dst, &skb->nh.ipv6h->saddr);
- if (saddr)
- ipv6_addr_copy(&fl.fl6_src, saddr);
- fl.oif = skb->dev->ifindex;
- fl.fl_icmp_type = ICMPV6_ECHO_REPLY;
- security_skb_classify_flow(skb, &fl);
+ memset(&fl6, 0, sizeof(fl6));
+ if (net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_ICMPV6_ECHO_REPLIES)
+ fl6.flowlabel = ip6_flowlabel(ipv6_hdr(skb));
- if (icmpv6_xmit_lock())
- return;
+ fl6.flowi6_proto = IPPROTO_ICMPV6;
+ fl6.daddr = ipv6_hdr(skb)->saddr;
+ if (saddr)
+ fl6.saddr = *saddr;
+ fl6.flowi6_oif = icmp6_iif(skb);
+ fl6.fl6_icmp_type = type;
+ fl6.flowi6_mark = mark;
+ fl6.flowi6_uid = sock_net_uid(net, NULL);
+ security_skb_classify_flow(skb, flowi6_to_flowi_common(&fl6));
- sk = icmpv6_socket->sk;
+ local_bh_disable();
+ sk = icmpv6_xmit_lock(net);
+ if (!sk)
+ goto out_bh_enable;
np = inet6_sk(sk);
- if (!fl.oif && ipv6_addr_is_multicast(&fl.fl6_dst))
- fl.oif = np->mcast_oif;
+ if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr))
+ fl6.flowi6_oif = READ_ONCE(np->mcast_oif);
+ else if (!fl6.flowi6_oif)
+ fl6.flowi6_oif = READ_ONCE(np->ucast_oif);
- err = ip6_dst_lookup(sk, &dst, &fl);
- if (err)
+ if (ip6_dst_lookup(net, sk, &dst, &fl6))
goto out;
- if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0)
+ dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), sk, 0);
+ if (IS_ERR(dst))
goto out;
- if (ipv6_addr_is_multicast(&fl.fl6_dst))
- hlimit = np->mcast_hops;
- else
- hlimit = np->hop_limit;
- if (hlimit < 0)
- hlimit = dst_metric(dst, RTAX_HOPLIMIT);
- if (hlimit < 0)
- hlimit = ipv6_get_hoplimit(dst->dev);
-
- tclass = np->tclass;
- if (tclass < 0)
- tclass = 0;
+ /* Check the ratelimit */
+ if ((!(skb->dev->flags & IFF_LOOPBACK) &&
+ !icmpv6_global_allow(net, ICMPV6_ECHO_REPLY, &apply_ratelimit)) ||
+ !icmpv6_xrlim_allow(sk, ICMPV6_ECHO_REPLY, &fl6, apply_ratelimit))
+ goto out_dst_release;
- idev = in6_dev_get(skb->dev);
+ idev = __in6_dev_get(skb->dev);
msg.skb = skb;
msg.offset = 0;
- msg.type = ICMPV6_ECHO_REPLY;
+ msg.type = type;
+
+ ipcm6_init_sk(&ipc6, sk);
+ ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst);
+ ipc6.tclass = ipv6_get_dsfield(ipv6_hdr(skb));
+ ipc6.sockc.mark = mark;
- err = ip6_append_data(sk, icmpv6_getfrag, &msg, skb->len + sizeof(struct icmp6hdr),
- sizeof(struct icmp6hdr), hlimit, tclass, NULL, &fl,
- (struct rt6_info*)dst, MSG_DONTWAIT);
+ if (icmph->icmp6_type == ICMPV6_EXT_ECHO_REQUEST)
+ if (!icmp_build_probe(skb, (struct icmphdr *)&tmp_hdr))
+ goto out_dst_release;
- if (err) {
+ if (ip6_append_data(sk, icmpv6_getfrag, &msg,
+ skb->len + sizeof(struct icmp6hdr),
+ sizeof(struct icmp6hdr), &ipc6, &fl6,
+ dst_rt6_info(dst), MSG_DONTWAIT)) {
+ __ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTERRORS);
ip6_flush_pending_frames(sk);
- goto out_put;
+ } else {
+ icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr,
+ skb->len + sizeof(struct icmp6hdr));
+ reason = SKB_CONSUMED;
}
- err = icmpv6_push_pending_frames(sk, &fl, &tmp_hdr, skb->len + sizeof(struct icmp6hdr));
-
- ICMP6_INC_STATS_BH(idev, ICMP6_MIB_OUTECHOREPLIES);
- ICMP6_INC_STATS_BH(idev, ICMP6_MIB_OUTMSGS);
-
-out_put:
- if (likely(idev != NULL))
- in6_dev_put(idev);
+out_dst_release:
dst_release(dst);
-out:
- icmpv6_xmit_unlock();
+out:
+ icmpv6_xmit_unlock(sk);
+out_bh_enable:
+ local_bh_enable();
+ return reason;
}
-static void icmpv6_notify(struct sk_buff *skb, int type, int code, __be32 info)
+enum skb_drop_reason icmpv6_notify(struct sk_buff *skb, u8 type,
+ u8 code, __be32 info)
{
- struct in6_addr *saddr, *daddr;
- struct inet6_protocol *ipprot;
- struct sock *sk;
+ struct inet6_skb_parm *opt = IP6CB(skb);
+ struct net *net = dev_net_rcu(skb->dev);
+ const struct inet6_protocol *ipprot;
+ enum skb_drop_reason reason;
int inner_offset;
- int hash;
+ __be16 frag_off;
u8 nexthdr;
- if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
- return;
+ reason = pskb_may_pull_reason(skb, sizeof(struct ipv6hdr));
+ if (reason != SKB_NOT_DROPPED_YET)
+ goto out;
+
+ seg6_icmp_srh(skb, opt);
nexthdr = ((struct ipv6hdr *)skb->data)->nexthdr;
if (ipv6_ext_hdr(nexthdr)) {
/* now skip over extension headers */
- inner_offset = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr);
- if (inner_offset<0)
- return;
+ inner_offset = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr),
+ &nexthdr, &frag_off);
+ if (inner_offset < 0) {
+ SKB_DR_SET(reason, IPV6_BAD_EXTHDR);
+ goto out;
+ }
} else {
inner_offset = sizeof(struct ipv6hdr);
}
/* Checkin header including 8 bytes of inner protocol header. */
- if (!pskb_may_pull(skb, inner_offset+8))
- return;
-
- saddr = &skb->nh.ipv6h->saddr;
- daddr = &skb->nh.ipv6h->daddr;
+ reason = pskb_may_pull_reason(skb, inner_offset + 8);
+ if (reason != SKB_NOT_DROPPED_YET)
+ goto out;
/* BUGGG_FUTURE: we should try to parse exthdrs in this packet.
Without this we will not able f.e. to make source routed
@@ -590,81 +1071,92 @@ static void icmpv6_notify(struct sk_buff *skb, int type, int code, __be32 info)
--ANK (980726)
*/
- hash = nexthdr & (MAX_INET_PROTOS - 1);
-
- rcu_read_lock();
- ipprot = rcu_dereference(inet6_protos[hash]);
+ ipprot = rcu_dereference(inet6_protos[nexthdr]);
if (ipprot && ipprot->err_handler)
- ipprot->err_handler(skb, NULL, type, code, inner_offset, info);
- rcu_read_unlock();
+ ipprot->err_handler(skb, opt, type, code, inner_offset, info);
- read_lock(&raw_v6_lock);
- if ((sk = sk_head(&raw_v6_htable[hash])) != NULL) {
- while((sk = __raw_v6_lookup(sk, nexthdr, daddr, saddr,
- IP6CB(skb)->iif))) {
- rawv6_err(sk, skb, NULL, type, code, inner_offset, info);
- sk = sk_next(sk);
- }
- }
- read_unlock(&raw_v6_lock);
+ raw6_icmp_error(skb, nexthdr, type, code, inner_offset, info);
+ return SKB_CONSUMED;
+
+out:
+ __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), ICMP6_MIB_INERRORS);
+ return reason;
}
-
+
/*
* Handle icmp messages
*/
-static int icmpv6_rcv(struct sk_buff **pskb)
+static int icmpv6_rcv(struct sk_buff *skb)
{
- struct sk_buff *skb = *pskb;
- struct net_device *dev = skb->dev;
+ enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
+ struct net *net = dev_net_rcu(skb->dev);
+ struct net_device *dev = icmp6_dev(skb);
struct inet6_dev *idev = __in6_dev_get(dev);
- struct in6_addr *saddr, *daddr;
- struct ipv6hdr *orig_hdr;
+ const struct in6_addr *saddr, *daddr;
struct icmp6hdr *hdr;
- int type;
+ u8 type;
- ICMP6_INC_STATS_BH(idev, ICMP6_MIB_INMSGS);
+ if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+ struct sec_path *sp = skb_sec_path(skb);
+ int nh;
- saddr = &skb->nh.ipv6h->saddr;
- daddr = &skb->nh.ipv6h->daddr;
+ if (!(sp && sp->xvec[sp->len - 1]->props.flags &
+ XFRM_STATE_ICMP)) {
+ reason = SKB_DROP_REASON_XFRM_POLICY;
+ goto drop_no_count;
+ }
- /* Perform checksum. */
- switch (skb->ip_summed) {
- case CHECKSUM_COMPLETE:
- if (!csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_ICMPV6,
- skb->csum))
- break;
- /* fall through */
- case CHECKSUM_NONE:
- skb->csum = ~csum_unfold(csum_ipv6_magic(saddr, daddr, skb->len,
- IPPROTO_ICMPV6, 0));
- if (__skb_checksum_complete(skb)) {
- LIMIT_NETDEBUG(KERN_DEBUG "ICMPv6 checksum failed [" NIP6_FMT " > " NIP6_FMT "]\n",
- NIP6(*saddr), NIP6(*daddr));
- goto discard_it;
+ if (!pskb_may_pull(skb, sizeof(*hdr) + sizeof(struct ipv6hdr)))
+ goto drop_no_count;
+
+ nh = skb_network_offset(skb);
+ skb_set_network_header(skb, sizeof(*hdr));
+
+ if (!xfrm6_policy_check_reverse(NULL, XFRM_POLICY_IN,
+ skb)) {
+ reason = SKB_DROP_REASON_XFRM_POLICY;
+ goto drop_no_count;
}
+
+ skb_set_network_header(skb, nh);
+ }
+
+ __ICMP6_INC_STATS(dev_net_rcu(dev), idev, ICMP6_MIB_INMSGS);
+
+ saddr = &ipv6_hdr(skb)->saddr;
+ daddr = &ipv6_hdr(skb)->daddr;
+
+ if (skb_checksum_validate(skb, IPPROTO_ICMPV6, ip6_compute_pseudo)) {
+ net_dbg_ratelimited("ICMPv6 checksum failed [%pI6c > %pI6c]\n",
+ saddr, daddr);
+ goto csum_error;
}
- if (!pskb_pull(skb, sizeof(struct icmp6hdr)))
+ if (!pskb_pull(skb, sizeof(*hdr)))
goto discard_it;
- hdr = (struct icmp6hdr *) skb->h.raw;
+ hdr = icmp6_hdr(skb);
type = hdr->icmp6_type;
- if (type >= ICMPV6_DEST_UNREACH && type <= ICMPV6_PARAMPROB)
- ICMP6_INC_STATS_OFFSET_BH(idev, ICMP6_MIB_INDESTUNREACHS, type - ICMPV6_DEST_UNREACH);
- else if (type >= ICMPV6_ECHO_REQUEST && type <= NDISC_REDIRECT)
- ICMP6_INC_STATS_OFFSET_BH(idev, ICMP6_MIB_INECHOS, type - ICMPV6_ECHO_REQUEST);
+ ICMP6MSGIN_INC_STATS(dev_net_rcu(dev), idev, type);
switch (type) {
case ICMPV6_ECHO_REQUEST:
- icmpv6_echo_reply(skb);
+ if (!net->ipv6.sysctl.icmpv6_echo_ignore_all)
+ reason = icmpv6_echo_reply(skb);
+ break;
+ case ICMPV6_EXT_ECHO_REQUEST:
+ if (!net->ipv6.sysctl.icmpv6_echo_ignore_all &&
+ READ_ONCE(net->ipv4.sysctl_icmp_echo_enable_probe))
+ reason = icmpv6_echo_reply(skb);
break;
case ICMPV6_ECHO_REPLY:
- /* we couldn't care less */
- break;
+ case ICMPV6_EXT_ECHO_REPLY:
+ ping_rcv(skb);
+ return 0;
case ICMPV6_PKT_TOOBIG:
/* BUGGG_FUTURE: if packet contains rthdr, we cannot update
@@ -674,19 +1166,15 @@ static int icmpv6_rcv(struct sk_buff **pskb)
*/
if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
goto discard_it;
- hdr = (struct icmp6hdr *) skb->h.raw;
- orig_hdr = (struct ipv6hdr *) (hdr + 1);
- rt6_pmtu_discovery(&orig_hdr->daddr, &orig_hdr->saddr, dev,
- ntohl(hdr->icmp6_mtu));
-
- /*
- * Drop through to notify
- */
+ hdr = icmp6_hdr(skb);
+ /* to notify */
+ fallthrough;
case ICMPV6_DEST_UNREACH:
case ICMPV6_TIME_EXCEED:
case ICMPV6_PARAMPROB:
- icmpv6_notify(skb, type, hdr->icmp6_code, hdr->icmp6_mtu);
+ reason = icmpv6_notify(skb, type, hdr->icmp6_code,
+ hdr->icmp6_mtu);
break;
case NDISC_ROUTER_SOLICITATION:
@@ -694,16 +1182,16 @@ static int icmpv6_rcv(struct sk_buff **pskb)
case NDISC_NEIGHBOUR_SOLICITATION:
case NDISC_NEIGHBOUR_ADVERTISEMENT:
case NDISC_REDIRECT:
- ndisc_rcv(skb);
+ reason = ndisc_rcv(skb);
break;
case ICMPV6_MGM_QUERY:
igmp6_event_query(skb);
- break;
+ return 0;
case ICMPV6_MGM_REPORT:
igmp6_event_report(skb);
- break;
+ return 0;
case ICMPV6_MGM_REDUCTION:
case ICMPV6_NI_QUERY:
@@ -716,98 +1204,101 @@ static int icmpv6_rcv(struct sk_buff **pskb)
break;
default:
- LIMIT_NETDEBUG(KERN_DEBUG "icmpv6: msg of unknown type\n");
-
/* informational */
if (type & ICMPV6_INFOMSG_MASK)
break;
- /*
- * error of unknown type.
- * must pass to upper level
+ net_dbg_ratelimited("icmpv6: msg of unknown type [%pI6c > %pI6c]\n",
+ saddr, daddr);
+
+ /*
+ * error of unknown type.
+ * must pass to upper level
*/
- icmpv6_notify(skb, type, hdr->icmp6_code, hdr->icmp6_mtu);
- };
- kfree_skb(skb);
+ reason = icmpv6_notify(skb, type, hdr->icmp6_code,
+ hdr->icmp6_mtu);
+ }
+
+ /* until the v6 path can be better sorted assume failure and
+ * preserve the status quo behaviour for the rest of the paths to here
+ */
+ if (reason)
+ kfree_skb_reason(skb, reason);
+ else
+ consume_skb(skb);
+
return 0;
+csum_error:
+ reason = SKB_DROP_REASON_ICMP_CSUM;
+ __ICMP6_INC_STATS(dev_net_rcu(dev), idev, ICMP6_MIB_CSUMERRORS);
discard_it:
- ICMP6_INC_STATS_BH(idev, ICMP6_MIB_INERRORS);
- kfree_skb(skb);
+ __ICMP6_INC_STATS(dev_net_rcu(dev), idev, ICMP6_MIB_INERRORS);
+drop_no_count:
+ kfree_skb_reason(skb, reason);
return 0;
}
-/*
- * Special lock-class for __icmpv6_socket:
- */
-static struct lock_class_key icmpv6_socket_sk_dst_lock_key;
+void icmpv6_flow_init(const struct sock *sk, struct flowi6 *fl6, u8 type,
+ const struct in6_addr *saddr,
+ const struct in6_addr *daddr, int oif)
+{
+ memset(fl6, 0, sizeof(*fl6));
+ fl6->saddr = *saddr;
+ fl6->daddr = *daddr;
+ fl6->flowi6_proto = IPPROTO_ICMPV6;
+ fl6->fl6_icmp_type = type;
+ fl6->fl6_icmp_code = 0;
+ fl6->flowi6_oif = oif;
+ security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6));
+}
-int __init icmpv6_init(struct net_proto_family *ops)
+int __init icmpv6_init(void)
{
struct sock *sk;
- int err, i, j;
+ int err, i;
for_each_possible_cpu(i) {
- err = sock_create_kern(PF_INET6, SOCK_RAW, IPPROTO_ICMPV6,
- &per_cpu(__icmpv6_socket, i));
+ err = inet_ctl_sock_create(&sk, PF_INET6,
+ SOCK_RAW, IPPROTO_ICMPV6, &init_net);
if (err < 0) {
- printk(KERN_ERR
- "Failed to initialize the ICMP6 control socket "
- "(err %d).\n",
+ pr_err("Failed to initialize the ICMP6 control socket (err %d)\n",
err);
- goto fail;
+ return err;
}
- sk = per_cpu(__icmpv6_socket, i)->sk;
- sk->sk_allocation = GFP_ATOMIC;
- /*
- * Split off their lock-class, because sk->sk_dst_lock
- * gets used from softirqs, which is safe for
- * __icmpv6_socket (because those never get directly used
- * via userspace syscalls), but unsafe for normal sockets.
- */
- lockdep_set_class(&sk->sk_dst_lock,
- &icmpv6_socket_sk_dst_lock_key);
+ per_cpu(ipv6_icmp_sk, i) = sk;
/* Enough space for 2 64K ICMP packets, including
* sk_buff struct overhead.
*/
- sk->sk_sndbuf =
- (2 * ((64 * 1024) + sizeof(struct sk_buff)));
-
- sk->sk_prot->unhash(sk);
+ sk->sk_sndbuf = 2 * SKB_TRUESIZE(64 * 1024);
}
-
- if (inet6_add_protocol(&icmpv6_protocol, IPPROTO_ICMPV6) < 0) {
- printk(KERN_ERR "Failed to register ICMP6 protocol\n");
- err = -EAGAIN;
+ err = -EAGAIN;
+ if (inet6_add_protocol(&icmpv6_protocol, IPPROTO_ICMPV6) < 0)
goto fail;
- }
+ err = inet6_register_icmp_sender(icmp6_send);
+ if (err)
+ goto sender_reg_err;
return 0;
- fail:
- for (j = 0; j < i; j++) {
- if (!cpu_possible(j))
- continue;
- sock_release(per_cpu(__icmpv6_socket, j));
- }
-
+sender_reg_err:
+ inet6_del_protocol(&icmpv6_protocol, IPPROTO_ICMPV6);
+fail:
+ pr_err("Failed to register ICMP6 protocol\n");
return err;
}
void icmpv6_cleanup(void)
{
- int i;
-
- for_each_possible_cpu(i) {
- sock_release(per_cpu(__icmpv6_socket, i));
- }
+ inet6_unregister_icmp_sender(icmp6_send);
inet6_del_protocol(&icmpv6_protocol, IPPROTO_ICMPV6);
}
+
static const struct icmp6_err {
int err;
int fatal;
@@ -832,9 +1323,17 @@ static const struct icmp6_err {
.err = ECONNREFUSED,
.fatal = 1,
},
+ { /* POLICY_FAIL */
+ .err = EACCES,
+ .fatal = 1,
+ },
+ { /* REJECT_ROUTE */
+ .err = EACCES,
+ .fatal = 1,
+ },
};
-int icmpv6_err_convert(int type, int code, int *err)
+int icmpv6_err_convert(u8 type, u8 code, int *err)
{
int fatal = 0;
@@ -843,7 +1342,7 @@ int icmpv6_err_convert(int type, int code, int *err)
switch (type) {
case ICMPV6_DEST_UNREACH:
fatal = 1;
- if (code <= ICMPV6_PORT_UNREACH) {
+ if (code < ARRAY_SIZE(tab_unreach)) {
*err = tab_unreach[code].err;
fatal = tab_unreach[code].fatal;
}
@@ -852,7 +1351,7 @@ int icmpv6_err_convert(int type, int code, int *err)
case ICMPV6_PKT_TOOBIG:
*err = EMSGSIZE;
break;
-
+
case ICMPV6_PARAMPROB:
*err = EPROTO;
fatal = 1;
@@ -861,22 +1360,95 @@ int icmpv6_err_convert(int type, int code, int *err)
case ICMPV6_TIME_EXCEED:
*err = EHOSTUNREACH;
break;
- };
+ }
return fatal;
}
+EXPORT_SYMBOL(icmpv6_err_convert);
#ifdef CONFIG_SYSCTL
-ctl_table ipv6_icmp_table[] = {
+
+static u32 icmpv6_errors_extension_mask_all =
+ GENMASK_U8(ICMP_ERR_EXT_COUNT - 1, 0);
+
+static struct ctl_table ipv6_icmp_table_template[] = {
{
- .ctl_name = NET_IPV6_ICMP_RATELIMIT,
.procname = "ratelimit",
- .data = &sysctl_icmpv6_time,
+ .data = &init_net.ipv6.sysctl.icmpv6_time,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = proc_dointvec_ms_jiffies,
+ },
+ {
+ .procname = "echo_ignore_all",
+ .data = &init_net.ipv6.sysctl.icmpv6_echo_ignore_all,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ },
+ {
+ .procname = "echo_ignore_multicast",
+ .data = &init_net.ipv6.sysctl.icmpv6_echo_ignore_multicast,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ },
+ {
+ .procname = "echo_ignore_anycast",
+ .data = &init_net.ipv6.sysctl.icmpv6_echo_ignore_anycast,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ },
+ {
+ .procname = "ratemask",
+ .data = &init_net.ipv6.sysctl.icmpv6_ratemask_ptr,
+ .maxlen = ICMPV6_MSG_MAX + 1,
+ .mode = 0644,
+ .proc_handler = proc_do_large_bitmap,
+ },
+ {
+ .procname = "error_anycast_as_unicast",
+ .data = &init_net.ipv6.sysctl.icmpv6_error_anycast_as_unicast,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {
+ .procname = "errors_extension_mask",
+ .data = &init_net.ipv6.sysctl.icmpv6_errors_extension_mask,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = &icmpv6_errors_extension_mask_all,
},
- { .ctl_name = 0 },
};
-#endif
+struct ctl_table * __net_init ipv6_icmp_sysctl_init(struct net *net)
+{
+ struct ctl_table *table;
+
+ table = kmemdup(ipv6_icmp_table_template,
+ sizeof(ipv6_icmp_table_template),
+ GFP_KERNEL);
+
+ if (table) {
+ table[0].data = &net->ipv6.sysctl.icmpv6_time;
+ table[1].data = &net->ipv6.sysctl.icmpv6_echo_ignore_all;
+ table[2].data = &net->ipv6.sysctl.icmpv6_echo_ignore_multicast;
+ table[3].data = &net->ipv6.sysctl.icmpv6_echo_ignore_anycast;
+ table[4].data = &net->ipv6.sysctl.icmpv6_ratemask_ptr;
+ table[5].data = &net->ipv6.sysctl.icmpv6_error_anycast_as_unicast;
+ table[6].data = &net->ipv6.sysctl.icmpv6_errors_extension_mask;
+ }
+ return table;
+}
+
+size_t ipv6_icmp_sysctl_table_size(void)
+{
+ return ARRAY_SIZE(ipv6_icmp_table_template);
+}
+#endif
diff --git a/net/ipv6/ila/Makefile b/net/ipv6/ila/Makefile
new file mode 100644
index 000000000000..1bc88ed7edc5
--- /dev/null
+++ b/net/ipv6/ila/Makefile
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# Makefile for ILA module
+#
+
+obj-$(CONFIG_IPV6_ILA) += ila.o
+
+ila-objs := ila_main.o ila_common.o ila_lwt.o ila_xlat.o
diff --git a/net/ipv6/ila/ila.h b/net/ipv6/ila/ila.h
new file mode 100644
index 000000000000..85b92917849b
--- /dev/null
+++ b/net/ipv6/ila/ila.h
@@ -0,0 +1,126 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2015 Tom Herbert <tom@herbertland.com>
+ */
+
+#ifndef __ILA_H
+#define __ILA_H
+
+#include <linux/errno.h>
+#include <linux/ip.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/socket.h>
+#include <linux/skbuff.h>
+#include <linux/types.h>
+#include <net/checksum.h>
+#include <net/genetlink.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <uapi/linux/ila.h>
+
+struct ila_locator {
+ union {
+ __u8 v8[8];
+ __be16 v16[4];
+ __be32 v32[2];
+ __be64 v64;
+ };
+};
+
+struct ila_identifier {
+ union {
+ struct {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ u8 __space:4;
+ u8 csum_neutral:1;
+ u8 type:3;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+ u8 type:3;
+ u8 csum_neutral:1;
+ u8 __space:4;
+#else
+#error "Adjust your <asm/byteorder.h> defines"
+#endif
+ u8 __space2[7];
+ };
+ __u8 v8[8];
+ __be16 v16[4];
+ __be32 v32[2];
+ __be64 v64;
+ };
+};
+
+#define CSUM_NEUTRAL_FLAG htonl(0x10000000)
+
+struct ila_addr {
+ union {
+ struct in6_addr addr;
+ struct {
+ struct ila_locator loc;
+ struct ila_identifier ident;
+ };
+ };
+};
+
+static inline struct ila_addr *ila_a2i(struct in6_addr *addr)
+{
+ return (struct ila_addr *)addr;
+}
+
+struct ila_params {
+ struct ila_locator locator;
+ struct ila_locator locator_match;
+ __wsum csum_diff;
+ u8 csum_mode;
+ u8 ident_type;
+};
+
+static inline __wsum compute_csum_diff8(const __be32 *from, const __be32 *to)
+{
+ __be32 diff[] = {
+ ~from[0], ~from[1], to[0], to[1],
+ };
+
+ return csum_partial(diff, sizeof(diff), 0);
+}
+
+static inline bool ila_csum_neutral_set(struct ila_identifier ident)
+{
+ return !!(ident.csum_neutral);
+}
+
+void ila_update_ipv6_locator(struct sk_buff *skb, struct ila_params *p,
+ bool set_csum_neutral);
+
+void ila_init_saved_csum(struct ila_params *p);
+
+struct ila_net {
+ struct {
+ struct rhashtable rhash_table;
+ spinlock_t *locks; /* Bucket locks for entry manipulation */
+ unsigned int locks_mask;
+ bool hooks_registered;
+ } xlat;
+};
+
+int ila_lwt_init(void);
+void ila_lwt_fini(void);
+
+int ila_xlat_init_net(struct net *net);
+void ila_xlat_pre_exit_net(struct net *net);
+void ila_xlat_exit_net(struct net *net);
+
+int ila_xlat_nl_cmd_add_mapping(struct sk_buff *skb, struct genl_info *info);
+int ila_xlat_nl_cmd_del_mapping(struct sk_buff *skb, struct genl_info *info);
+int ila_xlat_nl_cmd_get_mapping(struct sk_buff *skb, struct genl_info *info);
+int ila_xlat_nl_cmd_flush(struct sk_buff *skb, struct genl_info *info);
+int ila_xlat_nl_dump_start(struct netlink_callback *cb);
+int ila_xlat_nl_dump_done(struct netlink_callback *cb);
+int ila_xlat_nl_dump(struct sk_buff *skb, struct netlink_callback *cb);
+
+extern unsigned int ila_net_id;
+
+extern struct genl_family ila_nl_family;
+
+#endif /* __ILA_H */
diff --git a/net/ipv6/ila/ila_common.c b/net/ipv6/ila/ila_common.c
new file mode 100644
index 000000000000..b8d43ed4689d
--- /dev/null
+++ b/net/ipv6/ila/ila_common.c
@@ -0,0 +1,155 @@
+#include <linux/errno.h>
+#include <linux/ip.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/socket.h>
+#include <linux/types.h>
+#include <net/checksum.h>
+#include <net/ip.h>
+#include <net/ip6_fib.h>
+#include <net/lwtunnel.h>
+#include <net/protocol.h>
+#include <uapi/linux/ila.h>
+#include "ila.h"
+
+void ila_init_saved_csum(struct ila_params *p)
+{
+ if (!p->locator_match.v64)
+ return;
+
+ p->csum_diff = compute_csum_diff8(
+ (__be32 *)&p->locator,
+ (__be32 *)&p->locator_match);
+}
+
+static __wsum get_csum_diff_iaddr(struct ila_addr *iaddr, struct ila_params *p)
+{
+ if (p->locator_match.v64)
+ return p->csum_diff;
+ else
+ return compute_csum_diff8((__be32 *)&p->locator,
+ (__be32 *)&iaddr->loc);
+}
+
+static __wsum get_csum_diff(struct ipv6hdr *ip6h, struct ila_params *p)
+{
+ return get_csum_diff_iaddr(ila_a2i(&ip6h->daddr), p);
+}
+
+static void ila_csum_do_neutral_fmt(struct ila_addr *iaddr,
+ struct ila_params *p)
+{
+ __sum16 *adjust = (__force __sum16 *)&iaddr->ident.v16[3];
+ __wsum diff, fval;
+
+ diff = get_csum_diff_iaddr(iaddr, p);
+
+ fval = (__force __wsum)(ila_csum_neutral_set(iaddr->ident) ?
+ CSUM_NEUTRAL_FLAG : ~CSUM_NEUTRAL_FLAG);
+
+ diff = csum_add(diff, fval);
+
+ *adjust = ~csum_fold(csum_add(diff, csum_unfold(*adjust)));
+
+ /* Flip the csum-neutral bit. Either we are doing a SIR->ILA
+ * translation with ILA_CSUM_NEUTRAL_MAP as the csum_method
+ * and the C-bit is not set, or we are doing an ILA-SIR
+ * tranlsation and the C-bit is set.
+ */
+ iaddr->ident.csum_neutral ^= 1;
+}
+
+static void ila_csum_do_neutral_nofmt(struct ila_addr *iaddr,
+ struct ila_params *p)
+{
+ __sum16 *adjust = (__force __sum16 *)&iaddr->ident.v16[3];
+ __wsum diff;
+
+ diff = get_csum_diff_iaddr(iaddr, p);
+
+ *adjust = ~csum_fold(csum_add(diff, csum_unfold(*adjust)));
+}
+
+static void ila_csum_adjust_transport(struct sk_buff *skb,
+ struct ila_params *p)
+{
+ size_t nhoff = sizeof(struct ipv6hdr);
+ struct ipv6hdr *ip6h = ipv6_hdr(skb);
+ __wsum diff;
+
+ switch (ip6h->nexthdr) {
+ case NEXTHDR_TCP:
+ if (likely(pskb_may_pull(skb, nhoff + sizeof(struct tcphdr)))) {
+ struct tcphdr *th = (struct tcphdr *)
+ (skb_network_header(skb) + nhoff);
+
+ diff = get_csum_diff(ip6h, p);
+ inet_proto_csum_replace_by_diff(&th->check, skb,
+ diff, true, true);
+ }
+ break;
+ case NEXTHDR_UDP:
+ if (likely(pskb_may_pull(skb, nhoff + sizeof(struct udphdr)))) {
+ struct udphdr *uh = (struct udphdr *)
+ (skb_network_header(skb) + nhoff);
+
+ if (uh->check || skb->ip_summed == CHECKSUM_PARTIAL) {
+ diff = get_csum_diff(ip6h, p);
+ inet_proto_csum_replace_by_diff(&uh->check, skb,
+ diff, true, true);
+ if (!uh->check)
+ uh->check = CSUM_MANGLED_0;
+ }
+ }
+ break;
+ case NEXTHDR_ICMP:
+ if (likely(pskb_may_pull(skb,
+ nhoff + sizeof(struct icmp6hdr)))) {
+ struct icmp6hdr *ih = (struct icmp6hdr *)
+ (skb_network_header(skb) + nhoff);
+
+ diff = get_csum_diff(ip6h, p);
+ inet_proto_csum_replace_by_diff(&ih->icmp6_cksum, skb,
+ diff, true, true);
+ }
+ break;
+ }
+}
+
+void ila_update_ipv6_locator(struct sk_buff *skb, struct ila_params *p,
+ bool sir2ila)
+{
+ struct ipv6hdr *ip6h = ipv6_hdr(skb);
+ struct ila_addr *iaddr = ila_a2i(&ip6h->daddr);
+
+ switch (p->csum_mode) {
+ case ILA_CSUM_ADJUST_TRANSPORT:
+ ila_csum_adjust_transport(skb, p);
+ break;
+ case ILA_CSUM_NEUTRAL_MAP:
+ if (sir2ila) {
+ if (WARN_ON(ila_csum_neutral_set(iaddr->ident))) {
+ /* Checksum flag should never be
+ * set in a formatted SIR address.
+ */
+ break;
+ }
+ } else if (!ila_csum_neutral_set(iaddr->ident)) {
+ /* ILA to SIR translation and C-bit isn't
+ * set so we're good.
+ */
+ break;
+ }
+ ila_csum_do_neutral_fmt(iaddr, p);
+ break;
+ case ILA_CSUM_NEUTRAL_MAP_AUTO:
+ ila_csum_do_neutral_nofmt(iaddr, p);
+ break;
+ case ILA_CSUM_NO_ACTION:
+ break;
+ }
+
+ /* Now change destination address */
+ iaddr->loc = p->locator;
+}
diff --git a/net/ipv6/ila/ila_lwt.c b/net/ipv6/ila/ila_lwt.c
new file mode 100644
index 000000000000..7bb9edc5c28c
--- /dev/null
+++ b/net/ipv6/ila/ila_lwt.c
@@ -0,0 +1,332 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/errno.h>
+#include <linux/ip.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/socket.h>
+#include <linux/types.h>
+#include <net/checksum.h>
+#include <net/dst_cache.h>
+#include <net/ip.h>
+#include <net/ip6_fib.h>
+#include <net/ip6_route.h>
+#include <net/lwtunnel.h>
+#include <net/protocol.h>
+#include <uapi/linux/ila.h>
+#include "ila.h"
+
+struct ila_lwt {
+ struct ila_params p;
+ struct dst_cache dst_cache;
+ u32 connected : 1;
+ u32 lwt_output : 1;
+};
+
+static inline struct ila_lwt *ila_lwt_lwtunnel(
+ struct lwtunnel_state *lwt)
+{
+ return (struct ila_lwt *)lwt->data;
+}
+
+static inline struct ila_params *ila_params_lwtunnel(
+ struct lwtunnel_state *lwt)
+{
+ return &ila_lwt_lwtunnel(lwt)->p;
+}
+
+static int ila_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ struct dst_entry *orig_dst = skb_dst(skb);
+ struct rt6_info *rt = dst_rt6_info(orig_dst);
+ struct ila_lwt *ilwt = ila_lwt_lwtunnel(orig_dst->lwtstate);
+ struct dst_entry *dst;
+ int err = -EINVAL;
+
+ if (skb->protocol != htons(ETH_P_IPV6))
+ goto drop;
+
+ if (ilwt->lwt_output)
+ ila_update_ipv6_locator(skb,
+ ila_params_lwtunnel(orig_dst->lwtstate),
+ true);
+
+ if (rt->rt6i_flags & (RTF_GATEWAY | RTF_CACHE)) {
+ /* Already have a next hop address in route, no need for
+ * dest cache route.
+ */
+ return orig_dst->lwtstate->orig_output(net, sk, skb);
+ }
+
+ local_bh_disable();
+ dst = dst_cache_get(&ilwt->dst_cache);
+ local_bh_enable();
+ if (unlikely(!dst)) {
+ struct ipv6hdr *ip6h = ipv6_hdr(skb);
+ struct flowi6 fl6;
+
+ /* Lookup a route for the new destination. Take into
+ * account that the base route may already have a gateway.
+ */
+
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.flowi6_oif = dst_dev(orig_dst)->ifindex;
+ fl6.flowi6_iif = LOOPBACK_IFINDEX;
+ fl6.daddr = *rt6_nexthop(dst_rt6_info(orig_dst),
+ &ip6h->daddr);
+
+ dst = ip6_route_output(net, NULL, &fl6);
+ if (dst->error) {
+ err = -EHOSTUNREACH;
+ dst_release(dst);
+ goto drop;
+ }
+
+ dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0);
+ if (IS_ERR(dst)) {
+ err = PTR_ERR(dst);
+ goto drop;
+ }
+
+ /* cache only if we don't create a dst reference loop */
+ if (ilwt->connected && orig_dst->lwtstate != dst->lwtstate) {
+ local_bh_disable();
+ dst_cache_set_ip6(&ilwt->dst_cache, dst, &fl6.saddr);
+ local_bh_enable();
+ }
+ }
+
+ skb_dst_drop(skb);
+ skb_dst_set(skb, dst);
+ return dst_output(net, sk, skb);
+
+drop:
+ kfree_skb(skb);
+ return err;
+}
+
+static int ila_input(struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ struct ila_lwt *ilwt = ila_lwt_lwtunnel(dst->lwtstate);
+
+ if (skb->protocol != htons(ETH_P_IPV6))
+ goto drop;
+
+ if (!ilwt->lwt_output)
+ ila_update_ipv6_locator(skb,
+ ila_params_lwtunnel(dst->lwtstate),
+ false);
+
+ return dst->lwtstate->orig_input(skb);
+
+drop:
+ kfree_skb(skb);
+ return -EINVAL;
+}
+
+static const struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = {
+ [ILA_ATTR_LOCATOR] = { .type = NLA_U64, },
+ [ILA_ATTR_CSUM_MODE] = { .type = NLA_U8, },
+ [ILA_ATTR_IDENT_TYPE] = { .type = NLA_U8, },
+ [ILA_ATTR_HOOK_TYPE] = { .type = NLA_U8, },
+};
+
+static int ila_build_state(struct net *net, struct nlattr *nla,
+ unsigned int family, const void *cfg,
+ struct lwtunnel_state **ts,
+ struct netlink_ext_ack *extack)
+{
+ struct ila_lwt *ilwt;
+ struct ila_params *p;
+ struct nlattr *tb[ILA_ATTR_MAX + 1];
+ struct lwtunnel_state *newts;
+ const struct fib6_config *cfg6 = cfg;
+ struct ila_addr *iaddr;
+ u8 ident_type = ILA_ATYPE_USE_FORMAT;
+ u8 hook_type = ILA_HOOK_ROUTE_OUTPUT;
+ u8 csum_mode = ILA_CSUM_NO_ACTION;
+ bool lwt_output = true;
+ u8 eff_ident_type;
+ int ret;
+
+ if (family != AF_INET6)
+ return -EINVAL;
+
+ ret = nla_parse_nested_deprecated(tb, ILA_ATTR_MAX, nla,
+ ila_nl_policy, extack);
+ if (ret < 0)
+ return ret;
+
+ if (!tb[ILA_ATTR_LOCATOR])
+ return -EINVAL;
+
+ iaddr = (struct ila_addr *)&cfg6->fc_dst;
+
+ if (tb[ILA_ATTR_IDENT_TYPE])
+ ident_type = nla_get_u8(tb[ILA_ATTR_IDENT_TYPE]);
+
+ if (ident_type == ILA_ATYPE_USE_FORMAT) {
+ /* Infer identifier type from type field in formatted
+ * identifier.
+ */
+
+ if (cfg6->fc_dst_len < 8 * sizeof(struct ila_locator) + 3) {
+ /* Need to have full locator and at least type field
+ * included in destination
+ */
+ return -EINVAL;
+ }
+
+ eff_ident_type = iaddr->ident.type;
+ } else {
+ eff_ident_type = ident_type;
+ }
+
+ switch (eff_ident_type) {
+ case ILA_ATYPE_IID:
+ /* Don't allow ILA for IID type */
+ return -EINVAL;
+ case ILA_ATYPE_LUID:
+ break;
+ case ILA_ATYPE_VIRT_V4:
+ case ILA_ATYPE_VIRT_UNI_V6:
+ case ILA_ATYPE_VIRT_MULTI_V6:
+ case ILA_ATYPE_NONLOCAL_ADDR:
+ /* These ILA formats are not supported yet. */
+ default:
+ return -EINVAL;
+ }
+
+ if (tb[ILA_ATTR_HOOK_TYPE])
+ hook_type = nla_get_u8(tb[ILA_ATTR_HOOK_TYPE]);
+
+ switch (hook_type) {
+ case ILA_HOOK_ROUTE_OUTPUT:
+ lwt_output = true;
+ break;
+ case ILA_HOOK_ROUTE_INPUT:
+ lwt_output = false;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (tb[ILA_ATTR_CSUM_MODE])
+ csum_mode = nla_get_u8(tb[ILA_ATTR_CSUM_MODE]);
+
+ if (csum_mode == ILA_CSUM_NEUTRAL_MAP &&
+ ila_csum_neutral_set(iaddr->ident)) {
+ /* Don't allow translation if checksum neutral bit is
+ * configured and it's set in the SIR address.
+ */
+ return -EINVAL;
+ }
+
+ newts = lwtunnel_state_alloc(sizeof(*ilwt));
+ if (!newts)
+ return -ENOMEM;
+
+ ilwt = ila_lwt_lwtunnel(newts);
+ ret = dst_cache_init(&ilwt->dst_cache, GFP_ATOMIC);
+ if (ret) {
+ kfree(newts);
+ return ret;
+ }
+
+ ilwt->lwt_output = !!lwt_output;
+
+ p = ila_params_lwtunnel(newts);
+
+ p->csum_mode = csum_mode;
+ p->ident_type = ident_type;
+ p->locator.v64 = (__force __be64)nla_get_u64(tb[ILA_ATTR_LOCATOR]);
+
+ /* Precompute checksum difference for translation since we
+ * know both the old locator and the new one.
+ */
+ p->locator_match = iaddr->loc;
+
+ ila_init_saved_csum(p);
+
+ newts->type = LWTUNNEL_ENCAP_ILA;
+ newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT |
+ LWTUNNEL_STATE_INPUT_REDIRECT;
+
+ if (cfg6->fc_dst_len == 8 * sizeof(struct in6_addr))
+ ilwt->connected = 1;
+
+ *ts = newts;
+
+ return 0;
+}
+
+static void ila_destroy_state(struct lwtunnel_state *lwt)
+{
+ dst_cache_destroy(&ila_lwt_lwtunnel(lwt)->dst_cache);
+}
+
+static int ila_fill_encap_info(struct sk_buff *skb,
+ struct lwtunnel_state *lwtstate)
+{
+ struct ila_params *p = ila_params_lwtunnel(lwtstate);
+ struct ila_lwt *ilwt = ila_lwt_lwtunnel(lwtstate);
+
+ if (nla_put_u64_64bit(skb, ILA_ATTR_LOCATOR, (__force u64)p->locator.v64,
+ ILA_ATTR_PAD))
+ goto nla_put_failure;
+
+ if (nla_put_u8(skb, ILA_ATTR_CSUM_MODE, (__force u8)p->csum_mode))
+ goto nla_put_failure;
+
+ if (nla_put_u8(skb, ILA_ATTR_IDENT_TYPE, (__force u8)p->ident_type))
+ goto nla_put_failure;
+
+ if (nla_put_u8(skb, ILA_ATTR_HOOK_TYPE,
+ ilwt->lwt_output ? ILA_HOOK_ROUTE_OUTPUT :
+ ILA_HOOK_ROUTE_INPUT))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+static int ila_encap_nlsize(struct lwtunnel_state *lwtstate)
+{
+ return nla_total_size_64bit(sizeof(u64)) + /* ILA_ATTR_LOCATOR */
+ nla_total_size(sizeof(u8)) + /* ILA_ATTR_CSUM_MODE */
+ nla_total_size(sizeof(u8)) + /* ILA_ATTR_IDENT_TYPE */
+ nla_total_size(sizeof(u8)) + /* ILA_ATTR_HOOK_TYPE */
+ 0;
+}
+
+static int ila_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
+{
+ struct ila_params *a_p = ila_params_lwtunnel(a);
+ struct ila_params *b_p = ila_params_lwtunnel(b);
+
+ return (a_p->locator.v64 != b_p->locator.v64);
+}
+
+static const struct lwtunnel_encap_ops ila_encap_ops = {
+ .build_state = ila_build_state,
+ .destroy_state = ila_destroy_state,
+ .output = ila_output,
+ .input = ila_input,
+ .fill_encap = ila_fill_encap_info,
+ .get_encap_size = ila_encap_nlsize,
+ .cmp_encap = ila_encap_cmp,
+ .owner = THIS_MODULE,
+};
+
+int ila_lwt_init(void)
+{
+ return lwtunnel_encap_add_ops(&ila_encap_ops, LWTUNNEL_ENCAP_ILA);
+}
+
+void ila_lwt_fini(void)
+{
+ lwtunnel_encap_del_ops(&ila_encap_ops, LWTUNNEL_ENCAP_ILA);
+}
diff --git a/net/ipv6/ila/ila_main.c b/net/ipv6/ila/ila_main.c
new file mode 100644
index 000000000000..976c78efbae1
--- /dev/null
+++ b/net/ipv6/ila/ila_main.c
@@ -0,0 +1,129 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <net/genetlink.h>
+#include <net/netns/generic.h>
+#include <uapi/linux/genetlink.h>
+#include "ila.h"
+
+static const struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = {
+ [ILA_ATTR_LOCATOR] = { .type = NLA_U64, },
+ [ILA_ATTR_LOCATOR_MATCH] = { .type = NLA_U64, },
+ [ILA_ATTR_IFINDEX] = { .type = NLA_U32, },
+ [ILA_ATTR_CSUM_MODE] = { .type = NLA_U8, },
+ [ILA_ATTR_IDENT_TYPE] = { .type = NLA_U8, },
+};
+
+static const struct genl_ops ila_nl_ops[] = {
+ {
+ .cmd = ILA_CMD_ADD,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = ila_xlat_nl_cmd_add_mapping,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = ILA_CMD_DEL,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = ila_xlat_nl_cmd_del_mapping,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = ILA_CMD_FLUSH,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = ila_xlat_nl_cmd_flush,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = ILA_CMD_GET,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = ila_xlat_nl_cmd_get_mapping,
+ .start = ila_xlat_nl_dump_start,
+ .dumpit = ila_xlat_nl_dump,
+ .done = ila_xlat_nl_dump_done,
+ },
+};
+
+unsigned int ila_net_id;
+
+struct genl_family ila_nl_family __ro_after_init = {
+ .hdrsize = 0,
+ .name = ILA_GENL_NAME,
+ .version = ILA_GENL_VERSION,
+ .maxattr = ILA_ATTR_MAX,
+ .policy = ila_nl_policy,
+ .netnsok = true,
+ .parallel_ops = true,
+ .module = THIS_MODULE,
+ .ops = ila_nl_ops,
+ .n_ops = ARRAY_SIZE(ila_nl_ops),
+ .resv_start_op = ILA_CMD_FLUSH + 1,
+};
+
+static __net_init int ila_init_net(struct net *net)
+{
+ int err;
+
+ err = ila_xlat_init_net(net);
+ if (err)
+ goto ila_xlat_init_fail;
+
+ return 0;
+
+ila_xlat_init_fail:
+ return err;
+}
+
+static __net_exit void ila_pre_exit_net(struct net *net)
+{
+ ila_xlat_pre_exit_net(net);
+}
+
+static __net_exit void ila_exit_net(struct net *net)
+{
+ ila_xlat_exit_net(net);
+}
+
+static struct pernet_operations ila_net_ops = {
+ .init = ila_init_net,
+ .pre_exit = ila_pre_exit_net,
+ .exit = ila_exit_net,
+ .id = &ila_net_id,
+ .size = sizeof(struct ila_net),
+};
+
+static int __init ila_init(void)
+{
+ int ret;
+
+ ret = register_pernet_device(&ila_net_ops);
+ if (ret)
+ goto register_device_fail;
+
+ ret = genl_register_family(&ila_nl_family);
+ if (ret)
+ goto register_family_fail;
+
+ ret = ila_lwt_init();
+ if (ret)
+ goto fail_lwt;
+
+ return 0;
+
+fail_lwt:
+ genl_unregister_family(&ila_nl_family);
+register_family_fail:
+ unregister_pernet_device(&ila_net_ops);
+register_device_fail:
+ return ret;
+}
+
+static void __exit ila_fini(void)
+{
+ ila_lwt_fini();
+ genl_unregister_family(&ila_nl_family);
+ unregister_pernet_device(&ila_net_ops);
+}
+
+module_init(ila_init);
+module_exit(ila_fini);
+MODULE_AUTHOR("Tom Herbert <tom@herbertland.com>");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("IPv6: Identifier Locator Addressing (ILA)");
diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c
new file mode 100644
index 000000000000..1d41b2ab4884
--- /dev/null
+++ b/net/ipv6/ila/ila_xlat.c
@@ -0,0 +1,665 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/jhash.h>
+#include <linux/netfilter.h>
+#include <linux/rcupdate.h>
+#include <linux/rhashtable.h>
+#include <linux/vmalloc.h>
+#include <net/genetlink.h>
+#include <net/netns/generic.h>
+#include <uapi/linux/genetlink.h>
+#include "ila.h"
+
+struct ila_xlat_params {
+ struct ila_params ip;
+ int ifindex;
+};
+
+struct ila_map {
+ struct ila_xlat_params xp;
+ struct rhash_head node;
+ struct ila_map __rcu *next;
+ struct rcu_head rcu;
+};
+
+#define MAX_LOCKS 1024
+#define LOCKS_PER_CPU 10
+
+static int alloc_ila_locks(struct ila_net *ilan)
+{
+ return alloc_bucket_spinlocks(&ilan->xlat.locks, &ilan->xlat.locks_mask,
+ MAX_LOCKS, LOCKS_PER_CPU,
+ GFP_KERNEL);
+}
+
+static u32 hashrnd __read_mostly;
+static __always_inline void __ila_hash_secret_init(void)
+{
+ net_get_random_once(&hashrnd, sizeof(hashrnd));
+}
+
+static inline u32 ila_locator_hash(struct ila_locator loc)
+{
+ u32 *v = (u32 *)loc.v32;
+
+ __ila_hash_secret_init();
+ return jhash_2words(v[0], v[1], hashrnd);
+}
+
+static inline spinlock_t *ila_get_lock(struct ila_net *ilan,
+ struct ila_locator loc)
+{
+ return &ilan->xlat.locks[ila_locator_hash(loc) & ilan->xlat.locks_mask];
+}
+
+static inline int ila_cmp_wildcards(struct ila_map *ila,
+ struct ila_addr *iaddr, int ifindex)
+{
+ return (ila->xp.ifindex && ila->xp.ifindex != ifindex);
+}
+
+static inline int ila_cmp_params(struct ila_map *ila,
+ struct ila_xlat_params *xp)
+{
+ return (ila->xp.ifindex != xp->ifindex);
+}
+
+static int ila_cmpfn(struct rhashtable_compare_arg *arg,
+ const void *obj)
+{
+ const struct ila_map *ila = obj;
+
+ return (ila->xp.ip.locator_match.v64 != *(__be64 *)arg->key);
+}
+
+static inline int ila_order(struct ila_map *ila)
+{
+ int score = 0;
+
+ if (ila->xp.ifindex)
+ score += 1 << 1;
+
+ return score;
+}
+
+static const struct rhashtable_params rht_params = {
+ .nelem_hint = 1024,
+ .head_offset = offsetof(struct ila_map, node),
+ .key_offset = offsetof(struct ila_map, xp.ip.locator_match),
+ .key_len = sizeof(u64), /* identifier */
+ .max_size = 1048576,
+ .min_size = 256,
+ .automatic_shrinking = true,
+ .obj_cmpfn = ila_cmpfn,
+};
+
+static int parse_nl_config(struct genl_info *info,
+ struct ila_xlat_params *xp)
+{
+ memset(xp, 0, sizeof(*xp));
+
+ if (info->attrs[ILA_ATTR_LOCATOR])
+ xp->ip.locator.v64 = (__force __be64)nla_get_u64(
+ info->attrs[ILA_ATTR_LOCATOR]);
+
+ if (info->attrs[ILA_ATTR_LOCATOR_MATCH])
+ xp->ip.locator_match.v64 = (__force __be64)nla_get_u64(
+ info->attrs[ILA_ATTR_LOCATOR_MATCH]);
+
+ xp->ip.csum_mode = nla_get_u8_default(info->attrs[ILA_ATTR_CSUM_MODE],
+ ILA_CSUM_NO_ACTION);
+
+ xp->ip.ident_type = nla_get_u8_default(info->attrs[ILA_ATTR_IDENT_TYPE],
+ ILA_ATYPE_USE_FORMAT);
+
+ if (info->attrs[ILA_ATTR_IFINDEX])
+ xp->ifindex = nla_get_s32(info->attrs[ILA_ATTR_IFINDEX]);
+
+ return 0;
+}
+
+/* Must be called with rcu readlock */
+static inline struct ila_map *ila_lookup_wildcards(struct ila_addr *iaddr,
+ int ifindex,
+ struct ila_net *ilan)
+{
+ struct ila_map *ila;
+
+ ila = rhashtable_lookup_fast(&ilan->xlat.rhash_table, &iaddr->loc,
+ rht_params);
+ while (ila) {
+ if (!ila_cmp_wildcards(ila, iaddr, ifindex))
+ return ila;
+ ila = rcu_access_pointer(ila->next);
+ }
+
+ return NULL;
+}
+
+/* Must be called with rcu readlock */
+static inline struct ila_map *ila_lookup_by_params(struct ila_xlat_params *xp,
+ struct ila_net *ilan)
+{
+ struct ila_map *ila;
+
+ ila = rhashtable_lookup_fast(&ilan->xlat.rhash_table,
+ &xp->ip.locator_match,
+ rht_params);
+ while (ila) {
+ if (!ila_cmp_params(ila, xp))
+ return ila;
+ ila = rcu_access_pointer(ila->next);
+ }
+
+ return NULL;
+}
+
+static inline void ila_release(struct ila_map *ila)
+{
+ kfree_rcu(ila, rcu);
+}
+
+static void ila_free_node(struct ila_map *ila)
+{
+ struct ila_map *next;
+
+ /* Assume rcu_readlock held */
+ while (ila) {
+ next = rcu_access_pointer(ila->next);
+ ila_release(ila);
+ ila = next;
+ }
+}
+
+static void ila_free_cb(void *ptr, void *arg)
+{
+ ila_free_node((struct ila_map *)ptr);
+}
+
+static int ila_xlat_addr(struct sk_buff *skb, bool sir2ila);
+
+static unsigned int
+ila_nf_input(void *priv,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ ila_xlat_addr(skb, false);
+ return NF_ACCEPT;
+}
+
+static const struct nf_hook_ops ila_nf_hook_ops[] = {
+ {
+ .hook = ila_nf_input,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_PRE_ROUTING,
+ .priority = -1,
+ },
+};
+
+static DEFINE_MUTEX(ila_mutex);
+
+static int ila_add_mapping(struct net *net, struct ila_xlat_params *xp)
+{
+ struct ila_net *ilan = net_generic(net, ila_net_id);
+ struct ila_map *ila, *head;
+ spinlock_t *lock = ila_get_lock(ilan, xp->ip.locator_match);
+ int err = 0, order;
+
+ if (!READ_ONCE(ilan->xlat.hooks_registered)) {
+ /* We defer registering net hooks in the namespace until the
+ * first mapping is added.
+ */
+ mutex_lock(&ila_mutex);
+ if (!ilan->xlat.hooks_registered) {
+ err = nf_register_net_hooks(net, ila_nf_hook_ops,
+ ARRAY_SIZE(ila_nf_hook_ops));
+ if (!err)
+ WRITE_ONCE(ilan->xlat.hooks_registered, true);
+ }
+ mutex_unlock(&ila_mutex);
+ if (err)
+ return err;
+ }
+
+ ila = kzalloc(sizeof(*ila), GFP_KERNEL);
+ if (!ila)
+ return -ENOMEM;
+
+ ila_init_saved_csum(&xp->ip);
+
+ ila->xp = *xp;
+
+ order = ila_order(ila);
+
+ spin_lock(lock);
+
+ head = rhashtable_lookup_fast(&ilan->xlat.rhash_table,
+ &xp->ip.locator_match,
+ rht_params);
+ if (!head) {
+ /* New entry for the rhash_table */
+ err = rhashtable_lookup_insert_fast(&ilan->xlat.rhash_table,
+ &ila->node, rht_params);
+ } else {
+ struct ila_map *tila = head, *prev = NULL;
+
+ do {
+ if (!ila_cmp_params(tila, xp)) {
+ err = -EEXIST;
+ goto out;
+ }
+
+ if (order > ila_order(tila))
+ break;
+
+ prev = tila;
+ tila = rcu_dereference_protected(tila->next,
+ lockdep_is_held(lock));
+ } while (tila);
+
+ if (prev) {
+ /* Insert in sub list of head */
+ RCU_INIT_POINTER(ila->next, tila);
+ rcu_assign_pointer(prev->next, ila);
+ } else {
+ /* Make this ila new head */
+ RCU_INIT_POINTER(ila->next, head);
+ err = rhashtable_replace_fast(&ilan->xlat.rhash_table,
+ &head->node,
+ &ila->node, rht_params);
+ if (err)
+ goto out;
+ }
+ }
+
+out:
+ spin_unlock(lock);
+
+ if (err)
+ kfree(ila);
+
+ return err;
+}
+
+static int ila_del_mapping(struct net *net, struct ila_xlat_params *xp)
+{
+ struct ila_net *ilan = net_generic(net, ila_net_id);
+ struct ila_map *ila, *head, *prev;
+ spinlock_t *lock = ila_get_lock(ilan, xp->ip.locator_match);
+ int err = -ENOENT;
+
+ spin_lock(lock);
+
+ head = rhashtable_lookup_fast(&ilan->xlat.rhash_table,
+ &xp->ip.locator_match, rht_params);
+ ila = head;
+
+ prev = NULL;
+
+ while (ila) {
+ if (ila_cmp_params(ila, xp)) {
+ prev = ila;
+ ila = rcu_dereference_protected(ila->next,
+ lockdep_is_held(lock));
+ continue;
+ }
+
+ err = 0;
+
+ if (prev) {
+ /* Not head, just delete from list */
+ rcu_assign_pointer(prev->next, ila->next);
+ } else {
+ /* It is the head. If there is something in the
+ * sublist we need to make a new head.
+ */
+ head = rcu_dereference_protected(ila->next,
+ lockdep_is_held(lock));
+ if (head) {
+ /* Put first entry in the sublist into the
+ * table
+ */
+ err = rhashtable_replace_fast(
+ &ilan->xlat.rhash_table, &ila->node,
+ &head->node, rht_params);
+ if (err)
+ goto out;
+ } else {
+ /* Entry no longer used */
+ err = rhashtable_remove_fast(
+ &ilan->xlat.rhash_table,
+ &ila->node, rht_params);
+ }
+ }
+
+ ila_release(ila);
+
+ break;
+ }
+
+out:
+ spin_unlock(lock);
+
+ return err;
+}
+
+int ila_xlat_nl_cmd_add_mapping(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net *net = genl_info_net(info);
+ struct ila_xlat_params p;
+ int err;
+
+ err = parse_nl_config(info, &p);
+ if (err)
+ return err;
+
+ return ila_add_mapping(net, &p);
+}
+
+int ila_xlat_nl_cmd_del_mapping(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net *net = genl_info_net(info);
+ struct ila_xlat_params xp;
+ int err;
+
+ err = parse_nl_config(info, &xp);
+ if (err)
+ return err;
+
+ ila_del_mapping(net, &xp);
+
+ return 0;
+}
+
+static inline spinlock_t *lock_from_ila_map(struct ila_net *ilan,
+ struct ila_map *ila)
+{
+ return ila_get_lock(ilan, ila->xp.ip.locator_match);
+}
+
+int ila_xlat_nl_cmd_flush(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net *net = genl_info_net(info);
+ struct ila_net *ilan = net_generic(net, ila_net_id);
+ struct rhashtable_iter iter;
+ struct ila_map *ila;
+ spinlock_t *lock;
+ int ret = 0;
+
+ rhashtable_walk_enter(&ilan->xlat.rhash_table, &iter);
+ rhashtable_walk_start(&iter);
+
+ for (;;) {
+ ila = rhashtable_walk_next(&iter);
+
+ if (IS_ERR(ila)) {
+ if (PTR_ERR(ila) == -EAGAIN)
+ continue;
+ ret = PTR_ERR(ila);
+ goto done;
+ } else if (!ila) {
+ break;
+ }
+
+ lock = lock_from_ila_map(ilan, ila);
+
+ spin_lock(lock);
+
+ ret = rhashtable_remove_fast(&ilan->xlat.rhash_table,
+ &ila->node, rht_params);
+ if (!ret)
+ ila_free_node(ila);
+
+ spin_unlock(lock);
+
+ if (ret)
+ break;
+ }
+
+done:
+ rhashtable_walk_stop(&iter);
+ rhashtable_walk_exit(&iter);
+ return ret;
+}
+
+static int ila_fill_info(struct ila_map *ila, struct sk_buff *msg)
+{
+ if (nla_put_u64_64bit(msg, ILA_ATTR_LOCATOR,
+ (__force u64)ila->xp.ip.locator.v64,
+ ILA_ATTR_PAD) ||
+ nla_put_u64_64bit(msg, ILA_ATTR_LOCATOR_MATCH,
+ (__force u64)ila->xp.ip.locator_match.v64,
+ ILA_ATTR_PAD) ||
+ nla_put_s32(msg, ILA_ATTR_IFINDEX, ila->xp.ifindex) ||
+ nla_put_u8(msg, ILA_ATTR_CSUM_MODE, ila->xp.ip.csum_mode) ||
+ nla_put_u8(msg, ILA_ATTR_IDENT_TYPE, ila->xp.ip.ident_type))
+ return -1;
+
+ return 0;
+}
+
+static int ila_dump_info(struct ila_map *ila,
+ u32 portid, u32 seq, u32 flags,
+ struct sk_buff *skb, u8 cmd)
+{
+ void *hdr;
+
+ hdr = genlmsg_put(skb, portid, seq, &ila_nl_family, flags, cmd);
+ if (!hdr)
+ return -ENOMEM;
+
+ if (ila_fill_info(ila, skb) < 0)
+ goto nla_put_failure;
+
+ genlmsg_end(skb, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(skb, hdr);
+ return -EMSGSIZE;
+}
+
+int ila_xlat_nl_cmd_get_mapping(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net *net = genl_info_net(info);
+ struct ila_net *ilan = net_generic(net, ila_net_id);
+ struct sk_buff *msg;
+ struct ila_xlat_params xp;
+ struct ila_map *ila;
+ int ret;
+
+ ret = parse_nl_config(info, &xp);
+ if (ret)
+ return ret;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ rcu_read_lock();
+
+ ret = -ESRCH;
+ ila = ila_lookup_by_params(&xp, ilan);
+ if (ila) {
+ ret = ila_dump_info(ila,
+ info->snd_portid,
+ info->snd_seq, 0, msg,
+ info->genlhdr->cmd);
+ }
+
+ rcu_read_unlock();
+
+ if (ret < 0)
+ goto out_free;
+
+ return genlmsg_reply(msg, info);
+
+out_free:
+ nlmsg_free(msg);
+ return ret;
+}
+
+struct ila_dump_iter {
+ struct rhashtable_iter rhiter;
+ int skip;
+};
+
+int ila_xlat_nl_dump_start(struct netlink_callback *cb)
+{
+ struct net *net = sock_net(cb->skb->sk);
+ struct ila_net *ilan = net_generic(net, ila_net_id);
+ struct ila_dump_iter *iter;
+
+ iter = kmalloc(sizeof(*iter), GFP_KERNEL);
+ if (!iter)
+ return -ENOMEM;
+
+ rhashtable_walk_enter(&ilan->xlat.rhash_table, &iter->rhiter);
+
+ iter->skip = 0;
+ cb->args[0] = (long)iter;
+
+ return 0;
+}
+
+int ila_xlat_nl_dump_done(struct netlink_callback *cb)
+{
+ struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args[0];
+
+ rhashtable_walk_exit(&iter->rhiter);
+
+ kfree(iter);
+
+ return 0;
+}
+
+int ila_xlat_nl_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args[0];
+ struct rhashtable_iter *rhiter = &iter->rhiter;
+ int skip = iter->skip;
+ struct ila_map *ila;
+ int ret;
+
+ rhashtable_walk_start(rhiter);
+
+ /* Get first entry */
+ ila = rhashtable_walk_peek(rhiter);
+
+ if (ila && !IS_ERR(ila) && skip) {
+ /* Skip over visited entries */
+
+ while (ila && skip) {
+ /* Skip over any ila entries in this list that we
+ * have already dumped.
+ */
+ ila = rcu_access_pointer(ila->next);
+ skip--;
+ }
+ }
+
+ skip = 0;
+
+ for (;;) {
+ if (IS_ERR(ila)) {
+ ret = PTR_ERR(ila);
+ if (ret == -EAGAIN) {
+ /* Table has changed and iter has reset. Return
+ * -EAGAIN to the application even if we have
+ * written data to the skb. The application
+ * needs to deal with this.
+ */
+
+ goto out_ret;
+ } else {
+ break;
+ }
+ } else if (!ila) {
+ ret = 0;
+ break;
+ }
+
+ while (ila) {
+ ret = ila_dump_info(ila, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ skb, ILA_CMD_GET);
+ if (ret)
+ goto out;
+
+ skip++;
+ ila = rcu_access_pointer(ila->next);
+ }
+
+ skip = 0;
+ ila = rhashtable_walk_next(rhiter);
+ }
+
+out:
+ iter->skip = skip;
+ ret = (skb->len ? : ret);
+
+out_ret:
+ rhashtable_walk_stop(rhiter);
+ return ret;
+}
+
+int ila_xlat_init_net(struct net *net)
+{
+ struct ila_net *ilan = net_generic(net, ila_net_id);
+ int err;
+
+ err = alloc_ila_locks(ilan);
+ if (err)
+ return err;
+
+ err = rhashtable_init(&ilan->xlat.rhash_table, &rht_params);
+ if (err) {
+ free_bucket_spinlocks(ilan->xlat.locks);
+ return err;
+ }
+
+ return 0;
+}
+
+void ila_xlat_pre_exit_net(struct net *net)
+{
+ struct ila_net *ilan = net_generic(net, ila_net_id);
+
+ if (ilan->xlat.hooks_registered)
+ nf_unregister_net_hooks(net, ila_nf_hook_ops,
+ ARRAY_SIZE(ila_nf_hook_ops));
+}
+
+void ila_xlat_exit_net(struct net *net)
+{
+ struct ila_net *ilan = net_generic(net, ila_net_id);
+
+ rhashtable_free_and_destroy(&ilan->xlat.rhash_table, ila_free_cb, NULL);
+
+ free_bucket_spinlocks(ilan->xlat.locks);
+}
+
+static int ila_xlat_addr(struct sk_buff *skb, bool sir2ila)
+{
+ struct ila_map *ila;
+ struct ipv6hdr *ip6h = ipv6_hdr(skb);
+ struct net *net = dev_net(skb->dev);
+ struct ila_net *ilan = net_generic(net, ila_net_id);
+ struct ila_addr *iaddr = ila_a2i(&ip6h->daddr);
+
+ /* Assumes skb contains a valid IPv6 header that is pulled */
+
+ /* No check here that ILA type in the mapping matches what is in the
+ * address. We assume that whatever sender gaves us can be translated.
+ * The checksum mode however is relevant.
+ */
+
+ rcu_read_lock();
+
+ ila = ila_lookup_wildcards(iaddr, skb->dev->ifindex, ilan);
+ if (ila)
+ ila_update_ipv6_locator(skb, &ila->xp.ip, sir2ila);
+
+ rcu_read_unlock();
+
+ return 0;
+}
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index c700302ad51a..ea5cf3fdfdd6 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -6,17 +7,13 @@
* Support for INET6 connection oriented protocols.
*
* Authors: See the TCPv6 sources
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or(at your option) any later version.
*/
#include <linux/module.h>
#include <linux/in6.h>
#include <linux/ipv6.h>
#include <linux/jhash.h>
+#include <linux/slab.h>
#include <net/addrconf.h>
#include <net/inet_connection_sock.h>
@@ -25,175 +22,117 @@
#include <net/ip6_route.h>
#include <net/sock.h>
#include <net/inet6_connection_sock.h>
+#include <net/sock_reuseport.h>
-int inet6_csk_bind_conflict(const struct sock *sk,
- const struct inet_bind_bucket *tb)
-{
- const struct sock *sk2;
- const struct hlist_node *node;
-
- /* We must walk the whole port owner list in this case. -DaveM */
- sk_for_each_bound(sk2, node, &tb->owners) {
- if (sk != sk2 &&
- (!sk->sk_bound_dev_if ||
- !sk2->sk_bound_dev_if ||
- sk->sk_bound_dev_if == sk2->sk_bound_dev_if) &&
- (!sk->sk_reuse || !sk2->sk_reuse ||
- sk2->sk_state == TCP_LISTEN) &&
- ipv6_rcv_saddr_equal(sk, sk2))
- break;
- }
-
- return node != NULL;
-}
-
-EXPORT_SYMBOL_GPL(inet6_csk_bind_conflict);
-
-/*
- * request_sock (formerly open request) hash tables.
- */
-static u32 inet6_synq_hash(const struct in6_addr *raddr, const __be16 rport,
- const u32 rnd, const u16 synq_hsize)
-{
- u32 a = (__force u32)raddr->s6_addr32[0];
- u32 b = (__force u32)raddr->s6_addr32[1];
- u32 c = (__force u32)raddr->s6_addr32[2];
-
- a += JHASH_GOLDEN_RATIO;
- b += JHASH_GOLDEN_RATIO;
- c += rnd;
- __jhash_mix(a, b, c);
-
- a += (__force u32)raddr->s6_addr32[3];
- b += (__force u32)rport;
- __jhash_mix(a, b, c);
-
- return c & (synq_hsize - 1);
-}
-
-struct request_sock *inet6_csk_search_req(const struct sock *sk,
- struct request_sock ***prevp,
- const __be16 rport,
- const struct in6_addr *raddr,
- const struct in6_addr *laddr,
- const int iif)
+struct dst_entry *inet6_csk_route_req(const struct sock *sk,
+ struct flowi6 *fl6,
+ const struct request_sock *req,
+ u8 proto)
{
- const struct inet_connection_sock *icsk = inet_csk(sk);
- struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
- struct request_sock *req, **prev;
-
- for (prev = &lopt->syn_table[inet6_synq_hash(raddr, rport,
- lopt->hash_rnd,
- lopt->nr_table_entries)];
- (req = *prev) != NULL;
- prev = &req->dl_next) {
- const struct inet6_request_sock *treq = inet6_rsk(req);
-
- if (inet_rsk(req)->rmt_port == rport &&
- req->rsk_ops->family == AF_INET6 &&
- ipv6_addr_equal(&treq->rmt_addr, raddr) &&
- ipv6_addr_equal(&treq->loc_addr, laddr) &&
- (!treq->iif || treq->iif == iif)) {
- BUG_TRAP(req->sk == NULL);
- *prevp = prev;
- return req;
- }
- }
+ struct inet_request_sock *ireq = inet_rsk(req);
+ const struct ipv6_pinfo *np = inet6_sk(sk);
+ struct in6_addr *final_p, final;
+ struct dst_entry *dst;
- return NULL;
+ memset(fl6, 0, sizeof(*fl6));
+ fl6->flowi6_proto = proto;
+ fl6->daddr = ireq->ir_v6_rmt_addr;
+ rcu_read_lock();
+ final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final);
+ rcu_read_unlock();
+ fl6->saddr = ireq->ir_v6_loc_addr;
+ fl6->flowi6_oif = ireq->ir_iif;
+ fl6->flowi6_mark = ireq->ir_mark;
+ fl6->fl6_dport = ireq->ir_rmt_port;
+ fl6->fl6_sport = htons(ireq->ir_num);
+ fl6->flowi6_uid = sk_uid(sk);
+ security_req_classify_flow(req, flowi6_to_flowi_common(fl6));
+
+ dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p);
+ if (IS_ERR(dst))
+ return NULL;
+
+ return dst;
}
-EXPORT_SYMBOL_GPL(inet6_csk_search_req);
-
-void inet6_csk_reqsk_queue_hash_add(struct sock *sk,
- struct request_sock *req,
- const unsigned long timeout)
+static inline
+struct dst_entry *__inet6_csk_dst_check(struct sock *sk, u32 cookie)
{
- struct inet_connection_sock *icsk = inet_csk(sk);
- struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
- const u32 h = inet6_synq_hash(&inet6_rsk(req)->rmt_addr,
- inet_rsk(req)->rmt_port,
- lopt->hash_rnd, lopt->nr_table_entries);
-
- reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout);
- inet_csk_reqsk_queue_added(sk, timeout);
+ return __sk_dst_check(sk, cookie);
}
-EXPORT_SYMBOL_GPL(inet6_csk_reqsk_queue_hash_add);
-
-void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
+static struct dst_entry *inet6_csk_route_socket(struct sock *sk,
+ struct flowi6 *fl6)
{
+ struct inet_sock *inet = inet_sk(sk);
struct ipv6_pinfo *np = inet6_sk(sk);
- struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) uaddr;
-
- sin6->sin6_family = AF_INET6;
- ipv6_addr_copy(&sin6->sin6_addr, &np->daddr);
- sin6->sin6_port = inet_sk(sk)->dport;
- /* We do not store received flowlabel for TCP */
- sin6->sin6_flowinfo = 0;
- sin6->sin6_scope_id = 0;
- if (sk->sk_bound_dev_if &&
- ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL)
- sin6->sin6_scope_id = sk->sk_bound_dev_if;
-}
+ struct in6_addr *final_p, final;
+ struct dst_entry *dst;
-EXPORT_SYMBOL_GPL(inet6_csk_addr2sockaddr);
+ memset(fl6, 0, sizeof(*fl6));
+ fl6->flowi6_proto = sk->sk_protocol;
+ fl6->daddr = sk->sk_v6_daddr;
+ fl6->saddr = np->saddr;
+ fl6->flowlabel = np->flow_label;
+ IP6_ECN_flow_xmit(sk, fl6->flowlabel);
+ fl6->flowi6_oif = sk->sk_bound_dev_if;
+ fl6->flowi6_mark = sk->sk_mark;
+ fl6->fl6_sport = inet->inet_sport;
+ fl6->fl6_dport = inet->inet_dport;
+ fl6->flowi6_uid = sk_uid(sk);
+ security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6));
+
+ rcu_read_lock();
+ final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final);
+ rcu_read_unlock();
+
+ dst = __inet6_csk_dst_check(sk, np->dst_cookie);
+ if (!dst) {
+ dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p);
+
+ if (!IS_ERR(dst))
+ ip6_dst_store(sk, dst, false, false);
+ }
+ return dst;
+}
-int inet6_csk_xmit(struct sk_buff *skb, struct sock *sk, int ipfragok)
+int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl_unused)
{
- struct inet_sock *inet = inet_sk(sk);
struct ipv6_pinfo *np = inet6_sk(sk);
- struct flowi fl;
+ struct flowi6 fl6;
struct dst_entry *dst;
- struct in6_addr *final_p = NULL, final;
-
- memset(&fl, 0, sizeof(fl));
- fl.proto = sk->sk_protocol;
- ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
- ipv6_addr_copy(&fl.fl6_src, &np->saddr);
- fl.fl6_flowlabel = np->flow_label;
- IP6_ECN_flow_xmit(sk, fl.fl6_flowlabel);
- fl.oif = sk->sk_bound_dev_if;
- fl.fl_ip_sport = inet->sport;
- fl.fl_ip_dport = inet->dport;
- security_sk_classify_flow(sk, &fl);
-
- if (np->opt && np->opt->srcrt) {
- struct rt0_hdr *rt0 = (struct rt0_hdr *)np->opt->srcrt;
- ipv6_addr_copy(&final, &fl.fl6_dst);
- ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
- final_p = &final;
+ int res;
+
+ dst = inet6_csk_route_socket(sk, &fl6);
+ if (IS_ERR(dst)) {
+ WRITE_ONCE(sk->sk_err_soft, -PTR_ERR(dst));
+ sk->sk_route_caps = 0;
+ kfree_skb(skb);
+ return PTR_ERR(dst);
}
- dst = __sk_dst_check(sk, np->dst_cookie);
-
- if (dst == NULL) {
- int err = ip6_dst_lookup(sk, &dst, &fl);
-
- if (err) {
- sk->sk_err_soft = -err;
- kfree_skb(skb);
- return err;
- }
+ rcu_read_lock();
+ skb_dst_set_noref(skb, dst);
- if (final_p)
- ipv6_addr_copy(&fl.fl6_dst, final_p);
-
- if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) {
- sk->sk_route_caps = 0;
- kfree_skb(skb);
- return err;
- }
+ /* Restore final destination back after routing done */
+ fl6.daddr = sk->sk_v6_daddr;
- __ip6_dst_store(sk, dst, NULL, NULL);
- }
+ res = ip6_xmit(sk, skb, &fl6, sk->sk_mark, rcu_dereference(np->opt),
+ np->tclass, READ_ONCE(sk->sk_priority));
+ rcu_read_unlock();
+ return res;
+}
+EXPORT_SYMBOL_GPL(inet6_csk_xmit);
- skb->dst = dst_clone(dst);
+struct dst_entry *inet6_csk_update_pmtu(struct sock *sk, u32 mtu)
+{
+ struct flowi6 fl6;
+ struct dst_entry *dst = inet6_csk_route_socket(sk, &fl6);
- /* Restore final destination back after routing done */
- ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
+ if (IS_ERR(dst))
+ return NULL;
+ dst->ops->update_pmtu(dst, sk, NULL, mtu, true);
- return ip6_xmit(sk, skb, &fl, np->opt, 0);
+ dst = inet6_csk_route_socket(sk, &fl6);
+ return IS_ERR(dst) ? NULL : dst;
}
-
-EXPORT_SYMBOL_GPL(inet6_csk_xmit);
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index b7e5bae0e347..5e1da088d8e1 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -6,48 +7,38 @@
* Generic INET6 transport hashtables
*
* Authors: Lotsa people, from code originally in tcp, generalised here
- * by Arnaldo Carvalho de Melo <acme@mandriva.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
+ * by Arnaldo Carvalho de Melo <acme@mandriva.com>
*/
#include <linux/module.h>
#include <linux/random.h>
+#include <net/addrconf.h>
+#include <net/hotdata.h>
#include <net/inet_connection_sock.h>
#include <net/inet_hashtables.h>
#include <net/inet6_hashtables.h>
+#include <net/secure_seq.h>
#include <net/ip.h>
+#include <net/sock_reuseport.h>
+#include <net/tcp.h>
-void __inet6_hash(struct inet_hashinfo *hashinfo,
- struct sock *sk)
+u32 inet6_ehashfn(const struct net *net,
+ const struct in6_addr *laddr, const u16 lport,
+ const struct in6_addr *faddr, const __be16 fport)
{
- struct hlist_head *list;
- rwlock_t *lock;
-
- BUG_TRAP(sk_unhashed(sk));
-
- if (sk->sk_state == TCP_LISTEN) {
- list = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
- lock = &hashinfo->lhash_lock;
- inet_listen_wlock(hashinfo);
- } else {
- unsigned int hash;
- sk->sk_hash = hash = inet6_sk_ehashfn(sk);
- hash &= (hashinfo->ehash_size - 1);
- list = &hashinfo->ehash[hash].chain;
- lock = &hashinfo->ehash[hash].lock;
- write_lock(lock);
- }
+ u32 lhash, fhash;
+
+ net_get_random_once(&inet6_ehash_secret, sizeof(inet6_ehash_secret));
+ net_get_random_once(&tcp_ipv6_hash_secret, sizeof(tcp_ipv6_hash_secret));
- __sk_add_node(sk, list);
- sock_prot_inc_use(sk->sk_prot);
- write_unlock(lock);
+ lhash = (__force u32)laddr->s6_addr32[3];
+ fhash = __ipv6_addr_jhash(faddr, tcp_ipv6_hash_secret);
+
+ return lport + __inet6_ehashfn(lhash, 0, fhash, fport,
+ inet6_ehash_secret + net_hash_mix(net));
}
-EXPORT_SYMBOL(__inet6_hash);
+EXPORT_SYMBOL_GPL(inet6_ehashfn);
/*
* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
@@ -55,289 +46,325 @@ EXPORT_SYMBOL(__inet6_hash);
*
* The sockhash lock must be held as a reader here.
*/
-struct sock *__inet6_lookup_established(struct inet_hashinfo *hashinfo,
- const struct in6_addr *saddr,
- const __be16 sport,
- const struct in6_addr *daddr,
- const u16 hnum,
- const int dif)
+struct sock *__inet6_lookup_established(const struct net *net,
+ const struct in6_addr *saddr,
+ const __be16 sport,
+ const struct in6_addr *daddr,
+ const u16 hnum,
+ const int dif, const int sdif)
{
- struct sock *sk;
- const struct hlist_node *node;
const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
- /* Optimize here for direct hit, only listening connections can
- * have wildcards anyways.
- */
- unsigned int hash = inet6_ehashfn(daddr, hnum, saddr, sport);
- struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash);
-
- prefetch(head->chain.first);
- read_lock(&head->lock);
- sk_for_each(sk, node, &head->chain) {
- /* For IPV6 do the cheaper port and family tests first. */
- if (INET6_MATCH(sk, hash, saddr, daddr, ports, dif))
- goto hit; /* You sunk my battleship! */
- }
- /* Must check for a TIME_WAIT'er before going to listener hash. */
- sk_for_each(sk, node, &(head + hashinfo->ehash_size)->chain) {
- const struct inet_timewait_sock *tw = inet_twsk(sk);
-
- if(*((__portpair *)&(tw->tw_dport)) == ports &&
- sk->sk_family == PF_INET6) {
- const struct inet6_timewait_sock *tw6 = inet6_twsk(sk);
-
- if (ipv6_addr_equal(&tw6->tw_v6_daddr, saddr) &&
- ipv6_addr_equal(&tw6->tw_v6_rcv_saddr, daddr) &&
- (!sk->sk_bound_dev_if || sk->sk_bound_dev_if == dif))
- goto hit;
+ const struct hlist_nulls_node *node;
+ struct inet_ehash_bucket *head;
+ struct inet_hashinfo *hashinfo;
+ unsigned int hash, slot;
+ struct sock *sk;
+
+ hashinfo = net->ipv4.tcp_death_row.hashinfo;
+ hash = inet6_ehashfn(net, daddr, hnum, saddr, sport);
+ slot = hash & hashinfo->ehash_mask;
+ head = &hashinfo->ehash[slot];
+begin:
+ sk_nulls_for_each_rcu(sk, node, &head->chain) {
+ if (sk->sk_hash != hash)
+ continue;
+ if (!inet6_match(net, sk, saddr, daddr, ports, dif, sdif))
+ continue;
+ if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt)))
+ goto out;
+
+ if (unlikely(!inet6_match(net, sk, saddr, daddr, ports, dif, sdif))) {
+ sock_gen_put(sk);
+ goto begin;
}
+ goto found;
}
- read_unlock(&head->lock);
- return NULL;
-
-hit:
- sock_hold(sk);
- read_unlock(&head->lock);
+ if (get_nulls_value(node) != slot)
+ goto begin;
+out:
+ sk = NULL;
+found:
return sk;
}
EXPORT_SYMBOL(__inet6_lookup_established);
-struct sock *inet6_lookup_listener(struct inet_hashinfo *hashinfo,
- const struct in6_addr *daddr,
- const unsigned short hnum, const int dif)
+static inline int compute_score(struct sock *sk, const struct net *net,
+ const unsigned short hnum,
+ const struct in6_addr *daddr,
+ const int dif, const int sdif)
{
- struct sock *sk;
- const struct hlist_node *node;
- struct sock *result = NULL;
+ int score = -1;
+
+ if (net_eq(sock_net(sk), net) && inet_sk(sk)->inet_num == hnum &&
+ sk->sk_family == PF_INET6) {
+ if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr))
+ return -1;
+
+ if (!inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif))
+ return -1;
+
+ score = sk->sk_bound_dev_if ? 2 : 1;
+ if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id())
+ score++;
+ }
+ return score;
+}
+
+/**
+ * inet6_lookup_reuseport() - execute reuseport logic on AF_INET6 socket if necessary.
+ * @net: network namespace.
+ * @sk: AF_INET6 socket, must be in TCP_LISTEN state for TCP or TCP_CLOSE for UDP.
+ * @skb: context for a potential SK_REUSEPORT program.
+ * @doff: header offset.
+ * @saddr: source address.
+ * @sport: source port.
+ * @daddr: destination address.
+ * @hnum: destination port in host byte order.
+ * @ehashfn: hash function used to generate the fallback hash.
+ *
+ * Return: NULL if sk doesn't have SO_REUSEPORT set, otherwise a pointer to
+ * the selected sock or an error.
+ */
+struct sock *inet6_lookup_reuseport(const struct net *net, struct sock *sk,
+ struct sk_buff *skb, int doff,
+ const struct in6_addr *saddr,
+ __be16 sport,
+ const struct in6_addr *daddr,
+ unsigned short hnum,
+ inet6_ehashfn_t *ehashfn)
+{
+ struct sock *reuse_sk = NULL;
+ u32 phash;
+
+ if (sk->sk_reuseport) {
+ phash = INDIRECT_CALL_INET(ehashfn, udp6_ehashfn, inet6_ehashfn,
+ net, daddr, hnum, saddr, sport);
+ reuse_sk = reuseport_select_sock(sk, phash, skb, doff);
+ }
+ return reuse_sk;
+}
+EXPORT_SYMBOL_GPL(inet6_lookup_reuseport);
+
+/* called with rcu_read_lock() */
+static struct sock *inet6_lhash2_lookup(const struct net *net,
+ struct inet_listen_hashbucket *ilb2,
+ struct sk_buff *skb, int doff,
+ const struct in6_addr *saddr,
+ const __be16 sport, const struct in6_addr *daddr,
+ const unsigned short hnum, const int dif, const int sdif)
+{
+ struct sock *sk, *result = NULL;
+ struct hlist_nulls_node *node;
int score, hiscore = 0;
- read_lock(&hashinfo->lhash_lock);
- sk_for_each(sk, node, &hashinfo->listening_hash[inet_lhashfn(hnum)]) {
- if (inet_sk(sk)->num == hnum && sk->sk_family == PF_INET6) {
- const struct ipv6_pinfo *np = inet6_sk(sk);
-
- score = 1;
- if (!ipv6_addr_any(&np->rcv_saddr)) {
- if (!ipv6_addr_equal(&np->rcv_saddr, daddr))
- continue;
- score++;
- }
- if (sk->sk_bound_dev_if) {
- if (sk->sk_bound_dev_if != dif)
- continue;
- score++;
- }
- if (score == 3) {
- result = sk;
- break;
- }
- if (score > hiscore) {
- hiscore = score;
- result = sk;
- }
+ sk_nulls_for_each_rcu(sk, node, &ilb2->nulls_head) {
+ score = compute_score(sk, net, hnum, daddr, dif, sdif);
+ if (score > hiscore) {
+ result = inet6_lookup_reuseport(net, sk, skb, doff,
+ saddr, sport, daddr, hnum, inet6_ehashfn);
+ if (result)
+ return result;
+
+ result = sk;
+ hiscore = score;
}
}
- if (result)
- sock_hold(result);
- read_unlock(&hashinfo->lhash_lock);
+
return result;
}
+struct sock *inet6_lookup_run_sk_lookup(const struct net *net,
+ int protocol,
+ struct sk_buff *skb, int doff,
+ const struct in6_addr *saddr,
+ const __be16 sport,
+ const struct in6_addr *daddr,
+ const u16 hnum, const int dif,
+ inet6_ehashfn_t *ehashfn)
+{
+ struct sock *sk, *reuse_sk;
+ bool no_reuseport;
+
+ no_reuseport = bpf_sk_lookup_run_v6(net, protocol, saddr, sport,
+ daddr, hnum, dif, &sk);
+ if (no_reuseport || IS_ERR_OR_NULL(sk))
+ return sk;
+
+ reuse_sk = inet6_lookup_reuseport(net, sk, skb, doff,
+ saddr, sport, daddr, hnum, ehashfn);
+ if (reuse_sk)
+ sk = reuse_sk;
+ return sk;
+}
+EXPORT_SYMBOL_GPL(inet6_lookup_run_sk_lookup);
+
+struct sock *inet6_lookup_listener(const struct net *net,
+ struct sk_buff *skb, int doff,
+ const struct in6_addr *saddr,
+ const __be16 sport,
+ const struct in6_addr *daddr,
+ const unsigned short hnum,
+ const int dif, const int sdif)
+{
+ struct inet_listen_hashbucket *ilb2;
+ struct inet_hashinfo *hashinfo;
+ struct sock *result = NULL;
+ unsigned int hash2;
+
+ /* Lookup redirect from BPF */
+ if (static_branch_unlikely(&bpf_sk_lookup_enabled)) {
+ result = inet6_lookup_run_sk_lookup(net, IPPROTO_TCP, skb, doff,
+ saddr, sport, daddr, hnum, dif,
+ inet6_ehashfn);
+ if (result)
+ goto done;
+ }
+
+ hashinfo = net->ipv4.tcp_death_row.hashinfo;
+ hash2 = ipv6_portaddr_hash(net, daddr, hnum);
+ ilb2 = inet_lhash2_bucket(hashinfo, hash2);
+
+ result = inet6_lhash2_lookup(net, ilb2, skb, doff,
+ saddr, sport, daddr, hnum,
+ dif, sdif);
+ if (result)
+ goto done;
+
+ /* Lookup lhash2 with in6addr_any */
+ hash2 = ipv6_portaddr_hash(net, &in6addr_any, hnum);
+ ilb2 = inet_lhash2_bucket(hashinfo, hash2);
+
+ result = inet6_lhash2_lookup(net, ilb2, skb, doff,
+ saddr, sport, &in6addr_any, hnum,
+ dif, sdif);
+done:
+ if (IS_ERR(result))
+ return NULL;
+ return result;
+}
EXPORT_SYMBOL_GPL(inet6_lookup_listener);
-struct sock *inet6_lookup(struct inet_hashinfo *hashinfo,
+struct sock *inet6_lookup(const struct net *net,
+ struct sk_buff *skb, int doff,
const struct in6_addr *saddr, const __be16 sport,
const struct in6_addr *daddr, const __be16 dport,
const int dif)
{
struct sock *sk;
+ bool refcounted;
- local_bh_disable();
- sk = __inet6_lookup(hashinfo, saddr, sport, daddr, ntohs(dport), dif);
- local_bh_enable();
-
+ sk = __inet6_lookup(net, skb, doff, saddr, sport, daddr,
+ ntohs(dport), dif, 0, &refcounted);
+ if (sk && !refcounted && !refcount_inc_not_zero(&sk->sk_refcnt))
+ sk = NULL;
return sk;
}
-
EXPORT_SYMBOL_GPL(inet6_lookup);
static int __inet6_check_established(struct inet_timewait_death_row *death_row,
struct sock *sk, const __u16 lport,
- struct inet_timewait_sock **twp)
+ struct inet_timewait_sock **twp,
+ bool rcu_lookup,
+ u32 hash)
{
struct inet_hashinfo *hinfo = death_row->hashinfo;
struct inet_sock *inet = inet_sk(sk);
- const struct ipv6_pinfo *np = inet6_sk(sk);
- const struct in6_addr *daddr = &np->rcv_saddr;
- const struct in6_addr *saddr = &np->daddr;
+ const struct in6_addr *daddr = &sk->sk_v6_rcv_saddr;
+ const struct in6_addr *saddr = &sk->sk_v6_daddr;
const int dif = sk->sk_bound_dev_if;
- const __portpair ports = INET_COMBINED_PORTS(inet->dport, lport);
- const unsigned int hash = inet6_ehashfn(daddr, inet->num, saddr,
- inet->dport);
+ struct net *net = sock_net(sk);
+ const int sdif = l3mdev_master_ifindex_by_index(net, dif);
+ const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport);
struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
+ struct inet_timewait_sock *tw = NULL;
+ const struct hlist_nulls_node *node;
struct sock *sk2;
- const struct hlist_node *node;
- struct inet_timewait_sock *tw;
-
- prefetch(head->chain.first);
- write_lock(&head->lock);
-
- /* Check TIME-WAIT sockets first. */
- sk_for_each(sk2, node, &(head + hinfo->ehash_size)->chain) {
- const struct inet6_timewait_sock *tw6 = inet6_twsk(sk2);
-
- tw = inet_twsk(sk2);
-
- if(*((__portpair *)&(tw->tw_dport)) == ports &&
- sk2->sk_family == PF_INET6 &&
- ipv6_addr_equal(&tw6->tw_v6_daddr, saddr) &&
- ipv6_addr_equal(&tw6->tw_v6_rcv_saddr, daddr) &&
- sk2->sk_bound_dev_if == sk->sk_bound_dev_if) {
- if (twsk_unique(sk, sk2, twp))
- goto unique;
- else
- goto not_unique;
+ spinlock_t *lock;
+
+ if (rcu_lookup) {
+ sk_nulls_for_each(sk2, node, &head->chain) {
+ if (sk2->sk_hash != hash ||
+ !inet6_match(net, sk2, saddr, daddr,
+ ports, dif, sdif))
+ continue;
+ if (sk2->sk_state == TCP_TIME_WAIT)
+ break;
+ return -EADDRNOTAVAIL;
}
+ return 0;
}
- tw = NULL;
- /* And established part... */
- sk_for_each(sk2, node, &head->chain) {
- if (INET6_MATCH(sk2, hash, saddr, daddr, ports, dif))
+ lock = inet_ehash_lockp(hinfo, hash);
+ spin_lock(lock);
+
+ sk_nulls_for_each(sk2, node, &head->chain) {
+ if (sk2->sk_hash != hash)
+ continue;
+
+ if (likely(inet6_match(net, sk2, saddr, daddr, ports,
+ dif, sdif))) {
+ if (sk2->sk_state == TCP_TIME_WAIT) {
+ tw = inet_twsk(sk2);
+ if (tcp_twsk_unique(sk, sk2, twp))
+ break;
+ }
goto not_unique;
+ }
}
-unique:
/* Must record num and sport now. Otherwise we will see
- * in hash table socket with a funny identity. */
- inet->num = lport;
- inet->sport = htons(lport);
- BUG_TRAP(sk_unhashed(sk));
- __sk_add_node(sk, &head->chain);
+ * in hash table socket with a funny identity.
+ */
+ inet->inet_num = lport;
+ inet->inet_sport = htons(lport);
sk->sk_hash = hash;
- sock_prot_inc_use(sk->sk_prot);
- write_unlock(&head->lock);
+ WARN_ON(!sk_unhashed(sk));
+ __sk_nulls_add_node_rcu(sk, &head->chain);
+ if (tw) {
+ sk_nulls_del_node_init_rcu((struct sock *)tw);
+ __NET_INC_STATS(net, LINUX_MIB_TIMEWAITRECYCLED);
+ }
+ spin_unlock(lock);
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
- if (twp != NULL) {
+ if (twp) {
*twp = tw;
- NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
- } else if (tw != NULL) {
+ } else if (tw) {
/* Silly. Should hash-dance instead... */
- inet_twsk_deschedule(tw, death_row);
- NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
-
- inet_twsk_put(tw);
+ inet_twsk_deschedule_put(tw);
}
return 0;
not_unique:
- write_unlock(&head->lock);
+ spin_unlock(lock);
return -EADDRNOTAVAIL;
}
-static inline u32 inet6_sk_port_offset(const struct sock *sk)
+static u64 inet6_sk_port_offset(const struct sock *sk)
{
const struct inet_sock *inet = inet_sk(sk);
- const struct ipv6_pinfo *np = inet6_sk(sk);
- return secure_ipv6_port_ephemeral(np->rcv_saddr.s6_addr32,
- np->daddr.s6_addr32,
- inet->dport);
+
+ return secure_ipv6_port_ephemeral(sk->sk_v6_rcv_saddr.s6_addr32,
+ sk->sk_v6_daddr.s6_addr32,
+ inet->inet_dport);
}
int inet6_hash_connect(struct inet_timewait_death_row *death_row,
struct sock *sk)
{
- struct inet_hashinfo *hinfo = death_row->hashinfo;
- const unsigned short snum = inet_sk(sk)->num;
- struct inet_bind_hashbucket *head;
- struct inet_bind_bucket *tb;
- int ret;
-
- if (snum == 0) {
- const int low = sysctl_local_port_range[0];
- const int high = sysctl_local_port_range[1];
- const int range = high - low;
- int i, port;
- static u32 hint;
- const u32 offset = hint + inet6_sk_port_offset(sk);
- struct hlist_node *node;
- struct inet_timewait_sock *tw = NULL;
-
- local_bh_disable();
- for (i = 1; i <= range; i++) {
- port = low + (i + offset) % range;
- head = &hinfo->bhash[inet_bhashfn(port, hinfo->bhash_size)];
- spin_lock(&head->lock);
-
- /* Does not bother with rcv_saddr checks,
- * because the established check is already
- * unique enough.
- */
- inet_bind_bucket_for_each(tb, node, &head->chain) {
- if (tb->port == port) {
- BUG_TRAP(!hlist_empty(&tb->owners));
- if (tb->fastreuse >= 0)
- goto next_port;
- if (!__inet6_check_established(death_row,
- sk, port,
- &tw))
- goto ok;
- goto next_port;
- }
- }
-
- tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
- head, port);
- if (!tb) {
- spin_unlock(&head->lock);
- break;
- }
- tb->fastreuse = -1;
- goto ok;
-
- next_port:
- spin_unlock(&head->lock);
- }
- local_bh_enable();
-
- return -EADDRNOTAVAIL;
-
-ok:
- hint += i;
-
- /* Head lock still held and bh's disabled */
- inet_bind_hash(sk, tb, port);
- if (sk_unhashed(sk)) {
- inet_sk(sk)->sport = htons(port);
- __inet6_hash(hinfo, sk);
- }
- spin_unlock(&head->lock);
-
- if (tw) {
- inet_twsk_deschedule(tw, death_row);
- inet_twsk_put(tw);
- }
-
- ret = 0;
- goto out;
- }
-
- head = &hinfo->bhash[inet_bhashfn(snum, hinfo->bhash_size)];
- tb = inet_csk(sk)->icsk_bind_hash;
- spin_lock_bh(&head->lock);
-
- if (sk_head(&tb->owners) == sk && sk->sk_bind_node.next == NULL) {
- __inet6_hash(hinfo, sk);
- spin_unlock_bh(&head->lock);
- return 0;
- } else {
- spin_unlock(&head->lock);
- /* No definite answer... Walk to established hash table */
- ret = __inet6_check_established(death_row, sk, snum, NULL);
-out:
- local_bh_enable();
- return ret;
- }
-}
+ const struct in6_addr *daddr = &sk->sk_v6_rcv_saddr;
+ const struct in6_addr *saddr = &sk->sk_v6_daddr;
+ const struct inet_sock *inet = inet_sk(sk);
+ const struct net *net = sock_net(sk);
+ u64 port_offset = 0;
+ u32 hash_port0;
+
+ if (!inet_sk(sk)->inet_num)
+ port_offset = inet6_sk_port_offset(sk);
+ hash_port0 = inet6_ehashfn(net, daddr, 0, saddr, inet->inet_dport);
+
+ return __inet_hash_connect(death_row, sk, port_offset, hash_port0,
+ __inet6_check_established);
+}
EXPORT_SYMBOL_GPL(inet6_hash_connect);
diff --git a/net/ipv6/ioam6.c b/net/ipv6/ioam6.c
new file mode 100644
index 000000000000..9553a3200081
--- /dev/null
+++ b/net/ipv6/ioam6.c
@@ -0,0 +1,1040 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * IPv6 IOAM implementation
+ *
+ * Author:
+ * Justin Iurman <justin.iurman@uliege.be>
+ */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/net.h>
+#include <linux/ioam6.h>
+#include <linux/ioam6_genl.h>
+#include <linux/rhashtable.h>
+#include <linux/netdevice.h>
+
+#include <net/addrconf.h>
+#include <net/genetlink.h>
+#include <net/ioam6.h>
+#include <net/sch_generic.h>
+
+static void ioam6_ns_release(struct ioam6_namespace *ns)
+{
+ kfree_rcu(ns, rcu);
+}
+
+static void ioam6_sc_release(struct ioam6_schema *sc)
+{
+ kfree_rcu(sc, rcu);
+}
+
+static void ioam6_free_ns(void *ptr, void *arg)
+{
+ struct ioam6_namespace *ns = (struct ioam6_namespace *)ptr;
+
+ if (ns)
+ ioam6_ns_release(ns);
+}
+
+static void ioam6_free_sc(void *ptr, void *arg)
+{
+ struct ioam6_schema *sc = (struct ioam6_schema *)ptr;
+
+ if (sc)
+ ioam6_sc_release(sc);
+}
+
+static int ioam6_ns_cmpfn(struct rhashtable_compare_arg *arg, const void *obj)
+{
+ const struct ioam6_namespace *ns = obj;
+
+ return (ns->id != *(__be16 *)arg->key);
+}
+
+static int ioam6_sc_cmpfn(struct rhashtable_compare_arg *arg, const void *obj)
+{
+ const struct ioam6_schema *sc = obj;
+
+ return (sc->id != *(u32 *)arg->key);
+}
+
+static const struct rhashtable_params rht_ns_params = {
+ .key_len = sizeof(__be16),
+ .key_offset = offsetof(struct ioam6_namespace, id),
+ .head_offset = offsetof(struct ioam6_namespace, head),
+ .automatic_shrinking = true,
+ .obj_cmpfn = ioam6_ns_cmpfn,
+};
+
+static const struct rhashtable_params rht_sc_params = {
+ .key_len = sizeof(u32),
+ .key_offset = offsetof(struct ioam6_schema, id),
+ .head_offset = offsetof(struct ioam6_schema, head),
+ .automatic_shrinking = true,
+ .obj_cmpfn = ioam6_sc_cmpfn,
+};
+
+static struct genl_family ioam6_genl_family;
+
+static const struct nla_policy ioam6_genl_policy_addns[] = {
+ [IOAM6_ATTR_NS_ID] = { .type = NLA_U16 },
+ [IOAM6_ATTR_NS_DATA] = { .type = NLA_U32 },
+ [IOAM6_ATTR_NS_DATA_WIDE] = { .type = NLA_U64 },
+};
+
+static const struct nla_policy ioam6_genl_policy_delns[] = {
+ [IOAM6_ATTR_NS_ID] = { .type = NLA_U16 },
+};
+
+static const struct nla_policy ioam6_genl_policy_addsc[] = {
+ [IOAM6_ATTR_SC_ID] = { .type = NLA_U32 },
+ [IOAM6_ATTR_SC_DATA] = { .type = NLA_BINARY,
+ .len = IOAM6_MAX_SCHEMA_DATA_LEN },
+};
+
+static const struct nla_policy ioam6_genl_policy_delsc[] = {
+ [IOAM6_ATTR_SC_ID] = { .type = NLA_U32 },
+};
+
+static const struct nla_policy ioam6_genl_policy_ns_sc[] = {
+ [IOAM6_ATTR_NS_ID] = { .type = NLA_U16 },
+ [IOAM6_ATTR_SC_ID] = { .type = NLA_U32 },
+ [IOAM6_ATTR_SC_NONE] = { .type = NLA_FLAG },
+};
+
+static int ioam6_genl_addns(struct sk_buff *skb, struct genl_info *info)
+{
+ struct ioam6_pernet_data *nsdata;
+ struct ioam6_namespace *ns;
+ u64 data64;
+ u32 data32;
+ __be16 id;
+ int err;
+
+ if (!info->attrs[IOAM6_ATTR_NS_ID])
+ return -EINVAL;
+
+ id = cpu_to_be16(nla_get_u16(info->attrs[IOAM6_ATTR_NS_ID]));
+ nsdata = ioam6_pernet(genl_info_net(info));
+
+ mutex_lock(&nsdata->lock);
+
+ ns = rhashtable_lookup_fast(&nsdata->namespaces, &id, rht_ns_params);
+ if (ns) {
+ err = -EEXIST;
+ goto out_unlock;
+ }
+
+ ns = kzalloc(sizeof(*ns), GFP_KERNEL);
+ if (!ns) {
+ err = -ENOMEM;
+ goto out_unlock;
+ }
+
+ ns->id = id;
+
+ data32 = nla_get_u32_default(info->attrs[IOAM6_ATTR_NS_DATA],
+ IOAM6_U32_UNAVAILABLE);
+
+ data64 = nla_get_u64_default(info->attrs[IOAM6_ATTR_NS_DATA_WIDE],
+ IOAM6_U64_UNAVAILABLE);
+
+ ns->data = cpu_to_be32(data32);
+ ns->data_wide = cpu_to_be64(data64);
+
+ err = rhashtable_lookup_insert_fast(&nsdata->namespaces, &ns->head,
+ rht_ns_params);
+ if (err)
+ kfree(ns);
+
+out_unlock:
+ mutex_unlock(&nsdata->lock);
+ return err;
+}
+
+static int ioam6_genl_delns(struct sk_buff *skb, struct genl_info *info)
+{
+ struct ioam6_pernet_data *nsdata;
+ struct ioam6_namespace *ns;
+ struct ioam6_schema *sc;
+ __be16 id;
+ int err;
+
+ if (!info->attrs[IOAM6_ATTR_NS_ID])
+ return -EINVAL;
+
+ id = cpu_to_be16(nla_get_u16(info->attrs[IOAM6_ATTR_NS_ID]));
+ nsdata = ioam6_pernet(genl_info_net(info));
+
+ mutex_lock(&nsdata->lock);
+
+ ns = rhashtable_lookup_fast(&nsdata->namespaces, &id, rht_ns_params);
+ if (!ns) {
+ err = -ENOENT;
+ goto out_unlock;
+ }
+
+ sc = rcu_dereference_protected(ns->schema,
+ lockdep_is_held(&nsdata->lock));
+
+ err = rhashtable_remove_fast(&nsdata->namespaces, &ns->head,
+ rht_ns_params);
+ if (err)
+ goto out_unlock;
+
+ if (sc)
+ rcu_assign_pointer(sc->ns, NULL);
+
+ ioam6_ns_release(ns);
+
+out_unlock:
+ mutex_unlock(&nsdata->lock);
+ return err;
+}
+
+static int __ioam6_genl_dumpns_element(struct ioam6_namespace *ns,
+ u32 portid,
+ u32 seq,
+ u32 flags,
+ struct sk_buff *skb,
+ u8 cmd)
+{
+ struct ioam6_schema *sc;
+ u64 data64;
+ u32 data32;
+ void *hdr;
+
+ hdr = genlmsg_put(skb, portid, seq, &ioam6_genl_family, flags, cmd);
+ if (!hdr)
+ return -ENOMEM;
+
+ data32 = be32_to_cpu(ns->data);
+ data64 = be64_to_cpu(ns->data_wide);
+
+ if (nla_put_u16(skb, IOAM6_ATTR_NS_ID, be16_to_cpu(ns->id)) ||
+ (data32 != IOAM6_U32_UNAVAILABLE &&
+ nla_put_u32(skb, IOAM6_ATTR_NS_DATA, data32)) ||
+ (data64 != IOAM6_U64_UNAVAILABLE &&
+ nla_put_u64_64bit(skb, IOAM6_ATTR_NS_DATA_WIDE,
+ data64, IOAM6_ATTR_PAD)))
+ goto nla_put_failure;
+
+ rcu_read_lock();
+
+ sc = rcu_dereference(ns->schema);
+ if (sc && nla_put_u32(skb, IOAM6_ATTR_SC_ID, sc->id)) {
+ rcu_read_unlock();
+ goto nla_put_failure;
+ }
+
+ rcu_read_unlock();
+
+ genlmsg_end(skb, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(skb, hdr);
+ return -EMSGSIZE;
+}
+
+static int ioam6_genl_dumpns_start(struct netlink_callback *cb)
+{
+ struct ioam6_pernet_data *nsdata = ioam6_pernet(sock_net(cb->skb->sk));
+ struct rhashtable_iter *iter = (struct rhashtable_iter *)cb->args[0];
+
+ if (!iter) {
+ iter = kmalloc(sizeof(*iter), GFP_KERNEL);
+ if (!iter)
+ return -ENOMEM;
+
+ cb->args[0] = (long)iter;
+ }
+
+ rhashtable_walk_enter(&nsdata->namespaces, iter);
+
+ return 0;
+}
+
+static int ioam6_genl_dumpns_done(struct netlink_callback *cb)
+{
+ struct rhashtable_iter *iter = (struct rhashtable_iter *)cb->args[0];
+
+ rhashtable_walk_exit(iter);
+ kfree(iter);
+
+ return 0;
+}
+
+static int ioam6_genl_dumpns(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct rhashtable_iter *iter;
+ struct ioam6_namespace *ns;
+ int err;
+
+ iter = (struct rhashtable_iter *)cb->args[0];
+ rhashtable_walk_start(iter);
+
+ for (;;) {
+ ns = rhashtable_walk_next(iter);
+
+ if (IS_ERR(ns)) {
+ if (PTR_ERR(ns) == -EAGAIN)
+ continue;
+ err = PTR_ERR(ns);
+ goto done;
+ } else if (!ns) {
+ break;
+ }
+
+ err = __ioam6_genl_dumpns_element(ns,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NLM_F_MULTI,
+ skb,
+ IOAM6_CMD_DUMP_NAMESPACES);
+ if (err)
+ goto done;
+ }
+
+ err = skb->len;
+
+done:
+ rhashtable_walk_stop(iter);
+ return err;
+}
+
+static int ioam6_genl_addsc(struct sk_buff *skb, struct genl_info *info)
+{
+ struct ioam6_pernet_data *nsdata;
+ int len, len_aligned, err;
+ struct ioam6_schema *sc;
+ u32 id;
+
+ if (!info->attrs[IOAM6_ATTR_SC_ID] || !info->attrs[IOAM6_ATTR_SC_DATA])
+ return -EINVAL;
+
+ id = nla_get_u32(info->attrs[IOAM6_ATTR_SC_ID]);
+ nsdata = ioam6_pernet(genl_info_net(info));
+
+ mutex_lock(&nsdata->lock);
+
+ sc = rhashtable_lookup_fast(&nsdata->schemas, &id, rht_sc_params);
+ if (sc) {
+ err = -EEXIST;
+ goto out_unlock;
+ }
+
+ len = nla_len(info->attrs[IOAM6_ATTR_SC_DATA]);
+ len_aligned = ALIGN(len, 4);
+
+ sc = kzalloc(sizeof(*sc) + len_aligned, GFP_KERNEL);
+ if (!sc) {
+ err = -ENOMEM;
+ goto out_unlock;
+ }
+
+ sc->id = id;
+ sc->len = len_aligned;
+ sc->hdr = cpu_to_be32(sc->id | ((u8)(sc->len / 4) << 24));
+ nla_memcpy(sc->data, info->attrs[IOAM6_ATTR_SC_DATA], len);
+
+ err = rhashtable_lookup_insert_fast(&nsdata->schemas, &sc->head,
+ rht_sc_params);
+ if (err)
+ goto free_sc;
+
+out_unlock:
+ mutex_unlock(&nsdata->lock);
+ return err;
+free_sc:
+ kfree(sc);
+ goto out_unlock;
+}
+
+static int ioam6_genl_delsc(struct sk_buff *skb, struct genl_info *info)
+{
+ struct ioam6_pernet_data *nsdata;
+ struct ioam6_namespace *ns;
+ struct ioam6_schema *sc;
+ int err;
+ u32 id;
+
+ if (!info->attrs[IOAM6_ATTR_SC_ID])
+ return -EINVAL;
+
+ id = nla_get_u32(info->attrs[IOAM6_ATTR_SC_ID]);
+ nsdata = ioam6_pernet(genl_info_net(info));
+
+ mutex_lock(&nsdata->lock);
+
+ sc = rhashtable_lookup_fast(&nsdata->schemas, &id, rht_sc_params);
+ if (!sc) {
+ err = -ENOENT;
+ goto out_unlock;
+ }
+
+ ns = rcu_dereference_protected(sc->ns, lockdep_is_held(&nsdata->lock));
+
+ err = rhashtable_remove_fast(&nsdata->schemas, &sc->head,
+ rht_sc_params);
+ if (err)
+ goto out_unlock;
+
+ if (ns)
+ rcu_assign_pointer(ns->schema, NULL);
+
+ ioam6_sc_release(sc);
+
+out_unlock:
+ mutex_unlock(&nsdata->lock);
+ return err;
+}
+
+static int __ioam6_genl_dumpsc_element(struct ioam6_schema *sc,
+ u32 portid, u32 seq, u32 flags,
+ struct sk_buff *skb, u8 cmd)
+{
+ struct ioam6_namespace *ns;
+ void *hdr;
+
+ hdr = genlmsg_put(skb, portid, seq, &ioam6_genl_family, flags, cmd);
+ if (!hdr)
+ return -ENOMEM;
+
+ if (nla_put_u32(skb, IOAM6_ATTR_SC_ID, sc->id) ||
+ nla_put(skb, IOAM6_ATTR_SC_DATA, sc->len, sc->data))
+ goto nla_put_failure;
+
+ rcu_read_lock();
+
+ ns = rcu_dereference(sc->ns);
+ if (ns && nla_put_u16(skb, IOAM6_ATTR_NS_ID, be16_to_cpu(ns->id))) {
+ rcu_read_unlock();
+ goto nla_put_failure;
+ }
+
+ rcu_read_unlock();
+
+ genlmsg_end(skb, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(skb, hdr);
+ return -EMSGSIZE;
+}
+
+static int ioam6_genl_dumpsc_start(struct netlink_callback *cb)
+{
+ struct ioam6_pernet_data *nsdata = ioam6_pernet(sock_net(cb->skb->sk));
+ struct rhashtable_iter *iter = (struct rhashtable_iter *)cb->args[0];
+
+ if (!iter) {
+ iter = kmalloc(sizeof(*iter), GFP_KERNEL);
+ if (!iter)
+ return -ENOMEM;
+
+ cb->args[0] = (long)iter;
+ }
+
+ rhashtable_walk_enter(&nsdata->schemas, iter);
+
+ return 0;
+}
+
+static int ioam6_genl_dumpsc_done(struct netlink_callback *cb)
+{
+ struct rhashtable_iter *iter = (struct rhashtable_iter *)cb->args[0];
+
+ rhashtable_walk_exit(iter);
+ kfree(iter);
+
+ return 0;
+}
+
+static int ioam6_genl_dumpsc(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct rhashtable_iter *iter;
+ struct ioam6_schema *sc;
+ int err;
+
+ iter = (struct rhashtable_iter *)cb->args[0];
+ rhashtable_walk_start(iter);
+
+ for (;;) {
+ sc = rhashtable_walk_next(iter);
+
+ if (IS_ERR(sc)) {
+ if (PTR_ERR(sc) == -EAGAIN)
+ continue;
+ err = PTR_ERR(sc);
+ goto done;
+ } else if (!sc) {
+ break;
+ }
+
+ err = __ioam6_genl_dumpsc_element(sc,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NLM_F_MULTI,
+ skb,
+ IOAM6_CMD_DUMP_SCHEMAS);
+ if (err)
+ goto done;
+ }
+
+ err = skb->len;
+
+done:
+ rhashtable_walk_stop(iter);
+ return err;
+}
+
+static int ioam6_genl_ns_set_schema(struct sk_buff *skb, struct genl_info *info)
+{
+ struct ioam6_namespace *ns, *ns_ref;
+ struct ioam6_schema *sc, *sc_ref;
+ struct ioam6_pernet_data *nsdata;
+ __be16 ns_id;
+ u32 sc_id;
+ int err;
+
+ if (!info->attrs[IOAM6_ATTR_NS_ID] ||
+ (!info->attrs[IOAM6_ATTR_SC_ID] &&
+ !info->attrs[IOAM6_ATTR_SC_NONE]))
+ return -EINVAL;
+
+ ns_id = cpu_to_be16(nla_get_u16(info->attrs[IOAM6_ATTR_NS_ID]));
+ nsdata = ioam6_pernet(genl_info_net(info));
+
+ mutex_lock(&nsdata->lock);
+
+ ns = rhashtable_lookup_fast(&nsdata->namespaces, &ns_id, rht_ns_params);
+ if (!ns) {
+ err = -ENOENT;
+ goto out_unlock;
+ }
+
+ if (info->attrs[IOAM6_ATTR_SC_NONE]) {
+ sc = NULL;
+ } else {
+ sc_id = nla_get_u32(info->attrs[IOAM6_ATTR_SC_ID]);
+ sc = rhashtable_lookup_fast(&nsdata->schemas, &sc_id,
+ rht_sc_params);
+ if (!sc) {
+ err = -ENOENT;
+ goto out_unlock;
+ }
+ }
+
+ sc_ref = rcu_dereference_protected(ns->schema,
+ lockdep_is_held(&nsdata->lock));
+ if (sc_ref)
+ rcu_assign_pointer(sc_ref->ns, NULL);
+ rcu_assign_pointer(ns->schema, sc);
+
+ if (sc) {
+ ns_ref = rcu_dereference_protected(sc->ns,
+ lockdep_is_held(&nsdata->lock));
+ if (ns_ref)
+ rcu_assign_pointer(ns_ref->schema, NULL);
+ rcu_assign_pointer(sc->ns, ns);
+ }
+
+ err = 0;
+
+out_unlock:
+ mutex_unlock(&nsdata->lock);
+ return err;
+}
+
+static const struct genl_ops ioam6_genl_ops[] = {
+ {
+ .cmd = IOAM6_CMD_ADD_NAMESPACE,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = ioam6_genl_addns,
+ .flags = GENL_ADMIN_PERM,
+ .policy = ioam6_genl_policy_addns,
+ .maxattr = ARRAY_SIZE(ioam6_genl_policy_addns) - 1,
+ },
+ {
+ .cmd = IOAM6_CMD_DEL_NAMESPACE,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = ioam6_genl_delns,
+ .flags = GENL_ADMIN_PERM,
+ .policy = ioam6_genl_policy_delns,
+ .maxattr = ARRAY_SIZE(ioam6_genl_policy_delns) - 1,
+ },
+ {
+ .cmd = IOAM6_CMD_DUMP_NAMESPACES,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .start = ioam6_genl_dumpns_start,
+ .dumpit = ioam6_genl_dumpns,
+ .done = ioam6_genl_dumpns_done,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = IOAM6_CMD_ADD_SCHEMA,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = ioam6_genl_addsc,
+ .flags = GENL_ADMIN_PERM,
+ .policy = ioam6_genl_policy_addsc,
+ .maxattr = ARRAY_SIZE(ioam6_genl_policy_addsc) - 1,
+ },
+ {
+ .cmd = IOAM6_CMD_DEL_SCHEMA,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = ioam6_genl_delsc,
+ .flags = GENL_ADMIN_PERM,
+ .policy = ioam6_genl_policy_delsc,
+ .maxattr = ARRAY_SIZE(ioam6_genl_policy_delsc) - 1,
+ },
+ {
+ .cmd = IOAM6_CMD_DUMP_SCHEMAS,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .start = ioam6_genl_dumpsc_start,
+ .dumpit = ioam6_genl_dumpsc,
+ .done = ioam6_genl_dumpsc_done,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = IOAM6_CMD_NS_SET_SCHEMA,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = ioam6_genl_ns_set_schema,
+ .flags = GENL_ADMIN_PERM,
+ .policy = ioam6_genl_policy_ns_sc,
+ .maxattr = ARRAY_SIZE(ioam6_genl_policy_ns_sc) - 1,
+ },
+};
+
+#define IOAM6_GENL_EV_GRP_OFFSET 0
+
+static const struct genl_multicast_group ioam6_mcgrps[] = {
+ [IOAM6_GENL_EV_GRP_OFFSET] = { .name = IOAM6_GENL_EV_GRP_NAME,
+ .flags = GENL_MCAST_CAP_NET_ADMIN },
+};
+
+static int ioam6_event_put_trace(struct sk_buff *skb,
+ struct ioam6_trace_hdr *trace,
+ unsigned int len)
+{
+ if (nla_put_u16(skb, IOAM6_EVENT_ATTR_TRACE_NAMESPACE,
+ be16_to_cpu(trace->namespace_id)) ||
+ nla_put_u8(skb, IOAM6_EVENT_ATTR_TRACE_NODELEN, trace->nodelen) ||
+ nla_put_u32(skb, IOAM6_EVENT_ATTR_TRACE_TYPE,
+ be32_to_cpu(trace->type_be32)) ||
+ nla_put(skb, IOAM6_EVENT_ATTR_TRACE_DATA,
+ len - sizeof(struct ioam6_trace_hdr) - trace->remlen * 4,
+ trace->data + trace->remlen * 4))
+ return 1;
+
+ return 0;
+}
+
+void ioam6_event(enum ioam6_event_type type, struct net *net, gfp_t gfp,
+ void *opt, unsigned int opt_len)
+{
+ struct nlmsghdr *nlh;
+ struct sk_buff *skb;
+
+ if (!genl_has_listeners(&ioam6_genl_family, net,
+ IOAM6_GENL_EV_GRP_OFFSET))
+ return;
+
+ skb = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
+ if (!skb)
+ return;
+
+ nlh = genlmsg_put(skb, 0, 0, &ioam6_genl_family, 0, type);
+ if (!nlh)
+ goto nla_put_failure;
+
+ switch (type) {
+ case IOAM6_EVENT_UNSPEC:
+ WARN_ON_ONCE(1);
+ break;
+ case IOAM6_EVENT_TRACE:
+ if (ioam6_event_put_trace(skb, (struct ioam6_trace_hdr *)opt,
+ opt_len))
+ goto nla_put_failure;
+ break;
+ }
+
+ genlmsg_end(skb, nlh);
+ genlmsg_multicast_netns(&ioam6_genl_family, net, skb, 0,
+ IOAM6_GENL_EV_GRP_OFFSET, gfp);
+ return;
+
+nla_put_failure:
+ nlmsg_free(skb);
+}
+
+static struct genl_family ioam6_genl_family __ro_after_init = {
+ .name = IOAM6_GENL_NAME,
+ .version = IOAM6_GENL_VERSION,
+ .netnsok = true,
+ .parallel_ops = true,
+ .ops = ioam6_genl_ops,
+ .n_ops = ARRAY_SIZE(ioam6_genl_ops),
+ .resv_start_op = IOAM6_CMD_NS_SET_SCHEMA + 1,
+ .mcgrps = ioam6_mcgrps,
+ .n_mcgrps = ARRAY_SIZE(ioam6_mcgrps),
+ .module = THIS_MODULE,
+};
+
+struct ioam6_namespace *ioam6_namespace(struct net *net, __be16 id)
+{
+ struct ioam6_pernet_data *nsdata = ioam6_pernet(net);
+
+ return rhashtable_lookup_fast(&nsdata->namespaces, &id, rht_ns_params);
+}
+
+static void __ioam6_fill_trace_data(struct sk_buff *skb,
+ struct ioam6_namespace *ns,
+ struct ioam6_trace_hdr *trace,
+ struct ioam6_schema *sc,
+ u8 sclen, bool is_input)
+{
+ struct net_device *dev = skb_dst_dev(skb);
+ struct timespec64 ts;
+ ktime_t tstamp;
+ u64 raw64;
+ u32 raw32;
+ u16 raw16;
+ u8 *data;
+ u8 byte;
+
+ data = trace->data + trace->remlen * 4 - trace->nodelen * 4 - sclen * 4;
+
+ /* hop_lim and node_id */
+ if (trace->type.bit0) {
+ byte = ipv6_hdr(skb)->hop_limit;
+ if (is_input)
+ byte--;
+
+ raw32 = dev_net(dev)->ipv6.sysctl.ioam6_id;
+
+ *(__be32 *)data = cpu_to_be32((byte << 24) | raw32);
+ data += sizeof(__be32);
+ }
+
+ /* ingress_if_id and egress_if_id */
+ if (trace->type.bit1) {
+ if (!skb->dev)
+ raw16 = IOAM6_U16_UNAVAILABLE;
+ else
+ raw16 = (__force u16)READ_ONCE(__in6_dev_get(skb->dev)->cnf.ioam6_id);
+
+ *(__be16 *)data = cpu_to_be16(raw16);
+ data += sizeof(__be16);
+
+ if (dev->flags & IFF_LOOPBACK)
+ raw16 = IOAM6_U16_UNAVAILABLE;
+ else
+ raw16 = (__force u16)READ_ONCE(__in6_dev_get(dev)->cnf.ioam6_id);
+
+ *(__be16 *)data = cpu_to_be16(raw16);
+ data += sizeof(__be16);
+ }
+
+ /* timestamp seconds */
+ if (trace->type.bit2) {
+ if (!skb->dev) {
+ *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE);
+ } else {
+ tstamp = skb_tstamp_cond(skb, true);
+ ts = ktime_to_timespec64(tstamp);
+
+ *(__be32 *)data = cpu_to_be32((u32)ts.tv_sec);
+ }
+ data += sizeof(__be32);
+ }
+
+ /* timestamp subseconds */
+ if (trace->type.bit3) {
+ if (!skb->dev) {
+ *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE);
+ } else {
+ if (!trace->type.bit2) {
+ tstamp = skb_tstamp_cond(skb, true);
+ ts = ktime_to_timespec64(tstamp);
+ }
+
+ *(__be32 *)data = cpu_to_be32((u32)(ts.tv_nsec / NSEC_PER_USEC));
+ }
+ data += sizeof(__be32);
+ }
+
+ /* transit delay */
+ if (trace->type.bit4) {
+ *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE);
+ data += sizeof(__be32);
+ }
+
+ /* namespace data */
+ if (trace->type.bit5) {
+ *(__be32 *)data = ns->data;
+ data += sizeof(__be32);
+ }
+
+ /* queue depth */
+ if (trace->type.bit6) {
+ struct netdev_queue *queue;
+ struct Qdisc *qdisc;
+ __u32 qlen, backlog;
+
+ if (dev->flags & IFF_LOOPBACK) {
+ *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE);
+ } else {
+ queue = skb_get_tx_queue(dev, skb);
+ qdisc = rcu_dereference(queue->qdisc);
+ qdisc_qstats_qlen_backlog(qdisc, &qlen, &backlog);
+
+ *(__be32 *)data = cpu_to_be32(backlog);
+ }
+ data += sizeof(__be32);
+ }
+
+ /* checksum complement */
+ if (trace->type.bit7) {
+ *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE);
+ data += sizeof(__be32);
+ }
+
+ /* hop_lim and node_id (wide) */
+ if (trace->type.bit8) {
+ byte = ipv6_hdr(skb)->hop_limit;
+ if (is_input)
+ byte--;
+
+ raw64 = dev_net(dev)->ipv6.sysctl.ioam6_id_wide;
+
+ *(__be64 *)data = cpu_to_be64(((u64)byte << 56) | raw64);
+ data += sizeof(__be64);
+ }
+
+ /* ingress_if_id and egress_if_id (wide) */
+ if (trace->type.bit9) {
+ if (!skb->dev)
+ raw32 = IOAM6_U32_UNAVAILABLE;
+ else
+ raw32 = READ_ONCE(__in6_dev_get(skb->dev)->cnf.ioam6_id_wide);
+
+ *(__be32 *)data = cpu_to_be32(raw32);
+ data += sizeof(__be32);
+
+ if (dev->flags & IFF_LOOPBACK)
+ raw32 = IOAM6_U32_UNAVAILABLE;
+ else
+ raw32 = READ_ONCE(__in6_dev_get(dev)->cnf.ioam6_id_wide);
+
+ *(__be32 *)data = cpu_to_be32(raw32);
+ data += sizeof(__be32);
+ }
+
+ /* namespace data (wide) */
+ if (trace->type.bit10) {
+ *(__be64 *)data = ns->data_wide;
+ data += sizeof(__be64);
+ }
+
+ /* buffer occupancy */
+ if (trace->type.bit11) {
+ *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE);
+ data += sizeof(__be32);
+ }
+
+ /* bit12 undefined: filled with empty value */
+ if (trace->type.bit12) {
+ *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE);
+ data += sizeof(__be32);
+ }
+
+ /* bit13 undefined: filled with empty value */
+ if (trace->type.bit13) {
+ *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE);
+ data += sizeof(__be32);
+ }
+
+ /* bit14 undefined: filled with empty value */
+ if (trace->type.bit14) {
+ *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE);
+ data += sizeof(__be32);
+ }
+
+ /* bit15 undefined: filled with empty value */
+ if (trace->type.bit15) {
+ *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE);
+ data += sizeof(__be32);
+ }
+
+ /* bit16 undefined: filled with empty value */
+ if (trace->type.bit16) {
+ *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE);
+ data += sizeof(__be32);
+ }
+
+ /* bit17 undefined: filled with empty value */
+ if (trace->type.bit17) {
+ *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE);
+ data += sizeof(__be32);
+ }
+
+ /* bit18 undefined: filled with empty value */
+ if (trace->type.bit18) {
+ *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE);
+ data += sizeof(__be32);
+ }
+
+ /* bit19 undefined: filled with empty value */
+ if (trace->type.bit19) {
+ *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE);
+ data += sizeof(__be32);
+ }
+
+ /* bit20 undefined: filled with empty value */
+ if (trace->type.bit20) {
+ *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE);
+ data += sizeof(__be32);
+ }
+
+ /* bit21 undefined: filled with empty value */
+ if (trace->type.bit21) {
+ *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE);
+ data += sizeof(__be32);
+ }
+
+ /* opaque state snapshot */
+ if (trace->type.bit22) {
+ if (!sc) {
+ *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE >> 8);
+ } else {
+ *(__be32 *)data = sc->hdr;
+ data += sizeof(__be32);
+
+ memcpy(data, sc->data, sc->len);
+ }
+ }
+}
+
+/* called with rcu_read_lock() */
+void ioam6_fill_trace_data(struct sk_buff *skb,
+ struct ioam6_namespace *ns,
+ struct ioam6_trace_hdr *trace,
+ bool is_input)
+{
+ struct ioam6_schema *sc;
+ u8 sclen = 0;
+
+ /* Skip if Overflow flag is set
+ */
+ if (trace->overflow)
+ return;
+
+ /* NodeLen does not include Opaque State Snapshot length. We need to
+ * take it into account if the corresponding bit is set (bit 22) and
+ * if the current IOAM namespace has an active schema attached to it
+ */
+ sc = rcu_dereference(ns->schema);
+ if (trace->type.bit22) {
+ sclen = sizeof_field(struct ioam6_schema, hdr) / 4;
+
+ if (sc)
+ sclen += sc->len / 4;
+ }
+
+ /* If there is no space remaining, we set the Overflow flag and we
+ * skip without filling the trace
+ */
+ if (!trace->remlen || trace->remlen < trace->nodelen + sclen) {
+ trace->overflow = 1;
+ return;
+ }
+
+ __ioam6_fill_trace_data(skb, ns, trace, sc, sclen, is_input);
+ trace->remlen -= trace->nodelen + sclen;
+}
+
+static int __net_init ioam6_net_init(struct net *net)
+{
+ struct ioam6_pernet_data *nsdata;
+ int err = -ENOMEM;
+
+ nsdata = kzalloc(sizeof(*nsdata), GFP_KERNEL);
+ if (!nsdata)
+ goto out;
+
+ mutex_init(&nsdata->lock);
+ net->ipv6.ioam6_data = nsdata;
+
+ err = rhashtable_init(&nsdata->namespaces, &rht_ns_params);
+ if (err)
+ goto free_nsdata;
+
+ err = rhashtable_init(&nsdata->schemas, &rht_sc_params);
+ if (err)
+ goto free_rht_ns;
+
+out:
+ return err;
+free_rht_ns:
+ rhashtable_destroy(&nsdata->namespaces);
+free_nsdata:
+ kfree(nsdata);
+ net->ipv6.ioam6_data = NULL;
+ goto out;
+}
+
+static void __net_exit ioam6_net_exit(struct net *net)
+{
+ struct ioam6_pernet_data *nsdata = ioam6_pernet(net);
+
+ rhashtable_free_and_destroy(&nsdata->namespaces, ioam6_free_ns, NULL);
+ rhashtable_free_and_destroy(&nsdata->schemas, ioam6_free_sc, NULL);
+
+ kfree(nsdata);
+}
+
+static struct pernet_operations ioam6_net_ops = {
+ .init = ioam6_net_init,
+ .exit = ioam6_net_exit,
+};
+
+int __init ioam6_init(void)
+{
+ int err = register_pernet_subsys(&ioam6_net_ops);
+ if (err)
+ goto out;
+
+ err = genl_register_family(&ioam6_genl_family);
+ if (err)
+ goto out_unregister_pernet_subsys;
+
+#ifdef CONFIG_IPV6_IOAM6_LWTUNNEL
+ err = ioam6_iptunnel_init();
+ if (err)
+ goto out_unregister_genl;
+#endif
+
+ pr_info("In-situ OAM (IOAM) with IPv6\n");
+
+out:
+ return err;
+#ifdef CONFIG_IPV6_IOAM6_LWTUNNEL
+out_unregister_genl:
+ genl_unregister_family(&ioam6_genl_family);
+#endif
+out_unregister_pernet_subsys:
+ unregister_pernet_subsys(&ioam6_net_ops);
+ goto out;
+}
+
+void ioam6_exit(void)
+{
+#ifdef CONFIG_IPV6_IOAM6_LWTUNNEL
+ ioam6_iptunnel_exit();
+#endif
+ genl_unregister_family(&ioam6_genl_family);
+ unregister_pernet_subsys(&ioam6_net_ops);
+}
diff --git a/net/ipv6/ioam6_iptunnel.c b/net/ipv6/ioam6_iptunnel.c
new file mode 100644
index 000000000000..1fe7894f14dd
--- /dev/null
+++ b/net/ipv6/ioam6_iptunnel.c
@@ -0,0 +1,570 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * IPv6 IOAM Lightweight Tunnel implementation
+ *
+ * Author:
+ * Justin Iurman <justin.iurman@uliege.be>
+ */
+
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/net.h>
+#include <linux/in6.h>
+#include <linux/ioam6.h>
+#include <linux/ioam6_iptunnel.h>
+#include <net/dst.h>
+#include <net/sock.h>
+#include <net/lwtunnel.h>
+#include <net/ioam6.h>
+#include <net/netlink.h>
+#include <net/ipv6.h>
+#include <net/dst_cache.h>
+#include <net/ip6_route.h>
+#include <net/addrconf.h>
+
+#define IOAM6_MASK_SHORT_FIELDS 0xff100000
+#define IOAM6_MASK_WIDE_FIELDS 0xe00000
+
+struct ioam6_lwt_encap {
+ struct ipv6_hopopt_hdr eh;
+ u8 pad[2]; /* 2-octet padding for 4n-alignment */
+ struct ioam6_hdr ioamh;
+ struct ioam6_trace_hdr traceh;
+} __packed;
+
+struct ioam6_lwt_freq {
+ u32 k;
+ u32 n;
+};
+
+struct ioam6_lwt {
+ struct dst_entry null_dst;
+ struct dst_cache cache;
+ struct ioam6_lwt_freq freq;
+ atomic_t pkt_cnt;
+ u8 mode;
+ bool has_tunsrc;
+ struct in6_addr tunsrc;
+ struct in6_addr tundst;
+ struct ioam6_lwt_encap tuninfo;
+};
+
+static const struct netlink_range_validation freq_range = {
+ .min = IOAM6_IPTUNNEL_FREQ_MIN,
+ .max = IOAM6_IPTUNNEL_FREQ_MAX,
+};
+
+static struct ioam6_lwt *ioam6_lwt_state(struct lwtunnel_state *lwt)
+{
+ return (struct ioam6_lwt *)lwt->data;
+}
+
+static struct ioam6_lwt_encap *ioam6_lwt_info(struct lwtunnel_state *lwt)
+{
+ return &ioam6_lwt_state(lwt)->tuninfo;
+}
+
+static struct ioam6_trace_hdr *ioam6_lwt_trace(struct lwtunnel_state *lwt)
+{
+ return &(ioam6_lwt_state(lwt)->tuninfo.traceh);
+}
+
+static const struct nla_policy ioam6_iptunnel_policy[IOAM6_IPTUNNEL_MAX + 1] = {
+ [IOAM6_IPTUNNEL_FREQ_K] = NLA_POLICY_FULL_RANGE(NLA_U32, &freq_range),
+ [IOAM6_IPTUNNEL_FREQ_N] = NLA_POLICY_FULL_RANGE(NLA_U32, &freq_range),
+ [IOAM6_IPTUNNEL_MODE] = NLA_POLICY_RANGE(NLA_U8,
+ IOAM6_IPTUNNEL_MODE_MIN,
+ IOAM6_IPTUNNEL_MODE_MAX),
+ [IOAM6_IPTUNNEL_SRC] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)),
+ [IOAM6_IPTUNNEL_DST] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)),
+ [IOAM6_IPTUNNEL_TRACE] = NLA_POLICY_EXACT_LEN(
+ sizeof(struct ioam6_trace_hdr)),
+};
+
+static bool ioam6_validate_trace_hdr(struct ioam6_trace_hdr *trace)
+{
+ u32 fields;
+
+ if (!trace->type_be32 || !trace->remlen ||
+ trace->remlen > IOAM6_TRACE_DATA_SIZE_MAX / 4 ||
+ trace->type.bit12 | trace->type.bit13 | trace->type.bit14 |
+ trace->type.bit15 | trace->type.bit16 | trace->type.bit17 |
+ trace->type.bit18 | trace->type.bit19 | trace->type.bit20 |
+ trace->type.bit21 | trace->type.bit23)
+ return false;
+
+ trace->nodelen = 0;
+ fields = be32_to_cpu(trace->type_be32);
+
+ trace->nodelen += hweight32(fields & IOAM6_MASK_SHORT_FIELDS)
+ * (sizeof(__be32) / 4);
+ trace->nodelen += hweight32(fields & IOAM6_MASK_WIDE_FIELDS)
+ * (sizeof(__be64) / 4);
+
+ return true;
+}
+
+static int ioam6_build_state(struct net *net, struct nlattr *nla,
+ unsigned int family, const void *cfg,
+ struct lwtunnel_state **ts,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[IOAM6_IPTUNNEL_MAX + 1];
+ struct ioam6_lwt_encap *tuninfo;
+ struct ioam6_trace_hdr *trace;
+ struct lwtunnel_state *lwt;
+ struct ioam6_lwt *ilwt;
+ int len_aligned, err;
+ u32 freq_k, freq_n;
+ u8 mode;
+
+ if (family != AF_INET6)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, IOAM6_IPTUNNEL_MAX, nla,
+ ioam6_iptunnel_policy, extack);
+ if (err < 0)
+ return err;
+
+ if ((!tb[IOAM6_IPTUNNEL_FREQ_K] && tb[IOAM6_IPTUNNEL_FREQ_N]) ||
+ (tb[IOAM6_IPTUNNEL_FREQ_K] && !tb[IOAM6_IPTUNNEL_FREQ_N])) {
+ NL_SET_ERR_MSG(extack, "freq: missing parameter");
+ return -EINVAL;
+ } else if (!tb[IOAM6_IPTUNNEL_FREQ_K] && !tb[IOAM6_IPTUNNEL_FREQ_N]) {
+ freq_k = IOAM6_IPTUNNEL_FREQ_MIN;
+ freq_n = IOAM6_IPTUNNEL_FREQ_MIN;
+ } else {
+ freq_k = nla_get_u32(tb[IOAM6_IPTUNNEL_FREQ_K]);
+ freq_n = nla_get_u32(tb[IOAM6_IPTUNNEL_FREQ_N]);
+
+ if (freq_k > freq_n) {
+ NL_SET_ERR_MSG(extack, "freq: k > n is forbidden");
+ return -EINVAL;
+ }
+ }
+
+ mode = nla_get_u8_default(tb[IOAM6_IPTUNNEL_MODE],
+ IOAM6_IPTUNNEL_MODE_INLINE);
+
+ if (tb[IOAM6_IPTUNNEL_SRC] && mode == IOAM6_IPTUNNEL_MODE_INLINE) {
+ NL_SET_ERR_MSG(extack, "no tunnel src expected with this mode");
+ return -EINVAL;
+ }
+
+ if (!tb[IOAM6_IPTUNNEL_DST] && mode != IOAM6_IPTUNNEL_MODE_INLINE) {
+ NL_SET_ERR_MSG(extack, "this mode needs a tunnel destination");
+ return -EINVAL;
+ }
+
+ if (!tb[IOAM6_IPTUNNEL_TRACE]) {
+ NL_SET_ERR_MSG(extack, "missing trace");
+ return -EINVAL;
+ }
+
+ trace = nla_data(tb[IOAM6_IPTUNNEL_TRACE]);
+ if (!ioam6_validate_trace_hdr(trace)) {
+ NL_SET_ERR_MSG_ATTR(extack, tb[IOAM6_IPTUNNEL_TRACE],
+ "invalid trace validation");
+ return -EINVAL;
+ }
+
+ len_aligned = ALIGN(trace->remlen * 4, 8);
+ lwt = lwtunnel_state_alloc(sizeof(*ilwt) + len_aligned);
+ if (!lwt)
+ return -ENOMEM;
+
+ ilwt = ioam6_lwt_state(lwt);
+ err = dst_cache_init(&ilwt->cache, GFP_ATOMIC);
+ if (err)
+ goto free_lwt;
+
+ /* This "fake" dst_entry will be stored in a dst_cache, which will call
+ * dst_hold() and dst_release() on it. We must ensure that dst_destroy()
+ * will never be called. For that, its initial refcount is 1 and +1 when
+ * it is stored in the cache. Then, +1/-1 each time we read the cache
+ * and release it. Long story short, we're fine.
+ */
+ dst_init(&ilwt->null_dst, NULL, NULL, DST_OBSOLETE_NONE, DST_NOCOUNT);
+
+ atomic_set(&ilwt->pkt_cnt, 0);
+ ilwt->freq.k = freq_k;
+ ilwt->freq.n = freq_n;
+
+ ilwt->mode = mode;
+
+ if (!tb[IOAM6_IPTUNNEL_SRC]) {
+ ilwt->has_tunsrc = false;
+ } else {
+ ilwt->has_tunsrc = true;
+ ilwt->tunsrc = nla_get_in6_addr(tb[IOAM6_IPTUNNEL_SRC]);
+
+ if (ipv6_addr_any(&ilwt->tunsrc)) {
+ NL_SET_ERR_MSG_ATTR(extack, tb[IOAM6_IPTUNNEL_SRC],
+ "invalid tunnel source address");
+ err = -EINVAL;
+ goto free_cache;
+ }
+ }
+
+ if (tb[IOAM6_IPTUNNEL_DST]) {
+ ilwt->tundst = nla_get_in6_addr(tb[IOAM6_IPTUNNEL_DST]);
+
+ if (ipv6_addr_any(&ilwt->tundst)) {
+ NL_SET_ERR_MSG_ATTR(extack, tb[IOAM6_IPTUNNEL_DST],
+ "invalid tunnel dest address");
+ err = -EINVAL;
+ goto free_cache;
+ }
+ }
+
+ tuninfo = ioam6_lwt_info(lwt);
+ tuninfo->eh.hdrlen = ((sizeof(*tuninfo) + len_aligned) >> 3) - 1;
+ tuninfo->pad[0] = IPV6_TLV_PADN;
+ tuninfo->ioamh.type = IOAM6_TYPE_PREALLOC;
+ tuninfo->ioamh.opt_type = IPV6_TLV_IOAM;
+ tuninfo->ioamh.opt_len = sizeof(tuninfo->ioamh) - 2 + sizeof(*trace)
+ + trace->remlen * 4;
+
+ memcpy(&tuninfo->traceh, trace, sizeof(*trace));
+
+ if (len_aligned - trace->remlen * 4) {
+ tuninfo->traceh.data[trace->remlen * 4] = IPV6_TLV_PADN;
+ tuninfo->traceh.data[trace->remlen * 4 + 1] = 2;
+ }
+
+ lwt->type = LWTUNNEL_ENCAP_IOAM6;
+ lwt->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT;
+
+ *ts = lwt;
+
+ return 0;
+free_cache:
+ dst_cache_destroy(&ilwt->cache);
+free_lwt:
+ kfree(lwt);
+ return err;
+}
+
+static int ioam6_do_fill(struct net *net, struct sk_buff *skb)
+{
+ struct ioam6_trace_hdr *trace;
+ struct ioam6_namespace *ns;
+
+ trace = (struct ioam6_trace_hdr *)(skb_transport_header(skb)
+ + sizeof(struct ipv6_hopopt_hdr) + 2
+ + sizeof(struct ioam6_hdr));
+
+ ns = ioam6_namespace(net, trace->namespace_id);
+ if (ns)
+ ioam6_fill_trace_data(skb, ns, trace, false);
+
+ return 0;
+}
+
+static int ioam6_do_inline(struct net *net, struct sk_buff *skb,
+ struct ioam6_lwt_encap *tuninfo,
+ struct dst_entry *cache_dst)
+{
+ struct ipv6hdr *oldhdr, *hdr;
+ int hdrlen, err;
+
+ hdrlen = (tuninfo->eh.hdrlen + 1) << 3;
+
+ err = skb_cow_head(skb, hdrlen + dst_dev_overhead(cache_dst, skb));
+ if (unlikely(err))
+ return err;
+
+ oldhdr = ipv6_hdr(skb);
+ skb_pull(skb, sizeof(*oldhdr));
+ skb_postpull_rcsum(skb, skb_network_header(skb), sizeof(*oldhdr));
+
+ skb_push(skb, sizeof(*oldhdr) + hdrlen);
+ skb_reset_network_header(skb);
+ skb_mac_header_rebuild(skb);
+
+ hdr = ipv6_hdr(skb);
+ memmove(hdr, oldhdr, sizeof(*oldhdr));
+ tuninfo->eh.nexthdr = hdr->nexthdr;
+
+ skb_set_transport_header(skb, sizeof(*hdr));
+ skb_postpush_rcsum(skb, hdr, sizeof(*hdr) + hdrlen);
+
+ memcpy(skb_transport_header(skb), (u8 *)tuninfo, hdrlen);
+
+ hdr->nexthdr = NEXTHDR_HOP;
+ hdr->payload_len = cpu_to_be16(skb->len - sizeof(*hdr));
+
+ return ioam6_do_fill(net, skb);
+}
+
+static int ioam6_do_encap(struct net *net, struct sk_buff *skb,
+ struct ioam6_lwt_encap *tuninfo,
+ bool has_tunsrc,
+ struct in6_addr *tunsrc,
+ struct in6_addr *tundst,
+ struct dst_entry *cache_dst)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ struct ipv6hdr *hdr, *inner_hdr;
+ int hdrlen, len, err;
+
+ hdrlen = (tuninfo->eh.hdrlen + 1) << 3;
+ len = sizeof(*hdr) + hdrlen;
+
+ err = skb_cow_head(skb, len + dst_dev_overhead(cache_dst, skb));
+ if (unlikely(err))
+ return err;
+
+ inner_hdr = ipv6_hdr(skb);
+
+ skb_push(skb, len);
+ skb_reset_network_header(skb);
+ skb_mac_header_rebuild(skb);
+ skb_set_transport_header(skb, sizeof(*hdr));
+
+ tuninfo->eh.nexthdr = NEXTHDR_IPV6;
+ memcpy(skb_transport_header(skb), (u8 *)tuninfo, hdrlen);
+
+ hdr = ipv6_hdr(skb);
+ memcpy(hdr, inner_hdr, sizeof(*hdr));
+
+ hdr->nexthdr = NEXTHDR_HOP;
+ hdr->payload_len = cpu_to_be16(skb->len - sizeof(*hdr));
+ hdr->daddr = *tundst;
+
+ if (has_tunsrc)
+ memcpy(&hdr->saddr, tunsrc, sizeof(*tunsrc));
+ else
+ ipv6_dev_get_saddr(net, dst_dev(dst), &hdr->daddr,
+ IPV6_PREFER_SRC_PUBLIC, &hdr->saddr);
+
+ skb_postpush_rcsum(skb, hdr, len);
+
+ return ioam6_do_fill(net, skb);
+}
+
+static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ struct dst_entry *orig_dst = skb_dst(skb);
+ struct dst_entry *dst = NULL;
+ struct ioam6_lwt *ilwt;
+ int err = -EINVAL;
+ u32 pkt_cnt;
+
+ if (skb->protocol != htons(ETH_P_IPV6))
+ goto drop;
+
+ ilwt = ioam6_lwt_state(orig_dst->lwtstate);
+
+ /* Check for insertion frequency (i.e., "k over n" insertions) */
+ pkt_cnt = atomic_fetch_inc(&ilwt->pkt_cnt);
+ if (pkt_cnt % ilwt->freq.n >= ilwt->freq.k)
+ goto out;
+
+ local_bh_disable();
+ dst = dst_cache_get(&ilwt->cache);
+ local_bh_enable();
+
+ /* This is how we notify that the destination does not change after
+ * transformation and that we need to use orig_dst instead of the cache
+ */
+ if (dst == &ilwt->null_dst) {
+ dst_release(dst);
+
+ dst = orig_dst;
+ /* keep refcount balance: dst_release() is called at the end */
+ dst_hold(dst);
+ }
+
+ switch (ilwt->mode) {
+ case IOAM6_IPTUNNEL_MODE_INLINE:
+do_inline:
+ /* Direct insertion - if there is no Hop-by-Hop yet */
+ if (ipv6_hdr(skb)->nexthdr == NEXTHDR_HOP)
+ goto out;
+
+ err = ioam6_do_inline(net, skb, &ilwt->tuninfo, dst);
+ if (unlikely(err))
+ goto drop;
+
+ break;
+ case IOAM6_IPTUNNEL_MODE_ENCAP:
+do_encap:
+ /* Encapsulation (ip6ip6) */
+ err = ioam6_do_encap(net, skb, &ilwt->tuninfo,
+ ilwt->has_tunsrc, &ilwt->tunsrc,
+ &ilwt->tundst, dst);
+ if (unlikely(err))
+ goto drop;
+
+ break;
+ case IOAM6_IPTUNNEL_MODE_AUTO:
+ /* Automatic (RFC8200 compliant):
+ * - local packets -> INLINE mode
+ * - in-transit packets -> ENCAP mode
+ */
+ if (!skb->dev)
+ goto do_inline;
+
+ goto do_encap;
+ default:
+ goto drop;
+ }
+
+ if (unlikely(!dst)) {
+ struct ipv6hdr *hdr = ipv6_hdr(skb);
+ struct flowi6 fl6;
+
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.daddr = hdr->daddr;
+ fl6.saddr = hdr->saddr;
+ fl6.flowlabel = ip6_flowinfo(hdr);
+ fl6.flowi6_mark = skb->mark;
+ fl6.flowi6_proto = hdr->nexthdr;
+
+ dst = ip6_route_output(net, NULL, &fl6);
+ if (dst->error) {
+ err = dst->error;
+ goto drop;
+ }
+
+ /* If the destination is the same after transformation (which is
+ * a valid use case for IOAM), then we don't want to add it to
+ * the cache in order to avoid a reference loop. Instead, we add
+ * our fake dst_entry to the cache as a way to detect this case.
+ * Otherwise, we add the resolved destination to the cache.
+ */
+ local_bh_disable();
+ if (orig_dst->lwtstate == dst->lwtstate)
+ dst_cache_set_ip6(&ilwt->cache,
+ &ilwt->null_dst, &fl6.saddr);
+ else
+ dst_cache_set_ip6(&ilwt->cache, dst, &fl6.saddr);
+ local_bh_enable();
+
+ err = skb_cow_head(skb, LL_RESERVED_SPACE(dst_dev(dst)));
+ if (unlikely(err))
+ goto drop;
+ }
+
+ /* avoid lwtunnel_output() reentry loop when destination is the same
+ * after transformation (e.g., with the inline mode)
+ */
+ if (orig_dst->lwtstate != dst->lwtstate) {
+ skb_dst_drop(skb);
+ skb_dst_set(skb, dst);
+ return dst_output(net, sk, skb);
+ }
+out:
+ dst_release(dst);
+ return orig_dst->lwtstate->orig_output(net, sk, skb);
+drop:
+ dst_release(dst);
+ kfree_skb(skb);
+ return err;
+}
+
+static void ioam6_destroy_state(struct lwtunnel_state *lwt)
+{
+ /* Since the refcount of per-cpu dst_entry caches will never be 0 (see
+ * why above) when our "fake" dst_entry is used, it is not necessary to
+ * remove them before calling dst_cache_destroy()
+ */
+ dst_cache_destroy(&ioam6_lwt_state(lwt)->cache);
+}
+
+static int ioam6_fill_encap_info(struct sk_buff *skb,
+ struct lwtunnel_state *lwtstate)
+{
+ struct ioam6_lwt *ilwt = ioam6_lwt_state(lwtstate);
+ int err;
+
+ err = nla_put_u32(skb, IOAM6_IPTUNNEL_FREQ_K, ilwt->freq.k);
+ if (err)
+ goto ret;
+
+ err = nla_put_u32(skb, IOAM6_IPTUNNEL_FREQ_N, ilwt->freq.n);
+ if (err)
+ goto ret;
+
+ err = nla_put_u8(skb, IOAM6_IPTUNNEL_MODE, ilwt->mode);
+ if (err)
+ goto ret;
+
+ if (ilwt->mode != IOAM6_IPTUNNEL_MODE_INLINE) {
+ if (ilwt->has_tunsrc) {
+ err = nla_put_in6_addr(skb, IOAM6_IPTUNNEL_SRC,
+ &ilwt->tunsrc);
+ if (err)
+ goto ret;
+ }
+
+ err = nla_put_in6_addr(skb, IOAM6_IPTUNNEL_DST, &ilwt->tundst);
+ if (err)
+ goto ret;
+ }
+
+ err = nla_put(skb, IOAM6_IPTUNNEL_TRACE, sizeof(ilwt->tuninfo.traceh),
+ &ilwt->tuninfo.traceh);
+ret:
+ return err;
+}
+
+static int ioam6_encap_nlsize(struct lwtunnel_state *lwtstate)
+{
+ struct ioam6_lwt *ilwt = ioam6_lwt_state(lwtstate);
+ int nlsize;
+
+ nlsize = nla_total_size(sizeof(ilwt->freq.k)) +
+ nla_total_size(sizeof(ilwt->freq.n)) +
+ nla_total_size(sizeof(ilwt->mode)) +
+ nla_total_size(sizeof(ilwt->tuninfo.traceh));
+
+ if (ilwt->mode != IOAM6_IPTUNNEL_MODE_INLINE) {
+ if (ilwt->has_tunsrc)
+ nlsize += nla_total_size(sizeof(ilwt->tunsrc));
+
+ nlsize += nla_total_size(sizeof(ilwt->tundst));
+ }
+
+ return nlsize;
+}
+
+static int ioam6_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
+{
+ struct ioam6_trace_hdr *trace_a = ioam6_lwt_trace(a);
+ struct ioam6_trace_hdr *trace_b = ioam6_lwt_trace(b);
+ struct ioam6_lwt *ilwt_a = ioam6_lwt_state(a);
+ struct ioam6_lwt *ilwt_b = ioam6_lwt_state(b);
+
+ return (ilwt_a->freq.k != ilwt_b->freq.k ||
+ ilwt_a->freq.n != ilwt_b->freq.n ||
+ ilwt_a->mode != ilwt_b->mode ||
+ ilwt_a->has_tunsrc != ilwt_b->has_tunsrc ||
+ (ilwt_a->mode != IOAM6_IPTUNNEL_MODE_INLINE &&
+ !ipv6_addr_equal(&ilwt_a->tundst, &ilwt_b->tundst)) ||
+ (ilwt_a->mode != IOAM6_IPTUNNEL_MODE_INLINE &&
+ ilwt_a->has_tunsrc &&
+ !ipv6_addr_equal(&ilwt_a->tunsrc, &ilwt_b->tunsrc)) ||
+ trace_a->namespace_id != trace_b->namespace_id);
+}
+
+static const struct lwtunnel_encap_ops ioam6_iptun_ops = {
+ .build_state = ioam6_build_state,
+ .destroy_state = ioam6_destroy_state,
+ .output = ioam6_output,
+ .fill_encap = ioam6_fill_encap_info,
+ .get_encap_size = ioam6_encap_nlsize,
+ .cmp_encap = ioam6_encap_cmp,
+ .owner = THIS_MODULE,
+};
+
+int __init ioam6_iptunnel_init(void)
+{
+ return lwtunnel_encap_add_ops(&ioam6_iptun_ops, LWTUNNEL_ENCAP_IOAM6);
+}
+
+void ioam6_iptunnel_exit(void)
+{
+ lwtunnel_encap_del_ops(&ioam6_iptun_ops, LWTUNNEL_ENCAP_IOAM6);
+}
diff --git a/net/ipv6/ip6_checksum.c b/net/ipv6/ip6_checksum.c
new file mode 100644
index 000000000000..377717045f8f
--- /dev/null
+++ b/net/ipv6/ip6_checksum.c
@@ -0,0 +1,137 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <net/ip.h>
+#include <net/udp.h>
+#include <net/udplite.h>
+#include <asm/checksum.h>
+
+#ifndef _HAVE_ARCH_IPV6_CSUM
+__sum16 csum_ipv6_magic(const struct in6_addr *saddr,
+ const struct in6_addr *daddr,
+ __u32 len, __u8 proto, __wsum csum)
+{
+
+ int carry;
+ __u32 ulen;
+ __u32 uproto;
+ __u32 sum = (__force u32)csum;
+
+ sum += (__force u32)saddr->s6_addr32[0];
+ carry = (sum < (__force u32)saddr->s6_addr32[0]);
+ sum += carry;
+
+ sum += (__force u32)saddr->s6_addr32[1];
+ carry = (sum < (__force u32)saddr->s6_addr32[1]);
+ sum += carry;
+
+ sum += (__force u32)saddr->s6_addr32[2];
+ carry = (sum < (__force u32)saddr->s6_addr32[2]);
+ sum += carry;
+
+ sum += (__force u32)saddr->s6_addr32[3];
+ carry = (sum < (__force u32)saddr->s6_addr32[3]);
+ sum += carry;
+
+ sum += (__force u32)daddr->s6_addr32[0];
+ carry = (sum < (__force u32)daddr->s6_addr32[0]);
+ sum += carry;
+
+ sum += (__force u32)daddr->s6_addr32[1];
+ carry = (sum < (__force u32)daddr->s6_addr32[1]);
+ sum += carry;
+
+ sum += (__force u32)daddr->s6_addr32[2];
+ carry = (sum < (__force u32)daddr->s6_addr32[2]);
+ sum += carry;
+
+ sum += (__force u32)daddr->s6_addr32[3];
+ carry = (sum < (__force u32)daddr->s6_addr32[3]);
+ sum += carry;
+
+ ulen = (__force u32)htonl((__u32) len);
+ sum += ulen;
+ carry = (sum < ulen);
+ sum += carry;
+
+ uproto = (__force u32)htonl(proto);
+ sum += uproto;
+ carry = (sum < uproto);
+ sum += carry;
+
+ return csum_fold((__force __wsum)sum);
+}
+EXPORT_SYMBOL(csum_ipv6_magic);
+#endif
+
+int udp6_csum_init(struct sk_buff *skb, struct udphdr *uh, int proto)
+{
+ int err;
+
+ UDP_SKB_CB(skb)->partial_cov = 0;
+ UDP_SKB_CB(skb)->cscov = skb->len;
+
+ if (proto == IPPROTO_UDPLITE) {
+ err = udplite_checksum_init(skb, uh);
+ if (err)
+ return err;
+
+ if (UDP_SKB_CB(skb)->partial_cov) {
+ skb->csum = ip6_compute_pseudo(skb, proto);
+ return 0;
+ }
+ }
+
+ /* To support RFC 6936 (allow zero checksum in UDP/IPV6 for tunnels)
+ * we accept a checksum of zero here. When we find the socket
+ * for the UDP packet we'll check if that socket allows zero checksum
+ * for IPv6 (set by socket option).
+ *
+ * Note, we are only interested in != 0 or == 0, thus the
+ * force to int.
+ */
+ err = (__force int)skb_checksum_init_zero_check(skb, proto, uh->check,
+ ip6_compute_pseudo);
+ if (err)
+ return err;
+
+ if (skb->ip_summed == CHECKSUM_COMPLETE && !skb->csum_valid) {
+ /* If SW calculated the value, we know it's bad */
+ if (skb->csum_complete_sw)
+ return 1;
+
+ /* HW says the value is bad. Let's validate that.
+ * skb->csum is no longer the full packet checksum,
+ * so don't treat is as such.
+ */
+ skb_checksum_complete_unset(skb);
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(udp6_csum_init);
+
+/* Function to set UDP checksum for an IPv6 UDP packet. This is intended
+ * for the simple case like when setting the checksum for a UDP tunnel.
+ */
+void udp6_set_csum(bool nocheck, struct sk_buff *skb,
+ const struct in6_addr *saddr,
+ const struct in6_addr *daddr, int len)
+{
+ struct udphdr *uh = udp_hdr(skb);
+
+ if (nocheck)
+ uh->check = 0;
+ else if (skb_is_gso(skb))
+ uh->check = ~udp_v6_check(len, saddr, daddr, 0);
+ else if (skb->ip_summed == CHECKSUM_PARTIAL) {
+ uh->check = 0;
+ uh->check = udp_v6_check(len, saddr, daddr, lco_csum(skb));
+ if (uh->check == 0)
+ uh->check = CSUM_MANGLED_0;
+ } else {
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ skb->csum_start = skb_transport_header(skb) - skb->head;
+ skb->csum_offset = offsetof(struct udphdr, check);
+ uh->check = ~udp_v6_check(len, saddr, daddr, 0);
+ }
+}
+EXPORT_SYMBOL(udp6_set_csum);
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index bf526115e518..2111af022d94 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -1,25 +1,21 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * Linux INET6 implementation
+ * Linux INET6 implementation
* Forwarding Information Database
*
* Authors:
- * Pedro Roque <roque@di.fc.ul.pt>
+ * Pedro Roque <roque@di.fc.ul.pt>
*
- * $Id: ip6_fib.c,v 1.25 2001/10/31 21:55:55 davem Exp $
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
+ * Changes:
+ * Yuji SEKIYA @USAGI: Support default route on router node;
+ * remove ip6_null_entry from the top of
+ * routing table.
+ * Ville Nuorvala: Fixed routing subtrees.
*/
-/*
- * Changes:
- * Yuji SEKIYA @USAGI: Support default route on router node;
- * remove ip6_null_entry from the top of
- * routing table.
- * Ville Nuorvala: Fixed routing subtrees.
- */
+#define pr_fmt(fmt) "IPv6: " fmt
+
+#include <linux/bpf.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/net.h>
@@ -28,61 +24,44 @@
#include <linux/in6.h>
#include <linux/init.h>
#include <linux/list.h>
+#include <linux/slab.h>
-#ifdef CONFIG_PROC_FS
-#include <linux/proc_fs.h>
-#endif
-
+#include <net/ip.h>
#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
+#include <net/lwtunnel.h>
+#include <net/fib_notifier.h>
+#include <net/ip_fib.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
-#define RT6_DEBUG 2
-
-#if RT6_DEBUG >= 3
-#define RT6_TRACE(x...) printk(KERN_DEBUG x)
-#else
-#define RT6_TRACE(x...) do { ; } while (0)
-#endif
-
-struct rt6_statistics rt6_stats;
+static struct kmem_cache *fib6_node_kmem __read_mostly;
-static kmem_cache_t * fib6_node_kmem __read_mostly;
-
-enum fib_walk_state_t
-{
-#ifdef CONFIG_IPV6_SUBTREES
- FWS_S,
-#endif
- FWS_L,
- FWS_R,
- FWS_C,
- FWS_U
-};
-
-struct fib6_cleaner_t
-{
- struct fib6_walker_t w;
- int (*func)(struct rt6_info *, void *arg);
+struct fib6_cleaner {
+ struct fib6_walker w;
+ struct net *net;
+ int (*func)(struct fib6_info *, void *arg);
+ int sernum;
void *arg;
+ bool skip_notify;
};
-static DEFINE_RWLOCK(fib6_walker_lock);
-
#ifdef CONFIG_IPV6_SUBTREES
#define FWS_INIT FWS_S
#else
#define FWS_INIT FWS_L
#endif
-static void fib6_prune_clones(struct fib6_node *fn, struct rt6_info *rt);
-static struct rt6_info * fib6_find_prefix(struct fib6_node *fn);
-static struct fib6_node * fib6_repair_tree(struct fib6_node *fn);
-static int fib6_walk(struct fib6_walker_t *w);
-static int fib6_walk_continue(struct fib6_walker_t *w);
+static struct fib6_info *fib6_find_prefix(struct net *net,
+ struct fib6_table *table,
+ struct fib6_node *fn);
+static struct fib6_node *fib6_repair_tree(struct net *net,
+ struct fib6_table *table,
+ struct fib6_node *fn);
+static int fib6_walk(struct net *net, struct fib6_walker *w);
+static int fib6_walk_continue(struct fib6_walker *w);
/*
* A routing update causes an increase of the serial number on the
@@ -91,98 +70,147 @@ static int fib6_walk_continue(struct fib6_walker_t *w);
* result of redirects, path MTU changes, etc.
*/
-static __u32 rt_sernum;
-
-static DEFINE_TIMER(ip6_fib_timer, fib6_run_gc, 0, 0);
+static void fib6_gc_timer_cb(struct timer_list *t);
-static struct fib6_walker_t fib6_walker_list = {
- .prev = &fib6_walker_list,
- .next = &fib6_walker_list,
-};
+#define FOR_WALKERS(net, w) \
+ list_for_each_entry(w, &(net)->ipv6.fib6_walkers, lh)
-#define FOR_WALKERS(w) for ((w)=fib6_walker_list.next; (w) != &fib6_walker_list; (w)=(w)->next)
+static void fib6_walker_link(struct net *net, struct fib6_walker *w)
+{
+ write_lock_bh(&net->ipv6.fib6_walker_lock);
+ list_add(&w->lh, &net->ipv6.fib6_walkers);
+ write_unlock_bh(&net->ipv6.fib6_walker_lock);
+}
-static inline void fib6_walker_link(struct fib6_walker_t *w)
+static void fib6_walker_unlink(struct net *net, struct fib6_walker *w)
{
- write_lock_bh(&fib6_walker_lock);
- w->next = fib6_walker_list.next;
- w->prev = &fib6_walker_list;
- w->next->prev = w;
- w->prev->next = w;
- write_unlock_bh(&fib6_walker_lock);
+ write_lock_bh(&net->ipv6.fib6_walker_lock);
+ list_del(&w->lh);
+ write_unlock_bh(&net->ipv6.fib6_walker_lock);
}
-static inline void fib6_walker_unlink(struct fib6_walker_t *w)
+static int fib6_new_sernum(struct net *net)
{
- write_lock_bh(&fib6_walker_lock);
- w->next->prev = w->prev;
- w->prev->next = w->next;
- w->prev = w->next = w;
- write_unlock_bh(&fib6_walker_lock);
+ int new, old = atomic_read(&net->ipv6.fib6_sernum);
+
+ do {
+ new = old < INT_MAX ? old + 1 : 1;
+ } while (!atomic_try_cmpxchg(&net->ipv6.fib6_sernum, &old, new));
+
+ return new;
}
-static __inline__ u32 fib6_new_sernum(void)
+
+enum {
+ FIB6_NO_SERNUM_CHANGE = 0,
+};
+
+void fib6_update_sernum(struct net *net, struct fib6_info *f6i)
{
- u32 n = ++rt_sernum;
- if ((__s32)n <= 0)
- rt_sernum = n = 1;
- return n;
+ struct fib6_node *fn;
+
+ fn = rcu_dereference_protected(f6i->fib6_node,
+ lockdep_is_held(&f6i->fib6_table->tb6_lock));
+ if (fn)
+ WRITE_ONCE(fn->fn_sernum, fib6_new_sernum(net));
}
/*
* Auxiliary address test functions for the radix tree.
*
- * These assume a 32bit processor (although it will work on
+ * These assume a 32bit processor (although it will work on
* 64bit processors)
*/
/*
* test bit
*/
+#if defined(__LITTLE_ENDIAN)
+# define BITOP_BE32_SWIZZLE (0x1F & ~7)
+#else
+# define BITOP_BE32_SWIZZLE 0
+#endif
+
+static __be32 addr_bit_set(const void *token, int fn_bit)
+{
+ const __be32 *addr = token;
+ /*
+ * Here,
+ * 1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f)
+ * is optimized version of
+ * htonl(1 << ((~fn_bit)&0x1F))
+ * See include/asm-generic/bitops/le.h.
+ */
+ return (__force __be32)(1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f)) &
+ addr[fn_bit >> 5];
+}
+
+struct fib6_info *fib6_info_alloc(gfp_t gfp_flags, bool with_fib6_nh)
+{
+ struct fib6_info *f6i;
+ size_t sz = sizeof(*f6i);
+
+ if (with_fib6_nh)
+ sz += sizeof(struct fib6_nh);
+
+ f6i = kzalloc(sz, gfp_flags);
+ if (!f6i)
+ return NULL;
+
+ /* fib6_siblings is a union with nh_list, so this initializes both */
+ INIT_LIST_HEAD(&f6i->fib6_siblings);
+ refcount_set(&f6i->fib6_ref, 1);
-static __inline__ __be32 addr_bit_set(void *token, int fn_bit)
+ INIT_HLIST_NODE(&f6i->gc_link);
+
+ return f6i;
+}
+
+void fib6_info_destroy_rcu(struct rcu_head *head)
{
- __be32 *addr = token;
+ struct fib6_info *f6i = container_of(head, struct fib6_info, rcu);
- return htonl(1 << ((~fn_bit)&0x1F)) & addr[fn_bit>>5];
+ WARN_ON(f6i->fib6_node);
+
+ if (f6i->nh)
+ nexthop_put(f6i->nh);
+ else
+ fib6_nh_release(f6i->fib6_nh);
+
+ ip_fib_metrics_put(f6i->fib6_metrics);
+ kfree(f6i);
}
+EXPORT_SYMBOL_GPL(fib6_info_destroy_rcu);
-static __inline__ struct fib6_node * node_alloc(void)
+static struct fib6_node *node_alloc(struct net *net)
{
struct fib6_node *fn;
- if ((fn = kmem_cache_alloc(fib6_node_kmem, SLAB_ATOMIC)) != NULL)
- memset(fn, 0, sizeof(struct fib6_node));
+ fn = kmem_cache_zalloc(fib6_node_kmem, GFP_ATOMIC);
+ if (fn)
+ net->ipv6.rt6_stats->fib_nodes++;
return fn;
}
-static __inline__ void node_free(struct fib6_node * fn)
+static void node_free_immediate(struct net *net, struct fib6_node *fn)
{
kmem_cache_free(fib6_node_kmem, fn);
+ net->ipv6.rt6_stats->fib_nodes--;
}
-static __inline__ void rt6_release(struct rt6_info *rt)
+static void node_free(struct net *net, struct fib6_node *fn)
{
- if (atomic_dec_and_test(&rt->rt6i_ref))
- dst_free(&rt->u.dst);
+ kfree_rcu(fn, rcu);
+ net->ipv6.rt6_stats->fib_nodes--;
}
-static struct fib6_table fib6_main_tbl = {
- .tb6_id = RT6_TABLE_MAIN,
- .tb6_root = {
- .leaf = &ip6_null_entry,
- .fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO,
- },
-};
-
-#ifdef CONFIG_IPV6_MULTIPLE_TABLES
-#define FIB_TABLE_HASHSZ 256
-#else
-#define FIB_TABLE_HASHSZ 1
-#endif
-static struct hlist_head fib_table_hash[FIB_TABLE_HASHSZ];
+static void fib6_free_table(struct fib6_table *table)
+{
+ inetpeer_invalidate_tree(&table->tb6_peers);
+ kfree(table);
+}
-static void fib6_link_table(struct fib6_table *tb)
+static void fib6_link_table(struct net *net, struct fib6_table *tb)
{
unsigned int h;
@@ -190,122 +218,339 @@ static void fib6_link_table(struct fib6_table *tb)
* Initialize table lock at a single place to give lockdep a key,
* tables aren't visible prior to being linked to the list.
*/
- rwlock_init(&tb->tb6_lock);
-
- h = tb->tb6_id & (FIB_TABLE_HASHSZ - 1);
+ spin_lock_init(&tb->tb6_lock);
+ h = tb->tb6_id & (FIB6_TABLE_HASHSZ - 1);
/*
* No protection necessary, this is the only list mutatation
* operation, tables never disappear once they exist.
*/
- hlist_add_head_rcu(&tb->tb6_hlist, &fib_table_hash[h]);
+ hlist_add_head_rcu(&tb->tb6_hlist, &net->ipv6.fib_table_hash[h]);
}
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
-static struct fib6_table fib6_local_tbl = {
- .tb6_id = RT6_TABLE_LOCAL,
- .tb6_root = {
- .leaf = &ip6_null_entry,
- .fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO,
- },
-};
-static struct fib6_table *fib6_alloc_table(u32 id)
+static struct fib6_table *fib6_alloc_table(struct net *net, u32 id)
{
struct fib6_table *table;
table = kzalloc(sizeof(*table), GFP_ATOMIC);
- if (table != NULL) {
+ if (table) {
table->tb6_id = id;
- table->tb6_root.leaf = &ip6_null_entry;
+ rcu_assign_pointer(table->tb6_root.leaf,
+ net->ipv6.fib6_null_entry);
table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
+ inet_peer_base_init(&table->tb6_peers);
+ INIT_HLIST_HEAD(&table->tb6_gc_hlist);
}
return table;
}
-struct fib6_table *fib6_new_table(u32 id)
+struct fib6_table *fib6_new_table(struct net *net, u32 id)
{
- struct fib6_table *tb;
+ struct fib6_table *tb, *new_tb;
if (id == 0)
id = RT6_TABLE_MAIN;
- tb = fib6_get_table(id);
+
+ tb = fib6_get_table(net, id);
if (tb)
return tb;
- tb = fib6_alloc_table(id);
- if (tb != NULL)
- fib6_link_table(tb);
+ new_tb = fib6_alloc_table(net, id);
+ if (!new_tb)
+ return NULL;
+
+ spin_lock_bh(&net->ipv6.fib_table_hash_lock);
+
+ tb = fib6_get_table(net, id);
+ if (unlikely(tb)) {
+ spin_unlock_bh(&net->ipv6.fib_table_hash_lock);
+ kfree(new_tb);
+ return tb;
+ }
+
+ fib6_link_table(net, new_tb);
+
+ spin_unlock_bh(&net->ipv6.fib_table_hash_lock);
- return tb;
+ return new_tb;
}
+EXPORT_SYMBOL_GPL(fib6_new_table);
-struct fib6_table *fib6_get_table(u32 id)
+struct fib6_table *fib6_get_table(struct net *net, u32 id)
{
+ struct hlist_head *head;
struct fib6_table *tb;
- struct hlist_node *node;
- unsigned int h;
- if (id == 0)
+ if (!id)
id = RT6_TABLE_MAIN;
- h = id & (FIB_TABLE_HASHSZ - 1);
- rcu_read_lock();
- hlist_for_each_entry_rcu(tb, node, &fib_table_hash[h], tb6_hlist) {
- if (tb->tb6_id == id) {
- rcu_read_unlock();
+
+ head = &net->ipv6.fib_table_hash[id & (FIB6_TABLE_HASHSZ - 1)];
+
+ /* See comment in fib6_link_table(). RCU is not required,
+ * but rcu_dereference_raw() is used to avoid data-race.
+ */
+ hlist_for_each_entry_rcu(tb, head, tb6_hlist, true)
+ if (tb->tb6_id == id)
return tb;
- }
- }
- rcu_read_unlock();
return NULL;
}
+EXPORT_SYMBOL_GPL(fib6_get_table);
-static void __init fib6_tables_init(void)
+static void __net_init fib6_tables_init(struct net *net)
{
- fib6_link_table(&fib6_main_tbl);
- fib6_link_table(&fib6_local_tbl);
+ fib6_link_table(net, net->ipv6.fib6_main_tbl);
+ fib6_link_table(net, net->ipv6.fib6_local_tbl);
}
-
#else
-struct fib6_table *fib6_new_table(u32 id)
+struct fib6_table *fib6_new_table(struct net *net, u32 id)
+{
+ return fib6_get_table(net, id);
+}
+
+struct fib6_table *fib6_get_table(struct net *net, u32 id)
{
- return fib6_get_table(id);
+ return net->ipv6.fib6_main_tbl;
}
-struct fib6_table *fib6_get_table(u32 id)
+struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
+ const struct sk_buff *skb,
+ int flags, pol_lookup_t lookup)
{
- return &fib6_main_tbl;
+ struct rt6_info *rt;
+
+ rt = pol_lookup_func(lookup,
+ net, net->ipv6.fib6_main_tbl, fl6, skb, flags);
+ if (rt->dst.error == -EAGAIN) {
+ ip6_rt_put_flags(rt, flags);
+ rt = net->ipv6.ip6_null_entry;
+ if (!(flags & RT6_LOOKUP_F_DST_NOREF))
+ dst_hold(&rt->dst);
+ }
+
+ return &rt->dst;
}
-struct dst_entry *fib6_rule_lookup(struct flowi *fl, int flags,
- pol_lookup_t lookup)
+/* called with rcu lock held; no reference taken on fib6_info */
+int fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
+ struct fib6_result *res, int flags)
{
- return (struct dst_entry *) lookup(&fib6_main_tbl, fl, flags);
+ return fib6_table_lookup(net, net->ipv6.fib6_main_tbl, oif, fl6,
+ res, flags);
}
-static void __init fib6_tables_init(void)
+static void __net_init fib6_tables_init(struct net *net)
{
- fib6_link_table(&fib6_main_tbl);
+ fib6_link_table(net, net->ipv6.fib6_main_tbl);
}
#endif
-static int fib6_dump_node(struct fib6_walker_t *w)
+unsigned int fib6_tables_seq_read(const struct net *net)
+{
+ unsigned int h, fib_seq = 0;
+
+ rcu_read_lock();
+ for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
+ const struct hlist_head *head = &net->ipv6.fib_table_hash[h];
+ const struct fib6_table *tb;
+
+ hlist_for_each_entry_rcu(tb, head, tb6_hlist)
+ fib_seq += READ_ONCE(tb->fib_seq);
+ }
+ rcu_read_unlock();
+
+ return fib_seq;
+}
+
+static int call_fib6_entry_notifier(struct notifier_block *nb,
+ enum fib_event_type event_type,
+ struct fib6_info *rt,
+ struct netlink_ext_ack *extack)
+{
+ struct fib6_entry_notifier_info info = {
+ .info.extack = extack,
+ .rt = rt,
+ };
+
+ return call_fib6_notifier(nb, event_type, &info.info);
+}
+
+static int call_fib6_multipath_entry_notifier(struct notifier_block *nb,
+ enum fib_event_type event_type,
+ struct fib6_info *rt,
+ unsigned int nsiblings,
+ struct netlink_ext_ack *extack)
+{
+ struct fib6_entry_notifier_info info = {
+ .info.extack = extack,
+ .rt = rt,
+ .nsiblings = nsiblings,
+ };
+
+ return call_fib6_notifier(nb, event_type, &info.info);
+}
+
+int call_fib6_entry_notifiers(struct net *net,
+ enum fib_event_type event_type,
+ struct fib6_info *rt,
+ struct netlink_ext_ack *extack)
+{
+ struct fib6_entry_notifier_info info = {
+ .info.extack = extack,
+ .rt = rt,
+ };
+
+ WRITE_ONCE(rt->fib6_table->fib_seq, rt->fib6_table->fib_seq + 1);
+ return call_fib6_notifiers(net, event_type, &info.info);
+}
+
+int call_fib6_multipath_entry_notifiers(struct net *net,
+ enum fib_event_type event_type,
+ struct fib6_info *rt,
+ unsigned int nsiblings,
+ struct netlink_ext_ack *extack)
+{
+ struct fib6_entry_notifier_info info = {
+ .info.extack = extack,
+ .rt = rt,
+ .nsiblings = nsiblings,
+ };
+
+ WRITE_ONCE(rt->fib6_table->fib_seq, rt->fib6_table->fib_seq + 1);
+ return call_fib6_notifiers(net, event_type, &info.info);
+}
+
+int call_fib6_entry_notifiers_replace(struct net *net, struct fib6_info *rt)
+{
+ struct fib6_entry_notifier_info info = {
+ .rt = rt,
+ .nsiblings = rt->fib6_nsiblings,
+ };
+
+ WRITE_ONCE(rt->fib6_table->fib_seq, rt->fib6_table->fib_seq + 1);
+ return call_fib6_notifiers(net, FIB_EVENT_ENTRY_REPLACE, &info.info);
+}
+
+struct fib6_dump_arg {
+ struct net *net;
+ struct notifier_block *nb;
+ struct netlink_ext_ack *extack;
+};
+
+static int fib6_rt_dump(struct fib6_info *rt, struct fib6_dump_arg *arg)
+{
+ enum fib_event_type fib_event = FIB_EVENT_ENTRY_REPLACE;
+ unsigned int nsiblings;
+ int err;
+
+ if (!rt || rt == arg->net->ipv6.fib6_null_entry)
+ return 0;
+
+ nsiblings = READ_ONCE(rt->fib6_nsiblings);
+ if (nsiblings)
+ err = call_fib6_multipath_entry_notifier(arg->nb, fib_event,
+ rt,
+ nsiblings,
+ arg->extack);
+ else
+ err = call_fib6_entry_notifier(arg->nb, fib_event, rt,
+ arg->extack);
+
+ return err;
+}
+
+static int fib6_node_dump(struct fib6_walker *w)
+{
+ int err;
+
+ err = fib6_rt_dump(w->leaf, w->args);
+ w->leaf = NULL;
+ return err;
+}
+
+static int fib6_table_dump(struct net *net, struct fib6_table *tb,
+ struct fib6_walker *w)
+{
+ int err;
+
+ w->root = &tb->tb6_root;
+ spin_lock_bh(&tb->tb6_lock);
+ err = fib6_walk(net, w);
+ spin_unlock_bh(&tb->tb6_lock);
+ return err;
+}
+
+/* Called with rcu_read_lock() */
+int fib6_tables_dump(struct net *net, struct notifier_block *nb,
+ struct netlink_ext_ack *extack)
+{
+ struct fib6_dump_arg arg;
+ struct fib6_walker *w;
+ unsigned int h;
+ int err = 0;
+
+ w = kzalloc(sizeof(*w), GFP_ATOMIC);
+ if (!w)
+ return -ENOMEM;
+
+ w->func = fib6_node_dump;
+ arg.net = net;
+ arg.nb = nb;
+ arg.extack = extack;
+ w->args = &arg;
+
+ for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
+ struct hlist_head *head = &net->ipv6.fib_table_hash[h];
+ struct fib6_table *tb;
+
+ hlist_for_each_entry_rcu(tb, head, tb6_hlist) {
+ err = fib6_table_dump(net, tb, w);
+ if (err)
+ goto out;
+ }
+ }
+
+out:
+ kfree(w);
+
+ /* The tree traversal function should never return a positive value. */
+ return err > 0 ? -EINVAL : err;
+}
+
+static int fib6_dump_node(struct fib6_walker *w)
{
int res;
- struct rt6_info *rt;
+ struct fib6_info *rt;
- for (rt = w->leaf; rt; rt = rt->u.next) {
- res = rt6_dump_route(rt, w->args);
- if (res < 0) {
+ for_each_fib6_walker_rt(w) {
+ res = rt6_dump_route(rt, w->args, w->skip_in_node);
+ if (res >= 0) {
/* Frame is full, suspend walking */
w->leaf = rt;
+
+ /* We'll restart from this node, so if some routes were
+ * already dumped, skip them next time.
+ */
+ w->skip_in_node += res;
+
return 1;
}
- BUG_TRAP(res!=0);
+ w->skip_in_node = 0;
+
+ /* Multipath routes are dumped in one route with the
+ * RTA_MULTIPATH attribute. Jump 'rt' to point to the
+ * last sibling of this route (no need to dump the
+ * sibling routes again)
+ */
+ if (rt->fib6_nsiblings)
+ rt = list_last_entry(&rt->fib6_siblings,
+ struct fib6_info,
+ fib6_siblings);
}
w->leaf = NULL;
return 0;
@@ -313,13 +558,18 @@ static int fib6_dump_node(struct fib6_walker_t *w)
static void fib6_dump_end(struct netlink_callback *cb)
{
- struct fib6_walker_t *w = (void*)cb->args[2];
+ struct net *net = sock_net(cb->skb->sk);
+ struct fib6_walker *w = (void *)cb->args[2];
if (w) {
+ if (cb->args[4]) {
+ cb->args[4] = 0;
+ fib6_walker_unlink(net, w);
+ }
cb->args[2] = 0;
kfree(w);
}
- cb->done = (void*)cb->args[3];
+ cb->done = (void *)cb->args[3];
cb->args[1] = 3;
}
@@ -332,77 +582,133 @@ static int fib6_dump_done(struct netlink_callback *cb)
static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb,
struct netlink_callback *cb)
{
- struct fib6_walker_t *w;
+ struct net *net = sock_net(skb->sk);
+ struct fib6_walker *w;
int res;
w = (void *)cb->args[2];
w->root = &table->tb6_root;
if (cb->args[4] == 0) {
- read_lock_bh(&table->tb6_lock);
- res = fib6_walk(w);
- read_unlock_bh(&table->tb6_lock);
- if (res > 0)
+ w->count = 0;
+ w->skip = 0;
+ w->skip_in_node = 0;
+
+ spin_lock_bh(&table->tb6_lock);
+ res = fib6_walk(net, w);
+ spin_unlock_bh(&table->tb6_lock);
+ if (res > 0) {
cb->args[4] = 1;
+ cb->args[5] = READ_ONCE(w->root->fn_sernum);
+ }
} else {
- read_lock_bh(&table->tb6_lock);
+ int sernum = READ_ONCE(w->root->fn_sernum);
+ if (cb->args[5] != sernum) {
+ /* Begin at the root if the tree changed */
+ cb->args[5] = sernum;
+ w->state = FWS_INIT;
+ w->node = w->root;
+ w->skip = w->count;
+ w->skip_in_node = 0;
+ } else
+ w->skip = 0;
+
+ spin_lock_bh(&table->tb6_lock);
res = fib6_walk_continue(w);
- read_unlock_bh(&table->tb6_lock);
- if (res != 0) {
- if (res < 0)
- fib6_walker_unlink(w);
- goto end;
+ spin_unlock_bh(&table->tb6_lock);
+ if (res <= 0) {
+ fib6_walker_unlink(net, w);
+ cb->args[4] = 0;
}
- fib6_walker_unlink(w);
- cb->args[4] = 0;
}
-end:
+
return res;
}
-int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
+static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
{
- unsigned int h, s_h;
+ struct rt6_rtnl_dump_arg arg = {
+ .filter.dump_exceptions = true,
+ .filter.dump_routes = true,
+ .filter.rtnl_held = false,
+ };
+ const struct nlmsghdr *nlh = cb->nlh;
+ struct net *net = sock_net(skb->sk);
unsigned int e = 0, s_e;
- struct rt6_rtnl_dump_arg arg;
- struct fib6_walker_t *w;
+ struct hlist_head *head;
+ struct fib6_walker *w;
struct fib6_table *tb;
- struct hlist_node *node;
- int res = 0;
+ unsigned int h, s_h;
+ int err = 0;
- s_h = cb->args[0];
- s_e = cb->args[1];
+ rcu_read_lock();
+ if (cb->strict_check) {
+ err = ip_valid_fib_dump_req(net, nlh, &arg.filter, cb);
+ if (err < 0)
+ goto unlock;
+ } else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) {
+ struct rtmsg *rtm = nlmsg_data(nlh);
+
+ if (rtm->rtm_flags & RTM_F_PREFIX)
+ arg.filter.flags = RTM_F_PREFIX;
+ }
w = (void *)cb->args[2];
- if (w == NULL) {
+ if (!w) {
/* New dump:
*
- * 1. hook callback destructor.
- */
- cb->args[3] = (long)cb->done;
- cb->done = fib6_dump_done;
-
- /*
- * 2. allocate and initialize walker.
+ * 1. allocate and initialize walker.
*/
w = kzalloc(sizeof(*w), GFP_ATOMIC);
- if (w == NULL)
- return -ENOMEM;
+ if (!w) {
+ err = -ENOMEM;
+ goto unlock;
+ }
w->func = fib6_dump_node;
cb->args[2] = (long)w;
+
+ /* 2. hook callback destructor.
+ */
+ cb->args[3] = (long)cb->done;
+ cb->done = fib6_dump_done;
+
}
arg.skb = skb;
arg.cb = cb;
+ arg.net = net;
w->args = &arg;
- for (h = s_h; h < FIB_TABLE_HASHSZ; h++, s_e = 0) {
+ if (arg.filter.table_id) {
+ tb = fib6_get_table(net, arg.filter.table_id);
+ if (!tb) {
+ if (rtnl_msg_family(cb->nlh) != PF_INET6)
+ goto unlock;
+
+ NL_SET_ERR_MSG_MOD(cb->extack, "FIB table does not exist");
+ err = -ENOENT;
+ goto unlock;
+ }
+
+ if (!cb->args[0]) {
+ err = fib6_dump_table(tb, skb, cb);
+ if (!err)
+ cb->args[0] = 1;
+ }
+ goto unlock;
+ }
+
+ s_h = cb->args[0];
+ s_e = cb->args[1];
+
+ for (h = s_h; h < FIB6_TABLE_HASHSZ; h++, s_e = 0) {
e = 0;
- hlist_for_each_entry(tb, node, &fib_table_hash[h], tb6_hlist) {
+ head = &net->ipv6.fib_table_hash[h];
+ hlist_for_each_entry_rcu(tb, head, tb6_hlist) {
if (e < s_e)
goto next;
- res = fib6_dump_table(tb, skb, cb);
- if (res != 0)
+ err = fib6_dump_table(tb, skb, cb);
+ if (err != 0)
goto out;
next:
e++;
@@ -412,10 +718,29 @@ out:
cb->args[1] = e;
cb->args[0] = h;
- res = res < 0 ? res : skb->len;
- if (res <= 0)
+unlock:
+ rcu_read_unlock();
+ if (err <= 0)
fib6_dump_end(cb);
- return res;
+ return err;
+}
+
+void fib6_metric_set(struct fib6_info *f6i, int metric, u32 val)
+{
+ if (!f6i)
+ return;
+
+ if (f6i->fib6_metrics == &dst_default_metrics) {
+ struct dst_metrics *p = kzalloc(sizeof(*p), GFP_ATOMIC);
+
+ if (!p)
+ return;
+
+ refcount_set(&p->refcnt, 1);
+ f6i->fib6_metrics = p;
+ }
+
+ f6i->fib6_metrics->metrics[metric - 1] = val;
}
/*
@@ -426,120 +751,156 @@ out:
* node.
*/
-static struct fib6_node * fib6_add_1(struct fib6_node *root, void *addr,
- int addrlen, int plen,
- int offset)
+static struct fib6_node *fib6_add_1(struct net *net,
+ struct fib6_table *table,
+ struct fib6_node *root,
+ struct in6_addr *addr, int plen,
+ int offset, int allow_create,
+ int replace_required,
+ struct netlink_ext_ack *extack)
{
struct fib6_node *fn, *in, *ln;
struct fib6_node *pn = NULL;
struct rt6key *key;
int bit;
- __be32 dir = 0;
- __u32 sernum = fib6_new_sernum();
-
- RT6_TRACE("fib6_add_1\n");
+ __be32 dir = 0;
/* insert node in tree */
fn = root;
do {
- key = (struct rt6key *)((u8 *)fn->leaf + offset);
+ struct fib6_info *leaf = rcu_dereference_protected(fn->leaf,
+ lockdep_is_held(&table->tb6_lock));
+ key = (struct rt6key *)((u8 *)leaf + offset);
/*
* Prefix match
*/
if (plen < fn->fn_bit ||
- !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit))
+ !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) {
+ if (!allow_create) {
+ if (replace_required) {
+ NL_SET_ERR_MSG(extack,
+ "Can not replace route - no match found");
+ pr_warn("Can't replace route, no match found\n");
+ return ERR_PTR(-ENOENT);
+ }
+ pr_warn("NLM_F_CREATE should be set when creating new route\n");
+ }
goto insert_above;
-
+ }
+
/*
* Exact match ?
*/
-
+
if (plen == fn->fn_bit) {
/* clean up an intermediate node */
- if ((fn->fn_flags & RTN_RTINFO) == 0) {
- rt6_release(fn->leaf);
- fn->leaf = NULL;
+ if (!(fn->fn_flags & RTN_RTINFO)) {
+ RCU_INIT_POINTER(fn->leaf, NULL);
+ fib6_info_release(leaf);
+ /* remove null_entry in the root node */
+ } else if (fn->fn_flags & RTN_TL_ROOT &&
+ rcu_access_pointer(fn->leaf) ==
+ net->ipv6.fib6_null_entry) {
+ RCU_INIT_POINTER(fn->leaf, NULL);
}
-
- fn->fn_sernum = sernum;
-
+
return fn;
}
/*
* We have more bits to go
*/
-
+
/* Try to walk down on tree. */
- fn->fn_sernum = sernum;
dir = addr_bit_set(addr, fn->fn_bit);
pn = fn;
- fn = dir ? fn->right: fn->left;
+ fn = dir ?
+ rcu_dereference_protected(fn->right,
+ lockdep_is_held(&table->tb6_lock)) :
+ rcu_dereference_protected(fn->left,
+ lockdep_is_held(&table->tb6_lock));
} while (fn);
+ if (!allow_create) {
+ /* We should not create new node because
+ * NLM_F_REPLACE was specified without NLM_F_CREATE
+ * I assume it is safe to require NLM_F_CREATE when
+ * REPLACE flag is used! Later we may want to remove the
+ * check for replace_required, because according
+ * to netlink specification, NLM_F_CREATE
+ * MUST be specified if new route is created.
+ * That would keep IPv6 consistent with IPv4
+ */
+ if (replace_required) {
+ NL_SET_ERR_MSG(extack,
+ "Can not replace route - no match found");
+ pr_warn("Can't replace route, no match found\n");
+ return ERR_PTR(-ENOENT);
+ }
+ pr_warn("NLM_F_CREATE should be set when creating new route\n");
+ }
/*
* We walked to the bottom of tree.
* Create new leaf node without children.
*/
- ln = node_alloc();
+ ln = node_alloc(net);
- if (ln == NULL)
- return NULL;
+ if (!ln)
+ return ERR_PTR(-ENOMEM);
ln->fn_bit = plen;
-
- ln->parent = pn;
- ln->fn_sernum = sernum;
+ RCU_INIT_POINTER(ln->parent, pn);
if (dir)
- pn->right = ln;
+ rcu_assign_pointer(pn->right, ln);
else
- pn->left = ln;
+ rcu_assign_pointer(pn->left, ln);
return ln;
insert_above:
/*
- * split since we don't have a common prefix anymore or
+ * split since we don't have a common prefix anymore or
* we have a less significant route.
* we've to insert an intermediate node on the list
* this new node will point to the one we need to create
* and the current
*/
- pn = fn->parent;
+ pn = rcu_dereference_protected(fn->parent,
+ lockdep_is_held(&table->tb6_lock));
/* find 1st bit in difference between the 2 addrs.
See comment in __ipv6_addr_diff: bit may be an invalid value,
but if it is >= plen, the value is ignored in any case.
*/
-
- bit = __ipv6_addr_diff(addr, &key->addr, addrlen);
- /*
- * (intermediate)[in]
+ bit = __ipv6_addr_diff(addr, &key->addr, sizeof(*addr));
+
+ /*
+ * (intermediate)[in]
* / \
* (new leaf node)[ln] (old node)[fn]
*/
if (plen > bit) {
- in = node_alloc();
- ln = node_alloc();
-
- if (in == NULL || ln == NULL) {
+ in = node_alloc(net);
+ ln = node_alloc(net);
+
+ if (!in || !ln) {
if (in)
- node_free(in);
+ node_free_immediate(net, in);
if (ln)
- node_free(ln);
- return NULL;
+ node_free_immediate(net, ln);
+ return ERR_PTR(-ENOMEM);
}
- /*
- * new intermediate node.
+ /*
+ * new intermediate node.
* RTN_RTINFO will
* be off since that an address that chooses one of
* the branches would not match less specific routes
@@ -548,174 +909,555 @@ insert_above:
in->fn_bit = bit;
- in->parent = pn;
+ RCU_INIT_POINTER(in->parent, pn);
in->leaf = fn->leaf;
- atomic_inc(&in->leaf->rt6i_ref);
-
- in->fn_sernum = sernum;
+ fib6_info_hold(rcu_dereference_protected(in->leaf,
+ lockdep_is_held(&table->tb6_lock)));
/* update parent pointer */
if (dir)
- pn->right = in;
+ rcu_assign_pointer(pn->right, in);
else
- pn->left = in;
+ rcu_assign_pointer(pn->left, in);
ln->fn_bit = plen;
- ln->parent = in;
- fn->parent = in;
-
- ln->fn_sernum = sernum;
+ RCU_INIT_POINTER(ln->parent, in);
+ rcu_assign_pointer(fn->parent, in);
if (addr_bit_set(addr, bit)) {
- in->right = ln;
- in->left = fn;
+ rcu_assign_pointer(in->right, ln);
+ rcu_assign_pointer(in->left, fn);
} else {
- in->left = ln;
- in->right = fn;
+ rcu_assign_pointer(in->left, ln);
+ rcu_assign_pointer(in->right, fn);
}
} else { /* plen <= bit */
- /*
+ /*
* (new leaf node)[ln]
* / \
* (old node)[fn] NULL
*/
- ln = node_alloc();
+ ln = node_alloc(net);
- if (ln == NULL)
- return NULL;
+ if (!ln)
+ return ERR_PTR(-ENOMEM);
ln->fn_bit = plen;
- ln->parent = pn;
-
- ln->fn_sernum = sernum;
-
- if (dir)
- pn->right = ln;
- else
- pn->left = ln;
+ RCU_INIT_POINTER(ln->parent, pn);
if (addr_bit_set(&key->addr, plen))
- ln->right = fn;
+ RCU_INIT_POINTER(ln->right, fn);
else
- ln->left = fn;
+ RCU_INIT_POINTER(ln->left, fn);
- fn->parent = ln;
+ rcu_assign_pointer(fn->parent, ln);
+
+ if (dir)
+ rcu_assign_pointer(pn->right, ln);
+ else
+ rcu_assign_pointer(pn->left, ln);
}
return ln;
}
+static void __fib6_drop_pcpu_from(struct fib6_nh *fib6_nh,
+ const struct fib6_info *match)
+{
+ int cpu;
+
+ if (!fib6_nh->rt6i_pcpu)
+ return;
+
+ rcu_read_lock();
+ /* release the reference to this fib entry from
+ * all of its cached pcpu routes
+ */
+ for_each_possible_cpu(cpu) {
+ struct rt6_info **ppcpu_rt;
+ struct rt6_info *pcpu_rt;
+
+ ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu);
+
+ /* Paired with xchg() in rt6_get_pcpu_route() */
+ pcpu_rt = READ_ONCE(*ppcpu_rt);
+
+ /* only dropping the 'from' reference if the cached route
+ * is using 'match'. The cached pcpu_rt->from only changes
+ * from a fib6_info to NULL (ip6_dst_destroy); it can never
+ * change from one fib6_info reference to another
+ */
+ if (pcpu_rt && rcu_access_pointer(pcpu_rt->from) == match) {
+ struct fib6_info *from;
+
+ from = unrcu_pointer(xchg(&pcpu_rt->from, NULL));
+ fib6_info_release(from);
+ }
+ }
+ rcu_read_unlock();
+}
+
+static int fib6_nh_drop_pcpu_from(struct fib6_nh *nh, void *_arg)
+{
+ struct fib6_info *arg = _arg;
+
+ __fib6_drop_pcpu_from(nh, arg);
+ return 0;
+}
+
+static void fib6_drop_pcpu_from(struct fib6_info *f6i)
+{
+ /* Make sure rt6_make_pcpu_route() wont add other percpu routes
+ * while we are cleaning them here.
+ */
+ f6i->fib6_destroying = 1;
+ mb(); /* paired with the cmpxchg() in rt6_make_pcpu_route() */
+
+ if (f6i->nh) {
+ rcu_read_lock();
+ nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_drop_pcpu_from, f6i);
+ rcu_read_unlock();
+ } else {
+ struct fib6_nh *fib6_nh;
+
+ fib6_nh = f6i->fib6_nh;
+ __fib6_drop_pcpu_from(fib6_nh, f6i);
+ }
+}
+
+static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn,
+ struct net *net)
+{
+ struct fib6_table *table = rt->fib6_table;
+
+ /* Flush all cached dst in exception table */
+ rt6_flush_exceptions(rt);
+ fib6_drop_pcpu_from(rt);
+
+ if (rt->nh) {
+ spin_lock(&rt->nh->lock);
+
+ if (!list_empty(&rt->nh_list))
+ list_del_init(&rt->nh_list);
+
+ spin_unlock(&rt->nh->lock);
+ }
+
+ if (refcount_read(&rt->fib6_ref) != 1) {
+ /* This route is used as dummy address holder in some split
+ * nodes. It is not leaked, but it still holds other resources,
+ * which must be released in time. So, scan ascendant nodes
+ * and replace dummy references to this route with references
+ * to still alive ones.
+ */
+ while (fn) {
+ struct fib6_info *leaf = rcu_dereference_protected(fn->leaf,
+ lockdep_is_held(&table->tb6_lock));
+ struct fib6_info *new_leaf;
+ if (!(fn->fn_flags & RTN_RTINFO) && leaf == rt) {
+ new_leaf = fib6_find_prefix(net, table, fn);
+ fib6_info_hold(new_leaf);
+
+ rcu_assign_pointer(fn->leaf, new_leaf);
+ fib6_info_release(rt);
+ }
+ fn = rcu_dereference_protected(fn->parent,
+ lockdep_is_held(&table->tb6_lock));
+ }
+ }
+
+ fib6_clean_expires(rt);
+ fib6_remove_gc_list(rt);
+}
+
/*
* Insert routing information in a node.
*/
-static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
- struct nl_info *info)
+static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
+ struct nl_info *info, struct netlink_ext_ack *extack,
+ struct list_head *purge_list)
{
- struct rt6_info *iter = NULL;
- struct rt6_info **ins;
+ struct fib6_info *leaf = rcu_dereference_protected(fn->leaf,
+ lockdep_is_held(&rt->fib6_table->tb6_lock));
+ struct fib6_info *iter = NULL;
+ struct fib6_info __rcu **ins;
+ struct fib6_info __rcu **fallback_ins = NULL;
+ int replace = (info->nlh &&
+ (info->nlh->nlmsg_flags & NLM_F_REPLACE));
+ int add = (!info->nlh ||
+ (info->nlh->nlmsg_flags & NLM_F_CREATE));
+ int found = 0;
+ bool rt_can_ecmp = rt6_qualify_for_ecmp(rt);
+ bool notify_sibling_rt = false;
+ u16 nlflags = NLM_F_EXCL;
+ int err;
+
+ if (info->nlh && (info->nlh->nlmsg_flags & NLM_F_APPEND))
+ nlflags |= NLM_F_APPEND;
ins = &fn->leaf;
- if (fn->fn_flags&RTN_TL_ROOT &&
- fn->leaf == &ip6_null_entry &&
- !(rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) ){
- fn->leaf = rt;
- rt->u.next = NULL;
- goto out;
- }
-
- for (iter = fn->leaf; iter; iter=iter->u.next) {
+ for (iter = leaf; iter;
+ iter = rcu_dereference_protected(iter->fib6_next,
+ lockdep_is_held(&rt->fib6_table->tb6_lock))) {
/*
* Search for duplicates
*/
- if (iter->rt6i_metric == rt->rt6i_metric) {
+ if (iter->fib6_metric == rt->fib6_metric) {
/*
* Same priority level
*/
+ if (info->nlh &&
+ (info->nlh->nlmsg_flags & NLM_F_EXCL))
+ return -EEXIST;
- if (iter->rt6i_dev == rt->rt6i_dev &&
- iter->rt6i_idev == rt->rt6i_idev &&
- ipv6_addr_equal(&iter->rt6i_gateway,
- &rt->rt6i_gateway)) {
- if (!(iter->rt6i_flags&RTF_EXPIRES))
+ nlflags &= ~NLM_F_EXCL;
+ if (replace) {
+ if (rt_can_ecmp == rt6_qualify_for_ecmp(iter)) {
+ found++;
+ break;
+ }
+ fallback_ins = fallback_ins ?: ins;
+ goto next_iter;
+ }
+
+ if (rt6_duplicate_nexthop(iter, rt)) {
+ if (rt->fib6_nsiblings)
+ WRITE_ONCE(rt->fib6_nsiblings, 0);
+ if (!(iter->fib6_flags & RTF_EXPIRES))
return -EEXIST;
- iter->rt6i_expires = rt->rt6i_expires;
- if (!(rt->rt6i_flags&RTF_EXPIRES)) {
- iter->rt6i_flags &= ~RTF_EXPIRES;
- iter->rt6i_expires = 0;
+ if (!(rt->fib6_flags & RTF_EXPIRES)) {
+ fib6_clean_expires(iter);
+ fib6_remove_gc_list(iter);
+ } else {
+ fib6_set_expires(iter, rt->expires);
+ fib6_add_gc_list(iter);
+ }
+ if (!(rt->fib6_flags & (RTF_ADDRCONF | RTF_PREFIX_RT))) {
+ iter->fib6_flags &= ~RTF_ADDRCONF;
+ iter->fib6_flags &= ~RTF_PREFIX_RT;
}
+
+ if (rt->fib6_pmtu)
+ fib6_metric_set(iter, RTAX_MTU,
+ rt->fib6_pmtu);
return -EEXIST;
}
+ /* If we have the same destination and the same metric,
+ * but not the same gateway, then the route we try to
+ * add is sibling to this route, increment our counter
+ * of siblings, and later we will add our route to the
+ * list.
+ * Only static routes (which don't have flag
+ * RTF_EXPIRES) are used for ECMPv6.
+ *
+ * To avoid long list, we only had siblings if the
+ * route have a gateway.
+ */
+ if (rt_can_ecmp &&
+ rt6_qualify_for_ecmp(iter))
+ WRITE_ONCE(rt->fib6_nsiblings,
+ rt->fib6_nsiblings + 1);
}
- if (iter->rt6i_metric > rt->rt6i_metric)
+ if (iter->fib6_metric > rt->fib6_metric)
break;
- ins = &iter->u.next;
+next_iter:
+ ins = &iter->fib6_next;
+ }
+
+ if (fallback_ins && !found) {
+ /* No matching route with same ecmp-able-ness found, replace
+ * first matching route
+ */
+ ins = fallback_ins;
+ iter = rcu_dereference_protected(*ins,
+ lockdep_is_held(&rt->fib6_table->tb6_lock));
+ found++;
+ }
+
+ /* Reset round-robin state, if necessary */
+ if (ins == &fn->leaf)
+ fn->rr_ptr = NULL;
+
+ /* Link this route to others same route. */
+ if (rt->fib6_nsiblings) {
+ unsigned int fib6_nsiblings;
+ struct fib6_info *sibling, *temp_sibling;
+
+ /* Find the first route that have the same metric */
+ sibling = leaf;
+ notify_sibling_rt = true;
+ while (sibling) {
+ if (sibling->fib6_metric == rt->fib6_metric &&
+ rt6_qualify_for_ecmp(sibling)) {
+ list_add_tail_rcu(&rt->fib6_siblings,
+ &sibling->fib6_siblings);
+ break;
+ }
+ sibling = rcu_dereference_protected(sibling->fib6_next,
+ lockdep_is_held(&rt->fib6_table->tb6_lock));
+ notify_sibling_rt = false;
+ }
+ /* For each sibling in the list, increment the counter of
+ * siblings. BUG() if counters does not match, list of siblings
+ * is broken!
+ */
+ fib6_nsiblings = 0;
+ list_for_each_entry_safe(sibling, temp_sibling,
+ &rt->fib6_siblings, fib6_siblings) {
+ WRITE_ONCE(sibling->fib6_nsiblings,
+ sibling->fib6_nsiblings + 1);
+ BUG_ON(sibling->fib6_nsiblings != rt->fib6_nsiblings);
+ fib6_nsiblings++;
+ }
+ BUG_ON(fib6_nsiblings != rt->fib6_nsiblings);
+ rcu_read_lock();
+ rt6_multipath_rebalance(temp_sibling);
+ rcu_read_unlock();
}
/*
* insert node
*/
+ if (!replace) {
+ if (!add)
+ pr_warn("NLM_F_CREATE should be set when creating new route\n");
-out:
- rt->u.next = iter;
- *ins = rt;
- rt->rt6i_node = fn;
- atomic_inc(&rt->rt6i_ref);
- inet6_rt_notify(RTM_NEWROUTE, rt, info);
- rt6_stats.fib_rt_entries++;
-
- if ((fn->fn_flags & RTN_RTINFO) == 0) {
- rt6_stats.fib_route_nodes++;
- fn->fn_flags |= RTN_RTINFO;
+add:
+ nlflags |= NLM_F_CREATE;
+
+ /* The route should only be notified if it is the first
+ * route in the node or if it is added as a sibling
+ * route to the first route in the node.
+ */
+ if (!info->skip_notify_kernel &&
+ (notify_sibling_rt || ins == &fn->leaf)) {
+ enum fib_event_type fib_event;
+
+ if (notify_sibling_rt)
+ fib_event = FIB_EVENT_ENTRY_APPEND;
+ else
+ fib_event = FIB_EVENT_ENTRY_REPLACE;
+ err = call_fib6_entry_notifiers(info->nl_net,
+ fib_event, rt,
+ extack);
+ if (err) {
+ struct fib6_info *sibling, *next_sibling;
+
+ /* If the route has siblings, then it first
+ * needs to be unlinked from them.
+ */
+ if (!rt->fib6_nsiblings)
+ return err;
+
+ list_for_each_entry_safe(sibling, next_sibling,
+ &rt->fib6_siblings,
+ fib6_siblings)
+ WRITE_ONCE(sibling->fib6_nsiblings,
+ sibling->fib6_nsiblings - 1);
+ WRITE_ONCE(rt->fib6_nsiblings, 0);
+ list_del_rcu(&rt->fib6_siblings);
+ rcu_read_lock();
+ rt6_multipath_rebalance(next_sibling);
+ rcu_read_unlock();
+ return err;
+ }
+ }
+
+ rcu_assign_pointer(rt->fib6_next, iter);
+ fib6_info_hold(rt);
+ rcu_assign_pointer(rt->fib6_node, fn);
+ rcu_assign_pointer(*ins, rt);
+ if (!info->skip_notify)
+ inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
+ info->nl_net->ipv6.rt6_stats->fib_rt_entries++;
+
+ if (!(fn->fn_flags & RTN_RTINFO)) {
+ info->nl_net->ipv6.rt6_stats->fib_route_nodes++;
+ fn->fn_flags |= RTN_RTINFO;
+ }
+
+ } else {
+ int nsiblings;
+
+ if (!found) {
+ if (add)
+ goto add;
+ pr_warn("NLM_F_REPLACE set, but no existing node found!\n");
+ return -ENOENT;
+ }
+
+ if (!info->skip_notify_kernel && ins == &fn->leaf) {
+ err = call_fib6_entry_notifiers(info->nl_net,
+ FIB_EVENT_ENTRY_REPLACE,
+ rt, extack);
+ if (err)
+ return err;
+ }
+
+ fib6_info_hold(rt);
+ rcu_assign_pointer(rt->fib6_node, fn);
+ rt->fib6_next = iter->fib6_next;
+ rcu_assign_pointer(*ins, rt);
+ if (!info->skip_notify)
+ inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE);
+ if (!(fn->fn_flags & RTN_RTINFO)) {
+ info->nl_net->ipv6.rt6_stats->fib_route_nodes++;
+ fn->fn_flags |= RTN_RTINFO;
+ }
+ nsiblings = iter->fib6_nsiblings;
+ iter->fib6_node = NULL;
+ list_add(&iter->purge_link, purge_list);
+ if (rcu_access_pointer(fn->rr_ptr) == iter)
+ fn->rr_ptr = NULL;
+
+ if (nsiblings) {
+ /* Replacing an ECMP route, remove all siblings */
+ ins = &rt->fib6_next;
+ iter = rcu_dereference_protected(*ins,
+ lockdep_is_held(&rt->fib6_table->tb6_lock));
+ while (iter) {
+ if (iter->fib6_metric > rt->fib6_metric)
+ break;
+ if (rt6_qualify_for_ecmp(iter)) {
+ *ins = iter->fib6_next;
+ iter->fib6_node = NULL;
+ list_add(&iter->purge_link, purge_list);
+ if (rcu_access_pointer(fn->rr_ptr) == iter)
+ fn->rr_ptr = NULL;
+ nsiblings--;
+ info->nl_net->ipv6.rt6_stats->fib_rt_entries--;
+ } else {
+ ins = &iter->fib6_next;
+ }
+ iter = rcu_dereference_protected(*ins,
+ lockdep_is_held(&rt->fib6_table->tb6_lock));
+ }
+ WARN_ON(nsiblings != 0);
+ }
}
return 0;
}
-static __inline__ void fib6_start_gc(struct rt6_info *rt)
+static int fib6_add_rt2node_nh(struct fib6_node *fn, struct fib6_info *rt,
+ struct nl_info *info, struct netlink_ext_ack *extack,
+ struct list_head *purge_list)
+{
+ int err;
+
+ spin_lock(&rt->nh->lock);
+
+ if (rt->nh->dead) {
+ NL_SET_ERR_MSG(extack, "Nexthop has been deleted");
+ err = -EINVAL;
+ } else {
+ err = fib6_add_rt2node(fn, rt, info, extack, purge_list);
+ if (!err)
+ list_add(&rt->nh_list, &rt->nh->f6i_list);
+ }
+
+ spin_unlock(&rt->nh->lock);
+
+ return err;
+}
+
+static void fib6_start_gc(struct net *net, struct fib6_info *rt)
{
- if (ip6_fib_timer.expires == 0 &&
- (rt->rt6i_flags & (RTF_EXPIRES|RTF_CACHE)))
- mod_timer(&ip6_fib_timer, jiffies + ip6_rt_gc_interval);
+ if (!timer_pending(&net->ipv6.ip6_fib_timer) &&
+ (rt->fib6_flags & RTF_EXPIRES))
+ mod_timer(&net->ipv6.ip6_fib_timer,
+ jiffies + net->ipv6.sysctl.ip6_rt_gc_interval);
}
-void fib6_force_start_gc(void)
+void fib6_force_start_gc(struct net *net)
{
- if (ip6_fib_timer.expires == 0)
- mod_timer(&ip6_fib_timer, jiffies + ip6_rt_gc_interval);
+ if (!timer_pending(&net->ipv6.ip6_fib_timer))
+ mod_timer(&net->ipv6.ip6_fib_timer,
+ jiffies + net->ipv6.sysctl.ip6_rt_gc_interval);
+}
+
+static void __fib6_update_sernum_upto_root(struct fib6_info *rt,
+ int sernum)
+{
+ struct fib6_node *fn = rcu_dereference_protected(rt->fib6_node,
+ lockdep_is_held(&rt->fib6_table->tb6_lock));
+
+ /* paired with smp_rmb() in fib6_get_cookie_safe() */
+ smp_wmb();
+ while (fn) {
+ WRITE_ONCE(fn->fn_sernum, sernum);
+ fn = rcu_dereference_protected(fn->parent,
+ lockdep_is_held(&rt->fib6_table->tb6_lock));
+ }
+}
+
+void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt)
+{
+ __fib6_update_sernum_upto_root(rt, fib6_new_sernum(net));
+}
+
+/* allow ipv4 to update sernum via ipv6_stub */
+void fib6_update_sernum_stub(struct net *net, struct fib6_info *f6i)
+{
+ spin_lock_bh(&f6i->fib6_table->tb6_lock);
+ fib6_update_sernum_upto_root(net, f6i);
+ spin_unlock_bh(&f6i->fib6_table->tb6_lock);
}
/*
* Add routing information to the routing tree.
* <destination addr>/<source addr>
* with source addr info in sub-trees
+ * Need to own table->tb6_lock
*/
-int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info)
+int fib6_add(struct fib6_node *root, struct fib6_info *rt,
+ struct nl_info *info, struct netlink_ext_ack *extack)
{
- struct fib6_node *fn, *pn = NULL;
+ struct fib6_table *table = rt->fib6_table;
+ LIST_HEAD(purge_list);
+ struct fib6_node *fn;
+#ifdef CONFIG_IPV6_SUBTREES
+ struct fib6_node *pn = NULL;
+#endif
int err = -ENOMEM;
-
- fn = fib6_add_1(root, &rt->rt6i_dst.addr, sizeof(struct in6_addr),
- rt->rt6i_dst.plen, offsetof(struct rt6_info, rt6i_dst));
-
- if (fn == NULL)
+ int allow_create = 1;
+ int replace_required = 0;
+
+ if (info->nlh) {
+ if (!(info->nlh->nlmsg_flags & NLM_F_CREATE))
+ allow_create = 0;
+ if (info->nlh->nlmsg_flags & NLM_F_REPLACE)
+ replace_required = 1;
+ }
+ if (!allow_create && !replace_required)
+ pr_warn("RTM_NEWROUTE with no NLM_F_CREATE or NLM_F_REPLACE\n");
+
+ fn = fib6_add_1(info->nl_net, table, root,
+ &rt->fib6_dst.addr, rt->fib6_dst.plen,
+ offsetof(struct fib6_info, fib6_dst), allow_create,
+ replace_required, extack);
+ if (IS_ERR(fn)) {
+ err = PTR_ERR(fn);
+ fn = NULL;
goto out;
+ }
+#ifdef CONFIG_IPV6_SUBTREES
pn = fn;
-#ifdef CONFIG_IPV6_SUBTREES
- if (rt->rt6i_src.plen) {
+ if (rt->fib6_src.plen) {
struct fib6_node *sn;
- if (fn->subtree == NULL) {
+ if (!rcu_access_pointer(fn->subtree)) {
struct fib6_node *sfn;
/*
@@ -729,56 +1471,80 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info)
*/
/* Create subtree root node */
- sfn = node_alloc();
- if (sfn == NULL)
- goto st_failure;
+ sfn = node_alloc(info->nl_net);
+ if (!sfn)
+ goto failure;
- sfn->leaf = &ip6_null_entry;
- atomic_inc(&ip6_null_entry.rt6i_ref);
+ fib6_info_hold(info->nl_net->ipv6.fib6_null_entry);
+ rcu_assign_pointer(sfn->leaf,
+ info->nl_net->ipv6.fib6_null_entry);
sfn->fn_flags = RTN_ROOT;
- sfn->fn_sernum = fib6_new_sernum();
/* Now add the first leaf node to new subtree */
- sn = fib6_add_1(sfn, &rt->rt6i_src.addr,
- sizeof(struct in6_addr), rt->rt6i_src.plen,
- offsetof(struct rt6_info, rt6i_src));
+ sn = fib6_add_1(info->nl_net, table, sfn,
+ &rt->fib6_src.addr, rt->fib6_src.plen,
+ offsetof(struct fib6_info, fib6_src),
+ allow_create, replace_required, extack);
- if (sn == NULL) {
+ if (IS_ERR(sn)) {
/* If it is failed, discard just allocated
- root, and then (in st_failure) stale node
+ root, and then (in failure) stale node
in main tree.
*/
- node_free(sfn);
- goto st_failure;
+ node_free_immediate(info->nl_net, sfn);
+ err = PTR_ERR(sn);
+ goto failure;
}
/* Now link new subtree to main tree */
- sfn->parent = fn;
- fn->subtree = sfn;
+ rcu_assign_pointer(sfn->parent, fn);
+ rcu_assign_pointer(fn->subtree, sfn);
} else {
- sn = fib6_add_1(fn->subtree, &rt->rt6i_src.addr,
- sizeof(struct in6_addr), rt->rt6i_src.plen,
- offsetof(struct rt6_info, rt6i_src));
-
- if (sn == NULL)
- goto st_failure;
+ sn = fib6_add_1(info->nl_net, table, FIB6_SUBTREE(fn),
+ &rt->fib6_src.addr, rt->fib6_src.plen,
+ offsetof(struct fib6_info, fib6_src),
+ allow_create, replace_required, extack);
+
+ if (IS_ERR(sn)) {
+ err = PTR_ERR(sn);
+ goto failure;
+ }
}
- if (fn->leaf == NULL) {
- fn->leaf = rt;
- atomic_inc(&rt->rt6i_ref);
+ if (!rcu_access_pointer(fn->leaf)) {
+ if (fn->fn_flags & RTN_TL_ROOT) {
+ /* put back null_entry for root node */
+ rcu_assign_pointer(fn->leaf,
+ info->nl_net->ipv6.fib6_null_entry);
+ } else {
+ fib6_info_hold(rt);
+ rcu_assign_pointer(fn->leaf, rt);
+ }
}
fn = sn;
}
#endif
- err = fib6_add_rt2node(fn, rt, info);
+ if (rt->nh)
+ err = fib6_add_rt2node_nh(fn, rt, info, extack, &purge_list);
+ else
+ err = fib6_add_rt2node(fn, rt, info, extack, &purge_list);
+ if (!err) {
+ struct fib6_info *iter, *next;
+
+ list_for_each_entry_safe(iter, next, &purge_list, purge_link) {
+ list_del(&iter->purge_link);
+ fib6_purge_rt(iter, fn, info->nl_net);
+ fib6_info_release(iter);
+ }
+
+ __fib6_update_sernum_upto_root(rt, fib6_new_sernum(info->nl_net));
- if (err == 0) {
- fib6_start_gc(rt);
- if (!(rt->rt6i_flags&RTF_CACHE))
- fib6_prune_clones(pn, rt);
+ if (rt->fib6_flags & RTF_EXPIRES)
+ fib6_add_gc_list(rt);
+
+ fib6_start_gc(info->nl_net, rt);
}
out:
@@ -788,31 +1554,46 @@ out:
* If fib6_add_1 has cleared the old leaf pointer in the
* super-tree leaf node we have to find a new one for it.
*/
- if (pn != fn && !pn->leaf && !(pn->fn_flags & RTN_RTINFO)) {
- pn->leaf = fib6_find_prefix(pn);
-#if RT6_DEBUG >= 2
- if (!pn->leaf) {
- BUG_TRAP(pn->leaf != NULL);
- pn->leaf = &ip6_null_entry;
+ if (pn != fn) {
+ struct fib6_info *pn_leaf =
+ rcu_dereference_protected(pn->leaf,
+ lockdep_is_held(&table->tb6_lock));
+ if (pn_leaf == rt) {
+ pn_leaf = NULL;
+ RCU_INIT_POINTER(pn->leaf, NULL);
+ fib6_info_release(rt);
+ }
+ if (!pn_leaf && !(pn->fn_flags & RTN_RTINFO)) {
+ pn_leaf = fib6_find_prefix(info->nl_net, table,
+ pn);
+ if (!pn_leaf)
+ pn_leaf =
+ info->nl_net->ipv6.fib6_null_entry;
+ fib6_info_hold(pn_leaf);
+ rcu_assign_pointer(pn->leaf, pn_leaf);
}
-#endif
- atomic_inc(&pn->leaf->rt6i_ref);
}
#endif
- dst_free(&rt->u.dst);
+ goto failure;
+ } else if (fib6_requires_src(rt)) {
+ fib6_routes_require_src_inc(info->nl_net);
}
return err;
-#ifdef CONFIG_IPV6_SUBTREES
- /* Subtree creation failed, probably main tree node
- is orphan. If it is, shoot it.
+failure:
+ /* fn->leaf could be NULL and fib6_repair_tree() needs to be called if:
+ * 1. fn is an intermediate node and we failed to add the new
+ * route to it in both subtree creation failure and fib6_add_rt2node()
+ * failure case.
+ * 2. fn is the root node in the table and we fail to add the first
+ * default route to it.
*/
-st_failure:
- if (fn && !(fn->fn_flags & (RTN_RTINFO|RTN_ROOT)))
- fib6_repair_tree(fn);
- dst_free(&rt->u.dst);
+ if (fn &&
+ (!(fn->fn_flags & (RTN_RTINFO|RTN_ROOT)) ||
+ (fn->fn_flags & RTN_TL_ROOT &&
+ !rcu_access_pointer(fn->leaf))))
+ fib6_repair_tree(info->nl_net, table, fn);
return err;
-#endif
}
/*
@@ -821,12 +1602,12 @@ st_failure:
*/
struct lookup_args {
- int offset; /* key offset on rt6_info */
- struct in6_addr *addr; /* search key */
+ int offset; /* key offset on fib6_info */
+ const struct in6_addr *addr; /* search key */
};
-static struct fib6_node * fib6_lookup_1(struct fib6_node *root,
- struct lookup_args *args)
+static struct fib6_node *fib6_node_lookup_1(struct fib6_node *root,
+ struct lookup_args *args)
{
struct fib6_node *fn;
__be32 dir;
@@ -845,54 +1626,68 @@ static struct fib6_node * fib6_lookup_1(struct fib6_node *root,
dir = addr_bit_set(args->addr, fn->fn_bit);
- next = dir ? fn->right : fn->left;
+ next = dir ? rcu_dereference(fn->right) :
+ rcu_dereference(fn->left);
if (next) {
fn = next;
continue;
}
-
break;
}
- while(fn) {
- if (FIB6_SUBTREE(fn) || fn->fn_flags & RTN_RTINFO) {
+ while (fn) {
+ struct fib6_node *subtree = FIB6_SUBTREE(fn);
+
+ if (subtree || fn->fn_flags & RTN_RTINFO) {
+ struct fib6_info *leaf = rcu_dereference(fn->leaf);
struct rt6key *key;
- key = (struct rt6key *) ((u8 *) fn->leaf +
- args->offset);
+ if (!leaf)
+ goto backtrack;
+
+ key = (struct rt6key *) ((u8 *)leaf + args->offset);
if (ipv6_prefix_equal(&key->addr, args->addr, key->plen)) {
#ifdef CONFIG_IPV6_SUBTREES
- if (fn->subtree)
- fn = fib6_lookup_1(fn->subtree, args + 1);
+ if (subtree) {
+ struct fib6_node *sfn;
+ sfn = fib6_node_lookup_1(subtree,
+ args + 1);
+ if (!sfn)
+ goto backtrack;
+ fn = sfn;
+ }
#endif
- if (!fn || fn->fn_flags & RTN_RTINFO)
+ if (fn->fn_flags & RTN_RTINFO)
return fn;
}
}
-
+backtrack:
if (fn->fn_flags & RTN_ROOT)
break;
- fn = fn->parent;
+ fn = rcu_dereference(fn->parent);
}
return NULL;
}
-struct fib6_node * fib6_lookup(struct fib6_node *root, struct in6_addr *daddr,
- struct in6_addr *saddr)
+/* called with rcu_read_lock() held
+ */
+struct fib6_node *fib6_node_lookup(struct fib6_node *root,
+ const struct in6_addr *daddr,
+ const struct in6_addr *saddr)
{
struct fib6_node *fn;
struct lookup_args args[] = {
{
- .offset = offsetof(struct rt6_info, rt6i_dst),
+ .offset = offsetof(struct fib6_info, fib6_dst),
.addr = daddr,
},
#ifdef CONFIG_IPV6_SUBTREES
{
- .offset = offsetof(struct rt6_info, rt6i_src),
+ .offset = offsetof(struct fib6_info, fib6_src),
.addr = saddr,
},
#endif
@@ -901,9 +1696,8 @@ struct fib6_node * fib6_lookup(struct fib6_node *root, struct in6_addr *daddr,
}
};
- fn = fib6_lookup_1(root, daddr ? args : args + 1);
-
- if (fn == NULL || fn->fn_flags & RTN_TL_ROOT)
+ fn = fib6_node_lookup_1(root, daddr ? args : args + 1);
+ if (!fn || fn->fn_flags & RTN_TL_ROOT)
fn = root;
return fn;
@@ -912,58 +1706,92 @@ struct fib6_node * fib6_lookup(struct fib6_node *root, struct in6_addr *daddr,
/*
* Get node with specified destination prefix (and source prefix,
* if subtrees are used)
+ * exact_match == true means we try to find fn with exact match of
+ * the passed in prefix addr
+ * exact_match == false means we try to find fn with longest prefix
+ * match of the passed in prefix addr. This is useful for finding fn
+ * for cached route as it will be stored in the exception table under
+ * the node with longest prefix length.
*/
-static struct fib6_node * fib6_locate_1(struct fib6_node *root,
- struct in6_addr *addr,
- int plen, int offset)
+static struct fib6_node *fib6_locate_1(struct fib6_node *root,
+ const struct in6_addr *addr,
+ int plen, int offset,
+ bool exact_match)
{
- struct fib6_node *fn;
+ struct fib6_node *fn, *prev = NULL;
for (fn = root; fn ; ) {
- struct rt6key *key = (struct rt6key *)((u8 *)fn->leaf + offset);
+ struct fib6_info *leaf = rcu_dereference(fn->leaf);
+ struct rt6key *key;
+
+ /* This node is being deleted */
+ if (!leaf) {
+ if (plen <= fn->fn_bit)
+ goto out;
+ else
+ goto next;
+ }
+
+ key = (struct rt6key *)((u8 *)leaf + offset);
/*
* Prefix match
*/
if (plen < fn->fn_bit ||
!ipv6_prefix_equal(&key->addr, addr, fn->fn_bit))
- return NULL;
+ goto out;
if (plen == fn->fn_bit)
return fn;
+ if (fn->fn_flags & RTN_RTINFO)
+ prev = fn;
+
+next:
/*
* We have more bits to go
*/
if (addr_bit_set(addr, fn->fn_bit))
- fn = fn->right;
+ fn = rcu_dereference(fn->right);
else
- fn = fn->left;
+ fn = rcu_dereference(fn->left);
}
- return NULL;
+out:
+ if (exact_match)
+ return NULL;
+ else
+ return prev;
}
-struct fib6_node * fib6_locate(struct fib6_node *root,
- struct in6_addr *daddr, int dst_len,
- struct in6_addr *saddr, int src_len)
+struct fib6_node *fib6_locate(struct fib6_node *root,
+ const struct in6_addr *daddr, int dst_len,
+ const struct in6_addr *saddr, int src_len,
+ bool exact_match)
{
struct fib6_node *fn;
fn = fib6_locate_1(root, daddr, dst_len,
- offsetof(struct rt6_info, rt6i_dst));
+ offsetof(struct fib6_info, fib6_dst),
+ exact_match);
#ifdef CONFIG_IPV6_SUBTREES
if (src_len) {
- BUG_TRAP(saddr!=NULL);
- if (fn && fn->subtree)
- fn = fib6_locate_1(fn->subtree, saddr, src_len,
- offsetof(struct rt6_info, rt6i_src));
+ WARN_ON(saddr == NULL);
+ if (fn) {
+ struct fib6_node *subtree = FIB6_SUBTREE(fn);
+
+ if (subtree) {
+ fn = fib6_locate_1(subtree, saddr, src_len,
+ offsetof(struct fib6_info, fib6_src),
+ exact_match);
+ }
+ }
}
#endif
- if (fn && fn->fn_flags&RTN_RTINFO)
+ if (fn && fn->fn_flags & RTN_RTINFO)
return fn;
return NULL;
@@ -975,17 +1803,26 @@ struct fib6_node * fib6_locate(struct fib6_node *root,
*
*/
-static struct rt6_info * fib6_find_prefix(struct fib6_node *fn)
+static struct fib6_info *fib6_find_prefix(struct net *net,
+ struct fib6_table *table,
+ struct fib6_node *fn)
{
- if (fn->fn_flags&RTN_ROOT)
- return &ip6_null_entry;
-
- while(fn) {
- if(fn->left)
- return fn->left->leaf;
-
- if(fn->right)
- return fn->right->leaf;
+ struct fib6_node *child_left, *child_right;
+
+ if (fn->fn_flags & RTN_ROOT)
+ return net->ipv6.fib6_null_entry;
+
+ while (fn) {
+ child_left = rcu_dereference_protected(fn->left,
+ lockdep_is_held(&table->tb6_lock));
+ child_right = rcu_dereference_protected(fn->right,
+ lockdep_is_held(&table->tb6_lock));
+ if (child_left)
+ return rcu_dereference_protected(child_left->leaf,
+ lockdep_is_held(&table->tb6_lock));
+ if (child_right)
+ return rcu_dereference_protected(child_right->leaf,
+ lockdep_is_held(&table->tb6_lock));
fn = FIB6_SUBTREE(fn);
}
@@ -995,206 +1832,266 @@ static struct rt6_info * fib6_find_prefix(struct fib6_node *fn)
/*
* Called to trim the tree of intermediate nodes when possible. "fn"
* is the node we want to try and remove.
+ * Need to own table->tb6_lock
*/
-static struct fib6_node * fib6_repair_tree(struct fib6_node *fn)
+static struct fib6_node *fib6_repair_tree(struct net *net,
+ struct fib6_table *table,
+ struct fib6_node *fn)
{
int children;
int nstate;
- struct fib6_node *child, *pn;
- struct fib6_walker_t *w;
+ struct fib6_node *child;
+ struct fib6_walker *w;
int iter = 0;
+ /* Set fn->leaf to null_entry for root node. */
+ if (fn->fn_flags & RTN_TL_ROOT) {
+ rcu_assign_pointer(fn->leaf, net->ipv6.fib6_null_entry);
+ return fn;
+ }
+
for (;;) {
- RT6_TRACE("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter);
+ struct fib6_node *fn_r = rcu_dereference_protected(fn->right,
+ lockdep_is_held(&table->tb6_lock));
+ struct fib6_node *fn_l = rcu_dereference_protected(fn->left,
+ lockdep_is_held(&table->tb6_lock));
+ struct fib6_node *pn = rcu_dereference_protected(fn->parent,
+ lockdep_is_held(&table->tb6_lock));
+ struct fib6_node *pn_r = rcu_dereference_protected(pn->right,
+ lockdep_is_held(&table->tb6_lock));
+ struct fib6_node *pn_l = rcu_dereference_protected(pn->left,
+ lockdep_is_held(&table->tb6_lock));
+ struct fib6_info *fn_leaf = rcu_dereference_protected(fn->leaf,
+ lockdep_is_held(&table->tb6_lock));
+ struct fib6_info *pn_leaf = rcu_dereference_protected(pn->leaf,
+ lockdep_is_held(&table->tb6_lock));
+ struct fib6_info *new_fn_leaf;
+
+ pr_debug("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter);
iter++;
- BUG_TRAP(!(fn->fn_flags&RTN_RTINFO));
- BUG_TRAP(!(fn->fn_flags&RTN_TL_ROOT));
- BUG_TRAP(fn->leaf==NULL);
+ WARN_ON(fn->fn_flags & RTN_RTINFO);
+ WARN_ON(fn->fn_flags & RTN_TL_ROOT);
+ WARN_ON(fn_leaf);
children = 0;
child = NULL;
- if (fn->right) child = fn->right, children |= 1;
- if (fn->left) child = fn->left, children |= 2;
+ if (fn_r) {
+ child = fn_r;
+ children |= 1;
+ }
+ if (fn_l) {
+ child = fn_l;
+ children |= 2;
+ }
if (children == 3 || FIB6_SUBTREE(fn)
#ifdef CONFIG_IPV6_SUBTREES
/* Subtree root (i.e. fn) may have one child */
- || (children && fn->fn_flags&RTN_ROOT)
+ || (children && fn->fn_flags & RTN_ROOT)
#endif
) {
- fn->leaf = fib6_find_prefix(fn);
+ new_fn_leaf = fib6_find_prefix(net, table, fn);
#if RT6_DEBUG >= 2
- if (fn->leaf==NULL) {
- BUG_TRAP(fn->leaf);
- fn->leaf = &ip6_null_entry;
+ if (!new_fn_leaf) {
+ WARN_ON(!new_fn_leaf);
+ new_fn_leaf = net->ipv6.fib6_null_entry;
}
#endif
- atomic_inc(&fn->leaf->rt6i_ref);
- return fn->parent;
+ fib6_info_hold(new_fn_leaf);
+ rcu_assign_pointer(fn->leaf, new_fn_leaf);
+ return pn;
}
- pn = fn->parent;
#ifdef CONFIG_IPV6_SUBTREES
if (FIB6_SUBTREE(pn) == fn) {
- BUG_TRAP(fn->fn_flags&RTN_ROOT);
- FIB6_SUBTREE(pn) = NULL;
+ WARN_ON(!(fn->fn_flags & RTN_ROOT));
+ RCU_INIT_POINTER(pn->subtree, NULL);
nstate = FWS_L;
} else {
- BUG_TRAP(!(fn->fn_flags&RTN_ROOT));
+ WARN_ON(fn->fn_flags & RTN_ROOT);
#endif
- if (pn->right == fn) pn->right = child;
- else if (pn->left == fn) pn->left = child;
+ if (pn_r == fn)
+ rcu_assign_pointer(pn->right, child);
+ else if (pn_l == fn)
+ rcu_assign_pointer(pn->left, child);
#if RT6_DEBUG >= 2
- else BUG_TRAP(0);
+ else
+ WARN_ON(1);
#endif
if (child)
- child->parent = pn;
+ rcu_assign_pointer(child->parent, pn);
nstate = FWS_R;
#ifdef CONFIG_IPV6_SUBTREES
}
#endif
- read_lock(&fib6_walker_lock);
- FOR_WALKERS(w) {
- if (child == NULL) {
- if (w->root == fn) {
- w->root = w->node = NULL;
- RT6_TRACE("W %p adjusted by delroot 1\n", w);
- } else if (w->node == fn) {
- RT6_TRACE("W %p adjusted by delnode 1, s=%d/%d\n", w, w->state, nstate);
+ read_lock(&net->ipv6.fib6_walker_lock);
+ FOR_WALKERS(net, w) {
+ if (!child) {
+ if (w->node == fn) {
+ pr_debug("W %p adjusted by delnode 1, s=%d/%d\n",
+ w, w->state, nstate);
w->node = pn;
w->state = nstate;
}
} else {
- if (w->root == fn) {
- w->root = child;
- RT6_TRACE("W %p adjusted by delroot 2\n", w);
- }
if (w->node == fn) {
w->node = child;
if (children&2) {
- RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state);
- w->state = w->state>=FWS_R ? FWS_U : FWS_INIT;
+ pr_debug("W %p adjusted by delnode 2, s=%d\n",
+ w, w->state);
+ w->state = w->state >= FWS_R ? FWS_U : FWS_INIT;
} else {
- RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state);
- w->state = w->state>=FWS_C ? FWS_U : FWS_INIT;
+ pr_debug("W %p adjusted by delnode 2, s=%d\n",
+ w, w->state);
+ w->state = w->state >= FWS_C ? FWS_U : FWS_INIT;
}
}
}
}
- read_unlock(&fib6_walker_lock);
+ read_unlock(&net->ipv6.fib6_walker_lock);
- node_free(fn);
- if (pn->fn_flags&RTN_RTINFO || FIB6_SUBTREE(pn))
+ node_free(net, fn);
+ if (pn->fn_flags & RTN_RTINFO || FIB6_SUBTREE(pn))
return pn;
- rt6_release(pn->leaf);
- pn->leaf = NULL;
+ RCU_INIT_POINTER(pn->leaf, NULL);
+ fib6_info_release(pn_leaf);
fn = pn;
}
}
-static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
- struct nl_info *info)
+static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
+ struct fib6_info __rcu **rtp, struct nl_info *info)
{
- struct fib6_walker_t *w;
- struct rt6_info *rt = *rtp;
-
- RT6_TRACE("fib6_del_route\n");
+ struct fib6_info *leaf, *replace_rt = NULL;
+ struct fib6_walker *w;
+ struct fib6_info *rt = rcu_dereference_protected(*rtp,
+ lockdep_is_held(&table->tb6_lock));
+ struct net *net = info->nl_net;
+ bool notify_del = false;
+
+ /* If the deleted route is the first in the node and it is not part of
+ * a multipath route, then we need to replace it with the next route
+ * in the node, if exists.
+ */
+ leaf = rcu_dereference_protected(fn->leaf,
+ lockdep_is_held(&table->tb6_lock));
+ if (leaf == rt && !rt->fib6_nsiblings) {
+ if (rcu_access_pointer(rt->fib6_next))
+ replace_rt = rcu_dereference_protected(rt->fib6_next,
+ lockdep_is_held(&table->tb6_lock));
+ else
+ notify_del = true;
+ }
/* Unlink it */
- *rtp = rt->u.next;
- rt->rt6i_node = NULL;
- rt6_stats.fib_rt_entries--;
- rt6_stats.fib_discarded_routes++;
+ *rtp = rt->fib6_next;
+ rt->fib6_node = NULL;
+ net->ipv6.rt6_stats->fib_rt_entries--;
+ net->ipv6.rt6_stats->fib_discarded_routes++;
+
+ /* Reset round-robin state, if necessary */
+ if (rcu_access_pointer(fn->rr_ptr) == rt)
+ fn->rr_ptr = NULL;
+
+ /* Remove this entry from other siblings */
+ if (rt->fib6_nsiblings) {
+ struct fib6_info *sibling, *next_sibling;
+
+ /* The route is deleted from a multipath route. If this
+ * multipath route is the first route in the node, then we need
+ * to emit a delete notification. Otherwise, we need to skip
+ * the notification.
+ */
+ if (rt->fib6_metric == leaf->fib6_metric &&
+ rt6_qualify_for_ecmp(leaf))
+ notify_del = true;
+ list_for_each_entry_safe(sibling, next_sibling,
+ &rt->fib6_siblings, fib6_siblings)
+ WRITE_ONCE(sibling->fib6_nsiblings,
+ sibling->fib6_nsiblings - 1);
+ WRITE_ONCE(rt->fib6_nsiblings, 0);
+ list_del_rcu(&rt->fib6_siblings);
+ rt6_multipath_rebalance(next_sibling);
+ }
/* Adjust walkers */
- read_lock(&fib6_walker_lock);
- FOR_WALKERS(w) {
+ read_lock(&net->ipv6.fib6_walker_lock);
+ FOR_WALKERS(net, w) {
if (w->state == FWS_C && w->leaf == rt) {
- RT6_TRACE("walker %p adjusted by delroute\n", w);
- w->leaf = rt->u.next;
- if (w->leaf == NULL)
+ pr_debug("walker %p adjusted by delroute\n", w);
+ w->leaf = rcu_dereference_protected(rt->fib6_next,
+ lockdep_is_held(&table->tb6_lock));
+ if (!w->leaf)
w->state = FWS_U;
}
}
- read_unlock(&fib6_walker_lock);
-
- rt->u.next = NULL;
+ read_unlock(&net->ipv6.fib6_walker_lock);
- if (fn->leaf == NULL && fn->fn_flags&RTN_TL_ROOT)
- fn->leaf = &ip6_null_entry;
-
- /* If it was last route, expunge its radix tree node */
- if (fn->leaf == NULL) {
- fn->fn_flags &= ~RTN_RTINFO;
- rt6_stats.fib_route_nodes--;
- fn = fib6_repair_tree(fn);
+ /* If it was last route, call fib6_repair_tree() to:
+ * 1. For root node, put back null_entry as how the table was created.
+ * 2. For other nodes, expunge its radix tree node.
+ */
+ if (!rcu_access_pointer(fn->leaf)) {
+ if (!(fn->fn_flags & RTN_TL_ROOT)) {
+ fn->fn_flags &= ~RTN_RTINFO;
+ net->ipv6.rt6_stats->fib_route_nodes--;
+ }
+ fn = fib6_repair_tree(net, table, fn);
}
- if (atomic_read(&rt->rt6i_ref) != 1) {
- /* This route is used as dummy address holder in some split
- * nodes. It is not leaked, but it still holds other resources,
- * which must be released in time. So, scan ascendant nodes
- * and replace dummy references to this route with references
- * to still alive ones.
- */
- while (fn) {
- if (!(fn->fn_flags&RTN_RTINFO) && fn->leaf == rt) {
- fn->leaf = fib6_find_prefix(fn);
- atomic_inc(&fn->leaf->rt6i_ref);
- rt6_release(rt);
- }
- fn = fn->parent;
- }
- /* No more references are possible at this point. */
- if (atomic_read(&rt->rt6i_ref) != 1) BUG();
+ fib6_purge_rt(rt, fn, net);
+
+ if (!info->skip_notify_kernel) {
+ if (notify_del)
+ call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL,
+ rt, NULL);
+ else if (replace_rt)
+ call_fib6_entry_notifiers_replace(net, replace_rt);
}
+ if (!info->skip_notify)
+ inet6_rt_notify(RTM_DELROUTE, rt, info, 0);
- inet6_rt_notify(RTM_DELROUTE, rt, info);
- rt6_release(rt);
+ fib6_info_release(rt);
}
-int fib6_del(struct rt6_info *rt, struct nl_info *info)
+/* Need to own table->tb6_lock */
+int fib6_del(struct fib6_info *rt, struct nl_info *info)
{
- struct fib6_node *fn = rt->rt6i_node;
- struct rt6_info **rtp;
+ struct net *net = info->nl_net;
+ struct fib6_info __rcu **rtp;
+ struct fib6_info __rcu **rtp_next;
+ struct fib6_table *table;
+ struct fib6_node *fn;
-#if RT6_DEBUG >= 2
- if (rt->u.dst.obsolete>0) {
- BUG_TRAP(fn==NULL);
- return -ENOENT;
- }
-#endif
- if (fn == NULL || rt == &ip6_null_entry)
+ if (rt == net->ipv6.fib6_null_entry)
return -ENOENT;
- BUG_TRAP(fn->fn_flags&RTN_RTINFO);
+ table = rt->fib6_table;
+ fn = rcu_dereference_protected(rt->fib6_node,
+ lockdep_is_held(&table->tb6_lock));
+ if (!fn)
+ return -ENOENT;
- if (!(rt->rt6i_flags&RTF_CACHE)) {
- struct fib6_node *pn = fn;
-#ifdef CONFIG_IPV6_SUBTREES
- /* clones of this route might be in another subtree */
- if (rt->rt6i_src.plen) {
- while (!(pn->fn_flags&RTN_ROOT))
- pn = pn->parent;
- pn = pn->parent;
- }
-#endif
- fib6_prune_clones(pn, rt);
- }
+ WARN_ON(!(fn->fn_flags & RTN_RTINFO));
/*
* Walk the leaf entries looking for ourself
*/
- for (rtp = &fn->leaf; *rtp; rtp = &(*rtp)->u.next) {
- if (*rtp == rt) {
- fib6_del_route(fn, rtp, info);
+ for (rtp = &fn->leaf; *rtp; rtp = rtp_next) {
+ struct fib6_info *cur = rcu_dereference_protected(*rtp,
+ lockdep_is_held(&table->tb6_lock));
+ if (rt == cur) {
+ if (fib6_requires_src(cur))
+ fib6_routes_require_src_dec(info->nl_net);
+ fib6_del_route(table, fn, rtp, info);
return 0;
}
+ rtp_next = &cur->fib6_next;
}
return -ENOENT;
}
@@ -1206,7 +2103,7 @@ int fib6_del(struct rt6_info *rt, struct nl_info *info)
* However, it is internally reenterable wrt itself and fib6_add/fib6_del.
* It means, that we can modify tree during walking
* and use this function for garbage collection, clone pruning,
- * cleaning tree when a device goes down etc. etc.
+ * cleaning tree when a device goes down etc. etc.
*
* It guarantees that every node will be traversed,
* and that it will be traversed only once.
@@ -1221,22 +2118,22 @@ int fib6_del(struct rt6_info *rt, struct nl_info *info)
* 0 -> walk is complete.
* >0 -> walk is incomplete (i.e. suspended)
* <0 -> walk is terminated by an error.
+ *
+ * This function is called with tb6_lock held.
*/
-static int fib6_walk_continue(struct fib6_walker_t *w)
+static int fib6_walk_continue(struct fib6_walker *w)
{
- struct fib6_node *fn, *pn;
+ struct fib6_node *fn, *pn, *left, *right;
+
+ /* w->root should always be table->tb6_root */
+ WARN_ON_ONCE(!(w->root->fn_flags & RTN_TL_ROOT));
for (;;) {
fn = w->node;
- if (fn == NULL)
+ if (!fn)
return 0;
- if (w->prune && fn != w->root &&
- fn->fn_flags&RTN_RTINFO && w->state < FWS_C) {
- w->state = FWS_C;
- w->leaf = fn->leaf;
- }
switch (w->state) {
#ifdef CONFIG_IPV6_SUBTREES
case FWS_S:
@@ -1245,92 +2142,133 @@ static int fib6_walk_continue(struct fib6_walker_t *w)
continue;
}
w->state = FWS_L;
-#endif
+ fallthrough;
+#endif
case FWS_L:
- if (fn->left) {
- w->node = fn->left;
+ left = rcu_dereference_protected(fn->left, 1);
+ if (left) {
+ w->node = left;
w->state = FWS_INIT;
continue;
}
w->state = FWS_R;
+ fallthrough;
case FWS_R:
- if (fn->right) {
- w->node = fn->right;
+ right = rcu_dereference_protected(fn->right, 1);
+ if (right) {
+ w->node = right;
w->state = FWS_INIT;
continue;
}
w->state = FWS_C;
- w->leaf = fn->leaf;
+ w->leaf = rcu_dereference_protected(fn->leaf, 1);
+ fallthrough;
case FWS_C:
- if (w->leaf && fn->fn_flags&RTN_RTINFO) {
- int err = w->func(w);
+ if (w->leaf && fn->fn_flags & RTN_RTINFO) {
+ int err;
+
+ if (w->skip) {
+ w->skip--;
+ goto skip;
+ }
+
+ err = w->func(w);
if (err)
return err;
+
+ w->count++;
continue;
}
+skip:
w->state = FWS_U;
+ fallthrough;
case FWS_U:
if (fn == w->root)
return 0;
- pn = fn->parent;
+ pn = rcu_dereference_protected(fn->parent, 1);
+ left = rcu_dereference_protected(pn->left, 1);
+ right = rcu_dereference_protected(pn->right, 1);
w->node = pn;
#ifdef CONFIG_IPV6_SUBTREES
if (FIB6_SUBTREE(pn) == fn) {
- BUG_TRAP(fn->fn_flags&RTN_ROOT);
+ WARN_ON(!(fn->fn_flags & RTN_ROOT));
w->state = FWS_L;
continue;
}
#endif
- if (pn->left == fn) {
+ if (left == fn) {
w->state = FWS_R;
continue;
}
- if (pn->right == fn) {
+ if (right == fn) {
w->state = FWS_C;
- w->leaf = w->node->leaf;
+ w->leaf = rcu_dereference_protected(w->node->leaf, 1);
continue;
}
#if RT6_DEBUG >= 2
- BUG_TRAP(0);
+ WARN_ON(1);
#endif
}
}
}
-static int fib6_walk(struct fib6_walker_t *w)
+static int fib6_walk(struct net *net, struct fib6_walker *w)
{
int res;
w->state = FWS_INIT;
w->node = w->root;
- fib6_walker_link(w);
+ fib6_walker_link(net, w);
res = fib6_walk_continue(w);
if (res <= 0)
- fib6_walker_unlink(w);
+ fib6_walker_unlink(net, w);
return res;
}
-static int fib6_clean_node(struct fib6_walker_t *w)
+static int fib6_clean_node(struct fib6_walker *w)
{
int res;
- struct rt6_info *rt;
- struct fib6_cleaner_t *c = (struct fib6_cleaner_t*)w;
+ struct fib6_info *rt;
+ struct fib6_cleaner *c = container_of(w, struct fib6_cleaner, w);
+ struct nl_info info = {
+ .nl_net = c->net,
+ .skip_notify = c->skip_notify,
+ };
+
+ if (c->sernum != FIB6_NO_SERNUM_CHANGE &&
+ READ_ONCE(w->node->fn_sernum) != c->sernum)
+ WRITE_ONCE(w->node->fn_sernum, c->sernum);
+
+ if (!c->func) {
+ WARN_ON_ONCE(c->sernum == FIB6_NO_SERNUM_CHANGE);
+ w->leaf = NULL;
+ return 0;
+ }
- for (rt = w->leaf; rt; rt = rt->u.next) {
+ for_each_fib6_walker_rt(w) {
res = c->func(rt, c->arg);
- if (res < 0) {
+ if (res == -1) {
w->leaf = rt;
- res = fib6_del(rt, NULL);
+ res = fib6_del(rt, &info);
if (res) {
#if RT6_DEBUG >= 2
- printk(KERN_DEBUG "fib6_clean_node: del failed: rt=%p@%p err=%d\n", rt, rt->rt6i_node, res);
+ pr_debug("%s: del failed: rt=%p@%p err=%d\n",
+ __func__, rt,
+ rcu_access_pointer(rt->fib6_node),
+ res);
#endif
continue;
}
return 0;
+ } else if (res == -2) {
+ if (WARN_ON(!rt->fib6_nsiblings))
+ continue;
+ rt = list_last_entry(&rt->fib6_siblings,
+ struct fib6_info, fib6_siblings);
+ continue;
}
- BUG_TRAP(res==0);
+ WARN_ON(res != 0);
}
w->leaf = rt;
return 0;
@@ -1338,151 +2276,551 @@ static int fib6_clean_node(struct fib6_walker_t *w)
/*
* Convenient frontend to tree walker.
- *
+ *
* func is called on each route.
- * It may return -1 -> delete this route.
+ * It may return -2 -> skip multipath route.
+ * -1 -> delete this route.
* 0 -> continue walking
- *
- * prune==1 -> only immediate children of node (certainly,
- * ignoring pure split nodes) will be scanned.
*/
-static void fib6_clean_tree(struct fib6_node *root,
- int (*func)(struct rt6_info *, void *arg),
- int prune, void *arg)
+static void fib6_clean_tree(struct net *net, struct fib6_node *root,
+ int (*func)(struct fib6_info *, void *arg),
+ int sernum, void *arg, bool skip_notify)
{
- struct fib6_cleaner_t c;
+ struct fib6_cleaner c;
c.w.root = root;
c.w.func = fib6_clean_node;
- c.w.prune = prune;
+ c.w.count = 0;
+ c.w.skip = 0;
+ c.w.skip_in_node = 0;
c.func = func;
+ c.sernum = sernum;
c.arg = arg;
+ c.net = net;
+ c.skip_notify = skip_notify;
- fib6_walk(&c.w);
+ fib6_walk(net, &c.w);
}
-void fib6_clean_all(int (*func)(struct rt6_info *, void *arg),
- int prune, void *arg)
+static void __fib6_clean_all(struct net *net,
+ int (*func)(struct fib6_info *, void *),
+ int sernum, void *arg, bool skip_notify)
{
struct fib6_table *table;
- struct hlist_node *node;
+ struct hlist_head *head;
unsigned int h;
rcu_read_lock();
- for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
- hlist_for_each_entry_rcu(table, node, &fib_table_hash[h],
- tb6_hlist) {
- write_lock_bh(&table->tb6_lock);
- fib6_clean_tree(&table->tb6_root, func, prune, arg);
- write_unlock_bh(&table->tb6_lock);
+ for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
+ head = &net->ipv6.fib_table_hash[h];
+ hlist_for_each_entry_rcu(table, head, tb6_hlist) {
+ spin_lock_bh(&table->tb6_lock);
+ fib6_clean_tree(net, &table->tb6_root,
+ func, sernum, arg, skip_notify);
+ spin_unlock_bh(&table->tb6_lock);
}
}
rcu_read_unlock();
}
-static int fib6_prune_clone(struct rt6_info *rt, void *arg)
+void fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *),
+ void *arg)
{
- if (rt->rt6i_flags & RTF_CACHE) {
- RT6_TRACE("pruning clone %p\n", rt);
- return -1;
- }
+ __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg, false);
+}
- return 0;
+void fib6_clean_all_skip_notify(struct net *net,
+ int (*func)(struct fib6_info *, void *),
+ void *arg)
+{
+ __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg, true);
}
-static void fib6_prune_clones(struct fib6_node *fn, struct rt6_info *rt)
+static void fib6_flush_trees(struct net *net)
{
- fib6_clean_tree(fn, fib6_prune_clone, 1, rt);
+ int new_sernum = fib6_new_sernum(net);
+
+ __fib6_clean_all(net, NULL, new_sernum, NULL, false);
}
/*
* Garbage collection
*/
-static struct fib6_gc_args
-{
- int timeout;
- int more;
-} gc_args;
-
-static int fib6_age(struct rt6_info *rt, void *arg)
+static int fib6_age(struct fib6_info *rt, struct fib6_gc_args *gc_args)
{
unsigned long now = jiffies;
/*
* check addrconf expiration here.
* Routes are expired even if they are in use.
- *
- * Also age clones. Note, that clones are aged out
- * only if they are not in use now.
*/
- if (rt->rt6i_flags&RTF_EXPIRES && rt->rt6i_expires) {
- if (time_after(now, rt->rt6i_expires)) {
- RT6_TRACE("expiring %p\n", rt);
+ if (rt->fib6_flags & RTF_EXPIRES && rt->expires) {
+ if (time_after(now, rt->expires)) {
+ pr_debug("expiring %p\n", rt);
return -1;
}
- gc_args.more++;
- } else if (rt->rt6i_flags & RTF_CACHE) {
- if (atomic_read(&rt->u.dst.__refcnt) == 0 &&
- time_after_eq(now, rt->u.dst.lastuse + gc_args.timeout)) {
- RT6_TRACE("aging clone %p\n", rt);
- return -1;
- } else if ((rt->rt6i_flags & RTF_GATEWAY) &&
- (!(rt->rt6i_nexthop->flags & NTF_ROUTER))) {
- RT6_TRACE("purging route %p via non-router but gateway\n",
- rt);
- return -1;
- }
- gc_args.more++;
+ gc_args->more++;
}
+ /* Also age clones in the exception table.
+ * Note, that clones are aged out
+ * only if they are not in use now.
+ */
+ rt6_age_exceptions(rt, gc_args, now);
+
return 0;
}
-static DEFINE_SPINLOCK(fib6_gc_lock);
+static void fib6_gc_table(struct net *net,
+ struct fib6_table *tb6,
+ struct fib6_gc_args *gc_args)
+{
+ struct fib6_info *rt;
+ struct hlist_node *n;
+ struct nl_info info = {
+ .nl_net = net,
+ .skip_notify = false,
+ };
-void fib6_run_gc(unsigned long dummy)
+ hlist_for_each_entry_safe(rt, n, &tb6->tb6_gc_hlist, gc_link)
+ if (fib6_age(rt, gc_args) == -1)
+ fib6_del(rt, &info);
+}
+
+static void fib6_gc_all(struct net *net, struct fib6_gc_args *gc_args)
{
- if (dummy != ~0UL) {
- spin_lock_bh(&fib6_gc_lock);
- gc_args.timeout = dummy ? (int)dummy : ip6_rt_gc_interval;
- } else {
- local_bh_disable();
- if (!spin_trylock(&fib6_gc_lock)) {
- mod_timer(&ip6_fib_timer, jiffies + HZ);
- local_bh_enable();
- return;
+ struct fib6_table *table;
+ struct hlist_head *head;
+ unsigned int h;
+
+ rcu_read_lock();
+ for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
+ head = &net->ipv6.fib_table_hash[h];
+ hlist_for_each_entry_rcu(table, head, tb6_hlist) {
+ spin_lock_bh(&table->tb6_lock);
+
+ fib6_gc_table(net, table, gc_args);
+
+ spin_unlock_bh(&table->tb6_lock);
}
- gc_args.timeout = ip6_rt_gc_interval;
}
+ rcu_read_unlock();
+}
+
+void fib6_run_gc(unsigned long expires, struct net *net, bool force)
+{
+ struct fib6_gc_args gc_args;
+ unsigned long now;
+
+ if (force) {
+ spin_lock_bh(&net->ipv6.fib6_gc_lock);
+ } else if (!spin_trylock_bh(&net->ipv6.fib6_gc_lock)) {
+ mod_timer(&net->ipv6.ip6_fib_timer, jiffies + HZ);
+ return;
+ }
+ gc_args.timeout = expires ? (int)expires :
+ net->ipv6.sysctl.ip6_rt_gc_interval;
gc_args.more = 0;
- ndisc_dst_gc(&gc_args.more);
- fib6_clean_all(fib6_age, 0, NULL);
+ fib6_gc_all(net, &gc_args);
+ now = jiffies;
+ net->ipv6.ip6_rt_last_gc = now;
if (gc_args.more)
- mod_timer(&ip6_fib_timer, jiffies + ip6_rt_gc_interval);
- else {
- del_timer(&ip6_fib_timer);
- ip6_fib_timer.expires = 0;
+ mod_timer(&net->ipv6.ip6_fib_timer,
+ round_jiffies(now
+ + net->ipv6.sysctl.ip6_rt_gc_interval));
+ else
+ timer_delete(&net->ipv6.ip6_fib_timer);
+ spin_unlock_bh(&net->ipv6.fib6_gc_lock);
+}
+
+static void fib6_gc_timer_cb(struct timer_list *t)
+{
+ struct net *arg = timer_container_of(arg, t, ipv6.ip6_fib_timer);
+
+ fib6_run_gc(0, arg, true);
+}
+
+static int __net_init fib6_net_init(struct net *net)
+{
+ size_t size = sizeof(struct hlist_head) * FIB6_TABLE_HASHSZ;
+ int err;
+
+ err = fib6_notifier_init(net);
+ if (err)
+ return err;
+
+ /* Default to 3-tuple */
+ net->ipv6.sysctl.multipath_hash_fields =
+ FIB_MULTIPATH_HASH_FIELD_DEFAULT_MASK;
+
+ spin_lock_init(&net->ipv6.fib6_gc_lock);
+ rwlock_init(&net->ipv6.fib6_walker_lock);
+ INIT_LIST_HEAD(&net->ipv6.fib6_walkers);
+ timer_setup(&net->ipv6.ip6_fib_timer, fib6_gc_timer_cb, 0);
+
+ net->ipv6.rt6_stats = kzalloc(sizeof(*net->ipv6.rt6_stats), GFP_KERNEL);
+ if (!net->ipv6.rt6_stats)
+ goto out_notifier;
+
+ /* Avoid false sharing : Use at least a full cache line */
+ size = max_t(size_t, size, L1_CACHE_BYTES);
+
+ net->ipv6.fib_table_hash = kzalloc(size, GFP_KERNEL);
+ if (!net->ipv6.fib_table_hash)
+ goto out_rt6_stats;
+
+ spin_lock_init(&net->ipv6.fib_table_hash_lock);
+
+ net->ipv6.fib6_main_tbl = kzalloc(sizeof(*net->ipv6.fib6_main_tbl),
+ GFP_KERNEL);
+ if (!net->ipv6.fib6_main_tbl)
+ goto out_fib_table_hash;
+
+ net->ipv6.fib6_main_tbl->tb6_id = RT6_TABLE_MAIN;
+ rcu_assign_pointer(net->ipv6.fib6_main_tbl->tb6_root.leaf,
+ net->ipv6.fib6_null_entry);
+ net->ipv6.fib6_main_tbl->tb6_root.fn_flags =
+ RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
+ inet_peer_base_init(&net->ipv6.fib6_main_tbl->tb6_peers);
+ INIT_HLIST_HEAD(&net->ipv6.fib6_main_tbl->tb6_gc_hlist);
+
+#ifdef CONFIG_IPV6_MULTIPLE_TABLES
+ net->ipv6.fib6_local_tbl = kzalloc(sizeof(*net->ipv6.fib6_local_tbl),
+ GFP_KERNEL);
+ if (!net->ipv6.fib6_local_tbl)
+ goto out_fib6_main_tbl;
+ net->ipv6.fib6_local_tbl->tb6_id = RT6_TABLE_LOCAL;
+ rcu_assign_pointer(net->ipv6.fib6_local_tbl->tb6_root.leaf,
+ net->ipv6.fib6_null_entry);
+ net->ipv6.fib6_local_tbl->tb6_root.fn_flags =
+ RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
+ inet_peer_base_init(&net->ipv6.fib6_local_tbl->tb6_peers);
+ INIT_HLIST_HEAD(&net->ipv6.fib6_local_tbl->tb6_gc_hlist);
+#endif
+ fib6_tables_init(net);
+
+ return 0;
+
+#ifdef CONFIG_IPV6_MULTIPLE_TABLES
+out_fib6_main_tbl:
+ kfree(net->ipv6.fib6_main_tbl);
+#endif
+out_fib_table_hash:
+ kfree(net->ipv6.fib_table_hash);
+out_rt6_stats:
+ kfree(net->ipv6.rt6_stats);
+out_notifier:
+ fib6_notifier_exit(net);
+ return -ENOMEM;
+}
+
+static void fib6_net_exit(struct net *net)
+{
+ unsigned int i;
+
+ timer_delete_sync(&net->ipv6.ip6_fib_timer);
+
+ for (i = 0; i < FIB6_TABLE_HASHSZ; i++) {
+ struct hlist_head *head = &net->ipv6.fib_table_hash[i];
+ struct hlist_node *tmp;
+ struct fib6_table *tb;
+
+ hlist_for_each_entry_safe(tb, tmp, head, tb6_hlist) {
+ hlist_del(&tb->tb6_hlist);
+ fib6_free_table(tb);
+ }
}
- spin_unlock_bh(&fib6_gc_lock);
+
+ kfree(net->ipv6.fib_table_hash);
+ kfree(net->ipv6.rt6_stats);
+ fib6_notifier_exit(net);
}
-void __init fib6_init(void)
+static struct pernet_operations fib6_net_ops = {
+ .init = fib6_net_init,
+ .exit = fib6_net_exit,
+};
+
+static const struct rtnl_msg_handler fib6_rtnl_msg_handlers[] __initconst_or_module = {
+ {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETROUTE,
+ .dumpit = inet6_dump_fib,
+ .flags = RTNL_FLAG_DUMP_UNLOCKED | RTNL_FLAG_DUMP_SPLIT_NLM_DONE},
+};
+
+int __init fib6_init(void)
{
- fib6_node_kmem = kmem_cache_create("fib6_nodes",
- sizeof(struct fib6_node),
- 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
- NULL, NULL);
+ int ret = -ENOMEM;
+
+ fib6_node_kmem = KMEM_CACHE(fib6_node,
+ SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT);
+ if (!fib6_node_kmem)
+ goto out;
+
+ ret = register_pernet_subsys(&fib6_net_ops);
+ if (ret)
+ goto out_kmem_cache_create;
- fib6_tables_init();
+ ret = rtnl_register_many(fib6_rtnl_msg_handlers);
+ if (ret)
+ goto out_unregister_subsys;
+
+ __fib6_flush_trees = fib6_flush_trees;
+out:
+ return ret;
+
+out_unregister_subsys:
+ unregister_pernet_subsys(&fib6_net_ops);
+out_kmem_cache_create:
+ kmem_cache_destroy(fib6_node_kmem);
+ goto out;
}
void fib6_gc_cleanup(void)
{
- del_timer(&ip6_fib_timer);
+ unregister_pernet_subsys(&fib6_net_ops);
kmem_cache_destroy(fib6_node_kmem);
}
+
+#ifdef CONFIG_PROC_FS
+static int ipv6_route_native_seq_show(struct seq_file *seq, void *v)
+{
+ struct fib6_info *rt = v;
+ struct ipv6_route_iter *iter = seq->private;
+ struct fib6_nh *fib6_nh = rt->fib6_nh;
+ unsigned int flags = rt->fib6_flags;
+ const struct net_device *dev;
+
+ if (rt->nh)
+ fib6_nh = nexthop_fib6_nh(rt->nh);
+
+ seq_printf(seq, "%pi6 %02x ", &rt->fib6_dst.addr, rt->fib6_dst.plen);
+
+#ifdef CONFIG_IPV6_SUBTREES
+ seq_printf(seq, "%pi6 %02x ", &rt->fib6_src.addr, rt->fib6_src.plen);
+#else
+ seq_puts(seq, "00000000000000000000000000000000 00 ");
+#endif
+ if (fib6_nh->fib_nh_gw_family) {
+ flags |= RTF_GATEWAY;
+ seq_printf(seq, "%pi6", &fib6_nh->fib_nh_gw6);
+ } else {
+ seq_puts(seq, "00000000000000000000000000000000");
+ }
+
+ dev = fib6_nh->fib_nh_dev;
+ seq_printf(seq, " %08x %08x %08x %08x %8s\n",
+ rt->fib6_metric, refcount_read(&rt->fib6_ref), 0,
+ flags, dev ? dev->name : "");
+ iter->w.leaf = NULL;
+ return 0;
+}
+
+static int ipv6_route_yield(struct fib6_walker *w)
+{
+ struct ipv6_route_iter *iter = w->args;
+
+ if (!iter->skip)
+ return 1;
+
+ do {
+ iter->w.leaf = rcu_dereference_protected(
+ iter->w.leaf->fib6_next,
+ lockdep_is_held(&iter->tbl->tb6_lock));
+ iter->skip--;
+ if (!iter->skip && iter->w.leaf)
+ return 1;
+ } while (iter->w.leaf);
+
+ return 0;
+}
+
+static void ipv6_route_seq_setup_walk(struct ipv6_route_iter *iter,
+ struct net *net)
+{
+ memset(&iter->w, 0, sizeof(iter->w));
+ iter->w.func = ipv6_route_yield;
+ iter->w.root = &iter->tbl->tb6_root;
+ iter->w.state = FWS_INIT;
+ iter->w.node = iter->w.root;
+ iter->w.args = iter;
+ iter->sernum = READ_ONCE(iter->w.root->fn_sernum);
+ INIT_LIST_HEAD(&iter->w.lh);
+ fib6_walker_link(net, &iter->w);
+}
+
+static struct fib6_table *ipv6_route_seq_next_table(struct fib6_table *tbl,
+ struct net *net)
+{
+ unsigned int h;
+ struct hlist_node *node;
+
+ if (tbl) {
+ h = (tbl->tb6_id & (FIB6_TABLE_HASHSZ - 1)) + 1;
+ node = rcu_dereference(hlist_next_rcu(&tbl->tb6_hlist));
+ } else {
+ h = 0;
+ node = NULL;
+ }
+
+ while (!node && h < FIB6_TABLE_HASHSZ) {
+ node = rcu_dereference(
+ hlist_first_rcu(&net->ipv6.fib_table_hash[h++]));
+ }
+ return hlist_entry_safe(node, struct fib6_table, tb6_hlist);
+}
+
+static void ipv6_route_check_sernum(struct ipv6_route_iter *iter)
+{
+ int sernum = READ_ONCE(iter->w.root->fn_sernum);
+
+ if (iter->sernum != sernum) {
+ iter->sernum = sernum;
+ iter->w.state = FWS_INIT;
+ iter->w.node = iter->w.root;
+ WARN_ON(iter->w.skip);
+ iter->w.skip = iter->w.count;
+ }
+}
+
+static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ int r;
+ struct fib6_info *n;
+ struct net *net = seq_file_net(seq);
+ struct ipv6_route_iter *iter = seq->private;
+
+ ++(*pos);
+ if (!v)
+ goto iter_table;
+
+ n = rcu_dereference(((struct fib6_info *)v)->fib6_next);
+ if (n)
+ return n;
+
+iter_table:
+ ipv6_route_check_sernum(iter);
+ spin_lock_bh(&iter->tbl->tb6_lock);
+ r = fib6_walk_continue(&iter->w);
+ spin_unlock_bh(&iter->tbl->tb6_lock);
+ if (r > 0) {
+ return iter->w.leaf;
+ } else if (r < 0) {
+ fib6_walker_unlink(net, &iter->w);
+ return NULL;
+ }
+ fib6_walker_unlink(net, &iter->w);
+
+ iter->tbl = ipv6_route_seq_next_table(iter->tbl, net);
+ if (!iter->tbl)
+ return NULL;
+
+ ipv6_route_seq_setup_walk(iter, net);
+ goto iter_table;
+}
+
+static void *ipv6_route_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(RCU)
+{
+ struct net *net = seq_file_net(seq);
+ struct ipv6_route_iter *iter = seq->private;
+
+ rcu_read_lock();
+ iter->tbl = ipv6_route_seq_next_table(NULL, net);
+ iter->skip = *pos;
+
+ if (iter->tbl) {
+ loff_t p = 0;
+
+ ipv6_route_seq_setup_walk(iter, net);
+ return ipv6_route_seq_next(seq, NULL, &p);
+ } else {
+ return NULL;
+ }
+}
+
+static bool ipv6_route_iter_active(struct ipv6_route_iter *iter)
+{
+ struct fib6_walker *w = &iter->w;
+ return w->node && !(w->state == FWS_U && w->node == w->root);
+}
+
+static void ipv6_route_native_seq_stop(struct seq_file *seq, void *v)
+ __releases(RCU)
+{
+ struct net *net = seq_file_net(seq);
+ struct ipv6_route_iter *iter = seq->private;
+
+ if (ipv6_route_iter_active(iter))
+ fib6_walker_unlink(net, &iter->w);
+
+ rcu_read_unlock();
+}
+
+#if IS_BUILTIN(CONFIG_IPV6) && defined(CONFIG_BPF_SYSCALL)
+static int ipv6_route_prog_seq_show(struct bpf_prog *prog,
+ struct bpf_iter_meta *meta,
+ void *v)
+{
+ struct bpf_iter__ipv6_route ctx;
+
+ ctx.meta = meta;
+ ctx.rt = v;
+ return bpf_iter_run_prog(prog, &ctx);
+}
+
+static int ipv6_route_seq_show(struct seq_file *seq, void *v)
+{
+ struct ipv6_route_iter *iter = seq->private;
+ struct bpf_iter_meta meta;
+ struct bpf_prog *prog;
+ int ret;
+
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, false);
+ if (!prog)
+ return ipv6_route_native_seq_show(seq, v);
+
+ ret = ipv6_route_prog_seq_show(prog, &meta, v);
+ iter->w.leaf = NULL;
+
+ return ret;
+}
+
+static void ipv6_route_seq_stop(struct seq_file *seq, void *v)
+{
+ struct bpf_iter_meta meta;
+ struct bpf_prog *prog;
+
+ if (!v) {
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, true);
+ if (prog)
+ (void)ipv6_route_prog_seq_show(prog, &meta, v);
+ }
+
+ ipv6_route_native_seq_stop(seq, v);
+}
+#else
+static int ipv6_route_seq_show(struct seq_file *seq, void *v)
+{
+ return ipv6_route_native_seq_show(seq, v);
+}
+
+static void ipv6_route_seq_stop(struct seq_file *seq, void *v)
+{
+ ipv6_route_native_seq_stop(seq, v);
+}
+#endif
+
+const struct seq_operations ipv6_route_seq_ops = {
+ .start = ipv6_route_seq_start,
+ .next = ipv6_route_seq_next,
+ .stop = ipv6_route_seq_stop,
+ .show = ipv6_route_seq_show
+};
+#endif /* CONFIG_PROC_FS */
diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c
index 624fae251f4e..60d0be47a9f3 100644
--- a/net/ipv6/ip6_flowlabel.c
+++ b/net/ipv6/ip6_flowlabel.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* ip6_flowlabel.c IPv6 flowlabel manager.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*/
@@ -15,29 +11,27 @@
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
-#include <linux/if_arp.h>
#include <linux/in6.h>
-#include <linux/route.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/pid_namespace.h>
+#include <linux/jump_label_ratelimit.h>
+#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/ipv6.h>
-#include <net/ndisc.h>
-#include <net/protocol.h>
-#include <net/ip6_route.h>
-#include <net/addrconf.h>
#include <net/rawv6.h>
-#include <net/icmp.h>
#include <net/transp_v6.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#define FL_MIN_LINGER 6 /* Minimal linger. It is set to 6sec specified
in old IPv6 RFC. Well, it was reasonable value.
*/
-#define FL_MAX_LINGER 60 /* Maximal linger timeout */
+#define FL_MAX_LINGER 150 /* Maximal linger timeout */
/* FL hash table */
@@ -47,54 +41,91 @@
#define FL_HASH(l) (ntohl(l)&FL_HASH_MASK)
static atomic_t fl_size = ATOMIC_INIT(0);
-static struct ip6_flowlabel *fl_ht[FL_HASH_MASK+1];
+static struct ip6_flowlabel __rcu *fl_ht[FL_HASH_MASK+1];
-static void ip6_fl_gc(unsigned long dummy);
-static DEFINE_TIMER(ip6_fl_gc_timer, ip6_fl_gc, 0, 0);
+static void ip6_fl_gc(struct timer_list *unused);
+static DEFINE_TIMER(ip6_fl_gc_timer, ip6_fl_gc);
/* FL hash table lock: it protects only of GC */
-static DEFINE_RWLOCK(ip6_fl_lock);
+static DEFINE_SPINLOCK(ip6_fl_lock);
/* Big socket sock */
-static DEFINE_RWLOCK(ip6_sk_fl_lock);
+static DEFINE_SPINLOCK(ip6_sk_fl_lock);
+
+DEFINE_STATIC_KEY_DEFERRED_FALSE(ipv6_flowlabel_exclusive, HZ);
+EXPORT_SYMBOL(ipv6_flowlabel_exclusive);
+
+#define for_each_fl_rcu(hash, fl) \
+ for (fl = rcu_dereference(fl_ht[(hash)]); \
+ fl != NULL; \
+ fl = rcu_dereference(fl->next))
+#define for_each_fl_continue_rcu(fl) \
+ for (fl = rcu_dereference(fl->next); \
+ fl != NULL; \
+ fl = rcu_dereference(fl->next))
+#define for_each_sk_fl_rcu(sk, sfl) \
+ for (sfl = rcu_dereference(inet_sk(sk)->ipv6_fl_list); \
+ sfl != NULL; \
+ sfl = rcu_dereference(sfl->next))
-static __inline__ struct ip6_flowlabel * __fl_lookup(__be32 label)
+static inline struct ip6_flowlabel *__fl_lookup(struct net *net, __be32 label)
{
struct ip6_flowlabel *fl;
- for (fl=fl_ht[FL_HASH(label)]; fl; fl = fl->next) {
- if (fl->label == label)
+ for_each_fl_rcu(FL_HASH(label), fl) {
+ if (fl->label == label && net_eq(fl->fl_net, net))
return fl;
}
return NULL;
}
-static struct ip6_flowlabel * fl_lookup(__be32 label)
+static struct ip6_flowlabel *fl_lookup(struct net *net, __be32 label)
{
struct ip6_flowlabel *fl;
- read_lock_bh(&ip6_fl_lock);
- fl = __fl_lookup(label);
- if (fl)
- atomic_inc(&fl->users);
- read_unlock_bh(&ip6_fl_lock);
+ rcu_read_lock();
+ fl = __fl_lookup(net, label);
+ if (fl && !atomic_inc_not_zero(&fl->users))
+ fl = NULL;
+ rcu_read_unlock();
return fl;
}
+static bool fl_shared_exclusive(struct ip6_flowlabel *fl)
+{
+ return fl->share == IPV6_FL_S_EXCL ||
+ fl->share == IPV6_FL_S_PROCESS ||
+ fl->share == IPV6_FL_S_USER;
+}
-static void fl_free(struct ip6_flowlabel *fl)
+static void fl_free_rcu(struct rcu_head *head)
{
- if (fl)
- kfree(fl->opt);
+ struct ip6_flowlabel *fl = container_of(head, struct ip6_flowlabel, rcu);
+
+ if (fl->share == IPV6_FL_S_PROCESS)
+ put_pid(fl->owner.pid);
+ kfree(fl->opt);
kfree(fl);
}
+
+static void fl_free(struct ip6_flowlabel *fl)
+{
+ if (!fl)
+ return;
+
+ if (fl_shared_exclusive(fl) || fl->opt)
+ static_branch_slow_dec_deferred(&ipv6_flowlabel_exclusive);
+
+ call_rcu(&fl->rcu, fl_free_rcu);
+}
+
static void fl_release(struct ip6_flowlabel *fl)
{
- write_lock_bh(&ip6_fl_lock);
+ spin_lock_bh(&ip6_fl_lock);
fl->lastuse = jiffies;
if (atomic_dec_and_test(&fl->users)) {
@@ -111,22 +142,24 @@ static void fl_release(struct ip6_flowlabel *fl)
time_after(ip6_fl_gc_timer.expires, ttd))
mod_timer(&ip6_fl_gc_timer, ttd);
}
-
- write_unlock_bh(&ip6_fl_lock);
+ spin_unlock_bh(&ip6_fl_lock);
}
-static void ip6_fl_gc(unsigned long dummy)
+static void ip6_fl_gc(struct timer_list *unused)
{
int i;
unsigned long now = jiffies;
unsigned long sched = 0;
- write_lock(&ip6_fl_lock);
+ spin_lock(&ip6_fl_lock);
+
+ for (i = 0; i <= FL_HASH_MASK; i++) {
+ struct ip6_flowlabel *fl;
+ struct ip6_flowlabel __rcu **flp;
- for (i=0; i<=FL_HASH_MASK; i++) {
- struct ip6_flowlabel *fl, **flp;
flp = &fl_ht[i];
- while ((fl=*flp) != NULL) {
+ while ((fl = rcu_dereference_protected(*flp,
+ lockdep_is_held(&ip6_fl_lock))) != NULL) {
if (atomic_read(&fl->users) == 0) {
unsigned long ttd = fl->lastuse + fl->linger;
if (time_after(ttd, fl->expires))
@@ -147,71 +180,126 @@ static void ip6_fl_gc(unsigned long dummy)
if (!sched && atomic_read(&fl_size))
sched = now + FL_MAX_LINGER;
if (sched) {
- ip6_fl_gc_timer.expires = sched;
- add_timer(&ip6_fl_gc_timer);
+ mod_timer(&ip6_fl_gc_timer, sched);
}
- write_unlock(&ip6_fl_lock);
+ spin_unlock(&ip6_fl_lock);
}
-static int fl_intern(struct ip6_flowlabel *fl, __be32 label)
+static void __net_exit ip6_fl_purge(struct net *net)
{
+ int i;
+
+ spin_lock_bh(&ip6_fl_lock);
+ for (i = 0; i <= FL_HASH_MASK; i++) {
+ struct ip6_flowlabel *fl;
+ struct ip6_flowlabel __rcu **flp;
+
+ flp = &fl_ht[i];
+ while ((fl = rcu_dereference_protected(*flp,
+ lockdep_is_held(&ip6_fl_lock))) != NULL) {
+ if (net_eq(fl->fl_net, net) &&
+ atomic_read(&fl->users) == 0) {
+ *flp = fl->next;
+ fl_free(fl);
+ atomic_dec(&fl_size);
+ continue;
+ }
+ flp = &fl->next;
+ }
+ }
+ spin_unlock_bh(&ip6_fl_lock);
+}
+
+static struct ip6_flowlabel *fl_intern(struct net *net,
+ struct ip6_flowlabel *fl, __be32 label)
+{
+ struct ip6_flowlabel *lfl;
+
fl->label = label & IPV6_FLOWLABEL_MASK;
- write_lock_bh(&ip6_fl_lock);
+ rcu_read_lock();
+ spin_lock_bh(&ip6_fl_lock);
if (label == 0) {
for (;;) {
- fl->label = htonl(net_random())&IPV6_FLOWLABEL_MASK;
+ fl->label = htonl(get_random_u32())&IPV6_FLOWLABEL_MASK;
if (fl->label) {
- struct ip6_flowlabel *lfl;
- lfl = __fl_lookup(fl->label);
- if (lfl == NULL)
+ lfl = __fl_lookup(net, fl->label);
+ if (!lfl)
break;
}
}
+ } else {
+ /*
+ * we dropper the ip6_fl_lock, so this entry could reappear
+ * and we need to recheck with it.
+ *
+ * OTOH no need to search the active socket first, like it is
+ * done in ipv6_flowlabel_opt - sock is locked, so new entry
+ * with the same label can only appear on another sock
+ */
+ lfl = __fl_lookup(net, fl->label);
+ if (lfl) {
+ atomic_inc(&lfl->users);
+ spin_unlock_bh(&ip6_fl_lock);
+ rcu_read_unlock();
+ return lfl;
+ }
}
fl->lastuse = jiffies;
fl->next = fl_ht[FL_HASH(fl->label)];
- fl_ht[FL_HASH(fl->label)] = fl;
+ rcu_assign_pointer(fl_ht[FL_HASH(fl->label)], fl);
atomic_inc(&fl_size);
- write_unlock_bh(&ip6_fl_lock);
- return 0;
+ spin_unlock_bh(&ip6_fl_lock);
+ rcu_read_unlock();
+ return NULL;
}
/* Socket flowlabel lists */
-struct ip6_flowlabel * fl6_sock_lookup(struct sock *sk, __be32 label)
+struct ip6_flowlabel *__fl6_sock_lookup(struct sock *sk, __be32 label)
{
struct ipv6_fl_socklist *sfl;
- struct ipv6_pinfo *np = inet6_sk(sk);
label &= IPV6_FLOWLABEL_MASK;
- for (sfl=np->ipv6_fl_list; sfl; sfl = sfl->next) {
+ rcu_read_lock();
+ for_each_sk_fl_rcu(sk, sfl) {
struct ip6_flowlabel *fl = sfl->fl;
- if (fl->label == label) {
+
+ if (fl->label == label && atomic_inc_not_zero(&fl->users)) {
fl->lastuse = jiffies;
- atomic_inc(&fl->users);
+ rcu_read_unlock();
return fl;
}
}
+ rcu_read_unlock();
return NULL;
}
-
-EXPORT_SYMBOL_GPL(fl6_sock_lookup);
+EXPORT_SYMBOL_GPL(__fl6_sock_lookup);
void fl6_free_socklist(struct sock *sk)
{
- struct ipv6_pinfo *np = inet6_sk(sk);
+ struct inet_sock *inet = inet_sk(sk);
struct ipv6_fl_socklist *sfl;
- while ((sfl = np->ipv6_fl_list) != NULL) {
- np->ipv6_fl_list = sfl->next;
+ if (!rcu_access_pointer(inet->ipv6_fl_list))
+ return;
+
+ spin_lock_bh(&ip6_sk_fl_lock);
+ while ((sfl = rcu_dereference_protected(inet->ipv6_fl_list,
+ lockdep_is_held(&ip6_sk_fl_lock))) != NULL) {
+ inet->ipv6_fl_list = sfl->next;
+ spin_unlock_bh(&ip6_sk_fl_lock);
+
fl_release(sfl->fl);
- kfree(sfl);
+ kfree_rcu(sfl, rcu);
+
+ spin_lock_bh(&ip6_sk_fl_lock);
}
+ spin_unlock_bh(&ip6_sk_fl_lock);
}
/* Service routines */
@@ -223,16 +311,16 @@ void fl6_free_socklist(struct sock *sk)
following rthdr.
*/
-struct ipv6_txoptions *fl6_merge_options(struct ipv6_txoptions * opt_space,
- struct ip6_flowlabel * fl,
- struct ipv6_txoptions * fopt)
+struct ipv6_txoptions *fl6_merge_options(struct ipv6_txoptions *opt_space,
+ struct ip6_flowlabel *fl,
+ struct ipv6_txoptions *fopt)
{
- struct ipv6_txoptions * fl_opt = fl->opt;
-
- if (fopt == NULL || fopt->opt_flen == 0)
+ struct ipv6_txoptions *fl_opt = fl->opt;
+
+ if (!fopt || fopt->opt_flen == 0)
return fl_opt;
-
- if (fl_opt != NULL) {
+
+ if (fl_opt) {
opt_space->hopopt = fl_opt->hopopt;
opt_space->dst0opt = fl_opt->dst0opt;
opt_space->srcrt = fl_opt->srcrt;
@@ -247,8 +335,10 @@ struct ipv6_txoptions *fl6_merge_options(struct ipv6_txoptions * opt_space,
}
opt_space->dst1opt = fopt->dst1opt;
opt_space->opt_flen = fopt->opt_flen;
+ opt_space->tot_len = fopt->tot_len;
return opt_space;
}
+EXPORT_SYMBOL_GPL(fl6_merge_options);
static unsigned long check_linger(unsigned long ttl)
{
@@ -267,6 +357,8 @@ static int fl6_renew(struct ip6_flowlabel *fl, unsigned long linger, unsigned lo
expires = check_linger(expires);
if (!expires)
return -EPERM;
+
+ spin_lock_bh(&ip6_fl_lock);
fl->lastuse = jiffies;
if (time_before(fl->linger, linger))
fl->linger = linger;
@@ -274,44 +366,53 @@ static int fl6_renew(struct ip6_flowlabel *fl, unsigned long linger, unsigned lo
expires = fl->linger;
if (time_before(fl->expires, fl->lastuse + expires))
fl->expires = fl->lastuse + expires;
+ spin_unlock_bh(&ip6_fl_lock);
+
return 0;
}
static struct ip6_flowlabel *
-fl_create(struct in6_flowlabel_req *freq, char __user *optval, int optlen, int *err_p)
+fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq,
+ sockptr_t optval, int optlen, int *err_p)
{
- struct ip6_flowlabel *fl;
+ struct ip6_flowlabel *fl = NULL;
int olen;
int addr_type;
int err;
+ olen = optlen - CMSG_ALIGN(sizeof(*freq));
+ err = -EINVAL;
+ if (olen > 64 * 1024)
+ goto done;
+
err = -ENOMEM;
fl = kzalloc(sizeof(*fl), GFP_KERNEL);
- if (fl == NULL)
+ if (!fl)
goto done;
- olen = optlen - CMSG_ALIGN(sizeof(*freq));
if (olen > 0) {
struct msghdr msg;
- struct flowi flowi;
- int junk;
+ struct flowi6 flowi6;
+ struct ipcm6_cookie ipc6;
err = -ENOMEM;
fl->opt = kmalloc(sizeof(*fl->opt) + olen, GFP_KERNEL);
- if (fl->opt == NULL)
+ if (!fl->opt)
goto done;
memset(fl->opt, 0, sizeof(*fl->opt));
fl->opt->tot_len = sizeof(*fl->opt) + olen;
err = -EFAULT;
- if (copy_from_user(fl->opt+1, optval+CMSG_ALIGN(sizeof(*freq)), olen))
+ if (copy_from_sockptr_offset(fl->opt + 1, optval,
+ CMSG_ALIGN(sizeof(*freq)), olen))
goto done;
msg.msg_controllen = olen;
- msg.msg_control = (void*)(fl->opt+1);
- flowi.oif = 0;
+ msg.msg_control = (void *)(fl->opt+1);
+ memset(&flowi6, 0, sizeof(flowi6));
- err = datagram_send_ctl(&msg, &flowi, fl->opt, &junk, &junk);
+ ipc6.opt = fl->opt;
+ err = ip6_datagram_send_ctl(net, sk, &msg, &flowi6, &ipc6);
if (err)
goto done;
err = -EINVAL;
@@ -323,245 +424,330 @@ fl_create(struct in6_flowlabel_req *freq, char __user *optval, int optlen, int *
}
}
+ fl->fl_net = net;
fl->expires = jiffies;
err = fl6_renew(fl, freq->flr_linger, freq->flr_expires);
if (err)
goto done;
fl->share = freq->flr_share;
addr_type = ipv6_addr_type(&freq->flr_dst);
- if ((addr_type&IPV6_ADDR_MAPPED)
- || addr_type == IPV6_ADDR_ANY) {
+ if ((addr_type & IPV6_ADDR_MAPPED) ||
+ addr_type == IPV6_ADDR_ANY) {
err = -EINVAL;
goto done;
}
- ipv6_addr_copy(&fl->dst, &freq->flr_dst);
+ fl->dst = freq->flr_dst;
atomic_set(&fl->users, 1);
switch (fl->share) {
case IPV6_FL_S_EXCL:
case IPV6_FL_S_ANY:
break;
case IPV6_FL_S_PROCESS:
- fl->owner = current->pid;
+ fl->owner.pid = get_task_pid(current, PIDTYPE_PID);
break;
case IPV6_FL_S_USER:
- fl->owner = current->euid;
+ fl->owner.uid = current_euid();
break;
default:
err = -EINVAL;
goto done;
}
+ if (fl_shared_exclusive(fl) || fl->opt) {
+ WRITE_ONCE(sock_net(sk)->ipv6.flowlabel_has_excl, 1);
+ static_branch_deferred_inc(&ipv6_flowlabel_exclusive);
+ }
return fl;
done:
- fl_free(fl);
+ if (fl) {
+ kfree(fl->opt);
+ kfree(fl);
+ }
*err_p = err;
return NULL;
}
static int mem_check(struct sock *sk)
{
- struct ipv6_pinfo *np = inet6_sk(sk);
- struct ipv6_fl_socklist *sfl;
int room = FL_MAX_SIZE - atomic_read(&fl_size);
+ struct ipv6_fl_socklist *sfl;
int count = 0;
if (room > FL_MAX_SIZE - FL_MAX_PER_SOCK)
return 0;
- for (sfl = np->ipv6_fl_list; sfl; sfl = sfl->next)
+ rcu_read_lock();
+ for_each_sk_fl_rcu(sk, sfl)
count++;
+ rcu_read_unlock();
if (room <= 0 ||
((count >= FL_MAX_PER_SOCK ||
- (count > 0 && room < FL_MAX_SIZE/2) || room < FL_MAX_SIZE/4)
- && !capable(CAP_NET_ADMIN)))
+ (count > 0 && room < FL_MAX_SIZE/2) || room < FL_MAX_SIZE/4) &&
+ !capable(CAP_NET_ADMIN)))
return -ENOBUFS;
return 0;
}
-static int ipv6_hdr_cmp(struct ipv6_opt_hdr *h1, struct ipv6_opt_hdr *h2)
+static inline void fl_link(struct sock *sk, struct ipv6_fl_socklist *sfl,
+ struct ip6_flowlabel *fl)
{
- if (h1 == h2)
- return 0;
- if (h1 == NULL || h2 == NULL)
- return 1;
- if (h1->hdrlen != h2->hdrlen)
- return 1;
- return memcmp(h1+1, h2+1, ((h1->hdrlen+1)<<3) - sizeof(*h1));
+ struct inet_sock *inet = inet_sk(sk);
+
+ spin_lock_bh(&ip6_sk_fl_lock);
+ sfl->fl = fl;
+ sfl->next = inet->ipv6_fl_list;
+ rcu_assign_pointer(inet->ipv6_fl_list, sfl);
+ spin_unlock_bh(&ip6_sk_fl_lock);
}
-static int ipv6_opt_cmp(struct ipv6_txoptions *o1, struct ipv6_txoptions *o2)
+int ipv6_flowlabel_opt_get(struct sock *sk, struct in6_flowlabel_req *freq,
+ int flags)
{
- if (o1 == o2)
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct ipv6_fl_socklist *sfl;
+
+ if (flags & IPV6_FL_F_REMOTE) {
+ freq->flr_label = np->rcv_flowinfo & IPV6_FLOWLABEL_MASK;
return 0;
- if (o1 == NULL || o2 == NULL)
- return 1;
- if (o1->opt_nflen != o2->opt_nflen)
- return 1;
- if (ipv6_hdr_cmp(o1->hopopt, o2->hopopt))
- return 1;
- if (ipv6_hdr_cmp(o1->dst0opt, o2->dst0opt))
- return 1;
- if (ipv6_hdr_cmp((struct ipv6_opt_hdr *)o1->srcrt, (struct ipv6_opt_hdr *)o2->srcrt))
- return 1;
- return 0;
+ }
+
+ if (inet6_test_bit(REPFLOW, sk)) {
+ freq->flr_label = np->flow_label;
+ return 0;
+ }
+
+ rcu_read_lock();
+
+ for_each_sk_fl_rcu(sk, sfl) {
+ if (sfl->fl->label == (np->flow_label & IPV6_FLOWLABEL_MASK)) {
+ spin_lock_bh(&ip6_fl_lock);
+ freq->flr_label = sfl->fl->label;
+ freq->flr_dst = sfl->fl->dst;
+ freq->flr_share = sfl->fl->share;
+ freq->flr_expires = (sfl->fl->expires - jiffies) / HZ;
+ freq->flr_linger = sfl->fl->linger / HZ;
+
+ spin_unlock_bh(&ip6_fl_lock);
+ rcu_read_unlock();
+ return 0;
+ }
+ }
+ rcu_read_unlock();
+
+ return -ENOENT;
}
-int ipv6_flowlabel_opt(struct sock *sk, char __user *optval, int optlen)
+#define socklist_dereference(__sflp) \
+ rcu_dereference_protected(__sflp, lockdep_is_held(&ip6_sk_fl_lock))
+
+static int ipv6_flowlabel_put(struct sock *sk, struct in6_flowlabel_req *freq)
{
- int err;
struct ipv6_pinfo *np = inet6_sk(sk);
- struct in6_flowlabel_req freq;
- struct ipv6_fl_socklist *sfl1=NULL;
- struct ipv6_fl_socklist *sfl, **sflp;
- struct ip6_flowlabel *fl;
+ struct ipv6_fl_socklist __rcu **sflp;
+ struct ipv6_fl_socklist *sfl;
- if (optlen < sizeof(freq))
- return -EINVAL;
+ if (freq->flr_flags & IPV6_FL_F_REFLECT) {
+ if (sk->sk_protocol != IPPROTO_TCP)
+ return -ENOPROTOOPT;
+ if (!inet6_test_bit(REPFLOW, sk))
+ return -ESRCH;
+ np->flow_label = 0;
+ inet6_clear_bit(REPFLOW, sk);
+ return 0;
+ }
- if (copy_from_user(&freq, optval, sizeof(freq)))
- return -EFAULT;
+ spin_lock_bh(&ip6_sk_fl_lock);
+ for (sflp = &inet_sk(sk)->ipv6_fl_list;
+ (sfl = socklist_dereference(*sflp)) != NULL;
+ sflp = &sfl->next) {
+ if (sfl->fl->label == freq->flr_label)
+ goto found;
+ }
+ spin_unlock_bh(&ip6_sk_fl_lock);
+ return -ESRCH;
+found:
+ if (freq->flr_label == (np->flow_label & IPV6_FLOWLABEL_MASK))
+ np->flow_label &= ~IPV6_FLOWLABEL_MASK;
+ *sflp = sfl->next;
+ spin_unlock_bh(&ip6_sk_fl_lock);
+ fl_release(sfl->fl);
+ kfree_rcu(sfl, rcu);
+ return 0;
+}
- switch (freq.flr_action) {
- case IPV6_FL_A_PUT:
- write_lock_bh(&ip6_sk_fl_lock);
- for (sflp = &np->ipv6_fl_list; (sfl=*sflp)!=NULL; sflp = &sfl->next) {
- if (sfl->fl->label == freq.flr_label) {
- if (freq.flr_label == (np->flow_label&IPV6_FLOWLABEL_MASK))
- np->flow_label &= ~IPV6_FLOWLABEL_MASK;
- *sflp = sfl->next;
- write_unlock_bh(&ip6_sk_fl_lock);
- fl_release(sfl->fl);
- kfree(sfl);
- return 0;
- }
- }
- write_unlock_bh(&ip6_sk_fl_lock);
- return -ESRCH;
+static int ipv6_flowlabel_renew(struct sock *sk, struct in6_flowlabel_req *freq)
+{
+ struct net *net = sock_net(sk);
+ struct ipv6_fl_socklist *sfl;
+ int err;
- case IPV6_FL_A_RENEW:
- read_lock_bh(&ip6_sk_fl_lock);
- for (sfl = np->ipv6_fl_list; sfl; sfl = sfl->next) {
- if (sfl->fl->label == freq.flr_label) {
- err = fl6_renew(sfl->fl, freq.flr_linger, freq.flr_expires);
- read_unlock_bh(&ip6_sk_fl_lock);
- return err;
- }
- }
- read_unlock_bh(&ip6_sk_fl_lock);
-
- if (freq.flr_share == IPV6_FL_S_NONE && capable(CAP_NET_ADMIN)) {
- fl = fl_lookup(freq.flr_label);
- if (fl) {
- err = fl6_renew(fl, freq.flr_linger, freq.flr_expires);
- fl_release(fl);
- return err;
- }
+ rcu_read_lock();
+ for_each_sk_fl_rcu(sk, sfl) {
+ if (sfl->fl->label == freq->flr_label) {
+ err = fl6_renew(sfl->fl, freq->flr_linger,
+ freq->flr_expires);
+ rcu_read_unlock();
+ return err;
}
- return -ESRCH;
+ }
+ rcu_read_unlock();
- case IPV6_FL_A_GET:
- if (freq.flr_label & ~IPV6_FLOWLABEL_MASK)
- return -EINVAL;
+ if (freq->flr_share == IPV6_FL_S_NONE &&
+ ns_capable(net->user_ns, CAP_NET_ADMIN)) {
+ struct ip6_flowlabel *fl = fl_lookup(net, freq->flr_label);
- fl = fl_create(&freq, optval, optlen, &err);
- if (fl == NULL)
+ if (fl) {
+ err = fl6_renew(fl, freq->flr_linger,
+ freq->flr_expires);
+ fl_release(fl);
return err;
- sfl1 = kmalloc(sizeof(*sfl1), GFP_KERNEL);
+ }
+ }
+ return -ESRCH;
+}
- if (freq.flr_label) {
- struct ip6_flowlabel *fl1 = NULL;
+static int ipv6_flowlabel_get(struct sock *sk, struct in6_flowlabel_req *freq,
+ sockptr_t optval, int optlen)
+{
+ struct ipv6_fl_socklist *sfl, *sfl1 = NULL;
+ struct ip6_flowlabel *fl, *fl1 = NULL;
+ struct net *net = sock_net(sk);
+ int err;
- err = -EEXIST;
- read_lock_bh(&ip6_sk_fl_lock);
- for (sfl = np->ipv6_fl_list; sfl; sfl = sfl->next) {
- if (sfl->fl->label == freq.flr_label) {
- if (freq.flr_flags&IPV6_FL_F_EXCL) {
- read_unlock_bh(&ip6_sk_fl_lock);
- goto done;
- }
- fl1 = sfl->fl;
- atomic_inc(&fl1->users);
- break;
+ if (freq->flr_flags & IPV6_FL_F_REFLECT) {
+ if (net->ipv6.sysctl.flowlabel_consistency) {
+ net_info_ratelimited("Can not set IPV6_FL_F_REFLECT if flowlabel_consistency sysctl is enable\n");
+ return -EPERM;
+ }
+
+ if (sk->sk_protocol != IPPROTO_TCP)
+ return -ENOPROTOOPT;
+ inet6_set_bit(REPFLOW, sk);
+ return 0;
+ }
+
+ if (freq->flr_label & ~IPV6_FLOWLABEL_MASK)
+ return -EINVAL;
+ if (net->ipv6.sysctl.flowlabel_state_ranges &&
+ (freq->flr_label & IPV6_FLOWLABEL_STATELESS_FLAG))
+ return -ERANGE;
+
+ fl = fl_create(net, sk, freq, optval, optlen, &err);
+ if (!fl)
+ return err;
+
+ sfl1 = kmalloc(sizeof(*sfl1), GFP_KERNEL);
+
+ if (freq->flr_label) {
+ err = -EEXIST;
+ rcu_read_lock();
+ for_each_sk_fl_rcu(sk, sfl) {
+ if (sfl->fl->label == freq->flr_label) {
+ if (freq->flr_flags & IPV6_FL_F_EXCL) {
+ rcu_read_unlock();
+ goto done;
}
+ fl1 = sfl->fl;
+ if (!atomic_inc_not_zero(&fl1->users))
+ fl1 = NULL;
+ break;
}
- read_unlock_bh(&ip6_sk_fl_lock);
-
- if (fl1 == NULL)
- fl1 = fl_lookup(freq.flr_label);
- if (fl1) {
- err = -EEXIST;
- if (freq.flr_flags&IPV6_FL_F_EXCL)
- goto release;
- err = -EPERM;
- if (fl1->share == IPV6_FL_S_EXCL ||
- fl1->share != fl->share ||
- fl1->owner != fl->owner)
- goto release;
-
- err = -EINVAL;
- if (!ipv6_addr_equal(&fl1->dst, &fl->dst) ||
- ipv6_opt_cmp(fl1->opt, fl->opt))
- goto release;
-
- err = -ENOMEM;
- if (sfl1 == NULL)
- goto release;
- if (fl->linger > fl1->linger)
- fl1->linger = fl->linger;
- if ((long)(fl->expires - fl1->expires) > 0)
- fl1->expires = fl->expires;
- write_lock_bh(&ip6_sk_fl_lock);
- sfl1->fl = fl1;
- sfl1->next = np->ipv6_fl_list;
- np->ipv6_fl_list = sfl1;
- write_unlock_bh(&ip6_sk_fl_lock);
- fl_free(fl);
- return 0;
+ }
+ rcu_read_unlock();
+
+ if (!fl1)
+ fl1 = fl_lookup(net, freq->flr_label);
+ if (fl1) {
+recheck:
+ err = -EEXIST;
+ if (freq->flr_flags&IPV6_FL_F_EXCL)
+ goto release;
+ err = -EPERM;
+ if (fl1->share == IPV6_FL_S_EXCL ||
+ fl1->share != fl->share ||
+ ((fl1->share == IPV6_FL_S_PROCESS) &&
+ (fl1->owner.pid != fl->owner.pid)) ||
+ ((fl1->share == IPV6_FL_S_USER) &&
+ !uid_eq(fl1->owner.uid, fl->owner.uid)))
+ goto release;
+
+ err = -ENOMEM;
+ if (!sfl1)
+ goto release;
+ if (fl->linger > fl1->linger)
+ fl1->linger = fl->linger;
+ if ((long)(fl->expires - fl1->expires) > 0)
+ fl1->expires = fl->expires;
+ fl_link(sk, sfl1, fl1);
+ fl_free(fl);
+ return 0;
release:
- fl_release(fl1);
- goto done;
- }
- }
- err = -ENOENT;
- if (!(freq.flr_flags&IPV6_FL_F_CREATE))
+ fl_release(fl1);
goto done;
+ }
+ }
+ err = -ENOENT;
+ if (!(freq->flr_flags & IPV6_FL_F_CREATE))
+ goto done;
- err = -ENOMEM;
- if (sfl1 == NULL || (err = mem_check(sk)) != 0)
- goto done;
+ err = -ENOMEM;
+ if (!sfl1)
+ goto done;
- err = fl_intern(fl, freq.flr_label);
- if (err)
- goto done;
+ err = mem_check(sk);
+ if (err != 0)
+ goto done;
- if (!freq.flr_label) {
- if (copy_to_user(&((struct in6_flowlabel_req __user *) optval)->flr_label,
- &fl->label, sizeof(fl->label))) {
- /* Intentionally ignore fault. */
- }
- }
+ fl1 = fl_intern(net, fl, freq->flr_label);
+ if (fl1)
+ goto recheck;
- sfl1->fl = fl;
- sfl1->next = np->ipv6_fl_list;
- np->ipv6_fl_list = sfl1;
- return 0;
+ if (!freq->flr_label) {
+ size_t offset = offsetof(struct in6_flowlabel_req, flr_label);
- default:
- return -EINVAL;
+ if (copy_to_sockptr_offset(optval, offset, &fl->label,
+ sizeof(fl->label))) {
+ /* Intentionally ignore fault. */
+ }
}
+ fl_link(sk, sfl1, fl);
+ return 0;
done:
fl_free(fl);
kfree(sfl1);
return err;
}
+int ipv6_flowlabel_opt(struct sock *sk, sockptr_t optval, int optlen)
+{
+ struct in6_flowlabel_req freq;
+
+ if (optlen < sizeof(freq))
+ return -EINVAL;
+ if (copy_from_sockptr(&freq, optval, sizeof(freq)))
+ return -EFAULT;
+
+ switch (freq.flr_action) {
+ case IPV6_FL_A_PUT:
+ return ipv6_flowlabel_put(sk, &freq);
+ case IPV6_FL_A_RENEW:
+ return ipv6_flowlabel_renew(sk, &freq);
+ case IPV6_FL_A_GET:
+ return ipv6_flowlabel_get(sk, &freq, optval, optlen);
+ default:
+ return -EINVAL;
+ }
+}
+
#ifdef CONFIG_PROC_FS
struct ip6fl_iter_state {
+ struct seq_net_private p;
+ struct pid_namespace *pid_ns;
int bucket;
};
@@ -571,27 +757,40 @@ static struct ip6_flowlabel *ip6fl_get_first(struct seq_file *seq)
{
struct ip6_flowlabel *fl = NULL;
struct ip6fl_iter_state *state = ip6fl_seq_private(seq);
+ struct net *net = seq_file_net(seq);
for (state->bucket = 0; state->bucket <= FL_HASH_MASK; ++state->bucket) {
- if (fl_ht[state->bucket]) {
- fl = fl_ht[state->bucket];
- break;
+ for_each_fl_rcu(state->bucket, fl) {
+ if (net_eq(fl->fl_net, net))
+ goto out;
}
}
+ fl = NULL;
+out:
return fl;
}
static struct ip6_flowlabel *ip6fl_get_next(struct seq_file *seq, struct ip6_flowlabel *fl)
{
struct ip6fl_iter_state *state = ip6fl_seq_private(seq);
+ struct net *net = seq_file_net(seq);
- fl = fl->next;
- while (!fl) {
- if (++state->bucket <= FL_HASH_MASK)
- fl = fl_ht[state->bucket];
- else
- break;
+ for_each_fl_continue_rcu(fl) {
+ if (net_eq(fl->fl_net, net))
+ goto out;
}
+
+try_again:
+ if (++state->bucket <= FL_HASH_MASK) {
+ for_each_fl_rcu(state->bucket, fl) {
+ if (net_eq(fl->fl_net, net))
+ goto out;
+ }
+ goto try_again;
+ }
+ fl = NULL;
+
+out:
return fl;
}
@@ -605,8 +804,13 @@ static struct ip6_flowlabel *ip6fl_get_idx(struct seq_file *seq, loff_t pos)
}
static void *ip6fl_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(RCU)
{
- read_lock_bh(&ip6_fl_lock);
+ struct ip6fl_iter_state *state = ip6fl_seq_private(seq);
+
+ state->pid_ns = proc_pid_ns(file_inode(seq->file)->i_sb);
+
+ rcu_read_lock();
return *pos ? ip6fl_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}
@@ -623,81 +827,84 @@ static void *ip6fl_seq_next(struct seq_file *seq, void *v, loff_t *pos)
}
static void ip6fl_seq_stop(struct seq_file *seq, void *v)
+ __releases(RCU)
{
- read_unlock_bh(&ip6_fl_lock);
+ rcu_read_unlock();
}
static int ip6fl_seq_show(struct seq_file *seq, void *v)
{
- if (v == SEQ_START_TOKEN)
- seq_printf(seq, "%-5s %-1s %-6s %-6s %-6s %-8s %-32s %s\n",
- "Label", "S", "Owner", "Users", "Linger", "Expires", "Dst", "Opt");
- else {
+ struct ip6fl_iter_state *state = ip6fl_seq_private(seq);
+ if (v == SEQ_START_TOKEN) {
+ seq_puts(seq, "Label S Owner Users Linger Expires Dst Opt\n");
+ } else {
struct ip6_flowlabel *fl = v;
seq_printf(seq,
- "%05X %-1d %-6d %-6d %-6ld %-8ld " NIP6_SEQFMT " %-4d\n",
- (unsigned)ntohl(fl->label),
+ "%05X %-1d %-6d %-6d %-6ld %-8ld %pi6 %-4d\n",
+ (unsigned int)ntohl(fl->label),
fl->share,
- (unsigned)fl->owner,
+ ((fl->share == IPV6_FL_S_PROCESS) ?
+ pid_nr_ns(fl->owner.pid, state->pid_ns) :
+ ((fl->share == IPV6_FL_S_USER) ?
+ from_kuid_munged(seq_user_ns(seq), fl->owner.uid) :
+ 0)),
atomic_read(&fl->users),
fl->linger/HZ,
(long)(fl->expires - jiffies)/HZ,
- NIP6(fl->dst),
+ &fl->dst,
fl->opt ? fl->opt->opt_nflen : 0);
}
return 0;
}
-static struct seq_operations ip6fl_seq_ops = {
+static const struct seq_operations ip6fl_seq_ops = {
.start = ip6fl_seq_start,
.next = ip6fl_seq_next,
.stop = ip6fl_seq_stop,
.show = ip6fl_seq_show,
};
-static int ip6fl_seq_open(struct inode *inode, struct file *file)
+static int __net_init ip6_flowlabel_proc_init(struct net *net)
{
- struct seq_file *seq;
- int rc = -ENOMEM;
- struct ip6fl_iter_state *s = kzalloc(sizeof(*s), GFP_KERNEL);
+ if (!proc_create_net("ip6_flowlabel", 0444, net->proc_net,
+ &ip6fl_seq_ops, sizeof(struct ip6fl_iter_state)))
+ return -ENOMEM;
+ return 0;
+}
- if (!s)
- goto out;
+static void __net_exit ip6_flowlabel_proc_fini(struct net *net)
+{
+ remove_proc_entry("ip6_flowlabel", net->proc_net);
+}
+#else
+static inline int ip6_flowlabel_proc_init(struct net *net)
+{
+ return 0;
+}
+static inline void ip6_flowlabel_proc_fini(struct net *net)
+{
+}
+#endif
- rc = seq_open(file, &ip6fl_seq_ops);
- if (rc)
- goto out_kfree;
+static void __net_exit ip6_flowlabel_net_exit(struct net *net)
+{
+ ip6_fl_purge(net);
+ ip6_flowlabel_proc_fini(net);
+}
- seq = file->private_data;
- seq->private = s;
-out:
- return rc;
-out_kfree:
- kfree(s);
- goto out;
-}
-
-static struct file_operations ip6fl_seq_fops = {
- .owner = THIS_MODULE,
- .open = ip6fl_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release_private,
+static struct pernet_operations ip6_flowlabel_net_ops = {
+ .init = ip6_flowlabel_proc_init,
+ .exit = ip6_flowlabel_net_exit,
};
-#endif
-
-void ip6_flowlabel_init(void)
+int ip6_flowlabel_init(void)
{
-#ifdef CONFIG_PROC_FS
- proc_net_fops_create("ip6_flowlabel", S_IRUGO, &ip6fl_seq_fops);
-#endif
+ return register_pernet_subsys(&ip6_flowlabel_net_ops);
}
void ip6_flowlabel_cleanup(void)
{
- del_timer(&ip6_fl_gc_timer);
-#ifdef CONFIG_PROC_FS
- proc_net_remove("ip6_flowlabel");
-#endif
+ static_key_deferred_flush(&ipv6_flowlabel_exclusive);
+ timer_delete(&ip6_fl_gc_timer);
+ unregister_pernet_subsys(&ip6_flowlabel_net_ops);
}
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
new file mode 100644
index 000000000000..c82a75510c0e
--- /dev/null
+++ b/net/ipv6/ip6_gre.c
@@ -0,0 +1,2378 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * GRE over IPv6 protocol decoder.
+ *
+ * Authors: Dmitry Kozlov (xeb@mail.ru)
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/in.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/if_arp.h>
+#include <linux/init.h>
+#include <linux/in6.h>
+#include <linux/inetdevice.h>
+#include <linux/igmp.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/etherdevice.h>
+#include <linux/if_ether.h>
+#include <linux/hash.h>
+#include <linux/if_tunnel.h>
+#include <linux/ip6_tunnel.h>
+
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/ip_tunnels.h>
+#include <net/icmp.h>
+#include <net/protocol.h>
+#include <net/addrconf.h>
+#include <net/arp.h>
+#include <net/checksum.h>
+#include <net/dsfield.h>
+#include <net/inet_ecn.h>
+#include <net/xfrm.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/netdev_lock.h>
+#include <net/rtnetlink.h>
+
+#include <net/ipv6.h>
+#include <net/ip6_fib.h>
+#include <net/ip6_route.h>
+#include <net/ip6_tunnel.h>
+#include <net/gre.h>
+#include <net/erspan.h>
+#include <net/dst_metadata.h>
+
+
+static bool log_ecn_error = true;
+module_param(log_ecn_error, bool, 0644);
+MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
+
+#define IP6_GRE_HASH_SIZE_SHIFT 5
+#define IP6_GRE_HASH_SIZE (1 << IP6_GRE_HASH_SIZE_SHIFT)
+
+static unsigned int ip6gre_net_id __read_mostly;
+struct ip6gre_net {
+ struct ip6_tnl __rcu *tunnels[4][IP6_GRE_HASH_SIZE];
+
+ struct ip6_tnl __rcu *collect_md_tun;
+ struct ip6_tnl __rcu *collect_md_tun_erspan;
+ struct net_device *fb_tunnel_dev;
+};
+
+static struct rtnl_link_ops ip6gre_link_ops __read_mostly;
+static struct rtnl_link_ops ip6gre_tap_ops __read_mostly;
+static struct rtnl_link_ops ip6erspan_tap_ops __read_mostly;
+static int ip6gre_tunnel_init(struct net_device *dev);
+static void ip6gre_tunnel_setup(struct net_device *dev);
+static void ip6gre_tunnel_link(struct ip6gre_net *ign, struct ip6_tnl *t);
+static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu);
+static void ip6erspan_tnl_link_config(struct ip6_tnl *t, int set_mtu);
+
+/* Tunnel hash table */
+
+/*
+ 4 hash tables:
+
+ 3: (remote,local)
+ 2: (remote,*)
+ 1: (*,local)
+ 0: (*,*)
+
+ We require exact key match i.e. if a key is present in packet
+ it will match only tunnel with the same key; if it is not present,
+ it will match only keyless tunnel.
+
+ All keysless packets, if not matched configured keyless tunnels
+ will match fallback tunnel.
+ */
+
+#define HASH_KEY(key) (((__force u32)key^((__force u32)key>>4))&(IP6_GRE_HASH_SIZE - 1))
+static u32 HASH_ADDR(const struct in6_addr *addr)
+{
+ u32 hash = ipv6_addr_hash(addr);
+
+ return hash_32(hash, IP6_GRE_HASH_SIZE_SHIFT);
+}
+
+#define tunnels_r_l tunnels[3]
+#define tunnels_r tunnels[2]
+#define tunnels_l tunnels[1]
+#define tunnels_wc tunnels[0]
+
+static bool ip6gre_tunnel_match(struct ip6_tnl *t, int dev_type, int link,
+ int *cand_score, struct ip6_tnl **ret)
+{
+ int score = 0;
+
+ if (t->dev->type != ARPHRD_IP6GRE &&
+ t->dev->type != dev_type)
+ return false;
+
+ if (t->parms.link != link)
+ score |= 1;
+ if (t->dev->type != dev_type)
+ score |= 2;
+ if (score == 0) {
+ *ret = t;
+ return true;
+ }
+
+ if (score < *cand_score) {
+ *ret = t;
+ *cand_score = score;
+ }
+ return false;
+}
+
+/* Given src, dst and key, find appropriate for input tunnel. */
+static struct ip6_tnl *ip6gre_tunnel_lookup(struct net_device *dev,
+ const struct in6_addr *remote, const struct in6_addr *local,
+ __be32 key, __be16 gre_proto)
+{
+ struct net *net = dev_net(dev);
+ int link = dev->ifindex;
+ unsigned int h0 = HASH_ADDR(remote);
+ unsigned int h1 = HASH_KEY(key);
+ struct ip6_tnl *t, *cand = NULL;
+ struct ip6gre_net *ign = net_generic(net, ip6gre_net_id);
+ int dev_type = (gre_proto == htons(ETH_P_TEB) ||
+ gre_proto == htons(ETH_P_ERSPAN) ||
+ gre_proto == htons(ETH_P_ERSPAN2)) ?
+ ARPHRD_ETHER : ARPHRD_IP6GRE;
+ struct net_device *ndev;
+ int cand_score = 4;
+
+ for_each_ip_tunnel_rcu(t, ign->tunnels_r_l[h0 ^ h1]) {
+ if (!ipv6_addr_equal(local, &t->parms.laddr) ||
+ !ipv6_addr_equal(remote, &t->parms.raddr) ||
+ key != t->parms.i_key ||
+ !(t->dev->flags & IFF_UP))
+ continue;
+
+ if (ip6gre_tunnel_match(t, dev_type, link, &cand_score, &cand))
+ return cand;
+ }
+
+ for_each_ip_tunnel_rcu(t, ign->tunnels_r[h0 ^ h1]) {
+ if (!ipv6_addr_equal(remote, &t->parms.raddr) ||
+ key != t->parms.i_key ||
+ !(t->dev->flags & IFF_UP))
+ continue;
+
+ if (ip6gre_tunnel_match(t, dev_type, link, &cand_score, &cand))
+ return cand;
+ }
+
+ for_each_ip_tunnel_rcu(t, ign->tunnels_l[h1]) {
+ if ((!ipv6_addr_equal(local, &t->parms.laddr) &&
+ (!ipv6_addr_equal(local, &t->parms.raddr) ||
+ !ipv6_addr_is_multicast(local))) ||
+ key != t->parms.i_key ||
+ !(t->dev->flags & IFF_UP))
+ continue;
+
+ if (ip6gre_tunnel_match(t, dev_type, link, &cand_score, &cand))
+ return cand;
+ }
+
+ for_each_ip_tunnel_rcu(t, ign->tunnels_wc[h1]) {
+ if (t->parms.i_key != key ||
+ !(t->dev->flags & IFF_UP))
+ continue;
+
+ if (ip6gre_tunnel_match(t, dev_type, link, &cand_score, &cand))
+ return cand;
+ }
+
+ if (cand)
+ return cand;
+
+ if (gre_proto == htons(ETH_P_ERSPAN) ||
+ gre_proto == htons(ETH_P_ERSPAN2))
+ t = rcu_dereference(ign->collect_md_tun_erspan);
+ else
+ t = rcu_dereference(ign->collect_md_tun);
+
+ if (t && t->dev->flags & IFF_UP)
+ return t;
+
+ ndev = READ_ONCE(ign->fb_tunnel_dev);
+ if (ndev && ndev->flags & IFF_UP)
+ return netdev_priv(ndev);
+
+ return NULL;
+}
+
+static struct ip6_tnl __rcu **__ip6gre_bucket(struct ip6gre_net *ign,
+ const struct __ip6_tnl_parm *p)
+{
+ const struct in6_addr *remote = &p->raddr;
+ const struct in6_addr *local = &p->laddr;
+ unsigned int h = HASH_KEY(p->i_key);
+ int prio = 0;
+
+ if (!ipv6_addr_any(local))
+ prio |= 1;
+ if (!ipv6_addr_any(remote) && !ipv6_addr_is_multicast(remote)) {
+ prio |= 2;
+ h ^= HASH_ADDR(remote);
+ }
+
+ return &ign->tunnels[prio][h];
+}
+
+static void ip6gre_tunnel_link_md(struct ip6gre_net *ign, struct ip6_tnl *t)
+{
+ if (t->parms.collect_md)
+ rcu_assign_pointer(ign->collect_md_tun, t);
+}
+
+static void ip6erspan_tunnel_link_md(struct ip6gre_net *ign, struct ip6_tnl *t)
+{
+ if (t->parms.collect_md)
+ rcu_assign_pointer(ign->collect_md_tun_erspan, t);
+}
+
+static void ip6gre_tunnel_unlink_md(struct ip6gre_net *ign, struct ip6_tnl *t)
+{
+ if (t->parms.collect_md)
+ rcu_assign_pointer(ign->collect_md_tun, NULL);
+}
+
+static void ip6erspan_tunnel_unlink_md(struct ip6gre_net *ign,
+ struct ip6_tnl *t)
+{
+ if (t->parms.collect_md)
+ rcu_assign_pointer(ign->collect_md_tun_erspan, NULL);
+}
+
+static inline struct ip6_tnl __rcu **ip6gre_bucket(struct ip6gre_net *ign,
+ const struct ip6_tnl *t)
+{
+ return __ip6gre_bucket(ign, &t->parms);
+}
+
+static void ip6gre_tunnel_link(struct ip6gre_net *ign, struct ip6_tnl *t)
+{
+ struct ip6_tnl __rcu **tp = ip6gre_bucket(ign, t);
+
+ rcu_assign_pointer(t->next, rtnl_dereference(*tp));
+ rcu_assign_pointer(*tp, t);
+}
+
+static void ip6gre_tunnel_unlink(struct ip6gre_net *ign, struct ip6_tnl *t)
+{
+ struct ip6_tnl __rcu **tp;
+ struct ip6_tnl *iter;
+
+ for (tp = ip6gre_bucket(ign, t);
+ (iter = rtnl_dereference(*tp)) != NULL;
+ tp = &iter->next) {
+ if (t == iter) {
+ rcu_assign_pointer(*tp, t->next);
+ break;
+ }
+ }
+}
+
+static struct ip6_tnl *ip6gre_tunnel_find(struct net *net,
+ const struct __ip6_tnl_parm *parms,
+ int type)
+{
+ const struct in6_addr *remote = &parms->raddr;
+ const struct in6_addr *local = &parms->laddr;
+ __be32 key = parms->i_key;
+ int link = parms->link;
+ struct ip6_tnl *t;
+ struct ip6_tnl __rcu **tp;
+ struct ip6gre_net *ign = net_generic(net, ip6gre_net_id);
+
+ for (tp = __ip6gre_bucket(ign, parms);
+ (t = rtnl_dereference(*tp)) != NULL;
+ tp = &t->next)
+ if (ipv6_addr_equal(local, &t->parms.laddr) &&
+ ipv6_addr_equal(remote, &t->parms.raddr) &&
+ key == t->parms.i_key &&
+ link == t->parms.link &&
+ type == t->dev->type)
+ break;
+
+ return t;
+}
+
+static struct ip6_tnl *ip6gre_tunnel_locate(struct net *net,
+ const struct __ip6_tnl_parm *parms, int create)
+{
+ struct ip6_tnl *t, *nt;
+ struct net_device *dev;
+ char name[IFNAMSIZ];
+ struct ip6gre_net *ign = net_generic(net, ip6gre_net_id);
+
+ t = ip6gre_tunnel_find(net, parms, ARPHRD_IP6GRE);
+ if (t && create)
+ return NULL;
+ if (t || !create)
+ return t;
+
+ if (parms->name[0]) {
+ if (!dev_valid_name(parms->name))
+ return NULL;
+ strscpy(name, parms->name);
+ } else {
+ strscpy(name, "ip6gre%d");
+ }
+ dev = alloc_netdev(sizeof(*t), name, NET_NAME_UNKNOWN,
+ ip6gre_tunnel_setup);
+ if (!dev)
+ return NULL;
+
+ dev_net_set(dev, net);
+
+ nt = netdev_priv(dev);
+ nt->parms = *parms;
+ dev->rtnl_link_ops = &ip6gre_link_ops;
+
+ nt->dev = dev;
+ nt->net = dev_net(dev);
+
+ if (register_netdevice(dev) < 0)
+ goto failed_free;
+
+ ip6gre_tnl_link_config(nt, 1);
+ ip6gre_tunnel_link(ign, nt);
+ return nt;
+
+failed_free:
+ free_netdev(dev);
+ return NULL;
+}
+
+static void ip6erspan_tunnel_uninit(struct net_device *dev)
+{
+ struct ip6_tnl *t = netdev_priv(dev);
+ struct ip6gre_net *ign = net_generic(t->net, ip6gre_net_id);
+
+ ip6erspan_tunnel_unlink_md(ign, t);
+ ip6gre_tunnel_unlink(ign, t);
+ dst_cache_reset(&t->dst_cache);
+ netdev_put(dev, &t->dev_tracker);
+}
+
+static void ip6gre_tunnel_uninit(struct net_device *dev)
+{
+ struct ip6_tnl *t = netdev_priv(dev);
+ struct ip6gre_net *ign = net_generic(t->net, ip6gre_net_id);
+
+ ip6gre_tunnel_unlink_md(ign, t);
+ ip6gre_tunnel_unlink(ign, t);
+ if (ign->fb_tunnel_dev == dev)
+ WRITE_ONCE(ign->fb_tunnel_dev, NULL);
+ dst_cache_reset(&t->dst_cache);
+ netdev_put(dev, &t->dev_tracker);
+}
+
+
+static int ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+ u8 type, u8 code, int offset, __be32 info)
+{
+ struct net *net = dev_net(skb->dev);
+ const struct ipv6hdr *ipv6h;
+ struct tnl_ptk_info tpi;
+ struct ip6_tnl *t;
+
+ if (gre_parse_header(skb, &tpi, NULL, htons(ETH_P_IPV6),
+ offset) < 0)
+ return -EINVAL;
+
+ ipv6h = (const struct ipv6hdr *)skb->data;
+ t = ip6gre_tunnel_lookup(skb->dev, &ipv6h->daddr, &ipv6h->saddr,
+ tpi.key, tpi.proto);
+ if (!t)
+ return -ENOENT;
+
+ switch (type) {
+ case ICMPV6_DEST_UNREACH:
+ net_dbg_ratelimited("%s: Path to destination invalid or inactive!\n",
+ t->parms.name);
+ if (code != ICMPV6_PORT_UNREACH)
+ break;
+ return 0;
+ case ICMPV6_TIME_EXCEED:
+ if (code == ICMPV6_EXC_HOPLIMIT) {
+ net_dbg_ratelimited("%s: Too small hop limit or routing loop in tunnel!\n",
+ t->parms.name);
+ break;
+ }
+ return 0;
+ case ICMPV6_PARAMPROB: {
+ struct ipv6_tlv_tnl_enc_lim *tel;
+ __u32 teli;
+
+ teli = 0;
+ if (code == ICMPV6_HDR_FIELD)
+ teli = ip6_tnl_parse_tlv_enc_lim(skb, skb->data);
+
+ if (teli && teli == be32_to_cpu(info) - 2) {
+ tel = (struct ipv6_tlv_tnl_enc_lim *) &skb->data[teli];
+ if (tel->encap_limit == 0) {
+ net_dbg_ratelimited("%s: Too small encapsulation limit or routing loop in tunnel!\n",
+ t->parms.name);
+ }
+ } else {
+ net_dbg_ratelimited("%s: Recipient unable to parse tunneled packet!\n",
+ t->parms.name);
+ }
+ return 0;
+ }
+ case ICMPV6_PKT_TOOBIG:
+ ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL));
+ return 0;
+ case NDISC_REDIRECT:
+ ip6_redirect(skb, net, skb->dev->ifindex, 0,
+ sock_net_uid(net, NULL));
+ return 0;
+ }
+
+ if (time_before(jiffies, t->err_time + IP6TUNNEL_ERR_TIMEO))
+ t->err_count++;
+ else
+ t->err_count = 1;
+ t->err_time = jiffies;
+
+ return 0;
+}
+
+static int ip6gre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi)
+{
+ const struct ipv6hdr *ipv6h;
+ struct ip6_tnl *tunnel;
+
+ ipv6h = ipv6_hdr(skb);
+ tunnel = ip6gre_tunnel_lookup(skb->dev,
+ &ipv6h->saddr, &ipv6h->daddr, tpi->key,
+ tpi->proto);
+ if (tunnel) {
+ if (tunnel->parms.collect_md) {
+ IP_TUNNEL_DECLARE_FLAGS(flags);
+ struct metadata_dst *tun_dst;
+ __be64 tun_id;
+
+ ip_tunnel_flags_copy(flags, tpi->flags);
+ tun_id = key32_to_tunnel_id(tpi->key);
+
+ tun_dst = ipv6_tun_rx_dst(skb, flags, tun_id, 0);
+ if (!tun_dst)
+ return PACKET_REJECT;
+
+ ip6_tnl_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
+ } else {
+ ip6_tnl_rcv(tunnel, skb, tpi, NULL, log_ecn_error);
+ }
+
+ return PACKET_RCVD;
+ }
+
+ return PACKET_REJECT;
+}
+
+static int ip6erspan_rcv(struct sk_buff *skb,
+ struct tnl_ptk_info *tpi,
+ int gre_hdr_len)
+{
+ struct erspan_base_hdr *ershdr;
+ const struct ipv6hdr *ipv6h;
+ struct erspan_md2 *md2;
+ struct ip6_tnl *tunnel;
+ u8 ver;
+
+ if (unlikely(!pskb_may_pull(skb, sizeof(*ershdr))))
+ return PACKET_REJECT;
+
+ ipv6h = ipv6_hdr(skb);
+ ershdr = (struct erspan_base_hdr *)skb->data;
+ ver = ershdr->ver;
+
+ tunnel = ip6gre_tunnel_lookup(skb->dev,
+ &ipv6h->saddr, &ipv6h->daddr, tpi->key,
+ tpi->proto);
+ if (tunnel) {
+ int len = erspan_hdr_len(ver);
+
+ if (unlikely(!pskb_may_pull(skb, len)))
+ return PACKET_REJECT;
+
+ if (__iptunnel_pull_header(skb, len,
+ htons(ETH_P_TEB),
+ false, false) < 0)
+ return PACKET_REJECT;
+
+ if (tunnel->parms.collect_md) {
+ struct erspan_metadata *pkt_md, *md;
+ IP_TUNNEL_DECLARE_FLAGS(flags);
+ struct metadata_dst *tun_dst;
+ struct ip_tunnel_info *info;
+ unsigned char *gh;
+ __be64 tun_id;
+
+ __set_bit(IP_TUNNEL_KEY_BIT, tpi->flags);
+ ip_tunnel_flags_copy(flags, tpi->flags);
+ tun_id = key32_to_tunnel_id(tpi->key);
+
+ tun_dst = ipv6_tun_rx_dst(skb, flags, tun_id,
+ sizeof(*md));
+ if (!tun_dst)
+ return PACKET_REJECT;
+
+ /* skb can be uncloned in __iptunnel_pull_header, so
+ * old pkt_md is no longer valid and we need to reset
+ * it
+ */
+ gh = skb_network_header(skb) +
+ skb_network_header_len(skb);
+ pkt_md = (struct erspan_metadata *)(gh + gre_hdr_len +
+ sizeof(*ershdr));
+ info = &tun_dst->u.tun_info;
+ md = ip_tunnel_info_opts(info);
+ md->version = ver;
+ md2 = &md->u.md2;
+ memcpy(md2, pkt_md, ver == 1 ? ERSPAN_V1_MDSIZE :
+ ERSPAN_V2_MDSIZE);
+ __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT,
+ info->key.tun_flags);
+ info->options_len = sizeof(*md);
+
+ ip6_tnl_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
+
+ } else {
+ ip6_tnl_rcv(tunnel, skb, tpi, NULL, log_ecn_error);
+ }
+
+ return PACKET_RCVD;
+ }
+
+ return PACKET_REJECT;
+}
+
+static int gre_rcv(struct sk_buff *skb)
+{
+ struct tnl_ptk_info tpi;
+ bool csum_err = false;
+ int hdr_len;
+
+ hdr_len = gre_parse_header(skb, &tpi, &csum_err, htons(ETH_P_IPV6), 0);
+ if (hdr_len < 0)
+ goto drop;
+
+ if (iptunnel_pull_header(skb, hdr_len, tpi.proto, false))
+ goto drop;
+
+ if (unlikely(tpi.proto == htons(ETH_P_ERSPAN) ||
+ tpi.proto == htons(ETH_P_ERSPAN2))) {
+ if (ip6erspan_rcv(skb, &tpi, hdr_len) == PACKET_RCVD)
+ return 0;
+ goto out;
+ }
+
+ if (ip6gre_rcv(skb, &tpi) == PACKET_RCVD)
+ return 0;
+
+out:
+ icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
+drop:
+ kfree_skb(skb);
+ return 0;
+}
+
+static int gre_handle_offloads(struct sk_buff *skb, bool csum)
+{
+ return iptunnel_handle_offloads(skb,
+ csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE);
+}
+
+static void prepare_ip6gre_xmit_ipv4(struct sk_buff *skb,
+ struct net_device *dev,
+ struct flowi6 *fl6, __u8 *dsfield,
+ int *encap_limit)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+ struct ip6_tnl *t = netdev_priv(dev);
+
+ if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
+ *encap_limit = t->parms.encap_limit;
+
+ memcpy(fl6, &t->fl.u.ip6, sizeof(*fl6));
+
+ if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
+ *dsfield = ipv4_get_dsfield(iph);
+ else
+ *dsfield = ip6_tclass(t->parms.flowinfo);
+
+ if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
+ fl6->flowi6_mark = skb->mark;
+ else
+ fl6->flowi6_mark = t->parms.fwmark;
+
+ fl6->flowi6_uid = sock_net_uid(dev_net(dev), NULL);
+}
+
+static int prepare_ip6gre_xmit_ipv6(struct sk_buff *skb,
+ struct net_device *dev,
+ struct flowi6 *fl6, __u8 *dsfield,
+ int *encap_limit)
+{
+ struct ipv6hdr *ipv6h;
+ struct ip6_tnl *t = netdev_priv(dev);
+ __u16 offset;
+
+ offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb));
+ /* ip6_tnl_parse_tlv_enc_lim() might have reallocated skb->head */
+ ipv6h = ipv6_hdr(skb);
+
+ if (offset > 0) {
+ struct ipv6_tlv_tnl_enc_lim *tel;
+
+ tel = (struct ipv6_tlv_tnl_enc_lim *)&skb_network_header(skb)[offset];
+ if (tel->encap_limit == 0) {
+ icmpv6_ndo_send(skb, ICMPV6_PARAMPROB,
+ ICMPV6_HDR_FIELD, offset + 2);
+ return -1;
+ }
+ *encap_limit = tel->encap_limit - 1;
+ } else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) {
+ *encap_limit = t->parms.encap_limit;
+ }
+
+ memcpy(fl6, &t->fl.u.ip6, sizeof(*fl6));
+
+ if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
+ *dsfield = ipv6_get_dsfield(ipv6h);
+ else
+ *dsfield = ip6_tclass(t->parms.flowinfo);
+
+ if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL)
+ fl6->flowlabel |= ip6_flowlabel(ipv6h);
+
+ if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
+ fl6->flowi6_mark = skb->mark;
+ else
+ fl6->flowi6_mark = t->parms.fwmark;
+
+ fl6->flowi6_uid = sock_net_uid(dev_net(dev), NULL);
+
+ return 0;
+}
+
+static int prepare_ip6gre_xmit_other(struct sk_buff *skb,
+ struct net_device *dev,
+ struct flowi6 *fl6, __u8 *dsfield,
+ int *encap_limit)
+{
+ struct ip6_tnl *t = netdev_priv(dev);
+
+ if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
+ *encap_limit = t->parms.encap_limit;
+
+ memcpy(fl6, &t->fl.u.ip6, sizeof(*fl6));
+
+ if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
+ *dsfield = 0;
+ else
+ *dsfield = ip6_tclass(t->parms.flowinfo);
+
+ if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
+ fl6->flowi6_mark = skb->mark;
+ else
+ fl6->flowi6_mark = t->parms.fwmark;
+
+ fl6->flowi6_uid = sock_net_uid(dev_net(dev), NULL);
+
+ return 0;
+}
+
+static struct ip_tunnel_info *skb_tunnel_info_txcheck(struct sk_buff *skb)
+{
+ struct ip_tunnel_info *tun_info;
+
+ tun_info = skb_tunnel_info(skb);
+ if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX)))
+ return ERR_PTR(-EINVAL);
+
+ return tun_info;
+}
+
+static netdev_tx_t __gre6_xmit(struct sk_buff *skb,
+ struct net_device *dev, __u8 dsfield,
+ struct flowi6 *fl6, int encap_limit,
+ __u32 *pmtu, __be16 proto)
+{
+ struct ip6_tnl *tunnel = netdev_priv(dev);
+ IP_TUNNEL_DECLARE_FLAGS(flags);
+ __be16 protocol;
+
+ if (dev->type == ARPHRD_ETHER)
+ IPCB(skb)->flags = 0;
+
+ if (dev->header_ops && dev->type == ARPHRD_IP6GRE)
+ fl6->daddr = ((struct ipv6hdr *)skb->data)->daddr;
+ else
+ fl6->daddr = tunnel->parms.raddr;
+
+ /* Push GRE header. */
+ protocol = (dev->type == ARPHRD_ETHER) ? htons(ETH_P_TEB) : proto;
+
+ if (tunnel->parms.collect_md) {
+ struct ip_tunnel_info *tun_info;
+ const struct ip_tunnel_key *key;
+ int tun_hlen;
+
+ tun_info = skb_tunnel_info_txcheck(skb);
+ if (IS_ERR(tun_info) ||
+ unlikely(ip_tunnel_info_af(tun_info) != AF_INET6))
+ return -EINVAL;
+
+ key = &tun_info->key;
+ memset(fl6, 0, sizeof(*fl6));
+ fl6->flowi6_proto = IPPROTO_GRE;
+ fl6->daddr = key->u.ipv6.dst;
+ fl6->flowlabel = key->label;
+ fl6->flowi6_uid = sock_net_uid(dev_net(dev), NULL);
+ fl6->fl6_gre_key = tunnel_id_to_key32(key->tun_id);
+
+ dsfield = key->tos;
+ ip_tunnel_flags_zero(flags);
+ __set_bit(IP_TUNNEL_CSUM_BIT, flags);
+ __set_bit(IP_TUNNEL_KEY_BIT, flags);
+ __set_bit(IP_TUNNEL_SEQ_BIT, flags);
+ ip_tunnel_flags_and(flags, flags, key->tun_flags);
+ tun_hlen = gre_calc_hlen(flags);
+
+ if (skb_cow_head(skb, dev->needed_headroom ?: tun_hlen + tunnel->encap_hlen))
+ return -ENOMEM;
+
+ gre_build_header(skb, tun_hlen,
+ flags, protocol,
+ tunnel_id_to_key32(tun_info->key.tun_id),
+ test_bit(IP_TUNNEL_SEQ_BIT, flags) ?
+ htonl(atomic_fetch_inc(&tunnel->o_seqno)) :
+ 0);
+
+ } else {
+ if (skb_cow_head(skb, dev->needed_headroom ?: tunnel->hlen))
+ return -ENOMEM;
+
+ ip_tunnel_flags_copy(flags, tunnel->parms.o_flags);
+
+ gre_build_header(skb, tunnel->tun_hlen, flags,
+ protocol, tunnel->parms.o_key,
+ test_bit(IP_TUNNEL_SEQ_BIT, flags) ?
+ htonl(atomic_fetch_inc(&tunnel->o_seqno)) :
+ 0);
+ }
+
+ return ip6_tnl_xmit(skb, dev, dsfield, fl6, encap_limit, pmtu,
+ NEXTHDR_GRE);
+}
+
+static inline int ip6gre_xmit_ipv4(struct sk_buff *skb, struct net_device *dev)
+{
+ struct ip6_tnl *t = netdev_priv(dev);
+ int encap_limit = -1;
+ struct flowi6 fl6;
+ __u8 dsfield = 0;
+ __u32 mtu;
+ int err;
+
+ memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+
+ if (!t->parms.collect_md)
+ prepare_ip6gre_xmit_ipv4(skb, dev, &fl6,
+ &dsfield, &encap_limit);
+
+ err = gre_handle_offloads(skb, test_bit(IP_TUNNEL_CSUM_BIT,
+ t->parms.o_flags));
+ if (err)
+ return -1;
+
+ err = __gre6_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu,
+ skb->protocol);
+ if (err != 0) {
+ /* XXX: send ICMP error even if DF is not set. */
+ if (err == -EMSGSIZE)
+ icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
+ htonl(mtu));
+ return -1;
+ }
+
+ return 0;
+}
+
+static inline int ip6gre_xmit_ipv6(struct sk_buff *skb, struct net_device *dev)
+{
+ struct ip6_tnl *t = netdev_priv(dev);
+ struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+ int encap_limit = -1;
+ struct flowi6 fl6;
+ __u8 dsfield = 0;
+ __u32 mtu;
+ int err;
+
+ if (ipv6_addr_equal(&t->parms.raddr, &ipv6h->saddr))
+ return -1;
+
+ if (!t->parms.collect_md &&
+ prepare_ip6gre_xmit_ipv6(skb, dev, &fl6, &dsfield, &encap_limit))
+ return -1;
+
+ if (gre_handle_offloads(skb, test_bit(IP_TUNNEL_CSUM_BIT,
+ t->parms.o_flags)))
+ return -1;
+
+ err = __gre6_xmit(skb, dev, dsfield, &fl6, encap_limit,
+ &mtu, skb->protocol);
+ if (err != 0) {
+ if (err == -EMSGSIZE)
+ icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+ return -1;
+ }
+
+ return 0;
+}
+
+static int ip6gre_xmit_other(struct sk_buff *skb, struct net_device *dev)
+{
+ struct ip6_tnl *t = netdev_priv(dev);
+ int encap_limit = -1;
+ struct flowi6 fl6;
+ __u8 dsfield = 0;
+ __u32 mtu;
+ int err;
+
+ if (!t->parms.collect_md &&
+ prepare_ip6gre_xmit_other(skb, dev, &fl6, &dsfield, &encap_limit))
+ return -1;
+
+ err = gre_handle_offloads(skb, test_bit(IP_TUNNEL_CSUM_BIT,
+ t->parms.o_flags));
+ if (err)
+ return err;
+ err = __gre6_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, skb->protocol);
+
+ return err;
+}
+
+static netdev_tx_t ip6gre_tunnel_xmit(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ struct ip6_tnl *t = netdev_priv(dev);
+ __be16 payload_protocol;
+ int ret;
+
+ if (!pskb_inet_may_pull(skb))
+ goto tx_err;
+
+ if (!ip6_tnl_xmit_ctl(t, &t->parms.laddr, &t->parms.raddr))
+ goto tx_err;
+
+ payload_protocol = skb_protocol(skb, true);
+ switch (payload_protocol) {
+ case htons(ETH_P_IP):
+ ret = ip6gre_xmit_ipv4(skb, dev);
+ break;
+ case htons(ETH_P_IPV6):
+ ret = ip6gre_xmit_ipv6(skb, dev);
+ break;
+ default:
+ ret = ip6gre_xmit_other(skb, dev);
+ break;
+ }
+
+ if (ret < 0)
+ goto tx_err;
+
+ return NETDEV_TX_OK;
+
+tx_err:
+ if (!t->parms.collect_md || !IS_ERR(skb_tunnel_info_txcheck(skb)))
+ DEV_STATS_INC(dev, tx_errors);
+ DEV_STATS_INC(dev, tx_dropped);
+ kfree_skb(skb);
+ return NETDEV_TX_OK;
+}
+
+static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ struct ip_tunnel_info *tun_info = NULL;
+ struct ip6_tnl *t = netdev_priv(dev);
+ struct dst_entry *dst = skb_dst(skb);
+ IP_TUNNEL_DECLARE_FLAGS(flags) = { };
+ bool truncate = false;
+ int encap_limit = -1;
+ __u8 dsfield = false;
+ struct flowi6 fl6;
+ int err = -EINVAL;
+ __be16 proto;
+ __u32 mtu;
+ int nhoff;
+
+ if (!pskb_inet_may_pull(skb))
+ goto tx_err;
+
+ if (!ip6_tnl_xmit_ctl(t, &t->parms.laddr, &t->parms.raddr))
+ goto tx_err;
+
+ if (gre_handle_offloads(skb, false))
+ goto tx_err;
+
+ if (skb->len > dev->mtu + dev->hard_header_len) {
+ if (pskb_trim(skb, dev->mtu + dev->hard_header_len))
+ goto tx_err;
+ truncate = true;
+ }
+
+ nhoff = skb_network_offset(skb);
+ if (skb->protocol == htons(ETH_P_IP) &&
+ (ntohs(ip_hdr(skb)->tot_len) > skb->len - nhoff))
+ truncate = true;
+
+ if (skb->protocol == htons(ETH_P_IPV6)) {
+ int thoff;
+
+ if (skb_transport_header_was_set(skb))
+ thoff = skb_transport_offset(skb);
+ else
+ thoff = nhoff + sizeof(struct ipv6hdr);
+ if (ntohs(ipv6_hdr(skb)->payload_len) > skb->len - thoff)
+ truncate = true;
+ }
+
+ if (skb_cow_head(skb, dev->needed_headroom ?: t->hlen))
+ goto tx_err;
+
+ __clear_bit(IP_TUNNEL_KEY_BIT, t->parms.o_flags);
+ IPCB(skb)->flags = 0;
+
+ /* For collect_md mode, derive fl6 from the tunnel key,
+ * for native mode, call prepare_ip6gre_xmit_{ipv4,ipv6}.
+ */
+ if (t->parms.collect_md) {
+ const struct ip_tunnel_key *key;
+ struct erspan_metadata *md;
+ __be32 tun_id;
+
+ tun_info = skb_tunnel_info_txcheck(skb);
+ if (IS_ERR(tun_info) ||
+ unlikely(ip_tunnel_info_af(tun_info) != AF_INET6))
+ goto tx_err;
+
+ key = &tun_info->key;
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.flowi6_proto = IPPROTO_GRE;
+ fl6.daddr = key->u.ipv6.dst;
+ fl6.flowlabel = key->label;
+ fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL);
+ fl6.fl6_gre_key = tunnel_id_to_key32(key->tun_id);
+
+ dsfield = key->tos;
+ if (!test_bit(IP_TUNNEL_ERSPAN_OPT_BIT,
+ tun_info->key.tun_flags))
+ goto tx_err;
+ if (tun_info->options_len < sizeof(*md))
+ goto tx_err;
+ md = ip_tunnel_info_opts(tun_info);
+
+ tun_id = tunnel_id_to_key32(key->tun_id);
+ if (md->version == 1) {
+ erspan_build_header(skb,
+ ntohl(tun_id),
+ ntohl(md->u.index), truncate,
+ false);
+ proto = htons(ETH_P_ERSPAN);
+ } else if (md->version == 2) {
+ erspan_build_header_v2(skb,
+ ntohl(tun_id),
+ md->u.md2.dir,
+ get_hwid(&md->u.md2),
+ truncate, false);
+ proto = htons(ETH_P_ERSPAN2);
+ } else {
+ goto tx_err;
+ }
+ } else {
+ switch (skb->protocol) {
+ case htons(ETH_P_IP):
+ memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+ prepare_ip6gre_xmit_ipv4(skb, dev, &fl6,
+ &dsfield, &encap_limit);
+ break;
+ case htons(ETH_P_IPV6):
+ if (ipv6_addr_equal(&t->parms.raddr, &ipv6_hdr(skb)->saddr))
+ goto tx_err;
+ if (prepare_ip6gre_xmit_ipv6(skb, dev, &fl6,
+ &dsfield, &encap_limit))
+ goto tx_err;
+ break;
+ default:
+ memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
+ break;
+ }
+
+ if (t->parms.erspan_ver == 1) {
+ erspan_build_header(skb, ntohl(t->parms.o_key),
+ t->parms.index,
+ truncate, false);
+ proto = htons(ETH_P_ERSPAN);
+ } else if (t->parms.erspan_ver == 2) {
+ erspan_build_header_v2(skb, ntohl(t->parms.o_key),
+ t->parms.dir,
+ t->parms.hwid,
+ truncate, false);
+ proto = htons(ETH_P_ERSPAN2);
+ } else {
+ goto tx_err;
+ }
+
+ fl6.daddr = t->parms.raddr;
+ }
+
+ /* Push GRE header. */
+ __set_bit(IP_TUNNEL_SEQ_BIT, flags);
+ gre_build_header(skb, 8, flags, proto, 0,
+ htonl(atomic_fetch_inc(&t->o_seqno)));
+
+ /* TooBig packet may have updated dst->dev's mtu */
+ if (!t->parms.collect_md && dst) {
+ mtu = READ_ONCE(dst_dev(dst)->mtu);
+ if (dst_mtu(dst) > mtu)
+ dst->ops->update_pmtu(dst, NULL, skb, mtu, false);
+ }
+ err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu,
+ NEXTHDR_GRE);
+ if (err != 0) {
+ /* XXX: send ICMP error even if DF is not set. */
+ if (err == -EMSGSIZE) {
+ if (skb->protocol == htons(ETH_P_IP))
+ icmp_ndo_send(skb, ICMP_DEST_UNREACH,
+ ICMP_FRAG_NEEDED, htonl(mtu));
+ else
+ icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+ }
+
+ goto tx_err;
+ }
+ return NETDEV_TX_OK;
+
+tx_err:
+ if (!IS_ERR(tun_info))
+ DEV_STATS_INC(dev, tx_errors);
+ DEV_STATS_INC(dev, tx_dropped);
+ kfree_skb(skb);
+ return NETDEV_TX_OK;
+}
+
+static void ip6gre_tnl_link_config_common(struct ip6_tnl *t)
+{
+ struct net_device *dev = t->dev;
+ struct __ip6_tnl_parm *p = &t->parms;
+ struct flowi6 *fl6 = &t->fl.u.ip6;
+
+ if (dev->type != ARPHRD_ETHER) {
+ __dev_addr_set(dev, &p->laddr, sizeof(struct in6_addr));
+ memcpy(dev->broadcast, &p->raddr, sizeof(struct in6_addr));
+ }
+
+ /* Set up flowi template */
+ fl6->saddr = p->laddr;
+ fl6->daddr = p->raddr;
+ fl6->flowi6_oif = p->link;
+ fl6->flowlabel = 0;
+ fl6->flowi6_proto = IPPROTO_GRE;
+ fl6->fl6_gre_key = t->parms.o_key;
+
+ if (!(p->flags&IP6_TNL_F_USE_ORIG_TCLASS))
+ fl6->flowlabel |= IPV6_TCLASS_MASK & p->flowinfo;
+ if (!(p->flags&IP6_TNL_F_USE_ORIG_FLOWLABEL))
+ fl6->flowlabel |= IPV6_FLOWLABEL_MASK & p->flowinfo;
+
+ p->flags &= ~(IP6_TNL_F_CAP_XMIT|IP6_TNL_F_CAP_RCV|IP6_TNL_F_CAP_PER_PACKET);
+ p->flags |= ip6_tnl_get_cap(t, &p->laddr, &p->raddr);
+
+ if (p->flags&IP6_TNL_F_CAP_XMIT &&
+ p->flags&IP6_TNL_F_CAP_RCV && dev->type != ARPHRD_ETHER)
+ dev->flags |= IFF_POINTOPOINT;
+ else
+ dev->flags &= ~IFF_POINTOPOINT;
+}
+
+static void ip6gre_tnl_link_config_route(struct ip6_tnl *t, int set_mtu,
+ int t_hlen)
+{
+ const struct __ip6_tnl_parm *p = &t->parms;
+ struct net_device *dev = t->dev;
+
+ if (p->flags & IP6_TNL_F_CAP_XMIT) {
+ int strict = (ipv6_addr_type(&p->raddr) &
+ (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL));
+
+ struct rt6_info *rt = rt6_lookup(t->net,
+ &p->raddr, &p->laddr,
+ p->link, NULL, strict);
+
+ if (!rt)
+ return;
+
+ if (rt->dst.dev) {
+ unsigned short dst_len = rt->dst.dev->hard_header_len +
+ t_hlen;
+
+ if (t->dev->header_ops)
+ dev->hard_header_len = dst_len;
+ else
+ dev->needed_headroom = dst_len;
+
+ if (set_mtu) {
+ int mtu = rt->dst.dev->mtu - t_hlen;
+
+ if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
+ mtu -= 8;
+ if (dev->type == ARPHRD_ETHER)
+ mtu -= ETH_HLEN;
+
+ if (mtu < IPV6_MIN_MTU)
+ mtu = IPV6_MIN_MTU;
+ WRITE_ONCE(dev->mtu, mtu);
+ }
+ }
+ ip6_rt_put(rt);
+ }
+}
+
+static int ip6gre_calc_hlen(struct ip6_tnl *tunnel)
+{
+ int t_hlen;
+
+ tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags);
+ tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen;
+
+ t_hlen = tunnel->hlen + sizeof(struct ipv6hdr);
+
+ if (tunnel->dev->header_ops)
+ tunnel->dev->hard_header_len = LL_MAX_HEADER + t_hlen;
+ else
+ tunnel->dev->needed_headroom = LL_MAX_HEADER + t_hlen;
+
+ return t_hlen;
+}
+
+static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu)
+{
+ ip6gre_tnl_link_config_common(t);
+ ip6gre_tnl_link_config_route(t, set_mtu, ip6gre_calc_hlen(t));
+}
+
+static void ip6gre_tnl_copy_tnl_parm(struct ip6_tnl *t,
+ const struct __ip6_tnl_parm *p)
+{
+ t->parms.laddr = p->laddr;
+ t->parms.raddr = p->raddr;
+ t->parms.flags = p->flags;
+ t->parms.hop_limit = p->hop_limit;
+ t->parms.encap_limit = p->encap_limit;
+ t->parms.flowinfo = p->flowinfo;
+ t->parms.link = p->link;
+ t->parms.proto = p->proto;
+ t->parms.i_key = p->i_key;
+ t->parms.o_key = p->o_key;
+ ip_tunnel_flags_copy(t->parms.i_flags, p->i_flags);
+ ip_tunnel_flags_copy(t->parms.o_flags, p->o_flags);
+ t->parms.fwmark = p->fwmark;
+ t->parms.erspan_ver = p->erspan_ver;
+ t->parms.index = p->index;
+ t->parms.dir = p->dir;
+ t->parms.hwid = p->hwid;
+ dst_cache_reset(&t->dst_cache);
+}
+
+static int ip6gre_tnl_change(struct ip6_tnl *t, const struct __ip6_tnl_parm *p,
+ int set_mtu)
+{
+ ip6gre_tnl_copy_tnl_parm(t, p);
+ ip6gre_tnl_link_config(t, set_mtu);
+ return 0;
+}
+
+static void ip6gre_tnl_parm_from_user(struct __ip6_tnl_parm *p,
+ const struct ip6_tnl_parm2 *u)
+{
+ p->laddr = u->laddr;
+ p->raddr = u->raddr;
+ p->flags = u->flags;
+ p->hop_limit = u->hop_limit;
+ p->encap_limit = u->encap_limit;
+ p->flowinfo = u->flowinfo;
+ p->link = u->link;
+ p->i_key = u->i_key;
+ p->o_key = u->o_key;
+ gre_flags_to_tnl_flags(p->i_flags, u->i_flags);
+ gre_flags_to_tnl_flags(p->o_flags, u->o_flags);
+ memcpy(p->name, u->name, sizeof(u->name));
+}
+
+static void ip6gre_tnl_parm_to_user(struct ip6_tnl_parm2 *u,
+ const struct __ip6_tnl_parm *p)
+{
+ u->proto = IPPROTO_GRE;
+ u->laddr = p->laddr;
+ u->raddr = p->raddr;
+ u->flags = p->flags;
+ u->hop_limit = p->hop_limit;
+ u->encap_limit = p->encap_limit;
+ u->flowinfo = p->flowinfo;
+ u->link = p->link;
+ u->i_key = p->i_key;
+ u->o_key = p->o_key;
+ u->i_flags = gre_tnl_flags_to_gre_flags(p->i_flags);
+ u->o_flags = gre_tnl_flags_to_gre_flags(p->o_flags);
+ memcpy(u->name, p->name, sizeof(u->name));
+}
+
+static int ip6gre_tunnel_siocdevprivate(struct net_device *dev,
+ struct ifreq *ifr, void __user *data,
+ int cmd)
+{
+ int err = 0;
+ struct ip6_tnl_parm2 p;
+ struct __ip6_tnl_parm p1;
+ struct ip6_tnl *t = netdev_priv(dev);
+ struct net *net = t->net;
+ struct ip6gre_net *ign = net_generic(net, ip6gre_net_id);
+
+ memset(&p1, 0, sizeof(p1));
+
+ switch (cmd) {
+ case SIOCGETTUNNEL:
+ if (dev == ign->fb_tunnel_dev) {
+ if (copy_from_user(&p, data, sizeof(p))) {
+ err = -EFAULT;
+ break;
+ }
+ ip6gre_tnl_parm_from_user(&p1, &p);
+ t = ip6gre_tunnel_locate(net, &p1, 0);
+ if (!t)
+ t = netdev_priv(dev);
+ }
+ memset(&p, 0, sizeof(p));
+ ip6gre_tnl_parm_to_user(&p, &t->parms);
+ if (copy_to_user(data, &p, sizeof(p)))
+ err = -EFAULT;
+ break;
+
+ case SIOCADDTUNNEL:
+ case SIOCCHGTUNNEL:
+ err = -EPERM;
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ goto done;
+
+ err = -EFAULT;
+ if (copy_from_user(&p, data, sizeof(p)))
+ goto done;
+
+ err = -EINVAL;
+ if ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING))
+ goto done;
+
+ if (!(p.i_flags&GRE_KEY))
+ p.i_key = 0;
+ if (!(p.o_flags&GRE_KEY))
+ p.o_key = 0;
+
+ ip6gre_tnl_parm_from_user(&p1, &p);
+ t = ip6gre_tunnel_locate(net, &p1, cmd == SIOCADDTUNNEL);
+
+ if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
+ if (t) {
+ if (t->dev != dev) {
+ err = -EEXIST;
+ break;
+ }
+ } else {
+ t = netdev_priv(dev);
+
+ ip6gre_tunnel_unlink(ign, t);
+ synchronize_net();
+ ip6gre_tnl_change(t, &p1, 1);
+ ip6gre_tunnel_link(ign, t);
+ netdev_state_change(dev);
+ }
+ }
+
+ if (t) {
+ err = 0;
+
+ memset(&p, 0, sizeof(p));
+ ip6gre_tnl_parm_to_user(&p, &t->parms);
+ if (copy_to_user(data, &p, sizeof(p)))
+ err = -EFAULT;
+ } else
+ err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
+ break;
+
+ case SIOCDELTUNNEL:
+ err = -EPERM;
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ goto done;
+
+ if (dev == ign->fb_tunnel_dev) {
+ err = -EFAULT;
+ if (copy_from_user(&p, data, sizeof(p)))
+ goto done;
+ err = -ENOENT;
+ ip6gre_tnl_parm_from_user(&p1, &p);
+ t = ip6gre_tunnel_locate(net, &p1, 0);
+ if (!t)
+ goto done;
+ err = -EPERM;
+ if (t == netdev_priv(ign->fb_tunnel_dev))
+ goto done;
+ dev = t->dev;
+ }
+ unregister_netdevice(dev);
+ err = 0;
+ break;
+
+ default:
+ err = -EINVAL;
+ }
+
+done:
+ return err;
+}
+
+static int ip6gre_header(struct sk_buff *skb, struct net_device *dev,
+ unsigned short type, const void *daddr,
+ const void *saddr, unsigned int len)
+{
+ struct ip6_tnl *t = netdev_priv(dev);
+ struct ipv6hdr *ipv6h;
+ __be16 *p;
+
+ ipv6h = skb_push(skb, t->hlen + sizeof(*ipv6h));
+ ip6_flow_hdr(ipv6h, 0, ip6_make_flowlabel(dev_net(dev), skb,
+ t->fl.u.ip6.flowlabel,
+ true, &t->fl.u.ip6));
+ ipv6h->hop_limit = t->parms.hop_limit;
+ ipv6h->nexthdr = NEXTHDR_GRE;
+ ipv6h->saddr = t->parms.laddr;
+ ipv6h->daddr = t->parms.raddr;
+
+ p = (__be16 *)(ipv6h + 1);
+ p[0] = ip_tunnel_flags_to_be16(t->parms.o_flags);
+ p[1] = htons(type);
+
+ /*
+ * Set the source hardware address.
+ */
+
+ if (saddr)
+ memcpy(&ipv6h->saddr, saddr, sizeof(struct in6_addr));
+ if (daddr)
+ memcpy(&ipv6h->daddr, daddr, sizeof(struct in6_addr));
+ if (!ipv6_addr_any(&ipv6h->daddr))
+ return t->hlen;
+
+ return -t->hlen;
+}
+
+static const struct header_ops ip6gre_header_ops = {
+ .create = ip6gre_header,
+};
+
+static const struct net_device_ops ip6gre_netdev_ops = {
+ .ndo_init = ip6gre_tunnel_init,
+ .ndo_uninit = ip6gre_tunnel_uninit,
+ .ndo_start_xmit = ip6gre_tunnel_xmit,
+ .ndo_siocdevprivate = ip6gre_tunnel_siocdevprivate,
+ .ndo_change_mtu = ip6_tnl_change_mtu,
+ .ndo_get_iflink = ip6_tnl_get_iflink,
+};
+
+static void ip6gre_dev_free(struct net_device *dev)
+{
+ struct ip6_tnl *t = netdev_priv(dev);
+
+ gro_cells_destroy(&t->gro_cells);
+ dst_cache_destroy(&t->dst_cache);
+}
+
+static void ip6gre_tunnel_setup(struct net_device *dev)
+{
+ dev->netdev_ops = &ip6gre_netdev_ops;
+ dev->needs_free_netdev = true;
+ dev->priv_destructor = ip6gre_dev_free;
+
+ dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS;
+ dev->type = ARPHRD_IP6GRE;
+
+ dev->flags |= IFF_NOARP;
+ dev->addr_len = sizeof(struct in6_addr);
+ netif_keep_dst(dev);
+ /* This perm addr will be used as interface identifier by IPv6 */
+ dev->addr_assign_type = NET_ADDR_RANDOM;
+ eth_random_addr(dev->perm_addr);
+}
+
+#define GRE6_FEATURES (NETIF_F_SG | \
+ NETIF_F_FRAGLIST | \
+ NETIF_F_HIGHDMA | \
+ NETIF_F_HW_CSUM)
+
+static void ip6gre_tnl_init_features(struct net_device *dev)
+{
+ struct ip6_tnl *nt = netdev_priv(dev);
+
+ dev->features |= GRE6_FEATURES;
+ dev->hw_features |= GRE6_FEATURES;
+
+ /* TCP offload with GRE SEQ is not supported, nor can we support 2
+ * levels of outer headers requiring an update.
+ */
+ if (test_bit(IP_TUNNEL_SEQ_BIT, nt->parms.o_flags))
+ return;
+ if (test_bit(IP_TUNNEL_CSUM_BIT, nt->parms.o_flags) &&
+ nt->encap.type != TUNNEL_ENCAP_NONE)
+ return;
+
+ dev->features |= NETIF_F_GSO_SOFTWARE;
+ dev->hw_features |= NETIF_F_GSO_SOFTWARE;
+
+ dev->lltx = true;
+}
+
+static int ip6gre_tunnel_init_common(struct net_device *dev)
+{
+ struct ip6_tnl *tunnel;
+ int ret;
+ int t_hlen;
+
+ tunnel = netdev_priv(dev);
+
+ tunnel->dev = dev;
+ strscpy(tunnel->parms.name, dev->name);
+
+ ret = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
+ if (ret)
+ return ret;
+
+ ret = gro_cells_init(&tunnel->gro_cells, dev);
+ if (ret)
+ goto cleanup_dst_cache_init;
+
+ t_hlen = ip6gre_calc_hlen(tunnel);
+ dev->mtu = ETH_DATA_LEN - t_hlen;
+ if (dev->type == ARPHRD_ETHER)
+ dev->mtu -= ETH_HLEN;
+ if (!(tunnel->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
+ dev->mtu -= 8;
+
+ if (tunnel->parms.collect_md) {
+ netif_keep_dst(dev);
+ }
+ ip6gre_tnl_init_features(dev);
+
+ netdev_hold(dev, &tunnel->dev_tracker, GFP_KERNEL);
+ netdev_lockdep_set_classes(dev);
+ return 0;
+
+cleanup_dst_cache_init:
+ dst_cache_destroy(&tunnel->dst_cache);
+ return ret;
+}
+
+static int ip6gre_tunnel_init(struct net_device *dev)
+{
+ struct ip6_tnl *tunnel;
+ int ret;
+
+ ret = ip6gre_tunnel_init_common(dev);
+ if (ret)
+ return ret;
+
+ tunnel = netdev_priv(dev);
+
+ if (tunnel->parms.collect_md)
+ return 0;
+
+ __dev_addr_set(dev, &tunnel->parms.laddr, sizeof(struct in6_addr));
+ memcpy(dev->broadcast, &tunnel->parms.raddr, sizeof(struct in6_addr));
+
+ if (ipv6_addr_any(&tunnel->parms.raddr))
+ dev->header_ops = &ip6gre_header_ops;
+
+ return 0;
+}
+
+static void ip6gre_fb_tunnel_init(struct net_device *dev)
+{
+ struct ip6_tnl *tunnel = netdev_priv(dev);
+
+ tunnel->dev = dev;
+ tunnel->net = dev_net(dev);
+ strscpy(tunnel->parms.name, dev->name);
+
+ tunnel->hlen = sizeof(struct ipv6hdr) + 4;
+}
+
+static struct inet6_protocol ip6gre_protocol __read_mostly = {
+ .handler = gre_rcv,
+ .err_handler = ip6gre_err,
+ .flags = INET6_PROTO_FINAL,
+};
+
+static void __net_exit ip6gre_exit_rtnl_net(struct net *net, struct list_head *head)
+{
+ struct ip6gre_net *ign = net_generic(net, ip6gre_net_id);
+ struct net_device *dev, *aux;
+ int prio;
+
+ for_each_netdev_safe(net, dev, aux)
+ if (dev->rtnl_link_ops == &ip6gre_link_ops ||
+ dev->rtnl_link_ops == &ip6gre_tap_ops ||
+ dev->rtnl_link_ops == &ip6erspan_tap_ops)
+ unregister_netdevice_queue(dev, head);
+
+ for (prio = 0; prio < 4; prio++) {
+ int h;
+ for (h = 0; h < IP6_GRE_HASH_SIZE; h++) {
+ struct ip6_tnl *t;
+
+ t = rtnl_net_dereference(net, ign->tunnels[prio][h]);
+
+ while (t) {
+ /* If dev is in the same netns, it has already
+ * been added to the list by the previous loop.
+ */
+ if (!net_eq(dev_net(t->dev), net))
+ unregister_netdevice_queue(t->dev, head);
+
+ t = rtnl_net_dereference(net, t->next);
+ }
+ }
+ }
+}
+
+static int __net_init ip6gre_init_net(struct net *net)
+{
+ struct ip6gre_net *ign = net_generic(net, ip6gre_net_id);
+ struct net_device *ndev;
+ int err;
+
+ if (!net_has_fallback_tunnels(net))
+ return 0;
+ ndev = alloc_netdev(sizeof(struct ip6_tnl), "ip6gre0",
+ NET_NAME_UNKNOWN, ip6gre_tunnel_setup);
+ if (!ndev) {
+ err = -ENOMEM;
+ goto err_alloc_dev;
+ }
+ ign->fb_tunnel_dev = ndev;
+ dev_net_set(ign->fb_tunnel_dev, net);
+ /* FB netdevice is special: we have one, and only one per netns.
+ * Allowing to move it to another netns is clearly unsafe.
+ */
+ ign->fb_tunnel_dev->netns_immutable = true;
+
+ ip6gre_fb_tunnel_init(ign->fb_tunnel_dev);
+ ign->fb_tunnel_dev->rtnl_link_ops = &ip6gre_link_ops;
+
+ err = register_netdev(ign->fb_tunnel_dev);
+ if (err)
+ goto err_reg_dev;
+
+ rcu_assign_pointer(ign->tunnels_wc[0],
+ netdev_priv(ign->fb_tunnel_dev));
+ return 0;
+
+err_reg_dev:
+ free_netdev(ndev);
+err_alloc_dev:
+ return err;
+}
+
+static struct pernet_operations ip6gre_net_ops = {
+ .init = ip6gre_init_net,
+ .exit_rtnl = ip6gre_exit_rtnl_net,
+ .id = &ip6gre_net_id,
+ .size = sizeof(struct ip6gre_net),
+};
+
+static int ip6gre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[],
+ struct netlink_ext_ack *extack)
+{
+ __be16 flags;
+
+ if (!data)
+ return 0;
+
+ flags = 0;
+ if (data[IFLA_GRE_IFLAGS])
+ flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
+ if (data[IFLA_GRE_OFLAGS])
+ flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
+ if (flags & (GRE_VERSION|GRE_ROUTING))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int ip6gre_tap_validate(struct nlattr *tb[], struct nlattr *data[],
+ struct netlink_ext_ack *extack)
+{
+ struct in6_addr daddr;
+
+ if (tb[IFLA_ADDRESS]) {
+ if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
+ return -EINVAL;
+ if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
+ return -EADDRNOTAVAIL;
+ }
+
+ if (!data)
+ goto out;
+
+ if (data[IFLA_GRE_REMOTE]) {
+ daddr = nla_get_in6_addr(data[IFLA_GRE_REMOTE]);
+ if (ipv6_addr_any(&daddr))
+ return -EINVAL;
+ }
+
+out:
+ return ip6gre_tunnel_validate(tb, data, extack);
+}
+
+static int ip6erspan_tap_validate(struct nlattr *tb[], struct nlattr *data[],
+ struct netlink_ext_ack *extack)
+{
+ __be16 flags = 0;
+ int ret, ver = 0;
+
+ if (!data)
+ return 0;
+
+ ret = ip6gre_tap_validate(tb, data, extack);
+ if (ret)
+ return ret;
+
+ /* ERSPAN should only have GRE sequence and key flag */
+ if (data[IFLA_GRE_OFLAGS])
+ flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
+ if (data[IFLA_GRE_IFLAGS])
+ flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
+ if (!data[IFLA_GRE_COLLECT_METADATA] &&
+ flags != (GRE_SEQ | GRE_KEY))
+ return -EINVAL;
+
+ /* ERSPAN Session ID only has 10-bit. Since we reuse
+ * 32-bit key field as ID, check it's range.
+ */
+ if (data[IFLA_GRE_IKEY] &&
+ (ntohl(nla_get_be32(data[IFLA_GRE_IKEY])) & ~ID_MASK))
+ return -EINVAL;
+
+ if (data[IFLA_GRE_OKEY] &&
+ (ntohl(nla_get_be32(data[IFLA_GRE_OKEY])) & ~ID_MASK))
+ return -EINVAL;
+
+ if (data[IFLA_GRE_ERSPAN_VER]) {
+ ver = nla_get_u8(data[IFLA_GRE_ERSPAN_VER]);
+ if (ver != 1 && ver != 2)
+ return -EINVAL;
+ }
+
+ if (ver == 1) {
+ if (data[IFLA_GRE_ERSPAN_INDEX]) {
+ u32 index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]);
+
+ if (index & ~INDEX_MASK)
+ return -EINVAL;
+ }
+ } else if (ver == 2) {
+ if (data[IFLA_GRE_ERSPAN_DIR]) {
+ u16 dir = nla_get_u8(data[IFLA_GRE_ERSPAN_DIR]);
+
+ if (dir & ~(DIR_MASK >> DIR_OFFSET))
+ return -EINVAL;
+ }
+
+ if (data[IFLA_GRE_ERSPAN_HWID]) {
+ u16 hwid = nla_get_u16(data[IFLA_GRE_ERSPAN_HWID]);
+
+ if (hwid & ~(HWID_MASK >> HWID_OFFSET))
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static void ip6erspan_set_version(struct nlattr *data[],
+ struct __ip6_tnl_parm *parms)
+{
+ if (!data)
+ return;
+
+ parms->erspan_ver = 1;
+ if (data[IFLA_GRE_ERSPAN_VER])
+ parms->erspan_ver = nla_get_u8(data[IFLA_GRE_ERSPAN_VER]);
+
+ if (parms->erspan_ver == 1) {
+ if (data[IFLA_GRE_ERSPAN_INDEX])
+ parms->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]);
+ } else if (parms->erspan_ver == 2) {
+ if (data[IFLA_GRE_ERSPAN_DIR])
+ parms->dir = nla_get_u8(data[IFLA_GRE_ERSPAN_DIR]);
+ if (data[IFLA_GRE_ERSPAN_HWID])
+ parms->hwid = nla_get_u16(data[IFLA_GRE_ERSPAN_HWID]);
+ }
+}
+
+static void ip6gre_netlink_parms(struct nlattr *data[],
+ struct __ip6_tnl_parm *parms)
+{
+ memset(parms, 0, sizeof(*parms));
+
+ if (!data)
+ return;
+
+ if (data[IFLA_GRE_LINK])
+ parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
+
+ if (data[IFLA_GRE_IFLAGS])
+ gre_flags_to_tnl_flags(parms->i_flags,
+ nla_get_be16(data[IFLA_GRE_IFLAGS]));
+
+ if (data[IFLA_GRE_OFLAGS])
+ gre_flags_to_tnl_flags(parms->o_flags,
+ nla_get_be16(data[IFLA_GRE_OFLAGS]));
+
+ if (data[IFLA_GRE_IKEY])
+ parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
+
+ if (data[IFLA_GRE_OKEY])
+ parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);
+
+ if (data[IFLA_GRE_LOCAL])
+ parms->laddr = nla_get_in6_addr(data[IFLA_GRE_LOCAL]);
+
+ if (data[IFLA_GRE_REMOTE])
+ parms->raddr = nla_get_in6_addr(data[IFLA_GRE_REMOTE]);
+
+ if (data[IFLA_GRE_TTL])
+ parms->hop_limit = nla_get_u8(data[IFLA_GRE_TTL]);
+
+ if (data[IFLA_GRE_ENCAP_LIMIT])
+ parms->encap_limit = nla_get_u8(data[IFLA_GRE_ENCAP_LIMIT]);
+
+ if (data[IFLA_GRE_FLOWINFO])
+ parms->flowinfo = nla_get_be32(data[IFLA_GRE_FLOWINFO]);
+
+ if (data[IFLA_GRE_FLAGS])
+ parms->flags = nla_get_u32(data[IFLA_GRE_FLAGS]);
+
+ if (data[IFLA_GRE_FWMARK])
+ parms->fwmark = nla_get_u32(data[IFLA_GRE_FWMARK]);
+
+ if (data[IFLA_GRE_COLLECT_METADATA])
+ parms->collect_md = true;
+}
+
+static int ip6gre_tap_init(struct net_device *dev)
+{
+ int ret;
+
+ ret = ip6gre_tunnel_init_common(dev);
+ if (ret)
+ return ret;
+
+ dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
+
+ return 0;
+}
+
+static const struct net_device_ops ip6gre_tap_netdev_ops = {
+ .ndo_init = ip6gre_tap_init,
+ .ndo_uninit = ip6gre_tunnel_uninit,
+ .ndo_start_xmit = ip6gre_tunnel_xmit,
+ .ndo_set_mac_address = eth_mac_addr,
+ .ndo_validate_addr = eth_validate_addr,
+ .ndo_change_mtu = ip6_tnl_change_mtu,
+ .ndo_get_iflink = ip6_tnl_get_iflink,
+};
+
+static int ip6erspan_calc_hlen(struct ip6_tnl *tunnel)
+{
+ int t_hlen;
+
+ tunnel->tun_hlen = 8;
+ tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen +
+ erspan_hdr_len(tunnel->parms.erspan_ver);
+
+ t_hlen = tunnel->hlen + sizeof(struct ipv6hdr);
+ tunnel->dev->needed_headroom = LL_MAX_HEADER + t_hlen;
+ return t_hlen;
+}
+
+static int ip6erspan_tap_init(struct net_device *dev)
+{
+ struct ip6_tnl *tunnel;
+ int t_hlen;
+ int ret;
+
+ tunnel = netdev_priv(dev);
+
+ tunnel->dev = dev;
+ strscpy(tunnel->parms.name, dev->name);
+
+ ret = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
+ if (ret)
+ return ret;
+
+ ret = gro_cells_init(&tunnel->gro_cells, dev);
+ if (ret)
+ goto cleanup_dst_cache_init;
+
+ t_hlen = ip6erspan_calc_hlen(tunnel);
+ dev->mtu = ETH_DATA_LEN - t_hlen;
+ if (dev->type == ARPHRD_ETHER)
+ dev->mtu -= ETH_HLEN;
+ if (!(tunnel->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
+ dev->mtu -= 8;
+
+ dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
+ ip6erspan_tnl_link_config(tunnel, 1);
+
+ netdev_hold(dev, &tunnel->dev_tracker, GFP_KERNEL);
+ netdev_lockdep_set_classes(dev);
+ return 0;
+
+cleanup_dst_cache_init:
+ dst_cache_destroy(&tunnel->dst_cache);
+ return ret;
+}
+
+static const struct net_device_ops ip6erspan_netdev_ops = {
+ .ndo_init = ip6erspan_tap_init,
+ .ndo_uninit = ip6erspan_tunnel_uninit,
+ .ndo_start_xmit = ip6erspan_tunnel_xmit,
+ .ndo_set_mac_address = eth_mac_addr,
+ .ndo_validate_addr = eth_validate_addr,
+ .ndo_change_mtu = ip6_tnl_change_mtu,
+ .ndo_get_iflink = ip6_tnl_get_iflink,
+};
+
+static void ip6gre_tap_setup(struct net_device *dev)
+{
+
+ ether_setup(dev);
+
+ dev->max_mtu = 0;
+ dev->netdev_ops = &ip6gre_tap_netdev_ops;
+ dev->needs_free_netdev = true;
+ dev->priv_destructor = ip6gre_dev_free;
+
+ dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS;
+ dev->priv_flags &= ~IFF_TX_SKB_SHARING;
+ dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
+ netif_keep_dst(dev);
+}
+
+static bool ip6gre_netlink_encap_parms(struct nlattr *data[],
+ struct ip_tunnel_encap *ipencap)
+{
+ bool ret = false;
+
+ memset(ipencap, 0, sizeof(*ipencap));
+
+ if (!data)
+ return ret;
+
+ if (data[IFLA_GRE_ENCAP_TYPE]) {
+ ret = true;
+ ipencap->type = nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]);
+ }
+
+ if (data[IFLA_GRE_ENCAP_FLAGS]) {
+ ret = true;
+ ipencap->flags = nla_get_u16(data[IFLA_GRE_ENCAP_FLAGS]);
+ }
+
+ if (data[IFLA_GRE_ENCAP_SPORT]) {
+ ret = true;
+ ipencap->sport = nla_get_be16(data[IFLA_GRE_ENCAP_SPORT]);
+ }
+
+ if (data[IFLA_GRE_ENCAP_DPORT]) {
+ ret = true;
+ ipencap->dport = nla_get_be16(data[IFLA_GRE_ENCAP_DPORT]);
+ }
+
+ return ret;
+}
+
+static int ip6gre_newlink_common(struct net *link_net, struct net_device *dev,
+ struct nlattr *tb[], struct nlattr *data[],
+ struct netlink_ext_ack *extack)
+{
+ struct ip6_tnl *nt;
+ struct ip_tunnel_encap ipencap;
+ int err;
+
+ nt = netdev_priv(dev);
+
+ if (ip6gre_netlink_encap_parms(data, &ipencap)) {
+ int err = ip6_tnl_encap_setup(nt, &ipencap);
+
+ if (err < 0)
+ return err;
+ }
+
+ if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
+ eth_hw_addr_random(dev);
+
+ nt->dev = dev;
+ nt->net = link_net;
+
+ err = register_netdevice(dev);
+ if (err)
+ goto out;
+
+ if (tb[IFLA_MTU])
+ ip6_tnl_change_mtu(dev, nla_get_u32(tb[IFLA_MTU]));
+
+out:
+ return err;
+}
+
+static int ip6gre_newlink(struct net_device *dev,
+ struct rtnl_newlink_params *params,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = params->link_net ? : dev_net(dev);
+ struct ip6_tnl *nt = netdev_priv(dev);
+ struct nlattr **data = params->data;
+ struct nlattr **tb = params->tb;
+ struct ip6gre_net *ign;
+ int err;
+
+ ip6gre_netlink_parms(data, &nt->parms);
+ ign = net_generic(net, ip6gre_net_id);
+
+ if (nt->parms.collect_md) {
+ if (rtnl_dereference(ign->collect_md_tun))
+ return -EEXIST;
+ } else {
+ if (ip6gre_tunnel_find(net, &nt->parms, dev->type))
+ return -EEXIST;
+ }
+
+ err = ip6gre_newlink_common(net, dev, tb, data, extack);
+ if (!err) {
+ ip6gre_tnl_link_config(nt, !tb[IFLA_MTU]);
+ ip6gre_tunnel_link_md(ign, nt);
+ ip6gre_tunnel_link(net_generic(net, ip6gre_net_id), nt);
+ }
+ return err;
+}
+
+static struct ip6_tnl *
+ip6gre_changelink_common(struct net_device *dev, struct nlattr *tb[],
+ struct nlattr *data[], struct __ip6_tnl_parm *p_p,
+ struct netlink_ext_ack *extack)
+{
+ struct ip6_tnl *t, *nt = netdev_priv(dev);
+ struct net *net = nt->net;
+ struct ip6gre_net *ign = net_generic(net, ip6gre_net_id);
+ struct ip_tunnel_encap ipencap;
+
+ if (dev == ign->fb_tunnel_dev)
+ return ERR_PTR(-EINVAL);
+
+ if (ip6gre_netlink_encap_parms(data, &ipencap)) {
+ int err = ip6_tnl_encap_setup(nt, &ipencap);
+
+ if (err < 0)
+ return ERR_PTR(err);
+ }
+
+ ip6gre_netlink_parms(data, p_p);
+
+ t = ip6gre_tunnel_locate(net, p_p, 0);
+
+ if (t) {
+ if (t->dev != dev)
+ return ERR_PTR(-EEXIST);
+ } else {
+ t = nt;
+ }
+
+ return t;
+}
+
+static int ip6gre_changelink(struct net_device *dev, struct nlattr *tb[],
+ struct nlattr *data[],
+ struct netlink_ext_ack *extack)
+{
+ struct ip6_tnl *t = netdev_priv(dev);
+ struct ip6gre_net *ign = net_generic(t->net, ip6gre_net_id);
+ struct __ip6_tnl_parm p;
+
+ t = ip6gre_changelink_common(dev, tb, data, &p, extack);
+ if (IS_ERR(t))
+ return PTR_ERR(t);
+
+ ip6gre_tunnel_unlink_md(ign, t);
+ ip6gre_tunnel_unlink(ign, t);
+ ip6gre_tnl_change(t, &p, !tb[IFLA_MTU]);
+ ip6gre_tunnel_link_md(ign, t);
+ ip6gre_tunnel_link(ign, t);
+ return 0;
+}
+
+static void ip6gre_dellink(struct net_device *dev, struct list_head *head)
+{
+ struct net *net = dev_net(dev);
+ struct ip6gre_net *ign = net_generic(net, ip6gre_net_id);
+
+ if (dev != ign->fb_tunnel_dev)
+ unregister_netdevice_queue(dev, head);
+}
+
+static size_t ip6gre_get_size(const struct net_device *dev)
+{
+ return
+ /* IFLA_GRE_LINK */
+ nla_total_size(4) +
+ /* IFLA_GRE_IFLAGS */
+ nla_total_size(2) +
+ /* IFLA_GRE_OFLAGS */
+ nla_total_size(2) +
+ /* IFLA_GRE_IKEY */
+ nla_total_size(4) +
+ /* IFLA_GRE_OKEY */
+ nla_total_size(4) +
+ /* IFLA_GRE_LOCAL */
+ nla_total_size(sizeof(struct in6_addr)) +
+ /* IFLA_GRE_REMOTE */
+ nla_total_size(sizeof(struct in6_addr)) +
+ /* IFLA_GRE_TTL */
+ nla_total_size(1) +
+ /* IFLA_GRE_ENCAP_LIMIT */
+ nla_total_size(1) +
+ /* IFLA_GRE_FLOWINFO */
+ nla_total_size(4) +
+ /* IFLA_GRE_FLAGS */
+ nla_total_size(4) +
+ /* IFLA_GRE_ENCAP_TYPE */
+ nla_total_size(2) +
+ /* IFLA_GRE_ENCAP_FLAGS */
+ nla_total_size(2) +
+ /* IFLA_GRE_ENCAP_SPORT */
+ nla_total_size(2) +
+ /* IFLA_GRE_ENCAP_DPORT */
+ nla_total_size(2) +
+ /* IFLA_GRE_COLLECT_METADATA */
+ nla_total_size(0) +
+ /* IFLA_GRE_FWMARK */
+ nla_total_size(4) +
+ /* IFLA_GRE_ERSPAN_INDEX */
+ nla_total_size(4) +
+ 0;
+}
+
+static int ip6gre_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{
+ struct ip6_tnl *t = netdev_priv(dev);
+ struct __ip6_tnl_parm *p = &t->parms;
+ IP_TUNNEL_DECLARE_FLAGS(o_flags);
+
+ ip_tunnel_flags_copy(o_flags, p->o_flags);
+
+ if (p->erspan_ver == 1 || p->erspan_ver == 2) {
+ if (!p->collect_md)
+ __set_bit(IP_TUNNEL_KEY_BIT, o_flags);
+
+ if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, p->erspan_ver))
+ goto nla_put_failure;
+
+ if (p->erspan_ver == 1) {
+ if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, p->index))
+ goto nla_put_failure;
+ } else {
+ if (nla_put_u8(skb, IFLA_GRE_ERSPAN_DIR, p->dir))
+ goto nla_put_failure;
+ if (nla_put_u16(skb, IFLA_GRE_ERSPAN_HWID, p->hwid))
+ goto nla_put_failure;
+ }
+ }
+
+ if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
+ nla_put_be16(skb, IFLA_GRE_IFLAGS,
+ gre_tnl_flags_to_gre_flags(p->i_flags)) ||
+ nla_put_be16(skb, IFLA_GRE_OFLAGS,
+ gre_tnl_flags_to_gre_flags(o_flags)) ||
+ nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) ||
+ nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) ||
+ nla_put_in6_addr(skb, IFLA_GRE_LOCAL, &p->laddr) ||
+ nla_put_in6_addr(skb, IFLA_GRE_REMOTE, &p->raddr) ||
+ nla_put_u8(skb, IFLA_GRE_TTL, p->hop_limit) ||
+ nla_put_u8(skb, IFLA_GRE_ENCAP_LIMIT, p->encap_limit) ||
+ nla_put_be32(skb, IFLA_GRE_FLOWINFO, p->flowinfo) ||
+ nla_put_u32(skb, IFLA_GRE_FLAGS, p->flags) ||
+ nla_put_u32(skb, IFLA_GRE_FWMARK, p->fwmark))
+ goto nla_put_failure;
+
+ if (nla_put_u16(skb, IFLA_GRE_ENCAP_TYPE,
+ t->encap.type) ||
+ nla_put_be16(skb, IFLA_GRE_ENCAP_SPORT,
+ t->encap.sport) ||
+ nla_put_be16(skb, IFLA_GRE_ENCAP_DPORT,
+ t->encap.dport) ||
+ nla_put_u16(skb, IFLA_GRE_ENCAP_FLAGS,
+ t->encap.flags))
+ goto nla_put_failure;
+
+ if (p->collect_md) {
+ if (nla_put_flag(skb, IFLA_GRE_COLLECT_METADATA))
+ goto nla_put_failure;
+ }
+
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+static const struct nla_policy ip6gre_policy[IFLA_GRE_MAX + 1] = {
+ [IFLA_GRE_LINK] = { .type = NLA_U32 },
+ [IFLA_GRE_IFLAGS] = { .type = NLA_U16 },
+ [IFLA_GRE_OFLAGS] = { .type = NLA_U16 },
+ [IFLA_GRE_IKEY] = { .type = NLA_U32 },
+ [IFLA_GRE_OKEY] = { .type = NLA_U32 },
+ [IFLA_GRE_LOCAL] = { .len = sizeof_field(struct ipv6hdr, saddr) },
+ [IFLA_GRE_REMOTE] = { .len = sizeof_field(struct ipv6hdr, daddr) },
+ [IFLA_GRE_TTL] = { .type = NLA_U8 },
+ [IFLA_GRE_ENCAP_LIMIT] = { .type = NLA_U8 },
+ [IFLA_GRE_FLOWINFO] = { .type = NLA_U32 },
+ [IFLA_GRE_FLAGS] = { .type = NLA_U32 },
+ [IFLA_GRE_ENCAP_TYPE] = { .type = NLA_U16 },
+ [IFLA_GRE_ENCAP_FLAGS] = { .type = NLA_U16 },
+ [IFLA_GRE_ENCAP_SPORT] = { .type = NLA_U16 },
+ [IFLA_GRE_ENCAP_DPORT] = { .type = NLA_U16 },
+ [IFLA_GRE_COLLECT_METADATA] = { .type = NLA_FLAG },
+ [IFLA_GRE_FWMARK] = { .type = NLA_U32 },
+ [IFLA_GRE_ERSPAN_INDEX] = { .type = NLA_U32 },
+ [IFLA_GRE_ERSPAN_VER] = { .type = NLA_U8 },
+ [IFLA_GRE_ERSPAN_DIR] = { .type = NLA_U8 },
+ [IFLA_GRE_ERSPAN_HWID] = { .type = NLA_U16 },
+};
+
+static void ip6erspan_tap_setup(struct net_device *dev)
+{
+ ether_setup(dev);
+
+ dev->max_mtu = 0;
+ dev->netdev_ops = &ip6erspan_netdev_ops;
+ dev->needs_free_netdev = true;
+ dev->priv_destructor = ip6gre_dev_free;
+
+ dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS;
+ dev->priv_flags &= ~IFF_TX_SKB_SHARING;
+ dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
+ netif_keep_dst(dev);
+}
+
+static int ip6erspan_newlink(struct net_device *dev,
+ struct rtnl_newlink_params *params,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = params->link_net ? : dev_net(dev);
+ struct ip6_tnl *nt = netdev_priv(dev);
+ struct nlattr **data = params->data;
+ struct nlattr **tb = params->tb;
+ struct ip6gre_net *ign;
+ int err;
+
+ ip6gre_netlink_parms(data, &nt->parms);
+ ip6erspan_set_version(data, &nt->parms);
+ ign = net_generic(net, ip6gre_net_id);
+
+ if (nt->parms.collect_md) {
+ if (rtnl_dereference(ign->collect_md_tun_erspan))
+ return -EEXIST;
+ } else {
+ if (ip6gre_tunnel_find(net, &nt->parms, dev->type))
+ return -EEXIST;
+ }
+
+ err = ip6gre_newlink_common(net, dev, tb, data, extack);
+ if (!err) {
+ ip6erspan_tnl_link_config(nt, !tb[IFLA_MTU]);
+ ip6erspan_tunnel_link_md(ign, nt);
+ ip6gre_tunnel_link(net_generic(net, ip6gre_net_id), nt);
+ }
+ return err;
+}
+
+static void ip6erspan_tnl_link_config(struct ip6_tnl *t, int set_mtu)
+{
+ ip6gre_tnl_link_config_common(t);
+ ip6gre_tnl_link_config_route(t, set_mtu, ip6erspan_calc_hlen(t));
+}
+
+static int ip6erspan_tnl_change(struct ip6_tnl *t,
+ const struct __ip6_tnl_parm *p, int set_mtu)
+{
+ ip6gre_tnl_copy_tnl_parm(t, p);
+ ip6erspan_tnl_link_config(t, set_mtu);
+ return 0;
+}
+
+static int ip6erspan_changelink(struct net_device *dev, struct nlattr *tb[],
+ struct nlattr *data[],
+ struct netlink_ext_ack *extack)
+{
+ struct ip6gre_net *ign = net_generic(dev_net(dev), ip6gre_net_id);
+ struct __ip6_tnl_parm p;
+ struct ip6_tnl *t;
+
+ t = ip6gre_changelink_common(dev, tb, data, &p, extack);
+ if (IS_ERR(t))
+ return PTR_ERR(t);
+
+ ip6erspan_set_version(data, &p);
+ ip6gre_tunnel_unlink_md(ign, t);
+ ip6gre_tunnel_unlink(ign, t);
+ ip6erspan_tnl_change(t, &p, !tb[IFLA_MTU]);
+ ip6erspan_tunnel_link_md(ign, t);
+ ip6gre_tunnel_link(ign, t);
+ return 0;
+}
+
+static struct rtnl_link_ops ip6gre_link_ops __read_mostly = {
+ .kind = "ip6gre",
+ .maxtype = IFLA_GRE_MAX,
+ .policy = ip6gre_policy,
+ .priv_size = sizeof(struct ip6_tnl),
+ .setup = ip6gre_tunnel_setup,
+ .validate = ip6gre_tunnel_validate,
+ .newlink = ip6gre_newlink,
+ .changelink = ip6gre_changelink,
+ .dellink = ip6gre_dellink,
+ .get_size = ip6gre_get_size,
+ .fill_info = ip6gre_fill_info,
+ .get_link_net = ip6_tnl_get_link_net,
+};
+
+static struct rtnl_link_ops ip6gre_tap_ops __read_mostly = {
+ .kind = "ip6gretap",
+ .maxtype = IFLA_GRE_MAX,
+ .policy = ip6gre_policy,
+ .priv_size = sizeof(struct ip6_tnl),
+ .setup = ip6gre_tap_setup,
+ .validate = ip6gre_tap_validate,
+ .newlink = ip6gre_newlink,
+ .changelink = ip6gre_changelink,
+ .get_size = ip6gre_get_size,
+ .fill_info = ip6gre_fill_info,
+ .get_link_net = ip6_tnl_get_link_net,
+};
+
+static struct rtnl_link_ops ip6erspan_tap_ops __read_mostly = {
+ .kind = "ip6erspan",
+ .maxtype = IFLA_GRE_MAX,
+ .policy = ip6gre_policy,
+ .priv_size = sizeof(struct ip6_tnl),
+ .setup = ip6erspan_tap_setup,
+ .validate = ip6erspan_tap_validate,
+ .newlink = ip6erspan_newlink,
+ .changelink = ip6erspan_changelink,
+ .get_size = ip6gre_get_size,
+ .fill_info = ip6gre_fill_info,
+ .get_link_net = ip6_tnl_get_link_net,
+};
+
+/*
+ * And now the modules code and kernel interface.
+ */
+
+static int __init ip6gre_init(void)
+{
+ int err;
+
+ pr_info("GRE over IPv6 tunneling driver\n");
+
+ err = register_pernet_device(&ip6gre_net_ops);
+ if (err < 0)
+ return err;
+
+ err = inet6_add_protocol(&ip6gre_protocol, IPPROTO_GRE);
+ if (err < 0) {
+ pr_info("%s: can't add protocol\n", __func__);
+ goto add_proto_failed;
+ }
+
+ err = rtnl_link_register(&ip6gre_link_ops);
+ if (err < 0)
+ goto rtnl_link_failed;
+
+ err = rtnl_link_register(&ip6gre_tap_ops);
+ if (err < 0)
+ goto tap_ops_failed;
+
+ err = rtnl_link_register(&ip6erspan_tap_ops);
+ if (err < 0)
+ goto erspan_link_failed;
+
+out:
+ return err;
+
+erspan_link_failed:
+ rtnl_link_unregister(&ip6gre_tap_ops);
+tap_ops_failed:
+ rtnl_link_unregister(&ip6gre_link_ops);
+rtnl_link_failed:
+ inet6_del_protocol(&ip6gre_protocol, IPPROTO_GRE);
+add_proto_failed:
+ unregister_pernet_device(&ip6gre_net_ops);
+ goto out;
+}
+
+static void __exit ip6gre_fini(void)
+{
+ rtnl_link_unregister(&ip6gre_tap_ops);
+ rtnl_link_unregister(&ip6gre_link_ops);
+ rtnl_link_unregister(&ip6erspan_tap_ops);
+ inet6_del_protocol(&ip6gre_protocol, IPPROTO_GRE);
+ unregister_pernet_device(&ip6gre_net_ops);
+}
+
+module_init(ip6gre_init);
+module_exit(ip6gre_fini);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("D. Kozlov <xeb@mail.ru>");
+MODULE_DESCRIPTION("GRE over IPv6 tunneling device");
+MODULE_ALIAS_RTNL_LINK("ip6gre");
+MODULE_ALIAS_RTNL_LINK("ip6gretap");
+MODULE_ALIAS_RTNL_LINK("ip6erspan");
+MODULE_ALIAS_NETDEV("ip6gre0");
diff --git a/net/ipv6/ip6_icmp.c b/net/ipv6/ip6_icmp.c
new file mode 100644
index 000000000000..233914b63bdb
--- /dev/null
+++ b/net/ipv6/ip6_icmp.c
@@ -0,0 +1,86 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/export.h>
+#include <linux/icmpv6.h>
+#include <linux/mutex.h>
+#include <linux/netdevice.h>
+#include <linux/spinlock.h>
+
+#include <net/ipv6.h>
+
+#if IS_ENABLED(CONFIG_IPV6)
+
+#if !IS_BUILTIN(CONFIG_IPV6)
+
+static ip6_icmp_send_t __rcu *ip6_icmp_send;
+
+int inet6_register_icmp_sender(ip6_icmp_send_t *fn)
+{
+ return (cmpxchg((ip6_icmp_send_t **)&ip6_icmp_send, NULL, fn) == NULL) ?
+ 0 : -EBUSY;
+}
+EXPORT_SYMBOL(inet6_register_icmp_sender);
+
+int inet6_unregister_icmp_sender(ip6_icmp_send_t *fn)
+{
+ int ret;
+
+ ret = (cmpxchg((ip6_icmp_send_t **)&ip6_icmp_send, fn, NULL) == fn) ?
+ 0 : -EINVAL;
+
+ synchronize_net();
+
+ return ret;
+}
+EXPORT_SYMBOL(inet6_unregister_icmp_sender);
+
+void __icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
+ const struct inet6_skb_parm *parm)
+{
+ ip6_icmp_send_t *send;
+
+ rcu_read_lock();
+ send = rcu_dereference(ip6_icmp_send);
+ if (send)
+ send(skb, type, code, info, NULL, parm);
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL(__icmpv6_send);
+#endif
+
+#if IS_ENABLED(CONFIG_NF_NAT)
+#include <net/netfilter/nf_conntrack.h>
+void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info)
+{
+ struct inet6_skb_parm parm = { 0 };
+ struct sk_buff *cloned_skb = NULL;
+ enum ip_conntrack_info ctinfo;
+ enum ip_conntrack_dir dir;
+ struct in6_addr orig_ip;
+ struct nf_conn *ct;
+
+ ct = nf_ct_get(skb_in, &ctinfo);
+ if (!ct || !(READ_ONCE(ct->status) & IPS_NAT_MASK)) {
+ __icmpv6_send(skb_in, type, code, info, &parm);
+ return;
+ }
+
+ if (skb_shared(skb_in))
+ skb_in = cloned_skb = skb_clone(skb_in, GFP_ATOMIC);
+
+ if (unlikely(!skb_in || skb_network_header(skb_in) < skb_in->head ||
+ (skb_network_header(skb_in) + sizeof(struct ipv6hdr)) >
+ skb_tail_pointer(skb_in) || skb_ensure_writable(skb_in,
+ skb_network_offset(skb_in) + sizeof(struct ipv6hdr))))
+ goto out;
+
+ orig_ip = ipv6_hdr(skb_in)->saddr;
+ dir = CTINFO2DIR(ctinfo);
+ ipv6_hdr(skb_in)->saddr = ct->tuplehash[dir].tuple.src.u3.in6;
+ __icmpv6_send(skb_in, type, code, info, &parm);
+ ipv6_hdr(skb_in)->saddr = orig_ip;
+out:
+ consume_skb(cloned_skb);
+}
+EXPORT_SYMBOL(icmpv6_ndo_send);
+#endif
+#endif
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index ad0b8abcdf4b..168ec07e31cc 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -1,41 +1,38 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* IPv6 input
- * Linux INET6 implementation
+ * Linux INET6 implementation
*
* Authors:
* Pedro Roque <roque@di.fc.ul.pt>
* Ian P. Morris <I.P.Morris@soton.ac.uk>
*
- * $Id: ip6_input.c,v 1.19 2000/12/13 18:31:50 davem Exp $
- *
* Based in linux/net/ipv4/ip_input.c
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
/* Changes
*
- * Mitsuru KANDA @USAGI and
- * YOSHIFUJI Hideaki @USAGI: Remove ipv6_parse_exthdrs().
+ * Mitsuru KANDA @USAGI and
+ * YOSHIFUJI Hideaki @USAGI: Remove ipv6_parse_exthdrs().
*/
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/sockios.h>
-#include <linux/sched.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/icmpv6.h>
+#include <linux/mroute6.h>
+#include <linux/slab.h>
+#include <linux/indirect_call_wrapper.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>
#include <net/sock.h>
#include <net/snmp.h>
+#include <net/udp.h>
#include <net/ipv6.h>
#include <net/protocol.h>
@@ -45,38 +42,136 @@
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/xfrm.h>
+#include <net/inet_ecn.h>
+#include <net/dst_metadata.h>
+static void ip6_rcv_finish_core(struct net *net, struct sock *sk,
+ struct sk_buff *skb)
+{
+ if (READ_ONCE(net->ipv4.sysctl_ip_early_demux) &&
+ !skb_dst(skb) && !skb->sk) {
+ switch (ipv6_hdr(skb)->nexthdr) {
+ case IPPROTO_TCP:
+ if (READ_ONCE(net->ipv4.sysctl_tcp_early_demux))
+ tcp_v6_early_demux(skb);
+ break;
+ case IPPROTO_UDP:
+ if (READ_ONCE(net->ipv4.sysctl_udp_early_demux))
+ udp_v6_early_demux(skb);
+ break;
+ }
+ }
+ if (!skb_valid_dst(skb))
+ ip6_route_input(skb);
+}
-inline int ip6_rcv_finish( struct sk_buff *skb)
+int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- if (skb->dst == NULL)
- ip6_route_input(skb);
+ /* if ingress device is enslaved to an L3 master device pass the
+ * skb to its handler for processing
+ */
+ skb = l3mdev_ip6_rcv(skb);
+ if (!skb)
+ return NET_RX_SUCCESS;
+ ip6_rcv_finish_core(net, sk, skb);
return dst_input(skb);
}
-int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
+static void ip6_sublist_rcv_finish(struct list_head *head)
+{
+ struct sk_buff *skb, *next;
+
+ list_for_each_entry_safe(skb, next, head, list) {
+ skb_list_del_init(skb);
+ dst_input(skb);
+ }
+}
+
+static bool ip6_can_use_hint(const struct sk_buff *skb,
+ const struct sk_buff *hint)
{
- struct ipv6hdr *hdr;
- u32 pkt_len;
+ return hint && !skb_dst(skb) &&
+ ipv6_addr_equal(&ipv6_hdr(hint)->daddr, &ipv6_hdr(skb)->daddr);
+}
+
+static struct sk_buff *ip6_extract_route_hint(const struct net *net,
+ struct sk_buff *skb)
+{
+ if (fib6_routes_require_src(net) || fib6_has_custom_rules(net) ||
+ IP6CB(skb)->flags & IP6SKB_MULTIPATH)
+ return NULL;
+
+ return skb;
+}
+
+static void ip6_list_rcv_finish(struct net *net, struct sock *sk,
+ struct list_head *head)
+{
+ struct sk_buff *skb, *next, *hint = NULL;
+ struct dst_entry *curr_dst = NULL;
+ LIST_HEAD(sublist);
+
+ list_for_each_entry_safe(skb, next, head, list) {
+ struct dst_entry *dst;
+
+ skb_list_del_init(skb);
+ /* if ingress device is enslaved to an L3 master device pass the
+ * skb to its handler for processing
+ */
+ skb = l3mdev_ip6_rcv(skb);
+ if (!skb)
+ continue;
+
+ if (ip6_can_use_hint(skb, hint))
+ skb_dst_copy(skb, hint);
+ else
+ ip6_rcv_finish_core(net, sk, skb);
+ dst = skb_dst(skb);
+ if (curr_dst != dst) {
+ hint = ip6_extract_route_hint(net, skb);
+
+ /* dispatch old sublist */
+ if (!list_empty(&sublist))
+ ip6_sublist_rcv_finish(&sublist);
+ /* start new sublist */
+ INIT_LIST_HEAD(&sublist);
+ curr_dst = dst;
+ }
+ list_add_tail(&skb->list, &sublist);
+ }
+ /* dispatch final sublist */
+ ip6_sublist_rcv_finish(&sublist);
+}
+
+static struct sk_buff *ip6_rcv_core(struct sk_buff *skb, struct net_device *dev,
+ struct net *net)
+{
+ enum skb_drop_reason reason;
+ const struct ipv6hdr *hdr;
+ u32 pkt_len;
struct inet6_dev *idev;
if (skb->pkt_type == PACKET_OTHERHOST) {
- kfree_skb(skb);
- return 0;
+ dev_core_stats_rx_otherhost_dropped_inc(skb->dev);
+ kfree_skb_reason(skb, SKB_DROP_REASON_OTHERHOST);
+ return NULL;
}
rcu_read_lock();
idev = __in6_dev_get(skb->dev);
- IP6_INC_STATS_BH(idev, IPSTATS_MIB_INRECEIVES);
+ __IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_IN, skb->len);
- if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) {
- IP6_INC_STATS_BH(idev, IPSTATS_MIB_INDISCARDS);
- rcu_read_unlock();
- goto out;
+ SKB_DR_SET(reason, NOT_SPECIFIED);
+ if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL ||
+ !idev || unlikely(READ_ONCE(idev->cnf.disable_ipv6))) {
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
+ if (idev && unlikely(READ_ONCE(idev->cnf.disable_ipv6)))
+ SKB_DR_SET(reason, IPV6DISABLED);
+ goto drop;
}
memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm));
@@ -87,190 +182,416 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt
*
* BTW, when we send a packet for our own local address on a
* non-loopback interface (e.g. ethX), it is being delivered
- * via the loopback interface (lo) here; skb->dev = &loopback_dev.
+ * via the loopback interface (lo) here; skb->dev = loopback_dev.
* It, however, should be considered as if it is being
* arrived via the sending interface (ethX), because of the
* nature of scoping architecture. --yoshfuji
*/
- IP6CB(skb)->iif = skb->dst ? ip6_dst_idev(skb->dst)->dev->ifindex : dev->ifindex;
+ IP6CB(skb)->iif = skb_valid_dst(skb) ?
+ ip6_dst_idev(skb_dst(skb))->dev->ifindex :
+ dev->ifindex;
if (unlikely(!pskb_may_pull(skb, sizeof(*hdr))))
goto err;
- hdr = skb->nh.ipv6h;
+ hdr = ipv6_hdr(skb);
+
+ if (hdr->version != 6) {
+ SKB_DR_SET(reason, UNHANDLED_PROTO);
+ goto err;
+ }
+
+ __IP6_ADD_STATS(net, idev,
+ IPSTATS_MIB_NOECTPKTS +
+ (ipv6_get_dsfield(hdr) & INET_ECN_MASK),
+ max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs));
+ /*
+ * RFC4291 2.5.3
+ * The loopback address must not be used as the source address in IPv6
+ * packets that are sent outside of a single node. [..]
+ * A packet received on an interface with a destination address
+ * of loopback must be dropped.
+ */
+ if ((ipv6_addr_loopback(&hdr->saddr) ||
+ ipv6_addr_loopback(&hdr->daddr)) &&
+ !(dev->flags & IFF_LOOPBACK) &&
+ !netif_is_l3_master(dev))
+ goto err;
+
+ /* RFC4291 Errata ID: 3480
+ * Interface-Local scope spans only a single interface on a
+ * node and is useful only for loopback transmission of
+ * multicast. Packets with interface-local scope received
+ * from another node must be discarded.
+ */
+ if (!(skb->pkt_type == PACKET_LOOPBACK ||
+ dev->flags & IFF_LOOPBACK) &&
+ ipv6_addr_is_multicast(&hdr->daddr) &&
+ IPV6_ADDR_MC_SCOPE(&hdr->daddr) == 1)
+ goto err;
+
+ /* If enabled, drop unicast packets that were encapsulated in link-layer
+ * multicast or broadcast to protected against the so-called "hole-196"
+ * attack in 802.11 wireless.
+ */
+ if (!ipv6_addr_is_multicast(&hdr->daddr) &&
+ (skb->pkt_type == PACKET_BROADCAST ||
+ skb->pkt_type == PACKET_MULTICAST) &&
+ READ_ONCE(idev->cnf.drop_unicast_in_l2_multicast)) {
+ SKB_DR_SET(reason, UNICAST_IN_L2_MULTICAST);
+ goto err;
+ }
- if (hdr->version != 6)
+ /* RFC4291 2.7
+ * Nodes must not originate a packet to a multicast address whose scope
+ * field contains the reserved value 0; if such a packet is received, it
+ * must be silently dropped.
+ */
+ if (ipv6_addr_is_multicast(&hdr->daddr) &&
+ IPV6_ADDR_MC_SCOPE(&hdr->daddr) == 0)
goto err;
- skb->h.raw = (u8 *)(hdr + 1);
+ /*
+ * RFC4291 2.7
+ * Multicast addresses must not be used as source addresses in IPv6
+ * packets or appear in any Routing header.
+ */
+ if (ipv6_addr_is_multicast(&hdr->saddr))
+ goto err;
+
+ skb->transport_header = skb->network_header + sizeof(*hdr);
IP6CB(skb)->nhoff = offsetof(struct ipv6hdr, nexthdr);
pkt_len = ntohs(hdr->payload_len);
/* pkt_len may be zero if Jumbo payload option is present */
if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) {
- if (pkt_len + sizeof(struct ipv6hdr) > skb->len)
- goto truncated;
- if (pskb_trim_rcsum(skb, pkt_len + sizeof(struct ipv6hdr))) {
- IP6_INC_STATS_BH(idev, IPSTATS_MIB_INHDRERRORS);
+ if (pkt_len + sizeof(struct ipv6hdr) > skb->len) {
+ __IP6_INC_STATS(net,
+ idev, IPSTATS_MIB_INTRUNCATEDPKTS);
+ SKB_DR_SET(reason, PKT_TOO_SMALL);
goto drop;
}
- hdr = skb->nh.ipv6h;
+ if (pskb_trim_rcsum(skb, pkt_len + sizeof(struct ipv6hdr)))
+ goto err;
+ hdr = ipv6_hdr(skb);
}
if (hdr->nexthdr == NEXTHDR_HOP) {
- if (ipv6_parse_hopopts(&skb) < 0) {
- IP6_INC_STATS_BH(idev, IPSTATS_MIB_INHDRERRORS);
+ if (ipv6_parse_hopopts(skb) < 0) {
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
rcu_read_unlock();
- return 0;
+ return NULL;
}
}
rcu_read_unlock();
- return NF_HOOK(PF_INET6,NF_IP6_PRE_ROUTING, skb, dev, NULL, ip6_rcv_finish);
-truncated:
- IP6_INC_STATS_BH(idev, IPSTATS_MIB_INTRUNCATEDPKTS);
+ /* Must drop socket now because of tproxy. */
+ if (!skb_sk_is_prefetched(skb))
+ skb_orphan(skb);
+
+ return skb;
err:
- IP6_INC_STATS_BH(idev, IPSTATS_MIB_INHDRERRORS);
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
+ SKB_DR_OR(reason, IP_INHDR);
drop:
rcu_read_unlock();
- kfree_skb(skb);
-out:
- return 0;
+ kfree_skb_reason(skb, reason);
+ return NULL;
+}
+
+int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
+{
+ struct net *net = dev_net(skb->dev);
+
+ skb = ip6_rcv_core(skb, dev, net);
+ if (skb == NULL)
+ return NET_RX_DROP;
+ return NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING,
+ net, NULL, skb, dev, NULL,
+ ip6_rcv_finish);
}
+static void ip6_sublist_rcv(struct list_head *head, struct net_device *dev,
+ struct net *net)
+{
+ NF_HOOK_LIST(NFPROTO_IPV6, NF_INET_PRE_ROUTING, net, NULL,
+ head, dev, NULL, ip6_rcv_finish);
+ ip6_list_rcv_finish(net, NULL, head);
+}
+
+/* Receive a list of IPv6 packets */
+void ipv6_list_rcv(struct list_head *head, struct packet_type *pt,
+ struct net_device *orig_dev)
+{
+ struct net_device *curr_dev = NULL;
+ struct net *curr_net = NULL;
+ struct sk_buff *skb, *next;
+ LIST_HEAD(sublist);
+
+ list_for_each_entry_safe(skb, next, head, list) {
+ struct net_device *dev = skb->dev;
+ struct net *net = dev_net(dev);
+
+ skb_list_del_init(skb);
+ skb = ip6_rcv_core(skb, dev, net);
+ if (skb == NULL)
+ continue;
+
+ if (curr_dev != dev || curr_net != net) {
+ /* dispatch old sublist */
+ if (!list_empty(&sublist))
+ ip6_sublist_rcv(&sublist, curr_dev, curr_net);
+ /* start new sublist */
+ INIT_LIST_HEAD(&sublist);
+ curr_dev = dev;
+ curr_net = net;
+ }
+ list_add_tail(&skb->list, &sublist);
+ }
+ /* dispatch final sublist */
+ if (!list_empty(&sublist))
+ ip6_sublist_rcv(&sublist, curr_dev, curr_net);
+}
+
+INDIRECT_CALLABLE_DECLARE(int tcp_v6_rcv(struct sk_buff *));
+
/*
* Deliver the packet to the host
*/
-
-
-static inline int ip6_input_finish(struct sk_buff *skb)
+void ip6_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int nexthdr,
+ bool have_final)
{
- struct inet6_protocol *ipprot;
- struct sock *raw_sk;
- unsigned int nhoff;
- int nexthdr;
- u8 hash;
+ const struct inet6_protocol *ipprot;
struct inet6_dev *idev;
+ unsigned int nhoff;
+ SKB_DR(reason);
+ bool raw;
/*
* Parse extension headers
*/
- rcu_read_lock();
resubmit:
- idev = ip6_dst_idev(skb->dst);
- if (!pskb_pull(skb, skb->h.raw - skb->data))
- goto discard;
+ idev = ip6_dst_idev(skb_dst(skb));
nhoff = IP6CB(skb)->nhoff;
- nexthdr = skb->nh.raw[nhoff];
-
- raw_sk = sk_head(&raw_v6_htable[nexthdr & (MAX_INET_PROTOS - 1)]);
- if (raw_sk && !ipv6_raw_deliver(skb, nexthdr))
- raw_sk = NULL;
+ if (!have_final) {
+ if (!pskb_pull(skb, skb_transport_offset(skb)))
+ goto discard;
+ nexthdr = skb_network_header(skb)[nhoff];
+ }
- hash = nexthdr & (MAX_INET_PROTOS - 1);
- if ((ipprot = rcu_dereference(inet6_protos[hash])) != NULL) {
+resubmit_final:
+ raw = raw6_local_deliver(skb, nexthdr);
+ ipprot = rcu_dereference(inet6_protos[nexthdr]);
+ if (ipprot) {
int ret;
-
- if (ipprot->flags & INET6_PROTO_FINAL) {
- struct ipv6hdr *hdr;
-
- /* Free reference early: we don't need it any more,
- and it may hold ip_conntrack module loaded
- indefinitely. */
- nf_reset(skb);
-
- skb_postpull_rcsum(skb, skb->nh.raw,
- skb->h.raw - skb->nh.raw);
- hdr = skb->nh.ipv6h;
+
+ if (have_final) {
+ if (!(ipprot->flags & INET6_PROTO_FINAL)) {
+ /* Once we've seen a final protocol don't
+ * allow encapsulation on any non-final
+ * ones. This allows foo in UDP encapsulation
+ * to work.
+ */
+ goto discard;
+ }
+ } else if (ipprot->flags & INET6_PROTO_FINAL) {
+ const struct ipv6hdr *hdr;
+ int sdif = inet6_sdif(skb);
+ struct net_device *dev;
+
+ /* Only do this once for first final protocol */
+ have_final = true;
+
+
+ skb_postpull_rcsum(skb, skb_network_header(skb),
+ skb_network_header_len(skb));
+ hdr = ipv6_hdr(skb);
+
+ /* skb->dev passed may be master dev for vrfs. */
+ if (sdif) {
+ dev = dev_get_by_index_rcu(net, sdif);
+ if (!dev)
+ goto discard;
+ } else {
+ dev = skb->dev;
+ }
+
if (ipv6_addr_is_multicast(&hdr->daddr) &&
- !ipv6_chk_mcast_addr(skb->dev, &hdr->daddr,
- &hdr->saddr) &&
- !ipv6_is_mld(skb, nexthdr))
+ !ipv6_chk_mcast_addr(dev, &hdr->daddr,
+ &hdr->saddr) &&
+ !ipv6_is_mld(skb, nexthdr, skb_network_header_len(skb))) {
+ SKB_DR_SET(reason, IP_INADDRERRORS);
goto discard;
+ }
+ }
+ if (!(ipprot->flags & INET6_PROTO_NOPOLICY)) {
+ if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+ SKB_DR_SET(reason, XFRM_POLICY);
+ goto discard;
+ }
+ nf_reset_ct(skb);
+ }
+
+ ret = INDIRECT_CALL_2(ipprot->handler, tcp_v6_rcv, udpv6_rcv,
+ skb);
+ if (ret > 0) {
+ if (ipprot->flags & INET6_PROTO_FINAL) {
+ /* Not an extension header, most likely UDP
+ * encapsulation. Use return value as nexthdr
+ * protocol not nhoff (which presumably is
+ * not set by handler).
+ */
+ nexthdr = ret;
+ goto resubmit_final;
+ } else {
+ goto resubmit;
+ }
+ } else if (ret == 0) {
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDELIVERS);
}
- if (!(ipprot->flags & INET6_PROTO_NOPOLICY) &&
- !xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb))
- goto discard;
-
- ret = ipprot->handler(&skb);
- if (ret > 0)
- goto resubmit;
- else if (ret == 0)
- IP6_INC_STATS_BH(idev, IPSTATS_MIB_INDELIVERS);
} else {
- if (!raw_sk) {
+ if (!raw) {
if (xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) {
- IP6_INC_STATS_BH(idev, IPSTATS_MIB_INUNKNOWNPROTOS);
+ __IP6_INC_STATS(net, idev,
+ IPSTATS_MIB_INUNKNOWNPROTOS);
icmpv6_send(skb, ICMPV6_PARAMPROB,
- ICMPV6_UNK_NEXTHDR, nhoff,
- skb->dev);
+ ICMPV6_UNK_NEXTHDR, nhoff);
+ SKB_DR_SET(reason, IP_NOPROTO);
+ } else {
+ SKB_DR_SET(reason, XFRM_POLICY);
}
- } else
- IP6_INC_STATS_BH(idev, IPSTATS_MIB_INDELIVERS);
- kfree_skb(skb);
+ kfree_skb_reason(skb, reason);
+ } else {
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDELIVERS);
+ consume_skb(skb);
+ }
}
- rcu_read_unlock();
- return 0;
+ return;
discard:
- IP6_INC_STATS_BH(idev, IPSTATS_MIB_INDISCARDS);
- rcu_read_unlock();
- kfree_skb(skb);
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
+ kfree_skb_reason(skb, reason);
+}
+
+static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) {
+ __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
+ IPSTATS_MIB_INDISCARDS);
+ kfree_skb_reason(skb, SKB_DROP_REASON_NOMEM);
+ return 0;
+ }
+
+ skb_clear_delivery_time(skb);
+ ip6_protocol_deliver_rcu(net, skb, 0, false);
+
return 0;
}
int ip6_input(struct sk_buff *skb)
{
- return NF_HOOK(PF_INET6,NF_IP6_LOCAL_IN, skb, skb->dev, NULL, ip6_input_finish);
+ int res;
+
+ rcu_read_lock();
+ res = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_IN,
+ dev_net_rcu(skb->dev), NULL, skb, skb->dev, NULL,
+ ip6_input_finish);
+ rcu_read_unlock();
+
+ return res;
}
+EXPORT_SYMBOL_GPL(ip6_input);
int ip6_mc_input(struct sk_buff *skb)
{
- struct ipv6hdr *hdr;
- int deliver;
-
- IP6_INC_STATS_BH(ip6_dst_idev(skb->dst), IPSTATS_MIB_INMCASTPKTS);
+ struct net_device *dev = skb->dev;
+ int sdif = inet6_sdif(skb);
+ const struct ipv6hdr *hdr;
+ bool deliver;
+
+ __IP6_UPD_PO_STATS(skb_dst_dev_net_rcu(skb),
+ __in6_dev_get_safely(dev), IPSTATS_MIB_INMCAST,
+ skb->len);
+
+ /* skb->dev passed may be master dev for vrfs. */
+ if (sdif) {
+ dev = dev_get_by_index_rcu(dev_net_rcu(dev), sdif);
+ if (!dev) {
+ kfree_skb(skb);
+ return -ENODEV;
+ }
+ }
- hdr = skb->nh.ipv6h;
- deliver = likely(!(skb->dev->flags & (IFF_PROMISC|IFF_ALLMULTI))) ||
- ipv6_chk_mcast_addr(skb->dev, &hdr->daddr, NULL);
+ hdr = ipv6_hdr(skb);
+ deliver = ipv6_chk_mcast_addr(dev, &hdr->daddr, NULL);
+#ifdef CONFIG_IPV6_MROUTE
/*
- * IPv6 multicast router mode isnt currently supported.
+ * IPv6 multicast router mode is now supported ;)
*/
-#if 0
- if (ipv6_config.multicast_route) {
- int addr_type;
-
- addr_type = ipv6_addr_type(&hdr->daddr);
-
- if (!(addr_type & (IPV6_ADDR_LOOPBACK | IPV6_ADDR_LINKLOCAL))) {
- struct sk_buff *skb2;
- struct dst_entry *dst;
-
- dst = skb->dst;
-
- if (deliver) {
- skb2 = skb_clone(skb, GFP_ATOMIC);
- dst_output(skb2);
- } else {
- dst_output(skb);
- return 0;
+ if (atomic_read(&dev_net_rcu(skb->dev)->ipv6.devconf_all->mc_forwarding) &&
+ !(ipv6_addr_type(&hdr->daddr) &
+ (IPV6_ADDR_LOOPBACK|IPV6_ADDR_LINKLOCAL)) &&
+ likely(!(IP6CB(skb)->flags & IP6SKB_FORWARDED))) {
+ /*
+ * Okay, we try to forward - split and duplicate
+ * packets.
+ */
+ struct sk_buff *skb2;
+ struct inet6_skb_parm *opt = IP6CB(skb);
+
+ /* Check for MLD */
+ if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
+ /* Check if this is a mld message */
+ u8 nexthdr = hdr->nexthdr;
+ __be16 frag_off;
+ int offset;
+
+ /* Check if the value of Router Alert
+ * is for MLD (0x0000).
+ */
+ if (opt->ra == htons(IPV6_OPT_ROUTERALERT_MLD)) {
+ deliver = false;
+
+ if (!ipv6_ext_hdr(nexthdr)) {
+ /* BUG */
+ goto out;
+ }
+ offset = ipv6_skip_exthdr(skb, sizeof(*hdr),
+ &nexthdr, &frag_off);
+ if (offset < 0)
+ goto out;
+
+ if (ipv6_is_mld(skb, nexthdr, offset))
+ deliver = true;
+
+ goto out;
}
+ /* unknown RA - process it normally */
}
+
+ if (deliver) {
+ skb2 = skb_clone(skb, GFP_ATOMIC);
+ } else {
+ skb2 = skb;
+ skb = NULL;
+ }
+
+ if (skb2)
+ ip6_mr_input(skb2);
}
+out:
#endif
-
if (likely(deliver)) {
ip6_input(skb);
- return 0;
+ } else {
+ /* discard */
+ kfree_skb(skb);
}
- /* discard */
- kfree_skb(skb);
return 0;
}
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
new file mode 100644
index 000000000000..fce91183797a
--- /dev/null
+++ b/net/ipv6/ip6_offload.c
@@ -0,0 +1,489 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * IPV6 GSO/GRO offload support
+ * Linux INET6 implementation
+ */
+
+#include <linux/kernel.h>
+#include <linux/socket.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/printk.h>
+
+#include <net/protocol.h>
+#include <net/ipv6.h>
+#include <net/inet_common.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/gro.h>
+#include <net/gso.h>
+
+#include "ip6_offload.h"
+
+/* All GRO functions are always builtin, except UDP over ipv6, which lays in
+ * ipv6 module, as it depends on UDPv6 lookup function, so we need special care
+ * when ipv6 is built as a module
+ */
+#if IS_BUILTIN(CONFIG_IPV6)
+#define INDIRECT_CALL_L4(f, f2, f1, ...) INDIRECT_CALL_2(f, f2, f1, __VA_ARGS__)
+#else
+#define INDIRECT_CALL_L4(f, f2, f1, ...) INDIRECT_CALL_1(f, f2, __VA_ARGS__)
+#endif
+
+#define indirect_call_gro_receive_l4(f2, f1, cb, head, skb) \
+({ \
+ unlikely(gro_recursion_inc_test(skb)) ? \
+ NAPI_GRO_CB(skb)->flush |= 1, NULL : \
+ INDIRECT_CALL_L4(cb, f2, f1, head, skb); \
+})
+
+static int ipv6_gro_pull_exthdrs(struct sk_buff *skb, int off, int proto)
+{
+ const struct net_offload *ops = NULL;
+ struct ipv6_opt_hdr *opth;
+
+ for (;;) {
+ int len;
+
+ ops = rcu_dereference(inet6_offloads[proto]);
+
+ if (unlikely(!ops))
+ break;
+
+ if (!(ops->flags & INET6_PROTO_GSO_EXTHDR))
+ break;
+
+ opth = skb_gro_header(skb, off + sizeof(*opth), off);
+ if (unlikely(!opth))
+ break;
+
+ len = ipv6_optlen(opth);
+
+ opth = skb_gro_header(skb, off + len, off);
+ if (unlikely(!opth))
+ break;
+ proto = opth->nexthdr;
+
+ off += len;
+ }
+
+ skb_gro_pull(skb, off - skb_gro_receive_network_offset(skb));
+ return proto;
+}
+
+static int ipv6_gso_pull_exthdrs(struct sk_buff *skb, int proto)
+{
+ const struct net_offload *ops = NULL;
+
+ for (;;) {
+ struct ipv6_opt_hdr *opth;
+ int len;
+
+ ops = rcu_dereference(inet6_offloads[proto]);
+
+ if (unlikely(!ops))
+ break;
+
+ if (!(ops->flags & INET6_PROTO_GSO_EXTHDR))
+ break;
+
+ if (unlikely(!pskb_may_pull(skb, 8)))
+ break;
+
+ opth = (void *)skb->data;
+ len = ipv6_optlen(opth);
+
+ if (unlikely(!pskb_may_pull(skb, len)))
+ break;
+
+ opth = (void *)skb->data;
+ proto = opth->nexthdr;
+ __skb_pull(skb, len);
+ }
+
+ return proto;
+}
+
+static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ struct sk_buff *segs = ERR_PTR(-EINVAL);
+ struct ipv6hdr *ipv6h;
+ const struct net_offload *ops;
+ int proto, err;
+ struct frag_hdr *fptr;
+ unsigned int payload_len;
+ u8 *prevhdr;
+ int offset = 0;
+ bool encap, udpfrag;
+ int nhoff;
+ bool gso_partial;
+
+ skb_reset_network_header(skb);
+ err = ipv6_hopopt_jumbo_remove(skb);
+ if (err)
+ return ERR_PTR(err);
+ nhoff = skb_network_header(skb) - skb_mac_header(skb);
+ if (unlikely(!pskb_may_pull(skb, sizeof(*ipv6h))))
+ goto out;
+
+ encap = SKB_GSO_CB(skb)->encap_level > 0;
+ if (encap)
+ features &= skb->dev->hw_enc_features;
+ SKB_GSO_CB(skb)->encap_level += sizeof(*ipv6h);
+
+ ipv6h = ipv6_hdr(skb);
+ __skb_pull(skb, sizeof(*ipv6h));
+ segs = ERR_PTR(-EPROTONOSUPPORT);
+
+ proto = ipv6_gso_pull_exthdrs(skb, ipv6h->nexthdr);
+
+ if (skb->encapsulation &&
+ skb_shinfo(skb)->gso_type & (SKB_GSO_IPXIP4 | SKB_GSO_IPXIP6))
+ udpfrag = proto == IPPROTO_UDP && encap &&
+ (skb_shinfo(skb)->gso_type & SKB_GSO_UDP);
+ else
+ udpfrag = proto == IPPROTO_UDP && !skb->encapsulation &&
+ (skb_shinfo(skb)->gso_type & SKB_GSO_UDP);
+
+ ops = rcu_dereference(inet6_offloads[proto]);
+ if (likely(ops && ops->callbacks.gso_segment)) {
+ if (!skb_reset_transport_header_careful(skb))
+ goto out;
+
+ segs = ops->callbacks.gso_segment(skb, features);
+ if (!segs)
+ skb->network_header = skb_mac_header(skb) + nhoff - skb->head;
+ }
+
+ if (IS_ERR_OR_NULL(segs))
+ goto out;
+
+ gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL);
+
+ for (skb = segs; skb; skb = skb->next) {
+ ipv6h = (struct ipv6hdr *)(skb_mac_header(skb) + nhoff);
+ if (gso_partial && skb_is_gso(skb))
+ payload_len = skb_shinfo(skb)->gso_size +
+ SKB_GSO_CB(skb)->data_offset +
+ skb->head - (unsigned char *)(ipv6h + 1);
+ else
+ payload_len = skb->len - nhoff - sizeof(*ipv6h);
+ ipv6h->payload_len = htons(payload_len);
+ skb->network_header = (u8 *)ipv6h - skb->head;
+ skb_reset_mac_len(skb);
+
+ if (udpfrag) {
+ int err = ip6_find_1stfragopt(skb, &prevhdr);
+ if (err < 0) {
+ kfree_skb_list(segs);
+ return ERR_PTR(err);
+ }
+ fptr = (struct frag_hdr *)((u8 *)ipv6h + err);
+ fptr->frag_off = htons(offset);
+ if (skb->next)
+ fptr->frag_off |= htons(IP6_MF);
+ offset += (ntohs(ipv6h->payload_len) -
+ sizeof(struct frag_hdr));
+ }
+ if (encap)
+ skb_reset_inner_headers(skb);
+ }
+
+out:
+ return segs;
+}
+
+/* Return the total length of all the extension hdrs, following the same
+ * logic in ipv6_gso_pull_exthdrs() when parsing ext-hdrs.
+ */
+static int ipv6_exthdrs_len(struct ipv6hdr *iph,
+ const struct net_offload **opps)
+{
+ struct ipv6_opt_hdr *opth = (void *)iph;
+ int len = 0, proto, optlen = sizeof(*iph);
+
+ proto = iph->nexthdr;
+ for (;;) {
+ *opps = rcu_dereference(inet6_offloads[proto]);
+ if (unlikely(!(*opps)))
+ break;
+ if (!((*opps)->flags & INET6_PROTO_GSO_EXTHDR))
+ break;
+
+ opth = (void *)opth + optlen;
+ optlen = ipv6_optlen(opth);
+ len += optlen;
+ proto = opth->nexthdr;
+ }
+ return len;
+}
+
+INDIRECT_CALLABLE_SCOPE struct sk_buff *ipv6_gro_receive(struct list_head *head,
+ struct sk_buff *skb)
+{
+ const struct net_offload *ops;
+ struct sk_buff *pp = NULL;
+ struct sk_buff *p;
+ struct ipv6hdr *iph;
+ unsigned int nlen;
+ unsigned int hlen;
+ unsigned int off;
+ u16 flush = 1;
+ int proto;
+
+ off = skb_gro_offset(skb);
+ hlen = off + sizeof(*iph);
+ iph = skb_gro_header(skb, hlen, off);
+ if (unlikely(!iph))
+ goto out;
+
+ NAPI_GRO_CB(skb)->network_offsets[NAPI_GRO_CB(skb)->encap_mark] = off;
+
+ flush += ntohs(iph->payload_len) != skb->len - hlen;
+
+ proto = iph->nexthdr;
+ ops = rcu_dereference(inet6_offloads[proto]);
+ if (!ops || !ops->callbacks.gro_receive) {
+ proto = ipv6_gro_pull_exthdrs(skb, hlen, proto);
+
+ ops = rcu_dereference(inet6_offloads[proto]);
+ if (!ops || !ops->callbacks.gro_receive)
+ goto out;
+
+ iph = skb_gro_network_header(skb);
+ } else {
+ skb_gro_pull(skb, sizeof(*iph));
+ }
+
+ skb_set_transport_header(skb, skb_gro_offset(skb));
+
+ NAPI_GRO_CB(skb)->proto = proto;
+
+ flush--;
+ nlen = skb_gro_offset(skb) - off;
+
+ list_for_each_entry(p, head, list) {
+ const struct ipv6hdr *iph2;
+ __be32 first_word; /* <Version:4><Traffic_Class:8><Flow_Label:20> */
+
+ if (!NAPI_GRO_CB(p)->same_flow)
+ continue;
+
+ iph2 = (struct ipv6hdr *)(p->data + off);
+ first_word = *(__be32 *)iph ^ *(__be32 *)iph2;
+
+ /* All fields must match except length and Traffic Class.
+ * XXX skbs on the gro_list have all been parsed and pulled
+ * already so we don't need to compare nlen
+ * (nlen != (sizeof(*iph2) + ipv6_exthdrs_len(iph2, &ops)))
+ * memcmp() alone below is sufficient, right?
+ */
+ if ((first_word & htonl(0xF00FFFFF)) ||
+ !ipv6_addr_equal(&iph->saddr, &iph2->saddr) ||
+ !ipv6_addr_equal(&iph->daddr, &iph2->daddr) ||
+ iph->nexthdr != iph2->nexthdr) {
+not_same_flow:
+ NAPI_GRO_CB(p)->same_flow = 0;
+ continue;
+ }
+ if (unlikely(nlen > sizeof(struct ipv6hdr))) {
+ if (memcmp(iph + 1, iph2 + 1,
+ nlen - sizeof(struct ipv6hdr)))
+ goto not_same_flow;
+ }
+ }
+
+ NAPI_GRO_CB(skb)->flush |= flush;
+
+ skb_gro_postpull_rcsum(skb, iph, nlen);
+
+ pp = indirect_call_gro_receive_l4(tcp6_gro_receive, udp6_gro_receive,
+ ops->callbacks.gro_receive, head, skb);
+
+out:
+ skb_gro_flush_final(skb, pp, flush);
+
+ return pp;
+}
+
+static struct sk_buff *sit_ip6ip6_gro_receive(struct list_head *head,
+ struct sk_buff *skb)
+{
+ /* Common GRO receive for SIT and IP6IP6 */
+
+ if (NAPI_GRO_CB(skb)->encap_mark) {
+ NAPI_GRO_CB(skb)->flush = 1;
+ return NULL;
+ }
+
+ NAPI_GRO_CB(skb)->encap_mark = 1;
+
+ return ipv6_gro_receive(head, skb);
+}
+
+static struct sk_buff *ip4ip6_gro_receive(struct list_head *head,
+ struct sk_buff *skb)
+{
+ /* Common GRO receive for SIT and IP6IP6 */
+
+ if (NAPI_GRO_CB(skb)->encap_mark) {
+ NAPI_GRO_CB(skb)->flush = 1;
+ return NULL;
+ }
+
+ NAPI_GRO_CB(skb)->encap_mark = 1;
+
+ return inet_gro_receive(head, skb);
+}
+
+INDIRECT_CALLABLE_SCOPE int ipv6_gro_complete(struct sk_buff *skb, int nhoff)
+{
+ const struct net_offload *ops;
+ struct ipv6hdr *iph;
+ int err = -ENOSYS;
+ u32 payload_len;
+
+ if (skb->encapsulation) {
+ skb_set_inner_protocol(skb, cpu_to_be16(ETH_P_IPV6));
+ skb_set_inner_network_header(skb, nhoff);
+ }
+
+ payload_len = skb->len - nhoff - sizeof(*iph);
+ if (unlikely(payload_len > IPV6_MAXPLEN)) {
+ struct hop_jumbo_hdr *hop_jumbo;
+ int hoplen = sizeof(*hop_jumbo);
+
+ /* Move network header left */
+ memmove(skb_mac_header(skb) - hoplen, skb_mac_header(skb),
+ skb->transport_header - skb->mac_header);
+ skb->data -= hoplen;
+ skb->len += hoplen;
+ skb->mac_header -= hoplen;
+ skb->network_header -= hoplen;
+ iph = (struct ipv6hdr *)(skb->data + nhoff);
+ hop_jumbo = (struct hop_jumbo_hdr *)(iph + 1);
+
+ /* Build hop-by-hop options */
+ hop_jumbo->nexthdr = iph->nexthdr;
+ hop_jumbo->hdrlen = 0;
+ hop_jumbo->tlv_type = IPV6_TLV_JUMBO;
+ hop_jumbo->tlv_len = 4;
+ hop_jumbo->jumbo_payload_len = htonl(payload_len + hoplen);
+
+ iph->nexthdr = NEXTHDR_HOP;
+ iph->payload_len = 0;
+ } else {
+ iph = (struct ipv6hdr *)(skb->data + nhoff);
+ iph->payload_len = htons(payload_len);
+ }
+
+ nhoff += sizeof(*iph) + ipv6_exthdrs_len(iph, &ops);
+ if (WARN_ON(!ops || !ops->callbacks.gro_complete))
+ goto out;
+
+ err = INDIRECT_CALL_L4(ops->callbacks.gro_complete, tcp6_gro_complete,
+ udp6_gro_complete, skb, nhoff);
+
+out:
+ return err;
+}
+
+static int sit_gro_complete(struct sk_buff *skb, int nhoff)
+{
+ skb->encapsulation = 1;
+ skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP4;
+ return ipv6_gro_complete(skb, nhoff);
+}
+
+static int ip6ip6_gro_complete(struct sk_buff *skb, int nhoff)
+{
+ skb->encapsulation = 1;
+ skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP6;
+ return ipv6_gro_complete(skb, nhoff);
+}
+
+static int ip4ip6_gro_complete(struct sk_buff *skb, int nhoff)
+{
+ skb->encapsulation = 1;
+ skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP6;
+ return inet_gro_complete(skb, nhoff);
+}
+
+
+static struct sk_buff *sit_gso_segment(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ if (!(skb_shinfo(skb)->gso_type & SKB_GSO_IPXIP4))
+ return ERR_PTR(-EINVAL);
+
+ return ipv6_gso_segment(skb, features);
+}
+
+static struct sk_buff *ip4ip6_gso_segment(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ if (!(skb_shinfo(skb)->gso_type & SKB_GSO_IPXIP6))
+ return ERR_PTR(-EINVAL);
+
+ return inet_gso_segment(skb, features);
+}
+
+static struct sk_buff *ip6ip6_gso_segment(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ if (!(skb_shinfo(skb)->gso_type & SKB_GSO_IPXIP6))
+ return ERR_PTR(-EINVAL);
+
+ return ipv6_gso_segment(skb, features);
+}
+
+static const struct net_offload sit_offload = {
+ .callbacks = {
+ .gso_segment = sit_gso_segment,
+ .gro_receive = sit_ip6ip6_gro_receive,
+ .gro_complete = sit_gro_complete,
+ },
+};
+
+static const struct net_offload ip4ip6_offload = {
+ .callbacks = {
+ .gso_segment = ip4ip6_gso_segment,
+ .gro_receive = ip4ip6_gro_receive,
+ .gro_complete = ip4ip6_gro_complete,
+ },
+};
+
+static const struct net_offload ip6ip6_offload = {
+ .callbacks = {
+ .gso_segment = ip6ip6_gso_segment,
+ .gro_receive = sit_ip6ip6_gro_receive,
+ .gro_complete = ip6ip6_gro_complete,
+ },
+};
+static int __init ipv6_offload_init(void)
+{
+
+ if (tcpv6_offload_init() < 0)
+ pr_crit("%s: Cannot add TCP protocol offload\n", __func__);
+ if (ipv6_exthdrs_offload_init() < 0)
+ pr_crit("%s: Cannot add EXTHDRS protocol offload\n", __func__);
+
+ net_hotdata.ipv6_packet_offload = (struct packet_offload) {
+ .type = cpu_to_be16(ETH_P_IPV6),
+ .callbacks = {
+ .gso_segment = ipv6_gso_segment,
+ .gro_receive = ipv6_gro_receive,
+ .gro_complete = ipv6_gro_complete,
+ },
+ };
+ dev_add_offload(&net_hotdata.ipv6_packet_offload);
+
+ inet_add_offload(&sit_offload, IPPROTO_IPV6);
+ inet6_add_offload(&ip6ip6_offload, IPPROTO_IPV6);
+ inet6_add_offload(&ip4ip6_offload, IPPROTO_IPIP);
+
+ return 0;
+}
+
+fs_initcall(ipv6_offload_init);
diff --git a/net/ipv6/ip6_offload.h b/net/ipv6/ip6_offload.h
new file mode 100644
index 000000000000..e768987604f1
--- /dev/null
+++ b/net/ipv6/ip6_offload.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * IPV6 GSO/GRO offload support
+ * Linux INET6 implementation
+ */
+
+#ifndef __ip6_offload_h
+#define __ip6_offload_h
+
+int ipv6_exthdrs_offload_init(void);
+int udpv6_offload_init(void);
+int udpv6_offload_exit(void);
+int tcpv6_offload_init(void);
+
+#endif
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index e05ecbb1412d..f904739e99b9 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1,19 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* IPv6 output functions
- * Linux INET6 implementation
+ * Linux INET6 implementation
*
* Authors:
- * Pedro Roque <roque@di.fc.ul.pt>
- *
- * $Id: ip6_output.c,v 1.34 2002/02/01 22:01:04 davem Exp $
+ * Pedro Roque <roque@di.fc.ul.pt>
*
* Based on linux/net/ipv4/ip_output.c
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Changes:
* A.N.Kuznetsov : airthmetics in fragmentation.
* extension headers are implemented.
@@ -22,14 +16,14 @@
* etc.
*
* H. von Brand : Added missing #include <linux/string.h>
- * Imran Patel : frag id should be in NBO
+ * Imran Patel : frag id should be in NBO
* Kazunori MIYAZAWA @USAGI
* : add ip6_append_data and related functions
* for datagram xmit
*/
#include <linux/errno.h>
-#include <linux/types.h>
+#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
@@ -39,13 +33,16 @@
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/bpf-cgroup.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>
#include <net/sock.h>
#include <net/snmp.h>
+#include <net/gso.h>
#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
@@ -55,233 +52,342 @@
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
+#include <linux/mroute6.h>
+#include <net/l3mdev.h>
+#include <net/lwtunnel.h>
+#include <net/ip_tunnels.h>
-static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
-
-static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
+static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- static u32 ipv6_fragmentation_id = 1;
- static DEFINE_SPINLOCK(ip6_id_lock);
-
- spin_lock_bh(&ip6_id_lock);
- fhdr->identification = htonl(ipv6_fragmentation_id);
- if (++ipv6_fragmentation_id == 0)
- ipv6_fragmentation_id = 1;
- spin_unlock_bh(&ip6_id_lock);
+ struct dst_entry *dst = skb_dst(skb);
+ struct net_device *dev = dst_dev_rcu(dst);
+ struct inet6_dev *idev = ip6_dst_idev(dst);
+ unsigned int hh_len = LL_RESERVED_SPACE(dev);
+ const struct in6_addr *daddr, *nexthop;
+ struct ipv6hdr *hdr;
+ struct neighbour *neigh;
+ int ret;
+
+ /* Be paranoid, rather than too clever. */
+ if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
+ /* idev stays alive because we hold rcu_read_lock(). */
+ skb = skb_expand_head(skb, hh_len);
+ if (!skb) {
+ IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
+ return -ENOMEM;
+ }
+ }
+
+ hdr = ipv6_hdr(skb);
+ daddr = &hdr->daddr;
+ if (ipv6_addr_is_multicast(daddr)) {
+ if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
+ ((mroute6_is_socket(net, skb) &&
+ !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
+ ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
+ struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
+
+ /* Do not check for IFF_ALLMULTI; multicast routing
+ is not supported in any case.
+ */
+ if (newskb)
+ NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
+ net, sk, newskb, NULL, newskb->dev,
+ dev_loopback_xmit);
+
+ if (hdr->hop_limit == 0) {
+ IP6_INC_STATS(net, idev,
+ IPSTATS_MIB_OUTDISCARDS);
+ kfree_skb(skb);
+ return 0;
+ }
+ }
+
+ IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
+ if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL &&
+ !(dev->flags & IFF_LOOPBACK)) {
+ kfree_skb(skb);
+ return 0;
+ }
+ }
+
+ if (lwtunnel_xmit_redirect(dst->lwtstate)) {
+ int res = lwtunnel_xmit(skb);
+
+ if (res != LWTUNNEL_XMIT_CONTINUE)
+ return res;
+ }
+
+ IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);
+
+ nexthop = rt6_nexthop(dst_rt6_info(dst), daddr);
+ neigh = __ipv6_neigh_lookup_noref(dev, nexthop);
+
+ if (IS_ERR_OR_NULL(neigh)) {
+ if (unlikely(!neigh))
+ neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
+ if (IS_ERR(neigh)) {
+ IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
+ kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
+ return -EINVAL;
+ }
+ }
+ sock_confirm_neigh(skb, neigh);
+ ret = neigh_output(neigh, skb, false);
+ return ret;
}
-static inline int ip6_output_finish(struct sk_buff *skb)
+static int
+ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
+ struct sk_buff *skb, unsigned int mtu)
{
+ struct sk_buff *segs, *nskb;
+ netdev_features_t features;
+ int ret = 0;
- struct dst_entry *dst = skb->dst;
- struct hh_cache *hh = dst->hh;
+ /* Please see corresponding comment in ip_finish_output_gso
+ * describing the cases where GSO segment length exceeds the
+ * egress MTU.
+ */
+ features = netif_skb_features(skb);
+ segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
+ if (IS_ERR_OR_NULL(segs)) {
+ kfree_skb(skb);
+ return -ENOMEM;
+ }
- if (hh) {
- int hh_alen;
+ consume_skb(skb);
- read_lock_bh(&hh->hh_lock);
- hh_alen = HH_DATA_ALIGN(hh->hh_len);
- memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
- read_unlock_bh(&hh->hh_lock);
- skb_push(skb, hh->hh_len);
- return hh->hh_output(skb);
- } else if (dst->neighbour)
- return dst->neighbour->output(skb);
+ skb_list_walk_safe(segs, segs, nskb) {
+ int err;
- IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
- kfree_skb(skb);
- return -EINVAL;
+ skb_mark_not_on_list(segs);
+ /* Last GSO segment can be smaller than gso_size (and MTU).
+ * Adding a fragment header would produce an "atomic fragment",
+ * which is considered harmful (RFC-8021). Avoid that.
+ */
+ err = segs->len > mtu ?
+ ip6_fragment(net, sk, segs, ip6_finish_output2) :
+ ip6_finish_output2(net, sk, segs);
+ if (err && ret == 0)
+ ret = err;
+ }
+ return ret;
}
-/* dev_loopback_xmit for use with netfilter. */
-static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
+static int ip6_finish_output_gso(struct net *net, struct sock *sk,
+ struct sk_buff *skb, unsigned int mtu)
{
- newskb->mac.raw = newskb->data;
- __skb_pull(newskb, newskb->nh.raw - newskb->data);
- newskb->pkt_type = PACKET_LOOPBACK;
- newskb->ip_summed = CHECKSUM_UNNECESSARY;
- BUG_TRAP(newskb->dst);
+ if (!(IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) &&
+ !skb_gso_validate_network_len(skb, mtu))
+ return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
- netif_rx(newskb);
- return 0;
+ return ip6_finish_output2(net, sk, skb);
}
-
-static int ip6_output2(struct sk_buff *skb)
+static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- struct dst_entry *dst = skb->dst;
- struct net_device *dev = dst->dev;
+ unsigned int mtu;
- skb->protocol = htons(ETH_P_IPV6);
- skb->dev = dev;
+#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
+ /* Policy lookup after SNAT yielded a new policy */
+ if (skb_dst(skb)->xfrm) {
+ IP6CB(skb)->flags |= IP6SKB_REROUTED;
+ return dst_output(net, sk, skb);
+ }
+#endif
- if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr)) {
- struct ipv6_pinfo* np = skb->sk ? inet6_sk(skb->sk) : NULL;
- struct inet6_dev *idev = ip6_dst_idev(skb->dst);
+ mtu = ip6_skb_dst_mtu(skb);
+ if (skb_is_gso(skb))
+ return ip6_finish_output_gso(net, sk, skb, mtu);
- if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
- ipv6_chk_mcast_addr(dev, &skb->nh.ipv6h->daddr,
- &skb->nh.ipv6h->saddr)) {
- struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
+ if (skb->len > mtu ||
+ (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
+ return ip6_fragment(net, sk, skb, ip6_finish_output2);
- /* Do not check for IFF_ALLMULTI; multicast routing
- is not supported in any case.
- */
- if (newskb)
- NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, newskb, NULL,
- newskb->dev,
- ip6_dev_loopback_xmit);
+ return ip6_finish_output2(net, sk, skb);
+}
- if (skb->nh.ipv6h->hop_limit == 0) {
- IP6_INC_STATS(idev, IPSTATS_MIB_OUTDISCARDS);
- kfree_skb(skb);
- return 0;
- }
- }
+static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ int ret;
+
+ ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
+ switch (ret) {
+ case NET_XMIT_SUCCESS:
+ case NET_XMIT_CN:
+ return __ip6_finish_output(net, sk, skb) ? : ret;
+ default:
+ kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
+ return ret;
+ }
+}
+
+int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ struct net_device *dev, *indev = skb->dev;
+ struct inet6_dev *idev;
+ int ret;
+
+ skb->protocol = htons(ETH_P_IPV6);
+ rcu_read_lock();
+ dev = dst_dev_rcu(dst);
+ idev = ip6_dst_idev(dst);
+ skb->dev = dev;
- IP6_INC_STATS(idev, IPSTATS_MIB_OUTMCASTPKTS);
+ if (unlikely(!idev || READ_ONCE(idev->cnf.disable_ipv6))) {
+ IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
+ rcu_read_unlock();
+ kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED);
+ return 0;
}
- return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb,NULL, skb->dev,ip6_output_finish);
+ ret = NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
+ net, sk, skb, indev, dev,
+ ip6_finish_output,
+ !(IP6CB(skb)->flags & IP6SKB_REROUTED));
+ rcu_read_unlock();
+ return ret;
}
+EXPORT_SYMBOL(ip6_output);
-int ip6_output(struct sk_buff *skb)
+bool ip6_autoflowlabel(struct net *net, const struct sock *sk)
{
- if ((skb->len > dst_mtu(skb->dst) && !skb_is_gso(skb)) ||
- dst_allfrag(skb->dst))
- return ip6_fragment(skb, ip6_output2);
- else
- return ip6_output2(skb);
+ if (!inet6_test_bit(AUTOFLOWLABEL_SET, sk))
+ return ip6_default_np_autolabel(net);
+ return inet6_test_bit(AUTOFLOWLABEL, sk);
}
/*
- * xmit an sk_buff (used by TCP)
+ * xmit an sk_buff (used by TCP and SCTP)
+ * Note : socket lock is not held for SYNACK packets, but might be modified
+ * by calls to skb_set_owner_w() and ipv6_local_error(),
+ * which are using proper atomic operations or spinlocks.
*/
-
-int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
- struct ipv6_txoptions *opt, int ipfragok)
+int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
+ __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
{
- struct ipv6_pinfo *np = inet6_sk(sk);
- struct in6_addr *first_hop = &fl->fl6_dst;
- struct dst_entry *dst = skb->dst;
+ const struct ipv6_pinfo *np = inet6_sk(sk);
+ struct in6_addr *first_hop = &fl6->daddr;
+ struct dst_entry *dst = skb_dst(skb);
+ struct inet6_dev *idev = ip6_dst_idev(dst);
+ struct hop_jumbo_hdr *hop_jumbo;
+ int hoplen = sizeof(*hop_jumbo);
+ struct net *net = sock_net(sk);
+ unsigned int head_room;
+ struct net_device *dev;
struct ipv6hdr *hdr;
- u8 proto = fl->proto;
+ u8 proto = fl6->flowi6_proto;
int seg_len = skb->len;
- int hlimit, tclass;
+ int ret, hlimit = -1;
u32 mtu;
- if (opt) {
- int head_room;
+ rcu_read_lock();
- /* First: exthdrs may take lots of space (~8K for now)
- MAX_HEADER is not enough.
- */
- head_room = opt->opt_nflen + opt->opt_flen;
- seg_len += head_room;
- head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
-
- if (skb_headroom(skb) < head_room) {
- struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
- if (skb2 == NULL) {
- IP6_INC_STATS(ip6_dst_idev(skb->dst),
- IPSTATS_MIB_OUTDISCARDS);
- kfree_skb(skb);
- return -ENOBUFS;
- }
- kfree_skb(skb);
- skb = skb2;
- if (sk)
- skb_set_owner_w(skb, sk);
+ dev = dst_dev_rcu(dst);
+ head_room = sizeof(struct ipv6hdr) + hoplen + LL_RESERVED_SPACE(dev);
+ if (opt)
+ head_room += opt->opt_nflen + opt->opt_flen;
+
+ if (unlikely(head_room > skb_headroom(skb))) {
+ /* idev stays alive while we hold rcu_read_lock(). */
+ skb = skb_expand_head(skb, head_room);
+ if (!skb) {
+ IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
+ ret = -ENOBUFS;
+ goto unlock;
}
+ }
+
+ if (opt) {
+ seg_len += opt->opt_nflen + opt->opt_flen;
+
if (opt->opt_flen)
ipv6_push_frag_opts(skb, opt, &proto);
+
if (opt->opt_nflen)
- ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
+ ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
+ &fl6->saddr);
+ }
+
+ if (unlikely(seg_len > IPV6_MAXPLEN)) {
+ hop_jumbo = skb_push(skb, hoplen);
+
+ hop_jumbo->nexthdr = proto;
+ hop_jumbo->hdrlen = 0;
+ hop_jumbo->tlv_type = IPV6_TLV_JUMBO;
+ hop_jumbo->tlv_len = 4;
+ hop_jumbo->jumbo_payload_len = htonl(seg_len + hoplen);
+
+ proto = IPPROTO_HOPOPTS;
+ seg_len = 0;
+ IP6CB(skb)->flags |= IP6SKB_FAKEJUMBO;
}
- hdr = skb->nh.ipv6h = (struct ipv6hdr*)skb_push(skb, sizeof(struct ipv6hdr));
+ skb_push(skb, sizeof(struct ipv6hdr));
+ skb_reset_network_header(skb);
+ hdr = ipv6_hdr(skb);
/*
* Fill in the IPv6 header
*/
-
- hlimit = -1;
if (np)
- hlimit = np->hop_limit;
+ hlimit = READ_ONCE(np->hop_limit);
if (hlimit < 0)
- hlimit = dst_metric(dst, RTAX_HOPLIMIT);
- if (hlimit < 0)
- hlimit = ipv6_get_hoplimit(dst->dev);
-
- tclass = -1;
- if (np)
- tclass = np->tclass;
- if (tclass < 0)
- tclass = 0;
+ hlimit = ip6_dst_hoplimit(dst);
- *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;
+ ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
+ ip6_autoflowlabel(net, sk), fl6));
hdr->payload_len = htons(seg_len);
hdr->nexthdr = proto;
hdr->hop_limit = hlimit;
- ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
- ipv6_addr_copy(&hdr->daddr, first_hop);
+ hdr->saddr = fl6->saddr;
+ hdr->daddr = *first_hop;
- skb->priority = sk->sk_priority;
+ skb->protocol = htons(ETH_P_IPV6);
+ skb->priority = priority;
+ skb->mark = mark;
mtu = dst_mtu(dst);
- if ((skb->len <= mtu) || ipfragok || skb_is_gso(skb)) {
- IP6_INC_STATS(ip6_dst_idev(skb->dst),
- IPSTATS_MIB_OUTREQUESTS);
- return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev,
- dst_output);
- }
-
- if (net_ratelimit())
- printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
- skb->dev = dst->dev;
- icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
- IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
- kfree_skb(skb);
- return -EMSGSIZE;
-}
+ if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
+ IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS);
-/*
- * To avoid extra problems ND packets are send through this
- * routine. It's code duplication but I really want to avoid
- * extra checks since ipv6_build_header is used by TCP (which
- * is for us performance critical)
- */
+ /* if egress device is enslaved to an L3 master device pass the
+ * skb to its handler for processing
+ */
+ skb = l3mdev_ip6_out((struct sock *)sk, skb);
+ if (unlikely(!skb)) {
+ ret = 0;
+ goto unlock;
+ }
-int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
- struct in6_addr *saddr, struct in6_addr *daddr,
- int proto, int len)
-{
- struct ipv6_pinfo *np = inet6_sk(sk);
- struct ipv6hdr *hdr;
- int totlen;
+ /* hooks should never assume socket lock is held.
+ * we promote our socket to non const
+ */
+ ret = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
+ net, (struct sock *)sk, skb, NULL, dev,
+ dst_output);
+ goto unlock;
+ }
- skb->protocol = htons(ETH_P_IPV6);
+ ret = -EMSGSIZE;
skb->dev = dev;
+ /* ipv6_local_error() does not require socket lock,
+ * we promote our socket to non const
+ */
+ ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
- totlen = len + sizeof(struct ipv6hdr);
-
- hdr = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr));
- skb->nh.ipv6h = hdr;
-
- *(__be32*)hdr = htonl(0x60000000);
-
- hdr->payload_len = htons(len);
- hdr->nexthdr = proto;
- hdr->hop_limit = np->hop_limit;
-
- ipv6_addr_copy(&hdr->saddr, saddr);
- ipv6_addr_copy(&hdr->daddr, daddr);
-
- return 0;
+ IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
+ kfree_skb(skb);
+unlock:
+ rcu_read_unlock();
+ return ret;
}
+EXPORT_SYMBOL(ip6_xmit);
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
@@ -294,6 +400,11 @@ static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
if (sk && ra->sel == sel &&
(!sk->sk_bound_dev_if ||
sk->sk_bound_dev_if == skb->dev->ifindex)) {
+
+ if (inet6_test_bit(RTALERT_ISOLATE, sk) &&
+ !net_eq(sock_net(sk), dev_net(skb->dev))) {
+ continue;
+ }
if (last) {
struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
if (skb2)
@@ -314,12 +425,13 @@ static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
- struct ipv6hdr *hdr = skb->nh.ipv6h;
+ struct ipv6hdr *hdr = ipv6_hdr(skb);
u8 nexthdr = hdr->nexthdr;
+ __be16 frag_off;
int offset;
if (ipv6_ext_hdr(nexthdr)) {
- offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
+ offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
if (offset < 0)
return 0;
} else
@@ -328,10 +440,11 @@ static int ip6_forward_proxy_check(struct sk_buff *skb)
if (nexthdr == IPPROTO_ICMPV6) {
struct icmp6hdr *icmp6;
- if (!pskb_may_pull(skb, skb->nh.raw + offset + 1 - skb->data))
+ if (!pskb_may_pull(skb, (skb_network_header(skb) +
+ offset + 1 - skb->data)))
return 0;
- icmp6 = (struct icmp6hdr *)(skb->nh.raw + offset);
+ icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
switch (icmp6->icmp6_type) {
case NDISC_ROUTER_SOLICITATION:
@@ -362,26 +475,71 @@ static int ip6_forward_proxy_check(struct sk_buff *skb)
return 0;
}
-static inline int ip6_forward_finish(struct sk_buff *skb)
+static inline int ip6_forward_finish(struct net *net, struct sock *sk,
+ struct sk_buff *skb)
{
- return dst_output(skb);
+#ifdef CONFIG_NET_SWITCHDEV
+ if (skb->offload_l3_fwd_mark) {
+ consume_skb(skb);
+ return 0;
+ }
+#endif
+
+ skb_clear_tstamp(skb);
+ return dst_output(net, sk, skb);
+}
+
+static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
+{
+ if (skb->len <= mtu)
+ return false;
+
+ /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
+ if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
+ return true;
+
+ if (skb->ignore_df)
+ return false;
+
+ if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
+ return false;
+
+ return true;
}
int ip6_forward(struct sk_buff *skb)
{
- struct dst_entry *dst = skb->dst;
- struct ipv6hdr *hdr = skb->nh.ipv6h;
+ struct dst_entry *dst = skb_dst(skb);
+ struct ipv6hdr *hdr = ipv6_hdr(skb);
struct inet6_skb_parm *opt = IP6CB(skb);
-
- if (ipv6_devconf.forwarding == 0)
+ struct net *net = dev_net(dst_dev(dst));
+ struct net_device *dev;
+ struct inet6_dev *idev;
+ SKB_DR(reason);
+ u32 mtu;
+
+ idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
+ if (!READ_ONCE(net->ipv6.devconf_all->forwarding) &&
+ (!idev || !READ_ONCE(idev->cnf.force_forwarding)))
goto error;
- if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
- IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
+ if (skb->pkt_type != PACKET_HOST)
+ goto drop;
+
+ if (unlikely(skb->sk))
+ goto drop;
+
+ if (skb_warn_if_lro(skb))
+ goto drop;
+
+ if (!READ_ONCE(net->ipv6.devconf_all->disable_policy) &&
+ (!idev || !READ_ONCE(idev->cnf.disable_policy)) &&
+ !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
goto drop;
}
- skb->ip_summed = CHECKSUM_NONE;
+ skb_forward_csum(skb);
/*
* We DO NOT make any processing on
@@ -396,9 +554,8 @@ int ip6_forward(struct sk_buff *skb)
* cannot be fragmented, because there is no warranty
* that different fragments will go along one path. --ANK
*/
- if (opt->ra) {
- u8 *ptr = skb->nh.raw + opt->ra;
- if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
+ if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
+ if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
return 0;
}
@@ -406,92 +563,128 @@ int ip6_forward(struct sk_buff *skb)
* check and decrement ttl
*/
if (hdr->hop_limit <= 1) {
- /* Force OUTPUT device used as source address */
- skb->dev = dst->dev;
- icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
- 0, skb->dev);
- IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
+ icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
- kfree_skb(skb);
+ kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
return -ETIMEDOUT;
}
/* XXX: idev->cnf.proxy_ndp? */
- if (ipv6_devconf.proxy_ndp &&
- pneigh_lookup(&nd_tbl, &hdr->daddr, skb->dev, 0)) {
+ if (READ_ONCE(net->ipv6.devconf_all->proxy_ndp) &&
+ pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev)) {
int proxied = ip6_forward_proxy_check(skb);
- if (proxied > 0)
+ if (proxied > 0) {
+ /* It's tempting to decrease the hop limit
+ * here by 1, as we do at the end of the
+ * function too.
+ *
+ * But that would be incorrect, as proxying is
+ * not forwarding. The ip6_input function
+ * will handle this packet locally, and it
+ * depends on the hop limit being unchanged.
+ *
+ * One example is the NDP hop limit, that
+ * always has to stay 255, but other would be
+ * similar checks around RA packets, where the
+ * user can even change the desired limit.
+ */
return ip6_input(skb);
- else if (proxied < 0) {
- IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
+ } else if (proxied < 0) {
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
goto drop;
}
}
if (!xfrm6_route_forward(skb)) {
- IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
+ SKB_DR_SET(reason, XFRM_POLICY);
goto drop;
}
- dst = skb->dst;
-
+ dst = skb_dst(skb);
+ dev = dst_dev(dst);
/* IPv6 specs say nothing about it, but it is clear that we cannot
send redirects to source routed frames.
+ We don't send redirects to frames decapsulated from IPsec.
*/
- if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0) {
+ if (IP6CB(skb)->iif == dev->ifindex &&
+ opt->srcrt == 0 && !skb_sec_path(skb)) {
struct in6_addr *target = NULL;
+ struct inet_peer *peer;
struct rt6_info *rt;
- struct neighbour *n = dst->neighbour;
/*
* incoming and outgoing devices are the same
* send a redirect.
*/
- rt = (struct rt6_info *) dst;
- if ((rt->rt6i_flags & RTF_GATEWAY))
- target = (struct in6_addr*)&n->primary_key;
+ rt = dst_rt6_info(dst);
+ if (rt->rt6i_flags & RTF_GATEWAY)
+ target = &rt->rt6i_gateway;
else
target = &hdr->daddr;
+ rcu_read_lock();
+ peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr);
+
/* Limit redirects both by destination (here)
and by source (inside ndisc_send_redirect)
*/
- if (xrlim_allow(dst, 1*HZ))
- ndisc_send_redirect(skb, n, target);
- } else if (ipv6_addr_type(&hdr->saddr)&(IPV6_ADDR_MULTICAST|IPV6_ADDR_LOOPBACK
- |IPV6_ADDR_LINKLOCAL)) {
+ if (inet_peer_xrlim_allow(peer, 1*HZ))
+ ndisc_send_redirect(skb, target);
+ rcu_read_unlock();
+ } else {
+ int addrtype = ipv6_addr_type(&hdr->saddr);
+
/* This check is security critical. */
- goto error;
+ if (addrtype == IPV6_ADDR_ANY ||
+ addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
+ goto error;
+ if (addrtype & IPV6_ADDR_LINKLOCAL) {
+ icmpv6_send(skb, ICMPV6_DEST_UNREACH,
+ ICMPV6_NOT_NEIGHBOUR, 0);
+ goto error;
+ }
}
- if (skb->len > dst_mtu(dst)) {
+ __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
+
+ mtu = ip6_dst_mtu_maybe_forward(dst, true);
+ if (mtu < IPV6_MIN_MTU)
+ mtu = IPV6_MIN_MTU;
+
+ if (ip6_pkt_too_big(skb, mtu)) {
/* Again, force OUTPUT device used as source address */
- skb->dev = dst->dev;
- icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
- IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
- IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
- kfree_skb(skb);
+ skb->dev = dev;
+ icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
+ __IP6_INC_STATS(net, ip6_dst_idev(dst),
+ IPSTATS_MIB_FRAGFAILS);
+ kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
return -EMSGSIZE;
}
- if (skb_cow(skb, dst->dev->hard_header_len)) {
- IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
+ if (skb_cow(skb, dev->hard_header_len)) {
+ __IP6_INC_STATS(net, ip6_dst_idev(dst),
+ IPSTATS_MIB_OUTDISCARDS);
goto drop;
}
- hdr = skb->nh.ipv6h;
+ hdr = ipv6_hdr(skb);
/* Mangling hops number delayed to point after skb COW */
-
+
hdr->hop_limit--;
- IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
- return NF_HOOK(PF_INET6,NF_IP6_FORWARD, skb, skb->dev, dst->dev, ip6_forward_finish);
+ return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
+ net, NULL, skb, skb->dev, dev,
+ ip6_forward_finish);
error:
- IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
+ SKB_DR_SET(reason, IP_INADDRERRORS);
drop:
- kfree_skb(skb);
+ kfree_skb_reason(skb, reason);
return -EINVAL;
}
@@ -500,333 +693,400 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
to->pkt_type = from->pkt_type;
to->priority = from->priority;
to->protocol = from->protocol;
- dst_release(to->dst);
- to->dst = dst_clone(from->dst);
+ skb_dst_drop(to);
+ skb_dst_set(to, dst_clone(skb_dst(from)));
to->dev = from->dev;
to->mark = from->mark;
+ skb_copy_hash(to, from);
+
#ifdef CONFIG_NET_SCHED
to->tc_index = from->tc_index;
#endif
-#ifdef CONFIG_NETFILTER
- /* Connection association is same as pre-frag packet */
- nf_conntrack_put(to->nfct);
- to->nfct = from->nfct;
- nf_conntrack_get(to->nfct);
- to->nfctinfo = from->nfctinfo;
-#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
- nf_conntrack_put_reasm(to->nfct_reasm);
- to->nfct_reasm = from->nfct_reasm;
- nf_conntrack_get_reasm(to->nfct_reasm);
-#endif
-#ifdef CONFIG_BRIDGE_NETFILTER
- nf_bridge_put(to->nf_bridge);
- to->nf_bridge = from->nf_bridge;
- nf_bridge_get(to->nf_bridge);
-#endif
-#endif
+ nf_copy(to, from);
+ skb_ext_copy(to, from);
skb_copy_secmark(to, from);
}
-int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
+int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
+ u8 nexthdr, __be32 frag_id,
+ struct ip6_fraglist_iter *iter)
{
- u16 offset = sizeof(struct ipv6hdr);
- struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr*)(skb->nh.ipv6h + 1);
- unsigned int packet_len = skb->tail - skb->nh.raw;
- int found_rhdr = 0;
- *nexthdr = &skb->nh.ipv6h->nexthdr;
+ unsigned int first_len;
+ struct frag_hdr *fh;
- while (offset + 1 <= packet_len) {
+ /* BUILD HEADER */
+ *prevhdr = NEXTHDR_FRAGMENT;
+ iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
+ if (!iter->tmp_hdr)
+ return -ENOMEM;
+
+ iter->frag = skb_shinfo(skb)->frag_list;
+ skb_frag_list_init(skb);
+
+ iter->offset = 0;
+ iter->hlen = hlen;
+ iter->frag_id = frag_id;
+ iter->nexthdr = nexthdr;
+
+ __skb_pull(skb, hlen);
+ fh = __skb_push(skb, sizeof(struct frag_hdr));
+ __skb_push(skb, hlen);
+ skb_reset_network_header(skb);
+ memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);
+
+ fh->nexthdr = nexthdr;
+ fh->reserved = 0;
+ fh->frag_off = htons(IP6_MF);
+ fh->identification = frag_id;
+
+ first_len = skb_pagelen(skb);
+ skb->data_len = first_len - skb_headlen(skb);
+ skb->len = first_len;
+ ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));
- switch (**nexthdr) {
+ return 0;
+}
+EXPORT_SYMBOL(ip6_fraglist_init);
- case NEXTHDR_HOP:
- break;
- case NEXTHDR_ROUTING:
- found_rhdr = 1;
- break;
- case NEXTHDR_DEST:
-#ifdef CONFIG_IPV6_MIP6
- if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
- break;
-#endif
- if (found_rhdr)
- return offset;
- break;
- default :
- return offset;
- }
+void ip6_fraglist_prepare(struct sk_buff *skb,
+ struct ip6_fraglist_iter *iter)
+{
+ struct sk_buff *frag = iter->frag;
+ unsigned int hlen = iter->hlen;
+ struct frag_hdr *fh;
- offset += ipv6_optlen(exthdr);
- *nexthdr = &exthdr->nexthdr;
- exthdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset);
- }
+ frag->ip_summed = CHECKSUM_NONE;
+ skb_reset_transport_header(frag);
+ fh = __skb_push(frag, sizeof(struct frag_hdr));
+ __skb_push(frag, hlen);
+ skb_reset_network_header(frag);
+ memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
+ iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
+ fh->nexthdr = iter->nexthdr;
+ fh->reserved = 0;
+ fh->frag_off = htons(iter->offset);
+ if (frag->next)
+ fh->frag_off |= htons(IP6_MF);
+ fh->identification = iter->frag_id;
+ ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
+ ip6_copy_metadata(frag, skb);
+}
+EXPORT_SYMBOL(ip6_fraglist_prepare);
+
+void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
+ unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
+ u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
+{
+ state->prevhdr = prevhdr;
+ state->nexthdr = nexthdr;
+ state->frag_id = frag_id;
+
+ state->hlen = hlen;
+ state->mtu = mtu;
+
+ state->left = skb->len - hlen; /* Space per frame */
+ state->ptr = hlen; /* Where to start from */
+
+ state->hroom = hdr_room;
+ state->troom = needed_tailroom;
- return offset;
+ state->offset = 0;
}
-EXPORT_SYMBOL_GPL(ip6_find_1stfragopt);
+EXPORT_SYMBOL(ip6_frag_init);
-static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
+struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
{
- struct net_device *dev;
+ u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
struct sk_buff *frag;
- struct rt6_info *rt = (struct rt6_info*)skb->dst;
- struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
- struct ipv6hdr *tmp_hdr;
struct frag_hdr *fh;
- unsigned int mtu, hlen, left, len;
- __be32 frag_id = 0;
- int ptr, offset = 0, err=0;
+ unsigned int len;
+
+ len = state->left;
+ /* IF: it doesn't fit, use 'mtu' - the data space left */
+ if (len > state->mtu)
+ len = state->mtu;
+ /* IF: we are not sending up to and including the packet end
+ then align the next start on an eight byte boundary */
+ if (len < state->left)
+ len &= ~7;
+
+ /* Allocate buffer */
+ frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
+ state->hroom + state->troom, GFP_ATOMIC);
+ if (!frag)
+ return ERR_PTR(-ENOMEM);
+
+ /*
+ * Set up data on packet
+ */
+
+ ip6_copy_metadata(frag, skb);
+ skb_reserve(frag, state->hroom);
+ skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
+ skb_reset_network_header(frag);
+ fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
+ frag->transport_header = (frag->network_header + state->hlen +
+ sizeof(struct frag_hdr));
+
+ /*
+ * Charge the memory for the fragment to any owner
+ * it might possess
+ */
+ if (skb->sk)
+ skb_set_owner_w(frag, skb->sk);
+
+ /*
+ * Copy the packet header into the new buffer.
+ */
+ skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);
+
+ fragnexthdr_offset = skb_network_header(frag);
+ fragnexthdr_offset += prevhdr - skb_network_header(skb);
+ *fragnexthdr_offset = NEXTHDR_FRAGMENT;
+
+ /*
+ * Build fragment header.
+ */
+ fh->nexthdr = state->nexthdr;
+ fh->reserved = 0;
+ fh->identification = state->frag_id;
+
+ /*
+ * Copy a block of the IP datagram.
+ */
+ BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
+ len));
+ state->left -= len;
+
+ fh->frag_off = htons(state->offset);
+ if (state->left > 0)
+ fh->frag_off |= htons(IP6_MF);
+ ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
+
+ state->ptr += len;
+ state->offset += len;
+
+ return frag;
+}
+EXPORT_SYMBOL(ip6_frag_next);
+
+int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
+ int (*output)(struct net *, struct sock *, struct sk_buff *))
+{
+ struct sk_buff *frag;
+ struct rt6_info *rt = dst_rt6_info(skb_dst(skb));
+ struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
+ inet6_sk(skb->sk) : NULL;
+ u8 tstamp_type = skb->tstamp_type;
+ struct ip6_frag_state state;
+ unsigned int mtu, hlen, nexthdr_offset;
+ ktime_t tstamp = skb->tstamp;
+ int hroom, err = 0;
+ __be32 frag_id;
u8 *prevhdr, nexthdr = 0;
- dev = rt->u.dst.dev;
- hlen = ip6_find_1stfragopt(skb, &prevhdr);
+ err = ip6_find_1stfragopt(skb, &prevhdr);
+ if (err < 0)
+ goto fail;
+ hlen = err;
nexthdr = *prevhdr;
+ nexthdr_offset = prevhdr - skb_network_header(skb);
+
+ mtu = ip6_skb_dst_mtu(skb);
+
+ /* We must not fragment if the socket is set to force MTU discovery
+ * or if the skb it not generated by a local socket.
+ */
+ if (unlikely(!skb->ignore_df && skb->len > mtu))
+ goto fail_toobig;
+
+ if (IP6CB(skb)->frag_max_size) {
+ if (IP6CB(skb)->frag_max_size > mtu)
+ goto fail_toobig;
+
+ /* don't send fragments larger than what we received */
+ mtu = IP6CB(skb)->frag_max_size;
+ if (mtu < IPV6_MIN_MTU)
+ mtu = IPV6_MIN_MTU;
+ }
- mtu = dst_mtu(&rt->u.dst);
- if (np && np->frag_size < mtu) {
- if (np->frag_size)
- mtu = np->frag_size;
+ if (np) {
+ u32 frag_size = READ_ONCE(np->frag_size);
+
+ if (frag_size && frag_size < mtu)
+ mtu = frag_size;
}
+ if (mtu < hlen + sizeof(struct frag_hdr) + 8)
+ goto fail_toobig;
mtu -= hlen + sizeof(struct frag_hdr);
- if (skb_shinfo(skb)->frag_list) {
- int first_len = skb_pagelen(skb);
+ frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
+ &ipv6_hdr(skb)->saddr);
+
+ if (skb->ip_summed == CHECKSUM_PARTIAL &&
+ (err = skb_checksum_help(skb)))
+ goto fail;
+
+ prevhdr = skb_network_header(skb) + nexthdr_offset;
+ hroom = LL_RESERVED_SPACE(rt->dst.dev);
+ if (skb_has_frag_list(skb)) {
+ unsigned int first_len = skb_pagelen(skb);
+ struct ip6_fraglist_iter iter;
+ struct sk_buff *frag2;
if (first_len - hlen > mtu ||
((first_len - hlen) & 7) ||
- skb_cloned(skb))
+ skb_cloned(skb) ||
+ skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
goto slow_path;
- for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
+ skb_walk_frags(skb, frag) {
/* Correct geometry. */
if (frag->len > mtu ||
((frag->len & 7) && frag->next) ||
- skb_headroom(frag) < hlen)
- goto slow_path;
+ skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
+ goto slow_path_clean;
/* Partially cloned skb? */
if (skb_shared(frag))
- goto slow_path;
+ goto slow_path_clean;
BUG_ON(frag->sk);
if (skb->sk) {
- sock_hold(skb->sk);
frag->sk = skb->sk;
frag->destructor = sock_wfree;
- skb->truesize -= frag->truesize;
}
+ skb->truesize -= frag->truesize;
}
- err = 0;
- offset = 0;
- frag = skb_shinfo(skb)->frag_list;
- skb_shinfo(skb)->frag_list = NULL;
- /* BUILD HEADER */
-
- tmp_hdr = kmemdup(skb->nh.raw, hlen, GFP_ATOMIC);
- if (!tmp_hdr) {
- IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
- return -ENOMEM;
- }
-
- *prevhdr = NEXTHDR_FRAGMENT;
- __skb_pull(skb, hlen);
- fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
- skb->nh.raw = __skb_push(skb, hlen);
- memcpy(skb->nh.raw, tmp_hdr, hlen);
-
- ipv6_select_ident(skb, fh);
- fh->nexthdr = nexthdr;
- fh->reserved = 0;
- fh->frag_off = htons(IP6_MF);
- frag_id = fh->identification;
-
- first_len = skb_pagelen(skb);
- skb->data_len = first_len - skb_headlen(skb);
- skb->len = first_len;
- skb->nh.ipv6h->payload_len = htons(first_len - sizeof(struct ipv6hdr));
+ err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
+ &iter);
+ if (err < 0)
+ goto fail;
- dst_hold(&rt->u.dst);
+ /* We prevent @rt from being freed. */
+ rcu_read_lock();
for (;;) {
/* Prepare header of the next frame,
* before previous one went down. */
- if (frag) {
- frag->ip_summed = CHECKSUM_NONE;
- frag->h.raw = frag->data;
- fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
- frag->nh.raw = __skb_push(frag, hlen);
- memcpy(frag->nh.raw, tmp_hdr, hlen);
- offset += skb->len - hlen - sizeof(struct frag_hdr);
- fh->nexthdr = nexthdr;
- fh->reserved = 0;
- fh->frag_off = htons(offset);
- if (frag->next != NULL)
- fh->frag_off |= htons(IP6_MF);
- fh->identification = frag_id;
- frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
- ip6_copy_metadata(frag, skb);
- }
-
- err = output(skb);
- if(!err)
- IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGCREATES);
+ if (iter.frag)
+ ip6_fraglist_prepare(skb, &iter);
+
+ skb_set_delivery_time(skb, tstamp, tstamp_type);
+ err = output(net, sk, skb);
+ if (!err)
+ IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
+ IPSTATS_MIB_FRAGCREATES);
- if (err || !frag)
+ if (err || !iter.frag)
break;
- skb = frag;
- frag = skb->next;
- skb->next = NULL;
+ skb = ip6_fraglist_next(&iter);
}
- kfree(tmp_hdr);
+ kfree(iter.tmp_hdr);
if (err == 0) {
- IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGOKS);
- dst_release(&rt->u.dst);
+ IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
+ IPSTATS_MIB_FRAGOKS);
+ rcu_read_unlock();
return 0;
}
- while (frag) {
- skb = frag->next;
- kfree_skb(frag);
- frag = skb;
- }
+ kfree_skb_list(iter.frag);
- IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGFAILS);
- dst_release(&rt->u.dst);
+ IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
+ IPSTATS_MIB_FRAGFAILS);
+ rcu_read_unlock();
return err;
+
+slow_path_clean:
+ skb_walk_frags(skb, frag2) {
+ if (frag2 == frag)
+ break;
+ frag2->sk = NULL;
+ frag2->destructor = NULL;
+ skb->truesize += frag2->truesize;
+ }
}
slow_path:
- left = skb->len - hlen; /* Space per frame */
- ptr = hlen; /* Where to start from */
-
/*
* Fragment the datagram.
*/
- *prevhdr = NEXTHDR_FRAGMENT;
+ ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
+ LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
+ &state);
/*
* Keep copying data until we run out.
*/
- while(left > 0) {
- len = left;
- /* IF: it doesn't fit, use 'mtu' - the data space left */
- if (len > mtu)
- len = mtu;
- /* IF: we are not sending upto and including the packet end
- then align the next start on an eight byte boundary */
- if (len < left) {
- len &= ~7;
- }
- /*
- * Allocate buffer.
- */
- if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_RESERVED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
- NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
- IP6_INC_STATS(ip6_dst_idev(skb->dst),
- IPSTATS_MIB_FRAGFAILS);
- err = -ENOMEM;
+ while (state.left > 0) {
+ frag = ip6_frag_next(skb, &state);
+ if (IS_ERR(frag)) {
+ err = PTR_ERR(frag);
goto fail;
}
/*
- * Set up data on packet
- */
-
- ip6_copy_metadata(frag, skb);
- skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
- skb_put(frag, len + hlen + sizeof(struct frag_hdr));
- frag->nh.raw = frag->data;
- fh = (struct frag_hdr*)(frag->data + hlen);
- frag->h.raw = frag->data + hlen + sizeof(struct frag_hdr);
-
- /*
- * Charge the memory for the fragment to any owner
- * it might possess
- */
- if (skb->sk)
- skb_set_owner_w(frag, skb->sk);
-
- /*
- * Copy the packet header into the new buffer.
- */
- memcpy(frag->nh.raw, skb->data, hlen);
-
- /*
- * Build fragment header.
- */
- fh->nexthdr = nexthdr;
- fh->reserved = 0;
- if (!frag_id) {
- ipv6_select_ident(skb, fh);
- frag_id = fh->identification;
- } else
- fh->identification = frag_id;
-
- /*
- * Copy a block of the IP datagram.
- */
- if (skb_copy_bits(skb, ptr, frag->h.raw, len))
- BUG();
- left -= len;
-
- fh->frag_off = htons(offset);
- if (left > 0)
- fh->frag_off |= htons(IP6_MF);
- frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
-
- ptr += len;
- offset += len;
-
- /*
* Put this fragment into the sending queue.
*/
- err = output(frag);
+ skb_set_delivery_time(frag, tstamp, tstamp_type);
+ err = output(net, sk, frag);
if (err)
goto fail;
- IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGCREATES);
+ IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
+ IPSTATS_MIB_FRAGCREATES);
}
- IP6_INC_STATS(ip6_dst_idev(skb->dst),
+ IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
IPSTATS_MIB_FRAGOKS);
- kfree_skb(skb);
+ consume_skb(skb);
return err;
+fail_toobig:
+ icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+ err = -EMSGSIZE;
+
fail:
- IP6_INC_STATS(ip6_dst_idev(skb->dst),
+ IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
IPSTATS_MIB_FRAGFAILS);
- kfree_skb(skb);
+ kfree_skb(skb);
return err;
}
-static inline int ip6_rt_check(struct rt6key *rt_key,
- struct in6_addr *fl_addr,
- struct in6_addr *addr_cache)
+static inline int ip6_rt_check(const struct rt6key *rt_key,
+ const struct in6_addr *fl_addr,
+ const struct in6_addr *addr_cache)
{
- return ((rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
- (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)));
+ return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
+ (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
}
static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
struct dst_entry *dst,
- struct flowi *fl)
+ const struct flowi6 *fl6)
{
struct ipv6_pinfo *np = inet6_sk(sk);
- struct rt6_info *rt = (struct rt6_info *)dst;
+ struct rt6_info *rt;
if (!dst)
goto out;
+ if (dst->ops->family != AF_INET6) {
+ dst_release(dst);
+ return NULL;
+ }
+
+ rt = dst_rt6_info(dst);
/* Yes, checking route validity in not connected
* case is not very simple. Take into account,
* that we do not support routing by source, TOS,
- * and MSG_DONTROUTE --ANK (980726)
+ * and MSG_DONTROUTE --ANK (980726)
*
* 1. ip6_rt_check(): If route was host route,
* check that cached destination is current.
@@ -840,11 +1100,13 @@ static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
* sockets.
* 2. oif also should be the same.
*/
- if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
+ if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr,
+ np->daddr_cache ? &sk->sk_v6_daddr : NULL) ||
#ifdef CONFIG_IPV6_SUBTREES
- ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
+ ip6_rt_check(&rt->rt6i_src, &fl6->saddr,
+ np->saddr_cache ? &np->saddr : NULL) ||
#endif
- (fl->oif && fl->oif != dst->dev->ifindex)) {
+ (fl6->flowi6_oif && fl6->flowi6_oif != dst_dev(dst)->ifindex)) {
dst_release(dst);
dst = NULL;
}
@@ -853,21 +1115,110 @@ out:
return dst;
}
-static int ip6_dst_lookup_tail(struct sock *sk,
- struct dst_entry **dst, struct flowi *fl)
+static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
+ struct dst_entry **dst, struct flowi6 *fl6)
{
+#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
+ struct neighbour *n;
+ struct rt6_info *rt;
+#endif
int err;
+ int flags = 0;
- if (*dst == NULL)
- *dst = ip6_route_output(sk, fl);
+ /* The correct way to handle this would be to do
+ * ip6_route_get_saddr, and then ip6_route_output; however,
+ * the route-specific preferred source forces the
+ * ip6_route_output call _before_ ip6_route_get_saddr.
+ *
+ * In source specific routing (no src=any default route),
+ * ip6_route_output will fail given src=any saddr, though, so
+ * that's why we try it again later.
+ */
+ if (ipv6_addr_any(&fl6->saddr)) {
+ struct fib6_info *from;
+ struct rt6_info *rt;
- if ((err = (*dst)->error))
- goto out_err_release;
+ *dst = ip6_route_output(net, sk, fl6);
+ rt = (*dst)->error ? NULL : dst_rt6_info(*dst);
+
+ rcu_read_lock();
+ from = rt ? rcu_dereference(rt->from) : NULL;
+ err = ip6_route_get_saddr(net, from, &fl6->daddr,
+ sk ? READ_ONCE(inet6_sk(sk)->srcprefs) : 0,
+ fl6->flowi6_l3mdev,
+ &fl6->saddr);
+ rcu_read_unlock();
- if (ipv6_addr_any(&fl->fl6_src)) {
- err = ipv6_get_saddr(*dst, &fl->fl6_dst, &fl->fl6_src);
if (err)
goto out_err_release;
+
+ /* If we had an erroneous initial result, pretend it
+ * never existed and let the SA-enabled version take
+ * over.
+ */
+ if ((*dst)->error) {
+ dst_release(*dst);
+ *dst = NULL;
+ }
+
+ if (fl6->flowi6_oif)
+ flags |= RT6_LOOKUP_F_IFACE;
+ }
+
+ if (!*dst)
+ *dst = ip6_route_output_flags(net, sk, fl6, flags);
+
+ err = (*dst)->error;
+ if (err)
+ goto out_err_release;
+
+#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
+ /*
+ * Here if the dst entry we've looked up
+ * has a neighbour entry that is in the INCOMPLETE
+ * state and the src address from the flow is
+ * marked as OPTIMISTIC, we release the found
+ * dst entry and replace it instead with the
+ * dst entry of the nexthop router
+ */
+ rt = dst_rt6_info(*dst);
+ rcu_read_lock();
+ n = __ipv6_neigh_lookup_noref(rt->dst.dev,
+ rt6_nexthop(rt, &fl6->daddr));
+ err = n && !(READ_ONCE(n->nud_state) & NUD_VALID) ? -EINVAL : 0;
+ rcu_read_unlock();
+
+ if (err) {
+ struct inet6_ifaddr *ifp;
+ struct flowi6 fl_gw6;
+ int redirect;
+
+ ifp = ipv6_get_ifaddr(net, &fl6->saddr,
+ (*dst)->dev, 1);
+
+ redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
+ if (ifp)
+ in6_ifa_put(ifp);
+
+ if (redirect) {
+ /*
+ * We need to get the dst entry for the
+ * default router instead
+ */
+ dst_release(*dst);
+ memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
+ memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
+ *dst = ip6_route_output(net, sk, &fl_gw6);
+ err = (*dst)->error;
+ if (err)
+ goto out_err_release;
+ }
+ }
+#endif
+ if (ipv6_addr_v4mapped(&fl6->saddr) &&
+ !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
+ err = -EAFNOSUPPORT;
+ goto out_err_release;
}
return 0;
@@ -875,188 +1226,349 @@ static int ip6_dst_lookup_tail(struct sock *sk,
out_err_release:
dst_release(*dst);
*dst = NULL;
+
+ if (err == -ENETUNREACH)
+ IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
return err;
}
/**
* ip6_dst_lookup - perform route lookup on flow
+ * @net: Network namespace to perform lookup in
* @sk: socket which provides route info
* @dst: pointer to dst_entry * for result
- * @fl: flow to lookup
+ * @fl6: flow to lookup
*
* This function performs a route lookup on the given flow.
*
* It returns zero on success, or a standard errno code on error.
*/
-int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
+int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
+ struct flowi6 *fl6)
{
*dst = NULL;
- return ip6_dst_lookup_tail(sk, dst, fl);
+ return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);
/**
- * ip6_sk_dst_lookup - perform socket cached route lookup on flow
+ * ip6_dst_lookup_flow - perform route lookup on flow with ipsec
+ * @net: Network namespace to perform lookup in
+ * @sk: socket which provides route info
+ * @fl6: flow to lookup
+ * @final_dst: final destination address for ipsec lookup
+ *
+ * This function performs a route lookup on the given flow.
+ *
+ * It returns a valid dst pointer on success, or a pointer encoded
+ * error code.
+ */
+struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
+ const struct in6_addr *final_dst)
+{
+ struct dst_entry *dst = NULL;
+ int err;
+
+ err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
+ if (err)
+ return ERR_PTR(err);
+ if (final_dst)
+ fl6->daddr = *final_dst;
+
+ return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
+}
+EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
+
+/**
+ * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
* @sk: socket which provides the dst cache and route info
- * @dst: pointer to dst_entry * for result
- * @fl: flow to lookup
+ * @fl6: flow to lookup
+ * @final_dst: final destination address for ipsec lookup
+ * @connected: whether @sk is connected or not
*
* This function performs a route lookup on the given flow with the
* possibility of using the cached route in the socket if it is valid.
* It will take the socket dst lock when operating on the dst cache.
* As a result, this function can only be used in process context.
*
- * It returns zero on success, or a standard errno code on error.
+ * In addition, for a connected socket, cache the dst in the socket
+ * if the current cache is not valid.
+ *
+ * It returns a valid dst pointer on success, or a pointer encoded
+ * error code.
*/
-int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
+struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
+ const struct in6_addr *final_dst,
+ bool connected)
{
- *dst = NULL;
- if (sk) {
- *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
- *dst = ip6_sk_dst_check(sk, *dst, fl);
- }
+ struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
+
+ dst = ip6_sk_dst_check(sk, dst, fl6);
+ if (dst)
+ return dst;
- return ip6_dst_lookup_tail(sk, dst, fl);
+ dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
+ if (connected && !IS_ERR(dst))
+ ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
+
+ return dst;
}
-EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);
+EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
-static inline int ip6_ufo_append_data(struct sock *sk,
- int getfrag(void *from, char *to, int offset, int len,
- int odd, struct sk_buff *skb),
- void *from, int length, int hh_len, int fragheaderlen,
- int transhdrlen, int mtu,unsigned int flags)
+static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
+ gfp_t gfp)
+{
+ return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
+}
+static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
+ gfp_t gfp)
{
- struct sk_buff *skb;
- int err;
+ return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
+}
+
+static void ip6_append_data_mtu(unsigned int *mtu,
+ int *maxfraglen,
+ unsigned int fragheaderlen,
+ struct sk_buff *skb,
+ struct rt6_info *rt,
+ unsigned int orig_mtu)
+{
+ if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
+ if (!skb) {
+ /* first fragment, reserve header_len */
+ *mtu = orig_mtu - rt->dst.header_len;
+
+ } else {
+ /*
+ * this fragment is not first, the headers
+ * space is regarded as data space.
+ */
+ *mtu = orig_mtu;
+ }
+ *maxfraglen = ((*mtu - fragheaderlen) & ~7)
+ + fragheaderlen - sizeof(struct frag_hdr);
+ }
+}
+
+static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
+ struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
+ struct rt6_info *rt)
+{
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ unsigned int mtu, frag_size;
+ struct ipv6_txoptions *nopt, *opt = ipc6->opt;
- /* There is support for UDP large send offload by network
- * device, so create one single skb packet containing complete
- * udp datagram
+ /* callers pass dst together with a reference, set it first so
+ * ip6_cork_release() can put it down even in case of an error.
*/
- if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
- skb = sock_alloc_send_skb(sk,
- hh_len + fragheaderlen + transhdrlen + 20,
- (flags & MSG_DONTWAIT), &err);
- if (skb == NULL)
- return -ENOMEM;
+ cork->base.dst = &rt->dst;
- /* reserve space for Hardware header */
- skb_reserve(skb, hh_len);
+ /*
+ * setup for corking
+ */
+ if (opt) {
+ if (WARN_ON(v6_cork->opt))
+ return -EINVAL;
- /* create space for UDP/IP header */
- skb_put(skb,fragheaderlen + transhdrlen);
+ nopt = v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
+ if (unlikely(!nopt))
+ return -ENOBUFS;
- /* initialize network header pointer */
- skb->nh.raw = skb->data;
+ nopt->tot_len = sizeof(*opt);
+ nopt->opt_flen = opt->opt_flen;
+ nopt->opt_nflen = opt->opt_nflen;
- /* initialize protocol header pointer */
- skb->h.raw = skb->data + fragheaderlen;
+ nopt->dst0opt = ip6_opt_dup(opt->dst0opt, sk->sk_allocation);
+ if (opt->dst0opt && !nopt->dst0opt)
+ return -ENOBUFS;
- skb->ip_summed = CHECKSUM_PARTIAL;
- skb->csum = 0;
- sk->sk_sndmsg_off = 0;
- }
+ nopt->dst1opt = ip6_opt_dup(opt->dst1opt, sk->sk_allocation);
+ if (opt->dst1opt && !nopt->dst1opt)
+ return -ENOBUFS;
- err = skb_append_datato_frags(sk,skb, getfrag, from,
- (length - transhdrlen));
- if (!err) {
- struct frag_hdr fhdr;
+ nopt->hopopt = ip6_opt_dup(opt->hopopt, sk->sk_allocation);
+ if (opt->hopopt && !nopt->hopopt)
+ return -ENOBUFS;
- /* specify the length of each IP datagram fragment*/
- skb_shinfo(skb)->gso_size = mtu - fragheaderlen -
- sizeof(struct frag_hdr);
- skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
- ipv6_select_ident(skb, &fhdr);
- skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
- __skb_queue_tail(&sk->sk_write_queue, skb);
+ nopt->srcrt = ip6_rthdr_dup(opt->srcrt, sk->sk_allocation);
+ if (opt->srcrt && !nopt->srcrt)
+ return -ENOBUFS;
- return 0;
+ /* need source address above miyazawa*/
}
- /* There is not enough support do UPD LSO,
- * so follow normal path
- */
- kfree_skb(skb);
+ v6_cork->hop_limit = ipc6->hlimit;
+ v6_cork->tclass = ipc6->tclass;
+ v6_cork->dontfrag = ipc6->dontfrag;
+ if (rt->dst.flags & DST_XFRM_TUNNEL)
+ mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ?
+ READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
+ else
+ mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ?
+ READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
+
+ frag_size = READ_ONCE(np->frag_size);
+ if (frag_size && frag_size < mtu)
+ mtu = frag_size;
+
+ cork->base.fragsize = mtu;
+ cork->base.gso_size = ipc6->gso_size;
+ cork->base.tx_flags = 0;
+ cork->base.mark = ipc6->sockc.mark;
+ cork->base.priority = ipc6->sockc.priority;
+ sock_tx_timestamp(sk, &ipc6->sockc, &cork->base.tx_flags);
+ if (ipc6->sockc.tsflags & SOCKCM_FLAG_TS_OPT_ID) {
+ cork->base.flags |= IPCORK_TS_OPT_ID;
+ cork->base.ts_opt_id = ipc6->sockc.ts_opt_id;
+ }
+ cork->base.length = 0;
+ cork->base.transmit_time = ipc6->sockc.transmit_time;
- return err;
+ return 0;
}
-int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
- int offset, int len, int odd, struct sk_buff *skb),
- void *from, int length, int transhdrlen,
- int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
- struct rt6_info *rt, unsigned int flags)
+static int __ip6_append_data(struct sock *sk,
+ struct sk_buff_head *queue,
+ struct inet_cork_full *cork_full,
+ struct inet6_cork *v6_cork,
+ struct page_frag *pfrag,
+ int getfrag(void *from, char *to, int offset,
+ int len, int odd, struct sk_buff *skb),
+ void *from, size_t length, int transhdrlen,
+ unsigned int flags)
{
- struct inet_sock *inet = inet_sk(sk);
- struct ipv6_pinfo *np = inet6_sk(sk);
- struct sk_buff *skb;
- unsigned int maxfraglen, fragheaderlen;
- int exthdrlen;
+ struct sk_buff *skb, *skb_prev = NULL;
+ struct inet_cork *cork = &cork_full->base;
+ struct flowi6 *fl6 = &cork_full->fl.u.ip6;
+ unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
+ struct ubuf_info *uarg = NULL;
+ int exthdrlen = 0;
+ int dst_exthdrlen = 0;
int hh_len;
- int mtu;
int copy;
int err;
int offset = 0;
+ bool zc = false;
+ u32 tskey = 0;
+ struct rt6_info *rt = dst_rt6_info(cork->dst);
+ bool paged, hold_tskey = false, extra_uref = false;
+ struct ipv6_txoptions *opt = v6_cork->opt;
int csummode = CHECKSUM_NONE;
+ unsigned int maxnonfragsize, headersize;
+ unsigned int wmem_alloc_delta = 0;
- if (flags&MSG_PROBE)
- return 0;
- if (skb_queue_empty(&sk->sk_write_queue)) {
- /*
- * setup for corking
- */
- if (opt) {
- if (np->cork.opt == NULL) {
- np->cork.opt = kmalloc(opt->tot_len,
- sk->sk_allocation);
- if (unlikely(np->cork.opt == NULL))
- return -ENOBUFS;
- } else if (np->cork.opt->tot_len < opt->tot_len) {
- printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
+ skb = skb_peek_tail(queue);
+ if (!skb) {
+ exthdrlen = opt ? opt->opt_flen : 0;
+ dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
+ }
+
+ paged = !!cork->gso_size;
+ mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
+ orig_mtu = mtu;
+
+ hh_len = LL_RESERVED_SPACE(rt->dst.dev);
+
+ fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
+ (opt ? opt->opt_nflen : 0);
+
+ headersize = sizeof(struct ipv6hdr) +
+ (opt ? opt->opt_flen + opt->opt_nflen : 0) +
+ rt->rt6i_nfheader_len;
+
+ if (mtu <= fragheaderlen ||
+ ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
+ goto emsgsize;
+
+ maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
+ sizeof(struct frag_hdr);
+
+ /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
+ * the first fragment
+ */
+ if (headersize + transhdrlen > mtu)
+ goto emsgsize;
+
+ if (cork->length + length > mtu - headersize && v6_cork->dontfrag &&
+ (sk->sk_protocol == IPPROTO_UDP ||
+ sk->sk_protocol == IPPROTO_ICMPV6 ||
+ sk->sk_protocol == IPPROTO_RAW)) {
+ ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
+ sizeof(struct ipv6hdr));
+ goto emsgsize;
+ }
+
+ if (ip6_sk_ignore_df(sk))
+ maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
+ else
+ maxnonfragsize = mtu;
+
+ if (cork->length + length > maxnonfragsize - headersize) {
+emsgsize:
+ pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
+ ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
+ return -EMSGSIZE;
+ }
+
+ /* CHECKSUM_PARTIAL only with no extension headers and when
+ * we are not going to fragment
+ */
+ if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
+ headersize == sizeof(struct ipv6hdr) &&
+ length <= mtu - headersize &&
+ (!(flags & MSG_MORE) || cork->gso_size) &&
+ rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
+ csummode = CHECKSUM_PARTIAL;
+
+ if ((flags & MSG_ZEROCOPY) && length) {
+ struct msghdr *msg = from;
+
+ if (getfrag == ip_generic_getfrag && msg->msg_ubuf) {
+ if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb))
return -EINVAL;
+
+ /* Leave uarg NULL if can't zerocopy, callers should
+ * be able to handle it.
+ */
+ if ((rt->dst.dev->features & NETIF_F_SG) &&
+ csummode == CHECKSUM_PARTIAL) {
+ paged = true;
+ zc = true;
+ uarg = msg->msg_ubuf;
+ }
+ } else if (sock_flag(sk, SOCK_ZEROCOPY)) {
+ uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb),
+ false);
+ if (!uarg)
+ return -ENOBUFS;
+ extra_uref = !skb_zcopy(skb); /* only ref on new uarg */
+ if (rt->dst.dev->features & NETIF_F_SG &&
+ csummode == CHECKSUM_PARTIAL) {
+ paged = true;
+ zc = true;
+ } else {
+ uarg_to_msgzc(uarg)->zerocopy = 0;
+ skb_zcopy_set(skb, uarg, &extra_uref);
}
- memcpy(np->cork.opt, opt, opt->tot_len);
- inet->cork.flags |= IPCORK_OPT;
- /* need source address above miyazawa*/
- }
- dst_hold(&rt->u.dst);
- np->cork.rt = rt;
- inet->cork.fl = *fl;
- np->cork.hop_limit = hlimit;
- np->cork.tclass = tclass;
- mtu = dst_mtu(rt->u.dst.path);
- if (np->frag_size < mtu) {
- if (np->frag_size)
- mtu = np->frag_size;
}
- inet->cork.fragsize = mtu;
- if (dst_allfrag(rt->u.dst.path))
- inet->cork.flags |= IPCORK_ALLFRAG;
- inet->cork.length = 0;
- sk->sk_sndmsg_page = NULL;
- sk->sk_sndmsg_off = 0;
- exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0);
- length += exthdrlen;
- transhdrlen += exthdrlen;
- } else {
- rt = np->cork.rt;
- fl = &inet->cork.fl;
- if (inet->cork.flags & IPCORK_OPT)
- opt = np->cork.opt;
- transhdrlen = 0;
- exthdrlen = 0;
- mtu = inet->cork.fragsize;
+ } else if ((flags & MSG_SPLICE_PAGES) && length) {
+ if (inet_test_bit(HDRINCL, sk))
+ return -EPERM;
+ if (rt->dst.dev->features & NETIF_F_SG &&
+ getfrag == ip_generic_getfrag)
+ /* We need an empty buffer to attach stuff to */
+ paged = true;
+ else
+ flags &= ~MSG_SPLICE_PAGES;
}
- hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
-
- fragheaderlen = sizeof(struct ipv6hdr) + rt->u.dst.nfheader_len + (opt ? opt->opt_nflen : 0);
- maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
-
- if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
- if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
- ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
- return -EMSGSIZE;
+ if (cork->tx_flags & SKBTX_ANY_TSTAMP &&
+ READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) {
+ if (cork->flags & IPCORK_TS_OPT_ID) {
+ tskey = cork->ts_opt_id;
+ } else {
+ tskey = atomic_inc_return(&sk->sk_tskey) - 1;
+ hold_tskey = true;
}
}
@@ -1066,34 +1578,23 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
* Otherwise, we need to reserve fragment header and
* fragment alignment (= 8-15 octects, in total).
*
- * Note that we may need to "move" the data from the tail of
- * of the buffer to the new fragment when we split
+ * Note that we may need to "move" the data from the tail
+ * of the buffer to the new fragment when we split
* the message.
*
- * FIXME: It may be fragmented into multiple chunks
+ * FIXME: It may be fragmented into multiple chunks
* at once if non-fragmentable extension headers
* are too large.
- * --yoshfuji
+ * --yoshfuji
*/
- inet->cork.length += length;
- if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
- (rt->u.dst.dev->features & NETIF_F_UFO)) {
-
- err = ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
- fragheaderlen, transhdrlen, mtu,
- flags);
- if (err)
- goto error;
- return 0;
- }
-
- if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
+ cork->length += length;
+ if (!skb)
goto alloc_new_skb;
while (length > 0) {
/* Check if the remaining data fits into current packet. */
- copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
+ copy = (cork->length <= mtu ? mtu : maxfraglen) - skb->len;
if (copy < length)
copy = maxfraglen - skb->len;
@@ -1102,283 +1603,480 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
unsigned int datalen;
unsigned int fraglen;
unsigned int fraggap;
- unsigned int alloclen;
- struct sk_buff *skb_prev;
+ unsigned int alloclen, alloc_extra;
+ unsigned int pagedlen;
alloc_new_skb:
- skb_prev = skb;
-
/* There's no room in the current skb */
- if (skb_prev)
- fraggap = skb_prev->len - maxfraglen;
+ if (skb)
+ fraggap = skb->len - maxfraglen;
else
fraggap = 0;
+ /* update mtu and maxfraglen if necessary */
+ if (!skb || !skb_prev)
+ ip6_append_data_mtu(&mtu, &maxfraglen,
+ fragheaderlen, skb, rt,
+ orig_mtu);
+
+ skb_prev = skb;
/*
* If remaining data exceeds the mtu,
* we know we need more fragment(s).
*/
datalen = length + fraggap;
- if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
- datalen = maxfraglen - fragheaderlen;
+ if (datalen > (cork->length <= mtu ? mtu : maxfraglen) - fragheaderlen)
+ datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
fraglen = datalen + fragheaderlen;
- if ((flags & MSG_MORE) &&
- !(rt->u.dst.dev->features&NETIF_F_SG))
- alloclen = mtu;
- else
- alloclen = datalen + fragheaderlen;
+ pagedlen = 0;
- /*
- * The last fragment gets additional space at tail.
- * Note: we overallocate on fragments with MSG_MODE
- * because we have no idea if we're the last one.
- */
- if (datalen == length + fraggap)
- alloclen += rt->u.dst.trailer_len;
+ alloc_extra = hh_len;
+ alloc_extra += dst_exthdrlen;
+ alloc_extra += rt->dst.trailer_len;
- /*
- * We just reserve space for fragment header.
- * Note: this may be overallocation if the message
+ /* We just reserve space for fragment header.
+ * Note: this may be overallocation if the message
* (without MSG_MORE) fits into the MTU.
*/
- alloclen += sizeof(struct frag_hdr);
+ alloc_extra += sizeof(struct frag_hdr);
+ if ((flags & MSG_MORE) &&
+ !(rt->dst.dev->features&NETIF_F_SG))
+ alloclen = mtu;
+ else if (!paged &&
+ (fraglen + alloc_extra < SKB_MAX_ALLOC ||
+ !(rt->dst.dev->features & NETIF_F_SG)))
+ alloclen = fraglen;
+ else {
+ alloclen = fragheaderlen + transhdrlen;
+ pagedlen = datalen - transhdrlen;
+ }
+ alloclen += alloc_extra;
+
+ if (datalen != length + fraggap) {
+ /*
+ * this is not the last fragment, the trailer
+ * space is regarded as data space.
+ */
+ datalen += rt->dst.trailer_len;
+ }
+
+ fraglen = datalen + fragheaderlen;
+
+ copy = datalen - transhdrlen - fraggap - pagedlen;
+ /* [!] NOTE: copy may be negative if pagedlen>0
+ * because then the equation may reduces to -fraggap.
+ */
+ if (copy < 0 && !(flags & MSG_SPLICE_PAGES)) {
+ err = -EINVAL;
+ goto error;
+ }
if (transhdrlen) {
- skb = sock_alloc_send_skb(sk,
- alloclen + hh_len,
+ skb = sock_alloc_send_skb(sk, alloclen,
(flags & MSG_DONTWAIT), &err);
} else {
skb = NULL;
- if (atomic_read(&sk->sk_wmem_alloc) <=
+ if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
2 * sk->sk_sndbuf)
- skb = sock_wmalloc(sk,
- alloclen + hh_len, 1,
- sk->sk_allocation);
- if (unlikely(skb == NULL))
+ skb = alloc_skb(alloclen,
+ sk->sk_allocation);
+ if (unlikely(!skb))
err = -ENOBUFS;
}
- if (skb == NULL)
+ if (!skb)
goto error;
/*
* Fill in the control structures
*/
+ skb->protocol = htons(ETH_P_IPV6);
skb->ip_summed = csummode;
skb->csum = 0;
- /* reserve for fragmentation */
- skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
+ /* reserve for fragmentation and ipsec header */
+ skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
+ dst_exthdrlen);
/*
* Find where to start putting bytes
*/
- data = skb_put(skb, fraglen);
- skb->nh.raw = data + exthdrlen;
+ data = skb_put(skb, fraglen - pagedlen);
+ skb_set_network_header(skb, exthdrlen);
data += fragheaderlen;
- skb->h.raw = data + exthdrlen;
-
+ skb->transport_header = (skb->network_header +
+ fragheaderlen);
if (fraggap) {
skb->csum = skb_copy_and_csum_bits(
skb_prev, maxfraglen,
- data + transhdrlen, fraggap, 0);
+ data + transhdrlen, fraggap);
skb_prev->csum = csum_sub(skb_prev->csum,
skb->csum);
data += fraggap;
pskb_trim_unique(skb_prev, maxfraglen);
}
- copy = datalen - transhdrlen - fraggap;
- if (copy < 0) {
- err = -EINVAL;
- kfree_skb(skb);
- goto error;
- } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
+ if (copy > 0 &&
+ INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
+ from, data + transhdrlen, offset,
+ copy, fraggap, skb) < 0) {
err = -EFAULT;
kfree_skb(skb);
goto error;
+ } else if (flags & MSG_SPLICE_PAGES) {
+ copy = 0;
}
offset += copy;
- length -= datalen - fraggap;
+ length -= copy + transhdrlen;
transhdrlen = 0;
exthdrlen = 0;
- csummode = CHECKSUM_NONE;
+ dst_exthdrlen = 0;
+
+ /* Only the initial fragment is time stamped */
+ skb_shinfo(skb)->tx_flags = cork->tx_flags;
+ cork->tx_flags = 0;
+ skb_shinfo(skb)->tskey = tskey;
+ tskey = 0;
+ skb_zcopy_set(skb, uarg, &extra_uref);
+
+ if ((flags & MSG_CONFIRM) && !skb_prev)
+ skb_set_dst_pending_confirm(skb, 1);
/*
* Put the packet on the pending queue
*/
- __skb_queue_tail(&sk->sk_write_queue, skb);
+ if (!skb->destructor) {
+ skb->destructor = sock_wfree;
+ skb->sk = sk;
+ wmem_alloc_delta += skb->truesize;
+ }
+ __skb_queue_tail(queue, skb);
continue;
}
if (copy > length)
copy = length;
- if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
+ if (!(rt->dst.dev->features&NETIF_F_SG) &&
+ skb_tailroom(skb) >= copy) {
unsigned int off;
off = skb->len;
- if (getfrag(from, skb_put(skb, copy),
- offset, copy, off, skb) < 0) {
+ if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
+ from, skb_put(skb, copy),
+ offset, copy, off, skb) < 0) {
__skb_trim(skb, off);
err = -EFAULT;
goto error;
}
- } else {
- int i = skb_shinfo(skb)->nr_frags;
- skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
- struct page *page = sk->sk_sndmsg_page;
- int off = sk->sk_sndmsg_off;
- unsigned int left;
-
- if (page && (left = PAGE_SIZE - off) > 0) {
- if (copy >= left)
- copy = left;
- if (page != frag->page) {
- if (i == MAX_SKB_FRAGS) {
- err = -EMSGSIZE;
- goto error;
- }
- get_page(page);
- skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
- frag = &skb_shinfo(skb)->frags[i];
- }
- } else if(i < MAX_SKB_FRAGS) {
- if (copy > PAGE_SIZE)
- copy = PAGE_SIZE;
- page = alloc_pages(sk->sk_allocation, 0);
- if (page == NULL) {
- err = -ENOMEM;
- goto error;
- }
- sk->sk_sndmsg_page = page;
- sk->sk_sndmsg_off = 0;
-
- skb_fill_page_desc(skb, i, page, 0, 0);
- frag = &skb_shinfo(skb)->frags[i];
- skb->truesize += PAGE_SIZE;
- atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
- } else {
- err = -EMSGSIZE;
+ } else if (flags & MSG_SPLICE_PAGES) {
+ struct msghdr *msg = from;
+
+ err = -EIO;
+ if (WARN_ON_ONCE(copy > msg->msg_iter.count))
goto error;
- }
- if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
- err = -EFAULT;
+
+ err = skb_splice_from_iter(skb, &msg->msg_iter, copy);
+ if (err < 0)
+ goto error;
+ copy = err;
+ wmem_alloc_delta += copy;
+ } else if (!zc) {
+ int i = skb_shinfo(skb)->nr_frags;
+
+ err = -ENOMEM;
+ if (!sk_page_frag_refill(sk, pfrag))
goto error;
+
+ skb_zcopy_downgrade_managed(skb);
+ if (!skb_can_coalesce(skb, i, pfrag->page,
+ pfrag->offset)) {
+ err = -EMSGSIZE;
+ if (i == MAX_SKB_FRAGS)
+ goto error;
+
+ __skb_fill_page_desc(skb, i, pfrag->page,
+ pfrag->offset, 0);
+ skb_shinfo(skb)->nr_frags = ++i;
+ get_page(pfrag->page);
}
- sk->sk_sndmsg_off += copy;
- frag->size += copy;
+ copy = min_t(int, copy, pfrag->size - pfrag->offset);
+ if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
+ from,
+ page_address(pfrag->page) + pfrag->offset,
+ offset, copy, skb->len, skb) < 0)
+ goto error_efault;
+
+ pfrag->offset += copy;
+ skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
skb->len += copy;
skb->data_len += copy;
+ skb->truesize += copy;
+ wmem_alloc_delta += copy;
+ } else {
+ err = skb_zerocopy_iter_dgram(skb, from, copy);
+ if (err < 0)
+ goto error;
}
offset += copy;
length -= copy;
}
+
+ if (wmem_alloc_delta)
+ refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
return 0;
+
+error_efault:
+ err = -EFAULT;
error:
- inet->cork.length -= length;
- IP6_INC_STATS(rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
+ net_zcopy_put_abort(uarg, extra_uref);
+ cork->length -= length;
+ IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
+ refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
+ if (hold_tskey)
+ atomic_dec(&sk->sk_tskey);
return err;
}
-int ip6_push_pending_frames(struct sock *sk)
+int ip6_append_data(struct sock *sk,
+ int getfrag(void *from, char *to, int offset, int len,
+ int odd, struct sk_buff *skb),
+ void *from, size_t length, int transhdrlen,
+ struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
+ struct rt6_info *rt, unsigned int flags)
{
- struct sk_buff *skb, *tmp_skb;
- struct sk_buff **tail_skb;
- struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
struct inet_sock *inet = inet_sk(sk);
struct ipv6_pinfo *np = inet6_sk(sk);
+ int exthdrlen;
+ int err;
+
+ if (flags&MSG_PROBE)
+ return 0;
+ if (skb_queue_empty(&sk->sk_write_queue)) {
+ /*
+ * setup for corking
+ */
+ dst_hold(&rt->dst);
+ err = ip6_setup_cork(sk, &inet->cork, &np->cork,
+ ipc6, rt);
+ if (err)
+ return err;
+
+ inet->cork.fl.u.ip6 = *fl6;
+ exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
+ length += exthdrlen;
+ transhdrlen += exthdrlen;
+ } else {
+ transhdrlen = 0;
+ }
+
+ return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork,
+ &np->cork, sk_page_frag(sk), getfrag,
+ from, length, transhdrlen, flags);
+}
+EXPORT_SYMBOL_GPL(ip6_append_data);
+
+static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork)
+{
+ struct dst_entry *dst = cork->base.dst;
+
+ cork->base.dst = NULL;
+ skb_dst_set(skb, dst);
+}
+
+static void ip6_cork_release(struct inet_cork_full *cork,
+ struct inet6_cork *v6_cork)
+{
+ if (v6_cork->opt) {
+ struct ipv6_txoptions *opt = v6_cork->opt;
+
+ kfree(opt->dst0opt);
+ kfree(opt->dst1opt);
+ kfree(opt->hopopt);
+ kfree(opt->srcrt);
+ kfree(opt);
+ v6_cork->opt = NULL;
+ }
+
+ if (cork->base.dst) {
+ dst_release(cork->base.dst);
+ cork->base.dst = NULL;
+ }
+}
+
+struct sk_buff *__ip6_make_skb(struct sock *sk,
+ struct sk_buff_head *queue,
+ struct inet_cork_full *cork,
+ struct inet6_cork *v6_cork)
+{
+ struct sk_buff *skb, *tmp_skb;
+ struct sk_buff **tail_skb;
+ struct in6_addr *final_dst;
+ struct net *net = sock_net(sk);
struct ipv6hdr *hdr;
- struct ipv6_txoptions *opt = np->cork.opt;
- struct rt6_info *rt = np->cork.rt;
- struct flowi *fl = &inet->cork.fl;
- unsigned char proto = fl->proto;
- int err = 0;
+ struct ipv6_txoptions *opt = v6_cork->opt;
+ struct rt6_info *rt = dst_rt6_info(cork->base.dst);
+ struct flowi6 *fl6 = &cork->fl.u.ip6;
+ unsigned char proto = fl6->flowi6_proto;
- if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
+ skb = __skb_dequeue(queue);
+ if (!skb)
goto out;
tail_skb = &(skb_shinfo(skb)->frag_list);
/* move skb->data to ip header from ext header */
- if (skb->data < skb->nh.raw)
- __skb_pull(skb, skb->nh.raw - skb->data);
- while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
- __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
+ if (skb->data < skb_network_header(skb))
+ __skb_pull(skb, skb_network_offset(skb));
+ while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
+ __skb_pull(tmp_skb, skb_network_header_len(skb));
*tail_skb = tmp_skb;
tail_skb = &(tmp_skb->next);
skb->len += tmp_skb->len;
skb->data_len += tmp_skb->len;
skb->truesize += tmp_skb->truesize;
- __sock_put(tmp_skb->sk);
tmp_skb->destructor = NULL;
tmp_skb->sk = NULL;
}
- ipv6_addr_copy(final_dst, &fl->fl6_dst);
- __skb_pull(skb, skb->h.raw - skb->nh.raw);
+ /* Allow local fragmentation. */
+ skb->ignore_df = ip6_sk_ignore_df(sk);
+ __skb_pull(skb, skb_network_header_len(skb));
+
+ final_dst = &fl6->daddr;
if (opt && opt->opt_flen)
ipv6_push_frag_opts(skb, opt, &proto);
if (opt && opt->opt_nflen)
- ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
+ ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
- skb->nh.ipv6h = hdr = (struct ipv6hdr*) skb_push(skb, sizeof(struct ipv6hdr));
-
- *(__be32*)hdr = fl->fl6_flowlabel |
- htonl(0x60000000 | ((int)np->cork.tclass << 20));
+ skb_push(skb, sizeof(struct ipv6hdr));
+ skb_reset_network_header(skb);
+ hdr = ipv6_hdr(skb);
- if (skb->len <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN)
- hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
- else
- hdr->payload_len = 0;
- hdr->hop_limit = np->cork.hop_limit;
+ ip6_flow_hdr(hdr, v6_cork->tclass,
+ ip6_make_flowlabel(net, skb, fl6->flowlabel,
+ ip6_autoflowlabel(net, sk), fl6));
+ hdr->hop_limit = v6_cork->hop_limit;
hdr->nexthdr = proto;
- ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
- ipv6_addr_copy(&hdr->daddr, final_dst);
+ hdr->saddr = fl6->saddr;
+ hdr->daddr = *final_dst;
+
+ skb->priority = cork->base.priority;
+ skb->mark = cork->base.mark;
+ if (sk_is_tcp(sk))
+ skb_set_delivery_time(skb, cork->base.transmit_time, SKB_CLOCK_MONOTONIC);
+ else
+ skb_set_delivery_type_by_clockid(skb, cork->base.transmit_time, sk->sk_clockid);
+
+ ip6_cork_steal_dst(skb, cork);
+ IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
+ if (proto == IPPROTO_ICMPV6) {
+ struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
+ u8 icmp6_type;
- skb->priority = sk->sk_priority;
+ if (sk->sk_socket->type == SOCK_RAW &&
+ !(fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH))
+ icmp6_type = fl6->fl6_icmp_type;
+ else
+ icmp6_type = icmp6_hdr(skb)->icmp6_type;
+ ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type);
+ ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
+ }
+
+ ip6_cork_release(cork, v6_cork);
+out:
+ return skb;
+}
+
+int ip6_send_skb(struct sk_buff *skb)
+{
+ struct net *net = sock_net(skb->sk);
+ struct rt6_info *rt = dst_rt6_info(skb_dst(skb));
+ int err;
- skb->dst = dst_clone(&rt->u.dst);
- IP6_INC_STATS(rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
- err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dst->dev, dst_output);
+ rcu_read_lock();
+ err = ip6_local_out(net, skb->sk, skb);
if (err) {
if (err > 0)
- err = np->recverr ? net_xmit_errno(err) : 0;
+ err = net_xmit_errno(err);
if (err)
- goto error;
+ IP6_INC_STATS(net, rt->rt6i_idev,
+ IPSTATS_MIB_OUTDISCARDS);
}
-out:
- inet->cork.flags &= ~IPCORK_OPT;
- kfree(np->cork.opt);
- np->cork.opt = NULL;
- if (np->cork.rt) {
- dst_release(&np->cork.rt->u.dst);
- np->cork.rt = NULL;
- inet->cork.flags &= ~IPCORK_ALLFRAG;
- }
- memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
+ rcu_read_unlock();
return err;
-error:
- goto out;
}
-void ip6_flush_pending_frames(struct sock *sk)
+int ip6_push_pending_frames(struct sock *sk)
+{
+ struct sk_buff *skb;
+
+ skb = ip6_finish_skb(sk);
+ if (!skb)
+ return 0;
+
+ return ip6_send_skb(skb);
+}
+EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
+
+static void __ip6_flush_pending_frames(struct sock *sk,
+ struct sk_buff_head *queue,
+ struct inet_cork_full *cork,
+ struct inet6_cork *v6_cork)
{
- struct inet_sock *inet = inet_sk(sk);
- struct ipv6_pinfo *np = inet6_sk(sk);
struct sk_buff *skb;
- while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
- IP6_INC_STATS(ip6_dst_idev(skb->dst),
- IPSTATS_MIB_OUTDISCARDS);
+ while ((skb = __skb_dequeue_tail(queue)) != NULL) {
+ if (skb_dst(skb))
+ IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
+ IPSTATS_MIB_OUTDISCARDS);
kfree_skb(skb);
}
- inet->cork.flags &= ~IPCORK_OPT;
+ ip6_cork_release(cork, v6_cork);
+}
+
+void ip6_flush_pending_frames(struct sock *sk)
+{
+ __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
+ &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
+}
+EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
+
+struct sk_buff *ip6_make_skb(struct sock *sk,
+ int getfrag(void *from, char *to, int offset,
+ int len, int odd, struct sk_buff *skb),
+ void *from, size_t length, int transhdrlen,
+ struct ipcm6_cookie *ipc6, struct rt6_info *rt,
+ unsigned int flags, struct inet_cork_full *cork)
+{
+ struct inet6_cork v6_cork;
+ struct sk_buff_head queue;
+ int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
+ int err;
- kfree(np->cork.opt);
- np->cork.opt = NULL;
- if (np->cork.rt) {
- dst_release(&np->cork.rt->u.dst);
- np->cork.rt = NULL;
- inet->cork.flags &= ~IPCORK_ALLFRAG;
+ if (flags & MSG_PROBE) {
+ dst_release(&rt->dst);
+ return NULL;
}
- memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
+
+ __skb_queue_head_init(&queue);
+
+ cork->base.flags = 0;
+ cork->base.addr = 0;
+ cork->base.opt = NULL;
+ v6_cork.opt = NULL;
+ err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt);
+ if (err) {
+ ip6_cork_release(cork, &v6_cork);
+ return ERR_PTR(err);
+ }
+
+ err = __ip6_append_data(sk, &queue, cork, &v6_cork,
+ &current->task_frag, getfrag, from,
+ length + exthdrlen, transhdrlen + exthdrlen,
+ flags);
+ if (err) {
+ __ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
+ return ERR_PTR(err);
+ }
+
+ return __ip6_make_skb(sk, &queue, cork, &v6_cork);
}
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 8d918348f5bb..6405072050e0 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1,33 +1,29 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * IPv6 over IPv6 tunnel device
+ * IPv6 tunneling device
* Linux INET6 implementation
*
* Authors:
- * Ville Nuorvala <vnuorval@tcs.hut.fi>
- *
- * $Id$
+ * Ville Nuorvala <vnuorval@tcs.hut.fi>
+ * Yasuyuki Kozakai <kozakai@linux-ipv6.org>
*
* Based on:
- * linux/net/ipv6/sit.c
+ * linux/net/ipv6/sit.c and linux/net/ipv4/ipip.c
*
* RFC 2473
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/module.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/sockios.h>
+#include <linux/icmp.h>
#include <linux/if.h>
#include <linux/in.h>
#include <linux/ip.h>
-#include <linux/if_tunnel.h>
#include <linux/net.h>
#include <linux/in6.h>
#include <linux/netdevice.h>
@@ -37,11 +33,16 @@
#include <linux/route.h>
#include <linux/rtnetlink.h>
#include <linux/netfilter_ipv6.h>
+#include <linux/slab.h>
+#include <linux/hash.h>
+#include <linux/etherdevice.h>
-#include <asm/uaccess.h>
-#include <asm/atomic.h>
+#include <linux/uaccess.h>
+#include <linux/atomic.h>
+#include <net/icmp.h>
#include <net/ip.h>
+#include <net/ip_tunnels.h>
#include <net/ipv6.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
@@ -49,315 +50,398 @@
#include <net/xfrm.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/netdev_lock.h>
+#include <net/dst_metadata.h>
+#include <net/inet_dscp.h>
MODULE_AUTHOR("Ville Nuorvala");
-MODULE_DESCRIPTION("IPv6-in-IPv6 tunnel");
+MODULE_DESCRIPTION("IPv6 tunneling device");
MODULE_LICENSE("GPL");
+MODULE_ALIAS_RTNL_LINK("ip6tnl");
+MODULE_ALIAS_NETDEV("ip6tnl0");
-#define IPV6_TLV_TEL_DST_SIZE 8
-
-#ifdef IP6_TNL_DEBUG
-#define IP6_TNL_TRACE(x...) printk(KERN_DEBUG "%s:" x "\n", __FUNCTION__)
-#else
-#define IP6_TNL_TRACE(x...) do {;} while(0)
-#endif
-
-#define IPV6_TCLASS_MASK (IPV6_FLOWINFO_MASK & ~IPV6_FLOWLABEL_MASK)
-
-#define HASH_SIZE 32
-
-#define HASH(addr) ((__force u32)((addr)->s6_addr32[0] ^ (addr)->s6_addr32[1] ^ \
- (addr)->s6_addr32[2] ^ (addr)->s6_addr32[3]) & \
- (HASH_SIZE - 1))
-
-static int ip6ip6_fb_tnl_dev_init(struct net_device *dev);
-static int ip6ip6_tnl_dev_init(struct net_device *dev);
-static void ip6ip6_tnl_dev_setup(struct net_device *dev);
+#define IP6_TUNNEL_HASH_SIZE_SHIFT 5
+#define IP6_TUNNEL_HASH_SIZE (1 << IP6_TUNNEL_HASH_SIZE_SHIFT)
-/* the IPv6 tunnel fallback device */
-static struct net_device *ip6ip6_fb_tnl_dev;
+static bool log_ecn_error = true;
+module_param(log_ecn_error, bool, 0644);
+MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
-
-/* lists for storing tunnels in use */
-static struct ip6_tnl *tnls_r_l[HASH_SIZE];
-static struct ip6_tnl *tnls_wc[1];
-static struct ip6_tnl **tnls[2] = { tnls_wc, tnls_r_l };
-
-/* lock for the tunnel lists */
-static DEFINE_RWLOCK(ip6ip6_lock);
-
-static inline struct dst_entry *ip6_tnl_dst_check(struct ip6_tnl *t)
+static u32 HASH(const struct in6_addr *addr1, const struct in6_addr *addr2)
{
- struct dst_entry *dst = t->dst_cache;
-
- if (dst && dst->obsolete &&
- dst->ops->check(dst, t->dst_cookie) == NULL) {
- t->dst_cache = NULL;
- dst_release(dst);
- return NULL;
- }
+ u32 hash = ipv6_addr_hash(addr1) ^ ipv6_addr_hash(addr2);
- return dst;
+ return hash_32(hash, IP6_TUNNEL_HASH_SIZE_SHIFT);
}
-static inline void ip6_tnl_dst_reset(struct ip6_tnl *t)
-{
- dst_release(t->dst_cache);
- t->dst_cache = NULL;
-}
+static int ip6_tnl_dev_init(struct net_device *dev);
+static void ip6_tnl_dev_setup(struct net_device *dev);
+static struct rtnl_link_ops ip6_link_ops __read_mostly;
+
+static unsigned int ip6_tnl_net_id __read_mostly;
+struct ip6_tnl_net {
+ /* the IPv6 tunnel fallback device */
+ struct net_device *fb_tnl_dev;
+ /* lists for storing tunnels in use */
+ struct ip6_tnl __rcu *tnls_r_l[IP6_TUNNEL_HASH_SIZE];
+ struct ip6_tnl __rcu *tnls_wc[1];
+ struct ip6_tnl __rcu **tnls[2];
+ struct ip6_tnl __rcu *collect_md_tun;
+};
-static inline void ip6_tnl_dst_store(struct ip6_tnl *t, struct dst_entry *dst)
+static inline int ip6_tnl_mpls_supported(void)
{
- struct rt6_info *rt = (struct rt6_info *) dst;
- t->dst_cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0;
- dst_release(t->dst_cache);
- t->dst_cache = dst;
+ return IS_ENABLED(CONFIG_MPLS);
}
+#define for_each_ip6_tunnel_rcu(start) \
+ for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
+
/**
- * ip6ip6_tnl_lookup - fetch tunnel matching the end-point addresses
- * @remote: the address of the tunnel exit-point
- * @local: the address of the tunnel entry-point
+ * ip6_tnl_lookup - fetch tunnel matching the end-point addresses
+ * @net: network namespace
+ * @link: ifindex of underlying interface
+ * @remote: the address of the tunnel exit-point
+ * @local: the address of the tunnel entry-point
*
- * Return:
+ * Return:
* tunnel matching given end-points if found,
- * else fallback tunnel if its device is up,
+ * else fallback tunnel if its device is up,
* else %NULL
**/
static struct ip6_tnl *
-ip6ip6_tnl_lookup(struct in6_addr *remote, struct in6_addr *local)
+ip6_tnl_lookup(struct net *net, int link,
+ const struct in6_addr *remote, const struct in6_addr *local)
{
- unsigned h0 = HASH(remote);
- unsigned h1 = HASH(local);
- struct ip6_tnl *t;
+ unsigned int hash = HASH(remote, local);
+ struct ip6_tnl *t, *cand = NULL;
+ struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
+ struct in6_addr any;
+
+ for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) {
+ if (!ipv6_addr_equal(local, &t->parms.laddr) ||
+ !ipv6_addr_equal(remote, &t->parms.raddr) ||
+ !(t->dev->flags & IFF_UP))
+ continue;
+
+ if (link == t->parms.link)
+ return t;
+ else
+ cand = t;
+ }
- for (t = tnls_r_l[h0 ^ h1]; t; t = t->next) {
- if (ipv6_addr_equal(local, &t->parms.laddr) &&
- ipv6_addr_equal(remote, &t->parms.raddr) &&
- (t->dev->flags & IFF_UP))
+ memset(&any, 0, sizeof(any));
+ hash = HASH(&any, local);
+ for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) {
+ if (!ipv6_addr_equal(local, &t->parms.laddr) ||
+ !ipv6_addr_any(&t->parms.raddr) ||
+ !(t->dev->flags & IFF_UP))
+ continue;
+
+ if (link == t->parms.link)
+ return t;
+ else if (!cand)
+ cand = t;
+ }
+
+ hash = HASH(remote, &any);
+ for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) {
+ if (!ipv6_addr_equal(remote, &t->parms.raddr) ||
+ !ipv6_addr_any(&t->parms.laddr) ||
+ !(t->dev->flags & IFF_UP))
+ continue;
+
+ if (link == t->parms.link)
return t;
+ else if (!cand)
+ cand = t;
}
- if ((t = tnls_wc[0]) != NULL && (t->dev->flags & IFF_UP))
+
+ if (cand)
+ return cand;
+
+ t = rcu_dereference(ip6n->collect_md_tun);
+ if (t && t->dev->flags & IFF_UP)
+ return t;
+
+ t = rcu_dereference(ip6n->tnls_wc[0]);
+ if (t && (t->dev->flags & IFF_UP))
return t;
return NULL;
}
/**
- * ip6ip6_bucket - get head of list matching given tunnel parameters
- * @p: parameters containing tunnel end-points
+ * ip6_tnl_bucket - get head of list matching given tunnel parameters
+ * @ip6n: the private data for ip6_vti in the netns
+ * @p: parameters containing tunnel end-points
*
* Description:
- * ip6ip6_bucket() returns the head of the list matching the
+ * ip6_tnl_bucket() returns the head of the list matching the
* &struct in6_addr entries laddr and raddr in @p.
*
- * Return: head of IPv6 tunnel list
+ * Return: head of IPv6 tunnel list
**/
-static struct ip6_tnl **
-ip6ip6_bucket(struct ip6_tnl_parm *p)
+static struct ip6_tnl __rcu **
+ip6_tnl_bucket(struct ip6_tnl_net *ip6n, const struct __ip6_tnl_parm *p)
{
- struct in6_addr *remote = &p->raddr;
- struct in6_addr *local = &p->laddr;
- unsigned h = 0;
+ const struct in6_addr *remote = &p->raddr;
+ const struct in6_addr *local = &p->laddr;
+ unsigned int h = 0;
int prio = 0;
if (!ipv6_addr_any(remote) || !ipv6_addr_any(local)) {
prio = 1;
- h = HASH(remote) ^ HASH(local);
+ h = HASH(remote, local);
}
- return &tnls[prio][h];
+ return &ip6n->tnls[prio][h];
}
/**
- * ip6ip6_tnl_link - add tunnel to hash table
+ * ip6_tnl_link - add tunnel to hash table
+ * @ip6n: the private data for ip6_vti in the netns
* @t: tunnel to be added
**/
static void
-ip6ip6_tnl_link(struct ip6_tnl *t)
+ip6_tnl_link(struct ip6_tnl_net *ip6n, struct ip6_tnl *t)
{
- struct ip6_tnl **tp = ip6ip6_bucket(&t->parms);
+ struct ip6_tnl __rcu **tp = ip6_tnl_bucket(ip6n, &t->parms);
- t->next = *tp;
- write_lock_bh(&ip6ip6_lock);
- *tp = t;
- write_unlock_bh(&ip6ip6_lock);
+ if (t->parms.collect_md)
+ rcu_assign_pointer(ip6n->collect_md_tun, t);
+ rcu_assign_pointer(t->next , rtnl_dereference(*tp));
+ rcu_assign_pointer(*tp, t);
}
/**
- * ip6ip6_tnl_unlink - remove tunnel from hash table
+ * ip6_tnl_unlink - remove tunnel from hash table
+ * @ip6n: the private data for ip6_vti in the netns
* @t: tunnel to be removed
**/
static void
-ip6ip6_tnl_unlink(struct ip6_tnl *t)
+ip6_tnl_unlink(struct ip6_tnl_net *ip6n, struct ip6_tnl *t)
{
- struct ip6_tnl **tp;
+ struct ip6_tnl __rcu **tp;
+ struct ip6_tnl *iter;
- for (tp = ip6ip6_bucket(&t->parms); *tp; tp = &(*tp)->next) {
- if (t == *tp) {
- write_lock_bh(&ip6ip6_lock);
- *tp = t->next;
- write_unlock_bh(&ip6ip6_lock);
+ if (t->parms.collect_md)
+ rcu_assign_pointer(ip6n->collect_md_tun, NULL);
+
+ for (tp = ip6_tnl_bucket(ip6n, &t->parms);
+ (iter = rtnl_dereference(*tp)) != NULL;
+ tp = &iter->next) {
+ if (t == iter) {
+ rcu_assign_pointer(*tp, t->next);
break;
}
}
}
+static void ip6_dev_free(struct net_device *dev)
+{
+ struct ip6_tnl *t = netdev_priv(dev);
+
+ gro_cells_destroy(&t->gro_cells);
+ dst_cache_destroy(&t->dst_cache);
+}
+
+static int ip6_tnl_create2(struct net_device *dev)
+{
+ struct ip6_tnl *t = netdev_priv(dev);
+ struct ip6_tnl_net *ip6n = net_generic(t->net, ip6_tnl_net_id);
+ int err;
+
+ dev->rtnl_link_ops = &ip6_link_ops;
+ err = register_netdevice(dev);
+ if (err < 0)
+ goto out;
+
+ strcpy(t->parms.name, dev->name);
+
+ ip6_tnl_link(ip6n, t);
+ return 0;
+
+out:
+ return err;
+}
+
/**
- * ip6_tnl_create() - create a new tunnel
+ * ip6_tnl_create - create a new tunnel
+ * @net: network namespace
* @p: tunnel parameters
- * @pt: pointer to new tunnel
*
* Description:
* Create tunnel matching given parameters.
- *
- * Return:
- * created tunnel or NULL
+ *
+ * Return:
+ * created tunnel or error pointer
**/
-static struct ip6_tnl *ip6_tnl_create(struct ip6_tnl_parm *p)
+static struct ip6_tnl *ip6_tnl_create(struct net *net, struct __ip6_tnl_parm *p)
{
struct net_device *dev;
struct ip6_tnl *t;
char name[IFNAMSIZ];
- int err;
+ int err = -E2BIG;
if (p->name[0]) {
- strlcpy(name, p->name, IFNAMSIZ);
- } else {
- int i;
- for (i = 1; i < IP6_TNL_MAX; i++) {
- sprintf(name, "ip6tnl%d", i);
- if (__dev_get_by_name(name) == NULL)
- break;
- }
- if (i == IP6_TNL_MAX)
+ if (!dev_valid_name(p->name))
goto failed;
+ strscpy(name, p->name, IFNAMSIZ);
+ } else {
+ sprintf(name, "ip6tnl%%d");
}
- dev = alloc_netdev(sizeof (*t), name, ip6ip6_tnl_dev_setup);
- if (dev == NULL)
+ err = -ENOMEM;
+ dev = alloc_netdev(sizeof(*t), name, NET_NAME_UNKNOWN,
+ ip6_tnl_dev_setup);
+ if (!dev)
goto failed;
+ dev_net_set(dev, net);
+
t = netdev_priv(dev);
- dev->init = ip6ip6_tnl_dev_init;
t->parms = *p;
+ t->net = dev_net(dev);
+ err = ip6_tnl_create2(dev);
+ if (err < 0)
+ goto failed_free;
- if ((err = register_netdevice(dev)) < 0) {
- free_netdev(dev);
- goto failed;
- }
- dev_hold(dev);
- ip6ip6_tnl_link(t);
return t;
+
+failed_free:
+ free_netdev(dev);
failed:
- return NULL;
+ return ERR_PTR(err);
}
/**
- * ip6ip6_tnl_locate - find or create tunnel matching given parameters
- * @p: tunnel parameters
+ * ip6_tnl_locate - find or create tunnel matching given parameters
+ * @net: network namespace
+ * @p: tunnel parameters
* @create: != 0 if allowed to create new tunnel if no match found
*
* Description:
- * ip6ip6_tnl_locate() first tries to locate an existing tunnel
+ * ip6_tnl_locate() first tries to locate an existing tunnel
* based on @parms. If this is unsuccessful, but @create is set a new
* tunnel device is created and registered for use.
*
* Return:
- * matching tunnel or NULL
+ * matching tunnel or error pointer
**/
-static struct ip6_tnl *ip6ip6_tnl_locate(struct ip6_tnl_parm *p, int create)
+static struct ip6_tnl *ip6_tnl_locate(struct net *net,
+ struct __ip6_tnl_parm *p, int create)
{
- struct in6_addr *remote = &p->raddr;
- struct in6_addr *local = &p->laddr;
+ const struct in6_addr *remote = &p->raddr;
+ const struct in6_addr *local = &p->laddr;
+ struct ip6_tnl __rcu **tp;
struct ip6_tnl *t;
+ struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
- for (t = *ip6ip6_bucket(p); t; t = t->next) {
+ for (tp = ip6_tnl_bucket(ip6n, p);
+ (t = rtnl_dereference(*tp)) != NULL;
+ tp = &t->next) {
if (ipv6_addr_equal(local, &t->parms.laddr) &&
- ipv6_addr_equal(remote, &t->parms.raddr))
+ ipv6_addr_equal(remote, &t->parms.raddr) &&
+ p->link == t->parms.link) {
+ if (create)
+ return ERR_PTR(-EEXIST);
+
return t;
+ }
}
if (!create)
- return NULL;
- return ip6_tnl_create(p);
+ return ERR_PTR(-ENODEV);
+ return ip6_tnl_create(net, p);
}
/**
- * ip6ip6_tnl_dev_uninit - tunnel device uninitializer
+ * ip6_tnl_dev_uninit - tunnel device uninitializer
* @dev: the device to be destroyed
- *
+ *
* Description:
- * ip6ip6_tnl_dev_uninit() removes tunnel from its list
+ * ip6_tnl_dev_uninit() removes tunnel from its list
**/
static void
-ip6ip6_tnl_dev_uninit(struct net_device *dev)
+ip6_tnl_dev_uninit(struct net_device *dev)
{
struct ip6_tnl *t = netdev_priv(dev);
+ struct net *net = t->net;
+ struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
- if (dev == ip6ip6_fb_tnl_dev) {
- write_lock_bh(&ip6ip6_lock);
- tnls_wc[0] = NULL;
- write_unlock_bh(&ip6ip6_lock);
- } else {
- ip6ip6_tnl_unlink(t);
- }
- ip6_tnl_dst_reset(t);
- dev_put(dev);
+ if (dev == ip6n->fb_tnl_dev)
+ RCU_INIT_POINTER(ip6n->tnls_wc[0], NULL);
+ else
+ ip6_tnl_unlink(ip6n, t);
+ dst_cache_reset(&t->dst_cache);
+ netdev_put(dev, &t->dev_tracker);
}
/**
- * parse_tvl_tnl_enc_lim - handle encapsulation limit option
+ * ip6_tnl_parse_tlv_enc_lim - handle encapsulation limit option
* @skb: received socket buffer
+ * @raw: the ICMPv6 error message data
*
- * Return:
- * 0 if none was found,
+ * Return:
+ * 0 if none was found,
* else index to encapsulation limit
**/
-static __u16
-parse_tlv_tnl_enc_lim(struct sk_buff *skb, __u8 * raw)
+__u16 ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw)
{
- struct ipv6hdr *ipv6h = (struct ipv6hdr *) raw;
- __u8 nexthdr = ipv6h->nexthdr;
- __u16 off = sizeof (*ipv6h);
+ const struct ipv6hdr *ipv6h = (const struct ipv6hdr *)raw;
+ unsigned int nhoff = raw - skb->data;
+ unsigned int off = nhoff + sizeof(*ipv6h);
+ u8 nexthdr = ipv6h->nexthdr;
while (ipv6_ext_hdr(nexthdr) && nexthdr != NEXTHDR_NONE) {
- __u16 optlen = 0;
struct ipv6_opt_hdr *hdr;
- if (raw + off + sizeof (*hdr) > skb->data &&
- !pskb_may_pull(skb, raw - skb->data + off + sizeof (*hdr)))
+ u16 optlen;
+
+ if (!pskb_may_pull(skb, off + sizeof(*hdr)))
break;
- hdr = (struct ipv6_opt_hdr *) (raw + off);
+ hdr = (struct ipv6_opt_hdr *)(skb->data + off);
if (nexthdr == NEXTHDR_FRAGMENT) {
- struct frag_hdr *frag_hdr = (struct frag_hdr *) hdr;
- if (frag_hdr->frag_off)
- break;
optlen = 8;
} else if (nexthdr == NEXTHDR_AUTH) {
- optlen = (hdr->hdrlen + 2) << 2;
+ optlen = ipv6_authlen(hdr);
} else {
optlen = ipv6_optlen(hdr);
}
+
+ if (!pskb_may_pull(skb, off + optlen))
+ break;
+
+ hdr = (struct ipv6_opt_hdr *)(skb->data + off);
+ if (nexthdr == NEXTHDR_FRAGMENT) {
+ struct frag_hdr *frag_hdr = (struct frag_hdr *)hdr;
+
+ if (frag_hdr->frag_off)
+ break;
+ }
if (nexthdr == NEXTHDR_DEST) {
- __u16 i = off + 2;
+ u16 i = 2;
+
while (1) {
struct ipv6_tlv_tnl_enc_lim *tel;
/* No more room for encapsulation limit */
- if (i + sizeof (*tel) > off + optlen)
+ if (i + sizeof(*tel) > optlen)
break;
- tel = (struct ipv6_tlv_tnl_enc_lim *) &raw[i];
+ tel = (struct ipv6_tlv_tnl_enc_lim *)(skb->data + off + i);
/* return index of option if found and valid */
if (tel->type == IPV6_TLV_TNL_ENCAP_LIMIT &&
tel->length == 1)
- return i;
+ return i + off - nhoff;
/* else jump to next option */
if (tel->type)
i += tel->length + 2;
@@ -370,87 +454,85 @@ parse_tlv_tnl_enc_lim(struct sk_buff *skb, __u8 * raw)
}
return 0;
}
+EXPORT_SYMBOL(ip6_tnl_parse_tlv_enc_lim);
-/**
- * ip6ip6_err - tunnel error handler
- *
- * Description:
- * ip6ip6_err() should handle errors in the tunnel according
- * to the specifications in RFC 2473.
- **/
-
+/* ip6_tnl_err() should handle errors in the tunnel according to the
+ * specifications in RFC 2473.
+ */
static int
-ip6ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
- int type, int code, int offset, __be32 info)
+ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt,
+ u8 *type, u8 *code, int *msg, __u32 *info, int offset)
{
- struct ipv6hdr *ipv6h = (struct ipv6hdr *) skb->data;
+ const struct ipv6hdr *ipv6h = (const struct ipv6hdr *)skb->data;
+ struct net *net = dev_net(skb->dev);
+ u8 rel_type = ICMPV6_DEST_UNREACH;
+ u8 rel_code = ICMPV6_ADDR_UNREACH;
+ __u32 rel_info = 0;
struct ip6_tnl *t;
+ int err = -ENOENT;
int rel_msg = 0;
- int rel_type = ICMPV6_DEST_UNREACH;
- int rel_code = ICMPV6_ADDR_UNREACH;
- __u32 rel_info = 0;
+ u8 tproto;
__u16 len;
- int err = -ENOENT;
- /* If the packet doesn't contain the original IPv6 header we are
- in trouble since we might need the source address for further
+ /* If the packet doesn't contain the original IPv6 header we are
+ in trouble since we might need the source address for further
processing of the error. */
- read_lock(&ip6ip6_lock);
- if ((t = ip6ip6_tnl_lookup(&ipv6h->daddr, &ipv6h->saddr)) == NULL)
+ rcu_read_lock();
+ t = ip6_tnl_lookup(dev_net(skb->dev), skb->dev->ifindex, &ipv6h->daddr, &ipv6h->saddr);
+ if (!t)
+ goto out;
+
+ tproto = READ_ONCE(t->parms.proto);
+ if (tproto != ipproto && tproto != 0)
goto out;
err = 0;
- switch (type) {
- __u32 teli;
- struct ipv6_tlv_tnl_enc_lim *tel;
- __u32 mtu;
+ switch (*type) {
case ICMPV6_DEST_UNREACH:
- if (net_ratelimit())
- printk(KERN_WARNING
- "%s: Path to destination invalid "
- "or inactive!\n", t->parms.name);
+ net_dbg_ratelimited("%s: Path to destination invalid or inactive!\n",
+ t->parms.name);
rel_msg = 1;
break;
case ICMPV6_TIME_EXCEED:
- if (code == ICMPV6_EXC_HOPLIMIT) {
- if (net_ratelimit())
- printk(KERN_WARNING
- "%s: Too small hop limit or "
- "routing loop in tunnel!\n",
- t->parms.name);
+ if ((*code) == ICMPV6_EXC_HOPLIMIT) {
+ net_dbg_ratelimited("%s: Too small hop limit or routing loop in tunnel!\n",
+ t->parms.name);
rel_msg = 1;
}
break;
- case ICMPV6_PARAMPROB:
+ case ICMPV6_PARAMPROB: {
+ struct ipv6_tlv_tnl_enc_lim *tel;
+ __u32 teli;
+
teli = 0;
- if (code == ICMPV6_HDR_FIELD)
- teli = parse_tlv_tnl_enc_lim(skb, skb->data);
+ if ((*code) == ICMPV6_HDR_FIELD)
+ teli = ip6_tnl_parse_tlv_enc_lim(skb, skb->data);
- if (teli && teli == ntohl(info) - 2) {
+ if (teli && teli == *info - 2) {
tel = (struct ipv6_tlv_tnl_enc_lim *) &skb->data[teli];
if (tel->encap_limit == 0) {
- if (net_ratelimit())
- printk(KERN_WARNING
- "%s: Too small encapsulation "
- "limit or routing loop in "
- "tunnel!\n", t->parms.name);
+ net_dbg_ratelimited("%s: Too small encapsulation limit or routing loop in tunnel!\n",
+ t->parms.name);
rel_msg = 1;
}
- } else if (net_ratelimit()) {
- printk(KERN_WARNING
- "%s: Recipient unable to parse tunneled "
- "packet!\n ", t->parms.name);
+ } else {
+ net_dbg_ratelimited("%s: Recipient unable to parse tunneled packet!\n",
+ t->parms.name);
}
break;
- case ICMPV6_PKT_TOOBIG:
- mtu = ntohl(info) - offset;
+ }
+ case ICMPV6_PKT_TOOBIG: {
+ __u32 mtu;
+
+ ip6_update_pmtu(skb, net, htonl(*info), 0, 0,
+ sock_net_uid(net, NULL));
+ mtu = *info - offset;
if (mtu < IPV6_MIN_MTU)
mtu = IPV6_MIN_MTU;
- t->dev->mtu = mtu;
-
- if ((len = sizeof (*ipv6h) + ntohs(ipv6h->payload_len)) > mtu) {
+ len = sizeof(*ipv6h) + ntohs(ipv6h->payload_len);
+ if (len > mtu) {
rel_type = ICMPV6_PKT_TOOBIG;
rel_code = 0;
rel_info = mtu;
@@ -458,121 +540,451 @@ ip6ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
}
break;
}
- if (rel_msg && pskb_may_pull(skb, offset + sizeof (*ipv6h))) {
+ case NDISC_REDIRECT:
+ ip6_redirect(skb, net, skb->dev->ifindex, 0,
+ sock_net_uid(net, NULL));
+ break;
+ }
+
+ *type = rel_type;
+ *code = rel_code;
+ *info = rel_info;
+ *msg = rel_msg;
+
+out:
+ rcu_read_unlock();
+ return err;
+}
+
+static int
+ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+ u8 type, u8 code, int offset, __be32 info)
+{
+ __u32 rel_info = ntohl(info);
+ const struct iphdr *eiph;
+ struct sk_buff *skb2;
+ int err, rel_msg = 0;
+ u8 rel_type = type;
+ u8 rel_code = code;
+ struct rtable *rt;
+ struct flowi4 fl4;
+
+ err = ip6_tnl_err(skb, IPPROTO_IPIP, opt, &rel_type, &rel_code,
+ &rel_msg, &rel_info, offset);
+ if (err < 0)
+ return err;
+
+ if (rel_msg == 0)
+ return 0;
+
+ switch (rel_type) {
+ case ICMPV6_DEST_UNREACH:
+ if (rel_code != ICMPV6_ADDR_UNREACH)
+ return 0;
+ rel_type = ICMP_DEST_UNREACH;
+ rel_code = ICMP_HOST_UNREACH;
+ break;
+ case ICMPV6_PKT_TOOBIG:
+ if (rel_code != 0)
+ return 0;
+ rel_type = ICMP_DEST_UNREACH;
+ rel_code = ICMP_FRAG_NEEDED;
+ break;
+ default:
+ return 0;
+ }
+
+ if (!pskb_may_pull(skb, offset + sizeof(struct iphdr)))
+ return 0;
+
+ skb2 = skb_clone(skb, GFP_ATOMIC);
+ if (!skb2)
+ return 0;
+
+ skb_dst_drop(skb2);
+
+ skb_pull(skb2, offset);
+ skb_reset_network_header(skb2);
+ eiph = ip_hdr(skb2);
+
+ /* Try to guess incoming interface */
+ rt = ip_route_output_ports(dev_net(skb->dev), &fl4, NULL, eiph->saddr,
+ 0, 0, 0, IPPROTO_IPIP,
+ eiph->tos & INET_DSCP_MASK, 0);
+ if (IS_ERR(rt))
+ goto out;
+
+ skb2->dev = rt->dst.dev;
+ ip_rt_put(rt);
+
+ /* route "incoming" packet */
+ if (rt->rt_flags & RTCF_LOCAL) {
+ rt = ip_route_output_ports(dev_net(skb->dev), &fl4, NULL,
+ eiph->daddr, eiph->saddr, 0, 0,
+ IPPROTO_IPIP,
+ eiph->tos & INET_DSCP_MASK, 0);
+ if (IS_ERR(rt) || rt->dst.dev->type != ARPHRD_TUNNEL6) {
+ if (!IS_ERR(rt))
+ ip_rt_put(rt);
+ goto out;
+ }
+ skb_dst_set(skb2, &rt->dst);
+ } else {
+ if (ip_route_input(skb2, eiph->daddr, eiph->saddr,
+ ip4h_dscp(eiph), skb2->dev) ||
+ skb_dst_dev(skb2)->type != ARPHRD_TUNNEL6)
+ goto out;
+ }
+
+ /* change mtu on this route */
+ if (rel_type == ICMP_DEST_UNREACH && rel_code == ICMP_FRAG_NEEDED) {
+ if (rel_info > dst_mtu(skb_dst(skb2)))
+ goto out;
+
+ skb_dst_update_pmtu_no_confirm(skb2, rel_info);
+ }
+
+ icmp_send(skb2, rel_type, rel_code, htonl(rel_info));
+
+out:
+ kfree_skb(skb2);
+ return 0;
+}
+
+static int
+ip6ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+ u8 type, u8 code, int offset, __be32 info)
+{
+ __u32 rel_info = ntohl(info);
+ int err, rel_msg = 0;
+ u8 rel_type = type;
+ u8 rel_code = code;
+
+ err = ip6_tnl_err(skb, IPPROTO_IPV6, opt, &rel_type, &rel_code,
+ &rel_msg, &rel_info, offset);
+ if (err < 0)
+ return err;
+
+ if (rel_msg && pskb_may_pull(skb, offset + sizeof(struct ipv6hdr))) {
struct rt6_info *rt;
struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
if (!skb2)
- goto out;
+ return 0;
- dst_release(skb2->dst);
- skb2->dst = NULL;
+ skb_dst_drop(skb2);
skb_pull(skb2, offset);
- skb2->nh.raw = skb2->data;
+ skb_reset_network_header(skb2);
/* Try to guess incoming interface */
- rt = rt6_lookup(&skb2->nh.ipv6h->saddr, NULL, 0, 0);
+ rt = rt6_lookup(dev_net(skb->dev), &ipv6_hdr(skb2)->saddr,
+ NULL, 0, skb2, 0);
- if (rt && rt->rt6i_dev)
- skb2->dev = rt->rt6i_dev;
+ if (rt && rt->dst.dev)
+ skb2->dev = rt->dst.dev;
- icmpv6_send(skb2, rel_type, rel_code, rel_info, skb2->dev);
+ icmpv6_send(skb2, rel_type, rel_code, rel_info);
- if (rt)
- dst_release(&rt->u.dst);
+ ip6_rt_put(rt);
kfree_skb(skb2);
}
-out:
- read_unlock(&ip6ip6_lock);
+
+ return 0;
+}
+
+static int
+mplsip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+ u8 type, u8 code, int offset, __be32 info)
+{
+ __u32 rel_info = ntohl(info);
+ int err, rel_msg = 0;
+ u8 rel_type = type;
+ u8 rel_code = code;
+
+ err = ip6_tnl_err(skb, IPPROTO_MPLS, opt, &rel_type, &rel_code,
+ &rel_msg, &rel_info, offset);
return err;
}
-static inline void ip6ip6_ecn_decapsulate(struct ipv6hdr *outer_iph,
- struct sk_buff *skb)
+static int ip4ip6_dscp_ecn_decapsulate(const struct ip6_tnl *t,
+ const struct ipv6hdr *ipv6h,
+ struct sk_buff *skb)
+{
+ __u8 dsfield = ipv6_get_dsfield(ipv6h) & ~INET_ECN_MASK;
+
+ if (t->parms.flags & IP6_TNL_F_RCV_DSCP_COPY)
+ ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, dsfield);
+
+ return IP6_ECN_decapsulate(ipv6h, skb);
+}
+
+static int ip6ip6_dscp_ecn_decapsulate(const struct ip6_tnl *t,
+ const struct ipv6hdr *ipv6h,
+ struct sk_buff *skb)
{
- struct ipv6hdr *inner_iph = skb->nh.ipv6h;
+ if (t->parms.flags & IP6_TNL_F_RCV_DSCP_COPY)
+ ipv6_copy_dscp(ipv6_get_dsfield(ipv6h), ipv6_hdr(skb));
- if (INET_ECN_is_ce(ipv6_get_dsfield(outer_iph)))
- IP6_ECN_set_ce(inner_iph);
+ return IP6_ECN_decapsulate(ipv6h, skb);
}
-static inline int ip6_tnl_rcv_ctl(struct ip6_tnl *t)
+
+static inline int mplsip6_dscp_ecn_decapsulate(const struct ip6_tnl *t,
+ const struct ipv6hdr *ipv6h,
+ struct sk_buff *skb)
+{
+ /* ECN is not supported in AF_MPLS */
+ return 0;
+}
+
+__u32 ip6_tnl_get_cap(struct ip6_tnl *t,
+ const struct in6_addr *laddr,
+ const struct in6_addr *raddr)
{
- struct ip6_tnl_parm *p = &t->parms;
+ struct __ip6_tnl_parm *p = &t->parms;
+ int ltype = ipv6_addr_type(laddr);
+ int rtype = ipv6_addr_type(raddr);
+ __u32 flags = 0;
+
+ if (ltype == IPV6_ADDR_ANY || rtype == IPV6_ADDR_ANY) {
+ flags = IP6_TNL_F_CAP_PER_PACKET;
+ } else if (ltype & (IPV6_ADDR_UNICAST|IPV6_ADDR_MULTICAST) &&
+ rtype & (IPV6_ADDR_UNICAST|IPV6_ADDR_MULTICAST) &&
+ !((ltype|rtype) & IPV6_ADDR_LOOPBACK) &&
+ (!((ltype|rtype) & IPV6_ADDR_LINKLOCAL) || p->link)) {
+ if (ltype&IPV6_ADDR_UNICAST)
+ flags |= IP6_TNL_F_CAP_XMIT;
+ if (rtype&IPV6_ADDR_UNICAST)
+ flags |= IP6_TNL_F_CAP_RCV;
+ }
+ return flags;
+}
+EXPORT_SYMBOL(ip6_tnl_get_cap);
+
+/* called with rcu_read_lock() */
+int ip6_tnl_rcv_ctl(struct ip6_tnl *t,
+ const struct in6_addr *laddr,
+ const struct in6_addr *raddr)
+{
+ struct __ip6_tnl_parm *p = &t->parms;
int ret = 0;
+ struct net *net = t->net;
- if (p->flags & IP6_TNL_F_CAP_RCV) {
- struct net_device *ldev = NULL;
+ if ((p->flags & IP6_TNL_F_CAP_RCV) ||
+ ((p->flags & IP6_TNL_F_CAP_PER_PACKET) &&
+ (ip6_tnl_get_cap(t, laddr, raddr) & IP6_TNL_F_CAP_RCV))) {
+ struct net_device *ldev = NULL;
if (p->link)
- ldev = dev_get_by_index(p->link);
-
- if ((ipv6_addr_is_multicast(&p->laddr) ||
- likely(ipv6_chk_addr(&p->laddr, ldev, 0))) &&
- likely(!ipv6_chk_addr(&p->raddr, NULL, 0)))
+ ldev = dev_get_by_index_rcu(net, p->link);
+
+ if ((ipv6_addr_is_multicast(laddr) ||
+ likely(ipv6_chk_addr_and_flags(net, laddr, ldev, false,
+ 0, IFA_F_TENTATIVE))) &&
+ ((p->flags & IP6_TNL_F_ALLOW_LOCAL_REMOTE) ||
+ likely(!ipv6_chk_addr_and_flags(net, raddr, ldev, true,
+ 0, IFA_F_TENTATIVE))))
ret = 1;
-
- if (ldev)
- dev_put(ldev);
}
return ret;
}
+EXPORT_SYMBOL_GPL(ip6_tnl_rcv_ctl);
+
+static int __ip6_tnl_rcv(struct ip6_tnl *tunnel, struct sk_buff *skb,
+ const struct tnl_ptk_info *tpi,
+ struct metadata_dst *tun_dst,
+ int (*dscp_ecn_decapsulate)(const struct ip6_tnl *t,
+ const struct ipv6hdr *ipv6h,
+ struct sk_buff *skb),
+ bool log_ecn_err)
+{
+ const struct ipv6hdr *ipv6h;
+ int nh, err;
+
+ if (test_bit(IP_TUNNEL_CSUM_BIT, tunnel->parms.i_flags) !=
+ test_bit(IP_TUNNEL_CSUM_BIT, tpi->flags)) {
+ DEV_STATS_INC(tunnel->dev, rx_crc_errors);
+ DEV_STATS_INC(tunnel->dev, rx_errors);
+ goto drop;
+ }
-/**
- * ip6ip6_rcv - decapsulate IPv6 packet and retransmit it locally
- * @skb: received socket buffer
- *
- * Return: 0
- **/
+ if (test_bit(IP_TUNNEL_SEQ_BIT, tunnel->parms.i_flags)) {
+ if (!test_bit(IP_TUNNEL_SEQ_BIT, tpi->flags) ||
+ (tunnel->i_seqno &&
+ (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
+ DEV_STATS_INC(tunnel->dev, rx_fifo_errors);
+ DEV_STATS_INC(tunnel->dev, rx_errors);
+ goto drop;
+ }
+ tunnel->i_seqno = ntohl(tpi->seq) + 1;
+ }
-static int
-ip6ip6_rcv(struct sk_buff *skb)
-{
- struct ipv6hdr *ipv6h;
- struct ip6_tnl *t;
+ skb->protocol = tpi->proto;
+
+ /* Warning: All skb pointers will be invalidated! */
+ if (tunnel->dev->type == ARPHRD_ETHER) {
+ if (!pskb_may_pull(skb, ETH_HLEN)) {
+ DEV_STATS_INC(tunnel->dev, rx_length_errors);
+ DEV_STATS_INC(tunnel->dev, rx_errors);
+ goto drop;
+ }
- ipv6h = skb->nh.ipv6h;
+ skb->protocol = eth_type_trans(skb, tunnel->dev);
+ skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
+ } else {
+ skb->dev = tunnel->dev;
+ skb_reset_mac_header(skb);
+ }
+
+ /* Save offset of outer header relative to skb->head,
+ * because we are going to reset the network header to the inner header
+ * and might change skb->head.
+ */
+ nh = skb_network_header(skb) - skb->head;
- read_lock(&ip6ip6_lock);
+ skb_reset_network_header(skb);
- if ((t = ip6ip6_tnl_lookup(&ipv6h->saddr, &ipv6h->daddr)) != NULL) {
- if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) {
- read_unlock(&ip6ip6_lock);
- goto discard;
+ if (!pskb_inet_may_pull(skb)) {
+ DEV_STATS_INC(tunnel->dev, rx_length_errors);
+ DEV_STATS_INC(tunnel->dev, rx_errors);
+ goto drop;
+ }
+
+ /* Get the outer header. */
+ ipv6h = (struct ipv6hdr *)(skb->head + nh);
+
+ memset(skb->cb, 0, sizeof(struct inet6_skb_parm));
+
+ __skb_tunnel_rx(skb, tunnel->dev, tunnel->net);
+
+ err = dscp_ecn_decapsulate(tunnel, ipv6h, skb);
+ if (unlikely(err)) {
+ if (log_ecn_err)
+ net_info_ratelimited("non-ECT from %pI6 with DS=%#x\n",
+ &ipv6h->saddr,
+ ipv6_get_dsfield(ipv6h));
+ if (err > 1) {
+ DEV_STATS_INC(tunnel->dev, rx_frame_errors);
+ DEV_STATS_INC(tunnel->dev, rx_errors);
+ goto drop;
}
+ }
+
+ dev_sw_netstats_rx_add(tunnel->dev, skb->len);
+
+ skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));
- if (!ip6_tnl_rcv_ctl(t)) {
- t->stat.rx_dropped++;
- read_unlock(&ip6ip6_lock);
- goto discard;
+ if (tun_dst)
+ skb_dst_set(skb, (struct dst_entry *)tun_dst);
+
+ gro_cells_receive(&tunnel->gro_cells, skb);
+ return 0;
+
+drop:
+ if (tun_dst)
+ dst_release((struct dst_entry *)tun_dst);
+ kfree_skb(skb);
+ return 0;
+}
+
+int ip6_tnl_rcv(struct ip6_tnl *t, struct sk_buff *skb,
+ const struct tnl_ptk_info *tpi,
+ struct metadata_dst *tun_dst,
+ bool log_ecn_err)
+{
+ int (*dscp_ecn_decapsulate)(const struct ip6_tnl *t,
+ const struct ipv6hdr *ipv6h,
+ struct sk_buff *skb);
+
+ dscp_ecn_decapsulate = ip6ip6_dscp_ecn_decapsulate;
+ if (tpi->proto == htons(ETH_P_IP))
+ dscp_ecn_decapsulate = ip4ip6_dscp_ecn_decapsulate;
+
+ return __ip6_tnl_rcv(t, skb, tpi, tun_dst, dscp_ecn_decapsulate,
+ log_ecn_err);
+}
+EXPORT_SYMBOL(ip6_tnl_rcv);
+
+static const struct tnl_ptk_info tpi_v6 = {
+ /* no tunnel info required for ipxip6. */
+ .proto = htons(ETH_P_IPV6),
+};
+
+static const struct tnl_ptk_info tpi_v4 = {
+ /* no tunnel info required for ipxip6. */
+ .proto = htons(ETH_P_IP),
+};
+
+static const struct tnl_ptk_info tpi_mpls = {
+ /* no tunnel info required for mplsip6. */
+ .proto = htons(ETH_P_MPLS_UC),
+};
+
+static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto,
+ const struct tnl_ptk_info *tpi,
+ int (*dscp_ecn_decapsulate)(const struct ip6_tnl *t,
+ const struct ipv6hdr *ipv6h,
+ struct sk_buff *skb))
+{
+ struct ip6_tnl *t;
+ const struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+ struct metadata_dst *tun_dst = NULL;
+ int ret = -1;
+
+ rcu_read_lock();
+ t = ip6_tnl_lookup(dev_net(skb->dev), skb->dev->ifindex, &ipv6h->saddr, &ipv6h->daddr);
+
+ if (t) {
+ u8 tproto = READ_ONCE(t->parms.proto);
+
+ if (tproto != ipproto && tproto != 0)
+ goto drop;
+ if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb))
+ goto drop;
+ ipv6h = ipv6_hdr(skb);
+ if (!ip6_tnl_rcv_ctl(t, &ipv6h->daddr, &ipv6h->saddr))
+ goto drop;
+ if (iptunnel_pull_header(skb, 0, tpi->proto, false))
+ goto drop;
+ if (t->parms.collect_md) {
+ IP_TUNNEL_DECLARE_FLAGS(flags) = { };
+
+ tun_dst = ipv6_tun_rx_dst(skb, flags, 0, 0);
+ if (!tun_dst)
+ goto drop;
}
- secpath_reset(skb);
- skb->mac.raw = skb->nh.raw;
- skb->nh.raw = skb->data;
- skb->protocol = htons(ETH_P_IPV6);
- skb->pkt_type = PACKET_HOST;
- memset(skb->cb, 0, sizeof(struct inet6_skb_parm));
- skb->dev = t->dev;
- dst_release(skb->dst);
- skb->dst = NULL;
- nf_reset(skb);
- if (t->parms.flags & IP6_TNL_F_RCV_DSCP_COPY)
- ipv6_copy_dscp(ipv6h, skb->nh.ipv6h);
- ip6ip6_ecn_decapsulate(ipv6h, skb);
- t->stat.rx_packets++;
- t->stat.rx_bytes += skb->len;
- netif_rx(skb);
- read_unlock(&ip6ip6_lock);
- return 0;
+ ret = __ip6_tnl_rcv(t, skb, tpi, tun_dst, dscp_ecn_decapsulate,
+ log_ecn_error);
}
- read_unlock(&ip6ip6_lock);
- return 1;
-discard:
+ rcu_read_unlock();
+
+ return ret;
+
+drop:
+ rcu_read_unlock();
kfree_skb(skb);
return 0;
}
+static int ip4ip6_rcv(struct sk_buff *skb)
+{
+ return ipxip6_rcv(skb, IPPROTO_IPIP, &tpi_v4,
+ ip4ip6_dscp_ecn_decapsulate);
+}
+
+static int ip6ip6_rcv(struct sk_buff *skb)
+{
+ return ipxip6_rcv(skb, IPPROTO_IPV6, &tpi_v6,
+ ip6ip6_dscp_ecn_decapsulate);
+}
+
+static int mplsip6_rcv(struct sk_buff *skb)
+{
+ return ipxip6_rcv(skb, IPPROTO_MPLS, &tpi_mpls,
+ mplsip6_dscp_ecn_decapsulate);
+}
+
struct ipv6_tel_txoption {
struct ipv6_txoptions ops;
__u8 dst_opt[8];
@@ -588,324 +1000,623 @@ static void init_tel_txopt(struct ipv6_tel_txoption *opt, __u8 encap_limit)
opt->dst_opt[5] = IPV6_TLV_PADN;
opt->dst_opt[6] = 1;
- opt->ops.dst0opt = (struct ipv6_opt_hdr *) opt->dst_opt;
+ opt->ops.dst1opt = (struct ipv6_opt_hdr *) opt->dst_opt;
opt->ops.opt_nflen = 8;
}
/**
- * ip6ip6_tnl_addr_conflict - compare packet addresses to tunnel's own
+ * ip6_tnl_addr_conflict - compare packet addresses to tunnel's own
* @t: the outgoing tunnel device
- * @hdr: IPv6 header from the incoming packet
+ * @hdr: IPv6 header from the incoming packet
*
* Description:
- * Avoid trivial tunneling loop by checking that tunnel exit-point
+ * Avoid trivial tunneling loop by checking that tunnel exit-point
* doesn't match source of incoming packet.
*
- * Return:
+ * Return:
* 1 if conflict,
* 0 else
**/
-static inline int
-ip6ip6_tnl_addr_conflict(struct ip6_tnl *t, struct ipv6hdr *hdr)
+static inline bool
+ip6_tnl_addr_conflict(const struct ip6_tnl *t, const struct ipv6hdr *hdr)
{
return ipv6_addr_equal(&t->parms.raddr, &hdr->saddr);
}
-static inline int ip6_tnl_xmit_ctl(struct ip6_tnl *t)
+int ip6_tnl_xmit_ctl(struct ip6_tnl *t,
+ const struct in6_addr *laddr,
+ const struct in6_addr *raddr)
{
- struct ip6_tnl_parm *p = &t->parms;
+ struct __ip6_tnl_parm *p = &t->parms;
int ret = 0;
+ struct net *net = t->net;
+
+ if (t->parms.collect_md)
+ return 1;
- if (p->flags & IP6_TNL_F_CAP_XMIT) {
+ if ((p->flags & IP6_TNL_F_CAP_XMIT) ||
+ ((p->flags & IP6_TNL_F_CAP_PER_PACKET) &&
+ (ip6_tnl_get_cap(t, laddr, raddr) & IP6_TNL_F_CAP_XMIT))) {
struct net_device *ldev = NULL;
+ rcu_read_lock();
if (p->link)
- ldev = dev_get_by_index(p->link);
-
- if (unlikely(!ipv6_chk_addr(&p->laddr, ldev, 0)))
- printk(KERN_WARNING
- "%s xmit: Local address not yet configured!\n",
- p->name);
- else if (!ipv6_addr_is_multicast(&p->raddr) &&
- unlikely(ipv6_chk_addr(&p->raddr, NULL, 0)))
- printk(KERN_WARNING
- "%s xmit: Routing loop! "
- "Remote address found on this node!\n",
- p->name);
+ ldev = dev_get_by_index_rcu(net, p->link);
+
+ if (unlikely(!ipv6_chk_addr_and_flags(net, laddr, ldev, false,
+ 0, IFA_F_TENTATIVE)))
+ pr_warn_ratelimited("%s xmit: Local address not yet configured!\n",
+ p->name);
+ else if (!(p->flags & IP6_TNL_F_ALLOW_LOCAL_REMOTE) &&
+ !ipv6_addr_is_multicast(raddr) &&
+ unlikely(ipv6_chk_addr_and_flags(net, raddr, ldev,
+ true, 0, IFA_F_TENTATIVE)))
+ pr_warn_ratelimited("%s xmit: Routing loop! Remote address found on this node!\n",
+ p->name);
else
ret = 1;
- if (ldev)
- dev_put(ldev);
+ rcu_read_unlock();
}
return ret;
}
+EXPORT_SYMBOL_GPL(ip6_tnl_xmit_ctl);
+
/**
- * ip6ip6_tnl_xmit - encapsulate packet and send
+ * ip6_tnl_xmit - encapsulate packet and send
* @skb: the outgoing socket buffer
- * @dev: the outgoing tunnel device
+ * @dev: the outgoing tunnel device
+ * @dsfield: dscp code for outer header
+ * @fl6: flow of tunneled packet
+ * @encap_limit: encapsulation limit
+ * @pmtu: Path MTU is stored if packet is too big
+ * @proto: next header value
*
* Description:
* Build new header and do some sanity checks on the packet before sending
* it.
*
- * Return:
- * 0
+ * Return:
+ * 0 on success
+ * -1 fail
+ * %-EMSGSIZE message too big. return mtu in this case.
**/
-static int
-ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
+int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
+ struct flowi6 *fl6, int encap_limit, __u32 *pmtu,
+ __u8 proto)
{
struct ip6_tnl *t = netdev_priv(dev);
- struct net_device_stats *stats = &t->stat;
- struct ipv6hdr *ipv6h = skb->nh.ipv6h;
- int encap_limit = -1;
+ struct net *net = t->net;
+ struct ipv6hdr *ipv6h;
struct ipv6_tel_txoption opt;
- __u16 offset;
- struct flowi fl;
- struct dst_entry *dst;
+ struct dst_entry *dst = NULL, *ndst = NULL;
struct net_device *tdev;
int mtu;
- int max_headroom = sizeof(struct ipv6hdr);
- u8 proto;
- int err;
- int pkt_len;
- int dsfield;
-
- if (t->recursion++) {
- stats->collisions++;
- goto tx_err;
+ unsigned int eth_hlen = t->dev->type == ARPHRD_ETHER ? ETH_HLEN : 0;
+ unsigned int psh_hlen = sizeof(struct ipv6hdr) + t->encap_hlen;
+ unsigned int max_headroom = psh_hlen;
+ __be16 payload_protocol;
+ bool use_cache = false;
+ u8 hop_limit;
+ int err = -1;
+
+ payload_protocol = skb_protocol(skb, true);
+
+ if (t->parms.collect_md) {
+ hop_limit = skb_tunnel_info(skb)->key.ttl;
+ goto route_lookup;
+ } else {
+ hop_limit = t->parms.hop_limit;
}
- if (skb->protocol != htons(ETH_P_IPV6) ||
- !ip6_tnl_xmit_ctl(t) || ip6ip6_tnl_addr_conflict(t, ipv6h))
- goto tx_err;
- if ((offset = parse_tlv_tnl_enc_lim(skb, skb->nh.raw)) > 0) {
- struct ipv6_tlv_tnl_enc_lim *tel;
- tel = (struct ipv6_tlv_tnl_enc_lim *) &skb->nh.raw[offset];
- if (tel->encap_limit == 0) {
- icmpv6_send(skb, ICMPV6_PARAMPROB,
- ICMPV6_HDR_FIELD, offset + 2, skb->dev);
- goto tx_err;
+ /* NBMA tunnel */
+ if (ipv6_addr_any(&t->parms.raddr)) {
+ if (payload_protocol == htons(ETH_P_IPV6)) {
+ struct in6_addr *addr6;
+ struct neighbour *neigh;
+ int addr_type;
+
+ if (!skb_dst(skb))
+ goto tx_err_link_failure;
+
+ neigh = dst_neigh_lookup(skb_dst(skb),
+ &ipv6_hdr(skb)->daddr);
+ if (!neigh)
+ goto tx_err_link_failure;
+
+ addr6 = (struct in6_addr *)&neigh->primary_key;
+ addr_type = ipv6_addr_type(addr6);
+
+ if (addr_type == IPV6_ADDR_ANY)
+ addr6 = &ipv6_hdr(skb)->daddr;
+
+ memcpy(&fl6->daddr, addr6, sizeof(fl6->daddr));
+ neigh_release(neigh);
+ } else if (payload_protocol == htons(ETH_P_IP)) {
+ const struct rtable *rt = skb_rtable(skb);
+
+ if (!rt)
+ goto tx_err_link_failure;
+
+ if (rt->rt_gw_family == AF_INET6)
+ memcpy(&fl6->daddr, &rt->rt_gw6, sizeof(fl6->daddr));
}
- encap_limit = tel->encap_limit - 1;
- } else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
- encap_limit = t->parms.encap_limit;
+ } else if (t->parms.proto != 0 && !(t->parms.flags &
+ (IP6_TNL_F_USE_ORIG_TCLASS |
+ IP6_TNL_F_USE_ORIG_FWMARK))) {
+ /* enable the cache only if neither the outer protocol nor the
+ * routing decision depends on the current inner header value
+ */
+ use_cache = true;
+ }
- memcpy(&fl, &t->fl, sizeof (fl));
- proto = fl.proto;
+ if (use_cache)
+ dst = dst_cache_get(&t->dst_cache);
- dsfield = ipv6_get_dsfield(ipv6h);
- if ((t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS))
- fl.fl6_flowlabel |= (*(__be32 *) ipv6h & IPV6_TCLASS_MASK);
- if ((t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL))
- fl.fl6_flowlabel |= (*(__be32 *) ipv6h & IPV6_FLOWLABEL_MASK);
+ if (!ip6_tnl_xmit_ctl(t, &fl6->saddr, &fl6->daddr))
+ goto tx_err_link_failure;
- if ((dst = ip6_tnl_dst_check(t)) != NULL)
- dst_hold(dst);
- else {
- dst = ip6_route_output(NULL, &fl);
+ if (!dst) {
+route_lookup:
+ /* add dsfield to flowlabel for route lookup */
+ fl6->flowlabel = ip6_make_flowinfo(dsfield, fl6->flowlabel);
- if (dst->error || xfrm_lookup(&dst, &fl, NULL, 0) < 0)
+ dst = ip6_route_output(net, NULL, fl6);
+
+ if (dst->error)
+ goto tx_err_link_failure;
+ dst = xfrm_lookup(net, dst, flowi6_to_flowi(fl6), NULL, 0);
+ if (IS_ERR(dst)) {
+ err = PTR_ERR(dst);
+ dst = NULL;
goto tx_err_link_failure;
+ }
+ if (t->parms.collect_md && ipv6_addr_any(&fl6->saddr) &&
+ ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev,
+ &fl6->daddr, 0, &fl6->saddr))
+ goto tx_err_link_failure;
+ ndst = dst;
}
- tdev = dst->dev;
+ tdev = dst_dev(dst);
if (tdev == dev) {
- stats->collisions++;
- if (net_ratelimit())
- printk(KERN_WARNING
- "%s: Local routing loop detected!\n",
- t->parms.name);
+ DEV_STATS_INC(dev, collisions);
+ net_warn_ratelimited("%s: Local routing loop detected!\n",
+ t->parms.name);
goto tx_err_dst_release;
}
- mtu = dst_mtu(dst) - sizeof (*ipv6h);
+ mtu = dst_mtu(dst) - eth_hlen - psh_hlen - t->tun_hlen;
if (encap_limit >= 0) {
max_headroom += 8;
mtu -= 8;
}
- if (mtu < IPV6_MIN_MTU)
- mtu = IPV6_MIN_MTU;
- if (skb->dst && mtu < dst_mtu(skb->dst)) {
- struct rt6_info *rt = (struct rt6_info *) skb->dst;
- rt->rt6i_flags |= RTF_MODIFIED;
- rt->u.dst.metrics[RTAX_MTU-1] = mtu;
- }
- if (skb->len > mtu) {
- icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev);
+ mtu = max(mtu, skb->protocol == htons(ETH_P_IPV6) ?
+ IPV6_MIN_MTU : IPV4_MIN_MTU);
+
+ skb_dst_update_pmtu_no_confirm(skb, mtu);
+ if (skb->len - t->tun_hlen - eth_hlen > mtu && !skb_is_gso(skb)) {
+ *pmtu = mtu;
+ err = -EMSGSIZE;
goto tx_err_dst_release;
}
+ if (t->err_count > 0) {
+ if (time_before(jiffies,
+ t->err_time + IP6TUNNEL_ERR_TIMEO)) {
+ t->err_count--;
+
+ dst_link_failure(skb);
+ } else {
+ t->err_count = 0;
+ }
+ }
+
+ skb_scrub_packet(skb, !net_eq(t->net, dev_net(dev)));
+
/*
* Okay, now see if we can stuff it in the buffer as-is.
*/
max_headroom += LL_RESERVED_SPACE(tdev);
-
- if (skb_headroom(skb) < max_headroom ||
- skb_cloned(skb) || skb_shared(skb)) {
+
+ if (skb_headroom(skb) < max_headroom || skb_shared(skb) ||
+ (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
struct sk_buff *new_skb;
-
- if (!(new_skb = skb_realloc_headroom(skb, max_headroom)))
+
+ new_skb = skb_realloc_headroom(skb, max_headroom);
+ if (!new_skb)
goto tx_err_dst_release;
if (skb->sk)
skb_set_owner_w(new_skb, skb->sk);
- kfree_skb(skb);
+ consume_skb(skb);
skb = new_skb;
}
- dst_release(skb->dst);
- skb->dst = dst_clone(dst);
- skb->h.raw = skb->nh.raw;
+ if (t->parms.collect_md) {
+ if (t->encap.type != TUNNEL_ENCAP_NONE)
+ goto tx_err_dst_release;
+ } else {
+ if (use_cache && ndst)
+ dst_cache_set_ip6(&t->dst_cache, ndst, &fl6->saddr);
+ }
+ skb_dst_set(skb, dst);
+
+ if (hop_limit == 0) {
+ if (payload_protocol == htons(ETH_P_IP))
+ hop_limit = ip_hdr(skb)->ttl;
+ else if (payload_protocol == htons(ETH_P_IPV6))
+ hop_limit = ipv6_hdr(skb)->hop_limit;
+ else
+ hop_limit = ip6_dst_hoplimit(dst);
+ }
+
+ /* Calculate max headroom for all the headers and adjust
+ * needed_headroom if necessary.
+ */
+ max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr)
+ + dst->header_len + t->hlen;
+ ip_tunnel_adj_headroom(dev, max_headroom);
+
+ err = ip6_tnl_encap(skb, t, &proto, fl6);
+ if (err)
+ return err;
if (encap_limit >= 0) {
init_tel_txopt(&opt, encap_limit);
- ipv6_push_nfrag_opts(skb, &opt.ops, &proto, NULL);
+ ipv6_push_frag_opts(skb, &opt.ops, &proto);
}
- skb->nh.raw = skb_push(skb, sizeof(struct ipv6hdr));
- ipv6h = skb->nh.ipv6h;
- *(__be32*)ipv6h = fl.fl6_flowlabel | htonl(0x60000000);
- dsfield = INET_ECN_encapsulate(0, dsfield);
- ipv6_change_dsfield(ipv6h, ~INET_ECN_MASK, dsfield);
- ipv6h->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
- ipv6h->hop_limit = t->parms.hop_limit;
+
+ skb_push(skb, sizeof(struct ipv6hdr));
+ skb_reset_network_header(skb);
+ ipv6h = ipv6_hdr(skb);
+ ip6_flow_hdr(ipv6h, dsfield,
+ ip6_make_flowlabel(net, skb, fl6->flowlabel, true, fl6));
+ ipv6h->hop_limit = hop_limit;
ipv6h->nexthdr = proto;
- ipv6_addr_copy(&ipv6h->saddr, &fl.fl6_src);
- ipv6_addr_copy(&ipv6h->daddr, &fl.fl6_dst);
- nf_reset(skb);
- pkt_len = skb->len;
- err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL,
- skb->dst->dev, dst_output);
-
- if (net_xmit_eval(err) == 0) {
- stats->tx_bytes += pkt_len;
- stats->tx_packets++;
- } else {
- stats->tx_errors++;
- stats->tx_aborted_errors++;
- }
- ip6_tnl_dst_store(t, dst);
- t->recursion--;
+ ipv6h->saddr = fl6->saddr;
+ ipv6h->daddr = fl6->daddr;
+ ip6tunnel_xmit(NULL, skb, dev, 0);
return 0;
tx_err_link_failure:
- stats->tx_carrier_errors++;
+ DEV_STATS_INC(dev, tx_carrier_errors);
dst_link_failure(skb);
tx_err_dst_release:
dst_release(dst);
-tx_err:
- stats->tx_errors++;
- stats->tx_dropped++;
- kfree_skb(skb);
- t->recursion--;
+ return err;
+}
+EXPORT_SYMBOL(ip6_tnl_xmit);
+
+static inline int
+ipxip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev,
+ u8 protocol)
+{
+ struct ip6_tnl *t = netdev_priv(dev);
+ struct ipv6hdr *ipv6h;
+ const struct iphdr *iph;
+ int encap_limit = -1;
+ __u16 offset;
+ struct flowi6 fl6;
+ __u8 dsfield, orig_dsfield;
+ __u32 mtu;
+ u8 tproto;
+ int err;
+
+ tproto = READ_ONCE(t->parms.proto);
+ if (tproto != protocol && tproto != 0)
+ return -1;
+
+ if (t->parms.collect_md) {
+ struct ip_tunnel_info *tun_info;
+ const struct ip_tunnel_key *key;
+
+ tun_info = skb_tunnel_info(skb);
+ if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
+ ip_tunnel_info_af(tun_info) != AF_INET6))
+ return -1;
+ key = &tun_info->key;
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.flowi6_proto = protocol;
+ fl6.saddr = key->u.ipv6.src;
+ fl6.daddr = key->u.ipv6.dst;
+ fl6.flowlabel = key->label;
+ dsfield = key->tos;
+ switch (protocol) {
+ case IPPROTO_IPIP:
+ iph = ip_hdr(skb);
+ orig_dsfield = ipv4_get_dsfield(iph);
+ break;
+ case IPPROTO_IPV6:
+ ipv6h = ipv6_hdr(skb);
+ orig_dsfield = ipv6_get_dsfield(ipv6h);
+ break;
+ default:
+ orig_dsfield = dsfield;
+ break;
+ }
+ } else {
+ if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
+ encap_limit = t->parms.encap_limit;
+ if (protocol == IPPROTO_IPV6) {
+ offset = ip6_tnl_parse_tlv_enc_lim(skb,
+ skb_network_header(skb));
+ /* ip6_tnl_parse_tlv_enc_lim() might have
+ * reallocated skb->head
+ */
+ if (offset > 0) {
+ struct ipv6_tlv_tnl_enc_lim *tel;
+
+ tel = (void *)&skb_network_header(skb)[offset];
+ if (tel->encap_limit == 0) {
+ icmpv6_ndo_send(skb, ICMPV6_PARAMPROB,
+ ICMPV6_HDR_FIELD, offset + 2);
+ return -1;
+ }
+ encap_limit = tel->encap_limit - 1;
+ }
+ }
+
+ memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
+ fl6.flowi6_proto = protocol;
+
+ if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
+ fl6.flowi6_mark = skb->mark;
+ else
+ fl6.flowi6_mark = t->parms.fwmark;
+ switch (protocol) {
+ case IPPROTO_IPIP:
+ iph = ip_hdr(skb);
+ orig_dsfield = ipv4_get_dsfield(iph);
+ if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
+ dsfield = orig_dsfield;
+ else
+ dsfield = ip6_tclass(t->parms.flowinfo);
+ break;
+ case IPPROTO_IPV6:
+ ipv6h = ipv6_hdr(skb);
+ orig_dsfield = ipv6_get_dsfield(ipv6h);
+ if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
+ dsfield = orig_dsfield;
+ else
+ dsfield = ip6_tclass(t->parms.flowinfo);
+ if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL)
+ fl6.flowlabel |= ip6_flowlabel(ipv6h);
+ break;
+ default:
+ orig_dsfield = dsfield = ip6_tclass(t->parms.flowinfo);
+ break;
+ }
+ }
+
+ fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL);
+ dsfield = INET_ECN_encapsulate(dsfield, orig_dsfield);
+
+ if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6))
+ return -1;
+
+ skb_set_inner_ipproto(skb, protocol);
+
+ err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu,
+ protocol);
+ if (err != 0) {
+ /* XXX: send ICMP error even if DF is not set. */
+ if (err == -EMSGSIZE)
+ switch (protocol) {
+ case IPPROTO_IPIP:
+ icmp_ndo_send(skb, ICMP_DEST_UNREACH,
+ ICMP_FRAG_NEEDED, htonl(mtu));
+ break;
+ case IPPROTO_IPV6:
+ icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+ break;
+ default:
+ break;
+ }
+ return -1;
+ }
+
return 0;
}
-static void ip6_tnl_set_cap(struct ip6_tnl *t)
+static netdev_tx_t
+ip6_tnl_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
- struct ip6_tnl_parm *p = &t->parms;
- int ltype = ipv6_addr_type(&p->laddr);
- int rtype = ipv6_addr_type(&p->raddr);
+ struct ip6_tnl *t = netdev_priv(dev);
+ u8 ipproto;
+ int ret;
- p->flags &= ~(IP6_TNL_F_CAP_XMIT|IP6_TNL_F_CAP_RCV);
+ if (!pskb_inet_may_pull(skb))
+ goto tx_err;
- if (ltype & (IPV6_ADDR_UNICAST|IPV6_ADDR_MULTICAST) &&
- rtype & (IPV6_ADDR_UNICAST|IPV6_ADDR_MULTICAST) &&
- !((ltype|rtype) & IPV6_ADDR_LOOPBACK) &&
- (!((ltype|rtype) & IPV6_ADDR_LINKLOCAL) || p->link)) {
- if (ltype&IPV6_ADDR_UNICAST)
- p->flags |= IP6_TNL_F_CAP_XMIT;
- if (rtype&IPV6_ADDR_UNICAST)
- p->flags |= IP6_TNL_F_CAP_RCV;
+ switch (skb->protocol) {
+ case htons(ETH_P_IP):
+ ipproto = IPPROTO_IPIP;
+ break;
+ case htons(ETH_P_IPV6):
+ if (ip6_tnl_addr_conflict(t, ipv6_hdr(skb)))
+ goto tx_err;
+ ipproto = IPPROTO_IPV6;
+ break;
+ case htons(ETH_P_MPLS_UC):
+ ipproto = IPPROTO_MPLS;
+ break;
+ default:
+ goto tx_err;
}
+
+ ret = ipxip6_tnl_xmit(skb, dev, ipproto);
+ if (ret < 0)
+ goto tx_err;
+
+ return NETDEV_TX_OK;
+
+tx_err:
+ DEV_STATS_INC(dev, tx_errors);
+ DEV_STATS_INC(dev, tx_dropped);
+ kfree_skb(skb);
+ return NETDEV_TX_OK;
}
-static void ip6ip6_tnl_link_config(struct ip6_tnl *t)
+static void ip6_tnl_link_config(struct ip6_tnl *t)
{
struct net_device *dev = t->dev;
- struct ip6_tnl_parm *p = &t->parms;
- struct flowi *fl = &t->fl;
+ struct net_device *tdev = NULL;
+ struct __ip6_tnl_parm *p = &t->parms;
+ struct flowi6 *fl6 = &t->fl.u.ip6;
+ int t_hlen;
+ int mtu;
- memcpy(&dev->dev_addr, &p->laddr, sizeof(struct in6_addr));
- memcpy(&dev->broadcast, &p->raddr, sizeof(struct in6_addr));
+ __dev_addr_set(dev, &p->laddr, sizeof(struct in6_addr));
+ memcpy(dev->broadcast, &p->raddr, sizeof(struct in6_addr));
/* Set up flowi template */
- ipv6_addr_copy(&fl->fl6_src, &p->laddr);
- ipv6_addr_copy(&fl->fl6_dst, &p->raddr);
- fl->oif = p->link;
- fl->fl6_flowlabel = 0;
+ fl6->saddr = p->laddr;
+ fl6->daddr = p->raddr;
+ fl6->flowi6_oif = p->link;
+ fl6->flowlabel = 0;
if (!(p->flags&IP6_TNL_F_USE_ORIG_TCLASS))
- fl->fl6_flowlabel |= IPV6_TCLASS_MASK & p->flowinfo;
+ fl6->flowlabel |= IPV6_TCLASS_MASK & p->flowinfo;
if (!(p->flags&IP6_TNL_F_USE_ORIG_FLOWLABEL))
- fl->fl6_flowlabel |= IPV6_FLOWLABEL_MASK & p->flowinfo;
+ fl6->flowlabel |= IPV6_FLOWLABEL_MASK & p->flowinfo;
- ip6_tnl_set_cap(t);
+ p->flags &= ~(IP6_TNL_F_CAP_XMIT|IP6_TNL_F_CAP_RCV|IP6_TNL_F_CAP_PER_PACKET);
+ p->flags |= ip6_tnl_get_cap(t, &p->laddr, &p->raddr);
if (p->flags&IP6_TNL_F_CAP_XMIT && p->flags&IP6_TNL_F_CAP_RCV)
dev->flags |= IFF_POINTOPOINT;
else
dev->flags &= ~IFF_POINTOPOINT;
- dev->iflink = p->link;
+ t->tun_hlen = 0;
+ t->hlen = t->encap_hlen + t->tun_hlen;
+ t_hlen = t->hlen + sizeof(struct ipv6hdr);
if (p->flags & IP6_TNL_F_CAP_XMIT) {
int strict = (ipv6_addr_type(&p->raddr) &
(IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL));
- struct rt6_info *rt = rt6_lookup(&p->raddr, &p->laddr,
- p->link, strict);
+ struct rt6_info *rt = rt6_lookup(t->net,
+ &p->raddr, &p->laddr,
+ p->link, NULL, strict);
+ if (rt) {
+ tdev = rt->dst.dev;
+ ip6_rt_put(rt);
+ }
- if (rt == NULL)
- return;
+ if (!tdev && p->link)
+ tdev = __dev_get_by_index(t->net, p->link);
- if (rt->rt6i_dev) {
- dev->hard_header_len = rt->rt6i_dev->hard_header_len +
- sizeof (struct ipv6hdr);
+ if (tdev) {
+ dev->needed_headroom = tdev->hard_header_len +
+ tdev->needed_headroom + t_hlen;
+ mtu = min_t(unsigned int, tdev->mtu, IP6_MAX_MTU);
- dev->mtu = rt->rt6i_dev->mtu - sizeof (struct ipv6hdr);
+ mtu = mtu - t_hlen;
+ if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
+ mtu -= 8;
- if (dev->mtu < IPV6_MIN_MTU)
- dev->mtu = IPV6_MIN_MTU;
+ if (mtu < IPV6_MIN_MTU)
+ mtu = IPV6_MIN_MTU;
+ WRITE_ONCE(dev->mtu, mtu);
}
- dst_release(&rt->u.dst);
}
}
/**
- * ip6ip6_tnl_change - update the tunnel parameters
+ * ip6_tnl_change - update the tunnel parameters
* @t: tunnel to be changed
* @p: tunnel configuration parameters
- * @active: != 0 if tunnel is ready for use
*
* Description:
- * ip6ip6_tnl_change() updates the tunnel parameters
+ * ip6_tnl_change() updates the tunnel parameters
**/
-static int
-ip6ip6_tnl_change(struct ip6_tnl *t, struct ip6_tnl_parm *p)
+static void
+ip6_tnl_change(struct ip6_tnl *t, const struct __ip6_tnl_parm *p)
{
- ipv6_addr_copy(&t->parms.laddr, &p->laddr);
- ipv6_addr_copy(&t->parms.raddr, &p->raddr);
+ t->parms.laddr = p->laddr;
+ t->parms.raddr = p->raddr;
t->parms.flags = p->flags;
t->parms.hop_limit = p->hop_limit;
t->parms.encap_limit = p->encap_limit;
t->parms.flowinfo = p->flowinfo;
t->parms.link = p->link;
- ip6_tnl_dst_reset(t);
- ip6ip6_tnl_link_config(t);
+ t->parms.proto = p->proto;
+ t->parms.fwmark = p->fwmark;
+ dst_cache_reset(&t->dst_cache);
+ ip6_tnl_link_config(t);
+}
+
+static void ip6_tnl_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p)
+{
+ struct net *net = t->net;
+ struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
+
+ ip6_tnl_unlink(ip6n, t);
+ synchronize_net();
+ ip6_tnl_change(t, p);
+ ip6_tnl_link(ip6n, t);
+ netdev_state_change(t->dev);
+}
+
+static int ip6_tnl0_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p,
+ bool strict)
+{
+ /* For the default ip6tnl0 device, allow changing only the protocol
+ * (the IP6_TNL_F_CAP_PER_PACKET flag is set on ip6tnl0, and all other
+ * parameters are 0).
+ */
+ if (strict &&
+ (!ipv6_addr_any(&p->laddr) || !ipv6_addr_any(&p->raddr) ||
+ p->flags != t->parms.flags || p->hop_limit || p->encap_limit ||
+ p->flowinfo || p->link || p->fwmark || p->collect_md))
+ return -EINVAL;
+
+ t->parms.proto = p->proto;
+ netdev_state_change(t->dev);
return 0;
}
+static void
+ip6_tnl_parm_from_user(struct __ip6_tnl_parm *p, const struct ip6_tnl_parm *u)
+{
+ p->laddr = u->laddr;
+ p->raddr = u->raddr;
+ p->flags = u->flags;
+ p->hop_limit = u->hop_limit;
+ p->encap_limit = u->encap_limit;
+ p->flowinfo = u->flowinfo;
+ p->link = u->link;
+ p->proto = u->proto;
+ memcpy(p->name, u->name, sizeof(u->name));
+}
+
+static void
+ip6_tnl_parm_to_user(struct ip6_tnl_parm *u, const struct __ip6_tnl_parm *p)
+{
+ u->laddr = p->laddr;
+ u->raddr = p->raddr;
+ u->flags = p->flags;
+ u->hop_limit = p->hop_limit;
+ u->encap_limit = p->encap_limit;
+ u->flowinfo = p->flowinfo;
+ u->link = p->link;
+ u->proto = p->proto;
+ memcpy(u->name, p->name, sizeof(u->name));
+}
+
/**
- * ip6ip6_tnl_ioctl - configure ipv6 tunnels from userspace
+ * ip6_tnl_siocdevprivate - configure ipv6 tunnels from userspace
* @dev: virtual device associated with tunnel
- * @ifr: parameters passed from userspace
+ * @ifr: unused
+ * @data: parameters passed from userspace
* @cmd: command to be performed
*
* Description:
- * ip6ip6_tnl_ioctl() is used for managing IPv6 tunnels
- * from userspace.
+ * ip6_tnl_ioctl() is used for managing IPv6 tunnels
+ * from userspace.
*
* The possible commands are the following:
* %SIOCGETTUNNEL: get tunnel parameters for device
@@ -913,7 +1624,7 @@ ip6ip6_tnl_change(struct ip6_tnl *t, struct ip6_tnl_parm *p)
* %SIOCCHGTUNNEL: change tunnel parameters to those given
* %SIOCDELTUNNEL: delete tunnel
*
- * The fallback device "ip6tnl0", created during module
+ * The fallback device "ip6tnl0", created during module
* initialization, can be used for creating other tunnel devices.
*
* Return:
@@ -926,80 +1637,94 @@ ip6ip6_tnl_change(struct ip6_tnl *t, struct ip6_tnl_parm *p)
**/
static int
-ip6ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+ip6_tnl_siocdevprivate(struct net_device *dev, struct ifreq *ifr,
+ void __user *data, int cmd)
{
int err = 0;
struct ip6_tnl_parm p;
- struct ip6_tnl *t = NULL;
+ struct __ip6_tnl_parm p1;
+ struct ip6_tnl *t = netdev_priv(dev);
+ struct net *net = t->net;
+ struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
+
+ memset(&p1, 0, sizeof(p1));
switch (cmd) {
case SIOCGETTUNNEL:
- if (dev == ip6ip6_fb_tnl_dev) {
- if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof (p))) {
+ if (dev == ip6n->fb_tnl_dev) {
+ if (copy_from_user(&p, data, sizeof(p))) {
err = -EFAULT;
break;
}
- t = ip6ip6_tnl_locate(&p, 0);
+ ip6_tnl_parm_from_user(&p1, &p);
+ t = ip6_tnl_locate(net, &p1, 0);
+ if (IS_ERR(t))
+ t = netdev_priv(dev);
+ } else {
+ memset(&p, 0, sizeof(p));
}
- if (t == NULL)
- t = netdev_priv(dev);
- memcpy(&p, &t->parms, sizeof (p));
- if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof (p))) {
+ ip6_tnl_parm_to_user(&p, &t->parms);
+ if (copy_to_user(data, &p, sizeof(p)))
err = -EFAULT;
- }
break;
case SIOCADDTUNNEL:
case SIOCCHGTUNNEL:
err = -EPERM;
- if (!capable(CAP_NET_ADMIN))
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
break;
err = -EFAULT;
- if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof (p)))
+ if (copy_from_user(&p, data, sizeof(p)))
break;
err = -EINVAL;
- if (p.proto != IPPROTO_IPV6)
+ if (p.proto != IPPROTO_IPV6 && p.proto != IPPROTO_IPIP &&
+ p.proto != 0)
break;
- t = ip6ip6_tnl_locate(&p, cmd == SIOCADDTUNNEL);
- if (dev != ip6ip6_fb_tnl_dev && cmd == SIOCCHGTUNNEL) {
- if (t != NULL) {
+ ip6_tnl_parm_from_user(&p1, &p);
+ t = ip6_tnl_locate(net, &p1, cmd == SIOCADDTUNNEL);
+ if (cmd == SIOCCHGTUNNEL) {
+ if (!IS_ERR(t)) {
if (t->dev != dev) {
err = -EEXIST;
break;
}
} else
t = netdev_priv(dev);
-
- ip6ip6_tnl_unlink(t);
- err = ip6ip6_tnl_change(t, &p);
- ip6ip6_tnl_link(t);
- netdev_state_change(dev);
+ if (dev == ip6n->fb_tnl_dev)
+ ip6_tnl0_update(t, &p1, false);
+ else
+ ip6_tnl_update(t, &p1);
}
- if (t) {
+ if (!IS_ERR(t)) {
err = 0;
- if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof (p)))
+ ip6_tnl_parm_to_user(&p, &t->parms);
+ if (copy_to_user(data, &p, sizeof(p)))
err = -EFAULT;
- } else
- err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
+ } else {
+ err = PTR_ERR(t);
+ }
break;
case SIOCDELTUNNEL:
err = -EPERM;
- if (!capable(CAP_NET_ADMIN))
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
break;
- if (dev == ip6ip6_fb_tnl_dev) {
+ if (dev == ip6n->fb_tnl_dev) {
err = -EFAULT;
- if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof (p)))
+ if (copy_from_user(&p, data, sizeof(p)))
break;
err = -ENOENT;
- if ((t = ip6ip6_tnl_locate(&p, 0)) == NULL)
+ ip6_tnl_parm_from_user(&p1, &p);
+ t = ip6_tnl_locate(net, &p1, 0);
+ if (IS_ERR(t))
break;
err = -EPERM;
- if (t->dev == ip6ip6_fb_tnl_dev)
+ if (t->dev == ip6n->fb_tnl_dev)
break;
dev = t->dev;
}
- err = unregister_netdevice(dev);
+ err = 0;
+ unregister_netdevice(dev);
break;
default:
err = -EINVAL;
@@ -1008,20 +1733,7 @@ ip6ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
}
/**
- * ip6ip6_tnl_get_stats - return the stats for tunnel device
- * @dev: virtual device associated with tunnel
- *
- * Return: stats for device
- **/
-
-static struct net_device_stats *
-ip6ip6_tnl_get_stats(struct net_device *dev)
-{
- return &(((struct ip6_tnl *)netdev_priv(dev))->stat);
-}
-
-/**
- * ip6ip6_tnl_change_mtu - change mtu manually for tunnel device
+ * ip6_tnl_change_mtu - change mtu manually for tunnel device
* @dev: virtual device associated with tunnel
* @new_mtu: the new mtu
*
@@ -1030,93 +1742,590 @@ ip6ip6_tnl_get_stats(struct net_device *dev)
* %-EINVAL if mtu too small
**/
-static int
-ip6ip6_tnl_change_mtu(struct net_device *dev, int new_mtu)
+int ip6_tnl_change_mtu(struct net_device *dev, int new_mtu)
{
- if (new_mtu < IPV6_MIN_MTU) {
- return -EINVAL;
+ struct ip6_tnl *tnl = netdev_priv(dev);
+ int t_hlen;
+
+ t_hlen = tnl->hlen + sizeof(struct ipv6hdr);
+ if (tnl->parms.proto == IPPROTO_IPV6) {
+ if (new_mtu < IPV6_MIN_MTU)
+ return -EINVAL;
+ } else {
+ if (new_mtu < ETH_MIN_MTU)
+ return -EINVAL;
+ }
+ if (tnl->parms.proto == IPPROTO_IPV6 || tnl->parms.proto == 0) {
+ if (new_mtu > IP6_MAX_MTU - dev->hard_header_len - t_hlen)
+ return -EINVAL;
+ } else {
+ if (new_mtu > IP_MAX_MTU - dev->hard_header_len - t_hlen)
+ return -EINVAL;
}
- dev->mtu = new_mtu;
+ WRITE_ONCE(dev->mtu, new_mtu);
return 0;
}
+EXPORT_SYMBOL(ip6_tnl_change_mtu);
+
+int ip6_tnl_get_iflink(const struct net_device *dev)
+{
+ struct ip6_tnl *t = netdev_priv(dev);
+
+ return READ_ONCE(t->parms.link);
+}
+EXPORT_SYMBOL(ip6_tnl_get_iflink);
+
+int ip6_tnl_encap_add_ops(const struct ip6_tnl_encap_ops *ops,
+ unsigned int num)
+{
+ if (num >= MAX_IPTUN_ENCAP_OPS)
+ return -ERANGE;
+
+ return !cmpxchg((const struct ip6_tnl_encap_ops **)
+ &ip6tun_encaps[num],
+ NULL, ops) ? 0 : -1;
+}
+EXPORT_SYMBOL(ip6_tnl_encap_add_ops);
+
+int ip6_tnl_encap_del_ops(const struct ip6_tnl_encap_ops *ops,
+ unsigned int num)
+{
+ int ret;
+
+ if (num >= MAX_IPTUN_ENCAP_OPS)
+ return -ERANGE;
+
+ ret = (cmpxchg((const struct ip6_tnl_encap_ops **)
+ &ip6tun_encaps[num],
+ ops, NULL) == ops) ? 0 : -1;
+
+ synchronize_net();
+
+ return ret;
+}
+EXPORT_SYMBOL(ip6_tnl_encap_del_ops);
+
+int ip6_tnl_encap_setup(struct ip6_tnl *t,
+ struct ip_tunnel_encap *ipencap)
+{
+ int hlen;
+
+ memset(&t->encap, 0, sizeof(t->encap));
+
+ hlen = ip6_encap_hlen(ipencap);
+ if (hlen < 0)
+ return hlen;
+
+ t->encap.type = ipencap->type;
+ t->encap.sport = ipencap->sport;
+ t->encap.dport = ipencap->dport;
+ t->encap.flags = ipencap->flags;
+
+ t->encap_hlen = hlen;
+ t->hlen = t->encap_hlen + t->tun_hlen;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ip6_tnl_encap_setup);
+
+static const struct net_device_ops ip6_tnl_netdev_ops = {
+ .ndo_init = ip6_tnl_dev_init,
+ .ndo_uninit = ip6_tnl_dev_uninit,
+ .ndo_start_xmit = ip6_tnl_start_xmit,
+ .ndo_siocdevprivate = ip6_tnl_siocdevprivate,
+ .ndo_change_mtu = ip6_tnl_change_mtu,
+ .ndo_get_stats64 = dev_get_tstats64,
+ .ndo_get_iflink = ip6_tnl_get_iflink,
+};
+
+#define IPXIPX_FEATURES (NETIF_F_SG | \
+ NETIF_F_FRAGLIST | \
+ NETIF_F_HIGHDMA | \
+ NETIF_F_GSO_SOFTWARE | \
+ NETIF_F_HW_CSUM)
/**
- * ip6ip6_tnl_dev_setup - setup virtual tunnel device
+ * ip6_tnl_dev_setup - setup virtual tunnel device
* @dev: virtual device associated with tunnel
*
* Description:
* Initialize function pointers and device parameters
**/
-static void ip6ip6_tnl_dev_setup(struct net_device *dev)
+static void ip6_tnl_dev_setup(struct net_device *dev)
{
- SET_MODULE_OWNER(dev);
- dev->uninit = ip6ip6_tnl_dev_uninit;
- dev->destructor = free_netdev;
- dev->hard_start_xmit = ip6ip6_tnl_xmit;
- dev->get_stats = ip6ip6_tnl_get_stats;
- dev->do_ioctl = ip6ip6_tnl_ioctl;
- dev->change_mtu = ip6ip6_tnl_change_mtu;
+ dev->netdev_ops = &ip6_tnl_netdev_ops;
+ dev->header_ops = &ip_tunnel_header_ops;
+ dev->needs_free_netdev = true;
+ dev->priv_destructor = ip6_dev_free;
dev->type = ARPHRD_TUNNEL6;
- dev->hard_header_len = LL_MAX_HEADER + sizeof (struct ipv6hdr);
- dev->mtu = ETH_DATA_LEN - sizeof (struct ipv6hdr);
dev->flags |= IFF_NOARP;
dev->addr_len = sizeof(struct in6_addr);
+ dev->lltx = true;
+ dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS;
+ netif_keep_dst(dev);
+
+ dev->features |= IPXIPX_FEATURES;
+ dev->hw_features |= IPXIPX_FEATURES;
+
+ /* This perm addr will be used as interface identifier by IPv6 */
+ dev->addr_assign_type = NET_ADDR_RANDOM;
+ eth_random_addr(dev->perm_addr);
}
/**
- * ip6ip6_tnl_dev_init_gen - general initializer for all tunnel devices
+ * ip6_tnl_dev_init_gen - general initializer for all tunnel devices
* @dev: virtual device associated with tunnel
**/
-static inline void
-ip6ip6_tnl_dev_init_gen(struct net_device *dev)
+static inline int
+ip6_tnl_dev_init_gen(struct net_device *dev)
{
struct ip6_tnl *t = netdev_priv(dev);
- t->fl.proto = IPPROTO_IPV6;
+ int ret;
+ int t_hlen;
+
t->dev = dev;
- strcpy(t->parms.name, dev->name);
+
+ ret = dst_cache_init(&t->dst_cache, GFP_KERNEL);
+ if (ret)
+ return ret;
+
+ ret = gro_cells_init(&t->gro_cells, dev);
+ if (ret)
+ goto destroy_dst;
+
+ t->tun_hlen = 0;
+ t->hlen = t->encap_hlen + t->tun_hlen;
+ t_hlen = t->hlen + sizeof(struct ipv6hdr);
+
+ dev->type = ARPHRD_TUNNEL6;
+ dev->mtu = ETH_DATA_LEN - t_hlen;
+ if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
+ dev->mtu -= 8;
+ dev->min_mtu = ETH_MIN_MTU;
+ dev->max_mtu = IP6_MAX_MTU - dev->hard_header_len - t_hlen;
+
+ netdev_hold(dev, &t->dev_tracker, GFP_KERNEL);
+ netdev_lockdep_set_classes(dev);
+ return 0;
+
+destroy_dst:
+ dst_cache_destroy(&t->dst_cache);
+
+ return ret;
}
/**
- * ip6ip6_tnl_dev_init - initializer for all non fallback tunnel devices
+ * ip6_tnl_dev_init - initializer for all non fallback tunnel devices
* @dev: virtual device associated with tunnel
**/
-static int
-ip6ip6_tnl_dev_init(struct net_device *dev)
+static int ip6_tnl_dev_init(struct net_device *dev)
{
struct ip6_tnl *t = netdev_priv(dev);
- ip6ip6_tnl_dev_init_gen(dev);
- ip6ip6_tnl_link_config(t);
+ int err = ip6_tnl_dev_init_gen(dev);
+
+ if (err)
+ return err;
+ ip6_tnl_link_config(t);
+ if (t->parms.collect_md)
+ netif_keep_dst(dev);
return 0;
}
/**
- * ip6ip6_fb_tnl_dev_init - initializer for fallback tunnel device
+ * ip6_fb_tnl_dev_init - initializer for fallback tunnel device
* @dev: fallback device
*
* Return: 0
**/
-static int
-ip6ip6_fb_tnl_dev_init(struct net_device *dev)
+static int __net_init ip6_fb_tnl_dev_init(struct net_device *dev)
{
struct ip6_tnl *t = netdev_priv(dev);
- ip6ip6_tnl_dev_init_gen(dev);
- dev_hold(dev);
- tnls_wc[0] = t;
+ struct net *net = dev_net(dev);
+ struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
+
+ t->net = net;
+ t->parms.proto = IPPROTO_IPV6;
+
+ rcu_assign_pointer(ip6n->tnls_wc[0], t);
+ return 0;
+}
+
+static int ip6_tnl_validate(struct nlattr *tb[], struct nlattr *data[],
+ struct netlink_ext_ack *extack)
+{
+ u8 proto;
+
+ if (!data || !data[IFLA_IPTUN_PROTO])
+ return 0;
+
+ proto = nla_get_u8(data[IFLA_IPTUN_PROTO]);
+ if (proto != IPPROTO_IPV6 &&
+ proto != IPPROTO_IPIP &&
+ proto != 0)
+ return -EINVAL;
+
+ return 0;
+}
+
+static void ip6_tnl_netlink_parms(struct nlattr *data[],
+ struct __ip6_tnl_parm *parms)
+{
+ memset(parms, 0, sizeof(*parms));
+
+ if (!data)
+ return;
+
+ if (data[IFLA_IPTUN_LINK])
+ parms->link = nla_get_u32(data[IFLA_IPTUN_LINK]);
+
+ if (data[IFLA_IPTUN_LOCAL])
+ parms->laddr = nla_get_in6_addr(data[IFLA_IPTUN_LOCAL]);
+
+ if (data[IFLA_IPTUN_REMOTE])
+ parms->raddr = nla_get_in6_addr(data[IFLA_IPTUN_REMOTE]);
+
+ if (data[IFLA_IPTUN_TTL])
+ parms->hop_limit = nla_get_u8(data[IFLA_IPTUN_TTL]);
+
+ if (data[IFLA_IPTUN_ENCAP_LIMIT])
+ parms->encap_limit = nla_get_u8(data[IFLA_IPTUN_ENCAP_LIMIT]);
+
+ if (data[IFLA_IPTUN_FLOWINFO])
+ parms->flowinfo = nla_get_be32(data[IFLA_IPTUN_FLOWINFO]);
+
+ if (data[IFLA_IPTUN_FLAGS])
+ parms->flags = nla_get_u32(data[IFLA_IPTUN_FLAGS]);
+
+ if (data[IFLA_IPTUN_PROTO])
+ parms->proto = nla_get_u8(data[IFLA_IPTUN_PROTO]);
+
+ if (data[IFLA_IPTUN_COLLECT_METADATA])
+ parms->collect_md = true;
+
+ if (data[IFLA_IPTUN_FWMARK])
+ parms->fwmark = nla_get_u32(data[IFLA_IPTUN_FWMARK]);
+}
+
+static int ip6_tnl_newlink(struct net_device *dev,
+ struct rtnl_newlink_params *params,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr **data = params->data;
+ struct nlattr **tb = params->tb;
+ struct ip_tunnel_encap ipencap;
+ struct ip6_tnl_net *ip6n;
+ struct ip6_tnl *nt, *t;
+ struct net *net;
+ int err;
+
+ net = params->link_net ? : dev_net(dev);
+ ip6n = net_generic(net, ip6_tnl_net_id);
+ nt = netdev_priv(dev);
+ nt->net = net;
+
+ if (ip_tunnel_netlink_encap_parms(data, &ipencap)) {
+ err = ip6_tnl_encap_setup(nt, &ipencap);
+ if (err < 0)
+ return err;
+ }
+
+ ip6_tnl_netlink_parms(data, &nt->parms);
+
+ if (nt->parms.collect_md) {
+ if (rtnl_dereference(ip6n->collect_md_tun))
+ return -EEXIST;
+ } else {
+ t = ip6_tnl_locate(net, &nt->parms, 0);
+ if (!IS_ERR(t))
+ return -EEXIST;
+ }
+
+ err = ip6_tnl_create2(dev);
+ if (!err && tb[IFLA_MTU])
+ ip6_tnl_change_mtu(dev, nla_get_u32(tb[IFLA_MTU]));
+
+ return err;
+}
+
+static int ip6_tnl_changelink(struct net_device *dev, struct nlattr *tb[],
+ struct nlattr *data[],
+ struct netlink_ext_ack *extack)
+{
+ struct ip6_tnl *t = netdev_priv(dev);
+ struct __ip6_tnl_parm p;
+ struct net *net = t->net;
+ struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
+ struct ip_tunnel_encap ipencap;
+
+ if (dev == ip6n->fb_tnl_dev) {
+ if (ip_tunnel_netlink_encap_parms(data, &ipencap)) {
+ /* iproute2 always sets TUNNEL_ENCAP_FLAG_CSUM6, so
+ * let's ignore this flag.
+ */
+ ipencap.flags &= ~TUNNEL_ENCAP_FLAG_CSUM6;
+ if (memchr_inv(&ipencap, 0, sizeof(ipencap))) {
+ NL_SET_ERR_MSG(extack,
+ "Only protocol can be changed for fallback tunnel, not encap params");
+ return -EINVAL;
+ }
+ }
+
+ ip6_tnl_netlink_parms(data, &p);
+ if (ip6_tnl0_update(t, &p, true) < 0) {
+ NL_SET_ERR_MSG(extack,
+ "Only protocol can be changed for fallback tunnel");
+ return -EINVAL;
+ }
+
+ return 0;
+ }
+
+ if (ip_tunnel_netlink_encap_parms(data, &ipencap)) {
+ int err = ip6_tnl_encap_setup(t, &ipencap);
+
+ if (err < 0)
+ return err;
+ }
+ ip6_tnl_netlink_parms(data, &p);
+ if (p.collect_md)
+ return -EINVAL;
+
+ t = ip6_tnl_locate(net, &p, 0);
+ if (!IS_ERR(t)) {
+ if (t->dev != dev)
+ return -EEXIST;
+ } else
+ t = netdev_priv(dev);
+
+ ip6_tnl_update(t, &p);
+ return 0;
+}
+
+static void ip6_tnl_dellink(struct net_device *dev, struct list_head *head)
+{
+ struct net *net = dev_net(dev);
+ struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
+
+ if (dev != ip6n->fb_tnl_dev)
+ unregister_netdevice_queue(dev, head);
+}
+
+static size_t ip6_tnl_get_size(const struct net_device *dev)
+{
+ return
+ /* IFLA_IPTUN_LINK */
+ nla_total_size(4) +
+ /* IFLA_IPTUN_LOCAL */
+ nla_total_size(sizeof(struct in6_addr)) +
+ /* IFLA_IPTUN_REMOTE */
+ nla_total_size(sizeof(struct in6_addr)) +
+ /* IFLA_IPTUN_TTL */
+ nla_total_size(1) +
+ /* IFLA_IPTUN_ENCAP_LIMIT */
+ nla_total_size(1) +
+ /* IFLA_IPTUN_FLOWINFO */
+ nla_total_size(4) +
+ /* IFLA_IPTUN_FLAGS */
+ nla_total_size(4) +
+ /* IFLA_IPTUN_PROTO */
+ nla_total_size(1) +
+ /* IFLA_IPTUN_ENCAP_TYPE */
+ nla_total_size(2) +
+ /* IFLA_IPTUN_ENCAP_FLAGS */
+ nla_total_size(2) +
+ /* IFLA_IPTUN_ENCAP_SPORT */
+ nla_total_size(2) +
+ /* IFLA_IPTUN_ENCAP_DPORT */
+ nla_total_size(2) +
+ /* IFLA_IPTUN_COLLECT_METADATA */
+ nla_total_size(0) +
+ /* IFLA_IPTUN_FWMARK */
+ nla_total_size(4) +
+ 0;
+}
+
+static int ip6_tnl_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{
+ struct ip6_tnl *tunnel = netdev_priv(dev);
+ struct __ip6_tnl_parm *parm = &tunnel->parms;
+
+ if (nla_put_u32(skb, IFLA_IPTUN_LINK, parm->link) ||
+ nla_put_in6_addr(skb, IFLA_IPTUN_LOCAL, &parm->laddr) ||
+ nla_put_in6_addr(skb, IFLA_IPTUN_REMOTE, &parm->raddr) ||
+ nla_put_u8(skb, IFLA_IPTUN_TTL, parm->hop_limit) ||
+ nla_put_u8(skb, IFLA_IPTUN_ENCAP_LIMIT, parm->encap_limit) ||
+ nla_put_be32(skb, IFLA_IPTUN_FLOWINFO, parm->flowinfo) ||
+ nla_put_u32(skb, IFLA_IPTUN_FLAGS, parm->flags) ||
+ nla_put_u8(skb, IFLA_IPTUN_PROTO, parm->proto) ||
+ nla_put_u32(skb, IFLA_IPTUN_FWMARK, parm->fwmark))
+ goto nla_put_failure;
+
+ if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE, tunnel->encap.type) ||
+ nla_put_be16(skb, IFLA_IPTUN_ENCAP_SPORT, tunnel->encap.sport) ||
+ nla_put_be16(skb, IFLA_IPTUN_ENCAP_DPORT, tunnel->encap.dport) ||
+ nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS, tunnel->encap.flags))
+ goto nla_put_failure;
+
+ if (parm->collect_md)
+ if (nla_put_flag(skb, IFLA_IPTUN_COLLECT_METADATA))
+ goto nla_put_failure;
+
return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+struct net *ip6_tnl_get_link_net(const struct net_device *dev)
+{
+ struct ip6_tnl *tunnel = netdev_priv(dev);
+
+ return READ_ONCE(tunnel->net);
}
+EXPORT_SYMBOL(ip6_tnl_get_link_net);
+
+static const struct nla_policy ip6_tnl_policy[IFLA_IPTUN_MAX + 1] = {
+ [IFLA_IPTUN_LINK] = { .type = NLA_U32 },
+ [IFLA_IPTUN_LOCAL] = { .len = sizeof(struct in6_addr) },
+ [IFLA_IPTUN_REMOTE] = { .len = sizeof(struct in6_addr) },
+ [IFLA_IPTUN_TTL] = { .type = NLA_U8 },
+ [IFLA_IPTUN_ENCAP_LIMIT] = { .type = NLA_U8 },
+ [IFLA_IPTUN_FLOWINFO] = { .type = NLA_U32 },
+ [IFLA_IPTUN_FLAGS] = { .type = NLA_U32 },
+ [IFLA_IPTUN_PROTO] = { .type = NLA_U8 },
+ [IFLA_IPTUN_ENCAP_TYPE] = { .type = NLA_U16 },
+ [IFLA_IPTUN_ENCAP_FLAGS] = { .type = NLA_U16 },
+ [IFLA_IPTUN_ENCAP_SPORT] = { .type = NLA_U16 },
+ [IFLA_IPTUN_ENCAP_DPORT] = { .type = NLA_U16 },
+ [IFLA_IPTUN_COLLECT_METADATA] = { .type = NLA_FLAG },
+ [IFLA_IPTUN_FWMARK] = { .type = NLA_U32 },
+};
+
+static struct rtnl_link_ops ip6_link_ops __read_mostly = {
+ .kind = "ip6tnl",
+ .maxtype = IFLA_IPTUN_MAX,
+ .policy = ip6_tnl_policy,
+ .priv_size = sizeof(struct ip6_tnl),
+ .setup = ip6_tnl_dev_setup,
+ .validate = ip6_tnl_validate,
+ .newlink = ip6_tnl_newlink,
+ .changelink = ip6_tnl_changelink,
+ .dellink = ip6_tnl_dellink,
+ .get_size = ip6_tnl_get_size,
+ .fill_info = ip6_tnl_fill_info,
+ .get_link_net = ip6_tnl_get_link_net,
+};
-static struct xfrm6_tunnel ip6ip6_handler = {
+static struct xfrm6_tunnel ip4ip6_handler __read_mostly = {
+ .handler = ip4ip6_rcv,
+ .err_handler = ip4ip6_err,
+ .priority = 1,
+};
+
+static struct xfrm6_tunnel ip6ip6_handler __read_mostly = {
.handler = ip6ip6_rcv,
.err_handler = ip6ip6_err,
.priority = 1,
};
+static struct xfrm6_tunnel mplsip6_handler __read_mostly = {
+ .handler = mplsip6_rcv,
+ .err_handler = mplsip6_err,
+ .priority = 1,
+};
+
+static void __net_exit ip6_tnl_exit_rtnl_net(struct net *net, struct list_head *list)
+{
+ struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
+ struct net_device *dev, *aux;
+ int h;
+ struct ip6_tnl *t;
+
+ for_each_netdev_safe(net, dev, aux)
+ if (dev->rtnl_link_ops == &ip6_link_ops)
+ unregister_netdevice_queue(dev, list);
+
+ for (h = 0; h < IP6_TUNNEL_HASH_SIZE; h++) {
+ t = rtnl_net_dereference(net, ip6n->tnls_r_l[h]);
+ while (t) {
+ /* If dev is in the same netns, it has already
+ * been added to the list by the previous loop.
+ */
+ if (!net_eq(dev_net(t->dev), net))
+ unregister_netdevice_queue(t->dev, list);
+
+ t = rtnl_net_dereference(net, t->next);
+ }
+ }
+
+ t = rtnl_net_dereference(net, ip6n->tnls_wc[0]);
+ while (t) {
+ /* If dev is in the same netns, it has already
+ * been added to the list by the previous loop.
+ */
+ if (!net_eq(dev_net(t->dev), net))
+ unregister_netdevice_queue(t->dev, list);
+
+ t = rtnl_net_dereference(net, t->next);
+ }
+}
+
+static int __net_init ip6_tnl_init_net(struct net *net)
+{
+ struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
+ struct ip6_tnl *t = NULL;
+ int err;
+
+ ip6n->tnls[0] = ip6n->tnls_wc;
+ ip6n->tnls[1] = ip6n->tnls_r_l;
+
+ if (!net_has_fallback_tunnels(net))
+ return 0;
+ err = -ENOMEM;
+ ip6n->fb_tnl_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6tnl0",
+ NET_NAME_UNKNOWN, ip6_tnl_dev_setup);
+
+ if (!ip6n->fb_tnl_dev)
+ goto err_alloc_dev;
+ dev_net_set(ip6n->fb_tnl_dev, net);
+ ip6n->fb_tnl_dev->rtnl_link_ops = &ip6_link_ops;
+ /* FB netdevice is special: we have one, and only one per netns.
+ * Allowing to move it to another netns is clearly unsafe.
+ */
+ ip6n->fb_tnl_dev->netns_immutable = true;
+
+ err = ip6_fb_tnl_dev_init(ip6n->fb_tnl_dev);
+ if (err < 0)
+ goto err_register;
+
+ err = register_netdev(ip6n->fb_tnl_dev);
+ if (err < 0)
+ goto err_register;
+
+ t = netdev_priv(ip6n->fb_tnl_dev);
+
+ strcpy(t->parms.name, ip6n->fb_tnl_dev->name);
+ return 0;
+
+err_register:
+ free_netdev(ip6n->fb_tnl_dev);
+err_alloc_dev:
+ return err;
+}
+
+static struct pernet_operations ip6_tnl_net_ops = {
+ .init = ip6_tnl_init_net,
+ .exit_rtnl = ip6_tnl_exit_rtnl_net,
+ .id = &ip6_tnl_net_id,
+ .size = sizeof(struct ip6_tnl_net),
+};
+
/**
* ip6_tunnel_init - register protocol and reserve needed resources
*
@@ -1127,41 +2336,50 @@ static int __init ip6_tunnel_init(void)
{
int err;
- if (xfrm6_tunnel_register(&ip6ip6_handler)) {
- printk(KERN_ERR "ip6ip6 init: can't register tunnel\n");
- return -EAGAIN;
+ if (!ipv6_mod_enabled())
+ return -EOPNOTSUPP;
+
+ err = register_pernet_device(&ip6_tnl_net_ops);
+ if (err < 0)
+ goto out_pernet;
+
+ err = xfrm6_tunnel_register(&ip4ip6_handler, AF_INET);
+ if (err < 0) {
+ pr_err("%s: can't register ip4ip6\n", __func__);
+ goto out_ip4ip6;
}
- ip6ip6_fb_tnl_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6tnl0",
- ip6ip6_tnl_dev_setup);
- if (!ip6ip6_fb_tnl_dev) {
- err = -ENOMEM;
- goto fail;
+ err = xfrm6_tunnel_register(&ip6ip6_handler, AF_INET6);
+ if (err < 0) {
+ pr_err("%s: can't register ip6ip6\n", __func__);
+ goto out_ip6ip6;
}
- ip6ip6_fb_tnl_dev->init = ip6ip6_fb_tnl_dev_init;
- if ((err = register_netdev(ip6ip6_fb_tnl_dev))) {
- free_netdev(ip6ip6_fb_tnl_dev);
- goto fail;
+ if (ip6_tnl_mpls_supported()) {
+ err = xfrm6_tunnel_register(&mplsip6_handler, AF_MPLS);
+ if (err < 0) {
+ pr_err("%s: can't register mplsip6\n", __func__);
+ goto out_mplsip6;
+ }
}
- return 0;
-fail:
- xfrm6_tunnel_deregister(&ip6ip6_handler);
- return err;
-}
-static void __exit ip6ip6_destroy_tunnels(void)
-{
- int h;
- struct ip6_tnl *t;
+ err = rtnl_link_register(&ip6_link_ops);
+ if (err < 0)
+ goto rtnl_link_failed;
- for (h = 0; h < HASH_SIZE; h++) {
- while ((t = tnls_r_l[h]) != NULL)
- unregister_netdevice(t->dev);
- }
+ return 0;
- t = tnls_wc[0];
- unregister_netdevice(t->dev);
+rtnl_link_failed:
+ if (ip6_tnl_mpls_supported())
+ xfrm6_tunnel_deregister(&mplsip6_handler, AF_MPLS);
+out_mplsip6:
+ xfrm6_tunnel_deregister(&ip6ip6_handler, AF_INET6);
+out_ip6ip6:
+ xfrm6_tunnel_deregister(&ip4ip6_handler, AF_INET);
+out_ip4ip6:
+ unregister_pernet_device(&ip6_tnl_net_ops);
+out_pernet:
+ return err;
}
/**
@@ -1170,12 +2388,17 @@ static void __exit ip6ip6_destroy_tunnels(void)
static void __exit ip6_tunnel_cleanup(void)
{
- if (xfrm6_tunnel_deregister(&ip6ip6_handler))
- printk(KERN_INFO "ip6ip6 close: can't deregister tunnel\n");
+ rtnl_link_unregister(&ip6_link_ops);
+ if (xfrm6_tunnel_deregister(&ip4ip6_handler, AF_INET))
+ pr_info("%s: can't deregister ip4ip6\n", __func__);
+
+ if (xfrm6_tunnel_deregister(&ip6ip6_handler, AF_INET6))
+ pr_info("%s: can't deregister ip6ip6\n", __func__);
- rtnl_lock();
- ip6ip6_destroy_tunnels();
- rtnl_unlock();
+ if (ip6_tnl_mpls_supported() &&
+ xfrm6_tunnel_deregister(&mplsip6_handler, AF_MPLS))
+ pr_info("%s: can't deregister mplsip6\n", __func__);
+ unregister_pernet_device(&ip6_tnl_net_ops);
}
module_init(ip6_tunnel_init);
diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c
new file mode 100644
index 000000000000..cef3e0210744
--- /dev/null
+++ b/net/ipv6/ip6_udp_tunnel.c
@@ -0,0 +1,186 @@
+
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/socket.h>
+#include <linux/udp.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/in6.h>
+#include <net/udp.h>
+#include <net/udp_tunnel.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/ip6_tunnel.h>
+#include <net/ip6_checksum.h>
+
+int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg,
+ struct socket **sockp)
+{
+ struct sockaddr_in6 udp6_addr = {};
+ int err;
+ struct socket *sock = NULL;
+
+ err = sock_create_kern(net, AF_INET6, SOCK_DGRAM, 0, &sock);
+ if (err < 0)
+ goto error;
+
+ if (cfg->ipv6_v6only) {
+ err = ip6_sock_set_v6only(sock->sk);
+ if (err < 0)
+ goto error;
+ }
+ if (cfg->bind_ifindex) {
+ err = sock_bindtoindex(sock->sk, cfg->bind_ifindex, true);
+ if (err < 0)
+ goto error;
+ }
+
+ udp6_addr.sin6_family = AF_INET6;
+ memcpy(&udp6_addr.sin6_addr, &cfg->local_ip6,
+ sizeof(udp6_addr.sin6_addr));
+ udp6_addr.sin6_port = cfg->local_udp_port;
+ err = kernel_bind(sock, (struct sockaddr_unsized *)&udp6_addr,
+ sizeof(udp6_addr));
+ if (err < 0)
+ goto error;
+
+ if (cfg->peer_udp_port) {
+ memset(&udp6_addr, 0, sizeof(udp6_addr));
+ udp6_addr.sin6_family = AF_INET6;
+ memcpy(&udp6_addr.sin6_addr, &cfg->peer_ip6,
+ sizeof(udp6_addr.sin6_addr));
+ udp6_addr.sin6_port = cfg->peer_udp_port;
+ err = kernel_connect(sock,
+ (struct sockaddr_unsized *)&udp6_addr,
+ sizeof(udp6_addr), 0);
+ }
+ if (err < 0)
+ goto error;
+
+ udp_set_no_check6_tx(sock->sk, !cfg->use_udp6_tx_checksums);
+ udp_set_no_check6_rx(sock->sk, !cfg->use_udp6_rx_checksums);
+
+ *sockp = sock;
+ return 0;
+
+error:
+ if (sock) {
+ kernel_sock_shutdown(sock, SHUT_RDWR);
+ sock_release(sock);
+ }
+ *sockp = NULL;
+ return err;
+}
+EXPORT_SYMBOL_GPL(udp_sock_create6);
+
+void udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk,
+ struct sk_buff *skb,
+ struct net_device *dev,
+ const struct in6_addr *saddr,
+ const struct in6_addr *daddr,
+ __u8 prio, __u8 ttl, __be32 label,
+ __be16 src_port, __be16 dst_port, bool nocheck,
+ u16 ip6cb_flags)
+{
+ struct udphdr *uh;
+ struct ipv6hdr *ip6h;
+
+ __skb_push(skb, sizeof(*uh));
+ skb_reset_transport_header(skb);
+ uh = udp_hdr(skb);
+
+ uh->dest = dst_port;
+ uh->source = src_port;
+
+ uh->len = htons(skb->len);
+
+ skb_dst_set(skb, dst);
+
+ udp6_set_csum(nocheck, skb, saddr, daddr, skb->len);
+
+ __skb_push(skb, sizeof(*ip6h));
+ skb_reset_network_header(skb);
+ ip6h = ipv6_hdr(skb);
+ ip6_flow_hdr(ip6h, prio, label);
+ ip6h->payload_len = htons(skb->len);
+ ip6h->nexthdr = IPPROTO_UDP;
+ ip6h->hop_limit = ttl;
+ ip6h->daddr = *daddr;
+ ip6h->saddr = *saddr;
+
+ ip6tunnel_xmit(sk, skb, dev, ip6cb_flags);
+}
+EXPORT_SYMBOL_GPL(udp_tunnel6_xmit_skb);
+
+/**
+ * udp_tunnel6_dst_lookup - perform route lookup on UDP tunnel
+ * @skb: Packet for which lookup is done
+ * @dev: Tunnel device
+ * @net: Network namespace of tunnel device
+ * @sock: Socket which provides route info
+ * @oif: Index of the output interface
+ * @saddr: Memory to store the src ip address
+ * @key: Tunnel information
+ * @sport: UDP source port
+ * @dport: UDP destination port
+ * @dsfield: The traffic class field
+ * @dst_cache: The dst cache to use for lookup
+ * This function performs a route lookup on a UDP tunnel
+ *
+ * It returns a valid dst pointer and stores src address to be used in
+ * tunnel in param saddr on success, else a pointer encoded error code.
+ */
+
+struct dst_entry *udp_tunnel6_dst_lookup(struct sk_buff *skb,
+ struct net_device *dev,
+ struct net *net,
+ struct socket *sock,
+ int oif,
+ struct in6_addr *saddr,
+ const struct ip_tunnel_key *key,
+ __be16 sport, __be16 dport, u8 dsfield,
+ struct dst_cache *dst_cache)
+{
+ struct dst_entry *dst = NULL;
+ struct flowi6 fl6;
+
+#ifdef CONFIG_DST_CACHE
+ if (dst_cache) {
+ dst = dst_cache_get_ip6(dst_cache, saddr);
+ if (dst)
+ return dst;
+ }
+#endif
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.flowi6_mark = skb->mark;
+ fl6.flowi6_proto = IPPROTO_UDP;
+ fl6.flowi6_oif = oif;
+ fl6.daddr = key->u.ipv6.dst;
+ fl6.saddr = key->u.ipv6.src;
+ fl6.fl6_sport = sport;
+ fl6.fl6_dport = dport;
+ fl6.flowlabel = ip6_make_flowinfo(dsfield, key->label);
+
+ dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6,
+ NULL);
+ if (IS_ERR(dst)) {
+ netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr);
+ return ERR_PTR(-ENETUNREACH);
+ }
+ if (dst_dev(dst) == dev) { /* is this necessary? */
+ netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr);
+ dst_release(dst);
+ return ERR_PTR(-ELOOP);
+ }
+#ifdef CONFIG_DST_CACHE
+ if (dst_cache)
+ dst_cache_set_ip6(dst_cache, dst, &fl6.saddr);
+#endif
+ *saddr = fl6.saddr;
+ return dst;
+}
+EXPORT_SYMBOL_GPL(udp_tunnel6_dst_lookup);
+
+MODULE_DESCRIPTION("IPv6 Foo over UDP tunnel driver");
+MODULE_LICENSE("GPL");
diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
new file mode 100644
index 000000000000..ad5290be4dd6
--- /dev/null
+++ b/net/ipv6/ip6_vti.c
@@ -0,0 +1,1314 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * IPv6 virtual tunneling interface
+ *
+ * Copyright (C) 2013 secunet Security Networks AG
+ *
+ * Author:
+ * Steffen Klassert <steffen.klassert@secunet.com>
+ *
+ * Based on:
+ * net/ipv6/ip6_tunnel.c
+ */
+
+#include <linux/module.h>
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/sockios.h>
+#include <linux/icmp.h>
+#include <linux/if.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/net.h>
+#include <linux/in6.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/icmpv6.h>
+#include <linux/init.h>
+#include <linux/route.h>
+#include <linux/rtnetlink.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/slab.h>
+#include <linux/hash.h>
+
+#include <linux/uaccess.h>
+#include <linux/atomic.h>
+
+#include <net/icmp.h>
+#include <net/ip.h>
+#include <net/ip_tunnels.h>
+#include <net/ipv6.h>
+#include <net/ip6_route.h>
+#include <net/addrconf.h>
+#include <net/ip6_tunnel.h>
+#include <net/xfrm.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/netdev_lock.h>
+#include <linux/etherdevice.h>
+
+#define IP6_VTI_HASH_SIZE_SHIFT 5
+#define IP6_VTI_HASH_SIZE (1 << IP6_VTI_HASH_SIZE_SHIFT)
+
+static u32 HASH(const struct in6_addr *addr1, const struct in6_addr *addr2)
+{
+ u32 hash = ipv6_addr_hash(addr1) ^ ipv6_addr_hash(addr2);
+
+ return hash_32(hash, IP6_VTI_HASH_SIZE_SHIFT);
+}
+
+static int vti6_dev_init(struct net_device *dev);
+static void vti6_dev_setup(struct net_device *dev);
+static struct rtnl_link_ops vti6_link_ops __read_mostly;
+
+static unsigned int vti6_net_id __read_mostly;
+struct vti6_net {
+ /* the vti6 tunnel fallback device */
+ struct net_device *fb_tnl_dev;
+ /* lists for storing tunnels in use */
+ struct ip6_tnl __rcu *tnls_r_l[IP6_VTI_HASH_SIZE];
+ struct ip6_tnl __rcu *tnls_wc[1];
+ struct ip6_tnl __rcu **tnls[2];
+};
+
+#define for_each_vti6_tunnel_rcu(start) \
+ for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
+
+/**
+ * vti6_tnl_lookup - fetch tunnel matching the end-point addresses
+ * @net: network namespace
+ * @remote: the address of the tunnel exit-point
+ * @local: the address of the tunnel entry-point
+ *
+ * Return:
+ * tunnel matching given end-points if found,
+ * else fallback tunnel if its device is up,
+ * else %NULL
+ **/
+static struct ip6_tnl *
+vti6_tnl_lookup(struct net *net, const struct in6_addr *remote,
+ const struct in6_addr *local)
+{
+ unsigned int hash = HASH(remote, local);
+ struct ip6_tnl *t;
+ struct vti6_net *ip6n = net_generic(net, vti6_net_id);
+ struct in6_addr any;
+
+ for_each_vti6_tunnel_rcu(ip6n->tnls_r_l[hash]) {
+ if (ipv6_addr_equal(local, &t->parms.laddr) &&
+ ipv6_addr_equal(remote, &t->parms.raddr) &&
+ (t->dev->flags & IFF_UP))
+ return t;
+ }
+
+ memset(&any, 0, sizeof(any));
+ hash = HASH(&any, local);
+ for_each_vti6_tunnel_rcu(ip6n->tnls_r_l[hash]) {
+ if (ipv6_addr_equal(local, &t->parms.laddr) &&
+ (t->dev->flags & IFF_UP))
+ return t;
+ }
+
+ hash = HASH(remote, &any);
+ for_each_vti6_tunnel_rcu(ip6n->tnls_r_l[hash]) {
+ if (ipv6_addr_equal(remote, &t->parms.raddr) &&
+ (t->dev->flags & IFF_UP))
+ return t;
+ }
+
+ t = rcu_dereference(ip6n->tnls_wc[0]);
+ if (t && (t->dev->flags & IFF_UP))
+ return t;
+
+ return NULL;
+}
+
+/**
+ * vti6_tnl_bucket - get head of list matching given tunnel parameters
+ * @ip6n: the private data for ip6_vti in the netns
+ * @p: parameters containing tunnel end-points
+ *
+ * Description:
+ * vti6_tnl_bucket() returns the head of the list matching the
+ * &struct in6_addr entries laddr and raddr in @p.
+ *
+ * Return: head of IPv6 tunnel list
+ **/
+static struct ip6_tnl __rcu **
+vti6_tnl_bucket(struct vti6_net *ip6n, const struct __ip6_tnl_parm *p)
+{
+ const struct in6_addr *remote = &p->raddr;
+ const struct in6_addr *local = &p->laddr;
+ unsigned int h = 0;
+ int prio = 0;
+
+ if (!ipv6_addr_any(remote) || !ipv6_addr_any(local)) {
+ prio = 1;
+ h = HASH(remote, local);
+ }
+ return &ip6n->tnls[prio][h];
+}
+
+static void
+vti6_tnl_link(struct vti6_net *ip6n, struct ip6_tnl *t)
+{
+ struct ip6_tnl __rcu **tp = vti6_tnl_bucket(ip6n, &t->parms);
+
+ rcu_assign_pointer(t->next, rtnl_dereference(*tp));
+ rcu_assign_pointer(*tp, t);
+}
+
+static void
+vti6_tnl_unlink(struct vti6_net *ip6n, struct ip6_tnl *t)
+{
+ struct ip6_tnl __rcu **tp;
+ struct ip6_tnl *iter;
+
+ for (tp = vti6_tnl_bucket(ip6n, &t->parms);
+ (iter = rtnl_dereference(*tp)) != NULL;
+ tp = &iter->next) {
+ if (t == iter) {
+ rcu_assign_pointer(*tp, t->next);
+ break;
+ }
+ }
+}
+
+static int vti6_tnl_create2(struct net_device *dev)
+{
+ struct ip6_tnl *t = netdev_priv(dev);
+ struct vti6_net *ip6n = net_generic(t->net, vti6_net_id);
+ int err;
+
+ dev->rtnl_link_ops = &vti6_link_ops;
+ err = register_netdevice(dev);
+ if (err < 0)
+ goto out;
+
+ strcpy(t->parms.name, dev->name);
+
+ vti6_tnl_link(ip6n, t);
+
+ return 0;
+
+out:
+ return err;
+}
+
+static struct ip6_tnl *vti6_tnl_create(struct net *net, struct __ip6_tnl_parm *p)
+{
+ struct net_device *dev;
+ struct ip6_tnl *t;
+ char name[IFNAMSIZ];
+ int err;
+
+ if (p->name[0]) {
+ if (!dev_valid_name(p->name))
+ goto failed;
+ strscpy(name, p->name, IFNAMSIZ);
+ } else {
+ sprintf(name, "ip6_vti%%d");
+ }
+
+ dev = alloc_netdev(sizeof(*t), name, NET_NAME_UNKNOWN, vti6_dev_setup);
+ if (!dev)
+ goto failed;
+
+ dev_net_set(dev, net);
+
+ t = netdev_priv(dev);
+ t->parms = *p;
+ t->net = dev_net(dev);
+
+ err = vti6_tnl_create2(dev);
+ if (err < 0)
+ goto failed_free;
+
+ return t;
+
+failed_free:
+ free_netdev(dev);
+failed:
+ return NULL;
+}
+
+/**
+ * vti6_locate - find or create tunnel matching given parameters
+ * @net: network namespace
+ * @p: tunnel parameters
+ * @create: != 0 if allowed to create new tunnel if no match found
+ *
+ * Description:
+ * vti6_locate() first tries to locate an existing tunnel
+ * based on @parms. If this is unsuccessful, but @create is set a new
+ * tunnel device is created and registered for use.
+ *
+ * Return:
+ * matching tunnel or NULL
+ **/
+static struct ip6_tnl *vti6_locate(struct net *net, struct __ip6_tnl_parm *p,
+ int create)
+{
+ const struct in6_addr *remote = &p->raddr;
+ const struct in6_addr *local = &p->laddr;
+ struct ip6_tnl __rcu **tp;
+ struct ip6_tnl *t;
+ struct vti6_net *ip6n = net_generic(net, vti6_net_id);
+
+ for (tp = vti6_tnl_bucket(ip6n, p);
+ (t = rtnl_dereference(*tp)) != NULL;
+ tp = &t->next) {
+ if (ipv6_addr_equal(local, &t->parms.laddr) &&
+ ipv6_addr_equal(remote, &t->parms.raddr)) {
+ if (create)
+ return NULL;
+
+ return t;
+ }
+ }
+ if (!create)
+ return NULL;
+ return vti6_tnl_create(net, p);
+}
+
+/**
+ * vti6_dev_uninit - tunnel device uninitializer
+ * @dev: the device to be destroyed
+ *
+ * Description:
+ * vti6_dev_uninit() removes tunnel from its list
+ **/
+static void vti6_dev_uninit(struct net_device *dev)
+{
+ struct ip6_tnl *t = netdev_priv(dev);
+ struct vti6_net *ip6n = net_generic(t->net, vti6_net_id);
+
+ if (dev == ip6n->fb_tnl_dev)
+ RCU_INIT_POINTER(ip6n->tnls_wc[0], NULL);
+ else
+ vti6_tnl_unlink(ip6n, t);
+ netdev_put(dev, &t->dev_tracker);
+}
+
+static int vti6_input_proto(struct sk_buff *skb, int nexthdr, __be32 spi,
+ int encap_type)
+{
+ struct ip6_tnl *t;
+ const struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+
+ rcu_read_lock();
+ t = vti6_tnl_lookup(dev_net(skb->dev), &ipv6h->saddr, &ipv6h->daddr);
+ if (t) {
+ if (t->parms.proto != IPPROTO_IPV6 && t->parms.proto != 0) {
+ rcu_read_unlock();
+ goto discard;
+ }
+
+ if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+ rcu_read_unlock();
+ goto discard;
+ }
+
+ ipv6h = ipv6_hdr(skb);
+ if (!ip6_tnl_rcv_ctl(t, &ipv6h->daddr, &ipv6h->saddr)) {
+ DEV_STATS_INC(t->dev, rx_dropped);
+ rcu_read_unlock();
+ goto discard;
+ }
+
+ rcu_read_unlock();
+
+ XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = t;
+ XFRM_SPI_SKB_CB(skb)->family = AF_INET6;
+ XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr);
+ return xfrm_input(skb, nexthdr, spi, encap_type);
+ }
+ rcu_read_unlock();
+ return -EINVAL;
+discard:
+ kfree_skb(skb);
+ return 0;
+}
+
+static int vti6_rcv(struct sk_buff *skb)
+{
+ int nexthdr = skb_network_header(skb)[IP6CB(skb)->nhoff];
+
+ return vti6_input_proto(skb, nexthdr, 0, 0);
+}
+
+static int vti6_rcv_cb(struct sk_buff *skb, int err)
+{
+ unsigned short family;
+ struct net_device *dev;
+ struct xfrm_state *x;
+ const struct xfrm_mode *inner_mode;
+ struct ip6_tnl *t = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6;
+ u32 orig_mark = skb->mark;
+ int ret;
+
+ if (!t)
+ return 1;
+
+ dev = t->dev;
+
+ if (err) {
+ DEV_STATS_INC(dev, rx_errors);
+ DEV_STATS_INC(dev, rx_dropped);
+
+ return 0;
+ }
+
+ x = xfrm_input_state(skb);
+
+ inner_mode = &x->inner_mode;
+
+ if (x->sel.family == AF_UNSPEC) {
+ inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol);
+ if (inner_mode == NULL) {
+ XFRM_INC_STATS(dev_net(skb->dev),
+ LINUX_MIB_XFRMINSTATEMODEERROR);
+ return -EINVAL;
+ }
+ }
+
+ family = inner_mode->family;
+
+ skb->mark = be32_to_cpu(t->parms.i_key);
+ ret = xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family);
+ skb->mark = orig_mark;
+
+ if (!ret)
+ return -EPERM;
+
+ skb_scrub_packet(skb, !net_eq(t->net, dev_net(skb->dev)));
+ skb->dev = dev;
+ dev_sw_netstats_rx_add(dev, skb->len);
+
+ return 0;
+}
+
+/**
+ * vti6_addr_conflict - compare packet addresses to tunnel's own
+ * @t: the outgoing tunnel device
+ * @hdr: IPv6 header from the incoming packet
+ *
+ * Description:
+ * Avoid trivial tunneling loop by checking that tunnel exit-point
+ * doesn't match source of incoming packet.
+ *
+ * Return:
+ * 1 if conflict,
+ * 0 else
+ **/
+static inline bool
+vti6_addr_conflict(const struct ip6_tnl *t, const struct ipv6hdr *hdr)
+{
+ return ipv6_addr_equal(&t->parms.raddr, &hdr->saddr);
+}
+
+static bool vti6_state_check(const struct xfrm_state *x,
+ const struct in6_addr *dst,
+ const struct in6_addr *src)
+{
+ xfrm_address_t *daddr = (xfrm_address_t *)dst;
+ xfrm_address_t *saddr = (xfrm_address_t *)src;
+
+ /* if there is no transform then this tunnel is not functional.
+ * Or if the xfrm is not mode tunnel.
+ */
+ if (!x || x->props.mode != XFRM_MODE_TUNNEL ||
+ x->props.family != AF_INET6)
+ return false;
+
+ if (ipv6_addr_any(dst))
+ return xfrm_addr_equal(saddr, &x->props.saddr, AF_INET6);
+
+ if (!xfrm_state_addr_check(x, daddr, saddr, AF_INET6))
+ return false;
+
+ return true;
+}
+
+/**
+ * vti6_xmit - send a packet
+ * @skb: the outgoing socket buffer
+ * @dev: the outgoing tunnel device
+ * @fl: the flow informations for the xfrm_lookup
+ **/
+static int
+vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl)
+{
+ struct ip6_tnl *t = netdev_priv(dev);
+ struct dst_entry *dst = skb_dst(skb);
+ struct net_device *tdev;
+ struct xfrm_state *x;
+ int pkt_len = skb->len;
+ int err = -1;
+ int mtu;
+
+ if (!dst) {
+ switch (skb->protocol) {
+ case htons(ETH_P_IP): {
+ struct rtable *rt;
+
+ fl->u.ip4.flowi4_oif = dev->ifindex;
+ fl->u.ip4.flowi4_flags |= FLOWI_FLAG_ANYSRC;
+ rt = __ip_route_output_key(dev_net(dev), &fl->u.ip4);
+ if (IS_ERR(rt))
+ goto tx_err_link_failure;
+ dst = &rt->dst;
+ skb_dst_set(skb, dst);
+ break;
+ }
+ case htons(ETH_P_IPV6):
+ fl->u.ip6.flowi6_oif = dev->ifindex;
+ fl->u.ip6.flowi6_flags |= FLOWI_FLAG_ANYSRC;
+ dst = ip6_route_output(dev_net(dev), NULL, &fl->u.ip6);
+ if (dst->error) {
+ dst_release(dst);
+ dst = NULL;
+ goto tx_err_link_failure;
+ }
+ skb_dst_set(skb, dst);
+ break;
+ default:
+ goto tx_err_link_failure;
+ }
+ }
+
+ dst_hold(dst);
+ dst = xfrm_lookup_route(t->net, dst, fl, NULL, 0);
+ if (IS_ERR(dst)) {
+ err = PTR_ERR(dst);
+ dst = NULL;
+ goto tx_err_link_failure;
+ }
+
+ if (dst->flags & DST_XFRM_QUEUE)
+ goto xmit;
+
+ x = dst->xfrm;
+ if (!vti6_state_check(x, &t->parms.raddr, &t->parms.laddr))
+ goto tx_err_link_failure;
+
+ if (!ip6_tnl_xmit_ctl(t, (const struct in6_addr *)&x->props.saddr,
+ (const struct in6_addr *)&x->id.daddr))
+ goto tx_err_link_failure;
+
+ tdev = dst_dev(dst);
+
+ if (tdev == dev) {
+ DEV_STATS_INC(dev, collisions);
+ net_warn_ratelimited("%s: Local routing loop detected!\n",
+ t->parms.name);
+ goto tx_err_dst_release;
+ }
+
+ mtu = dst_mtu(dst);
+ if (skb->len > mtu) {
+ skb_dst_update_pmtu_no_confirm(skb, mtu);
+
+ if (skb->protocol == htons(ETH_P_IPV6)) {
+ if (mtu < IPV6_MIN_MTU)
+ mtu = IPV6_MIN_MTU;
+
+ icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+ } else {
+ if (!(ip_hdr(skb)->frag_off & htons(IP_DF)))
+ goto xmit;
+ icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
+ htonl(mtu));
+ }
+
+ err = -EMSGSIZE;
+ goto tx_err_dst_release;
+ }
+
+xmit:
+ skb_scrub_packet(skb, !net_eq(t->net, dev_net(dev)));
+ skb_dst_set(skb, dst);
+ skb->dev = dst_dev(dst);
+
+ err = dst_output(t->net, skb->sk, skb);
+ if (net_xmit_eval(err) == 0)
+ err = pkt_len;
+ iptunnel_xmit_stats(dev, err);
+
+ return 0;
+tx_err_link_failure:
+ DEV_STATS_INC(dev, tx_carrier_errors);
+ dst_link_failure(skb);
+tx_err_dst_release:
+ dst_release(dst);
+ return err;
+}
+
+static netdev_tx_t
+vti6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct ip6_tnl *t = netdev_priv(dev);
+ struct flowi fl;
+ int ret;
+
+ if (!pskb_inet_may_pull(skb))
+ goto tx_err;
+
+ memset(&fl, 0, sizeof(fl));
+
+ switch (skb->protocol) {
+ case htons(ETH_P_IPV6):
+ if ((t->parms.proto != IPPROTO_IPV6 && t->parms.proto != 0) ||
+ vti6_addr_conflict(t, ipv6_hdr(skb)))
+ goto tx_err;
+
+ memset(IP6CB(skb), 0, sizeof(*IP6CB(skb)));
+ xfrm_decode_session(dev_net(dev), skb, &fl, AF_INET6);
+ break;
+ case htons(ETH_P_IP):
+ memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
+ xfrm_decode_session(dev_net(dev), skb, &fl, AF_INET);
+ break;
+ default:
+ goto tx_err;
+ }
+
+ /* override mark with tunnel output key */
+ fl.flowi_mark = be32_to_cpu(t->parms.o_key);
+
+ ret = vti6_xmit(skb, dev, &fl);
+ if (ret < 0)
+ goto tx_err;
+
+ return NETDEV_TX_OK;
+
+tx_err:
+ DEV_STATS_INC(dev, tx_errors);
+ DEV_STATS_INC(dev, tx_dropped);
+ kfree_skb(skb);
+ return NETDEV_TX_OK;
+}
+
+static int vti6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+ u8 type, u8 code, int offset, __be32 info)
+{
+ __be32 spi;
+ __u32 mark;
+ struct xfrm_state *x;
+ struct ip6_tnl *t;
+ struct ip_esp_hdr *esph;
+ struct ip_auth_hdr *ah;
+ struct ip_comp_hdr *ipch;
+ struct net *net = dev_net(skb->dev);
+ const struct ipv6hdr *iph = (const struct ipv6hdr *)skb->data;
+ int protocol = iph->nexthdr;
+
+ t = vti6_tnl_lookup(dev_net(skb->dev), &iph->daddr, &iph->saddr);
+ if (!t)
+ return -1;
+
+ mark = be32_to_cpu(t->parms.o_key);
+
+ switch (protocol) {
+ case IPPROTO_ESP:
+ esph = (struct ip_esp_hdr *)(skb->data + offset);
+ spi = esph->spi;
+ break;
+ case IPPROTO_AH:
+ ah = (struct ip_auth_hdr *)(skb->data + offset);
+ spi = ah->spi;
+ break;
+ case IPPROTO_COMP:
+ ipch = (struct ip_comp_hdr *)(skb->data + offset);
+ spi = htonl(ntohs(ipch->cpi));
+ break;
+ default:
+ return 0;
+ }
+
+ if (type != ICMPV6_PKT_TOOBIG &&
+ type != NDISC_REDIRECT)
+ return 0;
+
+ x = xfrm_state_lookup(net, mark, (const xfrm_address_t *)&iph->daddr,
+ spi, protocol, AF_INET6);
+ if (!x)
+ return 0;
+
+ if (type == NDISC_REDIRECT)
+ ip6_redirect(skb, net, skb->dev->ifindex, 0,
+ sock_net_uid(net, NULL));
+ else
+ ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL));
+ xfrm_state_put(x);
+
+ return 0;
+}
+
+static void vti6_link_config(struct ip6_tnl *t, bool keep_mtu)
+{
+ struct net_device *dev = t->dev;
+ struct __ip6_tnl_parm *p = &t->parms;
+ struct net_device *tdev = NULL;
+ int mtu;
+
+ __dev_addr_set(dev, &p->laddr, sizeof(struct in6_addr));
+ memcpy(dev->broadcast, &p->raddr, sizeof(struct in6_addr));
+
+ p->flags &= ~(IP6_TNL_F_CAP_XMIT | IP6_TNL_F_CAP_RCV |
+ IP6_TNL_F_CAP_PER_PACKET);
+ p->flags |= ip6_tnl_get_cap(t, &p->laddr, &p->raddr);
+
+ if (p->flags & IP6_TNL_F_CAP_XMIT && p->flags & IP6_TNL_F_CAP_RCV)
+ dev->flags |= IFF_POINTOPOINT;
+ else
+ dev->flags &= ~IFF_POINTOPOINT;
+
+ if (keep_mtu && dev->mtu) {
+ WRITE_ONCE(dev->mtu,
+ clamp(dev->mtu, dev->min_mtu, dev->max_mtu));
+ return;
+ }
+
+ if (p->flags & IP6_TNL_F_CAP_XMIT) {
+ int strict = (ipv6_addr_type(&p->raddr) &
+ (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL));
+ struct rt6_info *rt = rt6_lookup(t->net,
+ &p->raddr, &p->laddr,
+ p->link, NULL, strict);
+
+ if (rt)
+ tdev = rt->dst.dev;
+ ip6_rt_put(rt);
+ }
+
+ if (!tdev && p->link)
+ tdev = __dev_get_by_index(t->net, p->link);
+
+ if (tdev)
+ mtu = tdev->mtu - sizeof(struct ipv6hdr);
+ else
+ mtu = ETH_DATA_LEN - LL_MAX_HEADER - sizeof(struct ipv6hdr);
+
+ dev->mtu = max_t(int, mtu, IPV4_MIN_MTU);
+}
+
+/**
+ * vti6_tnl_change - update the tunnel parameters
+ * @t: tunnel to be changed
+ * @p: tunnel configuration parameters
+ * @keep_mtu: MTU was set from userspace, don't re-compute it
+ *
+ * Description:
+ * vti6_tnl_change() updates the tunnel parameters
+ **/
+static int
+vti6_tnl_change(struct ip6_tnl *t, const struct __ip6_tnl_parm *p,
+ bool keep_mtu)
+{
+ t->parms.laddr = p->laddr;
+ t->parms.raddr = p->raddr;
+ t->parms.link = p->link;
+ t->parms.i_key = p->i_key;
+ t->parms.o_key = p->o_key;
+ t->parms.proto = p->proto;
+ t->parms.fwmark = p->fwmark;
+ dst_cache_reset(&t->dst_cache);
+ vti6_link_config(t, keep_mtu);
+ return 0;
+}
+
+static int vti6_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p,
+ bool keep_mtu)
+{
+ struct net *net = dev_net(t->dev);
+ struct vti6_net *ip6n = net_generic(net, vti6_net_id);
+ int err;
+
+ vti6_tnl_unlink(ip6n, t);
+ synchronize_net();
+ err = vti6_tnl_change(t, p, keep_mtu);
+ vti6_tnl_link(ip6n, t);
+ netdev_state_change(t->dev);
+ return err;
+}
+
+static void
+vti6_parm_from_user(struct __ip6_tnl_parm *p, const struct ip6_tnl_parm2 *u)
+{
+ p->laddr = u->laddr;
+ p->raddr = u->raddr;
+ p->link = u->link;
+ p->i_key = u->i_key;
+ p->o_key = u->o_key;
+ p->proto = u->proto;
+
+ memcpy(p->name, u->name, sizeof(u->name));
+}
+
+static void
+vti6_parm_to_user(struct ip6_tnl_parm2 *u, const struct __ip6_tnl_parm *p)
+{
+ u->laddr = p->laddr;
+ u->raddr = p->raddr;
+ u->link = p->link;
+ u->i_key = p->i_key;
+ u->o_key = p->o_key;
+ if (u->i_key)
+ u->i_flags |= GRE_KEY;
+ if (u->o_key)
+ u->o_flags |= GRE_KEY;
+ u->proto = p->proto;
+
+ memcpy(u->name, p->name, sizeof(u->name));
+}
+
+/**
+ * vti6_siocdevprivate - configure vti6 tunnels from userspace
+ * @dev: virtual device associated with tunnel
+ * @ifr: unused
+ * @data: parameters passed from userspace
+ * @cmd: command to be performed
+ *
+ * Description:
+ * vti6_siocdevprivate() is used for managing vti6 tunnels
+ * from userspace.
+ *
+ * The possible commands are the following:
+ * %SIOCGETTUNNEL: get tunnel parameters for device
+ * %SIOCADDTUNNEL: add tunnel matching given tunnel parameters
+ * %SIOCCHGTUNNEL: change tunnel parameters to those given
+ * %SIOCDELTUNNEL: delete tunnel
+ *
+ * The fallback device "ip6_vti0", created during module
+ * initialization, can be used for creating other tunnel devices.
+ *
+ * Return:
+ * 0 on success,
+ * %-EFAULT if unable to copy data to or from userspace,
+ * %-EPERM if current process hasn't %CAP_NET_ADMIN set
+ * %-EINVAL if passed tunnel parameters are invalid,
+ * %-EEXIST if changing a tunnel's parameters would cause a conflict
+ * %-ENODEV if attempting to change or delete a nonexisting device
+ **/
+static int
+vti6_siocdevprivate(struct net_device *dev, struct ifreq *ifr, void __user *data, int cmd)
+{
+ int err = 0;
+ struct ip6_tnl_parm2 p;
+ struct __ip6_tnl_parm p1;
+ struct ip6_tnl *t = NULL;
+ struct net *net = dev_net(dev);
+ struct vti6_net *ip6n = net_generic(net, vti6_net_id);
+
+ memset(&p1, 0, sizeof(p1));
+
+ switch (cmd) {
+ case SIOCGETTUNNEL:
+ if (dev == ip6n->fb_tnl_dev) {
+ if (copy_from_user(&p, data, sizeof(p))) {
+ err = -EFAULT;
+ break;
+ }
+ vti6_parm_from_user(&p1, &p);
+ t = vti6_locate(net, &p1, 0);
+ } else {
+ memset(&p, 0, sizeof(p));
+ }
+ if (!t)
+ t = netdev_priv(dev);
+ vti6_parm_to_user(&p, &t->parms);
+ if (copy_to_user(data, &p, sizeof(p)))
+ err = -EFAULT;
+ break;
+ case SIOCADDTUNNEL:
+ case SIOCCHGTUNNEL:
+ err = -EPERM;
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ break;
+ err = -EFAULT;
+ if (copy_from_user(&p, data, sizeof(p)))
+ break;
+ err = -EINVAL;
+ if (p.proto != IPPROTO_IPV6 && p.proto != 0)
+ break;
+ vti6_parm_from_user(&p1, &p);
+ t = vti6_locate(net, &p1, cmd == SIOCADDTUNNEL);
+ if (dev != ip6n->fb_tnl_dev && cmd == SIOCCHGTUNNEL) {
+ if (t) {
+ if (t->dev != dev) {
+ err = -EEXIST;
+ break;
+ }
+ } else
+ t = netdev_priv(dev);
+
+ err = vti6_update(t, &p1, false);
+ }
+ if (t) {
+ err = 0;
+ vti6_parm_to_user(&p, &t->parms);
+ if (copy_to_user(data, &p, sizeof(p)))
+ err = -EFAULT;
+
+ } else
+ err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
+ break;
+ case SIOCDELTUNNEL:
+ err = -EPERM;
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ break;
+
+ if (dev == ip6n->fb_tnl_dev) {
+ err = -EFAULT;
+ if (copy_from_user(&p, data, sizeof(p)))
+ break;
+ err = -ENOENT;
+ vti6_parm_from_user(&p1, &p);
+ t = vti6_locate(net, &p1, 0);
+ if (!t)
+ break;
+ err = -EPERM;
+ if (t->dev == ip6n->fb_tnl_dev)
+ break;
+ dev = t->dev;
+ }
+ err = 0;
+ unregister_netdevice(dev);
+ break;
+ default:
+ err = -EINVAL;
+ }
+ return err;
+}
+
+static const struct net_device_ops vti6_netdev_ops = {
+ .ndo_init = vti6_dev_init,
+ .ndo_uninit = vti6_dev_uninit,
+ .ndo_start_xmit = vti6_tnl_xmit,
+ .ndo_siocdevprivate = vti6_siocdevprivate,
+ .ndo_get_iflink = ip6_tnl_get_iflink,
+};
+
+/**
+ * vti6_dev_setup - setup virtual tunnel device
+ * @dev: virtual device associated with tunnel
+ *
+ * Description:
+ * Initialize function pointers and device parameters
+ **/
+static void vti6_dev_setup(struct net_device *dev)
+{
+ dev->netdev_ops = &vti6_netdev_ops;
+ dev->header_ops = &ip_tunnel_header_ops;
+ dev->needs_free_netdev = true;
+
+ dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS;
+ dev->type = ARPHRD_TUNNEL6;
+ dev->min_mtu = IPV4_MIN_MTU;
+ dev->max_mtu = IP_MAX_MTU - sizeof(struct ipv6hdr);
+ dev->flags |= IFF_NOARP;
+ dev->addr_len = sizeof(struct in6_addr);
+ netif_keep_dst(dev);
+ /* This perm addr will be used as interface identifier by IPv6 */
+ dev->addr_assign_type = NET_ADDR_RANDOM;
+ eth_random_addr(dev->perm_addr);
+}
+
+/**
+ * vti6_dev_init_gen - general initializer for all tunnel devices
+ * @dev: virtual device associated with tunnel
+ **/
+static inline int vti6_dev_init_gen(struct net_device *dev)
+{
+ struct ip6_tnl *t = netdev_priv(dev);
+
+ t->dev = dev;
+ netdev_hold(dev, &t->dev_tracker, GFP_KERNEL);
+ netdev_lockdep_set_classes(dev);
+ return 0;
+}
+
+/**
+ * vti6_dev_init - initializer for all non fallback tunnel devices
+ * @dev: virtual device associated with tunnel
+ **/
+static int vti6_dev_init(struct net_device *dev)
+{
+ struct ip6_tnl *t = netdev_priv(dev);
+ int err = vti6_dev_init_gen(dev);
+
+ if (err)
+ return err;
+ vti6_link_config(t, true);
+ return 0;
+}
+
+/**
+ * vti6_fb_tnl_dev_init - initializer for fallback tunnel device
+ * @dev: fallback device
+ *
+ * Return: 0
+ **/
+static int __net_init vti6_fb_tnl_dev_init(struct net_device *dev)
+{
+ struct ip6_tnl *t = netdev_priv(dev);
+ struct net *net = dev_net(dev);
+ struct vti6_net *ip6n = net_generic(net, vti6_net_id);
+
+ t->net = net;
+ t->parms.proto = IPPROTO_IPV6;
+
+ rcu_assign_pointer(ip6n->tnls_wc[0], t);
+ return 0;
+}
+
+static int vti6_validate(struct nlattr *tb[], struct nlattr *data[],
+ struct netlink_ext_ack *extack)
+{
+ return 0;
+}
+
+static void vti6_netlink_parms(struct nlattr *data[],
+ struct __ip6_tnl_parm *parms)
+{
+ memset(parms, 0, sizeof(*parms));
+
+ if (!data)
+ return;
+
+ if (data[IFLA_VTI_LINK])
+ parms->link = nla_get_u32(data[IFLA_VTI_LINK]);
+
+ if (data[IFLA_VTI_LOCAL])
+ parms->laddr = nla_get_in6_addr(data[IFLA_VTI_LOCAL]);
+
+ if (data[IFLA_VTI_REMOTE])
+ parms->raddr = nla_get_in6_addr(data[IFLA_VTI_REMOTE]);
+
+ if (data[IFLA_VTI_IKEY])
+ parms->i_key = nla_get_be32(data[IFLA_VTI_IKEY]);
+
+ if (data[IFLA_VTI_OKEY])
+ parms->o_key = nla_get_be32(data[IFLA_VTI_OKEY]);
+
+ if (data[IFLA_VTI_FWMARK])
+ parms->fwmark = nla_get_u32(data[IFLA_VTI_FWMARK]);
+}
+
+static int vti6_newlink(struct net_device *dev,
+ struct rtnl_newlink_params *params,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr **data = params->data;
+ struct ip6_tnl *nt;
+ struct net *net;
+
+ net = params->link_net ? : dev_net(dev);
+ nt = netdev_priv(dev);
+ vti6_netlink_parms(data, &nt->parms);
+
+ nt->parms.proto = IPPROTO_IPV6;
+ nt->net = net;
+
+ if (vti6_locate(net, &nt->parms, 0))
+ return -EEXIST;
+
+ return vti6_tnl_create2(dev);
+}
+
+static void vti6_dellink(struct net_device *dev, struct list_head *head)
+{
+ struct net *net = dev_net(dev);
+ struct vti6_net *ip6n = net_generic(net, vti6_net_id);
+
+ if (dev != ip6n->fb_tnl_dev)
+ unregister_netdevice_queue(dev, head);
+}
+
+static int vti6_changelink(struct net_device *dev, struct nlattr *tb[],
+ struct nlattr *data[],
+ struct netlink_ext_ack *extack)
+{
+ struct ip6_tnl *t;
+ struct __ip6_tnl_parm p;
+ struct net *net = dev_net(dev);
+ struct vti6_net *ip6n = net_generic(net, vti6_net_id);
+
+ if (dev == ip6n->fb_tnl_dev)
+ return -EINVAL;
+
+ vti6_netlink_parms(data, &p);
+
+ t = vti6_locate(net, &p, 0);
+
+ if (t) {
+ if (t->dev != dev)
+ return -EEXIST;
+ } else
+ t = netdev_priv(dev);
+
+ return vti6_update(t, &p, tb && tb[IFLA_MTU]);
+}
+
+static size_t vti6_get_size(const struct net_device *dev)
+{
+ return
+ /* IFLA_VTI_LINK */
+ nla_total_size(4) +
+ /* IFLA_VTI_LOCAL */
+ nla_total_size(sizeof(struct in6_addr)) +
+ /* IFLA_VTI_REMOTE */
+ nla_total_size(sizeof(struct in6_addr)) +
+ /* IFLA_VTI_IKEY */
+ nla_total_size(4) +
+ /* IFLA_VTI_OKEY */
+ nla_total_size(4) +
+ /* IFLA_VTI_FWMARK */
+ nla_total_size(4) +
+ 0;
+}
+
+static int vti6_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{
+ struct ip6_tnl *tunnel = netdev_priv(dev);
+ struct __ip6_tnl_parm *parm = &tunnel->parms;
+
+ if (nla_put_u32(skb, IFLA_VTI_LINK, parm->link) ||
+ nla_put_in6_addr(skb, IFLA_VTI_LOCAL, &parm->laddr) ||
+ nla_put_in6_addr(skb, IFLA_VTI_REMOTE, &parm->raddr) ||
+ nla_put_be32(skb, IFLA_VTI_IKEY, parm->i_key) ||
+ nla_put_be32(skb, IFLA_VTI_OKEY, parm->o_key) ||
+ nla_put_u32(skb, IFLA_VTI_FWMARK, parm->fwmark))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+static const struct nla_policy vti6_policy[IFLA_VTI_MAX + 1] = {
+ [IFLA_VTI_LINK] = { .type = NLA_U32 },
+ [IFLA_VTI_LOCAL] = { .len = sizeof(struct in6_addr) },
+ [IFLA_VTI_REMOTE] = { .len = sizeof(struct in6_addr) },
+ [IFLA_VTI_IKEY] = { .type = NLA_U32 },
+ [IFLA_VTI_OKEY] = { .type = NLA_U32 },
+ [IFLA_VTI_FWMARK] = { .type = NLA_U32 },
+};
+
+static struct rtnl_link_ops vti6_link_ops __read_mostly = {
+ .kind = "vti6",
+ .maxtype = IFLA_VTI_MAX,
+ .policy = vti6_policy,
+ .priv_size = sizeof(struct ip6_tnl),
+ .setup = vti6_dev_setup,
+ .validate = vti6_validate,
+ .newlink = vti6_newlink,
+ .dellink = vti6_dellink,
+ .changelink = vti6_changelink,
+ .get_size = vti6_get_size,
+ .fill_info = vti6_fill_info,
+ .get_link_net = ip6_tnl_get_link_net,
+};
+
+static void __net_exit vti6_exit_rtnl_net(struct net *net, struct list_head *list)
+{
+ struct vti6_net *ip6n = net_generic(net, vti6_net_id);
+ struct ip6_tnl *t;
+ int h;
+
+ for (h = 0; h < IP6_VTI_HASH_SIZE; h++) {
+ t = rtnl_net_dereference(net, ip6n->tnls_r_l[h]);
+ while (t) {
+ unregister_netdevice_queue(t->dev, list);
+ t = rtnl_net_dereference(net, t->next);
+ }
+ }
+
+ t = rtnl_net_dereference(net, ip6n->tnls_wc[0]);
+ if (t)
+ unregister_netdevice_queue(t->dev, list);
+}
+
+static int __net_init vti6_init_net(struct net *net)
+{
+ struct vti6_net *ip6n = net_generic(net, vti6_net_id);
+ struct ip6_tnl *t = NULL;
+ int err;
+
+ ip6n->tnls[0] = ip6n->tnls_wc;
+ ip6n->tnls[1] = ip6n->tnls_r_l;
+
+ if (!net_has_fallback_tunnels(net))
+ return 0;
+ err = -ENOMEM;
+ ip6n->fb_tnl_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6_vti0",
+ NET_NAME_UNKNOWN, vti6_dev_setup);
+
+ if (!ip6n->fb_tnl_dev)
+ goto err_alloc_dev;
+ dev_net_set(ip6n->fb_tnl_dev, net);
+ ip6n->fb_tnl_dev->rtnl_link_ops = &vti6_link_ops;
+
+ err = vti6_fb_tnl_dev_init(ip6n->fb_tnl_dev);
+ if (err < 0)
+ goto err_register;
+
+ err = register_netdev(ip6n->fb_tnl_dev);
+ if (err < 0)
+ goto err_register;
+
+ t = netdev_priv(ip6n->fb_tnl_dev);
+
+ strcpy(t->parms.name, ip6n->fb_tnl_dev->name);
+ return 0;
+
+err_register:
+ free_netdev(ip6n->fb_tnl_dev);
+err_alloc_dev:
+ return err;
+}
+
+static struct pernet_operations vti6_net_ops = {
+ .init = vti6_init_net,
+ .exit_rtnl = vti6_exit_rtnl_net,
+ .id = &vti6_net_id,
+ .size = sizeof(struct vti6_net),
+};
+
+static struct xfrm6_protocol vti_esp6_protocol __read_mostly = {
+ .handler = vti6_rcv,
+ .input_handler = vti6_input_proto,
+ .cb_handler = vti6_rcv_cb,
+ .err_handler = vti6_err,
+ .priority = 100,
+};
+
+static struct xfrm6_protocol vti_ah6_protocol __read_mostly = {
+ .handler = vti6_rcv,
+ .input_handler = vti6_input_proto,
+ .cb_handler = vti6_rcv_cb,
+ .err_handler = vti6_err,
+ .priority = 100,
+};
+
+static struct xfrm6_protocol vti_ipcomp6_protocol __read_mostly = {
+ .handler = vti6_rcv,
+ .input_handler = vti6_input_proto,
+ .cb_handler = vti6_rcv_cb,
+ .err_handler = vti6_err,
+ .priority = 100,
+};
+
+#if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL)
+static int vti6_rcv_tunnel(struct sk_buff *skb)
+{
+ const xfrm_address_t *saddr;
+ __be32 spi;
+
+ saddr = (const xfrm_address_t *)&ipv6_hdr(skb)->saddr;
+ spi = xfrm6_tunnel_spi_lookup(dev_net(skb->dev), saddr);
+
+ return vti6_input_proto(skb, IPPROTO_IPV6, spi, 0);
+}
+
+static struct xfrm6_tunnel vti_ipv6_handler __read_mostly = {
+ .handler = vti6_rcv_tunnel,
+ .cb_handler = vti6_rcv_cb,
+ .err_handler = vti6_err,
+ .priority = 0,
+};
+
+static struct xfrm6_tunnel vti_ip6ip_handler __read_mostly = {
+ .handler = vti6_rcv_tunnel,
+ .cb_handler = vti6_rcv_cb,
+ .err_handler = vti6_err,
+ .priority = 0,
+};
+#endif
+
+/**
+ * vti6_tunnel_init - register protocol and reserve needed resources
+ *
+ * Return: 0 on success
+ **/
+static int __init vti6_tunnel_init(void)
+{
+ const char *msg;
+ int err;
+
+ msg = "tunnel device";
+ err = register_pernet_device(&vti6_net_ops);
+ if (err < 0)
+ goto pernet_dev_failed;
+
+ msg = "tunnel protocols";
+ err = xfrm6_protocol_register(&vti_esp6_protocol, IPPROTO_ESP);
+ if (err < 0)
+ goto xfrm_proto_esp_failed;
+ err = xfrm6_protocol_register(&vti_ah6_protocol, IPPROTO_AH);
+ if (err < 0)
+ goto xfrm_proto_ah_failed;
+ err = xfrm6_protocol_register(&vti_ipcomp6_protocol, IPPROTO_COMP);
+ if (err < 0)
+ goto xfrm_proto_comp_failed;
+#if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL)
+ msg = "ipv6 tunnel";
+ err = xfrm6_tunnel_register(&vti_ipv6_handler, AF_INET6);
+ if (err < 0)
+ goto vti_tunnel_ipv6_failed;
+ err = xfrm6_tunnel_register(&vti_ip6ip_handler, AF_INET);
+ if (err < 0)
+ goto vti_tunnel_ip6ip_failed;
+#endif
+
+ msg = "netlink interface";
+ err = rtnl_link_register(&vti6_link_ops);
+ if (err < 0)
+ goto rtnl_link_failed;
+
+ return 0;
+
+rtnl_link_failed:
+#if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL)
+ err = xfrm6_tunnel_deregister(&vti_ip6ip_handler, AF_INET);
+vti_tunnel_ip6ip_failed:
+ err = xfrm6_tunnel_deregister(&vti_ipv6_handler, AF_INET6);
+vti_tunnel_ipv6_failed:
+#endif
+ xfrm6_protocol_deregister(&vti_ipcomp6_protocol, IPPROTO_COMP);
+xfrm_proto_comp_failed:
+ xfrm6_protocol_deregister(&vti_ah6_protocol, IPPROTO_AH);
+xfrm_proto_ah_failed:
+ xfrm6_protocol_deregister(&vti_esp6_protocol, IPPROTO_ESP);
+xfrm_proto_esp_failed:
+ unregister_pernet_device(&vti6_net_ops);
+pernet_dev_failed:
+ pr_err("vti6 init: failed to register %s\n", msg);
+ return err;
+}
+
+/**
+ * vti6_tunnel_cleanup - free resources and unregister protocol
+ **/
+static void __exit vti6_tunnel_cleanup(void)
+{
+ rtnl_link_unregister(&vti6_link_ops);
+#if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL)
+ xfrm6_tunnel_deregister(&vti_ip6ip_handler, AF_INET);
+ xfrm6_tunnel_deregister(&vti_ipv6_handler, AF_INET6);
+#endif
+ xfrm6_protocol_deregister(&vti_ipcomp6_protocol, IPPROTO_COMP);
+ xfrm6_protocol_deregister(&vti_ah6_protocol, IPPROTO_AH);
+ xfrm6_protocol_deregister(&vti_esp6_protocol, IPPROTO_ESP);
+ unregister_pernet_device(&vti6_net_ops);
+}
+
+module_init(vti6_tunnel_init);
+module_exit(vti6_tunnel_cleanup);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_RTNL_LINK("vti6");
+MODULE_ALIAS_NETDEV("ip6_vti0");
+MODULE_AUTHOR("Steffen Klassert");
+MODULE_DESCRIPTION("IPv6 virtual tunnel interface");
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
new file mode 100644
index 000000000000..e047a4680ab0
--- /dev/null
+++ b/net/ipv6/ip6mr.c
@@ -0,0 +1,2774 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Linux IPv6 multicast routing support for BSD pim6sd
+ * Based on net/ipv4/ipmr.c.
+ *
+ * (c) 2004 Mickael Hoerdt, <hoerdt@clarinet.u-strasbg.fr>
+ * LSIIT Laboratory, Strasbourg, France
+ * (c) 2004 Jean-Philippe Andriot, <jean-philippe.andriot@6WIND.com>
+ * 6WIND, Paris, France
+ * Copyright (C)2007,2008 USAGI/WIDE Project
+ * YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
+ */
+
+#include <linux/uaccess.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/kernel.h>
+#include <linux/fcntl.h>
+#include <linux/stat.h>
+#include <linux/socket.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/init.h>
+#include <linux/compat.h>
+#include <linux/rhashtable.h>
+#include <net/protocol.h>
+#include <linux/skbuff.h>
+#include <net/raw.h>
+#include <linux/notifier.h>
+#include <linux/if_arp.h>
+#include <net/checksum.h>
+#include <net/netlink.h>
+#include <net/fib_rules.h>
+
+#include <net/ipv6.h>
+#include <net/ip6_route.h>
+#include <linux/mroute6.h>
+#include <linux/pim.h>
+#include <net/addrconf.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/export.h>
+#include <net/ip6_checksum.h>
+#include <linux/netconf.h>
+#include <net/ip_tunnels.h>
+
+#include <linux/nospec.h>
+
+struct ip6mr_rule {
+ struct fib_rule common;
+};
+
+struct ip6mr_result {
+ struct mr_table *mrt;
+};
+
+/* Big lock, protecting vif table, mrt cache and mroute socket state.
+ Note that the changes are semaphored via rtnl_lock.
+ */
+
+static DEFINE_SPINLOCK(mrt_lock);
+
+static struct net_device *vif_dev_read(const struct vif_device *vif)
+{
+ return rcu_dereference(vif->dev);
+}
+
+/* Multicast router control variables */
+
+/* Special spinlock for queue of unresolved entries */
+static DEFINE_SPINLOCK(mfc_unres_lock);
+
+/* We return to original Alan's scheme. Hash table of resolved
+ entries is changed only in process context and protected
+ with weak lock mrt_lock. Queue of unresolved entries is protected
+ with strong spinlock mfc_unres_lock.
+
+ In this case data path is free of exclusive locks at all.
+ */
+
+static struct kmem_cache *mrt_cachep __read_mostly;
+
+static struct mr_table *ip6mr_new_table(struct net *net, u32 id);
+static void ip6mr_free_table(struct mr_table *mrt);
+
+static void ip6_mr_forward(struct net *net, struct mr_table *mrt,
+ struct net_device *dev, struct sk_buff *skb,
+ struct mfc6_cache *cache);
+static int ip6mr_cache_report(const struct mr_table *mrt, struct sk_buff *pkt,
+ mifi_t mifi, int assert);
+static void mr6_netlink_event(struct mr_table *mrt, struct mfc6_cache *mfc,
+ int cmd);
+static void mrt6msg_netlink_event(const struct mr_table *mrt, struct sk_buff *pkt);
+static int ip6mr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack);
+static int ip6mr_rtm_dumproute(struct sk_buff *skb,
+ struct netlink_callback *cb);
+static void mroute_clean_tables(struct mr_table *mrt, int flags);
+static void ipmr_expire_process(struct timer_list *t);
+
+#ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES
+#define ip6mr_for_each_table(mrt, net) \
+ list_for_each_entry_rcu(mrt, &net->ipv6.mr6_tables, list, \
+ lockdep_rtnl_is_held() || \
+ list_empty(&net->ipv6.mr6_tables))
+
+static struct mr_table *ip6mr_mr_table_iter(struct net *net,
+ struct mr_table *mrt)
+{
+ struct mr_table *ret;
+
+ if (!mrt)
+ ret = list_entry_rcu(net->ipv6.mr6_tables.next,
+ struct mr_table, list);
+ else
+ ret = list_entry_rcu(mrt->list.next,
+ struct mr_table, list);
+
+ if (&ret->list == &net->ipv6.mr6_tables)
+ return NULL;
+ return ret;
+}
+
+static struct mr_table *__ip6mr_get_table(struct net *net, u32 id)
+{
+ struct mr_table *mrt;
+
+ ip6mr_for_each_table(mrt, net) {
+ if (mrt->id == id)
+ return mrt;
+ }
+ return NULL;
+}
+
+static struct mr_table *ip6mr_get_table(struct net *net, u32 id)
+{
+ struct mr_table *mrt;
+
+ rcu_read_lock();
+ mrt = __ip6mr_get_table(net, id);
+ rcu_read_unlock();
+ return mrt;
+}
+
+static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6,
+ struct mr_table **mrt)
+{
+ int err;
+ struct ip6mr_result res;
+ struct fib_lookup_arg arg = {
+ .result = &res,
+ .flags = FIB_LOOKUP_NOREF,
+ };
+
+ /* update flow if oif or iif point to device enslaved to l3mdev */
+ l3mdev_update_flow(net, flowi6_to_flowi(flp6));
+
+ err = fib_rules_lookup(net->ipv6.mr6_rules_ops,
+ flowi6_to_flowi(flp6), 0, &arg);
+ if (err < 0)
+ return err;
+ *mrt = res.mrt;
+ return 0;
+}
+
+static int ip6mr_rule_action(struct fib_rule *rule, struct flowi *flp,
+ int flags, struct fib_lookup_arg *arg)
+{
+ struct ip6mr_result *res = arg->result;
+ struct mr_table *mrt;
+
+ switch (rule->action) {
+ case FR_ACT_TO_TBL:
+ break;
+ case FR_ACT_UNREACHABLE:
+ return -ENETUNREACH;
+ case FR_ACT_PROHIBIT:
+ return -EACCES;
+ case FR_ACT_BLACKHOLE:
+ default:
+ return -EINVAL;
+ }
+
+ arg->table = fib_rule_get_table(rule, arg);
+
+ mrt = __ip6mr_get_table(rule->fr_net, arg->table);
+ if (!mrt)
+ return -EAGAIN;
+ res->mrt = mrt;
+ return 0;
+}
+
+static int ip6mr_rule_match(struct fib_rule *rule, struct flowi *flp, int flags)
+{
+ return 1;
+}
+
+static int ip6mr_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
+ struct fib_rule_hdr *frh, struct nlattr **tb,
+ struct netlink_ext_ack *extack)
+{
+ return 0;
+}
+
+static int ip6mr_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
+ struct nlattr **tb)
+{
+ return 1;
+}
+
+static int ip6mr_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
+ struct fib_rule_hdr *frh)
+{
+ frh->dst_len = 0;
+ frh->src_len = 0;
+ frh->tos = 0;
+ return 0;
+}
+
+static const struct fib_rules_ops __net_initconst ip6mr_rules_ops_template = {
+ .family = RTNL_FAMILY_IP6MR,
+ .rule_size = sizeof(struct ip6mr_rule),
+ .addr_size = sizeof(struct in6_addr),
+ .action = ip6mr_rule_action,
+ .match = ip6mr_rule_match,
+ .configure = ip6mr_rule_configure,
+ .compare = ip6mr_rule_compare,
+ .fill = ip6mr_rule_fill,
+ .nlgroup = RTNLGRP_IPV6_RULE,
+ .owner = THIS_MODULE,
+};
+
+static int __net_init ip6mr_rules_init(struct net *net)
+{
+ struct fib_rules_ops *ops;
+ struct mr_table *mrt;
+ int err;
+
+ ops = fib_rules_register(&ip6mr_rules_ops_template, net);
+ if (IS_ERR(ops))
+ return PTR_ERR(ops);
+
+ INIT_LIST_HEAD(&net->ipv6.mr6_tables);
+
+ mrt = ip6mr_new_table(net, RT6_TABLE_DFLT);
+ if (IS_ERR(mrt)) {
+ err = PTR_ERR(mrt);
+ goto err1;
+ }
+
+ err = fib_default_rule_add(ops, 0x7fff, RT6_TABLE_DFLT);
+ if (err < 0)
+ goto err2;
+
+ net->ipv6.mr6_rules_ops = ops;
+ return 0;
+
+err2:
+ rtnl_lock();
+ ip6mr_free_table(mrt);
+ rtnl_unlock();
+err1:
+ fib_rules_unregister(ops);
+ return err;
+}
+
+static void __net_exit ip6mr_rules_exit(struct net *net)
+{
+ struct mr_table *mrt, *next;
+
+ ASSERT_RTNL();
+ list_for_each_entry_safe(mrt, next, &net->ipv6.mr6_tables, list) {
+ list_del(&mrt->list);
+ ip6mr_free_table(mrt);
+ }
+ fib_rules_unregister(net->ipv6.mr6_rules_ops);
+}
+
+static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb,
+ struct netlink_ext_ack *extack)
+{
+ return fib_rules_dump(net, nb, RTNL_FAMILY_IP6MR, extack);
+}
+
+static unsigned int ip6mr_rules_seq_read(const struct net *net)
+{
+ return fib_rules_seq_read(net, RTNL_FAMILY_IP6MR);
+}
+
+bool ip6mr_rule_default(const struct fib_rule *rule)
+{
+ return fib_rule_matchall(rule) && rule->action == FR_ACT_TO_TBL &&
+ rule->table == RT6_TABLE_DFLT && !rule->l3mdev;
+}
+EXPORT_SYMBOL(ip6mr_rule_default);
+#else
+#define ip6mr_for_each_table(mrt, net) \
+ for (mrt = net->ipv6.mrt6; mrt; mrt = NULL)
+
+static struct mr_table *ip6mr_mr_table_iter(struct net *net,
+ struct mr_table *mrt)
+{
+ if (!mrt)
+ return net->ipv6.mrt6;
+ return NULL;
+}
+
+static struct mr_table *ip6mr_get_table(struct net *net, u32 id)
+{
+ return net->ipv6.mrt6;
+}
+
+#define __ip6mr_get_table ip6mr_get_table
+
+static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6,
+ struct mr_table **mrt)
+{
+ *mrt = net->ipv6.mrt6;
+ return 0;
+}
+
+static int __net_init ip6mr_rules_init(struct net *net)
+{
+ struct mr_table *mrt;
+
+ mrt = ip6mr_new_table(net, RT6_TABLE_DFLT);
+ if (IS_ERR(mrt))
+ return PTR_ERR(mrt);
+ net->ipv6.mrt6 = mrt;
+ return 0;
+}
+
+static void __net_exit ip6mr_rules_exit(struct net *net)
+{
+ ASSERT_RTNL();
+ ip6mr_free_table(net->ipv6.mrt6);
+ net->ipv6.mrt6 = NULL;
+}
+
+static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb,
+ struct netlink_ext_ack *extack)
+{
+ return 0;
+}
+
+static unsigned int ip6mr_rules_seq_read(const struct net *net)
+{
+ return 0;
+}
+#endif
+
+static int ip6mr_hash_cmp(struct rhashtable_compare_arg *arg,
+ const void *ptr)
+{
+ const struct mfc6_cache_cmp_arg *cmparg = arg->key;
+ struct mfc6_cache *c = (struct mfc6_cache *)ptr;
+
+ return !ipv6_addr_equal(&c->mf6c_mcastgrp, &cmparg->mf6c_mcastgrp) ||
+ !ipv6_addr_equal(&c->mf6c_origin, &cmparg->mf6c_origin);
+}
+
+static const struct rhashtable_params ip6mr_rht_params = {
+ .head_offset = offsetof(struct mr_mfc, mnode),
+ .key_offset = offsetof(struct mfc6_cache, cmparg),
+ .key_len = sizeof(struct mfc6_cache_cmp_arg),
+ .nelem_hint = 3,
+ .obj_cmpfn = ip6mr_hash_cmp,
+ .automatic_shrinking = true,
+};
+
+static void ip6mr_new_table_set(struct mr_table *mrt,
+ struct net *net)
+{
+#ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES
+ list_add_tail_rcu(&mrt->list, &net->ipv6.mr6_tables);
+#endif
+}
+
+static struct mfc6_cache_cmp_arg ip6mr_mr_table_ops_cmparg_any = {
+ .mf6c_origin = IN6ADDR_ANY_INIT,
+ .mf6c_mcastgrp = IN6ADDR_ANY_INIT,
+};
+
+static struct mr_table_ops ip6mr_mr_table_ops = {
+ .rht_params = &ip6mr_rht_params,
+ .cmparg_any = &ip6mr_mr_table_ops_cmparg_any,
+};
+
+static struct mr_table *ip6mr_new_table(struct net *net, u32 id)
+{
+ struct mr_table *mrt;
+
+ mrt = __ip6mr_get_table(net, id);
+ if (mrt)
+ return mrt;
+
+ return mr_table_alloc(net, id, &ip6mr_mr_table_ops,
+ ipmr_expire_process, ip6mr_new_table_set);
+}
+
+static void ip6mr_free_table(struct mr_table *mrt)
+{
+ struct net *net = read_pnet(&mrt->net);
+
+ WARN_ON_ONCE(!mr_can_free_table(net));
+
+ timer_shutdown_sync(&mrt->ipmr_expire_timer);
+ mroute_clean_tables(mrt, MRT6_FLUSH_MIFS | MRT6_FLUSH_MIFS_STATIC |
+ MRT6_FLUSH_MFC | MRT6_FLUSH_MFC_STATIC);
+ rhltable_destroy(&mrt->mfc_hash);
+ kfree(mrt);
+}
+
+#ifdef CONFIG_PROC_FS
+/* The /proc interfaces to multicast routing
+ * /proc/ip6_mr_cache /proc/ip6_mr_vif
+ */
+
+static void *ip6mr_vif_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(RCU)
+{
+ struct mr_vif_iter *iter = seq->private;
+ struct net *net = seq_file_net(seq);
+ struct mr_table *mrt;
+
+ rcu_read_lock();
+ mrt = __ip6mr_get_table(net, RT6_TABLE_DFLT);
+ if (!mrt) {
+ rcu_read_unlock();
+ return ERR_PTR(-ENOENT);
+ }
+
+ iter->mrt = mrt;
+
+ return mr_vif_seq_start(seq, pos);
+}
+
+static void ip6mr_vif_seq_stop(struct seq_file *seq, void *v)
+ __releases(RCU)
+{
+ rcu_read_unlock();
+}
+
+static int ip6mr_vif_seq_show(struct seq_file *seq, void *v)
+{
+ struct mr_vif_iter *iter = seq->private;
+ struct mr_table *mrt = iter->mrt;
+
+ if (v == SEQ_START_TOKEN) {
+ seq_puts(seq,
+ "Interface BytesIn PktsIn BytesOut PktsOut Flags\n");
+ } else {
+ const struct vif_device *vif = v;
+ const struct net_device *vif_dev;
+ const char *name;
+
+ vif_dev = vif_dev_read(vif);
+ name = vif_dev ? vif_dev->name : "none";
+
+ seq_printf(seq,
+ "%2td %-10s %8ld %7ld %8ld %7ld %05X\n",
+ vif - mrt->vif_table,
+ name, vif->bytes_in, vif->pkt_in,
+ vif->bytes_out, vif->pkt_out,
+ vif->flags);
+ }
+ return 0;
+}
+
+static const struct seq_operations ip6mr_vif_seq_ops = {
+ .start = ip6mr_vif_seq_start,
+ .next = mr_vif_seq_next,
+ .stop = ip6mr_vif_seq_stop,
+ .show = ip6mr_vif_seq_show,
+};
+
+static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct net *net = seq_file_net(seq);
+ struct mr_table *mrt;
+
+ mrt = ip6mr_get_table(net, RT6_TABLE_DFLT);
+ if (!mrt)
+ return ERR_PTR(-ENOENT);
+
+ return mr_mfc_seq_start(seq, pos, mrt, &mfc_unres_lock);
+}
+
+static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
+{
+ int n;
+
+ if (v == SEQ_START_TOKEN) {
+ seq_puts(seq,
+ "Group "
+ "Origin "
+ "Iif Pkts Bytes Wrong Oifs\n");
+ } else {
+ const struct mfc6_cache *mfc = v;
+ const struct mr_mfc_iter *it = seq->private;
+ struct mr_table *mrt = it->mrt;
+
+ seq_printf(seq, "%pI6 %pI6 %-3hd",
+ &mfc->mf6c_mcastgrp, &mfc->mf6c_origin,
+ mfc->_c.mfc_parent);
+
+ if (it->cache != &mrt->mfc_unres_queue) {
+ seq_printf(seq, " %8lu %8lu %8lu",
+ atomic_long_read(&mfc->_c.mfc_un.res.pkt),
+ atomic_long_read(&mfc->_c.mfc_un.res.bytes),
+ atomic_long_read(&mfc->_c.mfc_un.res.wrong_if));
+ for (n = mfc->_c.mfc_un.res.minvif;
+ n < mfc->_c.mfc_un.res.maxvif; n++) {
+ if (VIF_EXISTS(mrt, n) &&
+ mfc->_c.mfc_un.res.ttls[n] < 255)
+ seq_printf(seq,
+ " %2d:%-3d", n,
+ mfc->_c.mfc_un.res.ttls[n]);
+ }
+ } else {
+ /* unresolved mfc_caches don't contain
+ * pkt, bytes and wrong_if values
+ */
+ seq_printf(seq, " %8lu %8lu %8lu", 0ul, 0ul, 0ul);
+ }
+ seq_putc(seq, '\n');
+ }
+ return 0;
+}
+
+static const struct seq_operations ipmr_mfc_seq_ops = {
+ .start = ipmr_mfc_seq_start,
+ .next = mr_mfc_seq_next,
+ .stop = mr_mfc_seq_stop,
+ .show = ipmr_mfc_seq_show,
+};
+#endif
+
+#ifdef CONFIG_IPV6_PIMSM_V2
+
+static int pim6_rcv(struct sk_buff *skb)
+{
+ struct pimreghdr *pim;
+ struct ipv6hdr *encap;
+ struct net_device *reg_dev = NULL;
+ struct net *net = dev_net(skb->dev);
+ struct mr_table *mrt;
+ struct flowi6 fl6 = {
+ .flowi6_iif = skb->dev->ifindex,
+ .flowi6_mark = skb->mark,
+ };
+ int reg_vif_num;
+
+ if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
+ goto drop;
+
+ pim = (struct pimreghdr *)skb_transport_header(skb);
+ if (pim->type != ((PIM_VERSION << 4) | PIM_TYPE_REGISTER) ||
+ (pim->flags & PIM_NULL_REGISTER) ||
+ (csum_ipv6_magic(&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr,
+ sizeof(*pim), IPPROTO_PIM,
+ csum_partial((void *)pim, sizeof(*pim), 0)) &&
+ csum_fold(skb_checksum(skb, 0, skb->len, 0))))
+ goto drop;
+
+ /* check if the inner packet is destined to mcast group */
+ encap = (struct ipv6hdr *)(skb_transport_header(skb) +
+ sizeof(*pim));
+
+ if (!ipv6_addr_is_multicast(&encap->daddr) ||
+ encap->payload_len == 0 ||
+ ntohs(encap->payload_len) + sizeof(*pim) > skb->len)
+ goto drop;
+
+ if (ip6mr_fib_lookup(net, &fl6, &mrt) < 0)
+ goto drop;
+
+ /* Pairs with WRITE_ONCE() in mif6_add()/mif6_delete() */
+ reg_vif_num = READ_ONCE(mrt->mroute_reg_vif_num);
+ if (reg_vif_num >= 0)
+ reg_dev = vif_dev_read(&mrt->vif_table[reg_vif_num]);
+
+ if (!reg_dev)
+ goto drop;
+
+ skb->mac_header = skb->network_header;
+ skb_pull(skb, (u8 *)encap - skb->data);
+ skb_reset_network_header(skb);
+ skb->protocol = htons(ETH_P_IPV6);
+ skb->ip_summed = CHECKSUM_NONE;
+
+ skb_tunnel_rx(skb, reg_dev, dev_net(reg_dev));
+
+ netif_rx(skb);
+
+ return 0;
+ drop:
+ kfree_skb(skb);
+ return 0;
+}
+
+static const struct inet6_protocol pim6_protocol = {
+ .handler = pim6_rcv,
+};
+
+/* Service routines creating virtual interfaces: PIMREG */
+
+static netdev_tx_t reg_vif_xmit(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ struct net *net = dev_net(dev);
+ struct mr_table *mrt;
+ struct flowi6 fl6 = {
+ .flowi6_oif = dev->ifindex,
+ .flowi6_iif = skb->skb_iif ? : LOOPBACK_IFINDEX,
+ .flowi6_mark = skb->mark,
+ };
+
+ if (!pskb_inet_may_pull(skb))
+ goto tx_err;
+
+ if (ip6mr_fib_lookup(net, &fl6, &mrt) < 0)
+ goto tx_err;
+
+ DEV_STATS_ADD(dev, tx_bytes, skb->len);
+ DEV_STATS_INC(dev, tx_packets);
+ rcu_read_lock();
+ ip6mr_cache_report(mrt, skb, READ_ONCE(mrt->mroute_reg_vif_num),
+ MRT6MSG_WHOLEPKT);
+ rcu_read_unlock();
+ kfree_skb(skb);
+ return NETDEV_TX_OK;
+
+tx_err:
+ DEV_STATS_INC(dev, tx_errors);
+ kfree_skb(skb);
+ return NETDEV_TX_OK;
+}
+
+static int reg_vif_get_iflink(const struct net_device *dev)
+{
+ return 0;
+}
+
+static const struct net_device_ops reg_vif_netdev_ops = {
+ .ndo_start_xmit = reg_vif_xmit,
+ .ndo_get_iflink = reg_vif_get_iflink,
+};
+
+static void reg_vif_setup(struct net_device *dev)
+{
+ dev->type = ARPHRD_PIMREG;
+ dev->mtu = 1500 - sizeof(struct ipv6hdr) - 8;
+ dev->flags = IFF_NOARP;
+ dev->netdev_ops = &reg_vif_netdev_ops;
+ dev->needs_free_netdev = true;
+ dev->netns_immutable = true;
+}
+
+static struct net_device *ip6mr_reg_vif(struct net *net, struct mr_table *mrt)
+{
+ struct net_device *dev;
+ char name[IFNAMSIZ];
+
+ if (mrt->id == RT6_TABLE_DFLT)
+ sprintf(name, "pim6reg");
+ else
+ sprintf(name, "pim6reg%u", mrt->id);
+
+ dev = alloc_netdev(0, name, NET_NAME_UNKNOWN, reg_vif_setup);
+ if (!dev)
+ return NULL;
+
+ dev_net_set(dev, net);
+
+ if (register_netdevice(dev)) {
+ free_netdev(dev);
+ return NULL;
+ }
+
+ if (dev_open(dev, NULL))
+ goto failure;
+
+ dev_hold(dev);
+ return dev;
+
+failure:
+ unregister_netdevice(dev);
+ return NULL;
+}
+#endif
+
+static int call_ip6mr_vif_entry_notifiers(struct net *net,
+ enum fib_event_type event_type,
+ struct vif_device *vif,
+ struct net_device *vif_dev,
+ mifi_t vif_index, u32 tb_id)
+{
+ return mr_call_vif_notifiers(net, RTNL_FAMILY_IP6MR, event_type,
+ vif, vif_dev, vif_index, tb_id,
+ &net->ipv6.ipmr_seq);
+}
+
+static int call_ip6mr_mfc_entry_notifiers(struct net *net,
+ enum fib_event_type event_type,
+ struct mfc6_cache *mfc, u32 tb_id)
+{
+ return mr_call_mfc_notifiers(net, RTNL_FAMILY_IP6MR, event_type,
+ &mfc->_c, tb_id, &net->ipv6.ipmr_seq);
+}
+
+/* Delete a VIF entry */
+static int mif6_delete(struct mr_table *mrt, int vifi, int notify,
+ struct list_head *head)
+{
+ struct vif_device *v;
+ struct net_device *dev;
+ struct inet6_dev *in6_dev;
+
+ if (vifi < 0 || vifi >= mrt->maxvif)
+ return -EADDRNOTAVAIL;
+
+ v = &mrt->vif_table[vifi];
+
+ dev = rtnl_dereference(v->dev);
+ if (!dev)
+ return -EADDRNOTAVAIL;
+
+ call_ip6mr_vif_entry_notifiers(read_pnet(&mrt->net),
+ FIB_EVENT_VIF_DEL, v, dev,
+ vifi, mrt->id);
+ spin_lock(&mrt_lock);
+ RCU_INIT_POINTER(v->dev, NULL);
+
+#ifdef CONFIG_IPV6_PIMSM_V2
+ if (vifi == mrt->mroute_reg_vif_num) {
+ /* Pairs with READ_ONCE() in ip6mr_cache_report() and reg_vif_xmit() */
+ WRITE_ONCE(mrt->mroute_reg_vif_num, -1);
+ }
+#endif
+
+ if (vifi + 1 == mrt->maxvif) {
+ int tmp;
+ for (tmp = vifi - 1; tmp >= 0; tmp--) {
+ if (VIF_EXISTS(mrt, tmp))
+ break;
+ }
+ WRITE_ONCE(mrt->maxvif, tmp + 1);
+ }
+
+ spin_unlock(&mrt_lock);
+
+ dev_set_allmulti(dev, -1);
+
+ in6_dev = __in6_dev_get(dev);
+ if (in6_dev) {
+ atomic_dec(&in6_dev->cnf.mc_forwarding);
+ inet6_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF,
+ NETCONFA_MC_FORWARDING,
+ dev->ifindex, &in6_dev->cnf);
+ }
+
+ if ((v->flags & MIFF_REGISTER) && !notify)
+ unregister_netdevice_queue(dev, head);
+
+ netdev_put(dev, &v->dev_tracker);
+ return 0;
+}
+
+static inline void ip6mr_cache_free_rcu(struct rcu_head *head)
+{
+ struct mr_mfc *c = container_of(head, struct mr_mfc, rcu);
+
+ kmem_cache_free(mrt_cachep, (struct mfc6_cache *)c);
+}
+
+static inline void ip6mr_cache_free(struct mfc6_cache *c)
+{
+ call_rcu(&c->_c.rcu, ip6mr_cache_free_rcu);
+}
+
+/* Destroy an unresolved cache entry, killing queued skbs
+ and reporting error to netlink readers.
+ */
+
+static void ip6mr_destroy_unres(struct mr_table *mrt, struct mfc6_cache *c)
+{
+ struct net *net = read_pnet(&mrt->net);
+ struct sk_buff *skb;
+
+ atomic_dec(&mrt->cache_resolve_queue_len);
+
+ while ((skb = skb_dequeue(&c->_c.mfc_un.unres.unresolved)) != NULL) {
+ if (ipv6_hdr(skb)->version == 0) {
+ struct nlmsghdr *nlh = skb_pull(skb,
+ sizeof(struct ipv6hdr));
+ nlh->nlmsg_type = NLMSG_ERROR;
+ nlh->nlmsg_len = nlmsg_msg_size(sizeof(struct nlmsgerr));
+ skb_trim(skb, nlh->nlmsg_len);
+ ((struct nlmsgerr *)nlmsg_data(nlh))->error = -ETIMEDOUT;
+ rtnl_unicast(skb, net, NETLINK_CB(skb).portid);
+ } else
+ kfree_skb(skb);
+ }
+
+ ip6mr_cache_free(c);
+}
+
+
+/* Timer process for all the unresolved queue. */
+
+static void ipmr_do_expire_process(struct mr_table *mrt)
+{
+ unsigned long now = jiffies;
+ unsigned long expires = 10 * HZ;
+ struct mr_mfc *c, *next;
+
+ list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) {
+ if (time_after(c->mfc_un.unres.expires, now)) {
+ /* not yet... */
+ unsigned long interval = c->mfc_un.unres.expires - now;
+ if (interval < expires)
+ expires = interval;
+ continue;
+ }
+
+ list_del(&c->list);
+ mr6_netlink_event(mrt, (struct mfc6_cache *)c, RTM_DELROUTE);
+ ip6mr_destroy_unres(mrt, (struct mfc6_cache *)c);
+ }
+
+ if (!list_empty(&mrt->mfc_unres_queue))
+ mod_timer(&mrt->ipmr_expire_timer, jiffies + expires);
+}
+
+static void ipmr_expire_process(struct timer_list *t)
+{
+ struct mr_table *mrt = timer_container_of(mrt, t, ipmr_expire_timer);
+
+ if (!spin_trylock(&mfc_unres_lock)) {
+ mod_timer(&mrt->ipmr_expire_timer, jiffies + 1);
+ return;
+ }
+
+ if (!list_empty(&mrt->mfc_unres_queue))
+ ipmr_do_expire_process(mrt);
+
+ spin_unlock(&mfc_unres_lock);
+}
+
+/* Fill oifs list. It is called under locked mrt_lock. */
+
+static void ip6mr_update_thresholds(struct mr_table *mrt,
+ struct mr_mfc *cache,
+ unsigned char *ttls)
+{
+ int vifi;
+
+ cache->mfc_un.res.minvif = MAXMIFS;
+ cache->mfc_un.res.maxvif = 0;
+ memset(cache->mfc_un.res.ttls, 255, MAXMIFS);
+
+ for (vifi = 0; vifi < mrt->maxvif; vifi++) {
+ if (VIF_EXISTS(mrt, vifi) &&
+ ttls[vifi] && ttls[vifi] < 255) {
+ cache->mfc_un.res.ttls[vifi] = ttls[vifi];
+ if (cache->mfc_un.res.minvif > vifi)
+ cache->mfc_un.res.minvif = vifi;
+ if (cache->mfc_un.res.maxvif <= vifi)
+ cache->mfc_un.res.maxvif = vifi + 1;
+ }
+ }
+ WRITE_ONCE(cache->mfc_un.res.lastuse, jiffies);
+}
+
+static int mif6_add(struct net *net, struct mr_table *mrt,
+ struct mif6ctl *vifc, int mrtsock)
+{
+ int vifi = vifc->mif6c_mifi;
+ struct vif_device *v = &mrt->vif_table[vifi];
+ struct net_device *dev;
+ struct inet6_dev *in6_dev;
+ int err;
+
+ /* Is vif busy ? */
+ if (VIF_EXISTS(mrt, vifi))
+ return -EADDRINUSE;
+
+ switch (vifc->mif6c_flags) {
+#ifdef CONFIG_IPV6_PIMSM_V2
+ case MIFF_REGISTER:
+ /*
+ * Special Purpose VIF in PIM
+ * All the packets will be sent to the daemon
+ */
+ if (mrt->mroute_reg_vif_num >= 0)
+ return -EADDRINUSE;
+ dev = ip6mr_reg_vif(net, mrt);
+ if (!dev)
+ return -ENOBUFS;
+ err = dev_set_allmulti(dev, 1);
+ if (err) {
+ unregister_netdevice(dev);
+ dev_put(dev);
+ return err;
+ }
+ break;
+#endif
+ case 0:
+ dev = dev_get_by_index(net, vifc->mif6c_pifi);
+ if (!dev)
+ return -EADDRNOTAVAIL;
+ err = dev_set_allmulti(dev, 1);
+ if (err) {
+ dev_put(dev);
+ return err;
+ }
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ in6_dev = __in6_dev_get(dev);
+ if (in6_dev) {
+ atomic_inc(&in6_dev->cnf.mc_forwarding);
+ inet6_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF,
+ NETCONFA_MC_FORWARDING,
+ dev->ifindex, &in6_dev->cnf);
+ }
+
+ /* Fill in the VIF structures */
+ vif_device_init(v, dev, vifc->vifc_rate_limit, vifc->vifc_threshold,
+ vifc->mif6c_flags | (!mrtsock ? VIFF_STATIC : 0),
+ MIFF_REGISTER);
+
+ /* And finish update writing critical data */
+ spin_lock(&mrt_lock);
+ rcu_assign_pointer(v->dev, dev);
+ netdev_tracker_alloc(dev, &v->dev_tracker, GFP_ATOMIC);
+#ifdef CONFIG_IPV6_PIMSM_V2
+ if (v->flags & MIFF_REGISTER)
+ WRITE_ONCE(mrt->mroute_reg_vif_num, vifi);
+#endif
+ if (vifi + 1 > mrt->maxvif)
+ WRITE_ONCE(mrt->maxvif, vifi + 1);
+ spin_unlock(&mrt_lock);
+ call_ip6mr_vif_entry_notifiers(net, FIB_EVENT_VIF_ADD,
+ v, dev, vifi, mrt->id);
+ return 0;
+}
+
+static struct mfc6_cache *ip6mr_cache_find(struct mr_table *mrt,
+ const struct in6_addr *origin,
+ const struct in6_addr *mcastgrp)
+{
+ struct mfc6_cache_cmp_arg arg = {
+ .mf6c_origin = *origin,
+ .mf6c_mcastgrp = *mcastgrp,
+ };
+
+ return mr_mfc_find(mrt, &arg);
+}
+
+/* Look for a (*,G) entry */
+static struct mfc6_cache *ip6mr_cache_find_any(struct mr_table *mrt,
+ struct in6_addr *mcastgrp,
+ mifi_t mifi)
+{
+ struct mfc6_cache_cmp_arg arg = {
+ .mf6c_origin = in6addr_any,
+ .mf6c_mcastgrp = *mcastgrp,
+ };
+
+ if (ipv6_addr_any(mcastgrp))
+ return mr_mfc_find_any_parent(mrt, mifi);
+ return mr_mfc_find_any(mrt, mifi, &arg);
+}
+
+/* Look for a (S,G,iif) entry if parent != -1 */
+static struct mfc6_cache *
+ip6mr_cache_find_parent(struct mr_table *mrt,
+ const struct in6_addr *origin,
+ const struct in6_addr *mcastgrp,
+ int parent)
+{
+ struct mfc6_cache_cmp_arg arg = {
+ .mf6c_origin = *origin,
+ .mf6c_mcastgrp = *mcastgrp,
+ };
+
+ return mr_mfc_find_parent(mrt, &arg, parent);
+}
+
+/* Allocate a multicast cache entry */
+static struct mfc6_cache *ip6mr_cache_alloc(void)
+{
+ struct mfc6_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
+ if (!c)
+ return NULL;
+ c->_c.mfc_un.res.last_assert = jiffies - MFC_ASSERT_THRESH - 1;
+ c->_c.mfc_un.res.minvif = MAXMIFS;
+ c->_c.free = ip6mr_cache_free_rcu;
+ refcount_set(&c->_c.mfc_un.res.refcount, 1);
+ return c;
+}
+
+static struct mfc6_cache *ip6mr_cache_alloc_unres(void)
+{
+ struct mfc6_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
+ if (!c)
+ return NULL;
+ skb_queue_head_init(&c->_c.mfc_un.unres.unresolved);
+ c->_c.mfc_un.unres.expires = jiffies + 10 * HZ;
+ return c;
+}
+
+/*
+ * A cache entry has gone into a resolved state from queued
+ */
+
+static void ip6mr_cache_resolve(struct net *net, struct mr_table *mrt,
+ struct mfc6_cache *uc, struct mfc6_cache *c)
+{
+ struct sk_buff *skb;
+
+ /*
+ * Play the pending entries through our router
+ */
+
+ while ((skb = __skb_dequeue(&uc->_c.mfc_un.unres.unresolved))) {
+ if (ipv6_hdr(skb)->version == 0) {
+ struct nlmsghdr *nlh = skb_pull(skb,
+ sizeof(struct ipv6hdr));
+
+ if (mr_fill_mroute(mrt, skb, &c->_c,
+ nlmsg_data(nlh)) > 0) {
+ nlh->nlmsg_len = skb_tail_pointer(skb) - (u8 *)nlh;
+ } else {
+ nlh->nlmsg_type = NLMSG_ERROR;
+ nlh->nlmsg_len = nlmsg_msg_size(sizeof(struct nlmsgerr));
+ skb_trim(skb, nlh->nlmsg_len);
+ ((struct nlmsgerr *)nlmsg_data(nlh))->error = -EMSGSIZE;
+ }
+ rtnl_unicast(skb, net, NETLINK_CB(skb).portid);
+ } else {
+ rcu_read_lock();
+ ip6_mr_forward(net, mrt, skb->dev, skb, c);
+ rcu_read_unlock();
+ }
+ }
+}
+
+/*
+ * Bounce a cache query up to pim6sd and netlink.
+ *
+ * Called under rcu_read_lock()
+ */
+
+static int ip6mr_cache_report(const struct mr_table *mrt, struct sk_buff *pkt,
+ mifi_t mifi, int assert)
+{
+ struct sock *mroute6_sk;
+ struct sk_buff *skb;
+ struct mrt6msg *msg;
+ int ret;
+
+#ifdef CONFIG_IPV6_PIMSM_V2
+ if (assert == MRT6MSG_WHOLEPKT || assert == MRT6MSG_WRMIFWHOLE)
+ skb = skb_realloc_headroom(pkt, -skb_network_offset(pkt)
+ +sizeof(*msg));
+ else
+#endif
+ skb = alloc_skb(sizeof(struct ipv6hdr) + sizeof(*msg), GFP_ATOMIC);
+
+ if (!skb)
+ return -ENOBUFS;
+
+ /* I suppose that internal messages
+ * do not require checksums */
+
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+#ifdef CONFIG_IPV6_PIMSM_V2
+ if (assert == MRT6MSG_WHOLEPKT || assert == MRT6MSG_WRMIFWHOLE) {
+ /* Ugly, but we have no choice with this interface.
+ Duplicate old header, fix length etc.
+ And all this only to mangle msg->im6_msgtype and
+ to set msg->im6_mbz to "mbz" :-)
+ */
+ __skb_pull(skb, skb_network_offset(pkt));
+
+ skb_push(skb, sizeof(*msg));
+ skb_reset_transport_header(skb);
+ msg = (struct mrt6msg *)skb_transport_header(skb);
+ msg->im6_mbz = 0;
+ msg->im6_msgtype = assert;
+ if (assert == MRT6MSG_WRMIFWHOLE)
+ msg->im6_mif = mifi;
+ else
+ msg->im6_mif = READ_ONCE(mrt->mroute_reg_vif_num);
+ msg->im6_pad = 0;
+ msg->im6_src = ipv6_hdr(pkt)->saddr;
+ msg->im6_dst = ipv6_hdr(pkt)->daddr;
+
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ } else
+#endif
+ {
+ /*
+ * Copy the IP header
+ */
+
+ skb_put(skb, sizeof(struct ipv6hdr));
+ skb_reset_network_header(skb);
+ skb_copy_to_linear_data(skb, ipv6_hdr(pkt), sizeof(struct ipv6hdr));
+
+ /*
+ * Add our header
+ */
+ skb_put(skb, sizeof(*msg));
+ skb_reset_transport_header(skb);
+ msg = (struct mrt6msg *)skb_transport_header(skb);
+
+ msg->im6_mbz = 0;
+ msg->im6_msgtype = assert;
+ msg->im6_mif = mifi;
+ msg->im6_pad = 0;
+ msg->im6_src = ipv6_hdr(pkt)->saddr;
+ msg->im6_dst = ipv6_hdr(pkt)->daddr;
+
+ skb_dst_set(skb, dst_clone(skb_dst(pkt)));
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ }
+
+ mroute6_sk = rcu_dereference(mrt->mroute_sk);
+ if (!mroute6_sk) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
+ mrt6msg_netlink_event(mrt, skb);
+
+ /* Deliver to user space multicast routing algorithms */
+ ret = sock_queue_rcv_skb(mroute6_sk, skb);
+
+ if (ret < 0) {
+ net_warn_ratelimited("mroute6: pending queue full, dropping entries\n");
+ kfree_skb(skb);
+ }
+
+ return ret;
+}
+
+/* Queue a packet for resolution. It gets locked cache entry! */
+static int ip6mr_cache_unresolved(struct mr_table *mrt, mifi_t mifi,
+ struct sk_buff *skb, struct net_device *dev)
+{
+ struct mfc6_cache *c;
+ bool found = false;
+ int err;
+
+ spin_lock_bh(&mfc_unres_lock);
+ list_for_each_entry(c, &mrt->mfc_unres_queue, _c.list) {
+ if (ipv6_addr_equal(&c->mf6c_mcastgrp, &ipv6_hdr(skb)->daddr) &&
+ ipv6_addr_equal(&c->mf6c_origin, &ipv6_hdr(skb)->saddr)) {
+ found = true;
+ break;
+ }
+ }
+
+ if (!found) {
+ /*
+ * Create a new entry if allowable
+ */
+
+ c = ip6mr_cache_alloc_unres();
+ if (!c) {
+ spin_unlock_bh(&mfc_unres_lock);
+
+ kfree_skb(skb);
+ return -ENOBUFS;
+ }
+
+ /* Fill in the new cache entry */
+ c->_c.mfc_parent = -1;
+ c->mf6c_origin = ipv6_hdr(skb)->saddr;
+ c->mf6c_mcastgrp = ipv6_hdr(skb)->daddr;
+
+ /*
+ * Reflect first query at pim6sd
+ */
+ err = ip6mr_cache_report(mrt, skb, mifi, MRT6MSG_NOCACHE);
+ if (err < 0) {
+ /* If the report failed throw the cache entry
+ out - Brad Parker
+ */
+ spin_unlock_bh(&mfc_unres_lock);
+
+ ip6mr_cache_free(c);
+ kfree_skb(skb);
+ return err;
+ }
+
+ atomic_inc(&mrt->cache_resolve_queue_len);
+ list_add(&c->_c.list, &mrt->mfc_unres_queue);
+ mr6_netlink_event(mrt, c, RTM_NEWROUTE);
+
+ ipmr_do_expire_process(mrt);
+ }
+
+ /* See if we can append the packet */
+ if (c->_c.mfc_un.unres.unresolved.qlen > 3) {
+ kfree_skb(skb);
+ err = -ENOBUFS;
+ } else {
+ if (dev) {
+ skb->dev = dev;
+ skb->skb_iif = dev->ifindex;
+ }
+ skb_queue_tail(&c->_c.mfc_un.unres.unresolved, skb);
+ err = 0;
+ }
+
+ spin_unlock_bh(&mfc_unres_lock);
+ return err;
+}
+
+/*
+ * MFC6 cache manipulation by user space
+ */
+
+static int ip6mr_mfc_delete(struct mr_table *mrt, struct mf6cctl *mfc,
+ int parent)
+{
+ struct mfc6_cache *c;
+
+ /* The entries are added/deleted only under RTNL */
+ rcu_read_lock();
+ c = ip6mr_cache_find_parent(mrt, &mfc->mf6cc_origin.sin6_addr,
+ &mfc->mf6cc_mcastgrp.sin6_addr, parent);
+ rcu_read_unlock();
+ if (!c)
+ return -ENOENT;
+ rhltable_remove(&mrt->mfc_hash, &c->_c.mnode, ip6mr_rht_params);
+ list_del_rcu(&c->_c.list);
+
+ call_ip6mr_mfc_entry_notifiers(read_pnet(&mrt->net),
+ FIB_EVENT_ENTRY_DEL, c, mrt->id);
+ mr6_netlink_event(mrt, c, RTM_DELROUTE);
+ mr_cache_put(&c->_c);
+ return 0;
+}
+
+static int ip6mr_device_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct net *net = dev_net(dev);
+ struct mr_table *mrt;
+ struct vif_device *v;
+ int ct;
+
+ if (event != NETDEV_UNREGISTER)
+ return NOTIFY_DONE;
+
+ ip6mr_for_each_table(mrt, net) {
+ v = &mrt->vif_table[0];
+ for (ct = 0; ct < mrt->maxvif; ct++, v++) {
+ if (rcu_access_pointer(v->dev) == dev)
+ mif6_delete(mrt, ct, 1, NULL);
+ }
+ }
+
+ return NOTIFY_DONE;
+}
+
+static unsigned int ip6mr_seq_read(const struct net *net)
+{
+ return READ_ONCE(net->ipv6.ipmr_seq) + ip6mr_rules_seq_read(net);
+}
+
+static int ip6mr_dump(struct net *net, struct notifier_block *nb,
+ struct netlink_ext_ack *extack)
+{
+ return mr_dump(net, nb, RTNL_FAMILY_IP6MR, ip6mr_rules_dump,
+ ip6mr_mr_table_iter, extack);
+}
+
+static struct notifier_block ip6_mr_notifier = {
+ .notifier_call = ip6mr_device_event
+};
+
+static const struct fib_notifier_ops ip6mr_notifier_ops_template = {
+ .family = RTNL_FAMILY_IP6MR,
+ .fib_seq_read = ip6mr_seq_read,
+ .fib_dump = ip6mr_dump,
+ .owner = THIS_MODULE,
+};
+
+static int __net_init ip6mr_notifier_init(struct net *net)
+{
+ struct fib_notifier_ops *ops;
+
+ net->ipv6.ipmr_seq = 0;
+
+ ops = fib_notifier_ops_register(&ip6mr_notifier_ops_template, net);
+ if (IS_ERR(ops))
+ return PTR_ERR(ops);
+
+ net->ipv6.ip6mr_notifier_ops = ops;
+
+ return 0;
+}
+
+static void __net_exit ip6mr_notifier_exit(struct net *net)
+{
+ fib_notifier_ops_unregister(net->ipv6.ip6mr_notifier_ops);
+ net->ipv6.ip6mr_notifier_ops = NULL;
+}
+
+/* Setup for IP multicast routing */
+static int __net_init ip6mr_net_init(struct net *net)
+{
+ int err;
+
+ err = ip6mr_notifier_init(net);
+ if (err)
+ return err;
+
+ err = ip6mr_rules_init(net);
+ if (err < 0)
+ goto ip6mr_rules_fail;
+
+#ifdef CONFIG_PROC_FS
+ err = -ENOMEM;
+ if (!proc_create_net("ip6_mr_vif", 0, net->proc_net, &ip6mr_vif_seq_ops,
+ sizeof(struct mr_vif_iter)))
+ goto proc_vif_fail;
+ if (!proc_create_net("ip6_mr_cache", 0, net->proc_net, &ipmr_mfc_seq_ops,
+ sizeof(struct mr_mfc_iter)))
+ goto proc_cache_fail;
+#endif
+
+ return 0;
+
+#ifdef CONFIG_PROC_FS
+proc_cache_fail:
+ remove_proc_entry("ip6_mr_vif", net->proc_net);
+proc_vif_fail:
+ rtnl_lock();
+ ip6mr_rules_exit(net);
+ rtnl_unlock();
+#endif
+ip6mr_rules_fail:
+ ip6mr_notifier_exit(net);
+ return err;
+}
+
+static void __net_exit ip6mr_net_exit(struct net *net)
+{
+#ifdef CONFIG_PROC_FS
+ remove_proc_entry("ip6_mr_cache", net->proc_net);
+ remove_proc_entry("ip6_mr_vif", net->proc_net);
+#endif
+ ip6mr_notifier_exit(net);
+}
+
+static void __net_exit ip6mr_net_exit_batch(struct list_head *net_list)
+{
+ struct net *net;
+
+ rtnl_lock();
+ list_for_each_entry(net, net_list, exit_list)
+ ip6mr_rules_exit(net);
+ rtnl_unlock();
+}
+
+static struct pernet_operations ip6mr_net_ops = {
+ .init = ip6mr_net_init,
+ .exit = ip6mr_net_exit,
+ .exit_batch = ip6mr_net_exit_batch,
+};
+
+static const struct rtnl_msg_handler ip6mr_rtnl_msg_handlers[] __initconst_or_module = {
+ {.owner = THIS_MODULE, .protocol = RTNL_FAMILY_IP6MR,
+ .msgtype = RTM_GETROUTE,
+ .doit = ip6mr_rtm_getroute, .dumpit = ip6mr_rtm_dumproute},
+};
+
+int __init ip6_mr_init(void)
+{
+ int err;
+
+ mrt_cachep = KMEM_CACHE(mfc6_cache, SLAB_HWCACHE_ALIGN);
+ if (!mrt_cachep)
+ return -ENOMEM;
+
+ err = register_pernet_subsys(&ip6mr_net_ops);
+ if (err)
+ goto reg_pernet_fail;
+
+ err = register_netdevice_notifier(&ip6_mr_notifier);
+ if (err)
+ goto reg_notif_fail;
+#ifdef CONFIG_IPV6_PIMSM_V2
+ if (inet6_add_protocol(&pim6_protocol, IPPROTO_PIM) < 0) {
+ pr_err("%s: can't add PIM protocol\n", __func__);
+ err = -EAGAIN;
+ goto add_proto_fail;
+ }
+#endif
+ err = rtnl_register_many(ip6mr_rtnl_msg_handlers);
+ if (!err)
+ return 0;
+
+#ifdef CONFIG_IPV6_PIMSM_V2
+ inet6_del_protocol(&pim6_protocol, IPPROTO_PIM);
+add_proto_fail:
+ unregister_netdevice_notifier(&ip6_mr_notifier);
+#endif
+reg_notif_fail:
+ unregister_pernet_subsys(&ip6mr_net_ops);
+reg_pernet_fail:
+ kmem_cache_destroy(mrt_cachep);
+ return err;
+}
+
+void __init ip6_mr_cleanup(void)
+{
+ rtnl_unregister_many(ip6mr_rtnl_msg_handlers);
+#ifdef CONFIG_IPV6_PIMSM_V2
+ inet6_del_protocol(&pim6_protocol, IPPROTO_PIM);
+#endif
+ unregister_netdevice_notifier(&ip6_mr_notifier);
+ unregister_pernet_subsys(&ip6mr_net_ops);
+ kmem_cache_destroy(mrt_cachep);
+}
+
+static int ip6mr_mfc_add(struct net *net, struct mr_table *mrt,
+ struct mf6cctl *mfc, int mrtsock, int parent)
+{
+ unsigned char ttls[MAXMIFS];
+ struct mfc6_cache *uc, *c;
+ struct mr_mfc *_uc;
+ bool found;
+ int i, err;
+
+ if (mfc->mf6cc_parent >= MAXMIFS)
+ return -ENFILE;
+
+ memset(ttls, 255, MAXMIFS);
+ for (i = 0; i < MAXMIFS; i++) {
+ if (IF_ISSET(i, &mfc->mf6cc_ifset))
+ ttls[i] = 1;
+ }
+
+ /* The entries are added/deleted only under RTNL */
+ rcu_read_lock();
+ c = ip6mr_cache_find_parent(mrt, &mfc->mf6cc_origin.sin6_addr,
+ &mfc->mf6cc_mcastgrp.sin6_addr, parent);
+ rcu_read_unlock();
+ if (c) {
+ spin_lock(&mrt_lock);
+ c->_c.mfc_parent = mfc->mf6cc_parent;
+ ip6mr_update_thresholds(mrt, &c->_c, ttls);
+ if (!mrtsock)
+ c->_c.mfc_flags |= MFC_STATIC;
+ spin_unlock(&mrt_lock);
+ call_ip6mr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE,
+ c, mrt->id);
+ mr6_netlink_event(mrt, c, RTM_NEWROUTE);
+ return 0;
+ }
+
+ if (!ipv6_addr_any(&mfc->mf6cc_mcastgrp.sin6_addr) &&
+ !ipv6_addr_is_multicast(&mfc->mf6cc_mcastgrp.sin6_addr))
+ return -EINVAL;
+
+ c = ip6mr_cache_alloc();
+ if (!c)
+ return -ENOMEM;
+
+ c->mf6c_origin = mfc->mf6cc_origin.sin6_addr;
+ c->mf6c_mcastgrp = mfc->mf6cc_mcastgrp.sin6_addr;
+ c->_c.mfc_parent = mfc->mf6cc_parent;
+ ip6mr_update_thresholds(mrt, &c->_c, ttls);
+ if (!mrtsock)
+ c->_c.mfc_flags |= MFC_STATIC;
+
+ err = rhltable_insert_key(&mrt->mfc_hash, &c->cmparg, &c->_c.mnode,
+ ip6mr_rht_params);
+ if (err) {
+ pr_err("ip6mr: rhtable insert error %d\n", err);
+ ip6mr_cache_free(c);
+ return err;
+ }
+ list_add_tail_rcu(&c->_c.list, &mrt->mfc_cache_list);
+
+ /* Check to see if we resolved a queued list. If so we
+ * need to send on the frames and tidy up.
+ */
+ found = false;
+ spin_lock_bh(&mfc_unres_lock);
+ list_for_each_entry(_uc, &mrt->mfc_unres_queue, list) {
+ uc = (struct mfc6_cache *)_uc;
+ if (ipv6_addr_equal(&uc->mf6c_origin, &c->mf6c_origin) &&
+ ipv6_addr_equal(&uc->mf6c_mcastgrp, &c->mf6c_mcastgrp)) {
+ list_del(&_uc->list);
+ atomic_dec(&mrt->cache_resolve_queue_len);
+ found = true;
+ break;
+ }
+ }
+ if (list_empty(&mrt->mfc_unres_queue))
+ timer_delete(&mrt->ipmr_expire_timer);
+ spin_unlock_bh(&mfc_unres_lock);
+
+ if (found) {
+ ip6mr_cache_resolve(net, mrt, uc, c);
+ ip6mr_cache_free(uc);
+ }
+ call_ip6mr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_ADD,
+ c, mrt->id);
+ mr6_netlink_event(mrt, c, RTM_NEWROUTE);
+ return 0;
+}
+
+/*
+ * Close the multicast socket, and clear the vif tables etc
+ */
+
+static void mroute_clean_tables(struct mr_table *mrt, int flags)
+{
+ struct mr_mfc *c, *tmp;
+ LIST_HEAD(list);
+ int i;
+
+ /* Shut down all active vif entries */
+ if (flags & (MRT6_FLUSH_MIFS | MRT6_FLUSH_MIFS_STATIC)) {
+ for (i = 0; i < mrt->maxvif; i++) {
+ if (((mrt->vif_table[i].flags & VIFF_STATIC) &&
+ !(flags & MRT6_FLUSH_MIFS_STATIC)) ||
+ (!(mrt->vif_table[i].flags & VIFF_STATIC) && !(flags & MRT6_FLUSH_MIFS)))
+ continue;
+ mif6_delete(mrt, i, 0, &list);
+ }
+ unregister_netdevice_many(&list);
+ }
+
+ /* Wipe the cache */
+ if (flags & (MRT6_FLUSH_MFC | MRT6_FLUSH_MFC_STATIC)) {
+ list_for_each_entry_safe(c, tmp, &mrt->mfc_cache_list, list) {
+ if (((c->mfc_flags & MFC_STATIC) && !(flags & MRT6_FLUSH_MFC_STATIC)) ||
+ (!(c->mfc_flags & MFC_STATIC) && !(flags & MRT6_FLUSH_MFC)))
+ continue;
+ rhltable_remove(&mrt->mfc_hash, &c->mnode, ip6mr_rht_params);
+ list_del_rcu(&c->list);
+ call_ip6mr_mfc_entry_notifiers(read_pnet(&mrt->net),
+ FIB_EVENT_ENTRY_DEL,
+ (struct mfc6_cache *)c, mrt->id);
+ mr6_netlink_event(mrt, (struct mfc6_cache *)c, RTM_DELROUTE);
+ mr_cache_put(c);
+ }
+ }
+
+ if (flags & MRT6_FLUSH_MFC) {
+ if (atomic_read(&mrt->cache_resolve_queue_len) != 0) {
+ spin_lock_bh(&mfc_unres_lock);
+ list_for_each_entry_safe(c, tmp, &mrt->mfc_unres_queue, list) {
+ list_del(&c->list);
+ mr6_netlink_event(mrt, (struct mfc6_cache *)c,
+ RTM_DELROUTE);
+ ip6mr_destroy_unres(mrt, (struct mfc6_cache *)c);
+ }
+ spin_unlock_bh(&mfc_unres_lock);
+ }
+ }
+}
+
+static int ip6mr_sk_init(struct mr_table *mrt, struct sock *sk)
+{
+ int err = 0;
+ struct net *net = sock_net(sk);
+
+ rtnl_lock();
+ spin_lock(&mrt_lock);
+ if (rtnl_dereference(mrt->mroute_sk)) {
+ err = -EADDRINUSE;
+ } else {
+ rcu_assign_pointer(mrt->mroute_sk, sk);
+ sock_set_flag(sk, SOCK_RCU_FREE);
+ atomic_inc(&net->ipv6.devconf_all->mc_forwarding);
+ }
+ spin_unlock(&mrt_lock);
+
+ if (!err)
+ inet6_netconf_notify_devconf(net, RTM_NEWNETCONF,
+ NETCONFA_MC_FORWARDING,
+ NETCONFA_IFINDEX_ALL,
+ net->ipv6.devconf_all);
+ rtnl_unlock();
+
+ return err;
+}
+
+int ip6mr_sk_done(struct sock *sk)
+{
+ struct net *net = sock_net(sk);
+ struct ipv6_devconf *devconf;
+ struct mr_table *mrt;
+ int err = -EACCES;
+
+ if (sk->sk_type != SOCK_RAW ||
+ inet_sk(sk)->inet_num != IPPROTO_ICMPV6)
+ return err;
+
+ devconf = net->ipv6.devconf_all;
+ if (!devconf || !atomic_read(&devconf->mc_forwarding))
+ return err;
+
+ rtnl_lock();
+ ip6mr_for_each_table(mrt, net) {
+ if (sk == rtnl_dereference(mrt->mroute_sk)) {
+ spin_lock(&mrt_lock);
+ RCU_INIT_POINTER(mrt->mroute_sk, NULL);
+ /* Note that mroute_sk had SOCK_RCU_FREE set,
+ * so the RCU grace period before sk freeing
+ * is guaranteed by sk_destruct()
+ */
+ atomic_dec(&devconf->mc_forwarding);
+ spin_unlock(&mrt_lock);
+ inet6_netconf_notify_devconf(net, RTM_NEWNETCONF,
+ NETCONFA_MC_FORWARDING,
+ NETCONFA_IFINDEX_ALL,
+ net->ipv6.devconf_all);
+
+ mroute_clean_tables(mrt, MRT6_FLUSH_MIFS | MRT6_FLUSH_MFC);
+ err = 0;
+ break;
+ }
+ }
+ rtnl_unlock();
+
+ return err;
+}
+
+bool mroute6_is_socket(struct net *net, struct sk_buff *skb)
+{
+ struct mr_table *mrt;
+ struct flowi6 fl6 = {
+ .flowi6_iif = skb->skb_iif ? : LOOPBACK_IFINDEX,
+ .flowi6_oif = skb->dev->ifindex,
+ .flowi6_mark = skb->mark,
+ };
+
+ if (ip6mr_fib_lookup(net, &fl6, &mrt) < 0)
+ return NULL;
+
+ return rcu_access_pointer(mrt->mroute_sk);
+}
+EXPORT_SYMBOL(mroute6_is_socket);
+
+/*
+ * Socket options and virtual interface manipulation. The whole
+ * virtual interface system is a complete heap, but unfortunately
+ * that's how BSD mrouted happens to think. Maybe one day with a proper
+ * MOSPF/PIM router set up we can clean this up.
+ */
+
+int ip6_mroute_setsockopt(struct sock *sk, int optname, sockptr_t optval,
+ unsigned int optlen)
+{
+ int ret, parent = 0;
+ struct mif6ctl vif;
+ struct mf6cctl mfc;
+ mifi_t mifi;
+ struct net *net = sock_net(sk);
+ struct mr_table *mrt;
+
+ if (sk->sk_type != SOCK_RAW ||
+ inet_sk(sk)->inet_num != IPPROTO_ICMPV6)
+ return -EOPNOTSUPP;
+
+ mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT);
+ if (!mrt)
+ return -ENOENT;
+
+ if (optname != MRT6_INIT) {
+ if (sk != rcu_access_pointer(mrt->mroute_sk) &&
+ !ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EACCES;
+ }
+
+ switch (optname) {
+ case MRT6_INIT:
+ if (optlen < sizeof(int))
+ return -EINVAL;
+
+ return ip6mr_sk_init(mrt, sk);
+
+ case MRT6_DONE:
+ return ip6mr_sk_done(sk);
+
+ case MRT6_ADD_MIF:
+ if (optlen < sizeof(vif))
+ return -EINVAL;
+ if (copy_from_sockptr(&vif, optval, sizeof(vif)))
+ return -EFAULT;
+ if (vif.mif6c_mifi >= MAXMIFS)
+ return -ENFILE;
+ rtnl_lock();
+ ret = mif6_add(net, mrt, &vif,
+ sk == rtnl_dereference(mrt->mroute_sk));
+ rtnl_unlock();
+ return ret;
+
+ case MRT6_DEL_MIF:
+ if (optlen < sizeof(mifi_t))
+ return -EINVAL;
+ if (copy_from_sockptr(&mifi, optval, sizeof(mifi_t)))
+ return -EFAULT;
+ rtnl_lock();
+ ret = mif6_delete(mrt, mifi, 0, NULL);
+ rtnl_unlock();
+ return ret;
+
+ /*
+ * Manipulate the forwarding caches. These live
+ * in a sort of kernel/user symbiosis.
+ */
+ case MRT6_ADD_MFC:
+ case MRT6_DEL_MFC:
+ parent = -1;
+ fallthrough;
+ case MRT6_ADD_MFC_PROXY:
+ case MRT6_DEL_MFC_PROXY:
+ if (optlen < sizeof(mfc))
+ return -EINVAL;
+ if (copy_from_sockptr(&mfc, optval, sizeof(mfc)))
+ return -EFAULT;
+ if (parent == 0)
+ parent = mfc.mf6cc_parent;
+ rtnl_lock();
+ if (optname == MRT6_DEL_MFC || optname == MRT6_DEL_MFC_PROXY)
+ ret = ip6mr_mfc_delete(mrt, &mfc, parent);
+ else
+ ret = ip6mr_mfc_add(net, mrt, &mfc,
+ sk ==
+ rtnl_dereference(mrt->mroute_sk),
+ parent);
+ rtnl_unlock();
+ return ret;
+
+ case MRT6_FLUSH:
+ {
+ int flags;
+
+ if (optlen != sizeof(flags))
+ return -EINVAL;
+ if (copy_from_sockptr(&flags, optval, sizeof(flags)))
+ return -EFAULT;
+ rtnl_lock();
+ mroute_clean_tables(mrt, flags);
+ rtnl_unlock();
+ return 0;
+ }
+
+ /*
+ * Control PIM assert (to activate pim will activate assert)
+ */
+ case MRT6_ASSERT:
+ {
+ int v;
+
+ if (optlen != sizeof(v))
+ return -EINVAL;
+ if (copy_from_sockptr(&v, optval, sizeof(v)))
+ return -EFAULT;
+ mrt->mroute_do_assert = v;
+ return 0;
+ }
+
+#ifdef CONFIG_IPV6_PIMSM_V2
+ case MRT6_PIM:
+ {
+ bool do_wrmifwhole;
+ int v;
+
+ if (optlen != sizeof(v))
+ return -EINVAL;
+ if (copy_from_sockptr(&v, optval, sizeof(v)))
+ return -EFAULT;
+
+ do_wrmifwhole = (v == MRT6MSG_WRMIFWHOLE);
+ v = !!v;
+ rtnl_lock();
+ ret = 0;
+ if (v != mrt->mroute_do_pim) {
+ mrt->mroute_do_pim = v;
+ mrt->mroute_do_assert = v;
+ mrt->mroute_do_wrvifwhole = do_wrmifwhole;
+ }
+ rtnl_unlock();
+ return ret;
+ }
+
+#endif
+#ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES
+ case MRT6_TABLE:
+ {
+ u32 v;
+
+ if (optlen != sizeof(u32))
+ return -EINVAL;
+ if (copy_from_sockptr(&v, optval, sizeof(v)))
+ return -EFAULT;
+ /* "pim6reg%u" should not exceed 16 bytes (IFNAMSIZ) */
+ if (v != RT_TABLE_DEFAULT && v >= 100000000)
+ return -EINVAL;
+ if (sk == rcu_access_pointer(mrt->mroute_sk))
+ return -EBUSY;
+
+ rtnl_lock();
+ ret = 0;
+ mrt = ip6mr_new_table(net, v);
+ if (IS_ERR(mrt))
+ ret = PTR_ERR(mrt);
+ else
+ raw6_sk(sk)->ip6mr_table = v;
+ rtnl_unlock();
+ return ret;
+ }
+#endif
+ /*
+ * Spurious command, or MRT6_VERSION which you cannot
+ * set.
+ */
+ default:
+ return -ENOPROTOOPT;
+ }
+}
+
+/*
+ * Getsock opt support for the multicast routing system.
+ */
+
+int ip6_mroute_getsockopt(struct sock *sk, int optname, sockptr_t optval,
+ sockptr_t optlen)
+{
+ int olr;
+ int val;
+ struct net *net = sock_net(sk);
+ struct mr_table *mrt;
+
+ if (sk->sk_type != SOCK_RAW ||
+ inet_sk(sk)->inet_num != IPPROTO_ICMPV6)
+ return -EOPNOTSUPP;
+
+ mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT);
+ if (!mrt)
+ return -ENOENT;
+
+ switch (optname) {
+ case MRT6_VERSION:
+ val = 0x0305;
+ break;
+#ifdef CONFIG_IPV6_PIMSM_V2
+ case MRT6_PIM:
+ val = mrt->mroute_do_pim;
+ break;
+#endif
+ case MRT6_ASSERT:
+ val = mrt->mroute_do_assert;
+ break;
+ default:
+ return -ENOPROTOOPT;
+ }
+
+ if (copy_from_sockptr(&olr, optlen, sizeof(int)))
+ return -EFAULT;
+
+ olr = min_t(int, olr, sizeof(int));
+ if (olr < 0)
+ return -EINVAL;
+
+ if (copy_to_sockptr(optlen, &olr, sizeof(int)))
+ return -EFAULT;
+ if (copy_to_sockptr(optval, &val, olr))
+ return -EFAULT;
+ return 0;
+}
+
+/*
+ * The IP multicast ioctl support routines.
+ */
+int ip6mr_ioctl(struct sock *sk, int cmd, void *arg)
+{
+ struct sioc_sg_req6 *sr;
+ struct sioc_mif_req6 *vr;
+ struct vif_device *vif;
+ struct mfc6_cache *c;
+ struct net *net = sock_net(sk);
+ struct mr_table *mrt;
+
+ mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT);
+ if (!mrt)
+ return -ENOENT;
+
+ switch (cmd) {
+ case SIOCGETMIFCNT_IN6:
+ vr = (struct sioc_mif_req6 *)arg;
+ if (vr->mifi >= mrt->maxvif)
+ return -EINVAL;
+ vr->mifi = array_index_nospec(vr->mifi, mrt->maxvif);
+ rcu_read_lock();
+ vif = &mrt->vif_table[vr->mifi];
+ if (VIF_EXISTS(mrt, vr->mifi)) {
+ vr->icount = READ_ONCE(vif->pkt_in);
+ vr->ocount = READ_ONCE(vif->pkt_out);
+ vr->ibytes = READ_ONCE(vif->bytes_in);
+ vr->obytes = READ_ONCE(vif->bytes_out);
+ rcu_read_unlock();
+ return 0;
+ }
+ rcu_read_unlock();
+ return -EADDRNOTAVAIL;
+ case SIOCGETSGCNT_IN6:
+ sr = (struct sioc_sg_req6 *)arg;
+
+ rcu_read_lock();
+ c = ip6mr_cache_find(mrt, &sr->src.sin6_addr,
+ &sr->grp.sin6_addr);
+ if (c) {
+ sr->pktcnt = atomic_long_read(&c->_c.mfc_un.res.pkt);
+ sr->bytecnt = atomic_long_read(&c->_c.mfc_un.res.bytes);
+ sr->wrong_if = atomic_long_read(&c->_c.mfc_un.res.wrong_if);
+ rcu_read_unlock();
+ return 0;
+ }
+ rcu_read_unlock();
+ return -EADDRNOTAVAIL;
+ default:
+ return -ENOIOCTLCMD;
+ }
+}
+
+#ifdef CONFIG_COMPAT
+struct compat_sioc_sg_req6 {
+ struct sockaddr_in6 src;
+ struct sockaddr_in6 grp;
+ compat_ulong_t pktcnt;
+ compat_ulong_t bytecnt;
+ compat_ulong_t wrong_if;
+};
+
+struct compat_sioc_mif_req6 {
+ mifi_t mifi;
+ compat_ulong_t icount;
+ compat_ulong_t ocount;
+ compat_ulong_t ibytes;
+ compat_ulong_t obytes;
+};
+
+int ip6mr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
+{
+ struct compat_sioc_sg_req6 sr;
+ struct compat_sioc_mif_req6 vr;
+ struct vif_device *vif;
+ struct mfc6_cache *c;
+ struct net *net = sock_net(sk);
+ struct mr_table *mrt;
+
+ mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT);
+ if (!mrt)
+ return -ENOENT;
+
+ switch (cmd) {
+ case SIOCGETMIFCNT_IN6:
+ if (copy_from_user(&vr, arg, sizeof(vr)))
+ return -EFAULT;
+ if (vr.mifi >= mrt->maxvif)
+ return -EINVAL;
+ vr.mifi = array_index_nospec(vr.mifi, mrt->maxvif);
+ rcu_read_lock();
+ vif = &mrt->vif_table[vr.mifi];
+ if (VIF_EXISTS(mrt, vr.mifi)) {
+ vr.icount = READ_ONCE(vif->pkt_in);
+ vr.ocount = READ_ONCE(vif->pkt_out);
+ vr.ibytes = READ_ONCE(vif->bytes_in);
+ vr.obytes = READ_ONCE(vif->bytes_out);
+ rcu_read_unlock();
+
+ if (copy_to_user(arg, &vr, sizeof(vr)))
+ return -EFAULT;
+ return 0;
+ }
+ rcu_read_unlock();
+ return -EADDRNOTAVAIL;
+ case SIOCGETSGCNT_IN6:
+ if (copy_from_user(&sr, arg, sizeof(sr)))
+ return -EFAULT;
+
+ rcu_read_lock();
+ c = ip6mr_cache_find(mrt, &sr.src.sin6_addr, &sr.grp.sin6_addr);
+ if (c) {
+ sr.pktcnt = atomic_long_read(&c->_c.mfc_un.res.pkt);
+ sr.bytecnt = atomic_long_read(&c->_c.mfc_un.res.bytes);
+ sr.wrong_if = atomic_long_read(&c->_c.mfc_un.res.wrong_if);
+ rcu_read_unlock();
+
+ if (copy_to_user(arg, &sr, sizeof(sr)))
+ return -EFAULT;
+ return 0;
+ }
+ rcu_read_unlock();
+ return -EADDRNOTAVAIL;
+ default:
+ return -ENOIOCTLCMD;
+ }
+}
+#endif
+
+static inline int ip6mr_forward2_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
+ IPSTATS_MIB_OUTFORWDATAGRAMS);
+ return dst_output(net, sk, skb);
+}
+
+/*
+ * Processing handlers for ip6mr_forward
+ */
+
+static int ip6mr_prepare_xmit(struct net *net, struct mr_table *mrt,
+ struct sk_buff *skb, int vifi)
+{
+ struct vif_device *vif = &mrt->vif_table[vifi];
+ struct net_device *vif_dev;
+ struct ipv6hdr *ipv6h;
+ struct dst_entry *dst;
+ struct flowi6 fl6;
+
+ vif_dev = vif_dev_read(vif);
+ if (!vif_dev)
+ return -1;
+
+#ifdef CONFIG_IPV6_PIMSM_V2
+ if (vif->flags & MIFF_REGISTER) {
+ WRITE_ONCE(vif->pkt_out, vif->pkt_out + 1);
+ WRITE_ONCE(vif->bytes_out, vif->bytes_out + skb->len);
+ DEV_STATS_ADD(vif_dev, tx_bytes, skb->len);
+ DEV_STATS_INC(vif_dev, tx_packets);
+ ip6mr_cache_report(mrt, skb, vifi, MRT6MSG_WHOLEPKT);
+ return -1;
+ }
+#endif
+
+ ipv6h = ipv6_hdr(skb);
+
+ fl6 = (struct flowi6) {
+ .flowi6_oif = vif->link,
+ .daddr = ipv6h->daddr,
+ };
+
+ dst = ip6_route_output(net, NULL, &fl6);
+ if (dst->error) {
+ dst_release(dst);
+ return -1;
+ }
+
+ skb_dst_drop(skb);
+ skb_dst_set(skb, dst);
+
+ /*
+ * RFC1584 teaches, that DVMRP/PIM router must deliver packets locally
+ * not only before forwarding, but after forwarding on all output
+ * interfaces. It is clear, if mrouter runs a multicasting
+ * program, it should receive packets not depending to what interface
+ * program is joined.
+ * If we will not make it, the program will have to join on all
+ * interfaces. On the other hand, multihoming host (or router, but
+ * not mrouter) cannot join to more than one interface - it will
+ * result in receiving multiple packets.
+ */
+ skb->dev = vif_dev;
+ WRITE_ONCE(vif->pkt_out, vif->pkt_out + 1);
+ WRITE_ONCE(vif->bytes_out, vif->bytes_out + skb->len);
+
+ /* We are about to write */
+ /* XXX: extension headers? */
+ if (skb_cow(skb, sizeof(*ipv6h) + LL_RESERVED_SPACE(vif_dev)))
+ return -1;
+
+ ipv6h = ipv6_hdr(skb);
+ ipv6h->hop_limit--;
+ return 0;
+}
+
+static void ip6mr_forward2(struct net *net, struct mr_table *mrt,
+ struct sk_buff *skb, int vifi)
+{
+ struct net_device *indev = skb->dev;
+
+ if (ip6mr_prepare_xmit(net, mrt, skb, vifi))
+ goto out_free;
+
+ IP6CB(skb)->flags |= IP6SKB_FORWARDED;
+
+ NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
+ net, NULL, skb, indev, skb->dev,
+ ip6mr_forward2_finish);
+ return;
+
+out_free:
+ kfree_skb(skb);
+}
+
+static void ip6mr_output2(struct net *net, struct mr_table *mrt,
+ struct sk_buff *skb, int vifi)
+{
+ if (ip6mr_prepare_xmit(net, mrt, skb, vifi))
+ goto out_free;
+
+ ip6_output(net, NULL, skb);
+ return;
+
+out_free:
+ kfree_skb(skb);
+}
+
+/* Called with rcu_read_lock() */
+static int ip6mr_find_vif(struct mr_table *mrt, struct net_device *dev)
+{
+ int ct;
+
+ /* Pairs with WRITE_ONCE() in mif6_delete()/mif6_add() */
+ for (ct = READ_ONCE(mrt->maxvif) - 1; ct >= 0; ct--) {
+ if (rcu_access_pointer(mrt->vif_table[ct].dev) == dev)
+ break;
+ }
+ return ct;
+}
+
+/* Called under rcu_read_lock() */
+static void ip6_mr_forward(struct net *net, struct mr_table *mrt,
+ struct net_device *dev, struct sk_buff *skb,
+ struct mfc6_cache *c)
+{
+ int psend = -1;
+ int vif, ct;
+ int true_vifi = ip6mr_find_vif(mrt, dev);
+
+ vif = c->_c.mfc_parent;
+ atomic_long_inc(&c->_c.mfc_un.res.pkt);
+ atomic_long_add(skb->len, &c->_c.mfc_un.res.bytes);
+ WRITE_ONCE(c->_c.mfc_un.res.lastuse, jiffies);
+
+ if (ipv6_addr_any(&c->mf6c_origin) && true_vifi >= 0) {
+ struct mfc6_cache *cache_proxy;
+
+ /* For an (*,G) entry, we only check that the incoming
+ * interface is part of the static tree.
+ */
+ cache_proxy = mr_mfc_find_any_parent(mrt, vif);
+ if (cache_proxy &&
+ cache_proxy->_c.mfc_un.res.ttls[true_vifi] < 255)
+ goto forward;
+ }
+
+ /*
+ * Wrong interface: drop packet and (maybe) send PIM assert.
+ */
+ if (rcu_access_pointer(mrt->vif_table[vif].dev) != dev) {
+ atomic_long_inc(&c->_c.mfc_un.res.wrong_if);
+
+ if (true_vifi >= 0 && mrt->mroute_do_assert &&
+ /* pimsm uses asserts, when switching from RPT to SPT,
+ so that we cannot check that packet arrived on an oif.
+ It is bad, but otherwise we would need to move pretty
+ large chunk of pimd to kernel. Ough... --ANK
+ */
+ (mrt->mroute_do_pim ||
+ c->_c.mfc_un.res.ttls[true_vifi] < 255) &&
+ time_after(jiffies,
+ c->_c.mfc_un.res.last_assert +
+ MFC_ASSERT_THRESH)) {
+ c->_c.mfc_un.res.last_assert = jiffies;
+ ip6mr_cache_report(mrt, skb, true_vifi, MRT6MSG_WRONGMIF);
+ if (mrt->mroute_do_wrvifwhole)
+ ip6mr_cache_report(mrt, skb, true_vifi,
+ MRT6MSG_WRMIFWHOLE);
+ }
+ goto dont_forward;
+ }
+
+forward:
+ WRITE_ONCE(mrt->vif_table[vif].pkt_in,
+ mrt->vif_table[vif].pkt_in + 1);
+ WRITE_ONCE(mrt->vif_table[vif].bytes_in,
+ mrt->vif_table[vif].bytes_in + skb->len);
+
+ /*
+ * Forward the frame
+ */
+ if (ipv6_addr_any(&c->mf6c_origin) &&
+ ipv6_addr_any(&c->mf6c_mcastgrp)) {
+ if (true_vifi >= 0 &&
+ true_vifi != c->_c.mfc_parent &&
+ ipv6_hdr(skb)->hop_limit >
+ c->_c.mfc_un.res.ttls[c->_c.mfc_parent]) {
+ /* It's an (*,*) entry and the packet is not coming from
+ * the upstream: forward the packet to the upstream
+ * only.
+ */
+ psend = c->_c.mfc_parent;
+ goto last_forward;
+ }
+ goto dont_forward;
+ }
+ for (ct = c->_c.mfc_un.res.maxvif - 1;
+ ct >= c->_c.mfc_un.res.minvif; ct--) {
+ /* For (*,G) entry, don't forward to the incoming interface */
+ if ((!ipv6_addr_any(&c->mf6c_origin) || ct != true_vifi) &&
+ ipv6_hdr(skb)->hop_limit > c->_c.mfc_un.res.ttls[ct]) {
+ if (psend != -1) {
+ struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
+ if (skb2)
+ ip6mr_forward2(net, mrt, skb2, psend);
+ }
+ psend = ct;
+ }
+ }
+last_forward:
+ if (psend != -1) {
+ ip6mr_forward2(net, mrt, skb, psend);
+ return;
+ }
+
+dont_forward:
+ kfree_skb(skb);
+}
+
+/* Called under rcu_read_lock() */
+static void ip6_mr_output_finish(struct net *net, struct mr_table *mrt,
+ struct net_device *dev, struct sk_buff *skb,
+ struct mfc6_cache *c)
+{
+ int psend = -1;
+ int ct;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ atomic_long_inc(&c->_c.mfc_un.res.pkt);
+ atomic_long_add(skb->len, &c->_c.mfc_un.res.bytes);
+ WRITE_ONCE(c->_c.mfc_un.res.lastuse, jiffies);
+
+ /* Forward the frame */
+ if (ipv6_addr_any(&c->mf6c_origin) &&
+ ipv6_addr_any(&c->mf6c_mcastgrp)) {
+ if (ipv6_hdr(skb)->hop_limit >
+ c->_c.mfc_un.res.ttls[c->_c.mfc_parent]) {
+ /* It's an (*,*) entry and the packet is not coming from
+ * the upstream: forward the packet to the upstream
+ * only.
+ */
+ psend = c->_c.mfc_parent;
+ goto last_forward;
+ }
+ goto dont_forward;
+ }
+ for (ct = c->_c.mfc_un.res.maxvif - 1;
+ ct >= c->_c.mfc_un.res.minvif; ct--) {
+ if (ipv6_hdr(skb)->hop_limit > c->_c.mfc_un.res.ttls[ct]) {
+ if (psend != -1) {
+ struct sk_buff *skb2;
+
+ skb2 = skb_clone(skb, GFP_ATOMIC);
+ if (skb2)
+ ip6mr_output2(net, mrt, skb2, psend);
+ }
+ psend = ct;
+ }
+ }
+last_forward:
+ if (psend != -1) {
+ ip6mr_output2(net, mrt, skb, psend);
+ return;
+ }
+
+dont_forward:
+ kfree_skb(skb);
+}
+
+/*
+ * Multicast packets for forwarding arrive here
+ */
+
+int ip6_mr_input(struct sk_buff *skb)
+{
+ struct net_device *dev = skb->dev;
+ struct net *net = dev_net_rcu(dev);
+ struct mfc6_cache *cache;
+ struct mr_table *mrt;
+ struct flowi6 fl6 = {
+ .flowi6_iif = dev->ifindex,
+ .flowi6_mark = skb->mark,
+ };
+ int err;
+
+ /* skb->dev passed in is the master dev for vrfs.
+ * Get the proper interface that does have a vif associated with it.
+ */
+ if (netif_is_l3_master(dev)) {
+ dev = dev_get_by_index_rcu(net, IPCB(skb)->iif);
+ if (!dev) {
+ kfree_skb(skb);
+ return -ENODEV;
+ }
+ }
+
+ err = ip6mr_fib_lookup(net, &fl6, &mrt);
+ if (err < 0) {
+ kfree_skb(skb);
+ return err;
+ }
+
+ cache = ip6mr_cache_find(mrt,
+ &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr);
+ if (!cache) {
+ int vif = ip6mr_find_vif(mrt, dev);
+
+ if (vif >= 0)
+ cache = ip6mr_cache_find_any(mrt,
+ &ipv6_hdr(skb)->daddr,
+ vif);
+ }
+
+ /*
+ * No usable cache entry
+ */
+ if (!cache) {
+ int vif;
+
+ vif = ip6mr_find_vif(mrt, dev);
+ if (vif >= 0) {
+ int err = ip6mr_cache_unresolved(mrt, vif, skb, dev);
+
+ return err;
+ }
+ kfree_skb(skb);
+ return -ENODEV;
+ }
+
+ ip6_mr_forward(net, mrt, dev, skb, cache);
+
+ return 0;
+}
+
+int ip6_mr_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ struct net_device *dev = skb_dst(skb)->dev;
+ struct flowi6 fl6 = (struct flowi6) {
+ .flowi6_iif = LOOPBACK_IFINDEX,
+ .flowi6_mark = skb->mark,
+ };
+ struct mfc6_cache *cache;
+ struct mr_table *mrt;
+ int err;
+ int vif;
+
+ guard(rcu)();
+
+ if (IP6CB(skb)->flags & IP6SKB_FORWARDED)
+ goto ip6_output;
+ if (!(IP6CB(skb)->flags & IP6SKB_MCROUTE))
+ goto ip6_output;
+
+ err = ip6mr_fib_lookup(net, &fl6, &mrt);
+ if (err < 0) {
+ kfree_skb(skb);
+ return err;
+ }
+
+ cache = ip6mr_cache_find(mrt,
+ &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr);
+ if (!cache) {
+ vif = ip6mr_find_vif(mrt, dev);
+ if (vif >= 0)
+ cache = ip6mr_cache_find_any(mrt,
+ &ipv6_hdr(skb)->daddr,
+ vif);
+ }
+
+ /* No usable cache entry */
+ if (!cache) {
+ vif = ip6mr_find_vif(mrt, dev);
+ if (vif >= 0)
+ return ip6mr_cache_unresolved(mrt, vif, skb, dev);
+ goto ip6_output;
+ }
+
+ /* Wrong interface */
+ vif = cache->_c.mfc_parent;
+ if (rcu_access_pointer(mrt->vif_table[vif].dev) != dev)
+ goto ip6_output;
+
+ ip6_mr_output_finish(net, mrt, dev, skb, cache);
+ return 0;
+
+ip6_output:
+ return ip6_output(net, sk, skb);
+}
+
+int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm,
+ u32 portid)
+{
+ int err;
+ struct mr_table *mrt;
+ struct mfc6_cache *cache;
+ struct rt6_info *rt = dst_rt6_info(skb_dst(skb));
+
+ rcu_read_lock();
+ mrt = __ip6mr_get_table(net, RT6_TABLE_DFLT);
+ if (!mrt) {
+ rcu_read_unlock();
+ return -ENOENT;
+ }
+
+ cache = ip6mr_cache_find(mrt, &rt->rt6i_src.addr, &rt->rt6i_dst.addr);
+ if (!cache && skb->dev) {
+ int vif = ip6mr_find_vif(mrt, skb->dev);
+
+ if (vif >= 0)
+ cache = ip6mr_cache_find_any(mrt, &rt->rt6i_dst.addr,
+ vif);
+ }
+
+ if (!cache) {
+ struct sk_buff *skb2;
+ struct ipv6hdr *iph;
+ struct net_device *dev;
+ int vif;
+
+ dev = skb->dev;
+ if (!dev || (vif = ip6mr_find_vif(mrt, dev)) < 0) {
+ rcu_read_unlock();
+ return -ENODEV;
+ }
+
+ /* really correct? */
+ skb2 = alloc_skb(sizeof(struct ipv6hdr), GFP_ATOMIC);
+ if (!skb2) {
+ rcu_read_unlock();
+ return -ENOMEM;
+ }
+
+ NETLINK_CB(skb2).portid = portid;
+ skb_reset_transport_header(skb2);
+
+ skb_put(skb2, sizeof(struct ipv6hdr));
+ skb_reset_network_header(skb2);
+
+ iph = ipv6_hdr(skb2);
+ iph->version = 0;
+ iph->priority = 0;
+ iph->flow_lbl[0] = 0;
+ iph->flow_lbl[1] = 0;
+ iph->flow_lbl[2] = 0;
+ iph->payload_len = 0;
+ iph->nexthdr = IPPROTO_NONE;
+ iph->hop_limit = 0;
+ iph->saddr = rt->rt6i_src.addr;
+ iph->daddr = rt->rt6i_dst.addr;
+
+ err = ip6mr_cache_unresolved(mrt, vif, skb2, dev);
+ rcu_read_unlock();
+
+ return err;
+ }
+
+ err = mr_fill_mroute(mrt, skb, &cache->_c, rtm);
+ rcu_read_unlock();
+ return err;
+}
+
+static int ip6mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
+ u32 portid, u32 seq, struct mfc6_cache *c, int cmd,
+ int flags)
+{
+ struct nlmsghdr *nlh;
+ struct rtmsg *rtm;
+ int err;
+
+ nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rtm), flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ rtm = nlmsg_data(nlh);
+ rtm->rtm_family = RTNL_FAMILY_IP6MR;
+ rtm->rtm_dst_len = 128;
+ rtm->rtm_src_len = 128;
+ rtm->rtm_tos = 0;
+ rtm->rtm_table = mrt->id;
+ if (nla_put_u32(skb, RTA_TABLE, mrt->id))
+ goto nla_put_failure;
+ rtm->rtm_type = RTN_MULTICAST;
+ rtm->rtm_scope = RT_SCOPE_UNIVERSE;
+ if (c->_c.mfc_flags & MFC_STATIC)
+ rtm->rtm_protocol = RTPROT_STATIC;
+ else
+ rtm->rtm_protocol = RTPROT_MROUTED;
+ rtm->rtm_flags = 0;
+
+ if (nla_put_in6_addr(skb, RTA_SRC, &c->mf6c_origin) ||
+ nla_put_in6_addr(skb, RTA_DST, &c->mf6c_mcastgrp))
+ goto nla_put_failure;
+ err = mr_fill_mroute(mrt, skb, &c->_c, rtm);
+ /* do not break the dump if cache is unresolved */
+ if (err < 0 && err != -ENOENT)
+ goto nla_put_failure;
+
+ nlmsg_end(skb, nlh);
+ return 0;
+
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+static int _ip6mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
+ u32 portid, u32 seq, struct mr_mfc *c,
+ int cmd, int flags)
+{
+ return ip6mr_fill_mroute(mrt, skb, portid, seq, (struct mfc6_cache *)c,
+ cmd, flags);
+}
+
+static int mr6_msgsize(bool unresolved, int maxvif)
+{
+ size_t len =
+ NLMSG_ALIGN(sizeof(struct rtmsg))
+ + nla_total_size(4) /* RTA_TABLE */
+ + nla_total_size(sizeof(struct in6_addr)) /* RTA_SRC */
+ + nla_total_size(sizeof(struct in6_addr)) /* RTA_DST */
+ ;
+
+ if (!unresolved)
+ len = len
+ + nla_total_size(4) /* RTA_IIF */
+ + nla_total_size(0) /* RTA_MULTIPATH */
+ + maxvif * NLA_ALIGN(sizeof(struct rtnexthop))
+ /* RTA_MFC_STATS */
+ + nla_total_size_64bit(sizeof(struct rta_mfc_stats))
+ ;
+
+ return len;
+}
+
+static void mr6_netlink_event(struct mr_table *mrt, struct mfc6_cache *mfc,
+ int cmd)
+{
+ struct net *net = read_pnet(&mrt->net);
+ struct sk_buff *skb;
+ int err = -ENOBUFS;
+
+ skb = nlmsg_new(mr6_msgsize(mfc->_c.mfc_parent >= MAXMIFS, mrt->maxvif),
+ GFP_ATOMIC);
+ if (!skb)
+ goto errout;
+
+ err = ip6mr_fill_mroute(mrt, skb, 0, 0, mfc, cmd, 0);
+ if (err < 0)
+ goto errout;
+
+ rtnl_notify(skb, net, 0, RTNLGRP_IPV6_MROUTE, NULL, GFP_ATOMIC);
+ return;
+
+errout:
+ kfree_skb(skb);
+ rtnl_set_sk_err(net, RTNLGRP_IPV6_MROUTE, err);
+}
+
+static size_t mrt6msg_netlink_msgsize(size_t payloadlen)
+{
+ size_t len =
+ NLMSG_ALIGN(sizeof(struct rtgenmsg))
+ + nla_total_size(1) /* IP6MRA_CREPORT_MSGTYPE */
+ + nla_total_size(4) /* IP6MRA_CREPORT_MIF_ID */
+ /* IP6MRA_CREPORT_SRC_ADDR */
+ + nla_total_size(sizeof(struct in6_addr))
+ /* IP6MRA_CREPORT_DST_ADDR */
+ + nla_total_size(sizeof(struct in6_addr))
+ /* IP6MRA_CREPORT_PKT */
+ + nla_total_size(payloadlen)
+ ;
+
+ return len;
+}
+
+static void mrt6msg_netlink_event(const struct mr_table *mrt, struct sk_buff *pkt)
+{
+ struct net *net = read_pnet(&mrt->net);
+ struct nlmsghdr *nlh;
+ struct rtgenmsg *rtgenm;
+ struct mrt6msg *msg;
+ struct sk_buff *skb;
+ struct nlattr *nla;
+ int payloadlen;
+
+ payloadlen = pkt->len - sizeof(struct mrt6msg);
+ msg = (struct mrt6msg *)skb_transport_header(pkt);
+
+ skb = nlmsg_new(mrt6msg_netlink_msgsize(payloadlen), GFP_ATOMIC);
+ if (!skb)
+ goto errout;
+
+ nlh = nlmsg_put(skb, 0, 0, RTM_NEWCACHEREPORT,
+ sizeof(struct rtgenmsg), 0);
+ if (!nlh)
+ goto errout;
+ rtgenm = nlmsg_data(nlh);
+ rtgenm->rtgen_family = RTNL_FAMILY_IP6MR;
+ if (nla_put_u8(skb, IP6MRA_CREPORT_MSGTYPE, msg->im6_msgtype) ||
+ nla_put_u32(skb, IP6MRA_CREPORT_MIF_ID, msg->im6_mif) ||
+ nla_put_in6_addr(skb, IP6MRA_CREPORT_SRC_ADDR,
+ &msg->im6_src) ||
+ nla_put_in6_addr(skb, IP6MRA_CREPORT_DST_ADDR,
+ &msg->im6_dst))
+ goto nla_put_failure;
+
+ nla = nla_reserve(skb, IP6MRA_CREPORT_PKT, payloadlen);
+ if (!nla || skb_copy_bits(pkt, sizeof(struct mrt6msg),
+ nla_data(nla), payloadlen))
+ goto nla_put_failure;
+
+ nlmsg_end(skb, nlh);
+
+ rtnl_notify(skb, net, 0, RTNLGRP_IPV6_MROUTE_R, NULL, GFP_ATOMIC);
+ return;
+
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+errout:
+ kfree_skb(skb);
+ rtnl_set_sk_err(net, RTNLGRP_IPV6_MROUTE_R, -ENOBUFS);
+}
+
+static const struct nla_policy ip6mr_getroute_policy[RTA_MAX + 1] = {
+ [RTA_SRC] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)),
+ [RTA_DST] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)),
+ [RTA_TABLE] = { .type = NLA_U32 },
+};
+
+static int ip6mr_rtm_valid_getroute_req(struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ struct nlattr **tb,
+ struct netlink_ext_ack *extack)
+{
+ struct rtmsg *rtm;
+ int err;
+
+ err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, ip6mr_getroute_policy,
+ extack);
+ if (err)
+ return err;
+
+ rtm = nlmsg_data(nlh);
+ if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) ||
+ (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) ||
+ rtm->rtm_tos || rtm->rtm_table || rtm->rtm_protocol ||
+ rtm->rtm_scope || rtm->rtm_type || rtm->rtm_flags) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Invalid values in header for multicast route get request");
+ return -EINVAL;
+ }
+
+ if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
+ (tb[RTA_DST] && !rtm->rtm_dst_len)) {
+ NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int ip6mr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(in_skb->sk);
+ struct in6_addr src = {}, grp = {};
+ struct nlattr *tb[RTA_MAX + 1];
+ struct mfc6_cache *cache;
+ struct mr_table *mrt;
+ struct sk_buff *skb;
+ u32 tableid;
+ int err;
+
+ err = ip6mr_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
+ if (err < 0)
+ return err;
+
+ if (tb[RTA_SRC])
+ src = nla_get_in6_addr(tb[RTA_SRC]);
+ if (tb[RTA_DST])
+ grp = nla_get_in6_addr(tb[RTA_DST]);
+ tableid = nla_get_u32_default(tb[RTA_TABLE], 0);
+
+ mrt = __ip6mr_get_table(net, tableid ?: RT_TABLE_DEFAULT);
+ if (!mrt) {
+ NL_SET_ERR_MSG_MOD(extack, "MR table does not exist");
+ return -ENOENT;
+ }
+
+ /* entries are added/deleted only under RTNL */
+ rcu_read_lock();
+ cache = ip6mr_cache_find(mrt, &src, &grp);
+ rcu_read_unlock();
+ if (!cache) {
+ NL_SET_ERR_MSG_MOD(extack, "MR cache entry not found");
+ return -ENOENT;
+ }
+
+ skb = nlmsg_new(mr6_msgsize(false, mrt->maxvif), GFP_KERNEL);
+ if (!skb)
+ return -ENOBUFS;
+
+ err = ip6mr_fill_mroute(mrt, skb, NETLINK_CB(in_skb).portid,
+ nlh->nlmsg_seq, cache, RTM_NEWROUTE, 0);
+ if (err < 0) {
+ kfree_skb(skb);
+ return err;
+ }
+
+ return rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
+}
+
+static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ const struct nlmsghdr *nlh = cb->nlh;
+ struct fib_dump_filter filter = {
+ .rtnl_held = true,
+ };
+ int err;
+
+ if (cb->strict_check) {
+ err = ip_valid_fib_dump_req(sock_net(skb->sk), nlh,
+ &filter, cb);
+ if (err < 0)
+ return err;
+ }
+
+ if (filter.table_id) {
+ struct mr_table *mrt;
+
+ mrt = __ip6mr_get_table(sock_net(skb->sk), filter.table_id);
+ if (!mrt) {
+ if (rtnl_msg_family(cb->nlh) != RTNL_FAMILY_IP6MR)
+ return skb->len;
+
+ NL_SET_ERR_MSG_MOD(cb->extack, "MR table does not exist");
+ return -ENOENT;
+ }
+ err = mr_table_dump(mrt, skb, cb, _ip6mr_fill_mroute,
+ &mfc_unres_lock, &filter);
+ return skb->len ? : err;
+ }
+
+ return mr_rtm_dumproute(skb, cb, ip6mr_mr_table_iter,
+ _ip6mr_fill_mroute, &mfc_unres_lock, &filter);
+}
diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c
index 511730b67e97..8607569de34f 100644
--- a/net/ipv6/ipcomp6.c
+++ b/net/ipv6/ipcomp6.c
@@ -1,42 +1,31 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* IP Payload Compression Protocol (IPComp) for IPv6 - RFC3173
*
* Copyright (C)2003 USAGI/WIDE Project
*
* Author Mitsuru KANDA <mk@linux-ipv6.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
-/*
+/*
* [Memo]
*
* Outbound:
- * The compression of IP datagram MUST be done before AH/ESP processing,
- * fragmentation, and the addition of Hop-by-Hop/Routing header.
+ * The compression of IP datagram MUST be done before AH/ESP processing,
+ * fragmentation, and the addition of Hop-by-Hop/Routing header.
*
* Inbound:
- * The decompression of IP datagram MUST be done after the reassembly,
+ * The decompression of IP datagram MUST be done after the reassembly,
* AH/ESP processing.
*/
+
+#define pr_fmt(fmt) "IPv6: " fmt
+
#include <linux/module.h>
#include <net/ip.h>
#include <net/xfrm.h>
#include <net/ipcomp.h>
-#include <asm/scatterlist.h>
-#include <asm/semaphore.h>
#include <linux/crypto.h>
+#include <linux/err.h>
#include <linux/pfkeyv2.h>
#include <linux/random.h>
#include <linux/percpu.h>
@@ -44,6 +33,7 @@
#include <linux/list.h>
#include <linux/vmalloc.h>
#include <linux/rtnetlink.h>
+#include <net/ip6_route.h>
#include <net/icmp.h>
#include <net/ipv6.h>
#include <net/protocol.h>
@@ -51,172 +41,59 @@
#include <linux/icmpv6.h>
#include <linux/mutex.h>
-struct ipcomp6_tfms {
- struct list_head list;
- struct crypto_comp **tfms;
- int users;
-};
-
-static DEFINE_MUTEX(ipcomp6_resource_mutex);
-static void **ipcomp6_scratches;
-static int ipcomp6_scratch_users;
-static LIST_HEAD(ipcomp6_tfms_list);
-
-static int ipcomp6_input(struct xfrm_state *x, struct sk_buff *skb)
-{
- int err = -ENOMEM;
- struct ipv6hdr *iph;
- struct ipv6_comp_hdr *ipch;
- int plen, dlen;
- struct ipcomp_data *ipcd = x->data;
- u8 *start, *scratch;
- struct crypto_comp *tfm;
- int cpu;
-
- if (skb_linearize_cow(skb))
- goto out;
-
- skb->ip_summed = CHECKSUM_NONE;
-
- /* Remove ipcomp header and decompress original payload */
- iph = skb->nh.ipv6h;
- ipch = (void *)skb->data;
- skb->h.raw = skb->nh.raw + sizeof(*ipch);
- __skb_pull(skb, sizeof(*ipch));
-
- /* decompression */
- plen = skb->len;
- dlen = IPCOMP_SCRATCH_SIZE;
- start = skb->data;
-
- cpu = get_cpu();
- scratch = *per_cpu_ptr(ipcomp6_scratches, cpu);
- tfm = *per_cpu_ptr(ipcd->tfms, cpu);
-
- err = crypto_comp_decompress(tfm, start, plen, scratch, &dlen);
- if (err) {
- err = -EINVAL;
- goto out_put_cpu;
- }
-
- if (dlen < (plen + sizeof(struct ipv6_comp_hdr))) {
- err = -EINVAL;
- goto out_put_cpu;
- }
-
- err = pskb_expand_head(skb, 0, dlen - plen, GFP_ATOMIC);
- if (err) {
- goto out_put_cpu;
- }
-
- skb->truesize += dlen - plen;
- __skb_put(skb, dlen - plen);
- memcpy(skb->data, scratch, dlen);
- err = ipch->nexthdr;
-
-out_put_cpu:
- put_cpu();
-out:
- return err;
-}
-
-static int ipcomp6_output(struct xfrm_state *x, struct sk_buff *skb)
-{
- int err;
- struct ipv6hdr *top_iph;
- int hdr_len;
- struct ipv6_comp_hdr *ipch;
- struct ipcomp_data *ipcd = x->data;
- int plen, dlen;
- u8 *start, *scratch;
- struct crypto_comp *tfm;
- int cpu;
-
- hdr_len = skb->h.raw - skb->data;
-
- /* check whether datagram len is larger than threshold */
- if ((skb->len - hdr_len) < ipcd->threshold) {
- goto out_ok;
- }
-
- if (skb_linearize_cow(skb))
- goto out_ok;
-
- /* compression */
- plen = skb->len - hdr_len;
- dlen = IPCOMP_SCRATCH_SIZE;
- start = skb->h.raw;
-
- cpu = get_cpu();
- scratch = *per_cpu_ptr(ipcomp6_scratches, cpu);
- tfm = *per_cpu_ptr(ipcd->tfms, cpu);
-
- err = crypto_comp_compress(tfm, start, plen, scratch, &dlen);
- if (err || (dlen + sizeof(struct ipv6_comp_hdr)) >= plen) {
- put_cpu();
- goto out_ok;
- }
- memcpy(start + sizeof(struct ip_comp_hdr), scratch, dlen);
- put_cpu();
- pskb_trim(skb, hdr_len + dlen + sizeof(struct ip_comp_hdr));
-
- /* insert ipcomp header and replace datagram */
- top_iph = (struct ipv6hdr *)skb->data;
-
- top_iph->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
-
- ipch = (struct ipv6_comp_hdr *)start;
- ipch->nexthdr = *skb->nh.raw;
- ipch->flags = 0;
- ipch->cpi = htons((u16 )ntohl(x->id.spi));
- *skb->nh.raw = IPPROTO_COMP;
-
-out_ok:
- return 0;
-}
-
-static void ipcomp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
- int type, int code, int offset, __be32 info)
+static int ipcomp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+ u8 type, u8 code, int offset, __be32 info)
{
+ struct net *net = dev_net(skb->dev);
__be32 spi;
- struct ipv6hdr *iph = (struct ipv6hdr*)skb->data;
- struct ipv6_comp_hdr *ipcomph = (struct ipv6_comp_hdr*)(skb->data+offset);
+ const struct ipv6hdr *iph = (const struct ipv6hdr *)skb->data;
+ struct ip_comp_hdr *ipcomph =
+ (struct ip_comp_hdr *)(skb->data + offset);
struct xfrm_state *x;
- if (type != ICMPV6_DEST_UNREACH && type != ICMPV6_PKT_TOOBIG)
- return;
+ if (type != ICMPV6_PKT_TOOBIG &&
+ type != NDISC_REDIRECT)
+ return 0;
spi = htonl(ntohs(ipcomph->cpi));
- x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, spi, IPPROTO_COMP, AF_INET6);
+ x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
+ spi, IPPROTO_COMP, AF_INET6);
if (!x)
- return;
+ return 0;
- printk(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/" NIP6_FMT "\n",
- spi, NIP6(iph->daddr));
+ if (type == NDISC_REDIRECT)
+ ip6_redirect(skb, net, skb->dev->ifindex, 0,
+ sock_net_uid(net, NULL));
+ else
+ ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL));
xfrm_state_put(x);
+
+ return 0;
}
+static struct lock_class_key xfrm_state_lock_key;
static struct xfrm_state *ipcomp6_tunnel_create(struct xfrm_state *x)
{
+ struct net *net = xs_net(x);
struct xfrm_state *t = NULL;
- u8 mode = XFRM_MODE_TUNNEL;
- t = xfrm_state_alloc();
+ t = xfrm_state_alloc(net);
if (!t)
goto out;
+ lockdep_set_class(&t->lock, &xfrm_state_lock_key);
t->id.proto = IPPROTO_IPV6;
- t->id.spi = xfrm6_tunnel_alloc_spi((xfrm_address_t *)&x->props.saddr);
+ t->id.spi = xfrm6_tunnel_alloc_spi(net, (xfrm_address_t *)&x->props.saddr);
if (!t->id.spi)
goto error;
memcpy(t->id.daddr.a6, x->id.daddr.a6, sizeof(struct in6_addr));
memcpy(&t->sel, &x->sel, sizeof(t->sel));
t->props.family = AF_INET6;
- if (x->props.mode == XFRM_MODE_BEET)
- mode = x->props.mode;
- t->props.mode = mode;
+ t->props.mode = x->props.mode;
memcpy(t->props.saddr.a6, x->props.saddr.a6, sizeof(struct in6_addr));
+ memcpy(&t->mark, &x->mark, sizeof(t->mark));
+ t->if_id = x->if_id;
if (xfrm_init_state(t))
goto error;
@@ -235,13 +112,15 @@ error:
static int ipcomp6_tunnel_attach(struct xfrm_state *x)
{
+ struct net *net = xs_net(x);
int err = 0;
struct xfrm_state *t = NULL;
__be32 spi;
+ u32 mark = x->mark.m & x->mark.v;
- spi = xfrm6_tunnel_spi_lookup((xfrm_address_t *)&x->props.saddr);
+ spi = xfrm6_tunnel_spi_lookup(net, (xfrm_address_t *)&x->props.saddr);
if (spi)
- t = xfrm_state_lookup((xfrm_address_t *)&x->id.daddr,
+ t = xfrm_state_lookup(net, mark, (xfrm_address_t *)&x->id.daddr,
spi, IPPROTO_IPV6, AF_INET6);
if (!t) {
t = ipcomp6_tunnel_create(x);
@@ -259,229 +138,70 @@ out:
return err;
}
-static void ipcomp6_free_scratches(void)
-{
- int i;
- void **scratches;
-
- if (--ipcomp6_scratch_users)
- return;
-
- scratches = ipcomp6_scratches;
- if (!scratches)
- return;
-
- for_each_possible_cpu(i) {
- void *scratch = *per_cpu_ptr(scratches, i);
-
- vfree(scratch);
- }
-
- free_percpu(scratches);
-}
-
-static void **ipcomp6_alloc_scratches(void)
+static int ipcomp6_init_state(struct xfrm_state *x,
+ struct netlink_ext_ack *extack)
{
- int i;
- void **scratches;
-
- if (ipcomp6_scratch_users++)
- return ipcomp6_scratches;
-
- scratches = alloc_percpu(void *);
- if (!scratches)
- return NULL;
-
- ipcomp6_scratches = scratches;
-
- for_each_possible_cpu(i) {
- void *scratch = vmalloc(IPCOMP_SCRATCH_SIZE);
- if (!scratch)
- return NULL;
- *per_cpu_ptr(scratches, i) = scratch;
- }
-
- return scratches;
-}
-
-static void ipcomp6_free_tfms(struct crypto_comp **tfms)
-{
- struct ipcomp6_tfms *pos;
- int cpu;
-
- list_for_each_entry(pos, &ipcomp6_tfms_list, list) {
- if (pos->tfms == tfms)
- break;
- }
-
- BUG_TRAP(pos);
-
- if (--pos->users)
- return;
-
- list_del(&pos->list);
- kfree(pos);
-
- if (!tfms)
- return;
-
- for_each_possible_cpu(cpu) {
- struct crypto_comp *tfm = *per_cpu_ptr(tfms, cpu);
- crypto_free_comp(tfm);
- }
- free_percpu(tfms);
-}
-
-static struct crypto_comp **ipcomp6_alloc_tfms(const char *alg_name)
-{
- struct ipcomp6_tfms *pos;
- struct crypto_comp **tfms;
- int cpu;
-
- /* This can be any valid CPU ID so we don't need locking. */
- cpu = raw_smp_processor_id();
-
- list_for_each_entry(pos, &ipcomp6_tfms_list, list) {
- struct crypto_comp *tfm;
-
- tfms = pos->tfms;
- tfm = *per_cpu_ptr(tfms, cpu);
-
- if (!strcmp(crypto_comp_name(tfm), alg_name)) {
- pos->users++;
- return tfms;
- }
- }
-
- pos = kmalloc(sizeof(*pos), GFP_KERNEL);
- if (!pos)
- return NULL;
-
- pos->users = 1;
- INIT_LIST_HEAD(&pos->list);
- list_add(&pos->list, &ipcomp6_tfms_list);
-
- pos->tfms = tfms = alloc_percpu(struct crypto_comp *);
- if (!tfms)
- goto error;
-
- for_each_possible_cpu(cpu) {
- struct crypto_comp *tfm = crypto_alloc_comp(alg_name, 0,
- CRYPTO_ALG_ASYNC);
- if (!tfm)
- goto error;
- *per_cpu_ptr(tfms, cpu) = tfm;
- }
-
- return tfms;
-
-error:
- ipcomp6_free_tfms(tfms);
- return NULL;
-}
-
-static void ipcomp6_free_data(struct ipcomp_data *ipcd)
-{
- if (ipcd->tfms)
- ipcomp6_free_tfms(ipcd->tfms);
- ipcomp6_free_scratches();
-}
-
-static void ipcomp6_destroy(struct xfrm_state *x)
-{
- struct ipcomp_data *ipcd = x->data;
- if (!ipcd)
- return;
- xfrm_state_delete_tunnel(x);
- mutex_lock(&ipcomp6_resource_mutex);
- ipcomp6_free_data(ipcd);
- mutex_unlock(&ipcomp6_resource_mutex);
- kfree(ipcd);
-
- xfrm6_tunnel_free_spi((xfrm_address_t *)&x->props.saddr);
-}
-
-static int ipcomp6_init_state(struct xfrm_state *x)
-{
- int err;
- struct ipcomp_data *ipcd;
- struct xfrm_algo_desc *calg_desc;
-
- err = -EINVAL;
- if (!x->calg)
- goto out;
-
- if (x->encap)
- goto out;
-
- err = -ENOMEM;
- ipcd = kzalloc(sizeof(*ipcd), GFP_KERNEL);
- if (!ipcd)
- goto out;
+ int err = -EINVAL;
x->props.header_len = 0;
- if (x->props.mode == XFRM_MODE_TUNNEL)
+ switch (x->props.mode) {
+ case XFRM_MODE_TRANSPORT:
+ break;
+ case XFRM_MODE_TUNNEL:
x->props.header_len += sizeof(struct ipv6hdr);
-
- mutex_lock(&ipcomp6_resource_mutex);
- if (!ipcomp6_alloc_scratches())
- goto error;
+ break;
+ default:
+ NL_SET_ERR_MSG(extack, "Unsupported XFRM mode for IPcomp");
+ goto out;
+ }
- ipcd->tfms = ipcomp6_alloc_tfms(x->calg->alg_name);
- if (!ipcd->tfms)
- goto error;
- mutex_unlock(&ipcomp6_resource_mutex);
+ err = ipcomp_init_state(x, extack);
+ if (err)
+ goto out;
if (x->props.mode == XFRM_MODE_TUNNEL) {
err = ipcomp6_tunnel_attach(x);
- if (err)
- goto error_tunnel;
+ if (err) {
+ NL_SET_ERR_MSG(extack, "Kernel error: failed to initialize the associated state");
+ goto out;
+ }
}
- calg_desc = xfrm_calg_get_byname(x->calg->alg_name, 0);
- BUG_ON(!calg_desc);
- ipcd->threshold = calg_desc->uinfo.comp.threshold;
- x->data = ipcd;
err = 0;
out:
return err;
-error_tunnel:
- mutex_lock(&ipcomp6_resource_mutex);
-error:
- ipcomp6_free_data(ipcd);
- mutex_unlock(&ipcomp6_resource_mutex);
- kfree(ipcd);
-
- goto out;
}
-static struct xfrm_type ipcomp6_type =
+static int ipcomp6_rcv_cb(struct sk_buff *skb, int err)
{
- .description = "IPCOMP6",
+ return 0;
+}
+
+static const struct xfrm_type ipcomp6_type = {
.owner = THIS_MODULE,
.proto = IPPROTO_COMP,
.init_state = ipcomp6_init_state,
- .destructor = ipcomp6_destroy,
- .input = ipcomp6_input,
- .output = ipcomp6_output,
- .hdr_offset = xfrm6_find_1stfragopt,
+ .destructor = ipcomp_destroy,
+ .input = ipcomp_input,
+ .output = ipcomp_output,
};
-static struct inet6_protocol ipcomp6_protocol =
-{
+static struct xfrm6_protocol ipcomp6_protocol = {
.handler = xfrm6_rcv,
+ .input_handler = xfrm_input,
+ .cb_handler = ipcomp6_rcv_cb,
.err_handler = ipcomp6_err,
- .flags = INET6_PROTO_NOPOLICY,
+ .priority = 0,
};
static int __init ipcomp6_init(void)
{
if (xfrm_register_type(&ipcomp6_type, AF_INET6) < 0) {
- printk(KERN_INFO "ipcomp6 init: can't add xfrm type\n");
+ pr_info("%s: can't add xfrm type\n", __func__);
return -EAGAIN;
}
- if (inet6_add_protocol(&ipcomp6_protocol, IPPROTO_COMP) < 0) {
- printk(KERN_INFO "ipcomp6 init: can't add protocol\n");
+ if (xfrm6_protocol_register(&ipcomp6_protocol, IPPROTO_COMP) < 0) {
+ pr_info("%s: can't add protocol\n", __func__);
xfrm_unregister_type(&ipcomp6_type, AF_INET6);
return -EAGAIN;
}
@@ -490,10 +210,9 @@ static int __init ipcomp6_init(void)
static void __exit ipcomp6_fini(void)
{
- if (inet6_del_protocol(&ipcomp6_protocol, IPPROTO_COMP) < 0)
- printk(KERN_INFO "ipv6 ipcomp close: can't remove protocol\n");
- if (xfrm_unregister_type(&ipcomp6_type, AF_INET6) < 0)
- printk(KERN_INFO "ipv6 ipcomp close: can't remove xfrm type\n");
+ if (xfrm6_protocol_deregister(&ipcomp6_protocol, IPPROTO_COMP) < 0)
+ pr_info("%s: can't remove protocol\n", __func__);
+ xfrm_unregister_type(&ipcomp6_type, AF_INET6);
}
module_init(ipcomp6_init);
@@ -502,4 +221,4 @@ MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("IP Payload Compression Protocol (IPComp) for IPv6 - RFC3173");
MODULE_AUTHOR("Mitsuru KANDA <mk@linux-ipv6.org>");
-
+MODULE_ALIAS_XFRM_TYPE(AF_INET6, XFRM_PROTO_COMP);
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 1eafcfc95e81..a61e742794f9 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -1,22 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* IPv6 BSD socket options interface
- * Linux INET6 implementation
+ * Linux INET6 implementation
*
* Authors:
- * Pedro Roque <roque@di.fc.ul.pt>
+ * Pedro Roque <roque@di.fc.ul.pt>
*
* Based on linux/net/ipv4/ip_sockglue.c
*
- * $Id: ipv6_sockglue.c,v 1.41 2002/02/01 22:01:04 davem Exp $
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* FIXME: Make the setsockopt code POSIX compliant: That is
*
- * o Return -EINVAL for setsockopt of short lengths
* o Truncate getsockopt returns
* o Return an optlen of the truncated length if need be
*
@@ -31,14 +24,15 @@
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/sockios.h>
-#include <linux/sched.h>
#include <linux/net.h>
#include <linux/in6.h>
+#include <linux/mroute6.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/init.h>
#include <linux/sysctl.h>
#include <linux/netfilter.h>
+#include <linux/slab.h>
#include <net/sock.h>
#include <net/snmp.h>
@@ -53,196 +47,537 @@
#include <net/udp.h>
#include <net/udplite.h>
#include <net/xfrm.h>
+#include <net/compat.h>
+#include <net/seg6.h>
+#include <net/psp.h>
+
+#include <linux/uaccess.h>
-#include <asm/uaccess.h>
+struct ip6_ra_chain *ip6_ra_chain;
+DEFINE_RWLOCK(ip6_ra_lock);
-DEFINE_SNMP_STAT(struct ipstats_mib, ipv6_statistics) __read_mostly;
+DEFINE_STATIC_KEY_FALSE(ip6_min_hopcount);
-static struct inet6_protocol *ipv6_gso_pull_exthdrs(struct sk_buff *skb,
- int proto)
+int ip6_ra_control(struct sock *sk, int sel)
{
- struct inet6_protocol *ops = NULL;
+ struct ip6_ra_chain *ra, *new_ra, **rap;
- for (;;) {
- struct ipv6_opt_hdr *opth;
- int len;
+ /* RA packet may be delivered ONLY to IPPROTO_RAW socket */
+ if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num != IPPROTO_RAW)
+ return -ENOPROTOOPT;
- if (proto != NEXTHDR_HOP) {
- ops = rcu_dereference(inet6_protos[proto]);
+ new_ra = (sel >= 0) ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL;
+ if (sel >= 0 && !new_ra)
+ return -ENOMEM;
- if (unlikely(!ops))
- break;
+ write_lock_bh(&ip6_ra_lock);
+ for (rap = &ip6_ra_chain; (ra = *rap) != NULL; rap = &ra->next) {
+ if (ra->sk == sk) {
+ if (sel >= 0) {
+ write_unlock_bh(&ip6_ra_lock);
+ kfree(new_ra);
+ return -EADDRINUSE;
+ }
- if (!(ops->flags & INET6_PROTO_GSO_EXTHDR))
- break;
+ *rap = ra->next;
+ write_unlock_bh(&ip6_ra_lock);
+
+ sock_put(sk);
+ kfree(ra);
+ return 0;
}
+ }
+ if (!new_ra) {
+ write_unlock_bh(&ip6_ra_lock);
+ return -ENOBUFS;
+ }
+ new_ra->sk = sk;
+ new_ra->sel = sel;
+ new_ra->next = ra;
+ *rap = new_ra;
+ sock_hold(sk);
+ write_unlock_bh(&ip6_ra_lock);
+ return 0;
+}
- if (unlikely(!pskb_may_pull(skb, 8)))
- break;
+struct ipv6_txoptions *ipv6_update_options(struct sock *sk,
+ struct ipv6_txoptions *opt)
+{
+ if (inet_test_bit(IS_ICSK, sk)) {
+ if (opt &&
+ !((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) &&
+ inet_sk(sk)->inet_daddr != LOOPBACK4_IPV6) {
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ icsk->icsk_ext_hdr_len =
+ psp_sk_overhead(sk) +
+ opt->opt_flen + opt->opt_nflen;
+ icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie);
+ }
+ }
+ opt = unrcu_pointer(xchg(&inet6_sk(sk)->opt, RCU_INITIALIZER(opt)));
+ sk_dst_reset(sk);
- opth = (void *)skb->data;
- len = opth->hdrlen * 8 + 8;
+ return opt;
+}
- if (unlikely(!pskb_may_pull(skb, len)))
- break;
+static int copy_group_source_from_sockptr(struct group_source_req *greqs,
+ sockptr_t optval, int optlen)
+{
+ if (in_compat_syscall()) {
+ struct compat_group_source_req gr32;
- proto = opth->nexthdr;
- __skb_pull(skb, len);
+ if (optlen < sizeof(gr32))
+ return -EINVAL;
+ if (copy_from_sockptr(&gr32, optval, sizeof(gr32)))
+ return -EFAULT;
+ greqs->gsr_interface = gr32.gsr_interface;
+ greqs->gsr_group = gr32.gsr_group;
+ greqs->gsr_source = gr32.gsr_source;
+ } else {
+ if (optlen < sizeof(*greqs))
+ return -EINVAL;
+ if (copy_from_sockptr(greqs, optval, sizeof(*greqs)))
+ return -EFAULT;
}
- return ops;
+ return 0;
}
-static int ipv6_gso_send_check(struct sk_buff *skb)
+static int do_ipv6_mcast_group_source(struct sock *sk, int optname,
+ sockptr_t optval, int optlen)
{
- struct ipv6hdr *ipv6h;
- struct inet6_protocol *ops;
- int err = -EINVAL;
-
- if (unlikely(!pskb_may_pull(skb, sizeof(*ipv6h))))
- goto out;
-
- ipv6h = skb->nh.ipv6h;
- __skb_pull(skb, sizeof(*ipv6h));
- err = -EPROTONOSUPPORT;
-
- rcu_read_lock();
- ops = ipv6_gso_pull_exthdrs(skb, ipv6h->nexthdr);
- if (likely(ops && ops->gso_send_check)) {
- skb->h.raw = skb->data;
- err = ops->gso_send_check(skb);
+ struct group_source_req greqs;
+ int omode, add;
+ int ret;
+
+ ret = copy_group_source_from_sockptr(&greqs, optval, optlen);
+ if (ret)
+ return ret;
+
+ if (greqs.gsr_group.ss_family != AF_INET6 ||
+ greqs.gsr_source.ss_family != AF_INET6)
+ return -EADDRNOTAVAIL;
+
+ if (optname == MCAST_BLOCK_SOURCE) {
+ omode = MCAST_EXCLUDE;
+ add = 1;
+ } else if (optname == MCAST_UNBLOCK_SOURCE) {
+ omode = MCAST_EXCLUDE;
+ add = 0;
+ } else if (optname == MCAST_JOIN_SOURCE_GROUP) {
+ struct sockaddr_in6 *psin6;
+ int retv;
+
+ psin6 = (struct sockaddr_in6 *)&greqs.gsr_group;
+ retv = ipv6_sock_mc_join_ssm(sk, greqs.gsr_interface,
+ &psin6->sin6_addr,
+ MCAST_INCLUDE);
+ /* prior join w/ different source is ok */
+ if (retv && retv != -EADDRINUSE)
+ return retv;
+ omode = MCAST_INCLUDE;
+ add = 1;
+ } else /* MCAST_LEAVE_SOURCE_GROUP */ {
+ omode = MCAST_INCLUDE;
+ add = 0;
}
- rcu_read_unlock();
-
-out:
- return err;
+ return ip6_mc_source(add, omode, sk, &greqs);
}
-static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, int features)
+static int ipv6_set_mcast_msfilter(struct sock *sk, sockptr_t optval,
+ int optlen)
{
- struct sk_buff *segs = ERR_PTR(-EINVAL);
- struct ipv6hdr *ipv6h;
- struct inet6_protocol *ops;
-
- if (!(features & NETIF_F_HW_CSUM))
- features &= ~NETIF_F_SG;
-
- if (unlikely(skb_shinfo(skb)->gso_type &
- ~(SKB_GSO_UDP |
- SKB_GSO_DODGY |
- SKB_GSO_TCP_ECN |
- SKB_GSO_TCPV6 |
- 0)))
- goto out;
-
- if (unlikely(!pskb_may_pull(skb, sizeof(*ipv6h))))
- goto out;
-
- ipv6h = skb->nh.ipv6h;
- __skb_pull(skb, sizeof(*ipv6h));
- segs = ERR_PTR(-EPROTONOSUPPORT);
-
- rcu_read_lock();
- ops = ipv6_gso_pull_exthdrs(skb, ipv6h->nexthdr);
- if (likely(ops && ops->gso_segment)) {
- skb->h.raw = skb->data;
- segs = ops->gso_segment(skb, features);
- }
- rcu_read_unlock();
+ struct group_filter *gsf;
+ int ret;
- if (unlikely(IS_ERR(segs)))
- goto out;
+ if (optlen < GROUP_FILTER_SIZE(0))
+ return -EINVAL;
+ if (optlen > READ_ONCE(sock_net(sk)->core.sysctl_optmem_max))
+ return -ENOBUFS;
- for (skb = segs; skb; skb = skb->next) {
- ipv6h = skb->nh.ipv6h;
- ipv6h->payload_len = htons(skb->len - skb->mac_len -
- sizeof(*ipv6h));
- }
+ gsf = memdup_sockptr(optval, optlen);
+ if (IS_ERR(gsf))
+ return PTR_ERR(gsf);
+
+ /* numsrc >= (4G-140)/128 overflow in 32 bits */
+ ret = -ENOBUFS;
+ if (gsf->gf_numsrc >= 0x1ffffffU ||
+ gsf->gf_numsrc > sysctl_mld_max_msf)
+ goto out_free_gsf;
+
+ ret = -EINVAL;
+ if (GROUP_FILTER_SIZE(gsf->gf_numsrc) > optlen)
+ goto out_free_gsf;
-out:
- return segs;
+ ret = ip6_mc_msfilter(sk, gsf, gsf->gf_slist_flex);
+out_free_gsf:
+ kfree(gsf);
+ return ret;
}
-static struct packet_type ipv6_packet_type = {
- .type = __constant_htons(ETH_P_IPV6),
- .func = ipv6_rcv,
- .gso_send_check = ipv6_gso_send_check,
- .gso_segment = ipv6_gso_segment,
-};
+static int compat_ipv6_set_mcast_msfilter(struct sock *sk, sockptr_t optval,
+ int optlen)
+{
+ const int size0 = offsetof(struct compat_group_filter, gf_slist_flex);
+ struct compat_group_filter *gf32;
+ void *p;
+ int ret;
+ int n;
-struct ip6_ra_chain *ip6_ra_chain;
-DEFINE_RWLOCK(ip6_ra_lock);
+ if (optlen < size0)
+ return -EINVAL;
+ if (optlen > READ_ONCE(sock_net(sk)->core.sysctl_optmem_max) - 4)
+ return -ENOBUFS;
+
+ p = kmalloc(optlen + 4, GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+
+ gf32 = p + 4; /* we want ->gf_group and ->gf_slist_flex aligned */
+ ret = -EFAULT;
+ if (copy_from_sockptr(gf32, optval, optlen))
+ goto out_free_p;
+
+ /* numsrc >= (4G-140)/128 overflow in 32 bits */
+ ret = -ENOBUFS;
+ n = gf32->gf_numsrc;
+ if (n >= 0x1ffffffU || n > sysctl_mld_max_msf)
+ goto out_free_p;
+
+ ret = -EINVAL;
+ if (offsetof(struct compat_group_filter, gf_slist_flex[n]) > optlen)
+ goto out_free_p;
+
+ ret = ip6_mc_msfilter(sk, &(struct group_filter){
+ .gf_interface = gf32->gf_interface,
+ .gf_group = gf32->gf_group,
+ .gf_fmode = gf32->gf_fmode,
+ .gf_numsrc = gf32->gf_numsrc}, gf32->gf_slist_flex);
+
+out_free_p:
+ kfree(p);
+ return ret;
+}
-int ip6_ra_control(struct sock *sk, int sel, void (*destructor)(struct sock *))
+static int ipv6_mcast_join_leave(struct sock *sk, int optname,
+ sockptr_t optval, int optlen)
{
- struct ip6_ra_chain *ra, *new_ra, **rap;
+ struct sockaddr_in6 *psin6;
+ struct group_req greq;
- /* RA packet may be delivered ONLY to IPPROTO_RAW socket */
- if (sk->sk_type != SOCK_RAW || inet_sk(sk)->num != IPPROTO_RAW)
+ if (optlen < sizeof(greq))
return -EINVAL;
+ if (copy_from_sockptr(&greq, optval, sizeof(greq)))
+ return -EFAULT;
- new_ra = (sel>=0) ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL;
+ if (greq.gr_group.ss_family != AF_INET6)
+ return -EADDRNOTAVAIL;
+ psin6 = (struct sockaddr_in6 *)&greq.gr_group;
+ if (optname == MCAST_JOIN_GROUP)
+ return ipv6_sock_mc_join(sk, greq.gr_interface,
+ &psin6->sin6_addr);
+ return ipv6_sock_mc_drop(sk, greq.gr_interface, &psin6->sin6_addr);
+}
- write_lock_bh(&ip6_ra_lock);
- for (rap = &ip6_ra_chain; (ra=*rap) != NULL; rap = &ra->next) {
- if (ra->sk == sk) {
- if (sel>=0) {
- write_unlock_bh(&ip6_ra_lock);
- kfree(new_ra);
- return -EADDRINUSE;
- }
+static int compat_ipv6_mcast_join_leave(struct sock *sk, int optname,
+ sockptr_t optval, int optlen)
+{
+ struct compat_group_req gr32;
+ struct sockaddr_in6 *psin6;
- *rap = ra->next;
- write_unlock_bh(&ip6_ra_lock);
+ if (optlen < sizeof(gr32))
+ return -EINVAL;
+ if (copy_from_sockptr(&gr32, optval, sizeof(gr32)))
+ return -EFAULT;
- if (ra->destructor)
- ra->destructor(sk);
- sock_put(sk);
- kfree(ra);
- return 0;
+ if (gr32.gr_group.ss_family != AF_INET6)
+ return -EADDRNOTAVAIL;
+ psin6 = (struct sockaddr_in6 *)&gr32.gr_group;
+ if (optname == MCAST_JOIN_GROUP)
+ return ipv6_sock_mc_join(sk, gr32.gr_interface,
+ &psin6->sin6_addr);
+ return ipv6_sock_mc_drop(sk, gr32.gr_interface, &psin6->sin6_addr);
+}
+
+static int ipv6_set_opt_hdr(struct sock *sk, int optname, sockptr_t optval,
+ int optlen)
+{
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct ipv6_opt_hdr *new = NULL;
+ struct net *net = sock_net(sk);
+ struct ipv6_txoptions *opt;
+ int err;
+
+ /* hop-by-hop / destination options are privileged option */
+ if (optname != IPV6_RTHDR && !sockopt_ns_capable(net->user_ns, CAP_NET_RAW))
+ return -EPERM;
+
+ /* remove any sticky options header with a zero option
+ * length, per RFC3542.
+ */
+ if (optlen > 0) {
+ if (sockptr_is_null(optval))
+ return -EINVAL;
+ if (optlen < sizeof(struct ipv6_opt_hdr) ||
+ optlen & 0x7 ||
+ optlen > 8 * 255)
+ return -EINVAL;
+
+ new = memdup_sockptr(optval, optlen);
+ if (IS_ERR(new))
+ return PTR_ERR(new);
+ if (unlikely(ipv6_optlen(new) > optlen)) {
+ kfree(new);
+ return -EINVAL;
}
}
- if (new_ra == NULL) {
- write_unlock_bh(&ip6_ra_lock);
- return -ENOBUFS;
+
+ opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk));
+ opt = ipv6_renew_options(sk, opt, optname, new);
+ kfree(new);
+ if (IS_ERR(opt))
+ return PTR_ERR(opt);
+
+ /* routing header option needs extra check */
+ err = -EINVAL;
+ if (optname == IPV6_RTHDR && opt && opt->srcrt) {
+ struct ipv6_rt_hdr *rthdr = opt->srcrt;
+ switch (rthdr->type) {
+#if IS_ENABLED(CONFIG_IPV6_MIP6)
+ case IPV6_SRCRT_TYPE_2:
+ if (rthdr->hdrlen != 2 || rthdr->segments_left != 1)
+ goto sticky_done;
+ break;
+#endif
+ case IPV6_SRCRT_TYPE_4:
+ {
+ struct ipv6_sr_hdr *srh =
+ (struct ipv6_sr_hdr *)opt->srcrt;
+
+ if (!seg6_validate_srh(srh, optlen, false))
+ goto sticky_done;
+ break;
+ }
+ default:
+ goto sticky_done;
+ }
}
- new_ra->sk = sk;
- new_ra->sel = sel;
- new_ra->destructor = destructor;
- new_ra->next = ra;
- *rap = new_ra;
- sock_hold(sk);
- write_unlock_bh(&ip6_ra_lock);
- return 0;
+
+ err = 0;
+ opt = ipv6_update_options(sk, opt);
+sticky_done:
+ if (opt) {
+ atomic_sub(opt->tot_len, &sk->sk_omem_alloc);
+ txopt_put(opt);
+ }
+ return err;
}
-static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int optlen)
+int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
+ sockptr_t optval, unsigned int optlen)
{
struct ipv6_pinfo *np = inet6_sk(sk);
- int val, valbool;
+ struct net *net = sock_net(sk);
int retv = -ENOPROTOOPT;
+ int val, valbool;
- if (optval == NULL)
- val=0;
- else if (get_user(val, (int __user *) optval))
- return -EFAULT;
+ if (sockptr_is_null(optval))
+ val = 0;
+ else {
+ if (optlen >= sizeof(int)) {
+ if (copy_from_sockptr(&val, optval, sizeof(val)))
+ return -EFAULT;
+ } else
+ val = 0;
+ }
- valbool = (val!=0);
+ valbool = (val != 0);
- lock_sock(sk);
+ if (ip6_mroute_opt(optname))
+ return ip6_mroute_setsockopt(sk, optname, optval, optlen);
+
+ /* Handle options that can be set without locking the socket. */
+ switch (optname) {
+ case IPV6_UNICAST_HOPS:
+ if (optlen < sizeof(int))
+ return -EINVAL;
+ if (val > 255 || val < -1)
+ return -EINVAL;
+ WRITE_ONCE(np->hop_limit, val);
+ return 0;
+ case IPV6_MULTICAST_LOOP:
+ if (optlen < sizeof(int))
+ return -EINVAL;
+ if (val != valbool)
+ return -EINVAL;
+ inet6_assign_bit(MC6_LOOP, sk, valbool);
+ return 0;
+ case IPV6_MULTICAST_HOPS:
+ if (sk->sk_type == SOCK_STREAM)
+ return retv;
+ if (optlen < sizeof(int))
+ return -EINVAL;
+ if (val > 255 || val < -1)
+ return -EINVAL;
+ WRITE_ONCE(np->mcast_hops,
+ val == -1 ? IPV6_DEFAULT_MCASTHOPS : val);
+ return 0;
+ case IPV6_MTU:
+ if (optlen < sizeof(int))
+ return -EINVAL;
+ if (val && val < IPV6_MIN_MTU)
+ return -EINVAL;
+ WRITE_ONCE(np->frag_size, val);
+ return 0;
+ case IPV6_MINHOPCOUNT:
+ if (optlen < sizeof(int))
+ return -EINVAL;
+ if (val < 0 || val > 255)
+ return -EINVAL;
+
+ if (val)
+ static_branch_enable(&ip6_min_hopcount);
+
+ /* tcp_v6_err() and tcp_v6_rcv() might read min_hopcount
+ * while we are changing it.
+ */
+ WRITE_ONCE(np->min_hopcount, val);
+ return 0;
+ case IPV6_RECVERR_RFC4884:
+ if (optlen < sizeof(int))
+ return -EINVAL;
+ if (val < 0 || val > 1)
+ return -EINVAL;
+ inet6_assign_bit(RECVERR6_RFC4884, sk, valbool);
+ return 0;
+ case IPV6_MULTICAST_ALL:
+ if (optlen < sizeof(int))
+ return -EINVAL;
+ inet6_assign_bit(MC6_ALL, sk, valbool);
+ return 0;
+ case IPV6_AUTOFLOWLABEL:
+ inet6_assign_bit(AUTOFLOWLABEL, sk, valbool);
+ inet6_set_bit(AUTOFLOWLABEL_SET, sk);
+ return 0;
+ case IPV6_DONTFRAG:
+ inet6_assign_bit(DONTFRAG, sk, valbool);
+ return 0;
+ case IPV6_RECVERR:
+ if (optlen < sizeof(int))
+ return -EINVAL;
+ inet6_assign_bit(RECVERR6, sk, valbool);
+ if (!val)
+ skb_errqueue_purge(&sk->sk_error_queue);
+ return 0;
+ case IPV6_ROUTER_ALERT_ISOLATE:
+ if (optlen < sizeof(int))
+ return -EINVAL;
+ inet6_assign_bit(RTALERT_ISOLATE, sk, valbool);
+ return 0;
+ case IPV6_MTU_DISCOVER:
+ if (optlen < sizeof(int))
+ return -EINVAL;
+ if (val < IPV6_PMTUDISC_DONT || val > IPV6_PMTUDISC_OMIT)
+ return -EINVAL;
+ WRITE_ONCE(np->pmtudisc, val);
+ return 0;
+ case IPV6_FLOWINFO_SEND:
+ if (optlen < sizeof(int))
+ return -EINVAL;
+ inet6_assign_bit(SNDFLOW, sk, valbool);
+ return 0;
+ case IPV6_ADDR_PREFERENCES:
+ if (optlen < sizeof(int))
+ return -EINVAL;
+ return ip6_sock_set_addr_preferences(sk, val);
+ case IPV6_MULTICAST_IF:
+ if (sk->sk_type == SOCK_STREAM)
+ return -ENOPROTOOPT;
+ if (optlen < sizeof(int))
+ return -EINVAL;
+ if (val) {
+ struct net_device *dev;
+ int bound_dev_if, midx;
+
+ rcu_read_lock();
+
+ dev = dev_get_by_index_rcu(net, val);
+ if (!dev) {
+ rcu_read_unlock();
+ return -ENODEV;
+ }
+ midx = l3mdev_master_ifindex_rcu(dev);
+
+ rcu_read_unlock();
+
+ bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
+ if (bound_dev_if &&
+ bound_dev_if != val &&
+ (!midx || midx != bound_dev_if))
+ return -EINVAL;
+ }
+ WRITE_ONCE(np->mcast_oif, val);
+ return 0;
+ case IPV6_UNICAST_IF:
+ {
+ struct net_device *dev;
+ int ifindex;
+
+ if (optlen != sizeof(int))
+ return -EINVAL;
+
+ ifindex = (__force int)ntohl((__force __be32)val);
+ if (!ifindex) {
+ WRITE_ONCE(np->ucast_oif, 0);
+ return 0;
+ }
+
+ dev = dev_get_by_index(net, ifindex);
+ if (!dev)
+ return -EADDRNOTAVAIL;
+ dev_put(dev);
+
+ if (READ_ONCE(sk->sk_bound_dev_if))
+ return -EINVAL;
+
+ WRITE_ONCE(np->ucast_oif, ifindex);
+ return 0;
+ }
+ }
+
+ sockopt_lock_sock(sk);
+
+ /* Another thread has converted the socket into IPv4 with
+ * IPV6_ADDRFORM concurrently.
+ */
+ if (unlikely(sk->sk_family != AF_INET6))
+ goto unlock;
switch (optname) {
case IPV6_ADDRFORM:
+ if (optlen < sizeof(int))
+ goto e_inval;
if (val == PF_INET) {
- struct ipv6_txoptions *opt;
- struct sk_buff *pktopt;
+ if (sk->sk_type == SOCK_RAW)
+ break;
- if (sk->sk_protocol != IPPROTO_UDP &&
- sk->sk_protocol != IPPROTO_UDPLITE &&
- sk->sk_protocol != IPPROTO_TCP)
+ if (sk->sk_protocol == IPPROTO_UDP ||
+ sk->sk_protocol == IPPROTO_UDPLITE) {
+ struct udp_sock *up = udp_sk(sk);
+ if (up->pending == AF_INET6) {
+ retv = -EBUSY;
+ break;
+ }
+ } else if (sk->sk_protocol == IPPROTO_TCP) {
+ if (sk->sk_prot != &tcpv6_prot) {
+ retv = -EBUSY;
+ break;
+ }
+ } else {
break;
+ }
if (sk->sk_state != TCP_ESTABLISHED) {
retv = -ENOTCONN;
@@ -250,59 +585,50 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
}
if (ipv6_only_sock(sk) ||
- !(ipv6_addr_type(&np->daddr) & IPV6_ADDR_MAPPED)) {
+ !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) {
retv = -EADDRNOTAVAIL;
break;
}
- fl6_free_socklist(sk);
- ipv6_sock_mc_close(sk);
-
- /*
- * Sock is moving from IPv6 to IPv4 (sk_prot), so
- * remove it from the refcnt debug socks count in the
- * original family...
- */
- sk_refcnt_debug_dec(sk);
+ __ipv6_sock_mc_close(sk);
+ __ipv6_sock_ac_close(sk);
if (sk->sk_protocol == IPPROTO_TCP) {
struct inet_connection_sock *icsk = inet_csk(sk);
- local_bh_disable();
- sock_prot_dec_use(sk->sk_prot);
- sock_prot_inc_use(&tcp_prot);
- local_bh_enable();
- sk->sk_prot = &tcp_prot;
- icsk->icsk_af_ops = &ipv4_specific;
- sk->sk_socket->ops = &inet_stream_ops;
- sk->sk_family = PF_INET;
+ sock_prot_inuse_add(net, sk->sk_prot, -1);
+ sock_prot_inuse_add(net, &tcp_prot, 1);
+
+ /* Paired with READ_ONCE(sk->sk_prot) in inet6_stream_ops */
+ WRITE_ONCE(sk->sk_prot, &tcp_prot);
+ /* Paired with READ_ONCE() in tcp_(get|set)sockopt() */
+ WRITE_ONCE(icsk->icsk_af_ops, &ipv4_specific);
+ WRITE_ONCE(sk->sk_socket->ops, &inet_stream_ops);
+ WRITE_ONCE(sk->sk_family, PF_INET);
tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
} else {
struct proto *prot = &udp_prot;
if (sk->sk_protocol == IPPROTO_UDPLITE)
prot = &udplite_prot;
- local_bh_disable();
- sock_prot_dec_use(sk->sk_prot);
- sock_prot_inc_use(prot);
- local_bh_enable();
- sk->sk_prot = prot;
- sk->sk_socket->ops = &inet_dgram_ops;
- sk->sk_family = PF_INET;
+
+ sock_prot_inuse_add(net, sk->sk_prot, -1);
+ sock_prot_inuse_add(net, prot, 1);
+
+ /* Paired with READ_ONCE(sk->sk_prot) in inet6_dgram_ops */
+ WRITE_ONCE(sk->sk_prot, prot);
+ WRITE_ONCE(sk->sk_socket->ops, &inet_dgram_ops);
+ WRITE_ONCE(sk->sk_family, PF_INET);
}
- opt = xchg(&np->opt, NULL);
- if (opt)
- sock_kfree_s(sk, opt, opt->tot_len);
- pktopt = xchg(&np->pktoptions, NULL);
- if (pktopt)
- kfree_skb(pktopt);
-
- sk->sk_destruct = inet_sock_destruct;
- /*
- * ... and add it to the refcnt debug socks count
- * in the new family. -acme
+
+ /* Disable all options not to allocate memory anymore,
+ * but there is still a race. See the lockless path
+ * in udpv6_sendmsg() and ipv6_local_rxpmtu().
*/
- sk_refcnt_debug_inc(sk);
+ np->rxopt.all = 0;
+
+ inet6_cleanup_sock(sk);
+
module_put(THIS_MODULE);
retv = 0;
break;
@@ -310,150 +636,178 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
goto e_inval;
case IPV6_V6ONLY:
- if (inet_sk(sk)->num)
+ if (optlen < sizeof(int) ||
+ inet_sk(sk)->inet_num)
goto e_inval;
- np->ipv6only = valbool;
+ sk->sk_ipv6only = valbool;
retv = 0;
break;
case IPV6_RECVPKTINFO:
+ if (optlen < sizeof(int))
+ goto e_inval;
np->rxopt.bits.rxinfo = valbool;
retv = 0;
break;
-
+
case IPV6_2292PKTINFO:
+ if (optlen < sizeof(int))
+ goto e_inval;
np->rxopt.bits.rxoinfo = valbool;
retv = 0;
break;
case IPV6_RECVHOPLIMIT:
+ if (optlen < sizeof(int))
+ goto e_inval;
np->rxopt.bits.rxhlim = valbool;
retv = 0;
break;
case IPV6_2292HOPLIMIT:
+ if (optlen < sizeof(int))
+ goto e_inval;
np->rxopt.bits.rxohlim = valbool;
retv = 0;
break;
case IPV6_RECVRTHDR:
- if (val < 0 || val > 2)
+ if (optlen < sizeof(int))
goto e_inval;
- np->rxopt.bits.srcrt = val;
+ np->rxopt.bits.srcrt = valbool;
retv = 0;
break;
case IPV6_2292RTHDR:
- if (val < 0 || val > 2)
+ if (optlen < sizeof(int))
goto e_inval;
- np->rxopt.bits.osrcrt = val;
+ np->rxopt.bits.osrcrt = valbool;
retv = 0;
break;
case IPV6_RECVHOPOPTS:
+ if (optlen < sizeof(int))
+ goto e_inval;
np->rxopt.bits.hopopts = valbool;
retv = 0;
break;
case IPV6_2292HOPOPTS:
+ if (optlen < sizeof(int))
+ goto e_inval;
np->rxopt.bits.ohopopts = valbool;
retv = 0;
break;
case IPV6_RECVDSTOPTS:
+ if (optlen < sizeof(int))
+ goto e_inval;
np->rxopt.bits.dstopts = valbool;
retv = 0;
break;
case IPV6_2292DSTOPTS:
+ if (optlen < sizeof(int))
+ goto e_inval;
np->rxopt.bits.odstopts = valbool;
retv = 0;
break;
case IPV6_TCLASS:
+ if (optlen < sizeof(int))
+ goto e_inval;
if (val < -1 || val > 0xff)
goto e_inval;
- np->tclass = val;
+ /* RFC 3542, 6.5: default traffic class of 0x0 */
+ if (val == -1)
+ val = 0;
+ if (sk->sk_type == SOCK_STREAM) {
+ val &= ~INET_ECN_MASK;
+ val |= np->tclass & INET_ECN_MASK;
+ }
+ if (np->tclass != val) {
+ np->tclass = val;
+ sk_dst_reset(sk);
+ }
retv = 0;
break;
-
+
case IPV6_RECVTCLASS:
+ if (optlen < sizeof(int))
+ goto e_inval;
np->rxopt.bits.rxtclass = valbool;
retv = 0;
break;
case IPV6_FLOWINFO:
+ if (optlen < sizeof(int))
+ goto e_inval;
np->rxopt.bits.rxflow = valbool;
retv = 0;
break;
+ case IPV6_RECVPATHMTU:
+ if (optlen < sizeof(int))
+ goto e_inval;
+ np->rxopt.bits.rxpmtu = valbool;
+ retv = 0;
+ break;
+
+ case IPV6_TRANSPARENT:
+ if (valbool && !sockopt_ns_capable(net->user_ns, CAP_NET_RAW) &&
+ !sockopt_ns_capable(net->user_ns, CAP_NET_ADMIN)) {
+ retv = -EPERM;
+ break;
+ }
+ if (optlen < sizeof(int))
+ goto e_inval;
+ /* we don't have a separate transparent bit for IPV6 we use the one in the IPv4 socket */
+ inet_assign_bit(TRANSPARENT, sk, valbool);
+ retv = 0;
+ break;
+
+ case IPV6_FREEBIND:
+ if (optlen < sizeof(int))
+ goto e_inval;
+ /* we also don't have a separate freebind bit for IPV6 */
+ inet_assign_bit(FREEBIND, sk, valbool);
+ retv = 0;
+ break;
+
+ case IPV6_RECVORIGDSTADDR:
+ if (optlen < sizeof(int))
+ goto e_inval;
+ np->rxopt.bits.rxorigdstaddr = valbool;
+ retv = 0;
+ break;
+
case IPV6_HOPOPTS:
case IPV6_RTHDRDSTOPTS:
case IPV6_RTHDR:
case IPV6_DSTOPTS:
- {
- struct ipv6_txoptions *opt;
- if (optlen == 0)
- optval = NULL;
+ retv = ipv6_set_opt_hdr(sk, optname, optval, optlen);
+ break;
- /* hop-by-hop / destination options are privileged option */
- retv = -EPERM;
- if (optname != IPV6_RTHDR && !capable(CAP_NET_RAW))
- break;
+ case IPV6_PKTINFO:
+ {
+ struct in6_pktinfo pkt;
- retv = -EINVAL;
- if (optlen & 0x7 || optlen > 8 * 255)
- break;
+ if (optlen == 0)
+ goto e_inval;
+ else if (optlen < sizeof(struct in6_pktinfo) ||
+ sockptr_is_null(optval))
+ goto e_inval;
- opt = ipv6_renew_options(sk, np->opt, optname,
- (struct ipv6_opt_hdr __user *)optval,
- optlen);
- if (IS_ERR(opt)) {
- retv = PTR_ERR(opt);
+ if (copy_from_sockptr(&pkt, optval, sizeof(pkt))) {
+ retv = -EFAULT;
break;
}
+ if (!sk_dev_equal_l3scope(sk, pkt.ipi6_ifindex))
+ goto e_inval;
- /* routing header option needs extra check */
- if (optname == IPV6_RTHDR && opt->srcrt) {
- struct ipv6_rt_hdr *rthdr = opt->srcrt;
- switch (rthdr->type) {
- case IPV6_SRCRT_TYPE_0:
-#ifdef CONFIG_IPV6_MIP6
- case IPV6_SRCRT_TYPE_2:
-#endif
- break;
- default:
- goto sticky_done;
- }
-
- if ((rthdr->hdrlen & 1) ||
- (rthdr->hdrlen >> 1) != rthdr->segments_left)
- goto sticky_done;
- }
-
+ np->sticky_pktinfo.ipi6_ifindex = pkt.ipi6_ifindex;
+ np->sticky_pktinfo.ipi6_addr = pkt.ipi6_addr;
retv = 0;
- if (inet_sk(sk)->is_icsk) {
- if (opt) {
- struct inet_connection_sock *icsk = inet_csk(sk);
- if (!((1 << sk->sk_state) &
- (TCPF_LISTEN | TCPF_CLOSE))
- && inet_sk(sk)->daddr != LOOPBACK4_IPV6) {
- icsk->icsk_ext_hdr_len =
- opt->opt_flen + opt->opt_nflen;
- icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie);
- }
- }
- opt = xchg(&np->opt, opt);
- sk_dst_reset(sk);
- } else {
- write_lock(&sk->sk_dst_lock);
- opt = xchg(&np->opt, opt);
- write_unlock(&sk->sk_dst_lock);
- sk_dst_reset(sk);
- }
-sticky_done:
- if (opt)
- sock_kfree_s(sk, opt, opt->tot_len);
break;
}
@@ -461,11 +815,12 @@ sticky_done:
{
struct ipv6_txoptions *opt = NULL;
struct msghdr msg;
- struct flowi fl;
- int junk;
+ struct flowi6 fl6;
+ struct ipcm6_cookie ipc6;
- fl.fl6_flowlabel = 0;
- fl.oif = sk->sk_bound_dev_if;
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.flowi6_oif = sk->sk_bound_dev_if;
+ fl6.flowi6_mark = sk->sk_mark;
if (optlen == 0)
goto update;
@@ -479,89 +834,49 @@ sticky_done:
opt = sock_kmalloc(sk, sizeof(*opt) + optlen, GFP_KERNEL);
retv = -ENOBUFS;
- if (opt == NULL)
+ if (!opt)
break;
memset(opt, 0, sizeof(*opt));
+ refcount_set(&opt->refcnt, 1);
opt->tot_len = sizeof(*opt) + optlen;
retv = -EFAULT;
- if (copy_from_user(opt+1, optval, optlen))
+ if (copy_from_sockptr(opt + 1, optval, optlen))
goto done;
msg.msg_controllen = optlen;
- msg.msg_control = (void*)(opt+1);
+ msg.msg_control_is_user = false;
+ msg.msg_control = (void *)(opt+1);
+ ipc6.opt = opt;
- retv = datagram_send_ctl(&msg, &fl, opt, &junk, &junk);
+ retv = ip6_datagram_send_ctl(net, sk, &msg, &fl6, &ipc6);
if (retv)
goto done;
update:
retv = 0;
- if (inet_sk(sk)->is_icsk) {
- if (opt) {
- struct inet_connection_sock *icsk = inet_csk(sk);
- if (!((1 << sk->sk_state) &
- (TCPF_LISTEN | TCPF_CLOSE))
- && inet_sk(sk)->daddr != LOOPBACK4_IPV6) {
- icsk->icsk_ext_hdr_len =
- opt->opt_flen + opt->opt_nflen;
- icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie);
- }
- }
- opt = xchg(&np->opt, opt);
- sk_dst_reset(sk);
- } else {
- write_lock(&sk->sk_dst_lock);
- opt = xchg(&np->opt, opt);
- write_unlock(&sk->sk_dst_lock);
- sk_dst_reset(sk);
- }
-
+ opt = ipv6_update_options(sk, opt);
done:
- if (opt)
- sock_kfree_s(sk, opt, opt->tot_len);
+ if (opt) {
+ atomic_sub(opt->tot_len, &sk->sk_omem_alloc);
+ txopt_put(opt);
+ }
break;
}
- case IPV6_UNICAST_HOPS:
- if (val > 255 || val < -1)
- goto e_inval;
- np->hop_limit = val;
- retv = 0;
- break;
-
- case IPV6_MULTICAST_HOPS:
- if (sk->sk_type == SOCK_STREAM)
- goto e_inval;
- if (val > 255 || val < -1)
- goto e_inval;
- np->mcast_hops = val;
- retv = 0;
- break;
- case IPV6_MULTICAST_LOOP:
- np->mc_loop = valbool;
- retv = 0;
- break;
-
- case IPV6_MULTICAST_IF:
- if (sk->sk_type == SOCK_STREAM)
- goto e_inval;
- if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != val)
- goto e_inval;
-
- if (__dev_get_by_index(val) == NULL) {
- retv = -ENODEV;
- break;
- }
- np->mcast_oif = val;
- retv = 0;
- break;
case IPV6_ADD_MEMBERSHIP:
case IPV6_DROP_MEMBERSHIP:
{
struct ipv6_mreq mreq;
+ if (optlen < sizeof(struct ipv6_mreq))
+ goto e_inval;
+
+ retv = -EPROTO;
+ if (inet_test_bit(IS_ICSK, sk))
+ break;
+
retv = -EFAULT;
- if (copy_from_user(&mreq, optval, sizeof(struct ipv6_mreq)))
+ if (copy_from_sockptr(&mreq, optval, sizeof(struct ipv6_mreq)))
break;
if (optname == IPV6_ADD_MEMBERSHIP)
@@ -575,11 +890,11 @@ done:
{
struct ipv6_mreq mreq;
- if (optlen != sizeof(struct ipv6_mreq))
+ if (optlen < sizeof(struct ipv6_mreq))
goto e_inval;
retv = -EFAULT;
- if (copy_from_user(&mreq, optval, sizeof(struct ipv6_mreq)))
+ if (copy_from_sockptr(&mreq, optval, sizeof(struct ipv6_mreq)))
break;
if (optname == IPV6_JOIN_ANYCAST)
@@ -590,131 +905,32 @@ done:
}
case MCAST_JOIN_GROUP:
case MCAST_LEAVE_GROUP:
- {
- struct group_req greq;
- struct sockaddr_in6 *psin6;
-
- retv = -EFAULT;
- if (copy_from_user(&greq, optval, sizeof(struct group_req)))
- break;
- if (greq.gr_group.ss_family != AF_INET6) {
- retv = -EADDRNOTAVAIL;
- break;
- }
- psin6 = (struct sockaddr_in6 *)&greq.gr_group;
- if (optname == MCAST_JOIN_GROUP)
- retv = ipv6_sock_mc_join(sk, greq.gr_interface,
- &psin6->sin6_addr);
+ if (in_compat_syscall())
+ retv = compat_ipv6_mcast_join_leave(sk, optname, optval,
+ optlen);
else
- retv = ipv6_sock_mc_drop(sk, greq.gr_interface,
- &psin6->sin6_addr);
+ retv = ipv6_mcast_join_leave(sk, optname, optval,
+ optlen);
break;
- }
case MCAST_JOIN_SOURCE_GROUP:
case MCAST_LEAVE_SOURCE_GROUP:
case MCAST_BLOCK_SOURCE:
case MCAST_UNBLOCK_SOURCE:
- {
- struct group_source_req greqs;
- int omode, add;
-
- if (optlen != sizeof(struct group_source_req))
- goto e_inval;
- if (copy_from_user(&greqs, optval, sizeof(greqs))) {
- retv = -EFAULT;
- break;
- }
- if (greqs.gsr_group.ss_family != AF_INET6 ||
- greqs.gsr_source.ss_family != AF_INET6) {
- retv = -EADDRNOTAVAIL;
- break;
- }
- if (optname == MCAST_BLOCK_SOURCE) {
- omode = MCAST_EXCLUDE;
- add = 1;
- } else if (optname == MCAST_UNBLOCK_SOURCE) {
- omode = MCAST_EXCLUDE;
- add = 0;
- } else if (optname == MCAST_JOIN_SOURCE_GROUP) {
- struct sockaddr_in6 *psin6;
-
- psin6 = (struct sockaddr_in6 *)&greqs.gsr_group;
- retv = ipv6_sock_mc_join(sk, greqs.gsr_interface,
- &psin6->sin6_addr);
- /* prior join w/ different source is ok */
- if (retv && retv != -EADDRINUSE)
- break;
- omode = MCAST_INCLUDE;
- add = 1;
- } else /* MCAST_LEAVE_SOURCE_GROUP */ {
- omode = MCAST_INCLUDE;
- add = 0;
- }
- retv = ip6_mc_source(add, omode, sk, &greqs);
+ retv = do_ipv6_mcast_group_source(sk, optname, optval, optlen);
break;
- }
case MCAST_MSFILTER:
- {
- extern int sysctl_mld_max_msf;
- struct group_filter *gsf;
-
- if (optlen < GROUP_FILTER_SIZE(0))
- goto e_inval;
- if (optlen > sysctl_optmem_max) {
- retv = -ENOBUFS;
- break;
- }
- gsf = kmalloc(optlen,GFP_KERNEL);
- if (gsf == 0) {
- retv = -ENOBUFS;
- break;
- }
- retv = -EFAULT;
- if (copy_from_user(gsf, optval, optlen)) {
- kfree(gsf);
- break;
- }
- /* numsrc >= (4G-140)/128 overflow in 32 bits */
- if (gsf->gf_numsrc >= 0x1ffffffU ||
- gsf->gf_numsrc > sysctl_mld_max_msf) {
- kfree(gsf);
- retv = -ENOBUFS;
- break;
- }
- if (GROUP_FILTER_SIZE(gsf->gf_numsrc) > optlen) {
- kfree(gsf);
- retv = -EINVAL;
- break;
- }
- retv = ip6_mc_msfilter(sk, gsf);
- kfree(gsf);
-
+ if (in_compat_syscall())
+ retv = compat_ipv6_set_mcast_msfilter(sk, optval,
+ optlen);
+ else
+ retv = ipv6_set_mcast_msfilter(sk, optval, optlen);
break;
- }
case IPV6_ROUTER_ALERT:
- retv = ip6_ra_control(sk, val, NULL);
- break;
- case IPV6_MTU_DISCOVER:
- if (val<0 || val>2)
- goto e_inval;
- np->pmtudisc = val;
- retv = 0;
- break;
- case IPV6_MTU:
- if (val && val < IPV6_MIN_MTU)
+ if (optlen < sizeof(int))
goto e_inval;
- np->frag_size = val;
- retv = 0;
- break;
- case IPV6_RECVERR:
- np->recverr = valbool;
- if (!val)
- skb_queue_purge(&sk->sk_error_queue);
- retv = 0;
- break;
- case IPV6_FLOWINFO_SEND:
- np->sndflow = valbool;
- retv = 0;
+ retv = ip6_ra_control(sk, val);
+ if (retv == 0)
+ inet6_assign_bit(RTALERT, sk, valbool);
break;
case IPV6_FLOWLABEL_MGR:
retv = ipv6_flowlabel_opt(sk, optval, optlen);
@@ -722,28 +938,34 @@ done:
case IPV6_IPSEC_POLICY:
case IPV6_XFRM_POLICY:
retv = -EPERM;
- if (!capable(CAP_NET_ADMIN))
+ if (!sockopt_ns_capable(net->user_ns, CAP_NET_ADMIN))
break;
retv = xfrm_user_policy(sk, optname, optval, optlen);
break;
+ case IPV6_RECVFRAGSIZE:
+ np->rxopt.bits.recvfragsize = valbool;
+ retv = 0;
+ break;
}
- release_sock(sk);
+
+unlock:
+ sockopt_release_sock(sk);
return retv;
e_inval:
- release_sock(sk);
- return -EINVAL;
+ retv = -EINVAL;
+ goto unlock;
}
-int ipv6_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int optlen)
+int ipv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
+ unsigned int optlen)
{
int err;
if (level == SOL_IP && sk->sk_type != SOCK_RAW)
- return udp_prot.setsockopt(sk, level, optname, optval, optlen);
+ return ip_setsockopt(sk, level, optname, optval, optlen);
if (level != SOL_IPV6)
return -ENOPROTOOPT;
@@ -752,96 +974,141 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname,
#ifdef CONFIG_NETFILTER
/* we need to exclude all possible ENOPROTOOPTs except default case */
if (err == -ENOPROTOOPT && optname != IPV6_IPSEC_POLICY &&
- optname != IPV6_XFRM_POLICY) {
- lock_sock(sk);
- err = nf_setsockopt(sk, PF_INET6, optname, optval,
- optlen);
- release_sock(sk);
- }
+ optname != IPV6_XFRM_POLICY)
+ err = nf_setsockopt(sk, PF_INET6, optname, optval, optlen);
#endif
return err;
}
+EXPORT_SYMBOL(ipv6_setsockopt);
-
-#ifdef CONFIG_COMPAT
-int compat_ipv6_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int optlen)
+static int ipv6_getsockopt_sticky(struct sock *sk, struct ipv6_txoptions *opt,
+ int optname, sockptr_t optval, int len)
{
- int err;
+ struct ipv6_opt_hdr *hdr;
- if (level == SOL_IP && sk->sk_type != SOCK_RAW) {
- if (udp_prot.compat_setsockopt != NULL)
- return udp_prot.compat_setsockopt(sk, level, optname,
- optval, optlen);
- return udp_prot.setsockopt(sk, level, optname, optval, optlen);
+ if (!opt)
+ return 0;
+
+ switch (optname) {
+ case IPV6_HOPOPTS:
+ hdr = opt->hopopt;
+ break;
+ case IPV6_RTHDRDSTOPTS:
+ hdr = opt->dst0opt;
+ break;
+ case IPV6_RTHDR:
+ hdr = (struct ipv6_opt_hdr *)opt->srcrt;
+ break;
+ case IPV6_DSTOPTS:
+ hdr = opt->dst1opt;
+ break;
+ default:
+ return -EINVAL; /* should not happen */
}
- if (level != SOL_IPV6)
- return -ENOPROTOOPT;
+ if (!hdr)
+ return 0;
- err = do_ipv6_setsockopt(sk, level, optname, optval, optlen);
-#ifdef CONFIG_NETFILTER
- /* we need to exclude all possible ENOPROTOOPTs except default case */
- if (err == -ENOPROTOOPT && optname != IPV6_IPSEC_POLICY &&
- optname != IPV6_XFRM_POLICY) {
- lock_sock(sk);
- err = compat_nf_setsockopt(sk, PF_INET6, optname,
- optval, optlen);
- release_sock(sk);
+ len = min_t(unsigned int, len, ipv6_optlen(hdr));
+ if (copy_to_sockptr(optval, hdr, len))
+ return -EFAULT;
+ return len;
+}
+
+static int ipv6_get_msfilter(struct sock *sk, sockptr_t optval,
+ sockptr_t optlen, int len)
+{
+ const int size0 = offsetof(struct group_filter, gf_slist_flex);
+ struct group_filter gsf;
+ int num;
+ int err;
+
+ if (len < size0)
+ return -EINVAL;
+ if (copy_from_sockptr(&gsf, optval, size0))
+ return -EFAULT;
+ if (gsf.gf_group.ss_family != AF_INET6)
+ return -EADDRNOTAVAIL;
+ num = gsf.gf_numsrc;
+ sockopt_lock_sock(sk);
+ err = ip6_mc_msfget(sk, &gsf, optval, size0);
+ if (!err) {
+ if (num > gsf.gf_numsrc)
+ num = gsf.gf_numsrc;
+ len = GROUP_FILTER_SIZE(num);
+ if (copy_to_sockptr(optlen, &len, sizeof(int)) ||
+ copy_to_sockptr(optval, &gsf, size0))
+ err = -EFAULT;
}
-#endif
+ sockopt_release_sock(sk);
return err;
}
-EXPORT_SYMBOL(compat_ipv6_setsockopt);
-#endif
-
-static int ipv6_getsockopt_sticky(struct sock *sk, struct ipv6_opt_hdr *hdr,
- char __user *optval, int len)
+static int compat_ipv6_get_msfilter(struct sock *sk, sockptr_t optval,
+ sockptr_t optlen, int len)
{
- if (!hdr)
- return 0;
- len = min_t(int, len, ipv6_optlen(hdr));
- if (copy_to_user(optval, hdr, ipv6_optlen(hdr)))
+ const int size0 = offsetof(struct compat_group_filter, gf_slist_flex);
+ struct compat_group_filter gf32;
+ struct group_filter gf;
+ int err;
+ int num;
+
+ if (len < size0)
+ return -EINVAL;
+
+ if (copy_from_sockptr(&gf32, optval, size0))
return -EFAULT;
- return len;
+ gf.gf_interface = gf32.gf_interface;
+ gf.gf_fmode = gf32.gf_fmode;
+ num = gf.gf_numsrc = gf32.gf_numsrc;
+ gf.gf_group = gf32.gf_group;
+
+ if (gf.gf_group.ss_family != AF_INET6)
+ return -EADDRNOTAVAIL;
+
+ sockopt_lock_sock(sk);
+ err = ip6_mc_msfget(sk, &gf, optval, size0);
+ sockopt_release_sock(sk);
+ if (err)
+ return err;
+ if (num > gf.gf_numsrc)
+ num = gf.gf_numsrc;
+ len = GROUP_FILTER_SIZE(num) - (sizeof(gf)-sizeof(gf32));
+ if (copy_to_sockptr(optlen, &len, sizeof(int)) ||
+ copy_to_sockptr_offset(optval, offsetof(struct compat_group_filter, gf_fmode),
+ &gf.gf_fmode, sizeof(gf32.gf_fmode)) ||
+ copy_to_sockptr_offset(optval, offsetof(struct compat_group_filter, gf_numsrc),
+ &gf.gf_numsrc, sizeof(gf32.gf_numsrc)))
+ return -EFAULT;
+ return 0;
}
-static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int __user *optlen)
+int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
+ sockptr_t optval, sockptr_t optlen)
{
struct ipv6_pinfo *np = inet6_sk(sk);
int len;
int val;
- if (get_user(len, optlen))
+ if (ip6_mroute_opt(optname))
+ return ip6_mroute_getsockopt(sk, optname, optval, optlen);
+
+ if (copy_from_sockptr(&len, optlen, sizeof(int)))
return -EFAULT;
switch (optname) {
case IPV6_ADDRFORM:
if (sk->sk_protocol != IPPROTO_UDP &&
sk->sk_protocol != IPPROTO_UDPLITE &&
sk->sk_protocol != IPPROTO_TCP)
- return -EINVAL;
+ return -ENOPROTOOPT;
if (sk->sk_state != TCP_ESTABLISHED)
return -ENOTCONN;
val = sk->sk_family;
break;
case MCAST_MSFILTER:
- {
- struct group_filter gsf;
- int err;
-
- if (len < GROUP_FILTER_SIZE(0))
- return -EINVAL;
- if (copy_from_user(&gsf, optval, GROUP_FILTER_SIZE(0)))
- return -EFAULT;
- lock_sock(sk);
- err = ip6_mc_msfget(sk, &gsf,
- (struct group_filter __user *)optval, optlen);
- release_sock(sk);
- return err;
- }
-
+ if (in_compat_syscall())
+ return compat_ipv6_get_msfilter(sk, optval, optlen, len);
+ return ipv6_get_msfilter(sk, optval, optlen, len);
case IPV6_2292PKTOPTIONS:
{
struct msghdr msg;
@@ -850,64 +1117,82 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
if (sk->sk_type != SOCK_STREAM)
return -ENOPROTOOPT;
- msg.msg_control = optval;
+ if (optval.is_kernel) {
+ msg.msg_control_is_user = false;
+ msg.msg_control = optval.kernel;
+ } else {
+ msg.msg_control_is_user = true;
+ msg.msg_control_user = optval.user;
+ }
msg.msg_controllen = len;
msg.msg_flags = 0;
- lock_sock(sk);
+ sockopt_lock_sock(sk);
skb = np->pktoptions;
if (skb)
- atomic_inc(&skb->users);
- release_sock(sk);
-
- if (skb) {
- int err = datagram_recv_ctl(sk, &msg, skb);
- kfree_skb(skb);
- if (err)
- return err;
- } else {
+ ip6_datagram_recv_ctl(sk, &msg, skb);
+ sockopt_release_sock(sk);
+ if (!skb) {
if (np->rxopt.bits.rxinfo) {
+ int mcast_oif = READ_ONCE(np->mcast_oif);
struct in6_pktinfo src_info;
- src_info.ipi6_ifindex = np->mcast_oif;
- ipv6_addr_copy(&src_info.ipi6_addr, &np->daddr);
+
+ src_info.ipi6_ifindex = mcast_oif ? :
+ np->sticky_pktinfo.ipi6_ifindex;
+ src_info.ipi6_addr = mcast_oif ? sk->sk_v6_daddr : np->sticky_pktinfo.ipi6_addr;
put_cmsg(&msg, SOL_IPV6, IPV6_PKTINFO, sizeof(src_info), &src_info);
}
if (np->rxopt.bits.rxhlim) {
- int hlim = np->mcast_hops;
+ int hlim = READ_ONCE(np->mcast_hops);
+
put_cmsg(&msg, SOL_IPV6, IPV6_HOPLIMIT, sizeof(hlim), &hlim);
}
+ if (np->rxopt.bits.rxtclass) {
+ int tclass = (int)ip6_tclass(np->rcv_flowinfo);
+
+ put_cmsg(&msg, SOL_IPV6, IPV6_TCLASS, sizeof(tclass), &tclass);
+ }
if (np->rxopt.bits.rxoinfo) {
+ int mcast_oif = READ_ONCE(np->mcast_oif);
struct in6_pktinfo src_info;
- src_info.ipi6_ifindex = np->mcast_oif;
- ipv6_addr_copy(&src_info.ipi6_addr, &np->daddr);
+
+ src_info.ipi6_ifindex = mcast_oif ? :
+ np->sticky_pktinfo.ipi6_ifindex;
+ src_info.ipi6_addr = mcast_oif ? sk->sk_v6_daddr :
+ np->sticky_pktinfo.ipi6_addr;
put_cmsg(&msg, SOL_IPV6, IPV6_2292PKTINFO, sizeof(src_info), &src_info);
}
if (np->rxopt.bits.rxohlim) {
- int hlim = np->mcast_hops;
+ int hlim = READ_ONCE(np->mcast_hops);
+
put_cmsg(&msg, SOL_IPV6, IPV6_2292HOPLIMIT, sizeof(hlim), &hlim);
}
+ if (np->rxopt.bits.rxflow) {
+ __be32 flowinfo = np->rcv_flowinfo;
+
+ put_cmsg(&msg, SOL_IPV6, IPV6_FLOWINFO, sizeof(flowinfo), &flowinfo);
+ }
}
len -= msg.msg_controllen;
- return put_user(len, optlen);
+ return copy_to_sockptr(optlen, &len, sizeof(int));
}
case IPV6_MTU:
{
struct dst_entry *dst;
- val = 0;
- lock_sock(sk);
- dst = sk_dst_get(sk);
- if (dst) {
+
+ val = 0;
+ rcu_read_lock();
+ dst = __sk_dst_get(sk);
+ if (dst)
val = dst_mtu(dst);
- dst_release(dst);
- }
- release_sock(sk);
+ rcu_read_unlock();
if (!val)
return -ENOTCONN;
break;
}
case IPV6_V6ONLY:
- val = np->ipv6only;
+ val = sk->sk_ipv6only;
break;
case IPV6_RECVPKTINFO:
@@ -939,12 +1224,17 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
case IPV6_RTHDR:
case IPV6_DSTOPTS:
{
+ struct ipv6_txoptions *opt;
- lock_sock(sk);
- len = ipv6_getsockopt_sticky(sk, np->opt->hopopt,
- optval, len);
- release_sock(sk);
- return put_user(len, optlen);
+ sockopt_lock_sock(sk);
+ opt = rcu_dereference_protected(np->opt,
+ lockdep_sock_is_held(sk));
+ len = ipv6_getsockopt_sticky(sk, opt, optname, optval, len);
+ sockopt_release_sock(sk);
+ /* check if ipv6_getsockopt_sticky() returns err code */
+ if (len < 0)
+ return len;
+ return copy_to_sockptr(optlen, &len, sizeof(int));
}
case IPV6_RECVHOPOPTS:
@@ -965,8 +1255,6 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
case IPV6_TCLASS:
val = np->tclass;
- if (val < 0)
- val = 0;
break;
case IPV6_RECVTCLASS:
@@ -977,41 +1265,186 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
val = np->rxopt.bits.rxflow;
break;
- case IPV6_UNICAST_HOPS:
- val = np->hop_limit;
+ case IPV6_RECVPATHMTU:
+ val = np->rxopt.bits.rxpmtu;
+ break;
+
+ case IPV6_PATHMTU:
+ {
+ struct dst_entry *dst;
+ struct ip6_mtuinfo mtuinfo;
+
+ if (len < sizeof(mtuinfo))
+ return -EINVAL;
+
+ len = sizeof(mtuinfo);
+ memset(&mtuinfo, 0, sizeof(mtuinfo));
+
+ rcu_read_lock();
+ dst = __sk_dst_get(sk);
+ if (dst)
+ mtuinfo.ip6m_mtu = dst_mtu(dst);
+ rcu_read_unlock();
+ if (!mtuinfo.ip6m_mtu)
+ return -ENOTCONN;
+
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
+ return -EFAULT;
+ if (copy_to_sockptr(optval, &mtuinfo, len))
+ return -EFAULT;
+
+ return 0;
+ }
+
+ case IPV6_TRANSPARENT:
+ val = inet_test_bit(TRANSPARENT, sk);
+ break;
+
+ case IPV6_FREEBIND:
+ val = inet_test_bit(FREEBIND, sk);
+ break;
+
+ case IPV6_RECVORIGDSTADDR:
+ val = np->rxopt.bits.rxorigdstaddr;
break;
+ case IPV6_UNICAST_HOPS:
case IPV6_MULTICAST_HOPS:
- val = np->mcast_hops;
+ {
+ struct dst_entry *dst;
+
+ if (optname == IPV6_UNICAST_HOPS)
+ val = READ_ONCE(np->hop_limit);
+ else
+ val = READ_ONCE(np->mcast_hops);
+
+ if (val < 0) {
+ rcu_read_lock();
+ dst = __sk_dst_get(sk);
+ if (dst)
+ val = ip6_dst_hoplimit(dst);
+ rcu_read_unlock();
+ }
+
+ if (val < 0)
+ val = READ_ONCE(sock_net(sk)->ipv6.devconf_all->hop_limit);
break;
+ }
case IPV6_MULTICAST_LOOP:
- val = np->mc_loop;
+ val = inet6_test_bit(MC6_LOOP, sk);
break;
case IPV6_MULTICAST_IF:
- val = np->mcast_oif;
+ val = READ_ONCE(np->mcast_oif);
+ break;
+
+ case IPV6_MULTICAST_ALL:
+ val = inet6_test_bit(MC6_ALL, sk);
+ break;
+
+ case IPV6_UNICAST_IF:
+ val = (__force int)htonl((__u32) READ_ONCE(np->ucast_oif));
break;
case IPV6_MTU_DISCOVER:
- val = np->pmtudisc;
+ val = READ_ONCE(np->pmtudisc);
break;
case IPV6_RECVERR:
- val = np->recverr;
+ val = inet6_test_bit(RECVERR6, sk);
break;
case IPV6_FLOWINFO_SEND:
- val = np->sndflow;
+ val = inet6_test_bit(SNDFLOW, sk);
+ break;
+
+ case IPV6_FLOWLABEL_MGR:
+ {
+ struct in6_flowlabel_req freq;
+ int flags;
+
+ if (len < sizeof(freq))
+ return -EINVAL;
+
+ if (copy_from_sockptr(&freq, optval, sizeof(freq)))
+ return -EFAULT;
+
+ if (freq.flr_action != IPV6_FL_A_GET)
+ return -EINVAL;
+
+ len = sizeof(freq);
+ flags = freq.flr_flags;
+
+ memset(&freq, 0, sizeof(freq));
+
+ val = ipv6_flowlabel_opt_get(sk, &freq, flags);
+ if (val < 0)
+ return val;
+
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
+ return -EFAULT;
+ if (copy_to_sockptr(optval, &freq, len))
+ return -EFAULT;
+
+ return 0;
+ }
+
+ case IPV6_ADDR_PREFERENCES:
+ {
+ u8 srcprefs = READ_ONCE(np->srcprefs);
+ val = 0;
+
+ if (srcprefs & IPV6_PREFER_SRC_TMP)
+ val |= IPV6_PREFER_SRC_TMP;
+ else if (srcprefs & IPV6_PREFER_SRC_PUBLIC)
+ val |= IPV6_PREFER_SRC_PUBLIC;
+ else {
+ /* XXX: should we return system default? */
+ val |= IPV6_PREFER_SRC_PUBTMP_DEFAULT;
+ }
+
+ if (srcprefs & IPV6_PREFER_SRC_COA)
+ val |= IPV6_PREFER_SRC_COA;
+ else
+ val |= IPV6_PREFER_SRC_HOME;
+ break;
+ }
+ case IPV6_MINHOPCOUNT:
+ val = READ_ONCE(np->min_hopcount);
+ break;
+
+ case IPV6_DONTFRAG:
+ val = inet6_test_bit(DONTFRAG, sk);
+ break;
+
+ case IPV6_AUTOFLOWLABEL:
+ val = ip6_autoflowlabel(sock_net(sk), sk);
+ break;
+
+ case IPV6_RECVFRAGSIZE:
+ val = np->rxopt.bits.recvfragsize;
+ break;
+
+ case IPV6_ROUTER_ALERT:
+ val = inet6_test_bit(RTALERT, sk);
+ break;
+
+ case IPV6_ROUTER_ALERT_ISOLATE:
+ val = inet6_test_bit(RTALERT_ISOLATE, sk);
+ break;
+
+ case IPV6_RECVERR_RFC4884:
+ val = inet6_test_bit(RECVERR6_RFC4884, sk);
break;
default:
- return -EINVAL;
+ return -ENOPROTOOPT;
}
len = min_t(unsigned int, sizeof(int), len);
- if(put_user(len, optlen))
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
return -EFAULT;
- if(copy_to_user(optval,&val,len))
+ if (copy_to_sockptr(optval, &val, len))
return -EFAULT;
return 0;
}
@@ -1022,78 +1455,26 @@ int ipv6_getsockopt(struct sock *sk, int level, int optname,
int err;
if (level == SOL_IP && sk->sk_type != SOCK_RAW)
- return udp_prot.getsockopt(sk, level, optname, optval, optlen);
-
- if(level != SOL_IPV6)
- return -ENOPROTOOPT;
-
- err = do_ipv6_getsockopt(sk, level, optname, optval, optlen);
-#ifdef CONFIG_NETFILTER
- /* we need to exclude all possible EINVALs except default case */
- if (err == -EINVAL && optname != IPV6_ADDRFORM &&
- optname != MCAST_MSFILTER) {
- int len;
-
- if (get_user(len, optlen))
- return -EFAULT;
-
- lock_sock(sk);
- err = nf_getsockopt(sk, PF_INET6, optname, optval,
- &len);
- release_sock(sk);
- if (err >= 0)
- err = put_user(len, optlen);
- }
-#endif
- return err;
-}
-
-#ifdef CONFIG_COMPAT
-int compat_ipv6_getsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int __user *optlen)
-{
- int err;
-
- if (level == SOL_IP && sk->sk_type != SOCK_RAW) {
- if (udp_prot.compat_getsockopt != NULL)
- return udp_prot.compat_getsockopt(sk, level, optname,
- optval, optlen);
- return udp_prot.getsockopt(sk, level, optname, optval, optlen);
- }
+ return ip_getsockopt(sk, level, optname, optval, optlen);
if (level != SOL_IPV6)
return -ENOPROTOOPT;
- err = do_ipv6_getsockopt(sk, level, optname, optval, optlen);
+ err = do_ipv6_getsockopt(sk, level, optname,
+ USER_SOCKPTR(optval), USER_SOCKPTR(optlen));
#ifdef CONFIG_NETFILTER
- /* we need to exclude all possible EINVALs except default case */
- if (err == -EINVAL && optname != IPV6_ADDRFORM &&
- optname != MCAST_MSFILTER) {
+ /* we need to exclude all possible ENOPROTOOPTs except default case */
+ if (err == -ENOPROTOOPT && optname != IPV6_2292PKTOPTIONS) {
int len;
if (get_user(len, optlen))
return -EFAULT;
- lock_sock(sk);
- err = compat_nf_getsockopt(sk, PF_INET6,
- optname, optval, &len);
- release_sock(sk);
+ err = nf_getsockopt(sk, PF_INET6, optname, optval, &len);
if (err >= 0)
err = put_user(len, optlen);
}
#endif
return err;
}
-
-EXPORT_SYMBOL(compat_ipv6_getsockopt);
-#endif
-
-void __init ipv6_packet_init(void)
-{
- dev_add_pack(&ipv6_packet_type);
-}
-
-void ipv6_packet_cleanup(void)
-{
- dev_remove_pack(&ipv6_packet_type);
-}
+EXPORT_SYMBOL(ipv6_getsockopt);
diff --git a/net/ipv6/ipv6_syms.c b/net/ipv6/ipv6_syms.c
deleted file mode 100644
index 0e8e0676a033..000000000000
--- a/net/ipv6/ipv6_syms.c
+++ /dev/null
@@ -1,37 +0,0 @@
-
-#include <linux/module.h>
-#include <net/protocol.h>
-#include <net/ipv6.h>
-#include <net/addrconf.h>
-#include <net/ip6_route.h>
-#include <net/xfrm.h>
-
-EXPORT_SYMBOL(__ipv6_addr_type);
-EXPORT_SYMBOL(icmpv6_send);
-EXPORT_SYMBOL(icmpv6_statistics);
-EXPORT_SYMBOL(icmpv6_err_convert);
-EXPORT_SYMBOL(ndisc_mc_map);
-EXPORT_SYMBOL(register_inet6addr_notifier);
-EXPORT_SYMBOL(unregister_inet6addr_notifier);
-EXPORT_SYMBOL(ip6_route_output);
-EXPORT_SYMBOL(ipv6_setsockopt);
-EXPORT_SYMBOL(ipv6_getsockopt);
-EXPORT_SYMBOL(inet6_register_protosw);
-EXPORT_SYMBOL(inet6_unregister_protosw);
-EXPORT_SYMBOL(inet6_add_protocol);
-EXPORT_SYMBOL(inet6_del_protocol);
-EXPORT_SYMBOL(ip6_xmit);
-EXPORT_SYMBOL(inet6_release);
-EXPORT_SYMBOL(inet6_bind);
-EXPORT_SYMBOL(inet6_getname);
-EXPORT_SYMBOL(inet6_ioctl);
-EXPORT_SYMBOL(ipv6_get_saddr);
-EXPORT_SYMBOL(ipv6_chk_addr);
-EXPORT_SYMBOL(in6_dev_finish_destroy);
-#ifdef CONFIG_XFRM
-EXPORT_SYMBOL(xfrm6_rcv);
-EXPORT_SYMBOL(xfrm6_input_addr);
-EXPORT_SYMBOL(xfrm6_find_1stfragopt);
-#endif
-EXPORT_SYMBOL(rt6_lookup);
-EXPORT_SYMBOL(ipv6_push_nfrag_opts);
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index a1c231a04ac2..016b572e7d6f 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -1,18 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Multicast support for IPv6
- * Linux INET6 implementation
+ * Linux INET6 implementation
*
* Authors:
- * Pedro Roque <roque@di.fc.ul.pt>
+ * Pedro Roque <roque@di.fc.ul.pt>
*
- * $Id: mcast.c,v 1.40 2002/02/08 03:57:19 davem Exp $
- *
- * Based on linux/ipv4/igmp.c and linux/ipv4/ip_sockglue.c
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
+ * Based on linux/ipv4/igmp.c and linux/ipv4/ip_sockglue.c
*/
/* Changes:
@@ -35,20 +29,27 @@
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/jiffies.h>
-#include <linux/times.h>
#include <linux/net.h>
#include <linux/in.h>
#include <linux/in6.h>
#include <linux/netdevice.h>
+#include <linux/if_addr.h>
#include <linux/if_arp.h>
#include <linux/route.h>
+#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/pkt_sched.h>
+#include <net/mld.h>
+#include <linux/workqueue.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>
+#include <net/net_namespace.h>
+#include <net/netlink.h>
#include <net/sock.h>
#include <net/snmp.h>
@@ -58,332 +59,330 @@
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/ip6_route.h>
+#include <net/inet_common.h>
#include <net/ip6_checksum.h>
-/* Set to 3 to get tracing... */
-#define MCAST_DEBUG 2
-
-#if MCAST_DEBUG >= 3
-#define MDBG(x) printk x
-#else
-#define MDBG(x)
-#endif
-
-/*
- * These header formats should be in a separate include file, but icmpv6.h
- * doesn't have in6_addr defined in all cases, there is no __u128, and no
- * other files reference these.
- *
- * +-DLS 4/14/03
- */
-
-/* Multicast Listener Discovery version 2 headers */
-
-struct mld2_grec {
- __u8 grec_type;
- __u8 grec_auxwords;
- __be16 grec_nsrcs;
- struct in6_addr grec_mca;
- struct in6_addr grec_src[0];
-};
-
-struct mld2_report {
- __u8 type;
- __u8 resv1;
- __sum16 csum;
- __be16 resv2;
- __be16 ngrec;
- struct mld2_grec grec[0];
-};
-
-struct mld2_query {
- __u8 type;
- __u8 code;
- __sum16 csum;
- __be16 mrc;
- __be16 resv1;
- struct in6_addr mca;
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u8 qrv:3,
- suppress:1,
- resv2:4;
-#elif defined(__BIG_ENDIAN_BITFIELD)
- __u8 resv2:4,
- suppress:1,
- qrv:3;
-#else
-#error "Please fix <asm/byteorder.h>"
-#endif
- __u8 qqic;
- __be16 nsrcs;
- struct in6_addr srcs[0];
+/* Ensure that we have struct in6_addr aligned on 32bit word. */
+static int __mld2_query_bugs[] __attribute__((__unused__)) = {
+ BUILD_BUG_ON_ZERO(offsetof(struct mld2_query, mld2q_srcs) % 4),
+ BUILD_BUG_ON_ZERO(offsetof(struct mld2_report, mld2r_grec) % 4),
+ BUILD_BUG_ON_ZERO(offsetof(struct mld2_grec, grec_mca) % 4)
};
+static struct workqueue_struct *mld_wq;
static struct in6_addr mld2_all_mcr = MLD2_ALL_MCR_INIT;
-/* Big mc list lock for all the sockets */
-static DEFINE_RWLOCK(ipv6_sk_mc_lock);
-
-static struct socket *igmp6_socket;
-
-int __ipv6_dev_mc_dec(struct inet6_dev *idev, struct in6_addr *addr);
-
static void igmp6_join_group(struct ifmcaddr6 *ma);
static void igmp6_leave_group(struct ifmcaddr6 *ma);
-static void igmp6_timer_handler(unsigned long data);
+static void mld_mca_work(struct work_struct *work);
-static void mld_gq_timer_expire(unsigned long data);
-static void mld_ifc_timer_expire(unsigned long data);
static void mld_ifc_event(struct inet6_dev *idev);
-static void mld_add_delrec(struct inet6_dev *idev, struct ifmcaddr6 *pmc);
-static void mld_del_delrec(struct inet6_dev *idev, struct in6_addr *addr);
-static void mld_clear_delrec(struct inet6_dev *idev);
+static bool mld_in_v1_mode(const struct inet6_dev *idev);
static int sf_setstate(struct ifmcaddr6 *pmc);
static void sf_markstate(struct ifmcaddr6 *pmc);
static void ip6_mc_clear_src(struct ifmcaddr6 *pmc);
-static int ip6_mc_del_src(struct inet6_dev *idev, struct in6_addr *pmca,
- int sfmode, int sfcount, struct in6_addr *psfsrc,
+static int ip6_mc_del_src(struct inet6_dev *idev, const struct in6_addr *pmca,
+ int sfmode, int sfcount, const struct in6_addr *psfsrc,
int delta);
-static int ip6_mc_add_src(struct inet6_dev *idev, struct in6_addr *pmca,
- int sfmode, int sfcount, struct in6_addr *psfsrc,
+static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca,
+ int sfmode, int sfcount, const struct in6_addr *psfsrc,
int delta);
static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml,
struct inet6_dev *idev);
+static int __ipv6_dev_mc_inc(struct net_device *dev,
+ const struct in6_addr *addr, unsigned int mode);
-
-#define IGMP6_UNSOLICITED_IVAL (10*HZ)
#define MLD_QRV_DEFAULT 2
+/* RFC3810, 9.2. Query Interval */
+#define MLD_QI_DEFAULT (125 * HZ)
+/* RFC3810, 9.3. Query Response Interval */
+#define MLD_QRI_DEFAULT (10 * HZ)
-#define MLD_V1_SEEN(idev) (ipv6_devconf.force_mld_version == 1 || \
- (idev)->cnf.force_mld_version == 1 || \
- ((idev)->mc_v1_seen && \
- time_before(jiffies, (idev)->mc_v1_seen)))
-
-#define MLDV2_MASK(value, nb) ((nb)>=32 ? (value) : ((1<<(nb))-1) & (value))
-#define MLDV2_EXP(thresh, nbmant, nbexp, value) \
- ((value) < (thresh) ? (value) : \
- ((MLDV2_MASK(value, nbmant) | (1<<(nbmant))) << \
- (MLDV2_MASK((value) >> (nbmant), nbexp) + (nbexp))))
-
-#define MLDV2_QQIC(value) MLDV2_EXP(0x80, 4, 3, value)
-#define MLDV2_MRC(value) MLDV2_EXP(0x8000, 12, 3, value)
+/* RFC3810, 8.1 Query Version Distinctions */
+#define MLD_V1_QUERY_LEN 24
+#define MLD_V2_QUERY_LEN_MIN 28
#define IPV6_MLD_MAX_MSF 64
int sysctl_mld_max_msf __read_mostly = IPV6_MLD_MAX_MSF;
+int sysctl_mld_qrv __read_mostly = MLD_QRV_DEFAULT;
+
+#define mc_assert_locked(idev) \
+ lockdep_assert_held(&(idev)->mc_lock)
+
+#define mc_dereference(e, idev) \
+ rcu_dereference_protected(e, lockdep_is_held(&(idev)->mc_lock))
+
+#define sock_dereference(e, sk) \
+ rcu_dereference_protected(e, lockdep_sock_is_held(sk))
+
+#define for_each_pmc_socklock(np, sk, pmc) \
+ for (pmc = sock_dereference((np)->ipv6_mc_list, sk); \
+ pmc; \
+ pmc = sock_dereference(pmc->next, sk))
+
+#define for_each_pmc_rcu(np, pmc) \
+ for (pmc = rcu_dereference((np)->ipv6_mc_list); \
+ pmc; \
+ pmc = rcu_dereference(pmc->next))
+
+#define for_each_psf_mclock(mc, psf) \
+ for (psf = mc_dereference((mc)->mca_sources, mc->idev); \
+ psf; \
+ psf = mc_dereference(psf->sf_next, mc->idev))
+
+#define for_each_psf_rcu(mc, psf) \
+ for (psf = rcu_dereference((mc)->mca_sources); \
+ psf; \
+ psf = rcu_dereference(psf->sf_next))
+
+#define for_each_psf_tomb(mc, psf) \
+ for (psf = mc_dereference((mc)->mca_tomb, mc->idev); \
+ psf; \
+ psf = mc_dereference(psf->sf_next, mc->idev))
+
+#define for_each_mc_mclock(idev, mc) \
+ for (mc = mc_dereference((idev)->mc_list, idev); \
+ mc; \
+ mc = mc_dereference(mc->next, idev))
+
+#define for_each_mc_rcu(idev, mc) \
+ for (mc = rcu_dereference((idev)->mc_list); \
+ mc; \
+ mc = rcu_dereference(mc->next))
+
+#define for_each_mc_tomb(idev, mc) \
+ for (mc = mc_dereference((idev)->mc_tomb, idev); \
+ mc; \
+ mc = mc_dereference(mc->next, idev))
+
+static int unsolicited_report_interval(struct inet6_dev *idev)
+{
+ int iv;
+
+ if (mld_in_v1_mode(idev))
+ iv = READ_ONCE(idev->cnf.mldv1_unsolicited_report_interval);
+ else
+ iv = READ_ONCE(idev->cnf.mldv2_unsolicited_report_interval);
+
+ return iv > 0 ? iv : 1;
+}
+
+static struct net_device *ip6_mc_find_dev(struct net *net,
+ const struct in6_addr *group,
+ int ifindex)
+{
+ struct net_device *dev = NULL;
+ struct rt6_info *rt;
+
+ if (ifindex == 0) {
+ rcu_read_lock();
+ rt = rt6_lookup(net, group, NULL, 0, NULL, 0);
+ if (rt) {
+ dev = dst_dev_rcu(&rt->dst);
+ dev_hold(dev);
+ ip6_rt_put(rt);
+ }
+ rcu_read_unlock();
+ } else {
+ dev = dev_get_by_index(net, ifindex);
+ }
+
+ return dev;
+}
/*
* socket join on multicast group
*/
-
-int ipv6_sock_mc_join(struct sock *sk, int ifindex, struct in6_addr *addr)
+static int __ipv6_sock_mc_join(struct sock *sk, int ifindex,
+ const struct in6_addr *addr, unsigned int mode)
{
- struct net_device *dev = NULL;
- struct ipv6_mc_socklist *mc_lst;
struct ipv6_pinfo *np = inet6_sk(sk);
+ struct ipv6_mc_socklist *mc_lst;
+ struct net *net = sock_net(sk);
+ struct net_device *dev = NULL;
int err;
if (!ipv6_addr_is_multicast(addr))
return -EINVAL;
- read_lock_bh(&ipv6_sk_mc_lock);
- for (mc_lst=np->ipv6_mc_list; mc_lst; mc_lst=mc_lst->next) {
+ for_each_pmc_socklock(np, sk, mc_lst) {
if ((ifindex == 0 || mc_lst->ifindex == ifindex) &&
- ipv6_addr_equal(&mc_lst->addr, addr)) {
- read_unlock_bh(&ipv6_sk_mc_lock);
+ ipv6_addr_equal(&mc_lst->addr, addr))
return -EADDRINUSE;
- }
}
- read_unlock_bh(&ipv6_sk_mc_lock);
mc_lst = sock_kmalloc(sk, sizeof(struct ipv6_mc_socklist), GFP_KERNEL);
-
- if (mc_lst == NULL)
+ if (!mc_lst)
return -ENOMEM;
mc_lst->next = NULL;
- ipv6_addr_copy(&mc_lst->addr, addr);
+ mc_lst->addr = *addr;
- if (ifindex == 0) {
- struct rt6_info *rt;
- rt = rt6_lookup(addr, NULL, 0, 0);
- if (rt) {
- dev = rt->rt6i_dev;
- dev_hold(dev);
- dst_release(&rt->u.dst);
- }
- } else
- dev = dev_get_by_index(ifindex);
-
- if (dev == NULL) {
+ dev = ip6_mc_find_dev(net, addr, ifindex);
+ if (!dev) {
sock_kfree_s(sk, mc_lst, sizeof(*mc_lst));
return -ENODEV;
}
mc_lst->ifindex = dev->ifindex;
- mc_lst->sfmode = MCAST_EXCLUDE;
- rwlock_init(&mc_lst->sflock);
- mc_lst->sflist = NULL;
+ mc_lst->sfmode = mode;
+ RCU_INIT_POINTER(mc_lst->sflist, NULL);
- /*
- * now add/increase the group membership on the device
- */
+ /* now add/increase the group membership on the device */
+ err = __ipv6_dev_mc_inc(dev, addr, mode);
- err = ipv6_dev_mc_inc(dev, addr);
+ dev_put(dev);
if (err) {
sock_kfree_s(sk, mc_lst, sizeof(*mc_lst));
- dev_put(dev);
return err;
}
- write_lock_bh(&ipv6_sk_mc_lock);
mc_lst->next = np->ipv6_mc_list;
- np->ipv6_mc_list = mc_lst;
- write_unlock_bh(&ipv6_sk_mc_lock);
-
- dev_put(dev);
+ rcu_assign_pointer(np->ipv6_mc_list, mc_lst);
return 0;
}
+int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr)
+{
+ return __ipv6_sock_mc_join(sk, ifindex, addr, MCAST_EXCLUDE);
+}
+EXPORT_SYMBOL(ipv6_sock_mc_join);
+
+int ipv6_sock_mc_join_ssm(struct sock *sk, int ifindex,
+ const struct in6_addr *addr, unsigned int mode)
+{
+ return __ipv6_sock_mc_join(sk, ifindex, addr, mode);
+}
+
/*
* socket leave on multicast group
*/
-int ipv6_sock_mc_drop(struct sock *sk, int ifindex, struct in6_addr *addr)
+static void __ipv6_sock_mc_drop(struct sock *sk, struct ipv6_mc_socklist *mc_lst)
+{
+ struct net *net = sock_net(sk);
+ struct net_device *dev;
+
+ dev = dev_get_by_index(net, mc_lst->ifindex);
+ if (dev) {
+ struct inet6_dev *idev = in6_dev_get(dev);
+
+ ip6_mc_leave_src(sk, mc_lst, idev);
+
+ if (idev) {
+ __ipv6_dev_mc_dec(idev, &mc_lst->addr);
+ in6_dev_put(idev);
+ }
+
+ dev_put(dev);
+ } else {
+ ip6_mc_leave_src(sk, mc_lst, NULL);
+ }
+
+ atomic_sub(sizeof(*mc_lst), &sk->sk_omem_alloc);
+ kfree_rcu(mc_lst, rcu);
+}
+
+int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr)
{
struct ipv6_pinfo *np = inet6_sk(sk);
- struct ipv6_mc_socklist *mc_lst, **lnk;
+ struct ipv6_mc_socklist __rcu **lnk;
+ struct ipv6_mc_socklist *mc_lst;
- write_lock_bh(&ipv6_sk_mc_lock);
- for (lnk = &np->ipv6_mc_list; (mc_lst = *lnk) !=NULL ; lnk = &mc_lst->next) {
+ if (!ipv6_addr_is_multicast(addr))
+ return -EINVAL;
+
+ for (lnk = &np->ipv6_mc_list;
+ (mc_lst = sock_dereference(*lnk, sk)) != NULL;
+ lnk = &mc_lst->next) {
if ((ifindex == 0 || mc_lst->ifindex == ifindex) &&
ipv6_addr_equal(&mc_lst->addr, addr)) {
- struct net_device *dev;
-
*lnk = mc_lst->next;
- write_unlock_bh(&ipv6_sk_mc_lock);
-
- if ((dev = dev_get_by_index(mc_lst->ifindex)) != NULL) {
- struct inet6_dev *idev = in6_dev_get(dev);
-
- (void) ip6_mc_leave_src(sk, mc_lst, idev);
- if (idev) {
- __ipv6_dev_mc_dec(idev, &mc_lst->addr);
- in6_dev_put(idev);
- }
- dev_put(dev);
- } else
- (void) ip6_mc_leave_src(sk, mc_lst, NULL);
- sock_kfree_s(sk, mc_lst, sizeof(*mc_lst));
+ __ipv6_sock_mc_drop(sk, mc_lst);
return 0;
}
}
- write_unlock_bh(&ipv6_sk_mc_lock);
return -EADDRNOTAVAIL;
}
+EXPORT_SYMBOL(ipv6_sock_mc_drop);
-static struct inet6_dev *ip6_mc_find_dev(struct in6_addr *group, int ifindex)
+static struct inet6_dev *ip6_mc_find_idev(struct net *net,
+ const struct in6_addr *group,
+ int ifindex)
{
- struct net_device *dev = NULL;
- struct inet6_dev *idev = NULL;
-
- if (ifindex == 0) {
- struct rt6_info *rt;
-
- rt = rt6_lookup(group, NULL, 0, 0);
- if (rt) {
- dev = rt->rt6i_dev;
- dev_hold(dev);
- dst_release(&rt->u.dst);
- }
- } else
- dev = dev_get_by_index(ifindex);
+ struct net_device *dev;
+ struct inet6_dev *idev;
+ dev = ip6_mc_find_dev(net, group, ifindex);
if (!dev)
return NULL;
+
idev = in6_dev_get(dev);
- if (!idev) {
- dev_put(dev);
- return NULL;
- }
- read_lock_bh(&idev->lock);
- if (idev->dead) {
- read_unlock_bh(&idev->lock);
- in6_dev_put(idev);
- dev_put(dev);
- return NULL;
- }
+ dev_put(dev);
+
return idev;
}
-void ipv6_sock_mc_close(struct sock *sk)
+void __ipv6_sock_mc_close(struct sock *sk)
{
struct ipv6_pinfo *np = inet6_sk(sk);
struct ipv6_mc_socklist *mc_lst;
- write_lock_bh(&ipv6_sk_mc_lock);
- while ((mc_lst = np->ipv6_mc_list) != NULL) {
- struct net_device *dev;
-
+ while ((mc_lst = sock_dereference(np->ipv6_mc_list, sk)) != NULL) {
np->ipv6_mc_list = mc_lst->next;
- write_unlock_bh(&ipv6_sk_mc_lock);
-
- dev = dev_get_by_index(mc_lst->ifindex);
- if (dev) {
- struct inet6_dev *idev = in6_dev_get(dev);
+ __ipv6_sock_mc_drop(sk, mc_lst);
+ }
+}
- (void) ip6_mc_leave_src(sk, mc_lst, idev);
- if (idev) {
- __ipv6_dev_mc_dec(idev, &mc_lst->addr);
- in6_dev_put(idev);
- }
- dev_put(dev);
- } else
- (void) ip6_mc_leave_src(sk, mc_lst, NULL);
+void ipv6_sock_mc_close(struct sock *sk)
+{
+ struct ipv6_pinfo *np = inet6_sk(sk);
- sock_kfree_s(sk, mc_lst, sizeof(*mc_lst));
+ if (!rcu_access_pointer(np->ipv6_mc_list))
+ return;
- write_lock_bh(&ipv6_sk_mc_lock);
- }
- write_unlock_bh(&ipv6_sk_mc_lock);
+ lock_sock(sk);
+ __ipv6_sock_mc_close(sk);
+ release_sock(sk);
}
int ip6_mc_source(int add, int omode, struct sock *sk,
- struct group_source_req *pgsr)
+ struct group_source_req *pgsr)
{
+ struct ipv6_pinfo *inet6 = inet6_sk(sk);
struct in6_addr *source, *group;
+ struct net *net = sock_net(sk);
struct ipv6_mc_socklist *pmc;
- struct net_device *dev;
- struct inet6_dev *idev;
- struct ipv6_pinfo *inet6 = inet6_sk(sk);
struct ip6_sf_socklist *psl;
- int i, j, rv;
+ struct inet6_dev *idev;
int leavegroup = 0;
- int pmclocked = 0;
+ int i, j, rv;
int err;
- if (pgsr->gsr_group.ss_family != AF_INET6 ||
- pgsr->gsr_source.ss_family != AF_INET6)
- return -EINVAL;
-
source = &((struct sockaddr_in6 *)&pgsr->gsr_source)->sin6_addr;
group = &((struct sockaddr_in6 *)&pgsr->gsr_group)->sin6_addr;
if (!ipv6_addr_is_multicast(group))
return -EINVAL;
- idev = ip6_mc_find_dev(group, pgsr->gsr_interface);
+ idev = ip6_mc_find_idev(net, group, pgsr->gsr_interface);
if (!idev)
return -ENODEV;
- dev = idev->dev;
+
+ mutex_lock(&idev->mc_lock);
+
+ if (idev->dead) {
+ err = -ENODEV;
+ goto done;
+ }
err = -EADDRNOTAVAIL;
- read_lock_bh(&ipv6_sk_mc_lock);
- for (pmc=inet6->ipv6_mc_list; pmc; pmc=pmc->next) {
+ for_each_pmc_socklock(inet6, sk, pmc) {
if (pgsr->gsr_interface && pmc->ifindex != pgsr->gsr_interface)
continue;
if (ipv6_addr_equal(&pmc->addr, group))
@@ -394,7 +393,7 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
goto done;
}
/* if a source filter was set, must be the same mode as before */
- if (pmc->sflist) {
+ if (rcu_access_pointer(pmc->sflist)) {
if (pmc->sfmode != omode) {
err = -EINVAL;
goto done;
@@ -406,17 +405,13 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
pmc->sfmode = omode;
}
- write_lock_bh(&pmc->sflock);
- pmclocked = 1;
-
- psl = pmc->sflist;
+ psl = sock_dereference(pmc->sflist, sk);
if (!add) {
if (!psl)
goto done; /* err = -EADDRNOTAVAIL */
rv = !0;
- for (i=0; i<psl->sl_count; i++) {
- rv = memcmp(&psl->sl_addr[i], source,
- sizeof(struct in6_addr));
+ for (i = 0; i < psl->sl_count; i++) {
+ rv = !ipv6_addr_equal(&psl->sl_addr[i], source);
if (rv == 0)
break;
}
@@ -432,7 +427,7 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
/* update the interface filter */
ip6_mc_del_src(idev, group, omode, 1, source, 1);
- for (j=i+1; j<psl->sl_count; j++)
+ for (j = i+1; j < psl->sl_count; j++)
psl->sl_addr[j-1] = psl->sl_addr[j];
psl->sl_count--;
err = 0;
@@ -450,7 +445,8 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
if (psl)
count += psl->sl_max;
- newpsl = sock_kmalloc(sk, IP6_SFLSIZE(count), GFP_ATOMIC);
+ newpsl = sock_kmalloc(sk, struct_size(newpsl, sl_addr, count),
+ GFP_KERNEL);
if (!newpsl) {
err = -ENOBUFS;
goto done;
@@ -458,21 +454,22 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
newpsl->sl_max = count;
newpsl->sl_count = count - IP6_SFBLOCK;
if (psl) {
- for (i=0; i<psl->sl_count; i++)
+ for (i = 0; i < psl->sl_count; i++)
newpsl->sl_addr[i] = psl->sl_addr[i];
- sock_kfree_s(sk, psl, IP6_SFLSIZE(psl->sl_max));
+ atomic_sub(struct_size(psl, sl_addr, psl->sl_max),
+ &sk->sk_omem_alloc);
}
- pmc->sflist = psl = newpsl;
+ rcu_assign_pointer(pmc->sflist, newpsl);
+ kfree_rcu(psl, rcu);
+ psl = newpsl;
}
rv = 1; /* > 0 for insert logic below if sl_count is 0 */
- for (i=0; i<psl->sl_count; i++) {
- rv = memcmp(&psl->sl_addr[i], source, sizeof(struct in6_addr));
- if (rv == 0)
- break;
+ for (i = 0; i < psl->sl_count; i++) {
+ rv = !ipv6_addr_equal(&psl->sl_addr[i], source);
+ if (rv == 0) /* There is an error in the address. */
+ goto done;
}
- if (rv == 0) /* address already there is an error */
- goto done;
- for (j=psl->sl_count-1; j>=i; j--)
+ for (j = psl->sl_count-1; j >= i; j--)
psl->sl_addr[j+1] = psl->sl_addr[j];
psl->sl_addr[i] = *source;
psl->sl_count++;
@@ -480,25 +477,22 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
/* update the interface list */
ip6_mc_add_src(idev, group, omode, 1, source, 1);
done:
- if (pmclocked)
- write_unlock_bh(&pmc->sflock);
- read_unlock_bh(&ipv6_sk_mc_lock);
- read_unlock_bh(&idev->lock);
+ mutex_unlock(&idev->mc_lock);
in6_dev_put(idev);
- dev_put(dev);
if (leavegroup)
- return ipv6_sock_mc_drop(sk, pgsr->gsr_interface, group);
+ err = ipv6_sock_mc_drop(sk, pgsr->gsr_interface, group);
return err;
}
-int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf)
+int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf,
+ struct sockaddr_storage *list)
{
- struct in6_addr *group;
- struct ipv6_mc_socklist *pmc;
- struct net_device *dev;
- struct inet6_dev *idev;
struct ipv6_pinfo *inet6 = inet6_sk(sk);
struct ip6_sf_socklist *newpsl, *psl;
+ struct net *net = sock_net(sk);
+ const struct in6_addr *group;
+ struct ipv6_mc_socklist *pmc;
+ struct inet6_dev *idev;
int leavegroup = 0;
int i, err;
@@ -510,21 +504,25 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf)
gsf->gf_fmode != MCAST_EXCLUDE)
return -EINVAL;
- idev = ip6_mc_find_dev(group, gsf->gf_interface);
-
+ idev = ip6_mc_find_idev(net, group, gsf->gf_interface);
if (!idev)
return -ENODEV;
- dev = idev->dev;
+
+ mutex_lock(&idev->mc_lock);
+
+ if (idev->dead) {
+ err = -ENODEV;
+ goto done;
+ }
err = 0;
- read_lock_bh(&ipv6_sk_mc_lock);
if (gsf->gf_fmode == MCAST_INCLUDE && gsf->gf_numsrc == 0) {
leavegroup = 1;
goto done;
}
- for (pmc=inet6->ipv6_mc_list; pmc; pmc=pmc->next) {
+ for_each_pmc_socklock(inet6, sk, pmc) {
if (pmc->ifindex != gsf->gf_interface)
continue;
if (ipv6_addr_equal(&pmc->addr, group))
@@ -535,108 +533,86 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf)
goto done;
}
if (gsf->gf_numsrc) {
- newpsl = sock_kmalloc(sk, IP6_SFLSIZE(gsf->gf_numsrc),
- GFP_ATOMIC);
+ newpsl = sock_kmalloc(sk, struct_size(newpsl, sl_addr,
+ gsf->gf_numsrc),
+ GFP_KERNEL);
if (!newpsl) {
err = -ENOBUFS;
goto done;
}
newpsl->sl_max = newpsl->sl_count = gsf->gf_numsrc;
- for (i=0; i<newpsl->sl_count; ++i) {
+ for (i = 0; i < newpsl->sl_count; ++i, ++list) {
struct sockaddr_in6 *psin6;
- psin6 = (struct sockaddr_in6 *)&gsf->gf_slist[i];
+ psin6 = (struct sockaddr_in6 *)list;
newpsl->sl_addr[i] = psin6->sin6_addr;
}
+
err = ip6_mc_add_src(idev, group, gsf->gf_fmode,
- newpsl->sl_count, newpsl->sl_addr, 0);
+ newpsl->sl_count, newpsl->sl_addr, 0);
if (err) {
- sock_kfree_s(sk, newpsl, IP6_SFLSIZE(newpsl->sl_max));
+ sock_kfree_s(sk, newpsl, struct_size(newpsl, sl_addr,
+ newpsl->sl_max));
goto done;
}
} else {
newpsl = NULL;
- (void) ip6_mc_add_src(idev, group, gsf->gf_fmode, 0, NULL, 0);
+ ip6_mc_add_src(idev, group, gsf->gf_fmode, 0, NULL, 0);
}
- write_lock_bh(&pmc->sflock);
- psl = pmc->sflist;
+ psl = sock_dereference(pmc->sflist, sk);
if (psl) {
- (void) ip6_mc_del_src(idev, group, pmc->sfmode,
- psl->sl_count, psl->sl_addr, 0);
- sock_kfree_s(sk, psl, IP6_SFLSIZE(psl->sl_max));
- } else
- (void) ip6_mc_del_src(idev, group, pmc->sfmode, 0, NULL, 0);
- pmc->sflist = newpsl;
+ ip6_mc_del_src(idev, group, pmc->sfmode,
+ psl->sl_count, psl->sl_addr, 0);
+ atomic_sub(struct_size(psl, sl_addr, psl->sl_max),
+ &sk->sk_omem_alloc);
+ } else {
+ ip6_mc_del_src(idev, group, pmc->sfmode, 0, NULL, 0);
+ }
+
+ rcu_assign_pointer(pmc->sflist, newpsl);
+ kfree_rcu(psl, rcu);
pmc->sfmode = gsf->gf_fmode;
- write_unlock_bh(&pmc->sflock);
err = 0;
done:
- read_unlock_bh(&ipv6_sk_mc_lock);
- read_unlock_bh(&idev->lock);
+ mutex_unlock(&idev->mc_lock);
in6_dev_put(idev);
- dev_put(dev);
if (leavegroup)
err = ipv6_sock_mc_drop(sk, gsf->gf_interface, group);
return err;
}
int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf,
- struct group_filter __user *optval, int __user *optlen)
+ sockptr_t optval, size_t ss_offset)
{
- int err, i, count, copycount;
- struct in6_addr *group;
- struct ipv6_mc_socklist *pmc;
- struct inet6_dev *idev;
- struct net_device *dev;
struct ipv6_pinfo *inet6 = inet6_sk(sk);
+ const struct in6_addr *group;
+ struct ipv6_mc_socklist *pmc;
struct ip6_sf_socklist *psl;
+ unsigned int count;
+ int i, copycount;
group = &((struct sockaddr_in6 *)&gsf->gf_group)->sin6_addr;
if (!ipv6_addr_is_multicast(group))
return -EINVAL;
- idev = ip6_mc_find_dev(group, gsf->gf_interface);
-
- if (!idev)
- return -ENODEV;
-
- dev = idev->dev;
-
- err = -EADDRNOTAVAIL;
- /*
- * changes to the ipv6_mc_list require the socket lock and
- * a read lock on ip6_sk_mc_lock. We have the socket lock,
- * so reading the list is safe.
- */
-
- for (pmc=inet6->ipv6_mc_list; pmc; pmc=pmc->next) {
+ for_each_pmc_socklock(inet6, sk, pmc) {
if (pmc->ifindex != gsf->gf_interface)
continue;
if (ipv6_addr_equal(group, &pmc->addr))
break;
}
if (!pmc) /* must have a prior join */
- goto done;
+ return -EADDRNOTAVAIL;
+
gsf->gf_fmode = pmc->sfmode;
- psl = pmc->sflist;
+ psl = sock_dereference(pmc->sflist, sk);
count = psl ? psl->sl_count : 0;
- read_unlock_bh(&idev->lock);
- in6_dev_put(idev);
- dev_put(dev);
- copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc;
+ copycount = min(count, gsf->gf_numsrc);
gsf->gf_numsrc = count;
- if (put_user(GROUP_FILTER_SIZE(copycount), optlen) ||
- copy_to_user(optval, gsf, GROUP_FILTER_SIZE(0))) {
- return -EFAULT;
- }
- /* changes to psl require the socket lock, a read lock on
- * on ipv6_sk_mc_lock and a write lock on pmc->sflock. We
- * have the socket lock, so reading here is safe.
- */
- for (i=0; i<copycount; i++) {
+ for (i = 0; i < copycount; i++) {
struct sockaddr_in6 *psin6;
struct sockaddr_storage ss;
@@ -644,87 +620,83 @@ int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf,
memset(&ss, 0, sizeof(ss));
psin6->sin6_family = AF_INET6;
psin6->sin6_addr = psl->sl_addr[i];
- if (copy_to_user(&optval->gf_slist[i], &ss, sizeof(ss)))
+ if (copy_to_sockptr_offset(optval, ss_offset, &ss, sizeof(ss)))
return -EFAULT;
+ ss_offset += sizeof(ss);
}
return 0;
-done:
- read_unlock_bh(&idev->lock);
- in6_dev_put(idev);
- dev_put(dev);
- return err;
}
-int inet6_mc_check(struct sock *sk, struct in6_addr *mc_addr,
- struct in6_addr *src_addr)
+bool inet6_mc_check(const struct sock *sk, const struct in6_addr *mc_addr,
+ const struct in6_addr *src_addr)
{
- struct ipv6_pinfo *np = inet6_sk(sk);
- struct ipv6_mc_socklist *mc;
- struct ip6_sf_socklist *psl;
- int rv = 1;
+ const struct ipv6_pinfo *np = inet6_sk(sk);
+ const struct ipv6_mc_socklist *mc;
+ const struct ip6_sf_socklist *psl;
+ bool rv = true;
- read_lock(&ipv6_sk_mc_lock);
- for (mc = np->ipv6_mc_list; mc; mc = mc->next) {
+ rcu_read_lock();
+ for_each_pmc_rcu(np, mc) {
if (ipv6_addr_equal(&mc->addr, mc_addr))
break;
}
if (!mc) {
- read_unlock(&ipv6_sk_mc_lock);
- return 1;
+ rcu_read_unlock();
+ return inet6_test_bit(MC6_ALL, sk);
}
- read_lock(&mc->sflock);
- psl = mc->sflist;
+ psl = rcu_dereference(mc->sflist);
if (!psl) {
rv = mc->sfmode == MCAST_EXCLUDE;
} else {
int i;
- for (i=0; i<psl->sl_count; i++) {
+ for (i = 0; i < psl->sl_count; i++) {
if (ipv6_addr_equal(&psl->sl_addr[i], src_addr))
break;
}
if (mc->sfmode == MCAST_INCLUDE && i >= psl->sl_count)
- rv = 0;
+ rv = false;
if (mc->sfmode == MCAST_EXCLUDE && i < psl->sl_count)
- rv = 0;
+ rv = false;
}
- read_unlock(&mc->sflock);
- read_unlock(&ipv6_sk_mc_lock);
+ rcu_read_unlock();
return rv;
}
-static void ma_put(struct ifmcaddr6 *mc)
-{
- if (atomic_dec_and_test(&mc->mca_refcnt)) {
- in6_dev_put(mc->idev);
- kfree(mc);
- }
-}
-
static void igmp6_group_added(struct ifmcaddr6 *mc)
{
struct net_device *dev = mc->idev->dev;
char buf[MAX_ADDR_LEN];
- spin_lock_bh(&mc->mca_lock);
+ mc_assert_locked(mc->idev);
+
+ if (IPV6_ADDR_MC_SCOPE(&mc->mca_addr) <
+ IPV6_ADDR_SCOPE_LINKLOCAL)
+ return;
+
if (!(mc->mca_flags&MAF_LOADED)) {
mc->mca_flags |= MAF_LOADED;
if (ndisc_mc_map(&mc->mca_addr, buf, dev, 0) == 0)
- dev_mc_add(dev, buf, dev->addr_len, 0);
+ dev_mc_add(dev, buf);
}
- spin_unlock_bh(&mc->mca_lock);
if (!(dev->flags & IFF_UP) || (mc->mca_flags & MAF_NOREPORT))
return;
- if (MLD_V1_SEEN(mc->idev)) {
+ if (mld_in_v1_mode(mc->idev)) {
igmp6_join_group(mc);
return;
}
/* else v2 */
- mc->mca_crcount = mc->idev->mc_qrv;
+ /* Based on RFC3810 6.1, for newly added INCLUDE SSM, we
+ * should not send filter-mode change record as the mode
+ * should be from IN() to IN(A).
+ */
+ if (mc->mca_sfmode == MCAST_EXCLUDE)
+ mc->mca_crcount = mc->idev->mc_qrv;
+
mld_ifc_event(mc->idev);
}
@@ -733,47 +705,45 @@ static void igmp6_group_dropped(struct ifmcaddr6 *mc)
struct net_device *dev = mc->idev->dev;
char buf[MAX_ADDR_LEN];
- spin_lock_bh(&mc->mca_lock);
+ mc_assert_locked(mc->idev);
+
+ if (IPV6_ADDR_MC_SCOPE(&mc->mca_addr) <
+ IPV6_ADDR_SCOPE_LINKLOCAL)
+ return;
+
if (mc->mca_flags&MAF_LOADED) {
mc->mca_flags &= ~MAF_LOADED;
if (ndisc_mc_map(&mc->mca_addr, buf, dev, 0) == 0)
- dev_mc_delete(dev, buf, dev->addr_len, 0);
+ dev_mc_del(dev, buf);
}
if (mc->mca_flags & MAF_NOREPORT)
- goto done;
- spin_unlock_bh(&mc->mca_lock);
+ return;
if (!mc->idev->dead)
igmp6_leave_group(mc);
- spin_lock_bh(&mc->mca_lock);
- if (del_timer(&mc->mca_timer))
- atomic_dec(&mc->mca_refcnt);
-done:
- ip6_mc_clear_src(mc);
- spin_unlock_bh(&mc->mca_lock);
+ if (cancel_delayed_work(&mc->mca_work))
+ refcount_dec(&mc->mca_refcnt);
}
-/*
- * deleted ifmcaddr6 manipulation
- */
+/* deleted ifmcaddr6 manipulation */
static void mld_add_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im)
{
struct ifmcaddr6 *pmc;
+ mc_assert_locked(idev);
+
/* this is an "ifmcaddr6" for convenience; only the fields below
* are actually used. In particular, the refcnt and users are not
* used for management of the delete list. Using the same structure
* for deleted items allows change reports to use common code with
* non-deleted or query-response MCA's.
*/
- pmc = kzalloc(sizeof(*pmc), GFP_ATOMIC);
+ pmc = kzalloc(sizeof(*pmc), GFP_KERNEL);
if (!pmc)
return;
- spin_lock_bh(&im->mca_lock);
- spin_lock_init(&pmc->mca_lock);
pmc->idev = im->idev;
in6_dev_hold(idev);
pmc->mca_addr = im->mca_addr;
@@ -782,332 +752,440 @@ static void mld_add_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im)
if (pmc->mca_sfmode == MCAST_INCLUDE) {
struct ip6_sf_list *psf;
- pmc->mca_tomb = im->mca_tomb;
- pmc->mca_sources = im->mca_sources;
- im->mca_tomb = im->mca_sources = NULL;
- for (psf=pmc->mca_sources; psf; psf=psf->sf_next)
+ rcu_assign_pointer(pmc->mca_tomb,
+ mc_dereference(im->mca_tomb, idev));
+ rcu_assign_pointer(pmc->mca_sources,
+ mc_dereference(im->mca_sources, idev));
+ RCU_INIT_POINTER(im->mca_tomb, NULL);
+ RCU_INIT_POINTER(im->mca_sources, NULL);
+
+ for_each_psf_mclock(pmc, psf)
psf->sf_crcount = pmc->mca_crcount;
}
- spin_unlock_bh(&im->mca_lock);
- write_lock_bh(&idev->mc_lock);
- pmc->next = idev->mc_tomb;
- idev->mc_tomb = pmc;
- write_unlock_bh(&idev->mc_lock);
+ rcu_assign_pointer(pmc->next, idev->mc_tomb);
+ rcu_assign_pointer(idev->mc_tomb, pmc);
}
-static void mld_del_delrec(struct inet6_dev *idev, struct in6_addr *pmca)
+static void mld_del_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im)
{
+ struct ip6_sf_list *psf, *sources, *tomb;
+ struct in6_addr *pmca = &im->mca_addr;
struct ifmcaddr6 *pmc, *pmc_prev;
- struct ip6_sf_list *psf, *psf_next;
- write_lock_bh(&idev->mc_lock);
+ mc_assert_locked(idev);
+
pmc_prev = NULL;
- for (pmc=idev->mc_tomb; pmc; pmc=pmc->next) {
+ for_each_mc_tomb(idev, pmc) {
if (ipv6_addr_equal(&pmc->mca_addr, pmca))
break;
pmc_prev = pmc;
}
- if (pmc) {
- if (pmc_prev)
- pmc_prev->next = pmc->next;
- else
- idev->mc_tomb = pmc->next;
- }
- write_unlock_bh(&idev->mc_lock);
- if (pmc) {
- for (psf=pmc->mca_tomb; psf; psf=psf_next) {
- psf_next = psf->sf_next;
- kfree(psf);
- }
- in6_dev_put(pmc->idev);
- kfree(pmc);
+ if (!pmc)
+ return;
+ if (pmc_prev)
+ rcu_assign_pointer(pmc_prev->next, pmc->next);
+ else
+ rcu_assign_pointer(idev->mc_tomb, pmc->next);
+
+ im->idev = pmc->idev;
+ if (im->mca_sfmode == MCAST_INCLUDE) {
+ tomb = rcu_replace_pointer(im->mca_tomb,
+ mc_dereference(pmc->mca_tomb, pmc->idev),
+ lockdep_is_held(&im->idev->mc_lock));
+ rcu_assign_pointer(pmc->mca_tomb, tomb);
+
+ sources = rcu_replace_pointer(im->mca_sources,
+ mc_dereference(pmc->mca_sources, pmc->idev),
+ lockdep_is_held(&im->idev->mc_lock));
+ rcu_assign_pointer(pmc->mca_sources, sources);
+ for_each_psf_mclock(im, psf)
+ psf->sf_crcount = idev->mc_qrv;
+ } else {
+ im->mca_crcount = idev->mc_qrv;
}
+ ip6_mc_clear_src(pmc);
+ in6_dev_put(pmc->idev);
+ kfree_rcu(pmc, rcu);
}
static void mld_clear_delrec(struct inet6_dev *idev)
{
struct ifmcaddr6 *pmc, *nextpmc;
- write_lock_bh(&idev->mc_lock);
- pmc = idev->mc_tomb;
- idev->mc_tomb = NULL;
- write_unlock_bh(&idev->mc_lock);
+ mc_assert_locked(idev);
+
+ pmc = mc_dereference(idev->mc_tomb, idev);
+ RCU_INIT_POINTER(idev->mc_tomb, NULL);
for (; pmc; pmc = nextpmc) {
- nextpmc = pmc->next;
+ nextpmc = mc_dereference(pmc->next, idev);
ip6_mc_clear_src(pmc);
in6_dev_put(pmc->idev);
- kfree(pmc);
+ kfree_rcu(pmc, rcu);
}
/* clear dead sources, too */
- read_lock_bh(&idev->lock);
- for (pmc=idev->mc_list; pmc; pmc=pmc->next) {
+ for_each_mc_mclock(idev, pmc) {
struct ip6_sf_list *psf, *psf_next;
- spin_lock_bh(&pmc->mca_lock);
- psf = pmc->mca_tomb;
- pmc->mca_tomb = NULL;
- spin_unlock_bh(&pmc->mca_lock);
- for (; psf; psf=psf_next) {
- psf_next = psf->sf_next;
- kfree(psf);
+ psf = mc_dereference(pmc->mca_tomb, idev);
+ RCU_INIT_POINTER(pmc->mca_tomb, NULL);
+ for (; psf; psf = psf_next) {
+ psf_next = mc_dereference(psf->sf_next, idev);
+ kfree_rcu(psf, rcu);
}
}
- read_unlock_bh(&idev->lock);
}
+static void mld_clear_query(struct inet6_dev *idev)
+{
+ spin_lock_bh(&idev->mc_query_lock);
+ __skb_queue_purge(&idev->mc_query_queue);
+ spin_unlock_bh(&idev->mc_query_lock);
+}
+
+static void mld_clear_report(struct inet6_dev *idev)
+{
+ spin_lock_bh(&idev->mc_report_lock);
+ __skb_queue_purge(&idev->mc_report_queue);
+ spin_unlock_bh(&idev->mc_report_lock);
+}
+
+static void ma_put(struct ifmcaddr6 *mc)
+{
+ if (refcount_dec_and_test(&mc->mca_refcnt)) {
+ in6_dev_put(mc->idev);
+ kfree_rcu(mc, rcu);
+ }
+}
+
+static struct ifmcaddr6 *mca_alloc(struct inet6_dev *idev,
+ const struct in6_addr *addr,
+ unsigned int mode)
+{
+ struct ifmcaddr6 *mc;
+
+ mc_assert_locked(idev);
+
+ mc = kzalloc(sizeof(*mc), GFP_KERNEL);
+ if (!mc)
+ return NULL;
+
+ INIT_DELAYED_WORK(&mc->mca_work, mld_mca_work);
+
+ mc->mca_addr = *addr;
+ mc->idev = idev; /* reference taken by caller */
+ mc->mca_users = 1;
+ /* mca_stamp should be updated upon changes */
+ mc->mca_cstamp = mc->mca_tstamp = jiffies;
+ refcount_set(&mc->mca_refcnt, 1);
+
+ mc->mca_sfmode = mode;
+ mc->mca_sfcount[mode] = 1;
+
+ if (ipv6_addr_is_ll_all_nodes(&mc->mca_addr) ||
+ IPV6_ADDR_MC_SCOPE(&mc->mca_addr) < IPV6_ADDR_SCOPE_LINKLOCAL)
+ mc->mca_flags |= MAF_NOREPORT;
+
+ return mc;
+}
+
+static void inet6_ifmcaddr_notify(struct net_device *dev,
+ const struct ifmcaddr6 *ifmca, int event)
+{
+ struct inet6_fill_args fillargs = {
+ .portid = 0,
+ .seq = 0,
+ .event = event,
+ .flags = 0,
+ .netnsid = -1,
+ .force_rt_scope_universe = true,
+ };
+ struct net *net = dev_net(dev);
+ struct sk_buff *skb;
+ int err = -ENOMEM;
+
+ skb = nlmsg_new(NLMSG_ALIGN(sizeof(struct ifaddrmsg)) +
+ nla_total_size(sizeof(struct in6_addr)) +
+ nla_total_size(sizeof(struct ifa_cacheinfo)),
+ GFP_KERNEL);
+ if (!skb)
+ goto error;
+
+ err = inet6_fill_ifmcaddr(skb, ifmca, &fillargs);
+ if (err < 0) {
+ WARN_ON_ONCE(err == -EMSGSIZE);
+ nlmsg_free(skb);
+ goto error;
+ }
+
+ rtnl_notify(skb, net, 0, RTNLGRP_IPV6_MCADDR, NULL, GFP_KERNEL);
+ return;
+error:
+ rtnl_set_sk_err(net, RTNLGRP_IPV6_MCADDR, err);
+}
/*
* device multicast group inc (add if not found)
*/
-int ipv6_dev_mc_inc(struct net_device *dev, struct in6_addr *addr)
+static int __ipv6_dev_mc_inc(struct net_device *dev,
+ const struct in6_addr *addr, unsigned int mode)
{
- struct ifmcaddr6 *mc;
struct inet6_dev *idev;
+ struct ifmcaddr6 *mc;
+ /* we need to take a reference on idev */
idev = in6_dev_get(dev);
-
- if (idev == NULL)
+ if (!idev)
return -EINVAL;
- write_lock_bh(&idev->lock);
- if (idev->dead) {
- write_unlock_bh(&idev->lock);
+ mutex_lock(&idev->mc_lock);
+
+ if (READ_ONCE(idev->dead)) {
+ mutex_unlock(&idev->mc_lock);
in6_dev_put(idev);
return -ENODEV;
}
- for (mc = idev->mc_list; mc; mc = mc->next) {
+ for_each_mc_mclock(idev, mc) {
if (ipv6_addr_equal(&mc->mca_addr, addr)) {
mc->mca_users++;
- write_unlock_bh(&idev->lock);
- ip6_mc_add_src(idev, &mc->mca_addr, MCAST_EXCLUDE, 0,
- NULL, 0);
+ ip6_mc_add_src(idev, &mc->mca_addr, mode, 0, NULL, 0);
+ mutex_unlock(&idev->mc_lock);
in6_dev_put(idev);
return 0;
}
}
- /*
- * not found: create a new one.
- */
-
- mc = kzalloc(sizeof(struct ifmcaddr6), GFP_ATOMIC);
-
- if (mc == NULL) {
- write_unlock_bh(&idev->lock);
+ mc = mca_alloc(idev, addr, mode);
+ if (!mc) {
+ mutex_unlock(&idev->mc_lock);
in6_dev_put(idev);
return -ENOMEM;
}
- init_timer(&mc->mca_timer);
- mc->mca_timer.function = igmp6_timer_handler;
- mc->mca_timer.data = (unsigned long) mc;
+ rcu_assign_pointer(mc->next, idev->mc_list);
+ rcu_assign_pointer(idev->mc_list, mc);
- ipv6_addr_copy(&mc->mca_addr, addr);
- mc->idev = idev;
- mc->mca_users = 1;
- /* mca_stamp should be updated upon changes */
- mc->mca_cstamp = mc->mca_tstamp = jiffies;
- atomic_set(&mc->mca_refcnt, 2);
- spin_lock_init(&mc->mca_lock);
-
- /* initial mode is (EX, empty) */
- mc->mca_sfmode = MCAST_EXCLUDE;
- mc->mca_sfcount[MCAST_EXCLUDE] = 1;
-
- if (ipv6_addr_is_ll_all_nodes(&mc->mca_addr) ||
- IPV6_ADDR_MC_SCOPE(&mc->mca_addr) < IPV6_ADDR_SCOPE_LINKLOCAL)
- mc->mca_flags |= MAF_NOREPORT;
-
- mc->next = idev->mc_list;
- idev->mc_list = mc;
- write_unlock_bh(&idev->lock);
-
- mld_del_delrec(idev, &mc->mca_addr);
+ mld_del_delrec(idev, mc);
igmp6_group_added(mc);
- ma_put(mc);
+ inet6_ifmcaddr_notify(dev, mc, RTM_NEWMULTICAST);
+ mutex_unlock(&idev->mc_lock);
+
return 0;
}
+int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr)
+{
+ return __ipv6_dev_mc_inc(dev, addr, MCAST_EXCLUDE);
+}
+EXPORT_SYMBOL(ipv6_dev_mc_inc);
+
/*
- * device multicast group del
+ * device multicast group del
*/
-int __ipv6_dev_mc_dec(struct inet6_dev *idev, struct in6_addr *addr)
+int __ipv6_dev_mc_dec(struct inet6_dev *idev, const struct in6_addr *addr)
{
- struct ifmcaddr6 *ma, **map;
+ struct ifmcaddr6 *ma, __rcu **map;
+
+ mutex_lock(&idev->mc_lock);
- write_lock_bh(&idev->lock);
- for (map = &idev->mc_list; (ma=*map) != NULL; map = &ma->next) {
+ for (map = &idev->mc_list;
+ (ma = mc_dereference(*map, idev));
+ map = &ma->next) {
if (ipv6_addr_equal(&ma->mca_addr, addr)) {
if (--ma->mca_users == 0) {
*map = ma->next;
- write_unlock_bh(&idev->lock);
igmp6_group_dropped(ma);
+ inet6_ifmcaddr_notify(idev->dev, ma,
+ RTM_DELMULTICAST);
+ ip6_mc_clear_src(ma);
+ mutex_unlock(&idev->mc_lock);
ma_put(ma);
return 0;
}
- write_unlock_bh(&idev->lock);
+ mutex_unlock(&idev->mc_lock);
return 0;
}
}
- write_unlock_bh(&idev->lock);
+ mutex_unlock(&idev->mc_lock);
return -ENOENT;
}
-int ipv6_dev_mc_dec(struct net_device *dev, struct in6_addr *addr)
+int ipv6_dev_mc_dec(struct net_device *dev, const struct in6_addr *addr)
{
- struct inet6_dev *idev = in6_dev_get(dev);
+ struct inet6_dev *idev;
int err;
+ idev = in6_dev_get(dev);
if (!idev)
return -ENODEV;
err = __ipv6_dev_mc_dec(idev, addr);
-
in6_dev_put(idev);
return err;
}
-
-/*
- * identify MLD packets for MLD filter exceptions
- */
-int ipv6_is_mld(struct sk_buff *skb, int nexthdr)
-{
- struct icmp6hdr *pic;
-
- if (nexthdr != IPPROTO_ICMPV6)
- return 0;
-
- if (!pskb_may_pull(skb, sizeof(struct icmp6hdr)))
- return 0;
-
- pic = (struct icmp6hdr *)skb->h.raw;
-
- switch (pic->icmp6_type) {
- case ICMPV6_MGM_QUERY:
- case ICMPV6_MGM_REPORT:
- case ICMPV6_MGM_REDUCTION:
- case ICMPV6_MLD2_REPORT:
- return 1;
- default:
- break;
- }
- return 0;
-}
+EXPORT_SYMBOL(ipv6_dev_mc_dec);
/*
* check if the interface/address pair is valid
*/
-int ipv6_chk_mcast_addr(struct net_device *dev, struct in6_addr *group,
- struct in6_addr *src_addr)
+bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group,
+ const struct in6_addr *src_addr)
{
struct inet6_dev *idev;
struct ifmcaddr6 *mc;
- int rv = 0;
+ bool rv = false;
- idev = in6_dev_get(dev);
- if (idev) {
- read_lock_bh(&idev->lock);
- for (mc = idev->mc_list; mc; mc=mc->next) {
- if (ipv6_addr_equal(&mc->mca_addr, group))
- break;
- }
- if (mc) {
- if (src_addr && !ipv6_addr_any(src_addr)) {
- struct ip6_sf_list *psf;
+ rcu_read_lock();
+ idev = __in6_dev_get(dev);
+ if (!idev)
+ goto unlock;
+ for_each_mc_rcu(idev, mc) {
+ if (ipv6_addr_equal(&mc->mca_addr, group))
+ break;
+ }
+ if (!mc)
+ goto unlock;
+ if (src_addr && !ipv6_addr_any(src_addr)) {
+ struct ip6_sf_list *psf;
- spin_lock_bh(&mc->mca_lock);
- for (psf=mc->mca_sources;psf;psf=psf->sf_next) {
- if (ipv6_addr_equal(&psf->sf_addr, src_addr))
- break;
- }
- if (psf)
- rv = psf->sf_count[MCAST_INCLUDE] ||
- psf->sf_count[MCAST_EXCLUDE] !=
- mc->mca_sfcount[MCAST_EXCLUDE];
- else
- rv = mc->mca_sfcount[MCAST_EXCLUDE] !=0;
- spin_unlock_bh(&mc->mca_lock);
- } else
- rv = 1; /* don't filter unspecified source */
+ for_each_psf_rcu(mc, psf) {
+ if (ipv6_addr_equal(&psf->sf_addr, src_addr))
+ break;
}
- read_unlock_bh(&idev->lock);
- in6_dev_put(idev);
+ if (psf)
+ rv = READ_ONCE(psf->sf_count[MCAST_INCLUDE]) ||
+ READ_ONCE(psf->sf_count[MCAST_EXCLUDE]) !=
+ READ_ONCE(mc->mca_sfcount[MCAST_EXCLUDE]);
+ else
+ rv = READ_ONCE(mc->mca_sfcount[MCAST_EXCLUDE]) != 0;
+ } else {
+ rv = true; /* don't filter unspecified source */
}
+unlock:
+ rcu_read_unlock();
return rv;
}
-static void mld_gq_start_timer(struct inet6_dev *idev)
+static void mld_gq_start_work(struct inet6_dev *idev)
{
- int tv = net_random() % idev->mc_maxdelay;
+ unsigned long tv = get_random_u32_below(idev->mc_maxdelay);
+
+ mc_assert_locked(idev);
idev->mc_gq_running = 1;
- if (!mod_timer(&idev->mc_gq_timer, jiffies+tv+2))
+ if (!mod_delayed_work(mld_wq, &idev->mc_gq_work, tv + 2))
in6_dev_hold(idev);
}
-static void mld_ifc_start_timer(struct inet6_dev *idev, int delay)
+static void mld_gq_stop_work(struct inet6_dev *idev)
+{
+ mc_assert_locked(idev);
+
+ idev->mc_gq_running = 0;
+ if (cancel_delayed_work(&idev->mc_gq_work))
+ __in6_dev_put(idev);
+}
+
+static void mld_ifc_start_work(struct inet6_dev *idev, unsigned long delay)
{
- int tv = net_random() % delay;
+ unsigned long tv = get_random_u32_below(delay);
- if (!mod_timer(&idev->mc_ifc_timer, jiffies+tv+2))
+ mc_assert_locked(idev);
+
+ if (!mod_delayed_work(mld_wq, &idev->mc_ifc_work, tv + 2))
in6_dev_hold(idev);
}
-/*
- * IGMP handling (alias multicast ICMPv6 messages)
- */
+static void mld_ifc_stop_work(struct inet6_dev *idev)
+{
+ mc_assert_locked(idev);
+ idev->mc_ifc_count = 0;
+ if (cancel_delayed_work(&idev->mc_ifc_work))
+ __in6_dev_put(idev);
+}
+
+static void mld_dad_start_work(struct inet6_dev *idev, unsigned long delay)
+{
+ unsigned long tv = get_random_u32_below(delay);
+
+ mc_assert_locked(idev);
+
+ if (!mod_delayed_work(mld_wq, &idev->mc_dad_work, tv + 2))
+ in6_dev_hold(idev);
+}
+
+static void mld_dad_stop_work(struct inet6_dev *idev)
+{
+ if (cancel_delayed_work(&idev->mc_dad_work))
+ __in6_dev_put(idev);
+}
+
+static void mld_query_stop_work(struct inet6_dev *idev)
+{
+ spin_lock_bh(&idev->mc_query_lock);
+ if (cancel_delayed_work(&idev->mc_query_work))
+ __in6_dev_put(idev);
+ spin_unlock_bh(&idev->mc_query_lock);
+}
+
+static void mld_report_stop_work(struct inet6_dev *idev)
+{
+ if (cancel_delayed_work_sync(&idev->mc_report_work))
+ __in6_dev_put(idev);
+}
+
+/* IGMP handling (alias multicast ICMPv6 messages) */
static void igmp6_group_queried(struct ifmcaddr6 *ma, unsigned long resptime)
{
unsigned long delay = resptime;
- /* Do not start timer for these addresses */
+ mc_assert_locked(ma->idev);
+
+ /* Do not start work for these addresses */
if (ipv6_addr_is_ll_all_nodes(&ma->mca_addr) ||
IPV6_ADDR_MC_SCOPE(&ma->mca_addr) < IPV6_ADDR_SCOPE_LINKLOCAL)
return;
- if (del_timer(&ma->mca_timer)) {
- atomic_dec(&ma->mca_refcnt);
- delay = ma->mca_timer.expires - jiffies;
+ if (cancel_delayed_work(&ma->mca_work)) {
+ refcount_dec(&ma->mca_refcnt);
+ delay = ma->mca_work.timer.expires - jiffies;
}
- if (delay >= resptime) {
- if (resptime)
- delay = net_random() % resptime;
- else
- delay = 1;
- }
- ma->mca_timer.expires = jiffies + delay;
- if (!mod_timer(&ma->mca_timer, jiffies + delay))
- atomic_inc(&ma->mca_refcnt);
+ if (delay >= resptime)
+ delay = get_random_u32_below(resptime);
+
+ if (!mod_delayed_work(mld_wq, &ma->mca_work, delay))
+ refcount_inc(&ma->mca_refcnt);
ma->mca_flags |= MAF_TIMER_RUNNING;
}
/* mark EXCLUDE-mode sources */
-static int mld_xmarksources(struct ifmcaddr6 *pmc, int nsrcs,
- struct in6_addr *srcs)
+static bool mld_xmarksources(struct ifmcaddr6 *pmc, int nsrcs,
+ const struct in6_addr *srcs)
{
struct ip6_sf_list *psf;
int i, scount;
+ mc_assert_locked(pmc->idev);
+
scount = 0;
- for (psf=pmc->mca_sources; psf; psf=psf->sf_next) {
+ for_each_psf_mclock(pmc, psf) {
if (scount == nsrcs)
break;
- for (i=0; i<nsrcs; i++) {
+ for (i = 0; i < nsrcs; i++) {
/* skip inactive filters */
- if (pmc->mca_sfcount[MCAST_INCLUDE] ||
+ if (psf->sf_count[MCAST_INCLUDE] ||
pmc->mca_sfcount[MCAST_EXCLUDE] !=
psf->sf_count[MCAST_EXCLUDE])
- continue;
+ break;
if (ipv6_addr_equal(&srcs[i], &psf->sf_addr)) {
scount++;
break;
@@ -1116,26 +1194,28 @@ static int mld_xmarksources(struct ifmcaddr6 *pmc, int nsrcs,
}
pmc->mca_flags &= ~MAF_GSQUERY;
if (scount == nsrcs) /* all sources excluded */
- return 0;
- return 1;
+ return false;
+ return true;
}
-static int mld_marksources(struct ifmcaddr6 *pmc, int nsrcs,
- struct in6_addr *srcs)
+static bool mld_marksources(struct ifmcaddr6 *pmc, int nsrcs,
+ const struct in6_addr *srcs)
{
struct ip6_sf_list *psf;
int i, scount;
+ mc_assert_locked(pmc->idev);
+
if (pmc->mca_sfmode == MCAST_EXCLUDE)
return mld_xmarksources(pmc, nsrcs, srcs);
/* mark INCLUDE-mode sources */
scount = 0;
- for (psf=pmc->mca_sources; psf; psf=psf->sf_next) {
+ for_each_psf_mclock(pmc, psf) {
if (scount == nsrcs)
break;
- for (i=0; i<nsrcs; i++) {
+ for (i = 0; i < nsrcs; i++) {
if (ipv6_addr_equal(&srcs[i], &psf->sf_addr)) {
psf->sf_gsresp = 1;
scount++;
@@ -1145,116 +1225,292 @@ static int mld_marksources(struct ifmcaddr6 *pmc, int nsrcs,
}
if (!scount) {
pmc->mca_flags &= ~MAF_GSQUERY;
- return 0;
+ return false;
}
pmc->mca_flags |= MAF_GSQUERY;
- return 1;
+ return true;
+}
+
+static int mld_force_mld_version(const struct inet6_dev *idev)
+{
+ const struct net *net = dev_net(idev->dev);
+ int all_force;
+
+ all_force = READ_ONCE(net->ipv6.devconf_all->force_mld_version);
+ /* Normally, both are 0 here. If enforcement to a particular is
+ * being used, individual device enforcement will have a lower
+ * precedence over 'all' device (.../conf/all/force_mld_version).
+ */
+ return all_force ?: READ_ONCE(idev->cnf.force_mld_version);
+}
+
+static bool mld_in_v2_mode_only(const struct inet6_dev *idev)
+{
+ return mld_force_mld_version(idev) == 2;
+}
+
+static bool mld_in_v1_mode_only(const struct inet6_dev *idev)
+{
+ return mld_force_mld_version(idev) == 1;
+}
+
+static bool mld_in_v1_mode(const struct inet6_dev *idev)
+{
+ if (mld_in_v2_mode_only(idev))
+ return false;
+ if (mld_in_v1_mode_only(idev))
+ return true;
+ if (idev->mc_v1_seen && time_before(jiffies, idev->mc_v1_seen))
+ return true;
+
+ return false;
+}
+
+static void mld_set_v1_mode(struct inet6_dev *idev)
+{
+ /* RFC3810, relevant sections:
+ * - 9.1. Robustness Variable
+ * - 9.2. Query Interval
+ * - 9.3. Query Response Interval
+ * - 9.12. Older Version Querier Present Timeout
+ */
+ unsigned long switchback;
+
+ switchback = (idev->mc_qrv * idev->mc_qi) + idev->mc_qri;
+
+ idev->mc_v1_seen = jiffies + switchback;
+}
+
+static void mld_update_qrv(struct inet6_dev *idev,
+ const struct mld2_query *mlh2)
+{
+ /* RFC3810, relevant sections:
+ * - 5.1.8. QRV (Querier's Robustness Variable)
+ * - 9.1. Robustness Variable
+ */
+
+ /* The value of the Robustness Variable MUST NOT be zero,
+ * and SHOULD NOT be one. Catch this here if we ever run
+ * into such a case in future.
+ */
+ const int min_qrv = min(MLD_QRV_DEFAULT, sysctl_mld_qrv);
+ WARN_ON(idev->mc_qrv == 0);
+
+ if (mlh2->mld2q_qrv > 0)
+ idev->mc_qrv = mlh2->mld2q_qrv;
+
+ if (unlikely(idev->mc_qrv < min_qrv)) {
+ net_warn_ratelimited("IPv6: MLD: clamping QRV from %u to %u!\n",
+ idev->mc_qrv, min_qrv);
+ idev->mc_qrv = min_qrv;
+ }
+}
+
+static void mld_update_qi(struct inet6_dev *idev,
+ const struct mld2_query *mlh2)
+{
+ /* RFC3810, relevant sections:
+ * - 5.1.9. QQIC (Querier's Query Interval Code)
+ * - 9.2. Query Interval
+ * - 9.12. Older Version Querier Present Timeout
+ * (the [Query Interval] in the last Query received)
+ */
+ unsigned long mc_qqi;
+
+ if (mlh2->mld2q_qqic < 128) {
+ mc_qqi = mlh2->mld2q_qqic;
+ } else {
+ unsigned long mc_man, mc_exp;
+
+ mc_exp = MLDV2_QQIC_EXP(mlh2->mld2q_qqic);
+ mc_man = MLDV2_QQIC_MAN(mlh2->mld2q_qqic);
+
+ mc_qqi = (mc_man | 0x10) << (mc_exp + 3);
+ }
+
+ idev->mc_qi = mc_qqi * HZ;
+}
+
+static void mld_update_qri(struct inet6_dev *idev,
+ const struct mld2_query *mlh2)
+{
+ /* RFC3810, relevant sections:
+ * - 5.1.3. Maximum Response Code
+ * - 9.3. Query Response Interval
+ */
+ idev->mc_qri = msecs_to_jiffies(mldv2_mrc(mlh2));
+}
+
+static int mld_process_v1(struct inet6_dev *idev, struct mld_msg *mld,
+ unsigned long *max_delay, bool v1_query)
+{
+ unsigned long mldv1_md;
+
+ /* Ignore v1 queries */
+ if (mld_in_v2_mode_only(idev))
+ return -EINVAL;
+
+ mldv1_md = ntohs(mld->mld_maxdelay);
+
+ /* When in MLDv1 fallback and a MLDv2 router start-up being
+ * unaware of current MLDv1 operation, the MRC == MRD mapping
+ * only works when the exponential algorithm is not being
+ * used (as MLDv1 is unaware of such things).
+ *
+ * According to the RFC author, the MLDv2 implementations
+ * he's aware of all use a MRC < 32768 on start up queries.
+ *
+ * Thus, should we *ever* encounter something else larger
+ * than that, just assume the maximum possible within our
+ * reach.
+ */
+ if (!v1_query)
+ mldv1_md = min(mldv1_md, MLDV1_MRD_MAX_COMPAT);
+
+ *max_delay = max(msecs_to_jiffies(mldv1_md), 1UL);
+
+ /* MLDv1 router present: we need to go into v1 mode *only*
+ * when an MLDv1 query is received as per section 9.12. of
+ * RFC3810! And we know from RFC2710 section 3.7 that MLDv1
+ * queries MUST be of exactly 24 octets.
+ */
+ if (v1_query)
+ mld_set_v1_mode(idev);
+
+ /* cancel MLDv2 report work */
+ mld_gq_stop_work(idev);
+ /* cancel the interface change work */
+ mld_ifc_stop_work(idev);
+ /* clear deleted report items */
+ mld_clear_delrec(idev);
+
+ return 0;
+}
+
+static void mld_process_v2(struct inet6_dev *idev, struct mld2_query *mld,
+ unsigned long *max_delay)
+{
+ *max_delay = max(msecs_to_jiffies(mldv2_mrc(mld)), 1UL);
+
+ mld_update_qrv(idev, mld);
+ mld_update_qi(idev, mld);
+ mld_update_qri(idev, mld);
+
+ idev->mc_maxdelay = *max_delay;
+
+ return;
}
-int igmp6_event_query(struct sk_buff *skb)
+/* called with rcu_read_lock() */
+void igmp6_event_query(struct sk_buff *skb)
+{
+ struct inet6_dev *idev = __in6_dev_get(skb->dev);
+
+ if (!idev || idev->dead)
+ goto out;
+
+ spin_lock_bh(&idev->mc_query_lock);
+ if (skb_queue_len(&idev->mc_query_queue) < MLD_MAX_SKBS) {
+ __skb_queue_tail(&idev->mc_query_queue, skb);
+ if (!mod_delayed_work(mld_wq, &idev->mc_query_work, 0))
+ in6_dev_hold(idev);
+ skb = NULL;
+ }
+ spin_unlock_bh(&idev->mc_query_lock);
+out:
+ kfree_skb(skb);
+}
+
+static void __mld_query_work(struct sk_buff *skb)
{
struct mld2_query *mlh2 = NULL;
- struct ifmcaddr6 *ma;
- struct in6_addr *group;
+ const struct in6_addr *group;
unsigned long max_delay;
struct inet6_dev *idev;
- struct icmp6hdr *hdr;
+ struct ifmcaddr6 *ma;
+ struct mld_msg *mld;
int group_type;
int mark = 0;
- int len;
+ int len, err;
if (!pskb_may_pull(skb, sizeof(struct in6_addr)))
- return -EINVAL;
+ goto kfree_skb;
/* compute payload length excluding extension headers */
- len = ntohs(skb->nh.ipv6h->payload_len) + sizeof(struct ipv6hdr);
- len -= (char *)skb->h.raw - (char *)skb->nh.ipv6h;
-
- /* Drop queries with not link local source */
- if (!(ipv6_addr_type(&skb->nh.ipv6h->saddr)&IPV6_ADDR_LINKLOCAL))
- return -EINVAL;
+ len = ntohs(ipv6_hdr(skb)->payload_len) + sizeof(struct ipv6hdr);
+ len -= skb_network_header_len(skb);
+
+ /* RFC3810 6.2
+ * Upon reception of an MLD message that contains a Query, the node
+ * checks if the source address of the message is a valid link-local
+ * address, if the Hop Limit is set to 1, and if the Router Alert
+ * option is present in the Hop-By-Hop Options header of the IPv6
+ * packet. If any of these checks fails, the packet is dropped.
+ */
+ if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL) ||
+ ipv6_hdr(skb)->hop_limit != 1 ||
+ !(IP6CB(skb)->flags & IP6SKB_ROUTERALERT) ||
+ IP6CB(skb)->ra != htons(IPV6_OPT_ROUTERALERT_MLD))
+ goto kfree_skb;
idev = in6_dev_get(skb->dev);
+ if (!idev)
+ goto kfree_skb;
- if (idev == NULL)
- return 0;
-
- hdr = (struct icmp6hdr *) skb->h.raw;
- group = (struct in6_addr *) (hdr + 1);
+ mld = (struct mld_msg *)icmp6_hdr(skb);
+ group = &mld->mld_mca;
group_type = ipv6_addr_type(group);
if (group_type != IPV6_ADDR_ANY &&
- !(group_type&IPV6_ADDR_MULTICAST)) {
- in6_dev_put(idev);
- return -EINVAL;
- }
+ !(group_type&IPV6_ADDR_MULTICAST))
+ goto out;
- if (len == 24) {
- int switchback;
- /* MLDv1 router present */
+ if (len < MLD_V1_QUERY_LEN) {
+ goto out;
+ } else if (len == MLD_V1_QUERY_LEN || mld_in_v1_mode(idev)) {
+ err = mld_process_v1(idev, mld, &max_delay,
+ len == MLD_V1_QUERY_LEN);
+ if (err < 0)
+ goto out;
+ } else if (len >= MLD_V2_QUERY_LEN_MIN) {
+ int srcs_offset = sizeof(struct mld2_query) -
+ sizeof(struct icmp6hdr);
- /* Translate milliseconds to jiffies */
- max_delay = (ntohs(hdr->icmp6_maxdelay)*HZ)/1000;
+ if (!pskb_may_pull(skb, srcs_offset))
+ goto out;
- switchback = (idev->mc_qrv + 1) * max_delay;
- idev->mc_v1_seen = jiffies + switchback;
+ mlh2 = (struct mld2_query *)skb_transport_header(skb);
+
+ mld_process_v2(idev, mlh2, &max_delay);
- /* cancel the interface change timer */
- idev->mc_ifc_count = 0;
- if (del_timer(&idev->mc_ifc_timer))
- __in6_dev_put(idev);
- /* clear deleted report items */
- mld_clear_delrec(idev);
- } else if (len >= 28) {
- int srcs_offset = sizeof(struct mld2_query) -
- sizeof(struct icmp6hdr);
- if (!pskb_may_pull(skb, srcs_offset)) {
- in6_dev_put(idev);
- return -EINVAL;
- }
- mlh2 = (struct mld2_query *) skb->h.raw;
- max_delay = (MLDV2_MRC(ntohs(mlh2->mrc))*HZ)/1000;
- if (!max_delay)
- max_delay = 1;
- idev->mc_maxdelay = max_delay;
- if (mlh2->qrv)
- idev->mc_qrv = mlh2->qrv;
if (group_type == IPV6_ADDR_ANY) { /* general query */
- if (mlh2->nsrcs) {
- in6_dev_put(idev);
- return -EINVAL; /* no sources allowed */
- }
- mld_gq_start_timer(idev);
- in6_dev_put(idev);
- return 0;
+ if (mlh2->mld2q_nsrcs)
+ goto out; /* no sources allowed */
+
+ mld_gq_start_work(idev);
+ goto out;
}
/* mark sources to include, if group & source-specific */
- if (mlh2->nsrcs != 0) {
- if (!pskb_may_pull(skb, srcs_offset +
- ntohs(mlh2->nsrcs) * sizeof(struct in6_addr))) {
- in6_dev_put(idev);
- return -EINVAL;
- }
- mlh2 = (struct mld2_query *) skb->h.raw;
+ if (mlh2->mld2q_nsrcs != 0) {
+ if (!pskb_may_pull(skb, srcs_offset +
+ ntohs(mlh2->mld2q_nsrcs) * sizeof(struct in6_addr)))
+ goto out;
+
+ mlh2 = (struct mld2_query *)skb_transport_header(skb);
mark = 1;
}
} else {
- in6_dev_put(idev);
- return -EINVAL;
+ goto out;
}
- read_lock_bh(&idev->lock);
if (group_type == IPV6_ADDR_ANY) {
- for (ma = idev->mc_list; ma; ma=ma->next) {
- spin_lock_bh(&ma->mca_lock);
+ for_each_mc_mclock(idev, ma) {
igmp6_group_queried(ma, max_delay);
- spin_unlock_bh(&ma->mca_lock);
}
} else {
- for (ma = idev->mc_list; ma; ma=ma->next) {
+ for_each_mc_mclock(idev, ma) {
if (!ipv6_addr_equal(group, &ma->mca_addr))
continue;
- spin_lock_bh(&ma->mca_lock);
if (ma->mca_flags & MAF_TIMER_RUNNING) {
/* gsquery <- gsquery && mark */
if (!mark)
@@ -1267,84 +1523,166 @@ int igmp6_event_query(struct sk_buff *skb)
ma->mca_flags &= ~MAF_GSQUERY;
}
if (!(ma->mca_flags & MAF_GSQUERY) ||
- mld_marksources(ma, ntohs(mlh2->nsrcs), mlh2->srcs))
+ mld_marksources(ma, ntohs(mlh2->mld2q_nsrcs), mlh2->mld2q_srcs))
igmp6_group_queried(ma, max_delay);
- spin_unlock_bh(&ma->mca_lock);
break;
}
}
- read_unlock_bh(&idev->lock);
+
+out:
in6_dev_put(idev);
+kfree_skb:
+ consume_skb(skb);
+}
- return 0;
+static void mld_query_work(struct work_struct *work)
+{
+ struct inet6_dev *idev = container_of(to_delayed_work(work),
+ struct inet6_dev,
+ mc_query_work);
+ struct sk_buff_head q;
+ struct sk_buff *skb;
+ bool rework = false;
+ int cnt = 0;
+
+ skb_queue_head_init(&q);
+
+ spin_lock_bh(&idev->mc_query_lock);
+ while ((skb = __skb_dequeue(&idev->mc_query_queue))) {
+ __skb_queue_tail(&q, skb);
+
+ if (++cnt >= MLD_MAX_QUEUE) {
+ rework = true;
+ break;
+ }
+ }
+ spin_unlock_bh(&idev->mc_query_lock);
+
+ mutex_lock(&idev->mc_lock);
+ while ((skb = __skb_dequeue(&q)))
+ __mld_query_work(skb);
+ mutex_unlock(&idev->mc_lock);
+
+ if (rework && queue_delayed_work(mld_wq, &idev->mc_query_work, 0))
+ return;
+
+ in6_dev_put(idev);
}
+/* called with rcu_read_lock() */
+void igmp6_event_report(struct sk_buff *skb)
+{
+ struct inet6_dev *idev = __in6_dev_get(skb->dev);
+
+ if (!idev || idev->dead)
+ goto out;
-int igmp6_event_report(struct sk_buff *skb)
+ spin_lock_bh(&idev->mc_report_lock);
+ if (skb_queue_len(&idev->mc_report_queue) < MLD_MAX_SKBS) {
+ __skb_queue_tail(&idev->mc_report_queue, skb);
+ if (!mod_delayed_work(mld_wq, &idev->mc_report_work, 0))
+ in6_dev_hold(idev);
+ skb = NULL;
+ }
+ spin_unlock_bh(&idev->mc_report_lock);
+out:
+ kfree_skb(skb);
+}
+
+static void __mld_report_work(struct sk_buff *skb)
{
- struct ifmcaddr6 *ma;
- struct in6_addr *addrp;
struct inet6_dev *idev;
- struct icmp6hdr *hdr;
+ struct ifmcaddr6 *ma;
+ struct mld_msg *mld;
int addr_type;
/* Our own report looped back. Ignore it. */
if (skb->pkt_type == PACKET_LOOPBACK)
- return 0;
+ goto kfree_skb;
/* send our report if the MC router may not have heard this report */
if (skb->pkt_type != PACKET_MULTICAST &&
skb->pkt_type != PACKET_BROADCAST)
- return 0;
+ goto kfree_skb;
- if (!pskb_may_pull(skb, sizeof(struct in6_addr)))
- return -EINVAL;
+ if (!pskb_may_pull(skb, sizeof(*mld) - sizeof(struct icmp6hdr)))
+ goto kfree_skb;
- hdr = (struct icmp6hdr*) skb->h.raw;
+ mld = (struct mld_msg *)icmp6_hdr(skb);
/* Drop reports with not link local source */
- addr_type = ipv6_addr_type(&skb->nh.ipv6h->saddr);
- if (addr_type != IPV6_ADDR_ANY &&
+ addr_type = ipv6_addr_type(&ipv6_hdr(skb)->saddr);
+ if (addr_type != IPV6_ADDR_ANY &&
!(addr_type&IPV6_ADDR_LINKLOCAL))
- return -EINVAL;
-
- addrp = (struct in6_addr *) (hdr + 1);
+ goto kfree_skb;
idev = in6_dev_get(skb->dev);
- if (idev == NULL)
- return -ENODEV;
+ if (!idev)
+ goto kfree_skb;
/*
- * Cancel the timer for this group
+ * Cancel the work for this group
*/
- read_lock_bh(&idev->lock);
- for (ma = idev->mc_list; ma; ma=ma->next) {
- if (ipv6_addr_equal(&ma->mca_addr, addrp)) {
- spin_lock(&ma->mca_lock);
- if (del_timer(&ma->mca_timer))
- atomic_dec(&ma->mca_refcnt);
- ma->mca_flags &= ~(MAF_LAST_REPORTER|MAF_TIMER_RUNNING);
- spin_unlock(&ma->mca_lock);
+ for_each_mc_mclock(idev, ma) {
+ if (ipv6_addr_equal(&ma->mca_addr, &mld->mld_mca)) {
+ if (cancel_delayed_work(&ma->mca_work))
+ refcount_dec(&ma->mca_refcnt);
+ ma->mca_flags &= ~(MAF_LAST_REPORTER |
+ MAF_TIMER_RUNNING);
break;
}
}
- read_unlock_bh(&idev->lock);
+
+ in6_dev_put(idev);
+kfree_skb:
+ consume_skb(skb);
+}
+
+static void mld_report_work(struct work_struct *work)
+{
+ struct inet6_dev *idev = container_of(to_delayed_work(work),
+ struct inet6_dev,
+ mc_report_work);
+ struct sk_buff_head q;
+ struct sk_buff *skb;
+ bool rework = false;
+ int cnt = 0;
+
+ skb_queue_head_init(&q);
+ spin_lock_bh(&idev->mc_report_lock);
+ while ((skb = __skb_dequeue(&idev->mc_report_queue))) {
+ __skb_queue_tail(&q, skb);
+
+ if (++cnt >= MLD_MAX_QUEUE) {
+ rework = true;
+ break;
+ }
+ }
+ spin_unlock_bh(&idev->mc_report_lock);
+
+ mutex_lock(&idev->mc_lock);
+ while ((skb = __skb_dequeue(&q)))
+ __mld_report_work(skb);
+ mutex_unlock(&idev->mc_lock);
+
+ if (rework && queue_delayed_work(mld_wq, &idev->mc_report_work, 0))
+ return;
+
in6_dev_put(idev);
- return 0;
}
-static int is_in(struct ifmcaddr6 *pmc, struct ip6_sf_list *psf, int type,
- int gdeleted, int sdeleted)
+static bool is_in(struct ifmcaddr6 *pmc, struct ip6_sf_list *psf, int type,
+ int gdeleted, int sdeleted)
{
switch (type) {
case MLD2_MODE_IS_INCLUDE:
case MLD2_MODE_IS_EXCLUDE:
if (gdeleted || sdeleted)
- return 0;
+ return false;
if (!((pmc->mca_flags & MAF_GSQUERY) && !psf->sf_gsresp)) {
if (pmc->mca_sfmode == MCAST_INCLUDE)
- return 1;
+ return true;
/* don't include if this source is excluded
* in all filters
*/
@@ -1353,29 +1691,29 @@ static int is_in(struct ifmcaddr6 *pmc, struct ip6_sf_list *psf, int type,
return pmc->mca_sfcount[MCAST_EXCLUDE] ==
psf->sf_count[MCAST_EXCLUDE];
}
- return 0;
+ return false;
case MLD2_CHANGE_TO_INCLUDE:
if (gdeleted || sdeleted)
- return 0;
+ return false;
return psf->sf_count[MCAST_INCLUDE] != 0;
case MLD2_CHANGE_TO_EXCLUDE:
if (gdeleted || sdeleted)
- return 0;
+ return false;
if (pmc->mca_sfcount[MCAST_EXCLUDE] == 0 ||
psf->sf_count[MCAST_INCLUDE])
- return 0;
+ return false;
return pmc->mca_sfcount[MCAST_EXCLUDE] ==
psf->sf_count[MCAST_EXCLUDE];
case MLD2_ALLOW_NEW_SOURCES:
if (gdeleted || !psf->sf_crcount)
- return 0;
+ return false;
return (pmc->mca_sfmode == MCAST_INCLUDE) ^ sdeleted;
case MLD2_BLOCK_OLD_SOURCES:
if (pmc->mca_sfmode == MCAST_INCLUDE)
return gdeleted || (psf->sf_crcount && sdeleted);
return psf->sf_crcount && !gdeleted && !sdeleted;
}
- return 0;
+ return false;
}
static int
@@ -1384,7 +1722,7 @@ mld_scount(struct ifmcaddr6 *pmc, int type, int gdeleted, int sdeleted)
struct ip6_sf_list *psf;
int scount = 0;
- for (psf=pmc->mca_sources; psf; psf=psf->sf_next) {
+ for_each_psf_mclock(pmc, psf) {
if (!is_in(pmc, psf, type, gdeleted, sdeleted))
continue;
scount++;
@@ -1392,97 +1730,145 @@ mld_scount(struct ifmcaddr6 *pmc, int type, int gdeleted, int sdeleted)
return scount;
}
-static struct sk_buff *mld_newpack(struct net_device *dev, int size)
+static void ip6_mc_hdr(const struct sock *sk, struct sk_buff *skb,
+ struct net_device *dev, const struct in6_addr *saddr,
+ const struct in6_addr *daddr, int proto, int len)
{
- struct sock *sk = igmp6_socket->sk;
- struct sk_buff *skb;
- struct mld2_report *pmr;
- struct in6_addr addr_buf;
- int err;
- u8 ra[8] = { IPPROTO_ICMPV6, 0,
- IPV6_TLV_ROUTERALERT, 2, 0, 0,
- IPV6_TLV_PADN, 0 };
+ struct ipv6hdr *hdr;
+
+ skb->protocol = htons(ETH_P_IPV6);
+ skb->dev = dev;
- /* we assume size > sizeof(ra) here */
- skb = sock_alloc_send_skb(sk, size + LL_RESERVED_SPACE(dev), 1, &err);
+ skb_reset_network_header(skb);
+ skb_put(skb, sizeof(struct ipv6hdr));
+ hdr = ipv6_hdr(skb);
- if (skb == 0)
+ ip6_flow_hdr(hdr, 0, 0);
+
+ hdr->payload_len = htons(len);
+ hdr->nexthdr = proto;
+ hdr->hop_limit = READ_ONCE(inet6_sk(sk)->hop_limit);
+
+ hdr->saddr = *saddr;
+ hdr->daddr = *daddr;
+}
+
+static struct sk_buff *mld_newpack(struct inet6_dev *idev, unsigned int mtu)
+{
+ u8 ra[8] = { IPPROTO_ICMPV6, 0, IPV6_TLV_ROUTERALERT,
+ 2, 0, 0, IPV6_TLV_PADN, 0 };
+ struct net_device *dev = idev->dev;
+ int hlen = LL_RESERVED_SPACE(dev);
+ int tlen = dev->needed_tailroom;
+ const struct in6_addr *saddr;
+ struct in6_addr addr_buf;
+ struct mld2_report *pmr;
+ struct sk_buff *skb;
+ unsigned int size;
+ struct sock *sk;
+ struct net *net;
+
+ /* we assume size > sizeof(ra) here
+ * Also try to not allocate high-order pages for big MTU
+ */
+ size = min_t(int, mtu, PAGE_SIZE / 2) + hlen + tlen;
+ skb = alloc_skb(size, GFP_KERNEL);
+ if (!skb)
return NULL;
- skb_reserve(skb, LL_RESERVED_SPACE(dev));
+ skb->priority = TC_PRIO_CONTROL;
+ skb_reserve(skb, hlen);
+ skb_tailroom_reserve(skb, mtu, tlen);
+
+ rcu_read_lock();
- if (ipv6_get_lladdr(dev, &addr_buf)) {
+ net = dev_net_rcu(dev);
+ sk = net->ipv6.igmp_sk;
+ skb_set_owner_w(skb, sk);
+
+ if (ipv6_get_lladdr(dev, &addr_buf, IFA_F_TENTATIVE)) {
/* <draft-ietf-magma-mld-source-05.txt>:
- * use unspecified address as the source address
+ * use unspecified address as the source address
* when a valid link-local address is not available.
*/
- memset(&addr_buf, 0, sizeof(addr_buf));
- }
-
- ip6_nd_hdr(sk, skb, dev, &addr_buf, &mld2_all_mcr, NEXTHDR_HOP, 0);
-
- memcpy(skb_put(skb, sizeof(ra)), ra, sizeof(ra));
-
- pmr =(struct mld2_report *)skb_put(skb, sizeof(*pmr));
- skb->h.raw = (unsigned char *)pmr;
- pmr->type = ICMPV6_MLD2_REPORT;
- pmr->resv1 = 0;
- pmr->csum = 0;
- pmr->resv2 = 0;
- pmr->ngrec = 0;
- return skb;
-}
+ saddr = &in6addr_any;
+ } else
+ saddr = &addr_buf;
-static inline int mld_dev_queue_xmit2(struct sk_buff *skb)
-{
- struct net_device *dev = skb->dev;
+ ip6_mc_hdr(sk, skb, dev, saddr, &mld2_all_mcr, NEXTHDR_HOP, 0);
- if (dev->hard_header) {
- unsigned char ha[MAX_ADDR_LEN];
- int err;
+ rcu_read_unlock();
- ndisc_mc_map(&skb->nh.ipv6h->daddr, ha, dev, 1);
- err = dev->hard_header(skb, dev, ETH_P_IPV6, ha, NULL, skb->len);
- if (err < 0) {
- kfree_skb(skb);
- return err;
- }
- }
- return dev_queue_xmit(skb);
-}
+ skb_put_data(skb, ra, sizeof(ra));
-static inline int mld_dev_queue_xmit(struct sk_buff *skb)
-{
- return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb, NULL, skb->dev,
- mld_dev_queue_xmit2);
+ skb_set_transport_header(skb, skb_tail_pointer(skb) - skb->data);
+ skb_put(skb, sizeof(*pmr));
+ pmr = (struct mld2_report *)skb_transport_header(skb);
+ pmr->mld2r_type = ICMPV6_MLD2_REPORT;
+ pmr->mld2r_resv1 = 0;
+ pmr->mld2r_cksum = 0;
+ pmr->mld2r_resv2 = 0;
+ pmr->mld2r_ngrec = 0;
+ return skb;
}
static void mld_sendpack(struct sk_buff *skb)
{
- struct ipv6hdr *pip6 = skb->nh.ipv6h;
- struct mld2_report *pmr = (struct mld2_report *)skb->h.raw;
+ struct ipv6hdr *pip6 = ipv6_hdr(skb);
+ struct mld2_report *pmr =
+ (struct mld2_report *)skb_transport_header(skb);
int payload_len, mldlen;
- struct inet6_dev *idev = in6_dev_get(skb->dev);
+ struct inet6_dev *idev;
+ struct net *net = dev_net(skb->dev);
int err;
+ struct flowi6 fl6;
+ struct dst_entry *dst;
- IP6_INC_STATS(idev, IPSTATS_MIB_OUTREQUESTS);
- payload_len = skb->tail - (unsigned char *)skb->nh.ipv6h -
- sizeof(struct ipv6hdr);
- mldlen = skb->tail - skb->h.raw;
+ rcu_read_lock();
+ idev = __in6_dev_get(skb->dev);
+ IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS);
+
+ payload_len = (skb_tail_pointer(skb) - skb_network_header(skb)) -
+ sizeof(*pip6);
+ mldlen = skb_tail_pointer(skb) - skb_transport_header(skb);
pip6->payload_len = htons(payload_len);
- pmr->csum = csum_ipv6_magic(&pip6->saddr, &pip6->daddr, mldlen,
- IPPROTO_ICMPV6, csum_partial(skb->h.raw, mldlen, 0));
- err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dev,
- mld_dev_queue_xmit);
+ pmr->mld2r_cksum = csum_ipv6_magic(&pip6->saddr, &pip6->daddr, mldlen,
+ IPPROTO_ICMPV6,
+ csum_partial(skb_transport_header(skb),
+ mldlen, 0));
+
+ icmpv6_flow_init(net->ipv6.igmp_sk, &fl6, ICMPV6_MLD2_REPORT,
+ &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr,
+ skb->dev->ifindex);
+ dst = icmp6_dst_alloc(skb->dev, &fl6);
+
+ err = 0;
+ if (IS_ERR(dst)) {
+ err = PTR_ERR(dst);
+ dst = NULL;
+ }
+ skb_dst_set(skb, dst);
+ if (err)
+ goto err_out;
+
+ err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
+ net, net->ipv6.igmp_sk, skb, NULL, skb->dev,
+ dst_output);
+out:
if (!err) {
- ICMP6_INC_STATS(idev,ICMP6_MIB_OUTMSGS);
- IP6_INC_STATS(idev, IPSTATS_MIB_OUTMCASTPKTS);
- } else
- IP6_INC_STATS(idev, IPSTATS_MIB_OUTDISCARDS);
+ ICMP6MSGOUT_INC_STATS(net, idev, ICMPV6_MLD2_REPORT);
+ ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
+ } else {
+ IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
+ }
- if (likely(idev != NULL))
- in6_dev_put(idev);
+ rcu_read_unlock();
+ return;
+
+err_out:
+ kfree_skb(skb);
+ goto out;
}
static int grec_size(struct ifmcaddr6 *pmc, int type, int gdel, int sdel)
@@ -1491,42 +1877,51 @@ static int grec_size(struct ifmcaddr6 *pmc, int type, int gdel, int sdel)
}
static struct sk_buff *add_grhead(struct sk_buff *skb, struct ifmcaddr6 *pmc,
- int type, struct mld2_grec **ppgr)
+ int type, struct mld2_grec **ppgr, unsigned int mtu)
{
- struct net_device *dev = pmc->idev->dev;
struct mld2_report *pmr;
struct mld2_grec *pgr;
- if (!skb)
- skb = mld_newpack(dev, dev->mtu);
- if (!skb)
- return NULL;
- pgr = (struct mld2_grec *)skb_put(skb, sizeof(struct mld2_grec));
+ if (!skb) {
+ skb = mld_newpack(pmc->idev, mtu);
+ if (!skb)
+ return NULL;
+ }
+ pgr = skb_put(skb, sizeof(struct mld2_grec));
pgr->grec_type = type;
pgr->grec_auxwords = 0;
pgr->grec_nsrcs = 0;
pgr->grec_mca = pmc->mca_addr; /* structure copy */
- pmr = (struct mld2_report *)skb->h.raw;
- pmr->ngrec = htons(ntohs(pmr->ngrec)+1);
+ pmr = (struct mld2_report *)skb_transport_header(skb);
+ pmr->mld2r_ngrec = htons(ntohs(pmr->mld2r_ngrec)+1);
*ppgr = pgr;
return skb;
}
-#define AVAILABLE(skb) ((skb) ? ((skb)->dev ? (skb)->dev->mtu - (skb)->len : \
- skb_tailroom(skb)) : 0)
+#define AVAILABLE(skb) ((skb) ? skb_availroom(skb) : 0)
static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc,
- int type, int gdeleted, int sdeleted)
+ int type, int gdeleted, int sdeleted,
+ int crsend)
{
- struct net_device *dev = pmc->idev->dev;
- struct mld2_report *pmr;
- struct mld2_grec *pgr = NULL;
- struct ip6_sf_list *psf, *psf_next, *psf_prev, **psf_list;
+ struct ip6_sf_list *psf, *psf_prev, *psf_next;
int scount, stotal, first, isquery, truncate;
+ struct ip6_sf_list __rcu **psf_list;
+ struct inet6_dev *idev = pmc->idev;
+ struct net_device *dev = idev->dev;
+ struct mld2_grec *pgr = NULL;
+ struct mld2_report *pmr;
+ unsigned int mtu;
+
+ mc_assert_locked(idev);
if (pmc->mca_flags & MAF_NOREPORT)
return skb;
+ mtu = READ_ONCE(dev->mtu);
+ if (mtu < IPV6_MIN_MTU)
+ return skb;
+
isquery = type == MLD2_MODE_IS_INCLUDE ||
type == MLD2_MODE_IS_EXCLUDE;
truncate = type == MLD2_MODE_IS_EXCLUDE ||
@@ -1536,32 +1931,43 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc,
psf_list = sdeleted ? &pmc->mca_tomb : &pmc->mca_sources;
- if (!*psf_list)
+ if (!rcu_access_pointer(*psf_list))
goto empty_source;
- pmr = skb ? (struct mld2_report *)skb->h.raw : NULL;
+ pmr = skb ? (struct mld2_report *)skb_transport_header(skb) : NULL;
/* EX and TO_EX get a fresh packet, if needed */
if (truncate) {
- if (pmr && pmr->ngrec &&
+ if (pmr && pmr->mld2r_ngrec &&
AVAILABLE(skb) < grec_size(pmc, type, gdeleted, sdeleted)) {
if (skb)
mld_sendpack(skb);
- skb = mld_newpack(dev, dev->mtu);
+ skb = mld_newpack(idev, mtu);
}
}
first = 1;
psf_prev = NULL;
- for (psf=*psf_list; psf; psf=psf_next) {
+ for (psf = mc_dereference(*psf_list, idev);
+ psf;
+ psf = psf_next) {
struct in6_addr *psrc;
- psf_next = psf->sf_next;
+ psf_next = mc_dereference(psf->sf_next, idev);
- if (!is_in(pmc, psf, type, gdeleted, sdeleted)) {
+ if (!is_in(pmc, psf, type, gdeleted, sdeleted) && !crsend) {
psf_prev = psf;
continue;
}
+ /* Based on RFC3810 6.1. Should not send source-list change
+ * records when there is a filter mode change.
+ */
+ if (((gdeleted && pmc->mca_sfmode == MCAST_EXCLUDE) ||
+ (!gdeleted && pmc->mca_crcount)) &&
+ (type == MLD2_ALLOW_NEW_SOURCES ||
+ type == MLD2_BLOCK_OLD_SOURCES) && psf->sf_crcount)
+ goto decrease_sf_crcount;
+
/* clear marks on query responses */
if (isquery)
psf->sf_gsresp = 0;
@@ -1574,26 +1980,31 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc,
pgr->grec_nsrcs = htons(scount);
if (skb)
mld_sendpack(skb);
- skb = mld_newpack(dev, dev->mtu);
+ skb = mld_newpack(idev, mtu);
first = 1;
scount = 0;
}
if (first) {
- skb = add_grhead(skb, pmc, type, &pgr);
+ skb = add_grhead(skb, pmc, type, &pgr, mtu);
first = 0;
}
- psrc = (struct in6_addr *)skb_put(skb, sizeof(*psrc));
+ if (!skb)
+ return NULL;
+ psrc = skb_put(skb, sizeof(*psrc));
*psrc = psf->sf_addr;
scount++; stotal++;
if ((type == MLD2_ALLOW_NEW_SOURCES ||
type == MLD2_BLOCK_OLD_SOURCES) && psf->sf_crcount) {
+decrease_sf_crcount:
psf->sf_crcount--;
if ((sdeleted || gdeleted) && psf->sf_crcount == 0) {
if (psf_prev)
- psf_prev->sf_next = psf->sf_next;
+ rcu_assign_pointer(psf_prev->sf_next,
+ mc_dereference(psf->sf_next, idev));
else
- *psf_list = psf->sf_next;
- kfree(psf);
+ rcu_assign_pointer(*psf_list,
+ mc_dereference(psf->sf_next, idev));
+ kfree_rcu(psf, rcu);
continue;
}
}
@@ -1605,13 +2016,13 @@ empty_source:
if (type == MLD2_ALLOW_NEW_SOURCES ||
type == MLD2_BLOCK_OLD_SOURCES)
return skb;
- if (pmc->mca_crcount || isquery) {
+ if (pmc->mca_crcount || isquery || crsend) {
/* make sure we have room for group header */
if (skb && AVAILABLE(skb) < sizeof(struct mld2_grec)) {
mld_sendpack(skb);
skb = NULL; /* add_grhead will get a new one */
}
- skb = add_grhead(skb, pmc, type, &pgr);
+ skb = add_grhead(skb, pmc, type, &pgr, mtu);
}
}
if (pgr)
@@ -1627,51 +2038,50 @@ static void mld_send_report(struct inet6_dev *idev, struct ifmcaddr6 *pmc)
struct sk_buff *skb = NULL;
int type;
+ mc_assert_locked(idev);
+
if (!pmc) {
- read_lock_bh(&idev->lock);
- for (pmc=idev->mc_list; pmc; pmc=pmc->next) {
+ for_each_mc_mclock(idev, pmc) {
if (pmc->mca_flags & MAF_NOREPORT)
continue;
- spin_lock_bh(&pmc->mca_lock);
if (pmc->mca_sfcount[MCAST_EXCLUDE])
type = MLD2_MODE_IS_EXCLUDE;
else
type = MLD2_MODE_IS_INCLUDE;
- skb = add_grec(skb, pmc, type, 0, 0);
- spin_unlock_bh(&pmc->mca_lock);
+ skb = add_grec(skb, pmc, type, 0, 0, 0);
}
- read_unlock_bh(&idev->lock);
} else {
- spin_lock_bh(&pmc->mca_lock);
if (pmc->mca_sfcount[MCAST_EXCLUDE])
type = MLD2_MODE_IS_EXCLUDE;
else
type = MLD2_MODE_IS_INCLUDE;
- skb = add_grec(skb, pmc, type, 0, 0);
- spin_unlock_bh(&pmc->mca_lock);
+ skb = add_grec(skb, pmc, type, 0, 0, 0);
}
if (skb)
mld_sendpack(skb);
}
-/*
- * remove zero-count source records from a source filter list
- */
-static void mld_clear_zeros(struct ip6_sf_list **ppsf)
+/* remove zero-count source records from a source filter list */
+static void mld_clear_zeros(struct ip6_sf_list __rcu **ppsf, struct inet6_dev *idev)
{
struct ip6_sf_list *psf_prev, *psf_next, *psf;
psf_prev = NULL;
- for (psf=*ppsf; psf; psf = psf_next) {
- psf_next = psf->sf_next;
+ for (psf = mc_dereference(*ppsf, idev);
+ psf;
+ psf = psf_next) {
+ psf_next = mc_dereference(psf->sf_next, idev);
if (psf->sf_crcount == 0) {
if (psf_prev)
- psf_prev->sf_next = psf->sf_next;
+ rcu_assign_pointer(psf_prev->sf_next,
+ mc_dereference(psf->sf_next, idev));
else
- *ppsf = psf->sf_next;
- kfree(psf);
- } else
+ rcu_assign_pointer(*ppsf,
+ mc_dereference(psf->sf_next, idev));
+ kfree_rcu(psf, rcu);
+ } else {
psf_prev = psf;
+ }
}
}
@@ -1681,46 +2091,44 @@ static void mld_send_cr(struct inet6_dev *idev)
struct sk_buff *skb = NULL;
int type, dtype;
- read_lock_bh(&idev->lock);
- write_lock_bh(&idev->mc_lock);
-
/* deleted MCA's */
pmc_prev = NULL;
- for (pmc=idev->mc_tomb; pmc; pmc=pmc_next) {
- pmc_next = pmc->next;
+ for (pmc = mc_dereference(idev->mc_tomb, idev);
+ pmc;
+ pmc = pmc_next) {
+ pmc_next = mc_dereference(pmc->next, idev);
if (pmc->mca_sfmode == MCAST_INCLUDE) {
type = MLD2_BLOCK_OLD_SOURCES;
dtype = MLD2_BLOCK_OLD_SOURCES;
- skb = add_grec(skb, pmc, type, 1, 0);
- skb = add_grec(skb, pmc, dtype, 1, 1);
+ skb = add_grec(skb, pmc, type, 1, 0, 0);
+ skb = add_grec(skb, pmc, dtype, 1, 1, 0);
}
if (pmc->mca_crcount) {
if (pmc->mca_sfmode == MCAST_EXCLUDE) {
type = MLD2_CHANGE_TO_INCLUDE;
- skb = add_grec(skb, pmc, type, 1, 0);
+ skb = add_grec(skb, pmc, type, 1, 0, 0);
}
pmc->mca_crcount--;
if (pmc->mca_crcount == 0) {
- mld_clear_zeros(&pmc->mca_tomb);
- mld_clear_zeros(&pmc->mca_sources);
+ mld_clear_zeros(&pmc->mca_tomb, idev);
+ mld_clear_zeros(&pmc->mca_sources, idev);
}
}
- if (pmc->mca_crcount == 0 && !pmc->mca_tomb &&
- !pmc->mca_sources) {
+ if (pmc->mca_crcount == 0 &&
+ !rcu_access_pointer(pmc->mca_tomb) &&
+ !rcu_access_pointer(pmc->mca_sources)) {
if (pmc_prev)
- pmc_prev->next = pmc_next;
+ rcu_assign_pointer(pmc_prev->next, pmc_next);
else
- idev->mc_tomb = pmc_next;
+ rcu_assign_pointer(idev->mc_tomb, pmc_next);
in6_dev_put(pmc->idev);
- kfree(pmc);
+ kfree_rcu(pmc, rcu);
} else
pmc_prev = pmc;
}
- write_unlock_bh(&idev->mc_lock);
/* change recs */
- for (pmc=idev->mc_list; pmc; pmc=pmc->next) {
- spin_lock_bh(&pmc->mca_lock);
+ for_each_mc_mclock(idev, pmc) {
if (pmc->mca_sfcount[MCAST_EXCLUDE]) {
type = MLD2_BLOCK_OLD_SOURCES;
dtype = MLD2_ALLOW_NEW_SOURCES;
@@ -1728,8 +2136,8 @@ static void mld_send_cr(struct inet6_dev *idev)
type = MLD2_ALLOW_NEW_SOURCES;
dtype = MLD2_BLOCK_OLD_SOURCES;
}
- skb = add_grec(skb, pmc, type, 0, 0);
- skb = add_grec(skb, pmc, dtype, 0, 1); /* deleted sources */
+ skb = add_grec(skb, pmc, type, 0, 0, 0);
+ skb = add_grec(skb, pmc, dtype, 0, 1, 0); /* deleted sources */
/* filter mode changes */
if (pmc->mca_crcount) {
@@ -1737,12 +2145,10 @@ static void mld_send_cr(struct inet6_dev *idev)
type = MLD2_CHANGE_TO_EXCLUDE;
else
type = MLD2_CHANGE_TO_INCLUDE;
- skb = add_grec(skb, pmc, type, 0, 0);
+ skb = add_grec(skb, pmc, type, 0, 0, 0);
pmc->mca_crcount--;
}
- spin_unlock_bh(&pmc->mca_lock);
}
- read_unlock_bh(&idev->lock);
if (!skb)
return;
(void) mld_sendpack(skb);
@@ -1750,95 +2156,162 @@ static void mld_send_cr(struct inet6_dev *idev)
static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type)
{
- struct sock *sk = igmp6_socket->sk;
- struct inet6_dev *idev;
- struct sk_buff *skb;
- struct icmp6hdr *hdr;
- struct in6_addr *snd_addr;
- struct in6_addr *addrp;
- struct in6_addr addr_buf;
- struct in6_addr all_routers;
+ const struct in6_addr *snd_addr, *saddr;
int err, len, payload_len, full_len;
+ struct in6_addr addr_buf;
+ struct inet6_dev *idev;
+ struct sk_buff *skb;
+ struct mld_msg *hdr;
+ int hlen = LL_RESERVED_SPACE(dev);
+ int tlen = dev->needed_tailroom;
u8 ra[8] = { IPPROTO_ICMPV6, 0,
IPV6_TLV_ROUTERALERT, 2, 0, 0,
IPV6_TLV_PADN, 0 };
+ struct dst_entry *dst;
+ struct flowi6 fl6;
+ struct net *net;
+ struct sock *sk;
- rcu_read_lock();
- IP6_INC_STATS(__in6_dev_get(dev),
- IPSTATS_MIB_OUTREQUESTS);
- rcu_read_unlock();
- snd_addr = addr;
- if (type == ICMPV6_MGM_REDUCTION) {
- snd_addr = &all_routers;
- ipv6_addr_all_routers(&all_routers);
- }
+ if (type == ICMPV6_MGM_REDUCTION)
+ snd_addr = &in6addr_linklocal_allrouters;
+ else
+ snd_addr = addr;
len = sizeof(struct icmp6hdr) + sizeof(struct in6_addr);
payload_len = len + sizeof(ra);
full_len = sizeof(struct ipv6hdr) + payload_len;
- skb = sock_alloc_send_skb(sk, LL_RESERVED_SPACE(dev) + full_len, 1, &err);
+ skb = alloc_skb(hlen + tlen + full_len, GFP_KERNEL);
- if (skb == NULL) {
- rcu_read_lock();
- IP6_INC_STATS(__in6_dev_get(dev),
- IPSTATS_MIB_OUTDISCARDS);
+ rcu_read_lock();
+
+ net = dev_net_rcu(dev);
+ idev = __in6_dev_get(dev);
+ IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS);
+ if (!skb) {
+ IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
rcu_read_unlock();
return;
}
+ sk = net->ipv6.igmp_sk;
+ skb_set_owner_w(skb, sk);
- skb_reserve(skb, LL_RESERVED_SPACE(dev));
+ skb->priority = TC_PRIO_CONTROL;
+ skb_reserve(skb, hlen);
- if (ipv6_get_lladdr(dev, &addr_buf)) {
+ if (ipv6_get_lladdr(dev, &addr_buf, IFA_F_TENTATIVE)) {
/* <draft-ietf-magma-mld-source-05.txt>:
- * use unspecified address as the source address
+ * use unspecified address as the source address
* when a valid link-local address is not available.
*/
- memset(&addr_buf, 0, sizeof(addr_buf));
- }
-
- ip6_nd_hdr(sk, skb, dev, &addr_buf, snd_addr, NEXTHDR_HOP, payload_len);
+ saddr = &in6addr_any;
+ } else
+ saddr = &addr_buf;
- memcpy(skb_put(skb, sizeof(ra)), ra, sizeof(ra));
+ ip6_mc_hdr(sk, skb, dev, saddr, snd_addr, NEXTHDR_HOP, payload_len);
- hdr = (struct icmp6hdr *) skb_put(skb, sizeof(struct icmp6hdr));
- memset(hdr, 0, sizeof(struct icmp6hdr));
- hdr->icmp6_type = type;
+ skb_put_data(skb, ra, sizeof(ra));
- addrp = (struct in6_addr *) skb_put(skb, sizeof(struct in6_addr));
- ipv6_addr_copy(addrp, addr);
+ hdr = skb_put_zero(skb, sizeof(struct mld_msg));
+ hdr->mld_type = type;
+ hdr->mld_mca = *addr;
- hdr->icmp6_cksum = csum_ipv6_magic(&addr_buf, snd_addr, len,
- IPPROTO_ICMPV6,
- csum_partial((__u8 *) hdr, len, 0));
+ hdr->mld_cksum = csum_ipv6_magic(saddr, snd_addr, len,
+ IPPROTO_ICMPV6,
+ csum_partial(hdr, len, 0));
- idev = in6_dev_get(skb->dev);
+ icmpv6_flow_init(sk, &fl6, type,
+ &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr,
+ skb->dev->ifindex);
+ dst = icmp6_dst_alloc(skb->dev, &fl6);
+ if (IS_ERR(dst)) {
+ err = PTR_ERR(dst);
+ goto err_out;
+ }
- err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dev,
- mld_dev_queue_xmit);
+ skb_dst_set(skb, dst);
+ err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
+ net, sk, skb, NULL, skb->dev,
+ dst_output);
+out:
if (!err) {
- if (type == ICMPV6_MGM_REDUCTION)
- ICMP6_INC_STATS(idev, ICMP6_MIB_OUTGROUPMEMBREDUCTIONS);
- else
- ICMP6_INC_STATS(idev, ICMP6_MIB_OUTGROUPMEMBRESPONSES);
- ICMP6_INC_STATS(idev, ICMP6_MIB_OUTMSGS);
- IP6_INC_STATS(idev, IPSTATS_MIB_OUTMCASTPKTS);
+ ICMP6MSGOUT_INC_STATS(net, idev, type);
+ ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
} else
- IP6_INC_STATS(idev, IPSTATS_MIB_OUTDISCARDS);
+ IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
- if (likely(idev != NULL))
- in6_dev_put(idev);
+ rcu_read_unlock();
return;
+
+err_out:
+ kfree_skb(skb);
+ goto out;
+}
+
+static void mld_send_initial_cr(struct inet6_dev *idev)
+{
+ struct ifmcaddr6 *pmc;
+ struct sk_buff *skb;
+ int type;
+
+ mc_assert_locked(idev);
+
+ if (mld_in_v1_mode(idev))
+ return;
+
+ skb = NULL;
+ for_each_mc_mclock(idev, pmc) {
+ if (pmc->mca_sfcount[MCAST_EXCLUDE])
+ type = MLD2_CHANGE_TO_EXCLUDE;
+ else
+ type = MLD2_ALLOW_NEW_SOURCES;
+ skb = add_grec(skb, pmc, type, 0, 0, 1);
+ }
+ if (skb)
+ mld_sendpack(skb);
+}
+
+void ipv6_mc_dad_complete(struct inet6_dev *idev)
+{
+ mutex_lock(&idev->mc_lock);
+ idev->mc_dad_count = idev->mc_qrv;
+ if (idev->mc_dad_count) {
+ mld_send_initial_cr(idev);
+ idev->mc_dad_count--;
+ if (idev->mc_dad_count)
+ mld_dad_start_work(idev,
+ unsolicited_report_interval(idev));
+ }
+ mutex_unlock(&idev->mc_lock);
+}
+
+static void mld_dad_work(struct work_struct *work)
+{
+ struct inet6_dev *idev = container_of(to_delayed_work(work),
+ struct inet6_dev,
+ mc_dad_work);
+ mutex_lock(&idev->mc_lock);
+ mld_send_initial_cr(idev);
+ if (idev->mc_dad_count) {
+ idev->mc_dad_count--;
+ if (idev->mc_dad_count)
+ mld_dad_start_work(idev,
+ unsolicited_report_interval(idev));
+ }
+ mutex_unlock(&idev->mc_lock);
+ in6_dev_put(idev);
}
static int ip6_mc_del1_src(struct ifmcaddr6 *pmc, int sfmode,
- struct in6_addr *psfsrc)
+ const struct in6_addr *psfsrc)
{
struct ip6_sf_list *psf, *psf_prev;
int rv = 0;
+ mc_assert_locked(pmc->idev);
+
psf_prev = NULL;
- for (psf=pmc->mca_sources; psf; psf=psf->sf_next) {
+ for_each_psf_mclock(pmc, psf) {
if (ipv6_addr_equal(&psf->sf_addr, psfsrc))
break;
psf_prev = psf;
@@ -1847,29 +2320,34 @@ static int ip6_mc_del1_src(struct ifmcaddr6 *pmc, int sfmode,
/* source filter not found, or count wrong => bug */
return -ESRCH;
}
- psf->sf_count[sfmode]--;
+ WRITE_ONCE(psf->sf_count[sfmode], psf->sf_count[sfmode] - 1);
if (!psf->sf_count[MCAST_INCLUDE] && !psf->sf_count[MCAST_EXCLUDE]) {
struct inet6_dev *idev = pmc->idev;
/* no more filters for this source */
if (psf_prev)
- psf_prev->sf_next = psf->sf_next;
+ rcu_assign_pointer(psf_prev->sf_next,
+ mc_dereference(psf->sf_next, idev));
else
- pmc->mca_sources = psf->sf_next;
+ rcu_assign_pointer(pmc->mca_sources,
+ mc_dereference(psf->sf_next, idev));
+
if (psf->sf_oldin && !(pmc->mca_flags & MAF_NOREPORT) &&
- !MLD_V1_SEEN(idev)) {
+ !mld_in_v1_mode(idev)) {
psf->sf_crcount = idev->mc_qrv;
- psf->sf_next = pmc->mca_tomb;
- pmc->mca_tomb = psf;
+ rcu_assign_pointer(psf->sf_next,
+ mc_dereference(pmc->mca_tomb, idev));
+ rcu_assign_pointer(pmc->mca_tomb, psf);
rv = 1;
- } else
- kfree(psf);
+ } else {
+ kfree_rcu(psf, rcu);
+ }
}
return rv;
}
-static int ip6_mc_del_src(struct inet6_dev *idev, struct in6_addr *pmca,
- int sfmode, int sfcount, struct in6_addr *psfsrc,
+static int ip6_mc_del_src(struct inet6_dev *idev, const struct in6_addr *pmca,
+ int sfmode, int sfcount, const struct in6_addr *psfsrc,
int delta)
{
struct ifmcaddr6 *pmc;
@@ -1878,28 +2356,25 @@ static int ip6_mc_del_src(struct inet6_dev *idev, struct in6_addr *pmca,
if (!idev)
return -ENODEV;
- read_lock_bh(&idev->lock);
- for (pmc=idev->mc_list; pmc; pmc=pmc->next) {
+
+ mc_assert_locked(idev);
+
+ for_each_mc_mclock(idev, pmc) {
if (ipv6_addr_equal(pmca, &pmc->mca_addr))
break;
}
- if (!pmc) {
- /* MCA not found?? bug */
- read_unlock_bh(&idev->lock);
+ if (!pmc)
return -ESRCH;
- }
- spin_lock_bh(&pmc->mca_lock);
+
sf_markstate(pmc);
if (!delta) {
- if (!pmc->mca_sfcount[sfmode]) {
- spin_unlock_bh(&pmc->mca_lock);
- read_unlock_bh(&idev->lock);
+ if (!pmc->mca_sfcount[sfmode])
return -EINVAL;
- }
+
pmc->mca_sfcount[sfmode]--;
}
err = 0;
- for (i=0; i<sfcount; i++) {
+ for (i = 0; i < sfcount; i++) {
int rv = ip6_mc_del1_src(pmc, sfmode, &psfsrc[i]);
changerec |= rv > 0;
@@ -1915,68 +2390,75 @@ static int ip6_mc_del_src(struct inet6_dev *idev, struct in6_addr *pmca,
pmc->mca_sfmode = MCAST_INCLUDE;
pmc->mca_crcount = idev->mc_qrv;
idev->mc_ifc_count = pmc->mca_crcount;
- for (psf=pmc->mca_sources; psf; psf = psf->sf_next)
+ for_each_psf_mclock(pmc, psf)
psf->sf_crcount = 0;
mld_ifc_event(pmc->idev);
- } else if (sf_setstate(pmc) || changerec)
+ } else if (sf_setstate(pmc) || changerec) {
mld_ifc_event(pmc->idev);
- spin_unlock_bh(&pmc->mca_lock);
- read_unlock_bh(&idev->lock);
+ }
+
return err;
}
-/*
- * Add multicast single-source filter to the interface list
- */
+/* Add multicast single-source filter to the interface list */
static int ip6_mc_add1_src(struct ifmcaddr6 *pmc, int sfmode,
- struct in6_addr *psfsrc, int delta)
+ const struct in6_addr *psfsrc)
{
struct ip6_sf_list *psf, *psf_prev;
+ mc_assert_locked(pmc->idev);
+
psf_prev = NULL;
- for (psf=pmc->mca_sources; psf; psf=psf->sf_next) {
+ for_each_psf_mclock(pmc, psf) {
if (ipv6_addr_equal(&psf->sf_addr, psfsrc))
break;
psf_prev = psf;
}
if (!psf) {
- psf = kzalloc(sizeof(*psf), GFP_ATOMIC);
+ psf = kzalloc(sizeof(*psf), GFP_KERNEL);
if (!psf)
return -ENOBUFS;
psf->sf_addr = *psfsrc;
if (psf_prev) {
- psf_prev->sf_next = psf;
- } else
- pmc->mca_sources = psf;
+ rcu_assign_pointer(psf_prev->sf_next, psf);
+ } else {
+ rcu_assign_pointer(pmc->mca_sources, psf);
+ }
}
- psf->sf_count[sfmode]++;
+ WRITE_ONCE(psf->sf_count[sfmode], psf->sf_count[sfmode] + 1);
return 0;
}
static void sf_markstate(struct ifmcaddr6 *pmc)
{
- struct ip6_sf_list *psf;
int mca_xcount = pmc->mca_sfcount[MCAST_EXCLUDE];
+ struct ip6_sf_list *psf;
+
+ mc_assert_locked(pmc->idev);
- for (psf=pmc->mca_sources; psf; psf=psf->sf_next)
+ for_each_psf_mclock(pmc, psf) {
if (pmc->mca_sfcount[MCAST_EXCLUDE]) {
psf->sf_oldin = mca_xcount ==
psf->sf_count[MCAST_EXCLUDE] &&
!psf->sf_count[MCAST_INCLUDE];
- } else
+ } else {
psf->sf_oldin = psf->sf_count[MCAST_INCLUDE] != 0;
+ }
+ }
}
static int sf_setstate(struct ifmcaddr6 *pmc)
{
- struct ip6_sf_list *psf, *dpsf;
int mca_xcount = pmc->mca_sfcount[MCAST_EXCLUDE];
+ struct ip6_sf_list *psf, *dpsf;
int qrv = pmc->idev->mc_qrv;
int new_in, rv;
+ mc_assert_locked(pmc->idev);
+
rv = 0;
- for (psf=pmc->mca_sources; psf; psf=psf->sf_next) {
+ for_each_psf_mclock(pmc, psf) {
if (pmc->mca_sfcount[MCAST_EXCLUDE]) {
new_in = mca_xcount == psf->sf_count[MCAST_EXCLUDE] &&
!psf->sf_count[MCAST_INCLUDE];
@@ -1986,8 +2468,7 @@ static int sf_setstate(struct ifmcaddr6 *pmc)
if (!psf->sf_oldin) {
struct ip6_sf_list *prev = NULL;
- for (dpsf=pmc->mca_tomb; dpsf;
- dpsf=dpsf->sf_next) {
+ for_each_psf_tomb(pmc, dpsf) {
if (ipv6_addr_equal(&dpsf->sf_addr,
&psf->sf_addr))
break;
@@ -1995,10 +2476,14 @@ static int sf_setstate(struct ifmcaddr6 *pmc)
}
if (dpsf) {
if (prev)
- prev->sf_next = dpsf->sf_next;
+ rcu_assign_pointer(prev->sf_next,
+ mc_dereference(dpsf->sf_next,
+ pmc->idev));
else
- pmc->mca_tomb = dpsf->sf_next;
- kfree(dpsf);
+ rcu_assign_pointer(pmc->mca_tomb,
+ mc_dereference(dpsf->sf_next,
+ pmc->idev));
+ kfree_rcu(dpsf, rcu);
}
psf->sf_crcount = qrv;
rv++;
@@ -2009,19 +2494,19 @@ static int sf_setstate(struct ifmcaddr6 *pmc)
* add or update "delete" records if an active filter
* is now inactive
*/
- for (dpsf=pmc->mca_tomb; dpsf; dpsf=dpsf->sf_next)
+
+ for_each_psf_tomb(pmc, dpsf)
if (ipv6_addr_equal(&dpsf->sf_addr,
&psf->sf_addr))
break;
if (!dpsf) {
- dpsf = (struct ip6_sf_list *)
- kmalloc(sizeof(*dpsf), GFP_ATOMIC);
+ dpsf = kmalloc(sizeof(*dpsf), GFP_KERNEL);
if (!dpsf)
continue;
*dpsf = *psf;
- /* pmc->mca_lock held by callers */
- dpsf->sf_next = pmc->mca_tomb;
- pmc->mca_tomb = dpsf;
+ rcu_assign_pointer(dpsf->sf_next,
+ mc_dereference(pmc->mca_tomb, pmc->idev));
+ rcu_assign_pointer(pmc->mca_tomb, dpsf);
}
dpsf->sf_crcount = qrv;
rv++;
@@ -2030,11 +2515,9 @@ static int sf_setstate(struct ifmcaddr6 *pmc)
return rv;
}
-/*
- * Add multicast source filter list to the interface list
- */
-static int ip6_mc_add_src(struct inet6_dev *idev, struct in6_addr *pmca,
- int sfmode, int sfcount, struct in6_addr *psfsrc,
+/* Add multicast source filter list to the interface list */
+static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca,
+ int sfmode, int sfcount, const struct in6_addr *psfsrc,
int delta)
{
struct ifmcaddr6 *pmc;
@@ -2043,25 +2526,24 @@ static int ip6_mc_add_src(struct inet6_dev *idev, struct in6_addr *pmca,
if (!idev)
return -ENODEV;
- read_lock_bh(&idev->lock);
- for (pmc=idev->mc_list; pmc; pmc=pmc->next) {
+
+ mc_assert_locked(idev);
+
+ for_each_mc_mclock(idev, pmc) {
if (ipv6_addr_equal(pmca, &pmc->mca_addr))
break;
}
- if (!pmc) {
- /* MCA not found?? bug */
- read_unlock_bh(&idev->lock);
+ if (!pmc)
return -ESRCH;
- }
- spin_lock_bh(&pmc->mca_lock);
sf_markstate(pmc);
isexclude = pmc->mca_sfmode == MCAST_EXCLUDE;
if (!delta)
- pmc->mca_sfcount[sfmode]++;
+ WRITE_ONCE(pmc->mca_sfcount[sfmode],
+ pmc->mca_sfcount[sfmode] + 1);
err = 0;
- for (i=0; i<sfcount; i++) {
- err = ip6_mc_add1_src(pmc, sfmode, &psfsrc[i], delta);
+ for (i = 0; i < sfcount; i++) {
+ err = ip6_mc_add1_src(pmc, sfmode, &psfsrc[i]);
if (err)
break;
}
@@ -2069,11 +2551,11 @@ static int ip6_mc_add_src(struct inet6_dev *idev, struct in6_addr *pmca,
int j;
if (!delta)
- pmc->mca_sfcount[sfmode]--;
- for (j=0; j<i; j++)
- (void) ip6_mc_del1_src(pmc, sfmode, &psfsrc[i]);
+ WRITE_ONCE(pmc->mca_sfcount[sfmode],
+ pmc->mca_sfcount[sfmode] - 1);
+ for (j = 0; j < i; j++)
+ ip6_mc_del1_src(pmc, sfmode, &psfsrc[j]);
} else if (isexclude != (pmc->mca_sfcount[MCAST_EXCLUDE] != 0)) {
- struct inet6_dev *idev = pmc->idev;
struct ip6_sf_list *psf;
/* filter mode change */
@@ -2085,13 +2567,12 @@ static int ip6_mc_add_src(struct inet6_dev *idev, struct in6_addr *pmca,
pmc->mca_crcount = idev->mc_qrv;
idev->mc_ifc_count = pmc->mca_crcount;
- for (psf=pmc->mca_sources; psf; psf = psf->sf_next)
+ for_each_psf_mclock(pmc, psf)
psf->sf_crcount = 0;
mld_ifc_event(idev);
- } else if (sf_setstate(pmc))
+ } else if (sf_setstate(pmc)) {
mld_ifc_event(idev);
- spin_unlock_bh(&pmc->mca_lock);
- read_unlock_bh(&idev->lock);
+ }
return err;
}
@@ -2099,146 +2580,208 @@ static void ip6_mc_clear_src(struct ifmcaddr6 *pmc)
{
struct ip6_sf_list *psf, *nextpsf;
- for (psf=pmc->mca_tomb; psf; psf=nextpsf) {
- nextpsf = psf->sf_next;
- kfree(psf);
+ mc_assert_locked(pmc->idev);
+
+ for (psf = mc_dereference(pmc->mca_tomb, pmc->idev);
+ psf;
+ psf = nextpsf) {
+ nextpsf = mc_dereference(psf->sf_next, pmc->idev);
+ kfree_rcu(psf, rcu);
}
- pmc->mca_tomb = NULL;
- for (psf=pmc->mca_sources; psf; psf=nextpsf) {
- nextpsf = psf->sf_next;
- kfree(psf);
+ RCU_INIT_POINTER(pmc->mca_tomb, NULL);
+ for (psf = mc_dereference(pmc->mca_sources, pmc->idev);
+ psf;
+ psf = nextpsf) {
+ nextpsf = mc_dereference(psf->sf_next, pmc->idev);
+ kfree_rcu(psf, rcu);
}
- pmc->mca_sources = NULL;
+ RCU_INIT_POINTER(pmc->mca_sources, NULL);
pmc->mca_sfmode = MCAST_EXCLUDE;
pmc->mca_sfcount[MCAST_INCLUDE] = 0;
- pmc->mca_sfcount[MCAST_EXCLUDE] = 1;
+ /* Paired with the READ_ONCE() from ipv6_chk_mcast_addr() */
+ WRITE_ONCE(pmc->mca_sfcount[MCAST_EXCLUDE], 1);
}
-
static void igmp6_join_group(struct ifmcaddr6 *ma)
{
unsigned long delay;
+ mc_assert_locked(ma->idev);
+
if (ma->mca_flags & MAF_NOREPORT)
return;
igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REPORT);
- delay = net_random() % IGMP6_UNSOLICITED_IVAL;
+ delay = get_random_u32_below(unsolicited_report_interval(ma->idev));
- spin_lock_bh(&ma->mca_lock);
- if (del_timer(&ma->mca_timer)) {
- atomic_dec(&ma->mca_refcnt);
- delay = ma->mca_timer.expires - jiffies;
+ if (cancel_delayed_work(&ma->mca_work)) {
+ refcount_dec(&ma->mca_refcnt);
+ delay = ma->mca_work.timer.expires - jiffies;
}
- if (!mod_timer(&ma->mca_timer, jiffies + delay))
- atomic_inc(&ma->mca_refcnt);
+ if (!mod_delayed_work(mld_wq, &ma->mca_work, delay))
+ refcount_inc(&ma->mca_refcnt);
ma->mca_flags |= MAF_TIMER_RUNNING | MAF_LAST_REPORTER;
- spin_unlock_bh(&ma->mca_lock);
}
static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml,
struct inet6_dev *idev)
{
+ struct ip6_sf_socklist *psl;
int err;
- /* callers have the socket lock and a write lock on ipv6_sk_mc_lock,
- * so no other readers or writers of iml or its sflist
- */
- if (iml->sflist == 0) {
+ psl = sock_dereference(iml->sflist, sk);
+
+ if (idev)
+ mutex_lock(&idev->mc_lock);
+
+ if (!psl) {
/* any-source empty exclude case */
- return ip6_mc_del_src(idev, &iml->addr, iml->sfmode, 0, NULL, 0);
+ err = ip6_mc_del_src(idev, &iml->addr, iml->sfmode, 0, NULL, 0);
+ } else {
+ err = ip6_mc_del_src(idev, &iml->addr, iml->sfmode,
+ psl->sl_count, psl->sl_addr, 0);
+ RCU_INIT_POINTER(iml->sflist, NULL);
+ atomic_sub(struct_size(psl, sl_addr, psl->sl_max),
+ &sk->sk_omem_alloc);
+ kfree_rcu(psl, rcu);
}
- err = ip6_mc_del_src(idev, &iml->addr, iml->sfmode,
- iml->sflist->sl_count, iml->sflist->sl_addr, 0);
- sock_kfree_s(sk, iml->sflist, IP6_SFLSIZE(iml->sflist->sl_max));
- iml->sflist = NULL;
+
+ if (idev)
+ mutex_unlock(&idev->mc_lock);
+
return err;
}
static void igmp6_leave_group(struct ifmcaddr6 *ma)
{
- if (MLD_V1_SEEN(ma->idev)) {
- if (ma->mca_flags & MAF_LAST_REPORTER)
+ mc_assert_locked(ma->idev);
+
+ if (mld_in_v1_mode(ma->idev)) {
+ if (ma->mca_flags & MAF_LAST_REPORTER) {
igmp6_send(&ma->mca_addr, ma->idev->dev,
ICMPV6_MGM_REDUCTION);
+ }
} else {
mld_add_delrec(ma->idev, ma);
mld_ifc_event(ma->idev);
}
}
-static void mld_gq_timer_expire(unsigned long data)
+static void mld_gq_work(struct work_struct *work)
{
- struct inet6_dev *idev = (struct inet6_dev *)data;
+ struct inet6_dev *idev = container_of(to_delayed_work(work),
+ struct inet6_dev,
+ mc_gq_work);
- idev->mc_gq_running = 0;
+ mutex_lock(&idev->mc_lock);
mld_send_report(idev, NULL);
- __in6_dev_put(idev);
+ idev->mc_gq_running = 0;
+ mutex_unlock(&idev->mc_lock);
+
+ in6_dev_put(idev);
}
-static void mld_ifc_timer_expire(unsigned long data)
+static void mld_ifc_work(struct work_struct *work)
{
- struct inet6_dev *idev = (struct inet6_dev *)data;
+ struct inet6_dev *idev = container_of(to_delayed_work(work),
+ struct inet6_dev,
+ mc_ifc_work);
+ mutex_lock(&idev->mc_lock);
mld_send_cr(idev);
+
if (idev->mc_ifc_count) {
idev->mc_ifc_count--;
if (idev->mc_ifc_count)
- mld_ifc_start_timer(idev, idev->mc_maxdelay);
+ mld_ifc_start_work(idev,
+ unsolicited_report_interval(idev));
}
- __in6_dev_put(idev);
+ mutex_unlock(&idev->mc_lock);
+ in6_dev_put(idev);
}
static void mld_ifc_event(struct inet6_dev *idev)
{
- if (MLD_V1_SEEN(idev))
+ mc_assert_locked(idev);
+
+ if (mld_in_v1_mode(idev))
return;
+
idev->mc_ifc_count = idev->mc_qrv;
- mld_ifc_start_timer(idev, 1);
+ mld_ifc_start_work(idev, 1);
}
-
-static void igmp6_timer_handler(unsigned long data)
+static void mld_mca_work(struct work_struct *work)
{
- struct ifmcaddr6 *ma = (struct ifmcaddr6 *) data;
+ struct ifmcaddr6 *ma = container_of(to_delayed_work(work),
+ struct ifmcaddr6, mca_work);
- if (MLD_V1_SEEN(ma->idev))
+ mutex_lock(&ma->idev->mc_lock);
+ if (mld_in_v1_mode(ma->idev))
igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REPORT);
else
mld_send_report(ma->idev, ma);
-
- spin_lock(&ma->mca_lock);
ma->mca_flags |= MAF_LAST_REPORTER;
ma->mca_flags &= ~MAF_TIMER_RUNNING;
- spin_unlock(&ma->mca_lock);
+ mutex_unlock(&ma->idev->mc_lock);
+
ma_put(ma);
}
-/* Device going down */
+/* Device changing type */
+
+void ipv6_mc_unmap(struct inet6_dev *idev)
+{
+ struct ifmcaddr6 *i;
+
+ /* Install multicast list, except for all-nodes (already installed) */
+
+ mutex_lock(&idev->mc_lock);
+ for_each_mc_mclock(idev, i)
+ igmp6_group_dropped(i);
+ mutex_unlock(&idev->mc_lock);
+}
+
+void ipv6_mc_remap(struct inet6_dev *idev)
+{
+ ipv6_mc_up(idev);
+}
+/* Device going down */
void ipv6_mc_down(struct inet6_dev *idev)
{
struct ifmcaddr6 *i;
+ mutex_lock(&idev->mc_lock);
/* Withdraw multicast list */
+ for_each_mc_mclock(idev, i)
+ igmp6_group_dropped(i);
+ mutex_unlock(&idev->mc_lock);
- read_lock_bh(&idev->lock);
- idev->mc_ifc_count = 0;
- if (del_timer(&idev->mc_ifc_timer))
- __in6_dev_put(idev);
- idev->mc_gq_running = 0;
- if (del_timer(&idev->mc_gq_timer))
- __in6_dev_put(idev);
+ /* Should stop work after group drop. or we will
+ * start work again in mld_ifc_event()
+ */
+ mld_query_stop_work(idev);
+ mld_report_stop_work(idev);
- for (i = idev->mc_list; i; i=i->next)
- igmp6_group_dropped(i);
- read_unlock_bh(&idev->lock);
+ mutex_lock(&idev->mc_lock);
+ mld_ifc_stop_work(idev);
+ mld_gq_stop_work(idev);
+ mutex_unlock(&idev->mc_lock);
- mld_clear_delrec(idev);
+ mld_dad_stop_work(idev);
}
+static void ipv6_mc_reset(struct inet6_dev *idev)
+{
+ idev->mc_qrv = sysctl_mld_qrv;
+ idev->mc_qi = MLD_QI_DEFAULT;
+ idev->mc_qri = MLD_QRI_DEFAULT;
+ idev->mc_v1_seen = 0;
+ idev->mc_maxdelay = unsolicited_report_interval(idev);
+}
/* Device going up */
@@ -2248,37 +2791,33 @@ void ipv6_mc_up(struct inet6_dev *idev)
/* Install multicast list, except for all-nodes (already installed) */
- read_lock_bh(&idev->lock);
- for (i = idev->mc_list; i; i=i->next)
+ ipv6_mc_reset(idev);
+ mutex_lock(&idev->mc_lock);
+ for_each_mc_mclock(idev, i) {
+ mld_del_delrec(idev, i);
igmp6_group_added(i);
- read_unlock_bh(&idev->lock);
+ }
+ mutex_unlock(&idev->mc_lock);
}
/* IPv6 device initialization. */
void ipv6_mc_init_dev(struct inet6_dev *idev)
{
- struct in6_addr maddr;
-
- write_lock_bh(&idev->lock);
- rwlock_init(&idev->mc_lock);
idev->mc_gq_running = 0;
- init_timer(&idev->mc_gq_timer);
- idev->mc_gq_timer.data = (unsigned long) idev;
- idev->mc_gq_timer.function = &mld_gq_timer_expire;
- idev->mc_tomb = NULL;
+ INIT_DELAYED_WORK(&idev->mc_gq_work, mld_gq_work);
+ RCU_INIT_POINTER(idev->mc_tomb, NULL);
idev->mc_ifc_count = 0;
- init_timer(&idev->mc_ifc_timer);
- idev->mc_ifc_timer.data = (unsigned long) idev;
- idev->mc_ifc_timer.function = &mld_ifc_timer_expire;
- idev->mc_qrv = MLD_QRV_DEFAULT;
- idev->mc_maxdelay = IGMP6_UNSOLICITED_IVAL;
- idev->mc_v1_seen = 0;
- write_unlock_bh(&idev->lock);
-
- /* Add all-nodes address. */
- ipv6_addr_all_nodes(&maddr);
- ipv6_dev_mc_inc(idev->dev, &maddr);
+ INIT_DELAYED_WORK(&idev->mc_ifc_work, mld_ifc_work);
+ INIT_DELAYED_WORK(&idev->mc_dad_work, mld_dad_work);
+ INIT_DELAYED_WORK(&idev->mc_query_work, mld_query_work);
+ INIT_DELAYED_WORK(&idev->mc_report_work, mld_report_work);
+ skb_queue_head_init(&idev->mc_query_queue);
+ skb_queue_head_init(&idev->mc_report_queue);
+ spin_lock_init(&idev->mc_query_lock);
+ spin_lock_init(&idev->mc_report_lock);
+ mutex_init(&idev->mc_lock);
+ ipv6_mc_reset(idev);
}
/*
@@ -2288,40 +2827,75 @@ void ipv6_mc_init_dev(struct inet6_dev *idev)
void ipv6_mc_destroy_dev(struct inet6_dev *idev)
{
struct ifmcaddr6 *i;
- struct in6_addr maddr;
- /* Deactivate timers */
+ /* Deactivate works */
ipv6_mc_down(idev);
+ mutex_lock(&idev->mc_lock);
+ mld_clear_delrec(idev);
+ mutex_unlock(&idev->mc_lock);
+ mld_clear_query(idev);
+ mld_clear_report(idev);
/* Delete all-nodes address. */
- ipv6_addr_all_nodes(&maddr);
-
/* We cannot call ipv6_dev_mc_dec() directly, our caller in
* addrconf.c has NULL'd out dev->ip6_ptr so in6_dev_get() will
* fail.
*/
- __ipv6_dev_mc_dec(idev, &maddr);
+ __ipv6_dev_mc_dec(idev, &in6addr_linklocal_allnodes);
- if (idev->cnf.forwarding) {
- ipv6_addr_all_routers(&maddr);
- __ipv6_dev_mc_dec(idev, &maddr);
- }
+ if (idev->cnf.forwarding)
+ __ipv6_dev_mc_dec(idev, &in6addr_linklocal_allrouters);
- write_lock_bh(&idev->lock);
- while ((i = idev->mc_list) != NULL) {
- idev->mc_list = i->next;
- write_unlock_bh(&idev->lock);
+ mutex_lock(&idev->mc_lock);
+ while ((i = mc_dereference(idev->mc_list, idev))) {
+ rcu_assign_pointer(idev->mc_list, mc_dereference(i->next, idev));
- igmp6_group_dropped(i);
+ ip6_mc_clear_src(i);
ma_put(i);
+ }
+ mutex_unlock(&idev->mc_lock);
+}
+
+static void ipv6_mc_rejoin_groups(struct inet6_dev *idev)
+{
+ struct ifmcaddr6 *pmc;
- write_lock_bh(&idev->lock);
+ mutex_lock(&idev->mc_lock);
+ if (mld_in_v1_mode(idev)) {
+ for_each_mc_mclock(idev, pmc)
+ igmp6_join_group(pmc);
+ } else {
+ mld_send_report(idev, NULL);
}
- write_unlock_bh(&idev->lock);
+ mutex_unlock(&idev->mc_lock);
}
+static int ipv6_mc_netdev_event(struct notifier_block *this,
+ unsigned long event,
+ void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct inet6_dev *idev = __in6_dev_get(dev);
+
+ switch (event) {
+ case NETDEV_RESEND_IGMP:
+ if (idev)
+ ipv6_mc_rejoin_groups(idev);
+ break;
+ default:
+ break;
+ }
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block igmp6_netdev_notifier = {
+ .notifier_call = ipv6_mc_netdev_event,
+};
+
#ifdef CONFIG_PROC_FS
struct igmp6_mc_iter_state {
+ struct seq_net_private p;
struct net_device *dev;
struct inet6_dev *idev;
};
@@ -2332,22 +2906,20 @@ static inline struct ifmcaddr6 *igmp6_mc_get_first(struct seq_file *seq)
{
struct ifmcaddr6 *im = NULL;
struct igmp6_mc_iter_state *state = igmp6_mc_seq_private(seq);
+ struct net *net = seq_file_net(seq);
- for (state->dev = dev_base, state->idev = NULL;
- state->dev;
- state->dev = state->dev->next) {
+ state->idev = NULL;
+ for_each_netdev_rcu(net, state->dev) {
struct inet6_dev *idev;
- idev = in6_dev_get(state->dev);
+ idev = __in6_dev_get(state->dev);
if (!idev)
continue;
- read_lock_bh(&idev->lock);
- im = idev->mc_list;
+
+ im = rcu_dereference(idev->mc_list);
if (im) {
state->idev = idev;
break;
}
- read_unlock_bh(&idev->lock);
- in6_dev_put(idev);
}
return im;
}
@@ -2356,22 +2928,17 @@ static struct ifmcaddr6 *igmp6_mc_get_next(struct seq_file *seq, struct ifmcaddr
{
struct igmp6_mc_iter_state *state = igmp6_mc_seq_private(seq);
- im = im->next;
+ im = rcu_dereference(im->next);
while (!im) {
- if (likely(state->idev != NULL)) {
- read_unlock_bh(&state->idev->lock);
- in6_dev_put(state->idev);
- }
- state->dev = state->dev->next;
+ state->dev = next_net_device_rcu(state->dev);
if (!state->dev) {
state->idev = NULL;
break;
}
- state->idev = in6_dev_get(state->dev);
+ state->idev = __in6_dev_get(state->dev);
if (!state->idev)
continue;
- read_lock_bh(&state->idev->lock);
- im = state->idev->mc_list;
+ im = rcu_dereference(state->idev->mc_list);
}
return im;
}
@@ -2386,29 +2953,29 @@ static struct ifmcaddr6 *igmp6_mc_get_idx(struct seq_file *seq, loff_t pos)
}
static void *igmp6_mc_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(RCU)
{
- read_lock(&dev_base_lock);
+ rcu_read_lock();
return igmp6_mc_get_idx(seq, *pos);
}
static void *igmp6_mc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
- struct ifmcaddr6 *im;
- im = igmp6_mc_get_next(seq, v);
+ struct ifmcaddr6 *im = igmp6_mc_get_next(seq, v);
+
++*pos;
return im;
}
static void igmp6_mc_seq_stop(struct seq_file *seq, void *v)
+ __releases(RCU)
{
struct igmp6_mc_iter_state *state = igmp6_mc_seq_private(seq);
- if (likely(state->idev != NULL)) {
- read_unlock_bh(&state->idev->lock);
- in6_dev_put(state->idev);
+
+ if (likely(state->idev))
state->idev = NULL;
- }
state->dev = NULL;
- read_unlock(&dev_base_lock);
+ rcu_read_unlock();
}
static int igmp6_mc_seq_show(struct seq_file *seq, void *v)
@@ -2417,53 +2984,24 @@ static int igmp6_mc_seq_show(struct seq_file *seq, void *v)
struct igmp6_mc_iter_state *state = igmp6_mc_seq_private(seq);
seq_printf(seq,
- "%-4d %-15s " NIP6_SEQFMT " %5d %08X %ld\n",
+ "%-4d %-15s %pi6 %5d %08X %ld\n",
state->dev->ifindex, state->dev->name,
- NIP6(im->mca_addr),
+ &im->mca_addr,
im->mca_users, im->mca_flags,
- (im->mca_flags&MAF_TIMER_RUNNING) ?
- jiffies_to_clock_t(im->mca_timer.expires-jiffies) : 0);
+ (im->mca_flags & MAF_TIMER_RUNNING) ?
+ jiffies_to_clock_t(im->mca_work.timer.expires - jiffies) : 0);
return 0;
}
-static struct seq_operations igmp6_mc_seq_ops = {
+static const struct seq_operations igmp6_mc_seq_ops = {
.start = igmp6_mc_seq_start,
.next = igmp6_mc_seq_next,
.stop = igmp6_mc_seq_stop,
.show = igmp6_mc_seq_show,
};
-static int igmp6_mc_seq_open(struct inode *inode, struct file *file)
-{
- struct seq_file *seq;
- int rc = -ENOMEM;
- struct igmp6_mc_iter_state *s = kzalloc(sizeof(*s), GFP_KERNEL);
-
- if (!s)
- goto out;
-
- rc = seq_open(file, &igmp6_mc_seq_ops);
- if (rc)
- goto out_kfree;
-
- seq = file->private_data;
- seq->private = s;
-out:
- return rc;
-out_kfree:
- kfree(s);
- goto out;
-}
-
-static struct file_operations igmp6_mc_seq_fops = {
- .owner = THIS_MODULE,
- .open = igmp6_mc_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release_private,
-};
-
struct igmp6_mcf_iter_state {
+ struct seq_net_private p;
struct net_device *dev;
struct inet6_dev *idev;
struct ifmcaddr6 *im;
@@ -2476,28 +3014,25 @@ static inline struct ip6_sf_list *igmp6_mcf_get_first(struct seq_file *seq)
struct ip6_sf_list *psf = NULL;
struct ifmcaddr6 *im = NULL;
struct igmp6_mcf_iter_state *state = igmp6_mcf_seq_private(seq);
+ struct net *net = seq_file_net(seq);
- for (state->dev = dev_base, state->idev = NULL, state->im = NULL;
- state->dev;
- state->dev = state->dev->next) {
+ state->idev = NULL;
+ state->im = NULL;
+ for_each_netdev_rcu(net, state->dev) {
struct inet6_dev *idev;
- idev = in6_dev_get(state->dev);
+ idev = __in6_dev_get(state->dev);
if (unlikely(idev == NULL))
continue;
- read_lock_bh(&idev->lock);
- im = idev->mc_list;
- if (likely(im != NULL)) {
- spin_lock_bh(&im->mca_lock);
- psf = im->mca_sources;
- if (likely(psf != NULL)) {
+
+ im = rcu_dereference(idev->mc_list);
+ if (likely(im)) {
+ psf = rcu_dereference(im->mca_sources);
+ if (likely(psf)) {
state->im = im;
state->idev = idev;
break;
}
- spin_unlock_bh(&im->mca_lock);
}
- read_unlock_bh(&idev->lock);
- in6_dev_put(idev);
}
return psf;
}
@@ -2506,30 +3041,21 @@ static struct ip6_sf_list *igmp6_mcf_get_next(struct seq_file *seq, struct ip6_s
{
struct igmp6_mcf_iter_state *state = igmp6_mcf_seq_private(seq);
- psf = psf->sf_next;
+ psf = rcu_dereference(psf->sf_next);
while (!psf) {
- spin_unlock_bh(&state->im->mca_lock);
- state->im = state->im->next;
+ state->im = rcu_dereference(state->im->next);
while (!state->im) {
- if (likely(state->idev != NULL)) {
- read_unlock_bh(&state->idev->lock);
- in6_dev_put(state->idev);
- }
- state->dev = state->dev->next;
+ state->dev = next_net_device_rcu(state->dev);
if (!state->dev) {
state->idev = NULL;
goto out;
}
- state->idev = in6_dev_get(state->dev);
+ state->idev = __in6_dev_get(state->dev);
if (!state->idev)
continue;
- read_lock_bh(&state->idev->lock);
- state->im = state->idev->mc_list;
+ state->im = rcu_dereference(state->idev->mc_list);
}
- if (!state->im)
- break;
- spin_lock_bh(&state->im->mca_lock);
- psf = state->im->mca_sources;
+ psf = rcu_dereference(state->im->mca_sources);
}
out:
return psf;
@@ -2545,8 +3071,9 @@ static struct ip6_sf_list *igmp6_mcf_get_idx(struct seq_file *seq, loff_t pos)
}
static void *igmp6_mcf_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(RCU)
{
- read_lock(&dev_base_lock);
+ rcu_read_lock();
return *pos ? igmp6_mcf_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}
@@ -2562,19 +3089,17 @@ static void *igmp6_mcf_seq_next(struct seq_file *seq, void *v, loff_t *pos)
}
static void igmp6_mcf_seq_stop(struct seq_file *seq, void *v)
+ __releases(RCU)
{
struct igmp6_mcf_iter_state *state = igmp6_mcf_seq_private(seq);
- if (likely(state->im != NULL)) {
- spin_unlock_bh(&state->im->mca_lock);
+
+ if (likely(state->im))
state->im = NULL;
- }
- if (likely(state->idev != NULL)) {
- read_unlock_bh(&state->idev->lock);
- in6_dev_put(state->idev);
+ if (likely(state->idev))
state->idev = NULL;
- }
+
state->dev = NULL;
- read_unlock(&dev_base_lock);
+ rcu_read_unlock();
}
static int igmp6_mcf_seq_show(struct seq_file *seq, void *v)
@@ -2583,98 +3108,141 @@ static int igmp6_mcf_seq_show(struct seq_file *seq, void *v)
struct igmp6_mcf_iter_state *state = igmp6_mcf_seq_private(seq);
if (v == SEQ_START_TOKEN) {
- seq_printf(seq,
- "%3s %6s "
- "%32s %32s %6s %6s\n", "Idx",
- "Device", "Multicast Address",
- "Source Address", "INC", "EXC");
+ seq_puts(seq, "Idx Device Multicast Address Source Address INC EXC\n");
} else {
seq_printf(seq,
- "%3d %6.6s " NIP6_SEQFMT " " NIP6_SEQFMT " %6lu %6lu\n",
+ "%3d %6.6s %pi6 %pi6 %6lu %6lu\n",
state->dev->ifindex, state->dev->name,
- NIP6(state->im->mca_addr),
- NIP6(psf->sf_addr),
- psf->sf_count[MCAST_INCLUDE],
- psf->sf_count[MCAST_EXCLUDE]);
+ &state->im->mca_addr,
+ &psf->sf_addr,
+ READ_ONCE(psf->sf_count[MCAST_INCLUDE]),
+ READ_ONCE(psf->sf_count[MCAST_EXCLUDE]));
}
return 0;
}
-static struct seq_operations igmp6_mcf_seq_ops = {
+static const struct seq_operations igmp6_mcf_seq_ops = {
.start = igmp6_mcf_seq_start,
.next = igmp6_mcf_seq_next,
.stop = igmp6_mcf_seq_stop,
.show = igmp6_mcf_seq_show,
};
-static int igmp6_mcf_seq_open(struct inode *inode, struct file *file)
+static int __net_init igmp6_proc_init(struct net *net)
{
- struct seq_file *seq;
- int rc = -ENOMEM;
- struct igmp6_mcf_iter_state *s = kzalloc(sizeof(*s), GFP_KERNEL);
-
- if (!s)
- goto out;
+ int err;
- rc = seq_open(file, &igmp6_mcf_seq_ops);
- if (rc)
- goto out_kfree;
+ err = -ENOMEM;
+ if (!proc_create_net("igmp6", 0444, net->proc_net, &igmp6_mc_seq_ops,
+ sizeof(struct igmp6_mc_iter_state)))
+ goto out;
+ if (!proc_create_net("mcfilter6", 0444, net->proc_net,
+ &igmp6_mcf_seq_ops,
+ sizeof(struct igmp6_mcf_iter_state)))
+ goto out_proc_net_igmp6;
- seq = file->private_data;
- seq->private = s;
+ err = 0;
out:
- return rc;
-out_kfree:
- kfree(s);
+ return err;
+
+out_proc_net_igmp6:
+ remove_proc_entry("igmp6", net->proc_net);
goto out;
}
-static struct file_operations igmp6_mcf_seq_fops = {
- .owner = THIS_MODULE,
- .open = igmp6_mcf_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release_private,
-};
+static void __net_exit igmp6_proc_exit(struct net *net)
+{
+ remove_proc_entry("mcfilter6", net->proc_net);
+ remove_proc_entry("igmp6", net->proc_net);
+}
+#else
+static inline int igmp6_proc_init(struct net *net)
+{
+ return 0;
+}
+static inline void igmp6_proc_exit(struct net *net)
+{
+}
#endif
-int __init igmp6_init(struct net_proto_family *ops)
+static int __net_init igmp6_net_init(struct net *net)
{
- struct ipv6_pinfo *np;
- struct sock *sk;
int err;
- err = sock_create_kern(PF_INET6, SOCK_RAW, IPPROTO_ICMPV6, &igmp6_socket);
+ err = inet_ctl_sock_create(&net->ipv6.igmp_sk, PF_INET6,
+ SOCK_RAW, IPPROTO_ICMPV6, net);
if (err < 0) {
- printk(KERN_ERR
- "Failed to initialize the IGMP6 control socket (err %d).\n",
+ pr_err("Failed to initialize the IGMP6 control socket (err %d)\n",
err);
- igmp6_socket = NULL; /* For safety. */
- return err;
+ goto out;
}
- sk = igmp6_socket->sk;
- sk->sk_allocation = GFP_ATOMIC;
- sk->sk_prot->unhash(sk);
+ inet6_sk(net->ipv6.igmp_sk)->hop_limit = 1;
+ net->ipv6.igmp_sk->sk_allocation = GFP_KERNEL;
- np = inet6_sk(sk);
- np->hop_limit = 1;
+ err = inet_ctl_sock_create(&net->ipv6.mc_autojoin_sk, PF_INET6,
+ SOCK_RAW, IPPROTO_ICMPV6, net);
+ if (err < 0) {
+ pr_err("Failed to initialize the IGMP6 autojoin socket (err %d)\n",
+ err);
+ goto out_sock_create;
+ }
-#ifdef CONFIG_PROC_FS
- proc_net_fops_create("igmp6", S_IRUGO, &igmp6_mc_seq_fops);
- proc_net_fops_create("mcfilter6", S_IRUGO, &igmp6_mcf_seq_fops);
-#endif
+ err = igmp6_proc_init(net);
+ if (err)
+ goto out_sock_create_autojoin;
return 0;
+
+out_sock_create_autojoin:
+ inet_ctl_sock_destroy(net->ipv6.mc_autojoin_sk);
+out_sock_create:
+ inet_ctl_sock_destroy(net->ipv6.igmp_sk);
+out:
+ return err;
+}
+
+static void __net_exit igmp6_net_exit(struct net *net)
+{
+ inet_ctl_sock_destroy(net->ipv6.igmp_sk);
+ inet_ctl_sock_destroy(net->ipv6.mc_autojoin_sk);
+ igmp6_proc_exit(net);
+}
+
+static struct pernet_operations igmp6_net_ops = {
+ .init = igmp6_net_init,
+ .exit = igmp6_net_exit,
+};
+
+int __init igmp6_init(void)
+{
+ int err;
+
+ err = register_pernet_subsys(&igmp6_net_ops);
+ if (err)
+ return err;
+
+ mld_wq = create_workqueue("mld");
+ if (!mld_wq) {
+ unregister_pernet_subsys(&igmp6_net_ops);
+ return -ENOMEM;
+ }
+
+ return err;
+}
+
+int __init igmp6_late_init(void)
+{
+ return register_netdevice_notifier(&igmp6_netdev_notifier);
}
void igmp6_cleanup(void)
{
- sock_release(igmp6_socket);
- igmp6_socket = NULL; /* for safety */
+ unregister_pernet_subsys(&igmp6_net_ops);
+ destroy_workqueue(mld_wq);
+}
-#ifdef CONFIG_PROC_FS
- proc_net_remove("mcfilter6");
- proc_net_remove("igmp6");
-#endif
+void igmp6_late_cleanup(void)
+{
+ unregister_netdevice_notifier(&igmp6_netdev_notifier);
}
diff --git a/net/ipv6/mcast_snoop.c b/net/ipv6/mcast_snoop.c
new file mode 100644
index 000000000000..04d5fcdfa6e0
--- /dev/null
+++ b/net/ipv6/mcast_snoop.c
@@ -0,0 +1,190 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (C) 2010: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
+ * Copyright (C) 2015: Linus Lüssing <linus.luessing@c0d3.blue>
+ *
+ * Based on the MLD support added to br_multicast.c by YOSHIFUJI Hideaki.
+ */
+
+#include <linux/skbuff.h>
+#include <net/ipv6.h>
+#include <net/mld.h>
+#include <net/addrconf.h>
+#include <net/ip6_checksum.h>
+
+static int ipv6_mc_check_ip6hdr(struct sk_buff *skb)
+{
+ const struct ipv6hdr *ip6h;
+ unsigned int len;
+ unsigned int offset = skb_network_offset(skb) + sizeof(*ip6h);
+
+ if (!pskb_may_pull(skb, offset))
+ return -EINVAL;
+
+ ip6h = ipv6_hdr(skb);
+
+ if (ip6h->version != 6)
+ return -EINVAL;
+
+ len = offset + ntohs(ip6h->payload_len);
+ if (skb->len < len || len <= offset)
+ return -EINVAL;
+
+ skb_set_transport_header(skb, offset);
+
+ return 0;
+}
+
+static int ipv6_mc_check_exthdrs(struct sk_buff *skb)
+{
+ const struct ipv6hdr *ip6h;
+ int offset;
+ u8 nexthdr;
+ __be16 frag_off;
+
+ ip6h = ipv6_hdr(skb);
+
+ if (ip6h->nexthdr != IPPROTO_HOPOPTS)
+ return -ENOMSG;
+
+ nexthdr = ip6h->nexthdr;
+ offset = skb_network_offset(skb) + sizeof(*ip6h);
+ offset = ipv6_skip_exthdr(skb, offset, &nexthdr, &frag_off);
+
+ if (offset < 0)
+ return -EINVAL;
+
+ if (nexthdr != IPPROTO_ICMPV6)
+ return -ENOMSG;
+
+ skb_set_transport_header(skb, offset);
+
+ return 0;
+}
+
+static int ipv6_mc_check_mld_reportv2(struct sk_buff *skb)
+{
+ unsigned int len = skb_transport_offset(skb);
+
+ len += sizeof(struct mld2_report);
+
+ return ipv6_mc_may_pull(skb, len) ? 0 : -EINVAL;
+}
+
+static int ipv6_mc_check_mld_query(struct sk_buff *skb)
+{
+ unsigned int transport_len = ipv6_transport_len(skb);
+ struct mld_msg *mld;
+ unsigned int len;
+
+ /* RFC2710+RFC3810 (MLDv1+MLDv2) require link-local source addresses */
+ if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL))
+ return -EINVAL;
+
+ /* MLDv1? */
+ if (transport_len != sizeof(struct mld_msg)) {
+ /* or MLDv2? */
+ if (transport_len < sizeof(struct mld2_query))
+ return -EINVAL;
+
+ len = skb_transport_offset(skb) + sizeof(struct mld2_query);
+ if (!ipv6_mc_may_pull(skb, len))
+ return -EINVAL;
+ }
+
+ mld = (struct mld_msg *)skb_transport_header(skb);
+
+ /* RFC2710+RFC3810 (MLDv1+MLDv2) require the multicast link layer
+ * all-nodes destination address (ff02::1) for general queries
+ */
+ if (ipv6_addr_any(&mld->mld_mca) &&
+ !ipv6_addr_is_ll_all_nodes(&ipv6_hdr(skb)->daddr))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int ipv6_mc_check_mld_msg(struct sk_buff *skb)
+{
+ unsigned int len = skb_transport_offset(skb) + sizeof(struct mld_msg);
+ struct mld_msg *mld;
+
+ if (!ipv6_mc_may_pull(skb, len))
+ return -ENODATA;
+
+ mld = (struct mld_msg *)skb_transport_header(skb);
+
+ switch (mld->mld_type) {
+ case ICMPV6_MGM_REDUCTION:
+ case ICMPV6_MGM_REPORT:
+ return 0;
+ case ICMPV6_MLD2_REPORT:
+ return ipv6_mc_check_mld_reportv2(skb);
+ case ICMPV6_MGM_QUERY:
+ return ipv6_mc_check_mld_query(skb);
+ default:
+ return -ENODATA;
+ }
+}
+
+static inline __sum16 ipv6_mc_validate_checksum(struct sk_buff *skb)
+{
+ return skb_checksum_validate(skb, IPPROTO_ICMPV6, ip6_compute_pseudo);
+}
+
+static int ipv6_mc_check_icmpv6(struct sk_buff *skb)
+{
+ unsigned int len = skb_transport_offset(skb) + sizeof(struct icmp6hdr);
+ unsigned int transport_len = ipv6_transport_len(skb);
+ struct sk_buff *skb_chk;
+
+ if (!ipv6_mc_may_pull(skb, len))
+ return -EINVAL;
+
+ skb_chk = skb_checksum_trimmed(skb, transport_len,
+ ipv6_mc_validate_checksum);
+ if (!skb_chk)
+ return -EINVAL;
+
+ if (skb_chk != skb)
+ kfree_skb(skb_chk);
+
+ return 0;
+}
+
+/**
+ * ipv6_mc_check_mld - checks whether this is a sane MLD packet
+ * @skb: the skb to validate
+ *
+ * Checks whether an IPv6 packet is a valid MLD packet. If so sets
+ * skb transport header accordingly and returns zero.
+ *
+ * -EINVAL: A broken packet was detected, i.e. it violates some internet
+ * standard
+ * -ENOMSG: IP header validation succeeded but it is not an ICMPv6 packet
+ * with a hop-by-hop option.
+ * -ENODATA: IP+ICMPv6 header with hop-by-hop option validation succeeded
+ * but it is not an MLD packet.
+ * -ENOMEM: A memory allocation failure happened.
+ *
+ * Caller needs to set the skb network header and free any returned skb if it
+ * differs from the provided skb.
+ */
+int ipv6_mc_check_mld(struct sk_buff *skb)
+{
+ int ret;
+
+ ret = ipv6_mc_check_ip6hdr(skb);
+ if (ret < 0)
+ return ret;
+
+ ret = ipv6_mc_check_exthdrs(skb);
+ if (ret < 0)
+ return ret;
+
+ ret = ipv6_mc_check_icmpv6(skb);
+ if (ret < 0)
+ return ret;
+
+ return ipv6_mc_check_mld_msg(skb);
+}
+EXPORT_SYMBOL(ipv6_mc_check_mld);
diff --git a/net/ipv6/mip6.c b/net/ipv6/mip6.c
index be7dd7db65d7..6a16a5bd0d91 100644
--- a/net/ipv6/mip6.c
+++ b/net/ipv6/mip6.c
@@ -1,20 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright (C)2003-2006 Helsinki University of Technology
* Copyright (C)2003-2006 USAGI/WIDE Project
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
* Authors:
@@ -22,6 +9,8 @@
* Masahide NAKAMURA @USAGI
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/time.h>
@@ -30,14 +19,10 @@
#include <net/sock.h>
#include <net/ipv6.h>
#include <net/ip6_checksum.h>
+#include <net/rawv6.h>
#include <net/xfrm.h>
#include <net/mip6.h>
-static xfrm_address_t *mip6_xfrm_addr(struct xfrm_state *x, xfrm_address_t *addr)
-{
- return x->coaddr;
-}
-
static inline unsigned int calc_padlen(unsigned int len, unsigned int n)
{
return (n - len + 16) & 0x7;
@@ -48,9 +33,9 @@ static inline void *mip6_padn(__u8 *data, __u8 padlen)
if (!data)
return NULL;
if (padlen == 1) {
- data[0] = MIP6_OPT_PAD_1;
+ data[0] = IPV6_TLV_PAD1;
} else if (padlen > 1) {
- data[0] = MIP6_OPT_PAD_N;
+ data[0] = IPV6_TLV_PADN;
data[1] = padlen - 2;
if (padlen > 2)
memset(data+2, 0, data[1]);
@@ -58,9 +43,9 @@ static inline void *mip6_padn(__u8 *data, __u8 padlen)
return data + padlen;
}
-static inline void mip6_param_prob(struct sk_buff *skb, int code, int pos)
+static inline void mip6_param_prob(struct sk_buff *skb, u8 code, int pos)
{
- icmpv6_send(skb, ICMPV6_PARAMPROB, code, pos, skb->dev);
+ icmpv6_send(skb, ICMPV6_PARAMPROB, code, pos);
}
static int mip6_mh_len(int type)
@@ -86,53 +71,33 @@ static int mip6_mh_len(int type)
return len;
}
-int mip6_mh_filter(struct sock *sk, struct sk_buff *skb)
+static int mip6_mh_filter(struct sock *sk, struct sk_buff *skb)
{
- struct ip6_mh *mh;
- int mhlen;
+ struct ip6_mh _hdr;
+ const struct ip6_mh *mh;
- if (!pskb_may_pull(skb, (skb->h.raw - skb->data) + 8) ||
- !pskb_may_pull(skb, (skb->h.raw - skb->data) + ((skb->h.raw[1] + 1) << 3)))
+ mh = skb_header_pointer(skb, skb_transport_offset(skb),
+ sizeof(_hdr), &_hdr);
+ if (!mh)
return -1;
- mh = (struct ip6_mh *)skb->h.raw;
+ if (((mh->ip6mh_hdrlen + 1) << 3) > skb->len)
+ return -1;
if (mh->ip6mh_hdrlen < mip6_mh_len(mh->ip6mh_type)) {
- LIMIT_NETDEBUG(KERN_DEBUG "mip6: MH message too short: %d vs >=%d\n",
- mh->ip6mh_hdrlen, mip6_mh_len(mh->ip6mh_type));
- mip6_param_prob(skb, 0, (&mh->ip6mh_hdrlen) - skb->nh.raw);
+ net_dbg_ratelimited("mip6: MH message too short: %d vs >=%d\n",
+ mh->ip6mh_hdrlen,
+ mip6_mh_len(mh->ip6mh_type));
+ mip6_param_prob(skb, 0, offsetof(struct ip6_mh, ip6mh_hdrlen) +
+ skb_network_header_len(skb));
return -1;
}
- mhlen = (mh->ip6mh_hdrlen + 1) << 3;
-
- if (skb->ip_summed == CHECKSUM_COMPLETE) {
- skb->ip_summed = CHECKSUM_UNNECESSARY;
- if (csum_ipv6_magic(&skb->nh.ipv6h->saddr,
- &skb->nh.ipv6h->daddr,
- mhlen, IPPROTO_MH,
- skb->csum)) {
- LIMIT_NETDEBUG(KERN_DEBUG "mip6: MH hw checksum failed\n");
- skb->ip_summed = CHECKSUM_NONE;
- }
- }
- if (skb->ip_summed == CHECKSUM_NONE) {
- if (csum_ipv6_magic(&skb->nh.ipv6h->saddr,
- &skb->nh.ipv6h->daddr,
- mhlen, IPPROTO_MH,
- skb_checksum(skb, 0, mhlen, 0))) {
- LIMIT_NETDEBUG(KERN_DEBUG "mip6: MH checksum failed "
- "[" NIP6_FMT " > " NIP6_FMT "]\n",
- NIP6(skb->nh.ipv6h->saddr),
- NIP6(skb->nh.ipv6h->daddr));
- return -1;
- }
- skb->ip_summed = CHECKSUM_UNNECESSARY;
- }
if (mh->ip6mh_proto != IPPROTO_NONE) {
- LIMIT_NETDEBUG(KERN_DEBUG "mip6: MH invalid payload proto = %d\n",
- mh->ip6mh_proto);
- mip6_param_prob(skb, 0, (&mh->ip6mh_proto) - skb->nh.raw);
+ net_dbg_ratelimited("mip6: MH invalid payload proto = %d\n",
+ mh->ip6mh_proto);
+ mip6_param_prob(skb, 0, offsetof(struct ip6_mh, ip6mh_proto) +
+ skb_network_header_len(skb));
return -1;
}
@@ -141,26 +106,29 @@ int mip6_mh_filter(struct sock *sk, struct sk_buff *skb)
struct mip6_report_rate_limiter {
spinlock_t lock;
- struct timeval stamp;
+ ktime_t stamp;
int iif;
struct in6_addr src;
struct in6_addr dst;
};
static struct mip6_report_rate_limiter mip6_report_rl = {
- .lock = SPIN_LOCK_UNLOCKED
+ .lock = __SPIN_LOCK_UNLOCKED(mip6_report_rl.lock)
};
static int mip6_destopt_input(struct xfrm_state *x, struct sk_buff *skb)
{
- struct ipv6hdr *iph = skb->nh.ipv6h;
+ const struct ipv6hdr *iph = ipv6_hdr(skb);
struct ipv6_destopt_hdr *destopt = (struct ipv6_destopt_hdr *)skb->data;
+ int err = destopt->nexthdr;
+ spin_lock(&x->lock);
if (!ipv6_addr_equal(&iph->saddr, (struct in6_addr *)x->coaddr) &&
!ipv6_addr_any((struct in6_addr *)x->coaddr))
- return -ENOENT;
+ err = -ENOENT;
+ spin_unlock(&x->lock);
- return destopt->nexthdr;
+ return err;
}
/* Destination Option Header is inserted.
@@ -175,168 +143,125 @@ static int mip6_destopt_output(struct xfrm_state *x, struct sk_buff *skb)
u8 nexthdr;
int len;
- iph = (struct ipv6hdr *)skb->data;
- iph->payload_len = htons(skb->len - sizeof(*iph));
+ skb_push(skb, -skb_network_offset(skb));
+ iph = ipv6_hdr(skb);
- nexthdr = *skb->nh.raw;
- *skb->nh.raw = IPPROTO_DSTOPTS;
+ nexthdr = *skb_mac_header(skb);
+ *skb_mac_header(skb) = IPPROTO_DSTOPTS;
- dstopt = (struct ipv6_destopt_hdr *)skb->h.raw;
+ dstopt = (struct ipv6_destopt_hdr *)skb_transport_header(skb);
dstopt->nexthdr = nexthdr;
hao = mip6_padn((char *)(dstopt + 1),
calc_padlen(sizeof(*dstopt), 6));
hao->type = IPV6_TLV_HAO;
+ BUILD_BUG_ON(sizeof(*hao) != 18);
hao->length = sizeof(*hao) - 2;
- BUG_TRAP(hao->length == 16);
len = ((char *)hao - (char *)dstopt) + sizeof(*hao);
memcpy(&hao->addr, &iph->saddr, sizeof(hao->addr));
+ spin_lock_bh(&x->lock);
memcpy(&iph->saddr, x->coaddr, sizeof(iph->saddr));
+ spin_unlock_bh(&x->lock);
- BUG_TRAP(len == x->props.header_len);
+ WARN_ON(len != x->props.header_len);
dstopt->hdrlen = (x->props.header_len >> 3) - 1;
return 0;
}
-static inline int mip6_report_rl_allow(struct timeval *stamp,
- struct in6_addr *dst,
- struct in6_addr *src, int iif)
+static inline int mip6_report_rl_allow(ktime_t stamp,
+ const struct in6_addr *dst,
+ const struct in6_addr *src, int iif)
{
int allow = 0;
spin_lock_bh(&mip6_report_rl.lock);
- if (mip6_report_rl.stamp.tv_sec != stamp->tv_sec ||
- mip6_report_rl.stamp.tv_usec != stamp->tv_usec ||
+ if (mip6_report_rl.stamp != stamp ||
mip6_report_rl.iif != iif ||
!ipv6_addr_equal(&mip6_report_rl.src, src) ||
!ipv6_addr_equal(&mip6_report_rl.dst, dst)) {
- mip6_report_rl.stamp.tv_sec = stamp->tv_sec;
- mip6_report_rl.stamp.tv_usec = stamp->tv_usec;
+ mip6_report_rl.stamp = stamp;
mip6_report_rl.iif = iif;
- ipv6_addr_copy(&mip6_report_rl.src, src);
- ipv6_addr_copy(&mip6_report_rl.dst, dst);
+ mip6_report_rl.src = *src;
+ mip6_report_rl.dst = *dst;
allow = 1;
}
spin_unlock_bh(&mip6_report_rl.lock);
return allow;
}
-static int mip6_destopt_reject(struct xfrm_state *x, struct sk_buff *skb, struct flowi *fl)
+static int mip6_destopt_reject(struct xfrm_state *x, struct sk_buff *skb,
+ const struct flowi *fl)
{
+ struct net *net = xs_net(x);
struct inet6_skb_parm *opt = (struct inet6_skb_parm *)skb->cb;
+ const struct flowi6 *fl6 = &fl->u.ip6;
struct ipv6_destopt_hao *hao = NULL;
struct xfrm_selector sel;
int offset;
- struct timeval stamp;
+ ktime_t stamp;
int err = 0;
- if (unlikely(fl->proto == IPPROTO_MH &&
- fl->fl_mh_type <= IP6_MH_TYPE_MAX))
+ if (unlikely(fl6->flowi6_proto == IPPROTO_MH &&
+ fl6->fl6_mh_type <= IP6_MH_TYPE_MAX))
goto out;
if (likely(opt->dsthao)) {
offset = ipv6_find_tlv(skb, opt->dsthao, IPV6_TLV_HAO);
if (likely(offset >= 0))
- hao = (struct ipv6_destopt_hao *)(skb->nh.raw + offset);
+ hao = (struct ipv6_destopt_hao *)
+ (skb_network_header(skb) + offset);
}
- skb_get_timestamp(skb, &stamp);
+ stamp = skb_get_ktime(skb);
- if (!mip6_report_rl_allow(&stamp, &skb->nh.ipv6h->daddr,
- hao ? &hao->addr : &skb->nh.ipv6h->saddr,
+ if (!mip6_report_rl_allow(stamp, &ipv6_hdr(skb)->daddr,
+ hao ? &hao->addr : &ipv6_hdr(skb)->saddr,
opt->iif))
goto out;
memset(&sel, 0, sizeof(sel));
- memcpy(&sel.daddr, (xfrm_address_t *)&skb->nh.ipv6h->daddr,
+ memcpy(&sel.daddr, (xfrm_address_t *)&ipv6_hdr(skb)->daddr,
sizeof(sel.daddr));
sel.prefixlen_d = 128;
- memcpy(&sel.saddr, (xfrm_address_t *)&skb->nh.ipv6h->saddr,
+ memcpy(&sel.saddr, (xfrm_address_t *)&ipv6_hdr(skb)->saddr,
sizeof(sel.saddr));
sel.prefixlen_s = 128;
sel.family = AF_INET6;
- sel.proto = fl->proto;
- sel.dport = xfrm_flowi_dport(fl);
+ sel.proto = fl6->flowi6_proto;
+ sel.dport = xfrm_flowi_dport(fl, &fl6->uli);
if (sel.dport)
sel.dport_mask = htons(~0);
- sel.sport = xfrm_flowi_sport(fl);
+ sel.sport = xfrm_flowi_sport(fl, &fl6->uli);
if (sel.sport)
sel.sport_mask = htons(~0);
- sel.ifindex = fl->oif;
+ sel.ifindex = fl6->flowi6_oif;
- err = km_report(IPPROTO_DSTOPTS, &sel,
+ err = km_report(net, IPPROTO_DSTOPTS, &sel,
(hao ? (xfrm_address_t *)&hao->addr : NULL));
out:
return err;
}
-static int mip6_destopt_offset(struct xfrm_state *x, struct sk_buff *skb,
- u8 **nexthdr)
-{
- u16 offset = sizeof(struct ipv6hdr);
- struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr*)(skb->nh.ipv6h + 1);
- unsigned int packet_len = skb->tail - skb->nh.raw;
- int found_rhdr = 0;
-
- *nexthdr = &skb->nh.ipv6h->nexthdr;
-
- while (offset + 1 <= packet_len) {
-
- switch (**nexthdr) {
- case NEXTHDR_HOP:
- break;
- case NEXTHDR_ROUTING:
- found_rhdr = 1;
- break;
- case NEXTHDR_DEST:
- /*
- * HAO MUST NOT appear more than once.
- * XXX: It is better to try to find by the end of
- * XXX: packet if HAO exists.
- */
- if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0) {
- LIMIT_NETDEBUG(KERN_WARNING "mip6: hao exists already, override\n");
- return offset;
- }
-
- if (found_rhdr)
- return offset;
-
- break;
- default:
- return offset;
- }
-
- offset += ipv6_optlen(exthdr);
- *nexthdr = &exthdr->nexthdr;
- exthdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset);
- }
-
- return offset;
-}
-
-static int mip6_destopt_init_state(struct xfrm_state *x)
+static int mip6_destopt_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack)
{
if (x->id.spi) {
- printk(KERN_INFO "%s: spi is not 0: %u\n", __FUNCTION__,
- x->id.spi);
+ NL_SET_ERR_MSG(extack, "SPI must be 0");
return -EINVAL;
}
if (x->props.mode != XFRM_MODE_ROUTEOPTIMIZATION) {
- printk(KERN_INFO "%s: state's mode is not %u: %u\n",
- __FUNCTION__, XFRM_MODE_ROUTEOPTIMIZATION, x->props.mode);
+ NL_SET_ERR_MSG(extack, "XFRM mode must be XFRM_MODE_ROUTEOPTIMIZATION");
return -EINVAL;
}
x->props.header_len = sizeof(struct ipv6_destopt_hdr) +
calc_padlen(sizeof(struct ipv6_destopt_hdr), 6) +
sizeof(struct ipv6_destopt_hao);
- BUG_TRAP(x->props.header_len == 24);
+ WARN_ON(x->props.header_len != 24);
return 0;
}
@@ -349,30 +274,30 @@ static void mip6_destopt_destroy(struct xfrm_state *x)
{
}
-static struct xfrm_type mip6_destopt_type =
-{
- .description = "MIP6DESTOPT",
+static const struct xfrm_type mip6_destopt_type = {
.owner = THIS_MODULE,
- .proto = IPPROTO_DSTOPTS,
- .flags = XFRM_TYPE_NON_FRAGMENT,
+ .proto = IPPROTO_DSTOPTS,
+ .flags = XFRM_TYPE_NON_FRAGMENT | XFRM_TYPE_LOCAL_COADDR,
.init_state = mip6_destopt_init_state,
.destructor = mip6_destopt_destroy,
.input = mip6_destopt_input,
.output = mip6_destopt_output,
- .reject = mip6_destopt_reject,
- .hdr_offset = mip6_destopt_offset,
- .local_addr = mip6_xfrm_addr,
+ .reject = mip6_destopt_reject,
};
static int mip6_rthdr_input(struct xfrm_state *x, struct sk_buff *skb)
{
+ const struct ipv6hdr *iph = ipv6_hdr(skb);
struct rt2_hdr *rt2 = (struct rt2_hdr *)skb->data;
+ int err = rt2->rt_hdr.nexthdr;
- if (!ipv6_addr_equal(&rt2->addr, (struct in6_addr *)x->coaddr) &&
+ spin_lock(&x->lock);
+ if (!ipv6_addr_equal(&iph->daddr, (struct in6_addr *)x->coaddr) &&
!ipv6_addr_any((struct in6_addr *)x->coaddr))
- return -ENOENT;
+ err = -ENOENT;
+ spin_unlock(&x->lock);
- return rt2->rt_hdr.nexthdr;
+ return err;
}
/* Routing Header type 2 is inserted.
@@ -384,81 +309,37 @@ static int mip6_rthdr_output(struct xfrm_state *x, struct sk_buff *skb)
struct rt2_hdr *rt2;
u8 nexthdr;
- iph = (struct ipv6hdr *)skb->data;
- iph->payload_len = htons(skb->len - sizeof(*iph));
+ skb_push(skb, -skb_network_offset(skb));
+ iph = ipv6_hdr(skb);
- nexthdr = *skb->nh.raw;
- *skb->nh.raw = IPPROTO_ROUTING;
+ nexthdr = *skb_mac_header(skb);
+ *skb_mac_header(skb) = IPPROTO_ROUTING;
- rt2 = (struct rt2_hdr *)skb->h.raw;
+ rt2 = (struct rt2_hdr *)skb_transport_header(skb);
rt2->rt_hdr.nexthdr = nexthdr;
rt2->rt_hdr.hdrlen = (x->props.header_len >> 3) - 1;
rt2->rt_hdr.type = IPV6_SRCRT_TYPE_2;
rt2->rt_hdr.segments_left = 1;
memset(&rt2->reserved, 0, sizeof(rt2->reserved));
- BUG_TRAP(rt2->rt_hdr.hdrlen == 2);
+ WARN_ON(rt2->rt_hdr.hdrlen != 2);
memcpy(&rt2->addr, &iph->daddr, sizeof(rt2->addr));
+ spin_lock_bh(&x->lock);
memcpy(&iph->daddr, x->coaddr, sizeof(iph->daddr));
+ spin_unlock_bh(&x->lock);
return 0;
}
-static int mip6_rthdr_offset(struct xfrm_state *x, struct sk_buff *skb,
- u8 **nexthdr)
-{
- u16 offset = sizeof(struct ipv6hdr);
- struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr*)(skb->nh.ipv6h + 1);
- unsigned int packet_len = skb->tail - skb->nh.raw;
- int found_rhdr = 0;
-
- *nexthdr = &skb->nh.ipv6h->nexthdr;
-
- while (offset + 1 <= packet_len) {
-
- switch (**nexthdr) {
- case NEXTHDR_HOP:
- break;
- case NEXTHDR_ROUTING:
- if (offset + 3 <= packet_len) {
- struct ipv6_rt_hdr *rt;
- rt = (struct ipv6_rt_hdr *)(skb->nh.raw + offset);
- if (rt->type != 0)
- return offset;
- }
- found_rhdr = 1;
- break;
- case NEXTHDR_DEST:
- if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
- return offset;
-
- if (found_rhdr)
- return offset;
-
- break;
- default:
- return offset;
- }
-
- offset += ipv6_optlen(exthdr);
- *nexthdr = &exthdr->nexthdr;
- exthdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset);
- }
-
- return offset;
-}
-
-static int mip6_rthdr_init_state(struct xfrm_state *x)
+static int mip6_rthdr_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack)
{
if (x->id.spi) {
- printk(KERN_INFO "%s: spi is not 0: %u\n", __FUNCTION__,
- x->id.spi);
+ NL_SET_ERR_MSG(extack, "SPI must be 0");
return -EINVAL;
}
if (x->props.mode != XFRM_MODE_ROUTEOPTIMIZATION) {
- printk(KERN_INFO "%s: state's mode is not %u: %u\n",
- __FUNCTION__, XFRM_MODE_ROUTEOPTIMIZATION, x->props.mode);
+ NL_SET_ERR_MSG(extack, "XFRM mode must be XFRM_MODE_ROUTEOPTIMIZATION");
return -EINVAL;
}
@@ -475,44 +356,56 @@ static void mip6_rthdr_destroy(struct xfrm_state *x)
{
}
-static struct xfrm_type mip6_rthdr_type =
-{
- .description = "MIP6RT",
+static const struct xfrm_type mip6_rthdr_type = {
.owner = THIS_MODULE,
- .proto = IPPROTO_ROUTING,
- .flags = XFRM_TYPE_NON_FRAGMENT,
+ .proto = IPPROTO_ROUTING,
+ .flags = XFRM_TYPE_NON_FRAGMENT | XFRM_TYPE_REMOTE_COADDR,
.init_state = mip6_rthdr_init_state,
.destructor = mip6_rthdr_destroy,
.input = mip6_rthdr_input,
.output = mip6_rthdr_output,
- .hdr_offset = mip6_rthdr_offset,
- .remote_addr = mip6_xfrm_addr,
};
-int __init mip6_init(void)
+static int __init mip6_init(void)
{
- printk(KERN_INFO "Mobile IPv6\n");
+ pr_info("Mobile IPv6\n");
if (xfrm_register_type(&mip6_destopt_type, AF_INET6) < 0) {
- printk(KERN_INFO "%s: can't add xfrm type(destopt)\n", __FUNCTION__);
+ pr_info("%s: can't add xfrm type(destopt)\n", __func__);
goto mip6_destopt_xfrm_fail;
}
if (xfrm_register_type(&mip6_rthdr_type, AF_INET6) < 0) {
- printk(KERN_INFO "%s: can't add xfrm type(rthdr)\n", __FUNCTION__);
+ pr_info("%s: can't add xfrm type(rthdr)\n", __func__);
goto mip6_rthdr_xfrm_fail;
}
+ if (rawv6_mh_filter_register(mip6_mh_filter) < 0) {
+ pr_info("%s: can't add rawv6 mh filter\n", __func__);
+ goto mip6_rawv6_mh_fail;
+ }
+
+
return 0;
+ mip6_rawv6_mh_fail:
+ xfrm_unregister_type(&mip6_rthdr_type, AF_INET6);
mip6_rthdr_xfrm_fail:
xfrm_unregister_type(&mip6_destopt_type, AF_INET6);
mip6_destopt_xfrm_fail:
return -EAGAIN;
}
-void __exit mip6_fini(void)
+static void __exit mip6_fini(void)
{
- if (xfrm_unregister_type(&mip6_rthdr_type, AF_INET6) < 0)
- printk(KERN_INFO "%s: can't remove xfrm type(rthdr)\n", __FUNCTION__);
- if (xfrm_unregister_type(&mip6_destopt_type, AF_INET6) < 0)
- printk(KERN_INFO "%s: can't remove xfrm type(destopt)\n", __FUNCTION__);
+ if (rawv6_mh_filter_unregister(mip6_mh_filter) < 0)
+ pr_info("%s: can't remove rawv6 mh filter\n", __func__);
+ xfrm_unregister_type(&mip6_rthdr_type, AF_INET6);
+ xfrm_unregister_type(&mip6_destopt_type, AF_INET6);
}
+
+module_init(mip6_init);
+module_exit(mip6_fini);
+
+MODULE_DESCRIPTION("IPv6 Mobility driver");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_XFRM_TYPE(AF_INET6, XFRM_PROTO_DSTOPTS);
+MODULE_ALIAS_XFRM_TYPE(AF_INET6, XFRM_PROTO_ROUTING);
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 56ea92837307..59d17b6f06bf 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1,23 +1,21 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Neighbour Discovery for IPv6
- * Linux INET6 implementation
+ * Linux INET6 implementation
*
* Authors:
- * Pedro Roque <roque@di.fc.ul.pt>
+ * Pedro Roque <roque@di.fc.ul.pt>
* Mike Shaver <shaver@ingenia.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
/*
* Changes:
*
+ * Alexey I. Froloff : RFC6106 (DNSSL) support
+ * Pierre Ynard : export userland ND options
+ * through netlink (RDNSS support)
* Lars Fenneberg : fixed MTU setting on receipt
* of an RA.
- *
* Janos Farkas : kmalloc failure checks
* Alexey Kuznetsov : state machine reworked
* and moved to net/core.
@@ -25,27 +23,7 @@
* YOSHIFUJI Hideaki @USAGI : Verify ND options properly
*/
-/* Set to 3 to get tracing... */
-#define ND_DEBUG 1
-
-#define ND_PRINTK(fmt, args...) do { if (net_ratelimit()) { printk(fmt, ## args); } } while(0)
-#define ND_NOPRINTK(x...) do { ; } while(0)
-#define ND_PRINTK0 ND_PRINTK
-#define ND_PRINTK1 ND_NOPRINTK
-#define ND_PRINTK2 ND_NOPRINTK
-#define ND_PRINTK3 ND_NOPRINTK
-#if ND_DEBUG >= 1
-#undef ND_PRINTK1
-#define ND_PRINTK1 ND_PRINTK
-#endif
-#if ND_DEBUG >= 2
-#undef ND_PRINTK2
-#define ND_PRINTK2 ND_PRINTK
-#endif
-#if ND_DEBUG >= 3
-#undef ND_PRINTK3
-#define ND_PRINTK3 ND_PRINTK
-#endif
+#define pr_fmt(fmt) "ICMPv6: " fmt
#include <linux/module.h>
#include <linux/errno.h>
@@ -58,11 +36,13 @@
#include <linux/route.h>
#include <linux/init.h>
#include <linux/rcupdate.h>
+#include <linux/slab.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
#include <linux/if_addr.h>
+#include <linux/if_ether.h>
#include <linux/if_arp.h>
#include <linux/ipv6.h>
#include <linux/icmpv6.h>
@@ -78,124 +58,96 @@
#include <net/addrconf.h>
#include <net/icmp.h>
+#include <net/netlink.h>
+#include <linux/rtnetlink.h>
+
#include <net/flow.h>
#include <net/ip6_checksum.h>
+#include <net/inet_common.h>
#include <linux/proc_fs.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>
-static struct socket *ndisc_socket;
-
-static u32 ndisc_hash(const void *pkey, const struct net_device *dev);
+static u32 ndisc_hash(const void *pkey,
+ const struct net_device *dev,
+ __u32 *hash_rnd);
+static bool ndisc_key_eq(const struct neighbour *neigh, const void *pkey);
+static bool ndisc_allow_add(const struct net_device *dev,
+ struct netlink_ext_ack *extack);
static int ndisc_constructor(struct neighbour *neigh);
static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb);
static void ndisc_error_report(struct neighbour *neigh, struct sk_buff *skb);
static int pndisc_constructor(struct pneigh_entry *n);
static void pndisc_destructor(struct pneigh_entry *n);
static void pndisc_redo(struct sk_buff *skb);
+static int ndisc_is_multicast(const void *pkey);
-static struct neigh_ops ndisc_generic_ops = {
+static const struct neigh_ops ndisc_generic_ops = {
.family = AF_INET6,
.solicit = ndisc_solicit,
.error_report = ndisc_error_report,
.output = neigh_resolve_output,
.connected_output = neigh_connected_output,
- .hh_output = dev_queue_xmit,
- .queue_xmit = dev_queue_xmit,
};
-static struct neigh_ops ndisc_hh_ops = {
+static const struct neigh_ops ndisc_hh_ops = {
.family = AF_INET6,
.solicit = ndisc_solicit,
.error_report = ndisc_error_report,
.output = neigh_resolve_output,
.connected_output = neigh_resolve_output,
- .hh_output = dev_queue_xmit,
- .queue_xmit = dev_queue_xmit,
};
-static struct neigh_ops ndisc_direct_ops = {
+static const struct neigh_ops ndisc_direct_ops = {
.family = AF_INET6,
- .output = dev_queue_xmit,
- .connected_output = dev_queue_xmit,
- .hh_output = dev_queue_xmit,
- .queue_xmit = dev_queue_xmit,
+ .output = neigh_direct_output,
+ .connected_output = neigh_direct_output,
};
struct neigh_table nd_tbl = {
.family = AF_INET6,
- .entry_size = sizeof(struct neighbour) + sizeof(struct in6_addr),
.key_len = sizeof(struct in6_addr),
+ .protocol = cpu_to_be16(ETH_P_IPV6),
.hash = ndisc_hash,
+ .key_eq = ndisc_key_eq,
.constructor = ndisc_constructor,
.pconstructor = pndisc_constructor,
.pdestructor = pndisc_destructor,
.proxy_redo = pndisc_redo,
+ .is_multicast = ndisc_is_multicast,
+ .allow_add = ndisc_allow_add,
.id = "ndisc_cache",
.parms = {
- .tbl = &nd_tbl,
- .base_reachable_time = 30 * HZ,
- .retrans_time = 1 * HZ,
- .gc_staletime = 60 * HZ,
- .reachable_time = 30 * HZ,
- .delay_probe_time = 5 * HZ,
- .queue_len = 3,
- .ucast_probes = 3,
- .mcast_probes = 3,
- .anycast_delay = 1 * HZ,
- .proxy_delay = (8 * HZ) / 10,
- .proxy_qlen = 64,
+ .tbl = &nd_tbl,
+ .reachable_time = ND_REACHABLE_TIME,
+ .data = {
+ [NEIGH_VAR_MCAST_PROBES] = 3,
+ [NEIGH_VAR_UCAST_PROBES] = 3,
+ [NEIGH_VAR_RETRANS_TIME] = ND_RETRANS_TIMER,
+ [NEIGH_VAR_BASE_REACHABLE_TIME] = ND_REACHABLE_TIME,
+ [NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ,
+ [NEIGH_VAR_INTERVAL_PROBE_TIME_MS] = 5 * HZ,
+ [NEIGH_VAR_GC_STALETIME] = 60 * HZ,
+ [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_DEFAULT,
+ [NEIGH_VAR_PROXY_QLEN] = 64,
+ [NEIGH_VAR_ANYCAST_DELAY] = 1 * HZ,
+ [NEIGH_VAR_PROXY_DELAY] = (8 * HZ) / 10,
+ },
},
.gc_interval = 30 * HZ,
.gc_thresh1 = 128,
.gc_thresh2 = 512,
.gc_thresh3 = 1024,
};
+EXPORT_SYMBOL_GPL(nd_tbl);
-/* ND options */
-struct ndisc_options {
- struct nd_opt_hdr *nd_opt_array[__ND_OPT_ARRAY_MAX];
-#ifdef CONFIG_IPV6_ROUTE_INFO
- struct nd_opt_hdr *nd_opts_ri;
- struct nd_opt_hdr *nd_opts_ri_end;
-#endif
-};
-
-#define nd_opts_src_lladdr nd_opt_array[ND_OPT_SOURCE_LL_ADDR]
-#define nd_opts_tgt_lladdr nd_opt_array[ND_OPT_TARGET_LL_ADDR]
-#define nd_opts_pi nd_opt_array[ND_OPT_PREFIX_INFO]
-#define nd_opts_pi_end nd_opt_array[__ND_OPT_PREFIX_INFO_END]
-#define nd_opts_rh nd_opt_array[ND_OPT_REDIRECT_HDR]
-#define nd_opts_mtu nd_opt_array[ND_OPT_MTU]
-
-#define NDISC_OPT_SPACE(len) (((len)+2+7)&~7)
-
-/*
- * Return the padding between the option length and the start of the
- * link addr. Currently only IP-over-InfiniBand needs this, although
- * if RFC 3831 IPv6-over-Fibre Channel is ever implemented it may
- * also need a pad of 2.
- */
-static int ndisc_addr_option_pad(unsigned short type)
+void __ndisc_fill_addr_option(struct sk_buff *skb, int type, const void *data,
+ int data_len, int pad)
{
- switch (type) {
- case ARPHRD_INFINIBAND: return 2;
- default: return 0;
- }
-}
-
-static inline int ndisc_opt_addr_space(struct net_device *dev)
-{
- return NDISC_OPT_SPACE(dev->addr_len + ndisc_addr_option_pad(dev->type));
-}
-
-static u8 *ndisc_fill_addr_option(u8 *opt, int type, void *data, int data_len,
- unsigned short addr_type)
-{
- int space = NDISC_OPT_SPACE(data_len);
- int pad = ndisc_addr_option_pad(addr_type);
+ int space = __ndisc_opt_addr_space(data_len, pad);
+ u8 *opt = skb_put(skb, space);
opt[0] = type;
opt[1] = space>>3;
@@ -207,9 +159,26 @@ static u8 *ndisc_fill_addr_option(u8 *opt, int type, void *data, int data_len,
memcpy(opt+2, data, data_len);
data_len += 2;
opt += data_len;
- if ((space -= data_len) > 0)
+ space -= data_len;
+ if (space > 0)
memset(opt, 0, space);
- return opt + space;
+}
+EXPORT_SYMBOL_GPL(__ndisc_fill_addr_option);
+
+static inline void ndisc_fill_addr_option(struct sk_buff *skb, int type,
+ const void *data, u8 icmp6_type)
+{
+ __ndisc_fill_addr_option(skb, type, data, skb->dev->addr_len,
+ ndisc_addr_option_pad(skb->dev->type));
+ ndisc_ops_fill_addr_option(skb->dev, skb, icmp6_type);
+}
+
+static inline void ndisc_fill_redirect_addr_option(struct sk_buff *skb,
+ void *ha,
+ const u8 *ops_data)
+{
+ ndisc_fill_addr_option(skb, ND_OPT_TARGET_LL_ADDR, ha, NDISC_REDIRECT);
+ ndisc_ops_fill_redirect_addr_option(skb->dev, skb, ops_data);
}
static struct nd_opt_hdr *ndisc_next_option(struct nd_opt_hdr *cur,
@@ -221,12 +190,36 @@ static struct nd_opt_hdr *ndisc_next_option(struct nd_opt_hdr *cur,
type = cur->nd_opt_type;
do {
cur = ((void *)cur) + (cur->nd_opt_len << 3);
- } while(cur < end && cur->nd_opt_type != type);
- return (cur <= end && cur->nd_opt_type == type ? cur : NULL);
+ } while (cur < end && cur->nd_opt_type != type);
+ return cur <= end && cur->nd_opt_type == type ? cur : NULL;
+}
+
+static inline int ndisc_is_useropt(const struct net_device *dev,
+ struct nd_opt_hdr *opt)
+{
+ return opt->nd_opt_type == ND_OPT_PREFIX_INFO ||
+ opt->nd_opt_type == ND_OPT_RDNSS ||
+ opt->nd_opt_type == ND_OPT_DNSSL ||
+ opt->nd_opt_type == ND_OPT_6CO ||
+ opt->nd_opt_type == ND_OPT_CAPTIVE_PORTAL ||
+ opt->nd_opt_type == ND_OPT_PREF64;
}
-static struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len,
- struct ndisc_options *ndopts)
+static struct nd_opt_hdr *ndisc_next_useropt(const struct net_device *dev,
+ struct nd_opt_hdr *cur,
+ struct nd_opt_hdr *end)
+{
+ if (!cur || !end || cur >= end)
+ return NULL;
+ do {
+ cur = ((void *)cur) + (cur->nd_opt_len << 3);
+ } while (cur < end && !ndisc_is_useropt(dev, cur));
+ return cur <= end && ndisc_is_useropt(dev, cur) ? cur : NULL;
+}
+
+struct ndisc_options *ndisc_parse_options(const struct net_device *dev,
+ u8 *opt, int opt_len,
+ struct ndisc_options *ndopts)
{
struct nd_opt_hdr *nd_opt = (struct nd_opt_hdr *)opt;
@@ -234,29 +227,31 @@ static struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len,
return NULL;
memset(ndopts, 0, sizeof(*ndopts));
while (opt_len) {
+ bool unknown = false;
int l;
if (opt_len < sizeof(struct nd_opt_hdr))
return NULL;
l = nd_opt->nd_opt_len << 3;
if (opt_len < l || l == 0)
return NULL;
+ if (ndisc_ops_parse_options(dev, nd_opt, ndopts))
+ goto next_opt;
switch (nd_opt->nd_opt_type) {
case ND_OPT_SOURCE_LL_ADDR:
case ND_OPT_TARGET_LL_ADDR:
case ND_OPT_MTU:
+ case ND_OPT_NONCE:
case ND_OPT_REDIRECT_HDR:
if (ndopts->nd_opt_array[nd_opt->nd_opt_type]) {
- ND_PRINTK2(KERN_WARNING
- "%s(): duplicated ND6 option found: type=%d\n",
- __FUNCTION__,
- nd_opt->nd_opt_type);
+ net_dbg_ratelimited("%s: duplicated ND6 option found: type=%d\n",
+ __func__, nd_opt->nd_opt_type);
} else {
ndopts->nd_opt_array[nd_opt->nd_opt_type] = nd_opt;
}
break;
case ND_OPT_PREFIX_INFO:
ndopts->nd_opts_pi_end = nd_opt;
- if (ndopts->nd_opt_array[nd_opt->nd_opt_type] == 0)
+ if (!ndopts->nd_opt_array[nd_opt->nd_opt_type])
ndopts->nd_opt_array[nd_opt->nd_opt_type] = nd_opt;
break;
#ifdef CONFIG_IPV6_ROUTE_INFO
@@ -267,33 +262,29 @@ static struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len,
break;
#endif
default:
+ unknown = true;
+ }
+ if (ndisc_is_useropt(dev, nd_opt)) {
+ ndopts->nd_useropts_end = nd_opt;
+ if (!ndopts->nd_useropts)
+ ndopts->nd_useropts = nd_opt;
+ } else if (unknown) {
/*
* Unknown options must be silently ignored,
- * to accommodate future extension to the protocol.
+ * to accommodate future extension to the
+ * protocol.
*/
- ND_PRINTK2(KERN_NOTICE
- "%s(): ignored unsupported option; type=%d, len=%d\n",
- __FUNCTION__,
- nd_opt->nd_opt_type, nd_opt->nd_opt_len);
+ net_dbg_ratelimited("%s: ignored unsupported option; type=%d, len=%d\n",
+ __func__, nd_opt->nd_opt_type, nd_opt->nd_opt_len);
}
+next_opt:
opt_len -= l;
nd_opt = ((void *)nd_opt) + l;
}
return ndopts;
}
-static inline u8 *ndisc_opt_addr_data(struct nd_opt_hdr *p,
- struct net_device *dev)
-{
- u8 *lladdr = (u8 *)(p + 1);
- int lladdrlen = p->nd_opt_len << 3;
- int prepad = ndisc_addr_option_pad(dev->type);
- if (lladdrlen != NDISC_OPT_SPACE(dev->addr_len + prepad))
- return NULL;
- return (lladdr + prepad);
-}
-
-int ndisc_mc_map(struct in6_addr *addr, char *buf, struct net_device *dev, int dir)
+int ndisc_mc_map(const struct in6_addr *addr, char *buf, struct net_device *dev, int dir)
{
switch (dev->type) {
case ARPHRD_ETHER:
@@ -301,15 +292,14 @@ int ndisc_mc_map(struct in6_addr *addr, char *buf, struct net_device *dev, int d
case ARPHRD_FDDI:
ipv6_eth_mc_map(addr, buf);
return 0;
- case ARPHRD_IEEE802_TR:
- ipv6_tr_mc_map(addr,buf);
- return 0;
case ARPHRD_ARCNET:
ipv6_arcnet_mc_map(addr, buf);
return 0;
case ARPHRD_INFINIBAND:
- ipv6_ib_mc_map(addr, buf);
+ ipv6_ib_mc_map(addr, dev->broadcast, buf);
return 0;
+ case ARPHRD_IPGRE:
+ return ipv6_ipgre_mc_map(addr, dev->broadcast, buf);
default:
if (dir) {
memcpy(buf, dev->broadcast, dev->addr_len);
@@ -318,44 +308,42 @@ int ndisc_mc_map(struct in6_addr *addr, char *buf, struct net_device *dev, int d
}
return -EINVAL;
}
+EXPORT_SYMBOL(ndisc_mc_map);
-static u32 ndisc_hash(const void *pkey, const struct net_device *dev)
+static u32 ndisc_hash(const void *pkey,
+ const struct net_device *dev,
+ __u32 *hash_rnd)
{
- const u32 *p32 = pkey;
- u32 addr_hash, i;
-
- addr_hash = 0;
- for (i = 0; i < (sizeof(struct in6_addr) / sizeof(u32)); i++)
- addr_hash ^= *p32++;
+ return ndisc_hashfn(pkey, dev, hash_rnd);
+}
- return jhash_2words(addr_hash, dev->ifindex, nd_tbl.hash_rnd);
+static bool ndisc_key_eq(const struct neighbour *n, const void *pkey)
+{
+ return neigh_key_eq128(n, pkey);
}
static int ndisc_constructor(struct neighbour *neigh)
{
- struct in6_addr *addr = (struct in6_addr*)&neigh->primary_key;
+ struct in6_addr *addr = (struct in6_addr *)&neigh->primary_key;
struct net_device *dev = neigh->dev;
struct inet6_dev *in6_dev;
struct neigh_parms *parms;
- int is_multicast = ipv6_addr_is_multicast(addr);
+ bool is_multicast = ipv6_addr_is_multicast(addr);
- rcu_read_lock();
in6_dev = in6_dev_get(dev);
- if (in6_dev == NULL) {
- rcu_read_unlock();
+ if (!in6_dev) {
return -EINVAL;
}
parms = in6_dev->nd_parms;
__neigh_parms_put(neigh->parms);
neigh->parms = neigh_parms_clone(parms);
- rcu_read_unlock();
neigh->type = is_multicast ? RTN_MULTICAST : RTN_UNICAST;
- if (dev->hard_header == NULL) {
+ if (!dev->header_ops) {
neigh->nud_state = NUD_NOARP;
neigh->ops = &ndisc_direct_ops;
- neigh->output = neigh->ops->queue_xmit;
+ neigh->output = neigh_direct_output;
} else {
if (is_multicast) {
neigh->nud_state = NUD_NOARP;
@@ -369,7 +357,7 @@ static int ndisc_constructor(struct neighbour *neigh)
neigh->nud_state = NUD_NOARP;
memcpy(neigh->ha, dev->broadcast, dev->addr_len);
}
- if (dev->hard_header_cache)
+ if (dev->header_ops->cache)
neigh->ops = &ndisc_hh_ops;
else
neigh->ops = &ndisc_generic_ops;
@@ -384,309 +372,356 @@ static int ndisc_constructor(struct neighbour *neigh)
static int pndisc_constructor(struct pneigh_entry *n)
{
- struct in6_addr *addr = (struct in6_addr*)&n->key;
- struct in6_addr maddr;
+ struct in6_addr *addr = (struct in6_addr *)&n->key;
struct net_device *dev = n->dev;
+ struct in6_addr maddr;
- if (dev == NULL || __in6_dev_get(dev) == NULL)
+ if (!dev)
return -EINVAL;
+
addrconf_addr_solict_mult(addr, &maddr);
- ipv6_dev_mc_inc(dev, &maddr);
- return 0;
+ return ipv6_dev_mc_inc(dev, &maddr);
}
static void pndisc_destructor(struct pneigh_entry *n)
{
- struct in6_addr *addr = (struct in6_addr*)&n->key;
- struct in6_addr maddr;
+ struct in6_addr *addr = (struct in6_addr *)&n->key;
struct net_device *dev = n->dev;
+ struct in6_addr maddr;
- if (dev == NULL || __in6_dev_get(dev) == NULL)
+ if (!dev)
return;
+
addrconf_addr_solict_mult(addr, &maddr);
ipv6_dev_mc_dec(dev, &maddr);
}
-/*
- * Send a Neighbour Advertisement
- */
+/* called with rtnl held */
+static bool ndisc_allow_add(const struct net_device *dev,
+ struct netlink_ext_ack *extack)
+{
+ struct inet6_dev *idev = __in6_dev_get(dev);
+
+ if (!idev || idev->cnf.disable_ipv6) {
+ NL_SET_ERR_MSG(extack, "IPv6 is disabled on this device");
+ return false;
+ }
-static inline void ndisc_flow_init(struct flowi *fl, u8 type,
- struct in6_addr *saddr, struct in6_addr *daddr,
- int oif)
+ return true;
+}
+
+static struct sk_buff *ndisc_alloc_skb(struct net_device *dev,
+ int len)
{
- memset(fl, 0, sizeof(*fl));
- ipv6_addr_copy(&fl->fl6_src, saddr);
- ipv6_addr_copy(&fl->fl6_dst, daddr);
- fl->proto = IPPROTO_ICMPV6;
- fl->fl_icmp_type = type;
- fl->fl_icmp_code = 0;
- fl->oif = oif;
- security_sk_classify_flow(ndisc_socket->sk, fl);
+ int hlen = LL_RESERVED_SPACE(dev);
+ int tlen = dev->needed_tailroom;
+ struct sk_buff *skb;
+
+ skb = alloc_skb(hlen + sizeof(struct ipv6hdr) + len + tlen, GFP_ATOMIC);
+ if (!skb)
+ return NULL;
+
+ skb->protocol = htons(ETH_P_IPV6);
+ skb->dev = dev;
+
+ skb_reserve(skb, hlen + sizeof(struct ipv6hdr));
+ skb_reset_transport_header(skb);
+
+ /* Manually assign socket ownership as we avoid calling
+ * sock_alloc_send_pskb() to bypass wmem buffer limits
+ */
+ rcu_read_lock();
+ skb_set_owner_w(skb, dev_net_rcu(dev)->ipv6.ndisc_sk);
+ rcu_read_unlock();
+
+ return skb;
}
-static void ndisc_send_na(struct net_device *dev, struct neighbour *neigh,
- struct in6_addr *daddr, struct in6_addr *solicited_addr,
- int router, int solicited, int override, int inc_opt)
+static void ip6_nd_hdr(struct sk_buff *skb,
+ const struct in6_addr *saddr,
+ const struct in6_addr *daddr,
+ int hop_limit, int len)
{
- struct in6_addr tmpaddr;
- struct inet6_ifaddr *ifp;
+ struct ipv6hdr *hdr;
struct inet6_dev *idev;
- struct flowi fl;
- struct dst_entry* dst;
- struct sock *sk = ndisc_socket->sk;
- struct in6_addr *src_addr;
- struct nd_msg *msg;
- int len;
- struct sk_buff *skb;
- int err;
+ unsigned tclass;
- len = sizeof(struct icmp6hdr) + sizeof(struct in6_addr);
+ rcu_read_lock();
+ idev = __in6_dev_get(skb->dev);
+ tclass = idev ? READ_ONCE(idev->cnf.ndisc_tclass) : 0;
+ rcu_read_unlock();
- /* for anycast or proxy, solicited_addr != src_addr */
- ifp = ipv6_get_ifaddr(solicited_addr, dev, 1);
- if (ifp) {
- src_addr = solicited_addr;
- in6_ifa_put(ifp);
- } else {
- if (ipv6_dev_get_saddr(dev, daddr, &tmpaddr))
- return;
- src_addr = &tmpaddr;
- }
+ skb_push(skb, sizeof(*hdr));
+ skb_reset_network_header(skb);
+ hdr = ipv6_hdr(skb);
- ndisc_flow_init(&fl, NDISC_NEIGHBOUR_ADVERTISEMENT, src_addr, daddr,
- dev->ifindex);
+ ip6_flow_hdr(hdr, tclass, 0);
- dst = ndisc_dst_alloc(dev, neigh, daddr, ip6_output);
- if (!dst)
- return;
+ hdr->payload_len = htons(len);
+ hdr->nexthdr = IPPROTO_ICMPV6;
+ hdr->hop_limit = hop_limit;
- err = xfrm_lookup(&dst, &fl, NULL, 0);
- if (err < 0)
- return;
+ hdr->saddr = *saddr;
+ hdr->daddr = *daddr;
+}
- if (inc_opt) {
- if (dev->addr_len)
- len += ndisc_opt_addr_space(dev);
- else
- inc_opt = 0;
- }
+void ndisc_send_skb(struct sk_buff *skb, const struct in6_addr *daddr,
+ const struct in6_addr *saddr)
+{
+ struct icmp6hdr *icmp6h = icmp6_hdr(skb);
+ struct dst_entry *dst = skb_dst(skb);
+ struct net_device *dev;
+ struct inet6_dev *idev;
+ struct net *net;
+ struct sock *sk;
+ int err;
+ u8 type;
- skb = sock_alloc_send_skb(sk,
- (MAX_HEADER + sizeof(struct ipv6hdr) +
- len + LL_RESERVED_SPACE(dev)),
- 1, &err);
+ type = icmp6h->icmp6_type;
- if (skb == NULL) {
- ND_PRINTK0(KERN_ERR
- "ICMPv6 NA: %s() failed to allocate an skb.\n",
- __FUNCTION__);
- dst_release(dst);
- return;
- }
+ rcu_read_lock();
- skb_reserve(skb, LL_RESERVED_SPACE(dev));
- ip6_nd_hdr(sk, skb, dev, src_addr, daddr, IPPROTO_ICMPV6, len);
+ net = dev_net_rcu(skb->dev);
+ sk = net->ipv6.ndisc_sk;
+ if (!dst) {
+ struct flowi6 fl6;
+ int oif = skb->dev->ifindex;
+
+ icmpv6_flow_init(sk, &fl6, type, saddr, daddr, oif);
+ dst = icmp6_dst_alloc(skb->dev, &fl6);
+ if (IS_ERR(dst)) {
+ rcu_read_unlock();
+ kfree_skb(skb);
+ return;
+ }
- msg = (struct nd_msg *)skb_put(skb, len);
- skb->h.raw = (unsigned char*)msg;
+ skb_dst_set(skb, dst);
+ }
- msg->icmph.icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT;
- msg->icmph.icmp6_code = 0;
- msg->icmph.icmp6_cksum = 0;
+ icmp6h->icmp6_cksum = csum_ipv6_magic(saddr, daddr, skb->len,
+ IPPROTO_ICMPV6,
+ csum_partial(icmp6h,
+ skb->len, 0));
- msg->icmph.icmp6_unused = 0;
- msg->icmph.icmp6_router = router;
- msg->icmph.icmp6_solicited = solicited;
- msg->icmph.icmp6_override = override;
+ ip6_nd_hdr(skb, saddr, daddr, READ_ONCE(inet6_sk(sk)->hop_limit), skb->len);
- /* Set the target address. */
- ipv6_addr_copy(&msg->target, solicited_addr);
+ dev = dst_dev_rcu(dst);
+ idev = __in6_dev_get(dev);
+ IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS);
- if (inc_opt)
- ndisc_fill_addr_option(msg->opt, ND_OPT_TARGET_LL_ADDR, dev->dev_addr,
- dev->addr_len, dev->type);
-
- /* checksum */
- msg->icmph.icmp6_cksum = csum_ipv6_magic(src_addr, daddr, len,
- IPPROTO_ICMPV6,
- csum_partial((__u8 *) msg,
- len, 0));
-
- skb->dst = dst;
- idev = in6_dev_get(dst->dev);
- IP6_INC_STATS(idev, IPSTATS_MIB_OUTREQUESTS);
- err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev, dst_output);
+ err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
+ net, sk, skb, NULL, dev,
+ dst_output);
if (!err) {
- ICMP6_INC_STATS(idev, ICMP6_MIB_OUTNEIGHBORADVERTISEMENTS);
- ICMP6_INC_STATS(idev, ICMP6_MIB_OUTMSGS);
+ ICMP6MSGOUT_INC_STATS(net, idev, type);
+ ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
}
- if (likely(idev != NULL))
- in6_dev_put(idev);
-}
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL(ndisc_send_skb);
-void ndisc_send_ns(struct net_device *dev, struct neighbour *neigh,
- struct in6_addr *solicit,
- struct in6_addr *daddr, struct in6_addr *saddr)
+void ndisc_send_na(struct net_device *dev, const struct in6_addr *daddr,
+ const struct in6_addr *solicited_addr,
+ bool router, bool solicited, bool override, bool inc_opt)
{
- struct flowi fl;
- struct dst_entry* dst;
- struct inet6_dev *idev;
- struct sock *sk = ndisc_socket->sk;
- struct sk_buff *skb;
- struct nd_msg *msg;
- struct in6_addr addr_buf;
- int len;
- int err;
- int send_llinfo;
+ struct sk_buff *skb;
+ struct in6_addr tmpaddr;
+ struct inet6_ifaddr *ifp;
+ const struct in6_addr *src_addr;
+ struct nd_msg *msg;
+ int optlen = 0;
- if (saddr == NULL) {
- if (ipv6_get_lladdr(dev, &addr_buf))
+ /* for anycast or proxy, solicited_addr != src_addr */
+ ifp = ipv6_get_ifaddr(dev_net(dev), solicited_addr, dev, 1);
+ if (ifp) {
+ src_addr = solicited_addr;
+ if (ifp->flags & IFA_F_OPTIMISTIC)
+ override = false;
+ inc_opt |= READ_ONCE(ifp->idev->cnf.force_tllao);
+ in6_ifa_put(ifp);
+ } else {
+ if (ipv6_dev_get_saddr(dev_net(dev), dev, daddr,
+ inet6_sk(dev_net(dev)->ipv6.ndisc_sk)->srcprefs,
+ &tmpaddr))
return;
- saddr = &addr_buf;
+ src_addr = &tmpaddr;
}
- ndisc_flow_init(&fl, NDISC_NEIGHBOUR_SOLICITATION, saddr, daddr,
- dev->ifindex);
+ if (!dev->addr_len)
+ inc_opt = false;
+ if (inc_opt)
+ optlen += ndisc_opt_addr_space(dev,
+ NDISC_NEIGHBOUR_ADVERTISEMENT);
- dst = ndisc_dst_alloc(dev, neigh, daddr, ip6_output);
- if (!dst)
+ skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen);
+ if (!skb)
return;
- err = xfrm_lookup(&dst, &fl, NULL, 0);
- if (err < 0)
- return;
+ msg = skb_put(skb, sizeof(*msg));
+ *msg = (struct nd_msg) {
+ .icmph = {
+ .icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT,
+ .icmp6_router = router,
+ .icmp6_solicited = solicited,
+ .icmp6_override = override,
+ },
+ .target = *solicited_addr,
+ };
- len = sizeof(struct icmp6hdr) + sizeof(struct in6_addr);
- send_llinfo = dev->addr_len && !ipv6_addr_any(saddr);
- if (send_llinfo)
- len += ndisc_opt_addr_space(dev);
-
- skb = sock_alloc_send_skb(sk,
- (MAX_HEADER + sizeof(struct ipv6hdr) +
- len + LL_RESERVED_SPACE(dev)),
- 1, &err);
- if (skb == NULL) {
- ND_PRINTK0(KERN_ERR
- "ICMPv6 NA: %s() failed to allocate an skb.\n",
- __FUNCTION__);
- dst_release(dst);
+ if (inc_opt)
+ ndisc_fill_addr_option(skb, ND_OPT_TARGET_LL_ADDR,
+ dev->dev_addr,
+ NDISC_NEIGHBOUR_ADVERTISEMENT);
+
+ ndisc_send_skb(skb, daddr, src_addr);
+}
+
+static void ndisc_send_unsol_na(struct net_device *dev)
+{
+ struct inet6_dev *idev;
+ struct inet6_ifaddr *ifa;
+
+ idev = in6_dev_get(dev);
+ if (!idev)
return;
- }
- skb_reserve(skb, LL_RESERVED_SPACE(dev));
- ip6_nd_hdr(sk, skb, dev, saddr, daddr, IPPROTO_ICMPV6, len);
-
- msg = (struct nd_msg *)skb_put(skb, len);
- skb->h.raw = (unsigned char*)msg;
- msg->icmph.icmp6_type = NDISC_NEIGHBOUR_SOLICITATION;
- msg->icmph.icmp6_code = 0;
- msg->icmph.icmp6_cksum = 0;
- msg->icmph.icmp6_unused = 0;
-
- /* Set the target address. */
- ipv6_addr_copy(&msg->target, solicit);
-
- if (send_llinfo)
- ndisc_fill_addr_option(msg->opt, ND_OPT_SOURCE_LL_ADDR, dev->dev_addr,
- dev->addr_len, dev->type);
-
- /* checksum */
- msg->icmph.icmp6_cksum = csum_ipv6_magic(&skb->nh.ipv6h->saddr,
- daddr, len,
- IPPROTO_ICMPV6,
- csum_partial((__u8 *) msg,
- len, 0));
- /* send it! */
- skb->dst = dst;
- idev = in6_dev_get(dst->dev);
- IP6_INC_STATS(idev, IPSTATS_MIB_OUTREQUESTS);
- err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev, dst_output);
- if (!err) {
- ICMP6_INC_STATS(idev, ICMP6_MIB_OUTNEIGHBORSOLICITS);
- ICMP6_INC_STATS(idev, ICMP6_MIB_OUTMSGS);
+ read_lock_bh(&idev->lock);
+ list_for_each_entry(ifa, &idev->addr_list, if_list) {
+ /* skip tentative addresses until dad completes */
+ if (ifa->flags & IFA_F_TENTATIVE &&
+ !(ifa->flags & IFA_F_OPTIMISTIC))
+ continue;
+
+ ndisc_send_na(dev, &in6addr_linklocal_allnodes, &ifa->addr,
+ /*router=*/ !!idev->cnf.forwarding,
+ /*solicited=*/ false, /*override=*/ true,
+ /*inc_opt=*/ true);
}
+ read_unlock_bh(&idev->lock);
- if (likely(idev != NULL))
- in6_dev_put(idev);
+ in6_dev_put(idev);
}
-void ndisc_send_rs(struct net_device *dev, struct in6_addr *saddr,
- struct in6_addr *daddr)
+struct sk_buff *ndisc_ns_create(struct net_device *dev, const struct in6_addr *solicit,
+ const struct in6_addr *saddr, u64 nonce)
{
- struct flowi fl;
- struct dst_entry* dst;
- struct inet6_dev *idev;
- struct sock *sk = ndisc_socket->sk;
- struct sk_buff *skb;
- struct icmp6hdr *hdr;
- __u8 * opt;
- int len;
- int err;
+ int inc_opt = dev->addr_len;
+ struct sk_buff *skb;
+ struct nd_msg *msg;
+ int optlen = 0;
- ndisc_flow_init(&fl, NDISC_ROUTER_SOLICITATION, saddr, daddr,
- dev->ifindex);
+ if (!saddr)
+ return NULL;
- dst = ndisc_dst_alloc(dev, NULL, daddr, ip6_output);
- if (!dst)
- return;
+ if (ipv6_addr_any(saddr))
+ inc_opt = false;
+ if (inc_opt)
+ optlen += ndisc_opt_addr_space(dev,
+ NDISC_NEIGHBOUR_SOLICITATION);
+ if (nonce != 0)
+ optlen += 8;
- err = xfrm_lookup(&dst, &fl, NULL, 0);
- if (err < 0)
- return;
+ skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen);
+ if (!skb)
+ return NULL;
- len = sizeof(struct icmp6hdr);
- if (dev->addr_len)
- len += ndisc_opt_addr_space(dev);
-
- skb = sock_alloc_send_skb(sk,
- (MAX_HEADER + sizeof(struct ipv6hdr) +
- len + LL_RESERVED_SPACE(dev)),
- 1, &err);
- if (skb == NULL) {
- ND_PRINTK0(KERN_ERR
- "ICMPv6 RS: %s() failed to allocate an skb.\n",
- __FUNCTION__);
- dst_release(dst);
- return;
+ msg = skb_put(skb, sizeof(*msg));
+ *msg = (struct nd_msg) {
+ .icmph = {
+ .icmp6_type = NDISC_NEIGHBOUR_SOLICITATION,
+ },
+ .target = *solicit,
+ };
+
+ if (inc_opt)
+ ndisc_fill_addr_option(skb, ND_OPT_SOURCE_LL_ADDR,
+ dev->dev_addr,
+ NDISC_NEIGHBOUR_SOLICITATION);
+ if (nonce != 0) {
+ u8 *opt = skb_put(skb, 8);
+
+ opt[0] = ND_OPT_NONCE;
+ opt[1] = 8 >> 3;
+ memcpy(opt + 2, &nonce, 6);
}
- skb_reserve(skb, LL_RESERVED_SPACE(dev));
- ip6_nd_hdr(sk, skb, dev, saddr, daddr, IPPROTO_ICMPV6, len);
+ return skb;
+}
+EXPORT_SYMBOL(ndisc_ns_create);
- hdr = (struct icmp6hdr *)skb_put(skb, len);
- skb->h.raw = (unsigned char*)hdr;
- hdr->icmp6_type = NDISC_ROUTER_SOLICITATION;
- hdr->icmp6_code = 0;
- hdr->icmp6_cksum = 0;
- hdr->icmp6_unused = 0;
+void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit,
+ const struct in6_addr *daddr, const struct in6_addr *saddr,
+ u64 nonce)
+{
+ struct in6_addr addr_buf;
+ struct sk_buff *skb;
- opt = (u8*) (hdr + 1);
+ if (!saddr) {
+ if (ipv6_get_lladdr(dev, &addr_buf,
+ (IFA_F_TENTATIVE | IFA_F_OPTIMISTIC)))
+ return;
+ saddr = &addr_buf;
+ }
- if (dev->addr_len)
- ndisc_fill_addr_option(opt, ND_OPT_SOURCE_LL_ADDR, dev->dev_addr,
- dev->addr_len, dev->type);
+ skb = ndisc_ns_create(dev, solicit, saddr, nonce);
- /* checksum */
- hdr->icmp6_cksum = csum_ipv6_magic(&skb->nh.ipv6h->saddr, daddr, len,
- IPPROTO_ICMPV6,
- csum_partial((__u8 *) hdr, len, 0));
+ if (skb)
+ ndisc_send_skb(skb, daddr, saddr);
+}
- /* send it! */
- skb->dst = dst;
- idev = in6_dev_get(dst->dev);
- IP6_INC_STATS(idev, IPSTATS_MIB_OUTREQUESTS);
- err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev, dst_output);
- if (!err) {
- ICMP6_INC_STATS(idev, ICMP6_MIB_OUTROUTERSOLICITS);
- ICMP6_INC_STATS(idev, ICMP6_MIB_OUTMSGS);
+void ndisc_send_rs(struct net_device *dev, const struct in6_addr *saddr,
+ const struct in6_addr *daddr)
+{
+ struct sk_buff *skb;
+ struct rs_msg *msg;
+ int send_sllao = dev->addr_len;
+ int optlen = 0;
+
+#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
+ /*
+ * According to section 2.2 of RFC 4429, we must not
+ * send router solicitations with a sllao from
+ * optimistic addresses, but we may send the solicitation
+ * if we don't include the sllao. So here we check
+ * if our address is optimistic, and if so, we
+ * suppress the inclusion of the sllao.
+ */
+ if (send_sllao) {
+ struct inet6_ifaddr *ifp = ipv6_get_ifaddr(dev_net(dev), saddr,
+ dev, 1);
+ if (ifp) {
+ if (ifp->flags & IFA_F_OPTIMISTIC) {
+ send_sllao = 0;
+ }
+ in6_ifa_put(ifp);
+ } else {
+ send_sllao = 0;
+ }
}
+#endif
+ if (send_sllao)
+ optlen += ndisc_opt_addr_space(dev, NDISC_ROUTER_SOLICITATION);
- if (likely(idev != NULL))
- in6_dev_put(idev);
+ skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen);
+ if (!skb)
+ return;
+
+ msg = skb_put(skb, sizeof(*msg));
+ *msg = (struct rs_msg) {
+ .icmph = {
+ .icmp6_type = NDISC_ROUTER_SOLICITATION,
+ },
+ };
+
+ if (send_sllao)
+ ndisc_fill_addr_option(skb, ND_OPT_SOURCE_LL_ADDR,
+ dev->dev_addr,
+ NDISC_ROUTER_SOLICITATION);
+
+ ndisc_send_skb(skb, daddr, saddr);
}
-
+
static void ndisc_error_report(struct neighbour *neigh, struct sk_buff *skb)
{
@@ -708,139 +743,174 @@ static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb)
struct in6_addr *target = (struct in6_addr *)&neigh->primary_key;
int probes = atomic_read(&neigh->probes);
- if (skb && ipv6_chk_addr(&skb->nh.ipv6h->saddr, dev, 1))
- saddr = &skb->nh.ipv6h->saddr;
-
- if ((probes -= neigh->parms->ucast_probes) < 0) {
- if (!(neigh->nud_state & NUD_VALID)) {
- ND_PRINTK1(KERN_DEBUG
- "%s(): trying to ucast probe in NUD_INVALID: "
- NIP6_FMT "\n",
- __FUNCTION__,
- NIP6(*target));
+ if (skb && ipv6_chk_addr_and_flags(dev_net(dev), &ipv6_hdr(skb)->saddr,
+ dev, false, 1,
+ IFA_F_TENTATIVE|IFA_F_OPTIMISTIC))
+ saddr = &ipv6_hdr(skb)->saddr;
+ probes -= NEIGH_VAR(neigh->parms, UCAST_PROBES);
+ if (probes < 0) {
+ if (!(READ_ONCE(neigh->nud_state) & NUD_VALID)) {
+ net_dbg_ratelimited("%s: trying to ucast probe in NUD_INVALID: %pI6\n",
+ __func__, target);
}
- ndisc_send_ns(dev, neigh, target, target, saddr);
- } else if ((probes -= neigh->parms->app_probes) < 0) {
-#ifdef CONFIG_ARPD
+ ndisc_send_ns(dev, target, target, saddr, 0);
+ } else if ((probes -= NEIGH_VAR(neigh->parms, APP_PROBES)) < 0) {
neigh_app_ns(neigh);
-#endif
} else {
addrconf_addr_solict_mult(target, &mcaddr);
- ndisc_send_ns(dev, NULL, target, &mcaddr, saddr);
+ ndisc_send_ns(dev, target, &mcaddr, saddr, 0);
}
}
-static void ndisc_recv_ns(struct sk_buff *skb)
+static int pndisc_is_router(const void *pkey,
+ struct net_device *dev)
{
- struct nd_msg *msg = (struct nd_msg *)skb->h.raw;
- struct in6_addr *saddr = &skb->nh.ipv6h->saddr;
- struct in6_addr *daddr = &skb->nh.ipv6h->daddr;
+ struct pneigh_entry *n;
+ int ret = -1;
+
+ n = pneigh_lookup(&nd_tbl, dev_net(dev), pkey, dev);
+ if (n)
+ ret = !!(READ_ONCE(n->flags) & NTF_ROUTER);
+
+ return ret;
+}
+
+void ndisc_update(const struct net_device *dev, struct neighbour *neigh,
+ const u8 *lladdr, u8 new, u32 flags, u8 icmp6_type,
+ struct ndisc_options *ndopts)
+{
+ neigh_update(neigh, lladdr, new, flags, 0);
+ /* report ndisc ops about neighbour update */
+ ndisc_ops_update(dev, neigh, flags, icmp6_type, ndopts);
+}
+
+static enum skb_drop_reason ndisc_recv_ns(struct sk_buff *skb)
+{
+ struct nd_msg *msg = (struct nd_msg *)skb_transport_header(skb);
+ const struct in6_addr *saddr = &ipv6_hdr(skb)->saddr;
+ const struct in6_addr *daddr = &ipv6_hdr(skb)->daddr;
u8 *lladdr = NULL;
- u32 ndoptlen = skb->tail - msg->opt;
+ u32 ndoptlen = skb_tail_pointer(skb) - (skb_transport_header(skb) +
+ offsetof(struct nd_msg, opt));
struct ndisc_options ndopts;
struct net_device *dev = skb->dev;
struct inet6_ifaddr *ifp;
struct inet6_dev *idev = NULL;
struct neighbour *neigh;
- struct pneigh_entry *pneigh = NULL;
int dad = ipv6_addr_any(saddr);
- int inc;
- int is_router;
+ int is_router = -1;
+ SKB_DR(reason);
+ u64 nonce = 0;
+ bool inc;
+
+ if (skb->len < sizeof(struct nd_msg))
+ return SKB_DROP_REASON_PKT_TOO_SMALL;
if (ipv6_addr_is_multicast(&msg->target)) {
- ND_PRINTK2(KERN_WARNING
- "ICMPv6 NS: multicast target address");
- return;
+ net_dbg_ratelimited("NS: multicast target address\n");
+ return reason;
}
/*
* RFC2461 7.1.1:
* DAD has to be destined for solicited node multicast address.
*/
- if (dad &&
- !(daddr->s6_addr32[0] == htonl(0xff020000) &&
- daddr->s6_addr32[1] == htonl(0x00000000) &&
- daddr->s6_addr32[2] == htonl(0x00000001) &&
- daddr->s6_addr [12] == 0xff )) {
- ND_PRINTK2(KERN_WARNING
- "ICMPv6 NS: bad DAD packet (wrong destination)\n");
- return;
+ if (dad && !ipv6_addr_is_solict_mult(daddr)) {
+ net_dbg_ratelimited("NS: bad DAD packet (wrong destination)\n");
+ return reason;
}
- if (!ndisc_parse_options(msg->opt, ndoptlen, &ndopts)) {
- ND_PRINTK2(KERN_WARNING
- "ICMPv6 NS: invalid ND options\n");
- return;
- }
+ if (!ndisc_parse_options(dev, msg->opt, ndoptlen, &ndopts))
+ return SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS;
if (ndopts.nd_opts_src_lladdr) {
lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr, dev);
if (!lladdr) {
- ND_PRINTK2(KERN_WARNING
- "ICMPv6 NS: invalid link-layer address length\n");
- return;
+ net_dbg_ratelimited("NS: invalid link-layer address length\n");
+ return reason;
}
/* RFC2461 7.1.1:
- * If the IP source address is the unspecified address,
- * there MUST NOT be source link-layer address option
+ * If the IP source address is the unspecified address,
+ * there MUST NOT be source link-layer address option
* in the message.
*/
if (dad) {
- ND_PRINTK2(KERN_WARNING
- "ICMPv6 NS: bad DAD packet (link-layer address option)\n");
- return;
+ net_dbg_ratelimited("NS: bad DAD packet (link-layer address option)\n");
+ return reason;
}
}
+ if (ndopts.nd_opts_nonce && ndopts.nd_opts_nonce->nd_opt_len == 1)
+ memcpy(&nonce, (u8 *)(ndopts.nd_opts_nonce + 1), 6);
inc = ipv6_addr_is_multicast(daddr);
- if ((ifp = ipv6_get_ifaddr(&msg->target, dev, 1)) != NULL) {
- if (ifp->flags & IFA_F_TENTATIVE) {
- /* Address is tentative. If the source
- is unspecified address, it is someone
- does DAD, otherwise we ignore solicitations
- until DAD timer expires.
- */
- if (!dad)
- goto out;
- if (dev->type == ARPHRD_IEEE802_TR) {
- unsigned char *sadr = skb->mac.raw;
- if (((sadr[8] ^ dev->dev_addr[0]) & 0x7f) == 0 &&
- sadr[9] == dev->dev_addr[1] &&
- sadr[10] == dev->dev_addr[2] &&
- sadr[11] == dev->dev_addr[3] &&
- sadr[12] == dev->dev_addr[4] &&
- sadr[13] == dev->dev_addr[5]) {
- /* looped-back to us */
+ ifp = ipv6_get_ifaddr(dev_net(dev), &msg->target, dev, 1);
+ if (ifp) {
+have_ifp:
+ if (ifp->flags & (IFA_F_TENTATIVE|IFA_F_OPTIMISTIC)) {
+ if (dad) {
+ if (nonce != 0 && ifp->dad_nonce == nonce) {
+ u8 *np = (u8 *)&nonce;
+ /* Matching nonce if looped back */
+ net_dbg_ratelimited("%s: IPv6 DAD loopback for address %pI6c nonce %pM ignored\n",
+ ifp->idev->dev->name, &ifp->addr, np);
goto out;
}
+ /*
+ * We are colliding with another node
+ * who is doing DAD
+ * so fail our DAD process
+ */
+ addrconf_dad_failure(skb, ifp);
+ return reason;
+ } else {
+ /*
+ * This is not a dad solicitation.
+ * If we are an optimistic node,
+ * we should respond.
+ * Otherwise, we should ignore it.
+ */
+ if (!(ifp->flags & IFA_F_OPTIMISTIC))
+ goto out;
}
- addrconf_dad_failure(ifp);
- return;
}
idev = ifp->idev;
} else {
+ struct net *net = dev_net(dev);
+
+ /* perhaps an address on the master device */
+ if (netif_is_l3_slave(dev)) {
+ struct net_device *mdev;
+
+ mdev = netdev_master_upper_dev_get_rcu(dev);
+ if (mdev) {
+ ifp = ipv6_get_ifaddr(net, &msg->target, mdev, 1);
+ if (ifp)
+ goto have_ifp;
+ }
+ }
+
idev = in6_dev_get(dev);
if (!idev) {
/* XXX: count this drop? */
- return;
+ return reason;
}
- if (ipv6_chk_acast_addr(dev, &msg->target) ||
- (idev->cnf.forwarding &&
- (ipv6_devconf.proxy_ndp || idev->cnf.proxy_ndp) &&
- (pneigh = pneigh_lookup(&nd_tbl,
- &msg->target, dev, 0)) != NULL)) {
+ if (ipv6_chk_acast_addr(net, dev, &msg->target) ||
+ (READ_ONCE(idev->cnf.forwarding) &&
+ (READ_ONCE(net->ipv6.devconf_all->proxy_ndp) ||
+ READ_ONCE(idev->cnf.proxy_ndp)) &&
+ (is_router = pndisc_is_router(&msg->target, dev)) >= 0)) {
if (!(NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED) &&
skb->pkt_type != PACKET_HOST &&
- inc != 0 &&
- idev->nd_parms->proxy_delay != 0) {
+ inc &&
+ NEIGH_VAR(idev->nd_parms, PROXY_DELAY) != 0) {
/*
* for anycast or proxy,
- * sender should delay its response
- * by a random time between 0 and
+ * sender should delay its response
+ * by a random time between 0 and
* MAX_ANYCAST_DELAY_TIME seconds.
* (RFC2461) -- yoshfuji
*/
@@ -849,18 +919,18 @@ static void ndisc_recv_ns(struct sk_buff *skb)
pneigh_enqueue(&nd_tbl, idev->nd_parms, n);
goto out;
}
- } else
+ } else {
+ SKB_DR_SET(reason, IPV6_NDISC_NS_OTHERHOST);
goto out;
+ }
}
- is_router = !!(pneigh ? pneigh->flags & NTF_ROUTER : idev->cnf.forwarding);
+ if (is_router < 0)
+ is_router = READ_ONCE(idev->cnf.forwarding);
if (dad) {
- struct in6_addr maddr;
-
- ipv6_addr_all_nodes(&maddr);
- ndisc_send_na(dev, NULL, &maddr, &msg->target,
- is_router, 0, (ifp != NULL), 1);
+ ndisc_send_na(dev, &in6addr_linklocal_allnodes, &msg->target,
+ !!is_router, false, (ifp != NULL), true);
goto out;
}
@@ -869,22 +939,23 @@ static void ndisc_recv_ns(struct sk_buff *skb)
else
NEIGH_CACHE_STAT_INC(&nd_tbl, rcv_probes_ucast);
- /*
+ /*
* update / create cache entry
* for the source address
*/
neigh = __neigh_lookup(&nd_tbl, saddr, dev,
!inc || lladdr || !dev->addr_len);
if (neigh)
- neigh_update(neigh, lladdr, NUD_STALE,
+ ndisc_update(dev, neigh, lladdr, NUD_STALE,
NEIGH_UPDATE_F_WEAK_OVERRIDE|
- NEIGH_UPDATE_F_OVERRIDE);
- if (neigh || !dev->hard_header) {
- ndisc_send_na(dev, neigh, saddr, &msg->target,
- is_router,
- 1, (ifp != NULL && inc), inc);
+ NEIGH_UPDATE_F_OVERRIDE,
+ NDISC_NEIGHBOUR_SOLICITATION, &ndopts);
+ if (neigh || !dev->header_ops) {
+ ndisc_send_na(dev, saddr, &msg->target, !!is_router,
+ true, (ifp != NULL && inc), inc);
if (neigh)
neigh_release(neigh);
+ reason = SKB_CONSUMED;
}
out:
@@ -892,76 +963,131 @@ out:
in6_ifa_put(ifp);
else
in6_dev_put(idev);
+ return reason;
+}
- return;
+static int accept_untracked_na(struct net_device *dev, struct in6_addr *saddr)
+{
+ struct inet6_dev *idev = __in6_dev_get(dev);
+
+ switch (READ_ONCE(idev->cnf.accept_untracked_na)) {
+ case 0: /* Don't accept untracked na (absent in neighbor cache) */
+ return 0;
+ case 1: /* Create new entries from na if currently untracked */
+ return 1;
+ case 2: /* Create new entries from untracked na only if saddr is in the
+ * same subnet as an address configured on the interface that
+ * received the na
+ */
+ return !!ipv6_chk_prefix(saddr, dev);
+ default:
+ return 0;
+ }
}
-static void ndisc_recv_na(struct sk_buff *skb)
+static enum skb_drop_reason ndisc_recv_na(struct sk_buff *skb)
{
- struct nd_msg *msg = (struct nd_msg *)skb->h.raw;
- struct in6_addr *saddr = &skb->nh.ipv6h->saddr;
- struct in6_addr *daddr = &skb->nh.ipv6h->daddr;
+ struct nd_msg *msg = (struct nd_msg *)skb_transport_header(skb);
+ struct in6_addr *saddr = &ipv6_hdr(skb)->saddr;
+ const struct in6_addr *daddr = &ipv6_hdr(skb)->daddr;
u8 *lladdr = NULL;
- u32 ndoptlen = skb->tail - msg->opt;
+ u32 ndoptlen = skb_tail_pointer(skb) - (skb_transport_header(skb) +
+ offsetof(struct nd_msg, opt));
struct ndisc_options ndopts;
struct net_device *dev = skb->dev;
+ struct inet6_dev *idev = __in6_dev_get(dev);
struct inet6_ifaddr *ifp;
struct neighbour *neigh;
+ SKB_DR(reason);
+ u8 new_state;
- if (skb->len < sizeof(struct nd_msg)) {
- ND_PRINTK2(KERN_WARNING
- "ICMPv6 NA: packet too short\n");
- return;
- }
+ if (skb->len < sizeof(struct nd_msg))
+ return SKB_DROP_REASON_PKT_TOO_SMALL;
if (ipv6_addr_is_multicast(&msg->target)) {
- ND_PRINTK2(KERN_WARNING
- "ICMPv6 NA: target address is multicast.\n");
- return;
+ net_dbg_ratelimited("NA: target address is multicast\n");
+ return reason;
}
if (ipv6_addr_is_multicast(daddr) &&
msg->icmph.icmp6_solicited) {
- ND_PRINTK2(KERN_WARNING
- "ICMPv6 NA: solicited NA is multicasted.\n");
- return;
- }
-
- if (!ndisc_parse_options(msg->opt, ndoptlen, &ndopts)) {
- ND_PRINTK2(KERN_WARNING
- "ICMPv6 NS: invalid ND option\n");
- return;
+ net_dbg_ratelimited("NA: solicited NA is multicasted\n");
+ return reason;
}
+
+ /* For some 802.11 wireless deployments (and possibly other networks),
+ * there will be a NA proxy and unsolicitd packets are attacks
+ * and thus should not be accepted.
+ * drop_unsolicited_na takes precedence over accept_untracked_na
+ */
+ if (!msg->icmph.icmp6_solicited && idev &&
+ READ_ONCE(idev->cnf.drop_unsolicited_na))
+ return reason;
+
+ if (!ndisc_parse_options(dev, msg->opt, ndoptlen, &ndopts))
+ return SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS;
+
if (ndopts.nd_opts_tgt_lladdr) {
lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, dev);
if (!lladdr) {
- ND_PRINTK2(KERN_WARNING
- "ICMPv6 NA: invalid link-layer address length\n");
- return;
+ net_dbg_ratelimited("NA: invalid link-layer address length\n");
+ return reason;
}
}
- if ((ifp = ipv6_get_ifaddr(&msg->target, dev, 1))) {
- if (ifp->flags & IFA_F_TENTATIVE) {
- addrconf_dad_failure(ifp);
- return;
+ ifp = ipv6_get_ifaddr(dev_net(dev), &msg->target, dev, 1);
+ if (ifp) {
+ if (skb->pkt_type != PACKET_LOOPBACK
+ && (ifp->flags & IFA_F_TENTATIVE)) {
+ addrconf_dad_failure(skb, ifp);
+ return reason;
}
/* What should we make now? The advertisement
is invalid, but ndisc specs say nothing
about it. It could be misconfiguration, or
an smart proxy agent tries to help us :-)
+
+ We should not print the error if NA has been
+ received from loopback - it is just our own
+ unsolicited advertisement.
*/
- ND_PRINTK1(KERN_WARNING
- "ICMPv6 NA: someone advertises our address on %s!\n",
- ifp->idev->dev->name);
+ if (skb->pkt_type != PACKET_LOOPBACK)
+ net_warn_ratelimited("NA: %pM advertised our address %pI6c on %s!\n",
+ eth_hdr(skb)->h_source, &ifp->addr,
+ ifp->idev->dev->name);
in6_ifa_put(ifp);
- return;
+ return reason;
}
+
neigh = neigh_lookup(&nd_tbl, &msg->target, dev);
- if (neigh) {
+ /* RFC 9131 updates original Neighbour Discovery RFC 4861.
+ * NAs with Target LL Address option without a corresponding
+ * entry in the neighbour cache can now create a STALE neighbour
+ * cache entry on routers.
+ *
+ * entry accept fwding solicited behaviour
+ * ------- ------ ------ --------- ----------------------
+ * present X X 0 Set state to STALE
+ * present X X 1 Set state to REACHABLE
+ * absent 0 X X Do nothing
+ * absent 1 0 X Do nothing
+ * absent 1 1 X Add a new STALE entry
+ *
+ * Note that we don't do a (daddr == all-routers-mcast) check.
+ */
+ new_state = msg->icmph.icmp6_solicited ? NUD_REACHABLE : NUD_STALE;
+ if (!neigh && lladdr && idev && READ_ONCE(idev->cnf.forwarding)) {
+ if (accept_untracked_na(dev, saddr)) {
+ neigh = neigh_create(&nd_tbl, &msg->target, dev);
+ new_state = NUD_STALE;
+ }
+ }
+
+ if (neigh && !IS_ERR(neigh)) {
u8 old_flags = neigh->flags;
+ struct net *net = dev_net(dev);
- if (neigh->nud_state & NUD_FAILED)
+ if (READ_ONCE(neigh->nud_state) & NUD_FAILED)
goto out;
/*
@@ -970,56 +1096,56 @@ static void ndisc_recv_na(struct sk_buff *skb)
* has already sent a NA to us.
*/
if (lladdr && !memcmp(lladdr, dev->dev_addr, dev->addr_len) &&
- ipv6_devconf.forwarding && ipv6_devconf.proxy_ndp &&
- pneigh_lookup(&nd_tbl, &msg->target, dev, 0)) {
- /* XXX: idev->cnf.prixy_ndp */
+ READ_ONCE(net->ipv6.devconf_all->forwarding) &&
+ READ_ONCE(net->ipv6.devconf_all->proxy_ndp) &&
+ pneigh_lookup(&nd_tbl, net, &msg->target, dev)) {
+ /* XXX: idev->cnf.proxy_ndp */
goto out;
}
- neigh_update(neigh, lladdr,
- msg->icmph.icmp6_solicited ? NUD_REACHABLE : NUD_STALE,
+ ndisc_update(dev, neigh, lladdr,
+ new_state,
NEIGH_UPDATE_F_WEAK_OVERRIDE|
(msg->icmph.icmp6_override ? NEIGH_UPDATE_F_OVERRIDE : 0)|
NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
- (msg->icmph.icmp6_router ? NEIGH_UPDATE_F_ISROUTER : 0));
+ (msg->icmph.icmp6_router ? NEIGH_UPDATE_F_ISROUTER : 0),
+ NDISC_NEIGHBOUR_ADVERTISEMENT, &ndopts);
if ((old_flags & ~neigh->flags) & NTF_ROUTER) {
/*
* Change: router to host
*/
- struct rt6_info *rt;
- rt = rt6_get_dflt_router(saddr, dev);
- if (rt)
- ip6_del_rt(rt);
+ rt6_clean_tohost(dev_net(dev), saddr);
}
-
+ reason = SKB_CONSUMED;
out:
neigh_release(neigh);
}
+ return reason;
}
-static void ndisc_recv_rs(struct sk_buff *skb)
+static enum skb_drop_reason ndisc_recv_rs(struct sk_buff *skb)
{
- struct rs_msg *rs_msg = (struct rs_msg *) skb->h.raw;
+ struct rs_msg *rs_msg = (struct rs_msg *)skb_transport_header(skb);
unsigned long ndoptlen = skb->len - sizeof(*rs_msg);
struct neighbour *neigh;
struct inet6_dev *idev;
- struct in6_addr *saddr = &skb->nh.ipv6h->saddr;
+ const struct in6_addr *saddr = &ipv6_hdr(skb)->saddr;
struct ndisc_options ndopts;
u8 *lladdr = NULL;
+ SKB_DR(reason);
if (skb->len < sizeof(*rs_msg))
- return;
+ return SKB_DROP_REASON_PKT_TOO_SMALL;
- idev = in6_dev_get(skb->dev);
+ idev = __in6_dev_get(skb->dev);
if (!idev) {
- if (net_ratelimit())
- ND_PRINTK1("ICMP6 RS: can't find in6 device\n");
- return;
+ net_err_ratelimited("RS: can't find in6 device\n");
+ return reason;
}
/* Don't accept RS if we're not in router mode */
- if (!idev->cnf.forwarding)
+ if (!READ_ONCE(idev->cnf.forwarding))
goto out;
/*
@@ -1030,11 +1156,8 @@ static void ndisc_recv_rs(struct sk_buff *skb)
goto out;
/* Parse ND options */
- if (!ndisc_parse_options(rs_msg->opt, ndoptlen, &ndopts)) {
- if (net_ratelimit())
- ND_PRINTK2("ICMP6 NS: invalid ND option, ignored\n");
- goto out;
- }
+ if (!ndisc_parse_options(skb->dev, rs_msg->opt, ndoptlen, &ndopts))
+ return SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS;
if (ndopts.nd_opts_src_lladdr) {
lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr,
@@ -1045,64 +1168,124 @@ static void ndisc_recv_rs(struct sk_buff *skb)
neigh = __neigh_lookup(&nd_tbl, saddr, skb->dev, 1);
if (neigh) {
- neigh_update(neigh, lladdr, NUD_STALE,
+ ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
NEIGH_UPDATE_F_WEAK_OVERRIDE|
NEIGH_UPDATE_F_OVERRIDE|
- NEIGH_UPDATE_F_OVERRIDE_ISROUTER);
+ NEIGH_UPDATE_F_OVERRIDE_ISROUTER,
+ NDISC_ROUTER_SOLICITATION, &ndopts);
neigh_release(neigh);
+ reason = SKB_CONSUMED;
}
out:
- in6_dev_put(idev);
+ return reason;
}
-static void ndisc_router_discovery(struct sk_buff *skb)
+static void ndisc_ra_useropt(struct sk_buff *ra, struct nd_opt_hdr *opt)
{
- struct ra_msg *ra_msg = (struct ra_msg *) skb->h.raw;
+ struct icmp6hdr *icmp6h = (struct icmp6hdr *)skb_transport_header(ra);
+ struct sk_buff *skb;
+ struct nlmsghdr *nlh;
+ struct nduseroptmsg *ndmsg;
+ struct net *net = dev_net(ra->dev);
+ int err;
+ int base_size = NLMSG_ALIGN(sizeof(struct nduseroptmsg)
+ + (opt->nd_opt_len << 3));
+ size_t msg_size = base_size + nla_total_size(sizeof(struct in6_addr));
+
+ skb = nlmsg_new(msg_size, GFP_ATOMIC);
+ if (!skb) {
+ err = -ENOBUFS;
+ goto errout;
+ }
+
+ nlh = nlmsg_put(skb, 0, 0, RTM_NEWNDUSEROPT, base_size, 0);
+ if (!nlh) {
+ goto nla_put_failure;
+ }
+
+ ndmsg = nlmsg_data(nlh);
+ ndmsg->nduseropt_family = AF_INET6;
+ ndmsg->nduseropt_ifindex = ra->dev->ifindex;
+ ndmsg->nduseropt_icmp_type = icmp6h->icmp6_type;
+ ndmsg->nduseropt_icmp_code = icmp6h->icmp6_code;
+ ndmsg->nduseropt_opts_len = opt->nd_opt_len << 3;
+
+ memcpy(ndmsg + 1, opt, opt->nd_opt_len << 3);
+
+ if (nla_put_in6_addr(skb, NDUSEROPT_SRCADDR, &ipv6_hdr(ra)->saddr))
+ goto nla_put_failure;
+ nlmsg_end(skb, nlh);
+
+ rtnl_notify(skb, net, 0, RTNLGRP_ND_USEROPT, NULL, GFP_ATOMIC);
+ return;
+
+nla_put_failure:
+ nlmsg_free(skb);
+ err = -EMSGSIZE;
+errout:
+ rtnl_set_sk_err(net, RTNLGRP_ND_USEROPT, err);
+}
+
+static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb)
+{
+ struct ra_msg *ra_msg = (struct ra_msg *)skb_transport_header(skb);
+ bool send_ifinfo_notify = false;
struct neighbour *neigh = NULL;
+ struct ndisc_options ndopts;
+ struct fib6_info *rt = NULL;
struct inet6_dev *in6_dev;
- struct rt6_info *rt = NULL;
+ struct fib6_table *table;
+ u32 defrtr_usr_metric;
+ unsigned int pref = 0;
+ __u32 old_if_flags;
+ struct net *net;
+ SKB_DR(reason);
int lifetime;
- struct ndisc_options ndopts;
int optlen;
- unsigned int pref = 0;
- __u8 * opt = (__u8 *)(ra_msg + 1);
+ __u8 *opt = (__u8 *)(ra_msg + 1);
- optlen = (skb->tail - skb->h.raw) - sizeof(struct ra_msg);
+ optlen = (skb_tail_pointer(skb) - skb_transport_header(skb)) -
+ sizeof(struct ra_msg);
- if (!(ipv6_addr_type(&skb->nh.ipv6h->saddr) & IPV6_ADDR_LINKLOCAL)) {
- ND_PRINTK2(KERN_WARNING
- "ICMPv6 RA: source address is not link-local.\n");
- return;
- }
- if (optlen < 0) {
- ND_PRINTK2(KERN_WARNING
- "ICMPv6 RA: packet too short\n");
- return;
+ net_dbg_ratelimited("RA: %s, dev: %s\n", __func__, skb->dev->name);
+ if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) {
+ net_dbg_ratelimited("RA: source address is not link-local\n");
+ return reason;
}
+ if (optlen < 0)
+ return SKB_DROP_REASON_PKT_TOO_SMALL;
- /*
- * set the RA_RECV flag in the interface
- */
+#ifdef CONFIG_IPV6_NDISC_NODETYPE
+ if (skb->ndisc_nodetype == NDISC_NODETYPE_HOST) {
+ net_dbg_ratelimited("RA: from host or unauthorized router\n");
+ return reason;
+ }
+#endif
- in6_dev = in6_dev_get(skb->dev);
- if (in6_dev == NULL) {
- ND_PRINTK0(KERN_ERR
- "ICMPv6 RA: can't find inet6 device for %s.\n",
- skb->dev->name);
- return;
+ in6_dev = __in6_dev_get(skb->dev);
+ if (!in6_dev) {
+ net_err_ratelimited("RA: can't find inet6 device for %s\n", skb->dev->name);
+ return reason;
}
- if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_ra) {
- in6_dev_put(in6_dev);
- return;
+
+ if (!ndisc_parse_options(skb->dev, opt, optlen, &ndopts))
+ return SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS;
+
+ if (!ipv6_accept_ra(in6_dev)) {
+ net_dbg_ratelimited("RA: %s, did not accept ra for dev: %s\n", __func__,
+ skb->dev->name);
+ goto skip_linkparms;
}
- if (!ndisc_parse_options(opt, optlen, &ndopts)) {
- in6_dev_put(in6_dev);
- ND_PRINTK2(KERN_WARNING
- "ICMP6 RA: invalid ND options\n");
- return;
+#ifdef CONFIG_IPV6_NDISC_NODETYPE
+ /* skip link-specific parameters from interior routers */
+ if (skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT) {
+ net_dbg_ratelimited("RA: %s, nodetype is NODEFAULT, dev: %s\n", __func__,
+ skb->dev->name);
+ goto skip_linkparms;
}
+#endif
if (in6_dev->if_flags & IF_RS_SENT) {
/*
@@ -1116,6 +1299,7 @@ static void ndisc_router_discovery(struct sk_buff *skb)
* Remember the managed/otherconf flags from most recently
* received RA message (RFC 2462) -- yoshfuji
*/
+ old_if_flags = in6_dev->if_flags;
in6_dev->if_flags = (in6_dev->if_flags & ~(IF_RA_MANAGED |
IF_RA_OTHERCONF)) |
(ra_msg->icmph.icmp6_addrconf_managed ?
@@ -1123,64 +1307,116 @@ static void ndisc_router_discovery(struct sk_buff *skb)
(ra_msg->icmph.icmp6_addrconf_other ?
IF_RA_OTHERCONF : 0);
- if (!in6_dev->cnf.accept_ra_defrtr)
+ if (old_if_flags != in6_dev->if_flags)
+ send_ifinfo_notify = true;
+
+ if (!READ_ONCE(in6_dev->cnf.accept_ra_defrtr)) {
+ net_dbg_ratelimited("RA: %s, defrtr is false for dev: %s\n", __func__,
+ skb->dev->name);
goto skip_defrtr;
+ }
lifetime = ntohs(ra_msg->icmph.icmp6_rt_lifetime);
+ if (lifetime != 0 &&
+ lifetime < READ_ONCE(in6_dev->cnf.accept_ra_min_lft)) {
+ net_dbg_ratelimited("RA: router lifetime (%ds) is too short: %s\n", lifetime,
+ skb->dev->name);
+ goto skip_defrtr;
+ }
+
+ /* Do not accept RA with source-addr found on local machine unless
+ * accept_ra_from_local is set to true.
+ */
+ net = dev_net(in6_dev->dev);
+ if (!READ_ONCE(in6_dev->cnf.accept_ra_from_local) &&
+ ipv6_chk_addr(net, &ipv6_hdr(skb)->saddr, in6_dev->dev, 0)) {
+ net_dbg_ratelimited("RA from local address detected on dev: %s: default router ignored\n",
+ skb->dev->name);
+ goto skip_defrtr;
+ }
#ifdef CONFIG_IPV6_ROUTER_PREF
pref = ra_msg->icmph.icmp6_router_pref;
/* 10b is handled as if it were 00b (medium) */
if (pref == ICMPV6_ROUTER_PREF_INVALID ||
- in6_dev->cnf.accept_ra_rtr_pref)
+ !READ_ONCE(in6_dev->cnf.accept_ra_rtr_pref))
pref = ICMPV6_ROUTER_PREF_MEDIUM;
#endif
-
- rt = rt6_get_dflt_router(&skb->nh.ipv6h->saddr, skb->dev);
-
- if (rt)
- neigh = rt->rt6i_nexthop;
-
- if (rt && lifetime == 0) {
- neigh_clone(neigh);
- ip6_del_rt(rt);
+ /* routes added from RAs do not use nexthop objects */
+ rt = rt6_get_dflt_router(net, &ipv6_hdr(skb)->saddr, skb->dev);
+ if (rt) {
+ neigh = ip6_neigh_lookup(&rt->fib6_nh->fib_nh_gw6,
+ rt->fib6_nh->fib_nh_dev, NULL,
+ &ipv6_hdr(skb)->saddr);
+ if (!neigh) {
+ net_err_ratelimited("RA: %s got default router without neighbour\n",
+ __func__);
+ fib6_info_release(rt);
+ return reason;
+ }
+ }
+ /* Set default route metric as specified by user */
+ defrtr_usr_metric = in6_dev->cnf.ra_defrtr_metric;
+ /* delete the route if lifetime is 0 or if metric needs change */
+ if (rt && (lifetime == 0 || rt->fib6_metric != defrtr_usr_metric)) {
+ ip6_del_rt(net, rt, false);
rt = NULL;
}
- if (rt == NULL && lifetime) {
- ND_PRINTK3(KERN_DEBUG
- "ICMPv6 RA: adding default router.\n");
+ net_dbg_ratelimited("RA: rt: %p lifetime: %d, metric: %d, for dev: %s\n", rt, lifetime,
+ defrtr_usr_metric, skb->dev->name);
+ if (!rt && lifetime) {
+ net_dbg_ratelimited("RA: adding default router\n");
- rt = rt6_add_dflt_router(&skb->nh.ipv6h->saddr, skb->dev, pref);
- if (rt == NULL) {
- ND_PRINTK0(KERN_ERR
- "ICMPv6 RA: %s() failed to add default route.\n",
- __FUNCTION__);
- in6_dev_put(in6_dev);
- return;
+ if (neigh)
+ neigh_release(neigh);
+
+ rt = rt6_add_dflt_router(net, &ipv6_hdr(skb)->saddr,
+ skb->dev, pref, defrtr_usr_metric,
+ lifetime);
+ if (!rt) {
+ net_err_ratelimited("RA: %s failed to add default route\n", __func__);
+ return reason;
}
- neigh = rt->rt6i_nexthop;
- if (neigh == NULL) {
- ND_PRINTK0(KERN_ERR
- "ICMPv6 RA: %s() got default router without neighbour.\n",
- __FUNCTION__);
- dst_release(&rt->u.dst);
- in6_dev_put(in6_dev);
- return;
+ neigh = ip6_neigh_lookup(&rt->fib6_nh->fib_nh_gw6,
+ rt->fib6_nh->fib_nh_dev, NULL,
+ &ipv6_hdr(skb)->saddr);
+ if (!neigh) {
+ net_err_ratelimited("RA: %s got default router without neighbour\n",
+ __func__);
+ fib6_info_release(rt);
+ return reason;
}
neigh->flags |= NTF_ROUTER;
- } else if (rt) {
- rt->rt6i_flags |= (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
- }
-
- if (rt)
- rt->rt6i_expires = jiffies + (HZ * lifetime);
-
- if (ra_msg->icmph.icmp6_hop_limit) {
- in6_dev->cnf.hop_limit = ra_msg->icmph.icmp6_hop_limit;
- if (rt)
- rt->u.dst.metrics[RTAX_HOPLIMIT-1] = ra_msg->icmph.icmp6_hop_limit;
+ } else if (rt && IPV6_EXTRACT_PREF(rt->fib6_flags) != pref) {
+ struct nl_info nlinfo = {
+ .nl_net = net,
+ };
+ rt->fib6_flags = (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
+ inet6_rt_notify(RTM_NEWROUTE, rt, &nlinfo, NLM_F_REPLACE);
+ }
+
+ if (rt) {
+ table = rt->fib6_table;
+ spin_lock_bh(&table->tb6_lock);
+
+ fib6_set_expires(rt, jiffies + (HZ * lifetime));
+ fib6_add_gc_list(rt);
+
+ spin_unlock_bh(&table->tb6_lock);
+ }
+ if (READ_ONCE(in6_dev->cnf.accept_ra_min_hop_limit) < 256 &&
+ ra_msg->icmph.icmp6_hop_limit) {
+ if (READ_ONCE(in6_dev->cnf.accept_ra_min_hop_limit) <=
+ ra_msg->icmph.icmp6_hop_limit) {
+ WRITE_ONCE(in6_dev->cnf.hop_limit,
+ ra_msg->icmph.icmp6_hop_limit);
+ fib6_metric_set(rt, RTAX_HOPLIMIT,
+ ra_msg->icmph.icmp6_hop_limit);
+ } else {
+ net_dbg_ratelimited("RA: Got route advertisement with lower hop_limit than minimum\n");
+ }
}
skip_defrtr:
@@ -1194,11 +1430,11 @@ skip_defrtr:
if (rtime && rtime/1000 < MAX_SCHEDULE_TIMEOUT/HZ) {
rtime = (rtime*HZ)/1000;
- if (rtime < HZ/10)
- rtime = HZ/10;
- in6_dev->nd_parms->retrans_time = rtime;
+ if (rtime < HZ/100)
+ rtime = HZ/100;
+ NEIGH_VAR_SET(in6_dev->nd_parms, RETRANS_TIME, rtime);
in6_dev->tstamp = jiffies;
- inet6_ifinfo_notify(RTM_NEWLINK, in6_dev);
+ send_ifinfo_notify = true;
}
rtime = ntohl(ra_msg->reachable_time);
@@ -1208,22 +1444,26 @@ skip_defrtr:
if (rtime < HZ/10)
rtime = HZ/10;
- if (rtime != in6_dev->nd_parms->base_reachable_time) {
- in6_dev->nd_parms->base_reachable_time = rtime;
- in6_dev->nd_parms->gc_staletime = 3 * rtime;
- in6_dev->nd_parms->reachable_time = neigh_rand_reach_time(rtime);
+ if (rtime != NEIGH_VAR(in6_dev->nd_parms, BASE_REACHABLE_TIME)) {
+ NEIGH_VAR_SET(in6_dev->nd_parms,
+ BASE_REACHABLE_TIME, rtime);
+ NEIGH_VAR_SET(in6_dev->nd_parms,
+ GC_STALETIME, 3 * rtime);
+ neigh_set_reach_time(in6_dev->nd_parms);
in6_dev->tstamp = jiffies;
- inet6_ifinfo_notify(RTM_NEWLINK, in6_dev);
+ send_ifinfo_notify = true;
}
}
}
+skip_linkparms:
+
/*
* Process options.
*/
if (!neigh)
- neigh = __neigh_lookup(&nd_tbl, &skb->nh.ipv6h->saddr,
+ neigh = __neigh_lookup(&nd_tbl, &ipv6_hdr(skb)->saddr,
skb->dev, 1);
if (neigh) {
u8 *lladdr = NULL;
@@ -1231,369 +1471,428 @@ skip_defrtr:
lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr,
skb->dev);
if (!lladdr) {
- ND_PRINTK2(KERN_WARNING
- "ICMPv6 RA: invalid link-layer address length\n");
+ net_dbg_ratelimited("RA: invalid link-layer address length\n");
goto out;
}
}
- neigh_update(neigh, lladdr, NUD_STALE,
+ ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
NEIGH_UPDATE_F_WEAK_OVERRIDE|
NEIGH_UPDATE_F_OVERRIDE|
NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
- NEIGH_UPDATE_F_ISROUTER);
+ NEIGH_UPDATE_F_ISROUTER,
+ NDISC_ROUTER_ADVERTISEMENT, &ndopts);
+ reason = SKB_CONSUMED;
+ }
+
+ if (!ipv6_accept_ra(in6_dev)) {
+ net_dbg_ratelimited("RA: %s, accept_ra is false for dev: %s\n", __func__,
+ skb->dev->name);
+ goto out;
}
#ifdef CONFIG_IPV6_ROUTE_INFO
- if (in6_dev->cnf.accept_ra_rtr_pref && ndopts.nd_opts_ri) {
+ if (!READ_ONCE(in6_dev->cnf.accept_ra_from_local) &&
+ ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr,
+ in6_dev->dev, 0)) {
+ net_dbg_ratelimited("RA from local address detected on dev: %s: router info ignored.\n",
+ skb->dev->name);
+ goto skip_routeinfo;
+ }
+
+ if (READ_ONCE(in6_dev->cnf.accept_ra_rtr_pref) && ndopts.nd_opts_ri) {
struct nd_opt_hdr *p;
for (p = ndopts.nd_opts_ri;
p;
p = ndisc_next_option(p, ndopts.nd_opts_ri_end)) {
- if (((struct route_info *)p)->prefix_len > in6_dev->cnf.accept_ra_rt_info_max_plen)
+ struct route_info *ri = (struct route_info *)p;
+#ifdef CONFIG_IPV6_NDISC_NODETYPE
+ if (skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT &&
+ ri->prefix_len == 0)
continue;
- rt6_route_rcv(skb->dev, (u8*)p, (p->nd_opt_len) << 3,
- &skb->nh.ipv6h->saddr);
+#endif
+ if (ri->prefix_len == 0 &&
+ !READ_ONCE(in6_dev->cnf.accept_ra_defrtr))
+ continue;
+ if (ri->lifetime != 0 &&
+ ntohl(ri->lifetime) < READ_ONCE(in6_dev->cnf.accept_ra_min_lft))
+ continue;
+ if (ri->prefix_len < READ_ONCE(in6_dev->cnf.accept_ra_rt_info_min_plen))
+ continue;
+ if (ri->prefix_len > READ_ONCE(in6_dev->cnf.accept_ra_rt_info_max_plen))
+ continue;
+ rt6_route_rcv(skb->dev, (u8 *)p, (p->nd_opt_len) << 3,
+ &ipv6_hdr(skb)->saddr);
}
}
+
+skip_routeinfo:
#endif
- if (in6_dev->cnf.accept_ra_pinfo && ndopts.nd_opts_pi) {
+#ifdef CONFIG_IPV6_NDISC_NODETYPE
+ /* skip link-specific ndopts from interior routers */
+ if (skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT) {
+ net_dbg_ratelimited("RA: %s, nodetype is NODEFAULT (interior routes), dev: %s\n",
+ __func__, skb->dev->name);
+ goto out;
+ }
+#endif
+
+ if (READ_ONCE(in6_dev->cnf.accept_ra_pinfo) && ndopts.nd_opts_pi) {
struct nd_opt_hdr *p;
for (p = ndopts.nd_opts_pi;
p;
p = ndisc_next_option(p, ndopts.nd_opts_pi_end)) {
- addrconf_prefix_rcv(skb->dev, (u8*)p, (p->nd_opt_len) << 3);
+ addrconf_prefix_rcv(skb->dev, (u8 *)p,
+ (p->nd_opt_len) << 3,
+ ndopts.nd_opts_src_lladdr != NULL);
}
}
- if (ndopts.nd_opts_mtu) {
+ if (ndopts.nd_opts_mtu && READ_ONCE(in6_dev->cnf.accept_ra_mtu)) {
__be32 n;
u32 mtu;
- memcpy(&n, ((u8*)(ndopts.nd_opts_mtu+1))+2, sizeof(mtu));
+ memcpy(&n, ((u8 *)(ndopts.nd_opts_mtu+1))+2, sizeof(mtu));
mtu = ntohl(n);
- if (mtu < IPV6_MIN_MTU || mtu > skb->dev->mtu) {
- ND_PRINTK2(KERN_WARNING
- "ICMPv6 RA: invalid mtu: %d\n",
- mtu);
- } else if (in6_dev->cnf.mtu6 != mtu) {
- in6_dev->cnf.mtu6 = mtu;
-
- if (rt)
- rt->u.dst.metrics[RTAX_MTU-1] = mtu;
+ if (in6_dev->ra_mtu != mtu) {
+ in6_dev->ra_mtu = mtu;
+ send_ifinfo_notify = true;
+ }
+ if (mtu < IPV6_MIN_MTU || mtu > skb->dev->mtu) {
+ net_dbg_ratelimited("RA: invalid mtu: %d\n", mtu);
+ } else if (READ_ONCE(in6_dev->cnf.mtu6) != mtu) {
+ WRITE_ONCE(in6_dev->cnf.mtu6, mtu);
+ fib6_metric_set(rt, RTAX_MTU, mtu);
rt6_mtu_change(skb->dev, mtu);
}
}
-
+
+ if (ndopts.nd_useropts) {
+ struct nd_opt_hdr *p;
+ for (p = ndopts.nd_useropts;
+ p;
+ p = ndisc_next_useropt(skb->dev, p,
+ ndopts.nd_useropts_end)) {
+ ndisc_ra_useropt(skb, p);
+ }
+ }
+
if (ndopts.nd_opts_tgt_lladdr || ndopts.nd_opts_rh) {
- ND_PRINTK2(KERN_WARNING
- "ICMPv6 RA: invalid RA options");
+ net_dbg_ratelimited("RA: invalid RA options\n");
}
out:
- if (rt)
- dst_release(&rt->u.dst);
- else if (neigh)
+ /* Send a notify if RA changed managed/otherconf flags or
+ * timer settings or ra_mtu value
+ */
+ if (send_ifinfo_notify)
+ inet6_ifinfo_notify(RTM_NEWLINK, in6_dev);
+
+ fib6_info_release(rt);
+ if (neigh)
neigh_release(neigh);
- in6_dev_put(in6_dev);
+ return reason;
}
-static void ndisc_redirect_rcv(struct sk_buff *skb)
+static enum skb_drop_reason ndisc_redirect_rcv(struct sk_buff *skb)
{
- struct inet6_dev *in6_dev;
- struct icmp6hdr *icmph;
- struct in6_addr *dest;
- struct in6_addr *target; /* new first hop to destination */
- struct neighbour *neigh;
- int on_link = 0;
+ struct rd_msg *msg = (struct rd_msg *)skb_transport_header(skb);
+ u32 ndoptlen = skb_tail_pointer(skb) - (skb_transport_header(skb) +
+ offsetof(struct rd_msg, opt));
struct ndisc_options ndopts;
- int optlen;
- u8 *lladdr = NULL;
+ SKB_DR(reason);
+ u8 *hdr;
- if (!(ipv6_addr_type(&skb->nh.ipv6h->saddr) & IPV6_ADDR_LINKLOCAL)) {
- ND_PRINTK2(KERN_WARNING
- "ICMPv6 Redirect: source address is not link-local.\n");
- return;
+#ifdef CONFIG_IPV6_NDISC_NODETYPE
+ switch (skb->ndisc_nodetype) {
+ case NDISC_NODETYPE_HOST:
+ case NDISC_NODETYPE_NODEFAULT:
+ net_dbg_ratelimited("Redirect: from host or unauthorized router\n");
+ return reason;
}
+#endif
- optlen = skb->tail - skb->h.raw;
- optlen -= sizeof(struct icmp6hdr) + 2 * sizeof(struct in6_addr);
-
- if (optlen < 0) {
- ND_PRINTK2(KERN_WARNING
- "ICMPv6 Redirect: packet too short\n");
- return;
+ if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) {
+ net_dbg_ratelimited("Redirect: source address is not link-local\n");
+ return reason;
}
- icmph = (struct icmp6hdr *) skb->h.raw;
- target = (struct in6_addr *) (icmph + 1);
- dest = target + 1;
+ if (!ndisc_parse_options(skb->dev, msg->opt, ndoptlen, &ndopts))
+ return SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS;
- if (ipv6_addr_is_multicast(dest)) {
- ND_PRINTK2(KERN_WARNING
- "ICMPv6 Redirect: destination address is multicast.\n");
- return;
+ if (!ndopts.nd_opts_rh) {
+ ip6_redirect_no_header(skb, dev_net(skb->dev),
+ skb->dev->ifindex);
+ return reason;
}
- if (ipv6_addr_equal(dest, target)) {
- on_link = 1;
- } else if (!(ipv6_addr_type(target) & IPV6_ADDR_LINKLOCAL)) {
- ND_PRINTK2(KERN_WARNING
- "ICMPv6 Redirect: target address is not link-local.\n");
- return;
- }
+ hdr = (u8 *)ndopts.nd_opts_rh;
+ hdr += 8;
+ if (!pskb_pull(skb, hdr - skb_transport_header(skb)))
+ return SKB_DROP_REASON_PKT_TOO_SMALL;
- in6_dev = in6_dev_get(skb->dev);
- if (!in6_dev)
- return;
- if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects) {
- in6_dev_put(in6_dev);
- return;
- }
+ return icmpv6_notify(skb, NDISC_REDIRECT, 0, 0);
+}
- /* RFC2461 8.1:
- * The IP source address of the Redirect MUST be the same as the current
- * first-hop router for the specified ICMP Destination Address.
- */
-
- if (!ndisc_parse_options((u8*)(dest + 1), optlen, &ndopts)) {
- ND_PRINTK2(KERN_WARNING
- "ICMPv6 Redirect: invalid ND options\n");
- in6_dev_put(in6_dev);
- return;
- }
- if (ndopts.nd_opts_tgt_lladdr) {
- lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
- skb->dev);
- if (!lladdr) {
- ND_PRINTK2(KERN_WARNING
- "ICMPv6 Redirect: invalid link-layer address length\n");
- in6_dev_put(in6_dev);
- return;
- }
- }
+static void ndisc_fill_redirect_hdr_option(struct sk_buff *skb,
+ struct sk_buff *orig_skb,
+ int rd_len)
+{
+ u8 *opt = skb_put(skb, rd_len);
- neigh = __neigh_lookup(&nd_tbl, target, skb->dev, 1);
- if (neigh) {
- rt6_redirect(dest, &skb->nh.ipv6h->daddr,
- &skb->nh.ipv6h->saddr, neigh, lladdr,
- on_link);
- neigh_release(neigh);
- }
- in6_dev_put(in6_dev);
+ memset(opt, 0, 8);
+ *(opt++) = ND_OPT_REDIRECT_HDR;
+ *(opt++) = (rd_len >> 3);
+ opt += 6;
+
+ skb_copy_bits(orig_skb, skb_network_offset(orig_skb), opt,
+ rd_len - 8);
}
-void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh,
- struct in6_addr *target)
+void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target)
{
- struct sock *sk = ndisc_socket->sk;
- int len = sizeof(struct icmp6hdr) + 2 * sizeof(struct in6_addr);
+ struct net_device *dev = skb->dev;
+ struct net *net = dev_net_rcu(dev);
+ struct sock *sk = net->ipv6.ndisc_sk;
+ int optlen = 0;
+ struct inet_peer *peer;
struct sk_buff *buff;
- struct icmp6hdr *icmph;
+ struct rd_msg *msg;
struct in6_addr saddr_buf;
- struct in6_addr *addrp;
- struct net_device *dev;
struct rt6_info *rt;
struct dst_entry *dst;
- struct inet6_dev *idev;
- struct flowi fl;
- u8 *opt;
+ struct flowi6 fl6;
int rd_len;
- int err;
- int hlen;
- u8 ha_buf[MAX_ADDR_LEN], *ha = NULL;
-
- dev = skb->dev;
+ u8 ha_buf[MAX_ADDR_LEN], *ha = NULL,
+ ops_data_buf[NDISC_OPS_REDIRECT_DATA_SPACE], *ops_data = NULL;
+ bool ret;
- if (ipv6_get_lladdr(dev, &saddr_buf)) {
- ND_PRINTK2(KERN_WARNING
- "ICMPv6 Redirect: no link-local address on %s\n",
- dev->name);
- return;
- }
-
- ndisc_flow_init(&fl, NDISC_REDIRECT, &saddr_buf, &skb->nh.ipv6h->saddr,
- dev->ifindex);
+ if (netif_is_l3_master(dev)) {
+ dev = dev_get_by_index_rcu(net, IPCB(skb)->iif);
+ if (!dev)
+ return;
+ }
- dst = ip6_route_output(NULL, &fl);
- if (dst == NULL)
+ if (ipv6_get_lladdr(dev, &saddr_buf, IFA_F_TENTATIVE)) {
+ net_dbg_ratelimited("Redirect: no link-local address on %s\n", dev->name);
return;
+ }
- err = xfrm_lookup(&dst, &fl, NULL, 0);
- if (err)
+ if (!ipv6_addr_equal(&ipv6_hdr(skb)->daddr, target) &&
+ ipv6_addr_type(target) != (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
+ net_dbg_ratelimited("Redirect: target address is not link-local unicast\n");
return;
+ }
- rt = (struct rt6_info *) dst;
+ icmpv6_flow_init(sk, &fl6, NDISC_REDIRECT,
+ &saddr_buf, &ipv6_hdr(skb)->saddr, dev->ifindex);
- if (rt->rt6i_flags & RTF_GATEWAY) {
- ND_PRINTK2(KERN_WARNING
- "ICMPv6 Redirect: destination is not a neighbour.\n");
+ dst = ip6_route_output(net, NULL, &fl6);
+ if (dst->error) {
dst_release(dst);
return;
}
- if (!xrlim_allow(dst, 1*HZ)) {
- dst_release(dst);
+ dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0);
+ if (IS_ERR(dst))
return;
+
+ rt = dst_rt6_info(dst);
+
+ if (rt->rt6i_flags & RTF_GATEWAY) {
+ net_dbg_ratelimited("Redirect: destination is not a neighbour\n");
+ goto release;
}
+ peer = inet_getpeer_v6(net->ipv6.peers, &ipv6_hdr(skb)->saddr);
+ ret = inet_peer_xrlim_allow(peer, 1*HZ);
+
+ if (!ret)
+ goto release;
+
if (dev->addr_len) {
+ struct neighbour *neigh = dst_neigh_lookup(skb_dst(skb), target);
+ if (!neigh) {
+ net_dbg_ratelimited("Redirect: no neigh for target address\n");
+ goto release;
+ }
+
read_lock_bh(&neigh->lock);
if (neigh->nud_state & NUD_VALID) {
memcpy(ha_buf, neigh->ha, dev->addr_len);
read_unlock_bh(&neigh->lock);
ha = ha_buf;
- len += ndisc_opt_addr_space(dev);
+ optlen += ndisc_redirect_opt_addr_space(dev, neigh,
+ ops_data_buf,
+ &ops_data);
} else
read_unlock_bh(&neigh->lock);
+
+ neigh_release(neigh);
}
rd_len = min_t(unsigned int,
- IPV6_MIN_MTU-sizeof(struct ipv6hdr)-len, skb->len + 8);
+ IPV6_MIN_MTU - sizeof(struct ipv6hdr) - sizeof(*msg) - optlen,
+ skb->len + 8);
rd_len &= ~0x7;
- len += rd_len;
-
- buff = sock_alloc_send_skb(sk,
- (MAX_HEADER + sizeof(struct ipv6hdr) +
- len + LL_RESERVED_SPACE(dev)),
- 1, &err);
- if (buff == NULL) {
- ND_PRINTK0(KERN_ERR
- "ICMPv6 Redirect: %s() failed to allocate an skb.\n",
- __FUNCTION__);
- dst_release(dst);
- return;
- }
-
- hlen = 0;
-
- skb_reserve(buff, LL_RESERVED_SPACE(dev));
- ip6_nd_hdr(sk, buff, dev, &saddr_buf, &skb->nh.ipv6h->saddr,
- IPPROTO_ICMPV6, len);
-
- icmph = (struct icmp6hdr *)skb_put(buff, len);
- buff->h.raw = (unsigned char*)icmph;
-
- memset(icmph, 0, sizeof(struct icmp6hdr));
- icmph->icmp6_type = NDISC_REDIRECT;
-
- /*
- * copy target and destination addresses
- */
-
- addrp = (struct in6_addr *)(icmph + 1);
- ipv6_addr_copy(addrp, target);
- addrp++;
- ipv6_addr_copy(addrp, &skb->nh.ipv6h->daddr);
-
- opt = (u8*) (addrp + 1);
+ optlen += rd_len;
+
+ buff = ndisc_alloc_skb(dev, sizeof(*msg) + optlen);
+ if (!buff)
+ goto release;
+
+ msg = skb_put(buff, sizeof(*msg));
+ *msg = (struct rd_msg) {
+ .icmph = {
+ .icmp6_type = NDISC_REDIRECT,
+ },
+ .target = *target,
+ .dest = ipv6_hdr(skb)->daddr,
+ };
/*
* include target_address option
*/
if (ha)
- opt = ndisc_fill_addr_option(opt, ND_OPT_TARGET_LL_ADDR, ha,
- dev->addr_len, dev->type);
+ ndisc_fill_redirect_addr_option(buff, ha, ops_data);
/*
* build redirect option and copy skb over to the new packet.
*/
- memset(opt, 0, 8);
- *(opt++) = ND_OPT_REDIRECT_HDR;
- *(opt++) = (rd_len >> 3);
- opt += 6;
+ if (rd_len)
+ ndisc_fill_redirect_hdr_option(buff, skb, rd_len);
- memcpy(opt, skb->nh.ipv6h, rd_len - 8);
+ skb_dst_set(buff, dst);
+ ndisc_send_skb(buff, &ipv6_hdr(skb)->saddr, &saddr_buf);
+ return;
- icmph->icmp6_cksum = csum_ipv6_magic(&saddr_buf, &skb->nh.ipv6h->saddr,
- len, IPPROTO_ICMPV6,
- csum_partial((u8 *) icmph, len, 0));
+release:
+ dst_release(dst);
+}
- buff->dst = dst;
- idev = in6_dev_get(dst->dev);
- IP6_INC_STATS(idev, IPSTATS_MIB_OUTREQUESTS);
- err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, buff, NULL, dst->dev, dst_output);
- if (!err) {
- ICMP6_INC_STATS(idev, ICMP6_MIB_OUTREDIRECTS);
- ICMP6_INC_STATS(idev, ICMP6_MIB_OUTMSGS);
- }
+static void pndisc_redo(struct sk_buff *skb)
+{
+ enum skb_drop_reason reason = ndisc_recv_ns(skb);
- if (likely(idev != NULL))
- in6_dev_put(idev);
+ kfree_skb_reason(skb, reason);
}
-static void pndisc_redo(struct sk_buff *skb)
+static int ndisc_is_multicast(const void *pkey)
{
- ndisc_recv_ns(skb);
- kfree_skb(skb);
+ return ipv6_addr_is_multicast((struct in6_addr *)pkey);
+}
+
+static bool ndisc_suppress_frag_ndisc(struct sk_buff *skb)
+{
+ struct inet6_dev *idev = __in6_dev_get(skb->dev);
+
+ if (!idev)
+ return true;
+ if (IP6CB(skb)->flags & IP6SKB_FRAGMENTED &&
+ READ_ONCE(idev->cnf.suppress_frag_ndisc)) {
+ net_warn_ratelimited("Received fragmented ndisc packet. Carefully consider disabling suppress_frag_ndisc.\n");
+ return true;
+ }
+ return false;
}
-int ndisc_rcv(struct sk_buff *skb)
+enum skb_drop_reason ndisc_rcv(struct sk_buff *skb)
{
struct nd_msg *msg;
+ SKB_DR(reason);
- if (!pskb_may_pull(skb, skb->len))
- return 0;
+ if (ndisc_suppress_frag_ndisc(skb))
+ return SKB_DROP_REASON_IPV6_NDISC_FRAG;
- msg = (struct nd_msg *) skb->h.raw;
+ if (skb_linearize(skb))
+ return SKB_DROP_REASON_NOMEM;
- __skb_push(skb, skb->data-skb->h.raw);
+ msg = (struct nd_msg *)skb_transport_header(skb);
- if (skb->nh.ipv6h->hop_limit != 255) {
- ND_PRINTK2(KERN_WARNING
- "ICMPv6 NDISC: invalid hop-limit: %d\n",
- skb->nh.ipv6h->hop_limit);
- return 0;
+ __skb_push(skb, skb->data - skb_transport_header(skb));
+
+ if (ipv6_hdr(skb)->hop_limit != 255) {
+ net_dbg_ratelimited("NDISC: invalid hop-limit: %d\n", ipv6_hdr(skb)->hop_limit);
+ return SKB_DROP_REASON_IPV6_NDISC_HOP_LIMIT;
}
if (msg->icmph.icmp6_code != 0) {
- ND_PRINTK2(KERN_WARNING
- "ICMPv6 NDISC: invalid ICMPv6 code: %d\n",
- msg->icmph.icmp6_code);
- return 0;
+ net_dbg_ratelimited("NDISC: invalid ICMPv6 code: %d\n", msg->icmph.icmp6_code);
+ return SKB_DROP_REASON_IPV6_NDISC_BAD_CODE;
}
- memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb));
-
switch (msg->icmph.icmp6_type) {
case NDISC_NEIGHBOUR_SOLICITATION:
- ndisc_recv_ns(skb);
+ memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb));
+ reason = ndisc_recv_ns(skb);
break;
case NDISC_NEIGHBOUR_ADVERTISEMENT:
- ndisc_recv_na(skb);
+ reason = ndisc_recv_na(skb);
break;
case NDISC_ROUTER_SOLICITATION:
- ndisc_recv_rs(skb);
+ reason = ndisc_recv_rs(skb);
break;
case NDISC_ROUTER_ADVERTISEMENT:
- ndisc_router_discovery(skb);
+ reason = ndisc_router_discovery(skb);
break;
case NDISC_REDIRECT:
- ndisc_redirect_rcv(skb);
+ reason = ndisc_redirect_rcv(skb);
break;
- };
+ }
- return 0;
+ return reason;
}
static int ndisc_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
{
- struct net_device *dev = ptr;
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct netdev_notifier_change_info *change_info;
+ struct net *net = dev_net(dev);
+ struct inet6_dev *idev;
+ bool evict_nocarrier;
switch (event) {
case NETDEV_CHANGEADDR:
neigh_changeaddr(&nd_tbl, dev);
- fib6_run_gc(~0UL);
+ fib6_run_gc(0, net, false);
+ fallthrough;
+ case NETDEV_UP:
+ idev = in6_dev_get(dev);
+ if (!idev)
+ break;
+ if (READ_ONCE(idev->cnf.ndisc_notify) ||
+ READ_ONCE(net->ipv6.devconf_all->ndisc_notify))
+ ndisc_send_unsol_na(dev);
+ in6_dev_put(idev);
+ break;
+ case NETDEV_CHANGE:
+ idev = in6_dev_get(dev);
+ if (!idev)
+ evict_nocarrier = true;
+ else {
+ evict_nocarrier = READ_ONCE(idev->cnf.ndisc_evict_nocarrier) &&
+ READ_ONCE(net->ipv6.devconf_all->ndisc_evict_nocarrier);
+ in6_dev_put(idev);
+ }
+
+ change_info = ptr;
+ if (change_info->flags_changed & IFF_NOARP)
+ neigh_changeaddr(&nd_tbl, dev);
+ if (evict_nocarrier && !netif_carrier_ok(dev))
+ neigh_carrier_down(&nd_tbl, dev);
break;
case NETDEV_DOWN:
neigh_ifdown(&nd_tbl, dev);
- fib6_run_gc(~0UL);
+ fib6_run_gc(0, net, false);
+ break;
+ case NETDEV_NOTIFY_PEERS:
+ ndisc_send_unsol_na(dev);
break;
default:
break;
@@ -1604,21 +1903,18 @@ static int ndisc_netdev_event(struct notifier_block *this, unsigned long event,
static struct notifier_block ndisc_netdev_notifier = {
.notifier_call = ndisc_netdev_event,
+ .priority = ADDRCONF_NOTIFY_PRIORITY - 5,
};
#ifdef CONFIG_SYSCTL
-static void ndisc_warn_deprecated_sysctl(struct ctl_table *ctl,
+static void ndisc_warn_deprecated_sysctl(const struct ctl_table *ctl,
const char *func, const char *dev_name)
{
static char warncomm[TASK_COMM_LEN];
static int warned;
if (strcmp(warncomm, current->comm) && warned < 5) {
- strcpy(warncomm, current->comm);
- printk(KERN_WARNING
- "process `%s' is using deprecated sysctl (%s) "
- "net.ipv6.neigh.%s.%s; "
- "Use net.ipv6.neigh.%s.%s_ms "
- "instead.\n",
+ strscpy(warncomm, current->comm);
+ pr_warn("process `%s' is using deprecated sysctl (%s) net.ipv6.neigh.%s.%s - use net.ipv6.neigh.%s.%s_ms instead\n",
warncomm, func,
dev_name, ctl->procname,
dev_name, ctl->procname);
@@ -1626,136 +1922,122 @@ static void ndisc_warn_deprecated_sysctl(struct ctl_table *ctl,
}
}
-int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, struct file * filp, void __user *buffer, size_t *lenp, loff_t *ppos)
+int ndisc_ifinfo_sysctl_change(const struct ctl_table *ctl, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
{
struct net_device *dev = ctl->extra1;
struct inet6_dev *idev;
int ret;
- if (ctl->ctl_name == NET_NEIGH_RETRANS_TIME ||
- ctl->ctl_name == NET_NEIGH_REACHABLE_TIME)
+ if ((strcmp(ctl->procname, "retrans_time") == 0) ||
+ (strcmp(ctl->procname, "base_reachable_time") == 0))
ndisc_warn_deprecated_sysctl(ctl, "syscall", dev ? dev->name : "default");
- switch (ctl->ctl_name) {
- case NET_NEIGH_RETRANS_TIME:
- ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
- break;
- case NET_NEIGH_REACHABLE_TIME:
- ret = proc_dointvec_jiffies(ctl, write,
- filp, buffer, lenp, ppos);
- break;
- case NET_NEIGH_RETRANS_TIME_MS:
- case NET_NEIGH_REACHABLE_TIME_MS:
- ret = proc_dointvec_ms_jiffies(ctl, write,
- filp, buffer, lenp, ppos);
- break;
- default:
- ret = -1;
- }
-
- if (write && ret == 0 && dev && (idev = in6_dev_get(dev)) != NULL) {
- if (ctl->ctl_name == NET_NEIGH_REACHABLE_TIME ||
- ctl->ctl_name == NET_NEIGH_REACHABLE_TIME_MS)
- idev->nd_parms->reachable_time = neigh_rand_reach_time(idev->nd_parms->base_reachable_time);
- idev->tstamp = jiffies;
- inet6_ifinfo_notify(RTM_NEWLINK, idev);
- in6_dev_put(idev);
- }
- return ret;
-}
+ if (strcmp(ctl->procname, "retrans_time") == 0)
+ ret = neigh_proc_dointvec(ctl, write, buffer, lenp, ppos);
-static int ndisc_ifinfo_sysctl_strategy(ctl_table *ctl, int __user *name,
- int nlen, void __user *oldval,
- size_t __user *oldlenp,
- void __user *newval, size_t newlen,
- void **context)
-{
- struct net_device *dev = ctl->extra1;
- struct inet6_dev *idev;
- int ret;
+ else if (strcmp(ctl->procname, "base_reachable_time") == 0)
+ ret = neigh_proc_dointvec_jiffies(ctl, write,
+ buffer, lenp, ppos);
- if (ctl->ctl_name == NET_NEIGH_RETRANS_TIME ||
- ctl->ctl_name == NET_NEIGH_REACHABLE_TIME)
- ndisc_warn_deprecated_sysctl(ctl, "procfs", dev ? dev->name : "default");
+ else if ((strcmp(ctl->procname, "retrans_time_ms") == 0) ||
+ (strcmp(ctl->procname, "base_reachable_time_ms") == 0))
+ ret = neigh_proc_dointvec_ms_jiffies(ctl, write,
+ buffer, lenp, ppos);
+ else
+ ret = -1;
- switch (ctl->ctl_name) {
- case NET_NEIGH_REACHABLE_TIME:
- ret = sysctl_jiffies(ctl, name, nlen,
- oldval, oldlenp, newval, newlen,
- context);
- break;
- case NET_NEIGH_RETRANS_TIME_MS:
- case NET_NEIGH_REACHABLE_TIME_MS:
- ret = sysctl_ms_jiffies(ctl, name, nlen,
- oldval, oldlenp, newval, newlen,
- context);
- break;
- default:
- ret = 0;
- }
+ if (write && ret == 0 && dev && (idev = in6_dev_get(dev)) != NULL) {
+ if (ctl->data == NEIGH_VAR_PTR(idev->nd_parms, BASE_REACHABLE_TIME))
+ neigh_set_reach_time(idev->nd_parms);
- if (newval && newlen && ret > 0 &&
- dev && (idev = in6_dev_get(dev)) != NULL) {
- if (ctl->ctl_name == NET_NEIGH_REACHABLE_TIME ||
- ctl->ctl_name == NET_NEIGH_REACHABLE_TIME_MS)
- idev->nd_parms->reachable_time = neigh_rand_reach_time(idev->nd_parms->base_reachable_time);
- idev->tstamp = jiffies;
+ WRITE_ONCE(idev->tstamp, jiffies);
inet6_ifinfo_notify(RTM_NEWLINK, idev);
in6_dev_put(idev);
}
-
return ret;
}
+
#endif
-int __init ndisc_init(struct net_proto_family *ops)
+static int __net_init ndisc_net_init(struct net *net)
{
struct ipv6_pinfo *np;
struct sock *sk;
- int err;
+ int err;
- err = sock_create_kern(PF_INET6, SOCK_RAW, IPPROTO_ICMPV6, &ndisc_socket);
+ err = inet_ctl_sock_create(&sk, PF_INET6,
+ SOCK_RAW, IPPROTO_ICMPV6, net);
if (err < 0) {
- ND_PRINTK0(KERN_ERR
- "ICMPv6 NDISC: Failed to initialize the control socket (err %d).\n",
- err);
- ndisc_socket = NULL; /* For safety. */
+ net_err_ratelimited("NDISC: Failed to initialize the control socket (err %d)\n",
+ err);
return err;
}
- sk = ndisc_socket->sk;
+ net->ipv6.ndisc_sk = sk;
+
np = inet6_sk(sk);
- sk->sk_allocation = GFP_ATOMIC;
np->hop_limit = 255;
/* Do not loopback ndisc messages */
- np->mc_loop = 0;
- sk->sk_prot->unhash(sk);
+ inet6_clear_bit(MC6_LOOP, sk);
+
+ return 0;
+}
+
+static void __net_exit ndisc_net_exit(struct net *net)
+{
+ inet_ctl_sock_destroy(net->ipv6.ndisc_sk);
+}
+
+static struct pernet_operations ndisc_net_ops = {
+ .init = ndisc_net_init,
+ .exit = ndisc_net_exit,
+};
+
+int __init ndisc_init(void)
+{
+ int err;
+
+ err = register_pernet_subsys(&ndisc_net_ops);
+ if (err)
+ return err;
+ /*
+ * Initialize the neighbour table
+ */
+ neigh_table_init(NEIGH_ND_TABLE, &nd_tbl);
- /*
- * Initialize the neighbour table
- */
-
- neigh_table_init(&nd_tbl);
+#ifdef CONFIG_SYSCTL
+ err = neigh_sysctl_register(NULL, &nd_tbl.parms,
+ ndisc_ifinfo_sysctl_change);
+ if (err)
+ goto out_unregister_pernet;
+out:
+#endif
+ return err;
#ifdef CONFIG_SYSCTL
- neigh_sysctl_register(NULL, &nd_tbl.parms, NET_IPV6, NET_IPV6_NEIGH,
- "ipv6",
- &ndisc_ifinfo_sysctl_change,
- &ndisc_ifinfo_sysctl_strategy);
+out_unregister_pernet:
+ unregister_pernet_subsys(&ndisc_net_ops);
+ goto out;
#endif
+}
- register_netdevice_notifier(&ndisc_netdev_notifier);
- return 0;
+int __init ndisc_late_init(void)
+{
+ return register_netdevice_notifier(&ndisc_netdev_notifier);
}
-void ndisc_cleanup(void)
+void ndisc_late_cleanup(void)
{
unregister_netdevice_notifier(&ndisc_netdev_notifier);
+}
+
+void ndisc_cleanup(void)
+{
#ifdef CONFIG_SYSCTL
neigh_sysctl_unregister(&nd_tbl.parms);
#endif
- neigh_table_clear(&nd_tbl);
- sock_release(ndisc_socket);
- ndisc_socket = NULL; /* For safety. */
+ neigh_table_clear(NEIGH_ND_TABLE, &nd_tbl);
+ unregister_pernet_subsys(&ndisc_net_ops);
}
diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
index f6294e5bcb31..46540a5a4331 100644
--- a/net/ipv6/netfilter.c
+++ b/net/ipv6/netfilter.c
@@ -1,130 +1,271 @@
+/*
+ * IPv6 specific functions of netfilter core
+ *
+ * Rusty Russell (C) 2000 -- This code is GPL.
+ * Patrick McHardy (C) 2006-2012
+ */
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/ipv6.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>
+#include <linux/export.h>
+#include <net/addrconf.h>
#include <net/dst.h>
#include <net/ipv6.h>
#include <net/ip6_route.h>
#include <net/xfrm.h>
-#include <net/ip6_checksum.h>
+#include <net/netfilter/nf_queue.h>
+#include <net/netfilter/nf_conntrack_bridge.h>
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+#include "../bridge/br_private.h"
-int ip6_route_me_harder(struct sk_buff *skb)
+int ip6_route_me_harder(struct net *net, struct sock *sk_partial, struct sk_buff *skb)
{
- struct ipv6hdr *iph = skb->nh.ipv6h;
+ const struct ipv6hdr *iph = ipv6_hdr(skb);
+ struct sock *sk = sk_to_full_sk(sk_partial);
+ struct net_device *dev = skb_dst_dev(skb);
+ struct flow_keys flkeys;
+ unsigned int hh_len;
struct dst_entry *dst;
- struct flowi fl = {
- .oif = skb->sk ? skb->sk->sk_bound_dev_if : 0,
- .nl_u =
- { .ip6_u =
- { .daddr = iph->daddr,
- .saddr = iph->saddr, } },
+ int strict = (ipv6_addr_type(&iph->daddr) &
+ (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL));
+ struct flowi6 fl6 = {
+ .flowi6_l3mdev = l3mdev_master_ifindex(dev),
+ .flowi6_mark = skb->mark,
+ .flowi6_uid = sock_net_uid(net, sk),
+ .daddr = iph->daddr,
+ .saddr = iph->saddr,
+ .flowlabel = ip6_flowinfo(iph),
};
+ int err;
- dst = ip6_route_output(skb->sk, &fl);
+ if (sk && sk->sk_bound_dev_if)
+ fl6.flowi6_oif = sk->sk_bound_dev_if;
+ else if (strict)
+ fl6.flowi6_oif = dev->ifindex;
-#ifdef CONFIG_XFRM
- if (!(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) &&
- xfrm_decode_session(skb, &fl, AF_INET6) == 0)
- if (xfrm_lookup(&skb->dst, &fl, skb->sk, 0))
- return -1;
-#endif
-
- if (dst->error) {
- IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
- LIMIT_NETDEBUG(KERN_DEBUG "ip6_route_me_harder: No more route.\n");
+ fib6_rules_early_flow_dissect(net, skb, &fl6, &flkeys);
+ dst = ip6_route_output(net, sk, &fl6);
+ err = dst->error;
+ if (err) {
+ IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
+ net_dbg_ratelimited("ip6_route_me_harder: No more route\n");
dst_release(dst);
- return -EINVAL;
+ return err;
}
/* Drop old route. */
- dst_release(skb->dst);
+ skb_dst_drop(skb);
+
+ skb_dst_set(skb, dst);
+
+#ifdef CONFIG_XFRM
+ if (!(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) &&
+ xfrm_decode_session(net, skb, flowi6_to_flowi(&fl6), AF_INET6) == 0) {
+ /* ignore return value from skb_dstref_steal, xfrm_lookup takes
+ * care of dropping the refcnt if needed.
+ */
+ skb_dstref_steal(skb);
+ dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), sk, 0);
+ if (IS_ERR(dst))
+ return PTR_ERR(dst);
+ skb_dst_set(skb, dst);
+ }
+#endif
+
+ /* Change in oif may mean change in hh_len. */
+ hh_len = skb_dst_dev(skb)->hard_header_len;
+ if (skb_headroom(skb) < hh_len &&
+ pskb_expand_head(skb, HH_DATA_ALIGN(hh_len - skb_headroom(skb)),
+ 0, GFP_ATOMIC))
+ return -ENOMEM;
- skb->dst = dst;
return 0;
}
EXPORT_SYMBOL(ip6_route_me_harder);
-/*
- * Extra routing may needed on local out, as the QUEUE target never
- * returns control to the table.
- */
-
-struct ip6_rt_info {
- struct in6_addr daddr;
- struct in6_addr saddr;
-};
-
-static void nf_ip6_saveroute(const struct sk_buff *skb, struct nf_info *info)
+static int nf_ip6_reroute(struct sk_buff *skb,
+ const struct nf_queue_entry *entry)
{
- struct ip6_rt_info *rt_info = nf_info_reroute(info);
+ struct ip6_rt_info *rt_info = nf_queue_entry_reroute(entry);
- if (info->hook == NF_IP6_LOCAL_OUT) {
- struct ipv6hdr *iph = skb->nh.ipv6h;
-
- rt_info->daddr = iph->daddr;
- rt_info->saddr = iph->saddr;
+ if (entry->state.hook == NF_INET_LOCAL_OUT) {
+ const struct ipv6hdr *iph = ipv6_hdr(skb);
+ if (!ipv6_addr_equal(&iph->daddr, &rt_info->daddr) ||
+ !ipv6_addr_equal(&iph->saddr, &rt_info->saddr) ||
+ skb->mark != rt_info->mark)
+ return ip6_route_me_harder(entry->state.net, entry->state.sk, skb);
}
+ return 0;
}
-static int nf_ip6_reroute(struct sk_buff **pskb, const struct nf_info *info)
+int __nf_ip6_route(struct net *net, struct dst_entry **dst,
+ struct flowi *fl, bool strict)
{
- struct ip6_rt_info *rt_info = nf_info_reroute(info);
+ static const struct ipv6_pinfo fake_pinfo;
+ static const struct inet_sock fake_sk = {
+ /* makes ip6_route_output set RT6_LOOKUP_F_IFACE: */
+ .sk.sk_bound_dev_if = 1,
+ .pinet6 = (struct ipv6_pinfo *) &fake_pinfo,
+ };
+ const void *sk = strict ? &fake_sk : NULL;
+ struct dst_entry *result;
+ int err;
- if (info->hook == NF_IP6_LOCAL_OUT) {
- struct ipv6hdr *iph = (*pskb)->nh.ipv6h;
- if (!ipv6_addr_equal(&iph->daddr, &rt_info->daddr) ||
- !ipv6_addr_equal(&iph->saddr, &rt_info->saddr))
- return ip6_route_me_harder(*pskb);
- }
- return 0;
+ result = ip6_route_output(net, sk, &fl->u.ip6);
+ err = result->error;
+ if (err)
+ dst_release(result);
+ else
+ *dst = result;
+ return err;
}
+EXPORT_SYMBOL_GPL(__nf_ip6_route);
-__sum16 nf_ip6_checksum(struct sk_buff *skb, unsigned int hook,
- unsigned int dataoff, u_int8_t protocol)
+int br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
+ struct nf_bridge_frag_data *data,
+ int (*output)(struct net *, struct sock *sk,
+ const struct nf_bridge_frag_data *data,
+ struct sk_buff *))
{
- struct ipv6hdr *ip6h = skb->nh.ipv6h;
- __sum16 csum = 0;
-
- switch (skb->ip_summed) {
- case CHECKSUM_COMPLETE:
- if (hook != NF_IP6_PRE_ROUTING && hook != NF_IP6_LOCAL_IN)
- break;
- if (!csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
- skb->len - dataoff, protocol,
- csum_sub(skb->csum,
- skb_checksum(skb, 0,
- dataoff, 0)))) {
- skb->ip_summed = CHECKSUM_UNNECESSARY;
- break;
+ int frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size;
+ u8 tstamp_type = skb->tstamp_type;
+ ktime_t tstamp = skb->tstamp;
+ struct ip6_frag_state state;
+ u8 *prevhdr, nexthdr = 0;
+ unsigned int mtu, hlen;
+ int hroom, err = 0;
+ __be32 frag_id;
+
+ err = ip6_find_1stfragopt(skb, &prevhdr);
+ if (err < 0)
+ goto blackhole;
+ hlen = err;
+ nexthdr = *prevhdr;
+
+ mtu = skb->dev->mtu;
+ if (frag_max_size > mtu ||
+ frag_max_size < IPV6_MIN_MTU)
+ goto blackhole;
+
+ mtu = frag_max_size;
+ if (mtu < hlen + sizeof(struct frag_hdr) + 8)
+ goto blackhole;
+ mtu -= hlen + sizeof(struct frag_hdr);
+
+ frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
+ &ipv6_hdr(skb)->saddr);
+
+ if (skb->ip_summed == CHECKSUM_PARTIAL &&
+ (err = skb_checksum_help(skb)))
+ goto blackhole;
+
+ hroom = LL_RESERVED_SPACE(skb->dev);
+ if (skb_has_frag_list(skb)) {
+ unsigned int first_len = skb_pagelen(skb);
+ struct ip6_fraglist_iter iter;
+ struct sk_buff *frag2;
+
+ if (first_len - hlen > mtu)
+ goto blackhole;
+
+ if (skb_cloned(skb) ||
+ skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
+ goto slow_path;
+
+ skb_walk_frags(skb, frag2) {
+ if (frag2->len > mtu)
+ goto blackhole;
+
+ /* Partially cloned skb? */
+ if (skb_shared(frag2) ||
+ skb_headroom(frag2) < (hlen + hroom + sizeof(struct frag_hdr)))
+ goto slow_path;
}
- /* fall through */
- case CHECKSUM_NONE:
- skb->csum = ~csum_unfold(
- csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
- skb->len - dataoff,
- protocol,
- csum_sub(0,
- skb_checksum(skb, 0,
- dataoff, 0))));
- csum = __skb_checksum_complete(skb);
+
+ err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
+ &iter);
+ if (err < 0)
+ goto blackhole;
+
+ for (;;) {
+ /* Prepare header of the next frame,
+ * before previous one went down.
+ */
+ if (iter.frag)
+ ip6_fraglist_prepare(skb, &iter);
+
+ skb_set_delivery_time(skb, tstamp, tstamp_type);
+ err = output(net, sk, data, skb);
+ if (err || !iter.frag)
+ break;
+
+ skb = ip6_fraglist_next(&iter);
+ }
+
+ kfree(iter.tmp_hdr);
+ if (!err)
+ return 0;
+
+ kfree_skb_list(iter.frag);
+ return err;
}
- return csum;
-}
+slow_path:
+ /* This is a linearized skbuff, the original geometry is lost for us.
+ * This may also be a clone skbuff, we could preserve the geometry for
+ * the copies but probably not worth the effort.
+ */
+ ip6_frag_init(skb, hlen, mtu, skb->dev->needed_tailroom,
+ LL_RESERVED_SPACE(skb->dev), prevhdr, nexthdr, frag_id,
+ &state);
-EXPORT_SYMBOL(nf_ip6_checksum);
+ while (state.left > 0) {
+ struct sk_buff *skb2;
+
+ skb2 = ip6_frag_next(skb, &state);
+ if (IS_ERR(skb2)) {
+ err = PTR_ERR(skb2);
+ goto blackhole;
+ }
-static struct nf_afinfo nf_ip6_afinfo = {
- .family = AF_INET6,
- .checksum = nf_ip6_checksum,
- .saveroute = nf_ip6_saveroute,
- .reroute = nf_ip6_reroute,
- .route_key_size = sizeof(struct ip6_rt_info),
+ skb_set_delivery_time(skb2, tstamp, tstamp_type);
+ err = output(net, sk, data, skb2);
+ if (err)
+ goto blackhole;
+ }
+ consume_skb(skb);
+ return err;
+
+blackhole:
+ kfree_skb(skb);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(br_ip6_fragment);
+
+static const struct nf_ipv6_ops ipv6ops = {
+#if IS_MODULE(CONFIG_IPV6)
+ .chk_addr = ipv6_chk_addr,
+ .route_me_harder = ip6_route_me_harder,
+ .dev_get_saddr = ipv6_dev_get_saddr,
+ .route = __nf_ip6_route,
+#if IS_ENABLED(CONFIG_SYN_COOKIES)
+ .cookie_init_sequence = __cookie_v6_init_sequence,
+ .cookie_v6_check = __cookie_v6_check,
+#endif
+#endif
+ .route_input = ip6_route_input,
+ .fragment = ip6_fragment,
+ .reroute = nf_ip6_reroute,
+#if IS_MODULE(CONFIG_IPV6)
+ .br_fragment = br_ip6_fragment,
+#endif
};
int __init ipv6_netfilter_init(void)
{
- return nf_register_afinfo(&nf_ip6_afinfo);
+ RCU_INIT_POINTER(nf_ipv6_ops, &ipv6ops);
+ return 0;
}
/* This can be called from inet6_init() on errors, so it cannot
@@ -132,5 +273,5 @@ int __init ipv6_netfilter_init(void)
*/
void ipv6_netfilter_fini(void)
{
- nf_unregister_afinfo(&nf_ip6_afinfo);
+ RCU_INIT_POINTER(nf_ipv6_ops, NULL);
}
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index d7c45a9c15fe..81daf82ddc2d 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -1,47 +1,87 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# IP netfilter configuration
#
-menu "IPv6: Netfilter Configuration (EXPERIMENTAL)"
- depends on INET && IPV6 && NETFILTER && EXPERIMENTAL
+menu "IPv6: Netfilter Configuration"
+ depends on INET && IPV6 && NETFILTER
-config NF_CONNTRACK_IPV6
- tristate "IPv6 support for new connection tracking (EXPERIMENTAL)"
- depends on EXPERIMENTAL && NF_CONNTRACK
- ---help---
- Connection tracking keeps a record of what packets have passed
- through your machine, in order to figure out how they are related
- into connections.
+# old sockopt interface and eval loop
+config IP6_NF_IPTABLES_LEGACY
+ tristate "Legacy IP6 tables support"
+ depends on INET && IPV6 && NETFILTER_XTABLES_LEGACY
+ depends on NETFILTER_XTABLES
+ default m if NETFILTER_XTABLES_LEGACY
+ help
+ ip6tables is a legacy packet classifier.
+ This is not needed if you are using iptables over nftables
+ (iptables-nft).
- This is IPv6 support on Layer 3 independent connection tracking.
- Layer 3 independent connection tracking is experimental scheme
- which generalize ip_conntrack to support other layer 3 protocols.
+config NF_SOCKET_IPV6
+ tristate "IPv6 socket lookup support"
+ help
+ This option enables the IPv6 socket lookup infrastructure. This
+ is used by the {ip6,nf}tables socket match.
- To compile it as a module, choose M here. If unsure, say N.
+config NF_TPROXY_IPV6
+ tristate "IPv6 tproxy support"
+
+if NF_TABLES
-config IP6_NF_QUEUE
- tristate "IP6 Userspace queueing via NETLINK (OBSOLETE)"
- ---help---
+config NF_TABLES_IPV6
+ bool "IPv6 nf_tables support"
+ help
+ This option enables the IPv6 support for nf_tables.
- This option adds a queue handler to the kernel for IPv6
- packets which enables users to receive the filtered packets
- with QUEUE target using libipq.
+if NF_TABLES_IPV6
- THis option enables the old IPv6-only "ip6_queue" implementation
- which has been obsoleted by the new "nfnetlink_queue" code (see
- CONFIG_NETFILTER_NETLINK_QUEUE).
+config NFT_REJECT_IPV6
+ select NF_REJECT_IPV6
+ default NFT_REJECT
+ tristate
- (C) Fernando Anton 2001
- IPv64 Project - Work based in IPv64 draft by Arturo Azcorra.
- Universidad Carlos III de Madrid
- Universidad Politecnica de Alcala de Henares
- email: <fanton@it.uc3m.es>.
+config NFT_DUP_IPV6
+ tristate "IPv6 nf_tables packet duplication support"
+ depends on !NF_CONNTRACK || NF_CONNTRACK
+ select NF_DUP_IPV6
+ help
+ This module enables IPv6 packet duplication support for nf_tables.
- To compile it as a module, choose M here. If unsure, say N.
+config NFT_FIB_IPV6
+ tristate "nf_tables fib / ipv6 route lookup support"
+ select NFT_FIB
+ help
+ This module enables IPv6 FIB lookups, e.g. for reverse path filtering.
+ It also allows query of the FIB for the route type, e.g. local, unicast,
+ multicast or blackhole.
+
+endif # NF_TABLES_IPV6
+endif # NF_TABLES
+
+config NF_DUP_IPV6
+ tristate "Netfilter IPv6 packet duplication to alternate destination"
+ depends on !NF_CONNTRACK || NF_CONNTRACK
+ help
+ This option enables the nf_dup_ipv6 core, which duplicates an IPv6
+ packet to be rerouted to another destination.
+
+config NF_REJECT_IPV6
+ tristate "IPv6 packet rejection"
+ default m if NETFILTER_ADVANCED=n
+
+config NF_LOG_IPV6
+ tristate "IPv6 packet logging"
+ default m if NETFILTER_ADVANCED=n
+ select NF_LOG_SYSLOG
+ help
+ This is a backwards-compat option for the user's convenience
+ (e.g. when running oldconfig). It selects CONFIG_NF_LOG_SYSLOG.
config IP6_NF_IPTABLES
tristate "IP6 tables support (required for filtering)"
- depends on NETFILTER_XTABLES
+ depends on INET && IPV6
+ select NETFILTER_XTABLES
+ default m if NETFILTER_ADVANCED=n
help
ip6tables is a general, extensible packet identification framework.
Currently only the packet filtering and packet mangling subsystem
@@ -50,83 +90,115 @@ config IP6_NF_IPTABLES
To compile it as a module, choose M here. If unsure, say N.
+if IP6_NF_IPTABLES
+
# The simple matches.
-config IP6_NF_MATCH_RT
- tristate "Routing header match support"
- depends on IP6_NF_IPTABLES
+config IP6_NF_MATCH_AH
+ tristate '"ah" match support'
+ depends on NETFILTER_ADVANCED
help
- rt matching allows you to match packets based on the routing
- header of the packet.
+ This module allows one to match AH packets.
To compile it as a module, choose M here. If unsure, say N.
-config IP6_NF_MATCH_OPTS
- tristate "Hop-by-hop and Dst opts header match support"
- depends on IP6_NF_IPTABLES
+config IP6_NF_MATCH_EUI64
+ tristate '"eui64" address check'
+ depends on NETFILTER_ADVANCED
help
- This allows one to match packets based on the hop-by-hop
- and destination options headers of a packet.
+ This module performs checking on the IPv6 source address
+ Compares the last 64 bits with the EUI64 (delivered
+ from the MAC address) address
To compile it as a module, choose M here. If unsure, say N.
config IP6_NF_MATCH_FRAG
- tristate "Fragmentation header match support"
- depends on IP6_NF_IPTABLES
+ tristate '"frag" Fragmentation header match support'
+ depends on NETFILTER_ADVANCED
help
frag matching allows you to match packets based on the fragmentation
header of the packet.
To compile it as a module, choose M here. If unsure, say N.
-config IP6_NF_MATCH_HL
- tristate "HL match support"
- depends on IP6_NF_IPTABLES
+config IP6_NF_MATCH_OPTS
+ tristate '"hbh" hop-by-hop and "dst" opts header match support'
+ depends on NETFILTER_ADVANCED
help
- HL matching allows you to match packets based on the hop
- limit of the packet.
+ This allows one to match packets based on the hop-by-hop
+ and destination options headers of a packet.
To compile it as a module, choose M here. If unsure, say N.
-config IP6_NF_MATCH_OWNER
- tristate "Owner match support"
- depends on IP6_NF_IPTABLES
+config IP6_NF_MATCH_HL
+ tristate '"hl" hoplimit match support'
+ depends on NETFILTER_ADVANCED
+ select NETFILTER_XT_MATCH_HL
help
- Packet owner matching allows you to match locally-generated packets
- based on who created them: the user, group, process or session.
-
- To compile it as a module, choose M here. If unsure, say N.
+ This is a backwards-compat option for the user's convenience
+ (e.g. when running oldconfig). It selects
+ CONFIG_NETFILTER_XT_MATCH_HL.
config IP6_NF_MATCH_IPV6HEADER
- tristate "IPv6 Extension Headers Match"
- depends on IP6_NF_IPTABLES
+ tristate '"ipv6header" IPv6 Extension Headers Match'
+ default m if NETFILTER_ADVANCED=n
help
This module allows one to match packets based upon
the ipv6 extension headers.
To compile it as a module, choose M here. If unsure, say N.
-config IP6_NF_MATCH_AH
- tristate "AH match support"
- depends on IP6_NF_IPTABLES
+config IP6_NF_MATCH_MH
+ tristate '"mh" match support'
+ depends on NETFILTER_ADVANCED
help
- This module allows one to match AH packets.
+ This module allows one to match MH packets.
To compile it as a module, choose M here. If unsure, say N.
-config IP6_NF_MATCH_EUI64
- tristate "EUI64 address check"
- depends on IP6_NF_IPTABLES
+config IP6_NF_MATCH_RPFILTER
+ tristate '"rpfilter" reverse path filter match support'
+ depends on NETFILTER_ADVANCED
+ depends on IP6_NF_MANGLE || IP6_NF_RAW || NFT_COMPAT
help
- This module performs checking on the IPv6 source address
- Compares the last 64 bits with the EUI64 (delivered
- from the MAC address) address
+ This option allows you to match packets whose replies would
+ go out via the interface the packet came in.
+
+ To compile it as a module, choose M here. If unsure, say N.
+ The module will be called ip6t_rpfilter.
+
+config IP6_NF_MATCH_RT
+ tristate '"rt" Routing header match support'
+ depends on NETFILTER_ADVANCED
+ help
+ rt matching allows you to match packets based on the routing
+ header of the packet.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP6_NF_MATCH_SRH
+ tristate '"srh" Segment Routing header match support'
+ depends on NETFILTER_ADVANCED
+ help
+ srh matching allows you to match packets based on the segment
+ routing header of the packet.
To compile it as a module, choose M here. If unsure, say N.
# The targets
+config IP6_NF_TARGET_HL
+ tristate '"HL" hoplimit target support'
+ depends on NETFILTER_ADVANCED && IP6_NF_MANGLE
+ select NETFILTER_XT_TARGET_HL
+ help
+ This is a backwards-compatible option for the user's convenience
+ (e.g. when running oldconfig). It selects
+ CONFIG_NETFILTER_XT_TARGET_HL.
+
config IP6_NF_FILTER
tristate "Packet filtering"
- depends on IP6_NF_IPTABLES
+ default m if NETFILTER_ADVANCED=n || IP6_NF_IPTABLES_LEGACY
+ depends on IP6_NF_IPTABLES_LEGACY
+ tristate
help
Packet filtering defines a table `filter', which has a series of
rules for simple packet filtering at local input, forwarding and
@@ -134,18 +206,11 @@ config IP6_NF_FILTER
To compile it as a module, choose M here. If unsure, say N.
-config IP6_NF_TARGET_LOG
- tristate "LOG target support"
- depends on IP6_NF_FILTER
- help
- This option adds a `LOG' target, which allows you to create rules in
- any iptables table which records the packet header to the syslog.
-
- To compile it as a module, choose M here. If unsure, say N.
-
config IP6_NF_TARGET_REJECT
tristate "REJECT target support"
- depends on IP6_NF_FILTER
+ depends on IP6_NF_FILTER || NFT_COMPAT
+ select NF_REJECT_IPV6
+ default m if NETFILTER_ADVANCED=n
help
The REJECT target allows a filtering rule to specify that an ICMPv6
error should be issued in response to an incoming packet, rather
@@ -153,9 +218,23 @@ config IP6_NF_TARGET_REJECT
To compile it as a module, choose M here. If unsure, say N.
+config IP6_NF_TARGET_SYNPROXY
+ tristate "SYNPROXY target support"
+ depends on NF_CONNTRACK && NETFILTER_ADVANCED
+ select NETFILTER_SYNPROXY
+ select SYN_COOKIES
+ help
+ The SYNPROXY target allows you to intercept TCP connections and
+ establish them using syncookies before they are passed on to the
+ server. This allows to avoid conntrack and server resource usage
+ during SYN-flood attacks.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
config IP6_NF_MANGLE
tristate "Packet mangling"
- depends on IP6_NF_IPTABLES
+ default m if NETFILTER_ADVANCED=n || IP6_NF_IPTABLES_LEGACY
+ depends on IP6_NF_IPTABLES_LEGACY
help
This option adds a `mangle' table to iptables: see the man page for
iptables(8). This table is used for various packet alterations
@@ -163,32 +242,62 @@ config IP6_NF_MANGLE
To compile it as a module, choose M here. If unsure, say N.
-config IP6_NF_TARGET_HL
- tristate 'HL (hoplimit) target support'
- depends on IP6_NF_MANGLE
- help
- This option adds a `HL' target, which enables the user to decrement
- the hoplimit value of the IPv6 header or set it to a given (lower)
- value.
-
- While it is safe to decrement the hoplimit value, this option also
- enables functionality to increment and set the hoplimit value of the
- IPv6 header to arbitrary values. This is EXTREMELY DANGEROUS since
- you can easily create immortal packets that loop forever on the
- network.
-
- To compile it as a module, choose M here. If unsure, say N.
-
config IP6_NF_RAW
tristate 'raw table support (required for TRACE)'
- depends on IP6_NF_IPTABLES
+ depends on IP6_NF_IPTABLES_LEGACY
help
This option adds a `raw' table to ip6tables. This table is the very
first in the netfilter framework and hooks in at the PREROUTING
and OUTPUT chains.
-
+
If you want to compile it as a module, say M here and read
- <file:Documentation/modules.txt>. If unsure, say `N'.
+ <file:Documentation/kbuild/modules.rst>. If unsure, say `N'.
+
+# security table for MAC policy
+config IP6_NF_SECURITY
+ tristate "Security table"
+ depends on SECURITY
+ depends on NETFILTER_ADVANCED
+ depends on IP6_NF_IPTABLES_LEGACY
+ help
+ This option adds a `security' table to iptables, for use
+ with Mandatory Access Control (MAC) policy.
+
+ If unsure, say N.
+
+config IP6_NF_NAT
+ tristate "ip6tables NAT support"
+ depends on NF_CONNTRACK
+ depends on NETFILTER_ADVANCED
+ depends on IP6_NF_IPTABLES_LEGACY
+ select NF_NAT
+ select NETFILTER_XT_NAT
+ help
+ This enables the `nat' table in ip6tables. This allows masquerading,
+ port forwarding and other forms of full Network Address Port
+ Translation.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP6_NF_TARGET_MASQUERADE
+ tristate "MASQUERADE target support"
+ select NETFILTER_XT_TARGET_MASQUERADE
+ depends on IP6_NF_NAT
+ help
+ This is a backwards-compat option for the user's convenience
+ (e.g. when running oldconfig). It selects NETFILTER_XT_TARGET_MASQUERADE.
+
+config IP6_NF_TARGET_NPT
+ tristate "NPT (Network Prefix translation) target support"
+ depends on IP6_NF_NAT || NFT_COMPAT
+ help
+ This option adds the `SNPT' and `DNPT' target, which perform
+ stateless IPv6-to-IPv6 Network Prefix Translation per RFC 6296.
+
+ To compile it as a module, choose M here. If unsure, say N.
+endif # IP6_NF_IPTABLES
endmenu
+config NF_DEFRAG_IPV6
+ tristate
diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile
index ac1dfebde175..66ce6fa5b2f5 100644
--- a/net/ipv6/netfilter/Makefile
+++ b/net/ipv6/netfilter/Makefile
@@ -1,27 +1,45 @@
+# SPDX-License-Identifier: GPL-2.0
#
# Makefile for the netfilter modules on top of IPv6.
#
# Link order matters here.
-obj-$(CONFIG_IP6_NF_IPTABLES) += ip6_tables.o
-obj-$(CONFIG_IP6_NF_MATCH_RT) += ip6t_rt.o
-obj-$(CONFIG_IP6_NF_MATCH_OPTS) += ip6t_hbh.o
-obj-$(CONFIG_IP6_NF_MATCH_IPV6HEADER) += ip6t_ipv6header.o
-obj-$(CONFIG_IP6_NF_MATCH_FRAG) += ip6t_frag.o
-obj-$(CONFIG_IP6_NF_MATCH_AH) += ip6t_ah.o
-obj-$(CONFIG_IP6_NF_MATCH_EUI64) += ip6t_eui64.o
-obj-$(CONFIG_IP6_NF_MATCH_OWNER) += ip6t_owner.o
+obj-$(CONFIG_IP6_NF_IPTABLES_LEGACY) += ip6_tables.o
obj-$(CONFIG_IP6_NF_FILTER) += ip6table_filter.o
obj-$(CONFIG_IP6_NF_MANGLE) += ip6table_mangle.o
-obj-$(CONFIG_IP6_NF_TARGET_HL) += ip6t_HL.o
-obj-$(CONFIG_IP6_NF_QUEUE) += ip6_queue.o
-obj-$(CONFIG_IP6_NF_TARGET_LOG) += ip6t_LOG.o
obj-$(CONFIG_IP6_NF_RAW) += ip6table_raw.o
-obj-$(CONFIG_IP6_NF_MATCH_HL) += ip6t_hl.o
-obj-$(CONFIG_IP6_NF_TARGET_REJECT) += ip6t_REJECT.o
+obj-$(CONFIG_IP6_NF_SECURITY) += ip6table_security.o
+obj-$(CONFIG_IP6_NF_NAT) += ip6table_nat.o
+
+# defrag
+nf_defrag_ipv6-y := nf_defrag_ipv6_hooks.o nf_conntrack_reasm.o
+obj-$(CONFIG_NF_DEFRAG_IPV6) += nf_defrag_ipv6.o
+
+obj-$(CONFIG_NF_SOCKET_IPV6) += nf_socket_ipv6.o
+obj-$(CONFIG_NF_TPROXY_IPV6) += nf_tproxy_ipv6.o
+
+# reject
+obj-$(CONFIG_NF_REJECT_IPV6) += nf_reject_ipv6.o
-# objects for l3 independent conntrack
-nf_conntrack_ipv6-objs := nf_conntrack_l3proto_ipv6.o nf_conntrack_proto_icmpv6.o nf_conntrack_reasm.o
+obj-$(CONFIG_NF_DUP_IPV6) += nf_dup_ipv6.o
-# l3 independent conntrack
-obj-$(CONFIG_NF_CONNTRACK_IPV6) += nf_conntrack_ipv6.o
+# nf_tables
+obj-$(CONFIG_NFT_REJECT_IPV6) += nft_reject_ipv6.o
+obj-$(CONFIG_NFT_DUP_IPV6) += nft_dup_ipv6.o
+obj-$(CONFIG_NFT_FIB_IPV6) += nft_fib_ipv6.o
+
+# matches
+obj-$(CONFIG_IP6_NF_MATCH_AH) += ip6t_ah.o
+obj-$(CONFIG_IP6_NF_MATCH_EUI64) += ip6t_eui64.o
+obj-$(CONFIG_IP6_NF_MATCH_FRAG) += ip6t_frag.o
+obj-$(CONFIG_IP6_NF_MATCH_IPV6HEADER) += ip6t_ipv6header.o
+obj-$(CONFIG_IP6_NF_MATCH_MH) += ip6t_mh.o
+obj-$(CONFIG_IP6_NF_MATCH_OPTS) += ip6t_hbh.o
+obj-$(CONFIG_IP6_NF_MATCH_RPFILTER) += ip6t_rpfilter.o
+obj-$(CONFIG_IP6_NF_MATCH_RT) += ip6t_rt.o
+obj-$(CONFIG_IP6_NF_MATCH_SRH) += ip6t_srh.o
+
+# targets
+obj-$(CONFIG_IP6_NF_TARGET_NPT) += ip6t_NPT.o
+obj-$(CONFIG_IP6_NF_TARGET_REJECT) += ip6t_REJECT.o
+obj-$(CONFIG_IP6_NF_TARGET_SYNPROXY) += ip6t_SYNPROXY.o
diff --git a/net/ipv6/netfilter/ip6_queue.c b/net/ipv6/netfilter/ip6_queue.c
deleted file mode 100644
index 21908c9a10da..000000000000
--- a/net/ipv6/netfilter/ip6_queue.c
+++ /dev/null
@@ -1,729 +0,0 @@
-/*
- * This is a module which is used for queueing IPv6 packets and
- * communicating with userspace via netlink.
- *
- * (C) 2001 Fernando Anton, this code is GPL.
- * IPv64 Project - Work based in IPv64 draft by Arturo Azcorra.
- * Universidad Carlos III de Madrid - Leganes (Madrid) - Spain
- * Universidad Politecnica de Alcala de Henares - Alcala de H. (Madrid) - Spain
- * email: fanton@it.uc3m.es
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * 2001-11-06: First try. Working with ip_queue.c for IPv4 and trying
- * to adapt it to IPv6
- * HEAVILY based in ipqueue.c by James Morris. It's just
- * a little modified version of it, so he's nearly the
- * real coder of this.
- * Few changes needed, mainly the hard_routing code and
- * the netlink socket protocol (we're NETLINK_IP6_FW).
- * 2002-06-25: Code cleanup. [JM: ported cleanup over from ip_queue.c]
- * 2005-02-04: Added /proc counter for dropped packets; fixed so
- * packets aren't delivered to user space if they're going
- * to be dropped.
- */
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/init.h>
-#include <linux/ipv6.h>
-#include <linux/notifier.h>
-#include <linux/netdevice.h>
-#include <linux/netfilter.h>
-#include <linux/netlink.h>
-#include <linux/spinlock.h>
-#include <linux/sysctl.h>
-#include <linux/proc_fs.h>
-#include <linux/mutex.h>
-#include <net/sock.h>
-#include <net/ipv6.h>
-#include <net/ip6_route.h>
-#include <linux/netfilter_ipv4/ip_queue.h>
-#include <linux/netfilter_ipv4/ip_tables.h>
-#include <linux/netfilter_ipv6/ip6_tables.h>
-
-#define IPQ_QMAX_DEFAULT 1024
-#define IPQ_PROC_FS_NAME "ip6_queue"
-#define NET_IPQ_QMAX 2088
-#define NET_IPQ_QMAX_NAME "ip6_queue_maxlen"
-
-struct ipq_queue_entry {
- struct list_head list;
- struct nf_info *info;
- struct sk_buff *skb;
-};
-
-typedef int (*ipq_cmpfn)(struct ipq_queue_entry *, unsigned long);
-
-static unsigned char copy_mode __read_mostly = IPQ_COPY_NONE;
-static unsigned int queue_maxlen __read_mostly = IPQ_QMAX_DEFAULT;
-static DEFINE_RWLOCK(queue_lock);
-static int peer_pid __read_mostly;
-static unsigned int copy_range __read_mostly;
-static unsigned int queue_total;
-static unsigned int queue_dropped = 0;
-static unsigned int queue_user_dropped = 0;
-static struct sock *ipqnl __read_mostly;
-static LIST_HEAD(queue_list);
-static DEFINE_MUTEX(ipqnl_mutex);
-
-static void
-ipq_issue_verdict(struct ipq_queue_entry *entry, int verdict)
-{
- local_bh_disable();
- nf_reinject(entry->skb, entry->info, verdict);
- local_bh_enable();
- kfree(entry);
-}
-
-static inline void
-__ipq_enqueue_entry(struct ipq_queue_entry *entry)
-{
- list_add(&entry->list, &queue_list);
- queue_total++;
-}
-
-/*
- * Find and return a queued entry matched by cmpfn, or return the last
- * entry if cmpfn is NULL.
- */
-static inline struct ipq_queue_entry *
-__ipq_find_entry(ipq_cmpfn cmpfn, unsigned long data)
-{
- struct list_head *p;
-
- list_for_each_prev(p, &queue_list) {
- struct ipq_queue_entry *entry = (struct ipq_queue_entry *)p;
-
- if (!cmpfn || cmpfn(entry, data))
- return entry;
- }
- return NULL;
-}
-
-static inline void
-__ipq_dequeue_entry(struct ipq_queue_entry *entry)
-{
- list_del(&entry->list);
- queue_total--;
-}
-
-static inline struct ipq_queue_entry *
-__ipq_find_dequeue_entry(ipq_cmpfn cmpfn, unsigned long data)
-{
- struct ipq_queue_entry *entry;
-
- entry = __ipq_find_entry(cmpfn, data);
- if (entry == NULL)
- return NULL;
-
- __ipq_dequeue_entry(entry);
- return entry;
-}
-
-
-static inline void
-__ipq_flush(int verdict)
-{
- struct ipq_queue_entry *entry;
-
- while ((entry = __ipq_find_dequeue_entry(NULL, 0)))
- ipq_issue_verdict(entry, verdict);
-}
-
-static inline int
-__ipq_set_mode(unsigned char mode, unsigned int range)
-{
- int status = 0;
-
- switch(mode) {
- case IPQ_COPY_NONE:
- case IPQ_COPY_META:
- copy_mode = mode;
- copy_range = 0;
- break;
-
- case IPQ_COPY_PACKET:
- copy_mode = mode;
- copy_range = range;
- if (copy_range > 0xFFFF)
- copy_range = 0xFFFF;
- break;
-
- default:
- status = -EINVAL;
-
- }
- return status;
-}
-
-static inline void
-__ipq_reset(void)
-{
- peer_pid = 0;
- net_disable_timestamp();
- __ipq_set_mode(IPQ_COPY_NONE, 0);
- __ipq_flush(NF_DROP);
-}
-
-static struct ipq_queue_entry *
-ipq_find_dequeue_entry(ipq_cmpfn cmpfn, unsigned long data)
-{
- struct ipq_queue_entry *entry;
-
- write_lock_bh(&queue_lock);
- entry = __ipq_find_dequeue_entry(cmpfn, data);
- write_unlock_bh(&queue_lock);
- return entry;
-}
-
-static void
-ipq_flush(int verdict)
-{
- write_lock_bh(&queue_lock);
- __ipq_flush(verdict);
- write_unlock_bh(&queue_lock);
-}
-
-static struct sk_buff *
-ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp)
-{
- unsigned char *old_tail;
- size_t size = 0;
- size_t data_len = 0;
- struct sk_buff *skb;
- struct ipq_packet_msg *pmsg;
- struct nlmsghdr *nlh;
-
- read_lock_bh(&queue_lock);
-
- switch (copy_mode) {
- case IPQ_COPY_META:
- case IPQ_COPY_NONE:
- size = NLMSG_SPACE(sizeof(*pmsg));
- data_len = 0;
- break;
-
- case IPQ_COPY_PACKET:
- if ((entry->skb->ip_summed == CHECKSUM_PARTIAL ||
- entry->skb->ip_summed == CHECKSUM_COMPLETE) &&
- (*errp = skb_checksum_help(entry->skb))) {
- read_unlock_bh(&queue_lock);
- return NULL;
- }
- if (copy_range == 0 || copy_range > entry->skb->len)
- data_len = entry->skb->len;
- else
- data_len = copy_range;
-
- size = NLMSG_SPACE(sizeof(*pmsg) + data_len);
- break;
-
- default:
- *errp = -EINVAL;
- read_unlock_bh(&queue_lock);
- return NULL;
- }
-
- read_unlock_bh(&queue_lock);
-
- skb = alloc_skb(size, GFP_ATOMIC);
- if (!skb)
- goto nlmsg_failure;
-
- old_tail= skb->tail;
- nlh = NLMSG_PUT(skb, 0, 0, IPQM_PACKET, size - sizeof(*nlh));
- pmsg = NLMSG_DATA(nlh);
- memset(pmsg, 0, sizeof(*pmsg));
-
- pmsg->packet_id = (unsigned long )entry;
- pmsg->data_len = data_len;
- pmsg->timestamp_sec = entry->skb->tstamp.off_sec;
- pmsg->timestamp_usec = entry->skb->tstamp.off_usec;
- pmsg->mark = entry->skb->mark;
- pmsg->hook = entry->info->hook;
- pmsg->hw_protocol = entry->skb->protocol;
-
- if (entry->info->indev)
- strcpy(pmsg->indev_name, entry->info->indev->name);
- else
- pmsg->indev_name[0] = '\0';
-
- if (entry->info->outdev)
- strcpy(pmsg->outdev_name, entry->info->outdev->name);
- else
- pmsg->outdev_name[0] = '\0';
-
- if (entry->info->indev && entry->skb->dev) {
- pmsg->hw_type = entry->skb->dev->type;
- if (entry->skb->dev->hard_header_parse)
- pmsg->hw_addrlen =
- entry->skb->dev->hard_header_parse(entry->skb,
- pmsg->hw_addr);
- }
-
- if (data_len)
- if (skb_copy_bits(entry->skb, 0, pmsg->payload, data_len))
- BUG();
-
- nlh->nlmsg_len = skb->tail - old_tail;
- return skb;
-
-nlmsg_failure:
- if (skb)
- kfree_skb(skb);
- *errp = -EINVAL;
- printk(KERN_ERR "ip6_queue: error creating packet message\n");
- return NULL;
-}
-
-static int
-ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info,
- unsigned int queuenum, void *data)
-{
- int status = -EINVAL;
- struct sk_buff *nskb;
- struct ipq_queue_entry *entry;
-
- if (copy_mode == IPQ_COPY_NONE)
- return -EAGAIN;
-
- entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
- if (entry == NULL) {
- printk(KERN_ERR "ip6_queue: OOM in ipq_enqueue_packet()\n");
- return -ENOMEM;
- }
-
- entry->info = info;
- entry->skb = skb;
-
- nskb = ipq_build_packet_message(entry, &status);
- if (nskb == NULL)
- goto err_out_free;
-
- write_lock_bh(&queue_lock);
-
- if (!peer_pid)
- goto err_out_free_nskb;
-
- if (queue_total >= queue_maxlen) {
- queue_dropped++;
- status = -ENOSPC;
- if (net_ratelimit())
- printk (KERN_WARNING "ip6_queue: fill at %d entries, "
- "dropping packet(s). Dropped: %d\n", queue_total,
- queue_dropped);
- goto err_out_free_nskb;
- }
-
- /* netlink_unicast will either free the nskb or attach it to a socket */
- status = netlink_unicast(ipqnl, nskb, peer_pid, MSG_DONTWAIT);
- if (status < 0) {
- queue_user_dropped++;
- goto err_out_unlock;
- }
-
- __ipq_enqueue_entry(entry);
-
- write_unlock_bh(&queue_lock);
- return status;
-
-err_out_free_nskb:
- kfree_skb(nskb);
-
-err_out_unlock:
- write_unlock_bh(&queue_lock);
-
-err_out_free:
- kfree(entry);
- return status;
-}
-
-static int
-ipq_mangle_ipv6(ipq_verdict_msg_t *v, struct ipq_queue_entry *e)
-{
- int diff;
- struct ipv6hdr *user_iph = (struct ipv6hdr *)v->payload;
-
- if (v->data_len < sizeof(*user_iph))
- return 0;
- diff = v->data_len - e->skb->len;
- if (diff < 0) {
- if (pskb_trim(e->skb, v->data_len))
- return -ENOMEM;
- } else if (diff > 0) {
- if (v->data_len > 0xFFFF)
- return -EINVAL;
- if (diff > skb_tailroom(e->skb)) {
- struct sk_buff *newskb;
-
- newskb = skb_copy_expand(e->skb,
- skb_headroom(e->skb),
- diff,
- GFP_ATOMIC);
- if (newskb == NULL) {
- printk(KERN_WARNING "ip6_queue: OOM "
- "in mangle, dropping packet\n");
- return -ENOMEM;
- }
- if (e->skb->sk)
- skb_set_owner_w(newskb, e->skb->sk);
- kfree_skb(e->skb);
- e->skb = newskb;
- }
- skb_put(e->skb, diff);
- }
- if (!skb_make_writable(&e->skb, v->data_len))
- return -ENOMEM;
- memcpy(e->skb->data, v->payload, v->data_len);
- e->skb->ip_summed = CHECKSUM_NONE;
-
- return 0;
-}
-
-static inline int
-id_cmp(struct ipq_queue_entry *e, unsigned long id)
-{
- return (id == (unsigned long )e);
-}
-
-static int
-ipq_set_verdict(struct ipq_verdict_msg *vmsg, unsigned int len)
-{
- struct ipq_queue_entry *entry;
-
- if (vmsg->value > NF_MAX_VERDICT)
- return -EINVAL;
-
- entry = ipq_find_dequeue_entry(id_cmp, vmsg->id);
- if (entry == NULL)
- return -ENOENT;
- else {
- int verdict = vmsg->value;
-
- if (vmsg->data_len && vmsg->data_len == len)
- if (ipq_mangle_ipv6(vmsg, entry) < 0)
- verdict = NF_DROP;
-
- ipq_issue_verdict(entry, verdict);
- return 0;
- }
-}
-
-static int
-ipq_set_mode(unsigned char mode, unsigned int range)
-{
- int status;
-
- write_lock_bh(&queue_lock);
- status = __ipq_set_mode(mode, range);
- write_unlock_bh(&queue_lock);
- return status;
-}
-
-static int
-ipq_receive_peer(struct ipq_peer_msg *pmsg,
- unsigned char type, unsigned int len)
-{
- int status = 0;
-
- if (len < sizeof(*pmsg))
- return -EINVAL;
-
- switch (type) {
- case IPQM_MODE:
- status = ipq_set_mode(pmsg->msg.mode.value,
- pmsg->msg.mode.range);
- break;
-
- case IPQM_VERDICT:
- if (pmsg->msg.verdict.value > NF_MAX_VERDICT)
- status = -EINVAL;
- else
- status = ipq_set_verdict(&pmsg->msg.verdict,
- len - sizeof(*pmsg));
- break;
- default:
- status = -EINVAL;
- }
- return status;
-}
-
-static int
-dev_cmp(struct ipq_queue_entry *entry, unsigned long ifindex)
-{
- if (entry->info->indev)
- if (entry->info->indev->ifindex == ifindex)
- return 1;
-
- if (entry->info->outdev)
- if (entry->info->outdev->ifindex == ifindex)
- return 1;
-
- return 0;
-}
-
-static void
-ipq_dev_drop(int ifindex)
-{
- struct ipq_queue_entry *entry;
-
- while ((entry = ipq_find_dequeue_entry(dev_cmp, ifindex)) != NULL)
- ipq_issue_verdict(entry, NF_DROP);
-}
-
-#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0)
-
-static inline void
-ipq_rcv_skb(struct sk_buff *skb)
-{
- int status, type, pid, flags, nlmsglen, skblen;
- struct nlmsghdr *nlh;
-
- skblen = skb->len;
- if (skblen < sizeof(*nlh))
- return;
-
- nlh = (struct nlmsghdr *)skb->data;
- nlmsglen = nlh->nlmsg_len;
- if (nlmsglen < sizeof(*nlh) || skblen < nlmsglen)
- return;
-
- pid = nlh->nlmsg_pid;
- flags = nlh->nlmsg_flags;
-
- if(pid <= 0 || !(flags & NLM_F_REQUEST) || flags & NLM_F_MULTI)
- RCV_SKB_FAIL(-EINVAL);
-
- if (flags & MSG_TRUNC)
- RCV_SKB_FAIL(-ECOMM);
-
- type = nlh->nlmsg_type;
- if (type < NLMSG_NOOP || type >= IPQM_MAX)
- RCV_SKB_FAIL(-EINVAL);
-
- if (type <= IPQM_BASE)
- return;
-
- if (security_netlink_recv(skb, CAP_NET_ADMIN))
- RCV_SKB_FAIL(-EPERM);
-
- write_lock_bh(&queue_lock);
-
- if (peer_pid) {
- if (peer_pid != pid) {
- write_unlock_bh(&queue_lock);
- RCV_SKB_FAIL(-EBUSY);
- }
- } else {
- net_enable_timestamp();
- peer_pid = pid;
- }
-
- write_unlock_bh(&queue_lock);
-
- status = ipq_receive_peer(NLMSG_DATA(nlh), type,
- nlmsglen - NLMSG_LENGTH(0));
- if (status < 0)
- RCV_SKB_FAIL(status);
-
- if (flags & NLM_F_ACK)
- netlink_ack(skb, nlh, 0);
- return;
-}
-
-static void
-ipq_rcv_sk(struct sock *sk, int len)
-{
- struct sk_buff *skb;
- unsigned int qlen;
-
- mutex_lock(&ipqnl_mutex);
-
- for (qlen = skb_queue_len(&sk->sk_receive_queue); qlen; qlen--) {
- skb = skb_dequeue(&sk->sk_receive_queue);
- ipq_rcv_skb(skb);
- kfree_skb(skb);
- }
-
- mutex_unlock(&ipqnl_mutex);
-}
-
-static int
-ipq_rcv_dev_event(struct notifier_block *this,
- unsigned long event, void *ptr)
-{
- struct net_device *dev = ptr;
-
- /* Drop any packets associated with the downed device */
- if (event == NETDEV_DOWN)
- ipq_dev_drop(dev->ifindex);
- return NOTIFY_DONE;
-}
-
-static struct notifier_block ipq_dev_notifier = {
- .notifier_call = ipq_rcv_dev_event,
-};
-
-static int
-ipq_rcv_nl_event(struct notifier_block *this,
- unsigned long event, void *ptr)
-{
- struct netlink_notify *n = ptr;
-
- if (event == NETLINK_URELEASE &&
- n->protocol == NETLINK_IP6_FW && n->pid) {
- write_lock_bh(&queue_lock);
- if (n->pid == peer_pid)
- __ipq_reset();
- write_unlock_bh(&queue_lock);
- }
- return NOTIFY_DONE;
-}
-
-static struct notifier_block ipq_nl_notifier = {
- .notifier_call = ipq_rcv_nl_event,
-};
-
-static struct ctl_table_header *ipq_sysctl_header;
-
-static ctl_table ipq_table[] = {
- {
- .ctl_name = NET_IPQ_QMAX,
- .procname = NET_IPQ_QMAX_NAME,
- .data = &queue_maxlen,
- .maxlen = sizeof(queue_maxlen),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- { .ctl_name = 0 }
-};
-
-static ctl_table ipq_dir_table[] = {
- {
- .ctl_name = NET_IPV6,
- .procname = "ipv6",
- .mode = 0555,
- .child = ipq_table
- },
- { .ctl_name = 0 }
-};
-
-static ctl_table ipq_root_table[] = {
- {
- .ctl_name = CTL_NET,
- .procname = "net",
- .mode = 0555,
- .child = ipq_dir_table
- },
- { .ctl_name = 0 }
-};
-
-static int
-ipq_get_info(char *buffer, char **start, off_t offset, int length)
-{
- int len;
-
- read_lock_bh(&queue_lock);
-
- len = sprintf(buffer,
- "Peer PID : %d\n"
- "Copy mode : %hu\n"
- "Copy range : %u\n"
- "Queue length : %u\n"
- "Queue max. length : %u\n"
- "Queue dropped : %u\n"
- "Netfilter dropped : %u\n",
- peer_pid,
- copy_mode,
- copy_range,
- queue_total,
- queue_maxlen,
- queue_dropped,
- queue_user_dropped);
-
- read_unlock_bh(&queue_lock);
-
- *start = buffer + offset;
- len -= offset;
- if (len > length)
- len = length;
- else if (len < 0)
- len = 0;
- return len;
-}
-
-static struct nf_queue_handler nfqh = {
- .name = "ip6_queue",
- .outfn = &ipq_enqueue_packet,
-};
-
-static int __init ip6_queue_init(void)
-{
- int status = -ENOMEM;
- struct proc_dir_entry *proc;
-
- netlink_register_notifier(&ipq_nl_notifier);
- ipqnl = netlink_kernel_create(NETLINK_IP6_FW, 0, ipq_rcv_sk,
- THIS_MODULE);
- if (ipqnl == NULL) {
- printk(KERN_ERR "ip6_queue: failed to create netlink socket\n");
- goto cleanup_netlink_notifier;
- }
-
- proc = proc_net_create(IPQ_PROC_FS_NAME, 0, ipq_get_info);
- if (proc)
- proc->owner = THIS_MODULE;
- else {
- printk(KERN_ERR "ip6_queue: failed to create proc entry\n");
- goto cleanup_ipqnl;
- }
-
- register_netdevice_notifier(&ipq_dev_notifier);
- ipq_sysctl_header = register_sysctl_table(ipq_root_table, 0);
-
- status = nf_register_queue_handler(PF_INET6, &nfqh);
- if (status < 0) {
- printk(KERN_ERR "ip6_queue: failed to register queue handler\n");
- goto cleanup_sysctl;
- }
- return status;
-
-cleanup_sysctl:
- unregister_sysctl_table(ipq_sysctl_header);
- unregister_netdevice_notifier(&ipq_dev_notifier);
- proc_net_remove(IPQ_PROC_FS_NAME);
-
-cleanup_ipqnl:
- sock_release(ipqnl->sk_socket);
- mutex_lock(&ipqnl_mutex);
- mutex_unlock(&ipqnl_mutex);
-
-cleanup_netlink_notifier:
- netlink_unregister_notifier(&ipq_nl_notifier);
- return status;
-}
-
-static void __exit ip6_queue_fini(void)
-{
- nf_unregister_queue_handlers(&nfqh);
- synchronize_net();
- ipq_flush(NF_DROP);
-
- unregister_sysctl_table(ipq_sysctl_header);
- unregister_netdevice_notifier(&ipq_dev_notifier);
- proc_net_remove(IPQ_PROC_FS_NAME);
-
- sock_release(ipqnl->sk_socket);
- mutex_lock(&ipqnl_mutex);
- mutex_unlock(&ipqnl_mutex);
-
- netlink_unregister_notifier(&ipq_nl_notifier);
-}
-
-MODULE_DESCRIPTION("IPv6 packet queue handler");
-MODULE_LICENSE("GPL");
-
-module_init(ip6_queue_init);
-module_exit(ip6_queue_fini);
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index f63fb86d7c7b..d585ac3c1113 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -1,23 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Packet matching code.
*
* Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
* Copyright (C) 2000-2005 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * 19 Jan 2002 Harald Welte <laforge@gnumonks.org>
- * - increase module usage count as soon as we have rules inside
- * a table
- * 06 Jun 2002 Andras Kis-Szabo <kisza@sch.bme.hu>
- * - new extension header parser code
- * 15 Oct 2005 Harald Welte <laforge@netfilter.org>
- * - Unification of {ip,ip6}_tables into x_tables
- * - Removed tcp and udp code, since it's not ipv6 specific
+ * Copyright (c) 2006-2010 Patrick McHardy <kaber@trash.net>
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
#include <linux/capability.h>
#include <linux/in.h>
#include <linux/skbuff.h>
@@ -26,393 +18,381 @@
#include <linux/netdevice.h>
#include <linux/module.h>
#include <linux/poison.h>
-#include <linux/icmpv6.h>
#include <net/ipv6.h>
-#include <asm/uaccess.h>
+#include <net/compat.h>
+#include <linux/uaccess.h>
#include <linux/mutex.h>
#include <linux/proc_fs.h>
+#include <linux/err.h>
#include <linux/cpumask.h>
#include <linux/netfilter_ipv6/ip6_tables.h>
#include <linux/netfilter/x_tables.h>
+#include <net/netfilter/nf_log.h>
+#include "../../netfilter/xt_repldata.h"
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
MODULE_DESCRIPTION("IPv6 packet filter");
-#define IPV6_HDR_LEN (sizeof(struct ipv6hdr))
-#define IPV6_OPTHDR_LEN (sizeof(struct ipv6_opt_hdr))
-
-/*#define DEBUG_IP_FIREWALL*/
-/*#define DEBUG_ALLOW_ALL*/ /* Useful for remote debugging */
-/*#define DEBUG_IP_FIREWALL_USER*/
-
-#ifdef DEBUG_IP_FIREWALL
-#define dprintf(format, args...) printk(format , ## args)
-#else
-#define dprintf(format, args...)
-#endif
-
-#ifdef DEBUG_IP_FIREWALL_USER
-#define duprintf(format, args...) printk(format , ## args)
-#else
-#define duprintf(format, args...)
-#endif
-
-#ifdef CONFIG_NETFILTER_DEBUG
-#define IP_NF_ASSERT(x) \
-do { \
- if (!(x)) \
- printk("IP_NF_ASSERT: %s:%s:%u\n", \
- __FUNCTION__, __FILE__, __LINE__); \
-} while(0)
-#else
-#define IP_NF_ASSERT(x)
-#endif
-
-#if 0
-/* All the better to debug you with... */
-#define static
-#define inline
-#endif
-
-/*
- We keep a set of rules for each CPU, so we can avoid write-locking
- them in the softirq when updating the counters and therefore
- only need to read-lock in the softirq; doing a write_lock_bh() in user
- context stops packets coming through and allows user context to read
- the counters or update the rules.
-
- Hence the start of any table is given by get_table() below. */
-
-#if 0
-#define down(x) do { printk("DOWN:%u:" #x "\n", __LINE__); down(x); } while(0)
-#define down_interruptible(x) ({ int __r; printk("DOWNi:%u:" #x "\n", __LINE__); __r = down_interruptible(x); if (__r != 0) printk("ABORT-DOWNi:%u\n", __LINE__); __r; })
-#define up(x) do { printk("UP:%u:" #x "\n", __LINE__); up(x); } while(0)
-#endif
-
-/* Check for an extension */
-int
-ip6t_ext_hdr(u8 nexthdr)
+void *ip6t_alloc_initial_table(const struct xt_table *info)
{
- return ( (nexthdr == IPPROTO_HOPOPTS) ||
- (nexthdr == IPPROTO_ROUTING) ||
- (nexthdr == IPPROTO_FRAGMENT) ||
- (nexthdr == IPPROTO_ESP) ||
- (nexthdr == IPPROTO_AH) ||
- (nexthdr == IPPROTO_NONE) ||
- (nexthdr == IPPROTO_DSTOPTS) );
+ return xt_alloc_initial_table(ip6t, IP6T);
}
+EXPORT_SYMBOL_GPL(ip6t_alloc_initial_table);
/* Returns whether matches rule or not. */
-static inline int
+/* Performance critical - called for every packet */
+static inline bool
ip6_packet_match(const struct sk_buff *skb,
const char *indev,
const char *outdev,
const struct ip6t_ip6 *ip6info,
unsigned int *protoff,
- int *fragoff, int *hotdrop)
+ u16 *fragoff, bool *hotdrop)
{
- size_t i;
unsigned long ret;
- const struct ipv6hdr *ipv6 = skb->nh.ipv6h;
-
-#define FWINV(bool,invflg) ((bool) ^ !!(ip6info->invflags & invflg))
+ const struct ipv6hdr *ipv6 = ipv6_hdr(skb);
- if (FWINV(ipv6_masked_addr_cmp(&ipv6->saddr, &ip6info->smsk,
- &ip6info->src), IP6T_INV_SRCIP)
- || FWINV(ipv6_masked_addr_cmp(&ipv6->daddr, &ip6info->dmsk,
- &ip6info->dst), IP6T_INV_DSTIP)) {
- dprintf("Source or dest mismatch.\n");
-/*
- dprintf("SRC: %u. Mask: %u. Target: %u.%s\n", ip->saddr,
- ipinfo->smsk.s_addr, ipinfo->src.s_addr,
- ipinfo->invflags & IP6T_INV_SRCIP ? " (INV)" : "");
- dprintf("DST: %u. Mask: %u. Target: %u.%s\n", ip->daddr,
- ipinfo->dmsk.s_addr, ipinfo->dst.s_addr,
- ipinfo->invflags & IP6T_INV_DSTIP ? " (INV)" : "");*/
- return 0;
- }
+ if (NF_INVF(ip6info, IP6T_INV_SRCIP,
+ ipv6_masked_addr_cmp(&ipv6->saddr, &ip6info->smsk,
+ &ip6info->src)) ||
+ NF_INVF(ip6info, IP6T_INV_DSTIP,
+ ipv6_masked_addr_cmp(&ipv6->daddr, &ip6info->dmsk,
+ &ip6info->dst)))
+ return false;
- /* Look for ifname matches; this should unroll nicely. */
- for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned long); i++) {
- ret |= (((const unsigned long *)indev)[i]
- ^ ((const unsigned long *)ip6info->iniface)[i])
- & ((const unsigned long *)ip6info->iniface_mask)[i];
- }
+ ret = ifname_compare_aligned(indev, ip6info->iniface, ip6info->iniface_mask);
- if (FWINV(ret != 0, IP6T_INV_VIA_IN)) {
- dprintf("VIA in mismatch (%s vs %s).%s\n",
- indev, ip6info->iniface,
- ip6info->invflags&IP6T_INV_VIA_IN ?" (INV)":"");
- return 0;
- }
+ if (NF_INVF(ip6info, IP6T_INV_VIA_IN, ret != 0))
+ return false;
- for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned long); i++) {
- ret |= (((const unsigned long *)outdev)[i]
- ^ ((const unsigned long *)ip6info->outiface)[i])
- & ((const unsigned long *)ip6info->outiface_mask)[i];
- }
+ ret = ifname_compare_aligned(outdev, ip6info->outiface, ip6info->outiface_mask);
- if (FWINV(ret != 0, IP6T_INV_VIA_OUT)) {
- dprintf("VIA out mismatch (%s vs %s).%s\n",
- outdev, ip6info->outiface,
- ip6info->invflags&IP6T_INV_VIA_OUT ?" (INV)":"");
- return 0;
- }
+ if (NF_INVF(ip6info, IP6T_INV_VIA_OUT, ret != 0))
+ return false;
/* ... might want to do something with class and flowlabel here ... */
/* look for the desired protocol header */
- if((ip6info->flags & IP6T_F_PROTO)) {
+ if (ip6info->flags & IP6T_F_PROTO) {
int protohdr;
unsigned short _frag_off;
- protohdr = ipv6_find_hdr(skb, protoff, -1, &_frag_off);
+ protohdr = ipv6_find_hdr(skb, protoff, -1, &_frag_off, NULL);
if (protohdr < 0) {
if (_frag_off == 0)
- *hotdrop = 1;
- return 0;
+ *hotdrop = true;
+ return false;
}
*fragoff = _frag_off;
- dprintf("Packet protocol %hi ?= %s%hi.\n",
- protohdr,
- ip6info->invflags & IP6T_INV_PROTO ? "!":"",
- ip6info->proto);
-
if (ip6info->proto == protohdr) {
- if(ip6info->invflags & IP6T_INV_PROTO) {
- return 0;
- }
- return 1;
+ if (ip6info->invflags & IP6T_INV_PROTO)
+ return false;
+
+ return true;
}
/* We need match for the '-p all', too! */
if ((ip6info->proto != 0) &&
!(ip6info->invflags & IP6T_INV_PROTO))
- return 0;
+ return false;
}
- return 1;
+ return true;
}
/* should be ip6 safe */
-static inline int
+static bool
ip6_checkentry(const struct ip6t_ip6 *ipv6)
{
- if (ipv6->flags & ~IP6T_F_MASK) {
- duprintf("Unknown flag bits set: %08X\n",
- ipv6->flags & ~IP6T_F_MASK);
- return 0;
- }
- if (ipv6->invflags & ~IP6T_INV_MASK) {
- duprintf("Unknown invflag bits set: %08X\n",
- ipv6->invflags & ~IP6T_INV_MASK);
- return 0;
- }
- return 1;
+ if (ipv6->flags & ~IP6T_F_MASK)
+ return false;
+ if (ipv6->invflags & ~IP6T_INV_MASK)
+ return false;
+
+ return true;
}
static unsigned int
-ip6t_error(struct sk_buff **pskb,
- const struct net_device *in,
- const struct net_device *out,
- unsigned int hooknum,
- const struct xt_target *target,
- const void *targinfo)
+ip6t_error(struct sk_buff *skb, const struct xt_action_param *par)
{
- if (net_ratelimit())
- printk("ip6_tables: error: `%s'\n", (char *)targinfo);
+ net_info_ratelimited("error: `%s'\n", (const char *)par->targinfo);
return NF_DROP;
}
-static inline
-int do_match(struct ip6t_entry_match *m,
- const struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- int offset,
- unsigned int protoff,
- int *hotdrop)
+static inline struct ip6t_entry *
+get_entry(const void *base, unsigned int offset)
+{
+ return (struct ip6t_entry *)(base + offset);
+}
+
+/* All zeroes == unconditional rule. */
+/* Mildly perf critical (only if packet tracing is on) */
+static inline bool unconditional(const struct ip6t_entry *e)
+{
+ static const struct ip6t_ip6 uncond;
+
+ return e->target_offset == sizeof(struct ip6t_entry) &&
+ memcmp(&e->ipv6, &uncond, sizeof(uncond)) == 0;
+}
+
+static inline const struct xt_entry_target *
+ip6t_get_target_c(const struct ip6t_entry *e)
+{
+ return ip6t_get_target((struct ip6t_entry *)e);
+}
+
+#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
+/* This cries for unification! */
+static const char *const hooknames[] = {
+ [NF_INET_PRE_ROUTING] = "PREROUTING",
+ [NF_INET_LOCAL_IN] = "INPUT",
+ [NF_INET_FORWARD] = "FORWARD",
+ [NF_INET_LOCAL_OUT] = "OUTPUT",
+ [NF_INET_POST_ROUTING] = "POSTROUTING",
+};
+
+enum nf_ip_trace_comments {
+ NF_IP6_TRACE_COMMENT_RULE,
+ NF_IP6_TRACE_COMMENT_RETURN,
+ NF_IP6_TRACE_COMMENT_POLICY,
+};
+
+static const char *const comments[] = {
+ [NF_IP6_TRACE_COMMENT_RULE] = "rule",
+ [NF_IP6_TRACE_COMMENT_RETURN] = "return",
+ [NF_IP6_TRACE_COMMENT_POLICY] = "policy",
+};
+
+static const struct nf_loginfo trace_loginfo = {
+ .type = NF_LOG_TYPE_LOG,
+ .u = {
+ .log = {
+ .level = LOGLEVEL_WARNING,
+ .logflags = NF_LOG_DEFAULT_MASK,
+ },
+ },
+};
+
+/* Mildly perf critical (only if packet tracing is on) */
+static inline int
+get_chainname_rulenum(const struct ip6t_entry *s, const struct ip6t_entry *e,
+ const char *hookname, const char **chainname,
+ const char **comment, unsigned int *rulenum)
{
- /* Stop iteration if it doesn't match */
- if (!m->u.kernel.match->match(skb, in, out, m->u.kernel.match, m->data,
- offset, protoff, hotdrop))
+ const struct xt_standard_target *t = (void *)ip6t_get_target_c(s);
+
+ if (strcmp(t->target.u.kernel.target->name, XT_ERROR_TARGET) == 0) {
+ /* Head of user chain: ERROR target with chainname */
+ *chainname = t->target.data;
+ (*rulenum) = 0;
+ } else if (s == e) {
+ (*rulenum)++;
+
+ if (unconditional(s) &&
+ strcmp(t->target.u.kernel.target->name,
+ XT_STANDARD_TARGET) == 0 &&
+ t->verdict < 0) {
+ /* Tail of chains: STANDARD target (return/policy) */
+ *comment = *chainname == hookname
+ ? comments[NF_IP6_TRACE_COMMENT_POLICY]
+ : comments[NF_IP6_TRACE_COMMENT_RETURN];
+ }
return 1;
- else
- return 0;
+ } else
+ (*rulenum)++;
+
+ return 0;
+}
+
+static void trace_packet(struct net *net,
+ const struct sk_buff *skb,
+ unsigned int hook,
+ const struct net_device *in,
+ const struct net_device *out,
+ const char *tablename,
+ const struct xt_table_info *private,
+ const struct ip6t_entry *e)
+{
+ const struct ip6t_entry *root;
+ const char *hookname, *chainname, *comment;
+ const struct ip6t_entry *iter;
+ unsigned int rulenum = 0;
+
+ root = get_entry(private->entries, private->hook_entry[hook]);
+
+ hookname = chainname = hooknames[hook];
+ comment = comments[NF_IP6_TRACE_COMMENT_RULE];
+
+ xt_entry_foreach(iter, root, private->size - private->hook_entry[hook])
+ if (get_chainname_rulenum(iter, e, hookname,
+ &chainname, &comment, &rulenum) != 0)
+ break;
+
+ nf_log_trace(net, AF_INET6, hook, skb, in, out, &trace_loginfo,
+ "TRACE: %s:%s:%s:%u ",
+ tablename, chainname, comment, rulenum);
}
+#endif
static inline struct ip6t_entry *
-get_entry(void *base, unsigned int offset)
+ip6t_next_entry(const struct ip6t_entry *entry)
{
- return (struct ip6t_entry *)(base + offset);
+ return (void *)entry + entry->next_offset;
}
/* Returns one of the generic firewall policies, like NF_ACCEPT. */
unsigned int
-ip6t_do_table(struct sk_buff **pskb,
- unsigned int hook,
- const struct net_device *in,
- const struct net_device *out,
- struct xt_table *table)
+ip6t_do_table(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *state)
{
+ const struct xt_table *table = priv;
+ unsigned int hook = state->hook;
static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long))));
- int offset = 0;
- unsigned int protoff = 0;
- int hotdrop = 0;
/* Initializing verdict to NF_DROP keeps gcc happy. */
unsigned int verdict = NF_DROP;
const char *indev, *outdev;
- void *table_base;
- struct ip6t_entry *e, *back;
- struct xt_table_info *private;
+ const void *table_base;
+ struct ip6t_entry *e, **jumpstack;
+ unsigned int stackidx, cpu;
+ const struct xt_table_info *private;
+ struct xt_action_param acpar;
+ unsigned int addend;
/* Initialization */
- indev = in ? in->name : nulldevname;
- outdev = out ? out->name : nulldevname;
+ stackidx = 0;
+ indev = state->in ? state->in->name : nulldevname;
+ outdev = state->out ? state->out->name : nulldevname;
/* We handle fragments by dealing with the first fragment as
* if it was a normal packet. All other fragments are treated
* normally, except that they will NEVER match rules that ask
* things we don't know, ie. tcp syn flag or ports). If the
* rule is also a fragment-specific rule, non-fragments won't
* match it. */
+ acpar.fragoff = 0;
+ acpar.hotdrop = false;
+ acpar.state = state;
+
+ WARN_ON(!(table->valid_hooks & (1 << hook)));
+
+ local_bh_disable();
+ addend = xt_write_recseq_begin();
+ private = READ_ONCE(table->private); /* Address dependency. */
+ cpu = smp_processor_id();
+ table_base = private->entries;
+ jumpstack = (struct ip6t_entry **)private->jumpstack[cpu];
+
+ /* Switch to alternate jumpstack if we're being invoked via TEE.
+ * TEE issues XT_CONTINUE verdict on original skb so we must not
+ * clobber the jumpstack.
+ *
+ * For recursion via REJECT or SYNPROXY the stack will be clobbered
+ * but it is no problem since absolute verdict is issued by these.
+ */
+ if (static_key_false(&xt_tee_enabled))
+ jumpstack += private->stacksize * current->in_nf_duplicate;
- read_lock_bh(&table->lock);
- private = table->private;
- IP_NF_ASSERT(table->valid_hooks & (1 << hook));
- table_base = (void *)private->entries[smp_processor_id()];
e = get_entry(table_base, private->hook_entry[hook]);
- /* For return from builtin chain */
- back = get_entry(table_base, private->underflow[hook]);
-
do {
- IP_NF_ASSERT(e);
- IP_NF_ASSERT(back);
- if (ip6_packet_match(*pskb, indev, outdev, &e->ipv6,
- &protoff, &offset, &hotdrop)) {
- struct ip6t_entry_target *t;
-
- if (IP6T_MATCH_ITERATE(e, do_match,
- *pskb, in, out,
- offset, protoff, &hotdrop) != 0)
+ const struct xt_entry_target *t;
+ const struct xt_entry_match *ematch;
+ struct xt_counters *counter;
+
+ WARN_ON(!e);
+ acpar.thoff = 0;
+ if (!ip6_packet_match(skb, indev, outdev, &e->ipv6,
+ &acpar.thoff, &acpar.fragoff, &acpar.hotdrop)) {
+ no_match:
+ e = ip6t_next_entry(e);
+ continue;
+ }
+
+ xt_ematch_foreach(ematch, e) {
+ acpar.match = ematch->u.kernel.match;
+ acpar.matchinfo = ematch->data;
+ if (!acpar.match->match(skb, &acpar))
goto no_match;
+ }
- ADD_COUNTER(e->counters,
- ntohs((*pskb)->nh.ipv6h->payload_len)
- + IPV6_HDR_LEN,
- 1);
-
- t = ip6t_get_target(e);
- IP_NF_ASSERT(t->u.kernel.target);
- /* Standard target? */
- if (!t->u.kernel.target->target) {
- int v;
-
- v = ((struct ip6t_standard_target *)t)->verdict;
- if (v < 0) {
- /* Pop from stack? */
- if (v != IP6T_RETURN) {
- verdict = (unsigned)(-v) - 1;
- break;
- }
- e = back;
- back = get_entry(table_base,
- back->comefrom);
- continue;
- }
- if (table_base + v != (void *)e + e->next_offset
- && !(e->ipv6.flags & IP6T_F_GOTO)) {
- /* Save old back ptr in next entry */
- struct ip6t_entry *next
- = (void *)e + e->next_offset;
- next->comefrom
- = (void *)back - table_base;
- /* set back pointer to next entry */
- back = next;
- }
+ counter = xt_get_this_cpu_counter(&e->counters);
+ ADD_COUNTER(*counter, skb->len, 1);
- e = get_entry(table_base, v);
- } else {
- /* Targets which reenter must return
- abs. verdicts */
-#ifdef CONFIG_NETFILTER_DEBUG
- ((struct ip6t_entry *)table_base)->comefrom
- = 0xeeeeeeec;
+ t = ip6t_get_target_c(e);
+ WARN_ON(!t->u.kernel.target);
+
+#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
+ /* The packet is traced: log it */
+ if (unlikely(skb->nf_trace))
+ trace_packet(state->net, skb, hook, state->in,
+ state->out, table->name, private, e);
#endif
- verdict = t->u.kernel.target->target(pskb,
- in, out,
- hook,
- t->u.kernel.target,
- t->data);
-
-#ifdef CONFIG_NETFILTER_DEBUG
- if (((struct ip6t_entry *)table_base)->comefrom
- != 0xeeeeeeec
- && verdict == IP6T_CONTINUE) {
- printk("Target %s reentered!\n",
- t->u.kernel.target->name);
- verdict = NF_DROP;
+ /* Standard target? */
+ if (!t->u.kernel.target->target) {
+ int v;
+
+ v = ((struct xt_standard_target *)t)->verdict;
+ if (v < 0) {
+ /* Pop from stack? */
+ if (v != XT_RETURN) {
+ verdict = (unsigned int)(-v) - 1;
+ break;
}
- ((struct ip6t_entry *)table_base)->comefrom
- = 0x57acc001;
-#endif
- if (verdict == IP6T_CONTINUE)
- e = (void *)e + e->next_offset;
+ if (stackidx == 0)
+ e = get_entry(table_base,
+ private->underflow[hook]);
else
- /* Verdict */
+ e = ip6t_next_entry(jumpstack[--stackidx]);
+ continue;
+ }
+ if (table_base + v != ip6t_next_entry(e) &&
+ !(e->ipv6.flags & IP6T_F_GOTO)) {
+ if (unlikely(stackidx >= private->stacksize)) {
+ verdict = NF_DROP;
break;
+ }
+ jumpstack[stackidx++] = e;
}
- } else {
- no_match:
- e = (void *)e + e->next_offset;
+ e = get_entry(table_base, v);
+ continue;
}
- } while (!hotdrop);
-
-#ifdef CONFIG_NETFILTER_DEBUG
- ((struct ip6t_entry *)table_base)->comefrom = NETFILTER_LINK_POISON;
-#endif
- read_unlock_bh(&table->lock);
-
-#ifdef DEBUG_ALLOW_ALL
- return NF_ACCEPT;
-#else
- if (hotdrop)
- return NF_DROP;
- else return verdict;
-#endif
-}
-/* All zeroes == unconditional rule. */
-static inline int
-unconditional(const struct ip6t_ip6 *ipv6)
-{
- unsigned int i;
+ acpar.target = t->u.kernel.target;
+ acpar.targinfo = t->data;
- for (i = 0; i < sizeof(*ipv6); i++)
- if (((char *)ipv6)[i])
+ verdict = t->u.kernel.target->target(skb, &acpar);
+ if (verdict == XT_CONTINUE)
+ e = ip6t_next_entry(e);
+ else
+ /* Verdict */
break;
+ } while (!acpar.hotdrop);
- return (i == sizeof(*ipv6));
+ xt_write_recseq_end(addend);
+ local_bh_enable();
+
+ if (acpar.hotdrop)
+ return NF_DROP;
+ else return verdict;
}
/* Figures out from what hook each rule can be called: returns 0 if
there are loops. Puts hook bitmask in comefrom. */
static int
-mark_source_chains(struct xt_table_info *newinfo,
- unsigned int valid_hooks, void *entry0)
+mark_source_chains(const struct xt_table_info *newinfo,
+ unsigned int valid_hooks, void *entry0,
+ unsigned int *offsets)
{
unsigned int hook;
/* No recursion; use packet counter to save back ptrs (reset
to 0 as we leave), and comefrom to save source hook bitmask */
- for (hook = 0; hook < NF_IP6_NUMHOOKS; hook++) {
+ for (hook = 0; hook < NF_INET_NUMHOOKS; hook++) {
unsigned int pos = newinfo->hook_entry[hook];
- struct ip6t_entry *e
- = (struct ip6t_entry *)(entry0 + pos);
+ struct ip6t_entry *e = entry0 + pos;
if (!(valid_hooks & (1 << hook)))
continue;
@@ -421,38 +401,26 @@ mark_source_chains(struct xt_table_info *newinfo,
e->counters.pcnt = pos;
for (;;) {
- struct ip6t_standard_target *t
- = (void *)ip6t_get_target(e);
+ const struct xt_standard_target *t
+ = (void *)ip6t_get_target_c(e);
+ int visited = e->comefrom & (1 << hook);
- if (e->comefrom & (1 << NF_IP6_NUMHOOKS)) {
- printk("iptables: loop hook %u pos %u %08X.\n",
- hook, pos, e->comefrom);
+ if (e->comefrom & (1 << NF_INET_NUMHOOKS))
return 0;
- }
- e->comefrom
- |= ((1 << hook) | (1 << NF_IP6_NUMHOOKS));
+
+ e->comefrom |= ((1 << hook) | (1 << NF_INET_NUMHOOKS));
/* Unconditional return/END. */
- if (e->target_offset == sizeof(struct ip6t_entry)
- && (strcmp(t->target.u.user.name,
- IP6T_STANDARD_TARGET) == 0)
- && t->verdict < 0
- && unconditional(&e->ipv6)) {
+ if ((unconditional(e) &&
+ (strcmp(t->target.u.user.name,
+ XT_STANDARD_TARGET) == 0) &&
+ t->verdict < 0) || visited) {
unsigned int oldpos, size;
/* Return: backtrack through the last
big jump. */
do {
- e->comefrom ^= (1<<NF_IP6_NUMHOOKS);
-#ifdef DEBUG_IP_FIREWALL_USER
- if (e->comefrom
- & (1 << NF_IP6_NUMHOOKS)) {
- duprintf("Back unset "
- "on hook %u "
- "rule %u\n",
- hook, pos);
- }
-#endif
+ e->comefrom ^= (1<<NF_INET_NUMHOOKS);
oldpos = pos;
pos = e->counters.pcnt;
e->counters.pcnt = 0;
@@ -461,425 +429,429 @@ mark_source_chains(struct xt_table_info *newinfo,
if (pos == oldpos)
goto next;
- e = (struct ip6t_entry *)
- (entry0 + pos);
+ e = entry0 + pos;
} while (oldpos == pos + e->next_offset);
/* Move along one */
size = e->next_offset;
- e = (struct ip6t_entry *)
- (entry0 + pos + size);
+ e = entry0 + pos + size;
+ if (pos + size >= newinfo->size)
+ return 0;
e->counters.pcnt = pos;
pos += size;
} else {
int newpos = t->verdict;
if (strcmp(t->target.u.user.name,
- IP6T_STANDARD_TARGET) == 0
- && newpos >= 0) {
+ XT_STANDARD_TARGET) == 0 &&
+ newpos >= 0) {
/* This a jump; chase it. */
- duprintf("Jump rule %u -> %u\n",
- pos, newpos);
+ if (!xt_find_jump_offset(offsets, newpos,
+ newinfo->number))
+ return 0;
} else {
/* ... this is a fallthru */
newpos = pos + e->next_offset;
+ if (newpos >= newinfo->size)
+ return 0;
}
- e = (struct ip6t_entry *)
- (entry0 + newpos);
+ e = entry0 + newpos;
e->counters.pcnt = pos;
pos = newpos;
}
}
- next:
- duprintf("Finished chain %u\n", hook);
+next: ;
}
return 1;
}
-static inline int
-cleanup_match(struct ip6t_entry_match *m, unsigned int *i)
+static void cleanup_match(struct xt_entry_match *m, struct net *net)
{
- if (i && (*i)-- == 0)
- return 1;
-
- if (m->u.kernel.match->destroy)
- m->u.kernel.match->destroy(m->u.kernel.match, m->data);
- module_put(m->u.kernel.match->me);
- return 0;
+ struct xt_mtdtor_param par;
+
+ par.net = net;
+ par.match = m->u.kernel.match;
+ par.matchinfo = m->data;
+ par.family = NFPROTO_IPV6;
+ if (par.match->destroy != NULL)
+ par.match->destroy(&par);
+ module_put(par.match->me);
}
-static inline int
-standard_check(const struct ip6t_entry_target *t,
- unsigned int max_offset)
+static int check_match(struct xt_entry_match *m, struct xt_mtchk_param *par)
{
- struct ip6t_standard_target *targ = (void *)t;
+ const struct ip6t_ip6 *ipv6 = par->entryinfo;
- /* Check standard info. */
- if (targ->verdict >= 0
- && targ->verdict > max_offset - sizeof(struct ip6t_entry)) {
- duprintf("ip6t_standard_check: bad verdict (%i)\n",
- targ->verdict);
- return 0;
- }
- if (targ->verdict < -NF_MAX_VERDICT - 1) {
- duprintf("ip6t_standard_check: bad negative verdict (%i)\n",
- targ->verdict);
- return 0;
- }
- return 1;
+ par->match = m->u.kernel.match;
+ par->matchinfo = m->data;
+
+ return xt_check_match(par, m->u.match_size - sizeof(*m),
+ ipv6->proto, ipv6->invflags & IP6T_INV_PROTO);
}
-static inline int
-check_match(struct ip6t_entry_match *m,
- const char *name,
- const struct ip6t_ip6 *ipv6,
- unsigned int hookmask,
- unsigned int *i)
+static int
+find_check_match(struct xt_entry_match *m, struct xt_mtchk_param *par)
{
- struct ip6t_match *match;
+ struct xt_match *match;
int ret;
- match = try_then_request_module(xt_find_match(AF_INET6, m->u.user.name,
- m->u.user.revision),
- "ip6t_%s", m->u.user.name);
- if (IS_ERR(match) || !match) {
- duprintf("check_match: `%s' not found\n", m->u.user.name);
- return match ? PTR_ERR(match) : -ENOENT;
- }
+ match = xt_request_find_match(NFPROTO_IPV6, m->u.user.name,
+ m->u.user.revision);
+ if (IS_ERR(match))
+ return PTR_ERR(match);
+
m->u.kernel.match = match;
- ret = xt_check_match(match, AF_INET6, m->u.match_size - sizeof(*m),
- name, hookmask, ipv6->proto,
- ipv6->invflags & IP6T_INV_PROTO);
+ ret = check_match(m, par);
if (ret)
goto err;
- if (m->u.kernel.match->checkentry
- && !m->u.kernel.match->checkentry(name, ipv6, match, m->data,
- hookmask)) {
- duprintf("ip_tables: check failed for `%s'.\n",
- m->u.kernel.match->name);
- ret = -EINVAL;
- goto err;
- }
-
- (*i)++;
return 0;
err:
module_put(m->u.kernel.match->me);
return ret;
}
-static struct ip6t_target ip6t_standard_target;
+static int check_target(struct ip6t_entry *e, struct net *net, const char *name)
+{
+ struct xt_entry_target *t = ip6t_get_target(e);
+ struct xt_tgchk_param par = {
+ .net = net,
+ .table = name,
+ .entryinfo = e,
+ .target = t->u.kernel.target,
+ .targinfo = t->data,
+ .hook_mask = e->comefrom,
+ .family = NFPROTO_IPV6,
+ };
+
+ return xt_check_target(&par, t->u.target_size - sizeof(*t),
+ e->ipv6.proto,
+ e->ipv6.invflags & IP6T_INV_PROTO);
+}
-static inline int
-check_entry(struct ip6t_entry *e, const char *name, unsigned int size,
- unsigned int *i)
+static int
+find_check_entry(struct ip6t_entry *e, struct net *net, const char *name,
+ unsigned int size,
+ struct xt_percpu_counter_alloc_state *alloc_state)
{
- struct ip6t_entry_target *t;
- struct ip6t_target *target;
+ struct xt_entry_target *t;
+ struct xt_target *target;
int ret;
unsigned int j;
+ struct xt_mtchk_param mtpar;
+ struct xt_entry_match *ematch;
- if (!ip6_checkentry(&e->ipv6)) {
- duprintf("ip_tables: ip check failed %p %s.\n", e, name);
- return -EINVAL;
- }
-
- if (e->target_offset + sizeof(struct ip6t_entry_target) >
- e->next_offset)
- return -EINVAL;
+ if (!xt_percpu_counter_alloc(alloc_state, &e->counters))
+ return -ENOMEM;
j = 0;
- ret = IP6T_MATCH_ITERATE(e, check_match, name, &e->ipv6, e->comefrom, &j);
- if (ret != 0)
- goto cleanup_matches;
+ memset(&mtpar, 0, sizeof(mtpar));
+ mtpar.net = net;
+ mtpar.table = name;
+ mtpar.entryinfo = &e->ipv6;
+ mtpar.hook_mask = e->comefrom;
+ mtpar.family = NFPROTO_IPV6;
+ xt_ematch_foreach(ematch, e) {
+ ret = find_check_match(ematch, &mtpar);
+ if (ret != 0)
+ goto cleanup_matches;
+ ++j;
+ }
t = ip6t_get_target(e);
- ret = -EINVAL;
- if (e->target_offset + t->u.target_size > e->next_offset)
- goto cleanup_matches;
- target = try_then_request_module(xt_find_target(AF_INET6,
- t->u.user.name,
- t->u.user.revision),
- "ip6t_%s", t->u.user.name);
- if (IS_ERR(target) || !target) {
- duprintf("check_entry: `%s' not found\n", t->u.user.name);
- ret = target ? PTR_ERR(target) : -ENOENT;
+ target = xt_request_find_target(NFPROTO_IPV6, t->u.user.name,
+ t->u.user.revision);
+ if (IS_ERR(target)) {
+ ret = PTR_ERR(target);
goto cleanup_matches;
}
t->u.kernel.target = target;
- ret = xt_check_target(target, AF_INET6, t->u.target_size - sizeof(*t),
- name, e->comefrom, e->ipv6.proto,
- e->ipv6.invflags & IP6T_INV_PROTO);
+ ret = check_target(e, net, name);
if (ret)
goto err;
-
- if (t->u.kernel.target == &ip6t_standard_target) {
- if (!standard_check(t, size)) {
- ret = -EINVAL;
- goto err;
- }
- } else if (t->u.kernel.target->checkentry
- && !t->u.kernel.target->checkentry(name, e, target, t->data,
- e->comefrom)) {
- duprintf("ip_tables: check failed for `%s'.\n",
- t->u.kernel.target->name);
- ret = -EINVAL;
- goto err;
- }
-
- (*i)++;
return 0;
err:
module_put(t->u.kernel.target->me);
cleanup_matches:
- IP6T_MATCH_ITERATE(e, cleanup_match, &j);
+ xt_ematch_foreach(ematch, e) {
+ if (j-- == 0)
+ break;
+ cleanup_match(ematch, net);
+ }
+
+ xt_percpu_counter_free(&e->counters);
+
return ret;
}
-static inline int
+static bool check_underflow(const struct ip6t_entry *e)
+{
+ const struct xt_entry_target *t;
+ unsigned int verdict;
+
+ if (!unconditional(e))
+ return false;
+ t = ip6t_get_target_c(e);
+ if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0)
+ return false;
+ verdict = ((struct xt_standard_target *)t)->verdict;
+ verdict = -verdict - 1;
+ return verdict == NF_DROP || verdict == NF_ACCEPT;
+}
+
+static int
check_entry_size_and_hooks(struct ip6t_entry *e,
struct xt_table_info *newinfo,
- unsigned char *base,
- unsigned char *limit,
+ const unsigned char *base,
+ const unsigned char *limit,
const unsigned int *hook_entries,
const unsigned int *underflows,
- unsigned int *i)
+ unsigned int valid_hooks)
{
unsigned int h;
+ int err;
- if ((unsigned long)e % __alignof__(struct ip6t_entry) != 0
- || (unsigned char *)e + sizeof(struct ip6t_entry) >= limit) {
- duprintf("Bad offset %p\n", e);
+ if ((unsigned long)e % __alignof__(struct ip6t_entry) != 0 ||
+ (unsigned char *)e + sizeof(struct ip6t_entry) >= limit ||
+ (unsigned char *)e + e->next_offset > limit)
return -EINVAL;
- }
if (e->next_offset
- < sizeof(struct ip6t_entry) + sizeof(struct ip6t_entry_target)) {
- duprintf("checking: element %p size %u\n",
- e, e->next_offset);
+ < sizeof(struct ip6t_entry) + sizeof(struct xt_entry_target))
return -EINVAL;
- }
+
+ if (!ip6_checkentry(&e->ipv6))
+ return -EINVAL;
+
+ err = xt_check_entry_offsets(e, e->elems, e->target_offset,
+ e->next_offset);
+ if (err)
+ return err;
/* Check hooks & underflows */
- for (h = 0; h < NF_IP6_NUMHOOKS; h++) {
+ for (h = 0; h < NF_INET_NUMHOOKS; h++) {
+ if (!(valid_hooks & (1 << h)))
+ continue;
if ((unsigned char *)e - base == hook_entries[h])
newinfo->hook_entry[h] = hook_entries[h];
- if ((unsigned char *)e - base == underflows[h])
+ if ((unsigned char *)e - base == underflows[h]) {
+ if (!check_underflow(e))
+ return -EINVAL;
+
newinfo->underflow[h] = underflows[h];
+ }
}
- /* FIXME: underflows must be unconditional, standard verdicts
- < 0 (not IP6T_RETURN). --RR */
-
/* Clear counters and comefrom */
e->counters = ((struct xt_counters) { 0, 0 });
e->comefrom = 0;
-
- (*i)++;
return 0;
}
-static inline int
-cleanup_entry(struct ip6t_entry *e, unsigned int *i)
+static void cleanup_entry(struct ip6t_entry *e, struct net *net)
{
- struct ip6t_entry_target *t;
-
- if (i && (*i)-- == 0)
- return 1;
+ struct xt_tgdtor_param par;
+ struct xt_entry_target *t;
+ struct xt_entry_match *ematch;
/* Cleanup all matches */
- IP6T_MATCH_ITERATE(e, cleanup_match, NULL);
+ xt_ematch_foreach(ematch, e)
+ cleanup_match(ematch, net);
t = ip6t_get_target(e);
- if (t->u.kernel.target->destroy)
- t->u.kernel.target->destroy(t->u.kernel.target, t->data);
- module_put(t->u.kernel.target->me);
- return 0;
+
+ par.net = net;
+ par.target = t->u.kernel.target;
+ par.targinfo = t->data;
+ par.family = NFPROTO_IPV6;
+ if (par.target->destroy != NULL)
+ par.target->destroy(&par);
+ module_put(par.target->me);
+ xt_percpu_counter_free(&e->counters);
}
/* Checks and translates the user-supplied table segment (held in
newinfo) */
static int
-translate_table(const char *name,
- unsigned int valid_hooks,
- struct xt_table_info *newinfo,
- void *entry0,
- unsigned int size,
- unsigned int number,
- const unsigned int *hook_entries,
- const unsigned int *underflows)
+translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
+ const struct ip6t_replace *repl)
{
+ struct xt_percpu_counter_alloc_state alloc_state = { 0 };
+ struct ip6t_entry *iter;
+ unsigned int *offsets;
unsigned int i;
- int ret;
+ int ret = 0;
- newinfo->size = size;
- newinfo->number = number;
+ newinfo->size = repl->size;
+ newinfo->number = repl->num_entries;
/* Init all hooks to impossible value. */
- for (i = 0; i < NF_IP6_NUMHOOKS; i++) {
+ for (i = 0; i < NF_INET_NUMHOOKS; i++) {
newinfo->hook_entry[i] = 0xFFFFFFFF;
newinfo->underflow[i] = 0xFFFFFFFF;
}
- duprintf("translate_table: size %u\n", newinfo->size);
+ offsets = xt_alloc_entry_offsets(newinfo->number);
+ if (!offsets)
+ return -ENOMEM;
i = 0;
/* Walk through entries, checking offsets. */
- ret = IP6T_ENTRY_ITERATE(entry0, newinfo->size,
- check_entry_size_and_hooks,
- newinfo,
- entry0,
- entry0 + size,
- hook_entries, underflows, &i);
- if (ret != 0)
- return ret;
-
- if (i != number) {
- duprintf("translate_table: %u not %u entries\n",
- i, number);
- return -EINVAL;
+ xt_entry_foreach(iter, entry0, newinfo->size) {
+ ret = check_entry_size_and_hooks(iter, newinfo, entry0,
+ entry0 + repl->size,
+ repl->hook_entry,
+ repl->underflow,
+ repl->valid_hooks);
+ if (ret != 0)
+ goto out_free;
+ if (i < repl->num_entries)
+ offsets[i] = (void *)iter - entry0;
+ ++i;
+ if (strcmp(ip6t_get_target(iter)->u.user.name,
+ XT_ERROR_TARGET) == 0)
+ ++newinfo->stacksize;
}
- /* Check hooks all assigned */
- for (i = 0; i < NF_IP6_NUMHOOKS; i++) {
- /* Only hooks which are valid */
- if (!(valid_hooks & (1 << i)))
- continue;
- if (newinfo->hook_entry[i] == 0xFFFFFFFF) {
- duprintf("Invalid hook entry %u %u\n",
- i, hook_entries[i]);
- return -EINVAL;
- }
- if (newinfo->underflow[i] == 0xFFFFFFFF) {
- duprintf("Invalid underflow %u %u\n",
- i, underflows[i]);
- return -EINVAL;
- }
+ ret = -EINVAL;
+ if (i != repl->num_entries)
+ goto out_free;
+
+ ret = xt_check_table_hooks(newinfo, repl->valid_hooks);
+ if (ret)
+ goto out_free;
+
+ if (!mark_source_chains(newinfo, repl->valid_hooks, entry0, offsets)) {
+ ret = -ELOOP;
+ goto out_free;
}
+ kvfree(offsets);
/* Finally, each sanity check must pass */
i = 0;
- ret = IP6T_ENTRY_ITERATE(entry0, newinfo->size,
- check_entry, name, size, &i);
-
- if (ret != 0)
- goto cleanup;
-
- ret = -ELOOP;
- if (!mark_source_chains(newinfo, valid_hooks, entry0))
- goto cleanup;
+ xt_entry_foreach(iter, entry0, newinfo->size) {
+ ret = find_check_entry(iter, net, repl->name, repl->size,
+ &alloc_state);
+ if (ret != 0)
+ break;
+ ++i;
+ }
- /* And one copy for every other CPU */
- for_each_possible_cpu(i) {
- if (newinfo->entries[i] && newinfo->entries[i] != entry0)
- memcpy(newinfo->entries[i], entry0, newinfo->size);
+ if (ret != 0) {
+ xt_entry_foreach(iter, entry0, newinfo->size) {
+ if (i-- == 0)
+ break;
+ cleanup_entry(iter, net);
+ }
+ return ret;
}
- return 0;
-cleanup:
- IP6T_ENTRY_ITERATE(entry0, newinfo->size, cleanup_entry, &i);
return ret;
-}
-
-/* Gets counters. */
-static inline int
-add_entry_to_counter(const struct ip6t_entry *e,
- struct xt_counters total[],
- unsigned int *i)
-{
- ADD_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt);
-
- (*i)++;
- return 0;
-}
-
-static inline int
-set_entry_to_counter(const struct ip6t_entry *e,
- struct ip6t_counters total[],
- unsigned int *i)
-{
- SET_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt);
-
- (*i)++;
- return 0;
+ out_free:
+ kvfree(offsets);
+ return ret;
}
static void
get_counters(const struct xt_table_info *t,
struct xt_counters counters[])
{
+ struct ip6t_entry *iter;
unsigned int cpu;
unsigned int i;
- unsigned int curcpu;
- /* Instead of clearing (by a previous call to memset())
- * the counters and using adds, we set the counters
- * with data used by 'current' CPU
- * We dont care about preemption here.
- */
- curcpu = raw_smp_processor_id();
+ for_each_possible_cpu(cpu) {
+ seqcount_t *s = &per_cpu(xt_recseq, cpu);
- i = 0;
- IP6T_ENTRY_ITERATE(t->entries[curcpu],
- t->size,
- set_entry_to_counter,
- counters,
- &i);
+ i = 0;
+ xt_entry_foreach(iter, t->entries, t->size) {
+ struct xt_counters *tmp;
+ u64 bcnt, pcnt;
+ unsigned int start;
+
+ tmp = xt_get_per_cpu_counter(&iter->counters, cpu);
+ do {
+ start = read_seqcount_begin(s);
+ bcnt = tmp->bcnt;
+ pcnt = tmp->pcnt;
+ } while (read_seqcount_retry(s, start));
+
+ ADD_COUNTER(counters[i], bcnt, pcnt);
+ ++i;
+ cond_resched();
+ }
+ }
+}
+
+static void get_old_counters(const struct xt_table_info *t,
+ struct xt_counters counters[])
+{
+ struct ip6t_entry *iter;
+ unsigned int cpu, i;
for_each_possible_cpu(cpu) {
- if (cpu == curcpu)
- continue;
i = 0;
- IP6T_ENTRY_ITERATE(t->entries[cpu],
- t->size,
- add_entry_to_counter,
- counters,
- &i);
+ xt_entry_foreach(iter, t->entries, t->size) {
+ const struct xt_counters *tmp;
+
+ tmp = xt_get_per_cpu_counter(&iter->counters, cpu);
+ ADD_COUNTER(counters[i], tmp->bcnt, tmp->pcnt);
+ ++i;
+ }
+ cond_resched();
}
}
-static int
-copy_entries_to_user(unsigned int total_size,
- struct xt_table *table,
- void __user *userptr)
+static struct xt_counters *alloc_counters(const struct xt_table *table)
{
- unsigned int off, num, countersize;
- struct ip6t_entry *e;
+ unsigned int countersize;
struct xt_counters *counters;
- struct xt_table_info *private = table->private;
- int ret = 0;
- void *loc_cpu_entry;
+ const struct xt_table_info *private = table->private;
/* We need atomic snapshot of counters: rest doesn't change
(other than comefrom, which userspace doesn't care
about). */
countersize = sizeof(struct xt_counters) * private->number;
- counters = vmalloc(countersize);
+ counters = vzalloc(countersize);
if (counters == NULL)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
- /* First, sum counters... */
- write_lock_bh(&table->lock);
get_counters(private, counters);
- write_unlock_bh(&table->lock);
- /* choose the copy that is on ourc node/cpu */
- loc_cpu_entry = private->entries[raw_smp_processor_id()];
- if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
- ret = -EFAULT;
- goto free_counters;
- }
+ return counters;
+}
+
+static int
+copy_entries_to_user(unsigned int total_size,
+ const struct xt_table *table,
+ void __user *userptr)
+{
+ unsigned int off, num;
+ const struct ip6t_entry *e;
+ struct xt_counters *counters;
+ const struct xt_table_info *private = table->private;
+ int ret = 0;
+ const void *loc_cpu_entry;
+
+ counters = alloc_counters(table);
+ if (IS_ERR(counters))
+ return PTR_ERR(counters);
+
+ loc_cpu_entry = private->entries;
/* FIXME: use iterator macros --RR */
/* ... then go back and fix counters and names */
for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){
unsigned int i;
- struct ip6t_entry_match *m;
- struct ip6t_entry_target *t;
+ const struct xt_entry_match *m;
+ const struct xt_entry_target *t;
- e = (struct ip6t_entry *)(loc_cpu_entry + off);
+ e = loc_cpu_entry + off;
+ if (copy_to_user(userptr + off, e, sizeof(*e))) {
+ ret = -EFAULT;
+ goto free_counters;
+ }
if (copy_to_user(userptr + off
+ offsetof(struct ip6t_entry, counters),
&counters[num],
@@ -893,23 +865,14 @@ copy_entries_to_user(unsigned int total_size,
i += m->u.match_size) {
m = (void *)e + i;
- if (copy_to_user(userptr + off + i
- + offsetof(struct ip6t_entry_match,
- u.user.name),
- m->u.kernel.match->name,
- strlen(m->u.kernel.match->name)+1)
- != 0) {
+ if (xt_match_to_user(m, userptr + off + i)) {
ret = -EFAULT;
goto free_counters;
}
}
- t = ip6t_get_target(e);
- if (copy_to_user(userptr + off + e->target_offset
- + offsetof(struct ip6t_entry_target,
- u.user.name),
- t->u.kernel.target->name,
- strlen(t->u.kernel.target->name)+1) != 0) {
+ t = ip6t_get_target_c(e);
+ if (xt_target_to_user(t, userptr + off + e->target_offset)) {
ret = -EFAULT;
goto free_counters;
}
@@ -920,187 +883,314 @@ copy_entries_to_user(unsigned int total_size,
return ret;
}
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+static void compat_standard_from_user(void *dst, const void *src)
+{
+ int v = *(compat_int_t *)src;
+
+ if (v > 0)
+ v += xt_compat_calc_jump(AF_INET6, v);
+ memcpy(dst, &v, sizeof(v));
+}
+
+static int compat_standard_to_user(void __user *dst, const void *src)
+{
+ compat_int_t cv = *(int *)src;
+
+ if (cv > 0)
+ cv -= xt_compat_calc_jump(AF_INET6, cv);
+ return copy_to_user(dst, &cv, sizeof(cv)) ? -EFAULT : 0;
+}
+
+static int compat_calc_entry(const struct ip6t_entry *e,
+ const struct xt_table_info *info,
+ const void *base, struct xt_table_info *newinfo)
+{
+ const struct xt_entry_match *ematch;
+ const struct xt_entry_target *t;
+ unsigned int entry_offset;
+ int off, i, ret;
+
+ off = sizeof(struct ip6t_entry) - sizeof(struct compat_ip6t_entry);
+ entry_offset = (void *)e - base;
+ xt_ematch_foreach(ematch, e)
+ off += xt_compat_match_offset(ematch->u.kernel.match);
+ t = ip6t_get_target_c(e);
+ off += xt_compat_target_offset(t->u.kernel.target);
+ newinfo->size -= off;
+ ret = xt_compat_add_offset(AF_INET6, entry_offset, off);
+ if (ret)
+ return ret;
+
+ for (i = 0; i < NF_INET_NUMHOOKS; i++) {
+ if (info->hook_entry[i] &&
+ (e < (struct ip6t_entry *)(base + info->hook_entry[i])))
+ newinfo->hook_entry[i] -= off;
+ if (info->underflow[i] &&
+ (e < (struct ip6t_entry *)(base + info->underflow[i])))
+ newinfo->underflow[i] -= off;
+ }
+ return 0;
+}
+
+static int compat_table_info(const struct xt_table_info *info,
+ struct xt_table_info *newinfo)
+{
+ struct ip6t_entry *iter;
+ const void *loc_cpu_entry;
+ int ret;
+
+ if (!newinfo || !info)
+ return -EINVAL;
+
+ /* we dont care about newinfo->entries */
+ memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
+ newinfo->initial_entries = 0;
+ loc_cpu_entry = info->entries;
+ ret = xt_compat_init_offsets(AF_INET6, info->number);
+ if (ret)
+ return ret;
+ xt_entry_foreach(iter, loc_cpu_entry, info->size) {
+ ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);
+ if (ret != 0)
+ return ret;
+ }
+ return 0;
+}
+#endif
+
+static int get_info(struct net *net, void __user *user, const int *len)
+{
+ char name[XT_TABLE_MAXNAMELEN];
+ struct xt_table *t;
+ int ret;
+
+ if (*len != sizeof(struct ip6t_getinfo))
+ return -EINVAL;
+
+ if (copy_from_user(name, user, sizeof(name)) != 0)
+ return -EFAULT;
+
+ name[XT_TABLE_MAXNAMELEN-1] = '\0';
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+ if (in_compat_syscall())
+ xt_compat_lock(AF_INET6);
+#endif
+ t = xt_request_find_table_lock(net, AF_INET6, name);
+ if (!IS_ERR(t)) {
+ struct ip6t_getinfo info;
+ const struct xt_table_info *private = t->private;
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+ struct xt_table_info tmp;
+
+ if (in_compat_syscall()) {
+ ret = compat_table_info(private, &tmp);
+ xt_compat_flush_offsets(AF_INET6);
+ private = &tmp;
+ }
+#endif
+ memset(&info, 0, sizeof(info));
+ info.valid_hooks = t->valid_hooks;
+ memcpy(info.hook_entry, private->hook_entry,
+ sizeof(info.hook_entry));
+ memcpy(info.underflow, private->underflow,
+ sizeof(info.underflow));
+ info.num_entries = private->number;
+ info.size = private->size;
+ strcpy(info.name, name);
+
+ if (copy_to_user(user, &info, *len) != 0)
+ ret = -EFAULT;
+ else
+ ret = 0;
+
+ xt_table_unlock(t);
+ module_put(t->me);
+ } else
+ ret = PTR_ERR(t);
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+ if (in_compat_syscall())
+ xt_compat_unlock(AF_INET6);
+#endif
+ return ret;
+}
+
static int
-get_entries(const struct ip6t_get_entries *entries,
- struct ip6t_get_entries __user *uptr)
+get_entries(struct net *net, struct ip6t_get_entries __user *uptr,
+ const int *len)
{
int ret;
+ struct ip6t_get_entries get;
struct xt_table *t;
- t = xt_find_table_lock(AF_INET6, entries->name);
- if (t && !IS_ERR(t)) {
+ if (*len < sizeof(get))
+ return -EINVAL;
+ if (copy_from_user(&get, uptr, sizeof(get)) != 0)
+ return -EFAULT;
+ if (*len != sizeof(struct ip6t_get_entries) + get.size)
+ return -EINVAL;
+
+ get.name[sizeof(get.name) - 1] = '\0';
+
+ t = xt_find_table_lock(net, AF_INET6, get.name);
+ if (!IS_ERR(t)) {
struct xt_table_info *private = t->private;
- duprintf("t->private->number = %u\n", private->number);
- if (entries->size == private->size)
+ if (get.size == private->size)
ret = copy_entries_to_user(private->size,
t, uptr->entrytable);
- else {
- duprintf("get_entries: I've got %u not %u!\n",
- private->size, entries->size);
- ret = -EINVAL;
- }
+ else
+ ret = -EAGAIN;
+
module_put(t->me);
xt_table_unlock(t);
} else
- ret = t ? PTR_ERR(t) : -ENOENT;
+ ret = PTR_ERR(t);
return ret;
}
static int
-do_replace(void __user *user, unsigned int len)
+__do_replace(struct net *net, const char *name, unsigned int valid_hooks,
+ struct xt_table_info *newinfo, unsigned int num_counters,
+ void __user *counters_ptr)
{
int ret;
- struct ip6t_replace tmp;
struct xt_table *t;
- struct xt_table_info *newinfo, *oldinfo;
+ struct xt_table_info *oldinfo;
struct xt_counters *counters;
- void *loc_cpu_entry, *loc_cpu_old_entry;
-
- if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
- return -EFAULT;
-
- /* overflow check */
- if (tmp.size >= (INT_MAX - sizeof(struct xt_table_info)) / NR_CPUS -
- SMP_CACHE_BYTES)
- return -ENOMEM;
- if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
- return -ENOMEM;
-
- newinfo = xt_alloc_table_info(tmp.size);
- if (!newinfo)
- return -ENOMEM;
+ struct ip6t_entry *iter;
- /* choose the copy that is on our node/cpu */
- loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
- if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
- tmp.size) != 0) {
- ret = -EFAULT;
- goto free_newinfo;
- }
-
- counters = vmalloc(tmp.num_counters * sizeof(struct xt_counters));
+ counters = xt_counters_alloc(num_counters);
if (!counters) {
ret = -ENOMEM;
- goto free_newinfo;
+ goto out;
}
- ret = translate_table(tmp.name, tmp.valid_hooks,
- newinfo, loc_cpu_entry, tmp.size, tmp.num_entries,
- tmp.hook_entry, tmp.underflow);
- if (ret != 0)
- goto free_newinfo_counters;
-
- duprintf("ip_tables: Translated table\n");
-
- t = try_then_request_module(xt_find_table_lock(AF_INET6, tmp.name),
- "ip6table_%s", tmp.name);
- if (!t || IS_ERR(t)) {
- ret = t ? PTR_ERR(t) : -ENOENT;
+ t = xt_request_find_table_lock(net, AF_INET6, name);
+ if (IS_ERR(t)) {
+ ret = PTR_ERR(t);
goto free_newinfo_counters_untrans;
}
/* You lied! */
- if (tmp.valid_hooks != t->valid_hooks) {
- duprintf("Valid hook crap: %08X vs %08X\n",
- tmp.valid_hooks, t->valid_hooks);
+ if (valid_hooks != t->valid_hooks) {
ret = -EINVAL;
goto put_module;
}
- oldinfo = xt_replace_table(t, tmp.num_counters, newinfo, &ret);
+ oldinfo = xt_replace_table(t, num_counters, newinfo, &ret);
if (!oldinfo)
goto put_module;
/* Update module usage count based on number of rules */
- duprintf("do_replace: oldnum=%u, initnum=%u, newnum=%u\n",
- oldinfo->number, oldinfo->initial_entries, newinfo->number);
- if ((oldinfo->number > oldinfo->initial_entries) ||
- (newinfo->number <= oldinfo->initial_entries))
+ if ((oldinfo->number > oldinfo->initial_entries) ||
+ (newinfo->number <= oldinfo->initial_entries))
module_put(t->me);
if ((oldinfo->number > oldinfo->initial_entries) &&
(newinfo->number <= oldinfo->initial_entries))
module_put(t->me);
- /* Get the old counters. */
- get_counters(oldinfo, counters);
+ xt_table_unlock(t);
+
+ get_old_counters(oldinfo, counters);
+
/* Decrease module usage counts and free resource */
- loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
- IP6T_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,NULL);
+ xt_entry_foreach(iter, oldinfo->entries, oldinfo->size)
+ cleanup_entry(iter, net);
+
xt_free_table_info(oldinfo);
- if (copy_to_user(tmp.counters, counters,
- sizeof(struct xt_counters) * tmp.num_counters) != 0)
- ret = -EFAULT;
+ if (copy_to_user(counters_ptr, counters,
+ sizeof(struct xt_counters) * num_counters) != 0) {
+ /* Silent error, can't fail, new table is already in place */
+ net_warn_ratelimited("ip6tables: counters copy to user failed while replacing table\n");
+ }
vfree(counters);
- xt_table_unlock(t);
- return ret;
+ return 0;
put_module:
module_put(t->me);
xt_table_unlock(t);
free_newinfo_counters_untrans:
- IP6T_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry,NULL);
- free_newinfo_counters:
vfree(counters);
- free_newinfo:
- xt_free_table_info(newinfo);
+ out:
return ret;
}
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static inline int
-add_counter_to_entry(struct ip6t_entry *e,
- const struct xt_counters addme[],
- unsigned int *i)
-{
-#if 0
- duprintf("add_counter: Entry %u %lu/%lu + %lu/%lu\n",
- *i,
- (long unsigned int)e->counters.pcnt,
- (long unsigned int)e->counters.bcnt,
- (long unsigned int)addme[*i].pcnt,
- (long unsigned int)addme[*i].bcnt);
-#endif
-
- ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
- (*i)++;
- return 0;
-}
-
static int
-do_add_counters(void __user *user, unsigned int len)
+do_replace(struct net *net, sockptr_t arg, unsigned int len)
{
- unsigned int i;
- struct xt_counters_info tmp, *paddc;
- struct xt_table_info *private;
- struct xt_table *t;
- int ret = 0;
+ int ret;
+ struct ip6t_replace tmp;
+ struct xt_table_info *newinfo;
void *loc_cpu_entry;
+ struct ip6t_entry *iter;
- if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
+ if (len < sizeof(tmp))
+ return -EINVAL;
+ if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0)
return -EFAULT;
- if (len != sizeof(tmp) + tmp.num_counters*sizeof(struct xt_counters))
+ /* overflow check */
+ if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
+ return -ENOMEM;
+ if (tmp.num_counters == 0)
+ return -EINVAL;
+ if ((u64)len < (u64)tmp.size + sizeof(tmp))
return -EINVAL;
- paddc = vmalloc(len);
- if (!paddc)
+ tmp.name[sizeof(tmp.name)-1] = 0;
+
+ newinfo = xt_alloc_table_info(tmp.size);
+ if (!newinfo)
return -ENOMEM;
- if (copy_from_user(paddc, user, len) != 0) {
+ loc_cpu_entry = newinfo->entries;
+ if (copy_from_sockptr_offset(loc_cpu_entry, arg, sizeof(tmp),
+ tmp.size) != 0) {
ret = -EFAULT;
- goto free;
+ goto free_newinfo;
}
- t = xt_find_table_lock(AF_INET6, tmp.name);
- if (!t || IS_ERR(t)) {
- ret = t ? PTR_ERR(t) : -ENOENT;
+ ret = translate_table(net, newinfo, loc_cpu_entry, &tmp);
+ if (ret != 0)
+ goto free_newinfo;
+
+ ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo,
+ tmp.num_counters, tmp.counters);
+ if (ret)
+ goto free_newinfo_untrans;
+ return 0;
+
+ free_newinfo_untrans:
+ xt_entry_foreach(iter, loc_cpu_entry, newinfo->size)
+ cleanup_entry(iter, net);
+ free_newinfo:
+ xt_free_table_info(newinfo);
+ return ret;
+}
+
+static int
+do_add_counters(struct net *net, sockptr_t arg, unsigned int len)
+{
+ unsigned int i;
+ struct xt_counters_info tmp;
+ struct xt_counters *paddc;
+ struct xt_table *t;
+ const struct xt_table_info *private;
+ int ret = 0;
+ struct ip6t_entry *iter;
+ unsigned int addend;
+
+ paddc = xt_copy_counters(arg, len, &tmp);
+ if (IS_ERR(paddc))
+ return PTR_ERR(paddc);
+ t = xt_find_table_lock(net, AF_INET6, tmp.name);
+ if (IS_ERR(t)) {
+ ret = PTR_ERR(t);
goto free;
}
- write_lock_bh(&t->lock);
+ local_bh_disable();
private = t->private;
if (private->number != tmp.num_counters) {
ret = -EINVAL;
@@ -1108,15 +1198,17 @@ do_add_counters(void __user *user, unsigned int len)
}
i = 0;
- /* Choose the copy that is on our node */
- loc_cpu_entry = private->entries[smp_processor_id()];
- IP6T_ENTRY_ITERATE(loc_cpu_entry,
- private->size,
- add_counter_to_entry,
- paddc->counters,
- &i);
+ addend = xt_write_recseq_begin();
+ xt_entry_foreach(iter, private->entries, private->size) {
+ struct xt_counters *tmp;
+
+ tmp = xt_get_this_cpu_counter(&iter->counters);
+ ADD_COUNTER(*tmp, paddc[i].bcnt, paddc[i].pcnt);
+ ++i;
+ }
+ xt_write_recseq_end(addend);
unlock_up_free:
- write_unlock_bh(&t->lock);
+ local_bh_enable();
xt_table_unlock(t);
module_put(t->me);
free:
@@ -1125,25 +1217,438 @@ do_add_counters(void __user *user, unsigned int len)
return ret;
}
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+struct compat_ip6t_replace {
+ char name[XT_TABLE_MAXNAMELEN];
+ u32 valid_hooks;
+ u32 num_entries;
+ u32 size;
+ u32 hook_entry[NF_INET_NUMHOOKS];
+ u32 underflow[NF_INET_NUMHOOKS];
+ u32 num_counters;
+ compat_uptr_t counters; /* struct xt_counters * */
+ struct compat_ip6t_entry entries[];
+};
+
+static int
+compat_copy_entry_to_user(struct ip6t_entry *e, void __user **dstptr,
+ unsigned int *size, struct xt_counters *counters,
+ unsigned int i)
+{
+ struct xt_entry_target *t;
+ struct compat_ip6t_entry __user *ce;
+ u_int16_t target_offset, next_offset;
+ compat_uint_t origsize;
+ const struct xt_entry_match *ematch;
+ int ret = 0;
+
+ origsize = *size;
+ ce = *dstptr;
+ if (copy_to_user(ce, e, sizeof(struct ip6t_entry)) != 0 ||
+ copy_to_user(&ce->counters, &counters[i],
+ sizeof(counters[i])) != 0)
+ return -EFAULT;
+
+ *dstptr += sizeof(struct compat_ip6t_entry);
+ *size -= sizeof(struct ip6t_entry) - sizeof(struct compat_ip6t_entry);
+
+ xt_ematch_foreach(ematch, e) {
+ ret = xt_compat_match_to_user(ematch, dstptr, size);
+ if (ret != 0)
+ return ret;
+ }
+ target_offset = e->target_offset - (origsize - *size);
+ t = ip6t_get_target(e);
+ ret = xt_compat_target_to_user(t, dstptr, size);
+ if (ret)
+ return ret;
+ next_offset = e->next_offset - (origsize - *size);
+ if (put_user(target_offset, &ce->target_offset) != 0 ||
+ put_user(next_offset, &ce->next_offset) != 0)
+ return -EFAULT;
+ return 0;
+}
+
+static int
+compat_find_calc_match(struct xt_entry_match *m,
+ const struct ip6t_ip6 *ipv6,
+ int *size)
+{
+ struct xt_match *match;
+
+ match = xt_request_find_match(NFPROTO_IPV6, m->u.user.name,
+ m->u.user.revision);
+ if (IS_ERR(match))
+ return PTR_ERR(match);
+
+ m->u.kernel.match = match;
+ *size += xt_compat_match_offset(match);
+ return 0;
+}
+
+static void compat_release_entry(struct compat_ip6t_entry *e)
+{
+ struct xt_entry_target *t;
+ struct xt_entry_match *ematch;
+
+ /* Cleanup all matches */
+ xt_ematch_foreach(ematch, e)
+ module_put(ematch->u.kernel.match->me);
+ t = compat_ip6t_get_target(e);
+ module_put(t->u.kernel.target->me);
+}
+
+static int
+check_compat_entry_size_and_hooks(struct compat_ip6t_entry *e,
+ struct xt_table_info *newinfo,
+ unsigned int *size,
+ const unsigned char *base,
+ const unsigned char *limit)
+{
+ struct xt_entry_match *ematch;
+ struct xt_entry_target *t;
+ struct xt_target *target;
+ unsigned int entry_offset;
+ unsigned int j;
+ int ret, off;
+
+ if ((unsigned long)e % __alignof__(struct compat_ip6t_entry) != 0 ||
+ (unsigned char *)e + sizeof(struct compat_ip6t_entry) >= limit ||
+ (unsigned char *)e + e->next_offset > limit)
+ return -EINVAL;
+
+ if (e->next_offset < sizeof(struct compat_ip6t_entry) +
+ sizeof(struct compat_xt_entry_target))
+ return -EINVAL;
+
+ if (!ip6_checkentry(&e->ipv6))
+ return -EINVAL;
+
+ ret = xt_compat_check_entry_offsets(e, e->elems,
+ e->target_offset, e->next_offset);
+ if (ret)
+ return ret;
+
+ off = sizeof(struct ip6t_entry) - sizeof(struct compat_ip6t_entry);
+ entry_offset = (void *)e - (void *)base;
+ j = 0;
+ xt_ematch_foreach(ematch, e) {
+ ret = compat_find_calc_match(ematch, &e->ipv6, &off);
+ if (ret != 0)
+ goto release_matches;
+ ++j;
+ }
+
+ t = compat_ip6t_get_target(e);
+ target = xt_request_find_target(NFPROTO_IPV6, t->u.user.name,
+ t->u.user.revision);
+ if (IS_ERR(target)) {
+ ret = PTR_ERR(target);
+ goto release_matches;
+ }
+ t->u.kernel.target = target;
+
+ off += xt_compat_target_offset(target);
+ *size += off;
+ ret = xt_compat_add_offset(AF_INET6, entry_offset, off);
+ if (ret)
+ goto out;
+
+ return 0;
+
+out:
+ module_put(t->u.kernel.target->me);
+release_matches:
+ xt_ematch_foreach(ematch, e) {
+ if (j-- == 0)
+ break;
+ module_put(ematch->u.kernel.match->me);
+ }
+ return ret;
+}
+
+static void
+compat_copy_entry_from_user(struct compat_ip6t_entry *e, void **dstptr,
+ unsigned int *size,
+ struct xt_table_info *newinfo, unsigned char *base)
+{
+ struct xt_entry_target *t;
+ struct ip6t_entry *de;
+ unsigned int origsize;
+ int h;
+ struct xt_entry_match *ematch;
+
+ origsize = *size;
+ de = *dstptr;
+ memcpy(de, e, sizeof(struct ip6t_entry));
+ memcpy(&de->counters, &e->counters, sizeof(e->counters));
+
+ *dstptr += sizeof(struct ip6t_entry);
+ *size += sizeof(struct ip6t_entry) - sizeof(struct compat_ip6t_entry);
+
+ xt_ematch_foreach(ematch, e)
+ xt_compat_match_from_user(ematch, dstptr, size);
+
+ de->target_offset = e->target_offset - (origsize - *size);
+ t = compat_ip6t_get_target(e);
+ xt_compat_target_from_user(t, dstptr, size);
+
+ de->next_offset = e->next_offset - (origsize - *size);
+ for (h = 0; h < NF_INET_NUMHOOKS; h++) {
+ if ((unsigned char *)de - base < newinfo->hook_entry[h])
+ newinfo->hook_entry[h] -= origsize - *size;
+ if ((unsigned char *)de - base < newinfo->underflow[h])
+ newinfo->underflow[h] -= origsize - *size;
+ }
+}
+
+static int
+translate_compat_table(struct net *net,
+ struct xt_table_info **pinfo,
+ void **pentry0,
+ const struct compat_ip6t_replace *compatr)
+{
+ unsigned int i, j;
+ struct xt_table_info *newinfo, *info;
+ void *pos, *entry0, *entry1;
+ struct compat_ip6t_entry *iter0;
+ struct ip6t_replace repl;
+ unsigned int size;
+ int ret;
+
+ info = *pinfo;
+ entry0 = *pentry0;
+ size = compatr->size;
+ info->number = compatr->num_entries;
+
+ j = 0;
+ xt_compat_lock(AF_INET6);
+ ret = xt_compat_init_offsets(AF_INET6, compatr->num_entries);
+ if (ret)
+ goto out_unlock;
+ /* Walk through entries, checking offsets. */
+ xt_entry_foreach(iter0, entry0, compatr->size) {
+ ret = check_compat_entry_size_and_hooks(iter0, info, &size,
+ entry0,
+ entry0 + compatr->size);
+ if (ret != 0)
+ goto out_unlock;
+ ++j;
+ }
+
+ ret = -EINVAL;
+ if (j != compatr->num_entries)
+ goto out_unlock;
+
+ ret = -ENOMEM;
+ newinfo = xt_alloc_table_info(size);
+ if (!newinfo)
+ goto out_unlock;
+
+ memset(newinfo->entries, 0, size);
+
+ newinfo->number = compatr->num_entries;
+ for (i = 0; i < NF_INET_NUMHOOKS; i++) {
+ newinfo->hook_entry[i] = compatr->hook_entry[i];
+ newinfo->underflow[i] = compatr->underflow[i];
+ }
+ entry1 = newinfo->entries;
+ pos = entry1;
+ size = compatr->size;
+ xt_entry_foreach(iter0, entry0, compatr->size)
+ compat_copy_entry_from_user(iter0, &pos, &size,
+ newinfo, entry1);
+
+ /* all module references in entry0 are now gone. */
+ xt_compat_flush_offsets(AF_INET6);
+ xt_compat_unlock(AF_INET6);
+
+ memcpy(&repl, compatr, sizeof(*compatr));
+
+ for (i = 0; i < NF_INET_NUMHOOKS; i++) {
+ repl.hook_entry[i] = newinfo->hook_entry[i];
+ repl.underflow[i] = newinfo->underflow[i];
+ }
+
+ repl.num_counters = 0;
+ repl.counters = NULL;
+ repl.size = newinfo->size;
+ ret = translate_table(net, newinfo, entry1, &repl);
+ if (ret)
+ goto free_newinfo;
+
+ *pinfo = newinfo;
+ *pentry0 = entry1;
+ xt_free_table_info(info);
+ return 0;
+
+free_newinfo:
+ xt_free_table_info(newinfo);
+ return ret;
+out_unlock:
+ xt_compat_flush_offsets(AF_INET6);
+ xt_compat_unlock(AF_INET6);
+ xt_entry_foreach(iter0, entry0, compatr->size) {
+ if (j-- == 0)
+ break;
+ compat_release_entry(iter0);
+ }
+ return ret;
+}
+
static int
-do_ip6t_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
+compat_do_replace(struct net *net, sockptr_t arg, unsigned int len)
{
int ret;
+ struct compat_ip6t_replace tmp;
+ struct xt_table_info *newinfo;
+ void *loc_cpu_entry;
+ struct ip6t_entry *iter;
- if (!capable(CAP_NET_ADMIN))
+ if (len < sizeof(tmp))
+ return -EINVAL;
+ if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0)
+ return -EFAULT;
+
+ /* overflow check */
+ if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
+ return -ENOMEM;
+ if (tmp.num_counters == 0)
+ return -EINVAL;
+ if ((u64)len < (u64)tmp.size + sizeof(tmp))
+ return -EINVAL;
+
+ tmp.name[sizeof(tmp.name)-1] = 0;
+
+ newinfo = xt_alloc_table_info(tmp.size);
+ if (!newinfo)
+ return -ENOMEM;
+
+ loc_cpu_entry = newinfo->entries;
+ if (copy_from_sockptr_offset(loc_cpu_entry, arg, sizeof(tmp),
+ tmp.size) != 0) {
+ ret = -EFAULT;
+ goto free_newinfo;
+ }
+
+ ret = translate_compat_table(net, &newinfo, &loc_cpu_entry, &tmp);
+ if (ret != 0)
+ goto free_newinfo;
+
+ ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo,
+ tmp.num_counters, compat_ptr(tmp.counters));
+ if (ret)
+ goto free_newinfo_untrans;
+ return 0;
+
+ free_newinfo_untrans:
+ xt_entry_foreach(iter, loc_cpu_entry, newinfo->size)
+ cleanup_entry(iter, net);
+ free_newinfo:
+ xt_free_table_info(newinfo);
+ return ret;
+}
+
+struct compat_ip6t_get_entries {
+ char name[XT_TABLE_MAXNAMELEN];
+ compat_uint_t size;
+ struct compat_ip6t_entry entrytable[];
+};
+
+static int
+compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table,
+ void __user *userptr)
+{
+ struct xt_counters *counters;
+ const struct xt_table_info *private = table->private;
+ void __user *pos;
+ unsigned int size;
+ int ret = 0;
+ unsigned int i = 0;
+ struct ip6t_entry *iter;
+
+ counters = alloc_counters(table);
+ if (IS_ERR(counters))
+ return PTR_ERR(counters);
+
+ pos = userptr;
+ size = total_size;
+ xt_entry_foreach(iter, private->entries, total_size) {
+ ret = compat_copy_entry_to_user(iter, &pos,
+ &size, counters, i++);
+ if (ret != 0)
+ break;
+ }
+
+ vfree(counters);
+ return ret;
+}
+
+static int
+compat_get_entries(struct net *net, struct compat_ip6t_get_entries __user *uptr,
+ int *len)
+{
+ int ret;
+ struct compat_ip6t_get_entries get;
+ struct xt_table *t;
+
+ if (*len < sizeof(get))
+ return -EINVAL;
+
+ if (copy_from_user(&get, uptr, sizeof(get)) != 0)
+ return -EFAULT;
+
+ if (*len != sizeof(struct compat_ip6t_get_entries) + get.size)
+ return -EINVAL;
+
+ get.name[sizeof(get.name) - 1] = '\0';
+
+ xt_compat_lock(AF_INET6);
+ t = xt_find_table_lock(net, AF_INET6, get.name);
+ if (!IS_ERR(t)) {
+ const struct xt_table_info *private = t->private;
+ struct xt_table_info info;
+ ret = compat_table_info(private, &info);
+ if (!ret && get.size == info.size)
+ ret = compat_copy_entries_to_user(private->size,
+ t, uptr->entrytable);
+ else if (!ret)
+ ret = -EAGAIN;
+
+ xt_compat_flush_offsets(AF_INET6);
+ module_put(t->me);
+ xt_table_unlock(t);
+ } else
+ ret = PTR_ERR(t);
+
+ xt_compat_unlock(AF_INET6);
+ return ret;
+}
+#endif
+
+static int
+do_ip6t_set_ctl(struct sock *sk, int cmd, sockptr_t arg, unsigned int len)
+{
+ int ret;
+
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
return -EPERM;
switch (cmd) {
case IP6T_SO_SET_REPLACE:
- ret = do_replace(user, len);
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+ if (in_compat_syscall())
+ ret = compat_do_replace(sock_net(sk), arg, len);
+ else
+#endif
+ ret = do_replace(sock_net(sk), arg, len);
break;
case IP6T_SO_SET_ADD_COUNTERS:
- ret = do_add_counters(user, len);
+ ret = do_add_counters(sock_net(sk), arg, len);
break;
default:
- duprintf("do_ip6t_set_ctl: unknown request %i\n", cmd);
ret = -EINVAL;
}
@@ -1155,73 +1660,26 @@ do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
{
int ret;
- if (!capable(CAP_NET_ADMIN))
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
return -EPERM;
switch (cmd) {
- case IP6T_SO_GET_INFO: {
- char name[IP6T_TABLE_MAXNAMELEN];
- struct xt_table *t;
-
- if (*len != sizeof(struct ip6t_getinfo)) {
- duprintf("length %u != %u\n", *len,
- sizeof(struct ip6t_getinfo));
- ret = -EINVAL;
- break;
- }
-
- if (copy_from_user(name, user, sizeof(name)) != 0) {
- ret = -EFAULT;
- break;
- }
- name[IP6T_TABLE_MAXNAMELEN-1] = '\0';
-
- t = try_then_request_module(xt_find_table_lock(AF_INET6, name),
- "ip6table_%s", name);
- if (t && !IS_ERR(t)) {
- struct ip6t_getinfo info;
- struct xt_table_info *private = t->private;
-
- info.valid_hooks = t->valid_hooks;
- memcpy(info.hook_entry, private->hook_entry,
- sizeof(info.hook_entry));
- memcpy(info.underflow, private->underflow,
- sizeof(info.underflow));
- info.num_entries = private->number;
- info.size = private->size;
- memcpy(info.name, name, sizeof(info.name));
-
- if (copy_to_user(user, &info, *len) != 0)
- ret = -EFAULT;
- else
- ret = 0;
- xt_table_unlock(t);
- module_put(t->me);
- } else
- ret = t ? PTR_ERR(t) : -ENOENT;
- }
- break;
-
- case IP6T_SO_GET_ENTRIES: {
- struct ip6t_get_entries get;
+ case IP6T_SO_GET_INFO:
+ ret = get_info(sock_net(sk), user, len);
+ break;
- if (*len < sizeof(get)) {
- duprintf("get_entries: %u < %u\n", *len, sizeof(get));
- ret = -EINVAL;
- } else if (copy_from_user(&get, user, sizeof(get)) != 0) {
- ret = -EFAULT;
- } else if (*len != sizeof(struct ip6t_get_entries) + get.size) {
- duprintf("get_entries: %u != %u\n", *len,
- sizeof(struct ip6t_get_entries) + get.size);
- ret = -EINVAL;
- } else
- ret = get_entries(&get, user);
+ case IP6T_SO_GET_ENTRIES:
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+ if (in_compat_syscall())
+ ret = compat_get_entries(sock_net(sk), user, len);
+ else
+#endif
+ ret = get_entries(sock_net(sk), user, len);
break;
- }
case IP6T_SO_GET_REVISION_MATCH:
case IP6T_SO_GET_REVISION_TARGET: {
- struct ip6t_get_revision rev;
+ struct xt_get_revision rev;
int target;
if (*len != sizeof(rev)) {
@@ -1232,6 +1690,7 @@ do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
ret = -EFAULT;
break;
}
+ rev.name[sizeof(rev.name)-1] = 0;
if (cmd == IP6T_SO_GET_REVISION_TARGET)
target = 1;
@@ -1246,131 +1705,130 @@ do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
}
default:
- duprintf("do_ip6t_get_ctl: unknown request %i\n", cmd);
ret = -EINVAL;
}
return ret;
}
-int ip6t_register_table(struct xt_table *table,
- const struct ip6t_replace *repl)
+static void __ip6t_unregister_table(struct net *net, struct xt_table *table)
{
- int ret;
+ struct xt_table_info *private;
+ void *loc_cpu_entry;
+ struct module *table_owner = table->me;
+ struct ip6t_entry *iter;
+
+ private = xt_unregister_table(table);
+
+ /* Decrease module usage counts and free resources */
+ loc_cpu_entry = private->entries;
+ xt_entry_foreach(iter, loc_cpu_entry, private->size)
+ cleanup_entry(iter, net);
+ if (private->number > private->initial_entries)
+ module_put(table_owner);
+ xt_free_table_info(private);
+}
+
+int ip6t_register_table(struct net *net, const struct xt_table *table,
+ const struct ip6t_replace *repl,
+ const struct nf_hook_ops *template_ops)
+{
+ struct nf_hook_ops *ops;
+ unsigned int num_ops;
+ int ret, i;
struct xt_table_info *newinfo;
- static struct xt_table_info bootstrap
- = { 0, 0, 0, { 0 }, { 0 }, { } };
+ struct xt_table_info bootstrap = {0};
void *loc_cpu_entry;
+ struct xt_table *new_table;
newinfo = xt_alloc_table_info(repl->size);
if (!newinfo)
return -ENOMEM;
- /* choose the copy on our node/cpu */
- loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+ loc_cpu_entry = newinfo->entries;
memcpy(loc_cpu_entry, repl->entries, repl->size);
- ret = translate_table(table->name, table->valid_hooks,
- newinfo, loc_cpu_entry, repl->size,
- repl->num_entries,
- repl->hook_entry,
- repl->underflow);
+ ret = translate_table(net, newinfo, loc_cpu_entry, repl);
if (ret != 0) {
xt_free_table_info(newinfo);
return ret;
}
- ret = xt_register_table(table, &bootstrap, newinfo);
- if (ret != 0) {
+ new_table = xt_register_table(net, table, &bootstrap, newinfo);
+ if (IS_ERR(new_table)) {
+ struct ip6t_entry *iter;
+
+ xt_entry_foreach(iter, loc_cpu_entry, newinfo->size)
+ cleanup_entry(iter, net);
xt_free_table_info(newinfo);
- return ret;
+ return PTR_ERR(new_table);
}
- return 0;
-}
+ if (!template_ops)
+ return 0;
-void ip6t_unregister_table(struct xt_table *table)
-{
- struct xt_table_info *private;
- void *loc_cpu_entry;
+ num_ops = hweight32(table->valid_hooks);
+ if (num_ops == 0) {
+ ret = -EINVAL;
+ goto out_free;
+ }
- private = xt_unregister_table(table);
+ ops = kmemdup_array(template_ops, num_ops, sizeof(*ops), GFP_KERNEL);
+ if (!ops) {
+ ret = -ENOMEM;
+ goto out_free;
+ }
- /* Decrease module usage counts and free resources */
- loc_cpu_entry = private->entries[raw_smp_processor_id()];
- IP6T_ENTRY_ITERATE(loc_cpu_entry, private->size, cleanup_entry, NULL);
- xt_free_table_info(private);
-}
+ for (i = 0; i < num_ops; i++)
+ ops[i].priv = new_table;
-/* Returns 1 if the type and code is matched by the range, 0 otherwise */
-static inline int
-icmp6_type_code_match(u_int8_t test_type, u_int8_t min_code, u_int8_t max_code,
- u_int8_t type, u_int8_t code,
- int invert)
-{
- return (type == test_type && code >= min_code && code <= max_code)
- ^ invert;
-}
+ new_table->ops = ops;
-static int
-icmp6_match(const struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- const struct xt_match *match,
- const void *matchinfo,
- int offset,
- unsigned int protoff,
- int *hotdrop)
-{
- struct icmp6hdr _icmp, *ic;
- const struct ip6t_icmp *icmpinfo = matchinfo;
+ ret = nf_register_net_hooks(net, ops, num_ops);
+ if (ret != 0)
+ goto out_free;
- /* Must not be a fragment. */
- if (offset)
- return 0;
+ return ret;
- ic = skb_header_pointer(skb, protoff, sizeof(_icmp), &_icmp);
- if (ic == NULL) {
- /* We've been asked to examine this packet, and we
- can't. Hence, no choice but to drop. */
- duprintf("Dropping evil ICMP tinygram.\n");
- *hotdrop = 1;
- return 0;
- }
+out_free:
+ __ip6t_unregister_table(net, new_table);
+ return ret;
+}
+
+void ip6t_unregister_table_pre_exit(struct net *net, const char *name)
+{
+ struct xt_table *table = xt_find_table(net, NFPROTO_IPV6, name);
- return icmp6_type_code_match(icmpinfo->type,
- icmpinfo->code[0],
- icmpinfo->code[1],
- ic->icmp6_type, ic->icmp6_code,
- !!(icmpinfo->invflags&IP6T_ICMP_INV));
+ if (table)
+ nf_unregister_net_hooks(net, table->ops, hweight32(table->valid_hooks));
}
-/* Called when user tries to insert an entry of this type. */
-static int
-icmp6_checkentry(const char *tablename,
- const void *entry,
- const struct xt_match *match,
- void *matchinfo,
- unsigned int hook_mask)
+void ip6t_unregister_table_exit(struct net *net, const char *name)
{
- const struct ip6t_icmp *icmpinfo = matchinfo;
+ struct xt_table *table = xt_find_table(net, NFPROTO_IPV6, name);
- /* Must specify no unknown invflags */
- return !(icmpinfo->invflags & ~IP6T_ICMP_INV);
+ if (table)
+ __ip6t_unregister_table(net, table);
}
/* The built-in targets: standard (NULL) and error. */
-static struct ip6t_target ip6t_standard_target = {
- .name = IP6T_STANDARD_TARGET,
- .targetsize = sizeof(int),
- .family = AF_INET6,
-};
-
-static struct ip6t_target ip6t_error_target = {
- .name = IP6T_ERROR_TARGET,
- .target = ip6t_error,
- .targetsize = IP6T_FUNCTION_MAXNAMELEN,
- .family = AF_INET6,
+static struct xt_target ip6t_builtin_tg[] __read_mostly = {
+ {
+ .name = XT_STANDARD_TARGET,
+ .targetsize = sizeof(int),
+ .family = NFPROTO_IPV6,
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+ .compatsize = sizeof(compat_int_t),
+ .compat_from_user = compat_standard_from_user,
+ .compat_to_user = compat_standard_to_user,
+#endif
+ },
+ {
+ .name = XT_ERROR_TARGET,
+ .target = ip6t_error,
+ .targetsize = XT_FUNCTION_MAXNAMELEN,
+ .family = NFPROTO_IPV6,
+ },
};
static struct nf_sockopt_ops ip6t_sockopts = {
@@ -1381,52 +1839,48 @@ static struct nf_sockopt_ops ip6t_sockopts = {
.get_optmin = IP6T_BASE_CTL,
.get_optmax = IP6T_SO_GET_MAX+1,
.get = do_ip6t_get_ctl,
+ .owner = THIS_MODULE,
};
-static struct ip6t_match icmp6_matchstruct = {
- .name = "icmp6",
- .match = &icmp6_match,
- .matchsize = sizeof(struct ip6t_icmp),
- .checkentry = icmp6_checkentry,
- .proto = IPPROTO_ICMPV6,
- .family = AF_INET6,
+static int __net_init ip6_tables_net_init(struct net *net)
+{
+ return xt_proto_init(net, NFPROTO_IPV6);
+}
+
+static void __net_exit ip6_tables_net_exit(struct net *net)
+{
+ xt_proto_fini(net, NFPROTO_IPV6);
+}
+
+static struct pernet_operations ip6_tables_net_ops = {
+ .init = ip6_tables_net_init,
+ .exit = ip6_tables_net_exit,
};
static int __init ip6_tables_init(void)
{
int ret;
- ret = xt_proto_init(AF_INET6);
+ ret = register_pernet_subsys(&ip6_tables_net_ops);
if (ret < 0)
goto err1;
- /* Noone else will be downing sem now, so we won't sleep */
- ret = xt_register_target(&ip6t_standard_target);
+ /* No one else will be downing sem now, so we won't sleep */
+ ret = xt_register_targets(ip6t_builtin_tg, ARRAY_SIZE(ip6t_builtin_tg));
if (ret < 0)
goto err2;
- ret = xt_register_target(&ip6t_error_target);
- if (ret < 0)
- goto err3;
- ret = xt_register_match(&icmp6_matchstruct);
- if (ret < 0)
- goto err4;
/* Register setsockopt */
ret = nf_register_sockopt(&ip6t_sockopts);
if (ret < 0)
- goto err5;
+ goto err4;
- printk("ip6_tables: (C) 2000-2006 Netfilter Core Team\n");
return 0;
-err5:
- xt_unregister_match(&icmp6_matchstruct);
err4:
- xt_unregister_target(&ip6t_error_target);
-err3:
- xt_unregister_target(&ip6t_standard_target);
+ xt_unregister_targets(ip6t_builtin_tg, ARRAY_SIZE(ip6t_builtin_tg));
err2:
- xt_proto_fini(AF_INET6);
+ unregister_pernet_subsys(&ip6_tables_net_ops);
err1:
return ret;
}
@@ -1434,94 +1888,15 @@ err1:
static void __exit ip6_tables_fini(void)
{
nf_unregister_sockopt(&ip6t_sockopts);
- xt_unregister_match(&icmp6_matchstruct);
- xt_unregister_target(&ip6t_error_target);
- xt_unregister_target(&ip6t_standard_target);
- xt_proto_fini(AF_INET6);
-}
-
-/*
- * find the offset to specified header or the protocol number of last header
- * if target < 0. "last header" is transport protocol header, ESP, or
- * "No next header".
- *
- * If target header is found, its offset is set in *offset and return protocol
- * number. Otherwise, return -1.
- *
- * If the first fragment doesn't contain the final protocol header or
- * NEXTHDR_NONE it is considered invalid.
- *
- * Note that non-1st fragment is special case that "the protocol number
- * of last header" is "next header" field in Fragment header. In this case,
- * *offset is meaningless and fragment offset is stored in *fragoff if fragoff
- * isn't NULL.
- *
- */
-int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset,
- int target, unsigned short *fragoff)
-{
- unsigned int start = (u8*)(skb->nh.ipv6h + 1) - skb->data;
- u8 nexthdr = skb->nh.ipv6h->nexthdr;
- unsigned int len = skb->len - start;
-
- if (fragoff)
- *fragoff = 0;
-
- while (nexthdr != target) {
- struct ipv6_opt_hdr _hdr, *hp;
- unsigned int hdrlen;
-
- if ((!ipv6_ext_hdr(nexthdr)) || nexthdr == NEXTHDR_NONE) {
- if (target < 0)
- break;
- return -ENOENT;
- }
-
- hp = skb_header_pointer(skb, start, sizeof(_hdr), &_hdr);
- if (hp == NULL)
- return -EBADMSG;
- if (nexthdr == NEXTHDR_FRAGMENT) {
- unsigned short _frag_off;
- __be16 *fp;
- fp = skb_header_pointer(skb,
- start+offsetof(struct frag_hdr,
- frag_off),
- sizeof(_frag_off),
- &_frag_off);
- if (fp == NULL)
- return -EBADMSG;
-
- _frag_off = ntohs(*fp) & ~0x7;
- if (_frag_off) {
- if (target < 0 &&
- ((!ipv6_ext_hdr(hp->nexthdr)) ||
- hp->nexthdr == NEXTHDR_NONE)) {
- if (fragoff)
- *fragoff = _frag_off;
- return hp->nexthdr;
- }
- return -ENOENT;
- }
- hdrlen = 8;
- } else if (nexthdr == NEXTHDR_AUTH)
- hdrlen = (hp->hdrlen + 2) << 2;
- else
- hdrlen = ipv6_optlen(hp);
-
- nexthdr = hp->nexthdr;
- len -= hdrlen;
- start += hdrlen;
- }
- *offset = start;
- return nexthdr;
+ xt_unregister_targets(ip6t_builtin_tg, ARRAY_SIZE(ip6t_builtin_tg));
+ unregister_pernet_subsys(&ip6_tables_net_ops);
}
EXPORT_SYMBOL(ip6t_register_table);
-EXPORT_SYMBOL(ip6t_unregister_table);
+EXPORT_SYMBOL(ip6t_unregister_table_pre_exit);
+EXPORT_SYMBOL(ip6t_unregister_table_exit);
EXPORT_SYMBOL(ip6t_do_table);
-EXPORT_SYMBOL(ip6t_ext_hdr);
-EXPORT_SYMBOL(ipv6_find_hdr);
module_init(ip6_tables_init);
module_exit(ip6_tables_fini);
diff --git a/net/ipv6/netfilter/ip6t_HL.c b/net/ipv6/netfilter/ip6t_HL.c
deleted file mode 100644
index 435750f664dd..000000000000
--- a/net/ipv6/netfilter/ip6t_HL.c
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Hop Limit modification target for ip6tables
- * Maciej Soltysiak <solt@dns.toxicfilms.tv>
- * Based on HW's TTL module
- *
- * This software is distributed under the terms of GNU GPL
- */
-
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/ip.h>
-
-#include <linux/netfilter_ipv6/ip6_tables.h>
-#include <linux/netfilter_ipv6/ip6t_HL.h>
-
-MODULE_AUTHOR("Maciej Soltysiak <solt@dns.toxicfilms.tv>");
-MODULE_DESCRIPTION("IP tables Hop Limit modification module");
-MODULE_LICENSE("GPL");
-
-static unsigned int ip6t_hl_target(struct sk_buff **pskb,
- const struct net_device *in,
- const struct net_device *out,
- unsigned int hooknum,
- const struct xt_target *target,
- const void *targinfo)
-{
- struct ipv6hdr *ip6h;
- const struct ip6t_HL_info *info = targinfo;
- int new_hl;
-
- if (!skb_make_writable(pskb, (*pskb)->len))
- return NF_DROP;
-
- ip6h = (*pskb)->nh.ipv6h;
-
- switch (info->mode) {
- case IP6T_HL_SET:
- new_hl = info->hop_limit;
- break;
- case IP6T_HL_INC:
- new_hl = ip6h->hop_limit + info->hop_limit;
- if (new_hl > 255)
- new_hl = 255;
- break;
- case IP6T_HL_DEC:
- new_hl = ip6h->hop_limit - info->hop_limit;
- if (new_hl < 0)
- new_hl = 0;
- break;
- default:
- new_hl = ip6h->hop_limit;
- break;
- }
-
- if (new_hl != ip6h->hop_limit)
- ip6h->hop_limit = new_hl;
-
- return IP6T_CONTINUE;
-}
-
-static int ip6t_hl_checkentry(const char *tablename,
- const void *entry,
- const struct xt_target *target,
- void *targinfo,
- unsigned int hook_mask)
-{
- struct ip6t_HL_info *info = targinfo;
-
- if (info->mode > IP6T_HL_MAXMODE) {
- printk(KERN_WARNING "ip6t_HL: invalid or unknown Mode %u\n",
- info->mode);
- return 0;
- }
- if ((info->mode != IP6T_HL_SET) && (info->hop_limit == 0)) {
- printk(KERN_WARNING "ip6t_HL: increment/decrement doesn't "
- "make sense with value 0\n");
- return 0;
- }
- return 1;
-}
-
-static struct ip6t_target ip6t_HL = {
- .name = "HL",
- .target = ip6t_hl_target,
- .targetsize = sizeof(struct ip6t_HL_info),
- .table = "mangle",
- .checkentry = ip6t_hl_checkentry,
- .me = THIS_MODULE
-};
-
-static int __init ip6t_hl_init(void)
-{
- return ip6t_register_target(&ip6t_HL);
-}
-
-static void __exit ip6t_hl_fini(void)
-{
- ip6t_unregister_target(&ip6t_HL);
-}
-
-module_init(ip6t_hl_init);
-module_exit(ip6t_hl_fini);
diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c
deleted file mode 100644
index f4857cf97f05..000000000000
--- a/net/ipv6/netfilter/ip6t_LOG.c
+++ /dev/null
@@ -1,509 +0,0 @@
-/*
- * This is a module which is used for logging packets.
- */
-
-/* (C) 2001 Jan Rekorajski <baggins@pld.org.pl>
- * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/skbuff.h>
-#include <linux/if_arp.h>
-#include <linux/ip.h>
-#include <linux/spinlock.h>
-#include <linux/icmpv6.h>
-#include <net/udp.h>
-#include <net/tcp.h>
-#include <net/ipv6.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv6/ip6_tables.h>
-
-MODULE_AUTHOR("Jan Rekorajski <baggins@pld.org.pl>");
-MODULE_DESCRIPTION("IP6 tables LOG target module");
-MODULE_LICENSE("GPL");
-
-struct in_device;
-#include <net/route.h>
-#include <linux/netfilter_ipv6/ip6t_LOG.h>
-
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
-/* Use lock to serialize, so printks don't overlap */
-static DEFINE_SPINLOCK(log_lock);
-
-/* One level of recursion won't kill us */
-static void dump_packet(const struct nf_loginfo *info,
- const struct sk_buff *skb, unsigned int ip6hoff,
- int recurse)
-{
- u_int8_t currenthdr;
- int fragment;
- struct ipv6hdr _ip6h, *ih;
- unsigned int ptr;
- unsigned int hdrlen = 0;
- unsigned int logflags;
-
- if (info->type == NF_LOG_TYPE_LOG)
- logflags = info->u.log.logflags;
- else
- logflags = NF_LOG_MASK;
-
- ih = skb_header_pointer(skb, ip6hoff, sizeof(_ip6h), &_ip6h);
- if (ih == NULL) {
- printk("TRUNCATED");
- return;
- }
-
- /* Max length: 88 "SRC=0000.0000.0000.0000.0000.0000.0000.0000 DST=0000.0000.0000.0000.0000.0000.0000.0000 " */
- printk("SRC=" NIP6_FMT " DST=" NIP6_FMT " ", NIP6(ih->saddr), NIP6(ih->daddr));
-
- /* Max length: 44 "LEN=65535 TC=255 HOPLIMIT=255 FLOWLBL=FFFFF " */
- printk("LEN=%Zu TC=%u HOPLIMIT=%u FLOWLBL=%u ",
- ntohs(ih->payload_len) + sizeof(struct ipv6hdr),
- (ntohl(*(__be32 *)ih) & 0x0ff00000) >> 20,
- ih->hop_limit,
- (ntohl(*(__be32 *)ih) & 0x000fffff));
-
- fragment = 0;
- ptr = ip6hoff + sizeof(struct ipv6hdr);
- currenthdr = ih->nexthdr;
- while (currenthdr != NEXTHDR_NONE && ip6t_ext_hdr(currenthdr)) {
- struct ipv6_opt_hdr _hdr, *hp;
-
- hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr);
- if (hp == NULL) {
- printk("TRUNCATED");
- return;
- }
-
- /* Max length: 48 "OPT (...) " */
- if (logflags & IP6T_LOG_IPOPT)
- printk("OPT ( ");
-
- switch (currenthdr) {
- case IPPROTO_FRAGMENT: {
- struct frag_hdr _fhdr, *fh;
-
- printk("FRAG:");
- fh = skb_header_pointer(skb, ptr, sizeof(_fhdr),
- &_fhdr);
- if (fh == NULL) {
- printk("TRUNCATED ");
- return;
- }
-
- /* Max length: 6 "65535 " */
- printk("%u ", ntohs(fh->frag_off) & 0xFFF8);
-
- /* Max length: 11 "INCOMPLETE " */
- if (fh->frag_off & htons(0x0001))
- printk("INCOMPLETE ");
-
- printk("ID:%08x ", ntohl(fh->identification));
-
- if (ntohs(fh->frag_off) & 0xFFF8)
- fragment = 1;
-
- hdrlen = 8;
-
- break;
- }
- case IPPROTO_DSTOPTS:
- case IPPROTO_ROUTING:
- case IPPROTO_HOPOPTS:
- if (fragment) {
- if (logflags & IP6T_LOG_IPOPT)
- printk(")");
- return;
- }
- hdrlen = ipv6_optlen(hp);
- break;
- /* Max Length */
- case IPPROTO_AH:
- if (logflags & IP6T_LOG_IPOPT) {
- struct ip_auth_hdr _ahdr, *ah;
-
- /* Max length: 3 "AH " */
- printk("AH ");
-
- if (fragment) {
- printk(")");
- return;
- }
-
- ah = skb_header_pointer(skb, ptr, sizeof(_ahdr),
- &_ahdr);
- if (ah == NULL) {
- /*
- * Max length: 26 "INCOMPLETE [65535
- * bytes] )"
- */
- printk("INCOMPLETE [%u bytes] )",
- skb->len - ptr);
- return;
- }
-
- /* Length: 15 "SPI=0xF1234567 */
- printk("SPI=0x%x ", ntohl(ah->spi));
-
- }
-
- hdrlen = (hp->hdrlen+2)<<2;
- break;
- case IPPROTO_ESP:
- if (logflags & IP6T_LOG_IPOPT) {
- struct ip_esp_hdr _esph, *eh;
-
- /* Max length: 4 "ESP " */
- printk("ESP ");
-
- if (fragment) {
- printk(")");
- return;
- }
-
- /*
- * Max length: 26 "INCOMPLETE [65535 bytes] )"
- */
- eh = skb_header_pointer(skb, ptr, sizeof(_esph),
- &_esph);
- if (eh == NULL) {
- printk("INCOMPLETE [%u bytes] )",
- skb->len - ptr);
- return;
- }
-
- /* Length: 16 "SPI=0xF1234567 )" */
- printk("SPI=0x%x )", ntohl(eh->spi) );
-
- }
- return;
- default:
- /* Max length: 20 "Unknown Ext Hdr 255" */
- printk("Unknown Ext Hdr %u", currenthdr);
- return;
- }
- if (logflags & IP6T_LOG_IPOPT)
- printk(") ");
-
- currenthdr = hp->nexthdr;
- ptr += hdrlen;
- }
-
- switch (currenthdr) {
- case IPPROTO_TCP: {
- struct tcphdr _tcph, *th;
-
- /* Max length: 10 "PROTO=TCP " */
- printk("PROTO=TCP ");
-
- if (fragment)
- break;
-
- /* Max length: 25 "INCOMPLETE [65535 bytes] " */
- th = skb_header_pointer(skb, ptr, sizeof(_tcph), &_tcph);
- if (th == NULL) {
- printk("INCOMPLETE [%u bytes] ", skb->len - ptr);
- return;
- }
-
- /* Max length: 20 "SPT=65535 DPT=65535 " */
- printk("SPT=%u DPT=%u ",
- ntohs(th->source), ntohs(th->dest));
- /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */
- if (logflags & IP6T_LOG_TCPSEQ)
- printk("SEQ=%u ACK=%u ",
- ntohl(th->seq), ntohl(th->ack_seq));
- /* Max length: 13 "WINDOW=65535 " */
- printk("WINDOW=%u ", ntohs(th->window));
- /* Max length: 9 "RES=0x3C " */
- printk("RES=0x%02x ", (u_int8_t)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22));
- /* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */
- if (th->cwr)
- printk("CWR ");
- if (th->ece)
- printk("ECE ");
- if (th->urg)
- printk("URG ");
- if (th->ack)
- printk("ACK ");
- if (th->psh)
- printk("PSH ");
- if (th->rst)
- printk("RST ");
- if (th->syn)
- printk("SYN ");
- if (th->fin)
- printk("FIN ");
- /* Max length: 11 "URGP=65535 " */
- printk("URGP=%u ", ntohs(th->urg_ptr));
-
- if ((logflags & IP6T_LOG_TCPOPT)
- && th->doff * 4 > sizeof(struct tcphdr)) {
- u_int8_t _opt[60 - sizeof(struct tcphdr)], *op;
- unsigned int i;
- unsigned int optsize = th->doff * 4
- - sizeof(struct tcphdr);
-
- op = skb_header_pointer(skb,
- ptr + sizeof(struct tcphdr),
- optsize, _opt);
- if (op == NULL) {
- printk("OPT (TRUNCATED)");
- return;
- }
-
- /* Max length: 127 "OPT (" 15*4*2chars ") " */
- printk("OPT (");
- for (i =0; i < optsize; i++)
- printk("%02X", op[i]);
- printk(") ");
- }
- break;
- }
- case IPPROTO_UDP:
- case IPPROTO_UDPLITE: {
- struct udphdr _udph, *uh;
-
- if (currenthdr == IPPROTO_UDP)
- /* Max length: 10 "PROTO=UDP " */
- printk("PROTO=UDP " );
- else /* Max length: 14 "PROTO=UDPLITE " */
- printk("PROTO=UDPLITE ");
-
- if (fragment)
- break;
-
- /* Max length: 25 "INCOMPLETE [65535 bytes] " */
- uh = skb_header_pointer(skb, ptr, sizeof(_udph), &_udph);
- if (uh == NULL) {
- printk("INCOMPLETE [%u bytes] ", skb->len - ptr);
- return;
- }
-
- /* Max length: 20 "SPT=65535 DPT=65535 " */
- printk("SPT=%u DPT=%u LEN=%u ",
- ntohs(uh->source), ntohs(uh->dest),
- ntohs(uh->len));
- break;
- }
- case IPPROTO_ICMPV6: {
- struct icmp6hdr _icmp6h, *ic;
-
- /* Max length: 13 "PROTO=ICMPv6 " */
- printk("PROTO=ICMPv6 ");
-
- if (fragment)
- break;
-
- /* Max length: 25 "INCOMPLETE [65535 bytes] " */
- ic = skb_header_pointer(skb, ptr, sizeof(_icmp6h), &_icmp6h);
- if (ic == NULL) {
- printk("INCOMPLETE [%u bytes] ", skb->len - ptr);
- return;
- }
-
- /* Max length: 18 "TYPE=255 CODE=255 " */
- printk("TYPE=%u CODE=%u ", ic->icmp6_type, ic->icmp6_code);
-
- switch (ic->icmp6_type) {
- case ICMPV6_ECHO_REQUEST:
- case ICMPV6_ECHO_REPLY:
- /* Max length: 19 "ID=65535 SEQ=65535 " */
- printk("ID=%u SEQ=%u ",
- ntohs(ic->icmp6_identifier),
- ntohs(ic->icmp6_sequence));
- break;
- case ICMPV6_MGM_QUERY:
- case ICMPV6_MGM_REPORT:
- case ICMPV6_MGM_REDUCTION:
- break;
-
- case ICMPV6_PARAMPROB:
- /* Max length: 17 "POINTER=ffffffff " */
- printk("POINTER=%08x ", ntohl(ic->icmp6_pointer));
- /* Fall through */
- case ICMPV6_DEST_UNREACH:
- case ICMPV6_PKT_TOOBIG:
- case ICMPV6_TIME_EXCEED:
- /* Max length: 3+maxlen */
- if (recurse) {
- printk("[");
- dump_packet(info, skb, ptr + sizeof(_icmp6h),
- 0);
- printk("] ");
- }
-
- /* Max length: 10 "MTU=65535 " */
- if (ic->icmp6_type == ICMPV6_PKT_TOOBIG)
- printk("MTU=%u ", ntohl(ic->icmp6_mtu));
- }
- break;
- }
- /* Max length: 10 "PROTO=255 " */
- default:
- printk("PROTO=%u ", currenthdr);
- }
-
- /* Max length: 15 "UID=4294967295 " */
- if ((logflags & IP6T_LOG_UID) && recurse && skb->sk) {
- read_lock_bh(&skb->sk->sk_callback_lock);
- if (skb->sk->sk_socket && skb->sk->sk_socket->file)
- printk("UID=%u ", skb->sk->sk_socket->file->f_uid);
- read_unlock_bh(&skb->sk->sk_callback_lock);
- }
-}
-
-static struct nf_loginfo default_loginfo = {
- .type = NF_LOG_TYPE_LOG,
- .u = {
- .log = {
- .level = 0,
- .logflags = NF_LOG_MASK,
- },
- },
-};
-
-static void
-ip6t_log_packet(unsigned int pf,
- unsigned int hooknum,
- const struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- const struct nf_loginfo *loginfo,
- const char *prefix)
-{
- if (!loginfo)
- loginfo = &default_loginfo;
-
- spin_lock_bh(&log_lock);
- printk("<%d>%sIN=%s OUT=%s ", loginfo->u.log.level,
- prefix,
- in ? in->name : "",
- out ? out->name : "");
- if (in && !out) {
- unsigned int len;
- /* MAC logging for input chain only. */
- printk("MAC=");
- if (skb->dev && (len = skb->dev->hard_header_len) &&
- skb->mac.raw != skb->nh.raw) {
- unsigned char *p = skb->mac.raw;
- int i;
-
- if (skb->dev->type == ARPHRD_SIT &&
- (p -= ETH_HLEN) < skb->head)
- p = NULL;
-
- if (p != NULL) {
- for (i = 0; i < len; i++)
- printk("%02x%s", p[i],
- i == len - 1 ? "" : ":");
- }
- printk(" ");
-
- if (skb->dev->type == ARPHRD_SIT) {
- struct iphdr *iph = (struct iphdr *)skb->mac.raw;
- printk("TUNNEL=%u.%u.%u.%u->%u.%u.%u.%u ",
- NIPQUAD(iph->saddr),
- NIPQUAD(iph->daddr));
- }
- } else
- printk(" ");
- }
-
- dump_packet(loginfo, skb, (u8*)skb->nh.ipv6h - skb->data, 1);
- printk("\n");
- spin_unlock_bh(&log_lock);
-}
-
-static unsigned int
-ip6t_log_target(struct sk_buff **pskb,
- const struct net_device *in,
- const struct net_device *out,
- unsigned int hooknum,
- const struct xt_target *target,
- const void *targinfo)
-{
- const struct ip6t_log_info *loginfo = targinfo;
- struct nf_loginfo li;
-
- li.type = NF_LOG_TYPE_LOG;
- li.u.log.level = loginfo->level;
- li.u.log.logflags = loginfo->logflags;
-
- if (loginfo->logflags & IP6T_LOG_NFLOG)
- nf_log_packet(PF_INET6, hooknum, *pskb, in, out, &li,
- "%s", loginfo->prefix);
- else
- ip6t_log_packet(PF_INET6, hooknum, *pskb, in, out, &li,
- loginfo->prefix);
-
- return IP6T_CONTINUE;
-}
-
-
-static int ip6t_log_checkentry(const char *tablename,
- const void *entry,
- const struct xt_target *target,
- void *targinfo,
- unsigned int hook_mask)
-{
- const struct ip6t_log_info *loginfo = targinfo;
-
- if (loginfo->level >= 8) {
- DEBUGP("LOG: level %u >= 8\n", loginfo->level);
- return 0;
- }
- if (loginfo->prefix[sizeof(loginfo->prefix)-1] != '\0') {
- DEBUGP("LOG: prefix term %i\n",
- loginfo->prefix[sizeof(loginfo->prefix)-1]);
- return 0;
- }
- return 1;
-}
-
-static struct ip6t_target ip6t_log_reg = {
- .name = "LOG",
- .target = ip6t_log_target,
- .targetsize = sizeof(struct ip6t_log_info),
- .checkentry = ip6t_log_checkentry,
- .me = THIS_MODULE,
-};
-
-static struct nf_logger ip6t_logger = {
- .name = "ip6t_LOG",
- .logfn = &ip6t_log_packet,
- .me = THIS_MODULE,
-};
-
-static int __init ip6t_log_init(void)
-{
- if (ip6t_register_target(&ip6t_log_reg))
- return -EINVAL;
- if (nf_log_register(PF_INET6, &ip6t_logger) < 0) {
- printk(KERN_WARNING "ip6t_LOG: not logging via system console "
- "since somebody else already registered for PF_INET6\n");
- /* we cannot make module load fail here, since otherwise
- * ip6tables userspace would abort */
- }
-
- return 0;
-}
-
-static void __exit ip6t_log_fini(void)
-{
- nf_log_unregister_logger(&ip6t_logger);
- ip6t_unregister_target(&ip6t_log_reg);
-}
-
-module_init(ip6t_log_init);
-module_exit(ip6t_log_fini);
diff --git a/net/ipv6/netfilter/ip6t_NPT.c b/net/ipv6/netfilter/ip6t_NPT.c
new file mode 100644
index 000000000000..787c74aa85e3
--- /dev/null
+++ b/net/ipv6/netfilter/ip6t_NPT.c
@@ -0,0 +1,191 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2011, 2012 Patrick McHardy <kaber@trash.net>
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ipv6.h>
+#include <net/ipv6.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter_ipv6/ip6t_NPT.h>
+#include <linux/netfilter/x_tables.h>
+
+static int ip6t_npt_checkentry(const struct xt_tgchk_param *par)
+{
+ struct ip6t_npt_tginfo *npt = par->targinfo;
+ struct in6_addr pfx;
+ __wsum src_sum, dst_sum;
+
+ if (npt->src_pfx_len > 64 || npt->dst_pfx_len > 64)
+ return -EINVAL;
+
+ /* Ensure that LSB of prefix is zero */
+ ipv6_addr_prefix(&pfx, &npt->src_pfx.in6, npt->src_pfx_len);
+ if (!ipv6_addr_equal(&pfx, &npt->src_pfx.in6))
+ return -EINVAL;
+ ipv6_addr_prefix(&pfx, &npt->dst_pfx.in6, npt->dst_pfx_len);
+ if (!ipv6_addr_equal(&pfx, &npt->dst_pfx.in6))
+ return -EINVAL;
+
+ src_sum = csum_partial(&npt->src_pfx.in6, sizeof(npt->src_pfx.in6), 0);
+ dst_sum = csum_partial(&npt->dst_pfx.in6, sizeof(npt->dst_pfx.in6), 0);
+
+ npt->adjustment = ~csum_fold(csum_sub(src_sum, dst_sum));
+ return 0;
+}
+
+static bool ip6t_npt_map_pfx(const struct ip6t_npt_tginfo *npt,
+ struct in6_addr *addr)
+{
+ unsigned int pfx_len;
+ unsigned int i, idx;
+ __be32 mask;
+ __sum16 sum;
+
+ pfx_len = max(npt->src_pfx_len, npt->dst_pfx_len);
+ for (i = 0; i < pfx_len; i += 32) {
+ if (pfx_len - i >= 32)
+ mask = 0;
+ else
+ mask = htonl((1 << (i - pfx_len + 32)) - 1);
+
+ idx = i / 32;
+ addr->s6_addr32[idx] &= mask;
+ addr->s6_addr32[idx] |= ~mask & npt->dst_pfx.in6.s6_addr32[idx];
+ }
+
+ if (pfx_len <= 48)
+ idx = 3;
+ else {
+ for (idx = 4; idx < ARRAY_SIZE(addr->s6_addr16); idx++) {
+ if ((__force __sum16)addr->s6_addr16[idx] !=
+ CSUM_MANGLED_0)
+ break;
+ }
+ if (idx == ARRAY_SIZE(addr->s6_addr16))
+ return false;
+ }
+
+ sum = ~csum_fold(csum_add(csum_unfold((__force __sum16)addr->s6_addr16[idx]),
+ csum_unfold(npt->adjustment)));
+ if (sum == CSUM_MANGLED_0)
+ sum = 0;
+ *(__force __sum16 *)&addr->s6_addr16[idx] = sum;
+
+ return true;
+}
+
+static struct ipv6hdr *icmpv6_bounced_ipv6hdr(struct sk_buff *skb,
+ struct ipv6hdr *_bounced_hdr)
+{
+ if (ipv6_hdr(skb)->nexthdr != IPPROTO_ICMPV6)
+ return NULL;
+
+ if (!icmpv6_is_err(icmp6_hdr(skb)->icmp6_type))
+ return NULL;
+
+ return skb_header_pointer(skb,
+ skb_transport_offset(skb) + sizeof(struct icmp6hdr),
+ sizeof(struct ipv6hdr),
+ _bounced_hdr);
+}
+
+static unsigned int
+ip6t_snpt_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct ip6t_npt_tginfo *npt = par->targinfo;
+ struct ipv6hdr _bounced_hdr;
+ struct ipv6hdr *bounced_hdr;
+ struct in6_addr bounced_pfx;
+
+ if (!ip6t_npt_map_pfx(npt, &ipv6_hdr(skb)->saddr)) {
+ icmpv6_send(skb, ICMPV6_PARAMPROB, ICMPV6_HDR_FIELD,
+ offsetof(struct ipv6hdr, saddr));
+ return NF_DROP;
+ }
+
+ /* rewrite dst addr of bounced packet which was sent to dst range */
+ bounced_hdr = icmpv6_bounced_ipv6hdr(skb, &_bounced_hdr);
+ if (bounced_hdr) {
+ ipv6_addr_prefix(&bounced_pfx, &bounced_hdr->daddr, npt->src_pfx_len);
+ if (ipv6_addr_cmp(&bounced_pfx, &npt->src_pfx.in6) == 0)
+ ip6t_npt_map_pfx(npt, &bounced_hdr->daddr);
+ }
+
+ return XT_CONTINUE;
+}
+
+static unsigned int
+ip6t_dnpt_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct ip6t_npt_tginfo *npt = par->targinfo;
+ struct ipv6hdr _bounced_hdr;
+ struct ipv6hdr *bounced_hdr;
+ struct in6_addr bounced_pfx;
+
+ if (!ip6t_npt_map_pfx(npt, &ipv6_hdr(skb)->daddr)) {
+ icmpv6_send(skb, ICMPV6_PARAMPROB, ICMPV6_HDR_FIELD,
+ offsetof(struct ipv6hdr, daddr));
+ return NF_DROP;
+ }
+
+ /* rewrite src addr of bounced packet which was sent from dst range */
+ bounced_hdr = icmpv6_bounced_ipv6hdr(skb, &_bounced_hdr);
+ if (bounced_hdr) {
+ ipv6_addr_prefix(&bounced_pfx, &bounced_hdr->saddr, npt->src_pfx_len);
+ if (ipv6_addr_cmp(&bounced_pfx, &npt->src_pfx.in6) == 0)
+ ip6t_npt_map_pfx(npt, &bounced_hdr->saddr);
+ }
+
+ return XT_CONTINUE;
+}
+
+static struct xt_target ip6t_npt_target_reg[] __read_mostly = {
+ {
+ .name = "SNPT",
+ .table = "mangle",
+ .target = ip6t_snpt_tg,
+ .targetsize = sizeof(struct ip6t_npt_tginfo),
+ .usersize = offsetof(struct ip6t_npt_tginfo, adjustment),
+ .checkentry = ip6t_npt_checkentry,
+ .family = NFPROTO_IPV6,
+ .hooks = (1 << NF_INET_LOCAL_IN) |
+ (1 << NF_INET_POST_ROUTING),
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "DNPT",
+ .table = "mangle",
+ .target = ip6t_dnpt_tg,
+ .targetsize = sizeof(struct ip6t_npt_tginfo),
+ .usersize = offsetof(struct ip6t_npt_tginfo, adjustment),
+ .checkentry = ip6t_npt_checkentry,
+ .family = NFPROTO_IPV6,
+ .hooks = (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_LOCAL_OUT),
+ .me = THIS_MODULE,
+ },
+};
+
+static int __init ip6t_npt_init(void)
+{
+ return xt_register_targets(ip6t_npt_target_reg,
+ ARRAY_SIZE(ip6t_npt_target_reg));
+}
+
+static void __exit ip6t_npt_exit(void)
+{
+ xt_unregister_targets(ip6t_npt_target_reg,
+ ARRAY_SIZE(ip6t_npt_target_reg));
+}
+
+module_init(ip6t_npt_init);
+module_exit(ip6t_npt_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("IPv6-to-IPv6 Network Prefix Translation (RFC 6296)");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS("ip6t_SNPT");
+MODULE_ALIAS("ip6t_DNPT");
diff --git a/net/ipv6/netfilter/ip6t_REJECT.c b/net/ipv6/netfilter/ip6t_REJECT.c
index 311eae82feb3..a35019d2e480 100644
--- a/net/ipv6/netfilter/ip6t_REJECT.c
+++ b/net/ipv6/netfilter/ip6t_REJECT.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* IP6 tables REJECT target module
* Linux INET6 implementation
@@ -7,261 +8,114 @@
* Authors:
* Yasuyuki Kozakai <yasuyuki.kozakai@toshiba.co.jp>
*
- * Based on net/ipv4/netfilter/ipt_REJECT.c
+ * Copyright (c) 2005-2007 Patrick McHardy <kaber@trash.net>
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
+ * Based on net/ipv4/netfilter/ipt_REJECT.c
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/gfp.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/icmpv6.h>
#include <linux/netdevice.h>
-#include <net/ipv6.h>
-#include <net/tcp.h>
#include <net/icmp.h>
-#include <net/ip6_checksum.h>
-#include <net/ip6_fib.h>
-#include <net/ip6_route.h>
#include <net/flow.h>
+#include <linux/netfilter/x_tables.h>
#include <linux/netfilter_ipv6/ip6_tables.h>
#include <linux/netfilter_ipv6/ip6t_REJECT.h>
+#include <net/netfilter/ipv6/nf_reject.h>
+
MODULE_AUTHOR("Yasuyuki KOZAKAI <yasuyuki.kozakai@toshiba.co.jp>");
-MODULE_DESCRIPTION("IP6 tables REJECT target module");
+MODULE_DESCRIPTION("Xtables: packet \"rejection\" target for IPv6");
MODULE_LICENSE("GPL");
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
-/* Send RST reply */
-static void send_reset(struct sk_buff *oldskb)
-{
- struct sk_buff *nskb;
- struct tcphdr otcph, *tcph;
- unsigned int otcplen, hh_len;
- int tcphoff, needs_ack;
- struct ipv6hdr *oip6h = oldskb->nh.ipv6h, *ip6h;
- struct dst_entry *dst = NULL;
- u8 proto;
- struct flowi fl;
-
- if ((!(ipv6_addr_type(&oip6h->saddr) & IPV6_ADDR_UNICAST)) ||
- (!(ipv6_addr_type(&oip6h->daddr) & IPV6_ADDR_UNICAST))) {
- DEBUGP("ip6t_REJECT: addr is not unicast.\n");
- return;
- }
-
- proto = oip6h->nexthdr;
- tcphoff = ipv6_skip_exthdr(oldskb, ((u8*)(oip6h+1) - oldskb->data), &proto);
-
- if ((tcphoff < 0) || (tcphoff > oldskb->len)) {
- DEBUGP("ip6t_REJECT: Can't get TCP header.\n");
- return;
- }
-
- otcplen = oldskb->len - tcphoff;
-
- /* IP header checks: fragment, too short. */
- if ((proto != IPPROTO_TCP) || (otcplen < sizeof(struct tcphdr))) {
- DEBUGP("ip6t_REJECT: proto(%d) != IPPROTO_TCP, or too short. otcplen = %d\n",
- proto, otcplen);
- return;
- }
-
- if (skb_copy_bits(oldskb, tcphoff, &otcph, sizeof(struct tcphdr)))
- BUG();
-
- /* No RST for RST. */
- if (otcph.rst) {
- DEBUGP("ip6t_REJECT: RST is set\n");
- return;
- }
-
- /* Check checksum. */
- if (csum_ipv6_magic(&oip6h->saddr, &oip6h->daddr, otcplen, IPPROTO_TCP,
- skb_checksum(oldskb, tcphoff, otcplen, 0))) {
- DEBUGP("ip6t_REJECT: TCP checksum is invalid\n");
- return;
- }
-
- memset(&fl, 0, sizeof(fl));
- fl.proto = IPPROTO_TCP;
- ipv6_addr_copy(&fl.fl6_src, &oip6h->daddr);
- ipv6_addr_copy(&fl.fl6_dst, &oip6h->saddr);
- fl.fl_ip_sport = otcph.dest;
- fl.fl_ip_dport = otcph.source;
- security_skb_classify_flow(oldskb, &fl);
- dst = ip6_route_output(NULL, &fl);
- if (dst == NULL)
- return;
- if (dst->error || xfrm_lookup(&dst, &fl, NULL, 0))
- return;
-
- hh_len = (dst->dev->hard_header_len + 15)&~15;
- nskb = alloc_skb(hh_len + 15 + dst->header_len + sizeof(struct ipv6hdr)
- + sizeof(struct tcphdr) + dst->trailer_len,
- GFP_ATOMIC);
-
- if (!nskb) {
- if (net_ratelimit())
- printk("ip6t_REJECT: Can't alloc skb\n");
- dst_release(dst);
- return;
- }
-
- nskb->dst = dst;
-
- skb_reserve(nskb, hh_len + dst->header_len);
-
- ip6h = nskb->nh.ipv6h = (struct ipv6hdr *)
- skb_put(nskb, sizeof(struct ipv6hdr));
- ip6h->version = 6;
- ip6h->hop_limit = dst_metric(dst, RTAX_HOPLIMIT);
- ip6h->nexthdr = IPPROTO_TCP;
- ip6h->payload_len = htons(sizeof(struct tcphdr));
- ipv6_addr_copy(&ip6h->saddr, &oip6h->daddr);
- ipv6_addr_copy(&ip6h->daddr, &oip6h->saddr);
-
- tcph = (struct tcphdr *)skb_put(nskb, sizeof(struct tcphdr));
- /* Truncate to length (no data) */
- tcph->doff = sizeof(struct tcphdr)/4;
- tcph->source = otcph.dest;
- tcph->dest = otcph.source;
-
- if (otcph.ack) {
- needs_ack = 0;
- tcph->seq = otcph.ack_seq;
- tcph->ack_seq = 0;
- } else {
- needs_ack = 1;
- tcph->ack_seq = htonl(ntohl(otcph.seq) + otcph.syn + otcph.fin
- + otcplen - (otcph.doff<<2));
- tcph->seq = 0;
- }
-
- /* Reset flags */
- ((u_int8_t *)tcph)[13] = 0;
- tcph->rst = 1;
- tcph->ack = needs_ack;
- tcph->window = 0;
- tcph->urg_ptr = 0;
- tcph->check = 0;
-
- /* Adjust TCP checksum */
- tcph->check = csum_ipv6_magic(&nskb->nh.ipv6h->saddr,
- &nskb->nh.ipv6h->daddr,
- sizeof(struct tcphdr), IPPROTO_TCP,
- csum_partial((char *)tcph,
- sizeof(struct tcphdr), 0));
-
- nf_ct_attach(nskb, oldskb);
-
- NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, nskb, NULL, nskb->dst->dev,
- dst_output);
-}
-
-static inline void
-send_unreach(struct sk_buff *skb_in, unsigned char code, unsigned int hooknum)
-{
- if (hooknum == NF_IP6_LOCAL_OUT && skb_in->dev == NULL)
- skb_in->dev = &loopback_dev;
-
- icmpv6_send(skb_in, ICMPV6_DEST_UNREACH, code, 0, NULL);
-}
-
-static unsigned int reject6_target(struct sk_buff **pskb,
- const struct net_device *in,
- const struct net_device *out,
- unsigned int hooknum,
- const struct xt_target *target,
- const void *targinfo)
+static unsigned int
+reject_tg6(struct sk_buff *skb, const struct xt_action_param *par)
{
- const struct ip6t_reject_info *reject = targinfo;
+ const struct ip6t_reject_info *reject = par->targinfo;
+ struct net *net = xt_net(par);
- DEBUGP(KERN_DEBUG "%s: medium point\n", __FUNCTION__);
- /* WARNING: This code causes reentry within ip6tables.
- This means that the ip6tables jump stack is now crap. We
- must return an absolute verdict. --RR */
- switch (reject->with) {
- case IP6T_ICMP6_NO_ROUTE:
- send_unreach(*pskb, ICMPV6_NOROUTE, hooknum);
- break;
- case IP6T_ICMP6_ADM_PROHIBITED:
- send_unreach(*pskb, ICMPV6_ADM_PROHIBITED, hooknum);
- break;
- case IP6T_ICMP6_NOT_NEIGHBOUR:
- send_unreach(*pskb, ICMPV6_NOT_NEIGHBOUR, hooknum);
- break;
- case IP6T_ICMP6_ADDR_UNREACH:
- send_unreach(*pskb, ICMPV6_ADDR_UNREACH, hooknum);
- break;
- case IP6T_ICMP6_PORT_UNREACH:
- send_unreach(*pskb, ICMPV6_PORT_UNREACH, hooknum);
- break;
- case IP6T_ICMP6_ECHOREPLY:
+ switch (reject->with) {
+ case IP6T_ICMP6_NO_ROUTE:
+ nf_send_unreach6(net, skb, ICMPV6_NOROUTE, xt_hooknum(par));
+ break;
+ case IP6T_ICMP6_ADM_PROHIBITED:
+ nf_send_unreach6(net, skb, ICMPV6_ADM_PROHIBITED,
+ xt_hooknum(par));
+ break;
+ case IP6T_ICMP6_NOT_NEIGHBOUR:
+ nf_send_unreach6(net, skb, ICMPV6_NOT_NEIGHBOUR,
+ xt_hooknum(par));
+ break;
+ case IP6T_ICMP6_ADDR_UNREACH:
+ nf_send_unreach6(net, skb, ICMPV6_ADDR_UNREACH,
+ xt_hooknum(par));
+ break;
+ case IP6T_ICMP6_PORT_UNREACH:
+ nf_send_unreach6(net, skb, ICMPV6_PORT_UNREACH,
+ xt_hooknum(par));
+ break;
+ case IP6T_ICMP6_ECHOREPLY:
/* Do nothing */
break;
case IP6T_TCP_RESET:
- send_reset(*pskb);
+ nf_send_reset6(net, par->state->sk, skb, xt_hooknum(par));
+ break;
+ case IP6T_ICMP6_POLICY_FAIL:
+ nf_send_unreach6(net, skb, ICMPV6_POLICY_FAIL, xt_hooknum(par));
break;
- default:
- if (net_ratelimit())
- printk(KERN_WARNING "ip6t_REJECT: case %u not handled yet\n", reject->with);
+ case IP6T_ICMP6_REJECT_ROUTE:
+ nf_send_unreach6(net, skb, ICMPV6_REJECT_ROUTE,
+ xt_hooknum(par));
break;
}
return NF_DROP;
}
-static int check(const char *tablename,
- const void *entry,
- const struct xt_target *target,
- void *targinfo,
- unsigned int hook_mask)
+static int reject_tg6_check(const struct xt_tgchk_param *par)
{
- const struct ip6t_reject_info *rejinfo = targinfo;
- const struct ip6t_entry *e = entry;
+ const struct ip6t_reject_info *rejinfo = par->targinfo;
+ const struct ip6t_entry *e = par->entryinfo;
if (rejinfo->with == IP6T_ICMP6_ECHOREPLY) {
- printk("ip6t_REJECT: ECHOREPLY is not supported.\n");
- return 0;
+ pr_info_ratelimited("ECHOREPLY is not supported\n");
+ return -EINVAL;
} else if (rejinfo->with == IP6T_TCP_RESET) {
/* Must specify that it's a TCP packet */
- if (e->ipv6.proto != IPPROTO_TCP
- || (e->ipv6.invflags & IP6T_INV_PROTO)) {
- DEBUGP("ip6t_REJECT: TCP_RESET illegal for non-tcp\n");
- return 0;
+ if (!(e->ipv6.flags & IP6T_F_PROTO) ||
+ e->ipv6.proto != IPPROTO_TCP ||
+ (e->ipv6.invflags & XT_INV_PROTO)) {
+ pr_info_ratelimited("TCP_RESET illegal for non-tcp\n");
+ return -EINVAL;
}
}
- return 1;
+ return 0;
}
-static struct ip6t_target ip6t_reject_reg = {
+static struct xt_target reject_tg6_reg __read_mostly = {
.name = "REJECT",
- .target = reject6_target,
+ .family = NFPROTO_IPV6,
+ .target = reject_tg6,
.targetsize = sizeof(struct ip6t_reject_info),
.table = "filter",
- .hooks = (1 << NF_IP6_LOCAL_IN) | (1 << NF_IP6_FORWARD) |
- (1 << NF_IP6_LOCAL_OUT),
- .checkentry = check,
+ .hooks = (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD) |
+ (1 << NF_INET_LOCAL_OUT),
+ .checkentry = reject_tg6_check,
.me = THIS_MODULE
};
-static int __init ip6t_reject_init(void)
+static int __init reject_tg6_init(void)
{
- return ip6t_register_target(&ip6t_reject_reg);
+ return xt_register_target(&reject_tg6_reg);
}
-static void __exit ip6t_reject_fini(void)
+static void __exit reject_tg6_exit(void)
{
- ip6t_unregister_target(&ip6t_reject_reg);
+ xt_unregister_target(&reject_tg6_reg);
}
-module_init(ip6t_reject_init);
-module_exit(ip6t_reject_fini);
+module_init(reject_tg6_init);
+module_exit(reject_tg6_exit);
diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c
new file mode 100644
index 000000000000..d51d0c3e5fe9
--- /dev/null
+++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c
@@ -0,0 +1,124 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2013 Patrick McHardy <kaber@trash.net>
+ */
+
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_SYNPROXY.h>
+
+#include <net/netfilter/nf_synproxy.h>
+
+static unsigned int
+synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct xt_synproxy_info *info = par->targinfo;
+ struct net *net = xt_net(par);
+ struct synproxy_net *snet = synproxy_pernet(net);
+ struct synproxy_options opts = {};
+ struct tcphdr *th, _th;
+
+ if (nf_ip6_checksum(skb, xt_hooknum(par), par->thoff, IPPROTO_TCP))
+ return NF_DROP;
+
+ th = skb_header_pointer(skb, par->thoff, sizeof(_th), &_th);
+ if (th == NULL)
+ return NF_DROP;
+
+ if (!synproxy_parse_options(skb, par->thoff, th, &opts))
+ return NF_DROP;
+
+ if (th->syn && !(th->ack || th->fin || th->rst)) {
+ /* Initial SYN from client */
+ this_cpu_inc(snet->stats->syn_received);
+
+ if (th->ece && th->cwr)
+ opts.options |= XT_SYNPROXY_OPT_ECN;
+
+ opts.options &= info->options;
+ opts.mss_encode = opts.mss_option;
+ opts.mss_option = info->mss;
+ if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP)
+ synproxy_init_timestamp_cookie(info, &opts);
+ else
+ opts.options &= ~(XT_SYNPROXY_OPT_WSCALE |
+ XT_SYNPROXY_OPT_SACK_PERM |
+ XT_SYNPROXY_OPT_ECN);
+
+ synproxy_send_client_synack_ipv6(net, skb, th, &opts);
+ consume_skb(skb);
+ return NF_STOLEN;
+
+ } else if (th->ack && !(th->fin || th->rst || th->syn)) {
+ /* ACK from client */
+ if (synproxy_recv_client_ack_ipv6(net, skb, th, &opts,
+ ntohl(th->seq))) {
+ consume_skb(skb);
+ return NF_STOLEN;
+ } else {
+ return NF_DROP;
+ }
+ }
+
+ return XT_CONTINUE;
+}
+
+static int synproxy_tg6_check(const struct xt_tgchk_param *par)
+{
+ struct synproxy_net *snet = synproxy_pernet(par->net);
+ const struct ip6t_entry *e = par->entryinfo;
+ int err;
+
+ if (!(e->ipv6.flags & IP6T_F_PROTO) ||
+ e->ipv6.proto != IPPROTO_TCP ||
+ e->ipv6.invflags & XT_INV_PROTO)
+ return -EINVAL;
+
+ err = nf_ct_netns_get(par->net, par->family);
+ if (err)
+ return err;
+
+ err = nf_synproxy_ipv6_init(snet, par->net);
+ if (err) {
+ nf_ct_netns_put(par->net, par->family);
+ return err;
+ }
+
+ return err;
+}
+
+static void synproxy_tg6_destroy(const struct xt_tgdtor_param *par)
+{
+ struct synproxy_net *snet = synproxy_pernet(par->net);
+
+ nf_synproxy_ipv6_fini(snet, par->net);
+ nf_ct_netns_put(par->net, par->family);
+}
+
+static struct xt_target synproxy_tg6_reg __read_mostly = {
+ .name = "SYNPROXY",
+ .family = NFPROTO_IPV6,
+ .hooks = (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD),
+ .target = synproxy_tg6,
+ .targetsize = sizeof(struct xt_synproxy_info),
+ .checkentry = synproxy_tg6_check,
+ .destroy = synproxy_tg6_destroy,
+ .me = THIS_MODULE,
+};
+
+static int __init synproxy_tg6_init(void)
+{
+ return xt_register_target(&synproxy_tg6_reg);
+}
+
+static void __exit synproxy_tg6_exit(void)
+{
+ xt_unregister_target(&synproxy_tg6_reg);
+}
+
+module_init(synproxy_tg6_init);
+module_exit(synproxy_tg6_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_DESCRIPTION("Intercept IPv6 TCP connections and establish them using syncookies");
diff --git a/net/ipv6/netfilter/ip6t_ah.c b/net/ipv6/netfilter/ip6t_ah.c
index 46486645eb75..70da2f2ce064 100644
--- a/net/ipv6/netfilter/ip6t_ah.c
+++ b/net/ipv6/netfilter/ip6t_ah.c
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Kernel module to match AH parameters. */
/* (C) 2001-2002 Andras Kis-Szabo <kisza@sch.bme.hu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
-
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/ip.h>
@@ -15,126 +12,106 @@
#include <net/checksum.h>
#include <net/ipv6.h>
+#include <linux/netfilter/x_tables.h>
#include <linux/netfilter_ipv6/ip6_tables.h>
#include <linux/netfilter_ipv6/ip6t_ah.h>
MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("IPv6 AH match");
+MODULE_DESCRIPTION("Xtables: IPv6 IPsec-AH match");
MODULE_AUTHOR("Andras Kis-Szabo <kisza@sch.bme.hu>");
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
/* Returns 1 if the spi is matched by the range, 0 otherwise */
-static inline int
-spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, int invert)
+static inline bool
+spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, bool invert)
{
- int r=0;
- DEBUGP("ah spi_match:%c 0x%x <= 0x%x <= 0x%x",invert? '!':' ',
- min,spi,max);
+ bool r;
+
+ pr_debug("spi_match:%c 0x%x <= 0x%x <= 0x%x\n",
+ invert ? '!' : ' ', min, spi, max);
r = (spi >= min && spi <= max) ^ invert;
- DEBUGP(" result %s\n",r? "PASS\n" : "FAILED\n");
+ pr_debug(" result %s\n", r ? "PASS" : "FAILED");
return r;
}
-static int
-match(const struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- const struct xt_match *match,
- const void *matchinfo,
- int offset,
- unsigned int protoff,
- int *hotdrop)
+static bool ah_mt6(const struct sk_buff *skb, struct xt_action_param *par)
{
- struct ip_auth_hdr *ah, _ah;
- const struct ip6t_ah *ahinfo = matchinfo;
- unsigned int ptr;
+ struct ip_auth_hdr _ah;
+ const struct ip_auth_hdr *ah;
+ const struct ip6t_ah *ahinfo = par->matchinfo;
+ unsigned int ptr = 0;
unsigned int hdrlen = 0;
int err;
- err = ipv6_find_hdr(skb, &ptr, NEXTHDR_AUTH, NULL);
+ err = ipv6_find_hdr(skb, &ptr, NEXTHDR_AUTH, NULL, NULL);
if (err < 0) {
if (err != -ENOENT)
- *hotdrop = 1;
- return 0;
+ par->hotdrop = true;
+ return false;
}
ah = skb_header_pointer(skb, ptr, sizeof(_ah), &_ah);
if (ah == NULL) {
- *hotdrop = 1;
- return 0;
+ par->hotdrop = true;
+ return false;
}
- hdrlen = (ah->hdrlen + 2) << 2;
-
- DEBUGP("IPv6 AH LEN %u %u ", hdrlen, ah->hdrlen);
- DEBUGP("RES %04X ", ah->reserved);
- DEBUGP("SPI %u %08X\n", ntohl(ah->spi), ntohl(ah->spi));
-
- DEBUGP("IPv6 AH spi %02X ",
- (spi_match(ahinfo->spis[0], ahinfo->spis[1],
- ntohl(ah->spi),
- !!(ahinfo->invflags & IP6T_AH_INV_SPI))));
- DEBUGP("len %02X %04X %02X ",
- ahinfo->hdrlen, hdrlen,
- (!ahinfo->hdrlen ||
- (ahinfo->hdrlen == hdrlen) ^
- !!(ahinfo->invflags & IP6T_AH_INV_LEN)));
- DEBUGP("res %02X %04X %02X\n",
- ahinfo->hdrres, ah->reserved,
- !(ahinfo->hdrres && ah->reserved));
-
- return (ah != NULL)
- &&
- (spi_match(ahinfo->spis[0], ahinfo->spis[1],
- ntohl(ah->spi),
- !!(ahinfo->invflags & IP6T_AH_INV_SPI)))
- &&
- (!ahinfo->hdrlen ||
- (ahinfo->hdrlen == hdrlen) ^
- !!(ahinfo->invflags & IP6T_AH_INV_LEN))
- &&
- !(ahinfo->hdrres && ah->reserved);
+ hdrlen = ipv6_authlen(ah);
+
+ pr_debug("IPv6 AH LEN %u %u ", hdrlen, ah->hdrlen);
+ pr_debug("RES %04X ", ah->reserved);
+ pr_debug("SPI %u %08X\n", ntohl(ah->spi), ntohl(ah->spi));
+
+ pr_debug("IPv6 AH spi %02X ",
+ spi_match(ahinfo->spis[0], ahinfo->spis[1],
+ ntohl(ah->spi),
+ !!(ahinfo->invflags & IP6T_AH_INV_SPI)));
+ pr_debug("len %02X %04X %02X ",
+ ahinfo->hdrlen, hdrlen,
+ (!ahinfo->hdrlen ||
+ (ahinfo->hdrlen == hdrlen) ^
+ !!(ahinfo->invflags & IP6T_AH_INV_LEN)));
+ pr_debug("res %02X %04X %02X\n",
+ ahinfo->hdrres, ah->reserved,
+ !(ahinfo->hdrres && ah->reserved));
+
+ return spi_match(ahinfo->spis[0], ahinfo->spis[1],
+ ntohl(ah->spi),
+ !!(ahinfo->invflags & IP6T_AH_INV_SPI)) &&
+ (!ahinfo->hdrlen ||
+ (ahinfo->hdrlen == hdrlen) ^
+ !!(ahinfo->invflags & IP6T_AH_INV_LEN)) &&
+ !(ahinfo->hdrres && ah->reserved);
}
-/* Called when user tries to insert an entry of this type. */
-static int
-checkentry(const char *tablename,
- const void *entry,
- const struct xt_match *match,
- void *matchinfo,
- unsigned int hook_mask)
+static int ah_mt6_check(const struct xt_mtchk_param *par)
{
- const struct ip6t_ah *ahinfo = matchinfo;
+ const struct ip6t_ah *ahinfo = par->matchinfo;
if (ahinfo->invflags & ~IP6T_AH_INV_MASK) {
- DEBUGP("ip6t_ah: unknown flags %X\n", ahinfo->invflags);
- return 0;
+ pr_debug("unknown flags %X\n", ahinfo->invflags);
+ return -EINVAL;
}
- return 1;
+ return 0;
}
-static struct ip6t_match ah_match = {
+static struct xt_match ah_mt6_reg __read_mostly = {
.name = "ah",
- .match = match,
+ .family = NFPROTO_IPV6,
+ .match = ah_mt6,
.matchsize = sizeof(struct ip6t_ah),
- .checkentry = checkentry,
+ .checkentry = ah_mt6_check,
.me = THIS_MODULE,
};
-static int __init ip6t_ah_init(void)
+static int __init ah_mt6_init(void)
{
- return ip6t_register_match(&ah_match);
+ return xt_register_match(&ah_mt6_reg);
}
-static void __exit ip6t_ah_fini(void)
+static void __exit ah_mt6_exit(void)
{
- ip6t_unregister_match(&ah_match);
+ xt_unregister_match(&ah_mt6_reg);
}
-module_init(ip6t_ah_init);
-module_exit(ip6t_ah_fini);
+module_init(ah_mt6_init);
+module_exit(ah_mt6_exit);
diff --git a/net/ipv6/netfilter/ip6t_eui64.c b/net/ipv6/netfilter/ip6t_eui64.c
index 4f6b84c8f4ab..d704f7ed300c 100644
--- a/net/ipv6/netfilter/ip6t_eui64.c
+++ b/net/ipv6/netfilter/ip6t_eui64.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Kernel module to match EUI64 address parameters. */
/* (C) 2001-2002 Andras Kis-Szabo <kisza@sch.bme.hu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/module.h>
@@ -12,73 +9,63 @@
#include <linux/ipv6.h>
#include <linux/if_ether.h>
+#include <linux/netfilter/x_tables.h>
#include <linux/netfilter_ipv6/ip6_tables.h>
-MODULE_DESCRIPTION("IPv6 EUI64 address checking match");
+MODULE_DESCRIPTION("Xtables: IPv6 EUI64 address match");
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Andras Kis-Szabo <kisza@sch.bme.hu>");
-static int
-match(const struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- const struct xt_match *match,
- const void *matchinfo,
- int offset,
- unsigned int protoff,
- int *hotdrop)
+static bool
+eui64_mt6(const struct sk_buff *skb, struct xt_action_param *par)
{
unsigned char eui64[8];
- int i = 0;
- if (!(skb->mac.raw >= skb->head &&
- (skb->mac.raw + ETH_HLEN) <= skb->data) &&
- offset != 0) {
- *hotdrop = 1;
- return 0;
+ if (!(skb_mac_header(skb) >= skb->head &&
+ skb_mac_header(skb) + ETH_HLEN <= skb->data) &&
+ par->fragoff != 0) {
+ par->hotdrop = true;
+ return false;
}
memset(eui64, 0, sizeof(eui64));
if (eth_hdr(skb)->h_proto == htons(ETH_P_IPV6)) {
- if (skb->nh.ipv6h->version == 0x6) {
+ if (ipv6_hdr(skb)->version == 0x6) {
memcpy(eui64, eth_hdr(skb)->h_source, 3);
memcpy(eui64 + 5, eth_hdr(skb)->h_source + 3, 3);
eui64[3] = 0xff;
eui64[4] = 0xfe;
- eui64[0] |= 0x02;
+ eui64[0] ^= 0x02;
- i = 0;
- while ((skb->nh.ipv6h->saddr.s6_addr[8+i] == eui64[i])
- && (i < 8))
- i++;
-
- if (i == 8)
- return 1;
+ if (!memcmp(ipv6_hdr(skb)->saddr.s6_addr + 8, eui64,
+ sizeof(eui64)))
+ return true;
}
}
- return 0;
+ return false;
}
-static struct ip6t_match eui64_match = {
+static struct xt_match eui64_mt6_reg __read_mostly = {
.name = "eui64",
- .match = match,
+ .family = NFPROTO_IPV6,
+ .match = eui64_mt6,
.matchsize = sizeof(int),
- .hooks = (1 << NF_IP6_PRE_ROUTING) | (1 << NF_IP6_LOCAL_IN) |
- (1 << NF_IP6_FORWARD),
+ .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN) |
+ (1 << NF_INET_FORWARD),
.me = THIS_MODULE,
};
-static int __init ip6t_eui64_init(void)
+static int __init eui64_mt6_init(void)
{
- return ip6t_register_match(&eui64_match);
+ return xt_register_match(&eui64_mt6_reg);
}
-static void __exit ip6t_eui64_fini(void)
+static void __exit eui64_mt6_exit(void)
{
- ip6t_unregister_match(&eui64_match);
+ xt_unregister_match(&eui64_mt6_reg);
}
-module_init(ip6t_eui64_init);
-module_exit(ip6t_eui64_fini);
+module_init(eui64_mt6_init);
+module_exit(eui64_mt6_exit);
diff --git a/net/ipv6/netfilter/ip6t_frag.c b/net/ipv6/netfilter/ip6t_frag.c
index cd22eaaccdca..3aad6439386b 100644
--- a/net/ipv6/netfilter/ip6t_frag.c
+++ b/net/ipv6/netfilter/ip6t_frag.c
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Kernel module to match FRAG parameters. */
/* (C) 2001-2002 Andras Kis-Szabo <kisza@sch.bme.hu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
-
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/ipv6.h>
@@ -14,144 +11,122 @@
#include <net/checksum.h>
#include <net/ipv6.h>
+#include <linux/netfilter/x_tables.h>
#include <linux/netfilter_ipv6/ip6_tables.h>
#include <linux/netfilter_ipv6/ip6t_frag.h>
MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("IPv6 FRAG match");
+MODULE_DESCRIPTION("Xtables: IPv6 fragment match");
MODULE_AUTHOR("Andras Kis-Szabo <kisza@sch.bme.hu>");
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
/* Returns 1 if the id is matched by the range, 0 otherwise */
-static inline int
-id_match(u_int32_t min, u_int32_t max, u_int32_t id, int invert)
+static inline bool
+id_match(u_int32_t min, u_int32_t max, u_int32_t id, bool invert)
{
- int r = 0;
- DEBUGP("frag id_match:%c 0x%x <= 0x%x <= 0x%x", invert ? '!' : ' ',
- min, id, max);
+ bool r;
+ pr_debug("id_match:%c 0x%x <= 0x%x <= 0x%x\n", invert ? '!' : ' ',
+ min, id, max);
r = (id >= min && id <= max) ^ invert;
- DEBUGP(" result %s\n", r ? "PASS" : "FAILED");
+ pr_debug(" result %s\n", r ? "PASS" : "FAILED");
return r;
}
-static int
-match(const struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- const struct xt_match *match,
- const void *matchinfo,
- int offset,
- unsigned int protoff,
- int *hotdrop)
+static bool
+frag_mt6(const struct sk_buff *skb, struct xt_action_param *par)
{
- struct frag_hdr _frag, *fh;
- const struct ip6t_frag *fraginfo = matchinfo;
- unsigned int ptr;
+ struct frag_hdr _frag;
+ const struct frag_hdr *fh;
+ const struct ip6t_frag *fraginfo = par->matchinfo;
+ unsigned int ptr = 0;
int err;
- err = ipv6_find_hdr(skb, &ptr, NEXTHDR_FRAGMENT, NULL);
+ err = ipv6_find_hdr(skb, &ptr, NEXTHDR_FRAGMENT, NULL, NULL);
if (err < 0) {
if (err != -ENOENT)
- *hotdrop = 1;
- return 0;
+ par->hotdrop = true;
+ return false;
}
fh = skb_header_pointer(skb, ptr, sizeof(_frag), &_frag);
if (fh == NULL) {
- *hotdrop = 1;
- return 0;
+ par->hotdrop = true;
+ return false;
}
- DEBUGP("INFO %04X ", fh->frag_off);
- DEBUGP("OFFSET %04X ", ntohs(fh->frag_off) & ~0x7);
- DEBUGP("RES %02X %04X", fh->reserved, ntohs(fh->frag_off) & 0x6);
- DEBUGP("MF %04X ", fh->frag_off & htons(IP6_MF));
- DEBUGP("ID %u %08X\n", ntohl(fh->identification),
- ntohl(fh->identification));
-
- DEBUGP("IPv6 FRAG id %02X ",
- (id_match(fraginfo->ids[0], fraginfo->ids[1],
- ntohl(fh->identification),
- !!(fraginfo->invflags & IP6T_FRAG_INV_IDS))));
- DEBUGP("res %02X %02X%04X %02X ",
- (fraginfo->flags & IP6T_FRAG_RES), fh->reserved,
- ntohs(fh->frag_off) & 0x6,
- !((fraginfo->flags & IP6T_FRAG_RES)
- && (fh->reserved || (ntohs(fh->frag_off) & 0x06))));
- DEBUGP("first %02X %02X %02X ",
- (fraginfo->flags & IP6T_FRAG_FST),
- ntohs(fh->frag_off) & ~0x7,
- !((fraginfo->flags & IP6T_FRAG_FST)
- && (ntohs(fh->frag_off) & ~0x7)));
- DEBUGP("mf %02X %02X %02X ",
- (fraginfo->flags & IP6T_FRAG_MF),
- ntohs(fh->frag_off) & IP6_MF,
- !((fraginfo->flags & IP6T_FRAG_MF)
- && !((ntohs(fh->frag_off) & IP6_MF))));
- DEBUGP("last %02X %02X %02X\n",
- (fraginfo->flags & IP6T_FRAG_NMF),
- ntohs(fh->frag_off) & IP6_MF,
- !((fraginfo->flags & IP6T_FRAG_NMF)
- && (ntohs(fh->frag_off) & IP6_MF)));
-
- return (fh != NULL)
- &&
- (id_match(fraginfo->ids[0], fraginfo->ids[1],
+ pr_debug("INFO %04X ", fh->frag_off);
+ pr_debug("OFFSET %04X ", ntohs(fh->frag_off) & ~0x7);
+ pr_debug("RES %02X %04X", fh->reserved, ntohs(fh->frag_off) & 0x6);
+ pr_debug("MF %04X ", fh->frag_off & htons(IP6_MF));
+ pr_debug("ID %u %08X\n", ntohl(fh->identification),
+ ntohl(fh->identification));
+
+ pr_debug("IPv6 FRAG id %02X ",
+ id_match(fraginfo->ids[0], fraginfo->ids[1],
+ ntohl(fh->identification),
+ !!(fraginfo->invflags & IP6T_FRAG_INV_IDS)));
+ pr_debug("res %02X %02X%04X %02X ",
+ fraginfo->flags & IP6T_FRAG_RES, fh->reserved,
+ ntohs(fh->frag_off) & 0x6,
+ !((fraginfo->flags & IP6T_FRAG_RES) &&
+ (fh->reserved || (ntohs(fh->frag_off) & 0x06))));
+ pr_debug("first %02X %02X %02X ",
+ fraginfo->flags & IP6T_FRAG_FST,
+ ntohs(fh->frag_off) & ~0x7,
+ !((fraginfo->flags & IP6T_FRAG_FST) &&
+ (ntohs(fh->frag_off) & ~0x7)));
+ pr_debug("mf %02X %02X %02X ",
+ fraginfo->flags & IP6T_FRAG_MF,
+ ntohs(fh->frag_off) & IP6_MF,
+ !((fraginfo->flags & IP6T_FRAG_MF) &&
+ !((ntohs(fh->frag_off) & IP6_MF))));
+ pr_debug("last %02X %02X %02X\n",
+ fraginfo->flags & IP6T_FRAG_NMF,
+ ntohs(fh->frag_off) & IP6_MF,
+ !((fraginfo->flags & IP6T_FRAG_NMF) &&
+ (ntohs(fh->frag_off) & IP6_MF)));
+
+ return id_match(fraginfo->ids[0], fraginfo->ids[1],
ntohl(fh->identification),
- !!(fraginfo->invflags & IP6T_FRAG_INV_IDS)))
- &&
- !((fraginfo->flags & IP6T_FRAG_RES)
- && (fh->reserved || (ntohs(fh->frag_off) & 0x6)))
- &&
- !((fraginfo->flags & IP6T_FRAG_FST)
- && (ntohs(fh->frag_off) & ~0x7))
- &&
- !((fraginfo->flags & IP6T_FRAG_MF)
- && !(ntohs(fh->frag_off) & IP6_MF))
- &&
- !((fraginfo->flags & IP6T_FRAG_NMF)
- && (ntohs(fh->frag_off) & IP6_MF));
+ !!(fraginfo->invflags & IP6T_FRAG_INV_IDS)) &&
+ !((fraginfo->flags & IP6T_FRAG_RES) &&
+ (fh->reserved || (ntohs(fh->frag_off) & 0x6))) &&
+ !((fraginfo->flags & IP6T_FRAG_FST) &&
+ (ntohs(fh->frag_off) & ~0x7)) &&
+ !((fraginfo->flags & IP6T_FRAG_MF) &&
+ !(ntohs(fh->frag_off) & IP6_MF)) &&
+ !((fraginfo->flags & IP6T_FRAG_NMF) &&
+ (ntohs(fh->frag_off) & IP6_MF));
}
-/* Called when user tries to insert an entry of this type. */
-static int
-checkentry(const char *tablename,
- const void *ip,
- const struct xt_match *match,
- void *matchinfo,
- unsigned int hook_mask)
+static int frag_mt6_check(const struct xt_mtchk_param *par)
{
- const struct ip6t_frag *fraginfo = matchinfo;
+ const struct ip6t_frag *fraginfo = par->matchinfo;
if (fraginfo->invflags & ~IP6T_FRAG_INV_MASK) {
- DEBUGP("ip6t_frag: unknown flags %X\n", fraginfo->invflags);
- return 0;
+ pr_debug("unknown flags %X\n", fraginfo->invflags);
+ return -EINVAL;
}
- return 1;
+ return 0;
}
-static struct ip6t_match frag_match = {
+static struct xt_match frag_mt6_reg __read_mostly = {
.name = "frag",
- .match = match,
+ .family = NFPROTO_IPV6,
+ .match = frag_mt6,
.matchsize = sizeof(struct ip6t_frag),
- .checkentry = checkentry,
+ .checkentry = frag_mt6_check,
.me = THIS_MODULE,
};
-static int __init ip6t_frag_init(void)
+static int __init frag_mt6_init(void)
{
- return ip6t_register_match(&frag_match);
+ return xt_register_match(&frag_mt6_reg);
}
-static void __exit ip6t_frag_fini(void)
+static void __exit frag_mt6_exit(void)
{
- ip6t_unregister_match(&frag_match);
+ xt_unregister_match(&frag_mt6_reg);
}
-module_init(ip6t_frag_init);
-module_exit(ip6t_frag_fini);
+module_init(frag_mt6_init);
+module_exit(frag_mt6_exit);
diff --git a/net/ipv6/netfilter/ip6t_hbh.c b/net/ipv6/netfilter/ip6t_hbh.c
index 3f25babe0440..e7a3fb9355ee 100644
--- a/net/ipv6/netfilter/ip6t_hbh.c
+++ b/net/ipv6/netfilter/ip6t_hbh.c
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Kernel module to match Hop-by-Hop and Destination parameters. */
/* (C) 2001-2002 Andras Kis-Szabo <kisza@sch.bme.hu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
-
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/ipv6.h>
@@ -16,20 +13,15 @@
#include <asm/byteorder.h>
+#include <linux/netfilter/x_tables.h>
#include <linux/netfilter_ipv6/ip6_tables.h>
#include <linux/netfilter_ipv6/ip6t_opts.h>
MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("IPv6 opts match");
+MODULE_DESCRIPTION("Xtables: IPv6 Hop-By-Hop and Destination Header match");
MODULE_AUTHOR("Andras Kis-Szabo <kisza@sch.bme.hu>");
MODULE_ALIAS("ip6t_dst");
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
/*
* (Type & 0xC0) >> 6
* 0 -> ignorable
@@ -46,56 +38,55 @@ MODULE_ALIAS("ip6t_dst");
* 5 -> RTALERT 2 x x
*/
-static int
-match(const struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- const struct xt_match *match,
- const void *matchinfo,
- int offset,
- unsigned int protoff,
- int *hotdrop)
+static struct xt_match hbh_mt6_reg[] __read_mostly;
+
+static bool
+hbh_mt6(const struct sk_buff *skb, struct xt_action_param *par)
{
- struct ipv6_opt_hdr _optsh, *oh;
- const struct ip6t_opts *optinfo = matchinfo;
+ struct ipv6_opt_hdr _optsh;
+ const struct ipv6_opt_hdr *oh;
+ const struct ip6t_opts *optinfo = par->matchinfo;
unsigned int temp;
- unsigned int ptr;
+ unsigned int ptr = 0;
unsigned int hdrlen = 0;
- unsigned int ret = 0;
- u8 _opttype, *tp = NULL;
- u8 _optlen, *lp = NULL;
+ bool ret = false;
+ u8 _opttype;
+ u8 _optlen;
+ const u_int8_t *tp = NULL;
+ const u_int8_t *lp = NULL;
unsigned int optlen;
int err;
- err = ipv6_find_hdr(skb, &ptr, match->data, NULL);
+ err = ipv6_find_hdr(skb, &ptr,
+ (par->match == &hbh_mt6_reg[0]) ?
+ NEXTHDR_HOP : NEXTHDR_DEST, NULL, NULL);
if (err < 0) {
if (err != -ENOENT)
- *hotdrop = 1;
- return 0;
+ par->hotdrop = true;
+ return false;
}
oh = skb_header_pointer(skb, ptr, sizeof(_optsh), &_optsh);
if (oh == NULL) {
- *hotdrop = 1;
- return 0;
+ par->hotdrop = true;
+ return false;
}
hdrlen = ipv6_optlen(oh);
if (skb->len - ptr < hdrlen) {
/* Packet smaller than it's length field */
- return 0;
+ return false;
}
- DEBUGP("IPv6 OPTS LEN %u %u ", hdrlen, oh->hdrlen);
+ pr_debug("IPv6 OPTS LEN %u %u ", hdrlen, oh->hdrlen);
- DEBUGP("len %02X %04X %02X ",
- optinfo->hdrlen, hdrlen,
- (!(optinfo->flags & IP6T_OPTS_LEN) ||
- ((optinfo->hdrlen == hdrlen) ^
- !!(optinfo->invflags & IP6T_OPTS_INV_LEN))));
+ pr_debug("len %02X %04X %02X ",
+ optinfo->hdrlen, hdrlen,
+ (!(optinfo->flags & IP6T_OPTS_LEN) ||
+ ((optinfo->hdrlen == hdrlen) ^
+ !!(optinfo->invflags & IP6T_OPTS_INV_LEN))));
- ret = (oh != NULL) &&
- (!(optinfo->flags & IP6T_OPTS_LEN) ||
+ ret = (!(optinfo->flags & IP6T_OPTS_LEN) ||
((optinfo->hdrlen == hdrlen) ^
!!(optinfo->invflags & IP6T_OPTS_INV_LEN)));
@@ -103,11 +94,9 @@ match(const struct sk_buff *skb,
hdrlen -= 2;
if (!(optinfo->flags & IP6T_OPTS_OPTS)) {
return ret;
- } else if (optinfo->flags & IP6T_OPTS_NSTRICT) {
- DEBUGP("Not strict - not implemented");
} else {
- DEBUGP("Strict ");
- DEBUGP("#%d ", optinfo->optsnr);
+ pr_debug("Strict ");
+ pr_debug("#%d ", optinfo->optsnr);
for (temp = 0; temp < optinfo->optsnr; temp++) {
/* type field exists ? */
if (hdrlen < 1)
@@ -119,12 +108,11 @@ match(const struct sk_buff *skb,
/* Type check */
if (*tp != (optinfo->opts[temp] & 0xFF00) >> 8) {
- DEBUGP("Tbad %02X %02X\n",
- *tp,
- (optinfo->opts[temp] & 0xFF00) >> 8);
- return 0;
+ pr_debug("Tbad %02X %02X\n", *tp,
+ (optinfo->opts[temp] & 0xFF00) >> 8);
+ return false;
} else {
- DEBUGP("Tok ");
+ pr_debug("Tok ");
}
/* Length check */
if (*tp) {
@@ -141,23 +129,23 @@ match(const struct sk_buff *skb,
spec_len = optinfo->opts[temp] & 0x00FF;
if (spec_len != 0x00FF && spec_len != *lp) {
- DEBUGP("Lbad %02X %04X\n", *lp,
- spec_len);
- return 0;
+ pr_debug("Lbad %02X %04X\n", *lp,
+ spec_len);
+ return false;
}
- DEBUGP("Lok ");
+ pr_debug("Lok ");
optlen = *lp + 2;
} else {
- DEBUGP("Pad1\n");
+ pr_debug("Pad1\n");
optlen = 1;
}
/* Step to the next */
- DEBUGP("len%04X \n", optlen);
+ pr_debug("len%04X\n", optlen);
if ((ptr > skb->len - optlen || hdrlen < optlen) &&
- (temp < optinfo->optsnr - 1)) {
- DEBUGP("new pointer is too large! \n");
+ temp < optinfo->optsnr - 1) {
+ pr_debug("new pointer is too large!\n");
break;
}
ptr += optlen;
@@ -166,59 +154,58 @@ match(const struct sk_buff *skb,
if (temp == optinfo->optsnr)
return ret;
else
- return 0;
+ return false;
}
- return 0;
+ return false;
}
-/* Called when user tries to insert an entry of this type. */
-static int
-checkentry(const char *tablename,
- const void *entry,
- const struct xt_match *match,
- void *matchinfo,
- unsigned int hook_mask)
+static int hbh_mt6_check(const struct xt_mtchk_param *par)
{
- const struct ip6t_opts *optsinfo = matchinfo;
+ const struct ip6t_opts *optsinfo = par->matchinfo;
if (optsinfo->invflags & ~IP6T_OPTS_INV_MASK) {
- DEBUGP("ip6t_opts: unknown flags %X\n", optsinfo->invflags);
- return 0;
+ pr_debug("unknown flags %X\n", optsinfo->invflags);
+ return -EINVAL;
}
- return 1;
+
+ if (optsinfo->flags & IP6T_OPTS_NSTRICT) {
+ pr_debug("Not strict - not implemented");
+ return -EINVAL;
+ }
+
+ return 0;
}
-static struct xt_match opts_match[] = {
+static struct xt_match hbh_mt6_reg[] __read_mostly = {
{
+ /* Note, hbh_mt6 relies on the order of hbh_mt6_reg */
.name = "hbh",
- .family = AF_INET6,
- .match = match,
+ .family = NFPROTO_IPV6,
+ .match = hbh_mt6,
.matchsize = sizeof(struct ip6t_opts),
- .checkentry = checkentry,
+ .checkentry = hbh_mt6_check,
.me = THIS_MODULE,
- .data = NEXTHDR_HOP,
},
{
.name = "dst",
- .family = AF_INET6,
- .match = match,
+ .family = NFPROTO_IPV6,
+ .match = hbh_mt6,
.matchsize = sizeof(struct ip6t_opts),
- .checkentry = checkentry,
+ .checkentry = hbh_mt6_check,
.me = THIS_MODULE,
- .data = NEXTHDR_DEST,
},
};
-static int __init ip6t_hbh_init(void)
+static int __init hbh_mt6_init(void)
{
- return xt_register_matches(opts_match, ARRAY_SIZE(opts_match));
+ return xt_register_matches(hbh_mt6_reg, ARRAY_SIZE(hbh_mt6_reg));
}
-static void __exit ip6t_hbh_fini(void)
+static void __exit hbh_mt6_exit(void)
{
- xt_unregister_matches(opts_match, ARRAY_SIZE(opts_match));
+ xt_unregister_matches(hbh_mt6_reg, ARRAY_SIZE(hbh_mt6_reg));
}
-module_init(ip6t_hbh_init);
-module_exit(ip6t_hbh_fini);
+module_init(hbh_mt6_init);
+module_exit(hbh_mt6_exit);
diff --git a/net/ipv6/netfilter/ip6t_hl.c b/net/ipv6/netfilter/ip6t_hl.c
deleted file mode 100644
index 44a729e17c48..000000000000
--- a/net/ipv6/netfilter/ip6t_hl.c
+++ /dev/null
@@ -1,70 +0,0 @@
-/* Hop Limit matching module */
-
-/* (C) 2001-2002 Maciej Soltysiak <solt@dns.toxicfilms.tv>
- * Based on HW's ttl module
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/module.h>
-#include <linux/skbuff.h>
-
-#include <linux/netfilter_ipv6/ip6t_hl.h>
-#include <linux/netfilter_ipv6/ip6_tables.h>
-
-MODULE_AUTHOR("Maciej Soltysiak <solt@dns.toxicfilms.tv>");
-MODULE_DESCRIPTION("IP tables Hop Limit matching module");
-MODULE_LICENSE("GPL");
-
-static int match(const struct sk_buff *skb,
- const struct net_device *in, const struct net_device *out,
- const struct xt_match *match, const void *matchinfo,
- int offset, unsigned int protoff, int *hotdrop)
-{
- const struct ip6t_hl_info *info = matchinfo;
- const struct ipv6hdr *ip6h = skb->nh.ipv6h;
-
- switch (info->mode) {
- case IP6T_HL_EQ:
- return (ip6h->hop_limit == info->hop_limit);
- break;
- case IP6T_HL_NE:
- return (!(ip6h->hop_limit == info->hop_limit));
- break;
- case IP6T_HL_LT:
- return (ip6h->hop_limit < info->hop_limit);
- break;
- case IP6T_HL_GT:
- return (ip6h->hop_limit > info->hop_limit);
- break;
- default:
- printk(KERN_WARNING "ip6t_hl: unknown mode %d\n",
- info->mode);
- return 0;
- }
-
- return 0;
-}
-
-static struct ip6t_match hl_match = {
- .name = "hl",
- .match = match,
- .matchsize = sizeof(struct ip6t_hl_info),
- .me = THIS_MODULE,
-};
-
-static int __init ip6t_hl_init(void)
-{
- return ip6t_register_match(&hl_match);
-}
-
-static void __exit ip6t_hl_fini(void)
-{
- ip6t_unregister_match(&hl_match);
-
-}
-
-module_init(ip6t_hl_init);
-module_exit(ip6t_hl_fini);
diff --git a/net/ipv6/netfilter/ip6t_ipv6header.c b/net/ipv6/netfilter/ip6t_ipv6header.c
index 3093c398002f..c52ff929c93b 100644
--- a/net/ipv6/netfilter/ip6t_ipv6header.c
+++ b/net/ipv6/netfilter/ip6t_ipv6header.c
@@ -1,14 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* ipv6header match - matches IPv6 packets based
on whether they contain certain headers */
-/* Original idea: Brad Chapman
+/* Original idea: Brad Chapman
* Rewritten by: Andras Kis-Szabo <kisza@sch.bme.hu> */
/* (C) 2001-2002 Andras Kis-Szabo <kisza@sch.bme.hu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/module.h>
@@ -18,24 +15,18 @@
#include <net/checksum.h>
#include <net/ipv6.h>
-#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_ipv6.h>
#include <linux/netfilter_ipv6/ip6t_ipv6header.h>
MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("IPv6 headers match");
+MODULE_DESCRIPTION("Xtables: IPv6 header types match");
MODULE_AUTHOR("Andras Kis-Szabo <kisza@sch.bme.hu>");
-static int
-ipv6header_match(const struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- const struct xt_match *match,
- const void *matchinfo,
- int offset,
- unsigned int protoff,
- int *hotdrop)
+static bool
+ipv6header_mt6(const struct sk_buff *skb, struct xt_action_param *par)
{
- const struct ip6t_ipv6header_info *info = matchinfo;
+ const struct ip6t_ipv6header_info *info = par->matchinfo;
unsigned int temp;
int len;
u8 nexthdr;
@@ -44,25 +35,26 @@ ipv6header_match(const struct sk_buff *skb,
/* Make sure this isn't an evil packet */
/* type of the 1st exthdr */
- nexthdr = skb->nh.ipv6h->nexthdr;
+ nexthdr = ipv6_hdr(skb)->nexthdr;
/* pointer to the 1st exthdr */
ptr = sizeof(struct ipv6hdr);
/* available length */
len = skb->len - ptr;
temp = 0;
- while (ip6t_ext_hdr(nexthdr)) {
- struct ipv6_opt_hdr _hdr, *hp;
+ while (nf_ip6_ext_hdr(nexthdr)) {
+ const struct ipv6_opt_hdr *hp;
+ struct ipv6_opt_hdr _hdr;
int hdrlen;
- /* Is there enough space for the next ext header? */
- if (len < (int)sizeof(struct ipv6_opt_hdr))
- return 0;
/* No more exthdr -> evaluate */
if (nexthdr == NEXTHDR_NONE) {
temp |= MASK_NONE;
break;
}
+ /* Is there enough space for the next ext header? */
+ if (len < (int)sizeof(struct ipv6_opt_hdr))
+ return false;
/* ESP -> evaluate */
if (nexthdr == NEXTHDR_ESP) {
temp |= MASK_ESP;
@@ -70,13 +62,16 @@ ipv6header_match(const struct sk_buff *skb,
}
hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr);
- BUG_ON(hp == NULL);
+ if (!hp) {
+ par->hotdrop = true;
+ return false;
+ }
/* Calculate the header length */
- if (nexthdr == NEXTHDR_FRAGMENT) {
+ if (nexthdr == NEXTHDR_FRAGMENT)
hdrlen = 8;
- } else if (nexthdr == NEXTHDR_AUTH)
- hdrlen = (hp->hdrlen + 2) << 2;
+ else if (nexthdr == NEXTHDR_AUTH)
+ hdrlen = ipv6_authlen(hp);
else
hdrlen = ipv6_optlen(hp);
@@ -98,8 +93,7 @@ ipv6header_match(const struct sk_buff *skb,
temp |= MASK_DSTOPTS;
break;
default:
- return 0;
- break;
+ return false;
}
nexthdr = hp->nexthdr;
@@ -109,7 +103,7 @@ ipv6header_match(const struct sk_buff *skb,
break;
}
- if ((nexthdr != NEXTHDR_NONE) && (nexthdr != NEXTHDR_ESP))
+ if (nexthdr != NEXTHDR_NONE && nexthdr != NEXTHDR_ESP)
temp |= MASK_PROTO;
if (info->modeflag)
@@ -123,41 +117,37 @@ ipv6header_match(const struct sk_buff *skb,
}
}
-static int
-ipv6header_checkentry(const char *tablename,
- const void *ip,
- const struct xt_match *match,
- void *matchinfo,
- unsigned int hook_mask)
+static int ipv6header_mt6_check(const struct xt_mtchk_param *par)
{
- const struct ip6t_ipv6header_info *info = matchinfo;
+ const struct ip6t_ipv6header_info *info = par->matchinfo;
/* invflags is 0 or 0xff in hard mode */
if ((!info->modeflag) && info->invflags != 0x00 &&
info->invflags != 0xFF)
- return 0;
+ return -EINVAL;
- return 1;
+ return 0;
}
-static struct ip6t_match ip6t_ipv6header_match = {
+static struct xt_match ipv6header_mt6_reg __read_mostly = {
.name = "ipv6header",
- .match = &ipv6header_match,
+ .family = NFPROTO_IPV6,
+ .match = ipv6header_mt6,
.matchsize = sizeof(struct ip6t_ipv6header_info),
- .checkentry = &ipv6header_checkentry,
+ .checkentry = ipv6header_mt6_check,
.destroy = NULL,
.me = THIS_MODULE,
};
-static int __init ipv6header_init(void)
+static int __init ipv6header_mt6_init(void)
{
- return ip6t_register_match(&ip6t_ipv6header_match);
+ return xt_register_match(&ipv6header_mt6_reg);
}
-static void __exit ipv6header_exit(void)
+static void __exit ipv6header_mt6_exit(void)
{
- ip6t_unregister_match(&ip6t_ipv6header_match);
+ xt_unregister_match(&ipv6header_mt6_reg);
}
-module_init(ipv6header_init);
-module_exit(ipv6header_exit);
+module_init(ipv6header_mt6_init);
+module_exit(ipv6header_mt6_exit);
diff --git a/net/ipv6/netfilter/ip6t_mh.c b/net/ipv6/netfilter/ip6t_mh.c
new file mode 100644
index 000000000000..fd492b69acbc
--- /dev/null
+++ b/net/ipv6/netfilter/ip6t_mh.c
@@ -0,0 +1,90 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C)2006 USAGI/WIDE Project
+ *
+ * Author:
+ * Masahide NAKAMURA @USAGI <masahide.nakamura.cz@hitachi.com>
+ *
+ * Based on net/netfilter/xt_tcpudp.c
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/types.h>
+#include <linux/module.h>
+#include <net/ip.h>
+#include <linux/ipv6.h>
+#include <net/ipv6.h>
+#include <net/mip6.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_ipv6/ip6t_mh.h>
+
+MODULE_DESCRIPTION("Xtables: IPv6 Mobility Header match");
+MODULE_LICENSE("GPL");
+
+/* Returns 1 if the type is matched by the range, 0 otherwise */
+static inline bool
+type_match(u_int8_t min, u_int8_t max, u_int8_t type, bool invert)
+{
+ return (type >= min && type <= max) ^ invert;
+}
+
+static bool mh_mt6(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ struct ip6_mh _mh;
+ const struct ip6_mh *mh;
+ const struct ip6t_mh *mhinfo = par->matchinfo;
+
+ /* Must not be a fragment. */
+ if (par->fragoff != 0)
+ return false;
+
+ mh = skb_header_pointer(skb, par->thoff, sizeof(_mh), &_mh);
+ if (mh == NULL) {
+ /* We've been asked to examine this packet, and we
+ can't. Hence, no choice but to drop. */
+ pr_debug("Dropping evil MH tinygram.\n");
+ par->hotdrop = true;
+ return false;
+ }
+
+ if (mh->ip6mh_proto != IPPROTO_NONE) {
+ pr_debug("Dropping invalid MH Payload Proto: %u\n",
+ mh->ip6mh_proto);
+ par->hotdrop = true;
+ return false;
+ }
+
+ return type_match(mhinfo->types[0], mhinfo->types[1], mh->ip6mh_type,
+ !!(mhinfo->invflags & IP6T_MH_INV_TYPE));
+}
+
+static int mh_mt6_check(const struct xt_mtchk_param *par)
+{
+ const struct ip6t_mh *mhinfo = par->matchinfo;
+
+ /* Must specify no unknown invflags */
+ return (mhinfo->invflags & ~IP6T_MH_INV_MASK) ? -EINVAL : 0;
+}
+
+static struct xt_match mh_mt6_reg __read_mostly = {
+ .name = "mh",
+ .family = NFPROTO_IPV6,
+ .checkentry = mh_mt6_check,
+ .match = mh_mt6,
+ .matchsize = sizeof(struct ip6t_mh),
+ .proto = IPPROTO_MH,
+ .me = THIS_MODULE,
+};
+
+static int __init mh_mt6_init(void)
+{
+ return xt_register_match(&mh_mt6_reg);
+}
+
+static void __exit mh_mt6_exit(void)
+{
+ xt_unregister_match(&mh_mt6_reg);
+}
+
+module_init(mh_mt6_init);
+module_exit(mh_mt6_exit);
diff --git a/net/ipv6/netfilter/ip6t_owner.c b/net/ipv6/netfilter/ip6t_owner.c
deleted file mode 100644
index 4eb9bbc4ebc3..000000000000
--- a/net/ipv6/netfilter/ip6t_owner.c
+++ /dev/null
@@ -1,92 +0,0 @@
-/* Kernel module to match various things tied to sockets associated with
- locally generated outgoing packets. */
-
-/* (C) 2000-2001 Marc Boucher <marc@mbsi.ca>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/file.h>
-#include <linux/rcupdate.h>
-#include <net/sock.h>
-
-#include <linux/netfilter_ipv6/ip6t_owner.h>
-#include <linux/netfilter_ipv6/ip6_tables.h>
-
-MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>");
-MODULE_DESCRIPTION("IP6 tables owner matching module");
-MODULE_LICENSE("GPL");
-
-
-static int
-match(const struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- const struct xt_match *match,
- const void *matchinfo,
- int offset,
- unsigned int protoff,
- int *hotdrop)
-{
- const struct ip6t_owner_info *info = matchinfo;
-
- if (!skb->sk || !skb->sk->sk_socket || !skb->sk->sk_socket->file)
- return 0;
-
- if (info->match & IP6T_OWNER_UID) {
- if ((skb->sk->sk_socket->file->f_uid != info->uid) ^
- !!(info->invert & IP6T_OWNER_UID))
- return 0;
- }
-
- if (info->match & IP6T_OWNER_GID) {
- if ((skb->sk->sk_socket->file->f_gid != info->gid) ^
- !!(info->invert & IP6T_OWNER_GID))
- return 0;
- }
-
- return 1;
-}
-
-static int
-checkentry(const char *tablename,
- const void *ip,
- const struct xt_match *match,
- void *matchinfo,
- unsigned int hook_mask)
-{
- const struct ip6t_owner_info *info = matchinfo;
-
- if (info->match & (IP6T_OWNER_PID | IP6T_OWNER_SID)) {
- printk("ipt_owner: pid and sid matching "
- "not supported anymore\n");
- return 0;
- }
- return 1;
-}
-
-static struct ip6t_match owner_match = {
- .name = "owner",
- .match = match,
- .matchsize = sizeof(struct ip6t_owner_info),
- .hooks = (1 << NF_IP6_LOCAL_OUT) | (1 << NF_IP6_POST_ROUTING),
- .checkentry = checkentry,
- .me = THIS_MODULE,
-};
-
-static int __init ip6t_owner_init(void)
-{
- return ip6t_register_match(&owner_match);
-}
-
-static void __exit ip6t_owner_fini(void)
-{
- ip6t_unregister_match(&owner_match);
-}
-
-module_init(ip6t_owner_init);
-module_exit(ip6t_owner_fini);
diff --git a/net/ipv6/netfilter/ip6t_rpfilter.c b/net/ipv6/netfilter/ip6t_rpfilter.c
new file mode 100644
index 000000000000..67c87a88cde4
--- /dev/null
+++ b/net/ipv6/netfilter/ip6t_rpfilter.c
@@ -0,0 +1,150 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2011 Florian Westphal <fw@strlen.de>
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/route.h>
+#include <net/ip6_fib.h>
+#include <net/ip6_route.h>
+
+#include <linux/netfilter/xt_rpfilter.h>
+#include <linux/netfilter/x_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
+MODULE_DESCRIPTION("Xtables: IPv6 reverse path filter match");
+
+static bool rpfilter_addr_unicast(const struct in6_addr *addr)
+{
+ int addr_type = ipv6_addr_type(addr);
+ return addr_type & IPV6_ADDR_UNICAST;
+}
+
+static bool rpfilter_addr_linklocal(const struct in6_addr *addr)
+{
+ int addr_type = ipv6_addr_type(addr);
+ return addr_type & IPV6_ADDR_LINKLOCAL;
+}
+
+static bool rpfilter_lookup_reverse6(struct net *net, const struct sk_buff *skb,
+ const struct net_device *dev, u8 flags)
+{
+ struct rt6_info *rt;
+ struct ipv6hdr *iph = ipv6_hdr(skb);
+ bool ret = false;
+ struct flowi6 fl6 = {
+ .flowi6_iif = LOOPBACK_IFINDEX,
+ .flowi6_l3mdev = l3mdev_master_ifindex_rcu(dev),
+ .flowlabel = (* (__be32 *) iph) & IPV6_FLOWINFO_MASK,
+ .flowi6_proto = iph->nexthdr,
+ .flowi6_uid = sock_net_uid(net, NULL),
+ .daddr = iph->saddr,
+ };
+ int lookup_flags;
+
+ if (rpfilter_addr_unicast(&iph->daddr)) {
+ memcpy(&fl6.saddr, &iph->daddr, sizeof(struct in6_addr));
+ lookup_flags = RT6_LOOKUP_F_HAS_SADDR;
+ } else {
+ lookup_flags = 0;
+ }
+
+ fl6.flowi6_mark = flags & XT_RPFILTER_VALID_MARK ? skb->mark : 0;
+
+ if (rpfilter_addr_linklocal(&iph->saddr)) {
+ lookup_flags |= RT6_LOOKUP_F_IFACE;
+ fl6.flowi6_oif = dev->ifindex;
+ } else if ((flags & XT_RPFILTER_LOOSE) == 0)
+ fl6.flowi6_oif = dev->ifindex;
+
+ rt = (void *)ip6_route_lookup(net, &fl6, skb, lookup_flags);
+ if (rt->dst.error)
+ goto out;
+
+ if (rt->rt6i_flags & (RTF_REJECT|RTF_ANYCAST))
+ goto out;
+
+ if (rt->rt6i_flags & RTF_LOCAL) {
+ ret = flags & XT_RPFILTER_ACCEPT_LOCAL;
+ goto out;
+ }
+
+ if (rt->rt6i_idev->dev == dev ||
+ l3mdev_master_ifindex_rcu(rt->rt6i_idev->dev) == dev->ifindex ||
+ (flags & XT_RPFILTER_LOOSE))
+ ret = true;
+ out:
+ ip6_rt_put(rt);
+ return ret;
+}
+
+static bool
+rpfilter_is_loopback(const struct sk_buff *skb, const struct net_device *in)
+{
+ return skb->pkt_type == PACKET_LOOPBACK || in->flags & IFF_LOOPBACK;
+}
+
+static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_rpfilter_info *info = par->matchinfo;
+ int saddrtype;
+ struct ipv6hdr *iph;
+ bool invert = info->flags & XT_RPFILTER_INVERT;
+
+ if (rpfilter_is_loopback(skb, xt_in(par)))
+ return true ^ invert;
+
+ iph = ipv6_hdr(skb);
+ saddrtype = ipv6_addr_type(&iph->saddr);
+ if (unlikely(saddrtype == IPV6_ADDR_ANY))
+ return true ^ invert; /* not routable: forward path will drop it */
+
+ return rpfilter_lookup_reverse6(xt_net(par), skb, xt_in(par),
+ info->flags) ^ invert;
+}
+
+static int rpfilter_check(const struct xt_mtchk_param *par)
+{
+ const struct xt_rpfilter_info *info = par->matchinfo;
+ unsigned int options = ~XT_RPFILTER_OPTION_MASK;
+
+ if (info->flags & options) {
+ pr_info_ratelimited("unknown options\n");
+ return -EINVAL;
+ }
+
+ if (strcmp(par->table, "mangle") != 0 &&
+ strcmp(par->table, "raw") != 0) {
+ pr_info_ratelimited("only valid in \'raw\' or \'mangle\' table, not \'%s\'\n",
+ par->table);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static struct xt_match rpfilter_mt_reg __read_mostly = {
+ .name = "rpfilter",
+ .family = NFPROTO_IPV6,
+ .checkentry = rpfilter_check,
+ .match = rpfilter_mt,
+ .matchsize = sizeof(struct xt_rpfilter_info),
+ .hooks = (1 << NF_INET_PRE_ROUTING),
+ .me = THIS_MODULE
+};
+
+static int __init rpfilter_mt_init(void)
+{
+ return xt_register_match(&rpfilter_mt_reg);
+}
+
+static void __exit rpfilter_mt_exit(void)
+{
+ xt_unregister_match(&rpfilter_mt_reg);
+}
+
+module_init(rpfilter_mt_init);
+module_exit(rpfilter_mt_exit);
diff --git a/net/ipv6/netfilter/ip6t_rt.c b/net/ipv6/netfilter/ip6t_rt.c
index 54d7d14134fd..4ad8b2032f1f 100644
--- a/net/ipv6/netfilter/ip6t_rt.c
+++ b/net/ipv6/netfilter/ip6t_rt.c
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Kernel module to match ROUTING parameters. */
/* (C) 2001-2002 Andras Kis-Szabo <kisza@sch.bme.hu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
-
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/ipv6.h>
@@ -16,130 +13,87 @@
#include <asm/byteorder.h>
+#include <linux/netfilter/x_tables.h>
#include <linux/netfilter_ipv6/ip6_tables.h>
#include <linux/netfilter_ipv6/ip6t_rt.h>
MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("IPv6 RT match");
+MODULE_DESCRIPTION("Xtables: IPv6 Routing Header match");
MODULE_AUTHOR("Andras Kis-Szabo <kisza@sch.bme.hu>");
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
/* Returns 1 if the id is matched by the range, 0 otherwise */
-static inline int
-segsleft_match(u_int32_t min, u_int32_t max, u_int32_t id, int invert)
+static inline bool
+segsleft_match(u_int32_t min, u_int32_t max, u_int32_t id, bool invert)
{
- int r = 0;
- DEBUGP("rt segsleft_match:%c 0x%x <= 0x%x <= 0x%x",
- invert ? '!' : ' ', min, id, max);
- r = (id >= min && id <= max) ^ invert;
- DEBUGP(" result %s\n", r ? "PASS" : "FAILED");
- return r;
+ return (id >= min && id <= max) ^ invert;
}
-static int
-match(const struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- const struct xt_match *match,
- const void *matchinfo,
- int offset,
- unsigned int protoff,
- int *hotdrop)
+static bool rt_mt6(const struct sk_buff *skb, struct xt_action_param *par)
{
- struct ipv6_rt_hdr _route, *rh;
- const struct ip6t_rt *rtinfo = matchinfo;
+ struct ipv6_rt_hdr _route;
+ const struct ipv6_rt_hdr *rh;
+ const struct ip6t_rt *rtinfo = par->matchinfo;
unsigned int temp;
- unsigned int ptr;
+ unsigned int ptr = 0;
unsigned int hdrlen = 0;
- unsigned int ret = 0;
- struct in6_addr *ap, _addr;
+ bool ret = false;
+ struct in6_addr _addr;
+ const struct in6_addr *ap;
int err;
- err = ipv6_find_hdr(skb, &ptr, NEXTHDR_ROUTING, NULL);
+ err = ipv6_find_hdr(skb, &ptr, NEXTHDR_ROUTING, NULL, NULL);
if (err < 0) {
if (err != -ENOENT)
- *hotdrop = 1;
- return 0;
+ par->hotdrop = true;
+ return false;
}
rh = skb_header_pointer(skb, ptr, sizeof(_route), &_route);
if (rh == NULL) {
- *hotdrop = 1;
- return 0;
+ par->hotdrop = true;
+ return false;
}
hdrlen = ipv6_optlen(rh);
if (skb->len - ptr < hdrlen) {
/* Pcket smaller than its length field */
- return 0;
+ return false;
}
- DEBUGP("IPv6 RT LEN %u %u ", hdrlen, rh->hdrlen);
- DEBUGP("TYPE %04X ", rh->type);
- DEBUGP("SGS_LEFT %u %02X\n", rh->segments_left, rh->segments_left);
-
- DEBUGP("IPv6 RT segsleft %02X ",
- (segsleft_match(rtinfo->segsleft[0], rtinfo->segsleft[1],
- rh->segments_left,
- !!(rtinfo->invflags & IP6T_RT_INV_SGS))));
- DEBUGP("type %02X %02X %02X ",
- rtinfo->rt_type, rh->type,
- (!(rtinfo->flags & IP6T_RT_TYP) ||
- ((rtinfo->rt_type == rh->type) ^
- !!(rtinfo->invflags & IP6T_RT_INV_TYP))));
- DEBUGP("len %02X %04X %02X ",
- rtinfo->hdrlen, hdrlen,
- (!(rtinfo->flags & IP6T_RT_LEN) ||
- ((rtinfo->hdrlen == hdrlen) ^
- !!(rtinfo->invflags & IP6T_RT_INV_LEN))));
- DEBUGP("res %02X %02X %02X ",
- (rtinfo->flags & IP6T_RT_RES),
- ((struct rt0_hdr *)rh)->reserved,
- !((rtinfo->flags & IP6T_RT_RES) &&
- (((struct rt0_hdr *)rh)->reserved)));
-
- ret = (rh != NULL)
- &&
- (segsleft_match(rtinfo->segsleft[0], rtinfo->segsleft[1],
+ ret = (segsleft_match(rtinfo->segsleft[0], rtinfo->segsleft[1],
rh->segments_left,
- !!(rtinfo->invflags & IP6T_RT_INV_SGS)))
- &&
+ !!(rtinfo->invflags & IP6T_RT_INV_SGS))) &&
(!(rtinfo->flags & IP6T_RT_LEN) ||
((rtinfo->hdrlen == hdrlen) ^
- !!(rtinfo->invflags & IP6T_RT_INV_LEN)))
- &&
+ !!(rtinfo->invflags & IP6T_RT_INV_LEN))) &&
(!(rtinfo->flags & IP6T_RT_TYP) ||
((rtinfo->rt_type == rh->type) ^
!!(rtinfo->invflags & IP6T_RT_INV_TYP)));
if (ret && (rtinfo->flags & IP6T_RT_RES)) {
- u_int32_t *rp, _reserved;
+ const u_int32_t *rp;
+ u_int32_t _reserved;
rp = skb_header_pointer(skb,
ptr + offsetof(struct rt0_hdr,
reserved),
sizeof(_reserved),
&_reserved);
+ if (!rp) {
+ par->hotdrop = true;
+ return false;
+ }
ret = (*rp == 0);
}
- DEBUGP("#%d ", rtinfo->addrnr);
if (!(rtinfo->flags & IP6T_RT_FST)) {
return ret;
} else if (rtinfo->flags & IP6T_RT_FST_NSTRICT) {
- DEBUGP("Not strict ");
if (rtinfo->addrnr > (unsigned int)((hdrlen - 8) / 16)) {
- DEBUGP("There isn't enough space\n");
- return 0;
+ return false;
} else {
unsigned int i = 0;
- DEBUGP("#%d ", rtinfo->addrnr);
for (temp = 0;
temp < (unsigned int)((hdrlen - 8) / 16);
temp++) {
@@ -150,28 +104,25 @@ match(const struct sk_buff *skb,
sizeof(_addr),
&_addr);
- BUG_ON(ap == NULL);
+ if (ap == NULL) {
+ par->hotdrop = true;
+ return false;
+ }
- if (ipv6_addr_equal(ap, &rtinfo->addrs[i])) {
- DEBUGP("i=%d temp=%d;\n", i, temp);
+ if (ipv6_addr_equal(ap, &rtinfo->addrs[i]))
i++;
- }
if (i == rtinfo->addrnr)
break;
}
- DEBUGP("i=%d #%d\n", i, rtinfo->addrnr);
if (i == rtinfo->addrnr)
return ret;
else
- return 0;
+ return false;
}
} else {
- DEBUGP("Strict ");
if (rtinfo->addrnr > (unsigned int)((hdrlen - 8) / 16)) {
- DEBUGP("There isn't enough space\n");
- return 0;
+ return false;
} else {
- DEBUGP("#%d ", rtinfo->addrnr);
for (temp = 0; temp < rtinfo->addrnr; temp++) {
ap = skb_header_pointer(skb,
ptr
@@ -179,65 +130,62 @@ match(const struct sk_buff *skb,
+ temp * sizeof(_addr),
sizeof(_addr),
&_addr);
- BUG_ON(ap == NULL);
+ if (ap == NULL) {
+ par->hotdrop = true;
+ return false;
+ }
if (!ipv6_addr_equal(ap, &rtinfo->addrs[temp]))
break;
}
- DEBUGP("temp=%d #%d\n", temp, rtinfo->addrnr);
- if ((temp == rtinfo->addrnr) &&
- (temp == (unsigned int)((hdrlen - 8) / 16)))
+ if (temp == rtinfo->addrnr &&
+ temp == (unsigned int)((hdrlen - 8) / 16))
return ret;
else
- return 0;
+ return false;
}
}
- return 0;
+ return false;
}
-/* Called when user tries to insert an entry of this type. */
-static int
-checkentry(const char *tablename,
- const void *entry,
- const struct xt_match *match,
- void *matchinfo,
- unsigned int hook_mask)
+static int rt_mt6_check(const struct xt_mtchk_param *par)
{
- const struct ip6t_rt *rtinfo = matchinfo;
+ const struct ip6t_rt *rtinfo = par->matchinfo;
if (rtinfo->invflags & ~IP6T_RT_INV_MASK) {
- DEBUGP("ip6t_rt: unknown flags %X\n", rtinfo->invflags);
- return 0;
+ pr_debug("unknown flags %X\n", rtinfo->invflags);
+ return -EINVAL;
}
if ((rtinfo->flags & (IP6T_RT_RES | IP6T_RT_FST_MASK)) &&
(!(rtinfo->flags & IP6T_RT_TYP) ||
(rtinfo->rt_type != 0) ||
(rtinfo->invflags & IP6T_RT_INV_TYP))) {
- DEBUGP("`--rt-type 0' required before `--rt-0-*'");
- return 0;
+ pr_debug("`--rt-type 0' required before `--rt-0-*'");
+ return -EINVAL;
}
- return 1;
+ return 0;
}
-static struct ip6t_match rt_match = {
+static struct xt_match rt_mt6_reg __read_mostly = {
.name = "rt",
- .match = match,
+ .family = NFPROTO_IPV6,
+ .match = rt_mt6,
.matchsize = sizeof(struct ip6t_rt),
- .checkentry = checkentry,
+ .checkentry = rt_mt6_check,
.me = THIS_MODULE,
};
-static int __init ip6t_rt_init(void)
+static int __init rt_mt6_init(void)
{
- return ip6t_register_match(&rt_match);
+ return xt_register_match(&rt_mt6_reg);
}
-static void __exit ip6t_rt_fini(void)
+static void __exit rt_mt6_exit(void)
{
- ip6t_unregister_match(&rt_match);
+ xt_unregister_match(&rt_mt6_reg);
}
-module_init(ip6t_rt_init);
-module_exit(ip6t_rt_fini);
+module_init(rt_mt6_init);
+module_exit(rt_mt6_exit);
diff --git a/net/ipv6/netfilter/ip6t_srh.c b/net/ipv6/netfilter/ip6t_srh.c
new file mode 100644
index 000000000000..db0fd64d8986
--- /dev/null
+++ b/net/ipv6/netfilter/ip6t_srh.c
@@ -0,0 +1,320 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Kernel module to match Segment Routing Header (SRH) parameters. */
+
+/* Author:
+ * Ahmed Abdelsalam <amsalam20@gmail.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ipv6.h>
+#include <linux/types.h>
+#include <net/ipv6.h>
+#include <net/seg6.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_ipv6/ip6t_srh.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+
+/* Test a struct->mt_invflags and a boolean for inequality */
+#define NF_SRH_INVF(ptr, flag, boolean) \
+ ((boolean) ^ !!((ptr)->mt_invflags & (flag)))
+
+static bool srh_mt6(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct ip6t_srh *srhinfo = par->matchinfo;
+ struct ipv6_sr_hdr *srh;
+ struct ipv6_sr_hdr _srh;
+ int hdrlen, srhoff = 0;
+
+ if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
+ return false;
+ srh = skb_header_pointer(skb, srhoff, sizeof(_srh), &_srh);
+ if (!srh)
+ return false;
+
+ hdrlen = ipv6_optlen(srh);
+ if (skb->len - srhoff < hdrlen)
+ return false;
+
+ if (srh->type != IPV6_SRCRT_TYPE_4)
+ return false;
+
+ if (srh->segments_left > srh->first_segment)
+ return false;
+
+ /* Next Header matching */
+ if (srhinfo->mt_flags & IP6T_SRH_NEXTHDR)
+ if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_NEXTHDR,
+ !(srh->nexthdr == srhinfo->next_hdr)))
+ return false;
+
+ /* Header Extension Length matching */
+ if (srhinfo->mt_flags & IP6T_SRH_LEN_EQ)
+ if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LEN_EQ,
+ !(srh->hdrlen == srhinfo->hdr_len)))
+ return false;
+
+ if (srhinfo->mt_flags & IP6T_SRH_LEN_GT)
+ if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LEN_GT,
+ !(srh->hdrlen > srhinfo->hdr_len)))
+ return false;
+
+ if (srhinfo->mt_flags & IP6T_SRH_LEN_LT)
+ if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LEN_LT,
+ !(srh->hdrlen < srhinfo->hdr_len)))
+ return false;
+
+ /* Segments Left matching */
+ if (srhinfo->mt_flags & IP6T_SRH_SEGS_EQ)
+ if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_SEGS_EQ,
+ !(srh->segments_left == srhinfo->segs_left)))
+ return false;
+
+ if (srhinfo->mt_flags & IP6T_SRH_SEGS_GT)
+ if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_SEGS_GT,
+ !(srh->segments_left > srhinfo->segs_left)))
+ return false;
+
+ if (srhinfo->mt_flags & IP6T_SRH_SEGS_LT)
+ if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_SEGS_LT,
+ !(srh->segments_left < srhinfo->segs_left)))
+ return false;
+
+ /**
+ * Last Entry matching
+ * Last_Entry field was introduced in revision 6 of the SRH draft.
+ * It was called First_Segment in the previous revision
+ */
+ if (srhinfo->mt_flags & IP6T_SRH_LAST_EQ)
+ if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LAST_EQ,
+ !(srh->first_segment == srhinfo->last_entry)))
+ return false;
+
+ if (srhinfo->mt_flags & IP6T_SRH_LAST_GT)
+ if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LAST_GT,
+ !(srh->first_segment > srhinfo->last_entry)))
+ return false;
+
+ if (srhinfo->mt_flags & IP6T_SRH_LAST_LT)
+ if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LAST_LT,
+ !(srh->first_segment < srhinfo->last_entry)))
+ return false;
+
+ /**
+ * Tag matchig
+ * Tag field was introduced in revision 6 of the SRH draft.
+ */
+ if (srhinfo->mt_flags & IP6T_SRH_TAG)
+ if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_TAG,
+ !(srh->tag == srhinfo->tag)))
+ return false;
+ return true;
+}
+
+static bool srh1_mt6(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ int hdrlen, psidoff, nsidoff, lsidoff, srhoff = 0;
+ const struct ip6t_srh1 *srhinfo = par->matchinfo;
+ struct in6_addr *psid, *nsid, *lsid;
+ struct in6_addr _psid, _nsid, _lsid;
+ struct ipv6_sr_hdr *srh;
+ struct ipv6_sr_hdr _srh;
+
+ if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
+ return false;
+ srh = skb_header_pointer(skb, srhoff, sizeof(_srh), &_srh);
+ if (!srh)
+ return false;
+
+ hdrlen = ipv6_optlen(srh);
+ if (skb->len - srhoff < hdrlen)
+ return false;
+
+ if (srh->type != IPV6_SRCRT_TYPE_4)
+ return false;
+
+ if (srh->segments_left > srh->first_segment)
+ return false;
+
+ /* Next Header matching */
+ if (srhinfo->mt_flags & IP6T_SRH_NEXTHDR)
+ if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_NEXTHDR,
+ !(srh->nexthdr == srhinfo->next_hdr)))
+ return false;
+
+ /* Header Extension Length matching */
+ if (srhinfo->mt_flags & IP6T_SRH_LEN_EQ)
+ if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LEN_EQ,
+ !(srh->hdrlen == srhinfo->hdr_len)))
+ return false;
+ if (srhinfo->mt_flags & IP6T_SRH_LEN_GT)
+ if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LEN_GT,
+ !(srh->hdrlen > srhinfo->hdr_len)))
+ return false;
+ if (srhinfo->mt_flags & IP6T_SRH_LEN_LT)
+ if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LEN_LT,
+ !(srh->hdrlen < srhinfo->hdr_len)))
+ return false;
+
+ /* Segments Left matching */
+ if (srhinfo->mt_flags & IP6T_SRH_SEGS_EQ)
+ if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_SEGS_EQ,
+ !(srh->segments_left == srhinfo->segs_left)))
+ return false;
+ if (srhinfo->mt_flags & IP6T_SRH_SEGS_GT)
+ if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_SEGS_GT,
+ !(srh->segments_left > srhinfo->segs_left)))
+ return false;
+ if (srhinfo->mt_flags & IP6T_SRH_SEGS_LT)
+ if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_SEGS_LT,
+ !(srh->segments_left < srhinfo->segs_left)))
+ return false;
+
+ /**
+ * Last Entry matching
+ * Last_Entry field was introduced in revision 6 of the SRH draft.
+ * It was called First_Segment in the previous revision
+ */
+ if (srhinfo->mt_flags & IP6T_SRH_LAST_EQ)
+ if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LAST_EQ,
+ !(srh->first_segment == srhinfo->last_entry)))
+ return false;
+ if (srhinfo->mt_flags & IP6T_SRH_LAST_GT)
+ if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LAST_GT,
+ !(srh->first_segment > srhinfo->last_entry)))
+ return false;
+ if (srhinfo->mt_flags & IP6T_SRH_LAST_LT)
+ if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LAST_LT,
+ !(srh->first_segment < srhinfo->last_entry)))
+ return false;
+
+ /**
+ * Tag matchig
+ * Tag field was introduced in revision 6 of the SRH draft
+ */
+ if (srhinfo->mt_flags & IP6T_SRH_TAG)
+ if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_TAG,
+ !(srh->tag == srhinfo->tag)))
+ return false;
+
+ /* Previous SID matching */
+ if (srhinfo->mt_flags & IP6T_SRH_PSID) {
+ if (srh->segments_left == srh->first_segment)
+ return false;
+ psidoff = srhoff + sizeof(struct ipv6_sr_hdr) +
+ ((srh->segments_left + 1) * sizeof(struct in6_addr));
+ psid = skb_header_pointer(skb, psidoff, sizeof(_psid), &_psid);
+ if (!psid)
+ return false;
+ if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_PSID,
+ ipv6_masked_addr_cmp(psid, &srhinfo->psid_msk,
+ &srhinfo->psid_addr)))
+ return false;
+ }
+
+ /* Next SID matching */
+ if (srhinfo->mt_flags & IP6T_SRH_NSID) {
+ if (srh->segments_left == 0)
+ return false;
+ nsidoff = srhoff + sizeof(struct ipv6_sr_hdr) +
+ ((srh->segments_left - 1) * sizeof(struct in6_addr));
+ nsid = skb_header_pointer(skb, nsidoff, sizeof(_nsid), &_nsid);
+ if (!nsid)
+ return false;
+ if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_NSID,
+ ipv6_masked_addr_cmp(nsid, &srhinfo->nsid_msk,
+ &srhinfo->nsid_addr)))
+ return false;
+ }
+
+ /* Last SID matching */
+ if (srhinfo->mt_flags & IP6T_SRH_LSID) {
+ lsidoff = srhoff + sizeof(struct ipv6_sr_hdr);
+ lsid = skb_header_pointer(skb, lsidoff, sizeof(_lsid), &_lsid);
+ if (!lsid)
+ return false;
+ if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LSID,
+ ipv6_masked_addr_cmp(lsid, &srhinfo->lsid_msk,
+ &srhinfo->lsid_addr)))
+ return false;
+ }
+ return true;
+}
+
+static int srh_mt6_check(const struct xt_mtchk_param *par)
+{
+ const struct ip6t_srh *srhinfo = par->matchinfo;
+
+ if (srhinfo->mt_flags & ~IP6T_SRH_MASK) {
+ pr_info_ratelimited("unknown srh match flags %X\n",
+ srhinfo->mt_flags);
+ return -EINVAL;
+ }
+
+ if (srhinfo->mt_invflags & ~IP6T_SRH_INV_MASK) {
+ pr_info_ratelimited("unknown srh invflags %X\n",
+ srhinfo->mt_invflags);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int srh1_mt6_check(const struct xt_mtchk_param *par)
+{
+ const struct ip6t_srh1 *srhinfo = par->matchinfo;
+
+ if (srhinfo->mt_flags & ~IP6T_SRH_MASK) {
+ pr_info_ratelimited("unknown srh match flags %X\n",
+ srhinfo->mt_flags);
+ return -EINVAL;
+ }
+
+ if (srhinfo->mt_invflags & ~IP6T_SRH_INV_MASK) {
+ pr_info_ratelimited("unknown srh invflags %X\n",
+ srhinfo->mt_invflags);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static struct xt_match srh_mt6_reg[] __read_mostly = {
+ {
+ .name = "srh",
+ .revision = 0,
+ .family = NFPROTO_IPV6,
+ .match = srh_mt6,
+ .matchsize = sizeof(struct ip6t_srh),
+ .checkentry = srh_mt6_check,
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "srh",
+ .revision = 1,
+ .family = NFPROTO_IPV6,
+ .match = srh1_mt6,
+ .matchsize = sizeof(struct ip6t_srh1),
+ .checkentry = srh1_mt6_check,
+ .me = THIS_MODULE,
+ }
+};
+
+static int __init srh_mt6_init(void)
+{
+ return xt_register_matches(srh_mt6_reg, ARRAY_SIZE(srh_mt6_reg));
+}
+
+static void __exit srh_mt6_exit(void)
+{
+ xt_unregister_matches(srh_mt6_reg, ARRAY_SIZE(srh_mt6_reg));
+}
+
+module_init(srh_mt6_init);
+module_exit(srh_mt6_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Xtables: IPv6 Segment Routing Header match");
+MODULE_AUTHOR("Ahmed Abdelsalam <amsalam20@gmail.com>");
diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c
index 2fc07c74decf..e8992693e14a 100644
--- a/net/ipv6/netfilter/ip6table_filter.c
+++ b/net/ipv6/netfilter/ip6table_filter.c
@@ -1,197 +1,108 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* This is the 1999 rewrite of IP Firewalling, aiming for kernel 2.3.x.
*
* Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
* Copyright (C) 2000-2004 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <linux/slab.h>
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
MODULE_DESCRIPTION("ip6tables filter table");
-#define FILTER_VALID_HOOKS ((1 << NF_IP6_LOCAL_IN) | (1 << NF_IP6_FORWARD) | (1 << NF_IP6_LOCAL_OUT))
+#define FILTER_VALID_HOOKS ((1 << NF_INET_LOCAL_IN) | \
+ (1 << NF_INET_FORWARD) | \
+ (1 << NF_INET_LOCAL_OUT))
-/* Standard entry. */
-struct ip6t_standard
-{
- struct ip6t_entry entry;
- struct ip6t_standard_target target;
+static const struct xt_table packet_filter = {
+ .name = "filter",
+ .valid_hooks = FILTER_VALID_HOOKS,
+ .me = THIS_MODULE,
+ .af = NFPROTO_IPV6,
+ .priority = NF_IP6_PRI_FILTER,
};
-struct ip6t_error_target
-{
- struct ip6t_entry_target target;
- char errorname[IP6T_FUNCTION_MAXNAMELEN];
-};
+static struct nf_hook_ops *filter_ops __read_mostly;
-struct ip6t_error
-{
- struct ip6t_entry entry;
- struct ip6t_error_target target;
-};
+/* Default to forward because I got too much mail already. */
+static bool forward = true;
+module_param(forward, bool, 0000);
-static struct
+static int ip6table_filter_table_init(struct net *net)
{
- struct ip6t_replace repl;
- struct ip6t_standard entries[3];
- struct ip6t_error term;
-} initial_table __initdata
-= { { "filter", FILTER_VALID_HOOKS, 4,
- sizeof(struct ip6t_standard) * 3 + sizeof(struct ip6t_error),
- { [NF_IP6_LOCAL_IN] = 0,
- [NF_IP6_FORWARD] = sizeof(struct ip6t_standard),
- [NF_IP6_LOCAL_OUT] = sizeof(struct ip6t_standard) * 2 },
- { [NF_IP6_LOCAL_IN] = 0,
- [NF_IP6_FORWARD] = sizeof(struct ip6t_standard),
- [NF_IP6_LOCAL_OUT] = sizeof(struct ip6t_standard) * 2 },
- 0, NULL, { } },
- {
- /* LOCAL_IN */
- { { { { { { 0 } } }, { { { 0 } } }, { { { 0 } } }, { { { 0 } } }, "", "", { 0 }, { 0 }, 0, 0, 0 },
- 0,
- sizeof(struct ip6t_entry),
- sizeof(struct ip6t_standard),
- 0, { 0, 0 }, { } },
- { { { { IP6T_ALIGN(sizeof(struct ip6t_standard_target)), "" } }, { } },
- -NF_ACCEPT - 1 } },
- /* FORWARD */
- { { { { { { 0 } } }, { { { 0 } } }, { { { 0 } } }, { { { 0 } } }, "", "", { 0 }, { 0 }, 0, 0, 0 },
- 0,
- sizeof(struct ip6t_entry),
- sizeof(struct ip6t_standard),
- 0, { 0, 0 }, { } },
- { { { { IP6T_ALIGN(sizeof(struct ip6t_standard_target)), "" } }, { } },
- -NF_ACCEPT - 1 } },
- /* LOCAL_OUT */
- { { { { { { 0 } } }, { { { 0 } } }, { { { 0 } } }, { { { 0 } } }, "", "", { 0 }, { 0 }, 0, 0, 0 },
- 0,
- sizeof(struct ip6t_entry),
- sizeof(struct ip6t_standard),
- 0, { 0, 0 }, { } },
- { { { { IP6T_ALIGN(sizeof(struct ip6t_standard_target)), "" } }, { } },
- -NF_ACCEPT - 1 } }
- },
- /* ERROR */
- { { { { { { 0 } } }, { { { 0 } } }, { { { 0 } } }, { { { 0 } } }, "", "", { 0 }, { 0 }, 0, 0, 0 },
- 0,
- sizeof(struct ip6t_entry),
- sizeof(struct ip6t_error),
- 0, { 0, 0 }, { } },
- { { { { IP6T_ALIGN(sizeof(struct ip6t_error_target)), IP6T_ERROR_TARGET } },
- { } },
- "ERROR"
- }
- }
-};
+ struct ip6t_replace *repl;
+ int err;
-static struct ip6t_table packet_filter = {
- .name = "filter",
- .valid_hooks = FILTER_VALID_HOOKS,
- .lock = RW_LOCK_UNLOCKED,
- .me = THIS_MODULE,
- .af = AF_INET6,
-};
+ repl = ip6t_alloc_initial_table(&packet_filter);
+ if (repl == NULL)
+ return -ENOMEM;
+ /* Entry 1 is the FORWARD hook */
+ ((struct ip6t_standard *)repl->entries)[1].target.verdict =
+ forward ? -NF_ACCEPT - 1 : NF_DROP - 1;
-/* The work comes in here from netfilter.c. */
-static unsigned int
-ip6t_hook(unsigned int hook,
- struct sk_buff **pskb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
+ err = ip6t_register_table(net, &packet_filter, repl, filter_ops);
+ kfree(repl);
+ return err;
+}
+
+static int __net_init ip6table_filter_net_init(struct net *net)
{
- return ip6t_do_table(pskb, hook, in, out, &packet_filter);
+ if (!forward)
+ return ip6table_filter_table_init(net);
+
+ return 0;
}
-static unsigned int
-ip6t_local_out_hook(unsigned int hook,
- struct sk_buff **pskb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
+static void __net_exit ip6table_filter_net_pre_exit(struct net *net)
{
-#if 0
- /* root is playing with raw sockets. */
- if ((*pskb)->len < sizeof(struct iphdr)
- || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) {
- if (net_ratelimit())
- printk("ip6t_hook: happy cracking.\n");
- return NF_ACCEPT;
- }
-#endif
+ ip6t_unregister_table_pre_exit(net, "filter");
+}
- return ip6t_do_table(pskb, hook, in, out, &packet_filter);
+static void __net_exit ip6table_filter_net_exit(struct net *net)
+{
+ ip6t_unregister_table_exit(net, "filter");
}
-static struct nf_hook_ops ip6t_ops[] = {
- {
- .hook = ip6t_hook,
- .owner = THIS_MODULE,
- .pf = PF_INET6,
- .hooknum = NF_IP6_LOCAL_IN,
- .priority = NF_IP6_PRI_FILTER,
- },
- {
- .hook = ip6t_hook,
- .owner = THIS_MODULE,
- .pf = PF_INET6,
- .hooknum = NF_IP6_FORWARD,
- .priority = NF_IP6_PRI_FILTER,
- },
- {
- .hook = ip6t_local_out_hook,
- .owner = THIS_MODULE,
- .pf = PF_INET6,
- .hooknum = NF_IP6_LOCAL_OUT,
- .priority = NF_IP6_PRI_FILTER,
- },
+static struct pernet_operations ip6table_filter_net_ops = {
+ .init = ip6table_filter_net_init,
+ .pre_exit = ip6table_filter_net_pre_exit,
+ .exit = ip6table_filter_net_exit,
};
-/* Default to forward because I got too much mail already. */
-static int forward = NF_ACCEPT;
-module_param(forward, bool, 0000);
-
static int __init ip6table_filter_init(void)
{
- int ret;
+ int ret = xt_register_template(&packet_filter,
+ ip6table_filter_table_init);
- if (forward < 0 || forward > NF_MAX_VERDICT) {
- printk("iptables forward must be 0 or 1\n");
- return -EINVAL;
- }
-
- /* Entry 1 is the FORWARD hook */
- initial_table.entries[1].target.verdict = -forward - 1;
-
- /* Register table */
- ret = ip6t_register_table(&packet_filter, &initial_table.repl);
if (ret < 0)
return ret;
- /* Register hooks */
- ret = nf_register_hooks(ip6t_ops, ARRAY_SIZE(ip6t_ops));
- if (ret < 0)
- goto cleanup_table;
+ filter_ops = xt_hook_ops_alloc(&packet_filter, ip6t_do_table);
+ if (IS_ERR(filter_ops)) {
+ xt_unregister_template(&packet_filter);
+ return PTR_ERR(filter_ops);
+ }
- return ret;
+ ret = register_pernet_subsys(&ip6table_filter_net_ops);
+ if (ret < 0) {
+ xt_unregister_template(&packet_filter);
+ kfree(filter_ops);
+ return ret;
+ }
- cleanup_table:
- ip6t_unregister_table(&packet_filter);
return ret;
}
static void __exit ip6table_filter_fini(void)
{
- nf_unregister_hooks(ip6t_ops, ARRAY_SIZE(ip6t_ops));
- ip6t_unregister_table(&packet_filter);
+ unregister_pernet_subsys(&ip6table_filter_net_ops);
+ xt_unregister_template(&packet_filter);
+ kfree(filter_ops);
}
module_init(ip6table_filter_init);
diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c
index 6250e86a6ddc..8dd4cd0c47bd 100644
--- a/net/ipv6/netfilter/ip6table_mangle.c
+++ b/net/ipv6/netfilter/ip6table_mangle.c
@@ -1,253 +1,136 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* IPv6 packet mangling table, a port of the IPv4 mangle table to IPv6
*
* Copyright (C) 2000-2001 by Harald Welte <laforge@gnumonks.org>
* Copyright (C) 2000-2004 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Extended to all five netfilter hooks by Brad Chapman & Harald Welte
*/
#include <linux/module.h>
#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <linux/slab.h>
+#include <net/ipv6.h>
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
MODULE_DESCRIPTION("ip6tables mangle table");
-#define MANGLE_VALID_HOOKS ((1 << NF_IP6_PRE_ROUTING) | \
- (1 << NF_IP6_LOCAL_IN) | \
- (1 << NF_IP6_FORWARD) | \
- (1 << NF_IP6_LOCAL_OUT) | \
- (1 << NF_IP6_POST_ROUTING))
-
-#if 0
-#define DEBUGP(x, args...) printk(KERN_DEBUG x, ## args)
-#else
-#define DEBUGP(x, args...)
-#endif
-
-/* Standard entry. */
-struct ip6t_standard
-{
- struct ip6t_entry entry;
- struct ip6t_standard_target target;
-};
-
-struct ip6t_error_target
-{
- struct ip6t_entry_target target;
- char errorname[IP6T_FUNCTION_MAXNAMELEN];
-};
-
-struct ip6t_error
-{
- struct ip6t_entry entry;
- struct ip6t_error_target target;
-};
-
-static struct
-{
- struct ip6t_replace repl;
- struct ip6t_standard entries[5];
- struct ip6t_error term;
-} initial_table __initdata
-= { { "mangle", MANGLE_VALID_HOOKS, 6,
- sizeof(struct ip6t_standard) * 5 + sizeof(struct ip6t_error),
- { [NF_IP6_PRE_ROUTING] = 0,
- [NF_IP6_LOCAL_IN] = sizeof(struct ip6t_standard),
- [NF_IP6_FORWARD] = sizeof(struct ip6t_standard) * 2,
- [NF_IP6_LOCAL_OUT] = sizeof(struct ip6t_standard) * 3,
- [NF_IP6_POST_ROUTING] = sizeof(struct ip6t_standard) * 4},
- { [NF_IP6_PRE_ROUTING] = 0,
- [NF_IP6_LOCAL_IN] = sizeof(struct ip6t_standard),
- [NF_IP6_FORWARD] = sizeof(struct ip6t_standard) * 2,
- [NF_IP6_LOCAL_OUT] = sizeof(struct ip6t_standard) * 3,
- [NF_IP6_POST_ROUTING] = sizeof(struct ip6t_standard) * 4},
- 0, NULL, { } },
- {
- /* PRE_ROUTING */
- { { { { { { 0 } } }, { { { 0 } } }, { { { 0 } } }, { { { 0 } } }, "", "", { 0 }, { 0 }, 0, 0, 0 },
- 0,
- sizeof(struct ip6t_entry),
- sizeof(struct ip6t_standard),
- 0, { 0, 0 }, { } },
- { { { { IP6T_ALIGN(sizeof(struct ip6t_standard_target)), "" } }, { } },
- -NF_ACCEPT - 1 } },
- /* LOCAL_IN */
- { { { { { { 0 } } }, { { { 0 } } }, { { { 0 } } }, { { { 0 } } }, "", "", { 0 }, { 0 }, 0, 0, 0 },
- 0,
- sizeof(struct ip6t_entry),
- sizeof(struct ip6t_standard),
- 0, { 0, 0 }, { } },
- { { { { IP6T_ALIGN(sizeof(struct ip6t_standard_target)), "" } }, { } },
- -NF_ACCEPT - 1 } },
- /* FORWARD */
- { { { { { { 0 } } }, { { { 0 } } }, { { { 0 } } }, { { { 0 } } }, "", "", { 0 }, { 0 }, 0, 0, 0 },
- 0,
- sizeof(struct ip6t_entry),
- sizeof(struct ip6t_standard),
- 0, { 0, 0 }, { } },
- { { { { IP6T_ALIGN(sizeof(struct ip6t_standard_target)), "" } }, { } },
- -NF_ACCEPT - 1 } },
- /* LOCAL_OUT */
- { { { { { { 0 } } }, { { { 0 } } }, { { { 0 } } }, { { { 0 } } }, "", "", { 0 }, { 0 }, 0, 0, 0 },
- 0,
- sizeof(struct ip6t_entry),
- sizeof(struct ip6t_standard),
- 0, { 0, 0 }, { } },
- { { { { IP6T_ALIGN(sizeof(struct ip6t_standard_target)), "" } }, { } },
- -NF_ACCEPT - 1 } },
- /* POST_ROUTING */
- { { { { { { 0 } } }, { { { 0 } } }, { { { 0 } } }, { { { 0 } } }, "", "", { 0 }, { 0 }, 0, 0, 0 },
- 0,
- sizeof(struct ip6t_entry),
- sizeof(struct ip6t_standard),
- 0, { 0, 0 }, { } },
- { { { { IP6T_ALIGN(sizeof(struct ip6t_standard_target)), "" } }, { } },
- -NF_ACCEPT - 1 } }
- },
- /* ERROR */
- { { { { { { 0 } } }, { { { 0 } } }, { { { 0 } } }, { { { 0 } } }, "", "", { 0 }, { 0 }, 0, 0, 0 },
- 0,
- sizeof(struct ip6t_entry),
- sizeof(struct ip6t_error),
- 0, { 0, 0 }, { } },
- { { { { IP6T_ALIGN(sizeof(struct ip6t_error_target)), IP6T_ERROR_TARGET } },
- { } },
- "ERROR"
- }
- }
-};
+#define MANGLE_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | \
+ (1 << NF_INET_LOCAL_IN) | \
+ (1 << NF_INET_FORWARD) | \
+ (1 << NF_INET_LOCAL_OUT) | \
+ (1 << NF_INET_POST_ROUTING))
-static struct ip6t_table packet_mangler = {
+static const struct xt_table packet_mangler = {
.name = "mangle",
.valid_hooks = MANGLE_VALID_HOOKS,
- .lock = RW_LOCK_UNLOCKED,
.me = THIS_MODULE,
- .af = AF_INET6,
+ .af = NFPROTO_IPV6,
+ .priority = NF_IP6_PRI_MANGLE,
};
-/* The work comes in here from netfilter.c. */
static unsigned int
-ip6t_route_hook(unsigned int hook,
- struct sk_buff **pskb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
+ip6t_mangle_out(void *priv, struct sk_buff *skb, const struct nf_hook_state *state)
{
- return ip6t_do_table(pskb, hook, in, out, &packet_mangler);
-}
-
-static unsigned int
-ip6t_local_hook(unsigned int hook,
- struct sk_buff **pskb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
-{
-
- unsigned int ret;
struct in6_addr saddr, daddr;
- u_int8_t hop_limit;
- u_int32_t flowlabel, mark;
-
-#if 0
- /* root is playing with raw sockets. */
- if ((*pskb)->len < sizeof(struct iphdr)
- || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) {
- if (net_ratelimit())
- printk("ip6t_hook: happy cracking.\n");
- return NF_ACCEPT;
- }
-#endif
+ unsigned int ret, verdict;
+ u32 flowlabel, mark;
+ u8 hop_limit;
+ int err;
/* save source/dest address, mark, hoplimit, flowlabel, priority, */
- memcpy(&saddr, &(*pskb)->nh.ipv6h->saddr, sizeof(saddr));
- memcpy(&daddr, &(*pskb)->nh.ipv6h->daddr, sizeof(daddr));
- mark = (*pskb)->mark;
- hop_limit = (*pskb)->nh.ipv6h->hop_limit;
+ memcpy(&saddr, &ipv6_hdr(skb)->saddr, sizeof(saddr));
+ memcpy(&daddr, &ipv6_hdr(skb)->daddr, sizeof(daddr));
+ mark = skb->mark;
+ hop_limit = ipv6_hdr(skb)->hop_limit;
/* flowlabel and prio (includes version, which shouldn't change either */
- flowlabel = *((u_int32_t *) (*pskb)->nh.ipv6h);
+ flowlabel = *((u_int32_t *)ipv6_hdr(skb));
+
+ ret = ip6t_do_table(priv, skb, state);
+ verdict = ret & NF_VERDICT_MASK;
+
+ if (verdict != NF_DROP && verdict != NF_STOLEN &&
+ (!ipv6_addr_equal(&ipv6_hdr(skb)->saddr, &saddr) ||
+ !ipv6_addr_equal(&ipv6_hdr(skb)->daddr, &daddr) ||
+ skb->mark != mark ||
+ ipv6_hdr(skb)->hop_limit != hop_limit ||
+ flowlabel != *((u_int32_t *)ipv6_hdr(skb)))) {
+ err = ip6_route_me_harder(state->net, state->sk, skb);
+ if (err < 0)
+ ret = NF_DROP_ERR(err);
+ }
- ret = ip6t_do_table(pskb, hook, in, out, &packet_mangler);
+ return ret;
+}
- if (ret != NF_DROP && ret != NF_STOLEN
- && (memcmp(&(*pskb)->nh.ipv6h->saddr, &saddr, sizeof(saddr))
- || memcmp(&(*pskb)->nh.ipv6h->daddr, &daddr, sizeof(daddr))
- || (*pskb)->mark != mark
- || (*pskb)->nh.ipv6h->hop_limit != hop_limit))
- return ip6_route_me_harder(*pskb) == 0 ? ret : NF_DROP;
+/* The work comes in here from netfilter.c. */
+static unsigned int
+ip6table_mangle_hook(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ if (state->hook == NF_INET_LOCAL_OUT)
+ return ip6t_mangle_out(priv, skb, state);
+ return ip6t_do_table(priv, skb, state);
+}
+static struct nf_hook_ops *mangle_ops __read_mostly;
+static int ip6table_mangle_table_init(struct net *net)
+{
+ struct ip6t_replace *repl;
+ int ret;
+
+ repl = ip6t_alloc_initial_table(&packet_mangler);
+ if (repl == NULL)
+ return -ENOMEM;
+ ret = ip6t_register_table(net, &packet_mangler, repl, mangle_ops);
+ kfree(repl);
return ret;
}
-static struct nf_hook_ops ip6t_ops[] = {
- {
- .hook = ip6t_route_hook,
- .owner = THIS_MODULE,
- .pf = PF_INET6,
- .hooknum = NF_IP6_PRE_ROUTING,
- .priority = NF_IP6_PRI_MANGLE,
- },
- {
- .hook = ip6t_local_hook,
- .owner = THIS_MODULE,
- .pf = PF_INET6,
- .hooknum = NF_IP6_LOCAL_IN,
- .priority = NF_IP6_PRI_MANGLE,
- },
- {
- .hook = ip6t_route_hook,
- .owner = THIS_MODULE,
- .pf = PF_INET6,
- .hooknum = NF_IP6_FORWARD,
- .priority = NF_IP6_PRI_MANGLE,
- },
- {
- .hook = ip6t_local_hook,
- .owner = THIS_MODULE,
- .pf = PF_INET6,
- .hooknum = NF_IP6_LOCAL_OUT,
- .priority = NF_IP6_PRI_MANGLE,
- },
- {
- .hook = ip6t_route_hook,
- .owner = THIS_MODULE,
- .pf = PF_INET6,
- .hooknum = NF_IP6_POST_ROUTING,
- .priority = NF_IP6_PRI_MANGLE,
- },
+static void __net_exit ip6table_mangle_net_pre_exit(struct net *net)
+{
+ ip6t_unregister_table_pre_exit(net, "mangle");
+}
+
+static void __net_exit ip6table_mangle_net_exit(struct net *net)
+{
+ ip6t_unregister_table_exit(net, "mangle");
+}
+
+static struct pernet_operations ip6table_mangle_net_ops = {
+ .pre_exit = ip6table_mangle_net_pre_exit,
+ .exit = ip6table_mangle_net_exit,
};
static int __init ip6table_mangle_init(void)
{
- int ret;
+ int ret = xt_register_template(&packet_mangler,
+ ip6table_mangle_table_init);
- /* Register table */
- ret = ip6t_register_table(&packet_mangler, &initial_table.repl);
if (ret < 0)
return ret;
- /* Register hooks */
- ret = nf_register_hooks(ip6t_ops, ARRAY_SIZE(ip6t_ops));
- if (ret < 0)
- goto cleanup_table;
+ mangle_ops = xt_hook_ops_alloc(&packet_mangler, ip6table_mangle_hook);
+ if (IS_ERR(mangle_ops)) {
+ xt_unregister_template(&packet_mangler);
+ return PTR_ERR(mangle_ops);
+ }
- return ret;
+ ret = register_pernet_subsys(&ip6table_mangle_net_ops);
+ if (ret < 0) {
+ xt_unregister_template(&packet_mangler);
+ kfree(mangle_ops);
+ return ret;
+ }
- cleanup_table:
- ip6t_unregister_table(&packet_mangler);
return ret;
}
static void __exit ip6table_mangle_fini(void)
{
- nf_unregister_hooks(ip6t_ops, ARRAY_SIZE(ip6t_ops));
- ip6t_unregister_table(&packet_mangler);
+ unregister_pernet_subsys(&ip6table_mangle_net_ops);
+ xt_unregister_template(&packet_mangler);
+ kfree(mangle_ops);
}
module_init(ip6table_mangle_init);
diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c
new file mode 100644
index 000000000000..e119d4f090cc
--- /dev/null
+++ b/net/ipv6/netfilter/ip6table_nat.c
@@ -0,0 +1,177 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2011 Patrick McHardy <kaber@trash.net>
+ *
+ * Based on Rusty Russell's IPv4 NAT code. Development of IPv6 NAT
+ * funded by Astaro.
+ */
+
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <linux/ipv6.h>
+#include <net/ipv6.h>
+
+#include <net/netfilter/nf_nat.h>
+
+struct ip6table_nat_pernet {
+ struct nf_hook_ops *nf_nat_ops;
+};
+
+static unsigned int ip6table_nat_net_id __read_mostly;
+
+static const struct xt_table nf_nat_ipv6_table = {
+ .name = "nat",
+ .valid_hooks = (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_POST_ROUTING) |
+ (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_LOCAL_IN),
+ .me = THIS_MODULE,
+ .af = NFPROTO_IPV6,
+};
+
+static const struct nf_hook_ops nf_nat_ipv6_ops[] = {
+ {
+ .hook = ip6t_do_table,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_PRE_ROUTING,
+ .priority = NF_IP6_PRI_NAT_DST,
+ },
+ {
+ .hook = ip6t_do_table,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_POST_ROUTING,
+ .priority = NF_IP6_PRI_NAT_SRC,
+ },
+ {
+ .hook = ip6t_do_table,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_LOCAL_OUT,
+ .priority = NF_IP6_PRI_NAT_DST,
+ },
+ {
+ .hook = ip6t_do_table,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_LOCAL_IN,
+ .priority = NF_IP6_PRI_NAT_SRC,
+ },
+};
+
+static int ip6t_nat_register_lookups(struct net *net)
+{
+ struct ip6table_nat_pernet *xt_nat_net;
+ struct nf_hook_ops *ops;
+ struct xt_table *table;
+ int i, ret;
+
+ table = xt_find_table(net, NFPROTO_IPV6, "nat");
+ if (WARN_ON_ONCE(!table))
+ return -ENOENT;
+
+ xt_nat_net = net_generic(net, ip6table_nat_net_id);
+ ops = kmemdup(nf_nat_ipv6_ops, sizeof(nf_nat_ipv6_ops), GFP_KERNEL);
+ if (!ops)
+ return -ENOMEM;
+
+ for (i = 0; i < ARRAY_SIZE(nf_nat_ipv6_ops); i++) {
+ ops[i].priv = table;
+ ret = nf_nat_ipv6_register_fn(net, &ops[i]);
+ if (ret) {
+ while (i)
+ nf_nat_ipv6_unregister_fn(net, &ops[--i]);
+
+ kfree(ops);
+ return ret;
+ }
+ }
+
+ xt_nat_net->nf_nat_ops = ops;
+ return 0;
+}
+
+static void ip6t_nat_unregister_lookups(struct net *net)
+{
+ struct ip6table_nat_pernet *xt_nat_net = net_generic(net, ip6table_nat_net_id);
+ struct nf_hook_ops *ops = xt_nat_net->nf_nat_ops;
+ int i;
+
+ if (!ops)
+ return;
+
+ for (i = 0; i < ARRAY_SIZE(nf_nat_ipv6_ops); i++)
+ nf_nat_ipv6_unregister_fn(net, &ops[i]);
+
+ kfree(ops);
+}
+
+static int ip6table_nat_table_init(struct net *net)
+{
+ struct ip6t_replace *repl;
+ int ret;
+
+ repl = ip6t_alloc_initial_table(&nf_nat_ipv6_table);
+ if (repl == NULL)
+ return -ENOMEM;
+ ret = ip6t_register_table(net, &nf_nat_ipv6_table, repl,
+ NULL);
+ if (ret < 0) {
+ kfree(repl);
+ return ret;
+ }
+
+ ret = ip6t_nat_register_lookups(net);
+ if (ret < 0)
+ ip6t_unregister_table_exit(net, "nat");
+
+ kfree(repl);
+ return ret;
+}
+
+static void __net_exit ip6table_nat_net_pre_exit(struct net *net)
+{
+ ip6t_nat_unregister_lookups(net);
+}
+
+static void __net_exit ip6table_nat_net_exit(struct net *net)
+{
+ ip6t_unregister_table_exit(net, "nat");
+}
+
+static struct pernet_operations ip6table_nat_net_ops = {
+ .pre_exit = ip6table_nat_net_pre_exit,
+ .exit = ip6table_nat_net_exit,
+ .id = &ip6table_nat_net_id,
+ .size = sizeof(struct ip6table_nat_pernet),
+};
+
+static int __init ip6table_nat_init(void)
+{
+ int ret;
+
+ /* net->gen->ptr[ip6table_nat_net_id] must be allocated
+ * before calling ip6t_nat_register_lookups().
+ */
+ ret = register_pernet_subsys(&ip6table_nat_net_ops);
+ if (ret < 0)
+ return ret;
+
+ ret = xt_register_template(&nf_nat_ipv6_table,
+ ip6table_nat_table_init);
+ if (ret)
+ unregister_pernet_subsys(&ip6table_nat_net_ops);
+
+ return ret;
+}
+
+static void __exit ip6table_nat_exit(void)
+{
+ xt_unregister_template(&nf_nat_ipv6_table);
+ unregister_pernet_subsys(&ip6table_nat_net_ops);
+}
+
+module_init(ip6table_nat_init);
+module_exit(ip6table_nat_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Ip6tables legacy nat table");
diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c
index b4154da575c0..fc9f6754028f 100644
--- a/net/ipv6/netfilter/ip6table_raw.c
+++ b/net/ipv6/netfilter/ip6table_raw.c
@@ -1,174 +1,109 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* IPv6 raw table, a port of the IPv4 raw table to IPv6
*
- * Copyright (C) 2003 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ * Copyright (C) 2003 Jozsef Kadlecsik <kadlec@netfilter.org>
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <linux/slab.h>
-#define RAW_VALID_HOOKS ((1 << NF_IP6_PRE_ROUTING) | (1 << NF_IP6_LOCAL_OUT))
+#define RAW_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT))
-#if 0
-#define DEBUGP(x, args...) printk(KERN_DEBUG x, ## args)
-#else
-#define DEBUGP(x, args...)
-#endif
+static bool raw_before_defrag __read_mostly;
+MODULE_PARM_DESC(raw_before_defrag, "Enable raw table before defrag");
+module_param(raw_before_defrag, bool, 0000);
-/* Standard entry. */
-struct ip6t_standard
-{
- struct ip6t_entry entry;
- struct ip6t_standard_target target;
+static const struct xt_table packet_raw = {
+ .name = "raw",
+ .valid_hooks = RAW_VALID_HOOKS,
+ .me = THIS_MODULE,
+ .af = NFPROTO_IPV6,
+ .priority = NF_IP6_PRI_RAW,
};
-struct ip6t_error_target
-{
- struct ip6t_entry_target target;
- char errorname[IP6T_FUNCTION_MAXNAMELEN];
+static const struct xt_table packet_raw_before_defrag = {
+ .name = "raw",
+ .valid_hooks = RAW_VALID_HOOKS,
+ .me = THIS_MODULE,
+ .af = NFPROTO_IPV6,
+ .priority = NF_IP6_PRI_RAW_BEFORE_DEFRAG,
};
-struct ip6t_error
-{
- struct ip6t_entry entry;
- struct ip6t_error_target target;
-};
+static struct nf_hook_ops *rawtable_ops __read_mostly;
-static struct
+static int ip6table_raw_table_init(struct net *net)
{
- struct ip6t_replace repl;
- struct ip6t_standard entries[2];
- struct ip6t_error term;
-} initial_table __initdata = {
- .repl = {
- .name = "raw",
- .valid_hooks = RAW_VALID_HOOKS,
- .num_entries = 3,
- .size = sizeof(struct ip6t_standard) * 2 + sizeof(struct ip6t_error),
- .hook_entry = {
- [NF_IP6_PRE_ROUTING] = 0,
- [NF_IP6_LOCAL_OUT] = sizeof(struct ip6t_standard)
- },
- .underflow = {
- [NF_IP6_PRE_ROUTING] = 0,
- [NF_IP6_LOCAL_OUT] = sizeof(struct ip6t_standard)
- },
- },
- .entries = {
- /* PRE_ROUTING */
- {
- .entry = {
- .target_offset = sizeof(struct ip6t_entry),
- .next_offset = sizeof(struct ip6t_standard),
- },
- .target = {
- .target = {
- .u = {
- .target_size = IP6T_ALIGN(sizeof(struct ip6t_standard_target)),
- },
- },
- .verdict = -NF_ACCEPT - 1,
- },
- },
-
- /* LOCAL_OUT */
- {
- .entry = {
- .target_offset = sizeof(struct ip6t_entry),
- .next_offset = sizeof(struct ip6t_standard),
- },
- .target = {
- .target = {
- .u = {
- .target_size = IP6T_ALIGN(sizeof(struct ip6t_standard_target)),
- },
- },
- .verdict = -NF_ACCEPT - 1,
- },
- },
- },
- /* ERROR */
- .term = {
- .entry = {
- .target_offset = sizeof(struct ip6t_entry),
- .next_offset = sizeof(struct ip6t_error),
- },
- .target = {
- .target = {
- .u = {
- .user = {
- .target_size = IP6T_ALIGN(sizeof(struct ip6t_error_target)),
- .name = IP6T_ERROR_TARGET,
- },
- },
- },
- .errorname = "ERROR",
- },
- }
-};
+ struct ip6t_replace *repl;
+ const struct xt_table *table = &packet_raw;
+ int ret;
-static struct xt_table packet_raw = {
- .name = "raw",
- .valid_hooks = RAW_VALID_HOOKS,
- .lock = RW_LOCK_UNLOCKED,
- .me = THIS_MODULE,
- .af = AF_INET6,
-};
+ if (raw_before_defrag)
+ table = &packet_raw_before_defrag;
+
+ repl = ip6t_alloc_initial_table(table);
+ if (repl == NULL)
+ return -ENOMEM;
+ ret = ip6t_register_table(net, table, repl, rawtable_ops);
+ kfree(repl);
+ return ret;
+}
+
+static void __net_exit ip6table_raw_net_pre_exit(struct net *net)
+{
+ ip6t_unregister_table_pre_exit(net, "raw");
+}
-/* The work comes in here from netfilter.c. */
-static unsigned int
-ip6t_hook(unsigned int hook,
- struct sk_buff **pskb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
+static void __net_exit ip6table_raw_net_exit(struct net *net)
{
- return ip6t_do_table(pskb, hook, in, out, &packet_raw);
+ ip6t_unregister_table_exit(net, "raw");
}
-static struct nf_hook_ops ip6t_ops[] = {
- {
- .hook = ip6t_hook,
- .pf = PF_INET6,
- .hooknum = NF_IP6_PRE_ROUTING,
- .priority = NF_IP6_PRI_FIRST,
- .owner = THIS_MODULE,
- },
- {
- .hook = ip6t_hook,
- .pf = PF_INET6,
- .hooknum = NF_IP6_LOCAL_OUT,
- .priority = NF_IP6_PRI_FIRST,
- .owner = THIS_MODULE,
- },
+static struct pernet_operations ip6table_raw_net_ops = {
+ .pre_exit = ip6table_raw_net_pre_exit,
+ .exit = ip6table_raw_net_exit,
};
static int __init ip6table_raw_init(void)
{
+ const struct xt_table *table = &packet_raw;
int ret;
- /* Register table */
- ret = ip6t_register_table(&packet_raw, &initial_table.repl);
+ if (raw_before_defrag) {
+ table = &packet_raw_before_defrag;
+ pr_info("Enabling raw table before defrag\n");
+ }
+
+ ret = xt_register_template(table, ip6table_raw_table_init);
if (ret < 0)
return ret;
/* Register hooks */
- ret = nf_register_hooks(ip6t_ops, ARRAY_SIZE(ip6t_ops));
- if (ret < 0)
- goto cleanup_table;
+ rawtable_ops = xt_hook_ops_alloc(table, ip6t_do_table);
+ if (IS_ERR(rawtable_ops)) {
+ xt_unregister_template(table);
+ return PTR_ERR(rawtable_ops);
+ }
- return ret;
+ ret = register_pernet_subsys(&ip6table_raw_net_ops);
+ if (ret < 0) {
+ kfree(rawtable_ops);
+ xt_unregister_template(table);
+ return ret;
+ }
- cleanup_table:
- ip6t_unregister_table(&packet_raw);
return ret;
}
static void __exit ip6table_raw_fini(void)
{
- nf_unregister_hooks(ip6t_ops, ARRAY_SIZE(ip6t_ops));
- ip6t_unregister_table(&packet_raw);
+ unregister_pernet_subsys(&ip6table_raw_net_ops);
+ xt_unregister_template(&packet_raw);
+ kfree(rawtable_ops);
}
module_init(ip6table_raw_init);
module_exit(ip6table_raw_fini);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Ip6tables legacy raw table");
diff --git a/net/ipv6/netfilter/ip6table_security.c b/net/ipv6/netfilter/ip6table_security.c
new file mode 100644
index 000000000000..4df14a9bae78
--- /dev/null
+++ b/net/ipv6/netfilter/ip6table_security.c
@@ -0,0 +1,97 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * "security" table for IPv6
+ *
+ * This is for use by Mandatory Access Control (MAC) security models,
+ * which need to be able to manage security policy in separate context
+ * to DAC.
+ *
+ * Based on iptable_mangle.c
+ *
+ * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
+ * Copyright (C) 2000-2004 Netfilter Core Team <coreteam <at> netfilter.org>
+ * Copyright (C) 2008 Red Hat, Inc., James Morris <jmorris <at> redhat.com>
+ */
+#include <linux/module.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <linux/slab.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("James Morris <jmorris <at> redhat.com>");
+MODULE_DESCRIPTION("ip6tables security table, for MAC rules");
+
+#define SECURITY_VALID_HOOKS (1 << NF_INET_LOCAL_IN) | \
+ (1 << NF_INET_FORWARD) | \
+ (1 << NF_INET_LOCAL_OUT)
+
+static const struct xt_table security_table = {
+ .name = "security",
+ .valid_hooks = SECURITY_VALID_HOOKS,
+ .me = THIS_MODULE,
+ .af = NFPROTO_IPV6,
+ .priority = NF_IP6_PRI_SECURITY,
+};
+
+static struct nf_hook_ops *sectbl_ops __read_mostly;
+
+static int ip6table_security_table_init(struct net *net)
+{
+ struct ip6t_replace *repl;
+ int ret;
+
+ repl = ip6t_alloc_initial_table(&security_table);
+ if (repl == NULL)
+ return -ENOMEM;
+ ret = ip6t_register_table(net, &security_table, repl, sectbl_ops);
+ kfree(repl);
+ return ret;
+}
+
+static void __net_exit ip6table_security_net_pre_exit(struct net *net)
+{
+ ip6t_unregister_table_pre_exit(net, "security");
+}
+
+static void __net_exit ip6table_security_net_exit(struct net *net)
+{
+ ip6t_unregister_table_exit(net, "security");
+}
+
+static struct pernet_operations ip6table_security_net_ops = {
+ .pre_exit = ip6table_security_net_pre_exit,
+ .exit = ip6table_security_net_exit,
+};
+
+static int __init ip6table_security_init(void)
+{
+ int ret = xt_register_template(&security_table,
+ ip6table_security_table_init);
+
+ if (ret < 0)
+ return ret;
+
+ sectbl_ops = xt_hook_ops_alloc(&security_table, ip6t_do_table);
+ if (IS_ERR(sectbl_ops)) {
+ xt_unregister_template(&security_table);
+ return PTR_ERR(sectbl_ops);
+ }
+
+ ret = register_pernet_subsys(&ip6table_security_net_ops);
+ if (ret < 0) {
+ kfree(sectbl_ops);
+ xt_unregister_template(&security_table);
+ return ret;
+ }
+
+ return ret;
+}
+
+static void __exit ip6table_security_fini(void)
+{
+ unregister_pernet_subsys(&ip6table_security_net_ops);
+ xt_unregister_template(&security_table);
+ kfree(sectbl_ops);
+}
+
+module_init(ip6table_security_init);
+module_exit(ip6table_security_fini);
diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
deleted file mode 100644
index a20615ffccff..000000000000
--- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
+++ /dev/null
@@ -1,492 +0,0 @@
-/*
- * Copyright (C)2004 USAGI/WIDE Project
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Author:
- * Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
- *
- * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
- * - support Layer 3 protocol independent connection tracking.
- * Based on the original ip_conntrack code which had the following
- * copyright information:
- * (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- *
- * 23 Mar 2004: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
- * - add get_features() to support various size of conntrack
- * structures.
- */
-
-#include <linux/types.h>
-#include <linux/ipv6.h>
-#include <linux/in6.h>
-#include <linux/netfilter.h>
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/icmp.h>
-#include <linux/sysctl.h>
-#include <net/ipv6.h>
-
-#include <linux/netfilter_ipv6.h>
-#include <net/netfilter/nf_conntrack.h>
-#include <net/netfilter/nf_conntrack_helper.h>
-#include <net/netfilter/nf_conntrack_l4proto.h>
-#include <net/netfilter/nf_conntrack_l3proto.h>
-#include <net/netfilter/nf_conntrack_core.h>
-
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
-static int ipv6_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff,
- struct nf_conntrack_tuple *tuple)
-{
- u_int32_t _addrs[8], *ap;
-
- ap = skb_header_pointer(skb, nhoff + offsetof(struct ipv6hdr, saddr),
- sizeof(_addrs), _addrs);
- if (ap == NULL)
- return 0;
-
- memcpy(tuple->src.u3.ip6, ap, sizeof(tuple->src.u3.ip6));
- memcpy(tuple->dst.u3.ip6, ap + 4, sizeof(tuple->dst.u3.ip6));
-
- return 1;
-}
-
-static int ipv6_invert_tuple(struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_tuple *orig)
-{
- memcpy(tuple->src.u3.ip6, orig->dst.u3.ip6, sizeof(tuple->src.u3.ip6));
- memcpy(tuple->dst.u3.ip6, orig->src.u3.ip6, sizeof(tuple->dst.u3.ip6));
-
- return 1;
-}
-
-static int ipv6_print_tuple(struct seq_file *s,
- const struct nf_conntrack_tuple *tuple)
-{
- return seq_printf(s, "src=" NIP6_FMT " dst=" NIP6_FMT " ",
- NIP6(*((struct in6_addr *)tuple->src.u3.ip6)),
- NIP6(*((struct in6_addr *)tuple->dst.u3.ip6)));
-}
-
-static int ipv6_print_conntrack(struct seq_file *s,
- const struct nf_conn *conntrack)
-{
- return 0;
-}
-
-/*
- * Based on ipv6_skip_exthdr() in net/ipv6/exthdr.c
- *
- * This function parses (probably truncated) exthdr set "hdr"
- * of length "len". "nexthdrp" initially points to some place,
- * where type of the first header can be found.
- *
- * It skips all well-known exthdrs, and returns pointer to the start
- * of unparsable area i.e. the first header with unknown type.
- * if success, *nexthdr is updated by type/protocol of this header.
- *
- * NOTES: - it may return pointer pointing beyond end of packet,
- * if the last recognized header is truncated in the middle.
- * - if packet is truncated, so that all parsed headers are skipped,
- * it returns -1.
- * - if packet is fragmented, return pointer of the fragment header.
- * - ESP is unparsable for now and considered like
- * normal payload protocol.
- * - Note also special handling of AUTH header. Thanks to IPsec wizards.
- */
-
-int nf_ct_ipv6_skip_exthdr(struct sk_buff *skb, int start, u8 *nexthdrp,
- int len)
-{
- u8 nexthdr = *nexthdrp;
-
- while (ipv6_ext_hdr(nexthdr)) {
- struct ipv6_opt_hdr hdr;
- int hdrlen;
-
- if (len < (int)sizeof(struct ipv6_opt_hdr))
- return -1;
- if (nexthdr == NEXTHDR_NONE)
- break;
- if (nexthdr == NEXTHDR_FRAGMENT)
- break;
- if (skb_copy_bits(skb, start, &hdr, sizeof(hdr)))
- BUG();
- if (nexthdr == NEXTHDR_AUTH)
- hdrlen = (hdr.hdrlen+2)<<2;
- else
- hdrlen = ipv6_optlen(&hdr);
-
- nexthdr = hdr.nexthdr;
- len -= hdrlen;
- start += hdrlen;
- }
-
- *nexthdrp = nexthdr;
- return start;
-}
-
-static int
-ipv6_prepare(struct sk_buff **pskb, unsigned int hooknum, unsigned int *dataoff,
- u_int8_t *protonum)
-{
- unsigned int extoff;
- unsigned char pnum;
- int protoff;
-
- extoff = (u8*)((*pskb)->nh.ipv6h + 1) - (*pskb)->data;
- pnum = (*pskb)->nh.ipv6h->nexthdr;
-
- protoff = nf_ct_ipv6_skip_exthdr(*pskb, extoff, &pnum,
- (*pskb)->len - extoff);
-
- /*
- * (protoff == (*pskb)->len) mean that the packet doesn't have no data
- * except of IPv6 & ext headers. but it's tracked anyway. - YK
- */
- if ((protoff < 0) || (protoff > (*pskb)->len)) {
- DEBUGP("ip6_conntrack_core: can't find proto in pkt\n");
- NF_CT_STAT_INC(error);
- NF_CT_STAT_INC(invalid);
- return -NF_ACCEPT;
- }
-
- *dataoff = protoff;
- *protonum = pnum;
- return NF_ACCEPT;
-}
-
-static u_int32_t ipv6_get_features(const struct nf_conntrack_tuple *tuple)
-{
- return NF_CT_F_BASIC;
-}
-
-static unsigned int ipv6_confirm(unsigned int hooknum,
- struct sk_buff **pskb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
-{
- struct nf_conn *ct;
- struct nf_conn_help *help;
- enum ip_conntrack_info ctinfo;
- unsigned int ret, protoff;
- unsigned int extoff = (u8*)((*pskb)->nh.ipv6h + 1)
- - (*pskb)->data;
- unsigned char pnum = (*pskb)->nh.ipv6h->nexthdr;
-
-
- /* This is where we call the helper: as the packet goes out. */
- ct = nf_ct_get(*pskb, &ctinfo);
- if (!ct || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY)
- goto out;
-
- help = nfct_help(ct);
- if (!help || !help->helper)
- goto out;
-
- protoff = nf_ct_ipv6_skip_exthdr(*pskb, extoff, &pnum,
- (*pskb)->len - extoff);
- if (protoff < 0 || protoff > (*pskb)->len ||
- pnum == NEXTHDR_FRAGMENT) {
- DEBUGP("proto header not found\n");
- return NF_ACCEPT;
- }
-
- ret = help->helper->help(pskb, protoff, ct, ctinfo);
- if (ret != NF_ACCEPT)
- return ret;
-out:
- /* We've seen it coming out the other side: confirm it */
- return nf_conntrack_confirm(pskb);
-}
-
-static unsigned int ipv6_defrag(unsigned int hooknum,
- struct sk_buff **pskb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
-{
- struct sk_buff *reasm;
-
- /* Previously seen (loopback)? */
- if ((*pskb)->nfct)
- return NF_ACCEPT;
-
- reasm = nf_ct_frag6_gather(*pskb);
-
- /* queued */
- if (reasm == NULL)
- return NF_STOLEN;
-
- /* error occured or not fragmented */
- if (reasm == *pskb)
- return NF_ACCEPT;
-
- nf_ct_frag6_output(hooknum, reasm, (struct net_device *)in,
- (struct net_device *)out, okfn);
-
- return NF_STOLEN;
-}
-
-static unsigned int ipv6_conntrack_in(unsigned int hooknum,
- struct sk_buff **pskb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
-{
- struct sk_buff *reasm = (*pskb)->nfct_reasm;
-
- /* This packet is fragmented and has reassembled packet. */
- if (reasm) {
- /* Reassembled packet isn't parsed yet ? */
- if (!reasm->nfct) {
- unsigned int ret;
-
- ret = nf_conntrack_in(PF_INET6, hooknum, &reasm);
- if (ret != NF_ACCEPT)
- return ret;
- }
- nf_conntrack_get(reasm->nfct);
- (*pskb)->nfct = reasm->nfct;
- return NF_ACCEPT;
- }
-
- return nf_conntrack_in(PF_INET6, hooknum, pskb);
-}
-
-static unsigned int ipv6_conntrack_local(unsigned int hooknum,
- struct sk_buff **pskb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
-{
- /* root is playing with raw sockets. */
- if ((*pskb)->len < sizeof(struct ipv6hdr)) {
- if (net_ratelimit())
- printk("ipv6_conntrack_local: packet too short\n");
- return NF_ACCEPT;
- }
- return ipv6_conntrack_in(hooknum, pskb, in, out, okfn);
-}
-
-static struct nf_hook_ops ipv6_conntrack_ops[] = {
- {
- .hook = ipv6_defrag,
- .owner = THIS_MODULE,
- .pf = PF_INET6,
- .hooknum = NF_IP6_PRE_ROUTING,
- .priority = NF_IP6_PRI_CONNTRACK_DEFRAG,
- },
- {
- .hook = ipv6_conntrack_in,
- .owner = THIS_MODULE,
- .pf = PF_INET6,
- .hooknum = NF_IP6_PRE_ROUTING,
- .priority = NF_IP6_PRI_CONNTRACK,
- },
- {
- .hook = ipv6_conntrack_local,
- .owner = THIS_MODULE,
- .pf = PF_INET6,
- .hooknum = NF_IP6_LOCAL_OUT,
- .priority = NF_IP6_PRI_CONNTRACK,
- },
- {
- .hook = ipv6_defrag,
- .owner = THIS_MODULE,
- .pf = PF_INET6,
- .hooknum = NF_IP6_LOCAL_OUT,
- .priority = NF_IP6_PRI_CONNTRACK_DEFRAG,
- },
- {
- .hook = ipv6_confirm,
- .owner = THIS_MODULE,
- .pf = PF_INET6,
- .hooknum = NF_IP6_POST_ROUTING,
- .priority = NF_IP6_PRI_LAST,
- },
- {
- .hook = ipv6_confirm,
- .owner = THIS_MODULE,
- .pf = PF_INET6,
- .hooknum = NF_IP6_LOCAL_IN,
- .priority = NF_IP6_PRI_LAST-1,
- },
-};
-
-#ifdef CONFIG_SYSCTL
-static ctl_table nf_ct_ipv6_sysctl_table[] = {
- {
- .ctl_name = NET_NF_CONNTRACK_FRAG6_TIMEOUT,
- .procname = "nf_conntrack_frag6_timeout",
- .data = &nf_ct_frag6_timeout,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- },
- {
- .ctl_name = NET_NF_CONNTRACK_FRAG6_LOW_THRESH,
- .procname = "nf_conntrack_frag6_low_thresh",
- .data = &nf_ct_frag6_low_thresh,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_NF_CONNTRACK_FRAG6_HIGH_THRESH,
- .procname = "nf_conntrack_frag6_high_thresh",
- .data = &nf_ct_frag6_high_thresh,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- { .ctl_name = 0 }
-};
-#endif
-
-#if defined(CONFIG_NF_CT_NETLINK) || \
- defined(CONFIG_NF_CT_NETLINK_MODULE)
-
-#include <linux/netfilter/nfnetlink.h>
-#include <linux/netfilter/nfnetlink_conntrack.h>
-
-static int ipv6_tuple_to_nfattr(struct sk_buff *skb,
- const struct nf_conntrack_tuple *tuple)
-{
- NFA_PUT(skb, CTA_IP_V6_SRC, sizeof(u_int32_t) * 4,
- &tuple->src.u3.ip6);
- NFA_PUT(skb, CTA_IP_V6_DST, sizeof(u_int32_t) * 4,
- &tuple->dst.u3.ip6);
- return 0;
-
-nfattr_failure:
- return -1;
-}
-
-static const size_t cta_min_ip[CTA_IP_MAX] = {
- [CTA_IP_V6_SRC-1] = sizeof(u_int32_t)*4,
- [CTA_IP_V6_DST-1] = sizeof(u_int32_t)*4,
-};
-
-static int ipv6_nfattr_to_tuple(struct nfattr *tb[],
- struct nf_conntrack_tuple *t)
-{
- if (!tb[CTA_IP_V6_SRC-1] || !tb[CTA_IP_V6_DST-1])
- return -EINVAL;
-
- if (nfattr_bad_size(tb, CTA_IP_MAX, cta_min_ip))
- return -EINVAL;
-
- memcpy(&t->src.u3.ip6, NFA_DATA(tb[CTA_IP_V6_SRC-1]),
- sizeof(u_int32_t) * 4);
- memcpy(&t->dst.u3.ip6, NFA_DATA(tb[CTA_IP_V6_DST-1]),
- sizeof(u_int32_t) * 4);
-
- return 0;
-}
-#endif
-
-struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 = {
- .l3proto = PF_INET6,
- .name = "ipv6",
- .pkt_to_tuple = ipv6_pkt_to_tuple,
- .invert_tuple = ipv6_invert_tuple,
- .print_tuple = ipv6_print_tuple,
- .print_conntrack = ipv6_print_conntrack,
- .prepare = ipv6_prepare,
-#if defined(CONFIG_NF_CT_NETLINK) || \
- defined(CONFIG_NF_CT_NETLINK_MODULE)
- .tuple_to_nfattr = ipv6_tuple_to_nfattr,
- .nfattr_to_tuple = ipv6_nfattr_to_tuple,
-#endif
-#ifdef CONFIG_SYSCTL
- .ctl_table_path = nf_net_netfilter_sysctl_path,
- .ctl_table = nf_ct_ipv6_sysctl_table,
-#endif
- .get_features = ipv6_get_features,
- .me = THIS_MODULE,
-};
-
-MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET6));
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Yasuyuki KOZAKAI @USAGI <yasuyuki.kozakai@toshiba.co.jp>");
-
-static int __init nf_conntrack_l3proto_ipv6_init(void)
-{
- int ret = 0;
-
- need_conntrack();
-
- ret = nf_ct_frag6_init();
- if (ret < 0) {
- printk("nf_conntrack_ipv6: can't initialize frag6.\n");
- return ret;
- }
- ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_tcp6);
- if (ret < 0) {
- printk("nf_conntrack_ipv6: can't register tcp.\n");
- goto cleanup_frag6;
- }
-
- ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_udp6);
- if (ret < 0) {
- printk("nf_conntrack_ipv6: can't register udp.\n");
- goto cleanup_tcp;
- }
-
- ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_icmpv6);
- if (ret < 0) {
- printk("nf_conntrack_ipv6: can't register icmpv6.\n");
- goto cleanup_udp;
- }
-
- ret = nf_conntrack_l3proto_register(&nf_conntrack_l3proto_ipv6);
- if (ret < 0) {
- printk("nf_conntrack_ipv6: can't register ipv6\n");
- goto cleanup_icmpv6;
- }
-
- ret = nf_register_hooks(ipv6_conntrack_ops,
- ARRAY_SIZE(ipv6_conntrack_ops));
- if (ret < 0) {
- printk("nf_conntrack_ipv6: can't register pre-routing defrag "
- "hook.\n");
- goto cleanup_ipv6;
- }
- return ret;
-
- cleanup_ipv6:
- nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv6);
- cleanup_icmpv6:
- nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmpv6);
- cleanup_udp:
- nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp6);
- cleanup_tcp:
- nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp6);
- cleanup_frag6:
- nf_ct_frag6_cleanup();
- return ret;
-}
-
-static void __exit nf_conntrack_l3proto_ipv6_fini(void)
-{
- synchronize_net();
- nf_unregister_hooks(ipv6_conntrack_ops, ARRAY_SIZE(ipv6_conntrack_ops));
- nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv6);
- nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmpv6);
- nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp6);
- nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp6);
- nf_ct_frag6_cleanup();
-}
-
-module_init(nf_conntrack_l3proto_ipv6_init);
-module_exit(nf_conntrack_l3proto_ipv6_fini);
diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
deleted file mode 100644
index 1e8e700f6135..000000000000
--- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
+++ /dev/null
@@ -1,341 +0,0 @@
-/*
- * Copyright (C)2003,2004 USAGI/WIDE Project
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Author:
- * Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
- *
- * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
- * - ICMPv6 tracking support. Derived from the original ip_conntrack code
- * net/ipv4/netfilter/ip_conntrack_proto_icmp.c which had the following
- * copyright information:
- * (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- */
-
-#include <linux/types.h>
-#include <linux/sched.h>
-#include <linux/timer.h>
-#include <linux/module.h>
-#include <linux/netfilter.h>
-#include <linux/in6.h>
-#include <linux/icmpv6.h>
-#include <linux/ipv6.h>
-#include <net/ipv6.h>
-#include <net/ip6_checksum.h>
-#include <linux/seq_file.h>
-#include <linux/netfilter_ipv6.h>
-#include <net/netfilter/nf_conntrack_tuple.h>
-#include <net/netfilter/nf_conntrack_l4proto.h>
-#include <net/netfilter/nf_conntrack_core.h>
-#include <net/netfilter/ipv6/nf_conntrack_icmpv6.h>
-
-static unsigned long nf_ct_icmpv6_timeout __read_mostly = 30*HZ;
-
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
-static int icmpv6_pkt_to_tuple(const struct sk_buff *skb,
- unsigned int dataoff,
- struct nf_conntrack_tuple *tuple)
-{
- struct icmp6hdr _hdr, *hp;
-
- hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
- if (hp == NULL)
- return 0;
- tuple->dst.u.icmp.type = hp->icmp6_type;
- tuple->src.u.icmp.id = hp->icmp6_identifier;
- tuple->dst.u.icmp.code = hp->icmp6_code;
-
- return 1;
-}
-
-/* Add 1; spaces filled with 0. */
-static u_int8_t invmap[] = {
- [ICMPV6_ECHO_REQUEST - 128] = ICMPV6_ECHO_REPLY + 1,
- [ICMPV6_ECHO_REPLY - 128] = ICMPV6_ECHO_REQUEST + 1,
- [ICMPV6_NI_QUERY - 128] = ICMPV6_NI_QUERY + 1,
- [ICMPV6_NI_REPLY - 128] = ICMPV6_NI_REPLY +1
-};
-
-static int icmpv6_invert_tuple(struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_tuple *orig)
-{
- int type = orig->dst.u.icmp.type - 128;
- if (type < 0 || type >= sizeof(invmap) || !invmap[type])
- return 0;
-
- tuple->src.u.icmp.id = orig->src.u.icmp.id;
- tuple->dst.u.icmp.type = invmap[type] - 1;
- tuple->dst.u.icmp.code = orig->dst.u.icmp.code;
- return 1;
-}
-
-/* Print out the per-protocol part of the tuple. */
-static int icmpv6_print_tuple(struct seq_file *s,
- const struct nf_conntrack_tuple *tuple)
-{
- return seq_printf(s, "type=%u code=%u id=%u ",
- tuple->dst.u.icmp.type,
- tuple->dst.u.icmp.code,
- ntohs(tuple->src.u.icmp.id));
-}
-
-/* Print out the private part of the conntrack. */
-static int icmpv6_print_conntrack(struct seq_file *s,
- const struct nf_conn *conntrack)
-{
- return 0;
-}
-
-/* Returns verdict for packet, or -1 for invalid. */
-static int icmpv6_packet(struct nf_conn *ct,
- const struct sk_buff *skb,
- unsigned int dataoff,
- enum ip_conntrack_info ctinfo,
- int pf,
- unsigned int hooknum)
-{
- /* Try to delete connection immediately after all replies:
- won't actually vanish as we still have skb, and del_timer
- means this will only run once even if count hits zero twice
- (theoretically possible with SMP) */
- if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) {
- if (atomic_dec_and_test(&ct->proto.icmp.count)
- && del_timer(&ct->timeout))
- ct->timeout.function((unsigned long)ct);
- } else {
- atomic_inc(&ct->proto.icmp.count);
- nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb);
- nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_icmpv6_timeout);
- }
-
- return NF_ACCEPT;
-}
-
-/* Called when a new connection for this protocol found. */
-static int icmpv6_new(struct nf_conn *conntrack,
- const struct sk_buff *skb,
- unsigned int dataoff)
-{
- static u_int8_t valid_new[] = {
- [ICMPV6_ECHO_REQUEST - 128] = 1,
- [ICMPV6_NI_QUERY - 128] = 1
- };
- int type = conntrack->tuplehash[0].tuple.dst.u.icmp.type - 128;
-
- if (type < 0 || type >= sizeof(valid_new) || !valid_new[type]) {
- /* Can't create a new ICMPv6 `conn' with this. */
- DEBUGP("icmpv6: can't create new conn with type %u\n",
- type + 128);
- NF_CT_DUMP_TUPLE(&conntrack->tuplehash[0].tuple);
- return 0;
- }
- atomic_set(&conntrack->proto.icmp.count, 0);
- return 1;
-}
-
-static int
-icmpv6_error_message(struct sk_buff *skb,
- unsigned int icmp6off,
- enum ip_conntrack_info *ctinfo,
- unsigned int hooknum)
-{
- struct nf_conntrack_tuple intuple, origtuple;
- struct nf_conntrack_tuple_hash *h;
- struct icmp6hdr _hdr, *hp;
- unsigned int inip6off;
- struct nf_conntrack_l4proto *inproto;
- u_int8_t inprotonum;
- unsigned int inprotoff;
-
- NF_CT_ASSERT(skb->nfct == NULL);
-
- hp = skb_header_pointer(skb, icmp6off, sizeof(_hdr), &_hdr);
- if (hp == NULL) {
- DEBUGP("icmpv6_error: Can't get ICMPv6 hdr.\n");
- return -NF_ACCEPT;
- }
-
- inip6off = icmp6off + sizeof(_hdr);
- if (skb_copy_bits(skb, inip6off+offsetof(struct ipv6hdr, nexthdr),
- &inprotonum, sizeof(inprotonum)) != 0) {
- DEBUGP("icmpv6_error: Can't get nexthdr in inner IPv6 header.\n");
- return -NF_ACCEPT;
- }
- inprotoff = nf_ct_ipv6_skip_exthdr(skb,
- inip6off + sizeof(struct ipv6hdr),
- &inprotonum,
- skb->len - inip6off
- - sizeof(struct ipv6hdr));
-
- if ((inprotoff < 0) || (inprotoff > skb->len) ||
- (inprotonum == NEXTHDR_FRAGMENT)) {
- DEBUGP("icmpv6_error: Can't get protocol header in ICMPv6 payload.\n");
- return -NF_ACCEPT;
- }
-
- inproto = __nf_ct_l4proto_find(PF_INET6, inprotonum);
-
- /* Are they talking about one of our connections? */
- if (!nf_ct_get_tuple(skb, inip6off, inprotoff, PF_INET6, inprotonum,
- &origtuple, &nf_conntrack_l3proto_ipv6, inproto)) {
- DEBUGP("icmpv6_error: Can't get tuple\n");
- return -NF_ACCEPT;
- }
-
- /* Ordinarily, we'd expect the inverted tupleproto, but it's
- been preserved inside the ICMP. */
- if (!nf_ct_invert_tuple(&intuple, &origtuple,
- &nf_conntrack_l3proto_ipv6, inproto)) {
- DEBUGP("icmpv6_error: Can't invert tuple\n");
- return -NF_ACCEPT;
- }
-
- *ctinfo = IP_CT_RELATED;
-
- h = nf_conntrack_find_get(&intuple, NULL);
- if (!h) {
- DEBUGP("icmpv6_error: no match\n");
- return -NF_ACCEPT;
- } else {
- if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY)
- *ctinfo += IP_CT_IS_REPLY;
- }
-
- /* Update skb to refer to this connection */
- skb->nfct = &nf_ct_tuplehash_to_ctrack(h)->ct_general;
- skb->nfctinfo = *ctinfo;
- return -NF_ACCEPT;
-}
-
-static int
-icmpv6_error(struct sk_buff *skb, unsigned int dataoff,
- enum ip_conntrack_info *ctinfo, int pf, unsigned int hooknum)
-{
- struct icmp6hdr _ih, *icmp6h;
-
- icmp6h = skb_header_pointer(skb, dataoff, sizeof(_ih), &_ih);
- if (icmp6h == NULL) {
- if (LOG_INVALID(IPPROTO_ICMPV6))
- nf_log_packet(PF_INET6, 0, skb, NULL, NULL, NULL,
- "nf_ct_icmpv6: short packet ");
- return -NF_ACCEPT;
- }
-
- if (nf_conntrack_checksum && hooknum == NF_IP6_PRE_ROUTING &&
- nf_ip6_checksum(skb, hooknum, dataoff, IPPROTO_ICMPV6)) {
- nf_log_packet(PF_INET6, 0, skb, NULL, NULL, NULL,
- "nf_ct_icmpv6: ICMPv6 checksum failed\n");
- return -NF_ACCEPT;
- }
-
- /* is not error message ? */
- if (icmp6h->icmp6_type >= 128)
- return NF_ACCEPT;
-
- return icmpv6_error_message(skb, dataoff, ctinfo, hooknum);
-}
-
-#if defined(CONFIG_NF_CT_NETLINK) || \
- defined(CONFIG_NF_CT_NETLINK_MODULE)
-
-#include <linux/netfilter/nfnetlink.h>
-#include <linux/netfilter/nfnetlink_conntrack.h>
-static int icmpv6_tuple_to_nfattr(struct sk_buff *skb,
- const struct nf_conntrack_tuple *t)
-{
- NFA_PUT(skb, CTA_PROTO_ICMPV6_ID, sizeof(u_int16_t),
- &t->src.u.icmp.id);
- NFA_PUT(skb, CTA_PROTO_ICMPV6_TYPE, sizeof(u_int8_t),
- &t->dst.u.icmp.type);
- NFA_PUT(skb, CTA_PROTO_ICMPV6_CODE, sizeof(u_int8_t),
- &t->dst.u.icmp.code);
-
- return 0;
-
-nfattr_failure:
- return -1;
-}
-
-static const size_t cta_min_proto[CTA_PROTO_MAX] = {
- [CTA_PROTO_ICMPV6_TYPE-1] = sizeof(u_int8_t),
- [CTA_PROTO_ICMPV6_CODE-1] = sizeof(u_int8_t),
- [CTA_PROTO_ICMPV6_ID-1] = sizeof(u_int16_t)
-};
-
-static int icmpv6_nfattr_to_tuple(struct nfattr *tb[],
- struct nf_conntrack_tuple *tuple)
-{
- if (!tb[CTA_PROTO_ICMPV6_TYPE-1]
- || !tb[CTA_PROTO_ICMPV6_CODE-1]
- || !tb[CTA_PROTO_ICMPV6_ID-1])
- return -EINVAL;
-
- if (nfattr_bad_size(tb, CTA_PROTO_MAX, cta_min_proto))
- return -EINVAL;
-
- tuple->dst.u.icmp.type =
- *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMPV6_TYPE-1]);
- tuple->dst.u.icmp.code =
- *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMPV6_CODE-1]);
- tuple->src.u.icmp.id =
- *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_ICMPV6_ID-1]);
-
- if (tuple->dst.u.icmp.type < 128
- || tuple->dst.u.icmp.type - 128 >= sizeof(invmap)
- || !invmap[tuple->dst.u.icmp.type - 128])
- return -EINVAL;
-
- return 0;
-}
-#endif
-
-#ifdef CONFIG_SYSCTL
-static struct ctl_table_header *icmpv6_sysctl_header;
-static struct ctl_table icmpv6_sysctl_table[] = {
- {
- .ctl_name = NET_NF_CONNTRACK_ICMPV6_TIMEOUT,
- .procname = "nf_conntrack_icmpv6_timeout",
- .data = &nf_ct_icmpv6_timeout,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- },
- {
- .ctl_name = 0
- }
-};
-#endif /* CONFIG_SYSCTL */
-
-struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6 =
-{
- .l3proto = PF_INET6,
- .l4proto = IPPROTO_ICMPV6,
- .name = "icmpv6",
- .pkt_to_tuple = icmpv6_pkt_to_tuple,
- .invert_tuple = icmpv6_invert_tuple,
- .print_tuple = icmpv6_print_tuple,
- .print_conntrack = icmpv6_print_conntrack,
- .packet = icmpv6_packet,
- .new = icmpv6_new,
- .error = icmpv6_error,
-#if defined(CONFIG_NF_CT_NETLINK) || \
- defined(CONFIG_NF_CT_NETLINK_MODULE)
- .tuple_to_nfattr = icmpv6_tuple_to_nfattr,
- .nfattr_to_tuple = icmpv6_nfattr_to_tuple,
-#endif
-#ifdef CONFIG_SYSCTL
- .ctl_table_header = &icmpv6_sysctl_header,
- .ctl_table = icmpv6_sysctl_table,
-#endif
-};
-
-EXPORT_SYMBOL(nf_conntrack_l4proto_icmpv6);
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index bf93c1ea6be9..64ab23ff559b 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* IPv6 fragment reassembly for connection tracking
*
@@ -7,425 +8,212 @@
* Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
*
* Based on: net/ipv6/reassembly.c
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
+#define pr_fmt(fmt) "IPv6-nf: " fmt
+
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/string.h>
-#include <linux/socket.h>
-#include <linux/sockios.h>
-#include <linux/jiffies.h>
#include <linux/net.h>
-#include <linux/list.h>
#include <linux/netdevice.h>
-#include <linux/in6.h>
#include <linux/ipv6.h>
-#include <linux/icmpv6.h>
-#include <linux/random.h>
-#include <linux/jhash.h>
-
-#include <net/sock.h>
-#include <net/snmp.h>
-
-#include <net/ipv6.h>
-#include <net/protocol.h>
-#include <net/transp_v6.h>
-#include <net/rawv6.h>
-#include <net/ndisc.h>
-#include <net/addrconf.h>
+#include <linux/slab.h>
+
+#include <net/ipv6_frag.h>
+
+#include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
#include <linux/sysctl.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>
#include <linux/kernel.h>
#include <linux/module.h>
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+#include <net/netns/generic.h>
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
-#define NF_CT_FRAG6_HIGH_THRESH 262144 /* == 256*1024 */
-#define NF_CT_FRAG6_LOW_THRESH 196608 /* == 192*1024 */
-#define NF_CT_FRAG6_TIMEOUT IPV6_FRAG_TIMEOUT
-
-unsigned int nf_ct_frag6_high_thresh __read_mostly = 256*1024;
-unsigned int nf_ct_frag6_low_thresh __read_mostly = 192*1024;
-unsigned long nf_ct_frag6_timeout __read_mostly = IPV6_FRAG_TIMEOUT;
-
-struct nf_ct_frag6_skb_cb
-{
- struct inet6_skb_parm h;
- int offset;
- struct sk_buff *orig;
-};
-
-#define NFCT_FRAG6_CB(skb) ((struct nf_ct_frag6_skb_cb*)((skb)->cb))
-
-struct nf_ct_frag6_queue
-{
- struct hlist_node list;
- struct list_head lru_list; /* lru list member */
-
- __u32 id; /* fragment id */
- struct in6_addr saddr;
- struct in6_addr daddr;
-
- spinlock_t lock;
- atomic_t refcnt;
- struct timer_list timer; /* expire timer */
- struct sk_buff *fragments;
- int len;
- int meat;
- struct timeval stamp;
- unsigned int csum;
- __u8 last_in; /* has first/last segment arrived? */
-#define COMPLETE 4
-#define FIRST_IN 2
-#define LAST_IN 1
- __u16 nhoffset;
-};
-
-/* Hash table. */
-
-#define FRAG6Q_HASHSZ 64
-
-static struct hlist_head nf_ct_frag6_hash[FRAG6Q_HASHSZ];
-static DEFINE_RWLOCK(nf_ct_frag6_lock);
-static u32 nf_ct_frag6_hash_rnd;
-static LIST_HEAD(nf_ct_frag6_lru_list);
-int nf_ct_frag6_nqueues = 0;
+static const char nf_frags_cache_name[] = "nf-frags";
-static __inline__ void __fq_unlink(struct nf_ct_frag6_queue *fq)
-{
- hlist_del(&fq->list);
- list_del(&fq->lru_list);
- nf_ct_frag6_nqueues--;
-}
+static unsigned int nf_frag_pernet_id __read_mostly;
+static struct inet_frags nf_frags;
-static __inline__ void fq_unlink(struct nf_ct_frag6_queue *fq)
+static struct nft_ct_frag6_pernet *nf_frag_pernet(struct net *net)
{
- write_lock(&nf_ct_frag6_lock);
- __fq_unlink(fq);
- write_unlock(&nf_ct_frag6_lock);
+ return net_generic(net, nf_frag_pernet_id);
}
-static unsigned int ip6qhashfn(u32 id, struct in6_addr *saddr,
- struct in6_addr *daddr)
-{
- u32 a, b, c;
-
- a = saddr->s6_addr32[0];
- b = saddr->s6_addr32[1];
- c = saddr->s6_addr32[2];
-
- a += JHASH_GOLDEN_RATIO;
- b += JHASH_GOLDEN_RATIO;
- c += nf_ct_frag6_hash_rnd;
- __jhash_mix(a, b, c);
-
- a += saddr->s6_addr32[3];
- b += daddr->s6_addr32[0];
- c += daddr->s6_addr32[1];
- __jhash_mix(a, b, c);
-
- a += daddr->s6_addr32[2];
- b += daddr->s6_addr32[3];
- c += id;
- __jhash_mix(a, b, c);
-
- return c & (FRAG6Q_HASHSZ - 1);
-}
-
-static struct timer_list nf_ct_frag6_secret_timer;
-int nf_ct_frag6_secret_interval = 10 * 60 * HZ;
+#ifdef CONFIG_SYSCTL
+
+static struct ctl_table nf_ct_frag6_sysctl_table[] = {
+ {
+ .procname = "nf_conntrack_frag6_timeout",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "nf_conntrack_frag6_low_thresh",
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ },
+ {
+ .procname = "nf_conntrack_frag6_high_thresh",
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ },
+};
-static void nf_ct_frag6_secret_rebuild(unsigned long dummy)
+static int nf_ct_frag6_sysctl_register(struct net *net)
{
- unsigned long now = jiffies;
- int i;
-
- write_lock(&nf_ct_frag6_lock);
- get_random_bytes(&nf_ct_frag6_hash_rnd, sizeof(u32));
- for (i = 0; i < FRAG6Q_HASHSZ; i++) {
- struct nf_ct_frag6_queue *q;
- struct hlist_node *p, *n;
-
- hlist_for_each_entry_safe(q, p, n, &nf_ct_frag6_hash[i], list) {
- unsigned int hval = ip6qhashfn(q->id,
- &q->saddr,
- &q->daddr);
- if (hval != i) {
- hlist_del(&q->list);
- /* Relink to new hash chain. */
- hlist_add_head(&q->list,
- &nf_ct_frag6_hash[hval]);
- }
- }
+ struct nft_ct_frag6_pernet *nf_frag;
+ struct ctl_table *table;
+ struct ctl_table_header *hdr;
+
+ table = nf_ct_frag6_sysctl_table;
+ if (!net_eq(net, &init_net)) {
+ table = kmemdup(table, sizeof(nf_ct_frag6_sysctl_table),
+ GFP_KERNEL);
+ if (table == NULL)
+ goto err_alloc;
}
- write_unlock(&nf_ct_frag6_lock);
- mod_timer(&nf_ct_frag6_secret_timer, now + nf_ct_frag6_secret_interval);
-}
+ nf_frag = nf_frag_pernet(net);
-atomic_t nf_ct_frag6_mem = ATOMIC_INIT(0);
+ table[0].data = &nf_frag->fqdir->timeout;
+ table[1].data = &nf_frag->fqdir->low_thresh;
+ table[1].extra2 = &nf_frag->fqdir->high_thresh;
+ table[2].data = &nf_frag->fqdir->high_thresh;
+ table[2].extra1 = &nf_frag->fqdir->low_thresh;
-/* Memory Tracking Functions. */
-static inline void frag_kfree_skb(struct sk_buff *skb, unsigned int *work)
-{
- if (work)
- *work -= skb->truesize;
- atomic_sub(skb->truesize, &nf_ct_frag6_mem);
- if (NFCT_FRAG6_CB(skb)->orig)
- kfree_skb(NFCT_FRAG6_CB(skb)->orig);
+ hdr = register_net_sysctl_sz(net, "net/netfilter", table,
+ ARRAY_SIZE(nf_ct_frag6_sysctl_table));
+ if (hdr == NULL)
+ goto err_reg;
- kfree_skb(skb);
-}
+ nf_frag->nf_frag_frags_hdr = hdr;
+ return 0;
-static inline void frag_free_queue(struct nf_ct_frag6_queue *fq,
- unsigned int *work)
-{
- if (work)
- *work -= sizeof(struct nf_ct_frag6_queue);
- atomic_sub(sizeof(struct nf_ct_frag6_queue), &nf_ct_frag6_mem);
- kfree(fq);
+err_reg:
+ if (!net_eq(net, &init_net))
+ kfree(table);
+err_alloc:
+ return -ENOMEM;
}
-static inline struct nf_ct_frag6_queue *frag_alloc_queue(void)
+static void __net_exit nf_ct_frags6_sysctl_unregister(struct net *net)
{
- struct nf_ct_frag6_queue *fq = kmalloc(sizeof(struct nf_ct_frag6_queue), GFP_ATOMIC);
+ struct nft_ct_frag6_pernet *nf_frag = nf_frag_pernet(net);
+ const struct ctl_table *table;
- if (!fq)
- return NULL;
- atomic_add(sizeof(struct nf_ct_frag6_queue), &nf_ct_frag6_mem);
- return fq;
+ table = nf_frag->nf_frag_frags_hdr->ctl_table_arg;
+ unregister_net_sysctl_table(nf_frag->nf_frag_frags_hdr);
+ if (!net_eq(net, &init_net))
+ kfree(table);
}
-/* Destruction primitives. */
-
-/* Complete destruction of fq. */
-static void nf_ct_frag6_destroy(struct nf_ct_frag6_queue *fq,
- unsigned int *work)
+#else
+static int nf_ct_frag6_sysctl_register(struct net *net)
{
- struct sk_buff *fp;
-
- BUG_TRAP(fq->last_in&COMPLETE);
- BUG_TRAP(del_timer(&fq->timer) == 0);
-
- /* Release all fragment data. */
- fp = fq->fragments;
- while (fp) {
- struct sk_buff *xp = fp->next;
-
- frag_kfree_skb(fp, work);
- fp = xp;
- }
-
- frag_free_queue(fq, work);
+ return 0;
}
-
-static __inline__ void fq_put(struct nf_ct_frag6_queue *fq, unsigned int *work)
+static void __net_exit nf_ct_frags6_sysctl_unregister(struct net *net)
{
- if (atomic_dec_and_test(&fq->refcnt))
- nf_ct_frag6_destroy(fq, work);
}
+#endif
-/* Kill fq entry. It is not destroyed immediately,
- * because caller (and someone more) holds reference count.
- */
-static __inline__ void fq_kill(struct nf_ct_frag6_queue *fq)
-{
- if (del_timer(&fq->timer))
- atomic_dec(&fq->refcnt);
-
- if (!(fq->last_in & COMPLETE)) {
- fq_unlink(fq);
- atomic_dec(&fq->refcnt);
- fq->last_in |= COMPLETE;
- }
-}
+static int nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *skb,
+ struct sk_buff *prev_tail, struct net_device *dev,
+ int *refs);
-static void nf_ct_frag6_evictor(void)
+static inline u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h)
{
- struct nf_ct_frag6_queue *fq;
- struct list_head *tmp;
- unsigned int work;
-
- work = atomic_read(&nf_ct_frag6_mem);
- if (work <= nf_ct_frag6_low_thresh)
- return;
-
- work -= nf_ct_frag6_low_thresh;
- while (work > 0) {
- read_lock(&nf_ct_frag6_lock);
- if (list_empty(&nf_ct_frag6_lru_list)) {
- read_unlock(&nf_ct_frag6_lock);
- return;
- }
- tmp = nf_ct_frag6_lru_list.next;
- BUG_ON(tmp == NULL);
- fq = list_entry(tmp, struct nf_ct_frag6_queue, lru_list);
- atomic_inc(&fq->refcnt);
- read_unlock(&nf_ct_frag6_lock);
-
- spin_lock(&fq->lock);
- if (!(fq->last_in&COMPLETE))
- fq_kill(fq);
- spin_unlock(&fq->lock);
-
- fq_put(fq, &work);
- }
+ return 1 << (ipv6_get_dsfield(ipv6h) & INET_ECN_MASK);
}
-static void nf_ct_frag6_expire(unsigned long data)
+static void nf_ct_frag6_expire(struct timer_list *t)
{
- struct nf_ct_frag6_queue *fq = (struct nf_ct_frag6_queue *) data;
+ struct inet_frag_queue *frag = timer_container_of(frag, t, timer);
+ struct frag_queue *fq;
- spin_lock(&fq->lock);
+ fq = container_of(frag, struct frag_queue, q);
- if (fq->last_in & COMPLETE)
- goto out;
-
- fq_kill(fq);
-
-out:
- spin_unlock(&fq->lock);
- fq_put(fq, NULL);
+ ip6frag_expire_frag_queue(fq->q.fqdir->net, fq);
}
/* Creation primitives. */
-
-static struct nf_ct_frag6_queue *nf_ct_frag6_intern(unsigned int hash,
- struct nf_ct_frag6_queue *fq_in)
-{
- struct nf_ct_frag6_queue *fq;
-#ifdef CONFIG_SMP
- struct hlist_node *n;
-#endif
-
- write_lock(&nf_ct_frag6_lock);
-#ifdef CONFIG_SMP
- hlist_for_each_entry(fq, n, &nf_ct_frag6_hash[hash], list) {
- if (fq->id == fq_in->id &&
- ipv6_addr_equal(&fq_in->saddr, &fq->saddr) &&
- ipv6_addr_equal(&fq_in->daddr, &fq->daddr)) {
- atomic_inc(&fq->refcnt);
- write_unlock(&nf_ct_frag6_lock);
- fq_in->last_in |= COMPLETE;
- fq_put(fq_in, NULL);
- return fq;
- }
- }
-#endif
- fq = fq_in;
-
- if (!mod_timer(&fq->timer, jiffies + nf_ct_frag6_timeout))
- atomic_inc(&fq->refcnt);
-
- atomic_inc(&fq->refcnt);
- hlist_add_head(&fq->list, &nf_ct_frag6_hash[hash]);
- INIT_LIST_HEAD(&fq->lru_list);
- list_add_tail(&fq->lru_list, &nf_ct_frag6_lru_list);
- nf_ct_frag6_nqueues++;
- write_unlock(&nf_ct_frag6_lock);
- return fq;
-}
-
-
-static struct nf_ct_frag6_queue *
-nf_ct_frag6_create(unsigned int hash, u32 id, struct in6_addr *src, struct in6_addr *dst)
+static struct frag_queue *fq_find(struct net *net, __be32 id, u32 user,
+ const struct ipv6hdr *hdr, int iif)
{
- struct nf_ct_frag6_queue *fq;
-
- if ((fq = frag_alloc_queue()) == NULL) {
- DEBUGP("Can't alloc new queue\n");
- goto oom;
- }
-
- memset(fq, 0, sizeof(struct nf_ct_frag6_queue));
-
- fq->id = id;
- ipv6_addr_copy(&fq->saddr, src);
- ipv6_addr_copy(&fq->daddr, dst);
-
- init_timer(&fq->timer);
- fq->timer.function = nf_ct_frag6_expire;
- fq->timer.data = (long) fq;
- spin_lock_init(&fq->lock);
- atomic_set(&fq->refcnt, 1);
-
- return nf_ct_frag6_intern(hash, fq);
-
-oom:
- return NULL;
-}
-
-static __inline__ struct nf_ct_frag6_queue *
-fq_find(u32 id, struct in6_addr *src, struct in6_addr *dst)
-{
- struct nf_ct_frag6_queue *fq;
- struct hlist_node *n;
- unsigned int hash = ip6qhashfn(id, src, dst);
-
- read_lock(&nf_ct_frag6_lock);
- hlist_for_each_entry(fq, n, &nf_ct_frag6_hash[hash], list) {
- if (fq->id == id &&
- ipv6_addr_equal(src, &fq->saddr) &&
- ipv6_addr_equal(dst, &fq->daddr)) {
- atomic_inc(&fq->refcnt);
- read_unlock(&nf_ct_frag6_lock);
- return fq;
- }
- }
- read_unlock(&nf_ct_frag6_lock);
+ struct nft_ct_frag6_pernet *nf_frag = nf_frag_pernet(net);
+ struct frag_v6_compare_key key = {
+ .id = id,
+ .saddr = hdr->saddr,
+ .daddr = hdr->daddr,
+ .user = user,
+ .iif = iif,
+ };
+ struct inet_frag_queue *q;
+
+ if (!(ipv6_addr_type(&hdr->daddr) & (IPV6_ADDR_MULTICAST |
+ IPV6_ADDR_LINKLOCAL)))
+ key.iif = 0;
+
+ q = inet_frag_find(nf_frag->fqdir, &key);
+ if (!q)
+ return NULL;
- return nf_ct_frag6_create(hash, id, src, dst);
+ return container_of(q, struct frag_queue, q);
}
-static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb,
- struct frag_hdr *fhdr, int nhoff)
+static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb,
+ const struct frag_hdr *fhdr, int nhoff,
+ int *refs)
{
- struct sk_buff *prev, *next;
- int offset, end;
-
- if (fq->last_in & COMPLETE) {
- DEBUGP("Allready completed\n");
+ unsigned int payload_len;
+ struct net_device *dev;
+ struct sk_buff *prev;
+ int offset, end, err;
+ u8 ecn;
+
+ if (fq->q.flags & INET_FRAG_COMPLETE) {
+ pr_debug("Already completed\n");
goto err;
}
+ payload_len = ntohs(ipv6_hdr(skb)->payload_len);
+
offset = ntohs(fhdr->frag_off) & ~0x7;
- end = offset + (ntohs(skb->nh.ipv6h->payload_len) -
- ((u8 *) (fhdr + 1) - (u8 *) (skb->nh.ipv6h + 1)));
+ end = offset + (payload_len -
+ ((u8 *)(fhdr + 1) - (u8 *)(ipv6_hdr(skb) + 1)));
if ((unsigned int)end > IPV6_MAXPLEN) {
- DEBUGP("offset is too large.\n");
- return -1;
+ pr_debug("offset is too large.\n");
+ return -EINVAL;
}
- if (skb->ip_summed == CHECKSUM_COMPLETE)
- skb->csum = csum_sub(skb->csum,
- csum_partial(skb->nh.raw,
- (u8*)(fhdr + 1) - skb->nh.raw,
+ ecn = ip6_frag_ecn(ipv6_hdr(skb));
+
+ if (skb->ip_summed == CHECKSUM_COMPLETE) {
+ const unsigned char *nh = skb_network_header(skb);
+ skb->csum = csum_sub(skb->csum,
+ csum_partial(nh, (u8 *)(fhdr + 1) - nh,
0));
+ }
/* Is this the final fragment? */
if (!(fhdr->frag_off & htons(IP6_MF))) {
/* If we already have some bits beyond end
* or have different end, the segment is corrupted.
*/
- if (end < fq->len ||
- ((fq->last_in & LAST_IN) && end != fq->len)) {
- DEBUGP("already received last fragment\n");
+ if (end < fq->q.len ||
+ ((fq->q.flags & INET_FRAG_LAST_IN) && end != fq->q.len)) {
+ pr_debug("already received last fragment\n");
goto err;
}
- fq->last_in |= LAST_IN;
- fq->len = end;
+ fq->q.flags |= INET_FRAG_LAST_IN;
+ fq->q.len = end;
} else {
/* Check if the fragment is rounded to 8 bytes.
* Required by the RFC.
@@ -434,16 +222,17 @@ static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb,
/* RFC2460 says always send parameter problem in
* this case. -DaveM
*/
- DEBUGP("the end of this fragment is not rounded to 8 bytes.\n");
- return -1;
+ pr_debug("end of fragment not rounded to 8 bytes.\n");
+ inet_frag_kill(&fq->q, refs);
+ return -EPROTO;
}
- if (end > fq->len) {
+ if (end > fq->q.len) {
/* Some bits beyond end -> corruption. */
- if (fq->last_in & LAST_IN) {
- DEBUGP("last packet already reached.\n");
+ if (fq->q.flags & INET_FRAG_LAST_IN) {
+ pr_debug("last packet already reached.\n");
goto err;
}
- fq->len = end;
+ fq->q.len = end;
}
}
@@ -452,237 +241,142 @@ static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb,
/* Point into the IP datagram 'data' part. */
if (!pskb_pull(skb, (u8 *) (fhdr + 1) - skb->data)) {
- DEBUGP("queue: message is too short.\n");
+ pr_debug("queue: message is too short.\n");
goto err;
}
if (pskb_trim_rcsum(skb, end - offset)) {
- DEBUGP("Can't trim\n");
+ pr_debug("Can't trim\n");
goto err;
}
- /* Find out which fragments are in front and at the back of us
- * in the chain of fragments so far. We must know where to put
- * this fragment, right?
- */
- prev = NULL;
- for (next = fq->fragments; next != NULL; next = next->next) {
- if (NFCT_FRAG6_CB(next)->offset >= offset)
- break; /* bingo! */
- prev = next;
- }
-
- /* We found where to put this one. Check for overlap with
- * preceding fragment, and, if needed, align things so that
- * any overlaps are eliminated.
- */
- if (prev) {
- int i = (NFCT_FRAG6_CB(prev)->offset + prev->len) - offset;
-
- if (i > 0) {
- offset += i;
- if (end <= offset) {
- DEBUGP("overlap\n");
- goto err;
- }
- if (!pskb_pull(skb, i)) {
- DEBUGP("Can't pull\n");
- goto err;
- }
- if (skb->ip_summed != CHECKSUM_UNNECESSARY)
- skb->ip_summed = CHECKSUM_NONE;
- }
- }
-
- /* Look for overlap with succeeding segments.
- * If we can merge fragments, do it.
- */
- while (next && NFCT_FRAG6_CB(next)->offset < end) {
- /* overlap is 'i' bytes */
- int i = end - NFCT_FRAG6_CB(next)->offset;
-
- if (i < next->len) {
- /* Eat head of the next overlapped fragment
- * and leave the loop. The next ones cannot overlap.
- */
- DEBUGP("Eat head of the overlapped parts.: %d", i);
- if (!pskb_pull(next, i))
- goto err;
-
- /* next fragment */
- NFCT_FRAG6_CB(next)->offset += i;
- fq->meat -= i;
- if (next->ip_summed != CHECKSUM_UNNECESSARY)
- next->ip_summed = CHECKSUM_NONE;
- break;
- } else {
- struct sk_buff *free_it = next;
-
- /* Old fragmnet is completely overridden with
- * new one drop it.
- */
- next = next->next;
-
- if (prev)
- prev->next = next;
- else
- fq->fragments = next;
-
- fq->meat -= free_it->len;
- frag_kfree_skb(free_it, NULL);
+ /* Note : skb->rbnode and skb->dev share the same location. */
+ dev = skb->dev;
+ /* Makes sure compiler wont do silly aliasing games */
+ barrier();
+
+ prev = fq->q.fragments_tail;
+ err = inet_frag_queue_insert(&fq->q, skb, offset, end);
+ if (err) {
+ if (err == IPFRAG_DUP) {
+ /* No error for duplicates, pretend they got queued. */
+ kfree_skb_reason(skb, SKB_DROP_REASON_DUP_FRAG);
+ return -EINPROGRESS;
}
+ goto insert_error;
}
- NFCT_FRAG6_CB(skb)->offset = offset;
+ if (dev)
+ fq->iif = dev->ifindex;
- /* Insert this fragment in the chain of fragments. */
- skb->next = next;
- if (prev)
- prev->next = skb;
- else
- fq->fragments = skb;
-
- skb->dev = NULL;
- skb_get_timestamp(skb, &fq->stamp);
- fq->meat += skb->len;
- atomic_add(skb->truesize, &nf_ct_frag6_mem);
+ fq->q.stamp = skb->tstamp;
+ fq->q.tstamp_type = skb->tstamp_type;
+ fq->q.meat += skb->len;
+ fq->ecn |= ecn;
+ if (payload_len > fq->q.max_size)
+ fq->q.max_size = payload_len;
+ add_frag_mem_limit(fq->q.fqdir, skb->truesize);
/* The first fragment.
* nhoffset is obtained from the first fragment, of course.
*/
if (offset == 0) {
fq->nhoffset = nhoff;
- fq->last_in |= FIRST_IN;
+ fq->q.flags |= INET_FRAG_FIRST_IN;
}
- write_lock(&nf_ct_frag6_lock);
- list_move_tail(&fq->lru_list, &nf_ct_frag6_lru_list);
- write_unlock(&nf_ct_frag6_lock);
- return 0;
+ if (fq->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
+ fq->q.meat == fq->q.len) {
+ unsigned long orefdst = skb->_skb_refdst;
+
+ skb->_skb_refdst = 0UL;
+ err = nf_ct_frag6_reasm(fq, skb, prev, dev, refs);
+ skb->_skb_refdst = orefdst;
+
+ /* After queue has assumed skb ownership, only 0 or
+ * -EINPROGRESS must be returned.
+ */
+ return err ? -EINPROGRESS : 0;
+ }
+
+ skb_dst_drop(skb);
+ skb_orphan(skb);
+ return -EINPROGRESS;
+
+insert_error:
+ inet_frag_kill(&fq->q, refs);
err:
- return -1;
+ skb_dst_drop(skb);
+ return -EINVAL;
}
/*
* Check if this packet is complete.
- * Returns NULL on failure by any reason, and pointer
- * to current nexthdr field in reassembled frame.
*
* It is called with locked fq, and caller must check that
* queue is eligible for reassembly i.e. it is not COMPLETE,
* the last and the first frames arrived and all the bits are here.
*/
-static struct sk_buff *
-nf_ct_frag6_reasm(struct nf_ct_frag6_queue *fq, struct net_device *dev)
+static int nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *skb,
+ struct sk_buff *prev_tail, struct net_device *dev,
+ int *refs)
{
- struct sk_buff *fp, *op, *head = fq->fragments;
- int payload_len;
-
- fq_kill(fq);
+ void *reasm_data;
+ int payload_len;
+ u8 ecn;
- BUG_TRAP(head != NULL);
- BUG_TRAP(NFCT_FRAG6_CB(head)->offset == 0);
+ inet_frag_kill(&fq->q, refs);
- /* Unfragmented part is taken from the first segment. */
- payload_len = (head->data - head->nh.raw) - sizeof(struct ipv6hdr) + fq->len - sizeof(struct frag_hdr);
- if (payload_len > IPV6_MAXPLEN) {
- DEBUGP("payload len is too large.\n");
- goto out_oversize;
- }
-
- /* Head of list must not be cloned. */
- if (skb_cloned(head) && pskb_expand_head(head, 0, 0, GFP_ATOMIC)) {
- DEBUGP("skb is cloned but can't expand head");
- goto out_oom;
- }
+ ecn = ip_frag_ecn_table[fq->ecn];
+ if (unlikely(ecn == 0xff))
+ goto err;
- /* If the first fragment is fragmented itself, we split
- * it to two chunks: the first with data and paged part
- * and the second, holding only fragments. */
- if (skb_shinfo(head)->frag_list) {
- struct sk_buff *clone;
- int i, plen = 0;
+ reasm_data = inet_frag_reasm_prepare(&fq->q, skb, prev_tail);
+ if (!reasm_data)
+ goto err;
- if ((clone = alloc_skb(0, GFP_ATOMIC)) == NULL) {
- DEBUGP("Can't alloc skb\n");
- goto out_oom;
- }
- clone->next = head->next;
- head->next = clone;
- skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
- skb_shinfo(head)->frag_list = NULL;
- for (i=0; i<skb_shinfo(head)->nr_frags; i++)
- plen += skb_shinfo(head)->frags[i].size;
- clone->len = clone->data_len = head->data_len - plen;
- head->data_len -= clone->len;
- head->len -= clone->len;
- clone->csum = 0;
- clone->ip_summed = head->ip_summed;
-
- NFCT_FRAG6_CB(clone)->orig = NULL;
- atomic_add(clone->truesize, &nf_ct_frag6_mem);
+ payload_len = -skb_network_offset(skb) -
+ sizeof(struct ipv6hdr) + fq->q.len -
+ sizeof(struct frag_hdr);
+ if (payload_len > IPV6_MAXPLEN) {
+ net_dbg_ratelimited("nf_ct_frag6_reasm: payload len = %d\n",
+ payload_len);
+ goto err;
}
/* We have to remove fragment header from datagram and to relocate
* header in order to calculate ICV correctly. */
- head->nh.raw[fq->nhoffset] = head->h.raw[0];
- memmove(head->head + sizeof(struct frag_hdr), head->head,
- (head->data - head->head) - sizeof(struct frag_hdr));
- head->mac.raw += sizeof(struct frag_hdr);
- head->nh.raw += sizeof(struct frag_hdr);
-
- skb_shinfo(head)->frag_list = head->next;
- head->h.raw = head->data;
- skb_push(head, head->data - head->nh.raw);
- atomic_sub(head->truesize, &nf_ct_frag6_mem);
-
- for (fp=head->next; fp; fp = fp->next) {
- head->data_len += fp->len;
- head->len += fp->len;
- if (head->ip_summed != fp->ip_summed)
- head->ip_summed = CHECKSUM_NONE;
- else if (head->ip_summed == CHECKSUM_COMPLETE)
- head->csum = csum_add(head->csum, fp->csum);
- head->truesize += fp->truesize;
- atomic_sub(fp->truesize, &nf_ct_frag6_mem);
- }
+ skb_network_header(skb)[fq->nhoffset] = skb_transport_header(skb)[0];
+ memmove(skb->head + sizeof(struct frag_hdr), skb->head,
+ (skb->data - skb->head) - sizeof(struct frag_hdr));
+ skb->mac_header += sizeof(struct frag_hdr);
+ skb->network_header += sizeof(struct frag_hdr);
- head->next = NULL;
- head->dev = dev;
- skb_set_timestamp(head, &fq->stamp);
- head->nh.ipv6h->payload_len = htons(payload_len);
+ skb_reset_transport_header(skb);
- /* Yes, and fold redundant checksum back. 8) */
- if (head->ip_summed == CHECKSUM_COMPLETE)
- head->csum = csum_partial(head->nh.raw, head->h.raw-head->nh.raw, head->csum);
+ inet_frag_reasm_finish(&fq->q, skb, reasm_data, false);
- fq->fragments = NULL;
+ skb->ignore_df = 1;
+ skb->dev = dev;
+ ipv6_hdr(skb)->payload_len = htons(payload_len);
+ ipv6_change_dsfield(ipv6_hdr(skb), 0xff, ecn);
+ IP6CB(skb)->frag_max_size = sizeof(struct ipv6hdr) + fq->q.max_size;
+ IP6CB(skb)->flags |= IP6SKB_FRAGMENTED;
- /* all original skbs are linked into the NFCT_FRAG6_CB(head).orig */
- fp = skb_shinfo(head)->frag_list;
- if (NFCT_FRAG6_CB(fp)->orig == NULL)
- /* at above code, head skb is divided into two skbs. */
- fp = fp->next;
+ /* Yes, and fold redundant checksum back. 8) */
+ if (skb->ip_summed == CHECKSUM_COMPLETE)
+ skb->csum = csum_partial(skb_network_header(skb),
+ skb_network_header_len(skb),
+ skb->csum);
- op = NFCT_FRAG6_CB(head)->orig;
- for (; fp; fp = fp->next) {
- struct sk_buff *orig = NFCT_FRAG6_CB(fp)->orig;
+ fq->q.rb_fragments = RB_ROOT;
+ fq->q.fragments_tail = NULL;
+ fq->q.last_run_head = NULL;
- op->next = orig;
- op = orig;
- NFCT_FRAG6_CB(fp)->orig = NULL;
- }
+ return 0;
- return head;
-
-out_oversize:
- if (net_ratelimit())
- printk(KERN_DEBUG "nf_ct_frag6_reasm: payload len = %d\n", payload_len);
- goto out_fail;
-out_oom:
- if (net_ratelimit())
- printk(KERN_DEBUG "nf_ct_frag6_reasm: no memory for reassembly\n");
-out_fail:
- return NULL;
+err:
+ inet_frag_kill(&fq->q, refs);
+ return -EINVAL;
}
/*
@@ -701,41 +395,42 @@ out_fail:
static int
find_prev_fhdr(struct sk_buff *skb, u8 *prevhdrp, int *prevhoff, int *fhoff)
{
- u8 nexthdr = skb->nh.ipv6h->nexthdr;
- u8 prev_nhoff = (u8 *)&skb->nh.ipv6h->nexthdr - skb->data;
- int start = (u8 *)(skb->nh.ipv6h+1) - skb->data;
+ u8 nexthdr = ipv6_hdr(skb)->nexthdr;
+ const int netoff = skb_network_offset(skb);
+ u8 prev_nhoff = netoff + offsetof(struct ipv6hdr, nexthdr);
+ int start = netoff + sizeof(struct ipv6hdr);
int len = skb->len - start;
u8 prevhdr = NEXTHDR_IPV6;
- while (nexthdr != NEXTHDR_FRAGMENT) {
- struct ipv6_opt_hdr hdr;
- int hdrlen;
+ while (nexthdr != NEXTHDR_FRAGMENT) {
+ struct ipv6_opt_hdr hdr;
+ int hdrlen;
if (!ipv6_ext_hdr(nexthdr)) {
return -1;
}
- if (len < (int)sizeof(struct ipv6_opt_hdr)) {
- DEBUGP("too short\n");
+ if (nexthdr == NEXTHDR_NONE) {
+ pr_debug("next header is none\n");
return -1;
}
- if (nexthdr == NEXTHDR_NONE) {
- DEBUGP("next header is none\n");
+ if (len < (int)sizeof(struct ipv6_opt_hdr)) {
+ pr_debug("too short\n");
return -1;
}
- if (skb_copy_bits(skb, start, &hdr, sizeof(hdr)))
- BUG();
- if (nexthdr == NEXTHDR_AUTH)
- hdrlen = (hdr.hdrlen+2)<<2;
- else
- hdrlen = ipv6_optlen(&hdr);
+ if (skb_copy_bits(skb, start, &hdr, sizeof(hdr)))
+ BUG();
+ if (nexthdr == NEXTHDR_AUTH)
+ hdrlen = ipv6_authlen(&hdr);
+ else
+ hdrlen = ipv6_optlen(&hdr);
prevhdr = nexthdr;
prev_nhoff = start;
- nexthdr = hdr.nexthdr;
- len -= hdrlen;
- start += hdrlen;
- }
+ nexthdr = hdr.nexthdr;
+ len -= hdrlen;
+ start += hdrlen;
+ }
if (len < 0)
return -1;
@@ -747,133 +442,138 @@ find_prev_fhdr(struct sk_buff *skb, u8 *prevhdrp, int *prevhoff, int *fhoff)
return 0;
}
-struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb)
+int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user)
{
- struct sk_buff *clone;
- struct net_device *dev = skb->dev;
+ u16 savethdr = skb->transport_header;
+ u8 nexthdr = NEXTHDR_FRAGMENT;
+ int fhoff, nhoff, ret;
struct frag_hdr *fhdr;
- struct nf_ct_frag6_queue *fq;
+ struct frag_queue *fq;
struct ipv6hdr *hdr;
- int fhoff, nhoff;
+ int refs = 0;
u8 prevhdr;
- struct sk_buff *ret_skb = NULL;
/* Jumbo payload inhibits frag. header */
- if (skb->nh.ipv6h->payload_len == 0) {
- DEBUGP("payload len = 0\n");
- return skb;
+ if (ipv6_hdr(skb)->payload_len == 0) {
+ pr_debug("payload len = 0\n");
+ return 0;
}
if (find_prev_fhdr(skb, &prevhdr, &nhoff, &fhoff) < 0)
- return skb;
+ return 0;
- clone = skb_clone(skb, GFP_ATOMIC);
- if (clone == NULL) {
- DEBUGP("Can't clone skb\n");
- return skb;
+ /* Discard the first fragment if it does not include all headers
+ * RFC 8200, Section 4.5
+ */
+ if (ipv6frag_thdr_truncated(skb, fhoff, &nexthdr)) {
+ pr_debug("Drop incomplete fragment\n");
+ return 0;
}
- NFCT_FRAG6_CB(clone)->orig = skb;
-
- if (!pskb_may_pull(clone, fhoff + sizeof(*fhdr))) {
- DEBUGP("message is too short.\n");
- goto ret_orig;
- }
+ if (!pskb_may_pull(skb, fhoff + sizeof(*fhdr)))
+ return -ENOMEM;
- clone->h.raw = clone->data + fhoff;
- hdr = clone->nh.ipv6h;
- fhdr = (struct frag_hdr *)clone->h.raw;
+ skb_set_transport_header(skb, fhoff);
+ hdr = ipv6_hdr(skb);
+ fhdr = (struct frag_hdr *)skb_transport_header(skb);
- if (!(fhdr->frag_off & htons(0xFFF9))) {
- DEBUGP("Invalid fragment offset\n");
- /* It is not a fragmented frame */
- goto ret_orig;
+ rcu_read_lock();
+ fq = fq_find(net, fhdr->identification, user, hdr,
+ skb->dev ? skb->dev->ifindex : 0);
+ if (fq == NULL) {
+ rcu_read_unlock();
+ pr_debug("Can't find and can't create new queue\n");
+ return -ENOMEM;
}
- if (atomic_read(&nf_ct_frag6_mem) > nf_ct_frag6_high_thresh)
- nf_ct_frag6_evictor();
+ spin_lock_bh(&fq->q.lock);
- fq = fq_find(fhdr->identification, &hdr->saddr, &hdr->daddr);
- if (fq == NULL) {
- DEBUGP("Can't find and can't create new queue\n");
- goto ret_orig;
+ ret = nf_ct_frag6_queue(fq, skb, fhdr, nhoff, &refs);
+ if (ret == -EPROTO) {
+ skb->transport_header = savethdr;
+ ret = 0;
}
- spin_lock(&fq->lock);
+ spin_unlock_bh(&fq->q.lock);
+ rcu_read_unlock();
+ inet_frag_putn(&fq->q, refs);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nf_ct_frag6_gather);
- if (nf_ct_frag6_queue(fq, clone, fhdr, nhoff) < 0) {
- spin_unlock(&fq->lock);
- DEBUGP("Can't insert skb to queue\n");
- fq_put(fq, NULL);
- goto ret_orig;
- }
+static int nf_ct_net_init(struct net *net)
+{
+ struct nft_ct_frag6_pernet *nf_frag = nf_frag_pernet(net);
+ int res;
- if (fq->last_in == (FIRST_IN|LAST_IN) && fq->meat == fq->len) {
- ret_skb = nf_ct_frag6_reasm(fq, dev);
- if (ret_skb == NULL)
- DEBUGP("Can't reassemble fragmented packets\n");
- }
- spin_unlock(&fq->lock);
+ res = fqdir_init(&nf_frag->fqdir, &nf_frags, net);
+ if (res < 0)
+ return res;
- fq_put(fq, NULL);
- return ret_skb;
+ nf_frag->fqdir->high_thresh = IPV6_FRAG_HIGH_THRESH;
+ nf_frag->fqdir->low_thresh = IPV6_FRAG_LOW_THRESH;
+ nf_frag->fqdir->timeout = IPV6_FRAG_TIMEOUT;
-ret_orig:
- kfree_skb(clone);
- return skb;
+ res = nf_ct_frag6_sysctl_register(net);
+ if (res < 0)
+ fqdir_exit(nf_frag->fqdir);
+ return res;
}
-void nf_ct_frag6_output(unsigned int hooknum, struct sk_buff *skb,
- struct net_device *in, struct net_device *out,
- int (*okfn)(struct sk_buff *))
+static void nf_ct_net_pre_exit(struct net *net)
{
- struct sk_buff *s, *s2;
-
- for (s = NFCT_FRAG6_CB(skb)->orig; s;) {
- nf_conntrack_put_reasm(s->nfct_reasm);
- nf_conntrack_get_reasm(skb);
- s->nfct_reasm = skb;
+ struct nft_ct_frag6_pernet *nf_frag = nf_frag_pernet(net);
- s2 = s->next;
- NF_HOOK_THRESH(PF_INET6, hooknum, s, in, out, okfn,
- NF_IP6_PRI_CONNTRACK_DEFRAG + 1);
- s = s2;
- }
- nf_conntrack_put_reasm(skb);
+ fqdir_pre_exit(nf_frag->fqdir);
}
-int nf_ct_frag6_kfree_frags(struct sk_buff *skb)
+static void nf_ct_net_exit(struct net *net)
{
- struct sk_buff *s, *s2;
+ struct nft_ct_frag6_pernet *nf_frag = nf_frag_pernet(net);
- for (s = NFCT_FRAG6_CB(skb)->orig; s; s = s2) {
-
- s2 = s->next;
- kfree_skb(s);
- }
+ nf_ct_frags6_sysctl_unregister(net);
+ fqdir_exit(nf_frag->fqdir);
+}
- kfree_skb(skb);
+static struct pernet_operations nf_ct_net_ops = {
+ .init = nf_ct_net_init,
+ .pre_exit = nf_ct_net_pre_exit,
+ .exit = nf_ct_net_exit,
+ .id = &nf_frag_pernet_id,
+ .size = sizeof(struct nft_ct_frag6_pernet),
+};
- return 0;
-}
+static const struct rhashtable_params nfct_rhash_params = {
+ .head_offset = offsetof(struct inet_frag_queue, node),
+ .hashfn = ip6frag_key_hashfn,
+ .obj_hashfn = ip6frag_obj_hashfn,
+ .obj_cmpfn = ip6frag_obj_cmpfn,
+ .automatic_shrinking = true,
+};
int nf_ct_frag6_init(void)
{
- nf_ct_frag6_hash_rnd = (u32) ((num_physpages ^ (num_physpages>>7)) ^
- (jiffies ^ (jiffies >> 6)));
-
- init_timer(&nf_ct_frag6_secret_timer);
- nf_ct_frag6_secret_timer.function = nf_ct_frag6_secret_rebuild;
- nf_ct_frag6_secret_timer.expires = jiffies
- + nf_ct_frag6_secret_interval;
- add_timer(&nf_ct_frag6_secret_timer);
+ int ret = 0;
+
+ nf_frags.constructor = ip6frag_init;
+ nf_frags.destructor = NULL;
+ nf_frags.qsize = sizeof(struct frag_queue);
+ nf_frags.frag_expire = nf_ct_frag6_expire;
+ nf_frags.frags_cache_name = nf_frags_cache_name;
+ nf_frags.rhash_params = nfct_rhash_params;
+ ret = inet_frags_init(&nf_frags);
+ if (ret)
+ goto out;
+ ret = register_pernet_subsys(&nf_ct_net_ops);
+ if (ret)
+ inet_frags_fini(&nf_frags);
- return 0;
+out:
+ return ret;
}
void nf_ct_frag6_cleanup(void)
{
- del_timer(&nf_ct_frag6_secret_timer);
- nf_ct_frag6_low_thresh = 0;
- nf_ct_frag6_evictor();
+ unregister_pernet_subsys(&nf_ct_net_ops);
+ inet_frags_fini(&nf_frags);
}
diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
new file mode 100644
index 000000000000..be7817fbc024
--- /dev/null
+++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
@@ -0,0 +1,185 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ */
+
+#include <linux/types.h>
+#include <linux/ipv6.h>
+#include <linux/in6.h>
+#include <linux/netfilter.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/icmp.h>
+#include <linux/rcupdate.h>
+#include <linux/sysctl.h>
+#include <net/ipv6_frag.h>
+
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter_bridge.h>
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
+#endif
+#include <net/netfilter/nf_conntrack_zones.h>
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+
+static DEFINE_MUTEX(defrag6_mutex);
+
+static enum ip6_defrag_users nf_ct6_defrag_user(unsigned int hooknum,
+ struct sk_buff *skb)
+{
+ u16 zone_id = NF_CT_DEFAULT_ZONE_ID;
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+ if (skb_nfct(skb)) {
+ enum ip_conntrack_info ctinfo;
+ const struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+
+ zone_id = nf_ct_zone_id(nf_ct_zone(ct), CTINFO2DIR(ctinfo));
+ }
+#endif
+ if (nf_bridge_in_prerouting(skb))
+ return IP6_DEFRAG_CONNTRACK_BRIDGE_IN + zone_id;
+
+ if (hooknum == NF_INET_PRE_ROUTING)
+ return IP6_DEFRAG_CONNTRACK_IN + zone_id;
+ else
+ return IP6_DEFRAG_CONNTRACK_OUT + zone_id;
+}
+
+static unsigned int ipv6_defrag(void *priv,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ int err;
+
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+ /* Previously seen (loopback)? */
+ if (skb_nfct(skb) && !nf_ct_is_template((struct nf_conn *)skb_nfct(skb)))
+ return NF_ACCEPT;
+
+ if (skb->_nfct == IP_CT_UNTRACKED)
+ return NF_ACCEPT;
+#endif
+
+ err = nf_ct_frag6_gather(state->net, skb,
+ nf_ct6_defrag_user(state->hook, skb));
+ /* queued */
+ if (err == -EINPROGRESS)
+ return NF_STOLEN;
+
+ return err == 0 ? NF_ACCEPT : NF_DROP;
+}
+
+static const struct nf_hook_ops ipv6_defrag_ops[] = {
+ {
+ .hook = ipv6_defrag,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_PRE_ROUTING,
+ .priority = NF_IP6_PRI_CONNTRACK_DEFRAG,
+ },
+ {
+ .hook = ipv6_defrag,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_LOCAL_OUT,
+ .priority = NF_IP6_PRI_CONNTRACK_DEFRAG,
+ },
+};
+
+static void __net_exit defrag6_net_exit(struct net *net)
+{
+ if (net->nf.defrag_ipv6_users) {
+ nf_unregister_net_hooks(net, ipv6_defrag_ops,
+ ARRAY_SIZE(ipv6_defrag_ops));
+ net->nf.defrag_ipv6_users = 0;
+ }
+}
+
+static const struct nf_defrag_hook defrag_hook = {
+ .owner = THIS_MODULE,
+ .enable = nf_defrag_ipv6_enable,
+ .disable = nf_defrag_ipv6_disable,
+};
+
+static struct pernet_operations defrag6_net_ops = {
+ .exit = defrag6_net_exit,
+};
+
+static int __init nf_defrag_init(void)
+{
+ int ret = 0;
+
+ ret = nf_ct_frag6_init();
+ if (ret < 0) {
+ pr_err("nf_defrag_ipv6: can't initialize frag6.\n");
+ return ret;
+ }
+ ret = register_pernet_subsys(&defrag6_net_ops);
+ if (ret < 0) {
+ pr_err("nf_defrag_ipv6: can't register pernet ops\n");
+ goto cleanup_frag6;
+ }
+
+ rcu_assign_pointer(nf_defrag_v6_hook, &defrag_hook);
+
+ return ret;
+
+cleanup_frag6:
+ nf_ct_frag6_cleanup();
+ return ret;
+
+}
+
+static void __exit nf_defrag_fini(void)
+{
+ rcu_assign_pointer(nf_defrag_v6_hook, NULL);
+ unregister_pernet_subsys(&defrag6_net_ops);
+ nf_ct_frag6_cleanup();
+}
+
+int nf_defrag_ipv6_enable(struct net *net)
+{
+ int err = 0;
+
+ mutex_lock(&defrag6_mutex);
+ if (net->nf.defrag_ipv6_users == UINT_MAX) {
+ err = -EOVERFLOW;
+ goto out_unlock;
+ }
+
+ if (net->nf.defrag_ipv6_users) {
+ net->nf.defrag_ipv6_users++;
+ goto out_unlock;
+ }
+
+ err = nf_register_net_hooks(net, ipv6_defrag_ops,
+ ARRAY_SIZE(ipv6_defrag_ops));
+ if (err == 0)
+ net->nf.defrag_ipv6_users = 1;
+
+ out_unlock:
+ mutex_unlock(&defrag6_mutex);
+ return err;
+}
+EXPORT_SYMBOL_GPL(nf_defrag_ipv6_enable);
+
+void nf_defrag_ipv6_disable(struct net *net)
+{
+ mutex_lock(&defrag6_mutex);
+ if (net->nf.defrag_ipv6_users) {
+ net->nf.defrag_ipv6_users--;
+ if (net->nf.defrag_ipv6_users == 0)
+ nf_unregister_net_hooks(net, ipv6_defrag_ops,
+ ARRAY_SIZE(ipv6_defrag_ops));
+ }
+ mutex_unlock(&defrag6_mutex);
+}
+EXPORT_SYMBOL_GPL(nf_defrag_ipv6_disable);
+
+module_init(nf_defrag_init);
+module_exit(nf_defrag_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("IPv6 defragmentation support");
diff --git a/net/ipv6/netfilter/nf_dup_ipv6.c b/net/ipv6/netfilter/nf_dup_ipv6.c
new file mode 100644
index 000000000000..6da3102b7c1b
--- /dev/null
+++ b/net/ipv6/netfilter/nf_dup_ipv6.c
@@ -0,0 +1,81 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * (C) 2007 by Sebastian Claßen <sebastian.classen@freenet.ag>
+ * (C) 2007-2010 by Jan Engelhardt <jengelh@medozas.de>
+ *
+ * Extracted from xt_TEE.c
+ */
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter.h>
+#include <net/ipv6.h>
+#include <net/ip6_route.h>
+#include <net/netfilter/ipv6/nf_dup_ipv6.h>
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+#include <net/netfilter/nf_conntrack.h>
+#endif
+
+static bool nf_dup_ipv6_route(struct net *net, struct sk_buff *skb,
+ const struct in6_addr *gw, int oif)
+{
+ const struct ipv6hdr *iph = ipv6_hdr(skb);
+ struct dst_entry *dst;
+ struct flowi6 fl6;
+
+ memset(&fl6, 0, sizeof(fl6));
+ if (oif != -1)
+ fl6.flowi6_oif = oif;
+
+ fl6.daddr = *gw;
+ fl6.flowlabel = (__force __be32)(((iph->flow_lbl[0] & 0xF) << 16) |
+ (iph->flow_lbl[1] << 8) | iph->flow_lbl[2]);
+ fl6.flowi6_flags = FLOWI_FLAG_KNOWN_NH;
+ dst = ip6_route_output(net, NULL, &fl6);
+ if (dst->error) {
+ dst_release(dst);
+ return false;
+ }
+ skb_dst_drop(skb);
+ skb_dst_set(skb, dst);
+ skb->dev = dst_dev(dst);
+ skb->protocol = htons(ETH_P_IPV6);
+
+ return true;
+}
+
+void nf_dup_ipv6(struct net *net, struct sk_buff *skb, unsigned int hooknum,
+ const struct in6_addr *gw, int oif)
+{
+ local_bh_disable();
+ if (current->in_nf_duplicate)
+ goto out;
+ skb = pskb_copy(skb, GFP_ATOMIC);
+ if (skb == NULL)
+ goto out;
+
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+ nf_reset_ct(skb);
+ nf_ct_set(skb, NULL, IP_CT_UNTRACKED);
+#endif
+ if (hooknum == NF_INET_PRE_ROUTING ||
+ hooknum == NF_INET_LOCAL_IN) {
+ struct ipv6hdr *iph = ipv6_hdr(skb);
+ --iph->hop_limit;
+ }
+ if (nf_dup_ipv6_route(net, skb, gw, oif)) {
+ current->in_nf_duplicate = true;
+ ip6_local_out(net, skb->sk, skb);
+ current->in_nf_duplicate = false;
+ } else {
+ kfree_skb(skb);
+ }
+out:
+ local_bh_enable();
+}
+EXPORT_SYMBOL_GPL(nf_dup_ipv6);
+
+MODULE_AUTHOR("Sebastian Claßen <sebastian.classen@freenet.ag>");
+MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>");
+MODULE_DESCRIPTION("nf_dup_ipv6: IPv6 packet duplication");
+MODULE_LICENSE("GPL");
diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c
new file mode 100644
index 000000000000..ef5b7e85cffa
--- /dev/null
+++ b/net/ipv6/netfilter/nf_reject_ipv6.c
@@ -0,0 +1,451 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ */
+
+#include <linux/module.h>
+#include <net/ipv6.h>
+#include <net/ip6_route.h>
+#include <net/ip6_fib.h>
+#include <net/ip6_checksum.h>
+#include <net/netfilter/ipv6/nf_reject.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter_bridge.h>
+
+static struct ipv6hdr *
+nf_reject_ip6hdr_put(struct sk_buff *nskb,
+ const struct sk_buff *oldskb,
+ __u8 protocol, int hoplimit);
+static void
+nf_reject_ip6_tcphdr_put(struct sk_buff *nskb,
+ const struct sk_buff *oldskb,
+ const struct tcphdr *oth, unsigned int otcplen);
+static const struct tcphdr *
+nf_reject_ip6_tcphdr_get(struct sk_buff *oldskb,
+ struct tcphdr *otcph,
+ unsigned int *otcplen, int hook);
+
+static bool nf_reject_v6_csum_ok(struct sk_buff *skb, int hook)
+{
+ const struct ipv6hdr *ip6h = ipv6_hdr(skb);
+ int thoff;
+ __be16 fo;
+ u8 proto = ip6h->nexthdr;
+
+ if (skb_csum_unnecessary(skb))
+ return true;
+
+ if (ip6h->payload_len &&
+ pskb_trim_rcsum(skb, ntohs(ip6h->payload_len) + sizeof(*ip6h)))
+ return false;
+
+ ip6h = ipv6_hdr(skb);
+ thoff = ipv6_skip_exthdr(skb, ((u8*)(ip6h+1) - skb->data), &proto, &fo);
+ if (thoff < 0 || thoff >= skb->len || (fo & htons(~0x7)) != 0)
+ return false;
+
+ if (!nf_reject_verify_csum(skb, thoff, proto))
+ return true;
+
+ return nf_ip6_checksum(skb, hook, thoff, proto) == 0;
+}
+
+static int nf_reject_ip6hdr_validate(struct sk_buff *skb)
+{
+ struct ipv6hdr *hdr;
+ u32 pkt_len;
+
+ if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
+ return 0;
+
+ hdr = ipv6_hdr(skb);
+ if (hdr->version != 6)
+ return 0;
+
+ pkt_len = ntohs(hdr->payload_len);
+ if (pkt_len + sizeof(struct ipv6hdr) > skb->len)
+ return 0;
+
+ return 1;
+}
+
+struct sk_buff *nf_reject_skb_v6_tcp_reset(struct net *net,
+ struct sk_buff *oldskb,
+ const struct net_device *dev,
+ int hook)
+{
+ struct sk_buff *nskb;
+ const struct tcphdr *oth;
+ struct tcphdr _oth;
+ unsigned int otcplen;
+ struct ipv6hdr *nip6h;
+
+ if (!nf_reject_ip6hdr_validate(oldskb))
+ return NULL;
+
+ oth = nf_reject_ip6_tcphdr_get(oldskb, &_oth, &otcplen, hook);
+ if (!oth)
+ return NULL;
+
+ nskb = alloc_skb(sizeof(struct ipv6hdr) + sizeof(struct tcphdr) +
+ LL_MAX_HEADER, GFP_ATOMIC);
+ if (!nskb)
+ return NULL;
+
+ nskb->dev = (struct net_device *)dev;
+
+ skb_reserve(nskb, LL_MAX_HEADER);
+ nip6h = nf_reject_ip6hdr_put(nskb, oldskb, IPPROTO_TCP,
+ READ_ONCE(net->ipv6.devconf_all->hop_limit));
+ nf_reject_ip6_tcphdr_put(nskb, oldskb, oth, otcplen);
+ nip6h->payload_len = htons(nskb->len - sizeof(struct ipv6hdr));
+
+ return nskb;
+}
+EXPORT_SYMBOL_GPL(nf_reject_skb_v6_tcp_reset);
+
+static bool nf_skb_is_icmp6_unreach(const struct sk_buff *skb)
+{
+ const struct ipv6hdr *ip6h = ipv6_hdr(skb);
+ u8 proto = ip6h->nexthdr;
+ u8 _type, *tp;
+ int thoff;
+ __be16 fo;
+
+ thoff = ipv6_skip_exthdr(skb, ((u8 *)(ip6h + 1) - skb->data), &proto, &fo);
+
+ if (thoff < 0 || thoff >= skb->len || fo != 0)
+ return false;
+
+ if (proto != IPPROTO_ICMPV6)
+ return false;
+
+ tp = skb_header_pointer(skb,
+ thoff + offsetof(struct icmp6hdr, icmp6_type),
+ sizeof(_type), &_type);
+
+ if (!tp)
+ return false;
+
+ return *tp == ICMPV6_DEST_UNREACH;
+}
+
+struct sk_buff *nf_reject_skb_v6_unreach(struct net *net,
+ struct sk_buff *oldskb,
+ const struct net_device *dev,
+ int hook, u8 code)
+{
+ struct sk_buff *nskb;
+ struct ipv6hdr *nip6h;
+ struct icmp6hdr *icmp6h;
+ unsigned int len;
+
+ if (!nf_reject_ip6hdr_validate(oldskb))
+ return NULL;
+
+ /* Don't reply to ICMPV6_DEST_UNREACH with ICMPV6_DEST_UNREACH */
+ if (nf_skb_is_icmp6_unreach(oldskb))
+ return NULL;
+
+ /* Include "As much of invoking packet as possible without the ICMPv6
+ * packet exceeding the minimum IPv6 MTU" in the ICMP payload.
+ */
+ len = min_t(unsigned int, 1220, oldskb->len);
+
+ if (!pskb_may_pull(oldskb, len))
+ return NULL;
+
+ if (!nf_reject_v6_csum_ok(oldskb, hook))
+ return NULL;
+
+ nskb = alloc_skb(sizeof(struct ipv6hdr) + sizeof(struct icmp6hdr) +
+ LL_MAX_HEADER + len, GFP_ATOMIC);
+ if (!nskb)
+ return NULL;
+
+ nskb->dev = (struct net_device *)dev;
+
+ skb_reserve(nskb, LL_MAX_HEADER);
+ nip6h = nf_reject_ip6hdr_put(nskb, oldskb, IPPROTO_ICMPV6,
+ READ_ONCE(net->ipv6.devconf_all->hop_limit));
+
+ skb_reset_transport_header(nskb);
+ icmp6h = skb_put_zero(nskb, sizeof(struct icmp6hdr));
+ icmp6h->icmp6_type = ICMPV6_DEST_UNREACH;
+ icmp6h->icmp6_code = code;
+
+ skb_put_data(nskb, skb_network_header(oldskb), len);
+ nip6h->payload_len = htons(nskb->len - sizeof(struct ipv6hdr));
+
+ icmp6h->icmp6_cksum =
+ csum_ipv6_magic(&nip6h->saddr, &nip6h->daddr,
+ nskb->len - sizeof(struct ipv6hdr),
+ IPPROTO_ICMPV6,
+ csum_partial(icmp6h,
+ nskb->len - sizeof(struct ipv6hdr),
+ 0));
+
+ return nskb;
+}
+EXPORT_SYMBOL_GPL(nf_reject_skb_v6_unreach);
+
+static const struct tcphdr *
+nf_reject_ip6_tcphdr_get(struct sk_buff *oldskb,
+ struct tcphdr *otcph,
+ unsigned int *otcplen, int hook)
+{
+ const struct ipv6hdr *oip6h = ipv6_hdr(oldskb);
+ u8 proto;
+ __be16 frag_off;
+ int tcphoff;
+
+ proto = oip6h->nexthdr;
+ tcphoff = ipv6_skip_exthdr(oldskb, ((u8 *)(oip6h + 1) - oldskb->data),
+ &proto, &frag_off);
+
+ if ((tcphoff < 0) || (tcphoff > oldskb->len)) {
+ pr_debug("Cannot get TCP header.\n");
+ return NULL;
+ }
+
+ *otcplen = oldskb->len - tcphoff;
+
+ /* IP header checks: fragment, too short. */
+ if (proto != IPPROTO_TCP || *otcplen < sizeof(struct tcphdr)) {
+ pr_debug("proto(%d) != IPPROTO_TCP or too short (len = %d)\n",
+ proto, *otcplen);
+ return NULL;
+ }
+
+ otcph = skb_header_pointer(oldskb, tcphoff, sizeof(struct tcphdr),
+ otcph);
+ if (otcph == NULL)
+ return NULL;
+
+ /* No RST for RST. */
+ if (otcph->rst) {
+ pr_debug("RST is set\n");
+ return NULL;
+ }
+
+ /* Check checksum. */
+ if (nf_ip6_checksum(oldskb, hook, tcphoff, IPPROTO_TCP)) {
+ pr_debug("TCP checksum is invalid\n");
+ return NULL;
+ }
+
+ return otcph;
+}
+
+static struct ipv6hdr *
+nf_reject_ip6hdr_put(struct sk_buff *nskb,
+ const struct sk_buff *oldskb,
+ __u8 protocol, int hoplimit)
+{
+ struct ipv6hdr *ip6h;
+ const struct ipv6hdr *oip6h = ipv6_hdr(oldskb);
+#define DEFAULT_TOS_VALUE 0x0U
+ const __u8 tclass = DEFAULT_TOS_VALUE;
+
+ skb_put(nskb, sizeof(struct ipv6hdr));
+ skb_reset_network_header(nskb);
+ ip6h = ipv6_hdr(nskb);
+ ip6_flow_hdr(ip6h, tclass, 0);
+ ip6h->hop_limit = hoplimit;
+ ip6h->nexthdr = protocol;
+ ip6h->saddr = oip6h->daddr;
+ ip6h->daddr = oip6h->saddr;
+
+ nskb->protocol = htons(ETH_P_IPV6);
+
+ return ip6h;
+}
+
+static void
+nf_reject_ip6_tcphdr_put(struct sk_buff *nskb,
+ const struct sk_buff *oldskb,
+ const struct tcphdr *oth, unsigned int otcplen)
+{
+ struct tcphdr *tcph;
+
+ skb_reset_transport_header(nskb);
+ tcph = skb_put_zero(nskb, sizeof(struct tcphdr));
+ /* Truncate to length (no data) */
+ tcph->doff = sizeof(struct tcphdr)/4;
+ tcph->source = oth->dest;
+ tcph->dest = oth->source;
+
+ if (oth->ack) {
+ tcph->seq = oth->ack_seq;
+ } else {
+ tcph->ack_seq = htonl(ntohl(oth->seq) + oth->syn + oth->fin +
+ otcplen - (oth->doff<<2));
+ tcph->ack = 1;
+ }
+
+ tcph->rst = 1;
+
+ /* Adjust TCP checksum */
+ tcph->check = csum_ipv6_magic(&ipv6_hdr(nskb)->saddr,
+ &ipv6_hdr(nskb)->daddr,
+ sizeof(struct tcphdr), IPPROTO_TCP,
+ csum_partial(tcph,
+ sizeof(struct tcphdr), 0));
+}
+
+static int nf_reject6_fill_skb_dst(struct sk_buff *skb_in)
+{
+ struct dst_entry *dst = NULL;
+ struct flowi fl;
+
+ memset(&fl, 0, sizeof(struct flowi));
+ fl.u.ip6.daddr = ipv6_hdr(skb_in)->saddr;
+ nf_ip6_route(dev_net(skb_in->dev), &dst, &fl, false);
+ if (!dst)
+ return -1;
+
+ skb_dst_set(skb_in, dst);
+ return 0;
+}
+
+void nf_send_reset6(struct net *net, struct sock *sk, struct sk_buff *oldskb,
+ int hook)
+{
+ const struct ipv6hdr *oip6h = ipv6_hdr(oldskb);
+ struct dst_entry *dst = NULL;
+ const struct tcphdr *otcph;
+ struct sk_buff *nskb;
+ struct tcphdr _otcph;
+ unsigned int otcplen;
+ struct flowi6 fl6;
+
+ if ((!(ipv6_addr_type(&oip6h->saddr) & IPV6_ADDR_UNICAST)) ||
+ (!(ipv6_addr_type(&oip6h->daddr) & IPV6_ADDR_UNICAST))) {
+ pr_debug("addr is not unicast.\n");
+ return;
+ }
+
+ otcph = nf_reject_ip6_tcphdr_get(oldskb, &_otcph, &otcplen, hook);
+ if (!otcph)
+ return;
+
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.flowi6_proto = IPPROTO_TCP;
+ fl6.saddr = oip6h->daddr;
+ fl6.daddr = oip6h->saddr;
+ fl6.fl6_sport = otcph->dest;
+ fl6.fl6_dport = otcph->source;
+
+ if (!skb_dst(oldskb)) {
+ nf_ip6_route(net, &dst, flowi6_to_flowi(&fl6), false);
+ if (!dst)
+ return;
+ skb_dst_set(oldskb, dst);
+ }
+
+ fl6.flowi6_oif = l3mdev_master_ifindex(skb_dst_dev(oldskb));
+ fl6.flowi6_mark = IP6_REPLY_MARK(net, oldskb->mark);
+ security_skb_classify_flow(oldskb, flowi6_to_flowi_common(&fl6));
+ dst = ip6_route_output(net, NULL, &fl6);
+ if (dst->error) {
+ dst_release(dst);
+ return;
+ }
+ dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0);
+ if (IS_ERR(dst))
+ return;
+
+ nskb = alloc_skb(LL_MAX_HEADER + sizeof(struct ipv6hdr) +
+ sizeof(struct tcphdr) + dst->trailer_len,
+ GFP_ATOMIC);
+
+ if (!nskb) {
+ net_dbg_ratelimited("cannot alloc skb\n");
+ dst_release(dst);
+ return;
+ }
+
+ skb_dst_set(nskb, dst);
+
+ nskb->mark = fl6.flowi6_mark;
+
+ skb_reserve(nskb, LL_MAX_HEADER);
+ nf_reject_ip6hdr_put(nskb, oldskb, IPPROTO_TCP, ip6_dst_hoplimit(dst));
+ nf_reject_ip6_tcphdr_put(nskb, oldskb, otcph, otcplen);
+
+ nf_ct_attach(nskb, oldskb);
+ nf_ct_set_closing(skb_nfct(oldskb));
+
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+ /* If we use ip6_local_out for bridged traffic, the MAC source on
+ * the RST will be ours, instead of the destination's. This confuses
+ * some routers/firewalls, and they drop the packet. So we need to
+ * build the eth header using the original destination's MAC as the
+ * source, and send the RST packet directly.
+ */
+ if (nf_bridge_info_exists(oldskb)) {
+ struct ethhdr *oeth = eth_hdr(oldskb);
+ struct ipv6hdr *ip6h = ipv6_hdr(nskb);
+ struct net_device *br_indev;
+
+ br_indev = nf_bridge_get_physindev(oldskb, net);
+ if (!br_indev) {
+ kfree_skb(nskb);
+ return;
+ }
+
+ nskb->dev = br_indev;
+ nskb->protocol = htons(ETH_P_IPV6);
+ ip6h->payload_len = htons(sizeof(struct tcphdr));
+ if (dev_hard_header(nskb, nskb->dev, ntohs(nskb->protocol),
+ oeth->h_source, oeth->h_dest, nskb->len) < 0) {
+ kfree_skb(nskb);
+ return;
+ }
+ dev_queue_xmit(nskb);
+ } else
+#endif
+ ip6_local_out(net, sk, nskb);
+}
+EXPORT_SYMBOL_GPL(nf_send_reset6);
+
+static bool reject6_csum_ok(struct sk_buff *skb, int hook)
+{
+ const struct ipv6hdr *ip6h = ipv6_hdr(skb);
+ int thoff;
+ __be16 fo;
+ u8 proto;
+
+ if (skb_csum_unnecessary(skb))
+ return true;
+
+ proto = ip6h->nexthdr;
+ thoff = ipv6_skip_exthdr(skb, ((u8 *)(ip6h + 1) - skb->data), &proto, &fo);
+
+ if (thoff < 0 || thoff >= skb->len || (fo & htons(~0x7)) != 0)
+ return false;
+
+ if (!nf_reject_verify_csum(skb, thoff, proto))
+ return true;
+
+ return nf_ip6_checksum(skb, hook, thoff, proto) == 0;
+}
+
+void nf_send_unreach6(struct net *net, struct sk_buff *skb_in,
+ unsigned char code, unsigned int hooknum)
+{
+ if (!reject6_csum_ok(skb_in, hooknum))
+ return;
+
+ if (hooknum == NF_INET_LOCAL_OUT && skb_in->dev == NULL)
+ skb_in->dev = net->loopback_dev;
+
+ if (!skb_dst(skb_in) && nf_reject6_fill_skb_dst(skb_in) < 0)
+ return;
+
+ icmpv6_send(skb_in, ICMPV6_DEST_UNREACH, code, 0);
+}
+EXPORT_SYMBOL_GPL(nf_send_unreach6);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("IPv6 packet rejection core");
diff --git a/net/ipv6/netfilter/nf_socket_ipv6.c b/net/ipv6/netfilter/nf_socket_ipv6.c
new file mode 100644
index 000000000000..ced8bd44828e
--- /dev/null
+++ b/net/ipv6/netfilter/nf_socket_ipv6.c
@@ -0,0 +1,168 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2007-2008 BalaBit IT Ltd.
+ * Author: Krisztian Kovacs
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/icmp.h>
+#include <net/sock.h>
+#include <net/inet_sock.h>
+#include <net/inet6_hashtables.h>
+#include <net/netfilter/nf_socket.h>
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+#include <net/netfilter/nf_conntrack.h>
+#endif
+
+static int
+extract_icmp6_fields(const struct sk_buff *skb,
+ unsigned int outside_hdrlen,
+ int *protocol,
+ const struct in6_addr **raddr,
+ const struct in6_addr **laddr,
+ __be16 *rport,
+ __be16 *lport,
+ struct ipv6hdr *ipv6_var)
+{
+ const struct ipv6hdr *inside_iph;
+ struct icmp6hdr *icmph, _icmph;
+ __be16 *ports, _ports[2];
+ u8 inside_nexthdr;
+ __be16 inside_fragoff;
+ int inside_hdrlen;
+
+ icmph = skb_header_pointer(skb, outside_hdrlen,
+ sizeof(_icmph), &_icmph);
+ if (icmph == NULL)
+ return 1;
+
+ if (icmph->icmp6_type & ICMPV6_INFOMSG_MASK)
+ return 1;
+
+ inside_iph = skb_header_pointer(skb, outside_hdrlen + sizeof(_icmph),
+ sizeof(*ipv6_var), ipv6_var);
+ if (inside_iph == NULL)
+ return 1;
+ inside_nexthdr = inside_iph->nexthdr;
+
+ inside_hdrlen = ipv6_skip_exthdr(skb, outside_hdrlen + sizeof(_icmph) +
+ sizeof(*ipv6_var),
+ &inside_nexthdr, &inside_fragoff);
+ if (inside_hdrlen < 0)
+ return 1; /* hjm: Packet has no/incomplete transport layer headers. */
+
+ if (inside_nexthdr != IPPROTO_TCP &&
+ inside_nexthdr != IPPROTO_UDP)
+ return 1;
+
+ ports = skb_header_pointer(skb, inside_hdrlen,
+ sizeof(_ports), &_ports);
+ if (ports == NULL)
+ return 1;
+
+ /* the inside IP packet is the one quoted from our side, thus
+ * its saddr is the local address */
+ *protocol = inside_nexthdr;
+ *laddr = &inside_iph->saddr;
+ *lport = ports[0];
+ *raddr = &inside_iph->daddr;
+ *rport = ports[1];
+
+ return 0;
+}
+
+static struct sock *
+nf_socket_get_sock_v6(struct net *net, struct sk_buff *skb, int doff,
+ const u8 protocol,
+ const struct in6_addr *saddr, const struct in6_addr *daddr,
+ const __be16 sport, const __be16 dport,
+ const struct net_device *in)
+{
+ switch (protocol) {
+ case IPPROTO_TCP:
+ return inet6_lookup(net, skb, doff, saddr, sport, daddr, dport,
+ in->ifindex);
+ case IPPROTO_UDP:
+ return udp6_lib_lookup(net, saddr, sport, daddr, dport,
+ in->ifindex);
+ }
+
+ return NULL;
+}
+
+struct sock *nf_sk_lookup_slow_v6(struct net *net, const struct sk_buff *skb,
+ const struct net_device *indev)
+{
+ __be16 dport, sport;
+ const struct in6_addr *daddr = NULL, *saddr = NULL;
+ struct ipv6hdr *iph = ipv6_hdr(skb), ipv6_var;
+ struct sk_buff *data_skb = NULL;
+ int doff = 0;
+ int thoff = 0, tproto;
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn const *ct;
+#endif
+
+ tproto = ipv6_find_hdr(skb, &thoff, -1, NULL, NULL);
+ if (tproto < 0) {
+ pr_debug("unable to find transport header in IPv6 packet, dropping\n");
+ return NULL;
+ }
+
+ if (tproto == IPPROTO_UDP || tproto == IPPROTO_TCP) {
+ struct tcphdr _hdr;
+ struct udphdr *hp;
+
+ hp = skb_header_pointer(skb, thoff, tproto == IPPROTO_UDP ?
+ sizeof(*hp) : sizeof(_hdr), &_hdr);
+ if (hp == NULL)
+ return NULL;
+
+ saddr = &iph->saddr;
+ sport = hp->source;
+ daddr = &iph->daddr;
+ dport = hp->dest;
+ data_skb = (struct sk_buff *)skb;
+ doff = tproto == IPPROTO_TCP ?
+ thoff + __tcp_hdrlen((struct tcphdr *)hp) :
+ thoff + sizeof(*hp);
+
+ } else if (tproto == IPPROTO_ICMPV6) {
+ if (extract_icmp6_fields(skb, thoff, &tproto, &saddr, &daddr,
+ &sport, &dport, &ipv6_var))
+ return NULL;
+ } else {
+ return NULL;
+ }
+
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+ /* Do the lookup with the original socket address in
+ * case this is a reply packet of an established
+ * SNAT-ted connection.
+ */
+ ct = nf_ct_get(skb, &ctinfo);
+ if (ct &&
+ ((tproto != IPPROTO_ICMPV6 &&
+ ctinfo == IP_CT_ESTABLISHED_REPLY) ||
+ (tproto == IPPROTO_ICMPV6 &&
+ ctinfo == IP_CT_RELATED_REPLY)) &&
+ (ct->status & IPS_SRC_NAT_DONE)) {
+ daddr = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.in6;
+ dport = (tproto == IPPROTO_TCP) ?
+ ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.tcp.port :
+ ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.udp.port;
+ }
+#endif
+
+ return nf_socket_get_sock_v6(net, data_skb, doff, tproto, saddr, daddr,
+ sport, dport, indev);
+}
+EXPORT_SYMBOL_GPL(nf_sk_lookup_slow_v6);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Krisztian Kovacs, Balazs Scheidler");
+MODULE_DESCRIPTION("Netfilter IPv6 socket lookup infrastructure");
diff --git a/net/ipv6/netfilter/nf_tproxy_ipv6.c b/net/ipv6/netfilter/nf_tproxy_ipv6.c
new file mode 100644
index 000000000000..b2f59ed9d7cc
--- /dev/null
+++ b/net/ipv6/netfilter/nf_tproxy_ipv6.c
@@ -0,0 +1,152 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <net/netfilter/nf_tproxy.h>
+#include <linux/module.h>
+#include <net/inet6_hashtables.h>
+#include <net/addrconf.h>
+#include <net/udp.h>
+#include <net/tcp.h>
+
+const struct in6_addr *
+nf_tproxy_laddr6(struct sk_buff *skb, const struct in6_addr *user_laddr,
+ const struct in6_addr *daddr)
+{
+ struct inet6_dev *indev;
+ struct inet6_ifaddr *ifa;
+ struct in6_addr *laddr;
+
+ if (!ipv6_addr_any(user_laddr))
+ return user_laddr;
+ laddr = NULL;
+
+ indev = __in6_dev_get(skb->dev);
+ if (indev) {
+ read_lock_bh(&indev->lock);
+ list_for_each_entry(ifa, &indev->addr_list, if_list) {
+ if (ifa->flags & (IFA_F_TENTATIVE | IFA_F_DEPRECATED))
+ continue;
+
+ laddr = &ifa->addr;
+ break;
+ }
+ read_unlock_bh(&indev->lock);
+ }
+
+ return laddr ? laddr : daddr;
+}
+EXPORT_SYMBOL_GPL(nf_tproxy_laddr6);
+
+struct sock *
+nf_tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff,
+ struct net *net,
+ const struct in6_addr *laddr,
+ const __be16 lport,
+ struct sock *sk)
+{
+ const struct ipv6hdr *iph = ipv6_hdr(skb);
+ struct tcphdr _hdr, *hp;
+
+ hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr);
+ if (hp == NULL) {
+ inet_twsk_put(inet_twsk(sk));
+ return NULL;
+ }
+
+ if (hp->syn && !hp->rst && !hp->ack && !hp->fin) {
+ /* SYN to a TIME_WAIT socket, we'd rather redirect it
+ * to a listener socket if there's one */
+ struct sock *sk2;
+
+ sk2 = nf_tproxy_get_sock_v6(net, skb, thoff, tproto,
+ &iph->saddr,
+ nf_tproxy_laddr6(skb, laddr, &iph->daddr),
+ hp->source,
+ lport ? lport : hp->dest,
+ skb->dev, NF_TPROXY_LOOKUP_LISTENER);
+ if (sk2) {
+ nf_tproxy_twsk_deschedule_put(inet_twsk(sk));
+ sk = sk2;
+ }
+ }
+
+ return sk;
+}
+EXPORT_SYMBOL_GPL(nf_tproxy_handle_time_wait6);
+
+struct sock *
+nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff,
+ const u8 protocol,
+ const struct in6_addr *saddr, const struct in6_addr *daddr,
+ const __be16 sport, const __be16 dport,
+ const struct net_device *in,
+ const enum nf_tproxy_lookup_t lookup_type)
+{
+ struct sock *sk;
+
+ switch (protocol) {
+ case IPPROTO_TCP: {
+ struct tcphdr _hdr, *hp;
+
+ hp = skb_header_pointer(skb, thoff,
+ sizeof(struct tcphdr), &_hdr);
+ if (hp == NULL)
+ return NULL;
+
+ switch (lookup_type) {
+ case NF_TPROXY_LOOKUP_LISTENER:
+ sk = inet6_lookup_listener(net, skb,
+ thoff + __tcp_hdrlen(hp),
+ saddr, sport,
+ daddr, ntohs(dport),
+ in->ifindex, 0);
+
+ if (sk && !refcount_inc_not_zero(&sk->sk_refcnt))
+ sk = NULL;
+ /* NOTE: we return listeners even if bound to
+ * 0.0.0.0, those are filtered out in
+ * xt_socket, since xt_TPROXY needs 0 bound
+ * listeners too
+ */
+ break;
+ case NF_TPROXY_LOOKUP_ESTABLISHED:
+ sk = __inet6_lookup_established(net, saddr, sport, daddr,
+ ntohs(dport), in->ifindex, 0);
+ break;
+ default:
+ BUG();
+ }
+ break;
+ }
+ case IPPROTO_UDP:
+ sk = udp6_lib_lookup(net, saddr, sport, daddr, dport,
+ in->ifindex);
+ if (sk) {
+ int connected = (sk->sk_state == TCP_ESTABLISHED);
+ int wildcard = ipv6_addr_any(&sk->sk_v6_rcv_saddr);
+
+ /* NOTE: we return listeners even if bound to
+ * 0.0.0.0, those are filtered out in
+ * xt_socket, since xt_TPROXY needs 0 bound
+ * listeners too
+ */
+ if ((lookup_type == NF_TPROXY_LOOKUP_ESTABLISHED && (!connected || wildcard)) ||
+ (lookup_type == NF_TPROXY_LOOKUP_LISTENER && connected)) {
+ sock_put(sk);
+ sk = NULL;
+ }
+ }
+ break;
+ default:
+ WARN_ON(1);
+ sk = NULL;
+ }
+
+ pr_debug("tproxy socket lookup: proto %u %pI6:%u -> %pI6:%u, lookup type: %d, sock %p\n",
+ protocol, saddr, ntohs(sport), daddr, ntohs(dport), lookup_type, sk);
+
+ return sk;
+}
+EXPORT_SYMBOL_GPL(nf_tproxy_get_sock_v6);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Balazs Scheidler, Krisztian Kovacs");
+MODULE_DESCRIPTION("Netfilter IPv6 transparent proxy support");
diff --git a/net/ipv6/netfilter/nft_dup_ipv6.c b/net/ipv6/netfilter/nft_dup_ipv6.c
new file mode 100644
index 000000000000..492a811828a7
--- /dev/null
+++ b/net/ipv6/netfilter/nft_dup_ipv6.c
@@ -0,0 +1,110 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2015 Pablo Neira Ayuso <pablo@netfilter.org>
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/ipv6/nf_dup_ipv6.h>
+
+struct nft_dup_ipv6 {
+ u8 sreg_addr;
+ u8 sreg_dev;
+};
+
+static void nft_dup_ipv6_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_dup_ipv6 *priv = nft_expr_priv(expr);
+ struct in6_addr *gw = (struct in6_addr *)&regs->data[priv->sreg_addr];
+ int oif = priv->sreg_dev ? regs->data[priv->sreg_dev] : -1;
+
+ nf_dup_ipv6(nft_net(pkt), pkt->skb, nft_hook(pkt), gw, oif);
+}
+
+static int nft_dup_ipv6_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_dup_ipv6 *priv = nft_expr_priv(expr);
+ int err;
+
+ if (tb[NFTA_DUP_SREG_ADDR] == NULL)
+ return -EINVAL;
+
+ err = nft_parse_register_load(ctx, tb[NFTA_DUP_SREG_ADDR], &priv->sreg_addr,
+ sizeof(struct in6_addr));
+ if (err < 0)
+ return err;
+
+ if (tb[NFTA_DUP_SREG_DEV])
+ err = nft_parse_register_load(ctx, tb[NFTA_DUP_SREG_DEV],
+ &priv->sreg_dev, sizeof(int));
+
+ return err;
+}
+
+static int nft_dup_ipv6_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
+{
+ struct nft_dup_ipv6 *priv = nft_expr_priv(expr);
+
+ if (nft_dump_register(skb, NFTA_DUP_SREG_ADDR, priv->sreg_addr))
+ goto nla_put_failure;
+ if (priv->sreg_dev &&
+ nft_dump_register(skb, NFTA_DUP_SREG_DEV, priv->sreg_dev))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static struct nft_expr_type nft_dup_ipv6_type;
+static const struct nft_expr_ops nft_dup_ipv6_ops = {
+ .type = &nft_dup_ipv6_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_dup_ipv6)),
+ .eval = nft_dup_ipv6_eval,
+ .init = nft_dup_ipv6_init,
+ .dump = nft_dup_ipv6_dump,
+ .reduce = NFT_REDUCE_READONLY,
+};
+
+static const struct nla_policy nft_dup_ipv6_policy[NFTA_DUP_MAX + 1] = {
+ [NFTA_DUP_SREG_ADDR] = { .type = NLA_U32 },
+ [NFTA_DUP_SREG_DEV] = { .type = NLA_U32 },
+};
+
+static struct nft_expr_type nft_dup_ipv6_type __read_mostly = {
+ .family = NFPROTO_IPV6,
+ .name = "dup",
+ .ops = &nft_dup_ipv6_ops,
+ .policy = nft_dup_ipv6_policy,
+ .maxattr = NFTA_DUP_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_dup_ipv6_module_init(void)
+{
+ return nft_register_expr(&nft_dup_ipv6_type);
+}
+
+static void __exit nft_dup_ipv6_module_exit(void)
+{
+ nft_unregister_expr(&nft_dup_ipv6_type);
+}
+
+module_init(nft_dup_ipv6_module_init);
+module_exit(nft_dup_ipv6_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_ALIAS_NFT_AF_EXPR(AF_INET6, "dup");
+MODULE_DESCRIPTION("IPv6 nftables packet duplication support");
diff --git a/net/ipv6/netfilter/nft_fib_ipv6.c b/net/ipv6/netfilter/nft_fib_ipv6.c
new file mode 100644
index 000000000000..421036a3605b
--- /dev/null
+++ b/net/ipv6/netfilter/nft_fib_ipv6.c
@@ -0,0 +1,288 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <linux/netfilter_ipv6.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nft_fib.h>
+
+#include <net/ip6_fib.h>
+#include <net/ip6_route.h>
+
+static int get_ifindex(const struct net_device *dev)
+{
+ return dev ? dev->ifindex : 0;
+}
+
+static int nft_fib6_flowi_init(struct flowi6 *fl6, const struct nft_fib *priv,
+ const struct nft_pktinfo *pkt,
+ const struct net_device *dev,
+ struct ipv6hdr *iph)
+{
+ int lookup_flags = 0;
+
+ if (priv->flags & NFTA_FIB_F_DADDR) {
+ fl6->daddr = iph->daddr;
+ fl6->saddr = iph->saddr;
+ } else {
+ if (nft_hook(pkt) == NF_INET_FORWARD &&
+ priv->flags & NFTA_FIB_F_IIF)
+ fl6->flowi6_iif = nft_out(pkt)->ifindex;
+
+ fl6->daddr = iph->saddr;
+ fl6->saddr = iph->daddr;
+ }
+
+ if (ipv6_addr_type(&fl6->daddr) & IPV6_ADDR_LINKLOCAL) {
+ lookup_flags |= RT6_LOOKUP_F_IFACE;
+ fl6->flowi6_oif = get_ifindex(dev ? dev : pkt->skb->dev);
+ }
+
+ if (ipv6_addr_type(&fl6->saddr) & IPV6_ADDR_UNICAST)
+ lookup_flags |= RT6_LOOKUP_F_HAS_SADDR;
+
+ if (priv->flags & NFTA_FIB_F_MARK)
+ fl6->flowi6_mark = pkt->skb->mark;
+
+ fl6->flowlabel = (*(__be32 *)iph) & IPV6_FLOWINFO_MASK;
+ fl6->flowi6_l3mdev = nft_fib_l3mdev_master_ifindex_rcu(pkt, dev);
+
+ return lookup_flags;
+}
+
+static u32 __nft_fib6_eval_type(const struct nft_fib *priv,
+ const struct nft_pktinfo *pkt,
+ struct ipv6hdr *iph)
+{
+ const struct net_device *dev = NULL;
+ int route_err, addrtype;
+ struct rt6_info *rt;
+ struct flowi6 fl6 = {
+ .flowi6_iif = LOOPBACK_IFINDEX,
+ .flowi6_proto = pkt->tprot,
+ .flowi6_uid = sock_net_uid(nft_net(pkt), NULL),
+ };
+ u32 ret = 0;
+
+ if (priv->flags & NFTA_FIB_F_IIF)
+ dev = nft_in(pkt);
+ else if (priv->flags & NFTA_FIB_F_OIF)
+ dev = nft_out(pkt);
+
+ nft_fib6_flowi_init(&fl6, priv, pkt, dev, iph);
+
+ if (dev && nf_ipv6_chk_addr(nft_net(pkt), &fl6.daddr, dev, true))
+ ret = RTN_LOCAL;
+
+ route_err = nf_ip6_route(nft_net(pkt), (struct dst_entry **)&rt,
+ flowi6_to_flowi(&fl6), false);
+ if (route_err)
+ goto err;
+
+ if (rt->rt6i_flags & RTF_REJECT) {
+ route_err = rt->dst.error;
+ dst_release(&rt->dst);
+ goto err;
+ }
+
+ if (ipv6_anycast_destination((struct dst_entry *)rt, &fl6.daddr))
+ ret = RTN_ANYCAST;
+ else if (!dev && rt->rt6i_flags & RTF_LOCAL)
+ ret = RTN_LOCAL;
+
+ dst_release(&rt->dst);
+
+ if (ret)
+ return ret;
+
+ addrtype = ipv6_addr_type(&fl6.daddr);
+
+ if (addrtype & IPV6_ADDR_MULTICAST)
+ return RTN_MULTICAST;
+ if (addrtype & IPV6_ADDR_UNICAST)
+ return RTN_UNICAST;
+
+ return RTN_UNSPEC;
+ err:
+ switch (route_err) {
+ case -EINVAL:
+ return RTN_BLACKHOLE;
+ case -EACCES:
+ return RTN_PROHIBIT;
+ case -EAGAIN:
+ return RTN_THROW;
+ default:
+ break;
+ }
+
+ return RTN_UNREACHABLE;
+}
+
+void nft_fib6_eval_type(const struct nft_expr *expr, struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_fib *priv = nft_expr_priv(expr);
+ int noff = skb_network_offset(pkt->skb);
+ u32 *dest = &regs->data[priv->dreg];
+ struct ipv6hdr *iph, _iph;
+
+ iph = skb_header_pointer(pkt->skb, noff, sizeof(_iph), &_iph);
+ if (!iph) {
+ regs->verdict.code = NFT_BREAK;
+ return;
+ }
+
+ *dest = __nft_fib6_eval_type(priv, pkt, iph);
+}
+EXPORT_SYMBOL_GPL(nft_fib6_eval_type);
+
+static bool nft_fib_v6_skip_icmpv6(const struct sk_buff *skb, u8 next, const struct ipv6hdr *iph)
+{
+ if (likely(next != IPPROTO_ICMPV6))
+ return false;
+
+ if (ipv6_addr_type(&iph->saddr) != IPV6_ADDR_ANY)
+ return false;
+
+ return ipv6_addr_type(&iph->daddr) & IPV6_ADDR_LINKLOCAL;
+}
+
+void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_fib *priv = nft_expr_priv(expr);
+ int noff = skb_network_offset(pkt->skb);
+ const struct net_device *found = NULL;
+ const struct net_device *oif = NULL;
+ u32 *dest = &regs->data[priv->dreg];
+ struct ipv6hdr *iph, _iph;
+ struct flowi6 fl6 = {
+ .flowi6_iif = LOOPBACK_IFINDEX,
+ .flowi6_proto = pkt->tprot,
+ .flowi6_uid = sock_net_uid(nft_net(pkt), NULL),
+ };
+ struct rt6_info *rt;
+ int lookup_flags;
+
+ if (nft_fib_can_skip(pkt)) {
+ nft_fib_store_result(dest, priv, nft_in(pkt));
+ return;
+ }
+
+ if (priv->flags & NFTA_FIB_F_IIF)
+ oif = nft_in(pkt);
+ else if (priv->flags & NFTA_FIB_F_OIF)
+ oif = nft_out(pkt);
+
+ iph = skb_header_pointer(pkt->skb, noff, sizeof(_iph), &_iph);
+ if (!iph) {
+ regs->verdict.code = NFT_BREAK;
+ return;
+ }
+
+ if (nft_fib_v6_skip_icmpv6(pkt->skb, pkt->tprot, iph)) {
+ nft_fib_store_result(dest, priv, nft_in(pkt));
+ return;
+ }
+
+ lookup_flags = nft_fib6_flowi_init(&fl6, priv, pkt, oif, iph);
+
+ *dest = 0;
+ rt = (void *)ip6_route_lookup(nft_net(pkt), &fl6, pkt->skb,
+ lookup_flags);
+ if (rt->dst.error)
+ goto put_rt_err;
+
+ /* Should not see RTF_LOCAL here */
+ if (rt->rt6i_flags & (RTF_REJECT | RTF_ANYCAST | RTF_LOCAL))
+ goto put_rt_err;
+
+ if (!oif) {
+ found = rt->rt6i_idev->dev;
+ } else {
+ if (oif == rt->rt6i_idev->dev ||
+ l3mdev_master_ifindex_rcu(rt->rt6i_idev->dev) == oif->ifindex)
+ found = oif;
+ }
+
+ nft_fib_store_result(dest, priv, found);
+ put_rt_err:
+ ip6_rt_put(rt);
+}
+EXPORT_SYMBOL_GPL(nft_fib6_eval);
+
+static struct nft_expr_type nft_fib6_type;
+
+static const struct nft_expr_ops nft_fib6_type_ops = {
+ .type = &nft_fib6_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_fib)),
+ .eval = nft_fib6_eval_type,
+ .init = nft_fib_init,
+ .dump = nft_fib_dump,
+ .validate = nft_fib_validate,
+ .reduce = nft_fib_reduce,
+};
+
+static const struct nft_expr_ops nft_fib6_ops = {
+ .type = &nft_fib6_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_fib)),
+ .eval = nft_fib6_eval,
+ .init = nft_fib_init,
+ .dump = nft_fib_dump,
+ .validate = nft_fib_validate,
+ .reduce = nft_fib_reduce,
+};
+
+static const struct nft_expr_ops *
+nft_fib6_select_ops(const struct nft_ctx *ctx,
+ const struct nlattr * const tb[])
+{
+ enum nft_fib_result result;
+
+ if (!tb[NFTA_FIB_RESULT])
+ return ERR_PTR(-EINVAL);
+
+ result = ntohl(nla_get_be32(tb[NFTA_FIB_RESULT]));
+
+ switch (result) {
+ case NFT_FIB_RESULT_OIF:
+ return &nft_fib6_ops;
+ case NFT_FIB_RESULT_OIFNAME:
+ return &nft_fib6_ops;
+ case NFT_FIB_RESULT_ADDRTYPE:
+ return &nft_fib6_type_ops;
+ default:
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+}
+
+static struct nft_expr_type nft_fib6_type __read_mostly = {
+ .name = "fib",
+ .select_ops = nft_fib6_select_ops,
+ .policy = nft_fib_policy,
+ .maxattr = NFTA_FIB_MAX,
+ .family = NFPROTO_IPV6,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_fib6_module_init(void)
+{
+ return nft_register_expr(&nft_fib6_type);
+}
+
+static void __exit nft_fib6_module_exit(void)
+{
+ nft_unregister_expr(&nft_fib6_type);
+}
+module_init(nft_fib6_module_init);
+module_exit(nft_fib6_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
+MODULE_ALIAS_NFT_AF_EXPR(10, "fib");
+MODULE_DESCRIPTION("nftables fib / ipv6 route lookup support");
diff --git a/net/ipv6/netfilter/nft_reject_ipv6.c b/net/ipv6/netfilter/nft_reject_ipv6.c
new file mode 100644
index 000000000000..5c61294f410e
--- /dev/null
+++ b/net/ipv6/netfilter/nft_reject_ipv6.c
@@ -0,0 +1,77 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2013 Eric Leblond <eric@regit.org>
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nft_reject.h>
+#include <net/netfilter/ipv6/nf_reject.h>
+
+static void nft_reject_ipv6_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_reject *priv = nft_expr_priv(expr);
+
+ switch (priv->type) {
+ case NFT_REJECT_ICMP_UNREACH:
+ nf_send_unreach6(nft_net(pkt), pkt->skb, priv->icmp_code,
+ nft_hook(pkt));
+ break;
+ case NFT_REJECT_TCP_RST:
+ nf_send_reset6(nft_net(pkt), nft_sk(pkt), pkt->skb,
+ nft_hook(pkt));
+ break;
+ default:
+ break;
+ }
+
+ regs->verdict.code = NF_DROP;
+}
+
+static struct nft_expr_type nft_reject_ipv6_type;
+static const struct nft_expr_ops nft_reject_ipv6_ops = {
+ .type = &nft_reject_ipv6_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_reject)),
+ .eval = nft_reject_ipv6_eval,
+ .init = nft_reject_init,
+ .dump = nft_reject_dump,
+ .validate = nft_reject_validate,
+ .reduce = NFT_REDUCE_READONLY,
+};
+
+static struct nft_expr_type nft_reject_ipv6_type __read_mostly = {
+ .family = NFPROTO_IPV6,
+ .name = "reject",
+ .ops = &nft_reject_ipv6_ops,
+ .policy = nft_reject_policy,
+ .maxattr = NFTA_REJECT_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_reject_ipv6_module_init(void)
+{
+ return nft_register_expr(&nft_reject_ipv6_type);
+}
+
+static void __exit nft_reject_ipv6_module_exit(void)
+{
+ nft_unregister_expr(&nft_reject_ipv6_type);
+}
+
+module_init(nft_reject_ipv6_module_init);
+module_exit(nft_reject_ipv6_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_AF_EXPR(AF_INET6, "reject");
+MODULE_DESCRIPTION("IPv6 packet rejection for nftables");
diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c
new file mode 100644
index 000000000000..1c9b283a4132
--- /dev/null
+++ b/net/ipv6/output_core.c
@@ -0,0 +1,161 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * IPv6 library code, needed by static components when full IPv6 support is
+ * not configured or static. These functions are needed by GSO/GRO implementation.
+ */
+#include <linux/export.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/ip6_fib.h>
+#include <net/addrconf.h>
+#include <net/secure_seq.h>
+#include <linux/netfilter.h>
+
+static u32 __ipv6_select_ident(struct net *net,
+ const struct in6_addr *dst,
+ const struct in6_addr *src)
+{
+ return get_random_u32_above(0);
+}
+
+/* This function exists only for tap drivers that must support broken
+ * clients requesting UFO without specifying an IPv6 fragment ID.
+ *
+ * This is similar to ipv6_select_ident() but we use an independent hash
+ * seed to limit information leakage.
+ *
+ * The network header must be set before calling this.
+ */
+__be32 ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb)
+{
+ struct in6_addr buf[2];
+ struct in6_addr *addrs;
+ u32 id;
+
+ addrs = skb_header_pointer(skb,
+ skb_network_offset(skb) +
+ offsetof(struct ipv6hdr, saddr),
+ sizeof(buf), buf);
+ if (!addrs)
+ return 0;
+
+ id = __ipv6_select_ident(net, &addrs[1], &addrs[0]);
+ return htonl(id);
+}
+EXPORT_SYMBOL_GPL(ipv6_proxy_select_ident);
+
+__be32 ipv6_select_ident(struct net *net,
+ const struct in6_addr *daddr,
+ const struct in6_addr *saddr)
+{
+ u32 id;
+
+ id = __ipv6_select_ident(net, daddr, saddr);
+ return htonl(id);
+}
+EXPORT_SYMBOL(ipv6_select_ident);
+
+int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
+{
+ unsigned int offset = sizeof(struct ipv6hdr);
+ unsigned int packet_len = skb_tail_pointer(skb) -
+ skb_network_header(skb);
+ int found_rhdr = 0;
+ *nexthdr = &ipv6_hdr(skb)->nexthdr;
+
+ while (offset <= packet_len) {
+ struct ipv6_opt_hdr *exthdr;
+
+ switch (**nexthdr) {
+
+ case NEXTHDR_HOP:
+ break;
+ case NEXTHDR_ROUTING:
+ found_rhdr = 1;
+ break;
+ case NEXTHDR_DEST:
+#if IS_ENABLED(CONFIG_IPV6_MIP6)
+ if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
+ break;
+#endif
+ if (found_rhdr)
+ return offset;
+ break;
+ default:
+ return offset;
+ }
+
+ if (offset + sizeof(struct ipv6_opt_hdr) > packet_len)
+ return -EINVAL;
+
+ exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
+ offset);
+ offset += ipv6_optlen(exthdr);
+ if (offset > IPV6_MAXPLEN)
+ return -EINVAL;
+ *nexthdr = &exthdr->nexthdr;
+ }
+
+ return -EINVAL;
+}
+EXPORT_SYMBOL(ip6_find_1stfragopt);
+
+#if IS_ENABLED(CONFIG_IPV6)
+int ip6_dst_hoplimit(struct dst_entry *dst)
+{
+ int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
+
+ rcu_read_lock();
+ if (hoplimit == 0) {
+ struct net_device *dev = dst_dev_rcu(dst);
+ struct inet6_dev *idev;
+
+ idev = __in6_dev_get(dev);
+ if (idev)
+ hoplimit = READ_ONCE(idev->cnf.hop_limit);
+ else
+ hoplimit = READ_ONCE(dev_net(dev)->ipv6.devconf_all->hop_limit);
+ }
+ rcu_read_unlock();
+
+ return hoplimit;
+}
+EXPORT_SYMBOL(ip6_dst_hoplimit);
+#endif
+
+int __ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ int len;
+
+ len = skb->len - sizeof(struct ipv6hdr);
+ if (len > IPV6_MAXPLEN)
+ len = 0;
+ ipv6_hdr(skb)->payload_len = htons(len);
+ IP6CB(skb)->nhoff = offsetof(struct ipv6hdr, nexthdr);
+
+ /* if egress device is enslaved to an L3 master device pass the
+ * skb to its handler for processing
+ */
+ skb = l3mdev_ip6_out(sk, skb);
+ if (unlikely(!skb))
+ return 0;
+
+ skb->protocol = htons(ETH_P_IPV6);
+
+ return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
+ net, sk, skb, NULL, skb_dst_dev(skb),
+ dst_output);
+}
+EXPORT_SYMBOL_GPL(__ip6_local_out);
+
+int ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ int err;
+
+ err = __ip6_local_out(net, sk, skb);
+ if (likely(err == 1))
+ err = dst_output(net, sk, skb);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(ip6_local_out);
diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c
new file mode 100644
index 000000000000..e4afc651731a
--- /dev/null
+++ b/net/ipv6/ping.c
@@ -0,0 +1,305 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * "Ping" sockets
+ *
+ * Based on ipv4/ping.c code.
+ *
+ * Authors: Lorenzo Colitti (IPv6 support)
+ * Vasiliy Kulikov / Openwall (IPv4 implementation, for Linux 2.6),
+ * Pavel Kankovsky (IPv4 implementation, for Linux 2.4.32)
+ */
+
+#include <net/addrconf.h>
+#include <net/ipv6.h>
+#include <net/ip6_route.h>
+#include <net/protocol.h>
+#include <net/udp.h>
+#include <net/transp_v6.h>
+#include <linux/proc_fs.h>
+#include <linux/bpf-cgroup.h>
+#include <net/ping.h>
+
+/* Compatibility glue so we can support IPv6 when it's compiled as a module */
+static int dummy_ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len,
+ int *addr_len)
+{
+ return -EAFNOSUPPORT;
+}
+static void dummy_ip6_datagram_recv_ctl(struct sock *sk, struct msghdr *msg,
+ struct sk_buff *skb)
+{
+}
+static int dummy_icmpv6_err_convert(u8 type, u8 code, int *err)
+{
+ return -EAFNOSUPPORT;
+}
+static void dummy_ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err,
+ __be16 port, u32 info, u8 *payload) {}
+static int dummy_ipv6_chk_addr(struct net *net, const struct in6_addr *addr,
+ const struct net_device *dev, int strict)
+{
+ return 0;
+}
+
+static int ping_v6_pre_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
+ int addr_len)
+{
+ /* This check is replicated from __ip6_datagram_connect() and
+ * intended to prevent BPF program called below from accessing
+ * bytes that are out of the bound specified by user in addr_len.
+ */
+
+ if (addr_len < SIN6_LEN_RFC2133)
+ return -EINVAL;
+
+ return BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr, &addr_len);
+}
+
+static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct icmp6hdr user_icmph;
+ int addr_type;
+ struct in6_addr *daddr;
+ int oif = 0;
+ struct flowi6 fl6;
+ int err;
+ struct dst_entry *dst;
+ struct rt6_info *rt;
+ struct pingfakehdr pfh;
+ struct ipcm6_cookie ipc6;
+
+ err = ping_common_sendmsg(AF_INET6, msg, len, &user_icmph,
+ sizeof(user_icmph));
+ if (err)
+ return err;
+
+ memset(&fl6, 0, sizeof(fl6));
+
+ if (msg->msg_name) {
+ DECLARE_SOCKADDR(struct sockaddr_in6 *, u, msg->msg_name);
+ if (msg->msg_namelen < sizeof(*u))
+ return -EINVAL;
+ if (u->sin6_family != AF_INET6) {
+ return -EAFNOSUPPORT;
+ }
+ daddr = &(u->sin6_addr);
+ if (inet6_test_bit(SNDFLOW, sk))
+ fl6.flowlabel = u->sin6_flowinfo & IPV6_FLOWINFO_MASK;
+ if (__ipv6_addr_needs_scope_id(ipv6_addr_type(daddr)))
+ oif = u->sin6_scope_id;
+ } else {
+ if (sk->sk_state != TCP_ESTABLISHED)
+ return -EDESTADDRREQ;
+ daddr = &sk->sk_v6_daddr;
+ fl6.flowlabel = np->flow_label;
+ }
+
+ if (!oif)
+ oif = sk->sk_bound_dev_if;
+
+ if (!oif)
+ oif = np->sticky_pktinfo.ipi6_ifindex;
+
+ if (!oif && ipv6_addr_is_multicast(daddr))
+ oif = READ_ONCE(np->mcast_oif);
+ else if (!oif)
+ oif = READ_ONCE(np->ucast_oif);
+
+ addr_type = ipv6_addr_type(daddr);
+ if ((__ipv6_addr_needs_scope_id(addr_type) && !oif) ||
+ (addr_type & IPV6_ADDR_MAPPED) ||
+ (oif && sk->sk_bound_dev_if && oif != sk->sk_bound_dev_if &&
+ l3mdev_master_ifindex_by_index(sock_net(sk), oif) != sk->sk_bound_dev_if))
+ return -EINVAL;
+
+ ipcm6_init_sk(&ipc6, sk);
+
+ fl6.flowi6_oif = oif;
+
+ if (msg->msg_controllen) {
+ struct ipv6_txoptions opt = {};
+
+ opt.tot_len = sizeof(opt);
+ ipc6.opt = &opt;
+
+ err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, &ipc6);
+ if (err < 0)
+ return err;
+
+ /* Changes to txoptions and flow info are not implemented, yet.
+ * Drop the options.
+ */
+ ipc6.opt = NULL;
+ }
+
+ fl6.flowi6_proto = IPPROTO_ICMPV6;
+ fl6.saddr = np->saddr;
+ fl6.daddr = *daddr;
+ fl6.flowi6_mark = ipc6.sockc.mark;
+ fl6.flowi6_uid = sk_uid(sk);
+ fl6.fl6_icmp_type = user_icmph.icmp6_type;
+ fl6.fl6_icmp_code = user_icmph.icmp6_code;
+ security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6));
+
+ fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel);
+
+ dst = ip6_sk_dst_lookup_flow(sk, &fl6, daddr, false);
+ if (IS_ERR(dst))
+ return PTR_ERR(dst);
+ rt = dst_rt6_info(dst);
+
+ if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr))
+ fl6.flowi6_oif = READ_ONCE(np->mcast_oif);
+ else if (!fl6.flowi6_oif)
+ fl6.flowi6_oif = READ_ONCE(np->ucast_oif);
+
+ pfh.icmph.type = user_icmph.icmp6_type;
+ pfh.icmph.code = user_icmph.icmp6_code;
+ pfh.icmph.checksum = 0;
+ pfh.icmph.un.echo.id = inet->inet_sport;
+ pfh.icmph.un.echo.sequence = user_icmph.icmp6_sequence;
+ pfh.msg = msg;
+ pfh.wcheck = 0;
+ pfh.family = AF_INET6;
+
+ if (ipc6.hlimit < 0)
+ ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst);
+
+ lock_sock(sk);
+ err = ip6_append_data(sk, ping_getfrag, &pfh, len,
+ sizeof(struct icmp6hdr), &ipc6, &fl6, rt,
+ MSG_DONTWAIT);
+
+ if (err) {
+ ICMP6_INC_STATS(sock_net(sk), rt->rt6i_idev,
+ ICMP6_MIB_OUTERRORS);
+ ip6_flush_pending_frames(sk);
+ } else {
+ icmpv6_push_pending_frames(sk, &fl6,
+ (struct icmp6hdr *)&pfh.icmph, len);
+ }
+ release_sock(sk);
+
+ dst_release(dst);
+
+ if (err)
+ return err;
+
+ return len;
+}
+
+struct proto pingv6_prot = {
+ .name = "PINGv6",
+ .owner = THIS_MODULE,
+ .init = ping_init_sock,
+ .close = ping_close,
+ .pre_connect = ping_v6_pre_connect,
+ .connect = ip6_datagram_connect_v6_only,
+ .disconnect = __udp_disconnect,
+ .setsockopt = ipv6_setsockopt,
+ .getsockopt = ipv6_getsockopt,
+ .sendmsg = ping_v6_sendmsg,
+ .recvmsg = ping_recvmsg,
+ .bind = ping_bind,
+ .backlog_rcv = ping_queue_rcv_skb,
+ .unhash = ping_unhash,
+ .get_port = ping_get_port,
+ .put_port = ping_unhash,
+ .obj_size = sizeof(struct raw6_sock),
+ .ipv6_pinfo_offset = offsetof(struct raw6_sock, inet6),
+};
+EXPORT_SYMBOL_GPL(pingv6_prot);
+
+static struct inet_protosw pingv6_protosw = {
+ .type = SOCK_DGRAM,
+ .protocol = IPPROTO_ICMPV6,
+ .prot = &pingv6_prot,
+ .ops = &inet6_sockraw_ops,
+ .flags = INET_PROTOSW_REUSE,
+};
+
+#ifdef CONFIG_PROC_FS
+static void *ping_v6_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ return ping_seq_start(seq, pos, AF_INET6);
+}
+
+static int ping_v6_seq_show(struct seq_file *seq, void *v)
+{
+ if (v == SEQ_START_TOKEN) {
+ seq_puts(seq, IPV6_SEQ_DGRAM_HEADER);
+ } else {
+ int bucket = ((struct ping_iter_state *) seq->private)->bucket;
+ struct inet_sock *inet = inet_sk((struct sock *)v);
+ __u16 srcp = ntohs(inet->inet_sport);
+ __u16 destp = ntohs(inet->inet_dport);
+ ip6_dgram_sock_seq_show(seq, v, srcp, destp, bucket);
+ }
+ return 0;
+}
+
+static const struct seq_operations ping_v6_seq_ops = {
+ .start = ping_v6_seq_start,
+ .show = ping_v6_seq_show,
+ .next = ping_seq_next,
+ .stop = ping_seq_stop,
+};
+
+static int __net_init ping_v6_proc_init_net(struct net *net)
+{
+ if (!proc_create_net("icmp6", 0444, net->proc_net, &ping_v6_seq_ops,
+ sizeof(struct ping_iter_state)))
+ return -ENOMEM;
+ return 0;
+}
+
+static void __net_exit ping_v6_proc_exit_net(struct net *net)
+{
+ remove_proc_entry("icmp6", net->proc_net);
+}
+
+static struct pernet_operations ping_v6_net_ops = {
+ .init = ping_v6_proc_init_net,
+ .exit = ping_v6_proc_exit_net,
+};
+#endif
+
+int __init pingv6_init(void)
+{
+#ifdef CONFIG_PROC_FS
+ int ret = register_pernet_subsys(&ping_v6_net_ops);
+ if (ret)
+ return ret;
+#endif
+ pingv6_ops.ipv6_recv_error = ipv6_recv_error;
+ pingv6_ops.ip6_datagram_recv_common_ctl = ip6_datagram_recv_common_ctl;
+ pingv6_ops.ip6_datagram_recv_specific_ctl =
+ ip6_datagram_recv_specific_ctl;
+ pingv6_ops.icmpv6_err_convert = icmpv6_err_convert;
+ pingv6_ops.ipv6_icmp_error = ipv6_icmp_error;
+ pingv6_ops.ipv6_chk_addr = ipv6_chk_addr;
+ return inet6_register_protosw(&pingv6_protosw);
+}
+
+/* This never gets called because it's not possible to unload the ipv6 module,
+ * but just in case.
+ */
+void pingv6_exit(void)
+{
+ pingv6_ops.ipv6_recv_error = dummy_ipv6_recv_error;
+ pingv6_ops.ip6_datagram_recv_common_ctl = dummy_ip6_datagram_recv_ctl;
+ pingv6_ops.ip6_datagram_recv_specific_ctl = dummy_ip6_datagram_recv_ctl;
+ pingv6_ops.icmpv6_err_convert = dummy_icmpv6_err_convert;
+ pingv6_ops.ipv6_icmp_error = dummy_ipv6_icmp_error;
+ pingv6_ops.ipv6_chk_addr = dummy_ipv6_chk_addr;
+#ifdef CONFIG_PROC_FS
+ unregister_pernet_subsys(&ping_v6_net_ops);
+#endif
+ inet6_unregister_protosw(&pingv6_protosw);
+}
diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c
index 35249d8487bb..73296f38c252 100644
--- a/net/ipv6/proc.c
+++ b/net/ipv6/proc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -7,60 +8,50 @@
* PROC file system. This is very similar to the IPv4 version,
* except it reports the sockets in the INET6 address family.
*
- * Version: $Id: proc.c,v 1.17 2002/02/01 22:01:04 davem Exp $
- *
* Authors: David S. Miller (davem@caip.rutgers.edu)
- * YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
+ * YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
*/
-#include <linux/sched.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/ipv6.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/stddef.h>
+#include <linux/export.h>
+#include <net/net_namespace.h>
+#include <net/ip.h>
#include <net/sock.h>
#include <net/tcp.h>
+#include <net/udp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
-#ifdef CONFIG_PROC_FS
-static struct proc_dir_entry *proc_net_devsnmp6;
-
-static int fold_prot_inuse(struct proto *proto)
-{
- int res = 0;
- int cpu;
-
- for_each_possible_cpu(cpu)
- res += proto->stats[cpu].inuse;
-
- return res;
-}
+#define MAX4(a, b, c, d) \
+ MAX_T(u32, MAX_T(u32, a, b), MAX_T(u32, c, d))
+#define SNMP_MIB_MAX MAX4(UDP_MIB_MAX, TCP_MIB_MAX, \
+ IPSTATS_MIB_MAX, ICMP_MIB_MAX)
static int sockstat6_seq_show(struct seq_file *seq, void *v)
{
+ struct net *net = seq->private;
+
seq_printf(seq, "TCP6: inuse %d\n",
- fold_prot_inuse(&tcpv6_prot));
+ sock_prot_inuse_get(net, &tcpv6_prot));
seq_printf(seq, "UDP6: inuse %d\n",
- fold_prot_inuse(&udpv6_prot));
+ sock_prot_inuse_get(net, &udpv6_prot));
seq_printf(seq, "UDPLITE6: inuse %d\n",
- fold_prot_inuse(&udplitev6_prot));
+ sock_prot_inuse_get(net, &udplitev6_prot));
seq_printf(seq, "RAW6: inuse %d\n",
- fold_prot_inuse(&rawv6_prot));
- seq_printf(seq, "FRAG6: inuse %d memory %d\n",
- ip6_frag_nqueues, atomic_read(&ip6_frag_mem));
+ sock_prot_inuse_get(net, &rawv6_prot));
+ seq_printf(seq, "FRAG6: inuse %u memory %lu\n",
+ atomic_read(&net->ipv6.fqdir->rhashtable.nelems),
+ frag_mem_limit(net->ipv6.fqdir));
return 0;
}
-static struct snmp_mib snmp6_ipstats_list[] = {
+static const struct snmp_mib snmp6_ipstats_list[] = {
/* ipv6 mib according to RFC 2465 */
- SNMP_MIB_ITEM("Ip6InReceives", IPSTATS_MIB_INRECEIVES),
+ SNMP_MIB_ITEM("Ip6InReceives", IPSTATS_MIB_INPKTS),
SNMP_MIB_ITEM("Ip6InHdrErrors", IPSTATS_MIB_INHDRERRORS),
SNMP_MIB_ITEM("Ip6InTooBigErrors", IPSTATS_MIB_INTOOBIGERRORS),
SNMP_MIB_ITEM("Ip6InNoRoutes", IPSTATS_MIB_INNOROUTES),
@@ -82,237 +73,258 @@ static struct snmp_mib snmp6_ipstats_list[] = {
SNMP_MIB_ITEM("Ip6FragCreates", IPSTATS_MIB_FRAGCREATES),
SNMP_MIB_ITEM("Ip6InMcastPkts", IPSTATS_MIB_INMCASTPKTS),
SNMP_MIB_ITEM("Ip6OutMcastPkts", IPSTATS_MIB_OUTMCASTPKTS),
- SNMP_MIB_SENTINEL
+ SNMP_MIB_ITEM("Ip6InOctets", IPSTATS_MIB_INOCTETS),
+ SNMP_MIB_ITEM("Ip6OutOctets", IPSTATS_MIB_OUTOCTETS),
+ SNMP_MIB_ITEM("Ip6InMcastOctets", IPSTATS_MIB_INMCASTOCTETS),
+ SNMP_MIB_ITEM("Ip6OutMcastOctets", IPSTATS_MIB_OUTMCASTOCTETS),
+ SNMP_MIB_ITEM("Ip6InBcastOctets", IPSTATS_MIB_INBCASTOCTETS),
+ SNMP_MIB_ITEM("Ip6OutBcastOctets", IPSTATS_MIB_OUTBCASTOCTETS),
+ /* IPSTATS_MIB_CSUMERRORS is not relevant in IPv6 (no checksum) */
+ SNMP_MIB_ITEM("Ip6InNoECTPkts", IPSTATS_MIB_NOECTPKTS),
+ SNMP_MIB_ITEM("Ip6InECT1Pkts", IPSTATS_MIB_ECT1PKTS),
+ SNMP_MIB_ITEM("Ip6InECT0Pkts", IPSTATS_MIB_ECT0PKTS),
+ SNMP_MIB_ITEM("Ip6InCEPkts", IPSTATS_MIB_CEPKTS),
+ SNMP_MIB_ITEM("Ip6OutTransmits", IPSTATS_MIB_OUTPKTS),
};
-static struct snmp_mib snmp6_icmp6_list[] = {
-/* icmpv6 mib according to RFC 2466
-
- Exceptions: {In|Out}AdminProhibs are removed, because I see
- no good reasons to account them separately
- of another dest.unreachs.
- OutErrs is zero identically.
- OutEchos too.
- OutRouterAdvertisements too.
- OutGroupMembQueries too.
- */
+static const struct snmp_mib snmp6_icmp6_list[] = {
+/* icmpv6 mib according to RFC 2466 */
SNMP_MIB_ITEM("Icmp6InMsgs", ICMP6_MIB_INMSGS),
SNMP_MIB_ITEM("Icmp6InErrors", ICMP6_MIB_INERRORS),
- SNMP_MIB_ITEM("Icmp6InDestUnreachs", ICMP6_MIB_INDESTUNREACHS),
- SNMP_MIB_ITEM("Icmp6InPktTooBigs", ICMP6_MIB_INPKTTOOBIGS),
- SNMP_MIB_ITEM("Icmp6InTimeExcds", ICMP6_MIB_INTIMEEXCDS),
- SNMP_MIB_ITEM("Icmp6InParmProblems", ICMP6_MIB_INPARMPROBLEMS),
- SNMP_MIB_ITEM("Icmp6InEchos", ICMP6_MIB_INECHOS),
- SNMP_MIB_ITEM("Icmp6InEchoReplies", ICMP6_MIB_INECHOREPLIES),
- SNMP_MIB_ITEM("Icmp6InGroupMembQueries", ICMP6_MIB_INGROUPMEMBQUERIES),
- SNMP_MIB_ITEM("Icmp6InGroupMembResponses", ICMP6_MIB_INGROUPMEMBRESPONSES),
- SNMP_MIB_ITEM("Icmp6InGroupMembReductions", ICMP6_MIB_INGROUPMEMBREDUCTIONS),
- SNMP_MIB_ITEM("Icmp6InRouterSolicits", ICMP6_MIB_INROUTERSOLICITS),
- SNMP_MIB_ITEM("Icmp6InRouterAdvertisements", ICMP6_MIB_INROUTERADVERTISEMENTS),
- SNMP_MIB_ITEM("Icmp6InNeighborSolicits", ICMP6_MIB_INNEIGHBORSOLICITS),
- SNMP_MIB_ITEM("Icmp6InNeighborAdvertisements", ICMP6_MIB_INNEIGHBORADVERTISEMENTS),
- SNMP_MIB_ITEM("Icmp6InRedirects", ICMP6_MIB_INREDIRECTS),
SNMP_MIB_ITEM("Icmp6OutMsgs", ICMP6_MIB_OUTMSGS),
- SNMP_MIB_ITEM("Icmp6OutDestUnreachs", ICMP6_MIB_OUTDESTUNREACHS),
- SNMP_MIB_ITEM("Icmp6OutPktTooBigs", ICMP6_MIB_OUTPKTTOOBIGS),
- SNMP_MIB_ITEM("Icmp6OutTimeExcds", ICMP6_MIB_OUTTIMEEXCDS),
- SNMP_MIB_ITEM("Icmp6OutParmProblems", ICMP6_MIB_OUTPARMPROBLEMS),
- SNMP_MIB_ITEM("Icmp6OutEchoReplies", ICMP6_MIB_OUTECHOREPLIES),
- SNMP_MIB_ITEM("Icmp6OutRouterSolicits", ICMP6_MIB_OUTROUTERSOLICITS),
- SNMP_MIB_ITEM("Icmp6OutNeighborSolicits", ICMP6_MIB_OUTNEIGHBORSOLICITS),
- SNMP_MIB_ITEM("Icmp6OutNeighborAdvertisements", ICMP6_MIB_OUTNEIGHBORADVERTISEMENTS),
- SNMP_MIB_ITEM("Icmp6OutRedirects", ICMP6_MIB_OUTREDIRECTS),
- SNMP_MIB_ITEM("Icmp6OutGroupMembResponses", ICMP6_MIB_OUTGROUPMEMBRESPONSES),
- SNMP_MIB_ITEM("Icmp6OutGroupMembReductions", ICMP6_MIB_OUTGROUPMEMBREDUCTIONS),
- SNMP_MIB_SENTINEL
+ SNMP_MIB_ITEM("Icmp6OutErrors", ICMP6_MIB_OUTERRORS),
+ SNMP_MIB_ITEM("Icmp6InCsumErrors", ICMP6_MIB_CSUMERRORS),
+/* ICMP6_MIB_RATELIMITHOST needs to be last, see snmp6_dev_seq_show(). */
+ SNMP_MIB_ITEM("Icmp6OutRateLimitHost", ICMP6_MIB_RATELIMITHOST),
};
-static struct snmp_mib snmp6_udp6_list[] = {
+static const struct snmp_mib snmp6_udp6_list[] = {
SNMP_MIB_ITEM("Udp6InDatagrams", UDP_MIB_INDATAGRAMS),
SNMP_MIB_ITEM("Udp6NoPorts", UDP_MIB_NOPORTS),
SNMP_MIB_ITEM("Udp6InErrors", UDP_MIB_INERRORS),
SNMP_MIB_ITEM("Udp6OutDatagrams", UDP_MIB_OUTDATAGRAMS),
- SNMP_MIB_SENTINEL
+ SNMP_MIB_ITEM("Udp6RcvbufErrors", UDP_MIB_RCVBUFERRORS),
+ SNMP_MIB_ITEM("Udp6SndbufErrors", UDP_MIB_SNDBUFERRORS),
+ SNMP_MIB_ITEM("Udp6InCsumErrors", UDP_MIB_CSUMERRORS),
+ SNMP_MIB_ITEM("Udp6IgnoredMulti", UDP_MIB_IGNOREDMULTI),
+ SNMP_MIB_ITEM("Udp6MemErrors", UDP_MIB_MEMERRORS),
};
-static struct snmp_mib snmp6_udplite6_list[] = {
+static const struct snmp_mib snmp6_udplite6_list[] = {
SNMP_MIB_ITEM("UdpLite6InDatagrams", UDP_MIB_INDATAGRAMS),
SNMP_MIB_ITEM("UdpLite6NoPorts", UDP_MIB_NOPORTS),
SNMP_MIB_ITEM("UdpLite6InErrors", UDP_MIB_INERRORS),
SNMP_MIB_ITEM("UdpLite6OutDatagrams", UDP_MIB_OUTDATAGRAMS),
- SNMP_MIB_SENTINEL
+ SNMP_MIB_ITEM("UdpLite6RcvbufErrors", UDP_MIB_RCVBUFERRORS),
+ SNMP_MIB_ITEM("UdpLite6SndbufErrors", UDP_MIB_SNDBUFERRORS),
+ SNMP_MIB_ITEM("UdpLite6InCsumErrors", UDP_MIB_CSUMERRORS),
+ SNMP_MIB_ITEM("UdpLite6MemErrors", UDP_MIB_MEMERRORS),
};
-static unsigned long
-fold_field(void *mib[], int offt)
+static void snmp6_seq_show_icmpv6msg(struct seq_file *seq, atomic_long_t *smib)
{
- unsigned long res = 0;
- int i;
-
- for_each_possible_cpu(i) {
- res += *(((unsigned long *)per_cpu_ptr(mib[0], i)) + offt);
- res += *(((unsigned long *)per_cpu_ptr(mib[1], i)) + offt);
- }
- return res;
+ char name[32];
+ int i;
+
+ /* print by name -- deprecated items */
+ for (i = 0; i < ICMP6MSG_MIB_MAX; i++) {
+ const char *p = NULL;
+ int icmptype;
+
+#define CASE(TYP, STR) case TYP: p = STR; break;
+
+ icmptype = i & 0xff;
+ switch (icmptype) {
+/* RFC 4293 v6 ICMPMsgStatsTable; named items for RFC 2466 compatibility */
+ CASE(ICMPV6_DEST_UNREACH, "DestUnreachs")
+ CASE(ICMPV6_PKT_TOOBIG, "PktTooBigs")
+ CASE(ICMPV6_TIME_EXCEED, "TimeExcds")
+ CASE(ICMPV6_PARAMPROB, "ParmProblems")
+ CASE(ICMPV6_ECHO_REQUEST, "Echos")
+ CASE(ICMPV6_ECHO_REPLY, "EchoReplies")
+ CASE(ICMPV6_MGM_QUERY, "GroupMembQueries")
+ CASE(ICMPV6_MGM_REPORT, "GroupMembResponses")
+ CASE(ICMPV6_MGM_REDUCTION, "GroupMembReductions")
+ CASE(ICMPV6_MLD2_REPORT, "MLDv2Reports")
+ CASE(NDISC_ROUTER_ADVERTISEMENT, "RouterAdvertisements")
+ CASE(NDISC_ROUTER_SOLICITATION, "RouterSolicits")
+ CASE(NDISC_NEIGHBOUR_ADVERTISEMENT, "NeighborAdvertisements")
+ CASE(NDISC_NEIGHBOUR_SOLICITATION, "NeighborSolicits")
+ CASE(NDISC_REDIRECT, "Redirects")
+ }
+#undef CASE
+ if (!p) /* don't print un-named types here */
+ continue;
+ snprintf(name, sizeof(name), "Icmp6%s%s",
+ i & 0x100 ? "Out" : "In", p);
+ seq_printf(seq, "%-32s\t%lu\n", name,
+ atomic_long_read(smib + i));
+ }
+
+ /* print by number (nonzero only) - ICMPMsgStat format */
+ for (i = 0; i < ICMP6MSG_MIB_MAX; i++) {
+ unsigned long val;
+
+ val = atomic_long_read(smib + i);
+ if (!val)
+ continue;
+ snprintf(name, sizeof(name), "Icmp6%sType%u",
+ i & 0x100 ? "Out" : "In", i & 0xff);
+ seq_printf(seq, "%-32s\t%lu\n", name, val);
+ }
}
-static inline void
-snmp6_seq_show_item(struct seq_file *seq, void **mib, struct snmp_mib *itemlist)
+/* can be called either with percpu mib (pcpumib != NULL),
+ * or shared one (smib != NULL)
+ */
+static void snmp6_seq_show_item(struct seq_file *seq, void __percpu *pcpumib,
+ atomic_long_t *smib,
+ const struct snmp_mib *itemlist,
+ int cnt)
{
+ unsigned long buff[SNMP_MIB_MAX];
int i;
- for (i=0; itemlist[i].name; i++)
- seq_printf(seq, "%-32s\t%lu\n", itemlist[i].name,
- fold_field(mib, itemlist[i].entry));
-}
-static int snmp6_seq_show(struct seq_file *seq, void *v)
-{
- struct inet6_dev *idev = (struct inet6_dev *)seq->private;
+ if (pcpumib) {
+ memset(buff, 0, sizeof(unsigned long) * cnt);
- if (idev) {
- seq_printf(seq, "%-32s\t%u\n", "ifIndex", idev->dev->ifindex);
- snmp6_seq_show_item(seq, (void **)idev->stats.ipv6, snmp6_ipstats_list);
- snmp6_seq_show_item(seq, (void **)idev->stats.icmpv6, snmp6_icmp6_list);
+ snmp_get_cpu_field_batch_cnt(buff, itemlist, cnt, pcpumib);
+ for (i = 0; i < cnt; i++)
+ seq_printf(seq, "%-32s\t%lu\n",
+ itemlist[i].name, buff[i]);
} else {
- snmp6_seq_show_item(seq, (void **)ipv6_statistics, snmp6_ipstats_list);
- snmp6_seq_show_item(seq, (void **)icmpv6_statistics, snmp6_icmp6_list);
- snmp6_seq_show_item(seq, (void **)udp_stats_in6, snmp6_udp6_list);
- snmp6_seq_show_item(seq, (void **)udplite_stats_in6, snmp6_udplite6_list);
+ for (i = 0; i < cnt; i++)
+ seq_printf(seq, "%-32s\t%lu\n", itemlist[i].name,
+ atomic_long_read(smib + itemlist[i].entry));
}
- return 0;
}
-static int sockstat6_seq_open(struct inode *inode, struct file *file)
+static void snmp6_seq_show_item64(struct seq_file *seq, void __percpu *mib,
+ const struct snmp_mib *itemlist,
+ int cnt, size_t syncpoff)
{
- return single_open(file, sockstat6_seq_show, NULL);
-}
+ u64 buff64[SNMP_MIB_MAX];
+ int i;
-static struct file_operations sockstat6_seq_fops = {
- .owner = THIS_MODULE,
- .open = sockstat6_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
+ memset(buff64, 0, sizeof(u64) * cnt);
-static int snmp6_seq_open(struct inode *inode, struct file *file)
+ snmp_get_cpu_field64_batch_cnt(buff64, itemlist, cnt, mib, syncpoff);
+ for (i = 0; i < cnt; i++)
+ seq_printf(seq, "%-32s\t%llu\n", itemlist[i].name, buff64[i]);
+}
+
+static int snmp6_seq_show(struct seq_file *seq, void *v)
{
- return single_open(file, snmp6_seq_show, PDE(inode)->data);
+ struct net *net = (struct net *)seq->private;
+
+ snmp6_seq_show_item64(seq, net->mib.ipv6_statistics,
+ snmp6_ipstats_list,
+ ARRAY_SIZE(snmp6_ipstats_list),
+ offsetof(struct ipstats_mib, syncp));
+ snmp6_seq_show_item(seq, net->mib.icmpv6_statistics,
+ NULL, snmp6_icmp6_list,
+ ARRAY_SIZE(snmp6_icmp6_list));
+ snmp6_seq_show_icmpv6msg(seq, net->mib.icmpv6msg_statistics->mibs);
+ snmp6_seq_show_item(seq, net->mib.udp_stats_in6,
+ NULL, snmp6_udp6_list,
+ ARRAY_SIZE(snmp6_udp6_list));
+ snmp6_seq_show_item(seq, net->mib.udplite_stats_in6,
+ NULL, snmp6_udplite6_list,
+ ARRAY_SIZE(snmp6_udplite6_list));
+ return 0;
}
-static struct file_operations snmp6_seq_fops = {
- .owner = THIS_MODULE,
- .open = snmp6_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
+static int snmp6_dev_seq_show(struct seq_file *seq, void *v)
+{
+ struct inet6_dev *idev = (struct inet6_dev *)seq->private;
+
+ seq_printf(seq, "%-32s\t%u\n", "ifIndex", idev->dev->ifindex);
+ snmp6_seq_show_item64(seq, idev->stats.ipv6,
+ snmp6_ipstats_list,
+ ARRAY_SIZE(snmp6_ipstats_list),
+ offsetof(struct ipstats_mib, syncp));
+
+ /* Per idev icmp stats do not have ICMP6_MIB_RATELIMITHOST */
+ snmp6_seq_show_item(seq, NULL, idev->stats.icmpv6dev->mibs,
+ snmp6_icmp6_list, ARRAY_SIZE(snmp6_icmp6_list) - 1);
+
+ snmp6_seq_show_icmpv6msg(seq, idev->stats.icmpv6msgdev->mibs);
+ return 0;
+}
int snmp6_register_dev(struct inet6_dev *idev)
{
struct proc_dir_entry *p;
+ struct net *net;
if (!idev || !idev->dev)
return -EINVAL;
- if (!proc_net_devsnmp6)
+ net = dev_net(idev->dev);
+ if (!net->mib.proc_net_devsnmp6)
return -ENOENT;
- p = create_proc_entry(idev->dev->name, S_IRUGO, proc_net_devsnmp6);
+ p = proc_create_single_data(idev->dev->name, 0444,
+ net->mib.proc_net_devsnmp6, snmp6_dev_seq_show, idev);
if (!p)
return -ENOMEM;
- p->data = idev;
- p->proc_fops = &snmp6_seq_fops;
-
idev->stats.proc_dir_entry = p;
return 0;
}
int snmp6_unregister_dev(struct inet6_dev *idev)
{
- if (!proc_net_devsnmp6)
+ struct net *net = dev_net(idev->dev);
+ if (!net->mib.proc_net_devsnmp6)
return -ENOENT;
- if (!idev || !idev->stats.proc_dir_entry)
+ if (!idev->stats.proc_dir_entry)
return -EINVAL;
- remove_proc_entry(idev->stats.proc_dir_entry->name,
- proc_net_devsnmp6);
+ proc_remove(idev->stats.proc_dir_entry);
+ idev->stats.proc_dir_entry = NULL;
return 0;
}
-int __init ipv6_misc_proc_init(void)
+static int __net_init ipv6_proc_init_net(struct net *net)
{
- int rc = 0;
+ if (!proc_create_net_single("sockstat6", 0444, net->proc_net,
+ sockstat6_seq_show, NULL))
+ return -ENOMEM;
- if (!proc_net_fops_create("snmp6", S_IRUGO, &snmp6_seq_fops))
+ if (!proc_create_net_single("snmp6", 0444, net->proc_net,
+ snmp6_seq_show, NULL))
goto proc_snmp6_fail;
- proc_net_devsnmp6 = proc_mkdir("dev_snmp6", proc_net);
- if (!proc_net_devsnmp6)
+ net->mib.proc_net_devsnmp6 = proc_mkdir("dev_snmp6", net->proc_net);
+ if (!net->mib.proc_net_devsnmp6)
goto proc_dev_snmp6_fail;
+ return 0;
- if (!proc_net_fops_create("sockstat6", S_IRUGO, &sockstat6_seq_fops))
- goto proc_sockstat6_fail;
-out:
- return rc;
-
-proc_sockstat6_fail:
- proc_net_remove("dev_snmp6");
proc_dev_snmp6_fail:
- proc_net_remove("snmp6");
+ remove_proc_entry("snmp6", net->proc_net);
proc_snmp6_fail:
- rc = -ENOMEM;
- goto out;
+ remove_proc_entry("sockstat6", net->proc_net);
+ return -ENOMEM;
}
-void ipv6_misc_proc_exit(void)
+static void __net_exit ipv6_proc_exit_net(struct net *net)
{
- proc_net_remove("sockstat6");
- proc_net_remove("dev_snmp6");
- proc_net_remove("snmp6");
+ remove_proc_entry("sockstat6", net->proc_net);
+ remove_proc_entry("dev_snmp6", net->proc_net);
+ remove_proc_entry("snmp6", net->proc_net);
}
-#else /* CONFIG_PROC_FS */
-
-
-int snmp6_register_dev(struct inet6_dev *idev)
-{
- return 0;
-}
-
-int snmp6_unregister_dev(struct inet6_dev *idev)
-{
- return 0;
-}
-#endif /* CONFIG_PROC_FS */
+static struct pernet_operations ipv6_proc_ops = {
+ .init = ipv6_proc_init_net,
+ .exit = ipv6_proc_exit_net,
+};
-int snmp6_alloc_dev(struct inet6_dev *idev)
+int __init ipv6_misc_proc_init(void)
{
- int err = -ENOMEM;
-
- if (!idev || !idev->dev)
- return -EINVAL;
-
- if (snmp6_mib_init((void **)idev->stats.ipv6, sizeof(struct ipstats_mib),
- __alignof__(struct ipstats_mib)) < 0)
- goto err_ip;
- if (snmp6_mib_init((void **)idev->stats.icmpv6, sizeof(struct icmpv6_mib),
- __alignof__(struct icmpv6_mib)) < 0)
- goto err_icmp;
-
- return 0;
-
-err_icmp:
- snmp6_mib_free((void **)idev->stats.ipv6);
-err_ip:
- return err;
+ return register_pernet_subsys(&ipv6_proc_ops);
}
-int snmp6_free_dev(struct inet6_dev *idev)
+void ipv6_misc_proc_exit(void)
{
- snmp6_mib_free((void **)idev->stats.icmpv6);
- snmp6_mib_free((void **)idev->stats.ipv6);
- return 0;
+ unregister_pernet_subsys(&ipv6_proc_ops);
}
-
-
diff --git a/net/ipv6/protocol.c b/net/ipv6/protocol.c
index 52c1d58b6ca6..d4b1806bab1b 100644
--- a/net/ipv6/protocol.c
+++ b/net/ipv6/protocol.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -5,14 +6,7 @@
*
* PF_INET6 protocol dispatch tables.
*
- * Version: $Id: protocol.c,v 1.10 2001/05/18 02:25:49 davem Exp $
- *
* Authors: Pedro Roque <roque@di.fc.ul.pt>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
/*
@@ -22,65 +16,55 @@
* - Removed unused variable 'inet6_protocol_base'
* - Modified inet6_del_protocol() to correctly maintain copy bit.
*/
-
-#include <linux/errno.h>
-#include <linux/types.h>
-#include <linux/socket.h>
-#include <linux/sockios.h>
-#include <linux/sched.h>
-#include <linux/net.h>
-#include <linux/in6.h>
+#include <linux/module.h>
#include <linux/netdevice.h>
-#include <linux/if_arp.h>
-
-#include <net/sock.h>
-#include <net/snmp.h>
-
-#include <net/ipv6.h>
+#include <linux/spinlock.h>
#include <net/protocol.h>
-struct inet6_protocol *inet6_protos[MAX_INET_PROTOS];
-static DEFINE_SPINLOCK(inet6_proto_lock);
-
+#if IS_ENABLED(CONFIG_IPV6)
+struct inet6_protocol __rcu *inet6_protos[MAX_INET_PROTOS] __read_mostly;
+EXPORT_SYMBOL(inet6_protos);
-int inet6_add_protocol(struct inet6_protocol *prot, unsigned char protocol)
+int inet6_add_protocol(const struct inet6_protocol *prot, unsigned char protocol)
{
- int ret, hash = protocol & (MAX_INET_PROTOS - 1);
+ return !cmpxchg((const struct inet6_protocol **)&inet6_protos[protocol],
+ NULL, prot) ? 0 : -1;
+}
+EXPORT_SYMBOL(inet6_add_protocol);
- spin_lock_bh(&inet6_proto_lock);
+int inet6_del_protocol(const struct inet6_protocol *prot, unsigned char protocol)
+{
+ int ret;
- if (inet6_protos[hash]) {
- ret = -1;
- } else {
- inet6_protos[hash] = prot;
- ret = 0;
- }
+ ret = (cmpxchg((const struct inet6_protocol **)&inet6_protos[protocol],
+ prot, NULL) == prot) ? 0 : -1;
- spin_unlock_bh(&inet6_proto_lock);
+ synchronize_net();
return ret;
}
+EXPORT_SYMBOL(inet6_del_protocol);
+#endif
-/*
- * Remove a protocol from the hash tables.
- */
-
-int inet6_del_protocol(struct inet6_protocol *prot, unsigned char protocol)
-{
- int ret, hash = protocol & (MAX_INET_PROTOS - 1);
+const struct net_offload __rcu *inet6_offloads[MAX_INET_PROTOS] __read_mostly;
+EXPORT_SYMBOL(inet6_offloads);
- spin_lock_bh(&inet6_proto_lock);
+int inet6_add_offload(const struct net_offload *prot, unsigned char protocol)
+{
+ return !cmpxchg((const struct net_offload **)&inet6_offloads[protocol],
+ NULL, prot) ? 0 : -1;
+}
+EXPORT_SYMBOL(inet6_add_offload);
- if (inet6_protos[hash] != prot) {
- ret = -1;
- } else {
- inet6_protos[hash] = NULL;
- ret = 0;
- }
+int inet6_del_offload(const struct net_offload *prot, unsigned char protocol)
+{
+ int ret;
- spin_unlock_bh(&inet6_proto_lock);
+ ret = (cmpxchg((const struct net_offload **)&inet6_offloads[protocol],
+ prot, NULL) == prot) ? 0 : -1;
synchronize_net();
return ret;
}
+EXPORT_SYMBOL(inet6_del_offload);
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index c2e629d6aea4..b4cd05dba9b6 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -1,30 +1,24 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* RAW sockets for IPv6
- * Linux INET6 implementation
+ * Linux INET6 implementation
*
* Authors:
- * Pedro Roque <roque@di.fc.ul.pt>
+ * Pedro Roque <roque@di.fc.ul.pt>
*
* Adapted from linux/net/ipv4/raw.c
*
- * $Id: raw.c,v 1.51 2002/02/01 22:01:04 davem Exp $
- *
* Fixes:
* Hideaki YOSHIFUJI : sin6_scope_id support
- * YOSHIFUJI,H.@USAGI : raw checksum (RFC2292(bis) compliance)
+ * YOSHIFUJI,H.@USAGI : raw checksum (RFC2292(bis) compliance)
* Kazunori MIYAZAWA @USAGI: change process style to use ip6_append_data
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
+#include <linux/slab.h>
#include <linux/sockios.h>
-#include <linux/sched.h>
#include <linux/net.h>
#include <linux/in6.h>
#include <linux/netdevice.h>
@@ -33,9 +27,11 @@
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>
#include <linux/skbuff.h>
-#include <asm/uaccess.h>
+#include <linux/compat.h>
+#include <linux/uaccess.h>
#include <asm/ioctls.h>
+#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/sock.h>
#include <net/snmp.h>
@@ -50,93 +46,90 @@
#include <net/udp.h>
#include <net/inet_common.h>
#include <net/tcp_states.h>
-#ifdef CONFIG_IPV6_MIP6
+#if IS_ENABLED(CONFIG_IPV6_MIP6)
#include <net/mip6.h>
#endif
+#include <linux/mroute6.h>
+#include <net/raw.h>
#include <net/rawv6.h>
#include <net/xfrm.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
+#include <linux/export.h>
-struct hlist_head raw_v6_htable[RAWV6_HTABLE_SIZE];
-DEFINE_RWLOCK(raw_v6_lock);
-
-static void raw_v6_hash(struct sock *sk)
-{
- struct hlist_head *list = &raw_v6_htable[inet_sk(sk)->num &
- (RAWV6_HTABLE_SIZE - 1)];
-
- write_lock_bh(&raw_v6_lock);
- sk_add_node(sk, list);
- sock_prot_inc_use(sk->sk_prot);
- write_unlock_bh(&raw_v6_lock);
-}
-
-static void raw_v6_unhash(struct sock *sk)
-{
- write_lock_bh(&raw_v6_lock);
- if (sk_del_node_init(sk))
- sock_prot_dec_use(sk->sk_prot);
- write_unlock_bh(&raw_v6_lock);
-}
+#define ICMPV6_HDRLEN 4 /* ICMPv6 header, RFC 4443 Section 2.1 */
+struct raw_hashinfo raw_v6_hashinfo;
+EXPORT_SYMBOL_GPL(raw_v6_hashinfo);
-/* Grumble... icmp and ip_input want to get at this... */
-struct sock *__raw_v6_lookup(struct sock *sk, unsigned short num,
- struct in6_addr *loc_addr, struct in6_addr *rmt_addr,
- int dif)
+bool raw_v6_match(struct net *net, const struct sock *sk, unsigned short num,
+ const struct in6_addr *loc_addr,
+ const struct in6_addr *rmt_addr, int dif, int sdif)
{
- struct hlist_node *node;
- int is_multicast = ipv6_addr_is_multicast(loc_addr);
-
- sk_for_each_from(sk, node)
- if (inet_sk(sk)->num == num) {
- struct ipv6_pinfo *np = inet6_sk(sk);
-
- if (!ipv6_addr_any(&np->daddr) &&
- !ipv6_addr_equal(&np->daddr, rmt_addr))
- continue;
-
- if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)
- continue;
-
- if (!ipv6_addr_any(&np->rcv_saddr)) {
- if (ipv6_addr_equal(&np->rcv_saddr, loc_addr))
- goto found;
- if (is_multicast &&
- inet6_mc_check(sk, loc_addr, rmt_addr))
- goto found;
- continue;
- }
- goto found;
- }
- sk = NULL;
-found:
- return sk;
+ if (inet_sk(sk)->inet_num != num ||
+ !net_eq(sock_net(sk), net) ||
+ (!ipv6_addr_any(&sk->sk_v6_daddr) &&
+ !ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr)) ||
+ !raw_sk_bound_dev_eq(net, sk->sk_bound_dev_if,
+ dif, sdif))
+ return false;
+
+ if (ipv6_addr_any(&sk->sk_v6_rcv_saddr) ||
+ ipv6_addr_equal(&sk->sk_v6_rcv_saddr, loc_addr) ||
+ (ipv6_addr_is_multicast(loc_addr) &&
+ inet6_mc_check(sk, loc_addr, rmt_addr)))
+ return true;
+
+ return false;
}
+EXPORT_SYMBOL_GPL(raw_v6_match);
/*
* 0 - deliver
* 1 - block
*/
-static __inline__ int icmpv6_filter(struct sock *sk, struct sk_buff *skb)
+static int icmpv6_filter(const struct sock *sk, const struct sk_buff *skb)
{
- struct icmp6hdr *icmph;
- struct raw6_sock *rp = raw6_sk(sk);
-
- if (pskb_may_pull(skb, sizeof(struct icmp6hdr))) {
- __u32 *data = &rp->filter.data[0];
- int bit_nr;
+ struct icmp6hdr _hdr;
+ const struct icmp6hdr *hdr;
- icmph = (struct icmp6hdr *) skb->data;
- bit_nr = icmph->icmp6_type;
+ /* We require only the four bytes of the ICMPv6 header, not any
+ * additional bytes of message body in "struct icmp6hdr".
+ */
+ hdr = skb_header_pointer(skb, skb_transport_offset(skb),
+ ICMPV6_HDRLEN, &_hdr);
+ if (hdr) {
+ const __u32 *data = &raw6_sk(sk)->filter.data[0];
+ unsigned int type = hdr->icmp6_type;
- return (data[bit_nr >> 5] & (1 << (bit_nr & 31))) != 0;
+ return (data[type >> 5] & (1U << (type & 31))) != 0;
}
+ return 1;
+}
+
+#if IS_ENABLED(CONFIG_IPV6_MIP6)
+typedef int mh_filter_t(struct sock *sock, struct sk_buff *skb);
+
+static mh_filter_t __rcu *mh_filter __read_mostly;
+
+int rawv6_mh_filter_register(mh_filter_t filter)
+{
+ rcu_assign_pointer(mh_filter, filter);
+ return 0;
+}
+EXPORT_SYMBOL(rawv6_mh_filter_register);
+
+int rawv6_mh_filter_unregister(mh_filter_t filter)
+{
+ RCU_INIT_POINTER(mh_filter, NULL);
+ synchronize_rcu();
return 0;
}
+EXPORT_SYMBOL(rawv6_mh_filter_unregister);
+
+#endif
/*
* demultiplex raw sockets.
@@ -145,50 +138,56 @@ static __inline__ int icmpv6_filter(struct sock *sk, struct sk_buff *skb)
*
* Caller owns SKB so we must make clones.
*/
-int ipv6_raw_deliver(struct sk_buff *skb, int nexthdr)
+static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr)
{
- struct in6_addr *saddr;
- struct in6_addr *daddr;
+ struct net *net = dev_net(skb->dev);
+ const struct in6_addr *saddr;
+ const struct in6_addr *daddr;
+ struct hlist_head *hlist;
struct sock *sk;
- int delivered = 0;
+ bool delivered = false;
__u8 hash;
- saddr = &skb->nh.ipv6h->saddr;
+ saddr = &ipv6_hdr(skb)->saddr;
daddr = saddr + 1;
- hash = nexthdr & (MAX_INET_PROTOS - 1);
-
- read_lock(&raw_v6_lock);
- sk = sk_head(&raw_v6_htable[hash]);
-
- /*
- * The first socket found will be delivered after
- * delivery to transport protocols.
- */
-
- if (sk == NULL)
- goto out;
+ hash = raw_hashfunc(net, nexthdr);
+ hlist = &raw_v6_hashinfo.ht[hash];
+ rcu_read_lock();
+ sk_for_each_rcu(sk, hlist) {
+ int filtered;
- sk = __raw_v6_lookup(sk, nexthdr, daddr, saddr, IP6CB(skb)->iif);
+ if (!raw_v6_match(net, sk, nexthdr, daddr, saddr,
+ inet6_iif(skb), inet6_sdif(skb)))
+ continue;
- while (sk) {
- int filtered;
+ if (atomic_read(&sk->sk_rmem_alloc) >=
+ READ_ONCE(sk->sk_rcvbuf)) {
+ sk_drops_inc(sk);
+ continue;
+ }
- delivered = 1;
+ delivered = true;
switch (nexthdr) {
case IPPROTO_ICMPV6:
filtered = icmpv6_filter(sk, skb);
break;
-#ifdef CONFIG_IPV6_MIP6
+
+#if IS_ENABLED(CONFIG_IPV6_MIP6)
case IPPROTO_MH:
+ {
/* XXX: To validate MH only once for each packet,
* this is placed here. It should be after checking
* xfrm policy, however it doesn't. The checking xfrm
* policy is placed in rawv6_rcv() because it is
* required for each socket.
*/
- filtered = mip6_mh_filter(sk, skb);
+ mh_filter_t *filter;
+
+ filter = rcu_dereference(mh_filter);
+ filtered = filter ? (*filter)(sk, skb) : 0;
break;
+ }
#endif
default:
filtered = 0;
@@ -201,21 +200,22 @@ int ipv6_raw_deliver(struct sk_buff *skb, int nexthdr)
struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC);
/* Not releasing hash table! */
- if (clone) {
- nf_reset(clone);
+ if (clone)
rawv6_rcv(sk, clone);
- }
}
- sk = __raw_v6_lookup(sk_next(sk), nexthdr, daddr, saddr,
- IP6CB(skb)->iif);
}
-out:
- read_unlock(&raw_v6_lock);
+ rcu_read_unlock();
return delivered;
}
+bool raw6_local_deliver(struct sk_buff *skb, int nexthdr)
+{
+ return ipv6_raw_deliver(skb, nexthdr);
+}
+
/* This cleans up af_inet6 a bit. -DaveM */
-static int rawv6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+static int rawv6_bind(struct sock *sk, struct sockaddr_unsized *uaddr,
+ int addr_len)
{
struct inet_sock *inet = inet_sk(sk);
struct ipv6_pinfo *np = inet6_sk(sk);
@@ -226,11 +226,15 @@ static int rawv6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
if (addr_len < SIN6_LEN_RFC2133)
return -EINVAL;
+
+ if (addr->sin6_family != AF_INET6)
+ return -EINVAL;
+
addr_type = ipv6_addr_type(&addr->sin6_addr);
/* Raw sockets are IPv6 only */
if (addr_type == IPV6_ADDR_MAPPED)
- return(-EADDRNOTAVAIL);
+ return -EADDRNOTAVAIL;
lock_sock(sk);
@@ -238,11 +242,12 @@ static int rawv6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
if (sk->sk_state != TCP_CLOSE)
goto out;
+ rcu_read_lock();
/* Check if the address belongs to the host. */
if (addr_type != IPV6_ADDR_ANY) {
struct net_device *dev = NULL;
- if (addr_type & IPV6_ADDR_LINKLOCAL) {
+ if (__ipv6_addr_needs_scope_id(addr_type)) {
if (addr_len >= sizeof(struct sockaddr_in6) &&
addr->sin6_scope_id) {
/* Override any existing binding, if another
@@ -250,49 +255,50 @@ static int rawv6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
*/
sk->sk_bound_dev_if = addr->sin6_scope_id;
}
-
+
/* Binding to link-local address requires an interface */
if (!sk->sk_bound_dev_if)
- goto out;
+ goto out_unlock;
+ }
- dev = dev_get_by_index(sk->sk_bound_dev_if);
- if (!dev) {
- err = -ENODEV;
- goto out;
- }
+ if (sk->sk_bound_dev_if) {
+ err = -ENODEV;
+ dev = dev_get_by_index_rcu(sock_net(sk),
+ sk->sk_bound_dev_if);
+ if (!dev)
+ goto out_unlock;
}
-
+
/* ipv4 addr of the socket is invalid. Only the
* unspecified and mapped address have a v4 equivalent.
*/
v4addr = LOOPBACK4_IPV6;
- if (!(addr_type & IPV6_ADDR_MULTICAST)) {
+ if (!(addr_type & IPV6_ADDR_MULTICAST) &&
+ !ipv6_can_nonlocal_bind(sock_net(sk), inet)) {
err = -EADDRNOTAVAIL;
- if (!ipv6_chk_addr(&addr->sin6_addr, dev, 0)) {
- if (dev)
- dev_put(dev);
- goto out;
+ if (!ipv6_chk_addr(sock_net(sk), &addr->sin6_addr,
+ dev, 0)) {
+ goto out_unlock;
}
}
- if (dev)
- dev_put(dev);
}
- inet->rcv_saddr = inet->saddr = v4addr;
- ipv6_addr_copy(&np->rcv_saddr, &addr->sin6_addr);
+ inet->inet_rcv_saddr = inet->inet_saddr = v4addr;
+ sk->sk_v6_rcv_saddr = addr->sin6_addr;
if (!(addr_type & IPV6_ADDR_MULTICAST))
- ipv6_addr_copy(&np->saddr, &addr->sin6_addr);
+ np->saddr = addr->sin6_addr;
err = 0;
+out_unlock:
+ rcu_read_unlock();
out:
release_sock(sk);
return err;
}
-void rawv6_err(struct sock *sk, struct sk_buff *skb,
- struct inet6_skb_parm *opt,
- int type, int code, int offset, __be32 info)
+static void rawv6_err(struct sock *sk, struct sk_buff *skb,
+ u8 type, u8 code, int offset, __be32 info)
{
- struct inet_sock *inet = inet_sk(sk);
+ bool recverr = inet6_test_bit(RECVERR6, sk);
struct ipv6_pinfo *np = inet6_sk(sk);
int err;
int harderr;
@@ -302,50 +308,80 @@ void rawv6_err(struct sock *sk, struct sk_buff *skb,
2. Socket is connected (otherwise the error indication
is useless without recverr and error is hard.
*/
- if (!np->recverr && sk->sk_state != TCP_ESTABLISHED)
+ if (!recverr && sk->sk_state != TCP_ESTABLISHED)
return;
harderr = icmpv6_err_convert(type, code, &err);
- if (type == ICMPV6_PKT_TOOBIG)
- harderr = (np->pmtudisc == IPV6_PMTUDISC_DO);
-
- if (np->recverr) {
+ if (type == ICMPV6_PKT_TOOBIG) {
+ ip6_sk_update_pmtu(skb, sk, info);
+ harderr = (READ_ONCE(np->pmtudisc) == IPV6_PMTUDISC_DO);
+ }
+ if (type == NDISC_REDIRECT) {
+ ip6_sk_redirect(skb, sk);
+ return;
+ }
+ if (recverr) {
u8 *payload = skb->data;
- if (!inet->hdrincl)
+ if (!inet_test_bit(HDRINCL, sk))
payload += offset;
ipv6_icmp_error(sk, skb, err, 0, ntohl(info), payload);
}
- if (np->recverr || harderr) {
+ if (recverr || harderr) {
sk->sk_err = err;
- sk->sk_error_report(sk);
+ sk_error_report(sk);
+ }
+}
+
+void raw6_icmp_error(struct sk_buff *skb, int nexthdr,
+ u8 type, u8 code, int inner_offset, __be32 info)
+{
+ struct net *net = dev_net(skb->dev);
+ struct hlist_head *hlist;
+ struct sock *sk;
+ int hash;
+
+ hash = raw_hashfunc(net, nexthdr);
+ hlist = &raw_v6_hashinfo.ht[hash];
+ rcu_read_lock();
+ sk_for_each_rcu(sk, hlist) {
+ /* Note: ipv6_hdr(skb) != skb->data */
+ const struct ipv6hdr *ip6h = (const struct ipv6hdr *)skb->data;
+
+ if (!raw_v6_match(net, sk, nexthdr, &ip6h->saddr, &ip6h->daddr,
+ inet6_iif(skb), inet6_iif(skb)))
+ continue;
+ rawv6_err(sk, skb, type, code, inner_offset, info);
}
+ rcu_read_unlock();
}
-static inline int rawv6_rcv_skb(struct sock * sk, struct sk_buff * skb)
+static inline int rawv6_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
- if ((raw6_sk(sk)->checksum || sk->sk_filter) &&
+ enum skb_drop_reason reason;
+
+ if ((raw6_sk(sk)->checksum || rcu_access_pointer(sk->sk_filter)) &&
skb_checksum_complete(skb)) {
- /* FIXME: increment a raw6 drops counter here */
- kfree_skb(skb);
- return 0;
+ sk_drops_inc(sk);
+ sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_SKB_CSUM);
+ return NET_RX_DROP;
}
/* Charge it to the socket. */
- if (sock_queue_rcv_skb(sk,skb)<0) {
- /* FIXME: increment a raw6 drops counter here */
- kfree_skb(skb);
- return 0;
+ skb_dst_drop(skb);
+ if (sock_queue_rcv_skb_reason(sk, skb, &reason) < 0) {
+ sk_skb_reason_drop(sk, skb, reason);
+ return NET_RX_DROP;
}
return 0;
}
/*
- * This is next to useless...
+ * This is next to useless...
* if we demultiplex in network layer we don't need the extra call
- * just to queue the skb...
- * maybe we could have the network decide upon a hint if it
+ * just to queue the skb...
+ * maybe we could have the network decide upon a hint if it
* should call raw_rcv for demultiplexing
*/
int rawv6_rcv(struct sock *sk, struct sk_buff *skb)
@@ -353,32 +389,35 @@ int rawv6_rcv(struct sock *sk, struct sk_buff *skb)
struct inet_sock *inet = inet_sk(sk);
struct raw6_sock *rp = raw6_sk(sk);
- if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) {
- kfree_skb(skb);
- return NET_RX_DROP;
- }
+ if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) {
+ sk_drops_inc(sk);
+ sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_XFRM_POLICY);
+ return NET_RX_DROP;
+ }
+ nf_reset_ct(skb);
if (!rp->checksum)
skb->ip_summed = CHECKSUM_UNNECESSARY;
if (skb->ip_summed == CHECKSUM_COMPLETE) {
- skb_postpull_rcsum(skb, skb->nh.raw,
- skb->h.raw - skb->nh.raw);
- if (!csum_ipv6_magic(&skb->nh.ipv6h->saddr,
- &skb->nh.ipv6h->daddr,
- skb->len, inet->num, skb->csum))
+ skb_postpull_rcsum(skb, skb_network_header(skb),
+ skb_network_header_len(skb));
+ if (!csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
+ &ipv6_hdr(skb)->daddr,
+ skb->len, inet->inet_num, skb->csum))
skb->ip_summed = CHECKSUM_UNNECESSARY;
}
- if (skb->ip_summed != CHECKSUM_UNNECESSARY)
- skb->csum = ~csum_unfold(csum_ipv6_magic(&skb->nh.ipv6h->saddr,
- &skb->nh.ipv6h->daddr,
- skb->len, inet->num, 0));
+ if (!skb_csum_unnecessary(skb))
+ skb->csum = ~csum_unfold(csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
+ &ipv6_hdr(skb)->daddr,
+ skb->len,
+ inet->inet_num, 0));
- if (inet->hdrincl) {
+ if (inet_test_bit(HDRINCL, sk)) {
if (skb_checksum_complete(skb)) {
- /* FIXME: increment a raw6 drops counter here */
- kfree_skb(skb);
- return 0;
+ sk_drops_inc(sk);
+ sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_SKB_CSUM);
+ return NET_RX_DROP;
}
}
@@ -392,43 +431,42 @@ int rawv6_rcv(struct sock *sk, struct sk_buff *skb)
* we return it, otherwise we block.
*/
-static int rawv6_recvmsg(struct kiocb *iocb, struct sock *sk,
- struct msghdr *msg, size_t len,
- int noblock, int flags, int *addr_len)
+static int rawv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+ int flags, int *addr_len)
{
struct ipv6_pinfo *np = inet6_sk(sk);
- struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)msg->msg_name;
+ DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name);
struct sk_buff *skb;
size_t copied;
int err;
if (flags & MSG_OOB)
return -EOPNOTSUPP;
-
- if (addr_len)
- *addr_len=sizeof(*sin6);
if (flags & MSG_ERRQUEUE)
- return ipv6_recv_error(sk, msg, len);
+ return ipv6_recv_error(sk, msg, len, addr_len);
- skb = skb_recv_datagram(sk, flags, noblock, &err);
+ if (np->rxopt.bits.rxpmtu && READ_ONCE(np->rxpmtu))
+ return ipv6_recv_rxpmtu(sk, msg, len, addr_len);
+
+ skb = skb_recv_datagram(sk, flags, &err);
if (!skb)
goto out;
copied = skb->len;
- if (copied > len) {
- copied = len;
- msg->msg_flags |= MSG_TRUNC;
- }
+ if (copied > len) {
+ copied = len;
+ msg->msg_flags |= MSG_TRUNC;
+ }
- if (skb->ip_summed==CHECKSUM_UNNECESSARY) {
- err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+ if (skb_csum_unnecessary(skb)) {
+ err = skb_copy_datagram_msg(skb, 0, msg, copied);
} else if (msg->msg_flags&MSG_TRUNC) {
if (__skb_checksum_complete(skb))
goto csum_copy_err;
- err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+ err = skb_copy_datagram_msg(skb, 0, msg, copied);
} else {
- err = skb_copy_and_csum_datagram_iovec(skb, 0, msg->msg_iov);
+ err = skb_copy_and_csum_datagram_msg(skb, 0, msg);
if (err == -EINVAL)
goto csum_copy_err;
}
@@ -439,17 +477,17 @@ static int rawv6_recvmsg(struct kiocb *iocb, struct sock *sk,
if (sin6) {
sin6->sin6_family = AF_INET6;
sin6->sin6_port = 0;
- ipv6_addr_copy(&sin6->sin6_addr, &skb->nh.ipv6h->saddr);
+ sin6->sin6_addr = ipv6_hdr(skb)->saddr;
sin6->sin6_flowinfo = 0;
- sin6->sin6_scope_id = 0;
- if (ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL)
- sin6->sin6_scope_id = IP6CB(skb)->iif;
+ sin6->sin6_scope_id = ipv6_iface_scope_id(&sin6->sin6_addr,
+ inet6_iif(skb));
+ *addr_len = sizeof(*sin6);
}
- sock_recv_timestamp(msg, sk, skb);
+ sock_recv_cmsgs(msg, sk, skb);
if (np->rxopt.all)
- datagram_recv_ctl(sk, msg, skb);
+ ip6_datagram_recv_ctl(sk, msg, skb);
err = copied;
if (flags & MSG_TRUNC)
@@ -467,13 +505,13 @@ csum_copy_err:
as some normal condition.
*/
err = (flags&MSG_DONTWAIT) ? -EAGAIN : -EHOSTUNREACH;
- /* FIXME: increment a raw6 drops counter here */
goto out;
}
-static int rawv6_push_pending_frames(struct sock *sk, struct flowi *fl,
+static int rawv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6,
struct raw6_sock *rp)
{
+ struct ipv6_txoptions *opt;
struct sk_buff *skb;
int err = 0;
int offset;
@@ -485,11 +523,15 @@ static int rawv6_push_pending_frames(struct sock *sk, struct flowi *fl,
if (!rp->checksum)
goto send;
- if ((skb = skb_peek(&sk->sk_write_queue)) == NULL)
+ skb = skb_peek(&sk->sk_write_queue);
+ if (!skb)
goto out;
offset = rp->offset;
- total_len = inet_sk(sk)->cork.length - (skb->nh.raw - skb->data);
+ total_len = inet_sk(sk)->cork.base.length;
+ opt = inet6_sk(sk)->cork.opt;
+ total_len -= opt ? opt->opt_flen : 0;
+
if (offset >= total_len - 1) {
err = -EINVAL;
ip6_flush_pending_frames(sk);
@@ -512,7 +554,7 @@ static int rawv6_push_pending_frames(struct sock *sk, struct flowi *fl,
if (csum_skb)
continue;
- len = skb->len - (skb->h.raw - skb->data);
+ len = skb->len - skb_transport_offset(skb);
if (offset >= len) {
offset -= len;
continue;
@@ -524,23 +566,24 @@ static int rawv6_push_pending_frames(struct sock *sk, struct flowi *fl,
skb = csum_skb;
}
- offset += skb->h.raw - skb->data;
- if (skb_copy_bits(skb, offset, &csum, 2))
- BUG();
+ offset += skb_transport_offset(skb);
+ err = skb_copy_bits(skb, offset, &csum, 2);
+ if (err < 0) {
+ ip6_flush_pending_frames(sk);
+ goto out;
+ }
/* in case cksum was not initialized */
if (unlikely(csum))
tmp_csum = csum_sub(tmp_csum, csum_unfold(csum));
- csum = csum_ipv6_magic(&fl->fl6_src,
- &fl->fl6_dst,
- total_len, fl->proto, tmp_csum);
+ csum = csum_ipv6_magic(&fl6->saddr, &fl6->daddr,
+ total_len, fl6->flowi6_proto, tmp_csum);
- if (csum == 0 && fl->proto == IPPROTO_UDP)
+ if (csum == 0 && fl6->flowi6_proto == IPPROTO_UDP)
csum = CSUM_MANGLED_0;
- if (skb_store_bits(skb, offset, &csum, 2))
- BUG();
+ BUG_ON(skb_store_bits(skb, offset, &csum, 2));
send:
err = ip6_push_pending_frames(sk);
@@ -548,186 +591,221 @@ out:
return err;
}
-static int rawv6_send_hdrinc(struct sock *sk, void *from, int length,
- struct flowi *fl, struct rt6_info *rt,
- unsigned int flags)
+static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length,
+ struct flowi6 *fl6, struct dst_entry **dstp,
+ unsigned int flags, const struct sockcm_cookie *sockc)
{
- struct ipv6_pinfo *np = inet6_sk(sk);
+ struct net *net = sock_net(sk);
struct ipv6hdr *iph;
struct sk_buff *skb;
- unsigned int hh_len;
int err;
+ struct rt6_info *rt = dst_rt6_info(*dstp);
+ int hlen = LL_RESERVED_SPACE(rt->dst.dev);
+ int tlen = rt->dst.dev->needed_tailroom;
- if (length > rt->u.dst.dev->mtu) {
- ipv6_local_error(sk, EMSGSIZE, fl, rt->u.dst.dev->mtu);
+ if (length > rt->dst.dev->mtu) {
+ ipv6_local_error(sk, EMSGSIZE, fl6, rt->dst.dev->mtu);
return -EMSGSIZE;
}
+ if (length < sizeof(struct ipv6hdr))
+ return -EINVAL;
if (flags&MSG_PROBE)
goto out;
- hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
-
- skb = sock_alloc_send_skb(sk, length+hh_len+15,
- flags&MSG_DONTWAIT, &err);
- if (skb == NULL)
- goto error;
- skb_reserve(skb, hh_len);
+ skb = sock_alloc_send_skb(sk,
+ length + hlen + tlen + 15,
+ flags & MSG_DONTWAIT, &err);
+ if (!skb)
+ goto error;
+ skb_reserve(skb, hlen);
- skb->priority = sk->sk_priority;
- skb->dst = dst_clone(&rt->u.dst);
+ skb->protocol = htons(ETH_P_IPV6);
+ skb->priority = sockc->priority;
+ skb->mark = sockc->mark;
+ skb_set_delivery_type_by_clockid(skb, sockc->transmit_time, sk->sk_clockid);
- skb->nh.ipv6h = iph = (struct ipv6hdr *)skb_put(skb, length);
+ skb_put(skb, length);
+ skb_reset_network_header(skb);
+ iph = ipv6_hdr(skb);
skb->ip_summed = CHECKSUM_NONE;
- skb->h.raw = skb->nh.raw;
- err = memcpy_fromiovecend((void *)iph, from, 0, length);
- if (err)
- goto error_fault;
+ skb_setup_tx_timestamp(skb, sockc);
- IP6_INC_STATS(rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
- err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
- dst_output);
- if (err > 0)
- err = np->recverr ? net_xmit_errno(err) : 0;
- if (err)
+ if (flags & MSG_CONFIRM)
+ skb_set_dst_pending_confirm(skb, 1);
+
+ skb->transport_header = skb->network_header;
+ err = memcpy_from_msg(iph, msg, length);
+ if (err) {
+ err = -EFAULT;
+ kfree_skb(skb);
goto error;
+ }
+
+ skb_dst_set(skb, &rt->dst);
+ *dstp = NULL;
+
+ /* if egress device is enslaved to an L3 master device pass the
+ * skb to its handler for processing
+ */
+ skb = l3mdev_ip6_out(sk, skb);
+ if (unlikely(!skb))
+ return 0;
+
+ /* Acquire rcu_read_lock() in case we need to use rt->rt6i_idev
+ * in the error path. Since skb has been freed, the dst could
+ * have been queued for deletion.
+ */
+ rcu_read_lock();
+ IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
+ err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb,
+ NULL, rt->dst.dev, dst_output);
+ if (err > 0)
+ err = net_xmit_errno(err);
+ if (err) {
+ IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
+ rcu_read_unlock();
+ goto error_check;
+ }
+ rcu_read_unlock();
out:
return 0;
-error_fault:
- err = -EFAULT;
- kfree_skb(skb);
error:
- IP6_INC_STATS(rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
- return err;
+ IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
+error_check:
+ if (err == -ENOBUFS && !inet6_test_bit(RECVERR6, sk))
+ err = 0;
+ return err;
}
-static int rawv6_probe_proto_opt(struct flowi *fl, struct msghdr *msg)
-{
- struct iovec *iov;
- u8 __user *type = NULL;
- u8 __user *code = NULL;
-#ifdef CONFIG_IPV6_MIP6
- u8 len = 0;
-#endif
- int probed = 0;
- int i;
+struct raw6_frag_vec {
+ struct msghdr *msg;
+ int hlen;
+ char c[4];
+};
- if (!msg->msg_iov)
- return 0;
+static int rawv6_probe_proto_opt(struct raw6_frag_vec *rfv, struct flowi6 *fl6)
+{
+ int err = 0;
+ switch (fl6->flowi6_proto) {
+ case IPPROTO_ICMPV6:
+ rfv->hlen = 2;
+ err = memcpy_from_msg(rfv->c, rfv->msg, rfv->hlen);
+ if (!err) {
+ fl6->fl6_icmp_type = rfv->c[0];
+ fl6->fl6_icmp_code = rfv->c[1];
+ }
+ break;
+ case IPPROTO_MH:
+ rfv->hlen = 4;
+ err = memcpy_from_msg(rfv->c, rfv->msg, rfv->hlen);
+ if (!err)
+ fl6->fl6_mh_type = rfv->c[2];
+ }
+ return err;
+}
- for (i = 0; i < msg->msg_iovlen; i++) {
- iov = &msg->msg_iov[i];
- if (!iov)
- continue;
+static int raw6_getfrag(void *from, char *to, int offset, int len, int odd,
+ struct sk_buff *skb)
+{
+ struct raw6_frag_vec *rfv = from;
- switch (fl->proto) {
- case IPPROTO_ICMPV6:
- /* check if one-byte field is readable or not. */
- if (iov->iov_base && iov->iov_len < 1)
- break;
-
- if (!type) {
- type = iov->iov_base;
- /* check if code field is readable or not. */
- if (iov->iov_len > 1)
- code = type + 1;
- } else if (!code)
- code = iov->iov_base;
-
- if (type && code) {
- if (get_user(fl->fl_icmp_type, type) ||
- get_user(fl->fl_icmp_code, code))
- return -EFAULT;
- probed = 1;
- }
- break;
-#ifdef CONFIG_IPV6_MIP6
- case IPPROTO_MH:
- if (iov->iov_base && iov->iov_len < 1)
- break;
- /* check if type field is readable or not. */
- if (iov->iov_len > 2 - len) {
- u8 __user *p = iov->iov_base;
- if (get_user(fl->fl_mh_type, &p[2 - len]))
- return -EFAULT;
- probed = 1;
- } else
- len += iov->iov_len;
+ if (offset < rfv->hlen) {
+ int copy = min(rfv->hlen - offset, len);
- break;
-#endif
- default:
- probed = 1;
- break;
- }
- if (probed)
- break;
+ if (skb->ip_summed == CHECKSUM_PARTIAL)
+ memcpy(to, rfv->c + offset, copy);
+ else
+ skb->csum = csum_block_add(
+ skb->csum,
+ csum_partial_copy_nocheck(rfv->c + offset,
+ to, copy),
+ odd);
+
+ odd = 0;
+ offset += copy;
+ to += copy;
+ len -= copy;
+
+ if (!len)
+ return 0;
}
- return 0;
+
+ offset -= rfv->hlen;
+
+ return ip_generic_getfrag(rfv->msg, to, offset, len, odd, skb);
}
-static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk,
- struct msghdr *msg, size_t len)
+static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
+ struct ipv6_txoptions *opt_to_free = NULL;
struct ipv6_txoptions opt_space;
- struct sockaddr_in6 * sin6 = (struct sockaddr_in6 *) msg->msg_name;
- struct in6_addr *daddr, *final_p = NULL, final;
+ DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name);
+ struct in6_addr *daddr, *final_p, final;
struct inet_sock *inet = inet_sk(sk);
struct ipv6_pinfo *np = inet6_sk(sk);
struct raw6_sock *rp = raw6_sk(sk);
struct ipv6_txoptions *opt = NULL;
struct ip6_flowlabel *flowlabel = NULL;
struct dst_entry *dst = NULL;
- struct flowi fl;
+ struct raw6_frag_vec rfv;
+ struct flowi6 fl6;
+ struct ipcm6_cookie ipc6;
int addr_len = msg->msg_namelen;
- int hlimit = -1;
- int tclass = -1;
+ int hdrincl;
u16 proto;
int err;
/* Rough check on arithmetic overflow,
- better check is made in ip6_build_xmit
+ better check is made in ip6_append_data().
*/
- if (len < 0)
+ if (len > INT_MAX)
return -EMSGSIZE;
/* Mirror BSD error message compatibility */
- if (msg->msg_flags & MSG_OOB)
+ if (msg->msg_flags & MSG_OOB)
return -EOPNOTSUPP;
+ hdrincl = inet_test_bit(HDRINCL, sk);
+
+ ipcm6_init_sk(&ipc6, sk);
+
/*
- * Get and verify the address.
+ * Get and verify the address.
*/
- memset(&fl, 0, sizeof(fl));
+ memset(&fl6, 0, sizeof(fl6));
+
+ fl6.flowi6_mark = ipc6.sockc.mark;
+ fl6.flowi6_uid = sk_uid(sk);
if (sin6) {
- if (addr_len < SIN6_LEN_RFC2133)
+ if (addr_len < SIN6_LEN_RFC2133)
return -EINVAL;
- if (sin6->sin6_family && sin6->sin6_family != AF_INET6)
- return(-EAFNOSUPPORT);
+ if (sin6->sin6_family && sin6->sin6_family != AF_INET6)
+ return -EAFNOSUPPORT;
/* port is the proto value [0..255] carried in nexthdr */
proto = ntohs(sin6->sin6_port);
if (!proto)
- proto = inet->num;
- else if (proto != inet->num)
- return(-EINVAL);
+ proto = inet->inet_num;
+ else if (proto != inet->inet_num &&
+ inet->inet_num != IPPROTO_RAW)
+ return -EINVAL;
if (proto > 255)
- return(-EINVAL);
+ return -EINVAL;
daddr = &sin6->sin6_addr;
- if (np->sndflow) {
- fl.fl6_flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK;
- if (fl.fl6_flowlabel&IPV6_FLOWLABEL_MASK) {
- flowlabel = fl6_sock_lookup(sk, fl.fl6_flowlabel);
- if (flowlabel == NULL)
+ if (inet6_test_bit(SNDFLOW, sk)) {
+ fl6.flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK;
+ if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) {
+ flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
+ if (IS_ERR(flowlabel))
return -EINVAL;
- daddr = &flowlabel->dst;
}
}
@@ -736,154 +814,144 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk,
* sk->sk_dst_cache.
*/
if (sk->sk_state == TCP_ESTABLISHED &&
- ipv6_addr_equal(daddr, &np->daddr))
- daddr = &np->daddr;
+ ipv6_addr_equal(daddr, &sk->sk_v6_daddr))
+ daddr = &sk->sk_v6_daddr;
if (addr_len >= sizeof(struct sockaddr_in6) &&
sin6->sin6_scope_id &&
- ipv6_addr_type(daddr)&IPV6_ADDR_LINKLOCAL)
- fl.oif = sin6->sin6_scope_id;
+ __ipv6_addr_needs_scope_id(__ipv6_addr_type(daddr)))
+ fl6.flowi6_oif = sin6->sin6_scope_id;
} else {
- if (sk->sk_state != TCP_ESTABLISHED)
+ if (sk->sk_state != TCP_ESTABLISHED)
return -EDESTADDRREQ;
-
- proto = inet->num;
- daddr = &np->daddr;
- fl.fl6_flowlabel = np->flow_label;
- }
- if (ipv6_addr_any(daddr)) {
- /*
- * unspecified destination address
- * treated as error... is this correct ?
- */
- fl6_sock_release(flowlabel);
- return(-EINVAL);
+ proto = inet->inet_num;
+ daddr = &sk->sk_v6_daddr;
+ fl6.flowlabel = np->flow_label;
}
- if (fl.oif == 0)
- fl.oif = sk->sk_bound_dev_if;
+ if (fl6.flowi6_oif == 0)
+ fl6.flowi6_oif = sk->sk_bound_dev_if;
if (msg->msg_controllen) {
opt = &opt_space;
memset(opt, 0, sizeof(struct ipv6_txoptions));
opt->tot_len = sizeof(struct ipv6_txoptions);
+ ipc6.opt = opt;
- err = datagram_send_ctl(msg, &fl, opt, &hlimit, &tclass);
+ err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, &ipc6);
if (err < 0) {
fl6_sock_release(flowlabel);
return err;
}
- if ((fl.fl6_flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) {
- flowlabel = fl6_sock_lookup(sk, fl.fl6_flowlabel);
- if (flowlabel == NULL)
+ if ((fl6.flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) {
+ flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
+ if (IS_ERR(flowlabel))
return -EINVAL;
}
if (!(opt->opt_nflen|opt->opt_flen))
opt = NULL;
}
- if (opt == NULL)
- opt = np->opt;
+ if (!opt) {
+ opt = txopt_get(np);
+ opt_to_free = opt;
+ }
if (flowlabel)
opt = fl6_merge_options(&opt_space, flowlabel, opt);
opt = ipv6_fixup_options(&opt_space, opt);
- fl.proto = proto;
- err = rawv6_probe_proto_opt(&fl, msg);
- if (err)
- goto out;
-
- ipv6_addr_copy(&fl.fl6_dst, daddr);
- if (ipv6_addr_any(&fl.fl6_src) && !ipv6_addr_any(&np->saddr))
- ipv6_addr_copy(&fl.fl6_src, &np->saddr);
-
- /* merge ip6_build_xmit from ip6_output */
- if (opt && opt->srcrt) {
- struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt;
- ipv6_addr_copy(&final, &fl.fl6_dst);
- ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
- final_p = &final;
+ fl6.flowi6_proto = proto;
+ fl6.flowi6_mark = ipc6.sockc.mark;
+
+ if (!hdrincl) {
+ rfv.msg = msg;
+ rfv.hlen = 0;
+ err = rawv6_probe_proto_opt(&rfv, &fl6);
+ if (err)
+ goto out;
}
- if (!fl.oif && ipv6_addr_is_multicast(&fl.fl6_dst))
- fl.oif = np->mcast_oif;
- security_sk_classify_flow(sk, &fl);
+ if (!ipv6_addr_any(daddr))
+ fl6.daddr = *daddr;
+ else
+ fl6.daddr.s6_addr[15] = 0x1; /* :: means loopback (BSD'ism) */
+ if (ipv6_addr_any(&fl6.saddr) && !ipv6_addr_any(&np->saddr))
+ fl6.saddr = np->saddr;
- err = ip6_dst_lookup(sk, &dst, &fl);
- if (err)
- goto out;
- if (final_p)
- ipv6_addr_copy(&fl.fl6_dst, final_p);
+ final_p = fl6_update_dst(&fl6, opt, &final);
- if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0)
- goto out;
+ if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr))
+ fl6.flowi6_oif = READ_ONCE(np->mcast_oif);
+ else if (!fl6.flowi6_oif)
+ fl6.flowi6_oif = READ_ONCE(np->ucast_oif);
+ security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6));
- if (hlimit < 0) {
- if (ipv6_addr_is_multicast(&fl.fl6_dst))
- hlimit = np->mcast_hops;
- else
- hlimit = np->hop_limit;
- if (hlimit < 0)
- hlimit = dst_metric(dst, RTAX_HOPLIMIT);
- if (hlimit < 0)
- hlimit = ipv6_get_hoplimit(dst->dev);
- }
+ if (hdrincl)
+ fl6.flowi6_flags |= FLOWI_FLAG_KNOWN_NH;
- if (tclass < 0) {
- tclass = np->tclass;
- if (tclass < 0)
- tclass = 0;
+ fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel);
+
+ dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p);
+ if (IS_ERR(dst)) {
+ err = PTR_ERR(dst);
+ goto out;
}
+ if (ipc6.hlimit < 0)
+ ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst);
if (msg->msg_flags&MSG_CONFIRM)
goto do_confirm;
back_from_confirm:
- if (inet->hdrincl) {
- err = rawv6_send_hdrinc(sk, msg->msg_iov, len, &fl, (struct rt6_info*)dst, msg->msg_flags);
- } else {
+ if (hdrincl)
+ err = rawv6_send_hdrinc(sk, msg, len, &fl6, &dst,
+ msg->msg_flags, &ipc6.sockc);
+ else {
+ ipc6.opt = opt;
lock_sock(sk);
- err = ip6_append_data(sk, ip_generic_getfrag, msg->msg_iov,
- len, 0, hlimit, tclass, opt, &fl, (struct rt6_info*)dst,
+ err = ip6_append_data(sk, raw6_getfrag, &rfv,
+ len, 0, &ipc6, &fl6, dst_rt6_info(dst),
msg->msg_flags);
if (err)
ip6_flush_pending_frames(sk);
else if (!(msg->msg_flags & MSG_MORE))
- err = rawv6_push_pending_frames(sk, &fl, rp);
+ err = rawv6_push_pending_frames(sk, &fl6, rp);
+ release_sock(sk);
}
done:
dst_release(dst);
- release_sock(sk);
-out:
+out:
fl6_sock_release(flowlabel);
- return err<0?err:len;
+ txopt_put(opt_to_free);
+ return err < 0 ? err : len;
do_confirm:
- dst_confirm(dst);
+ if (msg->msg_flags & MSG_PROBE)
+ dst_confirm_neigh(dst, &fl6.daddr);
if (!(msg->msg_flags & MSG_PROBE) || len)
goto back_from_confirm;
err = 0;
goto done;
}
-static int rawv6_seticmpfilter(struct sock *sk, int level, int optname,
- char __user *optval, int optlen)
+static int rawv6_seticmpfilter(struct sock *sk, int optname,
+ sockptr_t optval, int optlen)
{
switch (optname) {
case ICMPV6_FILTER:
if (optlen > sizeof(struct icmp6_filter))
optlen = sizeof(struct icmp6_filter);
- if (copy_from_user(&raw6_sk(sk)->filter, optval, optlen))
+ if (copy_from_sockptr(&raw6_sk(sk)->filter, optval, optlen))
return -EFAULT;
return 0;
default:
return -ENOPROTOOPT;
- };
+ }
return 0;
}
-static int rawv6_geticmpfilter(struct sock *sk, int level, int optname,
+static int rawv6_geticmpfilter(struct sock *sk, int optname,
char __user *optval, int __user *optlen)
{
int len;
@@ -903,85 +971,84 @@ static int rawv6_geticmpfilter(struct sock *sk, int level, int optname,
return 0;
default:
return -ENOPROTOOPT;
- };
+ }
return 0;
}
static int do_rawv6_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int optlen)
+ sockptr_t optval, unsigned int optlen)
{
struct raw6_sock *rp = raw6_sk(sk);
int val;
- if (get_user(val, (int __user *)optval))
+ if (optlen < sizeof(val))
+ return -EINVAL;
+
+ if (copy_from_sockptr(&val, optval, sizeof(val)))
return -EFAULT;
switch (optname) {
- case IPV6_CHECKSUM:
- /* You may get strange result with a positive odd offset;
- RFC2292bis agrees with me. */
- if (val > 0 && (val&1))
- return(-EINVAL);
- if (val < 0) {
- rp->checksum = 0;
- } else {
- rp->checksum = 1;
- rp->offset = val;
- }
+ case IPV6_HDRINCL:
+ if (sk->sk_type != SOCK_RAW)
+ return -EINVAL;
+ inet_assign_bit(HDRINCL, sk, val);
+ return 0;
+ case IPV6_CHECKSUM:
+ if (inet_sk(sk)->inet_num == IPPROTO_ICMPV6 &&
+ level == IPPROTO_IPV6) {
+ /*
+ * RFC3542 tells that IPV6_CHECKSUM socket
+ * option in the IPPROTO_IPV6 level is not
+ * allowed on ICMPv6 sockets.
+ * If you want to set it, use IPPROTO_RAW
+ * level IPV6_CHECKSUM socket option
+ * (Linux extension).
+ */
+ return -EINVAL;
+ }
- return 0;
- break;
+ /* You may get strange result with a positive odd offset;
+ RFC2292bis agrees with me. */
+ if (val > 0 && (val&1))
+ return -EINVAL;
+ if (val < 0) {
+ rp->checksum = 0;
+ } else {
+ rp->checksum = 1;
+ rp->offset = val;
+ }
- default:
- return(-ENOPROTOOPT);
+ return 0;
+
+ default:
+ return -ENOPROTOOPT;
}
}
static int rawv6_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int optlen)
-{
- switch(level) {
- case SOL_RAW:
- break;
-
- case SOL_ICMPV6:
- if (inet_sk(sk)->num != IPPROTO_ICMPV6)
- return -EOPNOTSUPP;
- return rawv6_seticmpfilter(sk, level, optname, optval,
- optlen);
- case SOL_IPV6:
- if (optname == IPV6_CHECKSUM)
- break;
- default:
- return ipv6_setsockopt(sk, level, optname, optval,
- optlen);
- };
- return do_rawv6_setsockopt(sk, level, optname, optval, optlen);
-}
-
-#ifdef CONFIG_COMPAT
-static int compat_rawv6_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int optlen)
+ sockptr_t optval, unsigned int optlen)
{
switch (level) {
case SOL_RAW:
break;
+
case SOL_ICMPV6:
- if (inet_sk(sk)->num != IPPROTO_ICMPV6)
+ if (inet_sk(sk)->inet_num != IPPROTO_ICMPV6)
return -EOPNOTSUPP;
- return rawv6_seticmpfilter(sk, level, optname, optval, optlen);
+ return rawv6_seticmpfilter(sk, optname, optval, optlen);
case SOL_IPV6:
- if (optname == IPV6_CHECKSUM)
+ if (optname == IPV6_CHECKSUM ||
+ optname == IPV6_HDRINCL)
break;
+ fallthrough;
default:
- return compat_ipv6_setsockopt(sk, level, optname,
- optval, optlen);
- };
+ return ipv6_setsockopt(sk, level, optname, optval, optlen);
+ }
+
return do_rawv6_setsockopt(sk, level, optname, optval, optlen);
}
-#endif
static int do_rawv6_getsockopt(struct sock *sk, int level, int optname,
char __user *optval, int __user *optlen)
@@ -989,11 +1056,19 @@ static int do_rawv6_getsockopt(struct sock *sk, int level, int optname,
struct raw6_sock *rp = raw6_sk(sk);
int val, len;
- if (get_user(len,optlen))
+ if (get_user(len, optlen))
return -EFAULT;
switch (optname) {
+ case IPV6_HDRINCL:
+ val = inet_test_bit(HDRINCL, sk);
+ break;
case IPV6_CHECKSUM:
+ /*
+ * We allow getsockopt() for IPPROTO_IPV6-level
+ * IPV6_CHECKSUM socket option on ICMPv6 sockets
+ * since RFC3542 is silent about it.
+ */
if (rp->checksum == 0)
val = -1;
else
@@ -1008,7 +1083,7 @@ static int do_rawv6_getsockopt(struct sock *sk, int level, int optname,
if (put_user(len, optlen))
return -EFAULT;
- if (copy_to_user(optval,&val,len))
+ if (copy_to_user(optval, &val, len))
return -EFAULT;
return 0;
}
@@ -1016,270 +1091,225 @@ static int do_rawv6_getsockopt(struct sock *sk, int level, int optname,
static int rawv6_getsockopt(struct sock *sk, int level, int optname,
char __user *optval, int __user *optlen)
{
- switch(level) {
- case SOL_RAW:
- break;
-
- case SOL_ICMPV6:
- if (inet_sk(sk)->num != IPPROTO_ICMPV6)
- return -EOPNOTSUPP;
- return rawv6_geticmpfilter(sk, level, optname, optval,
- optlen);
- case SOL_IPV6:
- if (optname == IPV6_CHECKSUM)
- break;
- default:
- return ipv6_getsockopt(sk, level, optname, optval,
- optlen);
- };
- return do_rawv6_getsockopt(sk, level, optname, optval, optlen);
-}
-
-#ifdef CONFIG_COMPAT
-static int compat_rawv6_getsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int __user *optlen)
-{
switch (level) {
case SOL_RAW:
break;
+
case SOL_ICMPV6:
- if (inet_sk(sk)->num != IPPROTO_ICMPV6)
+ if (inet_sk(sk)->inet_num != IPPROTO_ICMPV6)
return -EOPNOTSUPP;
- return rawv6_geticmpfilter(sk, level, optname, optval, optlen);
+ return rawv6_geticmpfilter(sk, optname, optval, optlen);
case SOL_IPV6:
- if (optname == IPV6_CHECKSUM)
+ if (optname == IPV6_CHECKSUM ||
+ optname == IPV6_HDRINCL)
break;
+ fallthrough;
default:
- return compat_ipv6_getsockopt(sk, level, optname,
- optval, optlen);
- };
+ return ipv6_getsockopt(sk, level, optname, optval, optlen);
+ }
+
return do_rawv6_getsockopt(sk, level, optname, optval, optlen);
}
-#endif
-static int rawv6_ioctl(struct sock *sk, int cmd, unsigned long arg)
+static int rawv6_ioctl(struct sock *sk, int cmd, int *karg)
{
- switch(cmd) {
- case SIOCOUTQ:
- {
- int amount = atomic_read(&sk->sk_wmem_alloc);
- return put_user(amount, (int __user *)arg);
- }
- case SIOCINQ:
- {
- struct sk_buff *skb;
- int amount = 0;
-
- spin_lock_bh(&sk->sk_receive_queue.lock);
- skb = skb_peek(&sk->sk_receive_queue);
- if (skb != NULL)
- amount = skb->tail - skb->h.raw;
- spin_unlock_bh(&sk->sk_receive_queue.lock);
- return put_user(amount, (int __user *)arg);
- }
+ switch (cmd) {
+ case SIOCOUTQ: {
+ *karg = sk_wmem_alloc_get(sk);
+ return 0;
+ }
+ case SIOCINQ: {
+ struct sk_buff *skb;
- default:
- return -ENOIOCTLCMD;
+ spin_lock_bh(&sk->sk_receive_queue.lock);
+ skb = skb_peek(&sk->sk_receive_queue);
+ if (skb)
+ *karg = skb->len;
+ else
+ *karg = 0;
+ spin_unlock_bh(&sk->sk_receive_queue.lock);
+ return 0;
+ }
+
+ default:
+#ifdef CONFIG_IPV6_MROUTE
+ return ip6mr_ioctl(sk, cmd, karg);
+#else
+ return -ENOIOCTLCMD;
+#endif
}
}
-static void rawv6_close(struct sock *sk, long timeout)
+#ifdef CONFIG_COMPAT
+static int compat_rawv6_ioctl(struct sock *sk, unsigned int cmd, unsigned long arg)
{
- if (inet_sk(sk)->num == IPPROTO_RAW)
- ip6_ra_control(sk, -1, NULL);
+ switch (cmd) {
+ case SIOCOUTQ:
+ case SIOCINQ:
+ return -ENOIOCTLCMD;
+ default:
+#ifdef CONFIG_IPV6_MROUTE
+ return ip6mr_compat_ioctl(sk, cmd, compat_ptr(arg));
+#else
+ return -ENOIOCTLCMD;
+#endif
+ }
+}
+#endif
+static void rawv6_close(struct sock *sk, long timeout)
+{
+ if (inet_sk(sk)->inet_num == IPPROTO_RAW)
+ ip6_ra_control(sk, -1);
+ ip6mr_sk_done(sk);
sk_common_release(sk);
}
+static void raw6_destroy(struct sock *sk)
+{
+ lock_sock(sk);
+ ip6_flush_pending_frames(sk);
+ release_sock(sk);
+}
+
static int rawv6_init_sk(struct sock *sk)
{
- if (inet_sk(sk)->num == IPPROTO_ICMPV6) {
- struct raw6_sock *rp = raw6_sk(sk);
+ struct raw6_sock *rp = raw6_sk(sk);
+
+ sk->sk_drop_counters = &rp->drop_counters;
+ switch (inet_sk(sk)->inet_num) {
+ case IPPROTO_ICMPV6:
rp->checksum = 1;
rp->offset = 2;
+ break;
+ case IPPROTO_MH:
+ rp->checksum = 1;
+ rp->offset = 4;
+ break;
+ default:
+ break;
}
- return(0);
+ return 0;
}
struct proto rawv6_prot = {
.name = "RAWv6",
.owner = THIS_MODULE,
.close = rawv6_close,
- .connect = ip6_datagram_connect,
- .disconnect = udp_disconnect,
+ .destroy = raw6_destroy,
+ .connect = ip6_datagram_connect_v6_only,
+ .disconnect = __udp_disconnect,
.ioctl = rawv6_ioctl,
.init = rawv6_init_sk,
- .destroy = inet6_destroy_sock,
.setsockopt = rawv6_setsockopt,
.getsockopt = rawv6_getsockopt,
.sendmsg = rawv6_sendmsg,
.recvmsg = rawv6_recvmsg,
.bind = rawv6_bind,
.backlog_rcv = rawv6_rcv_skb,
- .hash = raw_v6_hash,
- .unhash = raw_v6_unhash,
+ .hash = raw_hash_sk,
+ .unhash = raw_unhash_sk,
.obj_size = sizeof(struct raw6_sock),
+ .ipv6_pinfo_offset = offsetof(struct raw6_sock, inet6),
+ .useroffset = offsetof(struct raw6_sock, filter),
+ .usersize = sizeof_field(struct raw6_sock, filter),
+ .h.raw_hash = &raw_v6_hashinfo,
#ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_rawv6_setsockopt,
- .compat_getsockopt = compat_rawv6_getsockopt,
+ .compat_ioctl = compat_rawv6_ioctl,
#endif
+ .diag_destroy = raw_abort,
};
#ifdef CONFIG_PROC_FS
-struct raw6_iter_state {
- int bucket;
-};
-
-#define raw6_seq_private(seq) ((struct raw6_iter_state *)(seq)->private)
-
-static struct sock *raw6_get_first(struct seq_file *seq)
-{
- struct sock *sk;
- struct hlist_node *node;
- struct raw6_iter_state* state = raw6_seq_private(seq);
-
- for (state->bucket = 0; state->bucket < RAWV6_HTABLE_SIZE; ++state->bucket)
- sk_for_each(sk, node, &raw_v6_htable[state->bucket])
- if (sk->sk_family == PF_INET6)
- goto out;
- sk = NULL;
-out:
- return sk;
-}
-
-static struct sock *raw6_get_next(struct seq_file *seq, struct sock *sk)
+static int raw6_seq_show(struct seq_file *seq, void *v)
{
- struct raw6_iter_state* state = raw6_seq_private(seq);
-
- do {
- sk = sk_next(sk);
-try_again:
- ;
- } while (sk && sk->sk_family != PF_INET6);
-
- if (!sk && ++state->bucket < RAWV6_HTABLE_SIZE) {
- sk = sk_head(&raw_v6_htable[state->bucket]);
- goto try_again;
+ if (v == SEQ_START_TOKEN) {
+ seq_puts(seq, IPV6_SEQ_DGRAM_HEADER);
+ } else {
+ struct sock *sp = v;
+ __u16 srcp = inet_sk(sp)->inet_num;
+ ip6_dgram_sock_seq_show(seq, v, srcp, 0,
+ raw_seq_private(seq)->bucket);
}
- return sk;
-}
-
-static struct sock *raw6_get_idx(struct seq_file *seq, loff_t pos)
-{
- struct sock *sk = raw6_get_first(seq);
- if (sk)
- while (pos && (sk = raw6_get_next(seq, sk)) != NULL)
- --pos;
- return pos ? NULL : sk;
+ return 0;
}
-static void *raw6_seq_start(struct seq_file *seq, loff_t *pos)
-{
- read_lock(&raw_v6_lock);
- return *pos ? raw6_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
-}
+static const struct seq_operations raw6_seq_ops = {
+ .start = raw_seq_start,
+ .next = raw_seq_next,
+ .stop = raw_seq_stop,
+ .show = raw6_seq_show,
+};
-static void *raw6_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+static int __net_init raw6_init_net(struct net *net)
{
- struct sock *sk;
+ if (!proc_create_net_data("raw6", 0444, net->proc_net, &raw6_seq_ops,
+ sizeof(struct raw_iter_state), &raw_v6_hashinfo))
+ return -ENOMEM;
- if (v == SEQ_START_TOKEN)
- sk = raw6_get_first(seq);
- else
- sk = raw6_get_next(seq, v);
- ++*pos;
- return sk;
+ return 0;
}
-static void raw6_seq_stop(struct seq_file *seq, void *v)
+static void __net_exit raw6_exit_net(struct net *net)
{
- read_unlock(&raw_v6_lock);
+ remove_proc_entry("raw6", net->proc_net);
}
-static void raw6_sock_seq_show(struct seq_file *seq, struct sock *sp, int i)
+static struct pernet_operations raw6_net_ops = {
+ .init = raw6_init_net,
+ .exit = raw6_exit_net,
+};
+
+int __init raw6_proc_init(void)
{
- struct ipv6_pinfo *np = inet6_sk(sp);
- struct in6_addr *dest, *src;
- __u16 destp, srcp;
-
- dest = &np->daddr;
- src = &np->rcv_saddr;
- destp = 0;
- srcp = inet_sk(sp)->num;
- seq_printf(seq,
- "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X "
- "%02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p\n",
- i,
- src->s6_addr32[0], src->s6_addr32[1],
- src->s6_addr32[2], src->s6_addr32[3], srcp,
- dest->s6_addr32[0], dest->s6_addr32[1],
- dest->s6_addr32[2], dest->s6_addr32[3], destp,
- sp->sk_state,
- atomic_read(&sp->sk_wmem_alloc),
- atomic_read(&sp->sk_rmem_alloc),
- 0, 0L, 0,
- sock_i_uid(sp), 0,
- sock_i_ino(sp),
- atomic_read(&sp->sk_refcnt), sp);
+ return register_pernet_subsys(&raw6_net_ops);
}
-static int raw6_seq_show(struct seq_file *seq, void *v)
+void raw6_proc_exit(void)
{
- if (v == SEQ_START_TOKEN)
- seq_printf(seq,
- " sl "
- "local_address "
- "remote_address "
- "st tx_queue rx_queue tr tm->when retrnsmt"
- " uid timeout inode\n");
- else
- raw6_sock_seq_show(seq, v, raw6_seq_private(seq)->bucket);
- return 0;
+ unregister_pernet_subsys(&raw6_net_ops);
}
+#endif /* CONFIG_PROC_FS */
-static struct seq_operations raw6_seq_ops = {
- .start = raw6_seq_start,
- .next = raw6_seq_next,
- .stop = raw6_seq_stop,
- .show = raw6_seq_show,
+/* Same as inet6_dgram_ops, sans udp_poll. */
+const struct proto_ops inet6_sockraw_ops = {
+ .family = PF_INET6,
+ .owner = THIS_MODULE,
+ .release = inet6_release,
+ .bind = inet6_bind,
+ .connect = inet_dgram_connect, /* ok */
+ .socketpair = sock_no_socketpair, /* a do nothing */
+ .accept = sock_no_accept, /* a do nothing */
+ .getname = inet6_getname,
+ .poll = datagram_poll, /* ok */
+ .ioctl = inet6_ioctl, /* must change */
+ .gettstamp = sock_gettstamp,
+ .listen = sock_no_listen, /* ok */
+ .shutdown = inet_shutdown, /* ok */
+ .setsockopt = sock_common_setsockopt, /* ok */
+ .getsockopt = sock_common_getsockopt, /* ok */
+ .sendmsg = inet_sendmsg, /* ok */
+ .recvmsg = sock_common_recvmsg, /* ok */
+ .mmap = sock_no_mmap,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = inet6_compat_ioctl,
+#endif
};
-static int raw6_seq_open(struct inode *inode, struct file *file)
-{
- struct seq_file *seq;
- int rc = -ENOMEM;
- struct raw6_iter_state *s = kzalloc(sizeof(*s), GFP_KERNEL);
- if (!s)
- goto out;
- rc = seq_open(file, &raw6_seq_ops);
- if (rc)
- goto out_kfree;
- seq = file->private_data;
- seq->private = s;
-out:
- return rc;
-out_kfree:
- kfree(s);
- goto out;
-}
-
-static struct file_operations raw6_seq_fops = {
- .owner = THIS_MODULE,
- .open = raw6_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release_private,
+static struct inet_protosw rawv6_protosw = {
+ .type = SOCK_RAW,
+ .protocol = IPPROTO_IP, /* wild card */
+ .prot = &rawv6_prot,
+ .ops = &inet6_sockraw_ops,
+ .flags = INET_PROTOSW_REUSE,
};
-int __init raw6_proc_init(void)
+int __init rawv6_init(void)
{
- if (!proc_net_fops_create("raw6", S_IRUGO, &raw6_seq_fops))
- return -ENOMEM;
- return 0;
+ return inet6_register_protosw(&rawv6_protosw);
}
-void raw6_proc_exit(void)
+void rawv6_exit(void)
{
- proc_net_remove("raw6");
+ inet6_unregister_protosw(&rawv6_protosw);
}
-#endif /* CONFIG_PROC_FS */
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 6f9a9046510f..25ec8001898d 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -1,22 +1,16 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* IPv6 fragment reassembly
- * Linux INET6 implementation
+ * Linux INET6 implementation
*
* Authors:
- * Pedro Roque <roque@di.fc.ul.pt>
- *
- * $Id: reassembly.c,v 1.26 2001/03/07 22:00:57 davem Exp $
+ * Pedro Roque <roque@di.fc.ul.pt>
*
* Based on: net/ipv4/ip_fragment.c
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
-/*
- * Fixes:
+/*
+ * Fixes:
* Andi Kleen Make it work with multiple hosts.
* More RFC compliance.
*
@@ -28,6 +22,9 @@
* YOSHIFUJI,H. @USAGI Always remove fragment header to
* calculate ICV correctly.
*/
+
+#define pr_fmt(fmt) "IPv6: " fmt
+
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/string.h>
@@ -42,6 +39,11 @@
#include <linux/icmpv6.h>
#include <linux/random.h>
#include <linux/jhash.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
#include <net/sock.h>
#include <net/snmp.h>
@@ -53,407 +55,105 @@
#include <net/rawv6.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
+#include <net/ipv6_frag.h>
+#include <net/inet_ecn.h>
-int sysctl_ip6frag_high_thresh __read_mostly = 256*1024;
-int sysctl_ip6frag_low_thresh __read_mostly = 192*1024;
-
-int sysctl_ip6frag_time __read_mostly = IPV6_FRAG_TIMEOUT;
-
-struct ip6frag_skb_cb
-{
- struct inet6_skb_parm h;
- int offset;
-};
-
-#define FRAG6_CB(skb) ((struct ip6frag_skb_cb*)((skb)->cb))
-
-
-/*
- * Equivalent of ipv4 struct ipq
- */
-
-struct frag_queue
-{
- struct hlist_node list;
- struct list_head lru_list; /* lru list member */
-
- __be32 id; /* fragment id */
- struct in6_addr saddr;
- struct in6_addr daddr;
-
- spinlock_t lock;
- atomic_t refcnt;
- struct timer_list timer; /* expire timer */
- struct sk_buff *fragments;
- int len;
- int meat;
- int iif;
- struct timeval stamp;
- unsigned int csum;
- __u8 last_in; /* has first/last segment arrived? */
-#define COMPLETE 4
-#define FIRST_IN 2
-#define LAST_IN 1
- __u16 nhoffset;
-};
-
-/* Hash table. */
-
-#define IP6Q_HASHSZ 64
+static const char ip6_frag_cache_name[] = "ip6-frags";
-static struct hlist_head ip6_frag_hash[IP6Q_HASHSZ];
-static DEFINE_RWLOCK(ip6_frag_lock);
-static u32 ip6_frag_hash_rnd;
-static LIST_HEAD(ip6_frag_lru_list);
-int ip6_frag_nqueues = 0;
-
-static __inline__ void __fq_unlink(struct frag_queue *fq)
+static u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h)
{
- hlist_del(&fq->list);
- list_del(&fq->lru_list);
- ip6_frag_nqueues--;
+ return 1 << (ipv6_get_dsfield(ipv6h) & INET_ECN_MASK);
}
-static __inline__ void fq_unlink(struct frag_queue *fq)
-{
- write_lock(&ip6_frag_lock);
- __fq_unlink(fq);
- write_unlock(&ip6_frag_lock);
-}
+static struct inet_frags ip6_frags;
-/*
- * callers should be careful not to use the hash value outside the ipfrag_lock
- * as doing so could race with ipfrag_hash_rnd being recalculated.
- */
-static unsigned int ip6qhashfn(__be32 id, struct in6_addr *saddr,
- struct in6_addr *daddr)
-{
- u32 a, b, c;
-
- a = (__force u32)saddr->s6_addr32[0];
- b = (__force u32)saddr->s6_addr32[1];
- c = (__force u32)saddr->s6_addr32[2];
-
- a += JHASH_GOLDEN_RATIO;
- b += JHASH_GOLDEN_RATIO;
- c += ip6_frag_hash_rnd;
- __jhash_mix(a, b, c);
+static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb,
+ struct sk_buff *prev_tail, struct net_device *dev,
+ int *refs);
- a += (__force u32)saddr->s6_addr32[3];
- b += (__force u32)daddr->s6_addr32[0];
- c += (__force u32)daddr->s6_addr32[1];
- __jhash_mix(a, b, c);
-
- a += (__force u32)daddr->s6_addr32[2];
- b += (__force u32)daddr->s6_addr32[3];
- c += (__force u32)id;
- __jhash_mix(a, b, c);
-
- return c & (IP6Q_HASHSZ - 1);
-}
-
-static struct timer_list ip6_frag_secret_timer;
-int sysctl_ip6frag_secret_interval __read_mostly = 10 * 60 * HZ;
-
-static void ip6_frag_secret_rebuild(unsigned long dummy)
-{
- unsigned long now = jiffies;
- int i;
-
- write_lock(&ip6_frag_lock);
- get_random_bytes(&ip6_frag_hash_rnd, sizeof(u32));
- for (i = 0; i < IP6Q_HASHSZ; i++) {
- struct frag_queue *q;
- struct hlist_node *p, *n;
-
- hlist_for_each_entry_safe(q, p, n, &ip6_frag_hash[i], list) {
- unsigned int hval = ip6qhashfn(q->id,
- &q->saddr,
- &q->daddr);
-
- if (hval != i) {
- hlist_del(&q->list);
-
- /* Relink to new hash chain. */
- hlist_add_head(&q->list,
- &ip6_frag_hash[hval]);
-
- }
- }
- }
- write_unlock(&ip6_frag_lock);
-
- mod_timer(&ip6_frag_secret_timer, now + sysctl_ip6frag_secret_interval);
-}
-
-atomic_t ip6_frag_mem = ATOMIC_INIT(0);
-
-/* Memory Tracking Functions. */
-static inline void frag_kfree_skb(struct sk_buff *skb, int *work)
-{
- if (work)
- *work -= skb->truesize;
- atomic_sub(skb->truesize, &ip6_frag_mem);
- kfree_skb(skb);
-}
-
-static inline void frag_free_queue(struct frag_queue *fq, int *work)
-{
- if (work)
- *work -= sizeof(struct frag_queue);
- atomic_sub(sizeof(struct frag_queue), &ip6_frag_mem);
- kfree(fq);
-}
-
-static inline struct frag_queue *frag_alloc_queue(void)
-{
- struct frag_queue *fq = kzalloc(sizeof(struct frag_queue), GFP_ATOMIC);
-
- if(!fq)
- return NULL;
- atomic_add(sizeof(struct frag_queue), &ip6_frag_mem);
- return fq;
-}
-
-/* Destruction primitives. */
-
-/* Complete destruction of fq. */
-static void ip6_frag_destroy(struct frag_queue *fq, int *work)
-{
- struct sk_buff *fp;
-
- BUG_TRAP(fq->last_in&COMPLETE);
- BUG_TRAP(del_timer(&fq->timer) == 0);
-
- /* Release all fragment data. */
- fp = fq->fragments;
- while (fp) {
- struct sk_buff *xp = fp->next;
-
- frag_kfree_skb(fp, work);
- fp = xp;
- }
-
- frag_free_queue(fq, work);
-}
-
-static __inline__ void fq_put(struct frag_queue *fq, int *work)
-{
- if (atomic_dec_and_test(&fq->refcnt))
- ip6_frag_destroy(fq, work);
-}
-
-/* Kill fq entry. It is not destroyed immediately,
- * because caller (and someone more) holds reference count.
- */
-static __inline__ void fq_kill(struct frag_queue *fq)
-{
- if (del_timer(&fq->timer))
- atomic_dec(&fq->refcnt);
-
- if (!(fq->last_in & COMPLETE)) {
- fq_unlink(fq);
- atomic_dec(&fq->refcnt);
- fq->last_in |= COMPLETE;
- }
-}
-
-static void ip6_evictor(struct inet6_dev *idev)
+static void ip6_frag_expire(struct timer_list *t)
{
+ struct inet_frag_queue *frag = timer_container_of(frag, t, timer);
struct frag_queue *fq;
- struct list_head *tmp;
- int work;
-
- work = atomic_read(&ip6_frag_mem) - sysctl_ip6frag_low_thresh;
- if (work <= 0)
- return;
-
- while(work > 0) {
- read_lock(&ip6_frag_lock);
- if (list_empty(&ip6_frag_lru_list)) {
- read_unlock(&ip6_frag_lock);
- return;
- }
- tmp = ip6_frag_lru_list.next;
- fq = list_entry(tmp, struct frag_queue, lru_list);
- atomic_inc(&fq->refcnt);
- read_unlock(&ip6_frag_lock);
-
- spin_lock(&fq->lock);
- if (!(fq->last_in&COMPLETE))
- fq_kill(fq);
- spin_unlock(&fq->lock);
-
- fq_put(fq, &work);
- IP6_INC_STATS_BH(idev, IPSTATS_MIB_REASMFAILS);
- }
-}
-
-static void ip6_frag_expire(unsigned long data)
-{
- struct frag_queue *fq = (struct frag_queue *) data;
- struct net_device *dev = NULL;
-
- spin_lock(&fq->lock);
-
- if (fq->last_in & COMPLETE)
- goto out;
-
- fq_kill(fq);
-
- dev = dev_get_by_index(fq->iif);
- if (!dev)
- goto out;
-
- rcu_read_lock();
- IP6_INC_STATS_BH(__in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT);
- IP6_INC_STATS_BH(__in6_dev_get(dev), IPSTATS_MIB_REASMFAILS);
- rcu_read_unlock();
-
- /* Don't send error if the first segment did not arrive. */
- if (!(fq->last_in&FIRST_IN) || !fq->fragments)
- goto out;
-
- /*
- But use as source device on which LAST ARRIVED
- segment was received. And do not use fq->dev
- pointer directly, device might already disappeared.
- */
- fq->fragments->dev = dev;
- icmpv6_send(fq->fragments, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0, dev);
-out:
- if (dev)
- dev_put(dev);
- spin_unlock(&fq->lock);
- fq_put(fq, NULL);
-}
-
-/* Creation primitives. */
+ fq = container_of(frag, struct frag_queue, q);
-static struct frag_queue *ip6_frag_intern(struct frag_queue *fq_in)
-{
- struct frag_queue *fq;
- unsigned int hash;
-#ifdef CONFIG_SMP
- struct hlist_node *n;
-#endif
-
- write_lock(&ip6_frag_lock);
- hash = ip6qhashfn(fq_in->id, &fq_in->saddr, &fq_in->daddr);
-#ifdef CONFIG_SMP
- hlist_for_each_entry(fq, n, &ip6_frag_hash[hash], list) {
- if (fq->id == fq_in->id &&
- ipv6_addr_equal(&fq_in->saddr, &fq->saddr) &&
- ipv6_addr_equal(&fq_in->daddr, &fq->daddr)) {
- atomic_inc(&fq->refcnt);
- write_unlock(&ip6_frag_lock);
- fq_in->last_in |= COMPLETE;
- fq_put(fq_in, NULL);
- return fq;
- }
- }
-#endif
- fq = fq_in;
-
- if (!mod_timer(&fq->timer, jiffies + sysctl_ip6frag_time))
- atomic_inc(&fq->refcnt);
-
- atomic_inc(&fq->refcnt);
- hlist_add_head(&fq->list, &ip6_frag_hash[hash]);
- INIT_LIST_HEAD(&fq->lru_list);
- list_add_tail(&fq->lru_list, &ip6_frag_lru_list);
- ip6_frag_nqueues++;
- write_unlock(&ip6_frag_lock);
- return fq;
+ ip6frag_expire_frag_queue(fq->q.fqdir->net, fq);
}
-
static struct frag_queue *
-ip6_frag_create(__be32 id, struct in6_addr *src, struct in6_addr *dst,
- struct inet6_dev *idev)
+fq_find(struct net *net, __be32 id, const struct ipv6hdr *hdr, int iif)
{
- struct frag_queue *fq;
-
- if ((fq = frag_alloc_queue()) == NULL)
- goto oom;
-
- fq->id = id;
- ipv6_addr_copy(&fq->saddr, src);
- ipv6_addr_copy(&fq->daddr, dst);
-
- init_timer(&fq->timer);
- fq->timer.function = ip6_frag_expire;
- fq->timer.data = (long) fq;
- spin_lock_init(&fq->lock);
- atomic_set(&fq->refcnt, 1);
-
- return ip6_frag_intern(fq);
+ struct frag_v6_compare_key key = {
+ .id = id,
+ .saddr = hdr->saddr,
+ .daddr = hdr->daddr,
+ .user = IP6_DEFRAG_LOCAL_DELIVER,
+ .iif = iif,
+ };
+ struct inet_frag_queue *q;
+
+ if (!(ipv6_addr_type(&hdr->daddr) & (IPV6_ADDR_MULTICAST |
+ IPV6_ADDR_LINKLOCAL)))
+ key.iif = 0;
+
+ q = inet_frag_find(net->ipv6.fqdir, &key);
+ if (!q)
+ return NULL;
-oom:
- IP6_INC_STATS_BH(idev, IPSTATS_MIB_REASMFAILS);
- return NULL;
+ return container_of(q, struct frag_queue, q);
}
-static __inline__ struct frag_queue *
-fq_find(__be32 id, struct in6_addr *src, struct in6_addr *dst,
- struct inet6_dev *idev)
+static int ip6_frag_queue(struct net *net,
+ struct frag_queue *fq, struct sk_buff *skb,
+ struct frag_hdr *fhdr, int nhoff,
+ u32 *prob_offset, int *refs)
{
- struct frag_queue *fq;
- struct hlist_node *n;
- unsigned int hash;
-
- read_lock(&ip6_frag_lock);
- hash = ip6qhashfn(id, src, dst);
- hlist_for_each_entry(fq, n, &ip6_frag_hash[hash], list) {
- if (fq->id == id &&
- ipv6_addr_equal(src, &fq->saddr) &&
- ipv6_addr_equal(dst, &fq->daddr)) {
- atomic_inc(&fq->refcnt);
- read_unlock(&ip6_frag_lock);
- return fq;
- }
- }
- read_unlock(&ip6_frag_lock);
-
- return ip6_frag_create(id, src, dst, idev);
-}
-
-
-static void ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
- struct frag_hdr *fhdr, int nhoff)
-{
- struct sk_buff *prev, *next;
- int offset, end;
-
- if (fq->last_in & COMPLETE)
+ int offset, end, fragsize;
+ struct sk_buff *prev_tail;
+ struct net_device *dev;
+ int err = -ENOENT;
+ SKB_DR(reason);
+ u8 ecn;
+
+ /* If reassembly is already done, @skb must be a duplicate frag. */
+ if (fq->q.flags & INET_FRAG_COMPLETE) {
+ SKB_DR_SET(reason, DUP_FRAG);
goto err;
+ }
+ err = -EINVAL;
offset = ntohs(fhdr->frag_off) & ~0x7;
- end = offset + (ntohs(skb->nh.ipv6h->payload_len) -
- ((u8 *) (fhdr + 1) - (u8 *) (skb->nh.ipv6h + 1)));
+ end = offset + (ntohs(ipv6_hdr(skb)->payload_len) -
+ ((u8 *)(fhdr + 1) - (u8 *)(ipv6_hdr(skb) + 1)));
if ((unsigned int)end > IPV6_MAXPLEN) {
- IP6_INC_STATS_BH(ip6_dst_idev(skb->dst),
- IPSTATS_MIB_INHDRERRORS);
- icmpv6_param_prob(skb,ICMPV6_HDR_FIELD, (u8*)&fhdr->frag_off - skb->nh.raw);
- return;
+ *prob_offset = (u8 *)&fhdr->frag_off - skb_network_header(skb);
+ /* note that if prob_offset is set, the skb is freed elsewhere,
+ * we do not free it here.
+ */
+ return -1;
}
- if (skb->ip_summed == CHECKSUM_COMPLETE)
- skb->csum = csum_sub(skb->csum,
- csum_partial(skb->nh.raw, (u8*)(fhdr+1)-skb->nh.raw, 0));
+ ecn = ip6_frag_ecn(ipv6_hdr(skb));
+
+ if (skb->ip_summed == CHECKSUM_COMPLETE) {
+ const unsigned char *nh = skb_network_header(skb);
+ skb->csum = csum_sub(skb->csum,
+ csum_partial(nh, (u8 *)(fhdr + 1) - nh,
+ 0));
+ }
/* Is this the final fragment? */
if (!(fhdr->frag_off & htons(IP6_MF))) {
/* If we already have some bits beyond end
* or have different end, the segment is corrupted.
*/
- if (end < fq->len ||
- ((fq->last_in & LAST_IN) && end != fq->len))
- goto err;
- fq->last_in |= LAST_IN;
- fq->len = end;
+ if (end < fq->q.len ||
+ ((fq->q.flags & INET_FRAG_LAST_IN) && end != fq->q.len))
+ goto discard_fq;
+ fq->q.flags |= INET_FRAG_LAST_IN;
+ fq->q.len = end;
} else {
/* Check if the fragment is rounded to 8 bytes.
* Required by the RFC.
@@ -462,314 +162,456 @@ static void ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
/* RFC2460 says always send parameter problem in
* this case. -DaveM
*/
- IP6_INC_STATS_BH(ip6_dst_idev(skb->dst),
- IPSTATS_MIB_INHDRERRORS);
- icmpv6_param_prob(skb, ICMPV6_HDR_FIELD,
- offsetof(struct ipv6hdr, payload_len));
- return;
+ *prob_offset = offsetof(struct ipv6hdr, payload_len);
+ return -1;
}
- if (end > fq->len) {
+ if (end > fq->q.len) {
/* Some bits beyond end -> corruption. */
- if (fq->last_in & LAST_IN)
- goto err;
- fq->len = end;
+ if (fq->q.flags & INET_FRAG_LAST_IN)
+ goto discard_fq;
+ fq->q.len = end;
}
}
if (end == offset)
- goto err;
+ goto discard_fq;
+ err = -ENOMEM;
/* Point into the IP datagram 'data' part. */
if (!pskb_pull(skb, (u8 *) (fhdr + 1) - skb->data))
- goto err;
-
- if (pskb_trim_rcsum(skb, end - offset))
- goto err;
+ goto discard_fq;
- /* Find out which fragments are in front and at the back of us
- * in the chain of fragments so far. We must know where to put
- * this fragment, right?
- */
- prev = NULL;
- for(next = fq->fragments; next != NULL; next = next->next) {
- if (FRAG6_CB(next)->offset >= offset)
- break; /* bingo! */
- prev = next;
- }
+ err = pskb_trim_rcsum(skb, end - offset);
+ if (err)
+ goto discard_fq;
- /* We found where to put this one. Check for overlap with
- * preceding fragment, and, if needed, align things so that
- * any overlaps are eliminated.
- */
- if (prev) {
- int i = (FRAG6_CB(prev)->offset + prev->len) - offset;
-
- if (i > 0) {
- offset += i;
- if (end <= offset)
- goto err;
- if (!pskb_pull(skb, i))
- goto err;
- if (skb->ip_summed != CHECKSUM_UNNECESSARY)
- skb->ip_summed = CHECKSUM_NONE;
- }
- }
+ /* Note : skb->rbnode and skb->dev share the same location. */
+ dev = skb->dev;
+ /* Makes sure compiler wont do silly aliasing games */
+ barrier();
- /* Look for overlap with succeeding segments.
- * If we can merge fragments, do it.
- */
- while (next && FRAG6_CB(next)->offset < end) {
- int i = end - FRAG6_CB(next)->offset; /* overlap is 'i' bytes */
-
- if (i < next->len) {
- /* Eat head of the next overlapped fragment
- * and leave the loop. The next ones cannot overlap.
- */
- if (!pskb_pull(next, i))
- goto err;
- FRAG6_CB(next)->offset += i; /* next fragment */
- fq->meat -= i;
- if (next->ip_summed != CHECKSUM_UNNECESSARY)
- next->ip_summed = CHECKSUM_NONE;
- break;
- } else {
- struct sk_buff *free_it = next;
-
- /* Old fragment is completely overridden with
- * new one drop it.
- */
- next = next->next;
-
- if (prev)
- prev->next = next;
- else
- fq->fragments = next;
-
- fq->meat -= free_it->len;
- frag_kfree_skb(free_it, NULL);
- }
- }
+ prev_tail = fq->q.fragments_tail;
+ err = inet_frag_queue_insert(&fq->q, skb, offset, end);
+ if (err)
+ goto insert_error;
- FRAG6_CB(skb)->offset = offset;
+ if (dev)
+ fq->iif = dev->ifindex;
- /* Insert this fragment in the chain of fragments. */
- skb->next = next;
- if (prev)
- prev->next = skb;
- else
- fq->fragments = skb;
+ fq->q.stamp = skb->tstamp;
+ fq->q.tstamp_type = skb->tstamp_type;
+ fq->q.meat += skb->len;
+ fq->ecn |= ecn;
+ add_frag_mem_limit(fq->q.fqdir, skb->truesize);
- if (skb->dev)
- fq->iif = skb->dev->ifindex;
- skb->dev = NULL;
- skb_get_timestamp(skb, &fq->stamp);
- fq->meat += skb->len;
- atomic_add(skb->truesize, &ip6_frag_mem);
+ fragsize = -skb_network_offset(skb) + skb->len;
+ if (fragsize > fq->q.max_size)
+ fq->q.max_size = fragsize;
/* The first fragment.
* nhoffset is obtained from the first fragment, of course.
*/
if (offset == 0) {
fq->nhoffset = nhoff;
- fq->last_in |= FIRST_IN;
+ fq->q.flags |= INET_FRAG_FIRST_IN;
}
- write_lock(&ip6_frag_lock);
- list_move_tail(&fq->lru_list, &ip6_frag_lru_list);
- write_unlock(&ip6_frag_lock);
- return;
+ if (fq->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
+ fq->q.meat == fq->q.len) {
+ unsigned long orefdst = skb->_skb_refdst;
+
+ skb->_skb_refdst = 0UL;
+ err = ip6_frag_reasm(fq, skb, prev_tail, dev, refs);
+ skb->_skb_refdst = orefdst;
+ return err;
+ }
+
+ skb_dst_drop(skb);
+ return -EINPROGRESS;
+
+insert_error:
+ if (err == IPFRAG_DUP) {
+ SKB_DR_SET(reason, DUP_FRAG);
+ err = -EINVAL;
+ goto err;
+ }
+ err = -EINVAL;
+ __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
+ IPSTATS_MIB_REASM_OVERLAPS);
+discard_fq:
+ inet_frag_kill(&fq->q, refs);
+ __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
+ IPSTATS_MIB_REASMFAILS);
err:
- IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_REASMFAILS);
- kfree_skb(skb);
+ kfree_skb_reason(skb, reason);
+ return err;
}
/*
* Check if this packet is complete.
- * Returns NULL on failure by any reason, and pointer
- * to current nexthdr field in reassembled frame.
*
* It is called with locked fq, and caller must check that
* queue is eligible for reassembly i.e. it is not COMPLETE,
* the last and the first frames arrived and all the bits are here.
*/
-static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff **skb_in,
- struct net_device *dev)
+static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb,
+ struct sk_buff *prev_tail, struct net_device *dev,
+ int *refs)
{
- struct sk_buff *fp, *head = fq->fragments;
- int payload_len;
+ struct net *net = fq->q.fqdir->net;
unsigned int nhoff;
+ void *reasm_data;
+ int payload_len;
+ u8 ecn;
- fq_kill(fq);
+ inet_frag_kill(&fq->q, refs);
- BUG_TRAP(head != NULL);
- BUG_TRAP(FRAG6_CB(head)->offset == 0);
-
- /* Unfragmented part is taken from the first segment. */
- payload_len = (head->data - head->nh.raw) - sizeof(struct ipv6hdr) + fq->len - sizeof(struct frag_hdr);
- if (payload_len > IPV6_MAXPLEN)
- goto out_oversize;
+ ecn = ip_frag_ecn_table[fq->ecn];
+ if (unlikely(ecn == 0xff))
+ goto out_fail;
- /* Head of list must not be cloned. */
- if (skb_cloned(head) && pskb_expand_head(head, 0, 0, GFP_ATOMIC))
+ reasm_data = inet_frag_reasm_prepare(&fq->q, skb, prev_tail);
+ if (!reasm_data)
goto out_oom;
- /* If the first fragment is fragmented itself, we split
- * it to two chunks: the first with data and paged part
- * and the second, holding only fragments. */
- if (skb_shinfo(head)->frag_list) {
- struct sk_buff *clone;
- int i, plen = 0;
-
- if ((clone = alloc_skb(0, GFP_ATOMIC)) == NULL)
- goto out_oom;
- clone->next = head->next;
- head->next = clone;
- skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
- skb_shinfo(head)->frag_list = NULL;
- for (i=0; i<skb_shinfo(head)->nr_frags; i++)
- plen += skb_shinfo(head)->frags[i].size;
- clone->len = clone->data_len = head->data_len - plen;
- head->data_len -= clone->len;
- head->len -= clone->len;
- clone->csum = 0;
- clone->ip_summed = head->ip_summed;
- atomic_add(clone->truesize, &ip6_frag_mem);
- }
+ payload_len = -skb_network_offset(skb) -
+ sizeof(struct ipv6hdr) + fq->q.len -
+ sizeof(struct frag_hdr);
+ if (payload_len > IPV6_MAXPLEN)
+ goto out_oversize;
/* We have to remove fragment header from datagram and to relocate
* header in order to calculate ICV correctly. */
nhoff = fq->nhoffset;
- head->nh.raw[nhoff] = head->h.raw[0];
- memmove(head->head + sizeof(struct frag_hdr), head->head,
- (head->data - head->head) - sizeof(struct frag_hdr));
- head->mac.raw += sizeof(struct frag_hdr);
- head->nh.raw += sizeof(struct frag_hdr);
-
- skb_shinfo(head)->frag_list = head->next;
- head->h.raw = head->data;
- skb_push(head, head->data - head->nh.raw);
- atomic_sub(head->truesize, &ip6_frag_mem);
-
- for (fp=head->next; fp; fp = fp->next) {
- head->data_len += fp->len;
- head->len += fp->len;
- if (head->ip_summed != fp->ip_summed)
- head->ip_summed = CHECKSUM_NONE;
- else if (head->ip_summed == CHECKSUM_COMPLETE)
- head->csum = csum_add(head->csum, fp->csum);
- head->truesize += fp->truesize;
- atomic_sub(fp->truesize, &ip6_frag_mem);
- }
+ skb_network_header(skb)[nhoff] = skb_transport_header(skb)[0];
+ memmove(skb->head + sizeof(struct frag_hdr), skb->head,
+ (skb->data - skb->head) - sizeof(struct frag_hdr));
+ if (skb_mac_header_was_set(skb))
+ skb->mac_header += sizeof(struct frag_hdr);
+ skb->network_header += sizeof(struct frag_hdr);
+
+ skb_reset_transport_header(skb);
- head->next = NULL;
- head->dev = dev;
- skb_set_timestamp(head, &fq->stamp);
- head->nh.ipv6h->payload_len = htons(payload_len);
- IP6CB(head)->nhoff = nhoff;
+ inet_frag_reasm_finish(&fq->q, skb, reasm_data, true);
- *skb_in = head;
+ skb->dev = dev;
+ ipv6_hdr(skb)->payload_len = htons(payload_len);
+ ipv6_change_dsfield(ipv6_hdr(skb), 0xff, ecn);
+ IP6CB(skb)->nhoff = nhoff;
+ IP6CB(skb)->flags |= IP6SKB_FRAGMENTED;
+ IP6CB(skb)->frag_max_size = fq->q.max_size;
/* Yes, and fold redundant checksum back. 8) */
- if (head->ip_summed == CHECKSUM_COMPLETE)
- head->csum = csum_partial(head->nh.raw, head->h.raw-head->nh.raw, head->csum);
+ skb_postpush_rcsum(skb, skb_network_header(skb),
+ skb_network_header_len(skb));
- rcu_read_lock();
- IP6_INC_STATS_BH(__in6_dev_get(dev), IPSTATS_MIB_REASMOKS);
- rcu_read_unlock();
- fq->fragments = NULL;
+ __IP6_INC_STATS(net, __in6_dev_stats_get(dev, skb), IPSTATS_MIB_REASMOKS);
+ fq->q.rb_fragments = RB_ROOT;
+ fq->q.fragments_tail = NULL;
+ fq->q.last_run_head = NULL;
return 1;
out_oversize:
- if (net_ratelimit())
- printk(KERN_DEBUG "ip6_frag_reasm: payload len = %d\n", payload_len);
+ net_dbg_ratelimited("ip6_frag_reasm: payload len = %d\n", payload_len);
goto out_fail;
out_oom:
- if (net_ratelimit())
- printk(KERN_DEBUG "ip6_frag_reasm: no memory for reassembly\n");
+ net_dbg_ratelimited("ip6_frag_reasm: no memory for reassembly\n");
out_fail:
- rcu_read_lock();
- IP6_INC_STATS_BH(__in6_dev_get(dev), IPSTATS_MIB_REASMFAILS);
- rcu_read_unlock();
+ __IP6_INC_STATS(net, __in6_dev_stats_get(dev, skb), IPSTATS_MIB_REASMFAILS);
+ inet_frag_kill(&fq->q, refs);
return -1;
}
-static int ipv6_frag_rcv(struct sk_buff **skbp)
+static int ipv6_frag_rcv(struct sk_buff *skb)
{
- struct sk_buff *skb = *skbp;
- struct net_device *dev = skb->dev;
+ const struct ipv6hdr *hdr = ipv6_hdr(skb);
+ struct net *net = skb_dst_dev_net(skb);
struct frag_hdr *fhdr;
struct frag_queue *fq;
- struct ipv6hdr *hdr;
+ u8 nexthdr;
+ int iif;
- hdr = skb->nh.ipv6h;
+ if (IP6CB(skb)->flags & IP6SKB_FRAGMENTED)
+ goto fail_hdr;
- IP6_INC_STATS_BH(ip6_dst_idev(skb->dst), IPSTATS_MIB_REASMREQDS);
+ __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMREQDS);
/* Jumbo payload inhibits frag. header */
- if (hdr->payload_len==0) {
- IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_INHDRERRORS);
- icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, skb->h.raw-skb->nh.raw);
- return -1;
+ if (hdr->payload_len == 0)
+ goto fail_hdr;
+
+ if (!pskb_may_pull(skb, (skb_transport_offset(skb) +
+ sizeof(struct frag_hdr))))
+ goto fail_hdr;
+
+ hdr = ipv6_hdr(skb);
+ fhdr = (struct frag_hdr *)skb_transport_header(skb);
+
+ if (!(fhdr->frag_off & htons(IP6_OFFSET | IP6_MF))) {
+ /* It is not a fragmented frame */
+ skb->transport_header += sizeof(struct frag_hdr);
+ __IP6_INC_STATS(net,
+ ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMOKS);
+
+ IP6CB(skb)->nhoff = (u8 *)fhdr - skb_network_header(skb);
+ IP6CB(skb)->flags |= IP6SKB_FRAGMENTED;
+ IP6CB(skb)->frag_max_size = ntohs(hdr->payload_len) +
+ sizeof(struct ipv6hdr);
+ return 1;
}
- if (!pskb_may_pull(skb, (skb->h.raw-skb->data)+sizeof(struct frag_hdr))) {
- IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_INHDRERRORS);
- icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, skb->h.raw-skb->nh.raw);
+
+ /* RFC 8200, Section 4.5 Fragment Header:
+ * If the first fragment does not include all headers through an
+ * Upper-Layer header, then that fragment should be discarded and
+ * an ICMP Parameter Problem, Code 3, message should be sent to
+ * the source of the fragment, with the Pointer field set to zero.
+ */
+ nexthdr = hdr->nexthdr;
+ if (ipv6frag_thdr_truncated(skb, skb_network_offset(skb) + sizeof(struct ipv6hdr), &nexthdr)) {
+ __IP6_INC_STATS(net, __in6_dev_get_safely(skb->dev),
+ IPSTATS_MIB_INHDRERRORS);
+ icmpv6_param_prob(skb, ICMPV6_HDR_INCOMP, 0);
return -1;
}
- hdr = skb->nh.ipv6h;
- fhdr = (struct frag_hdr *)skb->h.raw;
+ iif = skb->dev ? skb->dev->ifindex : 0;
+ rcu_read_lock();
+ fq = fq_find(net, fhdr->identification, hdr, iif);
+ if (fq) {
+ u32 prob_offset = 0;
+ int ret, refs = 0;
+
+ spin_lock(&fq->q.lock);
+
+ fq->iif = iif;
+ ret = ip6_frag_queue(net, fq, skb, fhdr, IP6CB(skb)->nhoff,
+ &prob_offset, &refs);
+
+ spin_unlock(&fq->q.lock);
+ rcu_read_unlock();
+ inet_frag_putn(&fq->q, refs);
+ if (prob_offset) {
+ __IP6_INC_STATS(net, __in6_dev_get_safely(skb->dev),
+ IPSTATS_MIB_INHDRERRORS);
+ /* icmpv6_param_prob() calls kfree_skb(skb) */
+ icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, prob_offset);
+ }
+ return ret;
+ }
+ rcu_read_unlock();
- if (!(fhdr->frag_off & htons(0xFFF9))) {
- /* It is not a fragmented frame */
- skb->h.raw += sizeof(struct frag_hdr);
- IP6_INC_STATS_BH(ip6_dst_idev(skb->dst), IPSTATS_MIB_REASMOKS);
+ __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMFAILS);
+ kfree_skb(skb);
+ return -1;
- IP6CB(skb)->nhoff = (u8*)fhdr - skb->nh.raw;
- return 1;
- }
+fail_hdr:
+ __IP6_INC_STATS(net, __in6_dev_get_safely(skb->dev),
+ IPSTATS_MIB_INHDRERRORS);
+ icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, skb_network_header_len(skb));
+ return -1;
+}
- if (atomic_read(&ip6_frag_mem) > sysctl_ip6frag_high_thresh)
- ip6_evictor(ip6_dst_idev(skb->dst));
+static const struct inet6_protocol frag_protocol = {
+ .handler = ipv6_frag_rcv,
+ .flags = INET6_PROTO_NOPOLICY,
+};
- if ((fq = fq_find(fhdr->identification, &hdr->saddr, &hdr->daddr,
- ip6_dst_idev(skb->dst))) != NULL) {
- int ret = -1;
+#ifdef CONFIG_SYSCTL
+
+static struct ctl_table ip6_frags_ns_ctl_table[] = {
+ {
+ .procname = "ip6frag_high_thresh",
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ },
+ {
+ .procname = "ip6frag_low_thresh",
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ },
+ {
+ .procname = "ip6frag_time",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+};
- spin_lock(&fq->lock);
+/* secret interval has been deprecated */
+static int ip6_frags_secret_interval_unused;
+static struct ctl_table ip6_frags_ctl_table[] = {
+ {
+ .procname = "ip6frag_secret_interval",
+ .data = &ip6_frags_secret_interval_unused,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+};
- ip6_frag_queue(fq, skb, fhdr, IP6CB(skb)->nhoff);
+static int __net_init ip6_frags_ns_sysctl_register(struct net *net)
+{
+ struct ctl_table *table;
+ struct ctl_table_header *hdr;
- if (fq->last_in == (FIRST_IN|LAST_IN) &&
- fq->meat == fq->len)
- ret = ip6_frag_reasm(fq, skbp, dev);
+ table = ip6_frags_ns_ctl_table;
+ if (!net_eq(net, &init_net)) {
+ table = kmemdup(table, sizeof(ip6_frags_ns_ctl_table), GFP_KERNEL);
+ if (!table)
+ goto err_alloc;
- spin_unlock(&fq->lock);
- fq_put(fq, NULL);
- return ret;
}
+ table[0].data = &net->ipv6.fqdir->high_thresh;
+ table[0].extra1 = &net->ipv6.fqdir->low_thresh;
+ table[1].data = &net->ipv6.fqdir->low_thresh;
+ table[1].extra2 = &net->ipv6.fqdir->high_thresh;
+ table[2].data = &net->ipv6.fqdir->timeout;
+
+ hdr = register_net_sysctl_sz(net, "net/ipv6", table,
+ ARRAY_SIZE(ip6_frags_ns_ctl_table));
+ if (!hdr)
+ goto err_reg;
+
+ net->ipv6.sysctl.frags_hdr = hdr;
+ return 0;
+
+err_reg:
+ if (!net_eq(net, &init_net))
+ kfree(table);
+err_alloc:
+ return -ENOMEM;
+}
- IP6_INC_STATS_BH(ip6_dst_idev(skb->dst), IPSTATS_MIB_REASMFAILS);
- kfree_skb(skb);
- return -1;
+static void __net_exit ip6_frags_ns_sysctl_unregister(struct net *net)
+{
+ const struct ctl_table *table;
+
+ table = net->ipv6.sysctl.frags_hdr->ctl_table_arg;
+ unregister_net_sysctl_table(net->ipv6.sysctl.frags_hdr);
+ if (!net_eq(net, &init_net))
+ kfree(table);
}
-static struct inet6_protocol frag_protocol =
+static struct ctl_table_header *ip6_ctl_header;
+
+static int ip6_frags_sysctl_register(void)
{
- .handler = ipv6_frag_rcv,
- .flags = INET6_PROTO_NOPOLICY,
+ ip6_ctl_header = register_net_sysctl(&init_net, "net/ipv6",
+ ip6_frags_ctl_table);
+ return ip6_ctl_header == NULL ? -ENOMEM : 0;
+}
+
+static void ip6_frags_sysctl_unregister(void)
+{
+ unregister_net_sysctl_table(ip6_ctl_header);
+}
+#else
+static int ip6_frags_ns_sysctl_register(struct net *net)
+{
+ return 0;
+}
+
+static void ip6_frags_ns_sysctl_unregister(struct net *net)
+{
+}
+
+static int ip6_frags_sysctl_register(void)
+{
+ return 0;
+}
+
+static void ip6_frags_sysctl_unregister(void)
+{
+}
+#endif
+
+static int __net_init ipv6_frags_init_net(struct net *net)
+{
+ int res;
+
+ res = fqdir_init(&net->ipv6.fqdir, &ip6_frags, net);
+ if (res < 0)
+ return res;
+
+ net->ipv6.fqdir->high_thresh = IPV6_FRAG_HIGH_THRESH;
+ net->ipv6.fqdir->low_thresh = IPV6_FRAG_LOW_THRESH;
+ net->ipv6.fqdir->timeout = IPV6_FRAG_TIMEOUT;
+
+ res = ip6_frags_ns_sysctl_register(net);
+ if (res < 0)
+ fqdir_exit(net->ipv6.fqdir);
+ return res;
+}
+
+static void __net_exit ipv6_frags_pre_exit_net(struct net *net)
+{
+ fqdir_pre_exit(net->ipv6.fqdir);
+}
+
+static void __net_exit ipv6_frags_exit_net(struct net *net)
+{
+ ip6_frags_ns_sysctl_unregister(net);
+ fqdir_exit(net->ipv6.fqdir);
+}
+
+static struct pernet_operations ip6_frags_ops = {
+ .init = ipv6_frags_init_net,
+ .pre_exit = ipv6_frags_pre_exit_net,
+ .exit = ipv6_frags_exit_net,
+};
+
+static const struct rhashtable_params ip6_rhash_params = {
+ .head_offset = offsetof(struct inet_frag_queue, node),
+ .hashfn = ip6frag_key_hashfn,
+ .obj_hashfn = ip6frag_obj_hashfn,
+ .obj_cmpfn = ip6frag_obj_cmpfn,
+ .automatic_shrinking = true,
};
-void __init ipv6_frag_init(void)
+int __init ipv6_frag_init(void)
{
- if (inet6_add_protocol(&frag_protocol, IPPROTO_FRAGMENT) < 0)
- printk(KERN_ERR "ipv6_frag_init: Could not register protocol\n");
+ int ret;
+
+ ip6_frags.constructor = ip6frag_init;
+ ip6_frags.destructor = NULL;
+ ip6_frags.qsize = sizeof(struct frag_queue);
+ ip6_frags.frag_expire = ip6_frag_expire;
+ ip6_frags.frags_cache_name = ip6_frag_cache_name;
+ ip6_frags.rhash_params = ip6_rhash_params;
+ ret = inet_frags_init(&ip6_frags);
+ if (ret)
+ goto out;
+
+ ret = inet6_add_protocol(&frag_protocol, IPPROTO_FRAGMENT);
+ if (ret)
+ goto err_protocol;
- ip6_frag_hash_rnd = (u32) ((num_physpages ^ (num_physpages>>7)) ^
- (jiffies ^ (jiffies >> 6)));
+ ret = ip6_frags_sysctl_register();
+ if (ret)
+ goto err_sysctl;
- init_timer(&ip6_frag_secret_timer);
- ip6_frag_secret_timer.function = ip6_frag_secret_rebuild;
- ip6_frag_secret_timer.expires = jiffies + sysctl_ip6frag_secret_interval;
- add_timer(&ip6_frag_secret_timer);
+ ret = register_pernet_subsys(&ip6_frags_ops);
+ if (ret)
+ goto err_pernet;
+
+out:
+ return ret;
+
+err_pernet:
+ ip6_frags_sysctl_unregister();
+err_sysctl:
+ inet6_del_protocol(&frag_protocol, IPPROTO_FRAGMENT);
+err_protocol:
+ inet_frags_fini(&ip6_frags);
+ goto out;
+}
+
+void ipv6_frag_exit(void)
+{
+ ip6_frags_sysctl_unregister();
+ unregister_pernet_subsys(&ip6_frags_ops);
+ inet6_del_protocol(&frag_protocol, IPPROTO_FRAGMENT);
+ inet_frags_fini(&ip6_frags);
}
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 9f80518aacbd..aee6a10b112a 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1,16 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Linux INET6 implementation
* FIB front-end.
*
* Authors:
- * Pedro Roque <roque@di.fc.ul.pt>
- *
- * $Id: route.c,v 1.56 2001/10/31 21:55:55 davem Exp $
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
+ * Pedro Roque <roque@di.fc.ul.pt>
*/
/* Changes:
@@ -26,8 +20,11 @@
* Fixed routing subtrees.
*/
+#define pr_fmt(fmt) "IPv6: " fmt
+
#include <linux/capability.h>
#include <linux/errno.h>
+#include <linux/export.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
@@ -36,14 +33,16 @@
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
+#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
-
-#ifdef CONFIG_PROC_FS
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
-#endif
-
+#include <linux/nsproxy.h>
+#include <linux/slab.h>
+#include <linux/jhash.h>
+#include <linux/siphash.h>
+#include <net/net_namespace.h>
#include <net/snmp.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
@@ -53,226 +52,610 @@
#include <net/tcp.h>
#include <linux/rtnetlink.h>
#include <net/dst.h>
+#include <net/dst_metadata.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/netlink.h>
-
-#include <asm/uaccess.h>
+#include <net/rtnh.h>
+#include <net/lwtunnel.h>
+#include <net/ip_tunnels.h>
+#include <net/l3mdev.h>
+#include <net/ip.h>
+#include <linux/uaccess.h>
+#include <linux/btf_ids.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
-/* Set to 3 to get tracing. */
-#define RT6_DEBUG 2
+static int ip6_rt_type_to_error(u8 fib6_type);
-#if RT6_DEBUG >= 3
-#define RDBG(x) printk x
-#define RT6_TRACE(x...) printk(KERN_DEBUG x)
-#else
-#define RDBG(x)
-#define RT6_TRACE(x...) do { ; } while (0)
-#endif
+#define CREATE_TRACE_POINTS
+#include <trace/events/fib6.h>
+EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
+#undef CREATE_TRACE_POINTS
-#define CLONE_OFFLINK_ROUTE 0
-
-static int ip6_rt_max_size = 4096;
-static int ip6_rt_gc_min_interval = HZ / 2;
-static int ip6_rt_gc_timeout = 60*HZ;
-int ip6_rt_gc_interval = 30*HZ;
-static int ip6_rt_gc_elasticity = 9;
-static int ip6_rt_mtu_expires = 10*60*HZ;
-static int ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
+enum rt6_nud_state {
+ RT6_NUD_FAIL_HARD = -3,
+ RT6_NUD_FAIL_PROBE = -2,
+ RT6_NUD_FAIL_DO_RR = -1,
+ RT6_NUD_SUCCEED = 1
+};
-static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
-static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
-static struct dst_entry *ip6_negative_advice(struct dst_entry *);
+INDIRECT_CALLABLE_SCOPE
+struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
+static unsigned int ip6_default_advmss(const struct dst_entry *dst);
+INDIRECT_CALLABLE_SCOPE
+unsigned int ip6_mtu(const struct dst_entry *dst);
+static void ip6_negative_advice(struct sock *sk,
+ struct dst_entry *dst);
static void ip6_dst_destroy(struct dst_entry *);
static void ip6_dst_ifdown(struct dst_entry *,
- struct net_device *dev, int how);
-static int ip6_dst_gc(void);
+ struct net_device *dev);
+static void ip6_dst_gc(struct dst_ops *ops);
static int ip6_pkt_discard(struct sk_buff *skb);
-static int ip6_pkt_discard_out(struct sk_buff *skb);
+static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
+static int ip6_pkt_prohibit(struct sk_buff *skb);
+static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void ip6_link_failure(struct sk_buff *skb);
-static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
+static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
+ struct sk_buff *skb, u32 mtu,
+ bool confirm_neigh);
+static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
+ struct sk_buff *skb);
+static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
+ int strict);
+static size_t rt6_nlmsg_size(struct fib6_info *f6i);
+static int rt6_fill_node(struct net *net, struct sk_buff *skb,
+ struct fib6_info *rt, struct dst_entry *dst,
+ struct in6_addr *dest, struct in6_addr *src,
+ int iif, int type, u32 portid, u32 seq,
+ unsigned int flags);
+static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
+ const struct in6_addr *daddr,
+ const struct in6_addr *saddr);
#ifdef CONFIG_IPV6_ROUTE_INFO
-static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
- struct in6_addr *gwaddr, int ifindex,
- unsigned pref);
-static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
- struct in6_addr *gwaddr, int ifindex);
+static struct fib6_info *rt6_add_route_info(struct net *net,
+ const struct in6_addr *prefix, int prefixlen,
+ const struct in6_addr *gwaddr,
+ struct net_device *dev,
+ unsigned int pref);
+static struct fib6_info *rt6_get_route_info(struct net *net,
+ const struct in6_addr *prefix, int prefixlen,
+ const struct in6_addr *gwaddr,
+ struct net_device *dev);
#endif
-static struct dst_ops ip6_dst_ops = {
+struct uncached_list {
+ spinlock_t lock;
+ struct list_head head;
+};
+
+static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
+
+void rt6_uncached_list_add(struct rt6_info *rt)
+{
+ struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
+
+ rt->dst.rt_uncached_list = ul;
+
+ spin_lock_bh(&ul->lock);
+ list_add_tail(&rt->dst.rt_uncached, &ul->head);
+ spin_unlock_bh(&ul->lock);
+}
+
+void rt6_uncached_list_del(struct rt6_info *rt)
+{
+ if (!list_empty(&rt->dst.rt_uncached)) {
+ struct uncached_list *ul = rt->dst.rt_uncached_list;
+
+ spin_lock_bh(&ul->lock);
+ list_del_init(&rt->dst.rt_uncached);
+ spin_unlock_bh(&ul->lock);
+ }
+}
+
+static void rt6_uncached_list_flush_dev(struct net_device *dev)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
+ struct rt6_info *rt, *safe;
+
+ if (list_empty(&ul->head))
+ continue;
+
+ spin_lock_bh(&ul->lock);
+ list_for_each_entry_safe(rt, safe, &ul->head, dst.rt_uncached) {
+ struct inet6_dev *rt_idev = rt->rt6i_idev;
+ struct net_device *rt_dev = rt->dst.dev;
+ bool handled = false;
+
+ if (rt_idev && rt_idev->dev == dev) {
+ rt->rt6i_idev = in6_dev_get(blackhole_netdev);
+ in6_dev_put(rt_idev);
+ handled = true;
+ }
+
+ if (rt_dev == dev) {
+ rt->dst.dev = blackhole_netdev;
+ netdev_ref_replace(rt_dev, blackhole_netdev,
+ &rt->dst.dev_tracker,
+ GFP_ATOMIC);
+ handled = true;
+ }
+ if (handled)
+ list_del_init(&rt->dst.rt_uncached);
+ }
+ spin_unlock_bh(&ul->lock);
+ }
+}
+
+static inline const void *choose_neigh_daddr(const struct in6_addr *p,
+ struct sk_buff *skb,
+ const void *daddr)
+{
+ if (!ipv6_addr_any(p))
+ return (const void *) p;
+ else if (skb)
+ return &ipv6_hdr(skb)->daddr;
+ return daddr;
+}
+
+struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
+ struct net_device *dev,
+ struct sk_buff *skb,
+ const void *daddr)
+{
+ struct neighbour *n;
+
+ daddr = choose_neigh_daddr(gw, skb, daddr);
+ n = __ipv6_neigh_lookup(dev, daddr);
+ if (n)
+ return n;
+
+ n = neigh_create(&nd_tbl, daddr, dev);
+ return IS_ERR(n) ? NULL : n;
+}
+
+static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
+ struct sk_buff *skb,
+ const void *daddr)
+{
+ const struct rt6_info *rt = dst_rt6_info(dst);
+
+ return ip6_neigh_lookup(rt6_nexthop(rt, &in6addr_any),
+ dst_dev(dst), skb, daddr);
+}
+
+static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
+{
+ const struct rt6_info *rt = dst_rt6_info(dst);
+ struct net_device *dev = dst_dev(dst);
+
+ daddr = choose_neigh_daddr(rt6_nexthop(rt, &in6addr_any), NULL, daddr);
+ if (!daddr)
+ return;
+ if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
+ return;
+ if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
+ return;
+ __ipv6_confirm_neigh(dev, daddr);
+}
+
+static struct dst_ops ip6_dst_ops_template = {
.family = AF_INET6,
- .protocol = __constant_htons(ETH_P_IPV6),
.gc = ip6_dst_gc,
.gc_thresh = 1024,
.check = ip6_dst_check,
+ .default_advmss = ip6_default_advmss,
+ .mtu = ip6_mtu,
+ .cow_metrics = dst_cow_metrics_generic,
.destroy = ip6_dst_destroy,
.ifdown = ip6_dst_ifdown,
.negative_advice = ip6_negative_advice,
.link_failure = ip6_link_failure,
.update_pmtu = ip6_rt_update_pmtu,
- .entry_size = sizeof(struct rt6_info),
+ .redirect = rt6_do_redirect,
+ .local_out = __ip6_local_out,
+ .neigh_lookup = ip6_dst_neigh_lookup,
+ .confirm_neigh = ip6_confirm_neigh,
};
-struct rt6_info ip6_null_entry = {
- .u = {
- .dst = {
- .__refcnt = ATOMIC_INIT(1),
- .__use = 1,
- .dev = &loopback_dev,
- .obsolete = -1,
- .error = -ENETUNREACH,
- .metrics = { [RTAX_HOPLIMIT - 1] = 255, },
- .input = ip6_pkt_discard,
- .output = ip6_pkt_discard_out,
- .ops = &ip6_dst_ops,
- .path = (struct dst_entry*)&ip6_null_entry,
- }
+static struct dst_ops ip6_dst_blackhole_ops = {
+ .family = AF_INET6,
+ .default_advmss = ip6_default_advmss,
+ .neigh_lookup = ip6_dst_neigh_lookup,
+ .check = ip6_dst_check,
+ .destroy = ip6_dst_destroy,
+ .cow_metrics = dst_cow_metrics_generic,
+ .update_pmtu = dst_blackhole_update_pmtu,
+ .redirect = dst_blackhole_redirect,
+ .mtu = dst_blackhole_mtu,
+};
+
+static const u32 ip6_template_metrics[RTAX_MAX] = {
+ [RTAX_HOPLIMIT - 1] = 0,
+};
+
+static const struct fib6_info fib6_null_entry_template = {
+ .fib6_flags = (RTF_REJECT | RTF_NONEXTHOP),
+ .fib6_protocol = RTPROT_KERNEL,
+ .fib6_metric = ~(u32)0,
+ .fib6_ref = REFCOUNT_INIT(1),
+ .fib6_type = RTN_UNREACHABLE,
+ .fib6_metrics = (struct dst_metrics *)&dst_default_metrics,
+};
+
+static const struct rt6_info ip6_null_entry_template = {
+ .dst = {
+ .__rcuref = RCUREF_INIT(1),
+ .__use = 1,
+ .obsolete = DST_OBSOLETE_FORCE_CHK,
+ .error = -ENETUNREACH,
+ .input = ip6_pkt_discard,
+ .output = ip6_pkt_discard_out,
},
.rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
- .rt6i_metric = ~(u32) 0,
- .rt6i_ref = ATOMIC_INIT(1),
};
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
-static int ip6_pkt_prohibit(struct sk_buff *skb);
-static int ip6_pkt_prohibit_out(struct sk_buff *skb);
-static int ip6_pkt_blk_hole(struct sk_buff *skb);
-
-struct rt6_info ip6_prohibit_entry = {
- .u = {
- .dst = {
- .__refcnt = ATOMIC_INIT(1),
- .__use = 1,
- .dev = &loopback_dev,
- .obsolete = -1,
- .error = -EACCES,
- .metrics = { [RTAX_HOPLIMIT - 1] = 255, },
- .input = ip6_pkt_prohibit,
- .output = ip6_pkt_prohibit_out,
- .ops = &ip6_dst_ops,
- .path = (struct dst_entry*)&ip6_prohibit_entry,
- }
+static const struct rt6_info ip6_prohibit_entry_template = {
+ .dst = {
+ .__rcuref = RCUREF_INIT(1),
+ .__use = 1,
+ .obsolete = DST_OBSOLETE_FORCE_CHK,
+ .error = -EACCES,
+ .input = ip6_pkt_prohibit,
+ .output = ip6_pkt_prohibit_out,
},
.rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
- .rt6i_metric = ~(u32) 0,
- .rt6i_ref = ATOMIC_INIT(1),
};
-struct rt6_info ip6_blk_hole_entry = {
- .u = {
- .dst = {
- .__refcnt = ATOMIC_INIT(1),
- .__use = 1,
- .dev = &loopback_dev,
- .obsolete = -1,
- .error = -EINVAL,
- .metrics = { [RTAX_HOPLIMIT - 1] = 255, },
- .input = ip6_pkt_blk_hole,
- .output = ip6_pkt_blk_hole,
- .ops = &ip6_dst_ops,
- .path = (struct dst_entry*)&ip6_blk_hole_entry,
- }
+static const struct rt6_info ip6_blk_hole_entry_template = {
+ .dst = {
+ .__rcuref = RCUREF_INIT(1),
+ .__use = 1,
+ .obsolete = DST_OBSOLETE_FORCE_CHK,
+ .error = -EINVAL,
+ .input = dst_discard,
+ .output = dst_discard_out,
},
.rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
- .rt6i_metric = ~(u32) 0,
- .rt6i_ref = ATOMIC_INIT(1),
};
#endif
+static void rt6_info_init(struct rt6_info *rt)
+{
+ memset_after(rt, 0, dst);
+}
+
/* allocate dst with ip6_dst_ops */
-static __inline__ struct rt6_info *ip6_dst_alloc(void)
+struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
+ int flags)
{
- return (struct rt6_info *)dst_alloc(&ip6_dst_ops);
+ struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
+ DST_OBSOLETE_FORCE_CHK, flags);
+
+ if (rt) {
+ rt6_info_init(rt);
+ atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
+ }
+
+ return rt;
}
+EXPORT_SYMBOL(ip6_dst_alloc);
static void ip6_dst_destroy(struct dst_entry *dst)
{
- struct rt6_info *rt = (struct rt6_info *)dst;
- struct inet6_dev *idev = rt->rt6i_idev;
+ struct rt6_info *rt = dst_rt6_info(dst);
+ struct fib6_info *from;
+ struct inet6_dev *idev;
- if (idev != NULL) {
+ ip_dst_metrics_put(dst);
+ rt6_uncached_list_del(rt);
+
+ idev = rt->rt6i_idev;
+ if (idev) {
rt->rt6i_idev = NULL;
in6_dev_put(idev);
- }
+ }
+
+ from = unrcu_pointer(xchg(&rt->from, NULL));
+ fib6_info_release(from);
}
-static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
- int how)
+static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev)
{
- struct rt6_info *rt = (struct rt6_info *)dst;
+ struct rt6_info *rt = dst_rt6_info(dst);
struct inet6_dev *idev = rt->rt6i_idev;
+ struct fib6_info *from;
- if (dev != &loopback_dev && idev != NULL && idev->dev == dev) {
- struct inet6_dev *loopback_idev = in6_dev_get(&loopback_dev);
- if (loopback_idev != NULL) {
- rt->rt6i_idev = loopback_idev;
+ if (idev && idev->dev != blackhole_netdev) {
+ struct inet6_dev *blackhole_idev = in6_dev_get(blackhole_netdev);
+
+ if (blackhole_idev) {
+ rt->rt6i_idev = blackhole_idev;
in6_dev_put(idev);
}
}
+ from = unrcu_pointer(xchg(&rt->from, NULL));
+ fib6_info_release(from);
+}
+
+static bool __rt6_check_expired(const struct rt6_info *rt)
+{
+ if (rt->rt6i_flags & RTF_EXPIRES)
+ return time_after(jiffies, READ_ONCE(rt->dst.expires));
+ return false;
+}
+
+static bool rt6_check_expired(const struct rt6_info *rt)
+{
+ struct fib6_info *from;
+
+ from = rcu_dereference(rt->from);
+
+ if (rt->rt6i_flags & RTF_EXPIRES) {
+ if (time_after(jiffies, READ_ONCE(rt->dst.expires)))
+ return true;
+ } else if (from) {
+ return READ_ONCE(rt->dst.obsolete) != DST_OBSOLETE_FORCE_CHK ||
+ fib6_check_expired(from);
+ }
+ return false;
}
-static __inline__ int rt6_check_expired(const struct rt6_info *rt)
+static struct fib6_info *
+rt6_multipath_first_sibling_rcu(const struct fib6_info *rt)
{
- return (rt->rt6i_flags & RTF_EXPIRES &&
- time_after(jiffies, rt->rt6i_expires));
+ struct fib6_info *iter;
+ struct fib6_node *fn;
+
+ fn = rcu_dereference(rt->fib6_node);
+ if (!fn)
+ goto out;
+ iter = rcu_dereference(fn->leaf);
+ if (!iter)
+ goto out;
+
+ while (iter) {
+ if (iter->fib6_metric == rt->fib6_metric &&
+ rt6_qualify_for_ecmp(iter))
+ return iter;
+ iter = rcu_dereference(iter->fib6_next);
+ }
+
+out:
+ return NULL;
}
-static inline int rt6_need_strict(struct in6_addr *daddr)
+void fib6_select_path(const struct net *net, struct fib6_result *res,
+ struct flowi6 *fl6, int oif, bool have_oif_match,
+ const struct sk_buff *skb, int strict)
{
- return (ipv6_addr_type(daddr) &
- (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL));
+ struct fib6_info *first, *match = res->f6i;
+ struct fib6_info *sibling;
+ int hash;
+
+ if (!match->nh && (!match->fib6_nsiblings || have_oif_match))
+ goto out;
+
+ if (match->nh && have_oif_match && res->nh)
+ return;
+
+ if (skb)
+ IP6CB(skb)->flags |= IP6SKB_MULTIPATH;
+
+ /* We might have already computed the hash for ICMPv6 errors. In such
+ * case it will always be non-zero. Otherwise now is the time to do it.
+ */
+ if (!fl6->mp_hash &&
+ (!match->nh || nexthop_is_multipath(match->nh)))
+ fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
+
+ if (unlikely(match->nh)) {
+ nexthop_path_fib6_result(res, fl6->mp_hash);
+ return;
+ }
+
+ first = rt6_multipath_first_sibling_rcu(match);
+ if (!first)
+ goto out;
+
+ hash = fl6->mp_hash;
+ if (hash <= atomic_read(&first->fib6_nh->fib_nh_upper_bound)) {
+ if (rt6_score_route(first->fib6_nh, first->fib6_flags, oif,
+ strict) >= 0)
+ match = first;
+ goto out;
+ }
+
+ list_for_each_entry_rcu(sibling, &first->fib6_siblings,
+ fib6_siblings) {
+ const struct fib6_nh *nh = sibling->fib6_nh;
+ int nh_upper_bound;
+
+ nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound);
+ if (hash > nh_upper_bound)
+ continue;
+ if (rt6_score_route(nh, sibling->fib6_flags, oif, strict) < 0)
+ break;
+ match = sibling;
+ break;
+ }
+
+out:
+ res->f6i = match;
+ res->nh = match->fib6_nh;
}
/*
- * Route lookup. Any table->tb6_lock is implied.
+ * Route lookup. rcu_read_lock() should be held.
*/
-static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt,
- int oif,
- int strict)
+static bool __rt6_device_match(struct net *net, const struct fib6_nh *nh,
+ const struct in6_addr *saddr, int oif, int flags)
{
- struct rt6_info *local = NULL;
- struct rt6_info *sprt;
+ const struct net_device *dev;
+ if (nh->fib_nh_flags & RTNH_F_DEAD)
+ return false;
+
+ dev = nh->fib_nh_dev;
if (oif) {
- for (sprt = rt; sprt; sprt = sprt->u.next) {
- struct net_device *dev = sprt->rt6i_dev;
- if (dev->ifindex == oif)
- return sprt;
- if (dev->flags & IFF_LOOPBACK) {
- if (sprt->rt6i_idev == NULL ||
- sprt->rt6i_idev->dev->ifindex != oif) {
- if (strict && oif)
- continue;
- if (local && (!oif ||
- local->rt6i_idev->dev->ifindex == oif))
- continue;
- }
- local = sprt;
- }
+ if (dev->ifindex == oif)
+ return true;
+ } else {
+ if (ipv6_chk_addr(net, saddr, dev,
+ flags & RT6_LOOKUP_F_IFACE))
+ return true;
+ }
+
+ return false;
+}
+
+struct fib6_nh_dm_arg {
+ struct net *net;
+ const struct in6_addr *saddr;
+ int oif;
+ int flags;
+ struct fib6_nh *nh;
+};
+
+static int __rt6_nh_dev_match(struct fib6_nh *nh, void *_arg)
+{
+ struct fib6_nh_dm_arg *arg = _arg;
+
+ arg->nh = nh;
+ return __rt6_device_match(arg->net, nh, arg->saddr, arg->oif,
+ arg->flags);
+}
+
+/* returns fib6_nh from nexthop or NULL */
+static struct fib6_nh *rt6_nh_dev_match(struct net *net, struct nexthop *nh,
+ struct fib6_result *res,
+ const struct in6_addr *saddr,
+ int oif, int flags)
+{
+ struct fib6_nh_dm_arg arg = {
+ .net = net,
+ .saddr = saddr,
+ .oif = oif,
+ .flags = flags,
+ };
+
+ if (nexthop_is_blackhole(nh))
+ return NULL;
+
+ if (nexthop_for_each_fib6_nh(nh, __rt6_nh_dev_match, &arg))
+ return arg.nh;
+
+ return NULL;
+}
+
+static void rt6_device_match(struct net *net, struct fib6_result *res,
+ const struct in6_addr *saddr, int oif, int flags)
+{
+ struct fib6_info *f6i = res->f6i;
+ struct fib6_info *spf6i;
+ struct fib6_nh *nh;
+
+ if (!oif && ipv6_addr_any(saddr)) {
+ if (unlikely(f6i->nh)) {
+ nh = nexthop_fib6_nh(f6i->nh);
+ if (nexthop_is_blackhole(f6i->nh))
+ goto out_blackhole;
+ } else {
+ nh = f6i->fib6_nh;
+ }
+ if (!(nh->fib_nh_flags & RTNH_F_DEAD))
+ goto out;
+ }
+
+ for (spf6i = f6i; spf6i; spf6i = rcu_dereference(spf6i->fib6_next)) {
+ bool matched = false;
+
+ if (unlikely(spf6i->nh)) {
+ nh = rt6_nh_dev_match(net, spf6i->nh, res, saddr,
+ oif, flags);
+ if (nh)
+ matched = true;
+ } else {
+ nh = spf6i->fib6_nh;
+ if (__rt6_device_match(net, nh, saddr, oif, flags))
+ matched = true;
}
+ if (matched) {
+ res->f6i = spf6i;
+ goto out;
+ }
+ }
- if (local)
- return local;
+ if (oif && flags & RT6_LOOKUP_F_IFACE) {
+ res->f6i = net->ipv6.fib6_null_entry;
+ nh = res->f6i->fib6_nh;
+ goto out;
+ }
- if (strict)
- return &ip6_null_entry;
+ if (unlikely(f6i->nh)) {
+ nh = nexthop_fib6_nh(f6i->nh);
+ if (nexthop_is_blackhole(f6i->nh))
+ goto out_blackhole;
+ } else {
+ nh = f6i->fib6_nh;
}
- return rt;
+
+ if (nh->fib_nh_flags & RTNH_F_DEAD) {
+ res->f6i = net->ipv6.fib6_null_entry;
+ nh = res->f6i->fib6_nh;
+ }
+out:
+ res->nh = nh;
+ res->fib6_type = res->f6i->fib6_type;
+ res->fib6_flags = res->f6i->fib6_flags;
+ return;
+
+out_blackhole:
+ res->fib6_flags |= RTF_REJECT;
+ res->fib6_type = RTN_BLACKHOLE;
+ res->nh = nh;
}
#ifdef CONFIG_IPV6_ROUTER_PREF
-static void rt6_probe(struct rt6_info *rt)
+struct __rt6_probe_work {
+ struct work_struct work;
+ struct in6_addr target;
+ struct net_device *dev;
+ netdevice_tracker dev_tracker;
+};
+
+static void rt6_probe_deferred(struct work_struct *w)
+{
+ struct in6_addr mcaddr;
+ struct __rt6_probe_work *work =
+ container_of(w, struct __rt6_probe_work, work);
+
+ addrconf_addr_solict_mult(&work->target, &mcaddr);
+ ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
+ netdev_put(work->dev, &work->dev_tracker);
+ kfree(work);
+}
+
+static void rt6_probe(struct fib6_nh *fib6_nh)
{
- struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
+ struct __rt6_probe_work *work = NULL;
+ const struct in6_addr *nh_gw;
+ unsigned long last_probe;
+ struct neighbour *neigh;
+ struct net_device *dev;
+ struct inet6_dev *idev;
+
/*
* Okay, this does not seem to be appropriate
* for now, however, we need to check if it
@@ -281,141 +664,315 @@ static void rt6_probe(struct rt6_info *rt)
* Router Reachability Probe MUST be rate-limited
* to no more than one per minute.
*/
- if (!neigh || (neigh->nud_state & NUD_VALID))
+ if (!fib6_nh->fib_nh_gw_family)
return;
- read_lock_bh(&neigh->lock);
- if (!(neigh->nud_state & NUD_VALID) &&
- time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
- struct in6_addr mcaddr;
- struct in6_addr *target;
- neigh->updated = jiffies;
- read_unlock_bh(&neigh->lock);
+ nh_gw = &fib6_nh->fib_nh_gw6;
+ dev = fib6_nh->fib_nh_dev;
+ rcu_read_lock();
+ last_probe = READ_ONCE(fib6_nh->last_probe);
+ idev = __in6_dev_get(dev);
+ if (!idev)
+ goto out;
+ neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
+ if (neigh) {
+ if (READ_ONCE(neigh->nud_state) & NUD_VALID)
+ goto out;
+
+ write_lock_bh(&neigh->lock);
+ if (!(neigh->nud_state & NUD_VALID) &&
+ time_after(jiffies,
+ neigh->updated +
+ READ_ONCE(idev->cnf.rtr_probe_interval))) {
+ work = kmalloc(sizeof(*work), GFP_ATOMIC);
+ if (work)
+ __neigh_set_probe_once(neigh);
+ }
+ write_unlock_bh(&neigh->lock);
+ } else if (time_after(jiffies, last_probe +
+ READ_ONCE(idev->cnf.rtr_probe_interval))) {
+ work = kmalloc(sizeof(*work), GFP_ATOMIC);
+ }
- target = (struct in6_addr *)&neigh->primary_key;
- addrconf_addr_solict_mult(target, &mcaddr);
- ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
- } else
- read_unlock_bh(&neigh->lock);
+ if (!work || cmpxchg(&fib6_nh->last_probe,
+ last_probe, jiffies) != last_probe) {
+ kfree(work);
+ } else {
+ INIT_WORK(&work->work, rt6_probe_deferred);
+ work->target = *nh_gw;
+ netdev_hold(dev, &work->dev_tracker, GFP_ATOMIC);
+ work->dev = dev;
+ schedule_work(&work->work);
+ }
+
+out:
+ rcu_read_unlock();
}
#else
-static inline void rt6_probe(struct rt6_info *rt)
+static inline void rt6_probe(struct fib6_nh *fib6_nh)
{
- return;
}
#endif
/*
* Default Router Selection (RFC 2461 6.3.6)
*/
-static int inline rt6_check_dev(struct rt6_info *rt, int oif)
+static enum rt6_nud_state rt6_check_neigh(const struct fib6_nh *fib6_nh)
{
- struct net_device *dev = rt->rt6i_dev;
- if (!oif || dev->ifindex == oif)
- return 2;
- if ((dev->flags & IFF_LOOPBACK) &&
- rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
- return 1;
- return 0;
-}
+ enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
+ struct neighbour *neigh;
-static int inline rt6_check_neigh(struct rt6_info *rt)
-{
- struct neighbour *neigh = rt->rt6i_nexthop;
- int m = 0;
- if (rt->rt6i_flags & RTF_NONEXTHOP ||
- !(rt->rt6i_flags & RTF_GATEWAY))
- m = 1;
- else if (neigh) {
- read_lock_bh(&neigh->lock);
- if (neigh->nud_state & NUD_VALID)
- m = 2;
- else if (!(neigh->nud_state & NUD_FAILED))
- m = 1;
- read_unlock_bh(&neigh->lock);
+ rcu_read_lock();
+ neigh = __ipv6_neigh_lookup_noref(fib6_nh->fib_nh_dev,
+ &fib6_nh->fib_nh_gw6);
+ if (neigh) {
+ u8 nud_state = READ_ONCE(neigh->nud_state);
+
+ if (nud_state & NUD_VALID)
+ ret = RT6_NUD_SUCCEED;
+#ifdef CONFIG_IPV6_ROUTER_PREF
+ else if (!(nud_state & NUD_FAILED))
+ ret = RT6_NUD_SUCCEED;
+ else
+ ret = RT6_NUD_FAIL_PROBE;
+#endif
+ } else {
+ ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
+ RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
}
- return m;
+ rcu_read_unlock();
+
+ return ret;
}
-static int rt6_score_route(struct rt6_info *rt, int oif,
+static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
int strict)
{
- int m, n;
-
- m = rt6_check_dev(rt, oif);
+ int m = 0;
+
+ if (!oif || nh->fib_nh_dev->ifindex == oif)
+ m = 2;
+
if (!m && (strict & RT6_LOOKUP_F_IFACE))
- return -1;
+ return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
- m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
+ m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(fib6_flags)) << 2;
#endif
- n = rt6_check_neigh(rt);
- if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
- return -1;
+ if ((strict & RT6_LOOKUP_F_REACHABLE) &&
+ !(fib6_flags & RTF_NONEXTHOP) && nh->fib_nh_gw_family) {
+ int n = rt6_check_neigh(nh);
+ if (n < 0)
+ return n;
+ }
return m;
}
-static struct rt6_info *rt6_select(struct rt6_info **head, int oif,
- int strict)
+static bool find_match(struct fib6_nh *nh, u32 fib6_flags,
+ int oif, int strict, int *mpri, bool *do_rr)
{
- struct rt6_info *match = NULL, *last = NULL;
- struct rt6_info *rt, *rt0 = *head;
- u32 metric;
- int mpri = -1;
+ bool match_do_rr = false;
+ bool rc = false;
+ int m;
- RT6_TRACE("%s(head=%p(*head=%p), oif=%d)\n",
- __FUNCTION__, head, head ? *head : NULL, oif);
+ if (nh->fib_nh_flags & RTNH_F_DEAD)
+ goto out;
- for (rt = rt0, metric = rt0->rt6i_metric;
- rt && rt->rt6i_metric == metric && (!last || rt != rt0);
- rt = rt->u.next) {
- int m;
+ if (ip6_ignore_linkdown(nh->fib_nh_dev) &&
+ nh->fib_nh_flags & RTNH_F_LINKDOWN &&
+ !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
+ goto out;
- if (rt6_check_expired(rt))
- continue;
+ m = rt6_score_route(nh, fib6_flags, oif, strict);
+ if (m == RT6_NUD_FAIL_DO_RR) {
+ match_do_rr = true;
+ m = 0; /* lowest valid score */
+ } else if (m == RT6_NUD_FAIL_HARD) {
+ goto out;
+ }
+
+ if (strict & RT6_LOOKUP_F_REACHABLE)
+ rt6_probe(nh);
+
+ /* note that m can be RT6_NUD_FAIL_PROBE at this point */
+ if (m > *mpri) {
+ *do_rr = match_do_rr;
+ *mpri = m;
+ rc = true;
+ }
+out:
+ return rc;
+}
+
+struct fib6_nh_frl_arg {
+ u32 flags;
+ int oif;
+ int strict;
+ int *mpri;
+ bool *do_rr;
+ struct fib6_nh *nh;
+};
+
+static int rt6_nh_find_match(struct fib6_nh *nh, void *_arg)
+{
+ struct fib6_nh_frl_arg *arg = _arg;
+
+ arg->nh = nh;
+ return find_match(nh, arg->flags, arg->oif, arg->strict,
+ arg->mpri, arg->do_rr);
+}
+
+static void __find_rr_leaf(struct fib6_info *f6i_start,
+ struct fib6_info *nomatch, u32 metric,
+ struct fib6_result *res, struct fib6_info **cont,
+ int oif, int strict, bool *do_rr, int *mpri)
+{
+ struct fib6_info *f6i;
- last = rt;
+ for (f6i = f6i_start;
+ f6i && f6i != nomatch;
+ f6i = rcu_dereference(f6i->fib6_next)) {
+ bool matched = false;
+ struct fib6_nh *nh;
- m = rt6_score_route(rt, oif, strict);
- if (m < 0)
+ if (cont && f6i->fib6_metric != metric) {
+ *cont = f6i;
+ return;
+ }
+
+ if (fib6_check_expired(f6i))
continue;
- if (m > mpri) {
- if (strict & RT6_LOOKUP_F_REACHABLE)
- rt6_probe(match);
- match = rt;
- mpri = m;
- } else if (strict & RT6_LOOKUP_F_REACHABLE) {
- rt6_probe(rt);
+ if (unlikely(f6i->nh)) {
+ struct fib6_nh_frl_arg arg = {
+ .flags = f6i->fib6_flags,
+ .oif = oif,
+ .strict = strict,
+ .mpri = mpri,
+ .do_rr = do_rr
+ };
+
+ if (nexthop_is_blackhole(f6i->nh)) {
+ res->fib6_flags = RTF_REJECT;
+ res->fib6_type = RTN_BLACKHOLE;
+ res->f6i = f6i;
+ res->nh = nexthop_fib6_nh(f6i->nh);
+ return;
+ }
+ if (nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_find_match,
+ &arg)) {
+ matched = true;
+ nh = arg.nh;
+ }
+ } else {
+ nh = f6i->fib6_nh;
+ if (find_match(nh, f6i->fib6_flags, oif, strict,
+ mpri, do_rr))
+ matched = true;
+ }
+ if (matched) {
+ res->f6i = f6i;
+ res->nh = nh;
+ res->fib6_flags = f6i->fib6_flags;
+ res->fib6_type = f6i->fib6_type;
}
}
+}
+
+static void find_rr_leaf(struct fib6_node *fn, struct fib6_info *leaf,
+ struct fib6_info *rr_head, int oif, int strict,
+ bool *do_rr, struct fib6_result *res)
+{
+ u32 metric = rr_head->fib6_metric;
+ struct fib6_info *cont = NULL;
+ int mpri = -1;
+
+ __find_rr_leaf(rr_head, NULL, metric, res, &cont,
+ oif, strict, do_rr, &mpri);
+
+ __find_rr_leaf(leaf, rr_head, metric, res, &cont,
+ oif, strict, do_rr, &mpri);
+
+ if (res->f6i || !cont)
+ return;
+
+ __find_rr_leaf(cont, NULL, metric, res, NULL,
+ oif, strict, do_rr, &mpri);
+}
+
+static void rt6_select(struct net *net, struct fib6_node *fn, int oif,
+ struct fib6_result *res, int strict)
+{
+ struct fib6_info *leaf = rcu_dereference(fn->leaf);
+ struct fib6_info *rt0;
+ bool do_rr = false;
+ int key_plen;
+
+ /* make sure this function or its helpers sets f6i */
+ res->f6i = NULL;
+
+ if (!leaf || leaf == net->ipv6.fib6_null_entry)
+ goto out;
+
+ rt0 = rcu_dereference(fn->rr_ptr);
+ if (!rt0)
+ rt0 = leaf;
+
+ /* Double check to make sure fn is not an intermediate node
+ * and fn->leaf does not points to its child's leaf
+ * (This might happen if all routes under fn are deleted from
+ * the tree and fib6_repair_tree() is called on the node.)
+ */
+ key_plen = rt0->fib6_dst.plen;
+#ifdef CONFIG_IPV6_SUBTREES
+ if (rt0->fib6_src.plen)
+ key_plen = rt0->fib6_src.plen;
+#endif
+ if (fn->fn_bit != key_plen)
+ goto out;
+
+ find_rr_leaf(fn, leaf, rt0, oif, strict, &do_rr, res);
+ if (do_rr) {
+ struct fib6_info *next = rcu_dereference(rt0->fib6_next);
- if (!match &&
- (strict & RT6_LOOKUP_F_REACHABLE) &&
- last && last != rt0) {
/* no entries matched; do round-robin */
- static DEFINE_SPINLOCK(lock);
- spin_lock(&lock);
- *head = rt0->u.next;
- rt0->u.next = last->u.next;
- last->u.next = rt0;
- spin_unlock(&lock);
+ if (!next || next->fib6_metric != rt0->fib6_metric)
+ next = leaf;
+
+ if (next != rt0) {
+ spin_lock_bh(&leaf->fib6_table->tb6_lock);
+ /* make sure next is not being deleted from the tree */
+ if (next->fib6_node)
+ rcu_assign_pointer(fn->rr_ptr, next);
+ spin_unlock_bh(&leaf->fib6_table->tb6_lock);
+ }
}
- RT6_TRACE("%s() => %p, score=%d\n",
- __FUNCTION__, match, mpri);
+out:
+ if (!res->f6i) {
+ res->f6i = net->ipv6.fib6_null_entry;
+ res->nh = res->f6i->fib6_nh;
+ res->fib6_flags = res->f6i->fib6_flags;
+ res->fib6_type = res->f6i->fib6_type;
+ }
+}
- return (match ? match : &ip6_null_entry);
+static bool rt6_is_gw_or_nonexthop(const struct fib6_result *res)
+{
+ return (res->f6i->fib6_flags & RTF_NONEXTHOP) ||
+ res->nh->fib_nh_gw_family;
}
#ifdef CONFIG_IPV6_ROUTE_INFO
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
- struct in6_addr *gwaddr)
+ const struct in6_addr *gwaddr)
{
+ struct net *net = dev_net(dev);
struct route_info *rinfo = (struct route_info *) opt;
struct in6_addr prefix_buf, *prefix;
+ struct fib6_table *table;
unsigned int pref;
- u32 lifetime;
- struct rt6_info *rt;
+ unsigned long lifetime;
+ struct fib6_info *rt;
if (len < sizeof(struct route_info)) {
return -EINVAL;
@@ -438,15 +995,9 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
pref = rinfo->route_pref;
if (pref == ICMPV6_ROUTER_PREF_INVALID)
- pref = ICMPV6_ROUTER_PREF_MEDIUM;
+ return -EINVAL;
- lifetime = ntohl(rinfo->lifetime);
- if (lifetime == 0xffffffff) {
- /* infinity */
- } else if (lifetime > 0x7fffffff/HZ) {
- /* Avoid arithmetic overflow */
- lifetime = 0x7fffffff/HZ - 1;
- }
+ lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
if (rinfo->length == 3)
prefix = (struct in6_addr *)rinfo->prefix;
@@ -458,430 +1009,2251 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
prefix = &prefix_buf;
}
- rt = rt6_get_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex);
+ if (rinfo->prefix_len == 0)
+ rt = rt6_get_dflt_router(net, gwaddr, dev);
+ else
+ rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
+ gwaddr, dev);
if (rt && !lifetime) {
- ip6_del_rt(rt);
+ ip6_del_rt(net, rt, false);
rt = NULL;
}
if (!rt && lifetime)
- rt = rt6_add_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
- pref);
+ rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
+ dev, pref);
else if (rt)
- rt->rt6i_flags = RTF_ROUTEINFO |
- (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
+ rt->fib6_flags = RTF_ROUTEINFO |
+ (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
if (rt) {
- if (lifetime == 0xffffffff) {
- rt->rt6i_flags &= ~RTF_EXPIRES;
+ table = rt->fib6_table;
+ spin_lock_bh(&table->tb6_lock);
+
+ if (!addrconf_finite_timeout(lifetime)) {
+ fib6_clean_expires(rt);
+ fib6_remove_gc_list(rt);
} else {
- rt->rt6i_expires = jiffies + HZ * lifetime;
- rt->rt6i_flags |= RTF_EXPIRES;
+ fib6_set_expires(rt, jiffies + HZ * lifetime);
+ fib6_add_gc_list(rt);
}
- dst_release(&rt->u.dst);
+
+ spin_unlock_bh(&table->tb6_lock);
+
+ fib6_info_release(rt);
}
return 0;
}
#endif
-#define BACKTRACK(saddr) \
-do { \
- if (rt == &ip6_null_entry) { \
- struct fib6_node *pn; \
- while (1) { \
- if (fn->fn_flags & RTN_TL_ROOT) \
- goto out; \
- pn = fn->parent; \
- if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
- fn = fib6_lookup(pn->subtree, NULL, saddr); \
- else \
- fn = pn; \
- if (fn->fn_flags & RTN_RTINFO) \
- goto restart; \
- } \
- } \
-} while(0)
-
-static struct rt6_info *ip6_pol_route_lookup(struct fib6_table *table,
- struct flowi *fl, int flags)
+/*
+ * Misc support functions
+ */
+
+/* called with rcu_lock held */
+static struct net_device *ip6_rt_get_dev_rcu(const struct fib6_result *res)
{
+ struct net_device *dev = res->nh->fib_nh_dev;
+
+ if (res->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
+ /* for copies of local routes, dst->dev needs to be the
+ * device if it is a master device, the master device if
+ * device is enslaved, and the loopback as the default
+ */
+ if (netif_is_l3_slave(dev) &&
+ !rt6_need_strict(&res->f6i->fib6_dst.addr))
+ dev = l3mdev_master_dev_rcu(dev);
+ else if (!netif_is_l3_master(dev))
+ dev = dev_net(dev)->loopback_dev;
+ /* last case is netif_is_l3_master(dev) is true in which
+ * case we want dev returned to be dev
+ */
+ }
+
+ return dev;
+}
+
+static const int fib6_prop[RTN_MAX + 1] = {
+ [RTN_UNSPEC] = 0,
+ [RTN_UNICAST] = 0,
+ [RTN_LOCAL] = 0,
+ [RTN_BROADCAST] = 0,
+ [RTN_ANYCAST] = 0,
+ [RTN_MULTICAST] = 0,
+ [RTN_BLACKHOLE] = -EINVAL,
+ [RTN_UNREACHABLE] = -EHOSTUNREACH,
+ [RTN_PROHIBIT] = -EACCES,
+ [RTN_THROW] = -EAGAIN,
+ [RTN_NAT] = -EINVAL,
+ [RTN_XRESOLVE] = -EINVAL,
+};
+
+static int ip6_rt_type_to_error(u8 fib6_type)
+{
+ return fib6_prop[fib6_type];
+}
+
+static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
+{
+ unsigned short flags = 0;
+
+ if (rt->dst_nocount)
+ flags |= DST_NOCOUNT;
+ if (rt->dst_nopolicy)
+ flags |= DST_NOPOLICY;
+
+ return flags;
+}
+
+static void ip6_rt_init_dst_reject(struct rt6_info *rt, u8 fib6_type)
+{
+ rt->dst.error = ip6_rt_type_to_error(fib6_type);
+
+ switch (fib6_type) {
+ case RTN_BLACKHOLE:
+ rt->dst.output = dst_discard_out;
+ rt->dst.input = dst_discard;
+ break;
+ case RTN_PROHIBIT:
+ rt->dst.output = ip6_pkt_prohibit_out;
+ rt->dst.input = ip6_pkt_prohibit;
+ break;
+ case RTN_THROW:
+ case RTN_UNREACHABLE:
+ default:
+ rt->dst.output = ip6_pkt_discard_out;
+ rt->dst.input = ip6_pkt_discard;
+ break;
+ }
+}
+
+static void ip6_rt_init_dst(struct rt6_info *rt, const struct fib6_result *res)
+{
+ struct fib6_info *f6i = res->f6i;
+
+ if (res->fib6_flags & RTF_REJECT) {
+ ip6_rt_init_dst_reject(rt, res->fib6_type);
+ return;
+ }
+
+ rt->dst.error = 0;
+ rt->dst.output = ip6_output;
+
+ if (res->fib6_type == RTN_LOCAL || res->fib6_type == RTN_ANYCAST) {
+ rt->dst.input = ip6_input;
+ } else if (ipv6_addr_type(&f6i->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
+ rt->dst.input = ip6_mc_input;
+ rt->dst.output = ip6_mr_output;
+ } else {
+ rt->dst.input = ip6_forward;
+ }
+
+ if (res->nh->fib_nh_lws) {
+ rt->dst.lwtstate = lwtstate_get(res->nh->fib_nh_lws);
+ lwtunnel_set_redirect(&rt->dst);
+ }
+
+ rt->dst.lastuse = jiffies;
+}
+
+/* Caller must already hold reference to @from */
+static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
+{
+ rt->rt6i_flags &= ~RTF_EXPIRES;
+ rcu_assign_pointer(rt->from, from);
+ ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
+}
+
+/* Caller must already hold reference to f6i in result */
+static void ip6_rt_copy_init(struct rt6_info *rt, const struct fib6_result *res)
+{
+ const struct fib6_nh *nh = res->nh;
+ const struct net_device *dev = nh->fib_nh_dev;
+ struct fib6_info *f6i = res->f6i;
+
+ ip6_rt_init_dst(rt, res);
+
+ rt->rt6i_dst = f6i->fib6_dst;
+ rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
+ rt->rt6i_flags = res->fib6_flags;
+ if (nh->fib_nh_gw_family) {
+ rt->rt6i_gateway = nh->fib_nh_gw6;
+ rt->rt6i_flags |= RTF_GATEWAY;
+ }
+ rt6_set_from(rt, f6i);
+#ifdef CONFIG_IPV6_SUBTREES
+ rt->rt6i_src = f6i->fib6_src;
+#endif
+}
+
+static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
+ struct in6_addr *saddr)
+{
+ struct fib6_node *pn, *sn;
+ while (1) {
+ if (fn->fn_flags & RTN_TL_ROOT)
+ return NULL;
+ pn = rcu_dereference(fn->parent);
+ sn = FIB6_SUBTREE(pn);
+ if (sn && sn != fn)
+ fn = fib6_node_lookup(sn, NULL, saddr);
+ else
+ fn = pn;
+ if (fn->fn_flags & RTN_RTINFO)
+ return fn;
+ }
+}
+
+static bool ip6_hold_safe(struct net *net, struct rt6_info **prt)
+{
+ struct rt6_info *rt = *prt;
+
+ if (dst_hold_safe(&rt->dst))
+ return true;
+ if (net) {
+ rt = net->ipv6.ip6_null_entry;
+ dst_hold(&rt->dst);
+ } else {
+ rt = NULL;
+ }
+ *prt = rt;
+ return false;
+}
+
+/* called with rcu_lock held */
+static struct rt6_info *ip6_create_rt_rcu(const struct fib6_result *res)
+{
+ struct net_device *dev = res->nh->fib_nh_dev;
+ struct fib6_info *f6i = res->f6i;
+ unsigned short flags;
+ struct rt6_info *nrt;
+
+ if (!fib6_info_hold_safe(f6i))
+ goto fallback;
+
+ flags = fib6_info_dst_flags(f6i);
+ nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
+ if (!nrt) {
+ fib6_info_release(f6i);
+ goto fallback;
+ }
+
+ ip6_rt_copy_init(nrt, res);
+ return nrt;
+
+fallback:
+ nrt = dev_net(dev)->ipv6.ip6_null_entry;
+ dst_hold(&nrt->dst);
+ return nrt;
+}
+
+INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_lookup(struct net *net,
+ struct fib6_table *table,
+ struct flowi6 *fl6,
+ const struct sk_buff *skb,
+ int flags)
+{
+ struct fib6_result res = {};
struct fib6_node *fn;
struct rt6_info *rt;
- read_lock_bh(&table->tb6_lock);
- fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
+ rcu_read_lock();
+ fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
- rt = fn->leaf;
- rt = rt6_device_match(rt, fl->oif, flags);
- BACKTRACK(&fl->fl6_src);
+ res.f6i = rcu_dereference(fn->leaf);
+ if (!res.f6i)
+ res.f6i = net->ipv6.fib6_null_entry;
+ else
+ rt6_device_match(net, &res, &fl6->saddr, fl6->flowi6_oif,
+ flags);
+
+ if (res.f6i == net->ipv6.fib6_null_entry) {
+ fn = fib6_backtrack(fn, &fl6->saddr);
+ if (fn)
+ goto restart;
+
+ rt = net->ipv6.ip6_null_entry;
+ dst_hold(&rt->dst);
+ goto out;
+ } else if (res.fib6_flags & RTF_REJECT) {
+ goto do_create;
+ }
+
+ fib6_select_path(net, &res, fl6, fl6->flowi6_oif,
+ fl6->flowi6_oif != 0, skb, flags);
+
+ /* Search through exception table */
+ rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
+ if (rt) {
+ if (ip6_hold_safe(net, &rt))
+ dst_use_noref(&rt->dst, jiffies);
+ } else {
+do_create:
+ rt = ip6_create_rt_rcu(&res);
+ }
+
out:
- dst_hold(&rt->u.dst);
- read_unlock_bh(&table->tb6_lock);
+ trace_fib6_table_lookup(net, &res, table, fl6);
- rt->u.dst.lastuse = jiffies;
- rt->u.dst.__use++;
+ rcu_read_unlock();
return rt;
+}
+struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
+ const struct sk_buff *skb, int flags)
+{
+ return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
+EXPORT_SYMBOL_GPL(ip6_route_lookup);
-struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr,
- int oif, int strict)
+struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
+ const struct in6_addr *saddr, int oif,
+ const struct sk_buff *skb, int strict)
{
- struct flowi fl = {
- .oif = oif,
- .nl_u = {
- .ip6_u = {
- .daddr = *daddr,
- },
- },
+ struct flowi6 fl6 = {
+ .flowi6_oif = oif,
+ .daddr = *daddr,
};
struct dst_entry *dst;
int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
if (saddr) {
- memcpy(&fl.fl6_src, saddr, sizeof(*saddr));
+ memcpy(&fl6.saddr, saddr, sizeof(*saddr));
flags |= RT6_LOOKUP_F_HAS_SADDR;
}
- dst = fib6_rule_lookup(&fl, flags, ip6_pol_route_lookup);
+ dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
if (dst->error == 0)
- return (struct rt6_info *) dst;
+ return dst_rt6_info(dst);
dst_release(dst);
return NULL;
}
+EXPORT_SYMBOL(rt6_lookup);
/* ip6_ins_rt is called with FREE table->tb6_lock.
- It takes new route entry, the addition fails by any reason the
- route is freed. In any case, if caller does not hold it, it may
- be destroyed.
+ * It takes new route entry, the addition fails by any reason the
+ * route is released.
+ * Caller must hold dst before calling it.
*/
-static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
+static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
+ struct netlink_ext_ack *extack)
{
int err;
struct fib6_table *table;
- table = rt->rt6i_table;
- write_lock_bh(&table->tb6_lock);
- err = fib6_add(&table->tb6_root, rt, info);
- write_unlock_bh(&table->tb6_lock);
+ table = rt->fib6_table;
+ spin_lock_bh(&table->tb6_lock);
+ err = fib6_add(&table->tb6_root, rt, info, extack);
+ spin_unlock_bh(&table->tb6_lock);
return err;
}
-int ip6_ins_rt(struct rt6_info *rt)
+int ip6_ins_rt(struct net *net, struct fib6_info *rt)
{
- return __ip6_ins_rt(rt, NULL);
+ struct nl_info info = { .nl_net = net, };
+
+ return __ip6_ins_rt(rt, &info, NULL);
}
-static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
- struct in6_addr *saddr)
+static struct rt6_info *ip6_rt_cache_alloc(const struct fib6_result *res,
+ const struct in6_addr *daddr,
+ const struct in6_addr *saddr)
{
+ struct fib6_info *f6i = res->f6i;
+ struct net_device *dev;
struct rt6_info *rt;
/*
* Clone the route.
*/
- rt = ip6_rt_copy(ort);
+ if (!fib6_info_hold_safe(f6i))
+ return NULL;
- if (rt) {
- if (!(rt->rt6i_flags&RTF_GATEWAY)) {
- if (rt->rt6i_dst.plen != 128 &&
- ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
- rt->rt6i_flags |= RTF_ANYCAST;
- ipv6_addr_copy(&rt->rt6i_gateway, daddr);
- }
+ dev = ip6_rt_get_dev_rcu(res);
+ rt = ip6_dst_alloc(dev_net(dev), dev, 0);
+ if (!rt) {
+ fib6_info_release(f6i);
+ return NULL;
+ }
- ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
- rt->rt6i_dst.plen = 128;
- rt->rt6i_flags |= RTF_CACHE;
- rt->u.dst.flags |= DST_HOST;
+ ip6_rt_copy_init(rt, res);
+ rt->rt6i_flags |= RTF_CACHE;
+ rt->rt6i_dst.addr = *daddr;
+ rt->rt6i_dst.plen = 128;
+ if (!rt6_is_gw_or_nonexthop(res)) {
+ if (f6i->fib6_dst.plen != 128 &&
+ ipv6_addr_equal(&f6i->fib6_dst.addr, daddr))
+ rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
if (rt->rt6i_src.plen && saddr) {
- ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
+ rt->rt6i_src.addr = *saddr;
rt->rt6i_src.plen = 128;
}
#endif
+ }
- rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
+ return rt;
+}
+static struct rt6_info *ip6_rt_pcpu_alloc(const struct fib6_result *res)
+{
+ struct fib6_info *f6i = res->f6i;
+ unsigned short flags = fib6_info_dst_flags(f6i);
+ struct net_device *dev;
+ struct rt6_info *pcpu_rt;
+
+ if (!fib6_info_hold_safe(f6i))
+ return NULL;
+
+ rcu_read_lock();
+ dev = ip6_rt_get_dev_rcu(res);
+ pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags | DST_NOCOUNT);
+ rcu_read_unlock();
+ if (!pcpu_rt) {
+ fib6_info_release(f6i);
+ return NULL;
}
+ ip6_rt_copy_init(pcpu_rt, res);
+ pcpu_rt->rt6i_flags |= RTF_PCPU;
- return rt;
+ if (f6i->nh)
+ pcpu_rt->sernum = rt_genid_ipv6(dev_net(dev));
+
+ return pcpu_rt;
}
-static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
+static bool rt6_is_valid(const struct rt6_info *rt6)
{
- struct rt6_info *rt = ip6_rt_copy(ort);
- if (rt) {
- ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
- rt->rt6i_dst.plen = 128;
- rt->rt6i_flags |= RTF_CACHE;
- rt->u.dst.flags |= DST_HOST;
- rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
+ return rt6->sernum == rt_genid_ipv6(dev_net(rt6->dst.dev));
+}
+
+/* It should be called with rcu_read_lock() acquired */
+static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res)
+{
+ struct rt6_info *pcpu_rt;
+
+ pcpu_rt = this_cpu_read(*res->nh->rt6i_pcpu);
+
+ if (pcpu_rt && pcpu_rt->sernum && !rt6_is_valid(pcpu_rt)) {
+ struct rt6_info *prev, **p;
+
+ p = this_cpu_ptr(res->nh->rt6i_pcpu);
+ /* Paired with READ_ONCE() in __fib6_drop_pcpu_from() */
+ prev = xchg(p, NULL);
+ if (prev) {
+ dst_dev_put(&prev->dst);
+ dst_release(&prev->dst);
+ }
+
+ pcpu_rt = NULL;
}
- return rt;
+
+ return pcpu_rt;
}
-static struct rt6_info *ip6_pol_route_input(struct fib6_table *table,
- struct flowi *fl, int flags)
+static struct rt6_info *rt6_make_pcpu_route(struct net *net,
+ const struct fib6_result *res)
{
- struct fib6_node *fn;
- struct rt6_info *rt, *nrt;
- int strict = 0;
- int attempts = 3;
- int err;
- int reachable = ipv6_devconf.forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
+ struct rt6_info *pcpu_rt, *prev, **p;
- strict |= flags & RT6_LOOKUP_F_IFACE;
+ pcpu_rt = ip6_rt_pcpu_alloc(res);
+ if (!pcpu_rt)
+ return NULL;
-relookup:
- read_lock_bh(&table->tb6_lock);
+ p = this_cpu_ptr(res->nh->rt6i_pcpu);
+ prev = cmpxchg(p, NULL, pcpu_rt);
+ BUG_ON(prev);
-restart_2:
- fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
+ if (res->f6i->fib6_destroying) {
+ struct fib6_info *from;
-restart:
- rt = rt6_select(&fn->leaf, fl->iif, strict | reachable);
- BACKTRACK(&fl->fl6_src);
- if (rt == &ip6_null_entry ||
- rt->rt6i_flags & RTF_CACHE)
- goto out;
+ from = unrcu_pointer(xchg(&pcpu_rt->from, NULL));
+ fib6_info_release(from);
+ }
- dst_hold(&rt->u.dst);
- read_unlock_bh(&table->tb6_lock);
+ return pcpu_rt;
+}
- if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
- nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
- else {
-#if CLONE_OFFLINK_ROUTE
- nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
-#else
- goto out2;
+/* exception hash table implementation
+ */
+static DEFINE_SPINLOCK(rt6_exception_lock);
+
+/* Remove rt6_ex from hash table and free the memory
+ * Caller must hold rt6_exception_lock
+ */
+static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
+ struct rt6_exception *rt6_ex)
+{
+ struct net *net;
+
+ if (!bucket || !rt6_ex)
+ return;
+
+ net = dev_net(rt6_ex->rt6i->dst.dev);
+ net->ipv6.rt6_stats->fib_rt_cache--;
+
+ /* purge completely the exception to allow releasing the held resources:
+ * some [sk] cache may keep the dst around for unlimited time
+ */
+ dst_dev_put(&rt6_ex->rt6i->dst);
+
+ hlist_del_rcu(&rt6_ex->hlist);
+ dst_release(&rt6_ex->rt6i->dst);
+ kfree_rcu(rt6_ex, rcu);
+ WARN_ON_ONCE(!bucket->depth);
+ bucket->depth--;
+}
+
+/* Remove oldest rt6_ex in bucket and free the memory
+ * Caller must hold rt6_exception_lock
+ */
+static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
+{
+ struct rt6_exception *rt6_ex, *oldest = NULL;
+
+ if (!bucket)
+ return;
+
+ hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
+ if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
+ oldest = rt6_ex;
+ }
+ rt6_remove_exception(bucket, oldest);
+}
+
+static u32 rt6_exception_hash(const struct in6_addr *dst,
+ const struct in6_addr *src)
+{
+ static siphash_aligned_key_t rt6_exception_key;
+ struct {
+ struct in6_addr dst;
+ struct in6_addr src;
+ } __aligned(SIPHASH_ALIGNMENT) combined = {
+ .dst = *dst,
+ };
+ u64 val;
+
+ net_get_random_once(&rt6_exception_key, sizeof(rt6_exception_key));
+
+#ifdef CONFIG_IPV6_SUBTREES
+ if (src)
+ combined.src = *src;
+#endif
+ val = siphash(&combined, sizeof(combined), &rt6_exception_key);
+
+ return hash_64(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
+}
+
+/* Helper function to find the cached rt in the hash table
+ * and update bucket pointer to point to the bucket for this
+ * (daddr, saddr) pair
+ * Caller must hold rt6_exception_lock
+ */
+static struct rt6_exception *
+__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
+ const struct in6_addr *daddr,
+ const struct in6_addr *saddr)
+{
+ struct rt6_exception *rt6_ex;
+ u32 hval;
+
+ if (!(*bucket) || !daddr)
+ return NULL;
+
+ hval = rt6_exception_hash(daddr, saddr);
+ *bucket += hval;
+
+ hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
+ struct rt6_info *rt6 = rt6_ex->rt6i;
+ bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
+
+#ifdef CONFIG_IPV6_SUBTREES
+ if (matched && saddr)
+ matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
+#endif
+ if (matched)
+ return rt6_ex;
+ }
+ return NULL;
+}
+
+/* Helper function to find the cached rt in the hash table
+ * and update bucket pointer to point to the bucket for this
+ * (daddr, saddr) pair
+ * Caller must hold rcu_read_lock()
+ */
+static struct rt6_exception *
+__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
+ const struct in6_addr *daddr,
+ const struct in6_addr *saddr)
+{
+ struct rt6_exception *rt6_ex;
+ u32 hval;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ if (!(*bucket) || !daddr)
+ return NULL;
+
+ hval = rt6_exception_hash(daddr, saddr);
+ *bucket += hval;
+
+ hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
+ struct rt6_info *rt6 = rt6_ex->rt6i;
+ bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
+
+#ifdef CONFIG_IPV6_SUBTREES
+ if (matched && saddr)
+ matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
+ if (matched)
+ return rt6_ex;
}
+ return NULL;
+}
- dst_release(&rt->u.dst);
- rt = nrt ? : &ip6_null_entry;
+static unsigned int fib6_mtu(const struct fib6_result *res)
+{
+ const struct fib6_nh *nh = res->nh;
+ unsigned int mtu;
+
+ if (res->f6i->fib6_pmtu) {
+ mtu = res->f6i->fib6_pmtu;
+ } else {
+ struct net_device *dev = nh->fib_nh_dev;
+ struct inet6_dev *idev;
- dst_hold(&rt->u.dst);
- if (nrt) {
- err = ip6_ins_rt(nrt);
- if (!err)
- goto out2;
+ rcu_read_lock();
+ idev = __in6_dev_get(dev);
+ mtu = READ_ONCE(idev->cnf.mtu6);
+ rcu_read_unlock();
}
- if (--attempts <= 0)
- goto out2;
+ mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
- /*
- * Race condition! In the gap, when table->tb6_lock was
- * released someone could insert this route. Relookup.
+ return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
+}
+
+#define FIB6_EXCEPTION_BUCKET_FLUSHED 0x1UL
+
+/* used when the flushed bit is not relevant, only access to the bucket
+ * (ie., all bucket users except rt6_insert_exception);
+ *
+ * called under rcu lock; sometimes called with rt6_exception_lock held
+ */
+static
+struct rt6_exception_bucket *fib6_nh_get_excptn_bucket(const struct fib6_nh *nh,
+ spinlock_t *lock)
+{
+ struct rt6_exception_bucket *bucket;
+
+ if (lock)
+ bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
+ lockdep_is_held(lock));
+ else
+ bucket = rcu_dereference(nh->rt6i_exception_bucket);
+
+ /* remove bucket flushed bit if set */
+ if (bucket) {
+ unsigned long p = (unsigned long)bucket;
+
+ p &= ~FIB6_EXCEPTION_BUCKET_FLUSHED;
+ bucket = (struct rt6_exception_bucket *)p;
+ }
+
+ return bucket;
+}
+
+static bool fib6_nh_excptn_bucket_flushed(struct rt6_exception_bucket *bucket)
+{
+ unsigned long p = (unsigned long)bucket;
+
+ return !!(p & FIB6_EXCEPTION_BUCKET_FLUSHED);
+}
+
+/* called with rt6_exception_lock held */
+static void fib6_nh_excptn_bucket_set_flushed(struct fib6_nh *nh,
+ spinlock_t *lock)
+{
+ struct rt6_exception_bucket *bucket;
+ unsigned long p;
+
+ bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
+ lockdep_is_held(lock));
+
+ p = (unsigned long)bucket;
+ p |= FIB6_EXCEPTION_BUCKET_FLUSHED;
+ bucket = (struct rt6_exception_bucket *)p;
+ rcu_assign_pointer(nh->rt6i_exception_bucket, bucket);
+}
+
+static int rt6_insert_exception(struct rt6_info *nrt,
+ const struct fib6_result *res)
+{
+ struct net *net = dev_net(nrt->dst.dev);
+ struct rt6_exception_bucket *bucket;
+ struct fib6_info *f6i = res->f6i;
+ struct in6_addr *src_key = NULL;
+ struct rt6_exception *rt6_ex;
+ struct fib6_nh *nh = res->nh;
+ int max_depth;
+ int err = 0;
+
+ spin_lock_bh(&rt6_exception_lock);
+
+ bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
+ lockdep_is_held(&rt6_exception_lock));
+ if (!bucket) {
+ bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
+ GFP_ATOMIC);
+ if (!bucket) {
+ err = -ENOMEM;
+ goto out;
+ }
+ rcu_assign_pointer(nh->rt6i_exception_bucket, bucket);
+ } else if (fib6_nh_excptn_bucket_flushed(bucket)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+#ifdef CONFIG_IPV6_SUBTREES
+ /* fib6_src.plen != 0 indicates f6i is in subtree
+ * and exception table is indexed by a hash of
+ * both fib6_dst and fib6_src.
+ * Otherwise, the exception table is indexed by
+ * a hash of only fib6_dst.
*/
- dst_release(&rt->u.dst);
- goto relookup;
+ if (f6i->fib6_src.plen)
+ src_key = &nrt->rt6i_src.addr;
+#endif
+ /* rt6_mtu_change() might lower mtu on f6i.
+ * Only insert this exception route if its mtu
+ * is less than f6i's mtu value.
+ */
+ if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(res)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
+ src_key);
+ if (rt6_ex)
+ rt6_remove_exception(bucket, rt6_ex);
+
+ rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
+ if (!rt6_ex) {
+ err = -ENOMEM;
+ goto out;
+ }
+ rt6_ex->rt6i = nrt;
+ rt6_ex->stamp = jiffies;
+ hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
+ bucket->depth++;
+ net->ipv6.rt6_stats->fib_rt_cache++;
+
+ /* Randomize max depth to avoid some side channels attacks. */
+ max_depth = FIB6_MAX_DEPTH + get_random_u32_below(FIB6_MAX_DEPTH);
+ while (bucket->depth > max_depth)
+ rt6_exception_remove_oldest(bucket);
out:
- if (reachable) {
- reachable = 0;
- goto restart_2;
+ spin_unlock_bh(&rt6_exception_lock);
+
+ /* Update fn->fn_sernum to invalidate all cached dst */
+ if (!err) {
+ spin_lock_bh(&f6i->fib6_table->tb6_lock);
+ fib6_update_sernum(net, f6i);
+ fib6_add_gc_list(f6i);
+ spin_unlock_bh(&f6i->fib6_table->tb6_lock);
+ fib6_force_start_gc(net);
}
- dst_hold(&rt->u.dst);
- read_unlock_bh(&table->tb6_lock);
-out2:
- rt->u.dst.lastuse = jiffies;
- rt->u.dst.__use++;
- return rt;
+ return err;
}
-void ip6_route_input(struct sk_buff *skb)
+static void fib6_nh_flush_exceptions(struct fib6_nh *nh, struct fib6_info *from)
{
- struct ipv6hdr *iph = skb->nh.ipv6h;
- int flags = RT6_LOOKUP_F_HAS_SADDR;
- struct flowi fl = {
- .iif = skb->dev->ifindex,
- .nl_u = {
- .ip6_u = {
- .daddr = iph->daddr,
- .saddr = iph->saddr,
- .flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
- },
- },
- .mark = skb->mark,
- .proto = iph->nexthdr,
- };
+ struct rt6_exception_bucket *bucket;
+ struct rt6_exception *rt6_ex;
+ struct hlist_node *tmp;
+ int i;
- if (rt6_need_strict(&iph->daddr))
- flags |= RT6_LOOKUP_F_IFACE;
+ spin_lock_bh(&rt6_exception_lock);
- skb->dst = fib6_rule_lookup(&fl, flags, ip6_pol_route_input);
+ bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
+ if (!bucket)
+ goto out;
+
+ /* Prevent rt6_insert_exception() to recreate the bucket list */
+ if (!from)
+ fib6_nh_excptn_bucket_set_flushed(nh, &rt6_exception_lock);
+
+ for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
+ hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist) {
+ if (!from ||
+ rcu_access_pointer(rt6_ex->rt6i->from) == from)
+ rt6_remove_exception(bucket, rt6_ex);
+ }
+ WARN_ON_ONCE(!from && bucket->depth);
+ bucket++;
+ }
+out:
+ spin_unlock_bh(&rt6_exception_lock);
}
-static struct rt6_info *ip6_pol_route_output(struct fib6_table *table,
- struct flowi *fl, int flags)
+static int rt6_nh_flush_exceptions(struct fib6_nh *nh, void *arg)
{
- struct fib6_node *fn;
- struct rt6_info *rt, *nrt;
- int strict = 0;
- int attempts = 3;
+ struct fib6_info *f6i = arg;
+
+ fib6_nh_flush_exceptions(nh, f6i);
+
+ return 0;
+}
+
+void rt6_flush_exceptions(struct fib6_info *f6i)
+{
+ if (f6i->nh) {
+ rcu_read_lock();
+ nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_flush_exceptions, f6i);
+ rcu_read_unlock();
+ } else {
+ fib6_nh_flush_exceptions(f6i->fib6_nh, f6i);
+ }
+}
+
+/* Find cached rt in the hash table inside passed in rt
+ * Caller has to hold rcu_read_lock()
+ */
+static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
+ const struct in6_addr *daddr,
+ const struct in6_addr *saddr)
+{
+ const struct in6_addr *src_key = NULL;
+ struct rt6_exception_bucket *bucket;
+ struct rt6_exception *rt6_ex;
+ struct rt6_info *ret = NULL;
+
+#ifdef CONFIG_IPV6_SUBTREES
+ /* fib6i_src.plen != 0 indicates f6i is in subtree
+ * and exception table is indexed by a hash of
+ * both fib6_dst and fib6_src.
+ * However, the src addr used to create the hash
+ * might not be exactly the passed in saddr which
+ * is a /128 addr from the flow.
+ * So we need to use f6i->fib6_src to redo lookup
+ * if the passed in saddr does not find anything.
+ * (See the logic in ip6_rt_cache_alloc() on how
+ * rt->rt6i_src is updated.)
+ */
+ if (res->f6i->fib6_src.plen)
+ src_key = saddr;
+find_ex:
+#endif
+ bucket = fib6_nh_get_excptn_bucket(res->nh, NULL);
+ rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
+
+ if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
+ ret = rt6_ex->rt6i;
+
+#ifdef CONFIG_IPV6_SUBTREES
+ /* Use fib6_src as src_key and redo lookup */
+ if (!ret && src_key && src_key != &res->f6i->fib6_src.addr) {
+ src_key = &res->f6i->fib6_src.addr;
+ goto find_ex;
+ }
+#endif
+
+ return ret;
+}
+
+/* Remove the passed in cached rt from the hash table that contains it */
+static int fib6_nh_remove_exception(const struct fib6_nh *nh, int plen,
+ const struct rt6_info *rt)
+{
+ const struct in6_addr *src_key = NULL;
+ struct rt6_exception_bucket *bucket;
+ struct rt6_exception *rt6_ex;
int err;
- int reachable = ipv6_devconf.forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
- strict |= flags & RT6_LOOKUP_F_IFACE;
+ if (!rcu_access_pointer(nh->rt6i_exception_bucket))
+ return -ENOENT;
-relookup:
- read_lock_bh(&table->tb6_lock);
+ spin_lock_bh(&rt6_exception_lock);
+ bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
+
+#ifdef CONFIG_IPV6_SUBTREES
+ /* rt6i_src.plen != 0 indicates 'from' is in subtree
+ * and exception table is indexed by a hash of
+ * both rt6i_dst and rt6i_src.
+ * Otherwise, the exception table is indexed by
+ * a hash of only rt6i_dst.
+ */
+ if (plen)
+ src_key = &rt->rt6i_src.addr;
+#endif
+ rt6_ex = __rt6_find_exception_spinlock(&bucket,
+ &rt->rt6i_dst.addr,
+ src_key);
+ if (rt6_ex) {
+ rt6_remove_exception(bucket, rt6_ex);
+ err = 0;
+ } else {
+ err = -ENOENT;
+ }
-restart_2:
- fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
+ spin_unlock_bh(&rt6_exception_lock);
+ return err;
+}
-restart:
- rt = rt6_select(&fn->leaf, fl->oif, strict | reachable);
- BACKTRACK(&fl->fl6_src);
- if (rt == &ip6_null_entry ||
- rt->rt6i_flags & RTF_CACHE)
- goto out;
+struct fib6_nh_excptn_arg {
+ struct rt6_info *rt;
+ int plen;
+};
- dst_hold(&rt->u.dst);
- read_unlock_bh(&table->tb6_lock);
+static int rt6_nh_remove_exception_rt(struct fib6_nh *nh, void *_arg)
+{
+ struct fib6_nh_excptn_arg *arg = _arg;
+ int err;
- if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
- nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
- else {
-#if CLONE_OFFLINK_ROUTE
- nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
-#else
- goto out2;
+ err = fib6_nh_remove_exception(nh, arg->plen, arg->rt);
+ if (err == 0)
+ return 1;
+
+ return 0;
+}
+
+static int rt6_remove_exception_rt(struct rt6_info *rt)
+{
+ struct fib6_info *from;
+
+ from = rcu_dereference(rt->from);
+ if (!from || !(rt->rt6i_flags & RTF_CACHE))
+ return -EINVAL;
+
+ if (from->nh) {
+ struct fib6_nh_excptn_arg arg = {
+ .rt = rt,
+ .plen = from->fib6_src.plen
+ };
+ int rc;
+
+ /* rc = 1 means an entry was found */
+ rc = nexthop_for_each_fib6_nh(from->nh,
+ rt6_nh_remove_exception_rt,
+ &arg);
+ return rc ? 0 : -ENOENT;
+ }
+
+ return fib6_nh_remove_exception(from->fib6_nh,
+ from->fib6_src.plen, rt);
+}
+
+/* Find rt6_ex which contains the passed in rt cache and
+ * refresh its stamp
+ */
+static void fib6_nh_update_exception(const struct fib6_nh *nh, int plen,
+ const struct rt6_info *rt)
+{
+ const struct in6_addr *src_key = NULL;
+ struct rt6_exception_bucket *bucket;
+ struct rt6_exception *rt6_ex;
+
+ bucket = fib6_nh_get_excptn_bucket(nh, NULL);
+#ifdef CONFIG_IPV6_SUBTREES
+ /* rt6i_src.plen != 0 indicates 'from' is in subtree
+ * and exception table is indexed by a hash of
+ * both rt6i_dst and rt6i_src.
+ * Otherwise, the exception table is indexed by
+ * a hash of only rt6i_dst.
+ */
+ if (plen)
+ src_key = &rt->rt6i_src.addr;
#endif
+ rt6_ex = __rt6_find_exception_rcu(&bucket, &rt->rt6i_dst.addr, src_key);
+ if (rt6_ex)
+ rt6_ex->stamp = jiffies;
+}
+
+struct fib6_nh_match_arg {
+ const struct net_device *dev;
+ const struct in6_addr *gw;
+ struct fib6_nh *match;
+};
+
+/* determine if fib6_nh has given device and gateway */
+static int fib6_nh_find_match(struct fib6_nh *nh, void *_arg)
+{
+ struct fib6_nh_match_arg *arg = _arg;
+
+ if (arg->dev != nh->fib_nh_dev ||
+ (arg->gw && !nh->fib_nh_gw_family) ||
+ (!arg->gw && nh->fib_nh_gw_family) ||
+ (arg->gw && !ipv6_addr_equal(arg->gw, &nh->fib_nh_gw6)))
+ return 0;
+
+ arg->match = nh;
+
+ /* found a match, break the loop */
+ return 1;
+}
+
+static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
+{
+ struct fib6_info *from;
+ struct fib6_nh *fib6_nh;
+
+ rcu_read_lock();
+
+ from = rcu_dereference(rt->from);
+ if (!from || !(rt->rt6i_flags & RTF_CACHE))
+ goto unlock;
+
+ if (from->nh) {
+ struct fib6_nh_match_arg arg = {
+ .dev = rt->dst.dev,
+ .gw = &rt->rt6i_gateway,
+ };
+
+ nexthop_for_each_fib6_nh(from->nh, fib6_nh_find_match, &arg);
+
+ if (!arg.match)
+ goto unlock;
+ fib6_nh = arg.match;
+ } else {
+ fib6_nh = from->fib6_nh;
}
+ fib6_nh_update_exception(fib6_nh, from->fib6_src.plen, rt);
+unlock:
+ rcu_read_unlock();
+}
+
+static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
+ struct rt6_info *rt, int mtu)
+{
+ /* If the new MTU is lower than the route PMTU, this new MTU will be the
+ * lowest MTU in the path: always allow updating the route PMTU to
+ * reflect PMTU decreases.
+ *
+ * If the new MTU is higher, and the route PMTU is equal to the local
+ * MTU, this means the old MTU is the lowest in the path, so allow
+ * updating it: if other nodes now have lower MTUs, PMTU discovery will
+ * handle this.
+ */
+
+ if (dst_mtu(&rt->dst) >= mtu)
+ return true;
+
+ if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
+ return true;
+
+ return false;
+}
+
+static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
+ const struct fib6_nh *nh, int mtu)
+{
+ struct rt6_exception_bucket *bucket;
+ struct rt6_exception *rt6_ex;
+ int i;
+
+ bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
+ if (!bucket)
+ return;
- dst_release(&rt->u.dst);
- rt = nrt ? : &ip6_null_entry;
+ for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
+ hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
+ struct rt6_info *entry = rt6_ex->rt6i;
- dst_hold(&rt->u.dst);
- if (nrt) {
- err = ip6_ins_rt(nrt);
- if (!err)
- goto out2;
+ /* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
+ * route), the metrics of its rt->from have already
+ * been updated.
+ */
+ if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
+ rt6_mtu_change_route_allowed(idev, entry, mtu))
+ dst_metric_set(&entry->dst, RTAX_MTU, mtu);
+ }
+ bucket++;
}
+}
- if (--attempts <= 0)
- goto out2;
+#define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE)
- /*
- * Race condition! In the gap, when table->tb6_lock was
- * released someone could insert this route. Relookup.
+static void fib6_nh_exceptions_clean_tohost(const struct fib6_nh *nh,
+ const struct in6_addr *gateway)
+{
+ struct rt6_exception_bucket *bucket;
+ struct rt6_exception *rt6_ex;
+ struct hlist_node *tmp;
+ int i;
+
+ if (!rcu_access_pointer(nh->rt6i_exception_bucket))
+ return;
+
+ spin_lock_bh(&rt6_exception_lock);
+ bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
+ if (bucket) {
+ for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
+ hlist_for_each_entry_safe(rt6_ex, tmp,
+ &bucket->chain, hlist) {
+ struct rt6_info *entry = rt6_ex->rt6i;
+
+ if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
+ RTF_CACHE_GATEWAY &&
+ ipv6_addr_equal(gateway,
+ &entry->rt6i_gateway)) {
+ rt6_remove_exception(bucket, rt6_ex);
+ }
+ }
+ bucket++;
+ }
+ }
+
+ spin_unlock_bh(&rt6_exception_lock);
+}
+
+static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
+ struct rt6_exception *rt6_ex,
+ struct fib6_gc_args *gc_args,
+ unsigned long now)
+{
+ struct rt6_info *rt = rt6_ex->rt6i;
+
+ /* we are pruning and obsoleting aged-out and non gateway exceptions
+ * even if others have still references to them, so that on next
+ * dst_check() such references can be dropped.
+ * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
+ * expired, independently from their aging, as per RFC 8201 section 4
*/
- dst_release(&rt->u.dst);
- goto relookup;
+ if (!(rt->rt6i_flags & RTF_EXPIRES)) {
+ if (time_after_eq(now, READ_ONCE(rt->dst.lastuse) +
+ gc_args->timeout)) {
+ pr_debug("aging clone %p\n", rt);
+ rt6_remove_exception(bucket, rt6_ex);
+ return;
+ }
+ } else if (time_after(jiffies, READ_ONCE(rt->dst.expires))) {
+ pr_debug("purging expired route %p\n", rt);
+ rt6_remove_exception(bucket, rt6_ex);
+ return;
+ }
+
+ if (rt->rt6i_flags & RTF_GATEWAY) {
+ struct neighbour *neigh;
+
+ neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
+
+ if (!(neigh && (neigh->flags & NTF_ROUTER))) {
+ pr_debug("purging route %p via non-router but gateway\n",
+ rt);
+ rt6_remove_exception(bucket, rt6_ex);
+ return;
+ }
+ }
+
+ gc_args->more++;
+}
+
+static void fib6_nh_age_exceptions(const struct fib6_nh *nh,
+ struct fib6_gc_args *gc_args,
+ unsigned long now)
+{
+ struct rt6_exception_bucket *bucket;
+ struct rt6_exception *rt6_ex;
+ struct hlist_node *tmp;
+ int i;
+
+ if (!rcu_access_pointer(nh->rt6i_exception_bucket))
+ return;
+
+ rcu_read_lock_bh();
+ spin_lock(&rt6_exception_lock);
+ bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
+ if (bucket) {
+ for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
+ hlist_for_each_entry_safe(rt6_ex, tmp,
+ &bucket->chain, hlist) {
+ rt6_age_examine_exception(bucket, rt6_ex,
+ gc_args, now);
+ }
+ bucket++;
+ }
+ }
+ spin_unlock(&rt6_exception_lock);
+ rcu_read_unlock_bh();
+}
+
+struct fib6_nh_age_excptn_arg {
+ struct fib6_gc_args *gc_args;
+ unsigned long now;
+};
+
+static int rt6_nh_age_exceptions(struct fib6_nh *nh, void *_arg)
+{
+ struct fib6_nh_age_excptn_arg *arg = _arg;
+
+ fib6_nh_age_exceptions(nh, arg->gc_args, arg->now);
+ return 0;
+}
+
+void rt6_age_exceptions(struct fib6_info *f6i,
+ struct fib6_gc_args *gc_args,
+ unsigned long now)
+{
+ if (f6i->nh) {
+ struct fib6_nh_age_excptn_arg arg = {
+ .gc_args = gc_args,
+ .now = now
+ };
+
+ nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_age_exceptions,
+ &arg);
+ } else {
+ fib6_nh_age_exceptions(f6i->fib6_nh, gc_args, now);
+ }
+}
+/* must be called with rcu lock held */
+int fib6_table_lookup(struct net *net, struct fib6_table *table, int oif,
+ struct flowi6 *fl6, struct fib6_result *res, int strict)
+{
+ struct fib6_node *fn, *saved_fn;
+
+ fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
+ saved_fn = fn;
+
+redo_rt6_select:
+ rt6_select(net, fn, oif, res, strict);
+ if (res->f6i == net->ipv6.fib6_null_entry) {
+ fn = fib6_backtrack(fn, &fl6->saddr);
+ if (fn)
+ goto redo_rt6_select;
+ else if (strict & RT6_LOOKUP_F_REACHABLE) {
+ /* also consider unreachable route */
+ strict &= ~RT6_LOOKUP_F_REACHABLE;
+ fn = saved_fn;
+ goto redo_rt6_select;
+ }
+ }
+
+ trace_fib6_table_lookup(net, res, table, fl6);
+
+ return 0;
+}
+
+struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
+ int oif, struct flowi6 *fl6,
+ const struct sk_buff *skb, int flags)
+{
+ struct fib6_result res = {};
+ struct rt6_info *rt = NULL;
+ int strict = 0;
+
+ WARN_ON_ONCE((flags & RT6_LOOKUP_F_DST_NOREF) &&
+ !rcu_read_lock_held());
+
+ strict |= flags & RT6_LOOKUP_F_IFACE;
+ strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
+ if (READ_ONCE(net->ipv6.devconf_all->forwarding) == 0)
+ strict |= RT6_LOOKUP_F_REACHABLE;
+
+ rcu_read_lock();
+
+ fib6_table_lookup(net, table, oif, fl6, &res, strict);
+ if (res.f6i == net->ipv6.fib6_null_entry)
+ goto out;
+
+ fib6_select_path(net, &res, fl6, oif, false, skb, strict);
+
+ /*Search through exception table */
+ rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
+ if (rt) {
+ goto out;
+ } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
+ !res.nh->fib_nh_gw_family)) {
+ /* Create a RTF_CACHE clone which will not be
+ * owned by the fib6 tree. It is for the special case where
+ * the daddr in the skb during the neighbor look-up is different
+ * from the fl6->daddr used to look-up route here.
+ */
+ rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL);
+
+ if (rt) {
+ /* 1 refcnt is taken during ip6_rt_cache_alloc().
+ * As rt6_uncached_list_add() does not consume refcnt,
+ * this refcnt is always returned to the caller even
+ * if caller sets RT6_LOOKUP_F_DST_NOREF flag.
+ */
+ rt6_uncached_list_add(rt);
+ rcu_read_unlock();
+
+ return rt;
+ }
+ } else {
+ /* Get a percpu copy */
+ local_bh_disable();
+ rt = rt6_get_pcpu_route(&res);
+
+ if (!rt)
+ rt = rt6_make_pcpu_route(net, &res);
+
+ local_bh_enable();
+ }
out:
- if (reachable) {
- reachable = 0;
- goto restart_2;
- }
- dst_hold(&rt->u.dst);
- read_unlock_bh(&table->tb6_lock);
-out2:
- rt->u.dst.lastuse = jiffies;
- rt->u.dst.__use++;
+ if (!rt)
+ rt = net->ipv6.ip6_null_entry;
+ if (!(flags & RT6_LOOKUP_F_DST_NOREF))
+ ip6_hold_safe(net, &rt);
+ rcu_read_unlock();
+
return rt;
}
+EXPORT_SYMBOL_GPL(ip6_pol_route);
-struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl)
+INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_input(struct net *net,
+ struct fib6_table *table,
+ struct flowi6 *fl6,
+ const struct sk_buff *skb,
+ int flags)
{
- int flags = 0;
+ return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
+}
- if (rt6_need_strict(&fl->fl6_dst))
+struct dst_entry *ip6_route_input_lookup(struct net *net,
+ struct net_device *dev,
+ struct flowi6 *fl6,
+ const struct sk_buff *skb,
+ int flags)
+{
+ if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
flags |= RT6_LOOKUP_F_IFACE;
- if (!ipv6_addr_any(&fl->fl6_src))
+ return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
+}
+EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
+
+static void ip6_multipath_l3_keys(const struct sk_buff *skb,
+ struct flow_keys *keys,
+ struct flow_keys *flkeys)
+{
+ const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
+ const struct ipv6hdr *key_iph = outer_iph;
+ struct flow_keys *_flkeys = flkeys;
+ const struct ipv6hdr *inner_iph;
+ const struct icmp6hdr *icmph;
+ struct ipv6hdr _inner_iph;
+ struct icmp6hdr _icmph;
+
+ if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
+ goto out;
+
+ icmph = skb_header_pointer(skb, skb_transport_offset(skb),
+ sizeof(_icmph), &_icmph);
+ if (!icmph)
+ goto out;
+
+ if (!icmpv6_is_err(icmph->icmp6_type))
+ goto out;
+
+ inner_iph = skb_header_pointer(skb,
+ skb_transport_offset(skb) + sizeof(*icmph),
+ sizeof(_inner_iph), &_inner_iph);
+ if (!inner_iph)
+ goto out;
+
+ key_iph = inner_iph;
+ _flkeys = NULL;
+out:
+ if (_flkeys) {
+ keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
+ keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
+ keys->tags.flow_label = _flkeys->tags.flow_label;
+ keys->basic.ip_proto = _flkeys->basic.ip_proto;
+ } else {
+ keys->addrs.v6addrs.src = key_iph->saddr;
+ keys->addrs.v6addrs.dst = key_iph->daddr;
+ keys->tags.flow_label = ip6_flowlabel(key_iph);
+ keys->basic.ip_proto = key_iph->nexthdr;
+ }
+}
+
+static u32 rt6_multipath_custom_hash_outer(const struct net *net,
+ const struct sk_buff *skb,
+ bool *p_has_inner)
+{
+ u32 hash_fields = ip6_multipath_hash_fields(net);
+ struct flow_keys keys, hash_keys;
+
+ if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK))
+ return 0;
+
+ memset(&hash_keys, 0, sizeof(hash_keys));
+ skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_ENCAP);
+
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP)
+ hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP)
+ hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO)
+ hash_keys.basic.ip_proto = keys.basic.ip_proto;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_FLOWLABEL)
+ hash_keys.tags.flow_label = keys.tags.flow_label;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT)
+ hash_keys.ports.src = keys.ports.src;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT)
+ hash_keys.ports.dst = keys.ports.dst;
+
+ *p_has_inner = !!(keys.control.flags & FLOW_DIS_ENCAPSULATION);
+ return fib_multipath_hash_from_keys(net, &hash_keys);
+}
+
+static u32 rt6_multipath_custom_hash_inner(const struct net *net,
+ const struct sk_buff *skb,
+ bool has_inner)
+{
+ u32 hash_fields = ip6_multipath_hash_fields(net);
+ struct flow_keys keys, hash_keys;
+
+ /* We assume the packet carries an encapsulation, but if none was
+ * encountered during dissection of the outer flow, then there is no
+ * point in calling the flow dissector again.
+ */
+ if (!has_inner)
+ return 0;
+
+ if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_MASK))
+ return 0;
+
+ memset(&hash_keys, 0, sizeof(hash_keys));
+ skb_flow_dissect_flow_keys(skb, &keys, 0);
+
+ if (!(keys.control.flags & FLOW_DIS_ENCAPSULATION))
+ return 0;
+
+ if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP)
+ hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP)
+ hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
+ } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP)
+ hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP)
+ hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL)
+ hash_keys.tags.flow_label = keys.tags.flow_label;
+ }
+
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO)
+ hash_keys.basic.ip_proto = keys.basic.ip_proto;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT)
+ hash_keys.ports.src = keys.ports.src;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT)
+ hash_keys.ports.dst = keys.ports.dst;
+
+ return fib_multipath_hash_from_keys(net, &hash_keys);
+}
+
+static u32 rt6_multipath_custom_hash_skb(const struct net *net,
+ const struct sk_buff *skb)
+{
+ u32 mhash, mhash_inner;
+ bool has_inner = true;
+
+ mhash = rt6_multipath_custom_hash_outer(net, skb, &has_inner);
+ mhash_inner = rt6_multipath_custom_hash_inner(net, skb, has_inner);
+
+ return jhash_2words(mhash, mhash_inner, 0);
+}
+
+static u32 rt6_multipath_custom_hash_fl6(const struct net *net,
+ const struct flowi6 *fl6)
+{
+ u32 hash_fields = ip6_multipath_hash_fields(net);
+ struct flow_keys hash_keys;
+
+ if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK))
+ return 0;
+
+ memset(&hash_keys, 0, sizeof(hash_keys));
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP)
+ hash_keys.addrs.v6addrs.src = fl6->saddr;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP)
+ hash_keys.addrs.v6addrs.dst = fl6->daddr;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO)
+ hash_keys.basic.ip_proto = fl6->flowi6_proto;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_FLOWLABEL)
+ hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) {
+ if (fl6->flowi6_flags & FLOWI_FLAG_ANY_SPORT)
+ hash_keys.ports.src = (__force __be16)get_random_u16();
+ else
+ hash_keys.ports.src = fl6->fl6_sport;
+ }
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT)
+ hash_keys.ports.dst = fl6->fl6_dport;
+
+ return fib_multipath_hash_from_keys(net, &hash_keys);
+}
+
+/* if skb is set it will be used and fl6 can be NULL */
+u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
+ const struct sk_buff *skb, struct flow_keys *flkeys)
+{
+ struct flow_keys hash_keys;
+ u32 mhash = 0;
+
+ switch (ip6_multipath_hash_policy(net)) {
+ case 0:
+ memset(&hash_keys, 0, sizeof(hash_keys));
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+ if (skb) {
+ ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
+ } else {
+ hash_keys.addrs.v6addrs.src = fl6->saddr;
+ hash_keys.addrs.v6addrs.dst = fl6->daddr;
+ hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
+ hash_keys.basic.ip_proto = fl6->flowi6_proto;
+ }
+ mhash = fib_multipath_hash_from_keys(net, &hash_keys);
+ break;
+ case 1:
+ if (skb) {
+ unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
+ struct flow_keys keys;
+
+ /* short-circuit if we already have L4 hash present */
+ if (skb->l4_hash)
+ return skb_get_hash_raw(skb) >> 1;
+
+ memset(&hash_keys, 0, sizeof(hash_keys));
+
+ if (!flkeys) {
+ skb_flow_dissect_flow_keys(skb, &keys, flag);
+ flkeys = &keys;
+ }
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+ hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
+ hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
+ hash_keys.ports.src = flkeys->ports.src;
+ hash_keys.ports.dst = flkeys->ports.dst;
+ hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
+ } else {
+ memset(&hash_keys, 0, sizeof(hash_keys));
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+ hash_keys.addrs.v6addrs.src = fl6->saddr;
+ hash_keys.addrs.v6addrs.dst = fl6->daddr;
+ if (fl6->flowi6_flags & FLOWI_FLAG_ANY_SPORT)
+ hash_keys.ports.src = (__force __be16)get_random_u16();
+ else
+ hash_keys.ports.src = fl6->fl6_sport;
+ hash_keys.ports.dst = fl6->fl6_dport;
+ hash_keys.basic.ip_proto = fl6->flowi6_proto;
+ }
+ mhash = fib_multipath_hash_from_keys(net, &hash_keys);
+ break;
+ case 2:
+ memset(&hash_keys, 0, sizeof(hash_keys));
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+ if (skb) {
+ struct flow_keys keys;
+
+ if (!flkeys) {
+ skb_flow_dissect_flow_keys(skb, &keys, 0);
+ flkeys = &keys;
+ }
+
+ /* Inner can be v4 or v6 */
+ if (flkeys->control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+ hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
+ hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
+ } else if (flkeys->control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+ hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
+ hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
+ hash_keys.tags.flow_label = flkeys->tags.flow_label;
+ hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
+ } else {
+ /* Same as case 0 */
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+ ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
+ }
+ } else {
+ /* Same as case 0 */
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+ hash_keys.addrs.v6addrs.src = fl6->saddr;
+ hash_keys.addrs.v6addrs.dst = fl6->daddr;
+ hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
+ hash_keys.basic.ip_proto = fl6->flowi6_proto;
+ }
+ mhash = fib_multipath_hash_from_keys(net, &hash_keys);
+ break;
+ case 3:
+ if (skb)
+ mhash = rt6_multipath_custom_hash_skb(net, skb);
+ else
+ mhash = rt6_multipath_custom_hash_fl6(net, fl6);
+ break;
+ }
+
+ return mhash >> 1;
+}
+
+/* Called with rcu held */
+void ip6_route_input(struct sk_buff *skb)
+{
+ const struct ipv6hdr *iph = ipv6_hdr(skb);
+ struct net *net = dev_net(skb->dev);
+ int flags = RT6_LOOKUP_F_HAS_SADDR | RT6_LOOKUP_F_DST_NOREF;
+ struct ip_tunnel_info *tun_info;
+ struct flowi6 fl6 = {
+ .flowi6_iif = skb->dev->ifindex,
+ .daddr = iph->daddr,
+ .saddr = iph->saddr,
+ .flowlabel = ip6_flowinfo(iph),
+ .flowi6_mark = skb->mark,
+ .flowi6_proto = iph->nexthdr,
+ };
+ struct flow_keys *flkeys = NULL, _flkeys;
+
+ tun_info = skb_tunnel_info(skb);
+ if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
+ fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
+
+ if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
+ flkeys = &_flkeys;
+
+ if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
+ fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
+ skb_dst_drop(skb);
+ skb_dst_set_noref(skb, ip6_route_input_lookup(net, skb->dev,
+ &fl6, skb, flags));
+}
+
+INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_output(struct net *net,
+ struct fib6_table *table,
+ struct flowi6 *fl6,
+ const struct sk_buff *skb,
+ int flags)
+{
+ return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
+}
+
+static struct dst_entry *ip6_route_output_flags_noref(struct net *net,
+ const struct sock *sk,
+ struct flowi6 *fl6,
+ int flags)
+{
+ bool any_src;
+
+ if (ipv6_addr_type(&fl6->daddr) &
+ (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
+ struct dst_entry *dst;
+
+ /* This function does not take refcnt on the dst */
+ dst = l3mdev_link_scope_lookup(net, fl6);
+ if (dst)
+ return dst;
+ }
+
+ fl6->flowi6_iif = LOOPBACK_IFINDEX;
+
+ flags |= RT6_LOOKUP_F_DST_NOREF;
+ any_src = ipv6_addr_any(&fl6->saddr);
+ if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
+ (fl6->flowi6_oif && any_src))
+ flags |= RT6_LOOKUP_F_IFACE;
+
+ if (!any_src)
flags |= RT6_LOOKUP_F_HAS_SADDR;
+ else if (sk)
+ flags |= rt6_srcprefs2flags(READ_ONCE(inet6_sk(sk)->srcprefs));
+
+ return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
+}
+
+struct dst_entry *ip6_route_output_flags(struct net *net,
+ const struct sock *sk,
+ struct flowi6 *fl6,
+ int flags)
+{
+ struct dst_entry *dst;
+ struct rt6_info *rt6;
+
+ rcu_read_lock();
+ dst = ip6_route_output_flags_noref(net, sk, fl6, flags);
+ rt6 = dst_rt6_info(dst);
+ /* For dst cached in uncached_list, refcnt is already taken. */
+ if (list_empty(&rt6->dst.rt_uncached) && !dst_hold_safe(dst)) {
+ dst = &net->ipv6.ip6_null_entry->dst;
+ dst_hold(dst);
+ }
+ rcu_read_unlock();
- return fib6_rule_lookup(fl, flags, ip6_pol_route_output);
+ return dst;
}
+EXPORT_SYMBOL_GPL(ip6_route_output_flags);
+struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
+{
+ struct rt6_info *rt, *ort = dst_rt6_info(dst_orig);
+ struct net_device *loopback_dev = net->loopback_dev;
+ struct dst_entry *new = NULL;
+
+ rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev,
+ DST_OBSOLETE_DEAD, 0);
+ if (rt) {
+ rt6_info_init(rt);
+ atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
+
+ new = &rt->dst;
+ new->__use = 1;
+ new->input = dst_discard;
+ new->output = dst_discard_out;
+
+ dst_copy_metrics(new, &ort->dst);
+
+ rt->rt6i_idev = in6_dev_get(loopback_dev);
+ rt->rt6i_gateway = ort->rt6i_gateway;
+ rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
+
+ memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
+#ifdef CONFIG_IPV6_SUBTREES
+ memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
+#endif
+ }
+
+ dst_release(dst_orig);
+ return new ? new : ERR_PTR(-ENOMEM);
+}
/*
* Destination cache support functions
*/
-static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
+static bool fib6_check(struct fib6_info *f6i, u32 cookie)
{
- struct rt6_info *rt;
+ u32 rt_cookie = 0;
+
+ if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
+ return false;
- rt = (struct rt6_info *) dst;
+ if (fib6_check_expired(f6i))
+ return false;
- if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
- return dst;
+ return true;
+}
+static struct dst_entry *rt6_check(struct rt6_info *rt,
+ struct fib6_info *from,
+ u32 cookie)
+{
+ u32 rt_cookie = 0;
+
+ if (!from || !fib6_get_cookie_safe(from, &rt_cookie) ||
+ rt_cookie != cookie)
+ return NULL;
+
+ if (rt6_check_expired(rt))
+ return NULL;
+
+ return &rt->dst;
+}
+
+static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
+ struct fib6_info *from,
+ u32 cookie)
+{
+ if (!__rt6_check_expired(rt) &&
+ READ_ONCE(rt->dst.obsolete) == DST_OBSOLETE_FORCE_CHK &&
+ fib6_check(from, cookie))
+ return &rt->dst;
return NULL;
}
-static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
+INDIRECT_CALLABLE_SCOPE struct dst_entry *ip6_dst_check(struct dst_entry *dst,
+ u32 cookie)
{
- struct rt6_info *rt = (struct rt6_info *) dst;
+ struct dst_entry *dst_ret;
+ struct fib6_info *from;
+ struct rt6_info *rt;
- if (rt) {
- if (rt->rt6i_flags & RTF_CACHE)
- ip6_del_rt(rt);
- else
- dst_release(dst);
+ rt = dst_rt6_info(dst);
+
+ if (rt->sernum)
+ return rt6_is_valid(rt) ? dst : NULL;
+
+ rcu_read_lock();
+
+ /* All IPV6 dsts are created with ->obsolete set to the value
+ * DST_OBSOLETE_FORCE_CHK which forces validation calls down
+ * into this function always.
+ */
+
+ from = rcu_dereference(rt->from);
+
+ if (from && (rt->rt6i_flags & RTF_PCPU ||
+ unlikely(!list_empty(&rt->dst.rt_uncached))))
+ dst_ret = rt6_dst_from_check(rt, from, cookie);
+ else
+ dst_ret = rt6_check(rt, from, cookie);
+
+ rcu_read_unlock();
+
+ return dst_ret;
+}
+EXPORT_INDIRECT_CALLABLE(ip6_dst_check);
+
+static void ip6_negative_advice(struct sock *sk,
+ struct dst_entry *dst)
+{
+ struct rt6_info *rt = dst_rt6_info(dst);
+
+ if (rt->rt6i_flags & RTF_CACHE) {
+ rcu_read_lock();
+ if (rt6_check_expired(rt)) {
+ /* rt/dst can not be destroyed yet,
+ * because of rcu_read_lock()
+ */
+ sk_dst_reset(sk);
+ rt6_remove_exception_rt(rt);
+ }
+ rcu_read_unlock();
+ return;
}
- return NULL;
+ sk_dst_reset(sk);
}
static void ip6_link_failure(struct sk_buff *skb)
{
struct rt6_info *rt;
- icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
+ icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
- rt = (struct rt6_info *) skb->dst;
+ rt = dst_rt6_info(skb_dst(skb));
if (rt) {
- if (rt->rt6i_flags&RTF_CACHE) {
- dst_set_expires(&rt->u.dst, 0);
- rt->rt6i_flags |= RTF_EXPIRES;
- } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
- rt->rt6i_node->fn_sernum = -1;
+ rcu_read_lock();
+ if (rt->rt6i_flags & RTF_CACHE) {
+ rt6_remove_exception_rt(rt);
+ } else {
+ struct fib6_info *from;
+ struct fib6_node *fn;
+
+ from = rcu_dereference(rt->from);
+ if (from) {
+ fn = rcu_dereference(from->fib6_node);
+ if (fn && (rt->rt6i_flags & RTF_DEFAULT))
+ WRITE_ONCE(fn->fn_sernum, -1);
+ }
+ }
+ rcu_read_unlock();
+ }
+}
+
+static void rt6_update_expires(struct rt6_info *rt0, int timeout)
+{
+ if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
+ struct fib6_info *from;
+
+ rcu_read_lock();
+ from = rcu_dereference(rt0->from);
+ if (from)
+ WRITE_ONCE(rt0->dst.expires, from->expires);
+ rcu_read_unlock();
}
+
+ dst_set_expires(&rt0->dst, timeout);
+ rt0->rt6i_flags |= RTF_EXPIRES;
}
-static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
+static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
{
- struct rt6_info *rt6 = (struct rt6_info*)dst;
+ struct net *net = dev_net(rt->dst.dev);
+
+ dst_metric_set(&rt->dst, RTAX_MTU, mtu);
+ rt->rt6i_flags |= RTF_MODIFIED;
+ rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
+}
+
+static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
+{
+ return !(rt->rt6i_flags & RTF_CACHE) &&
+ (rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from));
+}
+
+static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
+ const struct ipv6hdr *iph, u32 mtu,
+ bool confirm_neigh)
+{
+ const struct in6_addr *daddr, *saddr;
+ struct rt6_info *rt6 = dst_rt6_info(dst);
+
+ /* Note: do *NOT* check dst_metric_locked(dst, RTAX_MTU)
+ * IPv6 pmtu discovery isn't optional, so 'mtu lock' cannot disable it.
+ * [see also comment in rt6_mtu_change_route()]
+ */
+
+ if (iph) {
+ daddr = &iph->daddr;
+ saddr = &iph->saddr;
+ } else if (sk) {
+ daddr = &sk->sk_v6_daddr;
+ saddr = &inet6_sk(sk)->saddr;
+ } else {
+ daddr = NULL;
+ saddr = NULL;
+ }
+
+ if (confirm_neigh)
+ dst_confirm_neigh(dst, daddr);
- if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
- rt6->rt6i_flags |= RTF_MODIFIED;
- if (mtu < IPV6_MIN_MTU) {
- mtu = IPV6_MIN_MTU;
- dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
+ if (mtu < IPV6_MIN_MTU)
+ return;
+ if (mtu >= dst_mtu(dst))
+ return;
+
+ if (!rt6_cache_allowed_for_pmtu(rt6)) {
+ rt6_do_update_pmtu(rt6, mtu);
+ /* update rt6_ex->stamp for cache */
+ if (rt6->rt6i_flags & RTF_CACHE)
+ rt6_update_exception_stamp_rt(rt6);
+ } else if (daddr) {
+ struct fib6_result res = {};
+ struct rt6_info *nrt6;
+
+ rcu_read_lock();
+ res.f6i = rcu_dereference(rt6->from);
+ if (!res.f6i)
+ goto out_unlock;
+
+ res.fib6_flags = res.f6i->fib6_flags;
+ res.fib6_type = res.f6i->fib6_type;
+
+ if (res.f6i->nh) {
+ struct fib6_nh_match_arg arg = {
+ .dev = dst_dev_rcu(dst),
+ .gw = &rt6->rt6i_gateway,
+ };
+
+ nexthop_for_each_fib6_nh(res.f6i->nh,
+ fib6_nh_find_match, &arg);
+
+ /* fib6_info uses a nexthop that does not have fib6_nh
+ * using the dst->dev + gw. Should be impossible.
+ */
+ if (!arg.match)
+ goto out_unlock;
+
+ res.nh = arg.match;
+ } else {
+ res.nh = res.f6i->fib6_nh;
+ }
+
+ nrt6 = ip6_rt_cache_alloc(&res, daddr, saddr);
+ if (nrt6) {
+ rt6_do_update_pmtu(nrt6, mtu);
+ if (rt6_insert_exception(nrt6, &res))
+ dst_release_immediate(&nrt6->dst);
}
- dst->metrics[RTAX_MTU-1] = mtu;
- call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
+out_unlock:
+ rcu_read_unlock();
}
}
-static int ipv6_get_mtu(struct net_device *dev);
+static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
+ struct sk_buff *skb, u32 mtu,
+ bool confirm_neigh)
+{
+ __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu,
+ confirm_neigh);
+}
+
+void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
+ int oif, u32 mark, kuid_t uid)
+{
+ const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
+ struct dst_entry *dst;
+ struct flowi6 fl6 = {
+ .flowi6_oif = oif,
+ .flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark),
+ .daddr = iph->daddr,
+ .saddr = iph->saddr,
+ .flowlabel = ip6_flowinfo(iph),
+ .flowi6_uid = uid,
+ };
+
+ dst = ip6_route_output(net, NULL, &fl6);
+ if (!dst->error)
+ __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu), true);
+ dst_release(dst);
+}
+EXPORT_SYMBOL_GPL(ip6_update_pmtu);
+
+void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
+{
+ int oif = sk->sk_bound_dev_if;
+ struct dst_entry *dst;
+
+ if (!oif && skb->dev)
+ oif = l3mdev_master_ifindex(skb->dev);
+
+ ip6_update_pmtu(skb, sock_net(sk), mtu, oif, READ_ONCE(sk->sk_mark),
+ sk_uid(sk));
+
+ dst = __sk_dst_get(sk);
+ if (!dst || !READ_ONCE(dst->obsolete) ||
+ dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
+ return;
+
+ bh_lock_sock(sk);
+ if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
+ ip6_datagram_dst_update(sk, false);
+ bh_unlock_sock(sk);
+}
+EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
+
+void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
+ const struct flowi6 *fl6)
+{
+#ifdef CONFIG_IPV6_SUBTREES
+ struct ipv6_pinfo *np = inet6_sk(sk);
+#endif
+
+ ip6_dst_store(sk, dst,
+ ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr),
+#ifdef CONFIG_IPV6_SUBTREES
+ ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
+ true :
+#endif
+ false);
+}
+
+static bool ip6_redirect_nh_match(const struct fib6_result *res,
+ struct flowi6 *fl6,
+ const struct in6_addr *gw,
+ struct rt6_info **ret)
+{
+ const struct fib6_nh *nh = res->nh;
+
+ if (nh->fib_nh_flags & RTNH_F_DEAD || !nh->fib_nh_gw_family ||
+ fl6->flowi6_oif != nh->fib_nh_dev->ifindex)
+ return false;
+
+ /* rt_cache's gateway might be different from its 'parent'
+ * in the case of an ip redirect.
+ * So we keep searching in the exception table if the gateway
+ * is different.
+ */
+ if (!ipv6_addr_equal(gw, &nh->fib_nh_gw6)) {
+ struct rt6_info *rt_cache;
+
+ rt_cache = rt6_find_cached_rt(res, &fl6->daddr, &fl6->saddr);
+ if (rt_cache &&
+ ipv6_addr_equal(gw, &rt_cache->rt6i_gateway)) {
+ *ret = rt_cache;
+ return true;
+ }
+ return false;
+ }
+ return true;
+}
+
+struct fib6_nh_rd_arg {
+ struct fib6_result *res;
+ struct flowi6 *fl6;
+ const struct in6_addr *gw;
+ struct rt6_info **ret;
+};
+
+static int fib6_nh_redirect_match(struct fib6_nh *nh, void *_arg)
+{
+ struct fib6_nh_rd_arg *arg = _arg;
+
+ arg->res->nh = nh;
+ return ip6_redirect_nh_match(arg->res, arg->fl6, arg->gw, arg->ret);
+}
+
+/* Handle redirects */
+struct ip6rd_flowi {
+ struct flowi6 fl6;
+ struct in6_addr gateway;
+};
+
+INDIRECT_CALLABLE_SCOPE struct rt6_info *__ip6_route_redirect(struct net *net,
+ struct fib6_table *table,
+ struct flowi6 *fl6,
+ const struct sk_buff *skb,
+ int flags)
+{
+ struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
+ struct rt6_info *ret = NULL;
+ struct fib6_result res = {};
+ struct fib6_nh_rd_arg arg = {
+ .res = &res,
+ .fl6 = fl6,
+ .gw = &rdfl->gateway,
+ .ret = &ret
+ };
+ struct fib6_info *rt;
+ struct fib6_node *fn;
+
+ /* Get the "current" route for this destination and
+ * check if the redirect has come from appropriate router.
+ *
+ * RFC 4861 specifies that redirects should only be
+ * accepted if they come from the nexthop to the target.
+ * Due to the way the routes are chosen, this notion
+ * is a bit fuzzy and one might need to check all possible
+ * routes.
+ */
+
+ rcu_read_lock();
+ fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
+restart:
+ for_each_fib6_node_rt_rcu(fn) {
+ res.f6i = rt;
+ if (fib6_check_expired(rt))
+ continue;
+ if (rt->fib6_flags & RTF_REJECT)
+ break;
+ if (unlikely(rt->nh)) {
+ if (nexthop_is_blackhole(rt->nh))
+ continue;
+ /* on match, res->nh is filled in and potentially ret */
+ if (nexthop_for_each_fib6_nh(rt->nh,
+ fib6_nh_redirect_match,
+ &arg))
+ goto out;
+ } else {
+ res.nh = rt->fib6_nh;
+ if (ip6_redirect_nh_match(&res, fl6, &rdfl->gateway,
+ &ret))
+ goto out;
+ }
+ }
+
+ if (!rt)
+ rt = net->ipv6.fib6_null_entry;
+ else if (rt->fib6_flags & RTF_REJECT) {
+ ret = net->ipv6.ip6_null_entry;
+ goto out;
+ }
+
+ if (rt == net->ipv6.fib6_null_entry) {
+ fn = fib6_backtrack(fn, &fl6->saddr);
+ if (fn)
+ goto restart;
+ }
+
+ res.f6i = rt;
+ res.nh = rt->fib6_nh;
+out:
+ if (ret) {
+ ip6_hold_safe(net, &ret);
+ } else {
+ res.fib6_flags = res.f6i->fib6_flags;
+ res.fib6_type = res.f6i->fib6_type;
+ ret = ip6_create_rt_rcu(&res);
+ }
+
+ rcu_read_unlock();
+
+ trace_fib6_table_lookup(net, &res, table, fl6);
+ return ret;
+};
+
+static struct dst_entry *ip6_route_redirect(struct net *net,
+ const struct flowi6 *fl6,
+ const struct sk_buff *skb,
+ const struct in6_addr *gateway)
+{
+ int flags = RT6_LOOKUP_F_HAS_SADDR;
+ struct ip6rd_flowi rdfl;
+
+ rdfl.fl6 = *fl6;
+ rdfl.gateway = *gateway;
+
+ return fib6_rule_lookup(net, &rdfl.fl6, skb,
+ flags, __ip6_route_redirect);
+}
+
+void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
+ kuid_t uid)
+{
+ const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
+ struct dst_entry *dst;
+ struct flowi6 fl6 = {
+ .flowi6_iif = LOOPBACK_IFINDEX,
+ .flowi6_oif = oif,
+ .flowi6_mark = mark,
+ .daddr = iph->daddr,
+ .saddr = iph->saddr,
+ .flowlabel = ip6_flowinfo(iph),
+ .flowi6_uid = uid,
+ };
+
+ dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
+ rt6_do_redirect(dst, NULL, skb);
+ dst_release(dst);
+}
+EXPORT_SYMBOL_GPL(ip6_redirect);
+
+void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif)
+{
+ const struct ipv6hdr *iph = ipv6_hdr(skb);
+ const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
+ struct dst_entry *dst;
+ struct flowi6 fl6 = {
+ .flowi6_iif = LOOPBACK_IFINDEX,
+ .flowi6_oif = oif,
+ .daddr = msg->dest,
+ .saddr = iph->daddr,
+ .flowi6_uid = sock_net_uid(net, NULL),
+ };
-static inline unsigned int ipv6_advmss(unsigned int mtu)
+ dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
+ rt6_do_redirect(dst, NULL, skb);
+ dst_release(dst);
+}
+
+void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
+ ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if,
+ READ_ONCE(sk->sk_mark), sk_uid(sk));
+}
+EXPORT_SYMBOL_GPL(ip6_sk_redirect);
+
+static unsigned int ip6_default_advmss(const struct dst_entry *dst)
+{
+ unsigned int mtu = dst_mtu(dst);
+ struct net *net;
+
mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
- if (mtu < ip6_rt_min_advmss)
- mtu = ip6_rt_min_advmss;
+ rcu_read_lock();
+
+ net = dst_dev_net_rcu(dst);
+ if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
+ mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
+
+ rcu_read_unlock();
/*
- * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
- * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
- * IPV6_MAXPLEN is also valid and means: "any MSS,
+ * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
+ * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
+ * IPV6_MAXPLEN is also valid and means: "any MSS,
* rely only on pmtu discovery"
*/
if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
@@ -889,162 +3261,334 @@ static inline unsigned int ipv6_advmss(unsigned int mtu)
return mtu;
}
-static struct dst_entry *ndisc_dst_gc_list;
-static DEFINE_SPINLOCK(ndisc_lock);
+INDIRECT_CALLABLE_SCOPE unsigned int ip6_mtu(const struct dst_entry *dst)
+{
+ return ip6_dst_mtu_maybe_forward(dst, false);
+}
+EXPORT_INDIRECT_CALLABLE(ip6_mtu);
+
+/* MTU selection:
+ * 1. mtu on route is locked - use it
+ * 2. mtu from nexthop exception
+ * 3. mtu from egress device
+ *
+ * based on ip6_dst_mtu_forward and exception logic of
+ * rt6_find_cached_rt; called with rcu_read_lock
+ */
+u32 ip6_mtu_from_fib6(const struct fib6_result *res,
+ const struct in6_addr *daddr,
+ const struct in6_addr *saddr)
+{
+ const struct fib6_nh *nh = res->nh;
+ struct fib6_info *f6i = res->f6i;
+ struct inet6_dev *idev;
+ struct rt6_info *rt;
+ u32 mtu = 0;
+
+ if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
+ mtu = f6i->fib6_pmtu;
+ if (mtu)
+ goto out;
+ }
+
+ rt = rt6_find_cached_rt(res, daddr, saddr);
+ if (unlikely(rt)) {
+ mtu = dst_metric_raw(&rt->dst, RTAX_MTU);
+ } else {
+ struct net_device *dev = nh->fib_nh_dev;
-struct dst_entry *ndisc_dst_alloc(struct net_device *dev,
- struct neighbour *neigh,
- struct in6_addr *addr,
- int (*output)(struct sk_buff *))
+ mtu = IPV6_MIN_MTU;
+ idev = __in6_dev_get(dev);
+ if (idev)
+ mtu = max_t(u32, mtu, READ_ONCE(idev->cnf.mtu6));
+ }
+
+ mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
+out:
+ return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
+}
+
+struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
+ struct flowi6 *fl6)
{
+ struct dst_entry *dst;
struct rt6_info *rt;
struct inet6_dev *idev = in6_dev_get(dev);
+ struct net *net = dev_net(dev);
- if (unlikely(idev == NULL))
- return NULL;
+ if (unlikely(!idev))
+ return ERR_PTR(-ENODEV);
- rt = ip6_dst_alloc();
- if (unlikely(rt == NULL)) {
+ rt = ip6_dst_alloc(net, dev, 0);
+ if (unlikely(!rt)) {
in6_dev_put(idev);
+ dst = ERR_PTR(-ENOMEM);
goto out;
}
- dev_hold(dev);
- if (neigh)
- neigh_hold(neigh);
- else
- neigh = ndisc_get_neigh(dev, addr);
-
- rt->rt6i_dev = dev;
- rt->rt6i_idev = idev;
- rt->rt6i_nexthop = neigh;
- atomic_set(&rt->u.dst.__refcnt, 1);
- rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
- rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
- rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
- rt->u.dst.output = output;
-
-#if 0 /* there's no chance to use these for ndisc */
- rt->u.dst.flags = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
- ? DST_HOST
- : 0;
- ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
+ rt->dst.input = ip6_input;
+ rt->dst.output = ip6_output;
+ rt->rt6i_gateway = fl6->daddr;
+ rt->rt6i_dst.addr = fl6->daddr;
rt->rt6i_dst.plen = 128;
-#endif
+ rt->rt6i_idev = idev;
+ dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
- spin_lock_bh(&ndisc_lock);
- rt->u.dst.next = ndisc_dst_gc_list;
- ndisc_dst_gc_list = &rt->u.dst;
- spin_unlock_bh(&ndisc_lock);
+ /* Add this dst into uncached_list so that rt6_disable_ip() can
+ * do proper release of the net_device
+ */
+ rt6_uncached_list_add(rt);
- fib6_force_start_gc();
+ dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
out:
- return &rt->u.dst;
+ return dst;
}
-int ndisc_dst_gc(int *more)
+static void ip6_dst_gc(struct dst_ops *ops)
{
- struct dst_entry *dst, *next, **pprev;
- int freed;
+ struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
+ int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
+ int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
+ int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
+ unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
+ unsigned int val;
+ int entries;
+
+ if (time_after(rt_last_gc + rt_min_interval, jiffies))
+ goto out;
- next = NULL;
- freed = 0;
+ fib6_run_gc(atomic_inc_return(&net->ipv6.ip6_rt_gc_expire), net, true);
+ entries = dst_entries_get_slow(ops);
+ if (entries < ops->gc_thresh)
+ atomic_set(&net->ipv6.ip6_rt_gc_expire, rt_gc_timeout >> 1);
+out:
+ val = atomic_read(&net->ipv6.ip6_rt_gc_expire);
+ atomic_set(&net->ipv6.ip6_rt_gc_expire, val - (val >> rt_elasticity));
+}
- spin_lock_bh(&ndisc_lock);
- pprev = &ndisc_dst_gc_list;
+static int ip6_nh_lookup_table(struct net *net, struct fib6_config *cfg,
+ const struct in6_addr *gw_addr, u32 tbid,
+ int flags, struct fib6_result *res)
+{
+ struct flowi6 fl6 = {
+ .flowi6_oif = cfg->fc_ifindex,
+ .daddr = *gw_addr,
+ .saddr = cfg->fc_prefsrc,
+ };
+ struct fib6_table *table;
+ int err;
- while ((dst = *pprev) != NULL) {
- if (!atomic_read(&dst->__refcnt)) {
- *pprev = dst->next;
- dst_free(dst);
- freed++;
- } else {
- pprev = &dst->next;
- (*more)++;
- }
- }
+ table = fib6_get_table(net, tbid);
+ if (!table)
+ return -EINVAL;
+
+ if (!ipv6_addr_any(&cfg->fc_prefsrc))
+ flags |= RT6_LOOKUP_F_HAS_SADDR;
+
+ flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
- spin_unlock_bh(&ndisc_lock);
+ err = fib6_table_lookup(net, table, cfg->fc_ifindex, &fl6, res, flags);
+ if (!err && res->f6i != net->ipv6.fib6_null_entry)
+ fib6_select_path(net, res, &fl6, cfg->fc_ifindex,
+ cfg->fc_ifindex != 0, NULL, flags);
- return freed;
+ return err;
}
-static int ip6_dst_gc(void)
+static int ip6_route_check_nh_onlink(struct net *net,
+ struct fib6_config *cfg,
+ const struct net_device *dev,
+ struct netlink_ext_ack *extack)
{
- static unsigned expire = 30*HZ;
- static unsigned long last_gc;
- unsigned long now = jiffies;
-
- if (time_after(last_gc + ip6_rt_gc_min_interval, now) &&
- atomic_read(&ip6_dst_ops.entries) <= ip6_rt_max_size)
- goto out;
+ u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
+ const struct in6_addr *gw_addr = &cfg->fc_gateway;
+ struct fib6_result res = {};
+ int err;
- expire++;
- fib6_run_gc(expire);
- last_gc = now;
- if (atomic_read(&ip6_dst_ops.entries) < ip6_dst_ops.gc_thresh)
- expire = ip6_rt_gc_timeout>>1;
+ err = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0, &res);
+ if (!err && !(res.fib6_flags & RTF_REJECT) &&
+ /* ignore match if it is the default route */
+ !ipv6_addr_any(&res.f6i->fib6_dst.addr) &&
+ (res.fib6_type != RTN_UNICAST || dev != res.nh->fib_nh_dev)) {
+ NL_SET_ERR_MSG(extack,
+ "Nexthop has invalid gateway or device mismatch");
+ err = -EINVAL;
+ }
-out:
- expire -= expire>>ip6_rt_gc_elasticity;
- return (atomic_read(&ip6_dst_ops.entries) > ip6_rt_max_size);
+ return err;
}
-/* Clean host part of a prefix. Not necessary in radix tree,
- but results in cleaner routing tables.
+static int ip6_route_check_nh(struct net *net,
+ struct fib6_config *cfg,
+ struct net_device **_dev,
+ netdevice_tracker *dev_tracker,
+ struct inet6_dev **idev)
+{
+ const struct in6_addr *gw_addr = &cfg->fc_gateway;
+ struct net_device *dev = _dev ? *_dev : NULL;
+ int flags = RT6_LOOKUP_F_IFACE;
+ struct fib6_result res = {};
+ int err = -EHOSTUNREACH;
+
+ if (cfg->fc_table) {
+ err = ip6_nh_lookup_table(net, cfg, gw_addr,
+ cfg->fc_table, flags, &res);
+ /* gw_addr can not require a gateway or resolve to a reject
+ * route. If a device is given, it must match the result.
+ */
+ if (err || res.fib6_flags & RTF_REJECT ||
+ res.nh->fib_nh_gw_family ||
+ (dev && dev != res.nh->fib_nh_dev))
+ err = -EHOSTUNREACH;
+ }
- Remove it only when all the things will work!
- */
+ if (err < 0) {
+ struct flowi6 fl6 = {
+ .flowi6_oif = cfg->fc_ifindex,
+ .daddr = *gw_addr,
+ };
+
+ err = fib6_lookup(net, cfg->fc_ifindex, &fl6, &res, flags);
+ if (err || res.fib6_flags & RTF_REJECT ||
+ res.nh->fib_nh_gw_family)
+ err = -EHOSTUNREACH;
-static int ipv6_get_mtu(struct net_device *dev)
-{
- int mtu = IPV6_MIN_MTU;
- struct inet6_dev *idev;
+ if (err)
+ return err;
- idev = in6_dev_get(dev);
- if (idev) {
- mtu = idev->cnf.mtu6;
- in6_dev_put(idev);
+ fib6_select_path(net, &res, &fl6, cfg->fc_ifindex,
+ cfg->fc_ifindex != 0, NULL, flags);
}
- return mtu;
+
+ err = 0;
+ if (dev) {
+ if (dev != res.nh->fib_nh_dev)
+ err = -EHOSTUNREACH;
+ } else {
+ *_dev = dev = res.nh->fib_nh_dev;
+ netdev_hold(dev, dev_tracker, GFP_ATOMIC);
+ *idev = in6_dev_get(dev);
+ }
+
+ return err;
}
-int ipv6_get_hoplimit(struct net_device *dev)
+static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
+ struct net_device **_dev,
+ netdevice_tracker *dev_tracker,
+ struct inet6_dev **idev,
+ struct netlink_ext_ack *extack)
{
- int hoplimit = ipv6_devconf.hop_limit;
- struct inet6_dev *idev;
+ const struct in6_addr *gw_addr = &cfg->fc_gateway;
+ int gwa_type = ipv6_addr_type(gw_addr);
+ bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
+ const struct net_device *dev = *_dev;
+ bool need_addr_check = !dev;
+ int err = -EINVAL;
+
+ /* if gw_addr is local we will fail to detect this in case
+ * address is still TENTATIVE (DAD in progress). rt6_lookup()
+ * will return already-added prefix route via interface that
+ * prefix route was assigned to, which might be non-loopback.
+ */
+ if (dev &&
+ ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
+ NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
+ goto out;
+ }
- idev = in6_dev_get(dev);
- if (idev) {
- hoplimit = idev->cnf.hop_limit;
- in6_dev_put(idev);
+ if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
+ /* IPv6 strictly inhibits using not link-local
+ * addresses as nexthop address.
+ * Otherwise, router will not able to send redirects.
+ * It is very good, but in some (rare!) circumstances
+ * (SIT, PtP, NBMA NOARP links) it is handy to allow
+ * some exceptions. --ANK
+ * We allow IPv4-mapped nexthops to support RFC4798-type
+ * addressing
+ */
+ if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
+ NL_SET_ERR_MSG(extack, "Invalid gateway address");
+ goto out;
+ }
+
+ rcu_read_lock();
+
+ if (cfg->fc_flags & RTNH_F_ONLINK)
+ err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
+ else
+ err = ip6_route_check_nh(net, cfg, _dev, dev_tracker,
+ idev);
+
+ rcu_read_unlock();
+
+ if (err)
+ goto out;
}
- return hoplimit;
+
+ /* reload in case device was changed */
+ dev = *_dev;
+
+ err = -EINVAL;
+ if (!dev) {
+ NL_SET_ERR_MSG(extack, "Egress device not specified");
+ goto out;
+ } else if (dev->flags & IFF_LOOPBACK) {
+ NL_SET_ERR_MSG(extack,
+ "Egress device can not be loopback device for this route");
+ goto out;
+ }
+
+ /* if we did not check gw_addr above, do so now that the
+ * egress device has been resolved.
+ */
+ if (need_addr_check &&
+ ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
+ NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
+ goto out;
+ }
+
+ err = 0;
+out:
+ return err;
}
-/*
- *
- */
+static bool fib6_is_reject(u32 flags, struct net_device *dev, int addr_type)
+{
+ if ((flags & RTF_REJECT) ||
+ (dev && (dev->flags & IFF_LOOPBACK) &&
+ !(addr_type & IPV6_ADDR_LOOPBACK) &&
+ !(flags & (RTF_ANYCAST | RTF_LOCAL))))
+ return true;
-int ip6_route_add(struct fib6_config *cfg)
+ return false;
+}
+
+int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
+ struct fib6_config *cfg, gfp_t gfp_flags,
+ struct netlink_ext_ack *extack)
{
- int err;
- struct rt6_info *rt = NULL;
+ netdevice_tracker *dev_tracker = &fib6_nh->fib_nh_dev_tracker;
struct net_device *dev = NULL;
struct inet6_dev *idev = NULL;
- struct fib6_table *table;
int addr_type;
+ int err;
- if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
- return -EINVAL;
-#ifndef CONFIG_IPV6_SUBTREES
- if (cfg->fc_src_len)
- return -EINVAL;
+ fib6_nh->fib_nh_family = AF_INET6;
+#ifdef CONFIG_IPV6_ROUTER_PREF
+ fib6_nh->last_probe = jiffies;
#endif
+ if (cfg->fc_is_fdb) {
+ fib6_nh->fib_nh_gw6 = cfg->fc_gateway;
+ fib6_nh->fib_nh_gw_family = AF_INET6;
+ return 0;
+ }
+
+ err = -ENODEV;
if (cfg->fc_ifindex) {
- err = -ENODEV;
- dev = dev_get_by_index(cfg->fc_ifindex);
+ dev = netdev_get_by_index(net, cfg->fc_ifindex,
+ dev_tracker, gfp_flags);
if (!dev)
goto out;
idev = in6_dev_get(dev);
@@ -1052,811 +3596,1453 @@ int ip6_route_add(struct fib6_config *cfg)
goto out;
}
- if (cfg->fc_metric == 0)
- cfg->fc_metric = IP6_RT_PRIO_USER;
-
- table = fib6_new_table(cfg->fc_table);
- if (table == NULL) {
- err = -ENOBUFS;
- goto out;
- }
+ if (cfg->fc_flags & RTNH_F_ONLINK) {
+ if (!dev) {
+ NL_SET_ERR_MSG(extack,
+ "Nexthop device required for onlink");
+ goto out;
+ }
- rt = ip6_dst_alloc();
+ if (!(dev->flags & IFF_UP)) {
+ NL_SET_ERR_MSG(extack, "Nexthop device is not up");
+ err = -ENETDOWN;
+ goto out;
+ }
- if (rt == NULL) {
- err = -ENOMEM;
- goto out;
+ fib6_nh->fib_nh_flags |= RTNH_F_ONLINK;
}
- rt->u.dst.obsolete = -1;
- rt->rt6i_expires = jiffies + clock_t_to_jiffies(cfg->fc_expires);
-
- if (cfg->fc_protocol == RTPROT_UNSPEC)
- cfg->fc_protocol = RTPROT_BOOT;
- rt->rt6i_protocol = cfg->fc_protocol;
-
- addr_type = ipv6_addr_type(&cfg->fc_dst);
-
- if (addr_type & IPV6_ADDR_MULTICAST)
- rt->u.dst.input = ip6_mc_input;
- else
- rt->u.dst.input = ip6_forward;
-
- rt->u.dst.output = ip6_output;
-
- ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
- rt->rt6i_dst.plen = cfg->fc_dst_len;
- if (rt->rt6i_dst.plen == 128)
- rt->u.dst.flags = DST_HOST;
-
-#ifdef CONFIG_IPV6_SUBTREES
- ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
- rt->rt6i_src.plen = cfg->fc_src_len;
-#endif
-
- rt->rt6i_metric = cfg->fc_metric;
+ fib6_nh->fib_nh_weight = 1;
/* We cannot add true routes via loopback here,
- they would result in kernel looping; promote them to reject routes
+ * they would result in kernel looping; promote them to reject routes
*/
- if ((cfg->fc_flags & RTF_REJECT) ||
- (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
+ addr_type = ipv6_addr_type(&cfg->fc_dst);
+ if (fib6_is_reject(cfg->fc_flags, dev, addr_type)) {
/* hold loopback dev/idev if we haven't done so. */
- if (dev != &loopback_dev) {
+ if (dev != net->loopback_dev) {
if (dev) {
- dev_put(dev);
+ netdev_put(dev, dev_tracker);
in6_dev_put(idev);
}
- dev = &loopback_dev;
- dev_hold(dev);
+ dev = net->loopback_dev;
+ netdev_hold(dev, dev_tracker, gfp_flags);
idev = in6_dev_get(dev);
if (!idev) {
err = -ENODEV;
goto out;
}
}
- rt->u.dst.output = ip6_pkt_discard_out;
- rt->u.dst.input = ip6_pkt_discard;
- rt->u.dst.error = -ENETUNREACH;
- rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
- goto install_route;
+ goto pcpu_alloc;
}
if (cfg->fc_flags & RTF_GATEWAY) {
- struct in6_addr *gw_addr;
- int gwa_type;
-
- gw_addr = &cfg->fc_gateway;
- ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
- gwa_type = ipv6_addr_type(gw_addr);
-
- if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
- struct rt6_info *grt;
-
- /* IPv6 strictly inhibits using not link-local
- addresses as nexthop address.
- Otherwise, router will not able to send redirects.
- It is very good, but in some (rare!) circumstances
- (SIT, PtP, NBMA NOARP links) it is handy to allow
- some exceptions. --ANK
- */
- err = -EINVAL;
- if (!(gwa_type&IPV6_ADDR_UNICAST))
- goto out;
-
- grt = rt6_lookup(gw_addr, NULL, cfg->fc_ifindex, 1);
-
- err = -EHOSTUNREACH;
- if (grt == NULL)
- goto out;
- if (dev) {
- if (dev != grt->rt6i_dev) {
- dst_release(&grt->u.dst);
- goto out;
- }
- } else {
- dev = grt->rt6i_dev;
- idev = grt->rt6i_idev;
- dev_hold(dev);
- in6_dev_hold(grt->rt6i_idev);
- }
- if (!(grt->rt6i_flags&RTF_GATEWAY))
- err = 0;
- dst_release(&grt->u.dst);
-
- if (err)
- goto out;
- }
- err = -EINVAL;
- if (dev == NULL || (dev->flags&IFF_LOOPBACK))
+ err = ip6_validate_gw(net, cfg, &dev, dev_tracker,
+ &idev, extack);
+ if (err)
goto out;
+
+ fib6_nh->fib_nh_gw6 = cfg->fc_gateway;
+ fib6_nh->fib_nh_gw_family = AF_INET6;
}
err = -ENODEV;
- if (dev == NULL)
+ if (!dev)
goto out;
- if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
- rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
- if (IS_ERR(rt->rt6i_nexthop)) {
- err = PTR_ERR(rt->rt6i_nexthop);
- rt->rt6i_nexthop = NULL;
- goto out;
- }
+ if (!idev || idev->cnf.disable_ipv6) {
+ NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
+ err = -EACCES;
+ goto out;
}
- rt->rt6i_flags = cfg->fc_flags;
-
-install_route:
- if (cfg->fc_mx) {
- struct nlattr *nla;
- int remaining;
+ if (!(dev->flags & IFF_UP) && !cfg->fc_ignore_dev_down) {
+ NL_SET_ERR_MSG(extack, "Nexthop device is not up");
+ err = -ENETDOWN;
+ goto out;
+ }
- nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
- int type = nla->nla_type;
+ if (!(cfg->fc_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
+ !netif_carrier_ok(dev))
+ fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;
- if (type) {
- if (type > RTAX_MAX) {
- err = -EINVAL;
- goto out;
- }
+ err = fib_nh_common_init(net, &fib6_nh->nh_common, cfg->fc_encap,
+ cfg->fc_encap_type, cfg, gfp_flags, extack);
+ if (err)
+ goto out;
- rt->u.dst.metrics[type - 1] = nla_get_u32(nla);
- }
- }
+pcpu_alloc:
+ fib6_nh->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags);
+ if (!fib6_nh->rt6i_pcpu) {
+ err = -ENOMEM;
+ goto out;
}
- if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
- rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
- if (!rt->u.dst.metrics[RTAX_MTU-1])
- rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
- if (!rt->u.dst.metrics[RTAX_ADVMSS-1])
- rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
- rt->u.dst.dev = dev;
- rt->rt6i_idev = idev;
- rt->rt6i_table = table;
- return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
-
+ fib6_nh->fib_nh_dev = dev;
+ fib6_nh->fib_nh_oif = dev->ifindex;
+ err = 0;
out:
- if (dev)
- dev_put(dev);
if (idev)
in6_dev_put(idev);
- if (rt)
- dst_free(&rt->u.dst);
+
+ if (err) {
+ fib_nh_common_release(&fib6_nh->nh_common);
+ fib6_nh->nh_common.nhc_pcpu_rth_output = NULL;
+ fib6_nh->fib_nh_lws = NULL;
+ netdev_put(dev, dev_tracker);
+ }
+
return err;
}
-static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
+void fib6_nh_release(struct fib6_nh *fib6_nh)
{
- int err;
- struct fib6_table *table;
+ struct rt6_exception_bucket *bucket;
- if (rt == &ip6_null_entry)
- return -ENOENT;
+ rcu_read_lock();
- table = rt->rt6i_table;
- write_lock_bh(&table->tb6_lock);
+ fib6_nh_flush_exceptions(fib6_nh, NULL);
+ bucket = fib6_nh_get_excptn_bucket(fib6_nh, NULL);
+ if (bucket) {
+ rcu_assign_pointer(fib6_nh->rt6i_exception_bucket, NULL);
+ kfree(bucket);
+ }
- err = fib6_del(rt, info);
- dst_release(&rt->u.dst);
+ rcu_read_unlock();
- write_unlock_bh(&table->tb6_lock);
+ fib6_nh_release_dsts(fib6_nh);
+ free_percpu(fib6_nh->rt6i_pcpu);
- return err;
+ fib_nh_common_release(&fib6_nh->nh_common);
+}
+
+void fib6_nh_release_dsts(struct fib6_nh *fib6_nh)
+{
+ int cpu;
+
+ if (!fib6_nh->rt6i_pcpu)
+ return;
+
+ for_each_possible_cpu(cpu) {
+ struct rt6_info *pcpu_rt, **ppcpu_rt;
+
+ ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu);
+ pcpu_rt = xchg(ppcpu_rt, NULL);
+ if (pcpu_rt) {
+ dst_dev_put(&pcpu_rt->dst);
+ dst_release(&pcpu_rt->dst);
+ }
+ }
}
-int ip6_del_rt(struct rt6_info *rt)
+static int fib6_config_validate(struct fib6_config *cfg,
+ struct netlink_ext_ack *extack)
{
- return __ip6_del_rt(rt, NULL);
+ /* RTF_PCPU is an internal flag; can not be set by userspace */
+ if (cfg->fc_flags & RTF_PCPU) {
+ NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
+ goto errout;
+ }
+
+ /* RTF_CACHE is an internal flag; can not be set by userspace */
+ if (cfg->fc_flags & RTF_CACHE) {
+ NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
+ goto errout;
+ }
+
+ if (cfg->fc_type > RTN_MAX) {
+ NL_SET_ERR_MSG(extack, "Invalid route type");
+ goto errout;
+ }
+
+ if (cfg->fc_dst_len > 128) {
+ NL_SET_ERR_MSG(extack, "Invalid prefix length");
+ goto errout;
+ }
+
+#ifdef CONFIG_IPV6_SUBTREES
+ if (cfg->fc_src_len > 128) {
+ NL_SET_ERR_MSG(extack, "Invalid source address length");
+ goto errout;
+ }
+
+ if (cfg->fc_nh_id && cfg->fc_src_len) {
+ NL_SET_ERR_MSG(extack, "Nexthops can not be used with source routing");
+ goto errout;
+ }
+#else
+ if (cfg->fc_src_len) {
+ NL_SET_ERR_MSG(extack,
+ "Specifying source address requires IPV6_SUBTREES to be enabled");
+ goto errout;
+ }
+#endif
+ return 0;
+errout:
+ return -EINVAL;
}
-static int ip6_route_del(struct fib6_config *cfg)
+static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
+ gfp_t gfp_flags,
+ struct netlink_ext_ack *extack)
{
+ struct net *net = cfg->fc_nlinfo.nl_net;
struct fib6_table *table;
- struct fib6_node *fn;
- struct rt6_info *rt;
- int err = -ESRCH;
+ struct fib6_info *rt;
+ int err;
- table = fib6_get_table(cfg->fc_table);
- if (table == NULL)
- return err;
+ if (cfg->fc_nlinfo.nlh &&
+ !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
+ table = fib6_get_table(net, cfg->fc_table);
+ if (!table) {
+ pr_warn("NLM_F_CREATE should be specified when creating new route\n");
+ table = fib6_new_table(net, cfg->fc_table);
+ }
+ } else {
+ table = fib6_new_table(net, cfg->fc_table);
+ }
+ if (!table) {
+ err = -ENOBUFS;
+ goto err;
+ }
- read_lock_bh(&table->tb6_lock);
+ rt = fib6_info_alloc(gfp_flags, !cfg->fc_nh_id);
+ if (!rt) {
+ err = -ENOMEM;
+ goto err;
+ }
- fn = fib6_locate(&table->tb6_root,
- &cfg->fc_dst, cfg->fc_dst_len,
- &cfg->fc_src, cfg->fc_src_len);
-
- if (fn) {
- for (rt = fn->leaf; rt; rt = rt->u.next) {
- if (cfg->fc_ifindex &&
- (rt->rt6i_dev == NULL ||
- rt->rt6i_dev->ifindex != cfg->fc_ifindex))
- continue;
- if (cfg->fc_flags & RTF_GATEWAY &&
- !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
- continue;
- if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
- continue;
- dst_hold(&rt->u.dst);
- read_unlock_bh(&table->tb6_lock);
+ rt->fib6_metrics = ip_fib_metrics_init(cfg->fc_mx, cfg->fc_mx_len,
+ extack);
+ if (IS_ERR(rt->fib6_metrics)) {
+ err = PTR_ERR(rt->fib6_metrics);
+ goto free;
+ }
+
+ if (cfg->fc_flags & RTF_ADDRCONF)
+ rt->dst_nocount = true;
+
+ if (cfg->fc_flags & RTF_EXPIRES)
+ fib6_set_expires(rt, jiffies +
+ clock_t_to_jiffies(cfg->fc_expires));
+
+ if (cfg->fc_protocol == RTPROT_UNSPEC)
+ cfg->fc_protocol = RTPROT_BOOT;
+
+ rt->fib6_protocol = cfg->fc_protocol;
+ rt->fib6_table = table;
+ rt->fib6_metric = cfg->fc_metric;
+ rt->fib6_type = cfg->fc_type ? : RTN_UNICAST;
+ rt->fib6_flags = cfg->fc_flags & ~RTF_GATEWAY;
+
+ ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
+ rt->fib6_dst.plen = cfg->fc_dst_len;
- return __ip6_del_rt(rt, &cfg->fc_nlinfo);
+#ifdef CONFIG_IPV6_SUBTREES
+ ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
+ rt->fib6_src.plen = cfg->fc_src_len;
+#endif
+ return rt;
+free:
+ kfree(rt);
+err:
+ return ERR_PTR(err);
+}
+
+static int ip6_route_info_create_nh(struct fib6_info *rt,
+ struct fib6_config *cfg,
+ gfp_t gfp_flags,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = cfg->fc_nlinfo.nl_net;
+ struct fib6_nh *fib6_nh;
+ int err;
+
+ if (cfg->fc_nh_id) {
+ struct nexthop *nh;
+
+ rcu_read_lock();
+
+ nh = nexthop_find_by_id(net, cfg->fc_nh_id);
+ if (!nh) {
+ err = -EINVAL;
+ NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
+ goto out_free;
}
+
+ err = fib6_check_nexthop(nh, cfg, extack);
+ if (err)
+ goto out_free;
+
+ if (!nexthop_get(nh)) {
+ NL_SET_ERR_MSG(extack, "Nexthop has been deleted");
+ err = -ENOENT;
+ goto out_free;
+ }
+
+ rt->nh = nh;
+ fib6_nh = nexthop_fib6_nh(rt->nh);
+
+ rcu_read_unlock();
+ } else {
+ int addr_type;
+
+ err = fib6_nh_init(net, rt->fib6_nh, cfg, gfp_flags, extack);
+ if (err)
+ goto out_release;
+
+ fib6_nh = rt->fib6_nh;
+
+ /* We cannot add true routes via loopback here, they would
+ * result in kernel looping; promote them to reject routes
+ */
+ addr_type = ipv6_addr_type(&cfg->fc_dst);
+ if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh->fib_nh_dev,
+ addr_type))
+ rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP;
}
- read_unlock_bh(&table->tb6_lock);
+ if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
+ struct net_device *dev = fib6_nh->fib_nh_dev;
+
+ if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
+ NL_SET_ERR_MSG(extack, "Invalid source address");
+ err = -EINVAL;
+ goto out_release;
+ }
+ rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
+ rt->fib6_prefsrc.plen = 128;
+ }
+
+ return 0;
+out_release:
+ fib6_info_release(rt);
+ return err;
+out_free:
+ rcu_read_unlock();
+ ip_fib_metrics_put(rt->fib6_metrics);
+ kfree(rt);
return err;
}
-/*
- * Handle redirects
- */
-struct ip6rd_flowi {
- struct flowi fl;
- struct in6_addr gateway;
-};
+int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
+ struct netlink_ext_ack *extack)
+{
+ struct fib6_info *rt;
+ int err;
-static struct rt6_info *__ip6_route_redirect(struct fib6_table *table,
- struct flowi *fl,
- int flags)
+ err = fib6_config_validate(cfg, extack);
+ if (err)
+ return err;
+
+ rt = ip6_route_info_create(cfg, gfp_flags, extack);
+ if (IS_ERR(rt))
+ return PTR_ERR(rt);
+
+ err = ip6_route_info_create_nh(rt, cfg, gfp_flags, extack);
+ if (err)
+ return err;
+
+ err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
+ fib6_info_release(rt);
+
+ return err;
+}
+
+static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
{
- struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl;
- struct rt6_info *rt;
- struct fib6_node *fn;
+ struct net *net = info->nl_net;
+ struct fib6_table *table;
+ int err;
- /*
- * Get the "current" route for this destination and
- * check if the redirect has come from approriate router.
- *
- * RFC 2461 specifies that redirects should only be
- * accepted if they come from the nexthop to the target.
- * Due to the way the routes are chosen, this notion
- * is a bit fuzzy and one might need to check all possible
- * routes.
- */
+ if (rt == net->ipv6.fib6_null_entry) {
+ err = -ENOENT;
+ goto out;
+ }
- read_lock_bh(&table->tb6_lock);
- fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
-restart:
- for (rt = fn->leaf; rt; rt = rt->u.next) {
- /*
- * Current route is on-link; redirect is always invalid.
- *
- * Seems, previous statement is not true. It could
- * be node, which looks for us as on-link (f.e. proxy ndisc)
- * But then router serving it might decide, that we should
- * know truth 8)8) --ANK (980726).
+ table = rt->fib6_table;
+ spin_lock_bh(&table->tb6_lock);
+ err = fib6_del(rt, info);
+ spin_unlock_bh(&table->tb6_lock);
+
+out:
+ fib6_info_release(rt);
+ return err;
+}
+
+int ip6_del_rt(struct net *net, struct fib6_info *rt, bool skip_notify)
+{
+ struct nl_info info = {
+ .nl_net = net,
+ .skip_notify = skip_notify
+ };
+
+ return __ip6_del_rt(rt, &info);
+}
+
+static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
+{
+ struct nl_info *info = &cfg->fc_nlinfo;
+ struct net *net = info->nl_net;
+ struct sk_buff *skb = NULL;
+ struct fib6_table *table;
+ int err = -ENOENT;
+
+ if (rt == net->ipv6.fib6_null_entry)
+ goto out_put;
+ table = rt->fib6_table;
+ spin_lock_bh(&table->tb6_lock);
+
+ if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
+ struct fib6_info *sibling, *next_sibling;
+ struct fib6_node *fn;
+
+ /* prefer to send a single notification with all hops */
+ skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
+ if (skb) {
+ u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
+
+ if (rt6_fill_node(net, skb, rt, NULL,
+ NULL, NULL, 0, RTM_DELROUTE,
+ info->portid, seq, 0) < 0) {
+ kfree_skb(skb);
+ skb = NULL;
+ } else
+ info->skip_notify = 1;
+ }
+
+ /* 'rt' points to the first sibling route. If it is not the
+ * leaf, then we do not need to send a notification. Otherwise,
+ * we need to check if the last sibling has a next route or not
+ * and emit a replace or delete notification, respectively.
*/
- if (rt6_check_expired(rt))
- continue;
- if (!(rt->rt6i_flags & RTF_GATEWAY))
- continue;
- if (fl->oif != rt->rt6i_dev->ifindex)
- continue;
- if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
- continue;
- break;
+ info->skip_notify_kernel = 1;
+ fn = rcu_dereference_protected(rt->fib6_node,
+ lockdep_is_held(&table->tb6_lock));
+ if (rcu_access_pointer(fn->leaf) == rt) {
+ struct fib6_info *last_sibling, *replace_rt;
+
+ last_sibling = list_last_entry(&rt->fib6_siblings,
+ struct fib6_info,
+ fib6_siblings);
+ replace_rt = rcu_dereference_protected(
+ last_sibling->fib6_next,
+ lockdep_is_held(&table->tb6_lock));
+ if (replace_rt)
+ call_fib6_entry_notifiers_replace(net,
+ replace_rt);
+ else
+ call_fib6_multipath_entry_notifiers(net,
+ FIB_EVENT_ENTRY_DEL,
+ rt, rt->fib6_nsiblings,
+ NULL);
+ }
+ list_for_each_entry_safe(sibling, next_sibling,
+ &rt->fib6_siblings,
+ fib6_siblings) {
+ err = fib6_del(sibling, info);
+ if (err)
+ goto out_unlock;
+ }
}
- if (!rt)
- rt = &ip6_null_entry;
- BACKTRACK(&fl->fl6_src);
-out:
- dst_hold(&rt->u.dst);
+ err = fib6_del(rt, info);
+out_unlock:
+ spin_unlock_bh(&table->tb6_lock);
+out_put:
+ fib6_info_release(rt);
+
+ if (skb) {
+ rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
+ info->nlh, gfp_any());
+ }
+ return err;
+}
- read_unlock_bh(&table->tb6_lock);
+static int __ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
+{
+ int rc = -ESRCH;
- return rt;
-};
+ if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
+ goto out;
-static struct rt6_info *ip6_route_redirect(struct in6_addr *dest,
- struct in6_addr *src,
- struct in6_addr *gateway,
- struct net_device *dev)
+ if (cfg->fc_flags & RTF_GATEWAY &&
+ !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
+ goto out;
+
+ rc = rt6_remove_exception_rt(rt);
+out:
+ return rc;
+}
+
+static int ip6_del_cached_rt(struct fib6_config *cfg, struct fib6_info *rt,
+ struct fib6_nh *nh)
{
- int flags = RT6_LOOKUP_F_HAS_SADDR;
- struct ip6rd_flowi rdfl = {
- .fl = {
- .oif = dev->ifindex,
- .nl_u = {
- .ip6_u = {
- .daddr = *dest,
- .saddr = *src,
- },
- },
- },
- .gateway = *gateway,
+ struct fib6_result res = {
+ .f6i = rt,
+ .nh = nh,
};
+ struct rt6_info *rt_cache;
- if (rt6_need_strict(dest))
- flags |= RT6_LOOKUP_F_IFACE;
+ rt_cache = rt6_find_cached_rt(&res, &cfg->fc_dst, &cfg->fc_src);
+ if (rt_cache)
+ return __ip6_del_cached_rt(rt_cache, cfg);
- return (struct rt6_info *)fib6_rule_lookup((struct flowi *)&rdfl, flags, __ip6_route_redirect);
+ return 0;
}
-void rt6_redirect(struct in6_addr *dest, struct in6_addr *src,
- struct in6_addr *saddr,
- struct neighbour *neigh, u8 *lladdr, int on_link)
+struct fib6_nh_del_cached_rt_arg {
+ struct fib6_config *cfg;
+ struct fib6_info *f6i;
+};
+
+static int fib6_nh_del_cached_rt(struct fib6_nh *nh, void *_arg)
{
- struct rt6_info *rt, *nrt = NULL;
- struct netevent_redirect netevent;
+ struct fib6_nh_del_cached_rt_arg *arg = _arg;
+ int rc;
- rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
+ rc = ip6_del_cached_rt(arg->cfg, arg->f6i, nh);
+ return rc != -ESRCH ? rc : 0;
+}
- if (rt == &ip6_null_entry) {
- if (net_ratelimit())
- printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
- "for redirect target\n");
- goto out;
+static int ip6_del_cached_rt_nh(struct fib6_config *cfg, struct fib6_info *f6i)
+{
+ struct fib6_nh_del_cached_rt_arg arg = {
+ .cfg = cfg,
+ .f6i = f6i
+ };
+
+ return nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_del_cached_rt, &arg);
+}
+
+static int ip6_route_del(struct fib6_config *cfg,
+ struct netlink_ext_ack *extack)
+{
+ struct fib6_table *table;
+ struct fib6_info *rt;
+ struct fib6_node *fn;
+ int err = -ESRCH;
+
+ table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
+ if (!table) {
+ NL_SET_ERR_MSG(extack, "FIB table does not exist");
+ return err;
}
- /*
- * We have finally decided to accept it.
- */
+ rcu_read_lock();
- neigh_update(neigh, lladdr, NUD_STALE,
- NEIGH_UPDATE_F_WEAK_OVERRIDE|
- NEIGH_UPDATE_F_OVERRIDE|
- (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
- NEIGH_UPDATE_F_ISROUTER))
- );
+ fn = fib6_locate(&table->tb6_root,
+ &cfg->fc_dst, cfg->fc_dst_len,
+ &cfg->fc_src, cfg->fc_src_len,
+ !(cfg->fc_flags & RTF_CACHE));
- /*
- * Redirect received -> path was valid.
- * Look, redirects are sent only in response to data packets,
- * so that this nexthop apparently is reachable. --ANK
- */
- dst_confirm(&rt->u.dst);
+ if (fn) {
+ for_each_fib6_node_rt_rcu(fn) {
+ struct fib6_nh *nh;
- /* Duplicate redirect: silently ignore. */
- if (neigh == rt->u.dst.neighbour)
- goto out;
+ if (rt->nh && cfg->fc_nh_id &&
+ rt->nh->id != cfg->fc_nh_id)
+ continue;
- nrt = ip6_rt_copy(rt);
- if (nrt == NULL)
- goto out;
+ if (cfg->fc_flags & RTF_CACHE) {
+ int rc = 0;
- nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
- if (on_link)
- nrt->rt6i_flags &= ~RTF_GATEWAY;
+ if (rt->nh) {
+ rc = ip6_del_cached_rt_nh(cfg, rt);
+ } else if (cfg->fc_nh_id) {
+ continue;
+ } else {
+ nh = rt->fib6_nh;
+ rc = ip6_del_cached_rt(cfg, rt, nh);
+ }
+ if (rc != -ESRCH) {
+ rcu_read_unlock();
+ return rc;
+ }
+ continue;
+ }
- ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
- nrt->rt6i_dst.plen = 128;
- nrt->u.dst.flags |= DST_HOST;
+ if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
+ continue;
+ if (cfg->fc_protocol &&
+ cfg->fc_protocol != rt->fib6_protocol)
+ continue;
- ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
- nrt->rt6i_nexthop = neigh_clone(neigh);
- /* Reset pmtu, it may be better */
- nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
- nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&nrt->u.dst));
+ if (rt->nh) {
+ if (!fib6_info_hold_safe(rt))
+ continue;
- if (ip6_ins_rt(nrt))
- goto out;
+ err = __ip6_del_rt(rt, &cfg->fc_nlinfo);
+ break;
+ }
+ if (cfg->fc_nh_id)
+ continue;
- netevent.old = &rt->u.dst;
- netevent.new = &nrt->u.dst;
- call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
+ nh = rt->fib6_nh;
+ if (cfg->fc_ifindex &&
+ (!nh->fib_nh_dev ||
+ nh->fib_nh_dev->ifindex != cfg->fc_ifindex))
+ continue;
+ if (cfg->fc_flags & RTF_GATEWAY &&
+ !ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6))
+ continue;
+ if (!fib6_info_hold_safe(rt))
+ continue;
- if (rt->rt6i_flags&RTF_CACHE) {
- ip6_del_rt(rt);
- return;
+ /* if gateway was specified only delete the one hop */
+ if (cfg->fc_flags & RTF_GATEWAY)
+ err = __ip6_del_rt(rt, &cfg->fc_nlinfo);
+ else
+ err = __ip6_del_rt_siblings(rt, cfg);
+ break;
+ }
}
+ rcu_read_unlock();
-out:
- dst_release(&rt->u.dst);
- return;
+ return err;
}
-/*
- * Handle ICMP "packet too big" messages
- * i.e. Path MTU discovery
- */
-
-void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
- struct net_device *dev, u32 pmtu)
+static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
- struct rt6_info *rt, *nrt;
- int allfrag = 0;
+ struct netevent_redirect netevent;
+ struct rt6_info *rt, *nrt = NULL;
+ struct fib6_result res = {};
+ struct ndisc_options ndopts;
+ struct inet6_dev *in6_dev;
+ struct neighbour *neigh;
+ struct rd_msg *msg;
+ int optlen, on_link;
+ u8 *lladdr;
+
+ optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
+ optlen -= sizeof(*msg);
+
+ if (optlen < 0) {
+ net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
+ return;
+ }
+
+ msg = (struct rd_msg *)icmp6_hdr(skb);
- rt = rt6_lookup(daddr, saddr, dev->ifindex, 0);
- if (rt == NULL)
+ if (ipv6_addr_is_multicast(&msg->dest)) {
+ net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
return;
+ }
- if (pmtu >= dst_mtu(&rt->u.dst))
- goto out;
+ on_link = 0;
+ if (ipv6_addr_equal(&msg->dest, &msg->target)) {
+ on_link = 1;
+ } else if (ipv6_addr_type(&msg->target) !=
+ (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
+ net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
+ return;
+ }
- if (pmtu < IPV6_MIN_MTU) {
- /*
- * According to RFC2460, PMTU is set to the IPv6 Minimum Link
- * MTU (1280) and a fragment header should always be included
- * after a node receiving Too Big message reporting PMTU is
- * less than the IPv6 Minimum Link MTU.
- */
- pmtu = IPV6_MIN_MTU;
- allfrag = 1;
+ in6_dev = __in6_dev_get(skb->dev);
+ if (!in6_dev)
+ return;
+ if (READ_ONCE(in6_dev->cnf.forwarding) ||
+ !READ_ONCE(in6_dev->cnf.accept_redirects))
+ return;
+
+ /* RFC2461 8.1:
+ * The IP source address of the Redirect MUST be the same as the current
+ * first-hop router for the specified ICMP Destination Address.
+ */
+
+ if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
+ net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
+ return;
+ }
+
+ lladdr = NULL;
+ if (ndopts.nd_opts_tgt_lladdr) {
+ lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
+ skb->dev);
+ if (!lladdr) {
+ net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
+ return;
+ }
}
- /* New mtu received -> path was valid.
- They are sent only in response to data packets,
- so that this nexthop apparently is reachable. --ANK
+ rt = dst_rt6_info(dst);
+ if (rt->rt6i_flags & RTF_REJECT) {
+ net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
+ return;
+ }
+
+ /* Redirect received -> path was valid.
+ * Look, redirects are sent only in response to data packets,
+ * so that this nexthop apparently is reachable. --ANK
*/
- dst_confirm(&rt->u.dst);
+ dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
- /* Host route. If it is static, it would be better
- not to override it, but add new one, so that
- when cache entry will expire old pmtu
- would return automatically.
+ neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
+ if (!neigh)
+ return;
+
+ /*
+ * We have finally decided to accept it.
*/
- if (rt->rt6i_flags & RTF_CACHE) {
- rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
- if (allfrag)
- rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
- dst_set_expires(&rt->u.dst, ip6_rt_mtu_expires);
- rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
+
+ ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
+ NEIGH_UPDATE_F_WEAK_OVERRIDE|
+ NEIGH_UPDATE_F_OVERRIDE|
+ (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
+ NEIGH_UPDATE_F_ISROUTER)),
+ NDISC_REDIRECT, &ndopts);
+
+ rcu_read_lock();
+ res.f6i = rcu_dereference(rt->from);
+ if (!res.f6i)
goto out;
- }
- /* Network route.
- Two cases are possible:
- 1. It is connected route. Action: COW
- 2. It is gatewayed route or NONEXTHOP route. Action: clone it.
- */
- if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
- nrt = rt6_alloc_cow(rt, daddr, saddr);
- else
- nrt = rt6_alloc_clone(rt, daddr);
-
- if (nrt) {
- nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
- if (allfrag)
- nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
-
- /* According to RFC 1981, detecting PMTU increase shouldn't be
- * happened within 5 mins, the recommended timer is 10 mins.
- * Here this route expiration time is set to ip6_rt_mtu_expires
- * which is 10 mins. After 10 mins the decreased pmtu is expired
- * and detecting PMTU increase will be automatically happened.
- */
- dst_set_expires(&nrt->u.dst, ip6_rt_mtu_expires);
- nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
+ if (res.f6i->nh) {
+ struct fib6_nh_match_arg arg = {
+ .dev = dst_dev_rcu(dst),
+ .gw = &rt->rt6i_gateway,
+ };
+
+ nexthop_for_each_fib6_nh(res.f6i->nh,
+ fib6_nh_find_match, &arg);
- ip6_ins_rt(nrt);
+ /* fib6_info uses a nexthop that does not have fib6_nh
+ * using the dst->dev. Should be impossible
+ */
+ if (!arg.match)
+ goto out;
+ res.nh = arg.match;
+ } else {
+ res.nh = res.f6i->fib6_nh;
}
-out:
- dst_release(&rt->u.dst);
-}
-/*
- * Misc support functions
- */
+ res.fib6_flags = res.f6i->fib6_flags;
+ res.fib6_type = res.f6i->fib6_type;
+ nrt = ip6_rt_cache_alloc(&res, &msg->dest, NULL);
+ if (!nrt)
+ goto out;
-static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
-{
- struct rt6_info *rt = ip6_dst_alloc();
+ nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
+ if (on_link)
+ nrt->rt6i_flags &= ~RTF_GATEWAY;
- if (rt) {
- rt->u.dst.input = ort->u.dst.input;
- rt->u.dst.output = ort->u.dst.output;
-
- memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
- rt->u.dst.error = ort->u.dst.error;
- rt->u.dst.dev = ort->u.dst.dev;
- if (rt->u.dst.dev)
- dev_hold(rt->u.dst.dev);
- rt->rt6i_idev = ort->rt6i_idev;
- if (rt->rt6i_idev)
- in6_dev_hold(rt->rt6i_idev);
- rt->u.dst.lastuse = jiffies;
- rt->rt6i_expires = 0;
-
- ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
- rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
- rt->rt6i_metric = 0;
+ nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
- memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
-#ifdef CONFIG_IPV6_SUBTREES
- memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
-#endif
- rt->rt6i_table = ort->rt6i_table;
+ /* rt6_insert_exception() will take care of duplicated exceptions */
+ if (rt6_insert_exception(nrt, &res)) {
+ dst_release_immediate(&nrt->dst);
+ goto out;
}
- return rt;
+
+ netevent.old = &rt->dst;
+ netevent.new = &nrt->dst;
+ netevent.daddr = &msg->dest;
+ netevent.neigh = neigh;
+ call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
+
+out:
+ rcu_read_unlock();
+ neigh_release(neigh);
}
#ifdef CONFIG_IPV6_ROUTE_INFO
-static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
- struct in6_addr *gwaddr, int ifindex)
+static struct fib6_info *rt6_get_route_info(struct net *net,
+ const struct in6_addr *prefix, int prefixlen,
+ const struct in6_addr *gwaddr,
+ struct net_device *dev)
{
+ u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
+ int ifindex = dev->ifindex;
struct fib6_node *fn;
- struct rt6_info *rt = NULL;
+ struct fib6_info *rt = NULL;
struct fib6_table *table;
- table = fib6_get_table(RT6_TABLE_INFO);
- if (table == NULL)
+ table = fib6_get_table(net, tb_id);
+ if (!table)
return NULL;
- write_lock_bh(&table->tb6_lock);
- fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
+ rcu_read_lock();
+ fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
if (!fn)
goto out;
- for (rt = fn->leaf; rt; rt = rt->u.next) {
- if (rt->rt6i_dev->ifindex != ifindex)
+ for_each_fib6_node_rt_rcu(fn) {
+ /* these routes do not use nexthops */
+ if (rt->nh)
+ continue;
+ if (rt->fib6_nh->fib_nh_dev->ifindex != ifindex)
continue;
- if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
+ if (!(rt->fib6_flags & RTF_ROUTEINFO) ||
+ !rt->fib6_nh->fib_nh_gw_family)
continue;
- if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
+ if (!ipv6_addr_equal(&rt->fib6_nh->fib_nh_gw6, gwaddr))
+ continue;
+ if (!fib6_info_hold_safe(rt))
continue;
- dst_hold(&rt->u.dst);
break;
}
out:
- write_unlock_bh(&table->tb6_lock);
+ rcu_read_unlock();
return rt;
}
-static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
- struct in6_addr *gwaddr, int ifindex,
- unsigned pref)
+static struct fib6_info *rt6_add_route_info(struct net *net,
+ const struct in6_addr *prefix, int prefixlen,
+ const struct in6_addr *gwaddr,
+ struct net_device *dev,
+ unsigned int pref)
{
struct fib6_config cfg = {
- .fc_table = RT6_TABLE_INFO,
- .fc_metric = 1024,
- .fc_ifindex = ifindex,
+ .fc_metric = IP6_RT_PRIO_USER,
+ .fc_ifindex = dev->ifindex,
.fc_dst_len = prefixlen,
.fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
RTF_UP | RTF_PREF(pref),
+ .fc_protocol = RTPROT_RA,
+ .fc_type = RTN_UNICAST,
+ .fc_nlinfo.portid = 0,
+ .fc_nlinfo.nlh = NULL,
+ .fc_nlinfo.nl_net = net,
};
- ipv6_addr_copy(&cfg.fc_dst, prefix);
- ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
+ cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
+ cfg.fc_dst = *prefix;
+ cfg.fc_gateway = *gwaddr;
/* We should treat it as a default route if prefix length is 0. */
if (!prefixlen)
cfg.fc_flags |= RTF_DEFAULT;
- ip6_route_add(&cfg);
+ ip6_route_add(&cfg, GFP_ATOMIC, NULL);
- return rt6_get_route_info(prefix, prefixlen, gwaddr, ifindex);
+ return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
}
#endif
-struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
-{
- struct rt6_info *rt;
+struct fib6_info *rt6_get_dflt_router(struct net *net,
+ const struct in6_addr *addr,
+ struct net_device *dev)
+{
+ u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
+ struct fib6_info *rt;
struct fib6_table *table;
- table = fib6_get_table(RT6_TABLE_DFLT);
- if (table == NULL)
+ table = fib6_get_table(net, tb_id);
+ if (!table)
return NULL;
- write_lock_bh(&table->tb6_lock);
- for (rt = table->tb6_root.leaf; rt; rt=rt->u.next) {
- if (dev == rt->rt6i_dev &&
- ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
- ipv6_addr_equal(&rt->rt6i_gateway, addr))
+ rcu_read_lock();
+ for_each_fib6_node_rt_rcu(&table->tb6_root) {
+ struct fib6_nh *nh;
+
+ /* RA routes do not use nexthops */
+ if (rt->nh)
+ continue;
+
+ nh = rt->fib6_nh;
+ if (dev == nh->fib_nh_dev &&
+ ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
+ ipv6_addr_equal(&nh->fib_nh_gw6, addr))
break;
}
- if (rt)
- dst_hold(&rt->u.dst);
- write_unlock_bh(&table->tb6_lock);
+ if (rt && !fib6_info_hold_safe(rt))
+ rt = NULL;
+ rcu_read_unlock();
return rt;
}
-struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
+struct fib6_info *rt6_add_dflt_router(struct net *net,
+ const struct in6_addr *gwaddr,
struct net_device *dev,
- unsigned int pref)
+ unsigned int pref,
+ u32 defrtr_usr_metric,
+ int lifetime)
{
struct fib6_config cfg = {
- .fc_table = RT6_TABLE_DFLT,
- .fc_metric = 1024,
+ .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
+ .fc_metric = defrtr_usr_metric,
.fc_ifindex = dev->ifindex,
.fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
+ .fc_protocol = RTPROT_RA,
+ .fc_type = RTN_UNICAST,
+ .fc_nlinfo.portid = 0,
+ .fc_nlinfo.nlh = NULL,
+ .fc_nlinfo.nl_net = net,
+ .fc_expires = jiffies_to_clock_t(lifetime * HZ),
};
- ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
+ cfg.fc_gateway = *gwaddr;
- ip6_route_add(&cfg);
+ if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
+ struct fib6_table *table;
- return rt6_get_dflt_router(gwaddr, dev);
+ table = fib6_get_table(dev_net(dev), cfg.fc_table);
+ if (table)
+ table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
+ }
+
+ return rt6_get_dflt_router(net, gwaddr, dev);
}
-void rt6_purge_dflt_routers(void)
+static void __rt6_purge_dflt_routers(struct net *net,
+ struct fib6_table *table)
{
- struct rt6_info *rt;
- struct fib6_table *table;
-
- /* NOTE: Keep consistent with rt6_get_dflt_router */
- table = fib6_get_table(RT6_TABLE_DFLT);
- if (table == NULL)
- return;
+ struct fib6_info *rt;
restart:
- read_lock_bh(&table->tb6_lock);
- for (rt = table->tb6_root.leaf; rt; rt = rt->u.next) {
- if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
- dst_hold(&rt->u.dst);
- read_unlock_bh(&table->tb6_lock);
- ip6_del_rt(rt);
+ rcu_read_lock();
+ for_each_fib6_node_rt_rcu(&table->tb6_root) {
+ struct net_device *dev = fib6_info_nh_dev(rt);
+ struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;
+
+ if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
+ (!idev || idev->cnf.accept_ra != 2) &&
+ fib6_info_hold_safe(rt)) {
+ rcu_read_unlock();
+ ip6_del_rt(net, rt, false);
goto restart;
}
}
- read_unlock_bh(&table->tb6_lock);
+ rcu_read_unlock();
+
+ table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
}
-static void rtmsg_to_fib6_config(struct in6_rtmsg *rtmsg,
- struct fib6_config *cfg)
+void rt6_purge_dflt_routers(struct net *net)
{
- memset(cfg, 0, sizeof(*cfg));
+ struct fib6_table *table;
+ struct hlist_head *head;
+ unsigned int h;
+
+ rcu_read_lock();
- cfg->fc_table = RT6_TABLE_MAIN;
- cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
- cfg->fc_metric = rtmsg->rtmsg_metric;
- cfg->fc_expires = rtmsg->rtmsg_info;
- cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
- cfg->fc_src_len = rtmsg->rtmsg_src_len;
- cfg->fc_flags = rtmsg->rtmsg_flags;
+ for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
+ head = &net->ipv6.fib_table_hash[h];
+ hlist_for_each_entry_rcu(table, head, tb6_hlist) {
+ if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
+ __rt6_purge_dflt_routers(net, table);
+ }
+ }
- ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
- ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
- ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
+ rcu_read_unlock();
}
-int ipv6_route_ioctl(unsigned int cmd, void __user *arg)
+static void rtmsg_to_fib6_config(struct net *net,
+ struct in6_rtmsg *rtmsg,
+ struct fib6_config *cfg)
+{
+ *cfg = (struct fib6_config){
+ .fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
+ : RT6_TABLE_MAIN,
+ .fc_ifindex = rtmsg->rtmsg_ifindex,
+ .fc_metric = rtmsg->rtmsg_metric,
+ .fc_expires = rtmsg->rtmsg_info,
+ .fc_dst_len = rtmsg->rtmsg_dst_len,
+ .fc_src_len = rtmsg->rtmsg_src_len,
+ .fc_flags = rtmsg->rtmsg_flags,
+ .fc_type = rtmsg->rtmsg_type,
+
+ .fc_nlinfo.nl_net = net,
+
+ .fc_dst = rtmsg->rtmsg_dst,
+ .fc_src = rtmsg->rtmsg_src,
+ .fc_gateway = rtmsg->rtmsg_gateway,
+ };
+}
+
+int ipv6_route_ioctl(struct net *net, unsigned int cmd, struct in6_rtmsg *rtmsg)
{
struct fib6_config cfg;
- struct in6_rtmsg rtmsg;
int err;
- switch(cmd) {
- case SIOCADDRT: /* Add a route */
- case SIOCDELRT: /* Delete a route */
- if (!capable(CAP_NET_ADMIN))
- return -EPERM;
- err = copy_from_user(&rtmsg, arg,
- sizeof(struct in6_rtmsg));
- if (err)
- return -EFAULT;
+ if (cmd != SIOCADDRT && cmd != SIOCDELRT)
+ return -EINVAL;
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
- rtmsg_to_fib6_config(&rtmsg, &cfg);
+ rtmsg_to_fib6_config(net, rtmsg, &cfg);
- rtnl_lock();
- switch (cmd) {
- case SIOCADDRT:
- err = ip6_route_add(&cfg);
- break;
- case SIOCDELRT:
- err = ip6_route_del(&cfg);
- break;
- default:
- err = -EINVAL;
- }
- rtnl_unlock();
-
- return err;
- };
+ switch (cmd) {
+ case SIOCADDRT:
+ /* Only do the default setting of fc_metric in route adding */
+ if (cfg.fc_metric == 0)
+ cfg.fc_metric = IP6_RT_PRIO_USER;
+ err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
+ break;
+ case SIOCDELRT:
+ err = ip6_route_del(&cfg, NULL);
+ break;
+ }
- return -EINVAL;
+ return err;
}
/*
* Drop the packet on the floor
*/
-static inline int ip6_pkt_drop(struct sk_buff *skb, int code)
+static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
{
- int type = ipv6_addr_type(&skb->nh.ipv6h->daddr);
- if (type == IPV6_ADDR_ANY || type == IPV6_ADDR_RESERVED)
- IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_INADDRERRORS);
+ struct dst_entry *dst = skb_dst(skb);
+ struct net_device *dev = dst_dev(dst);
+ struct net *net = dev_net(dev);
+ struct inet6_dev *idev;
+ SKB_DR(reason);
+ int type;
+
+ if (netif_is_l3_master(skb->dev) ||
+ dev == net->loopback_dev)
+ idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
+ else
+ idev = ip6_dst_idev(dst);
+
+ switch (ipstats_mib_noroutes) {
+ case IPSTATS_MIB_INNOROUTES:
+ type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
+ if (type == IPV6_ADDR_ANY) {
+ SKB_DR_SET(reason, IP_INADDRERRORS);
+ IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
+ break;
+ }
+ SKB_DR_SET(reason, IP_INNOROUTES);
+ fallthrough;
+ case IPSTATS_MIB_OUTNOROUTES:
+ SKB_DR_OR(reason, IP_OUTNOROUTES);
+ IP6_INC_STATS(net, idev, ipstats_mib_noroutes);
+ break;
+ }
- IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_OUTNOROUTES);
- icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0, skb->dev);
- kfree_skb(skb);
+ /* Start over by dropping the dst for l3mdev case */
+ if (netif_is_l3_master(skb->dev))
+ skb_dst_drop(skb);
+
+ icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
+ kfree_skb_reason(skb, reason);
return 0;
}
static int ip6_pkt_discard(struct sk_buff *skb)
{
- return ip6_pkt_drop(skb, ICMPV6_NOROUTE);
+ return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}
-static int ip6_pkt_discard_out(struct sk_buff *skb)
+static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- skb->dev = skb->dst->dev;
- return ip6_pkt_discard(skb);
+ skb->dev = skb_dst_dev(skb);
+ return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
}
-#ifdef CONFIG_IPV6_MULTIPLE_TABLES
-
static int ip6_pkt_prohibit(struct sk_buff *skb)
{
- return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED);
+ return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
+}
+
+static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ skb->dev = skb_dst_dev(skb);
+ return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
}
-static int ip6_pkt_prohibit_out(struct sk_buff *skb)
+/*
+ * Allocate a dst for local (unicast / anycast) address.
+ */
+
+struct fib6_info *addrconf_f6i_alloc(struct net *net,
+ struct inet6_dev *idev,
+ const struct in6_addr *addr,
+ bool anycast, gfp_t gfp_flags,
+ struct netlink_ext_ack *extack)
{
- skb->dev = skb->dst->dev;
- return ip6_pkt_prohibit(skb);
+ struct fib6_config cfg = {
+ .fc_table = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL,
+ .fc_ifindex = idev->dev->ifindex,
+ .fc_flags = RTF_UP | RTF_NONEXTHOP,
+ .fc_dst = *addr,
+ .fc_dst_len = 128,
+ .fc_protocol = RTPROT_KERNEL,
+ .fc_nlinfo.nl_net = net,
+ .fc_ignore_dev_down = true,
+ };
+ struct fib6_info *f6i;
+ int err;
+
+ if (anycast) {
+ cfg.fc_type = RTN_ANYCAST;
+ cfg.fc_flags |= RTF_ANYCAST;
+ } else {
+ cfg.fc_type = RTN_LOCAL;
+ cfg.fc_flags |= RTF_LOCAL;
+ }
+
+ f6i = ip6_route_info_create(&cfg, gfp_flags, extack);
+ if (IS_ERR(f6i))
+ return f6i;
+
+ err = ip6_route_info_create_nh(f6i, &cfg, gfp_flags, extack);
+ if (err)
+ return ERR_PTR(err);
+
+ f6i->dst_nocount = true;
+
+ if (!anycast &&
+ (READ_ONCE(net->ipv6.devconf_all->disable_policy) ||
+ READ_ONCE(idev->cnf.disable_policy)))
+ f6i->dst_nopolicy = true;
+
+ return f6i;
}
-static int ip6_pkt_blk_hole(struct sk_buff *skb)
+/* remove deleted ip from prefsrc entries */
+struct arg_dev_net_ip {
+ struct net *net;
+ struct in6_addr *addr;
+};
+
+static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
{
- kfree_skb(skb);
+ struct net *net = ((struct arg_dev_net_ip *)arg)->net;
+ struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
+
+ if (!rt->nh &&
+ rt != net->ipv6.fib6_null_entry &&
+ ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr) &&
+ !ipv6_chk_addr(net, addr, rt->fib6_nh->fib_nh_dev, 0)) {
+ spin_lock_bh(&rt6_exception_lock);
+ /* remove prefsrc entry */
+ rt->fib6_prefsrc.plen = 0;
+ spin_unlock_bh(&rt6_exception_lock);
+ }
return 0;
}
-#endif
+void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
+{
+ struct net *net = dev_net(ifp->idev->dev);
+ struct arg_dev_net_ip adni = {
+ .net = net,
+ .addr = &ifp->addr,
+ };
+ fib6_clean_all(net, fib6_remove_prefsrc, &adni);
+}
-/*
- * Allocate a dst for local (unicast / anycast) address.
- */
+#define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT)
-struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
- const struct in6_addr *addr,
- int anycast)
+/* Remove routers and update dst entries when gateway turn into host. */
+static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
{
- struct rt6_info *rt = ip6_dst_alloc();
+ struct in6_addr *gateway = (struct in6_addr *)arg;
+ struct fib6_nh *nh;
- if (rt == NULL)
- return ERR_PTR(-ENOMEM);
+ /* RA routes do not use nexthops */
+ if (rt->nh)
+ return 0;
- dev_hold(&loopback_dev);
- in6_dev_hold(idev);
+ nh = rt->fib6_nh;
+ if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
+ nh->fib_nh_gw_family && ipv6_addr_equal(gateway, &nh->fib_nh_gw6))
+ return -1;
- rt->u.dst.flags = DST_HOST;
- rt->u.dst.input = ip6_input;
- rt->u.dst.output = ip6_output;
- rt->rt6i_dev = &loopback_dev;
- rt->rt6i_idev = idev;
- rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
- rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
- rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
- rt->u.dst.obsolete = -1;
+ /* Further clean up cached routes in exception table.
+ * This is needed because cached route may have a different
+ * gateway than its 'parent' in the case of an ip redirect.
+ */
+ fib6_nh_exceptions_clean_tohost(nh, gateway);
- rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
- if (anycast)
- rt->rt6i_flags |= RTF_ANYCAST;
- else
- rt->rt6i_flags |= RTF_LOCAL;
- rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
- if (rt->rt6i_nexthop == NULL) {
- dst_free(&rt->u.dst);
- return ERR_PTR(-ENOMEM);
+ return 0;
+}
+
+void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
+{
+ fib6_clean_all(net, fib6_clean_tohost, gateway);
+}
+
+struct arg_netdev_event {
+ const struct net_device *dev;
+ union {
+ unsigned char nh_flags;
+ unsigned long event;
+ };
+};
+
+static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
+{
+ struct fib6_info *iter;
+ struct fib6_node *fn;
+
+ fn = rcu_dereference_protected(rt->fib6_node,
+ lockdep_is_held(&rt->fib6_table->tb6_lock));
+ iter = rcu_dereference_protected(fn->leaf,
+ lockdep_is_held(&rt->fib6_table->tb6_lock));
+ while (iter) {
+ if (iter->fib6_metric == rt->fib6_metric &&
+ rt6_qualify_for_ecmp(iter))
+ return iter;
+ iter = rcu_dereference_protected(iter->fib6_next,
+ lockdep_is_held(&rt->fib6_table->tb6_lock));
}
- ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
- rt->rt6i_dst.plen = 128;
- rt->rt6i_table = fib6_get_table(RT6_TABLE_LOCAL);
+ return NULL;
+}
- atomic_set(&rt->u.dst.__refcnt, 1);
+/* only called for fib entries with builtin fib6_nh */
+static bool rt6_is_dead(const struct fib6_info *rt)
+{
+ if (rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD ||
+ (rt->fib6_nh->fib_nh_flags & RTNH_F_LINKDOWN &&
+ ip6_ignore_linkdown(rt->fib6_nh->fib_nh_dev)))
+ return true;
- return rt;
+ return false;
}
-static int fib6_ifdown(struct rt6_info *rt, void *arg)
+static int rt6_multipath_total_weight(const struct fib6_info *rt)
{
- if (((void*)rt->rt6i_dev == arg || arg == NULL) &&
- rt != &ip6_null_entry) {
- RT6_TRACE("deleted by ifdown %p\n", rt);
- return -1;
+ struct fib6_info *iter;
+ int total = 0;
+
+ if (!rt6_is_dead(rt))
+ total += rt->fib6_nh->fib_nh_weight;
+
+ list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
+ if (!rt6_is_dead(iter))
+ total += iter->fib6_nh->fib_nh_weight;
+ }
+
+ return total;
+}
+
+static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
+{
+ int upper_bound = -1;
+
+ if (!rt6_is_dead(rt)) {
+ *weight += rt->fib6_nh->fib_nh_weight;
+ upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
+ total) - 1;
}
+ atomic_set(&rt->fib6_nh->fib_nh_upper_bound, upper_bound);
+}
+
+static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
+{
+ struct fib6_info *iter;
+ int weight = 0;
+
+ rt6_upper_bound_set(rt, &weight, total);
+
+ list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
+ rt6_upper_bound_set(iter, &weight, total);
+}
+
+void rt6_multipath_rebalance(struct fib6_info *rt)
+{
+ struct fib6_info *first;
+ int total;
+
+ /* In case the entire multipath route was marked for flushing,
+ * then there is no need to rebalance upon the removal of every
+ * sibling route.
+ */
+ if (!rt->fib6_nsiblings || rt->should_flush)
+ return;
+
+ /* During lookup routes are evaluated in order, so we need to
+ * make sure upper bounds are assigned from the first sibling
+ * onwards.
+ */
+ first = rt6_multipath_first_sibling(rt);
+ if (WARN_ON_ONCE(!first))
+ return;
+
+ total = rt6_multipath_total_weight(first);
+ rt6_multipath_upper_bound_set(first, total);
+}
+
+static int fib6_ifup(struct fib6_info *rt, void *p_arg)
+{
+ const struct arg_netdev_event *arg = p_arg;
+ struct net *net = dev_net(arg->dev);
+
+ if (rt != net->ipv6.fib6_null_entry && !rt->nh &&
+ rt->fib6_nh->fib_nh_dev == arg->dev) {
+ rt->fib6_nh->fib_nh_flags &= ~arg->nh_flags;
+ fib6_update_sernum_upto_root(net, rt);
+ rt6_multipath_rebalance(rt);
+ }
+
return 0;
}
-void rt6_ifdown(struct net_device *dev)
+void rt6_sync_up(struct net_device *dev, unsigned char nh_flags)
+{
+ struct arg_netdev_event arg = {
+ .dev = dev,
+ {
+ .nh_flags = nh_flags,
+ },
+ };
+
+ if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
+ arg.nh_flags |= RTNH_F_LINKDOWN;
+
+ fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
+}
+
+/* only called for fib entries with inline fib6_nh */
+static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
+ const struct net_device *dev)
+{
+ struct fib6_info *iter;
+
+ if (rt->fib6_nh->fib_nh_dev == dev)
+ return true;
+ list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
+ if (iter->fib6_nh->fib_nh_dev == dev)
+ return true;
+
+ return false;
+}
+
+static void rt6_multipath_flush(struct fib6_info *rt)
{
- fib6_clean_all(fib6_ifdown, 0, dev);
+ struct fib6_info *iter;
+
+ rt->should_flush = 1;
+ list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
+ iter->should_flush = 1;
}
-struct rt6_mtu_change_arg
+static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
+ const struct net_device *down_dev)
{
+ struct fib6_info *iter;
+ unsigned int dead = 0;
+
+ if (rt->fib6_nh->fib_nh_dev == down_dev ||
+ rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD)
+ dead++;
+ list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
+ if (iter->fib6_nh->fib_nh_dev == down_dev ||
+ iter->fib6_nh->fib_nh_flags & RTNH_F_DEAD)
+ dead++;
+
+ return dead;
+}
+
+static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
+ const struct net_device *dev,
+ unsigned char nh_flags)
+{
+ struct fib6_info *iter;
+
+ if (rt->fib6_nh->fib_nh_dev == dev)
+ rt->fib6_nh->fib_nh_flags |= nh_flags;
+ list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
+ if (iter->fib6_nh->fib_nh_dev == dev)
+ iter->fib6_nh->fib_nh_flags |= nh_flags;
+}
+
+/* called with write lock held for table with rt */
+static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
+{
+ const struct arg_netdev_event *arg = p_arg;
+ const struct net_device *dev = arg->dev;
+ struct net *net = dev_net(dev);
+
+ if (rt == net->ipv6.fib6_null_entry || rt->nh)
+ return 0;
+
+ switch (arg->event) {
+ case NETDEV_UNREGISTER:
+ return rt->fib6_nh->fib_nh_dev == dev ? -1 : 0;
+ case NETDEV_DOWN:
+ if (rt->should_flush)
+ return -1;
+ if (!rt->fib6_nsiblings)
+ return rt->fib6_nh->fib_nh_dev == dev ? -1 : 0;
+ if (rt6_multipath_uses_dev(rt, dev)) {
+ unsigned int count;
+
+ count = rt6_multipath_dead_count(rt, dev);
+ if (rt->fib6_nsiblings + 1 == count) {
+ rt6_multipath_flush(rt);
+ return -1;
+ }
+ rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
+ RTNH_F_LINKDOWN);
+ fib6_update_sernum(net, rt);
+ rt6_multipath_rebalance(rt);
+ }
+ return -2;
+ case NETDEV_CHANGE:
+ if (rt->fib6_nh->fib_nh_dev != dev ||
+ rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
+ break;
+ rt->fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;
+ rt6_multipath_rebalance(rt);
+ break;
+ }
+
+ return 0;
+}
+
+void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
+{
+ struct arg_netdev_event arg = {
+ .dev = dev,
+ {
+ .event = event,
+ },
+ };
+ struct net *net = dev_net(dev);
+
+ if (net->ipv6.sysctl.skip_notify_on_dev_down)
+ fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
+ else
+ fib6_clean_all(net, fib6_ifdown, &arg);
+}
+
+void rt6_disable_ip(struct net_device *dev, unsigned long event)
+{
+ rt6_sync_down_dev(dev, event);
+ rt6_uncached_list_flush_dev(dev);
+ neigh_ifdown(&nd_tbl, dev);
+}
+
+struct rt6_mtu_change_arg {
struct net_device *dev;
- unsigned mtu;
+ unsigned int mtu;
+ struct fib6_info *f6i;
};
-static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
+static int fib6_nh_mtu_change(struct fib6_nh *nh, void *_arg)
+{
+ struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *)_arg;
+ struct fib6_info *f6i = arg->f6i;
+
+ /* For administrative MTU increase, there is no way to discover
+ * IPv6 PMTU increase, so PMTU increase should be updated here.
+ * Since RFC 1981 doesn't include administrative MTU increase
+ * update PMTU increase is a MUST. (i.e. jumbo frame)
+ */
+ if (nh->fib_nh_dev == arg->dev) {
+ struct inet6_dev *idev = __in6_dev_get(arg->dev);
+ u32 mtu = f6i->fib6_pmtu;
+
+ if (mtu >= arg->mtu ||
+ (mtu < arg->mtu && mtu == idev->cnf.mtu6))
+ fib6_metric_set(f6i, RTAX_MTU, arg->mtu);
+
+ spin_lock_bh(&rt6_exception_lock);
+ rt6_exceptions_update_pmtu(idev, nh, arg->mtu);
+ spin_unlock_bh(&rt6_exception_lock);
+ }
+
+ return 0;
+}
+
+static int rt6_mtu_change_route(struct fib6_info *f6i, void *p_arg)
{
struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
struct inet6_dev *idev;
@@ -1868,82 +5054,177 @@ static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
*/
idev = __in6_dev_get(arg->dev);
- if (idev == NULL)
+ if (!idev)
return 0;
- /* For administrative MTU increase, there is no way to discover
- IPv6 PMTU increase, so PMTU increase should be updated here.
- Since RFC 1981 doesn't include administrative MTU increase
- update PMTU increase is a MUST. (i.e. jumbo frame)
- */
- /*
- If new MTU is less than route PMTU, this new MTU will be the
- lowest MTU in the path, update the route PMTU to reflect PMTU
- decreases; if new MTU is greater than route PMTU, and the
- old MTU is the lowest MTU in the path, update the route PMTU
- to reflect the increase. In this case if the other nodes' MTU
- also have the lowest MTU, TOO BIG MESSAGE will be lead to
- PMTU discouvery.
- */
- if (rt->rt6i_dev == arg->dev &&
- !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
- (dst_mtu(&rt->u.dst) > arg->mtu ||
- (dst_mtu(&rt->u.dst) < arg->mtu &&
- dst_mtu(&rt->u.dst) == idev->cnf.mtu6)))
- rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
- rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(arg->mtu);
- return 0;
+ if (fib6_metric_locked(f6i, RTAX_MTU))
+ return 0;
+
+ arg->f6i = f6i;
+ if (f6i->nh) {
+ /* fib6_nh_mtu_change only returns 0, so this is safe */
+ return nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_mtu_change,
+ arg);
+ }
+
+ return fib6_nh_mtu_change(f6i->fib6_nh, arg);
}
-void rt6_mtu_change(struct net_device *dev, unsigned mtu)
+void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
{
struct rt6_mtu_change_arg arg = {
.dev = dev,
.mtu = mtu,
};
- fib6_clean_all(rt6_mtu_change_route, 0, &arg);
+ fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
}
-static struct nla_policy rtm_ipv6_policy[RTA_MAX+1] __read_mostly = {
+static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
+ [RTA_UNSPEC] = { .strict_start_type = RTA_DPORT + 1 },
[RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
+ [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) },
[RTA_OIF] = { .type = NLA_U32 },
[RTA_IIF] = { .type = NLA_U32 },
[RTA_PRIORITY] = { .type = NLA_U32 },
[RTA_METRICS] = { .type = NLA_NESTED },
+ [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
+ [RTA_PREF] = { .type = NLA_U8 },
+ [RTA_ENCAP_TYPE] = { .type = NLA_U16 },
+ [RTA_ENCAP] = { .type = NLA_NESTED },
+ [RTA_EXPIRES] = { .type = NLA_U32 },
+ [RTA_UID] = { .type = NLA_U32 },
+ [RTA_MARK] = { .type = NLA_U32 },
+ [RTA_TABLE] = { .type = NLA_U32 },
+ [RTA_IP_PROTO] = { .type = NLA_U8 },
+ [RTA_SPORT] = { .type = NLA_U16 },
+ [RTA_DPORT] = { .type = NLA_U16 },
+ [RTA_NH_ID] = { .type = NLA_U32 },
+ [RTA_FLOWLABEL] = { .type = NLA_BE32 },
};
+static int rtm_to_fib6_multipath_config(struct fib6_config *cfg,
+ struct netlink_ext_ack *extack,
+ bool newroute)
+{
+ struct rtnexthop *rtnh;
+ int remaining;
+
+ remaining = cfg->fc_mp_len;
+ rtnh = (struct rtnexthop *)cfg->fc_mp;
+
+ if (!rtnh_ok(rtnh, remaining)) {
+ NL_SET_ERR_MSG(extack, "Invalid nexthop configuration - no valid nexthops");
+ return -EINVAL;
+ }
+
+ do {
+ bool has_gateway = cfg->fc_flags & RTF_GATEWAY;
+ int attrlen = rtnh_attrlen(rtnh);
+
+ if (attrlen > 0) {
+ struct nlattr *nla, *attrs;
+
+ attrs = rtnh_attrs(rtnh);
+ nla = nla_find(attrs, attrlen, RTA_GATEWAY);
+ if (nla) {
+ if (nla_len(nla) < sizeof(cfg->fc_gateway)) {
+ NL_SET_ERR_MSG(extack,
+ "Invalid IPv6 address in RTA_GATEWAY");
+ return -EINVAL;
+ }
+
+ has_gateway = true;
+ }
+ }
+
+ if (newroute && (cfg->fc_nh_id || !has_gateway)) {
+ NL_SET_ERR_MSG(extack,
+ "Device only routes can not be added for IPv6 using the multipath API.");
+ return -EINVAL;
+ }
+
+ rtnh = rtnh_next(rtnh, &remaining);
+ } while (rtnh_ok(rtnh, remaining));
+
+ return lwtunnel_valid_encap_type_attr(cfg->fc_mp, cfg->fc_mp_len, extack);
+}
+
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
- struct fib6_config *cfg)
+ struct fib6_config *cfg,
+ struct netlink_ext_ack *extack)
{
- struct rtmsg *rtm;
+ bool newroute = nlh->nlmsg_type == RTM_NEWROUTE;
struct nlattr *tb[RTA_MAX+1];
+ struct rtmsg *rtm;
+ unsigned int pref;
int err;
- err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
+ err = nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
+ rtm_ipv6_policy, extack);
if (err < 0)
goto errout;
err = -EINVAL;
rtm = nlmsg_data(nlh);
- memset(cfg, 0, sizeof(*cfg));
- cfg->fc_table = rtm->rtm_table;
- cfg->fc_dst_len = rtm->rtm_dst_len;
- cfg->fc_src_len = rtm->rtm_src_len;
- cfg->fc_flags = RTF_UP;
- cfg->fc_protocol = rtm->rtm_protocol;
+ if (rtm->rtm_tos) {
+ NL_SET_ERR_MSG(extack,
+ "Invalid dsfield (tos): option not available for IPv6");
+ goto errout;
+ }
+
+ if (tb[RTA_FLOWLABEL]) {
+ NL_SET_ERR_MSG_ATTR(extack, tb[RTA_FLOWLABEL],
+ "Flow label cannot be specified for this operation");
+ goto errout;
+ }
+
+ *cfg = (struct fib6_config){
+ .fc_table = rtm->rtm_table,
+ .fc_dst_len = rtm->rtm_dst_len,
+ .fc_src_len = rtm->rtm_src_len,
+ .fc_flags = RTF_UP,
+ .fc_protocol = rtm->rtm_protocol,
+ .fc_type = rtm->rtm_type,
+
+ .fc_nlinfo.portid = NETLINK_CB(skb).portid,
+ .fc_nlinfo.nlh = nlh,
+ .fc_nlinfo.nl_net = sock_net(skb->sk),
+ };
- if (rtm->rtm_type == RTN_UNREACHABLE)
+ if (rtm->rtm_type == RTN_UNREACHABLE ||
+ rtm->rtm_type == RTN_BLACKHOLE ||
+ rtm->rtm_type == RTN_PROHIBIT ||
+ rtm->rtm_type == RTN_THROW)
cfg->fc_flags |= RTF_REJECT;
- cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
- cfg->fc_nlinfo.nlh = nlh;
+ if (rtm->rtm_type == RTN_LOCAL)
+ cfg->fc_flags |= RTF_LOCAL;
+
+ if (rtm->rtm_flags & RTM_F_CLONED)
+ cfg->fc_flags |= RTF_CACHE;
+
+ cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
+
+ if (tb[RTA_NH_ID]) {
+ if (tb[RTA_GATEWAY] || tb[RTA_OIF] ||
+ tb[RTA_MULTIPATH] || tb[RTA_ENCAP]) {
+ NL_SET_ERR_MSG(extack,
+ "Nexthop specification and nexthop id are mutually exclusive");
+ goto errout;
+ }
+ cfg->fc_nh_id = nla_get_u32(tb[RTA_NH_ID]);
+ }
if (tb[RTA_GATEWAY]) {
- nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
+ cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
cfg->fc_flags |= RTF_GATEWAY;
}
+ if (tb[RTA_VIA]) {
+ NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute");
+ goto errout;
+ }
if (tb[RTA_DST]) {
int plen = (rtm->rtm_dst_len + 7) >> 3;
@@ -1963,6 +5244,9 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
}
+ if (tb[RTA_PREFSRC])
+ cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
+
if (tb[RTA_OIF])
cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
@@ -1977,37 +5261,443 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
if (tb[RTA_TABLE])
cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
+ if (tb[RTA_MULTIPATH]) {
+ cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
+ cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
+
+ err = rtm_to_fib6_multipath_config(cfg, extack, newroute);
+ if (err < 0)
+ goto errout;
+ }
+
+ if (tb[RTA_PREF]) {
+ pref = nla_get_u8(tb[RTA_PREF]);
+ if (pref != ICMPV6_ROUTER_PREF_LOW &&
+ pref != ICMPV6_ROUTER_PREF_HIGH)
+ pref = ICMPV6_ROUTER_PREF_MEDIUM;
+ cfg->fc_flags |= RTF_PREF(pref);
+ }
+
+ if (tb[RTA_ENCAP])
+ cfg->fc_encap = tb[RTA_ENCAP];
+
+ if (tb[RTA_ENCAP_TYPE]) {
+ cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
+
+ err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
+ if (err < 0)
+ goto errout;
+ }
+
+ if (tb[RTA_EXPIRES]) {
+ unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
+
+ if (addrconf_finite_timeout(timeout)) {
+ cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
+ cfg->fc_flags |= RTF_EXPIRES;
+ }
+ }
+
err = 0;
errout:
return err;
}
-int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
+struct rt6_nh {
+ struct fib6_info *fib6_info;
+ struct fib6_config r_cfg;
+ struct list_head list;
+};
+
+static int ip6_route_info_append(struct list_head *rt6_nh_list,
+ struct fib6_info *rt,
+ struct fib6_config *r_cfg)
+{
+ struct rt6_nh *nh;
+
+ list_for_each_entry(nh, rt6_nh_list, list) {
+ /* check if fib6_info already exists */
+ if (rt6_duplicate_nexthop(nh->fib6_info, rt))
+ return -EEXIST;
+ }
+
+ nh = kzalloc(sizeof(*nh), GFP_KERNEL);
+ if (!nh)
+ return -ENOMEM;
+
+ nh->fib6_info = rt;
+ memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
+ list_add_tail(&nh->list, rt6_nh_list);
+
+ return 0;
+}
+
+static void ip6_route_mpath_notify(struct fib6_info *rt,
+ struct fib6_info *rt_last,
+ struct nl_info *info,
+ __u16 nlflags)
+{
+ /* if this is an APPEND route, then rt points to the first route
+ * inserted and rt_last points to last route inserted. Userspace
+ * wants a consistent dump of the route which starts at the first
+ * nexthop. Since sibling routes are always added at the end of
+ * the list, find the first sibling of the last route appended
+ */
+ rcu_read_lock();
+
+ if ((nlflags & NLM_F_APPEND) && rt_last &&
+ READ_ONCE(rt_last->fib6_nsiblings)) {
+ rt = list_first_or_null_rcu(&rt_last->fib6_siblings,
+ struct fib6_info,
+ fib6_siblings);
+ }
+
+ if (rt)
+ inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
+
+ rcu_read_unlock();
+}
+
+static bool ip6_route_mpath_should_notify(const struct fib6_info *rt)
+{
+ bool rt_can_ecmp = rt6_qualify_for_ecmp(rt);
+ bool should_notify = false;
+ struct fib6_info *leaf;
+ struct fib6_node *fn;
+
+ rcu_read_lock();
+ fn = rcu_dereference(rt->fib6_node);
+ if (!fn)
+ goto out;
+
+ leaf = rcu_dereference(fn->leaf);
+ if (!leaf)
+ goto out;
+
+ if (rt == leaf ||
+ (rt_can_ecmp && rt->fib6_metric == leaf->fib6_metric &&
+ rt6_qualify_for_ecmp(leaf)))
+ should_notify = true;
+out:
+ rcu_read_unlock();
+
+ return should_notify;
+}
+
+static int ip6_route_multipath_add(struct fib6_config *cfg,
+ struct netlink_ext_ack *extack)
+{
+ struct fib6_info *rt_notif = NULL, *rt_last = NULL;
+ struct nl_info *info = &cfg->fc_nlinfo;
+ struct rt6_nh *nh, *nh_safe;
+ struct fib6_config r_cfg;
+ struct rtnexthop *rtnh;
+ LIST_HEAD(rt6_nh_list);
+ struct rt6_nh *err_nh;
+ struct fib6_info *rt;
+ __u16 nlflags;
+ int remaining;
+ int attrlen;
+ int replace;
+ int nhn = 0;
+ int err;
+
+ err = fib6_config_validate(cfg, extack);
+ if (err)
+ return err;
+
+ replace = (cfg->fc_nlinfo.nlh &&
+ (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
+
+ nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
+ if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
+ nlflags |= NLM_F_APPEND;
+
+ remaining = cfg->fc_mp_len;
+ rtnh = (struct rtnexthop *)cfg->fc_mp;
+
+ /* Parse a Multipath Entry and build a list (rt6_nh_list) of
+ * fib6_info structs per nexthop
+ */
+ while (rtnh_ok(rtnh, remaining)) {
+ memcpy(&r_cfg, cfg, sizeof(*cfg));
+ if (rtnh->rtnh_ifindex)
+ r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
+
+ attrlen = rtnh_attrlen(rtnh);
+ if (attrlen > 0) {
+ struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
+
+ nla = nla_find(attrs, attrlen, RTA_GATEWAY);
+ if (nla) {
+ r_cfg.fc_gateway = nla_get_in6_addr(nla);
+ r_cfg.fc_flags |= RTF_GATEWAY;
+ }
+
+ r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
+ nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
+ if (nla)
+ r_cfg.fc_encap_type = nla_get_u16(nla);
+ }
+
+ r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
+ rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
+ if (IS_ERR(rt)) {
+ err = PTR_ERR(rt);
+ rt = NULL;
+ goto cleanup;
+ }
+
+ err = ip6_route_info_create_nh(rt, &r_cfg, GFP_KERNEL, extack);
+ if (err) {
+ rt = NULL;
+ goto cleanup;
+ }
+
+ rt->fib6_nh->fib_nh_weight = rtnh->rtnh_hops + 1;
+
+ err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
+ if (err) {
+ fib6_info_release(rt);
+ goto cleanup;
+ }
+
+ rtnh = rtnh_next(rtnh, &remaining);
+ }
+
+ /* for add and replace send one notification with all nexthops.
+ * Skip the notification in fib6_add_rt2node and send one with
+ * the full route when done
+ */
+ info->skip_notify = 1;
+
+ /* For add and replace, send one notification with all nexthops. For
+ * append, send one notification with all appended nexthops.
+ */
+ info->skip_notify_kernel = 1;
+
+ err_nh = NULL;
+ list_for_each_entry(nh, &rt6_nh_list, list) {
+ err = __ip6_ins_rt(nh->fib6_info, info, extack);
+
+ if (err) {
+ if (replace && nhn)
+ NL_SET_ERR_MSG_MOD(extack,
+ "multipath route replace failed (check consistency of installed routes)");
+ err_nh = nh;
+ goto add_errout;
+ }
+ /* save reference to last route successfully inserted */
+ rt_last = nh->fib6_info;
+
+ /* save reference to first route for notification */
+ if (!rt_notif)
+ rt_notif = nh->fib6_info;
+
+ /* Because each route is added like a single route we remove
+ * these flags after the first nexthop: if there is a collision,
+ * we have already failed to add the first nexthop:
+ * fib6_add_rt2node() has rejected it; when replacing, old
+ * nexthops have been replaced by first new, the rest should
+ * be added to it.
+ */
+ if (cfg->fc_nlinfo.nlh) {
+ cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
+ NLM_F_REPLACE);
+ cfg->fc_nlinfo.nlh->nlmsg_flags |= NLM_F_CREATE;
+ }
+ nhn++;
+ }
+
+ /* An in-kernel notification should only be sent in case the new
+ * multipath route is added as the first route in the node, or if
+ * it was appended to it. We pass 'rt_notif' since it is the first
+ * sibling and might allow us to skip some checks in the replace case.
+ */
+ if (ip6_route_mpath_should_notify(rt_notif)) {
+ enum fib_event_type fib_event;
+
+ if (rt_notif->fib6_nsiblings != nhn - 1)
+ fib_event = FIB_EVENT_ENTRY_APPEND;
+ else
+ fib_event = FIB_EVENT_ENTRY_REPLACE;
+
+ err = call_fib6_multipath_entry_notifiers(info->nl_net,
+ fib_event, rt_notif,
+ nhn - 1, extack);
+ if (err) {
+ /* Delete all the siblings that were just added */
+ err_nh = NULL;
+ goto add_errout;
+ }
+ }
+
+ /* success ... tell user about new route */
+ ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
+ goto cleanup;
+
+add_errout:
+ /* send notification for routes that were added so that
+ * the delete notifications sent by ip6_route_del are
+ * coherent
+ */
+ if (rt_notif)
+ ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
+
+ /* Delete routes that were already added */
+ list_for_each_entry(nh, &rt6_nh_list, list) {
+ if (err_nh == nh)
+ break;
+ ip6_route_del(&nh->r_cfg, extack);
+ }
+
+cleanup:
+ list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, list) {
+ fib6_info_release(nh->fib6_info);
+ list_del(&nh->list);
+ kfree(nh);
+ }
+
+ return err;
+}
+
+static int ip6_route_multipath_del(struct fib6_config *cfg,
+ struct netlink_ext_ack *extack)
+{
+ struct fib6_config r_cfg;
+ struct rtnexthop *rtnh;
+ int last_err = 0;
+ int remaining;
+ int attrlen;
+ int err;
+
+ remaining = cfg->fc_mp_len;
+ rtnh = (struct rtnexthop *)cfg->fc_mp;
+
+ /* Parse a Multipath Entry */
+ while (rtnh_ok(rtnh, remaining)) {
+ memcpy(&r_cfg, cfg, sizeof(*cfg));
+ if (rtnh->rtnh_ifindex)
+ r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
+
+ attrlen = rtnh_attrlen(rtnh);
+ if (attrlen > 0) {
+ struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
+
+ nla = nla_find(attrs, attrlen, RTA_GATEWAY);
+ if (nla) {
+ r_cfg.fc_gateway = nla_get_in6_addr(nla);
+ r_cfg.fc_flags |= RTF_GATEWAY;
+ }
+ }
+
+ err = ip6_route_del(&r_cfg, extack);
+ if (err)
+ last_err = err;
+
+ rtnh = rtnh_next(rtnh, &remaining);
+ }
+
+ return last_err;
+}
+
+static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
{
struct fib6_config cfg;
int err;
- err = rtm_to_fib6_config(skb, nlh, &cfg);
+ err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
if (err < 0)
return err;
- return ip6_route_del(&cfg);
+ if (cfg.fc_nh_id) {
+ rcu_read_lock();
+ err = !nexthop_find_by_id(sock_net(skb->sk), cfg.fc_nh_id);
+ rcu_read_unlock();
+
+ if (err) {
+ NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
+ return -EINVAL;
+ }
+ }
+
+ if (cfg.fc_mp) {
+ return ip6_route_multipath_del(&cfg, extack);
+ } else {
+ cfg.fc_delete_all_nh = 1;
+ return ip6_route_del(&cfg, extack);
+ }
}
-int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
+static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
{
struct fib6_config cfg;
int err;
- err = rtm_to_fib6_config(skb, nlh, &cfg);
+ err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
if (err < 0)
return err;
- return ip6_route_add(&cfg);
+ if (cfg.fc_metric == 0)
+ cfg.fc_metric = IP6_RT_PRIO_USER;
+
+ if (cfg.fc_mp)
+ return ip6_route_multipath_add(&cfg, extack);
+ else
+ return ip6_route_add(&cfg, GFP_KERNEL, extack);
}
-static inline size_t rt6_nlmsg_size(void)
+/* add the overhead of this fib6_nh to nexthop_len */
+static int rt6_nh_nlmsg_size(struct fib6_nh *nh, void *arg)
{
+ int *nexthop_len = arg;
+
+ *nexthop_len += nla_total_size(0) /* RTA_MULTIPATH */
+ + NLA_ALIGN(sizeof(struct rtnexthop))
+ + nla_total_size(16); /* RTA_GATEWAY */
+
+ if (nh->fib_nh_lws) {
+ /* RTA_ENCAP_TYPE */
+ *nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws);
+ /* RTA_ENCAP */
+ *nexthop_len += nla_total_size(2);
+ }
+
+ return 0;
+}
+
+static size_t rt6_nlmsg_size(struct fib6_info *f6i)
+{
+ struct fib6_info *sibling;
+ struct fib6_nh *nh;
+ int nexthop_len;
+
+ if (f6i->nh) {
+ nexthop_len = nla_total_size(4); /* RTA_NH_ID */
+ nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_nlmsg_size,
+ &nexthop_len);
+ goto common;
+ }
+
+ rcu_read_lock();
+retry:
+ nh = f6i->fib6_nh;
+ nexthop_len = 0;
+ if (READ_ONCE(f6i->fib6_nsiblings)) {
+ rt6_nh_nlmsg_size(nh, &nexthop_len);
+
+ list_for_each_entry_rcu(sibling, &f6i->fib6_siblings,
+ fib6_siblings) {
+ rt6_nh_nlmsg_size(sibling->fib6_nh, &nexthop_len);
+ if (!READ_ONCE(f6i->fib6_nsiblings))
+ goto retry;
+ }
+ }
+ rcu_read_unlock();
+ nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws);
+common:
return NLMSG_ALIGN(sizeof(struct rtmsg))
+ nla_total_size(16) /* RTA_SRC */
+ nla_total_size(16) /* RTA_DST */
@@ -2017,474 +5707,1261 @@ static inline size_t rt6_nlmsg_size(void)
+ nla_total_size(4) /* RTA_IIF */
+ nla_total_size(4) /* RTA_OIF */
+ nla_total_size(4) /* RTA_PRIORITY */
- + nla_total_size(sizeof(struct rta_cacheinfo));
+ + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
+ + nla_total_size(sizeof(struct rta_cacheinfo))
+ + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
+ + nla_total_size(1) /* RTA_PREF */
+ + nexthop_len;
}
-static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
- struct in6_addr *dst, struct in6_addr *src,
- int iif, int type, u32 pid, u32 seq,
- int prefix, unsigned int flags)
+static int rt6_fill_node_nexthop(struct sk_buff *skb, struct nexthop *nh,
+ unsigned char *flags)
{
- struct rtmsg *rtm;
- struct nlmsghdr *nlh;
- long expires;
- u32 table;
+ if (nexthop_is_multipath(nh)) {
+ struct nlattr *mp;
- if (prefix) { /* user wants prefix routes only */
- if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
- /* success since this is not a prefix route */
- return 1;
- }
+ mp = nla_nest_start_noflag(skb, RTA_MULTIPATH);
+ if (!mp)
+ goto nla_put_failure;
+
+ if (nexthop_mpath_fill_node(skb, nh, AF_INET6))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, mp);
+ } else {
+ struct fib6_nh *fib6_nh;
+
+ fib6_nh = nexthop_fib6_nh(nh);
+ if (fib_nexthop_info(skb, &fib6_nh->nh_common, AF_INET6,
+ flags, false) < 0)
+ goto nla_put_failure;
}
- nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
- if (nlh == NULL)
- return -ENOBUFS;
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+static int rt6_fill_node(struct net *net, struct sk_buff *skb,
+ struct fib6_info *rt, struct dst_entry *dst,
+ struct in6_addr *dest, struct in6_addr *src,
+ int iif, int type, u32 portid, u32 seq,
+ unsigned int flags)
+{
+ struct rt6_info *rt6 = dst_rt6_info(dst);
+ struct rt6key *rt6_dst, *rt6_src;
+ u32 *pmetrics, table, rt6_flags;
+ unsigned char nh_flags = 0;
+ struct nlmsghdr *nlh;
+ struct rtmsg *rtm;
+ long expires = 0;
+
+ nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ if (rt6) {
+ rt6_dst = &rt6->rt6i_dst;
+ rt6_src = &rt6->rt6i_src;
+ rt6_flags = rt6->rt6i_flags;
+ } else {
+ rt6_dst = &rt->fib6_dst;
+ rt6_src = &rt->fib6_src;
+ rt6_flags = rt->fib6_flags;
+ }
rtm = nlmsg_data(nlh);
rtm->rtm_family = AF_INET6;
- rtm->rtm_dst_len = rt->rt6i_dst.plen;
- rtm->rtm_src_len = rt->rt6i_src.plen;
+ rtm->rtm_dst_len = rt6_dst->plen;
+ rtm->rtm_src_len = rt6_src->plen;
rtm->rtm_tos = 0;
- if (rt->rt6i_table)
- table = rt->rt6i_table->tb6_id;
+ if (rt->fib6_table)
+ table = rt->fib6_table->tb6_id;
else
table = RT6_TABLE_UNSPEC;
- rtm->rtm_table = table;
- NLA_PUT_U32(skb, RTA_TABLE, table);
- if (rt->rt6i_flags&RTF_REJECT)
- rtm->rtm_type = RTN_UNREACHABLE;
- else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
- rtm->rtm_type = RTN_LOCAL;
- else
- rtm->rtm_type = RTN_UNICAST;
+ rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT;
+ if (nla_put_u32(skb, RTA_TABLE, table))
+ goto nla_put_failure;
+
+ rtm->rtm_type = rt->fib6_type;
rtm->rtm_flags = 0;
rtm->rtm_scope = RT_SCOPE_UNIVERSE;
- rtm->rtm_protocol = rt->rt6i_protocol;
- if (rt->rt6i_flags&RTF_DYNAMIC)
- rtm->rtm_protocol = RTPROT_REDIRECT;
- else if (rt->rt6i_flags & RTF_ADDRCONF)
- rtm->rtm_protocol = RTPROT_KERNEL;
- else if (rt->rt6i_flags&RTF_DEFAULT)
- rtm->rtm_protocol = RTPROT_RA;
-
- if (rt->rt6i_flags&RTF_CACHE)
+ rtm->rtm_protocol = rt->fib6_protocol;
+
+ if (rt6_flags & RTF_CACHE)
rtm->rtm_flags |= RTM_F_CLONED;
- if (dst) {
- NLA_PUT(skb, RTA_DST, 16, dst);
- rtm->rtm_dst_len = 128;
+ if (dest) {
+ if (nla_put_in6_addr(skb, RTA_DST, dest))
+ goto nla_put_failure;
+ rtm->rtm_dst_len = 128;
} else if (rtm->rtm_dst_len)
- NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
+ if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
+ goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
if (src) {
- NLA_PUT(skb, RTA_SRC, 16, src);
- rtm->rtm_src_len = 128;
- } else if (rtm->rtm_src_len)
- NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
+ if (nla_put_in6_addr(skb, RTA_SRC, src))
+ goto nla_put_failure;
+ rtm->rtm_src_len = 128;
+ } else if (rtm->rtm_src_len &&
+ nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
+ goto nla_put_failure;
#endif
- if (iif)
- NLA_PUT_U32(skb, RTA_IIF, iif);
- else if (dst) {
+ if (iif) {
+#ifdef CONFIG_IPV6_MROUTE
+ if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
+ int err = ip6mr_get_route(net, skb, rtm, portid);
+
+ if (err == 0)
+ return 0;
+ if (err < 0)
+ goto nla_put_failure;
+ } else
+#endif
+ if (nla_put_u32(skb, RTA_IIF, iif))
+ goto nla_put_failure;
+ } else if (dest) {
+ struct in6_addr saddr_buf;
+ if (ip6_route_get_saddr(net, rt, dest, 0, 0, &saddr_buf) == 0 &&
+ nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
+ goto nla_put_failure;
+ }
+
+ if (rt->fib6_prefsrc.plen) {
struct in6_addr saddr_buf;
- if (ipv6_get_saddr(&rt->u.dst, dst, &saddr_buf) == 0)
- NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
+ saddr_buf = rt->fib6_prefsrc.addr;
+ if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
+ goto nla_put_failure;
}
- if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
+ pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
+ if (rtnetlink_put_metrics(skb, pmetrics) < 0)
goto nla_put_failure;
- if (rt->u.dst.neighbour)
- NLA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
+ if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
+ goto nla_put_failure;
- if (rt->u.dst.dev)
- NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
+ /* For multipath routes, walk the siblings list and add
+ * each as a nexthop within RTA_MULTIPATH.
+ */
+ if (rt6) {
+ struct net_device *dev;
- NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
+ if (rt6_flags & RTF_GATEWAY &&
+ nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
+ goto nla_put_failure;
- expires = rt->rt6i_expires ? rt->rt6i_expires - jiffies : 0;
- if (rtnl_put_cacheinfo(skb, &rt->u.dst, 0, 0, 0,
- expires, rt->u.dst.error) < 0)
+ dev = dst_dev(dst);
+ if (dev && nla_put_u32(skb, RTA_OIF, dev->ifindex))
+ goto nla_put_failure;
+
+ if (lwtunnel_fill_encap(skb, dst->lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0)
+ goto nla_put_failure;
+ } else if (READ_ONCE(rt->fib6_nsiblings)) {
+ struct fib6_info *sibling;
+ struct nlattr *mp;
+
+ mp = nla_nest_start_noflag(skb, RTA_MULTIPATH);
+ if (!mp)
+ goto nla_put_failure;
+
+ if (fib_add_nexthop(skb, &rt->fib6_nh->nh_common,
+ rt->fib6_nh->fib_nh_weight, AF_INET6,
+ 0) < 0)
+ goto nla_put_failure;
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(sibling, &rt->fib6_siblings,
+ fib6_siblings) {
+ if (fib_add_nexthop(skb, &sibling->fib6_nh->nh_common,
+ sibling->fib6_nh->fib_nh_weight,
+ AF_INET6, 0) < 0) {
+ rcu_read_unlock();
+
+ goto nla_put_failure;
+ }
+ }
+
+ rcu_read_unlock();
+
+ nla_nest_end(skb, mp);
+ } else if (rt->nh) {
+ if (nla_put_u32(skb, RTA_NH_ID, rt->nh->id))
+ goto nla_put_failure;
+
+ if (nexthop_is_blackhole(rt->nh))
+ rtm->rtm_type = RTN_BLACKHOLE;
+
+ if (READ_ONCE(net->ipv4.sysctl_nexthop_compat_mode) &&
+ rt6_fill_node_nexthop(skb, rt->nh, &nh_flags) < 0)
+ goto nla_put_failure;
+
+ rtm->rtm_flags |= nh_flags;
+ } else {
+ if (fib_nexthop_info(skb, &rt->fib6_nh->nh_common, AF_INET6,
+ &nh_flags, false) < 0)
+ goto nla_put_failure;
+
+ rtm->rtm_flags |= nh_flags;
+ }
+
+ if (rt6_flags & RTF_EXPIRES) {
+ expires = dst ? READ_ONCE(dst->expires) : rt->expires;
+ expires -= jiffies;
+ }
+
+ if (!dst) {
+ if (READ_ONCE(rt->offload))
+ rtm->rtm_flags |= RTM_F_OFFLOAD;
+ if (READ_ONCE(rt->trap))
+ rtm->rtm_flags |= RTM_F_TRAP;
+ if (READ_ONCE(rt->offload_failed))
+ rtm->rtm_flags |= RTM_F_OFFLOAD_FAILED;
+ }
+
+ if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
goto nla_put_failure;
- return nlmsg_end(skb, nlh);
+ if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
+ goto nla_put_failure;
+
+
+ nlmsg_end(skb, nlh);
+ return 0;
nla_put_failure:
- return nlmsg_cancel(skb, nlh);
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
}
-int rt6_dump_route(struct rt6_info *rt, void *p_arg)
+static int fib6_info_nh_uses_dev(struct fib6_nh *nh, void *arg)
+{
+ const struct net_device *dev = arg;
+
+ if (nh->fib_nh_dev == dev)
+ return 1;
+
+ return 0;
+}
+
+static bool fib6_info_uses_dev(const struct fib6_info *f6i,
+ const struct net_device *dev)
+{
+ if (f6i->nh) {
+ struct net_device *_dev = (struct net_device *)dev;
+
+ return !!nexthop_for_each_fib6_nh(f6i->nh,
+ fib6_info_nh_uses_dev,
+ _dev);
+ }
+
+ if (f6i->fib6_nh->fib_nh_dev == dev)
+ return true;
+
+ if (READ_ONCE(f6i->fib6_nsiblings)) {
+ const struct fib6_info *sibling;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(sibling, &f6i->fib6_siblings,
+ fib6_siblings) {
+ if (sibling->fib6_nh->fib_nh_dev == dev) {
+ rcu_read_unlock();
+ return true;
+ }
+ if (!READ_ONCE(f6i->fib6_nsiblings))
+ break;
+ }
+ rcu_read_unlock();
+ }
+ return false;
+}
+
+struct fib6_nh_exception_dump_walker {
+ struct rt6_rtnl_dump_arg *dump;
+ struct fib6_info *rt;
+ unsigned int flags;
+ unsigned int skip;
+ unsigned int count;
+};
+
+static int rt6_nh_dump_exceptions(struct fib6_nh *nh, void *arg)
+{
+ struct fib6_nh_exception_dump_walker *w = arg;
+ struct rt6_rtnl_dump_arg *dump = w->dump;
+ struct rt6_exception_bucket *bucket;
+ struct rt6_exception *rt6_ex;
+ int i, err;
+
+ bucket = fib6_nh_get_excptn_bucket(nh, NULL);
+ if (!bucket)
+ return 0;
+
+ for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
+ hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
+ if (w->skip) {
+ w->skip--;
+ continue;
+ }
+
+ /* Expiration of entries doesn't bump sernum, insertion
+ * does. Removal is triggered by insertion, so we can
+ * rely on the fact that if entries change between two
+ * partial dumps, this node is scanned again completely,
+ * see rt6_insert_exception() and fib6_dump_table().
+ *
+ * Count expired entries we go through as handled
+ * entries that we'll skip next time, in case of partial
+ * node dump. Otherwise, if entries expire meanwhile,
+ * we'll skip the wrong amount.
+ */
+ if (rt6_check_expired(rt6_ex->rt6i)) {
+ w->count++;
+ continue;
+ }
+
+ err = rt6_fill_node(dump->net, dump->skb, w->rt,
+ &rt6_ex->rt6i->dst, NULL, NULL, 0,
+ RTM_NEWROUTE,
+ NETLINK_CB(dump->cb->skb).portid,
+ dump->cb->nlh->nlmsg_seq, w->flags);
+ if (err)
+ return err;
+
+ w->count++;
+ }
+ bucket++;
+ }
+
+ return 0;
+}
+
+/* Return -1 if done with node, number of handled routes on partial dump */
+int rt6_dump_route(struct fib6_info *rt, void *p_arg, unsigned int skip)
{
struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
- int prefix;
+ struct fib_dump_filter *filter = &arg->filter;
+ unsigned int flags = NLM_F_MULTI;
+ struct net *net = arg->net;
+ int count = 0;
- if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
- struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
- prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
- } else
- prefix = 0;
+ if (rt == net->ipv6.fib6_null_entry)
+ return -1;
- return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
- NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
- prefix, NLM_F_MULTI);
+ if ((filter->flags & RTM_F_PREFIX) &&
+ !(rt->fib6_flags & RTF_PREFIX_RT)) {
+ /* success since this is not a prefix route */
+ return -1;
+ }
+ if (filter->filter_set &&
+ ((filter->rt_type && rt->fib6_type != filter->rt_type) ||
+ (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) ||
+ (filter->protocol && rt->fib6_protocol != filter->protocol))) {
+ return -1;
+ }
+
+ if (filter->filter_set ||
+ !filter->dump_routes || !filter->dump_exceptions) {
+ flags |= NLM_F_DUMP_FILTERED;
+ }
+
+ if (filter->dump_routes) {
+ if (skip) {
+ skip--;
+ } else {
+ if (rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL,
+ 0, RTM_NEWROUTE,
+ NETLINK_CB(arg->cb->skb).portid,
+ arg->cb->nlh->nlmsg_seq, flags)) {
+ return 0;
+ }
+ count++;
+ }
+ }
+
+ if (filter->dump_exceptions) {
+ struct fib6_nh_exception_dump_walker w = { .dump = arg,
+ .rt = rt,
+ .flags = flags,
+ .skip = skip,
+ .count = 0 };
+ int err;
+
+ rcu_read_lock();
+ if (rt->nh) {
+ err = nexthop_for_each_fib6_nh(rt->nh,
+ rt6_nh_dump_exceptions,
+ &w);
+ } else {
+ err = rt6_nh_dump_exceptions(rt->fib6_nh, &w);
+ }
+ rcu_read_unlock();
+
+ if (err)
+ return count + w.count;
+ }
+
+ return -1;
+}
+
+static int inet6_rtm_valid_getroute_req(struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ struct nlattr **tb,
+ struct netlink_ext_ack *extack)
+{
+ struct rtmsg *rtm;
+ int i, err;
+
+ rtm = nlmsg_payload(nlh, sizeof(*rtm));
+ if (!rtm) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Invalid header for get route request");
+ return -EINVAL;
+ }
+
+ if (!netlink_strict_get_check(skb))
+ return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
+ rtm_ipv6_policy, extack);
+
+ if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) ||
+ (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) ||
+ rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope ||
+ rtm->rtm_type) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request");
+ return -EINVAL;
+ }
+ if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Invalid flags for get route request");
+ return -EINVAL;
+ }
+
+ err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
+ rtm_ipv6_policy, extack);
+ if (err)
+ return err;
+
+ if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
+ (tb[RTA_DST] && !rtm->rtm_dst_len)) {
+ NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6");
+ return -EINVAL;
+ }
+
+ if (tb[RTA_FLOWLABEL] &&
+ (nla_get_be32(tb[RTA_FLOWLABEL]) & ~IPV6_FLOWLABEL_MASK)) {
+ NL_SET_ERR_MSG_ATTR(extack, tb[RTA_FLOWLABEL],
+ "Invalid flow label");
+ return -EINVAL;
+ }
+
+ for (i = 0; i <= RTA_MAX; i++) {
+ if (!tb[i])
+ continue;
+
+ switch (i) {
+ case RTA_SRC:
+ case RTA_DST:
+ case RTA_IIF:
+ case RTA_OIF:
+ case RTA_MARK:
+ case RTA_UID:
+ case RTA_SPORT:
+ case RTA_DPORT:
+ case RTA_IP_PROTO:
+ case RTA_FLOWLABEL:
+ break;
+ default:
+ NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
}
-int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
+static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
{
+ struct net *net = sock_net(in_skb->sk);
struct nlattr *tb[RTA_MAX+1];
+ int err, iif = 0, oif = 0;
+ struct fib6_info *from;
+ struct dst_entry *dst;
struct rt6_info *rt;
struct sk_buff *skb;
struct rtmsg *rtm;
- struct flowi fl;
- int err, iif = 0;
+ struct flowi6 fl6 = {};
+ __be32 flowlabel;
+ bool fibmatch;
- err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
+ err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
if (err < 0)
goto errout;
err = -EINVAL;
- memset(&fl, 0, sizeof(fl));
+ rtm = nlmsg_data(nlh);
+ fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
if (tb[RTA_SRC]) {
if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
goto errout;
- ipv6_addr_copy(&fl.fl6_src, nla_data(tb[RTA_SRC]));
+ fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
}
if (tb[RTA_DST]) {
if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
goto errout;
- ipv6_addr_copy(&fl.fl6_dst, nla_data(tb[RTA_DST]));
+ fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
}
if (tb[RTA_IIF])
iif = nla_get_u32(tb[RTA_IIF]);
if (tb[RTA_OIF])
- fl.oif = nla_get_u32(tb[RTA_OIF]);
+ oif = nla_get_u32(tb[RTA_OIF]);
+
+ if (tb[RTA_MARK])
+ fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
+
+ if (tb[RTA_UID])
+ fl6.flowi6_uid = make_kuid(current_user_ns(),
+ nla_get_u32(tb[RTA_UID]));
+ else
+ fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
+
+ if (tb[RTA_SPORT])
+ fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);
+
+ if (tb[RTA_DPORT])
+ fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);
+
+ if (tb[RTA_IP_PROTO]) {
+ err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
+ &fl6.flowi6_proto, AF_INET6,
+ extack);
+ if (err)
+ goto errout;
+ }
+
+ flowlabel = nla_get_be32_default(tb[RTA_FLOWLABEL], 0);
+ fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, flowlabel);
if (iif) {
struct net_device *dev;
- dev = __dev_get_by_index(iif);
+ int flags = 0;
+
+ rcu_read_lock();
+
+ dev = dev_get_by_index_rcu(net, iif);
if (!dev) {
+ rcu_read_unlock();
err = -ENODEV;
goto errout;
}
+
+ fl6.flowi6_iif = iif;
+
+ if (!ipv6_addr_any(&fl6.saddr))
+ flags |= RT6_LOOKUP_F_HAS_SADDR;
+
+ dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
+
+ rcu_read_unlock();
+ } else {
+ fl6.flowi6_oif = oif;
+
+ dst = ip6_route_output(net, NULL, &fl6);
+ }
+
+
+ rt = dst_rt6_info(dst);
+ if (rt->dst.error) {
+ err = rt->dst.error;
+ ip6_rt_put(rt);
+ goto errout;
+ }
+
+ if (rt == net->ipv6.ip6_null_entry) {
+ err = rt->dst.error;
+ ip6_rt_put(rt);
+ goto errout;
}
skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
- if (skb == NULL) {
+ if (!skb) {
+ ip6_rt_put(rt);
err = -ENOBUFS;
goto errout;
}
- /* Reserve room for dummy headers, this skb can pass
- through good chunk of routing engine.
- */
- skb->mac.raw = skb->data;
- skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
+ skb_dst_set(skb, &rt->dst);
- rt = (struct rt6_info*) ip6_route_output(NULL, &fl);
- skb->dst = &rt->u.dst;
+ rcu_read_lock();
+ from = rcu_dereference(rt->from);
+ if (from) {
+ if (fibmatch)
+ err = rt6_fill_node(net, skb, from, NULL, NULL, NULL,
+ iif, RTM_NEWROUTE,
+ NETLINK_CB(in_skb).portid,
+ nlh->nlmsg_seq, 0);
+ else
+ err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
+ &fl6.saddr, iif, RTM_NEWROUTE,
+ NETLINK_CB(in_skb).portid,
+ nlh->nlmsg_seq, 0);
+ } else {
+ err = -ENETUNREACH;
+ }
+ rcu_read_unlock();
- err = rt6_fill_node(skb, rt, &fl.fl6_dst, &fl.fl6_src, iif,
- RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
- nlh->nlmsg_seq, 0, 0);
if (err < 0) {
kfree_skb(skb);
goto errout;
}
- err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
+ err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
return err;
}
-void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
+void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
+ unsigned int nlm_flags)
{
+ struct net *net = info->nl_net;
struct sk_buff *skb;
- u32 pid = 0, seq = 0;
- struct nlmsghdr *nlh = NULL;
- int err = -ENOBUFS;
+ size_t sz;
+ u32 seq;
+ int err;
- if (info) {
- pid = info->pid;
- nlh = info->nlh;
- if (nlh)
- seq = nlh->nlmsg_seq;
- }
+ err = -ENOBUFS;
+ seq = info->nlh ? info->nlh->nlmsg_seq : 0;
+
+ rcu_read_lock();
+ sz = rt6_nlmsg_size(rt);
+retry:
+ skb = nlmsg_new(sz, GFP_ATOMIC);
+ if (!skb)
+ goto errout;
- skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
- if (skb == NULL)
+ err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
+ event, info->portid, seq, nlm_flags);
+ if (err < 0) {
+ kfree_skb(skb);
+ /* -EMSGSIZE implies needed space grew under us. */
+ if (err == -EMSGSIZE) {
+ sz = max(rt6_nlmsg_size(rt), sz << 1);
+ goto retry;
+ }
goto errout;
+ }
- err = rt6_fill_node(skb, rt, NULL, NULL, 0, event, pid, seq, 0, 0);
- /* failure implies BUG in rt6_nlmsg_size() */
- BUG_ON(err < 0);
+ rcu_read_unlock();
- err = rtnl_notify(skb, pid, RTNLGRP_IPV6_ROUTE, nlh, gfp_any());
+ rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
+ info->nlh, GFP_ATOMIC);
+ return;
errout:
- if (err < 0)
- rtnl_set_sk_err(RTNLGRP_IPV6_ROUTE, err);
+ rcu_read_unlock();
+ rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}
-/*
- * /proc
- */
+void fib6_rt_update(struct net *net, struct fib6_info *rt,
+ struct nl_info *info)
+{
+ u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
+ struct sk_buff *skb;
+ int err = -ENOBUFS;
-#ifdef CONFIG_PROC_FS
+ skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
+ if (!skb)
+ goto errout;
-#define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)
+ err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
+ RTM_NEWROUTE, info->portid, seq, NLM_F_REPLACE);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto errout;
+ }
+ rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
+ info->nlh, gfp_any());
+ return;
+errout:
+ rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
+}
-struct rt6_proc_arg
+void fib6_info_hw_flags_set(struct net *net, struct fib6_info *f6i,
+ bool offload, bool trap, bool offload_failed)
{
- char *buffer;
- int offset;
- int length;
- int skip;
- int len;
-};
+ struct sk_buff *skb;
+ int err;
-static int rt6_info_route(struct rt6_info *rt, void *p_arg)
-{
- struct rt6_proc_arg *arg = (struct rt6_proc_arg *) p_arg;
+ if (READ_ONCE(f6i->offload) == offload &&
+ READ_ONCE(f6i->trap) == trap &&
+ READ_ONCE(f6i->offload_failed) == offload_failed)
+ return;
- if (arg->skip < arg->offset / RT6_INFO_LEN) {
- arg->skip++;
- return 0;
- }
+ WRITE_ONCE(f6i->offload, offload);
+ WRITE_ONCE(f6i->trap, trap);
- if (arg->len >= arg->length)
- return 0;
+ /* 2 means send notifications only if offload_failed was changed. */
+ if (net->ipv6.sysctl.fib_notify_on_flag_change == 2 &&
+ READ_ONCE(f6i->offload_failed) == offload_failed)
+ return;
- arg->len += sprintf(arg->buffer + arg->len,
- NIP6_SEQFMT " %02x ",
- NIP6(rt->rt6i_dst.addr),
- rt->rt6i_dst.plen);
+ WRITE_ONCE(f6i->offload_failed, offload_failed);
-#ifdef CONFIG_IPV6_SUBTREES
- arg->len += sprintf(arg->buffer + arg->len,
- NIP6_SEQFMT " %02x ",
- NIP6(rt->rt6i_src.addr),
- rt->rt6i_src.plen);
-#else
- arg->len += sprintf(arg->buffer + arg->len,
- "00000000000000000000000000000000 00 ");
-#endif
+ if (!rcu_access_pointer(f6i->fib6_node))
+ /* The route was removed from the tree, do not send
+ * notification.
+ */
+ return;
- if (rt->rt6i_nexthop) {
- arg->len += sprintf(arg->buffer + arg->len,
- NIP6_SEQFMT,
- NIP6(*((struct in6_addr *)rt->rt6i_nexthop->primary_key)));
- } else {
- arg->len += sprintf(arg->buffer + arg->len,
- "00000000000000000000000000000000");
- }
- arg->len += sprintf(arg->buffer + arg->len,
- " %08x %08x %08x %08x %8s\n",
- rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
- rt->u.dst.__use, rt->rt6i_flags,
- rt->rt6i_dev ? rt->rt6i_dev->name : "");
- return 0;
-}
+ if (!net->ipv6.sysctl.fib_notify_on_flag_change)
+ return;
-static int rt6_proc_info(char *buffer, char **start, off_t offset, int length)
-{
- struct rt6_proc_arg arg = {
- .buffer = buffer,
- .offset = offset,
- .length = length,
- };
+ skb = nlmsg_new(rt6_nlmsg_size(f6i), GFP_KERNEL);
+ if (!skb) {
+ err = -ENOBUFS;
+ goto errout;
+ }
+
+ err = rt6_fill_node(net, skb, f6i, NULL, NULL, NULL, 0, RTM_NEWROUTE, 0,
+ 0, 0);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto errout;
+ }
- fib6_clean_all(rt6_info_route, 0, &arg);
+ rtnl_notify(skb, net, 0, RTNLGRP_IPV6_ROUTE, NULL, GFP_KERNEL);
+ return;
+
+errout:
+ rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
+}
+EXPORT_SYMBOL(fib6_info_hw_flags_set);
- *start = buffer;
- if (offset)
- *start += offset % RT6_INFO_LEN;
+static int ip6_route_dev_notify(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct net *net = dev_net(dev);
- arg.len -= offset % RT6_INFO_LEN;
+ if (!(dev->flags & IFF_LOOPBACK))
+ return NOTIFY_OK;
- if (arg.len > length)
- arg.len = length;
- if (arg.len < 0)
- arg.len = 0;
+ if (event == NETDEV_REGISTER) {
+ net->ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = dev;
+ net->ipv6.ip6_null_entry->dst.dev = dev;
+ net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
+#ifdef CONFIG_IPV6_MULTIPLE_TABLES
+ net->ipv6.ip6_prohibit_entry->dst.dev = dev;
+ net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
+ net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
+ net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
+#endif
+ } else if (event == NETDEV_UNREGISTER &&
+ dev->reg_state != NETREG_UNREGISTERED) {
+ /* NETDEV_UNREGISTER could be fired for multiple times by
+ * netdev_wait_allrefs(). Make sure we only call this once.
+ */
+ in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
+#ifdef CONFIG_IPV6_MULTIPLE_TABLES
+ in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
+ in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
+#endif
+ }
- return arg.len;
+ return NOTIFY_OK;
}
+/*
+ * /proc
+ */
+
+#ifdef CONFIG_PROC_FS
static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
+ struct net *net = (struct net *)seq->private;
seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
- rt6_stats.fib_nodes, rt6_stats.fib_route_nodes,
- rt6_stats.fib_rt_alloc, rt6_stats.fib_rt_entries,
- rt6_stats.fib_rt_cache,
- atomic_read(&ip6_dst_ops.entries),
- rt6_stats.fib_discarded_routes);
+ net->ipv6.rt6_stats->fib_nodes,
+ net->ipv6.rt6_stats->fib_route_nodes,
+ atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
+ net->ipv6.rt6_stats->fib_rt_entries,
+ net->ipv6.rt6_stats->fib_rt_cache,
+ dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
+ net->ipv6.rt6_stats->fib_discarded_routes);
return 0;
}
-
-static int rt6_stats_seq_open(struct inode *inode, struct file *file)
-{
- return single_open(file, rt6_stats_seq_show, NULL);
-}
-
-static struct file_operations rt6_stats_seq_fops = {
- .owner = THIS_MODULE,
- .open = rt6_stats_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
#endif /* CONFIG_PROC_FS */
#ifdef CONFIG_SYSCTL
-static int flush_delay;
-
-static
-int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+static int ipv6_sysctl_rtcache_flush(const struct ctl_table *ctl, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
- if (write) {
- proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
- fib6_run_gc(flush_delay <= 0 ? ~0UL : (unsigned long)flush_delay);
- return 0;
- } else
+ struct net *net;
+ int delay;
+ int ret;
+ if (!write)
return -EINVAL;
+
+ ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
+ if (ret)
+ return ret;
+
+ net = (struct net *)ctl->extra1;
+ delay = net->ipv6.sysctl.flush_delay;
+ fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
+ return 0;
}
-ctl_table ipv6_route_table[] = {
- {
- .ctl_name = NET_IPV6_ROUTE_FLUSH,
- .procname = "flush",
- .data = &flush_delay,
+static struct ctl_table ipv6_route_table_template[] = {
+ {
+ .procname = "max_size",
+ .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
.maxlen = sizeof(int),
- .mode = 0200,
- .proc_handler = &ipv6_sysctl_rtcache_flush
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
},
{
- .ctl_name = NET_IPV6_ROUTE_GC_THRESH,
.procname = "gc_thresh",
- .data = &ip6_dst_ops.gc_thresh,
+ .data = &ip6_dst_ops_template.gc_thresh,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec,
+ .proc_handler = proc_dointvec,
},
{
- .ctl_name = NET_IPV6_ROUTE_MAX_SIZE,
- .procname = "max_size",
- .data = &ip6_rt_max_size,
+ .procname = "flush",
+ .data = &init_net.ipv6.sysctl.flush_delay,
.maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
+ .mode = 0200,
+ .proc_handler = ipv6_sysctl_rtcache_flush
},
{
- .ctl_name = NET_IPV6_ROUTE_GC_MIN_INTERVAL,
.procname = "gc_min_interval",
- .data = &ip6_rt_gc_min_interval,
+ .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- .strategy = &sysctl_jiffies,
+ .proc_handler = proc_dointvec_jiffies,
},
{
- .ctl_name = NET_IPV6_ROUTE_GC_TIMEOUT,
.procname = "gc_timeout",
- .data = &ip6_rt_gc_timeout,
+ .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- .strategy = &sysctl_jiffies,
+ .proc_handler = proc_dointvec_jiffies,
},
{
- .ctl_name = NET_IPV6_ROUTE_GC_INTERVAL,
.procname = "gc_interval",
- .data = &ip6_rt_gc_interval,
+ .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- .strategy = &sysctl_jiffies,
+ .proc_handler = proc_dointvec_jiffies,
},
{
- .ctl_name = NET_IPV6_ROUTE_GC_ELASTICITY,
.procname = "gc_elasticity",
- .data = &ip6_rt_gc_elasticity,
+ .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- .strategy = &sysctl_jiffies,
+ .proc_handler = proc_dointvec,
},
{
- .ctl_name = NET_IPV6_ROUTE_MTU_EXPIRES,
.procname = "mtu_expires",
- .data = &ip6_rt_mtu_expires,
+ .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- .strategy = &sysctl_jiffies,
+ .proc_handler = proc_dointvec_jiffies,
},
{
- .ctl_name = NET_IPV6_ROUTE_MIN_ADVMSS,
.procname = "min_adv_mss",
- .data = &ip6_rt_min_advmss,
+ .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- .strategy = &sysctl_jiffies,
+ .proc_handler = proc_dointvec,
},
{
- .ctl_name = NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,
.procname = "gc_min_interval_ms",
- .data = &ip6_rt_gc_min_interval,
+ .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec_ms_jiffies,
- .strategy = &sysctl_ms_jiffies,
+ .proc_handler = proc_dointvec_ms_jiffies,
+ },
+ {
+ .procname = "skip_notify_on_dev_down",
+ .data = &init_net.ipv6.sysctl.skip_notify_on_dev_down,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
- { .ctl_name = 0 }
};
+struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
+{
+ struct ctl_table *table;
+
+ table = kmemdup(ipv6_route_table_template,
+ sizeof(ipv6_route_table_template),
+ GFP_KERNEL);
+
+ if (table) {
+ table[0].data = &net->ipv6.sysctl.ip6_rt_max_size;
+ table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
+ table[2].data = &net->ipv6.sysctl.flush_delay;
+ table[2].extra1 = net;
+ table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
+ table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
+ table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
+ table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
+ table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
+ table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
+ table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
+ table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;
+ }
+
+ return table;
+}
+
+size_t ipv6_route_sysctl_table_size(struct net *net)
+{
+ /* Don't export sysctls to unprivileged users */
+ if (net->user_ns != &init_user_ns)
+ return 1;
+
+ return ARRAY_SIZE(ipv6_route_table_template);
+}
#endif
-void __init ip6_route_init(void)
+static int __net_init ip6_route_net_init(struct net *net)
{
- struct proc_dir_entry *p;
+ int ret = -ENOMEM;
+
+ memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
+ sizeof(net->ipv6.ip6_dst_ops));
+
+ if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
+ goto out_ip6_dst_ops;
+
+ net->ipv6.fib6_null_entry = fib6_info_alloc(GFP_KERNEL, true);
+ if (!net->ipv6.fib6_null_entry)
+ goto out_ip6_dst_entries;
+ memcpy(net->ipv6.fib6_null_entry, &fib6_null_entry_template,
+ sizeof(*net->ipv6.fib6_null_entry));
+
+ net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
+ sizeof(*net->ipv6.ip6_null_entry),
+ GFP_KERNEL);
+ if (!net->ipv6.ip6_null_entry)
+ goto out_fib6_null_entry;
+ net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
+ dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
+ ip6_template_metrics, true);
+ INIT_LIST_HEAD(&net->ipv6.ip6_null_entry->dst.rt_uncached);
- ip6_dst_ops.kmem_cachep =
- kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
- fib6_init();
-#ifdef CONFIG_PROC_FS
- p = proc_net_create("ipv6_route", 0, rt6_proc_info);
- if (p)
- p->owner = THIS_MODULE;
-
- proc_net_fops_create("rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
+#ifdef CONFIG_IPV6_MULTIPLE_TABLES
+ net->ipv6.fib6_has_custom_rules = false;
+ net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
+ sizeof(*net->ipv6.ip6_prohibit_entry),
+ GFP_KERNEL);
+ if (!net->ipv6.ip6_prohibit_entry)
+ goto out_ip6_null_entry;
+ net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
+ dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
+ ip6_template_metrics, true);
+ INIT_LIST_HEAD(&net->ipv6.ip6_prohibit_entry->dst.rt_uncached);
+
+ net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
+ sizeof(*net->ipv6.ip6_blk_hole_entry),
+ GFP_KERNEL);
+ if (!net->ipv6.ip6_blk_hole_entry)
+ goto out_ip6_prohibit_entry;
+ net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
+ dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
+ ip6_template_metrics, true);
+ INIT_LIST_HEAD(&net->ipv6.ip6_blk_hole_entry->dst.rt_uncached);
+#ifdef CONFIG_IPV6_SUBTREES
+ net->ipv6.fib6_routes_require_src = 0;
#endif
-#ifdef CONFIG_XFRM
- xfrm6_init();
#endif
+
+ net->ipv6.sysctl.flush_delay = 0;
+ net->ipv6.sysctl.ip6_rt_max_size = INT_MAX;
+ net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
+ net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
+ net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
+ net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
+ net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
+ net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
+ net->ipv6.sysctl.skip_notify_on_dev_down = 0;
+
+ atomic_set(&net->ipv6.ip6_rt_gc_expire, 30*HZ);
+
+ ret = 0;
+out:
+ return ret;
+
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
- fib6_rules_init();
+out_ip6_prohibit_entry:
+ kfree(net->ipv6.ip6_prohibit_entry);
+out_ip6_null_entry:
+ kfree(net->ipv6.ip6_null_entry);
#endif
+out_fib6_null_entry:
+ kfree(net->ipv6.fib6_null_entry);
+out_ip6_dst_entries:
+ dst_entries_destroy(&net->ipv6.ip6_dst_ops);
+out_ip6_dst_ops:
+ goto out;
}
-void ip6_route_cleanup(void)
+static void __net_exit ip6_route_net_exit(struct net *net)
{
+ kfree(net->ipv6.fib6_null_entry);
+ kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
- fib6_rules_cleanup();
+ kfree(net->ipv6.ip6_prohibit_entry);
+ kfree(net->ipv6.ip6_blk_hole_entry);
#endif
+ dst_entries_destroy(&net->ipv6.ip6_dst_ops);
+}
+
+static int __net_init ip6_route_net_init_late(struct net *net)
+{
#ifdef CONFIG_PROC_FS
- proc_net_remove("ipv6_route");
- proc_net_remove("rt6_stats");
+ if (!proc_create_net("ipv6_route", 0, net->proc_net,
+ &ipv6_route_seq_ops,
+ sizeof(struct ipv6_route_iter)))
+ return -ENOMEM;
+
+ if (!proc_create_net_single("rt6_stats", 0444, net->proc_net,
+ rt6_stats_seq_show, NULL)) {
+ remove_proc_entry("ipv6_route", net->proc_net);
+ return -ENOMEM;
+ }
#endif
-#ifdef CONFIG_XFRM
+ return 0;
+}
+
+static void __net_exit ip6_route_net_exit_late(struct net *net)
+{
+#ifdef CONFIG_PROC_FS
+ remove_proc_entry("ipv6_route", net->proc_net);
+ remove_proc_entry("rt6_stats", net->proc_net);
+#endif
+}
+
+static struct pernet_operations ip6_route_net_ops = {
+ .init = ip6_route_net_init,
+ .exit = ip6_route_net_exit,
+};
+
+static int __net_init ipv6_inetpeer_init(struct net *net)
+{
+ struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
+
+ if (!bp)
+ return -ENOMEM;
+ inet_peer_base_init(bp);
+ net->ipv6.peers = bp;
+ return 0;
+}
+
+static void __net_exit ipv6_inetpeer_exit(struct net *net)
+{
+ struct inet_peer_base *bp = net->ipv6.peers;
+
+ net->ipv6.peers = NULL;
+ inetpeer_invalidate_tree(bp);
+ kfree(bp);
+}
+
+static struct pernet_operations ipv6_inetpeer_ops = {
+ .init = ipv6_inetpeer_init,
+ .exit = ipv6_inetpeer_exit,
+};
+
+static struct pernet_operations ip6_route_net_late_ops = {
+ .init = ip6_route_net_init_late,
+ .exit = ip6_route_net_exit_late,
+};
+
+static struct notifier_block ip6_route_dev_notifier = {
+ .notifier_call = ip6_route_dev_notify,
+ .priority = ADDRCONF_NOTIFY_PRIORITY - 10,
+};
+
+void __init ip6_route_init_special_entries(void)
+{
+ /* Registering of the loopback is done before this portion of code,
+ * the loopback reference in rt6_info will not be taken, do it
+ * manually for init_net */
+ init_net.ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = init_net.loopback_dev;
+ init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
+ init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
+ #ifdef CONFIG_IPV6_MULTIPLE_TABLES
+ init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
+ init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
+ init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
+ init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
+ #endif
+}
+
+#if IS_BUILTIN(CONFIG_IPV6)
+#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
+DEFINE_BPF_ITER_FUNC(ipv6_route, struct bpf_iter_meta *meta, struct fib6_info *rt)
+
+BTF_ID_LIST_SINGLE(btf_fib6_info_id, struct, fib6_info)
+
+static const struct bpf_iter_seq_info ipv6_route_seq_info = {
+ .seq_ops = &ipv6_route_seq_ops,
+ .init_seq_private = bpf_iter_init_seq_net,
+ .fini_seq_private = bpf_iter_fini_seq_net,
+ .seq_priv_size = sizeof(struct ipv6_route_iter),
+};
+
+static struct bpf_iter_reg ipv6_route_reg_info = {
+ .target = "ipv6_route",
+ .ctx_arg_info_size = 1,
+ .ctx_arg_info = {
+ { offsetof(struct bpf_iter__ipv6_route, rt),
+ PTR_TO_BTF_ID_OR_NULL },
+ },
+ .seq_info = &ipv6_route_seq_info,
+};
+
+static int __init bpf_iter_register(void)
+{
+ ipv6_route_reg_info.ctx_arg_info[0].btf_id = *btf_fib6_info_id;
+ return bpf_iter_reg_target(&ipv6_route_reg_info);
+}
+
+static void bpf_iter_unregister(void)
+{
+ bpf_iter_unreg_target(&ipv6_route_reg_info);
+}
+#endif
+#endif
+
+static const struct rtnl_msg_handler ip6_route_rtnl_msg_handlers[] __initconst_or_module = {
+ {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_NEWROUTE,
+ .doit = inet6_rtm_newroute, .flags = RTNL_FLAG_DOIT_UNLOCKED},
+ {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_DELROUTE,
+ .doit = inet6_rtm_delroute, .flags = RTNL_FLAG_DOIT_UNLOCKED},
+ {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETROUTE,
+ .doit = inet6_rtm_getroute, .flags = RTNL_FLAG_DOIT_UNLOCKED},
+};
+
+int __init ip6_route_init(void)
+{
+ int ret;
+ int cpu;
+
+ ret = -ENOMEM;
+ ip6_dst_ops_template.kmem_cachep =
+ kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
+ SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL);
+ if (!ip6_dst_ops_template.kmem_cachep)
+ goto out;
+
+ ret = dst_entries_init(&ip6_dst_blackhole_ops);
+ if (ret)
+ goto out_kmem_cache;
+
+ ret = register_pernet_subsys(&ipv6_inetpeer_ops);
+ if (ret)
+ goto out_dst_entries;
+
+ ret = register_pernet_subsys(&ip6_route_net_ops);
+ if (ret)
+ goto out_register_inetpeer;
+
+ ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
+
+ ret = fib6_init();
+ if (ret)
+ goto out_register_subsys;
+
+ ret = xfrm6_init();
+ if (ret)
+ goto out_fib6_init;
+
+ ret = fib6_rules_init();
+ if (ret)
+ goto xfrm6_init;
+
+ ret = register_pernet_subsys(&ip6_route_net_late_ops);
+ if (ret)
+ goto fib6_rules_init;
+
+ ret = rtnl_register_many(ip6_route_rtnl_msg_handlers);
+ if (ret < 0)
+ goto out_register_late_subsys;
+
+ ret = register_netdevice_notifier(&ip6_route_dev_notifier);
+ if (ret)
+ goto out_register_late_subsys;
+
+#if IS_BUILTIN(CONFIG_IPV6)
+#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
+ ret = bpf_iter_register();
+ if (ret)
+ goto out_register_late_subsys;
+#endif
+#endif
+
+ for_each_possible_cpu(cpu) {
+ struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
+
+ INIT_LIST_HEAD(&ul->head);
+ spin_lock_init(&ul->lock);
+ }
+
+out:
+ return ret;
+
+out_register_late_subsys:
+ rtnl_unregister_all(PF_INET6);
+ unregister_pernet_subsys(&ip6_route_net_late_ops);
+fib6_rules_init:
+ fib6_rules_cleanup();
+xfrm6_init:
xfrm6_fini();
+out_fib6_init:
+ fib6_gc_cleanup();
+out_register_subsys:
+ unregister_pernet_subsys(&ip6_route_net_ops);
+out_register_inetpeer:
+ unregister_pernet_subsys(&ipv6_inetpeer_ops);
+out_dst_entries:
+ dst_entries_destroy(&ip6_dst_blackhole_ops);
+out_kmem_cache:
+ kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
+ goto out;
+}
+
+void ip6_route_cleanup(void)
+{
+#if IS_BUILTIN(CONFIG_IPV6)
+#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
+ bpf_iter_unregister();
+#endif
#endif
- rt6_ifdown(NULL);
+ unregister_netdevice_notifier(&ip6_route_dev_notifier);
+ unregister_pernet_subsys(&ip6_route_net_late_ops);
+ fib6_rules_cleanup();
+ xfrm6_fini();
fib6_gc_cleanup();
- kmem_cache_destroy(ip6_dst_ops.kmem_cachep);
+ unregister_pernet_subsys(&ipv6_inetpeer_ops);
+ unregister_pernet_subsys(&ip6_route_net_ops);
+ dst_entries_destroy(&ip6_dst_blackhole_ops);
+ kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}
diff --git a/net/ipv6/rpl.c b/net/ipv6/rpl.c
new file mode 100644
index 000000000000..e186998bfbf7
--- /dev/null
+++ b/net/ipv6/rpl.c
@@ -0,0 +1,118 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Authors:
+ * (C) 2020 Alexander Aring <alex.aring@gmail.com>
+ */
+
+#include <net/ipv6.h>
+#include <net/rpl.h>
+
+#define IPV6_PFXTAIL_LEN(x) (sizeof(struct in6_addr) - (x))
+#define IPV6_RPL_BEST_ADDR_COMPRESSION 15
+
+static void ipv6_rpl_addr_decompress(struct in6_addr *dst,
+ const struct in6_addr *daddr,
+ const void *post, unsigned char pfx)
+{
+ memcpy(dst, daddr, pfx);
+ memcpy(&dst->s6_addr[pfx], post, IPV6_PFXTAIL_LEN(pfx));
+}
+
+static void ipv6_rpl_addr_compress(void *dst, const struct in6_addr *addr,
+ unsigned char pfx)
+{
+ memcpy(dst, &addr->s6_addr[pfx], IPV6_PFXTAIL_LEN(pfx));
+}
+
+static void *ipv6_rpl_segdata_pos(const struct ipv6_rpl_sr_hdr *hdr, int i)
+{
+ return (void *)&hdr->rpl_segdata[i * IPV6_PFXTAIL_LEN(hdr->cmpri)];
+}
+
+void ipv6_rpl_srh_decompress(struct ipv6_rpl_sr_hdr *outhdr,
+ const struct ipv6_rpl_sr_hdr *inhdr,
+ const struct in6_addr *daddr, unsigned char n)
+{
+ int i;
+
+ outhdr->nexthdr = inhdr->nexthdr;
+ outhdr->hdrlen = (((n + 1) * sizeof(struct in6_addr)) >> 3);
+ outhdr->pad = 0;
+ outhdr->type = inhdr->type;
+ outhdr->segments_left = inhdr->segments_left;
+ outhdr->cmpri = 0;
+ outhdr->cmpre = 0;
+
+ for (i = 0; i < n; i++)
+ ipv6_rpl_addr_decompress(&outhdr->rpl_segaddr[i], daddr,
+ ipv6_rpl_segdata_pos(inhdr, i),
+ inhdr->cmpri);
+
+ ipv6_rpl_addr_decompress(&outhdr->rpl_segaddr[n], daddr,
+ ipv6_rpl_segdata_pos(inhdr, n),
+ inhdr->cmpre);
+}
+
+static unsigned char ipv6_rpl_srh_calc_cmpri(const struct ipv6_rpl_sr_hdr *inhdr,
+ const struct in6_addr *daddr,
+ unsigned char n)
+{
+ unsigned char plen;
+ int i;
+
+ for (plen = 0; plen < sizeof(*daddr); plen++) {
+ for (i = 0; i < n; i++) {
+ if (daddr->s6_addr[plen] !=
+ inhdr->rpl_segaddr[i].s6_addr[plen])
+ return plen;
+ }
+ }
+
+ return IPV6_RPL_BEST_ADDR_COMPRESSION;
+}
+
+static unsigned char ipv6_rpl_srh_calc_cmpre(const struct in6_addr *daddr,
+ const struct in6_addr *last_segment)
+{
+ unsigned int plen;
+
+ for (plen = 0; plen < sizeof(*daddr); plen++) {
+ if (daddr->s6_addr[plen] != last_segment->s6_addr[plen])
+ return plen;
+ }
+
+ return IPV6_RPL_BEST_ADDR_COMPRESSION;
+}
+
+void ipv6_rpl_srh_compress(struct ipv6_rpl_sr_hdr *outhdr,
+ const struct ipv6_rpl_sr_hdr *inhdr,
+ const struct in6_addr *daddr, unsigned char n)
+{
+ unsigned char cmpri, cmpre;
+ size_t seglen;
+ int i;
+
+ cmpri = ipv6_rpl_srh_calc_cmpri(inhdr, daddr, n);
+ cmpre = ipv6_rpl_srh_calc_cmpre(daddr, &inhdr->rpl_segaddr[n]);
+
+ outhdr->nexthdr = inhdr->nexthdr;
+ seglen = (n * IPV6_PFXTAIL_LEN(cmpri)) + IPV6_PFXTAIL_LEN(cmpre);
+ outhdr->hdrlen = seglen >> 3;
+ if (seglen & 0x7) {
+ outhdr->hdrlen++;
+ outhdr->pad = 8 - (seglen & 0x7);
+ } else {
+ outhdr->pad = 0;
+ }
+ outhdr->type = inhdr->type;
+ outhdr->segments_left = inhdr->segments_left;
+ outhdr->cmpri = cmpri;
+ outhdr->cmpre = cmpre;
+
+ for (i = 0; i < n; i++)
+ ipv6_rpl_addr_compress(ipv6_rpl_segdata_pos(outhdr, i),
+ &inhdr->rpl_segaddr[i], cmpri);
+
+ ipv6_rpl_addr_compress(ipv6_rpl_segdata_pos(outhdr, n),
+ &inhdr->rpl_segaddr[n], cmpre);
+}
diff --git a/net/ipv6/rpl_iptunnel.c b/net/ipv6/rpl_iptunnel.c
new file mode 100644
index 000000000000..c7942cf65567
--- /dev/null
+++ b/net/ipv6/rpl_iptunnel.c
@@ -0,0 +1,394 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Authors:
+ * (C) 2020 Alexander Aring <alex.aring@gmail.com>
+ */
+
+#include <linux/rpl_iptunnel.h>
+
+#include <net/dst_cache.h>
+#include <net/ip6_route.h>
+#include <net/lwtunnel.h>
+#include <net/ipv6.h>
+#include <net/rpl.h>
+
+struct rpl_iptunnel_encap {
+ DECLARE_FLEX_ARRAY(struct ipv6_rpl_sr_hdr, srh);
+};
+
+struct rpl_lwt {
+ struct dst_cache cache;
+ struct rpl_iptunnel_encap tuninfo;
+};
+
+static inline struct rpl_lwt *rpl_lwt_lwtunnel(struct lwtunnel_state *lwt)
+{
+ return (struct rpl_lwt *)lwt->data;
+}
+
+static inline struct rpl_iptunnel_encap *
+rpl_encap_lwtunnel(struct lwtunnel_state *lwt)
+{
+ return &rpl_lwt_lwtunnel(lwt)->tuninfo;
+}
+
+static const struct nla_policy rpl_iptunnel_policy[RPL_IPTUNNEL_MAX + 1] = {
+ [RPL_IPTUNNEL_SRH] = { .type = NLA_BINARY },
+};
+
+static bool rpl_validate_srh(struct net *net, struct ipv6_rpl_sr_hdr *srh,
+ size_t seglen)
+{
+ int err;
+
+ if ((srh->hdrlen << 3) != seglen)
+ return false;
+
+ /* check at least one segment and seglen fit with segments_left */
+ if (!srh->segments_left ||
+ (srh->segments_left * sizeof(struct in6_addr)) != seglen)
+ return false;
+
+ if (srh->cmpri || srh->cmpre)
+ return false;
+
+ err = ipv6_chk_rpl_srh_loop(net, srh->rpl_segaddr,
+ srh->segments_left);
+ if (err)
+ return false;
+
+ if (ipv6_addr_type(&srh->rpl_segaddr[srh->segments_left - 1]) &
+ IPV6_ADDR_MULTICAST)
+ return false;
+
+ return true;
+}
+
+static int rpl_build_state(struct net *net, struct nlattr *nla,
+ unsigned int family, const void *cfg,
+ struct lwtunnel_state **ts,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[RPL_IPTUNNEL_MAX + 1];
+ struct lwtunnel_state *newts;
+ struct ipv6_rpl_sr_hdr *srh;
+ struct rpl_lwt *rlwt;
+ int err, srh_len;
+
+ if (family != AF_INET6)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, RPL_IPTUNNEL_MAX, nla,
+ rpl_iptunnel_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (!tb[RPL_IPTUNNEL_SRH])
+ return -EINVAL;
+
+ srh = nla_data(tb[RPL_IPTUNNEL_SRH]);
+ srh_len = nla_len(tb[RPL_IPTUNNEL_SRH]);
+
+ if (srh_len < sizeof(*srh))
+ return -EINVAL;
+
+ /* verify that SRH is consistent */
+ if (!rpl_validate_srh(net, srh, srh_len - sizeof(*srh)))
+ return -EINVAL;
+
+ newts = lwtunnel_state_alloc(srh_len + sizeof(*rlwt));
+ if (!newts)
+ return -ENOMEM;
+
+ rlwt = rpl_lwt_lwtunnel(newts);
+
+ err = dst_cache_init(&rlwt->cache, GFP_ATOMIC);
+ if (err) {
+ kfree(newts);
+ return err;
+ }
+
+ memcpy(&rlwt->tuninfo.srh, srh, srh_len);
+
+ newts->type = LWTUNNEL_ENCAP_RPL;
+ newts->flags |= LWTUNNEL_STATE_INPUT_REDIRECT;
+ newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT;
+
+ *ts = newts;
+
+ return 0;
+}
+
+static void rpl_destroy_state(struct lwtunnel_state *lwt)
+{
+ dst_cache_destroy(&rpl_lwt_lwtunnel(lwt)->cache);
+}
+
+static int rpl_do_srh_inline(struct sk_buff *skb, const struct rpl_lwt *rlwt,
+ const struct ipv6_rpl_sr_hdr *srh,
+ struct dst_entry *cache_dst)
+{
+ struct ipv6_rpl_sr_hdr *isrh, *csrh;
+ struct ipv6hdr oldhdr;
+ struct ipv6hdr *hdr;
+ unsigned char *buf;
+ size_t hdrlen;
+ int err;
+
+ memcpy(&oldhdr, ipv6_hdr(skb), sizeof(oldhdr));
+
+ buf = kcalloc(struct_size(srh, segments.addr, srh->segments_left), 2, GFP_ATOMIC);
+ if (!buf)
+ return -ENOMEM;
+
+ isrh = (struct ipv6_rpl_sr_hdr *)buf;
+ csrh = (struct ipv6_rpl_sr_hdr *)(buf + ((srh->hdrlen + 1) << 3));
+
+ memcpy(isrh, srh, sizeof(*isrh));
+ memcpy(isrh->rpl_segaddr, &srh->rpl_segaddr[1],
+ (srh->segments_left - 1) * 16);
+ isrh->rpl_segaddr[srh->segments_left - 1] = oldhdr.daddr;
+
+ ipv6_rpl_srh_compress(csrh, isrh, &srh->rpl_segaddr[0],
+ isrh->segments_left - 1);
+
+ hdrlen = ((csrh->hdrlen + 1) << 3);
+
+ err = skb_cow_head(skb, hdrlen + dst_dev_overhead(cache_dst, skb));
+ if (unlikely(err)) {
+ kfree(buf);
+ return err;
+ }
+
+ skb_pull(skb, sizeof(struct ipv6hdr));
+ skb_postpull_rcsum(skb, skb_network_header(skb),
+ sizeof(struct ipv6hdr));
+
+ skb_push(skb, sizeof(struct ipv6hdr) + hdrlen);
+ skb_reset_network_header(skb);
+ skb_mac_header_rebuild(skb);
+
+ hdr = ipv6_hdr(skb);
+ memmove(hdr, &oldhdr, sizeof(*hdr));
+ isrh = (void *)hdr + sizeof(*hdr);
+ memcpy(isrh, csrh, hdrlen);
+
+ isrh->nexthdr = hdr->nexthdr;
+ hdr->nexthdr = NEXTHDR_ROUTING;
+ hdr->daddr = srh->rpl_segaddr[0];
+
+ ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
+ skb_set_transport_header(skb, sizeof(struct ipv6hdr));
+
+ skb_postpush_rcsum(skb, hdr, sizeof(struct ipv6hdr) + hdrlen);
+
+ kfree(buf);
+
+ return 0;
+}
+
+static int rpl_do_srh(struct sk_buff *skb, const struct rpl_lwt *rlwt,
+ struct dst_entry *cache_dst)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ struct rpl_iptunnel_encap *tinfo;
+
+ if (skb->protocol != htons(ETH_P_IPV6))
+ return -EINVAL;
+
+ tinfo = rpl_encap_lwtunnel(dst->lwtstate);
+
+ return rpl_do_srh_inline(skb, rlwt, tinfo->srh, cache_dst);
+}
+
+static int rpl_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ struct dst_entry *orig_dst = skb_dst(skb);
+ struct dst_entry *dst = NULL;
+ struct rpl_lwt *rlwt;
+ int err;
+
+ rlwt = rpl_lwt_lwtunnel(orig_dst->lwtstate);
+
+ local_bh_disable();
+ dst = dst_cache_get(&rlwt->cache);
+ local_bh_enable();
+
+ err = rpl_do_srh(skb, rlwt, dst);
+ if (unlikely(err))
+ goto drop;
+
+ if (unlikely(!dst)) {
+ struct ipv6hdr *hdr = ipv6_hdr(skb);
+ struct flowi6 fl6;
+
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.daddr = hdr->daddr;
+ fl6.saddr = hdr->saddr;
+ fl6.flowlabel = ip6_flowinfo(hdr);
+ fl6.flowi6_mark = skb->mark;
+ fl6.flowi6_proto = hdr->nexthdr;
+
+ dst = ip6_route_output(net, NULL, &fl6);
+ if (dst->error) {
+ err = dst->error;
+ goto drop;
+ }
+
+ /* cache only if we don't create a dst reference loop */
+ if (orig_dst->lwtstate != dst->lwtstate) {
+ local_bh_disable();
+ dst_cache_set_ip6(&rlwt->cache, dst, &fl6.saddr);
+ local_bh_enable();
+ }
+
+ err = skb_cow_head(skb, LL_RESERVED_SPACE(dst_dev(dst)));
+ if (unlikely(err))
+ goto drop;
+ }
+
+ skb_dst_drop(skb);
+ skb_dst_set(skb, dst);
+
+ return dst_output(net, sk, skb);
+
+drop:
+ dst_release(dst);
+ kfree_skb(skb);
+ return err;
+}
+
+static int rpl_input(struct sk_buff *skb)
+{
+ struct dst_entry *orig_dst = skb_dst(skb);
+ struct dst_entry *dst = NULL;
+ struct lwtunnel_state *lwtst;
+ struct rpl_lwt *rlwt;
+ int err;
+
+ /* We cannot dereference "orig_dst" once ip6_route_input() or
+ * skb_dst_drop() is called. However, in order to detect a dst loop, we
+ * need the address of its lwtstate. So, save the address of lwtstate
+ * now and use it later as a comparison.
+ */
+ lwtst = orig_dst->lwtstate;
+
+ rlwt = rpl_lwt_lwtunnel(lwtst);
+
+ local_bh_disable();
+ dst = dst_cache_get(&rlwt->cache);
+ local_bh_enable();
+
+ err = rpl_do_srh(skb, rlwt, dst);
+ if (unlikely(err)) {
+ dst_release(dst);
+ goto drop;
+ }
+
+ if (!dst) {
+ ip6_route_input(skb);
+ dst = skb_dst(skb);
+
+ /* cache only if we don't create a dst reference loop */
+ if (!dst->error && lwtst != dst->lwtstate) {
+ local_bh_disable();
+ dst_cache_set_ip6(&rlwt->cache, dst,
+ &ipv6_hdr(skb)->saddr);
+ local_bh_enable();
+ }
+
+ err = skb_cow_head(skb, LL_RESERVED_SPACE(dst_dev(dst)));
+ if (unlikely(err))
+ goto drop;
+ } else {
+ skb_dst_drop(skb);
+ skb_dst_set(skb, dst);
+ }
+
+ return dst_input(skb);
+
+drop:
+ kfree_skb(skb);
+ return err;
+}
+
+static int nla_put_rpl_srh(struct sk_buff *skb, int attrtype,
+ struct rpl_iptunnel_encap *tuninfo)
+{
+ struct rpl_iptunnel_encap *data;
+ struct nlattr *nla;
+ int len;
+
+ len = RPL_IPTUNNEL_SRH_SIZE(tuninfo->srh);
+
+ nla = nla_reserve(skb, attrtype, len);
+ if (!nla)
+ return -EMSGSIZE;
+
+ data = nla_data(nla);
+ memcpy(data, tuninfo->srh, len);
+
+ return 0;
+}
+
+static int rpl_fill_encap_info(struct sk_buff *skb,
+ struct lwtunnel_state *lwtstate)
+{
+ struct rpl_iptunnel_encap *tuninfo = rpl_encap_lwtunnel(lwtstate);
+
+ if (nla_put_rpl_srh(skb, RPL_IPTUNNEL_SRH, tuninfo))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static int rpl_encap_nlsize(struct lwtunnel_state *lwtstate)
+{
+ struct rpl_iptunnel_encap *tuninfo = rpl_encap_lwtunnel(lwtstate);
+
+ return nla_total_size(RPL_IPTUNNEL_SRH_SIZE(tuninfo->srh));
+}
+
+static int rpl_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
+{
+ struct rpl_iptunnel_encap *a_hdr = rpl_encap_lwtunnel(a);
+ struct rpl_iptunnel_encap *b_hdr = rpl_encap_lwtunnel(b);
+ int len = RPL_IPTUNNEL_SRH_SIZE(a_hdr->srh);
+
+ if (len != RPL_IPTUNNEL_SRH_SIZE(b_hdr->srh))
+ return 1;
+
+ return memcmp(a_hdr, b_hdr, len);
+}
+
+static const struct lwtunnel_encap_ops rpl_ops = {
+ .build_state = rpl_build_state,
+ .destroy_state = rpl_destroy_state,
+ .output = rpl_output,
+ .input = rpl_input,
+ .fill_encap = rpl_fill_encap_info,
+ .get_encap_size = rpl_encap_nlsize,
+ .cmp_encap = rpl_encap_cmp,
+ .owner = THIS_MODULE,
+};
+
+int __init rpl_init(void)
+{
+ int err;
+
+ err = lwtunnel_encap_add_ops(&rpl_ops, LWTUNNEL_ENCAP_RPL);
+ if (err)
+ goto out;
+
+ pr_info("RPL Segment Routing with IPv6\n");
+
+ return 0;
+
+out:
+ return err;
+}
+
+void rpl_exit(void)
+{
+ lwtunnel_encap_del_ops(&rpl_ops, LWTUNNEL_ENCAP_RPL);
+}
diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c
new file mode 100644
index 000000000000..a5c4c629b788
--- /dev/null
+++ b/net/ipv6/seg6.c
@@ -0,0 +1,544 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * SR-IPv6 implementation
+ *
+ * Author:
+ * David Lebrun <david.lebrun@uclouvain.be>
+ */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/net.h>
+#include <linux/in6.h>
+#include <linux/slab.h>
+#include <linux/rhashtable.h>
+
+#include <net/ipv6.h>
+#include <net/protocol.h>
+
+#include <net/seg6.h>
+#include <net/genetlink.h>
+#include <linux/seg6.h>
+#include <linux/seg6_genl.h>
+#include <net/seg6_hmac.h>
+
+bool seg6_validate_srh(struct ipv6_sr_hdr *srh, int len, bool reduced)
+{
+ unsigned int tlv_offset;
+ int max_last_entry;
+ int trailing;
+
+ if (srh->type != IPV6_SRCRT_TYPE_4)
+ return false;
+
+ if (((srh->hdrlen + 1) << 3) != len)
+ return false;
+
+ if (!reduced && srh->segments_left > srh->first_segment) {
+ return false;
+ } else {
+ max_last_entry = (srh->hdrlen / 2) - 1;
+
+ if (srh->first_segment > max_last_entry)
+ return false;
+
+ if (srh->segments_left > srh->first_segment + 1)
+ return false;
+ }
+
+ tlv_offset = sizeof(*srh) + ((srh->first_segment + 1) << 4);
+
+ trailing = len - tlv_offset;
+ if (trailing < 0)
+ return false;
+
+ while (trailing) {
+ struct sr6_tlv *tlv;
+ unsigned int tlv_len;
+
+ if (trailing < sizeof(*tlv))
+ return false;
+
+ tlv = (struct sr6_tlv *)((unsigned char *)srh + tlv_offset);
+ tlv_len = sizeof(*tlv) + tlv->len;
+
+ trailing -= tlv_len;
+ if (trailing < 0)
+ return false;
+
+ tlv_offset += tlv_len;
+ }
+
+ return true;
+}
+
+struct ipv6_sr_hdr *seg6_get_srh(struct sk_buff *skb, int flags)
+{
+ struct ipv6_sr_hdr *srh;
+ int len, srhoff = 0;
+
+ if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, &flags) < 0)
+ return NULL;
+
+ if (!pskb_may_pull(skb, srhoff + sizeof(*srh)))
+ return NULL;
+
+ srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
+
+ len = (srh->hdrlen + 1) << 3;
+
+ if (!pskb_may_pull(skb, srhoff + len))
+ return NULL;
+
+ /* note that pskb_may_pull may change pointers in header;
+ * for this reason it is necessary to reload them when needed.
+ */
+ srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
+
+ if (!seg6_validate_srh(srh, len, true))
+ return NULL;
+
+ return srh;
+}
+
+/* Determine if an ICMP invoking packet contains a segment routing
+ * header. If it does, extract the offset to the true destination
+ * address, which is in the first segment address.
+ */
+void seg6_icmp_srh(struct sk_buff *skb, struct inet6_skb_parm *opt)
+{
+ __u16 network_header = skb->network_header;
+ struct ipv6_sr_hdr *srh;
+
+ /* Update network header to point to the invoking packet
+ * inside the ICMP packet, so we can use the seg6_get_srh()
+ * helper.
+ */
+ skb_reset_network_header(skb);
+
+ srh = seg6_get_srh(skb, 0);
+ if (!srh)
+ goto out;
+
+ if (srh->type != IPV6_SRCRT_TYPE_4)
+ goto out;
+
+ opt->flags |= IP6SKB_SEG6;
+ opt->srhoff = (unsigned char *)srh - skb->data;
+
+out:
+ /* Restore the network header back to the ICMP packet */
+ skb->network_header = network_header;
+}
+
+static struct genl_family seg6_genl_family;
+
+static const struct nla_policy seg6_genl_policy[SEG6_ATTR_MAX + 1] = {
+ [SEG6_ATTR_DST] = { .type = NLA_BINARY,
+ .len = sizeof(struct in6_addr) },
+ [SEG6_ATTR_DSTLEN] = { .type = NLA_S32, },
+ [SEG6_ATTR_HMACKEYID] = { .type = NLA_U32, },
+ [SEG6_ATTR_SECRET] = { .type = NLA_BINARY, },
+ [SEG6_ATTR_SECRETLEN] = { .type = NLA_U8, },
+ [SEG6_ATTR_ALGID] = { .type = NLA_U8, },
+ [SEG6_ATTR_HMACINFO] = { .type = NLA_NESTED, },
+};
+
+#ifdef CONFIG_IPV6_SEG6_HMAC
+
+static int seg6_genl_sethmac(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net *net = genl_info_net(info);
+ struct seg6_pernet_data *sdata;
+ struct seg6_hmac_info *hinfo;
+ u32 hmackeyid;
+ char *secret;
+ int err = 0;
+ u8 algid;
+ u8 slen;
+
+ sdata = seg6_pernet(net);
+
+ if (!info->attrs[SEG6_ATTR_HMACKEYID] ||
+ !info->attrs[SEG6_ATTR_SECRETLEN] ||
+ !info->attrs[SEG6_ATTR_ALGID])
+ return -EINVAL;
+
+ hmackeyid = nla_get_u32(info->attrs[SEG6_ATTR_HMACKEYID]);
+ slen = nla_get_u8(info->attrs[SEG6_ATTR_SECRETLEN]);
+ algid = nla_get_u8(info->attrs[SEG6_ATTR_ALGID]);
+
+ if (hmackeyid == 0)
+ return -EINVAL;
+
+ if (slen > SEG6_HMAC_SECRET_LEN)
+ return -EINVAL;
+
+ mutex_lock(&sdata->lock);
+ hinfo = seg6_hmac_info_lookup(net, hmackeyid);
+
+ if (!slen) {
+ err = seg6_hmac_info_del(net, hmackeyid);
+
+ goto out_unlock;
+ }
+
+ if (!info->attrs[SEG6_ATTR_SECRET]) {
+ err = -EINVAL;
+ goto out_unlock;
+ }
+
+ if (slen > nla_len(info->attrs[SEG6_ATTR_SECRET])) {
+ err = -EINVAL;
+ goto out_unlock;
+ }
+
+ if (hinfo) {
+ err = seg6_hmac_info_del(net, hmackeyid);
+ if (err)
+ goto out_unlock;
+ }
+
+ secret = (char *)nla_data(info->attrs[SEG6_ATTR_SECRET]);
+
+ hinfo = kzalloc(sizeof(*hinfo), GFP_KERNEL);
+ if (!hinfo) {
+ err = -ENOMEM;
+ goto out_unlock;
+ }
+
+ memcpy(hinfo->secret, secret, slen);
+ hinfo->slen = slen;
+ hinfo->alg_id = algid;
+ hinfo->hmackeyid = hmackeyid;
+
+ err = seg6_hmac_info_add(net, hmackeyid, hinfo);
+ if (err)
+ kfree(hinfo);
+
+out_unlock:
+ mutex_unlock(&sdata->lock);
+ return err;
+}
+
+#else
+
+static int seg6_genl_sethmac(struct sk_buff *skb, struct genl_info *info)
+{
+ return -ENOTSUPP;
+}
+
+#endif
+
+static int seg6_genl_set_tunsrc(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net *net = genl_info_net(info);
+ struct in6_addr *val, *t_old, *t_new;
+ struct seg6_pernet_data *sdata;
+
+ sdata = seg6_pernet(net);
+
+ if (!info->attrs[SEG6_ATTR_DST])
+ return -EINVAL;
+
+ val = nla_data(info->attrs[SEG6_ATTR_DST]);
+ t_new = kmemdup(val, sizeof(*val), GFP_KERNEL);
+ if (!t_new)
+ return -ENOMEM;
+
+ mutex_lock(&sdata->lock);
+
+ t_old = sdata->tun_src;
+ rcu_assign_pointer(sdata->tun_src, t_new);
+
+ mutex_unlock(&sdata->lock);
+
+ synchronize_net();
+ kfree(t_old);
+
+ return 0;
+}
+
+static int seg6_genl_get_tunsrc(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net *net = genl_info_net(info);
+ struct in6_addr *tun_src;
+ struct sk_buff *msg;
+ void *hdr;
+
+ msg = genlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq,
+ &seg6_genl_family, 0, SEG6_CMD_GET_TUNSRC);
+ if (!hdr)
+ goto free_msg;
+
+ rcu_read_lock();
+ tun_src = rcu_dereference(seg6_pernet(net)->tun_src);
+
+ if (nla_put(msg, SEG6_ATTR_DST, sizeof(struct in6_addr), tun_src))
+ goto nla_put_failure;
+
+ rcu_read_unlock();
+
+ genlmsg_end(msg, hdr);
+ return genlmsg_reply(msg, info);
+
+nla_put_failure:
+ rcu_read_unlock();
+free_msg:
+ nlmsg_free(msg);
+ return -ENOMEM;
+}
+
+#ifdef CONFIG_IPV6_SEG6_HMAC
+
+static int __seg6_hmac_fill_info(struct seg6_hmac_info *hinfo,
+ struct sk_buff *msg)
+{
+ if (nla_put_u32(msg, SEG6_ATTR_HMACKEYID, hinfo->hmackeyid) ||
+ nla_put_u8(msg, SEG6_ATTR_SECRETLEN, hinfo->slen) ||
+ nla_put(msg, SEG6_ATTR_SECRET, hinfo->slen, hinfo->secret) ||
+ nla_put_u8(msg, SEG6_ATTR_ALGID, hinfo->alg_id))
+ return -1;
+
+ return 0;
+}
+
+static int __seg6_genl_dumphmac_element(struct seg6_hmac_info *hinfo,
+ u32 portid, u32 seq, u32 flags,
+ struct sk_buff *skb, u8 cmd)
+{
+ void *hdr;
+
+ hdr = genlmsg_put(skb, portid, seq, &seg6_genl_family, flags, cmd);
+ if (!hdr)
+ return -ENOMEM;
+
+ if (__seg6_hmac_fill_info(hinfo, skb) < 0)
+ goto nla_put_failure;
+
+ genlmsg_end(skb, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(skb, hdr);
+ return -EMSGSIZE;
+}
+
+static int seg6_genl_dumphmac_start(struct netlink_callback *cb)
+{
+ struct net *net = sock_net(cb->skb->sk);
+ struct seg6_pernet_data *sdata;
+ struct rhashtable_iter *iter;
+
+ sdata = seg6_pernet(net);
+ iter = (struct rhashtable_iter *)cb->args[0];
+
+ if (!iter) {
+ iter = kmalloc(sizeof(*iter), GFP_KERNEL);
+ if (!iter)
+ return -ENOMEM;
+
+ cb->args[0] = (long)iter;
+ }
+
+ rhashtable_walk_enter(&sdata->hmac_infos, iter);
+
+ return 0;
+}
+
+static int seg6_genl_dumphmac_done(struct netlink_callback *cb)
+{
+ struct rhashtable_iter *iter = (struct rhashtable_iter *)cb->args[0];
+
+ rhashtable_walk_exit(iter);
+
+ kfree(iter);
+
+ return 0;
+}
+
+static int seg6_genl_dumphmac(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct rhashtable_iter *iter = (struct rhashtable_iter *)cb->args[0];
+ struct seg6_hmac_info *hinfo;
+ int ret;
+
+ rhashtable_walk_start(iter);
+
+ for (;;) {
+ hinfo = rhashtable_walk_next(iter);
+
+ if (IS_ERR(hinfo)) {
+ if (PTR_ERR(hinfo) == -EAGAIN)
+ continue;
+ ret = PTR_ERR(hinfo);
+ goto done;
+ } else if (!hinfo) {
+ break;
+ }
+
+ ret = __seg6_genl_dumphmac_element(hinfo,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NLM_F_MULTI,
+ skb, SEG6_CMD_DUMPHMAC);
+ if (ret)
+ goto done;
+ }
+
+ ret = skb->len;
+
+done:
+ rhashtable_walk_stop(iter);
+ return ret;
+}
+
+#else
+
+static int seg6_genl_dumphmac_start(struct netlink_callback *cb)
+{
+ return 0;
+}
+
+static int seg6_genl_dumphmac_done(struct netlink_callback *cb)
+{
+ return 0;
+}
+
+static int seg6_genl_dumphmac(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ return -ENOTSUPP;
+}
+
+#endif
+
+static int __net_init seg6_net_init(struct net *net)
+{
+ struct seg6_pernet_data *sdata;
+
+ sdata = kzalloc(sizeof(*sdata), GFP_KERNEL);
+ if (!sdata)
+ return -ENOMEM;
+
+ mutex_init(&sdata->lock);
+
+ sdata->tun_src = kzalloc(sizeof(*sdata->tun_src), GFP_KERNEL);
+ if (!sdata->tun_src) {
+ kfree(sdata);
+ return -ENOMEM;
+ }
+
+ net->ipv6.seg6_data = sdata;
+
+ if (seg6_hmac_net_init(net)) {
+ kfree(rcu_dereference_raw(sdata->tun_src));
+ kfree(sdata);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void __net_exit seg6_net_exit(struct net *net)
+{
+ struct seg6_pernet_data *sdata = seg6_pernet(net);
+
+ seg6_hmac_net_exit(net);
+
+ kfree(rcu_dereference_raw(sdata->tun_src));
+ kfree(sdata);
+}
+
+static struct pernet_operations ip6_segments_ops = {
+ .init = seg6_net_init,
+ .exit = seg6_net_exit,
+};
+
+static const struct genl_ops seg6_genl_ops[] = {
+ {
+ .cmd = SEG6_CMD_SETHMAC,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = seg6_genl_sethmac,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = SEG6_CMD_DUMPHMAC,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .start = seg6_genl_dumphmac_start,
+ .dumpit = seg6_genl_dumphmac,
+ .done = seg6_genl_dumphmac_done,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = SEG6_CMD_SET_TUNSRC,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = seg6_genl_set_tunsrc,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = SEG6_CMD_GET_TUNSRC,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = seg6_genl_get_tunsrc,
+ .flags = GENL_ADMIN_PERM,
+ },
+};
+
+static struct genl_family seg6_genl_family __ro_after_init = {
+ .hdrsize = 0,
+ .name = SEG6_GENL_NAME,
+ .version = SEG6_GENL_VERSION,
+ .maxattr = SEG6_ATTR_MAX,
+ .policy = seg6_genl_policy,
+ .netnsok = true,
+ .parallel_ops = true,
+ .ops = seg6_genl_ops,
+ .n_ops = ARRAY_SIZE(seg6_genl_ops),
+ .resv_start_op = SEG6_CMD_GET_TUNSRC + 1,
+ .module = THIS_MODULE,
+};
+
+int __init seg6_init(void)
+{
+ int err;
+
+ err = register_pernet_subsys(&ip6_segments_ops);
+ if (err)
+ goto out;
+
+ err = genl_register_family(&seg6_genl_family);
+ if (err)
+ goto out_unregister_pernet;
+
+ err = seg6_iptunnel_init();
+ if (err)
+ goto out_unregister_genl;
+
+ err = seg6_local_init();
+ if (err)
+ goto out_unregister_iptun;
+
+ pr_info("Segment Routing with IPv6\n");
+
+out:
+ return err;
+out_unregister_iptun:
+ seg6_iptunnel_exit();
+out_unregister_genl:
+ genl_unregister_family(&seg6_genl_family);
+out_unregister_pernet:
+ unregister_pernet_subsys(&ip6_segments_ops);
+ goto out;
+}
+
+void seg6_exit(void)
+{
+ seg6_local_exit();
+ seg6_iptunnel_exit();
+ genl_unregister_family(&seg6_genl_family);
+ unregister_pernet_subsys(&ip6_segments_ops);
+}
diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c
new file mode 100644
index 000000000000..ee6bac0160ac
--- /dev/null
+++ b/net/ipv6/seg6_hmac.c
@@ -0,0 +1,319 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * SR-IPv6 implementation -- HMAC functions
+ *
+ * Author:
+ * David Lebrun <david.lebrun@uclouvain.be>
+ */
+
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/netdevice.h>
+#include <linux/in6.h>
+#include <linux/icmpv6.h>
+#include <linux/mroute6.h>
+#include <linux/rhashtable.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv6.h>
+
+#include <net/sock.h>
+#include <net/snmp.h>
+
+#include <net/ipv6.h>
+#include <net/protocol.h>
+#include <net/transp_v6.h>
+#include <net/rawv6.h>
+#include <net/ndisc.h>
+#include <net/ip6_route.h>
+#include <net/addrconf.h>
+#include <net/xfrm.h>
+
+#include <crypto/sha1.h>
+#include <crypto/sha2.h>
+#include <crypto/utils.h>
+#include <net/seg6.h>
+#include <net/genetlink.h>
+#include <net/seg6_hmac.h>
+#include <linux/random.h>
+
+struct hmac_storage {
+ local_lock_t bh_lock;
+ char hmac_ring[SEG6_HMAC_RING_SIZE];
+};
+
+static DEFINE_PER_CPU(struct hmac_storage, hmac_storage) = {
+ .bh_lock = INIT_LOCAL_LOCK(bh_lock),
+};
+
+static int seg6_hmac_cmpfn(struct rhashtable_compare_arg *arg, const void *obj)
+{
+ const struct seg6_hmac_info *hinfo = obj;
+
+ return (hinfo->hmackeyid != *(__u32 *)arg->key);
+}
+
+static inline void seg6_hinfo_release(struct seg6_hmac_info *hinfo)
+{
+ kfree_rcu(hinfo, rcu);
+}
+
+static void seg6_free_hi(void *ptr, void *arg)
+{
+ struct seg6_hmac_info *hinfo = (struct seg6_hmac_info *)ptr;
+
+ if (hinfo)
+ seg6_hinfo_release(hinfo);
+}
+
+static const struct rhashtable_params rht_params = {
+ .head_offset = offsetof(struct seg6_hmac_info, node),
+ .key_offset = offsetof(struct seg6_hmac_info, hmackeyid),
+ .key_len = sizeof(u32),
+ .automatic_shrinking = true,
+ .obj_cmpfn = seg6_hmac_cmpfn,
+};
+
+static struct sr6_tlv_hmac *seg6_get_tlv_hmac(struct ipv6_sr_hdr *srh)
+{
+ struct sr6_tlv_hmac *tlv;
+
+ if (srh->hdrlen < (srh->first_segment + 1) * 2 + 5)
+ return NULL;
+
+ if (!sr_has_hmac(srh))
+ return NULL;
+
+ tlv = (struct sr6_tlv_hmac *)
+ ((char *)srh + ((srh->hdrlen + 1) << 3) - 40);
+
+ if (tlv->tlvhdr.type != SR6_TLV_HMAC || tlv->tlvhdr.len != 38)
+ return NULL;
+
+ return tlv;
+}
+
+int seg6_hmac_compute(struct seg6_hmac_info *hinfo, struct ipv6_sr_hdr *hdr,
+ struct in6_addr *saddr, u8 *output)
+{
+ __be32 hmackeyid = cpu_to_be32(hinfo->hmackeyid);
+ int plen, i, ret = 0;
+ char *ring, *off;
+
+ /* saddr(16) + first_seg(1) + flags(1) + keyid(4) + seglist(16n) */
+ plen = 16 + 1 + 1 + 4 + (hdr->first_segment + 1) * 16;
+
+ /* this limit allows for 14 segments */
+ if (plen >= SEG6_HMAC_RING_SIZE)
+ return -EMSGSIZE;
+
+ /* Let's build the HMAC text on the ring buffer. The text is composed
+ * as follows, in order:
+ *
+ * 1. Source IPv6 address (128 bits)
+ * 2. first_segment value (8 bits)
+ * 3. Flags (8 bits)
+ * 4. HMAC Key ID (32 bits)
+ * 5. All segments in the segments list (n * 128 bits)
+ */
+
+ local_bh_disable();
+ local_lock_nested_bh(&hmac_storage.bh_lock);
+ ring = this_cpu_ptr(hmac_storage.hmac_ring);
+ off = ring;
+
+ /* source address */
+ memcpy(off, saddr, 16);
+ off += 16;
+
+ /* first_segment value */
+ *off++ = hdr->first_segment;
+
+ /* flags */
+ *off++ = hdr->flags;
+
+ /* HMAC Key ID */
+ memcpy(off, &hmackeyid, 4);
+ off += 4;
+
+ /* all segments in the list */
+ for (i = 0; i < hdr->first_segment + 1; i++) {
+ memcpy(off, hdr->segments + i, 16);
+ off += 16;
+ }
+
+ switch (hinfo->alg_id) {
+ case SEG6_HMAC_ALGO_SHA1:
+ hmac_sha1(&hinfo->key.sha1, ring, plen, output);
+ static_assert(SEG6_HMAC_FIELD_LEN > SHA1_DIGEST_SIZE);
+ memset(&output[SHA1_DIGEST_SIZE], 0,
+ SEG6_HMAC_FIELD_LEN - SHA1_DIGEST_SIZE);
+ break;
+ case SEG6_HMAC_ALGO_SHA256:
+ hmac_sha256(&hinfo->key.sha256, ring, plen, output);
+ static_assert(SEG6_HMAC_FIELD_LEN == SHA256_DIGEST_SIZE);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ ret = -EINVAL;
+ break;
+ }
+ local_unlock_nested_bh(&hmac_storage.bh_lock);
+ local_bh_enable();
+ return ret;
+}
+EXPORT_SYMBOL(seg6_hmac_compute);
+
+/* checks if an incoming SR-enabled packet's HMAC status matches
+ * the incoming policy.
+ *
+ * called with rcu_read_lock()
+ */
+bool seg6_hmac_validate_skb(struct sk_buff *skb)
+{
+ u8 hmac_output[SEG6_HMAC_FIELD_LEN];
+ struct net *net = dev_net(skb->dev);
+ struct seg6_hmac_info *hinfo;
+ struct sr6_tlv_hmac *tlv;
+ struct ipv6_sr_hdr *srh;
+ struct inet6_dev *idev;
+ int require_hmac;
+
+ idev = __in6_dev_get(skb->dev);
+
+ srh = (struct ipv6_sr_hdr *)skb_transport_header(skb);
+
+ tlv = seg6_get_tlv_hmac(srh);
+
+ require_hmac = READ_ONCE(idev->cnf.seg6_require_hmac);
+ /* mandatory check but no tlv */
+ if (require_hmac > 0 && !tlv)
+ return false;
+
+ /* no check */
+ if (require_hmac < 0)
+ return true;
+
+ /* check only if present */
+ if (require_hmac == 0 && !tlv)
+ return true;
+
+ /* now, seg6_require_hmac >= 0 && tlv */
+
+ hinfo = seg6_hmac_info_lookup(net, be32_to_cpu(tlv->hmackeyid));
+ if (!hinfo)
+ return false;
+
+ if (seg6_hmac_compute(hinfo, srh, &ipv6_hdr(skb)->saddr, hmac_output))
+ return false;
+
+ if (crypto_memneq(hmac_output, tlv->hmac, SEG6_HMAC_FIELD_LEN))
+ return false;
+
+ return true;
+}
+EXPORT_SYMBOL(seg6_hmac_validate_skb);
+
+/* called with rcu_read_lock() */
+struct seg6_hmac_info *seg6_hmac_info_lookup(struct net *net, u32 key)
+{
+ struct seg6_pernet_data *sdata = seg6_pernet(net);
+ struct seg6_hmac_info *hinfo;
+
+ hinfo = rhashtable_lookup_fast(&sdata->hmac_infos, &key, rht_params);
+
+ return hinfo;
+}
+EXPORT_SYMBOL(seg6_hmac_info_lookup);
+
+int seg6_hmac_info_add(struct net *net, u32 key, struct seg6_hmac_info *hinfo)
+{
+ struct seg6_pernet_data *sdata = seg6_pernet(net);
+ int err;
+
+ switch (hinfo->alg_id) {
+ case SEG6_HMAC_ALGO_SHA1:
+ hmac_sha1_preparekey(&hinfo->key.sha1,
+ hinfo->secret, hinfo->slen);
+ break;
+ case SEG6_HMAC_ALGO_SHA256:
+ hmac_sha256_preparekey(&hinfo->key.sha256,
+ hinfo->secret, hinfo->slen);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ err = rhashtable_lookup_insert_fast(&sdata->hmac_infos, &hinfo->node,
+ rht_params);
+
+ return err;
+}
+EXPORT_SYMBOL(seg6_hmac_info_add);
+
+int seg6_hmac_info_del(struct net *net, u32 key)
+{
+ struct seg6_pernet_data *sdata = seg6_pernet(net);
+ struct seg6_hmac_info *hinfo;
+ int err = -ENOENT;
+
+ hinfo = rhashtable_lookup_fast(&sdata->hmac_infos, &key, rht_params);
+ if (!hinfo)
+ goto out;
+
+ err = rhashtable_remove_fast(&sdata->hmac_infos, &hinfo->node,
+ rht_params);
+ if (err)
+ goto out;
+
+ seg6_hinfo_release(hinfo);
+
+out:
+ return err;
+}
+EXPORT_SYMBOL(seg6_hmac_info_del);
+
+int seg6_push_hmac(struct net *net, struct in6_addr *saddr,
+ struct ipv6_sr_hdr *srh)
+{
+ struct seg6_hmac_info *hinfo;
+ struct sr6_tlv_hmac *tlv;
+ int err = -ENOENT;
+
+ tlv = seg6_get_tlv_hmac(srh);
+ if (!tlv)
+ return -EINVAL;
+
+ rcu_read_lock();
+
+ hinfo = seg6_hmac_info_lookup(net, be32_to_cpu(tlv->hmackeyid));
+ if (!hinfo)
+ goto out;
+
+ memset(tlv->hmac, 0, SEG6_HMAC_FIELD_LEN);
+ err = seg6_hmac_compute(hinfo, srh, saddr, tlv->hmac);
+
+out:
+ rcu_read_unlock();
+ return err;
+}
+EXPORT_SYMBOL(seg6_push_hmac);
+
+int __net_init seg6_hmac_net_init(struct net *net)
+{
+ struct seg6_pernet_data *sdata = seg6_pernet(net);
+
+ return rhashtable_init(&sdata->hmac_infos, &rht_params);
+}
+
+void __net_exit seg6_hmac_net_exit(struct net *net)
+{
+ struct seg6_pernet_data *sdata = seg6_pernet(net);
+
+ rhashtable_free_and_destroy(&sdata->hmac_infos, seg6_free_hi, NULL);
+}
+EXPORT_SYMBOL(seg6_hmac_net_exit);
diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c
new file mode 100644
index 000000000000..3e1b9991131a
--- /dev/null
+++ b/net/ipv6/seg6_iptunnel.c
@@ -0,0 +1,779 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * SR-IPv6 implementation
+ *
+ * Author:
+ * David Lebrun <david.lebrun@uclouvain.be>
+ */
+
+#include <linux/types.h>
+#include <linux/skbuff.h>
+#include <linux/net.h>
+#include <linux/module.h>
+#include <net/ip.h>
+#include <net/ip_tunnels.h>
+#include <net/lwtunnel.h>
+#include <net/netevent.h>
+#include <net/netns/generic.h>
+#include <net/ip6_fib.h>
+#include <net/route.h>
+#include <net/seg6.h>
+#include <linux/seg6.h>
+#include <linux/seg6_iptunnel.h>
+#include <net/addrconf.h>
+#include <net/ip6_route.h>
+#include <net/dst_cache.h>
+#ifdef CONFIG_IPV6_SEG6_HMAC
+#include <net/seg6_hmac.h>
+#endif
+#include <linux/netfilter.h>
+
+static size_t seg6_lwt_headroom(struct seg6_iptunnel_encap *tuninfo)
+{
+ int head = 0;
+
+ switch (tuninfo->mode) {
+ case SEG6_IPTUN_MODE_INLINE:
+ break;
+ case SEG6_IPTUN_MODE_ENCAP:
+ case SEG6_IPTUN_MODE_ENCAP_RED:
+ head = sizeof(struct ipv6hdr);
+ break;
+ case SEG6_IPTUN_MODE_L2ENCAP:
+ case SEG6_IPTUN_MODE_L2ENCAP_RED:
+ return 0;
+ }
+
+ return ((tuninfo->srh->hdrlen + 1) << 3) + head;
+}
+
+struct seg6_lwt {
+ struct dst_cache cache;
+ struct seg6_iptunnel_encap tuninfo[];
+};
+
+static inline struct seg6_lwt *seg6_lwt_lwtunnel(struct lwtunnel_state *lwt)
+{
+ return (struct seg6_lwt *)lwt->data;
+}
+
+static inline struct seg6_iptunnel_encap *
+seg6_encap_lwtunnel(struct lwtunnel_state *lwt)
+{
+ return seg6_lwt_lwtunnel(lwt)->tuninfo;
+}
+
+static const struct nla_policy seg6_iptunnel_policy[SEG6_IPTUNNEL_MAX + 1] = {
+ [SEG6_IPTUNNEL_SRH] = { .type = NLA_BINARY },
+};
+
+static int nla_put_srh(struct sk_buff *skb, int attrtype,
+ struct seg6_iptunnel_encap *tuninfo)
+{
+ struct seg6_iptunnel_encap *data;
+ struct nlattr *nla;
+ int len;
+
+ len = SEG6_IPTUN_ENCAP_SIZE(tuninfo);
+
+ nla = nla_reserve(skb, attrtype, len);
+ if (!nla)
+ return -EMSGSIZE;
+
+ data = nla_data(nla);
+ memcpy(data, tuninfo, len);
+
+ return 0;
+}
+
+static void set_tun_src(struct net *net, struct net_device *dev,
+ struct in6_addr *daddr, struct in6_addr *saddr)
+{
+ struct seg6_pernet_data *sdata = seg6_pernet(net);
+ struct in6_addr *tun_src;
+
+ rcu_read_lock();
+
+ tun_src = rcu_dereference(sdata->tun_src);
+
+ if (!ipv6_addr_any(tun_src)) {
+ memcpy(saddr, tun_src, sizeof(struct in6_addr));
+ } else {
+ ipv6_dev_get_saddr(net, dev, daddr, IPV6_PREFER_SRC_PUBLIC,
+ saddr);
+ }
+
+ rcu_read_unlock();
+}
+
+/* Compute flowlabel for outer IPv6 header */
+static __be32 seg6_make_flowlabel(struct net *net, struct sk_buff *skb,
+ struct ipv6hdr *inner_hdr)
+{
+ int do_flowlabel = net->ipv6.sysctl.seg6_flowlabel;
+ __be32 flowlabel = 0;
+ u32 hash;
+
+ if (do_flowlabel > 0) {
+ hash = skb_get_hash(skb);
+ hash = rol32(hash, 16);
+ flowlabel = (__force __be32)hash & IPV6_FLOWLABEL_MASK;
+ } else if (!do_flowlabel && skb->protocol == htons(ETH_P_IPV6)) {
+ flowlabel = ip6_flowlabel(inner_hdr);
+ }
+ return flowlabel;
+}
+
+static int __seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh,
+ int proto, struct dst_entry *cache_dst)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ struct net_device *dev = dst_dev(dst);
+ struct net *net = dev_net(dev);
+ struct ipv6hdr *hdr, *inner_hdr;
+ struct ipv6_sr_hdr *isrh;
+ int hdrlen, tot_len, err;
+ __be32 flowlabel;
+
+ hdrlen = (osrh->hdrlen + 1) << 3;
+ tot_len = hdrlen + sizeof(*hdr);
+
+ err = skb_cow_head(skb, tot_len + dst_dev_overhead(cache_dst, skb));
+ if (unlikely(err))
+ return err;
+
+ inner_hdr = ipv6_hdr(skb);
+ flowlabel = seg6_make_flowlabel(net, skb, inner_hdr);
+
+ skb_push(skb, tot_len);
+ skb_reset_network_header(skb);
+ skb_mac_header_rebuild(skb);
+ hdr = ipv6_hdr(skb);
+
+ /* inherit tc, flowlabel and hlim
+ * hlim will be decremented in ip6_forward() afterwards and
+ * decapsulation will overwrite inner hlim with outer hlim
+ */
+
+ if (skb->protocol == htons(ETH_P_IPV6)) {
+ ip6_flow_hdr(hdr, ip6_tclass(ip6_flowinfo(inner_hdr)),
+ flowlabel);
+ hdr->hop_limit = inner_hdr->hop_limit;
+ } else {
+ ip6_flow_hdr(hdr, 0, flowlabel);
+ hdr->hop_limit = ip6_dst_hoplimit(skb_dst(skb));
+
+ memset(IP6CB(skb), 0, sizeof(*IP6CB(skb)));
+
+ /* the control block has been erased, so we have to set the
+ * iif once again.
+ * We read the receiving interface index directly from the
+ * skb->skb_iif as it is done in the IPv4 receiving path (i.e.:
+ * ip_rcv_core(...)).
+ */
+ IP6CB(skb)->iif = skb->skb_iif;
+ }
+
+ hdr->nexthdr = NEXTHDR_ROUTING;
+
+ isrh = (void *)hdr + sizeof(*hdr);
+ memcpy(isrh, osrh, hdrlen);
+
+ isrh->nexthdr = proto;
+
+ hdr->daddr = isrh->segments[isrh->first_segment];
+ set_tun_src(net, dev, &hdr->daddr, &hdr->saddr);
+
+#ifdef CONFIG_IPV6_SEG6_HMAC
+ if (sr_has_hmac(isrh)) {
+ err = seg6_push_hmac(net, &hdr->saddr, isrh);
+ if (unlikely(err))
+ return err;
+ }
+#endif
+
+ hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
+
+ skb_postpush_rcsum(skb, hdr, tot_len);
+
+ return 0;
+}
+
+/* encapsulate an IPv6 packet within an outer IPv6 header with a given SRH */
+int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto)
+{
+ return __seg6_do_srh_encap(skb, osrh, proto, NULL);
+}
+EXPORT_SYMBOL_GPL(seg6_do_srh_encap);
+
+/* encapsulate an IPv6 packet within an outer IPv6 header with reduced SRH */
+static int seg6_do_srh_encap_red(struct sk_buff *skb,
+ struct ipv6_sr_hdr *osrh, int proto,
+ struct dst_entry *cache_dst)
+{
+ __u8 first_seg = osrh->first_segment;
+ struct dst_entry *dst = skb_dst(skb);
+ struct net_device *dev = dst_dev(dst);
+ struct net *net = dev_net(dev);
+ struct ipv6hdr *hdr, *inner_hdr;
+ int hdrlen = ipv6_optlen(osrh);
+ int red_tlv_offset, tlv_offset;
+ struct ipv6_sr_hdr *isrh;
+ bool skip_srh = false;
+ __be32 flowlabel;
+ int tot_len, err;
+ int red_hdrlen;
+ int tlvs_len;
+
+ if (first_seg > 0) {
+ red_hdrlen = hdrlen - sizeof(struct in6_addr);
+ } else {
+ /* NOTE: if tag/flags and/or other TLVs are introduced in the
+ * seg6_iptunnel infrastructure, they should be considered when
+ * deciding to skip the SRH.
+ */
+ skip_srh = !sr_has_hmac(osrh);
+
+ red_hdrlen = skip_srh ? 0 : hdrlen;
+ }
+
+ tot_len = red_hdrlen + sizeof(struct ipv6hdr);
+
+ err = skb_cow_head(skb, tot_len + dst_dev_overhead(cache_dst, skb));
+ if (unlikely(err))
+ return err;
+
+ inner_hdr = ipv6_hdr(skb);
+ flowlabel = seg6_make_flowlabel(net, skb, inner_hdr);
+
+ skb_push(skb, tot_len);
+ skb_reset_network_header(skb);
+ skb_mac_header_rebuild(skb);
+ hdr = ipv6_hdr(skb);
+
+ /* based on seg6_do_srh_encap() */
+ if (skb->protocol == htons(ETH_P_IPV6)) {
+ ip6_flow_hdr(hdr, ip6_tclass(ip6_flowinfo(inner_hdr)),
+ flowlabel);
+ hdr->hop_limit = inner_hdr->hop_limit;
+ } else {
+ ip6_flow_hdr(hdr, 0, flowlabel);
+ hdr->hop_limit = ip6_dst_hoplimit(skb_dst(skb));
+
+ memset(IP6CB(skb), 0, sizeof(*IP6CB(skb)));
+ IP6CB(skb)->iif = skb->skb_iif;
+ }
+
+ /* no matter if we have to skip the SRH or not, the first segment
+ * always comes in the pushed IPv6 header.
+ */
+ hdr->daddr = osrh->segments[first_seg];
+
+ if (skip_srh) {
+ hdr->nexthdr = proto;
+
+ set_tun_src(net, dev, &hdr->daddr, &hdr->saddr);
+ goto out;
+ }
+
+ /* we cannot skip the SRH, slow path */
+
+ hdr->nexthdr = NEXTHDR_ROUTING;
+ isrh = (void *)hdr + sizeof(struct ipv6hdr);
+
+ if (unlikely(!first_seg)) {
+ /* this is a very rare case; we have only one SID but
+ * we cannot skip the SRH since we are carrying some
+ * other info.
+ */
+ memcpy(isrh, osrh, hdrlen);
+ goto srcaddr;
+ }
+
+ tlv_offset = sizeof(*osrh) + (first_seg + 1) * sizeof(struct in6_addr);
+ red_tlv_offset = tlv_offset - sizeof(struct in6_addr);
+
+ memcpy(isrh, osrh, red_tlv_offset);
+
+ tlvs_len = hdrlen - tlv_offset;
+ if (unlikely(tlvs_len > 0)) {
+ const void *s = (const void *)osrh + tlv_offset;
+ void *d = (void *)isrh + red_tlv_offset;
+
+ memcpy(d, s, tlvs_len);
+ }
+
+ --isrh->first_segment;
+ isrh->hdrlen -= 2;
+
+srcaddr:
+ isrh->nexthdr = proto;
+ set_tun_src(net, dev, &hdr->daddr, &hdr->saddr);
+
+#ifdef CONFIG_IPV6_SEG6_HMAC
+ if (unlikely(!skip_srh && sr_has_hmac(isrh))) {
+ err = seg6_push_hmac(net, &hdr->saddr, isrh);
+ if (unlikely(err))
+ return err;
+ }
+#endif
+
+out:
+ hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
+
+ skb_postpush_rcsum(skb, hdr, tot_len);
+
+ return 0;
+}
+
+static int __seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh,
+ struct dst_entry *cache_dst)
+{
+ struct ipv6hdr *hdr, *oldhdr;
+ struct ipv6_sr_hdr *isrh;
+ int hdrlen, err;
+
+ hdrlen = (osrh->hdrlen + 1) << 3;
+
+ err = skb_cow_head(skb, hdrlen + dst_dev_overhead(cache_dst, skb));
+ if (unlikely(err))
+ return err;
+
+ oldhdr = ipv6_hdr(skb);
+
+ skb_pull(skb, sizeof(struct ipv6hdr));
+ skb_postpull_rcsum(skb, skb_network_header(skb),
+ sizeof(struct ipv6hdr));
+
+ skb_push(skb, sizeof(struct ipv6hdr) + hdrlen);
+ skb_reset_network_header(skb);
+ skb_mac_header_rebuild(skb);
+
+ hdr = ipv6_hdr(skb);
+
+ memmove(hdr, oldhdr, sizeof(*hdr));
+
+ isrh = (void *)hdr + sizeof(*hdr);
+ memcpy(isrh, osrh, hdrlen);
+
+ isrh->nexthdr = hdr->nexthdr;
+ hdr->nexthdr = NEXTHDR_ROUTING;
+
+ isrh->segments[0] = hdr->daddr;
+ hdr->daddr = isrh->segments[isrh->first_segment];
+
+#ifdef CONFIG_IPV6_SEG6_HMAC
+ if (sr_has_hmac(isrh)) {
+ struct net *net = skb_dst_dev_net(skb);
+
+ err = seg6_push_hmac(net, &hdr->saddr, isrh);
+ if (unlikely(err))
+ return err;
+ }
+#endif
+
+ hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
+
+ skb_postpush_rcsum(skb, hdr, sizeof(struct ipv6hdr) + hdrlen);
+
+ return 0;
+}
+
+static int seg6_do_srh(struct sk_buff *skb, struct dst_entry *cache_dst)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ struct seg6_iptunnel_encap *tinfo;
+ int proto, err = 0;
+
+ tinfo = seg6_encap_lwtunnel(dst->lwtstate);
+
+ switch (tinfo->mode) {
+ case SEG6_IPTUN_MODE_INLINE:
+ if (skb->protocol != htons(ETH_P_IPV6))
+ return -EINVAL;
+
+ err = __seg6_do_srh_inline(skb, tinfo->srh, cache_dst);
+ if (err)
+ return err;
+ break;
+ case SEG6_IPTUN_MODE_ENCAP:
+ case SEG6_IPTUN_MODE_ENCAP_RED:
+ err = iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6);
+ if (err)
+ return err;
+
+ if (skb->protocol == htons(ETH_P_IPV6))
+ proto = IPPROTO_IPV6;
+ else if (skb->protocol == htons(ETH_P_IP))
+ proto = IPPROTO_IPIP;
+ else
+ return -EINVAL;
+
+ if (tinfo->mode == SEG6_IPTUN_MODE_ENCAP)
+ err = __seg6_do_srh_encap(skb, tinfo->srh,
+ proto, cache_dst);
+ else
+ err = seg6_do_srh_encap_red(skb, tinfo->srh,
+ proto, cache_dst);
+
+ if (err)
+ return err;
+
+ skb_set_inner_transport_header(skb, skb_transport_offset(skb));
+ skb_set_inner_protocol(skb, skb->protocol);
+ skb->protocol = htons(ETH_P_IPV6);
+ break;
+ case SEG6_IPTUN_MODE_L2ENCAP:
+ case SEG6_IPTUN_MODE_L2ENCAP_RED:
+ if (!skb_mac_header_was_set(skb))
+ return -EINVAL;
+
+ if (pskb_expand_head(skb, skb->mac_len, 0, GFP_ATOMIC) < 0)
+ return -ENOMEM;
+
+ skb_mac_header_rebuild(skb);
+ skb_push(skb, skb->mac_len);
+
+ if (tinfo->mode == SEG6_IPTUN_MODE_L2ENCAP)
+ err = __seg6_do_srh_encap(skb, tinfo->srh,
+ IPPROTO_ETHERNET,
+ cache_dst);
+ else
+ err = seg6_do_srh_encap_red(skb, tinfo->srh,
+ IPPROTO_ETHERNET,
+ cache_dst);
+
+ if (err)
+ return err;
+
+ skb->protocol = htons(ETH_P_IPV6);
+ break;
+ }
+
+ skb_set_transport_header(skb, sizeof(struct ipv6hdr));
+ nf_reset_ct(skb);
+
+ return 0;
+}
+
+/* insert an SRH within an IPv6 packet, just after the IPv6 header */
+int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh)
+{
+ return __seg6_do_srh_inline(skb, osrh, NULL);
+}
+EXPORT_SYMBOL_GPL(seg6_do_srh_inline);
+
+static int seg6_input_finish(struct net *net, struct sock *sk,
+ struct sk_buff *skb)
+{
+ return dst_input(skb);
+}
+
+static int seg6_input_core(struct net *net, struct sock *sk,
+ struct sk_buff *skb)
+{
+ struct dst_entry *orig_dst = skb_dst(skb);
+ struct dst_entry *dst = NULL;
+ struct lwtunnel_state *lwtst;
+ struct seg6_lwt *slwt;
+ int err;
+
+ /* We cannot dereference "orig_dst" once ip6_route_input() or
+ * skb_dst_drop() is called. However, in order to detect a dst loop, we
+ * need the address of its lwtstate. So, save the address of lwtstate
+ * now and use it later as a comparison.
+ */
+ lwtst = orig_dst->lwtstate;
+
+ slwt = seg6_lwt_lwtunnel(lwtst);
+
+ local_bh_disable();
+ dst = dst_cache_get(&slwt->cache);
+ local_bh_enable();
+
+ err = seg6_do_srh(skb, dst);
+ if (unlikely(err)) {
+ dst_release(dst);
+ goto drop;
+ }
+
+ if (!dst) {
+ ip6_route_input(skb);
+ dst = skb_dst(skb);
+
+ /* cache only if we don't create a dst reference loop */
+ if (!dst->error && lwtst != dst->lwtstate) {
+ local_bh_disable();
+ dst_cache_set_ip6(&slwt->cache, dst,
+ &ipv6_hdr(skb)->saddr);
+ local_bh_enable();
+ }
+
+ err = skb_cow_head(skb, LL_RESERVED_SPACE(dst_dev(dst)));
+ if (unlikely(err))
+ goto drop;
+ } else {
+ skb_dst_drop(skb);
+ skb_dst_set(skb, dst);
+ }
+
+ if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled))
+ return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
+ dev_net(skb->dev), NULL, skb, NULL,
+ skb_dst_dev(skb), seg6_input_finish);
+
+ return seg6_input_finish(dev_net(skb->dev), NULL, skb);
+drop:
+ kfree_skb(skb);
+ return err;
+}
+
+static int seg6_input_nf(struct sk_buff *skb)
+{
+ struct net_device *dev = skb_dst_dev(skb);
+ struct net *net = dev_net(skb->dev);
+
+ switch (skb->protocol) {
+ case htons(ETH_P_IP):
+ return NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, NULL,
+ skb, NULL, dev, seg6_input_core);
+ case htons(ETH_P_IPV6):
+ return NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING, net, NULL,
+ skb, NULL, dev, seg6_input_core);
+ }
+
+ return -EINVAL;
+}
+
+static int seg6_input(struct sk_buff *skb)
+{
+ if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled))
+ return seg6_input_nf(skb);
+
+ return seg6_input_core(dev_net(skb->dev), NULL, skb);
+}
+
+static int seg6_output_core(struct net *net, struct sock *sk,
+ struct sk_buff *skb)
+{
+ struct dst_entry *orig_dst = skb_dst(skb);
+ struct dst_entry *dst = NULL;
+ struct seg6_lwt *slwt;
+ int err;
+
+ slwt = seg6_lwt_lwtunnel(orig_dst->lwtstate);
+
+ local_bh_disable();
+ dst = dst_cache_get(&slwt->cache);
+ local_bh_enable();
+
+ err = seg6_do_srh(skb, dst);
+ if (unlikely(err))
+ goto drop;
+
+ if (unlikely(!dst)) {
+ struct ipv6hdr *hdr = ipv6_hdr(skb);
+ struct flowi6 fl6;
+
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.daddr = hdr->daddr;
+ fl6.saddr = hdr->saddr;
+ fl6.flowlabel = ip6_flowinfo(hdr);
+ fl6.flowi6_mark = skb->mark;
+ fl6.flowi6_proto = hdr->nexthdr;
+
+ dst = ip6_route_output(net, NULL, &fl6);
+ if (dst->error) {
+ err = dst->error;
+ goto drop;
+ }
+
+ /* cache only if we don't create a dst reference loop */
+ if (orig_dst->lwtstate != dst->lwtstate) {
+ local_bh_disable();
+ dst_cache_set_ip6(&slwt->cache, dst, &fl6.saddr);
+ local_bh_enable();
+ }
+
+ err = skb_cow_head(skb, LL_RESERVED_SPACE(dst_dev(dst)));
+ if (unlikely(err))
+ goto drop;
+ }
+
+ skb_dst_drop(skb);
+ skb_dst_set(skb, dst);
+
+ if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled))
+ return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb,
+ NULL, dst_dev(dst), dst_output);
+
+ return dst_output(net, sk, skb);
+drop:
+ dst_release(dst);
+ kfree_skb(skb);
+ return err;
+}
+
+static int seg6_output_nf(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ struct net_device *dev = skb_dst_dev(skb);
+
+ switch (skb->protocol) {
+ case htons(ETH_P_IP):
+ return NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, sk, skb,
+ NULL, dev, seg6_output_core);
+ case htons(ETH_P_IPV6):
+ return NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING, net, sk, skb,
+ NULL, dev, seg6_output_core);
+ }
+
+ return -EINVAL;
+}
+
+static int seg6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled))
+ return seg6_output_nf(net, sk, skb);
+
+ return seg6_output_core(net, sk, skb);
+}
+
+static int seg6_build_state(struct net *net, struct nlattr *nla,
+ unsigned int family, const void *cfg,
+ struct lwtunnel_state **ts,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[SEG6_IPTUNNEL_MAX + 1];
+ struct seg6_iptunnel_encap *tuninfo;
+ struct lwtunnel_state *newts;
+ int tuninfo_len, min_size;
+ struct seg6_lwt *slwt;
+ int err;
+
+ if (family != AF_INET && family != AF_INET6)
+ return -EINVAL;
+
+ err = nla_parse_nested_deprecated(tb, SEG6_IPTUNNEL_MAX, nla,
+ seg6_iptunnel_policy, extack);
+
+ if (err < 0)
+ return err;
+
+ if (!tb[SEG6_IPTUNNEL_SRH])
+ return -EINVAL;
+
+ tuninfo = nla_data(tb[SEG6_IPTUNNEL_SRH]);
+ tuninfo_len = nla_len(tb[SEG6_IPTUNNEL_SRH]);
+
+ /* tuninfo must contain at least the iptunnel encap structure,
+ * the SRH and one segment
+ */
+ min_size = sizeof(*tuninfo) + sizeof(struct ipv6_sr_hdr) +
+ sizeof(struct in6_addr);
+ if (tuninfo_len < min_size)
+ return -EINVAL;
+
+ switch (tuninfo->mode) {
+ case SEG6_IPTUN_MODE_INLINE:
+ if (family != AF_INET6)
+ return -EINVAL;
+
+ break;
+ case SEG6_IPTUN_MODE_ENCAP:
+ break;
+ case SEG6_IPTUN_MODE_L2ENCAP:
+ break;
+ case SEG6_IPTUN_MODE_ENCAP_RED:
+ break;
+ case SEG6_IPTUN_MODE_L2ENCAP_RED:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ /* verify that SRH is consistent */
+ if (!seg6_validate_srh(tuninfo->srh, tuninfo_len - sizeof(*tuninfo), false))
+ return -EINVAL;
+
+ newts = lwtunnel_state_alloc(tuninfo_len + sizeof(*slwt));
+ if (!newts)
+ return -ENOMEM;
+
+ slwt = seg6_lwt_lwtunnel(newts);
+
+ err = dst_cache_init(&slwt->cache, GFP_ATOMIC);
+ if (err) {
+ kfree(newts);
+ return err;
+ }
+
+ memcpy(&slwt->tuninfo, tuninfo, tuninfo_len);
+
+ newts->type = LWTUNNEL_ENCAP_SEG6;
+ newts->flags |= LWTUNNEL_STATE_INPUT_REDIRECT;
+
+ if (tuninfo->mode != SEG6_IPTUN_MODE_L2ENCAP)
+ newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT;
+
+ newts->headroom = seg6_lwt_headroom(tuninfo);
+
+ *ts = newts;
+
+ return 0;
+}
+
+static void seg6_destroy_state(struct lwtunnel_state *lwt)
+{
+ dst_cache_destroy(&seg6_lwt_lwtunnel(lwt)->cache);
+}
+
+static int seg6_fill_encap_info(struct sk_buff *skb,
+ struct lwtunnel_state *lwtstate)
+{
+ struct seg6_iptunnel_encap *tuninfo = seg6_encap_lwtunnel(lwtstate);
+
+ if (nla_put_srh(skb, SEG6_IPTUNNEL_SRH, tuninfo))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static int seg6_encap_nlsize(struct lwtunnel_state *lwtstate)
+{
+ struct seg6_iptunnel_encap *tuninfo = seg6_encap_lwtunnel(lwtstate);
+
+ return nla_total_size(SEG6_IPTUN_ENCAP_SIZE(tuninfo));
+}
+
+static int seg6_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
+{
+ struct seg6_iptunnel_encap *a_hdr = seg6_encap_lwtunnel(a);
+ struct seg6_iptunnel_encap *b_hdr = seg6_encap_lwtunnel(b);
+ int len = SEG6_IPTUN_ENCAP_SIZE(a_hdr);
+
+ if (len != SEG6_IPTUN_ENCAP_SIZE(b_hdr))
+ return 1;
+
+ return memcmp(a_hdr, b_hdr, len);
+}
+
+static const struct lwtunnel_encap_ops seg6_iptun_ops = {
+ .build_state = seg6_build_state,
+ .destroy_state = seg6_destroy_state,
+ .output = seg6_output,
+ .input = seg6_input,
+ .fill_encap = seg6_fill_encap_info,
+ .get_encap_size = seg6_encap_nlsize,
+ .cmp_encap = seg6_encap_cmp,
+ .owner = THIS_MODULE,
+};
+
+int __init seg6_iptunnel_init(void)
+{
+ return lwtunnel_encap_add_ops(&seg6_iptun_ops, LWTUNNEL_ENCAP_SEG6);
+}
+
+void seg6_iptunnel_exit(void)
+{
+ lwtunnel_encap_del_ops(&seg6_iptun_ops, LWTUNNEL_ENCAP_SEG6);
+}
diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c
new file mode 100644
index 000000000000..2b41e4c0dddd
--- /dev/null
+++ b/net/ipv6/seg6_local.c
@@ -0,0 +1,2720 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * SR-IPv6 implementation
+ *
+ * Authors:
+ * David Lebrun <david.lebrun@uclouvain.be>
+ * eBPF support: Mathieu Xhonneux <m.xhonneux@gmail.com>
+ */
+
+#include <linux/filter.h>
+#include <linux/types.h>
+#include <linux/skbuff.h>
+#include <linux/net.h>
+#include <linux/module.h>
+#include <net/ip.h>
+#include <net/lwtunnel.h>
+#include <net/netevent.h>
+#include <net/netns/generic.h>
+#include <net/ip6_fib.h>
+#include <net/route.h>
+#include <net/seg6.h>
+#include <linux/seg6.h>
+#include <linux/seg6_local.h>
+#include <net/addrconf.h>
+#include <net/ip6_route.h>
+#include <net/dst_cache.h>
+#include <net/ip_tunnels.h>
+#ifdef CONFIG_IPV6_SEG6_HMAC
+#include <net/seg6_hmac.h>
+#endif
+#include <net/seg6_local.h>
+#include <linux/etherdevice.h>
+#include <linux/bpf.h>
+#include <linux/netfilter.h>
+
+#define SEG6_F_ATTR(i) BIT(i)
+
+struct seg6_local_lwt;
+
+/* callbacks used for customizing the creation and destruction of a behavior */
+struct seg6_local_lwtunnel_ops {
+ int (*build_state)(struct seg6_local_lwt *slwt, const void *cfg,
+ struct netlink_ext_ack *extack);
+ void (*destroy_state)(struct seg6_local_lwt *slwt);
+};
+
+struct seg6_action_desc {
+ int action;
+ unsigned long attrs;
+
+ /* The optattrs field is used for specifying all the optional
+ * attributes supported by a specific behavior.
+ * It means that if one of these attributes is not provided in the
+ * netlink message during the behavior creation, no errors will be
+ * returned to the userspace.
+ *
+ * Each attribute can be only of two types (mutually exclusive):
+ * 1) required or 2) optional.
+ * Every user MUST obey to this rule! If you set an attribute as
+ * required the same attribute CANNOT be set as optional and vice
+ * versa.
+ */
+ unsigned long optattrs;
+
+ int (*input)(struct sk_buff *skb, struct seg6_local_lwt *slwt);
+ int static_headroom;
+
+ struct seg6_local_lwtunnel_ops slwt_ops;
+};
+
+struct bpf_lwt_prog {
+ struct bpf_prog *prog;
+ char *name;
+};
+
+/* default length values (expressed in bits) for both Locator-Block and
+ * Locator-Node Function.
+ *
+ * Both SEG6_LOCAL_LCBLOCK_DBITS and SEG6_LOCAL_LCNODE_FN_DBITS *must* be:
+ * i) greater than 0;
+ * ii) evenly divisible by 8. In other terms, the lengths of the
+ * Locator-Block and Locator-Node Function must be byte-aligned (we can
+ * relax this constraint in the future if really needed).
+ *
+ * Moreover, a third condition must hold:
+ * iii) SEG6_LOCAL_LCBLOCK_DBITS + SEG6_LOCAL_LCNODE_FN_DBITS <= 128.
+ *
+ * The correctness of SEG6_LOCAL_LCBLOCK_DBITS and SEG6_LOCAL_LCNODE_FN_DBITS
+ * values are checked during the kernel compilation. If the compilation stops,
+ * check the value of these parameters to see if they meet conditions (i), (ii)
+ * and (iii).
+ */
+#define SEG6_LOCAL_LCBLOCK_DBITS 32
+#define SEG6_LOCAL_LCNODE_FN_DBITS 16
+
+/* The following next_csid_chk_{cntr,lcblock,lcblock_fn}_bits macros can be
+ * used directly to check whether the lengths (in bits) of Locator-Block and
+ * Locator-Node Function are valid according to (i), (ii), (iii).
+ */
+#define next_csid_chk_cntr_bits(blen, flen) \
+ ((blen) + (flen) > 128)
+
+#define next_csid_chk_lcblock_bits(blen) \
+({ \
+ typeof(blen) __tmp = blen; \
+ (!__tmp || __tmp > 120 || (__tmp & 0x07)); \
+})
+
+#define next_csid_chk_lcnode_fn_bits(flen) \
+ next_csid_chk_lcblock_bits(flen)
+
+/* flag indicating that flavors are set up for a given End* behavior */
+#define SEG6_F_LOCAL_FLAVORS SEG6_F_ATTR(SEG6_LOCAL_FLAVORS)
+
+#define SEG6_F_LOCAL_FLV_OP(flvname) BIT(SEG6_LOCAL_FLV_OP_##flvname)
+#define SEG6_F_LOCAL_FLV_NEXT_CSID SEG6_F_LOCAL_FLV_OP(NEXT_CSID)
+#define SEG6_F_LOCAL_FLV_PSP SEG6_F_LOCAL_FLV_OP(PSP)
+
+/* Supported RFC8986 Flavor operations are reported in this bitmask */
+#define SEG6_LOCAL_FLV8986_SUPP_OPS SEG6_F_LOCAL_FLV_PSP
+
+#define SEG6_LOCAL_END_FLV_SUPP_OPS (SEG6_F_LOCAL_FLV_NEXT_CSID | \
+ SEG6_LOCAL_FLV8986_SUPP_OPS)
+#define SEG6_LOCAL_END_X_FLV_SUPP_OPS SEG6_F_LOCAL_FLV_NEXT_CSID
+
+struct seg6_flavors_info {
+ /* Flavor operations */
+ __u32 flv_ops;
+
+ /* Locator-Block length, expressed in bits */
+ __u8 lcblock_bits;
+ /* Locator-Node Function length, expressed in bits*/
+ __u8 lcnode_func_bits;
+};
+
+enum seg6_end_dt_mode {
+ DT_INVALID_MODE = -EINVAL,
+ DT_LEGACY_MODE = 0,
+ DT_VRF_MODE = 1,
+};
+
+struct seg6_end_dt_info {
+ enum seg6_end_dt_mode mode;
+
+ struct net *net;
+ /* VRF device associated to the routing table used by the SRv6
+ * End.DT4/DT6 behavior for routing IPv4/IPv6 packets.
+ */
+ int vrf_ifindex;
+ int vrf_table;
+
+ /* tunneled packet family (IPv4 or IPv6).
+ * Protocol and header length are inferred from family.
+ */
+ u16 family;
+};
+
+struct pcpu_seg6_local_counters {
+ u64_stats_t packets;
+ u64_stats_t bytes;
+ u64_stats_t errors;
+
+ struct u64_stats_sync syncp;
+};
+
+/* This struct groups all the SRv6 Behavior counters supported so far.
+ *
+ * put_nla_counters() makes use of this data structure to collect all counter
+ * values after the per-CPU counter evaluation has been performed.
+ * Finally, each counter value (in seg6_local_counters) is stored in the
+ * corresponding netlink attribute and sent to user space.
+ *
+ * NB: we don't want to expose this structure to user space!
+ */
+struct seg6_local_counters {
+ __u64 packets;
+ __u64 bytes;
+ __u64 errors;
+};
+
+#define seg6_local_alloc_pcpu_counters(__gfp) \
+ __netdev_alloc_pcpu_stats(struct pcpu_seg6_local_counters, \
+ ((__gfp) | __GFP_ZERO))
+
+#define SEG6_F_LOCAL_COUNTERS SEG6_F_ATTR(SEG6_LOCAL_COUNTERS)
+
+struct seg6_local_lwt {
+ int action;
+ struct ipv6_sr_hdr *srh;
+ int table;
+ struct in_addr nh4;
+ struct in6_addr nh6;
+ int iif;
+ int oif;
+ struct bpf_lwt_prog bpf;
+#ifdef CONFIG_NET_L3_MASTER_DEV
+ struct seg6_end_dt_info dt_info;
+#endif
+ struct seg6_flavors_info flv_info;
+
+ struct pcpu_seg6_local_counters __percpu *pcpu_counters;
+
+ int headroom;
+ struct seg6_action_desc *desc;
+ /* unlike the required attrs, we have to track the optional attributes
+ * that have been effectively parsed.
+ */
+ unsigned long parsed_optattrs;
+};
+
+static struct seg6_local_lwt *seg6_local_lwtunnel(struct lwtunnel_state *lwt)
+{
+ return (struct seg6_local_lwt *)lwt->data;
+}
+
+static struct ipv6_sr_hdr *get_and_validate_srh(struct sk_buff *skb)
+{
+ struct ipv6_sr_hdr *srh;
+
+ srh = seg6_get_srh(skb, IP6_FH_F_SKIP_RH);
+ if (!srh)
+ return NULL;
+
+#ifdef CONFIG_IPV6_SEG6_HMAC
+ if (!seg6_hmac_validate_skb(skb))
+ return NULL;
+#endif
+
+ return srh;
+}
+
+static bool decap_and_validate(struct sk_buff *skb, int proto)
+{
+ struct ipv6_sr_hdr *srh;
+ unsigned int off = 0;
+
+ srh = seg6_get_srh(skb, 0);
+ if (srh && srh->segments_left > 0)
+ return false;
+
+#ifdef CONFIG_IPV6_SEG6_HMAC
+ if (srh && !seg6_hmac_validate_skb(skb))
+ return false;
+#endif
+
+ if (ipv6_find_hdr(skb, &off, proto, NULL, NULL) < 0)
+ return false;
+
+ if (!pskb_pull(skb, off))
+ return false;
+
+ skb_postpull_rcsum(skb, skb_network_header(skb), off);
+
+ skb_reset_network_header(skb);
+ skb_reset_transport_header(skb);
+ if (iptunnel_pull_offloads(skb))
+ return false;
+
+ return true;
+}
+
+static void advance_nextseg(struct ipv6_sr_hdr *srh, struct in6_addr *daddr)
+{
+ struct in6_addr *addr;
+
+ srh->segments_left--;
+ addr = srh->segments + srh->segments_left;
+ *daddr = *addr;
+}
+
+static int
+seg6_lookup_any_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr,
+ u32 tbl_id, bool local_delivery, int oif)
+{
+ struct net *net = dev_net(skb->dev);
+ struct ipv6hdr *hdr = ipv6_hdr(skb);
+ int flags = RT6_LOOKUP_F_HAS_SADDR;
+ struct dst_entry *dst = NULL;
+ struct rt6_info *rt;
+ struct flowi6 fl6;
+ int dev_flags = 0;
+
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.flowi6_iif = skb->dev->ifindex;
+ fl6.flowi6_oif = oif;
+ fl6.daddr = nhaddr ? *nhaddr : hdr->daddr;
+ fl6.saddr = hdr->saddr;
+ fl6.flowlabel = ip6_flowinfo(hdr);
+ fl6.flowi6_mark = skb->mark;
+ fl6.flowi6_proto = hdr->nexthdr;
+
+ if (nhaddr)
+ fl6.flowi6_flags = FLOWI_FLAG_KNOWN_NH;
+
+ if (!tbl_id && !oif) {
+ dst = ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags);
+ } else if (tbl_id) {
+ struct fib6_table *table;
+
+ table = fib6_get_table(net, tbl_id);
+ if (!table)
+ goto out;
+
+ rt = ip6_pol_route(net, table, oif, &fl6, skb, flags);
+ dst = &rt->dst;
+ } else {
+ dst = ip6_route_output(net, NULL, &fl6);
+ }
+
+ /* we want to discard traffic destined for local packet processing,
+ * if @local_delivery is set to false.
+ */
+ if (!local_delivery)
+ dev_flags |= IFF_LOOPBACK;
+
+ if (dst && (dst_dev(dst)->flags & dev_flags) && !dst->error) {
+ dst_release(dst);
+ dst = NULL;
+ }
+
+out:
+ if (!dst) {
+ rt = net->ipv6.ip6_blk_hole_entry;
+ dst = &rt->dst;
+ dst_hold(dst);
+ }
+
+ skb_dst_drop(skb);
+ skb_dst_set(skb, dst);
+ return dst->error;
+}
+
+int seg6_lookup_nexthop(struct sk_buff *skb,
+ struct in6_addr *nhaddr, u32 tbl_id)
+{
+ return seg6_lookup_any_nexthop(skb, nhaddr, tbl_id, false, 0);
+}
+
+static __u8 seg6_flv_lcblock_octects(const struct seg6_flavors_info *finfo)
+{
+ return finfo->lcblock_bits >> 3;
+}
+
+static __u8 seg6_flv_lcnode_func_octects(const struct seg6_flavors_info *finfo)
+{
+ return finfo->lcnode_func_bits >> 3;
+}
+
+static bool seg6_next_csid_is_arg_zero(const struct in6_addr *addr,
+ const struct seg6_flavors_info *finfo)
+{
+ __u8 fnc_octects = seg6_flv_lcnode_func_octects(finfo);
+ __u8 blk_octects = seg6_flv_lcblock_octects(finfo);
+ __u8 arg_octects;
+ int i;
+
+ arg_octects = 16 - blk_octects - fnc_octects;
+ for (i = 0; i < arg_octects; ++i) {
+ if (addr->s6_addr[blk_octects + fnc_octects + i] != 0x00)
+ return false;
+ }
+
+ return true;
+}
+
+/* assume that DA.Argument length > 0 */
+static void seg6_next_csid_advance_arg(struct in6_addr *addr,
+ const struct seg6_flavors_info *finfo)
+{
+ __u8 fnc_octects = seg6_flv_lcnode_func_octects(finfo);
+ __u8 blk_octects = seg6_flv_lcblock_octects(finfo);
+
+ /* advance DA.Argument */
+ memmove(&addr->s6_addr[blk_octects],
+ &addr->s6_addr[blk_octects + fnc_octects],
+ 16 - blk_octects - fnc_octects);
+
+ memset(&addr->s6_addr[16 - fnc_octects], 0x00, fnc_octects);
+}
+
+static int input_action_end_finish(struct sk_buff *skb,
+ struct seg6_local_lwt *slwt)
+{
+ seg6_lookup_nexthop(skb, NULL, 0);
+
+ return dst_input(skb);
+}
+
+static int input_action_end_core(struct sk_buff *skb,
+ struct seg6_local_lwt *slwt)
+{
+ struct ipv6_sr_hdr *srh;
+
+ srh = get_and_validate_srh(skb);
+ if (!srh)
+ goto drop;
+
+ advance_nextseg(srh, &ipv6_hdr(skb)->daddr);
+
+ return input_action_end_finish(skb, slwt);
+
+drop:
+ kfree_skb(skb);
+ return -EINVAL;
+}
+
+static int end_next_csid_core(struct sk_buff *skb, struct seg6_local_lwt *slwt)
+{
+ const struct seg6_flavors_info *finfo = &slwt->flv_info;
+ struct in6_addr *daddr = &ipv6_hdr(skb)->daddr;
+
+ if (seg6_next_csid_is_arg_zero(daddr, finfo))
+ return input_action_end_core(skb, slwt);
+
+ /* update DA */
+ seg6_next_csid_advance_arg(daddr, finfo);
+
+ return input_action_end_finish(skb, slwt);
+}
+
+static int input_action_end_x_finish(struct sk_buff *skb,
+ struct seg6_local_lwt *slwt)
+{
+ seg6_lookup_any_nexthop(skb, &slwt->nh6, 0, false, slwt->oif);
+
+ return dst_input(skb);
+}
+
+static int input_action_end_x_core(struct sk_buff *skb,
+ struct seg6_local_lwt *slwt)
+{
+ struct ipv6_sr_hdr *srh;
+
+ srh = get_and_validate_srh(skb);
+ if (!srh)
+ goto drop;
+
+ advance_nextseg(srh, &ipv6_hdr(skb)->daddr);
+
+ return input_action_end_x_finish(skb, slwt);
+
+drop:
+ kfree_skb(skb);
+ return -EINVAL;
+}
+
+static int end_x_next_csid_core(struct sk_buff *skb,
+ struct seg6_local_lwt *slwt)
+{
+ const struct seg6_flavors_info *finfo = &slwt->flv_info;
+ struct in6_addr *daddr = &ipv6_hdr(skb)->daddr;
+
+ if (seg6_next_csid_is_arg_zero(daddr, finfo))
+ return input_action_end_x_core(skb, slwt);
+
+ /* update DA */
+ seg6_next_csid_advance_arg(daddr, finfo);
+
+ return input_action_end_x_finish(skb, slwt);
+}
+
+static bool seg6_next_csid_enabled(__u32 fops)
+{
+ return fops & SEG6_F_LOCAL_FLV_NEXT_CSID;
+}
+
+/* Processing of SRv6 End, End.X, and End.T behaviors can be extended through
+ * the flavors framework. These behaviors must report the subset of (flavor)
+ * operations they currently implement. In this way, if a user specifies a
+ * flavor combination that is not supported by a given End* behavior, the
+ * kernel refuses to instantiate the tunnel reporting the error.
+ */
+static int seg6_flv_supp_ops_by_action(int action, __u32 *fops)
+{
+ switch (action) {
+ case SEG6_LOCAL_ACTION_END:
+ *fops = SEG6_LOCAL_END_FLV_SUPP_OPS;
+ break;
+ case SEG6_LOCAL_ACTION_END_X:
+ *fops = SEG6_LOCAL_END_X_FLV_SUPP_OPS;
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
+}
+
+/* We describe the packet state in relation to the absence/presence of the SRH
+ * and the Segment Left (SL) field.
+ * For our purposes, it is not necessary to record the exact value of the SL
+ * when the SID List consists of two or more segments.
+ */
+enum seg6_local_pktinfo {
+ /* the order really matters! */
+ SEG6_LOCAL_PKTINFO_NOHDR = 0,
+ SEG6_LOCAL_PKTINFO_SL_ZERO,
+ SEG6_LOCAL_PKTINFO_SL_ONE,
+ SEG6_LOCAL_PKTINFO_SL_MORE,
+ __SEG6_LOCAL_PKTINFO_MAX,
+};
+
+#define SEG6_LOCAL_PKTINFO_MAX (__SEG6_LOCAL_PKTINFO_MAX - 1)
+
+static enum seg6_local_pktinfo seg6_get_srh_pktinfo(struct ipv6_sr_hdr *srh)
+{
+ __u8 sgl;
+
+ if (!srh)
+ return SEG6_LOCAL_PKTINFO_NOHDR;
+
+ sgl = srh->segments_left;
+ if (sgl < 2)
+ return SEG6_LOCAL_PKTINFO_SL_ZERO + sgl;
+
+ return SEG6_LOCAL_PKTINFO_SL_MORE;
+}
+
+enum seg6_local_flv_action {
+ SEG6_LOCAL_FLV_ACT_UNSPEC = 0,
+ SEG6_LOCAL_FLV_ACT_END,
+ SEG6_LOCAL_FLV_ACT_PSP,
+ SEG6_LOCAL_FLV_ACT_USP,
+ SEG6_LOCAL_FLV_ACT_USD,
+ __SEG6_LOCAL_FLV_ACT_MAX
+};
+
+#define SEG6_LOCAL_FLV_ACT_MAX (__SEG6_LOCAL_FLV_ACT_MAX - 1)
+
+/* The action table for RFC8986 flavors (see the flv8986_act_tbl below)
+ * contains the actions (i.e. processing operations) to be applied on packets
+ * when flavors are configured for an End* behavior.
+ * By combining the pkinfo data and from the flavors mask, the macro
+ * computes the index used to access the elements (actions) stored in the
+ * action table. The index is structured as follows:
+ *
+ * index
+ * _______________/\________________
+ * / \
+ * +----------------+----------------+
+ * | pf | afm |
+ * +----------------+----------------+
+ * ph-1 ... p1 p0 fk-1 ... f1 f0
+ * MSB LSB
+ *
+ * where:
+ * - 'afm' (adjusted flavor mask) is the mask containing a combination of the
+ * RFC8986 flavors currently supported. 'afm' corresponds to the @fm
+ * argument of the macro whose value is righ-shifted by 1 bit. By doing so,
+ * we discard the SEG6_LOCAL_FLV_OP_UNSPEC flag (bit 0 in @fm) which is
+ * never used here;
+ * - 'pf' encodes the packet info (pktinfo) regarding the presence/absence of
+ * the SRH, SL = 0, etc. 'pf' is set with the value of @pf provided as
+ * argument to the macro.
+ */
+#define flv8986_act_tbl_idx(pf, fm) \
+ ((((pf) << bits_per(SEG6_LOCAL_FLV8986_SUPP_OPS)) | \
+ ((fm) & SEG6_LOCAL_FLV8986_SUPP_OPS)) >> SEG6_LOCAL_FLV_OP_PSP)
+
+/* We compute the size of the action table by considering the RFC8986 flavors
+ * actually supported by the kernel. In this way, the size is automatically
+ * adjusted when new flavors are supported.
+ */
+#define FLV8986_ACT_TBL_SIZE \
+ roundup_pow_of_two(flv8986_act_tbl_idx(SEG6_LOCAL_PKTINFO_MAX, \
+ SEG6_LOCAL_FLV8986_SUPP_OPS))
+
+/* tbl_cfg(act, pf, fm) macro is used to easily configure the action
+ * table; it accepts 3 arguments:
+ * i) @act, the suffix from SEG6_LOCAL_FLV_ACT_{act} representing
+ * the action that should be applied on the packet;
+ * ii) @pf, the suffix from SEG6_LOCAL_PKTINFO_{pf} reporting the packet
+ * info about the lack/presence of SRH, SRH with SL = 0, etc;
+ * iii) @fm, the mask of flavors.
+ */
+#define tbl_cfg(act, pf, fm) \
+ [flv8986_act_tbl_idx(SEG6_LOCAL_PKTINFO_##pf, \
+ (fm))] = SEG6_LOCAL_FLV_ACT_##act
+
+/* shorthand for improving readability */
+#define F_PSP SEG6_F_LOCAL_FLV_PSP
+
+/* The table contains, for each combination of the pktinfo data and
+ * flavors, the action that should be taken on a packet (e.g.
+ * "standard" Endpoint processing, Penultimate Segment Pop, etc).
+ *
+ * By default, table entries not explicitly configured are initialized with the
+ * SEG6_LOCAL_FLV_ACT_UNSPEC action, which generally has the effect of
+ * discarding the processed packet.
+ */
+static const u8 flv8986_act_tbl[FLV8986_ACT_TBL_SIZE] = {
+ /* PSP variant for packet where SRH with SL = 1 */
+ tbl_cfg(PSP, SL_ONE, F_PSP),
+ /* End for packet where the SRH with SL > 1*/
+ tbl_cfg(END, SL_MORE, F_PSP),
+};
+
+#undef F_PSP
+#undef tbl_cfg
+
+/* For each flavor defined in RFC8986 (or a combination of them) an action is
+ * performed on the packet. The specific action depends on:
+ * - info extracted from the packet (i.e. pktinfo data) regarding the
+ * lack/presence of the SRH, and if the SRH is available, on the value of
+ * Segment Left field;
+ * - the mask of flavors configured for the specific SRv6 End* behavior.
+ *
+ * The function combines both the pkinfo and the flavors mask to evaluate the
+ * corresponding action to be taken on the packet.
+ */
+static enum seg6_local_flv_action
+seg6_local_flv8986_act_lookup(enum seg6_local_pktinfo pinfo, __u32 flvmask)
+{
+ unsigned long index;
+
+ /* check if the provided mask of flavors is supported */
+ if (unlikely(flvmask & ~SEG6_LOCAL_FLV8986_SUPP_OPS))
+ return SEG6_LOCAL_FLV_ACT_UNSPEC;
+
+ index = flv8986_act_tbl_idx(pinfo, flvmask);
+ if (unlikely(index >= FLV8986_ACT_TBL_SIZE))
+ return SEG6_LOCAL_FLV_ACT_UNSPEC;
+
+ return flv8986_act_tbl[index];
+}
+
+/* skb->data must be aligned with skb->network_header */
+static bool seg6_pop_srh(struct sk_buff *skb, int srhoff)
+{
+ struct ipv6_sr_hdr *srh;
+ struct ipv6hdr *iph;
+ __u8 srh_nexthdr;
+ int thoff = -1;
+ int srhlen;
+ int nhlen;
+
+ if (unlikely(srhoff < sizeof(*iph) ||
+ !pskb_may_pull(skb, srhoff + sizeof(*srh))))
+ return false;
+
+ srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
+ srhlen = ipv6_optlen(srh);
+
+ /* we are about to mangle the pkt, let's check if we can write on it */
+ if (unlikely(skb_ensure_writable(skb, srhoff + srhlen)))
+ return false;
+
+ /* skb_ensure_writable() may change skb pointers; evaluate srh again */
+ srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
+ srh_nexthdr = srh->nexthdr;
+
+ if (unlikely(!skb_transport_header_was_set(skb)))
+ goto pull;
+
+ nhlen = skb_network_header_len(skb);
+ /* we have to deal with the transport header: it could be set before
+ * the SRH, after the SRH, or within it (which is considered wrong,
+ * however).
+ */
+ if (likely(nhlen <= srhoff))
+ thoff = nhlen;
+ else if (nhlen >= srhoff + srhlen)
+ /* transport_header is set after the SRH */
+ thoff = nhlen - srhlen;
+ else
+ /* transport_header falls inside the SRH; hence, we can't
+ * restore the transport_header pointer properly after
+ * SRH removing operation.
+ */
+ return false;
+pull:
+ /* we need to pop the SRH:
+ * 1) first of all, we pull out everything from IPv6 header up to SRH
+ * (included) evaluating also the rcsum;
+ * 2) we overwrite (and then remove) the SRH by properly moving the
+ * IPv6 along with any extension header that precedes the SRH;
+ * 3) At the end, we push back the pulled headers (except for SRH,
+ * obviously).
+ */
+ skb_pull_rcsum(skb, srhoff + srhlen);
+ memmove(skb_network_header(skb) + srhlen, skb_network_header(skb),
+ srhoff);
+ skb_push(skb, srhoff);
+
+ skb_reset_network_header(skb);
+ skb_mac_header_rebuild(skb);
+ if (likely(thoff >= 0))
+ skb_set_transport_header(skb, thoff);
+
+ iph = ipv6_hdr(skb);
+ if (iph->nexthdr == NEXTHDR_ROUTING) {
+ iph->nexthdr = srh_nexthdr;
+ } else {
+ /* we must look for the extension header (EXTH, for short) that
+ * immediately precedes the SRH we have just removed.
+ * Then, we update the value of the EXTH nexthdr with the one
+ * contained in the SRH nexthdr.
+ */
+ unsigned int off = sizeof(*iph);
+ struct ipv6_opt_hdr *hp, _hdr;
+ __u8 nexthdr = iph->nexthdr;
+
+ for (;;) {
+ if (unlikely(!ipv6_ext_hdr(nexthdr) ||
+ nexthdr == NEXTHDR_NONE))
+ return false;
+
+ hp = skb_header_pointer(skb, off, sizeof(_hdr), &_hdr);
+ if (unlikely(!hp))
+ return false;
+
+ if (hp->nexthdr == NEXTHDR_ROUTING) {
+ hp->nexthdr = srh_nexthdr;
+ break;
+ }
+
+ switch (nexthdr) {
+ case NEXTHDR_FRAGMENT:
+ fallthrough;
+ case NEXTHDR_AUTH:
+ /* we expect SRH before FRAG and AUTH */
+ return false;
+ default:
+ off += ipv6_optlen(hp);
+ break;
+ }
+
+ nexthdr = hp->nexthdr;
+ }
+ }
+
+ iph->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
+
+ skb_postpush_rcsum(skb, iph, srhoff);
+
+ return true;
+}
+
+/* process the packet on the basis of the RFC8986 flavors set for the given
+ * SRv6 End behavior instance.
+ */
+static int end_flv8986_core(struct sk_buff *skb, struct seg6_local_lwt *slwt)
+{
+ const struct seg6_flavors_info *finfo = &slwt->flv_info;
+ enum seg6_local_flv_action action;
+ enum seg6_local_pktinfo pinfo;
+ struct ipv6_sr_hdr *srh;
+ __u32 flvmask;
+ int srhoff;
+
+ srh = seg6_get_srh(skb, 0);
+ srhoff = srh ? ((unsigned char *)srh - skb->data) : 0;
+ pinfo = seg6_get_srh_pktinfo(srh);
+#ifdef CONFIG_IPV6_SEG6_HMAC
+ if (srh && !seg6_hmac_validate_skb(skb))
+ goto drop;
+#endif
+ flvmask = finfo->flv_ops;
+ if (unlikely(flvmask & ~SEG6_LOCAL_FLV8986_SUPP_OPS)) {
+ pr_warn_once("seg6local: invalid RFC8986 flavors\n");
+ goto drop;
+ }
+
+ /* retrieve the action triggered by the combination of pktinfo data and
+ * the flavors mask.
+ */
+ action = seg6_local_flv8986_act_lookup(pinfo, flvmask);
+ switch (action) {
+ case SEG6_LOCAL_FLV_ACT_END:
+ /* process the packet as the "standard" End behavior */
+ advance_nextseg(srh, &ipv6_hdr(skb)->daddr);
+ break;
+ case SEG6_LOCAL_FLV_ACT_PSP:
+ advance_nextseg(srh, &ipv6_hdr(skb)->daddr);
+
+ if (unlikely(!seg6_pop_srh(skb, srhoff)))
+ goto drop;
+ break;
+ case SEG6_LOCAL_FLV_ACT_UNSPEC:
+ fallthrough;
+ default:
+ /* by default, we drop the packet since we could not find a
+ * suitable action.
+ */
+ goto drop;
+ }
+
+ return input_action_end_finish(skb, slwt);
+
+drop:
+ kfree_skb(skb);
+ return -EINVAL;
+}
+
+/* regular endpoint function */
+static int input_action_end(struct sk_buff *skb, struct seg6_local_lwt *slwt)
+{
+ const struct seg6_flavors_info *finfo = &slwt->flv_info;
+ __u32 fops = finfo->flv_ops;
+
+ if (!fops)
+ return input_action_end_core(skb, slwt);
+
+ /* check for the presence of NEXT-C-SID since it applies first */
+ if (seg6_next_csid_enabled(fops))
+ return end_next_csid_core(skb, slwt);
+
+ /* the specific processing function to be performed on the packet
+ * depends on the combination of flavors defined in RFC8986 and some
+ * information extracted from the packet, e.g. presence/absence of SRH,
+ * Segment Left = 0, etc.
+ */
+ return end_flv8986_core(skb, slwt);
+}
+
+/* regular endpoint, and forward to specified nexthop */
+static int input_action_end_x(struct sk_buff *skb, struct seg6_local_lwt *slwt)
+{
+ const struct seg6_flavors_info *finfo = &slwt->flv_info;
+ __u32 fops = finfo->flv_ops;
+
+ /* check for the presence of NEXT-C-SID since it applies first */
+ if (seg6_next_csid_enabled(fops))
+ return end_x_next_csid_core(skb, slwt);
+
+ return input_action_end_x_core(skb, slwt);
+}
+
+static int input_action_end_t(struct sk_buff *skb, struct seg6_local_lwt *slwt)
+{
+ struct ipv6_sr_hdr *srh;
+
+ srh = get_and_validate_srh(skb);
+ if (!srh)
+ goto drop;
+
+ advance_nextseg(srh, &ipv6_hdr(skb)->daddr);
+
+ seg6_lookup_nexthop(skb, NULL, slwt->table);
+
+ return dst_input(skb);
+
+drop:
+ kfree_skb(skb);
+ return -EINVAL;
+}
+
+/* decapsulate and forward inner L2 frame on specified interface */
+static int input_action_end_dx2(struct sk_buff *skb,
+ struct seg6_local_lwt *slwt)
+{
+ struct net *net = dev_net(skb->dev);
+ struct net_device *odev;
+ struct ethhdr *eth;
+
+ if (!decap_and_validate(skb, IPPROTO_ETHERNET))
+ goto drop;
+
+ if (!pskb_may_pull(skb, ETH_HLEN))
+ goto drop;
+
+ skb_reset_mac_header(skb);
+ eth = (struct ethhdr *)skb->data;
+
+ /* To determine the frame's protocol, we assume it is 802.3. This avoids
+ * a call to eth_type_trans(), which is not really relevant for our
+ * use case.
+ */
+ if (!eth_proto_is_802_3(eth->h_proto))
+ goto drop;
+
+ odev = dev_get_by_index_rcu(net, slwt->oif);
+ if (!odev)
+ goto drop;
+
+ /* As we accept Ethernet frames, make sure the egress device is of
+ * the correct type.
+ */
+ if (odev->type != ARPHRD_ETHER)
+ goto drop;
+
+ if (!(odev->flags & IFF_UP) || !netif_carrier_ok(odev))
+ goto drop;
+
+ skb_orphan(skb);
+
+ if (skb_warn_if_lro(skb))
+ goto drop;
+
+ skb_forward_csum(skb);
+
+ if (skb->len - ETH_HLEN > odev->mtu)
+ goto drop;
+
+ skb->dev = odev;
+ skb->protocol = eth->h_proto;
+
+ return dev_queue_xmit(skb);
+
+drop:
+ kfree_skb(skb);
+ return -EINVAL;
+}
+
+static int input_action_end_dx6_finish(struct net *net, struct sock *sk,
+ struct sk_buff *skb)
+{
+ struct dst_entry *orig_dst = skb_dst(skb);
+ struct in6_addr *nhaddr = NULL;
+ struct seg6_local_lwt *slwt;
+
+ slwt = seg6_local_lwtunnel(orig_dst->lwtstate);
+
+ /* The inner packet is not associated to any local interface,
+ * so we do not call netif_rx().
+ *
+ * If slwt->nh6 is set to ::, then lookup the nexthop for the
+ * inner packet's DA. Otherwise, use the specified nexthop.
+ */
+ if (!ipv6_addr_any(&slwt->nh6))
+ nhaddr = &slwt->nh6;
+
+ seg6_lookup_nexthop(skb, nhaddr, 0);
+
+ return dst_input(skb);
+}
+
+/* decapsulate and forward to specified nexthop */
+static int input_action_end_dx6(struct sk_buff *skb,
+ struct seg6_local_lwt *slwt)
+{
+ /* this function accepts IPv6 encapsulated packets, with either
+ * an SRH with SL=0, or no SRH.
+ */
+
+ if (!decap_and_validate(skb, IPPROTO_IPV6))
+ goto drop;
+
+ if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
+ goto drop;
+
+ skb_set_transport_header(skb, sizeof(struct ipv6hdr));
+ nf_reset_ct(skb);
+
+ if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled))
+ return NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING,
+ dev_net(skb->dev), NULL, skb, skb->dev,
+ NULL, input_action_end_dx6_finish);
+
+ return input_action_end_dx6_finish(dev_net(skb->dev), NULL, skb);
+drop:
+ kfree_skb(skb);
+ return -EINVAL;
+}
+
+static int input_action_end_dx4_finish(struct net *net, struct sock *sk,
+ struct sk_buff *skb)
+{
+ struct dst_entry *orig_dst = skb_dst(skb);
+ enum skb_drop_reason reason;
+ struct seg6_local_lwt *slwt;
+ struct iphdr *iph;
+ __be32 nhaddr;
+
+ slwt = seg6_local_lwtunnel(orig_dst->lwtstate);
+
+ iph = ip_hdr(skb);
+
+ nhaddr = slwt->nh4.s_addr ?: iph->daddr;
+
+ skb_dst_drop(skb);
+
+ reason = ip_route_input(skb, nhaddr, iph->saddr, 0, skb->dev);
+ if (reason) {
+ kfree_skb_reason(skb, reason);
+ return -EINVAL;
+ }
+
+ return dst_input(skb);
+}
+
+static int input_action_end_dx4(struct sk_buff *skb,
+ struct seg6_local_lwt *slwt)
+{
+ if (!decap_and_validate(skb, IPPROTO_IPIP))
+ goto drop;
+
+ if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+ goto drop;
+
+ skb->protocol = htons(ETH_P_IP);
+ skb_set_transport_header(skb, sizeof(struct iphdr));
+ nf_reset_ct(skb);
+
+ if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled))
+ return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING,
+ dev_net(skb->dev), NULL, skb, skb->dev,
+ NULL, input_action_end_dx4_finish);
+
+ return input_action_end_dx4_finish(dev_net(skb->dev), NULL, skb);
+drop:
+ kfree_skb(skb);
+ return -EINVAL;
+}
+
+#ifdef CONFIG_NET_L3_MASTER_DEV
+static struct net *fib6_config_get_net(const struct fib6_config *fib6_cfg)
+{
+ const struct nl_info *nli = &fib6_cfg->fc_nlinfo;
+
+ return nli->nl_net;
+}
+
+static int __seg6_end_dt_vrf_build(struct seg6_local_lwt *slwt, const void *cfg,
+ u16 family, struct netlink_ext_ack *extack)
+{
+ struct seg6_end_dt_info *info = &slwt->dt_info;
+ int vrf_ifindex;
+ struct net *net;
+
+ net = fib6_config_get_net(cfg);
+
+ /* note that vrf_table was already set by parse_nla_vrftable() */
+ vrf_ifindex = l3mdev_ifindex_lookup_by_table_id(L3MDEV_TYPE_VRF, net,
+ info->vrf_table);
+ if (vrf_ifindex < 0) {
+ if (vrf_ifindex == -EPERM) {
+ NL_SET_ERR_MSG(extack,
+ "Strict mode for VRF is disabled");
+ } else if (vrf_ifindex == -ENODEV) {
+ NL_SET_ERR_MSG(extack,
+ "Table has no associated VRF device");
+ } else {
+ pr_debug("seg6local: SRv6 End.DT* creation error=%d\n",
+ vrf_ifindex);
+ }
+
+ return vrf_ifindex;
+ }
+
+ info->net = net;
+ info->vrf_ifindex = vrf_ifindex;
+
+ info->family = family;
+ info->mode = DT_VRF_MODE;
+
+ return 0;
+}
+
+/* The SRv6 End.DT4/DT6 behavior extracts the inner (IPv4/IPv6) packet and
+ * routes the IPv4/IPv6 packet by looking at the configured routing table.
+ *
+ * In the SRv6 End.DT4/DT6 use case, we can receive traffic (IPv6+Segment
+ * Routing Header packets) from several interfaces and the outer IPv6
+ * destination address (DA) is used for retrieving the specific instance of the
+ * End.DT4/DT6 behavior that should process the packets.
+ *
+ * However, the inner IPv4/IPv6 packet is not really bound to any receiving
+ * interface and thus the End.DT4/DT6 sets the VRF (associated with the
+ * corresponding routing table) as the *receiving* interface.
+ * In other words, the End.DT4/DT6 processes a packet as if it has been received
+ * directly by the VRF (and not by one of its slave devices, if any).
+ * In this way, the VRF interface is used for routing the IPv4/IPv6 packet in
+ * according to the routing table configured by the End.DT4/DT6 instance.
+ *
+ * This design allows you to get some interesting features like:
+ * 1) the statistics on rx packets;
+ * 2) the possibility to install a packet sniffer on the receiving interface
+ * (the VRF one) for looking at the incoming packets;
+ * 3) the possibility to leverage the netfilter prerouting hook for the inner
+ * IPv4 packet.
+ *
+ * This function returns:
+ * - the sk_buff* when the VRF rcv handler has processed the packet correctly;
+ * - NULL when the skb is consumed by the VRF rcv handler;
+ * - a pointer which encodes a negative error number in case of error.
+ * Note that in this case, the function takes care of freeing the skb.
+ */
+static struct sk_buff *end_dt_vrf_rcv(struct sk_buff *skb, u16 family,
+ struct net_device *dev)
+{
+ /* based on l3mdev_ip_rcv; we are only interested in the master */
+ if (unlikely(!netif_is_l3_master(dev) && !netif_has_l3_rx_handler(dev)))
+ goto drop;
+
+ if (unlikely(!dev->l3mdev_ops->l3mdev_l3_rcv))
+ goto drop;
+
+ /* the decap packet IPv4/IPv6 does not come with any mac header info.
+ * We must unset the mac header to allow the VRF device to rebuild it,
+ * just in case there is a sniffer attached on the device.
+ */
+ skb_unset_mac_header(skb);
+
+ skb = dev->l3mdev_ops->l3mdev_l3_rcv(dev, skb, family);
+ if (!skb)
+ /* the skb buffer was consumed by the handler */
+ return NULL;
+
+ /* when a packet is received by a VRF or by one of its slaves, the
+ * master device reference is set into the skb.
+ */
+ if (unlikely(skb->dev != dev || skb->skb_iif != dev->ifindex))
+ goto drop;
+
+ return skb;
+
+drop:
+ kfree_skb(skb);
+ return ERR_PTR(-EINVAL);
+}
+
+static struct net_device *end_dt_get_vrf_rcu(struct sk_buff *skb,
+ struct seg6_end_dt_info *info)
+{
+ int vrf_ifindex = info->vrf_ifindex;
+ struct net *net = info->net;
+
+ if (unlikely(vrf_ifindex < 0))
+ goto error;
+
+ if (unlikely(!net_eq(dev_net(skb->dev), net)))
+ goto error;
+
+ return dev_get_by_index_rcu(net, vrf_ifindex);
+
+error:
+ return NULL;
+}
+
+static struct sk_buff *end_dt_vrf_core(struct sk_buff *skb,
+ struct seg6_local_lwt *slwt, u16 family)
+{
+ struct seg6_end_dt_info *info = &slwt->dt_info;
+ struct net_device *vrf;
+ __be16 protocol;
+ int hdrlen;
+
+ vrf = end_dt_get_vrf_rcu(skb, info);
+ if (unlikely(!vrf))
+ goto drop;
+
+ switch (family) {
+ case AF_INET:
+ protocol = htons(ETH_P_IP);
+ hdrlen = sizeof(struct iphdr);
+ break;
+ case AF_INET6:
+ protocol = htons(ETH_P_IPV6);
+ hdrlen = sizeof(struct ipv6hdr);
+ break;
+ case AF_UNSPEC:
+ fallthrough;
+ default:
+ goto drop;
+ }
+
+ if (unlikely(info->family != AF_UNSPEC && info->family != family)) {
+ pr_warn_once("seg6local: SRv6 End.DT* family mismatch");
+ goto drop;
+ }
+
+ skb->protocol = protocol;
+
+ skb_dst_drop(skb);
+
+ skb_set_transport_header(skb, hdrlen);
+ nf_reset_ct(skb);
+
+ return end_dt_vrf_rcv(skb, family, vrf);
+
+drop:
+ kfree_skb(skb);
+ return ERR_PTR(-EINVAL);
+}
+
+static int input_action_end_dt4(struct sk_buff *skb,
+ struct seg6_local_lwt *slwt)
+{
+ enum skb_drop_reason reason;
+ struct iphdr *iph;
+
+ if (!decap_and_validate(skb, IPPROTO_IPIP))
+ goto drop;
+
+ if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+ goto drop;
+
+ skb = end_dt_vrf_core(skb, slwt, AF_INET);
+ if (!skb)
+ /* packet has been processed and consumed by the VRF */
+ return 0;
+
+ if (IS_ERR(skb))
+ return PTR_ERR(skb);
+
+ iph = ip_hdr(skb);
+
+ reason = ip_route_input(skb, iph->daddr, iph->saddr, 0, skb->dev);
+ if (unlikely(reason))
+ goto drop;
+
+ return dst_input(skb);
+
+drop:
+ kfree_skb(skb);
+ return -EINVAL;
+}
+
+static int seg6_end_dt4_build(struct seg6_local_lwt *slwt, const void *cfg,
+ struct netlink_ext_ack *extack)
+{
+ return __seg6_end_dt_vrf_build(slwt, cfg, AF_INET, extack);
+}
+
+static enum
+seg6_end_dt_mode seg6_end_dt6_parse_mode(struct seg6_local_lwt *slwt)
+{
+ unsigned long parsed_optattrs = slwt->parsed_optattrs;
+ bool legacy, vrfmode;
+
+ legacy = !!(parsed_optattrs & SEG6_F_ATTR(SEG6_LOCAL_TABLE));
+ vrfmode = !!(parsed_optattrs & SEG6_F_ATTR(SEG6_LOCAL_VRFTABLE));
+
+ if (!(legacy ^ vrfmode))
+ /* both are absent or present: invalid DT6 mode */
+ return DT_INVALID_MODE;
+
+ return legacy ? DT_LEGACY_MODE : DT_VRF_MODE;
+}
+
+static enum seg6_end_dt_mode seg6_end_dt6_get_mode(struct seg6_local_lwt *slwt)
+{
+ struct seg6_end_dt_info *info = &slwt->dt_info;
+
+ return info->mode;
+}
+
+static int seg6_end_dt6_build(struct seg6_local_lwt *slwt, const void *cfg,
+ struct netlink_ext_ack *extack)
+{
+ enum seg6_end_dt_mode mode = seg6_end_dt6_parse_mode(slwt);
+ struct seg6_end_dt_info *info = &slwt->dt_info;
+
+ switch (mode) {
+ case DT_LEGACY_MODE:
+ info->mode = DT_LEGACY_MODE;
+ return 0;
+ case DT_VRF_MODE:
+ return __seg6_end_dt_vrf_build(slwt, cfg, AF_INET6, extack);
+ default:
+ NL_SET_ERR_MSG(extack, "table or vrftable must be specified");
+ return -EINVAL;
+ }
+}
+#endif
+
+static int input_action_end_dt6(struct sk_buff *skb,
+ struct seg6_local_lwt *slwt)
+{
+ if (!decap_and_validate(skb, IPPROTO_IPV6))
+ goto drop;
+
+ if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
+ goto drop;
+
+#ifdef CONFIG_NET_L3_MASTER_DEV
+ if (seg6_end_dt6_get_mode(slwt) == DT_LEGACY_MODE)
+ goto legacy_mode;
+
+ /* DT6_VRF_MODE */
+ skb = end_dt_vrf_core(skb, slwt, AF_INET6);
+ if (!skb)
+ /* packet has been processed and consumed by the VRF */
+ return 0;
+
+ if (IS_ERR(skb))
+ return PTR_ERR(skb);
+
+ /* note: this time we do not need to specify the table because the VRF
+ * takes care of selecting the correct table.
+ */
+ seg6_lookup_any_nexthop(skb, NULL, 0, true, 0);
+
+ return dst_input(skb);
+
+legacy_mode:
+#endif
+ skb_set_transport_header(skb, sizeof(struct ipv6hdr));
+
+ seg6_lookup_any_nexthop(skb, NULL, slwt->table, true, 0);
+
+ return dst_input(skb);
+
+drop:
+ kfree_skb(skb);
+ return -EINVAL;
+}
+
+#ifdef CONFIG_NET_L3_MASTER_DEV
+static int seg6_end_dt46_build(struct seg6_local_lwt *slwt, const void *cfg,
+ struct netlink_ext_ack *extack)
+{
+ return __seg6_end_dt_vrf_build(slwt, cfg, AF_UNSPEC, extack);
+}
+
+static int input_action_end_dt46(struct sk_buff *skb,
+ struct seg6_local_lwt *slwt)
+{
+ unsigned int off = 0;
+ int nexthdr;
+
+ nexthdr = ipv6_find_hdr(skb, &off, -1, NULL, NULL);
+ if (unlikely(nexthdr < 0))
+ goto drop;
+
+ switch (nexthdr) {
+ case IPPROTO_IPIP:
+ return input_action_end_dt4(skb, slwt);
+ case IPPROTO_IPV6:
+ return input_action_end_dt6(skb, slwt);
+ }
+
+drop:
+ kfree_skb(skb);
+ return -EINVAL;
+}
+#endif
+
+/* push an SRH on top of the current one */
+static int input_action_end_b6(struct sk_buff *skb, struct seg6_local_lwt *slwt)
+{
+ struct ipv6_sr_hdr *srh;
+ int err = -EINVAL;
+
+ srh = get_and_validate_srh(skb);
+ if (!srh)
+ goto drop;
+
+ err = seg6_do_srh_inline(skb, slwt->srh);
+ if (err)
+ goto drop;
+
+ skb_set_transport_header(skb, sizeof(struct ipv6hdr));
+
+ seg6_lookup_nexthop(skb, NULL, 0);
+
+ return dst_input(skb);
+
+drop:
+ kfree_skb(skb);
+ return err;
+}
+
+/* encapsulate within an outer IPv6 header and a specified SRH */
+static int input_action_end_b6_encap(struct sk_buff *skb,
+ struct seg6_local_lwt *slwt)
+{
+ struct ipv6_sr_hdr *srh;
+ int err = -EINVAL;
+
+ srh = get_and_validate_srh(skb);
+ if (!srh)
+ goto drop;
+
+ advance_nextseg(srh, &ipv6_hdr(skb)->daddr);
+
+ skb_reset_inner_headers(skb);
+ skb->encapsulation = 1;
+
+ err = seg6_do_srh_encap(skb, slwt->srh, IPPROTO_IPV6);
+ if (err)
+ goto drop;
+
+ skb_set_transport_header(skb, sizeof(struct ipv6hdr));
+
+ seg6_lookup_nexthop(skb, NULL, 0);
+
+ return dst_input(skb);
+
+drop:
+ kfree_skb(skb);
+ return err;
+}
+
+DEFINE_PER_CPU(struct seg6_bpf_srh_state, seg6_bpf_srh_states) = {
+ .bh_lock = INIT_LOCAL_LOCK(bh_lock),
+};
+
+bool seg6_bpf_has_valid_srh(struct sk_buff *skb)
+{
+ struct seg6_bpf_srh_state *srh_state =
+ this_cpu_ptr(&seg6_bpf_srh_states);
+ struct ipv6_sr_hdr *srh = srh_state->srh;
+
+ lockdep_assert_held(&srh_state->bh_lock);
+ if (unlikely(srh == NULL))
+ return false;
+
+ if (unlikely(!srh_state->valid)) {
+ if ((srh_state->hdrlen & 7) != 0)
+ return false;
+
+ srh->hdrlen = (u8)(srh_state->hdrlen >> 3);
+ if (!seg6_validate_srh(srh, (srh->hdrlen + 1) << 3, true))
+ return false;
+
+ srh_state->valid = true;
+ }
+
+ return true;
+}
+
+static int input_action_end_bpf(struct sk_buff *skb,
+ struct seg6_local_lwt *slwt)
+{
+ struct seg6_bpf_srh_state *srh_state;
+ struct ipv6_sr_hdr *srh;
+ int ret;
+
+ srh = get_and_validate_srh(skb);
+ if (!srh) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+ advance_nextseg(srh, &ipv6_hdr(skb)->daddr);
+
+ /* The access to the per-CPU buffer srh_state is protected by running
+ * always in softirq context (with disabled BH). On PREEMPT_RT the
+ * required locking is provided by the following local_lock_nested_bh()
+ * statement. It is also accessed by the bpf_lwt_seg6_* helpers via
+ * bpf_prog_run_save_cb().
+ */
+ local_lock_nested_bh(&seg6_bpf_srh_states.bh_lock);
+ srh_state = this_cpu_ptr(&seg6_bpf_srh_states);
+ srh_state->srh = srh;
+ srh_state->hdrlen = srh->hdrlen << 3;
+ srh_state->valid = true;
+
+ rcu_read_lock();
+ bpf_compute_data_pointers(skb);
+ ret = bpf_prog_run_save_cb(slwt->bpf.prog, skb);
+ rcu_read_unlock();
+
+ switch (ret) {
+ case BPF_OK:
+ case BPF_REDIRECT:
+ break;
+ case BPF_DROP:
+ goto drop;
+ default:
+ pr_warn_once("bpf-seg6local: Illegal return value %u\n", ret);
+ goto drop;
+ }
+
+ if (srh_state->srh && !seg6_bpf_has_valid_srh(skb))
+ goto drop;
+ local_unlock_nested_bh(&seg6_bpf_srh_states.bh_lock);
+
+ if (ret != BPF_REDIRECT)
+ seg6_lookup_nexthop(skb, NULL, 0);
+
+ return dst_input(skb);
+
+drop:
+ local_unlock_nested_bh(&seg6_bpf_srh_states.bh_lock);
+ kfree_skb(skb);
+ return -EINVAL;
+}
+
+static struct seg6_action_desc seg6_action_table[] = {
+ {
+ .action = SEG6_LOCAL_ACTION_END,
+ .attrs = 0,
+ .optattrs = SEG6_F_LOCAL_COUNTERS |
+ SEG6_F_LOCAL_FLAVORS,
+ .input = input_action_end,
+ },
+ {
+ .action = SEG6_LOCAL_ACTION_END_X,
+ .attrs = SEG6_F_ATTR(SEG6_LOCAL_NH6),
+ .optattrs = SEG6_F_LOCAL_COUNTERS |
+ SEG6_F_LOCAL_FLAVORS |
+ SEG6_F_ATTR(SEG6_LOCAL_OIF),
+ .input = input_action_end_x,
+ },
+ {
+ .action = SEG6_LOCAL_ACTION_END_T,
+ .attrs = SEG6_F_ATTR(SEG6_LOCAL_TABLE),
+ .optattrs = SEG6_F_LOCAL_COUNTERS,
+ .input = input_action_end_t,
+ },
+ {
+ .action = SEG6_LOCAL_ACTION_END_DX2,
+ .attrs = SEG6_F_ATTR(SEG6_LOCAL_OIF),
+ .optattrs = SEG6_F_LOCAL_COUNTERS,
+ .input = input_action_end_dx2,
+ },
+ {
+ .action = SEG6_LOCAL_ACTION_END_DX6,
+ .attrs = SEG6_F_ATTR(SEG6_LOCAL_NH6),
+ .optattrs = SEG6_F_LOCAL_COUNTERS,
+ .input = input_action_end_dx6,
+ },
+ {
+ .action = SEG6_LOCAL_ACTION_END_DX4,
+ .attrs = SEG6_F_ATTR(SEG6_LOCAL_NH4),
+ .optattrs = SEG6_F_LOCAL_COUNTERS,
+ .input = input_action_end_dx4,
+ },
+ {
+ .action = SEG6_LOCAL_ACTION_END_DT4,
+ .attrs = SEG6_F_ATTR(SEG6_LOCAL_VRFTABLE),
+ .optattrs = SEG6_F_LOCAL_COUNTERS,
+#ifdef CONFIG_NET_L3_MASTER_DEV
+ .input = input_action_end_dt4,
+ .slwt_ops = {
+ .build_state = seg6_end_dt4_build,
+ },
+#endif
+ },
+ {
+ .action = SEG6_LOCAL_ACTION_END_DT6,
+#ifdef CONFIG_NET_L3_MASTER_DEV
+ .attrs = 0,
+ .optattrs = SEG6_F_LOCAL_COUNTERS |
+ SEG6_F_ATTR(SEG6_LOCAL_TABLE) |
+ SEG6_F_ATTR(SEG6_LOCAL_VRFTABLE),
+ .slwt_ops = {
+ .build_state = seg6_end_dt6_build,
+ },
+#else
+ .attrs = SEG6_F_ATTR(SEG6_LOCAL_TABLE),
+ .optattrs = SEG6_F_LOCAL_COUNTERS,
+#endif
+ .input = input_action_end_dt6,
+ },
+ {
+ .action = SEG6_LOCAL_ACTION_END_DT46,
+ .attrs = SEG6_F_ATTR(SEG6_LOCAL_VRFTABLE),
+ .optattrs = SEG6_F_LOCAL_COUNTERS,
+#ifdef CONFIG_NET_L3_MASTER_DEV
+ .input = input_action_end_dt46,
+ .slwt_ops = {
+ .build_state = seg6_end_dt46_build,
+ },
+#endif
+ },
+ {
+ .action = SEG6_LOCAL_ACTION_END_B6,
+ .attrs = SEG6_F_ATTR(SEG6_LOCAL_SRH),
+ .optattrs = SEG6_F_LOCAL_COUNTERS,
+ .input = input_action_end_b6,
+ },
+ {
+ .action = SEG6_LOCAL_ACTION_END_B6_ENCAP,
+ .attrs = SEG6_F_ATTR(SEG6_LOCAL_SRH),
+ .optattrs = SEG6_F_LOCAL_COUNTERS,
+ .input = input_action_end_b6_encap,
+ .static_headroom = sizeof(struct ipv6hdr),
+ },
+ {
+ .action = SEG6_LOCAL_ACTION_END_BPF,
+ .attrs = SEG6_F_ATTR(SEG6_LOCAL_BPF),
+ .optattrs = SEG6_F_LOCAL_COUNTERS,
+ .input = input_action_end_bpf,
+ },
+
+};
+
+static struct seg6_action_desc *__get_action_desc(int action)
+{
+ struct seg6_action_desc *desc;
+ int i, count;
+
+ count = ARRAY_SIZE(seg6_action_table);
+ for (i = 0; i < count; i++) {
+ desc = &seg6_action_table[i];
+ if (desc->action == action)
+ return desc;
+ }
+
+ return NULL;
+}
+
+static bool seg6_lwtunnel_counters_enabled(struct seg6_local_lwt *slwt)
+{
+ return slwt->parsed_optattrs & SEG6_F_LOCAL_COUNTERS;
+}
+
+static void seg6_local_update_counters(struct seg6_local_lwt *slwt,
+ unsigned int len, int err)
+{
+ struct pcpu_seg6_local_counters *pcounters;
+
+ pcounters = this_cpu_ptr(slwt->pcpu_counters);
+ u64_stats_update_begin(&pcounters->syncp);
+
+ if (likely(!err)) {
+ u64_stats_inc(&pcounters->packets);
+ u64_stats_add(&pcounters->bytes, len);
+ } else {
+ u64_stats_inc(&pcounters->errors);
+ }
+
+ u64_stats_update_end(&pcounters->syncp);
+}
+
+static int seg6_local_input_core(struct net *net, struct sock *sk,
+ struct sk_buff *skb)
+{
+ struct dst_entry *orig_dst = skb_dst(skb);
+ struct seg6_action_desc *desc;
+ struct seg6_local_lwt *slwt;
+ unsigned int len = skb->len;
+ int rc;
+
+ slwt = seg6_local_lwtunnel(orig_dst->lwtstate);
+ desc = slwt->desc;
+
+ rc = desc->input(skb, slwt);
+
+ if (!seg6_lwtunnel_counters_enabled(slwt))
+ return rc;
+
+ seg6_local_update_counters(slwt, len, rc);
+
+ return rc;
+}
+
+static int seg6_local_input(struct sk_buff *skb)
+{
+ if (skb->protocol != htons(ETH_P_IPV6)) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
+ if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled))
+ return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_IN,
+ dev_net(skb->dev), NULL, skb, skb->dev, NULL,
+ seg6_local_input_core);
+
+ return seg6_local_input_core(dev_net(skb->dev), NULL, skb);
+}
+
+static const struct nla_policy seg6_local_policy[SEG6_LOCAL_MAX + 1] = {
+ [SEG6_LOCAL_ACTION] = { .type = NLA_U32 },
+ [SEG6_LOCAL_SRH] = { .type = NLA_BINARY },
+ [SEG6_LOCAL_TABLE] = { .type = NLA_U32 },
+ [SEG6_LOCAL_VRFTABLE] = { .type = NLA_U32 },
+ [SEG6_LOCAL_NH4] = NLA_POLICY_EXACT_LEN(sizeof(struct in_addr)),
+ [SEG6_LOCAL_NH6] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)),
+ [SEG6_LOCAL_IIF] = { .type = NLA_U32 },
+ [SEG6_LOCAL_OIF] = { .type = NLA_U32 },
+ [SEG6_LOCAL_BPF] = { .type = NLA_NESTED },
+ [SEG6_LOCAL_COUNTERS] = { .type = NLA_NESTED },
+ [SEG6_LOCAL_FLAVORS] = { .type = NLA_NESTED },
+};
+
+static int parse_nla_srh(struct nlattr **attrs, struct seg6_local_lwt *slwt,
+ struct netlink_ext_ack *extack)
+{
+ struct ipv6_sr_hdr *srh;
+ int len;
+
+ srh = nla_data(attrs[SEG6_LOCAL_SRH]);
+ len = nla_len(attrs[SEG6_LOCAL_SRH]);
+
+ /* SRH must contain at least one segment */
+ if (len < sizeof(*srh) + sizeof(struct in6_addr))
+ return -EINVAL;
+
+ if (!seg6_validate_srh(srh, len, false))
+ return -EINVAL;
+
+ slwt->srh = kmemdup(srh, len, GFP_KERNEL);
+ if (!slwt->srh)
+ return -ENOMEM;
+
+ slwt->headroom += len;
+
+ return 0;
+}
+
+static int put_nla_srh(struct sk_buff *skb, struct seg6_local_lwt *slwt)
+{
+ struct ipv6_sr_hdr *srh;
+ struct nlattr *nla;
+ int len;
+
+ srh = slwt->srh;
+ len = (srh->hdrlen + 1) << 3;
+
+ nla = nla_reserve(skb, SEG6_LOCAL_SRH, len);
+ if (!nla)
+ return -EMSGSIZE;
+
+ memcpy(nla_data(nla), srh, len);
+
+ return 0;
+}
+
+static int cmp_nla_srh(struct seg6_local_lwt *a, struct seg6_local_lwt *b)
+{
+ int len = (a->srh->hdrlen + 1) << 3;
+
+ if (len != ((b->srh->hdrlen + 1) << 3))
+ return 1;
+
+ return memcmp(a->srh, b->srh, len);
+}
+
+static void destroy_attr_srh(struct seg6_local_lwt *slwt)
+{
+ kfree(slwt->srh);
+}
+
+static int parse_nla_table(struct nlattr **attrs, struct seg6_local_lwt *slwt,
+ struct netlink_ext_ack *extack)
+{
+ slwt->table = nla_get_u32(attrs[SEG6_LOCAL_TABLE]);
+
+ return 0;
+}
+
+static int put_nla_table(struct sk_buff *skb, struct seg6_local_lwt *slwt)
+{
+ if (nla_put_u32(skb, SEG6_LOCAL_TABLE, slwt->table))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static int cmp_nla_table(struct seg6_local_lwt *a, struct seg6_local_lwt *b)
+{
+ if (a->table != b->table)
+ return 1;
+
+ return 0;
+}
+
+static struct
+seg6_end_dt_info *seg6_possible_end_dt_info(struct seg6_local_lwt *slwt)
+{
+#ifdef CONFIG_NET_L3_MASTER_DEV
+ return &slwt->dt_info;
+#else
+ return ERR_PTR(-EOPNOTSUPP);
+#endif
+}
+
+static int parse_nla_vrftable(struct nlattr **attrs,
+ struct seg6_local_lwt *slwt,
+ struct netlink_ext_ack *extack)
+{
+ struct seg6_end_dt_info *info = seg6_possible_end_dt_info(slwt);
+
+ if (IS_ERR(info))
+ return PTR_ERR(info);
+
+ info->vrf_table = nla_get_u32(attrs[SEG6_LOCAL_VRFTABLE]);
+
+ return 0;
+}
+
+static int put_nla_vrftable(struct sk_buff *skb, struct seg6_local_lwt *slwt)
+{
+ struct seg6_end_dt_info *info = seg6_possible_end_dt_info(slwt);
+
+ if (IS_ERR(info))
+ return PTR_ERR(info);
+
+ if (nla_put_u32(skb, SEG6_LOCAL_VRFTABLE, info->vrf_table))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static int cmp_nla_vrftable(struct seg6_local_lwt *a, struct seg6_local_lwt *b)
+{
+ struct seg6_end_dt_info *info_a = seg6_possible_end_dt_info(a);
+ struct seg6_end_dt_info *info_b = seg6_possible_end_dt_info(b);
+
+ if (info_a->vrf_table != info_b->vrf_table)
+ return 1;
+
+ return 0;
+}
+
+static int parse_nla_nh4(struct nlattr **attrs, struct seg6_local_lwt *slwt,
+ struct netlink_ext_ack *extack)
+{
+ memcpy(&slwt->nh4, nla_data(attrs[SEG6_LOCAL_NH4]),
+ sizeof(struct in_addr));
+
+ return 0;
+}
+
+static int put_nla_nh4(struct sk_buff *skb, struct seg6_local_lwt *slwt)
+{
+ struct nlattr *nla;
+
+ nla = nla_reserve(skb, SEG6_LOCAL_NH4, sizeof(struct in_addr));
+ if (!nla)
+ return -EMSGSIZE;
+
+ memcpy(nla_data(nla), &slwt->nh4, sizeof(struct in_addr));
+
+ return 0;
+}
+
+static int cmp_nla_nh4(struct seg6_local_lwt *a, struct seg6_local_lwt *b)
+{
+ return memcmp(&a->nh4, &b->nh4, sizeof(struct in_addr));
+}
+
+static int parse_nla_nh6(struct nlattr **attrs, struct seg6_local_lwt *slwt,
+ struct netlink_ext_ack *extack)
+{
+ memcpy(&slwt->nh6, nla_data(attrs[SEG6_LOCAL_NH6]),
+ sizeof(struct in6_addr));
+
+ return 0;
+}
+
+static int put_nla_nh6(struct sk_buff *skb, struct seg6_local_lwt *slwt)
+{
+ struct nlattr *nla;
+
+ nla = nla_reserve(skb, SEG6_LOCAL_NH6, sizeof(struct in6_addr));
+ if (!nla)
+ return -EMSGSIZE;
+
+ memcpy(nla_data(nla), &slwt->nh6, sizeof(struct in6_addr));
+
+ return 0;
+}
+
+static int cmp_nla_nh6(struct seg6_local_lwt *a, struct seg6_local_lwt *b)
+{
+ return memcmp(&a->nh6, &b->nh6, sizeof(struct in6_addr));
+}
+
+static int parse_nla_iif(struct nlattr **attrs, struct seg6_local_lwt *slwt,
+ struct netlink_ext_ack *extack)
+{
+ slwt->iif = nla_get_u32(attrs[SEG6_LOCAL_IIF]);
+
+ return 0;
+}
+
+static int put_nla_iif(struct sk_buff *skb, struct seg6_local_lwt *slwt)
+{
+ if (nla_put_u32(skb, SEG6_LOCAL_IIF, slwt->iif))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static int cmp_nla_iif(struct seg6_local_lwt *a, struct seg6_local_lwt *b)
+{
+ if (a->iif != b->iif)
+ return 1;
+
+ return 0;
+}
+
+static int parse_nla_oif(struct nlattr **attrs, struct seg6_local_lwt *slwt,
+ struct netlink_ext_ack *extack)
+{
+ slwt->oif = nla_get_u32(attrs[SEG6_LOCAL_OIF]);
+
+ return 0;
+}
+
+static int put_nla_oif(struct sk_buff *skb, struct seg6_local_lwt *slwt)
+{
+ if (nla_put_u32(skb, SEG6_LOCAL_OIF, slwt->oif))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static int cmp_nla_oif(struct seg6_local_lwt *a, struct seg6_local_lwt *b)
+{
+ if (a->oif != b->oif)
+ return 1;
+
+ return 0;
+}
+
+#define MAX_PROG_NAME 256
+static const struct nla_policy bpf_prog_policy[SEG6_LOCAL_BPF_PROG_MAX + 1] = {
+ [SEG6_LOCAL_BPF_PROG] = { .type = NLA_U32, },
+ [SEG6_LOCAL_BPF_PROG_NAME] = { .type = NLA_NUL_STRING,
+ .len = MAX_PROG_NAME },
+};
+
+static int parse_nla_bpf(struct nlattr **attrs, struct seg6_local_lwt *slwt,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[SEG6_LOCAL_BPF_PROG_MAX + 1];
+ struct bpf_prog *p;
+ int ret;
+ u32 fd;
+
+ ret = nla_parse_nested_deprecated(tb, SEG6_LOCAL_BPF_PROG_MAX,
+ attrs[SEG6_LOCAL_BPF],
+ bpf_prog_policy, NULL);
+ if (ret < 0)
+ return ret;
+
+ if (!tb[SEG6_LOCAL_BPF_PROG] || !tb[SEG6_LOCAL_BPF_PROG_NAME])
+ return -EINVAL;
+
+ slwt->bpf.name = nla_memdup(tb[SEG6_LOCAL_BPF_PROG_NAME], GFP_KERNEL);
+ if (!slwt->bpf.name)
+ return -ENOMEM;
+
+ fd = nla_get_u32(tb[SEG6_LOCAL_BPF_PROG]);
+ p = bpf_prog_get_type(fd, BPF_PROG_TYPE_LWT_SEG6LOCAL);
+ if (IS_ERR(p)) {
+ kfree(slwt->bpf.name);
+ return PTR_ERR(p);
+ }
+
+ slwt->bpf.prog = p;
+ return 0;
+}
+
+static int put_nla_bpf(struct sk_buff *skb, struct seg6_local_lwt *slwt)
+{
+ struct nlattr *nest;
+
+ if (!slwt->bpf.prog)
+ return 0;
+
+ nest = nla_nest_start_noflag(skb, SEG6_LOCAL_BPF);
+ if (!nest)
+ return -EMSGSIZE;
+
+ if (nla_put_u32(skb, SEG6_LOCAL_BPF_PROG, slwt->bpf.prog->aux->id))
+ return -EMSGSIZE;
+
+ if (slwt->bpf.name &&
+ nla_put_string(skb, SEG6_LOCAL_BPF_PROG_NAME, slwt->bpf.name))
+ return -EMSGSIZE;
+
+ return nla_nest_end(skb, nest);
+}
+
+static int cmp_nla_bpf(struct seg6_local_lwt *a, struct seg6_local_lwt *b)
+{
+ if (!a->bpf.name && !b->bpf.name)
+ return 0;
+
+ if (!a->bpf.name || !b->bpf.name)
+ return 1;
+
+ return strcmp(a->bpf.name, b->bpf.name);
+}
+
+static void destroy_attr_bpf(struct seg6_local_lwt *slwt)
+{
+ kfree(slwt->bpf.name);
+ if (slwt->bpf.prog)
+ bpf_prog_put(slwt->bpf.prog);
+}
+
+static const struct
+nla_policy seg6_local_counters_policy[SEG6_LOCAL_CNT_MAX + 1] = {
+ [SEG6_LOCAL_CNT_PACKETS] = { .type = NLA_U64 },
+ [SEG6_LOCAL_CNT_BYTES] = { .type = NLA_U64 },
+ [SEG6_LOCAL_CNT_ERRORS] = { .type = NLA_U64 },
+};
+
+static int parse_nla_counters(struct nlattr **attrs,
+ struct seg6_local_lwt *slwt,
+ struct netlink_ext_ack *extack)
+{
+ struct pcpu_seg6_local_counters __percpu *pcounters;
+ struct nlattr *tb[SEG6_LOCAL_CNT_MAX + 1];
+ int ret;
+
+ ret = nla_parse_nested_deprecated(tb, SEG6_LOCAL_CNT_MAX,
+ attrs[SEG6_LOCAL_COUNTERS],
+ seg6_local_counters_policy, NULL);
+ if (ret < 0)
+ return ret;
+
+ /* basic support for SRv6 Behavior counters requires at least:
+ * packets, bytes and errors.
+ */
+ if (!tb[SEG6_LOCAL_CNT_PACKETS] || !tb[SEG6_LOCAL_CNT_BYTES] ||
+ !tb[SEG6_LOCAL_CNT_ERRORS])
+ return -EINVAL;
+
+ /* counters are always zero initialized */
+ pcounters = seg6_local_alloc_pcpu_counters(GFP_KERNEL);
+ if (!pcounters)
+ return -ENOMEM;
+
+ slwt->pcpu_counters = pcounters;
+
+ return 0;
+}
+
+static int seg6_local_fill_nla_counters(struct sk_buff *skb,
+ struct seg6_local_counters *counters)
+{
+ if (nla_put_u64_64bit(skb, SEG6_LOCAL_CNT_PACKETS, counters->packets,
+ SEG6_LOCAL_CNT_PAD))
+ return -EMSGSIZE;
+
+ if (nla_put_u64_64bit(skb, SEG6_LOCAL_CNT_BYTES, counters->bytes,
+ SEG6_LOCAL_CNT_PAD))
+ return -EMSGSIZE;
+
+ if (nla_put_u64_64bit(skb, SEG6_LOCAL_CNT_ERRORS, counters->errors,
+ SEG6_LOCAL_CNT_PAD))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static int put_nla_counters(struct sk_buff *skb, struct seg6_local_lwt *slwt)
+{
+ struct seg6_local_counters counters = { 0, 0, 0 };
+ struct nlattr *nest;
+ int rc, i;
+
+ nest = nla_nest_start(skb, SEG6_LOCAL_COUNTERS);
+ if (!nest)
+ return -EMSGSIZE;
+
+ for_each_possible_cpu(i) {
+ struct pcpu_seg6_local_counters *pcounters;
+ u64 packets, bytes, errors;
+ unsigned int start;
+
+ pcounters = per_cpu_ptr(slwt->pcpu_counters, i);
+ do {
+ start = u64_stats_fetch_begin(&pcounters->syncp);
+
+ packets = u64_stats_read(&pcounters->packets);
+ bytes = u64_stats_read(&pcounters->bytes);
+ errors = u64_stats_read(&pcounters->errors);
+
+ } while (u64_stats_fetch_retry(&pcounters->syncp, start));
+
+ counters.packets += packets;
+ counters.bytes += bytes;
+ counters.errors += errors;
+ }
+
+ rc = seg6_local_fill_nla_counters(skb, &counters);
+ if (rc < 0) {
+ nla_nest_cancel(skb, nest);
+ return rc;
+ }
+
+ return nla_nest_end(skb, nest);
+}
+
+static int cmp_nla_counters(struct seg6_local_lwt *a, struct seg6_local_lwt *b)
+{
+ /* a and b are equal if both have pcpu_counters set or not */
+ return (!!((unsigned long)a->pcpu_counters)) ^
+ (!!((unsigned long)b->pcpu_counters));
+}
+
+static void destroy_attr_counters(struct seg6_local_lwt *slwt)
+{
+ free_percpu(slwt->pcpu_counters);
+}
+
+static const
+struct nla_policy seg6_local_flavors_policy[SEG6_LOCAL_FLV_MAX + 1] = {
+ [SEG6_LOCAL_FLV_OPERATION] = { .type = NLA_U32 },
+ [SEG6_LOCAL_FLV_LCBLOCK_BITS] = { .type = NLA_U8 },
+ [SEG6_LOCAL_FLV_LCNODE_FN_BITS] = { .type = NLA_U8 },
+};
+
+/* check whether the lengths of the Locator-Block and Locator-Node Function
+ * are compatible with the dimension of a C-SID container.
+ */
+static int seg6_chk_next_csid_cfg(__u8 block_len, __u8 func_len)
+{
+ /* Locator-Block and Locator-Node Function cannot exceed 128 bits
+ * (i.e. C-SID container length).
+ */
+ if (next_csid_chk_cntr_bits(block_len, func_len))
+ return -EINVAL;
+
+ /* Locator-Block length must be greater than zero and evenly divisible
+ * by 8. There must be room for a Locator-Node Function, at least.
+ */
+ if (next_csid_chk_lcblock_bits(block_len))
+ return -EINVAL;
+
+ /* Locator-Node Function length must be greater than zero and evenly
+ * divisible by 8. There must be room for the Locator-Block.
+ */
+ if (next_csid_chk_lcnode_fn_bits(func_len))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int seg6_parse_nla_next_csid_cfg(struct nlattr **tb,
+ struct seg6_flavors_info *finfo,
+ struct netlink_ext_ack *extack)
+{
+ __u8 func_len = SEG6_LOCAL_LCNODE_FN_DBITS;
+ __u8 block_len = SEG6_LOCAL_LCBLOCK_DBITS;
+ int rc;
+
+ if (tb[SEG6_LOCAL_FLV_LCBLOCK_BITS])
+ block_len = nla_get_u8(tb[SEG6_LOCAL_FLV_LCBLOCK_BITS]);
+
+ if (tb[SEG6_LOCAL_FLV_LCNODE_FN_BITS])
+ func_len = nla_get_u8(tb[SEG6_LOCAL_FLV_LCNODE_FN_BITS]);
+
+ rc = seg6_chk_next_csid_cfg(block_len, func_len);
+ if (rc < 0) {
+ NL_SET_ERR_MSG(extack,
+ "Invalid Locator Block/Node Function lengths");
+ return rc;
+ }
+
+ finfo->lcblock_bits = block_len;
+ finfo->lcnode_func_bits = func_len;
+
+ return 0;
+}
+
+static int parse_nla_flavors(struct nlattr **attrs, struct seg6_local_lwt *slwt,
+ struct netlink_ext_ack *extack)
+{
+ struct seg6_flavors_info *finfo = &slwt->flv_info;
+ struct nlattr *tb[SEG6_LOCAL_FLV_MAX + 1];
+ int action = slwt->action;
+ __u32 fops, supp_fops;
+ int rc;
+
+ rc = nla_parse_nested_deprecated(tb, SEG6_LOCAL_FLV_MAX,
+ attrs[SEG6_LOCAL_FLAVORS],
+ seg6_local_flavors_policy, NULL);
+ if (rc < 0)
+ return rc;
+
+ /* this attribute MUST always be present since it represents the Flavor
+ * operation(s) to be carried out.
+ */
+ if (!tb[SEG6_LOCAL_FLV_OPERATION])
+ return -EINVAL;
+
+ fops = nla_get_u32(tb[SEG6_LOCAL_FLV_OPERATION]);
+ rc = seg6_flv_supp_ops_by_action(action, &supp_fops);
+ if (rc < 0 || (fops & ~supp_fops)) {
+ NL_SET_ERR_MSG(extack, "Unsupported Flavor operation(s)");
+ return -EOPNOTSUPP;
+ }
+
+ finfo->flv_ops = fops;
+
+ if (seg6_next_csid_enabled(fops)) {
+ /* Locator-Block and Locator-Node Function lengths can be
+ * provided by the user space. Otherwise, default values are
+ * applied.
+ */
+ rc = seg6_parse_nla_next_csid_cfg(tb, finfo, extack);
+ if (rc < 0)
+ return rc;
+ }
+
+ return 0;
+}
+
+static int seg6_fill_nla_next_csid_cfg(struct sk_buff *skb,
+ struct seg6_flavors_info *finfo)
+{
+ if (nla_put_u8(skb, SEG6_LOCAL_FLV_LCBLOCK_BITS, finfo->lcblock_bits))
+ return -EMSGSIZE;
+
+ if (nla_put_u8(skb, SEG6_LOCAL_FLV_LCNODE_FN_BITS,
+ finfo->lcnode_func_bits))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static int put_nla_flavors(struct sk_buff *skb, struct seg6_local_lwt *slwt)
+{
+ struct seg6_flavors_info *finfo = &slwt->flv_info;
+ __u32 fops = finfo->flv_ops;
+ struct nlattr *nest;
+ int rc;
+
+ nest = nla_nest_start(skb, SEG6_LOCAL_FLAVORS);
+ if (!nest)
+ return -EMSGSIZE;
+
+ if (nla_put_u32(skb, SEG6_LOCAL_FLV_OPERATION, fops)) {
+ rc = -EMSGSIZE;
+ goto err;
+ }
+
+ if (seg6_next_csid_enabled(fops)) {
+ rc = seg6_fill_nla_next_csid_cfg(skb, finfo);
+ if (rc < 0)
+ goto err;
+ }
+
+ return nla_nest_end(skb, nest);
+
+err:
+ nla_nest_cancel(skb, nest);
+ return rc;
+}
+
+static int seg6_cmp_nla_next_csid_cfg(struct seg6_flavors_info *finfo_a,
+ struct seg6_flavors_info *finfo_b)
+{
+ if (finfo_a->lcblock_bits != finfo_b->lcblock_bits)
+ return 1;
+
+ if (finfo_a->lcnode_func_bits != finfo_b->lcnode_func_bits)
+ return 1;
+
+ return 0;
+}
+
+static int cmp_nla_flavors(struct seg6_local_lwt *a, struct seg6_local_lwt *b)
+{
+ struct seg6_flavors_info *finfo_a = &a->flv_info;
+ struct seg6_flavors_info *finfo_b = &b->flv_info;
+
+ if (finfo_a->flv_ops != finfo_b->flv_ops)
+ return 1;
+
+ if (seg6_next_csid_enabled(finfo_a->flv_ops)) {
+ if (seg6_cmp_nla_next_csid_cfg(finfo_a, finfo_b))
+ return 1;
+ }
+
+ return 0;
+}
+
+static int encap_size_flavors(struct seg6_local_lwt *slwt)
+{
+ struct seg6_flavors_info *finfo = &slwt->flv_info;
+ int nlsize;
+
+ nlsize = nla_total_size(0) + /* nest SEG6_LOCAL_FLAVORS */
+ nla_total_size(4); /* SEG6_LOCAL_FLV_OPERATION */
+
+ if (seg6_next_csid_enabled(finfo->flv_ops))
+ nlsize += nla_total_size(1) + /* SEG6_LOCAL_FLV_LCBLOCK_BITS */
+ nla_total_size(1); /* SEG6_LOCAL_FLV_LCNODE_FN_BITS */
+
+ return nlsize;
+}
+
+struct seg6_action_param {
+ int (*parse)(struct nlattr **attrs, struct seg6_local_lwt *slwt,
+ struct netlink_ext_ack *extack);
+ int (*put)(struct sk_buff *skb, struct seg6_local_lwt *slwt);
+ int (*cmp)(struct seg6_local_lwt *a, struct seg6_local_lwt *b);
+
+ /* optional destroy() callback useful for releasing resources which
+ * have been previously acquired in the corresponding parse()
+ * function.
+ */
+ void (*destroy)(struct seg6_local_lwt *slwt);
+};
+
+static struct seg6_action_param seg6_action_params[SEG6_LOCAL_MAX + 1] = {
+ [SEG6_LOCAL_SRH] = { .parse = parse_nla_srh,
+ .put = put_nla_srh,
+ .cmp = cmp_nla_srh,
+ .destroy = destroy_attr_srh },
+
+ [SEG6_LOCAL_TABLE] = { .parse = parse_nla_table,
+ .put = put_nla_table,
+ .cmp = cmp_nla_table },
+
+ [SEG6_LOCAL_NH4] = { .parse = parse_nla_nh4,
+ .put = put_nla_nh4,
+ .cmp = cmp_nla_nh4 },
+
+ [SEG6_LOCAL_NH6] = { .parse = parse_nla_nh6,
+ .put = put_nla_nh6,
+ .cmp = cmp_nla_nh6 },
+
+ [SEG6_LOCAL_IIF] = { .parse = parse_nla_iif,
+ .put = put_nla_iif,
+ .cmp = cmp_nla_iif },
+
+ [SEG6_LOCAL_OIF] = { .parse = parse_nla_oif,
+ .put = put_nla_oif,
+ .cmp = cmp_nla_oif },
+
+ [SEG6_LOCAL_BPF] = { .parse = parse_nla_bpf,
+ .put = put_nla_bpf,
+ .cmp = cmp_nla_bpf,
+ .destroy = destroy_attr_bpf },
+
+ [SEG6_LOCAL_VRFTABLE] = { .parse = parse_nla_vrftable,
+ .put = put_nla_vrftable,
+ .cmp = cmp_nla_vrftable },
+
+ [SEG6_LOCAL_COUNTERS] = { .parse = parse_nla_counters,
+ .put = put_nla_counters,
+ .cmp = cmp_nla_counters,
+ .destroy = destroy_attr_counters },
+
+ [SEG6_LOCAL_FLAVORS] = { .parse = parse_nla_flavors,
+ .put = put_nla_flavors,
+ .cmp = cmp_nla_flavors },
+};
+
+/* call the destroy() callback (if available) for each set attribute in
+ * @parsed_attrs, starting from the first attribute up to the @max_parsed
+ * (excluded) attribute.
+ */
+static void __destroy_attrs(unsigned long parsed_attrs, int max_parsed,
+ struct seg6_local_lwt *slwt)
+{
+ struct seg6_action_param *param;
+ int i;
+
+ /* Every required seg6local attribute is identified by an ID which is
+ * encoded as a flag (i.e: 1 << ID) in the 'attrs' bitmask;
+ *
+ * We scan the 'parsed_attrs' bitmask, starting from the first attribute
+ * up to the @max_parsed (excluded) attribute.
+ * For each set attribute, we retrieve the corresponding destroy()
+ * callback. If the callback is not available, then we skip to the next
+ * attribute; otherwise, we call the destroy() callback.
+ */
+ for (i = SEG6_LOCAL_SRH; i < max_parsed; ++i) {
+ if (!(parsed_attrs & SEG6_F_ATTR(i)))
+ continue;
+
+ param = &seg6_action_params[i];
+
+ if (param->destroy)
+ param->destroy(slwt);
+ }
+}
+
+/* release all the resources that may have been acquired during parsing
+ * operations.
+ */
+static void destroy_attrs(struct seg6_local_lwt *slwt)
+{
+ unsigned long attrs = slwt->desc->attrs | slwt->parsed_optattrs;
+
+ __destroy_attrs(attrs, SEG6_LOCAL_MAX + 1, slwt);
+}
+
+static int parse_nla_optional_attrs(struct nlattr **attrs,
+ struct seg6_local_lwt *slwt,
+ struct netlink_ext_ack *extack)
+{
+ struct seg6_action_desc *desc = slwt->desc;
+ unsigned long parsed_optattrs = 0;
+ struct seg6_action_param *param;
+ int err, i;
+
+ for (i = SEG6_LOCAL_SRH; i < SEG6_LOCAL_MAX + 1; ++i) {
+ if (!(desc->optattrs & SEG6_F_ATTR(i)) || !attrs[i])
+ continue;
+
+ /* once here, the i-th attribute is provided by the
+ * userspace AND it is identified optional as well.
+ */
+ param = &seg6_action_params[i];
+
+ err = param->parse(attrs, slwt, extack);
+ if (err < 0)
+ goto parse_optattrs_err;
+
+ /* current attribute has been correctly parsed */
+ parsed_optattrs |= SEG6_F_ATTR(i);
+ }
+
+ /* store in the tunnel state all the optional attributed successfully
+ * parsed.
+ */
+ slwt->parsed_optattrs = parsed_optattrs;
+
+ return 0;
+
+parse_optattrs_err:
+ __destroy_attrs(parsed_optattrs, i, slwt);
+
+ return err;
+}
+
+/* call the custom constructor of the behavior during its initialization phase
+ * and after that all its attributes have been parsed successfully.
+ */
+static int
+seg6_local_lwtunnel_build_state(struct seg6_local_lwt *slwt, const void *cfg,
+ struct netlink_ext_ack *extack)
+{
+ struct seg6_action_desc *desc = slwt->desc;
+ struct seg6_local_lwtunnel_ops *ops;
+
+ ops = &desc->slwt_ops;
+ if (!ops->build_state)
+ return 0;
+
+ return ops->build_state(slwt, cfg, extack);
+}
+
+/* call the custom destructor of the behavior which is invoked before the
+ * tunnel is going to be destroyed.
+ */
+static void seg6_local_lwtunnel_destroy_state(struct seg6_local_lwt *slwt)
+{
+ struct seg6_action_desc *desc = slwt->desc;
+ struct seg6_local_lwtunnel_ops *ops;
+
+ ops = &desc->slwt_ops;
+ if (!ops->destroy_state)
+ return;
+
+ ops->destroy_state(slwt);
+}
+
+static int parse_nla_action(struct nlattr **attrs, struct seg6_local_lwt *slwt,
+ struct netlink_ext_ack *extack)
+{
+ struct seg6_action_param *param;
+ struct seg6_action_desc *desc;
+ unsigned long invalid_attrs;
+ int i, err;
+
+ desc = __get_action_desc(slwt->action);
+ if (!desc)
+ return -EINVAL;
+
+ if (!desc->input)
+ return -EOPNOTSUPP;
+
+ slwt->desc = desc;
+ slwt->headroom += desc->static_headroom;
+
+ /* Forcing the desc->optattrs *set* and the desc->attrs *set* to be
+ * disjoined, this allow us to release acquired resources by optional
+ * attributes and by required attributes independently from each other
+ * without any interference.
+ * In other terms, we are sure that we do not release some the acquired
+ * resources twice.
+ *
+ * Note that if an attribute is configured both as required and as
+ * optional, it means that the user has messed something up in the
+ * seg6_action_table. Therefore, this check is required for SRv6
+ * behaviors to work properly.
+ */
+ invalid_attrs = desc->attrs & desc->optattrs;
+ if (invalid_attrs) {
+ WARN_ONCE(1,
+ "An attribute cannot be both required AND optional");
+ return -EINVAL;
+ }
+
+ /* parse the required attributes */
+ for (i = SEG6_LOCAL_SRH; i < SEG6_LOCAL_MAX + 1; i++) {
+ if (desc->attrs & SEG6_F_ATTR(i)) {
+ if (!attrs[i])
+ return -EINVAL;
+
+ param = &seg6_action_params[i];
+
+ err = param->parse(attrs, slwt, extack);
+ if (err < 0)
+ goto parse_attrs_err;
+ }
+ }
+
+ /* parse the optional attributes, if any */
+ err = parse_nla_optional_attrs(attrs, slwt, extack);
+ if (err < 0)
+ goto parse_attrs_err;
+
+ return 0;
+
+parse_attrs_err:
+ /* release any resource that may have been acquired during the i-1
+ * parse() operations.
+ */
+ __destroy_attrs(desc->attrs, i, slwt);
+
+ return err;
+}
+
+static int seg6_local_build_state(struct net *net, struct nlattr *nla,
+ unsigned int family, const void *cfg,
+ struct lwtunnel_state **ts,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[SEG6_LOCAL_MAX + 1];
+ struct lwtunnel_state *newts;
+ struct seg6_local_lwt *slwt;
+ int err;
+
+ if (family != AF_INET6)
+ return -EINVAL;
+
+ err = nla_parse_nested_deprecated(tb, SEG6_LOCAL_MAX, nla,
+ seg6_local_policy, extack);
+
+ if (err < 0)
+ return err;
+
+ if (!tb[SEG6_LOCAL_ACTION])
+ return -EINVAL;
+
+ newts = lwtunnel_state_alloc(sizeof(*slwt));
+ if (!newts)
+ return -ENOMEM;
+
+ slwt = seg6_local_lwtunnel(newts);
+ slwt->action = nla_get_u32(tb[SEG6_LOCAL_ACTION]);
+
+ err = parse_nla_action(tb, slwt, extack);
+ if (err < 0)
+ goto out_free;
+
+ err = seg6_local_lwtunnel_build_state(slwt, cfg, extack);
+ if (err < 0)
+ goto out_destroy_attrs;
+
+ newts->type = LWTUNNEL_ENCAP_SEG6_LOCAL;
+ newts->flags = LWTUNNEL_STATE_INPUT_REDIRECT;
+ newts->headroom = slwt->headroom;
+
+ *ts = newts;
+
+ return 0;
+
+out_destroy_attrs:
+ destroy_attrs(slwt);
+out_free:
+ kfree(newts);
+ return err;
+}
+
+static void seg6_local_destroy_state(struct lwtunnel_state *lwt)
+{
+ struct seg6_local_lwt *slwt = seg6_local_lwtunnel(lwt);
+
+ seg6_local_lwtunnel_destroy_state(slwt);
+
+ destroy_attrs(slwt);
+
+ return;
+}
+
+static int seg6_local_fill_encap(struct sk_buff *skb,
+ struct lwtunnel_state *lwt)
+{
+ struct seg6_local_lwt *slwt = seg6_local_lwtunnel(lwt);
+ struct seg6_action_param *param;
+ unsigned long attrs;
+ int i, err;
+
+ if (nla_put_u32(skb, SEG6_LOCAL_ACTION, slwt->action))
+ return -EMSGSIZE;
+
+ attrs = slwt->desc->attrs | slwt->parsed_optattrs;
+
+ for (i = SEG6_LOCAL_SRH; i < SEG6_LOCAL_MAX + 1; i++) {
+ if (attrs & SEG6_F_ATTR(i)) {
+ param = &seg6_action_params[i];
+ err = param->put(skb, slwt);
+ if (err < 0)
+ return err;
+ }
+ }
+
+ return 0;
+}
+
+static int seg6_local_get_encap_size(struct lwtunnel_state *lwt)
+{
+ struct seg6_local_lwt *slwt = seg6_local_lwtunnel(lwt);
+ unsigned long attrs;
+ int nlsize;
+
+ nlsize = nla_total_size(4); /* action */
+
+ attrs = slwt->desc->attrs | slwt->parsed_optattrs;
+
+ if (attrs & SEG6_F_ATTR(SEG6_LOCAL_SRH))
+ nlsize += nla_total_size((slwt->srh->hdrlen + 1) << 3);
+
+ if (attrs & SEG6_F_ATTR(SEG6_LOCAL_TABLE))
+ nlsize += nla_total_size(4);
+
+ if (attrs & SEG6_F_ATTR(SEG6_LOCAL_NH4))
+ nlsize += nla_total_size(4);
+
+ if (attrs & SEG6_F_ATTR(SEG6_LOCAL_NH6))
+ nlsize += nla_total_size(16);
+
+ if (attrs & SEG6_F_ATTR(SEG6_LOCAL_IIF))
+ nlsize += nla_total_size(4);
+
+ if (attrs & SEG6_F_ATTR(SEG6_LOCAL_OIF))
+ nlsize += nla_total_size(4);
+
+ if (attrs & SEG6_F_ATTR(SEG6_LOCAL_BPF))
+ nlsize += nla_total_size(sizeof(struct nlattr)) +
+ nla_total_size(MAX_PROG_NAME) +
+ nla_total_size(4);
+
+ if (attrs & SEG6_F_ATTR(SEG6_LOCAL_VRFTABLE))
+ nlsize += nla_total_size(4);
+
+ if (attrs & SEG6_F_LOCAL_COUNTERS)
+ nlsize += nla_total_size(0) + /* nest SEG6_LOCAL_COUNTERS */
+ /* SEG6_LOCAL_CNT_PACKETS */
+ nla_total_size_64bit(sizeof(__u64)) +
+ /* SEG6_LOCAL_CNT_BYTES */
+ nla_total_size_64bit(sizeof(__u64)) +
+ /* SEG6_LOCAL_CNT_ERRORS */
+ nla_total_size_64bit(sizeof(__u64));
+
+ if (attrs & SEG6_F_ATTR(SEG6_LOCAL_FLAVORS))
+ nlsize += encap_size_flavors(slwt);
+
+ return nlsize;
+}
+
+static int seg6_local_cmp_encap(struct lwtunnel_state *a,
+ struct lwtunnel_state *b)
+{
+ struct seg6_local_lwt *slwt_a, *slwt_b;
+ struct seg6_action_param *param;
+ unsigned long attrs_a, attrs_b;
+ int i;
+
+ slwt_a = seg6_local_lwtunnel(a);
+ slwt_b = seg6_local_lwtunnel(b);
+
+ if (slwt_a->action != slwt_b->action)
+ return 1;
+
+ attrs_a = slwt_a->desc->attrs | slwt_a->parsed_optattrs;
+ attrs_b = slwt_b->desc->attrs | slwt_b->parsed_optattrs;
+
+ if (attrs_a != attrs_b)
+ return 1;
+
+ for (i = SEG6_LOCAL_SRH; i < SEG6_LOCAL_MAX + 1; i++) {
+ if (attrs_a & SEG6_F_ATTR(i)) {
+ param = &seg6_action_params[i];
+ if (param->cmp(slwt_a, slwt_b))
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+static const struct lwtunnel_encap_ops seg6_local_ops = {
+ .build_state = seg6_local_build_state,
+ .destroy_state = seg6_local_destroy_state,
+ .input = seg6_local_input,
+ .fill_encap = seg6_local_fill_encap,
+ .get_encap_size = seg6_local_get_encap_size,
+ .cmp_encap = seg6_local_cmp_encap,
+ .owner = THIS_MODULE,
+};
+
+int __init seg6_local_init(void)
+{
+ /* If the max total number of defined attributes is reached, then your
+ * kernel build stops here.
+ *
+ * This check is required to avoid arithmetic overflows when processing
+ * behavior attributes and the maximum number of defined attributes
+ * exceeds the allowed value.
+ */
+ BUILD_BUG_ON(SEG6_LOCAL_MAX + 1 > BITS_PER_TYPE(unsigned long));
+
+ /* Check whether the number of defined flavors exceeds the maximum
+ * allowed value.
+ */
+ BUILD_BUG_ON(SEG6_LOCAL_FLV_OP_MAX + 1 > BITS_PER_TYPE(__u32));
+
+ /* If the default NEXT-C-SID Locator-Block/Node Function lengths (in
+ * bits) have been changed with invalid values, kernel build stops
+ * here.
+ */
+ BUILD_BUG_ON(next_csid_chk_cntr_bits(SEG6_LOCAL_LCBLOCK_DBITS,
+ SEG6_LOCAL_LCNODE_FN_DBITS));
+ BUILD_BUG_ON(next_csid_chk_lcblock_bits(SEG6_LOCAL_LCBLOCK_DBITS));
+ BUILD_BUG_ON(next_csid_chk_lcnode_fn_bits(SEG6_LOCAL_LCNODE_FN_DBITS));
+
+ /* To be memory efficient, we use 'u8' to represent the different
+ * actions related to RFC8986 flavors. If the kernel build stops here,
+ * it means that it is not possible to correctly encode these actions
+ * with the data type chosen for the action table.
+ */
+ BUILD_BUG_ON(SEG6_LOCAL_FLV_ACT_MAX > (typeof(flv8986_act_tbl[0]))~0U);
+
+ return lwtunnel_encap_add_ops(&seg6_local_ops,
+ LWTUNNEL_ENCAP_SEG6_LOCAL);
+}
+
+void seg6_local_exit(void)
+{
+ lwtunnel_encap_del_ops(&seg6_local_ops, LWTUNNEL_ENCAP_SEG6_LOCAL);
+}
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 77b7b0911438..cf37ad9686e6 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -1,36 +1,33 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* IPv6 over IPv4 tunnel device - Simple Internet Transition (SIT)
* Linux INET6 implementation
*
* Authors:
- * Pedro Roque <roque@di.fc.ul.pt>
+ * Pedro Roque <roque@di.fc.ul.pt>
* Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
*
- * $Id: sit.c,v 1.53 2001/09/25 05:09:53 davem Exp $
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Changes:
* Roger Venning <r.venning@telstra.com>: 6to4 support
* Nate Thompson <nate@thebog.net>: 6to4 support
+ * Fred Templin <fred.l.templin@boeing.com>: isatap support
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/module.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/sockios.h>
-#include <linux/sched.h>
#include <linux/net.h>
#include <linux/in6.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/icmp.h>
-#include <asm/uaccess.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
#include <linux/init.h>
#include <linux/netfilter_ipv4.h>
#include <linux/if_ether.h>
@@ -48,10 +45,14 @@
#include <net/ip.h>
#include <net/udp.h>
#include <net/icmp.h>
-#include <net/ipip.h>
+#include <net/ip_tunnels.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/dsfield.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/netdev_lock.h>
+#include <net/inet_dscp.h>
/*
This version of net/ipv6/sit.c is cloned of net/ipv4/ip_gre.c
@@ -59,52 +60,86 @@
For comments look at net/ipv4/ip_gre.c --ANK
*/
-#define HASH_SIZE 16
+#define IP6_SIT_HASH_SIZE 16
#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
-static int ipip6_fb_tunnel_init(struct net_device *dev);
+static bool log_ecn_error = true;
+module_param(log_ecn_error, bool, 0644);
+MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
+
static int ipip6_tunnel_init(struct net_device *dev);
static void ipip6_tunnel_setup(struct net_device *dev);
+static void ipip6_dev_free(struct net_device *dev);
+static bool check_6rd(struct ip_tunnel *tunnel, const struct in6_addr *v6dst,
+ __be32 *v4dst);
+static struct rtnl_link_ops sit_link_ops __read_mostly;
+
+static unsigned int sit_net_id __read_mostly;
+struct sit_net {
+ struct ip_tunnel __rcu *tunnels_r_l[IP6_SIT_HASH_SIZE];
+ struct ip_tunnel __rcu *tunnels_r[IP6_SIT_HASH_SIZE];
+ struct ip_tunnel __rcu *tunnels_l[IP6_SIT_HASH_SIZE];
+ struct ip_tunnel __rcu *tunnels_wc[1];
+ struct ip_tunnel __rcu **tunnels[4];
+
+ struct net_device *fb_tunnel_dev;
+};
-static struct net_device *ipip6_fb_tunnel_dev;
-
-static struct ip_tunnel *tunnels_r_l[HASH_SIZE];
-static struct ip_tunnel *tunnels_r[HASH_SIZE];
-static struct ip_tunnel *tunnels_l[HASH_SIZE];
-static struct ip_tunnel *tunnels_wc[1];
-static struct ip_tunnel **tunnels[4] = { tunnels_wc, tunnels_l, tunnels_r, tunnels_r_l };
+static inline struct sit_net *dev_to_sit_net(struct net_device *dev)
+{
+ struct ip_tunnel *t = netdev_priv(dev);
-static DEFINE_RWLOCK(ipip6_lock);
+ return net_generic(t->net, sit_net_id);
+}
-static struct ip_tunnel * ipip6_tunnel_lookup(__be32 remote, __be32 local)
+/*
+ * Must be invoked with rcu_read_lock
+ */
+static struct ip_tunnel *ipip6_tunnel_lookup(struct net *net,
+ struct net_device *dev,
+ __be32 remote, __be32 local,
+ int sifindex)
{
- unsigned h0 = HASH(remote);
- unsigned h1 = HASH(local);
+ unsigned int h0 = HASH(remote);
+ unsigned int h1 = HASH(local);
struct ip_tunnel *t;
+ struct sit_net *sitn = net_generic(net, sit_net_id);
+ int ifindex = dev ? dev->ifindex : 0;
- for (t = tunnels_r_l[h0^h1]; t; t = t->next) {
+ for_each_ip_tunnel_rcu(t, sitn->tunnels_r_l[h0 ^ h1]) {
if (local == t->parms.iph.saddr &&
- remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
+ remote == t->parms.iph.daddr &&
+ (!dev || !t->parms.link || ifindex == t->parms.link ||
+ sifindex == t->parms.link) &&
+ (t->dev->flags & IFF_UP))
return t;
}
- for (t = tunnels_r[h0]; t; t = t->next) {
- if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
+ for_each_ip_tunnel_rcu(t, sitn->tunnels_r[h0]) {
+ if (remote == t->parms.iph.daddr &&
+ (!dev || !t->parms.link || ifindex == t->parms.link ||
+ sifindex == t->parms.link) &&
+ (t->dev->flags & IFF_UP))
return t;
}
- for (t = tunnels_l[h1]; t; t = t->next) {
- if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP))
+ for_each_ip_tunnel_rcu(t, sitn->tunnels_l[h1]) {
+ if (local == t->parms.iph.saddr &&
+ (!dev || !t->parms.link || ifindex == t->parms.link ||
+ sifindex == t->parms.link) &&
+ (t->dev->flags & IFF_UP))
return t;
}
- if ((t = tunnels_wc[0]) != NULL && (t->dev->flags&IFF_UP))
+ t = rcu_dereference(sitn->tunnels_wc[0]);
+ if (t && (t->dev->flags & IFF_UP))
return t;
return NULL;
}
-static struct ip_tunnel ** ipip6_bucket(struct ip_tunnel *t)
+static struct ip_tunnel __rcu **
+__ipip6_bucket(struct sit_net *sitn, struct ip_tunnel_parm_kern *parms)
{
- __be32 remote = t->parms.iph.daddr;
- __be32 local = t->parms.iph.saddr;
- unsigned h = 0;
+ __be32 remote = parms->iph.daddr;
+ __be32 local = parms->iph.saddr;
+ unsigned int h = 0;
int prio = 0;
if (remote) {
@@ -115,134 +150,401 @@ static struct ip_tunnel ** ipip6_bucket(struct ip_tunnel *t)
prio |= 1;
h ^= HASH(local);
}
- return &tunnels[prio][h];
+ return &sitn->tunnels[prio][h];
}
-static void ipip6_tunnel_unlink(struct ip_tunnel *t)
+static inline struct ip_tunnel __rcu **ipip6_bucket(struct sit_net *sitn,
+ struct ip_tunnel *t)
{
- struct ip_tunnel **tp;
+ return __ipip6_bucket(sitn, &t->parms);
+}
- for (tp = ipip6_bucket(t); *tp; tp = &(*tp)->next) {
- if (t == *tp) {
- write_lock_bh(&ipip6_lock);
- *tp = t->next;
- write_unlock_bh(&ipip6_lock);
+static void ipip6_tunnel_unlink(struct sit_net *sitn, struct ip_tunnel *t)
+{
+ struct ip_tunnel __rcu **tp;
+ struct ip_tunnel *iter;
+
+ for (tp = ipip6_bucket(sitn, t);
+ (iter = rtnl_dereference(*tp)) != NULL;
+ tp = &iter->next) {
+ if (t == iter) {
+ rcu_assign_pointer(*tp, t->next);
break;
}
}
}
-static void ipip6_tunnel_link(struct ip_tunnel *t)
+static void ipip6_tunnel_link(struct sit_net *sitn, struct ip_tunnel *t)
{
- struct ip_tunnel **tp = ipip6_bucket(t);
+ struct ip_tunnel __rcu **tp = ipip6_bucket(sitn, t);
- t->next = *tp;
- write_lock_bh(&ipip6_lock);
- *tp = t;
- write_unlock_bh(&ipip6_lock);
+ rcu_assign_pointer(t->next, rtnl_dereference(*tp));
+ rcu_assign_pointer(*tp, t);
}
-static struct ip_tunnel * ipip6_tunnel_locate(struct ip_tunnel_parm *parms, int create)
+static void ipip6_tunnel_clone_6rd(struct net_device *dev, struct sit_net *sitn)
+{
+#ifdef CONFIG_IPV6_SIT_6RD
+ struct ip_tunnel *t = netdev_priv(dev);
+
+ if (dev == sitn->fb_tunnel_dev || !sitn->fb_tunnel_dev) {
+ ipv6_addr_set(&t->ip6rd.prefix, htonl(0x20020000), 0, 0, 0);
+ t->ip6rd.relay_prefix = 0;
+ t->ip6rd.prefixlen = 16;
+ t->ip6rd.relay_prefixlen = 0;
+ } else {
+ struct ip_tunnel *t0 = netdev_priv(sitn->fb_tunnel_dev);
+ memcpy(&t->ip6rd, &t0->ip6rd, sizeof(t->ip6rd));
+ }
+#endif
+}
+
+static int ipip6_tunnel_create(struct net_device *dev)
+{
+ struct ip_tunnel *t = netdev_priv(dev);
+ struct sit_net *sitn = net_generic(t->net, sit_net_id);
+ int err;
+
+ __dev_addr_set(dev, &t->parms.iph.saddr, 4);
+ memcpy(dev->broadcast, &t->parms.iph.daddr, 4);
+
+ if (test_bit(IP_TUNNEL_SIT_ISATAP_BIT, t->parms.i_flags))
+ dev->priv_flags |= IFF_ISATAP;
+
+ dev->rtnl_link_ops = &sit_link_ops;
+
+ err = register_netdevice(dev);
+ if (err < 0)
+ goto out;
+
+ ipip6_tunnel_clone_6rd(dev, sitn);
+
+ ipip6_tunnel_link(sitn, t);
+ return 0;
+
+out:
+ return err;
+}
+
+static struct ip_tunnel *ipip6_tunnel_locate(struct net *net,
+ struct ip_tunnel_parm_kern *parms,
+ int create)
{
__be32 remote = parms->iph.daddr;
__be32 local = parms->iph.saddr;
- struct ip_tunnel *t, **tp, *nt;
+ struct ip_tunnel *t, *nt;
+ struct ip_tunnel __rcu **tp;
struct net_device *dev;
- unsigned h = 0;
- int prio = 0;
char name[IFNAMSIZ];
+ struct sit_net *sitn = net_generic(net, sit_net_id);
- if (remote) {
- prio |= 2;
- h ^= HASH(remote);
- }
- if (local) {
- prio |= 1;
- h ^= HASH(local);
- }
- for (tp = &tunnels[prio][h]; (t = *tp) != NULL; tp = &t->next) {
- if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr)
- return t;
+ for (tp = __ipip6_bucket(sitn, parms);
+ (t = rtnl_dereference(*tp)) != NULL;
+ tp = &t->next) {
+ if (local == t->parms.iph.saddr &&
+ remote == t->parms.iph.daddr &&
+ parms->link == t->parms.link) {
+ if (create)
+ return NULL;
+ else
+ return t;
+ }
}
if (!create)
goto failed;
- if (parms->name[0])
- strlcpy(name, parms->name, IFNAMSIZ);
- else {
- int i;
- for (i=1; i<100; i++) {
- sprintf(name, "sit%d", i);
- if (__dev_get_by_name(name) == NULL)
- break;
- }
- if (i==100)
+ if (parms->name[0]) {
+ if (!dev_valid_name(parms->name))
goto failed;
+ strscpy(name, parms->name, IFNAMSIZ);
+ } else {
+ strcpy(name, "sit%d");
}
-
- dev = alloc_netdev(sizeof(*t), name, ipip6_tunnel_setup);
- if (dev == NULL)
+ dev = alloc_netdev(sizeof(*t), name, NET_NAME_UNKNOWN,
+ ipip6_tunnel_setup);
+ if (!dev)
return NULL;
+ dev_net_set(dev, net);
+
nt = netdev_priv(dev);
- dev->init = ipip6_tunnel_init;
- nt->parms = *parms;
- if (register_netdevice(dev) < 0) {
- free_netdev(dev);
- goto failed;
- }
+ nt->net = net;
+ nt->parms = *parms;
+ if (ipip6_tunnel_create(dev) < 0)
+ goto failed_free;
- dev_hold(dev);
+ if (!parms->name[0])
+ strcpy(parms->name, dev->name);
- ipip6_tunnel_link(nt);
return nt;
+failed_free:
+ free_netdev(dev);
failed:
return NULL;
}
-static void ipip6_tunnel_uninit(struct net_device *dev)
+#define for_each_prl_rcu(start) \
+ for (prl = rcu_dereference(start); \
+ prl; \
+ prl = rcu_dereference(prl->next))
+
+static struct ip_tunnel_prl_entry *
+__ipip6_tunnel_locate_prl(struct ip_tunnel *t, __be32 addr)
+{
+ struct ip_tunnel_prl_entry *prl;
+
+ for_each_prl_rcu(t->prl)
+ if (prl->addr == addr)
+ break;
+ return prl;
+
+}
+
+static int ipip6_tunnel_get_prl(struct net_device *dev, struct ip_tunnel_prl __user *a)
+{
+ struct ip_tunnel *t = netdev_priv(dev);
+ struct ip_tunnel_prl kprl, *kp;
+ struct ip_tunnel_prl_entry *prl;
+ unsigned int cmax, c = 0, ca, len;
+ int ret = 0;
+
+ if (dev == dev_to_sit_net(dev)->fb_tunnel_dev)
+ return -EINVAL;
+
+ if (copy_from_user(&kprl, a, sizeof(kprl)))
+ return -EFAULT;
+ cmax = kprl.datalen / sizeof(kprl);
+ if (cmax > 1 && kprl.addr != htonl(INADDR_ANY))
+ cmax = 1;
+
+ /* For simple GET or for root users,
+ * we try harder to allocate.
+ */
+ kp = (cmax <= 1 || capable(CAP_NET_ADMIN)) ?
+ kcalloc(cmax, sizeof(*kp), GFP_KERNEL_ACCOUNT | __GFP_NOWARN) :
+ NULL;
+
+ ca = min(t->prl_count, cmax);
+
+ if (!kp) {
+ /* We don't try hard to allocate much memory for
+ * non-root users.
+ * For root users, retry allocating enough memory for
+ * the answer.
+ */
+ kp = kcalloc(ca, sizeof(*kp), GFP_ATOMIC | __GFP_ACCOUNT |
+ __GFP_NOWARN);
+ if (!kp) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
+ rcu_read_lock();
+ for_each_prl_rcu(t->prl) {
+ if (c >= cmax)
+ break;
+ if (kprl.addr != htonl(INADDR_ANY) && prl->addr != kprl.addr)
+ continue;
+ kp[c].addr = prl->addr;
+ kp[c].flags = prl->flags;
+ c++;
+ if (kprl.addr != htonl(INADDR_ANY))
+ break;
+ }
+
+ rcu_read_unlock();
+
+ len = sizeof(*kp) * c;
+ ret = 0;
+ if ((len && copy_to_user(a + 1, kp, len)) || put_user(len, &a->datalen))
+ ret = -EFAULT;
+
+ kfree(kp);
+out:
+ return ret;
+}
+
+static int
+ipip6_tunnel_add_prl(struct ip_tunnel *t, struct ip_tunnel_prl *a, int chg)
{
- if (dev == ipip6_fb_tunnel_dev) {
- write_lock_bh(&ipip6_lock);
- tunnels_wc[0] = NULL;
- write_unlock_bh(&ipip6_lock);
- dev_put(dev);
+ struct ip_tunnel_prl_entry *p;
+ int err = 0;
+
+ if (a->addr == htonl(INADDR_ANY))
+ return -EINVAL;
+
+ ASSERT_RTNL();
+
+ for (p = rtnl_dereference(t->prl); p; p = rtnl_dereference(p->next)) {
+ if (p->addr == a->addr) {
+ if (chg) {
+ p->flags = a->flags;
+ goto out;
+ }
+ err = -EEXIST;
+ goto out;
+ }
+ }
+
+ if (chg) {
+ err = -ENXIO;
+ goto out;
+ }
+
+ p = kzalloc(sizeof(struct ip_tunnel_prl_entry), GFP_KERNEL);
+ if (!p) {
+ err = -ENOBUFS;
+ goto out;
+ }
+
+ p->next = t->prl;
+ p->addr = a->addr;
+ p->flags = a->flags;
+ t->prl_count++;
+ rcu_assign_pointer(t->prl, p);
+out:
+ return err;
+}
+
+static void prl_list_destroy_rcu(struct rcu_head *head)
+{
+ struct ip_tunnel_prl_entry *p, *n;
+
+ p = container_of(head, struct ip_tunnel_prl_entry, rcu_head);
+ do {
+ n = rcu_dereference_protected(p->next, 1);
+ kfree(p);
+ p = n;
+ } while (p);
+}
+
+static int
+ipip6_tunnel_del_prl(struct ip_tunnel *t, struct ip_tunnel_prl *a)
+{
+ struct ip_tunnel_prl_entry *x;
+ struct ip_tunnel_prl_entry __rcu **p;
+ int err = 0;
+
+ ASSERT_RTNL();
+
+ if (a && a->addr != htonl(INADDR_ANY)) {
+ for (p = &t->prl;
+ (x = rtnl_dereference(*p)) != NULL;
+ p = &x->next) {
+ if (x->addr == a->addr) {
+ *p = x->next;
+ kfree_rcu(x, rcu_head);
+ t->prl_count--;
+ goto out;
+ }
+ }
+ err = -ENXIO;
} else {
- ipip6_tunnel_unlink(netdev_priv(dev));
- dev_put(dev);
+ x = rtnl_dereference(t->prl);
+ if (x) {
+ t->prl_count = 0;
+ call_rcu(&x->rcu_head, prl_list_destroy_rcu);
+ t->prl = NULL;
+ }
}
+out:
+ return err;
}
+static int ipip6_tunnel_prl_ctl(struct net_device *dev,
+ struct ip_tunnel_prl __user *data, int cmd)
+{
+ struct ip_tunnel *t = netdev_priv(dev);
+ struct ip_tunnel_prl prl;
+ int err;
+
+ if (!ns_capable(t->net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+ if (dev == dev_to_sit_net(dev)->fb_tunnel_dev)
+ return -EINVAL;
+
+ if (copy_from_user(&prl, data, sizeof(prl)))
+ return -EFAULT;
-static void ipip6_err(struct sk_buff *skb, u32 info)
+ switch (cmd) {
+ case SIOCDELPRL:
+ err = ipip6_tunnel_del_prl(t, &prl);
+ break;
+ case SIOCADDPRL:
+ case SIOCCHGPRL:
+ err = ipip6_tunnel_add_prl(t, &prl, cmd == SIOCCHGPRL);
+ break;
+ }
+ dst_cache_reset(&t->dst_cache);
+ netdev_state_change(dev);
+ return err;
+}
+
+static int
+isatap_chksrc(struct sk_buff *skb, const struct iphdr *iph, struct ip_tunnel *t)
{
-#ifndef I_WISH_WORLD_WERE_PERFECT
+ struct ip_tunnel_prl_entry *p;
+ int ok = 1;
+
+ rcu_read_lock();
+ p = __ipip6_tunnel_locate_prl(t, iph->saddr);
+ if (p) {
+ if (p->flags & PRL_DEFAULT)
+ skb->ndisc_nodetype = NDISC_NODETYPE_DEFAULT;
+ else
+ skb->ndisc_nodetype = NDISC_NODETYPE_NODEFAULT;
+ } else {
+ const struct in6_addr *addr6 = &ipv6_hdr(skb)->saddr;
+
+ if (ipv6_addr_is_isatap(addr6) &&
+ (addr6->s6_addr32[3] == iph->saddr) &&
+ ipv6_chk_prefix(addr6, t->dev))
+ skb->ndisc_nodetype = NDISC_NODETYPE_HOST;
+ else
+ ok = 0;
+ }
+ rcu_read_unlock();
+ return ok;
+}
-/* It is not :-( All the routers (except for Linux) return only
- 8 bytes of packet payload. It means, that precise relaying of
- ICMP in the real Internet is absolutely infeasible.
- */
- struct iphdr *iph = (struct iphdr*)skb->data;
- int type = skb->h.icmph->type;
- int code = skb->h.icmph->code;
+static void ipip6_tunnel_uninit(struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ struct sit_net *sitn = net_generic(tunnel->net, sit_net_id);
+
+ if (dev == sitn->fb_tunnel_dev) {
+ RCU_INIT_POINTER(sitn->tunnels_wc[0], NULL);
+ } else {
+ ipip6_tunnel_unlink(sitn, tunnel);
+ ipip6_tunnel_del_prl(tunnel, NULL);
+ }
+ dst_cache_reset(&tunnel->dst_cache);
+ netdev_put(dev, &tunnel->dev_tracker);
+}
+
+static int ipip6_err(struct sk_buff *skb, u32 info)
+{
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
+ const int type = icmp_hdr(skb)->type;
+ const int code = icmp_hdr(skb)->code;
+ unsigned int data_len = 0;
struct ip_tunnel *t;
+ int sifindex;
+ int err;
switch (type) {
default:
case ICMP_PARAMETERPROB:
- return;
+ return 0;
case ICMP_DEST_UNREACH:
switch (code) {
case ICMP_SR_FAILED:
- case ICMP_PORT_UNREACH:
/* Impossible event. */
- return;
- case ICMP_FRAG_NEEDED:
- /* Soft state for pmtu is maintained by IP core. */
- return;
+ return 0;
default:
/* All others are translated to HOST_UNREACH.
rfc2003 contains "deep thoughts" about NET_UNREACH,
@@ -253,275 +555,439 @@ static void ipip6_err(struct sk_buff *skb, u32 info)
break;
case ICMP_TIME_EXCEEDED:
if (code != ICMP_EXC_TTL)
- return;
+ return 0;
+ data_len = icmp_hdr(skb)->un.reserved[1] * 4; /* RFC 4884 4.1 */
break;
+ case ICMP_REDIRECT:
+ break;
+ }
+
+ err = -ENOENT;
+
+ sifindex = netif_is_l3_master(skb->dev) ? IPCB(skb)->iif : 0;
+ t = ipip6_tunnel_lookup(dev_net(skb->dev), skb->dev,
+ iph->daddr, iph->saddr, sifindex);
+ if (!t)
+ goto out;
+
+ if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
+ ipv4_update_pmtu(skb, dev_net(skb->dev), info,
+ t->parms.link, iph->protocol);
+ err = 0;
+ goto out;
+ }
+ if (type == ICMP_REDIRECT) {
+ ipv4_redirect(skb, dev_net(skb->dev), t->parms.link,
+ iph->protocol);
+ err = 0;
+ goto out;
}
- read_lock(&ipip6_lock);
- t = ipip6_tunnel_lookup(iph->daddr, iph->saddr);
- if (t == NULL || t->parms.iph.daddr == 0)
+ err = 0;
+ if (__in6_dev_get(skb->dev) &&
+ !ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4, type, data_len))
+ goto out;
+
+ if (t->parms.iph.daddr == 0)
goto out;
+
if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
goto out;
- if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO)
+ if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
t->err_count++;
else
t->err_count = 1;
t->err_time = jiffies;
out:
- read_unlock(&ipip6_lock);
- return;
+ return err;
+}
+
+static inline bool is_spoofed_6rd(struct ip_tunnel *tunnel, const __be32 v4addr,
+ const struct in6_addr *v6addr)
+{
+ __be32 v4embed = 0;
+ if (check_6rd(tunnel, v6addr, &v4embed) && v4addr != v4embed)
+ return true;
+ return false;
+}
+
+/* Checks if an address matches an address on the tunnel interface.
+ * Used to detect the NAT of proto 41 packets and let them pass spoofing test.
+ * Long story:
+ * This function is called after we considered the packet as spoofed
+ * in is_spoofed_6rd.
+ * We may have a router that is doing NAT for proto 41 packets
+ * for an internal station. Destination a.a.a.a/PREFIX:bbbb:bbbb
+ * will be translated to n.n.n.n/PREFIX:bbbb:bbbb. And is_spoofed_6rd
+ * function will return true, dropping the packet.
+ * But, we can still check if is spoofed against the IP
+ * addresses associated with the interface.
+ */
+static bool only_dnatted(const struct ip_tunnel *tunnel,
+ const struct in6_addr *v6dst)
+{
+ int prefix_len;
+
+#ifdef CONFIG_IPV6_SIT_6RD
+ prefix_len = tunnel->ip6rd.prefixlen + 32
+ - tunnel->ip6rd.relay_prefixlen;
#else
- struct iphdr *iph = (struct iphdr*)dp;
- int hlen = iph->ihl<<2;
- struct ipv6hdr *iph6;
- int type = skb->h.icmph->type;
- int code = skb->h.icmph->code;
- int rel_type = 0;
- int rel_code = 0;
- int rel_info = 0;
- struct sk_buff *skb2;
- struct rt6_info *rt6i;
-
- if (len < hlen + sizeof(struct ipv6hdr))
- return;
- iph6 = (struct ipv6hdr*)(dp + hlen);
+ prefix_len = 48;
+#endif
+ return ipv6_chk_custom_prefix(v6dst, prefix_len, tunnel->dev);
+}
- switch (type) {
- default:
- return;
- case ICMP_PARAMETERPROB:
- if (skb->h.icmph->un.gateway < hlen)
- return;
+/* Returns true if a packet is spoofed */
+static bool packet_is_spoofed(struct sk_buff *skb,
+ const struct iphdr *iph,
+ struct ip_tunnel *tunnel)
+{
+ const struct ipv6hdr *ipv6h;
- /* So... This guy found something strange INSIDE encapsulated
- packet. Well, he is fool, but what can we do ?
- */
- rel_type = ICMPV6_PARAMPROB;
- rel_info = skb->h.icmph->un.gateway - hlen;
- break;
+ if (tunnel->dev->priv_flags & IFF_ISATAP) {
+ if (!isatap_chksrc(skb, iph, tunnel))
+ return true;
- case ICMP_DEST_UNREACH:
- switch (code) {
- case ICMP_SR_FAILED:
- case ICMP_PORT_UNREACH:
- /* Impossible event. */
- return;
- case ICMP_FRAG_NEEDED:
- /* Too complicated case ... */
- return;
- default:
- /* All others are translated to HOST_UNREACH.
- rfc2003 contains "deep thoughts" about NET_UNREACH,
- I believe, it is just ether pollution. --ANK
- */
- rel_type = ICMPV6_DEST_UNREACH;
- rel_code = ICMPV6_ADDR_UNREACH;
- break;
- }
- break;
- case ICMP_TIME_EXCEEDED:
- if (code != ICMP_EXC_TTL)
- return;
- rel_type = ICMPV6_TIME_EXCEED;
- rel_code = ICMPV6_EXC_HOPLIMIT;
- break;
+ return false;
}
- /* Prepare fake skb to feed it to icmpv6_send */
- skb2 = skb_clone(skb, GFP_ATOMIC);
- if (skb2 == NULL)
- return;
- dst_release(skb2->dst);
- skb2->dst = NULL;
- skb_pull(skb2, skb->data - (u8*)iph6);
- skb2->nh.raw = skb2->data;
-
- /* Try to guess incoming interface */
- rt6i = rt6_lookup(&iph6->saddr, NULL, NULL, 0);
- if (rt6i && rt6i->rt6i_dev) {
- skb2->dev = rt6i->rt6i_dev;
-
- rt6i = rt6_lookup(&iph6->daddr, &iph6->saddr, NULL, 0);
-
- if (rt6i && rt6i->rt6i_dev && rt6i->rt6i_dev->type == ARPHRD_SIT) {
- struct ip_tunnel *t = netdev_priv(rt6i->rt6i_dev);
- if (rel_type == ICMPV6_TIME_EXCEED && t->parms.iph.ttl) {
- rel_type = ICMPV6_DEST_UNREACH;
- rel_code = ICMPV6_ADDR_UNREACH;
- }
- icmpv6_send(skb2, rel_type, rel_code, rel_info, skb2->dev);
- }
+ if (tunnel->dev->flags & IFF_POINTOPOINT)
+ return false;
+
+ ipv6h = ipv6_hdr(skb);
+
+ if (unlikely(is_spoofed_6rd(tunnel, iph->saddr, &ipv6h->saddr))) {
+ net_warn_ratelimited("Src spoofed %pI4/%pI6c -> %pI4/%pI6c\n",
+ &iph->saddr, &ipv6h->saddr,
+ &iph->daddr, &ipv6h->daddr);
+ return true;
}
- kfree_skb(skb2);
- return;
-#endif
-}
-static inline void ipip6_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
-{
- if (INET_ECN_is_ce(iph->tos))
- IP6_ECN_set_ce(skb->nh.ipv6h);
+ if (likely(!is_spoofed_6rd(tunnel, iph->daddr, &ipv6h->daddr)))
+ return false;
+
+ if (only_dnatted(tunnel, &ipv6h->daddr))
+ return false;
+
+ net_warn_ratelimited("Dst spoofed %pI4/%pI6c -> %pI4/%pI6c\n",
+ &iph->saddr, &ipv6h->saddr,
+ &iph->daddr, &ipv6h->daddr);
+ return true;
}
static int ipip6_rcv(struct sk_buff *skb)
{
- struct iphdr *iph;
+ const struct iphdr *iph = ip_hdr(skb);
struct ip_tunnel *tunnel;
+ int sifindex;
+ int err;
- if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
- goto out;
-
- iph = skb->nh.iph;
+ sifindex = netif_is_l3_master(skb->dev) ? IPCB(skb)->iif : 0;
+ tunnel = ipip6_tunnel_lookup(dev_net(skb->dev), skb->dev,
+ iph->saddr, iph->daddr, sifindex);
+ if (tunnel) {
+ if (tunnel->parms.iph.protocol != IPPROTO_IPV6 &&
+ tunnel->parms.iph.protocol != 0)
+ goto out;
- read_lock(&ipip6_lock);
- if ((tunnel = ipip6_tunnel_lookup(iph->saddr, iph->daddr)) != NULL) {
- secpath_reset(skb);
- skb->mac.raw = skb->nh.raw;
- skb->nh.raw = skb->data;
+ skb->mac_header = skb->network_header;
+ skb_reset_network_header(skb);
IPCB(skb)->flags = 0;
- skb->protocol = htons(ETH_P_IPV6);
- skb->pkt_type = PACKET_HOST;
- tunnel->stat.rx_packets++;
- tunnel->stat.rx_bytes += skb->len;
skb->dev = tunnel->dev;
- dst_release(skb->dst);
- skb->dst = NULL;
- nf_reset(skb);
- ipip6_ecn_decapsulate(iph, skb);
+
+ if (packet_is_spoofed(skb, iph, tunnel)) {
+ DEV_STATS_INC(tunnel->dev, rx_errors);
+ goto out;
+ }
+
+ if (iptunnel_pull_header(skb, 0, htons(ETH_P_IPV6),
+ !net_eq(tunnel->net, dev_net(tunnel->dev))))
+ goto out;
+
+ /* skb can be uncloned in iptunnel_pull_header, so
+ * old iph is no longer valid
+ */
+ iph = (const struct iphdr *)skb_mac_header(skb);
+ skb_reset_mac_header(skb);
+
+ err = IP_ECN_decapsulate(iph, skb);
+ if (unlikely(err)) {
+ if (log_ecn_error)
+ net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
+ &iph->saddr, iph->tos);
+ if (err > 1) {
+ DEV_STATS_INC(tunnel->dev, rx_frame_errors);
+ DEV_STATS_INC(tunnel->dev, rx_errors);
+ goto out;
+ }
+ }
+
+ dev_sw_netstats_rx_add(tunnel->dev, skb->len);
+
netif_rx(skb);
- read_unlock(&ipip6_lock);
+
return 0;
}
- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
- kfree_skb(skb);
- read_unlock(&ipip6_lock);
+ /* no tunnel matched, let upstream know, ipsec may handle it */
+ return 1;
out:
+ kfree_skb(skb);
return 0;
}
-/* Returns the embedded IPv4 address if the IPv6 address
- comes from 6to4 (RFC 3056) addr space */
+static const struct tnl_ptk_info ipip_tpi = {
+ /* no tunnel info required for ipip. */
+ .proto = htons(ETH_P_IP),
+};
+
+#if IS_ENABLED(CONFIG_MPLS)
+static const struct tnl_ptk_info mplsip_tpi = {
+ /* no tunnel info required for mplsip. */
+ .proto = htons(ETH_P_MPLS_UC),
+};
+#endif
-static inline __be32 try_6to4(struct in6_addr *v6dst)
+static int sit_tunnel_rcv(struct sk_buff *skb, u8 ipproto)
{
- __be32 dst = 0;
+ const struct iphdr *iph;
+ struct ip_tunnel *tunnel;
+ int sifindex;
+
+ sifindex = netif_is_l3_master(skb->dev) ? IPCB(skb)->iif : 0;
+
+ iph = ip_hdr(skb);
+ tunnel = ipip6_tunnel_lookup(dev_net(skb->dev), skb->dev,
+ iph->saddr, iph->daddr, sifindex);
+ if (tunnel) {
+ const struct tnl_ptk_info *tpi;
+
+ if (tunnel->parms.iph.protocol != ipproto &&
+ tunnel->parms.iph.protocol != 0)
+ goto drop;
+
+ if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
+ goto drop;
+#if IS_ENABLED(CONFIG_MPLS)
+ if (ipproto == IPPROTO_MPLS)
+ tpi = &mplsip_tpi;
+ else
+#endif
+ tpi = &ipip_tpi;
+ if (iptunnel_pull_header(skb, 0, tpi->proto, false))
+ goto drop;
+ skb_reset_mac_header(skb);
+
+ return ip_tunnel_rcv(tunnel, skb, tpi, NULL, log_ecn_error);
+ }
+
+ return 1;
+
+drop:
+ kfree_skb(skb);
+ return 0;
+}
+
+static int ipip_rcv(struct sk_buff *skb)
+{
+ return sit_tunnel_rcv(skb, IPPROTO_IPIP);
+}
+
+#if IS_ENABLED(CONFIG_MPLS)
+static int mplsip_rcv(struct sk_buff *skb)
+{
+ return sit_tunnel_rcv(skb, IPPROTO_MPLS);
+}
+#endif
+/*
+ * If the IPv6 address comes from 6rd / 6to4 (RFC 3056) addr space this function
+ * stores the embedded IPv4 address in v4dst and returns true.
+ */
+static bool check_6rd(struct ip_tunnel *tunnel, const struct in6_addr *v6dst,
+ __be32 *v4dst)
+{
+#ifdef CONFIG_IPV6_SIT_6RD
+ if (ipv6_prefix_equal(v6dst, &tunnel->ip6rd.prefix,
+ tunnel->ip6rd.prefixlen)) {
+ unsigned int pbw0, pbi0;
+ int pbi1;
+ u32 d;
+
+ pbw0 = tunnel->ip6rd.prefixlen >> 5;
+ pbi0 = tunnel->ip6rd.prefixlen & 0x1f;
+
+ d = tunnel->ip6rd.relay_prefixlen < 32 ?
+ (ntohl(v6dst->s6_addr32[pbw0]) << pbi0) >>
+ tunnel->ip6rd.relay_prefixlen : 0;
+
+ pbi1 = pbi0 - tunnel->ip6rd.relay_prefixlen;
+ if (pbi1 > 0)
+ d |= ntohl(v6dst->s6_addr32[pbw0 + 1]) >>
+ (32 - pbi1);
+
+ *v4dst = tunnel->ip6rd.relay_prefix | htonl(d);
+ return true;
+ }
+#else
if (v6dst->s6_addr16[0] == htons(0x2002)) {
- /* 6to4 v6 addr has 16 bits prefix, 32 v4addr, 16 SLA, ... */
- memcpy(&dst, &v6dst->s6_addr16[1], 4);
+ /* 6to4 v6 addr has 16 bits prefix, 32 v4addr, 16 SLA, ... */
+ memcpy(v4dst, &v6dst->s6_addr16[1], 4);
+ return true;
}
+#endif
+ return false;
+}
+
+static inline __be32 try_6rd(struct ip_tunnel *tunnel,
+ const struct in6_addr *v6dst)
+{
+ __be32 dst = 0;
+ check_6rd(tunnel, v6dst, &dst);
return dst;
}
+static bool ipip6_tunnel_dst_find(struct sk_buff *skb, __be32 *dst,
+ bool is_isatap)
+{
+ const struct ipv6hdr *iph6 = ipv6_hdr(skb);
+ struct neighbour *neigh = NULL;
+ const struct in6_addr *addr6;
+ bool found = false;
+ int addr_type;
+
+ if (skb_dst(skb))
+ neigh = dst_neigh_lookup(skb_dst(skb), &iph6->daddr);
+
+ if (!neigh) {
+ net_dbg_ratelimited("nexthop == NULL\n");
+ return false;
+ }
+
+ addr6 = (const struct in6_addr *)&neigh->primary_key;
+ addr_type = ipv6_addr_type(addr6);
+
+ if (is_isatap) {
+ if ((addr_type & IPV6_ADDR_UNICAST) &&
+ ipv6_addr_is_isatap(addr6)) {
+ *dst = addr6->s6_addr32[3];
+ found = true;
+ }
+ } else {
+ if (addr_type == IPV6_ADDR_ANY) {
+ addr6 = &ipv6_hdr(skb)->daddr;
+ addr_type = ipv6_addr_type(addr6);
+ }
+
+ if ((addr_type & IPV6_ADDR_COMPATv4) != 0) {
+ *dst = addr6->s6_addr32[3];
+ found = true;
+ }
+ }
+
+ neigh_release(neigh);
+
+ return found;
+}
+
/*
* This function assumes it is being called from dev_queue_xmit()
* and that skb is filled properly by that function.
*/
-static int ipip6_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
+static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
+ struct net_device *dev)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
- struct net_device_stats *stats = &tunnel->stat;
- struct iphdr *tiph = &tunnel->parms.iph;
- struct ipv6hdr *iph6 = skb->nh.ipv6h;
+ const struct iphdr *tiph = &tunnel->parms.iph;
+ const struct ipv6hdr *iph6 = ipv6_hdr(skb);
u8 tos = tunnel->parms.iph.tos;
- struct rtable *rt; /* Route to the other host */
- struct net_device *tdev; /* Device to other host */
- struct iphdr *iph; /* Our new IP header */
- int max_headroom; /* The extra header space needed */
+ __be16 df = tiph->frag_off;
+ struct rtable *rt; /* Route to the other host */
+ struct net_device *tdev; /* Device to other host */
+ unsigned int max_headroom; /* The extra header space needed */
__be32 dst = tiph->daddr;
+ struct flowi4 fl4;
int mtu;
- struct in6_addr *addr6;
- int addr_type;
+ u8 ttl;
+ u8 protocol = IPPROTO_IPV6;
+ int t_hlen = tunnel->hlen + sizeof(struct iphdr);
- if (tunnel->recursion++) {
- tunnel->stat.collisions++;
- goto tx_error;
- }
+ if (tos == 1)
+ tos = ipv6_get_dsfield(iph6);
- if (skb->protocol != htons(ETH_P_IPV6))
+ /* ISATAP (RFC4214) - must come before 6to4 */
+ if ((dev->priv_flags & IFF_ISATAP) &&
+ !ipip6_tunnel_dst_find(skb, &dst, true))
goto tx_error;
if (!dst)
- dst = try_6to4(&iph6->daddr);
-
- if (!dst) {
- struct neighbour *neigh = NULL;
-
- if (skb->dst)
- neigh = skb->dst->neighbour;
-
- if (neigh == NULL) {
- if (net_ratelimit())
- printk(KERN_DEBUG "sit: nexthop == NULL\n");
- goto tx_error;
- }
-
- addr6 = (struct in6_addr*)&neigh->primary_key;
- addr_type = ipv6_addr_type(addr6);
+ dst = try_6rd(tunnel, &iph6->daddr);
- if (addr_type == IPV6_ADDR_ANY) {
- addr6 = &skb->nh.ipv6h->daddr;
- addr_type = ipv6_addr_type(addr6);
- }
+ if (!dst && !ipip6_tunnel_dst_find(skb, &dst, false))
+ goto tx_error;
- if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
- goto tx_error_icmp;
+ flowi4_init_output(&fl4, tunnel->parms.link, tunnel->fwmark,
+ tos & INET_DSCP_MASK, RT_SCOPE_UNIVERSE,
+ IPPROTO_IPV6, 0, dst, tiph->saddr, 0, 0,
+ sock_net_uid(tunnel->net, NULL));
- dst = addr6->s6_addr32[3];
- }
-
- {
- struct flowi fl = { .nl_u = { .ip4_u =
- { .daddr = dst,
- .saddr = tiph->saddr,
- .tos = RT_TOS(tos) } },
- .oif = tunnel->parms.link,
- .proto = IPPROTO_IPV6 };
- if (ip_route_output_key(&rt, &fl)) {
- tunnel->stat.tx_carrier_errors++;
+ rt = dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr);
+ if (!rt) {
+ rt = ip_route_output_flow(tunnel->net, &fl4, NULL);
+ if (IS_ERR(rt)) {
+ DEV_STATS_INC(dev, tx_carrier_errors);
goto tx_error_icmp;
}
+ dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst, fl4.saddr);
}
- if (rt->rt_type != RTN_UNICAST) {
+
+ if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) {
ip_rt_put(rt);
- tunnel->stat.tx_carrier_errors++;
+ DEV_STATS_INC(dev, tx_carrier_errors);
goto tx_error_icmp;
}
- tdev = rt->u.dst.dev;
+ tdev = rt->dst.dev;
if (tdev == dev) {
ip_rt_put(rt);
- tunnel->stat.collisions++;
+ DEV_STATS_INC(dev, collisions);
goto tx_error;
}
- if (tiph->frag_off)
- mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr);
- else
- mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu;
-
- if (mtu < 68) {
- tunnel->stat.collisions++;
+ if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP4)) {
ip_rt_put(rt);
goto tx_error;
}
- if (mtu < IPV6_MIN_MTU)
- mtu = IPV6_MIN_MTU;
- if (tunnel->parms.iph.daddr && skb->dst)
- skb->dst->ops->update_pmtu(skb->dst, mtu);
- if (skb->len > mtu) {
- icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev);
- ip_rt_put(rt);
- goto tx_error;
+ if (df) {
+ mtu = dst_mtu(&rt->dst) - t_hlen;
+
+ if (mtu < IPV4_MIN_MTU) {
+ DEV_STATS_INC(dev, collisions);
+ ip_rt_put(rt);
+ goto tx_error;
+ }
+
+ if (mtu < IPV6_MIN_MTU) {
+ mtu = IPV6_MIN_MTU;
+ df = 0;
+ }
+
+ if (tunnel->parms.iph.daddr)
+ skb_dst_update_pmtu_no_confirm(skb, mtu);
+
+ if (skb->len > mtu && !skb_is_gso(skb)) {
+ icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+ ip_rt_put(rt);
+ goto tx_error;
+ }
}
if (tunnel->err_count > 0) {
- if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) {
+ if (time_before(jiffies,
+ tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
tunnel->err_count--;
dst_link_failure(skb);
} else
@@ -531,327 +997,950 @@ static int ipip6_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
/*
* Okay, now see if we can stuff it in the buffer as-is.
*/
- max_headroom = LL_RESERVED_SPACE(tdev)+sizeof(struct iphdr);
+ max_headroom = LL_RESERVED_SPACE(tdev) + t_hlen;
- if (skb_headroom(skb) < max_headroom || skb_cloned(skb) || skb_shared(skb)) {
+ if (skb_headroom(skb) < max_headroom || skb_shared(skb) ||
+ (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
if (!new_skb) {
ip_rt_put(rt);
- stats->tx_dropped++;
- dev_kfree_skb(skb);
- tunnel->recursion--;
- return 0;
+ DEV_STATS_INC(dev, tx_dropped);
+ kfree_skb(skb);
+ return NETDEV_TX_OK;
}
if (skb->sk)
skb_set_owner_w(new_skb, skb->sk);
dev_kfree_skb(skb);
skb = new_skb;
- iph6 = skb->nh.ipv6h;
+ iph6 = ipv6_hdr(skb);
}
+ ttl = tiph->ttl;
+ if (ttl == 0)
+ ttl = iph6->hop_limit;
+ tos = INET_ECN_encapsulate(tos, ipv6_get_dsfield(iph6));
- skb->h.raw = skb->nh.raw;
- skb->nh.raw = skb_push(skb, sizeof(struct iphdr));
- memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
- IPCB(skb)->flags = 0;
- dst_release(skb->dst);
- skb->dst = &rt->u.dst;
+ if (ip_tunnel_encap(skb, &tunnel->encap, &protocol, &fl4) < 0) {
+ ip_rt_put(rt);
+ goto tx_error;
+ }
- /*
- * Push down and install the IPIP header.
- */
+ skb_set_inner_ipproto(skb, IPPROTO_IPV6);
+
+ iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
+ df, !net_eq(tunnel->net, dev_net(dev)), 0);
+ return NETDEV_TX_OK;
+
+tx_error_icmp:
+ dst_link_failure(skb);
+tx_error:
+ kfree_skb(skb);
+ DEV_STATS_INC(dev, tx_errors);
+ return NETDEV_TX_OK;
+}
+
+static netdev_tx_t sit_tunnel_xmit__(struct sk_buff *skb,
+ struct net_device *dev, u8 ipproto)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ const struct iphdr *tiph = &tunnel->parms.iph;
+
+ if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP4))
+ goto tx_error;
- iph = skb->nh.iph;
- iph->version = 4;
- iph->ihl = sizeof(struct iphdr)>>2;
- if (mtu > IPV6_MIN_MTU)
- iph->frag_off = htons(IP_DF);
+ skb_set_inner_ipproto(skb, ipproto);
+
+ ip_tunnel_xmit(skb, dev, tiph, ipproto);
+ return NETDEV_TX_OK;
+tx_error:
+ kfree_skb(skb);
+ DEV_STATS_INC(dev, tx_errors);
+ return NETDEV_TX_OK;
+}
+
+static netdev_tx_t sit_tunnel_xmit(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ if (!pskb_inet_may_pull(skb))
+ goto tx_err;
+
+ switch (skb->protocol) {
+ case htons(ETH_P_IP):
+ sit_tunnel_xmit__(skb, dev, IPPROTO_IPIP);
+ break;
+ case htons(ETH_P_IPV6):
+ ipip6_tunnel_xmit(skb, dev);
+ break;
+#if IS_ENABLED(CONFIG_MPLS)
+ case htons(ETH_P_MPLS_UC):
+ sit_tunnel_xmit__(skb, dev, IPPROTO_MPLS);
+ break;
+#endif
+ default:
+ goto tx_err;
+ }
+
+ return NETDEV_TX_OK;
+
+tx_err:
+ DEV_STATS_INC(dev, tx_errors);
+ kfree_skb(skb);
+ return NETDEV_TX_OK;
+
+}
+
+static void ipip6_tunnel_bind_dev(struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ int t_hlen = tunnel->hlen + sizeof(struct iphdr);
+ struct net_device *tdev = NULL;
+ int hlen = LL_MAX_HEADER;
+ const struct iphdr *iph;
+ struct flowi4 fl4;
+
+ iph = &tunnel->parms.iph;
+
+ if (iph->daddr) {
+ struct rtable *rt = ip_route_output_ports(tunnel->net, &fl4,
+ NULL,
+ iph->daddr, iph->saddr,
+ 0, 0,
+ IPPROTO_IPV6,
+ iph->tos & INET_DSCP_MASK,
+ tunnel->parms.link);
+
+ if (!IS_ERR(rt)) {
+ tdev = rt->dst.dev;
+ ip_rt_put(rt);
+ }
+ dev->flags |= IFF_POINTOPOINT;
+ }
+
+ if (!tdev && tunnel->parms.link)
+ tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);
+
+ if (tdev && !netif_is_l3_master(tdev)) {
+ int mtu;
+
+ mtu = tdev->mtu - t_hlen;
+ if (mtu < IPV6_MIN_MTU)
+ mtu = IPV6_MIN_MTU;
+ WRITE_ONCE(dev->mtu, mtu);
+ hlen = tdev->hard_header_len + tdev->needed_headroom;
+ }
+ dev->needed_headroom = t_hlen + hlen;
+}
+
+static void ipip6_tunnel_update(struct ip_tunnel *t,
+ struct ip_tunnel_parm_kern *p,
+ __u32 fwmark)
+{
+ struct net *net = t->net;
+ struct sit_net *sitn = net_generic(net, sit_net_id);
+
+ ipip6_tunnel_unlink(sitn, t);
+ synchronize_net();
+ t->parms.iph.saddr = p->iph.saddr;
+ t->parms.iph.daddr = p->iph.daddr;
+ __dev_addr_set(t->dev, &p->iph.saddr, 4);
+ memcpy(t->dev->broadcast, &p->iph.daddr, 4);
+ ipip6_tunnel_link(sitn, t);
+ t->parms.iph.ttl = p->iph.ttl;
+ t->parms.iph.tos = p->iph.tos;
+ t->parms.iph.frag_off = p->iph.frag_off;
+ if (t->parms.link != p->link || t->fwmark != fwmark) {
+ t->parms.link = p->link;
+ t->fwmark = fwmark;
+ ipip6_tunnel_bind_dev(t->dev);
+ }
+ dst_cache_reset(&t->dst_cache);
+ netdev_state_change(t->dev);
+}
+
+#ifdef CONFIG_IPV6_SIT_6RD
+static int ipip6_tunnel_update_6rd(struct ip_tunnel *t,
+ struct ip_tunnel_6rd *ip6rd)
+{
+ struct in6_addr prefix;
+ __be32 relay_prefix;
+
+ if (ip6rd->relay_prefixlen > 32 ||
+ ip6rd->prefixlen + (32 - ip6rd->relay_prefixlen) > 64)
+ return -EINVAL;
+
+ ipv6_addr_prefix(&prefix, &ip6rd->prefix, ip6rd->prefixlen);
+ if (!ipv6_addr_equal(&prefix, &ip6rd->prefix))
+ return -EINVAL;
+ if (ip6rd->relay_prefixlen)
+ relay_prefix = ip6rd->relay_prefix &
+ htonl(0xffffffffUL <<
+ (32 - ip6rd->relay_prefixlen));
else
- iph->frag_off = 0;
+ relay_prefix = 0;
+ if (relay_prefix != ip6rd->relay_prefix)
+ return -EINVAL;
- iph->protocol = IPPROTO_IPV6;
- iph->tos = INET_ECN_encapsulate(tos, ipv6_get_dsfield(iph6));
- iph->daddr = rt->rt_dst;
- iph->saddr = rt->rt_src;
+ t->ip6rd.prefix = prefix;
+ t->ip6rd.relay_prefix = relay_prefix;
+ t->ip6rd.prefixlen = ip6rd->prefixlen;
+ t->ip6rd.relay_prefixlen = ip6rd->relay_prefixlen;
+ dst_cache_reset(&t->dst_cache);
+ netdev_state_change(t->dev);
+ return 0;
+}
- if ((iph->ttl = tiph->ttl) == 0)
- iph->ttl = iph6->hop_limit;
+static int
+ipip6_tunnel_get6rd(struct net_device *dev, struct ip_tunnel_parm __user *data)
+{
+ struct ip_tunnel *t = netdev_priv(dev);
+ struct ip_tunnel_parm_kern p;
+ struct ip_tunnel_6rd ip6rd;
+
+ if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) {
+ if (!ip_tunnel_parm_from_user(&p, data))
+ return -EFAULT;
+ t = ipip6_tunnel_locate(t->net, &p, 0);
+ }
+ if (!t)
+ t = netdev_priv(dev);
+
+ ip6rd.prefix = t->ip6rd.prefix;
+ ip6rd.relay_prefix = t->ip6rd.relay_prefix;
+ ip6rd.prefixlen = t->ip6rd.prefixlen;
+ ip6rd.relay_prefixlen = t->ip6rd.relay_prefixlen;
+ if (copy_to_user(data, &ip6rd, sizeof(ip6rd)))
+ return -EFAULT;
+ return 0;
+}
- nf_reset(skb);
+static int
+ipip6_tunnel_6rdctl(struct net_device *dev, struct ip_tunnel_6rd __user *data,
+ int cmd)
+{
+ struct ip_tunnel *t = netdev_priv(dev);
+ struct ip_tunnel_6rd ip6rd;
+ int err;
- IPTUNNEL_XMIT();
- tunnel->recursion--;
+ if (!ns_capable(t->net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+ if (copy_from_user(&ip6rd, data, sizeof(ip6rd)))
+ return -EFAULT;
+
+ if (cmd != SIOCDEL6RD) {
+ err = ipip6_tunnel_update_6rd(t, &ip6rd);
+ if (err < 0)
+ return err;
+ } else
+ ipip6_tunnel_clone_6rd(dev, dev_to_sit_net(dev));
return 0;
+}
-tx_error_icmp:
- dst_link_failure(skb);
-tx_error:
- stats->tx_errors++;
- dev_kfree_skb(skb);
- tunnel->recursion--;
+#endif /* CONFIG_IPV6_SIT_6RD */
+
+static bool ipip6_valid_ip_proto(u8 ipproto)
+{
+ return ipproto == IPPROTO_IPV6 ||
+ ipproto == IPPROTO_IPIP ||
+#if IS_ENABLED(CONFIG_MPLS)
+ ipproto == IPPROTO_MPLS ||
+#endif
+ ipproto == 0;
+}
+
+static int
+__ipip6_tunnel_ioctl_validate(struct net *net, struct ip_tunnel_parm_kern *p)
+{
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (!ipip6_valid_ip_proto(p->iph.protocol))
+ return -EINVAL;
+ if (p->iph.version != 4 ||
+ p->iph.ihl != 5 || (p->iph.frag_off & htons(~IP_DF)))
+ return -EINVAL;
+
+ if (p->iph.ttl)
+ p->iph.frag_off |= htons(IP_DF);
return 0;
}
static int
-ipip6_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
+ipip6_tunnel_get(struct net_device *dev, struct ip_tunnel_parm_kern *p)
{
- int err = 0;
- struct ip_tunnel_parm p;
- struct ip_tunnel *t;
+ struct ip_tunnel *t = netdev_priv(dev);
- switch (cmd) {
- case SIOCGETTUNNEL:
- t = NULL;
- if (dev == ipip6_fb_tunnel_dev) {
- if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
- err = -EFAULT;
- break;
- }
- t = ipip6_tunnel_locate(&p, 0);
- }
- if (t == NULL)
- t = netdev_priv(dev);
- memcpy(&p, &t->parms, sizeof(p));
- if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
- err = -EFAULT;
- break;
+ if (dev == dev_to_sit_net(dev)->fb_tunnel_dev)
+ t = ipip6_tunnel_locate(t->net, p, 0);
+ if (!t)
+ t = netdev_priv(dev);
+ memcpy(p, &t->parms, sizeof(*p));
+ return 0;
+}
- case SIOCADDTUNNEL:
- case SIOCCHGTUNNEL:
- err = -EPERM;
- if (!capable(CAP_NET_ADMIN))
- goto done;
-
- err = -EFAULT;
- if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
- goto done;
-
- err = -EINVAL;
- if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPV6 ||
- p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)))
- goto done;
- if (p.iph.ttl)
- p.iph.frag_off |= htons(IP_DF);
-
- t = ipip6_tunnel_locate(&p, cmd == SIOCADDTUNNEL);
-
- if (dev != ipip6_fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
- if (t != NULL) {
- if (t->dev != dev) {
- err = -EEXIST;
- break;
- }
- } else {
- if (((dev->flags&IFF_POINTOPOINT) && !p.iph.daddr) ||
- (!(dev->flags&IFF_POINTOPOINT) && p.iph.daddr)) {
- err = -EINVAL;
- break;
- }
- t = netdev_priv(dev);
- ipip6_tunnel_unlink(t);
- t->parms.iph.saddr = p.iph.saddr;
- t->parms.iph.daddr = p.iph.daddr;
- memcpy(dev->dev_addr, &p.iph.saddr, 4);
- memcpy(dev->broadcast, &p.iph.daddr, 4);
- ipip6_tunnel_link(t);
- netdev_state_change(dev);
- }
- }
+static int
+ipip6_tunnel_add(struct net_device *dev, struct ip_tunnel_parm_kern *p)
+{
+ struct ip_tunnel *t = netdev_priv(dev);
+ int err;
- if (t) {
- err = 0;
- if (cmd == SIOCCHGTUNNEL) {
- t->parms.iph.ttl = p.iph.ttl;
- t->parms.iph.tos = p.iph.tos;
- }
- if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
- err = -EFAULT;
- } else
- err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
- break;
+ err = __ipip6_tunnel_ioctl_validate(t->net, p);
+ if (err)
+ return err;
- case SIOCDELTUNNEL:
- err = -EPERM;
- if (!capable(CAP_NET_ADMIN))
- goto done;
-
- if (dev == ipip6_fb_tunnel_dev) {
- err = -EFAULT;
- if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
- goto done;
- err = -ENOENT;
- if ((t = ipip6_tunnel_locate(&p, 0)) == NULL)
- goto done;
- err = -EPERM;
- if (t == netdev_priv(ipip6_fb_tunnel_dev))
- goto done;
- dev = t->dev;
+ t = ipip6_tunnel_locate(t->net, p, 1);
+ if (!t)
+ return -ENOBUFS;
+ return 0;
+}
+
+static int
+ipip6_tunnel_change(struct net_device *dev, struct ip_tunnel_parm_kern *p)
+{
+ struct ip_tunnel *t = netdev_priv(dev);
+ int err;
+
+ err = __ipip6_tunnel_ioctl_validate(t->net, p);
+ if (err)
+ return err;
+
+ t = ipip6_tunnel_locate(t->net, p, 0);
+ if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) {
+ if (!t)
+ return -ENOENT;
+ } else {
+ if (t) {
+ if (t->dev != dev)
+ return -EEXIST;
+ } else {
+ if (((dev->flags & IFF_POINTOPOINT) && !p->iph.daddr) ||
+ (!(dev->flags & IFF_POINTOPOINT) && p->iph.daddr))
+ return -EINVAL;
+ t = netdev_priv(dev);
}
- err = unregister_netdevice(dev);
- break;
- default:
- err = -EINVAL;
+ ipip6_tunnel_update(t, p, t->fwmark);
}
-done:
- return err;
+ return 0;
}
-static struct net_device_stats *ipip6_tunnel_get_stats(struct net_device *dev)
+static int
+ipip6_tunnel_del(struct net_device *dev, struct ip_tunnel_parm_kern *p)
{
- return &(((struct ip_tunnel*)netdev_priv(dev))->stat);
+ struct ip_tunnel *t = netdev_priv(dev);
+
+ if (!ns_capable(t->net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) {
+ t = ipip6_tunnel_locate(t->net, p, 0);
+ if (!t)
+ return -ENOENT;
+ if (t == netdev_priv(dev_to_sit_net(dev)->fb_tunnel_dev))
+ return -EPERM;
+ dev = t->dev;
+ }
+ unregister_netdevice(dev);
+ return 0;
}
-static int ipip6_tunnel_change_mtu(struct net_device *dev, int new_mtu)
+static int
+ipip6_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p,
+ int cmd)
{
- if (new_mtu < IPV6_MIN_MTU || new_mtu > 0xFFF8 - sizeof(struct iphdr))
+ switch (cmd) {
+ case SIOCGETTUNNEL:
+ return ipip6_tunnel_get(dev, p);
+ case SIOCADDTUNNEL:
+ return ipip6_tunnel_add(dev, p);
+ case SIOCCHGTUNNEL:
+ return ipip6_tunnel_change(dev, p);
+ case SIOCDELTUNNEL:
+ return ipip6_tunnel_del(dev, p);
+ default:
return -EINVAL;
- dev->mtu = new_mtu;
- return 0;
+ }
+}
+
+static int
+ipip6_tunnel_siocdevprivate(struct net_device *dev, struct ifreq *ifr,
+ void __user *data, int cmd)
+{
+ switch (cmd) {
+ case SIOCGETTUNNEL:
+ case SIOCADDTUNNEL:
+ case SIOCCHGTUNNEL:
+ case SIOCDELTUNNEL:
+ return ip_tunnel_siocdevprivate(dev, ifr, data, cmd);
+ case SIOCGETPRL:
+ return ipip6_tunnel_get_prl(dev, data);
+ case SIOCADDPRL:
+ case SIOCDELPRL:
+ case SIOCCHGPRL:
+ return ipip6_tunnel_prl_ctl(dev, data, cmd);
+#ifdef CONFIG_IPV6_SIT_6RD
+ case SIOCGET6RD:
+ return ipip6_tunnel_get6rd(dev, data);
+ case SIOCADD6RD:
+ case SIOCCHG6RD:
+ case SIOCDEL6RD:
+ return ipip6_tunnel_6rdctl(dev, data, cmd);
+#endif
+ default:
+ return -EINVAL;
+ }
}
+static const struct net_device_ops ipip6_netdev_ops = {
+ .ndo_init = ipip6_tunnel_init,
+ .ndo_uninit = ipip6_tunnel_uninit,
+ .ndo_start_xmit = sit_tunnel_xmit,
+ .ndo_siocdevprivate = ipip6_tunnel_siocdevprivate,
+ .ndo_get_iflink = ip_tunnel_get_iflink,
+ .ndo_tunnel_ctl = ipip6_tunnel_ctl,
+};
+
+static void ipip6_dev_free(struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+
+ dst_cache_destroy(&tunnel->dst_cache);
+}
+
+#define SIT_FEATURES (NETIF_F_SG | \
+ NETIF_F_FRAGLIST | \
+ NETIF_F_HIGHDMA | \
+ NETIF_F_GSO_SOFTWARE | \
+ NETIF_F_HW_CSUM)
+
static void ipip6_tunnel_setup(struct net_device *dev)
{
- SET_MODULE_OWNER(dev);
- dev->uninit = ipip6_tunnel_uninit;
- dev->destructor = free_netdev;
- dev->hard_start_xmit = ipip6_tunnel_xmit;
- dev->get_stats = ipip6_tunnel_get_stats;
- dev->do_ioctl = ipip6_tunnel_ioctl;
- dev->change_mtu = ipip6_tunnel_change_mtu;
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ int t_hlen = tunnel->hlen + sizeof(struct iphdr);
+
+ dev->netdev_ops = &ipip6_netdev_ops;
+ dev->header_ops = &ip_tunnel_header_ops;
+ dev->needs_free_netdev = true;
+ dev->priv_destructor = ipip6_dev_free;
dev->type = ARPHRD_SIT;
- dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr);
- dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr);
+ dev->mtu = ETH_DATA_LEN - t_hlen;
+ dev->min_mtu = IPV6_MIN_MTU;
+ dev->max_mtu = IP6_MAX_MTU - t_hlen;
dev->flags = IFF_NOARP;
- dev->iflink = 0;
+ netif_keep_dst(dev);
dev->addr_len = 4;
+ dev->lltx = true;
+ dev->features |= SIT_FEATURES;
+ dev->hw_features |= SIT_FEATURES;
+ dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS;
+
}
static int ipip6_tunnel_init(struct net_device *dev)
{
- struct net_device *tdev = NULL;
- struct ip_tunnel *tunnel;
- struct iphdr *iph;
-
- tunnel = netdev_priv(dev);
- iph = &tunnel->parms.iph;
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ int err;
tunnel->dev = dev;
strcpy(tunnel->parms.name, dev->name);
- memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
- memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
-
- if (iph->daddr) {
- struct flowi fl = { .nl_u = { .ip4_u =
- { .daddr = iph->daddr,
- .saddr = iph->saddr,
- .tos = RT_TOS(iph->tos) } },
- .oif = tunnel->parms.link,
- .proto = IPPROTO_IPV6 };
- struct rtable *rt;
- if (!ip_route_output_key(&rt, &fl)) {
- tdev = rt->u.dst.dev;
- ip_rt_put(rt);
- }
- dev->flags |= IFF_POINTOPOINT;
- }
-
- if (!tdev && tunnel->parms.link)
- tdev = __dev_get_by_index(tunnel->parms.link);
+ ipip6_tunnel_bind_dev(dev);
- if (tdev) {
- dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr);
- dev->mtu = tdev->mtu - sizeof(struct iphdr);
- if (dev->mtu < IPV6_MIN_MTU)
- dev->mtu = IPV6_MIN_MTU;
- }
- dev->iflink = tunnel->parms.link;
+ err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
+ if (err)
+ return err;
+ netdev_hold(dev, &tunnel->dev_tracker, GFP_KERNEL);
+ netdev_lockdep_set_classes(dev);
return 0;
}
-static int __init ipip6_fb_tunnel_init(struct net_device *dev)
+static void __net_init ipip6_fb_tunnel_init(struct net_device *dev)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
struct iphdr *iph = &tunnel->parms.iph;
-
- tunnel->dev = dev;
- strcpy(tunnel->parms.name, dev->name);
+ struct net *net = dev_net(dev);
+ struct sit_net *sitn = net_generic(net, sit_net_id);
iph->version = 4;
iph->protocol = IPPROTO_IPV6;
iph->ihl = 5;
iph->ttl = 64;
- dev_hold(dev);
- tunnels_wc[0] = tunnel;
+ rcu_assign_pointer(sitn->tunnels_wc[0], tunnel);
+}
+
+static int ipip6_validate(struct nlattr *tb[], struct nlattr *data[],
+ struct netlink_ext_ack *extack)
+{
+ u8 proto;
+
+ if (!data || !data[IFLA_IPTUN_PROTO])
+ return 0;
+
+ proto = nla_get_u8(data[IFLA_IPTUN_PROTO]);
+ if (!ipip6_valid_ip_proto(proto))
+ return -EINVAL;
+
+ return 0;
+}
+
+static void ipip6_netlink_parms(struct nlattr *data[],
+ struct ip_tunnel_parm_kern *parms,
+ __u32 *fwmark)
+{
+ memset(parms, 0, sizeof(*parms));
+
+ parms->iph.version = 4;
+ parms->iph.protocol = IPPROTO_IPV6;
+ parms->iph.ihl = 5;
+ parms->iph.ttl = 64;
+
+ if (!data)
+ return;
+
+ ip_tunnel_netlink_parms(data, parms);
+
+ if (data[IFLA_IPTUN_FWMARK])
+ *fwmark = nla_get_u32(data[IFLA_IPTUN_FWMARK]);
+}
+
+#ifdef CONFIG_IPV6_SIT_6RD
+/* This function returns true when 6RD attributes are present in the nl msg */
+static bool ipip6_netlink_6rd_parms(struct nlattr *data[],
+ struct ip_tunnel_6rd *ip6rd)
+{
+ bool ret = false;
+ memset(ip6rd, 0, sizeof(*ip6rd));
+
+ if (!data)
+ return ret;
+
+ if (data[IFLA_IPTUN_6RD_PREFIX]) {
+ ret = true;
+ ip6rd->prefix = nla_get_in6_addr(data[IFLA_IPTUN_6RD_PREFIX]);
+ }
+
+ if (data[IFLA_IPTUN_6RD_RELAY_PREFIX]) {
+ ret = true;
+ ip6rd->relay_prefix =
+ nla_get_be32(data[IFLA_IPTUN_6RD_RELAY_PREFIX]);
+ }
+
+ if (data[IFLA_IPTUN_6RD_PREFIXLEN]) {
+ ret = true;
+ ip6rd->prefixlen = nla_get_u16(data[IFLA_IPTUN_6RD_PREFIXLEN]);
+ }
+
+ if (data[IFLA_IPTUN_6RD_RELAY_PREFIXLEN]) {
+ ret = true;
+ ip6rd->relay_prefixlen =
+ nla_get_u16(data[IFLA_IPTUN_6RD_RELAY_PREFIXLEN]);
+ }
+
+ return ret;
+}
+#endif
+
+static int ipip6_newlink(struct net_device *dev,
+ struct rtnl_newlink_params *params,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr **data = params->data;
+ struct nlattr **tb = params->tb;
+ struct ip_tunnel *nt;
+ struct ip_tunnel_encap ipencap;
+#ifdef CONFIG_IPV6_SIT_6RD
+ struct ip_tunnel_6rd ip6rd;
+#endif
+ struct net *net;
+ int err;
+
+ net = params->link_net ? : dev_net(dev);
+ nt = netdev_priv(dev);
+ nt->net = net;
+
+ if (ip_tunnel_netlink_encap_parms(data, &ipencap)) {
+ err = ip_tunnel_encap_setup(nt, &ipencap);
+ if (err < 0)
+ return err;
+ }
+
+ ipip6_netlink_parms(data, &nt->parms, &nt->fwmark);
+
+ if (ipip6_tunnel_locate(net, &nt->parms, 0))
+ return -EEXIST;
+
+ err = ipip6_tunnel_create(dev);
+ if (err < 0)
+ return err;
+
+ if (tb[IFLA_MTU]) {
+ u32 mtu = nla_get_u32(tb[IFLA_MTU]);
+
+ if (mtu >= IPV6_MIN_MTU &&
+ mtu <= IP6_MAX_MTU - dev->hard_header_len)
+ dev->mtu = mtu;
+ }
+
+#ifdef CONFIG_IPV6_SIT_6RD
+ if (ipip6_netlink_6rd_parms(data, &ip6rd)) {
+ err = ipip6_tunnel_update_6rd(nt, &ip6rd);
+ if (err < 0)
+ unregister_netdevice_queue(dev, NULL);
+ }
+#endif
+
+ return err;
+}
+
+static int ipip6_changelink(struct net_device *dev, struct nlattr *tb[],
+ struct nlattr *data[],
+ struct netlink_ext_ack *extack)
+{
+ struct ip_tunnel *t = netdev_priv(dev);
+ struct ip_tunnel_encap ipencap;
+ struct ip_tunnel_parm_kern p;
+ struct net *net = t->net;
+ struct sit_net *sitn = net_generic(net, sit_net_id);
+#ifdef CONFIG_IPV6_SIT_6RD
+ struct ip_tunnel_6rd ip6rd;
+#endif
+ __u32 fwmark = t->fwmark;
+ int err;
+
+ if (dev == sitn->fb_tunnel_dev)
+ return -EINVAL;
+
+ if (ip_tunnel_netlink_encap_parms(data, &ipencap)) {
+ err = ip_tunnel_encap_setup(t, &ipencap);
+ if (err < 0)
+ return err;
+ }
+
+ ipip6_netlink_parms(data, &p, &fwmark);
+
+ if (((dev->flags & IFF_POINTOPOINT) && !p.iph.daddr) ||
+ (!(dev->flags & IFF_POINTOPOINT) && p.iph.daddr))
+ return -EINVAL;
+
+ t = ipip6_tunnel_locate(net, &p, 0);
+
+ if (t) {
+ if (t->dev != dev)
+ return -EEXIST;
+ } else
+ t = netdev_priv(dev);
+
+ ipip6_tunnel_update(t, &p, fwmark);
+
+#ifdef CONFIG_IPV6_SIT_6RD
+ if (ipip6_netlink_6rd_parms(data, &ip6rd))
+ return ipip6_tunnel_update_6rd(t, &ip6rd);
+#endif
+
return 0;
}
-static struct net_protocol sit_protocol = {
+static size_t ipip6_get_size(const struct net_device *dev)
+{
+ return
+ /* IFLA_IPTUN_LINK */
+ nla_total_size(4) +
+ /* IFLA_IPTUN_LOCAL */
+ nla_total_size(4) +
+ /* IFLA_IPTUN_REMOTE */
+ nla_total_size(4) +
+ /* IFLA_IPTUN_TTL */
+ nla_total_size(1) +
+ /* IFLA_IPTUN_TOS */
+ nla_total_size(1) +
+ /* IFLA_IPTUN_PMTUDISC */
+ nla_total_size(1) +
+ /* IFLA_IPTUN_FLAGS */
+ nla_total_size(2) +
+ /* IFLA_IPTUN_PROTO */
+ nla_total_size(1) +
+#ifdef CONFIG_IPV6_SIT_6RD
+ /* IFLA_IPTUN_6RD_PREFIX */
+ nla_total_size(sizeof(struct in6_addr)) +
+ /* IFLA_IPTUN_6RD_RELAY_PREFIX */
+ nla_total_size(4) +
+ /* IFLA_IPTUN_6RD_PREFIXLEN */
+ nla_total_size(2) +
+ /* IFLA_IPTUN_6RD_RELAY_PREFIXLEN */
+ nla_total_size(2) +
+#endif
+ /* IFLA_IPTUN_ENCAP_TYPE */
+ nla_total_size(2) +
+ /* IFLA_IPTUN_ENCAP_FLAGS */
+ nla_total_size(2) +
+ /* IFLA_IPTUN_ENCAP_SPORT */
+ nla_total_size(2) +
+ /* IFLA_IPTUN_ENCAP_DPORT */
+ nla_total_size(2) +
+ /* IFLA_IPTUN_FWMARK */
+ nla_total_size(4) +
+ 0;
+}
+
+static int ipip6_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ struct ip_tunnel_parm_kern *parm = &tunnel->parms;
+
+ if (nla_put_u32(skb, IFLA_IPTUN_LINK, parm->link) ||
+ nla_put_in_addr(skb, IFLA_IPTUN_LOCAL, parm->iph.saddr) ||
+ nla_put_in_addr(skb, IFLA_IPTUN_REMOTE, parm->iph.daddr) ||
+ nla_put_u8(skb, IFLA_IPTUN_TTL, parm->iph.ttl) ||
+ nla_put_u8(skb, IFLA_IPTUN_TOS, parm->iph.tos) ||
+ nla_put_u8(skb, IFLA_IPTUN_PMTUDISC,
+ !!(parm->iph.frag_off & htons(IP_DF))) ||
+ nla_put_u8(skb, IFLA_IPTUN_PROTO, parm->iph.protocol) ||
+ nla_put_be16(skb, IFLA_IPTUN_FLAGS,
+ ip_tunnel_flags_to_be16(parm->i_flags)) ||
+ nla_put_u32(skb, IFLA_IPTUN_FWMARK, tunnel->fwmark))
+ goto nla_put_failure;
+
+#ifdef CONFIG_IPV6_SIT_6RD
+ if (nla_put_in6_addr(skb, IFLA_IPTUN_6RD_PREFIX,
+ &tunnel->ip6rd.prefix) ||
+ nla_put_in_addr(skb, IFLA_IPTUN_6RD_RELAY_PREFIX,
+ tunnel->ip6rd.relay_prefix) ||
+ nla_put_u16(skb, IFLA_IPTUN_6RD_PREFIXLEN,
+ tunnel->ip6rd.prefixlen) ||
+ nla_put_u16(skb, IFLA_IPTUN_6RD_RELAY_PREFIXLEN,
+ tunnel->ip6rd.relay_prefixlen))
+ goto nla_put_failure;
+#endif
+
+ if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE,
+ tunnel->encap.type) ||
+ nla_put_be16(skb, IFLA_IPTUN_ENCAP_SPORT,
+ tunnel->encap.sport) ||
+ nla_put_be16(skb, IFLA_IPTUN_ENCAP_DPORT,
+ tunnel->encap.dport) ||
+ nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS,
+ tunnel->encap.flags))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+static const struct nla_policy ipip6_policy[IFLA_IPTUN_MAX + 1] = {
+ [IFLA_IPTUN_LINK] = { .type = NLA_U32 },
+ [IFLA_IPTUN_LOCAL] = { .type = NLA_U32 },
+ [IFLA_IPTUN_REMOTE] = { .type = NLA_U32 },
+ [IFLA_IPTUN_TTL] = { .type = NLA_U8 },
+ [IFLA_IPTUN_TOS] = { .type = NLA_U8 },
+ [IFLA_IPTUN_PMTUDISC] = { .type = NLA_U8 },
+ [IFLA_IPTUN_FLAGS] = { .type = NLA_U16 },
+ [IFLA_IPTUN_PROTO] = { .type = NLA_U8 },
+#ifdef CONFIG_IPV6_SIT_6RD
+ [IFLA_IPTUN_6RD_PREFIX] = { .len = sizeof(struct in6_addr) },
+ [IFLA_IPTUN_6RD_RELAY_PREFIX] = { .type = NLA_U32 },
+ [IFLA_IPTUN_6RD_PREFIXLEN] = { .type = NLA_U16 },
+ [IFLA_IPTUN_6RD_RELAY_PREFIXLEN] = { .type = NLA_U16 },
+#endif
+ [IFLA_IPTUN_ENCAP_TYPE] = { .type = NLA_U16 },
+ [IFLA_IPTUN_ENCAP_FLAGS] = { .type = NLA_U16 },
+ [IFLA_IPTUN_ENCAP_SPORT] = { .type = NLA_U16 },
+ [IFLA_IPTUN_ENCAP_DPORT] = { .type = NLA_U16 },
+ [IFLA_IPTUN_FWMARK] = { .type = NLA_U32 },
+};
+
+static void ipip6_dellink(struct net_device *dev, struct list_head *head)
+{
+ struct net *net = dev_net(dev);
+ struct sit_net *sitn = net_generic(net, sit_net_id);
+
+ if (dev != sitn->fb_tunnel_dev)
+ unregister_netdevice_queue(dev, head);
+}
+
+static struct rtnl_link_ops sit_link_ops __read_mostly = {
+ .kind = "sit",
+ .maxtype = IFLA_IPTUN_MAX,
+ .policy = ipip6_policy,
+ .priv_size = sizeof(struct ip_tunnel),
+ .setup = ipip6_tunnel_setup,
+ .validate = ipip6_validate,
+ .newlink = ipip6_newlink,
+ .changelink = ipip6_changelink,
+ .get_size = ipip6_get_size,
+ .fill_info = ipip6_fill_info,
+ .dellink = ipip6_dellink,
+ .get_link_net = ip_tunnel_get_link_net,
+};
+
+static struct xfrm_tunnel sit_handler __read_mostly = {
.handler = ipip6_rcv,
.err_handler = ipip6_err,
+ .priority = 1,
};
-static void __exit sit_destroy_tunnels(void)
+static struct xfrm_tunnel ipip_handler __read_mostly = {
+ .handler = ipip_rcv,
+ .err_handler = ipip6_err,
+ .priority = 2,
+};
+
+#if IS_ENABLED(CONFIG_MPLS)
+static struct xfrm_tunnel mplsip_handler __read_mostly = {
+ .handler = mplsip_rcv,
+ .err_handler = ipip6_err,
+ .priority = 2,
+};
+#endif
+
+static void __net_exit sit_exit_rtnl_net(struct net *net, struct list_head *head)
{
+ struct sit_net *sitn = net_generic(net, sit_net_id);
+ struct net_device *dev, *aux;
int prio;
- for (prio = 1; prio < 4; prio++) {
+ for_each_netdev_safe(net, dev, aux)
+ if (dev->rtnl_link_ops == &sit_link_ops)
+ unregister_netdevice_queue(dev, head);
+
+ for (prio = 0; prio < 4; prio++) {
int h;
- for (h = 0; h < HASH_SIZE; h++) {
+ for (h = 0; h < (prio ? IP6_SIT_HASH_SIZE : 1); h++) {
struct ip_tunnel *t;
- while ((t = tunnels[prio][h]) != NULL)
- unregister_netdevice(t->dev);
+
+ t = rtnl_net_dereference(net, sitn->tunnels[prio][h]);
+ while (t) {
+ /* If dev is in the same netns, it has already
+ * been added to the list by the previous loop.
+ */
+ if (!net_eq(dev_net(t->dev), net))
+ unregister_netdevice_queue(t->dev, head);
+
+ t = rtnl_net_dereference(net, t->next);
+ }
}
}
}
+static int __net_init sit_init_net(struct net *net)
+{
+ struct sit_net *sitn = net_generic(net, sit_net_id);
+ struct ip_tunnel *t;
+ int err;
+
+ sitn->tunnels[0] = sitn->tunnels_wc;
+ sitn->tunnels[1] = sitn->tunnels_l;
+ sitn->tunnels[2] = sitn->tunnels_r;
+ sitn->tunnels[3] = sitn->tunnels_r_l;
+
+ if (!net_has_fallback_tunnels(net))
+ return 0;
+
+ sitn->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "sit0",
+ NET_NAME_UNKNOWN,
+ ipip6_tunnel_setup);
+ if (!sitn->fb_tunnel_dev) {
+ err = -ENOMEM;
+ goto err_alloc_dev;
+ }
+ dev_net_set(sitn->fb_tunnel_dev, net);
+ sitn->fb_tunnel_dev->rtnl_link_ops = &sit_link_ops;
+ /* FB netdevice is special: we have one, and only one per netns.
+ * Allowing to move it to another netns is clearly unsafe.
+ */
+ sitn->fb_tunnel_dev->netns_immutable = true;
+
+ t = netdev_priv(sitn->fb_tunnel_dev);
+ t->net = net;
+
+ err = register_netdev(sitn->fb_tunnel_dev);
+ if (err)
+ goto err_reg_dev;
+
+ ipip6_tunnel_clone_6rd(sitn->fb_tunnel_dev, sitn);
+ ipip6_fb_tunnel_init(sitn->fb_tunnel_dev);
+
+ strcpy(t->parms.name, sitn->fb_tunnel_dev->name);
+ return 0;
+
+err_reg_dev:
+ free_netdev(sitn->fb_tunnel_dev);
+err_alloc_dev:
+ return err;
+}
+
+static struct pernet_operations sit_net_ops = {
+ .init = sit_init_net,
+ .exit_rtnl = sit_exit_rtnl_net,
+ .id = &sit_net_id,
+ .size = sizeof(struct sit_net),
+};
+
static void __exit sit_cleanup(void)
{
- inet_del_protocol(&sit_protocol, IPPROTO_IPV6);
+ rtnl_link_unregister(&sit_link_ops);
+ xfrm4_tunnel_deregister(&sit_handler, AF_INET6);
+ xfrm4_tunnel_deregister(&ipip_handler, AF_INET);
+#if IS_ENABLED(CONFIG_MPLS)
+ xfrm4_tunnel_deregister(&mplsip_handler, AF_MPLS);
+#endif
- rtnl_lock();
- sit_destroy_tunnels();
- unregister_netdevice(ipip6_fb_tunnel_dev);
- rtnl_unlock();
+ unregister_pernet_device(&sit_net_ops);
+ rcu_barrier(); /* Wait for completion of call_rcu()'s */
}
static int __init sit_init(void)
{
int err;
- printk(KERN_INFO "IPv6 over IPv4 tunneling driver\n");
+ pr_info("IPv6, IPv4 and MPLS over IPv4 tunneling driver\n");
- if (inet_add_protocol(&sit_protocol, IPPROTO_IPV6) < 0) {
- printk(KERN_INFO "sit init: Can't add protocol\n");
- return -EAGAIN;
+ err = register_pernet_device(&sit_net_ops);
+ if (err < 0)
+ return err;
+ err = xfrm4_tunnel_register(&sit_handler, AF_INET6);
+ if (err < 0) {
+ pr_info("%s: can't register ip6ip4\n", __func__);
+ goto xfrm_tunnel_failed;
}
-
- ipip6_fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "sit0",
- ipip6_tunnel_setup);
- if (!ipip6_fb_tunnel_dev) {
- err = -ENOMEM;
- goto err1;
+ err = xfrm4_tunnel_register(&ipip_handler, AF_INET);
+ if (err < 0) {
+ pr_info("%s: can't register ip4ip4\n", __func__);
+ goto xfrm_tunnel4_failed;
}
+#if IS_ENABLED(CONFIG_MPLS)
+ err = xfrm4_tunnel_register(&mplsip_handler, AF_MPLS);
+ if (err < 0) {
+ pr_info("%s: can't register mplsip\n", __func__);
+ goto xfrm_tunnel_mpls_failed;
+ }
+#endif
+ err = rtnl_link_register(&sit_link_ops);
+ if (err < 0)
+ goto rtnl_link_failed;
- ipip6_fb_tunnel_dev->init = ipip6_fb_tunnel_init;
-
- if ((err = register_netdev(ipip6_fb_tunnel_dev)))
- goto err2;
-
- out:
+out:
return err;
- err2:
- free_netdev(ipip6_fb_tunnel_dev);
- err1:
- inet_del_protocol(&sit_protocol, IPPROTO_IPV6);
+
+rtnl_link_failed:
+#if IS_ENABLED(CONFIG_MPLS)
+ xfrm4_tunnel_deregister(&mplsip_handler, AF_MPLS);
+xfrm_tunnel_mpls_failed:
+#endif
+ xfrm4_tunnel_deregister(&ipip_handler, AF_INET);
+xfrm_tunnel4_failed:
+ xfrm4_tunnel_deregister(&sit_handler, AF_INET6);
+xfrm_tunnel_failed:
+ unregister_pernet_device(&sit_net_ops);
goto out;
}
module_init(sit_init);
module_exit(sit_cleanup);
+MODULE_DESCRIPTION("IPv6-in-IPv4 tunnel SIT driver");
MODULE_LICENSE("GPL");
-MODULE_ALIAS("sit0");
+MODULE_ALIAS_RTNL_LINK("sit");
+MODULE_ALIAS_NETDEV("sit0");
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
new file mode 100644
index 000000000000..7e007f013ec8
--- /dev/null
+++ b/net/ipv6/syncookies.c
@@ -0,0 +1,282 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * IPv6 Syncookies implementation for the Linux kernel
+ *
+ * Authors:
+ * Glenn Griffin <ggriffin.kernel@gmail.com>
+ *
+ * Based on IPv4 implementation by Andi Kleen
+ * linux/net/ipv4/syncookies.c
+ */
+
+#include <linux/tcp.h>
+#include <linux/random.h>
+#include <linux/siphash.h>
+#include <linux/kernel.h>
+#include <net/secure_seq.h>
+#include <net/ipv6.h>
+#include <net/tcp.h>
+#include <net/tcp_ecn.h>
+
+#define COOKIEBITS 24 /* Upper bits store count */
+#define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1)
+
+static siphash_aligned_key_t syncookie6_secret[2];
+
+/* RFC 2460, Section 8.3:
+ * [ipv6 tcp] MSS must be computed as the maximum packet size minus 60 [..]
+ *
+ * Due to IPV6_MIN_MTU=1280 the lowest possible MSS is 1220, which allows
+ * using higher values than ipv4 tcp syncookies.
+ * The other values are chosen based on ethernet (1500 and 9k MTU), plus
+ * one that accounts for common encap (PPPoe) overhead. Table must be sorted.
+ */
+static __u16 const msstab[] = {
+ 1280 - 60, /* IPV6_MIN_MTU - 60 */
+ 1480 - 60,
+ 1500 - 60,
+ 9000 - 60,
+};
+
+static u32 cookie_hash(const struct in6_addr *saddr,
+ const struct in6_addr *daddr,
+ __be16 sport, __be16 dport, u32 count, int c)
+{
+ const struct {
+ struct in6_addr saddr;
+ struct in6_addr daddr;
+ u32 count;
+ __be16 sport;
+ __be16 dport;
+ } __aligned(SIPHASH_ALIGNMENT) combined = {
+ .saddr = *saddr,
+ .daddr = *daddr,
+ .count = count,
+ .sport = sport,
+ .dport = dport
+ };
+
+ net_get_random_once(syncookie6_secret, sizeof(syncookie6_secret));
+ return siphash(&combined, offsetofend(typeof(combined), dport),
+ &syncookie6_secret[c]);
+}
+
+static __u32 secure_tcp_syn_cookie(const struct in6_addr *saddr,
+ const struct in6_addr *daddr,
+ __be16 sport, __be16 dport, __u32 sseq,
+ __u32 data)
+{
+ u32 count = tcp_cookie_time();
+ return (cookie_hash(saddr, daddr, sport, dport, 0, 0) +
+ sseq + (count << COOKIEBITS) +
+ ((cookie_hash(saddr, daddr, sport, dport, count, 1) + data)
+ & COOKIEMASK));
+}
+
+static __u32 check_tcp_syn_cookie(__u32 cookie, const struct in6_addr *saddr,
+ const struct in6_addr *daddr, __be16 sport,
+ __be16 dport, __u32 sseq)
+{
+ __u32 diff, count = tcp_cookie_time();
+
+ cookie -= cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq;
+
+ diff = (count - (cookie >> COOKIEBITS)) & ((__u32) -1 >> COOKIEBITS);
+ if (diff >= MAX_SYNCOOKIE_AGE)
+ return (__u32)-1;
+
+ return (cookie -
+ cookie_hash(saddr, daddr, sport, dport, count - diff, 1))
+ & COOKIEMASK;
+}
+
+u32 __cookie_v6_init_sequence(const struct ipv6hdr *iph,
+ const struct tcphdr *th, __u16 *mssp)
+{
+ int mssind;
+ const __u16 mss = *mssp;
+
+ for (mssind = ARRAY_SIZE(msstab) - 1; mssind ; mssind--)
+ if (mss >= msstab[mssind])
+ break;
+
+ *mssp = msstab[mssind];
+
+ return secure_tcp_syn_cookie(&iph->saddr, &iph->daddr, th->source,
+ th->dest, ntohl(th->seq), mssind);
+}
+EXPORT_SYMBOL_GPL(__cookie_v6_init_sequence);
+
+__u32 cookie_v6_init_sequence(const struct sk_buff *skb, __u16 *mssp)
+{
+ const struct ipv6hdr *iph = ipv6_hdr(skb);
+ const struct tcphdr *th = tcp_hdr(skb);
+
+ return __cookie_v6_init_sequence(iph, th, mssp);
+}
+
+int __cookie_v6_check(const struct ipv6hdr *iph, const struct tcphdr *th)
+{
+ __u32 cookie = ntohl(th->ack_seq) - 1;
+ __u32 seq = ntohl(th->seq) - 1;
+ __u32 mssind;
+
+ mssind = check_tcp_syn_cookie(cookie, &iph->saddr, &iph->daddr,
+ th->source, th->dest, seq);
+
+ return mssind < ARRAY_SIZE(msstab) ? msstab[mssind] : 0;
+}
+EXPORT_SYMBOL_GPL(__cookie_v6_check);
+
+static struct request_sock *cookie_tcp_check(struct net *net, struct sock *sk,
+ struct sk_buff *skb)
+{
+ struct tcp_options_received tcp_opt;
+ u32 tsoff = 0;
+ int mss;
+
+ if (tcp_synq_no_recent_overflow(sk))
+ goto out;
+
+ mss = __cookie_v6_check(ipv6_hdr(skb), tcp_hdr(skb));
+ if (!mss) {
+ __NET_INC_STATS(net, LINUX_MIB_SYNCOOKIESFAILED);
+ goto out;
+ }
+
+ __NET_INC_STATS(net, LINUX_MIB_SYNCOOKIESRECV);
+
+ /* check for timestamp cookie support */
+ memset(&tcp_opt, 0, sizeof(tcp_opt));
+ tcp_parse_options(net, skb, &tcp_opt, 0, NULL);
+
+ if (tcp_opt.saw_tstamp && tcp_opt.rcv_tsecr) {
+ tsoff = secure_tcpv6_ts_off(net,
+ ipv6_hdr(skb)->daddr.s6_addr32,
+ ipv6_hdr(skb)->saddr.s6_addr32);
+ tcp_opt.rcv_tsecr -= tsoff;
+ }
+
+ if (!cookie_timestamp_decode(net, &tcp_opt))
+ goto out;
+
+ return cookie_tcp_reqsk_alloc(&tcp6_request_sock_ops, sk, skb,
+ &tcp_opt, mss, tsoff);
+out:
+ return ERR_PTR(-EINVAL);
+}
+
+struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
+{
+ const struct tcphdr *th = tcp_hdr(skb);
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_request_sock *ireq;
+ struct net *net = sock_net(sk);
+ struct request_sock *req;
+ struct dst_entry *dst;
+ struct sock *ret = sk;
+ __u8 rcv_wscale;
+ int full_space;
+ SKB_DR(reason);
+
+ if (!READ_ONCE(net->ipv4.sysctl_tcp_syncookies) ||
+ !th->ack || th->rst)
+ goto out;
+
+ if (cookie_bpf_ok(skb)) {
+ req = cookie_bpf_check(sk, skb);
+ } else {
+ req = cookie_tcp_check(net, sk, skb);
+ if (IS_ERR(req))
+ goto out;
+ }
+ if (!req) {
+ SKB_DR_SET(reason, NO_SOCKET);
+ goto out_drop;
+ }
+
+ ireq = inet_rsk(req);
+
+ ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr;
+ ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr;
+
+ if (security_inet_conn_request(sk, skb, req)) {
+ SKB_DR_SET(reason, SECURITY_HOOK);
+ goto out_free;
+ }
+
+ if (ipv6_opt_accepted(sk, skb, &TCP_SKB_CB(skb)->header.h6) ||
+ np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo ||
+ np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) {
+ refcount_inc(&skb->users);
+ ireq->pktopts = skb;
+ }
+
+ /* So that link locals have meaning */
+ if (!sk->sk_bound_dev_if &&
+ ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL)
+ ireq->ir_iif = tcp_v6_iif(skb);
+
+ tcp_ao_syncookie(sk, skb, req, AF_INET6);
+
+ /*
+ * We need to lookup the dst_entry to get the correct window size.
+ * This is taken from tcp_v6_syn_recv_sock. Somebody please enlighten
+ * me if there is a preferred way.
+ */
+ {
+ struct in6_addr *final_p, final;
+ struct flowi6 fl6;
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.flowi6_proto = IPPROTO_TCP;
+ fl6.daddr = ireq->ir_v6_rmt_addr;
+ final_p = fl6_update_dst(&fl6, rcu_dereference(np->opt), &final);
+ fl6.saddr = ireq->ir_v6_loc_addr;
+ fl6.flowi6_oif = ireq->ir_iif;
+ fl6.flowi6_mark = ireq->ir_mark;
+ fl6.fl6_dport = ireq->ir_rmt_port;
+ fl6.fl6_sport = inet_sk(sk)->inet_sport;
+ fl6.flowi6_uid = sk_uid(sk);
+ security_req_classify_flow(req, flowi6_to_flowi_common(&fl6));
+
+ dst = ip6_dst_lookup_flow(net, sk, &fl6, final_p);
+ if (IS_ERR(dst)) {
+ SKB_DR_SET(reason, IP_OUTNOROUTES);
+ goto out_free;
+ }
+ }
+
+ req->rsk_window_clamp = READ_ONCE(tp->window_clamp) ? :dst_metric(dst, RTAX_WINDOW);
+ /* limit the window selection if the user enforce a smaller rx buffer */
+ full_space = tcp_full_space(sk);
+ if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
+ (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0))
+ req->rsk_window_clamp = full_space;
+
+ tcp_select_initial_window(sk, full_space, req->mss,
+ &req->rsk_rcv_wnd, &req->rsk_window_clamp,
+ ireq->wscale_ok, &rcv_wscale,
+ dst_metric(dst, RTAX_INITRWND));
+
+ /* req->syncookie is set true only if ACK is validated
+ * by BPF kfunc, then, rcv_wscale is already configured.
+ */
+ if (!req->syncookie)
+ ireq->rcv_wscale = rcv_wscale;
+ ireq->ecn_ok &= cookie_ecn_ok(net, dst);
+ tcp_rsk(req)->accecn_ok = ireq->ecn_ok && cookie_accecn_ok(th);
+
+ ret = tcp_get_cookie_sock(sk, skb, req, dst);
+ if (!ret) {
+ SKB_DR_SET(reason, NO_SOCKET);
+ goto out_drop;
+ }
+out:
+ return ret;
+out_free:
+ reqsk_free(req);
+out_drop:
+ sk_skb_reason_drop(sk, skb, reason);
+ return NULL;
+}
diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c
index 7a4639db1346..d2cd33e2698d 100644
--- a/net/ipv6/sysctl_net_ipv6.c
+++ b/net/ipv6/sysctl_net_ipv6.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* sysctl_net_ipv6.c: sysctl interface to net IPV6 subsystem.
*
@@ -9,113 +10,352 @@
#include <linux/sysctl.h>
#include <linux/in6.h>
#include <linux/ipv6.h>
+#include <linux/slab.h>
+#include <linux/export.h>
#include <net/ndisc.h>
#include <net/ipv6.h>
#include <net/addrconf.h>
+#include <net/inet_frag.h>
+#include <net/netevent.h>
+#include <net/ip_fib.h>
+#ifdef CONFIG_NETLABEL
+#include <net/calipso.h>
+#endif
+#include <linux/ioam6.h>
-#ifdef CONFIG_SYSCTL
+static int flowlabel_reflect_max = 0x7;
+static int auto_flowlabels_max = IP6_AUTO_FLOW_LABEL_MAX;
+static u32 rt6_multipath_hash_fields_all_mask =
+ FIB_MULTIPATH_HASH_FIELD_ALL_MASK;
+static u32 ioam6_id_max = IOAM6_DEFAULT_ID;
+static u64 ioam6_id_wide_max = IOAM6_DEFAULT_ID_WIDE;
-static ctl_table ipv6_table[] = {
+static int proc_rt6_multipath_hash_policy(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct net *net;
+ int ret;
+
+ net = container_of(table->data, struct net,
+ ipv6.sysctl.multipath_hash_policy);
+ ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos);
+ if (write && ret == 0)
+ call_netevent_notifiers(NETEVENT_IPV6_MPATH_HASH_UPDATE, net);
+
+ return ret;
+}
+
+static int
+proc_rt6_multipath_hash_fields(const struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ struct net *net;
+ int ret;
+
+ net = container_of(table->data, struct net,
+ ipv6.sysctl.multipath_hash_fields);
+ ret = proc_douintvec_minmax(table, write, buffer, lenp, ppos);
+ if (write && ret == 0)
+ call_netevent_notifiers(NETEVENT_IPV6_MPATH_HASH_UPDATE, net);
+
+ return ret;
+}
+
+static struct ctl_table ipv6_table_template[] = {
{
- .ctl_name = NET_IPV6_ROUTE,
- .procname = "route",
- .maxlen = 0,
- .mode = 0555,
- .child = ipv6_route_table
+ .procname = "bindv6only",
+ .data = &init_net.ipv6.sysctl.bindv6only,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
},
{
- .ctl_name = NET_IPV6_ICMP,
- .procname = "icmp",
- .maxlen = 0,
- .mode = 0555,
- .child = ipv6_icmp_table
+ .procname = "anycast_src_echo_reply",
+ .data = &init_net.ipv6.sysctl.anycast_src_echo_reply,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
},
{
- .ctl_name = NET_IPV6_BINDV6ONLY,
- .procname = "bindv6only",
- .data = &sysctl_ipv6_bindv6only,
+ .procname = "flowlabel_consistency",
+ .data = &init_net.ipv6.sysctl.flowlabel_consistency,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ },
+ {
+ .procname = "auto_flowlabels",
+ .data = &init_net.ipv6.sysctl.auto_flowlabels,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra2 = &auto_flowlabels_max
+ },
+ {
+ .procname = "fwmark_reflect",
+ .data = &init_net.ipv6.sysctl.fwmark_reflect,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ },
+ {
+ .procname = "idgen_retries",
+ .data = &init_net.ipv6.sysctl.idgen_retries,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = proc_dointvec,
},
{
- .ctl_name = NET_IPV6_IP6FRAG_HIGH_THRESH,
- .procname = "ip6frag_high_thresh",
- .data = &sysctl_ip6frag_high_thresh,
+ .procname = "idgen_delay",
+ .data = &init_net.ipv6.sysctl.idgen_delay,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = proc_dointvec_jiffies,
},
{
- .ctl_name = NET_IPV6_IP6FRAG_LOW_THRESH,
- .procname = "ip6frag_low_thresh",
- .data = &sysctl_ip6frag_low_thresh,
+ .procname = "flowlabel_state_ranges",
+ .data = &init_net.ipv6.sysctl.flowlabel_state_ranges,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ },
+ {
+ .procname = "ip_nonlocal_bind",
+ .data = &init_net.ipv6.sysctl.ip_nonlocal_bind,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ },
+ {
+ .procname = "flowlabel_reflect",
+ .data = &init_net.ipv6.sysctl.flowlabel_reflect,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = &flowlabel_reflect_max,
},
{
- .ctl_name = NET_IPV6_IP6FRAG_TIME,
- .procname = "ip6frag_time",
- .data = &sysctl_ip6frag_time,
+ .procname = "max_dst_opts_number",
+ .data = &init_net.ipv6.sysctl.max_dst_opts_cnt,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- .strategy = &sysctl_jiffies,
+ .proc_handler = proc_dointvec
},
{
- .ctl_name = NET_IPV6_IP6FRAG_SECRET_INTERVAL,
- .procname = "ip6frag_secret_interval",
- .data = &sysctl_ip6frag_secret_interval,
+ .procname = "max_hbh_opts_number",
+ .data = &init_net.ipv6.sysctl.max_hbh_opts_cnt,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- .strategy = &sysctl_jiffies
+ .proc_handler = proc_dointvec
},
{
- .ctl_name = NET_IPV6_MLD_MAX_MSF,
- .procname = "mld_max_msf",
- .data = &sysctl_mld_max_msf,
+ .procname = "max_dst_opts_length",
+ .data = &init_net.ipv6.sysctl.max_dst_opts_len,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = proc_dointvec
},
- { .ctl_name = 0 }
-};
-
-static struct ctl_table_header *ipv6_sysctl_header;
-
-static ctl_table ipv6_net_table[] = {
{
- .ctl_name = NET_IPV6,
- .procname = "ipv6",
- .mode = 0555,
- .child = ipv6_table
+ .procname = "max_hbh_length",
+ .data = &init_net.ipv6.sysctl.max_hbh_opts_len,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "fib_multipath_hash_policy",
+ .data = &init_net.ipv6.sysctl.multipath_hash_policy,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_rt6_multipath_hash_policy,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_THREE,
+ },
+ {
+ .procname = "fib_multipath_hash_fields",
+ .data = &init_net.ipv6.sysctl.multipath_hash_fields,
+ .maxlen = sizeof(u32),
+ .mode = 0644,
+ .proc_handler = proc_rt6_multipath_hash_fields,
+ .extra1 = SYSCTL_ONE,
+ .extra2 = &rt6_multipath_hash_fields_all_mask,
+ },
+ {
+ .procname = "seg6_flowlabel",
+ .data = &init_net.ipv6.sysctl.seg6_flowlabel,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "fib_notify_on_flag_change",
+ .data = &init_net.ipv6.sysctl.fib_notify_on_flag_change,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_TWO,
+ },
+ {
+ .procname = "ioam6_id",
+ .data = &init_net.ipv6.sysctl.ioam6_id,
+ .maxlen = sizeof(u32),
+ .mode = 0644,
+ .proc_handler = proc_douintvec_minmax,
+ .extra2 = &ioam6_id_max,
+ },
+ {
+ .procname = "ioam6_id_wide",
+ .data = &init_net.ipv6.sysctl.ioam6_id_wide,
+ .maxlen = sizeof(u64),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ .extra2 = &ioam6_id_wide_max,
},
- { .ctl_name = 0 }
};
-static ctl_table ipv6_root_table[] = {
+static struct ctl_table ipv6_rotable[] = {
{
- .ctl_name = CTL_NET,
- .procname = "net",
- .mode = 0555,
- .child = ipv6_net_table
+ .procname = "mld_max_msf",
+ .data = &sysctl_mld_max_msf,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "mld_qrv",
+ .data = &sysctl_mld_qrv,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ONE
},
- { .ctl_name = 0 }
+#ifdef CONFIG_NETLABEL
+ {
+ .procname = "calipso_cache_enable",
+ .data = &calipso_cache_enabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "calipso_cache_bucket_size",
+ .data = &calipso_cache_bucketsize,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif /* CONFIG_NETLABEL */
};
-void ipv6_sysctl_register(void)
+static int __net_init ipv6_sysctl_net_init(struct net *net)
{
- ipv6_sysctl_header = register_sysctl_table(ipv6_root_table, 0);
+ size_t table_size = ARRAY_SIZE(ipv6_table_template);
+ struct ctl_table *ipv6_table;
+ struct ctl_table *ipv6_route_table;
+ struct ctl_table *ipv6_icmp_table;
+ int err, i;
+
+ err = -ENOMEM;
+ ipv6_table = kmemdup(ipv6_table_template, sizeof(ipv6_table_template),
+ GFP_KERNEL);
+ if (!ipv6_table)
+ goto out;
+ /* Update the variables to point into the current struct net */
+ for (i = 0; i < table_size; i++)
+ ipv6_table[i].data += (void *)net - (void *)&init_net;
+
+ ipv6_route_table = ipv6_route_sysctl_init(net);
+ if (!ipv6_route_table)
+ goto out_ipv6_table;
+
+ ipv6_icmp_table = ipv6_icmp_sysctl_init(net);
+ if (!ipv6_icmp_table)
+ goto out_ipv6_route_table;
+
+ net->ipv6.sysctl.hdr = register_net_sysctl_sz(net, "net/ipv6",
+ ipv6_table, table_size);
+ if (!net->ipv6.sysctl.hdr)
+ goto out_ipv6_icmp_table;
+
+ net->ipv6.sysctl.route_hdr = register_net_sysctl_sz(net,
+ "net/ipv6/route",
+ ipv6_route_table,
+ ipv6_route_sysctl_table_size(net));
+ if (!net->ipv6.sysctl.route_hdr)
+ goto out_unregister_ipv6_table;
+
+ net->ipv6.sysctl.icmp_hdr = register_net_sysctl_sz(net,
+ "net/ipv6/icmp",
+ ipv6_icmp_table,
+ ipv6_icmp_sysctl_table_size());
+ if (!net->ipv6.sysctl.icmp_hdr)
+ goto out_unregister_route_table;
+
+ err = 0;
+out:
+ return err;
+out_unregister_route_table:
+ unregister_net_sysctl_table(net->ipv6.sysctl.route_hdr);
+out_unregister_ipv6_table:
+ unregister_net_sysctl_table(net->ipv6.sysctl.hdr);
+out_ipv6_icmp_table:
+ kfree(ipv6_icmp_table);
+out_ipv6_route_table:
+ kfree(ipv6_route_table);
+out_ipv6_table:
+ kfree(ipv6_table);
+ goto out;
}
-void ipv6_sysctl_unregister(void)
+static void __net_exit ipv6_sysctl_net_exit(struct net *net)
{
- unregister_sysctl_table(ipv6_sysctl_header);
+ const struct ctl_table *ipv6_table;
+ const struct ctl_table *ipv6_route_table;
+ const struct ctl_table *ipv6_icmp_table;
+
+ ipv6_table = net->ipv6.sysctl.hdr->ctl_table_arg;
+ ipv6_route_table = net->ipv6.sysctl.route_hdr->ctl_table_arg;
+ ipv6_icmp_table = net->ipv6.sysctl.icmp_hdr->ctl_table_arg;
+
+ unregister_net_sysctl_table(net->ipv6.sysctl.icmp_hdr);
+ unregister_net_sysctl_table(net->ipv6.sysctl.route_hdr);
+ unregister_net_sysctl_table(net->ipv6.sysctl.hdr);
+
+ kfree(ipv6_table);
+ kfree(ipv6_route_table);
+ kfree(ipv6_icmp_table);
}
-#endif /* CONFIG_SYSCTL */
+static struct pernet_operations ipv6_sysctl_net_ops = {
+ .init = ipv6_sysctl_net_init,
+ .exit = ipv6_sysctl_net_exit,
+};
+
+static struct ctl_table_header *ip6_header;
+
+int ipv6_sysctl_register(void)
+{
+ int err = -ENOMEM;
+ ip6_header = register_net_sysctl(&init_net, "net/ipv6", ipv6_rotable);
+ if (!ip6_header)
+ goto out;
+ err = register_pernet_subsys(&ipv6_sysctl_net_ops);
+ if (err)
+ goto err_pernet;
+out:
+ return err;
+err_pernet:
+ unregister_net_sysctl_table(ip6_header);
+ goto out;
+}
+
+void ipv6_sysctl_unregister(void)
+{
+ unregister_net_sysctl_table(ip6_header);
+ unregister_pernet_subsys(&ipv6_sysctl_net_ops);
+}
diff --git a/net/ipv6/tcp_ao.c b/net/ipv6/tcp_ao.c
new file mode 100644
index 000000000000..3c09ac26206e
--- /dev/null
+++ b/net/ipv6/tcp_ao.c
@@ -0,0 +1,168 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * INET An implementation of the TCP Authentication Option (TCP-AO).
+ * See RFC5925.
+ *
+ * Authors: Dmitry Safonov <dima@arista.com>
+ * Francesco Ruggeri <fruggeri@arista.com>
+ * Salam Noureddine <noureddine@arista.com>
+ */
+#include <crypto/hash.h>
+#include <linux/tcp.h>
+
+#include <net/tcp.h>
+#include <net/ipv6.h>
+
+static int tcp_v6_ao_calc_key(struct tcp_ao_key *mkt, u8 *key,
+ const struct in6_addr *saddr,
+ const struct in6_addr *daddr,
+ __be16 sport, __be16 dport,
+ __be32 sisn, __be32 disn)
+{
+ struct kdf_input_block {
+ u8 counter;
+ u8 label[6];
+ struct tcp6_ao_context ctx;
+ __be16 outlen;
+ } __packed * tmp;
+ struct tcp_sigpool hp;
+ int err;
+
+ err = tcp_sigpool_start(mkt->tcp_sigpool_id, &hp);
+ if (err)
+ return err;
+
+ tmp = hp.scratch;
+ tmp->counter = 1;
+ memcpy(tmp->label, "TCP-AO", 6);
+ tmp->ctx.saddr = *saddr;
+ tmp->ctx.daddr = *daddr;
+ tmp->ctx.sport = sport;
+ tmp->ctx.dport = dport;
+ tmp->ctx.sisn = sisn;
+ tmp->ctx.disn = disn;
+ tmp->outlen = htons(tcp_ao_digest_size(mkt) * 8); /* in bits */
+
+ err = tcp_ao_calc_traffic_key(mkt, key, tmp, sizeof(*tmp), &hp);
+ tcp_sigpool_end(&hp);
+
+ return err;
+}
+
+int tcp_v6_ao_calc_key_skb(struct tcp_ao_key *mkt, u8 *key,
+ const struct sk_buff *skb,
+ __be32 sisn, __be32 disn)
+{
+ const struct ipv6hdr *iph = ipv6_hdr(skb);
+ const struct tcphdr *th = tcp_hdr(skb);
+
+ return tcp_v6_ao_calc_key(mkt, key, &iph->saddr,
+ &iph->daddr, th->source,
+ th->dest, sisn, disn);
+}
+
+int tcp_v6_ao_calc_key_sk(struct tcp_ao_key *mkt, u8 *key,
+ const struct sock *sk, __be32 sisn,
+ __be32 disn, bool send)
+{
+ if (send)
+ return tcp_v6_ao_calc_key(mkt, key, &sk->sk_v6_rcv_saddr,
+ &sk->sk_v6_daddr, htons(sk->sk_num),
+ sk->sk_dport, sisn, disn);
+ else
+ return tcp_v6_ao_calc_key(mkt, key, &sk->sk_v6_daddr,
+ &sk->sk_v6_rcv_saddr, sk->sk_dport,
+ htons(sk->sk_num), disn, sisn);
+}
+
+int tcp_v6_ao_calc_key_rsk(struct tcp_ao_key *mkt, u8 *key,
+ struct request_sock *req)
+{
+ struct inet_request_sock *ireq = inet_rsk(req);
+
+ return tcp_v6_ao_calc_key(mkt, key,
+ &ireq->ir_v6_loc_addr, &ireq->ir_v6_rmt_addr,
+ htons(ireq->ir_num), ireq->ir_rmt_port,
+ htonl(tcp_rsk(req)->snt_isn),
+ htonl(tcp_rsk(req)->rcv_isn));
+}
+
+struct tcp_ao_key *tcp_v6_ao_lookup(const struct sock *sk,
+ struct sock *addr_sk,
+ int sndid, int rcvid)
+{
+ int l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
+ addr_sk->sk_bound_dev_if);
+ struct in6_addr *addr = &addr_sk->sk_v6_daddr;
+
+ return tcp_ao_do_lookup(sk, l3index, (union tcp_ao_addr *)addr,
+ AF_INET6, sndid, rcvid);
+}
+
+struct tcp_ao_key *tcp_v6_ao_lookup_rsk(const struct sock *sk,
+ struct request_sock *req,
+ int sndid, int rcvid)
+{
+ struct inet_request_sock *ireq = inet_rsk(req);
+ struct in6_addr *addr = &ireq->ir_v6_rmt_addr;
+ int l3index;
+
+ l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
+ return tcp_ao_do_lookup(sk, l3index, (union tcp_ao_addr *)addr,
+ AF_INET6, sndid, rcvid);
+}
+
+int tcp_v6_ao_hash_pseudoheader(struct tcp_sigpool *hp,
+ const struct in6_addr *daddr,
+ const struct in6_addr *saddr, int nbytes)
+{
+ struct tcp6_pseudohdr *bp;
+ struct scatterlist sg;
+
+ bp = hp->scratch;
+ /* 1. TCP pseudo-header (RFC2460) */
+ bp->saddr = *saddr;
+ bp->daddr = *daddr;
+ bp->len = cpu_to_be32(nbytes);
+ bp->protocol = cpu_to_be32(IPPROTO_TCP);
+
+ sg_init_one(&sg, bp, sizeof(*bp));
+ ahash_request_set_crypt(hp->req, &sg, NULL, sizeof(*bp));
+ return crypto_ahash_update(hp->req);
+}
+
+int tcp_v6_ao_hash_skb(char *ao_hash, struct tcp_ao_key *key,
+ const struct sock *sk, const struct sk_buff *skb,
+ const u8 *tkey, int hash_offset, u32 sne)
+{
+ return tcp_ao_hash_skb(AF_INET6, ao_hash, key, sk, skb, tkey,
+ hash_offset, sne);
+}
+
+int tcp_v6_parse_ao(struct sock *sk, int cmd,
+ sockptr_t optval, int optlen)
+{
+ return tcp_parse_ao(sk, cmd, AF_INET6, optval, optlen);
+}
+
+int tcp_v6_ao_synack_hash(char *ao_hash, struct tcp_ao_key *ao_key,
+ struct request_sock *req, const struct sk_buff *skb,
+ int hash_offset, u32 sne)
+{
+ void *hash_buf = NULL;
+ int err;
+
+ hash_buf = kmalloc(tcp_ao_digest_size(ao_key), GFP_ATOMIC);
+ if (!hash_buf)
+ return -ENOMEM;
+
+ err = tcp_v6_ao_calc_key_rsk(ao_key, hash_buf, req);
+ if (err)
+ goto out;
+
+ err = tcp_ao_hash_skb(AF_INET6, ao_hash, ao_key, req_to_sk(req), skb,
+ hash_buf, hash_offset, sne);
+out:
+ kfree(hash_buf);
+ return err;
+}
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 517c50024bfc..280fe5978559 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1,13 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* TCP over IPv6
- * Linux INET6 implementation
+ * Linux INET6 implementation
*
* Authors:
- * Pedro Roque <roque@di.fc.ul.pt>
+ * Pedro Roque <roque@di.fc.ul.pt>
*
- * $Id: tcp_ipv6.c,v 1.144 2002/02/01 22:01:04 davem Exp $
- *
- * Based on:
+ * Based on:
* linux/net/ipv4/tcp.c
* linux/net/ipv4/tcp_input.c
* linux/net/ipv4/tcp_output.c
@@ -18,13 +17,9 @@
* Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
* a single port at the same time.
* YOSHIFUJI Hideaki @USAGI: convert /proc/net/tcp6 to seq_file.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
+#include <linux/bottom_half.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/types.h>
@@ -39,11 +34,14 @@
#include <linux/jhash.h>
#include <linux/ipsec.h>
#include <linux/times.h>
-
+#include <linux/slab.h>
+#include <linux/uaccess.h>
#include <linux/ipv6.h>
#include <linux/icmpv6.h>
#include <linux/random.h>
+#include <linux/indirect_call_wrapper.h>
+#include <net/aligned_data.h>
#include <net/tcp.h>
#include <net/ndisc.h>
#include <net/inet6_hashtables.h>
@@ -56,116 +54,137 @@
#include <net/inet_ecn.h>
#include <net/protocol.h>
#include <net/xfrm.h>
-#include <net/addrconf.h>
#include <net/snmp.h>
#include <net/dsfield.h>
#include <net/timewait_sock.h>
-
-#include <asm/uaccess.h>
+#include <net/inet_common.h>
+#include <net/secure_seq.h>
+#include <net/hotdata.h>
+#include <net/busy_poll.h>
+#include <net/rstreason.h>
+#include <net/psp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
-#include <linux/crypto.h>
-#include <linux/scatterlist.h>
+#include <crypto/md5.h>
-/* Socket used for sending RSTs and ACKs */
-static struct socket *tcp6_socket;
+#include <trace/events/tcp.h>
-static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb);
-static void tcp_v6_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req);
-static void tcp_v6_send_check(struct sock *sk, int len,
- struct sk_buff *skb);
+static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb,
+ enum sk_rst_reason reason);
+static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req);
-static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb);
+INDIRECT_CALLABLE_SCOPE int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb);
-static struct inet_connection_sock_af_ops ipv6_mapped;
-static struct inet_connection_sock_af_ops ipv6_specific;
-#ifdef CONFIG_TCP_MD5SIG
-static struct tcp_sock_af_ops tcp_sock_ipv6_specific;
-static struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific;
+static const struct inet_connection_sock_af_ops ipv6_mapped;
+const struct inet_connection_sock_af_ops ipv6_specific;
+#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
+static const struct tcp_sock_af_ops tcp_sock_ipv6_specific;
+static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific;
#endif
-static int tcp_v6_get_port(struct sock *sk, unsigned short snum)
+/* Helper returning the inet6 address from a given tcp socket.
+ * It can be used in TCP stack instead of inet6_sk(sk).
+ * This avoids a dereference and allow compiler optimizations.
+ * It is a specialized version of inet6_sk_generic().
+ */
+#define tcp_inet6_sk(sk) (&container_of_const(tcp_sk(sk), \
+ struct tcp6_sock, tcp)->inet6)
+
+static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
{
- return inet_csk_get_port(&tcp_hashinfo, sk, snum,
- inet6_csk_bind_conflict);
+ struct dst_entry *dst = skb_dst(skb);
+
+ if (dst && dst_hold_safe(dst)) {
+ rcu_assign_pointer(sk->sk_rx_dst, dst);
+ sk->sk_rx_dst_ifindex = skb->skb_iif;
+ sk->sk_rx_dst_cookie = rt6_get_cookie(dst_rt6_info(dst));
+ }
}
-static void tcp_v6_hash(struct sock *sk)
+static u32 tcp_v6_init_seq(const struct sk_buff *skb)
{
- if (sk->sk_state != TCP_CLOSE) {
- if (inet_csk(sk)->icsk_af_ops == &ipv6_mapped) {
- tcp_prot.hash(sk);
- return;
- }
- local_bh_disable();
- __inet6_hash(&tcp_hashinfo, sk);
- local_bh_enable();
- }
+ return secure_tcpv6_seq(ipv6_hdr(skb)->daddr.s6_addr32,
+ ipv6_hdr(skb)->saddr.s6_addr32,
+ tcp_hdr(skb)->dest,
+ tcp_hdr(skb)->source);
}
-static __inline__ __sum16 tcp_v6_check(struct tcphdr *th, int len,
- struct in6_addr *saddr,
- struct in6_addr *daddr,
- __wsum base)
+static u32 tcp_v6_init_ts_off(const struct net *net, const struct sk_buff *skb)
{
- return csum_ipv6_magic(saddr, daddr, len, IPPROTO_TCP, base);
+ return secure_tcpv6_ts_off(net, ipv6_hdr(skb)->daddr.s6_addr32,
+ ipv6_hdr(skb)->saddr.s6_addr32);
}
-static __u32 tcp_v6_init_sequence(struct sk_buff *skb)
+static int tcp_v6_pre_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
+ int addr_len)
{
- return secure_tcpv6_sequence_number(skb->nh.ipv6h->daddr.s6_addr32,
- skb->nh.ipv6h->saddr.s6_addr32,
- skb->h.th->dest,
- skb->h.th->source);
+ /* This check is replicated from tcp_v6_connect() and intended to
+ * prevent BPF program called below from accessing bytes that are out
+ * of the bound specified by user in addr_len.
+ */
+ if (addr_len < SIN6_LEN_RFC2133)
+ return -EINVAL;
+
+ sock_owned_by_me(sk);
+
+ return BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr, &addr_len);
}
-static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
+static int tcp_v6_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
int addr_len)
{
struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr;
- struct inet_sock *inet = inet_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
- struct ipv6_pinfo *np = inet6_sk(sk);
+ struct in6_addr *saddr = NULL, *final_p, final;
+ struct inet_timewait_death_row *tcp_death_row;
+ struct ipv6_pinfo *np = tcp_inet6_sk(sk);
+ struct inet_sock *inet = inet_sk(sk);
struct tcp_sock *tp = tcp_sk(sk);
- struct in6_addr *saddr = NULL, *final_p = NULL, final;
- struct flowi fl;
+ struct net *net = sock_net(sk);
+ struct ipv6_txoptions *opt;
struct dst_entry *dst;
+ struct flowi6 fl6;
int addr_type;
int err;
- if (addr_len < SIN6_LEN_RFC2133)
+ if (addr_len < SIN6_LEN_RFC2133)
return -EINVAL;
- if (usin->sin6_family != AF_INET6)
- return(-EAFNOSUPPORT);
+ if (usin->sin6_family != AF_INET6)
+ return -EAFNOSUPPORT;
- memset(&fl, 0, sizeof(fl));
+ memset(&fl6, 0, sizeof(fl6));
- if (np->sndflow) {
- fl.fl6_flowlabel = usin->sin6_flowinfo&IPV6_FLOWINFO_MASK;
- IP6_ECN_flow_init(fl.fl6_flowlabel);
- if (fl.fl6_flowlabel&IPV6_FLOWLABEL_MASK) {
+ if (inet6_test_bit(SNDFLOW, sk)) {
+ fl6.flowlabel = usin->sin6_flowinfo&IPV6_FLOWINFO_MASK;
+ IP6_ECN_flow_init(fl6.flowlabel);
+ if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) {
struct ip6_flowlabel *flowlabel;
- flowlabel = fl6_sock_lookup(sk, fl.fl6_flowlabel);
- if (flowlabel == NULL)
+ flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
+ if (IS_ERR(flowlabel))
return -EINVAL;
- ipv6_addr_copy(&usin->sin6_addr, &flowlabel->dst);
fl6_sock_release(flowlabel);
}
}
/*
- * connect() to INADDR_ANY means loopback (BSD'ism).
- */
-
- if(ipv6_addr_any(&usin->sin6_addr))
- usin->sin6_addr.s6_addr[15] = 0x1;
+ * connect() to INADDR_ANY means loopback (BSD'ism).
+ */
+
+ if (ipv6_addr_any(&usin->sin6_addr)) {
+ if (ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr))
+ ipv6_addr_set_v4mapped(htonl(INADDR_LOOPBACK),
+ &usin->sin6_addr);
+ else
+ usin->sin6_addr = in6addr_loopback;
+ }
addr_type = ipv6_addr_type(&usin->sin6_addr);
- if(addr_type & IPV6_ADDR_MULTICAST)
+ if (addr_type & IPV6_ADDR_MULTICAST)
return -ENETUNREACH;
if (addr_type&IPV6_ADDR_LINKLOCAL) {
@@ -174,8 +193,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
/* If interface is set while binding, indices
* must coincide.
*/
- if (sk->sk_bound_dev_if &&
- sk->sk_bound_dev_if != usin->sin6_scope_id)
+ if (!sk_dev_equal_l3scope(sk, usin->sin6_scope_id))
return -EINVAL;
sk->sk_bound_dev_if = usin->sin6_scope_id;
@@ -187,118 +205,133 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
}
if (tp->rx_opt.ts_recent_stamp &&
- !ipv6_addr_equal(&np->daddr, &usin->sin6_addr)) {
+ !ipv6_addr_equal(&sk->sk_v6_daddr, &usin->sin6_addr)) {
tp->rx_opt.ts_recent = 0;
tp->rx_opt.ts_recent_stamp = 0;
- tp->write_seq = 0;
+ WRITE_ONCE(tp->write_seq, 0);
}
- ipv6_addr_copy(&np->daddr, &usin->sin6_addr);
- np->flow_label = fl.fl6_flowlabel;
+ sk->sk_v6_daddr = usin->sin6_addr;
+ np->flow_label = fl6.flowlabel;
/*
* TCP over IPv4
*/
- if (addr_type == IPV6_ADDR_MAPPED) {
+ if (addr_type & IPV6_ADDR_MAPPED) {
u32 exthdrlen = icsk->icsk_ext_hdr_len;
struct sockaddr_in sin;
- SOCK_DEBUG(sk, "connect: ipv4 mapped\n");
-
- if (__ipv6_only_sock(sk))
+ if (ipv6_only_sock(sk))
return -ENETUNREACH;
sin.sin_family = AF_INET;
sin.sin_port = usin->sin6_port;
sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3];
- icsk->icsk_af_ops = &ipv6_mapped;
+ /* Paired with READ_ONCE() in tcp_(get|set)sockopt() */
+ WRITE_ONCE(icsk->icsk_af_ops, &ipv6_mapped);
+ if (sk_is_mptcp(sk))
+ mptcpv6_handle_mapped(sk, true);
sk->sk_backlog_rcv = tcp_v4_do_rcv;
-#ifdef CONFIG_TCP_MD5SIG
+#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
tp->af_specific = &tcp_sock_ipv6_mapped_specific;
#endif
- err = tcp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin));
+ err = tcp_v4_connect(sk, (struct sockaddr_unsized *)&sin, sizeof(sin));
if (err) {
icsk->icsk_ext_hdr_len = exthdrlen;
- icsk->icsk_af_ops = &ipv6_specific;
+ /* Paired with READ_ONCE() in tcp_(get|set)sockopt() */
+ WRITE_ONCE(icsk->icsk_af_ops, &ipv6_specific);
+ if (sk_is_mptcp(sk))
+ mptcpv6_handle_mapped(sk, false);
sk->sk_backlog_rcv = tcp_v6_do_rcv;
-#ifdef CONFIG_TCP_MD5SIG
+#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
tp->af_specific = &tcp_sock_ipv6_specific;
#endif
goto failure;
- } else {
- ipv6_addr_set(&np->saddr, 0, 0, htonl(0x0000FFFF),
- inet->saddr);
- ipv6_addr_set(&np->rcv_saddr, 0, 0, htonl(0x0000FFFF),
- inet->rcv_saddr);
}
+ np->saddr = sk->sk_v6_rcv_saddr;
return err;
}
- if (!ipv6_addr_any(&np->rcv_saddr))
- saddr = &np->rcv_saddr;
-
- fl.proto = IPPROTO_TCP;
- ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
- ipv6_addr_copy(&fl.fl6_src,
- (saddr ? saddr : &np->saddr));
- fl.oif = sk->sk_bound_dev_if;
- fl.fl_ip_dport = usin->sin6_port;
- fl.fl_ip_sport = inet->sport;
-
- if (np->opt && np->opt->srcrt) {
- struct rt0_hdr *rt0 = (struct rt0_hdr *)np->opt->srcrt;
- ipv6_addr_copy(&final, &fl.fl6_dst);
- ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
- final_p = &final;
+ if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr))
+ saddr = &sk->sk_v6_rcv_saddr;
+
+ fl6.flowi6_proto = IPPROTO_TCP;
+ fl6.daddr = sk->sk_v6_daddr;
+ fl6.saddr = saddr ? *saddr : np->saddr;
+ fl6.flowlabel = ip6_make_flowinfo(np->tclass, np->flow_label);
+ fl6.flowi6_oif = sk->sk_bound_dev_if;
+ fl6.flowi6_mark = sk->sk_mark;
+ fl6.fl6_dport = usin->sin6_port;
+ fl6.fl6_sport = inet->inet_sport;
+ if (IS_ENABLED(CONFIG_IP_ROUTE_MULTIPATH) && !fl6.fl6_sport)
+ fl6.flowi6_flags = FLOWI_FLAG_ANY_SPORT;
+ fl6.flowi6_uid = sk_uid(sk);
+
+ opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk));
+ final_p = fl6_update_dst(&fl6, opt, &final);
+
+ security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6));
+
+ dst = ip6_dst_lookup_flow(net, sk, &fl6, final_p);
+ if (IS_ERR(dst)) {
+ err = PTR_ERR(dst);
+ goto failure;
}
- security_sk_classify_flow(sk, &fl);
+ tp->tcp_usec_ts = dst_tcp_usec_ts(dst);
+ tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
- err = ip6_dst_lookup(sk, &dst, &fl);
- if (err)
- goto failure;
- if (final_p)
- ipv6_addr_copy(&fl.fl6_dst, final_p);
-
- if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0)
- goto failure;
+ if (!saddr) {
+ saddr = &fl6.saddr;
- if (saddr == NULL) {
- saddr = &fl.fl6_src;
- ipv6_addr_copy(&np->rcv_saddr, saddr);
+ err = inet_bhash2_update_saddr(sk, saddr, AF_INET6);
+ if (err)
+ goto failure;
}
/* set the source address */
- ipv6_addr_copy(&np->saddr, saddr);
- inet->rcv_saddr = LOOPBACK4_IPV6;
+ np->saddr = *saddr;
+ inet->inet_rcv_saddr = LOOPBACK4_IPV6;
sk->sk_gso_type = SKB_GSO_TCPV6;
- __ip6_dst_store(sk, dst, NULL, NULL);
+ ip6_dst_store(sk, dst, false, false);
- icsk->icsk_ext_hdr_len = 0;
- if (np->opt)
- icsk->icsk_ext_hdr_len = (np->opt->opt_flen +
- np->opt->opt_nflen);
+ icsk->icsk_ext_hdr_len = psp_sk_overhead(sk);
+ if (opt)
+ icsk->icsk_ext_hdr_len += opt->opt_flen +
+ opt->opt_nflen;
tp->rx_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr);
- inet->dport = usin->sin6_port;
+ inet->inet_dport = usin->sin6_port;
tcp_set_state(sk, TCP_SYN_SENT);
- err = inet6_hash_connect(&tcp_death_row, sk);
+ err = inet6_hash_connect(tcp_death_row, sk);
if (err)
goto late_failure;
- if (!tp->write_seq)
- tp->write_seq = secure_tcpv6_sequence_number(np->saddr.s6_addr32,
- np->daddr.s6_addr32,
- inet->sport,
- inet->dport);
+ sk_set_txhash(sk);
+
+ if (likely(!tp->repair)) {
+ if (!tp->write_seq)
+ WRITE_ONCE(tp->write_seq,
+ secure_tcpv6_seq(np->saddr.s6_addr32,
+ sk->sk_v6_daddr.s6_addr32,
+ inet->inet_sport,
+ inet->inet_dport));
+ tp->tsoffset = secure_tcpv6_ts_off(net, np->saddr.s6_addr32,
+ sk->sk_v6_daddr.s6_addr32);
+ }
+
+ if (tcp_fastopen_defer_connect(sk, &err))
+ return err;
+ if (err)
+ goto late_failure;
err = tcp_connect(sk);
if (err)
@@ -308,1062 +341,1033 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
late_failure:
tcp_set_state(sk, TCP_CLOSE);
- __sk_dst_reset(sk);
+ inet_bhash2_reset_saddr(sk);
failure:
- inet->dport = 0;
+ inet->inet_dport = 0;
sk->sk_route_caps = 0;
return err;
}
-static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
- int type, int code, int offset, __be32 info)
+static void tcp_v6_mtu_reduced(struct sock *sk)
{
- struct ipv6hdr *hdr = (struct ipv6hdr*)skb->data;
+ struct dst_entry *dst;
+ u32 mtu;
+
+ if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
+ return;
+
+ mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
+
+ /* Drop requests trying to increase our current mss.
+ * Check done in __ip6_rt_update_pmtu() is too late.
+ */
+ if (tcp_mtu_to_mss(sk, mtu) >= tcp_sk(sk)->mss_cache)
+ return;
+
+ dst = inet6_csk_update_pmtu(sk, mtu);
+ if (!dst)
+ return;
+
+ if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) {
+ tcp_sync_mss(sk, dst_mtu(dst));
+ tcp_simple_retransmit(sk);
+ }
+}
+
+static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+ u8 type, u8 code, int offset, __be32 info)
+{
+ const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data;
const struct tcphdr *th = (struct tcphdr *)(skb->data+offset);
+ struct net *net = dev_net_rcu(skb->dev);
+ struct request_sock *fastopen;
struct ipv6_pinfo *np;
+ struct tcp_sock *tp;
+ __u32 seq, snd_una;
struct sock *sk;
+ bool fatal;
int err;
- struct tcp_sock *tp;
- __u32 seq;
- sk = inet6_lookup(&tcp_hashinfo, &hdr->daddr, th->dest, &hdr->saddr,
- th->source, skb->dev->ifindex);
+ sk = __inet6_lookup_established(net, &hdr->daddr, th->dest,
+ &hdr->saddr, ntohs(th->source),
+ skb->dev->ifindex, inet6_sdif(skb));
- if (sk == NULL) {
- ICMP6_INC_STATS_BH(__in6_dev_get(skb->dev), ICMP6_MIB_INERRORS);
- return;
+ if (!sk) {
+ __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev),
+ ICMP6_MIB_INERRORS);
+ return -ENOENT;
}
if (sk->sk_state == TCP_TIME_WAIT) {
+ /* To increase the counter of ignored icmps for TCP-AO */
+ tcp_ao_ignore_icmp(sk, AF_INET6, type, code);
inet_twsk_put(inet_twsk(sk));
- return;
+ return 0;
+ }
+ seq = ntohl(th->seq);
+ fatal = icmpv6_err_convert(type, code, &err);
+ if (sk->sk_state == TCP_NEW_SYN_RECV) {
+ tcp_req_err(sk, seq, fatal);
+ return 0;
+ }
+
+ if (tcp_ao_ignore_icmp(sk, AF_INET6, type, code)) {
+ sock_put(sk);
+ return 0;
}
bh_lock_sock(sk);
- if (sock_owned_by_user(sk))
- NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
+ if (sock_owned_by_user(sk) && type != ICMPV6_PKT_TOOBIG)
+ __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
if (sk->sk_state == TCP_CLOSE)
goto out;
+ if (static_branch_unlikely(&ip6_min_hopcount)) {
+ /* min_hopcount can be changed concurrently from do_ipv6_setsockopt() */
+ if (ipv6_hdr(skb)->hop_limit < READ_ONCE(tcp_inet6_sk(sk)->min_hopcount)) {
+ __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
+ goto out;
+ }
+ }
+
tp = tcp_sk(sk);
- seq = ntohl(th->seq);
+ /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
+ fastopen = rcu_dereference(tp->fastopen_rsk);
+ snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
if (sk->sk_state != TCP_LISTEN &&
- !between(seq, tp->snd_una, tp->snd_nxt)) {
- NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
+ !between(seq, snd_una, tp->snd_nxt)) {
+ __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
goto out;
}
- np = inet6_sk(sk);
+ np = tcp_inet6_sk(sk);
- if (type == ICMPV6_PKT_TOOBIG) {
- struct dst_entry *dst = NULL;
+ if (type == NDISC_REDIRECT) {
+ if (!sock_owned_by_user(sk)) {
+ struct dst_entry *dst = __sk_dst_check(sk, np->dst_cookie);
- if (sock_owned_by_user(sk))
- goto out;
- if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
- goto out;
+ if (dst)
+ dst->ops->redirect(dst, sk, skb);
+ }
+ goto out;
+ }
- /* icmp should have updated the destination cache entry */
- dst = __sk_dst_check(sk, np->dst_cookie);
+ if (type == ICMPV6_PKT_TOOBIG) {
+ u32 mtu = ntohl(info);
- if (dst == NULL) {
- struct inet_sock *inet = inet_sk(sk);
- struct flowi fl;
+ /* We are not interested in TCP_LISTEN and open_requests
+ * (SYN-ACKs send out by Linux are always <576bytes so
+ * they should go through unfragmented).
+ */
+ if (sk->sk_state == TCP_LISTEN)
+ goto out;
- /* BUGGG_FUTURE: Again, it is not clear how
- to handle rthdr case. Ignore this complexity
- for now.
- */
- memset(&fl, 0, sizeof(fl));
- fl.proto = IPPROTO_TCP;
- ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
- ipv6_addr_copy(&fl.fl6_src, &np->saddr);
- fl.oif = sk->sk_bound_dev_if;
- fl.fl_ip_dport = inet->dport;
- fl.fl_ip_sport = inet->sport;
- security_skb_classify_flow(skb, &fl);
-
- if ((err = ip6_dst_lookup(sk, &dst, &fl))) {
- sk->sk_err_soft = -err;
- goto out;
- }
+ if (!ip6_sk_accept_pmtu(sk))
+ goto out;
- if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) {
- sk->sk_err_soft = -err;
- goto out;
- }
+ if (mtu < IPV6_MIN_MTU)
+ goto out;
- } else
- dst_hold(dst);
+ WRITE_ONCE(tp->mtu_info, mtu);
- if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) {
- tcp_sync_mss(sk, dst_mtu(dst));
- tcp_simple_retransmit(sk);
- } /* else let the usual retransmit timer handle it */
- dst_release(dst);
+ if (!sock_owned_by_user(sk))
+ tcp_v6_mtu_reduced(sk);
+ else if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED,
+ &sk->sk_tsq_flags))
+ sock_hold(sk);
goto out;
}
- icmpv6_err_convert(type, code, &err);
/* Might be for an request_sock */
switch (sk->sk_state) {
- struct request_sock *req, **prev;
- case TCP_LISTEN:
- if (sock_owned_by_user(sk))
- goto out;
-
- req = inet6_csk_search_req(sk, &prev, th->dest, &hdr->daddr,
- &hdr->saddr, inet6_iif(skb));
- if (!req)
- goto out;
-
- /* ICMPs are not backlogged, hence we cannot get
- * an established socket here.
+ case TCP_SYN_SENT:
+ case TCP_SYN_RECV:
+ /* Only in fast or simultaneous open. If a fast open socket is
+ * already accepted it is treated as a connected one below.
*/
- BUG_TRAP(req->sk == NULL);
+ if (fastopen && !fastopen->sk)
+ break;
- if (seq != tcp_rsk(req)->snt_isn) {
- NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
- goto out;
- }
-
- inet_csk_reqsk_queue_drop(sk, req, prev);
- goto out;
-
- case TCP_SYN_SENT:
- case TCP_SYN_RECV: /* Cannot happen.
- It can, it SYNs are crossed. --ANK */
- if (!sock_owned_by_user(sk)) {
- sk->sk_err = err;
- sk->sk_error_report(sk); /* Wake people up to see the error (see connect in sock.c) */
+ ipv6_icmp_error(sk, skb, err, th->dest, ntohl(info), (u8 *)th);
- tcp_done(sk);
- } else
- sk->sk_err_soft = err;
+ if (!sock_owned_by_user(sk))
+ tcp_done_with_error(sk, err);
+ else
+ WRITE_ONCE(sk->sk_err_soft, err);
goto out;
+ case TCP_LISTEN:
+ break;
+ default:
+ /* check if this ICMP message allows revert of backoff.
+ * (see RFC 6069)
+ */
+ if (!fastopen && type == ICMPV6_DEST_UNREACH &&
+ code == ICMPV6_NOROUTE)
+ tcp_ld_RTO_revert(sk, seq);
}
- if (!sock_owned_by_user(sk) && np->recverr) {
- sk->sk_err = err;
- sk->sk_error_report(sk);
- } else
- sk->sk_err_soft = err;
-
+ if (!sock_owned_by_user(sk) && inet6_test_bit(RECVERR6, sk)) {
+ WRITE_ONCE(sk->sk_err, err);
+ sk_error_report(sk);
+ } else {
+ WRITE_ONCE(sk->sk_err_soft, err);
+ }
out:
bh_unlock_sock(sk);
sock_put(sk);
+ return 0;
}
-static int tcp_v6_send_synack(struct sock *sk, struct request_sock *req,
- struct dst_entry *dst)
+static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
+ struct flowi *fl,
+ struct request_sock *req,
+ struct tcp_fastopen_cookie *foc,
+ enum tcp_synack_type synack_type,
+ struct sk_buff *syn_skb)
{
- struct inet6_request_sock *treq = inet6_rsk(req);
- struct ipv6_pinfo *np = inet6_sk(sk);
- struct sk_buff * skb;
- struct ipv6_txoptions *opt = NULL;
- struct in6_addr * final_p = NULL, final;
- struct flowi fl;
- int err = -1;
-
- memset(&fl, 0, sizeof(fl));
- fl.proto = IPPROTO_TCP;
- ipv6_addr_copy(&fl.fl6_dst, &treq->rmt_addr);
- ipv6_addr_copy(&fl.fl6_src, &treq->loc_addr);
- fl.fl6_flowlabel = 0;
- fl.oif = treq->iif;
- fl.fl_ip_dport = inet_rsk(req)->rmt_port;
- fl.fl_ip_sport = inet_sk(sk)->sport;
- security_req_classify_flow(req, &fl);
-
- if (dst == NULL) {
- opt = np->opt;
- if (opt == NULL &&
- np->rxopt.bits.osrcrt == 2 &&
- treq->pktopts) {
- struct sk_buff *pktopts = treq->pktopts;
- struct inet6_skb_parm *rxopt = IP6CB(pktopts);
- if (rxopt->srcrt)
- opt = ipv6_invert_rthdr(sk, (struct ipv6_rt_hdr*)(pktopts->nh.raw + rxopt->srcrt));
- }
+ struct inet_request_sock *ireq = inet_rsk(req);
+ const struct ipv6_pinfo *np = tcp_inet6_sk(sk);
+ struct ipv6_txoptions *opt;
+ struct flowi6 *fl6 = &fl->u.ip6;
+ struct sk_buff *skb;
+ int err = -ENOMEM;
+ u8 tclass;
- if (opt && opt->srcrt) {
- struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt;
- ipv6_addr_copy(&final, &fl.fl6_dst);
- ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
- final_p = &final;
- }
+ /* First, grab a route. */
+ if (!dst && (dst = inet6_csk_route_req(sk, fl6, req,
+ IPPROTO_TCP)) == NULL)
+ goto done;
- err = ip6_dst_lookup(sk, &dst, &fl);
- if (err)
- goto done;
- if (final_p)
- ipv6_addr_copy(&fl.fl6_dst, final_p);
- if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0)
- goto done;
- }
+ skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
- skb = tcp_make_synack(sk, dst, req);
if (skb) {
- struct tcphdr *th = skb->h.th;
-
- th->check = tcp_v6_check(th, skb->len,
- &treq->loc_addr, &treq->rmt_addr,
- csum_partial((char *)th, skb->len, skb->csum));
-
- ipv6_addr_copy(&fl.fl6_dst, &treq->rmt_addr);
- err = ip6_xmit(sk, skb, &fl, opt, 0);
+ tcp_rsk(req)->syn_ect_snt = np->tclass & INET_ECN_MASK;
+ __tcp_v6_send_check(skb, &ireq->ir_v6_loc_addr,
+ &ireq->ir_v6_rmt_addr);
+
+ fl6->daddr = ireq->ir_v6_rmt_addr;
+ if (inet6_test_bit(REPFLOW, sk) && ireq->pktopts)
+ fl6->flowlabel = ip6_flowlabel(ipv6_hdr(ireq->pktopts));
+
+ tclass = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
+ (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
+ (np->tclass & INET_ECN_MASK) :
+ np->tclass;
+
+ if (!INET_ECN_is_capable(tclass) &&
+ tcp_bpf_ca_needs_ecn((struct sock *)req))
+ tclass |= INET_ECN_ECT_0;
+
+ rcu_read_lock();
+ opt = ireq->ipv6_opt;
+ if (!opt)
+ opt = rcu_dereference(np->opt);
+ err = ip6_xmit(sk, skb, fl6, skb->mark ? : READ_ONCE(sk->sk_mark),
+ opt, tclass, READ_ONCE(sk->sk_priority));
+ rcu_read_unlock();
err = net_xmit_eval(err);
}
done:
- if (opt && opt != np->opt)
- sock_kfree_s(sk, opt, opt->tot_len);
- dst_release(dst);
return err;
}
+
static void tcp_v6_reqsk_destructor(struct request_sock *req)
{
- if (inet6_rsk(req)->pktopts)
- kfree_skb(inet6_rsk(req)->pktopts);
+ kfree(inet_rsk(req)->ipv6_opt);
+ consume_skb(inet_rsk(req)->pktopts);
}
#ifdef CONFIG_TCP_MD5SIG
-static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(struct sock *sk,
- struct in6_addr *addr)
-{
- struct tcp_sock *tp = tcp_sk(sk);
- int i;
-
- BUG_ON(tp == NULL);
-
- if (!tp->md5sig_info || !tp->md5sig_info->entries6)
- return NULL;
-
- for (i = 0; i < tp->md5sig_info->entries6; i++) {
- if (ipv6_addr_cmp(&tp->md5sig_info->keys6[i].addr, addr) == 0)
- return (struct tcp_md5sig_key *)&tp->md5sig_info->keys6[i];
- }
- return NULL;
-}
-
-static struct tcp_md5sig_key *tcp_v6_md5_lookup(struct sock *sk,
- struct sock *addr_sk)
-{
- return tcp_v6_md5_do_lookup(sk, &inet6_sk(addr_sk)->daddr);
-}
-
-static struct tcp_md5sig_key *tcp_v6_reqsk_md5_lookup(struct sock *sk,
- struct request_sock *req)
-{
- return tcp_v6_md5_do_lookup(sk, &inet6_rsk(req)->rmt_addr);
-}
-
-static int tcp_v6_md5_do_add(struct sock *sk, struct in6_addr *peer,
- char *newkey, u8 newkeylen)
-{
- /* Add key to the list */
- struct tcp6_md5sig_key *key;
- struct tcp_sock *tp = tcp_sk(sk);
- struct tcp6_md5sig_key *keys;
-
- key = (struct tcp6_md5sig_key*) tcp_v6_md5_do_lookup(sk, peer);
- if (key) {
- /* modify existing entry - just update that one */
- kfree(key->key);
- key->key = newkey;
- key->keylen = newkeylen;
- } else {
- /* reallocate new list if current one is full. */
- if (!tp->md5sig_info) {
- tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info), GFP_ATOMIC);
- if (!tp->md5sig_info) {
- kfree(newkey);
- return -ENOMEM;
- }
- }
- tcp_alloc_md5sig_pool();
- if (tp->md5sig_info->alloced6 == tp->md5sig_info->entries6) {
- keys = kmalloc((sizeof (tp->md5sig_info->keys6[0]) *
- (tp->md5sig_info->entries6 + 1)), GFP_ATOMIC);
-
- if (!keys) {
- tcp_free_md5sig_pool();
- kfree(newkey);
- return -ENOMEM;
- }
-
- if (tp->md5sig_info->entries6)
- memmove(keys, tp->md5sig_info->keys6,
- (sizeof (tp->md5sig_info->keys6[0]) *
- tp->md5sig_info->entries6));
-
- kfree(tp->md5sig_info->keys6);
- tp->md5sig_info->keys6 = keys;
- tp->md5sig_info->alloced6++;
- }
-
- ipv6_addr_copy(&tp->md5sig_info->keys6[tp->md5sig_info->entries6].addr,
- peer);
- tp->md5sig_info->keys6[tp->md5sig_info->entries6].key = newkey;
- tp->md5sig_info->keys6[tp->md5sig_info->entries6].keylen = newkeylen;
-
- tp->md5sig_info->entries6++;
- }
- return 0;
-}
-
-static int tcp_v6_md5_add_func(struct sock *sk, struct sock *addr_sk,
- u8 *newkey, __u8 newkeylen)
+static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(const struct sock *sk,
+ const struct in6_addr *addr,
+ int l3index)
{
- return tcp_v6_md5_do_add(sk, &inet6_sk(addr_sk)->daddr,
- newkey, newkeylen);
+ return tcp_md5_do_lookup(sk, l3index,
+ (union tcp_md5_addr *)addr, AF_INET6);
}
-static int tcp_v6_md5_do_del(struct sock *sk, struct in6_addr *peer)
+static struct tcp_md5sig_key *tcp_v6_md5_lookup(const struct sock *sk,
+ const struct sock *addr_sk)
{
- struct tcp_sock *tp = tcp_sk(sk);
- int i;
-
- for (i = 0; i < tp->md5sig_info->entries6; i++) {
- if (ipv6_addr_cmp(&tp->md5sig_info->keys6[i].addr, peer) == 0) {
- /* Free the key */
- kfree(tp->md5sig_info->keys6[i].key);
- tp->md5sig_info->entries6--;
-
- if (tp->md5sig_info->entries6 == 0) {
- kfree(tp->md5sig_info->keys6);
- tp->md5sig_info->keys6 = NULL;
-
- tcp_free_md5sig_pool();
-
- return 0;
- } else {
- /* shrink the database */
- if (tp->md5sig_info->entries6 != i)
- memmove(&tp->md5sig_info->keys6[i],
- &tp->md5sig_info->keys6[i+1],
- (tp->md5sig_info->entries6 - i)
- * sizeof (tp->md5sig_info->keys6[0]));
- }
- }
- }
- return -ENOENT;
-}
-
-static void tcp_v6_clear_md5_list (struct sock *sk)
-{
- struct tcp_sock *tp = tcp_sk(sk);
- int i;
+ int l3index;
- if (tp->md5sig_info->entries6) {
- for (i = 0; i < tp->md5sig_info->entries6; i++)
- kfree(tp->md5sig_info->keys6[i].key);
- tp->md5sig_info->entries6 = 0;
- tcp_free_md5sig_pool();
- }
-
- kfree(tp->md5sig_info->keys6);
- tp->md5sig_info->keys6 = NULL;
- tp->md5sig_info->alloced6 = 0;
-
- if (tp->md5sig_info->entries4) {
- for (i = 0; i < tp->md5sig_info->entries4; i++)
- kfree(tp->md5sig_info->keys4[i].key);
- tp->md5sig_info->entries4 = 0;
- tcp_free_md5sig_pool();
- }
-
- kfree(tp->md5sig_info->keys4);
- tp->md5sig_info->keys4 = NULL;
- tp->md5sig_info->alloced4 = 0;
+ l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
+ addr_sk->sk_bound_dev_if);
+ return tcp_v6_md5_do_lookup(sk, &addr_sk->sk_v6_daddr,
+ l3index);
}
-static int tcp_v6_parse_md5_keys (struct sock *sk, char __user *optval,
- int optlen)
+static int tcp_v6_parse_md5_keys(struct sock *sk, int optname,
+ sockptr_t optval, int optlen)
{
struct tcp_md5sig cmd;
struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&cmd.tcpm_addr;
- u8 *newkey;
+ union tcp_ao_addr *addr;
+ int l3index = 0;
+ u8 prefixlen;
+ bool l3flag;
+ u8 flags;
if (optlen < sizeof(cmd))
return -EINVAL;
- if (copy_from_user(&cmd, optval, sizeof(cmd)))
+ if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
return -EFAULT;
if (sin6->sin6_family != AF_INET6)
return -EINVAL;
+ flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
+ l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
+
+ if (optname == TCP_MD5SIG_EXT &&
+ cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
+ prefixlen = cmd.tcpm_prefixlen;
+ if (prefixlen > 128 || (ipv6_addr_v4mapped(&sin6->sin6_addr) &&
+ prefixlen > 32))
+ return -EINVAL;
+ } else {
+ prefixlen = ipv6_addr_v4mapped(&sin6->sin6_addr) ? 32 : 128;
+ }
+
+ if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
+ cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
+ struct net_device *dev;
+
+ rcu_read_lock();
+ dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
+ if (dev && netif_is_l3_master(dev))
+ l3index = dev->ifindex;
+ rcu_read_unlock();
+
+ /* ok to reference set/not set outside of rcu;
+ * right now device MUST be an L3 master
+ */
+ if (!dev || !l3index)
+ return -EINVAL;
+ }
+
if (!cmd.tcpm_keylen) {
- if (!tcp_sk(sk)->md5sig_info)
- return -ENOENT;
- if (ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_MAPPED)
- return tcp_v4_md5_do_del(sk, sin6->sin6_addr.s6_addr32[3]);
- return tcp_v6_md5_do_del(sk, &sin6->sin6_addr);
+ if (ipv6_addr_v4mapped(&sin6->sin6_addr))
+ return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin6->sin6_addr.s6_addr32[3],
+ AF_INET, prefixlen,
+ l3index, flags);
+ return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin6->sin6_addr,
+ AF_INET6, prefixlen, l3index, flags);
}
if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
return -EINVAL;
- if (!tcp_sk(sk)->md5sig_info) {
- struct tcp_sock *tp = tcp_sk(sk);
- struct tcp_md5sig_info *p;
-
- p = kzalloc(sizeof(struct tcp_md5sig_info), GFP_KERNEL);
- if (!p)
- return -ENOMEM;
+ if (ipv6_addr_v4mapped(&sin6->sin6_addr)) {
+ addr = (union tcp_md5_addr *)&sin6->sin6_addr.s6_addr32[3];
- tp->md5sig_info = p;
+ /* Don't allow keys for peers that have a matching TCP-AO key.
+ * See the comment in tcp_ao_add_cmd()
+ */
+ if (tcp_ao_required(sk, addr, AF_INET,
+ l3flag ? l3index : -1, false))
+ return -EKEYREJECTED;
+ return tcp_md5_do_add(sk, addr,
+ AF_INET, prefixlen, l3index, flags,
+ cmd.tcpm_key, cmd.tcpm_keylen);
}
- newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
- if (!newkey)
- return -ENOMEM;
- if (ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_MAPPED) {
- return tcp_v4_md5_do_add(sk, sin6->sin6_addr.s6_addr32[3],
- newkey, cmd.tcpm_keylen);
- }
- return tcp_v6_md5_do_add(sk, &sin6->sin6_addr, newkey, cmd.tcpm_keylen);
+ addr = (union tcp_md5_addr *)&sin6->sin6_addr;
+
+ /* Don't allow keys for peers that have a matching TCP-AO key.
+ * See the comment in tcp_ao_add_cmd()
+ */
+ if (tcp_ao_required(sk, addr, AF_INET6, l3flag ? l3index : -1, false))
+ return -EKEYREJECTED;
+
+ return tcp_md5_do_add(sk, addr, AF_INET6, prefixlen, l3index, flags,
+ cmd.tcpm_key, cmd.tcpm_keylen);
}
-static int tcp_v6_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
- struct in6_addr *saddr,
- struct in6_addr *daddr,
- struct tcphdr *th, int protocol,
- int tcplen)
+static void tcp_v6_md5_hash_headers(struct md5_ctx *ctx,
+ const struct in6_addr *daddr,
+ const struct in6_addr *saddr,
+ const struct tcphdr *th, int nbytes)
{
- struct scatterlist sg[4];
- __u16 data_len;
- int block = 0;
- __sum16 cksum;
- struct tcp_md5sig_pool *hp;
- struct tcp6_pseudohdr *bp;
- struct hash_desc *desc;
- int err;
- unsigned int nbytes = 0;
-
- hp = tcp_get_md5sig_pool();
- if (!hp) {
- printk(KERN_WARNING "%s(): hash pool not found...\n", __FUNCTION__);
- goto clear_hash_noput;
- }
- bp = &hp->md5_blk.ip6;
- desc = &hp->md5_desc;
-
- /* 1. TCP pseudo-header (RFC2460) */
- ipv6_addr_copy(&bp->saddr, saddr);
- ipv6_addr_copy(&bp->daddr, daddr);
- bp->len = htonl(tcplen);
- bp->protocol = htonl(protocol);
-
- sg_set_buf(&sg[block++], bp, sizeof(*bp));
- nbytes += sizeof(*bp);
-
- /* 2. TCP header, excluding options */
- cksum = th->check;
- th->check = 0;
- sg_set_buf(&sg[block++], th, sizeof(*th));
- nbytes += sizeof(*th);
-
- /* 3. TCP segment data (if any) */
- data_len = tcplen - (th->doff << 2);
- if (data_len > 0) {
- u8 *data = (u8 *)th + (th->doff << 2);
- sg_set_buf(&sg[block++], data, data_len);
- nbytes += data_len;
- }
-
- /* 4. shared key */
- sg_set_buf(&sg[block++], key->key, key->keylen);
- nbytes += key->keylen;
-
- /* Now store the hash into the packet */
- err = crypto_hash_init(desc);
- if (err) {
- printk(KERN_WARNING "%s(): hash_init failed\n", __FUNCTION__);
- goto clear_hash;
- }
- err = crypto_hash_update(desc, sg, nbytes);
- if (err) {
- printk(KERN_WARNING "%s(): hash_update failed\n", __FUNCTION__);
- goto clear_hash;
- }
- err = crypto_hash_final(desc, md5_hash);
- if (err) {
- printk(KERN_WARNING "%s(): hash_final failed\n", __FUNCTION__);
- goto clear_hash;
- }
-
- /* Reset header, and free up the crypto */
- tcp_put_md5sig_pool();
- th->check = cksum;
-out:
- return 0;
-clear_hash:
- tcp_put_md5sig_pool();
-clear_hash_noput:
- memset(md5_hash, 0, 16);
- goto out;
+ struct {
+ struct tcp6_pseudohdr ip; /* TCP pseudo-header (RFC2460) */
+ struct tcphdr tcp;
+ } h;
+
+ h.ip.saddr = *saddr;
+ h.ip.daddr = *daddr;
+ h.ip.protocol = cpu_to_be32(IPPROTO_TCP);
+ h.ip.len = cpu_to_be32(nbytes);
+ h.tcp = *th;
+ h.tcp.check = 0;
+ md5_update(ctx, (const u8 *)&h, sizeof(h.ip) + sizeof(h.tcp));
}
-static int tcp_v6_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
- struct sock *sk,
- struct dst_entry *dst,
- struct request_sock *req,
- struct tcphdr *th, int protocol,
- int tcplen)
+static noinline_for_stack void
+tcp_v6_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
+ const struct in6_addr *daddr, struct in6_addr *saddr,
+ const struct tcphdr *th)
{
- struct in6_addr *saddr, *daddr;
+ struct md5_ctx ctx;
- if (sk) {
- saddr = &inet6_sk(sk)->saddr;
- daddr = &inet6_sk(sk)->daddr;
- } else {
- saddr = &inet6_rsk(req)->loc_addr;
- daddr = &inet6_rsk(req)->rmt_addr;
- }
- return tcp_v6_do_calc_md5_hash(md5_hash, key,
- saddr, daddr,
- th, protocol, tcplen);
+ md5_init(&ctx);
+ tcp_v6_md5_hash_headers(&ctx, daddr, saddr, th, th->doff << 2);
+ tcp_md5_hash_key(&ctx, key);
+ md5_final(&ctx, md5_hash);
}
-static int tcp_v6_inbound_md5_hash (struct sock *sk, struct sk_buff *skb)
+static noinline_for_stack void
+tcp_v6_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
+ const struct sock *sk, const struct sk_buff *skb)
{
- __u8 *hash_location = NULL;
- struct tcp_md5sig_key *hash_expected;
- struct ipv6hdr *ip6h = skb->nh.ipv6h;
- struct tcphdr *th = skb->h.th;
- int length = (th->doff << 2) - sizeof (*th);
- int genhash;
- u8 *ptr;
- u8 newhash[16];
-
- hash_expected = tcp_v6_md5_do_lookup(sk, &ip6h->saddr);
-
- /* If the TCP option is too short, we can short cut */
- if (length < TCPOLEN_MD5SIG)
- return hash_expected ? 1 : 0;
-
- /* parse options */
- ptr = (u8*)(th + 1);
- while (length > 0) {
- int opcode = *ptr++;
- int opsize;
-
- switch(opcode) {
- case TCPOPT_EOL:
- goto done_opts;
- case TCPOPT_NOP:
- length--;
- continue;
- default:
- opsize = *ptr++;
- if (opsize < 2 || opsize > length)
- goto done_opts;
- if (opcode == TCPOPT_MD5SIG) {
- hash_location = ptr;
- goto done_opts;
- }
- }
- ptr += opsize - 2;
- length -= opsize;
- }
+ const struct tcphdr *th = tcp_hdr(skb);
+ const struct in6_addr *saddr, *daddr;
+ struct md5_ctx ctx;
-done_opts:
- /* do we have a hash as expected? */
- if (!hash_expected) {
- if (!hash_location)
- return 0;
- if (net_ratelimit()) {
- printk(KERN_INFO "MD5 Hash NOT expected but found "
- "(" NIP6_FMT ", %u)->"
- "(" NIP6_FMT ", %u)\n",
- NIP6(ip6h->saddr), ntohs(th->source),
- NIP6(ip6h->daddr), ntohs(th->dest));
- }
- return 1;
+ if (sk) { /* valid for establish/request sockets */
+ saddr = &sk->sk_v6_rcv_saddr;
+ daddr = &sk->sk_v6_daddr;
+ } else {
+ const struct ipv6hdr *ip6h = ipv6_hdr(skb);
+ saddr = &ip6h->saddr;
+ daddr = &ip6h->daddr;
}
- if (!hash_location) {
- if (net_ratelimit()) {
- printk(KERN_INFO "MD5 Hash expected but NOT found "
- "(" NIP6_FMT ", %u)->"
- "(" NIP6_FMT ", %u)\n",
- NIP6(ip6h->saddr), ntohs(th->source),
- NIP6(ip6h->daddr), ntohs(th->dest));
- }
- return 1;
- }
-
- /* check the signature */
- genhash = tcp_v6_do_calc_md5_hash(newhash,
- hash_expected,
- &ip6h->saddr, &ip6h->daddr,
- th, sk->sk_protocol,
- skb->len);
- if (genhash || memcmp(hash_location, newhash, 16) != 0) {
- if (net_ratelimit()) {
- printk(KERN_INFO "MD5 Hash %s for "
- "(" NIP6_FMT ", %u)->"
- "(" NIP6_FMT ", %u)\n",
- genhash ? "failed" : "mismatch",
- NIP6(ip6h->saddr), ntohs(th->source),
- NIP6(ip6h->daddr), ntohs(th->dest));
- }
- return 1;
- }
- return 0;
+ md5_init(&ctx);
+ tcp_v6_md5_hash_headers(&ctx, daddr, saddr, th, skb->len);
+ tcp_md5_hash_skb_data(&ctx, skb, th->doff << 2);
+ tcp_md5_hash_key(&ctx, key);
+ md5_final(&ctx, md5_hash);
}
#endif
-static struct request_sock_ops tcp6_request_sock_ops __read_mostly = {
- .family = AF_INET6,
- .obj_size = sizeof(struct tcp6_request_sock),
- .rtx_syn_ack = tcp_v6_send_synack,
- .send_ack = tcp_v6_reqsk_send_ack,
- .destructor = tcp_v6_reqsk_destructor,
- .send_reset = tcp_v6_send_reset
-};
-
-struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = {
-#ifdef CONFIG_TCP_MD5SIG
- .md5_lookup = tcp_v6_reqsk_md5_lookup,
-#endif
-};
-
-static struct timewait_sock_ops tcp6_timewait_sock_ops = {
- .twsk_obj_size = sizeof(struct tcp6_timewait_sock),
- .twsk_unique = tcp_twsk_unique,
- .twsk_destructor= tcp_twsk_destructor,
-};
-
-static void tcp_v6_send_check(struct sock *sk, int len, struct sk_buff *skb)
+static void tcp_v6_init_req(struct request_sock *req,
+ const struct sock *sk_listener,
+ struct sk_buff *skb,
+ u32 tw_isn)
{
- struct ipv6_pinfo *np = inet6_sk(sk);
- struct tcphdr *th = skb->h.th;
+ bool l3_slave = ipv6_l3mdev_skb(TCP_SKB_CB(skb)->header.h6.flags);
+ struct inet_request_sock *ireq = inet_rsk(req);
+ const struct ipv6_pinfo *np = tcp_inet6_sk(sk_listener);
- if (skb->ip_summed == CHECKSUM_PARTIAL) {
- th->check = ~csum_ipv6_magic(&np->saddr, &np->daddr, len, IPPROTO_TCP, 0);
- skb->csum_offset = offsetof(struct tcphdr, check);
- } else {
- th->check = csum_ipv6_magic(&np->saddr, &np->daddr, len, IPPROTO_TCP,
- csum_partial((char *)th, th->doff<<2,
- skb->csum));
+ ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr;
+ ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr;
+ ireq->ir_rmt_addr = LOOPBACK4_IPV6;
+ ireq->ir_loc_addr = LOOPBACK4_IPV6;
+
+ /* So that link locals have meaning */
+ if ((!sk_listener->sk_bound_dev_if || l3_slave) &&
+ ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL)
+ ireq->ir_iif = tcp_v6_iif(skb);
+
+ if (!tw_isn &&
+ (ipv6_opt_accepted(sk_listener, skb, &TCP_SKB_CB(skb)->header.h6) ||
+ np->rxopt.bits.rxinfo ||
+ np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim ||
+ np->rxopt.bits.rxohlim || inet6_test_bit(REPFLOW, sk_listener))) {
+ refcount_inc(&skb->users);
+ ireq->pktopts = skb;
}
}
-static int tcp_v6_gso_send_check(struct sk_buff *skb)
+static struct dst_entry *tcp_v6_route_req(const struct sock *sk,
+ struct sk_buff *skb,
+ struct flowi *fl,
+ struct request_sock *req,
+ u32 tw_isn)
{
- struct ipv6hdr *ipv6h;
- struct tcphdr *th;
-
- if (!pskb_may_pull(skb, sizeof(*th)))
- return -EINVAL;
+ tcp_v6_init_req(req, sk, skb, tw_isn);
- ipv6h = skb->nh.ipv6h;
- th = skb->h.th;
+ if (security_inet_conn_request(sk, skb, req))
+ return NULL;
- th->check = 0;
- th->check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr, skb->len,
- IPPROTO_TCP, 0);
- skb->csum_offset = offsetof(struct tcphdr, check);
- skb->ip_summed = CHECKSUM_PARTIAL;
- return 0;
+ return inet6_csk_route_req(sk, &fl->u.ip6, req, IPPROTO_TCP);
}
-static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb)
-{
- struct tcphdr *th = skb->h.th, *t1;
- struct sk_buff *buff;
- struct flowi fl;
- int tot_len = sizeof(*th);
+struct request_sock_ops tcp6_request_sock_ops __read_mostly = {
+ .family = AF_INET6,
+ .obj_size = sizeof(struct tcp6_request_sock),
+ .send_ack = tcp_v6_reqsk_send_ack,
+ .destructor = tcp_v6_reqsk_destructor,
+ .send_reset = tcp_v6_send_reset,
+};
+
+const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = {
+ .mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) -
+ sizeof(struct ipv6hdr),
#ifdef CONFIG_TCP_MD5SIG
- struct tcp_md5sig_key *key;
+ .req_md5_lookup = tcp_v6_md5_lookup,
+ .calc_md5_hash = tcp_v6_md5_hash_skb,
+#endif
+#ifdef CONFIG_TCP_AO
+ .ao_lookup = tcp_v6_ao_lookup_rsk,
+ .ao_calc_key = tcp_v6_ao_calc_key_rsk,
+ .ao_synack_hash = tcp_v6_ao_synack_hash,
#endif
+#ifdef CONFIG_SYN_COOKIES
+ .cookie_init_seq = cookie_v6_init_sequence,
+#endif
+ .route_req = tcp_v6_route_req,
+ .init_seq = tcp_v6_init_seq,
+ .init_ts_off = tcp_v6_init_ts_off,
+ .send_synack = tcp_v6_send_synack,
+};
- if (th->rst)
- return;
+static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 seq,
+ u32 ack, u32 win, u32 tsval, u32 tsecr,
+ int oif, int rst, u8 tclass, __be32 label,
+ u32 priority, u32 txhash, struct tcp_key *key)
+{
+ struct net *net = sk ? sock_net(sk) : skb_dst_dev_net_rcu(skb);
+ unsigned int tot_len = sizeof(struct tcphdr);
+ struct sock *ctl_sk = net->ipv6.tcp_sk;
+ const struct tcphdr *th = tcp_hdr(skb);
+ __be32 mrst = 0, *topt;
+ struct dst_entry *dst;
+ struct sk_buff *buff;
+ struct tcphdr *t1;
+ struct flowi6 fl6;
+ u32 mark = 0;
- if (!ipv6_unicast_destination(skb))
- return;
+ if (tsecr)
+ tot_len += TCPOLEN_TSTAMP_ALIGNED;
+ if (tcp_key_is_md5(key))
+ tot_len += TCPOLEN_MD5SIG_ALIGNED;
+ if (tcp_key_is_ao(key))
+ tot_len += tcp_ao_len_aligned(key->ao_key);
-#ifdef CONFIG_TCP_MD5SIG
- if (sk)
- key = tcp_v6_md5_do_lookup(sk, &skb->nh.ipv6h->daddr);
- else
- key = NULL;
+#ifdef CONFIG_MPTCP
+ if (rst && !tcp_key_is_md5(key)) {
+ mrst = mptcp_reset_option(skb);
- if (key)
- tot_len += TCPOLEN_MD5SIG_ALIGNED;
+ if (mrst)
+ tot_len += sizeof(__be32);
+ }
#endif
- /*
- * We need to grab some memory, and put together an RST,
- * and then put it into the queue to be sent.
- */
-
- buff = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) + tot_len,
- GFP_ATOMIC);
- if (buff == NULL)
- return;
+ buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
+ if (!buff)
+ return;
- skb_reserve(buff, MAX_HEADER + sizeof(struct ipv6hdr) + tot_len);
+ skb_reserve(buff, MAX_TCP_HEADER);
- t1 = (struct tcphdr *) skb_push(buff, tot_len);
+ t1 = skb_push(buff, tot_len);
+ skb_reset_transport_header(buff);
/* Swap the send and the receive. */
memset(t1, 0, sizeof(*t1));
t1->dest = th->source;
t1->source = th->dest;
t1->doff = tot_len / 4;
- t1->rst = 1;
-
- if(th->ack) {
- t1->seq = th->ack_seq;
- } else {
- t1->ack = 1;
- t1->ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin
- + skb->len - (th->doff<<2));
+ t1->seq = htonl(seq);
+ t1->ack_seq = htonl(ack);
+ t1->ack = !rst || !th->ack;
+ t1->rst = rst;
+ t1->window = htons(win);
+
+ topt = (__be32 *)(t1 + 1);
+
+ if (tsecr) {
+ *topt++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
+ (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP);
+ *topt++ = htonl(tsval);
+ *topt++ = htonl(tsecr);
}
+ if (mrst)
+ *topt++ = mrst;
+
#ifdef CONFIG_TCP_MD5SIG
- if (key) {
- __be32 *opt = (__be32*)(t1 + 1);
- opt[0] = htonl((TCPOPT_NOP << 24) |
- (TCPOPT_NOP << 16) |
- (TCPOPT_MD5SIG << 8) |
- TCPOLEN_MD5SIG);
- tcp_v6_do_calc_md5_hash((__u8*)&opt[1],
- key,
- &skb->nh.ipv6h->daddr,
- &skb->nh.ipv6h->saddr,
- t1, IPPROTO_TCP,
- tot_len);
+ if (tcp_key_is_md5(key)) {
+ *topt++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
+ (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG);
+ tcp_v6_md5_hash_hdr((__u8 *)topt, key->md5_key,
+ &ipv6_hdr(skb)->saddr,
+ &ipv6_hdr(skb)->daddr, t1);
+ }
+#endif
+#ifdef CONFIG_TCP_AO
+ if (tcp_key_is_ao(key)) {
+ *topt++ = htonl((TCPOPT_AO << 24) |
+ (tcp_ao_len(key->ao_key) << 16) |
+ (key->ao_key->sndid << 8) |
+ (key->rcv_next));
+
+ tcp_ao_hash_hdr(AF_INET6, (char *)topt, key->ao_key,
+ key->traffic_key,
+ (union tcp_ao_addr *)&ipv6_hdr(skb)->saddr,
+ (union tcp_ao_addr *)&ipv6_hdr(skb)->daddr,
+ t1, key->sne);
}
#endif
- buff->csum = csum_partial((char *)t1, sizeof(*t1), 0);
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.daddr = ipv6_hdr(skb)->saddr;
+ fl6.saddr = ipv6_hdr(skb)->daddr;
+ fl6.flowlabel = label;
- memset(&fl, 0, sizeof(fl));
- ipv6_addr_copy(&fl.fl6_dst, &skb->nh.ipv6h->saddr);
- ipv6_addr_copy(&fl.fl6_src, &skb->nh.ipv6h->daddr);
+ buff->ip_summed = CHECKSUM_PARTIAL;
- t1->check = csum_ipv6_magic(&fl.fl6_src, &fl.fl6_dst,
- sizeof(*t1), IPPROTO_TCP,
- buff->csum);
+ __tcp_v6_send_check(buff, &fl6.saddr, &fl6.daddr);
- fl.proto = IPPROTO_TCP;
- fl.oif = inet6_iif(skb);
- fl.fl_ip_dport = t1->dest;
- fl.fl_ip_sport = t1->source;
- security_skb_classify_flow(skb, &fl);
+ fl6.flowi6_proto = IPPROTO_TCP;
+ if (rt6_need_strict(&fl6.daddr) && !oif)
+ fl6.flowi6_oif = tcp_v6_iif(skb);
+ else {
+ if (!oif && netif_index_is_l3_master(net, skb->skb_iif))
+ oif = skb->skb_iif;
- /* sk = NULL, but it is safe for now. RST socket required. */
- if (!ip6_dst_lookup(NULL, &buff->dst, &fl)) {
+ fl6.flowi6_oif = oif;
+ }
- if (xfrm_lookup(&buff->dst, &fl, NULL, 0) >= 0) {
- ip6_xmit(tcp6_socket->sk, buff, &fl, NULL, 0);
- TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
- TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
- return;
- }
+ if (sk) {
+ /* unconstify the socket only to attach it to buff with care. */
+ skb_set_owner_edemux(buff, (struct sock *)sk);
+ psp_reply_set_decrypted(sk, buff);
+
+ if (sk->sk_state == TCP_TIME_WAIT)
+ mark = inet_twsk(sk)->tw_mark;
+ else
+ mark = READ_ONCE(sk->sk_mark);
+ skb_set_delivery_time(buff, tcp_transmit_time(sk), SKB_CLOCK_MONOTONIC);
+ }
+ if (txhash) {
+ /* autoflowlabel/skb_get_hash_flowi6 rely on buff->hash */
+ skb_set_hash(buff, txhash, PKT_HASH_TYPE_L4);
+ }
+ fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark) ?: mark;
+ fl6.fl6_dport = t1->dest;
+ fl6.fl6_sport = t1->source;
+ fl6.flowi6_uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
+ security_skb_classify_flow(skb, flowi6_to_flowi_common(&fl6));
+
+ /* Pass a socket to ip6_dst_lookup either it is for RST
+ * Underlying function will use this to retrieve the network
+ * namespace
+ */
+ if (sk && sk->sk_state != TCP_TIME_WAIT)
+ dst = ip6_dst_lookup_flow(net, sk, &fl6, NULL); /*sk's xfrm_policy can be referred*/
+ else
+ dst = ip6_dst_lookup_flow(net, ctl_sk, &fl6, NULL);
+ if (!IS_ERR(dst)) {
+ skb_dst_set(buff, dst);
+ ip6_xmit(ctl_sk, buff, &fl6, fl6.flowi6_mark, NULL,
+ tclass, priority);
+ TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
+ if (rst)
+ TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
+ return;
}
kfree_skb(buff);
}
-static void tcp_v6_send_ack(struct tcp_timewait_sock *tw,
- struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 ts)
+static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb,
+ enum sk_rst_reason reason)
{
- struct tcphdr *th = skb->h.th, *t1;
- struct sk_buff *buff;
- struct flowi fl;
- int tot_len = sizeof(struct tcphdr);
- __be32 *topt;
-#ifdef CONFIG_TCP_MD5SIG
- struct tcp_md5sig_key *key;
- struct tcp_md5sig_key tw_key;
+ const struct tcphdr *th = tcp_hdr(skb);
+ struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+ const __u8 *md5_hash_location = NULL;
+#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
+ bool allocated_traffic_key = false;
#endif
-
+ const struct tcp_ao_hdr *aoh;
+ struct tcp_key key = {};
+ u32 seq = 0, ack_seq = 0;
+ __be32 label = 0;
+ u32 priority = 0;
+ struct net *net;
+ u32 txhash = 0;
+ int oif = 0;
#ifdef CONFIG_TCP_MD5SIG
- if (!tw && skb->sk) {
- key = tcp_v6_md5_do_lookup(skb->sk, &skb->nh.ipv6h->daddr);
- } else if (tw && tw->tw_md5_keylen) {
- tw_key.key = tw->tw_md5_key;
- tw_key.keylen = tw->tw_md5_keylen;
- key = &tw_key;
- } else {
- key = NULL;
- }
+ unsigned char newhash[16];
+ struct sock *sk1 = NULL;
#endif
- if (ts)
- tot_len += TCPOLEN_TSTAMP_ALIGNED;
-#ifdef CONFIG_TCP_MD5SIG
- if (key)
- tot_len += TCPOLEN_MD5SIG_ALIGNED;
-#endif
+ if (th->rst)
+ return;
- buff = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) + tot_len,
- GFP_ATOMIC);
- if (buff == NULL)
+ /* If sk not NULL, it means we did a successful lookup and incoming
+ * route had to be correct. prequeue might have dropped our dst.
+ */
+ if (!sk && !ipv6_unicast_destination(skb))
return;
- skb_reserve(buff, MAX_HEADER + sizeof(struct ipv6hdr) + tot_len);
+ net = sk ? sock_net(sk) : skb_dst_dev_net_rcu(skb);
+ /* Invalid TCP option size or twice included auth */
+ if (tcp_parse_auth_options(th, &md5_hash_location, &aoh))
+ return;
+#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
+ rcu_read_lock();
+#endif
+#ifdef CONFIG_TCP_MD5SIG
+ if (sk && sk_fullsock(sk)) {
+ int l3index;
- t1 = (struct tcphdr *) skb_push(buff,tot_len);
+ /* sdif set, means packet ingressed via a device
+ * in an L3 domain and inet_iif is set to it.
+ */
+ l3index = tcp_v6_sdif(skb) ? tcp_v6_iif_l3_slave(skb) : 0;
+ key.md5_key = tcp_v6_md5_do_lookup(sk, &ipv6h->saddr, l3index);
+ if (key.md5_key)
+ key.type = TCP_KEY_MD5;
+ } else if (md5_hash_location) {
+ int dif = tcp_v6_iif_l3_slave(skb);
+ int sdif = tcp_v6_sdif(skb);
+ int l3index;
- /* Swap the send and the receive. */
- memset(t1, 0, sizeof(*t1));
- t1->dest = th->source;
- t1->source = th->dest;
- t1->doff = tot_len/4;
- t1->seq = htonl(seq);
- t1->ack_seq = htonl(ack);
- t1->ack = 1;
- t1->window = htons(win);
+ /*
+ * active side is lost. Try to find listening socket through
+ * source port, and then find md5 key through listening socket.
+ * we are not loose security here:
+ * Incoming packet is checked with md5 hash with finding key,
+ * no RST generated if md5 hash doesn't match.
+ */
+ sk1 = inet6_lookup_listener(net, NULL, 0, &ipv6h->saddr, th->source,
+ &ipv6h->daddr, ntohs(th->source),
+ dif, sdif);
+ if (!sk1)
+ goto out;
- topt = (__be32 *)(t1 + 1);
-
- if (ts) {
- *topt++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
- (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP);
- *topt++ = htonl(tcp_time_stamp);
- *topt = htonl(ts);
- }
+ /* sdif set, means packet ingressed via a device
+ * in an L3 domain and dif is set to it.
+ */
+ l3index = tcp_v6_sdif(skb) ? dif : 0;
-#ifdef CONFIG_TCP_MD5SIG
- if (key) {
- *topt++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
- (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG);
- tcp_v6_do_calc_md5_hash((__u8 *)topt,
- key,
- &skb->nh.ipv6h->daddr,
- &skb->nh.ipv6h->saddr,
- t1, IPPROTO_TCP,
- tot_len);
+ key.md5_key = tcp_v6_md5_do_lookup(sk1, &ipv6h->saddr, l3index);
+ if (!key.md5_key)
+ goto out;
+ key.type = TCP_KEY_MD5;
+
+ tcp_v6_md5_hash_skb(newhash, key.md5_key, NULL, skb);
+ if (memcmp(md5_hash_location, newhash, 16) != 0)
+ goto out;
}
#endif
- buff->csum = csum_partial((char *)t1, tot_len, 0);
+ if (th->ack)
+ seq = ntohl(th->ack_seq);
+ else
+ ack_seq = ntohl(th->seq) + th->syn + th->fin + skb->len -
+ (th->doff << 2);
+
+#ifdef CONFIG_TCP_AO
+ if (aoh) {
+ int l3index;
+
+ l3index = tcp_v6_sdif(skb) ? tcp_v6_iif_l3_slave(skb) : 0;
+ if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, seq,
+ &key.ao_key, &key.traffic_key,
+ &allocated_traffic_key,
+ &key.rcv_next, &key.sne))
+ goto out;
+ key.type = TCP_KEY_AO;
+ }
+#endif
- memset(&fl, 0, sizeof(fl));
- ipv6_addr_copy(&fl.fl6_dst, &skb->nh.ipv6h->saddr);
- ipv6_addr_copy(&fl.fl6_src, &skb->nh.ipv6h->daddr);
+ if (sk) {
+ oif = sk->sk_bound_dev_if;
+ if (sk_fullsock(sk)) {
+ if (inet6_test_bit(REPFLOW, sk))
+ label = ip6_flowlabel(ipv6h);
+ priority = READ_ONCE(sk->sk_priority);
+ txhash = sk->sk_txhash;
+ }
+ if (sk->sk_state == TCP_TIME_WAIT) {
+ label = cpu_to_be32(inet_twsk(sk)->tw_flowlabel);
+ priority = inet_twsk(sk)->tw_priority;
+ txhash = inet_twsk(sk)->tw_txhash;
+ }
+ } else {
+ if (net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_TCP_RESET)
+ label = ip6_flowlabel(ipv6h);
+ }
- t1->check = csum_ipv6_magic(&fl.fl6_src, &fl.fl6_dst,
- tot_len, IPPROTO_TCP,
- buff->csum);
+ trace_tcp_send_reset(sk, skb, reason);
- fl.proto = IPPROTO_TCP;
- fl.oif = inet6_iif(skb);
- fl.fl_ip_dport = t1->dest;
- fl.fl_ip_sport = t1->source;
- security_skb_classify_flow(skb, &fl);
+ tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, 1,
+ ipv6_get_dsfield(ipv6h) & ~INET_ECN_MASK,
+ label, priority, txhash,
+ &key);
- if (!ip6_dst_lookup(NULL, &buff->dst, &fl)) {
- if (xfrm_lookup(&buff->dst, &fl, NULL, 0) >= 0) {
- ip6_xmit(tcp6_socket->sk, buff, &fl, NULL, 0);
- TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
- return;
- }
- }
+#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
+out:
+ if (allocated_traffic_key)
+ kfree(key.traffic_key);
+ rcu_read_unlock();
+#endif
+}
- kfree_skb(buff);
+static void tcp_v6_send_ack(const struct sock *sk, struct sk_buff *skb, u32 seq,
+ u32 ack, u32 win, u32 tsval, u32 tsecr, int oif,
+ struct tcp_key *key, u8 tclass,
+ __be32 label, u32 priority, u32 txhash)
+{
+ tcp_v6_send_response(sk, skb, seq, ack, win, tsval, tsecr, oif, 0,
+ tclass, label, priority, txhash, key);
}
-static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb)
+static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb,
+ enum tcp_tw_status tw_status)
{
struct inet_timewait_sock *tw = inet_twsk(sk);
struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
+ u8 tclass = tw->tw_tclass;
+ struct tcp_key key = {};
+
+ if (tw_status == TCP_TW_ACK_OOW)
+ tclass &= ~INET_ECN_MASK;
+#ifdef CONFIG_TCP_AO
+ struct tcp_ao_info *ao_info;
+
+ if (static_branch_unlikely(&tcp_ao_needed.key)) {
+
+ /* FIXME: the segment to-be-acked is not verified yet */
+ ao_info = rcu_dereference(tcptw->ao_info);
+ if (ao_info) {
+ const struct tcp_ao_hdr *aoh;
- tcp_v6_send_ack(tcptw, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
+ /* Invalid TCP option size or twice included auth */
+ if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
+ goto out;
+ if (aoh)
+ key.ao_key = tcp_ao_established_key(sk, ao_info,
+ aoh->rnext_keyid, -1);
+ }
+ }
+ if (key.ao_key) {
+ struct tcp_ao_key *rnext_key;
+
+ key.traffic_key = snd_other_key(key.ao_key);
+ /* rcv_next switches to our rcv_next */
+ rnext_key = READ_ONCE(ao_info->rnext_key);
+ key.rcv_next = rnext_key->rcvid;
+ key.sne = READ_ONCE(ao_info->snd_sne);
+ key.type = TCP_KEY_AO;
+#else
+ if (0) {
+#endif
+#ifdef CONFIG_TCP_MD5SIG
+ } else if (static_branch_unlikely(&tcp_md5_needed.key)) {
+ key.md5_key = tcp_twsk_md5_key(tcptw);
+ if (key.md5_key)
+ key.type = TCP_KEY_MD5;
+#endif
+ }
+
+ tcp_v6_send_ack(sk, skb, tcptw->tw_snd_nxt,
+ READ_ONCE(tcptw->tw_rcv_nxt),
tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
- tcptw->tw_ts_recent);
+ tcp_tw_tsval(tcptw),
+ READ_ONCE(tcptw->tw_ts_recent), tw->tw_bound_dev_if,
+ &key, tclass, cpu_to_be32(tw->tw_flowlabel),
+ tw->tw_priority, tw->tw_txhash);
+#ifdef CONFIG_TCP_AO
+out:
+#endif
inet_twsk_put(tw);
}
-static void tcp_v6_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
+static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req)
{
- tcp_v6_send_ack(NULL, skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd, req->ts_recent);
+ struct tcp_key key = {};
+
+#ifdef CONFIG_TCP_AO
+ if (static_branch_unlikely(&tcp_ao_needed.key) &&
+ tcp_rsk_used_ao(req)) {
+ const struct in6_addr *addr = &ipv6_hdr(skb)->saddr;
+ const struct tcp_ao_hdr *aoh;
+ int l3index;
+
+ l3index = tcp_v6_sdif(skb) ? tcp_v6_iif_l3_slave(skb) : 0;
+ /* Invalid TCP option size or twice included auth */
+ if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
+ return;
+ if (!aoh)
+ return;
+ key.ao_key = tcp_ao_do_lookup(sk, l3index,
+ (union tcp_ao_addr *)addr,
+ AF_INET6, aoh->rnext_keyid, -1);
+ if (unlikely(!key.ao_key)) {
+ /* Send ACK with any matching MKT for the peer */
+ key.ao_key = tcp_ao_do_lookup(sk, l3index,
+ (union tcp_ao_addr *)addr,
+ AF_INET6, -1, -1);
+ /* Matching key disappeared (user removed the key?)
+ * let the handshake timeout.
+ */
+ if (!key.ao_key) {
+ net_info_ratelimited("TCP-AO key for (%pI6, %d)->(%pI6, %d) suddenly disappeared, won't ACK new connection\n",
+ addr,
+ ntohs(tcp_hdr(skb)->source),
+ &ipv6_hdr(skb)->daddr,
+ ntohs(tcp_hdr(skb)->dest));
+ return;
+ }
+ }
+ key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC);
+ if (!key.traffic_key)
+ return;
+
+ key.type = TCP_KEY_AO;
+ key.rcv_next = aoh->keyid;
+ tcp_v6_ao_calc_key_rsk(key.ao_key, key.traffic_key, req);
+#else
+ if (0) {
+#endif
+#ifdef CONFIG_TCP_MD5SIG
+ } else if (static_branch_unlikely(&tcp_md5_needed.key)) {
+ int l3index = tcp_v6_sdif(skb) ? tcp_v6_iif_l3_slave(skb) : 0;
+
+ key.md5_key = tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->saddr,
+ l3index);
+ if (key.md5_key)
+ key.type = TCP_KEY_MD5;
+#endif
+ }
+
+ /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
+ * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
+ */
+ tcp_v6_send_ack(sk, skb, (sk->sk_state == TCP_LISTEN) ?
+ tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
+ tcp_rsk(req)->rcv_nxt,
+ tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale,
+ tcp_rsk_tsval(tcp_rsk(req)),
+ req->ts_recent, sk->sk_bound_dev_if,
+ &key, ipv6_get_dsfield(ipv6_hdr(skb)) & ~INET_ECN_MASK,
+ 0,
+ READ_ONCE(sk->sk_priority),
+ READ_ONCE(tcp_rsk(req)->txhash));
+ if (tcp_key_is_ao(&key))
+ kfree(key.traffic_key);
}
-static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb)
+static struct sock *tcp_v6_cookie_check(struct sock *sk, struct sk_buff *skb)
{
- struct request_sock *req, **prev;
- const struct tcphdr *th = skb->h.th;
- struct sock *nsk;
-
- /* Find possible connection requests. */
- req = inet6_csk_search_req(sk, &prev, th->source,
- &skb->nh.ipv6h->saddr,
- &skb->nh.ipv6h->daddr, inet6_iif(skb));
- if (req)
- return tcp_check_req(sk, skb, req, prev);
-
- nsk = __inet6_lookup_established(&tcp_hashinfo, &skb->nh.ipv6h->saddr,
- th->source, &skb->nh.ipv6h->daddr,
- ntohs(th->dest), inet6_iif(skb));
-
- if (nsk) {
- if (nsk->sk_state != TCP_TIME_WAIT) {
- bh_lock_sock(nsk);
- return nsk;
- }
- inet_twsk_put(inet_twsk(nsk));
- return NULL;
- }
+#ifdef CONFIG_SYN_COOKIES
+ const struct tcphdr *th = tcp_hdr(skb);
-#if 0 /*def CONFIG_SYN_COOKIES*/
- if (!th->rst && !th->syn && th->ack)
- sk = cookie_v6_check(sk, skb, &(IPCB(skb)->opt));
+ if (!th->syn)
+ sk = cookie_v6_check(sk, skb);
#endif
return sk;
}
-/* FIXME: this is substantially similar to the ipv4 code.
- * Can some kind of merge be done? -- erics
- */
-static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
+u16 tcp_v6_get_syncookie(struct sock *sk, struct ipv6hdr *iph,
+ struct tcphdr *th, u32 *cookie)
{
- struct inet6_request_sock *treq;
- struct ipv6_pinfo *np = inet6_sk(sk);
- struct tcp_options_received tmp_opt;
- struct tcp_sock *tp = tcp_sk(sk);
- struct request_sock *req = NULL;
- __u32 isn = TCP_SKB_CB(skb)->when;
+ u16 mss = 0;
+#ifdef CONFIG_SYN_COOKIES
+ mss = tcp_get_syncookie_mss(&tcp6_request_sock_ops,
+ &tcp_request_sock_ipv6_ops, sk, th);
+ if (mss) {
+ *cookie = __cookie_v6_init_sequence(iph, th, &mss);
+ tcp_synq_overflow(sk);
+ }
+#endif
+ return mss;
+}
+static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
+{
if (skb->protocol == htons(ETH_P_IP))
return tcp_v4_conn_request(sk, skb);
if (!ipv6_unicast_destination(skb))
- goto drop;
-
- /*
- * There are no SYN attacks on IPv6, yet...
- */
- if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
- if (net_ratelimit())
- printk(KERN_INFO "TCPv6: dropping request, synflood is possible\n");
- goto drop;
- }
-
- if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
goto drop;
- req = inet6_reqsk_alloc(&tcp6_request_sock_ops);
- if (req == NULL)
- goto drop;
-
-#ifdef CONFIG_TCP_MD5SIG
- tcp_rsk(req)->af_specific = &tcp_request_sock_ipv6_ops;
-#endif
-
- tcp_clear_options(&tmp_opt);
- tmp_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr);
- tmp_opt.user_mss = tp->rx_opt.user_mss;
-
- tcp_parse_options(skb, &tmp_opt, 0);
-
- tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
- tcp_openreq_init(req, &tmp_opt, skb);
-
- treq = inet6_rsk(req);
- ipv6_addr_copy(&treq->rmt_addr, &skb->nh.ipv6h->saddr);
- ipv6_addr_copy(&treq->loc_addr, &skb->nh.ipv6h->daddr);
- TCP_ECN_create_request(req, skb->h.th);
- treq->pktopts = NULL;
- if (ipv6_opt_accepted(sk, skb) ||
- np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo ||
- np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) {
- atomic_inc(&skb->users);
- treq->pktopts = skb;
+ if (ipv6_addr_v4mapped(&ipv6_hdr(skb)->saddr)) {
+ __IP6_INC_STATS(sock_net(sk), NULL, IPSTATS_MIB_INHDRERRORS);
+ return 0;
}
- treq->iif = sk->sk_bound_dev_if;
-
- /* So that link locals have meaning */
- if (!sk->sk_bound_dev_if &&
- ipv6_addr_type(&treq->rmt_addr) & IPV6_ADDR_LINKLOCAL)
- treq->iif = inet6_iif(skb);
-
- if (isn == 0)
- isn = tcp_v6_init_sequence(skb);
-
- tcp_rsk(req)->snt_isn = isn;
- security_inet_conn_request(sk, skb, req);
-
- if (tcp_v6_send_synack(sk, req, NULL))
- goto drop;
-
- inet6_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
- return 0;
+ return tcp_conn_request(&tcp6_request_sock_ops,
+ &tcp_request_sock_ipv6_ops, sk, skb);
drop:
- if (req)
- reqsk_free(req);
-
+ tcp_listendrop(sk);
return 0; /* don't send reset */
}
-static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
- struct request_sock *req,
- struct dst_entry *dst)
+static void tcp_v6_restore_cb(struct sk_buff *skb)
+{
+ /* We need to move header back to the beginning if xfrm6_policy_check()
+ * and tcp_v6_fill_cb() are going to be called again.
+ * ip6_datagram_recv_specific_ctl() also expects IP6CB to be there.
+ */
+ memmove(IP6CB(skb), &TCP_SKB_CB(skb)->header.h6,
+ sizeof(struct inet6_skb_parm));
+}
+
+static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req,
+ struct dst_entry *dst,
+ struct request_sock *req_unhash,
+ bool *own_req)
{
- struct inet6_request_sock *treq = inet6_rsk(req);
- struct ipv6_pinfo *newnp, *np = inet6_sk(sk);
- struct tcp6_sock *newtcp6sk;
+ struct inet_request_sock *ireq;
+ struct ipv6_pinfo *newnp;
+ const struct ipv6_pinfo *np = tcp_inet6_sk(sk);
+ struct ipv6_txoptions *opt;
struct inet_sock *newinet;
+ bool found_dup_sk = false;
struct tcp_sock *newtp;
struct sock *newsk;
- struct ipv6_txoptions *opt;
#ifdef CONFIG_TCP_MD5SIG
struct tcp_md5sig_key *key;
+ int l3index;
#endif
+ struct flowi6 fl6;
if (skb->protocol == htons(ETH_P_IP)) {
/*
* v6 mapped
*/
- newsk = tcp_v4_syn_recv_sock(sk, skb, req, dst);
+ newsk = tcp_v4_syn_recv_sock(sk, skb, req, dst,
+ req_unhash, own_req);
- if (newsk == NULL)
+ if (!newsk)
return NULL;
- newtcp6sk = (struct tcp6_sock *)newsk;
- inet_sk(newsk)->pinet6 = &newtcp6sk->inet6;
-
newinet = inet_sk(newsk);
- newnp = inet6_sk(newsk);
+ newinet->pinet6 = tcp_inet6_sk(newsk);
+ newinet->ipv6_fl_list = NULL;
+
+ newnp = tcp_inet6_sk(newsk);
newtp = tcp_sk(newsk);
memcpy(newnp, np, sizeof(struct ipv6_pinfo));
- ipv6_addr_set(&newnp->daddr, 0, 0, htonl(0x0000FFFF),
- newinet->daddr);
-
- ipv6_addr_set(&newnp->saddr, 0, 0, htonl(0x0000FFFF),
- newinet->saddr);
-
- ipv6_addr_copy(&newnp->rcv_saddr, &newnp->saddr);
+ newnp->saddr = newsk->sk_v6_rcv_saddr;
inet_csk(newsk)->icsk_af_ops = &ipv6_mapped;
+ if (sk_is_mptcp(newsk))
+ mptcpv6_handle_mapped(newsk, true);
newsk->sk_backlog_rcv = tcp_v4_do_rcv;
-#ifdef CONFIG_TCP_MD5SIG
+#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
newtp->af_specific = &tcp_sock_ipv6_mapped_specific;
#endif
+ newnp->ipv6_mc_list = NULL;
+ newnp->ipv6_ac_list = NULL;
newnp->pktoptions = NULL;
newnp->opt = NULL;
- newnp->mcast_oif = inet6_iif(skb);
- newnp->mcast_hops = skb->nh.ipv6h->hop_limit;
+ newnp->mcast_oif = inet_iif(skb);
+ newnp->mcast_hops = ip_hdr(skb)->ttl;
+ newnp->rcv_flowinfo = 0;
+ if (inet6_test_bit(REPFLOW, sk))
+ newnp->flow_label = 0;
/*
* No need to charge this sock to the relevant IPv6 refcnt debug socks count
@@ -1380,50 +1384,20 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
return newsk;
}
- opt = np->opt;
+ ireq = inet_rsk(req);
if (sk_acceptq_is_full(sk))
- goto out_overflow;
-
- if (np->rxopt.bits.osrcrt == 2 &&
- opt == NULL && treq->pktopts) {
- struct inet6_skb_parm *rxopt = IP6CB(treq->pktopts);
- if (rxopt->srcrt)
- opt = ipv6_invert_rthdr(sk, (struct ipv6_rt_hdr *)(treq->pktopts->nh.raw + rxopt->srcrt));
- }
-
- if (dst == NULL) {
- struct in6_addr *final_p = NULL, final;
- struct flowi fl;
-
- memset(&fl, 0, sizeof(fl));
- fl.proto = IPPROTO_TCP;
- ipv6_addr_copy(&fl.fl6_dst, &treq->rmt_addr);
- if (opt && opt->srcrt) {
- struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt;
- ipv6_addr_copy(&final, &fl.fl6_dst);
- ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
- final_p = &final;
- }
- ipv6_addr_copy(&fl.fl6_src, &treq->loc_addr);
- fl.oif = sk->sk_bound_dev_if;
- fl.fl_ip_dport = inet_rsk(req)->rmt_port;
- fl.fl_ip_sport = inet_sk(sk)->sport;
- security_req_classify_flow(req, &fl);
+ goto exit_overflow;
- if (ip6_dst_lookup(sk, &dst, &fl))
- goto out;
-
- if (final_p)
- ipv6_addr_copy(&fl.fl6_dst, final_p);
-
- if ((xfrm_lookup(&dst, &fl, sk, 0)) < 0)
- goto out;
- }
+ if (!dst) {
+ dst = inet6_csk_route_req(sk, &fl6, req, IPPROTO_TCP);
+ if (!dst)
+ goto exit;
+ }
newsk = tcp_create_openreq_child(sk, req, skb);
- if (newsk == NULL)
- goto out;
+ if (!newsk)
+ goto exit_nonewsk;
/*
* No need to charge this sock to the relevant IPv6 refcnt debug socks
@@ -1432,43 +1406,45 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
*/
newsk->sk_gso_type = SKB_GSO_TCPV6;
- __ip6_dst_store(newsk, dst, NULL, NULL);
+ inet6_sk_rx_dst_set(newsk, skb);
- newtcp6sk = (struct tcp6_sock *)newsk;
- inet_sk(newsk)->pinet6 = &newtcp6sk->inet6;
+ newinet = inet_sk(newsk);
+ newinet->pinet6 = tcp_inet6_sk(newsk);
+ newinet->ipv6_fl_list = NULL;
+ newinet->inet_opt = NULL;
newtp = tcp_sk(newsk);
- newinet = inet_sk(newsk);
- newnp = inet6_sk(newsk);
+ newnp = tcp_inet6_sk(newsk);
memcpy(newnp, np, sizeof(struct ipv6_pinfo));
- ipv6_addr_copy(&newnp->daddr, &treq->rmt_addr);
- ipv6_addr_copy(&newnp->saddr, &treq->loc_addr);
- ipv6_addr_copy(&newnp->rcv_saddr, &treq->loc_addr);
- newsk->sk_bound_dev_if = treq->iif;
+ ip6_dst_store(newsk, dst, false, false);
- /* Now IPv6 options...
+ newnp->saddr = ireq->ir_v6_loc_addr;
+
+ /* Now IPv6 options...
First: no IPv4 options.
*/
- newinet->opt = NULL;
+ newnp->ipv6_mc_list = NULL;
+ newnp->ipv6_ac_list = NULL;
/* Clone RX bits */
newnp->rxopt.all = np->rxopt.all;
- /* Clone pktoptions received with SYN */
newnp->pktoptions = NULL;
- if (treq->pktopts != NULL) {
- newnp->pktoptions = skb_clone(treq->pktopts, GFP_ATOMIC);
- kfree_skb(treq->pktopts);
- treq->pktopts = NULL;
- if (newnp->pktoptions)
- skb_set_owner_r(newnp->pktoptions, newsk);
- }
newnp->opt = NULL;
- newnp->mcast_oif = inet6_iif(skb);
- newnp->mcast_hops = skb->nh.ipv6h->hop_limit;
+ newnp->mcast_oif = tcp_v6_iif(skb);
+ newnp->mcast_hops = ipv6_hdr(skb)->hop_limit;
+ newnp->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(skb));
+ if (inet6_test_bit(REPFLOW, sk))
+ newnp->flow_label = ip6_flowlabel(ipv6_hdr(skb));
+
+ /* Set ToS of the new socket based upon the value of incoming SYN.
+ * ECT bits are set later in tcp_init_transfer().
+ */
+ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
+ newnp->tclass = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
/* Clone native IPv6 options from listening socket (if any)
@@ -1476,86 +1452,104 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
but we make one more one thing there: reattach optmem
to newsk.
*/
+ opt = ireq->ipv6_opt;
+ if (!opt)
+ opt = rcu_dereference(np->opt);
if (opt) {
- newnp->opt = ipv6_dup_options(newsk, opt);
- if (opt != np->opt)
- sock_kfree_s(sk, opt, opt->tot_len);
+ opt = ipv6_dup_options(newsk, opt);
+ RCU_INIT_POINTER(newnp->opt, opt);
}
-
inet_csk(newsk)->icsk_ext_hdr_len = 0;
- if (newnp->opt)
- inet_csk(newsk)->icsk_ext_hdr_len = (newnp->opt->opt_nflen +
- newnp->opt->opt_flen);
+ if (opt)
+ inet_csk(newsk)->icsk_ext_hdr_len = opt->opt_nflen +
+ opt->opt_flen;
+
+ tcp_ca_openreq_child(newsk, dst);
- tcp_mtup_init(newsk);
tcp_sync_mss(newsk, dst_mtu(dst));
- newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
- tcp_initialize_rcv_mss(newsk);
+ newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
- newinet->daddr = newinet->saddr = newinet->rcv_saddr = LOOPBACK4_IPV6;
+ tcp_initialize_rcv_mss(newsk);
#ifdef CONFIG_TCP_MD5SIG
- /* Copy over the MD5 key from the original socket */
- if ((key = tcp_v6_md5_do_lookup(sk, &newnp->daddr)) != NULL) {
- /* We're using one, so create a matching key
- * on the newsk structure. If we fail to get
- * memory, then we end up not copying the key
- * across. Shucks.
- */
- char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
- if (newkey != NULL)
- tcp_v6_md5_do_add(newsk, &inet6_sk(sk)->daddr,
- newkey, key->keylen);
+ l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
+
+ if (!tcp_rsk_used_ao(req)) {
+ /* Copy over the MD5 key from the original socket */
+ key = tcp_v6_md5_do_lookup(sk, &newsk->sk_v6_daddr, l3index);
+ if (key) {
+ const union tcp_md5_addr *addr;
+
+ addr = (union tcp_md5_addr *)&newsk->sk_v6_daddr;
+ if (tcp_md5_key_copy(newsk, addr, AF_INET6, 128, l3index, key))
+ goto put_and_exit;
+ }
}
#endif
+#ifdef CONFIG_TCP_AO
+ /* Copy over tcp_ao_info if any */
+ if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET6))
+ goto put_and_exit; /* OOM */
+#endif
- __inet6_hash(&tcp_hashinfo, newsk);
- inet_inherit_port(&tcp_hashinfo, sk, newsk);
+ if (__inet_inherit_port(sk, newsk) < 0)
+ goto put_and_exit;
+ *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
+ &found_dup_sk);
+ if (*own_req) {
+ tcp_move_syn(newtp, req);
+
+ /* Clone pktoptions received with SYN, if we own the req */
+ if (ireq->pktopts) {
+ newnp->pktoptions = skb_clone_and_charge_r(ireq->pktopts, newsk);
+ consume_skb(ireq->pktopts);
+ ireq->pktopts = NULL;
+ if (newnp->pktoptions)
+ tcp_v6_restore_cb(newnp->pktoptions);
+ }
+ } else {
+ if (!req_unhash && found_dup_sk) {
+ /* This code path should only be executed in the
+ * syncookie case only
+ */
+ bh_unlock_sock(newsk);
+ sock_put(newsk);
+ newsk = NULL;
+ }
+ }
return newsk;
-out_overflow:
- NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
-out:
- NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
- if (opt && opt != np->opt)
- sock_kfree_s(sk, opt, opt->tot_len);
+exit_overflow:
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
+exit_nonewsk:
dst_release(dst);
+exit:
+ tcp_listendrop(sk);
return NULL;
+put_and_exit:
+ inet_csk_prepare_forced_close(newsk);
+ tcp_done(newsk);
+ goto exit;
}
-static __sum16 tcp_v6_checksum_init(struct sk_buff *skb)
-{
- if (skb->ip_summed == CHECKSUM_COMPLETE) {
- if (!tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr,
- &skb->nh.ipv6h->daddr,skb->csum)) {
- skb->ip_summed = CHECKSUM_UNNECESSARY;
- return 0;
- }
- }
-
- skb->csum = ~csum_unfold(tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr,
- &skb->nh.ipv6h->daddr, 0));
-
- if (skb->len <= 76) {
- return __skb_checksum_complete(skb);
- }
- return 0;
-}
-
+INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
+ u32));
/* The socket must have it's spinlock held when we get
- * here.
+ * here, unless it is a TCP_LISTEN socket.
*
* We have a potential double-lock case here, so even when
* doing backlog processing we use the BH locking scheme.
* This is because we cannot sleep with the original spinlock
* held.
*/
-static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
+INDIRECT_CALLABLE_SCOPE
+int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
{
- struct ipv6_pinfo *np = inet6_sk(sk);
- struct tcp_sock *tp;
+ struct ipv6_pinfo *np = tcp_inet6_sk(sk);
struct sk_buff *opt_skb = NULL;
+ enum skb_drop_reason reason;
+ struct tcp_sock *tp;
/* Imagine: socket is IPv6. IPv4 packet arrives,
goes to IPv4 receive handler and backlogged.
@@ -1568,13 +1562,9 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
if (skb->protocol == htons(ETH_P_IP))
return tcp_v4_do_rcv(sk, skb);
-#ifdef CONFIG_TCP_MD5SIG
- if (tcp_v6_inbound_md5_hash (sk, skb))
- goto discard;
-#endif
-
- if (sk_filter(sk, skb))
- goto discard;
+ reason = psp_sk_rx_policy_check(sk, skb);
+ if (reason)
+ goto err_discard;
/*
* socket locking is here for SMP purposes as backlog rcv
@@ -1592,60 +1582,71 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
looks not very well thought. For now we latch
options, received in the last packet, enqueued
by tcp. Feel free to propose better solution.
- --ANK (980728)
+ --ANK (980728)
*/
- if (np->rxopt.all)
- opt_skb = skb_clone(skb, GFP_ATOMIC);
+ if (np->rxopt.all && sk->sk_state != TCP_LISTEN)
+ opt_skb = skb_clone_and_charge_r(skb, sk);
if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
- TCP_CHECK_TIMER(sk);
- if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
- goto reset;
- TCP_CHECK_TIMER(sk);
+ struct dst_entry *dst;
+
+ dst = rcu_dereference_protected(sk->sk_rx_dst,
+ lockdep_sock_is_held(sk));
+
+ sock_rps_save_rxhash(sk, skb);
+ sk_mark_napi_id(sk, skb);
+ if (dst) {
+ if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
+ INDIRECT_CALL_1(dst->ops->check, ip6_dst_check,
+ dst, sk->sk_rx_dst_cookie) == NULL) {
+ RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
+ dst_release(dst);
+ }
+ }
+
+ tcp_rcv_established(sk, skb);
if (opt_skb)
goto ipv6_pktoptions;
return 0;
}
- if (skb->len < (skb->h.th->doff<<2) || tcp_checksum_complete(skb))
+ if (tcp_checksum_complete(skb))
goto csum_err;
- if (sk->sk_state == TCP_LISTEN) {
- struct sock *nsk = tcp_v6_hnd_req(sk, skb);
- if (!nsk)
- goto discard;
+ if (sk->sk_state == TCP_LISTEN) {
+ struct sock *nsk = tcp_v6_cookie_check(sk, skb);
- /*
- * Queue it on the new socket if the new socket is active,
- * otherwise we just shortcircuit this and continue with
- * the new socket..
- */
- if(nsk != sk) {
- if (tcp_child_process(sk, nsk, skb))
- goto reset;
- if (opt_skb)
- __kfree_skb(opt_skb);
+ if (nsk != sk) {
+ if (nsk) {
+ reason = tcp_child_process(sk, nsk, skb);
+ if (reason)
+ goto reset;
+ }
return 0;
}
- }
+ } else
+ sock_rps_save_rxhash(sk, skb);
- TCP_CHECK_TIMER(sk);
- if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
+ reason = tcp_rcv_state_process(sk, skb);
+ if (reason)
goto reset;
- TCP_CHECK_TIMER(sk);
if (opt_skb)
goto ipv6_pktoptions;
return 0;
reset:
- tcp_v6_send_reset(sk, skb);
+ tcp_v6_send_reset(sk, skb, sk_rst_convert_drop_reason(reason));
discard:
if (opt_skb)
__kfree_skb(opt_skb);
- kfree_skb(skb);
+ sk_skb_reason_drop(sk, skb, reason);
return 0;
csum_err:
- TCP_INC_STATS_BH(TCP_MIB_INERRS);
+ reason = SKB_DROP_REASON_TCP_CSUM;
+ trace_tcp_bad_csum(skb);
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
+err_discard:
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
goto discard;
@@ -1661,11 +1662,16 @@ ipv6_pktoptions:
if (TCP_SKB_CB(opt_skb)->end_seq == tp->rcv_nxt &&
!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) {
if (np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo)
- np->mcast_oif = inet6_iif(opt_skb);
+ WRITE_ONCE(np->mcast_oif, tcp_v6_iif(opt_skb));
if (np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim)
- np->mcast_hops = opt_skb->nh.ipv6h->hop_limit;
- if (ipv6_opt_accepted(sk, opt_skb)) {
- skb_set_owner_r(opt_skb, sk);
+ WRITE_ONCE(np->mcast_hops,
+ ipv6_hdr(opt_skb)->hop_limit);
+ if (np->rxopt.bits.rxflow || np->rxopt.bits.rxtclass)
+ np->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(opt_skb));
+ if (inet6_test_bit(REPFLOW, sk))
+ np->flow_label = ip6_flowlabel(ipv6_hdr(opt_skb));
+ if (ipv6_opt_accepted(sk, opt_skb, &TCP_SKB_CB(opt_skb)->header.h6)) {
+ tcp_v6_restore_cb(opt_skb);
opt_skb = xchg(&np->pktoptions, opt_skb);
} else {
__kfree_skb(opt_skb);
@@ -1673,213 +1679,411 @@ ipv6_pktoptions:
}
}
- if (opt_skb)
- kfree_skb(opt_skb);
+ consume_skb(opt_skb);
return 0;
}
-static int tcp_v6_rcv(struct sk_buff **pskb)
+static void tcp_v6_fill_cb(struct sk_buff *skb, const struct ipv6hdr *hdr,
+ const struct tcphdr *th)
{
- struct sk_buff *skb = *pskb;
- struct tcphdr *th;
- struct sock *sk;
+ /* This is tricky: we move IP6CB at its correct location into
+ * TCP_SKB_CB(). It must be done after xfrm6_policy_check(), because
+ * _decode_session6() uses IP6CB().
+ * barrier() makes sure compiler won't play aliasing games.
+ */
+ memmove(&TCP_SKB_CB(skb)->header.h6, IP6CB(skb),
+ sizeof(struct inet6_skb_parm));
+ barrier();
+
+ TCP_SKB_CB(skb)->seq = ntohl(th->seq);
+ TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
+ skb->len - th->doff*4);
+ TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
+ TCP_SKB_CB(skb)->tcp_flags = tcp_flags_ntohs(th);
+ TCP_SKB_CB(skb)->ip_dsfield = ipv6_get_dsfield(hdr);
+ TCP_SKB_CB(skb)->sacked = 0;
+ TCP_SKB_CB(skb)->has_rxtstamp =
+ skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
+}
+
+INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb)
+{
+ struct net *net = dev_net_rcu(skb->dev);
+ enum skb_drop_reason drop_reason;
+ enum tcp_tw_status tw_status;
+ int sdif = inet6_sdif(skb);
+ int dif = inet6_iif(skb);
+ const struct tcphdr *th;
+ const struct ipv6hdr *hdr;
+ struct sock *sk = NULL;
+ bool refcounted;
int ret;
+ u32 isn;
+ drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
if (skb->pkt_type != PACKET_HOST)
goto discard_it;
/*
* Count it even if it's bad.
*/
- TCP_INC_STATS_BH(TCP_MIB_INSEGS);
+ __TCP_INC_STATS(net, TCP_MIB_INSEGS);
if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
goto discard_it;
- th = skb->h.th;
+ th = (const struct tcphdr *)skb->data;
- if (th->doff < sizeof(struct tcphdr)/4)
+ if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
+ drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
goto bad_packet;
+ }
if (!pskb_may_pull(skb, th->doff*4))
goto discard_it;
- if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
- tcp_v6_checksum_init(skb)))
- goto bad_packet;
-
- th = skb->h.th;
- TCP_SKB_CB(skb)->seq = ntohl(th->seq);
- TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
- skb->len - th->doff*4);
- TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
- TCP_SKB_CB(skb)->when = 0;
- TCP_SKB_CB(skb)->flags = ipv6_get_dsfield(skb->nh.ipv6h);
- TCP_SKB_CB(skb)->sacked = 0;
+ if (skb_checksum_init(skb, IPPROTO_TCP, ip6_compute_pseudo))
+ goto csum_error;
- sk = __inet6_lookup(&tcp_hashinfo, &skb->nh.ipv6h->saddr, th->source,
- &skb->nh.ipv6h->daddr, ntohs(th->dest),
- inet6_iif(skb));
+ th = (const struct tcphdr *)skb->data;
+ hdr = ipv6_hdr(skb);
+lookup:
+ sk = __inet6_lookup_skb(skb, __tcp_hdrlen(th),
+ th->source, th->dest, inet6_iif(skb), sdif,
+ &refcounted);
if (!sk)
goto no_tcp_socket;
-process:
if (sk->sk_state == TCP_TIME_WAIT)
goto do_time_wait;
- if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb))
+ if (sk->sk_state == TCP_NEW_SYN_RECV) {
+ struct request_sock *req = inet_reqsk(sk);
+ bool req_stolen = false;
+ struct sock *nsk;
+
+ sk = req->rsk_listener;
+ if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb))
+ drop_reason = SKB_DROP_REASON_XFRM_POLICY;
+ else
+ drop_reason = tcp_inbound_hash(sk, req, skb,
+ &hdr->saddr, &hdr->daddr,
+ AF_INET6, dif, sdif);
+ if (drop_reason) {
+ sk_drops_skbadd(sk, skb);
+ reqsk_put(req);
+ goto discard_it;
+ }
+ if (tcp_checksum_complete(skb)) {
+ reqsk_put(req);
+ goto csum_error;
+ }
+ if (unlikely(sk->sk_state != TCP_LISTEN)) {
+ nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
+ if (!nsk) {
+ inet_csk_reqsk_queue_drop_and_put(sk, req);
+ goto lookup;
+ }
+ sk = nsk;
+ /* reuseport_migrate_sock() has already held one sk_refcnt
+ * before returning.
+ */
+ } else {
+ sock_hold(sk);
+ }
+ refcounted = true;
+ nsk = NULL;
+ if (!tcp_filter(sk, skb, &drop_reason)) {
+ th = (const struct tcphdr *)skb->data;
+ hdr = ipv6_hdr(skb);
+ tcp_v6_fill_cb(skb, hdr, th);
+ nsk = tcp_check_req(sk, skb, req, false, &req_stolen,
+ &drop_reason);
+ }
+ if (!nsk) {
+ reqsk_put(req);
+ if (req_stolen) {
+ /* Another cpu got exclusive access to req
+ * and created a full blown socket.
+ * Try to feed this packet to this socket
+ * instead of discarding it.
+ */
+ tcp_v6_restore_cb(skb);
+ sock_put(sk);
+ goto lookup;
+ }
+ goto discard_and_relse;
+ }
+ nf_reset_ct(skb);
+ if (nsk == sk) {
+ reqsk_put(req);
+ tcp_v6_restore_cb(skb);
+ } else {
+ drop_reason = tcp_child_process(sk, nsk, skb);
+ if (drop_reason) {
+ enum sk_rst_reason rst_reason;
+
+ rst_reason = sk_rst_convert_drop_reason(drop_reason);
+ tcp_v6_send_reset(nsk, skb, rst_reason);
+ goto discard_and_relse;
+ }
+ sock_put(sk);
+ return 0;
+ }
+ }
+
+process:
+ if (static_branch_unlikely(&ip6_min_hopcount)) {
+ /* min_hopcount can be changed concurrently from do_ipv6_setsockopt() */
+ if (unlikely(hdr->hop_limit < READ_ONCE(tcp_inet6_sk(sk)->min_hopcount))) {
+ __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
+ drop_reason = SKB_DROP_REASON_TCP_MINTTL;
+ goto discard_and_relse;
+ }
+ }
+
+ if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) {
+ drop_reason = SKB_DROP_REASON_XFRM_POLICY;
goto discard_and_relse;
+ }
- if (sk_filter(sk, skb))
+ drop_reason = tcp_inbound_hash(sk, NULL, skb, &hdr->saddr, &hdr->daddr,
+ AF_INET6, dif, sdif);
+ if (drop_reason)
goto discard_and_relse;
+ nf_reset_ct(skb);
+
+ if (tcp_filter(sk, skb, &drop_reason))
+ goto discard_and_relse;
+
+ th = (const struct tcphdr *)skb->data;
+ hdr = ipv6_hdr(skb);
+ tcp_v6_fill_cb(skb, hdr, th);
+
skb->dev = NULL;
+ if (sk->sk_state == TCP_LISTEN) {
+ ret = tcp_v6_do_rcv(sk, skb);
+ goto put_and_return;
+ }
+
+ sk_incoming_cpu_update(sk);
+
bh_lock_sock_nested(sk);
+ tcp_segs_in(tcp_sk(sk), skb);
ret = 0;
if (!sock_owned_by_user(sk)) {
-#ifdef CONFIG_NET_DMA
- struct tcp_sock *tp = tcp_sk(sk);
- if (tp->ucopy.dma_chan)
- ret = tcp_v6_do_rcv(sk, skb);
- else
-#endif
- {
- if (!tcp_prequeue(sk, skb))
- ret = tcp_v6_do_rcv(sk, skb);
- }
- } else
- sk_add_backlog(sk, skb);
+ ret = tcp_v6_do_rcv(sk, skb);
+ } else {
+ if (tcp_add_backlog(sk, skb, &drop_reason))
+ goto discard_and_relse;
+ }
bh_unlock_sock(sk);
-
- sock_put(sk);
+put_and_return:
+ if (refcounted)
+ sock_put(sk);
return ret ? -1 : 0;
no_tcp_socket:
+ drop_reason = SKB_DROP_REASON_NO_SOCKET;
if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb))
goto discard_it;
- if (skb->len < (th->doff<<2) || tcp_checksum_complete(skb)) {
+ tcp_v6_fill_cb(skb, hdr, th);
+
+ if (tcp_checksum_complete(skb)) {
+csum_error:
+ drop_reason = SKB_DROP_REASON_TCP_CSUM;
+ trace_tcp_bad_csum(skb);
+ __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
bad_packet:
- TCP_INC_STATS_BH(TCP_MIB_INERRS);
+ __TCP_INC_STATS(net, TCP_MIB_INERRS);
} else {
- tcp_v6_send_reset(NULL, skb);
+ tcp_v6_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason));
}
discard_it:
-
- /*
- * Discard frame
- */
-
- kfree_skb(skb);
+ SKB_DR_OR(drop_reason, NOT_SPECIFIED);
+ sk_skb_reason_drop(sk, skb, drop_reason);
return 0;
discard_and_relse:
- sock_put(sk);
+ sk_drops_skbadd(sk, skb);
+ if (refcounted)
+ sock_put(sk);
goto discard_it;
do_time_wait:
if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+ drop_reason = SKB_DROP_REASON_XFRM_POLICY;
inet_twsk_put(inet_twsk(sk));
goto discard_it;
}
- if (skb->len < (th->doff<<2) || tcp_checksum_complete(skb)) {
- TCP_INC_STATS_BH(TCP_MIB_INERRS);
+ tcp_v6_fill_cb(skb, hdr, th);
+
+ if (tcp_checksum_complete(skb)) {
inet_twsk_put(inet_twsk(sk));
- goto discard_it;
+ goto csum_error;
}
- switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
+ tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn,
+ &drop_reason);
+ switch (tw_status) {
case TCP_TW_SYN:
{
struct sock *sk2;
- sk2 = inet6_lookup_listener(&tcp_hashinfo,
- &skb->nh.ipv6h->daddr,
- ntohs(th->dest), inet6_iif(skb));
- if (sk2 != NULL) {
+ sk2 = inet6_lookup_listener(net, skb, __tcp_hdrlen(th),
+ &ipv6_hdr(skb)->saddr, th->source,
+ &ipv6_hdr(skb)->daddr,
+ ntohs(th->dest),
+ tcp_v6_iif_l3_slave(skb),
+ sdif);
+ if (sk2) {
struct inet_timewait_sock *tw = inet_twsk(sk);
- inet_twsk_deschedule(tw, &tcp_death_row);
- inet_twsk_put(tw);
+ inet_twsk_deschedule_put(tw);
sk = sk2;
+ tcp_v6_restore_cb(skb);
+ refcounted = false;
+ __this_cpu_write(tcp_tw_isn, isn);
goto process;
}
- /* Fall through to ACK */
+
+ drop_reason = psp_twsk_rx_policy_check(inet_twsk(sk), skb);
+ if (drop_reason)
+ break;
}
+ /* to ACK */
+ fallthrough;
case TCP_TW_ACK:
- tcp_v6_timewait_ack(sk, skb);
+ case TCP_TW_ACK_OOW:
+ tcp_v6_timewait_ack(sk, skb, tw_status);
break;
case TCP_TW_RST:
- goto no_tcp_socket;
- case TCP_TW_SUCCESS:;
+ tcp_v6_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET);
+ inet_twsk_deschedule_put(inet_twsk(sk));
+ goto discard_it;
+ case TCP_TW_SUCCESS:
+ ;
}
goto discard_it;
}
-static int tcp_v6_remember_stamp(struct sock *sk)
+void tcp_v6_early_demux(struct sk_buff *skb)
{
- /* Alas, not yet... */
- return 0;
+ struct net *net = dev_net_rcu(skb->dev);
+ const struct ipv6hdr *hdr;
+ const struct tcphdr *th;
+ struct sock *sk;
+
+ if (skb->pkt_type != PACKET_HOST)
+ return;
+
+ if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
+ return;
+
+ hdr = ipv6_hdr(skb);
+ th = tcp_hdr(skb);
+
+ if (th->doff < sizeof(struct tcphdr) / 4)
+ return;
+
+ /* Note : We use inet6_iif() here, not tcp_v6_iif() */
+ sk = __inet6_lookup_established(net, &hdr->saddr, th->source,
+ &hdr->daddr, ntohs(th->dest),
+ inet6_iif(skb), inet6_sdif(skb));
+ if (sk) {
+ skb->sk = sk;
+ skb->destructor = sock_edemux;
+ if (sk_fullsock(sk)) {
+ struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
+
+ if (dst)
+ dst = dst_check(dst, sk->sk_rx_dst_cookie);
+ if (dst &&
+ sk->sk_rx_dst_ifindex == skb->skb_iif)
+ skb_dst_set_noref(skb, dst);
+ }
+ }
}
-static struct inet_connection_sock_af_ops ipv6_specific = {
+static struct timewait_sock_ops tcp6_timewait_sock_ops = {
+ .twsk_obj_size = sizeof(struct tcp6_timewait_sock),
+};
+
+INDIRECT_CALLABLE_SCOPE void tcp_v6_send_check(struct sock *sk, struct sk_buff *skb)
+{
+ __tcp_v6_send_check(skb, &sk->sk_v6_rcv_saddr, &sk->sk_v6_daddr);
+}
+
+const struct inet_connection_sock_af_ops ipv6_specific = {
.queue_xmit = inet6_csk_xmit,
.send_check = tcp_v6_send_check,
.rebuild_header = inet6_sk_rebuild_header,
+ .sk_rx_dst_set = inet6_sk_rx_dst_set,
.conn_request = tcp_v6_conn_request,
.syn_recv_sock = tcp_v6_syn_recv_sock,
- .remember_stamp = tcp_v6_remember_stamp,
.net_header_len = sizeof(struct ipv6hdr),
.setsockopt = ipv6_setsockopt,
.getsockopt = ipv6_getsockopt,
- .addr2sockaddr = inet6_csk_addr2sockaddr,
- .sockaddr_len = sizeof(struct sockaddr_in6),
-#ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_ipv6_setsockopt,
- .compat_getsockopt = compat_ipv6_getsockopt,
-#endif
+ .mtu_reduced = tcp_v6_mtu_reduced,
};
+#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
+static const struct tcp_sock_af_ops tcp_sock_ipv6_specific = {
#ifdef CONFIG_TCP_MD5SIG
-static struct tcp_sock_af_ops tcp_sock_ipv6_specific = {
.md5_lookup = tcp_v6_md5_lookup,
- .calc_md5_hash = tcp_v6_calc_md5_hash,
- .md5_add = tcp_v6_md5_add_func,
+ .calc_md5_hash = tcp_v6_md5_hash_skb,
.md5_parse = tcp_v6_parse_md5_keys,
+#endif
+#ifdef CONFIG_TCP_AO
+ .ao_lookup = tcp_v6_ao_lookup,
+ .calc_ao_hash = tcp_v6_ao_hash_skb,
+ .ao_parse = tcp_v6_parse_ao,
+ .ao_calc_key_sk = tcp_v6_ao_calc_key_sk,
+#endif
};
#endif
/*
* TCP over IPv4 via INET6 API
*/
-
-static struct inet_connection_sock_af_ops ipv6_mapped = {
+static const struct inet_connection_sock_af_ops ipv6_mapped = {
.queue_xmit = ip_queue_xmit,
.send_check = tcp_v4_send_check,
.rebuild_header = inet_sk_rebuild_header,
+ .sk_rx_dst_set = inet_sk_rx_dst_set,
.conn_request = tcp_v6_conn_request,
.syn_recv_sock = tcp_v6_syn_recv_sock,
- .remember_stamp = tcp_v4_remember_stamp,
.net_header_len = sizeof(struct iphdr),
.setsockopt = ipv6_setsockopt,
.getsockopt = ipv6_getsockopt,
- .addr2sockaddr = inet6_csk_addr2sockaddr,
- .sockaddr_len = sizeof(struct sockaddr_in6),
-#ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_ipv6_setsockopt,
- .compat_getsockopt = compat_ipv6_getsockopt,
-#endif
+ .mtu_reduced = tcp_v4_mtu_reduced,
};
+#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
+static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific = {
#ifdef CONFIG_TCP_MD5SIG
-static struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific = {
.md5_lookup = tcp_v4_md5_lookup,
- .calc_md5_hash = tcp_v4_calc_md5_hash,
- .md5_add = tcp_v6_md5_add_func,
+ .calc_md5_hash = tcp_v4_md5_hash_skb,
.md5_parse = tcp_v6_parse_md5_keys,
+#endif
+#ifdef CONFIG_TCP_AO
+ .ao_lookup = tcp_v6_ao_lookup,
+ .calc_ao_hash = tcp_v4_ao_hash_skb,
+ .ao_parse = tcp_v6_parse_ao,
+ .ao_calc_key_sk = tcp_v4_ao_calc_key_sk,
+#endif
};
+
+static void tcp6_destruct_sock(struct sock *sk)
+{
+ tcp_md5_destruct_sock(sk);
+ tcp_ao_destroy_sock(sk, false);
+ inet6_sock_destruct(sk);
+}
#endif
/* NOTE: A lot of things set to zero explicitly by call to
@@ -1888,182 +2092,156 @@ static struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific = {
static int tcp_v6_init_sock(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
- struct tcp_sock *tp = tcp_sk(sk);
- skb_queue_head_init(&tp->out_of_order_queue);
- tcp_init_xmit_timers(sk);
- tcp_prequeue_init(tp);
-
- icsk->icsk_rto = TCP_TIMEOUT_INIT;
- tp->mdev = TCP_TIMEOUT_INIT;
-
- /* So many TCP implementations out there (incorrectly) count the
- * initial SYN frame in their delayed-ACK and congestion control
- * algorithms that we must have the following bandaid to talk
- * efficiently to them. -DaveM
- */
- tp->snd_cwnd = 2;
-
- /* See draft-stevens-tcpca-spec-01 for discussion of the
- * initialization of these values.
- */
- tp->snd_ssthresh = 0x7fffffff;
- tp->snd_cwnd_clamp = ~0;
- tp->mss_cache = 536;
-
- tp->reordering = sysctl_tcp_reordering;
-
- sk->sk_state = TCP_CLOSE;
+ tcp_init_sock(sk);
icsk->icsk_af_ops = &ipv6_specific;
- icsk->icsk_ca_ops = &tcp_init_congestion_ops;
- icsk->icsk_sync_mss = tcp_sync_mss;
- sk->sk_write_space = sk_stream_write_space;
- sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
-#ifdef CONFIG_TCP_MD5SIG
- tp->af_specific = &tcp_sock_ipv6_specific;
+#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
+ tcp_sk(sk)->af_specific = &tcp_sock_ipv6_specific;
+ sk->sk_destruct = tcp6_destruct_sock;
#endif
- sk->sk_sndbuf = sysctl_tcp_wmem[1];
- sk->sk_rcvbuf = sysctl_tcp_rmem[1];
-
- atomic_inc(&tcp_sockets_allocated);
-
return 0;
}
-static int tcp_v6_destroy_sock(struct sock *sk)
-{
-#ifdef CONFIG_TCP_MD5SIG
- /* Clean up the MD5 key list */
- if (tcp_sk(sk)->md5sig_info)
- tcp_v6_clear_md5_list(sk);
-#endif
- tcp_v4_destroy_sock(sk);
- return inet6_destroy_sock(sk);
-}
-
+#ifdef CONFIG_PROC_FS
/* Proc filesystem TCPv6 sock list dumping. */
-static void get_openreq6(struct seq_file *seq,
- struct sock *sk, struct request_sock *req, int i, int uid)
+static void get_openreq6(struct seq_file *seq,
+ const struct request_sock *req, int i)
{
- int ttd = req->expires - jiffies;
- struct in6_addr *src = &inet6_rsk(req)->loc_addr;
- struct in6_addr *dest = &inet6_rsk(req)->rmt_addr;
+ long ttd = req->rsk_timer.expires - jiffies;
+ const struct in6_addr *src = &inet_rsk(req)->ir_v6_loc_addr;
+ const struct in6_addr *dest = &inet_rsk(req)->ir_v6_rmt_addr;
if (ttd < 0)
ttd = 0;
seq_printf(seq,
"%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X "
- "%02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p\n",
+ "%02X %08X:%08X %02X:%08lX %08X %5u %8d %d %d %pK\n",
i,
src->s6_addr32[0], src->s6_addr32[1],
src->s6_addr32[2], src->s6_addr32[3],
- ntohs(inet_sk(sk)->sport),
+ inet_rsk(req)->ir_num,
dest->s6_addr32[0], dest->s6_addr32[1],
dest->s6_addr32[2], dest->s6_addr32[3],
- ntohs(inet_rsk(req)->rmt_port),
+ ntohs(inet_rsk(req)->ir_rmt_port),
TCP_SYN_RECV,
- 0,0, /* could print option size, but that is af dependent. */
- 1, /* timers active (only the expire timer) */
- jiffies_to_clock_t(ttd),
- req->retrans,
- uid,
- 0, /* non standard timer */
+ 0, 0, /* could print option size, but that is af dependent. */
+ 1, /* timers active (only the expire timer) */
+ jiffies_to_clock_t(ttd),
+ req->num_timeout,
+ from_kuid_munged(seq_user_ns(seq),
+ sk_uid(req->rsk_listener)),
+ 0, /* non standard timer */
0, /* open_requests have no inode */
0, req);
}
static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i)
{
- struct in6_addr *dest, *src;
+ const struct in6_addr *dest, *src;
__u16 destp, srcp;
int timer_active;
unsigned long timer_expires;
- struct inet_sock *inet = inet_sk(sp);
- struct tcp_sock *tp = tcp_sk(sp);
+ const struct inet_sock *inet = inet_sk(sp);
+ const struct tcp_sock *tp = tcp_sk(sp);
const struct inet_connection_sock *icsk = inet_csk(sp);
- struct ipv6_pinfo *np = inet6_sk(sp);
-
- dest = &np->daddr;
- src = &np->rcv_saddr;
- destp = ntohs(inet->dport);
- srcp = ntohs(inet->sport);
-
- if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
+ const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
+ u8 icsk_pending;
+ int rx_queue;
+ int state;
+
+ dest = &sp->sk_v6_daddr;
+ src = &sp->sk_v6_rcv_saddr;
+ destp = ntohs(inet->inet_dport);
+ srcp = ntohs(inet->inet_sport);
+
+ icsk_pending = smp_load_acquire(&icsk->icsk_pending);
+ if (icsk_pending == ICSK_TIME_RETRANS ||
+ icsk_pending == ICSK_TIME_REO_TIMEOUT ||
+ icsk_pending == ICSK_TIME_LOSS_PROBE) {
timer_active = 1;
- timer_expires = icsk->icsk_timeout;
- } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
+ timer_expires = tcp_timeout_expires(sp);
+ } else if (icsk_pending == ICSK_TIME_PROBE0) {
timer_active = 4;
- timer_expires = icsk->icsk_timeout;
- } else if (timer_pending(&sp->sk_timer)) {
+ timer_expires = tcp_timeout_expires(sp);
+ } else if (timer_pending(&icsk->icsk_keepalive_timer)) {
timer_active = 2;
- timer_expires = sp->sk_timer.expires;
+ timer_expires = icsk->icsk_keepalive_timer.expires;
} else {
timer_active = 0;
timer_expires = jiffies;
}
+ state = inet_sk_state_load(sp);
+ if (state == TCP_LISTEN)
+ rx_queue = READ_ONCE(sp->sk_ack_backlog);
+ else
+ /* Because we don't lock the socket,
+ * we might find a transient negative value.
+ */
+ rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
+ READ_ONCE(tp->copied_seq), 0);
+
seq_printf(seq,
"%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X "
- "%02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %u %u %u %u %d\n",
+ "%02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %lu %lu %u %u %d\n",
i,
src->s6_addr32[0], src->s6_addr32[1],
src->s6_addr32[2], src->s6_addr32[3], srcp,
dest->s6_addr32[0], dest->s6_addr32[1],
dest->s6_addr32[2], dest->s6_addr32[3], destp,
- sp->sk_state,
- tp->write_seq-tp->snd_una,
- (sp->sk_state == TCP_LISTEN) ? sp->sk_ack_backlog : (tp->rcv_nxt - tp->copied_seq),
+ state,
+ READ_ONCE(tp->write_seq) - tp->snd_una,
+ rx_queue,
timer_active,
- jiffies_to_clock_t(timer_expires - jiffies),
- icsk->icsk_retransmits,
- sock_i_uid(sp),
- icsk->icsk_probes_out,
+ jiffies_delta_to_clock_t(timer_expires - jiffies),
+ READ_ONCE(icsk->icsk_retransmits),
+ from_kuid_munged(seq_user_ns(seq), sk_uid(sp)),
+ READ_ONCE(icsk->icsk_probes_out),
sock_i_ino(sp),
- atomic_read(&sp->sk_refcnt), sp,
- icsk->icsk_rto,
- icsk->icsk_ack.ato,
- (icsk->icsk_ack.quick << 1 ) | icsk->icsk_ack.pingpong,
- tp->snd_cwnd, tp->snd_ssthresh>=0xFFFF?-1:tp->snd_ssthresh
+ refcount_read(&sp->sk_refcnt), sp,
+ jiffies_to_clock_t(icsk->icsk_rto),
+ jiffies_to_clock_t(icsk->icsk_ack.ato),
+ (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sp),
+ tcp_snd_cwnd(tp),
+ state == TCP_LISTEN ?
+ fastopenq->max_qlen :
+ (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh)
);
}
-static void get_timewait6_sock(struct seq_file *seq,
+static void get_timewait6_sock(struct seq_file *seq,
struct inet_timewait_sock *tw, int i)
{
- struct in6_addr *dest, *src;
+ long delta = tw->tw_timer.expires - jiffies;
+ const struct in6_addr *dest, *src;
__u16 destp, srcp;
- struct inet6_timewait_sock *tw6 = inet6_twsk((struct sock *)tw);
- int ttd = tw->tw_ttd - jiffies;
-
- if (ttd < 0)
- ttd = 0;
- dest = &tw6->tw_v6_daddr;
- src = &tw6->tw_v6_rcv_saddr;
+ dest = &tw->tw_v6_daddr;
+ src = &tw->tw_v6_rcv_saddr;
destp = ntohs(tw->tw_dport);
srcp = ntohs(tw->tw_sport);
seq_printf(seq,
"%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X "
- "%02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p\n",
+ "%02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK\n",
i,
src->s6_addr32[0], src->s6_addr32[1],
src->s6_addr32[2], src->s6_addr32[3], srcp,
dest->s6_addr32[0], dest->s6_addr32[1],
dest->s6_addr32[2], dest->s6_addr32[3], destp,
- tw->tw_substate, 0, 0,
- 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
- atomic_read(&tw->tw_refcnt), tw);
+ READ_ONCE(tw->tw_substate), 0, 0,
+ 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
+ refcount_read(&tw->tw_refcnt), tw);
}
-#ifdef CONFIG_PROC_FS
static int tcp6_seq_show(struct seq_file *seq, void *v)
{
struct tcp_iter_state *st;
+ struct sock *sk = v;
if (v == SEQ_START_TOKEN) {
seq_puts(seq,
@@ -2076,39 +2254,38 @@ static int tcp6_seq_show(struct seq_file *seq, void *v)
}
st = seq->private;
- switch (st->state) {
- case TCP_SEQ_STATE_LISTENING:
- case TCP_SEQ_STATE_ESTABLISHED:
- get_tcp6_sock(seq, v, st->num);
- break;
- case TCP_SEQ_STATE_OPENREQ:
- get_openreq6(seq, st->syn_wait_sk, v, st->num, st->uid);
- break;
- case TCP_SEQ_STATE_TIME_WAIT:
+ if (sk->sk_state == TCP_TIME_WAIT)
get_timewait6_sock(seq, v, st->num);
- break;
- }
+ else if (sk->sk_state == TCP_NEW_SYN_RECV)
+ get_openreq6(seq, v, st->num);
+ else
+ get_tcp6_sock(seq, v, st->num);
out:
return 0;
}
-static struct file_operations tcp6_seq_fops;
+static const struct seq_operations tcp6_seq_ops = {
+ .show = tcp6_seq_show,
+ .start = tcp_seq_start,
+ .next = tcp_seq_next,
+ .stop = tcp_seq_stop,
+};
+
static struct tcp_seq_afinfo tcp6_seq_afinfo = {
- .owner = THIS_MODULE,
- .name = "tcp6",
.family = AF_INET6,
- .seq_show = tcp6_seq_show,
- .seq_fops = &tcp6_seq_fops,
};
-int __init tcp6_proc_init(void)
+int __net_init tcp6_proc_init(struct net *net)
{
- return tcp_proc_register(&tcp6_seq_afinfo);
+ if (!proc_create_net_data("tcp6", 0444, net->proc_net, &tcp6_seq_ops,
+ sizeof(struct tcp_iter_state), &tcp6_seq_afinfo))
+ return -ENOMEM;
+ return 0;
}
-void tcp6_proc_exit(void)
+void tcp6_proc_exit(struct net *net)
{
- tcp_proc_unregister(&tcp6_seq_afinfo);
+ remove_proc_entry("tcp6", net->proc_net);
}
#endif
@@ -2116,66 +2293,127 @@ struct proto tcpv6_prot = {
.name = "TCPv6",
.owner = THIS_MODULE,
.close = tcp_close,
+ .pre_connect = tcp_v6_pre_connect,
.connect = tcp_v6_connect,
.disconnect = tcp_disconnect,
.accept = inet_csk_accept,
.ioctl = tcp_ioctl,
.init = tcp_v6_init_sock,
- .destroy = tcp_v6_destroy_sock,
+ .destroy = tcp_v4_destroy_sock,
.shutdown = tcp_shutdown,
.setsockopt = tcp_setsockopt,
.getsockopt = tcp_getsockopt,
- .sendmsg = tcp_sendmsg,
+ .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt,
+ .keepalive = tcp_set_keepalive,
.recvmsg = tcp_recvmsg,
+ .sendmsg = tcp_sendmsg,
+ .splice_eof = tcp_splice_eof,
.backlog_rcv = tcp_v6_do_rcv,
- .hash = tcp_v6_hash,
- .unhash = tcp_unhash,
- .get_port = tcp_v6_get_port,
+ .release_cb = tcp_release_cb,
+ .hash = inet_hash,
+ .unhash = inet_unhash,
+ .get_port = inet_csk_get_port,
+ .put_port = inet_put_port,
+#ifdef CONFIG_BPF_SYSCALL
+ .psock_update_sk_prot = tcp_bpf_update_proto,
+#endif
.enter_memory_pressure = tcp_enter_memory_pressure,
+ .leave_memory_pressure = tcp_leave_memory_pressure,
+ .stream_memory_free = tcp_stream_memory_free,
.sockets_allocated = &tcp_sockets_allocated,
- .memory_allocated = &tcp_memory_allocated,
+
+ .memory_allocated = &net_aligned_data.tcp_memory_allocated,
+ .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc,
+
.memory_pressure = &tcp_memory_pressure,
- .orphan_count = &tcp_orphan_count,
.sysctl_mem = sysctl_tcp_mem,
- .sysctl_wmem = sysctl_tcp_wmem,
- .sysctl_rmem = sysctl_tcp_rmem,
+ .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
+ .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
.max_header = MAX_TCP_HEADER,
.obj_size = sizeof(struct tcp6_sock),
+ .ipv6_pinfo_offset = offsetof(struct tcp6_sock, inet6),
+ .slab_flags = SLAB_TYPESAFE_BY_RCU,
.twsk_prot = &tcp6_timewait_sock_ops,
.rsk_prot = &tcp6_request_sock_ops,
-#ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_tcp_setsockopt,
- .compat_getsockopt = compat_tcp_getsockopt,
-#endif
+ .h.hashinfo = NULL,
+ .no_autobind = true,
+ .diag_destroy = tcp_abort,
};
+EXPORT_SYMBOL_GPL(tcpv6_prot);
-static struct inet6_protocol tcpv6_protocol = {
- .handler = tcp_v6_rcv,
- .err_handler = tcp_v6_err,
- .gso_send_check = tcp_v6_gso_send_check,
- .gso_segment = tcp_tso_segment,
- .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
-};
static struct inet_protosw tcpv6_protosw = {
.type = SOCK_STREAM,
.protocol = IPPROTO_TCP,
.prot = &tcpv6_prot,
.ops = &inet6_stream_ops,
- .capability = -1,
- .no_check = 0,
.flags = INET_PROTOSW_PERMANENT |
INET_PROTOSW_ICSK,
};
-void __init tcpv6_init(void)
+static int __net_init tcpv6_net_init(struct net *net)
{
+ int res;
+
+ res = inet_ctl_sock_create(&net->ipv6.tcp_sk, PF_INET6,
+ SOCK_RAW, IPPROTO_TCP, net);
+ if (!res)
+ net->ipv6.tcp_sk->sk_clockid = CLOCK_MONOTONIC;
+
+ return res;
+}
+
+static void __net_exit tcpv6_net_exit(struct net *net)
+{
+ inet_ctl_sock_destroy(net->ipv6.tcp_sk);
+}
+
+static struct pernet_operations tcpv6_net_ops = {
+ .init = tcpv6_net_init,
+ .exit = tcpv6_net_exit,
+};
+
+int __init tcpv6_init(void)
+{
+ int ret;
+
+ net_hotdata.tcpv6_protocol = (struct inet6_protocol) {
+ .handler = tcp_v6_rcv,
+ .err_handler = tcp_v6_err,
+ .flags = INET6_PROTO_NOPOLICY | INET6_PROTO_FINAL,
+ };
+ ret = inet6_add_protocol(&net_hotdata.tcpv6_protocol, IPPROTO_TCP);
+ if (ret)
+ goto out;
+
/* register inet6 protocol */
- if (inet6_add_protocol(&tcpv6_protocol, IPPROTO_TCP) < 0)
- printk(KERN_ERR "tcpv6_init: Could not register protocol\n");
- inet6_register_protosw(&tcpv6_protosw);
+ ret = inet6_register_protosw(&tcpv6_protosw);
+ if (ret)
+ goto out_tcpv6_protocol;
+
+ ret = register_pernet_subsys(&tcpv6_net_ops);
+ if (ret)
+ goto out_tcpv6_protosw;
- if (inet_csk_ctl_sock_create(&tcp6_socket, PF_INET6, SOCK_RAW,
- IPPROTO_TCP) < 0)
- panic("Failed to create the TCPv6 control socket.\n");
+ ret = mptcpv6_init();
+ if (ret)
+ goto out_tcpv6_pernet_subsys;
+
+out:
+ return ret;
+
+out_tcpv6_pernet_subsys:
+ unregister_pernet_subsys(&tcpv6_net_ops);
+out_tcpv6_protosw:
+ inet6_unregister_protosw(&tcpv6_protosw);
+out_tcpv6_protocol:
+ inet6_del_protocol(&net_hotdata.tcpv6_protocol, IPPROTO_TCP);
+ goto out;
+}
+
+void tcpv6_exit(void)
+{
+ unregister_pernet_subsys(&tcpv6_net_ops);
+ inet6_unregister_protosw(&tcpv6_protosw);
+ inet6_del_protocol(&net_hotdata.tcpv6_protocol, IPPROTO_TCP);
}
diff --git a/net/ipv6/tcpv6_offload.c b/net/ipv6/tcpv6_offload.c
new file mode 100644
index 000000000000..effeba58630b
--- /dev/null
+++ b/net/ipv6/tcpv6_offload.c
@@ -0,0 +1,205 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * IPV6 GSO/GRO offload support
+ * Linux INET6 implementation
+ *
+ * TCPv6 GSO/GRO support
+ */
+#include <linux/indirect_call_wrapper.h>
+#include <linux/skbuff.h>
+#include <net/inet6_hashtables.h>
+#include <net/gro.h>
+#include <net/protocol.h>
+#include <net/tcp.h>
+#include <net/ip6_checksum.h>
+#include "ip6_offload.h"
+
+static void tcp6_check_fraglist_gro(struct list_head *head, struct sk_buff *skb,
+ struct tcphdr *th)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ const struct ipv6hdr *hdr;
+ struct sk_buff *p;
+ struct sock *sk;
+ struct net *net;
+ int iif, sdif;
+
+ if (likely(!(skb->dev->features & NETIF_F_GRO_FRAGLIST)))
+ return;
+
+ p = tcp_gro_lookup(head, th);
+ if (p) {
+ NAPI_GRO_CB(skb)->is_flist = NAPI_GRO_CB(p)->is_flist;
+ return;
+ }
+
+ inet6_get_iif_sdif(skb, &iif, &sdif);
+ hdr = skb_gro_network_header(skb);
+ net = dev_net_rcu(skb->dev);
+ sk = __inet6_lookup_established(net, &hdr->saddr, th->source,
+ &hdr->daddr, ntohs(th->dest),
+ iif, sdif);
+ NAPI_GRO_CB(skb)->is_flist = !sk;
+ if (sk)
+ sock_gen_put(sk);
+#endif /* IS_ENABLED(CONFIG_IPV6) */
+}
+
+INDIRECT_CALLABLE_SCOPE
+struct sk_buff *tcp6_gro_receive(struct list_head *head, struct sk_buff *skb)
+{
+ struct tcphdr *th;
+
+ /* Don't bother verifying checksum if we're going to flush anyway. */
+ if (!NAPI_GRO_CB(skb)->flush &&
+ skb_gro_checksum_validate(skb, IPPROTO_TCP,
+ ip6_gro_compute_pseudo))
+ goto flush;
+
+ th = tcp_gro_pull_header(skb);
+ if (!th)
+ goto flush;
+
+ tcp6_check_fraglist_gro(head, skb, th);
+
+ return tcp_gro_receive(head, skb, th);
+
+flush:
+ NAPI_GRO_CB(skb)->flush = 1;
+ return NULL;
+}
+
+INDIRECT_CALLABLE_SCOPE int tcp6_gro_complete(struct sk_buff *skb, int thoff)
+{
+ const u16 offset = NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation];
+ const struct ipv6hdr *iph = (struct ipv6hdr *)(skb->data + offset);
+ struct tcphdr *th = tcp_hdr(skb);
+
+ if (unlikely(NAPI_GRO_CB(skb)->is_flist)) {
+ skb_shinfo(skb)->gso_type |= SKB_GSO_FRAGLIST | SKB_GSO_TCPV6;
+ skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;
+
+ __skb_incr_checksum_unnecessary(skb);
+
+ return 0;
+ }
+
+ th->check = ~tcp_v6_check(skb->len - thoff, &iph->saddr,
+ &iph->daddr, 0);
+ skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV6;
+
+ tcp_gro_complete(skb);
+ return 0;
+}
+
+static void __tcpv6_gso_segment_csum(struct sk_buff *seg,
+ struct in6_addr *oldip,
+ const struct in6_addr *newip,
+ __be16 *oldport, __be16 newport)
+{
+ struct tcphdr *th = tcp_hdr(seg);
+
+ if (!ipv6_addr_equal(oldip, newip)) {
+ inet_proto_csum_replace16(&th->check, seg,
+ oldip->s6_addr32,
+ newip->s6_addr32,
+ true);
+ *oldip = *newip;
+ }
+
+ if (*oldport == newport)
+ return;
+
+ inet_proto_csum_replace2(&th->check, seg, *oldport, newport, false);
+ *oldport = newport;
+}
+
+static struct sk_buff *__tcpv6_gso_segment_list_csum(struct sk_buff *segs)
+{
+ const struct tcphdr *th;
+ const struct ipv6hdr *iph;
+ struct sk_buff *seg;
+ struct tcphdr *th2;
+ struct ipv6hdr *iph2;
+
+ seg = segs;
+ th = tcp_hdr(seg);
+ iph = ipv6_hdr(seg);
+ th2 = tcp_hdr(seg->next);
+ iph2 = ipv6_hdr(seg->next);
+
+ if (!(*(const u32 *)&th->source ^ *(const u32 *)&th2->source) &&
+ ipv6_addr_equal(&iph->saddr, &iph2->saddr) &&
+ ipv6_addr_equal(&iph->daddr, &iph2->daddr))
+ return segs;
+
+ while ((seg = seg->next)) {
+ th2 = tcp_hdr(seg);
+ iph2 = ipv6_hdr(seg);
+
+ __tcpv6_gso_segment_csum(seg, &iph2->saddr, &iph->saddr,
+ &th2->source, th->source);
+ __tcpv6_gso_segment_csum(seg, &iph2->daddr, &iph->daddr,
+ &th2->dest, th->dest);
+ }
+
+ return segs;
+}
+
+static struct sk_buff *__tcp6_gso_segment_list(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ skb = skb_segment_list(skb, features, skb_mac_header_len(skb));
+ if (IS_ERR(skb))
+ return skb;
+
+ return __tcpv6_gso_segment_list_csum(skb);
+}
+
+static struct sk_buff *tcp6_gso_segment(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ struct tcphdr *th;
+
+ if (!(skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6))
+ return ERR_PTR(-EINVAL);
+
+ if (!pskb_may_pull(skb, sizeof(*th)))
+ return ERR_PTR(-EINVAL);
+
+ if (skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST) {
+ struct tcphdr *th = tcp_hdr(skb);
+
+ if (skb_pagelen(skb) - th->doff * 4 == skb_shinfo(skb)->gso_size)
+ return __tcp6_gso_segment_list(skb, features);
+
+ skb->ip_summed = CHECKSUM_NONE;
+ }
+
+ if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
+ const struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+ struct tcphdr *th = tcp_hdr(skb);
+
+ /* Set up pseudo header, usually expect stack to have done
+ * this.
+ */
+
+ th->check = 0;
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ __tcp_v6_send_check(skb, &ipv6h->saddr, &ipv6h->daddr);
+ }
+
+ return tcp_gso_segment(skb, features);
+}
+
+int __init tcpv6_offload_init(void)
+{
+ net_hotdata.tcpv6_offload = (struct net_offload) {
+ .callbacks = {
+ .gso_segment = tcp6_gso_segment,
+ .gro_receive = tcp6_gro_receive,
+ .gro_complete = tcp6_gro_complete,
+ },
+ };
+ return inet6_add_offload(&net_hotdata.tcpv6_offload, IPPROTO_TCP);
+}
diff --git a/net/ipv6/tunnel6.c b/net/ipv6/tunnel6.c
index 918d07dd1219..dc4ea9b11794 100644
--- a/net/ipv6/tunnel6.c
+++ b/net/ipv6/tunnel6.c
@@ -1,54 +1,68 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright (C)2003,2004 USAGI/WIDE Project
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
* Authors Mitsuru KANDA <mk@linux-ipv6.org>
- * YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
+ * YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
*/
+#define pr_fmt(fmt) "IPv6: " fmt
+
#include <linux/icmpv6.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
+#include <linux/slab.h>
#include <net/ipv6.h>
#include <net/protocol.h>
#include <net/xfrm.h>
-static struct xfrm6_tunnel *tunnel6_handlers;
+static struct xfrm6_tunnel __rcu *tunnel6_handlers __read_mostly;
+static struct xfrm6_tunnel __rcu *tunnel46_handlers __read_mostly;
+static struct xfrm6_tunnel __rcu *tunnelmpls6_handlers __read_mostly;
static DEFINE_MUTEX(tunnel6_mutex);
-int xfrm6_tunnel_register(struct xfrm6_tunnel *handler)
+static inline int xfrm6_tunnel_mpls_supported(void)
{
- struct xfrm6_tunnel **pprev;
+ return IS_ENABLED(CONFIG_MPLS);
+}
+
+int xfrm6_tunnel_register(struct xfrm6_tunnel *handler, unsigned short family)
+{
+ struct xfrm6_tunnel __rcu **pprev;
+ struct xfrm6_tunnel *t;
int ret = -EEXIST;
int priority = handler->priority;
mutex_lock(&tunnel6_mutex);
- for (pprev = &tunnel6_handlers; *pprev; pprev = &(*pprev)->next) {
- if ((*pprev)->priority > priority)
+ switch (family) {
+ case AF_INET6:
+ pprev = &tunnel6_handlers;
+ break;
+ case AF_INET:
+ pprev = &tunnel46_handlers;
+ break;
+ case AF_MPLS:
+ pprev = &tunnelmpls6_handlers;
+ break;
+ default:
+ goto err;
+ }
+
+ for (; (t = rcu_dereference_protected(*pprev,
+ lockdep_is_held(&tunnel6_mutex))) != NULL;
+ pprev = &t->next) {
+ if (t->priority > priority)
break;
- if ((*pprev)->priority == priority)
+ if (t->priority == priority)
goto err;
}
handler->next = *pprev;
- *pprev = handler;
+ rcu_assign_pointer(*pprev, handler);
ret = 0;
@@ -57,83 +71,236 @@ err:
return ret;
}
-
EXPORT_SYMBOL(xfrm6_tunnel_register);
-int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler)
+int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler, unsigned short family)
{
- struct xfrm6_tunnel **pprev;
+ struct xfrm6_tunnel __rcu **pprev;
+ struct xfrm6_tunnel *t;
int ret = -ENOENT;
mutex_lock(&tunnel6_mutex);
- for (pprev = &tunnel6_handlers; *pprev; pprev = &(*pprev)->next) {
- if (*pprev == handler) {
+ switch (family) {
+ case AF_INET6:
+ pprev = &tunnel6_handlers;
+ break;
+ case AF_INET:
+ pprev = &tunnel46_handlers;
+ break;
+ case AF_MPLS:
+ pprev = &tunnelmpls6_handlers;
+ break;
+ default:
+ goto err;
+ }
+
+ for (; (t = rcu_dereference_protected(*pprev,
+ lockdep_is_held(&tunnel6_mutex))) != NULL;
+ pprev = &t->next) {
+ if (t == handler) {
*pprev = handler->next;
ret = 0;
break;
}
}
+err:
mutex_unlock(&tunnel6_mutex);
synchronize_net();
return ret;
}
-
EXPORT_SYMBOL(xfrm6_tunnel_deregister);
-static int tunnel6_rcv(struct sk_buff **pskb)
+#define for_each_tunnel_rcu(head, handler) \
+ for (handler = rcu_dereference(head); \
+ handler != NULL; \
+ handler = rcu_dereference(handler->next)) \
+
+static int tunnelmpls6_rcv(struct sk_buff *skb)
+{
+ struct xfrm6_tunnel *handler;
+
+ if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
+ goto drop;
+
+ for_each_tunnel_rcu(tunnelmpls6_handlers, handler)
+ if (!handler->handler(skb))
+ return 0;
+
+ icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
+
+drop:
+ kfree_skb(skb);
+ return 0;
+}
+
+static int tunnel6_rcv(struct sk_buff *skb)
{
- struct sk_buff *skb = *pskb;
struct xfrm6_tunnel *handler;
if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
goto drop;
- for (handler = tunnel6_handlers; handler; handler = handler->next)
+ for_each_tunnel_rcu(tunnel6_handlers, handler)
if (!handler->handler(skb))
return 0;
- icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0, skb->dev);
+ icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
drop:
kfree_skb(skb);
return 0;
}
-static void tunnel6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
- int type, int code, int offset, __be32 info)
+#if IS_ENABLED(CONFIG_INET6_XFRM_TUNNEL)
+static int tunnel6_rcv_cb(struct sk_buff *skb, u8 proto, int err)
+{
+ struct xfrm6_tunnel __rcu *head;
+ struct xfrm6_tunnel *handler;
+ int ret;
+
+ head = (proto == IPPROTO_IPV6) ? tunnel6_handlers : tunnel46_handlers;
+
+ for_each_tunnel_rcu(head, handler) {
+ if (handler->cb_handler) {
+ ret = handler->cb_handler(skb, err);
+ if (ret <= 0)
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+static const struct xfrm_input_afinfo tunnel6_input_afinfo = {
+ .family = AF_INET6,
+ .is_ipip = true,
+ .callback = tunnel6_rcv_cb,
+};
+#endif
+
+static int tunnel46_rcv(struct sk_buff *skb)
{
struct xfrm6_tunnel *handler;
- for (handler = tunnel6_handlers; handler; handler = handler->next)
+ if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+ goto drop;
+
+ for_each_tunnel_rcu(tunnel46_handlers, handler)
+ if (!handler->handler(skb))
+ return 0;
+
+ icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
+
+drop:
+ kfree_skb(skb);
+ return 0;
+}
+
+static int tunnel6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+ u8 type, u8 code, int offset, __be32 info)
+{
+ struct xfrm6_tunnel *handler;
+
+ for_each_tunnel_rcu(tunnel6_handlers, handler)
if (!handler->err_handler(skb, opt, type, code, offset, info))
- break;
+ return 0;
+
+ return -ENOENT;
}
-static struct inet6_protocol tunnel6_protocol = {
+static int tunnel46_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+ u8 type, u8 code, int offset, __be32 info)
+{
+ struct xfrm6_tunnel *handler;
+
+ for_each_tunnel_rcu(tunnel46_handlers, handler)
+ if (!handler->err_handler(skb, opt, type, code, offset, info))
+ return 0;
+
+ return -ENOENT;
+}
+
+static int tunnelmpls6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+ u8 type, u8 code, int offset, __be32 info)
+{
+ struct xfrm6_tunnel *handler;
+
+ for_each_tunnel_rcu(tunnelmpls6_handlers, handler)
+ if (!handler->err_handler(skb, opt, type, code, offset, info))
+ return 0;
+
+ return -ENOENT;
+}
+
+static const struct inet6_protocol tunnel6_protocol = {
.handler = tunnel6_rcv,
.err_handler = tunnel6_err,
.flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
};
+static const struct inet6_protocol tunnel46_protocol = {
+ .handler = tunnel46_rcv,
+ .err_handler = tunnel46_err,
+ .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
+};
+
+static const struct inet6_protocol tunnelmpls6_protocol = {
+ .handler = tunnelmpls6_rcv,
+ .err_handler = tunnelmpls6_err,
+ .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
+};
+
static int __init tunnel6_init(void)
{
if (inet6_add_protocol(&tunnel6_protocol, IPPROTO_IPV6)) {
- printk(KERN_ERR "tunnel6 init(): can't add protocol\n");
+ pr_err("%s: can't add protocol\n", __func__);
+ return -EAGAIN;
+ }
+ if (inet6_add_protocol(&tunnel46_protocol, IPPROTO_IPIP)) {
+ pr_err("%s: can't add protocol\n", __func__);
+ inet6_del_protocol(&tunnel6_protocol, IPPROTO_IPV6);
+ return -EAGAIN;
+ }
+ if (xfrm6_tunnel_mpls_supported() &&
+ inet6_add_protocol(&tunnelmpls6_protocol, IPPROTO_MPLS)) {
+ pr_err("%s: can't add protocol\n", __func__);
+ inet6_del_protocol(&tunnel6_protocol, IPPROTO_IPV6);
+ inet6_del_protocol(&tunnel46_protocol, IPPROTO_IPIP);
+ return -EAGAIN;
+ }
+#if IS_ENABLED(CONFIG_INET6_XFRM_TUNNEL)
+ if (xfrm_input_register_afinfo(&tunnel6_input_afinfo)) {
+ pr_err("%s: can't add input afinfo\n", __func__);
+ inet6_del_protocol(&tunnel6_protocol, IPPROTO_IPV6);
+ inet6_del_protocol(&tunnel46_protocol, IPPROTO_IPIP);
+ if (xfrm6_tunnel_mpls_supported())
+ inet6_del_protocol(&tunnelmpls6_protocol, IPPROTO_MPLS);
return -EAGAIN;
}
+#endif
return 0;
}
static void __exit tunnel6_fini(void)
{
+#if IS_ENABLED(CONFIG_INET6_XFRM_TUNNEL)
+ if (xfrm_input_unregister_afinfo(&tunnel6_input_afinfo))
+ pr_err("%s: can't remove input afinfo\n", __func__);
+#endif
+ if (inet6_del_protocol(&tunnel46_protocol, IPPROTO_IPIP))
+ pr_err("%s: can't remove protocol\n", __func__);
if (inet6_del_protocol(&tunnel6_protocol, IPPROTO_IPV6))
- printk(KERN_ERR "tunnel6 close: can't remove protocol\n");
+ pr_err("%s: can't remove protocol\n", __func__);
+ if (xfrm6_tunnel_mpls_supported() &&
+ inet6_del_protocol(&tunnelmpls6_protocol, IPPROTO_MPLS))
+ pr_err("%s: can't remove protocol\n", __func__);
}
module_init(tunnel6_init);
module_exit(tunnel6_fini);
+MODULE_DESCRIPTION("IP-in-IPv6 tunnel driver");
MODULE_LICENSE("GPL");
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index f52a5c3cc0a3..794c13674e8a 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1,14 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* UDP over IPv6
- * Linux INET6 implementation
+ * Linux INET6 implementation
*
* Authors:
- * Pedro Roque <roque@di.fc.ul.pt>
+ * Pedro Roque <roque@di.fc.ul.pt>
*
* Based on linux/ipv4/udp.c
*
- * $Id: udp.c,v 1.65 2002/02/01 22:01:04 davem Exp $
- *
* Fixes:
* Hideaki YOSHIFUJI : sin6_scope_id support
* YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
@@ -16,18 +15,13 @@
* a single port at the same time.
* Kazunori MIYAZAWA @USAGI: change process style to use ip6_append_data
* YOSHIFUJI Hideaki @USAGI: convert /proc/net/udp6 to seq_file.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
+#include <linux/bpf-cgroup.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/sockios.h>
-#include <linux/sched.h>
#include <linux/net.h>
#include <linux/in6.h>
#include <linux/netdevice.h>
@@ -35,384 +29,1069 @@
#include <linux/ipv6.h>
#include <linux/icmpv6.h>
#include <linux/init.h>
+#include <linux/module.h>
#include <linux/skbuff.h>
-#include <asm/uaccess.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/indirect_call_wrapper.h>
+#include <trace/events/udp.h>
+#include <net/addrconf.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/transp_v6.h>
#include <net/ip6_route.h>
#include <net/raw.h>
+#include <net/seg6.h>
#include <net/tcp_states.h>
#include <net/ip6_checksum.h>
+#include <net/ip6_tunnel.h>
+#include <net/udp_tunnel.h>
#include <net/xfrm.h>
+#include <net/inet_hashtables.h>
+#include <net/inet6_hashtables.h>
+#include <net/busy_poll.h>
+#include <net/sock_reuseport.h>
+#include <net/gro.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
+#include <trace/events/skb.h>
#include "udp_impl.h"
-DEFINE_SNMP_STAT(struct udp_mib, udp_stats_in6) __read_mostly;
+static void udpv6_destruct_sock(struct sock *sk)
+{
+ udp_destruct_common(sk);
+ inet6_sock_destruct(sk);
+}
+
+int udpv6_init_sock(struct sock *sk)
+{
+ int res = udp_lib_init_sock(sk);
+
+ sk->sk_destruct = udpv6_destruct_sock;
+ set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags);
+ return res;
+}
+
+INDIRECT_CALLABLE_SCOPE
+u32 udp6_ehashfn(const struct net *net,
+ const struct in6_addr *laddr,
+ const u16 lport,
+ const struct in6_addr *faddr,
+ const __be16 fport)
+{
+ u32 lhash, fhash;
+
+ net_get_random_once(&udp6_ehash_secret,
+ sizeof(udp6_ehash_secret));
+ net_get_random_once(&udp_ipv6_hash_secret,
+ sizeof(udp_ipv6_hash_secret));
+
+ lhash = (__force u32)laddr->s6_addr32[3];
+ fhash = __ipv6_addr_jhash(faddr, udp_ipv6_hash_secret);
+
+ return __inet6_ehashfn(lhash, lport, fhash, fport,
+ udp6_ehash_secret + net_hash_mix(net));
+}
+
+int udp_v6_get_port(struct sock *sk, unsigned short snum)
+{
+ unsigned int hash2_nulladdr =
+ ipv6_portaddr_hash(sock_net(sk), &in6addr_any, snum);
+ unsigned int hash2_partial =
+ ipv6_portaddr_hash(sock_net(sk), &sk->sk_v6_rcv_saddr, 0);
+
+ /* precompute partial secondary hash */
+ udp_sk(sk)->udp_portaddr_hash = hash2_partial;
+ return udp_lib_get_port(sk, snum, hash2_nulladdr);
+}
+
+void udp_v6_rehash(struct sock *sk)
+{
+ u16 new_hash = ipv6_portaddr_hash(sock_net(sk),
+ &sk->sk_v6_rcv_saddr,
+ inet_sk(sk)->inet_num);
+ u16 new_hash4;
+
+ if (ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) {
+ new_hash4 = udp_ehashfn(sock_net(sk),
+ sk->sk_rcv_saddr, sk->sk_num,
+ sk->sk_daddr, sk->sk_dport);
+ } else {
+ new_hash4 = udp6_ehashfn(sock_net(sk),
+ &sk->sk_v6_rcv_saddr, sk->sk_num,
+ &sk->sk_v6_daddr, sk->sk_dport);
+ }
+
+ udp_lib_rehash(sk, new_hash, new_hash4);
+}
-static inline int udp_v6_get_port(struct sock *sk, unsigned short snum)
+static int compute_score(struct sock *sk, const struct net *net,
+ const struct in6_addr *saddr, __be16 sport,
+ const struct in6_addr *daddr, unsigned short hnum,
+ int dif, int sdif)
{
- return udp_get_port(sk, snum, ipv6_rcv_saddr_equal);
+ int bound_dev_if, score;
+ struct inet_sock *inet;
+ bool dev_match;
+
+ if (!net_eq(sock_net(sk), net) ||
+ udp_sk(sk)->udp_port_hash != hnum ||
+ sk->sk_family != PF_INET6)
+ return -1;
+
+ if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr))
+ return -1;
+
+ score = 0;
+ inet = inet_sk(sk);
+
+ if (inet->inet_dport) {
+ if (inet->inet_dport != sport)
+ return -1;
+ score++;
+ }
+
+ if (!ipv6_addr_any(&sk->sk_v6_daddr)) {
+ if (!ipv6_addr_equal(&sk->sk_v6_daddr, saddr))
+ return -1;
+ score++;
+ }
+
+ bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
+ dev_match = udp_sk_bound_dev_eq(net, bound_dev_if, dif, sdif);
+ if (!dev_match)
+ return -1;
+ if (bound_dev_if)
+ score++;
+
+ if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id())
+ score++;
+
+ return score;
}
-static struct sock *__udp6_lib_lookup(struct in6_addr *saddr, __be16 sport,
- struct in6_addr *daddr, __be16 dport,
- int dif, struct hlist_head udptable[])
+/**
+ * udp6_lib_lookup1() - Simplified lookup using primary hash (destination port)
+ * @net: Network namespace
+ * @saddr: Source address, network order
+ * @sport: Source port, network order
+ * @daddr: Destination address, network order
+ * @hnum: Destination port, host order
+ * @dif: Destination interface index
+ * @sdif: Destination bridge port index, if relevant
+ * @udptable: Set of UDP hash tables
+ *
+ * Simplified lookup to be used as fallback if no sockets are found due to a
+ * potential race between (receive) address change, and lookup happening before
+ * the rehash operation. This function ignores SO_REUSEPORT groups while scoring
+ * result sockets, because if we have one, we don't need the fallback at all.
+ *
+ * Called under rcu_read_lock().
+ *
+ * Return: socket with highest matching score if any, NULL if none
+ */
+static struct sock *udp6_lib_lookup1(const struct net *net,
+ const struct in6_addr *saddr, __be16 sport,
+ const struct in6_addr *daddr,
+ unsigned int hnum, int dif, int sdif,
+ const struct udp_table *udptable)
{
+ unsigned int slot = udp_hashfn(net, hnum, udptable->mask);
+ struct udp_hslot *hslot = &udptable->hash[slot];
struct sock *sk, *result = NULL;
- struct hlist_node *node;
- unsigned short hnum = ntohs(dport);
- int badness = -1;
-
- read_lock(&udp_hash_lock);
- sk_for_each(sk, node, &udptable[hnum & (UDP_HTABLE_SIZE - 1)]) {
- struct inet_sock *inet = inet_sk(sk);
-
- if (inet->num == hnum && sk->sk_family == PF_INET6) {
- struct ipv6_pinfo *np = inet6_sk(sk);
- int score = 0;
- if (inet->dport) {
- if (inet->dport != sport)
- continue;
- score++;
- }
- if (!ipv6_addr_any(&np->rcv_saddr)) {
- if (!ipv6_addr_equal(&np->rcv_saddr, daddr))
- continue;
- score++;
- }
- if (!ipv6_addr_any(&np->daddr)) {
- if (!ipv6_addr_equal(&np->daddr, saddr))
- continue;
- score++;
- }
- if (sk->sk_bound_dev_if) {
- if (sk->sk_bound_dev_if != dif)
- continue;
- score++;
- }
- if(score == 4) {
+ int score, badness = 0;
+
+ sk_for_each_rcu(sk, &hslot->head) {
+ score = compute_score(sk, net,
+ saddr, sport, daddr, hnum, dif, sdif);
+ if (score > badness) {
+ result = sk;
+ badness = score;
+ }
+ }
+
+ return result;
+}
+
+/* called with rcu_read_lock() */
+static struct sock *udp6_lib_lookup2(const struct net *net,
+ const struct in6_addr *saddr, __be16 sport,
+ const struct in6_addr *daddr, unsigned int hnum,
+ int dif, int sdif, struct udp_hslot *hslot2,
+ struct sk_buff *skb)
+{
+ struct sock *sk, *result;
+ int score, badness;
+ bool need_rescore;
+
+ result = NULL;
+ badness = -1;
+ udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
+ need_rescore = false;
+rescore:
+ score = compute_score(need_rescore ? result : sk, net, saddr,
+ sport, daddr, hnum, dif, sdif);
+ if (score > badness) {
+ badness = score;
+
+ if (need_rescore)
+ continue;
+
+ if (sk->sk_state == TCP_ESTABLISHED) {
result = sk;
- break;
- } else if(score > badness) {
+ continue;
+ }
+
+ result = inet6_lookup_reuseport(net, sk, skb, sizeof(struct udphdr),
+ saddr, sport, daddr, hnum, udp6_ehashfn);
+ if (!result) {
result = sk;
- badness = score;
+ continue;
}
+
+ /* Fall back to scoring if group has connections */
+ if (!reuseport_has_conns(sk))
+ return result;
+
+ /* Reuseport logic returned an error, keep original score. */
+ if (IS_ERR(result))
+ continue;
+
+ /* compute_score is too long of a function to be
+ * inlined, and calling it again here yields
+ * measurable overhead for some
+ * workloads. Work around it by jumping
+ * backwards to rescore 'result'.
+ */
+ need_rescore = true;
+ goto rescore;
+ }
+ }
+ return result;
+}
+
+#if IS_ENABLED(CONFIG_BASE_SMALL)
+static struct sock *udp6_lib_lookup4(const struct net *net,
+ const struct in6_addr *saddr, __be16 sport,
+ const struct in6_addr *daddr,
+ unsigned int hnum, int dif, int sdif,
+ struct udp_table *udptable)
+{
+ return NULL;
+}
+
+static void udp6_hash4(struct sock *sk)
+{
+}
+#else /* !CONFIG_BASE_SMALL */
+static struct sock *udp6_lib_lookup4(const struct net *net,
+ const struct in6_addr *saddr, __be16 sport,
+ const struct in6_addr *daddr,
+ unsigned int hnum, int dif, int sdif,
+ struct udp_table *udptable)
+{
+ const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
+ const struct hlist_nulls_node *node;
+ struct udp_hslot *hslot4;
+ unsigned int hash4, slot;
+ struct udp_sock *up;
+ struct sock *sk;
+
+ hash4 = udp6_ehashfn(net, daddr, hnum, saddr, sport);
+ slot = hash4 & udptable->mask;
+ hslot4 = &udptable->hash4[slot];
+
+begin:
+ udp_lrpa_for_each_entry_rcu(up, node, &hslot4->nulls_head) {
+ sk = (struct sock *)up;
+ if (inet6_match(net, sk, saddr, daddr, ports, dif, sdif))
+ return sk;
+ }
+
+ /* if the nulls value we got at the end of this lookup is not the
+ * expected one, we must restart lookup. We probably met an item that
+ * was moved to another chain due to rehash.
+ */
+ if (get_nulls_value(node) != slot)
+ goto begin;
+
+ return NULL;
+}
+
+static void udp6_hash4(struct sock *sk)
+{
+ struct net *net = sock_net(sk);
+ unsigned int hash;
+
+ if (ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) {
+ udp4_hash4(sk);
+ return;
+ }
+
+ if (sk_unhashed(sk) || ipv6_addr_any(&sk->sk_v6_rcv_saddr))
+ return;
+
+ hash = udp6_ehashfn(net, &sk->sk_v6_rcv_saddr, sk->sk_num,
+ &sk->sk_v6_daddr, sk->sk_dport);
+
+ udp_lib_hash4(sk, hash);
+}
+#endif /* CONFIG_BASE_SMALL */
+
+/* rcu_read_lock() must be held */
+struct sock *__udp6_lib_lookup(const struct net *net,
+ const struct in6_addr *saddr, __be16 sport,
+ const struct in6_addr *daddr, __be16 dport,
+ int dif, int sdif, struct udp_table *udptable,
+ struct sk_buff *skb)
+{
+ unsigned short hnum = ntohs(dport);
+ struct udp_hslot *hslot2;
+ struct sock *result, *sk;
+ unsigned int hash2;
+
+ hash2 = ipv6_portaddr_hash(net, daddr, hnum);
+ hslot2 = udp_hashslot2(udptable, hash2);
+
+ if (udp_has_hash4(hslot2)) {
+ result = udp6_lib_lookup4(net, saddr, sport, daddr, hnum,
+ dif, sdif, udptable);
+ if (result) /* udp6_lib_lookup4 return sk or NULL */
+ return result;
+ }
+
+ /* Lookup connected or non-wildcard sockets */
+ result = udp6_lib_lookup2(net, saddr, sport,
+ daddr, hnum, dif, sdif,
+ hslot2, skb);
+ if (!IS_ERR_OR_NULL(result) && result->sk_state == TCP_ESTABLISHED)
+ goto done;
+
+ /* Lookup redirect from BPF */
+ if (static_branch_unlikely(&bpf_sk_lookup_enabled) &&
+ udptable == net->ipv4.udp_table) {
+ sk = inet6_lookup_run_sk_lookup(net, IPPROTO_UDP, skb, sizeof(struct udphdr),
+ saddr, sport, daddr, hnum, dif,
+ udp6_ehashfn);
+ if (sk) {
+ result = sk;
+ goto done;
}
}
+
+ /* Got non-wildcard socket or error on first lookup */
if (result)
- sock_hold(result);
- read_unlock(&udp_hash_lock);
+ goto done;
+
+ /* Lookup wildcard sockets */
+ hash2 = ipv6_portaddr_hash(net, &in6addr_any, hnum);
+ hslot2 = udp_hashslot2(udptable, hash2);
+
+ result = udp6_lib_lookup2(net, saddr, sport,
+ &in6addr_any, hnum, dif, sdif,
+ hslot2, skb);
+ if (!IS_ERR_OR_NULL(result))
+ goto done;
+
+ /* Cover address change/lookup/rehash race: see __udp4_lib_lookup() */
+ result = udp6_lib_lookup1(net, saddr, sport, daddr, hnum, dif, sdif,
+ udptable);
+
+done:
+ if (IS_ERR(result))
+ return NULL;
return result;
}
+EXPORT_SYMBOL_GPL(__udp6_lib_lookup);
+
+static struct sock *__udp6_lib_lookup_skb(struct sk_buff *skb,
+ __be16 sport, __be16 dport,
+ struct udp_table *udptable)
+{
+ const struct ipv6hdr *iph = ipv6_hdr(skb);
+
+ return __udp6_lib_lookup(dev_net(skb->dev), &iph->saddr, sport,
+ &iph->daddr, dport, inet6_iif(skb),
+ inet6_sdif(skb), udptable, skb);
+}
+
+struct sock *udp6_lib_lookup_skb(const struct sk_buff *skb,
+ __be16 sport, __be16 dport)
+{
+ const u16 offset = NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation];
+ const struct ipv6hdr *iph = (struct ipv6hdr *)(skb->data + offset);
+ struct net *net = dev_net(skb->dev);
+ int iif, sdif;
+
+ inet6_get_iif_sdif(skb, &iif, &sdif);
+
+ return __udp6_lib_lookup(net, &iph->saddr, sport,
+ &iph->daddr, dport, iif,
+ sdif, net->ipv4.udp_table, NULL);
+}
+
+/* Must be called under rcu_read_lock().
+ * Does increment socket refcount.
+ */
+#if IS_ENABLED(CONFIG_NF_TPROXY_IPV6) || IS_ENABLED(CONFIG_NF_SOCKET_IPV6)
+struct sock *udp6_lib_lookup(const struct net *net, const struct in6_addr *saddr, __be16 sport,
+ const struct in6_addr *daddr, __be16 dport, int dif)
+{
+ struct sock *sk;
+
+ sk = __udp6_lib_lookup(net, saddr, sport, daddr, dport,
+ dif, 0, net->ipv4.udp_table, NULL);
+ if (sk && !refcount_inc_not_zero(&sk->sk_refcnt))
+ sk = NULL;
+ return sk;
+}
+EXPORT_SYMBOL_GPL(udp6_lib_lookup);
+#endif
+
+/* do not use the scratch area len for jumbogram: their length exceeds the
+ * scratch area space; note that the IP6CB flags is still in the first
+ * cacheline, so checking for jumbograms is cheap
+ */
+static int udp6_skb_len(struct sk_buff *skb)
+{
+ return unlikely(inet6_is_jumbogram(skb)) ? skb->len : udp_skb_len(skb);
+}
/*
- * This should be easy, if there is something there we
- * return it, otherwise we block.
+ * This should be easy, if there is something there we
+ * return it, otherwise we block.
*/
-int udpv6_recvmsg(struct kiocb *iocb, struct sock *sk,
- struct msghdr *msg, size_t len,
- int noblock, int flags, int *addr_len)
+int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+ int flags, int *addr_len)
{
struct ipv6_pinfo *np = inet6_sk(sk);
struct inet_sock *inet = inet_sk(sk);
- struct sk_buff *skb;
- size_t copied;
- int err, copy_only, is_udplite = IS_UDPLITE(sk);
+ struct sk_buff *skb;
+ unsigned int ulen, copied;
+ int off, err, peeking = flags & MSG_PEEK;
+ int is_udplite = IS_UDPLITE(sk);
+ struct udp_mib __percpu *mib;
+ bool checksum_valid = false;
+ int is_udp4;
- if (addr_len)
- *addr_len=sizeof(struct sockaddr_in6);
-
if (flags & MSG_ERRQUEUE)
- return ipv6_recv_error(sk, msg, len);
+ return ipv6_recv_error(sk, msg, len, addr_len);
+
+ if (np->rxopt.bits.rxpmtu && READ_ONCE(np->rxpmtu))
+ return ipv6_recv_rxpmtu(sk, msg, len, addr_len);
try_again:
- skb = skb_recv_datagram(sk, flags, noblock, &err);
+ off = sk_peek_offset(sk, flags);
+ skb = __skb_recv_udp(sk, flags, &off, &err);
if (!skb)
- goto out;
+ return err;
- copied = skb->len - sizeof(struct udphdr);
- if (copied > len) {
- copied = len;
- msg->msg_flags |= MSG_TRUNC;
- }
+ ulen = udp6_skb_len(skb);
+ copied = len;
+ if (copied > ulen - off)
+ copied = ulen - off;
+ else if (copied < ulen)
+ msg->msg_flags |= MSG_TRUNC;
+
+ is_udp4 = (skb->protocol == htons(ETH_P_IP));
+ mib = __UDPX_MIB(sk, is_udp4);
/*
- * Decide whether to checksum and/or copy data.
+ * If checksum is needed at all, try to do it while copying the
+ * data. If the data is truncated, or if we only want a partial
+ * coverage checksum (UDP-Lite), do it before the copy.
*/
- copy_only = (skb->ip_summed==CHECKSUM_UNNECESSARY);
- if (is_udplite || (!copy_only && msg->msg_flags&MSG_TRUNC)) {
- if (__udp_lib_checksum_complete(skb))
+ if (copied < ulen || peeking ||
+ (is_udplite && UDP_SKB_CB(skb)->partial_cov)) {
+ checksum_valid = udp_skb_csum_unnecessary(skb) ||
+ !__udp_lib_checksum_complete(skb);
+ if (!checksum_valid)
goto csum_copy_err;
- copy_only = 1;
}
- if (copy_only)
- err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr),
- msg->msg_iov, copied );
- else {
- err = skb_copy_and_csum_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov);
+ if (checksum_valid || udp_skb_csum_unnecessary(skb)) {
+ if (udp_skb_is_linear(skb))
+ err = copy_linear_skb(skb, copied, off, &msg->msg_iter);
+ else
+ err = skb_copy_datagram_msg(skb, off, msg, copied);
+ } else {
+ err = skb_copy_and_csum_datagram_msg(skb, off, msg);
if (err == -EINVAL)
goto csum_copy_err;
}
- if (err)
- goto out_free;
+ if (unlikely(err)) {
+ if (!peeking) {
+ udp_drops_inc(sk);
+ SNMP_INC_STATS(mib, UDP_MIB_INERRORS);
+ }
+ kfree_skb(skb);
+ return err;
+ }
+ if (!peeking)
+ SNMP_INC_STATS(mib, UDP_MIB_INDATAGRAMS);
- sock_recv_timestamp(msg, sk, skb);
+ sock_recv_cmsgs(msg, sk, skb);
/* Copy the address. */
if (msg->msg_name) {
- struct sockaddr_in6 *sin6;
-
- sin6 = (struct sockaddr_in6 *) msg->msg_name;
+ DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name);
sin6->sin6_family = AF_INET6;
- sin6->sin6_port = skb->h.uh->source;
+ sin6->sin6_port = udp_hdr(skb)->source;
sin6->sin6_flowinfo = 0;
- sin6->sin6_scope_id = 0;
-
- if (skb->protocol == htons(ETH_P_IP))
- ipv6_addr_set(&sin6->sin6_addr, 0, 0,
- htonl(0xffff), skb->nh.iph->saddr);
- else {
- ipv6_addr_copy(&sin6->sin6_addr, &skb->nh.ipv6h->saddr);
- if (ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL)
- sin6->sin6_scope_id = IP6CB(skb)->iif;
+
+ if (is_udp4) {
+ ipv6_addr_set_v4mapped(ip_hdr(skb)->saddr,
+ &sin6->sin6_addr);
+ sin6->sin6_scope_id = 0;
+ } else {
+ sin6->sin6_addr = ipv6_hdr(skb)->saddr;
+ sin6->sin6_scope_id =
+ ipv6_iface_scope_id(&sin6->sin6_addr,
+ inet6_iif(skb));
}
+ *addr_len = sizeof(*sin6);
+ BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk,
+ (struct sockaddr *)sin6,
+ addr_len);
}
- if (skb->protocol == htons(ETH_P_IP)) {
- if (inet->cmsg_flags)
- ip_cmsg_recv(msg, skb);
+
+ if (udp_test_bit(GRO_ENABLED, sk))
+ udp_cmsg_recv(msg, sk, skb);
+
+ if (np->rxopt.all)
+ ip6_datagram_recv_common_ctl(sk, msg, skb);
+
+ if (is_udp4) {
+ if (inet_cmsg_flags(inet))
+ ip_cmsg_recv_offset(msg, sk, skb,
+ sizeof(struct udphdr), off);
} else {
if (np->rxopt.all)
- datagram_recv_ctl(sk, msg, skb);
- }
+ ip6_datagram_recv_specific_ctl(sk, msg, skb);
+ }
err = copied;
if (flags & MSG_TRUNC)
- err = skb->len - sizeof(struct udphdr);
+ err = ulen;
-out_free:
- skb_free_datagram(sk, skb);
-out:
+ skb_consume_udp(sk, skb, peeking ? -err : err);
return err;
csum_copy_err:
- skb_kill_datagram(sk, skb, flags);
-
- if (flags & MSG_DONTWAIT) {
- UDP6_INC_STATS_USER(UDP_MIB_INERRORS, is_udplite);
- return -EAGAIN;
+ if (!__sk_queue_drop_skb(sk, &udp_sk(sk)->reader_queue, skb, flags,
+ udp_skb_destructor)) {
+ SNMP_INC_STATS(mib, UDP_MIB_CSUMERRORS);
+ SNMP_INC_STATS(mib, UDP_MIB_INERRORS);
}
+ kfree_skb_reason(skb, SKB_DROP_REASON_UDP_CSUM);
+
+ /* starting over for a new packet, but check if we need to yield */
+ cond_resched();
+ msg->msg_flags &= ~MSG_TRUNC;
goto try_again;
}
-void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
- int type, int code, int offset, __be32 info,
- struct hlist_head udptable[] )
+DECLARE_STATIC_KEY_FALSE(udpv6_encap_needed_key);
+void udpv6_encap_enable(void)
+{
+ static_branch_inc(&udpv6_encap_needed_key);
+}
+EXPORT_SYMBOL(udpv6_encap_enable);
+
+/* Handler for tunnels with arbitrary destination ports: no socket lookup, go
+ * through error handlers in encapsulations looking for a match.
+ */
+static int __udp6_lib_err_encap_no_sk(struct sk_buff *skb,
+ struct inet6_skb_parm *opt,
+ u8 type, u8 code, int offset, __be32 info)
+{
+ int i;
+
+ for (i = 0; i < MAX_IPTUN_ENCAP_OPS; i++) {
+ int (*handler)(struct sk_buff *skb, struct inet6_skb_parm *opt,
+ u8 type, u8 code, int offset, __be32 info);
+ const struct ip6_tnl_encap_ops *encap;
+
+ encap = rcu_dereference(ip6tun_encaps[i]);
+ if (!encap)
+ continue;
+ handler = encap->err_handler;
+ if (handler && !handler(skb, opt, type, code, offset, info))
+ return 0;
+ }
+
+ return -ENOENT;
+}
+
+/* Try to match ICMP errors to UDP tunnels by looking up a socket without
+ * reversing source and destination port: this will match tunnels that force the
+ * same destination port on both endpoints (e.g. VXLAN, GENEVE). Note that
+ * lwtunnels might actually break this assumption by being configured with
+ * different destination ports on endpoints, in this case we won't be able to
+ * trace ICMP messages back to them.
+ *
+ * If this doesn't match any socket, probe tunnels with arbitrary destination
+ * ports (e.g. FoU, GUE): there, the receiving socket is useless, as the port
+ * we've sent packets to won't necessarily match the local destination port.
+ *
+ * Then ask the tunnel implementation to match the error against a valid
+ * association.
+ *
+ * Return an error if we can't find a match, the socket if we need further
+ * processing, zero otherwise.
+ */
+static struct sock *__udp6_lib_err_encap(struct net *net,
+ const struct ipv6hdr *hdr, int offset,
+ struct udphdr *uh,
+ struct udp_table *udptable,
+ struct sock *sk,
+ struct sk_buff *skb,
+ struct inet6_skb_parm *opt,
+ u8 type, u8 code, __be32 info)
+{
+ int (*lookup)(struct sock *sk, struct sk_buff *skb);
+ int network_offset, transport_offset;
+ struct udp_sock *up;
+
+ network_offset = skb_network_offset(skb);
+ transport_offset = skb_transport_offset(skb);
+
+ /* Network header needs to point to the outer IPv6 header inside ICMP */
+ skb_reset_network_header(skb);
+
+ /* Transport header needs to point to the UDP header */
+ skb_set_transport_header(skb, offset);
+
+ if (sk) {
+ up = udp_sk(sk);
+
+ lookup = READ_ONCE(up->encap_err_lookup);
+ if (lookup && lookup(sk, skb))
+ sk = NULL;
+
+ goto out;
+ }
+
+ sk = __udp6_lib_lookup(net, &hdr->daddr, uh->source,
+ &hdr->saddr, uh->dest,
+ inet6_iif(skb), 0, udptable, skb);
+ if (sk) {
+ up = udp_sk(sk);
+
+ lookup = READ_ONCE(up->encap_err_lookup);
+ if (!lookup || lookup(sk, skb))
+ sk = NULL;
+ }
+
+out:
+ if (!sk) {
+ sk = ERR_PTR(__udp6_lib_err_encap_no_sk(skb, opt, type, code,
+ offset, info));
+ }
+
+ skb_set_transport_header(skb, transport_offset);
+ skb_set_network_header(skb, network_offset);
+
+ return sk;
+}
+
+int __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+ u8 type, u8 code, int offset, __be32 info,
+ struct udp_table *udptable)
{
struct ipv6_pinfo *np;
- struct ipv6hdr *hdr = (struct ipv6hdr*)skb->data;
- struct in6_addr *saddr = &hdr->saddr;
- struct in6_addr *daddr = &hdr->daddr;
- struct udphdr *uh = (struct udphdr*)(skb->data+offset);
+ const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data;
+ const struct in6_addr *saddr = &hdr->saddr;
+ const struct in6_addr *daddr = seg6_get_daddr(skb, opt) ? : &hdr->daddr;
+ struct udphdr *uh = (struct udphdr *)(skb->data+offset);
+ bool tunnel = false;
struct sock *sk;
+ int harderr;
int err;
+ struct net *net = dev_net(skb->dev);
+
+ sk = __udp6_lib_lookup(net, daddr, uh->dest, saddr, uh->source,
+ inet6_iif(skb), inet6_sdif(skb), udptable, NULL);
+
+ if (!sk || READ_ONCE(udp_sk(sk)->encap_type)) {
+ /* No socket for error: try tunnels before discarding */
+ if (static_branch_unlikely(&udpv6_encap_needed_key)) {
+ sk = __udp6_lib_err_encap(net, hdr, offset, uh,
+ udptable, sk, skb,
+ opt, type, code, info);
+ if (!sk)
+ return 0;
+ } else
+ sk = ERR_PTR(-ENOENT);
+
+ if (IS_ERR(sk)) {
+ __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev),
+ ICMP6_MIB_INERRORS);
+ return PTR_ERR(sk);
+ }
- sk = __udp6_lib_lookup(daddr, uh->dest,
- saddr, uh->source, inet6_iif(skb), udptable);
- if (sk == NULL)
- return;
+ tunnel = true;
+ }
+ harderr = icmpv6_err_convert(type, code, &err);
np = inet6_sk(sk);
- if (!icmpv6_err_convert(type, code, &err) && !np->recverr)
+ if (type == ICMPV6_PKT_TOOBIG) {
+ if (!ip6_sk_accept_pmtu(sk))
+ goto out;
+ ip6_sk_update_pmtu(skb, sk, info);
+ if (READ_ONCE(np->pmtudisc) != IPV6_PMTUDISC_DONT)
+ harderr = 1;
+ }
+ if (type == NDISC_REDIRECT) {
+ if (tunnel) {
+ ip6_redirect(skb, sock_net(sk), inet6_iif(skb),
+ READ_ONCE(sk->sk_mark),
+ sk_uid(sk));
+ } else {
+ ip6_sk_redirect(skb, sk);
+ }
goto out;
+ }
- if (sk->sk_state != TCP_ESTABLISHED && !np->recverr)
+ /* Tunnels don't have an application socket: don't pass errors back */
+ if (tunnel) {
+ if (udp_sk(sk)->encap_err_rcv)
+ udp_sk(sk)->encap_err_rcv(sk, skb, err, uh->dest,
+ ntohl(info), (u8 *)(uh+1));
goto out;
+ }
- if (np->recverr)
+ if (!inet6_test_bit(RECVERR6, sk)) {
+ if (!harderr || sk->sk_state != TCP_ESTABLISHED)
+ goto out;
+ } else {
ipv6_icmp_error(sk, skb, err, uh->dest, ntohl(info), (u8 *)(uh+1));
+ }
sk->sk_err = err;
- sk->sk_error_report(sk);
+ sk_error_report(sk);
out:
- sock_put(sk);
+ return 0;
+}
+
+static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+ int rc;
+
+ if (!ipv6_addr_any(&sk->sk_v6_daddr)) {
+ sock_rps_save_rxhash(sk, skb);
+ sk_mark_napi_id(sk, skb);
+ sk_incoming_cpu_update(sk);
+ } else {
+ sk_mark_napi_id_once(sk, skb);
+ }
+
+ rc = __udp_enqueue_schedule_skb(sk, skb);
+ if (rc < 0) {
+ int is_udplite = IS_UDPLITE(sk);
+ enum skb_drop_reason drop_reason;
+
+ /* Note that an ENOMEM error is charged twice */
+ if (rc == -ENOMEM) {
+ UDP6_INC_STATS(sock_net(sk),
+ UDP_MIB_RCVBUFERRORS, is_udplite);
+ drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
+ } else {
+ UDP6_INC_STATS(sock_net(sk),
+ UDP_MIB_MEMERRORS, is_udplite);
+ drop_reason = SKB_DROP_REASON_PROTO_MEM;
+ }
+ UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
+ trace_udp_fail_queue_rcv_skb(rc, sk, skb);
+ sk_skb_reason_drop(sk, skb, drop_reason);
+ return -1;
+ }
+
+ return 0;
}
-static __inline__ void udpv6_err(struct sk_buff *skb,
- struct inet6_skb_parm *opt, int type,
- int code, int offset, __be32 info )
+static __inline__ int udpv6_err(struct sk_buff *skb,
+ struct inet6_skb_parm *opt, u8 type,
+ u8 code, int offset, __be32 info)
{
- return __udp6_lib_err(skb, opt, type, code, offset, info, udp_hash);
+ return __udp6_lib_err(skb, opt, type, code, offset, info,
+ dev_net(skb->dev)->ipv4.udp_table);
}
-int udpv6_queue_rcv_skb(struct sock * sk, struct sk_buff *skb)
+static int udpv6_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
{
+ enum skb_drop_reason drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
struct udp_sock *up = udp_sk(sk);
- int rc;
+ int is_udplite = IS_UDPLITE(sk);
- if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb))
+ if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) {
+ drop_reason = SKB_DROP_REASON_XFRM_POLICY;
goto drop;
+ }
+ nf_reset_ct(skb);
+
+ if (static_branch_unlikely(&udpv6_encap_needed_key) &&
+ READ_ONCE(up->encap_type)) {
+ int (*encap_rcv)(struct sock *sk, struct sk_buff *skb);
+
+ /*
+ * This is an encapsulation socket so pass the skb to
+ * the socket's udp_encap_rcv() hook. Otherwise, just
+ * fall through and pass this up the UDP socket.
+ * up->encap_rcv() returns the following value:
+ * =0 if skb was successfully passed to the encap
+ * handler or was discarded by it.
+ * >0 if skb should be passed on to UDP.
+ * <0 if skb should be resubmitted as proto -N
+ */
+
+ /* if we're overly short, let UDP handle it */
+ encap_rcv = READ_ONCE(up->encap_rcv);
+ if (encap_rcv) {
+ int ret;
+
+ /* Verify checksum before giving to encap */
+ if (udp_lib_checksum_complete(skb))
+ goto csum_error;
+
+ ret = encap_rcv(sk, skb);
+ if (ret <= 0) {
+ __UDP6_INC_STATS(sock_net(sk),
+ UDP_MIB_INDATAGRAMS,
+ is_udplite);
+ return -ret;
+ }
+ }
+
+ /* FALLTHROUGH -- it's a UDP Packet */
+ }
/*
* UDP-Lite specific tests, ignored on UDP sockets (see net/ipv4/udp.c).
*/
- if ((up->pcflag & UDPLITE_RECV_CC) && UDP_SKB_CB(skb)->partial_cov) {
+ if (udp_test_bit(UDPLITE_RECV_CC, sk) && UDP_SKB_CB(skb)->partial_cov) {
+ u16 pcrlen = READ_ONCE(up->pcrlen);
- if (up->pcrlen == 0) { /* full coverage was set */
- LIMIT_NETDEBUG(KERN_WARNING "UDPLITE6: partial coverage"
- " %d while full coverage %d requested\n",
- UDP_SKB_CB(skb)->cscov, skb->len);
+ if (pcrlen == 0) { /* full coverage was set */
+ net_dbg_ratelimited("UDPLITE6: partial coverage %d while full coverage %d requested\n",
+ UDP_SKB_CB(skb)->cscov, skb->len);
goto drop;
}
- if (UDP_SKB_CB(skb)->cscov < up->pcrlen) {
- LIMIT_NETDEBUG(KERN_WARNING "UDPLITE6: coverage %d "
- "too small, need min %d\n",
- UDP_SKB_CB(skb)->cscov, up->pcrlen);
+ if (UDP_SKB_CB(skb)->cscov < pcrlen) {
+ net_dbg_ratelimited("UDPLITE6: coverage %d too small, need min %d\n",
+ UDP_SKB_CB(skb)->cscov, pcrlen);
goto drop;
}
}
- if (udp_lib_checksum_complete(skb))
- goto drop;
+ prefetch(&sk->sk_rmem_alloc);
+ if (rcu_access_pointer(sk->sk_filter) &&
+ udp_lib_checksum_complete(skb))
+ goto csum_error;
- if ((rc = sock_queue_rcv_skb(sk,skb)) < 0) {
- /* Note that an ENOMEM error is charged twice */
- if (rc == -ENOMEM)
- UDP6_INC_STATS_BH(UDP_MIB_RCVBUFERRORS, up->pcflag);
+ if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr), &drop_reason))
goto drop;
- }
- UDP6_INC_STATS_BH(UDP_MIB_INDATAGRAMS, up->pcflag);
- return 0;
+
+ udp_csum_pull_header(skb);
+
+ skb_dst_drop(skb);
+
+ return __udpv6_queue_rcv_skb(sk, skb);
+
+csum_error:
+ drop_reason = SKB_DROP_REASON_UDP_CSUM;
+ __UDP6_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
drop:
- UDP6_INC_STATS_BH(UDP_MIB_INERRORS, up->pcflag);
- kfree_skb(skb);
+ __UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
+ udp_drops_inc(sk);
+ sk_skb_reason_drop(sk, skb, drop_reason);
return -1;
}
-static struct sock *udp_v6_mcast_next(struct sock *sk,
- __be16 loc_port, struct in6_addr *loc_addr,
- __be16 rmt_port, struct in6_addr *rmt_addr,
- int dif)
+static int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
- struct hlist_node *node;
- struct sock *s = sk;
- unsigned short num = ntohs(loc_port);
-
- sk_for_each_from(s, node) {
- struct inet_sock *inet = inet_sk(s);
-
- if (inet->num == num && s->sk_family == PF_INET6) {
- struct ipv6_pinfo *np = inet6_sk(s);
- if (inet->dport) {
- if (inet->dport != rmt_port)
- continue;
- }
- if (!ipv6_addr_any(&np->daddr) &&
- !ipv6_addr_equal(&np->daddr, rmt_addr))
- continue;
+ struct sk_buff *next, *segs;
+ int ret;
+
+ if (likely(!udp_unexpected_gso(sk, skb)))
+ return udpv6_queue_rcv_one_skb(sk, skb);
+
+ __skb_push(skb, -skb_mac_offset(skb));
+ segs = udp_rcv_segment(sk, skb, false);
+ skb_list_walk_safe(segs, skb, next) {
+ __skb_pull(skb, skb_transport_offset(skb));
+
+ udp_post_segment_fix_csum(skb);
+ ret = udpv6_queue_rcv_one_skb(sk, skb);
+ if (ret > 0)
+ ip6_protocol_deliver_rcu(dev_net(skb->dev), skb, ret,
+ true);
+ }
+ return 0;
+}
- if (s->sk_bound_dev_if && s->sk_bound_dev_if != dif)
- continue;
+static bool __udp_v6_is_mcast_sock(struct net *net, const struct sock *sk,
+ __be16 loc_port, const struct in6_addr *loc_addr,
+ __be16 rmt_port, const struct in6_addr *rmt_addr,
+ int dif, int sdif, unsigned short hnum)
+{
+ const struct inet_sock *inet = inet_sk(sk);
+
+ if (!net_eq(sock_net(sk), net))
+ return false;
+
+ if (udp_sk(sk)->udp_port_hash != hnum ||
+ sk->sk_family != PF_INET6 ||
+ (inet->inet_dport && inet->inet_dport != rmt_port) ||
+ (!ipv6_addr_any(&sk->sk_v6_daddr) &&
+ !ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr)) ||
+ !udp_sk_bound_dev_eq(net, READ_ONCE(sk->sk_bound_dev_if), dif, sdif) ||
+ (!ipv6_addr_any(&sk->sk_v6_rcv_saddr) &&
+ !ipv6_addr_equal(&sk->sk_v6_rcv_saddr, loc_addr)))
+ return false;
+ if (!inet6_mc_check(sk, loc_addr, rmt_addr))
+ return false;
+ return true;
+}
- if (!ipv6_addr_any(&np->rcv_saddr)) {
- if (!ipv6_addr_equal(&np->rcv_saddr, loc_addr))
- continue;
- }
- if(!inet6_mc_check(s, loc_addr, rmt_addr))
- continue;
- return s;
- }
- }
- return NULL;
+static void udp6_csum_zero_error(struct sk_buff *skb)
+{
+ /* RFC 2460 section 8.1 says that we SHOULD log
+ * this error. Well, it is reasonable.
+ */
+ net_dbg_ratelimited("IPv6: udp checksum is 0 for [%pI6c]:%u->[%pI6c]:%u\n",
+ &ipv6_hdr(skb)->saddr, ntohs(udp_hdr(skb)->source),
+ &ipv6_hdr(skb)->daddr, ntohs(udp_hdr(skb)->dest));
}
/*
* Note: called only from the BH handler context,
* so we don't need to lock the hashes.
*/
-static int __udp6_lib_mcast_deliver(struct sk_buff *skb, struct in6_addr *saddr,
- struct in6_addr *daddr, struct hlist_head udptable[])
+static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
+ const struct in6_addr *saddr, const struct in6_addr *daddr,
+ struct udp_table *udptable, int proto)
{
- struct sock *sk, *sk2;
- const struct udphdr *uh = skb->h.uh;
- int dif;
+ struct sock *sk, *first = NULL;
+ const struct udphdr *uh = udp_hdr(skb);
+ unsigned short hnum = ntohs(uh->dest);
+ struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum);
+ unsigned int offset = offsetof(typeof(*sk), sk_node);
+ unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10);
+ int dif = inet6_iif(skb);
+ int sdif = inet6_sdif(skb);
+ struct hlist_node *node;
+ struct sk_buff *nskb;
+
+ if (use_hash2) {
+ hash2_any = ipv6_portaddr_hash(net, &in6addr_any, hnum) &
+ udptable->mask;
+ hash2 = ipv6_portaddr_hash(net, daddr, hnum) & udptable->mask;
+start_lookup:
+ hslot = &udptable->hash2[hash2].hslot;
+ offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
+ }
- read_lock(&udp_hash_lock);
- sk = sk_head(&udptable[ntohs(uh->dest) & (UDP_HTABLE_SIZE - 1)]);
- dif = inet6_iif(skb);
- sk = udp_v6_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif);
- if (!sk) {
- kfree_skb(skb);
- goto out;
+ sk_for_each_entry_offset_rcu(sk, node, &hslot->head, offset) {
+ if (!__udp_v6_is_mcast_sock(net, sk, uh->dest, daddr,
+ uh->source, saddr, dif, sdif,
+ hnum))
+ continue;
+ /* If zero checksum and no_check is not on for
+ * the socket then skip it.
+ */
+ if (!uh->check && !udp_get_no_check6_rx(sk))
+ continue;
+ if (!first) {
+ first = sk;
+ continue;
+ }
+ nskb = skb_clone(skb, GFP_ATOMIC);
+ if (unlikely(!nskb)) {
+ udp_drops_inc(sk);
+ __UDP6_INC_STATS(net, UDP_MIB_RCVBUFERRORS,
+ IS_UDPLITE(sk));
+ __UDP6_INC_STATS(net, UDP_MIB_INERRORS,
+ IS_UDPLITE(sk));
+ continue;
+ }
+
+ if (udpv6_queue_rcv_skb(sk, nskb) > 0)
+ consume_skb(nskb);
}
- sk2 = sk;
- while ((sk2 = udp_v6_mcast_next(sk_next(sk2), uh->dest, daddr,
- uh->source, saddr, dif))) {
- struct sk_buff *buff = skb_clone(skb, GFP_ATOMIC);
- if (buff)
- udpv6_queue_rcv_skb(sk2, buff);
+ /* Also lookup *:port if we are using hash2 and haven't done so yet. */
+ if (use_hash2 && hash2 != hash2_any) {
+ hash2 = hash2_any;
+ goto start_lookup;
+ }
+
+ if (first) {
+ if (udpv6_queue_rcv_skb(first, skb) > 0)
+ consume_skb(skb);
+ } else {
+ kfree_skb(skb);
+ __UDP6_INC_STATS(net, UDP_MIB_IGNOREDMULTI,
+ proto == IPPROTO_UDPLITE);
}
- udpv6_queue_rcv_skb(sk, skb);
-out:
- read_unlock(&udp_hash_lock);
return 0;
}
-static inline int udp6_csum_init(struct sk_buff *skb, struct udphdr *uh)
+static void udp6_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst)
+{
+ if (udp_sk_rx_dst_set(sk, dst))
+ sk->sk_rx_dst_cookie = rt6_get_cookie(dst_rt6_info(dst));
+}
+/* wrapper for udp_queue_rcv_skb taking care of csum conversion and
+ * return code conversion for ip layer consumption
+ */
+static int udp6_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb,
+ struct udphdr *uh)
{
- if (uh->check == 0) {
- /* RFC 2460 section 8.1 says that we SHOULD log
- this error. Well, it is reasonable.
- */
- LIMIT_NETDEBUG(KERN_INFO "IPv6: udp checksum is 0\n");
- return 1;
- }
- if (skb->ip_summed == CHECKSUM_COMPLETE &&
- !csum_ipv6_magic(&skb->nh.ipv6h->saddr, &skb->nh.ipv6h->daddr,
- skb->len, IPPROTO_UDP, skb->csum ))
- skb->ip_summed = CHECKSUM_UNNECESSARY;
+ int ret;
+
+ if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk))
+ skb_checksum_try_convert(skb, IPPROTO_UDP, ip6_compute_pseudo);
- if (skb->ip_summed != CHECKSUM_UNNECESSARY)
- skb->csum = ~csum_unfold(csum_ipv6_magic(&skb->nh.ipv6h->saddr,
- &skb->nh.ipv6h->daddr,
- skb->len, IPPROTO_UDP,
- 0));
+ ret = udpv6_queue_rcv_skb(sk, skb);
- return (UDP_SKB_CB(skb)->partial_cov = 0);
+ /* a return value > 0 means to resubmit the input */
+ if (ret > 0)
+ return ret;
+ return 0;
}
-int __udp6_lib_rcv(struct sk_buff **pskb, struct hlist_head udptable[],
- int is_udplite)
+int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
+ int proto)
{
- struct sk_buff *skb = *pskb;
- struct sock *sk;
- struct udphdr *uh;
- struct net_device *dev = skb->dev;
- struct in6_addr *saddr, *daddr;
+ enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
+ const struct in6_addr *saddr, *daddr;
+ struct net *net = dev_net(skb->dev);
+ struct sock *sk = NULL;
+ struct udphdr *uh;
+ bool refcounted;
u32 ulen = 0;
if (!pskb_may_pull(skb, sizeof(struct udphdr)))
- goto short_packet;
+ goto discard;
- saddr = &skb->nh.ipv6h->saddr;
- daddr = &skb->nh.ipv6h->daddr;
- uh = skb->h.uh;
+ saddr = &ipv6_hdr(skb)->saddr;
+ daddr = &ipv6_hdr(skb)->daddr;
+ uh = udp_hdr(skb);
ulen = ntohs(uh->len);
if (ulen > skb->len)
goto short_packet;
- if(! is_udplite ) { /* UDP validates ulen. */
+ if (proto == IPPROTO_UDP) {
+ /* UDP validates ulen. */
/* Check for jumbo payload */
if (ulen == 0)
@@ -424,67 +1103,167 @@ int __udp6_lib_rcv(struct sk_buff **pskb, struct hlist_head udptable[],
if (ulen < skb->len) {
if (pskb_trim_rcsum(skb, ulen))
goto short_packet;
- saddr = &skb->nh.ipv6h->saddr;
- daddr = &skb->nh.ipv6h->daddr;
- uh = skb->h.uh;
+ saddr = &ipv6_hdr(skb)->saddr;
+ daddr = &ipv6_hdr(skb)->daddr;
+ uh = udp_hdr(skb);
}
+ }
+
+ if (udp6_csum_init(skb, uh, proto))
+ goto csum_error;
+
+ /* Check if the socket is already available, e.g. due to early demux */
+ sk = inet6_steal_sock(net, skb, sizeof(struct udphdr), saddr, uh->source, daddr, uh->dest,
+ &refcounted, udp6_ehashfn);
+ if (IS_ERR(sk))
+ goto no_sk;
- if (udp6_csum_init(skb, uh))
- goto discard;
+ if (sk) {
+ struct dst_entry *dst = skb_dst(skb);
+ int ret;
+
+ if (unlikely(rcu_dereference(sk->sk_rx_dst) != dst))
+ udp6_sk_rx_dst_set(sk, dst);
+
+ if (!uh->check && !udp_get_no_check6_rx(sk)) {
+ if (refcounted)
+ sock_put(sk);
+ goto report_csum_error;
+ }
- } else { /* UDP-Lite validates cscov. */
- if (udplite6_csum_init(skb, uh))
- goto discard;
+ ret = udp6_unicast_rcv_skb(sk, skb, uh);
+ if (refcounted)
+ sock_put(sk);
+ return ret;
}
- /*
- * Multicast receive code
+ /*
+ * Multicast receive code
*/
if (ipv6_addr_is_multicast(daddr))
- return __udp6_lib_mcast_deliver(skb, saddr, daddr, udptable);
+ return __udp6_lib_mcast_deliver(net, skb,
+ saddr, daddr, udptable, proto);
/* Unicast */
+ sk = __udp6_lib_lookup_skb(skb, uh->source, uh->dest, udptable);
+ if (sk) {
+ if (!uh->check && !udp_get_no_check6_rx(sk))
+ goto report_csum_error;
+ return udp6_unicast_rcv_skb(sk, skb, uh);
+ }
+no_sk:
+ reason = SKB_DROP_REASON_NO_SOCKET;
- /*
- * check socket cache ... must talk to Alan about his plans
- * for sock caches... i'll skip this for now.
- */
- sk = __udp6_lib_lookup(saddr, uh->source,
- daddr, uh->dest, inet6_iif(skb), udptable);
+ if (!uh->check)
+ goto report_csum_error;
- if (sk == NULL) {
- if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb))
- goto discard;
+ if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb))
+ goto discard;
+ nf_reset_ct(skb);
- if (udp_lib_checksum_complete(skb))
- goto discard;
- UDP6_INC_STATS_BH(UDP_MIB_NOPORTS, is_udplite);
+ if (udp_lib_checksum_complete(skb))
+ goto csum_error;
- icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0, dev);
+ __UDP6_INC_STATS(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE);
+ icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
- kfree_skb(skb);
- return(0);
+ sk_skb_reason_drop(sk, skb, reason);
+ return 0;
+
+short_packet:
+ if (reason == SKB_DROP_REASON_NOT_SPECIFIED)
+ reason = SKB_DROP_REASON_PKT_TOO_SMALL;
+ net_dbg_ratelimited("UDP%sv6: short packet: From [%pI6c]:%u %d/%d to [%pI6c]:%u\n",
+ proto == IPPROTO_UDPLITE ? "-Lite" : "",
+ saddr, ntohs(uh->source),
+ ulen, skb->len,
+ daddr, ntohs(uh->dest));
+ goto discard;
+
+report_csum_error:
+ udp6_csum_zero_error(skb);
+csum_error:
+ if (reason == SKB_DROP_REASON_NOT_SPECIFIED)
+ reason = SKB_DROP_REASON_UDP_CSUM;
+ __UDP6_INC_STATS(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE);
+discard:
+ __UDP6_INC_STATS(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE);
+ sk_skb_reason_drop(sk, skb, reason);
+ return 0;
+}
+
+
+static struct sock *__udp6_lib_demux_lookup(struct net *net,
+ __be16 loc_port, const struct in6_addr *loc_addr,
+ __be16 rmt_port, const struct in6_addr *rmt_addr,
+ int dif, int sdif)
+{
+ struct udp_table *udptable = net->ipv4.udp_table;
+ unsigned short hnum = ntohs(loc_port);
+ struct udp_hslot *hslot2;
+ unsigned int hash2;
+ __portpair ports;
+ struct sock *sk;
+
+ hash2 = ipv6_portaddr_hash(net, loc_addr, hnum);
+ hslot2 = udp_hashslot2(udptable, hash2);
+ ports = INET_COMBINED_PORTS(rmt_port, hnum);
+
+ udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
+ if (sk->sk_state == TCP_ESTABLISHED &&
+ inet6_match(net, sk, rmt_addr, loc_addr, ports, dif, sdif))
+ return sk;
+ /* Only check first socket in chain */
+ break;
}
-
- /* deliver */
-
- udpv6_queue_rcv_skb(sk, skb);
- sock_put(sk);
- return(0);
+ return NULL;
+}
+
+void udp_v6_early_demux(struct sk_buff *skb)
+{
+ struct net *net = dev_net(skb->dev);
+ const struct udphdr *uh;
+ struct sock *sk;
+ struct dst_entry *dst;
+ int dif = skb->dev->ifindex;
+ int sdif = inet6_sdif(skb);
-short_packet:
- LIMIT_NETDEBUG(KERN_DEBUG "UDP%sv6: short packet: %d/%u\n",
- is_udplite? "-Lite" : "", ulen, skb->len);
+ if (!pskb_may_pull(skb, skb_transport_offset(skb) +
+ sizeof(struct udphdr)))
+ return;
-discard:
- UDP6_INC_STATS_BH(UDP_MIB_INERRORS, is_udplite);
- kfree_skb(skb);
- return(0);
+ uh = udp_hdr(skb);
+
+ if (skb->pkt_type == PACKET_HOST)
+ sk = __udp6_lib_demux_lookup(net, uh->dest,
+ &ipv6_hdr(skb)->daddr,
+ uh->source, &ipv6_hdr(skb)->saddr,
+ dif, sdif);
+ else
+ return;
+
+ if (!sk)
+ return;
+
+ skb->sk = sk;
+ DEBUG_NET_WARN_ON_ONCE(sk_is_refcounted(sk));
+ skb->destructor = sock_pfree;
+ dst = rcu_dereference(sk->sk_rx_dst);
+
+ if (dst)
+ dst = dst_check(dst, sk->sk_rx_dst_cookie);
+ if (dst) {
+ /* set noref for now.
+ * any place which wants to hold dst has to call
+ * dst_hold_safe()
+ */
+ skb_dst_set_noref(skb, dst);
+ }
}
-static __inline__ int udpv6_rcv(struct sk_buff **pskb)
+INDIRECT_CALLABLE_SCOPE int udpv6_rcv(struct sk_buff *skb)
{
- return __udp6_lib_rcv(pskb, udp_hash, 0);
+ return __udp6_lib_rcv(skb, dev_net(skb->dev)->ipv4.udp_table, IPPROTO_UDP);
}
/*
@@ -494,81 +1273,232 @@ static void udp_v6_flush_pending_frames(struct sock *sk)
{
struct udp_sock *up = udp_sk(sk);
- if (up->pending) {
+ if (up->pending == AF_INET)
+ udp_flush_pending_frames(sk);
+ else if (up->pending) {
up->len = 0;
- up->pending = 0;
+ WRITE_ONCE(up->pending, 0);
ip6_flush_pending_frames(sk);
- }
+ }
+}
+
+static int udpv6_pre_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
+ int addr_len)
+{
+ if (addr_len < offsetofend(struct sockaddr, sa_family))
+ return -EINVAL;
+ /* The following checks are replicated from __ip6_datagram_connect()
+ * and intended to prevent BPF program called below from accessing
+ * bytes that are out of the bound specified by user in addr_len.
+ */
+ if (uaddr->sa_family == AF_INET) {
+ if (ipv6_only_sock(sk))
+ return -EAFNOSUPPORT;
+ return udp_pre_connect(sk, uaddr, addr_len);
+ }
+
+ if (addr_len < SIN6_LEN_RFC2133)
+ return -EINVAL;
+
+ return BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr, &addr_len);
+}
+
+static int udpv6_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
+ int addr_len)
+{
+ int res;
+
+ lock_sock(sk);
+ res = __ip6_datagram_connect(sk, uaddr, addr_len);
+ if (!res)
+ udp6_hash4(sk);
+ release_sock(sk);
+ return res;
+}
+
+/**
+ * udp6_hwcsum_outgoing - handle outgoing HW checksumming
+ * @sk: socket we are sending on
+ * @skb: sk_buff containing the filled-in UDP header
+ * (checksum field must be zeroed out)
+ * @saddr: source address
+ * @daddr: destination address
+ * @len: length of packet
+ */
+static void udp6_hwcsum_outgoing(struct sock *sk, struct sk_buff *skb,
+ const struct in6_addr *saddr,
+ const struct in6_addr *daddr, int len)
+{
+ unsigned int offset;
+ struct udphdr *uh = udp_hdr(skb);
+ struct sk_buff *frags = skb_shinfo(skb)->frag_list;
+ __wsum csum = 0;
+
+ if (!frags) {
+ /* Only one fragment on the socket. */
+ skb->csum_start = skb_transport_header(skb) - skb->head;
+ skb->csum_offset = offsetof(struct udphdr, check);
+ uh->check = ~csum_ipv6_magic(saddr, daddr, len, IPPROTO_UDP, 0);
+ } else {
+ /*
+ * HW-checksum won't work as there are two or more
+ * fragments on the socket so that all csums of sk_buffs
+ * should be together
+ */
+ offset = skb_transport_offset(skb);
+ skb->csum = skb_checksum(skb, offset, skb->len - offset, 0);
+ csum = skb->csum;
+
+ skb->ip_summed = CHECKSUM_NONE;
+
+ do {
+ csum = csum_add(csum, frags->csum);
+ } while ((frags = frags->next));
+
+ uh->check = csum_ipv6_magic(saddr, daddr, len, IPPROTO_UDP,
+ csum);
+ if (uh->check == 0)
+ uh->check = CSUM_MANGLED_0;
+ }
}
/*
* Sending
*/
-static int udp_v6_push_pending_frames(struct sock *sk)
+static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6,
+ struct inet_cork *cork)
{
- struct sk_buff *skb;
+ struct sock *sk = skb->sk;
struct udphdr *uh;
- struct udp_sock *up = udp_sk(sk);
- struct inet_sock *inet = inet_sk(sk);
- struct flowi *fl = &inet->cork.fl;
int err = 0;
+ int is_udplite = IS_UDPLITE(sk);
__wsum csum = 0;
-
- /* Grab the skbuff where UDP header space exists. */
- if ((skb = skb_peek(&sk->sk_write_queue)) == NULL)
- goto out;
+ int offset = skb_transport_offset(skb);
+ int len = skb->len - offset;
+ int datalen = len - sizeof(*uh);
/*
* Create a UDP header
*/
- uh = skb->h.uh;
- uh->source = fl->fl_ip_sport;
- uh->dest = fl->fl_ip_dport;
- uh->len = htons(up->len);
+ uh = udp_hdr(skb);
+ uh->source = fl6->fl6_sport;
+ uh->dest = fl6->fl6_dport;
+ uh->len = htons(len);
uh->check = 0;
- if (up->pcflag)
- csum = udplite_csum_outgoing(sk, skb);
- else
- csum = udp_csum_outgoing(sk, skb);
+ if (cork->gso_size) {
+ const int hlen = skb_network_header_len(skb) +
+ sizeof(struct udphdr);
+
+ if (hlen + min(datalen, cork->gso_size) > cork->fragsize) {
+ kfree_skb(skb);
+ return -EMSGSIZE;
+ }
+ if (datalen > cork->gso_size * UDP_MAX_SEGMENTS) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+ if (udp_get_no_check6_tx(sk)) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+ if (is_udplite || dst_xfrm(skb_dst(skb))) {
+ kfree_skb(skb);
+ return -EIO;
+ }
+
+ if (datalen > cork->gso_size) {
+ skb_shinfo(skb)->gso_size = cork->gso_size;
+ skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4;
+ skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(datalen,
+ cork->gso_size);
+
+ /* Don't checksum the payload, skb will get segmented */
+ goto csum_partial;
+ }
+ }
+
+ if (is_udplite)
+ csum = udplite_csum(skb);
+ else if (udp_get_no_check6_tx(sk)) { /* UDP csum disabled */
+ skb->ip_summed = CHECKSUM_NONE;
+ goto send;
+ } else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */
+csum_partial:
+ udp6_hwcsum_outgoing(sk, skb, &fl6->saddr, &fl6->daddr, len);
+ goto send;
+ } else
+ csum = udp_csum(skb);
/* add protocol-dependent pseudo-header */
- uh->check = csum_ipv6_magic(&fl->fl6_src, &fl->fl6_dst,
- up->len, fl->proto, csum );
+ uh->check = csum_ipv6_magic(&fl6->saddr, &fl6->daddr,
+ len, fl6->flowi6_proto, csum);
if (uh->check == 0)
uh->check = CSUM_MANGLED_0;
- err = ip6_push_pending_frames(sk);
+send:
+ err = ip6_send_skb(skb);
+ if (err) {
+ if (err == -ENOBUFS && !inet6_test_bit(RECVERR6, sk)) {
+ UDP6_INC_STATS(sock_net(sk),
+ UDP_MIB_SNDBUFERRORS, is_udplite);
+ err = 0;
+ }
+ } else {
+ UDP6_INC_STATS(sock_net(sk),
+ UDP_MIB_OUTDATAGRAMS, is_udplite);
+ }
+ return err;
+}
+
+static int udp_v6_push_pending_frames(struct sock *sk)
+{
+ struct sk_buff *skb;
+ struct udp_sock *up = udp_sk(sk);
+ int err = 0;
+
+ if (up->pending == AF_INET)
+ return udp_push_pending_frames(sk);
+
+ skb = ip6_finish_skb(sk);
+ if (!skb)
+ goto out;
+
+ err = udp_v6_send_skb(skb, &inet_sk(sk)->cork.fl.u.ip6,
+ &inet_sk(sk)->cork.base);
out:
up->len = 0;
- up->pending = 0;
+ WRITE_ONCE(up->pending, 0);
return err;
}
-int udpv6_sendmsg(struct kiocb *iocb, struct sock *sk,
- struct msghdr *msg, size_t len)
+int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
struct ipv6_txoptions opt_space;
struct udp_sock *up = udp_sk(sk);
struct inet_sock *inet = inet_sk(sk);
struct ipv6_pinfo *np = inet6_sk(sk);
- struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) msg->msg_name;
- struct in6_addr *daddr, *final_p = NULL, final;
+ DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name);
+ struct in6_addr *daddr, *final_p, final;
struct ipv6_txoptions *opt = NULL;
+ struct ipv6_txoptions *opt_to_free = NULL;
struct ip6_flowlabel *flowlabel = NULL;
- struct flowi fl;
+ struct inet_cork_full cork;
+ struct flowi6 *fl6 = &cork.fl.u.ip6;
struct dst_entry *dst;
+ struct ipcm6_cookie ipc6;
int addr_len = msg->msg_namelen;
+ bool connected = false;
int ulen = len;
- int hlimit = -1;
- int tclass = -1;
- int corkreq = up->corkflag || msg->msg_flags&MSG_MORE;
+ int corkreq = udp_test_bit(CORK, sk) || msg->msg_flags & MSG_MORE;
int err;
- int connected = 0;
- int is_udplite = up->pcflag;
+ int is_udplite = IS_UDPLITE(sk);
int (*getfrag)(void *, char *, int, int, int, struct sk_buff *);
+ ipcm6_init_sk(&ipc6, sk);
+ ipc6.gso_size = READ_ONCE(up->gso_size);
+
/* destination address check */
if (sin6) {
if (addr_len < offsetof(struct sockaddr, sa_data))
@@ -579,6 +1509,10 @@ int udpv6_sendmsg(struct kiocb *iocb, struct sock *sk,
if (addr_len < SIN6_LEN_RFC2133)
return -EINVAL;
daddr = &sin6->sin6_addr;
+ if (ipv6_addr_any(daddr) &&
+ ipv6_addr_v4mapped(&np->saddr))
+ ipv6_addr_set_v4mapped(htonl(INADDR_LOOPBACK),
+ daddr);
break;
case AF_INET:
goto do_udp_sendmsg;
@@ -590,38 +1524,40 @@ int udpv6_sendmsg(struct kiocb *iocb, struct sock *sk,
default:
return -EINVAL;
}
- } else if (!up->pending) {
+ } else if (!READ_ONCE(up->pending)) {
if (sk->sk_state != TCP_ESTABLISHED)
return -EDESTADDRREQ;
- daddr = &np->daddr;
- } else
+ daddr = &sk->sk_v6_daddr;
+ } else
daddr = NULL;
if (daddr) {
- if (ipv6_addr_type(daddr) == IPV6_ADDR_MAPPED) {
+ if (ipv6_addr_v4mapped(daddr)) {
struct sockaddr_in sin;
sin.sin_family = AF_INET;
- sin.sin_port = sin6 ? sin6->sin6_port : inet->dport;
+ sin.sin_port = sin6 ? sin6->sin6_port : inet->inet_dport;
sin.sin_addr.s_addr = daddr->s6_addr32[3];
msg->msg_name = &sin;
msg->msg_namelen = sizeof(sin);
do_udp_sendmsg:
- if (__ipv6_only_sock(sk))
- return -ENETUNREACH;
- return udp_sendmsg(iocb, sk, msg, len);
+ err = ipv6_only_sock(sk) ?
+ -ENETUNREACH : udp_sendmsg(sk, msg, len);
+ msg->msg_name = sin6;
+ msg->msg_namelen = addr_len;
+ return err;
}
}
- if (up->pending == AF_INET)
- return udp_sendmsg(iocb, sk, msg, len);
-
/* Rough check on arithmetic overflow,
- better check is made in ip6_build_xmit
+ better check is made in ip6_append_data().
*/
if (len > INT_MAX - sizeof(struct udphdr))
return -EMSGSIZE;
-
- if (up->pending) {
+
+ getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag;
+ if (READ_ONCE(up->pending)) {
+ if (READ_ONCE(up->pending) == AF_INET)
+ return udp_sendmsg(sk, msg, len);
/*
* There are pending frames.
* The socket lock must be held while it's corked.
@@ -639,22 +1575,21 @@ do_udp_sendmsg:
}
ulen += sizeof(struct udphdr);
- memset(&fl, 0, sizeof(fl));
+ memset(fl6, 0, sizeof(*fl6));
if (sin6) {
if (sin6->sin6_port == 0)
return -EINVAL;
- fl.fl_ip_dport = sin6->sin6_port;
+ fl6->fl6_dport = sin6->sin6_port;
daddr = &sin6->sin6_addr;
- if (np->sndflow) {
- fl.fl6_flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK;
- if (fl.fl6_flowlabel&IPV6_FLOWLABEL_MASK) {
- flowlabel = fl6_sock_lookup(sk, fl.fl6_flowlabel);
- if (flowlabel == NULL)
+ if (inet6_test_bit(SNDFLOW, sk)) {
+ fl6->flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK;
+ if (fl6->flowlabel & IPV6_FLOWLABEL_MASK) {
+ flowlabel = fl6_sock_lookup(sk, fl6->flowlabel);
+ if (IS_ERR(flowlabel))
return -EINVAL;
- daddr = &flowlabel->dst;
}
}
@@ -663,154 +1598,178 @@ do_udp_sendmsg:
* sk->sk_dst_cache.
*/
if (sk->sk_state == TCP_ESTABLISHED &&
- ipv6_addr_equal(daddr, &np->daddr))
- daddr = &np->daddr;
+ ipv6_addr_equal(daddr, &sk->sk_v6_daddr))
+ daddr = &sk->sk_v6_daddr;
if (addr_len >= sizeof(struct sockaddr_in6) &&
sin6->sin6_scope_id &&
- ipv6_addr_type(daddr)&IPV6_ADDR_LINKLOCAL)
- fl.oif = sin6->sin6_scope_id;
+ __ipv6_addr_needs_scope_id(__ipv6_addr_type(daddr)))
+ fl6->flowi6_oif = sin6->sin6_scope_id;
} else {
if (sk->sk_state != TCP_ESTABLISHED)
return -EDESTADDRREQ;
- fl.fl_ip_dport = inet->dport;
- daddr = &np->daddr;
- fl.fl6_flowlabel = np->flow_label;
- connected = 1;
+ fl6->fl6_dport = inet->inet_dport;
+ daddr = &sk->sk_v6_daddr;
+ fl6->flowlabel = np->flow_label;
+ connected = true;
}
- if (!fl.oif)
- fl.oif = sk->sk_bound_dev_if;
+ if (!fl6->flowi6_oif)
+ fl6->flowi6_oif = READ_ONCE(sk->sk_bound_dev_if);
+
+ if (!fl6->flowi6_oif)
+ fl6->flowi6_oif = np->sticky_pktinfo.ipi6_ifindex;
+
+ fl6->flowi6_uid = sk_uid(sk);
if (msg->msg_controllen) {
opt = &opt_space;
memset(opt, 0, sizeof(struct ipv6_txoptions));
opt->tot_len = sizeof(*opt);
+ ipc6.opt = opt;
- err = datagram_send_ctl(msg, &fl, opt, &hlimit, &tclass);
+ err = udp_cmsg_send(sk, msg, &ipc6.gso_size);
+ if (err > 0) {
+ err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, fl6,
+ &ipc6);
+ connected = false;
+ }
if (err < 0) {
fl6_sock_release(flowlabel);
return err;
}
- if ((fl.fl6_flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) {
- flowlabel = fl6_sock_lookup(sk, fl.fl6_flowlabel);
- if (flowlabel == NULL)
+ if ((fl6->flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) {
+ flowlabel = fl6_sock_lookup(sk, fl6->flowlabel);
+ if (IS_ERR(flowlabel))
return -EINVAL;
}
if (!(opt->opt_nflen|opt->opt_flen))
opt = NULL;
- connected = 0;
}
- if (opt == NULL)
- opt = np->opt;
+ if (!opt) {
+ opt = txopt_get(np);
+ opt_to_free = opt;
+ }
if (flowlabel)
opt = fl6_merge_options(&opt_space, flowlabel, opt);
opt = ipv6_fixup_options(&opt_space, opt);
-
- fl.proto = sk->sk_protocol;
- ipv6_addr_copy(&fl.fl6_dst, daddr);
- if (ipv6_addr_any(&fl.fl6_src) && !ipv6_addr_any(&np->saddr))
- ipv6_addr_copy(&fl.fl6_src, &np->saddr);
- fl.fl_ip_sport = inet->sport;
-
- /* merge ip6_build_xmit from ip6_output */
- if (opt && opt->srcrt) {
- struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt;
- ipv6_addr_copy(&final, &fl.fl6_dst);
- ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
- final_p = &final;
- connected = 0;
- }
-
- if (!fl.oif && ipv6_addr_is_multicast(&fl.fl6_dst)) {
- fl.oif = np->mcast_oif;
- connected = 0;
+ ipc6.opt = opt;
+
+ fl6->flowi6_proto = sk->sk_protocol;
+ fl6->flowi6_mark = ipc6.sockc.mark;
+ fl6->daddr = *daddr;
+ if (ipv6_addr_any(&fl6->saddr) && !ipv6_addr_any(&np->saddr))
+ fl6->saddr = np->saddr;
+ fl6->fl6_sport = inet->inet_sport;
+
+ if (cgroup_bpf_enabled(CGROUP_UDP6_SENDMSG) && !connected) {
+ err = BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk,
+ (struct sockaddr *)sin6,
+ &addr_len,
+ &fl6->saddr);
+ if (err)
+ goto out_no_dst;
+ if (sin6) {
+ if (ipv6_addr_v4mapped(&sin6->sin6_addr)) {
+ /* BPF program rewrote IPv6-only by IPv4-mapped
+ * IPv6. It's currently unsupported.
+ */
+ err = -ENOTSUPP;
+ goto out_no_dst;
+ }
+ if (sin6->sin6_port == 0) {
+ /* BPF program set invalid port. Reject it. */
+ err = -EINVAL;
+ goto out_no_dst;
+ }
+ fl6->fl6_dport = sin6->sin6_port;
+ fl6->daddr = sin6->sin6_addr;
+ }
}
- security_sk_classify_flow(sk, &fl);
+ if (ipv6_addr_any(&fl6->daddr))
+ fl6->daddr.s6_addr[15] = 0x1; /* :: means loopback (BSD'ism) */
- err = ip6_sk_dst_lookup(sk, &dst, &fl);
- if (err)
- goto out;
+ final_p = fl6_update_dst(fl6, opt, &final);
if (final_p)
- ipv6_addr_copy(&fl.fl6_dst, final_p);
+ connected = false;
- if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0)
- goto out;
+ if (!fl6->flowi6_oif && ipv6_addr_is_multicast(&fl6->daddr)) {
+ fl6->flowi6_oif = READ_ONCE(np->mcast_oif);
+ connected = false;
+ } else if (!fl6->flowi6_oif)
+ fl6->flowi6_oif = READ_ONCE(np->ucast_oif);
- if (hlimit < 0) {
- if (ipv6_addr_is_multicast(&fl.fl6_dst))
- hlimit = np->mcast_hops;
- else
- hlimit = np->hop_limit;
- if (hlimit < 0)
- hlimit = dst_metric(dst, RTAX_HOPLIMIT);
- if (hlimit < 0)
- hlimit = ipv6_get_hoplimit(dst->dev);
- }
+ security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6));
- if (tclass < 0) {
- tclass = np->tclass;
- if (tclass < 0)
- tclass = 0;
+ fl6->flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6->flowlabel);
+
+ dst = ip6_sk_dst_lookup_flow(sk, fl6, final_p, connected);
+ if (IS_ERR(dst)) {
+ err = PTR_ERR(dst);
+ dst = NULL;
+ goto out;
}
+ if (ipc6.hlimit < 0)
+ ipc6.hlimit = ip6_sk_dst_hoplimit(np, fl6, dst);
+
if (msg->msg_flags&MSG_CONFIRM)
goto do_confirm;
back_from_confirm:
+ /* Lockless fast path for the non-corking case */
+ if (!corkreq) {
+ struct sk_buff *skb;
+
+ skb = ip6_make_skb(sk, getfrag, msg, ulen,
+ sizeof(struct udphdr), &ipc6,
+ dst_rt6_info(dst),
+ msg->msg_flags, &cork);
+ err = PTR_ERR(skb);
+ if (!IS_ERR_OR_NULL(skb))
+ err = udp_v6_send_skb(skb, fl6, &cork.base);
+ /* ip6_make_skb steals dst reference */
+ goto out_no_dst;
+ }
+
lock_sock(sk);
if (unlikely(up->pending)) {
/* The socket is already corked while preparing it. */
/* ... which is an evident application bug. --ANK */
release_sock(sk);
- LIMIT_NETDEBUG(KERN_DEBUG "udp cork app bug 2\n");
+ net_dbg_ratelimited("udp cork app bug 2\n");
err = -EINVAL;
goto out;
}
- up->pending = AF_INET6;
+ WRITE_ONCE(up->pending, AF_INET6);
do_append_data:
up->len += ulen;
- getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag;
- err = ip6_append_data(sk, getfrag, msg->msg_iov, ulen,
- sizeof(struct udphdr), hlimit, tclass, opt, &fl,
- (struct rt6_info*)dst,
- corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags);
+ err = ip6_append_data(sk, getfrag, msg, ulen, sizeof(struct udphdr),
+ &ipc6, fl6, dst_rt6_info(dst),
+ corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags);
if (err)
udp_v6_flush_pending_frames(sk);
else if (!corkreq)
err = udp_v6_push_pending_frames(sk);
else if (unlikely(skb_queue_empty(&sk->sk_write_queue)))
- up->pending = 0;
-
- if (dst) {
- if (connected) {
- ip6_dst_store(sk, dst,
- ipv6_addr_equal(&fl.fl6_dst, &np->daddr) ?
- &np->daddr : NULL,
-#ifdef CONFIG_IPV6_SUBTREES
- ipv6_addr_equal(&fl.fl6_src, &np->saddr) ?
- &np->saddr :
-#endif
- NULL);
- } else {
- dst_release(dst);
- }
- }
+ WRITE_ONCE(up->pending, 0);
if (err > 0)
- err = np->recverr ? net_xmit_errno(err) : 0;
+ err = inet6_test_bit(RECVERR6, sk) ? net_xmit_errno(err) : 0;
release_sock(sk);
+
out:
+ dst_release(dst);
+out_no_dst:
fl6_sock_release(flowlabel);
- if (!err) {
- UDP6_INC_STATS_USER(UDP_MIB_OUTDATAGRAMS, is_udplite);
+ txopt_put(opt_to_free);
+ if (!err)
return len;
- }
/*
* ENOBUFS = no kernel mem, SOCK_NOSPACE = no sndbuf space. Reporting
* ENOBUFS might not be good (it's not tunable per se), but otherwise
@@ -819,52 +1778,73 @@ out:
* seems like overkill.
*/
if (err == -ENOBUFS || test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
- UDP6_INC_STATS_USER(UDP_MIB_SNDBUFERRORS, is_udplite);
+ UDP6_INC_STATS(sock_net(sk),
+ UDP_MIB_SNDBUFERRORS, is_udplite);
}
return err;
do_confirm:
- dst_confirm(dst);
+ if (msg->msg_flags & MSG_PROBE)
+ dst_confirm_neigh(dst, &fl6->daddr);
if (!(msg->msg_flags&MSG_PROBE) || len)
goto back_from_confirm;
err = 0;
goto out;
}
+EXPORT_SYMBOL(udpv6_sendmsg);
-int udpv6_destroy_sock(struct sock *sk)
+static void udpv6_splice_eof(struct socket *sock)
{
+ struct sock *sk = sock->sk;
+ struct udp_sock *up = udp_sk(sk);
+
+ if (!READ_ONCE(up->pending) || udp_test_bit(CORK, sk))
+ return;
+
lock_sock(sk);
- udp_v6_flush_pending_frames(sk);
+ if (up->pending && !udp_test_bit(CORK, sk))
+ udp_v6_push_pending_frames(sk);
release_sock(sk);
+}
+
+void udpv6_destroy_sock(struct sock *sk)
+{
+ struct udp_sock *up = udp_sk(sk);
+ lock_sock(sk);
- inet6_destroy_sock(sk);
+ /* protects from races with udp_abort() */
+ sock_set_flag(sk, SOCK_DEAD);
+ udp_v6_flush_pending_frames(sk);
+ release_sock(sk);
- return 0;
+ if (static_branch_unlikely(&udpv6_encap_needed_key)) {
+ if (up->encap_type) {
+ void (*encap_destroy)(struct sock *sk);
+ encap_destroy = READ_ONCE(up->encap_destroy);
+ if (encap_destroy)
+ encap_destroy(sk);
+ }
+ if (udp_test_bit(ENCAP_ENABLED, sk)) {
+ static_branch_dec(&udpv6_encap_needed_key);
+ udp_encap_disable();
+ udp_tunnel_cleanup_gro(sk);
+ }
+ }
}
/*
* Socket option code for UDP
*/
-int udpv6_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int optlen)
+int udpv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
+ unsigned int optlen)
{
- if (level == SOL_UDP || level == SOL_UDPLITE)
- return udp_lib_setsockopt(sk, level, optname, optval, optlen,
+ if (level == SOL_UDP || level == SOL_UDPLITE || level == SOL_SOCKET)
+ return udp_lib_setsockopt(sk, level, optname,
+ optval, optlen,
udp_v6_push_pending_frames);
return ipv6_setsockopt(sk, level, optname, optval, optlen);
}
-#ifdef CONFIG_COMPAT
-int compat_udpv6_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int optlen)
-{
- if (level == SOL_UDP || level == SOL_UDPLITE)
- return udp_lib_setsockopt(sk, level, optname, optval, optlen,
- udp_v6_push_pending_frames);
- return compat_ipv6_setsockopt(sk, level, optname, optval, optlen);
-}
-#endif
-
int udpv6_getsockopt(struct sock *sk, int level, int optname,
char __user *optval, int __user *optlen)
{
@@ -873,110 +1853,88 @@ int udpv6_getsockopt(struct sock *sk, int level, int optname,
return ipv6_getsockopt(sk, level, optname, optval, optlen);
}
-#ifdef CONFIG_COMPAT
-int compat_udpv6_getsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int __user *optlen)
-{
- if (level == SOL_UDP || level == SOL_UDPLITE)
- return udp_lib_getsockopt(sk, level, optname, optval, optlen);
- return compat_ipv6_getsockopt(sk, level, optname, optval, optlen);
-}
-#endif
-
-static struct inet6_protocol udpv6_protocol = {
- .handler = udpv6_rcv,
- .err_handler = udpv6_err,
- .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
-};
/* ------------------------------------------------------------------------ */
#ifdef CONFIG_PROC_FS
-
-static void udp6_sock_seq_show(struct seq_file *seq, struct sock *sp, int bucket)
-{
- struct inet_sock *inet = inet_sk(sp);
- struct ipv6_pinfo *np = inet6_sk(sp);
- struct in6_addr *dest, *src;
- __u16 destp, srcp;
-
- dest = &np->daddr;
- src = &np->rcv_saddr;
- destp = ntohs(inet->dport);
- srcp = ntohs(inet->sport);
- seq_printf(seq,
- "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X "
- "%02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p\n",
- bucket,
- src->s6_addr32[0], src->s6_addr32[1],
- src->s6_addr32[2], src->s6_addr32[3], srcp,
- dest->s6_addr32[0], dest->s6_addr32[1],
- dest->s6_addr32[2], dest->s6_addr32[3], destp,
- sp->sk_state,
- atomic_read(&sp->sk_wmem_alloc),
- atomic_read(&sp->sk_rmem_alloc),
- 0, 0L, 0,
- sock_i_uid(sp), 0,
- sock_i_ino(sp),
- atomic_read(&sp->sk_refcnt), sp);
-}
-
int udp6_seq_show(struct seq_file *seq, void *v)
{
- if (v == SEQ_START_TOKEN)
- seq_printf(seq,
- " sl "
- "local_address "
- "remote_address "
- "st tx_queue rx_queue tr tm->when retrnsmt"
- " uid timeout inode\n");
- else
- udp6_sock_seq_show(seq, v, ((struct udp_iter_state *)seq->private)->bucket);
+ if (v == SEQ_START_TOKEN) {
+ seq_puts(seq, IPV6_SEQ_DGRAM_HEADER);
+ } else {
+ int bucket = ((struct udp_iter_state *)seq->private)->bucket;
+ const struct inet_sock *inet = inet_sk((const struct sock *)v);
+ __u16 srcp = ntohs(inet->inet_sport);
+ __u16 destp = ntohs(inet->inet_dport);
+ __ip6_dgram_sock_seq_show(seq, v, srcp, destp,
+ udp_rqueue_get(v), bucket);
+ }
return 0;
}
-static struct file_operations udp6_seq_fops;
+const struct seq_operations udp6_seq_ops = {
+ .start = udp_seq_start,
+ .next = udp_seq_next,
+ .stop = udp_seq_stop,
+ .show = udp6_seq_show,
+};
+EXPORT_SYMBOL(udp6_seq_ops);
+
static struct udp_seq_afinfo udp6_seq_afinfo = {
- .owner = THIS_MODULE,
- .name = "udp6",
.family = AF_INET6,
- .hashtable = udp_hash,
- .seq_show = udp6_seq_show,
- .seq_fops = &udp6_seq_fops,
+ .udp_table = NULL,
};
-int __init udp6_proc_init(void)
+int __net_init udp6_proc_init(struct net *net)
{
- return udp_proc_register(&udp6_seq_afinfo);
+ if (!proc_create_net_data("udp6", 0444, net->proc_net, &udp6_seq_ops,
+ sizeof(struct udp_iter_state), &udp6_seq_afinfo))
+ return -ENOMEM;
+ return 0;
}
-void udp6_proc_exit(void) {
- udp_proc_unregister(&udp6_seq_afinfo);
+void udp6_proc_exit(struct net *net)
+{
+ remove_proc_entry("udp6", net->proc_net);
}
#endif /* CONFIG_PROC_FS */
/* ------------------------------------------------------------------------ */
struct proto udpv6_prot = {
- .name = "UDPv6",
- .owner = THIS_MODULE,
- .close = udp_lib_close,
- .connect = ip6_datagram_connect,
- .disconnect = udp_disconnect,
- .ioctl = udp_ioctl,
- .destroy = udpv6_destroy_sock,
- .setsockopt = udpv6_setsockopt,
- .getsockopt = udpv6_getsockopt,
- .sendmsg = udpv6_sendmsg,
- .recvmsg = udpv6_recvmsg,
- .backlog_rcv = udpv6_queue_rcv_skb,
- .hash = udp_lib_hash,
- .unhash = udp_lib_unhash,
- .get_port = udp_v6_get_port,
- .obj_size = sizeof(struct udp6_sock),
-#ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_udpv6_setsockopt,
- .compat_getsockopt = compat_udpv6_getsockopt,
+ .name = "UDPv6",
+ .owner = THIS_MODULE,
+ .close = udp_lib_close,
+ .pre_connect = udpv6_pre_connect,
+ .connect = udpv6_connect,
+ .disconnect = udp_disconnect,
+ .ioctl = udp_ioctl,
+ .init = udpv6_init_sock,
+ .destroy = udpv6_destroy_sock,
+ .setsockopt = udpv6_setsockopt,
+ .getsockopt = udpv6_getsockopt,
+ .sendmsg = udpv6_sendmsg,
+ .recvmsg = udpv6_recvmsg,
+ .splice_eof = udpv6_splice_eof,
+ .release_cb = ip6_datagram_release_cb,
+ .hash = udp_lib_hash,
+ .unhash = udp_lib_unhash,
+ .rehash = udp_v6_rehash,
+ .get_port = udp_v6_get_port,
+ .put_port = udp_lib_unhash,
+#ifdef CONFIG_BPF_SYSCALL
+ .psock_update_sk_prot = udp_bpf_update_proto,
#endif
+
+ .memory_allocated = &net_aligned_data.udp_memory_allocated,
+ .per_cpu_fw_alloc = &udp_memory_per_cpu_fw_alloc,
+
+ .sysctl_mem = sysctl_udp_mem,
+ .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min),
+ .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_udp_rmem_min),
+ .obj_size = sizeof(struct udp6_sock),
+ .ipv6_pinfo_offset = offsetof(struct udp6_sock, inet6),
+ .h.udp_table = NULL,
+ .diag_destroy = udp_abort,
};
static struct inet_protosw udpv6_protosw = {
@@ -984,15 +1942,35 @@ static struct inet_protosw udpv6_protosw = {
.protocol = IPPROTO_UDP,
.prot = &udpv6_prot,
.ops = &inet6_dgram_ops,
- .capability =-1,
- .no_check = UDP_CSUM_DEFAULT,
.flags = INET_PROTOSW_PERMANENT,
};
+int __init udpv6_init(void)
+{
+ int ret;
+
+ net_hotdata.udpv6_protocol = (struct inet6_protocol) {
+ .handler = udpv6_rcv,
+ .err_handler = udpv6_err,
+ .flags = INET6_PROTO_NOPOLICY | INET6_PROTO_FINAL,
+ };
+ ret = inet6_add_protocol(&net_hotdata.udpv6_protocol, IPPROTO_UDP);
+ if (ret)
+ goto out;
+
+ ret = inet6_register_protosw(&udpv6_protosw);
+ if (ret)
+ goto out_udpv6_protocol;
+out:
+ return ret;
+
+out_udpv6_protocol:
+ inet6_del_protocol(&net_hotdata.udpv6_protocol, IPPROTO_UDP);
+ goto out;
+}
-void __init udpv6_init(void)
+void udpv6_exit(void)
{
- if (inet6_add_protocol(&udpv6_protocol, IPPROTO_UDP) < 0)
- printk(KERN_ERR "udpv6_init: Could not register protocol\n");
- inet6_register_protosw(&udpv6_protosw);
+ inet6_unregister_protosw(&udpv6_protosw);
+ inet6_del_protocol(&net_hotdata.udpv6_protocol, IPPROTO_UDP);
}
diff --git a/net/ipv6/udp_impl.h b/net/ipv6/udp_impl.h
index ec9878899128..8a406be25a3a 100644
--- a/net/ipv6/udp_impl.h
+++ b/net/ipv6/udp_impl.h
@@ -1,34 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _UDP6_IMPL_H
#define _UDP6_IMPL_H
+#include <net/aligned_data.h>
#include <net/udp.h>
#include <net/udplite.h>
#include <net/protocol.h>
#include <net/addrconf.h>
#include <net/inet_common.h>
+#include <net/transp_v6.h>
-extern int __udp6_lib_rcv(struct sk_buff **, struct hlist_head [], int );
-extern void __udp6_lib_err(struct sk_buff *, struct inet6_skb_parm *,
- int , int , int , __be32 , struct hlist_head []);
+int __udp6_lib_rcv(struct sk_buff *, struct udp_table *, int);
+int __udp6_lib_err(struct sk_buff *, struct inet6_skb_parm *, u8, u8, int,
+ __be32, struct udp_table *);
-extern int udpv6_getsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int __user *optlen);
-extern int udpv6_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int optlen);
-#ifdef CONFIG_COMPAT
-extern int compat_udpv6_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int optlen);
-extern int compat_udpv6_getsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int __user *optlen);
-#endif
-extern int udpv6_sendmsg(struct kiocb *iocb, struct sock *sk,
- struct msghdr *msg, size_t len);
-extern int udpv6_recvmsg(struct kiocb *iocb, struct sock *sk,
- struct msghdr *msg, size_t len,
- int noblock, int flags, int *addr_len);
-extern int udpv6_queue_rcv_skb(struct sock * sk, struct sk_buff *skb);
-extern int udpv6_destroy_sock(struct sock *sk);
+int udpv6_init_sock(struct sock *sk);
+int udp_v6_get_port(struct sock *sk, unsigned short snum);
+void udp_v6_rehash(struct sock *sk);
+
+int udpv6_getsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *optlen);
+int udpv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
+ unsigned int optlen);
+int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len);
+int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags,
+ int *addr_len);
+void udpv6_destroy_sock(struct sock *sk);
#ifdef CONFIG_PROC_FS
-extern int udp6_seq_show(struct seq_file *seq, void *v);
+int udp6_seq_show(struct seq_file *seq, void *v);
#endif
#endif /* _UDP6_IMPL_H */
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
new file mode 100644
index 000000000000..046f13b1d77a
--- /dev/null
+++ b/net/ipv6/udp_offload.c
@@ -0,0 +1,208 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * IPV6 GSO/GRO offload support
+ * Linux INET6 implementation
+ *
+ * UDPv6 GSO support
+ */
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/indirect_call_wrapper.h>
+#include <net/protocol.h>
+#include <net/ipv6.h>
+#include <net/udp.h>
+#include <net/ip6_checksum.h>
+#include "ip6_offload.h"
+#include <net/gro.h>
+#include <net/gso.h>
+
+static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ struct sk_buff *segs = ERR_PTR(-EINVAL);
+ unsigned int mss;
+ unsigned int unfrag_ip6hlen, unfrag_len;
+ struct frag_hdr *fptr;
+ u8 *packet_start, *prevhdr;
+ u8 nexthdr;
+ u8 frag_hdr_sz = sizeof(struct frag_hdr);
+ __wsum csum;
+ int tnl_hlen;
+ int err;
+
+ if (skb->encapsulation && skb_shinfo(skb)->gso_type &
+ (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM))
+ segs = skb_udp_tunnel_segment(skb, features, true);
+ else {
+ const struct ipv6hdr *ipv6h;
+ struct udphdr *uh;
+
+ if (!(skb_shinfo(skb)->gso_type & (SKB_GSO_UDP | SKB_GSO_UDP_L4)))
+ goto out;
+
+ if (!pskb_may_pull(skb, sizeof(struct udphdr)))
+ goto out;
+
+ if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4)
+ return __udp_gso_segment(skb, features, true);
+
+ mss = skb_shinfo(skb)->gso_size;
+ if (unlikely(skb->len <= mss))
+ goto out;
+
+ /* Do software UFO. Complete and fill in the UDP checksum as HW cannot
+ * do checksum of UDP packets sent as multiple IP fragments.
+ */
+
+ uh = udp_hdr(skb);
+ ipv6h = ipv6_hdr(skb);
+
+ uh->check = 0;
+ csum = skb_checksum(skb, 0, skb->len, 0);
+ uh->check = udp_v6_check(skb->len, &ipv6h->saddr,
+ &ipv6h->daddr, csum);
+ if (uh->check == 0)
+ uh->check = CSUM_MANGLED_0;
+
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+ /* If there is no outer header we can fake a checksum offload
+ * due to the fact that we have already done the checksum in
+ * software prior to segmenting the frame.
+ */
+ if (!skb->encap_hdr_csum)
+ features |= NETIF_F_HW_CSUM;
+
+ /* Check if there is enough headroom to insert fragment header. */
+ tnl_hlen = skb_tnl_header_len(skb);
+ if (skb->mac_header < (tnl_hlen + frag_hdr_sz)) {
+ if (gso_pskb_expand_head(skb, tnl_hlen + frag_hdr_sz))
+ goto out;
+ }
+
+ /* Find the unfragmentable header and shift it left by frag_hdr_sz
+ * bytes to insert fragment header.
+ */
+ err = ip6_find_1stfragopt(skb, &prevhdr);
+ if (err < 0)
+ return ERR_PTR(err);
+ unfrag_ip6hlen = err;
+ nexthdr = *prevhdr;
+ *prevhdr = NEXTHDR_FRAGMENT;
+ unfrag_len = (skb_network_header(skb) - skb_mac_header(skb)) +
+ unfrag_ip6hlen + tnl_hlen;
+ packet_start = (u8 *) skb->head + SKB_GSO_CB(skb)->mac_offset;
+ memmove(packet_start-frag_hdr_sz, packet_start, unfrag_len);
+
+ SKB_GSO_CB(skb)->mac_offset -= frag_hdr_sz;
+ skb->mac_header -= frag_hdr_sz;
+ skb->network_header -= frag_hdr_sz;
+
+ fptr = (struct frag_hdr *)(skb_network_header(skb) + unfrag_ip6hlen);
+ fptr->nexthdr = nexthdr;
+ fptr->reserved = 0;
+ fptr->identification = ipv6_proxy_select_ident(dev_net(skb->dev), skb);
+
+ /* Fragment the skb. ipv6 header and the remaining fields of the
+ * fragment header are updated in ipv6_gso_segment()
+ */
+ segs = skb_segment(skb, features);
+ }
+
+out:
+ return segs;
+}
+
+static struct sock *udp6_gro_lookup_skb(struct sk_buff *skb, __be16 sport,
+ __be16 dport)
+{
+ const struct ipv6hdr *iph = skb_gro_network_header(skb);
+ struct net *net = dev_net_rcu(skb->dev);
+ struct sock *sk;
+ int iif, sdif;
+
+ sk = udp_tunnel_sk(net, true);
+ if (sk && dport == htons(sk->sk_num))
+ return sk;
+
+ inet6_get_iif_sdif(skb, &iif, &sdif);
+
+ return __udp6_lib_lookup(net, &iph->saddr, sport,
+ &iph->daddr, dport, iif,
+ sdif, net->ipv4.udp_table, NULL);
+}
+
+INDIRECT_CALLABLE_SCOPE
+struct sk_buff *udp6_gro_receive(struct list_head *head, struct sk_buff *skb)
+{
+ struct udphdr *uh = udp_gro_udphdr(skb);
+ struct sock *sk = NULL;
+ struct sk_buff *pp;
+
+ if (unlikely(!uh))
+ goto flush;
+
+ /* Don't bother verifying checksum if we're going to flush anyway. */
+ if (NAPI_GRO_CB(skb)->flush)
+ goto skip;
+
+ if (skb_gro_checksum_validate_zero_check(skb, IPPROTO_UDP, uh->check,
+ ip6_gro_compute_pseudo))
+ goto flush;
+ else if (uh->check)
+ skb_gro_checksum_try_convert(skb, IPPROTO_UDP,
+ ip6_gro_compute_pseudo);
+
+skip:
+ if (static_branch_unlikely(&udpv6_encap_needed_key))
+ sk = udp6_gro_lookup_skb(skb, uh->source, uh->dest);
+
+ pp = udp_gro_receive(head, skb, uh, sk);
+ return pp;
+
+flush:
+ NAPI_GRO_CB(skb)->flush = 1;
+ return NULL;
+}
+
+INDIRECT_CALLABLE_SCOPE int udp6_gro_complete(struct sk_buff *skb, int nhoff)
+{
+ const u16 offset = NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation];
+ const struct ipv6hdr *ipv6h = (struct ipv6hdr *)(skb->data + offset);
+ struct udphdr *uh = (struct udphdr *)(skb->data + nhoff);
+
+ /* do fraglist only if there is no outer UDP encap (or we already processed it) */
+ if (NAPI_GRO_CB(skb)->is_flist && !NAPI_GRO_CB(skb)->encap_mark) {
+ uh->len = htons(skb->len - nhoff);
+
+ skb_shinfo(skb)->gso_type |= (SKB_GSO_FRAGLIST|SKB_GSO_UDP_L4);
+ skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;
+
+ __skb_incr_checksum_unnecessary(skb);
+
+ return 0;
+ }
+
+ if (uh->check)
+ uh->check = ~udp_v6_check(skb->len - nhoff, &ipv6h->saddr,
+ &ipv6h->daddr, 0);
+
+ return udp_gro_complete(skb, nhoff, udp6_lib_lookup_skb);
+}
+
+int __init udpv6_offload_init(void)
+{
+ net_hotdata.udpv6_offload = (struct net_offload) {
+ .callbacks = {
+ .gso_segment = udp6_ufo_fragment,
+ .gro_receive = udp6_gro_receive,
+ .gro_complete = udp6_gro_complete,
+ },
+ };
+ return inet6_add_offload(&net_hotdata.udpv6_offload, IPPROTO_UDP);
+}
+
+int udpv6_offload_exit(void)
+{
+ return inet6_del_offload(&net_hotdata.udpv6_offload, IPPROTO_UDP);
+}
diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c
index d4cafacc235b..2cec542437f7 100644
--- a/net/ipv6/udplite.c
+++ b/net/ipv6/udplite.c
@@ -1,45 +1,46 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* UDPLITEv6 An implementation of the UDP-Lite protocol over IPv6.
* See also net/ipv4/udplite.c
*
- * Version: $Id: udplite.c,v 1.9 2006/10/19 08:28:10 gerrit Exp $
- *
* Authors: Gerrit Renker <gerrit@erg.abdn.ac.uk>
*
* Changes:
* Fixes:
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
+#define pr_fmt(fmt) "UDPLite6: " fmt
+
+#include <linux/export.h>
+#include <linux/proc_fs.h>
#include "udp_impl.h"